diff --git a/.azure_pipelines/build-pipeline.yml b/.azure_pipelines/build-pipeline.yml deleted file mode 100644 index 34e39517e350..000000000000 --- a/.azure_pipelines/build-pipeline.yml +++ /dev/null @@ -1,63 +0,0 @@ -# PyTorch CI Builds Pipeline on Azure DevOps -# -# This pipeline: -# 1) builds PyTorch on select configurations -# 2) runs only TestTorch unit tests. - -stages: -- stage: 'Build' - displayName: 'Build PyTorch' - jobs: - - template: job_templates/build-verify-publish-template-unix.yml - parameters: - name: ubuntu_1804_CPU_docker - pool: 'PyTorch-Linux-CPU' - container_endpoint: pytorchms.azurecr.io - build_stage: True - is_ci_build: True - os: ubuntu - cuda: cpu - customMatrixes: - Py_38: - configuration: ubuntu_1804_py_38_cpu - container_image: pytorchms.azurecr.io/ubuntu_1804_py_38_cpu_dev - - - template: job_templates/build-verify-publish-template-unix.yml - parameters: - name: ubuntu_1804_GPU_docker - pool: 'PyTorch-Linux-GPU' - container_endpoint: pytorchms.azurecr.io - build_stage: True - is_ci_build: True - os: ubuntu - cuda: gpu - customMatrixes: - Py_39_CUDA_112_cuDNN_810: - configuration: ubuntu_1804_py_39_cuda_112_cudnn_810 - container_image: pytorchms.azurecr.io/ubuntu_1804_py_39_cuda_112_cudnn_8_dev - CUDA_VERSION: 112 - - - template: job_templates/build-verify-publish-template-win.yml - parameters: - name: windows_2019_CPU - pool: 'PyTorch-Win-CPU' - build_stage: True - is_ci_build: True - os: windows - cuda: cpu - customMatrixes: - Py_37: - configuration: windows_2019_py_37_cpu - - - template: job_templates/build-verify-publish-template-win.yml - parameters: - name: windows_2019_GPU - pool: 'PyTorch-Win-GPU' - build_stage: True - is_ci_build: True - os: windows - cuda: gpu - customMatrixes: - Py_38_CUDA_102_cuDNN_765: - configuration: windows_2019_py_38_cuda_102_cudnn_765 - CUDA_VERSION: 102 diff --git a/.azure_pipelines/daily-pipeline.yml b/.azure_pipelines/daily-pipeline.yml deleted file mode 100644 index 2c5c382befc3..000000000000 --- a/.azure_pipelines/daily-pipeline.yml +++ /dev/null @@ -1,82 +0,0 @@ -# PyTorch Daily Builds Pipeline on Azure DevOps -# -# This pipeline: -# 1) builds PyTorch on all available configurations -# 2) runs all PyTorch unit tests - -stages: -- stage: 'BuildTest' - displayName: 'Build and Test PyTorch' - jobs: - - template: job_templates/build-verify-publish-template-unix.yml - parameters: - name: ubuntu_1804_CPU_docker - pool: 'PyTorch-Linux-CPU' - container_endpoint: pytorchms.azurecr.io - build_stage: True - is_daily_build: True - os: ubuntu - cuda: cpu - customMatrixes: - Py_38: - configuration: ubuntu_1804_py_38_cpu - container_image: pytorchms.azurecr.io/ubuntu_1804_py_38_cpu_dev - Py_37: - configuration: ubuntu_1804_py_37_cpu - container_image: pytorchms.azurecr.io/ubuntu_1804_py_37_cpu_dev - - - template: job_templates/build-verify-publish-template-unix.yml - parameters: - name: ubuntu_1804_GPU_docker - pool: 'PyTorch-Linux-GPU' - container_endpoint: pytorchms.azurecr.io - build_stage: True - is_daily_build: True - os: ubuntu - cuda: gpu - customMatrixes: - Py_39_CUDA_112_cuDNN_810: - configuration: ubuntu_1804_py_39_cuda_112_cudnn_810 - container_image: pytorchms.azurecr.io/ubuntu_1804_py_39_cuda_112_cudnn_8_dev - CUDA_VERSION: 112 - Py_38_CUDA_102_cuDNN_810: - configuration: ubuntu_1804_py_38_cuda_102_cudnn_810 - container_image: pytorchms.azurecr.io/ubuntu_1804_py_38_cuda_102_cudnn_8_dev - CUDA_VERSION: 102 - Py_37_CUDA_101_cuDNN_765: - configuration: ubuntu_1804_py_37_cuda_101_cudnn_765 - container_image: 
pytorchms.azurecr.io/ubuntu_1804_py_37_cuda_101_cudnn_7_dev - CUDA_VERSION: 101 - - - template: job_templates/build-verify-publish-template-win.yml - parameters: - name: windows_2019_CPU - pool: 'PyTorch-Win-CPU' - build_stage: True - is_daily_build: True - os: windows - cuda: cpu - customMatrixes: - Py_38: - configuration: windows_2019_py_38_cpu - Py_37: - configuration: windows_2019_py_37_cpu - - - template: job_templates/build-verify-publish-template-win.yml - parameters: - name: windows_2019_GPU - pool: 'PyTorch-Win-GPU' - build_stage: True - is_daily_build: True - os: windows - cuda: gpu - customMatrixes: - Py_39_CUDA_112_cuDNN_810: - configuration: windows_2019_py_39_cuda_112_cudnn_810 - CUDA_VERSION: 112 - Py_38_CUDA_102_cuDNN_765: - configuration: windows_2019_py_38_cuda_102_cudnn_765 - CUDA_VERSION: 102 - Py_37_CUDA_101_cuDNN_764: - configuration: windows_2019_py_37_cuda_101_cudnn_764 - CUDA_VERSION: 101 diff --git a/.azure_pipelines/job_templates/build-verify-publish-template-unix.yml b/.azure_pipelines/job_templates/build-verify-publish-template-unix.yml deleted file mode 100644 index 6d428c1c6647..000000000000 --- a/.azure_pipelines/job_templates/build-verify-publish-template-unix.yml +++ /dev/null @@ -1,134 +0,0 @@ -# PyTorch build steps template with Unix images Azure DevOps Instances -# -# This build depends on 3 parameters set as environment variables in the pipeline: -# - AZURE_DEVOPS_CLI_PAT: Secret var for authenticating to Azure DevOps -# - AZURE_DEVOPS_ARTIFACTS_ORGANIZATION: Azure Artifacts Organization name to publish artifacts -# - AZURE_DEVOPS_ARTIFACTS_PROJECT: Azure Artifacts Project name to publish artifacts - -parameters: - name: '' - pool: '' - container_endpoint: '' - os: '' - cuda: '' - is_ci_build: False - is_official_build: False - is_daily_build: False - build_stage: False - verify_stage: False - publish_stage: False - customMatrixes: '' - -jobs: -- job: ${{parameters.name}} - timeoutInMinutes: 300 - strategy: - matrix: - ${{ insert }}: ${{parameters.customMatrixes}} - pool: - name: ${{ parameters.pool}} - variables: - DECODE_PERCENTS: false - container: - image: $[variables['container_image']] - endpoint: ${{parameters.container_endpoint}} - - steps: - # Build stage - - ${{ if eq(parameters.build_stage, 'True') }}: - # Set up environment variables for specific pipeline build - - template: set-environment-variables.yml - parameters: - os: ${{ parameters.os}} - cuda: ${{ parameters.cuda}} - is_official_build: ${{ parameters.is_official_build}} - - # Sync and update PyTorch submodules - - bash: git submodule update --init --recursive --jobs 0 - displayName: Update PyTorch submodules - - # Build PyTorch and run unit tests - no packaging - - ${{ if or(eq(parameters.is_ci_build, 'True'), eq(parameters.is_daily_build, 'True')) }}: - # Build PyTorch from source in develop mode - - bash: python setup.py develop - displayName: Build PyTorch from source - - - ${{ if eq(parameters.is_ci_build, 'True') }}: - # Run TestTorch unit tests to demonstrate successful PyTorch build - - bash: python test/test_torch.py TestTorch - displayName: Run TestTorch unit tests - - - ${{ if eq(parameters.is_daily_build, 'True') }}: - # Run all unit tests to demonstrate successful PyTorch build - - bash: python test/run_test.py --continue-through-error --exclude-jit-executor --verbose - displayName: Run all unit tests - - # Run ComponentGovernance - - task: ComponentGovernanceComponentDetection@0 - inputs: - scanType: 'Register' - verbosity: 'Verbose' - alertWarningLevel: 'High' - - # 
Build PyTorch and produce artifacts for verification stage - - ${{ if eq(parameters.is_official_build, 'True') }}: - # Build PyTorch from source in install mode and exclude test binaries - - bash: python setup.py install - displayName: Build PyTorch from source without test binaries - - # Package PyTorch Wheel - - bash: python setup.py bdist_wheel - displayName: Package PyTorch Wheel - - # Publish PyTorch Wheel - - task: PublishPipelineArtifact@1 - inputs: - targetPath: $(Build.SourcesDirectory)/dist/ - artifactName: Build_$(Build.BuildNumber)_$(configuration) - displayName: Publish PyTorch Wheel to Pipeline Artifacts - - # Verification stage - - ${{ if eq(parameters.verify_stage, 'True') }}: - # Download PyTorch Wheel - - task: DownloadPipelineArtifact@2 - inputs: - artifact: Build_$(Build.BuildNumber)_$(configuration) - path: $(Build.SourcesDirectory)/verify - displayName: Download PyTorch Wheel - - # Install PyTorch Wheel on Windows - - bash: python -m pip install $(Build.SourcesDirectory)/verify/torch*linux*.whl - displayName: Install PyTorch Wheel - - # Ensure PyTorch installed correctly from produced wheel - - bash: | - cd $(Build.SourcesDirectory)/verify - python -c "import torch; print('Installed Torch version: ' + torch.__version__)" - displayName: Check PyTorch correctly installed from wheel - - # Publishing stage - - ${{ if eq(parameters.publish_stage, 'True') }}: - # Download PyTorch Wheel - - task: DownloadPipelineArtifact@2 - inputs: - artifact: Build_$(Build.BuildNumber)_$(configuration) - path: $(Build.SourcesDirectory)/publish - displayName: Download PyTorch Wheel - - # Publish wheel to Azure Artifacts - # The flag continueOnError=true is needed as the artifact to be published - # may already exist, because the artifact is differentiated based on the - # last commit date. - - bash: | - export TORCH_VERSION=$(head -c 5 ./version.txt) - export LAST_COMMIT=$(git rev-parse --short HEAD) - export LAST_COMMIT_DATE=$(git log -1 --pretty=%ad --date=format:%Y%m%d) - cd $(Build.SourcesDirectory)/publish - export TORCH_WHEEL=$(echo torch*linux*whl) - az extension add -n azure-devops - echo $ADOTOKEN | az devops login - az artifacts universal publish --organization $AZURE_DEVOPS_ARTIFACTS_ORGANIZATION --project $AZURE_DEVOPS_ARTIFACTS_PROJECT --scope project --feed "PyTorch" --name $TORCH_WHEEL --description "PyTorch Official Build Artifact" --version $TORCH_VERSION-$LAST_COMMIT_DATE-$LAST_COMMIT --path . 
- env: - ADOTOKEN: $(AZURE_DEVOPS_CLI_PAT) - continueOnError: true - displayName: Upload PyTorch Official Build package to Azure Artifacts diff --git a/.azure_pipelines/job_templates/build-verify-publish-template-win.yml b/.azure_pipelines/job_templates/build-verify-publish-template-win.yml deleted file mode 100644 index 42f701e1edb9..000000000000 --- a/.azure_pipelines/job_templates/build-verify-publish-template-win.yml +++ /dev/null @@ -1,150 +0,0 @@ -# PyTorch build steps template with Windows images Azure DevOps Instances -# -# This build depends on 3 parameters set as environment variables in the pipeline: -# - AZURE_DEVOPS_CLI_PAT: Secret var for authenticating to Azure DevOps -# - AZURE_DEVOPS_ARTIFACTS_ORGANIZATION: Azure Artifacts Organization name to publish artifacts -# - AZURE_DEVOPS_ARTIFACTS_PROJECT: Azure Artifacts Project name to publish artifacts - -parameters: - name: '' - pool: '' - os: '' - cuda: '' - is_ci_build: False - is_official_build: False - is_daily_build: False - build_stage: False - verify_stage: False - publish_stage: False - customMatrixes: '' - -jobs: -- job: ${{parameters.name}} - timeoutInMinutes: 300 - strategy: - matrix: - ${{ insert }}: ${{parameters.customMatrixes}} - pool: - name: ${{ parameters.pool}} - variables: - CMAKE_GENERATOR: Ninja - PACKAGE_PDBS: 0 - - steps: - # Prepare for PyTorch build on Windows - - template: prepare-build-template.yml - parameters: - configuration: $(configuration) - build_stage: ${{ parameters.build_stage}} - - # Build Stage - - ${{ if eq(parameters.build_stage, 'True') }}: - # Set up environment variables for specific pipeline build - - template: set-environment-variables.yml - parameters: - os: ${{ parameters.os}} - cuda: ${{ parameters.cuda}} - is_official_build: ${{ parameters.is_official_build}} - - # Sync and update PyTorch submodules - - script: git submodule update --init --recursive --jobs 0 - displayName: Update PyTorch submodules - - # Build PyTorch and run unit tests - no packaging - - ${{ if or(eq(parameters.is_ci_build, 'True'), eq(parameters.is_daily_build, 'True')) }}: - # Build PyTorch from source in develop mode with Ninja - - script: call activate $(configuration) && python setup.py develop - displayName: Build PyTorch from source - - - ${{ if eq(parameters.is_ci_build, 'True') }}: - # Run TestTorch unit tests to demonstrate successful PyTorch build - - script: call activate $(configuration) && python test\test_torch.py TestTorch - displayName: Run TestTorch unit tests - - - ${{ if eq(parameters.is_daily_build, 'True') }}: - # Run all unit tests to demonstrate successful PyTorch build - - script: call activate $(configuration) && python test/run_test.py --continue-through-error --exclude-jit-executor --verbose - displayName: Run all unit tests - - # Run ComponentGovernance - - task: ComponentGovernanceComponentDetection@0 - inputs: - scanType: 'Register' - verbosity: 'Verbose' - alertWarningLevel: 'High' - - # Build PyTorch and produce artifacts for verification stage - - ${{ if eq(parameters.is_official_build, 'True') }}: - # Build PyTorch from source in install mode with Ninja and exclude test binaries - - script: call activate $(configuration) && python setup.py install - displayName: Build PyTorch from source without test binaries - - # Package PyTorch Wheel - - script: call activate $(configuration) && python setup.py bdist_wheel - displayName: Package PyTorch Wheel - - # Publish PyTorch Wheel - - task: PublishPipelineArtifact@1 - inputs: - targetPath: $(Build.SourcesDirectory)\dist\ - 
artifactName: Build_$(Build.BuildNumber)_$(configuration) - displayName: Publish PyTorch Wheel to Pipeline Artifacts - - # Verification Stage - - ${{ if eq(parameters.verify_stage, 'True') }}: - # Download PyTorch Wheel - - task: DownloadPipelineArtifact@2 - inputs: - artifact: Build_$(Build.BuildNumber)_$(configuration) - path: $(Build.SourcesDirectory)\verify - displayName: Download PyTorch Wheel - - # Install PyTorch Wheel on Windows - - script: | - call activate $(configuration) - cd $(Build.SourcesDirectory)\verify - dir torch*win*.whl /b > whl.txt - set /p whl= < whl.txt - python -m pip install %whl% - displayName: Install PyTorch Wheel - - # Ensure PyTorch installed correctly from produced wheel - - script: | - call activate $(configuration) - cd $(Build.SourcesDirectory)\verify - python -c "import torch; print('Installed Torch version: ' + torch.__version__)" - displayName: Check PyTorch correctly installed from wheel - - # Publishing stage - - ${{ if eq(parameters.publish_stage, 'True') }}: - # Download PyTorch Wheel - - task: DownloadPipelineArtifact@2 - inputs: - artifact: Build_$(Build.BuildNumber)_$(configuration) - path: $(Build.SourcesDirectory)\publish - displayName: Download PyTorch Wheel - - # Set up Azure Artifacts for Windows - # The pip install --upgrade command is a bug fix for Azure CLI on Windows - # More info: https://github.com/Azure/azure-cli/issues/16858 - - script: | - pip install --upgrade pip --target \opt\az\lib\python3.6\site-packages\ - az extension add -n azure-devops - displayName: Set up Azure Artifacts download on Windows - - # Publish wheel to Azure Artifacts - # The flag continueOnError=true is needed as the artifact to be published - # may already exist, because the artifact is differentiated based on the - # last commit date. - - script: | - set /p TORCH_VERSION= < version.txt - cd $(Build.SourcesDirectory)\publish - git rev-parse --short HEAD > last_commit.txt && set /p LAST_COMMIT= < last_commit.txt - git log -1 --pretty=%ad --date=format:%Y%m%d > last_commit_date.txt && set /p LAST_COMMIT_DATE= < last_commit_date.txt - dir torch*win*.whl /b > whl.txt && set /p TORCH_WHEEL= < whl.txt - echo %ADOTOKEN% | az devops login - az artifacts universal publish --organization %AZURE_DEVOPS_ARTIFACTS_ORGANIZATION% --project %AZURE_DEVOPS_ARTIFACTS_PROJECT% --scope project --feed "PyTorch" --name %TORCH_WHEEL% --description "PyTorch Official Build Artifact" --version %TORCH_VERSION:~0,5%-%LAST_COMMIT_DATE%-%LAST_COMMIT% --path . 
- env: - ADOTOKEN: $(AZURE_DEVOPS_CLI_PAT) - continueOnError: true - displayName: Upload PyTorch nigthly package to Azure Artifacts diff --git a/.azure_pipelines/job_templates/common-packages.yml b/.azure_pipelines/job_templates/common-packages.yml deleted file mode 100644 index 2760f673cb77..000000000000 --- a/.azure_pipelines/job_templates/common-packages.yml +++ /dev/null @@ -1,17 +0,0 @@ -dependencies: - - python=PYTHON_VERSION - - numpy - - ninja - - pyyaml - - mkl - - mkl-include - - setuptools - - cmake - - cffi - - typing_extensions - - future - - six - - requests - - dataclasses - - pip: - - -r ../../requirements.txt diff --git a/.azure_pipelines/job_templates/notify-webapp-template.yml b/.azure_pipelines/job_templates/notify-webapp-template.yml deleted file mode 100644 index 3b6a5314e11a..000000000000 --- a/.azure_pipelines/job_templates/notify-webapp-template.yml +++ /dev/null @@ -1,26 +0,0 @@ -parameters: - name: '' - pool: '' - customMatrixes: '' - -jobs: -- job: ${{parameters.name}} - timeoutInMinutes: 600 - strategy: - matrix: - ${{ insert }}: ${{parameters.customMatrixes}} - pool: - name: ${{ parameters.pool}} - steps: - # Clone PyTorch Tests repository - - bash: | - B64_PAT=$(echo -n ":$_ADOTOKEN" | base64) - git -c http.extraHeader="Authorization: Basic ${B64_PAT}" clone $(AZURE_DEVOPS_PYTORCH_TESTS_REPO_URL) - cd pytorch_tests - git checkout $(PYTORCH_TESTS_CHECKOUT_BRANCH) - env: - _ADOTOKEN: $(AZURE_DEVOPS_CLI_PAT) - displayName: Clone PyTorch Tests repo - - bash: | - bash $(Build.SourcesDirectory)/pytorch_tests/webapp/notify_webapp.sh - displayName: Notify Webapp diff --git a/.azure_pipelines/job_templates/prepare-build-template.yml b/.azure_pipelines/job_templates/prepare-build-template.yml deleted file mode 100644 index 0755c07e2672..000000000000 --- a/.azure_pipelines/job_templates/prepare-build-template.yml +++ /dev/null @@ -1,62 +0,0 @@ -# Build prepare steps for PyTorch on Azure DevOps to build from source. -# These steps share between normal build process and semmle security scan tasks - -parameters: - build_stage: False - configuration: '' - -steps: -# End Python tasks that may be lingering over from previous runs -# Note: If python.exe isn't currently running, exit code becomes 128, -# which fails the run. Here exit code is set to 0 to avoid failed run. -- script: | - taskkill /f /im python.exe - IF %ERRORLEVEL% EQU 128 exit 0 - displayName: End previous Python processes - -# Clean up env directory in conda for fresh builds and set up conda environment YAML -- powershell: | - Remove-Item 'C:\Miniconda\envs' -Recurse -ErrorAction Ignore - $env:PYTHON_VERSION = $env:SYSTEM_JOBNAME.Substring(3,1) + '.' 
+ $env:SYSTEM_JOBNAME.Substring(4,1) - (Get-Content .azure_pipelines\job_templates\common-packages.yml) -replace 'PYTHON_VERSION', $env:PYTHON_VERSION | Out-File -encoding ASCII .azure_pipelines\job_templates\common-packages.yml - displayName: Clean up previous environments and Set up conda environment YAML - -# Make conda environment and install required packages -- script: | - call conda clean --all -y - call conda env create -n $(configuration) --file .azure_pipelines\job_templates\common-packages.yml - call activate $(configuration) - call conda install -c conda-forge libuv=1.39 - displayName: Set up conda environment for building from source - -- ${{ if eq(parameters.build_stage, 'True') }}: - # Install MKL - - script: | - rmdir /s /q mkl - del mkl_2020.2.254.7z - curl https://s3.amazonaws.com/ossci-windows/mkl_2020.2.254.7z -k -O - 7z x -aoa mkl_2020.2.254.7z -omkl - displayName: Install MKL - - # Install sccache and randomtemp - # Related PyTorch GitHub issue: https://github.com/pytorch/pytorch/issues/25393 - # Related fix: https://github.com/pytorch/builder/pull/448/ - - script: | - mkdir .\tmp_bin - curl -k https://s3.amazonaws.com/ossci-windows/sccache.exe --output .\tmp_bin\sccache.exe - curl -k https://s3.amazonaws.com/ossci-windows/sccache-cl.exe --output .\tmp_bin\sccache-cl.exe - copy .\tmp_bin\sccache.exe .\tmp_bin\nvcc.exe - curl -kL https://github.com/peterjc123/randomtemp-rust/releases/download/v0.4/randomtemp.exe --output .\tmp_bin\randomtemp.exe - displayName: Install sccache and randomtemp - condition: not(eq(variables.CUDA_VERSION, '')) - - # CUDA 11.2's CUB directory conflicts with CUDA 10.2 and 10.1 - # builds, where CUDA 11.2's CUB is injected into non-CUDA - # 11.2 builds. - - powershell: Remove-Item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.2\include\cub" -Recurse -ErrorAction Ignore - displayName: Remove conflicting CUB from CUDA installation - condition: not(eq(variables.CUDA_VERSION, '')) - - - powershell: Copy-Item -Path "F:\cuda_11_2\cub\" -Destination "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.2\include" -Recurse - displayName: Copy CUDA CUB for CUDA 11.2 build - condition: eq(variables.CUDA_VERSION, '112') diff --git a/.azure_pipelines/job_templates/pytorch-template-unix.yml b/.azure_pipelines/job_templates/pytorch-template-unix.yml deleted file mode 100644 index 7f826e7cd382..000000000000 --- a/.azure_pipelines/job_templates/pytorch-template-unix.yml +++ /dev/null @@ -1,61 +0,0 @@ -# PyTorch build steps template with Unix images Azure DevOps Instances -# -# This build depends on 5 parameters set as an environment variables in the pipeline: -# - AZURE_DEVOPS_CLI_PAT: Secret var for authenticating to Azure DevOps -# - AZURE_STORAGE_KEY: Secret var for authenticating to Azure Storage -# - _TS_CLONE_P, _TS_P, _TS_SM_P: Secret vars for specific unit tests - -parameters: - name: '' - pool: '' - container_endpoint: '' - customMatrixes: '' - -jobs: -- job: ${{parameters.name}} - timeoutInMinutes: 600 - strategy: - matrix: - ${{ insert }}: ${{parameters.customMatrixes}} - pool: - name: ${{ parameters.pool}} - variables: - DECODE_PERCENTS: false - - steps: - # Don't checkout repo contents to save time and CPU compute. Environment variables - # related to checkout branch such as $(BUILD_SOURCEBRANCH) are still available. 
- - checkout: none - - # Delete pytorch_tests repo from previous builds if exists - - bash: rm -rf pytorch_tests/ - displayName: Delete pytorch_tests repo from previous builds if exists - - # Clone PyTorch Tests repository - - bash: | - B64_PAT=$(echo -n ":$_ADOTOKEN" | base64) - git -c http.extraHeader="Authorization: Basic ${B64_PAT}" clone $(AZURE_DEVOPS_PYTORCH_TESTS_REPO_URL) - cd pytorch_tests - git checkout $(PYTORCH_TESTS_CHECKOUT_BRANCH) - env: - _ADOTOKEN: $(AZURE_DEVOPS_CLI_PAT) - displayName: Clone PyTorch Tests repo - - # Run PyTorch Unit Tests - - bash: bash $(Build.SourcesDirectory)/pytorch_tests/scripts/linux/run.sh - env: - _AZURE_STORAGE_KEY: $(AZURE_STORAGE_KEY) - _TS_CLONE_P: $(TS_CLONE_PASSWORD) - _TS_P: $(TS_PAT) - _TS_SM_P: $(TS_SM_PAT) - _AZUREML_CLONE_PASSWORD: $(AZUREML_CLONE_PASSWORD) - _SPPASSWORD: $(SPPASSWORD) - displayName: Run PyTorch Unit Tests - - # Tests results are available outside the docker container since - # the current directory is mounted as a volume of the container. - - task: PublishTestResults@2 - condition: always() - inputs: - testResultsFiles: '**/test-*.xml' - testRunTitle: 'Publish test results for Python' diff --git a/.azure_pipelines/job_templates/pytorch-template-win.yml b/.azure_pipelines/job_templates/pytorch-template-win.yml deleted file mode 100644 index 5d3704313010..000000000000 --- a/.azure_pipelines/job_templates/pytorch-template-win.yml +++ /dev/null @@ -1,57 +0,0 @@ -# PyTorch build steps template with Windows images Azure DevOps Instances -# -# This build depends on 5 parameters set as an environment variables in the pipeline: -# - AZURE_DEVOPS_CLI_PAT: Secret var for authenticating to Azure DevOps -# - AZURE_STORAGE_KEY: Secret var for authenticating to Azure Storage -# - _TS_CLONE_P, _TS_P, _TS_SM_P: Secret vars for specific unit tests - -parameters: - name: '' - pool: '' - customMatrixes: '' - -jobs: -- job: ${{parameters.name}} - timeoutInMinutes: 600 - strategy: - matrix: - ${{ insert }}: ${{parameters.customMatrixes}} - pool: - name: ${{ parameters.pool}} - - steps: - # Don't checkout repo contents to save time and CPU compute. Environment variables - # related to checkout branch such as $(BUILD_SOURCEBRANCH) are still available. - - checkout: none - - # Delete pytorch_tests repo from previous builds if exists - - script: if exist "pytorch_tests/" rmdir "pytorch_tests/" /q /s - displayName: Delete pytorch_tests repo from previous builds if exists - - # Clone PyTorch Tests repository - - powershell: | - $env:B64Pat = [Convert]::ToBase64String([System.Text.Encoding]::UTF8.GetBytes(":$env:_ADOTOKEN")) - git -c http.extraHeader="Authorization: Basic $env:B64Pat" clone $env:AZURE_DEVOPS_pytorch_tests_REPO_URL - cd pytorch_tests - git checkout $(PYTORCH_TESTS_CHECKOUT_BRANCH) - env: - _ADOTOKEN: $(AZURE_DEVOPS_CLI_PAT) - displayName: Clone PyTorch Tests repo - - # Run PyTorch Unit Tests - - script: call $(Build.SourcesDirectory)\pytorch_tests\scripts\windows\run.bat - env: - _ADOTOKEN: $(AZURE_DEVOPS_CLI_PAT) - _AZURE_STORAGE_KEY: $(AZURE_STORAGE_KEY) - _TS_CLONE_P: $(TS_CLONE_PASSWORD) - _TS_P: $(TS_PAT) - _TS_SM_P: $(TS_SM_PAT) - displayName: Run PyTorch Unit Tests - - # Tests results are available outside the docker container since - # the current directory is mounted as a volume of the container. 
- - task: PublishTestResults@2 - condition: always() - inputs: - testResultsFiles: '**\test-*.xml' - testRunTitle: 'Publish test results for Python' diff --git a/.azure_pipelines/job_templates/set-environment-variables.yml b/.azure_pipelines/job_templates/set-environment-variables.yml deleted file mode 100644 index 40d1cb384b2a..000000000000 --- a/.azure_pipelines/job_templates/set-environment-variables.yml +++ /dev/null @@ -1,129 +0,0 @@ -# Set environment variables for specific configurations - -parameters: - is_official_build: False - os: '' - cuda: '' - -steps: - # Environment configuration steps for Ubuntu builds - - ${{ if contains(parameters.os, 'ubuntu') }}: - # Set configuration specific build flags - - ${{ if eq(parameters.is_official_build, True) }}: - - bash: | - echo "##vso[task.setvariable variable=INSTALL_TEST;]0" - echo "##vso[task.setvariable variable=PYTORCH_BUILD_NUMBER;]1" - export PYTORCH_VERSION=$(head -c 5 ./version.txt) - echo "##vso[task.setvariable variable=PYTORCH_BUILD_VERSION;]$PYTORCH_VERSION.dev" - displayName: Set configuration-specific build flags - - # Set PyTorch CPU/GPU build flags. - - ${{ if contains(parameters.cuda, 'cpu') }}: - - bash: | - echo "##vso[task.setvariable variable=USE_CUDA;]0" - echo "##vso[task.setvariable variable=PYTORCH_BUILD_VERSION;]$(PYTORCH_BUILD_VERSION).cpu" - displayName: Set CUDA-specific build flag for CPU builds - - - ${{ if contains(parameters.cuda, 'gpu') }}: - - bash: | - echo "##vso[task.setvariable variable=USE_CUDA;]1" - echo "##vso[task.setvariable variable=PYTORCH_BUILD_VERSION;]$(PYTORCH_BUILD_VERSION).cu$(CUDA_VERSION)" - displayName: Set CUDA-specific build flag for GPU builds - - # Set MKL environment variables - - bash: | - echo "##vso[task.setvariable variable=CMAKE_LIBRARY_PATH;]/opt/intel/lib:$CMAKE_LIBRARY_PATH" - echo "##vso[task.setvariable variable=CMAKE_INCLUDE_PATH;]/opt/intel/include:$CMAKE_INCLUDE_PATH" - displayName: Set MKL paths - - # View current environment variables - - bash: - printenv - displayName: Show environment variables - - # Environment configuration steps for Windows builds - - ${{ if contains(parameters.os, 'windows') }}: - # Set Conda Lib Path - - powershell: Write-Host "##vso[task.setvariable variable=CONDA_LIB_PATH;]C:\Miniconda\envs\$(configuration)\Library\bin" - displayName: Set Conda Lib Path - - # Set configuration specific build flags - - ${{ if eq(parameters.is_official_build, True) }}: - - powershell: | - Write-Host "##vso[task.setvariable variable=INSTALL_TEST;]0" - Write-Host "##vso[task.setvariable variable=PYTORCH_BUILD_NUMBER;]1" - Set-Variable -Name PYTORCH_VERSION -Value (Get-Content .\version.txt).Substring(0,5) - Write-Host "##vso[task.setvariable variable=PYTORCH_BUILD_VERSION;]$PYTORCH_VERSION.dev" - displayName: Set configuration-specific build flags - - # Set PyTorch CPU/GPU build flags.. 
- - ${{ if contains(parameters.cuda, 'cpu') }}: - - powershell: | - Write-Host "##vso[task.setvariable variable=USE_CUDA;]0" - Write-Host "##vso[task.setvariable variable=PYTORCH_BUILD_VERSION;]$(PYTORCH_BUILD_VERSION).cpu" - displayName: Set CUDA-specific build flag for CPU build - - - ${{ if contains(parameters.cuda, 'gpu') }}: - - powershell: | - Write-Host "##vso[task.setvariable variable=USE_CUDA;]1" - Write-Host "##vso[task.setvariable variable=PYTORCH_BUILD_VERSION;]$(PYTORCH_BUILD_VERSION).cu$(CUDA_VERSION)" - displayName: Set CUDA-specific build flag for GPU build - - # Set CUDA 11.2, 10.2 or 10.1 specific build flags - - ${{ if eq(parameters.cuda, 'gpu') }}: - - powershell: | - Write-Host "##vso[task.setvariable variable=TORCH_CUDA_ARCH_LIST;]3.7+PTX;5.0;6.0;6.1;7.0;7.5;8.0;8.6" - Write-Host "##vso[task.setvariable variable=CUDA_PATH;]C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.2\" - displayName: Set CUDA 11.2 specific build flags - condition: eq(variables.CUDA_VERSION, '112') - - - powershell: | - Write-Host "##vso[task.setvariable variable=TORCH_CUDA_ARCH_LIST;]3.7+PTX;5.0;6.0;6.1;7.0;7.5" - Write-Host "##vso[task.setvariable variable=CUDA_PATH;]C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\" - displayName: Set CUDA 10.2 specific build flags - condition: eq(variables.CUDA_VERSION, '102') - - - powershell: | - Write-Host "##vso[task.setvariable variable=TORCH_CUDA_ARCH_LIST;]3.7+PTX;5.0;6.0;6.1;7.0;7.5" - Write-Host "##vso[task.setvariable variable=CUDA_PATH;]C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1\" - displayName: Set CUDA 10.1 specific build flags - condition: eq(variables.CUDA_VERSION, '101') - - - powershell: | - Write-Host "##vso[task.setvariable variable=CUDA_BIN_PATH;]$env:CUDA_PATH\bin\" - Write-Host "##vso[task.setvariable variable=CUDNN_ROOT;]$env:CUDA_PATH" - Write-Host "##vso[task.setvariable variable=CUDNN_INCLUDE_DIR;]$env:CUDA_PATH\include\" - Write-Host "##vso[task.setvariable variable=CUDNN_LIBRARY;]$env:CUDA_PATH\lib\x64\" - Write-Host "##vso[task.prependpath]$env:CUDA_PATH\bin" - Write-Host "##vso[task.setvariable variable=TORCH_NVCC_FLAGS;]-Xfatbin -compress-all --no-host-device-move-forward" - Write-Host "##vso[task.setvariable variable=THRUST_IGNORE_CUB_VERSION_CHECK;]1" - Write-Host "##vso[task.setvariable variable=NVTOOLSEXT_PATH;]C:\Program Files\NVIDIA Corporation\NvToolsExt\" - displayName: Set CUDA environment variables - - - powershell: | - copy "$(CUDA_BIN_PATH)\cusparse*64_*.dll*" $(Build.SourcesDirectory)\torch\lib - copy "$(CUDA_BIN_PATH)\cublas*64_*.dll*" $(Build.SourcesDirectory)\torch\lib - copy "$(CUDA_BIN_PATH)\cudart*64_*.dll*" $(Build.SourcesDirectory)\torch\lib - copy "$(CUDA_BIN_PATH)\curand*64_*.dll*" $(Build.SourcesDirectory)\torch\lib - copy "$(CUDA_BIN_PATH)\cufft*64_*.dll*" $(Build.SourcesDirectory)\torch\lib - copy "$(CUDA_BIN_PATH)\cusolver*64_*.dll*" $(Build.SourcesDirectory)\torch\lib - copy "$(CUDA_BIN_PATH)\cudnn*64_*.dll*" $(Build.SourcesDirectory)\torch\lib - copy "$(CUDA_BIN_PATH)\nvrtc*64_*.dll*" $(Build.SourcesDirectory)\torch\lib - copy "C:\Program Files\NVIDIA Corporation\NvToolsExt\bin\x64\nvToolsExt64_1.dll*" $(Build.SourcesDirectory)\torch\lib - copy "$(CONDA_LIB_PATH)\libiomp*5md.dll" $(Build.SourcesDirectory)\torch\lib - copy "$(CONDA_LIB_PATH)\uv.dll" $(Build.SourcesDirectory)\torch\lib - displayName: Copy CUDA/cuDNN/libomp/libuv dlls to torch\lib - - # Set MKL, sccache and randomtemp environment variables - - powershell: | - Write-Host "##vso[task.setvariable 
variable=CMAKE_INCLUDE_PATH;]$(Build.SourcesDirectory)\mkl\include" - Write-Host "##vso[task.setvariable variable=CMAKE_LIBRARY_PATH;]$(Build.SourcesDirectory)\mkl\lib;$env:CMAKE_LIBRARY_PATH" - Write-Host "##vso[task.setvariable variable=ADDITIONAL_PATH;]$(Build.SourcesDirectory)\tmp_bin" - Write-Host "##vso[task.setvariable variable=SCCACHE_IDLE_TIMEOUT;]1500" - Write-Host "##vso[task.setvariable variable=CMAKE_CUDA_COMPILER_LAUNCHER;]$(Build.SourcesDirectory)/tmp_bin/randomtemp.exe;$(Build.SourcesDirectory)/tmp_bin/sccache.exe" - displayName: Set MKL, sccache and randomtemp environment variables - - # View current environment variables - - script: - set - displayName: Show environment variables diff --git a/.azure_pipelines/job_templates/wheel-wait-job-template.yml b/.azure_pipelines/job_templates/wheel-wait-job-template.yml deleted file mode 100644 index 816eea9cca20..000000000000 --- a/.azure_pipelines/job_templates/wheel-wait-job-template.yml +++ /dev/null @@ -1,14 +0,0 @@ -# Main logic to initiate wait for PR artifact to be ready - -steps: -- task: InvokeRESTAPI@1 - displayName: 'Wait for job success and wheel ready' - timeoutInMinutes: 60 - inputs: - connectionType: 'connectedServiceName' - serviceConnection: circleciconn - method: 'POST' - headers: '{"Content-Type":"application/json", "BranchName":"$(_TARGET_BRANCH_TO_CHECK)", "JobName":"$(TARGET_CIRCLECI_BUILD_PR)", "PRNumber":"$(_TARGET_PR_NUMBER)", "TargetCommit":"$(_TARGET_COMMIT)", "PlanUrl":"$(System.CollectionUri)", "ProjectId":"$(System.TeamProjectId)", "HubName":"$(System.HostType)", "PlanId":"$(System.PlanId)", "JobId":"$(System.JobId)", "TimelineId":"$(System.TimelineId)", "TaskInstanceId":"$(System.TaskInstanceId)", "AuthToken":"$(System.AccessToken)"}' - body: '' - urlSuffix: 'api/JobStatus' - waitForCompletion: true diff --git a/.azure_pipelines/job_templates/wheel-wait-template.yml b/.azure_pipelines/job_templates/wheel-wait-template.yml deleted file mode 100644 index cd2f76ac4d84..000000000000 --- a/.azure_pipelines/job_templates/wheel-wait-template.yml +++ /dev/null @@ -1,92 +0,0 @@ -# Initiate 5 agentless-server waiting jobs to check on the -# status of PR artifact builds, for a maximum wait time of -# 11*60 min=660 mins. These jobs will pass immediately -# once targeted CircleCI build is ready. 
- -jobs: -- job: checkjob1 - pool: server - timeoutInMinutes: 60 - continueOnError: true - steps: - - template: wheel-wait-job-template.yml - -- job: checkjob2 - pool: server - timeoutInMinutes: 60 - dependsOn: checkjob1 - continueOnError: true - steps: - - template: wheel-wait-job-template.yml - -- job: checkjob3 - pool: server - timeoutInMinutes: 60 - dependsOn: checkjob2 - continueOnError: true - steps: - - template: wheel-wait-job-template.yml - -- job: checkjob4 - pool: server - timeoutInMinutes: 60 - dependsOn: checkjob3 - continueOnError: true - steps: - - template: wheel-wait-job-template.yml - -- job: checkjob5 - pool: server - timeoutInMinutes: 60 - dependsOn: checkjob4 - continueOnError: true - steps: - - template: wheel-wait-job-template.yml - -- job: checkjob6 - pool: server - timeoutInMinutes: 60 - dependsOn: checkjob5 - continueOnError: true - steps: - - template: wheel-wait-job-template.yml - -- job: checkjob7 - pool: server - timeoutInMinutes: 60 - dependsOn: checkjob6 - continueOnError: true - steps: - - template: wheel-wait-job-template.yml - -- job: checkjob8 - pool: server - timeoutInMinutes: 60 - dependsOn: checkjob7 - continueOnError: true - steps: - - template: wheel-wait-job-template.yml - -- job: checkjob9 - pool: server - timeoutInMinutes: 60 - dependsOn: checkjob8 - continueOnError: true - steps: - - template: wheel-wait-job-template.yml - -- job: checkjob10 - pool: server - timeoutInMinutes: 60 - dependsOn: checkjob9 - continueOnError: true - steps: - - template: wheel-wait-job-template.yml - -- job: checkjob11 - pool: server - timeoutInMinutes: 60 - dependsOn: checkjob10 - continueOnError: true - steps: - - template: wheel-wait-job-template.yml diff --git a/.azure_pipelines/nightly-pytorch-tests-pipeline.yml b/.azure_pipelines/nightly-pytorch-tests-pipeline.yml deleted file mode 100644 index 79273c1d3922..000000000000 --- a/.azure_pipelines/nightly-pytorch-tests-pipeline.yml +++ /dev/null @@ -1,60 +0,0 @@ -# PyTorch Nightly PyTorch Tests Builds Pipeline on Azure DevOps -# -# This pipeline runs custom PyTorch unit-tests on nightly -# PyTorch wheels. 
- -stages: -- stage: 'NightlyCustomTests' - displayName: 'Run custom unit tests on PyTorch wheels' - jobs: - - template: job_templates/pytorch-template-unix.yml - parameters: - name: ubuntu_1804_CPU_docker - pool: $(BUILD_POOL_LIN_1) - customMatrixes: - Nightly_Custom_Tests: - _DOCKER_IMAGE: $(DOCKER_IMAGE_LIN_1) - _PYTHON_VERSION: $(PYTHON_VERSION_LIN_1) - _CUDA_BUILD_VERSION: $(CUDA_BUILD_VERSION_LIN_1) - _RUN_TESTS: $(RUN_TESTS_LIN) - - - template: job_templates/pytorch-template-unix.yml - parameters: - name: ubuntu_1804_GPU_docker - pool: $(BUILD_POOL_LIN_2) - customMatrixes: - Nightly_Custom_Tests: - _DOCKER_IMAGE: $(DOCKER_IMAGE_LIN_2) - _PYTHON_VERSION: $(PYTHON_VERSION_LIN_2) - _CUDA_BUILD_VERSION: $(CUDA_BUILD_VERSION_LIN_2) - _RUN_TESTS: $(RUN_TESTS_LIN) - - - template: job_templates/pytorch-template-win.yml - parameters: - name: windows_2019_CPU - pool: $(BUILD_POOL_WIN_1) - customMatrixes: - Nightly_Custom_Tests: - _PYTHON_VERSION: $(PYTHON_VERSION_WIN_1) - _CUDA_BUILD_VERSION: $(CUDA_BUILD_VERSION_WIN_1) - _RUN_TESTS: $(RUN_TESTS_WIN) - - - template: job_templates/pytorch-template-win.yml - parameters: - name: windows_2019_GPU - pool: $(BUILD_POOL_WIN_2) - customMatrixes: - Nightly_Custom_Tests: - _PYTHON_VERSION: $(PYTHON_VERSION_WIN_2) - _CUDA_BUILD_VERSION: $(CUDA_BUILD_VERSION_WIN_2) - _RUN_TESTS: $(RUN_TESTS_WIN) - -- stage: 'NotifyWebapp' - displayName: 'Notify Webapp that pipeline is finished' - dependsOn: NightlyCustomTests - condition: succeededOrFailed() - jobs: - - template: job_templates/notify-webapp-template.yml - parameters: - name: ubuntu_1804_CPU - pool: $(BUILD_POOL_LIN_1) diff --git a/.azure_pipelines/pytorch-tests-pipeline.yml b/.azure_pipelines/pytorch-tests-pipeline.yml deleted file mode 100644 index 56813572169d..000000000000 --- a/.azure_pipelines/pytorch-tests-pipeline.yml +++ /dev/null @@ -1,62 +0,0 @@ -# PyTorch PR PyTorch Tests Builds Pipeline on Azure DevOps -# -# This pipeline: -# 1) ensures that CircleCI builds for a given PR -# have finished, and that its artifacts are -# ready for download -# 2) runs custom PyTorch unit-tests on PyTorch -# wheels generated during PR builds. 
- -resources: - webhooks: - - webhook: GitHubPyTorchPRTrigger - connection: GitHubPyTorchPRTriggerConnection - filters: - - path: repositoryName - value: pytorch_tests - -stages: -- stage: 'EnsureArtifactsReady' - displayName: 'Ensure PyTorch PR Artifacts are ready' - jobs: - - template: job_templates/wheel-wait-template.yml - variables: - _TARGET_BRANCH_TO_CHECK: ${{parameters.GitHubPyTorchPRTrigger.TARGET_BRANCH_TO_CHECK_AZ_DEVOPS_PR}} - _TARGET_PR_NUMBER: ${{parameters.GitHubPyTorchPRTrigger.PR_NUMBER}} - _TARGET_COMMIT: ${{parameters.GitHubPyTorchPRTrigger.TARGET_COMMIT}} - -- stage: 'PRCustomTests' - displayName: 'Run custom unit tests on PyTorch wheels' - dependsOn: EnsureArtifactsReady - condition: succeeded() - jobs: - - template: job_templates/pytorch-template-unix.yml - parameters: - name: ubuntu_1804_GPU_docker - pool: $(BUILD_POOL_PR) - customMatrixes: - PR_Custom_Tests: - _PYTHON_VERSION: $(PYTHON_VERSION_PR) - _CUDA_BUILD_VERSION: $(CUDA_BUILD_VERSION_PR) - _TARGET_CIRCLECI_BUILD: $(TARGET_CIRCLECI_BUILD_PR) - _TARGET_BRANCH_TO_CHECK: ${{parameters.GitHubPyTorchPRTrigger.TARGET_BRANCH_TO_CHECK_AZ_DEVOPS_PR}} - _TARGET_PR_NUMBER: ${{parameters.GitHubPyTorchPRTrigger.PR_NUMBER}} - _TARGET_COMMIT: ${{parameters.GitHubPyTorchPRTrigger.TARGET_COMMIT}} - _DOCKER_IMAGE: $(DOCKER_IMAGE_PR) - _RUN_TESTS: $(RUN_TESTS_PR) - -- stage: 'NotifyWebapp' - displayName: 'Notify Webapp that pipeline is finished' - dependsOn: PRCustomTests - condition: succeededOrFailed() - jobs: - - template: job_templates/notify-webapp-template.yml - parameters: - name: ubuntu_1804_CPU - pool: $(BUILD_POOL_LIN_1) - customMatrixes: - PR_Notify_WebApp: - _TARGET_CIRCLECI_BUILD: $(TARGET_CIRCLECI_BUILD_PR) - _TARGET_BRANCH_TO_CHECK: ${{parameters.GitHubPyTorchPRTrigger.TARGET_BRANCH_TO_CHECK_AZ_DEVOPS_PR}} - _TARGET_PR_NUMBER: ${{parameters.GitHubPyTorchPRTrigger.PR_NUMBER}} - _TARGET_COMMIT: ${{parameters.GitHubPyTorchPRTrigger.TARGET_COMMIT}} diff --git a/.azure_pipelines/verify-pipeline.yml b/.azure_pipelines/verify-pipeline.yml deleted file mode 100644 index e0ab4e372a75..000000000000 --- a/.azure_pipelines/verify-pipeline.yml +++ /dev/null @@ -1,224 +0,0 @@ -# PyTorch Official Builds Pipeline on Azure DevOps -# -# This pipeline: -# 1) builds PyTorch on all available configurations -# 2) verifies PyTorch artifacts by installing them in a clean environment -# and checking torch.__version_ -# 3) publishes official PyTorch artifacts to Azure DevOps Artifacts for consumption - -stages: -- stage: 'Build' - displayName: 'Build PyTorch' - jobs: - - template: job_templates/build-verify-publish-template-unix.yml - parameters: - name: ubuntu_1804_CPU_docker - pool: 'PyTorch-Linux-CPU' - container_endpoint: pytorchms.azurecr.io - build_stage: True - is_official_build: True - os: ubuntu - cuda: cpu - customMatrixes: - Py_38: - configuration: ubuntu_1804_py_38_cpu - container_image: pytorchms.azurecr.io/ubuntu_1804_py_38_cpu_dev - Py_37: - configuration: ubuntu_1804_py_37_cpu - container_image: pytorchms.azurecr.io/ubuntu_1804_py_37_cpu_dev - - - template: job_templates/build-verify-publish-template-unix.yml - parameters: - name: ubuntu_1804_GPU_docker - pool: 'PyTorch-Linux-GPU' - container_endpoint: pytorchms.azurecr.io - build_stage: True - is_official_build: True - os: ubuntu - cuda: gpu - customMatrixes: - Py_39_CUDA_112_cuDNN_810: - configuration: ubuntu_1804_py_39_cuda_112_cudnn_810 - container_image: pytorchms.azurecr.io/ubuntu_1804_py_39_cuda_112_cudnn_8_dev - CUDA_VERSION: 112 - Py_38_CUDA_102_cuDNN_810: - 
configuration: ubuntu_1804_py_38_cuda_102_cudnn_810 - container_image: pytorchms.azurecr.io/ubuntu_1804_py_38_cuda_102_cudnn_8_dev - CUDA_VERSION: 102 - Py_37_CUDA_101_cuDNN_765: - configuration: ubuntu_1804_py_37_cuda_101_cudnn_765 - container_image: pytorchms.azurecr.io/ubuntu_1804_py_37_cuda_101_cudnn_7_dev - CUDA_VERSION: 101 - - - template: job_templates/build-verify-publish-template-win.yml - parameters: - name: windows_2019_CPU - pool: 'PyTorch-Win-CPU' - build_stage: True - is_official_build: True - os: windows - cuda: cpu - customMatrixes: - Py_38: - configuration: windows_2019_py_38_cpu - Py_37: - configuration: windows_2019_py_37_cpu - - - template: job_templates/build-verify-publish-template-win.yml - parameters: - name: windows_2019_GPU - pool: 'PyTorch-Win-GPU' - build_stage: True - is_official_build: True - os: windows - cuda: gpu - customMatrixes: - Py_39_CUDA_112_cuDNN_810: - configuration: windows_2019_py_39_cuda_112_cudnn_810 - CUDA_VERSION: 112 - Py_38_CUDA_102_cuDNN_765: - configuration: windows_2019_py_38_cuda_102_cudnn_765 - CUDA_VERSION: 102 - Py_37_CUDA_101_cuDNN_764: - configuration: windows_2019_py_37_cuda_101_cudnn_764 - CUDA_VERSION: 101 - -- stage: 'Verify' - displayName: 'Verify PyTorch wheels' - dependsOn: Build - condition: succeeded() - jobs: - - template: job_templates/build-verify-publish-template-unix.yml - parameters: - name: ubuntu_1804_CPU_docker - pool: 'PyTorch-Linux-CPU' - container_endpoint: pytorchms.azurecr.io - verify_stage: True - is_official_build: True - customMatrixes: - Py_38: - configuration: ubuntu_1804_py_38_cpu - container_image: pytorchms.azurecr.io/ubuntu_1804_py_38_cpu_dev - Py_37: - configuration: ubuntu_1804_py_37_cpu - container_image: pytorchms.azurecr.io/ubuntu_1804_py_37_cpu_dev - - - template: job_templates/build-verify-publish-template-unix.yml - parameters: - name: ubuntu_1804_GPU_docker - pool: 'PyTorch-Linux-GPU' - container_endpoint: pytorchms.azurecr.io - verify_stage: True - is_official_build: True - customMatrixes: - Py_39_CUDA_112_cuDNN_810: - configuration: ubuntu_1804_py_39_cuda_112_cudnn_810 - container_image: pytorchms.azurecr.io/ubuntu_1804_py_39_cuda_112_cudnn_8_dev - CUDA_VERSION: 112 - Py_38_CUDA_102_cuDNN_810: - configuration: ubuntu_1804_py_38_cuda_102_cudnn_810 - container_image: pytorchms.azurecr.io/ubuntu_1804_py_38_cuda_102_cudnn_8_dev - CUDA_VERSION: 102 - Py_37_CUDA_101_cuDNN_765: - configuration: ubuntu_1804_py_37_cuda_101_cudnn_765 - container_image: pytorchms.azurecr.io/ubuntu_1804_py_37_cuda_101_cudnn_7_dev - CUDA_VERSION: 101 - - - template: job_templates/build-verify-publish-template-win.yml - parameters: - name: windows_2019_CPU - pool: 'PyTorch-Win-CPU' - verify_stage: True - is_official_build: True - customMatrixes: - Py_38: - configuration: windows_2019_py_38_cpu - Py_37: - configuration: windows_2019_py_37_cpu - - - template: job_templates/build-verify-publish-template-win.yml - parameters: - name: windows_2019_GPU - pool: 'PyTorch-Win-GPU' - verify_stage: True - is_official_build: True - customMatrixes: - Py_39_CUDA_112_cuDNN_810: - configuration: windows_2019_py_39_cuda_112_cudnn_810 - CUDA_VERSION: 112 - Py_38_CUDA_102_cuDNN_765: - configuration: windows_2019_py_38_cuda_102_cudnn_765 - CUDA_VERSION: 102 - Py_37_CUDA_101_cuDNN_764: - configuration: windows_2019_py_37_cuda_101_cudnn_764 - CUDA_VERSION: 101 - -- stage: 'Publish' - displayName: 'Publish PyTorch wheels' - dependsOn: Verify - condition: succeeded() - jobs: - - template: job_templates/build-verify-publish-template-unix.yml - 
parameters: - name: ubuntu_1804_CPU_docker - pool: 'PyTorch-Linux-CPU' - container_endpoint: pytorchms.azurecr.io - publish_stage: True - is_official_build: True - customMatrixes: - Py_38: - configuration: ubuntu_1804_py_38_cpu - container_image: pytorchms.azurecr.io/ubuntu_1804_py_38_cpu_dev - Py_37: - configuration: ubuntu_1804_py_37_cpu - container_image: pytorchms.azurecr.io/ubuntu_1804_py_37_cpu_dev - - - template: job_templates/build-verify-publish-template-unix.yml - parameters: - name: ubuntu_1804_GPU_docker - pool: 'PyTorch-Linux-GPU' - container_endpoint: pytorchms.azurecr.io - publish_stage: True - is_official_build: True - customMatrixes: - Py_39_CUDA_112_cuDNN_810: - configuration: ubuntu_1804_py_39_cuda_112_cudnn_810 - container_image: pytorchms.azurecr.io/ubuntu_1804_py_39_cuda_112_cudnn_8_dev - CUDA_VERSION: 112 - Py_38_CUDA_102_cuDNN_810: - configuration: ubuntu_1804_py_38_cuda_102_cudnn_810 - container_image: pytorchms.azurecr.io/ubuntu_1804_py_38_cuda_102_cudnn_8_dev - CUDA_VERSION: 102 - Py_37_CUDA_101_cuDNN_765: - configuration: ubuntu_1804_py_37_cuda_101_cudnn_765 - container_image: pytorchms.azurecr.io/ubuntu_1804_py_37_cuda_101_cudnn_7_dev - CUDA_VERSION: 101 - - - template: job_templates/build-verify-publish-template-win.yml - parameters: - name: windows_2019_CPU - pool: 'PyTorch-Win-CPU' - publish_stage: True - is_official_build: True - customMatrixes: - Py_38: - configuration: windows_2019_py_38_cpu - Py_37: - configuration: windows_2019_py_37_cpu - - - template: job_templates/build-verify-publish-template-win.yml - parameters: - name: windows_2019_GPU - pool: 'PyTorch-Win-GPU' - publish_stage: True - is_official_build: True - customMatrixes: - Py_39_CUDA_112_cuDNN_810: - configuration: windows_2019_py_39_cuda_112_cudnn_810 - CUDA_VERSION: 112 - Py_38_CUDA_102_cuDNN_765: - configuration: windows_2019_py_38_cuda_102_cudnn_765 - CUDA_VERSION: 102 - Py_37_CUDA_101_cuDNN_764: - configuration: windows_2019_py_37_cuda_101_cudnn_764 - CUDA_VERSION: 101 diff --git a/.bazelrc b/.bazelrc index 1e847054613e..bbde3075f2af 100644 --- a/.bazelrc +++ b/.bazelrc @@ -1,10 +1,11 @@ -build --copt=--std=c++14 +build --cxxopt=--std=c++14 build --copt=-I. # Bazel does not support including its cc_library targets as system # headers. We work around this for generated code # (e.g. c10/macros/cmake_macros.h) by making the generated directory a # system include path. build --copt=-isystem --copt bazel-out/k8-fastbuild/bin +build --copt=-isystem --copt bazel-out/darwin-fastbuild/bin build --experimental_ui_max_stdouterr_bytes=2048576 # Configuration to disable tty features for environments like CI diff --git a/.buckconfig.oss b/.buckconfig.oss new file mode 100644 index 000000000000..638870587d84 --- /dev/null +++ b/.buckconfig.oss @@ -0,0 +1,15 @@ +[buildfile] +name = BUILD.buck + +[repositories] + bazel_skylib = third_party/bazel-skylib/ + +[download] + in_build = true + +[cxx] + cxxflags = -std=c++17 + should_remap_host_platform = true + +[project] + default_flavors_mode=all diff --git a/.circleci/README.md b/.circleci/README.md deleted file mode 100644 index 5b0d56d1df2e..000000000000 --- a/.circleci/README.md +++ /dev/null @@ -1,498 +0,0 @@ -Structure of CI -=============== - -setup job: -1. Does a git checkout -2. Persists CircleCI scripts (everything in `.circleci`) into a workspace. Why? - We don't always do a Git checkout on all subjobs, but we usually - still want to be able to call scripts one way or another in a subjob. 
- Persisting files this way lets us have access to them without doing a - checkout. This workspace is conventionally mounted on `~/workspace` - (this is distinguished from `~/project`, which is the conventional - working directory that CircleCI will default to starting your jobs - in.) -3. Write out the commit message to `.circleci/COMMIT_MSG`. This is so - we can determine in subjobs if we should actually run the jobs or - not, even if there isn't a Git checkout. - - - - -CircleCI configuration generator -================================ - -One may no longer make changes to the `.circleci/config.yml` file directly. -Instead, one must edit these Python scripts or files in the `verbatim-sources/` directory. - - -Usage ----------- - -1. Make changes to these scripts. -2. Run the `regenerate.sh` script in this directory and commit the script changes and the resulting change to `config.yml`. - -You'll see a build failure on GitHub if the scripts don't agree with the checked-in version. - - -Motivation ----------- - -These scripts establish a single, authoritative source of documentation for the CircleCI configuration matrix. -The documentation, in the form of diagrams, is automatically generated and cannot drift out of sync with the YAML content. - -Furthermore, consistency is enforced within the YAML config itself, by using a single source of data to generate -multiple parts of the file. - -* Facilitates one-off culling/enabling of CI configs for testing PRs on special targets - -Also see https://github.com/pytorch/pytorch/issues/17038 - - -Future direction ----------------- - -### Declaring sparse config subsets -See comment [here](https://github.com/pytorch/pytorch/pull/17323#pullrequestreview-206945747): - -In contrast with a full recursive tree traversal of configuration dimensions, -> in the future I think we actually want to decrease our matrix somewhat and have only a few mostly-orthogonal builds that taste as many different features as possible on PRs, plus a more complete suite on every PR and maybe an almost full suite nightly/weekly (we don't have this yet). Specifying PR jobs in the future might be easier to read with an explicit list when we come to this. - ----------------- ----------------- - -# How do the binaries / nightlies / releases work? - -### What is a binary? - -A binary or package (used interchangeably) is a pre-built collection of c++ libraries, header files, python bits, and other files. We build these and distribute them so that users do not need to install from source. - -A **binary configuration** is a collection of - -* release or nightly - * releases are stable, nightlies are beta and built every night -* python version - * linux: 3.7m (mu is wide unicode or something like that. It usually doesn't matter but you should know that it exists) - * macos: 3.7, 3.8 - * windows: 3.7, 3.8 -* cpu version - * cpu, cuda 9.0, cuda 10.0 - * The supported cuda versions occasionally change -* operating system - * Linux - these are all built on CentOS. There haven't been any problems in the past building on CentOS and using on Ubuntu - * MacOS - * Windows - these are built on Azure pipelines -* devtoolset version (gcc compiler version) - * This only matters on Linux cause only Linux uses gcc. tldr is gcc made a backwards incompatible change from gcc 4.8 to gcc 5, because it had to change how it implemented std::vector and std::string - -### Where are the binaries? - -The binaries are built in CircleCI. 
There are nightly binaries built every night at 9pm PST (midnight EST) and release binaries corresponding to PyTorch releases, usually every few months. - -We have 3 types of binary packages - -* pip packages - nightlies are stored on s3 (pip install -f \). releases are stored in a pip repo (pip install torch) (ask Soumith about this) -* conda packages - nightlies and releases are both stored in a conda repo. Nightly packages have a '_nightly' suffix -* libtorch packages - these are zips of all the c++ libraries, header files, and sometimes dependencies. These are c++ only - * shared with dependencies (the only supported option for Windows) - * static with dependencies - * shared without dependencies - * static without dependencies - -All binaries are built in CircleCI workflows except Windows. There are checked-in workflows (committed into the .circleci/config.yml) to build the nightlies every night. Releases are built by manually pushing a PR that builds the suite of release binaries (overwrite the config.yml to build the release) - -# CircleCI structure of the binaries - -Some quick vocab: - -* A \**workflow** is a CircleCI concept; it is a DAG of '**jobs**'. ctrl-f 'workflows' on https://github.com/pytorch/pytorch/blob/master/.circleci/config.yml to see the workflows. -* **jobs** are a sequence of '**steps**' -* **steps** are usually just a bash script or a builtin CircleCI command. *All steps run in new environments, environment variables declared in one script DO NOT persist to following steps* -* CircleCI has a **workspace**, which is essentially a cache between steps of the *same job* in which you can store artifacts between steps. - -## How are the workflows structured? - -The nightly binaries have 3 workflows. We have one job (actually 3 jobs: build, test, and upload) per binary configuration - -1. binary_builds - 1. every day midnight EST - 2. linux: https://github.com/pytorch/pytorch/blob/master/.circleci/verbatim-sources/linux-binary-build-defaults.yml - 3. macos: https://github.com/pytorch/pytorch/blob/master/.circleci/verbatim-sources/macos-binary-build-defaults.yml - 4. For each binary configuration, e.g. linux_conda_3.7_cpu there is a - 1. binary_linux_conda_3.7_cpu_build - 1. Builds the build. On linux jobs this uses the 'docker executor'. - 2. Persists the package to the workspace - 2. binary_linux_conda_3.7_cpu_test - 1. Loads the package to the workspace - 2. Spins up a docker image (on Linux), mapping the package and code repos into the docker - 3. Runs some smoke tests in the docker - 4. (Actually, for macos this is a step rather than a separate job) - 3. binary_linux_conda_3.7_cpu_upload - 1. Logs in to aws/conda - 2. Uploads the package -2. update_s3_htmls - 1. every day 5am EST - 2. https://github.com/pytorch/pytorch/blob/master/.circleci/verbatim-sources/binary_update_htmls.yml - 3. See below for what these are for and why they're needed - 4. Three jobs that each examine the current contents of aws and the conda repo and update some html files in s3 -3. binarysmoketests - 1. every day - 2. https://github.com/pytorch/pytorch/blob/master/.circleci/verbatim-sources/nightly-build-smoke-tests-defaults.yml - 3. For each binary configuration, e.g. linux_conda_3.7_cpu there is a - 1. smoke_linux_conda_3.7_cpu - 1. Downloads the package from the cloud, e.g. using the official pip or conda instructions - 2. Runs the smoke tests - -## How are the jobs structured? - -The jobs are in https://github.com/pytorch/pytorch/tree/master/.circleci/verbatim-sources.
Jobs are made of multiple steps. There are some shared steps used by all the binaries/smokes. Steps of these jobs are all delegated to scripts in https://github.com/pytorch/pytorch/tree/master/.circleci/scripts . - -* Linux jobs: https://github.com/pytorch/pytorch/blob/master/.circleci/verbatim-sources/linux-binary-build-defaults.yml - * binary_linux_build.sh - * binary_linux_test.sh - * binary_linux_upload.sh -* MacOS jobs: https://github.com/pytorch/pytorch/blob/master/.circleci/verbatim-sources/macos-binary-build-defaults.yml - * binary_macos_build.sh - * binary_macos_test.sh - * binary_macos_upload.sh -* Update html jobs: https://github.com/pytorch/pytorch/blob/master/.circleci/verbatim-sources/binary_update_htmls.yml - * These delegate from the pytorch/builder repo - * https://github.com/pytorch/builder/blob/master/cron/update_s3_htmls.sh - * https://github.com/pytorch/builder/blob/master/cron/upload_binary_sizes.sh -* Smoke jobs (both linux and macos): https://github.com/pytorch/pytorch/blob/master/.circleci/verbatim-sources/nightly-build-smoke-tests-defaults.yml - * These delegate from the pytorch/builder repo - * https://github.com/pytorch/builder/blob/master/run_tests.sh - * https://github.com/pytorch/builder/blob/master/smoke_test.sh - * https://github.com/pytorch/builder/blob/master/check_binary.sh -* Common shared code (shared across linux and macos): https://github.com/pytorch/pytorch/blob/master/.circleci/verbatim-sources/nightly-binary-build-defaults.yml - * binary_checkout.sh - checks out pytorch/builder repo. Right now this also checks out pytorch/pytorch, but it shouldn't. pytorch/pytorch should just be shared through the workspace. This can handle being run before binary_populate_env.sh - * binary_populate_env.sh - parses BUILD_ENVIRONMENT into the separate env variables that make up a binary configuration. Also sets lots of default values, the date, the version strings, the location of folders in s3, all sorts of things. This generally has to be run before other steps. - * binary_install_miniconda.sh - Installs miniconda, cross platform. Also hacks this for the update_binary_sizes job that doesn't have the right env variables - * binary_run_in_docker.sh - Takes a bash script file (the actual test code) from a hardcoded location, spins up a docker image, and runs the script inside the docker image - -### **Why do the steps all refer to scripts?** - -CircleCI creates a final yaml file by inlining every <<* segment, so if we were to keep all the code in the config.yml itself then the config size would go over 4 MB and cause infra problems. - -### **What is binary_run_in_docker for?** - -So, CircleCI has several executor types: macos, machine, and docker are the ones we use. The 'machine' executor gives you two cores on some linux vm. The 'docker' executor gives you considerably more cores (nproc was 32 instead of 2 back when I tried in February). Since the dockers are faster, we try to run everything that we can in dockers. Thus - -* linux build jobs use the docker executor. Running them on the docker executor was at least 2x faster than running them on the machine executor -* linux test jobs use the machine executor in order for them to properly interface with GPUs since docker executors cannot execute with attached GPUs -* linux upload jobs use the machine executor. 
The upload jobs are so short that it doesn't really matter what they use -* linux smoke test jobs use the machine executor for the same reason as the linux test jobs - -binary_run_in_docker.sh is a way to share the docker start-up code between the binary test jobs and the binary smoke test jobs - -### **Why does binary_checkout also checkout pytorch? Why shouldn't it?** - -We want all the nightly binary jobs to run on the exact same git commit, so we wrote our own checkout logic to ensure that the same commit was always picked. Later circleci changed that to use a single pytorch checkout and persist it through the workspace (they did this because our config file was too big, so they wanted to take a lot of the setup code into scripts, but the scripts needed the code repo to exist to be called, so they added a prereq step called 'setup' to checkout the code and persist the needed scripts to the workspace). The changes to the binary jobs were not properly tested, so they all broke from missing pytorch code no longer existing. We hotfixed the problem by adding the pytorch checkout back to binary_checkout, so now there's two checkouts of pytorch on the binary jobs. This problem still needs to be fixed, but it takes careful tracing of which code is being called where. - -# Azure Pipelines structure of the binaries - -TODO: fill in stuff - -## How are the workflows structured? - -TODO: fill in stuff - -## How are the jobs structured? - -TODO: fill in stuff - -# Code structure of the binaries (circleci agnostic) - -## Overview - -The code that runs the binaries lives in two places, in the normal [github.com/pytorch/pytorch](http://github.com/pytorch/pytorch), but also in [github.com/pytorch/builder](http://github.com/pytorch/builder), which is a repo that defines how all the binaries are built. The relevant code is - - -``` -# All code needed to set-up environments for build code to run in, -# but only code that is specific to the current CI system -pytorch/pytorch -- .circleci/ # Folder that holds all circleci related stuff - - config.yml # GENERATED file that actually controls all circleci behavior - - verbatim-sources # Used to generate job/workflow sections in ^ - - scripts/ # Code needed to prepare circleci environments for binary build scripts - -- setup.py # Builds pytorch. This is wrapped in pytorch/builder -- cmake files # used in normal building of pytorch - -# All code needed to prepare a binary build, given an environment -# with all the right variables/packages/paths. -pytorch/builder - -# Given an installed binary and a proper python env, runs some checks -# to make sure the binary was built the proper way. Checks things like -# the library dependencies, symbols present, etc. -- check_binary.sh - -# Given an installed binary, runs python tests to make sure everything -# is in order. These should be de-duped. Right now they both run smoke -# tests, but are called from different places. Usually just call some -# import statements, but also has overlap with check_binary.sh above -- run_tests.sh -- smoke_test.sh - -# Folders that govern how packages are built. See paragraphs below - -- conda/ - - build_pytorch.sh # Entrypoint. 
Delegates to the proper conda build folder - - switch_cuda_version.sh # Switches the active CUDA installation in Docker - - pytorch-nightly/ # Build folder -- manywheel/ - - build_cpu.sh # Entrypoint for cpu builds - - build.sh # Entrypoint for CUDA builds - - build_common.sh # Actual build script that the two scripts above call into -- wheel/ - - build_wheel.sh # Entrypoint for wheel builds -- windows/ - - build_pytorch.bat # Entrypoint for wheel builds on Windows -``` - -Every type of package has an entrypoint build script that handles all the important logic. - -## Conda - -Linux, MacOS and Windows use the same code flow for the conda builds. - -Conda packages are built with conda-build; see https://conda.io/projects/conda-build/en/latest/resources/commands/conda-build.html - -Basically, you pass `conda build` a build folder (pytorch-nightly/ above) that contains a build script and a meta.yaml. The meta.yaml specifies what python environment to build the package in and what dependencies the resulting package should have, and the build script gets called in that env to build the thing. -tl;dr on conda-build is - -1. Creates a brand new conda environment, based off of the deps in the meta.yaml - 1. Note that environment variables do not get passed into this build env unless they are specified in the meta.yaml - 2. If the build fails this environment will stick around. You can activate it for much easier debugging. The “General Python” section below explains what exactly a python “environment” is. -2. Calls build.sh in the environment -3. Copies the finished package to a new conda env, also specified by the meta.yaml -4. Runs some simple import tests (if specified in the meta.yaml) -5. Saves the finished package as a tarball - -The build.sh we use is essentially a wrapper around `python setup.py build`, but it also manually copies in some of our dependent libraries into the resulting tarball and messes with some rpaths. - -The entrypoint file `builder/conda/build_pytorch.sh` is complicated because - -* It works for Linux, MacOS and Windows - * The mac builds used to create their own environments, since they all used to be on the same machine. There’s now a lot of extra logic to handle conda envs. This extra machinery could be removed. -* It used to handle testing too, which adds more logic for messing with python environments. This extra machinery could be removed. - -## Manywheels (linux pip and libtorch packages) - -Manywheels are pip packages for linux distros. Note that these manywheels are not actually manylinux compliant. - -`builder/manywheel/build_cpu.sh` and `builder/manywheel/build.sh` (for CUDA builds) just set different env vars and then call into `builder/manywheel/build_common.sh` - -The entrypoint file `builder/manywheel/build_common.sh` is really really complicated because - -* This used to handle building for several different python versions at the same time. The loops have been removed, but there are still unnecessary folders and movements here and there. - * The script is never used this way anymore. This extra machinery could be removed. -* This used to handle testing the pip packages too. This is why there’s testing code at the end that messes with python installations and stuff - * The script is never used this way anymore. This extra machinery could be removed. -* This also builds libtorch packages - * This should really be separate. libtorch packages are c++ only and have no python. They should not share infra with all the python-specific stuff in this file. 
-* There is a lot of messing with rpaths. This is necessary, but could be made much much simpler if the above issues were fixed. - -## Wheels (MacOS pip and libtorch packages) - -The entrypoint file `builder/wheel/build_wheel.sh` is complicated because - -* The mac builds used to all run on one machine (we didn’t have autoscaling mac machines till circleci). So this script handled siloing itself by setting-up and tearing-down its build env and siloing itself into its own build directory. - * The script is never used this way anymore. This extra machinery could be removed. -* This also builds libtorch packages - * Ditto the comment above. This should definitely be separated out. - -Note that the MacOS Python wheels are still built in conda environments. Some of the dependencies present during build also come from conda. - -## Windows Wheels (Windows pip and libtorch packages) - -The entrypoint file `builder/windows/build_pytorch.bat` is complicated because - -* This used to handle building for several different python versions at the same time. This is why there are loops everywhere - * The script is never used this way anymore. This extra machinery could be removed. -* This used to handle testing the pip packages too. This is why there’s testing code at the end that messes with python installations and stuff - * The script is never used this way anymore. This extra machinery could be removed. -* This also builds libtorch packages - * This should really be separate. libtorch packages are c++ only and have no python. They should not share infra with all the python specific stuff in this file. - -Note that the Windows Python wheels are still built in conda environments. Some of the dependencies present during build also come from conda. - -## General notes - -### Note on run_tests.sh, smoke_test.sh, and check_binary.sh - -* These should all be consolidated -* These must run on all OS types: MacOS, Linux, and Windows -* These all run smoke tests at the moment. They inspect the packages some, maybe run a few import statements. They DO NOT run the python tests nor the cpp tests. The idea is that python tests on master and PR merges will catch all breakages. All these tests have to do is make sure the special binary machinery didn’t mess anything up. -* There are separate run_tests.sh and smoke_test.sh because one used to be called by the smoke jobs and one used to be called by the binary test jobs (see circleci structure section above). This is still true actually, but these could be united into a single script that runs these checks, given an installed pytorch package. - -### Note on libtorch - -Libtorch packages are built in the wheel build scripts: manywheel/build_*.sh for linux and build_wheel.sh for mac. There are several things wrong with this - -* It’s confusing. Most of those scripts deal with python specifics. -* The extra conditionals everywhere severely complicate the wheel build scripts -* The process for building libtorch is different from the official instructions (a plain call to cmake, or a call to a script) - -### Note on docker images / Dockerfiles - -All linux builds occur in docker images. The docker images are - -* pytorch/conda-cuda - * Has ALL CUDA versions installed. The script pytorch/builder/conda/switch_cuda_version.sh sets /usr/local/cuda to a symlink to e.g. 
/usr/local/cuda-10.0 to enable different CUDA builds - * Also used for cpu builds -* pytorch/manylinux-cuda90 -* pytorch/manylinux-cuda100 - * Also used for cpu builds - -The Dockerfiles are available in pytorch/builder, but there is no circleci job or script to build these docker images, and they cannot be run locally (unless you have the correct local packages/paths). Only Soumith can build them right now. - -### General Python - -* This is still a good explanation of python installations https://caffe2.ai/docs/faq.html#why-do-i-get-import-errors-in-python-when-i-try-to-use-caffe2 - -# How to manually rebuild the binaries - -tl;dr make a PR that looks like https://github.com/pytorch/pytorch/pull/21159 - -Sometimes we want to push a change to master and then rebuild all of today's binaries after that change. As of May 30, 2019 there isn't a way to manually run a workflow in the UI. You can manually re-run a workflow, but it will use the exact same git commits as the first run and will not include any changes. So we have to make a PR and then force circleci to run the binary workflow instead of the normal tests. The above PR is an example of how to do this; essentially you copy-paste the binarybuilds workflow steps into the default workflow steps. If you need to point the builder repo to a different commit then you'd need to change https://github.com/pytorch/pytorch/blob/master/.circleci/scripts/binary_checkout.sh#L42-L45 to checkout what you want. - -## How to test changes to the binaries via .circleci - -Writing PRs that test the binaries is annoying, since the default circleci jobs that run on PRs are not the jobs that you want to run. Likely, changes to the binaries will touch something under .circleci/ and require that .circleci/config.yml be regenerated (.circleci/config.yml controls all .circleci behavior, and is generated using `.circleci/regenerate.sh` in python 3.7). But you also need to manually hardcode the binary jobs that you want to test into the .circleci/config.yml workflow, so you should actually make at least two commits, one for your changes and one to temporarily hardcode jobs. See https://github.com/pytorch/pytorch/pull/22928 as an example of how to do this. - -```sh -# Make your changes -touch .circleci/verbatim-sources/nightly-binary-build-defaults.yml - -# Regenerate the yaml, has to be in python 3.7 -.circleci/regenerate.sh - -# Make a commit -git add .circleci * -git commit -m "My real changes" -git push origin my_branch - -# Now hardcode the jobs that you want in the .circleci/config.yml workflows section -# Also eliminate ensure-consistency and should_run_job checks -# e.g. https://github.com/pytorch/pytorch/commit/2b3344bfed8772fe86e5210cc4ee915dee42b32d - -# Make a commit you won't keep -git add .circleci -git commit -m "[DO NOT LAND] testing binaries for above changes" -git push origin my_branch - -# Now you need to make some changes to the first commit. -git rebase -i HEAD~2 # mark the first commit as 'edit' - -# Make the changes -touch .circleci/verbatim-sources/nightly-binary-build-defaults.yml -.circleci/regenerate.sh - -# Ammend the commit and recontinue -git add .circleci -git commit --amend -git rebase --continue - -# Update the PR, need to force since the commits are different now -git push origin my_branch --force -``` - -The advantage of this flow is that you can make new changes to the base commit and regenerate the .circleci without having to re-write which binary jobs you want to test on. The downside is that all updates will be force pushes. 
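If that rebase dance gets tedious, the same update loop can be scripted with git's fixup/autosquash machinery. This is only a convenience sketch, not checked-in tooling; it assumes the two-commit layout and the branch name `my_branch` from the example above.

```sh
# Sketch: fold new edits into the base ("My real changes") commit while keeping
# the "[DO NOT LAND]" hardcoded-jobs commit on top, then force push.

# 1) Make your edits and regenerate the config (has to be run with python 3.7)
touch .circleci/verbatim-sources/nightly-binary-build-defaults.yml
.circleci/regenerate.sh

# 2) Record the edits as a fixup of the base commit (HEAD~1 == "My real changes")
git add .circleci
git commit --fixup HEAD~1

# 3) Squash the fixup into place without opening an editor
GIT_SEQUENCE_EDITOR=: git rebase -i --autosquash HEAD~3

# As with the manual flow, replaying the hardcoded-jobs commit can conflict in
# config.yml; resolve and `git rebase --continue` if it does.

# 4) History changed, so the push has to be forced
git push origin my_branch --force
```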
- -## How to build a binary locally - -### Linux - -You can easily build Linux binaries locally using docker. - -```sh -# Run the docker -# Use the correct docker image, pytorch/conda-cuda used here as an example -# -# -v path/to/foo:path/to/bar makes path/to/foo on your local machine (the -# machine that you're running the command on) accessible to the docker -# container at path/to/bar. So if you then run `touch path/to/bar/baz` -# in the docker container then you will see path/to/foo/baz on your local -# machine. You could also clone the pytorch and builder repos in the docker. -# -# If you know how, add ccache as a volume too and speed up everything -docker run \ - -v your/pytorch/repo:/pytorch \ - -v your/builder/repo:/builder \ - -v where/you/want/packages/to/appear:/final_pkgs \ - -it pytorch/conda-cuda /bin/bash - -# Export whatever variables are important to you. All variables that you'd -# possibly need are in .circleci/scripts/binary_populate_env.sh -# You should probably always export at least these 3 variables -export PACKAGE_TYPE=conda -export DESIRED_PYTHON=3.7 -export DESIRED_CUDA=cpu - -# Call the entrypoint -# `|& tee foo.log` just copies all stdout and stderr output to foo.log -# The builds generate lots of output so you probably need this when -# building locally. -/builder/conda/build_pytorch.sh |& tee build_output.log -``` - -**Building CUDA binaries on docker** - -You can build CUDA binaries on CPU-only machines, but you can only run CUDA binaries on CUDA machines. This means that you can build a CUDA binary in a docker container on your laptop if you so choose (though it’s gonna take a long time). - -For Facebook employees, ask about beefy machines that have docker support and use those instead of your laptop; it will be 5x as fast. - -### MacOS - -There’s no easy way to generate reproducible, hermetic MacOS environments. If you have a Mac laptop then you can try emulating the .circleci environments as much as possible, but you probably have packages in /usr/local/, possibly installed by brew, that will probably interfere with the build. If you’re trying to repro an error on a Mac build in .circleci and you can’t seem to repro locally, then my best advice is actually to iterate on .circleci :/ - -But if you want to try, then I’d recommend - -```sh -# Create a new terminal -# Clear your LD_LIBRARY_PATH and trim as much out of your PATH as you -# know how to do - -# Install a new miniconda -# First remove any other python or conda installation from your PATH -# Always install miniconda 3, even if building for Python <3 -new_conda="$HOME/my_new_conda" -conda_sh="$HOME/install_miniconda.sh" -curl -o "$conda_sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -chmod +x "$conda_sh" -"$conda_sh" -b -p "$new_conda" -rm -f "$conda_sh" -export PATH="$new_conda/bin:$PATH" - -# Create a clean python env -# All MacOS builds use conda to manage the python env and dependencies -# that are built with, even the pip packages -# (conda activate needs conda's shell hook in a fresh shell) -source "$new_conda/etc/profile.d/conda.sh" -conda create -yn binary python=3.7 -conda activate binary - -# Export whatever variables are important to you. All variables that you'd -# possibly need are in .circleci/scripts/binary_populate_env.sh -# You should probably always export at least these 3 variables -export PACKAGE_TYPE=conda -export DESIRED_PYTHON=3.7 -export DESIRED_CUDA=cpu - -# Call the entrypoint you want -path/to/builder/wheel/build_wheel.sh -``` - -N.B. installing a brand new miniconda is important. This has to do with how conda installations work. 
See the “General Python” section above, but tldr; is that - -1. You make the ‘conda’ command accessible by prepending `path/to/conda_root/bin` to your PATH. -2. You make a new env and activate it, which then also gets prepended to your PATH. Now you have `path/to/conda_root/envs/new_env/bin:path/to/conda_root/bin:$PATH` -3. Now say you (or some code that you ran) call python executable `foo` - 1. if you installed `foo` in `new_env`, then `path/to/conda_root/envs/new_env/bin/foo` will get called, as expected. - 2. But if you forgot to installed `foo` in `new_env` but happened to previously install it in your root conda env (called ‘base’), then unix/linux will still find `path/to/conda_root/bin/foo` . This is dangerous, since `foo` can be a different version than you want; `foo` can even be for an incompatible python version! - -Newer conda versions and proper python hygiene can prevent this, but just install a new miniconda to be safe. - -### Windows - -TODO: fill in diff --git a/.circleci/cimodel/data/binary_build_data.py b/.circleci/cimodel/data/binary_build_data.py index 1c714186568f..5df203b6ce39 100644 --- a/.circleci/cimodel/data/binary_build_data.py +++ b/.circleci/cimodel/data/binary_build_data.py @@ -31,13 +31,6 @@ def get_processor_arch_name(gpu_version): ) CONFIG_TREE_DATA = OrderedDict( - windows=( - # Stop building Win+CU102, see https://github.com/pytorch/pytorch/issues/65648 - [v for v in dimensions.GPU_VERSIONS if v not in dimensions.ROCM_VERSION_LABELS and v != "cuda102"], - OrderedDict( - conda=dimensions.STANDARD_PYTHON_VERSIONS, - ) - ), ) # GCC config variants: diff --git a/.circleci/cimodel/data/dimensions.py b/.circleci/cimodel/data/dimensions.py index 1a411856a8b2..7f9ebccbcc89 100644 --- a/.circleci/cimodel/data/dimensions.py +++ b/.circleci/cimodel/data/dimensions.py @@ -2,9 +2,8 @@ CUDA_VERSIONS = [ "102", - "111", "113", - "115", + "116", ] ROCM_VERSIONS = [ diff --git a/.circleci/cimodel/data/pytorch_build_data.py b/.circleci/cimodel/data/pytorch_build_data.py index b8c83ada6534..09756135fe64 100644 --- a/.circleci/cimodel/data/pytorch_build_data.py +++ b/.circleci/cimodel/data/pytorch_build_data.py @@ -71,10 +71,10 @@ def child_constructor(self): next_nodes = { "asan": AsanConfigNode, "xla": XlaConfigNode, - "mlc": MLCConfigNode, + "mps": MPSConfigNode, "vulkan": VulkanConfigNode, "parallel_tbb": ParallelTBBConfigNode, - "noarch": NoarchConfigNode, + "crossref": CrossRefConfigNode, "parallel_native": ParallelNativeConfigNode, "onnx": ONNXConfigNode, "libtorch": LibTorchConfigNode, @@ -116,12 +116,12 @@ def init2(self, node_name): def child_constructor(self): return ImportantConfigNode -class MLCConfigNode(TreeConfigNode): +class MPSConfigNode(TreeConfigNode): def modify_label(self, label): - return "MLC=" + str(label) + return "MPS=" + str(label) def init2(self, node_name): - self.props["is_mlc"] = node_name + self.props["is_mps"] = node_name def child_constructor(self): return ImportantConfigNode @@ -171,9 +171,9 @@ def child_constructor(self): return ImportantConfigNode -class NoarchConfigNode(TreeConfigNode): +class CrossRefConfigNode(TreeConfigNode): def init2(self, node_name): - self.props["is_noarch"] = node_name + self.props["is_crossref"] = node_name def child_constructor(self): return ImportantConfigNode diff --git a/.circleci/cimodel/data/pytorch_build_definitions.py b/.circleci/cimodel/data/pytorch_build_definitions.py index 036e8a599191..0eb7b5ec5210 100644 --- a/.circleci/cimodel/data/pytorch_build_definitions.py +++ 
b/.circleci/cimodel/data/pytorch_build_definitions.py @@ -185,7 +185,7 @@ def gen_docs_configs(xenial_parent_config): HiddenConf( "pytorch_python_doc_build", parent_build=xenial_parent_config, - filters=gen_filter_dict(branches_list=["master", "nightly"], + filters=gen_filter_dict(branches_list=["master", "main", "nightly"], tags_list=RC_PATTERN), ) ) @@ -201,7 +201,7 @@ def gen_docs_configs(xenial_parent_config): HiddenConf( "pytorch_cpp_doc_build", parent_build=xenial_parent_config, - filters=gen_filter_dict(branches_list=["master", "nightly"], + filters=gen_filter_dict(branches_list=["master", "main", "nightly"], tags_list=RC_PATTERN), ) ) @@ -239,7 +239,7 @@ def instantiate_configs(only_slow_gradcheck): compiler_version = fc.find_prop("compiler_version") is_xla = fc.find_prop("is_xla") or False is_asan = fc.find_prop("is_asan") or False - is_noarch = fc.find_prop("is_noarch") or False + is_crossref = fc.find_prop("is_crossref") or False is_onnx = fc.find_prop("is_onnx") or False is_pure_torch = fc.find_prop("is_pure_torch") or False is_vulkan = fc.find_prop("is_vulkan") or False @@ -283,8 +283,8 @@ def instantiate_configs(only_slow_gradcheck): python_version = fc.find_prop("pyver") parms_list[0] = fc.find_prop("abbreviated_pyver") - if is_noarch: - parms_list_ignored_for_docker_image.append("noarch") + if is_crossref: + parms_list_ignored_for_docker_image.append("crossref") if is_onnx: parms_list.append("onnx") diff --git a/.circleci/cimodel/data/simple/binary_smoketest.py b/.circleci/cimodel/data/simple/binary_smoketest.py deleted file mode 100644 index 6d1d421d029c..000000000000 --- a/.circleci/cimodel/data/simple/binary_smoketest.py +++ /dev/null @@ -1,193 +0,0 @@ -""" -TODO: Refactor circleci/cimodel/data/binary_build_data.py to generate this file - instead of doing one offs here - Binary builds (subset, to smoke test that they'll work) - - NB: If you modify this file, you need to also modify - the binary_and_smoke_tests_on_pr variable in - pytorch-ci-hud to adjust the allowed build list - at https://github.com/ezyang/pytorch-ci-hud/blob/master/src/BuildHistoryDisplay.js - - Note: - This binary build is currently broken, see https://github_com/pytorch/pytorch/issues/16710 - - binary_linux_conda_3_6_cu90_devtoolset7_build - - binary_linux_conda_3_6_cu90_devtoolset7_test - - TODO - we should test a libtorch cuda build, but they take too long - - binary_linux_libtorch_3_6m_cu90_devtoolset7_static-without-deps_build -""" - -import cimodel.lib.miniutils as miniutils -import cimodel.data.simple.util.branch_filters - - -class SmoketestJob: - def __init__(self, - template_name, - build_env_parts, - docker_image, - job_name, - is_master_only=False, - requires=None, - has_libtorch_variant=False, - extra_props=None): - - self.template_name = template_name - self.build_env_parts = build_env_parts - self.docker_image = docker_image - self.job_name = job_name - self.is_master_only = is_master_only - self.requires = requires or [] - self.has_libtorch_variant = has_libtorch_variant - self.extra_props = extra_props or {} - - def gen_tree(self): - - props_dict = { - "build_environment": " ".join(self.build_env_parts), - "name": self.job_name, - "requires": self.requires, - } - - if self.docker_image: - props_dict["docker_image"] = self.docker_image - - if self.is_master_only: - props_dict["filters"] = cimodel.data.simple.util.branch_filters.gen_filter_dict() - - if self.has_libtorch_variant: - props_dict["libtorch_variant"] = "shared-with-deps" - - props_dict.update(self.extra_props) - - return 
[{self.template_name: props_dict}] - - -WORKFLOW_DATA = [ - SmoketestJob( - "binary_linux_build", - ["manywheel", "3.7m", "cu102", "devtoolset7"], - "pytorch/manylinux-cuda102", - "binary_linux_manywheel_3_7m_cu102_devtoolset7_build", - is_master_only=True, - ), - SmoketestJob( - "binary_linux_build", - ["libtorch", "3.7m", "cpu", "devtoolset7"], - "pytorch/manylinux-cuda102", - "binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_build", - is_master_only=True, - has_libtorch_variant=True, - ), - SmoketestJob( - "binary_linux_build", - ["libtorch", "3.7m", "cpu", "gcc5.4_cxx11-abi"], - "pytorch/pytorch-binary-docker-image-ubuntu16.04:latest", - "binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build", - is_master_only=False, - has_libtorch_variant=True, - ), - SmoketestJob( - "binary_mac_build", - ["wheel", "3.7", "cpu"], - None, - "binary_macos_wheel_3_7_cpu_build", - is_master_only=True, - ), - # This job has an average run time of 3 hours o.O - # Now only running this on master to reduce overhead - SmoketestJob( - "binary_mac_build", - ["libtorch", "3.7", "cpu"], - None, - "binary_macos_libtorch_3_7_cpu_build", - is_master_only=True, - ), - SmoketestJob( - "binary_windows_build", - ["libtorch", "3.7", "cpu", "debug"], - None, - "binary_windows_libtorch_3_7_cpu_debug_build", - is_master_only=True, - ), - SmoketestJob( - "binary_windows_build", - ["libtorch", "3.7", "cpu", "release"], - None, - "binary_windows_libtorch_3_7_cpu_release_build", - is_master_only=True, - ), - SmoketestJob( - "binary_windows_build", - ["wheel", "3.7", "cu113"], - None, - "binary_windows_wheel_3_7_cu113_build", - is_master_only=True, - ), - - SmoketestJob( - "binary_windows_test", - ["libtorch", "3.7", "cpu", "debug"], - None, - "binary_windows_libtorch_3_7_cpu_debug_test", - is_master_only=True, - requires=["binary_windows_libtorch_3_7_cpu_debug_build"], - ), - SmoketestJob( - "binary_windows_test", - ["libtorch", "3.7", "cpu", "release"], - None, - "binary_windows_libtorch_3_7_cpu_release_test", - is_master_only=False, - requires=["binary_windows_libtorch_3_7_cpu_release_build"], - ), - SmoketestJob( - "binary_windows_test", - ["wheel", "3.7", "cu113"], - None, - "binary_windows_wheel_3_7_cu113_test", - is_master_only=True, - requires=["binary_windows_wheel_3_7_cu113_build"], - extra_props={ - "executor": "windows-with-nvidia-gpu", - }, - ), - - - - SmoketestJob( - "binary_linux_test", - ["manywheel", "3.7m", "cu102", "devtoolset7"], - "pytorch/manylinux-cuda102", - "binary_linux_manywheel_3_7m_cu102_devtoolset7_test", - is_master_only=True, - requires=["binary_linux_manywheel_3_7m_cu102_devtoolset7_build"], - extra_props={ - "resource_class": "gpu.nvidia.small", - "use_cuda_docker_runtime": miniutils.quote((str(1))), - }, - ), - SmoketestJob( - "binary_linux_test", - ["libtorch", "3.7m", "cpu", "devtoolset7"], - "pytorch/manylinux-cuda102", - "binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_test", - is_master_only=True, - requires=["binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_build"], - has_libtorch_variant=True, - ), - SmoketestJob( - "binary_linux_test", - ["libtorch", "3.7m", "cpu", "gcc5.4_cxx11-abi"], - "pytorch/pytorch-binary-docker-image-ubuntu16.04:latest", - "binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_test", - is_master_only=True, - requires=["binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build"], - has_libtorch_variant=True, - ), -] - - -def get_workflow_jobs(): - return [item.gen_tree() for item in 
WORKFLOW_DATA] diff --git a/.circleci/cimodel/data/simple/util/branch_filters.py b/.circleci/cimodel/data/simple/util/branch_filters.py index dfbc6e4d63bc..ba4e00a059ef 100644 --- a/.circleci/cimodel/data/simple/util/branch_filters.py +++ b/.circleci/cimodel/data/simple/util/branch_filters.py @@ -1,4 +1,5 @@ NON_PR_BRANCH_LIST = [ + "main", "master", r"/ci-all\/.*/", r"/release\/.*/", diff --git a/.circleci/config.yml b/.circleci/config.yml index 1a4bfd3418ec..8828d86294b8 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -436,22 +436,7 @@ binary_windows_params: &binary_windows_params default: "windows-xlarge-cpu-with-nvidia-cuda" environment: BUILD_ENVIRONMENT: << parameters.build_environment >> - BUILD_FOR_SYSTEM: windows JOB_EXECUTOR: <> - -promote_common: &promote_common - docker: - - image: pytorch/release - parameters: - package_name: - description: "package name to promote" - type: string - default: "" - environment: - PACKAGE_NAME: << parameters.package_name >> - ANACONDA_API_TOKEN: ${CONDA_PYTORCHBOT_TOKEN} - AWS_ACCESS_KEY_ID: ${PYTORCH_BINARY_AWS_ACCESS_KEY_ID} - AWS_SECRET_ACCESS_KEY: ${PYTORCH_BINARY_AWS_SECRET_ACCESS_KEY} ############################################################################## # Job specs ############################################################################## @@ -619,6 +604,7 @@ jobs: <<: *binary_mac_params macos: xcode: "12.0" + resource_class: "large" steps: # See Note [Workspace for CircleCI scripts] in job-specs-setup.yml - checkout @@ -857,7 +843,7 @@ jobs: parameters: branch: type: string - default: "master" + default: "main" steps: - attach_workspace: at: /tmp/workspace @@ -897,7 +883,7 @@ jobs: echo "DOCKER_IMAGE: "${COMMIT_DOCKER_IMAGE} # turn v1.12.0rc3 into 1.12 tag=$(echo $CIRCLE_TAG | sed -e 's/v*\([0-9]*\.[0-9]*\).*/\1/') - target=${tag:-master} + target=${tag:-main} echo "building for ${target}" time docker pull ${COMMIT_DOCKER_IMAGE} >/dev/null export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) @@ -907,7 +893,7 @@ jobs: echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts mkdir -p ~/workspace/build_artifacts - docker cp $id:/var/lib/jenkins/workspace/pytorch.github.io/docs/master ~/workspace/build_artifacts + docker cp $id:/var/lib/jenkins/workspace/pytorch.github.io/docs/main ~/workspace/build_artifacts docker cp $id:/var/lib/jenkins/workspace/pytorch.github.io /tmp/workspace # Save the docs build so we can debug any problems @@ -919,7 +905,7 @@ jobs: paths: - . - store_artifacts: - path: ~/workspace/build_artifacts/master + path: ~/workspace/build_artifacts/main destination: docs pytorch_cpp_doc_build: @@ -943,12 +929,12 @@ jobs: echo "DOCKER_IMAGE: "${COMMIT_DOCKER_IMAGE} # turn v1.12.0rc3 into 1.12 tag=$(echo $CIRCLE_TAG | sed -e 's/v*\([0-9]*\.[0-9]*\).*/\1/') - target=${tag:-master} + target=${tag:-main} echo "building for ${target}" time docker pull ${COMMIT_DOCKER_IMAGE} >/dev/null export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) - export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && '"export CIRCLE_SHA1='$CIRCLE_SHA1'"' && . ./.circleci/scripts/cpp_doc_push_script.sh docs/"$target" master") | docker exec -u jenkins -i "$id" bash) 2>&1' + export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && '"export CIRCLE_SHA1='$CIRCLE_SHA1'"' && . 
./.circleci/scripts/cpp_doc_push_script.sh docs/"$target" main") | docker exec -u jenkins -i "$id" bash) 2>&1' echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts @@ -1432,7 +1418,7 @@ jobs: time docker pull ${DOCKER_IMAGE}:${DOCKER_TAG} >/dev/null export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}:${DOCKER_TAG}) - echo "Do NOT merge master branch into $CIRCLE_BRANCH in environment $BUILD_ENVIRONMENT" + echo "Do NOT merge main branch into $CIRCLE_BRANCH in environment $BUILD_ENVIRONMENT" git submodule sync && git submodule update -q --init --recursive --depth 1 --jobs 0 @@ -1532,24 +1518,6 @@ jobs: export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && . ./.jenkins/pytorch/docs-test.sh") | docker exec -u jenkins -i "$id" bash) 2>&1' echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts - promote_s3: - <<: *promote_common - steps: - - checkout - - run: - name: Running promote script - command: | - scripts/release/promote/wheel_to_s3.sh - - promote_conda: - <<: *promote_common - steps: - - checkout - - run: - name: Running promote script - command: | - scripts/release/promote/conda_to_conda.sh - # update_s3_htmls job # These jobs create html files for every cpu/cu## folder in s3. The html # files just store the names of all the files in that folder (which are @@ -1676,738 +1644,8 @@ jobs: # Workflows ############################################################################## workflows: - binary_builds: - jobs: - - binary_windows_build: - name: binary_windows_conda_3_7_cpu_nightly_build - build_environment: "conda 3.7 cpu" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - - binary_windows_build: - name: binary_windows_conda_3_8_cpu_nightly_build - build_environment: "conda 3.8 cpu" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - - binary_windows_build: - name: binary_windows_conda_3_9_cpu_nightly_build - build_environment: "conda 3.9 cpu" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - - binary_windows_build: - name: binary_windows_conda_3_10_cpu_nightly_build - build_environment: "conda 3.10 cpu" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - - binary_windows_build: - name: binary_windows_conda_3_7_cu111_nightly_build - build_environment: "conda 3.7 cu111" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - - binary_windows_build: - name: binary_windows_conda_3_8_cu111_nightly_build - build_environment: "conda 3.8 cu111" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - - binary_windows_build: - name: binary_windows_conda_3_9_cu111_nightly_build - build_environment: "conda 3.9 cu111" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - - binary_windows_build: - name: binary_windows_conda_3_10_cu111_nightly_build - build_environment: "conda 3.10 cu111" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - - binary_windows_build: - name: binary_windows_conda_3_7_cu113_nightly_build - build_environment: "conda 3.7 cu113" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - - binary_windows_build: - name: 
binary_windows_conda_3_8_cu113_nightly_build - build_environment: "conda 3.8 cu113" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - - binary_windows_build: - name: binary_windows_conda_3_9_cu113_nightly_build - build_environment: "conda 3.9 cu113" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - - binary_windows_build: - name: binary_windows_conda_3_10_cu113_nightly_build - build_environment: "conda 3.10 cu113" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - - binary_windows_build: - name: binary_windows_conda_3_7_cu115_nightly_build - build_environment: "conda 3.7 cu115" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - - binary_windows_build: - name: binary_windows_conda_3_8_cu115_nightly_build - build_environment: "conda 3.8 cu115" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - - binary_windows_build: - name: binary_windows_conda_3_9_cu115_nightly_build - build_environment: "conda 3.9 cu115" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - - binary_windows_build: - name: binary_windows_conda_3_10_cu115_nightly_build - build_environment: "conda 3.10 cu115" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - - binary_windows_test: - name: binary_windows_conda_3_7_cpu_nightly_test - build_environment: "conda 3.7 cpu" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - requires: - - binary_windows_conda_3_7_cpu_nightly_build - - binary_windows_test: - name: binary_windows_conda_3_8_cpu_nightly_test - build_environment: "conda 3.8 cpu" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - requires: - - binary_windows_conda_3_8_cpu_nightly_build - - binary_windows_test: - name: binary_windows_conda_3_9_cpu_nightly_test - build_environment: "conda 3.9 cpu" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - requires: - - binary_windows_conda_3_9_cpu_nightly_build - - binary_windows_test: - name: binary_windows_conda_3_10_cpu_nightly_test - build_environment: "conda 3.10 cpu" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - requires: - - binary_windows_conda_3_10_cpu_nightly_build - - binary_windows_test: - name: binary_windows_conda_3_7_cu111_nightly_test - build_environment: "conda 3.7 cu111" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - requires: - - binary_windows_conda_3_7_cu111_nightly_build - executor: windows-with-nvidia-gpu - - binary_windows_test: - name: binary_windows_conda_3_8_cu111_nightly_test - build_environment: "conda 3.8 cu111" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - requires: - - binary_windows_conda_3_8_cu111_nightly_build - executor: windows-with-nvidia-gpu - - binary_windows_test: - name: binary_windows_conda_3_9_cu111_nightly_test - build_environment: "conda 3.9 cu111" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - requires: - - binary_windows_conda_3_9_cu111_nightly_build - executor: windows-with-nvidia-gpu - - binary_windows_test: - name: binary_windows_conda_3_10_cu111_nightly_test - build_environment: "conda 3.10 cu111" - filters: - branches: - only: - - /.*/ - tags: - 
only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - requires: - - binary_windows_conda_3_10_cu111_nightly_build - executor: windows-with-nvidia-gpu - - binary_windows_test: - name: binary_windows_conda_3_7_cu113_nightly_test - build_environment: "conda 3.7 cu113" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - requires: - - binary_windows_conda_3_7_cu113_nightly_build - executor: windows-with-nvidia-gpu - - binary_windows_test: - name: binary_windows_conda_3_8_cu113_nightly_test - build_environment: "conda 3.8 cu113" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - requires: - - binary_windows_conda_3_8_cu113_nightly_build - executor: windows-with-nvidia-gpu - - binary_windows_test: - name: binary_windows_conda_3_9_cu113_nightly_test - build_environment: "conda 3.9 cu113" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - requires: - - binary_windows_conda_3_9_cu113_nightly_build - executor: windows-with-nvidia-gpu - - binary_windows_test: - name: binary_windows_conda_3_10_cu113_nightly_test - build_environment: "conda 3.10 cu113" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - requires: - - binary_windows_conda_3_10_cu113_nightly_build - executor: windows-with-nvidia-gpu - - binary_windows_test: - name: binary_windows_conda_3_7_cu115_nightly_test - build_environment: "conda 3.7 cu115" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - requires: - - binary_windows_conda_3_7_cu115_nightly_build - executor: windows-with-nvidia-gpu - - binary_windows_test: - name: binary_windows_conda_3_8_cu115_nightly_test - build_environment: "conda 3.8 cu115" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - requires: - - binary_windows_conda_3_8_cu115_nightly_build - executor: windows-with-nvidia-gpu - - binary_windows_test: - name: binary_windows_conda_3_9_cu115_nightly_test - build_environment: "conda 3.9 cu115" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - requires: - - binary_windows_conda_3_9_cu115_nightly_build - executor: windows-with-nvidia-gpu - - binary_windows_test: - name: binary_windows_conda_3_10_cu115_nightly_test - build_environment: "conda 3.10 cu115" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - requires: - - binary_windows_conda_3_10_cu115_nightly_build - executor: windows-with-nvidia-gpu - - binary_upload: - name: binary_windows_conda_3_7_cpu_nightly_upload - context: org-member - requires: - - binary_windows_conda_3_7_cpu_nightly_test - filters: - branches: - only: - - nightly - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - package_type: conda - upload_subfolder: cpu - - binary_upload: - name: binary_windows_conda_3_8_cpu_nightly_upload - context: org-member - requires: - - binary_windows_conda_3_8_cpu_nightly_test - filters: - branches: - only: - - nightly - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - package_type: conda - upload_subfolder: cpu - - binary_upload: - name: binary_windows_conda_3_9_cpu_nightly_upload - context: org-member - requires: - - binary_windows_conda_3_9_cpu_nightly_test - filters: - branches: - only: - - nightly - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - package_type: conda - upload_subfolder: cpu - - binary_upload: - name: binary_windows_conda_3_10_cpu_nightly_upload - context: org-member - requires: - - 
binary_windows_conda_3_10_cpu_nightly_test - filters: - branches: - only: - - nightly - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - package_type: conda - upload_subfolder: cpu - - binary_upload: - name: binary_windows_conda_3_7_cu111_nightly_upload - context: org-member - requires: - - binary_windows_conda_3_7_cu111_nightly_test - filters: - branches: - only: - - nightly - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - package_type: conda - upload_subfolder: cu111 - - binary_upload: - name: binary_windows_conda_3_8_cu111_nightly_upload - context: org-member - requires: - - binary_windows_conda_3_8_cu111_nightly_test - filters: - branches: - only: - - nightly - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - package_type: conda - upload_subfolder: cu111 - - binary_upload: - name: binary_windows_conda_3_9_cu111_nightly_upload - context: org-member - requires: - - binary_windows_conda_3_9_cu111_nightly_test - filters: - branches: - only: - - nightly - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - package_type: conda - upload_subfolder: cu111 - - binary_upload: - name: binary_windows_conda_3_10_cu111_nightly_upload - context: org-member - requires: - - binary_windows_conda_3_10_cu111_nightly_test - filters: - branches: - only: - - nightly - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - package_type: conda - upload_subfolder: cu111 - - binary_upload: - name: binary_windows_conda_3_7_cu113_nightly_upload - context: org-member - requires: - - binary_windows_conda_3_7_cu113_nightly_test - filters: - branches: - only: - - nightly - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - package_type: conda - upload_subfolder: cu113 - - binary_upload: - name: binary_windows_conda_3_8_cu113_nightly_upload - context: org-member - requires: - - binary_windows_conda_3_8_cu113_nightly_test - filters: - branches: - only: - - nightly - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - package_type: conda - upload_subfolder: cu113 - - binary_upload: - name: binary_windows_conda_3_9_cu113_nightly_upload - context: org-member - requires: - - binary_windows_conda_3_9_cu113_nightly_test - filters: - branches: - only: - - nightly - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - package_type: conda - upload_subfolder: cu113 - - binary_upload: - name: binary_windows_conda_3_10_cu113_nightly_upload - context: org-member - requires: - - binary_windows_conda_3_10_cu113_nightly_test - filters: - branches: - only: - - nightly - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - package_type: conda - upload_subfolder: cu113 - - binary_upload: - name: binary_windows_conda_3_7_cu115_nightly_upload - context: org-member - requires: - - binary_windows_conda_3_7_cu115_nightly_test - filters: - branches: - only: - - nightly - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - package_type: conda - upload_subfolder: cu115 - - binary_upload: - name: binary_windows_conda_3_8_cu115_nightly_upload - context: org-member - requires: - - binary_windows_conda_3_8_cu115_nightly_test - filters: - branches: - only: - - nightly - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - package_type: conda - upload_subfolder: cu115 - - binary_upload: - name: binary_windows_conda_3_9_cu115_nightly_upload - context: org-member - requires: - - binary_windows_conda_3_9_cu115_nightly_test - filters: - branches: - only: - - nightly - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - package_type: conda - upload_subfolder: cu115 - - binary_upload: - name: binary_windows_conda_3_10_cu115_nightly_upload - context: org-member - requires: - - 
binary_windows_conda_3_10_cu115_nightly_test - filters: - branches: - only: - - nightly - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - package_type: conda - upload_subfolder: cu115 - when: << pipeline.parameters.run_binary_tests >> build: jobs: - - binary_linux_build: - build_environment: manywheel 3.7m cu102 devtoolset7 - docker_image: pytorch/manylinux-cuda102 - filters: - branches: - only: - - master - - /ci-all\/.*/ - - /release\/.*/ - name: binary_linux_manywheel_3_7m_cu102_devtoolset7_build - - binary_linux_build: - build_environment: libtorch 3.7m cpu devtoolset7 - docker_image: pytorch/manylinux-cuda102 - filters: - branches: - only: - - master - - /ci-all\/.*/ - - /release\/.*/ - libtorch_variant: shared-with-deps - name: binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_build - - binary_linux_build: - build_environment: libtorch 3.7m cpu gcc5.4_cxx11-abi - docker_image: pytorch/pytorch-binary-docker-image-ubuntu16.04:latest - libtorch_variant: shared-with-deps - name: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build - - binary_mac_build: - build_environment: wheel 3.7 cpu - filters: - branches: - only: - - master - - /ci-all\/.*/ - - /release\/.*/ - name: binary_macos_wheel_3_7_cpu_build - - binary_mac_build: - build_environment: libtorch 3.7 cpu - filters: - branches: - only: - - master - - /ci-all\/.*/ - - /release\/.*/ - name: binary_macos_libtorch_3_7_cpu_build - - binary_windows_build: - build_environment: libtorch 3.7 cpu debug - filters: - branches: - only: - - master - - /ci-all\/.*/ - - /release\/.*/ - name: binary_windows_libtorch_3_7_cpu_debug_build - - binary_windows_build: - build_environment: libtorch 3.7 cpu release - filters: - branches: - only: - - master - - /ci-all\/.*/ - - /release\/.*/ - name: binary_windows_libtorch_3_7_cpu_release_build - - binary_windows_build: - build_environment: wheel 3.7 cu113 - filters: - branches: - only: - - master - - /ci-all\/.*/ - - /release\/.*/ - name: binary_windows_wheel_3_7_cu113_build - - binary_windows_test: - build_environment: libtorch 3.7 cpu debug - filters: - branches: - only: - - master - - /ci-all\/.*/ - - /release\/.*/ - name: binary_windows_libtorch_3_7_cpu_debug_test - requires: - - binary_windows_libtorch_3_7_cpu_debug_build - - binary_windows_test: - build_environment: libtorch 3.7 cpu release - name: binary_windows_libtorch_3_7_cpu_release_test - requires: - - binary_windows_libtorch_3_7_cpu_release_build - - binary_windows_test: - build_environment: wheel 3.7 cu113 - executor: windows-with-nvidia-gpu - filters: - branches: - only: - - master - - /ci-all\/.*/ - - /release\/.*/ - name: binary_windows_wheel_3_7_cu113_test - requires: - - binary_windows_wheel_3_7_cu113_build - - binary_linux_test: - build_environment: manywheel 3.7m cu102 devtoolset7 - docker_image: pytorch/manylinux-cuda102 - filters: - branches: - only: - - master - - /ci-all\/.*/ - - /release\/.*/ - name: binary_linux_manywheel_3_7m_cu102_devtoolset7_test - requires: - - binary_linux_manywheel_3_7m_cu102_devtoolset7_build - resource_class: gpu.nvidia.small - use_cuda_docker_runtime: "1" - - binary_linux_test: - build_environment: libtorch 3.7m cpu devtoolset7 - docker_image: pytorch/manylinux-cuda102 - filters: - branches: - only: - - master - - /ci-all\/.*/ - - /release\/.*/ - libtorch_variant: shared-with-deps - name: binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_test - requires: - - binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_build - - binary_linux_test: - build_environment: 
libtorch 3.7m cpu gcc5.4_cxx11-abi - docker_image: pytorch/pytorch-binary-docker-image-ubuntu16.04:latest - filters: - branches: - only: - - master - - /ci-all\/.*/ - - /release\/.*/ - libtorch_variant: shared-with-deps - name: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_test - requires: - - binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build - binary_ios_build: build_environment: libtorch-ios-12.5.1-nightly-x86_64-build context: org-member @@ -2491,278 +1729,4 @@ workflows: branches: only: - postnightly - - update_s3_htmls: - context: org-member - filters: - branches: - only: - - postnightly - name: update_s3_htmls - - smoke_windows_test: - name: smoke_windows_conda_3_7_cpu_nightly - build_environment: "conda 3.7 cpu" - requires: - - update_s3_htmls - filters: - branches: - only: - - postnightly - - smoke_windows_test: - name: smoke_windows_conda_3_8_cpu_nightly - build_environment: "conda 3.8 cpu" - requires: - - update_s3_htmls - filters: - branches: - only: - - postnightly - - smoke_windows_test: - name: smoke_windows_conda_3_9_cpu_nightly - build_environment: "conda 3.9 cpu" - requires: - - update_s3_htmls - filters: - branches: - only: - - postnightly - - smoke_windows_test: - name: smoke_windows_conda_3_10_cpu_nightly - build_environment: "conda 3.10 cpu" - requires: - - update_s3_htmls - filters: - branches: - only: - - postnightly - - smoke_windows_test: - name: smoke_windows_conda_3_7_cu111_nightly - build_environment: "conda 3.7 cu111" - requires: - - update_s3_htmls - filters: - branches: - only: - - postnightly - executor: windows-with-nvidia-gpu - - smoke_windows_test: - name: smoke_windows_conda_3_8_cu111_nightly - build_environment: "conda 3.8 cu111" - requires: - - update_s3_htmls - filters: - branches: - only: - - postnightly - executor: windows-with-nvidia-gpu - - smoke_windows_test: - name: smoke_windows_conda_3_9_cu111_nightly - build_environment: "conda 3.9 cu111" - requires: - - update_s3_htmls - filters: - branches: - only: - - postnightly - executor: windows-with-nvidia-gpu - - smoke_windows_test: - name: smoke_windows_conda_3_10_cu111_nightly - build_environment: "conda 3.10 cu111" - requires: - - update_s3_htmls - filters: - branches: - only: - - postnightly - executor: windows-with-nvidia-gpu - - smoke_windows_test: - name: smoke_windows_conda_3_7_cu113_nightly - build_environment: "conda 3.7 cu113" - requires: - - update_s3_htmls - filters: - branches: - only: - - postnightly - executor: windows-with-nvidia-gpu - - smoke_windows_test: - name: smoke_windows_conda_3_8_cu113_nightly - build_environment: "conda 3.8 cu113" - requires: - - update_s3_htmls - filters: - branches: - only: - - postnightly - executor: windows-with-nvidia-gpu - - smoke_windows_test: - name: smoke_windows_conda_3_9_cu113_nightly - build_environment: "conda 3.9 cu113" - requires: - - update_s3_htmls - filters: - branches: - only: - - postnightly - executor: windows-with-nvidia-gpu - - smoke_windows_test: - name: smoke_windows_conda_3_10_cu113_nightly - build_environment: "conda 3.10 cu113" - requires: - - update_s3_htmls - filters: - branches: - only: - - postnightly - executor: windows-with-nvidia-gpu - - smoke_windows_test: - name: smoke_windows_conda_3_7_cu115_nightly - build_environment: "conda 3.7 cu115" - requires: - - update_s3_htmls - filters: - branches: - only: - - postnightly - executor: windows-with-nvidia-gpu - - smoke_windows_test: - name: smoke_windows_conda_3_8_cu115_nightly - build_environment: "conda 3.8 cu115" - requires: - - 
update_s3_htmls - filters: - branches: - only: - - postnightly - executor: windows-with-nvidia-gpu - - smoke_windows_test: - name: smoke_windows_conda_3_9_cu115_nightly - build_environment: "conda 3.9 cu115" - requires: - - update_s3_htmls - filters: - branches: - only: - - postnightly - executor: windows-with-nvidia-gpu - - smoke_windows_test: - name: smoke_windows_conda_3_10_cu115_nightly - build_environment: "conda 3.10 cu115" - requires: - - update_s3_htmls - filters: - branches: - only: - - postnightly - executor: windows-with-nvidia-gpu when: << pipeline.parameters.run_build >> - master_build: - jobs: - - binary_linux_build: - build_environment: manywheel 3.7m cu102 devtoolset7 - docker_image: pytorch/manylinux-cuda102 - name: binary_linux_manywheel_3_7m_cu102_devtoolset7_build - - binary_linux_build: - build_environment: libtorch 3.7m cpu devtoolset7 - docker_image: pytorch/manylinux-cuda102 - libtorch_variant: shared-with-deps - name: binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_build - - binary_linux_build: - build_environment: libtorch 3.7m cpu gcc5.4_cxx11-abi - docker_image: pytorch/pytorch-binary-docker-image-ubuntu16.04:latest - libtorch_variant: shared-with-deps - name: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build - - binary_mac_build: - build_environment: wheel 3.7 cpu - name: binary_macos_wheel_3_7_cpu_build - - binary_mac_build: - build_environment: libtorch 3.7 cpu - name: binary_macos_libtorch_3_7_cpu_build - - binary_windows_build: - build_environment: libtorch 3.7 cpu debug - name: binary_windows_libtorch_3_7_cpu_debug_build - - binary_windows_build: - build_environment: libtorch 3.7 cpu release - name: binary_windows_libtorch_3_7_cpu_release_build - - binary_windows_build: - build_environment: wheel 3.7 cu113 - name: binary_windows_wheel_3_7_cu113_build - - binary_windows_test: - build_environment: libtorch 3.7 cpu debug - name: binary_windows_libtorch_3_7_cpu_debug_test - requires: - - binary_windows_libtorch_3_7_cpu_debug_build - - binary_windows_test: - build_environment: wheel 3.7 cu113 - executor: windows-with-nvidia-gpu - name: binary_windows_wheel_3_7_cu113_test - requires: - - binary_windows_wheel_3_7_cu113_build - - binary_linux_test: - build_environment: manywheel 3.7m cu102 devtoolset7 - docker_image: pytorch/manylinux-cuda102 - name: binary_linux_manywheel_3_7m_cu102_devtoolset7_test - requires: - - binary_linux_manywheel_3_7m_cu102_devtoolset7_build - resource_class: gpu.nvidia.small - use_cuda_docker_runtime: "1" - - binary_linux_test: - build_environment: libtorch 3.7m cpu devtoolset7 - docker_image: pytorch/manylinux-cuda102 - libtorch_variant: shared-with-deps - name: binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_test - requires: - - binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_build - - binary_linux_test: - build_environment: libtorch 3.7m cpu gcc5.4_cxx11-abi - docker_image: pytorch/pytorch-binary-docker-image-ubuntu16.04:latest - libtorch_variant: shared-with-deps - name: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_test - requires: - - binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build - when: << pipeline.parameters.run_master_build >> - # Promotion workflow - promote: - jobs: - # Requires manual approval by someone in org-member - # CircleCI security context - - promote_approval: - context: org-member - filters: - branches: - ignore: /.*/ - tags: - only: /v[0-9]+(\.[0-9]+)*/ - type: approval - - promote_s3: - context: org-member - 
filters: - branches: - ignore: /.*/ - tags: - only: /v[0-9]+(\.[0-9]+)*/ - name: promote_s3_libtorch - package_name: libtorch - requires: - - promote_approval - - promote_s3: - context: org-member - filters: - branches: - ignore: /.*/ - tags: - only: /v[0-9]+(\.[0-9]+)*/ - name: promote_s3_torch - package_name: torch - requires: - - promote_approval - - promote_conda: - context: org-member - filters: - branches: - ignore: /.*/ - tags: - only: /v[0-9]+(\.[0-9]+)*/ - name: promote_conda_pytorch - package_name: pytorch - requires: - - promote_approval diff --git a/.circleci/docker/build.sh b/.circleci/docker/build.sh index bfab08d8bd0b..330dbbc6f8e8 100755 --- a/.circleci/docker/build.sh +++ b/.circleci/docker/build.sh @@ -40,6 +40,12 @@ function extract_all_from_image_name() { done } +# Use the same pre-built XLA test image from PyTorch/XLA +if [[ "$image" == *xla* ]]; then + echo "Using pre-built XLA test image..." + exit 0 +fi + if [[ "$image" == *-xenial* ]]; then UBUNTU_VERSION=16.04 elif [[ "$image" == *-artful* ]]; then @@ -84,7 +90,7 @@ case "$image" in ;; pytorch-linux-xenial-py3.7-gcc5.4) ANACONDA_PYTHON_VERSION=3.7 - CMAKE_VERSION=3.10.3 + CMAKE_VERSION=3.12.4 # To make sure XNNPACK is enabled for the BACKWARDS_COMPAT_TEST used with this image GCC_VERSION=5 PROTOBUF=yes DB=yes @@ -116,9 +122,10 @@ case "$image" in VISION=yes KATEX=yes ;; - pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7) - CUDA_VERSION=11.1 + pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7) + CUDA_VERSION=11.3.0 # Deviating from major.minor to conform to nvidia's Docker image names CUDNN_VERSION=8 + TENSORRT_VERSION=8.0.1.6 ANACONDA_PYTHON_VERSION=3.7 CMAKE_VERSION=3.10.3 GCC_VERSION=7 @@ -127,20 +134,20 @@ case "$image" in VISION=yes KATEX=yes ;; - pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7) + pytorch-linux-bionic-cuda11.3-cudnn8-py3-clang9) CUDA_VERSION=11.3.0 # Deviating from major.minor to conform to nvidia's Docker image names CUDNN_VERSION=8 TENSORRT_VERSION=8.0.1.6 ANACONDA_PYTHON_VERSION=3.7 CMAKE_VERSION=3.10.3 - GCC_VERSION=7 + CLANG_VERSION=9 PROTOBUF=yes DB=yes VISION=yes KATEX=yes ;; - pytorch-linux-bionic-cuda11.5-cudnn8-py3-gcc7) - CUDA_VERSION=11.5.0 + pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7) + CUDA_VERSION=11.6.0 CUDNN_VERSION=8 ANACONDA_PYTHON_VERSION=3.7 CMAKE_VERSION=3.10.3 @@ -227,31 +234,21 @@ case "$image" in DB=yes VISION=yes ;; - pytorch-linux-bionic-cuda11.0-cudnn8-py3.7-gcc9) - CUDA_VERSION=11.0 - CUDNN_VERSION=8 + pytorch-linux-bionic-rocm5.0-py3.7) ANACONDA_PYTHON_VERSION=3.7 GCC_VERSION=9 PROTOBUF=yes DB=yes VISION=yes - ROCM_VERSION=3.9 + ROCM_VERSION=5.0 ;; - pytorch-linux-bionic-rocm4.3.1-py3.7) + pytorch-linux-bionic-rocm5.1-py3.7) ANACONDA_PYTHON_VERSION=3.7 GCC_VERSION=9 PROTOBUF=yes DB=yes VISION=yes - ROCM_VERSION=4.3.1 - ;; - pytorch-linux-bionic-rocm4.5-py3.7) - ANACONDA_PYTHON_VERSION=3.7 - GCC_VERSION=9 - PROTOBUF=yes - DB=yes - VISION=yes - ROCM_VERSION=4.5.2 + ROCM_VERSION=5.1.1 ;; *) # Catch-all for builds that are not hardcoded. 
@@ -298,6 +295,13 @@ fi tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]') +#when using cudnn version 8 install it separately from cuda +if [[ "$image" == *cuda* && ${OS} == "ubuntu" ]]; then + IMAGE_NAME="nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu${UBUNTU_VERSION}" + if [[ ${CUDNN_VERSION} == 8 ]]; then + IMAGE_NAME="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}" + fi +fi # Build image # TODO: build-arg THRIFT is not turned on for any image, remove it once we confirm @@ -336,6 +340,7 @@ docker build \ --build-arg "KATEX=${KATEX:-}" \ --build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \ --build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx900;gfx906}" \ + --build-arg "IMAGE_NAME=${IMAGE_NAME}" \ -f $(dirname ${DOCKERFILE})/Dockerfile \ -t "$tmp_tag" \ "$@" \ diff --git a/.circleci/docker/centos-rocm/Dockerfile b/.circleci/docker/centos-rocm/Dockerfile index 832e09c7f664..e0ef9e3296fe 100644 --- a/.circleci/docker/centos-rocm/Dockerfile +++ b/.circleci/docker/centos-rocm/Dockerfile @@ -40,8 +40,10 @@ RUN bash ./install_user.sh && rm install_user.sh # Install conda and other packages (e.g., numpy, pytest) ENV PATH /opt/conda/bin:$PATH ARG ANACONDA_PYTHON_VERSION +ADD requirements-ci.txt /opt/conda/requirements-ci.txt ADD ./common/install_conda.sh install_conda.sh RUN bash ./install_conda.sh && rm install_conda.sh +RUN rm /opt/conda/requirements-ci.txt # (optional) Install protobuf for ONNX ARG PROTOBUF diff --git a/.circleci/docker/common/install_base.sh b/.circleci/docker/common/install_base.sh index e2663d6b3bb8..1dc6b0cbaa55 100755 --- a/.circleci/docker/common/install_base.sh +++ b/.circleci/docker/common/install_base.sh @@ -20,6 +20,11 @@ install_ubuntu() { maybe_libiomp_dev="libiomp-dev" fi + # TODO: Remove this once nvidia package repos are back online + # Comment out nvidia repositories to prevent them from getting apt-get updated, see https://github.com/pytorch/pytorch/issues/74968 + # shellcheck disable=SC2046 + sed -i 's/.*nvidia.*/# &/' $(find /etc/apt/ -type f -name "*.list") + # Install common dependencies apt-get update # TODO: Some of these may not be necessary @@ -45,8 +50,8 @@ install_ubuntu() { libasound2-dev \ libsndfile-dev \ software-properties-common \ - sudo \ wget \ + sudo \ vim # Should resolve issues related to various apt package repository cert issues diff --git a/.circleci/docker/common/install_conda.sh b/.circleci/docker/common/install_conda.sh index 82cfcc6c6e6a..cc7696762a45 100755 --- a/.circleci/docker/common/install_conda.sh +++ b/.circleci/docker/common/install_conda.sh @@ -21,7 +21,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then ;; esac - mkdir /opt/conda + mkdir -p /opt/conda chown jenkins:jenkins /opt/conda # Work around bug where devtoolset replaces sudo and breaks it. @@ -68,14 +68,16 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then as_jenkins conda install -q -y python="$ANACONDA_PYTHON_VERSION" $* } + pip_install() { + as_jenkins pip install --progress-bar off $* + } + # Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README # DO NOT install cmake here as it would install a version newer than 3.10, but # we want to pin to version 3.10. 
- SCIPY_VERSION=1.1.0 if [ "$ANACONDA_PYTHON_VERSION" = "3.9" ]; then # Install llvm-8 as it is required to compile llvmlite-0.30.0 from source conda_install numpy=1.19.2 astunparse pyyaml mkl mkl-include setuptools cffi future six llvmdev=8.0.0 -c conda-forge - SCIPY_VERSION=1.6.0 elif [ "$ANACONDA_PYTHON_VERSION" = "3.8" ]; then # Install llvm-8 as it is required to compile llvmlite-0.30.0 from source conda_install numpy=1.18.5 astunparse pyyaml mkl mkl-include setuptools cffi future six llvmdev=8.0.0 @@ -96,34 +98,14 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then conda_install nnpack -c killeent # Install some other packages, including those needed for Python test reporting - # TODO: Why is scipy pinned - # Pin MyPy version because new errors are likely to appear with each release - # Pin hypothesis to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136 - as_jenkins pip install --progress-bar off pytest \ - scipy==$SCIPY_VERSION \ - scikit-image \ - psutil \ - unittest-xml-reporting \ - boto3==1.16.34 \ - hypothesis==4.53.2 \ - expecttest==0.1.3 \ - mypy==0.812 \ - tb-nightly - - # Install numba only on python-3.8 or below - # For numba issue see https://github.com/pytorch/pytorch/issues/51511 - if [[ $(python -c "import sys; print(int(sys.version_info < (3, 9)))") == "1" ]]; then - as_jenkins pip install --progress-bar off numba==0.54.1 "librosa>=0.6.2,<0.9.0" - else - as_jenkins pip install --progress-bar off numba==0.49.0 "librosa>=0.6.2,<0.9.0" - fi + pip_install -r /opt/conda/requirements-ci.txt # Update scikit-learn to a python-3.8 compatible version if [[ $(python -c "import sys; print(int(sys.version_info >= (3, 8)))") == "1" ]]; then - as_jenkins pip install --progress-bar off -U scikit-learn + pip_install -U scikit-learn else # Pinned scikit-learn due to https://github.com/scikit-learn/scikit-learn/issues/14485 (affects gcc 5.5 only) - as_jenkins pip install --progress-bar off scikit-learn==0.20.3 + pip_install scikit-learn==0.20.3 fi popd diff --git a/.circleci/docker/common/install_cudnn.sh b/.circleci/docker/common/install_cudnn.sh new file mode 100644 index 000000000000..1f1c34ea200d --- /dev/null +++ b/.circleci/docker/common/install_cudnn.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +if [[ ${CUDNN_VERSION} == 8 ]]; then + # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement + mkdir tmp_cudnn && cd tmp_cudnn + CUDNN_NAME="cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive" + curl -OLs https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/${CUDNN_NAME}.tar.xz + tar xf ${CUDNN_NAME}.tar.xz + cp -a ${CUDNN_NAME}/include/* /usr/include/ + cp -a ${CUDNN_NAME}/include/* /usr/local/cuda/include/ + cp -a ${CUDNN_NAME}/include/* /usr/include/x86_64-linux-gnu/ + + cp -a ${CUDNN_NAME}/lib/* /usr/local/cuda/lib64/ + cp -a ${CUDNN_NAME}/lib/* /usr/lib/x86_64-linux-gnu/ + cd .. 
+ rm -rf tmp_cudnn + ldconfig +fi diff --git a/.circleci/docker/common/install_rocm.sh b/.circleci/docker/common/install_rocm.sh index 4ba3ed73db90..4cda40bbdca5 100644 --- a/.circleci/docker/common/install_rocm.sh +++ b/.circleci/docker/common/install_rocm.sh @@ -6,8 +6,8 @@ install_magma() { # "install" hipMAGMA into /opt/rocm/magma by copying after build git clone https://bitbucket.org/icl/magma.git pushd magma - # fix for magma_queue memory leak issue - git checkout c62d700d880c7283b33fb1d615d62fc9c7f7ca21 + # Fixes memory leaks of magma found while executing linalg UTs + git checkout 5959b8783e45f1809812ed96ae762f38ee701972 cp make.inc-examples/make.inc.hip-gcc-mkl make.inc echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc echo 'LIB += -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib -Wl,--rpath,$(MKLROOT)/lib -Wl,--rpath,/opt/rocm/magma/lib' >> make.inc @@ -35,7 +35,7 @@ ver() { } # Map ROCm version to AMDGPU version -declare -A AMDGPU_VERSIONS=( ["4.5.2"]="21.40.2" ) +declare -A AMDGPU_VERSIONS=( ["4.5.2"]="21.40.2" ["5.0"]="21.50" ["5.1.1"]="22.10.1" ) install_ubuntu() { apt-get update diff --git a/.circleci/docker/common/install_user.sh b/.circleci/docker/common/install_user.sh index 69c762350bbf..93a436cbfc78 100755 --- a/.circleci/docker/common/install_user.sh +++ b/.circleci/docker/common/install_user.sh @@ -3,8 +3,11 @@ set -ex # Mirror jenkins user in container -echo "jenkins:x:1014:1014::/var/lib/jenkins:" >> /etc/passwd -echo "jenkins:x:1014:" >> /etc/group +# jenkins user as ec2-user should have the same user-id +echo "jenkins:x:1000:1000::/var/lib/jenkins:" >> /etc/passwd +echo "jenkins:x:1000:" >> /etc/group +# Needed on focal or newer +echo "jenkins:*:19110:0:99999:7:::" >>/etc/shadow # Create $HOME mkdir -p /var/lib/jenkins @@ -18,3 +21,6 @@ chown jenkins:jenkins /usr/local # Allow sudo # TODO: Maybe we shouldn't echo 'jenkins ALL=(ALL) NOPASSWD:ALL' > /etc/sudoers.d/jenkins + +# Test that sudo works +sudo -u jenkins sudo -v diff --git a/.circleci/docker/requirements-ci.txt b/.circleci/docker/requirements-ci.txt new file mode 100644 index 000000000000..ff5a9ba33b7b --- /dev/null +++ b/.circleci/docker/requirements-ci.txt @@ -0,0 +1,212 @@ +# Python dependencies required for unit tests + +#awscli==1.6 #this breaks some platforms +#Description: AWS command line interface +#Pinned versions: 1.6 +#test that import: + +boto3==1.19.12 +#Description: AWS SDK for python +#Pinned versions: 1.19.12, 1.16.34 +#test that import: + +click +#Description: Command Line Interface Creation Kit +#Pinned versions: +#test that import: + +coremltools==5.0b5 +#Description: Apple framework for ML integration +#Pinned versions: 5.0b5 +#test that import: + +#dataclasses #this breaks some platforms +#Description: Provides decorators for auto adding special methods to user classes +#Pinned versions: +#test that import: + +expecttest==0.1.3 +#Description: method for writing tests where test framework auto populates +# the expected output based on previous runs +#Pinned versions: 0.1.3 +#test that import: + +flatbuffers==2.0 +#Description: cross platform serialization library +#Pinned versions: 2.0 +#test that import: + +#future #this breaks linux-bionic-rocm4.5-py3.7 +#Description: compatibility layer between python 2 and python 3 +#Pinned versions: +#test that import: + +hypothesis==4.53.2 +# Pin hypothesis to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136 +#Description: advanced library for generating parametrized tests +#Pinned versions: 3.44.6, 4.53.2 +#test that import: 
test_xnnpack_integration.py, test_pruning_op.py, test_nn.py + +junitparser==2.1.1 +#Description: unitparser handles JUnit/xUnit Result XML files +#Pinned versions: 2.1.1 +#test that import: + +librosa>=0.6.2 +#Description: A python package for music and audio analysis +#Pinned versions: >=0.6.2 +#test that import: test_spectral_ops.py + +#mkl #this breaks linux-bionic-rocm4.5-py3.7 +#Description: Intel oneAPI Math Kernel Library +#Pinned versions: +#test that import: test_profiler.py, test_public_bindings.py, test_testing.py, +#test_nn.py, test_mkldnn.py, test_jit.py, test_fx_experimental.py, +#test_autograd.py + +#mkl-devel +# see mkl + +#mock # breaks ci/circleci: docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c +#Description: A testing library that allows you to replace parts of your +#system under test with mock objects +#Pinned versions: +#test that import: test_module_init.py, test_modules.py, test_nn.py, +#test_testing.py + +#MonkeyType # breaks pytorch-xla-linux-bionic-py3.7-clang8 +#Description: collects runtime types of function arguments and return +#values, and can automatically generate stub files +#Pinned versions: +#test that import: + +mypy==0.812 +# Pin MyPy version because new errors are likely to appear with each release +#Description: linter +#Pinned versions: 0.812 +#test that import: test_typing.py, test_type_hints.py + +#networkx +#Description: creation, manipulation, and study of +#the structure, dynamics, and functions of complex networks +#Pinned versions: 2.0 +#test that import: + +#ninja +#Description: build system. Note that it install from +#here breaks things so it is commented out +#Pinned versions: 1.10.0.post1 +#test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py + +numba==0.49.0 ; python_version < "3.9" +numba==0.54.1 ; python_version == "3.9" +#Description: Just-In-Time Compiler for Numerical Functions +#Pinned versions: 0.54.1, 0.49.0, <=0.49.1 +#test that import: test_numba_integration.py +#For numba issue see https://github.com/pytorch/pytorch/issues/51511 + +#numpy +#Description: Provides N-dimensional arrays and linear algebra +#Pinned versions: 1.20 +#test that import: test_view_ops.py, test_unary_ufuncs.py, test_type_promotion.py, +#test_type_info.py, test_torch.py, test_tensorexpr_pybind.py, test_tensorexpr.py, +#test_tensorboard.py, test_tensor_creation_ops.py, test_static_runtime.py, +#test_spectral_ops.py, test_sort_and_select.py, test_shape_ops.py, +#test_segment_reductions.py, test_reductions.py, test_pruning_op.py, +#test_overrides.py, test_numpy_interop.py, test_numba_integration.py +#test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py, +#test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py, +#test_binary_ufuncs.py + +#onnxruntime +#Description: scoring engine for Open Neural Network Exchange (ONNX) models +#Pinned versions: 1.9.0 +#test that import: + +#pillow +#Description: Python Imaging Library fork +#Pinned versions: +#test that import: + +#protobuf +#Description: Google’s data interchange format +#Pinned versions: +#test that import: test_tensorboard.py + +psutil +#Description: information on running processes and system utilization +#Pinned versions: +#test that import: test_profiler.py, test_openmp.py, test_dataloader.py + +pytest +#Description: testing framework +#Pinned versions: +#test that import: test_typing.py, test_cpp_extensions_aot.py, run_test.py + +#pytest-benchmark +#Description: fixture for benchmarking code +#Pinned versions: 3.2.3 +#test that 
import: + +#pytest-sugar +#Description: shows failures and errors instantly +#Pinned versions: +#test that import: + +#PyYAML +#Description: data serialization format +#Pinned versions: +#test that import: + +#requests +#Description: HTTP library +#Pinned versions: +#test that import: test_type_promotion.py + +#rich +#Description: rich text and beautiful formatting in the terminal +#Pinned versions: 10.9.0 +#test that import: + +scikit-image +#Description: image processing routines +#Pinned versions: +#test that import: test_nn.py + +#scikit-learn +#Description: machine learning package +#Pinned versions: 0.20.3 +#test that import: + +scipy==1.6.3 +# Pin SciPy because of failing distribution tests (see #60347) +#Description: scientific python +#Pinned versions: 1.6.3 +#test that import: test_unary_ufuncs.py, test_torch.py,test_tensor_creation_ops.py +#test_spectral_ops.py, test_sparse_csr.py, test_reductions.py,test_nn.py +#test_linalg.py, test_binary_ufuncs.py + +#tabulate +#Description: Pretty-print tabular data +#Pinned versions: +#test that import: + +tb-nightly +#Description: TensorBoard +#Pinned versions: +#test that import: + +#typing-extensions +#Description: type hints for python +#Pinned versions: +#test that import: + +#virtualenv +#Description: virtual environment for python +#Pinned versions: +#test that import: + +unittest-xml-reporting<=3.2.0,>=2.0.0 +#Description: saves unit test results to xml +#Pinned versions: +#test that import: diff --git a/.circleci/docker/ubuntu-cuda/Dockerfile b/.circleci/docker/ubuntu-cuda/Dockerfile index 9c9e40387066..241b91cff394 100644 --- a/.circleci/docker/ubuntu-cuda/Dockerfile +++ b/.circleci/docker/ubuntu-cuda/Dockerfile @@ -1,12 +1,11 @@ ARG UBUNTU_VERSION ARG CUDA_VERSION -ARG CUDNN_VERSION +ARG IMAGE_NAME -FROM nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu${UBUNTU_VERSION} +FROM ${IMAGE_NAME} ARG UBUNTU_VERSION ARG CUDA_VERSION -ARG CUDNN_VERSION ENV DEBIAN_FRONTEND noninteractive @@ -27,8 +26,10 @@ RUN bash ./install_katex.sh && rm install_katex.sh # Install conda and other packages (e.g., numpy, pytest) ENV PATH /opt/conda/bin:$PATH ARG ANACONDA_PYTHON_VERSION +ADD requirements-ci.txt /opt/conda/requirements-ci.txt ADD ./common/install_conda.sh install_conda.sh RUN bash ./install_conda.sh && rm install_conda.sh +RUN rm /opt/conda/requirements-ci.txt # Install gcc ARG GCC_VERSION @@ -99,5 +100,11 @@ ENV CUDA_PATH /usr/local/cuda # Install LLVM dev version (Defined in the pytorch/builder github repository) COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm +# Install CUDNN +ARG CUDNN_VERSION +ADD ./common/install_cudnn.sh install_cudnn.sh +RUN if [ "${CUDNN_VERSION}" -eq 8 ]; then bash install_cudnn.sh; fi +RUN rm install_cudnn.sh + USER jenkins CMD ["bash"] diff --git a/.circleci/docker/ubuntu-rocm/Dockerfile b/.circleci/docker/ubuntu-rocm/Dockerfile index 73f0e1822e89..260592876363 100644 --- a/.circleci/docker/ubuntu-rocm/Dockerfile +++ b/.circleci/docker/ubuntu-rocm/Dockerfile @@ -28,8 +28,10 @@ RUN bash ./install_user.sh && rm install_user.sh # Install conda and other packages (e.g., numpy, pytest) ENV PATH /opt/conda/bin:$PATH ARG ANACONDA_PYTHON_VERSION +ADD requirements-ci.txt /opt/conda/requirements-ci.txt ADD ./common/install_conda.sh install_conda.sh RUN bash ./install_conda.sh && rm install_conda.sh +RUN rm /opt/conda/requirements-ci.txt # Install gcc ARG GCC_VERSION diff --git a/.circleci/docker/ubuntu/Dockerfile b/.circleci/docker/ubuntu/Dockerfile index e0ae5c096ec9..d5940c7a1d55 100644 --- 
a/.circleci/docker/ubuntu/Dockerfile +++ b/.circleci/docker/ubuntu/Dockerfile @@ -36,8 +36,10 @@ RUN bash ./install_katex.sh && rm install_katex.sh # Install conda and other packages (e.g., numpy, pytest) ENV PATH /opt/conda/bin:$PATH ARG ANACONDA_PYTHON_VERSION +ADD requirements-ci.txt /opt/conda/requirements-ci.txt ADD ./common/install_conda.sh install_conda.sh RUN bash ./install_conda.sh && rm install_conda.sh +RUN rm /opt/conda/requirements-ci.txt # Install gcc ARG GCC_VERSION diff --git a/.circleci/generate_config_yml.py b/.circleci/generate_config_yml.py index a801aa978482..e068dd98fd8e 100755 --- a/.circleci/generate_config_yml.py +++ b/.circleci/generate_config_yml.py @@ -10,8 +10,6 @@ import sys from collections import namedtuple -import cimodel.data.binary_build_definitions as binary_build_definitions -import cimodel.data.simple.binary_smoketest import cimodel.data.simple.docker_definitions import cimodel.data.simple.mobile_definitions import cimodel.data.simple.nightly_ios @@ -81,11 +79,11 @@ def _for_all_items(items, functor) -> None: functor(item_type, item) def filter_master_only_jobs(items): - def _is_master_item(item): + def _is_main_or_master_item(item): filters = item.get('filters', None) branches = filters.get('branches', None) if filters is not None else None branches_only = branches.get('only', None) if branches is not None else None - return 'master' in branches_only if branches_only is not None else False + return ('main' in branches_only or 'master' in branches_only) if branches_only is not None else False master_deps = set() @@ -94,7 +92,7 @@ def _save_requires_if_master(item_type, item): item_name = item.get("name", None) if not isinstance(requires, list): return - if _is_master_item(item) or item_name in master_deps: + if _is_main_or_master_item(item) or item_name in master_deps: master_deps.update([n.strip('"') for n in requires]) def _do_filtering(items): @@ -105,7 +103,7 @@ def _do_filtering(items): item_type, item = next(iter(items.items())) item_name = item.get("name", None) item_name = item_name.strip('"') if item_name is not None else None - if not _is_master_item(item) and item_name not in master_deps: + if not _is_main_or_master_item(item) and item_name not in master_deps: return None if 'filters' in item: item = item.copy() @@ -113,7 +111,7 @@ def _do_filtering(items): return {item_type: item} # Scan of dependencies twice to pick up nested required jobs - # I.e. jobs depending on jobs that master-only job depend on + # I.e. 
jobs depending on jobs that main-only job depend on _for_all_items(items, _save_requires_if_master) _for_all_items(items, _save_requires_if_master) return _do_filtering(items) @@ -136,11 +134,8 @@ def _requires_docker_image(item_type, item): def gen_build_workflows_tree(): build_workflows_functions = [ cimodel.data.simple.mobile_definitions.get_workflow_jobs, - cimodel.data.simple.binary_smoketest.get_workflow_jobs, cimodel.data.simple.nightly_ios.get_workflow_jobs, cimodel.data.simple.anaconda_prune_defintions.get_workflow_jobs, - binary_build_definitions.get_post_upload_jobs, - binary_build_definitions.get_binary_smoke_test_jobs, ] build_jobs = [f() for f in build_workflows_functions] build_jobs.extend( @@ -151,28 +146,20 @@ def gen_build_workflows_tree(): ) master_build_jobs = filter_master_only_jobs(build_jobs) - binary_build_functions = [ - binary_build_definitions.get_binary_build_jobs, - binary_build_definitions.get_nightly_tests, - binary_build_definitions.get_nightly_uploads, - ] - - return { + rc = { "workflows": { - "binary_builds": { - "when": r"<< pipeline.parameters.run_binary_tests >>", - "jobs": [f() for f in binary_build_functions], - }, "build": { "when": r"<< pipeline.parameters.run_build >>", "jobs": build_jobs, }, - "master_build": { - "when": r"<< pipeline.parameters.run_master_build >>", - "jobs": master_build_jobs, - }, } } + if len(master_build_jobs) > 0: + rc["workflows"]["master_build"] = { + "when": r"<< pipeline.parameters.run_master_build >>", + "jobs": master_build_jobs, + } + return rc # Order of this list matters to the generated config.yml. @@ -183,17 +170,14 @@ def gen_build_workflows_tree(): Header("Build parameters"), File("build-parameters/pytorch-build-params.yml"), File("build-parameters/binary-build-params.yml"), - File("build-parameters/promote-build-params.yml"), Header("Job specs"), File("job-specs/binary-job-specs.yml"), File("job-specs/job-specs-custom.yml"), - File("job-specs/job-specs-promote.yml"), File("job-specs/binary_update_htmls.yml"), File("job-specs/binary-build-tests.yml"), File("job-specs/docker_jobs.yml"), Header("Workflows"), Treegen(gen_build_workflows_tree, 0), - File("workflows/workflows-promote.yml"), ] diff --git a/.circleci/scripts/binary_checkout.sh b/.circleci/scripts/binary_checkout.sh index db2b0660d9f5..86bfeb77e6ac 100755 --- a/.circleci/scripts/binary_checkout.sh +++ b/.circleci/scripts/binary_checkout.sh @@ -49,8 +49,9 @@ if [[ -n "${CIRCLE_PR_NUMBER:-}" ]]; then git reset --hard "$CIRCLE_SHA1" elif [[ -n "${CIRCLE_SHA1:-}" ]]; then # Scheduled workflows & "smoke" binary build on master on PR merges + DEFAULT_BRANCH="$(git remote show $CIRCLE_REPOSITORY_URL | awk '/HEAD branch/ {print $NF}')" git reset --hard "$CIRCLE_SHA1" - git checkout -q -B master + git checkout -q -B $DEFAULT_BRANCH else echo "Can't tell what to checkout" exit 1 diff --git a/.circleci/scripts/binary_linux_build.sh b/.circleci/scripts/binary_linux_build.sh index 42aa728d55a6..88561fcd80ec 100755 --- a/.circleci/scripts/binary_linux_build.sh +++ b/.circleci/scripts/binary_linux_build.sh @@ -26,7 +26,7 @@ else build_script='manywheel/build.sh' fi -if [[ "$CIRCLE_BRANCH" == "master" ]] || [[ "$CIRCLE_BRANCH" == release/* ]]; then +if [[ "$CIRCLE_BRANCH" == "main" ]] || [[ "$CIRCLE_BRANCH" == "master" ]] || [[ "$CIRCLE_BRANCH" == release/* ]]; then export BUILD_DEBUG_INFO=1 fi diff --git a/.circleci/scripts/binary_linux_test.sh b/.circleci/scripts/binary_linux_test.sh index 5be7f7cae213..bdec35d6d5d9 100755 --- 
a/.circleci/scripts/binary_linux_test.sh +++ b/.circleci/scripts/binary_linux_test.sh @@ -53,7 +53,7 @@ if [[ "\$python_nodot" = *39* ]]; then NUMPY_PIN=">=1.20" fi -if [[ "$DESIRED_CUDA" == "cu112" || "$DESIRED_CUDA" == "cu115" ]]; then +if [[ "$DESIRED_CUDA" == "cu116" ]]; then EXTRA_CONDA_FLAGS="-c=conda-forge" fi @@ -67,7 +67,8 @@ mv /final_pkgs/debug-*.zip /tmp/debug_final_pkgs || echo "no debug packages to m # TODO there is duplicated and inconsistent test-python-env setup across this # file, builder/smoke_test.sh, and builder/run_tests.sh, and also in the # conda build scripts themselves. These should really be consolidated -pkg="/final_pkgs/\$(ls /final_pkgs)" +# Pick only one package of multiple available (which happens as result of workflow re-runs) +pkg="/final_pkgs/\$(ls -1 /final_pkgs|sort|tail -1)" if [[ "$PACKAGE_TYPE" == conda ]]; then ( # For some reason conda likes to re-activate the conda environment when attempting this install diff --git a/.circleci/scripts/binary_populate_env.sh b/.circleci/scripts/binary_populate_env.sh index eab7c2b727fe..b42d58549d68 100755 --- a/.circleci/scripts/binary_populate_env.sh +++ b/.circleci/scripts/binary_populate_env.sh @@ -50,7 +50,7 @@ if [[ -z ${IS_GHA:-} ]]; then export PACKAGE_TYPE="${configs[0]}" export DESIRED_PYTHON="${configs[1]}" export DESIRED_CUDA="${configs[2]}" - if [[ "${BUILD_FOR_SYSTEM:-}" == "windows" ]]; then + if [[ "${OSTYPE}" == "msys" ]]; then export DESIRED_DEVTOOLSET="" export LIBTORCH_CONFIG="${configs[3]:-}" if [[ "$LIBTORCH_CONFIG" == 'debug' ]]; then @@ -91,11 +91,6 @@ if [[ ${DESIRED_CUDA} == "cpu" ]]; then USE_GOLD_LINKER="ON" fi -USE_WHOLE_CUDNN="OFF" -# Link whole cuDNN for CUDA-11.1 to include fp16 fast kernels -if [[ "$(uname)" == "Linux" && "${DESIRED_CUDA}" == "cu111" ]]; then - USE_WHOLE_CUDNN="ON" -fi # Default to nightly, since that's where this normally uploads to PIP_UPLOAD_FOLDER='nightly/' @@ -158,10 +153,14 @@ export DESIRED_PYTHON="${DESIRED_PYTHON:-}" export DESIRED_CUDA="$DESIRED_CUDA" export LIBTORCH_VARIANT="${LIBTORCH_VARIANT:-}" export BUILD_PYTHONLESS="${BUILD_PYTHONLESS:-}" -export DESIRED_DEVTOOLSET="${DESIRED_DEVTOOLSET:-}" -if [[ "${BUILD_FOR_SYSTEM:-}" == "windows" ]]; then +if [[ "${OSTYPE}" == "msys" ]]; then export LIBTORCH_CONFIG="${LIBTORCH_CONFIG:-}" - export DEBUG="${DEBUG:-}" + if [[ "${LIBTORCH_CONFIG:-}" == 'debug' ]]; then + export DEBUG=1 + fi + export DESIRED_DEVTOOLSET="" +else + export DESIRED_DEVTOOLSET="${DESIRED_DEVTOOLSET:-}" fi export DATE="$DATE" @@ -184,7 +183,6 @@ export DOCKER_IMAGE="$DOCKER_IMAGE" export USE_GOLD_LINKER="${USE_GOLD_LINKER}" export USE_GLOO_WITH_OPENSSL="ON" -export USE_WHOLE_CUDNN="${USE_WHOLE_CUDNN}" # =================== The above code will be executed inside Docker container =================== EOL diff --git a/.circleci/scripts/binary_windows_build.sh b/.circleci/scripts/binary_windows_build.sh index 439b2c981cfe..e6500b8d9c93 100644 --- a/.circleci/scripts/binary_windows_build.sh +++ b/.circleci/scripts/binary_windows_build.sh @@ -7,15 +7,17 @@ mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" export CUDA_VERSION="${DESIRED_CUDA/cu/}" export USE_SCCACHE=1 export SCCACHE_BUCKET=ossci-compiler-cache-windows -export NIGHTLIES_PYTORCH_ROOT="$PYTORCH_ROOT" +export SCCACHE_IGNORE_SERVER_IO_ERROR=1 export VC_YEAR=2019 if [[ "${DESIRED_CUDA}" == *"cu11"* ]]; then export BUILD_SPLIT_CUDA=ON fi + echo "Free Space for CUDA DEBUG BUILD" if [[ "${CIRCLECI:-}" == 'true' ]]; then + export NIGHTLIES_PYTORCH_ROOT="$PYTORCH_ROOT" if [[ -d "C:\\Program Files 
(x86)\\Microsoft Visual Studio\\2019\\Community" ]]; then rm -rf "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Community" fi @@ -70,6 +72,7 @@ pushd "$BUILDER_ROOT" if [[ "$PACKAGE_TYPE" == 'conda' ]]; then ./windows/internal/build_conda.bat elif [[ "$PACKAGE_TYPE" == 'wheel' || "$PACKAGE_TYPE" == 'libtorch' ]]; then + export NIGHTLIES_PYTORCH_ROOT="$PYTORCH_ROOT" ./windows/internal/build_wheels.bat fi diff --git a/.circleci/scripts/cpp_doc_push_script.sh b/.circleci/scripts/cpp_doc_push_script.sh index fa68d07e537e..1ade86b31264 100755 --- a/.circleci/scripts/cpp_doc_push_script.sh +++ b/.circleci/scripts/cpp_doc_push_script.sh @@ -34,9 +34,9 @@ echo "error: cpp_doc_push_script.sh: install_path (arg1) not specified" exit 1 fi -is_master_doc=false +is_main_doc=false if [ "$version" == "master" ]; then - is_master_doc=true + is_main_doc=true fi echo "install_path: $install_path version: $version" @@ -56,7 +56,7 @@ sudo apt-get -y install doxygen # Generate ATen files pushd "${pt_checkout}" pip install -r requirements.txt -time python -m tools.codegen.gen \ +time python -m torchgen.gen \ -s aten/src/ATen \ -d build/aten/src/ATen @@ -66,7 +66,7 @@ cp torch/_utils_internal.py tools/shared # Generate PyTorch files time python tools/setup_helpers/generate_code.py \ --native-functions-path aten/src/ATen/native/native_functions.yaml \ - --nn-path aten/src/ + --tags-path aten/src/ATen/native/tags.yaml # Build the docs pushd docs/cpp diff --git a/.circleci/scripts/python_doc_push_script.sh b/.circleci/scripts/python_doc_push_script.sh index ccfc44917400..f9b019ec069b 100755 --- a/.circleci/scripts/python_doc_push_script.sh +++ b/.circleci/scripts/python_doc_push_script.sh @@ -37,9 +37,9 @@ echo "error: python_doc_push_script.sh: install_path (arg1) not specified" exit 1 fi -is_master_doc=false +is_main_doc=false if [ "$version" == "master" ]; then - is_master_doc=true + is_main_doc=true fi # Argument 3: The branch to push to. Usually is "site" @@ -86,7 +86,7 @@ pushd docs # Build the docs pip -q install -r requirements.txt -if [ "$is_master_doc" = true ]; then +if [ "$is_main_doc" = true ]; then build_docs html [ $? -eq 0 ] || exit $? make coverage diff --git a/.circleci/scripts/setup_ci_environment.sh b/.circleci/scripts/setup_ci_environment.sh index 1f2e6bfaef61..dab183d907a6 100755 --- a/.circleci/scripts/setup_ci_environment.sh +++ b/.circleci/scripts/setup_ci_environment.sh @@ -32,7 +32,7 @@ if ! 
command -v aws >/dev/null; then fi if [ -n "${USE_CUDA_DOCKER_RUNTIME:-}" ]; then - DRIVER_FN="NVIDIA-Linux-x86_64-495.44.run" + DRIVER_FN="NVIDIA-Linux-x86_64-510.60.02.run" wget "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN" sudo /bin/bash "$DRIVER_FN" -s --no-drm || (sudo cat /var/log/nvidia-installer.log && false) nvidia-smi diff --git a/.circleci/scripts/trigger_azure_pipeline.py b/.circleci/scripts/trigger_azure_pipeline.py index b35ee5ce9def..9dc9dff2d54d 100644 --- a/.circleci/scripts/trigger_azure_pipeline.py +++ b/.circleci/scripts/trigger_azure_pipeline.py @@ -11,7 +11,7 @@ AZURE_DEVOPS_PAT_BASE64 = os.environ.get("AZURE_DEVOPS_PAT_BASE64_SECRET", "") PIPELINE_ID = "911" PROJECT_ID = "0628bce4-2d33-499e-bac5-530e12db160f" -TARGET_BRANCH = os.environ.get("CIRCLE_BRANCH", "master") +TARGET_BRANCH = os.environ.get("CIRCLE_BRANCH", "main") TARGET_COMMIT = os.environ.get("CIRCLE_SHA1", "") build_base_url = AZURE_PIPELINE_BASE_URL + "_apis/build/builds?api-version=6.0" diff --git a/.circleci/scripts/windows_cuda_install.sh b/.circleci/scripts/windows_cuda_install.sh index abcdcf134b37..f06a2b0ab096 100644 --- a/.circleci/scripts/windows_cuda_install.sh +++ b/.circleci/scripts/windows_cuda_install.sh @@ -2,25 +2,17 @@ set -eux -o pipefail case ${CUDA_VERSION} in - 10.1) - cuda_installer_name="cuda_10.1.243_426.00_win10" - cuda_install_packages="nvcc_10.1 cuobjdump_10.1 nvprune_10.1 cupti_10.1 cublas_10.1 cublas_dev_10.1 cudart_10.1 cufft_10.1 cufft_dev_10.1 curand_10.1 curand_dev_10.1 cusolver_10.1 cusolver_dev_10.1 cusparse_10.1 cusparse_dev_10.1 nvgraph_10.1 nvgraph_dev_10.1 npp_10.1 npp_dev_10.1 nvrtc_10.1 nvrtc_dev_10.1 nvml_dev_10.1" - ;; 10.2) cuda_installer_name="cuda_10.2.89_441.22_win10" cuda_install_packages="nvcc_10.2 cuobjdump_10.2 nvprune_10.2 cupti_10.2 cublas_10.2 cublas_dev_10.2 cudart_10.2 cufft_10.2 cufft_dev_10.2 curand_10.2 curand_dev_10.2 cusolver_10.2 cusolver_dev_10.2 cusparse_10.2 cusparse_dev_10.2 nvgraph_10.2 nvgraph_dev_10.2 npp_10.2 npp_dev_10.2 nvrtc_10.2 nvrtc_dev_10.2 nvml_dev_10.2" ;; - 11.1) - cuda_installer_name="cuda_11.1.1_456.81_win10" - cuda_install_packages="nvcc_11.1 cuobjdump_11.1 nvprune_11.1 nvprof_11.1 cupti_11.1 cublas_11.1 cublas_dev_11.1 cudart_11.1 cufft_11.1 cufft_dev_11.1 curand_11.1 curand_dev_11.1 cusolver_11.1 cusolver_dev_11.1 cusparse_11.1 cusparse_dev_11.1 npp_11.1 npp_dev_11.1 nvrtc_11.1 nvrtc_dev_11.1 nvml_dev_11.1" - ;; 11.3) cuda_installer_name="cuda_11.3.0_465.89_win10" cuda_install_packages="thrust_11.3 nvcc_11.3 cuobjdump_11.3 nvprune_11.3 nvprof_11.3 cupti_11.3 cublas_11.3 cublas_dev_11.3 cudart_11.3 cufft_11.3 cufft_dev_11.3 curand_11.3 curand_dev_11.3 cusolver_11.3 cusolver_dev_11.3 cusparse_11.3 cusparse_dev_11.3 npp_11.3 npp_dev_11.3 nvrtc_11.3 nvrtc_dev_11.3 nvml_dev_11.3" ;; - 11.5) - cuda_installer_name="cuda_11.5.0_496.13_win10" - cuda_install_packages="thrust_11.5 nvcc_11.5 cuobjdump_11.5 nvprune_11.5 nvprof_11.5 cupti_11.5 cublas_11.5 cublas_dev_11.5 cudart_11.5 cufft_11.5 cufft_dev_11.5 curand_11.5 curand_dev_11.5 cusolver_11.5 cusolver_dev_11.5 cusparse_11.5 cusparse_dev_11.5 npp_11.5 npp_dev_11.5 nvrtc_11.5 nvrtc_dev_11.5 nvml_dev_11.5" + 11.6) + cuda_installer_name="cuda_11.6.0_511.23_windows" + cuda_install_packages="thrust_11.6 nvcc_11.6 cuobjdump_11.6 nvprune_11.6 nvprof_11.6 cupti_11.6 cublas_11.6 cublas_dev_11.6 cudart_11.6 cufft_11.6 cufft_dev_11.6 curand_11.6 curand_dev_11.6 cusolver_11.6 cusolver_dev_11.6 cusparse_11.6 cusparse_dev_11.6 npp_11.6 npp_dev_11.6 nvrtc_11.6 nvrtc_dev_11.6 
nvml_dev_11.6" ;; *) echo "CUDA_VERSION $CUDA_VERSION is not supported yet" diff --git a/.circleci/scripts/windows_cudnn_install.sh b/.circleci/scripts/windows_cudnn_install.sh index 87e8a8dd09bf..a815008ee1e0 100644 --- a/.circleci/scripts/windows_cudnn_install.sh +++ b/.circleci/scripts/windows_cudnn_install.sh @@ -5,22 +5,16 @@ set -eux -o pipefail windows_s3_link="https://ossci-windows.s3.amazonaws.com" case ${CUDA_VERSION} in - 10.1) - # This is typically blank but for CUDA 10* it'll be set to 10 - cudnn_file_name="cudnn-${CUDA_VERSION}-windows10-x64-v7.6.4.38" - ;; 10.2) cudnn_file_name="cudnn-${CUDA_VERSION}-windows10-x64-v7.6.5.32" ;; - 11.1) - cudnn_file_name="cudnn-${CUDA_VERSION}-windows-x64-v8.0.5.39" - ;; 11.3) - cudnn_file_name="cudnn-${CUDA_VERSION}-windows-x64-v8.2.0.53" + # Use cudnn8.3 with hard-coded cuda11.3 version + cudnn_file_name="cudnn-windows-x86_64-8.3.2.44_cuda11.5-archive" ;; - 11.5) - # Since cudnn 8.3 the filename have changed - cudnn_file_name="cudnn-windows-x86_64-8.3.2.44_cuda${CUDA_VERSION}-archive" + 11.6) + # Use cudnn8.3 with hard-coded cuda11.5 version + cudnn_file_name="cudnn-windows-x86_64-8.3.2.44_cuda11.5-archive" ;; *) echo "CUDA_VERSION: ${CUDA_VERSION} not supported yet" diff --git a/.circleci/verbatim-sources/build-parameters/binary-build-params.yml b/.circleci/verbatim-sources/build-parameters/binary-build-params.yml index ca1d1486fef8..6f34c30d5248 100644 --- a/.circleci/verbatim-sources/build-parameters/binary-build-params.yml +++ b/.circleci/verbatim-sources/build-parameters/binary-build-params.yml @@ -62,5 +62,4 @@ binary_windows_params: &binary_windows_params default: "windows-xlarge-cpu-with-nvidia-cuda" environment: BUILD_ENVIRONMENT: << parameters.build_environment >> - BUILD_FOR_SYSTEM: windows JOB_EXECUTOR: <> diff --git a/.circleci/verbatim-sources/build-parameters/promote-build-params.yml b/.circleci/verbatim-sources/build-parameters/promote-build-params.yml deleted file mode 100644 index 2827c805f10a..000000000000 --- a/.circleci/verbatim-sources/build-parameters/promote-build-params.yml +++ /dev/null @@ -1,14 +0,0 @@ - -promote_common: &promote_common - docker: - - image: pytorch/release - parameters: - package_name: - description: "package name to promote" - type: string - default: "" - environment: - PACKAGE_NAME: << parameters.package_name >> - ANACONDA_API_TOKEN: ${CONDA_PYTORCHBOT_TOKEN} - AWS_ACCESS_KEY_ID: ${PYTORCH_BINARY_AWS_ACCESS_KEY_ID} - AWS_SECRET_ACCESS_KEY: ${PYTORCH_BINARY_AWS_SECRET_ACCESS_KEY} diff --git a/.circleci/verbatim-sources/job-specs/binary-job-specs.yml b/.circleci/verbatim-sources/job-specs/binary-job-specs.yml index 581b76c8f942..f6f16ef7dd65 100644 --- a/.circleci/verbatim-sources/job-specs/binary-job-specs.yml +++ b/.circleci/verbatim-sources/job-specs/binary-job-specs.yml @@ -162,6 +162,7 @@ jobs: <<: *binary_mac_params macos: xcode: "12.0" + resource_class: "large" steps: # See Note [Workspace for CircleCI scripts] in job-specs-setup.yml - checkout diff --git a/.circleci/verbatim-sources/job-specs/job-specs-custom.yml b/.circleci/verbatim-sources/job-specs/job-specs-custom.yml index a3c1d932d93e..f0f12e09b2d9 100644 --- a/.circleci/verbatim-sources/job-specs/job-specs-custom.yml +++ b/.circleci/verbatim-sources/job-specs/job-specs-custom.yml @@ -5,7 +5,7 @@ parameters: branch: type: string - default: "master" + default: "main" steps: - attach_workspace: at: /tmp/workspace @@ -45,7 +45,7 @@ echo "DOCKER_IMAGE: "${COMMIT_DOCKER_IMAGE} # turn v1.12.0rc3 into 1.12 tag=$(echo $CIRCLE_TAG | sed -e 
's/v*\([0-9]*\.[0-9]*\).*/\1/') - target=${tag:-master} + target=${tag:-main} echo "building for ${target}" time docker pull ${COMMIT_DOCKER_IMAGE} >/dev/null export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) @@ -55,7 +55,7 @@ echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts mkdir -p ~/workspace/build_artifacts - docker cp $id:/var/lib/jenkins/workspace/pytorch.github.io/docs/master ~/workspace/build_artifacts + docker cp $id:/var/lib/jenkins/workspace/pytorch.github.io/docs/main ~/workspace/build_artifacts docker cp $id:/var/lib/jenkins/workspace/pytorch.github.io /tmp/workspace # Save the docs build so we can debug any problems @@ -67,7 +67,7 @@ paths: - . - store_artifacts: - path: ~/workspace/build_artifacts/master + path: ~/workspace/build_artifacts/main destination: docs pytorch_cpp_doc_build: @@ -91,12 +91,12 @@ echo "DOCKER_IMAGE: "${COMMIT_DOCKER_IMAGE} # turn v1.12.0rc3 into 1.12 tag=$(echo $CIRCLE_TAG | sed -e 's/v*\([0-9]*\.[0-9]*\).*/\1/') - target=${tag:-master} + target=${tag:-main} echo "building for ${target}" time docker pull ${COMMIT_DOCKER_IMAGE} >/dev/null export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) - export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && '"export CIRCLE_SHA1='$CIRCLE_SHA1'"' && . ./.circleci/scripts/cpp_doc_push_script.sh docs/"$target" master") | docker exec -u jenkins -i "$id" bash) 2>&1' + export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && '"export CIRCLE_SHA1='$CIRCLE_SHA1'"' && . ./.circleci/scripts/cpp_doc_push_script.sh docs/"$target" main") | docker exec -u jenkins -i "$id" bash) 2>&1' echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts @@ -580,7 +580,7 @@ time docker pull ${DOCKER_IMAGE}:${DOCKER_TAG} >/dev/null export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}:${DOCKER_TAG}) - echo "Do NOT merge master branch into $CIRCLE_BRANCH in environment $BUILD_ENVIRONMENT" + echo "Do NOT merge main branch into $CIRCLE_BRANCH in environment $BUILD_ENVIRONMENT" git submodule sync && git submodule update -q --init --recursive --depth 1 --jobs 0 diff --git a/.circleci/verbatim-sources/workflows/workflows-promote.yml b/.circleci/verbatim-sources/workflows/workflows-promote.yml deleted file mode 100644 index d3afc0862d63..000000000000 --- a/.circleci/verbatim-sources/workflows/workflows-promote.yml +++ /dev/null @@ -1,46 +0,0 @@ - # Promotion workflow - promote: - jobs: - # Requires manual approval by someone in org-member - # CircleCI security context - - promote_approval: - context: org-member - filters: - branches: - ignore: /.*/ - tags: - only: /v[0-9]+(\.[0-9]+)*/ - type: approval - - promote_s3: - context: org-member - filters: - branches: - ignore: /.*/ - tags: - only: /v[0-9]+(\.[0-9]+)*/ - name: promote_s3_libtorch - package_name: libtorch - requires: - - promote_approval - - promote_s3: - context: org-member - filters: - branches: - ignore: /.*/ - tags: - only: /v[0-9]+(\.[0-9]+)*/ - name: promote_s3_torch - package_name: torch - requires: - - promote_approval - - promote_conda: - context: org-member - filters: - branches: - ignore: /.*/ - tags: - only: /v[0-9]+(\.[0-9]+)*/ - name: promote_conda_pytorch - package_name: 
pytorch - requires: - - promote_approval diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 000000000000..51ae28c6e058 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,24 @@ +# 2020-11-12 Enabled ShellCheck on `.jenkins/pytorch` +65d5004b09fd8d5deac173a3aaa259f46eaa0d67 +# 2021-01-20 Replaced ` ` with `...` in many doctests +c147aa306c6386a753fdff24b48d04e803070a63 +# 2021-03-05 Removed all trailing whitespace +8c798e062216278673a75bac0848ea69a8bd3f03 +# 2021-03-30 Normalized trailing newlines +5bcbbf537327f6e8328289c25a3a453a2444d984 +# 2021-03-31 Autogenerated Markdown ToCs +a74b10def961ab090385f291ee06e66db99c1a2f +# 2021-04-02 Enabled more ShellCheck warnings +09670c7d43b9abce862a6bf71d8cc89e64764bdb +# 2021-04-08 Removed all non-breaking spaces +cc11aaaa60aadf28e3ec278bce26a42c1cd68a4f +# 2021-04-13 Expanded many wildcard imports +4753100a3baa96273204c361c8452afb7b59836f +# 2021-04-19 Removed all unqualified `noqa` +e3900d2ba5c9f91a24a9ce34520794c8366d5c54 +# 2021-04-21 Removed all unqualified `type: ignore` +75024e228ca441290b6a1c2e564300ad507d7af6 +# 2021-05-14 Removed all versionless Python shebangs +2e26976ad3b06ce95dd6afccfdbe124802edf28f +# 2021-06-07 Strictly typed everything in `.github` and `tools` +737d920b21db9b4292d056ee1329945990656304 diff --git a/.gitattributes b/.gitattributes index 70246abe9bbb..8bccf04bbb7d 100644 --- a/.gitattributes +++ b/.gitattributes @@ -2,3 +2,5 @@ .circleci/config.yml linguist-generated=true .github/workflows/generated-*.yml linguist-generated=true .github/generated-* linguist-generated=true +.github/scripts/gql_mocks.json linguist-generated=true +third_party/LICENSES_BUNDLED.txt linguist-generated=true diff --git a/.github/ISSUE_TEMPLATE/ci-sev.md b/.github/ISSUE_TEMPLATE/ci-sev.md index b248963cfd4d..8178c68d978b 100644 --- a/.github/ISSUE_TEMPLATE/ci-sev.md +++ b/.github/ISSUE_TEMPLATE/ci-sev.md @@ -1,5 +1,5 @@ --- -name: "⚠️CI SEV" +name: "⚠️ CI SEV" about: Tracking incidents for PyTorch's CI infra. 
--- diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 7ffccd6e58e7..cd98b00b0646 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -2,4 +2,4 @@ blank_issues_enabled: true contact_links: - name: Questions url: https://discuss.pytorch.org/ - about: Ask questions and discuss with other pytorch community members + about: Ask questions and discuss with other PyTorch community members diff --git a/.github/ISSUE_TEMPLATE/feature-request.yml b/.github/ISSUE_TEMPLATE/feature-request.yml index 42c2317b5cfc..e18d5412dced 100644 --- a/.github/ISSUE_TEMPLATE/feature-request.yml +++ b/.github/ISSUE_TEMPLATE/feature-request.yml @@ -1,5 +1,5 @@ name: 🚀 Feature request -description: Submit a proposal/request for a new pytorch feature +description: Submit a proposal/request for a new PyTorch feature body: - type: textarea diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index 18329c526258..91b6d5af421b 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -1,5 +1,7 @@ self-hosted-runner: labels: + - linux.20_04.4x + - linux.20_04.16x - linux.large - linux.2xlarge - linux.4xlarge @@ -9,3 +11,5 @@ - windows.4xlarge - windows.8xlarge.nvidia.gpu - bm-runner + - linux.rocm.gpu + - macos-12 diff --git a/.github/actions/build-android/action.yml b/.github/actions/build-android/action.yml new file mode 100644 index 000000000000..2493bb3a7606 --- /dev/null +++ b/.github/actions/build-android/action.yml @@ -0,0 +1,82 @@ +name: build android + +description: build android for a specific arch + +inputs: + arch: + description: arch to build + required: true + arch-for-build-env: + description: | + arch to pass to build environment. + This is currently different from the arch name we use elsewhere, which + should be fixed. + required: true + github-secret: + description: github token + required: true + build-environment: + required: true + description: Top-level label for what's being built/tested. + docker-image: + required: true + description: Name of the base docker image to build with. + branch: + required: true + description: What branch we are building on.
+outputs: + container_id: + description: Docker container identifier used to build the artifacts + value: ${{ steps.build.outputs.container_id }} + +runs: + using: composite + steps: + - name: Build-${{ inputs.arch }} + id: build + shell: bash + env: + BRANCH: ${{ inputs.branch }} + JOB_BASE_NAME: ${{ inputs.build-environment }}-build-and-test + BUILD_ENVIRONMENT: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-${{ inputs.arch-for-build-env }}-build + AWS_DEFAULT_REGION: us-east-1 + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + DOCKER_IMAGE: ${{ inputs.docker-image }} + MATRIX_ARCH: ${{ inputs.arch }} + run: | + # detached container should get cleaned up by teardown_ec2_linux + set -exo pipefail + export container_name + container_name=$(docker run \ + -e BUILD_ENVIRONMENT \ + -e JOB_BASE_NAME \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e AWS_DEFAULT_REGION \ + -e IS_GHA \ + -e PR_NUMBER \ + -e SHA1 \ + -e BRANCH \ + -e GITHUB_RUN_ID \ + -e SCCACHE_BUCKET \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e SKIP_SCCACHE_INITIALIZATION=1 \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --tty \ + --detach \ + --user jenkins \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + git submodule sync && git submodule update -q --init --recursive --depth 1 --jobs 0 + docker cp "${GITHUB_WORKSPACE}/." "${container_name}:/var/lib/jenkins/workspace" + (echo "sudo chown -R jenkins . && .jenkins/pytorch/build.sh && find ${BUILD_ROOT} -type f -name "*.a" -or -name "*.o" -delete" | docker exec -u jenkins -i "${container_name}" bash) 2>&1 + + # Copy install binaries back + mkdir -p "${GITHUB_WORKSPACE}/build_android_install_${MATRIX_ARCH}" + docker cp "${container_name}:/var/lib/jenkins/workspace/build_android/install" "${GITHUB_WORKSPACE}/build_android_install_${MATRIX_ARCH}" + echo "::set-output name=container_id::${container_name}" diff --git a/.github/actions/calculate-docker-image/action.yml b/.github/actions/calculate-docker-image/action.yml new file mode 100644 index 000000000000..d32179ac78a7 --- /dev/null +++ b/.github/actions/calculate-docker-image/action.yml @@ -0,0 +1,93 @@ +name: Calculate docker image + +description: Determine docker image to pull, building a new one if necessary. + +inputs: + docker-image-name: + description: The name of a docker image, like `pytorch-linux-xenial-py3.7-gcc7` + required: true + xla: + description: | + Whether or not to use a pre-built XLA docker image. + Note that this is a string, either "true" or "false" due to GHA limitations. + required: false + always-rebuild: + description: If set to any value, always build a fresh docker image. + required: false + pull: + description: If set to any value, run `docker pull` on the calculated image.
+ required: false + +outputs: + docker-image: + description: The docker image to use for the rest of the workflow + value: ${{ steps.calculate-tag.outputs.docker-image }} + +runs: + using: composite + steps: + - name: Calculate docker image tag + shell: bash + id: calculate-tag + env: + IS_XLA: ${{ inputs.xla == 'true' && 'true' || '' }} + XLA_IMAGE_TAG: v0.2 + DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/${{ inputs.docker-image-name }} + run: | + if [ -n "${IS_XLA}" ]; then + echo "XLA workflow uses pre-built test image at ${XLA_IMAGE_TAG}" + DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) + echo "::set-output name=docker-tag::${DOCKER_TAG}" + echo "::set-output name=docker-image::${DOCKER_IMAGE_BASE}:${XLA_IMAGE_TAG}" + else + DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) + echo "::set-output name=docker-tag::${DOCKER_TAG}" + echo "::set-output name=docker-image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" + fi + + - name: Check if image should be built + shell: bash + id: check + if: ${{ !inputs.always-rebuild }} + env: + BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} + DOCKER_IMAGE: ${{ steps.calculate-tag.outputs.docker-image }} + DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker-tag }} + run: | + set -x + # Check if image already exists, if it does then skip building it + if docker manifest inspect "${DOCKER_IMAGE}"; then + exit 0 + fi + if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then + # if we're on the base branch then use the parent commit + MERGE_BASE=$(git rev-parse HEAD~) + else + # otherwise we're on a PR, so use the most recent base commit + MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") + fi + # Covers the case where a previous tag doesn't exist for the tree + # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly + if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then + echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" + exit 1 + fi + PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") + # If no image exists but the hash is the same as the previous hash then we should error out here + if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then + echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" + echo " contact the PyTorch team to restore the original images" + exit 1 + fi + echo ::set-output name=rebuild::yes + + - name: Build and push docker image + if: inputs.always-rebuild || steps.check.outputs.rebuild + env: + IMAGE_NAME: ${{inputs.docker-image-name}} + DOCKER_SKIP_S3_UPLOAD: "1" + DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker-tag }} + working-directory: .circleci/docker + shell: bash + run: | + ./build_docker.sh diff --git a/.github/actions/checkout-pytorch/action.yml b/.github/actions/checkout-pytorch/action.yml new file mode 100644 index 000000000000..6523dab0c64f --- /dev/null +++ b/.github/actions/checkout-pytorch/action.yml @@ -0,0 +1,41 @@ +name: Checkout PyTorch + +description: Clean workspace and check out PyTorch + +inputs: + no-sudo: + description: If set to any value, don't use sudo to clean the workspace + required: false + submodules: + description: Works as stated in actions/checkout, but the default value is recursive + required: false + default: recursive + fetch-depth: + description: Works as stated in actions/checkout, but the default value is 0 + required: false + default: "0" + +runs: + using: composite + steps: + - name: Clean workspace + shell: bash + env: + NO_SUDO: ${{ inputs.no-sudo }} + run: | + echo "${GITHUB_WORKSPACE}" + if [ -z "${NO_SUDO}" ]; then + sudo rm -rf "${GITHUB_WORKSPACE}" + else + rm -rf "${GITHUB_WORKSPACE}" + fi + mkdir "${GITHUB_WORKSPACE}" + + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # --depth=1 for speed, manually fetch history and other refs as necessary + fetch-depth: ${{ inputs.fetch-depth }} + submodules: ${{ inputs.submodules }} + quiet-checkout: true diff --git a/.github/actions/chown-workspace/action.yml b/.github/actions/chown-workspace/action.yml new file mode 100644 index 000000000000..6adc6cdc217d --- /dev/null +++ b/.github/actions/chown-workspace/action.yml @@ -0,0 +1,11 @@ +name: Chown workspace + +description: Ensure that the working directory gets chowned back to the current user + +runs: + using: composite + steps: + - run: docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + shell: bash + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" diff --git a/.github/actions/download-build-artifacts/action.yml b/.github/actions/download-build-artifacts/action.yml new file mode 100644 index 000000000000..a3c9444c1b98 --- /dev/null +++ b/.github/actions/download-build-artifacts/action.yml @@ -0,0 +1,34 @@ +name: Download PyTorch Build Artifacts + +description: Download and unzip artifacts from a previous PyTorch build. + +inputs: + name: + description: Name of what artifact to download + required: true + use-gha: + description: If set to any value, use GHA to download the artifact. Otherwise use s3. 
+ required: false + +runs: + using: composite + steps: + - name: Download PyTorch Build Artifacts from S3 + if: ${{ !inputs.use-gha }} + uses: seemethere/download-artifact-s3@v3 + with: + name: ${{ inputs.name }} + + - name: Download PyTorch Build Artifacts from GHA + if: inputs.use-gha + uses: actions/download-artifact@v2 + with: + name: ${{ inputs.name }} + + - name: Unzip artifacts + shell: bash + run: unzip -o artifacts.zip + + - name: Output disk space left + shell: bash + run: df -H diff --git a/.github/actions/get-workflow-job-id/action.yml b/.github/actions/get-workflow-job-id/action.yml new file mode 100644 index 000000000000..c7ca1e07d6be --- /dev/null +++ b/.github/actions/get-workflow-job-id/action.yml @@ -0,0 +1,31 @@ +name: Get workflow job id + +description: Get the ID of the workflow job that is currently running. + +inputs: + github-token: + description: GITHUB_TOKEN + required: true + +outputs: + job-id: + description: The retrieved workflow job id + value: ${{ steps.get-job-id.outputs.job-id }} + +runs: + using: composite + steps: + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + id: get-job-id + env: + GITHUB_TOKEN: ${{ inputs.github-token }} + with: + shell: bash + timeout_minutes: 10 + max_attempts: 5 + retry_wait_seconds: 30 + command: | + set -x + python3 -m pip install requests==2.26.0 + GHA_WORKFLOW_JOB_ID=$(python3 .github/scripts/get_workflow_job_id.py "${GITHUB_RUN_ID}" "${RUNNER_NAME}") + echo "::set-output name=job-id::${GHA_WORKFLOW_JOB_ID}" diff --git a/.github/actions/pull-docker-image/action.yml b/.github/actions/pull-docker-image/action.yml new file mode 100644 index 000000000000..ad1cc1baf9d3 --- /dev/null +++ b/.github/actions/pull-docker-image/action.yml @@ -0,0 +1,19 @@ +name: Pull docker image + +description: pull a specific docker image + +inputs: + docker-image: + description: the image to pull + required: true + +runs: + using: composite + steps: + - name: Pull Docker image + shell: bash + env: + DOCKER_IMAGE: ${{ inputs.docker-image }} + run: | + retry () { "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") } + retry docker pull "${DOCKER_IMAGE}" diff --git a/.github/actions/setup-linux/action.yml b/.github/actions/setup-linux/action.yml new file mode 100644 index 000000000000..d7500f11de7d --- /dev/null +++ b/.github/actions/setup-linux/action.yml @@ -0,0 +1,47 @@ +name: Setup Linux + +description: Set up Docker workspace on EC2 + +runs: + using: composite + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + + - name: Start docker if docker daemon is not running + shell: bash + run: | + if systemctl is-active --quiet docker; then + echo "Docker daemon is running..."; + else + echo "Starting docker daemon..."
&& sudo systemctl start docker; + fi + + - name: Log in to ECR + shell: bash + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: "5" + AWS_DEFAULT_REGION: us-east-1 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + + - name: Preserve github env variables for use in docker + shell: bash + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" diff --git a/.github/actions/setup-rocm/action.yml b/.github/actions/setup-rocm/action.yml new file mode 100644 index 000000000000..1a109830ee32 --- /dev/null +++ b/.github/actions/setup-rocm/action.yml @@ -0,0 +1,64 @@ +name: Setup ROCm host + +description: Set up ROCm host for CI + +runs: + using: composite + steps: + - name: Set DOCKER_HOST + shell: bash + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + + - name: Runner health check system info + if: always() + shell: bash + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + + - name: Runner health check rocm-smi + if: always() + shell: bash + run: | + rocm-smi + + - name: Runner health check rocminfo + if: always() + shell: bash + run: | + rocminfo + + - name: Runner health check GPU count + if: always() + shell: bash + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + + - name: Runner health check disconnect on failure + if: ${{ failure() }} + shell: bash + run: | + killall runsvc.sh + + - name: Preserve github env variables for use in docker + shell: bash + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + + - name: ROCm set GPU_FLAG + shell: bash + run: | + # Examine the runner name. If it ends with "-2", this is the second runner on the host. 
+ if [[ ${{ runner.name }} == *-2 ]]; then + # select the last two GPUs on the host + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri/renderD130 --device=/dev/dri/renderD131 --group-add video --group-add daemon" >> "${GITHUB_ENV}" + else + # select the first two GPUs on the host + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri/renderD128 --device=/dev/dri/renderD129 --group-add video --group-add daemon" >> "${GITHUB_ENV}" + fi diff --git a/.github/actions/setup-ssh/action.yml b/.github/actions/setup-ssh/action.yml new file mode 100644 index 000000000000..c2be35a805c4 --- /dev/null +++ b/.github/actions/setup-ssh/action.yml @@ -0,0 +1,17 @@ +name: Setup SSH + +description: Adds ssh keys for current user to machine + +inputs: + github-secret: + description: GitHub token + required: true + +runs: + using: composite + steps: + - name: "Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ inputs.github-secret }} + activate-with-label: false diff --git a/.github/actions/setup-win/action.yml b/.github/actions/setup-win/action.yml new file mode 100644 index 000000000000..12f287b23089 --- /dev/null +++ b/.github/actions/setup-win/action.yml @@ -0,0 +1,60 @@ +name: Setup Windows + +description: Set up for windows jobs + +inputs: + cuda-version: + description: which cuda version to install, 'cpu' for none + required: true + +runs: + using: composite + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + + - name: Install Visual Studio 2019 toolchain + shell: powershell + env: + VS_VERSION: "16.8.6" + INSTALL_WINDOWS_SDK: "1" + run: | + .\.circleci\scripts\vs_install.ps1 + + - name: Install CUDA and CUDNN + shell: bash + if: inputs.cuda-version != 'cpu' + env: + CUDA_VERSION: ${{ inputs.cuda-version }} + run: | + .circleci/scripts/windows_cuda_install.sh + .circleci/scripts/windows_cudnn_install.sh + + - name: Setup Python3 + uses: actions/setup-python@v2 + with: + python-version: "3.x" diff --git a/.github/actions/teardown-linux/action.yml b/.github/actions/teardown-linux/action.yml new file mode 100644 index 000000000000..9238a073a6b6 --- /dev/null +++ b/.github/actions/teardown-linux/action.yml @@ -0,0 +1,28 @@ +name: Teardown Linux + +description: Stuff that should always run at the end of a linux job + +inputs: + skip-wait-ssh: + description: If set, don't wait for ssh to drain before tearing down + required: false + default: "" + +runs: + using: composite + steps: + - name: Hold runner for 2 hours or until ssh sessions have drained + # TODO working-directory: !{{ pytorch_directory }} + # Always hold for active ssh sessions + shell: bash + if: inputs.skip-wait-ssh == '' + run: .github/scripts/wait_for_ssh_to_drain.sh + + - name: Kill containers, clean up images + shell: bash + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.github/actions/teardown-rocm/action.yml b/.github/actions/teardown-rocm/action.yml new file mode 100644 index 000000000000..f23d8e1e2422 --- /dev/null +++ b/.github/actions/teardown-rocm/action.yml @@ -0,0 +1,25 @@ +name: Teardown ROCm host + +description: Teardown ROCm host for CI + +runs: + using: composite + steps: + - name: Kill containers, clean up images + if: always() + shell: bash + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker containers + docker container prune -f + # Prune everything docker if there are more than 10 images (~200GB). + # This is easier than using a time filter, e.g., "until=24h". + image_count=$(docker images | wc -l) + if [[ ${image_count} -gt 10 ]]; then + echo "Purging all docker caches" + docker system prune -af + else + echo "Will not purge docker, only ${image_count} images found" + fi diff --git a/.github/actions/teardown-win/action.yml b/.github/actions/teardown-win/action.yml new file mode 100644 index 000000000000..49c509444e09 --- /dev/null +++ b/.github/actions/teardown-win/action.yml @@ -0,0 +1,33 @@ +name: Teardown Windows + +description: Set up Docker workspace on linux + +inputs: + extra-delete-dir: + description: If set, cleaning up the workspace will delete this too + required: false + default: "" + +runs: + using: composite + steps: + - name: Wait until all sessions have drained + shell: powershell + if: always() + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + + - name: Cleanup workspace + if: always() + shell: bash + env: + EXTRA_DELETE_DIR: ${{ inputs.extra-delete-dir }} + run: | + [ ! 
-z "${EXTRA_DELETE_DIR}" ] || rm -rf "${EXTRA_DELETE_DIR}" + rm -rf ./* diff --git a/.github/actions/upload-test-artifacts/action.yml b/.github/actions/upload-test-artifacts/action.yml new file mode 100644 index 000000000000..7a00a377fca4 --- /dev/null +++ b/.github/actions/upload-test-artifacts/action.yml @@ -0,0 +1,94 @@ +name: Upload test artifacts + +description: Upload various artifacts produced by our testing process + +inputs: + use-gha: + description: If set to any value, upload GHA. Otherwise upload to S3. + required: false + file-suffix: + description: | + Suffix to add to the filename of the artifacts. This should include the + workflow job id, see [Job id in artifacts]. + required: true + +runs: + using: composite + steps: + # Mac/Linux zip + - name: Zip JSONs for upload + if: runner.os != 'Windows' && !inputs.use-gha + shell: bash + env: + FILE_SUFFIX: ${{ inputs.file-suffix }} + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + + - name: Zip test reports for upload + if: runner.os != 'Windows' && !inputs.use-gha + shell: bash + env: + FILE_SUFFIX: ${{ inputs.file-suffix }} + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + + # Windows zip + - name: Zip JSONs for upload + if: runner.os == 'Windows' && !inputs.use-gha + shell: powershell + env: + FILE_SUFFIX: ${{ inputs.file-suffix }} + run: | + # -ir => recursive include all files in pattern + 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' + + - name: Zip test reports for upload + if: runner.os == 'Windows' && !inputs.use-gha + shell: powershell + env: + FILE_SUFFIX: ${{ inputs.file-suffix }} + run: | + # -ir => recursive include all files in pattern + 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' + + # S3 upload + - name: Store Test Downloaded JSONs on S3 + uses: seemethere/upload-artifact-s3@v4 + if: ${{ !inputs.use-gha }} + with: + retention-days: 14 + if-no-files-found: warn + path: test-jsons-*.zip + + - name: Store Test Reports on S3 + uses: seemethere/upload-artifact-s3@v4 + if: ${{ !inputs.use-gha }} + with: + retention-days: 14 + if-no-files-found: error + path: test-reports-*.zip + + # GHA upload + - name: Store Test Downloaded JSONs on Github + uses: actions/upload-artifact@v2 + if: inputs.use-gha + with: + # Add the run attempt, see [Artifact run attempt] + name: test-jsons-runattempt${{ github.run_attempt }}-${{ inputs.file-suffix }}.zip + retention-days: 14 + if-no-files-found: warn + path: test/**/*.json + + - name: Store Test Reports on Github + uses: actions/upload-artifact@v2 + if: inputs.use-gha + with: + # Add the run attempt, see [Artifact run attempt] + name: test-reports-runattempt${{ github.run_attempt }}-${{ inputs.file-suffix }}.zip + retention-days: 14 + if-no-files-found: error + path: test/**/*.xml diff --git a/.github/generated-ciflow-ruleset.json b/.github/generated-ciflow-ruleset.json deleted file mode 100644 index 186441321a70..000000000000 --- a/.github/generated-ciflow-ruleset.json +++ /dev/null @@ -1,304 +0,0 @@ -{ - "__comment": "@generated DO NOT EDIT MANUALLY, Generation script: .github/scripts/generate_ci_workflows.py", - "label_rules": { - "ciflow/all": [ - "caffe2-linux-xenial-py3.7-gcc5.4", - "docker-builds", - "ios-12-5-1-arm64", - "ios-12-5-1-arm64-coreml", - "ios-12-5-1-arm64-custom-ops", - "ios-12-5-1-arm64-full-jit", - "ios-12-5-1-arm64-metal", - "ios-12-5-1-x86-64", - 
"ios-12-5-1-x86-64-coreml", - "ios-12-5-1-x86-64-full-jit", - "libtorch-linux-xenial-cuda10.2-py3.7-gcc7", - "libtorch-linux-xenial-cuda11.3-py3.7-gcc7", - "linux-bionic-cuda10.2-py3.9-gcc7", - "linux-bionic-py3.7-clang9", - "linux-bionic-rocm4.5-py3.7", - "linux-docs", - "linux-docs-push", - "linux-vulkan-bionic-py3.7-clang9", - "linux-xenial-cuda11.3-py3.7-gcc7", - "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test", - "linux-xenial-cuda11.3-py3.7-gcc7-no-ops", - "linux-xenial-py3-clang5-mobile-build", - "linux-xenial-py3-clang5-mobile-custom-build-static", - "linux-xenial-py3.7-clang7-asan", - "linux-xenial-py3.7-clang7-onnx", - "linux-xenial-py3.7-gcc5.4", - "linux-xenial-py3.7-gcc7", - "linux-xenial-py3.7-gcc7-no-ops", - "macos-10-15-py3-arm64", - "macos-10-15-py3-lite-interpreter-x86-64", - "macos-11-py3-x86-64", - "parallelnative-linux-xenial-py3.7-gcc5.4", - "periodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7", - "periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7", - "periodic-linux-bionic-cuda11.5-py3.7-gcc7", - "periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck", - "periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug", - "periodic-win-vs2019-cuda11.1-py3", - "periodic-win-vs2019-cuda11.5-py3", - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build", - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single", - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit", - "win-vs2019-cpu-py3", - "win-vs2019-cuda11.3-py3" - ], - "ciflow/android": [ - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build", - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single", - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit" - ], - "ciflow/bazel": [ - "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test" - ], - "ciflow/binaries": [ - "linux-binary-conda", - "linux-binary-libtorch-cxx11-abi", - "linux-binary-libtorch-pre-cxx11", - "linux-binary-manywheel", - "macos-arm64-binary-conda", - "macos-arm64-binary-wheel", - "macos-binary-conda", - "macos-binary-libtorch-cxx11-abi", - "macos-binary-libtorch-pre-cxx11", - "macos-binary-wheel", - "windows-binary-libtorch-cxx11-abi", - "windows-binary-libtorch-pre-cxx11", - "windows-binary-wheel" - ], - "ciflow/binaries_conda": [ - "linux-binary-conda", - "macos-arm64-binary-conda", - "macos-binary-conda" - ], - "ciflow/binaries_libtorch": [ - "linux-binary-libtorch-cxx11-abi", - "linux-binary-libtorch-pre-cxx11", - "macos-binary-libtorch-cxx11-abi", - "macos-binary-libtorch-pre-cxx11", - "windows-binary-libtorch-cxx11-abi", - "windows-binary-libtorch-pre-cxx11" - ], - "ciflow/binaries_wheel": [ - "linux-binary-manywheel", - "macos-arm64-binary-wheel", - "macos-binary-wheel", - "windows-binary-wheel" - ], - "ciflow/cpu": [ - "caffe2-linux-xenial-py3.7-gcc5.4", - "linux-bionic-py3.7-clang9", - "linux-docs", - "linux-docs-push", - "linux-vulkan-bionic-py3.7-clang9", - "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test", - "linux-xenial-py3.7-clang7-asan", - "linux-xenial-py3.7-clang7-onnx", - "linux-xenial-py3.7-gcc5.4", - "linux-xenial-py3.7-gcc7", - "linux-xenial-py3.7-gcc7-no-ops", - "parallelnative-linux-xenial-py3.7-gcc5.4", - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build", - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single", - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit", - "win-vs2019-cpu-py3" - ], - "ciflow/cuda": [ - "libtorch-linux-xenial-cuda10.2-py3.7-gcc7", - 
"libtorch-linux-xenial-cuda11.3-py3.7-gcc7", - "linux-bionic-cuda10.2-py3.9-gcc7", - "linux-xenial-cuda11.3-py3.7-gcc7", - "linux-xenial-cuda11.3-py3.7-gcc7-no-ops", - "periodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7", - "periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7", - "periodic-linux-bionic-cuda11.5-py3.7-gcc7", - "periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck", - "periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug", - "periodic-win-vs2019-cuda11.1-py3", - "periodic-win-vs2019-cuda11.5-py3", - "win-vs2019-cuda11.3-py3" - ], - "ciflow/default": [ - "linux-binary-conda", - "linux-binary-libtorch-cxx11-abi", - "linux-binary-libtorch-pre-cxx11", - "linux-binary-manywheel", - "linux-bionic-py3.7-clang9", - "linux-bionic-rocm4.5-py3.7", - "linux-docs", - "linux-vulkan-bionic-py3.7-clang9", - "linux-xenial-cuda11.3-py3.7-gcc7", - "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test", - "linux-xenial-py3-clang5-mobile-build", - "linux-xenial-py3-clang5-mobile-custom-build-static", - "linux-xenial-py3.7-clang7-asan", - "linux-xenial-py3.7-clang7-onnx", - "linux-xenial-py3.7-gcc5.4", - "linux-xenial-py3.7-gcc7", - "linux-xenial-py3.7-gcc7-no-ops", - "macos-arm64-binary-conda", - "macos-arm64-binary-wheel", - "macos-binary-conda", - "macos-binary-libtorch-cxx11-abi", - "macos-binary-libtorch-pre-cxx11", - "macos-binary-wheel", - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single", - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit", - "win-vs2019-cpu-py3", - "win-vs2019-cuda11.3-py3", - "windows-binary-libtorch-cxx11-abi", - "windows-binary-libtorch-pre-cxx11", - "windows-binary-wheel" - ], - "ciflow/docs": [ - "linux-docs" - ], - "ciflow/ios": [ - "ios-12-5-1-arm64", - "ios-12-5-1-arm64-coreml", - "ios-12-5-1-arm64-custom-ops", - "ios-12-5-1-arm64-full-jit", - "ios-12-5-1-arm64-metal", - "ios-12-5-1-x86-64", - "ios-12-5-1-x86-64-coreml", - "ios-12-5-1-x86-64-full-jit" - ], - "ciflow/libtorch": [ - "libtorch-linux-xenial-cuda10.2-py3.7-gcc7", - "libtorch-linux-xenial-cuda11.3-py3.7-gcc7", - "periodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7", - "periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7" - ], - "ciflow/linux": [ - "caffe2-linux-xenial-py3.7-gcc5.4", - "libtorch-linux-xenial-cuda10.2-py3.7-gcc7", - "libtorch-linux-xenial-cuda11.3-py3.7-gcc7", - "linux-bionic-cuda10.2-py3.9-gcc7", - "linux-bionic-py3.7-clang9", - "linux-bionic-rocm4.5-py3.7", - "linux-docs", - "linux-docs-push", - "linux-vulkan-bionic-py3.7-clang9", - "linux-xenial-cuda11.3-py3.7-gcc7", - "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test", - "linux-xenial-cuda11.3-py3.7-gcc7-no-ops", - "linux-xenial-py3-clang5-mobile-build", - "linux-xenial-py3-clang5-mobile-custom-build-static", - "linux-xenial-py3.7-clang7-asan", - "linux-xenial-py3.7-clang7-onnx", - "linux-xenial-py3.7-gcc5.4", - "linux-xenial-py3.7-gcc7", - "linux-xenial-py3.7-gcc7-no-ops", - "parallelnative-linux-xenial-py3.7-gcc5.4", - "periodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7", - "periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7", - "periodic-linux-bionic-cuda11.5-py3.7-gcc7", - "periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck", - "periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug", - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build", - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single", - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit" - ], - "ciflow/macos": [ - "ios-12-5-1-arm64", - "ios-12-5-1-arm64-coreml", - 
"ios-12-5-1-arm64-custom-ops", - "ios-12-5-1-arm64-full-jit", - "ios-12-5-1-arm64-metal", - "ios-12-5-1-x86-64", - "ios-12-5-1-x86-64-coreml", - "ios-12-5-1-x86-64-full-jit", - "macos-10-15-py3-arm64", - "macos-10-15-py3-lite-interpreter-x86-64", - "macos-11-py3-x86-64" - ], - "ciflow/mobile": [ - "linux-xenial-py3-clang5-mobile-build", - "linux-xenial-py3-clang5-mobile-custom-build-static" - ], - "ciflow/noarch": [ - "linux-bionic-py3.7-clang9" - ], - "ciflow/onnx": [ - "linux-xenial-py3.7-clang7-onnx" - ], - "ciflow/rocm": [ - "linux-bionic-rocm4.5-py3.7" - ], - "ciflow/sanitizers": [ - "linux-xenial-py3.7-clang7-asan" - ], - "ciflow/scheduled": [ - "linux-docs-push", - "periodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7", - "periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7", - "periodic-linux-bionic-cuda11.5-py3.7-gcc7", - "periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck", - "periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug", - "periodic-win-vs2019-cuda11.1-py3", - "periodic-win-vs2019-cuda11.5-py3" - ], - "ciflow/slow": [ - "linux-bionic-cuda10.2-py3.9-gcc7", - "periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck" - ], - "ciflow/slow-gradcheck": [ - "periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck" - ], - "ciflow/trunk": [ - "caffe2-linux-xenial-py3.7-gcc5.4", - "docker-builds", - "ios-12-5-1-arm64", - "ios-12-5-1-arm64-coreml", - "ios-12-5-1-arm64-custom-ops", - "ios-12-5-1-arm64-full-jit", - "ios-12-5-1-arm64-metal", - "ios-12-5-1-x86-64", - "ios-12-5-1-x86-64-coreml", - "ios-12-5-1-x86-64-full-jit", - "libtorch-linux-xenial-cuda10.2-py3.7-gcc7", - "libtorch-linux-xenial-cuda11.3-py3.7-gcc7", - "linux-bionic-cuda10.2-py3.9-gcc7", - "linux-bionic-py3.7-clang9", - "linux-bionic-rocm4.5-py3.7", - "linux-docs", - "linux-vulkan-bionic-py3.7-clang9", - "linux-xenial-cuda11.3-py3.7-gcc7", - "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test", - "linux-xenial-cuda11.3-py3.7-gcc7-no-ops", - "linux-xenial-py3-clang5-mobile-build", - "linux-xenial-py3-clang5-mobile-custom-build-static", - "linux-xenial-py3.7-clang7-asan", - "linux-xenial-py3.7-clang7-onnx", - "linux-xenial-py3.7-gcc5.4", - "linux-xenial-py3.7-gcc7", - "linux-xenial-py3.7-gcc7-no-ops", - "macos-10-15-py3-arm64", - "macos-10-15-py3-lite-interpreter-x86-64", - "macos-11-py3-x86-64", - "parallelnative-linux-xenial-py3.7-gcc5.4", - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build", - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single", - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit", - "win-vs2019-cpu-py3", - "win-vs2019-cuda11.3-py3" - ], - "ciflow/vulkan": [ - "linux-vulkan-bionic-py3.7-clang9" - ], - "ciflow/win": [ - "periodic-win-vs2019-cuda11.1-py3", - "periodic-win-vs2019-cuda11.5-py3", - "win-vs2019-cpu-py3", - "win-vs2019-cuda11.3-py3" - ], - "ciflow/xla": [ - "linux-bionic-py3.7-clang9" - ] - }, - "version": "v1" -} diff --git a/.github/merge_rules.json b/.github/merge_rules.json index 6b0e452683fc..2dbd6c4f3107 100644 --- a/.github/merge_rules.json +++ b/.github/merge_rules.json @@ -1,20 +1,114 @@ [ { - "name": "ONNX exporter", - "patterns": ["torch/onnx/**", "torch/csrc/jit/passes/onnx/**", "torch/csrc/jit/passes/onnx.*", "test/onnx/**", "docs/source/onnx.rst"], - "approved_by": ["BowenBao", "garymm"], - "mandatory_app_id": 12274 + "name": "ONNX exporter", + "patterns": [ + ".jenkins/caffe2/*", + "scripts/onnx/**", + "docs/source/onnx.rst", + "test/onnx/**", + "test/jit/test_export_modes.py", + "aten/src/ATen/core/interned_strings.h", + 
"tools/onnx/**", + "torch/_C/__init__.pyi.in", + "torch/csrc/jit/passes/onnx.*", + "torch/csrc/jit/passes/onnx/**", + "torch/csrc/jit/serialization/export.*", + "torch/csrc/jit/serialization/onnx.*", + "torch/csrc/onnx/**", + "torch/onnx/**" + ], + "approved_by": ["BowenBao", "garymm"], + "mandatory_checks_name": ["Facebook CLA Check", "Lint"] }, { - "name": "NVFuser", - "patterns": ["torch/csrc/jit/codegen/fuser/cuda/**", "torch/csrc/jit/codegen/cuda/**", "benchmarks/cpp/nvfuser/**"], - "approved_by": ["csarofeen", "ngimel"], - "mandatory_app_id": 12274 + "name": "NVFuser", + "patterns": [ + "test/test_jit_cuda_fuser.py", + "torch/csrc/jit/codegen/fuser/cuda/**", + "torch/csrc/jit/codegen/cuda/**", + "benchmarks/cpp/nvfuser/**" + ], + "approved_by": ["csarofeen", "ngimel", "jjsjann123"], + "mandatory_checks_name": ["Facebook CLA Check", "Lint"] }, { - "name": "OSS CI", - "patterns": [".github/**", ".circleci/**", ".jenkins/**", "scripts/**"], - "approved_by": ["seemethere", "malfet", "suo"], - "mandatory_app_id": 12274 + "name": "OSS CI", + "patterns": [".github/**", ".circleci/**", ".jenkins/**", "scripts/**", "tools/**"], + "approved_by": ["ezyang", "pytorch/pytorch-dev-infra"], + "mandatory_checks_name": ["Facebook CLA Check", "Lint"] + }, + { + "name": "Documentation", + "patterns": ["docs/**", "torch/*docs.py"], + "approved_by": ["mruberry", "ngimel", "janeyx99"], + "mandatory_checks_name": ["Facebook CLA Check", "Lint"] + }, + { + "name": "Mobile", + "patterns": ["ios/**", "android/**", "test/mobile/**"], + "approved_by": ["linbinyu", "kit1980", "IvanKobzarev", "dreiss"], + "mandatory_checks_name": ["Facebook CLA Check", "Lint"] + }, + { + "name": "Linear Algebra", + "patterns": [ + "aten/src/ATen/native/cuda/linalg/**", + "aten/src/ATen/LinalgBackend.h", + "aten/src/ATen/native/**/*LinearAlgebra*", + "docs/source/linalg.rst", + "torch/linalg/**", + "torch/_linalg_utils.py", + "torch/**/python_linalg_functions.*", + "torch/**/linalg.h", + "tools/autograd/templates/python_linalg_functions.cpp", + "test/test_linalg.py" + ], + "approved_by": ["nikitaved", "mruberry", "pearu", "Lezcano", "IvanYashchuk"], + "mandatory_checks_name": ["Facebook CLA Check", "Lint"] + }, + { + "name": "FFT", + "patterns": [ + "aten/src/ATen/native/cuda/*FFT*.h", + "aten/src/ATen/native/SpectralOps.cpp", + "aten/src/ATen/native/mkl/SpectralOps.cpp", + "aten/src/ATen/native/cuda/SpectralOps.*", + "docs/source/fft.rst", + "torch/fft/**", + "torch/csrc/api/include/torch/fft.h", + "torch/**/python_fft_functions.*", + "tools/autograd/templates/python_fft_functions.cpp", + "test/cpp/api/fft.cpp" + ], + "approved_by": ["mruberry", "peterbell10"], + "mandatory_checks_name": ["Facebook CLA Check", "Lint"] + }, + { + "name": "Sparse", + "patterns": [ + "benchmarks/sparse", + "c10/util/sparse_bitset.h", + "docs/source/sparse.rst", + "torch/**/sparse/**", + "torch/**/*sparse*", + "torch/optim/sparse*", + "torch/ao/nn/sparse/**", + "torch/utils/benchmark/**/*sparse*", + "aten/src/ATen/native/ao_sparse/**", + "aten/src/ATen/native/sparse/**", + "aten/src/ATen/**/*Sparse*", + "aten/src/ATen/*Sparse*", + "torch/_masked/**", + "test/*_masked*", + "test/**/*sparse*" + ], + "approved_by": ["nikitaved", "cpuhrsch", "pearu", "IvanYashchuk"], + "mandatory_checks_name": ["Facebook CLA Check", "Lint"] + }, + { + "name": "superuser", + "patterns": ["*"], + "approved_by": ["pytorch/metamates"], + "mandatory_checks_name": ["Facebook CLA Check", "Lint"] } ] diff --git a/.github/scale-config.yml b/.github/scale-config.yml index 
0670ed9598ae..931ca0ef5f1e 100644 --- a/.github/scale-config.yml +++ b/.github/scale-config.yml @@ -30,25 +30,25 @@ runner_types: linux.2xlarge: instance_type: c5.2xlarge os: linux - max_available: 500 + max_available: 1000 disk_size: 150 is_ephemeral: false linux.4xlarge: # for binary-builds instance_type: c5.4xlarge os: linux - max_available: 250 + max_available: 500 disk_size: 150 is_ephemeral: false linux.8xlarge.nvidia.gpu: instance_type: g3.8xlarge os: linux - max_available: 125 + max_available: 200 disk_size: 150 is_ephemeral: false linux.4xlarge.nvidia.gpu: instance_type: g3.4xlarge os: linux - max_available: 125 + max_available: 250 disk_size: 150 is_ephemeral: false linux.16xlarge.nvidia.gpu: diff --git a/.github/scripts/README.md b/.github/scripts/README.md new file mode 100644 index 000000000000..22099c3732ea --- /dev/null +++ b/.github/scripts/README.md @@ -0,0 +1,58 @@ +# pytorch/.github + +> NOTE: This README contains information for the `.github` directory but cannot be located there because it will overwrite the +repo README. + +This directory contains workflows and scripts to support our CI infrastructure that runs on Github Actions. + +## Workflows + +- Pull CI (`pull.yml`) is run on PRs and on master. +- Trunk CI (`trunk.yml`) is run on trunk to validate incoming commits. Trunk jobs are usually more expensive to run so we do not run them on PRs unless specified. +- Scheduled CI (`periodic.yml`) is a subset of trunk CI that is run every few hours on master. +- Binary CI is run to package binaries for distribution for all platforms. + +## Templates + +Templates written in [Jinja](https://jinja.palletsprojects.com/en/3.0.x/) are located in the `.github/templates` directory +and used to generate workflow files for binary jobs found in the `.github/workflows/` directory. These are also a +couple of utility templates used to discern common utilities that can be used amongst different templates. + +### (Re)Generating workflow files + +You will need `jinja2` in order to regenerate the workflow files which can be installed using: +```bash +pip install -r .github/requirements.txt +``` + +Workflows can be generated / regenerated using the following command: +```bash +.github/regenerate.sh +``` + +### Adding a new generated binary workflow + +New generated binary workflows can be added in the `.github/scripts/generate_ci_workflows.py` script. You can reference +examples from that script in order to add the workflow to the stream that is relevant to what you particularly +care about. + +Different parameters can be used to acheive different goals, i.e. running jobs on a cron, running only on trunk, etc. + +#### ciflow (trunk) + +The label `ciflow/trunk` can be used to run `trunk` only workflows. This is especially useful if trying to re-land a PR that was +reverted for failing a `non-default` workflow. + +## Infra + +Currently most of our self hosted runners are hosted on AWS, for a comprehensive list of available runner types you +can reference `.github/scale-config.yml`. + +Exceptions to AWS for self hosted: +* ROCM runners + +### Adding new runner types + +New runner types can be added by committing changes to `.github/scale-config.yml`. 
Example: https://github.com/pytorch/pytorch/pull/70474 + +> NOTE: New runner types can only be used once the changes to `.github/scale-config.yml` have made their way into the default branch diff --git a/.github/scripts/build_publish_nightly_docker.sh b/.github/scripts/build_publish_nightly_docker.sh index 55c764596eb1..db84704aa3e4 100644 --- a/.github/scripts/build_publish_nightly_docker.sh +++ b/.github/scripts/build_publish_nightly_docker.sh @@ -1,9 +1,9 @@ -#!/bin/sh +#!/usr/bin/env bash set -xeuo pipefail PYTORCH_DOCKER_TAG=$(git describe --tags --always)-devel -CUDA_VERSION=11.1 +CUDA_VERSION=11.3.1 # Build PyTorch nightly docker make -f docker.Makefile \ @@ -25,18 +25,20 @@ docker tag ghcr.io/pytorch/pytorch-nightly:${PYTORCH_DOCKER_TAG} \ docker tag ghcr.io/pytorch/pytorch-nightly:${PYTORCH_NIGHTLY_COMMIT}-cu${CUDA_VERSION} \ ghcr.io/pytorch/pytorch-nightly:latest -# Push the nightly docker to GitHub Container Registry -echo $GHCR_PAT | docker login ghcr.io -u pytorch --password-stdin -make -f docker.Makefile \ - DOCKER_REGISTRY=ghcr.io \ - DOCKER_ORG=pytorch \ - DOCKER_IMAGE=pytorch-nightly \ - DOCKER_TAG=${PYTORCH_NIGHTLY_COMMIT}-cu${CUDA_VERSION} \ - devel-push - -make -f docker.Makefile \ - DOCKER_REGISTRY=ghcr.io \ - DOCKER_ORG=pytorch \ - DOCKER_IMAGE=pytorch-nightly \ - DOCKER_TAG=latest \ - devel-push +if [[ ${WITH_PUSH:-} == "true" ]]; then + # Push the nightly docker to GitHub Container Registry + echo $GHCR_PAT | docker login ghcr.io -u pytorch --password-stdin + make -f docker.Makefile \ + DOCKER_REGISTRY=ghcr.io \ + DOCKER_ORG=pytorch \ + DOCKER_IMAGE=pytorch-nightly \ + DOCKER_TAG=${PYTORCH_NIGHTLY_COMMIT}-cu${CUDA_VERSION} \ + devel-push + + make -f docker.Makefile \ + DOCKER_REGISTRY=ghcr.io \ + DOCKER_ORG=pytorch \ + DOCKER_IMAGE=pytorch-nightly \ + DOCKER_TAG=latest \ + devel-push +fi diff --git a/.github/scripts/convert_lintrunner_annotations_to_github.py b/.github/scripts/convert_lintrunner_annotations_to_github.py new file mode 100644 index 000000000000..11901bc300e3 --- /dev/null +++ b/.github/scripts/convert_lintrunner_annotations_to_github.py @@ -0,0 +1,63 @@ +import json +import subprocess +import sys + +from enum import Enum +from pathlib import Path +from typing import NamedTuple, Optional + +# From: https://docs.github.com/en/rest/reference/checks +class GitHubAnnotationLevel(str, Enum): + NOTICE = "notice" + WARNING = "warning" + FAILURE = "failure" + + +class GitHubAnnotation(NamedTuple): + path: str + start_line: int + end_line: int + start_column: Optional[int] + end_column: Optional[int] + annotation_level: GitHubAnnotationLevel + message: str + title: Optional[str] + raw_details: Optional[str] + +PYTORCH_ROOT = Path(subprocess.check_output(['git', 'rev-parse', '--show-toplevel']).decode('ascii').strip()) + +annotations = [] +for line in sys.stdin: + lint_message = json.loads(line) + + path = lint_message.get("path") + line = lint_message.get("line") + + + code = lint_message["code"] + severity = lint_message["severity"] + name = lint_message["name"] + description = lint_message.get("description") + + # These fields are required by the GitHub API, but optional in lintrunner. + # If they don't exist, just skip. 
+ if path is None or line is None: + print(f"No path/line for lint: ({code}) {name}", file=sys.stderr) + continue + + # normalize path relative to git root + path = Path(path).relative_to(PYTORCH_ROOT) + + annotations.append(GitHubAnnotation( + path=str(path), + start_line=int(line), + end_line=int(line), + start_column=None, + end_column=None, + annotation_level=GitHubAnnotationLevel.FAILURE, + message=description, + title=f"({code}) {name}", + raw_details=None, + )._asdict()) + +print(json.dumps(annotations), flush=True) diff --git a/.github/scripts/ensure_actions_will_cancel.py b/.github/scripts/ensure_actions_will_cancel.py index a07f4359dd04..c479aefb9fc4 100755 --- a/.github/scripts/ensure_actions_will_cancel.py +++ b/.github/scripts/ensure_actions_will_cancel.py @@ -9,14 +9,8 @@ REPO_ROOT = Path(__file__).resolve().parent.parent.parent WORKFLOWS = REPO_ROOT / ".github" / "workflows" - - -def concurrency_key(filename: Path) -> str: - workflow_name = filename.with_suffix("").name.replace("_", "-") - if workflow_name.startswith("generated-"): - workflow_name = workflow_name[len("generated-"):] - return f"{workflow_name}-${{{{ github.event.pull_request.number || github.sha }}}}" \ - "-${{ github.event_name == 'workflow_dispatch' }}" +EXPECTED_GROUP = "${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}" \ + "-${{ github.event_name == 'workflow_dispatch' }}" def should_check(filename: Path) -> bool: @@ -38,12 +32,19 @@ def should_check(filename: Path) -> bool: errors_found = False files = [f for f in files if should_check(f)] + names = set() for filename in files: with open(filename, "r") as f: data = yaml.safe_load(f) + name = data.get("name") + if name is not None and name in names: + print("ERROR: duplicate workflow name:", name, file=sys.stderr) + errors_found = True + names.add(name) + expected = { - "group": concurrency_key(filename), + "group": EXPECTED_GROUP, "cancel-in-progress": True, } actual = data.get("concurrency", None) diff --git a/.github/scripts/generate_binary_build_matrix.py b/.github/scripts/generate_binary_build_matrix.py index d3aaf1844fe5..1d81f72edd8e 100644 --- a/.github/scripts/generate_binary_build_matrix.py +++ b/.github/scripts/generate_binary_build_matrix.py @@ -10,13 +10,13 @@ * Latest ROCM """ -from typing import Dict, List, Tuple +from typing import Dict, List, Tuple, Optional -CUDA_ARCHES = ["10.2", "11.1", "11.3", "11.5"] +CUDA_ARCHES = ["10.2", "11.3", "11.6"] -ROCM_ARCHES = ["4.3.1", "4.5.2"] +ROCM_ARCHES = ["5.0", "5.1.1"] def arch_type(arch_version: str) -> str: @@ -47,6 +47,8 @@ def arch_type(arch_version: str) -> str: PRE_CXX11_ABI = "pre-cxx11" CXX11_ABI = "cxx11-abi" +RELEASE = "release" +DEBUG = "debug" LIBTORCH_CONTAINER_IMAGES: Dict[Tuple[str, str], str] = { **{ @@ -57,6 +59,14 @@ def arch_type(arch_version: str) -> str: (gpu_arch, CXX11_ABI): f"pytorch/libtorch-cxx11-builder:cuda{gpu_arch}" for gpu_arch in CUDA_ARCHES }, + **{ + (gpu_arch, PRE_CXX11_ABI): f"pytorch/manylinux-builder:rocm{gpu_arch}" + for gpu_arch in ROCM_ARCHES + }, + **{ + (gpu_arch, CXX11_ABI): f"pytorch/libtorch-cxx11-builder:rocm{gpu_arch}" + for gpu_arch in ROCM_ARCHES + }, ("cpu", PRE_CXX11_ABI): "pytorch/manylinux-builder:cpu", ("cpu", CXX11_ABI): "pytorch/libtorch-cxx11-builder:cpu", } @@ -110,28 +120,37 @@ def generate_conda_matrix(os: str) -> List[Dict[str, str]]: return ret -def generate_libtorch_matrix(os: str, abi_version: str) -> List[Dict[str, str]]: - libtorch_variants = [ - "shared-with-deps", - "shared-without-deps", - 
"static-with-deps", - "static-without-deps", - ] +def generate_libtorch_matrix(os: str, abi_version: str, + arches: Optional[List[str]] = None, + libtorch_variants: Optional[List[str]] = None) -> List[Dict[str, str]]: + if arches is None: + arches = ["cpu"] + if os == "linux": + arches += CUDA_ARCHES + arches += ROCM_ARCHES + elif os == "windows": + # We don't build CUDA 10.2 for window see https://github.com/pytorch/pytorch/issues/65648 + arches += list_without(CUDA_ARCHES, ["10.2"]) + + if libtorch_variants is None: + libtorch_variants = [ + "shared-with-deps", + "shared-without-deps", + "static-with-deps", + "static-without-deps", + ] + ret: List[Dict[str, str]] = [] - arches = ["cpu"] - if os == "linux": - arches += CUDA_ARCHES - elif os == "windows": - # We don't build CUDA 10.2 for window see https://github.com/pytorch/pytorch/issues/65648 - arches += list_without(CUDA_ARCHES, ["10.2"]) for arch_version in arches: for libtorch_variant in libtorch_variants: - # We don't currently build libtorch for rocm # one of the values in the following list must be exactly # CXX11_ABI, but the precise value of the other one doesn't # matter gpu_arch_type = arch_type(arch_version) gpu_arch_version = "" if arch_version == "cpu" else arch_version + # ROCm builds without-deps failed even in ROCm runners; skip for now + if gpu_arch_type == "rocm" and "without-deps" in libtorch_variant: + continue ret.append( { "gpu_arch_type": gpu_arch_type, @@ -140,10 +159,11 @@ def generate_libtorch_matrix(os: str, abi_version: str) -> List[Dict[str, str]]: gpu_arch_type, gpu_arch_version ), "libtorch_variant": libtorch_variant, - "devtoolset": abi_version, + "libtorch_config": abi_version if os == "windows" else "", + "devtoolset": abi_version if os != "windows" else "", "container_image": LIBTORCH_CONTAINER_IMAGES[ (arch_version, abi_version) - ], + ] if os != "windows" else "", "package_type": "libtorch", "build_name": f"libtorch-{gpu_arch_type}{gpu_arch_version}-{libtorch_variant}-{abi_version}".replace( ".", "_" @@ -153,19 +173,29 @@ def generate_libtorch_matrix(os: str, abi_version: str) -> List[Dict[str, str]]: return ret -def generate_wheels_matrix(os: str) -> List[Dict[str, str]]: - arches = ["cpu"] +def generate_wheels_matrix(os: str, + arches: Optional[List[str]] = None, + python_versions: Optional[List[str]] = None) -> List[Dict[str, str]]: package_type = "wheel" - python_versions = FULL_PYTHON_VERSIONS if os == "linux": - arches += CUDA_ARCHES + ROCM_ARCHES # NOTE: We only build manywheel packages for linux package_type = "manywheel" - elif os == "windows": - # We don't build CUDA 10.2 for window see https://github.com/pytorch/pytorch/issues/65648 - arches += list_without(CUDA_ARCHES, ["10.2"]) - elif os == "macos-arm64": - python_versions = list_without(python_versions, ["3.7"]) + + if python_versions is None: + # Define default python version + python_versions = FULL_PYTHON_VERSIONS + if os == "macos-arm64": + python_versions = list_without(python_versions, ["3.7"]) + + if arches is None: + # Define default compute archivectures + arches = ["cpu"] + if os == "linux": + arches += CUDA_ARCHES + ROCM_ARCHES + elif os == "windows": + # We don't build CUDA 10.2 for window see https://github.com/pytorch/pytorch/issues/65648 + arches += list_without(CUDA_ARCHES, ["10.2"]) + ret: List[Dict[str, str]] = [] for python_version in python_versions: for arch_version in arches: diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index f90690f2f952..c8b815bf0180 100755 --- 
a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -5,98 +5,24 @@ from typing import Dict, Set, List, Iterable import jinja2 -import json + import os import sys -from typing_extensions import Literal +from typing_extensions import Literal, TypedDict import generate_binary_build_matrix # type: ignore[import] -YamlShellBool = Literal["''", 1] Arch = Literal["windows", "linux", "macos"] -DOCKER_REGISTRY = "308535385114.dkr.ecr.us-east-1.amazonaws.com" GITHUB_DIR = Path(__file__).resolve().parent.parent -WINDOWS_CPU_TEST_RUNNER = "windows.4xlarge" -# contains 1 gpu -WINDOWS_CUDA_TEST_RUNNER = "windows.8xlarge.nvidia.gpu" -WINDOWS_RUNNERS = { - WINDOWS_CPU_TEST_RUNNER, - WINDOWS_CUDA_TEST_RUNNER, -} - -LINUX_CPU_TEST_RUNNER = "linux.2xlarge" -# contains 1 gpu -LINUX_CUDA_TEST_RUNNER = "linux.4xlarge.nvidia.gpu" -# contains at least 2 gpus -LINUX_ROCM_TEST_RUNNER = "linux.rocm.gpu" -LINUX_RUNNERS = { - LINUX_CPU_TEST_RUNNER, - LINUX_CUDA_TEST_RUNNER, - LINUX_ROCM_TEST_RUNNER, -} - -LINUX_DISTRIBUTED_GPU_RUNNERS = { - LINUX_CUDA_TEST_RUNNER : "linux.8xlarge.nvidia.gpu", - LINUX_ROCM_TEST_RUNNER : LINUX_ROCM_TEST_RUNNER, -} - -LINUX_MULTIGPU_RUNNERS = { - LINUX_CUDA_TEST_RUNNER : "linux.16xlarge.nvidia.gpu", - LINUX_ROCM_TEST_RUNNER : LINUX_ROCM_TEST_RUNNER, -} - -MACOS_TEST_RUNNER_10_15 = "macos-10.15" -MACOS_TEST_RUNNER_11 = "macos-11" - -MACOS_RUNNERS = { - MACOS_TEST_RUNNER_10_15, - MACOS_TEST_RUNNER_11, -} - -CUDA_RUNNERS = { - WINDOWS_CUDA_TEST_RUNNER, - LINUX_CUDA_TEST_RUNNER, -} -ROCM_RUNNERS = { - LINUX_ROCM_TEST_RUNNER, -} -CPU_RUNNERS = { - WINDOWS_CPU_TEST_RUNNER, - LINUX_CPU_TEST_RUNNER, -} - -LABEL_CIFLOW_ALL = "ciflow/all" -LABEL_CIFLOW_BAZEL = "ciflow/bazel" -LABEL_CIFLOW_CPU = "ciflow/cpu" -LABEL_CIFLOW_CUDA = "ciflow/cuda" -LABEL_CIFLOW_ROCM = "ciflow/rocm" -LABEL_CIFLOW_DOCS = "ciflow/docs" -LABEL_CIFLOW_DEFAULT = "ciflow/default" -LABEL_CIFLOW_LIBTORCH = "ciflow/libtorch" -LABEL_CIFLOW_LINUX = "ciflow/linux" -LABEL_CIFLOW_MOBILE = "ciflow/mobile" -LABEL_CIFLOW_ANDROID = "ciflow/android" -LABEL_CIFLOW_SANITIZERS = "ciflow/sanitizers" -LABEL_CIFLOW_ONNX = "ciflow/onnx" -LABEL_CIFLOW_SCHEDULED = "ciflow/scheduled" -LABEL_CIFLOW_SLOW = "ciflow/slow" -LABEL_CIFLOW_WIN = "ciflow/win" -LABEL_CIFLOW_XLA = "ciflow/xla" -LABEL_CIFLOW_NOARCH = "ciflow/noarch" -LABEL_CIFLOW_VULKAN = "ciflow/vulkan" -LABEL_CIFLOW_PREFIX = "ciflow/" -LABEL_CIFLOW_SLOW_GRADCHECK = "ciflow/slow-gradcheck" -LABEL_CIFLOW_DOCKER = "ciflow/docker" -LABEL_CIFLOW_IOS = "ciflow/ios" -LABEL_CIFLOW_MACOS = "ciflow/macos" LABEL_CIFLOW_TRUNK = "ciflow/trunk" +LABEL_CIFLOW_ALL = "ciflow/all" LABEL_CIFLOW_BINARIES = "ciflow/binaries" -LABEL_CIFLOW_BINARIES_WHEEL = "ciflow/binaries_wheel" -LABEL_CIFLOW_BINARIES_CONDA = "ciflow/binaries_conda" +LABEL_CIFLOW_PERIODIC = "ciflow/periodic" LABEL_CIFLOW_BINARIES_LIBTORCH = "ciflow/binaries_libtorch" - +LABEL_CIFLOW_BINARIES_CONDA = "ciflow/binaries_conda" +LABEL_CIFLOW_BINARIES_WHEEL = "ciflow/binaries_wheel" @dataclass class CIFlowConfig: @@ -109,175 +35,12 @@ class CIFlowConfig: def __post_init__(self) -> None: if not self.isolated_workflow: self.labels.add(LABEL_CIFLOW_ALL) - if LABEL_CIFLOW_SCHEDULED not in self.labels: + if LABEL_CIFLOW_PERIODIC not in self.labels: self.labels.add(LABEL_CIFLOW_TRUNK) - assert all(label.startswith(LABEL_CIFLOW_PREFIX) for label in self.labels) - - -@dataclass -class CIFlowRuleset: - version = 'v1' - output_file = f'{GITHUB_DIR}/generated-ciflow-ruleset.json' - label_rules: Dict[str, Set[str]] = 
field(default_factory=dict) - - def add_label_rule(self, labels: Set[str], workflow_name: str) -> None: - for label in labels: - if label in self.label_rules: - self.label_rules[label].add(workflow_name) - else: - self.label_rules[label] = {workflow_name} - - def generate_json(self) -> None: - GENERATED = "generated" # Note that please keep the variable GENERATED otherwise phabricator will hide the whole file - output = { - "__comment": f"@{GENERATED} DO NOT EDIT MANUALLY, Generation script: .github/scripts/generate_ci_workflows.py", - "version": self.version, - "label_rules": { - label: sorted(list(workflows)) - for label, workflows in self.label_rules.items() - } - } - with open(self.output_file, 'w') as outfile: - json.dump(output, outfile, indent=2, sort_keys=True) - outfile.write('\n') - -@dataclass -class CIWorkflow: - # Required fields - arch: Arch - build_environment: str - - # Optional fields - test_runner_type: str = '' - multigpu_runner_type: str = '' - distributed_gpu_runner_type: str = '' - ciflow_config: CIFlowConfig = field(default_factory=CIFlowConfig) - cuda_version: str = '' - docker_image_base: str = '' - enable_doc_jobs: bool = False - exclude_test: bool = False - build_generates_artifacts: bool = True - build_with_debug: bool = False - is_scheduled: str = '' - is_default: bool = False - num_test_shards: int = 1 - only_run_smoke_tests_on_pull_request: bool = False - num_test_shards_on_pull_request: int = -1 - distributed_test: bool = True - timeout_after: int = 240 - xcode_version: str = '' - only_on_pr: bool = False - ios_arch: str = '' - ios_platform: str = '' - - # The following variables will be set as environment variables, - # so it's easier for both shell and Python scripts to consume it if false is represented as the empty string. - enable_jit_legacy_test: YamlShellBool = "''" - enable_distributed_test: YamlShellBool = "''" - enable_multigpu_test: YamlShellBool = "''" - enable_nogpu_no_avx_test: YamlShellBool = "''" - enable_nogpu_no_avx2_test: YamlShellBool = "''" - enable_slow_test: YamlShellBool = "''" - enable_docs_test: YamlShellBool = "''" - enable_backwards_compat_test: YamlShellBool = "''" - enable_xla_test: YamlShellBool = "''" - enable_noarch_test: YamlShellBool = "''" - enable_force_on_cpu_test: YamlShellBool = "''" - - def __post_init__(self) -> None: - if not self.build_generates_artifacts: - self.exclude_test = True - - if self.distributed_test: - self.enable_distributed_test = 1 - - self.multigpu_runner_type = LINUX_MULTIGPU_RUNNERS.get(self.test_runner_type, "linux.16xlarge.nvidia.gpu") - self.distributed_gpu_runner_type = LINUX_DISTRIBUTED_GPU_RUNNERS.get(self.test_runner_type, "linux.8xlarge.nvidia.gpu") - - if LABEL_CIFLOW_DEFAULT in self.ciflow_config.labels: - self.is_default = True - - # If num_test_shards_on_pull_request is not user-defined, default to num_test_shards unless we are - # only running smoke tests on the pull request. 
- if self.num_test_shards_on_pull_request == -1: - # Don't run the default if we are only running smoke tests - if self.only_run_smoke_tests_on_pull_request: - self.num_test_shards_on_pull_request = 0 - else: - self.num_test_shards_on_pull_request = self.num_test_shards - self.assert_valid() - - def assert_valid(self) -> None: - err_message = f"invalid test_runner_type for {self.arch}: {self.test_runner_type}" - if self.arch == 'linux': - assert self.test_runner_type in LINUX_RUNNERS, err_message - if self.arch == 'windows': - assert self.test_runner_type in WINDOWS_RUNNERS, err_message - - if not self.ciflow_config.isolated_workflow: - assert LABEL_CIFLOW_ALL in self.ciflow_config.labels - if self.arch == 'linux': - assert LABEL_CIFLOW_LINUX in self.ciflow_config.labels - if self.arch == 'windows': - assert LABEL_CIFLOW_WIN in self.ciflow_config.labels - if self.arch == 'macos': - assert LABEL_CIFLOW_MACOS in self.ciflow_config.labels - # Make sure that jobs with tests have a test_runner_type - if not self.exclude_test: - assert self.test_runner_type != '' - if self.test_runner_type in CUDA_RUNNERS: - assert LABEL_CIFLOW_CUDA in self.ciflow_config.labels - if self.test_runner_type in ROCM_RUNNERS: - assert LABEL_CIFLOW_ROCM in self.ciflow_config.labels - if self.test_runner_type in CPU_RUNNERS and not self.exclude_test: - assert LABEL_CIFLOW_CPU in self.ciflow_config.labels - if self.is_scheduled: - assert LABEL_CIFLOW_DEFAULT not in self.ciflow_config.labels - assert LABEL_CIFLOW_TRUNK not in self.ciflow_config.labels - assert LABEL_CIFLOW_SCHEDULED in self.ciflow_config.labels - if self.build_with_debug: - assert self.build_environment.endswith("-debug") - - def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: - output_file_path = GITHUB_DIR / f"workflows/generated-{self.build_environment}.yml" - with open(output_file_path, "w") as output_file: - GENERATED = "generated" # Note that please keep the variable GENERATED otherwise phabricator will hide the whole file - output_file.writelines([f"# @{GENERATED} DO NOT EDIT MANUALLY\n"]) - try: - content = workflow_template.render(asdict(self)) - except Exception as e: - print(f"Failed on template: {workflow_template}", file=sys.stderr) - raise e - output_file.write(content) - if content[-1] != "\n": - output_file.write("\n") - print(output_file_path) - -@dataclass -class DockerWorkflow: - build_environment: str - docker_images: List[str] - - # Optional fields - ciflow_config: CIFlowConfig = field(default_factory=CIFlowConfig) - cuda_version: str = '' - is_scheduled: str = '' - - def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: - output_file_path = GITHUB_DIR / "workflows/generated-docker-builds.yml" - with open(output_file_path, "w") as output_file: - GENERATED = "generated" # Note that please keep the variable GENERATED otherwise phabricator will hide the whole file - output_file.writelines([f"# @{GENERATED} DO NOT EDIT MANUALLY\n"]) - try: - content = workflow_template.render(asdict(self)) - except Exception as e: - print(f"Failed on template: {workflow_template}", file=sys.stderr) - raise e - output_file.write(content) - if content[-1] != "\n": - output_file.write("\n") - print(output_file_path) +class Config(TypedDict): + num_shards: int + runner: str @dataclass class BinaryBuildWorkflow: @@ -290,6 +53,7 @@ class BinaryBuildWorkflow: abi_version: str = '' ciflow_config: CIFlowConfig = field(default_factory=CIFlowConfig) is_scheduled: str = '' + branches: str = 'nightly' # Mainly for 
macos cross_compile_arm64: bool = False xcode_version: str = '' @@ -301,7 +65,7 @@ def __post_init__(self) -> None: self.build_environment = f"{self.os}-binary-{self.package_type}" def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: - output_file_path = GITHUB_DIR / f"workflows/generated-{self.build_environment}.yml" + output_file_path = GITHUB_DIR / f"workflows/generated-{self.build_environment}-{self.branches}.yml" with open(output_file_path, "w") as output_file: GENERATED = "generated" # Note that please keep the variable GENERATED otherwise phabricator will hide the whole file output_file.writelines([f"# @{GENERATED} DO NOT EDIT MANUALLY\n"]) @@ -315,543 +79,6 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: output_file.write("\n") print(output_file_path) -WINDOWS_WORKFLOWS = [ - CIWorkflow( - arch="windows", - build_environment="win-vs2019-cpu-py3", - cuda_version="cpu", - test_runner_type=WINDOWS_CPU_TEST_RUNNER, - num_test_shards=2, - ciflow_config=CIFlowConfig( - run_on_canary=True, - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_CPU, LABEL_CIFLOW_WIN} - ), - ), - CIWorkflow( - arch="windows", - build_environment="win-vs2019-cuda11.3-py3", - cuda_version="11.3", - test_runner_type=WINDOWS_CUDA_TEST_RUNNER, - num_test_shards=2, - only_run_smoke_tests_on_pull_request=True, - enable_force_on_cpu_test=1, - ciflow_config=CIFlowConfig( - run_on_canary=True, - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_CUDA, LABEL_CIFLOW_WIN} - ), - ), - CIWorkflow( - arch="windows", - build_environment="periodic-win-vs2019-cuda11.5-py3", - cuda_version="11.5", - test_runner_type=WINDOWS_CUDA_TEST_RUNNER, - num_test_shards=2, - enable_force_on_cpu_test=1, - is_scheduled="45 4,10,16,22 * * *", - ciflow_config=CIFlowConfig( - run_on_canary=True, - labels={LABEL_CIFLOW_SCHEDULED, LABEL_CIFLOW_CUDA, LABEL_CIFLOW_WIN} - ), - ), - CIWorkflow( - arch="windows", - build_environment="periodic-win-vs2019-cuda11.1-py3", - cuda_version="11.1", - test_runner_type=WINDOWS_CUDA_TEST_RUNNER, - num_test_shards=2, - is_scheduled="45 0,4,8,12,16,20 * * *", - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_SCHEDULED, LABEL_CIFLOW_WIN, LABEL_CIFLOW_CUDA} - ), - ), -] - -LINUX_WORKFLOWS = [ - CIWorkflow( - arch="linux", - build_environment="linux-xenial-py3.7-gcc5.4", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.7-gcc5.4", - test_runner_type=LINUX_CPU_TEST_RUNNER, - enable_jit_legacy_test=1, - enable_backwards_compat_test=1, - enable_docs_test=1, - num_test_shards=2, - ciflow_config=CIFlowConfig( - run_on_canary=True, - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU} - ), - ), - CIWorkflow( - arch="linux", - build_environment="linux-docs", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.7-gcc5.4", - test_runner_type=LINUX_CPU_TEST_RUNNER, - enable_doc_jobs=True, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_DOCS, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU} - ), - ), - CIWorkflow( - arch="linux", - build_environment="linux-docs-push", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.7-gcc5.4", - test_runner_type=LINUX_CPU_TEST_RUNNER, - enable_doc_jobs=True, - exclude_test=True, - is_scheduled="0 0 * * *", # run pushes only on a nightly schedule - # NOTE: This is purposefully left without LABEL_CIFLOW_DOCS so that you can run - # docs builds on your PR without the fear of anything pushing - ciflow_config=CIFlowConfig( - 
labels={LABEL_CIFLOW_SCHEDULED, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU} - ), - ), - CIWorkflow( - arch="linux", - build_environment="linux-xenial-py3.7-gcc7", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.7-gcc7", - test_runner_type=LINUX_CPU_TEST_RUNNER, - num_test_shards=2, - ciflow_config=CIFlowConfig( - run_on_canary=True, - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU} - ), - ), - # ParallelTBB does not have a maintainer and is currently flaky - # CIWorkflow( - # arch="linux", - # build_environment="paralleltbb-linux-xenial-py3.6-gcc5.4", - # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4", - # test_runner_type=LINUX_CPU_TEST_RUNNER, - # ciflow_config=CIFlowConfig( - # labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU}, - # ), - # ), - CIWorkflow( - arch="linux", - build_environment="parallelnative-linux-xenial-py3.7-gcc5.4", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.7-gcc5.4", - test_runner_type=LINUX_CPU_TEST_RUNNER, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU}, - ), - ), - # Build PyTorch with BUILD_CAFFE2=ON - CIWorkflow( - arch="linux", - build_environment="caffe2-linux-xenial-py3.7-gcc5.4", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.7-gcc5.4", - test_runner_type=LINUX_CPU_TEST_RUNNER, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU}, - ), - ), - CIWorkflow( - arch="linux", - build_environment="linux-xenial-py3-clang5-mobile-build", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-asan", - test_runner_type=LINUX_CPU_TEST_RUNNER, - build_generates_artifacts=False, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_MOBILE, LABEL_CIFLOW_DEFAULT}, - ), - ), - CIWorkflow( - arch="linux", - build_environment="linux-xenial-py3-clang5-mobile-custom-build-static", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c", - test_runner_type=LINUX_CPU_TEST_RUNNER, - build_generates_artifacts=False, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_MOBILE, LABEL_CIFLOW_DEFAULT}, - ), - ), - CIWorkflow( - arch="linux", - build_environment="linux-xenial-py3.7-clang7-asan", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang7-asan", - test_runner_type=LINUX_CPU_TEST_RUNNER, - num_test_shards=3, - distributed_test=False, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_SANITIZERS, LABEL_CIFLOW_CPU}, - ), - ), - CIWorkflow( - arch="linux", - build_environment="linux-xenial-py3.7-clang7-onnx", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang7-onnx", - test_runner_type=LINUX_CPU_TEST_RUNNER, - num_test_shards=2, - distributed_test=False, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_ONNX, LABEL_CIFLOW_CPU}, - ), - ), - CIWorkflow( - arch="linux", - build_environment="linux-bionic-cuda10.2-py3.9-gcc7", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7", - test_runner_type=LINUX_CUDA_TEST_RUNNER, - enable_jit_legacy_test=1, - enable_multigpu_test=1, - enable_nogpu_no_avx_test=1, - enable_nogpu_no_avx2_test=1, - enable_slow_test=1, - num_test_shards=2, - ciflow_config=CIFlowConfig( - run_on_canary=True, - labels={LABEL_CIFLOW_SLOW, 
LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA} - ), - ), - CIWorkflow( - arch="linux", - build_environment="libtorch-linux-xenial-cuda10.2-py3.7-gcc7", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7", - test_runner_type=LINUX_CUDA_TEST_RUNNER, - build_generates_artifacts=False, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels=set([LABEL_CIFLOW_LIBTORCH, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA]), - ), - ), - CIWorkflow( - arch="linux", - build_environment="periodic-linux-bionic-cuda11.5-py3.7-gcc7", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-cuda11.5-cudnn8-py3-gcc7", - test_runner_type=LINUX_CUDA_TEST_RUNNER, - num_test_shards=2, - is_scheduled="45 4,10,16,22 * * *", - ciflow_config=CIFlowConfig( - labels=set([LABEL_CIFLOW_SCHEDULED, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA]), - ), - ), - CIWorkflow( - arch="linux", - build_environment="periodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-cuda11.5-cudnn8-py3-gcc7", - test_runner_type=LINUX_CUDA_TEST_RUNNER, - build_generates_artifacts=False, - is_scheduled="45 4,10,16,22 * * *", - exclude_test=True, - ciflow_config=CIFlowConfig( - labels=set([LABEL_CIFLOW_SCHEDULED, LABEL_CIFLOW_LIBTORCH, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA]), - ), - ), - CIWorkflow( - arch="linux", - build_environment="linux-xenial-cuda11.3-py3.7-gcc7", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7", - test_runner_type=LINUX_CUDA_TEST_RUNNER, - num_test_shards=2, - ciflow_config=CIFlowConfig( - labels=set([LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA]), - ), - ), - # no-ops builds test USE_PER_OPERATOR_HEADERS=0 where ATen/ops is not generated - CIWorkflow( - arch="linux", - build_environment="linux-xenial-cuda11.3-py3.7-gcc7-no-ops", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7", - test_runner_type=LINUX_CUDA_TEST_RUNNER, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels=set([LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA]), - ), - ), - CIWorkflow( - arch="linux", - build_environment="linux-xenial-py3.7-gcc7-no-ops", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.7-gcc7", - test_runner_type=LINUX_CPU_TEST_RUNNER, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels=set([LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU]), - ), - ), - CIWorkflow( - arch="linux", - build_environment="linux-bionic-rocm4.5-py3.7", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-rocm4.5-py3.7", - test_runner_type=LINUX_ROCM_TEST_RUNNER, - num_test_shards=2, - ciflow_config=CIFlowConfig( - labels=set([LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_ROCM]), - ), - ), - CIWorkflow( - arch="linux", - build_environment="libtorch-linux-xenial-cuda11.3-py3.7-gcc7", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7", - test_runner_type=LINUX_CUDA_TEST_RUNNER, - build_generates_artifacts=False, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels=set([LABEL_CIFLOW_LIBTORCH, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA]), - ), - ), - CIWorkflow( - arch="linux", - build_environment="periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7", - test_runner_type=LINUX_CUDA_TEST_RUNNER, - num_test_shards=2, - build_with_debug=True, - is_scheduled="45 
0,4,8,12,16,20 * * *", - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_SCHEDULED, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA} - ), - ), - CIWorkflow( - arch="linux", - build_environment="periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7", - test_runner_type=LINUX_CUDA_TEST_RUNNER, - build_generates_artifacts=False, - exclude_test=True, - is_scheduled="45 0,4,8,12,16,20 * * *", - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_SCHEDULED, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_LIBTORCH, LABEL_CIFLOW_CUDA}, - ), - ), - CIWorkflow( - arch="linux", - build_environment="linux-bionic-py3.7-clang9", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.7-clang9", - test_runner_type=LINUX_CPU_TEST_RUNNER, - num_test_shards=2, - distributed_test=False, - enable_noarch_test=1, - enable_xla_test=1, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU, LABEL_CIFLOW_XLA, LABEL_CIFLOW_NOARCH}, - ), - ), - CIWorkflow( - arch="linux", - build_environment="linux-vulkan-bionic-py3.7-clang9", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.7-clang9", - test_runner_type=LINUX_CPU_TEST_RUNNER, - num_test_shards=1, - distributed_test=False, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU, LABEL_CIFLOW_VULKAN}, - ), - ), - CIWorkflow( - arch="linux", - build_environment="periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7", - test_runner_type=LINUX_CUDA_TEST_RUNNER, - num_test_shards=2, - distributed_test=False, - timeout_after=360, - # Only run this on master 4 times per day since it does take a while - is_scheduled="0 */4 * * *", - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA, LABEL_CIFLOW_SLOW_GRADCHECK, LABEL_CIFLOW_SLOW, LABEL_CIFLOW_SCHEDULED}, - ), - ), -] - -ANDROID_SHORT_WORKFLOWS = [ - CIWorkflow( - arch="linux", - build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c", - test_runner_type=LINUX_CPU_TEST_RUNNER, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU, LABEL_CIFLOW_ANDROID, LABEL_CIFLOW_DEFAULT}, - ), - ), - CIWorkflow( - arch="linux", - build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c", - test_runner_type=LINUX_CPU_TEST_RUNNER, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU, LABEL_CIFLOW_ANDROID, LABEL_CIFLOW_DEFAULT}, - ), - ), -] - -ANDROID_WORKFLOWS = [ - CIWorkflow( - arch="linux", - build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c", - test_runner_type=LINUX_CPU_TEST_RUNNER, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU, LABEL_CIFLOW_ANDROID}, - ), - ), -] - -BAZEL_WORKFLOWS = [ - CIWorkflow( - arch="linux", - build_environment="linux-xenial-cuda11.3-py3.7-gcc7-bazel-test", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7", - 
test_runner_type=LINUX_CPU_TEST_RUNNER, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BAZEL, LABEL_CIFLOW_CPU, LABEL_CIFLOW_LINUX}, - ), - ), -] - -IOS_WORKFLOWS = [ - CIWorkflow( - arch="macos", - build_environment="ios-12-5-1-arm64", - ios_arch="arm64", - ios_platform="OS", - test_runner_type=MACOS_TEST_RUNNER_10_15, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_IOS, LABEL_CIFLOW_MACOS}, - ), - ), - CIWorkflow( - arch="macos", - build_environment="ios-12-5-1-arm64-coreml", - ios_arch="arm64", - ios_platform="OS", - test_runner_type=MACOS_TEST_RUNNER_10_15, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_IOS, LABEL_CIFLOW_MACOS}, - ), - ), - CIWorkflow( - arch="macos", - build_environment="ios-12-5-1-arm64-full-jit", - ios_arch="arm64", - ios_platform="OS", - test_runner_type=MACOS_TEST_RUNNER_10_15, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_IOS, LABEL_CIFLOW_MACOS}, - ), - ), - CIWorkflow( - arch="macos", - build_environment="ios-12-5-1-arm64-custom-ops", - ios_arch="arm64", - ios_platform="OS", - test_runner_type=MACOS_TEST_RUNNER_10_15, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_IOS, LABEL_CIFLOW_MACOS}, - ), - ), - CIWorkflow( - arch="macos", - build_environment="ios-12-5-1-arm64-metal", - ios_arch="arm64", - ios_platform="OS", - test_runner_type=MACOS_TEST_RUNNER_10_15, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_IOS, LABEL_CIFLOW_MACOS}, - ), - ), - CIWorkflow( - arch="macos", - build_environment="ios-12-5-1-x86-64", - ios_arch="x86_64", - ios_platform="SIMULATOR", - test_runner_type=MACOS_TEST_RUNNER_10_15, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_IOS, LABEL_CIFLOW_MACOS}, - ), - ), - CIWorkflow( - arch="macos", - build_environment="ios-12-5-1-x86-64-coreml", - ios_arch="x86_64", - ios_platform="SIMULATOR", - test_runner_type=MACOS_TEST_RUNNER_10_15, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_IOS, LABEL_CIFLOW_MACOS}, - ), - ), - CIWorkflow( - arch="macos", - build_environment="ios-12-5-1-x86-64-full-jit", - ios_arch="x86_64", - ios_platform="SIMULATOR", - test_runner_type=MACOS_TEST_RUNNER_10_15, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_IOS, LABEL_CIFLOW_MACOS}, - ), - ), -] - -MACOS_WORKFLOWS = [ - # Distributed tests are still run on MacOS, but part of regular shards - CIWorkflow( - arch="macos", - build_environment="macos-11-py3-x86-64", - xcode_version="12.4", - test_runner_type=MACOS_TEST_RUNNER_11, - num_test_shards=2, - distributed_test=False, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_MACOS}, - ), - ), - CIWorkflow( - arch="macos", - build_environment="macos-10-15-py3-lite-interpreter-x86-64", - xcode_version="12", - test_runner_type=MACOS_TEST_RUNNER_10_15, - exclude_test=True, - build_generates_artifacts=False, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_MACOS}, - ), - ), - CIWorkflow( - arch="macos", - build_environment="macos-10-15-py3-arm64", - test_runner_type=MACOS_TEST_RUNNER_10_15, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_MACOS}, - ), - ), -] - -DOCKER_IMAGES = { - f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.7-clang9", # for pytorch/xla - f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-rocm4.3.1-py3.7", # for rocm - f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-rocm4.5-py3.7", # for rocm -} - -DOCKER_IMAGES.update({ 
- workflow.docker_image_base - for workflow in [*LINUX_WORKFLOWS, *BAZEL_WORKFLOWS, *ANDROID_WORKFLOWS] - if workflow.docker_image_base -}) - -DOCKER_WORKFLOWS = [ - DockerWorkflow( - build_environment="docker-builds", - docker_images=sorted(DOCKER_IMAGES), - # Run every Wednesday at 3:01am to ensure they can build - is_scheduled="1 3 * * 3", - ), -] - class OperatingSystem: LINUX = "linux" WINDOWS = "windows" @@ -864,7 +91,7 @@ class OperatingSystem: package_type="manywheel", build_configs=generate_binary_build_matrix.generate_wheels_matrix(OperatingSystem.LINUX), ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL}, + labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL}, isolated_workflow=True, ), ), @@ -873,7 +100,7 @@ class OperatingSystem: package_type="conda", build_configs=generate_binary_build_matrix.generate_conda_matrix(OperatingSystem.LINUX), ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_CONDA}, + labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_CONDA}, isolated_workflow=True, ), ), @@ -885,7 +112,7 @@ class OperatingSystem: OperatingSystem.LINUX, generate_binary_build_matrix.CXX11_ABI ), ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH}, + labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH}, isolated_workflow=True, ), ), @@ -897,58 +124,123 @@ class OperatingSystem: OperatingSystem.LINUX, generate_binary_build_matrix.PRE_CXX11_ABI ), ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH}, + labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH}, isolated_workflow=True, ), ), ] +LINUX_BINARY_SMOKE_WORKFLOWS = [ + BinaryBuildWorkflow( + os=OperatingSystem.LINUX, + package_type="manywheel", + build_configs=generate_binary_build_matrix.generate_wheels_matrix( + OperatingSystem.LINUX, + arches=["10.2"], + python_versions=["3.7"]), + branches="master", + ), + BinaryBuildWorkflow( + os=OperatingSystem.LINUX, + package_type="libtorch", + abi_version=generate_binary_build_matrix.CXX11_ABI, + build_configs=generate_binary_build_matrix.generate_libtorch_matrix( + OperatingSystem.LINUX, generate_binary_build_matrix.CXX11_ABI, + arches=["cpu"], + libtorch_variants=["shared-with-deps"], + ), + branches="master", + ), + BinaryBuildWorkflow( + os=OperatingSystem.LINUX, + package_type="libtorch", + abi_version=generate_binary_build_matrix.PRE_CXX11_ABI, + build_configs=generate_binary_build_matrix.generate_libtorch_matrix( + OperatingSystem.LINUX, generate_binary_build_matrix.CXX11_ABI, + arches=["cpu"], + libtorch_variants=["shared-with-deps"], + ), + branches="master", + ), +] + WINDOWS_BINARY_BUILD_WORKFLOWS = [ BinaryBuildWorkflow( os=OperatingSystem.WINDOWS, package_type="wheel", build_configs=generate_binary_build_matrix.generate_wheels_matrix(OperatingSystem.WINDOWS), ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL}, + labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL}, + isolated_workflow=True, + ), + ), + BinaryBuildWorkflow( + os=OperatingSystem.WINDOWS, + package_type="conda", + build_configs=generate_binary_build_matrix.generate_conda_matrix(OperatingSystem.WINDOWS), + ciflow_config=CIFlowConfig( + labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_CONDA}, isolated_workflow=True, ), ), - # NOTE: conda binaries are currently bugged on the 
installation step - # See, https://github.com/pytorch/pytorch/pull/71484#issuecomment-1022617195 - # BinaryBuildWorkflow( - # os=OperatingSystem.WINDOWS, - # package_type="conda", - # build_configs=generate_binary_build_matrix.generate_conda_matrix(OperatingSystem.WINDOWS), - # ciflow_config=CIFlowConfig( - # labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_CONDA}, - # isolated_workflow=True, - # ), - # ), BinaryBuildWorkflow( os=OperatingSystem.WINDOWS, package_type="libtorch", - abi_version=generate_binary_build_matrix.CXX11_ABI, + abi_version=generate_binary_build_matrix.RELEASE, build_configs=generate_binary_build_matrix.generate_libtorch_matrix( - OperatingSystem.WINDOWS, generate_binary_build_matrix.CXX11_ABI + OperatingSystem.WINDOWS, generate_binary_build_matrix.RELEASE ), ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH}, + labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH}, isolated_workflow=True, ), ), BinaryBuildWorkflow( os=OperatingSystem.WINDOWS, package_type="libtorch", - abi_version=generate_binary_build_matrix.PRE_CXX11_ABI, + abi_version=generate_binary_build_matrix.DEBUG, build_configs=generate_binary_build_matrix.generate_libtorch_matrix( - OperatingSystem.WINDOWS, generate_binary_build_matrix.PRE_CXX11_ABI + OperatingSystem.WINDOWS, generate_binary_build_matrix.DEBUG ), ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH}, + labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH}, isolated_workflow=True, ), ), ] +WINDOWS_BINARY_SMOKE_WORKFLOWS = [ + BinaryBuildWorkflow( + os=OperatingSystem.WINDOWS, + package_type="wheel", + build_configs=generate_binary_build_matrix.generate_wheels_matrix( + OperatingSystem.WINDOWS, + arches=["11.3"], + python_versions=["3.7"]), + branches="master", + ), + BinaryBuildWorkflow( + os=OperatingSystem.WINDOWS, + package_type="libtorch", + abi_version=generate_binary_build_matrix.RELEASE, + build_configs=generate_binary_build_matrix.generate_libtorch_matrix( + OperatingSystem.WINDOWS, generate_binary_build_matrix.RELEASE, + arches=["cpu"], + libtorch_variants=["shared-with-deps"], + ), + branches="master", + ), + BinaryBuildWorkflow( + os=OperatingSystem.WINDOWS, + package_type="libtorch", + abi_version=generate_binary_build_matrix.DEBUG, + build_configs=generate_binary_build_matrix.generate_libtorch_matrix( + OperatingSystem.WINDOWS, generate_binary_build_matrix.DEBUG, + arches=["cpu"], + libtorch_variants=["shared-with-deps"], + ), + branches="master", + ), +] MACOS_BINARY_BUILD_WORKFLOWS = [ BinaryBuildWorkflow( @@ -956,7 +248,7 @@ class OperatingSystem: package_type="wheel", build_configs=generate_binary_build_matrix.generate_wheels_matrix(OperatingSystem.MACOS), ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL}, + labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL}, isolated_workflow=True, ), ), @@ -965,7 +257,7 @@ class OperatingSystem: package_type="conda", build_configs=generate_binary_build_matrix.generate_conda_matrix(OperatingSystem.MACOS), ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_CONDA}, + labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_CONDA}, isolated_workflow=True, ), ), @@ -977,7 +269,7 @@ class OperatingSystem: OperatingSystem.MACOS, generate_binary_build_matrix.CXX11_ABI ), ciflow_config=CIFlowConfig( - 
labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH}, + labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH}, isolated_workflow=True, ), ), @@ -989,7 +281,7 @@ class OperatingSystem: OperatingSystem.MACOS, generate_binary_build_matrix.PRE_CXX11_ABI ), ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH}, + labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH}, isolated_workflow=True, ), ), @@ -999,7 +291,7 @@ class OperatingSystem: build_configs=generate_binary_build_matrix.generate_wheels_matrix(OperatingSystem.MACOS), cross_compile_arm64=True, ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL}, + labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL}, isolated_workflow=True, ), ), @@ -1009,7 +301,7 @@ class OperatingSystem: cross_compile_arm64=True, build_configs=generate_binary_build_matrix.generate_conda_matrix(OperatingSystem.MACOS_ARM64), ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_CONDA}, + labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_CONDA}, isolated_workflow=True, ), ), @@ -1021,17 +313,13 @@ def main() -> None: loader=jinja2.FileSystemLoader(str(GITHUB_DIR.joinpath("templates"))), undefined=jinja2.StrictUndefined, ) + + # not ported yet template_and_workflows = [ - (jinja_env.get_template("linux_ci_workflow.yml.j2"), LINUX_WORKFLOWS), - (jinja_env.get_template("windows_ci_workflow.yml.j2"), WINDOWS_WORKFLOWS), - (jinja_env.get_template("bazel_ci_workflow.yml.j2"), BAZEL_WORKFLOWS), - (jinja_env.get_template("ios_ci_workflow.yml.j2"), IOS_WORKFLOWS), - (jinja_env.get_template("macos_ci_workflow.yml.j2"), MACOS_WORKFLOWS), - (jinja_env.get_template("docker_builds_ci_workflow.yml.j2"), DOCKER_WORKFLOWS), - (jinja_env.get_template("android_ci_full_workflow.yml.j2"), ANDROID_WORKFLOWS), - (jinja_env.get_template("android_ci_workflow.yml.j2"), ANDROID_SHORT_WORKFLOWS), (jinja_env.get_template("linux_binary_build_workflow.yml.j2"), LINUX_BINARY_BUILD_WORFKLOWS), + (jinja_env.get_template("linux_binary_build_workflow.yml.j2"), LINUX_BINARY_SMOKE_WORKFLOWS), (jinja_env.get_template("windows_binary_build_workflow.yml.j2"), WINDOWS_BINARY_BUILD_WORKFLOWS), + (jinja_env.get_template("windows_binary_build_workflow.yml.j2"), WINDOWS_BINARY_SMOKE_WORKFLOWS), (jinja_env.get_template("macos_binary_build_workflow.yml.j2"), MACOS_BINARY_BUILD_WORKFLOWS), ] # Delete the existing generated files first, this should align with .gitattributes file description. @@ -1042,16 +330,12 @@ def main() -> None: except Exception as e: print(f"Error occurred when deleting file {w}: {e}") - ciflow_ruleset = CIFlowRuleset() for template, workflows in template_and_workflows: # added Iterable check to appease the mypy gods if not isinstance(workflows, Iterable): raise Exception(f"How is workflows not iterable? 
{workflows}") for workflow in workflows: workflow.generate_workflow_file(workflow_template=template) - ciflow_ruleset.add_label_rule(workflow.ciflow_config.labels, workflow.build_environment) - ciflow_ruleset.generate_json() - if __name__ == "__main__": main() diff --git a/.github/scripts/generate_pytorch_test_matrix.py b/.github/scripts/generate_pytorch_test_matrix.py deleted file mode 100755 index 967f7222dd36..000000000000 --- a/.github/scripts/generate_pytorch_test_matrix.py +++ /dev/null @@ -1,132 +0,0 @@ -#!/usr/bin/env python3 - -"""Generates a matrix to be utilized through github actions - -Will output a matrix to represent our testing configurations, which is currently -dictated by just sharding. - -""" - -import json -import os -import re -from typing import Dict - -from typing_extensions import TypedDict - - -BUILD_ENVIRONMENT = os.getenv('BUILD_ENVIRONMENT') -assert BUILD_ENVIRONMENT is not None - -class Config(TypedDict): - num_shards: int - runner: str - - -def get_disabled_issues() -> str: - pr_body = os.getenv('PR_BODY', '') - # The below regex is meant to match all *case-insensitive* keywords that - # GitHub has delineated would link PRs to issues, more details here: - # https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue. - # E.g., "Close #62851", "fixES #62851" and "RESOLVED #62851" would all match, but not - # "closes #62851" --> extra space, "fixing #62851" --> not a keyword, nor "fix 62851" --> no # - regex = '(?i)(Close(d|s)?|Resolve(d|s)?|Fix(ed|es)?) #([0-9]+)' - issue_numbers = [x[4] for x in re.findall(regex, pr_body)] - return ','.join(issue_numbers) - -# When the user specifies labels that are NOT ciflow/default, the expectation is -# that the workflows should be triggered as if they are on trunk. For example, when -# ciflow/all is specified, we should run the full test suite for Windows CUDA -# and NOT only the smoke tests. 
-def run_as_if_on_trunk() -> bool: - ON_PULL_REQUEST = os.getenv('GITHUB_HEAD_REF') - if not ON_PULL_REQUEST: - return True - - from pathlib import Path - GITHUB_DIR = Path(__file__).resolve().parent.parent - - with open(f'{GITHUB_DIR}/generated-ciflow-ruleset.json') as f: - labels_to_workflows = json.load(f)['label_rules'] - - pr_labels = json.loads(os.getenv('PR_LABELS', '[]')) - current_workflow_triggered_by_label = False - for label in pr_labels: - if label != 'ciflow/default' and label in labels_to_workflows: - workflows_triggered_by_label = labels_to_workflows[label] - if any([BUILD_ENVIRONMENT in workflow for workflow in workflows_triggered_by_label]): - current_workflow_triggered_by_label = True - break - - return current_workflow_triggered_by_label - -def main() -> None: - TEST_RUNNER_TYPE = os.getenv('TEST_RUNNER_TYPE') - assert TEST_RUNNER_TYPE is not None - RUN_SMOKE_TESTS_ONLY_ON_PR = os.getenv('RUN_SMOKE_TESTS_ONLY_ON_PR') - RUN_SMOKE_TESTS = RUN_SMOKE_TESTS_ONLY_ON_PR == "true" and not run_as_if_on_trunk() - NUM_TEST_SHARDS_ON_PULL_REQUEST = os.getenv('NUM_TEST_SHARDS_ON_PULL_REQUEST') - NUM_TEST_SHARDS = int(os.getenv('NUM_TEST_SHARDS', '0')) - if not run_as_if_on_trunk() and NUM_TEST_SHARDS_ON_PULL_REQUEST: - NUM_TEST_SHARDS = int(NUM_TEST_SHARDS_ON_PULL_REQUEST) - MULTIGPU_RUNNER_TYPE = os.getenv('MULTIGPU_RUNNER_TYPE') - DISTRIBUTED_GPU_RUNNER_TYPE = os.getenv('DISTRIBUTED_GPU_RUNNER_TYPE', TEST_RUNNER_TYPE) - NOGPU_RUNNER_TYPE = os.getenv('NOGPU_RUNNER_TYPE') - configs: Dict[str, Config] = {} - if os.getenv('ENABLE_JIT_LEGACY_TEST'): - configs['jit_legacy'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} - if MULTIGPU_RUNNER_TYPE is not None and os.getenv('ENABLE_MULTIGPU_TEST'): - configs['multigpu'] = {'num_shards': 1, 'runner': MULTIGPU_RUNNER_TYPE} - if NOGPU_RUNNER_TYPE is not None: - if os.getenv('ENABLE_NOGPU_NO_AVX_TEST'): - configs['nogpu_NO_AVX'] = {'num_shards': 1, 'runner': NOGPU_RUNNER_TYPE} - if os.getenv('ENABLE_NOGPU_NO_AVX2_TEST'): - configs['nogpu_NO_AVX2'] = {'num_shards': 1, 'runner': NOGPU_RUNNER_TYPE} - if os.getenv('ENABLE_FORCE_ON_CPU_TEST'): - configs['force_on_cpu'] = {'num_shards': 1, 'runner': NOGPU_RUNNER_TYPE} - if os.getenv('ENABLE_DISTRIBUTED_TEST'): - configs['distributed'] = { - 'num_shards': 1, - 'runner': DISTRIBUTED_GPU_RUNNER_TYPE if "cuda" in str(BUILD_ENVIRONMENT) else TEST_RUNNER_TYPE - } - if os.getenv('ENABLE_SLOW_TEST'): - configs['slow'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} - if os.getenv('ENABLE_DOCS_TEST'): - configs['docs_test'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} - if os.getenv('ENABLE_BACKWARDS_COMPAT_TEST'): - configs['backwards_compat'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} - if os.getenv('ENABLE_XLA_TEST'): - configs['xla'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} - if os.getenv('ENABLE_NOARCH_TEST'): - configs['noarch'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} - if RUN_SMOKE_TESTS: - configs['smoke_tests'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} - matrix = { - 'include': [ - { - 'config': 'default', - 'shard': shard, - 'num_shards': NUM_TEST_SHARDS, - 'runner': TEST_RUNNER_TYPE, - } - for shard in range(1, NUM_TEST_SHARDS + 1) - ] + [ - { - 'config': name, - 'shard': shard, - 'num_shards': config['num_shards'], - 'runner': config['runner'], - } - for name, config in configs.items() - for shard in range(1, config['num_shards'] + 1) - ] - } - render_matrix = {'config': list(dict.fromkeys(x['config'] for x in matrix['include']))} - print(json.dumps({'matrix': 
matrix, 'render-matrix': render_matrix}, indent=2)) - print(f'::set-output name=matrix::{json.dumps(matrix)}') - print(f'::set-output name=render-matrix::{json.dumps(render_matrix)}') - print(f'::set-output name=ignore-disabled-issues::{get_disabled_issues()}') - - -if __name__ == "__main__": - main() diff --git a/.github/scripts/get_workflow_job_id.py b/.github/scripts/get_workflow_job_id.py new file mode 100644 index 000000000000..72aed91d55ca --- /dev/null +++ b/.github/scripts/get_workflow_job_id.py @@ -0,0 +1,60 @@ +# Helper to get the id of the currently running job in a GitHub Actions +# workflow. GitHub does not provide this information to workflow runs, so we +# need to figure it out based on what they *do* provide. + +import requests +import os +import argparse + +# Our strategy is to retrieve the parent workflow run, then filter its jobs on +# RUNNER_NAME to figure out which job we're currently running. +# +# Why RUNNER_NAME? Because it's the only thing that uniquely identifies a job within a workflow. +# GITHUB_JOB doesn't work, as it corresponds to the job yaml id +# (https://bit.ly/37e78oI), which has two problems: +# 1. It's not present in the workflow job JSON object, so we can't use it as a filter. +# 2. It isn't unique; for matrix jobs the job yaml id is the same for all jobs in the matrix. +# +# RUNNER_NAME on the other hand is unique across the pool of runners. Also, +# since only one job can be scheduled on a runner at a time, we know that +# looking for RUNNER_NAME will uniquely identify the job we're currently +# running. +parser = argparse.ArgumentParser() +parser.add_argument( + "workflow_run_id", help="The id of the workflow run, should be GITHUB_RUN_ID" +) +parser.add_argument( + "runner_name", + help="The name of the runner to retrieve the job id, should be RUNNER_NAME", +) + +args = parser.parse_args() + + +PYTORCH_REPO = "https://api.github.com/repos/pytorch/pytorch" +GITHUB_TOKEN = os.environ["GITHUB_TOKEN"] +REQUEST_HEADERS = { + "Accept": "application/vnd.github.v3+json", + "Authorization": "token " + GITHUB_TOKEN, +} + +response = requests.get( + f"{PYTORCH_REPO}/actions/runs/{args.workflow_run_id}/jobs?per_page=100", + headers=REQUEST_HEADERS, +) + +jobs = response.json()["jobs"] +while "next" in response.links.keys(): + response = requests.get(response.links["next"]["url"], headers=REQUEST_HEADERS) + jobs.extend(response.json()["jobs"]) + +# Sort the jobs list by start time, in descending order. We want to get the most +# recently scheduled job on the runner. 
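As a sanity check on the selection logic that follows, here is the same sort-and-filter applied to a few made-up job records. The data is illustrative only, not real GitHub API output; the field names ("id", "runner_name", "started_at") match the job objects used above.

    jobs = [
        {"id": 101, "runner_name": "i-0abc", "started_at": "2022-03-15T01:00:00Z"},
        {"id": 102, "runner_name": "i-0def", "started_at": "2022-03-15T02:00:00Z"},
        {"id": 103, "runner_name": "i-0abc", "started_at": "2022-03-15T03:00:00Z"},
    ]
    # ISO-8601 timestamps in the same timezone sort chronologically as strings,
    # so reverse=True puts the most recently started job first.
    jobs.sort(key=lambda job: job["started_at"], reverse=True)
    # The first job whose runner_name matches is the one currently running on this runner.
    print(next(job["id"] for job in jobs if job["runner_name"] == "i-0abc"))  # prints 103
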
+jobs.sort(key=lambda job: job["started_at"], reverse=True) + +for job in jobs: + if job["runner_name"] == args.runner_name: + print(job["id"]) + exit(0) + +exit(1) diff --git a/.github/scripts/gitutils.py b/.github/scripts/gitutils.py index 46070c25e632..4c43fc251fb1 100644 --- a/.github/scripts/gitutils.py +++ b/.github/scripts/gitutils.py @@ -1,10 +1,11 @@ #!/usr/bin/env python3 +import os +import re +import tempfile from collections import defaultdict from datetime import datetime from typing import cast, Any, Dict, Iterator, List, Optional, Tuple, Union -import os -import re RE_GITHUB_URL_MATCH = re.compile("^https://.*@?github.com/(.+)/(.+)$") @@ -30,17 +31,17 @@ def fuzzy_list_to_dict(items: List[Tuple[str, str]]) -> Dict[str, List[str]]: def _check_output(items: List[str], encoding: str = "utf-8") -> str: - from subprocess import check_output, CalledProcessError + from subprocess import check_output, CalledProcessError, STDOUT try: - return check_output(items).decode(encoding) + return check_output(items, stderr=STDOUT).decode(encoding) except CalledProcessError as e: msg = f"Command `{' '.join(e.cmd)}` returned non-zero exit code {e.returncode}" stdout = e.stdout.decode(encoding) if e.stdout is not None else "" stderr = e.stderr.decode(encoding) if e.stderr is not None else "" if len(stderr) == 0: - msg += f"\n{stdout}" + msg += f"\n```\n{stdout}```" else: - msg += f"\nstdout:\n{stdout}\nstderr:\n{stderr}" + msg += f"\nstdout:\n```\n{stdout}```\nstderr:\n```\n{stderr}```" raise RuntimeError(msg) from e @@ -127,7 +128,15 @@ def current_branch(self) -> str: return self._run_git("symbolic-ref", "--short", "HEAD").strip() def checkout(self, branch: str) -> None: - self._run_git('checkout', branch) + self._run_git("checkout", branch) + + def fetch(self, ref: Optional[str] = None, branch: Optional[str] = None) -> None: + if branch is None and ref is None: + self._run_git("fetch", self.remote) + elif branch is None: + self._run_git("fetch", self.remote, ref) + else: + self._run_git("fetch", self.remote, f"{ref}:{branch}") def show_ref(self, name: str) -> str: refs = self._run_git('show-ref', '-s', name).strip().split('\n') @@ -185,8 +194,19 @@ def compute_branch_diffs(self, from_branch: str, to_branch: str) -> Tuple[List[s while len(from_values) > 0 and len(to_values) > 0: frc = self.get_commit(from_values.pop()) toc = self.get_commit(to_values.pop()) + # FRC branch might have PR number added to the title if frc.title != toc.title or frc.author_date != toc.author_date: - raise RuntimeError(f"Unexpected differences between {frc} and {toc}") + # HACK: Same commit were merged, reverted and landed again + # which creates a tracking problem + if ( + "pytorch/pytorch" not in self.remote_url() or + frc.commit_hash not in {"0a6a1b27a464ba5be5f587cce2ee12ab8c504dbf", + "6d0f4a1d545a8f161df459e8d4ccafd4b9017dbe", + "edf909e58f06150f7be41da2f98a3b9de3167bca", + "a58c6aea5a0c9f8759a4154e46f544c8b03b8db1", + "7106d216c29ca16a3504aa2bedad948ebcf4abc2"} + ): + raise RuntimeError(f"Unexpected differences between {frc} and {toc}") from_commits.remove(frc.commit_hash) to_commits.remove(toc.commit_hash) continue @@ -194,6 +214,17 @@ def compute_branch_diffs(self, from_branch: str, to_branch: str) -> Tuple[List[s from_commits.remove(commit) for commit in to_values: to_commits.remove(commit) + # Another HACK: Patch-id is not stable for commits with binary files or for big changes across commits + # I.e. 
cherry-picking those from one branch into another will change patchid + if "pytorch/pytorch" in self.remote_url(): + for excluded_commit in {"8e09e20c1dafcdbdb45c2d1574da68a32e54a3a5", + "5f37e5c2a39c3acb776756a17730b865f0953432", + "b5222584e6d6990c6585981a936defd1af14c0ba", + "84d9a2e42d5ed30ec3b8b4140c38dd83abbce88d", + "f211ec90a6cdc8a2a5795478b5b5c8d7d7896f7e"}: + if excluded_commit in from_commits: + from_commits.remove(excluded_commit) + return (from_commits, to_commits) def cherry_pick_commits(self, from_branch: str, to_branch: str) -> None: @@ -209,11 +240,17 @@ def cherry_pick_commits(self, from_branch: str, to_branch: str) -> None: self.cherry_pick(commit) self.checkout(orig_branch) - def push(self, branch: str, dry_run: bool) -> None: - if dry_run: - self._run_git("push", "--dry-run", self.remote, branch) - else: - self._run_git("push", self.remote, branch) + def push(self, branch: str, dry_run: bool, retry: int = 3) -> None: + for cnt in range(retry): + try: + if dry_run: + self._run_git("push", "--dry-run", self.remote, branch) + else: + self._run_git("push", self.remote, branch) + except RuntimeError as e: + print(f"{cnt} push attempt failed with {e}") + self.fetch() + self._run_git("rebase", f"{self.remote}/{branch}") def head_hash(self) -> str: return self._run_git("show-ref", "--hash", "HEAD").strip() @@ -237,6 +274,12 @@ def amend_commit_message(self, msg: str) -> None: self._run_git("commit", "--amend", "-m", msg) +def clone_repo(username: str, password: str, org: str, project: str) -> GitRepo: + path = tempfile.mkdtemp() + _check_output(['git', 'clone', f'https://{username}:{password}@github.com/{org}/{project}', path]).strip() + return GitRepo(path=path) + + class PeekableIterator(Iterator[str]): def __init__(self, val: str) -> None: self._val = val diff --git a/.github/scripts/gql_mocks.json b/.github/scripts/gql_mocks.json new file mode 100644 index 000000000000..1b97bf35f47e --- /dev/null +++ b/.github/scripts/gql_mocks.json @@ -0,0 +1,16082 @@ +{ + "query_sha=f357fcdcc09e1433307da0c33debf9c08279ffd70d5bbc31323994f6346c629f name=pytorch number=73811 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "closed": true, + "isCrossRepository": false, + "author": { + "login": "seemethere" + }, + "title": "ci: Migrate metrics credentials to managed IAM", + "body": "Stack from [ghstack](https://github.com/ezyang/ghstack):\n* __->__ #73811\n\r\nMigrates our credentials to upload metrics statistics to managed IAM\r\ncredentials in order to make it easier to know where the credentials are\r\ncoming from and to make it easier to add more permissions / less\r\npermissions later on.\r\n\r\nRelates to work done in [D34535827](https://www.internalfb.com/diff/D34535827)\r\n\r\nSigned-off-by: Eli Uriegas ", + "headRefName": "gh/seemethere/215/head", + "headRepository": { + "nameWithOwner": "pytorch/pytorch" + }, + "baseRefName": "gh/seemethere/215/base", + "baseRepository": { + "nameWithOwner": "pytorch/pytorch", + "isPrivate": false, + "defaultBranchRef": { + "name": "master" + } + }, + "mergeCommit": null, + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": { + "login": "seemethere" + }, + "email": "eliuriegas@fb.com", + "name": "Eli Uriegas" + }, + "oid": "13c44d16a876a56bca479b4cf30715d21fa16e99" + } + }, + { + "commit": { + "author": { + "user": { + "login": "seemethere" + }, + "email": "eliuriegas@fb.com", + "name": "Eli Uriegas" + }, + "oid": "9d26f4e6d8c8df275ea546180fef42548257d2d7" + } + } + ], + "pageInfo": { + "endCursor": 
"Mg", + "hasNextPage": false + }, + "totalCount": 2 + }, + "commits": { + "nodes": [ + { + "commit": { + "checkSuites": { + "nodes": [ + { + "app": { + "name": "Facebook GitHub Tools", + "databaseId": 12274 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "Facebook CLA Check", + "conclusion": "SUCCESS", + "detailsUrl": "https://code.intern.facebook.com/cla/" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOaHA=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658275867" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276090" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "win-vs2019-cpu-py3" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276092" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3-clang5-mobile-build" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276094" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276095" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276097" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "TorchBench CI (pytorch-linux-py3.7-cu102)" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276098" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc7-no-ops" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815315?check_suite_focus=true" + } + ], + 
"pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObRM=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276099" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Test tools" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276100" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-clang7-asan" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276101" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQU=", + "hasNextPage": true + } + }, + "oid": "9d26f4e6d8c8df275ea546180fef42548257d2d7" + } + } + ] + }, + "changedFiles": 3, + "files": { + "nodes": [ + { + "path": ".github/templates/common.yml.j2" + }, + { + "path": ".github/workflows/generated-macos-11-py3-x86-64.yml" + }, + { + "path": ".github/workflows/update_pytorch_labels.yml" + } + ], + "pageInfo": { + "endCursor": "Mw", + "hasNextPage": false + } + }, + "reviews": { + "nodes": [ + { + "author": { + "login": "kit1980" + }, + "state": "APPROVED" + }, + { + "author": { + "login": "janeyx99" + }, + "state": "APPROVED" + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wMy0wNFQxNDoyNDo0OC0wODowMLkyMDIyLTAzLTA0VDE0OjI0OjQ4LTA4OjAwzjWwwqA=", + "hasPreviousPage": false + } + }, + "comments": { + "nodes": [ + { + "bodyText": "Merge failed due to Too many checksuites for commit\nRaised by https://github.com/pytorch/pytorch/actions/runs/1988337976", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1068270969 + }, + { + "bodyText": "@pytorchbot force merge this", + "author": { + "login": "seemethere" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1068436128 + }, + { + "bodyText": "Merge failed due to Too many checksuites for commit\nRaised by https://github.com/pytorch/pytorch/actions/runs/1989076952", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1068437098 + }, + { + "bodyText": "@pytorchbot merge this", + "author": { + "login": "seemethere" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1068482921 + }, + { + "bodyText": "Hey @seemethere.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' 
and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.", + "author": { + "login": "github-actions" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 1068484404 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOP6yFeQ==", + "hasPreviousPage": true + } + } + } + } + } + }, + "query_sha=f357fcdcc09e1433307da0c33debf9c08279ffd70d5bbc31323994f6346c629f name=pytorch number=31093 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "closed": false, + "isCrossRepository": true, + "author": { + "login": "mingxiaoh" + }, + "title": "improve mkldnn convolution test coverage", + "body": "This pr will improve the test coverage of mkldnn convolution.\r\n1.test input: specific sensitive numbers\r\n2.pass criteria: output of mkldnn convolution matches output of thnn convolution\r\n3.coverage: by using coverage tool, we found out the following sensitive parameters. Overall the case will test 4352 patterns, takes 8.8s on my machine.\r\n\r\nto run the test case:\r\n\r\npython test_mkldnn_conv2d_ext.py\r\nor\r\npython run_test.py -i mkldnn_conv2d_ext\r\n\r\nIn case of failure, the pattern will be printed in the log for further debugging.\r\n\r\nactually, this PR is created to replace and improve that PR we created before(https://github.com/pytorch/pytorch/pull/25085) ", + "headRefName": "master", + "headRepository": { + "nameWithOwner": "mingxiaoh/pytorch" + }, + "baseRefName": "master", + "baseRepository": { + "nameWithOwner": "pytorch/pytorch", + "isPrivate": false, + "defaultBranchRef": { + "name": "master" + } + }, + "mergeCommit": null, + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": { + "login": "11pikachu" + }, + "email": "junx.du@intel.com", + "name": "dujun" + }, + "oid": "29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9" + } + } + ], + "pageInfo": { + "endCursor": "MQ", + "hasNextPage": false + }, + "totalCount": 1 + }, + "commits": { + "nodes": [ + { + "commit": { + "checkSuites": { + "nodes": [ + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "clang-format" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "clang-format", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/1099676797?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHOQYu8fQ==", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9/checks?check_suite_id=1175281097" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "flake8-py3", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/1099676800?check_suite_focus=true" + }, + { + "name": "quick-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/1099676817?check_suite_focus=true" + }, + { + "name": "clang-tidy", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/1099676829?check_suite_focus=true" + }, + { + "name": "cmakelint", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/1099676840?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHOQYu8qA==", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": 
"https://github.com/pytorch/pytorch/commit/29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9/checks?check_suite_id=1175281099" + }, + { + "app": { + "name": "Codecov", + "databaseId": 254 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "codecov/project", + "conclusion": "SUCCESS", + "detailsUrl": "https://codecov.io" + }, + { + "name": "codecov/patch", + "conclusion": "SUCCESS", + "detailsUrl": "https://codecov.io" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHOQZhcFQ==", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9/checks?check_suite_id=1176100822" + }, + { + "app": { + "name": "Codecov", + "databaseId": 254 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "codecov/patch", + "conclusion": "SUCCESS", + "detailsUrl": "https://codecov.io" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHOQZZsEQ==", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9/checks?check_suite_id=1176100824" + }, + { + "app": { + "name": "Facebook GitHub Tools", + "databaseId": 12274 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "Facebook CLA Check", + "conclusion": "SUCCESS", + "detailsUrl": "https://code.facebook.com/cla/" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHOUquzJg==", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9/checks?check_suite_id=1487517306" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHOWKm2eg==", + "hasNextPage": false + } + }, + "oid": "29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9" + } + } + ] + }, + "changedFiles": 5, + "files": { + "nodes": [ + { + "path": "test/math_libraries/convolutions.py" + }, + { + "path": "test/math_libraries/convolutions_cases/shapes_googlenet_v3.json" + }, + { + "path": "test/math_libraries/convolutions_cases/shapes_maskrcnn_p1.json" + }, + { + "path": "test/math_libraries/convolutions_cases/shapes_mobilenet.json" + }, + { + "path": "test/math_libraries/convolutions_cases/shapes_resnet_50.json" + } + ], + "pageInfo": { + "endCursor": "NQ", + "hasNextPage": false + } + }, + "reviews": { + "nodes": [ + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "CHANGES_REQUESTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "CHANGES_REQUESTED" + }, + { + 
"author": { + "login": "ailzhang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ngimel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "VitalyFedyunin" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ngimel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mingxiaoh" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mingxiaoh" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "VitalyFedyunin" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "VitalyFedyunin" + }, + "state": "APPROVED" + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpO5MjAxOS0xMi0zMFQxMDoxOToxMS0wODowMLkyMDE5LTEyLTMwVDEwOjE5OjExLTA4OjAwzhQZLuY=", + "hasPreviousPage": false + } + }, + "comments": { + "nodes": [ + { + "bodyText": "I cloned your repo and ran the tests:\n~/pytorch/test/math_libraries$ python convolutions.py\nFFFF\n======================================================================\nFAIL: test_conv2d_ext_cpu_float32 (__main__.TestConvExtCPU)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n result = test(self, *args)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n return fn(self, device, *args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n return fn(slf, device, *args, **kwargs)\n File \"convolutions.py\", line 114, in test_conv2d_ext\n \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid 
cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float16 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n result = test(self, *args)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n return fn(self, device, *args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n return fn(slf, device, *args, **kwargs)\n File \"convolutions.py\", line 114, in test_conv2d_ext\n \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid 
cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float32 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n result = test(self, *args)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n return fn(self, device, *args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n return fn(slf, device, *args, **kwargs)\n File \"convolutions.py\", line 114, in test_conv2d_ext\n \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid 
cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float64 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n result = test(self, *args)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n return fn(self, device, *args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n return fn(slf, device, *args, **kwargs)\n File \"convolutions.py\", line 114, in test_conv2d_ext\n \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid 
cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n----------------------------------------------------------------------\nRan 4 tests in 33.838s\n\nFAILED (failures=4)\n\nStill fails.", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 673760580 + }, + { + "bodyText": "I cloned your repo and ran the tests:\n~/pytorch/test/math_libraries$ python convolutions.py\nFFFF\n======================================================================\nFAIL: test_conv2d_ext_cpu_float32 (__main__.TestConvExtCPU)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n result = test(self, *args)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n return fn(self, device, *args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n return fn(slf, device, *args, **kwargs)\n File \"convolutions.py\", line 114, in test_conv2d_ext\n \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid 
cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float16 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n result = test(self, *args)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n return fn(self, device, *args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n return fn(slf, device, *args, **kwargs)\n File \"convolutions.py\", line 114, in test_conv2d_ext\n \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid 
cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float32 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n result = test(self, *args)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n return fn(self, device, *args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n return fn(slf, device, *args, **kwargs)\n File \"convolutions.py\", line 114, in test_conv2d_ext\n \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid 
cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float64 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n result = test(self, *args)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n return fn(self, device, *args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n return fn(slf, device, *args, **kwargs)\n File \"convolutions.py\", line 114, in test_conv2d_ext\n \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid 
cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n----------------------------------------------------------------------\nRan 4 tests in 33.838s\n\nFAILED (failures=4)\n\nStill fails.\n\n@mruberry It is suggested by @VitalyFedyunin that, we need to display fail test to avoid invalid inputs, I guess we should set it as expected failures under the pytest test framework, right? we will change it as expected failure cases under pytest test framework. The result will looks like be low, is it ok?\n2500 passed, 136 skipped, 0 failed, 0 errors, 2 expected failures, 0 unexpected passes", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": { + "login": "mingxiaoh" + }, + "databaseId": 673816925 + }, + { + "bodyText": "Displaying tests that fail is fine, but I don't think @VitalyFedyunin meant that it was OK if the tests didn't pass. If these are expected failures then yes, you can use with self.assertRaises(RuntimeError):... when testing them. If you also want to report that the test has test cases with these properties you can print or warn, which will appear in the test output.", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 673858224 + }, + { + "bodyText": "Codecov Report\n\nMerging #31093 into master will not change coverage.\nThe diff coverage is n/a.\n\n\n@@ Coverage Diff @@\n## master #31093 +/- ##\n=======================================\n Coverage 68.00% 68.00% \n=======================================\n Files 382 382 \n Lines 49527 49527 \n=======================================\n Hits 33679 33679 \n Misses 15848 15848 \n\nContinue to review full report at Codecov.\n\nLegend - Click here to learn more\n\u0394 = absolute (impact), \u00f8 = not affected, ? = missing data\nPowered by Codecov. Last update 69f6d94...29f6aa6. Read the comment docs.", + "author": { + "login": "codecov" + }, + "authorAssociation": "NONE", + "editor": { + "login": "codecov" + }, + "databaseId": 686921371 + }, + { + "bodyText": "Looks like this PR hasn't been updated in a while so we're going to go ahead and mark this as Stale. Feel free to remove the Stale label if you feel this was a mistake. 
If you are unable to remove the Stale label please contact a maintainer in order to do so. Stale pull requests will automatically be closed 30 days after being marked Stale", + "author": { + "login": "pytorchbot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1095860944 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOKCjFRA==", + "hasPreviousPage": true + } + } + } + } + } + }, + "query_sha=62ce809793481ce6ddce6e1a19d9b0761755ff0ff75decaf8a79419eaf793110 cursor=Y3Vyc29yOnYyOpHOKCjFRA== name=pytorch number=31093 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "comments": { + "nodes": [ + { + "bodyText": "Hi, @mingfeima @soumith @Jianhui-Li\nthis will improve the test coverage of mkldnn convolution, would you please review it?\nThe current code is forward only, do we need to cover backward, if yes, we can add backward.", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 564806270 + }, + { + "bodyText": "@mingxiaoh, what is the value in testing DNNL as part of Pytorch validation for the Pytorch developers? Shouldn't having these tests run in DNNL validation be enough?", + "author": { + "login": "vpirogov" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 564808528 + }, + { + "bodyText": "@vpirogov The main value is to serve as a blind test to DNNL. If DNNL adds these test to DNNL test sets, it lost the value as a blind test. The spirit of validation is to cross check.\n@gottbrath @gchanan The test was developed per the request of Pytorch team. Mingxiao made an effort to reduce the execution time to a few second but still with good coverage. Although the test today is focused on DNNL, it could be easily extended to be blind test for any conv implementation used in Pytorch.", + "author": { + "login": "Jianhui-Li" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 567826907 + }, + { + "bodyText": "@mruberry thanks for the comment. As for the chainer dependency, we import it is because we would like to use its testing function for pytest test cases combinations, other wise we need to write much more code to achieve same effect. So, can we use it?", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 574563012 + }, + { + "bodyText": "@mingxiaoh You cannot import chainer. Looking at the code you should be able to achieve the same effect without it.", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 575272358 + }, + { + "bodyText": "@mruberry ok, we will change it according to your requirement. Thanks", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 583917522 + }, + { + "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/31093\n\ud83d\udd27 \u00a0Opt-in to CIFlow to control what jobs run on your PRs\n\n\ud83d\udc8a CI failures summary and remediations\nAs of commit 29f6aa6 (more details on the Dr. CI page):\n\nCommit 29f6aa6 was recently pushed. Waiting for builds...\n\nThis comment was automatically generated by Dr. CI (expand for details).Follow this link to opt-out of these comments for your Pull Requests.\nPlease report bugs/suggestions to the (internal) Dr. 
CI Users group.\nClick here to manually regenerate this comment.", + "author": { + "login": "dr-ci" + }, + "authorAssociation": "NONE", + "editor": { + "login": "facebook-github-bot" + }, + "databaseId": 628466876 + }, + { + "bodyText": "@mruberry how about those cudnn UT error? we add check for it but it should be NV to fix cudnn bugs.", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 629955767 + }, + { + "bodyText": "Hey @mingxiaoh! You're right, of course, that you shouldn't have to fix cuDNN bugs. Would you please:\n\nAssert that the test case fails, so we know it's failing and if someone fixes it they'll know what test to update.\nFile a new issue explaining the behavior and providing a short PyTorch program to reproduce the issue.\n\nThen we can ping NVIDIA on that issue.", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 629997129 + }, + { + "bodyText": "about the suggestion 'Assert that the test case fails, so we know it's failing and if someone fixes it they'll know what test to update. ', if we only assert it and continue the following test, I guess users might always ignore them in later test. Anyway, any similar example case for reference?", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 630010734 + }, + { + "bodyText": "In this recent PR https://github.com/pytorch/pytorch/pull/38505/files, for example, you can see that the construction of bool tensors wasn't working properly, so the test author cited the relevant issue and asserted that the incorrect behavior happened, as expected. You can also see how these lines are being removed by https://github.com/pytorch/pytorch/pull/38392/files, which fixes the issue.\nAnother common pattern is to use with self.assertRaises(RuntimeError/AssertionError/etc.):.", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 630014823 + }, + { + "bodyText": "@mruberry the failed UT case is not introduced by our modification, how to handle this issue?", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 631187735 + }, + { + "bodyText": "@mingxiaoh You mean the failures on ROCm? You may ignore them. Be sure to re-request review when you're ready.", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 631191425 + }, + { + "bodyText": "@mruberry we already skipped those ROCm errors, but there are stil somel error caused by the original code, they are not introduced by our modification.", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 631886529 + }, + { + "bodyText": "I understand. Let me know when you're ready for me to review.", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 631908011 + }, + { + "bodyText": "@mruberry thanks, we are ready for review now.", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 631909442 + }, + { + "bodyText": "@mingxiaoh Great! 
I'll take a look ASAP.", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 631910556 + }, + { + "bodyText": "@mruberry we just pull the latest code and updated the patch according to your comment, may you please help double check it? BTW, the new failed case in preci is not introduced by our modification.", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 633430458 + }, + { + "bodyText": "@ailzhang would you please check the comment below? Thanks.\nIs there a reason why this TestConv2dExt is a new class instead a test inside TestNN?\n//comment: it is actually suggested by Tongzhou Wang in another thread before.\nAlthough this test sits in generic testing framework, it's actually comparing thnn/mkldnn/cudnn results specially. I feel it's better to make it truly generic so that it compares any device result with CPU result. Alternatively you can mark this test only run when torch.backends.mkldnn.is_available()=True\n//comment: but our goal is to compare the result with that of thnn. Anyway, if you insist, we can start to compare it with cpu.", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": { + "login": "mingxiaoh" + }, + "databaseId": 634432326 + }, + { + "bodyText": "Pruning reviewers. @ngimel, @VitalyFedyunin, this PR is looking pretty good from a test framework perspective. Would one of you like to review?", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 634557563 + }, + { + "bodyText": "@mruberry Thanks, would you please help review it again. BTW: failed case is not introduced by our modification.", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 635256214 + }, + { + "bodyText": "@mruberry we moved our case to TestNNDeviceType class, would you please help review it again? BTW, those failed cases are not introduced by our code", + "author": { + "login": "1pikachu" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 637364148 + }, + { + "bodyText": "@mruberry we moved our case to TestNNDeviceType class, would you please help review it again? BTW, those failed cases are not introduced by our code\n\n@ngimel will follow-up on the test itself sometime this week or early next week.", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 637444457 + }, + { + "bodyText": "@mruberry we moved our case to TestNNDeviceType class, would you please help review it again? BTW, those failed cases are not introduced by our code\n\n@ngimel will follow-up on the test itself sometime this week or early next week.\n\n@mruberry thank you", + "author": { + "login": "1pikachu" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 637479226 + }, + { + "bodyText": "Improving test coverage of math libraries is certainly a good goal and this PR is moving towards it. I have some doubts about implementation decisions made, and about running this PR as part of regular pytorch CI.\nIf the primary goal of this PR is to test correctness of the convolution implementations in the vendor library, then it does not serve this purpose. The absolute majority of the 4000+ test cases come from group 1, where different kernel sizes/strides/dilations are used to produce the output of size 1x1. 
This can test whether pytorch correctly passes convolution parameters to the backends (although there are cheaper ways to do that), but as actual library correctness check it is almost useless - libraries use very different kernels depending in the input/output sizes, and tests with toy sizes like this don't invoke the real bread-and-butter kernels.\nAlso, if this test suite is meant as primary a means of testing vendor libraries (which is a good goal!) it does not have a place as a part of pytorch regular CI, and should be run when the corresponding vendor libraries are updated. I'd suggest moving this test out into a separate file (maybe even outside of torch/test directory) and have it as a part of library update/qualification process rather than regular CI.\nAlso, if the primary goal is to enable easier testing of vendor libraries correctness, perhaps we should rethink the mechanism of the generation of test cases. It should be easy to add a test case with a particular set of parameters that was found to be buggy. Also, running a cross-product of cases in a multi-dimensional space (as this PR does) is rarely an efficient way of getting a signal, some forms of random sampling usually provide a way to get better correctness signal why using less resources.\nAlso, when testing libraries it is important to test both forward and backward functions, whereas this PR does forward only. I'm openminded on whether convTransposed should be tested or not - if we are testing vendor libraries, then it's not necessary, convTransposed calls the same underlying functions, if we are testing pytorch, then it makes sense to test it separately because it takes different codepaths.", + "author": { + "login": "ngimel" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 637827507 + }, + { + "bodyText": "@mruberry ngimel is quite responsible, but it seems that she is not familiar with the background of this pull-request, since this pull-request is pending for so such a long time, each time we are almost done, then reviewer changes, each reviewer has different idea, it is good, but, would it be better if you help review it or ask the same reviewer to review it considering that you are more familiar with the background/change history? Thanks in advance.", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 637912105 + }, + { + "bodyText": "@mruberry ngimel is quite responsible, but it seems that she is not familiar with the background of this pull-request, since this pull-request is pending for so such a long time, each time we are almost done, then reviewer changes, each reviewer has different idea, it is good, but, would it be better if you help review it or ask the same reviewer to review it considering that you are more familiar with the background/change history? Thanks in advance.\n\nWe know this PR has been open for awhile and we respect that your time is valuable, but we want to make sure we're making the right change here, and I think @ngimel's comments reflect that and should not be too difficult to address. As I understand, her points are:\n\nThis is a good PR with an exciting idea. To let it run longer and test more cases maybe it should run outside the regular PyTorch CI.\nTo remedy this, let's create a test/math_libraries folder and put this test there: test/math_libaries/convolutions.py. 
Yes, this is different from our requests in the past, which is our mistake, but it should be an easy change.\nTo make the test more interesting it'd be good for the test cases to resemble convolutions used in practice. The current test cases seem like similar \"toy\" examples. Without time pressure we should be able to run larger, more computationally intensive convolutions.\nLet's change the test cases to include some practical convolutions, make it easy to add test cases, and think about how we might generate other interesting cases. (We should also test backwards once we have more time!)\n\nAnd I think these are good points. Maybe the PR doesn't create a new way to generate interesting convolutions to start and instead only runs a few representative convolutions, but @ngimel is positioning the work for success so that it's useful and we can continue to improve on it in the future.\nDoes that make sense?", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 637924703 + }, + { + "bodyText": "@mruberry we were required to finish the test in limited time long long before, at that time, jianhui discussed this issue with you, and you are all agreed with the current test scope and test case number and test time, so you meant you change your mind now? you are not care about the test time currently? Sorry, this issue is pending so long, we are struggling with it now and would like to finish it asap. Given this, it would be be better if you raise all the requirement at a time, considering that we have many tasks at hand, we are hoping so eagerly that we can finish this PR and use it for further test for bugs finding.", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": { + "login": "mingxiaoh" + }, + "databaseId": 637960626 + }, + { + "bodyText": "@mruberry we were required to finish the test in limited time long long before, at that time, jianhui discussed this issue with you, and you are all agreed with the current test scope and test case number and test time, so you meant you change your mind now? you are not care about the test time currently? Sorry, this issue is pending so long, we are struggling with it now and would like to finish it asap. Given this, it would be be better if you raise all the requirement at a time, considering that we have many tasks at hand, we are hoping so eagerly that we can finish this PR and use it for further test for bugs finding.\n\nI'm sorry, I don't think I've talked to @Jianhui-Li before. It's true that the team we expressed a concern about timing if the test was to be run in the CI initially, but I think now that we understand what the test is trying to do better we're not sure the CI is the best place for it. The PR was also closed after a lengthy period of inactivity, and we assumed it had simply been abandoned.\nDo you know who @Jianhui-Li spoke with about this issue originally? Maybe I can follow-up with them for more context.", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 637967153 + }, + { + "bodyText": "@mruberry it is reviewed and discussed with @soumith before. Anyway, since current reviewer is you, so, it should be decided by you. So, what we should do next?", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 637978356 + }, + { + "bodyText": "@mruberry it is reviewed and discussed with @soumith before. 
Anyway, since current reviewer is you, so, it should be decided by you. So, what we should do next?\n\nI think this will be easier to discuss at the regular Intel-FB meeting.", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 638446723 + }, + { + "bodyText": "@mruberry it is reviewed and discussed with @soumith before. Anyway, since current reviewer is you, so, it should be decided by you. So, what we should do next?\n\nI think this will be easier to discuss at the regular Intel-FB meeting.\n\nLet me sync with Mingxiao and follow up with this. Thanks.", + "author": { + "login": "Jianhui-Li" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 638451670 + }, + { + "bodyText": "@mruberry would you please help review it again?", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 653028208 + }, + { + "bodyText": "@mruberry would you please help review it again?\n\nHappy to help out, but as last discussed this needs some follow-up at the Intel-FB meeting. Did you get a chance to discuss it there, yet? If so, what did you decide?", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 654443242 + }, + { + "bodyText": "@mruberry would you please help review it again?\n\nHappy to help out, but as last discussed this needs some follow-up at the Intel-FB meeting. Did you get a chance to discuss it there, yet? If so, what did you decide?\n\nyes, we talked it with jianhui, and we decided to follow your ideas. Anyway, we would like to do so modification later, will contact you for review tomorrow. Thanks", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 656062287 + }, + { + "bodyText": "@mruberry would you please help review it again?\n\nHappy to help out, but as last discussed this needs some follow-up at the Intel-FB meeting. Did you get a chance to discuss it there, yet? If so, what did you decide?\n\nyes, we talked it with jianhui, and we decided to follow your ideas. Anyway, we would like to do so modification later, will contact you for review tomorrow. Thanks\n\n@mruberry the code is ready for review now, would you please take time for it? Thanks.", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 658071151 + }, + { + "bodyText": "super nit: renaming files to .json will make it more IDE friendly.", + "author": { + "login": "VitalyFedyunin" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 658464685 + }, + { + "bodyText": "@mruberry would you please help review it again?\n\nHappy to help out, but as last discussed this needs some follow-up at the Intel-FB meeting. Did you get a chance to discuss it there, yet? If so, what did you decide?\n\nyes, we talked it with jianhui, and we decided to follow your ideas. Anyway, we would like to do so modification later, will contact you for review tomorrow. Thanks\n\n@mruberry the code is ready for review now, would you please take time for it? Thanks.\n\nCool! I took a look with @ngimel, once these issues are addressed I think we're good to go!", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 659164401 + }, + { + "bodyText": "@ngimel & @VitalyFedyunin We have changed the code according to your suggestions, would you please review it again? 
Thanks.", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 660884305 + }, + { + "bodyText": "@ngimel & @VitalyFedyunin We have changed the code according to your suggestions, would you please review it again? Thanks.\n\nUpdated: one more question about tolerances, one code cleanup recommendation, and one task leftover from the last review.", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 662678464 + }, + { + "bodyText": "Updated: one more question about tolerances, one code cleanup recommendation, and one task leftover from the last review.\n@mruberry we have finished the modification according to your comment, would you please review it again? Thanks.", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 662930687 + }, + { + "bodyText": "The code looks good, but I tried running the test suite and hit the following failures:\n======================================================================\nFAIL: test_conv2d_ext_cuda_float16 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 241, in instantiated_test\n result = test(self, device_arg, dtype)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 542, in only_fn\n return fn(self, device, *args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 411, in dep_fn\n return fn(slf, device, *args, **kwargs)\n File \"convolutions.py\", line 102, in test_conv2d_ext\n msg=msg\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 1085, in assertEqual\n self.assertTrue(result, msg=msg)\nAssertionError: False is not true : device:cuda:0, dtype:torch.float16, group:1, batchsize:22input channel:448, output channel:384, bias:False, padding:[1, 1], dilation:[1, 1], stride:[1, 1], kernel:[3, 3]\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float32 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 241, in instantiated_test\n result = test(self, device_arg, dtype)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 542, in only_fn\n return fn(self, device, *args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 411, in dep_fn\n return fn(slf, device, *args, **kwargs)\n File \"convolutions.py\", line 102, in test_conv2d_ext\n msg=msg\n File 
\"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 1085, in assertEqual\n self.assertTrue(result, msg=msg)\nAssertionError: False is not true : device:cuda:0, dtype:torch.float32, group:1, batchsize:22input channel:80, output channel:192, bias:False, padding:[0, 0], dilation:[1, 1], stride:[1, 1], kernel:[3, 3]\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float64 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 241, in instantiated_test\n result = test(self, device_arg, dtype)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 542, in only_fn\n return fn(self, device, *args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 411, in dep_fn\n return fn(slf, device, *args, **kwargs)\n File \"convolutions.py\", line 106, in test_conv2d_ext\n \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\nLooking at the first invalid convolution, for example, it's:\n {\n \"case_name\":\"masknet_p1:conv33\",\n \"mb\":1,\n \"g\":1,\n \"ic\":512,\n \"ih\":64,\n \"iw\":64,\n \"oc\":12,\n \"kh\":1,\n \"kw\":1,\n \"sh\":1,\n \"sw\":1,\n \"ph\":0,\n \"pw\":0,\n \"dh\":0,\n \"dw\":0,\n \"bias\":\"False\"\n },\n\nwhich has a dh and dw of zero, causing it to be added to invalid cases here:\ndh, dw = case['dh'], case['dw']\n has_bias = case['bias']\n if dh == 0 or dw == 0:\n invalid_cases.append(case_name)", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": { + "login": "mruberry" + }, + "databaseId": 663240268 + }, + { + "bodyText": "@mruberry the failure was not detected is because we did not 
export the cudnn path. Yes, you are right, we need to a large atol of 1e-2 . Would you please help review it again? Thanks.", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 664373079 + }, + { + "bodyText": "@mruberry the failure was not detected is because we did not export the cudnn path. Yes, you are right, we need to a large atol of 1e-2 . Would you please help review it again? Thanks.\n\nBefore I run these tests again, is an atol of 1e-2 needed for all types or just half? Also, how does 1e-2 compare to the values that are being compared?", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 664569507 + }, + { + "bodyText": "@mruberry 1e-2 is experimental result, details see below, random means it might be failed sometimes.\n\n\n\natol,rtol\n1e-2,1e-2\n1e-2,1e-3\n1e-3,1e-2\n1e-3,1e-3\n1e-4,1e-3\n1e-3,1e-4\n1e-4,1e-4\n1e-4,1e-5\n1e-5,1e-4\n\n\n\n\nCuda float16\npass\npass\npass\npass\npass\nfail\nFail\nFail\nfail\n\n\nCuda float32\npass\nrandom\nrandom\nrandom\nrandom\nrandom\nrandom\nrandom\nfail", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 666894774 + }, + { + "bodyText": "@mruberry would you please find time to review it again? Thanks.", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 668380451 + }, + { + "bodyText": "@mruberry would you please find time to review it again? Thanks.\n\nI was just about to try and run this again locally but it looks like the files describing the convolutions are missing?", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 670306210 + }, + { + "bodyText": "@mruberry sorry but what is missing actually?", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 670322557 + }, + { + "bodyText": "@mruberry sorry but what is missing actually?\n\nThe JSON files.", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 670591170 + }, + { + "bodyText": "@mruberry sorry but what is missing actually?\n\nThe JSON files.\n\n@mruberry sorry, we add them now, would you please check it again? 
Thanks.", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 673402901 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOIapCfg==", + "hasPreviousPage": false + } + } + } + } + } + }, + "query_sha=f357fcdcc09e1433307da0c33debf9c08279ffd70d5bbc31323994f6346c629f name=pytorch number=76118 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "closed": false, + "isCrossRepository": false, + "author": { + "login": "malfet" + }, + "title": "Dummy change with lots of commits", + "body": "Draft PR with 100+ commits, to test mergebot ", + "headRefName": "malfet/pr-with-lots-of-commits", + "headRepository": { + "nameWithOwner": "pytorch/pytorch" + }, + "baseRefName": "master", + "baseRepository": { + "nameWithOwner": "pytorch/pytorch", + "isPrivate": false, + "defaultBranchRef": { + "name": "master" + } + }, + "mergeCommit": null, + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": { + "login": "malfet" + }, + "email": "nshulga@fb.com", + "name": "Nikita Shulga" + }, + "oid": "3067f2240afc7a29dc348000aa19eccbd9772303" + } + }, + { + "commit": { + "author": { + "user": { + "login": "andrewor14" + }, + "email": "andrewor@fb.com", + "name": "Andrew Or" + }, + "oid": "2f655b71f70c496c4e645f6cdb27d7bb7e825701" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "0c6dcaa7f58a19c42a530f4ee14bb6f0f03ca9fb" + } + }, + { + "commit": { + "author": { + "user": { + "login": "dzdang" + }, + "email": "dzdang@umich.edu", + "name": "dzdang" + }, + "oid": "cad11c563d41ebcffb1683fe1f1288b8157413b3" + } + }, + { + "commit": { + "author": { + "user": { + "login": "alanwaketan" + }, + "email": "jwtan@fb.com", + "name": "Jiewen Tan" + }, + "oid": "4dfd0875a68d87fccb5ad0d81692db480043b86e" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "2d37e74690582a4a26890e4c8b98f1f80e589c82" + } + }, + { + "commit": { + "author": { + "user": { + "login": "alanwaketan" + }, + "email": "jwtan@fb.com", + "name": "Jiewen Tan" + }, + "oid": "d4aee60947e1a3ef23c7c42990621e0746fdd0a8" + } + }, + { + "commit": { + "author": { + "user": { + "login": "peterbell10" + }, + "email": "peterbell10@live.co.uk", + "name": "Peter Bell" + }, + "oid": "aac6204bf710beb5e50a383d426ae6222396335a" + } + }, + { + "commit": { + "author": { + "user": { + "login": "dzdang" + }, + "email": "dzdang@umich.edu", + "name": "dzdang" + }, + "oid": "4b0362cab884584c24f5834b3874f5f357f56b5d" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "7536df613cbc645a9e68e6a3b0a8450753260fd1" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "20a50cb966d28d7bf82924adf781cf72a01ef90e" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "486387e8644afb46edff5aa5925b55c8119f67f0" + } + }, + { + "commit": { + "author": { + "user": { + "login": "dzdang" + }, + "email": "dzdang@umich.edu", + "name": "dzdang" + }, + "oid": "acb9d78b9b732d3667b881727e6ed9f92a8c549f" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "683bb7959a5b973f8470c081ad02e8fc508e784a" + } + 
}, + { + "commit": { + "author": { + "user": { + "login": "qihqi" + }, + "email": "qihan@fb.com", + "name": "Han Qi" + }, + "oid": "a870cb40af65adf0b77d55f6b554d7093d284d7a" + } + }, + { + "commit": { + "author": { + "user": { + "login": "Krovatkin" + }, + "email": "korovaikon@gmail.com", + "name": "Nikolay Korovaiko" + }, + "oid": "70793b9f328ddf52cc86336104c3a064c8582ef4" + } + }, + { + "commit": { + "author": { + "user": { + "login": "suo" + }, + "email": "suo@fb.com", + "name": "Michael Suo" + }, + "oid": "f70b31f62b1c5159eef2725484b175983517c88c" + } + }, + { + "commit": { + "author": { + "user": { + "login": "dagitses" + }, + "email": "mikeyd@fb.com", + "name": "Michael Andreas Dagitses" + }, + "oid": "04d3ec1db60defe1c6904bf77e9f8dfa87dc0b63" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "46b754a55b63e3168ad5854ad412c124934b675d" + } + }, + { + "commit": { + "author": { + "user": { + "login": "robieta" + }, + "email": "taylorrobie@fb.com", + "name": "Taylor Robie" + }, + "oid": "13df69e13ee571fdd716139419a00aec47ade7d6" + } + }, + { + "commit": { + "author": { + "user": { + "login": "malfet" + }, + "email": "nshulga@fb.com", + "name": "Nikita Shulga" + }, + "oid": "70642e911ec80a47cdbf4a50aac475c11aa129b6" + } + }, + { + "commit": { + "author": { + "user": { + "login": "pytorchmergebot" + }, + "email": "pytorchmergebot@users.noreply.github.com", + "name": "PyTorch MergeBot" + }, + "oid": "59bb7c39384bf3e0b284a037adef8b3caa53c1c4" + } + }, + { + "commit": { + "author": { + "user": { + "login": "malfet" + }, + "email": "nshulga@fb.com", + "name": "Nikita Shulga" + }, + "oid": "007cfb97b55d70ff63e1ed71d1a674638f847376" + } + }, + { + "commit": { + "author": { + "user": { + "login": "pytorchmergebot" + }, + "email": "pytorchmergebot@users.noreply.github.com", + "name": "PyTorch MergeBot" + }, + "oid": "0a7b858a5af1393fa3cf2853f92eca0e1d408dde" + } + }, + { + "commit": { + "author": { + "user": { + "login": "qihqi" + }, + "email": "qihan@fb.com", + "name": "Han Qi" + }, + "oid": "7917d789f0a523715041ade5177d271082628236" + } + }, + { + "commit": { + "author": { + "user": { + "login": "kit1980" + }, + "email": "sdym@fb.com", + "name": "Sergii Dymchenko (Meta Employee)" + }, + "oid": "91eb6017f0fb8a1b29e8cb48fac93bc9709f73b3" + } + }, + { + "commit": { + "author": { + "user": { + "login": "dagitses" + }, + "email": "mikeyd@fb.com", + "name": "Michael Andreas Dagitses" + }, + "oid": "bd04dca5fabb0c2a51ac87063a515f256ef274fa" + } + }, + { + "commit": { + "author": { + "user": { + "login": "dagitses" + }, + "email": "mikeyd@fb.com", + "name": "Michael Andreas Dagitses" + }, + "oid": "1f805a5defda7dabc49d0059edb9ccb06bc29352" + } + }, + { + "commit": { + "author": { + "user": { + "login": "mruberry" + }, + "email": "mruberry@fb.com", + "name": "Mike Ruberry" + }, + "oid": "4982c0a8db8f23d15ec4bfcbca4ce939afc04954" + } + }, + { + "commit": { + "author": { + "user": { + "login": "pearu" + }, + "email": "pearu.peterson@gmail.com", + "name": "Pearu Peterson" + }, + "oid": "28502265cb5925cb7db8dcb2dd2334963092714a" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "e03fcaedb1342e6d65c7f7f20243000938ba60b2" + } + }, + { + "commit": { + "author": { + "user": { + "login": "pritamdamania" + }, + "email": "pritam.damania@fb.com", + "name": "pritam" + }, + "oid": "efb28f5a1a5d18aa96bd668ab2ab5c651be359f3" + } + }, + { + 
"commit": { + "author": { + "user": { + "login": "MagiaSN" + }, + "email": "magialiao@tencent.com", + "name": "magialiao" + }, + "oid": "52cc1b9994f861ebdd3908759ed1ab11cba1f8de" + } + }, + { + "commit": { + "author": { + "user": { + "login": "pytorchmergebot" + }, + "email": "pytorchmergebot@users.noreply.github.com", + "name": "PyTorch MergeBot" + }, + "oid": "3cd99f23d1acd6a5bedf6f3b02be79d64350a5b6" + } + }, + { + "commit": { + "author": { + "user": { + "login": "awgu" + }, + "email": "andgu@fb.com", + "name": "Andrew Gu" + }, + "oid": "b00502c634a5146f4d996bd90e84d317f049e7b0" + } + }, + { + "commit": { + "author": { + "user": { + "login": "davidberard98" + }, + "email": "dberard@fb.com", + "name": "David Berard" + }, + "oid": "976eb7cee799dddfbe6a4122b249aaee1b6c8854" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ngimel" + }, + "email": "ngimel@fb.com", + "name": "Natalia Gimelshein" + }, + "oid": "9608ab28744d5cae32f371490557b248c9549c66" + } + }, + { + "commit": { + "author": { + "user": { + "login": "malfet" + }, + "email": "nshulga@fb.com", + "name": "Nikita Shulga" + }, + "oid": "4e119f0c39eb5ff0777f0e71561e6b633d85fb34" + } + }, + { + "commit": { + "author": { + "user": { + "login": "rohan-varma" + }, + "email": "rvarm1@fb.com", + "name": "Rohan Varma" + }, + "oid": "447580dc565f3660eddb2c996c6ed25b88338684" + } + }, + { + "commit": { + "author": { + "user": { + "login": "malfet" + }, + "email": "nshulga@fb.com", + "name": "Nikita Shulga" + }, + "oid": "2bc8f43e9233008ea23053fab87b83ab36fca5e3" + } + }, + { + "commit": { + "author": { + "user": { + "login": "dzdang" + }, + "email": "dzdang@umich.edu", + "name": "dzdang" + }, + "oid": "c13a8e891c3e3e714f60649ca1e3b082e090e9fe" + } + }, + { + "commit": { + "author": { + "user": { + "login": "dzdang" + }, + "email": "dzdang@umich.edu", + "name": "dzdang" + }, + "oid": "fddc861b7ee473f57d3c2161e4618a2663a237e8" + } + }, + { + "commit": { + "author": { + "user": { + "login": "jiyuanzFB" + }, + "email": "jiyuanz@fb.com", + "name": "Jiyuan Zhang" + }, + "oid": "e2336dbc539d6c021720cbe43c92c9e4c8463299" + } + }, + { + "commit": { + "author": { + "user": { + "login": "bdhirsh" + }, + "email": "hirsheybar@fb.com", + "name": "Brian Hirsh" + }, + "oid": "26e2759d1ad59aac12168b74d1ca55e42ba9455c" + } + }, + { + "commit": { + "author": { + "user": { + "login": "bdhirsh" + }, + "email": "hirsheybar@fb.com", + "name": "Brian Hirsh" + }, + "oid": "ad7aa914ee3b3d1252e31514f010ba96c40aae87" + } + }, + { + "commit": { + "author": { + "user": { + "login": "bdhirsh" + }, + "email": "hirsheybar@fb.com", + "name": "Brian Hirsh" + }, + "oid": "f113c5d78065aafbe7b1c0e611945bfe9f67b3c0" + } + }, + { + "commit": { + "author": { + "user": { + "login": "bdhirsh" + }, + "email": "hirsheybar@fb.com", + "name": "Brian Hirsh" + }, + "oid": "a366fd01136292544b7862968ae92feba4b6d8fe" + } + }, + { + "commit": { + "author": { + "user": { + "login": "seemethere" + }, + "email": "eliuriegas@fb.com", + "name": "Eli Uriegas" + }, + "oid": "afeba0773749da5883c378a2e6ac066e1ce62ca0" + } + }, + { + "commit": { + "author": { + "user": { + "login": "bdhirsh" + }, + "email": "hirsheybar@fb.com", + "name": "Brian Hirsh" + }, + "oid": "d306c99addc543908f64666baeecacbd0749f4a7" + } + }, + { + "commit": { + "author": { + "user": { + "login": "awgu" + }, + "email": "andgu@fb.com", + "name": "Andrew Gu" + }, + "oid": "c2456ea658f41f64ea054a422edf22a9c977399f" + } + }, + { + "commit": { + "author": { + "user": { + "login": "awgu" + }, + "email": "andgu@fb.com", + 
"name": "Andrew Gu" + }, + "oid": "a8b0a1b681c9fe41e0d553c962a5c93e81d92503" + } + }, + { + "commit": { + "author": { + "user": { + "login": "anjali411" + }, + "email": "chourdiaanjali123@gmail.com", + "name": "anjali411" + }, + "oid": "af761d9a5d058c9188f16589bae4f307d35185be" + } + }, + { + "commit": { + "author": { + "user": { + "login": "clee2000" + }, + "email": "csl@fb.com", + "name": "Catherine Lee" + }, + "oid": "beceb417baef35b15c2716e23178fb49f7fd6f9d" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "1516554e22136db89d0aeba43a1a1a987e995d68" + } + }, + { + "commit": { + "author": { + "user": { + "login": "qihqi" + }, + "email": "qihan@fb.com", + "name": "Han Qi" + }, + "oid": "68eb1fa8374eff6cbdcf0be5e37ed6775d22e722" + } + }, + { + "commit": { + "author": { + "user": { + "login": "janeyx99" + }, + "email": "janeyx@fb.com", + "name": "Jane Xu" + }, + "oid": "3c7bcb99b5c0c879c2610f427880b03881f82f38" + } + }, + { + "commit": { + "author": { + "user": { + "login": "janeyx99" + }, + "email": "janeyx@fb.com", + "name": "Jane Xu" + }, + "oid": "38c1a2028090353e40a019c673c9ab16b39e4825" + } + }, + { + "commit": { + "author": { + "user": { + "login": "albanD" + }, + "email": "albandes@fb.com", + "name": "Alban Desmaison" + }, + "oid": "8091cbea2c95ed2c4c406b3c61547a27c6319bae" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ezyang" + }, + "email": "ezyang@fb.com", + "name": "Edward Z. Yang" + }, + "oid": "d81f59121969a47c8b2213a88e02cf9be0219be9" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ezyang" + }, + "email": "ezyang@fb.com", + "name": "Edward Z. Yang" + }, + "oid": "20d798b319cd107a767fe220f7a3027c18a1c844" + } + }, + { + "commit": { + "author": { + "user": { + "login": "dzdang" + }, + "email": "dzdang@umich.edu", + "name": "dzdang" + }, + "oid": "eb35381a770b58c1cd41e935910cb4df2f3d8f14" + } + }, + { + "commit": { + "author": { + "user": { + "login": "pytorchmergebot" + }, + "email": "pytorchmergebot@users.noreply.github.com", + "name": "PyTorch MergeBot" + }, + "oid": "e6498a657b9aa47546dcd92d1b4ffb2e1a50ebdb" + } + }, + { + "commit": { + "author": { + "user": { + "login": "dzdang" + }, + "email": "dzdang@umich.edu", + "name": "dzdang" + }, + "oid": "7f821382db5ad08efe5b09a145c606852b8a9272" + } + }, + { + "commit": { + "author": { + "user": { + "login": "albanD" + }, + "email": "albandes@fb.com", + "name": "Alban Desmaison" + }, + "oid": "995c0e11a97d854ff969962bd81d7341e46ecb07" + } + }, + { + "commit": { + "author": { + "user": { + "login": "davidberard98" + }, + "email": "dberard@fb.com", + "name": "David Berard" + }, + "oid": "28d6258e62c9fc361a18689877c962c69889dc23" + } + }, + { + "commit": { + "author": { + "user": { + "login": "HarborYuan" + }, + "email": "yuanhaobo@whu.edu.cn", + "name": "Haobo Yuan" + }, + "oid": "2350fad8391367ebf81c7236a2c883644b4ff622" + } + }, + { + "commit": { + "author": { + "user": { + "login": "zou3519" + }, + "email": "zou3519@gmail.com", + "name": "Richard Zou" + }, + "oid": "3f789c9ccecdd7e2e52269453646e992a68c6b92" + } + }, + { + "commit": { + "author": { + "user": { + "login": "jeffdaily" + }, + "email": "jeff.daily@amd.com", + "name": "Jeff Daily" + }, + "oid": "20f79f610c1a3314da96d49515bbfbee9442e4f8" + } + }, + { + "commit": { + "author": { + "user": { + "login": "peterbell10" + }, + "email": "peterbell10@live.co.uk", + "name": "Peter Bell" + }, + "oid": "5823958f047f3b71a5dc8c52a20eb8ae3291bd3e" + } + }, + { 
+ "commit": { + "author": { + "user": { + "login": "peterbell10" + }, + "email": "peterbell10@live.co.uk", + "name": "Peter Bell" + }, + "oid": "a0b15c49ecf3844daf2c0dcaef44f0214259db20" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ezyang" + }, + "email": "ezyang@fb.com", + "name": "Edward Z. Yang" + }, + "oid": "4afc38c25ca2ca126ba4987a419a58a5c572223b" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ezyang" + }, + "email": "ezyang@fb.com", + "name": "Edward Z. Yang" + }, + "oid": "b606f58d4a36683fbe0a7d02adfdde7d5cc694c2" + } + }, + { + "commit": { + "author": { + "user": { + "login": "albanD" + }, + "email": "albandes@fb.com", + "name": "Alban Desmaison" + }, + "oid": "2d61b4d630f6482a6c3cc7437091fad6d27c347e" + } + }, + { + "commit": { + "author": { + "user": { + "login": "george-qi" + }, + "email": "georgeqi94@gmail.com", + "name": "George Qi" + }, + "oid": "bc5384c47036a6cda94129f3e2f9e43c43393698" + } + }, + { + "commit": { + "author": { + "user": { + "login": "malfet" + }, + "email": "nshulga@fb.com", + "name": "Nikita Shulga" + }, + "oid": "60fc3277634365b64465712b13db2acb76d6c890" + } + }, + { + "commit": { + "author": { + "user": { + "login": "pytorchmergebot" + }, + "email": "pytorchmergebot@users.noreply.github.com", + "name": "PyTorch MergeBot" + }, + "oid": "1b8762e95bc38d1847fe99ed3230546c8b800bfd" + } + }, + { + "commit": { + "author": { + "user": { + "login": "jerryzh168" + }, + "email": "jerryzh168@gmail.com", + "name": "Jerry Zhang" + }, + "oid": "6acf60f95f59ecbc6e8ce830dea0abba7d3ec763" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ysiraichi" + }, + "email": "yukio.siraichi@gmail.com", + "name": "Yukio Siraichi" + }, + "oid": "8fb0276561fdd530c5a06ea195e930e0584f8705" + } + }, + { + "commit": { + "author": { + "user": { + "login": "albanD" + }, + "email": "albandes@fb.com", + "name": "Alban Desmaison" + }, + "oid": "1da7aed95a8700406671425eac1e4bbc2c7a24b5" + } + }, + { + "commit": { + "author": { + "user": { + "login": "thiagocrepaldi" + }, + "email": "thiago.crepaldi@microsoft.com", + "name": "Thiago Crepaldi" + }, + "oid": "83208e7dee4503c1bee1df9f6632794694dffa01" + } + }, + { + "commit": { + "author": { + "user": { + "login": "kshitij12345" + }, + "email": "kshitijkalambarkar@gmail.com", + "name": "kshitij12345" + }, + "oid": "1a46cf08dcd3d3564604c17b2c02d7e4eb45a7ff" + } + }, + { + "commit": { + "author": { + "user": { + "login": "malfet" + }, + "email": "nshulga@fb.com", + "name": "Nikita Shulga" + }, + "oid": "b7f9b6689445f826c83694652fea5f7cfc7070d7" + } + }, + { + "commit": { + "author": { + "user": { + "login": "fatcat-z" + }, + "email": "jiz@microsoft.com", + "name": "Jay Zhang" + }, + "oid": "f273961c1696b156e35f8c76f7ad37934031050d" + } + }, + { + "commit": { + "author": { + "user": { + "login": "pavithranrao" + }, + "email": "pavithran@fb.com", + "name": "Pavithran Ramachandran" + }, + "oid": "eb410a51fcbc716873fd80a970eb932d4aaaea61" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ngimel" + }, + "email": "ngimel@fb.com", + "name": "Natalia Gimelshein" + }, + "oid": "7dbb12cdc02332fa64264ed0df576511a5070d7e" + } + }, + { + "commit": { + "author": { + "user": { + "login": "pytorchmergebot" + }, + "email": "pytorchmergebot@users.noreply.github.com", + "name": "PyTorch MergeBot" + }, + "oid": "43675665fa6b5154de8b25125dd03d7be35c884f" + } + }, + { + "commit": { + "author": { + "user": { + "login": "albanD" + }, + "email": "albandes@fb.com", + "name": "Alban Desmaison" + }, + "oid": 
"6c4d23c402c413667463770d9a2fa801f493d3c5" + } + }, + { + "commit": { + "author": { + "user": { + "login": "pytorchmergebot" + }, + "email": "pytorchmergebot@users.noreply.github.com", + "name": "PyTorch MergeBot" + }, + "oid": "cf3778a35129a40dee14366515201b7ed2c0f346" + } + }, + { + "commit": { + "author": { + "user": { + "login": "dzdang" + }, + "email": "dzdang@umich.edu", + "name": "dzdang" + }, + "oid": "9d00a051373cb81f79cb6375942cf3ec9fff2fe6" + } + }, + { + "commit": { + "author": { + "user": { + "login": "pytorchmergebot" + }, + "email": "pytorchmergebot@users.noreply.github.com", + "name": "PyTorch MergeBot" + }, + "oid": "1eae67cf404aa8dffb80b8e85180f943878d52a6" + } + }, + { + "commit": { + "author": { + "user": { + "login": "janeyx99" + }, + "email": "janeyx@fb.com", + "name": "Jane Xu" + }, + "oid": "ce0e69dcda0fe41a6e964d6ac70ce8016979c71a" + } + }, + { + "commit": { + "author": { + "user": { + "login": "swolchok" + }, + "email": "swolchok@fb.com", + "name": "Scott Wolchok" + }, + "oid": "6faba554f6e49777f24911928edb3061b6ed0e3d" + } + }, + { + "commit": { + "author": { + "user": { + "login": "IvanYashchuk" + }, + "email": "ivan.yashchuk@aalto.fi", + "name": "Ivan Yashchuk" + }, + "oid": "d1d0e03f57a359f8f95331f9a34b8bed3e7cc845" + } + }, + { + "commit": { + "author": { + "user": { + "login": "Chillee" + }, + "email": "chilli@fb.com", + "name": "Horace He" + }, + "oid": "bb46bd9233a9fc631802a902cb48a4c13c2722ca" + } + }, + { + "commit": { + "author": { + "user": { + "login": "mehtanirav" + }, + "email": "niravmehta@fb.com", + "name": "Nirav Mehta" + }, + "oid": "3b1007fe4be12e483f2620fbac67cae42e703efc" + } + }, + { + "commit": { + "author": { + "user": { + "login": "mehtanirav" + }, + "email": "niravmehta@fb.com", + "name": "Nirav Mehta" + }, + "oid": "b4b65228dd0c109f5fdf17c7d9e56f60a98e398b" + } + }, + { + "commit": { + "author": { + "user": { + "login": "albanD" + }, + "email": "albandes@fb.com", + "name": "Alban Desmaison" + }, + "oid": "d629e300705196d3ae0bac5ed983b197101fa2ee" + } + }, + { + "commit": { + "author": { + "user": { + "login": "bigfootjon" + }, + "email": "jonjanzen@fb.com", + "name": "Jon Janzen" + }, + "oid": "52754b9e515f378f8476ad44d75b0a692bad8cde" + } + }, + { + "commit": { + "author": { + "user": { + "login": "samdow" + }, + "email": "samdow@fb.com", + "name": "samdow" + }, + "oid": "128c3ad747093f4970329a82c7c4720420faeff2" + } + }, + { + "commit": { + "author": { + "user": { + "login": "arindamroy-eng" + }, + "email": "61168652+arindamroy-eng@users.noreply.github.com", + "name": "arindamroy-eng" + }, + "oid": "2a0bda7d32a5bcc9827f7254a7b77cceb16ba973" + } + } + ], + "pageInfo": { + "endCursor": "MTAw", + "hasNextPage": true + }, + "totalCount": 131 + }, + "commits": { + "nodes": [ + { + "commit": { + "checkSuites": { + "nodes": [ + { + "app": { + "name": "Facebook GitHub Tools", + "databaseId": 12274 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "Facebook CLA Check", + "conclusion": "SUCCESS", + "detailsUrl": "https://code.intern.facebook.com/cla/" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAWuNRg4=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193693698" + }, + { + "app": { + "name": "Netlify", + "databaseId": 13473 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": 
"https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193693712" + }, + { + "app": { + "name": "Azure Pipelines", + "databaseId": 9426 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193693725" + }, + { + "app": { + "name": "Dependabot", + "databaseId": 29110 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193693741" + }, + { + "app": { + "name": "Codecov", + "databaseId": 254 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193693761" + }, + { + "app": { + "name": "PyTorch Bot", + "databaseId": 40112 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193693774" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "TorchBench CI (pytorch-linux-py3.7-cu102)" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "run-torchbench", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099388390?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAWuNR-Y=", + "hasNextPage": false + } + }, + "conclusion": "SKIPPED", + "url": "https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193694412" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "Test collect_env (with_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099431378?check_suite_focus=true" + }, + { + "name": "Test collect_env (without_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099431511?check_suite_focus=true" + }, + { + "name": "toc", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099431693?check_suite_focus=true" + }, + { + "name": "Test tools", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099431829?check_suite_focus=true" + }, + { + "name": "quick-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099432018?check_suite_focus=true" + }, + { + "name": "lintrunner", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099432195?check_suite_focus=true" + }, + { + "name": "workflow-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099432331?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAWuN84s=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": 
"https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193694417" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pull" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "linux-xenial-py3.7-gcc7-no-ops / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099430906?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099431117?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099431312?check_suite_focus=true" + }, + { + "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099431677?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099431819?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099432057?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099432191?check_suite_focus=true" + }, + { + "name": "linux-bionic-rocm5.0-py3.7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099432334?check_suite_focus=true" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099432446?check_suite_focus=true" + }, + { + "name": "linux-bionic-cuda11.3-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099432577?check_suite_focus=true" + }, + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099432685?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099432822?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099432932?check_suite_focus=true" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099433128?check_suite_focus=true" + }, + { + "name": "win-vs2019-cuda11.3-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099433280?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3-clang5-mobile-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099433402?check_suite_focus=true" + }, + { + "name": "win-vs2019-cpu-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099433542?check_suite_focus=true" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / 
build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099433675?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099433758?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099433859?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099554424?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099554523?check_suite_focus=true" + }, + { + "name": "linux-docs / build-docs (cpp)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099557184?check_suite_focus=true" + }, + { + "name": "linux-docs / build-docs (python)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099557310?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099557449?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099557512?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099557588?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099557655?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099557717?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099557795?check_suite_focus=true" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099565740?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099565906?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099565972?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099566036?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099580613?check_suite_focus=true" + }, 
+ { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099580676?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099608194?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099608322?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099608371?check_suite_focus=true" + }, + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8 / test (xla, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099619007?check_suite_focus=true" + }, + { + "name": "linux-bionic-rocm5.0-py3.7 / test (default, 1, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099645951?check_suite_focus=true" + }, + { + "name": "linux-bionic-rocm5.0-py3.7 / test (default, 2, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099646089?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099685555?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099685664?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099685757?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099689530?check_suite_focus=true" + }, + { + "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099757872?check_suite_focus=true" + }, + { + "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099757955?check_suite_focus=true" + }, + { + "name": "win-vs2019-cuda11.3-py3 / test (default, 1, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099898234?check_suite_focus=true" + }, + { + "name": "win-vs2019-cuda11.3-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099898323?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAWuVD9M=", + "hasNextPage": true + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193694439" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRuc=", + "hasNextPage": false + } + }, + 
"oid": "5696e8357cf38f852ef3d680381513e26f202371" + } + } + ] + }, + "changedFiles": 348, + "files": { + "nodes": [ + { + "path": ".circleci/cimodel/data/pytorch_build_data.py" + }, + { + "path": ".circleci/cimodel/data/pytorch_build_definitions.py" + }, + { + "path": ".circleci/scripts/cpp_doc_push_script.sh" + }, + { + "path": ".circleci/scripts/python_doc_push_script.sh" + }, + { + "path": ".github/actions/checkout-pytorch/action.yml" + }, + { + "path": ".github/merge_rules.json" + }, + { + "path": ".github/scripts/gitutils.py" + }, + { + "path": ".github/scripts/gql_mocks.json" + }, + { + "path": ".github/scripts/trymerge.py" + }, + { + "path": ".github/workflows/_bazel-build-test.yml" + }, + { + "path": ".github/workflows/_linux-build.yml" + }, + { + "path": ".github/workflows/_linux-test.yml" + }, + { + "path": ".github/workflows/_mac-test.yml" + }, + { + "path": ".github/workflows/_rocm-test.yml" + }, + { + "path": ".github/workflows/_win-test.yml" + }, + { + "path": ".github/workflows/buck_build_test.yml" + }, + { + "path": ".github/workflows/lint.yml" + }, + { + "path": ".github/workflows/periodic.yml" + }, + { + "path": ".github/workflows/pull.yml" + }, + { + "path": ".github/workflows/trunk.yml" + }, + { + "path": ".jenkins/pytorch/macos-test.sh" + }, + { + "path": ".jenkins/pytorch/test.sh" + }, + { + "path": ".jenkins/pytorch/win-test.sh" + }, + { + "path": ".lintrunner.toml" + }, + { + "path": "BUILD.bazel" + }, + { + "path": "CODEOWNERS" + }, + { + "path": "README.md" + }, + { + "path": "aten/src/ATen/BatchingRegistrations.cpp" + }, + { + "path": "aten/src/ATen/Dispatch.h" + }, + { + "path": "aten/src/ATen/ExpandUtils.h" + }, + { + "path": "aten/src/ATen/FunctionalInverses.cpp" + }, + { + "path": "aten/src/ATen/FunctionalStorageImpl.cpp" + }, + { + "path": "aten/src/ATen/FunctionalStorageImpl.h" + }, + { + "path": "aten/src/ATen/FunctionalTensorWrapper.cpp" + }, + { + "path": "aten/src/ATen/FunctionalTensorWrapper.h" + }, + { + "path": "aten/src/ATen/FunctionalizeFallbackKernel.cpp" + }, + { + "path": "aten/src/ATen/NestedTensorImpl.cpp" + }, + { + "path": "aten/src/ATen/OpMathType.h" + }, + { + "path": "aten/src/ATen/SparseCsrTensorUtils.h" + }, + { + "path": "aten/src/ATen/ThreadLocalState.cpp" + }, + { + "path": "aten/src/ATen/ThreadLocalState.h" + }, + { + "path": "aten/src/ATen/autocast_mode.cpp" + }, + { + "path": "aten/src/ATen/autocast_mode.h" + }, + { + "path": "aten/src/ATen/core/SymIntArrayRef.cpp" + }, + { + "path": "aten/src/ATen/core/SymIntArrayRef.h" + }, + { + "path": "aten/src/ATen/core/TensorBase.h" + }, + { + "path": "aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h" + }, + { + "path": "aten/src/ATen/core/dispatch/Dispatcher.h" + }, + { + "path": "aten/src/ATen/core/interned_strings.h" + }, + { + "path": "aten/src/ATen/core/ivalue.cpp" + }, + { + "path": "aten/src/ATen/core/ivalue.h" + }, + { + "path": "aten/src/ATen/core/ivalue_inl.h" + }, + { + "path": "aten/src/ATen/core/jit_type.h" + }, + { + "path": "aten/src/ATen/core/jit_type_base.h" + }, + { + "path": "aten/src/ATen/core/type.cpp" + }, + { + "path": "aten/src/ATen/cuda/CUDASparse.h" + }, + { + "path": "aten/src/ATen/cuda/llvm_complex.cpp" + }, + { + "path": "aten/src/ATen/cuda/llvm_jit_strings.h" + }, + { + "path": "aten/src/ATen/native/Blas.cpp" + }, + { + "path": "aten/src/ATen/native/Itertools.cpp" + }, + { + "path": "aten/src/ATen/native/LinearAlgebra.cpp" + }, + { + "path": "aten/src/ATen/native/SoftMax.cpp" + }, + { + "path": "aten/src/ATen/native/TensorConversions.cpp" + }, + 
{ + "path": "aten/src/ATen/native/TensorShape.cpp" + }, + { + "path": "aten/src/ATen/native/TensorShape.h" + }, + { + "path": "aten/src/ATen/native/Unique.cpp" + }, + { + "path": "aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu" + }, + { + "path": "aten/src/ATen/native/cuda/CUDAJitLoops.cuh" + }, + { + "path": "aten/src/ATen/native/cuda/JitLoops.cuh" + }, + { + "path": "aten/src/ATen/native/cuda/Lerp.cu" + }, + { + "path": "aten/src/ATen/native/cuda/PersistentSoftmax.cuh" + }, + { + "path": "aten/src/ATen/native/cuda/SoftMax.cu" + }, + { + "path": "aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu" + }, + { + "path": "aten/src/ATen/native/cuda/Unique.cu" + }, + { + "path": "aten/src/ATen/native/cuda/jit_utils.cpp" + }, + { + "path": "aten/src/ATen/native/cuda/jit_utils.h" + }, + { + "path": "aten/src/ATen/native/native_functions.yaml" + }, + { + "path": "aten/src/ATen/native/nested/NestedTensorMath.cpp" + }, + { + "path": "aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp" + }, + { + "path": "aten/src/ATen/native/quantized/cpu/qsoftmax.cpp" + }, + { + "path": "aten/src/ATen/native/quantized/cudnn/BinaryOps.cpp" + }, + { + "path": "aten/src/ATen/native/quantized/cudnn/Linear.cpp" + }, + { + "path": "aten/src/ATen/native/quantized/cudnn/utils.h" + }, + { + "path": "aten/src/ATen/native/sparse/SparseCsrTensor.cpp" + }, + { + "path": "aten/src/ATen/native/ts_native_functions.yaml" + }, + { + "path": "aten/src/ATen/record_function.cpp" + }, + { + "path": "aten/src/ATen/record_function.h" + }, + { + "path": "aten/src/ATen/templates/Operators.h" + }, + { + "path": "aten/src/ATen/templates/RegisterFunctionalization.cpp" + }, + { + "path": "aten/src/ATen/test/basic.cpp" + }, + { + "path": "aten/src/ATen/test/vmap_test.cpp" + }, + { + "path": "binaries/record_function_benchmark.cc" + }, + { + "path": "c10/core/DispatchKey.cpp" + }, + { + "path": "c10/core/DispatchKey.h" + }, + { + "path": "c10/core/DispatchKeySet.h" + }, + { + "path": "c10/test/core/DispatchKeySet_test.cpp" + }, + { + "path": "c10/util/ArrayRef.h" + }, + { + "path": "caffe2/core/tensor.h" + }, + { + "path": "docs/source/conf.py" + }, + { + "path": "docs/source/fx.rst" + } + ], + "pageInfo": { + "endCursor": "MTAw", + "hasNextPage": true + } + }, + "reviews": { + "nodes": [], + "pageInfo": { + "startCursor": null, + "hasPreviousPage": false + } + }, + "comments": { + "nodes": [ + { + "bodyText": "Merge failed due to Matched rule superuser, but it was not reviewed yet by any of:hongxiayang,janeyx99,mehdimashayekhi,tvalentius,yidawang-oss, ...", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1104214220 + }, + { + "bodyText": "Merge failed due to Matched rule superuser, but it was not reviewed yet by any of:zou3519,abhikrish,mehtanirav,wconstab,lc0, ...", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1104215370 + }, + { + "bodyText": "Merge failed due to Matched rule superuser, but PR has not been reviewed yet", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1104220908 + }, + { + "bodyText": "@pytorchbot merge this", + "author": { + "login": "malfet" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1104378397 + }, + { + "bodyText": "Merge failed due to Matched rule superuser, but PR has not been reviewed yet\nRaised by https://github.com/pytorch/pytorch/actions/runs/2197877090", + "author": { + 
"login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1104379712 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOQdD4zA==", + "hasPreviousPage": true + } + } + } + } + } + }, + "query_sha=9a7ea963024cb39819e4a560d8d95f41bb3e0dad12c6f05539a994d6f7c38c34 cursor=MTAw name=pytorch number=76118 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": { + "login": "clee2000" + }, + "email": "csl@fb.com", + "name": "Catherine Lee" + }, + "oid": "7f560351ae04ea43e58fbfda885bcf216aa26cde" + } + }, + { + "commit": { + "author": { + "user": { + "login": "pytorchmergebot" + }, + "email": "pytorchmergebot@users.noreply.github.com", + "name": "PyTorch MergeBot" + }, + "oid": "e8677ed168a036bc7e590d800fe98dd15f10581b" + } + }, + { + "commit": { + "author": { + "user": { + "login": "robieta" + }, + "email": "taylorrobie@fb.com", + "name": "Taylor Robie" + }, + "oid": "ac5611caa13642ef8dbe0db453b283b42cbd900b" + } + }, + { + "commit": { + "author": { + "user": { + "login": "robieta" + }, + "email": "taylorrobie@fb.com", + "name": "Taylor Robie" + }, + "oid": "1184afbd3bfde0f46133aef09e55e18d3bfb3c3e" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "msi@fb.com", + "name": "Min Si" + }, + "oid": "1c05604f3d049c67dc678d0295c0add470bff3dc" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "eellison@devfair044.h1.fair", + "name": "Elias Ellison" + }, + "oid": "76ab5101bd36e8d73637d31bbea125240b7b27f0" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "eellison@devfair044.h1.fair", + "name": "Elias Ellison" + }, + "oid": "c774050e92c3d8e52968e1eb635dd3e9491104b3" + } + }, + { + "commit": { + "author": { + "user": { + "login": "guoyejun" + }, + "email": "yejun.guo@intel.com", + "name": "Guo Yejun" + }, + "oid": "8981595c5361f07186f4534f3be71f1d829a3046" + } + }, + { + "commit": { + "author": { + "user": { + "login": "BowenBao" + }, + "email": "bowbao@microsoft.com", + "name": "BowenBao" + }, + "oid": "036f362904024ac9481248965009f312bec6656b" + } + }, + { + "commit": { + "author": { + "user": { + "login": "janeyx99" + }, + "email": "janeyx@fb.com", + "name": "Jane Xu" + }, + "oid": "457d994933f164a9fd70da5ca2733dd6c046a28b" + } + }, + { + "commit": { + "author": { + "user": { + "login": "janeyx99" + }, + "email": "janeyx@fb.com", + "name": "Jane Xu" + }, + "oid": "f49ebc77520774e71722111d554a0215a26956df" + } + }, + { + "commit": { + "author": { + "user": { + "login": "mikeiovine" + }, + "email": "mikeiovine@fb.com", + "name": "Mike Iovine" + }, + "oid": "f069e1a4a5f98d3fe961e4fc562ede59f59b4026" + } + }, + { + "commit": { + "author": { + "user": { + "login": "salilsdesai" + }, + "email": "salilsdesai@fb.com", + "name": "Salil Desai" + }, + "oid": "30bccf58393b288412a0f5a2423a1a41ffce258e" + } + }, + { + "commit": { + "author": { + "user": { + "login": "angelayi" + }, + "email": "angelayi@fb.com", + "name": "Angela Yi" + }, + "oid": "f4ba440fe8a632c1ee88e01f7746a8a92c8f3902" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "shirong@fb.com", + "name": "Shirong Wu" + }, + "oid": "d203346c93ba96d626c6c02910888198c789ba69" + } + }, + { + "commit": { + "author": { + "user": { + "login": "jamesr66a" + }, + "email": "jamesreed@fb.com", + "name": "James Reed" + }, + "oid": "73a4e34963e212b799a191fd031d2fa31d17e0ac" + } + }, + { + "commit": { + "author": { + "user": { + "login": "Krovatkin" + }, + 
"email": "korovaikon@gmail.com", + "name": "Nikolay Korovaiko" + }, + "oid": "b9d5206dfb46f09f953aba3ffb0e1e33a99032ee" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ngimel" + }, + "email": "ngimel@fb.com", + "name": "Natalia Gimelshein" + }, + "oid": "12114e6937573fead54e11ae6cdebe5b31dee302" + } + }, + { + "commit": { + "author": { + "user": { + "login": "s4ayub" + }, + "email": "shababayub@fb.com", + "name": "Shabab Ayub" + }, + "oid": "f2323f76ad6f7f590285bf9c6d20c14a79542563" + } + }, + { + "commit": { + "author": { + "user": { + "login": "jaglinux" + }, + "email": "jagdish.krishna@gmail.com", + "name": "Jagadish Krishnamoorthy" + }, + "oid": "acd4b5abe2739c09c1a02524eceda46ff93fd385" + } + }, + { + "commit": { + "author": { + "user": { + "login": "cccclai" + }, + "email": "chenlai@fb.com", + "name": "Chen Lai" + }, + "oid": "04179f533283132fa334a9f91a070b1712f7323d" + } + }, + { + "commit": { + "author": { + "user": { + "login": "zaxtax" + }, + "email": "rob@zinkov.com", + "name": "Rob Zinkov" + }, + "oid": "5097cdcd6994ad82b3cec942b70e75dbeaee8ca4" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ezyang" + }, + "email": "ezyang@fb.com", + "name": "Edward Z. Yang" + }, + "oid": "5015ecb5a2b86943f457d71f5a977444dd062732" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ezyang" + }, + "email": "ezyang@fb.com", + "name": "Edward Z. Yang" + }, + "oid": "1c42b7789d3966cd541b08fce359b9738fee69f6" + } + }, + { + "commit": { + "author": { + "user": { + "login": "albanD" + }, + "email": "albandes@fb.com", + "name": "Alban Desmaison" + }, + "oid": "893ac3d334fd3e85e22423a06fe986ce453fe304" + } + }, + { + "commit": { + "author": { + "user": { + "login": "emcastillo" + }, + "email": "ecastill@preferred.jp", + "name": "Emilio Castillo" + }, + "oid": "aa5d1b6b031ee2b8bb85f793a842ac1327ae4a19" + } + }, + { + "commit": { + "author": { + "user": { + "login": "dzdang" + }, + "email": "dzdang@umich.edu", + "name": "dzdang" + }, + "oid": "0707a1d00f33d7098f56de339cb30436e8c2ea44" + } + }, + { + "commit": { + "author": { + "user": { + "login": "NivekT" + }, + "email": "ktse@fb.com", + "name": "Kevin Tse" + }, + "oid": "ccb082d42af99f6374183cf914cc712bac585f0f" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ryandaryl" + }, + "email": "ryandarylmills@gmail.com", + "name": "ryandaryl" + }, + "oid": "4f2909cc8747808786a1871b0a6825cc4566f48c" + } + }, + { + "commit": { + "author": { + "user": { + "login": "clee2000" + }, + "email": "csl@fb.com", + "name": "Catherine Lee" + }, + "oid": "f764010648a29223d9ed4b955073d9d2fb1b2f43" + } + }, + { + "commit": { + "author": { + "user": { + "login": "malfet" + }, + "email": "nshulga@fb.com", + "name": "Nikita Shulga" + }, + "oid": "5696e8357cf38f852ef3d680381513e26f202371" + } + } + ], + "pageInfo": { + "endCursor": "MTMx", + "hasNextPage": false + } + } + } + } + } + }, + "query_sha=f357fcdcc09e1433307da0c33debf9c08279ffd70d5bbc31323994f6346c629f name=pytorch number=76123 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "closed": true, + "isCrossRepository": true, + "author": { + "login": "kumpera" + }, + "title": "Introduce distributed checkpoint with ShardedTensor.", + "body": "Co-authored-by: Wen Zhang \r\nCo-authored-by: Yifu Wang \r\n\r\n", + "headRefName": "st_checkpoint", + "headRepository": { + "nameWithOwner": "kumpera/pytorch" + }, + "baseRefName": "master", + "baseRepository": { + "nameWithOwner": "pytorch/pytorch", + "isPrivate": false, + "defaultBranchRef": { + 
"name": "master" + } + }, + "mergeCommit": null, + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": { + "login": "kumpera" + }, + "email": "kumpera@fb.com", + "name": "Rodrigo Kumpera" + }, + "oid": "6bf248bc20a71f248064b795f38276326fe43aae" + } + }, + { + "commit": { + "author": { + "user": { + "login": "kumpera" + }, + "email": "kumpera@fb.com", + "name": "Rodrigo Kumpera" + }, + "oid": "10f84fb90bf02d7062e565ebf2c1da6352b64db7" + } + }, + { + "commit": { + "author": { + "user": { + "login": "kumpera" + }, + "email": "kumpera@fb.com", + "name": "Rodrigo Kumpera" + }, + "oid": "96c5299740ec791f3cf0975c03a40a7b219b6747" + } + } + ], + "pageInfo": { + "endCursor": "Mw", + "hasNextPage": false + }, + "totalCount": 3 + }, + "commits": { + "nodes": [ + { + "commit": { + "checkSuites": { + "nodes": [ + { + "app": { + "name": "Facebook GitHub Tools", + "databaseId": 12274 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "Facebook CLA Check", + "conclusion": "SUCCESS", + "detailsUrl": "https://code.intern.facebook.com/cla/" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXgS2l4=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/96c5299740ec791f3cf0975c03a40a7b219b6747/checks?check_suite_id=6380755666" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "TorchBench CI (pytorch-linux-py3.7-cu102)" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "run-torchbench", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299234164?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXd2r3Q=", + "hasNextPage": false + } + }, + "conclusion": "SKIPPED", + "url": "https://github.com/pytorch/pytorch/commit/96c5299740ec791f3cf0975c03a40a7b219b6747/checks?check_suite_id=6380755785" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "quick-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299234165?check_suite_focus=true" + }, + { + "name": "toc", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299234428?check_suite_focus=true" + }, + { + "name": "lintrunner", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299234555?check_suite_focus=true" + }, + { + "name": "Test collect_env (with_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299234642?check_suite_focus=true" + }, + { + "name": "Test collect_env (without_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299234701?check_suite_focus=true" + }, + { + "name": "Test tools", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299234761?check_suite_focus=true" + }, + { + "name": "workflow-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299234837?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXd2shU=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/96c5299740ec791f3cf0975c03a40a7b219b6747/checks?check_suite_id=6380755786" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + 
}, + "workflowRun": { + "workflow": { + "name": "pull" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299245858?check_suite_focus=true" + }, + { + "name": "linux-bionic-cuda11.3-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299245958?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299246168?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299246250?check_suite_focus=true" + }, + { + "name": "win-vs2019-cpu-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299246281?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299246329?check_suite_focus=true" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299246373?check_suite_focus=true" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299246442?check_suite_focus=true" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299246517?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299246547?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299246591?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299246687?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299246843?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7-no-ops / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299246972?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3-clang5-mobile-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299247064?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299247163?check_suite_focus=true" + }, + { + "name": "win-vs2019-cuda11.3-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299247261?check_suite_focus=true" + }, + { + "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299247380?check_suite_focus=true" + }, + { + "name": 
"pytorch-xla-linux-bionic-py3.7-clang8 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299247471?check_suite_focus=true" + }, + { + "name": "linux-bionic-rocm5.1-py3.7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299247519?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299305596?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299305656?check_suite_focus=true" + }, + { + "name": "linux-docs / build-docs (cpp)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299307925?check_suite_focus=true" + }, + { + "name": "linux-docs / build-docs (python)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299307961?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299308001?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299308035?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299308082?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299308120?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299308169?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299308217?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299312986?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299313146?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299313195?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299313235?check_suite_focus=true" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299313977?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/runs/6299314888?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299314937?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 4, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299332358?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 4, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299332420?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 4, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299332476?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 4, 4, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299332526?check_suite_focus=true" + }, + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8 / test (xla, 1, 1, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299335580?check_suite_focus=true" + }, + { + "name": "linux-bionic-rocm5.1-py3.7 / test (default, 1, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299375031?check_suite_focus=true" + }, + { + "name": "linux-bionic-rocm5.1-py3.7 / test (default, 2, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299375079?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299377190?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299378010?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299378053?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 2, linux.8xlarge.nvidia.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299378105?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 2, 2, linux.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299378136?check_suite_focus=true" + }, + { + "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299437798?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXd5yuY=", + "hasNextPage": true + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/96c5299740ec791f3cf0975c03a40a7b219b6747/checks?check_suite_id=6380755806" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "lintrunner", + "conclusion": "SUCCESS", + 
"detailsUrl": "https://github.com/pytorch/pytorch/runs/6309468155?check_suite_focus=true" + }, + { + "name": "quick-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309468457?check_suite_focus=true" + }, + { + "name": "Test collect_env (with_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309468841?check_suite_focus=true" + }, + { + "name": "Test collect_env (without_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309468942?check_suite_focus=true" + }, + { + "name": "Test tools", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309469180?check_suite_focus=true" + }, + { + "name": "workflow-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309469314?check_suite_focus=true" + }, + { + "name": "toc", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309469473?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXgS3SE=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/96c5299740ec791f3cf0975c03a40a7b219b6747/checks?check_suite_id=6390363240" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "TorchBench CI (pytorch-linux-py3.7-cu102)" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "run-torchbench", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309468138?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXgS1-o=", + "hasNextPage": false + } + }, + "conclusion": "SKIPPED", + "url": "https://github.com/pytorch/pytorch/commit/96c5299740ec791f3cf0975c03a40a7b219b6747/checks?check_suite_id=6390363271" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pull" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "linux-bionic-rocm5.1-py3.7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309468956?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309469237?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309469475?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7-no-ops / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309469750?check_suite_focus=true" + }, + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309470049?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309470368?check_suite_focus=true" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309470787?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309471290?check_suite_focus=true" + }, + { + 
"name": "linux-xenial-py3-clang5-mobile-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309471585?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309471734?check_suite_focus=true" + }, + { + "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309472014?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309472172?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309472411?check_suite_focus=true" + }, + { + "name": "linux-bionic-cuda11.3-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309472715?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309473041?check_suite_focus=true" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309473226?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309473414?check_suite_focus=true" + }, + { + "name": "win-vs2019-cpu-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309473700?check_suite_focus=true" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309473992?check_suite_focus=true" + }, + { + "name": "win-vs2019-cuda11.3-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309474162?check_suite_focus=true" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309647069?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309647413?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309647538?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309657055?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309657196?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309657332?check_suite_focus=true" + }, + { + 
"name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309657575?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309657726?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309657858?check_suite_focus=true" + }, + { + "name": "linux-docs / build-docs (cpp)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309658314?check_suite_focus=true" + }, + { + "name": "linux-docs / build-docs (python)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309658433?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309665388?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309665513?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309665597?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309665697?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309672367?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309672499?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 4, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309696458?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 4, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309696554?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 4, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309696638?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 4, 4, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309696725?check_suite_focus=true" + }, + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8 / test (xla, 1, 1, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309712838?check_suite_focus=true" + }, + { + "name": "linux-bionic-rocm5.1-py3.7 / test (default, 1, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309767601?check_suite_focus=true" + }, + { + "name": "linux-bionic-rocm5.1-py3.7 / test (default, 2, 2, linux.rocm.gpu)", + 
"conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309767717?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309792321?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309792407?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 2, linux.8xlarge.nvidia.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309792546?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 2, 2, linux.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309792639?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309792972?check_suite_focus=true" + }, + { + "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309939578?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXgaCXo=", + "hasNextPage": true + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/96c5299740ec791f3cf0975c03a40a7b219b6747/checks?check_suite_id=6390363300" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXzlNKQ=", + "hasNextPage": false + } + }, + "oid": "96c5299740ec791f3cf0975c03a40a7b219b6747" + } + } + ] + }, + "changedFiles": 11, + "files": { + "nodes": [ + { + "path": "test/distributed/_shard/checkpoint/test_checkpoint.py" + }, + { + "path": "test/distributed/_shard/checkpoint/test_file_system_checkpoint.py" + }, + { + "path": "test/distributed/_shard/sharded_tensor/test_sharded_tensor.py" + }, + { + "path": "torch/distributed/_shard/checkpoint/__init__.py" + }, + { + "path": "torch/distributed/_shard/checkpoint/filesystem.py" + }, + { + "path": "torch/distributed/_shard/checkpoint/metadata.py" + }, + { + "path": "torch/distributed/_shard/checkpoint/resharding.py" + }, + { + "path": "torch/distributed/_shard/checkpoint/state_dict_loader.py" + }, + { + "path": "torch/distributed/_shard/checkpoint/state_dict_saver.py" + }, + { + "path": "torch/distributed/_shard/checkpoint/storage.py" + }, + { + "path": "torch/testing/_internal/distributed/_shard/sharded_tensor/_test_st_common.py" + } + ], + "pageInfo": { + "endCursor": "MTE", + "hasNextPage": false + } + }, + "reviews": { + "nodes": [ + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "zzzwen" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "zzzwen" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, 
+ "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "wanchaol" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "zzzwen" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "zzzwen" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "simpkins" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "zzzwen" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "zzzwen" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "simpkins" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "simpkins" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "pritamdamania87" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "pritamdamania87" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "pritamdamania87" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "wilson100hong" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "wilson100hong" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "wilson100hong" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "xunnanxu" + }, + "state": "DISMISSED" + }, + { + "author": { + "login": "xunnanxu" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "xunnanxu" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + 
"login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "xunnanxu" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "xunnanxu" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "xunnanxu" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "pritamdamania87" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "pritamdamania87" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "pritamdamania87" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "pritamdamania87" + }, + "state": "APPROVED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wNC0yNVQxMTozNTowMS0wNzowMLkyMDIyLTA0LTI1VDExOjM1OjAwLTA3OjAwzjjC2d0=", + "hasPreviousPage": true + } + }, + "comments": { + "nodes": [ + { + "bodyText": "Merge failed due to Can't fetch all PR reviews\nRaised by https://github.com/pytorch/pytorch/actions/runs/2275691136", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1118495479 + }, + { + "bodyText": "Merge failed due to Can't fetch 
all PR reviews\nRaised by https://github.com/pytorch/pytorch/actions/runs/2275691136", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1118511287 + }, + { + "bodyText": "Merge failed due to Can't fetch all PR reviews\nRaised by https://github.com/pytorch/pytorch/actions/runs/2275691136", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1118662274 + }, + { + "bodyText": "Merge failed due to Can't fetch all PR reviews Raised by https://github.com/pytorch/pytorch/actions/runs/2275691136\n\n@osalpekar @malfet This is failing because there are 109 review comments on this PR but we only fetch the first 100. This could be solved with a similar concept as how we fetch more comments/check_runs.", + "author": { + "login": "janeyx99" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1118689010 + }, + { + "bodyText": "On a side note, has the test_fsdp_clip_grad_norm_norm_type_2_0_nested_fsdp_False_cpu_offload_CPUOffload failure on the distributed test first shard of this PR been addressed?", + "author": { + "login": "janeyx99" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1118693497 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOQqri9w==", + "hasPreviousPage": true + } + } + } + } + } + }, + "query_sha=cc0db92500f836c7fc4f9a0235a75b77562e6e4ab939b5cbe5584078df1c22d2 cursor=Y3Vyc29yOnYyOpO5MjAyMi0wNC0yNVQxMTozNTowMS0wNzowMLkyMDIyLTA0LTI1VDExOjM1OjAwLTA3OjAwzjjC2d0= name=pytorch number=76123 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "reviews": { + "nodes": [ + { + "author": { + "login": "pritamdamania87" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wNC0yMlQyMDozNzo1NC0wNzowMLkyMDIyLTA0LTIyVDE2OjAyOjA5LTA3OjAwzjip7G8=", + "hasPreviousPage": false + } + } + } + } + } + }, + "query_sha=f357fcdcc09e1433307da0c33debf9c08279ffd70d5bbc31323994f6346c629f name=pytorch number=71759 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "closed": true, + "isCrossRepository": true, + "author": { + "login": "coolteemf" + }, + "title": "Optimize grid sample 3d", + "body": "Fixes #71415\r\nI have implemented the changes that replicate what @to-mi did in this [PR](https://github.com/pytorch/pytorch/pull/65986#issue-1012959443) for the 3D case :\r\n\r\n> Fixes #64977\r\n> \r\n> Avoids creating a tensor for and calculating `input` gradient if it's not needed in the backward pass of `grid_sample` (2d case, native CPU & CUDA kernels). Especially the tensor creation seemed time consuming (see #64977).\r\n> \r\n> Brief description of the changes:\r\n> \r\n> * I have tried to go with rather minimal changes. 
It would probably be possible to make a more elegant version with a bit larger refactoring (or possibly with better understanding of PyTorch internals and C++ functionalities).\r\n> \r\n> * Changed the `native_functions.yaml` and `derivatives.yaml` so that the gradient input mask is passed to the functions.\r\n> \r\n> * Changed the CPU kernels:\r\n> (1) added `bool input_requires_grad` template parameter to the `backward` function,\r\n> (2) added if branches based on it to remove `input` gradient computations if it's not requested,\r\n> (3) feed in `TensorAccessor* gInp_slice_ptr` instead of `TensorAccessor& gInp_slice` so that I can pass a `nullptr` in case gradient for `input` is not requested. (A bit inelegant perhaps, but allows to keep one signature for `backward` function and not require breaking it to smaller pieces. Perhaps there's a more elegant way to achieve this?)\r\n> \r\n> * Changed CUDA kernel:\r\n> (1) added ~`bool input_requires_grad` template parameter~ `const bool input_requires_grad` argument to the `backward` function,\r\n> (2) added if branches based on it to remove `input` gradient computations if it's not requested,\r\n> (3) feed in `TensorInfo()` instead of `getTensorInfo(grad_input)` in case gradient for `input` is not requested.\r\n> \r\n> * Modified tests in `test/test_nn.py` so that they run also cases with no `input` gradient needed.\r\n> \r\n> * Have not touched the CPU fallback kernel.\r\n\r\nNote: the changes number (3) are N/A in this case.\r\n\r\n", + "headRefName": "optimize_grid_sample_3d", + "headRepository": { + "nameWithOwner": "coolteemf/pytorch" + }, + "baseRefName": "master", + "baseRepository": { + "nameWithOwner": "pytorch/pytorch", + "isPrivate": false, + "defaultBranchRef": { + "name": "master" + } + }, + "mergeCommit": null, + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "e0b0d1e695aeddceaf265da602c4704592053e9e" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "563ec73747ad53b63b36736c47c4342f962c2a09" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "51abe41a132d9dd5b1c0551bdca902aacc028ff8" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "be9898205992034a00e8ace8a55c2ecdcee2c2f8" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "2929c60b64384c2deae0f7dea8bab94ad4bc9ec8" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "9241b737e7e2b257905cc74ad9c50b737d7f9d0a" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "64d6b795d0636928a8aa2fd3da01302fb5f5f7af" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "4503577e53760a0006f1e80ca6bfe04d2be90470" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "b16f4b11ffbbbf2ca2098f9702af4ef6b6fc5e1f" 
+ } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "7ffc23368a604afdc92d2818747f730ce31a2bb5" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "b85292604b9ad6c31706b76b5a5498c4f6d94309" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "9d81d7bae8ad91aaa24b3ceab83e3138894dbc69" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "e79f6a2202512b294c55bf4bfb2e0524fafd4c48" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "f683e8aec7aea76097a264eec01511e704c31154" + } + }, + { + "commit": { + "author": { + "user": { + "login": "coolteemf" + }, + "email": "67541941+coolteemf@users.noreply.github.com", + "name": "Fran\u00e7ois Lecomte" + }, + "oid": "b932e9e286c22aaf352375186df851ef060b295a" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "346e0c547953d98eb84d23c1391a95badb9c4a22" + } + } + ], + "pageInfo": { + "endCursor": "MTY", + "hasNextPage": false + }, + "totalCount": 16 + }, + "commits": { + "nodes": [ + { + "commit": { + "checkSuites": { + "nodes": [ + { + "app": { + "name": "Facebook GitHub Tools", + "databaseId": 12274 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "Facebook CLA Check", + "conclusion": "SUCCESS", + "detailsUrl": "https://code.intern.facebook.com/cla/" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGYqY=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801320" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-clang7-onnx" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020089?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302165846?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302165949?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwIob0=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801849" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3-clang5-mobile-build" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302019921?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGZ1E=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": 
"https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801852" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-bionic-rocm4.5-py3.7" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302019934?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302431993?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302432078?check_suite_focus=true" + }, + { + "name": "test (distributed, 1, 1, linux.rocm.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302432150?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwMsZY=", + "hasNextPage": false + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801853" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "win-vs2019-cuda11.3-py3" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302019928?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5303266925?check_suite_focus=true" + }, + { + "name": "test (force_on_cpu, 1, 1, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5303267017?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5303267128?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwZbzg=", + "hasNextPage": false + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801855" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "mypy", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302019930?check_suite_focus=true" + }, + { + "name": "shellcheck", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020111?check_suite_focus=true" + }, + { + "name": "py2-setup-validate-errormsg", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020318?check_suite_focus=true" + }, + { + "name": "clang-format", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020421?check_suite_focus=true" + }, + { + "name": "cmakelint", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020539?check_suite_focus=true" + }, + { + "name": "toc", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020668?check_suite_focus=true" + }, + { + "name": "quick-checks", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/runs/5302020780?check_suite_focus=true" + }, + { + "name": "clang-tidy", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020970?check_suite_focus=true" + }, + { + "name": "flake8-py3", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302021124?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGbAQ=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801856" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-clang7-asan" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020084?check_suite_focus=true" + }, + { + "name": "test (default, 3, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302192846?check_suite_focus=true" + }, + { + "name": "test (default, 1, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302192926?check_suite_focus=true" + }, + { + "name": "test (default, 2, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302193029?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwJC4U=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801857" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020092?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGZ_w=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801862" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc5.4" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020048?check_suite_focus=true" + }, + { + "name": "test (backwards_compat, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302147216?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302147336?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302147409?check_suite_focus=true" + }, + { + "name": "test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302147493?check_suite_focus=true" + }, + { + "name": "test (jit_legacy, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", 
+ "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302147622?check_suite_focus=true" + }, + { + "name": "test (docs_test, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302147822?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwIWu4=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801866" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302019929?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGZ1k=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801869" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Uc0=", + "hasNextPage": true + } + }, + "oid": "346e0c547953d98eb84d23c1391a95badb9c4a22" + } + } + ] + }, + "changedFiles": 9, + "files": { + "nodes": [ + { + "path": "aten/src/ATen/native/GridSampler.cpp" + }, + { + "path": "aten/src/ATen/native/cpu/GridSamplerKernel.cpp" + }, + { + "path": "aten/src/ATen/native/cuda/GridSampler.cpp" + }, + { + "path": "aten/src/ATen/native/cuda/GridSampler.cu" + }, + { + "path": "aten/src/ATen/native/cuda/GridSampler.h" + }, + { + "path": "aten/src/ATen/native/native_functions.yaml" + }, + { + "path": "test/forward_backward_compatibility/check_forward_backward_compatibility.py" + }, + { + "path": "test/test_nn.py" + }, + { + "path": "tools/autograd/derivatives.yaml" + } + ], + "pageInfo": { + "endCursor": "OQ", + "hasNextPage": false + } + }, + "reviews": { + "nodes": [ + { + "author": { + "login": "albanD" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "coolteemf" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "albanD" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "coolteemf" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "albanD" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "coolteemf" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "coolteemf" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "albanD" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "coolteemf" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "albanD" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "albanD" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "coolteemf" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "albanD" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "coolteemf" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "albanD" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "albanD" + }, + "state": "APPROVED" + }, + { + "author": { + "login": "albanD" + }, + "state": "APPROVED" + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wMS0yNVQwODoyODoxMC0wODowMLkyMDIyLTAxLTI1VDA3OjU0OjA1LTA4OjAwzjNooqI=", + "hasPreviousPage": false + } + }, + "comments": { + "nodes": [ + { + "bodyText": "Merge failed due to 'NoneType' object is not subscriptable\nRaised by 
https://github.com/pytorch/pytorch/actions/runs/1887945630", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1048868910 + }, + { + "bodyText": "Thanks for the update! The windows failure is not your fault, you can ignore it!\n\nThank you very much for all of your feedback and sorry for the delay !", + "author": { + "login": "coolteemf" + }, + "authorAssociation": "CONTRIBUTOR", + "editor": null, + "databaseId": 1048983572 + }, + { + "bodyText": "@coolteemf can you please send either me or @albanD an email? (or I can send you and invite to collab on private repo)", + "author": { + "login": "malfet" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1049048119 + }, + { + "bodyText": "@pytorchbot merge this please", + "author": { + "login": "albanD" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1049131992 + }, + { + "bodyText": "Hey @coolteemf.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.", + "author": { + "login": "github-actions" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 1049134520 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOPoR4Lg==", + "hasPreviousPage": true + } + } + } + } + } + }, + "query_sha=f357fcdcc09e1433307da0c33debf9c08279ffd70d5bbc31323994f6346c629f name=pytorch number=75095 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "closed": true, + "isCrossRepository": false, + "author": { + "login": "mruberry" + }, + "title": "Initial prims, references, and test architecture for them", + "body": "This PR adds an initial set of experimental primitive operations and Python references that reimplement existing PyTorch operations using them. See https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-0/577 for additional context.\r\n\r\nThe following experimental primitives are added:\r\n\r\n- Elementwise unary prims -- abs, acos, acosh, asin, atan, cos, cosh, bessel_i0e, bessel_i1e, cbrt, ceil, digamma, erf, erf_inv, erfc, exp, expm1, floor, igamma, igammac, is_finite, lgamma, log, log1p, neg, reciprocal, round, sign, sinh, sqrt, square, tan. \r\n- Elementwise binary prims -- add, atan2, bitwise_and, bitwise_not, bitwise_or, bitwise_xor, div, eq, ge, gt, le, lt, max, min, mul, ne, nextafter, pow, rsqrt, shift_left, shift_right_arithmetic\r\n- View prims -- brodcast_in_dim, collapse_view, split_dim, squeeze\r\n- Shape prims -- collapse, concatenate, reshape\r\n- Conditional prims -- select\r\n- Data conversion & movement prims -- convert_element_type, device_put\r\n- Inplace prims -- copy_to, resize\r\n\r\nThese primitives do not add any new functionality to PyTorch, but are intended to be the semantic building blocks for reference operators. 
We have tried to make them consistent with the operations in [jax.lax](https://jax.readthedocs.io/en/latest/jax.lax.html) where possible (because PyTorch prefers being consistent with other frameworks), although there are key differences between these prims and operations in jax.lax. Most notably is that these prims model view semantics and inplace operations.\r\n\r\nIn addition to these primitives the following elementwise binary Python references are added:\r\n\r\n- Elementwise binary Python references -- add, atan2, bitwise_and, bitwise_left_shift, bitwise_or, bitwise_right_shift, bitwise_xor, eq, float_power, ge, gt, le, lt, maximum, minimum, mul, ne, nextafter, pow, sub, true_divide\r\n- Conditional Python references - where\r\n- Data conversion & movement references - copy_to\r\n\r\nA Python reference implements the same behavior as its corresponding PyTorch operator (excepting slight numerical differences, bug fixes, and in some cases additional features). \r\n\r\nThe start of an OpInfo-based test architecture for these references is also included in this PR. A new list, `python_ref_db`, is added to `common_methods_invocations.py`. This list introduces the new `ElementwiseBinaryPythonRefInfo`, which inherits input arguments from the original operators' OpInfo, allows them to be overridden, and then constructs the OpInfo for the Python reference using the (potentially modified) arguments. OpInfo-based tests can opt-into testing references by including this new list in the Sequence passed to the `@ops` decorator. \r\n\r\ncc @ngimel @csarofeen @kevinstephano @Lezcano ", + "headRefName": "prims_and_references", + "headRepository": { + "nameWithOwner": "pytorch/pytorch" + }, + "baseRefName": "master", + "baseRepository": { + "nameWithOwner": "pytorch/pytorch", + "isPrivate": false, + "defaultBranchRef": { + "name": "master" + } + }, + "mergeCommit": null, + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "a790467c650be92775103cde5e866c90b56f5376" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "bd6fcf50692e208ebecdc2eaa517a2bfcdcd35cf" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "4a119c8f21529fe1375e7e8789b91f41a3df80c5" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "ea6750dc34d66be759fdfe84b09fb0e23ee59c79" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "2eef8a55fe0227e1921b51bf1f56f9d0a29b49ac" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "b886ed6c20dd1785fd31ed6fa6a8c5b6d0d0b16c" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "9ad9b63d09aa4f7a8549bcf1d88ea4ff0674299c" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "63fdd580118477416ae160e0670ae722ea248090" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "0ccf7dc292af1d40d0a094eb2b2fb0c7ab4ccc70" + } 
+ }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "e8a8a4d1fbe35f20eb88e1a43cf5a653883638e5" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "186634dfdd25645c05b58a212f9e8d77c4125fc0" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "f5b4741312b5c42a79f6c8a1d3930b79db38ed8f" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ezyang" + }, + "email": "ezyang@fb.com", + "name": "Edward Z. Yang" + }, + "oid": "23d50391bb0fd12111fd3171591c4235ffb2fc1a" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ezyang" + }, + "email": "ezyang@fb.com", + "name": "Edward Z. Yang" + }, + "oid": "bac9d45422d58f513b60b4b854441cfdc253d4c5" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ezyang" + }, + "email": "ezyang@fb.com", + "name": "Edward Z. Yang" + }, + "oid": "13240ae0b4a0332c3167b65ac026a3172da90cb7" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ezyang" + }, + "email": "ezyang@fb.com", + "name": "Edward Z. Yang" + }, + "oid": "1ee34468cb1db3dc6cbae204669f4fec20e2a466" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ezyang" + }, + "email": "ezyang@fb.com", + "name": "Edward Z. Yang" + }, + "oid": "561d132bc686d00e8911f7feb3da5901b2bdc574" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ngimel" + }, + "email": "ngimel@fb.com", + "name": "Natalia Gimelshein" + }, + "oid": "ac42bedc84b7c96256376ad09917263bb020b2c3" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ngimel" + }, + "email": "ngimel@fb.com", + "name": "Natalia Gimelshein" + }, + "oid": "7f7d5ba40a0b5e10526d90b018b30b54673d12d8" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "37a6b4a8b1adb712d5777c7c3479866c27fb3c4e" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ngimel" + }, + "email": "ngimel@fb.com", + "name": "Natalia Gimelshein" + }, + "oid": "65b613868c44e519c1777af79b9fd3498c5a7e58" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ngimel" + }, + "email": "ngimel@fb.com", + "name": "Natalia Gimelshein" + }, + "oid": "442c405e9da0d66744ef03e379224c41eedf5b57" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "031ac49ae9c192989385986b6707fa781e3229e0" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "9a6c3b00039c0c985c1c9cb59490012d1c0b38ba" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "d5c30e408af1889b90012d2e09f6ec3cda333bcb" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "db355d55655bb252a699cd532441bb98e52b98d5" + } + } + ], + "pageInfo": { + "endCursor": "MjY", + "hasNextPage": false + }, + "totalCount": 26 + }, + "commits": { + "nodes": [ + { + "commit": { + "checkSuites": { + "nodes": [ + { + "app": { + "name": "Facebook GitHub Tools", + "databaseId": 12274 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "Facebook CLA Check", + "conclusion": "SUCCESS", + 
"detailsUrl": "https://code.intern.facebook.com/cla/" + }, + { + "name": "Meta Internal-Only Changes Check", + "conclusion": "SUCCESS", + "detailsUrl": "https://opensource.facebook.com/" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAW6ux14=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241454954" + }, + { + "app": { + "name": "Netlify", + "databaseId": 13473 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241454956" + }, + { + "app": { + "name": "Azure Pipelines", + "databaseId": 9426 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241454965" + }, + { + "app": { + "name": "Dependabot", + "databaseId": 29110 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241454970" + }, + { + "app": { + "name": "Codecov", + "databaseId": 254 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241454974" + }, + { + "app": { + "name": "PyTorch Bot", + "databaseId": 40112 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241454977" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "TorchBench CI (pytorch-linux-py3.7-cu102)" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "run-torchbench", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150879695?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAW6e-c8=", + "hasNextPage": false + } + }, + "conclusion": "SKIPPED", + "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241455322" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "quick-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150879696?check_suite_focus=true" + }, + { + "name": "lintrunner", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150879758?check_suite_focus=true" + }, + { + "name": "Test tools", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150879835?check_suite_focus=true" + }, + { + "name": "Test collect_env (with_torch)", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/runs/6150879901?check_suite_focus=true" + }, + { + "name": "Test collect_env (without_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150879942?check_suite_focus=true" + }, + { + "name": "toc", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150880005?check_suite_focus=true" + }, + { + "name": "workflow-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150880051?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAW6e-zM=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241455334" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pull" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "linux-vulkan-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150895177?check_suite_focus=true" + }, + { + "name": "linux-bionic-rocm5.0-py3.7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150895295?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150895365?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150895428?check_suite_focus=true" + }, + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150895554?check_suite_focus=true" + }, + { + "name": "win-vs2019-cuda11.3-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150895614?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150895698?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150895758?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150895866?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3-clang5-mobile-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150895923?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150895991?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150896053?check_suite_focus=true" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150896146?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / build", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/runs/6150896213?check_suite_focus=true" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150896256?check_suite_focus=true" + }, + { + "name": "win-vs2019-cpu-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150896288?check_suite_focus=true" + }, + { + "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150896313?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7-no-ops / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150896352?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150896403?check_suite_focus=true" + }, + { + "name": "linux-bionic-cuda11.3-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150896443?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150970691?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150970749?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150970796?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150970831?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150970876?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150970911?check_suite_focus=true" + }, + { + "name": "linux-docs / build-docs (cpp)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150970959?check_suite_focus=true" + }, + { + "name": "linux-docs / build-docs (python)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150971013?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150976613?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150976667?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150976694?check_suite_focus=true" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)", + "conclusion": 
"SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150977190?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150980317?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150980363?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150989669?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150989736?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6151003389?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6151003429?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6151003460?check_suite_focus=true" + }, + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6151007051?check_suite_focus=true" + }, + { + "name": "linux-bionic-rocm5.0-py3.7 / test (default, 1, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6151023043?check_suite_focus=true" + }, + { + "name": "linux-bionic-rocm5.0-py3.7 / test (default, 2, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6151023077?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6151040240?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6151041874?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6151041915?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6151041959?check_suite_focus=true" + }, + { + "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6151065166?check_suite_focus=true" + }, + { + "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6151065218?check_suite_focus=true" + }, + { + "name": "win-vs2019-cuda11.3-py3 / test (default, 1, 2, windows.8xlarge.nvidia.gpu)", + 
"conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6151165045?check_suite_focus=true" + }, + { + "name": "win-vs2019-cuda11.3-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6151165103?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAW6jVK8=", + "hasNextPage": true + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241455360" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXQFDQA=", + "hasNextPage": false + } + }, + "oid": "db355d55655bb252a699cd532441bb98e52b98d5" + } + } + ] + }, + "changedFiles": 5, + "files": { + "nodes": [ + { + "path": "test/test_ops.py" + }, + { + "path": "torch/_prims/__init__.py" + }, + { + "path": "torch/_prims/utils.py" + }, + { + "path": "torch/_refs/__init__.py" + }, + { + "path": "torch/testing/_internal/common_methods_invocations.py" + } + ], + "pageInfo": { + "endCursor": "NQ", + "hasNextPage": false + } + }, + "reviews": { + "nodes": [ + { + "author": { + "login": "Lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "Lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "Lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "Lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "Lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "Lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "Lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ngimel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ngimel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "Lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "zou3519" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "peterbell10" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "Lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "Lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ngimel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { 
+ "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ngimel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "Lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "Lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "Lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ngimel" 
+ }, + "state": "APPROVED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wNC0wNlQxMjo1NjoyNC0wNzowMLkyMDIyLTA0LTA2VDA4OjQwOjM4LTA3OjAwzjenO6Y=", + "hasPreviousPage": false + } + }, + "comments": { + "nodes": [ + { + "bodyText": "Ref implementations by themselves can handle any shapes (and broadcast ops by themselves don't bake in any shapes). The question is can we decide if a particular trace is applicable for a different input, but that depends on the tracing technology and what we are caching on, so out of scope for initial PR.", + "author": { + "login": "ngimel" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1105643418 + }, + { + "bodyText": "@pytorchbot merge this please", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1108072887 + }, + { + "bodyText": "Merge failed due to 'mruberry'\nRaised by https://github.com/pytorch/pytorch/actions/runs/2218044244", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1108073536 + }, + { + "bodyText": "@mruberry has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1108075965 + }, + { + "bodyText": "Hey @mruberry.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.", + "author": { + "login": "github-actions" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 1108351107 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOQebHmg==", + "hasPreviousPage": true + } + } + } + } + } + }, + "query_sha=f357fcdcc09e1433307da0c33debf9c08279ffd70d5bbc31323994f6346c629f name=pytorch number=68111 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "closed": true, + "isCrossRepository": true, + "author": { + "login": "chunyuan-w" + }, + "title": "Add JIT graph fuser for oneDNN Graph API (Preview4)", + "body": "## Description\r\nPreview4 PR of this [RFC](https://github.com/pytorch/pytorch/issues/49444).\r\n\r\nOn the basis of https://github.com/pytorch/pytorch/pull/50256, the below improvements are included:\r\n\r\n- The [preview4 release branch](https://github.com/oneapi-src/oneDNN/releases/tag/graph-v0.4.1) of the oneDNN Graph API is used\r\n- The fuser now works with the profiling graph executor. We have inserted type check nodes to guard the profiled tensor properties.\r\n\r\n### User API:\r\nThe optimization pass is disabled by default. 
Users could enable it by:\r\n```\r\ntorch.jit.enable_onednn_fusion(True)\r\n```\r\n\r\n### Performance:\r\n[pytorch/benchmark](https://github.com/pytorch/benchmark) tool is used to compare the performance:\r\n- SkyLake 8180 (1 socket of 28 cores):\r\n\r\n ![image](https://user-images.githubusercontent.com/65992142/151162305-05e44425-a24e-4d5e-94e1-743b40b87a8c.png)\r\n\r\n- SkyLake 8180 (single thread):\r\n\r\n ![image](https://user-images.githubusercontent.com/65992142/151162528-69f90b79-d08d-46b8-8775-d80a6ccbce8a.png)\r\n \\* By mapping hardswish to oneDNN Graph, it\u2019s 8% faster than PyTorch JIT (NNC + OFI)\r\n \\** We expect performance gain after mapping transpose, contiguous & view to oneDNN graph ops\r\n\r\n\r\n### Directory structure of the integration code\r\nFuser-related code are placed under:\r\n```\r\ntorch/csrc/jit/codegen/onednn/\r\n```\r\n\r\nOptimization pass registration is done in:\r\n```\r\ntorch/csrc/jit/passes/onednn_graph_fuser.h\r\n```\r\n\r\nCMake for the integration code is:\r\n```\r\ncaffe2/CMakeLists.txt\r\n```\r\n\r\n## Limitations\r\n\r\n- In this PR, we have only supported the optimization on Linux platform. The support on Windows and MacOS will be enabled as the next step.\r\n- We have only optimized the inference use case.", + "headRefName": "chunyuan/llga_preview2", + "headRepository": { + "nameWithOwner": "chunyuan-w/pytorch" + }, + "baseRefName": "master", + "baseRepository": { + "nameWithOwner": "pytorch/pytorch", + "isPrivate": false, + "defaultBranchRef": { + "name": "master" + } + }, + "mergeCommit": null, + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "0096fcc49f277fd8e006fcb42e0cb28a1422ec98" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "7bcc4de26a5472f1d252735dd425b46794b0844f" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "3a2a588bfe6bbf9bf74d88d441cd22affda207da" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "ca7df12fbfaa3ddbabeca39b76300d17f4a33f2f" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "81d44f35b8bc043c38837d0694e5bc072203b832" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "14fd5d1bfc2c58a71379f778871e3fca0a8e79b2" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "954dc23663125897f4b199eb2a8607dc5fca3274" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "9f77a0b476accc678b6f0569e4ff33fa6bbe97fc" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchitintel" + }, + "oid": "fbf3b23bc1288697e1aec539a7c4ee3dc0bcb84c" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": 
"f8b8e78f786586c3cdf3966fd83ffa124d3eda70" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "6fffa2f7453ee7e0f8d8e2f73ea8a65230539589" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "849385404e6f3cd1cf7cef19f931ecf4fa28afdb" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "adbae7b77f8c0dbc59fccf15207d97ba86cfade2" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "6dcf2a4981aff24fa16fc7461ae4ec29690f956f" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "54f3e05ad524cffd0911ee93be3c50f589b51f58" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "edbfc640ea79a0af85757d9e73796dcc90231519" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "67654db7cba562809d1b4a44cdda58af5cc9daaf" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "9c9d99b930b11af9ff03f52d45bf49c652df758d" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "ffb25119cd9ce815cc4d9d14a2317fcbbfa9ea86" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "ab9eee84512ca1bdfbc81e25c6eb67b29d0f302a" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "62a4642cf3330524990a69ac29e002c97812320a" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "ca9b1223be4af2c8b4929303d498eafd71793128" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "6f4a23d24514a02954d2ec792830085f612223c9" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchitintel" + }, + "oid": "b2a9a9c0926b02d0b2e87722ed61450f224a61d0" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "e88b492be733f24b6aa395829c76add67d0901e7" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "c44336d7a914952bfb78e012e08d9a6d6dde5937" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "5157930f7b3921d41a586260582b574c915f6ca1" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": 
"sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "04cb8353813f6bbd0d913a994923cc7e1e291406" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchitintel" + }, + "oid": "62991eaad0e638bb0bced327e03f932f66f68732" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchitintel" + }, + "oid": "7496bf1588050191595d833d23b8972b2f22655e" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchitintel" + }, + "oid": "d9d35f23cca0cd29c78a845731b24826152dcf1c" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "f74ec134f18a65a7c72455bdf44f72e3ebb27105" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "eb32cc65a975361160948bfc3d6a577991ea262e" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "c7665f8d695b680c54db0bad2b7b7df46d886b50" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "e6321ad8f59ea01130568c202d186448bb9cb9d0" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "a72cd0d02693f45e5354a70654581ad514581ec7" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "b3cd3028b4ed31805e82f7eaf02217ab74ca59b9" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "49a592d9788d08e6cd0593882f867e129057c1cc" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "0575766b2144b13f6a38227c4e2b8d22ec8db80f" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "b5c9b10ff87d622350e8ca64fae3a476eb70d5aa" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "66bc652a30ccc329adb929870a4ac726bb98b38c" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "72b9ca9c8e2dac98cbb7199b3dfac7c7305b80c5" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "a7892ed7373207d96406c8b5734a089643c5cdbd" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchitintel" + }, + "oid": "d54cb084e1daad8a08c3f8de0ad3f7afb5b05ac1" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchitintel" + }, + "oid": 
"aef71d692a8a159e0ca56be363e2cc1225ce7647" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "bf618e205ec31cff962dcc8ab478e0a699a9572d" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "e4a331f1088448f7d7d86256ce71e0e71da006b0" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "0b743523d1430fec759d5fefbb687f17c89335a5" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "e80a351a62d98b810ec8985c4b25257af1d6c5bb" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "c189eca154b6691919d0e21489d1c322c7435c0b" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchitintel" + }, + "oid": "e080a067c75d7b888a8a362682a2d5ba70e0c3a8" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchitintel" + }, + "oid": "028561fbf8f3ed90e074e6e0e3a4ca4dd7ffa2a8" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "d550cf14037badd4caa2f52202e2f20bc4db8432" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "574159ebadd1dec24daaf883879ffeca8d9e71b7" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "9eb3ee98ea756067ed1c8f52f309f6d3e211a904" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "29929f48be03dcdd1bbfade572de7feafa825547" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "8a7358ca8da547b40ea1a99ddc57ebed19959684" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "6606637d2c5525b43e294a8b366a85052e1be0c6" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "5ecfd1f28b87045deb8bc8ffe33b3d8b906f3264" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "be2d4345c65442c4cfbe8afdfb2ae0893945da42" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchitintel" + }, + "oid": "b5b89d3644a43e2dbda841cafb71b32edbe07c8a" + } + }, + { + "commit": { + "author": { + "user": { + "login": "malfet" + }, + "email": "nikita.shulga@gmail.com", + "name": "Nikita Shulga" + }, + "oid": "73881411e2bfb3aaa2e89926a82390b4c587ad75" + } + } + ], + "pageInfo": { + "endCursor": "NjI", + 
"hasNextPage": false + }, + "totalCount": 62 + }, + "commits": { + "nodes": [ + { + "commit": { + "checkSuites": { + "nodes": [ + { + "app": { + "name": "Facebook GitHub Tools", + "databaseId": 12274 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "Facebook CLA Check", + "conclusion": "SUCCESS", + "detailsUrl": "https://code.intern.facebook.com/cla/" + }, + { + "name": "Meta Internal-Only Changes Check", + "conclusion": "SUCCESS", + "detailsUrl": "https://opensource.facebook.com/" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAU_NXnc=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/73881411e2bfb3aaa2e89926a82390b4c587ad75/checks?check_suite_id=5743625010" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "clang-format", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633826958?check_suite_focus=true" + }, + { + "name": "py2-setup-validate-errormsg", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633827084?check_suite_focus=true" + }, + { + "name": "quick-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633827160?check_suite_focus=true" + }, + { + "name": "shellcheck", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633827410?check_suite_focus=true" + }, + { + "name": "toc", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633827566?check_suite_focus=true" + }, + { + "name": "clang-tidy", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633827701?check_suite_focus=true" + }, + { + "name": "cmakelint", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633827899?check_suite_focus=true" + }, + { + "name": "flake8-py3", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633828081?check_suite_focus=true" + }, + { + "name": "Test collect_env (with_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633828249?check_suite_focus=true" + }, + { + "name": "Test collect_env (without_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633828312?check_suite_focus=true" + }, + { + "name": "Test tools", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633828407?check_suite_focus=true" + }, + { + "name": "mypy", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633828524?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAU_NZqw=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/73881411e2bfb3aaa2e89926a82390b4c587ad75/checks?check_suite_id=5743625458" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "TorchBench CI (pytorch-linux-py3.7-cu102)" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "run-torchbench", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633826956?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAU_NYIw=", + "hasNextPage": false + } + }, + "conclusion": "SKIPPED", + "url": 
"https://github.com/pytorch/pytorch/commit/73881411e2bfb3aaa2e89926a82390b4c587ad75/checks?check_suite_id=5743625463" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pull" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633827223?check_suite_focus=true" + }, + { + "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633827451?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633827729?check_suite_focus=true" + }, + { + "name": "linux-bionic-rocm4.5-py3.7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633827956?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633828089?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633828258?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633828406?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633828523?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633828594?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3-clang5-mobile-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633828765?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7-no-ops / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633828992?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633829085?check_suite_focus=true" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633829195?check_suite_focus=true" + }, + { + "name": "win-vs2019-cpu-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633829321?check_suite_focus=true" + }, + { + "name": "win-vs2019-cuda11.3-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633829420?check_suite_focus=true" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633829488?check_suite_focus=true" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633829666?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build", + 
"conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633829746?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633829845?check_suite_focus=true" + }, + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633829904?check_suite_focus=true" + }, + { + "name": "linux-docs / build-docs (cpp)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634453168?check_suite_focus=true" + }, + { + "name": "linux-docs / build-docs (python)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634453232?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634453388?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634453444?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634453499?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634453573?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634453624?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634453683?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634462211?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634462270?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634602176?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634602239?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (noarch, 1, 1, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634602319?check_suite_focus=true" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634602425?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 3, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634622529?check_suite_focus=true" + }, + { + "name": 
"linux-xenial-py3.7-clang7-asan / test (default, 2, 3, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634622639?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 3, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634622730?check_suite_focus=true" + }, + { + "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634637718?check_suite_focus=true" + }, + { + "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634637817?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634775159?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634775273?check_suite_focus=true" + }, + { + "name": "win-vs2019-cuda11.3-py3 / test (default, 1, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634823038?check_suite_focus=true" + }, + { + "name": "win-vs2019-cuda11.3-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634823099?check_suite_focus=true" + }, + { + "name": "win-vs2019-cuda11.3-py3 / test (force_on_cpu, 1, 1, windows.4xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634823171?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634920855?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634921428?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634921484?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634921543?check_suite_focus=true" + }, + { + "name": "linux-bionic-rocm4.5-py3.7 / test (default, 1, 2, linux.rocm.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634995986?check_suite_focus=true" + }, + { + "name": "linux-bionic-rocm4.5-py3.7 / test (default, 2, 2, linux.rocm.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634996056?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAU_fN1g=", + "hasNextPage": false + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/73881411e2bfb3aaa2e89926a82390b4c587ad75/checks?check_suite_id=5743625483" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVZYxQs=", + "hasNextPage": false + } + }, + "oid": 
"73881411e2bfb3aaa2e89926a82390b4c587ad75" + } + } + ] + }, + "changedFiles": 37, + "files": { + "nodes": [ + { + "path": "aten/src/ATen/core/interned_strings.h" + }, + { + "path": "caffe2/CMakeLists.txt" + }, + { + "path": "cmake/Dependencies.cmake" + }, + { + "path": "cmake/Modules/FindMKLDNN.cmake" + }, + { + "path": "cmake/public/mkldnn.cmake" + }, + { + "path": "docs/source/jit.rst" + }, + { + "path": "test/test_jit_llga_fuser.py" + }, + { + "path": "torch/_C/__init__.pyi.in" + }, + { + "path": "torch/csrc/jit/codegen/onednn/LlgaTensorImpl.cpp" + }, + { + "path": "torch/csrc/jit/codegen/onednn/LlgaTensorImpl.h" + }, + { + "path": "torch/csrc/jit/codegen/onednn/README.md" + }, + { + "path": "torch/csrc/jit/codegen/onednn/defer_size_check.cpp" + }, + { + "path": "torch/csrc/jit/codegen/onednn/defer_size_check.h" + }, + { + "path": "torch/csrc/jit/codegen/onednn/graph_fuser.cpp" + }, + { + "path": "torch/csrc/jit/codegen/onednn/graph_fuser.h" + }, + { + "path": "torch/csrc/jit/codegen/onednn/graph_helper.cpp" + }, + { + "path": "torch/csrc/jit/codegen/onednn/graph_helper.h" + }, + { + "path": "torch/csrc/jit/codegen/onednn/graph_rewriter.cpp" + }, + { + "path": "torch/csrc/jit/codegen/onednn/guard_shape.cpp" + }, + { + "path": "torch/csrc/jit/codegen/onednn/guard_shape.h" + }, + { + "path": "torch/csrc/jit/codegen/onednn/interface.cpp" + }, + { + "path": "torch/csrc/jit/codegen/onednn/interface.h" + }, + { + "path": "torch/csrc/jit/codegen/onednn/kernel.cpp" + }, + { + "path": "torch/csrc/jit/codegen/onednn/kernel.h" + }, + { + "path": "torch/csrc/jit/codegen/onednn/layout_propagation.cpp" + }, + { + "path": "torch/csrc/jit/codegen/onednn/layout_propagation.h" + }, + { + "path": "torch/csrc/jit/codegen/onednn/operator.h" + }, + { + "path": "torch/csrc/jit/codegen/onednn/prepare_binary.cpp" + }, + { + "path": "torch/csrc/jit/codegen/onednn/prepare_binary.h" + }, + { + "path": "torch/csrc/jit/codegen/onednn/register_interface.cpp" + }, + { + "path": "torch/csrc/jit/ir/alias_analysis.cpp" + }, + { + "path": "torch/csrc/jit/ir/ir.cpp" + }, + { + "path": "torch/csrc/jit/passes/inline_autodiff_subgraphs.cpp" + }, + { + "path": "torch/csrc/jit/passes/onednn_graph_fuser.h" + }, + { + "path": "torch/csrc/jit/python/init.cpp" + }, + { + "path": "torch/csrc/jit/runtime/operator.cpp" + }, + { + "path": "torch/jit/__init__.py" + } + ], + "pageInfo": { + "endCursor": "Mzc", + "hasNextPage": false + } + }, + "reviews": { + "nodes": [ + { + "author": { + "login": "pinzhenx" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "pinzhenx" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "pinzhenx" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "chunyuan-w" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "eellison" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": 
"sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "wukong1992" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "eellison" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "eellison" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "eellison" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "eellison" + }, + "state": "APPROVED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "eellison" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "malfet" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "malfet" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "malfet" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpO5MjAyMS0xMi0xMFQwOToyNDoxOS0wODowMLkyMDIxLTEyLTEwVDA5OjI0OjE5LTA4OjAwzjFryLE=", + "hasPreviousPage": false + } + }, + "comments": { + "nodes": [ + { + "bodyText": "Looks like this broke master https://hud.pytorch.org/pytorch/pytorch/commit/7dd08230117f4fa8bb82b3524e90fb00340198c7. I am reverting.", + "author": { + "login": "suo" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1074498483 + }, + { + "bodyText": "@pytorchbot revert this", + "author": { + "login": "suo" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1074498550 + }, + { + "bodyText": "Looks like this broke master https://hud.pytorch.org/pytorch/pytorch/commit/7dd08230117f4fa8bb82b3524e90fb00340198c7. I am reverting.\n\nOops! 
Will fix it ASAP.", + "author": { + "login": "sanchitintel" + }, + "authorAssociation": "CONTRIBUTOR", + "editor": null, + "databaseId": 1074499668 + }, + { + "bodyText": "This pull request has been reverted by e5bf879. To re-land this change, please open another pull request, assignthe same reviewers, fix the CI failures that caused the revert and make sure that the failing CI runs on the PR by applying the proper ciflow label (e.g., ciflow/trunk).", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1074508608 + }, + { + "bodyText": "This pull request has been reverted by e5bf879. To re-land this change, please open another pull request, assignthe same reviewers, fix the CI failures that caused the revert and make sure that the failing CI runs on the PR by applying the proper ciflow label (e.g., ciflow/trunk).", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1082508130 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOQAuLsw==", + "hasPreviousPage": true + } + } + } + } + } + }, + "query_sha=62ce809793481ce6ddce6e1a19d9b0761755ff0ff75decaf8a79419eaf793110 cursor=Y3Vyc29yOnYyOpHOQAuLsw== name=pytorch number=68111 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "comments": { + "nodes": [ + { + "bodyText": "CI Flow Status\n\u269b\ufe0f CI Flow\nRuleset - Version: v1\nRuleset - File: https://github.com/chunyuan-w/pytorch/blob/7496bf1588050191595d833d23b8972b2f22655e/.github/generated-ciflow-ruleset.json\nPR ciflow labels: ciflow/default\n\n\n\nWorkflows\nLabels (bold enabled)\nStatus\n\n\n\n\nTriggered Workflows\n\n\n\n\nlinux-bionic-py3.7-clang9\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/noarch, ciflow/trunk\n\u2705 triggered\n\n\nlinux-docs\nciflow/all, ciflow/cpu, ciflow/default, ciflow/docs, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-vulkan-bionic-py3.7-clang9\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk, ciflow/vulkan\n\u2705 triggered\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7-bazel-test\nciflow/all, ciflow/bazel, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3-clang5-mobile-build\nciflow/all, ciflow/default, ciflow/linux, ciflow/mobile, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3-clang5-mobile-custom-build-static\nciflow/all, ciflow/default, ciflow/linux, ciflow/mobile, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-clang7-asan\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/sanitizers, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-clang7-onnx\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/onnx, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc7\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc7-no-ops\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single\nciflow/all, ciflow/android, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 
triggered\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit\nciflow/all, ciflow/android, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nwin-vs2019-cpu-py3\nciflow/all, ciflow/cpu, ciflow/default, ciflow/trunk, ciflow/win\n\u2705 triggered\n\n\nwin-vs2019-cuda11.3-py3\nciflow/all, ciflow/cuda, ciflow/default, ciflow/trunk, ciflow/win\n\u2705 triggered\n\n\nSkipped Workflows\n\n\n\n\ncaffe2-linux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\ndocker-builds\nciflow/all, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-coreml\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-custom-ops\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-full-jit\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-metal\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-x86-64\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-x86-64-coreml\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-x86-64-full-jit\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlibtorch-linux-xenial-cuda10.2-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlibtorch-linux-xenial-cuda11.3-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlinux-binary-conda\nciflow/binaries, ciflow/binaries/conda\n\ud83d\udeab skipped\n\n\nlinux-binary-libtorch-cxx11-abi\nciflow/binaries, ciflow/binaries/libtorch\n\ud83d\udeab skipped\n\n\nlinux-binary-libtorch-pre-cxx11\nciflow/binaries, ciflow/binaries/libtorch\n\ud83d\udeab skipped\n\n\nlinux-binary-manywheel\nciflow/binaries, ciflow/binaries/wheel\n\ud83d\udeab skipped\n\n\nlinux-bionic-cuda10.2-py3.9-gcc7\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/slow, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlinux-docs-push\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7-no-ops\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-10-15-py3-arm64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-10-15-py3-lite-interpreter-x86-64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-11-py3-x86-64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nparallelnative-linux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nperiodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-linux-bionic-cuda11.5-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled, ciflow/slow, ciflow/slow-gradcheck\n\ud83d\udeab skipped\n\n\nperiodic-linux-xenial-cuda11.1-py3.7-gcc7-debug\nciflow/all, ciflow/cuda, ciflow/linux, 
ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-win-vs2019-cuda11.1-py3\nciflow/all, ciflow/cuda, ciflow/scheduled, ciflow/win\n\ud83d\udeab skipped\n\n\nperiodic-win-vs2019-cuda11.5-py3\nciflow/all, ciflow/cuda, ciflow/scheduled, ciflow/win\n\ud83d\udeab skipped\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-build\nciflow/all, ciflow/android, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\n\n\nYou can add a comment to the PR and tag @pytorchbot with the following commands:\n\n# ciflow rerun, \"ciflow/default\" will always be added automatically\n@pytorchbot ciflow rerun\n\n# ciflow rerun with additional labels \"-l \", which is equivalent to adding these labels manually and trigger the rerun\n@pytorchbot ciflow rerun -l ciflow/scheduled -l ciflow/slow\n\n\nFor more information, please take a look at the CI Flow Wiki.", + "author": { + "login": "pytorch-probot" + }, + "authorAssociation": "NONE", + "editor": { + "login": "pytorch-probot" + }, + "databaseId": 964902865 + }, + { + "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/68111\nNeed help or want to give feedback on the CI? Visit our office hours\n\n\ud83d\udc8a CI failures summary and remediations\nAs of commit 7388141 (more details on the Dr. CI page):\n\n\n29/29 failures introduced in this PR\n\n\n\ud83d\udd75\ufe0f 29 new failures recognized by patterns\nThe following CI failures do not appear to be due to upstream breakages:\n pull / linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge) (1/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:31:38.6978776Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:31:38.3001628Z + python3 -m pip install boto3==1.19.12\n2022-03-21T21:31:38.5169168Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T21:31:38.5362923Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T21:31:38.5413452Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T21:31:38.5458747Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T21:31:38.5484014Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T21:31:38.5497924Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:31:38.5656491Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T21:31:38.5678893Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:31:38.6888479Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0f6488c20adb4dca4\n2022-03-21T21:31:38.6978776Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:31:38.6992648Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:31:38.7003010Z 
##[error]Process completed with exit code 2.\n2022-03-21T21:31:38.7044027Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:31:38.7044261Z with:\n2022-03-21T21:31:38.7044413Z env:\n2022-03-21T21:31:38.7044565Z IN_CI: 1\n2022-03-21T21:31:38.7044709Z IS_GHA: 1\n2022-03-21T21:31:38.7044885Z GIT_DEFAULT_BRANCH: master\n2022-03-21T21:31:38.7045067Z ##[endgroup]\n2022-03-21T21:31:38.7060958Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge) (2/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:35:19.2635222Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:35:18.9028722Z + python3 -m pip install boto3==1.19.12\n2022-03-21T21:35:19.1132721Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T21:35:19.1310590Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T21:35:19.1360251Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T21:35:19.1386865Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T21:35:19.1429182Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T21:35:19.1441925Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T21:35:19.1468280Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:35:19.1617667Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:35:19.2545368Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-098be2985e0392130\n2022-03-21T21:35:19.2635222Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:35:19.2648463Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:35:19.2658727Z ##[error]Process completed with exit code 2.\n2022-03-21T21:35:19.2706355Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:35:19.2706591Z with:\n2022-03-21T21:35:19.2706748Z env:\n2022-03-21T21:35:19.2706908Z IN_CI: 1\n2022-03-21T21:35:19.2707061Z IS_GHA: 1\n2022-03-21T21:35:19.2707246Z GIT_DEFAULT_BRANCH: master\n2022-03-21T21:35:19.2707438Z ##[endgroup]\n2022-03-21T21:35:19.2724554Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / win-vs2019-cuda11.3-py3 / test (force_on_cpu, 1, 1, windows.4xlarge) (3/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T23:11:57.5531419Z C:\\actions-runner\\...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T23:11:52.7662022Z Downloading botocore-1.22.12-py3-none-any.whl (8.1 MB)\n2022-03-21T23:11:53.1213298Z ---------------------------------------- 8.1/8.1 MB 23.6 MB/s eta 0:00:00\n2022-03-21T23:11:53.1644665Z Requirement already 
satisfied: urllib3<1.27,>=1.25.4 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T23:11:53.2218699Z Collecting python-dateutil<3.0.0,>=2.1\n2022-03-21T23:11:53.2389674Z Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)\n2022-03-21T23:11:53.2787295Z -------------------------------------- 247.7/247.7 KB 7.4 MB/s eta 0:00:00\n2022-03-21T23:11:53.3761842Z Requirement already satisfied: six>=1.5 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T23:11:53.5457622Z Installing collected packages: python-dateutil, jmespath, botocore, s3transfer, boto3\n2022-03-21T23:11:57.4175080Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2\n2022-03-21T23:11:57.5296815Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0105d4db093574f40\n2022-03-21T23:11:57.5531419Z C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\\python3.exe: can't open file 'C:\\\\actions-runner\\\\_work\\\\pytorch\\\\pytorch\\\\.github\\\\scripts\\\\get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T23:11:57.5564814Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T23:11:57.5587712Z ##[error]Process completed with exit code 2.\n2022-03-21T23:11:57.5790311Z ##[group]Run pytorch/pytorch/.github/actions/teardown-win@master\n2022-03-21T23:11:57.5790832Z with:\n2022-03-21T23:11:57.5791104Z env:\n2022-03-21T23:11:57.5791358Z IN_CI: 1\n2022-03-21T23:11:57.5791620Z IS_GHA: 1\n2022-03-21T23:11:57.5791939Z GIT_DEFAULT_BRANCH: master\n2022-03-21T23:11:57.5792425Z pythonLocation: C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\n2022-03-21T23:11:57.5792884Z ##[endgroup]\n\n\n pull / linux-bionic-rocm4.5-py3.7 / test (default, 1, 2, linux.rocm.gpu) (4/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-22T02:17:12.6257577Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-22T02:17:11.9280556Z Using cached https://files.pythonhosted.org/packages/7b/9c/f51775ebe7df5a7aa4e7c79ed671bde94e154bd968aca8d65bb24aba0c8c/s3transfer-0.5.2-py3-none-any.whl\n2022-03-22T02:17:11.9335199Z Collecting urllib3<1.27,>=1.25.4 (from botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T02:17:11.9682045Z Using cached https://files.pythonhosted.org/packages/ec/03/062e6444ce4baf1eac17a6a0ebfe36bb1ad05e1df0e20b110de59c278498/urllib3-1.26.9-py2.py3-none-any.whl\n2022-03-22T02:17:11.9850357Z Collecting python-dateutil<3.0.0,>=2.1 (from botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T02:17:12.0403171Z Using cached https://files.pythonhosted.org/packages/36/7a/87837f39d0296e723bb9b62bbb257d0355c7f6128853c78955f57342a56d/python_dateutil-2.8.2-py2.py3-none-any.whl\n2022-03-22T02:17:12.0468875Z Collecting six>=1.5 (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T02:17:12.0590000Z Using cached https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl\n2022-03-22T02:17:12.0607093Z Installing collected packages: jmespath, urllib3, six, python-dateutil, botocore, s3transfer, boto3\n2022-03-22T02:17:12.5273459Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2 six-1.16.0 
urllib3-1.26.9\n2022-03-22T02:17:12.6032812Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 worker-rocm-amd-114\n2022-03-22T02:17:12.6257577Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-22T02:17:12.6259543Z + GHA_WORKFLOW_JOB_ID=\n2022-03-22T02:17:12.6291924Z ##[error]Process completed with exit code 2.\n2022-03-22T02:17:12.6387977Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-22T02:17:12.6388298Z with:\n2022-03-22T02:17:12.6388521Z wait-ssh: false\n2022-03-22T02:17:12.6388727Z env:\n2022-03-22T02:17:12.6388932Z IN_CI: 1\n2022-03-22T02:17:12.6389143Z IS_GHA: 1\n2022-03-22T02:17:12.6389368Z GIT_DEFAULT_BRANCH: master\n2022-03-22T02:17:12.6389669Z DOCKER_HOST: unix:///run/user/1121/docker.sock\n\n\n pull / linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge) (5/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:19:24.4890693Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:19:24.0962005Z + python3 -m pip install boto3==1.19.12\n2022-03-21T22:19:24.3152253Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T22:19:24.3341183Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T22:19:24.3391374Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T22:19:24.3436392Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T22:19:24.3448982Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T22:19:24.3474092Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T22:19:24.3502003Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:19:24.3655072Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:19:24.4799309Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0bc9250521f338cae\n2022-03-21T22:19:24.4890693Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:19:24.4903625Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:19:24.4913841Z ##[error]Process completed with exit code 2.\n2022-03-21T22:19:24.4957338Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T22:19:24.4957575Z with:\n2022-03-21T22:19:24.4957735Z env:\n2022-03-21T22:19:24.4957900Z IN_CI: 1\n2022-03-21T22:19:24.4958055Z IS_GHA: 1\n2022-03-21T22:19:24.4958246Z GIT_DEFAULT_BRANCH: master\n2022-03-21T22:19:24.4958437Z ##[endgroup]\n2022-03-21T22:19:24.4989649Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-bionic-rocm4.5-py3.7 / test (default, 2, 2, linux.rocm.gpu) (6/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 
rerun)\n\n\n2022-03-22T01:05:07.6983899Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-22T01:05:06.8364546Z Using cached https://files.pythonhosted.org/packages/7b/9c/f51775ebe7df5a7aa4e7c79ed671bde94e154bd968aca8d65bb24aba0c8c/s3transfer-0.5.2-py3-none-any.whl\n2022-03-22T01:05:06.8431763Z Collecting urllib3<1.27,>=1.25.4 (from botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T01:05:06.8949391Z Using cached https://files.pythonhosted.org/packages/ec/03/062e6444ce4baf1eac17a6a0ebfe36bb1ad05e1df0e20b110de59c278498/urllib3-1.26.9-py2.py3-none-any.whl\n2022-03-22T01:05:06.9180079Z Collecting python-dateutil<3.0.0,>=2.1 (from botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T01:05:06.9803351Z Using cached https://files.pythonhosted.org/packages/36/7a/87837f39d0296e723bb9b62bbb257d0355c7f6128853c78955f57342a56d/python_dateutil-2.8.2-py2.py3-none-any.whl\n2022-03-22T01:05:06.9882133Z Collecting six>=1.5 (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T01:05:07.0067062Z Using cached https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl\n2022-03-22T01:05:07.0088676Z Installing collected packages: urllib3, jmespath, six, python-dateutil, botocore, s3transfer, boto3\n2022-03-22T01:05:07.5819667Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2 six-1.16.0 urllib3-1.26.9\n2022-03-22T01:05:07.6774717Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 worker-rocm-amd-60\n2022-03-22T01:05:07.6983899Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-22T01:05:07.6988652Z + GHA_WORKFLOW_JOB_ID=\n2022-03-22T01:05:07.7023073Z ##[error]Process completed with exit code 2.\n2022-03-22T01:05:07.7102087Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-22T01:05:07.7102389Z with:\n2022-03-22T01:05:07.7102603Z wait-ssh: false\n2022-03-22T01:05:07.7102820Z env:\n2022-03-22T01:05:07.7103015Z IN_CI: 1\n2022-03-22T01:05:07.7103224Z IS_GHA: 1\n2022-03-22T01:05:07.7103458Z GIT_DEFAULT_BRANCH: master\n2022-03-22T01:05:07.7103737Z DOCKER_HOST: unix:///run/user/1502/docker.sock\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge) (7/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T20:51:39.3637996Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T20:51:39.2041249Z Attempting uninstall: s3transfer\n2022-03-21T20:51:39.2043010Z Found existing installation: s3transfer 0.3.7\n2022-03-21T20:51:39.2083799Z Uninstalling s3transfer-0.3.7:\n2022-03-21T20:51:39.2089675Z Successfully uninstalled s3transfer-0.3.7\n2022-03-21T20:51:39.2480546Z Attempting uninstall: boto3\n2022-03-21T20:51:39.2482953Z Found existing installation: boto3 1.16.34\n2022-03-21T20:51:39.2584292Z Uninstalling boto3-1.16.34:\n2022-03-21T20:51:39.2599474Z Successfully uninstalled boto3-1.16.34\n2022-03-21T20:51:39.3130921Z Successfully installed boto3-1.19.12 botocore-1.22.12 s3transfer-0.5.2\n2022-03-21T20:51:39.3550598Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-03ef7efc3078e3da5\n2022-03-21T20:51:39.3637996Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T20:51:39.3650651Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T20:51:39.3660484Z 
##[error]Process completed with exit code 2.\n2022-03-21T20:51:39.3696465Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T20:51:39.3696693Z with:\n2022-03-21T20:51:39.3696850Z env:\n2022-03-21T20:51:39.3697012Z IN_CI: 1\n2022-03-21T20:51:39.3697161Z IS_GHA: 1\n2022-03-21T20:51:39.3697342Z GIT_DEFAULT_BRANCH: master\n2022-03-21T20:51:39.3697528Z ##[endgroup]\n2022-03-21T20:51:39.3730420Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge) (8/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:03:36.3916860Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:03:36.0096309Z + python3 -m pip install boto3==1.19.12\n2022-03-21T21:03:36.2278560Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T21:03:36.2461618Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T21:03:36.2513260Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T21:03:36.2541524Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T21:03:36.2554899Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T21:03:36.2598277Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:03:36.2758299Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T21:03:36.2780690Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:03:36.3825021Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0a4a552890e6ef7d3\n2022-03-21T21:03:36.3916860Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:03:36.3930343Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:03:36.3941263Z ##[error]Process completed with exit code 2.\n2022-03-21T21:03:36.3979258Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:03:36.3979496Z with:\n2022-03-21T21:03:36.3979654Z env:\n2022-03-21T21:03:36.3979814Z IN_CI: 1\n2022-03-21T21:03:36.3979968Z IS_GHA: 1\n2022-03-21T21:03:36.3980157Z GIT_DEFAULT_BRANCH: master\n2022-03-21T21:03:36.3980360Z ##[endgroup]\n2022-03-21T21:03:36.3996257Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / win-vs2019-cuda11.3-py3 / test (default, 1, 2, windows.8xlarge.nvidia.gpu) (9/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-22T00:41:15.5325784Z C:\\actions-runner\\...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-22T00:41:10.3015614Z Downloading s3transfer-0.5.2-py3-none-any.whl (79 kB)\n2022-03-22T00:41:10.3625659Z ---------------------------------------- 79.5/79.5 KB 1.1 MB/s eta 0:00:00\n2022-03-22T00:41:10.4120236Z 
Collecting python-dateutil<3.0.0,>=2.1\n2022-03-22T00:41:10.4170155Z Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)\n2022-03-22T00:41:10.4722115Z -------------------------------------- 247.7/247.7 KB 5.2 MB/s eta 0:00:00\n2022-03-22T00:41:10.4843512Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-22T00:41:10.6596108Z Requirement already satisfied: six>=1.5 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-22T00:41:10.8733354Z Installing collected packages: python-dateutil, jmespath, botocore, s3transfer, boto3\n2022-03-22T00:41:15.3745408Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2\n2022-03-22T00:41:15.4987162Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-09cacc848abc3dd32\n2022-03-22T00:41:15.5325784Z C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\\python3.exe: can't open file 'C:\\\\actions-runner\\\\_work\\\\pytorch\\\\pytorch\\\\.github\\\\scripts\\\\get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-22T00:41:15.5373630Z + GHA_WORKFLOW_JOB_ID=\n2022-03-22T00:41:15.5404353Z ##[error]Process completed with exit code 2.\n2022-03-22T00:41:15.5790508Z ##[group]Run pytorch/pytorch/.github/actions/teardown-win@master\n2022-03-22T00:41:15.5791192Z with:\n2022-03-22T00:41:15.5791530Z env:\n2022-03-22T00:41:15.5791849Z IN_CI: 1\n2022-03-22T00:41:15.5792186Z IS_GHA: 1\n2022-03-22T00:41:15.5792599Z GIT_DEFAULT_BRANCH: master\n2022-03-22T00:41:15.5793237Z pythonLocation: C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\n2022-03-22T00:41:15.5793831Z ##[endgroup]\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge) (10/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T20:50:32.9799307Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T20:50:32.8167560Z Attempting uninstall: s3transfer\n2022-03-21T20:50:32.8169351Z Found existing installation: s3transfer 0.3.7\n2022-03-21T20:50:32.8213295Z Uninstalling s3transfer-0.3.7:\n2022-03-21T20:50:32.8219209Z Successfully uninstalled s3transfer-0.3.7\n2022-03-21T20:50:32.8602320Z Attempting uninstall: boto3\n2022-03-21T20:50:32.8603289Z Found existing installation: boto3 1.16.34\n2022-03-21T20:50:32.8704535Z Uninstalling boto3-1.16.34:\n2022-03-21T20:50:32.8719403Z Successfully uninstalled boto3-1.16.34\n2022-03-21T20:50:32.9244278Z Successfully installed boto3-1.19.12 botocore-1.22.12 s3transfer-0.5.2\n2022-03-21T20:50:32.9710449Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0c568461a276d4a71\n2022-03-21T20:50:32.9799307Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T20:50:32.9812238Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T20:50:32.9823052Z ##[error]Process completed with exit code 2.\n2022-03-21T20:50:32.9859290Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T20:50:32.9859527Z with:\n2022-03-21T20:50:32.9859664Z env:\n2022-03-21T20:50:32.9859817Z IN_CI: 1\n2022-03-21T20:50:32.9859977Z IS_GHA: 1\n2022-03-21T20:50:32.9860144Z GIT_DEFAULT_BRANCH: master\n2022-03-21T20:50:32.9860327Z ##[endgroup]\n2022-03-21T20:50:32.9893642Z 
##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-clang7-asan / test (default, 1, 3, linux.2xlarge) (11/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:05:00.7163042Z SUMMARY: Undefined.../jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in\n\n2022-03-21T21:05:00.6660824Z #10 0x55fc8a3ea801 in run_mod /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:1037\n2022-03-21T21:05:00.6661768Z #11 0x55fc8a3f57a9 in PyRun_StringFlags /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:961\n2022-03-21T21:05:00.6662455Z #12 0x55fc8a3f580b in PyRun_SimpleStringFlags /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:455\n2022-03-21T21:05:00.6663570Z #13 0x55fc8a3f5908 in pymain_run_command /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:420\n2022-03-21T21:05:00.6663952Z #14 0x55fc8a3f5908 in pymain_run_python /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:2907\n2022-03-21T21:05:00.6664431Z #15 0x55fc8a3f5908 in pymain_main /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3460\n2022-03-21T21:05:00.6665304Z #16 0x55fc8a3f5ccb in _Py_UnixMain /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3495\n2022-03-21T21:05:00.7162113Z #17 0x7f940d00f83f in __libc_start_main /build/glibc-S7Ft5T/glibc-2.23/csu/../csu/libc-start.c:291\n2022-03-21T21:05:00.7162534Z #18 0x55fc8a39a554 in _start (/opt/conda/bin/python3.7+0x1d7554)\n2022-03-21T21:05:00.7162711Z \n2022-03-21T21:05:00.7163042Z SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior /var/lib/jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in \n2022-03-21T21:05:00.7334595Z + retcode=1\n2022-03-21T21:05:00.7334954Z + set -e\n2022-03-21T21:05:00.7335215Z + return 1\n2022-03-21T21:05:00.7338688Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX-* ]]\n2022-03-21T21:05:00.7339232Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X ]]\n2022-03-21T21:05:00.7340113Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX2-* ]]\n2022-03-21T21:05:00.7340612Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\2 ]]\n2022-03-21T21:05:00.7341187Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX512-* ]]\n2022-03-21T21:05:00.7341668Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\5\\1\\2 ]]\n2022-03-21T21:05:00.7344466Z + [[ linux-xenial-py3.7-clang7-asan-default == *tbb* ]]\n\n\n pull / linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge) (12/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:06:03.4437430Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:06:03.0752199Z + python3 -m pip install boto3==1.19.12\n2022-03-21T22:06:03.2853252Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T22:06:03.3032326Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T22:06:03.3081589Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T22:06:03.3093911Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T22:06:03.3120244Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from 
boto3==1.19.12) (1.22.12)\n2022-03-21T22:06:03.3162406Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T22:06:03.3188431Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:06:03.3337181Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:06:03.4348072Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0ee48c8811fafc444\n2022-03-21T22:06:03.4437430Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:06:03.4450920Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:06:03.4461263Z ##[error]Process completed with exit code 2.\n2022-03-21T22:06:03.4502346Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T22:06:03.4502576Z with:\n2022-03-21T22:06:03.4502730Z env:\n2022-03-21T22:06:03.4502888Z IN_CI: 1\n2022-03-21T22:06:03.4503038Z IS_GHA: 1\n2022-03-21T22:06:03.4503302Z GIT_DEFAULT_BRANCH: master\n2022-03-21T22:06:03.4503492Z ##[endgroup]\n2022-03-21T22:06:03.4519156Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge) (13/29)\nStep: \"Test\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T20:50:13.2205634Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T20:50:12.8679322Z + python3 -m pip install boto3==1.19.12\n2022-03-21T20:50:13.0744228Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T20:50:13.0916284Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T20:50:13.0964264Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T20:50:13.1005656Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T20:50:13.1017299Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T20:50:13.1041042Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T20:50:13.1189450Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T20:50:13.1208751Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T20:50:13.2119445Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0d02da60fd18c22f5\n2022-03-21T20:50:13.2205634Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T20:50:13.2217939Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T20:50:13.2220259Z ##[error]Process completed with exit code 2.\n2022-03-21T20:50:13.2248664Z 
##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T20:50:13.2249012Z with:\n2022-03-21T20:50:13.2249260Z env:\n2022-03-21T20:50:13.2249500Z IN_CI: 1\n2022-03-21T20:50:13.2249738Z IS_GHA: 1\n2022-03-21T20:50:13.2250025Z GIT_DEFAULT_BRANCH: master\n2022-03-21T20:50:13.2250329Z ##[endgroup]\n2022-03-21T20:50:13.2272735Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 1, linux.8xlarge.nvidia.gpu) (14/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T23:47:38.0451999Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T23:47:37.5554508Z + python3 -m pip install boto3==1.19.12\n2022-03-21T23:47:37.8411473Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T23:47:37.8631484Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T23:47:37.8699561Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T23:47:37.8737037Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T23:47:37.8754443Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T23:47:37.8814393Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T23:47:37.8849540Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T23:47:37.9059579Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T23:47:38.0336298Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0b44f47f4292089a2\n2022-03-21T23:47:38.0451999Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T23:47:38.0469471Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T23:47:38.0484106Z ##[error]Process completed with exit code 2.\n2022-03-21T23:47:38.0532678Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T23:47:38.0533007Z with:\n2022-03-21T23:47:38.0533223Z env:\n2022-03-21T23:47:38.0533440Z IN_CI: 1\n2022-03-21T23:47:38.0533649Z IS_GHA: 1\n2022-03-21T23:47:38.0533902Z GIT_DEFAULT_BRANCH: master\n2022-03-21T23:47:38.0534170Z GPU_FLAG: --gpus all\n2022-03-21T23:47:38.0534401Z ##[endgroup]\n\n\n pull / linux-xenial-py3.7-clang7-asan / test (default, 2, 3, linux.2xlarge) (15/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:04:59.3115800Z SUMMARY: Undefined.../jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in\n\n2022-03-21T21:04:59.2595213Z #10 0x55a7f39a4801 in run_mod /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:1037\n2022-03-21T21:04:59.2595707Z #11 0x55a7f39af7a9 in PyRun_StringFlags /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:961\n2022-03-21T21:04:59.2597203Z #12 0x55a7f39af80b in PyRun_SimpleStringFlags 
/tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:455\n2022-03-21T21:04:59.2598205Z #13 0x55a7f39af908 in pymain_run_command /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:420\n2022-03-21T21:04:59.2598697Z #14 0x55a7f39af908 in pymain_run_python /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:2907\n2022-03-21T21:04:59.2599178Z #15 0x55a7f39af908 in pymain_main /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3460\n2022-03-21T21:04:59.2599747Z #16 0x55a7f39afccb in _Py_UnixMain /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3495\n2022-03-21T21:04:59.3114751Z #17 0x7f3b3822383f in __libc_start_main /build/glibc-S7Ft5T/glibc-2.23/csu/../csu/libc-start.c:291\n2022-03-21T21:04:59.3115277Z #18 0x55a7f3954554 in _start (/opt/conda/bin/python3.7+0x1d7554)\n2022-03-21T21:04:59.3115468Z \n2022-03-21T21:04:59.3115800Z SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior /var/lib/jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in \n2022-03-21T21:04:59.3292385Z + retcode=1\n2022-03-21T21:04:59.3292781Z + set -e\n2022-03-21T21:04:59.3293062Z + return 1\n2022-03-21T21:04:59.3295462Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX-* ]]\n2022-03-21T21:04:59.3295802Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X ]]\n2022-03-21T21:04:59.3296394Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX2-* ]]\n2022-03-21T21:04:59.3296700Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\2 ]]\n2022-03-21T21:04:59.3297055Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX512-* ]]\n2022-03-21T21:04:59.3297416Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\5\\1\\2 ]]\n2022-03-21T21:04:59.3299623Z + [[ linux-xenial-py3.7-clang7-asan-default == *tbb* ]]\n\n\n pull / win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge) (16/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:14:31.7846086Z C:\\actions-runner\\...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:14:25.5525714Z Collecting jmespath<1.0.0,>=0.7.1\n2022-03-21T22:14:25.5568155Z Downloading jmespath-0.10.0-py2.py3-none-any.whl (24 kB)\n2022-03-21T22:14:25.5952617Z Collecting python-dateutil<3.0.0,>=2.1\n2022-03-21T22:14:25.6169392Z Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)\n2022-03-21T22:14:25.6629996Z -------------------------------------- 247.7/247.7 KB 5.1 MB/s eta 0:00:00\n2022-03-21T22:14:25.6710247Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:14:25.8284354Z Requirement already satisfied: six>=1.5 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:14:25.9816751Z Installing collected packages: python-dateutil, jmespath, botocore, s3transfer, boto3\n2022-03-21T22:14:31.6672236Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2\n2022-03-21T22:14:31.7630473Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0ed0915ecee5d2424\n2022-03-21T22:14:31.7846086Z C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\\python3.exe: can't open file 'C:\\\\actions-runner\\\\_work\\\\pytorch\\\\pytorch\\\\.github\\\\scripts\\\\get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:14:31.7876742Z 
+ GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:14:31.7897140Z ##[error]Process completed with exit code 2.\n2022-03-21T22:14:31.8195621Z ##[group]Run pytorch/pytorch/.github/actions/teardown-win@master\n2022-03-21T22:14:31.8196110Z with:\n2022-03-21T22:14:31.8196356Z env:\n2022-03-21T22:14:31.8196614Z IN_CI: 1\n2022-03-21T22:14:31.8196876Z IS_GHA: 1\n2022-03-21T22:14:31.8197169Z GIT_DEFAULT_BRANCH: master\n2022-03-21T22:14:31.8197652Z pythonLocation: C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\n2022-03-21T22:14:31.8198093Z ##[endgroup]\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge) (17/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:19:15.8845728Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:19:15.5116060Z + python3 -m pip install boto3==1.19.12\n2022-03-21T21:19:15.7231476Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T21:19:15.7409711Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T21:19:15.7458478Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T21:19:15.7470508Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T21:19:15.7496799Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T21:19:15.7538362Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T21:19:15.7566161Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:19:15.7711630Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:19:15.8753543Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0e2b3b4ddb246ff2a\n2022-03-21T21:19:15.8845728Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:19:15.8859814Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:19:15.8870165Z ##[error]Process completed with exit code 2.\n2022-03-21T21:19:15.8917039Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:19:15.8917279Z with:\n2022-03-21T21:19:15.8917433Z env:\n2022-03-21T21:19:15.8917586Z IN_CI: 1\n2022-03-21T21:19:15.8917734Z IS_GHA: 1\n2022-03-21T21:19:15.8917917Z GIT_DEFAULT_BRANCH: master\n2022-03-21T21:19:15.8918102Z ##[endgroup]\n2022-03-21T21:19:15.8934572Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu) (18/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T23:19:48.5900162Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T23:19:48.0742254Z + python3 -m pip install boto3==1.19.12\n2022-03-21T23:19:48.3742563Z Defaulting to user installation because normal site-packages is not 
writeable\n2022-03-21T23:19:48.3976536Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T23:19:48.4048700Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T23:19:48.4065374Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T23:19:48.4128076Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T23:19:48.4164273Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T23:19:48.4202610Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T23:19:48.4416723Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T23:19:48.5773033Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-07ab7a3c4a5402af2\n2022-03-21T23:19:48.5900162Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T23:19:48.5919822Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T23:19:48.5936087Z ##[error]Process completed with exit code 2.\n2022-03-21T23:19:48.6007930Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T23:19:48.6008268Z with:\n2022-03-21T23:19:48.6008483Z env:\n2022-03-21T23:19:48.6008701Z IN_CI: 1\n2022-03-21T23:19:48.6008920Z IS_GHA: 1\n2022-03-21T23:19:48.6009170Z GIT_DEFAULT_BRANCH: master\n2022-03-21T23:19:48.6009440Z GPU_FLAG: --gpus all\n2022-03-21T23:19:48.6009671Z ##[endgroup]\n\n\n pull / win-vs2019-cuda11.3-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu) (19/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:54:04.2844259Z C:\\actions-runner\\...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:53:59.0889659Z Downloading botocore-1.22.12-py3-none-any.whl (8.1 MB)\n2022-03-21T22:53:59.6881416Z ---------------------------------------- 8.1/8.1 MB 14.0 MB/s eta 0:00:00\n2022-03-21T22:53:59.7427779Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:53:59.7691882Z Collecting python-dateutil<3.0.0,>=2.1\n2022-03-21T22:53:59.7779847Z Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)\n2022-03-21T22:53:59.8281663Z -------------------------------------- 247.7/247.7 KB 5.1 MB/s eta 0:00:00\n2022-03-21T22:54:00.0185115Z Requirement already satisfied: six>=1.5 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:54:00.2359770Z Installing collected packages: python-dateutil, jmespath, botocore, s3transfer, boto3\n2022-03-21T22:54:04.1208891Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2\n2022-03-21T22:54:04.2505862Z ++ python3 
.github/scripts/get_workflow_job_id.py 2018440039 i-03b4fbe63be8ef4b0\n2022-03-21T22:54:04.2844259Z C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\\python3.exe: can't open file 'C:\\\\actions-runner\\\\_work\\\\pytorch\\\\pytorch\\\\.github\\\\scripts\\\\get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:54:04.2891082Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:54:04.2919900Z ##[error]Process completed with exit code 2.\n2022-03-21T22:54:04.3377901Z ##[group]Run pytorch/pytorch/.github/actions/teardown-win@master\n2022-03-21T22:54:04.3378575Z with:\n2022-03-21T22:54:04.3378930Z env:\n2022-03-21T22:54:04.3379275Z IN_CI: 1\n2022-03-21T22:54:04.3379600Z IS_GHA: 1\n2022-03-21T22:54:04.3380023Z GIT_DEFAULT_BRANCH: master\n2022-03-21T22:54:04.3380691Z pythonLocation: C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\n2022-03-21T22:54:04.3381278Z ##[endgroup]\n\n\n pull / linux-bionic-py3.7-clang9 / test (noarch, 1, 1, linux.2xlarge) (20/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:09:34.0074610Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:09:33.6365531Z + python3 -m pip install boto3==1.19.12\n2022-03-21T22:09:33.8475619Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T22:09:33.8655152Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T22:09:33.8704395Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T22:09:33.8716774Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T22:09:33.8760145Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T22:09:33.8785000Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T22:09:33.8811316Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:09:33.8960134Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:09:33.9984866Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0d325eb9fd156146f\n2022-03-21T22:09:34.0074610Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:09:34.0087465Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:09:34.0101743Z ##[error]Process completed with exit code 2.\n2022-03-21T22:09:34.0154014Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T22:09:34.0154246Z with:\n2022-03-21T22:09:34.0154412Z env:\n2022-03-21T22:09:34.0154574Z IN_CI: 1\n2022-03-21T22:09:34.0154728Z IS_GHA: 1\n2022-03-21T22:09:34.0154917Z GIT_DEFAULT_BRANCH: master\n2022-03-21T22:09:34.0155112Z ##[endgroup]\n2022-03-21T22:09:34.0191047Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge) (21/29)\nStep: \"Upload test statistics\" 
(full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:03:17.8502655Z [E request_callbac...yUniqueId(created_on=0, local_id=0) to be created.\n\n2022-03-21T21:03:14.4669960Z INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmpxgdsmeer\n2022-03-21T21:03:14.4671407Z INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmpxgdsmeer/_remote_module_non_sriptable.py\n2022-03-21T21:03:14.4973023Z INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmp1i2hfmpc\n2022-03-21T21:03:14.4973800Z INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmp1i2hfmpc/_remote_module_non_sriptable.py\n2022-03-21T21:03:14.5532339Z INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmpgx4da7b0\n2022-03-21T21:03:14.5533064Z INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmpgx4da7b0/_remote_module_non_sriptable.py\n2022-03-21T21:03:14.7050673Z INFO:torch.testing._internal.common_distributed:Starting event listener thread for rank 0\n2022-03-21T21:03:14.7097127Z INFO:torch.testing._internal.common_distributed:Starting event listener thread for rank 3\n2022-03-21T21:03:14.7398339Z INFO:torch.testing._internal.common_distributed:Starting event listener thread for rank 2\n2022-03-21T21:03:14.7922283Z INFO:torch.testing._internal.common_distributed:Starting event listener thread for rank 1\n2022-03-21T21:03:17.8502655Z [E request_callback_no_python.cpp:559] Received error while processing request type 261: false INTERNAL ASSERT FAILED at \"/var/lib/jenkins/workspace/torch/csrc/distributed/rpc/rref_context.cpp\":387, please report a bug to PyTorch. Expected OwnerRRef with id GloballyUniqueId(created_on=0, local_id=0) to be created.\n2022-03-21T21:03:17.8503603Z Exception raised from getOwnerRRef at /var/lib/jenkins/workspace/torch/csrc/distributed/rpc/rref_context.cpp:387 (most recent call first):\n2022-03-21T21:03:17.8504385Z frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string, std::allocator >) + 0x69 (0x7f180df19e19 in /opt/conda/lib/python3.7/site-packages/torch/lib/libc10.so)\n2022-03-21T21:03:17.8505131Z frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::__cxx11::basic_string, std::allocator > const&) + 0xd2 (0x7f180df160e2 in /opt/conda/lib/python3.7/site-packages/torch/lib/libc10.so)\n2022-03-21T21:03:17.8505927Z frame #2: c10::detail::torchInternalAssertFail(char const*, char const*, unsigned int, char const*, std::__cxx11::basic_string, std::allocator > const&) + 0x4e (0x7f180df17a7e in /opt/conda/lib/python3.7/site-packages/torch/lib/libc10.so)\n2022-03-21T21:03:17.8506674Z frame #3: torch::distributed::rpc::RRefContext::getOwnerRRef(torch::distributed::rpc::GloballyUniqueId const&, bool) + 0x4b4 (0x7f18118b7b64 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)\n2022-03-21T21:03:17.8507642Z frame #4: torch::distributed::rpc::RequestCallbackNoPython::assignOwnerRRef(torch::distributed::rpc::GloballyUniqueId const&, torch::distributed::rpc::GloballyUniqueId const&, c10::intrusive_ptr >) const + 0x70 (0x7f18118a7bf0 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)\n2022-03-21T21:03:17.8508613Z frame #5: torch::distributed::rpc::RequestCallbackImpl::processPythonRemoteCall(torch::distributed::rpc::RpcCommandBase&, std::vector >) const + 0xc8 (0x7f1819736208 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_python.so)\n2022-03-21T21:03:17.8509749Z frame #6: 
torch::distributed::rpc::RequestCallbackNoPython::processRpc(torch::distributed::rpc::RpcCommandBase&, torch::distributed::rpc::MessageType const&, std::vector >) const + 0x194 (0x7f18118ac914 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)\n2022-03-21T21:03:17.8510708Z frame #7: torch::distributed::rpc::RequestCallbackImpl::processRpcWithErrors(torch::distributed::rpc::RpcCommandBase&, torch::distributed::rpc::MessageType const&, std::vector >) const + 0x65 (0x7f1819735865 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_python.so)\n2022-03-21T21:03:17.8511369Z frame #8: + 0x375249a (0x7f18118a949a in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)\n\n\n pull / linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test (22/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T20:01:07.7015580Z \ufffd[36;1m echo \"ERR...t available for the merge-base of your branch\"\ufffd[0m\n\n2022-03-21T20:01:07.7012399Z \ufffd[36;1mfi\ufffd[0m\n2022-03-21T20:01:07.7012634Z \ufffd[36;1m# Covers the case where a previous tag doesn't exist for the tree\ufffd[0m\n2022-03-21T20:01:07.7012992Z \ufffd[36;1m# this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly\ufffd[0m\n2022-03-21T20:01:07.7013373Z \ufffd[36;1mif ! git rev-parse \"$MERGE_BASE:.circleci/docker\"; then\ufffd[0m\n2022-03-21T20:01:07.7013784Z \ufffd[36;1m echo \"Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit\"\ufffd[0m\n2022-03-21T20:01:07.7014149Z \ufffd[36;1m exit 1\ufffd[0m\n2022-03-21T20:01:07.7014325Z \ufffd[36;1mfi\ufffd[0m\n2022-03-21T20:01:07.7014573Z \ufffd[36;1mPREVIOUS_DOCKER_TAG=$(git rev-parse \"$MERGE_BASE:.circleci/docker\")\ufffd[0m\n2022-03-21T20:01:07.7014907Z \ufffd[36;1m# If no image exists but the hash is the same as the previous hash then we should error out here\ufffd[0m\n2022-03-21T20:01:07.7015231Z \ufffd[36;1mif [[ \"${PREVIOUS_DOCKER_TAG}\" = \"${DOCKER_TAG}\" ]]; then\ufffd[0m\n2022-03-21T20:01:07.7015580Z \ufffd[36;1m echo \"ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch\"\ufffd[0m\n2022-03-21T20:01:07.7015931Z \ufffd[36;1m echo \" contact the PyTorch team to restore the original images\"\ufffd[0m\n2022-03-21T20:01:07.7016225Z \ufffd[36;1m exit 1\ufffd[0m\n2022-03-21T20:01:07.7016400Z \ufffd[36;1mfi\ufffd[0m\n2022-03-21T20:01:07.7016608Z \ufffd[36;1mecho ::set-output name=rebuild::yes\ufffd[0m\n2022-03-21T20:01:07.7027605Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}\n2022-03-21T20:01:07.7027837Z env:\n2022-03-21T20:01:07.7028006Z IN_CI: 1\n2022-03-21T20:01:07.7028159Z IS_GHA: 1\n2022-03-21T20:01:07.7028346Z GIT_DEFAULT_BRANCH: master\n2022-03-21T20:01:07.7028589Z BASE_REVISION: 6643522db9ff595f564b8081de58b3a33c546178\n\n\n pull / linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu) (23/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-22T00:49:54.2949572Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-22T00:49:53.8049151Z + python3 -m pip install boto3==1.19.12\n2022-03-22T00:49:54.0981629Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-22T00:49:54.1207562Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages 
(1.19.12)\n2022-03-22T00:49:54.1277146Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-22T00:49:54.1315027Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-22T00:49:54.1331813Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-22T00:49:54.1391622Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-22T00:49:54.1609217Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-22T00:49:54.1637417Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-22T00:49:54.2830197Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0f7c32fe13be12fea\n2022-03-22T00:49:54.2949572Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-22T00:49:54.2966933Z + GHA_WORKFLOW_JOB_ID=\n2022-03-22T00:49:54.2982588Z ##[error]Process completed with exit code 2.\n2022-03-22T00:49:54.3031464Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-22T00:49:54.3031794Z with:\n2022-03-22T00:49:54.3032012Z env:\n2022-03-22T00:49:54.3032227Z IN_CI: 1\n2022-03-22T00:49:54.3032434Z IS_GHA: 1\n2022-03-22T00:49:54.3032681Z GIT_DEFAULT_BRANCH: master\n2022-03-22T00:49:54.3033084Z GPU_FLAG: --gpus all\n2022-03-22T00:49:54.3033312Z ##[endgroup]\n\n\n pull / win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge) (24/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:56:12.5872636Z C:\\actions-runner\\...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:56:07.3365589Z Downloading botocore-1.22.12-py3-none-any.whl (8.1 MB)\n2022-03-21T21:56:07.7926584Z ---------------------------------------- 8.1/8.1 MB 17.3 MB/s eta 0:00:00\n2022-03-21T21:56:07.9319362Z Collecting python-dateutil<3.0.0,>=2.1\n2022-03-21T21:56:07.9366132Z Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)\n2022-03-21T21:56:08.0077590Z -------------------------------------- 247.7/247.7 KB 3.0 MB/s eta 0:00:00\n2022-03-21T21:56:08.0164070Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:56:08.1775537Z Requirement already satisfied: six>=1.5 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:56:08.3393469Z Installing collected packages: python-dateutil, jmespath, botocore, s3transfer, boto3\n2022-03-21T21:56:12.4576766Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2\n2022-03-21T21:56:12.5641959Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0afad69838118af0e\n2022-03-21T21:56:12.5872636Z C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\\python3.exe: can't open 
file 'C:\\\\actions-runner\\\\_work\\\\pytorch\\\\pytorch\\\\.github\\\\scripts\\\\get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:56:12.5905611Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:56:12.5927729Z ##[error]Process completed with exit code 2.\n2022-03-21T21:56:12.6239531Z ##[group]Run pytorch/pytorch/.github/actions/teardown-win@master\n2022-03-21T21:56:12.6240039Z with:\n2022-03-21T21:56:12.6240299Z env:\n2022-03-21T21:56:12.6240557Z IN_CI: 1\n2022-03-21T21:56:12.6240805Z IS_GHA: 1\n2022-03-21T21:56:12.6241118Z GIT_DEFAULT_BRANCH: master\n2022-03-21T21:56:12.6241613Z pythonLocation: C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\n2022-03-21T21:56:12.6242052Z ##[endgroup]\n\n\n pull / linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge) (25/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:46:39.5474616Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:46:39.1884210Z + python3 -m pip install boto3==1.19.12\n2022-03-21T21:46:39.3928976Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T21:46:39.4105069Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T21:46:39.4152571Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T21:46:39.4194931Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T21:46:39.4218947Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T21:46:39.4230812Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:46:39.4380089Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T21:46:39.4399461Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:46:39.5387703Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0888bed1149cca415\n2022-03-21T21:46:39.5474616Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:46:39.5487145Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:46:39.5497480Z ##[error]Process completed with exit code 2.\n2022-03-21T21:46:39.5541319Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:46:39.5541544Z with:\n2022-03-21T21:46:39.5541698Z env:\n2022-03-21T21:46:39.5541851Z IN_CI: 1\n2022-03-21T21:46:39.5541997Z IS_GHA: 1\n2022-03-21T21:46:39.5542176Z GIT_DEFAULT_BRANCH: master\n2022-03-21T21:46:39.5542361Z ##[endgroup]\n2022-03-21T21:46:39.5557878Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge) (26/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:34:57.0623859Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or 
directory\n\n2022-03-21T21:34:56.9039884Z Attempting uninstall: s3transfer\n2022-03-21T21:34:56.9041446Z Found existing installation: s3transfer 0.3.7\n2022-03-21T21:34:56.9090783Z Uninstalling s3transfer-0.3.7:\n2022-03-21T21:34:56.9095968Z Successfully uninstalled s3transfer-0.3.7\n2022-03-21T21:34:56.9453014Z Attempting uninstall: boto3\n2022-03-21T21:34:56.9454356Z Found existing installation: boto3 1.16.34\n2022-03-21T21:34:56.9564320Z Uninstalling boto3-1.16.34:\n2022-03-21T21:34:56.9578035Z Successfully uninstalled boto3-1.16.34\n2022-03-21T21:34:57.0091363Z Successfully installed boto3-1.19.12 botocore-1.22.12 s3transfer-0.5.2\n2022-03-21T21:34:57.0536230Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-034a3afd5d80b91fd\n2022-03-21T21:34:57.0623859Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:34:57.0637167Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:34:57.0647396Z ##[error]Process completed with exit code 2.\n2022-03-21T21:34:57.0688237Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:34:57.0688481Z with:\n2022-03-21T21:34:57.0688631Z env:\n2022-03-21T21:34:57.0688769Z IN_CI: 1\n2022-03-21T21:34:57.0688930Z IS_GHA: 1\n2022-03-21T21:34:57.0689109Z GIT_DEFAULT_BRANCH: master\n2022-03-21T21:34:57.0689462Z ##[endgroup]\n2022-03-21T21:34:57.0704768Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-clang7-asan / test (default, 3, 3, linux.2xlarge) (27/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:05:00.7896545Z SUMMARY: Undefined.../jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in\n\n2022-03-21T21:05:00.7395504Z #10 0x5597fd5a9801 in run_mod /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:1037\n2022-03-21T21:05:00.7396330Z #11 0x5597fd5b47a9 in PyRun_StringFlags /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:961\n2022-03-21T21:05:00.7396688Z #12 0x5597fd5b480b in PyRun_SimpleStringFlags /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:455\n2022-03-21T21:05:00.7398664Z #13 0x5597fd5b4908 in pymain_run_command /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:420\n2022-03-21T21:05:00.7399177Z #14 0x5597fd5b4908 in pymain_run_python /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:2907\n2022-03-21T21:05:00.7399663Z #15 0x5597fd5b4908 in pymain_main /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3460\n2022-03-21T21:05:00.7399986Z #16 0x5597fd5b4ccb in _Py_UnixMain /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3495\n2022-03-21T21:05:00.7895241Z #17 0x7f0a5905983f in __libc_start_main /build/glibc-S7Ft5T/glibc-2.23/csu/../csu/libc-start.c:291\n2022-03-21T21:05:00.7895772Z #18 0x5597fd559554 in _start (/opt/conda/bin/python3.7+0x1d7554)\n2022-03-21T21:05:00.7896033Z \n2022-03-21T21:05:00.7896545Z SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior /var/lib/jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in \n2022-03-21T21:05:00.8063448Z + retcode=1\n2022-03-21T21:05:00.8063787Z + set -e\n2022-03-21T21:05:00.8064058Z + return 1\n2022-03-21T21:05:00.8067638Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX-* ]]\n2022-03-21T21:05:00.8068127Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X ]]\n2022-03-21T21:05:00.8069018Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX2-* ]]\n2022-03-21T21:05:00.8069500Z + [[ default == 
\\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\2 ]]\n2022-03-21T21:05:00.8070105Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX512-* ]]\n2022-03-21T21:05:00.8070580Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\5\\1\\2 ]]\n2022-03-21T21:05:00.8072640Z + [[ linux-xenial-py3.7-clang7-asan-default == *tbb* ]]\n\n\n pull / linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu) (28/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:48:17.3384813Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:48:16.8599645Z + python3 -m pip install boto3==1.19.12\n2022-03-21T22:48:17.1464241Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T22:48:17.1685222Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T22:48:17.1754164Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T22:48:17.1771662Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T22:48:17.1808722Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T22:48:17.1868636Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T22:48:17.1903889Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:48:17.2113746Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:48:17.3267404Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-01fe178c405417375\n2022-03-21T22:48:17.3384813Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:48:17.3402286Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:48:17.3418376Z ##[error]Process completed with exit code 2.\n2022-03-21T22:48:17.3470528Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T22:48:17.3470874Z with:\n2022-03-21T22:48:17.3471096Z env:\n2022-03-21T22:48:17.3471327Z IN_CI: 1\n2022-03-21T22:48:17.3471538Z IS_GHA: 1\n2022-03-21T22:48:17.3471802Z GIT_DEFAULT_BRANCH: master\n2022-03-21T22:48:17.3472083Z GPU_FLAG: --gpus all\n2022-03-21T22:48:17.3472322Z ##[endgroup]\n\n\n pull / linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge) (29/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:16:38.9646300Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:16:38.7995969Z Attempting uninstall: s3transfer\n2022-03-21T21:16:38.7998039Z Found existing installation: s3transfer 0.3.7\n2022-03-21T21:16:38.8066994Z Uninstalling s3transfer-0.3.7:\n2022-03-21T21:16:38.8072844Z Successfully uninstalled s3transfer-0.3.7\n2022-03-21T21:16:38.8449275Z Attempting uninstall: boto3\n2022-03-21T21:16:38.8451430Z Found existing installation: boto3 1.16.34\n2022-03-21T21:16:38.8559828Z Uninstalling 
boto3-1.16.34:\n2022-03-21T21:16:38.8574290Z Successfully uninstalled boto3-1.16.34\n2022-03-21T21:16:38.9100438Z Successfully installed boto3-1.19.12 botocore-1.22.12 s3transfer-0.5.2\n2022-03-21T21:16:38.9558098Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0d779c59d277d32ee\n2022-03-21T21:16:38.9646300Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:16:38.9658894Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:16:38.9673240Z ##[error]Process completed with exit code 2.\n2022-03-21T21:16:38.9720106Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:16:38.9720333Z with:\n2022-03-21T21:16:38.9720485Z env:\n2022-03-21T21:16:38.9720645Z IN_CI: 1\n2022-03-21T21:16:38.9720793Z IS_GHA: 1\n2022-03-21T21:16:38.9720970Z GIT_DEFAULT_BRANCH: master\n2022-03-21T21:16:38.9721151Z ##[endgroup]\n2022-03-21T21:16:38.9736762Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n\nThis comment was automatically generated by Dr. CI (expand for details).\nPlease report bugs/suggestions to the (internal) Dr. CI Users group.\nClick here to manually regenerate this comment.", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": { + "login": "facebook-github-bot" + }, + "databaseId": 964902894 + }, + { + "bodyText": "@vitaly-fedyunin @gottbrath FYI that this is the oneDNN Graph API integration. It depends on the #63748.", + "author": { + "login": "Jianhui-Li" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 970451860 + }, + { + "bodyText": "CI failures are currently being caused by some issues in the CI infra, and are also occurring with other PRs.", + "author": { + "login": "sanchitintel" + }, + "authorAssociation": "CONTRIBUTOR", + "editor": null, + "databaseId": 990641309 + }, + { + "bodyText": "CI failures are unrelated.", + "author": { + "login": "sanchitintel" + }, + "authorAssociation": "CONTRIBUTOR", + "editor": null, + "databaseId": 991281407 + }, + { + "bodyText": "The CI failure is unrelated.", + "author": { + "login": "sanchitintel" + }, + "authorAssociation": "CONTRIBUTOR", + "editor": null, + "databaseId": 995389295 + }, + { + "bodyText": "Hi, thank you for the PR!\nDo you mind running a larger amount of torchbench and reporting numbers ? You can look at Jason's post here for what models are supported in script. Initially just the vision models would be useful. @Krovatkin also did some benchmarking of a traced Bert model and found on average a ~16% speedup with this PR.", + "author": { + "login": "eellison" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1015689390 + }, + { + "bodyText": "Thanks a lot for reviewing, @eellison & @Krovatkin!\nWe just wanted to let you know that we're working on the benchmarking & will get back to you in a day, or two.\nUPDATE (Jan 21): While running some TorchBench models, we discovered some composability issues, and are working to ensure that oneDNN Graph would complement PyTorch's existing fusion capabilities, not hinder them.\nUPDATE (Jan 24): We've resolved the issues & will update this PR later today. Thanks!", + "author": { + "login": "sanchitintel" + }, + "authorAssociation": "CONTRIBUTOR", + "editor": { + "login": "sanchitintel" + }, + "databaseId": 1016996190 + }, + { + "bodyText": "Hello @eellison,\nWe used this TorchBench branch for comparison. 
compare_llga.sh can be run for comparison.\nFor benchmarking mobilenet_v3_large with hardswish support in oneDNN Graph, this oneDNN Graph branch can be used in third_party/ideep/mkl-dnn. It delivers a speedup over PyTorch JIT (NNC + OFI) because 21 additional reorders are prevented (the major factor here), and fusion with conv also helps further.\nThe next release of oneDNN Graph would have hardswish support.\nWe're also exploring adding a hardsigmoid op in oneDNN Graph.\nThank you!", + "author": { + "login": "sanchitintel" + }, + "authorAssociation": "CONTRIBUTOR", + "editor": { + "login": "sanchitintel" + }, + "databaseId": 1022709513 + }, + { + "bodyText": "Please note that this PR should be merged after #71546, as #71546 changes the third_party/ideep commit (this PR also uses that ideep commit, but it'd probably be better to merge #71546 first, so that oneDNN v2.5.2 upgrade would be in a separate PR). Thank you!", + "author": { + "login": "sanchitintel" + }, + "authorAssociation": "CONTRIBUTOR", + "editor": null, + "databaseId": 1026330085 + }, + { + "bodyText": "@sanchitintel mind rebasing and i'll land ?", + "author": { + "login": "eellison" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1055813984 + }, + { + "bodyText": "@eellison has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1057203495 + }, + { + "bodyText": "Thanks a lot for taking a look, @eellison! To fix this error, we would enable Bazel build for oneDNN Graph.", + "author": { + "login": "sanchitintel" + }, + "authorAssociation": "CONTRIBUTOR", + "editor": { + "login": "sanchitintel" + }, + "databaseId": 1061230087 + }, + { + "bodyText": "@eellison has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1063276600 + }, + { + "bodyText": "@malfet has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1074355779 + }, + { + "bodyText": "And graph_rewriter.cpp is full of DOS newlines...", + "author": { + "login": "malfet" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1074407452 + }, + { + "bodyText": "Hey @chunyuan-w.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.", + "author": { + "login": "github-actions" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 1074471758 + }, + { + "bodyText": "Thanks a ton for your help, @malfet & @eellison! 
:)\nWe'll incorporate your suggestions in subsequent PR(s).", + "author": { + "login": "sanchitintel" + }, + "authorAssociation": "CONTRIBUTOR", + "editor": { + "login": "sanchitintel" + }, + "databaseId": 1074492365 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOOYM_0Q==", + "hasPreviousPage": false + } + } + } + } + } + }, + "query_sha=f357fcdcc09e1433307da0c33debf9c08279ffd70d5bbc31323994f6346c629f name=pytorch number=73969 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "closed": true, + "isCrossRepository": true, + "author": { + "login": "malfet" + }, + "title": "Dummy change", + "body": "Test Plan: None at all\n\nDifferential Revision: D34753911\n\n", + "headRefName": "export-D34753911", + "headRepository": { + "nameWithOwner": "malfet/pytorch" + }, + "baseRefName": "master", + "baseRepository": { + "nameWithOwner": "pytorch/pytorch", + "isPrivate": false, + "defaultBranchRef": { + "name": "master" + } + }, + "mergeCommit": null, + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": { + "login": "malfet" + }, + "email": "nshulga@fb.com", + "name": "Nikita Shulga" + }, + "oid": "4746da707a9912356f5179625da89616b228dc21" + } + } + ], + "pageInfo": { + "endCursor": "MQ", + "hasNextPage": false + }, + "totalCount": 1 + }, + "commits": { + "nodes": [ + { + "commit": { + "checkSuites": { + "nodes": [ + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-vulkan-bionic-py3.7-clang9" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928580?check_suite_focus=true" + }, + { + "name": "test (default, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483086020?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbRQMQ=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592963" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928547?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2aM=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592965" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-bionic-rocm4.5-py3.7" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928602?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483235366?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483235570?check_suite_focus=true" + }, + { + "name": "test (distributed, 1, 1, linux.rocm.gpu)", + "conclusion": "SUCCESS", + 
"detailsUrl": "https://github.com/pytorch/pytorch/runs/5483235708?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbTiXw=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592966" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "win-vs2019-cuda11.3-py3" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928594?check_suite_focus=true" + }, + { + "name": "test (force_on_cpu, 1, 1, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483593208?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483593337?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483593461?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbY_vU=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592967" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928554?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2ao=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592969" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-docs" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928595?check_suite_focus=true" + }, + { + "name": "build-docs (cpp)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483078289?check_suite_focus=true" + }, + { + "name": "build-docs (python)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483078365?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbRIt0=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592970" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc7" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928553?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483074693?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, 
linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483074951?check_suite_focus=true" + }, + { + "name": "test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483075182?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbRFm4=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592971" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3-clang5-mobile-build" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928556?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2aw=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592974" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "shellcheck", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928552?check_suite_focus=true" + }, + { + "name": "quick-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928797?check_suite_focus=true" + }, + { + "name": "clang-tidy", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482929069?check_suite_focus=true" + }, + { + "name": "clang-format", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482929350?check_suite_focus=true" + }, + { + "name": "cmakelint", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482929628?check_suite_focus=true" + }, + { + "name": "toc", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482929838?check_suite_focus=true" + }, + { + "name": "py2-setup-validate-errormsg", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482929972?check_suite_focus=true" + }, + { + "name": "flake8-py3", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482930102?check_suite_focus=true" + }, + { + "name": "mypy", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482930251?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO4Es=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592975" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928573?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2b0=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": 
"https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592976" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-RA=", + "hasNextPage": true + } + }, + "oid": "4746da707a9912356f5179625da89616b228dc21" + } + } + ] + }, + "changedFiles": 1, + "files": { + "nodes": [ + { + "path": "tools/build_variables.bzl" + } + ], + "pageInfo": { + "endCursor": "MQ", + "hasNextPage": false + } + }, + "reviews": { + "nodes": [], + "pageInfo": { + "startCursor": null, + "hasPreviousPage": false + } + }, + "comments": { + "nodes": [ + { + "bodyText": "CI Flow Status\n\u269b\ufe0f CI Flow\nRuleset - Version: v1\nRuleset - File: https://github.com/malfet/pytorch/blob/4746da707a9912356f5179625da89616b228dc21/.github/generated-ciflow-ruleset.json\nPR ciflow labels: ciflow/default\nAdd ciflow labels to this PR to trigger more builds:\n\n\n\nWorkflows\nLabels (bold enabled)\nStatus\n\n\n\n\nTriggered Workflows\n\n\n\n\nlinux-binary-conda\nciflow/binaries, ciflow/binaries_conda, ciflow/default\n\u2705 triggered\n\n\nlinux-binary-libtorch-cxx11-abi\nciflow/all, ciflow/binaries, ciflow/binaries_libtorch, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nlinux-binary-libtorch-pre-cxx11\nciflow/all, ciflow/binaries, ciflow/binaries_libtorch, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nlinux-binary-manywheel\nciflow/all, ciflow/binaries, ciflow/binaries_wheel, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nlinux-bionic-py3.7-clang9\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/noarch, ciflow/trunk\n\u2705 triggered\n\n\nlinux-bionic-rocm4.5-py3.7\nciflow/all, ciflow/default, ciflow/linux, ciflow/rocm, ciflow/trunk\n\u2705 triggered\n\n\nlinux-docs\nciflow/all, ciflow/cpu, ciflow/default, ciflow/docs, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-vulkan-bionic-py3.7-clang9\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk, ciflow/vulkan\n\u2705 triggered\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7-bazel-test\nciflow/all, ciflow/bazel, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3-clang5-mobile-build\nciflow/all, ciflow/default, ciflow/linux, ciflow/mobile, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3-clang5-mobile-custom-build-static\nciflow/all, ciflow/default, ciflow/linux, ciflow/mobile, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-clang7-asan\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/sanitizers, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-clang7-onnx\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/onnx, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build\nciflow/all, ciflow/cpu, ciflow/default, ciflow/libtorch, ciflow/linux, ciflow/mobile, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc7\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc7-no-ops\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nmacos-arm64-binary-conda\nciflow/binaries, ciflow/binaries_conda, ciflow/default\n\u2705 triggered\n\n\nmacos-arm64-binary-wheel\nciflow/binaries, ciflow/binaries_wheel, ciflow/default\n\u2705 
triggered\n\n\nmacos-binary-conda\nciflow/binaries, ciflow/binaries_conda, ciflow/default\n\u2705 triggered\n\n\nmacos-binary-libtorch-cxx11-abi\nciflow/binaries, ciflow/binaries_libtorch, ciflow/default\n\u2705 triggered\n\n\nmacos-binary-libtorch-pre-cxx11\nciflow/binaries, ciflow/binaries_libtorch, ciflow/default\n\u2705 triggered\n\n\nmacos-binary-wheel\nciflow/binaries, ciflow/binaries_wheel, ciflow/default\n\u2705 triggered\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single\nciflow/all, ciflow/android, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit\nciflow/all, ciflow/android, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nwin-vs2019-cpu-py3\nciflow/all, ciflow/cpu, ciflow/default, ciflow/trunk, ciflow/win\n\u2705 triggered\n\n\nwin-vs2019-cuda11.3-py3\nciflow/all, ciflow/cuda, ciflow/default, ciflow/trunk, ciflow/win\n\u2705 triggered\n\n\nwindows-binary-conda\nciflow/binaries, ciflow/binaries_conda, ciflow/default\n\u2705 triggered\n\n\nwindows-binary-libtorch-debug\nciflow/all, ciflow/binaries, ciflow/binaries_libtorch, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nwindows-binary-libtorch-release\nciflow/all, ciflow/binaries, ciflow/binaries_libtorch, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nwindows-binary-wheel\nciflow/all, ciflow/binaries, ciflow/binaries_wheel, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nSkipped Workflows\n\n\n\n\ncaffe2-linux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\ndocker-builds\nciflow/all, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64\nciflow/all, ciflow/ios, ciflow/macos, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-coreml\nciflow/all, ciflow/ios, ciflow/macos, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-custom-ops\nciflow/all, ciflow/ios, ciflow/macos, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-metal\nciflow/all, ciflow/ios, ciflow/macos, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nios-12-5-1-x86-64\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-x86-64-coreml\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlibtorch-linux-xenial-cuda10.2-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlibtorch-linux-xenial-cuda11.3-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlinux-bionic-cuda10.2-py3.9-gcc7\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/slow, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlinux-docs-push\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7-no-ops\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-10-15-py3-arm64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-10-15-py3-lite-interpreter-x86-64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-11-py3-x86-64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nparallelnative-linux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nperiodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/scheduled\n\ud83d\udeab 
skipped\n\n\nperiodic-linux-bionic-cuda11.5-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled, ciflow/slow, ciflow/slow-gradcheck\n\ud83d\udeab skipped\n\n\nperiodic-linux-xenial-cuda11.3-py3.7-gcc7-debug\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-win-vs2019-cuda11.5-py3\nciflow/all, ciflow/cuda, ciflow/scheduled, ciflow/win\n\ud83d\udeab skipped\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-build\nciflow/all, ciflow/android, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\npytorch-xla-linux-bionic-py3.7-clang8\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/trunk, ciflow/xla\n\ud83d\udeab skipped", + "author": { + "login": "pytorch-bot" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 1063079053 + }, + { + "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/73969\n\ud83d\udcc4 \u00a0Preview docs built from this PR\n\ud83d\udcc4 \u00a0Preview C++ docs built from this PR\n\ud83d\udd27 \u00a0Opt-in to CIFlow to control what jobs run on your PRs\n\n\ud83d\udc8a CI failures summary and remediations\nAs of commit 4746da7 (more details on the Dr. CI page):\n\n\ud83d\udc9a \ud83d\udc9a Looks good so far! There are no failures yet. \ud83d\udc9a \ud83d\udc9a\n\nThis comment was automatically generated by Dr. CI (expand for details).\nPlease report bugs/suggestions to the (internal) Dr. CI Users group.\nClick here to manually regenerate this comment.", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": { + "login": "facebook-github-bot" + }, + "databaseId": 1063079113 + }, + { + "bodyText": "This pull request was exported from Phabricator. 
Differential Revision: D34753911", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1063079731 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOP11MjQ==", + "hasPreviousPage": false + } + } + } + } + } + }, + "query_sha=f357fcdcc09e1433307da0c33debf9c08279ffd70d5bbc31323994f6346c629f name=pytorch number=73099 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "closed": true, + "isCrossRepository": false, + "author": { + "login": "BowenBao" + }, + "title": "[ONNX] Make graph name spec-compliant (#71961)", + "body": "Stack from [ghstack](https://github.com/ezyang/ghstack):\n* #73104\n* #73103\n* #73102\n* #73101\n* #73100\n* __->__ #73099\n\n[According to the ONNX spec](https://github.com/onnx/onnx/blob/main/docs/IR.md#names-within-a-graph),\nall names must adhere to C90 identifier syntax rules, which means no\ndashes.\n\nFixes: #30952", + "headRefName": "gh/BowenBao/138/head", + "headRepository": { + "nameWithOwner": "pytorch/pytorch" + }, + "baseRefName": "gh/BowenBao/138/base", + "baseRepository": { + "nameWithOwner": "pytorch/pytorch", + "isPrivate": false, + "defaultBranchRef": { + "name": "master" + } + }, + "mergeCommit": null, + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": { + "login": "BowenBao" + }, + "email": "bowbao@microsoft.com", + "name": "BowenBao" + }, + "oid": "3038b939eb2069653305c419326a0f47d2598e39" + } + } + ], + "pageInfo": { + "endCursor": "MQ", + "hasNextPage": false + }, + "totalCount": 1 + }, + "commits": { + "nodes": [ + { + "commit": { + "checkSuites": { + "nodes": [ + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "TorchBench CI (pytorch-linux-py3.7-cu102)" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "run-torchbench", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252161498?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNn9o=", + "hasNextPage": false + } + }, + "conclusion": "SKIPPED", + "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189561" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-cuda11.3-py3.7-gcc7" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252161648?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252387496?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252387628?check_suite_focus=true" + }, + { + "name": "test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252387825?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkRE_E=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189562" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": 
"linux-xenial-py3.7-gcc7-no-ops" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252161681?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNoJE=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189563" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3-clang5-mobile-build" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252161670?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNoIY=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189564" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252161691?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNoJs=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189566" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-bionic-py3.7-clang9" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252161678?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252286900?check_suite_focus=true" + }, + { + "name": "test (noarch, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252287072?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252287232?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkPiwA=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189567" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-vulkan-bionic-py3.7-clang9" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252161699?check_suite_focus=true" + }, + { + "name": "test (default, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252302340?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkPxgQ=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": 
"https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189568" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3-clang5-mobile-custom-build-static" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252161696?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNoKA=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189570" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "win-vs2019-cpu-py3" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252161646?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252830090?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252830141?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkX070=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189571" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc7" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252161666?check_suite_focus=true" + }, + { + "name": "test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252286386?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252286526?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252286720?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkPiQA=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189572" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS8Q=", + "hasNextPage": true + } + }, + "oid": "3038b939eb2069653305c419326a0f47d2598e39" + } + } + ] + }, + "changedFiles": 162, + "files": { + "nodes": [ + { + "path": "test/onnx/expect/TestOperators.test_acos.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_add_broadcast.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_add_left_broadcast.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_add_size1_broadcast.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_add_size1_right_broadcast.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_add_size1_singleton_broadcast.expect" + }, + { + "path": 
"test/onnx/expect/TestOperators.test_addconstant.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_addmm.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_arange_dynamic.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_argmax.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_asin.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_at_op.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_atan.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_aten_embedding_1.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_aten_embedding_2.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_avg_pool2d.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_baddbmm.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_basic.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_batchnorm.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_batchnorm_1d.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_batchnorm_noaffine.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_batchnorm_onnx_irv4.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_batchnorm_training.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_bitshift.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_c2_op.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_chunk.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_clip.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_clip_max.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_clip_min.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_concat2.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_conv.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_conv_onnx_irv4.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_conv_onnx_irv4_opset8.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_convtranspose.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_cos.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_cumsum.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_det.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dict.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dict_str.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dim.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dropout.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dropout_default.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dropout_opset12.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dropout_training.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dropout_training_opset12.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dynamic_axes_add.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dynamic_axes_add_inputs_same_symbolic_shape.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dynamic_axes_matmul.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dynamic_axes_reduce_mean.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dynamic_axes_unchange.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_elu.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_embedding_bags.expect" + }, + { + "path": 
"test/onnx/expect/TestOperators.test_empty_like.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_empty_like_opset7.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_equal.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_erf.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_exp.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_expand.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_flatten.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_flatten2D.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_fmod.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_frobenius_norm.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_full.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_full_like.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_gather.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_gather_opset11.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_ge.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_gelu.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_gt.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_hardtanh.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_implicit_expand.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_index.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_isnan.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_layer_norm_aten.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_le.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_linear.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_log_sigmoid.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_logsoftmax.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_lstm_none_sequence_lens.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_lt.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_master_opset.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_max.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_maxpool.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_maxpool_dilations.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_maxpool_indices.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_mean.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_mean_dtype.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_meshgrid.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_min.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_mm.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_narrow.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_ne.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_nonzero.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_norm_p1.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_norm_p2.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_ones_like.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_pad.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_params.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_params_onnx_irv4.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_permute2.expect" + } + ], + "pageInfo": { + "endCursor": "MTAw", + "hasNextPage": true + } + }, + 
"reviews": { + "nodes": [ + { + "author": { + "login": "garymm" + }, + "state": "APPROVED" + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wMi0xOFQxNzoxODo0NC0wODowMLkyMDIyLTAyLTE4VDE3OjE4OjQ0LTA4OjAwzjTr0H0=", + "hasPreviousPage": false + } + }, + "comments": { + "nodes": [ + { + "bodyText": "This PR cannot be merged by bot due to changing > 100 files. @malfet \n \n \n pytorch/.github/scripts/trymerge.py\n \n \n Line 63\n in\n 932adf2\n \n \n \n \n\n \n \n files(last: 100) { \n \n \n \n\n Can this be relaxed? If not please import.", + "author": { + "login": "BowenBao" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 1048084569 + }, + { + "bodyText": "This PR cannot be merged by bot due to changing > 100 files. @malfet\nCan this be relaxed? If not please import.\n\nWow, you've hit a really interesting problem. 100 is a limitation enforced by GitHub, see https://docs.github.com/en/graphql/overview/resource-limitations, but I can implement a pagination. Do you mind keeping it like that for a bit, want to land a fix soonish.", + "author": { + "login": "malfet" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1048088691 + }, + { + "bodyText": "@malfet Thank you for info. Sure, I have separated the rest of stack from this one, we'll wait for the fix to try again.", + "author": { + "login": "BowenBao" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 1048090640 + }, + { + "bodyText": "@pytorchbot merge this", + "author": { + "login": "BowenBao" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 1050293881 + }, + { + "bodyText": "Hey @BowenBao.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' 
and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.", + "author": { + "login": "github-actions" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 1050295451 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOPniAWQ==", + "hasPreviousPage": true + } + } + } + } + } + }, + "query_sha=0a34acb829d8aca9dd28a8ba388dfa52f6ecdde7e903ace1caabdcfaba87de98 cursor=MTAw name=pytorch number=73099 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "files": { + "nodes": [ + { + "path": "test/onnx/expect/TestOperators.test_pixel_shuffle.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_pow.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_prelu.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_prod.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_prod_dtype.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_rand.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_randn.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_reduce_sum_negative_indices.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_reduced_mean.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_reduced_mean_dtype.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_reduced_mean_keepdim.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_reduced_prod.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_reduced_prod_dtype.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_reduced_prod_keepdim.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_reduced_sum.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_reduced_sum_dtype.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_reduced_sum_keepdim.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_reducemax.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_reducemin.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_remainder.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_repeat.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_round.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_rrelu.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_rsqrt.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_rsub.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_scatter_add.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_scatter_add_opset11.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_selu.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_shape_value_map.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_sign.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_sin.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_slice.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_slice_dynamic.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_softmaxcrossentropy.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d_none.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_softmaxcrossentropy_4d.expect" + }, + { + "path": 
"test/onnx/expect/TestOperators.test_softmaxcrossentropy_ignore_index.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_softmaxcrossentropy_weights.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_split.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_split_with_sizes.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_sqrt.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_std.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_sum.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_sum_dtype.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_tan.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_topk.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_topk_smallest_unsorted.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_transpose.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_type_as.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_unfold.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_unique.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_unsqueeze.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_upsample_nearest_scale.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_upsample_nearest_scale_default_scale_factor.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_upsample_nearest_size.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_view.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_view_flatten.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_zeros_like.expect" + }, + { + "path": "torch/csrc/jit/serialization/export.cpp" + }, + { + "path": "torch/csrc/jit/serialization/export.h" + } + ], + "pageInfo": { + "endCursor": "MTYy", + "hasNextPage": false + } + } + } + } + } + }, + "query_sha=f357fcdcc09e1433307da0c33debf9c08279ffd70d5bbc31323994f6346c629f name=pytorch number=74649 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "closed": true, + "isCrossRepository": false, + "author": { + "login": "malfet" + }, + "title": "This should fail flake8", + "body": "Test issue for GHF mandatory checks", + "headRefName": "malfet-patch-8", + "headRepository": { + "nameWithOwner": "pytorch/pytorch" + }, + "baseRefName": "master", + "baseRepository": { + "nameWithOwner": "pytorch/pytorch", + "isPrivate": false, + "defaultBranchRef": { + "name": "master" + } + }, + "mergeCommit": null, + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": { + "login": "malfet" + }, + "email": "nshulga@fb.com", + "name": "Nikita Shulga" + }, + "oid": "57c86ff1c5ab948888fd329986c9d55796680e33" + } + }, + { + "commit": { + "author": { + "user": { + "login": "malfet" + }, + "email": "nshulga@fb.com", + "name": "Nikita Shulga" + }, + "oid": "6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4" + } + } + ], + "pageInfo": { + "endCursor": "Mg", + "hasNextPage": false + }, + "totalCount": 2 + }, + "commits": { + "nodes": [ + { + "commit": { + "checkSuites": { + "nodes": [ + { + "app": { + "name": "Facebook GitHub Tools", + "databaseId": 12274 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "Facebook CLA Check", + "conclusion": "SUCCESS", + "detailsUrl": "https://code.intern.facebook.com/cla/" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVHsK3w=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": 
"https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018129" + }, + { + "app": { + "name": "Netlify", + "databaseId": 13473 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018131" + }, + { + "app": { + "name": "Azure Pipelines", + "databaseId": 9426 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018132" + }, + { + "app": { + "name": "Dependabot", + "databaseId": 29110 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018134" + }, + { + "app": { + "name": "Codecov", + "databaseId": 254 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018139" + }, + { + "app": { + "name": "PyTorch Bot", + "databaseId": 40112 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018142" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "clang-format", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669399915?check_suite_focus=true" + }, + { + "name": "clang-tidy", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669399990?check_suite_focus=true" + }, + { + "name": "cmakelint", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669400052?check_suite_focus=true" + }, + { + "name": "flake8-py3", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669400154?check_suite_focus=true" + }, + { + "name": "mypy", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669400239?check_suite_focus=true" + }, + { + "name": "Test collect_env (with_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669400327?check_suite_focus=true" + }, + { + "name": "Test collect_env (without_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669400361?check_suite_focus=true" + }, + { + "name": "Test tools", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669400470?check_suite_focus=true" + }, + { + "name": "py2-setup-validate-errormsg", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669400681?check_suite_focus=true" + }, + { + "name": "quick-checks", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/runs/5669400789?check_suite_focus=true" + }, + { + "name": "toc", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669400953?check_suite_focus=true" + }, + { + "name": "shellcheck", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669401126?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVHsMiY=", + "hasNextPage": false + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018384" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "TorchBench CI (pytorch-linux-py3.7-cu102)" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "run-torchbench", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669399917?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVHsLW0=", + "hasNextPage": false + } + }, + "conclusion": "SKIPPED", + "url": "https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018395" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pull" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669414276?check_suite_focus=true" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669414324?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669414430?check_suite_focus=true" + }, + { + "name": "linux-bionic-rocm4.5-py3.7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669414605?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669414697?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669414841?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3-clang5-mobile-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669414951?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669415003?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669415060?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669415120?check_suite_focus=true" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669415166?check_suite_focus=true" + }, + { + "name": 
"pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669415236?check_suite_focus=true" + }, + { + "name": "win-vs2019-cuda11.3-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669415288?check_suite_focus=true" + }, + { + "name": "win-vs2019-cpu-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669415348?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7-no-ops / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669415451?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669415561?check_suite_focus=true" + }, + { + "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669415607?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669415642?check_suite_focus=true" + }, + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669415706?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669415757?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669488974?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669489019?check_suite_focus=true" + }, + { + "name": "linux-docs / build-docs (cpp)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669492162?check_suite_focus=true" + }, + { + "name": "linux-docs / build-docs (python)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669492211?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669492293?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669492341?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669492396?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669492440?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669492497?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + 
"detailsUrl": "https://github.com/pytorch/pytorch/runs/5669492558?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669496296?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669496350?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (noarch, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669496393?check_suite_focus=true" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669498726?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669500818?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669500848?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669518721?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669518760?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669518798?check_suite_focus=true" + }, + { + "name": "linux-bionic-rocm4.5-py3.7 / test (default, 1, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669549301?check_suite_focus=true" + }, + { + "name": "linux-bionic-rocm4.5-py3.7 / test (default, 2, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669549318?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669559843?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669567414?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669567499?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669567553?check_suite_focus=true" + }, + { + "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669619773?check_suite_focus=true" + }, + { + "name": "win-vs2019-cpu-py3 / test (default, 2, 2, 
windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669619803?check_suite_focus=true" + }, + { + "name": "win-vs2019-cuda11.3-py3 / test (default, 1, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669724420?check_suite_focus=true" + }, + { + "name": "win-vs2019-cuda11.3-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669724451?check_suite_focus=true" + }, + { + "name": "win-vs2019-cuda11.3-py3 / test (force_on_cpu, 1, 1, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669724478?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVHxIT4=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018405" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVhlkGU=", + "hasNextPage": false + } + }, + "oid": "6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4" + } + } + ] + }, + "changedFiles": 1, + "files": { + "nodes": [ + { + "path": "torch/nn/cpp.py" + } + ], + "pageInfo": { + "endCursor": "MQ", + "hasNextPage": false + } + }, + "reviews": { + "nodes": [ + { + "author": { + "login": "seemethere" + }, + "state": "APPROVED" + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wMy0yM1QxNTo1MDo0NS0wNzowMLkyMDIyLTAzLTIzVDE1OjUwOjQ1LTA3OjAwzjbPEDg=", + "hasPreviousPage": false + } + }, + "comments": { + "nodes": [ + { + "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/74649\n\u21a9\ufe0f \u00a0[fb-only] Re-run with SSH instructions\nNeed help or want to give feedback on the CI? Visit our office hours\n\n\ud83d\udc8a CI failures summary and remediations\nAs of commit 6c3c3de (more details on the Dr. CI page):\n\n\n1/1 failures introduced in this PR\n\n\n1 failure not recognized by patterns:\n\n\n\nJob\nStep\nAction\n\n\n\n\n Lint / flake8-py3\nFail if there were any warnings\n\ud83d\udd01 rerun\n\n\n\n\nThis comment was automatically generated by Dr. CI (expand for details).\nPlease report bugs/suggestions to the (internal) Dr. 
CI Users group.\nClick here to manually regenerate this comment.", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": { + "login": "facebook-github-bot" + }, + "databaseId": 1076891218 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOQDAOUg==", + "hasPreviousPage": false + } + } + } + } + } + }, + "query_sha=a91ab398f97fb43cbe6e0899980dad8ff7447457ea5a71bbc59f7702a9280eb5 cursor=None name=metamates org=pytorch": { + "data": { + "organization": { + "team": { + "members": { + "nodes": [ + { + "login": "dreiss" + }, + { + "login": "kumpera" + }, + { + "login": "ezyang" + }, + { + "login": "stephenroller" + }, + { + "login": "swolchok" + }, + { + "login": "hyuen" + }, + { + "login": "orionr" + }, + { + "login": "dhruvbird" + }, + { + "login": "likethesky" + }, + { + "login": "lw" + }, + { + "login": "raziel" + }, + { + "login": "simpkins" + }, + { + "login": "ebyrne" + }, + { + "login": "Babar" + }, + { + "login": "kostmo" + }, + { + "login": "0x00b1" + }, + { + "login": "bhosmer" + }, + { + "login": "zdevito" + }, + { + "login": "bugra" + }, + { + "login": "caraya10" + }, + { + "login": "kit1980" + }, + { + "login": "shoumikhin" + }, + { + "login": "teytaud" + }, + { + "login": "xuzhao9" + }, + { + "login": "jansel" + }, + { + "login": "abhinavarora" + }, + { + "login": "b0noI" + }, + { + "login": "djthorne" + }, + { + "login": "nairbv" + }, + { + "login": "Mortimerp9" + }, + { + "login": "dadkins20" + }, + { + "login": "colesbury" + }, + { + "login": "laurencer" + }, + { + "login": "nickgg" + }, + { + "login": "yzhao30" + }, + { + "login": "bearzx" + }, + { + "login": "mattjgalloway" + }, + { + "login": "chenyang78" + }, + { + "login": "yns88" + }, + { + "login": "lc0" + }, + { + "login": "wenleix" + }, + { + "login": "jingsh" + }, + { + "login": "mthrok" + }, + { + "login": "drdarshan" + }, + { + "login": "tvalentius" + }, + { + "login": "d4l3k" + }, + { + "login": "jamiemccrindle" + }, + { + "login": "kazhang" + }, + { + "login": "simonhollis" + }, + { + "login": "lqiao" + }, + { + "login": "ajyu" + }, + { + "login": "govardhan" + }, + { + "login": "yinghai" + }, + { + "login": "zyan0" + }, + { + "login": "ajtulloch" + }, + { + "login": "pbelevich" + }, + { + "login": "VitalyFedyunin" + }, + { + "login": "dbish" + }, + { + "login": "NicolasHug" + }, + { + "login": "efaust" + }, + { + "login": "idning" + }, + { + "login": "soumith" + }, + { + "login": "nimin98" + }, + { + "login": "chaekit" + }, + { + "login": "radkris-git" + }, + { + "login": "javier-m" + }, + { + "login": "jmdetloff" + }, + { + "login": "mostafaelhoushi" + }, + { + "login": "brianjo" + }, + { + "login": "ShijunK" + }, + { + "login": "suo" + }, + { + "login": "vkuzo" + }, + { + "login": "seemethere" + }, + { + "login": "cpuhrsch" + }, + { + "login": "qihqi" + }, + { + "login": "jackm321" + }, + { + "login": "linbinyu" + }, + { + "login": "neerajprad" + }, + { + "login": "rsemenov" + }, + { + "login": "ziky90" + }, + { + "login": "gmagogsfm" + }, + { + "login": "zzzwen" + }, + { + "login": "ikriv" + }, + { + "login": "deeptigp" + }, + { + "login": "andrewor14" + }, + { + "login": "jianyuh" + }, + { + "login": "cykustcc" + }, + { + "login": "highker" + }, + { + "login": "navahgar" + }, + { + "login": "beauby" + }, + { + "login": "jeffreyksmithjr" + }, + { + "login": "suphoff" + }, + { + "login": "smessmer" + }, + { + "login": "ananthsub" + }, + { + "login": "d1jang" + }, + { + "login": "firstprayer" + }, + { + "login": "malfet" + }, + { + "login": "fegin" + }, + { + 
"login": "hanton" + }, + { + "login": "zanqi" + } + ], + "pageInfo": { + "hasNextPage": true, + "endCursor": "Y3Vyc29yOnYyOpHOACa60A==" + } + } + } + } + } + }, + "query_sha=a91ab398f97fb43cbe6e0899980dad8ff7447457ea5a71bbc59f7702a9280eb5 cursor=Y3Vyc29yOnYyOpHOACa60A== name=metamates org=pytorch": { + "data": { + "organization": { + "team": { + "members": { + "nodes": [ + { + "login": "bujar" + }, + { + "login": "supriyar" + }, + { + "login": "kausv" + }, + { + "login": "divchenko" + }, + { + "login": "dagitses" + }, + { + "login": "rahuln32" + }, + { + "login": "bilgeacun" + }, + { + "login": "caogao" + }, + { + "login": "blefaudeux" + }, + { + "login": "miguelmartin75" + }, + { + "login": "penguinwu" + }, + { + "login": "shz117" + }, + { + "login": "ajliu" + }, + { + "login": "saketh-are" + }, + { + "login": "jessebrizzi" + }, + { + "login": "msaroufim" + }, + { + "login": "mdundas" + }, + { + "login": "davides" + }, + { + "login": "alannnna" + }, + { + "login": "hlin09" + }, + { + "login": "terrychenism" + }, + { + "login": "xiaomengy" + }, + { + "login": "jisaacso" + }, + { + "login": "fkhan1337" + }, + { + "login": "xing-liu" + }, + { + "login": "alanadakotashine" + }, + { + "login": "desertfire" + }, + { + "login": "banitag1" + }, + { + "login": "letterx" + }, + { + "login": "gchanan" + }, + { + "login": "dbort" + }, + { + "login": "bilalsal" + }, + { + "login": "jaceyca" + }, + { + "login": "serhaty" + }, + { + "login": "yf225" + }, + { + "login": "yifuwang" + }, + { + "login": "piyushmh" + }, + { + "login": "z-a-f" + }, + { + "login": "superzgc" + }, + { + "login": "tenpercent" + }, + { + "login": "bertmaher" + }, + { + "login": "chauhang" + }, + { + "login": "jiayisuse" + }, + { + "login": "bradleyhd" + }, + { + "login": "ZolotukhinM" + }, + { + "login": "jamesr66a" + }, + { + "login": "mullachv" + }, + { + "login": "voznesenskym" + }, + { + "login": "charliechen0401" + }, + { + "login": "bwasti" + }, + { + "login": "cryptopic" + }, + { + "login": "chinannyang" + }, + { + "login": "NivekT" + }, + { + "login": "zhxchen17" + }, + { + "login": "jerryzh168" + }, + { + "login": "MohammadMahdiJavanmard" + }, + { + "login": "rajkar86" + }, + { + "login": "wconstab" + }, + { + "login": "Hangjun" + }, + { + "login": "davidberard98" + }, + { + "login": "Krovatkin" + }, + { + "login": "CamiWilliams" + }, + { + "login": "J0Nreynolds" + }, + { + "login": "datumbox" + }, + { + "login": "aartibasant" + }, + { + "login": "xta0" + }, + { + "login": "zou3519" + }, + { + "login": "xman1979" + }, + { + "login": "suraj813" + }, + { + "login": "gqchen" + }, + { + "login": "jayleverett" + }, + { + "login": "george-qi" + }, + { + "login": "abhikrish" + }, + { + "login": "zhangguanheng66" + }, + { + "login": "mikeiovine" + }, + { + "login": "Adolfo-Karim" + }, + { + "login": "Chillee" + }, + { + "login": "albanD" + }, + { + "login": "bigfootjon" + }, + { + "login": "robotal" + }, + { + "login": "MarcioPorto" + }, + { + "login": "srsuryadev" + }, + { + "login": "IvanKobzarev" + }, + { + "login": "eprivezentsev" + }, + { + "login": "kwen2501" + }, + { + "login": "linux-jedi" + }, + { + "login": "chandlerzuo" + }, + { + "login": "prateek1404" + }, + { + "login": "otsneh" + }, + { + "login": "husthyc" + }, + { + "login": "briancoutinho" + }, + { + "login": "fduwjj" + }, + { + "login": "frank-wei" + }, + { + "login": "esqu1" + }, + { + "login": "prabhat00155" + }, + { + "login": "Gamrix" + }, + { + "login": "QuentinDuval" + }, + { + "login": "atalman" + }, + { + "login": "xush6528" + }, + { + "login": 
"dracifer" + } + ], + "pageInfo": { + "hasNextPage": true, + "endCursor": "Y3Vyc29yOnYyOpHOAHSKuw==" + } + } + } + } + } + }, + "query_sha=a91ab398f97fb43cbe6e0899980dad8ff7447457ea5a71bbc59f7702a9280eb5 cursor=Y3Vyc29yOnYyOpHOAHSKuw== name=metamates org=pytorch": { + "data": { + "organization": { + "team": { + "members": { + "nodes": [ + { + "login": "SS-JIA" + }, + { + "login": "helunwencser" + }, + { + "login": "xw285cornell" + }, + { + "login": "hhbyyh" + }, + { + "login": "rohan-varma" + }, + { + "login": "teng-li" + }, + { + "login": "larryliu0820" + }, + { + "login": "lyoka" + }, + { + "login": "cbalioglu" + }, + { + "login": "hl475" + }, + { + "login": "hwangjeff" + }, + { + "login": "Jack-Khuu" + }, + { + "login": "alanwaketan" + }, + { + "login": "mehtanirav" + }, + { + "login": "nateanl" + }, + { + "login": "fuqianz" + }, + { + "login": "boyuantan" + }, + { + "login": "muntaqim" + }, + { + "login": "dennysem" + }, + { + "login": "ymao1993" + }, + { + "login": "fmassa" + }, + { + "login": "esantorella" + }, + { + "login": "HamidShojanazeri" + }, + { + "login": "jubinchheda" + }, + { + "login": "mehdimashayekhi" + }, + { + "login": "rkindi" + }, + { + "login": "wanchaol" + }, + { + "login": "zephirefaith" + }, + { + "login": "alexbeloi" + }, + { + "login": "kapilsh" + }, + { + "login": "plahera" + }, + { + "login": "SherlockNoMad" + }, + { + "login": "pritamdamania87" + }, + { + "login": "rahxephon89" + }, + { + "login": "iseeyuan" + }, + { + "login": "Matphyler" + }, + { + "login": "protonu" + }, + { + "login": "terhuhf" + }, + { + "login": "aruntonic" + }, + { + "login": "gcatron" + }, + { + "login": "yingrliu" + }, + { + "login": "alexanderguzhva" + }, + { + "login": "angelayi" + }, + { + "login": "zhaoalex" + }, + { + "login": "shahofblah" + }, + { + "login": "vivekmig" + }, + { + "login": "jspisak" + }, + { + "login": "akshaypandian" + }, + { + "login": "HarutMov" + }, + { + "login": "tktrungna" + }, + { + "login": "eellison" + }, + { + "login": "ziab" + }, + { + "login": "NarineK" + }, + { + "login": "andrewconnors" + }, + { + "login": "wenwei202" + }, + { + "login": "jg2912" + }, + { + "login": "jwpark1985" + }, + { + "login": "robieta" + }, + { + "login": "amirhmk" + }, + { + "login": "davidxili" + }, + { + "login": "mreso" + }, + { + "login": "soulitzer" + }, + { + "login": "prigoyal" + }, + { + "login": "PaliC" + }, + { + "login": "anijain2305" + }, + { + "login": "pvtuan10" + }, + { + "login": "huangyi1979" + }, + { + "login": "osalpekar" + }, + { + "login": "xiaohui-zhang" + }, + { + "login": "jerry39213gh" + }, + { + "login": "jarodhou" + }, + { + "login": "hlu1" + }, + { + "login": "huiguoo" + }, + { + "login": "H-Huang" + }, + { + "login": "vtsyvina" + }, + { + "login": "qchip" + }, + { + "login": "Nitrokitty" + }, + { + "login": "satgera" + }, + { + "login": "ngimel" + }, + { + "login": "dongreenberg" + }, + { + "login": "sijiac" + }, + { + "login": "markkm" + }, + { + "login": "EscapeZero" + }, + { + "login": "bdhirsh" + }, + { + "login": "cccclai" + }, + { + "login": "carolineechen" + }, + { + "login": "tugsbayasgalan" + }, + { + "login": "agunapal" + }, + { + "login": "frankseide" + }, + { + "login": "YazhiGao" + }, + { + "login": "pavithranrao" + }, + { + "login": "VirgileHlav" + }, + { + "login": "mrshenli" + }, + { + "login": "lena-kashtelyan" + }, + { + "login": "brad-mengchi" + }, + { + "login": "kimishpatel" + }, + { + "login": "aaronenyeshi" + }, + { + "login": "shajrawi" + }, + { + "login": "samdow" + }, + { + "login": "dzhulgakov" + } + ], + "pageInfo": 
{ + "hasNextPage": true, + "endCursor": "Y3Vyc29yOnYyOpHOARD9PA==" + } + } + } + } + } + }, + "query_sha=a91ab398f97fb43cbe6e0899980dad8ff7447457ea5a71bbc59f7702a9280eb5 cursor=Y3Vyc29yOnYyOpHOARD9PA== name=metamates org=pytorch": { + "data": { + "organization": { + "team": { + "members": { + "nodes": [ + { + "login": "great-way" + }, + { + "login": "ashkan-software" + }, + { + "login": "garroud" + }, + { + "login": "knottb" + }, + { + "login": "jbitton" + }, + { + "login": "jdsgomes" + }, + { + "login": "zhangxy988" + }, + { + "login": "samlurye" + }, + { + "login": "EdwardTyantov" + }, + { + "login": "anjali411" + }, + { + "login": "kryanchun" + }, + { + "login": "842974287" + }, + { + "login": "JacobSzwejbka" + }, + { + "login": "nishantpdce" + }, + { + "login": "srinivas212" + }, + { + "login": "cherie11" + }, + { + "login": "shreyanb98" + }, + { + "login": "kavoor" + }, + { + "login": "dzdang" + }, + { + "login": "naveedgol" + }, + { + "login": "Nayef211" + }, + { + "login": "zrphercule" + }, + { + "login": "HengruiX" + }, + { + "login": "langong347" + }, + { + "login": "soapisnotfat" + }, + { + "login": "ebsmothers" + }, + { + "login": "anshuljain1" + }, + { + "login": "b-koopman" + }, + { + "login": "salilsdesai" + }, + { + "login": "vmoens" + }, + { + "login": "printfoo" + }, + { + "login": "xinyang0" + }, + { + "login": "ramvenkat98" + }, + { + "login": "fbbradheintz" + }, + { + "login": "kauterry" + }, + { + "login": "VenkatSubramaniam" + }, + { + "login": "yxia11" + }, + { + "login": "anirbanraywork" + }, + { + "login": "houseroad" + }, + { + "login": "erichan1" + }, + { + "login": "hsrussell" + }, + { + "login": "ilia-cher" + }, + { + "login": "ajitmaths" + }, + { + "login": "awgu" + }, + { + "login": "wz337" + }, + { + "login": "LynneD" + }, + { + "login": "qxy11" + }, + { + "login": "janeyx99" + }, + { + "login": "msedwar" + }, + { + "login": "dustinh1999" + }, + { + "login": "glaringlee" + }, + { + "login": "anj-s" + }, + { + "login": "liuchen9494" + }, + { + "login": "drisspg" + }, + { + "login": "RdoubleA" + }, + { + "login": "jramseyer" + }, + { + "login": "zengk95" + }, + { + "login": "gtarjun" + }, + { + "login": "mikaylagawarecki" + }, + { + "login": "xianxl" + }, + { + "login": "lucasgadams" + }, + { + "login": "mingzhe09088" + }, + { + "login": "Vucibatina" + }, + { + "login": "aazzolini" + }, + { + "login": "nataliakliushkina" + }, + { + "login": "mruberry" + }, + { + "login": "HDCharles" + }, + { + "login": "mcr229" + }, + { + "login": "manuelcandales" + }, + { + "login": "guangy10" + }, + { + "login": "mengwa41" + }, + { + "login": "hx89" + }, + { + "login": "kiukchung" + }, + { + "login": "hanhsienhuang" + }, + { + "login": "clee2000" + }, + { + "login": "lhuang04" + }, + { + "login": "sidneyfletcher" + }, + { + "login": "gottbrath" + }, + { + "login": "lessw2020" + }, + { + "login": "choward232" + }, + { + "login": "mmh683" + }, + { + "login": "dwarakrajagopal" + }, + { + "login": "lazysjb" + }, + { + "login": "zhaojuanmao" + }, + { + "login": "johncalab" + }, + { + "login": "dhthompson" + }, + { + "login": "superwizard2019" + }, + { + "login": "fbhuba" + }, + { + "login": "shunting314" + }, + { + "login": "edward-io" + }, + { + "login": "sean-ngo" + }, + { + "login": "bzinodev" + }, + { + "login": "xcheng16" + }, + { + "login": "adamomainz" + }, + { + "login": "sluks" + }, + { + "login": "poojahp" + }, + { + "login": "ansley" + }, + { + "login": "mvsampath" + }, + { + "login": "cheetah2216" + }, + { + "login": "pinaki-mukerji" + } + ], + "pageInfo": { + 
"hasNextPage": true, + "endCursor": "Y3Vyc29yOnYyOpHOA7KsGw==" + } + } + } + } + } + }, + "query_sha=a91ab398f97fb43cbe6e0899980dad8ff7447457ea5a71bbc59f7702a9280eb5 cursor=Y3Vyc29yOnYyOpHOA7KsGw== name=metamates org=pytorch": { + "data": { + "organization": { + "team": { + "members": { + "nodes": [ + { + "login": "hongxiayang" + }, + { + "login": "kyulee-com" + }, + { + "login": "sstsai-adl" + }, + { + "login": "dahsh" + }, + { + "login": "ohgnoes" + }, + { + "login": "szewaiyuen7" + }, + { + "login": "byterover" + }, + { + "login": "ejguan" + }, + { + "login": "nimaelyasi" + }, + { + "login": "nikithamalgifb" + }, + { + "login": "qxu-fb" + }, + { + "login": "sshawnwu" + }, + { + "login": "andrewyounkins" + }, + { + "login": "njuvekar" + }, + { + "login": "iramazanli" + }, + { + "login": "jnkwok1" + }, + { + "login": "kurman" + }, + { + "login": "jbschlosser" + }, + { + "login": "ccongge" + }, + { + "login": "haichuan-fb" + }, + { + "login": "wwang84" + }, + { + "login": "JustinPinero" + }, + { + "login": "gcramer23" + }, + { + "login": "woo-kim" + }, + { + "login": "yuguo68" + }, + { + "login": "chowarfb" + }, + { + "login": "priyaramani" + }, + { + "login": "yidawang-oss" + }, + { + "login": "beback4u" + }, + { + "login": "asalioufb" + }, + { + "login": "four4fish" + }, + { + "login": "kkosik20" + }, + { + "login": "KZFB" + }, + { + "login": "sisilmehta2000" + }, + { + "login": "henryliu-bluehills" + }, + { + "login": "madhu-fb" + }, + { + "login": "muchulee8" + }, + { + "login": "anirbanr-fb-r2p" + } + ], + "pageInfo": { + "hasNextPage": false, + "endCursor": "Y3Vyc29yOnYyOpHOBkbBhA==" + } + } + } + } + } + }, + "query_sha=0a34acb829d8aca9dd28a8ba388dfa52f6ecdde7e903ace1caabdcfaba87de98 cursor=MTAw name=pytorch number=76118 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "files": { + "nodes": [ + { + "path": "docs/source/quantization.rst" + }, + { + "path": "docs/source/scripts/build_quantization_configs.py" + }, + { + "path": "test/allowlist_for_publicAPI.json" + }, + { + "path": "test/cpp/jit/source_range_test.cpp" + }, + { + "path": "test/cpp/jit/test_backend.cpp" + }, + { + "path": "test/cpp/jit/test_flatbuffer.cpp" + }, + { + "path": "test/cpp/jit/test_misc.cpp" + }, + { + "path": "test/cpp/jit/test_utils.h" + }, + { + "path": "test/cpp/jit/upgrader_models/test_versioned_div_scalar_float_v2.ptl.ff" + }, + { + "path": "test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_float_v2.ptl.ff" + }, + { + "path": "test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_int_v2.ptl.ff" + }, + { + "path": "test/cpp/jit/upgrader_models/test_versioned_div_scalar_int_v2.ptl.ff" + }, + { + "path": "test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_float_v2.ptl.ff" + }, + { + "path": "test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_int_v2.ptl.ff" + }, + { + "path": "test/cpp/jit/upgrader_models/test_versioned_div_scalar_scalar_v2.ptl.ff" + }, + { + "path": "test/cpp/jit/upgrader_models/test_versioned_div_tensor_inplace_v2.ptl.ff" + }, + { + "path": "test/cpp/jit/upgrader_models/test_versioned_div_tensor_out_v2.ptl.ff" + }, + { + "path": "test/cpp/jit/upgrader_models/test_versioned_div_tensor_v2.ptl.ff" + }, + { + "path": "test/cpp/profiler/record_function.cpp" + }, + { + "path": "test/distributed/_shard/sharded_tensor/test_sharded_tensor.py" + }, + { + "path": "test/distributed/_shard/test_replicated_tensor.py" + }, + { + "path": "test/distributed/fsdp/test_fsdp_comm.py" + }, + { + "path": 
"test/distributed/fsdp/test_fsdp_optim_state.py" + }, + { + "path": "test/distributed/optim/test_zero_redundancy_optimizer.py" + }, + { + "path": "test/jit/test_export_modes.py" + }, + { + "path": "test/jit/test_if_hoisting.py" + }, + { + "path": "test/jit/test_tracer.py" + }, + { + "path": "test/jit/test_upgraders.py" + }, + { + "path": "test/mobile/test_lite_script_type.py" + }, + { + "path": "test/onnx/expect/TestOperators.test_layer_norm_aten.expect" + }, + { + "path": "test/onnx/test_operators.py" + }, + { + "path": "test/onnx/test_pytorch_onnx_onnxruntime.py" + }, + { + "path": "test/quantization/ao_migration/test_quantization_fx.py" + }, + { + "path": "test/quantization/core/test_quantized_op.py" + }, + { + "path": "test/quantization/core/test_quantized_tensor.py" + }, + { + "path": "test/quantization/fx/test_numeric_suite_fx.py" + }, + { + "path": "test/quantization/fx/test_quantize_fx.py" + }, + { + "path": "test/test_autograd.py" + }, + { + "path": "test/test_binary_ufuncs.py" + }, + { + "path": "test/test_expanded_weights.py" + }, + { + "path": "test/test_functionalization.py" + }, + { + "path": "test/test_fx_experimental.py" + }, + { + "path": "test/test_jit.py" + }, + { + "path": "test/test_jit_cuda_fuser.py" + }, + { + "path": "test/test_linalg.py" + }, + { + "path": "test/test_nestedtensor.py" + }, + { + "path": "test/test_nn.py" + }, + { + "path": "test/test_ops.py" + }, + { + "path": "test/test_ops_gradients.py" + }, + { + "path": "test/test_ops_jit.py" + }, + { + "path": "test/test_optim.py" + }, + { + "path": "test/test_overrides.py" + }, + { + "path": "test/test_profiler.py" + }, + { + "path": "test/test_public_bindings.py" + }, + { + "path": "test/test_pytree.py" + }, + { + "path": "test/test_reductions.py" + }, + { + "path": "test/test_sort_and_select.py" + }, + { + "path": "test/test_sparse.py" + }, + { + "path": "test/test_sparse_csr.py" + }, + { + "path": "test/test_spectral_ops.py" + }, + { + "path": "test/test_tensor_creation_ops.py" + }, + { + "path": "test/test_tensorboard.py" + }, + { + "path": "test/test_testing.py" + }, + { + "path": "test/test_torch.py" + }, + { + "path": "test/test_unary_ufuncs.py" + }, + { + "path": "third_party/BUCK.github" + }, + { + "path": "third_party/fbgemm" + }, + { + "path": "tools/autograd/derivatives.yaml" + }, + { + "path": "tools/autograd/gen_inplace_or_view_type.py" + }, + { + "path": "tools/autograd/load_derivatives.py" + }, + { + "path": "tools/build_variables.bzl" + }, + { + "path": "tools/codegen/api/autograd.py" + }, + { + "path": "tools/codegen/api/cpp.py" + }, + { + "path": "tools/codegen/api/dispatcher.py" + }, + { + "path": "tools/codegen/api/functionalization.py" + }, + { + "path": "tools/codegen/api/lazy.py" + }, + { + "path": "tools/codegen/api/meta.py" + }, + { + "path": "tools/codegen/api/native.py" + }, + { + "path": "tools/codegen/api/python.py" + }, + { + "path": "tools/codegen/api/structured.py" + }, + { + "path": "tools/codegen/api/translate.py" + }, + { + "path": "tools/codegen/api/types.py" + }, + { + "path": "tools/codegen/api/ufunc.py" + }, + { + "path": "tools/codegen/api/unboxing.py" + }, + { + "path": "tools/codegen/code_template.py" + }, + { + "path": "tools/codegen/context.py" + }, + { + "path": "tools/codegen/decompositions/gen_jit_decompositions.py" + }, + { + "path": "tools/codegen/dest/__init__.py" + }, + { + "path": "tools/codegen/dest/lazy_ir.py" + }, + { + "path": "tools/codegen/dest/lazy_ts_lowering.py" + }, + { + "path": "tools/codegen/dest/native_functions.py" + }, + { + "path": 
"tools/codegen/dest/register_dispatch_key.py" + }, + { + "path": "tools/codegen/dest/ufunc.py" + }, + { + "path": "tools/codegen/gen.py" + }, + { + "path": "tools/codegen/gen_backend_stubs.py" + }, + { + "path": "tools/codegen/gen_functionalization_type.py" + }, + { + "path": "tools/codegen/gen_lazy_tensor.py" + }, + { + "path": "tools/codegen/local.py" + }, + { + "path": "tools/codegen/model.py" + }, + { + "path": "tools/codegen/operator_versions/gen_mobile_upgraders.py" + } + ], + "pageInfo": { + "endCursor": "MjAw", + "hasNextPage": true + } + } + } + } + } + }, + "query_sha=0a34acb829d8aca9dd28a8ba388dfa52f6ecdde7e903ace1caabdcfaba87de98 cursor=MjAw name=pytorch number=76118 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "files": { + "nodes": [ + { + "path": "tools/codegen/selective_build/operator.py" + }, + { + "path": "tools/codegen/selective_build/selector.py" + }, + { + "path": "tools/codegen/shape_functions/gen_jit_shape_functions.py" + }, + { + "path": "tools/codegen/static_runtime/config.py" + }, + { + "path": "tools/codegen/static_runtime/gen_static_runtime_ops.py" + }, + { + "path": "tools/codegen/static_runtime/gen_structured.py" + }, + { + "path": "tools/codegen/utils.py" + }, + { + "path": "tools/linter/adapters/circleci_linter.py" + }, + { + "path": "tools/linter/adapters/clangformat_linter.py" + }, + { + "path": "tools/linter/adapters/grep_linter.py" + }, + { + "path": "tools/linter/adapters/nativefunctions_linter.py" + }, + { + "path": "tools/setup_helpers/BUILD.bazel" + }, + { + "path": "tools/setup_helpers/generate_code.py" + }, + { + "path": "torch/_C/__init__.pyi.in" + }, + { + "path": "torch/amp/autocast_mode.py" + }, + { + "path": "torch/ao/ns/fx/pattern_utils.py" + }, + { + "path": "torch/ao/quantization/backend_config/README.md" + }, + { + "path": "torch/ao/quantization/backend_config/__init__.py" + }, + { + "path": "torch/ao/quantization/backend_config/native.py" + }, + { + "path": "torch/ao/quantization/backend_config/observation_type.py" + }, + { + "path": "torch/ao/quantization/backend_config/tensorrt.py" + }, + { + "path": "torch/ao/quantization/backend_config/utils.py" + }, + { + "path": "torch/ao/quantization/fx/__init__.py" + }, + { + "path": "torch/ao/quantization/fx/backend_config/fuse_handler.py" + }, + { + "path": "torch/ao/quantization/fx/backend_config/quantize_handler.py" + }, + { + "path": "torch/ao/quantization/fx/backend_config_utils.py" + }, + { + "path": "torch/ao/quantization/fx/convert.py" + }, + { + "path": "torch/ao/quantization/fx/fuse.py" + }, + { + "path": "torch/ao/quantization/fx/fusion_patterns.py" + }, + { + "path": "torch/ao/quantization/fx/match_utils.py" + }, + { + "path": "torch/ao/quantization/fx/pattern_utils.py" + }, + { + "path": "torch/ao/quantization/fx/prepare.py" + }, + { + "path": "torch/ao/quantization/fx/quantization_patterns.py" + }, + { + "path": "torch/ao/quantization/qconfig.py" + }, + { + "path": "torch/ao/quantization/quantization_types.py" + }, + { + "path": "torch/ao/quantization/quantize_fx.py" + }, + { + "path": "torch/autograd/__init__.py" + }, + { + "path": "torch/csrc/Module.cpp" + }, + { + "path": "torch/csrc/autograd/FunctionsManual.cpp" + }, + { + "path": "torch/csrc/autograd/FunctionsManual.h" + }, + { + "path": "torch/csrc/autograd/engine.cpp" + }, + { + "path": "torch/csrc/autograd/function.h" + }, + { + "path": "torch/csrc/autograd/functions/accumulate_grad.h" + }, + { + "path": "torch/csrc/autograd/init.cpp" + }, + { + "path": 
"torch/csrc/autograd/python_torch_functions_manual.cpp" + }, + { + "path": "torch/csrc/autograd/python_variable.cpp" + }, + { + "path": "torch/csrc/autograd/record_function_ops.h" + }, + { + "path": "torch/csrc/autograd/utils/grad_layout_contract.h" + }, + { + "path": "torch/csrc/deploy/CMakeLists.txt" + }, + { + "path": "torch/csrc/distributed/c10d/logger.cpp" + }, + { + "path": "torch/csrc/jit/codegen/cuda/graph_fuser.cpp" + }, + { + "path": "torch/csrc/jit/codegen/cuda/parser.cpp" + }, + { + "path": "torch/csrc/jit/frontend/function_schema_parser.cpp" + }, + { + "path": "torch/csrc/jit/frontend/lexer.h" + }, + { + "path": "torch/csrc/jit/frontend/parser.cpp" + }, + { + "path": "torch/csrc/jit/frontend/parser.h" + }, + { + "path": "torch/csrc/jit/frontend/script_type_parser.cpp" + }, + { + "path": "torch/csrc/jit/frontend/source_range.cpp" + }, + { + "path": "torch/csrc/jit/frontend/source_range.h" + }, + { + "path": "torch/csrc/jit/frontend/source_ref.h" + }, + { + "path": "torch/csrc/jit/frontend/tracer.cpp" + }, + { + "path": "torch/csrc/jit/frontend/tracer.h" + }, + { + "path": "torch/csrc/jit/mobile/debug_info.cpp" + }, + { + "path": "torch/csrc/jit/mobile/debug_info.h" + }, + { + "path": "torch/csrc/jit/mobile/flatbuffer_loader.cpp" + }, + { + "path": "torch/csrc/jit/mobile/module.h" + }, + { + "path": "torch/csrc/jit/passes/common_expression_hoisting.cpp" + }, + { + "path": "torch/csrc/jit/passes/common_expression_hoisting.h" + }, + { + "path": "torch/csrc/jit/passes/frozen_graph_optimizations.cpp" + }, + { + "path": "torch/csrc/jit/passes/onnx/pattern_conversion/common.cpp" + }, + { + "path": "torch/csrc/jit/passes/onnx/scalar_type_analysis.cpp" + }, + { + "path": "torch/csrc/jit/python/init.cpp" + }, + { + "path": "torch/csrc/jit/python/python_tree_views.cpp" + }, + { + "path": "torch/csrc/jit/python/script_init.cpp" + }, + { + "path": "torch/csrc/jit/runtime/graph_executor.cpp" + }, + { + "path": "torch/csrc/jit/runtime/interpreter.cpp" + }, + { + "path": "torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp" + }, + { + "path": "torch/csrc/jit/runtime/script_profile.cpp" + }, + { + "path": "torch/csrc/jit/runtime/serialized_shape_function_registry.cpp" + }, + { + "path": "torch/csrc/jit/runtime/serialized_shape_function_registry.h" + }, + { + "path": "torch/csrc/jit/runtime/shape_function_registry.h" + }, + { + "path": "torch/csrc/jit/runtime/shape_functions.h" + }, + { + "path": "torch/csrc/jit/runtime/shape_functions_1.h" + }, + { + "path": "torch/csrc/jit/runtime/static/impl.cpp" + }, + { + "path": "torch/csrc/jit/runtime/static/passes.cpp" + }, + { + "path": "torch/csrc/jit/runtime/symbolic_shape_registry.cpp" + }, + { + "path": "torch/csrc/jit/runtime/symbolic_shape_registry.h" + }, + { + "path": "torch/csrc/jit/serialization/export_module.cpp" + }, + { + "path": "torch/csrc/jit/serialization/flatbuffer_serializer.cpp" + }, + { + "path": "torch/csrc/jit/serialization/import.cpp" + }, + { + "path": "torch/csrc/jit/serialization/import_export_helpers.cpp" + }, + { + "path": "torch/csrc/jit/serialization/import_export_helpers.h" + }, + { + "path": "torch/csrc/jit/serialization/import_source.cpp" + }, + { + "path": "torch/csrc/jit/serialization/import_source.h" + }, + { + "path": "torch/csrc/jit/serialization/source_range_serialization.cpp" + }, + { + "path": "torch/csrc/jit/serialization/source_range_serialization.h" + }, + { + "path": "torch/csrc/jit/testing/file_check.cpp" + }, + { + "path": "torch/csrc/lazy/core/dynamic_ir.cpp" + }, + { + "path": 
"torch/csrc/lazy/core/dynamic_ir.h" + }, + { + "path": "torch/csrc/lazy/ts_backend/ts_eager_fallback.cpp" + } + ], + "pageInfo": { + "endCursor": "MzAw", + "hasNextPage": true + } + } + } + } + } + }, + "query_sha=0a34acb829d8aca9dd28a8ba388dfa52f6ecdde7e903ace1caabdcfaba87de98 cursor=MzAw name=pytorch number=76118 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "files": { + "nodes": [ + { + "path": "torch/csrc/lazy/ts_backend/ts_native_functions.cpp" + }, + { + "path": "torch/csrc/utils/python_arg_parser.cpp" + }, + { + "path": "torch/csrc/utils/python_arg_parser.h" + }, + { + "path": "torch/csrc/utils/tensor_list.cpp" + }, + { + "path": "torch/csrc/utils/tensor_new.cpp" + }, + { + "path": "torch/csrc/utils/tensor_new.h" + }, + { + "path": "torch/distributed/_shard/__init__.py" + }, + { + "path": "torch/distributed/_shard/api.py" + }, + { + "path": "torch/distributed/_shard/replicated_tensor.py" + }, + { + "path": "torch/distributed/_shard/sharded_tensor/__init__.py" + }, + { + "path": "torch/distributed/_shard/sharded_tensor/api.py" + }, + { + "path": "torch/distributed/_shard/sharded_tensor/utils.py" + }, + { + "path": "torch/distributed/algorithms/ddp_comm_hooks/debugging_hooks.py" + }, + { + "path": "torch/distributed/algorithms/model_averaging/utils.py" + }, + { + "path": "torch/distributed/fsdp/_optim_utils.py" + }, + { + "path": "torch/distributed/fsdp/fully_sharded_data_parallel.py" + }, + { + "path": "torch/distributed/nn/__init__.py" + }, + { + "path": "torch/distributed/nn/functional.py" + }, + { + "path": "torch/distributed/optim/functional_adagrad.py" + }, + { + "path": "torch/fx/experimental/meta_tracer.py" + }, + { + "path": "torch/fx/graph.py" + }, + { + "path": "torch/jit/_shape_functions.py" + }, + { + "path": "torch/nn/parallel/_replicated_tensor_ddp_interop.py" + }, + { + "path": "torch/nn/parallel/_replicated_tensor_ddp_utils.py" + }, + { + "path": "torch/nn/parallel/distributed.py" + }, + { + "path": "torch/nn/utils/_expanded_weights/__init__.py" + }, + { + "path": "torch/nn/utils/_expanded_weights/instance_norm_expanded_weights.py" + }, + { + "path": "torch/onnx/symbolic_opset11.py" + }, + { + "path": "torch/onnx/symbolic_opset12.py" + }, + { + "path": "torch/onnx/symbolic_opset9.py" + }, + { + "path": "torch/optim/adagrad.py" + }, + { + "path": "torch/optim/lr_scheduler.py" + }, + { + "path": "torch/overrides.py" + }, + { + "path": "torch/quantization/fx/pattern_utils.py" + }, + { + "path": "torch/quantization/fx/quantization_patterns.py" + }, + { + "path": "torch/quantization/fx/quantization_types.py" + }, + { + "path": "torch/return_types.py" + }, + { + "path": "torch/testing/_internal/common_device_type.py" + }, + { + "path": "torch/testing/_internal/common_distributed.py" + }, + { + "path": "torch/testing/_internal/common_fx2trt.py" + }, + { + "path": "torch/testing/_internal/common_methods_invocations.py" + }, + { + "path": "torch/testing/_internal/common_utils.py" + }, + { + "path": "torch/testing/_internal/composite_compliance.py" + }, + { + "path": "torch/testing/_internal/distributed/distributed_test.py" + }, + { + "path": "torch/testing/_internal/jit_metaprogramming_utils.py" + }, + { + "path": "torch/utils/cpp_extension.py" + }, + { + "path": "torch/utils/data/datapipes/_typing.py" + }, + { + "path": "torch/utils/model_dump/__init__.py" + } + ], + "pageInfo": { + "endCursor": "MzQ4", + "hasNextPage": false + } + } + } + } + } + }, + "query_sha=a91ab398f97fb43cbe6e0899980dad8ff7447457ea5a71bbc59f7702a9280eb5 cursor=None 
name=pytorch-dev-infra org=pytorch": { + "data": { + "organization": { + "team": { + "members": { + "nodes": [ + { + "login": "kit1980" + }, + { + "login": "b0noI" + }, + { + "login": "seemethere" + }, + { + "login": "malfet" + }, + { + "login": "tenpercent" + }, + { + "login": "atalman" + }, + { + "login": "osalpekar" + }, + { + "login": "janeyx99" + }, + { + "login": "clee2000" + } + ], + "pageInfo": { + "hasNextPage": false, + "endCursor": "Y3Vyc29yOnYyOpHOAqnOlw==" + } + } + } + } + } + }, + "query_sha=a91ab398f97fb43cbe6e0899980dad8ff7447457ea5a71bbc59f7702a9280eb5 cursor=None name=qwertyuiop org=pytorch": { + "data": { + "organization": { + "team": null + } + } + }, + "query_sha=5ffb180a8ade981d13f6f672ed610279046db440b571254f884c52419c54dd79 cursor=Y3Vyc29yOnYyOpHPAAAAAVFCcQU= name=pytorch number=73811 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "commits": { + "nodes": [ + { + "commit": { + "oid": "9d26f4e6d8c8df275ea546180fef42548257d2d7", + "checkSuites": { + "nodes": [ + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276102" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-bionic-py3.7-clang9" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276103" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-clang7-onnx" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276104" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc7" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815361?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545915218?check_suite_focus=true" + }, + { + "name": "test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545915270?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545915344?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqP89A=", + "hasNextPage": false + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276105" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": 
"linux-xenial-py3-clang5-mobile-custom-build-static" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276106" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815353?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObTk=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276107" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-docs" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276110" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "win-vs2019-cuda11.3-py3" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276111" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-cuda11.3-py3.7-gcc7" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815317?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5546189850?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5546189908?check_suite_focus=true" + }, + { + "name": "test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5546189954?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqUJII=", + "hasNextPage": false + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276112" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pytorch-xla-linux-bionic-py3.7-clang8" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276114" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcRI=", + "hasNextPage": true + } + } + } + } + ] + } + } + } + } + }, + 
"query_sha=5ffb180a8ade981d13f6f672ed610279046db440b571254f884c52419c54dd79 cursor=Y3Vyc29yOnYyOpHPAAAAAVFCcRI= name=pytorch number=73811 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "commits": { + "nodes": [ + { + "commit": { + "oid": "9d26f4e6d8c8df275ea546180fef42548257d2d7", + "checkSuites": { + "nodes": [ + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc5.4" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276115" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-vulkan-bionic-py3.7-clang9" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276117" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-bionic-py3.7-clang9" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815309?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545918134?check_suite_focus=true" + }, + { + "name": "test (noarch, 1, 1, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545918256?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545918319?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqP_28=", + "hasNextPage": false + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276119" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc7-no-ops" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276122" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-clang7-onnx" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815351?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545931419?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545931552?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqQMyA=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": 
"https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276123" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-clang7-asan" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815311?check_suite_focus=true" + }, + { + "name": "test (default, 3, 3, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545947543?check_suite_focus=true" + }, + { + "name": "test (default, 1, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545947625?check_suite_focus=true" + }, + { + "name": "test (default, 2, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545947792?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqQcpA=", + "hasNextPage": false + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276124" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "cmakelint", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815342?check_suite_focus=true" + }, + { + "name": "clang-format", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815564?check_suite_focus=true" + }, + { + "name": "clang-tidy", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815688?check_suite_focus=true" + }, + { + "name": "flake8-py3", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815821?check_suite_focus=true" + }, + { + "name": "quick-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545816003?check_suite_focus=true" + }, + { + "name": "mypy", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545816076?check_suite_focus=true" + }, + { + "name": "py2-setup-validate-errormsg", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545816154?check_suite_focus=true" + }, + { + "name": "shellcheck", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545816266?check_suite_focus=true" + }, + { + "name": "toc", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545816398?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcU4=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276126" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "TorchBench CI (pytorch-linux-py3.7-cu102)" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "run-torchbench", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815207?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObKc=", + "hasNextPage": false + } + }, + "conclusion": "SKIPPED", + "url": 
"https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276127" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-cuda11.3-py3.7-gcc7" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276129" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276130" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcSI=", + "hasNextPage": true + } + } + } + } + ] + } + } + } + } + }, + "query_sha=5ffb180a8ade981d13f6f672ed610279046db440b571254f884c52419c54dd79 cursor=Y3Vyc29yOnYyOpHPAAAAAVFCcSI= name=pytorch number=73811 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "commits": { + "nodes": [ + { + "commit": { + "oid": "9d26f4e6d8c8df275ea546180fef42548257d2d7", + "checkSuites": { + "nodes": [ + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pytorch-xla-linux-bionic-py3.7-clang8" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815348?check_suite_focus=true" + }, + { + "name": "test (xla, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545954339?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqQjCM=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276131" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "win-vs2019-cuda11.3-py3" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815322?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5546226404?check_suite_focus=true" + }, + { + "name": "test (force_on_cpu, 1, 1, windows.4xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5546226489?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5546226540?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqUs2w=", + "hasNextPage": false + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276132" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": 
"linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815307?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObQs=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276133" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815362?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObUI=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276134" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815337?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObSk=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276135" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-vulkan-bionic-py3.7-clang9" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815561?check_suite_focus=true" + }, + { + "name": "test (default, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545929390?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqQKq4=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276136" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-docs" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815356?check_suite_focus=true" + }, + { + "name": "build-docs (cpp)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545920544?check_suite_focus=true" + }, + { + "name": "build-docs (python)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545920612?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqQCGQ=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276137" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": 
"linux-bionic-rocm4.5-py3.7" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815326?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545983951?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.rocm.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545984049?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqRADE=", + "hasNextPage": false + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276140" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3-clang5-mobile-build" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815205?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObKU=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276141" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "win-vs2019-cpu-py3" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815314?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5546093287?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, windows.4xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5546093438?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqSq34=", + "hasNextPage": false + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276143" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcS8=", + "hasNextPage": true + } + } + } + } + ] + } + } + } + } + }, + "query_sha=5ffb180a8ade981d13f6f672ed610279046db440b571254f884c52419c54dd79 cursor=Y3Vyc29yOnYyOpHPAAAAAVFCcS8= name=pytorch number=73811 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "commits": { + "nodes": [ + { + "commit": { + "oid": "9d26f4e6d8c8df275ea546180fef42548257d2d7", + "checkSuites": { + "nodes": [ + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc5.4" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815359?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545923802?check_suite_focus=true" + }, + { + "name": "test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545923899?check_suite_focus=true" + }, + { + "name": "test 
(backwards_compat, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545924024?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545924110?check_suite_focus=true" + }, + { + "name": "test (jit_legacy, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545924249?check_suite_focus=true" + }, + { + "name": "test (docs_test, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545924341?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqQFvU=", + "hasNextPage": false + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276145" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc7" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276149" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-bionic-rocm4.5-py3.7" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276152" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Test tools" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815310?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObQ4=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276157" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3-clang5-mobile-custom-build-static" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815320?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObRg=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276159" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "macos-10-15-py3-arm64" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545816079?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcA8=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276857" + }, + 
{ + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "ios-12-5-1-arm64-coreml" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545816078?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcA4=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276860" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "ios-12-5-1-arm64" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545816071?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcAc=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276861" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "macos-11-py3-x86-64" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545816073?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, macos-11)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5546066712?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, macos-11)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5546066787?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqSQ2M=", + "hasNextPage": false + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276862" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "ios-12-5-1-arm64-custom-ops" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545816081?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcBE=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276864" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVFCdAA=", + "hasNextPage": true + } + } + } + } + ] + } + } + } + } + }, + "query_sha=5ffb180a8ade981d13f6f672ed610279046db440b571254f884c52419c54dd79 cursor=Y3Vyc29yOnYyOpHPAAAAAVFCdAA= name=pytorch number=73811 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "commits": { + "nodes": [ + { + "commit": { + "oid": "9d26f4e6d8c8df275ea546180fef42548257d2d7", + "checkSuites": { + "nodes": [ + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "ios-12-5-1-x86-64-coreml" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545816077?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcA0=", + "hasNextPage": 
false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276867" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "ios-12-5-1-arm64-metal" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545816080?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcBA=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276869" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "macos-10-15-py3-lite-interpreter-x86-64" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545816075?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcAs=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276873" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "ios-12-5-1-x86-64" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545816068?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcAQ=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276881" + }, + { + "app": { + "name": "Netlify", + "databaseId": 13473 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658277331" + }, + { + "app": { + "name": "Azure Pipelines", + "databaseId": 9426 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658277340" + }, + { + "app": { + "name": "Dependabot", + "databaseId": 29110 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658277346" + }, + { + "app": { + "name": "Codecov", + "databaseId": 254 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658277350" + }, + { + "app": { + "name": "PyTorch Bot", + "databaseId": 40112 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": 
"https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658277355" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVFCdes=", + "hasNextPage": false + } + } + } + } + ] + } + } + } + } + }, + "query_sha=5ffb180a8ade981d13f6f672ed610279046db440b571254f884c52419c54dd79 cursor=Y3Vyc29yOnYyOpHPAAAAAU2F-RA= name=pytorch number=73969 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "commits": { + "nodes": [ + { + "commit": { + "oid": "4746da707a9912356f5179625da89616b228dc21", + "checkSuites": { + "nodes": [ + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "TorchBench CI (pytorch-linux-py3.7-cu102)" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "run-torchbench", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928591?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2c8=", + "hasNextPage": false + } + }, + "conclusion": "SKIPPED", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592977" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Test tools" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928555?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2as=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592978" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-cuda11.3-py3.7-gcc7" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928570?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483302702?check_suite_focus=true" + }, + { + "name": "test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483302867?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483303104?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbUkMA=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592980" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc7-no-ops" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928607?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2d8=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592981" + }, + { + "app": { + "name": 
"GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "win-vs2019-cpu-py3" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928611?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483400398?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483400575?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbWDX8=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592982" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928548?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2aQ=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592983" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-clang7-asan" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928603?check_suite_focus=true" + }, + { + "name": "test (default, 3, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483138456?check_suite_focus=true" + }, + { + "name": "test (default, 1, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483138698?check_suite_focus=true" + }, + { + "name": "test (default, 2, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483139049?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbSD-k=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592985" + }, + { + "app": { + "name": "Facebook GitHub Tools", + "databaseId": 12274 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "Facebook CLA Check", + "conclusion": "SUCCESS", + "detailsUrl": "https://code.intern.facebook.com/cla/" + }, + { + "name": "Meta Internal-Only Changes Check", + "conclusion": "NEUTRAL", + "detailsUrl": "https://opensource.facebook.com/" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO574=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592986" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pytorch-xla-linux-bionic-py3.7-clang8" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + 
"conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928559?check_suite_focus=true" + }, + { + "name": "test (xla, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483141123?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbSGAM=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592987" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc5.4" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928593?check_suite_focus=true" + }, + { + "name": "test (backwards_compat, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483106295?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483106609?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483106835?check_suite_focus=true" + }, + { + "name": "test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483107050?check_suite_focus=true" + }, + { + "name": "test (docs_test, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483107208?check_suite_focus=true" + }, + { + "name": "test (jit_legacy, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483107483?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbRlJs=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592997" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-SU=", + "hasNextPage": true + } + } + } + } + ] + } + } + } + } + }, + "query_sha=5ffb180a8ade981d13f6f672ed610279046db440b571254f884c52419c54dd79 cursor=Y3Vyc29yOnYyOpHPAAAAAU2F-SU= name=pytorch number=73969 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "commits": { + "nodes": [ + { + "commit": { + "oid": "4746da707a9912356f5179625da89616b228dc21", + "checkSuites": { + "nodes": [ + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-bionic-py3.7-clang9" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928550?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483083368?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483083553?check_suite_focus=true" + }, + { + "name": "test (noarch, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483083767?check_suite_focus=true" + } + ], + 
"pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbRN_c=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595593001" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-clang7-onnx" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928572?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483120691?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483120938?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbRySo=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595593014" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3-clang5-mobile-custom-build-static" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928605?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2d0=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595593026" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-UI=", + "hasNextPage": false + } + } + } + } + ] + } + } + } + } + }, + "query_sha=5ffb180a8ade981d13f6f672ed610279046db440b571254f884c52419c54dd79 cursor=Y3Vyc29yOnYyOpHPAAAAAUK_Uc0= name=pytorch number=71759 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "commits": { + "nodes": [ + { + "commit": { + "oid": "346e0c547953d98eb84d23c1391a95badb9c4a22", + "checkSuites": { + "nodes": [ + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-cuda11.3-py3.7-gcc7" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020053?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302536958?check_suite_focus=true" + }, + { + "name": "test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302537118?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302537373?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwOTJ0=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801870" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + 
"name": "Test tools" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020045?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGZ80=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801872" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-bionic-py3.7-clang9" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020051?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302145103?check_suite_focus=true" + }, + { + "name": "test (noarch, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302145224?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302145353?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwIUUk=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801874" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-docs" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020056?check_suite_focus=true" + }, + { + "name": "build-docs (cpp)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302148279?check_suite_focus=true" + }, + { + "name": "build-docs (python)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302148361?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwIXQk=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801876" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3-clang5-mobile-custom-build-static" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020057?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGZ9k=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801877" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "TorchBench CI (pytorch-linux-py3.7-cu102)" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "run-torchbench", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302019919?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGZ08=", + "hasNextPage": false + } + }, + "conclusion": 
"SKIPPED", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801878" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc7" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020088?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302151055?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302151166?check_suite_focus=true" + }, + { + "name": "test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302151251?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwIaFM=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801880" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020054?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGZ9Y=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801882" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "win-vs2019-cpu-py3" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302019942?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5303136931?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, windows.4xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5303137019?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwXcvs=", + "hasNextPage": false + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801885" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-vulkan-bionic-py3.7-clang9" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020058?check_suite_focus=true" + }, + { + "name": "test (default, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302161211?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwIjzs=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": 
"https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801895" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Uec=", + "hasNextPage": true + } + } + } + } + ] + } + } + } + } + }, + "query_sha=5ffb180a8ade981d13f6f672ed610279046db440b571254f884c52419c54dd79 cursor=Y3Vyc29yOnYyOpHPAAAAAUK_Uec= name=pytorch number=71759 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "commits": { + "nodes": [ + { + "commit": { + "oid": "346e0c547953d98eb84d23c1391a95badb9c4a22", + "checkSuites": { + "nodes": [ + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc7-no-ops" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020052?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGZ9Q=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801896" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Ueg=", + "hasNextPage": false + } + } + } + } + ] + } + } + } + } + } +} diff --git a/.github/scripts/install_nvidia_utils_linux.sh b/.github/scripts/install_nvidia_utils_linux.sh index 0db7de71f4fc..b854320c9eaa 100755 --- a/.github/scripts/install_nvidia_utils_linux.sh +++ b/.github/scripts/install_nvidia_utils_linux.sh @@ -3,7 +3,7 @@ set -eou pipefail DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID) \ -DRIVER_FN="NVIDIA-Linux-x86_64-495.44.run" +DRIVER_FN="NVIDIA-Linux-x86_64-510.60.02.run" YUM_REPO_URL="https://nvidia.github.io/nvidia-docker/${DISTRIBUTION}/nvidia-docker.repo" install_nvidia_docker2_amzn2() { diff --git a/.github/scripts/lint_native_functions.py b/.github/scripts/lint_native_functions.py index 2e6d4e3e7675..70c43605c54d 100755 --- a/.github/scripts/lint_native_functions.py +++ b/.github/scripts/lint_native_functions.py @@ -27,9 +27,9 @@ def fn(base: str) -> str: contents = f.read() yaml = ruamel.yaml.YAML() # type: ignore[attr-defined] -yaml.preserve_quotes = True -yaml.width = 1000 -yaml.boolean_representation = ['False', 'True'] +yaml.preserve_quotes = True # type: ignore[assignment] +yaml.width = 1000 # type: ignore[assignment] +yaml.boolean_representation = ['False', 'True'] # type: ignore[attr-defined] r = yaml.load(contents) # Cuz ruamel's author intentionally didn't include conversion to string diff --git a/.github/scripts/lint_test_ownership.py b/.github/scripts/lint_test_ownership.py deleted file mode 100755 index 270019c0f563..000000000000 --- a/.github/scripts/lint_test_ownership.py +++ /dev/null @@ -1,88 +0,0 @@ -#!/usr/bin/env python3 -''' -Test ownership was introduced in https://github.com/pytorch/pytorch/issues/66232. - -This lint verifies that every Python test file (file that matches test_*.py or *_test.py in the test folder) -has valid ownership information in a comment header. Valid means: - - The format of the header follows the pattern "# Owner(s): ["list", "of owner", "labels"] - - Each owner label actually exists in PyTorch - - Each owner label starts with "module: " or "oncall: " or is in ACCEPTABLE_OWNER_LABELS - -This file is expected to run in the root directory of pytorch/pytorch. 
-''' -import boto3 # type: ignore[import] -import botocore # type: ignore[import] -import fnmatch -import json -import sys -from pathlib import Path -from typing import List, Any - - -# Team/owner labels usually start with "module: " or "oncall: ", but the following are acceptable exceptions -ACCEPTABLE_OWNER_LABELS = ["NNC", "high priority"] -GLOB_EXCEPTIONS = [ - "**/test/run_test.py" -] - -PYTORCH_ROOT = Path(__file__).resolve().parent.parent.parent -TEST_DIR = PYTORCH_ROOT / "test" -CURRENT_FILE_NAME = Path(__file__).resolve().relative_to(PYTORCH_ROOT) - -S3_RESOURCE_READ_ONLY = boto3.resource("s3", config=botocore.config.Config(signature_version=botocore.UNSIGNED)) - - -def get_all_test_files() -> List[Path]: - test_files = list(TEST_DIR.glob("**/test_*.py")) - test_files.extend(list(TEST_DIR.glob("**/*_test.py"))) - return [f for f in test_files if not any([fnmatch.fnmatch(str(f), g) for g in GLOB_EXCEPTIONS])] - - -def get_pytorch_labels() -> Any: - bucket = S3_RESOURCE_READ_ONLY.Bucket("ossci-metrics") - summaries = bucket.objects.filter(Prefix="pytorch_labels.json") - for summary in summaries: - labels = summary.get()["Body"].read() - return json.loads(labels) - - -# Returns a string denoting the error invalidating the label OR an empty string if nothing is wrong -def validate_label(label: str, pytorch_labels: List[str]) -> str: - if label not in pytorch_labels: - return f"{label} is not a PyTorch label (please choose from https://github.com/pytorch/pytorch/labels)" - if label.startswith("module:") or label.startswith("oncall:") or label in ACCEPTABLE_OWNER_LABELS: - return "" - return f"{label} is not an acceptable owner (please update to another label or edit ACCEPTABLE_OWNERS_LABELS " \ - "in {CURRENT_FILE_NAME}" - - -# Returns a string denoting the error invalidating the file OR an empty string if nothing is wrong -def validate_file(filename: Path, pytorch_labels: List[str]) -> str: - prefix = "# Owner(s): " - relative_name = Path(filename).relative_to(PYTORCH_ROOT) - with open(filename) as f: - for line in f.readlines(): - if line.startswith(prefix): - labels = json.loads(line[len(prefix):]) - labels_msgs = [validate_label(label, pytorch_labels) for label in labels] - file_msg = ", ".join([x for x in labels_msgs if x != ""]) - return f"{relative_name}: {file_msg}" if file_msg != "" else "" - return f"{relative_name}: missing a comment header with ownership information." - - -def main() -> None: - test_file_paths = get_all_test_files() - pytorch_labels = get_pytorch_labels() - - file_msgs = [validate_file(f, pytorch_labels) for f in test_file_paths] - err_msg = "\n".join([x for x in file_msgs if x != ""]) - if err_msg != "": - err_msg = err_msg + "\n\nIf you see files with missing ownership information above, " \ - "please add the following line\n\n# Owner(s): [\"\"]\n\nto the top of each test file. " \ - "The owner should be an existing pytorch/pytorch label." 
- print(err_msg) - sys.exit(1) - - -if __name__ == '__main__': - main() diff --git a/.github/scripts/process_commit.py b/.github/scripts/process_commit.py index a7bc4709d6b8..1bfca3237984 100644 --- a/.github/scripts/process_commit.py +++ b/.github/scripts/process_commit.py @@ -68,7 +68,7 @@ def get_repo_labels() -> List[str]: page_labels = list(map(lambda x: str(x["name"]), response)) if not page_labels: break - collected_labels += page_labels + collected_labels += page_labels return collected_labels def post_pytorch_comment(pr_number: int, merger: str) -> Any: diff --git a/.github/scripts/syncbranches.py b/.github/scripts/syncbranches.py index 163c4b3759b8..8437e1fa9c18 100755 --- a/.github/scripts/syncbranches.py +++ b/.github/scripts/syncbranches.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -from gitutils import get_git_repo_dir, GitRepo +from gitutils import get_git_repo_dir, get_git_remote_name, GitRepo from typing import Any @@ -16,7 +16,7 @@ def parse_args() -> Any: def main() -> None: args = parse_args() - repo = GitRepo(get_git_repo_dir(), debug=args.debug) + repo = GitRepo(get_git_repo_dir(), get_git_remote_name(), debug=args.debug) repo.cherry_pick_commits(args.sync_branch, args.default_branch) repo.push(args.default_branch, args.dry_run) diff --git a/.github/scripts/test_trymerge.py b/.github/scripts/test_trymerge.py new file mode 100755 index 000000000000..a668431e3b3a --- /dev/null +++ b/.github/scripts/test_trymerge.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python3 +# Tests implemented in this file are relying on GitHub GraphQL APIs +# In order to avoid test flakiness, results of the queries +# are cached in gql_mocks.json +# PyTorch Lint workflow does not have GITHUB_TOKEN defined to avoid +# flakiness, so if you are making changes to merge_rules or +# GraphQL queries in trymerge.py, please make sure to delete `gql_mocks.json` +# And re-run the test locally with ones PAT + +import json +import os +from hashlib import sha256 + +from trymerge import find_matching_merge_rule, gh_graphql, gh_get_team_members, GitHubPR, MergeRule, MandatoryChecksMissingError +from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo +from typing import cast, Any, List, Optional +from unittest import TestCase, main, mock +from urllib.error import HTTPError + +def mocked_gh_graphql(query: str, **kwargs: Any) -> Any: + gql_db_fname = os.path.join(os.path.dirname(__file__), "gql_mocks.json") + + def get_mocked_queries() -> Any: + if not os.path.exists(gql_db_fname): + return {} + with open(gql_db_fname, encoding="utf-8") as f: + return json.load(f) + + def save_mocked_queries(obj: Any) -> None: + with open(gql_db_fname, encoding="utf-8", mode="w") as f: + json.dump(obj, f, indent=2) + f.write("\n") + + key = f"query_sha={sha256(query.encode('utf-8')).hexdigest()} " + " ".join([f"{k}={kwargs[k]}" for k in sorted(kwargs.keys())]) + mocked_queries = get_mocked_queries() + + if key in mocked_queries: + return mocked_queries[key] + + try: + rc = gh_graphql(query, **kwargs) + except HTTPError as err: + if err.code == 401: + err_msg = "If you are seeing this message during workflow run, please make sure to update gql_mocks.json" + err_msg += f" locally, by deleting it and running {os.path.basename(__file__)} with " + err_msg += " GitHub Personal Access Token passed via GITHUB_TOKEN environment variable" + if os.getenv("GITHUB_TOKEN") is None: + err_msg = "Failed to update cached GraphQL queries as GITHUB_TOKEN is not defined." 
+ err_msg + raise RuntimeError(err_msg) from err + mocked_queries[key] = rc + + save_mocked_queries(mocked_queries) + + return rc + + +def mocked_read_merge_rules(repo: Optional[GitRepo], org: str, project: str) -> List[MergeRule]: + mock_merge_rules = """ + [ + { + "name": "mock with nonexistent check", + "patterns": ["*"], + "approved_by": [], + "mandatory_checks_name": [ + "Facebook CLA Check", + "Lint", + "nonexistent" + ] + } + ] + """ + rc = json.loads(mock_merge_rules, object_hook=lambda x: MergeRule(**x)) + return cast(List[MergeRule], rc) + + +class TestGitHubPR(TestCase): + @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql) + def test_match_rules(self, mocked_gql: Any) -> None: + "Tests that PR passes merge rules" + pr = GitHubPR("pytorch", "pytorch", 71759) + repo = GitRepo(get_git_repo_dir(), get_git_remote_name()) + self.assertTrue(find_matching_merge_rule(pr, repo) is not None) + + @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql) + def test_lint_fails(self, mocked_gql: Any) -> None: + "Tests that PR fails mandatory lint check" + pr = GitHubPR("pytorch", "pytorch", 74649) + repo = GitRepo(get_git_repo_dir(), get_git_remote_name()) + self.assertRaises(RuntimeError, lambda: find_matching_merge_rule(pr, repo)) + + @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql) + def test_get_last_comment(self, mocked_gql: Any) -> None: + "Tests that last comment can be fetched" + pr = GitHubPR("pytorch", "pytorch", 71759) + comment = pr.get_last_comment() + self.assertEqual(comment.author_login, "github-actions") + self.assertIsNone(comment.editor_login) + self.assertTrue("You've committed this PR" in comment.body_text) + + @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql) + def test_get_author_null(self, mocked_gql: Any) -> None: + """ Tests that PR author can be computed + If reply contains NULL + """ + pr = GitHubPR("pytorch", "pytorch", 71759) + author = pr.get_author() + self.assertTrue(author is not None) + self.assertTrue("@" in author) + self.assertTrue(pr.get_diff_revision() is None) + + # PR with multiple contributors, but creator id is not among authors + pr = GitHubPR("pytorch", "pytorch", 75095) + self.assertEqual(pr.get_pr_creator_login(), "mruberry") + author = pr.get_author() + self.assertTrue(author is not None) + + @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql) + def test_large_diff(self, mocked_gql: Any) -> None: + "Tests that PR with 100+ files can be fetched" + pr = GitHubPR("pytorch", "pytorch", 73099) + self.assertTrue(pr.get_changed_files_count() > 100) + flist = pr.get_changed_files() + self.assertEqual(len(flist), pr.get_changed_files_count()) + + @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql) + def test_internal_changes(self, mocked_gql: Any) -> None: + "Tests that PR with internal changes is detected" + pr = GitHubPR("pytorch", "pytorch", 73969) + self.assertTrue(pr.has_internal_changes()) + + @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql) + def test_checksuites_pagination(self, mocked_gql: Any) -> None: + "Tests that PR with lots of checksuits can be fetched" + pr = GitHubPR("pytorch", "pytorch", 73811) + self.assertGreater(len(pr.get_checkrun_conclusions()), 0) + + @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql) + def test_comments_pagination(self, mocked_gql: Any) -> None: + "Tests that PR with 50+ comments can be fetched" + pr = GitHubPR("pytorch", "pytorch", 31093) + self.assertGreater(len(pr.get_comments()), 50) + + 
@mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql) + def test_gql_complexity(self, mocked_gql: Any) -> None: + "Fetch comments and conclusions for PR with 60 commits" + # Previous version of GrapQL query used to cause HTTP/502 error + # see https://gist.github.com/malfet/9b93bc7eeddeaf1d84546efc4f0c577f + pr = GitHubPR("pytorch", "pytorch", 68111) + self.assertGreater(len(pr.get_comments()), 20) + self.assertGreater(len(pr.get_checkrun_conclusions()), 3) + self.assertGreater(pr.get_commit_count(), 60) + + @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql) + def test_team_members(self, mocked_gql: Any) -> None: + "Test fetching team members works" + dev_infra_team = gh_get_team_members("pytorch", "pytorch-dev-infra") + self.assertGreater(len(dev_infra_team), 2) + with self.assertWarns(Warning): + non_existing_team = gh_get_team_members("pytorch", "qwertyuiop") + self.assertEqual(len(non_existing_team), 0) + + @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql) + def test_get_author_many_commits(self, mocked_gql: Any) -> None: + """ Tests that authors for all commits can be fetched + """ + pr = GitHubPR("pytorch", "pytorch", 76118) + authors = pr.get_authors() + self.assertGreater(pr.get_commit_count(), 100) + self.assertGreater(len(authors), 50) + self.assertTrue("@" in pr.get_author()) + + @mock.patch('trymerge.read_merge_rules', side_effect=mocked_read_merge_rules) + @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql) + def test_pending_status_check(self, mocked_gql: Any, mocked_read_merge_rules: Any) -> None: + """ Tests that PR with nonexistent/pending status checks fails with the right reason. + """ + pr = GitHubPR("pytorch", "pytorch", 76118) + repo = GitRepo(get_git_repo_dir(), get_git_remote_name()) + self.assertRaisesRegex(MandatoryChecksMissingError, + ".*are pending/not yet run.*", + lambda: find_matching_merge_rule(pr, repo)) + + @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql) + def test_get_author_many_reviews(self, mocked_gql: Any) -> None: + """ Tests that all reviews can be fetched + """ + pr = GitHubPR("pytorch", "pytorch", 76123) + approved_by = pr.get_approved_by() + self.assertGreater(len(approved_by), 0) + assert pr._reviews is not None # to pacify mypy + self.assertGreater(len(pr._reviews), 100) + + +if __name__ == "__main__": + main() diff --git a/.github/scripts/test_tryrebase.py b/.github/scripts/test_tryrebase.py new file mode 100644 index 000000000000..399f03933633 --- /dev/null +++ b/.github/scripts/test_tryrebase.py @@ -0,0 +1,42 @@ +from unittest import TestCase, mock, main +from test_trymerge import mocked_gh_graphql +from trymerge import GitHubPR +from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo +from typing import Any +from tryrebase import rebase_onto + + +class TestRebase(TestCase): + + @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql) + @mock.patch('gitutils.GitRepo._run_git') + @mock.patch('tryrebase.gh_post_comment') + def test_rebase(self, mocked_post_comment: Any, mocked_run_git: Any, mocked_gql: Any) -> None: + "Tests rebase successfully" + pr = GitHubPR("pytorch", "pytorch", 31093) + repo = GitRepo(get_git_repo_dir(), get_git_remote_name()) + rebase_onto(pr, repo) + calls = [mock.call('fetch', 'origin', 'pull/31093/head:pull/31093/head'), + mock.call('rebase', 'master', 'pull/31093/head'), + mock.call('push', '-f', 'https://github.com/mingxiaoh/pytorch.git', 'pull/31093/head:master')] + mocked_run_git.assert_has_calls(calls) + 
self.assertTrue("Successfully rebased `master` onto `master`" in mocked_post_comment.call_args[0][3]) + + @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql) + @mock.patch('gitutils.GitRepo._run_git', return_value="Everything up-to-date") + @mock.patch('tryrebase.gh_post_comment') + def test_no_need_to_rebase(self, mocked_post_comment: Any, mocked_run_git: Any, mocked_gql: Any) -> None: + "Tests branch already up to date" + pr = GitHubPR("pytorch", "pytorch", 31093) + repo = GitRepo(get_git_repo_dir(), get_git_remote_name()) + rebase_onto(pr, repo) + calls = [mock.call('fetch', 'origin', 'pull/31093/head:pull/31093/head'), + mock.call('rebase', 'master', 'pull/31093/head'), + mock.call('push', '-f', 'https://github.com/mingxiaoh/pytorch.git', 'pull/31093/head:master')] + mocked_run_git.assert_has_calls(calls) + self.assertTrue( + "Tried to rebase and push PR #31093, but it was already up to date" in mocked_post_comment.call_args[0][3]) + + +if __name__ == "__main__": + main() diff --git a/.github/scripts/trymerge.py b/.github/scripts/trymerge.py index 927edd685a5e..7747fd0208bd 100755 --- a/.github/scripts/trymerge.py +++ b/.github/scripts/trymerge.py @@ -1,13 +1,17 @@ #!/usr/bin/env python3 +import base64 import json import os import re +import time from dataclasses import dataclass from urllib.request import urlopen, Request from urllib.error import HTTPError from typing import cast, Any, Callable, Dict, List, Optional, Tuple, Union from gitutils import get_git_remote_name, get_git_repo_dir, patterns_to_regex, GitRepo +from functools import lru_cache +from warnings import warn GH_GET_PR_INFO_QUERY = """ @@ -36,7 +40,7 @@ mergeCommit { oid } - commits(first: 100) { + commits_with_authors:commits(first: 100) { nodes { commit { author { @@ -47,34 +51,164 @@ name } oid - checkSuites(filterBy: {appId: 12274}, first: 1) { + } + } + pageInfo { + endCursor + hasNextPage + } + totalCount + } + commits(last: 1) { + nodes { + commit { + checkSuites(first: 10) { nodes { app { + name databaseId } + workflowRun { + workflow { + name + } + } + checkRuns(first: 50) { + nodes { + name + conclusion + detailsUrl + } + pageInfo { + endCursor + hasNextPage + } + } conclusion + url + } + pageInfo { + endCursor + hasNextPage } } + oid } } - totalCount } changedFiles - files(last: 100) { + files(first: 100) { nodes { path } + pageInfo { + endCursor + hasNextPage + } } - latestReviews(last: 100) { + reviews(last: 100) { nodes { author { login } state } - totalCount + pageInfo { + startCursor + hasPreviousPage + } + } + comments(last: 5) { + nodes { + bodyText + author { + login + } + authorAssociation + editor { + login + } + databaseId + } + pageInfo { + startCursor + hasPreviousPage + } + } + } + } +} +""" + +GH_GET_PR_NEXT_FILES_QUERY = """ +query ($owner: String!, $name: String!, $number: Int!, $cursor: String!) { + repository(name: $name, owner: $owner) { + pullRequest(number: $number) { + files(first: 100, after: $cursor) { + nodes { + path + } + pageInfo { + endCursor + hasNextPage + } + } + } + } +} +""" + +GH_GET_PR_NEXT_CHECK_RUNS = """ +query ($owner: String!, $name: String!, $number: Int!, $cursor: String!) 
{ + repository(name: $name, owner: $owner) { + pullRequest(number: $number) { + commits(last: 1) { + nodes { + commit { + oid + checkSuites(first: 10, after: $cursor) { + nodes { + app { + name + databaseId + } + workflowRun { + workflow { + name + } + } + checkRuns(first: 50) { + nodes { + name + conclusion + detailsUrl + } + pageInfo { + endCursor + hasNextPage + } + } + conclusion + url + } + pageInfo { + endCursor + hasNextPage + } + } + } + } } - comments(last: 1) { + } + } +} +""" + +GH_GET_PR_PREV_COMMENTS = """ +query ($owner: String!, $name: String!, $number: Int!, $cursor: String!) { + repository(name: $name, owner: $owner) { + pullRequest(number: $number) { + comments(last: 100, before: $cursor) { nodes { bodyText author { @@ -84,6 +218,78 @@ editor { login } + databaseId + } + pageInfo { + startCursor + hasPreviousPage + } + } + } + } +} +""" + +# This query needs read-org permission +GH_GET_TEAM_MEMBERS_QUERY = """ +query($org: String!, $name: String!, $cursor: String) { + organization(login: $org) { + team(slug: $name) { + members(first: 100, after: $cursor) { + nodes { + login + } + pageInfo { + hasNextPage + endCursor + } + } + } + } +} +""" + +GH_GET_PR_NEXT_AUTHORS_QUERY = """ +query ($owner: String!, $name: String!, $number: Int!, $cursor: String) { + repository(name: $name, owner: $owner) { + pullRequest(number: $number) { + commits_with_authors: commits(first: 100, after: $cursor) { + nodes { + commit { + author { + user { + login + } + email + name + } + oid + } + } + pageInfo { + endCursor + hasNextPage + } + } + } + } +} +""" + +GH_GET_PR_PREV_REVIEWS_QUERY = """ +query ($owner: String!, $name: String!, $number: Int!, $cursor: String!) { + repository(name: $name, owner: $owner) { + pullRequest(number: $number) { + reviews(last: 100, before: $cursor) { + nodes { + author { + login + } + state + } + pageInfo { + startCursor + hasPreviousPage } } } @@ -99,6 +305,7 @@ re.MULTILINE ) RE_REVERT_CMD = re.compile(r"@pytorch(merge|)bot\s+revert\s+this") +RE_REVERT_CMD_CLI = re.compile(r"@pytorch(merge|)bot\s+revert\s+(-m.*-c.*|-c.*-m.*)") RE_DIFF_REV = re.compile(r'^Differential Revision:.+?(D[0-9]+)', re.MULTILINE) @@ -147,7 +354,7 @@ def gh_add_labels(org: str, project: str, pr_num: int, labels: Union[str, List[s def gh_graphql(query: str, **kwargs: Any) -> Dict[str, Any]: rc = _fetch_url("https://api.github.com/graphql", data={"query": query, "variables": kwargs}, reader=json.load) if "errors" in rc: - raise RuntimeError(f"GraphQL query {query} failed: {rc['errors']}") + raise RuntimeError(f"GraphQL query {query}, args {kwargs} failed: {rc['errors']}") return cast(Dict[str, Any], rc) @@ -156,14 +363,49 @@ def gh_get_pr_info(org: str, proj: str, pr_no: int) -> Any: return rc["data"]["repository"]["pullRequest"] +@lru_cache(maxsize=None) +def gh_get_team_members(org: str, name: str) -> List[str]: + rc: List[str] = [] + team_members: Dict[str, Any] = {"pageInfo": {"hasNextPage": "true", "endCursor": None}} + while bool(team_members["pageInfo"]["hasNextPage"]): + query = gh_graphql(GH_GET_TEAM_MEMBERS_QUERY, org=org, name=name, cursor=team_members["pageInfo"]["endCursor"]) + team = query["data"]["organization"]["team"] + if team is None: + warn(f"Requested non-existing team {org}/{name}") + return [] + team_members = team["members"] + rc += [member["login"] for member in team_members["nodes"]] + return rc + + def parse_args() -> Any: from argparse import ArgumentParser parser = ArgumentParser("Merge PR into default branch") parser.add_argument("--dry-run", 
action="store_true") + parser.add_argument("--on-green", action="store_true") parser.add_argument("--revert", action="store_true") + parser.add_argument("--force", action="store_true") + parser.add_argument("--comment-id", type=int) parser.add_argument("pr_num", type=int) return parser.parse_args() +def can_skip_internal_checks(pr: "GitHubPR", comment_id: Optional[int] = None) -> bool: + if comment_id is None: + return False + comment = pr.get_comment_by_id(comment_id) + if comment.editor_login is not None: + return False + return comment.author_login == "facebook-github-bot" + + +@dataclass +class GitHubComment: + body_text: str + author_login: str + author_association: str + editor_login: Optional[str] + database_id: int + class GitHubPR: def __init__(self, org: str, project: str, pr_num: int) -> None: @@ -172,6 +414,11 @@ def __init__(self, org: str, project: str, pr_num: int) -> None: self.project = project self.pr_num = pr_num self.info = gh_get_pr_info(org, project, pr_num) + self.changed_files: Optional[List[str]] = None + self.conclusions: Optional[Dict[str, Tuple[str, str]]] = None + self.comments: Optional[List[GitHubComment]] = None + self._authors: Optional[List[Tuple[str, str]]] = None + self._reviews: Optional[List[Tuple[str, str]]] = None def is_closed(self) -> bool: return bool(self.info["closed"]) @@ -198,39 +445,124 @@ def get_changed_files_count(self) -> int: return int(self.info["changedFiles"]) def get_changed_files(self) -> List[str]: - rc = [x["path"] for x in self.info["files"]["nodes"]] - if len(rc) != self.get_changed_files_count(): + if self.changed_files is None: + info = self.info + self.changed_files = [] + # Do not try to fetch more than 10K files + for _ in range(100): + self.changed_files += [x["path"] for x in info["files"]["nodes"]] + if not info["files"]["pageInfo"]["hasNextPage"]: + break + rc = gh_graphql(GH_GET_PR_NEXT_FILES_QUERY, + name=self.project, + owner=self.org, + number=self.pr_num, + cursor=info["files"]["pageInfo"]["endCursor"]) + info = rc["data"]["repository"]["pullRequest"] + + if len(self.changed_files) != self.get_changed_files_count(): raise RuntimeError("Changed file count mismatch") - return rc - - def _get_reviewers(self) -> List[Tuple[str, str]]: - reviews_count = int(self.info["latestReviews"]["totalCount"]) - if len(self.info["latestReviews"]["nodes"]) != reviews_count: - raise RuntimeError("Can't fetch all PR reviews") - return [(x["author"]["login"], x["state"]) for x in self.info["latestReviews"]["nodes"]] + return self.changed_files + + def _get_reviews(self) -> List[Tuple[str, str]]: + if self._reviews is None: + self._reviews = [] + info = self.info + for _ in range(100): + nodes = info["reviews"]["nodes"] + self._reviews = [(node["author"]["login"], node["state"]) for node in nodes] + self._reviews + if not info["reviews"]["pageInfo"]["hasPreviousPage"]: + break + rc = gh_graphql(GH_GET_PR_PREV_REVIEWS_QUERY, + name=self.project, + owner=self.org, + number=self.pr_num, + cursor=info["reviews"]["pageInfo"]["startCursor"]) + info = rc["data"]["repository"]["pullRequest"] + reviews = {} + for (author, state) in self._reviews: + if state != "COMMENTED": + reviews[author] = state + return list(reviews.items()) def get_approved_by(self) -> List[str]: - return [login for (login, state) in self._get_reviewers() if state == "APPROVED"] + return [login for (login, state) in self._get_reviews() if state == "APPROVED"] def get_commit_count(self) -> int: - return int(self.info["commits"]["totalCount"]) + return 
int(self.info["commits_with_authors"]["totalCount"]) def get_pr_creator_login(self) -> str: return cast(str, self.info["author"]["login"]) + def _fetch_authors(self) -> List[Tuple[str, str]]: + if self._authors is not None: + return self._authors + authors: List[Tuple[str, str]] = [] + + def add_authors(info: Dict[str, Any]) -> None: + for node in info["commits_with_authors"]["nodes"]: + author_node = node["commit"]["author"] + user_node = author_node["user"] + author = f"{author_node['name']} <{author_node['email']}>" + if user_node is None: + # If author is not github user, user node will be null + authors.append(("", author)) + else: + authors.append((cast(str, user_node["login"]), author)) + + info = self.info + for _ in range(100): + add_authors(info) + if not info["commits_with_authors"]["pageInfo"]["hasNextPage"]: + break + rc = gh_graphql(GH_GET_PR_NEXT_AUTHORS_QUERY, + name=self.project, + owner=self.org, + number=self.pr_num, + cursor=info["commits_with_authors"]["pageInfo"]["endCursor"]) + info = rc["data"]["repository"]["pullRequest"] + self._authors = authors + return authors + def get_committer_login(self, num: int = 0) -> str: - return cast(str, self.info["commits"]["nodes"][num]["commit"]["author"]["user"]["login"]) + return self._fetch_authors()[num][0] def get_committer_author(self, num: int = 0) -> str: - node = self.info["commits"]["nodes"][num]["commit"]["author"] - return f"{node['name']} <{node['email']}>" - - def get_check_suite_conclusions(self) -> Dict[int, str]: - last_commit = self.info["commits"]["nodes"][-1]["commit"] - rc = {} - for node in last_commit["checkSuites"]["nodes"]: - rc[int(node["app"]["databaseId"])] = node["conclusion"] - return rc + return self._fetch_authors()[num][1] + + def get_checkrun_conclusions(self) -> Dict[str, Tuple[str, str]]: + """ Returns dict of checkrun -> [conclusion, url] """ + if self.conclusions is not None: + return self.conclusions + orig_last_commit = self.info["commits"]["nodes"][-1]["commit"] + checksuites = orig_last_commit["checkSuites"] + conclusions = {} + + def add_conclusions(nodes: List[Dict[str, Any]]) -> None: + for node in nodes: + workflow_run = node["workflowRun"] + checkruns = node["checkRuns"] + if workflow_run is not None: + conclusions[workflow_run["workflow"]["name"]] = (node["conclusion"], node["url"]) + if checkruns is not None: + for checkrun_node in checkruns["nodes"]: + conclusions[checkrun_node["name"]] = (checkrun_node["conclusion"], checkrun_node["detailsUrl"]) + + add_conclusions(checksuites["nodes"]) + while bool(checksuites["pageInfo"]["hasNextPage"]): + rc = gh_graphql(GH_GET_PR_NEXT_CHECK_RUNS, + name=self.project, + owner=self.org, + number=self.pr_num, + cursor=checksuites["pageInfo"]["endCursor"]) + info = rc["data"]["repository"]["pullRequest"] + last_commit = info["commits"]["nodes"][-1]["commit"] + if last_commit["oid"] != orig_last_commit["oid"]: + raise RuntimeError("Last commit changed on PR") + checksuites = last_commit["checkSuites"] + add_conclusions(checksuites["nodes"]) + self.conclusions = conclusions + return conclusions def get_authors(self) -> Dict[str, str]: rc = {} @@ -243,7 +575,12 @@ def get_author(self) -> str: authors = self.get_authors() if len(authors) == 1: return next(iter(authors.values())) - return self.get_authors()[self.get_pr_creator_login()] + creator = self.get_pr_creator_login() + # If PR creator is not among authors + # Assume it was authored by first commit author + if creator not in authors: + return self.get_committer_author(0) + return 
authors[creator] def get_title(self) -> str: return cast(str, self.info["title"]) @@ -258,21 +595,66 @@ def get_merge_commit(self) -> Optional[str]: def get_pr_url(self) -> str: return f"https://github.com/{self.org}/{self.project}/pull/{self.pr_num}" - def get_comment_body(self, num: int = -1) -> str: - return cast(str, self.info["comments"]["nodes"][num]["bodyText"]) - - def get_comment_author_login(self, num: int = -1) -> str: - return cast(str, self.info["comments"]["nodes"][num]["author"]["login"]) - - def get_comment_editor_login(self, num: int = -1) -> Optional[str]: - rc = self.info["comments"]["nodes"][num]["editor"] - return rc["login"] if rc is not None else None - - def get_comment_author_association(self, num: int = -1) -> str: - return cast(str, self.info["comments"]["nodes"][num]["authorAssociation"]) - - def merge_ghstack_into(self, repo: GitRepo) -> None: + @staticmethod + def _comment_from_node(node: Any) -> GitHubComment: + editor = node["editor"] + return GitHubComment(body_text=node["bodyText"], + author_login=node["author"]["login"], + author_association=node["authorAssociation"], + editor_login=editor["login"] if editor else None, + database_id=node["databaseId"] + ) + + def get_comments(self) -> List[GitHubComment]: + if self.comments is not None: + return self.comments + self.comments = [] + info = self.info["comments"] + # Do not try to fetch more than 10K comments + for _ in range(100): + self.comments = [self._comment_from_node(node) for node in info["nodes"]] + self.comments + if not info["pageInfo"]["hasPreviousPage"]: + break + rc = gh_graphql(GH_GET_PR_PREV_COMMENTS, + name=self.project, + owner=self.org, + number=self.pr_num, + cursor=info["pageInfo"]["startCursor"]) + info = rc["data"]["repository"]["pullRequest"]["comments"] + return self.comments + + def get_last_comment(self) -> GitHubComment: + return self._comment_from_node(self.info["comments"]["nodes"][-1]) + + def get_comment_by_id(self, database_id: int) -> GitHubComment: + if self.comments is None: + # Fastpath - try searching in partial prefetched comments + for node in self.info["comments"]["nodes"]: + comment = self._comment_from_node(node) + if comment.database_id == database_id: + return comment + + for comment in self.get_comments(): + if comment.database_id == database_id: + return comment + raise RuntimeError(f"Comment with id {database_id} not found") + + def get_diff_revision(self) -> Optional[str]: + rc = RE_DIFF_REV.search(self.get_body()) + return rc.group(1) if rc is not None else None + + def has_internal_changes(self) -> bool: + checkrun_name = "Meta Internal-Only Changes Check" + if self.get_diff_revision() is None: + return False + checks = self.get_checkrun_conclusions() + if checks is None or checkrun_name not in checks: + return False + return checks[checkrun_name][0] != "SUCCESS" + + def merge_ghstack_into(self, repo: GitRepo, force: bool, comment_id: Optional[int] = None) -> None: assert self.is_ghstack_pr() + approved_by = self.get_approved_by() # For ghstack, cherry-pick commits based from origin orig_ref = f"{repo.remote}/{re.sub(r'/head$', '/orig', self.head_ref())}" rev_list = repo.revlist(f"{self.default_branch()}..{orig_ref}") @@ -289,98 +671,179 @@ def merge_ghstack_into(self, repo: GitRepo) -> None: if pr.is_closed(): print(f"Skipping {idx+1} of {len(rev_list)} PR (#{pr_num}) as its already been merged") continue + approved_by = pr.get_approved_by() # Raises exception if matching rule is not found - find_matching_merge_rule(pr, repo) + find_matching_merge_rule(pr, 
repo, force=force, skip_internal_checks=can_skip_internal_checks(self, comment_id)) + # Adding the url here makes it clickable within the Github UI + approved_by_urls = ', '.join(prefix_with_github_url(login) for login in approved_by) repo.cherry_pick(rev) - repo.amend_commit_message(re.sub(RE_GHSTACK_SOURCE_ID, "", msg)) + msg = re.sub(RE_GHSTACK_SOURCE_ID, "", msg) + msg += f"\nApproved by: {approved_by_urls}\n" + repo.amend_commit_message(msg) - def merge_into(self, repo: GitRepo, dry_run: bool = False) -> None: + def merge_into(self, repo: GitRepo, *, force: bool = False, dry_run: bool = False, comment_id: Optional[int] = None) -> None: # Raises exception if matching rule is not found - find_matching_merge_rule(self, repo) + find_matching_merge_rule(self, repo, force=force, skip_internal_checks=can_skip_internal_checks(self, comment_id)) if repo.current_branch() != self.default_branch(): repo.checkout(self.default_branch()) if not self.is_ghstack_pr(): - msg = self.get_title() + "\n\n" + self.get_body() + # Adding the url here makes it clickable within the Github UI + approved_by_urls = ', '.join(prefix_with_github_url(login) for login in self.get_approved_by()) + msg = self.get_title() + f" (#{self.pr_num})\n\n" + self.get_body() msg += f"\nPull Request resolved: {self.get_pr_url()}\n" - repo._run_git("merge", "--squash", f"{repo.remote}/{self.head_ref()}") + msg += f"Approved by: {approved_by_urls}\n" + pr_branch_name = f"__pull-request-{self.pr_num}__init__" + repo.fetch(f"pull/{self.pr_num}/head", pr_branch_name) + repo._run_git("merge", "--squash", pr_branch_name) repo._run_git("commit", f"--author=\"{self.get_author()}\"", "-m", msg) else: - self.merge_ghstack_into(repo) + self.merge_ghstack_into(repo, force, comment_id=comment_id) repo.push(self.default_branch(), dry_run) + if not dry_run: + gh_add_labels(self.org, self.project, self.pr_num, ["merged"]) - +class MandatoryChecksMissingError(Exception): + pass @dataclass class MergeRule: name: str patterns: List[str] approved_by: List[str] - mandatory_app_id: Optional[int] + mandatory_checks_name: Optional[List[str]] -def read_merge_rules(repo: GitRepo) -> List[MergeRule]: +def read_merge_rules(repo: Optional[GitRepo], org: str, project: str) -> List[MergeRule]: from pathlib import Path - rules_path = Path(repo.repo_dir) / ".github" / "merge_rules.json" - if not rules_path.exists(): - print(f"{rules_path} does not exist, returning empty rules") - return [] - with open(rules_path) as fp: - rc = json.load(fp, object_hook=lambda x: MergeRule(**x)) - return cast(List[MergeRule], rc) - - -def find_matching_merge_rule(pr: GitHubPR, repo: GitRepo) -> MergeRule: + repo_relative_rules_path = Path(".github") / "merge_rules.json" + if repo is None: + json_data = _fetch_url( + f"https://api.github.com/repos/{org}/{project}/contents/{repo_relative_rules_path}", + headers={'Accept': 'application/vnd.github.v3+json'}, + reader=json.load, + ) + content = base64.b64decode(json_data["content"]) + return cast(List[MergeRule], json.loads(content, object_hook=lambda x: MergeRule(**x))) + else: + rules_path = Path(repo.repo_dir) / repo_relative_rules_path + if not rules_path.exists(): + print(f"{rules_path} does not exist, returning empty rules") + return [] + with open(rules_path) as fp: + rc = json.load(fp, object_hook=lambda x: MergeRule(**x)) + return cast(List[MergeRule], rc) + + +def find_matching_merge_rule(pr: GitHubPR, + repo: Optional[GitRepo] = None, + force: bool = False, + skip_internal_checks: bool = False + ) -> MergeRule: """Returns 
merge rule matching to this pr or raises an exception""" changed_files = pr.get_changed_files() approved_by = set(pr.get_approved_by()) - rules = read_merge_rules(repo) + rules = read_merge_rules(repo, pr.org, pr.project) + reject_reason = f"PR {pr.pr_num} does not match merge rules" + # Used to determine best rejection reason + # Score 0 to 10K - how many files rule matched + # Score 10K - matched all files, but no overlapping approvers + # Score 20K - matched all files and approvers, but mandatory checks are pending + # Score 30k - Matched all files and approvers, but mandatory checks failed + reject_reason_score = 0 for rule in rules: rule_name = rule.name - rule_approvers_set = set(rule.approved_by) patterns_re = patterns_to_regex(rule.patterns) - approvers_intersection = approved_by.intersection(rule_approvers_set) - # If rule requires approvers but they aren't the ones that reviewed PR - if len(approvers_intersection) == 0 and len(rule_approvers_set) > 0: - print(f"Skipping rule {rule_name} due to no approvers overlap") - continue - if rule.mandatory_app_id is not None: - cs_conslusions = pr.get_check_suite_conclusions() - mandatory_app_id = rule.mandatory_app_id - if mandatory_app_id not in cs_conslusions or cs_conslusions[mandatory_app_id] != "SUCCESS": - print(f"Skipping rule {rule_name} as mandatory app {mandatory_app_id} is not in {cs_conslusions}") - continue non_matching_files = [] for fname in changed_files: if not patterns_re.match(fname): non_matching_files.append(fname) if len(non_matching_files) > 0: - print(f"Skipping rule {rule_name} due to non-matching files: {non_matching_files}") + num_matching_files = len(changed_files) - len(non_matching_files) + if num_matching_files > reject_reason_score: + reject_reason_score = num_matching_files + reject_reason = (f"{num_matching_files} files matched rule {rule_name}, but there are still non-matching files: " + + f"{','.join(non_matching_files[:5])}{', ...' if len(non_matching_files) > 5 else ''}") + continue + # If rule needs approvers but PR has not been reviewed, skip it + if len(rule.approved_by) > 0 and len(approved_by) == 0: + if reject_reason_score < 10000: + reject_reason_score = 10000 + reject_reason = f"Matched rule {rule_name}, but PR has not been reviewed yet" continue - print(f"Matched rule {rule_name} for {pr.pr_num}") + + rule_approvers_set = set() + for approver in rule.approved_by: + if "/" in approver: + org, name = approver.split("/") + rule_approvers_set.update(gh_get_team_members(org, name)) + else: + rule_approvers_set.add(approver) + approvers_intersection = approved_by.intersection(rule_approvers_set) + # If rule requires approvers but they aren't the ones that reviewed PR + if len(approvers_intersection) == 0 and len(rule_approvers_set) > 0: + if reject_reason_score < 10000: + reject_reason_score = 10000 + reject_reason = (f"Matched rule {rule_name}, but it was not reviewed yet by any of:" + + f"{','.join(list(rule_approvers_set)[:5])}{', ...' 
if len(rule_approvers_set) > 5 else ''}") + continue + if rule.mandatory_checks_name is not None: + pending_checks: List[Tuple[str, Optional[str]]] = [] + failed_checks: List[Tuple[str, Optional[str]]] = [] + checks = pr.get_checkrun_conclusions() + # HACK: We don't want to skip CLA check, even when forced + for checkname in filter(lambda x: force is False or "CLA Check" in x, rule.mandatory_checks_name): + if checkname not in checks: + pending_checks.append((checkname, None)) + elif checks[checkname][0] is None: + pending_checks.append((checkname, checks[checkname][1])) + elif checks[checkname][0] != 'SUCCESS': + failed_checks.append((checkname, checks[checkname][1])) + + def checks_to_str(checks: List[Tuple[str, Optional[str]]]) -> str: + return ", ".join(f"[{c[0]}]({c[1]})" if c[1] is not None else c[0] for c in checks) + + if len(failed_checks) > 0: + if reject_reason_score < 30000: + reject_reason_score = 30000 + reject_reason = ("Refusing to merge as mandatory check(s)" + + checks_to_str(failed_checks) + f" failed for rule {rule_name}") + continue + elif len(pending_checks) > 0: + if reject_reason_score < 20000: + reject_reason_score = 20000 + reject_reason = f"Refusing to merge as mandatory check(s) {checks_to_str(pending_checks)}" + reject_reason += f" are pending/not yet run for rule {rule_name}" + continue + if not skip_internal_checks and pr.has_internal_changes(): + raise RuntimeError("This PR has internal changes and must be landed via Phabricator") return rule - raise RuntimeError(f"PR {pr.pr_num} does not match merge rules") + if reject_reason_score == 20000: + raise MandatoryChecksMissingError(reject_reason) + raise RuntimeError(reject_reason) -def try_revert(repo: GitRepo, pr: GitHubPR, dry_run: bool = False) -> None: +def try_revert(repo: GitRepo, pr: GitHubPR, *, dry_run: bool = False, comment_id: Optional[int] = None) -> None: def post_comment(msg: str) -> None: gh_post_comment(pr.org, pr.project, pr.pr_num, msg, dry_run=dry_run) if not pr.is_closed(): return post_comment(f"Can't revert open PR #{pr.pr_num}") - if not RE_REVERT_CMD.match(pr.get_comment_body()): - raise RuntimeError(f"Comment {pr.get_comment_body()} does not seem to be a valid revert command") - if pr.get_comment_editor_login() is not None: + comment = pr.get_last_comment() if comment_id is None else pr.get_comment_by_id(comment_id) + if not RE_REVERT_CMD.match(comment.body_text) and not RE_REVERT_CMD_CLI.match(comment.body_text): + raise RuntimeError(f"Comment {comment.body_text} does not seem to be a valid revert command") + if comment.editor_login is not None: return post_comment("Don't want to revert based on edited command") - author_association = pr.get_comment_author_association() - author_login = pr.get_comment_author_login() + author_association = comment.author_association + author_login = comment.author_login # For some reason, one can not be a member of private repo, only CONTRIBUTOR expected_association = "CONTRIBUTOR" if pr.is_base_repo_private() else "MEMBER" if author_association != expected_association and author_association != "OWNER": return post_comment(f"Will not revert as @{author_login} is not a {expected_association}, but {author_association}") + skip_internal_checks = can_skip_internal_checks(pr, comment_id) - # Raises exception if matching rule is not found - find_matching_merge_rule(pr, repo) + # Raises exception if matching rule is not found, but ignores all status checks + find_matching_merge_rule(pr, repo, force=True, skip_internal_checks=skip_internal_checks) commit_sha = 
pr.get_merge_commit() if commit_sha is None: commits = repo.commits_resolving_gh_pr(pr.pr_num) @@ -389,51 +852,88 @@ def post_comment(msg: str) -> None: commit_sha = commits[0] msg = repo.commit_message(commit_sha) rc = RE_DIFF_REV.search(msg) - if rc is not None: + if rc is not None and not skip_internal_checks: raise RuntimeError(f"Can't revert PR that was landed via phabricator as {rc.group(1)}") repo.checkout(pr.default_branch()) repo.revert(commit_sha) msg = repo.commit_message("HEAD") msg = re.sub(RE_PULL_REQUEST_RESOLVED, "", msg) - msg += f"\nReverted {pr.get_pr_url()} on behalf of @{author_login}\n" + msg += f"\nReverted {pr.get_pr_url()} on behalf of {prefix_with_github_url(author_login)}\n" repo.amend_commit_message(msg) repo.push(pr.default_branch(), dry_run) if not dry_run: gh_add_labels(pr.org, pr.project, pr.pr_num, ["reverted"]) + +def prefix_with_github_url(suffix_str: str) -> str: + return f"https://github.com/{suffix_str}" + + +def merge_on_green(pr_num: int, repo: GitRepo, dry_run: bool = False, timeout_minutes: int = 400) -> None: + repo = GitRepo(get_git_repo_dir(), get_git_remote_name()) + org, project = repo.gh_owner_and_name() + start_time = time.time() + last_exception = '' + elapsed_time = 0.0 + while elapsed_time < timeout_minutes * 60: + current_time = time.time() + elapsed_time = current_time - start_time + + + pr = GitHubPR(org, project, pr_num) + try: + return pr.merge_into(repo, dry_run=dry_run) + except MandatoryChecksMissingError as ex: + last_exception = str(ex) + print(f"Merge failed due to: {ex}. Retrying in 60 seconds.") + time.sleep(60) + # Finally report timeout back + msg = f"Merge timed out after {timeout_minutes} minutes. Please contact the pytorch_dev_infra team." + msg += f" The last exception was: {last_exception}" + if not dry_run: + gh_add_labels(org, project, pr_num, ["land-failed"]) + raise RuntimeError(msg) + + def main() -> None: args = parse_args() repo = GitRepo(get_git_repo_dir(), get_git_remote_name()) org, project = repo.gh_owner_and_name() pr = GitHubPR(org, project, args.pr_num) + + def handle_exception(e: Exception, msg: str = "Merge failed") -> None: + msg += f" due to {e}" + run_url = os.getenv("GH_RUN_URL") + if run_url is not None: + msg += f"\nRaised by {run_url}" + gh_post_comment(org, project, args.pr_num, msg, dry_run=args.dry_run) + import traceback + traceback.print_exc() + if args.revert: try: - try_revert(repo, pr, dry_run=args.dry_run) + try_revert(repo, pr, dry_run=args.dry_run, comment_id=args.comment_id) except Exception as e: - msg = f"Reverting PR {args.pr_num} failed due to {e}" - run_url = os.getenv("GH_RUN_URL") - if run_url is not None: - msg += f"\nRaised by {run_url}" - gh_post_comment(org, project, args.pr_num, msg, dry_run=args.dry_run) + handle_exception(e, f"Reverting PR {args.pr_num} failed") return if pr.is_closed(): gh_post_comment(org, project, args.pr_num, f"Can't merge closed PR #{args.pr_num}", dry_run=args.dry_run) return - if pr.is_cross_repo(): - gh_post_comment(org, project, args.pr_num, "Cross-repo merges are not supported at the moment", dry_run=args.dry_run) + if pr.is_cross_repo() and pr.is_ghstack_pr(): + gh_post_comment(org, project, args.pr_num, "Cross-repo ghstack merges are not supported", dry_run=args.dry_run) return - try: - pr.merge_into(repo, dry_run=args.dry_run) - except Exception as e: - msg = f"Merge failed due to {e}" - run_url = os.getenv("GH_RUN_URL") - if run_url is not None: - msg += f"\nRaised by {run_url}" - gh_post_comment(org, project, args.pr_num, msg,
dry_run=args.dry_run) + if args.on_green: + try: + merge_on_green(args.pr_num, repo, dry_run=args.dry_run) + except Exception as e: + handle_exception(e) + else: + try: + pr.merge_into(repo, dry_run=args.dry_run, force=args.force, comment_id=args.comment_id) + except Exception as e: + handle_exception(e) if __name__ == "__main__": diff --git a/.github/scripts/tryrebase.py b/.github/scripts/tryrebase.py new file mode 100755 index 000000000000..a382de511a41 --- /dev/null +++ b/.github/scripts/tryrebase.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 + +import os +from typing import Any +from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo +from trymerge import gh_post_comment, GitHubPR + + +def parse_args() -> Any: + from argparse import ArgumentParser + parser = ArgumentParser("Rebase PR into branch") + parser.add_argument("--dry-run", action="store_true") + parser.add_argument("pr_num", type=int) + return parser.parse_args() + + +def rebase_onto(pr: GitHubPR, repo: GitRepo, dry_run: bool = False) -> None: + branch = f"pull/{pr.pr_num}/head" + onto_branch = pr.default_branch() + remote_url = f"https://github.com/{pr.info['headRepository']['nameWithOwner']}.git" + refspec = f"{branch}:{pr.head_ref()}" + + repo.fetch(branch, branch) + repo._run_git("rebase", onto_branch, branch) + if dry_run: + push_result = repo._run_git("push", "--dry-run", "-f", remote_url, refspec) + else: + push_result = repo._run_git("push", "-f", remote_url, refspec) + if "Everything up-to-date" in push_result: + gh_post_comment(pr.org, pr.project, pr.pr_num, + f"Tried to rebase and push PR #{pr.pr_num}, but it was already up to date", dry_run=dry_run) + else: + gh_post_comment(pr.org, pr.project, pr.pr_num, + f"Successfully rebased `{pr.head_ref()}` onto `{onto_branch}`, please pull locally " + + f"before adding more changes (for example, via `git checkout {pr.head_ref()} && " + + "git pull --rebase`)", dry_run=dry_run) + + +def main() -> None: + args = parse_args() + repo = GitRepo(get_git_repo_dir(), get_git_remote_name(), debug=True) + org, project = repo.gh_owner_and_name() + + pr = GitHubPR(org, project, args.pr_num) + + if pr.is_closed(): + gh_post_comment(org, project, args.pr_num, f"PR #{args.pr_num} is closed, won't rebase", dry_run=args.dry_run) + return + + if pr.is_ghstack_pr(): + gh_post_comment(org, project, args.pr_num, + f"PR #{args.pr_num} is a ghstack, which is currently not supported", dry_run=args.dry_run) + return + + try: + rebase_onto(pr, repo, dry_run=args.dry_run) + except Exception as e: + msg = f"Rebase failed due to {e}" + run_url = os.getenv("GH_RUN_URL") + if run_url is not None: + msg += f"\nRaised by {run_url}" + gh_post_comment(org, project, args.pr_num, msg, dry_run=args.dry_run) + + +if __name__ == "__main__": + main() diff --git a/.github/templates/android_ci_full_workflow.yml.j2 b/.github/templates/android_ci_full_workflow.yml.j2 deleted file mode 100644 index b89ae9fd94a5..000000000000 --- a/.github/templates/android_ci_full_workflow.yml.j2 +++ /dev/null @@ -1,165 +0,0 @@ -{%- extends "linux_ci_workflow.yml.j2" -%} -{% import 'common_android.yml.j2' as common_android %} -{%- set exclude_test = true -%} -{% block name -%} -# Template is at: .github/templates/android_ci_full_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: !{{ build_environment }} -{%- endblock %} - -on: -{%- if is_default %} - pull_request: -{%- endif -%} -{%- for label in ciflow_config.labels | sort %} - {%- if loop.first %} - push: - tags: - {%- endif %} - {%- if 
label != "ciflow/default" %} - - '!{{ label }}/*' - {%- endif %} -{%- endfor %} - -{% block build +%} - # building and testing in a single job since bazel runs only small subset of tests - build-and-test: - runs-on: !{{ test_runner_type }} - env: - JOB_BASE_NAME: !{{ build_environment }}-build-and-test - NUM_TEST_SHARDS: !{{ num_test_shards }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - !{{ common.setup_ec2_linux() }} - !{{ common.checkout() }} - !{{ common.calculate_docker_image(false) }} - - name: Pull Docker image - run: | - !{{ common.add_retry_to_env() }} - retry docker pull "${DOCKER_IMAGE}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - name: Output disk space left - run: | - sudo df -H - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - !{{ common.parse_ref() }} - !{{ common_android.build_android("pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v7a-build", "arm-v7a") }} - !{{ common_android.build_android("pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v8a-build", "arm-v8a") }} - !{{ common_android.build_android("pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_32-build", "x86_32") }} - !{{ common_android.build_android("pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_64-build", "x86_64") }} - - name: Build-Final-Artifcact - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - set -eux - - docker_image_libtorch_android_x86_32="${DOCKER_IMAGE}-x86_32" - docker_image_libtorch_android_x86_64="${DOCKER_IMAGE}-x86_64" - docker_image_libtorch_android_arm_v7a="${DOCKER_IMAGE}-arm-v7a" - docker_image_libtorch_android_arm_v8a="${DOCKER_IMAGE}-arm-v8a" - - echo "docker_image_commit: ${DOCKER_IMAGE}" - echo "docker_image_libtorch_android_x86_32: ${docker_image_libtorch_android_x86_32}" - echo "docker_image_libtorch_android_x86_64: ${docker_image_libtorch_android_x86_64}" - echo "docker_image_libtorch_android_arm_v7a: ${docker_image_libtorch_android_arm_v7a}" - echo "docker_image_libtorch_android_arm_v8a: ${docker_image_libtorch_android_arm_v8a}" - - # x86_32 - time docker pull "${docker_image_libtorch_android_x86_32}" >/dev/null - export id_x86_32 - id_x86_32=$(docker run -e GRADLE_OFFLINE=1 --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins "${docker_image_libtorch_android_x86_32}") - - # shellcheck disable=SC1105 - ((echo "sudo chown -R jenkins workspace") | docker exec -u jenkins -i "${id_x86_32}" bash) 2>&1 - - # arm-v7a - time docker pull "${docker_image_libtorch_android_arm_v7a}" >/dev/null - export id_arm_v7a - id_arm_v7a=$(docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins "${docker_image_libtorch_android_arm_v7a}") - - # shellcheck disable=SC1105 - ((echo "sudo chown -R jenkins workspace") | docker exec -u jenkins -i "${id_arm_v7a}" bash) 2>&1 - - mkdir -p "${GITHUB_WORKSPACE}/build_android_install_arm_v7a" - docker cp "${id_arm_v7a}:/var/lib/jenkins/workspace/build_android/install" "${GITHUB_WORKSPACE}/build_android_install_arm_v7a" - - # x86_64 - time docker pull "${docker_image_libtorch_android_x86_64}" >/dev/null - export id_x86_64 - id_x86_64=$(docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins "${docker_image_libtorch_android_x86_64}") - - # shellcheck disable=SC1105 - ((echo "sudo chown 
-R jenkins workspace") | docker exec -u jenkins -i "${id_x86_64}" bash) 2>&1 - - mkdir -p "${GITHUB_WORKSPACE}/build_android_install_x86_64" - docker cp "${id_x86_64}:/var/lib/jenkins/workspace/build_android/install" "${GITHUB_WORKSPACE}/build_android_install_x86_64" - - # arm-v8a - time docker pull "${docker_image_libtorch_android_arm_v8a}" >/dev/null - export id_arm_v8a - id_arm_v8a=$(docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins "${docker_image_libtorch_android_arm_v8a}") - - # shellcheck disable=SC1105 - ((echo "sudo chown -R jenkins workspace") | docker exec -u jenkins -i "$id_arm_v8a" bash) 2>&1 - - mkdir -p "${GITHUB_WORKSPACE}/build_android_install_arm_v8a" - docker cp "${id_arm_v8a}:/var/lib/jenkins/workspace/build_android/install" "${GITHUB_WORKSPACE}/build_android_install_arm_v8a" - - # Putting everything together - docker cp "${GITHUB_WORKSPACE}/build_android_install_arm_v7a" "${id_x86_32}:/var/lib/jenkins/workspace/build_android_install_arm_v7a" - docker cp "${GITHUB_WORKSPACE}/build_android_install_x86_64" "${id_x86_32}:/var/lib/jenkins/workspace/build_android_install_x86_64" - docker cp "${GITHUB_WORKSPACE}/build_android_install_arm_v8a" "${id_x86_32}:/var/lib/jenkins/workspace/build_android_install_arm_v8a" - - # run gradle buildRelease - # shellcheck disable=SC1105 - ((echo "sudo chown -R jenkins workspace && cd workspace && ./.circleci/scripts/build_android_gradle.sh") | docker exec \ - -e BUILD_ENVIRONMENT="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build" \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="!{{ common.squid_proxy }}" -e https_proxy="!{{ common.squid_proxy }}" -e no_proxy="!{{ common.squid_no_proxy }}" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --user jenkins \ - -u jenkins -i "${id_x86_32}" bash) 2>&1 - - mkdir -p "${GITHUB_WORKSPACE}/build_android_artifacts" - docker cp "${id_x86_32}:/var/lib/jenkins/workspace/android/artifacts.tgz" "${GITHUB_WORKSPACE}/build_android_artifacts/" - - output_image="${DOCKER_IMAGE}-android-x86_32-gradle" - docker commit "${id_x86_32}" "${output_image}" - time docker push "${output_image}" - !{{ common_android.upload_androind_binary_size("prebuilt", "${GITHUB_WORKSPACE}/build_android_artifacts/artifacts.tgz") }} - - uses: !{{ common.upload_artifact_s3_action }} - name: Store PyTorch Android Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - build_android_artifacts/artifacts.tgz - !{{ common.teardown_ec2_linux() }} -{%- endblock %} diff --git a/.github/templates/android_ci_workflow.yml.j2 b/.github/templates/android_ci_workflow.yml.j2 deleted file mode 100644 index c86b94c1ad48..000000000000 --- a/.github/templates/android_ci_workflow.yml.j2 +++ /dev/null @@ -1,111 +0,0 @@ -{%- extends "linux_ci_workflow.yml.j2" -%} -{% import 'common_android.yml.j2' as common_android %} -{%- set exclude_test = true -%} -{% block name -%} -# Template is at: .github/templates/android_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: !{{ build_environment }} -{%- endblock %} - -on: -{%- if is_default %} - pull_request: -{%- endif -%} -{%- for label in ciflow_config.labels | sort 
%} - {%- if loop.first %} - push: - tags: - {%- endif %} - {%- if label != "ciflow/default" %} - - '!{{ label }}/*' - {%- endif %} -{%- endfor %} - -{% block build +%} - # building and testing in a single job since bazel runs only small subset of tests - build-and-test: - runs-on: !{{ test_runner_type }} - env: - JOB_BASE_NAME: !{{ build_environment }}-build-and-test - NUM_TEST_SHARDS: !{{ num_test_shards }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - !{{ common.setup_ec2_linux() }} - !{{ common.checkout() }} - !{{ common.calculate_docker_image(false) }} - - name: Pull Docker image - run: | - !{{ common.add_retry_to_env() }} - retry docker pull "${DOCKER_IMAGE}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - name: Output disk space left - run: | - sudo df -H - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Build - run: | - set -e - # Unlike other gradle jobs, it's not worth building libtorch in a separate CI job and share via docker, because: - # 1) Not shareable: it's custom selective build, which is different from default libtorch mobile build; - # 2) Not parallelizable by architecture: it only builds libtorch for one architecture; - - echo "DOCKER_IMAGE: ${DOCKER_IMAGE}" - time docker pull "${DOCKER_IMAGE}" >/dev/null - - export BUILD_LITE_INTERPRETER - BUILD_LITE_INTERPRETER="1" - if [[ "${BUILD_ENVIRONMENT}" == *"full-jit" ]]; then - BUILD_LITE_INTERPRETER="0" - fi - - git submodule sync && git submodule update -q --init --recursive --depth 1 --jobs 0 - # shellcheck disable=SC2016 - export id - id=$(docker run -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e PR_LABELS \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e BUILD_LITE_INTERPRETER \ - -e http_proxy="!{{ common.squid_proxy }}" -e https_proxy="!{{ common.squid_proxy }}" -e no_proxy="!{{ common.squid_no_proxy }}" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "$(pwd):/var/lib/jenkins/workspace" \ - --cap-add=SYS_PTRACE \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --security-opt seccomp=unconfined \ - -t -d -w /var/lib/jenkins "${DOCKER_IMAGE}") - - # shellcheck disable=SC2016 - export COMMAND - # shellcheck disable=SC2016 - COMMAND='((echo "export GRADLE_OFFLINE=1" && echo "export BUILD_LITE_INTERPRETER=${BUILD_LITE_INTERPRETER}" && echo "sudo chown -R jenkins workspace && cd workspace && ./.circleci/scripts/build_android_gradle.sh") | docker exec -u jenkins -i "$id" bash) 2>&1' - echo "${COMMAND}" > ./command.sh && bash ./command.sh - # Skip docker push as this job is purely for size analysis purpose. - # Result binaries are already in `/home/circleci/project/` as it's mounted instead of copied. 
- !{{ common.parse_ref() }} - !{{ common_android.upload_androind_binary_size("custom-build-single", "") }} - !{{ common.teardown_ec2_linux() }} -{%- endblock %} diff --git a/.github/templates/bazel_ci_workflow.yml.j2 b/.github/templates/bazel_ci_workflow.yml.j2 deleted file mode 100644 index 0480835794bc..000000000000 --- a/.github/templates/bazel_ci_workflow.yml.j2 +++ /dev/null @@ -1,127 +0,0 @@ -{%- extends "linux_ci_workflow.yml.j2" -%} -{% import 'common_android.yml.j2' as common_android %} -{%- set exclude_test = true -%} -{% block name -%} -# Template is at: .github/templates/bazel_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: !{{ build_environment }} -{%- endblock %} - -on: -{%- if is_default %} - pull_request: -{%- endif -%} -{%- for label in ciflow_config.labels | sort %} - {%- if loop.first %} - push: - tags: - {%- endif %} - {%- if label != "ciflow/default" %} - - '!{{ label }}/*' - {%- endif %} -{%- endfor %} - -{% block build +%} - # building and testing in a single job since bazel runs only small subset of tests - build-and-test: - runs-on: !{{ test_runner_type }} - env: - JOB_BASE_NAME: !{{ build_environment }}-build-and-test - NUM_TEST_SHARDS: !{{ num_test_shards }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - !{{ common.setup_ec2_linux() }} - !{{ common.checkout() }} - !{{ common.calculate_docker_image(false) }} - - name: Pull Docker image - run: | - !{{ common.add_retry_to_env() }} - retry docker pull "${DOCKER_IMAGE}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - name: Output disk space left - run: | - sudo df -H - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Build - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e PR_LABELS \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e http_proxy="!{{ common.squid_proxy }}" -e https_proxy="!{{ common.squid_proxy }}" -e no_proxy="!{{ common.squid_no_proxy }}" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . 
&& sudo chown -R jenkins /dev && .jenkins/pytorch/build.sh' - !{{ common.parse_ref() }} - !{{ common_android.upload_androind_binary_size("", "")}} - - name: Test - # Time out the test phase after 3.5 hours - timeout-minutes: 210 - run: | - # detached container should get cleaned up by teardown_ec2_linux - export SHARD_NUMBER=0 - # TODO: Stop building test binaries as part of the build phase - # Make sure we copy test results from bazel-testlogs symlink to - # a regular directory ./test/test-reports - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e SHARD_NUMBER \ - -e NUM_TEST_SHARDS \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e PR_LABELS \ - -e http_proxy="!{{ common.squid_proxy }}" -e https_proxy="!{{ common.squid_proxy }}" -e no_proxy="!{{ common.squid_no_proxy }}" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && sudo chown -R jenkins /dev && .jenkins/pytorch/test.sh && cp -Lr ./bazel-testlogs ./test/test-reports' - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - !{{ common.upload_test_reports(name='bazel') }} - !{{ common.upload_downloaded_files(name='bazel') }} - !{{ common.upload_test_statistics(build_environment) }} - !{{ common.teardown_ec2_linux() }} -{%- endblock %} diff --git a/.github/templates/common.yml.j2 b/.github/templates/common.yml.j2 index 3df9cec23254..f701f92cf64c 100644 --- a/.github/templates/common.yml.j2 +++ b/.github/templates/common.yml.j2 @@ -1,4 +1,4 @@ -{%- set upload_artifact_s3_action = "seemethere/upload-artifact-s3@v3" -%} +{%- set upload_artifact_s3_action = "seemethere/upload-artifact-s3@v4" -%} {# squid_proxy is an private ELB that only available for GHA custom runners #} {%- set squid_proxy = "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -%} @@ -6,6 +6,10 @@ {%- set squid_no_proxy = "localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" -%} {%- set timeout_minutes = 240 -%} +# NOTE: If testing pytorch/builder changes you can change this variable to change what pytorch/builder reference +# the binary builds will check out +{%- set builder_branch = "main" -%} + {%- macro concurrency(build_environment) -%} concurrency: group: !{{ build_environment }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} @@ -18,6 +22,37 @@ concurrency: } {%- endmacro -%} +{%- macro gen_dispatch_rules(on_pull_request, is_scheduled, ciflow_labels, branches = ['master', 'main', 'release/*'], enable_doc_jobs = True) -%} +on: +{%- if on_pull_request %} + pull_request: +{%- endif %} + push: +{%- if enable_doc_jobs and is_scheduled %} + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ +{%- endif %} +{%- for label in ciflow_labels | sort %} + {%- if loop.first and not (enable_doc_jobs and is_scheduled) 
%} + tags: + {%- endif %} + - '!{{ label }}/*' +{%- endfor %} +{%- if not is_scheduled %} + branches: +{%- for branch in branches %} + - !{{ branch }} +{%- endfor %} +{%- endif %} +{%- if is_scheduled %} + schedule: + - cron: !{{ is_scheduled }} +{%- endif %} + workflow_dispatch: +{%- endmacro -%} + {%- macro display_ec2_information() -%} - name: Display EC2 information shell: bash @@ -32,37 +67,46 @@ concurrency: echo "ami-id: $(get_ec2_metadata ami-id)" echo "instance-id: $(get_ec2_metadata instance-id)" echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" {%- endmacro -%} {%- macro parse_ref(pytorch_directory="") -%} - name: Parse ref + shell: bash {%- if pytorch_directory %} working-directory: !{{ pytorch_directory }} {%- endif %} id: parse-ref - run: .github/scripts/parse_ref.py + run: ./.github/scripts/parse_ref.py {%- endmacro -%} -{%- macro upload_test_statistics(build_environment, when="always()", pytorch_directory="") -%} - - name: Display and upload test statistics (Click Me) +{%- macro upload_test_statistics(build_environment, when="always()", pytorch_directory="", needs_credentials=False) -%} + - name: Upload test statistics {%- if pytorch_directory %} working-directory: !{{ pytorch_directory }} {%- endif %} if: !{{ when }} - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} BRANCH: ${{ steps.parse-ref.outputs.branch }} JOB_BASE_NAME: !{{ build_environment }}-test PR_NUMBER: ${{ github.event.pull_request.number }} SHA1: ${{ github.event.pull_request.head.sha || github.sha }} TAG: ${{ steps.parse-ref.outputs.tag }} WORKFLOW_ID: '${{ github.run_id }}' + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} +{%- if needs_credentials %} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }} +{%- endif %} shell: bash run: | + set -x python3 -m pip install -r requirements.txt python3 -m pip install boto3==1.19.12 + GHA_WORKFLOW_JOB_ID=$(python3 .github/scripts/get_workflow_job_id.py "${GITHUB_RUN_ID}" "${RUNNER_NAME}") + export GHA_WORKFLOW_JOB_ID python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test {%- endmacro -%} @@ -80,19 +124,23 @@ concurrency: uses: seemethere/add-github-ssh-key@v1 with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore {%- endmacro -%} {%- macro setup_ec2_linux() -%} - !{{ display_ec2_information() }} - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - !{{ add_retry_to_env() }} - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux - name: Chown workspace run: | !{{ add_retry_to_env() }} @@ -107,9 +155,6 @@ concurrency: uses: seemethere/add-github-ssh-key@v1 with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" {%- endmacro -%} {%- macro setup_rocm_linux() -%} @@ -185,10 +230,15 @@ concurrency: docker system prune -af {%- endmacro -%} -{%- macro checkout(submodules="recursive", deep_clone=True, directory="", repository="pytorch/pytorch") -%} +{%- macro checkout(submodules="recursive", deep_clone=True, directory="", repository="pytorch/pytorch", branch="", checkout_pr_head=True) -%} - name: Checkout !{{ 'PyTorch' if repository == "pytorch/pytorch" else repository }} uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + {%- if branch %} + ref: !{{ branch }} + {%- elif checkout_pr_head %} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + {%- endif %} {%- if deep_clone %} # deep clone, to allow use of git merge-base fetch-depth: 0 @@ -209,13 +259,12 @@ concurrency: {%- endif %} {%- endmacro -%} -{%- macro upload_downloaded_files(name, artifact_name="", use_s3=True, when="always()") -%} +{%- macro upload_downloaded_files(name, config=None, shard=None, num_shards=None, runner=None, artifact_name="", use_s3=True, when="always()") -%} - name: Zip JSONs for upload if: !{{ when }} env: {%- if name == 'linux' or name == 'windows' or name == 'macos' %} - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' -{%- else %} + FILE_SUFFIX: '${{ github.job }}-!{{ config }}-!{{ shard }}-!{{ num_shards }}-!{{ runner }}'{%- else %} FILE_SUFFIX: '!{{ name }}-${{ github.job }}' {%- endif %} {%- if name == 'windows' %} @@ -247,12 +296,12 @@ concurrency: test-jsons-*.zip {%- endmacro -%} -{%- macro upload_test_reports(name, artifact_name="", use_s3=True) -%} +{%- macro upload_test_reports(name, config=None, shard=None, num_shards=None, runner=None, artifact_name="", use_s3=True) -%} - name: Zip test reports for upload if: always() env: {%- if name == 'linux' or name == 'windows' or name == 'macos' %} - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-!{{ config }}-!{{ shard }}-!{{ num_shards }}-!{{ runner }}' {%- else %} FILE_SUFFIX: '!{{ name }}-${{ github.job }}' {%- endif %} @@ -285,6 +334,25 @@ concurrency: test-reports-*.zip {%- endmacro -%} +{%- macro upload_cores(artifact_name="coredumps", config=None, shard=None, use_s3=True) -%} +{%- if use_s3 %}- uses: !{{ upload_artifact_s3_action }} + name: Store Core dumps on S3 +{%- else %}- uses: actions/upload-artifact@v2 + name: 
Store Core dumps on Github +{%- endif %} + if: failure() + with: +{%- if config != "" and shard != "" %} + name: !{{ artifact_name }}-!{{ config }}-!{{ shard }} +{%- else %} + name: !{{ artifact_name }} +{%- endif %} + retention-days: 14 + if-no-files-found: ignore + path: + ./**/core.[1-9]* +{%- endmacro -%} + {%- macro render_test_results() -%} - name: Install render_test_results dependencies if: always() diff --git a/.github/templates/common_android.yml.j2 b/.github/templates/common_android.yml.j2 deleted file mode 100644 index a0e4e781b6ad..000000000000 --- a/.github/templates/common_android.yml.j2 +++ /dev/null @@ -1,81 +0,0 @@ -{% import 'common.yml.j2' as common %} - -{%- macro upload_androind_binary_size(build_type, artifacts) -%} - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - # The artifact file is created inside docker container, which contains the result binaries. - # Now unpackage it into the project folder. The subsequent script will scan project folder - # to locate result binaries and report their sizes. - # If artifact file is not provided it assumes that the project folder has been mounted in - # the docker during build and already contains the result binaries, so this step can be skipped. - export ARTIFACTS=!{{ artifacts }} - if [ -n "${ARTIFACTS}" ]; then - tar xf "${ARTIFACTS}" -C "${GITHUB_WORKSPACE}" - cd "${GITHUB_WORKSPACE}" - fi - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - ANDROID_BUILD_TYPE=!{{ build_type}} - export ANDROID_BUILD_TYPE - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba "android" || exit 0 -{%- endmacro -%} - -{%- macro build_android(env_name, container_suffix) -%} - - name: Build-!{{ container_suffix }} - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - #!/bin/bash -eo pipefail - # Pull Docker image and run build - time docker pull "${DOCKER_IMAGE}" >/dev/null - echo "${DOCKER_IMAGE}" - export container_name - container_name=$(docker run \ - -e BUILD_ENVIRONMENT=!{{ env_name }} \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="!{{ common.squid_proxy }}" -e https_proxy="!{{ common.squid_proxy }}" -e no_proxy="!{{ common.squid_no_proxy }}" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - git submodule sync && git submodule update -q --init --recursive --depth 1 --jobs 0 - docker cp "${GITHUB_WORKSPACE}/." 
"${container_name}:/var/lib/jenkins/workspace" - # shellcheck disable=SC1105 - ((echo "sudo chown -R jenkins . && .jenkins/pytorch/build.sh && find ${BUILD_ROOT} -type f -name "*.a" -or -name "*.o" -delete") | docker exec -u jenkins -i "${container_name}" bash) 2>&1 - - # Copy dist folder back - export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}-!{{ container_suffix }} - docker cp "${container_name}:/var/lib/jenkins/workspace/dist" "${GITHUB_WORKSPACE}/." || echo "Dist folder not found" - docker commit "${container_name}" "${COMMIT_DOCKER_IMAGE}" - time docker push "${COMMIT_DOCKER_IMAGE}" -{%- endmacro -%} diff --git a/.github/templates/docker_builds_ci_workflow.yml.j2 b/.github/templates/docker_builds_ci_workflow.yml.j2 deleted file mode 100644 index 224f683a35a4..000000000000 --- a/.github/templates/docker_builds_ci_workflow.yml.j2 +++ /dev/null @@ -1,60 +0,0 @@ -{% import 'common.yml.j2' as common %} - -{%- block name -%} -# Template is at: .github/templates/docker_builds_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: !{{ build_environment }} -{%- endblock %} - -on: - workflow_dispatch: - pull_request: - types: [opened, synchronize, reopened] - paths: - - '.circleci/docker/**' - - '.github/workflows/generated-docker-builds.yml' -{%- if is_scheduled %} - schedule: - - cron: !{{ is_scheduled }} -{%- endif %} -!{{ common.concurrency(build_environment) }} - -env: - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - AWS_DEFAULT_REGION: us-east-1 - -jobs: -{% block docker_build +%} - docker-build: - runs-on: linux.2xlarge - timeout-minutes: !{{ common.timeout_minutes }} - strategy: - matrix: - include: - {%- for docker_image in docker_images %} - - docker_image_base: '!{{ docker_image }}' - docker_image_short_name: '!{{ docker_image.split('/')[-1] }}' - {%- endfor %} - env: - DOCKER_IMAGE_BASE: '${{ matrix.docker_image_base }}' - name: docker-build (${{ matrix.docker_image_short_name }}) - steps: - !{{ common.setup_ec2_linux() }} - !{{ common.checkout() }} - !{{ common.calculate_docker_image(true) }} - - name: Pull Docker image - run: | - !{{ common.add_retry_to_env() }} - retry docker pull "${DOCKER_IMAGE}" - !{{ common.parse_ref() }} - !{{ common.teardown_ec2_linux() }} - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af -{%- endblock %} diff --git a/.github/templates/ios_ci_workflow.yml.j2 b/.github/templates/ios_ci_workflow.yml.j2 deleted file mode 100644 index f837a500a264..000000000000 --- a/.github/templates/ios_ci_workflow.yml.j2 +++ /dev/null @@ -1,183 +0,0 @@ -{% import 'common.yml.j2' as common %} - -{%- block name -%} -# Template is at: .github/templates/ios_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: !{{ build_environment }} -{%- endblock %} - -on: -{%- if is_default %} - pull_request: -{%- endif -%} - -{%- if is_scheduled %} - schedule: - - cron: !{{ is_scheduled }} -{%- else %} - push: - branches: - - master - - release/* -{%- endif %} -{%- for label in ciflow_config.labels | sort %} - {%- if loop.first %} - tags: - {%- endif %} - {%- if label != "ciflow/default" %} - - '!{{ label }}/*' - {%- endif %} -{%- endfor %} - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: !{{ build_environment }} - IN_CI: 1 - IS_GHA: 1 - IOS_PLATFORM: !{{ 
ios_platform }} - IOS_ARCH: !{{ ios_arch }} -!{{ common.set_xcode_version(xcode_version) }} - -jobs: -{% block build +%} - build: - # NOTE: These builds will not run successfully without running on `pytorch/pytorch` due to the limitations - # of accessing secrets from forked pull requests and IOS' dependency on secrets for their build/test - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - runs-on: macos-10.15 - timeout-minutes: !{{ common.timeout_minutes }} - env: - JOB_BASE_NAME: !{{ build_environment }}-build - IOS_CERT_KEY_2022: ${{ secrets.IOS_CERT_KEY_2022 }} - IOS_CERT_SECRET: ${{ secrets.IOS_CERT_SECRET }} - IOS_DEV_TEAM_ID: ${{ secrets.IOS_DEV_TEAM_ID }} - IOS_SIGN_KEY_2022: ${{ secrets.IOS_SIGN_KEY_2022 }} - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - !{{ common.checkout() }} - - name: Populate CI build options - run: | - # Most builds use the lite interpreter, if certain builds shouldn't - # build the lite interpreter this env variable should get over-written - # in the following case statement - echo "BUILD_LITE_INTERPRETER=1" >> "${GITHUB_ENV}" - - case ${BUILD_ENVIRONMENT} in - *metal*) - echo "USE_PYTORCH_METAL=1" >> "${GITHUB_ENV}" - ;; - *full_jit*) - echo "BUILD_LITE_INTERPRETER=0" >> "${GITHUB_ENV}" - ;; - *custom*) - echo "SELECTED_OP_LIST=${GITHUB_WORKSPACE}/ios/TestApp/custom_build/mobilenetv2.yaml" >> "${GITHUB_ENV}" - ;; - *coreml*) - echo "USE_COREML_DELEGATE=1" >> "${GITHUB_ENV}" - ;; - esac - - name: Install brew dependencies - run: | - # Install dependencies - brew install libtool - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - conda install -y \ - cffi \ - cmake \ - mkl \ - mkl-include \ - ninja \ - numpy \ - pyyaml \ - requests \ - setuptools \ - typing_extensions - - name: Run Fastlane - run: | - set -x - cd ios/TestApp - # install fastlane - sudo gem install bundler && bundle install - # install certificates - echo "${IOS_CERT_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o Certificates.p12 - rm cert.txt - bundle exec fastlane install_root_cert - bundle exec fastlane install_dev_cert - # install the provisioning profile - PROFILE=PyTorch_CI_2022.mobileprovision - PROVISIONING_PROFILES=~/Library/MobileDevice/Provisioning\ Profiles - mkdir -pv "${PROVISIONING_PROFILES}" - cd "${PROVISIONING_PROFILES}" - echo "${IOS_SIGN_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o ${PROFILE} - rm cert.txt - - name: Build - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - export TCLLIBPATH="/usr/local/lib" - python -VV - export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"} - scripts/build_ios.sh - - name: Run Build Test - run: | - PROFILE=PyTorch_CI_2022 - # run the ruby build script - if ! [ -x "$(command -v xcodebuild)" ]; then - echo 'Error: xcodebuild is not installed.' 
- exit 1 - fi - if [ "${IOS_PLATFORM}" != "SIMULATOR" ]; then - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" -c "${PROFILE}" -t "${IOS_DEV_TEAM_ID}" - else - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" - fi -{%- if ios_platform == "SIMULATOR" %} - - name: Run Simulator Tests - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - pip3 install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html - # generate models for differnet backends - cd "${GITHUB_WORKSPACE}/ios/TestApp/benchmark" - mkdir -p ../models - if [ "${USE_COREML_DELEGATE}" == 1 ]; then - pip install coremltools==5.0b5 - pip install six==1.16.0 - python coreml_backend.py - else - python trace_model.py - fi - if [ "${BUILD_LITE_INTERPRETER}" == 1 ]; then - echo "Setting up the TestApp for LiteInterpreter" - ruby setup.rb --lite 1 - else - echo "Setting up the TestApp for Full JIT" - ruby setup.rb - fi - cd "${GITHUB_WORKSPACE}/ios/TestApp" - instruments -s -devices - if [ "${BUILD_LITE_INTERPRETER}" == 1 ]; then - if [ "${USE_COREML_DELEGATE}" == 1 ]; then - fastlane scan --only_testing TestAppTests/TestAppTests/testCoreML - else - fastlane scan --only_testing TestAppTests/TestAppTests/testLiteInterpreter - fi - else - fastlane scan --only_testing TestAppTests/TestAppTests/testFullJIT - fi -{%- endif -%} -{% endblock +%} - -!{{ common.concurrency(build_environment) }} diff --git a/.github/templates/linux_binary_build_workflow.yml.j2 b/.github/templates/linux_binary_build_workflow.yml.j2 index 86144ff3ddd3..e183a374ffea 100644 --- a/.github/templates/linux_binary_build_workflow.yml.j2 +++ b/.github/templates/linux_binary_build_workflow.yml.j2 @@ -9,17 +9,22 @@ name: !{{ build_environment }} on: push: + {%- if branches == "nightly" %} # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + {%- endif %} branches: - - nightly + - !{{ branches }} + {%- if branches == "nightly" %} tags: # NOTE: Binary build pipelines should only get triggered on release candidate builds # Release candidate tags look like: v1.11.0-rc1 - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + {%- endif %} {%- for label in ciflow_config.labels | sort %} - {%- if label != "ciflow/default" %} + {%- if loop.first and branches != "nightly" %} + tags: + {%- endif %} - '!{{ label }}/*' - {%- endif %} {%- endfor %} workflow_dispatch: @@ -53,7 +58,7 @@ jobs: steps: !{{ common.setup_ec2_linux() }} !{{ common.checkout(deep_clone=False, directory="pytorch") }} - !{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder") }} + !{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder", branch=common.builder_branch) }} {%- if config["gpu_arch_type"] == 'cuda' and config["gpu_arch_version"].startswith('11') %} - name: Set BUILD_SPLIT_CUDA run: | @@ -105,7 +110,9 @@ jobs: !{{ config["build_name"] }}-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: !{{ config["build_name"] }}-build -{%- if config["gpu_arch_type"] == "cuda" %} +{%- if config["gpu_arch_type"] == "rocm" %} + runs-on: linux.rocm.gpu +{%- elif config["gpu_arch_type"] == "cuda" %} runs-on: linux.4xlarge.nvidia.gpu {%- else %} runs-on: linux.4xlarge @@ -113,28 +120,34 @@ jobs: timeout-minutes: !{{ common.timeout_minutes }} !{{ upload.binary_env(config) }} steps: +{%- if config["gpu_arch_type"] == "rocm" %} + 
!{{ common.setup_rocm_linux() }} +{%- else %} !{{ common.setup_ec2_linux() }} - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b +{%- endif %} + - uses: seemethere/download-artifact-s3@v3 name: Download Build Artifacts with: name: !{{ config["build_name"] }} path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder -{%- if config["gpu_arch_type"] == "cuda" %} - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ + !{{ common.checkout(deep_clone=False, directory="pytorch") }} + !{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder", branch=common.builder_branch) }} +{%- if config["gpu_arch_type"] == "rocm" %} + - name: ROCm set GPU_FLAG run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" +{%- elif config["gpu_arch_type"] == "cuda" %} + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd {%- endif %} - name: Pull Docker image run: | @@ -173,6 +186,12 @@ jobs: # Generate test script docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" +{%- if config["gpu_arch_type"] == "rocm" %} + !{{ common.teardown_rocm_linux() }} +{%- else %} !{{ common.teardown_ec2_linux("pytorch/") }} +{%- endif %} + {%- if branches == "nightly" %} !{{ upload.upload_binaries(config) }} + {%- endif %} {%- endfor %} diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2 deleted file mode 100644 index 660c0a74ba59..000000000000 --- a/.github/templates/linux_ci_workflow.yml.j2 +++ /dev/null @@ -1,455 +0,0 @@ -{% import 'common.yml.j2' as common %} - -{%- block name -%} -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: !{{ build_environment }} -{%- endblock %} - -on: -{%- if is_default %} - pull_request: -{%- endif %} - push: -{%- if enable_doc_jobs and is_scheduled %} - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ -{%- endif %} -{%- for label in ciflow_config.labels | sort %} - {%- if loop.first and not (enable_doc_jobs and is_scheduled) %} - tags: - {%- endif %} - {%- if label != "ciflow/default" %} - - '!{{ label }}/*' - {%- endif %} -{%- endfor %} -{%- if not is_scheduled and not only_on_pr %} - branches: - - master - - release/* -{%- endif %} -{%- if is_scheduled and not only_on_pr %} - schedule: - - cron: !{{ is_scheduled }} -{%- endif %} - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: !{{ build_environment }} - DOCKER_IMAGE_BASE: !{{ docker_image_base }} - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - 
XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -{%- if build_with_debug %} - DEBUG: 1 -{%- endif %} -!{{ common.concurrency(build_environment) }} - -jobs: -{% block build +%} - build: - runs-on: linux.2xlarge - timeout-minutes: !{{ common.timeout_minutes }} - env: - JOB_BASE_NAME: !{{ build_environment }}-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - !{{ common.setup_ec2_linux() }} - !{{ common.checkout() }} - !{{ common.calculate_docker_image(false) }} - - name: Pull Docker image - run: | - !{{ common.add_retry_to_env() }} - retry docker pull "${DOCKER_IMAGE}" - !{{ common.parse_ref() }} - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="!{{ common.squid_proxy }}" -e https_proxy="!{{ common.squid_proxy }}" -e no_proxy="!{{ common.squid_no_proxy }}" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
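The build above runs as the container's `jenkins` user against a bind-mounted `${GITHUB_WORKSPACE}`, so the "Chown workspace" step hands ownership of everything the build wrote back to the runner account before later steps touch the checkout. A minimal standalone sketch of that cleanup, assuming a generic alpine image in place of the pinned ECR mirror:

    #!/usr/bin/env bash
    # Sketch only: restore ownership of a workspace that a containerized build
    # (running as a different uid, e.g. "jenkins") wrote into through a bind mount.
    set -euo pipefail

    WORKSPACE="$(pwd)"          # directory that was mounted into the build container
    ALPINE_IMAGE="alpine:3.15"  # assumption; the workflow pins an ECR-mirrored alpine image

    docker run --rm \
      -v "${WORKSPACE}:/v" \
      -w /v \
      "${ALPINE_IMAGE}" \
      chown -R "$(id -u):$(id -g)" .

Doing the chown from a throwaway container avoids needing sudo on the runner itself.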
- {%- if build_generates_artifacts %} - - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: !{{ common.upload_artifact_s3_action }} - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - {%- endif %} - !{{ common.teardown_ec2_linux() }} - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af -{%- endblock %} -{%- if not exclude_test %} -{% block test +%} - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: !{{ common.timeout_minutes }} - env: - TEST_RUNNER_TYPE: !{{ test_runner_type }} - ENABLE_DISTRIBUTED_TEST: !{{ enable_distributed_test }} - ENABLE_JIT_LEGACY_TEST: !{{ enable_jit_legacy_test }} - ENABLE_MULTIGPU_TEST: !{{ enable_multigpu_test }} - ENABLE_NOGPU_NO_AVX_TEST: !{{ enable_nogpu_no_avx_test }} - ENABLE_NOGPU_NO_AVX2_TEST: !{{ enable_nogpu_no_avx2_test }} - ENABLE_SLOW_TEST: !{{ enable_slow_test }} - ENABLE_DOCS_TEST: !{{ enable_docs_test }} - ENABLE_BACKWARDS_COMPAT_TEST: !{{ enable_backwards_compat_test }} - ENABLE_XLA_TEST: !{{ enable_xla_test }} - ENABLE_NOARCH_TEST: !{{ enable_noarch_test }} - NUM_TEST_SHARDS: !{{ num_test_shards }} - MULTIGPU_RUNNER_TYPE: !{{ multigpu_runner_type }} - DISTRIBUTED_GPU_RUNNER_TYPE: !{{ distributed_gpu_runner_type }} - NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - timeout-minutes: !{{ common.timeout_minutes }} - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: !{{ build_environment }}-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - steps: -{%- if 'rocm' in test_runner_type %} - !{{ common.setup_rocm_linux() }} -{%- else %} - !{{ common.setup_ec2_linux() }} -{%- endif %} - !{{ common.checkout() }} - - name: Pull Docker image - run: | - !{{ common.add_retry_to_env() }} - retry docker pull "${DOCKER_IMAGE}" -{%- if 'rocm' in test_runner_type %} - - name: ROCm set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'rocm') && !contains(matrix.config, 'nogpu') }} - run: | - echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" -{%- else %} - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && 
!contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" -{%- endif %} - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | -{%- if 'rocm' in test_runner_type %} - df -H -{%- else %} - sudo df -H -{%- endif %} - !{{ common.parse_ref() }} - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after !{{ timeout_after }} minutes - timeout-minutes: !{{ timeout_after }} - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi -{%- if 'rocm' not in test_runner_type %} - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=!{{ common.squid_proxy }} -e https_proxy=!{{ common.squid_proxy }} -e no_proxy=!{{ common.squid_no_proxy }}" - fi -{%- endif %} - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop building test binaries as part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ -{%- if 'rocm' not in test_runner_type %} - ${PROXY_ENV} \ -{%- endif %} - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ -{%- if 'rocm' not in test_runner_type %} - --ipc=host \ -{%- endif %} - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) -{%- if 'rocm' in test_runner_type %} - # jenkins user does not have write permission to mounted workspace; work-around by copying within container to jenkins home - docker exec -t "${container_name}" sh -c "cd .. 
&& cp -R workspace pytorch && cd pytorch && pip install dist/*.whl && ${TEST_COMMAND}" - # copy test results back to the mounted workspace, needed sudo, resulting permissions were correct - docker exec -t "${container_name}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test" -{%- else %} - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" -{%- endif %} -{%- if 'rocm' not in test_runner_type %} - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . -{%- endif %} - !{{ common.render_test_results() }} -{%- if 'rocm' in test_runner_type %} - !{{ common.upload_downloaded_files(name='linux', use_s3=False) }} - !{{ common.upload_test_reports(name='linux', artifact_name="test-reports", use_s3=False) }} -{%- else %} - !{{ common.upload_downloaded_files(name='linux') }} - !{{ common.upload_test_reports(name='linux') }} -{%- endif %} - !{{ common.upload_test_statistics(build_environment) }} -{%- if 'rocm' in test_runner_type %} - !{{ common.teardown_rocm_linux() }} -{%- else %} - !{{ common.teardown_ec2_linux() }} -{%- endif %} -{% endblock %} -{%- endif -%} -{%- if enable_doc_jobs %} - build-docs: - runs-on: linux.2xlarge - timeout-minutes: !{{ common.timeout_minutes }} - strategy: - matrix: - docs_type: [cpp, python] - needs: [build] - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - DOCS_TYPE: ${{ matrix.docs_type }} - WITH_PUSH: ${{ github.event_name == 'schedule' || startsWith(github.event.ref, 'refs/tags/v') }} - steps: - !{{ common.setup_ec2_linux() }} - !{{ common.checkout() }} - - name: Pull Docker image - run: | - !{{ common.add_retry_to_env() }} - retry docker pull "${DOCKER_IMAGE}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip -{%- if is_scheduled %} - - name: Generate netrc (only for docs-push) - if: ${{ github.event_name == 'schedule' || startsWith(github.event.ref, 'refs/tags/v') }} - env: - GITHUB_PYTORCHBOT_TOKEN: ${{ secrets.GH_PYTORCHBOT_TOKEN }} - run: | - # set credentials for https pushing - echo "machine github.com" > "${RUNNER_TEMP}/.netrc" - echo "login pytorchbot" >> "${RUNNER_TEMP}/.netrc" - echo "password ${GITHUB_PYTORCHBOT_TOKEN}" >> "${RUNNER_TEMP}/.netrc" -{%- endif %} - - name: Build ${{ matrix.docs_type }} docs - run: | - set -ex - time docker pull "${DOCKER_IMAGE}" > /dev/null - # Convert refs/tags/v1.12.0rc3 into 1.12 - if [[ "${GITHUB_REF}" =~ ^refs/tags/v([0-9]+\.[0-9]+)\.* ]]; then - target="${BASH_REMATCH[1]}" - else - target="master" - fi - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e IN_CI \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SHA1="$GITHUB_SHA" \ - -e DOCS_VERSION="${target}" \ - -e DOCS_TYPE \ - -e PR_LABELS \ - -e WITH_PUSH \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ -{%- if is_scheduled %} - -v "${RUNNER_TEMP}/.netrc":/var/lib/jenkins/.netrc \ -{%- endif %} - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t 
"${container_name}" bash -c "sudo chown -R jenkins . && pip install dist/*.whl && ./.circleci/scripts/${DOCS_TYPE}_doc_push_script.sh" - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: !{{ common.upload_artifact_s3_action }} - name: Upload Python Docs Preview - if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'python' }} - with: - retention-days: 14 - s3-bucket: doc-previews - if-no-files-found: error - path: pytorch.github.io/docs/master/ - s3-prefix: pytorch/${{ github.event.pull_request.number }} - - uses: !{{ common.upload_artifact_s3_action }} - name: Upload C++ Docs Preview - if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'cpp' }} - with: - retention-days: 14 - if-no-files-found: error - s3-bucket: doc-previews - path: cppdocs/ - s3-prefix: pytorch/${{ github.event.pull_request.number }}/cppdocs -{%- endif -%} diff --git a/.github/templates/macos_binary_build_workflow.yml.j2 b/.github/templates/macos_binary_build_workflow.yml.j2 index 604d8251bc9c..2640aab74fc8 100644 --- a/.github/templates/macos_binary_build_workflow.yml.j2 +++ b/.github/templates/macos_binary_build_workflow.yml.j2 @@ -1,4 +1,5 @@ {% import 'common.yml.j2' as common %} +{% import 'upload.yml.j2' as upload %} {%- block name -%} # Template is at: .github/templates/macos_binary_build_workflow.yml.j2 @@ -6,24 +7,6 @@ name: !{{ build_environment }} {%- endblock %} -{%- macro binary_env(config) -%} - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: !{{ config["package_type"] }} - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu -{%- if config["package_type"] == "libtorch" %} - LIBTORCH_VARIANT: !{{ config["libtorch_variant"] }} - DESIRED_DEVTOOLSET: !{{ config["devtoolset"] }} - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" -{%- else %} - DESIRED_PYTHON: "!{{ config["python_version"] }}" -{%- endif %} -{%- endmacro %} - {%- macro set_runner_specific_vars() -%} # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -50,9 +33,10 @@ on: # Release candidate tags look like: v1.11.0-rc1 - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ {%- for label in ciflow_config.labels | sort %} - {%- if label != "ciflow/default" %} + {%- if loop.first and branches != "nightly" %} + tags: + {%- endif %} - '!{{ label }}/*' - {%- endif %} {%- endfor %} workflow_dispatch: @@ -76,14 +60,19 @@ env: jobs: {%- for config in build_configs %} !{{ config["build_name"] }}-build: + if: ${{ github.repository_owner == 'pytorch' }} + {%- if cross_compile_arm64 %} + runs-on: macos-12 + {%- else %} runs-on: macos-10.15 + {%- endif %} {%- if config["package_type"] == "libtorch" %} # libtorch builds take a long time on github hosted runners timeout-minutes: 720 {%- else %} timeout-minutes: !{{ common.timeout_minutes }} {%- endif %} - !{{ binary_env(config) }} + !{{ upload.binary_env(config, true) }} # For sccache access (only on non-forked PRs) AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} @@ -96,16 +85,8 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" 
-b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + !{{ common.checkout(deep_clone=False, directory="pytorch") }} + !{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder", branch=common.builder_branch) }} - name: Install sccache (only for non-forked PRs, and pushes to trunk) if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} run: | @@ -129,53 +110,5 @@ jobs: retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - !{{ config["build_name"] }}-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: !{{ config["build_name"] }}-build - !{{ binary_env(config) }} - steps: - !{{ common.setup_ec2_linux() }} - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: !{{ config["build_name"] }} - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - !{{ common.teardown_ec2_linux() }} + !{{ upload.upload_binaries(config, has_test=False, use_s3=False) }} {%- endfor %} diff --git a/.github/templates/macos_ci_workflow.yml.j2 b/.github/templates/macos_ci_workflow.yml.j2 deleted file mode 100644 index f8b0d4cc30eb..000000000000 --- a/.github/templates/macos_ci_workflow.yml.j2 +++ /dev/null @@ -1,154 +0,0 @@ -{% import 'common.yml.j2' as common %} - -{%- block name -%} -# Template is at: .github/templates/macos_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: !{{ build_environment }} -{%- endblock %} - -on: -{%- if is_default -%} - pull_request: -{%- endif -%} - -{%- if is_scheduled %} - schedule: - - cron: !{{ is_scheduled }} -{%- else %} - push: - branches: - - master - - release/* -{%- endif %} -{%- for label in ciflow_config.labels | sort %} - {%- if loop.first %} - tags: - {%- endif %} - {%- if label != "ciflow/default" %} - - '!{{ label 
}}/*' - {%- endif %} -{%- endfor %} - workflow_dispatch: - -# For setup-miniconda, see https://github.com/conda-incubator/setup-miniconda/issues/179 -defaults: - run: - shell: bash -e -l {0} -env: - BUILD_ENVIRONMENT: !{{ build_environment }} - COMPACT_JOB_NAME: !{{ build_environment }} - IN_CI: 1 - IS_GHA: 1 - PYTORCH_RETRY_TEST_CASES: 1 -!{{ common.set_xcode_version(xcode_version) }} - -jobs: -{% block build +%} - build: - runs-on: !{{ test_runner_type }} - env: - JOB_BASE_NAME: !{{ build_environment }} - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - !{{ common.checkout() }} - !{{ common.setup_miniconda("3.8") }} - - name: Install macOS homebrew dependencies - run: | - # Install dependencies - brew install libomp - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Build - run: | - echo "CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"}" >> "${GITHUB_ENV}" - .jenkins/pytorch/macos-build.sh -{%- if build_generates_artifacts %} - - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ - - uses: actions/upload-artifact@v2 - name: Store PyTorch Build Artifacts on GHA - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip -{%- endif %} -{% endblock +%} -{%- if not exclude_test %} -{% block test +%} - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: !{{ common.timeout_minutes }} - env: - TEST_RUNNER_TYPE: !{{ test_runner_type }} - ENABLE_DISTRIBUTED_TEST: !{{ enable_distributed_test }} - NUM_TEST_SHARDS: !{{ num_test_shards }} - PR_BODY: ${{ github.event.pull_request.body }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - timeout-minutes: !{{ common.timeout_minutes }} - env: - JOB_BASE_NAME: !{{ build_environment }}-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - steps: - !{{ common.checkout(submodules="false") }} - - uses: actions/download-artifact@v2 - name: Download PyTorch Build Artifacts from GHA - with: - name: ${{ env.BUILD_ENVIRONMENT }} - path: . 
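The macOS build and test jobs above hand work off through a single `artifacts.zip`: the build job zips `dist/` and uploads it with `actions/upload-artifact`, and the test job downloads it, unzips it, and pip-installs the wheel before calling `.jenkins/pytorch/macos-test.sh` (see the Unzip and Test steps that follow). A rough local equivalent of that handoff, with the CI upload/download replaced by the local filesystem:

    #!/usr/bin/env bash
    # Sketch of the build -> test artifact handoff; in CI the archive moves between
    # jobs via actions/upload-artifact and actions/download-artifact instead.
    set -euo pipefail

    # "Build job": package the wheel output into one archive.
    zip -1 -r artifacts.zip dist/

    # "Test job": unpack and install the wheel, then run the test driver.
    unzip -o artifacts.zip
    python3 -mpip install dist/*.whl
    # .jenkins/pytorch/macos-test.sh would run here in the real workflow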
- - name: Unzip artifacts - run: | - unzip -o artifacts.zip - !{{ common.setup_miniconda("3.8") }} - - name: Install macOS homebrew dependencies - run: | - # Install dependencies - brew install libomp - !{{ common.parse_ref() }} - - name: Test - run: | - python3 -mpip install dist/*.whl - .jenkins/pytorch/macos-test.sh - !{{ common.render_test_results() }} - !{{ common.upload_downloaded_files(name='macos', artifact_name="test-jsons", use_s3=False) }} - !{{ common.upload_test_reports("macos", artifact_name="test-reports", use_s3=False) }} - !{{ common.upload_test_statistics(build_environment) }} -{% endblock +%} -{%- endif %} - -!{{ common.concurrency(build_environment) }} diff --git a/.github/templates/upload.yml.j2 b/.github/templates/upload.yml.j2 index 4dc13971da1f..63bec412997e 100644 --- a/.github/templates/upload.yml.j2 +++ b/.github/templates/upload.yml.j2 @@ -19,8 +19,13 @@ {%- endif %} SKIP_ALL_TESTS: 1 {%- if config["package_type"] == "libtorch" %} +{%- if config["libtorch_config"] %} + LIBTORCH_CONFIG: !{{ config["libtorch_config"] }} +{%- endif %} LIBTORCH_VARIANT: !{{ config["libtorch_variant"] }} +{%- if config["devtoolset"] %} DESIRED_DEVTOOLSET: !{{ config["devtoolset"] }} +{%- endif %} {%- if is_windows %} # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason @@ -32,17 +37,25 @@ {%- endmacro %} -{%- macro upload_binaries(config, is_windows=False) -%} +{%- macro upload_binaries(config, is_windows=False, has_test=True, use_s3=True) -%} !{{ config["build_name"] }}-upload: # Uploading runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts if: ${{ github.repository_owner == 'pytorch' }} +{%- if has_test %} needs: !{{ config["build_name"] }}-test +{%- else %} + needs: !{{ config["build_name"] }}-build +{%- endif %} !{{ binary_env(config, is_windows) }} steps: !{{ common.setup_ec2_linux() }} - name: Clone pytorch/pytorch uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b +{%- if use_s3 %} + - uses: seemethere/download-artifact-s3@v3 +{%- else %} + - uses: actions/download-artifact@v2 +{%- endif %} name: Download Build Artifacts with: name: !{{ config["build_name"] }} @@ -63,8 +76,8 @@ PKG_DIR: "${{ runner.temp }}/artifacts" UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} run: | docker run --rm -i \ diff --git a/.github/templates/windows_binary_build_workflow.yml.j2 b/.github/templates/windows_binary_build_workflow.yml.j2 index 5f491767c06a..0fcfbf9096b8 100644 --- a/.github/templates/windows_binary_build_workflow.yml.j2 +++ b/.github/templates/windows_binary_build_workflow.yml.j2 @@ -21,17 +21,22 @@ name: !{{ build_environment }} on: push: + {%- if branches == "nightly" %} # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + {%- endif %} branches: - - nightly + - !{{ branches }} + {%- if branches == "nightly" %} tags: # NOTE: Binary build pipelines should only get triggered on release candidate builds # Release candidate tags look like: v1.11.0-rc1 - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + 
{%- endif %} {%- for label in ciflow_config.labels | sort %} - {%- if label != "ciflow/default" %} + {%- if loop.first and branches != "nightly" %} + tags: + {%- endif %} - '!{{ label }}/*' - {%- endif %} {%- endfor %} workflow_dispatch: @@ -54,22 +59,15 @@ env: jobs: {%- for config in build_configs %} !{{ config["build_name"] }}-build: + if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge timeout-minutes: !{{ common.timeout_minutes }} !{{ upload.binary_env(config, True) }} steps: !{{ common.setup_ec2_windows() }} !{{ set_runner_specific_vars() }} - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + !{{ common.checkout(deep_clone=False, directory="pytorch") }} + !{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder", branch=common.builder_branch) }} - name: Populate binary env shell: bash run: | @@ -99,21 +97,13 @@ jobs: steps: !{{ common.setup_ec2_windows() }} !{{ set_runner_specific_vars() }} - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + - uses: seemethere/download-artifact-s3@v3 name: Download Build Artifacts with: name: !{{ config["build_name"] }} path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + !{{ common.checkout(deep_clone=False, directory="pytorch") }} + !{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder", branch=common.builder_branch) }} - name: Populate binary env shell: bash run: | @@ -123,5 +113,7 @@ jobs: run: | "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" !{{ common.wait_and_kill_ssh_windows('pytorch') }} + {%- if branches == "nightly" %} !{{ upload.upload_binaries(config, True) }} + {%- endif %} {%- endfor %} diff --git a/.github/templates/windows_ci_workflow.yml.j2 b/.github/templates/windows_ci_workflow.yml.j2 deleted file mode 100644 index 21f067101d9c..000000000000 --- a/.github/templates/windows_ci_workflow.yml.j2 +++ /dev/null @@ -1,231 +0,0 @@ -{% import 'common.yml.j2' as common %} - -{%- macro wait_and_kill_ssh() -%} - - name: Wait until all sessions have drained - shell: powershell - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 -{%- endmacro -%} - -# Template is at: .github/templates/windows_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: !{{ build_environment }} - -on: -{%- if is_default %} - pull_request: -{%- endif %} - push: -{%- for label in ciflow_config.labels | sort %} - {%- if loop.first %} - tags: - {%- endif %} - {%- if label != "ciflow/default" %} - - '!{{ label }}/*' - {%- endif %} -{%- endfor %} -{%- if not is_scheduled %} - branches: - - master - - release/* -{%- else %} - schedule: - - cron: !{{ is_scheduled }} -{%- endif %} - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: !{{ build_environment }} - BUILD_WHEEL: 1 - MAX_JOBS: 8 - CUDA_VERSION: "!{{ cuda_version }}" - IN_CI: 1 - IS_GHA: 1 - 
INSTALL_WINDOWS_SDK: 1 - PYTHON_VERSION: "3.8" - PYTORCH_RETRY_TEST_CASES: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - SCCACHE_BUCKET: "ossci-compiler-cache" - VC_PRODUCT: "BuildTools" - VC_VERSION: "" - VS_VERSION: "16.8.6" - VC_YEAR: "2019" - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - no_proxy: !{{ common.squid_no_proxy }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} -{%- if build_with_debug %} - DEBUG: 1 -{%- endif %} -{%- if cuda_version != "cpu" %} - TORCH_CUDA_ARCH_LIST: "7.0" -{%- endif %} - USE_CUDA: !{{ 1 if cuda_version != "cpu" else 0 }} - -!{{ common.concurrency(build_environment) }} - -jobs: - build: - runs-on: "windows.4xlarge" - timeout-minutes: !{{ common.timeout_minutes }} - env: - JOB_BASE_NAME: !{{ build_environment }}-build - http_proxy: "!{{ common. squid_proxy }}" - https_proxy: "!{{ common.squid_proxy }}" - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - !{{ common.checkout() }} - !{{ common.display_ec2_information() }} - - name: Install Visual Studio 2019 toolchain - shell: powershell - run: | - .\.circleci\scripts\vs_install.ps1 -{%- if cuda_version != "cpu" %} - - name: Install Cuda - shell: bash - run: | - .circleci/scripts/windows_cuda_install.sh - - name: Install Cudnn - shell: bash - run: | - .circleci/scripts/windows_cudnn_install.sh -{%- endif %} - !{{ common.parse_ref() }} - - name: Build - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - .jenkins/pytorch/win-build.sh - # Upload to github so that people can click and download artifacts - - name: Upload artifacts to s3 - uses: !{{ common.upload_artifact_s3_action }} - with: - retention-days: 14 - if-no-files-found: error - name: ${{ env.BUILD_ENVIRONMENT }} - path: C:\${{ github.run_id }}\build-results - !{{ common.wait_and_kill_ssh_windows() }} - - name: Cleanup build-results and workspaces - if: always() - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf "${PYTORCH_FINAL_PACKAGE_DIR}" - rm -rf ./* - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: !{{ common.timeout_minutes }} - env: - TEST_RUNNER_TYPE: !{{ test_runner_type }} - NUM_TEST_SHARDS: !{{ num_test_shards }} - NUM_TEST_SHARDS_ON_PULL_REQUEST: !{{ num_test_shards_on_pull_request }} - PR_BODY: ${{ github.event.pull_request.body }} - NOGPU_RUNNER_TYPE: windows.4xlarge - ENABLE_FORCE_ON_CPU_TEST: !{{ enable_force_on_cpu_test }} - RUN_SMOKE_TESTS_ONLY_ON_PR: !{{ only_run_smoke_tests_on_pull_request }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - timeout-minutes: !{{ common.timeout_minutes }} - 
env: - JOB_BASE_NAME: !{{ build_environment }}-test - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - TEST_CONFIG: ${{ matrix.config }} - http_proxy: "!{{ common.squid_proxy }}" - https_proxy: "!{{ common.squid_proxy }}" - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - steps: - !{{ common.display_ec2_information() }} - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - !{{ common.checkout() }} - - name: Install Visual Studio 2019 toolchain - shell: powershell - run: | - .\.circleci\scripts\vs_install.ps1 -{%- if cuda_version != "cpu" %} - - name: Install Cuda - if: ${{ matrix.config != 'force_on_cpu' }} - shell: bash - run: | - .circleci/scripts/windows_cuda_install.sh - - name: Install Cudnn - if: ${{ matrix.config != 'force_on_cpu' }} - shell: bash - run: | - .circleci/scripts/windows_cudnn_install.sh -{%- endif %} - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - path: C:\${{ github.run_id }}\build-results - - name: Check build-results folder - shell: powershell - run: | - tree /F C:\$Env:GITHUB_RUN_ID\build-results - # Needed for coverage in win-test.sh - - uses: actions/setup-python@v2 - name: Setup Python3 - with: - python-version: '3.x' - - name: Test - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - # Time out the test phase after 3.5 hours - timeout-minutes: 210 - run: | - .jenkins/pytorch/win-test.sh - !{{ common.upload_downloaded_files(name='windows') }} - !{{ common.upload_test_reports(name='windows') }} - !{{ common.render_test_results() }} - !{{ common.wait_and_kill_ssh_windows() }} - !{{ common.parse_ref() }} - !{{ common.upload_test_statistics(build_environment) }} - - name: Cleanup workspace - if: always() - shell: bash - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf ./* diff --git a/.github/workflows/_android-build-test.yml b/.github/workflows/_android-build-test.yml new file mode 100644 index 000000000000..a489d7d7e002 --- /dev/null +++ b/.github/workflows/_android-build-test.yml @@ -0,0 +1,150 @@ +name: android-build-test + +on: + workflow_call: + inputs: + build-environment: + required: true + type: string + description: Top-level label for what's being built/tested. + docker-image-name: + required: true + type: string + description: Name of the base docker image to build with. + +env: + IN_CI: 1 # TODO delete in favor of GITHUB_ACTIONS + IS_GHA: 1 # TODO delete in favor of GITHUB_ACTIONS + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + +jobs: + build-and-test: + # Don't run on forked repos. 
+ if: github.repository_owner == 'pytorch' + runs-on: [self-hosted, linux.2xlarge] + steps: + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + + - name: Setup Linux + uses: ./.github/actions/setup-linux + + - name: Setup SSH (Click me for login details) + uses: ./.github/actions/setup-ssh + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + + - name: Calculate docker image + id: calculate-docker-image + uses: ./.github/actions/calculate-docker-image + with: + docker-image-name: ${{ inputs.docker-image-name }} + xla: ${{ contains(inputs.build-environment, 'xla') }} + + - name: Pull docker image + uses: ./.github/actions/pull-docker-image + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + + - name: Output disk space left + run: | + sudo df -H + + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + + - name: Build + env: + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + JOB_BASE_NAME: ${{ inputs.build-environment }}-build-and-test + CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts + TORCH_CUDA_ARCH_LIST: 5.2 + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + run: | + set -e + # Unlike other gradle jobs, it's not worth building libtorch in a separate CI job and share via docker, because: + # 1) Not shareable: it's custom selective build, which is different from default libtorch mobile build; + # 2) Not parallelizable by architecture: it only builds libtorch for one architecture; + + echo "DOCKER_IMAGE: ${DOCKER_IMAGE}" + time docker pull "${DOCKER_IMAGE}" >/dev/null + + export BUILD_LITE_INTERPRETER + BUILD_LITE_INTERPRETER="1" + if [[ "${BUILD_ENVIRONMENT}" == *"full-jit" ]]; then + BUILD_LITE_INTERPRETER="0" + fi + + git submodule sync && git submodule update -q --init --recursive --depth 1 --jobs 0 + export id + id=$(docker run -e BUILD_ENVIRONMENT \ + -e JOB_BASE_NAME \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e PR_LABELS \ + -e SKIP_SCCACHE_INITIALIZATION=1 \ + -e TORCH_CUDA_ARCH_LIST \ + -e BUILD_LITE_INTERPRETER \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --tty \ + --detach \ + --user jenkins \ + -v "$(pwd):/var/lib/jenkins/workspace" \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + -t -d -w /var/lib/jenkins "${DOCKER_IMAGE}") + + export COMMAND + # shellcheck disable=SC2016 + COMMAND='(echo "sudo chown -R jenkins workspace && cd workspace && ./.circleci/scripts/build_android_gradle.sh" | docker exec -u jenkins -e BUILD_LITE_INTERPRETER -e GRADLE_OFFLINE=1 -i "$id" bash) 2>&1' + echo "${COMMAND}" > ./command.sh && bash ./command.sh + # Skip docker push as this job is purely for size analysis purpose. + # Result binaries are already in `/home/circleci/project/` as it's mounted instead of copied. 
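The Build step above starts one detached container, bind-mounts the checkout into it, and then drives the gradle build by piping a command string into `docker exec` rather than passing it on the command line. A minimal sketch of that pipe-into-exec pattern, with an assumed public image and build script standing in for the CI-specific ones:

    #!/usr/bin/env bash
    # Sketch only: IMAGE and ./build.sh are placeholders, not the real CI values.
    set -euo pipefail

    IMAGE="ubuntu:20.04"

    # Start a long-lived detached container with the checkout bind-mounted in.
    id=$(docker run --tty --detach \
      -v "$(pwd):/workspace" \
      -w /workspace \
      "${IMAGE}")

    # Feed the build command to a shell inside the container over stdin;
    # 2>&1 folds stderr into the step log, as the workflow does.
    echo "./build.sh" | docker exec -i "${id}" bash 2>&1

    # CI leaves the container running so later steps can `docker cp` results
    # out of it; for a standalone sketch, clean it up instead.
    docker rm -f "${id}"

The full android build workflow later in this diff keeps the container id around in the same way (`ID_X86_32`) so a later step can `docker cp` the assembled artifacts out of it.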
+ + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + + - name: Display and upload binary build size statistics (Click Me) + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: ${{ github.run_id }} + ARTIFACTS: "" + ANDROID_BUILD_TYPE: custom-build-single + run: | + # The artifact file is created inside docker container, which contains the result binaries. + # Now unpackage it into the project folder. The subsequent script will scan project folder + # to locate result binaries and report their sizes. + # If artifact file is not provided it assumes that the project folder has been mounted in + # the docker during build and already contains the result binaries, so this step can be skipped. + if [ -n "${ARTIFACTS}" ]; then + tar xf "${ARTIFACTS}" -C "${GITHUB_WORKSPACE}" + cd "${GITHUB_WORKSPACE}" + fi + COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) + export COMMIT_TIME + pip3 install requests==2.26 boto3==1.16.34 + python3 -m tools.stats.upload_binary_size_to_scuba "android" || exit 0 + + - name: Chown workspace + uses: ./.github/actions/chown-workspace + if: always() + + - name: Teardown Linux + uses: ./.github/actions/teardown-linux + if: always() diff --git a/.github/workflows/_android-full-build-test.yml b/.github/workflows/_android-full-build-test.yml new file mode 100644 index 000000000000..d0b8845a6620 --- /dev/null +++ b/.github/workflows/_android-full-build-test.yml @@ -0,0 +1,222 @@ +name: android-full-build-test + +on: + workflow_call: + inputs: + build-environment: + required: true + type: string + description: Top-level label for what's being built/tested. + docker-image-name: + required: true + type: string + description: Name of the base docker image to build with. + + secrets: + SONATYPE_NEXUS_USERNAME: + description: nexus user + required: true + SONATYPE_NEXUS_PASSWORD: + description: nexus pass + required: true + ANDROID_SIGN_KEY: + description: android key + required: true + ANDROID_SIGN_PASS: + description: android pass + required: true + SCRIBE_GRAPHQL_ACCESS_TOKEN: + description: token for writing to scribe/scuba + required: true + +env: + IN_CI: 1 # TODO delete in favor of GITHUB_ACTIONS + IS_GHA: 1 # TODO delete in favor of GITHUB_ACTIONS + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + +jobs: + build: + # Don't run on forked repos. 
+ if: github.repository_owner == 'pytorch' + runs-on: [self-hosted, linux.2xlarge] + steps: + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + + - name: Setup Linux + uses: ./.github/actions/setup-linux + + - name: Setup SSH (Click me for login details) + uses: ./.github/actions/setup-ssh + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + + - name: Calculate docker image + id: calculate-docker-image + uses: ./.github/actions/calculate-docker-image + with: + docker-image-name: ${{ inputs.docker-image-name }} + + - name: Pull docker image + uses: ./.github/actions/pull-docker-image + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + + - name: Output disk space left + shell: bash + run: | + sudo df -H + + - name: Preserve github env variables for use in docker + shell: bash + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + + - name: Build arm-v7a + uses: ./.github/actions/build-android + with: + arch: arm_v7a + arch-for-build-env: arm-v7a + github-secret: ${{ secrets.GITHUB_TOKEN }} + build-environment: ${{ inputs.build-environment }} + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + branch: ${{ steps.parse-ref.outputs.branch }} + + - name: Build arm-v8a + uses: ./.github/actions/build-android + with: + arch: arm_v8a + arch-for-build-env: arm-v8a + github-secret: ${{ secrets.GITHUB_TOKEN }} + build-environment: ${{ inputs.build-environment }} + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + branch: ${{ steps.parse-ref.outputs.branch }} + + - name: Build x86_32 + id: build-x86_32 + uses: ./.github/actions/build-android + with: + arch: x86_32 + arch-for-build-env: x86_32 + github-secret: ${{ secrets.GITHUB_TOKEN }} + build-environment: ${{ inputs.build-environment }} + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + branch: ${{ steps.parse-ref.outputs.branch }} + + - name: Build x86_64 + uses: ./.github/actions/build-android + with: + arch: x86_64 + arch-for-build-env: x86_64 + github-secret: ${{ secrets.GITHUB_TOKEN }} + build-environment: ${{ inputs.build-environment }} + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + branch: ${{ steps.parse-ref.outputs.branch }} + + - name: Build final artifact + env: + BRANCH: ${{ steps.parse-ref.outputs.branch }} + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + AWS_DEFAULT_REGION: us-east-1 + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + ID_X86_32: ${{ steps.build-x86_32.outputs.container_id }} + run: | + set -eux + + # Putting everything together + # ID_X86_32 container were created during build-x86_32 step + docker cp "${GITHUB_WORKSPACE}/build_android_install_arm_v7a" "${ID_X86_32}:/var/lib/jenkins/workspace/build_android_install_arm_v7a" + docker cp "${GITHUB_WORKSPACE}/build_android_install_x86_64" "${ID_X86_32}:/var/lib/jenkins/workspace/build_android_install_x86_64" + docker cp "${GITHUB_WORKSPACE}/build_android_install_arm_v8a" "${ID_X86_32}:/var/lib/jenkins/workspace/build_android_install_arm_v8a" + docker cp "${GITHUB_WORKSPACE}/build_android_install_x86_32" "${ID_X86_32}:/var/lib/jenkins/workspace/build_android_install_x86_32" + + # run 
gradle buildRelease + (echo "./.circleci/scripts/build_android_gradle.sh" | docker exec \ + -e BUILD_ENVIRONMENT="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build" \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e AWS_DEFAULT_REGION \ + -e IS_GHA \ + -e PR_NUMBER \ + -e SHA1 \ + -e BRANCH \ + -e GITHUB_RUN_ID \ + -e SCCACHE_BUCKET \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e SKIP_SCCACHE_INITIALIZATION=1 \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --user jenkins \ + -u jenkins -i "${ID_X86_32}" bash) 2>&1 + + mkdir -p "${GITHUB_WORKSPACE}/build_android_artifacts" + docker cp "${ID_X86_32}:/var/lib/jenkins/workspace/android/artifacts.tgz" "${GITHUB_WORKSPACE}/build_android_artifacts/" + + - name: Display and upload binary build size statistics (Click Me) + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: ${{ github.run_id }} + ANDROID_BUILD_TYPE: prebuilt + SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} + run: | + # The artifact file is created inside docker container, which contains the result binaries. + # Now unpackage it into the project folder. The subsequent script will scan project folder + # to locate result binaries and report their sizes. + # If artifact file is not provided it assumes that the project folder has been mounted in + # the docker during build and already contains the result binaries, so this step can be skipped. + export ARTIFACTS=${GITHUB_WORKSPACE}/build_android_artifacts/artifacts.tgz + if [ -n "${ARTIFACTS}" ]; then + tar xf "${ARTIFACTS}" -C "${GITHUB_WORKSPACE}" + cd "${GITHUB_WORKSPACE}" + fi + COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) + export COMMIT_TIME + pip3 install requests==2.26 boto3==1.16.34 + python3 -m tools.stats.upload_binary_size_to_scuba "android" || exit 0 + + - name: Publish android snapshot + if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/nightly' }} + env: + SONATYPE_NEXUS_USERNAME: ${{ secrets.SONATYPE_NEXUS_USERNAME }} + SONATYPE_NEXUS_PASSWORD: ${{ secrets.SONATYPE_NEXUS_PASSWORD }} + ANDROID_SIGN_KEY: ${{ secrets.ANDROID_SIGN_KEY }} + ANDROID_SIGN_PASS: ${{ secrets.ANDROID_SIGN_PASS }} + ID_X86_32: ${{ steps.build-x86_32.outputs.container_id }} + run: | + set -eux + (echo "./.circleci/scripts/publish_android_snapshot.sh" | docker exec \ + -e BUILD_ENVIRONMENT="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-publish-snapshot" \ + -e SONATYPE_NEXUS_USERNAME \ + -e SONATYPE_NEXUS_PASSWORD \ + -e ANDROID_SIGN_KEY \ + -e ANDROID_SIGN_PASS \ + -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ + -u jenkins -i "${ID_X86_32}" bash) 2>&1 + + - name: Store PyTorch Android Build Artifacts on S3 + uses: seemethere/upload-artifact-s3@v4 + with: + name: ${{ inputs.build-environment }} + retention-days: 14 + if-no-files-found: error + path: build_android_artifacts/artifacts.tgz + + - name: Chown workspace + uses: 
./.github/actions/chown-workspace + if: always() + + - name: Teardown Linux + uses: ./.github/actions/teardown-linux + if: always() diff --git a/.github/workflows/_bazel-build-test.yml b/.github/workflows/_bazel-build-test.yml new file mode 100644 index 000000000000..0b782aa9708b --- /dev/null +++ b/.github/workflows/_bazel-build-test.yml @@ -0,0 +1,185 @@ +name: bazel + +on: + workflow_call: + inputs: + build-environment: + required: true + type: string + description: Top-level label for what's being built/tested. + docker-image-name: + required: true + type: string + description: Name of the base docker image to build with. + +env: + IN_CI: 1 # TODO delete in favor of GITHUB_ACTIONS + IS_GHA: 1 # TODO delete in favor of GITHUB_ACTIONS + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + +jobs: + build-and-test: + # Don't run on forked repos. + if: github.repository_owner == 'pytorch' + runs-on: [self-hosted, linux.2xlarge] + steps: + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + + - name: Setup Linux + uses: ./.github/actions/setup-linux + + - name: Setup SSH (Click me for login details) + uses: ./.github/actions/setup-ssh + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + + - name: Calculate docker image + id: calculate-docker-image + uses: ./.github/actions/calculate-docker-image + with: + docker-image-name: ${{ inputs.docker-image-name }} + + - name: Pull docker image + uses: ./.github/actions/pull-docker-image + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + + - name: Output disk space left + run: | + sudo df -H + + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + + - name: Build + env: + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: ${{ inputs.build-environment }}-build-and-test + # TODO duplicated + AWS_DEFAULT_REGION: us-east-1 + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + TORCH_CUDA_ARCH_LIST: 5.2 + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + run: | + # detached container should get cleaned up by teardown_ec2_linux + container_name=$(docker run \ + -e BUILD_ENVIRONMENT \ + -e JOB_BASE_NAME \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e PR_LABELS \ + -e SKIP_SCCACHE_INITIALIZATION=1 \ + -e TORCH_CUDA_ARCH_LIST \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --tty \ + --detach \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . 
&& sudo chown -R jenkins /dev && .jenkins/pytorch/build.sh' + + # !{{ common_android.upload_android_binary_size("", "")}} + - name: Test + # Time out the test phase after 3.5 hours + timeout-minutes: 210 + env: + JOB_BASE_NAME: ${{ inputs.build-environment }}-build-and-test + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + PYTORCH_RETRY_TEST_CASES: 1 + PR_BODY: ${{ github.event.pull_request.body }} + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + run: | + # detached container should get cleaned up by teardown_ec2_linux + export SHARD_NUMBER=0 + COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-master}") + export COMMIT_MESSAGES + # TODO: Stop building test binaries as part of the build phase + # Make sure we copy test results from bazel-testlogs symlink to + # a regular directory ./test/test-reports + container_name=$(docker run \ + -e BUILD_ENVIRONMENT \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e GIT_DEFAULT_BRANCH="$GIT_DEFAULT_BRANCH" \ + -e IN_CI \ + -e SHARD_NUMBER \ + -e NUM_TEST_SHARDS \ + -e JOB_BASE_NAME \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e PR_LABELS \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --shm-size="1g" \ + --tty \ + --detach \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . 
&& sudo chown -R jenkins /dev && .jenkins/pytorch/test.sh && cp -Lr ./bazel-testlogs ./test/test-reports' + + - name: Chown workspace + uses: ./.github/actions/chown-workspace + if: always() + + - name: Get workflow job id + id: get-job-id + uses: pytorch/pytorch/.github/actions/get-workflow-job-id@master + if: always() + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Upload test artifacts + uses: ./.github/actions/upload-test-artifacts + if: always() + with: + file-suffix: bazel-${{ github.job }}_${{ steps.get-job-id.outputs.job-id }} + + - name: Upload test statistics + if: always() + env: + AWS_DEFAULT_REGION: us-east-1 + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + JOB_BASE_NAME: ${{ inputs.build-environment }}-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: ${{ github.run_id }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GHA_WORKFLOW_JOB_ID: ${{ steps.get-job-id.outputs.job-id }} + shell: bash + run: | + set -x + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + + - name: Teardown Linux + uses: ./.github/actions/teardown-linux + if: always() diff --git a/.github/workflows/_docs.yml b/.github/workflows/_docs.yml new file mode 100644 index 000000000000..96ed63cbb0f6 --- /dev/null +++ b/.github/workflows/_docs.yml @@ -0,0 +1,132 @@ +name: build docs + +on: + workflow_call: + inputs: + build-environment: + required: true + type: string + description: Top-level label for what's being built/tested. + docker-image: + required: true + type: string + description: Docker image to run in. + push: + required: false + type: boolean + default: false + description: If set, push the docs to the docs website. + + secrets: + GH_PYTORCHBOT_TOKEN: + required: false + description: Permissions for pushing to the docs site. + +env: + IN_CI: 1 # TODO delete in favor of GITHUB_ACTIONS + IS_GHA: 1 # TODO delete in favor of GITHUB_ACTIONS + +jobs: + build-docs: + # Don't run on forked repos. 
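+ # Illustrative only: this workflow expects to run after a build job that produced the
+ # wheel artifact and the docker image. A caller might chain the two reusable workflows
+ # roughly as below; the job names and build environment are hypothetical, while
+ # `docker-image` matches the output exposed by _linux-build.yml.
+ #
+ #   jobs:
+ #     build:
+ #       uses: ./.github/workflows/_linux-build.yml
+ #       with:
+ #         build-environment: linux-xenial-py3.7-gcc5.4
+ #         docker-image-name: pytorch-linux-xenial-py3.7-gcc5.4
+ #     docs:
+ #       needs: build
+ #       uses: ./.github/workflows/_docs.yml
+ #       with:
+ #         build-environment: linux-xenial-py3.7-gcc5.4
+ #         docker-image: ${{ needs.build.outputs.docker-image }}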
+ if: github.repository_owner == 'pytorch' + runs-on: [self-hosted, linux.2xlarge] + strategy: + matrix: + docs_type: [cpp, python] + steps: + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + + - name: Setup Linux + uses: ./.github/actions/setup-linux + + - name: Setup SSH (Click me for login details) + uses: ./.github/actions/setup-ssh + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + + - name: Pull docker image + uses: ./.github/actions/pull-docker-image + with: + docker-image: ${{ inputs.docker-image }} + + - name: Download build artifacts + uses: ./.github/actions/download-build-artifacts + with: + name: ${{ inputs.build-environment }} + + - name: Generate netrc (only for docs-push) + if: inputs.push + env: + GITHUB_PYTORCHBOT_TOKEN: ${{ secrets.GH_PYTORCHBOT_TOKEN }} + run: | + # set credentials for https pushing + echo "machine github.com" > "${RUNNER_TEMP}/.netrc" + echo "login pytorchbot" >> "${RUNNER_TEMP}/.netrc" + echo "password ${GITHUB_PYTORCHBOT_TOKEN}" >> "${RUNNER_TEMP}/.netrc" + + - name: Build ${{ matrix.docs_type }} docs + env: + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts + WITH_PUSH: ${{ github.event_name == 'schedule' || startsWith(github.event.ref, 'refs/tags/v') }} + DOCKER_IMAGE: ${{ inputs.docker-image }} + DOCS_TYPE: ${{ matrix.docs_type }} + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + run: | + set -ex + # Convert refs/tags/v1.12.0rc3 into 1.12 + if [[ "${GITHUB_REF}" =~ ^refs/tags/v([0-9]+\.[0-9]+)\.* ]]; then + target="${BASH_REMATCH[1]}" + else + target="master" + fi + # detached container should get cleaned up by teardown_ec2_linux + container_name=$(docker run \ + -e BUILD_ENVIRONMENT \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e IN_CI \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SHA1="$GITHUB_SHA" \ + -e DOCS_VERSION="${target}" \ + -e DOCS_TYPE \ + -e PR_LABELS \ + -e WITH_PUSH \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --tty \ + --detach \ + --user jenkins \ + -v "${RUNNER_TEMP}/.netrc":/var/lib/jenkins/.netrc \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" bash -c "sudo chown -R jenkins . 
&& pip install dist/*.whl && ./.circleci/scripts/${DOCS_TYPE}_doc_push_script.sh" + + - name: Chown workspace + uses: ./.github/actions/chown-workspace + if: always() + + - name: Upload Python Docs Preview + uses: seemethere/upload-artifact-s3@v4 + if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'python' }} + with: + retention-days: 14 + s3-bucket: doc-previews + if-no-files-found: error + path: pytorch.github.io/docs/master/ + s3-prefix: pytorch/${{ github.event.pull_request.number }} + + - name: Upload C++ Docs Preview + uses: seemethere/upload-artifact-s3@v4 + if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'cpp' }} + with: + retention-days: 14 + if-no-files-found: error + s3-bucket: doc-previews + path: cppdocs/ + s3-prefix: pytorch/${{ github.event.pull_request.number }}/cppdocs diff --git a/.github/workflows/_ios-build-test.yml b/.github/workflows/_ios-build-test.yml new file mode 100644 index 000000000000..fa3b7e2836f8 --- /dev/null +++ b/.github/workflows/_ios-build-test.yml @@ -0,0 +1,187 @@ +name: ios-build-test + +on: + workflow_call: + inputs: + build-environment: + required: true + type: string + description: Top-level label for what's being built/tested. + ios-platform: + required: true + type: string + description: Which iOS platform to build for. + ios-arch: + required: true + type: string + description: Which iOS arch to build for. + + secrets: + IOS_CERT_KEY_2022: + required: true + description: ios cert + IOS_CERT_SECRET: + required: true + description: ios cert + IOS_DEV_TEAM_ID: + required: true + description: ios cert + IOS_SIGN_KEY_2022: + required: true + description: ios cert + +env: + IN_CI: 1 + IS_GHA: 1 + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + IOS_PLATFORM: ${{ inputs.ios-platform }} + IOS_ARCH: ${{ inputs.ios-arch }} + +jobs: + build: + # NOTE: These builds will not run successfully without running on `pytorch/pytorch` due to the limitations + # of accessing secrets from forked pull requests and IOS' dependency on secrets for their build/test + if: github.repository_owner == 'pytorch' + runs-on: macos-10.15 + timeout-minutes: 240 + env: + JOB_BASE_NAME: ${{ inputs.build-environment }}-build + IOS_CERT_KEY_2022: ${{ secrets.IOS_CERT_KEY_2022 }} + IOS_CERT_SECRET: ${{ secrets.IOS_CERT_SECRET }} + IOS_DEV_TEAM_ID: ${{ secrets.IOS_DEV_TEAM_ID }} + IOS_SIGN_KEY_2022: ${{ secrets.IOS_SIGN_KEY_2022 }} + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + steps: + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + + - name: Populate CI build options + run: | + # Most builds use the lite interpreter, if certain builds shouldn't + # build the lite interpreter this env variable should get over-written + # in the following case statement + echo "BUILD_LITE_INTERPRETER=1" >> "${GITHUB_ENV}" + + case ${BUILD_ENVIRONMENT} in + *metal*) + echo "USE_PYTORCH_METAL=1" >> "${GITHUB_ENV}" + ;; + *full_jit*) + echo "BUILD_LITE_INTERPRETER=0" >> "${GITHUB_ENV}" + ;; + *custom*) + echo "SELECTED_OP_LIST=${GITHUB_WORKSPACE}/ios/TestApp/custom_build/mobilenetv2.yaml" >> "${GITHUB_ENV}" + ;; + *coreml*) + echo "USE_COREML_DELEGATE=1" >> "${GITHUB_ENV}" + ;; + esac + + - name: Install brew dependencies + run: | + # Install dependencies + brew install libtool + + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with 
the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + conda install -y \ + cffi \ + cmake \ + mkl \ + mkl-include \ + ninja \ + numpy \ + pyyaml \ + requests \ + setuptools \ + typing_extensions + + - name: Run Fastlane + run: | + set -x + cd ios/TestApp + # install fastlane + sudo gem install bundler && bundle install + # install certificates + echo "${IOS_CERT_KEY_2022}" >> cert.txt + base64 --decode cert.txt -o Certificates.p12 + rm cert.txt + bundle exec fastlane install_root_cert + bundle exec fastlane install_dev_cert + # install the provisioning profile + PROFILE=PyTorch_CI_2022.mobileprovision + PROVISIONING_PROFILES=~/Library/MobileDevice/Provisioning\ Profiles + mkdir -pv "${PROVISIONING_PROFILES}" + cd "${PROVISIONING_PROFILES}" + echo "${IOS_SIGN_KEY_2022}" >> cert.txt + base64 --decode cert.txt -o ${PROFILE} + rm cert.txt + + - name: Build + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + export TCLLIBPATH="/usr/local/lib" + python -VV + export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"} + scripts/build_ios.sh + + - name: Run Build Test + run: | + PROFILE=PyTorch_CI_2022 + # run the ruby build script + if ! [ -x "$(command -v xcodebuild)" ]; then + echo 'Error: xcodebuild is not installed.' + exit 1 + fi + if [ "${IOS_PLATFORM}" != "SIMULATOR" ]; then + ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" -c "${PROFILE}" -t "${IOS_DEV_TEAM_ID}" + else + ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" + fi + + - name: Run Simulator Tests + if: inputs.ios-platform == 'SIMULATOR' + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + pip3 install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html + # generate models for different backends + cd "${GITHUB_WORKSPACE}/ios/TestApp/benchmark" + mkdir -p ../models + if [ "${USE_COREML_DELEGATE}" == 1 ]; then + pip install coremltools==5.0b5 + pip install six==1.16.0 + python coreml_backend.py + else + cd "${GITHUB_WORKSPACE}" + python test/mobile/model_test/gen_test_model.py ios-test + fi + cd "${GITHUB_WORKSPACE}/ios/TestApp/benchmark" + if [ "${BUILD_LITE_INTERPRETER}" == 1 ]; then + echo "Setting up the TestApp for LiteInterpreter" + ruby setup.rb --lite 1 + else + echo "Setting up the TestApp for Full JIT" + ruby setup.rb + fi + cd "${GITHUB_WORKSPACE}/ios/TestApp" + instruments -s -devices + if [ "${BUILD_LITE_INTERPRETER}" == 1 ]; then + if [ "${USE_COREML_DELEGATE}" == 1 ]; then + fastlane scan --only_testing TestAppTests/TestAppTests/testCoreML + else + fastlane scan --skip_testing TestAppTests/TestAppTests/testCoreML + fi + else + fastlane scan --only_testing TestAppTests/TestAppTests/testFullJIT + fi diff --git a/.github/workflows/_linux-build.yml b/.github/workflows/_linux-build.yml new file mode 100644 index 000000000000..cf6419f208e2 --- /dev/null +++ b/.github/workflows/_linux-build.yml @@ -0,0 +1,158 @@ +name: linux-build + +on: + workflow_call: + inputs: + build-environment: + required: true + type: string + description: Top-level label for
what's being built/tested. + docker-image-name: + required: true + type: string + description: Name of the base docker image to build with. + build-generates-artifacts: + required: false + type: boolean + default: true + description: If set, upload generated build artifacts. + build-with-debug: + required: false + type: boolean + default: false + description: If set, build in debug mode. + + outputs: + docker-image: + value: ${{ jobs.build.outputs.docker-image }} + description: The docker image containing the built PyTorch. + +env: + IN_CI: 1 # TODO delete in favor of GITHUB_ACTIONS + IS_GHA: 1 # TODO delete in favor of GITHUB_ACTIONS + +jobs: + build: + # Don't run on forked repos. + if: github.repository_owner == 'pytorch' + runs-on: [self-hosted, linux.2xlarge] + timeout-minutes: 240 + outputs: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + steps: + # [pytorch repo ref] + # Use a pytorch/pytorch reference instead of a reference to the local + # checkout because when we run this action we don't *have* a local + # checkout. In other cases you should prefer a local checkout. + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + + - name: Check for new workflows + run: | + if [ ! -f "./.github/actions/setup-linux/action.yml" ]; then + echo "::error::Your PR is based on a version of master that is too old for our CI to work. Please rebase your PR on latest master and resubmit." + exit 1 + fi + + - name: Setup Linux + uses: ./.github/actions/setup-linux + + - name: Setup SSH (Click me for login details) + uses: ./.github/actions/setup-ssh + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + + - name: Calculate docker image + id: calculate-docker-image + uses: ./.github/actions/calculate-docker-image + with: + docker-image-name: ${{ inputs.docker-image-name }} + xla: ${{ contains(inputs.build-environment, 'xla') }} + + - name: Pull docker image + uses: ./.github/actions/pull-docker-image + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + + - name: Build + env: + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: ${{ inputs.build-environment }}-build + # TODO duplicated + AWS_DEFAULT_REGION: us-east-1 + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla + CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + TORCH_CUDA_ARCH_LIST: 5.2 + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }} + DEBUG: ${{ inputs.build-with-debug && '1' || '0' }} + run: | + # detached container should get cleaned up by teardown_ec2_linux + container_name=$(docker run \ + -e BUILD_ENVIRONMENT \ + -e JOB_BASE_NAME \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e AWS_DEFAULT_REGION \ + -e IS_GHA \ + -e PR_NUMBER \ + -e SHA1 \ + -e BRANCH \ + -e GITHUB_RUN_ID \ + -e SCCACHE_BUCKET \ + -e XLA_CUDA \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e SKIP_SCCACHE_INITIALIZATION=1 \ + -e TORCH_CUDA_ARCH_LIST \ + -e PR_LABELS \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --security-opt seccomp=unconfined \ 
+ --cap-add=SYS_PTRACE \ + --tty \ + --detach \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c '.jenkins/pytorch/build.sh' + + - name: Display and upload binary build size statistics (Click Me) + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + BRANCH: ${{ steps.parse-ref.outputs.branch }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: ${{ github.run_id }} + run: | + COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) + export COMMIT_TIME + pip3 install requests==2.26 boto3==1.16.34 + python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 + + - name: Archive artifacts into zip + if: inputs.build-generates-artifacts + run: | + zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json + + - name: Store PyTorch Build Artifacts on S3 + uses: seemethere/upload-artifact-s3@v4 + if: inputs.build-generates-artifacts + with: + name: ${{ inputs.build-environment }} + retention-days: 14 + if-no-files-found: error + path: artifacts.zip + + - name: Teardown Linux + uses: ./.github/actions/teardown-linux + if: always() diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml new file mode 100644 index 000000000000..37ea69e531da --- /dev/null +++ b/.github/workflows/_linux-test.yml @@ -0,0 +1,194 @@ +name: linux-test + +on: + workflow_call: + inputs: + build-environment: + required: true + type: string + description: Top-level label for what's being built/tested. + test-matrix: + required: true + type: string + description: JSON description of what test configs to run. + docker-image: + required: true + type: string + description: Docker image to run in. + +env: + IN_CI: 1 # TODO delete in favor of GITHUB_ACTIONS + IS_GHA: 1 # TODO delete in favor of GITHUB_ACTIONS + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + +jobs: + test: + # Don't run on forked repos. 
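+ # Illustrative only: `test-matrix` arrives as a JSON string and is expanded with
+ # fromJSON into the job matrix below, so its keys must match what the steps read
+ # (config, shard, num_shards, runner). A caller might pass something like the
+ # following; the config name, shard counts, and runner labels are hypothetical.
+ #
+ #   test-matrix: |
+ #     { "include": [
+ #       { "config": "default", "shard": 1, "num_shards": 2, "runner": "linux.2xlarge" },
+ #       { "config": "default", "shard": 2, "num_shards": 2, "runner": "linux.2xlarge" }
+ #     ]}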
+ if: github.repository_owner == 'pytorch' + strategy: + matrix: ${{ fromJSON(inputs.test-matrix) }} + fail-fast: false + runs-on: ${{ matrix.runner }} + steps: + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + + - name: Setup Linux + uses: ./.github/actions/setup-linux + + - name: Setup SSH (Click me for login details) + uses: ./.github/actions/setup-ssh + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + + - name: Pull docker image + uses: ./.github/actions/pull-docker-image + with: + docker-image: ${{ inputs.docker-image }} + + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + if: contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + + - name: Download build artifacts + uses: ./.github/actions/download-build-artifacts + with: + name: ${{ inputs.build-environment }} + + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + + - name: Test + env: + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + PYTORCH_RETRY_TEST_CASES: 1 + JOB_BASE_NAME: ${{ inputs.build-environment }}-test + TEST_CONFIG: ${{ matrix.config }} + SHARD_NUMBER: ${{ matrix.shard }} + NUM_TEST_SHARDS: ${{ matrix.num_shards }} + PR_BODY: ${{ github.event.pull_request.body }} + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }} + DOCKER_IMAGE: ${{ inputs.docker-image }} + XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }} + XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + + COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-master}") + export COMMIT_MESSAGES + + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e COMMIT_MESSAGES \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CUDA \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w 
/var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "pip install dist/*.whl && ${TEST_COMMAND}" + + - name: Get workflow job id + id: get-job-id + uses: pytorch/pytorch/.github/actions/get-workflow-job-id@master + if: always() + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Upload test artifacts + uses: ./.github/actions/upload-test-artifacts + if: always() + with: + file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }} + + - name: Store Core dumps on S3 + uses: seemethere/upload-artifact-s3@v4 + if: failure() + with: + name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }} + retention-days: 14 + if-no-files-found: ignore + path: ./**/core.[1-9]* + + - name: Upload test statistics + if: always() + env: + AWS_DEFAULT_REGION: us-east-1 + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: ${{ inputs.build-environment }}-test + TEST_CONFIG: ${{ matrix.config }} + SHARD_NUMBER: ${{ matrix.shard }} + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: ${{ github.run_id }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GHA_WORKFLOW_JOB_ID: ${{ steps.get-job-id.outputs.job-id }} + shell: bash + run: | + set -x + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + + - name: Teardown Linux + uses: ./.github/actions/teardown-linux + if: always() diff --git a/.github/workflows/_mac-build.yml b/.github/workflows/_mac-build.yml new file mode 100644 index 000000000000..c5a93c7c32f4 --- /dev/null +++ b/.github/workflows/_mac-build.yml @@ -0,0 +1,103 @@ +name: mac-build + +on: + workflow_call: + inputs: + build-environment: + required: true + type: string + description: Top-level label for what's being built/tested. + runner-type: + required: true + type: string + description: Name of the GitHub-managed runner type to use for the build. + build-generates-artifacts: + required: true + type: boolean + description: If set, upload generated build artifacts. + xcode-version: + required: false + type: string + default: "" + description: What xcode version to build with. + + secrets: + MACOS_SCCACHE_S3_ACCESS_KEY_ID: + required: true + description: Access key for S3 bucket for macOS sccache. + MACOS_SCCACHE_S3_SECRET_ACCESS_KEY: + required: true + description: Secret for S3 bucket for macOS sccache. + +env: + IN_CI: 1 # TODO delete in favor of GITHUB_ACTIONS + IS_GHA: 1 # TODO delete in favor of GITHUB_ACTIONS + +# For setup-miniconda, see https://github.com/conda-incubator/setup-miniconda/issues/179 +defaults: + run: + shell: bash -e -l {0} + +jobs: + build: + # Don't run on forked repos. 
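+ # Illustrative only: a caller would invoke this reusable workflow roughly as below.
+ # The runner label, build environment, and xcode version are hypothetical examples;
+ # the input and secret names come from the workflow_call interface above.
+ #
+ #   macos-build:
+ #     uses: ./.github/workflows/_mac-build.yml
+ #     with:
+ #       build-environment: macos-11-py3-x86-64
+ #       runner-type: macos-12
+ #       build-generates-artifacts: true
+ #       xcode-version: "13.3.1"
+ #     secrets:
+ #       MACOS_SCCACHE_S3_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
+ #       MACOS_SCCACHE_S3_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}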
+ if: github.repository_owner == 'pytorch' + runs-on: ${{ inputs.runner-type }} + env: + JOB_BASE_NAME: ${{ inputs.build-environment }} + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + COMPACT_JOB_NAME: ${{ inputs.build-environment }} + steps: + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + + - name: Set xcode version + env: + XCODE_VERSION: ${{ inputs.xcode-version }} + run: | + if [ -n "${XCODE_VERSION}" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_${XCODE_VERSION}.app/Contents/Developer" >> "${GITHUB_ENV}" + fi + + - name: Setup miniconda + uses: conda-incubator/setup-miniconda@v2 + with: + auto-update-conda: true + python-version: 3.8 + activate-environment: build + miniconda-version: 4.7.12 + + - name: Install macOS homebrew dependencies + run: | + # Install dependencies + brew install libomp + + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + + - name: Build + run: | + echo "CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"}" >> "${GITHUB_ENV}" + .jenkins/pytorch/macos-build.sh + + - name: Archive artifacts into zip + if: inputs.build-generates-artifacts + run: | + zip -1 -r artifacts.zip dist/ + + - name: Store PyTorch Build Artifacts on GHA + uses: actions/upload-artifact@v2 + if: inputs.build-generates-artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + retention-days: 14 + if-no-files-found: error + path: artifacts.zip diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml new file mode 100644 index 000000000000..e0d11034e0d4 --- /dev/null +++ b/.github/workflows/_mac-test.yml @@ -0,0 +1,123 @@ +name: mac-test + +on: + workflow_call: + inputs: + build-environment: + required: true + type: string + description: Top-level label for what's being built/tested. + test-matrix: + required: true + type: string + description: JSON description of what test configs to run. + + secrets: + AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID: + required: true + description: access key id for test stats upload + AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY: + required: true + description: secret access key for test stats upload + +env: + IN_CI: 1 # TODO delete in favor of GITHUB_ACTIONS + IS_GHA: 1 # TODO delete in favor of GITHUB_ACTIONS + +# For setup-miniconda, see https://github.com/conda-incubator/setup-miniconda/issues/179 +defaults: + run: + shell: bash -e -l {0} + +jobs: + test: + # Don't run on forked repos.
+ if: github.repository_owner == 'pytorch' + strategy: + matrix: ${{ fromJSON(inputs.test-matrix) }} + fail-fast: false + runs-on: ${{ matrix.runner }} + timeout-minutes: 240 + env: + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + COMPACT_JOB_NAME: ${{ inputs.build-environment }} + JOB_BASE_NAME: ${{ inputs.build-environment }}-test + TEST_CONFIG: ${{ matrix.config }} + SHARD_NUMBER: ${{ matrix.shard }} + NUM_TEST_SHARDS: ${{ matrix.num_shards }} + PR_BODY: ${{ github.event.pull_request.body }} + PYTORCH_RETRY_TEST_CASES: 1 + steps: + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + + - name: Download build artifacts + uses: ./.github/actions/download-build-artifacts + with: + name: ${{ inputs.build-environment }} + use-gha: true + + - name: Setup miniconda + uses: conda-incubator/setup-miniconda@v2 + with: + auto-update-conda: true + python-version: 3.8 + activate-environment: build + miniconda-version: 4.7.12 + + - name: Install macOS homebrew dependencies + run: | + # Install dependencies + brew install libomp + + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + + - name: Test + run: | + COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-master}") + export COMMIT_MESSAGES + python3 -mpip install dist/*.whl + .jenkins/pytorch/macos-test.sh + + - name: Get workflow job id + id: get-job-id + uses: pytorch/pytorch/.github/actions/get-workflow-job-id@master + if: always() + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Upload test artifacts + uses: ./.github/actions/upload-test-artifacts + if: always() + with: + use-gha: true + file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }} + + - name: Upload test statistics + if: always() + env: + AWS_DEFAULT_REGION: us-east-1 + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: ${{ inputs.build-environment }}-test + TEST_CONFIG: ${{ matrix.config }} + SHARD_NUMBER: ${{ matrix.shard }} + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: ${{ github.run_id }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }} + GHA_WORKFLOW_JOB_ID: ${{ steps.get-job-id.outputs.job-id }} + shell: bash + run: | + set -x + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test diff --git a/.github/workflows/_rocm-test.yml b/.github/workflows/_rocm-test.yml new file mode 100644 index 000000000000..894938fb7d5a --- /dev/null +++ b/.github/workflows/_rocm-test.yml @@ -0,0 +1,192 @@ +# TODO: this looks sort of similar to _linux-test, but there are like a dozen +# places where you would have to insert an if statement. Probably it's better to +# just use a different workflow altogether + +name: test + +on: + workflow_call: + inputs: + build-environment: + required: true + type: string + description: Top-level label for what's being built/tested. 
+ test-matrix: + required: true + type: string + description: JSON description of what test configs to run. + docker-image: + required: true + type: string + description: Docker image to run in. + + secrets: + AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID: + required: true + description: access key id for test stats upload + AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY: + required: true + description: secret access key for test stats upload + +env: + IN_CI: 1 # TODO delete in favor of GITHUB_ACTIONS + IS_GHA: 1 # TODO delete in favor of GITHUB_ACTIONS + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + +jobs: + test: + # Don't run on forked repos. + if: github.repository_owner == 'pytorch' + timeout-minutes: 300 + strategy: + matrix: ${{ fromJSON(inputs.test-matrix) }} + fail-fast: false + runs-on: ${{ matrix.runner }} + steps: + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + with: + no-sudo: true + + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + + - name: Pull docker image + uses: ./.github/actions/pull-docker-image + with: + docker-image: ${{ inputs.docker-image }} + + - name: Download build artifacts + uses: ./.github/actions/download-build-artifacts + with: + name: ${{ inputs.build-environment }} + + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + + - name: Test + env: + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + PYTORCH_RETRY_TEST_CASES: 1 + JOB_BASE_NAME: ${{ inputs.build-environment }}-test + TEST_CONFIG: ${{ matrix.config }} + SHARD_NUMBER: ${{ matrix.shard }} + NUM_TEST_SHARDS: ${{ matrix.num_shards }} + PR_BODY: ${{ github.event.pull_request.body }} + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + DOCKER_IMAGE: ${{ inputs.docker-image }} + XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla + timeout-minutes: 270 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + + COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-master}") + export COMMIT_MESSAGES + + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e COMMIT_MESSAGES \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --shm-size="8g" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \
+ "${DOCKER_IMAGE}" + ) + # save container name for later step + echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV" + # jenkins user does not have write permission to mounted workspace; work-around by copying within container to jenkins home + docker exec -t "${container_name}" sh -c "cd .. && cp -R workspace pytorch && cd pytorch && pip install dist/*.whl && ${TEST_COMMAND}" + + - name: Save test results + if: always() + run: | + # copy test results back to the mounted workspace, needed sudo, resulting permissions were correct + docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test" + + - name: Get workflow job id + id: get-job-id + uses: pytorch/pytorch/.github/actions/get-workflow-job-id@master + if: always() + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Upload test artifacts + uses: ./.github/actions/upload-test-artifacts + if: always() + with: + use-gha: true + file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }} + + - name: Upload test statistics + if: always() + env: + AWS_DEFAULT_REGION: us-east-1 + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: ${{ inputs.build-environment }}-test + TEST_CONFIG: ${{ matrix.config }} + SHARD_NUMBER: ${{ matrix.shard }} + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: ${{ github.run_id }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }} + GHA_WORKFLOW_JOB_ID: ${{ steps.get-job-id.outputs.job-id }} + shell: bash + run: | + set -x + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm + if: always() diff --git a/.github/workflows/_win-build.yml b/.github/workflows/_win-build.yml new file mode 100644 index 000000000000..abd7aca07f7a --- /dev/null +++ b/.github/workflows/_win-build.yml @@ -0,0 +1,94 @@ +name: windows-build + +on: + workflow_call: + inputs: + build-environment: + required: true + type: string + description: Top-level label for what's being built/tested. + cuda-version: + required: true + type: string + description: What CUDA version to build with, "cpu" for none. + build-with-debug: + required: false + type: boolean + default: false + description: If set, build in debug mode. + +env: + IN_CI: 1 # TODO delete in favor of GITHUB_ACTIONS + IS_GHA: 1 # TODO delete in favor of GITHUB_ACTIONS + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + +jobs: + build: + # Don't run on forked repos. 
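+ # Illustrative only: a caller would invoke this reusable workflow roughly as below.
+ # The job name and build environment are hypothetical; per the input description
+ # above, cuda-version is the CUDA version to build with, or "cpu" for a CPU-only build.
+ #
+ #   win-vs2019-cpu-py3-build:
+ #     uses: ./.github/workflows/_win-build.yml
+ #     with:
+ #       build-environment: win-vs2019-cpu-py3
+ #       cuda-version: cpu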
+ if: github.repository_owner == 'pytorch' + runs-on: [self-hosted, windows.4xlarge] + timeout-minutes: 240 + env: + JOB_BASE_NAME: ${{ inputs.build-environment }}-build + steps: + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + with: + no-sudo: true + + - name: Setup Windows + uses: ./.github/actions/setup-win + with: + cuda-version: ${{ inputs.cuda-version }} + + - name: Setup SSH (Click me for login details) + uses: ./.github/actions/setup-ssh + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + + - name: Build + shell: bash + env: + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + BRANCH: ${{ steps.parse-ref.outputs.branch }} + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + BUILD_WHEEL: 1 + MAX_JOBS: 8 + CUDA_VERSION: ${{ inputs.cuda-version }} + PYTHON_VERSION: "3.8" + PYTORCH_RETRY_TEST_CASES: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + SCCACHE_BUCKET: "ossci-compiler-cache" + VC_PRODUCT: "BuildTools" + VC_VERSION: "" + VC_YEAR: "2019" + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + AWS_DEFAULT_REGION: us-east-1 + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + DEBUG: ${{ inputs.build-with-debug && '1' || '0' }} + TORCH_CUDA_ARCH_LIST: "7.0" + USE_CUDA: ${{ inputs.cuda-version != 'cpu' && '1' || '0' }} + run: | + .jenkins/pytorch/win-build.sh + + # Upload to github so that people can click and download artifacts + - name: Upload artifacts to s3 + uses: seemethere/upload-artifact-s3@v4 + with: + retention-days: 14 + if-no-files-found: error + name: ${{ env.BUILD_ENVIRONMENT }} + path: C:\${{ github.run_id }}\build-results + + - name: Teardown Windows + uses: ./.github/actions/teardown-win + if: always() + timeout-minutes: 120 + with: + extra-delete-dir: /c/${{ github.run_id }}/build-results/ diff --git a/.github/workflows/_win-test.yml b/.github/workflows/_win-test.yml new file mode 100644 index 000000000000..07f66b36ee7a --- /dev/null +++ b/.github/workflows/_win-test.yml @@ -0,0 +1,134 @@ +name: win-test + +on: + workflow_call: + inputs: + build-environment: + required: true + type: string + description: Top-level label for what's being built/tested. + cuda-version: + required: true + type: string + description: What CUDA version to build with, "cpu" for none. + test-matrix: + required: true + type: string + description: JSON description of what test configs to run. + +env: + IN_CI: 1 # TODO delete in favor of GITHUB_ACTIONS + IS_GHA: 1 # TODO delete in favor of GITHUB_ACTIONS + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + +jobs: + test: + # Don't run on forked repos. 
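+ # Note on the `inputs.cuda-version != 'cpu' && '1' || '0'` expression used for
+ # USE_CUDA below (and for USE_CUDA and DEBUG in the build workflow): GitHub Actions
+ # expressions have no conditional operator, so `cond && a || b` is the usual
+ # work-around; it yields `a` when the condition is true and `b` otherwise, which is
+ # safe here because '1' is a truthy value.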
+ if: github.repository_owner == 'pytorch' + strategy: + matrix: ${{ fromJSON(inputs.test-matrix) }} + fail-fast: false + runs-on: ${{ matrix.runner }} + timeout-minutes: 300 + steps: + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + with: + no-sudo: true + + - name: Setup Windows + uses: ./.github/actions/setup-win + with: + cuda-version: ${{ inputs.cuda-version }} + + - name: Setup SSH (Click me for login details) + uses: ./.github/actions/setup-ssh + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + + - name: Download PyTorch Build Artifacts + uses: seemethere/download-artifact-s3@v3 + with: + name: ${{ env.BUILD_ENVIRONMENT }} + path: C:\${{ github.run_id }}\build-results + + - name: Check build-results folder + shell: powershell + run: | + tree /F C:\$Env:GITHUB_RUN_ID\build-results + + - name: Test + shell: bash + env: + USE_CUDA: ${{ inputs.cuda-version != 'cpu' && '1' || '0' }} + INSTALL_WINDOWS_SDK: 1 + PYTHON_VERSION: 3.8 + PYTORCH_RETRY_TEST_CASES: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + VC_PRODUCT: "BuildTools" + VC_VERSION: "" + VS_VERSION: "16.8.6" + VC_YEAR: "2019" + AWS_DEFAULT_REGION: us-east-1 + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + CUDA_VERSION: ${{ inputs.cuda-version }} + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + SHARD_NUMBER: ${{ matrix.shard }} + NUM_TEST_SHARDS: ${{ matrix.num_shards }} + TEST_CONFIG: ${{ matrix.config }} + JOB_BASE_NAME: ${{ inputs.build-environment }}-test + PR_BODY: ${{ github.event.pull_request.body }} + TORCH_CUDA_ARCH_LIST: "7.0" + run: | + COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-master}") + export COMMIT_MESSAGES + .jenkins/pytorch/win-test.sh + + - name: Get workflow job id + id: get-job-id + uses: pytorch/pytorch/.github/actions/get-workflow-job-id@master + if: always() + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Upload test artifacts + uses: ./.github/actions/upload-test-artifacts + if: always() + with: + file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }} + + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + + - name: Upload test statistics + if: always() + env: + AWS_DEFAULT_REGION: us-east-1 + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: ${{ inputs.build-environment }}-test + TEST_CONFIG: ${{ matrix.config }} + SHARD_NUMBER: ${{ matrix.shard }} + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: ${{ github.run_id }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GHA_WORKFLOW_JOB_ID: ${{ steps.get-job-id.outputs.job-id }} + shell: bash + run: | + set -x + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + + - name: Teardown Windows + uses: ./.github/actions/teardown-win + if: always() + timeout-minutes: 120 diff --git a/.github/workflows/buck_build_test.yml 
b/.github/workflows/buck_build_test.yml new file mode 100644 index 000000000000..3104a9982895 --- /dev/null +++ b/.github/workflows/buck_build_test.yml @@ -0,0 +1,116 @@ +name: buck + +on: + push: + tags: + # Trigger on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/trunk/*' + branches: + - master + - main + - release/* + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +defaults: + run: + shell: bash -e -l {0} + +jobs: + + buck-build-test: + runs-on: ubuntu-latest + env: + JOB_BASE_NAME: ubuntu-latest-buck + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + + - name: Set up JDK 8 + uses: actions/setup-java@v3 + with: + java-version: '8' + distribution: 'temurin' + + - name: Setup miniconda + uses: conda-incubator/setup-miniconda@v2 + with: + auto-update-conda: true + python-version: 3.8 + activate-environment: build + + - name: Install dependencies + run: | + conda install -y \ + cffi \ + cmake \ + mkl \ + mkl-include \ + ninja \ + numpy \ + pyyaml \ + requests \ + setuptools \ + typing_extensions + + - name: Install Buck + run: | + wget https://github.com/facebook/buck/releases/download/v2021.01.12.01/buck.2021.01.12.01_all.deb + sudo apt install ./buck.2021.01.12.01_all.deb + + - name: Download third party libraries and generate wrappers + run: | + sh scripts/buck_setup.sh + + - name: Build glog + run: | + buck build third_party:glog + + - name: Build C10 + run: | + buck build c10:c10 + + - name: Build cpuinfo + run: | + buck build third_party:cpuinfo + + - name: Build pthreadpool + run: | + buck build third_party:pthreadpool + + - name: Build XNNPACK + run: | + buck build third_party:XNNPACK + + - name: Build QNNPACK + run: | + buck build aten/src/ATen/native/quantized/cpu/qnnpack/... --keep-going + + - name: Build aten_cpu + run: | + buck build :aten_cpu + + - name: Build torch_mobile_core + run: | + buck build :torch_mobile_core + + - name: Build torch_mobile_all_ops + run: | + buck build :torch_mobile_all_ops + + - name: Build mobile benchmark + run: | + buck build :ptmobile_benchmark + + - name: Run lite interpreter model + run: | + buck run :ptmobile_benchmark -- --model=ios/TestApp/models/mobilenet_v2.ptl --input_dims=1,3,224,224 --input_type=float + + - name: Build everything + run: | + buck build //... --keep-going diff --git a/.github/workflows/create_release.yml b/.github/workflows/create_release.yml index f5432f0b40c9..605aa8b05b49 100644 --- a/.github/workflows/create_release.yml +++ b/.github/workflows/create_release.yml @@ -3,7 +3,10 @@ name: Create Release on: push: tags: ['v*'] - branches: [master] + branches: + - master + - main + - nightly release: types: [published] pull_request: @@ -18,6 +21,7 @@ jobs: - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: submodules: 'recursive' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - name: Fake name for PRs if: ${{ github.event_name == 'pull_request' }} run: echo "PT_GITHUB_REF=refs/tags/pr-tag" >> "$GITHUB_ENV" @@ -37,7 +41,7 @@ jobs: cp -r "$PWD" "/tmp/$PT_RELEASE_NAME" mv "/tmp/$PT_RELEASE_NAME" . 
# Cleanup - rm -r "$PT_RELEASE_NAME"/{.azure_pipelines,.circleci,.jenkins} + rm -rf "$PT_RELEASE_NAME"/{.circleci,.jenkins} find "$PT_RELEASE_NAME" -name '.git*' -exec rm -rv {} \; || true # Create archive tar -czf "$PT_RELEASE_FILE" "$PT_RELEASE_NAME" @@ -49,5 +53,5 @@ jobs: files: ${{env.PT_RELEASE_FILE}} concurrency: - group: create-release-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml new file mode 100644 index 000000000000..8cfca9514b11 --- /dev/null +++ b/.github/workflows/docker-builds.yml @@ -0,0 +1,78 @@ +name: docker-builds + +on: + workflow_dispatch: + pull_request: + paths: + - .circleci/docker/** + - .github/workflows/docker-builds.yml + schedule: + - cron: 1 3 * * 3 + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +env: + ALPINE_IMAGE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine + AWS_DEFAULT_REGION: us-east-1 + +jobs: + docker-build: + runs-on: [self-hosted, linux.2xlarge] + timeout-minutes: 240 + strategy: + matrix: + include: + - docker-image-name: pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7 + - docker-image-name: pytorch-linux-bionic-cuda11.3-cudnn8-py3-clang9 + - docker-image-name: pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7 + - docker-image-name: pytorch-linux-bionic-py3.7-clang9 + - docker-image-name: pytorch-linux-bionic-rocm5.0-py3.7 + - docker-image-name: pytorch-linux-bionic-rocm5.1-py3.7 + - docker-image-name: pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7 + - docker-image-name: pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7 + - docker-image-name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c + - docker-image-name: pytorch-linux-xenial-py3-clang5-asan + - docker-image-name: pytorch-linux-xenial-py3-clang7-asan + - docker-image-name: pytorch-linux-xenial-py3-clang7-onnx + - docker-image-name: pytorch-linux-xenial-py3.7-gcc5.4 + - docker-image-name: pytorch-linux-xenial-py3.7-gcc7 + - docker-image-name: pytorch-linux-focal-py3.7-gcc7 + env: + DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/${{ matrix.docker-image-name }} + steps: + - name: Clean workspace + shell: bash + run: | + echo "${GITHUB_WORKSPACE}" + sudo rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + + # [see note: pytorch repo ref] + # deep clone (fetch-depth 0) required for git merge-base + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + + - name: Setup Linux + uses: ./.github/actions/setup-linux + + - name: Build docker image + id: build-docker-image + uses: ./.github/actions/calculate-docker-image + with: + docker-image-name: ${{ matrix.docker-image-name }} + always-rebuild: true + + - name: Pull docker image + uses: ./.github/actions/pull-docker-image + with: + docker-image: ${{ steps.build-docker-image.outputs.docker-image }} + + - name: Chown workspace + uses: ./.github/actions/chown-workspace + if: always() + + - name: Teardown Linux + uses: ./.github/actions/teardown-linux + if: always() diff --git a/.github/workflows/generated-caffe2-linux-xenial-py3.7-gcc5.4.yml b/.github/workflows/generated-caffe2-linux-xenial-py3.7-gcc5.4.yml deleted file mode 100644 index 
c1932cbf09e8..000000000000 --- a/.github/workflows/generated-caffe2-linux-xenial-py3.7-gcc5.4.yml +++ /dev/null @@ -1,248 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: caffe2-linux-xenial-py3.7-gcc5.4 - -on: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cpu/*' - - 'ciflow/linux/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: caffe2-linux-xenial-py3.7-gcc5.4 - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.7-gcc5.4 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: caffe2-linux-xenial-py3.7-gcc5.4-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: caffe2-linux-xenial-py3.7-gcc5.4-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
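The generated workflow being deleted here spells out the ECR login, retry helper, and workspace chown inline in every job; the new docker-builds.yml added earlier in this diff pushes those duties into reusable composite actions and wires them together through step outputs. A condensed sketch of that chain follows — only the inputs and outputs visible in this diff are shown, and anything beyond them is an assumption.

```yaml
# Sketch of the composite-action chain from the new docker-builds.yml;
# inputs/outputs are the ones visible in this diff, everything else is assumed.
jobs:
  docker-build:
    runs-on: [self-hosted, linux.2xlarge]
    steps:
      - name: Checkout PyTorch
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
      - name: Setup Linux                     # presumably handles ECR login etc. (not shown in this diff)
        uses: ./.github/actions/setup-linux
      - name: Build docker image
        id: build-docker-image
        uses: ./.github/actions/calculate-docker-image
        with:
          docker-image-name: pytorch-linux-bionic-py3.7-clang9
          always-rebuild: true
      - name: Pull docker image
        uses: ./.github/actions/pull-docker-image
        with:
          docker-image: ${{ steps.build-docker-image.outputs.docker-image }}
      - name: Teardown Linux
        uses: ./.github/actions/teardown-linux
        if: always()
```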
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
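The Build step above wraps .jenkins/pytorch/build.sh in a detached container with a long list of `-e` passthroughs. Stripped to its essentials — most environment passthroughs and the proxy settings omitted — the pattern the generated workflows relied on is:

```yaml
# Condensed sketch of the detached-container build used by these generated
# workflows; most -e env passthroughs are omitted here for brevity.
- name: Build
  run: |
    container_name=$(docker run \
      -e BUILD_ENVIRONMENT \
      -e MAX_JOBS="$(nproc --ignore=2)" \
      --detach --tty --user jenkins \
      -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
      -w /var/lib/jenkins/workspace \
      "${DOCKER_IMAGE}")
    docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh'
```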
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-docker-builds.yml b/.github/workflows/generated-docker-builds.yml deleted file mode 100644 index 785c65d45b9b..000000000000 --- a/.github/workflows/generated-docker-builds.yml +++ /dev/null @@ -1,175 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/docker_builds_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: docker-builds - -on: - workflow_dispatch: - pull_request: - types: [opened, synchronize, reopened] - paths: - - '.circleci/docker/**' - - '.github/workflows/generated-docker-builds.yml' - schedule: - - cron: 1 3 * * 3 -concurrency: - group: docker-builds-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -env: - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - AWS_DEFAULT_REGION: us-east-1 - -jobs: - - docker-build: - runs-on: linux.2xlarge - timeout-minutes: 240 - strategy: - matrix: - include: - - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.7-clang9' - docker_image_short_name: 'pytorch-linux-bionic-cuda10.2-cudnn7-py3.7-clang9' - - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7' - docker_image_short_name: 'pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7' - - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda11.5-cudnn8-py3-gcc7' - docker_image_short_name: 'pytorch-linux-bionic-cuda11.5-cudnn8-py3-gcc7' - - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-py3.7-clang9' - docker_image_short_name: 'pytorch-linux-bionic-py3.7-clang9' - - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-rocm4.3.1-py3.7' - docker_image_short_name: 'pytorch-linux-bionic-rocm4.3.1-py3.7' - - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-rocm4.5-py3.7' - docker_image_short_name: 'pytorch-linux-bionic-rocm4.5-py3.7' - - docker_image_base: 
'308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7' - docker_image_short_name: 'pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7' - - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7' - docker_image_short_name: 'pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7' - - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7' - docker_image_short_name: 'pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7' - - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c' - docker_image_short_name: 'pytorch-linux-xenial-py3-clang5-android-ndk-r19c' - - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-asan' - docker_image_short_name: 'pytorch-linux-xenial-py3-clang5-asan' - - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang7-asan' - docker_image_short_name: 'pytorch-linux-xenial-py3-clang7-asan' - - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang7-onnx' - docker_image_short_name: 'pytorch-linux-xenial-py3-clang7-onnx' - - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.7-gcc5.4' - docker_image_short_name: 'pytorch-linux-xenial-py3.7-gcc5.4' - - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.7-gcc7' - docker_image_short_name: 'pytorch-linux-xenial-py3.7-gcc7' - env: - DOCKER_IMAGE_BASE: '${{ matrix.docker_image_base }}' - name: docker-build (${{ matrix.docker_image_short_name }}) - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
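The matrix being deleted here pairs each full ECR path with a short name; the replacement docker-builds.yml added earlier in this diff lists only the short docker-image-name and derives the ECR path once at the job level. A rough before/after sketch, trimmed to a single entry for illustration:

```yaml
# Before: generated-docker-builds.yml carried both the full ECR path and a short name:
#   - docker_image_base: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-py3.7-clang9
#     docker_image_short_name: pytorch-linux-bionic-py3.7-clang9
# After: docker-builds.yml lists only the short name and derives the base once.
jobs:
  docker-build:
    runs-on: [self-hosted, linux.2xlarge]
    strategy:
      matrix:
        include:
          - docker-image-name: pytorch-linux-bionic-py3.7-clang9   # one entry shown; the diff lists fifteen
    env:
      DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/${{ matrix.docker-image-name }}
    steps:
      - run: echo "building ${DOCKER_IMAGE_BASE}"
```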
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-ios-12-5-1-arm64-coreml.yml b/.github/workflows/generated-ios-12-5-1-arm64-coreml.yml deleted file mode 100644 index 6995b22347e1..000000000000 --- a/.github/workflows/generated-ios-12-5-1-arm64-coreml.yml +++ /dev/null @@ -1,143 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/ios_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: ios-12-5-1-arm64-coreml - -on: - push: - branches: - - master - - release/* - tags: - - 'ciflow/all/*' - - 'ciflow/ios/*' - - 'ciflow/macos/*' - - 'ciflow/trunk/*' - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: ios-12-5-1-arm64-coreml - IN_CI: 1 - IS_GHA: 1 - IOS_PLATFORM: OS - IOS_ARCH: arm64 - - -jobs: - - build: - # NOTE: These builds will not run successfully without running on `pytorch/pytorch` due to the limitations - # of accessing secrets from forked pull requests and IOS' dependency on secrets for their build/test - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - runs-on: macos-10.15 - timeout-minutes: 240 - env: - JOB_BASE_NAME: ios-12-5-1-arm64-coreml-build - IOS_CERT_KEY_2022: ${{ secrets.IOS_CERT_KEY_2022 }} - IOS_CERT_SECRET: ${{ secrets.IOS_CERT_SECRET }} - IOS_DEV_TEAM_ID: ${{ secrets.IOS_DEV_TEAM_ID }} - IOS_SIGN_KEY_2022: ${{ secrets.IOS_SIGN_KEY_2022 }} - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Populate CI build options - run: | - # Most builds use the lite interpreter, if certain builds shouldn't - # build the lite interpreter this env variable should get over-written - # in the following case statement - echo "BUILD_LITE_INTERPRETER=1" >> "${GITHUB_ENV}" - - case ${BUILD_ENVIRONMENT} in - *metal*) - echo "USE_PYTORCH_METAL=1" >> "${GITHUB_ENV}" - ;; - *full_jit*) - echo "BUILD_LITE_INTERPRETER=0" >> "${GITHUB_ENV}" - ;; - *custom*) - echo "SELECTED_OP_LIST=${GITHUB_WORKSPACE}/ios/TestApp/custom_build/mobilenetv2.yaml" >> "${GITHUB_ENV}" - ;; - *coreml*) - echo "USE_COREML_DELEGATE=1" >> "${GITHUB_ENV}" - ;; - esac - - name: Install brew dependencies - run: | - # Install dependencies - brew install libtool - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - # 
shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - conda install -y \ - cffi \ - cmake \ - mkl \ - mkl-include \ - ninja \ - numpy \ - pyyaml \ - requests \ - setuptools \ - typing_extensions - - name: Run Fastlane - run: | - set -x - cd ios/TestApp - # install fastlane - sudo gem install bundler && bundle install - # install certificates - echo "${IOS_CERT_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o Certificates.p12 - rm cert.txt - bundle exec fastlane install_root_cert - bundle exec fastlane install_dev_cert - # install the provisioning profile - PROFILE=PyTorch_CI_2022.mobileprovision - PROVISIONING_PROFILES=~/Library/MobileDevice/Provisioning\ Profiles - mkdir -pv "${PROVISIONING_PROFILES}" - cd "${PROVISIONING_PROFILES}" - echo "${IOS_SIGN_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o ${PROFILE} - rm cert.txt - - name: Build - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - export TCLLIBPATH="/usr/local/lib" - python -VV - export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"} - scripts/build_ios.sh - - name: Run Build Test - run: | - PROFILE=PyTorch_CI_2022 - # run the ruby build script - if ! [ -x "$(command -v xcodebuild)" ]; then - echo 'Error: xcodebuild is not installed.' - exit 1 - fi - if [ "${IOS_PLATFORM}" != "SIMULATOR" ]; then - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" -c "${PROFILE}" -t "${IOS_DEV_TEAM_ID}" - else - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" - fi - -concurrency: - group: ios-12-5-1-arm64-coreml-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true diff --git a/.github/workflows/generated-ios-12-5-1-arm64-custom-ops.yml b/.github/workflows/generated-ios-12-5-1-arm64-custom-ops.yml deleted file mode 100644 index 0fd77eef8605..000000000000 --- a/.github/workflows/generated-ios-12-5-1-arm64-custom-ops.yml +++ /dev/null @@ -1,143 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/ios_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: ios-12-5-1-arm64-custom-ops - -on: - push: - branches: - - master - - release/* - tags: - - 'ciflow/all/*' - - 'ciflow/ios/*' - - 'ciflow/macos/*' - - 'ciflow/trunk/*' - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: ios-12-5-1-arm64-custom-ops - IN_CI: 1 - IS_GHA: 1 - IOS_PLATFORM: OS - IOS_ARCH: arm64 - - -jobs: - - build: - # NOTE: These builds will not run successfully without running on `pytorch/pytorch` due to the limitations - # of accessing secrets from forked pull requests and IOS' dependency on secrets for their build/test - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - runs-on: macos-10.15 - timeout-minutes: 240 - env: - JOB_BASE_NAME: ios-12-5-1-arm64-custom-ops-build - IOS_CERT_KEY_2022: ${{ secrets.IOS_CERT_KEY_2022 }} - IOS_CERT_SECRET: ${{ secrets.IOS_CERT_SECRET }} - IOS_DEV_TEAM_ID: ${{ secrets.IOS_DEV_TEAM_ID }} - IOS_SIGN_KEY_2022: ${{ secrets.IOS_SIGN_KEY_2022 }} - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: 
Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Populate CI build options - run: | - # Most builds use the lite interpreter, if certain builds shouldn't - # build the lite interpreter this env variable should get over-written - # in the following case statement - echo "BUILD_LITE_INTERPRETER=1" >> "${GITHUB_ENV}" - - case ${BUILD_ENVIRONMENT} in - *metal*) - echo "USE_PYTORCH_METAL=1" >> "${GITHUB_ENV}" - ;; - *full_jit*) - echo "BUILD_LITE_INTERPRETER=0" >> "${GITHUB_ENV}" - ;; - *custom*) - echo "SELECTED_OP_LIST=${GITHUB_WORKSPACE}/ios/TestApp/custom_build/mobilenetv2.yaml" >> "${GITHUB_ENV}" - ;; - *coreml*) - echo "USE_COREML_DELEGATE=1" >> "${GITHUB_ENV}" - ;; - esac - - name: Install brew dependencies - run: | - # Install dependencies - brew install libtool - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - conda install -y \ - cffi \ - cmake \ - mkl \ - mkl-include \ - ninja \ - numpy \ - pyyaml \ - requests \ - setuptools \ - typing_extensions - - name: Run Fastlane - run: | - set -x - cd ios/TestApp - # install fastlane - sudo gem install bundler && bundle install - # install certificates - echo "${IOS_CERT_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o Certificates.p12 - rm cert.txt - bundle exec fastlane install_root_cert - bundle exec fastlane install_dev_cert - # install the provisioning profile - PROFILE=PyTorch_CI_2022.mobileprovision - PROVISIONING_PROFILES=~/Library/MobileDevice/Provisioning\ Profiles - mkdir -pv "${PROVISIONING_PROFILES}" - cd "${PROVISIONING_PROFILES}" - echo "${IOS_SIGN_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o ${PROFILE} - rm cert.txt - - name: Build - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - export TCLLIBPATH="/usr/local/lib" - python -VV - export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"} - scripts/build_ios.sh - - name: Run Build Test - run: | - PROFILE=PyTorch_CI_2022 - # run the ruby build script - if ! [ -x "$(command -v xcodebuild)" ]; then - echo 'Error: xcodebuild is not installed.' 
- exit 1 - fi - if [ "${IOS_PLATFORM}" != "SIMULATOR" ]; then - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" -c "${PROFILE}" -t "${IOS_DEV_TEAM_ID}" - else - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" - fi - -concurrency: - group: ios-12-5-1-arm64-custom-ops-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true diff --git a/.github/workflows/generated-ios-12-5-1-arm64-full-jit.yml b/.github/workflows/generated-ios-12-5-1-arm64-full-jit.yml deleted file mode 100644 index 876e1e811f1b..000000000000 --- a/.github/workflows/generated-ios-12-5-1-arm64-full-jit.yml +++ /dev/null @@ -1,143 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/ios_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: ios-12-5-1-arm64-full-jit - -on: - push: - branches: - - master - - release/* - tags: - - 'ciflow/all/*' - - 'ciflow/ios/*' - - 'ciflow/macos/*' - - 'ciflow/trunk/*' - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: ios-12-5-1-arm64-full-jit - IN_CI: 1 - IS_GHA: 1 - IOS_PLATFORM: OS - IOS_ARCH: arm64 - - -jobs: - - build: - # NOTE: These builds will not run successfully without running on `pytorch/pytorch` due to the limitations - # of accessing secrets from forked pull requests and IOS' dependency on secrets for their build/test - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - runs-on: macos-10.15 - timeout-minutes: 240 - env: - JOB_BASE_NAME: ios-12-5-1-arm64-full-jit-build - IOS_CERT_KEY_2022: ${{ secrets.IOS_CERT_KEY_2022 }} - IOS_CERT_SECRET: ${{ secrets.IOS_CERT_SECRET }} - IOS_DEV_TEAM_ID: ${{ secrets.IOS_DEV_TEAM_ID }} - IOS_SIGN_KEY_2022: ${{ secrets.IOS_SIGN_KEY_2022 }} - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Populate CI build options - run: | - # Most builds use the lite interpreter, if certain builds shouldn't - # build the lite interpreter this env variable should get over-written - # in the following case statement - echo "BUILD_LITE_INTERPRETER=1" >> "${GITHUB_ENV}" - - case ${BUILD_ENVIRONMENT} in - *metal*) - echo "USE_PYTORCH_METAL=1" >> "${GITHUB_ENV}" - ;; - *full_jit*) - echo "BUILD_LITE_INTERPRETER=0" >> "${GITHUB_ENV}" - ;; - *custom*) - echo "SELECTED_OP_LIST=${GITHUB_WORKSPACE}/ios/TestApp/custom_build/mobilenetv2.yaml" >> "${GITHUB_ENV}" - ;; - *coreml*) - echo "USE_COREML_DELEGATE=1" >> "${GITHUB_ENV}" - ;; - esac - - name: Install brew dependencies - run: | - # Install dependencies - brew install libtool - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - # shellcheck disable=SC1091 - source 
"${RUNNER_TEMP}/anaconda/bin/activate" - conda install -y \ - cffi \ - cmake \ - mkl \ - mkl-include \ - ninja \ - numpy \ - pyyaml \ - requests \ - setuptools \ - typing_extensions - - name: Run Fastlane - run: | - set -x - cd ios/TestApp - # install fastlane - sudo gem install bundler && bundle install - # install certificates - echo "${IOS_CERT_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o Certificates.p12 - rm cert.txt - bundle exec fastlane install_root_cert - bundle exec fastlane install_dev_cert - # install the provisioning profile - PROFILE=PyTorch_CI_2022.mobileprovision - PROVISIONING_PROFILES=~/Library/MobileDevice/Provisioning\ Profiles - mkdir -pv "${PROVISIONING_PROFILES}" - cd "${PROVISIONING_PROFILES}" - echo "${IOS_SIGN_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o ${PROFILE} - rm cert.txt - - name: Build - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - export TCLLIBPATH="/usr/local/lib" - python -VV - export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"} - scripts/build_ios.sh - - name: Run Build Test - run: | - PROFILE=PyTorch_CI_2022 - # run the ruby build script - if ! [ -x "$(command -v xcodebuild)" ]; then - echo 'Error: xcodebuild is not installed.' - exit 1 - fi - if [ "${IOS_PLATFORM}" != "SIMULATOR" ]; then - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" -c "${PROFILE}" -t "${IOS_DEV_TEAM_ID}" - else - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" - fi - -concurrency: - group: ios-12-5-1-arm64-full-jit-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true diff --git a/.github/workflows/generated-ios-12-5-1-arm64-metal.yml b/.github/workflows/generated-ios-12-5-1-arm64-metal.yml deleted file mode 100644 index 065f311e90f9..000000000000 --- a/.github/workflows/generated-ios-12-5-1-arm64-metal.yml +++ /dev/null @@ -1,143 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/ios_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: ios-12-5-1-arm64-metal - -on: - push: - branches: - - master - - release/* - tags: - - 'ciflow/all/*' - - 'ciflow/ios/*' - - 'ciflow/macos/*' - - 'ciflow/trunk/*' - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: ios-12-5-1-arm64-metal - IN_CI: 1 - IS_GHA: 1 - IOS_PLATFORM: OS - IOS_ARCH: arm64 - - -jobs: - - build: - # NOTE: These builds will not run successfully without running on `pytorch/pytorch` due to the limitations - # of accessing secrets from forked pull requests and IOS' dependency on secrets for their build/test - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - runs-on: macos-10.15 - timeout-minutes: 240 - env: - JOB_BASE_NAME: ios-12-5-1-arm64-metal-build - IOS_CERT_KEY_2022: ${{ secrets.IOS_CERT_KEY_2022 }} - IOS_CERT_SECRET: ${{ secrets.IOS_CERT_SECRET }} - IOS_DEV_TEAM_ID: ${{ secrets.IOS_DEV_TEAM_ID }} - IOS_SIGN_KEY_2022: ${{ secrets.IOS_SIGN_KEY_2022 }} - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from 
the previous checkouts - git clean -fxd - - name: Populate CI build options - run: | - # Most builds use the lite interpreter, if certain builds shouldn't - # build the lite interpreter this env variable should get over-written - # in the following case statement - echo "BUILD_LITE_INTERPRETER=1" >> "${GITHUB_ENV}" - - case ${BUILD_ENVIRONMENT} in - *metal*) - echo "USE_PYTORCH_METAL=1" >> "${GITHUB_ENV}" - ;; - *full_jit*) - echo "BUILD_LITE_INTERPRETER=0" >> "${GITHUB_ENV}" - ;; - *custom*) - echo "SELECTED_OP_LIST=${GITHUB_WORKSPACE}/ios/TestApp/custom_build/mobilenetv2.yaml" >> "${GITHUB_ENV}" - ;; - *coreml*) - echo "USE_COREML_DELEGATE=1" >> "${GITHUB_ENV}" - ;; - esac - - name: Install brew dependencies - run: | - # Install dependencies - brew install libtool - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - conda install -y \ - cffi \ - cmake \ - mkl \ - mkl-include \ - ninja \ - numpy \ - pyyaml \ - requests \ - setuptools \ - typing_extensions - - name: Run Fastlane - run: | - set -x - cd ios/TestApp - # install fastlane - sudo gem install bundler && bundle install - # install certificates - echo "${IOS_CERT_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o Certificates.p12 - rm cert.txt - bundle exec fastlane install_root_cert - bundle exec fastlane install_dev_cert - # install the provisioning profile - PROFILE=PyTorch_CI_2022.mobileprovision - PROVISIONING_PROFILES=~/Library/MobileDevice/Provisioning\ Profiles - mkdir -pv "${PROVISIONING_PROFILES}" - cd "${PROVISIONING_PROFILES}" - echo "${IOS_SIGN_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o ${PROFILE} - rm cert.txt - - name: Build - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - export TCLLIBPATH="/usr/local/lib" - python -VV - export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"} - scripts/build_ios.sh - - name: Run Build Test - run: | - PROFILE=PyTorch_CI_2022 - # run the ruby build script - if ! [ -x "$(command -v xcodebuild)" ]; then - echo 'Error: xcodebuild is not installed.' 
- exit 1 - fi - if [ "${IOS_PLATFORM}" != "SIMULATOR" ]; then - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" -c "${PROFILE}" -t "${IOS_DEV_TEAM_ID}" - else - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" - fi - -concurrency: - group: ios-12-5-1-arm64-metal-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true diff --git a/.github/workflows/generated-ios-12-5-1-arm64.yml b/.github/workflows/generated-ios-12-5-1-arm64.yml deleted file mode 100644 index 2de63df26293..000000000000 --- a/.github/workflows/generated-ios-12-5-1-arm64.yml +++ /dev/null @@ -1,143 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/ios_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: ios-12-5-1-arm64 - -on: - push: - branches: - - master - - release/* - tags: - - 'ciflow/all/*' - - 'ciflow/ios/*' - - 'ciflow/macos/*' - - 'ciflow/trunk/*' - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: ios-12-5-1-arm64 - IN_CI: 1 - IS_GHA: 1 - IOS_PLATFORM: OS - IOS_ARCH: arm64 - - -jobs: - - build: - # NOTE: These builds will not run successfully without running on `pytorch/pytorch` due to the limitations - # of accessing secrets from forked pull requests and IOS' dependency on secrets for their build/test - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - runs-on: macos-10.15 - timeout-minutes: 240 - env: - JOB_BASE_NAME: ios-12-5-1-arm64-build - IOS_CERT_KEY_2022: ${{ secrets.IOS_CERT_KEY_2022 }} - IOS_CERT_SECRET: ${{ secrets.IOS_CERT_SECRET }} - IOS_DEV_TEAM_ID: ${{ secrets.IOS_DEV_TEAM_ID }} - IOS_SIGN_KEY_2022: ${{ secrets.IOS_SIGN_KEY_2022 }} - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Populate CI build options - run: | - # Most builds use the lite interpreter, if certain builds shouldn't - # build the lite interpreter this env variable should get over-written - # in the following case statement - echo "BUILD_LITE_INTERPRETER=1" >> "${GITHUB_ENV}" - - case ${BUILD_ENVIRONMENT} in - *metal*) - echo "USE_PYTORCH_METAL=1" >> "${GITHUB_ENV}" - ;; - *full_jit*) - echo "BUILD_LITE_INTERPRETER=0" >> "${GITHUB_ENV}" - ;; - *custom*) - echo "SELECTED_OP_LIST=${GITHUB_WORKSPACE}/ios/TestApp/custom_build/mobilenetv2.yaml" >> "${GITHUB_ENV}" - ;; - *coreml*) - echo "USE_COREML_DELEGATE=1" >> "${GITHUB_ENV}" - ;; - esac - - name: Install brew dependencies - run: | - # Install dependencies - brew install libtool - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - conda install -y 
\ - cffi \ - cmake \ - mkl \ - mkl-include \ - ninja \ - numpy \ - pyyaml \ - requests \ - setuptools \ - typing_extensions - - name: Run Fastlane - run: | - set -x - cd ios/TestApp - # install fastlane - sudo gem install bundler && bundle install - # install certificates - echo "${IOS_CERT_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o Certificates.p12 - rm cert.txt - bundle exec fastlane install_root_cert - bundle exec fastlane install_dev_cert - # install the provisioning profile - PROFILE=PyTorch_CI_2022.mobileprovision - PROVISIONING_PROFILES=~/Library/MobileDevice/Provisioning\ Profiles - mkdir -pv "${PROVISIONING_PROFILES}" - cd "${PROVISIONING_PROFILES}" - echo "${IOS_SIGN_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o ${PROFILE} - rm cert.txt - - name: Build - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - export TCLLIBPATH="/usr/local/lib" - python -VV - export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"} - scripts/build_ios.sh - - name: Run Build Test - run: | - PROFILE=PyTorch_CI_2022 - # run the ruby build script - if ! [ -x "$(command -v xcodebuild)" ]; then - echo 'Error: xcodebuild is not installed.' - exit 1 - fi - if [ "${IOS_PLATFORM}" != "SIMULATOR" ]; then - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" -c "${PROFILE}" -t "${IOS_DEV_TEAM_ID}" - else - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" - fi - -concurrency: - group: ios-12-5-1-arm64-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true diff --git a/.github/workflows/generated-ios-12-5-1-x86-64-coreml.yml b/.github/workflows/generated-ios-12-5-1-x86-64-coreml.yml deleted file mode 100644 index 4306711a6210..000000000000 --- a/.github/workflows/generated-ios-12-5-1-x86-64-coreml.yml +++ /dev/null @@ -1,176 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/ios_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: ios-12-5-1-x86-64-coreml - -on: - push: - branches: - - master - - release/* - tags: - - 'ciflow/all/*' - - 'ciflow/ios/*' - - 'ciflow/macos/*' - - 'ciflow/trunk/*' - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: ios-12-5-1-x86-64-coreml - IN_CI: 1 - IS_GHA: 1 - IOS_PLATFORM: SIMULATOR - IOS_ARCH: x86_64 - - -jobs: - - build: - # NOTE: These builds will not run successfully without running on `pytorch/pytorch` due to the limitations - # of accessing secrets from forked pull requests and IOS' dependency on secrets for their build/test - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - runs-on: macos-10.15 - timeout-minutes: 240 - env: - JOB_BASE_NAME: ios-12-5-1-x86-64-coreml-build - IOS_CERT_KEY_2022: ${{ secrets.IOS_CERT_KEY_2022 }} - IOS_CERT_SECRET: ${{ secrets.IOS_CERT_SECRET }} - IOS_DEV_TEAM_ID: ${{ secrets.IOS_DEV_TEAM_ID }} - IOS_SIGN_KEY_2022: ${{ secrets.IOS_SIGN_KEY_2022 }} - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - 
name: Populate CI build options - run: | - # Most builds use the lite interpreter, if certain builds shouldn't - # build the lite interpreter this env variable should get over-written - # in the following case statement - echo "BUILD_LITE_INTERPRETER=1" >> "${GITHUB_ENV}" - - case ${BUILD_ENVIRONMENT} in - *metal*) - echo "USE_PYTORCH_METAL=1" >> "${GITHUB_ENV}" - ;; - *full_jit*) - echo "BUILD_LITE_INTERPRETER=0" >> "${GITHUB_ENV}" - ;; - *custom*) - echo "SELECTED_OP_LIST=${GITHUB_WORKSPACE}/ios/TestApp/custom_build/mobilenetv2.yaml" >> "${GITHUB_ENV}" - ;; - *coreml*) - echo "USE_COREML_DELEGATE=1" >> "${GITHUB_ENV}" - ;; - esac - - name: Install brew dependencies - run: | - # Install dependencies - brew install libtool - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - conda install -y \ - cffi \ - cmake \ - mkl \ - mkl-include \ - ninja \ - numpy \ - pyyaml \ - requests \ - setuptools \ - typing_extensions - - name: Run Fastlane - run: | - set -x - cd ios/TestApp - # install fastlane - sudo gem install bundler && bundle install - # install certificates - echo "${IOS_CERT_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o Certificates.p12 - rm cert.txt - bundle exec fastlane install_root_cert - bundle exec fastlane install_dev_cert - # install the provisioning profile - PROFILE=PyTorch_CI_2022.mobileprovision - PROVISIONING_PROFILES=~/Library/MobileDevice/Provisioning\ Profiles - mkdir -pv "${PROVISIONING_PROFILES}" - cd "${PROVISIONING_PROFILES}" - echo "${IOS_SIGN_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o ${PROFILE} - rm cert.txt - - name: Build - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - export TCLLIBPATH="/usr/local/lib" - python -VV - export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"} - scripts/build_ios.sh - - name: Run Build Test - run: | - PROFILE=PyTorch_CI_2022 - # run the ruby build script - if ! [ -x "$(command -v xcodebuild)" ]; then - echo 'Error: xcodebuild is not installed.' 
- exit 1 - fi - if [ "${IOS_PLATFORM}" != "SIMULATOR" ]; then - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" -c "${PROFILE}" -t "${IOS_DEV_TEAM_ID}" - else - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" - fi - - name: Run Simulator Tests - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - pip3 install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html - # generate models for differnet backends - cd "${GITHUB_WORKSPACE}/ios/TestApp/benchmark" - mkdir -p ../models - if [ "${USE_COREML_DELEGATE}" == 1 ]; then - pip install coremltools==5.0b5 - pip install six==1.16.0 - python coreml_backend.py - else - python trace_model.py - fi - if [ "${BUILD_LITE_INTERPRETER}" == 1 ]; then - echo "Setting up the TestApp for LiteInterpreter" - ruby setup.rb --lite 1 - else - echo "Setting up the TestApp for Full JIT" - ruby setup.rb - fi - cd "${GITHUB_WORKSPACE}/ios/TestApp" - instruments -s -devices - if [ "${BUILD_LITE_INTERPRETER}" == 1 ]; then - if [ "${USE_COREML_DELEGATE}" == 1 ]; then - fastlane scan --only_testing TestAppTests/TestAppTests/testCoreML - else - fastlane scan --only_testing TestAppTests/TestAppTests/testLiteInterpreter - fi - else - fastlane scan --only_testing TestAppTests/TestAppTests/testFullJIT - fi - -concurrency: - group: ios-12-5-1-x86-64-coreml-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true diff --git a/.github/workflows/generated-ios-12-5-1-x86-64-full-jit.yml b/.github/workflows/generated-ios-12-5-1-x86-64-full-jit.yml deleted file mode 100644 index 18553b414499..000000000000 --- a/.github/workflows/generated-ios-12-5-1-x86-64-full-jit.yml +++ /dev/null @@ -1,176 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/ios_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: ios-12-5-1-x86-64-full-jit - -on: - push: - branches: - - master - - release/* - tags: - - 'ciflow/all/*' - - 'ciflow/ios/*' - - 'ciflow/macos/*' - - 'ciflow/trunk/*' - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: ios-12-5-1-x86-64-full-jit - IN_CI: 1 - IS_GHA: 1 - IOS_PLATFORM: SIMULATOR - IOS_ARCH: x86_64 - - -jobs: - - build: - # NOTE: These builds will not run successfully without running on `pytorch/pytorch` due to the limitations - # of accessing secrets from forked pull requests and IOS' dependency on secrets for their build/test - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - runs-on: macos-10.15 - timeout-minutes: 240 - env: - JOB_BASE_NAME: ios-12-5-1-x86-64-full-jit-build - IOS_CERT_KEY_2022: ${{ secrets.IOS_CERT_KEY_2022 }} - IOS_CERT_SECRET: ${{ secrets.IOS_CERT_SECRET }} - IOS_DEV_TEAM_ID: ${{ secrets.IOS_DEV_TEAM_ID }} - IOS_SIGN_KEY_2022: ${{ secrets.IOS_SIGN_KEY_2022 }} - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Populate CI build options - run: | - # Most builds use the lite interpreter, if 
certain builds shouldn't - # build the lite interpreter this env variable should get over-written - # in the following case statement - echo "BUILD_LITE_INTERPRETER=1" >> "${GITHUB_ENV}" - - case ${BUILD_ENVIRONMENT} in - *metal*) - echo "USE_PYTORCH_METAL=1" >> "${GITHUB_ENV}" - ;; - *full_jit*) - echo "BUILD_LITE_INTERPRETER=0" >> "${GITHUB_ENV}" - ;; - *custom*) - echo "SELECTED_OP_LIST=${GITHUB_WORKSPACE}/ios/TestApp/custom_build/mobilenetv2.yaml" >> "${GITHUB_ENV}" - ;; - *coreml*) - echo "USE_COREML_DELEGATE=1" >> "${GITHUB_ENV}" - ;; - esac - - name: Install brew dependencies - run: | - # Install dependencies - brew install libtool - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - conda install -y \ - cffi \ - cmake \ - mkl \ - mkl-include \ - ninja \ - numpy \ - pyyaml \ - requests \ - setuptools \ - typing_extensions - - name: Run Fastlane - run: | - set -x - cd ios/TestApp - # install fastlane - sudo gem install bundler && bundle install - # install certificates - echo "${IOS_CERT_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o Certificates.p12 - rm cert.txt - bundle exec fastlane install_root_cert - bundle exec fastlane install_dev_cert - # install the provisioning profile - PROFILE=PyTorch_CI_2022.mobileprovision - PROVISIONING_PROFILES=~/Library/MobileDevice/Provisioning\ Profiles - mkdir -pv "${PROVISIONING_PROFILES}" - cd "${PROVISIONING_PROFILES}" - echo "${IOS_SIGN_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o ${PROFILE} - rm cert.txt - - name: Build - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - export TCLLIBPATH="/usr/local/lib" - python -VV - export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"} - scripts/build_ios.sh - - name: Run Build Test - run: | - PROFILE=PyTorch_CI_2022 - # run the ruby build script - if ! [ -x "$(command -v xcodebuild)" ]; then - echo 'Error: xcodebuild is not installed.' 
- exit 1 - fi - if [ "${IOS_PLATFORM}" != "SIMULATOR" ]; then - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" -c "${PROFILE}" -t "${IOS_DEV_TEAM_ID}" - else - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" - fi - - name: Run Simulator Tests - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - pip3 install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html - # generate models for differnet backends - cd "${GITHUB_WORKSPACE}/ios/TestApp/benchmark" - mkdir -p ../models - if [ "${USE_COREML_DELEGATE}" == 1 ]; then - pip install coremltools==5.0b5 - pip install six==1.16.0 - python coreml_backend.py - else - python trace_model.py - fi - if [ "${BUILD_LITE_INTERPRETER}" == 1 ]; then - echo "Setting up the TestApp for LiteInterpreter" - ruby setup.rb --lite 1 - else - echo "Setting up the TestApp for Full JIT" - ruby setup.rb - fi - cd "${GITHUB_WORKSPACE}/ios/TestApp" - instruments -s -devices - if [ "${BUILD_LITE_INTERPRETER}" == 1 ]; then - if [ "${USE_COREML_DELEGATE}" == 1 ]; then - fastlane scan --only_testing TestAppTests/TestAppTests/testCoreML - else - fastlane scan --only_testing TestAppTests/TestAppTests/testLiteInterpreter - fi - else - fastlane scan --only_testing TestAppTests/TestAppTests/testFullJIT - fi - -concurrency: - group: ios-12-5-1-x86-64-full-jit-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true diff --git a/.github/workflows/generated-ios-12-5-1-x86-64.yml b/.github/workflows/generated-ios-12-5-1-x86-64.yml deleted file mode 100644 index 0a92814866ab..000000000000 --- a/.github/workflows/generated-ios-12-5-1-x86-64.yml +++ /dev/null @@ -1,176 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/ios_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: ios-12-5-1-x86-64 - -on: - push: - branches: - - master - - release/* - tags: - - 'ciflow/all/*' - - 'ciflow/ios/*' - - 'ciflow/macos/*' - - 'ciflow/trunk/*' - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: ios-12-5-1-x86-64 - IN_CI: 1 - IS_GHA: 1 - IOS_PLATFORM: SIMULATOR - IOS_ARCH: x86_64 - - -jobs: - - build: - # NOTE: These builds will not run successfully without running on `pytorch/pytorch` due to the limitations - # of accessing secrets from forked pull requests and IOS' dependency on secrets for their build/test - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - runs-on: macos-10.15 - timeout-minutes: 240 - env: - JOB_BASE_NAME: ios-12-5-1-x86-64-build - IOS_CERT_KEY_2022: ${{ secrets.IOS_CERT_KEY_2022 }} - IOS_CERT_SECRET: ${{ secrets.IOS_CERT_SECRET }} - IOS_DEV_TEAM_ID: ${{ secrets.IOS_DEV_TEAM_ID }} - IOS_SIGN_KEY_2022: ${{ secrets.IOS_SIGN_KEY_2022 }} - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Populate CI build options - run: | - # Most builds use the lite interpreter, if certain builds shouldn't - # build the lite 
interpreter this env variable should get over-written - # in the following case statement - echo "BUILD_LITE_INTERPRETER=1" >> "${GITHUB_ENV}" - - case ${BUILD_ENVIRONMENT} in - *metal*) - echo "USE_PYTORCH_METAL=1" >> "${GITHUB_ENV}" - ;; - *full_jit*) - echo "BUILD_LITE_INTERPRETER=0" >> "${GITHUB_ENV}" - ;; - *custom*) - echo "SELECTED_OP_LIST=${GITHUB_WORKSPACE}/ios/TestApp/custom_build/mobilenetv2.yaml" >> "${GITHUB_ENV}" - ;; - *coreml*) - echo "USE_COREML_DELEGATE=1" >> "${GITHUB_ENV}" - ;; - esac - - name: Install brew dependencies - run: | - # Install dependencies - brew install libtool - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - conda install -y \ - cffi \ - cmake \ - mkl \ - mkl-include \ - ninja \ - numpy \ - pyyaml \ - requests \ - setuptools \ - typing_extensions - - name: Run Fastlane - run: | - set -x - cd ios/TestApp - # install fastlane - sudo gem install bundler && bundle install - # install certificates - echo "${IOS_CERT_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o Certificates.p12 - rm cert.txt - bundle exec fastlane install_root_cert - bundle exec fastlane install_dev_cert - # install the provisioning profile - PROFILE=PyTorch_CI_2022.mobileprovision - PROVISIONING_PROFILES=~/Library/MobileDevice/Provisioning\ Profiles - mkdir -pv "${PROVISIONING_PROFILES}" - cd "${PROVISIONING_PROFILES}" - echo "${IOS_SIGN_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o ${PROFILE} - rm cert.txt - - name: Build - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - export TCLLIBPATH="/usr/local/lib" - python -VV - export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"} - scripts/build_ios.sh - - name: Run Build Test - run: | - PROFILE=PyTorch_CI_2022 - # run the ruby build script - if ! [ -x "$(command -v xcodebuild)" ]; then - echo 'Error: xcodebuild is not installed.' 
- exit 1 - fi - if [ "${IOS_PLATFORM}" != "SIMULATOR" ]; then - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" -c "${PROFILE}" -t "${IOS_DEV_TEAM_ID}" - else - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" - fi - - name: Run Simulator Tests - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - pip3 install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html - # generate models for different backends - cd "${GITHUB_WORKSPACE}/ios/TestApp/benchmark" - mkdir -p ../models - if [ "${USE_COREML_DELEGATE}" == 1 ]; then - pip install coremltools==5.0b5 - pip install six==1.16.0 - python coreml_backend.py - else - python trace_model.py - fi - if [ "${BUILD_LITE_INTERPRETER}" == 1 ]; then - echo "Setting up the TestApp for LiteInterpreter" - ruby setup.rb --lite 1 - else - echo "Setting up the TestApp for Full JIT" - ruby setup.rb - fi - cd "${GITHUB_WORKSPACE}/ios/TestApp" - instruments -s -devices - if [ "${BUILD_LITE_INTERPRETER}" == 1 ]; then - if [ "${USE_COREML_DELEGATE}" == 1 ]; then - fastlane scan --only_testing TestAppTests/TestAppTests/testCoreML - else - fastlane scan --only_testing TestAppTests/TestAppTests/testLiteInterpreter - fi - else - fastlane scan --only_testing TestAppTests/TestAppTests/testFullJIT - fi - -concurrency: - group: ios-12-5-1-x86-64-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.7-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.7-gcc7.yml deleted file mode 100644 index fc55ce8dc285..000000000000 --- a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.7-gcc7.yml +++ /dev/null @@ -1,238 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: libtorch-linux-xenial-cuda10.2-py3.7-gcc7 - -on: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cuda/*' - - 'ciflow/libtorch/*' - - 'ciflow/linux/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: libtorch-linux-xenial-cuda10.2-py3.7-gcc7 - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_operator, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: libtorch-linux-xenial-cuda10.2-py3.7-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on:
linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: libtorch-linux-xenial-cuda10.2-py3.7-gcc7-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
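A note on the image-selection logic above: the "Calculate docker image tag" and "Check if image should be built" steps key the CI image tag off the git tree hash of .circleci/docker, so an image rebuild is triggered only when that directory changes. A minimal sketch of the same idea, runnable in any PyTorch checkout; the registry path is a placeholder, not the real ECR repository:

    # The tag is the tree-object hash of the Docker build context, so it only
    # changes when something under .circleci/docker changes.
    DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
    DOCKER_IMAGE="registry.example.com/pytorch-ci:${DOCKER_TAG}"   # placeholder registry
    # Rebuild only if no image has been published for this tag yet.
    if ! docker manifest inspect "${DOCKER_IMAGE}" >/dev/null 2>&1; then
        echo "no image for ${DOCKER_TAG}; a rebuild would be triggered here"
    fi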
- - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.7-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.7-gcc7.yml deleted file mode 100644 index 452c20076104..000000000000 --- a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.7-gcc7.yml +++ /dev/null @@ -1,238 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: libtorch-linux-xenial-cuda11.3-py3.7-gcc7 - -on: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cuda/*' - - 'ciflow/libtorch/*' - - 'ciflow/linux/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: libtorch-linux-xenial-cuda11.3-py3.7-gcc7 - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: libtorch-linux-xenial-cuda11.3-py3.7-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: libtorch-linux-xenial-cuda11.3-py3.7-gcc7-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - 
echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-binary-conda-nightly.yml b/.github/workflows/generated-linux-binary-conda-nightly.yml new file mode 100644 index 000000000000..2a057f2a3fe8 --- /dev/null +++ b/.github/workflows/generated-linux-binary-conda-nightly.yml @@ -0,0 +1,5594 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/linux_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: linux-binary-conda + +on: + push: + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + branches: + - nightly + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/binaries/*' + - 'ciflow/binaries_conda/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BINARY_ENV_FILE: /tmp/env + BUILD_ENVIRONMENT: linux-binary-conda + BUILDER_ROOT: /builder + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_FINAL_PACKAGE_DIR: /artifacts + PYTORCH_RETRY_TEST_CASES: 1 + PYTORCH_ROOT: /pytorch + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 1 +concurrency: + group: linux-binary-conda-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + conda-py3_7-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
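The inline retry helper used throughout these jobs simply re-runs the command after 1s and 2s pauses. A hedged sketch of an equivalent standalone helper with a configurable attempt count; the name retry_n and the linear backoff are illustrative, not part of the generated workflows:

    # Re-run a command up to $1 times, sleeping a little longer after each failure.
    retry_n () {
        local attempts=$1; shift
        local i
        for ((i = 1; i <= attempts; i++)); do
            "$@" && return 0
            sleep "$i"
        done
        return 1
    }
    # Usage, mirroring the workflow: retry_n 3 docker pull "${ALPINE_IMAGE}"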
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: conda-py3_7-cpu + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_7-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_7-cpu-build + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_7-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + 
working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_7-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_7-cpu-test + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_7-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + 
if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_7-cuda10_2-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v 
"${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: conda-py3_7-cuda10_2 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_7-cuda10_2-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_7-cuda10_2-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_7-cuda10_2 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_7-cuda10_2-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_7-cuda10_2-test + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_7-cuda10_2 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
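The upload jobs above gate real uploads on the pushed ref: nightly-branch pushes and release-candidate tags disable DRY_RUN, and an -rcN suffix additionally routes the packages to the test channel. A simplified sketch of that classification; it folds the tag cases into the -rcN pattern (the trigger comment says only release-candidate tags reach these pipelines), ignores the ciflow/ exclusion, and the nightly default channel is an assumption about the downstream upload script:

    ref_name=${1:-nightly}            # e.g. nightly, v1.11.0-rc1
    DRY_RUN=enabled
    UPLOAD_CHANNEL=nightly            # assumed default when the workflow leaves it unset
    if [[ ${ref_name} == nightly || ${ref_name} == *-rc[0-9]* ]]; then
        DRY_RUN=disabled              # real upload for nightlies and release candidates
    fi
    if [[ ${ref_name} == *-rc[0-9]* ]]; then
        UPLOAD_CHANNEL=test           # release candidates go to the test channel
    fi
    echo "DRY_RUN=${DRY_RUN} UPLOAD_CHANNEL=${UPLOAD_CHANNEL}"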
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_7-cuda11_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
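The "Set BUILD_SPLIT_CUDA" step in the CUDA 11.x build jobs relies on the standard GitHub Actions mechanism for passing values between steps: any KEY=value line appended to the file named by $GITHUB_ENV becomes an environment variable for every later step in the same job. A minimal sketch of that mechanism; how the builder scripts consume the flag downstream is not shown in this diff:

    # In one step: export the flag to the rest of the job.
    echo "BUILD_SPLIT_CUDA=ON" >> "$GITHUB_ENV"
    # In a later step of the same job it is a normal environment variable.
    if [ "${BUILD_SPLIT_CUDA}" = "ON" ]; then
        echo "building split CUDA libraries"
    fi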
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: conda-py3_7-cuda11_3 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_7-cuda11_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_7-cuda11_3-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_7-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_7-cuda11_3-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_7-cuda11_3-test + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_7-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_7-cuda11_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: conda-py3_7-cuda11_6 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_7-cuda11_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_7-cuda11_6-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_7-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_7-cuda11_6-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_7-cuda11_6-test + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_7-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_8-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: conda-py3_8-cpu + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_8-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_8-cpu-build + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_8-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + 
-v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_8-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_8-cpu-test + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_8-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_8-cuda10_2-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: conda-py3_8-cuda10_2 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_8-cuda10_2-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_8-cuda10_2-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_8-cuda10_2 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash 
-c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_8-cuda10_2-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_8-cuda10_2-test + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_8-cuda10_2 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_8-cuda11_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: conda-py3_8-cuda11_3 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_8-cuda11_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_8-cuda11_3-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_8-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash 
-c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_8-cuda11_3-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_8-cuda11_3-test + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_8-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_8-cuda11_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: conda-py3_8-cuda11_6 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_8-cuda11_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_8-cuda11_6-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_8-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash 
-c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_8-cuda11_6-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_8-cuda11_6-test + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_8-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_9-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: conda-py3_9-cpu + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_9-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_9-cpu-build + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_9-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + 
working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_9-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_9-cpu-test + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_9-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + 
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Chown workspace
+        if: always()
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Kill containers, clean up images
+        if: always()
+        run: |
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
+          # Prune all of the docker images
+          docker system prune -af
+  conda-py3_9-cuda10_2-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    runs-on: linux.4xlarge
+    timeout-minutes: 240
+    env:
+      PACKAGE_TYPE: conda
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      # favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu102
+      GPU_ARCH_VERSION: 10.2
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: pytorch/conda-builder:cuda10.2
+      SKIP_ALL_TESTS: 1
+      DESIRED_PYTHON: "3.9"
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+      - name: Setup Linux
+        uses: ./.github/actions/setup-linux
+      - name: Chown workspace
+        run: |
+          retry () {
+              "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
+          }
+          retry docker pull "${ALPINE_IMAGE}"
+          # Ensure the working directory gets chowned back to the current user
+          docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Clean workspace
+        run: |
+          rm -rf "${GITHUB_WORKSPACE}"
+          mkdir "${GITHUB_WORKSPACE}"
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Checkout PyTorch
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          submodules: recursive
+          path: pytorch
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: pytorch
+      - name: Checkout pytorch/builder
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
+        with:
+          ref: main
+          submodules: recursive
+          repository: pytorch/builder
+          path: builder
+      - name: Clean pytorch/builder checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: builder
+      - name: Pull Docker image
+        run: |
+          retry () {
+              "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
+          }
+          retry docker pull "${DOCKER_IMAGE}"
+      - name: Build PyTorch binary
+        run: |
+          set -x
+          mkdir -p artifacts/
+          container_name=$(docker run \
+            -e BINARY_ENV_FILE \
+            -e BUILDER_ROOT \
+            -e BUILD_ENVIRONMENT \
+            -e BUILD_SPLIT_CUDA \
+            -e DESIRED_CUDA \
+            -e DESIRED_DEVTOOLSET \
+            -e DESIRED_PYTHON \
+            -e GPU_ARCH_TYPE \
+            -e GPU_ARCH_VERSION \
+            -e IS_GHA \
+            -e LIBTORCH_VARIANT \
+            -e PACKAGE_TYPE \
+            -e PYTORCH_FINAL_PACKAGE_DIR \
+            -e PYTORCH_ROOT \
+            -e SKIP_ALL_TESTS \
+            --tty \
+            --detach \
+            -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \
+            -v "${GITHUB_WORKSPACE}/builder:/builder" \
+            -v "${RUNNER_TEMP}/artifacts:/artifacts" \
+            -w / \
+            "${DOCKER_IMAGE}"
+          )
+          docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh"
+          docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh"
+      - name: Chown artifacts
+        if: always()
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - uses: seemethere/upload-artifact-s3@v4
+        with:
+          name: conda-py3_9-cuda10_2
+          retention-days: 14
+          if-no-files-found: error
+          path:
+            ${{ runner.temp }}/artifacts/*
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        working-directory: pytorch/
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Chown workspace
+        if: always()
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Kill containers, clean up images
+        if: always()
+        run: |
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
+          # Prune all of the docker images
+          docker system prune -af
+  conda-py3_9-cuda10_2-test: # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs: conda-py3_9-cuda10_2-build
+    runs-on: linux.4xlarge.nvidia.gpu
+    timeout-minutes: 240
+    env:
+      PACKAGE_TYPE: conda
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      # favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu102
+      GPU_ARCH_VERSION: 10.2
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: pytorch/conda-builder:cuda10.2
+      SKIP_ALL_TESTS: 1
+      DESIRED_PYTHON: "3.9"
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+      - name: Setup Linux
+        uses: ./.github/actions/setup-linux
+      - name: Chown workspace
+        run: |
+          retry () {
+              "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
+          }
+          retry docker pull "${ALPINE_IMAGE}"
+          # Ensure the working directory gets chowned back to the current user
+          docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_9-cuda10_2 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_9-cuda10_2-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_9-cuda10_2-test + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_9-cuda10_2 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_9-cuda11_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: conda-py3_9-cuda11_3 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_9-cuda11_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_9-cuda11_3-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_9-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_9-cuda11_3-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_9-cuda11_3-test + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_9-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_9-cuda11_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: conda-py3_9-cuda11_6 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_9-cuda11_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_9-cuda11_6-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_9-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_9-cuda11_6-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_9-cuda11_6-test + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_9-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_10-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: conda-py3_10-cpu + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_10-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cpu-build + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_10-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" 
\ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_10-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cpu-test + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_10-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_10-cuda10_2-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: conda-py3_10-cuda10_2 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_10-cuda10_2-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cuda10_2-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_10-cuda10_2 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" 
bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_10-cuda10_2-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cuda10_2-test + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_10-cuda10_2 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_10-cuda11_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
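The upload job above only turns off the dry run for pushes to the nightly branch or to non-ciflow release tags, and routes release-candidate tags (of the form v1.11.0-rc1) to the test channel. A hedged shell sketch of that selection; treating an unset DRY_RUN as an enabled dry run is an assumption, since the workflow itself only ever writes the "disabled" value:

    # Assumed default: the upload script treats a missing DRY_RUN as an enabled dry run.
    DRY_RUN="${DRY_RUN:-enabled}"
    # Tags ending in an RC suffix (e.g. v1.11.0-rc1) upload to the "test" channel.
    if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
      echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV"
    fi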
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: conda-py3_10-cuda11_3 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
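The CUDA 11.3 build job above exports BUILD_SPLIT_CUDA by appending to the GITHUB_ENV file; in GitHub Actions that is what makes a variable set in one step visible to every later step of the same job, including the `docker run -e BUILD_SPLIT_CUDA ...` build step. The single line, with a comment spelling that out:

    # Appending NAME=VALUE to "$GITHUB_ENV" exposes the variable to all subsequent steps in this job;
    # only the CUDA 11.3 / 11.6 build jobs in this workflow carry this step.
    echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV"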
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_10-cuda11_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cuda11_3-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_10-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" 
bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_10-cuda11_3-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cuda11_3-test + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_10-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_10-cuda11_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: conda-py3_10-cuda11_6 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_10-cuda11_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cuda11_6-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_10-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" 
bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_10-cuda11_6-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cuda11_6-test + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_10-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-linux-binary-conda.yml b/.github/workflows/generated-linux-binary-conda.yml deleted file mode 100644 index 6b3a74dec474..000000000000 --- a/.github/workflows/generated-linux-binary-conda.yml +++ /dev/null @@ -1,7986 +0,0 @@ -# @generated DO NOT EDIT MANUALLY - -# Template is at: .github/templates/linux_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-binary-conda - -on: - push: - # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build - branches: - - nightly - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - - 'ciflow/binaries/*' - - 'ciflow/binaries_conda/*' - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - ANACONDA_USER: pytorch - AWS_DEFAULT_REGION: us-east-1 - BINARY_ENV_FILE: /tmp/env - BUILD_ENVIRONMENT: linux-binary-conda - BUILDER_ROOT: /builder - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - IN_CI: 1 - IS_GHA: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - PR_NUMBER: ${{ github.event.pull_request.number }} - PYTORCH_FINAL_PACKAGE_DIR: /artifacts - PYTORCH_RETRY_TEST_CASES: 1 - PYTORCH_ROOT: /pytorch - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 -concurrency: - group: linux-binary-conda-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - conda-py3_7-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/conda-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
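The older generated workflow being deleted below handles runner setup inline instead of through the shared setup-linux action: it prints EC2 instance facts from the instance metadata endpoint and logs Docker in to ECR before pulling any image. A condensed sketch of those two steps (endpoint URL and commands are taken verbatim from the steps in this older workflow; nothing new is added):

    # Print basic EC2 facts for debugging via the instance metadata endpoint.
    get_ec2_metadata() {
      curl -fsSL "http://169.254.169.254/latest/meta-data/$1"
    }
    echo "instance-type: $(get_ec2_metadata instance-type)"

    # Log in to the account's ECR registry; the account id is parsed from `aws sts get-caller-identity`.
    AWS_ACCOUNT_ID=$(aws sts get-caller-identity | grep Account | cut -f4 -d\")
    aws ecr get-login-password --region "$AWS_DEFAULT_REGION" \
      | docker login --username AWS --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"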
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_7-cpu - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
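Each job in this older workflow also snapshots the runner's GITHUB_* variables to a per-run file via the "Preserve github env variables for use in docker" step, so they can be handed to containers later. The command, restated with a comment (the file path is exactly the one the step uses; how a later step consumes the file is not shown in this diff):

    # Capture every GITHUB_* variable into a file keyed by the run id, for reuse inside docker.
    env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"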
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_7-cpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_7-cpu-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/conda-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_7-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_7-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_7-cpu-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/conda-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_7-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_7-cuda10_2-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_7-cuda10_2 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_7-cuda10_2-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_7-cuda10_2-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_7-cuda10_2 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_7-cuda10_2-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_7-cuda10_2-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_7-cuda10_2 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
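The "Set DRY_RUN" and "Set UPLOAD_CHANNEL" steps in the upload job above only fire on pushes to the nightly branch or to non-ciflow tags, and refs ending in -rcN are routed to the test channel. Roughly the same decision written as plain shell, assuming workflow-level defaults of DRY_RUN=enabled and UPLOAD_CHANNEL=nightly (an assumption; those defaults are not visible in this excerpt), and with the ref check simplified relative to the workflow's exact expression:

    # REF_NAME stands in for the branch or tag name of the push event.
    REF_NAME="v1.11.0-rc3"
    DRY_RUN=enabled
    UPLOAD_CHANNEL=nightly
    case "${REF_NAME}" in
      nightly|v*) DRY_RUN=disabled ;;   # real uploads only for nightly/tag pushes
    esac
    if [[ "${REF_NAME}" = *-rc[0-9]* ]]; then
      UPLOAD_CHANNEL=test               # release candidates go to the test channel
    fi
    echo "DRY_RUN=${DRY_RUN} UPLOAD_CHANNEL=${UPLOAD_CHANNEL}"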
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_7-cuda11_1-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
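The "Display EC2 information" step queries the instance metadata service at the link-local address 169.254.169.254, which only answers from inside an EC2 instance. A trimmed-down version of that helper, runnable on any EC2 host:

    # Fetch one field from the EC2 instance metadata service.
    get_ec2_metadata() {
      curl -fsSL "http://169.254.169.254/latest/meta-data/$1"
    }
    # These calls fail outside EC2, where the address is unreachable.
    echo "ami-id:        $(get_ec2_metadata ami-id)"
    echo "instance-type: $(get_ec2_metadata instance-type)"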
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_7-cuda11_1 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
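On the CUDA 11.x build jobs above, "Set BUILD_SPLIT_CUDA" appends a NAME=VALUE line to $GITHUB_ENV, which the runner exports into every later step; the bare -e BUILD_SPLIT_CUDA flag then forwards it into the build container. GITHUB_ENV lines are taken literally, so as written the value includes the single quotes (the variable becomes 'ON', not ON), which only matters if a consumer compares it against the exact string ON. A small sketch that simulates the hand-off outside Actions:

    # Simulate GITHUB_ENV with a temporary file.
    GITHUB_ENV=$(mktemp)
    echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV"
    # The runner reads the file as NAME=VALUE lines, verbatim:
    while IFS='=' read -r name value; do
      export "$name=$value"
    done < "$GITHUB_ENV"
    # The quotes survive, because nothing here strips them.
    echo "BUILD_SPLIT_CUDA is: ${BUILD_SPLIT_CUDA}"   # prints: 'ON'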
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_7-cuda11_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_7-cuda11_1-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
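Every job logs in to the account's private ECR registry before pulling images: resolve the AWS account id, ask ECR for a short-lived (12-hour) password, and feed it to docker login on stdin. A minimal standalone version, assuming the AWS CLI is configured and AWS_DEFAULT_REGION is set as it is on these runners; it uses --query in place of the grep/cut parsing in the step above, which yields the same account id:

    # Resolve the numeric account id.
    AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
    # Exchange IAM credentials for an ECR password and log docker in.
    aws ecr get-login-password --region "${AWS_DEFAULT_REGION}" \
      | docker login --username AWS --password-stdin \
          "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com"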
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_7-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
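The GPU test jobs install the NVIDIA driver and nvidia-docker runtime on the runner and record GPU_FLAG=--gpus all in GITHUB_ENV; the later docker run expands ${GPU_FLAG:-} unquoted (hence the shellcheck SC2086 waiver), so on CPU jobs, where the variable is unset, no flag is passed at all. A sketch of that optional-flag idiom, with a placeholder CUDA image:

    # GPU_FLAG is either empty or "--gpus all"; the unquoted expansion lets it
    # disappear entirely when unset instead of becoming an empty argument.
    GPU_FLAG="--gpus all"      # leave unset on CPU-only hosts
    # shellcheck disable=SC2086
    docker run --rm ${GPU_FLAG:-} nvidia/cuda:11.3.1-base-ubuntu20.04 nvidia-smi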
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_7-cuda11_1-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_7-cuda11_1-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_7-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
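The long runs of bare -e NAME flags on the docker run commands (for example -e DESIRED_CUDA, -e PACKAGE_TYPE) copy each variable's current value from the job environment into the container without restating it; -e NAME=VALUE, by contrast, sets an explicit value, as the upload step does with -e PKG_DIR=/artifacts. A small demonstration of the difference, with an example image:

    export DESIRED_CUDA=cu111          # defined in the job environment
    # Bare -e forwards the current value; -e NAME=VALUE overrides it.
    docker run --rm \
      -e DESIRED_CUDA \
      -e PKG_DIR=/artifacts \
      alpine:3.15 \
      sh -c 'echo "DESIRED_CUDA=$DESIRED_CUDA PKG_DIR=$PKG_DIR"'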
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_7-cuda11_3-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_7-cuda11_3 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
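"Preserve github env variables for use in docker" dumps every GITHUB_* variable into /tmp/github_env_<run id>. Nothing in this excerpt reads the file back, but a file in that NAME=VALUE format can be handed to a container in one flag with --env-file. A sketch of producing and consuming such a file; the path and image are illustrative, and this is not necessarily how the repository's own scripts consume it:

    # Write all GITHUB_* variables of the current shell to a file...
    env | grep '^GITHUB' > "/tmp/github_env_example"
    # ...and re-inject them into a container.
    docker run --rm --env-file "/tmp/github_env_example" \
      alpine:3.15 sh -c 'env | grep ^GITHUB | sort'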
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_7-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_7-cuda11_3-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_7-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_7-cuda11_3-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_7-cuda11_3-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_7-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
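Both flavours of "Chown workspace" step exist because files written from inside a container are typically owned by root, and these are persistent self-hosted runners whose next job would otherwise fail to clean the directory; running chown from a throwaway Alpine container avoids needing sudo on the host. The essence of the trick, with a concrete image standing in for ${ALPINE_IMAGE}:

    # Reclaim ownership of everything under the current directory for the
    # invoking (non-root) user, using a disposable container that runs as root.
    docker run --rm -v "$(pwd)":/v -w /v alpine:3.15 \
      chown -R "$(id -u):$(id -g)" .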
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_7-cuda11_5-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_7-cuda11_5 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_7-cuda11_5-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_7-cuda11_5-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_7-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_7-cuda11_5-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_7-cuda11_5-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_7-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_8-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/conda-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_8-cpu - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_8-cpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cpu-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/conda-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_8-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_8-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cpu-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/conda-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_8-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_8-cuda10_2-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
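Several steps in these jobs define the same small retry helper inline before running network-flaky commands (the ECR login and the docker pulls). As a standalone sketch it is simply "try up to three times with a short back-off"; the image name below is only a placeholder.

    # Same retry pattern as the inline helpers in the workflow steps above.
    retry () {
      "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
    }

    # Example usage; "alpine:3.18" is a placeholder image, not one from this workflow.
    retry docker pull "alpine:3.18"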
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_8-cuda10_2 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
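The "Build PyTorch binary" step above uses a detach-then-exec pattern: the builder image is started with --tty --detach so the container stays alive, and the actual work is driven through a series of docker exec calls (populate the env file, then run the conda build script). A stripped-down sketch of the same pattern follows; the image and the exec'd commands are placeholders, not taken from this workflow.

    # Sketch of the detach-then-exec pattern used by the build and test steps.
    IMAGE="alpine:3.18"   # placeholder image

    container_name=$(docker run --tty --detach -v "$(pwd):/work" -w /work "${IMAGE}")

    # Run as many commands as needed inside the same long-lived container...
    docker exec -t "${container_name}" sh -c "echo step one"
    docker exec -t "${container_name}" sh -c "echo step two"

    # ...then tear it down explicitly. The workflow instead relies on the final
    # "Kill containers, clean up images" step to stop whatever is still running.
    docker stop "${container_name}"
    docker rm "${container_name}"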
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_8-cuda10_2-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cuda10_2-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
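Every job here starts and ends with a "Chown workspace" (or "Chown artifacts") step because the build and test containers run as root, so anything they write into the mounted workspace ends up root-owned on the self-hosted runner. Chowning it back from a throwaway Alpine container keeps the next job on the same machine from hitting permission errors. A minimal sketch of that trick, with a placeholder image:

    # Reset ownership of files written by a root container, as the
    # "Chown workspace" steps do. The image here is a placeholder for ALPINE_IMAGE.
    ALPINE_IMAGE="alpine:3.18"

    # $(id -u) and $(id -g) expand on the host, so the container chowns back to the host user.
    docker run --rm \
      -v "$(pwd)":/v \
      -w /v \
      "${ALPINE_IMAGE}" \
      chown -R "$(id -u):$(id -g)" .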
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_8-cuda10_2 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
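In the GPU test jobs, the "Install nvidia driver, nvidia-docker runtime, set GPU_FLAG" step writes GPU_FLAG=--gpus all into $GITHUB_ENV, and the test step splices it into docker run as an unquoted ${GPU_FLAG:-} so that on CPU-only runners, where the variable is never set, it expands to nothing. That unquoted expansion is also why the step carries the shellcheck SC2086/SC2090 suppressions. A small sketch of the same conditional flag, using a placeholder image and command:

    # GPU_FLAG is only set on GPU runners; ${GPU_FLAG:-} expands to nothing otherwise.
    # Left intentionally unquoted so "--gpus all" splits into two docker arguments.
    GPU_FLAG="${GPU_FLAG:-}"
    # shellcheck disable=SC2086
    docker run --rm ${GPU_FLAG} alpine:3.18 echo "container started"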
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_8-cuda10_2-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cuda10_2-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_8-cuda10_2 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
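In the "Upload binaries" step above, the credentials are exported through the step's env: block and then forwarded into the upload container with bare -e NAME flags. Docker treats -e NAME (with no =value) as "copy this variable from the calling environment", so the tokens reach binary_upload.sh without ever appearing on the command line. A tiny illustration of that forwarding behaviour; the variable name, value, and image below are placeholders.

    # "-e NAME" with no value copies NAME from the current environment into the container,
    # which is how ANACONDA_API_TOKEN and the AWS keys reach the upload script above.
    export EXAMPLE_TOKEN="placeholder-value"   # placeholder, not a real secret
    docker run --rm -e EXAMPLE_TOKEN alpine:3.18 sh -c 'echo "token is: ${EXAMPLE_TOKEN}"'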
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_8-cuda11_1-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_8-cuda11_1 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
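One subtlety in the "Set BUILD_SPLIT_CUDA" step above: lines appended to $GITHUB_ENV are parsed as literal name=value pairs, so the single quotes inside BUILD_SPLIT_CUDA='ON' become part of the value that later steps see ('ON', quotes included). Downstream scripts that only check whether the variable is non-empty would be unaffected, but an exact string comparison against ON would not match. The snippet below is a local stand-in for that parsing (the temp file is hypothetical, standing in for $GITHUB_ENV), to make the behaviour visible.

    # $GITHUB_ENV-style parsing keeps everything after '=' verbatim.
    printf '%s\n' "BUILD_SPLIT_CUDA='ON'" > /tmp/github_env_example   # hypothetical stand-in for $GITHUB_ENV
    while IFS='=' read -r name value; do
      printf 'name=%s value=%s\n' "$name" "$value"   # prints value='ON' (quotes included)
    done < /tmp/github_env_example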
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_8-cuda11_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cuda11_1-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_8-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_8-cuda11_1-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cuda11_1-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_8-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_8-cuda11_3-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_8-cuda11_3 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_8-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cuda11_3-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_8-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_8-cuda11_3-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cuda11_3-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_8-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_8-cuda11_5-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_8-cuda11_5 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_8-cuda11_5-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cuda11_5-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_8-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_8-cuda11_5-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cuda11_5-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_8-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_9-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/conda-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_9-cpu - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_9-cpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cpu-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/conda-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
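The "Kill containers, clean up images" step that closes each job here is the usual hygiene idiom for self-hosted runners: stop anything still running, then prune so the next job starts from a clean Docker state. The same two commands as a standalone sketch, with the reason for each guard spelled out:

    #!/usr/bin/env bash

    # "docker ps -q" may print nothing; the expansion is deliberately
    # left unquoted so an empty result disappears, and "|| true" keeps
    # an argument-less "docker stop" from failing the step (hence the
    # shellcheck SC2046 suppression in the workflow).
    # shellcheck disable=SC2046
    docker stop $(docker ps -q) || true

    # Remove stopped containers, unused images, networks and build
    # cache so the runner does not accumulate state between jobs.
    docker system prune -af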
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_9-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
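The "Test PyTorch binary" step above splits testing into two phases inside the same detached container: one exec sources the populated environment and generates a test script at the path named by OUTPUT_SCRIPT (/run.sh), and a second exec runs that script under bash -x so every command is echoed into the log. A minimal sketch of that handoff, assuming a running container ID in container_name and using a hypothetical generate_tests.sh in place of the .circleci generator script:

    #!/usr/bin/env bash
    set -euxo pipefail

    # Assumes a detached container is already running, e.g. started with
    # "docker run --detach" as in the build step.
    container_name=${container_name:?id of a running container}

    # Phase 1: the generator writes the real test commands to the path
    # given by OUTPUT_SCRIPT. generate_tests.sh is a placeholder name.
    docker exec -t -e OUTPUT_SCRIPT="/run.sh" "${container_name}" \
      bash -c "bash ./generate_tests.sh"

    # Phase 2: run the generated script with tracing so failures are
    # easy to attribute to a specific test command.
    docker exec -t "${container_name}" bash -c "bash -x /run.sh"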
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_9-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cpu-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/conda-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
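The repeated "Chown workspace" and "Chown artifacts" steps deal with a standard self-hosted-runner problem: files written by the build containers are owned by root, so later checkouts or cleanup on the host would fail. Running a throwaway container as root with the directory bind-mounted and chowning to the invoking user's uid/gid hands ownership back without needing sudo on the runner. A standalone sketch of the trick (alpine:3 is illustrative; the workflow uses its own ${ALPINE_IMAGE}):

    #!/usr/bin/env bash
    set -euo pipefail

    # Reset ownership of the current directory to the calling user by
    # performing the chown inside a container that runs as root.
    docker run --rm \
      -v "$(pwd)":/workspace \
      -w /workspace \
      alpine:3 \
      chown -R "$(id -u):$(id -g)" .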
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_9-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
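The "Set DRY_RUN" and "Set UPLOAD_CHANNEL" steps above gate the real upload on what triggered the run: only pushes to the nightly branch or to ordinary (non-ciflow) tags disable the dry run, and tags whose name ends in an -rcN suffix are routed to the test channel instead of nightly. An approximate plain-shell rendering of that gating, for reading convenience; the workflow expresses the first half as an if: expression on the step, and the defaults below are illustrative.

    #!/usr/bin/env bash
    set -euo pipefail

    DRY_RUN=enabled          # default: do not actually publish
    UPLOAD_CHANNEL=nightly   # default channel

    ref="${GITHUB_REF:-}"    # e.g. refs/heads/nightly or refs/tags/v1.11.0-rc3

    # Real uploads only for pushes to nightly or to non-ciflow tags.
    if [[ "${GITHUB_EVENT_NAME:-}" == "push" ]]; then
      if [[ "$ref" == "refs/heads/nightly" ]] ||
         { [[ "$ref" == refs/tags/* ]] && [[ "$ref" != refs/tags/ciflow/* ]]; }; then
        DRY_RUN=disabled
      fi
    fi

    # Release-candidate tags (names matching *-rc[0-9]*) go to "test".
    if [[ "${GITHUB_REF_NAME:-}" == *-rc[0-9]* ]]; then
      UPLOAD_CHANNEL=test
    fi

    echo "DRY_RUN=${DRY_RUN} UPLOAD_CHANNEL=${UPLOAD_CHANNEL}"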
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_9-cuda10_2-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
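Each ECR login here derives the AWS account ID by grepping the JSON from aws sts get-caller-identity and cutting on double quotes, which works but depends on the CLI's output formatting. A hedged alternative sketch, not what the generated workflow does: let the CLI extract the field itself via --query/--output, then reuse the same get-login-password flow.

    #!/usr/bin/env bash
    set -euo pipefail

    # JMESPath query plus text output returns the bare account ID,
    # with no JSON parsing on our side.
    AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)

    # Same login flow as the workflow step, against the derived registry.
    aws ecr get-login-password --region "${AWS_DEFAULT_REGION}" |
      docker login --username AWS --password-stdin \
        "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com"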
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_9-cuda10_2 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_9-cuda10_2-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cuda10_2-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_9-cuda10_2 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
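Relative to the CPU test jobs, the CUDA test jobs add one step: install the NVIDIA driver and nvidia-docker runtime, then export GPU_FLAG=--gpus all through GITHUB_ENV so later steps hand GPUs to the container. The flag is expanded unquoted as ${GPU_FLAG:-} on purpose: when set it splits into the two arguments docker expects, and when unset it disappears entirely, which is why the test step carries a shellcheck SC2086 suppression. A small sketch of just that expansion behaviour, with echo standing in for the real docker run:

    #!/usr/bin/env bash

    # Unset: the unquoted ${GPU_FLAG:-} expands to nothing at all.
    unset GPU_FLAG
    # shellcheck disable=SC2086
    echo docker run ${GPU_FLAG:-} --rm example-image
    #   -> docker run --rm example-image

    # Set, as the CUDA test jobs do via GITHUB_ENV: it splits into
    # "--gpus" and "all", exactly the two arguments docker expects.
    GPU_FLAG="--gpus all"
    # shellcheck disable=SC2086
    echo docker run ${GPU_FLAG:-} --rm example-image
    #   -> docker run --gpus all --rm example-image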
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_9-cuda10_2-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cuda10_2-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_9-cuda10_2 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
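Every job above snapshots its GITHUB_* variables into /tmp/github_env_${GITHUB_RUN_ID} in the "Preserve github env variables for use in docker" step. The file ends up in NAME=value form, one pair per line, which happens to be the format docker run accepts via --env-file; the consumption side is not shown in these jobs, so the second half of this sketch is an assumption for illustration only.

    #!/usr/bin/env bash
    set -euo pipefail

    # Intended to run inside a GitHub Actions job, where GITHUB_*
    # variables are guaranteed to exist.
    env_file="/tmp/github_env_${GITHUB_RUN_ID:-local}"

    # Snapshot every GITHUB_* variable, one NAME=value pair per line.
    env | grep '^GITHUB' > "${env_file}"

    # Illustrative only (not from the workflow): --env-file reads exactly
    # this NAME=value format, so the snapshot can be replayed into a
    # container started later.
    docker run --rm --env-file "${env_file}" alpine:3 env | grep '^GITHUB'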
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_9-cuda11_1-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_9-cuda11_1 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
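The CUDA 11.x build jobs add a "Set BUILD_SPLIT_CUDA" step that appends to GITHUB_ENV so the flag reaches the containerised build. GITHUB_ENV is parsed as literal NAME=value lines, so as written the exported value is 'ON' including the single quotes; the downstream builder scripts presumably tolerate that, but the mechanics are easy to misread. A tiny sketch of the GITHUB_ENV behaviour, using a scratch file in place of the runner-provided one:

    #!/usr/bin/env bash
    set -euo pipefail

    # Stand-in for the $GITHUB_ENV file the runner normally provides.
    GITHUB_ENV=$(mktemp)

    # As in the workflow: the single quotes sit inside the double quotes,
    # so they are written verbatim and become part of the value.
    echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV"
    cat "$GITHUB_ENV"    # BUILD_SPLIT_CUDA='ON'  (quotes included)

    # A quote-free value would instead be written as:
    echo "BUILD_SPLIT_CUDA=ON" >> "$GITHUB_ENV"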
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_9-cuda11_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cuda11_1-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_9-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_9-cuda11_1-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cuda11_1-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_9-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_9-cuda11_3-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_9-cuda11_3 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_9-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cuda11_3-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_9-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_9-cuda11_3-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cuda11_3-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_9-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_9-cuda11_5-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_9-cuda11_5 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_9-cuda11_5-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cuda11_5-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_9-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_9-cuda11_5-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cuda11_5-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
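Each job begins with the same "Display EC2 information" step; the helper it defines simply reads categories from the EC2 instance metadata service. Isolated here for reference, with an illustrative call.

    # Query the EC2 instance metadata endpoint for one category (IMDSv1 style),
    # as defined inline in the "Display EC2 information" steps.
    get_ec2_metadata() {
      category=$1
      curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
    }

    get_ec2_metadata instance-type   # e.g. prints the runner's EC2 instance type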
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_9-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
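The two "Set ..." steps above gate the upload: DRY_RUN is flipped to "disabled" only for pushes to the nightly branch or to non-ciflow tags, and UPLOAD_CHANNEL is switched to "test" only for release-candidate tags. Below is a consolidated bash restatement of those `if:` conditions, assuming DRY_RUN defaults to enabled elsewhere in the workflow; the workflow itself expresses the first part as GitHub Actions expressions.

    # Bash restatement of the upload gating used by the upload jobs in this file.
    if [[ "${GITHUB_EVENT_NAME}" == "push" ]]; then
      if [[ "${GITHUB_REF}" == "refs/heads/nightly" ]] ||
         { [[ "${GITHUB_REF}" == refs/tags/* ]] && [[ "${GITHUB_REF}" != refs/tags/ciflow/* ]]; }; then
        echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
      fi
      if [[ "${GITHUB_REF}" == refs/tags/* ]] && [[ "${GITHUB_REF}" != refs/tags/ciflow/* ]] &&
         [[ "${GITHUB_REF_NAME}" == *-rc[0-9]* ]]; then
        echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV"   # RC builds go to the test channel
      fi
    fi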
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_10-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/conda-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_10-cpu - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
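The "Chown artifacts" and "Chown workspace" steps above exist because the build runs as root inside the builder container, so files it writes on the self-hosted runner end up root-owned; an Alpine container is then used to hand ownership back to the runner user so later checkouts and cleanup do not fail. The pattern, isolated (ALPINE_IMAGE is presumably defined at the workflow level, outside this hunk):

    # Reset ownership of container-written files to the runner user.
    # "$(id -u):$(id -g)" is expanded on the host, so the chown running as root
    # inside the Alpine container targets the host user's uid/gid.
    docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .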
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_10-cpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cpu-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/conda-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_10-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_10-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cpu-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/conda-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_10-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_10-cuda10_2-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_10-cuda10_2 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_10-cuda10_2-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cuda10_2-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_10-cuda10_2 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_10-cuda10_2-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cuda10_2-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_10-cuda10_2 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_10-cuda11_1-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_10-cuda11_1 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
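Unlike the cpu and cu102 builds earlier in this file, the cu111 build above (and the cu113/cu115 builds below) adds a "Set BUILD_SPLIT_CUDA" step before building. A short note on what that step actually exports; the assumption here is that the consuming builder scripts only test the variable for non-emptiness.

    # As written, the value lands in $GITHUB_ENV with the single quotes included,
    # i.e. later steps see BUILD_SPLIT_CUDA="'ON'" rather than BUILD_SPLIT_CUDA="ON".
    # Assumption: any non-empty value enables the split-CUDA build.
    echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV"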
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_10-cuda11_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cuda11_1-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_10-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_10-cuda11_1-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cuda11_1-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_10-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_10-cuda11_3-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_10-cuda11_3 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_10-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cuda11_3-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_10-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_10-cuda11_3-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cuda11_3-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_10-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_10-cuda11_5-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_10-cuda11_5 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_10-cuda11_5-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cuda11_5-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_10-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_10-cuda11_5-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cuda11_5-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_10-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-master.yml b/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-master.yml new file mode 100644 index 000000000000..3fa24203231b --- /dev/null +++ b/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-master.yml @@ -0,0 +1,283 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/linux_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: linux-binary-libtorch-cxx11-abi + +on: + push: + branches: + - master + tags: + - 'ciflow/all/*' + - 'ciflow/trunk/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BINARY_ENV_FILE: /tmp/env + BUILD_ENVIRONMENT: linux-binary-libtorch-cxx11-abi + BUILDER_ROOT: /builder + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_FINAL_PACKAGE_DIR: /artifacts + PYTORCH_RETRY_TEST_CASES: 1 + PYTORCH_ROOT: /pytorch + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 1 +concurrency: + group: linux-binary-libtorch-cxx11-abi-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + libtorch-cpu-shared-with-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cpu-shared-with-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-shared-with-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-with-deps-cxx11-abi-build + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c 
"source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml new file mode 100644 index 000000000000..096fd2617423 --- /dev/null +++ b/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml @@ -0,0 +1,7042 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/linux_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: linux-binary-libtorch-cxx11-abi + +on: + push: + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + branches: + - nightly + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/binaries/*' + - 'ciflow/binaries_libtorch/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BINARY_ENV_FILE: /tmp/env + BUILD_ENVIRONMENT: linux-binary-libtorch-cxx11-abi + BUILDER_ROOT: /builder + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_FINAL_PACKAGE_DIR: /artifacts + PYTORCH_RETRY_TEST_CASES: 1 + PYTORCH_ROOT: /pytorch + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 1 +concurrency: + group: linux-binary-libtorch-cxx11-abi-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + libtorch-cpu-shared-with-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cpu-shared-with-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-shared-with-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-with-deps-cxx11-abi-build + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c 
"source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-shared-with-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-with-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v 
"${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-shared-without-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" 
"${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cpu-shared-without-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-shared-without-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-without-deps-cxx11-abi-build + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-shared-without-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-without-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm 
-v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-with-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cpu-static-with-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-with-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-with-deps-cxx11-abi-build + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-with-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-with-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v 
"$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-without-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4
+ with:
+ name: libtorch-cpu-static-without-deps-cxx11-abi
+ retention-days: 14
+ if-no-files-found: error
+ path:
+ ${{ runner.temp }}/artifacts/*
+ - name: Hold runner for 2 hours or until ssh sessions have drained
+ working-directory: pytorch/
+ # Always hold for active ssh sessions
+ if: always()
+ run: .github/scripts/wait_for_ssh_to_drain.sh
+ - name: Chown workspace
+ if: always()
+ run: |
+ # Ensure the working directory gets chowned back to the current user
+ docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+ - name: Kill containers, clean up images
+ if: always()
+ run: |
+ # ignore expansion of "docker ps -q" since it could be empty
+ # shellcheck disable=SC2046
+ docker stop $(docker ps -q) || true
+ # Prune all of the docker images
+ docker system prune -af
+ libtorch-cpu-static-without-deps-cxx11-abi-test: # Testing
+ if: ${{ github.repository_owner == 'pytorch' }}
+ needs: libtorch-cpu-static-without-deps-cxx11-abi-build
+ runs-on: linux.4xlarge
+ timeout-minutes: 240
+ env:
+ PACKAGE_TYPE: libtorch
+ # TODO: This is a legacy variable that we eventually want to get rid of in
+ # favor of GPU_ARCH_VERSION
+ DESIRED_CUDA: cpu
+ GPU_ARCH_TYPE: cpu
+ DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu
+ SKIP_ALL_TESTS: 1
+ LIBTORCH_VARIANT: static-without-deps
+ DESIRED_DEVTOOLSET: cxx11-abi
+ steps:
+ - name: Checkout PyTorch
+ uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+ - name: Setup Linux
+ uses: ./.github/actions/setup-linux
+ - name: Chown workspace
+ run: |
+ retry () {
+ "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
+ }
+ retry docker pull "${ALPINE_IMAGE}"
+ # Ensure the working directory gets chowned back to the current user
+ docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
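Annotation: the docker pulls in the "Chown workspace" and "Pull Docker image" steps above are wrapped in a tiny retry function that simply re-runs the command after 1 s and then 2 s. The same helper, pulled out on its own so it is easier to see (and to extend with more attempts if needed):

    # Same retry helper as in the workflow steps above: up to three attempts
    # with short, fixed back-off delays.
    retry () {
      "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
    }
    # Example usage; in the workflow the wrapped command is `docker pull "${DOCKER_IMAGE}"`.
    retry echo "flaky command goes here"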
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-without-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-without-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm 
-v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-shared-with-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda10_2-shared-with-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-shared-with-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda10_2-shared-with-deps-cxx11-abi-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda10_2-shared-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
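Annotation: the GPU test jobs above first install the NVIDIA driver and nvidia-docker runtime and then export GPU_FLAG=--gpus all into GITHUB_ENV; CPU jobs never set it, and the docker run in the test step expands it as ${GPU_FLAG:-}, so the flag silently disappears when unset. A small sketch of that optional-flag pattern, with a driver check standing in for the real install step:

    # Optional-flag pattern used by the test steps above: GPU_FLAG is only set
    # on GPU runners, and an unquoted ${GPU_FLAG:-} expands to nothing when unset.
    if command -v nvidia-smi >/dev/null 2>&1; then   # stand-in for the driver-install step
      GPU_FLAG="--gpus all"
    fi
    # shellcheck disable=SC2086  # the flag must stay unquoted so an empty value vanishes
    docker run --rm ${GPU_FLAG:-} alpine:3.15 echo "GPUs are exposed only when GPU_FLAG was set"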
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-shared-with-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda10_2-shared-with-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda10_2-shared-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to 
the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-shared-without-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id 
-g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda10_2-shared-without-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-shared-without-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda10_2-shared-without-deps-cxx11-abi-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda10_2-shared-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
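Annotation: the build and test commands run as root inside the builder container, so anything they write into the host-mounted workspace or artifacts directory ends up root-owned on the runner. The recurring "Chown workspace" and "Chown artifacts" steps fix that by mounting the same directory into a throwaway Alpine container and chowning it back to the runner user. The pattern in isolation, using a plain Docker Hub alpine tag in place of the workflow's ALPINE_IMAGE:

    # Re-own files that a root container wrote into a host-mounted directory.
    ALPINE_IMAGE="alpine:3.15"   # the workflow uses its own mirrored image here
    WORKDIR="$(pwd)"
    docker run --rm -v "${WORKDIR}:/v" -w /v "${ALPINE_IMAGE}" \
      chown -R "$(id -u):$(id -g)" .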
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-shared-without-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda10_2-shared-without-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda10_2-shared-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets 
chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-static-with-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id 
-u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda10_2-static-with-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-static-with-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda10_2-static-with-deps-cxx11-abi-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda10_2-static-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-static-with-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda10_2-static-with-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda10_2-static-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to 
the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-static-without-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id 
-g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda10_2-static-without-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-static-without-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda10_2-static-without-deps-cxx11-abi-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda10_2-static-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-static-without-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda10_2-static-without-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda10_2-static-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets 
chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-shared-with-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + 
docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda11_3-shared-with-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-shared-with-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-shared-with-deps-cxx11-abi-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
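Each libtorch configuration is wired as a build job that pushes its finished packages to S3 with seemethere/upload-artifact-s3, a test job that declares needs: on it and downloads the same artifact name, and an upload job gated on the test job. A stripped-down sketch of that hand-off with a hypothetical artifact name (libtorch-example); the action inputs are the ones used in the jobs above, everything else is illustrative.

name: build-test-chain-sketch
on: workflow_dispatch
jobs:
  example-build:
    runs-on: ubuntu-latest                  # assumption: the real build jobs run on linux.4xlarge
    steps:
      - name: Produce a package
        run: |
          mkdir -p "${RUNNER_TEMP}/artifacts"
          echo "placeholder package" > "${RUNNER_TEMP}/artifacts/libtorch-example.zip"
      - uses: seemethere/upload-artifact-s3@v4
        with:
          name: libtorch-example              # hypothetical artifact name
          retention-days: 14
          if-no-files-found: error
          path: ${{ runner.temp }}/artifacts/*
  example-test:
    needs: example-build                      # only runs once the build artifact exists
    runs-on: ubuntu-latest
    steps:
      - uses: seemethere/download-artifact-s3@v3
        name: Download Build Artifacts
        with:
          name: libtorch-example
          path: "${{ runner.temp }}/artifacts/"
      - name: Inspect the downloaded package
        run: ls -l "${RUNNER_TEMP}/artifacts/"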
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-shared-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
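The GPU test jobs first install the NVIDIA driver and container runtime (retried up to three times), export GPU_FLAG=--gpus all through $GITHUB_ENV, and later splice it into docker run as ${GPU_FLAG:-}, which expands to nothing if the flag was never set. A small sketch of just that hand-off between steps; the echo stands in for install_nvidia_utils_linux.sh and no container is actually started.

name: gpu-flag-sketch
on: workflow_dispatch
jobs:
  gpu-flag:
    runs-on: ubuntu-latest               # assumption: the real jobs run on linux.4xlarge.nvidia.gpu
    steps:
      - name: Install driver and export GPU_FLAG (simulated)
        run: |
          echo "pretend install_nvidia_utils_linux.sh ran here"
          # Variables appended to $GITHUB_ENV become environment variables in all later steps
          echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
      - name: Build a docker command line that works with or without the flag
        run: |
          # ${GPU_FLAG:-} expands to an empty string when unset, so the same
          # command line is also valid on CPU-only runners
          echo "would run: docker run --rm ${GPU_FLAG:-} <image> nvidia-smi"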
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-shared-with-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-shared-with-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-shared-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to 
the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-shared-without-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run 
--rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda11_3-shared-without-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-shared-without-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-shared-without-deps-cxx11-abi-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
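Every build and test job re-checks out pytorch and pytorch/builder with a checkout action pinned to a full commit SHA, picks the pytorch ref with github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha (the PR head commit on pull requests, the pushed commit otherwise), and then runs git clean -fxd to scrub the reused workspace. A minimal sketch of that checkout shape; the expression and inputs are copied from the steps above, only the surrounding job is invented.

name: pinned-checkout-sketch
on: [push, pull_request]
jobs:
  checkout-shape:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout PyTorch
        # Pinned to a commit SHA rather than a movable tag
        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
        with:
          # PR runs test the PR head commit; push runs test the pushed commit
          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
          submodules: recursive
          path: pytorch
      - name: Clean PyTorch checkout
        run: |
          # Self-hosted workspaces persist between runs; drop anything untracked
          git clean -fxd
        working-directory: pytorch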
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-shared-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
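The retry helper that precedes every docker pull is simply three attempts with short back-off sleeps; the second and third tries only run if the previous one failed. A standalone sketch of the same function with a deliberately flaky command (the flaky function is made up for the demonstration) so the behaviour is visible.

name: retry-helper-sketch
on: workflow_dispatch
jobs:
  retry-demo:
    runs-on: ubuntu-latest
    steps:
      - name: Retry a flaky command up to three times
        run: |
          retry () {
            # first try, then again after 1s, then a final try after 2s
            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
          }
          flaky () {
            # fails on the first call, succeeds once the marker file exists
            if [ -f /tmp/retry-marker ]; then
              echo "second attempt succeeded"
            else
              touch /tmp/retry-marker
              echo "first attempt failing" >&2
              return 1
            fi
          }
          retry flaky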
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-shared-without-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-shared-without-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-shared-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets 
chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-static-with-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + 
docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda11_3-static-with-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-static-with-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-static-with-deps-cxx11-abi-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
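The CUDA 11.3 build jobs above switch on split-CUDA builds by appending a line to $GITHUB_ENV before the container starts. That file is parsed as literal NAME=value pairs rather than through a shell, so echo "BUILD_SPLIT_CUDA='ON'" gives later steps the value 'ON' with the quotes included, which is presumably fine as long as the builder scripts only check that the variable is non-empty. A small sketch of the mechanism (EXAMPLE_FLAG is a made-up name added for contrast).

name: github-env-sketch
on: workflow_dispatch
jobs:
  env-file:
    runs-on: ubuntu-latest
    steps:
      - name: Export variables for later steps
        run: |
          # $GITHUB_ENV lines are taken verbatim; quotes are not stripped
          echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV"
          echo "EXAMPLE_FLAG=ON" >> "$GITHUB_ENV"
      - name: Show what later steps receive
        run: |
          printf 'BUILD_SPLIT_CUDA=[%s]\n' "$BUILD_SPLIT_CUDA"   # prints ['ON'], quotes included
          printf 'EXAMPLE_FLAG=[%s]\n' "$EXAMPLE_FLAG"           # prints [ON]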
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-static-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
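The recurring Chown workspace and Chown artifacts steps exist because the build and test containers typically run as root against bind-mounted host directories, which can leave root-owned files on the self-hosted runner; a throwaway Alpine container then chowns everything back to the runner's uid and gid so later cleanup cannot fail on permissions. A condensed sketch of the problem and the fix; alpine:3.16 stands in for ${ALPINE_IMAGE}.

name: chown-workspace-sketch
on: workflow_dispatch
jobs:
  chown-cleanup:
    runs-on: ubuntu-latest          # assumption: this mainly matters on persistent self-hosted runners
    steps:
      - name: Create a root-owned file from a container
        run: |
          mkdir -p work
          docker run --rm -v "$(pwd)/work:/w" -w /w alpine:3.16 touch root-owned.txt
          ls -l work/root-owned.txt
      - name: Chown workspace
        if: always()
        run: |
          # Hand ownership back to the runner user so a later rm -rf succeeds
          docker run --rm -v "$(pwd)":/v -w /v alpine:3.16 chown -R "$(id -u):$(id -g)" .
          ls -l work/root-owned.txt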
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-static-with-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-static-with-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-static-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to 
the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-static-without-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run 
--rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda11_3-static-without-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-static-without-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-static-without-deps-cxx11-abi-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
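The upload jobs above only set DRY_RUN to "disabled" (that is, actually publish) on pushes to the nightly branch or to non-ciflow tags, and additionally send release-candidate tags (names matching *-rc[0-9]*) to the test channel via UPLOAD_CHANNEL. A reduced sketch of those two gates using the same if: expressions; the last step just reports the decision instead of calling binary_upload.sh, and the unset defaults noted in the comments are assumptions.

name: upload-gating-sketch
on:
  push:
    branches: [nightly]
    tags: ['*']
jobs:
  gate-upload:
    runs-on: ubuntu-latest
    steps:
      - name: Set DRY_RUN (only for tagged pushes)
        if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
        run: echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
      - name: Set UPLOAD_CHANNEL (only for RC tags)
        if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
        run: |
          # reference ends with an RC suffix
          if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
            echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV"
          fi
      - name: Report the decision
        run: |
          echo "DRY_RUN=${DRY_RUN:-enabled}"                 # assumption: unset means the upload stays a dry run
          echo "UPLOAD_CHANNEL=${UPLOAD_CHANNEL:-nightly}"   # assumption: nightly is the default channel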
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-static-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
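Because these runners are reused, every job ends with the always() cleanup shown next: stop whatever containers are still running and prune images so disk usage does not grow across runs. The same step in isolation, with comments on why the expansion is left unquoted and what prune removes.

name: docker-cleanup-sketch
on: workflow_dispatch
jobs:
  cleanup:
    runs-on: ubuntu-latest
    steps:
      - name: Kill containers, clean up images
        if: always()                 # run even when earlier steps failed
        run: |
          # "docker ps -q" may print nothing, so the unquoted expansion is intentional
          # shellcheck disable=SC2046
          docker stop $(docker ps -q) || true
          # Remove stopped containers, unused images and networks, and the build cache
          docker system prune -af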
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-static-without-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-static-without-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-static-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets 
chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-shared-with-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + 
docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda11_6-shared-with-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-shared-with-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-shared-with-deps-cxx11-abi-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
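The publish step in the upload jobs runs binary_upload.sh inside a pinned miniconda image, passing the AWS and Anaconda credentials through as environment variables and bind-mounting the artifact directory at /artifacts. A trimmed sketch of that invocation shape; alpine:3.16 and the echo stand in for the real 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 image and upload script, so no credentials are needed.

name: upload-invocation-sketch
on: workflow_dispatch
jobs:
  upload-shape:
    runs-on: ubuntu-latest
    steps:
      - name: Upload binaries (shape only)
        env:
          PKG_DIR: "${{ runner.temp }}/artifacts"
          DRY_RUN: enabled
          # The real jobs also pass AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and
          # ANACONDA_API_TOKEN from secrets; those are deliberately blank on pull_request events
        run: |
          mkdir -p "${RUNNER_TEMP}/artifacts"
          docker run --rm -i \
            -e DRY_RUN \
            -e PKG_DIR=/artifacts \
            -v "${RUNNER_TEMP}/artifacts:/artifacts" \
            -v "${GITHUB_WORKSPACE}:/v" \
            -w /v \
            alpine:3.16 \
            sh -c 'echo "would run .circleci/scripts/binary_upload.sh with PKG_DIR=$PKG_DIR DRY_RUN=$DRY_RUN"'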
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-shared-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
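Each job also installs maintainers' SSH keys up front ("[FB EMPLOYEES] Enable SSH") and, at the end, holds the runner "for 2 hours or until ssh sessions have drained" so anyone attached to a failing run is not cut off. The wait script itself is not part of this diff; the following is a rough, hypothetical sketch of what such a drain loop could look like, inferred only from the step name and not from the real wait_for_ssh_to_drain.sh.

name: ssh-drain-sketch
on: workflow_dispatch
jobs:
  hold-runner:
    runs-on: ubuntu-latest
    steps:
      - name: Hold runner until ssh sessions have drained (illustrative)
        if: always()
        run: |
          # Hypothetical stand-in for .github/scripts/wait_for_ssh_to_drain.sh:
          # poll for established ssh sessions, give up after roughly 2 hours
          deadline=$(( $(date +%s) + 2*60*60 ))
          while [ "$(date +%s)" -lt "${deadline}" ]; do
            sessions=$(ss -Hn state established '( sport = :22 )' | wc -l)
            if [ "${sessions}" -eq 0 ]; then
              echo "no active ssh sessions, releasing the runner"
              break
            fi
            echo "waiting on ${sessions} ssh session(s)"
            sleep 30
          done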
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-shared-with-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-shared-with-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-shared-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to 
the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-shared-without-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run 
--rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda11_6-shared-without-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-shared-without-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-shared-without-deps-cxx11-abi-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-shared-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-shared-without-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-shared-without-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-shared-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets 
chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-static-with-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + 
docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda11_6-static-with-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-static-with-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-static-with-deps-cxx11-abi-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-static-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-static-with-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-static-with-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-static-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to 
the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-static-without-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run 
--rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda11_6-static-without-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-static-without-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-static-without-deps-cxx11-abi-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-static-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-static-without-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-static-without-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-static-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets 
chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_0-shared-with-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id 
-u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-rocm5_0-shared-with-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_0-shared-with-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-rocm5_0-shared-with-deps-cxx11-abi-build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: always() + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-rocm5_0-shared-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Pull 
Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_0-shared-with-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-rocm5_0-shared-with-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
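+ # Note: the ROCm -test jobs in this workflow, such as the one above, run on linux.rocm.gpu
+ # runners; they point DOCKER_HOST at a per-user Docker socket, health-check the runner with
+ # rocm-smi/rocminfo (expecting 2 or 4 gfx GPUs and killing runsvc.sh on failure so the unhealthy
+ # runner disconnects), and expose the GPUs through the explicit --device/--group-add flags in
+ # GPU_FLAG rather than the NVIDIA-style `--gpus all` flag used by the CUDA test jobs.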
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-rocm5_0-shared-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
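+ # Note: in the -upload jobs the DRY_RUN=disabled override is applied only for pushes to
+ # refs/heads/nightly or to non-ciflow tags, and UPLOAD_CHANNEL is switched to "test" only when
+ # the tag name matches *-rc[0-9]* (for example, a hypothetical release-candidate tag ending in
+ # -rc3). The upload itself is delegated to .circleci/scripts/binary_upload.sh, run inside the
+ # pinned miniconda3 ECR image with the AWS and Anaconda credentials passed through as
+ # environment variables; on pull_request events those secrets are expected to be blank.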
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_0-static-with-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
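+ # Note: in this build job the detached builder container first runs binary_populate_env.sh,
+ # then sources ${BINARY_ENV_FILE} and runs /builder/libtorch/build.sh, which leaves the built
+ # libtorch package under ${RUNNER_TEMP}/artifacts (mounted as /artifacts); the "Chown artifacts"
+ # step above hands ownership back to the runner user so the next step can upload that directory
+ # to S3 under the job's artifact name, with a 14-day retention and if-no-files-found: error
+ # guarding against an empty build output.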
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-rocm5_0-static-with-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_0-static-with-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-rocm5_0-static-with-deps-cxx11-abi-build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: always() + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-rocm5_0-static-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Pull Docker image + run: | 
+ retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_0-static-with-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-rocm5_0-static-with-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-rocm5_0-static-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_1_1-shared-with-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-rocm5_1_1-shared-with-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_1_1-shared-with-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-rocm5_1_1-shared-with-deps-cxx11-abi-build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: always() + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-rocm5_1_1-shared-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Pull Docker 
image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_1_1-shared-with-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-rocm5_1_1-shared-with-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-rocm5_1_1-shared-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_1_1-static-with-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-rocm5_1_1-static-with-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_1_1-static-with-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-rocm5_1_1-static-with-deps-cxx11-abi-build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: always() + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-rocm5_1_1-static-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Pull Docker 
image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_1_1-static-with-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-rocm5_1_1-static-with-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-rocm5_1_1-static-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-linux-binary-libtorch-cxx11-abi.yml b/.github/workflows/generated-linux-binary-libtorch-cxx11-abi.yml deleted file mode 100644 index 6cfdc08cd046..000000000000 --- a/.github/workflows/generated-linux-binary-libtorch-cxx11-abi.yml +++ /dev/null @@ -1,8046 +0,0 @@ -# @generated DO NOT EDIT MANUALLY - -# Template is at: .github/templates/linux_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-binary-libtorch-cxx11-abi - -on: - push: - # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build - branches: - - nightly - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - - 'ciflow/binaries/*' - - 'ciflow/binaries_libtorch/*' - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - ANACONDA_USER: pytorch - AWS_DEFAULT_REGION: us-east-1 - BINARY_ENV_FILE: /tmp/env - BUILD_ENVIRONMENT: linux-binary-libtorch-cxx11-abi - BUILDER_ROOT: /builder - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - IN_CI: 1 - IS_GHA: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - PR_NUMBER: ${{ github.event.pull_request.number }} - PYTORCH_FINAL_PACKAGE_DIR: /artifacts - PYTORCH_RETRY_TEST_CASES: 1 - PYTORCH_ROOT: /pytorch - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 -concurrency: - group: linux-binary-libtorch-cxx11-abi-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - libtorch-cpu-shared-with-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working 
directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cpu-shared-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-shared-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-with-deps-cxx11-abi-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-shared-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-with-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-shared-without-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cpu-shared-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-shared-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-without-deps-cxx11-abi-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-shared-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-without-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-with-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
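#!/usr/bin/env bash
# Editorial sketch of the get_ec2_metadata helper used by the "Display EC2
# information" steps: instance metadata is served over a link-local HTTP
# endpoint. The workflow itself issues the simple GET shown above; the IMDSv2
# token handling below is an assumption added here for completeness, not
# something the workflow does.
set -euo pipefail

get_ec2_metadata () {
  local category=$1
  local token
  # IMDSv2: fetch a short-lived session token first, then present it.
  token=$(curl -fsSL -X PUT "http://169.254.169.254/latest/api/token" \
            -H "X-aws-ec2-metadata-token-ttl-seconds: 60")
  curl -fsSL -H "X-aws-ec2-metadata-token: ${token}" \
    "http://169.254.169.254/latest/meta-data/${category}"
}

echo "instance-type: $(get_ec2_metadata instance-type)"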
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cpu-static-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
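#!/usr/bin/env bash
# Editorial sketch of the detached-container pattern used by the "Build PyTorch
# binary" step above: start a long-lived container, forward selected host
# environment variables by name only (docker copies their current values), then
# drive the work with `docker exec`. The image and the example commands here
# are placeholders, not the builder image or build scripts.
set -euxo pipefail

export PACKAGE_TYPE=libtorch   # example values mirroring the job env above
export DESIRED_CUDA=cpu

container_name=$(docker run \
  -e PACKAGE_TYPE \
  -e DESIRED_CUDA \
  --tty \
  --detach \
  -v "$(pwd):/work" \
  -w /work \
  ubuntu:20.04)

# Each exec shares the container's filesystem and the forwarded environment.
docker exec -t "${container_name}" bash -c 'echo "building ${PACKAGE_TYPE} for ${DESIRED_CUDA}"'
docker rm -f "${container_name}"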
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-with-deps-cxx11-abi-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-static-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-with-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-static-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-without-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cpu-static-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-without-deps-cxx11-abi-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-static-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-without-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-static-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-shared-with-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
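#!/usr/bin/env bash
# Editorial sketch of the relationship called out by the TODO comments above:
# DESIRED_CUDA appears to be a legacy spelling of (GPU_ARCH_TYPE,
# GPU_ARCH_VERSION), e.g. cuda/10.2 -> "cu102" and cpu -> "cpu". The helper
# below illustrates that naming convention; it is an inference, not code taken
# from the workflow.
set -euo pipefail

desired_cuda () {
  local arch_type=$1 arch_version=${2:-}
  case "${arch_type}" in
    cpu)  echo "cpu" ;;
    cuda) echo "cu${arch_version//./}" ;;   # 10.2 -> cu102, 11.3 -> cu113
    *)    echo "unknown arch type: ${arch_type}" >&2; return 1 ;;
  esac
}

desired_cuda cpu          # -> cpu
desired_cuda cuda 10.2    # -> cu102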
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda10_2-shared-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-shared-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda10_2-shared-with-deps-cxx11-abi-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda10_2-shared-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
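#!/usr/bin/env bash
# Editorial sketch of how the CUDA test jobs consume GPU_FLAG: after installing
# the NVIDIA driver and container runtime they export GPU_FLAG="--gpus all",
# and the docker invocation expands it unquoted so it disappears cleanly when
# empty (hence the SC2086/SC2090 suppressions above). CPU-only jobs simply
# leave it unset. The smoke-test command below is illustrative.
set -euxo pipefail

# A CUDA job would export this from the "Install nvidia driver" step; leave it
# unset/empty to emulate a CPU-only job.
: "${GPU_FLAG:=}"

# shellcheck disable=SC2086
docker run --rm ${GPU_FLAG:-} ubuntu:20.04 nvidia-smi || echo "no GPU available in this sketch"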
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-shared-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda10_2-shared-with-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda10_2-shared-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
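#!/usr/bin/env bash
# Editorial sketch of the "Upload binaries" pattern above: credentials reach a
# short-lived container purely as environment variables, the artifact directory
# is bind-mounted, and the upload logic runs inside the image. The image tag
# and the echoed command are placeholders standing in for the miniconda image
# and binary_upload.sh.
set -euo pipefail

artifacts_dir="${RUNNER_TEMP:-/tmp}/artifacts"

docker run --rm -i \
  -e ANACONDA_API_TOKEN \
  -e AWS_ACCESS_KEY_ID \
  -e AWS_SECRET_ACCESS_KEY \
  -e DRY_RUN \
  -e PKG_DIR=/artifacts \
  -v "${artifacts_dir}:/artifacts" \
  -v "$(pwd):/v" \
  -w /v \
  docker.io/library/python:3.10-slim \
  bash -c 'echo "uploading from ${PKG_DIR} (dry run: ${DRY_RUN:-enabled})"'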
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-shared-without-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda10_2-shared-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-shared-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda10_2-shared-without-deps-cxx11-abi-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda10_2-shared-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-shared-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda10_2-shared-without-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda10_2-shared-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-static-with-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda10_2-static-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-static-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda10_2-static-with-deps-cxx11-abi-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda10_2-static-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-static-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda10_2-static-with-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda10_2-static-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-static-without-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda10_2-static-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-static-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda10_2-static-without-deps-cxx11-abi-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda10_2-static-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-static-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda10_2-static-without-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda10_2-static-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-shared-with-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_1-shared-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-shared-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-shared-with-deps-cxx11-abi-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-shared-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-shared-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-shared-with-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-shared-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-shared-without-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_1-shared-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-shared-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-shared-without-deps-cxx11-abi-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-shared-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-shared-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-shared-without-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-shared-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-static-with-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_1-static-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-static-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-static-with-deps-cxx11-abi-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
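The "Display EC2 information" step repeated in every job queries the EC2 instance metadata service at 169.254.169.254. A standalone sketch of that helper, unchanged apart from being runnable on its own:

    set -euo pipefail

    # Fetch one category from the EC2 instance metadata endpoint.
    # See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
    get_ec2_metadata() {
      local category=$1
      curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
    }

    echo "ami-id: $(get_ec2_metadata ami-id)"
    echo "instance-id: $(get_ec2_metadata instance-id)"
    echo "instance-type: $(get_ec2_metadata instance-type)"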
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-static-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
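The "Test PyTorch binary" step above is split into three docker exec calls: populate the env file, let binary_linux_test.sh generate a test script at /run.sh, then execute that script. A condensed sketch of just that sequence, assuming the container has already been started as in the step above and that PYTORCH_ROOT and BINARY_ENV_FILE come from the job environment:

    # 1) Write the build/test environment to ${BINARY_ENV_FILE}.
    docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" \
      bash -c "bash .circleci/scripts/binary_populate_env.sh"

    # 2) Generate the test script at /run.sh inside the container.
    docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" \
      bash -c "bash .circleci/scripts/binary_linux_test.sh"

    # 3) Source the environment and execute the generated script.
    docker exec -t "${container_name}" \
      bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh"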
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-static-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-static-with-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
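The "Log in to ECR" step above resolves the AWS account id from the caller identity and pipes a short-lived ECR password straight into docker login. A sketch of that sequence; the --query/--output form is an equivalent, slightly cleaner way to get the account id than the grep/cut used in the step, and the region comes from AWS_DEFAULT_REGION as it does there:

    set -euo pipefail

    # Account id of the credentials the runner is using.
    AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)

    # Short-lived registry password piped into docker login on stdin.
    aws ecr get-login-password --region "${AWS_DEFAULT_REGION}" \
      | docker login --username AWS --password-stdin \
          "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com"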
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-static-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
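The upload job above only disables DRY_RUN for pushes to the nightly branch or to non-ciflow tags, and switches UPLOAD_CHANNEL to "test" when the tag carries an -rcN suffix. The same gating written as a plain bash sketch: GITHUB_EVENT_NAME, GITHUB_REF and GITHUB_REF_NAME are the standard Actions variables (the shell-level equivalent of the github.event.ref expressions used in the step conditions), and the "enabled"/"nightly" defaults are assumed here for illustration.

    DRY_RUN=enabled
    UPLOAD_CHANNEL=nightly

    if [[ "${GITHUB_EVENT_NAME}" == "push" ]]; then
      if [[ "${GITHUB_REF}" == "refs/heads/nightly" ]] \
         || { [[ "${GITHUB_REF}" == refs/tags/* ]] && [[ "${GITHUB_REF}" != refs/tags/ciflow/* ]]; }; then
        DRY_RUN=disabled
      fi
      # Release-candidate tags (…-rcN) are routed to the "test" channel.
      if [[ "${GITHUB_REF}" == refs/tags/* && "${GITHUB_REF_NAME}" == *-rc[0-9]* ]]; then
        UPLOAD_CHANNEL=test
      fi
    fi

    echo "DRY_RUN=${DRY_RUN} UPLOAD_CHANNEL=${UPLOAD_CHANNEL}"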
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-static-without-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_1-static-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
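The "Chown workspace" and "Chown artifacts" steps above reclaim files that root-owned build containers leave behind by mounting the directory into a throwaway Alpine container and chowning it back to the runner user, avoiding any need for sudo on the runner. A sketch of that one-liner; the chown_back wrapper name and the alpine:3.14 fallback tag exist only for this sketch.

    # Re-own everything under the given directory (default: cwd) as the
    # current user, using a disposable container.
    chown_back() {
      local dir=${1:-$(pwd)}
      docker run --rm -v "${dir}":/v -w /v "${ALPINE_IMAGE:-alpine:3.14}" \
        chown -R "$(id -u):$(id -g)" .
    }

    chown_back "${RUNNER_TEMP}/artifacts"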
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-static-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-static-without-deps-cxx11-abi-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-static-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-static-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-static-without-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-static-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
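The "Upload binaries" step above runs .circleci/scripts/binary_upload.sh inside a pinned miniconda3 image, passing credentials and the package location purely through -e environment flags so nothing is baked into the image. A trimmed restatement of that invocation as a standalone command (image tag and mounts are exactly those pinned in the step):

    # Upload the built packages; credentials and DRY_RUN/UPLOAD_CHANNEL are
    # inherited from the runner environment via the -e flags.
    docker run --rm -i \
      -e ANACONDA_API_TOKEN \
      -e AWS_ACCESS_KEY_ID \
      -e AWS_SECRET_ACCESS_KEY \
      -e DRY_RUN \
      -e PACKAGE_TYPE \
      -e PKG_DIR=/artifacts \
      -e UPLOAD_CHANNEL \
      -e UPLOAD_SUBFOLDER \
      -v "${RUNNER_TEMP}/artifacts:/artifacts" \
      -v "${GITHUB_WORKSPACE}:/v" \
      -w /v \
      308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \
      bash -c '.circleci/scripts/binary_upload.sh'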
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-shared-with-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_3-shared-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
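The "Set BUILD_SPLIT_CUDA" step above uses the GITHUB_ENV file to hand a variable to later steps of the same job. Worth noting: GITHUB_ENV takes the value literally, so the single quotes in the echo are stored as part of it. A small sketch of the mechanism, copied from the step:

    # Lines appended to $GITHUB_ENV become environment variables for all
    # subsequent steps of the job. The value is taken verbatim, so here the
    # stored value is 'ON' including the quotes.
    echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV"

    # A later step in the same job then observes:
    #   $BUILD_SPLIT_CUDA  ->  'ON'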
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-shared-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-with-deps-cxx11-abi-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-shared-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-with-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
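The "Kill containers, clean up images" step that closes each job above stops whatever containers are still running and prunes all images, so the self-hosted runner starts the next job from a clean slate. A sketch of that cleanup; the || true keeps the step green when no containers are running:

    # "docker ps -q" may expand to nothing, hence the deliberately unquoted
    # expansion and the shellcheck SC2046 waiver carried in the workflow.
    # shellcheck disable=SC2046
    docker stop $(docker ps -q) || true

    # Remove stopped containers, unused networks, and all unused images.
    docker system prune -af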
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-shared-without-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_3-shared-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
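The "Preserve github env variables for use in docker" step that opens every job above snapshots the GITHUB_* variables into a per-run file under /tmp so they can later be handed to containers. A sketch of the write side plus one way such a file can be consumed; the --env-file consumption and the alpine image are illustrative only, since the build steps above pass variables individually with -e instead.

    # Snapshot all GITHUB_* variables for this run.
    env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"

    # Illustrative consumption: feed the snapshot to a container as an env file.
    docker run --rm --env-file "/tmp/github_env_${GITHUB_RUN_ID}" \
      alpine:3.14 sh -c 'env | grep ^GITHUB'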
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-shared-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-without-deps-cxx11-abi-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-shared-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-without-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-static-with-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_3-static-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-static-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-with-deps-cxx11-abi-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
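The build and test steps above start one long-lived container with --detach and then drive each phase through docker exec, so state produced by one phase (such as the populated env file) is visible to the next. A reduced sketch of that pattern; the image, mount, and commands are placeholders, not the workflow's values.

#!/usr/bin/env bash
# Start a detached, TTY-backed container, run several commands in it with
# docker exec, then remove it. Mirrors the docker run/exec structure of the
# "Build PyTorch binary" and "Test PyTorch binary" steps above.
set -euxo pipefail

container_name=$(docker run \
  --tty \
  --detach \
  -v "$(pwd):/work" \
  -w /work \
  "${DOCKER_IMAGE:-ubuntu:20.04}")

# Each phase is a separate exec into the same container, so files written by
# one phase are available to later phases.
docker exec -t "${container_name}" bash -c "echo 'populate environment' > /tmp/phase1.log"
docker exec -t "${container_name}" bash -c "cat /tmp/phase1.log && echo 'run build or tests'"

docker rm -f "${container_name}"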
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-static-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-with-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
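The "Log in to ECR" steps above resolve the AWS account id and pipe a short-lived registry password into docker login. A sketch of the same flow; it extracts the account id with the CLI's --query flag rather than the workflow's grep/cut pipeline, which is an equivalent approach shown here only for clarity.

#!/usr/bin/env bash
# Log Docker in to the account's ECR registry for the current region.
# AWS_DEFAULT_REGION and valid AWS credentials are assumed to be present,
# as they are on the self-hosted runners used above.
set -euo pipefail

AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
aws ecr get-login-password --region "${AWS_DEFAULT_REGION}" \
  | docker login --username AWS \
      --password-stdin "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com"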
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
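The upload jobs above only perform a real upload for pushes to the nightly branch or to non-ciflow tags, and route release-candidate tags to the test channel. A plain-shell sketch of that gating; the nightly default channel shown here is an assumption, since the workflow defines its default outside the steps quoted above.

#!/usr/bin/env bash
# Decide DRY_RUN and UPLOAD_CHANNEL from the git ref, mirroring the
# "Set DRY_RUN" and "Set UPLOAD_CHANNEL" steps above.
set -euo pipefail

DRY_RUN=enabled          # uploads stay a dry run unless the ref qualifies
UPLOAD_CHANNEL=nightly   # assumed default; the workflow sets this elsewhere

is_release_tag() {
  [[ "${GITHUB_REF:-}" == refs/tags/* && "${GITHUB_REF:-}" != refs/tags/ciflow/* ]]
}

if [[ "${GITHUB_EVENT_NAME:-}" == "push" ]]; then
  if [[ "${GITHUB_REF:-}" == "refs/heads/nightly" ]] || is_release_tag; then
    DRY_RUN=disabled
  fi
  # A tag name like v1.11.0-rc3 is routed to the "test" channel.
  if is_release_tag && [[ "${GITHUB_REF_NAME:-}" == *-rc[0-9]* ]]; then
    UPLOAD_CHANNEL=test
  fi
fi

echo "DRY_RUN=${DRY_RUN} UPLOAD_CHANNEL=${UPLOAD_CHANNEL}"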
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-static-without-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_3-static-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
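The steps above hand values to later steps by appending NAME=value lines to the file that $GITHUB_ENV points at (BUILD_SPLIT_CUDA, GPU_FLAG, DRY_RUN, and so on). GitHub Actions takes each value literally, with no shell unquoting, so BUILD_SPLIT_CUDA='ON' arrives in later steps with the quotes included; that is harmless for any consumer that only checks whether the variable is non-empty. A small local sketch of the mechanism; the fallback path exists only so the script runs outside Actions.

#!/usr/bin/env bash
# Append NAME=value lines to the GITHUB_ENV file; each line becomes an
# environment variable in subsequent workflow steps.
set -euo pipefail

GITHUB_ENV=${GITHUB_ENV:-/tmp/github_env_demo}   # assumption: local stand-in

echo "BUILD_SPLIT_CUDA=ON" >> "$GITHUB_ENV"
echo "GPU_FLAG=--gpus all" >> "$GITHUB_ENV"

cat "$GITHUB_ENV"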
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-static-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-without-deps-cxx11-abi-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
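The recurring "Chown workspace" and "Chown artifacts" steps above exist because files created by root inside the build container would otherwise be left unwritable by the runner user; a throwaway Alpine container re-owns the mounted directory. A sketch of that step, with the image name as a placeholder for the workflow's ALPINE_IMAGE.

#!/usr/bin/env bash
# Re-own everything under the current directory to the invoking user by
# running chown inside a minimal container that has the directory mounted.
set -euo pipefail

ALPINE_IMAGE=${ALPINE_IMAGE:-alpine:3.15}   # placeholder; the workflow supplies its own image

docker run --rm \
  -v "$(pwd)":/v -w /v \
  "${ALPINE_IMAGE}" \
  chown -R "$(id -u):$(id -g)" .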
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-static-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-without-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
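The "Preserve github env variables for use in docker" steps above dump every GITHUB_* variable to a per-run file so that scripts running inside containers can re-import them; the consuming side is not shown in the steps quoted here. A sketch of one plausible round trip, where feeding the file back via --env-file is an assumption rather than something taken from the workflow.

#!/usr/bin/env bash
# Capture GITHUB_* variables to a file keyed by run id, then (assumed usage)
# pass them into a container via --env-file.
set -euo pipefail

GITHUB_RUN_ID=${GITHUB_RUN_ID:-local}   # placeholder outside Actions
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" || true

# Assumed consumer: surface the captured variables inside a container.
docker run --rm --env-file "/tmp/github_env_${GITHUB_RUN_ID}" \
  alpine:3.15 env | grep '^GITHUB' || true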
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-shared-with-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_5-shared-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-shared-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-shared-with-deps-cxx11-abi-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-shared-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-shared-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-shared-with-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-shared-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-shared-without-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_5-shared-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-shared-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-shared-without-deps-cxx11-abi-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-shared-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-shared-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-shared-without-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-shared-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-static-with-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_5-static-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-static-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-static-with-deps-cxx11-abi-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-static-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-static-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-static-with-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-static-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-static-without-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_5-static-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-static-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-static-without-deps-cxx11-abi-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-static-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-static-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-static-without-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-static-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-master.yml b/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-master.yml new file mode 100644 index 000000000000..922dbc27b7f2 --- /dev/null +++ b/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-master.yml @@ -0,0 +1,283 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/linux_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: linux-binary-libtorch-pre-cxx11 + +on: + push: + branches: + - master + tags: + - 'ciflow/all/*' + - 'ciflow/trunk/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BINARY_ENV_FILE: /tmp/env + BUILD_ENVIRONMENT: linux-binary-libtorch-pre-cxx11 + BUILDER_ROOT: /builder + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_FINAL_PACKAGE_DIR: /artifacts + PYTORCH_RETRY_TEST_CASES: 1 + PYTORCH_ROOT: /pytorch + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 1 +concurrency: + group: linux-binary-libtorch-pre-cxx11-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + libtorch-cpu-shared-with-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cpu-shared-with-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-shared-with-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-with-deps-cxx11-abi-build + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c 
"source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-nightly.yml new file mode 100644 index 000000000000..5972b6fced8e --- /dev/null +++ b/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-nightly.yml @@ -0,0 +1,7042 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/linux_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: linux-binary-libtorch-pre-cxx11 + +on: + push: + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + branches: + - nightly + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/binaries/*' + - 'ciflow/binaries_libtorch/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BINARY_ENV_FILE: /tmp/env + BUILD_ENVIRONMENT: linux-binary-libtorch-pre-cxx11 + BUILDER_ROOT: /builder + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_FINAL_PACKAGE_DIR: /artifacts + PYTORCH_RETRY_TEST_CASES: 1 + PYTORCH_ROOT: /pytorch + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 1 +concurrency: + group: linux-binary-libtorch-pre-cxx11-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + libtorch-cpu-shared-with-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cpu-shared-with-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-shared-with-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-with-deps-pre-cxx11-build + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c 
"source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-shared-with-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-with-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ 
+ -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-shared-without-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash 
.circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cpu-shared-without-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-shared-without-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-without-deps-pre-cxx11-build + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-shared-without-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-without-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v 
"$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-with-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cpu-static-with-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-with-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-with-deps-pre-cxx11-build + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
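The *-test jobs do not install anything on the runner itself; they mount the downloaded packages at /final_pkgs inside the same manylinux builder image and run a generated test script there. A condensed, illustrative sketch of the "Test PyTorch binary" step above:

# illustrative condensation of the generated "Test PyTorch binary" step
- name: Test PyTorch binary
  run: |
    set -x
    # shellcheck disable=SC2086,SC2090   # ${GPU_FLAG:-} must stay unquoted so "--gpus all" word-splits
    container_name=$(docker run ${GPU_FLAG:-} --tty --detach \
      -e DESIRED_CUDA -e DESIRED_DEVTOOLSET -e LIBTORCH_VARIANT -e PACKAGE_TYPE -e SKIP_ALL_TESTS \
      -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \
      -v "${GITHUB_WORKSPACE}/builder:/builder" \
      -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \
      -w / "${DOCKER_IMAGE}")
    docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh"
    # binary_linux_test.sh only generates the test script (into OUTPUT_SCRIPT); the last exec runs it
    docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh"
    docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh"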
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-with-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-with-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w 
/v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-without-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cpu-static-without-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-without-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-without-deps-pre-cxx11-build + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
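Every job in this workflow, including the ones above, ends with the same "if: always()" cleanup tail on the self-hosted runner. A condensed sketch, with the assumed rationale noted inline:

# illustrative condensation of the cleanup tail shared by every job; runs even when earlier steps fail
- name: Hold runner for 2 hours or until ssh sessions have drained
  if: always()
  run: .github/scripts/wait_for_ssh_to_drain.sh    # keeps the machine around while someone debugs over SSH
- name: Chown workspace
  if: always()
  run: |
    # files written as root inside the build containers would otherwise be undeletable by the runner user
    docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
- name: Kill containers, clean up images
  if: always()
  run: |
    # shellcheck disable=SC2046   # "docker ps -q" may legitimately expand to nothing
    docker stop $(docker ps -q) || true
    docker system prune -af                        # reclaim disk before the next job reuses this runner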
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-without-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-without-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v 
"$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-shared-with-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda10_2-shared-with-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-shared-with-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda10_2-shared-with-deps-pre-cxx11-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda10_2-shared-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
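Unlike the cpu test jobs, the cuda *-test jobs first install the nvidia driver and nvidia-docker runtime on the runner (via the retried install_nvidia_utils_linux.sh step above) and export GPU_FLAG so the test container can see the GPUs. A minimal, illustrative sketch of how that flag is consumed:

# how GPU_FLAG set by the driver-install step reaches the test container
- run: echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"   # done inside the nick-fields/retry step above
- run: |
    # shellcheck disable=SC2086,SC2090   # ${GPU_FLAG:-} is deliberately left unquoted so that
    # "--gpus all" splits into two docker arguments; on cpu runners it expands to nothing at all
    docker run ${GPU_FLAG:-} --tty --detach "${DOCKER_IMAGE}"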
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-shared-with-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda10_2-shared-with-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda10_2-shared-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the 
current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-shared-without-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda10_2-shared-without-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-shared-without-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda10_2-shared-without-deps-pre-cxx11-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda10_2-shared-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-shared-without-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda10_2-shared-without-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda10_2-shared-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned 
back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-static-with-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda10_2-static-with-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-static-with-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda10_2-static-with-deps-pre-cxx11-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda10_2-static-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-static-with-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda10_2-static-with-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda10_2-static-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the 
current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-static-without-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda10_2-static-without-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-static-without-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda10_2-static-without-deps-pre-cxx11-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda10_2-static-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
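In the test jobs, the docker run invocation expands ${GPU_FLAG:-} unquoted on purpose (hence the shellcheck SC2086/SC2090 disables): the value "--gpus all" has to word-split into two arguments, and when the variable is unset it has to vanish entirely so the same command line also works without a GPU. A small sketch that just prints the resulting argument vector; the fallback image name is copied from the job env and is only illustrative.

    #!/usr/bin/env bash
    # Show how the unquoted ${GPU_FLAG:-} expansion behaves.
    GPU_FLAG="--gpus all"   # as written to GITHUB_ENV by the nvidia install step
    # shellcheck disable=SC2086
    printf '%s\n' docker run --rm ${GPU_FLAG:-} "${DOCKER_IMAGE:-pytorch/manylinux-builder:cuda10.2}"
    # Unset GPU_FLAG and the flag disappears instead of becoming an empty argument.
    unset GPU_FLAG
    # shellcheck disable=SC2086
    printf '%s\n' docker run --rm ${GPU_FLAG:-} "${DOCKER_IMAGE:-pytorch/manylinux-builder:cuda10.2}"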
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-static-without-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda10_2-static-without-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda10_2-static-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned 
back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-shared-with-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm 
-v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda11_3-shared-with-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-shared-with-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-shared-with-deps-pre-cxx11-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-shared-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
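The "Set UPLOAD_CHANNEL" step in the upload jobs only flips the channel to test when the pushed tag carries a release-candidate suffix; it relies on a plain bash glob match rather than a regex. A minimal sketch of that check follows; the tag names are hypothetical examples, not real releases.

    #!/usr/bin/env bash
    # Sketch of the -rcN suffix check used by the "Set UPLOAD_CHANNEL" steps.
    for GITHUB_REF_NAME in v1.11.0-rc3 v1.11.0 nightly; do
      if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
        echo "${GITHUB_REF_NAME}: UPLOAD_CHANNEL=test"
      else
        echo "${GITHUB_REF_NAME}: UPLOAD_CHANNEL left at its default"
      fi
    done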
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-shared-with-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-shared-with-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-shared-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the 
current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-shared-without-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v 
"${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda11_3-shared-without-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-shared-without-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-shared-without-deps-pre-cxx11-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-shared-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
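One quirk worth noting in the "Set BUILD_SPLIT_CUDA" steps: GITHUB_ENV is parsed as literal name=value lines, so echo "BUILD_SPLIT_CUDA='ON'" stores the single quotes as part of the value ('ON', not ON). That is presumably harmless as long as the builder scripts only test whether the variable is non-empty, but the quoting difference is easy to miss. A tiny sketch, using a temporary file in place of the runner-provided GITHUB_ENV and a placeholder EXAMPLE_FLAG for comparison.

    #!/usr/bin/env bash
    # GITHUB_ENV is a plain key=value file; quotes are not stripped when it is read back.
    GITHUB_ENV=$(mktemp)
    echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV"   # value becomes 'ON' (quotes included)
    echo "EXAMPLE_FLAG=ON" >> "$GITHUB_ENV"         # value becomes ON
    cat "$GITHUB_ENV"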
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-shared-without-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-shared-without-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-shared-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned 
back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-static-with-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm 
-v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda11_3-static-with-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-static-with-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-static-with-deps-pre-cxx11-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-static-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
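The build, test, and upload steps hand configuration to the container almost entirely through bare -e NAME flags: when docker run sees -e with no =value, it copies the value from the invoking environment, which here means the job-level env block plus whatever binary_populate_env.sh and GITHUB_ENV contributed. A short sketch of the two forms; PACKAGE_TYPE mirrors the job env, and the grep is only there to show the result.

    #!/usr/bin/env bash
    # Bare -e NAME copies NAME from the host environment into the container;
    # -e NAME=value sets it explicitly. Host variables that are unset are simply omitted.
    export PACKAGE_TYPE=libtorch
    docker run --rm -e PACKAGE_TYPE -e PKG_DIR=/artifacts alpine:3 env | grep -E '^(PACKAGE_TYPE|PKG_DIR)='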
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-static-with-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-static-with-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-static-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the 
current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-static-without-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v 
"${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda11_3-static-without-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-static-without-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-static-without-deps-pre-cxx11-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-static-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
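The "Build PyTorch binary" and "Test PyTorch binary" steps all follow the same detach-then-exec shape: start one long-lived container with --tty --detach, then docker exec several scripts into it so that binary_populate_env.sh can leave an environment file behind for later commands to source. A stripped-down sketch of that flow under stated assumptions: alpine:3, the /env path, and EXAMPLE_VAR are placeholders standing in for the real builder image and BINARY_ENV_FILE.

    #!/usr/bin/env bash
    # Start one long-lived container, run two commands in it that share state via a file,
    # then tear it down. Mirrors the docker run --detach / docker exec pattern used above.
    container_name=$(docker run --tty --detach alpine:3)
    docker exec -t "${container_name}" sh -c 'echo EXAMPLE_VAR=1 > /env'
    docker exec -t "${container_name}" sh -c '. /env && echo "EXAMPLE_VAR is ${EXAMPLE_VAR}"'
    docker rm -f "${container_name}"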
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-static-without-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-static-without-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-static-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned 
back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-shared-with-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm 
-v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda11_6-shared-with-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-shared-with-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-shared-with-deps-pre-cxx11-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-shared-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
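# [editor's note, hedged sketch] The "Build PyTorch binary" and "Test PyTorch
# binary" steps above share one pattern: start the manylinux image detached
# with configuration forwarded as environment variables, then drive it with
# repeated `docker exec` calls so state produced by one script (e.g. the env
# file written by binary_populate_env.sh) is visible to the next. Schematic
# only -- the exec'd commands here are placeholders, not the real scripts:
export PACKAGE_TYPE=libtorch DESIRED_CUDA=cu116      # assumed demo values
DOCKER_IMAGE="pytorch/manylinux-builder:cuda11.6"    # same image family as the jobs above
container_name=$(docker run -e PACKAGE_TYPE -e DESIRED_CUDA \
  --tty --detach -v "$(pwd):/work" -w /work "${DOCKER_IMAGE}")
docker exec -t "${container_name}" bash -c 'echo "step 1: populate the build env file"'
docker exec -t "${container_name}" bash -c 'echo "step 2: build or test, reusing the same container"'
docker rm -f "${container_name}"                     # the CI cleanup step does this via docker stop/prune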
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-shared-with-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-shared-with-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-shared-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the 
current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-shared-without-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v 
"${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda11_6-shared-without-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-shared-without-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-shared-without-deps-pre-cxx11-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-shared-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-shared-without-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-shared-without-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-shared-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned 
back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-static-with-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm 
-v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda11_6-static-with-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-static-with-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-static-with-deps-pre-cxx11-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-static-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
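# [editor's note, hedged] In the NVIDIA test jobs above, GPU_FLAG=--gpus all is
# written to "$GITHUB_ENV" only after the driver install succeeds, and the test
# step expands it unquoted as ${GPU_FLAG:-} so docker sees a real flag when it
# is set and nothing at all when it is not -- which is why SC2086/SC2090 are
# disabled there. Local illustration of the same expansion (the `--gpus all`
# invocation assumes an NVIDIA-enabled Docker host):
GPU_FLAG="--gpus all"                      # in CI this value comes from $GITHUB_ENV
# shellcheck disable=SC2086
docker run --rm ${GPU_FLAG:-} alpine:3.16 true   # flag present: forwarded as-is
unset GPU_FLAG
# shellcheck disable=SC2086
docker run --rm ${GPU_FLAG:-} alpine:3.16 true   # flag absent: expands to nothing, CPU-only run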
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-static-with-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-static-with-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-static-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the 
current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-static-without-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v 
"${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda11_6-static-without-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-static-without-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-static-without-deps-pre-cxx11-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-static-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-static-without-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-static-without-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-static-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned 
back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_0-shared-with-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
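# [editor's note, hedged] The long `-e NAME` lists in the docker run commands
# above use the no-value form, which forwards each variable from the runner's
# job environment into the container instead of hard-coding values in the
# workflow. Quick demo of that passthrough:
export LIBTORCH_VARIANT="shared-with-deps"   # in CI this comes from the job-level env block
docker run --rm -e LIBTORCH_VARIANT alpine:3.16 sh -c 'echo "variant=${LIBTORCH_VARIANT}"'
# prints: variant=shared-with-deps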
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-rocm5_0-shared-with-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_0-shared-with-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-rocm5_0-shared-with-deps-pre-cxx11-build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: always() + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-rocm5_0-shared-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Pull Docker image + run: | + 
retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_0-shared-with-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-rocm5_0-shared-with-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
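# [editor's note, hedged sketch] The ROCm runner health checks above fail the
# job early (and disconnect the runner on failure) rather than letting a bad
# host time out two hours later. The GPU-count check simply greps rocminfo for
# gfx agent names and accepts only 2 or 4 devices; standalone version, which
# assumes a ROCm host with rocminfo installed:
ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')
if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then
  echo "Failed to detect GPUs on the runner (found ${ngpu})"
  exit 1
fi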
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-rocm5_0-shared-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_0-static-with-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-rocm5_0-static-with-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_0-static-with-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-rocm5_0-static-with-deps-pre-cxx11-build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: always() + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-rocm5_0-static-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Pull Docker image + run: | + 
retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_0-static-with-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-rocm5_0-static-with-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-rocm5_0-static-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_1_1-shared-with-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-rocm5_1_1-shared-with-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_1_1-shared-with-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-rocm5_1_1-shared-with-deps-pre-cxx11-build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: always() + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-rocm5_1_1-shared-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Pull Docker image 
+ run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_1_1-shared-with-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-rocm5_1_1-shared-with-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-rocm5_1_1-shared-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_1_1-static-with-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-rocm5_1_1-static-with-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_1_1-static-with-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-rocm5_1_1-static-with-deps-pre-cxx11-build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: always() + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-rocm5_1_1-static-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Pull Docker image 
+ run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_1_1-static-with-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-rocm5_1_1-static-with-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-rocm5_1_1-static-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+      - name: Kill containers, clean up images
+        if: always()
+        run: |
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
+          # Prune all of the docker images
+          docker system prune -af
diff --git a/.github/workflows/generated-linux-binary-libtorch-pre-cxx11.yml b/.github/workflows/generated-linux-binary-libtorch-pre-cxx11.yml
deleted file mode 100644
index c39fb1c690c7..000000000000
--- a/.github/workflows/generated-linux-binary-libtorch-pre-cxx11.yml
+++ /dev/null
@@ -1,8046 +0,0 @@
-# @generated DO NOT EDIT MANUALLY
-
-# Template is at: .github/templates/linux_binary_build_workflow.yml.j2
-# Generation script: .github/scripts/generate_ci_workflows.py
-name: linux-binary-libtorch-pre-cxx11
-
-on:
-  push:
-    # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build
-    branches:
-      - nightly
-    tags:
-      # NOTE: Binary build pipelines should only get triggered on release candidate builds
-      # Release candidate tags look like: v1.11.0-rc1
-      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
-      - 'ciflow/binaries/*'
-      - 'ciflow/binaries_libtorch/*'
-  workflow_dispatch:
-
-env:
-  # Needed for conda builds
-  ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
-  ANACONDA_USER: pytorch
-  AWS_DEFAULT_REGION: us-east-1
-  BINARY_ENV_FILE: /tmp/env
-  BUILD_ENVIRONMENT: linux-binary-libtorch-pre-cxx11
-  BUILDER_ROOT: /builder
-  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-  IN_CI: 1
-  IS_GHA: 1
-  PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
-  PR_NUMBER: ${{ github.event.pull_request.number }}
-  PYTORCH_FINAL_PACKAGE_DIR: /artifacts
-  PYTORCH_RETRY_TEST_CASES: 1
-  PYTORCH_ROOT: /pytorch
-  SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
-  SKIP_ALL_TESTS: 1
-concurrency:
-  group: linux-binary-libtorch-pre-cxx11-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
-  cancel-in-progress: true
-
-jobs:
-  libtorch-cpu-shared-with-deps-pre-cxx11-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: linux.4xlarge
-    timeout-minutes: 240
-    env:
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      # favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu
-      DOCKER_IMAGE: pytorch/manylinux-builder:cpu
-      SKIP_ALL_TESTS: 1
-      LIBTORCH_VARIANT: shared-with-deps
-      DESIRED_DEVTOOLSET: pre-cxx11
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-      - name: Log in to ECR
-        env:
-          AWS_RETRY_MODE: standard
-          AWS_MAX_ATTEMPTS: 5
-        run: |
-          AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\")
-          retry () {
-            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
-          }
-          retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \
-            --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"
-      - name: Chown workspace
-        run: |
-          retry () {
-            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
-          }
-          retry docker pull "${ALPINE_IMAGE}"
-          # Ensure the working directory
gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cpu-shared-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-shared-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-with-deps-pre-cxx11-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-shared-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-with-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-shared-without-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cpu-shared-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-shared-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-without-deps-pre-cxx11-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-shared-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-without-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-with-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cpu-static-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-with-deps-pre-cxx11-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-static-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-with-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-static-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-without-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cpu-static-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-without-deps-pre-cxx11-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-static-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-without-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-static-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-shared-with-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda10_2-shared-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-shared-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda10_2-shared-with-deps-pre-cxx11-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda10_2-shared-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-shared-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda10_2-shared-with-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda10_2-shared-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-shared-without-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda10_2-shared-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-shared-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda10_2-shared-without-deps-pre-cxx11-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda10_2-shared-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-shared-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda10_2-shared-without-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda10_2-shared-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-static-with-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda10_2-static-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-static-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda10_2-static-with-deps-pre-cxx11-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda10_2-static-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-static-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda10_2-static-with-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda10_2-static-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-static-without-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda10_2-static-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-static-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda10_2-static-without-deps-pre-cxx11-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda10_2-static-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-static-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda10_2-static-without-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda10_2-static-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-shared-with-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_1-shared-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-shared-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-shared-with-deps-pre-cxx11-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-shared-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-shared-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-shared-with-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-shared-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-shared-without-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_1-shared-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-shared-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-shared-without-deps-pre-cxx11-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-shared-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-shared-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-shared-without-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-shared-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-static-with-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_1-static-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-static-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-static-with-deps-pre-cxx11-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-static-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-static-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-static-with-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-static-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-static-without-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_1-static-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-static-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-static-without-deps-pre-cxx11-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-static-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-static-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-static-without-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-static-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-shared-with-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_3-shared-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-shared-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-with-deps-pre-cxx11-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-shared-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-with-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-shared-without-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_3-shared-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-shared-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-without-deps-pre-cxx11-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-shared-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-without-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
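Note: the two conditional steps above gate real uploads on the pushed ref -- DRY_RUN is only disabled for pushes to the nightly branch or to non-ciflow tags, and a release-candidate tag such as v1.11.0-rc3 is routed to the test channel. A standalone sketch of that decision logic; the ref matching below approximates the workflow's if: expressions rather than copying them, and the "nightly" default for UPLOAD_CHANNEL is an assumption of the sketch:

    #!/usr/bin/env bash
    set -euo pipefail

    # Example inputs; in the workflow these come from the GitHub event context.
    GITHUB_EVENT_NAME="${1:-push}"
    GITHUB_REF="${2:-refs/tags/v1.11.0-rc3}"
    GITHUB_REF_NAME="${GITHUB_REF#refs/*/}"

    DRY_RUN="enabled"
    UPLOAD_CHANNEL="nightly"   # assumed default; the workflow leaves it unset otherwise

    # Real uploads only happen for pushes to the nightly branch or to release tags
    # (refs/tags/ciflow/* are CI-trigger tags, not releases).
    if [[ "${GITHUB_EVENT_NAME}" == "push" ]]; then
      if [[ "${GITHUB_REF}" == "refs/heads/nightly" ]] || \
         { [[ "${GITHUB_REF}" == refs/tags/* ]] && [[ "${GITHUB_REF}" != refs/tags/ciflow/* ]]; }; then
        DRY_RUN="disabled"
      fi
    fi

    # A tag ending in an rc suffix is uploaded to the "test" channel instead.
    if [[ "${GITHUB_REF}" == refs/tags/* && "${GITHUB_REF_NAME}" == *-rc[0-9]* ]]; then
      UPLOAD_CHANNEL="test"
    fi

    echo "DRY_RUN=${DRY_RUN} UPLOAD_CHANNEL=${UPLOAD_CHANNEL}"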
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-static-with-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
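Note: each job above logs Docker in to the account's ECR registry behind the same inline three-attempt retry helper. A standalone version, assuming the AWS CLI and Docker are installed and AWS_DEFAULT_REGION is set; the account-id lookup uses the CLI's --query flag, which is equivalent to the workflow's grep/cut:

    #!/usr/bin/env bash
    set -euo pipefail

    # Run a command, retrying twice with a short back-off -- the same helper the
    # workflow defines inline in its "Log in to ECR" and "Pull Docker image" steps.
    retry () {
      "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
    }

    # Resolve the AWS account id, then feed a fresh ECR password to "docker login".
    AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
    retry aws ecr get-login-password --region "${AWS_DEFAULT_REGION}" \
      | docker login --username AWS --password-stdin \
          "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com"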
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_3-static-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
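Note: the "Set BUILD_SPLIT_CUDA" and GPU_FLAG steps above rely on the GITHUB_ENV file: a NAME=value line appended to it in one step becomes an environment variable in every later step of the same job. A minimal illustration of that mechanism, written as the contents of two separate run: steps rather than one script:

    # Contents of an earlier run: step -- append NAME=value to $GITHUB_ENV so the
    # variable is exported to every subsequent step of the same job.
    echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"

    # Contents of a later run: step in the same job -- GPU_FLAG is now set.  The
    # deliberately unquoted ${GPU_FLAG:-} expansion (empty on CPU-only runners) is
    # why the test steps above carry "# shellcheck disable=SC2086,SC2090".
    # shellcheck disable=SC2086
    docker run --rm ${GPU_FLAG:-} alpine:3 true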
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-static-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-with-deps-pre-cxx11-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-static-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-with-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-static-without-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_3-static-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-static-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-without-deps-pre-cxx11-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-static-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-without-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-shared-with-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_5-shared-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-shared-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-shared-with-deps-pre-cxx11-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-shared-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-shared-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-shared-with-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-shared-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-shared-without-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_5-shared-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-shared-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-shared-without-deps-pre-cxx11-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-shared-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-shared-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-shared-without-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-shared-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-static-with-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_5-static-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-static-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-static-with-deps-pre-cxx11-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-static-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-static-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-static-with-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-static-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-static-without-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_5-static-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-static-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-static-without-deps-pre-cxx11-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-static-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-static-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-static-without-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-static-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-binary-manywheel-master.yml b/.github/workflows/generated-linux-binary-manywheel-master.yml new file mode 100644 index 000000000000..d384b3e79bd0 --- /dev/null +++ b/.github/workflows/generated-linux-binary-manywheel-master.yml @@ -0,0 +1,294 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/linux_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: linux-binary-manywheel + +on: + push: + branches: + - master + tags: + - 'ciflow/all/*' + - 'ciflow/trunk/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BINARY_ENV_FILE: /tmp/env + BUILD_ENVIRONMENT: linux-binary-manywheel + BUILDER_ROOT: /builder + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_FINAL_PACKAGE_DIR: /artifacts + PYTORCH_RETRY_TEST_CASES: 1 + PYTORCH_ROOT: /pytorch + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 1 +concurrency: + group: linux-binary-manywheel-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + manywheel-py3_7-cuda10_2-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_7-cuda10_2 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
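+      # Like the chown and hold steps above, the container cleanup below runs
+      # with `if: always()`, so it executes even if the build fails or is
+      # cancelled, leaving the self-hosted runner clean for the next job.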
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-cuda10_2-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_7-cuda10_2-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_7-cuda10_2 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" 
"${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml new file mode 100644 index 000000000000..783227fe9d31 --- /dev/null +++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml @@ -0,0 +1,8370 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/linux_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: linux-binary-manywheel + +on: + push: + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + branches: + - nightly + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/binaries/*' + - 'ciflow/binaries_wheel/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BINARY_ENV_FILE: /tmp/env + BUILD_ENVIRONMENT: linux-binary-manywheel + BUILDER_ROOT: /builder + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_FINAL_PACKAGE_DIR: /artifacts + PYTORCH_RETRY_TEST_CASES: 1 + PYTORCH_ROOT: /pytorch + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 1 +concurrency: + group: linux-binary-manywheel-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + manywheel-py3_7-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" 
chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_7-cpu + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_7-cpu-build + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_7-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have 
drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_7-cpu-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_7-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always 
hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-cuda10_2-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back 
to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_7-cuda10_2 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-cuda10_2-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_7-cuda10_2-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_7-cuda10_2 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
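+      # GPU_FLAG is only set (to `--gpus all`) after the NVIDIA driver install
+      # above, and the docker run expands it as `${GPU_FLAG:-}`, so the same
+      # test template also works for CPU-only configurations where the
+      # variable is unset (hence the SC2086/SC2090 shellcheck suppressions).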
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-cuda10_2-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_7-cuda10_2-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_7-cuda10_2 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
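The upload job above only flips DRY_RUN to "disabled" for pushes to the nightly branch or to real (non-ciflow) tags, and it routes release-candidate tags to the test channel by matching an -rcN suffix on the ref name. A quick standalone version of that suffix check, with a placeholder tag in place of the real ${GITHUB_REF_NAME}:

    #!/usr/bin/env bash
    # Placeholder ref name; in the workflow this comes from ${GITHUB_REF_NAME}.
    GITHUB_REF_NAME="v1.12.0-rc3"

    # Same glob the "Set UPLOAD_CHANNEL" step uses: anything containing
    # "-rc<digit>" is a release candidate and goes to the "test" channel.
    if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
      echo "UPLOAD_CHANNEL=test"
    else
      echo "UPLOAD_CHANNEL left at its default"
    fi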
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-cuda11_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_7-cuda11_3 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-cuda11_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_7-cuda11_3-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
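The recurring "Chown workspace" / "Chown artifacts" steps exist because the build and test containers typically run as root, so anything they write into the bind-mounted workspace ends up root-owned on the self-hosted runner; mounting the directory into a throwaway Alpine container and chowning it back to the invoking user keeps the next job that lands on the same machine from hitting permission errors. The one-liner, with an explicit image tag standing in for ${ALPINE_IMAGE}:

    #!/usr/bin/env bash
    # Give everything under the current directory back to the invoking user.
    # --rm: discard the helper container; the only side effect is the chown.
    docker run --rm -v "$(pwd)":/v -w /v alpine:3.16 \
      chown -R "$(id -u):$(id -g)" .

The "$(id -u):$(id -g)" pair is expanded on the host before docker runs, so the chown targets the runner user's uid/gid even though the command executes inside the container.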
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_7-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-cuda11_3-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_7-cuda11_3-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_7-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-cuda11_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_7-cuda11_6 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-cuda11_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_7-cuda11_6-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_7-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-cuda11_6-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_7-cuda11_6-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_7-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
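The "Upload binaries" step above forwards the AWS and Anaconda credentials into the uploader container with bare -e NAME flags: docker copies each variable's value from the calling environment, so the secrets come from the step's env: block rather than being spelled out on the command line. A minimal, self-contained demonstration of that passthrough mechanism, using a throwaway alpine image and a placeholder token instead of the real miniconda3 uploader image and repository secrets:

    #!/usr/bin/env bash
    # Placeholder secret; in the job this is populated from the step's `env:` block.
    export ANACONDA_API_TOKEN="example-token"

    # Bare `-e NAME` (no value) copies the variable from the calling environment,
    # so the secret is forwarded without appearing in the command line itself.
    docker run --rm -e ANACONDA_API_TOKEN -e PKG_DIR=/artifacts alpine:3.16 env \
      | grep -E '^(ANACONDA_API_TOKEN|PKG_DIR)='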
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-rocm5_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_7-rocm5_0 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-rocm5_0-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_7-rocm5_0-build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: always() + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_7-rocm5_0 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch 
binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-rocm5_0-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_7-rocm5_0-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
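Unlike the CUDA test jobs, the ROCm test jobs above run on linux.rocm.gpu runners and front-load a set of health checks before touching the artifacts: they point DOCKER_HOST at the runner's per-user docker socket (the rootless-Docker convention), dump rocm-smi and rocminfo output, and bail out early unless rocminfo reports exactly two or four GPU agents. The GPU-count gate, reproduced as a standalone script with brief comments:

    #!/usr/bin/env bash
    # Count GPU agents in rocminfo output; GPU agent names look like "Name: gfx90a".
    ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')

    # The runners are expected to expose either 2 or 4 GPUs; anything else is
    # treated as a broken machine and the job fails fast (the workflow then
    # kills runsvc.sh so the host drops out of the runner pool).
    if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then
      echo "Failed to detect GPUs on the runner"
      exit 1
    fi
    echo "detected ${ngpu} GPUs"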
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_7-rocm5_0 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-rocm5_1_1-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_7-rocm5_1_1 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-rocm5_1_1-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_7-rocm5_1_1-build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: always() + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_7-rocm5_1_1 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v 
"${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-rocm5_1_1-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_7-rocm5_1_1-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_7-rocm5_1_1 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 
308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c 
"source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_8-cpu + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_8-cpu-build + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_8-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_8-cpu-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_8-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-cuda10_2-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_8-cuda10_2 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-cuda10_2-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_8-cuda10_2-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_8-cuda10_2 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-cuda10_2-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_8-cuda10_2-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_8-cuda10_2 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-cuda11_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_8-cuda11_3 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-cuda11_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_8-cuda11_3-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_8-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-cuda11_3-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_8-cuda11_3-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_8-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-cuda11_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_8-cuda11_6 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-cuda11_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_8-cuda11_6-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_8-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-cuda11_6-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_8-cuda11_6-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_8-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-rocm5_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_8-rocm5_0 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-rocm5_0-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_8-rocm5_0-build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: always() + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_8-rocm5_0 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch 
binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-rocm5_0-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_8-rocm5_0-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_8-rocm5_0 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-rocm5_1_1-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_8-rocm5_1_1 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-rocm5_1_1-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_8-rocm5_1_1-build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: always() + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_8-rocm5_1_1 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v 
"${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-rocm5_1_1-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_8-rocm5_1_1-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_8-rocm5_1_1 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 
308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c 
"source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_9-cpu + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_9-cpu-build + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_9-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
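The test step just above runs in three phases: one script populates an env file (`${BINARY_ENV_FILE}`), a second generates the test script at `/run.sh`, and the final command sources the env file and executes the generated script with tracing. The repo-specific scripts are not reproduced here; the sketch below is only a schematic stand-in using temp files:

    #!/usr/bin/env bash
    set -euo pipefail

    # Schematic stand-ins for binary_populate_env.sh and binary_linux_test.sh.
    env_file=$(mktemp)   # plays the role of ${BINARY_ENV_FILE}
    run_script=$(mktemp) # plays the role of the generated /run.sh

    # Phase 1: populate an env file that later phases can source.
    echo 'export PACKAGE_TYPE=manywheel' > "${env_file}"

    # Phase 2: generate the test script rather than running tests inline.
    cat > "${run_script}" <<'EOF'
    echo "testing package type: ${PACKAGE_TYPE}"
    EOF

    # Phase 3: mirror 'source ${BINARY_ENV_FILE} && bash -x /run.sh'.
    # shellcheck disable=SC1090
    source "${env_file}" && bash -x "${run_script}"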
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_9-cpu-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_9-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
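The two "only for tagged pushes" steps in the upload job above gate real uploads on the ref: nightly-branch or tag pushes flip `DRY_RUN` to `disabled`, and an `-rcN` suffix in the tag name routes binaries to the `test` channel. The same glob test can be exercised in isolation (the sample ref names below are made up):

    #!/usr/bin/env bash
    set -euo pipefail

    # Same pattern match the workflow uses to detect release-candidate tags.
    for ref_name in v1.12.0-rc3 v1.12.0 nightly; do
      if [[ ${ref_name} = *-rc[0-9]* ]]; then
        echo "${ref_name}: UPLOAD_CHANNEL=test (release candidate)"
      else
        echo "${ref_name}: default channel"
      fi
    done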
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-cuda10_2-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
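The recurring "Chown workspace" / "Chown artifacts" steps exist because files written from inside the build containers come out owned by root; a throwaway Alpine container with the same bind mount hands ownership back to the runner user. A sketch of the idiom, pinned to a hypothetical `alpine:3.16` rather than the workflow's `${ALPINE_IMAGE}`:

    #!/usr/bin/env bash
    set -euo pipefail

    workdir=$(mktemp -d)
    # Simulate a root-owned artifact left behind by a container build.
    docker run --rm -v "${workdir}:/v" -w /v alpine:3.16 sh -c 'touch root-owned.whl'
    ls -l "${workdir}"

    # Same fix the workflow applies: chown the mount back to the invoking user.
    docker run --rm -v "${workdir}:/v" -w /v alpine:3.16 chown -R "$(id -u):$(id -g)" .
    ls -l "${workdir}"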
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_9-cuda10_2 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-cuda10_2-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_9-cuda10_2-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_9-cuda10_2 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
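In the GPU test jobs, `GPU_FLAG` is written to `${GITHUB_ENV}` as a whole option string (`--gpus all` for CUDA; a list of `--device`/`--group-add` options for ROCm) and is then expanded unquoted as `${GPU_FLAG:-}` so it word-splits into separate `docker run` arguments, or vanishes entirely when unset, which is why those steps suppress shellcheck SC2086/SC2090. A small demonstration of the expansion behaviour (no docker involved):

    #!/usr/bin/env bash
    set -euo pipefail

    show_args () { printf 'argc=%d:' "$#"; printf ' [%s]' "$@"; echo; }

    GPU_FLAG="--gpus all"
    # Unquoted ${GPU_FLAG:-}: splits into two arguments, as docker run needs.
    # shellcheck disable=SC2086
    show_args ${GPU_FLAG:-} --tty

    unset GPU_FLAG
    # Unset (CPU runner case): the expansion disappears instead of passing "".
    # shellcheck disable=SC2086
    show_args ${GPU_FLAG:-} --tty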
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-cuda10_2-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_9-cuda10_2-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_9-cuda10_2 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-cuda11_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
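The "Set BUILD_SPLIT_CUDA" step in the CUDA 11.3 build job above relies on the standard GitHub Actions mechanism of appending `KEY=value` lines to the file named by `$GITHUB_ENV`; the runner re-reads that file between steps and exports the keys to every later step in the job. The effect can be imitated locally with a scratch file standing in for `$GITHUB_ENV` (purely illustrative; the real runner stores the value verbatim, quotes included, whereas sourcing strips them):

    #!/usr/bin/env bash
    set -euo pipefail

    # Scratch file standing in for the runner-provided $GITHUB_ENV path.
    GITHUB_ENV=$(mktemp)

    # "Step 1": persist a variable the same way the workflow step does.
    echo "BUILD_SPLIT_CUDA='ON'" >> "${GITHUB_ENV}"

    # Between steps the Actions runner loads the file; here we do it by hand.
    set -a
    # shellcheck disable=SC1090
    source "${GITHUB_ENV}"
    set +a

    # "Step 2": the value is now an ordinary environment variable.
    echo "BUILD_SPLIT_CUDA=${BUILD_SPLIT_CUDA}"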
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_9-cuda11_3 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-cuda11_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_9-cuda11_3-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_9-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-cuda11_3-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_9-cuda11_3-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_9-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-cuda11_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_9-cuda11_6 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-cuda11_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_9-cuda11_6-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_9-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-cuda11_6-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_9-cuda11_6-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_9-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
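Every job ends with the same cleanup step: `docker stop $(docker ps -q)` is deliberately left unquoted so an empty container list expands to no arguments at all (hence the SC2046 suppression), `|| true` keeps the step green when there is nothing to stop, and `docker system prune -af` reclaims space on the self-hosted runner. The same guard, written as a reusable snippet:

    #!/usr/bin/env bash
    set -euo pipefail

    cleanup_docker () {
      # Intentionally unquoted: if "docker ps -q" prints nothing, docker stop
      # receives no arguments instead of an empty string.
      # shellcheck disable=SC2046
      docker stop $(docker ps -q) || true
      # Remove stopped containers, unused images, networks and build caches.
      docker system prune -af
    }

    cleanup_docker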
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-rocm5_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_9-rocm5_0 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-rocm5_0-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_9-rocm5_0-build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: always() + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_9-rocm5_0 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch 
binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-rocm5_0-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_9-rocm5_0-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
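
Note: the ROCm test jobs above and below front-load a set of runner health checks before touching the build artifacts: they dump the OS and ROCm version info, run rocm-smi and rocminfo, count the visible GPUs, and, if anything fails, kill runsvc.sh so the unhealthy runner stops taking jobs. A condensed sketch of the GPU-count check (assuming, as in these jobs, that a healthy runner exposes exactly 2 or 4 GPUs):

      - name: Runner health check GPU count
        if: always()
        run: |
          # rocminfo prints one "Name: gfx<arch>" line per GPU agent.
          ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')
          if [[ "$ngpu" != "2" && "$ngpu" != "4" ]]; then
            echo "Failed to detect GPUs on the runner (found: ${ngpu})"
            exit 1
          fi
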
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_9-rocm5_0 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-rocm5_1_1-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_9-rocm5_1_1 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
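
Note: every "Build PyTorch binary" and "Test PyTorch binary" step above uses the same container choreography: start the builder image detached, forward the job's environment into it by name (`-e VAR` with no `=value` passes the host's value through), bind-mount the checkouts and an artifacts directory, and then drive the running container with repeated `docker exec` calls so several scripts share one environment. A stripped-down sketch with only two forwarded variables (the real jobs forward the whole `env:` block, and the final `docker exec` command here is a placeholder):

      - name: Build inside a detached builder container
        run: |
          set -x
          # "-e NAME" with no value forwards the host's value of NAME into the container.
          container_name=$(docker run \
            -e PACKAGE_TYPE \
            -e DESIRED_PYTHON \
            --tty \
            --detach \
            -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \
            -v "${RUNNER_TEMP}/artifacts:/artifacts" \
            -w / \
            "${DOCKER_IMAGE}"
          )
          # Drive the same container with several docker exec calls; anything
          # written to /artifacts survives on the runner for the upload step.
          docker exec -t "${container_name}" bash -c "echo building in ${container_name}"
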
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-rocm5_1_1-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_9-rocm5_1_1-build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: always() + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_9-rocm5_1_1 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v 
"${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-rocm5_1_1-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_9-rocm5_1_1-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_9-rocm5_1_1 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 
308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c 
"source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_10-cpu + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_10-cpu-build + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_10-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
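
Note: the test steps invoke `docker run` with `${GPU_FLAG:-}` even in CPU-only jobs like the one above. GPU jobs export `GPU_FLAG` (for example `--gpus all`, or the ROCm `--device` list) through `$GITHUB_ENV` in an earlier step; CPU jobs never set it, so the `:-` default expansion collapses to nothing instead of erroring. The variable is deliberately left unquoted so it word-splits into real arguments, which is why those steps carry `shellcheck disable=SC2086,SC2090`. A minimal sketch:

      - name: Run container with optional GPU flags
        run: |
          # ${GPU_FLAG:-} expands to the exported flags on GPU runners and to
          # nothing on CPU runners; unquoted on purpose so it word-splits.
          # shellcheck disable=SC2086
          docker run --rm ${GPU_FLAG:-} "${DOCKER_IMAGE}" true
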
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_10-cpu-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_10-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
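
Note: the upload jobs above only go live for pushes to the nightly branch or to real release tags (anything under refs/tags/ciflow/ is excluded); everywhere else `DRY_RUN` keeps its default and binary_upload.sh presumably skips the actual publish. Release-candidate tags are additionally routed to the test channel. The two gating steps, condensed with comments:

      - name: Set DRY_RUN (only for tagged pushes)
        if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
        run: echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
      - name: Set UPLOAD_CHANNEL (only for tagged pushes)
        if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
        run: |
          # Release-candidate tags such as v1.12.0-rc3 go to the "test" channel.
          if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
            echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV"
          fi
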
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-cuda10_2-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
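
Note: the "Kill containers, clean up images" step that closes every job above stops whatever containers are still running and then reclaims disk space on the self-hosted runner. A sketch with the two shell subtleties spelled out:

      - name: Kill containers, clean up images
        if: always()
        run: |
          # The unquoted $(docker ps -q) is intentional: it word-splits into one
          # argument per running container ID, hence the SC2046 suppression.
          # "|| true" keeps the step green when there is nothing to stop.
          # shellcheck disable=SC2046
          docker stop $(docker ps -q) || true
          # Remove unused containers, networks, images, and build cache.
          docker system prune -af
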
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_10-cuda10_2 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-cuda10_2-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_10-cuda10_2-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
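
Note: each configuration is wired as a build → test → upload chain: the build job publishes its wheels with upload-artifact-s3 under a per-configuration artifact name, and the downstream jobs declare `needs:` on it and pull the same artifact back down before mounting it into their containers. A condensed two-job sketch (build and test steps elided):

  manywheel-py3_10-cuda10_2-build:
    runs-on: linux.4xlarge
    steps:
      # ... build steps elided ...
      - uses: seemethere/upload-artifact-s3@v4
        with:
          name: manywheel-py3_10-cuda10_2        # one artifact name per configuration
          retention-days: 14
          if-no-files-found: error
          path: ${{ runner.temp }}/artifacts/*
  manywheel-py3_10-cuda10_2-test:
    needs: manywheel-py3_10-cuda10_2-build       # test runs only after the build succeeds
    runs-on: linux.4xlarge.nvidia.gpu
    steps:
      - uses: seemethere/download-artifact-s3@v3
        name: Download Build Artifacts
        with:
          name: manywheel-py3_10-cuda10_2
          path: "${{ runner.temp }}/artifacts/"
      # ... the directory is later mounted into the test container as /final_pkgs ...
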
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_10-cuda10_2 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
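
Note: the CUDA test jobs install the NVIDIA driver and nvidia-docker runtime on the runner at job time and export `GPU_FLAG=--gpus all` for the later `docker run`, wrapping the whole thing in the nick-fields/retry action (pinned to a commit) rather than the inline shell retry used for `docker pull`. As configured above, the action re-runs the command up to `max_attempts` times, each attempt bounded by `timeout_minutes`:

      - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a
        name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
        with:
          timeout_minutes: 10    # per-attempt timeout
          max_attempts: 3        # re-run the whole command on failure
          command: |
            set -ex
            pushd pytorch
            bash .github/scripts/install_nvidia_utils_linux.sh
            echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
            popd
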
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-cuda10_2-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_10-cuda10_2-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_10-cuda10_2 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-cuda11_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_10-cuda11_3 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-cuda11_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_10-cuda11_3-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_10-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-cuda11_3-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_10-cuda11_3-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_10-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-cuda11_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
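
Note: the CUDA 11.3 and 11.6 build jobs set `BUILD_SPLIT_CUDA` before the build step, and the bare `-e BUILD_SPLIT_CUDA` pass-through hands it to the builder container. Because `$GITHUB_ENV` is a plain NAME=value file whose value is taken literally, the single quotes become part of the stored value; the downstream build scripts presumably only test that the variable is non-empty. A sketch showing the effect (the second step is illustrative, not part of this workflow):

      - name: Set BUILD_SPLIT_CUDA
        run: |
          # $GITHUB_ENV takes everything after '=' literally, so this stores
          # the value 'ON' including the single quotes.
          echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV"
      - name: Inspect the value in a later step
        run: |
          # Prints: BUILD_SPLIT_CUDA=['ON']
          printf 'BUILD_SPLIT_CUDA=[%s]\n' "$BUILD_SPLIT_CUDA"
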
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_10-cuda11_6 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-cuda11_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_10-cuda11_6-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_10-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-cuda11_6-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_10-cuda11_6-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_10-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-rocm5_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_10-rocm5_0 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-rocm5_0-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_10-rocm5_0-build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: always() + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_10-rocm5_0 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test 
PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-rocm5_0-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_10-rocm5_0-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_10-rocm5_0 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-rocm5_1_1-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_10-rocm5_1_1 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-rocm5_1_1-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_10-rocm5_1_1-build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: always() + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_10-rocm5_1_1 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + 
-v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-rocm5_1_1-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_10-rocm5_1_1-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_10-rocm5_1_1 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 
308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-linux-binary-manywheel.yml b/.github/workflows/generated-linux-binary-manywheel.yml deleted file mode 100644 index a955984d7c75..000000000000 --- a/.github/workflows/generated-linux-binary-manywheel.yml +++ /dev/null @@ -1,11122 +0,0 @@ -# @generated DO NOT EDIT MANUALLY - -# Template is at: .github/templates/linux_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-binary-manywheel - -on: - push: - # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build - branches: - - nightly - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - - 'ciflow/binaries/*' - - 'ciflow/binaries_wheel/*' - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - ANACONDA_USER: pytorch - AWS_DEFAULT_REGION: us-east-1 - BINARY_ENV_FILE: /tmp/env - BUILD_ENVIRONMENT: linux-binary-manywheel - BUILDER_ROOT: /builder - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - IN_CI: 1 - IS_GHA: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - PR_NUMBER: ${{ github.event.pull_request.number }} - PYTORCH_FINAL_PACKAGE_DIR: /artifacts - PYTORCH_RETRY_TEST_CASES: 1 - PYTORCH_ROOT: /pytorch - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 -concurrency: - group: linux-binary-manywheel-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - manywheel-py3_7-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && 
"$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_7-cpu - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-cpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_7-cpu-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_7-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_7-cpu-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_7-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-cuda10_2-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_7-cuda10_2 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-cuda10_2-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_7-cuda10_2-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_7-cuda10_2 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-cuda10_2-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_7-cuda10_2-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_7-cuda10_2 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-cuda11_1-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_7-cuda11_1 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-cuda11_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_7-cuda11_1-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_7-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-cuda11_1-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_7-cuda11_1-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_7-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-cuda11_3-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_7-cuda11_3 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_7-cuda11_3-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_7-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-cuda11_3-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_7-cuda11_3-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_7-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-cuda11_5-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_7-cuda11_5 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-cuda11_5-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_7-cuda11_5-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_7-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-cuda11_5-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_7-cuda11_5-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_7-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-rocm4_3_1-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.3.1 - GPU_ARCH_VERSION: 4.3.1 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.3.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_7-rocm4_3_1 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-rocm4_3_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_7-rocm4_3_1-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.3.1 - GPU_ARCH_VERSION: 4.3.1 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.3.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_7-rocm4_3_1 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-rocm4_3_1-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_7-rocm4_3_1-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.3.1 - GPU_ARCH_VERSION: 4.3.1 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.3.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_7-rocm4_3_1 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-rocm4_5_2-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.5.2 - GPU_ARCH_VERSION: 4.5.2 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.5.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_7-rocm4_5_2 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-rocm4_5_2-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_7-rocm4_5_2-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.5.2 - GPU_ARCH_VERSION: 4.5.2 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.5.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_7-rocm4_5_2 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
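With SKIP_ALL_TESTS: 1 the test job is essentially a smoke check of the downloaded wheel rather than a full test run; the real commands come from the generated /run.sh. A minimal hand-rolled equivalent, assuming the wheel landed in ${RUNNER_TEMP}/artifacts (the exact checks in binary_linux_test.sh may differ):

#!/usr/bin/env bash
# Install the freshly built wheel and confirm it imports; this mirrors the
# spirit of the generated test script, not its exact contents.
set -euo pipefail
pip install "${RUNNER_TEMP}"/artifacts/torch-*.whl
python -c "import torch; print('torch', torch.__version__, 'cuda available:', torch.cuda.is_available())"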
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-rocm4_5_2-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_7-rocm4_5_2-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.5.2 - GPU_ARCH_VERSION: 4.5.2 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.5.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_7-rocm4_5_2 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
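The two "Set DRY_RUN" / "Set UPLOAD_CHANNEL" steps encode the only branching logic in the upload jobs: uploads really happen only for pushes to nightly or to non-ciflow/ tags, and tags that look like release candidates are routed to the test channel. The same decision as a plain script (the enabled/nightly defaults are assumptions; the workflow sets them elsewhere):

#!/usr/bin/env bash
# Reproduce the ref-based gating used by the *-upload jobs.
set -euo pipefail

GITHUB_EVENT_NAME="${GITHUB_EVENT_NAME:-pull_request}"
GITHUB_REF="${GITHUB_REF:-refs/heads/some-branch}"
GITHUB_REF_NAME="${GITHUB_REF_NAME:-some-branch}"

DRY_RUN=enabled        # assumed default: no real upload
UPLOAD_CHANNEL=nightly # assumed default channel

is_real_tag() {
  [[ "${GITHUB_REF}" == refs/tags/* && "${GITHUB_REF}" != refs/tags/ciflow/* ]]
}

if [[ "${GITHUB_EVENT_NAME}" == "push" ]]; then
  if [[ "${GITHUB_REF}" == "refs/heads/nightly" ]] || is_real_tag; then
    DRY_RUN=disabled
  fi
  # release-candidate tags (e.g. v1.11.0-rc3) go to the test channel
  if is_real_tag && [[ "${GITHUB_REF_NAME}" == *-rc[0-9]* ]]; then
    UPLOAD_CHANNEL=test
  fi
fi

echo "DRY_RUN=${DRY_RUN} UPLOAD_CHANNEL=${UPLOAD_CHANNEL}"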
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_8-cpu - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
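The "Preserve github env variables for use in docker" step dumps every GITHUB_* variable into a per-run file so a container can pick the whole set up in one go. Whether the workflow's helper scripts consume it exactly this way is an assumption, but --env-file is the standard way to feed such a KEY=VALUE file to docker run:

#!/usr/bin/env bash
# Capture the runner's GITHUB_* variables and hand the whole set to a container.
env_file="/tmp/github_env_${GITHUB_RUN_ID:-local}"
env | grep '^GITHUB' > "${env_file}" || true   # "|| true": tolerate an empty set outside of Actions
# --env-file expects one KEY=VALUE per line, which is what `env` emits
# (multi-line values would need escaping).
docker run --rm --env-file "${env_file}" alpine:3.15 env | grep '^GITHUB'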
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-cpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-cpu-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_8-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-cpu-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_8-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-cuda10_2-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_8-cuda10_2 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-cuda10_2-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-cuda10_2-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_8-cuda10_2 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
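Unlike the CPU and ROCm variants, the CUDA test jobs run on linux.4xlarge.nvidia.gpu, install the NVIDIA driver and container runtime on the fly, and export GPU_FLAG=--gpus all so the later docker run ${GPU_FLAG:-} actually exposes the GPU. A quick way to confirm the flag does what it should (the image tag is only an example of something with nvidia-smi preinstalled):

#!/usr/bin/env bash
# With the NVIDIA container runtime installed, `--gpus all` makes the host
# GPUs visible inside the container; nvidia-smi should list them.
set -euo pipefail
GPU_FLAG="--gpus all"
# shellcheck disable=SC2086  # GPU_FLAG is intentionally word-split, as in the workflow
docker run --rm ${GPU_FLAG:-} nvidia/cuda:11.3.1-base-ubuntu18.04 nvidia-smi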
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-cuda10_2-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-cuda10_2-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_8-cuda10_2 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-cuda11_1-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_8-cuda11_1 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
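Build artifacts travel between the -build, -test, and -upload jobs through S3-backed artifact actions keyed by name (manywheel-py3_8-cuda11_1 here). Roughly the same hand-off with the plain AWS CLI, using a placeholder bucket and prefix rather than whatever the actions use internally:

#!/usr/bin/env bash
# Producer/consumer sketch of the artifact hand-off; bucket and prefix are placeholders.
set -euo pipefail
ARTIFACT_NAME=manywheel-py3_8-cuda11_1
BUCKET="s3://example-ci-artifacts"
PREFIX="${GITHUB_RUN_ID:-local}/${ARTIFACT_NAME}"

# build job: publish everything it left in the artifacts directory
aws s3 cp --recursive "${RUNNER_TEMP}/artifacts/" "${BUCKET}/${PREFIX}/"

# test/upload jobs: pull it back down by the same name
aws s3 cp --recursive "${BUCKET}/${PREFIX}/" "${RUNNER_TEMP}/artifacts/"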
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-cuda11_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-cuda11_1-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_8-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-cuda11_1-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-cuda11_1-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_8-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-cuda11_3-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
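Every job opens with the same "Display EC2 information" step, which is nothing more than three lookups against the instance metadata service. The same thing written as a loop (IMDSv1 endpoint, exactly as the step uses it):

#!/usr/bin/env bash
# Print a few well-known instance metadata categories for debugging.
set -euo pipefail
for category in ami-id instance-id instance-type; do
  echo "${category}: $(curl -fsSL "http://169.254.169.254/latest/meta-data/${category}")"
done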
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_8-cuda11_3 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-cuda11_3-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_8-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-cuda11_3-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-cuda11_3-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_8-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-cuda11_5-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_8-cuda11_5 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-cuda11_5-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-cuda11_5-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_8-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-cuda11_5-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-cuda11_5-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_8-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-rocm4_3_1-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.3.1 - GPU_ARCH_VERSION: 4.3.1 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.3.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_8-rocm4_3_1 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-rocm4_3_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-rocm4_3_1-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.3.1 - GPU_ARCH_VERSION: 4.3.1 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.3.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_8-rocm4_3_1 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-rocm4_3_1-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-rocm4_3_1-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.3.1 - GPU_ARCH_VERSION: 4.3.1 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.3.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_8-rocm4_3_1 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-rocm4_5_2-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.5.2 - GPU_ARCH_VERSION: 4.5.2 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.5.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_8-rocm4_5_2 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-rocm4_5_2-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-rocm4_5_2-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.5.2 - GPU_ARCH_VERSION: 4.5.2 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.5.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_8-rocm4_5_2 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-rocm4_5_2-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-rocm4_5_2-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.5.2 - GPU_ARCH_VERSION: 4.5.2 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.5.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_8-rocm4_5_2 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_9-cpu - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-cpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-cpu-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_9-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-cpu-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_9-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-cuda10_2-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_9-cuda10_2 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
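The "Build PyTorch binary" steps rely on docker's bare "-e NAME" form: each listed variable is forwarded from the runner's environment into the build container without its value appearing on the command line. A tiny standalone illustration of that pass-through (the image and command are placeholders, not the workflow's real build invocation):

    #!/usr/bin/env bash
    # Illustration of "-e VAR" environment pass-through, not the real build step.
    set -euo pipefail

    export PACKAGE_TYPE=manywheel DESIRED_PYTHON=3.9   # values the runner would have set

    # docker copies the host values of the named variables into the container.
    docker run --rm -e PACKAGE_TYPE -e DESIRED_PYTHON alpine:3.15 \
      sh -c 'echo "would build ${PACKAGE_TYPE} wheels for python ${DESIRED_PYTHON}"'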
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-cuda10_2-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-cuda10_2-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_9-cuda10_2 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
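The GPU test jobs thread "--gpus all" through a GPU_FLAG variable written to GITHUB_ENV and expand it unquoted (hence the shellcheck disables), so CPU-only variants can leave it empty without changing the docker invocation. A minimal sketch of that optional-flag pattern (the CUDA image tag is only an example, and actually running it needs the NVIDIA container toolkit that install_nvidia_utils_linux.sh sets up):

    #!/usr/bin/env bash
    # Sketch of the optional GPU_FLAG expansion; requires an NVIDIA runtime to run for real.
    set -euo pipefail

    GPU_FLAG="--gpus all"   # a CPU-only job would simply leave this unset

    # shellcheck disable=SC2086  # intentional word-splitting of the (possibly empty) flag
    docker run --rm ${GPU_FLAG:-} nvidia/cuda:11.3.1-base-ubuntu20.04 nvidia-smi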
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-cuda10_2-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-cuda10_2-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_9-cuda10_2 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-cuda11_1-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_9-cuda11_1 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-cuda11_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-cuda11_1-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_9-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-cuda11_1-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-cuda11_1-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_9-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-cuda11_3-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_9-cuda11_3 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-cuda11_3-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_9-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-cuda11_3-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-cuda11_3-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_9-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-cuda11_5-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_9-cuda11_5 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-cuda11_5-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-cuda11_5-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_9-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-cuda11_5-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-cuda11_5-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_9-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-rocm4_3_1-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.3.1 - GPU_ARCH_VERSION: 4.3.1 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.3.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_9-rocm4_3_1 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-rocm4_3_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-rocm4_3_1-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.3.1 - GPU_ARCH_VERSION: 4.3.1 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.3.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_9-rocm4_3_1 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-rocm4_3_1-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-rocm4_3_1-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.3.1 - GPU_ARCH_VERSION: 4.3.1 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.3.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_9-rocm4_3_1 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-rocm4_5_2-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.5.2 - GPU_ARCH_VERSION: 4.5.2 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.5.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_9-rocm4_5_2 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-rocm4_5_2-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-rocm4_5_2-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.5.2 - GPU_ARCH_VERSION: 4.5.2 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.5.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_9-rocm4_5_2 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-rocm4_5_2-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-rocm4_5_2-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.5.2 - GPU_ARCH_VERSION: 4.5.2 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.5.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_9-rocm4_5_2 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_10-cpu - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-cpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-cpu-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_10-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-cpu-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_10-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-cuda10_2-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_10-cuda10_2 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-cuda10_2-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-cuda10_2-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_10-cuda10_2 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-cuda10_2-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-cuda10_2-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_10-cuda10_2 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-cuda11_1-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_10-cuda11_1 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-cuda11_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-cuda11_1-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
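    # Test-job flow from here: download the wheel built above from S3,
    # re-checkout pytorch and pytorch/builder, install the NVIDIA driver and
    # nvidia-docker runtime, then generate and run the binary test script
    # (/run.sh) inside the same builder image.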
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_10-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-cuda11_1-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-cuda11_1-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
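    # Upload-job flow from here: download the tested wheel from S3, enable a
    # real upload (DRY_RUN=disabled) only for nightly or non-ciflow tag pushes,
    # switch UPLOAD_CHANNEL to "test" for release-candidate tags, then run
    # .circleci/scripts/binary_upload.sh inside the miniconda image.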
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_10-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-cuda11_3-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_10-cuda11_3 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-cuda11_3-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_10-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-cuda11_3-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-cuda11_3-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_10-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-cuda11_5-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_10-cuda11_5 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-cuda11_5-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-cuda11_5-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_10-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-cuda11_5-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-cuda11_5-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_10-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-rocm4_3_1-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.3.1 - GPU_ARCH_VERSION: 4.3.1 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.3.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_10-rocm4_3_1 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-rocm4_3_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-rocm4_3_1-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.3.1 - GPU_ARCH_VERSION: 4.3.1 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.3.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_10-rocm4_3_1 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-rocm4_3_1-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-rocm4_3_1-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.3.1 - GPU_ARCH_VERSION: 4.3.1 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.3.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_10-rocm4_3_1 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-rocm4_5_2-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.5.2 - GPU_ARCH_VERSION: 4.5.2 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.5.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_10-rocm4_5_2 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-rocm4_5_2-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-rocm4_5_2-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.5.2 - GPU_ARCH_VERSION: 4.5.2 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.5.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_10-rocm4_5_2 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
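Note on the "Test PyTorch binary" step above: it starts one long-lived detached container and then drives it with several `docker exec` calls, rather than a single `docker run`. A reduced sketch of that pattern; the environment variables (GITHUB_WORKSPACE, RUNNER_TEMP, DOCKER_IMAGE, PYTORCH_ROOT, BINARY_ENV_FILE) are assumed to be set by the job env, and most of the forwarded `-e` flags are omitted for brevity:

    #!/usr/bin/env bash
    set -x

    # Start a detached, TTY-backed container that keeps running so that
    # multiple commands can be exec'd into the same environment.
    container_name=$(docker run \
      --tty \
      --detach \
      -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \
      -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \
      -w / \
      "${DOCKER_IMAGE}")

    # Populate the binary build environment, generate the test script, then run it.
    docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh"
    docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh"
    docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh"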
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-rocm4_5_2-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-rocm4_5_2-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.5.2 - GPU_ARCH_VERSION: 4.5.2 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.5.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
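Note on the "Log in to ECR" step above: it derives the registry host from the caller identity and pipes a short-lived password into `docker login`. A standalone sketch, assuming the AWS CLI is configured and AWS_DEFAULT_REGION is set as in the workflow env:

    #!/usr/bin/env bash
    set -euo pipefail

    # Resolve the account id of the current credentials; the workflows above
    # grep/cut the JSON output directly rather than using jq.
    AWS_ACCOUNT_ID=$(aws sts get-caller-identity | grep Account | cut -f4 -d\")

    # Log in to the account's private ECR registry with a temporary password.
    aws ecr get-login-password --region "$AWS_DEFAULT_REGION" \
      | docker login --username AWS \
          --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"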
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_10-rocm4_5_2 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
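Note on the upload steps above: DRY_RUN is only disabled for nightly or tag pushes, and release-candidate tags are routed to the "test" upload channel. A sketch of just the channel-selection logic, assuming GITHUB_REF_NAME holds the pushed tag (release-candidate tags look like v1.11.0-rc1):

    #!/usr/bin/env bash
    # Route release-candidate tags (e.g. v1.11.0-rc1) to the "test" channel;
    # any other ref keeps whatever channel the upload script defaults to.
    if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
      echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV"
    fi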
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml deleted file mode 100644 index ee483708dfcd..000000000000 --- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml +++ /dev/null @@ -1,540 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-bionic-cuda10.2-py3.9-gcc7 - -on: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cuda/*' - - 'ciflow/linux/*' - - 'ciflow/slow/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: linux-bionic-cuda10.2-py3.9-gcc7 - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: linux-bionic-cuda10.2-py3.9-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v 
"${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
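Note on the "Calculate docker image tag" and "Check if image should be built" steps above: the CI image tag is the git tree hash of `.circleci/docker`, and a rebuild only happens when that hash differs from the merge base. A condensed sketch of that decision, assuming DOCKER_IMAGE_BASE and BASE_REVISION are set as in the workflow env; the on-base-branch case (using the parent commit) and the step-output plumbing are simplified to a plain echo:

    #!/usr/bin/env bash
    set -euxo pipefail

    # The image tag is the git tree hash of the Docker build context, so any
    # change under .circleci/docker produces a new tag.
    DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
    DOCKER_IMAGE="${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"

    # If the image already exists in the registry there is nothing to do.
    if docker manifest inspect "${DOCKER_IMAGE}"; then
      exit 0
    fi

    # Otherwise compare against the merge base: same tag but no image means the
    # previously published image has gone missing and the job should fail loudly.
    MERGE_BASE=$(git merge-base HEAD "${BASE_REVISION}")
    PREVIOUS_DOCKER_TAG=$(git rev-parse "${MERGE_BASE}:.circleci/docker")
    if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then
      echo "ERROR: previous image is unavailable for the merge-base of this branch"
      exit 1
    fi

    # Any other mismatch means the context changed and the image must be rebuilt.
    echo "rebuild=yes"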
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.4xlarge.nvidia.gpu - ENABLE_DISTRIBUTED_TEST: 1 - ENABLE_JIT_LEGACY_TEST: 1 - ENABLE_MULTIGPU_TEST: 1 - ENABLE_NOGPU_NO_AVX_TEST: 1 - ENABLE_NOGPU_NO_AVX2_TEST: 1 - ENABLE_SLOW_TEST: 1 - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 
- run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop building test binaries 
as part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - ${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 hours or until ssh 
sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-bionic-py3.7-clang9.yml b/.github/workflows/generated-linux-bionic-py3.7-clang9.yml deleted file mode 100644 index 91e4ff63e4c6..000000000000 --- a/.github/workflows/generated-linux-bionic-py3.7-clang9.yml +++ /dev/null @@ -1,542 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-bionic-py3.7-clang9 - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cpu/*' - - 'ciflow/linux/*' - - 'ciflow/noarch/*' - - 'ciflow/trunk/*' - - 'ciflow/xla/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: linux-bionic-py3.7-clang9 - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-py3.7-clang9 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: linux-bionic-py3.7-clang9-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: linux-bionic-py3.7-clang9-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin 
"$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
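Note on the recurring "Chown workspace" steps above: files written by the jenkins user inside the build container are not owned by the runner user on the host, so each job resets ownership through a throwaway Alpine container. A minimal sketch of that cleanup, assuming ALPINE_IMAGE points at the mirror configured in the workflow env:

    #!/usr/bin/env bash
    # Reset ownership of everything under the current directory so later steps
    # (and the next job on this self-hosted runner) can modify or delete it.
    docker run --rm \
      -v "$(pwd)":/v \
      -w /v \
      "${ALPINE_IMAGE}" \
      chown -R "$(id -u):$(id -g)" .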
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.2xlarge - ENABLE_DISTRIBUTED_TEST: '' - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: 1 - ENABLE_NOARCH_TEST: 1 - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: linux-bionic-py3.7-clang9-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - 
AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop building test binaries as part of 
the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - ${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: linux-bionic-py3.7-clang9-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 hours or until ssh sessions have drained - 
# Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-bionic-rocm4.5-py3.7.yml b/.github/workflows/generated-linux-bionic-rocm4.5-py3.7.yml deleted file mode 100644 index 5f37b48464b8..000000000000 --- a/.github/workflows/generated-linux-bionic-rocm4.5-py3.7.yml +++ /dev/null @@ -1,512 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-bionic-rocm4.5-py3.7 - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/linux/*' - - 'ciflow/rocm/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: linux-bionic-rocm4.5-py3.7 - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-rocm4.5-py3.7 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: linux-bionic-rocm4.5-py3.7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: linux-bionic-rocm4.5-py3.7-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown 
workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
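Note on the "Preserve github env variables for use in docker" step above: every GITHUB_* variable is dumped to a temp file that the later `docker run` picks up via `--env-file`, so code inside the container sees the same GitHub Actions context as the host. A short sketch of both halves of that handoff; "some/image" and the in-container command are placeholders, and GITHUB_RUN_ID / GITHUB_WORKSPACE are assumed to be provided by GitHub Actions:

    #!/usr/bin/env bash
    # Step 1: snapshot the GitHub-provided environment on the host runner.
    env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"

    # Step 2: re-inject it when launching the build container, alongside any
    # explicitly forwarded variables.
    docker run --rm \
      --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
      -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
      -w /var/lib/jenkins/workspace \
      some/image \
      sh -c 'env | grep "^GITHUB"'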
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.rocm.gpu - ENABLE_DISTRIBUTED_TEST: 1 - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.rocm.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.rocm.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: linux-bionic-rocm4.5-py3.7-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - steps: - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: Set DOCKER_HOST - run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" - - name: Runner health check system info - if: always() - run: | - cat /etc/os-release || true - cat /etc/apt/sources.list.d/rocm.list || true - cat /opt/rocm/.info/version || true - whoami - - name: Runner health check rocm-smi - if: always() - run: | - rocm-smi - - name: Runner health check rocminfo - if: always() - run: | - rocminfo - - name: Runner health check GPU count - if: always() - run: 
| - ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') - if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then - echo "Failed to detect GPUs on the runner" - exit 1 - fi - - name: Runner health check disconnect on failure - if: ${{ failure() }} - run: | - killall runsvc.sh - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: ROCm set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'rocm') && !contains(matrix.config, 'nogpu') }} - run: | - echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop building test binaries as part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - # jenkins user does not have write permission to mounted workspace; work-around by copying within container to jenkins home - docker exec -t "${container_name}" sh -c "cd .. 
&& cp -R workspace pytorch && cd pytorch && pip install dist/*.whl && ${TEST_COMMAND}" - # copy test results back to the mounted workspace, needed sudo, resulting permissions were correct - docker exec -t "${container_name}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test" - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: actions/upload-artifact@v2 - name: Store Test Downloaded JSONs on Github - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: actions/upload-artifact@v2 - name: Store Test Reports on Github - if: always() - with: - name: test-reports - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: linux-bionic-rocm4.5-py3.7-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-docs-push.yml b/.github/workflows/generated-linux-docs-push.yml deleted file mode 100644 index 0ad84fdef3e6..000000000000 --- a/.github/workflows/generated-linux-docs-push.yml +++ /dev/null @@ -1,392 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-docs-push - -on: - push: - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - - 'ciflow/all/*' - - 'ciflow/cpu/*' - - 'ciflow/linux/*' - - 'ciflow/scheduled/*' - schedule: - - cron: 0 0 * * * - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: linux-docs-push - DOCKER_IMAGE_BASE: 
308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.7-gcc5.4 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: linux-docs-push-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: linux-docs-push-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
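The three-attempt retry helper wrapped around the ECR login and image pulls above is the workflow's only defence against transient registry and network failures. A minimal standalone sketch of the same pattern, runnable outside of Actions (the pulled image is just an illustration):

    #!/usr/bin/env bash
    # Re-run a command up to three times, sleeping 1s and then 2s between attempts;
    # the exit status of the final attempt is what the caller sees.
    retry () {
      "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
    }

    # Example usage: retry a pull that may hit a transient registry error.
    retry docker pull alpine:3.15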
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
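The "Calculate docker image tag" and "Check if image should be built" steps above hinge on one trick: git rev-parse HEAD:.circleci/docker returns the tree hash of the Docker build context, so the image tag only changes when something under that directory changes. A sketch of the same skip-if-present check, with a placeholder registry path instead of the real ECR base:

    #!/usr/bin/env bash
    set -euo pipefail
    # The tag is the git tree hash of the Docker build context directory.
    DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
    DOCKER_IMAGE="example.registry.io/pytorch-ci:${DOCKER_TAG}"

    # If a manifest with this tag already exists, the expensive image rebuild is skipped.
    if docker manifest inspect "${DOCKER_IMAGE}" >/dev/null 2>&1; then
      echo "image ${DOCKER_IMAGE} already exists, skipping rebuild"
    else
      echo "image ${DOCKER_IMAGE} not found, rebuild required"
    fi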
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af - build-docs: - runs-on: linux.2xlarge - timeout-minutes: 240 - strategy: - matrix: - docs_type: [cpp, python] - needs: [build] - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - DOCS_TYPE: ${{ matrix.docs_type }} - WITH_PUSH: ${{ github.event_name == 'schedule' || startsWith(github.event.ref, 'refs/tags/v') }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
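Build outputs travel between jobs as a single S3-hosted zip: the build job's "Archive artifacts into zip" step bundles the wheel and test binaries, and this docs job, like the test jobs, restores the same layout after downloading the archive from S3. The essential round trip, with the upload/download actions stripped away:

    #!/usr/bin/env bash
    # Build job: -1 trades compression ratio for speed, since the archive only
    # needs to survive for the duration of one workflow run.
    zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json

    # Test / docs job: unpack into a fresh workspace and install the built wheel.
    unzip -o artifacts.zip
    pip install dist/*.whl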
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Generate netrc (only for docs-push) - if: ${{ github.event_name == 'schedule' || startsWith(github.event.ref, 'refs/tags/v') }} - env: - GITHUB_PYTORCHBOT_TOKEN: ${{ secrets.GH_PYTORCHBOT_TOKEN }} - run: | - # set credentials for https pushing - echo "machine github.com" > "${RUNNER_TEMP}/.netrc" - echo "login pytorchbot" >> "${RUNNER_TEMP}/.netrc" - echo "password ${GITHUB_PYTORCHBOT_TOKEN}" >> "${RUNNER_TEMP}/.netrc" - - name: Build ${{ matrix.docs_type }} docs - run: | - set -ex - time docker pull "${DOCKER_IMAGE}" > /dev/null - # Convert refs/tags/v1.12.0rc3 into 1.12 - if [[ "${GITHUB_REF}" =~ ^refs/tags/v([0-9]+\.[0-9]+)\.* ]]; then - target="${BASH_REMATCH[1]}" - else - target="master" - fi - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e IN_CI \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SHA1="$GITHUB_SHA" \ - -e DOCS_VERSION="${target}" \ - -e DOCS_TYPE \ - -e PR_LABELS \ - -e WITH_PUSH \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${RUNNER_TEMP}/.netrc":/var/lib/jenkins/.netrc \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" bash -c "sudo chown -R jenkins . && pip install dist/*.whl && ./.circleci/scripts/${DOCS_TYPE}_doc_push_script.sh" - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
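The docs build above derives its publish target from the Git ref: a release-candidate tag such as refs/tags/v1.12.0rc3 yields a DOCS_VERSION of 1.12, and anything else falls back to master. The conversion in isolation (GITHUB_REF is assigned here only for illustration; on a runner it is provided automatically):

    #!/usr/bin/env bash
    GITHUB_REF="refs/tags/v1.12.0rc3"   # illustrative value
    if [[ "${GITHUB_REF}" =~ ^refs/tags/v([0-9]+\.[0-9]+)\.* ]]; then
      target="${BASH_REMATCH[1]}"       # captures the major.minor pair
    else
      target="master"
    fi
    echo "DOCS_VERSION=${target}"       # prints: DOCS_VERSION=1.12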
- - uses: seemethere/upload-artifact-s3@v3 - name: Upload Python Docs Preview - if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'python' }} - with: - retention-days: 14 - s3-bucket: doc-previews - if-no-files-found: error - path: pytorch.github.io/docs/master/ - s3-prefix: pytorch/${{ github.event.pull_request.number }} - - uses: seemethere/upload-artifact-s3@v3 - name: Upload C++ Docs Preview - if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'cpp' }} - with: - retention-days: 14 - if-no-files-found: error - s3-bucket: doc-previews - path: cppdocs/ - s3-prefix: pytorch/${{ github.event.pull_request.number }}/cppdocs diff --git a/.github/workflows/generated-linux-docs.yml b/.github/workflows/generated-linux-docs.yml deleted file mode 100644 index 5709b1a7eef7..000000000000 --- a/.github/workflows/generated-linux-docs.yml +++ /dev/null @@ -1,382 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-docs - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cpu/*' - - 'ciflow/docs/*' - - 'ciflow/linux/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: linux-docs - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.7-gcc5.4 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: linux-docs-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: linux-docs-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown 
workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
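Two hand-off mechanisms keep these multi-step jobs glued together: appending NAME=value lines to the file named by GITHUB_ENV makes a value visible as an environment variable in every later step of the same job, while the ::set-output command exposes it as steps.<id>.outputs.<name>, which the job-level outputs block re-exports so that downstream jobs can read needs.build.outputs.docker_image. The calculate-tag step earlier in this job boils down to the following sketch (DOCKER_IMAGE_BASE and GITHUB_ENV are supplied by the workflow environment):

    #!/usr/bin/env bash
    # Same-job hand-off: later steps see DOCKER_TAG and DOCKER_IMAGE as env vars.
    DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
    echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}"
    echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}"

    # Cross-job hand-off: surfaced as steps.calculate-tag.outputs.docker_image and,
    # via the job's outputs block, as needs.build.outputs.docker_image downstream.
    echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"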
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af - build-docs: - runs-on: linux.2xlarge - timeout-minutes: 240 - strategy: - matrix: - docs_type: [cpp, python] - needs: [build] - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - DOCS_TYPE: ${{ matrix.docs_type }} - WITH_PUSH: ${{ github.event_name == 'schedule' || startsWith(github.event.ref, 'refs/tags/v') }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
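The recurring "Chown workspace" step exists because the build and docs commands run as the jenkins user inside the CI container, so the bind-mounted checkout can end up owned by a different uid than the self-hosted runner's account; a throwaway Alpine container hands ownership back before the workspace is cleaned. The pattern reduced to its core (the plain alpine image name stands in for the mirrored ALPINE_IMAGE):

    #!/usr/bin/env bash
    # Re-own every file in the current workspace as the host user, using a container
    # so that no root-equivalent tooling is needed on the runner itself.
    docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .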
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Build ${{ matrix.docs_type }} docs - run: | - set -ex - time docker pull "${DOCKER_IMAGE}" > /dev/null - # Convert refs/tags/v1.12.0rc3 into 1.12 - if [[ "${GITHUB_REF}" =~ ^refs/tags/v([0-9]+\.[0-9]+)\.* ]]; then - target="${BASH_REMATCH[1]}" - else - target="master" - fi - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e IN_CI \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SHA1="$GITHUB_SHA" \ - -e DOCS_VERSION="${target}" \ - -e DOCS_TYPE \ - -e PR_LABELS \ - -e WITH_PUSH \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" bash -c "sudo chown -R jenkins . && pip install dist/*.whl && ./.circleci/scripts/${DOCS_TYPE}_doc_push_script.sh" - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - uses: seemethere/upload-artifact-s3@v3 - name: Upload Python Docs Preview - if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'python' }} - with: - retention-days: 14 - s3-bucket: doc-previews - if-no-files-found: error - path: pytorch.github.io/docs/master/ - s3-prefix: pytorch/${{ github.event.pull_request.number }} - - uses: seemethere/upload-artifact-s3@v3 - name: Upload C++ Docs Preview - if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'cpp' }} - with: - retention-days: 14 - if-no-files-found: error - s3-bucket: doc-previews - path: cppdocs/ - s3-prefix: pytorch/${{ github.event.pull_request.number }}/cppdocs diff --git a/.github/workflows/generated-linux-vulkan-bionic-py3.7-clang9.yml b/.github/workflows/generated-linux-vulkan-bionic-py3.7-clang9.yml deleted file mode 100644 index 58f8cc3d0563..000000000000 --- a/.github/workflows/generated-linux-vulkan-bionic-py3.7-clang9.yml +++ /dev/null @@ -1,541 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-vulkan-bionic-py3.7-clang9 - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cpu/*' - - 'ciflow/linux/*' - - 'ciflow/trunk/*' - - 'ciflow/vulkan/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: linux-vulkan-bionic-py3.7-clang9 - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-py3.7-clang9 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: linux-vulkan-bionic-py3.7-clang9-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: linux-vulkan-bionic-py3.7-clang9-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region 
"$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
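In these workflows a generate-test-matrix job emits a JSON matrix that the test job consumes through fromJson(...), fanning the test suite out across one runner per (config, shard) pair. The real matrix comes from .github/scripts/generate_pytorch_test_matrix.py and is not reproduced here; the sketch below only illustrates the fields the test job reads (matrix.config, matrix.shard, matrix.num_shards, matrix.runner) and assumes an include-style matrix layout:

    #!/usr/bin/env bash
    # Illustrative only: emit a two-shard matrix as a step output in the shape the
    # downstream test job expects to read via fromJson(...).
    matrix='{"include":[{"config":"default","shard":1,"num_shards":2,"runner":"linux.2xlarge"},{"config":"default","shard":2,"num_shards":2,"runner":"linux.2xlarge"}]}'
    echo "::set-output name=matrix::${matrix}"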
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.2xlarge - ENABLE_DISTRIBUTED_TEST: '' - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 1 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: linux-vulkan-bionic-py3.7-clang9-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - 
run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop building test binaries 
as part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - ${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: linux-vulkan-bionic-py3.7-clang9-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 hours or until ssh 
sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7-bazel-test.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7-bazel-test.yml deleted file mode 100644 index e1dc026af70b..000000000000 --- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7-bazel-test.yml +++ /dev/null @@ -1,336 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/bazel_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-xenial-cuda11.3-py3.7-gcc7-bazel-test - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/bazel/*' - - 'ciflow/cpu/*' - - 'ciflow/linux/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: linux-xenial-cuda11.3-py3.7-gcc7-bazel-test - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: linux-xenial-cuda11.3-py3.7-gcc7-bazel-test-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - # building and testing in a single job since bazel runs only small subset of tests - build-and-test: - runs-on: linux.2xlarge - env: - JOB_BASE_NAME: linux-xenial-cuda11.3-py3.7-gcc7-bazel-test-build-and-test - NUM_TEST_SHARDS: 1 - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } 
- retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - name: Output disk space left - run: | - sudo df -H - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Build - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e PR_LABELS \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && sudo chown -R jenkins /dev && .jenkins/pytorch/build.sh' - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - # The artifact file is created inside docker container, which contains the result binaries. - # Now unpackage it into the project folder. The subsequent script will scan project folder - # to locate result binaries and report their sizes. 
- # If artifact file is not provided it assumes that the project folder has been mounted in - # the docker during build and already contains the result binaries, so this step can be skipped. - export ARTIFACTS= - if [ -n "${ARTIFACTS}" ]; then - tar xf "${ARTIFACTS}" -C "${GITHUB_WORKSPACE}" - cd "${GITHUB_WORKSPACE}" - fi - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - ANDROID_BUILD_TYPE= - export ANDROID_BUILD_TYPE - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba "android" || exit 0 - - name: Test - # Time out the test phase after 3.5 hours - timeout-minutes: 210 - run: | - # detached container should get cleaned up by teardown_ec2_linux - export SHARD_NUMBER=0 - # TODO: Stop building test binaries as part of the build phase - # Make sure we copy test results from bazel-testlogs symlink to - # a regular directory ./test/test-reports - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e SHARD_NUMBER \ - -e NUM_TEST_SHARDS \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && sudo chown -R jenkins /dev && .jenkins/pytorch/test.sh && cp -Lr ./bazel-testlogs ./test/test-reports' - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: 'bazel-${{ github.job }}' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: 'bazel-${{ github.job }}' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: linux-xenial-cuda11.3-py3.7-gcc7-bazel-test-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7-no-ops.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7-no-ops.yml deleted file mode 100644 index 7a51acf31e11..000000000000 --- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7-no-ops.yml +++ /dev/null @@ -1,248 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-xenial-cuda11.3-py3.7-gcc7-no-ops - -on: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cuda/*' - - 'ciflow/linux/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: linux-xenial-cuda11.3-py3.7-gcc7-no-ops - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: linux-xenial-cuda11.3-py3.7-gcc7-no-ops-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: linux-xenial-cuda11.3-py3.7-gcc7-no-ops-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v 
"$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7.yml deleted file mode 100644 index 4dd594483b8e..000000000000 --- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7.yml +++ /dev/null @@ -1,540 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-xenial-cuda11.3-py3.7-gcc7 - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cuda/*' - - 'ciflow/linux/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: linux-xenial-cuda11.3-py3.7-gcc7 - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: linux-xenial-cuda11.3-py3.7-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: linux-xenial-cuda11.3-py3.7-gcc7-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint 
for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.4xlarge.nvidia.gpu - ENABLE_DISTRIBUTED_TEST: 1 - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: linux-xenial-cuda11.3-py3.7-gcc7-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - 
AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop 
building test binaries as part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - ${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: linux-xenial-cuda11.3-py3.7-gcc7-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 
hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-xenial-py3-clang5-mobile-build.yml b/.github/workflows/generated-linux-xenial-py3-clang5-mobile-build.yml deleted file mode 100644 index df0dd5fb57f9..000000000000 --- a/.github/workflows/generated-linux-xenial-py3-clang5-mobile-build.yml +++ /dev/null @@ -1,238 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-xenial-py3-clang5-mobile-build - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/linux/*' - - 'ciflow/mobile/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: linux-xenial-py3-clang5-mobile-build - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-asan - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: linux-xenial-py3-clang5-mobile-build-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: linux-xenial-py3-clang5-mobile-build-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login 
--username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-xenial-py3-clang5-mobile-custom-build-static.yml b/.github/workflows/generated-linux-xenial-py3-clang5-mobile-custom-build-static.yml deleted file mode 100644 index 29a14fd9f418..000000000000 --- a/.github/workflows/generated-linux-xenial-py3-clang5-mobile-custom-build-static.yml +++ /dev/null @@ -1,238 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-xenial-py3-clang5-mobile-custom-build-static - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/linux/*' - - 'ciflow/mobile/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: linux-xenial-py3-clang5-mobile-custom-build-static - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: linux-xenial-py3-clang5-mobile-custom-build-static-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: linux-xenial-py3-clang5-mobile-custom-build-static-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata 
ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-xenial-py3.7-clang7-asan.yml b/.github/workflows/generated-linux-xenial-py3.7-clang7-asan.yml deleted file mode 100644 index 5b538547df1b..000000000000 --- a/.github/workflows/generated-linux-xenial-py3.7-clang7-asan.yml +++ /dev/null @@ -1,541 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-xenial-py3.7-clang7-asan - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cpu/*' - - 'ciflow/linux/*' - - 'ciflow/sanitizers/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: linux-xenial-py3.7-clang7-asan - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang7-asan - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: linux-xenial-py3.7-clang7-asan-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: linux-xenial-py3.7-clang7-asan-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log 
in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.2xlarge - ENABLE_DISTRIBUTED_TEST: '' - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 3 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: linux-xenial-py3.7-clang7-asan-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: 
| - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop building test binaries as 
part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - ${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: linux-xenial-py3.7-clang7-asan-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 hours or until ssh sessions 
have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-xenial-py3.7-clang7-onnx.yml b/.github/workflows/generated-linux-xenial-py3.7-clang7-onnx.yml deleted file mode 100644 index 0005308beec3..000000000000 --- a/.github/workflows/generated-linux-xenial-py3.7-clang7-onnx.yml +++ /dev/null @@ -1,541 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-xenial-py3.7-clang7-onnx - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cpu/*' - - 'ciflow/linux/*' - - 'ciflow/onnx/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: linux-xenial-py3.7-clang7-onnx - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang7-onnx - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: linux-xenial-py3.7-clang7-onnx-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: linux-xenial-py3.7-clang7-onnx-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin 
"$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.2xlarge - ENABLE_DISTRIBUTED_TEST: '' - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: linux-xenial-py3.7-clang7-onnx-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: 
| - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop building test binaries as 
part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - ${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: linux-xenial-py3.7-clang7-onnx-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 hours or until ssh sessions 
have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-xenial-py3.7-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.7-gcc5.4.yml deleted file mode 100644 index 5778fe613dbc..000000000000 --- a/.github/workflows/generated-linux-xenial-py3.7-gcc5.4.yml +++ /dev/null @@ -1,540 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-xenial-py3.7-gcc5.4 - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cpu/*' - - 'ciflow/linux/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: linux-xenial-py3.7-gcc5.4 - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.7-gcc5.4 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: linux-xenial-py3.7-gcc5.4-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown 
workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.2xlarge - ENABLE_DISTRIBUTED_TEST: 1 - ENABLE_JIT_LEGACY_TEST: 1 - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: 1 - ENABLE_BACKWARDS_COMPAT_TEST: 1 - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - 
AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop building test binaries as part of 
the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - ${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 hours or until ssh sessions have drained - 
# Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-xenial-py3.7-gcc7-no-ops.yml b/.github/workflows/generated-linux-xenial-py3.7-gcc7-no-ops.yml deleted file mode 100644 index e9f11d265c7b..000000000000 --- a/.github/workflows/generated-linux-xenial-py3.7-gcc7-no-ops.yml +++ /dev/null @@ -1,249 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-xenial-py3.7-gcc7-no-ops - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cpu/*' - - 'ciflow/linux/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: linux-xenial-py3.7-gcc7-no-ops - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.7-gcc7 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: linux-xenial-py3.7-gcc7-no-ops-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: linux-xenial-py3.7-gcc7-no-ops-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" 
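The "Log in to ECR" step above (and the docker pull steps that follow in this job) each inline the same tiny retry helper. Below is a minimal, self-contained bash sketch of that pattern; the docker pull at the end is only an illustrative use, not an additional workflow step.

#!/usr/bin/env bash
set -euo pipefail

# Retry helper used throughout these workflows: run the command, and on
# failure try again after 1s and then after 2s before giving up.
retry () {
  "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
}

# Illustrative use: tolerate transient registry hiccups when pulling an image.
retry docker pull "${ALPINE_IMAGE}"
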
- - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-xenial-py3.7-gcc7.yml b/.github/workflows/generated-linux-xenial-py3.7-gcc7.yml deleted file mode 100644 index 1bb791a329b3..000000000000 --- a/.github/workflows/generated-linux-xenial-py3.7-gcc7.yml +++ /dev/null @@ -1,540 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-xenial-py3.7-gcc7 - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cpu/*' - - 'ciflow/linux/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: linux-xenial-py3.7-gcc7 - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.7-gcc7 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: linux-xenial-py3.7-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: linux-xenial-py3.7-gcc7-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see 
https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.2xlarge - ENABLE_DISTRIBUTED_TEST: 1 - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: linux-xenial-py3.7-gcc7-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - 
AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop building test binaries as part of 
the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - ${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: linux-xenial-py3.7-gcc7-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 hours or until ssh sessions have drained - # 
Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-macos-10-15-py3-arm64.yml b/.github/workflows/generated-macos-10-15-py3-arm64.yml deleted file mode 100644 index ea97b3b9facf..000000000000 --- a/.github/workflows/generated-macos-10-15-py3-arm64.yml +++ /dev/null @@ -1,87 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/macos_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: macos-10-15-py3-arm64 - -on: - push: - branches: - - master - - release/* - tags: - - 'ciflow/all/*' - - 'ciflow/macos/*' - - 'ciflow/trunk/*' - workflow_dispatch: - -# For setup-miniconda, see https://github.com/conda-incubator/setup-miniconda/issues/179 -defaults: - run: - shell: bash -e -l {0} -env: - BUILD_ENVIRONMENT: macos-10-15-py3-arm64 - COMPACT_JOB_NAME: macos-10-15-py3-arm64 - IN_CI: 1 - IS_GHA: 1 - PYTORCH_RETRY_TEST_CASES: 1 - - -jobs: - - build: - runs-on: macos-10.15 - env: - JOB_BASE_NAME: macos-10-15-py3-arm64 - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Setup miniconda - uses: conda-incubator/setup-miniconda@v2 - with: - auto-update-conda: true - python-version: 3.8 - activate-environment: build - - name: Install macOS homebrew dependencies - run: | - # Install dependencies - brew install libomp - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Build - run: | - echo "CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"}" >> "${GITHUB_ENV}" - .jenkins/pytorch/macos-build.sh - - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ - - uses: actions/upload-artifact@v2 - name: Store PyTorch Build Artifacts on GHA - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - -concurrency: - group: macos-10-15-py3-arm64-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true diff --git a/.github/workflows/generated-macos-10-15-py3-lite-interpreter-x86-64.yml 
b/.github/workflows/generated-macos-10-15-py3-lite-interpreter-x86-64.yml deleted file mode 100644 index c07454967691..000000000000 --- a/.github/workflows/generated-macos-10-15-py3-lite-interpreter-x86-64.yml +++ /dev/null @@ -1,78 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/macos_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: macos-10-15-py3-lite-interpreter-x86-64 - -on: - push: - branches: - - master - - release/* - tags: - - 'ciflow/all/*' - - 'ciflow/macos/*' - - 'ciflow/trunk/*' - workflow_dispatch: - -# For setup-miniconda, see https://github.com/conda-incubator/setup-miniconda/issues/179 -defaults: - run: - shell: bash -e -l {0} -env: - BUILD_ENVIRONMENT: macos-10-15-py3-lite-interpreter-x86-64 - COMPACT_JOB_NAME: macos-10-15-py3-lite-interpreter-x86-64 - IN_CI: 1 - IS_GHA: 1 - PYTORCH_RETRY_TEST_CASES: 1 - - # Set xcode xcode version to 12 - DEVELOPER_DIR: /Applications/Xcode_12.app/Contents/Developer - -jobs: - - build: - runs-on: macos-10.15 - env: - JOB_BASE_NAME: macos-10-15-py3-lite-interpreter-x86-64 - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Setup miniconda - uses: conda-incubator/setup-miniconda@v2 - with: - auto-update-conda: true - python-version: 3.8 - activate-environment: build - - name: Install macOS homebrew dependencies - run: | - # Install dependencies - brew install libomp - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Build - run: | - echo "CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"}" >> "${GITHUB_ENV}" - .jenkins/pytorch/macos-build.sh - - -concurrency: - group: macos-10-15-py3-lite-interpreter-x86-64-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true diff --git a/.github/workflows/generated-macos-11-py3-x86-64.yml b/.github/workflows/generated-macos-11-py3-x86-64.yml deleted file mode 100644 index 41ae3259b527..000000000000 --- a/.github/workflows/generated-macos-11-py3-x86-64.yml +++ /dev/null @@ -1,228 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/macos_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: macos-11-py3-x86-64 - -on: - push: - branches: - - master - - release/* - tags: - - 'ciflow/all/*' - - 'ciflow/macos/*' - - 'ciflow/trunk/*' - workflow_dispatch: - -# For setup-miniconda, see https://github.com/conda-incubator/setup-miniconda/issues/179 -defaults: - run: - shell: bash -e -l {0} -env: - BUILD_ENVIRONMENT: macos-11-py3-x86-64 - COMPACT_JOB_NAME: 
macos-11-py3-x86-64 - IN_CI: 1 - IS_GHA: 1 - PYTORCH_RETRY_TEST_CASES: 1 - - # Set xcode xcode version to 12.4 - DEVELOPER_DIR: /Applications/Xcode_12.4.app/Contents/Developer - -jobs: - - build: - runs-on: macos-11 - env: - JOB_BASE_NAME: macos-11-py3-x86-64 - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Setup miniconda - uses: conda-incubator/setup-miniconda@v2 - with: - auto-update-conda: true - python-version: 3.8 - activate-environment: build - - name: Install macOS homebrew dependencies - run: | - # Install dependencies - brew install libomp - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Build - run: | - echo "CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"}" >> "${GITHUB_ENV}" - .jenkins/pytorch/macos-build.sh - - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ - - uses: actions/upload-artifact@v2 - name: Store PyTorch Build Artifacts on GHA - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: macos-11 - ENABLE_DISTRIBUTED_TEST: '' - NUM_TEST_SHARDS: 2 - PR_BODY: ${{ github.event.pull_request.body }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - timeout-minutes: 240 - env: - JOB_BASE_NAME: macos-11-py3-x86-64-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - steps: - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: false - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - uses: actions/download-artifact@v2 
- name: Download PyTorch Build Artifacts from GHA - with: - name: ${{ env.BUILD_ENVIRONMENT }} - path: . - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Setup miniconda - uses: conda-incubator/setup-miniconda@v2 - with: - auto-update-conda: true - python-version: 3.8 - activate-environment: build - - name: Install macOS homebrew dependencies - run: | - # Install dependencies - brew install libomp - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - run: | - python3 -mpip install dist/*.whl - .jenkins/pytorch/macos-test.sh - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: actions/upload-artifact@v2 - name: Store Test Downloaded JSONs on Github - if: always() - with: - name: test-jsons - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: actions/upload-artifact@v2 - name: Store Test Reports on Github - if: always() - with: - name: test-reports - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: macos-11-py3-x86-64-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - -concurrency: - group: macos-11-py3-x86-64-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true diff --git a/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml b/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml new file mode 100644 index 000000000000..422416060fe6 --- /dev/null +++ b/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml @@ -0,0 +1,564 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/macos_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: macos-arm64-binary-conda + +on: +# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 + push: + # 
NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + branches: + - nightly + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/binaries/*' + - 'ciflow/binaries_conda/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BUILD_ENVIRONMENT: macos-arm64-binary-conda + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + SKIP_ALL_TESTS: 1 + CROSS_COMPILE_ARM64: 1 + +concurrency: + group: macos-arm64-binary-conda-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + conda-py3_8-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-12 + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
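The NOTE above is why the per-job paths are exported from a step rather than from workflow-level env: the runner's temp directory is only reachable from inside a job (exposed to run steps as RUNNER_TEMP). A minimal sketch of the GITHUB_ENV hand-off the next step performs; the mkdir is illustrative only.

# Anything appended to $GITHUB_ENV becomes an environment variable for all
# subsequent steps of the same job.
PYTORCH_FINAL_PACKAGE_DIR="${RUNNER_TEMP}/artifacts"
echo "PYTORCH_FINAL_PACKAGE_DIR=${PYTORCH_FINAL_PACKAGE_DIR}" >> "${GITHUB_ENV}"

# Later steps in the job can then use the variable directly, for example:
mkdir -p "${PYTORCH_FINAL_PACKAGE_DIR}"
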
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: conda-py3_8-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + conda-py3_8-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_8-cpu-build + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: conda-py3_8-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_9-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-12 + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: conda-py3_9-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + conda-py3_9-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_9-cpu-build + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: conda-py3_9-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_10-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-12 + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: conda-py3_10-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + conda-py3_10-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cpu-build + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
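[Note on the "Populate binary env" and "Install conda and dependencies" steps above] GitHub Actions applies anything appended to the file named by $GITHUB_ENV as environment variables for later steps of the same job, and prepends directories appended to $GITHUB_PATH to PATH for later steps. A minimal sketch of the mechanism, assuming a hypothetical variable name (MY_EXAMPLE_VAR) and annotating the Miniconda installer flags used above:

    # Variables appended to the $GITHUB_ENV file become environment variables in later steps of this job.
    echo "MY_EXAMPLE_VAR=${RUNNER_TEMP}/example" >> "${GITHUB_ENV}"   # MY_EXAMPLE_VAR is hypothetical
    # Directories appended to $GITHUB_PATH are prepended to PATH for later steps.
    echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
    # Miniconda installer flags: -b runs non-interactively (batch mode), -p sets the install prefix.
    /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"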
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: conda-py3_10-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
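[Note on the "Set UPLOAD_CHANNEL" step above] The channel switch is a bash glob match against the tag name. A short sketch with a hypothetical release-candidate tag, showing when UPLOAD_CHANNEL flips to "test":

    GITHUB_REF_NAME="v1.11.0-rc3"   # hypothetical tag, for illustration only; Actions normally provides this
    if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
      # Release-candidate tags publish to the "test" channel.
      echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV"
    fi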
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-macos-arm64-binary-conda.yml b/.github/workflows/generated-macos-arm64-binary-conda.yml deleted file mode 100644 index 40383e51bee6..000000000000 --- a/.github/workflows/generated-macos-arm64-binary-conda.yml +++ /dev/null @@ -1,575 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/macos_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: macos-arm64-binary-conda - -on: -# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 - push: - # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build - branches: - - nightly - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - - 'ciflow/binaries/*' - - 'ciflow/binaries_conda/*' - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - ANACONDA_USER: pytorch - AWS_DEFAULT_REGION: us-east-1 - BUILD_ENVIRONMENT: macos-arm64-binary-conda - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - IN_CI: 1 - IS_GHA: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - PR_NUMBER: ${{ github.event.pull_request.number }} - SKIP_ALL_TESTS: 1 - CROSS_COMPILE_ARM64: 1 - -concurrency: - group: macos-arm64-binary-conda-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - conda-py3_8-cpu-build: - runs-on: macos-10.15 - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.8" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: conda-py3_8-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - conda-py3_8-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cpu-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" 
chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: conda-py3_8-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_9-cpu-build: - runs-on: macos-10.15 - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.9" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: conda-py3_9-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - conda-py3_9-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cpu-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" 
chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: conda-py3_9-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_10-cpu-build: - runs-on: macos-10.15 - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.10" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: conda-py3_10-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - conda-py3_10-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cpu-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v 
"${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: conda-py3_10-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml new file mode 100644 index 000000000000..617d1e372f49 --- /dev/null +++ b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml @@ -0,0 +1,739 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/macos_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: macos-arm64-binary-wheel + +on: +# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 + push: + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + branches: + - nightly + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/binaries/*' + - 'ciflow/binaries_wheel/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BUILD_ENVIRONMENT: macos-arm64-binary-wheel + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + SKIP_ALL_TESTS: 1 + CROSS_COMPILE_ARM64: 1 + +concurrency: + group: macos-arm64-binary-wheel-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + wheel-py3_7-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-12 + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: wheel-py3_7-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + wheel-py3_7-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_7-cpu-build + env: + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
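[Note on the "Clean PyTorch checkout" / "Clean pytorch/builder checkout" steps above] git clean -fxd removes everything the checkout does not track, including ignored build outputs, which is what keeps reused checkouts from leaking artifacts between runs. A short annotated sketch of the same command:

    # -f  force removal (git refuses to clean without it under the default config)
    # -x  also remove ignored files (build outputs, caches), not just untracked ones
    # -d  recurse into untracked directories
    git clean -fxd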
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: wheel-py3_7-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + wheel-py3_8-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-12 + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
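[Note on the "Upload binaries" step above] docker run -e NAME with no value forwards NAME from the runner's environment into the container (so it is simply absent when the secret or variable was never set, as on pull_request runs), while -e NAME=value sets it explicitly. A minimal sketch under hypothetical values, using a stand-in image rather than the real upload container:

    export DRY_RUN=disabled   # hypothetical: normally written by the "Set DRY_RUN" step
    # -e DRY_RUN forwards the variable from the host environment into the container;
    # -e PKG_DIR=/artifacts sets a value explicitly; -v mounts the host artifacts directory.
    docker run --rm \
      -e DRY_RUN \
      -e PKG_DIR=/artifacts \
      -v "${RUNNER_TEMP}/artifacts:/artifacts" \
      alpine env   # "alpine" and "env" are stand-ins; this just prints the container environment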
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: wheel-py3_8-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + wheel-py3_8-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_8-cpu-build + env: + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: wheel-py3_8-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + wheel-py3_9-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-12 + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: wheel-py3_9-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + wheel-py3_9-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_9-cpu-build + env: + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: wheel-py3_9-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + wheel-py3_10-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-12 + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: wheel-py3_10-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + wheel-py3_10-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_10-cpu-build + env: + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: wheel-py3_10-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-macos-arm64-binary-wheel.yml b/.github/workflows/generated-macos-arm64-binary-wheel.yml deleted file mode 100644 index cb407a313425..000000000000 --- a/.github/workflows/generated-macos-arm64-binary-wheel.yml +++ /dev/null @@ -1,754 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/macos_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: macos-arm64-binary-wheel - -on: -# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 - push: - # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build - branches: - - nightly - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - - 'ciflow/binaries/*' - - 'ciflow/binaries_wheel/*' - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - ANACONDA_USER: pytorch - AWS_DEFAULT_REGION: us-east-1 - BUILD_ENVIRONMENT: macos-arm64-binary-wheel - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - IN_CI: 1 - IS_GHA: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - PR_NUMBER: ${{ github.event.pull_request.number }} - SKIP_ALL_TESTS: 1 - CROSS_COMPILE_ARM64: 1 - -concurrency: - group: macos-arm64-binary-wheel-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - wheel-py3_7-cpu-build: - runs-on: macos-10.15 - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.7" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: wheel-py3_7-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_7-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_7-cpu-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" 
chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: wheel-py3_7-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_8-cpu-build: - runs-on: macos-10.15 - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.8" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: wheel-py3_8-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_8-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_8-cpu-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" 
chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: wheel-py3_8-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_9-cpu-build: - runs-on: macos-10.15 - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.9" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: wheel-py3_9-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_9-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_9-cpu-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" 
chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: wheel-py3_9-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_10-cpu-build: - runs-on: macos-10.15 - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.10" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: wheel-py3_10-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_10-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cpu-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v 
"${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: wheel-py3_10-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-macos-binary-conda-nightly.yml b/.github/workflows/generated-macos-binary-conda-nightly.yml new file mode 100644 index 000000000000..d5c6eae896cb --- /dev/null +++ b/.github/workflows/generated-macos-binary-conda-nightly.yml @@ -0,0 +1,737 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/macos_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: macos-binary-conda + +on: +# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 + push: + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + branches: + - nightly + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/binaries/*' + - 'ciflow/binaries_conda/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BUILD_ENVIRONMENT: macos-binary-conda + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + SKIP_ALL_TESTS: 1 +concurrency: + group: macos-binary-conda-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + conda-py3_7-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-10.15 + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: conda-py3_7-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + conda-py3_7-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_7-cpu-build + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
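Unlike the deleted workflow's plain `actions/checkout@v2`, the regenerated conda workflow pins the checkout to an explicit commit: the PR head SHA when the event is a pull request, otherwise the pushed SHA, using the `cond && a || b` idiom of GitHub expressions (safe here because a SHA is never empty, so the `||` branch only fires when the condition is false). It then runs `git clean -fxd` so untracked and ignored artifacts from an earlier checkout on the same machine cannot leak into this build. A hedged bash rendering of the ref choice, with the event data supplied as plain stand-in variables:

```bash
#!/usr/bin/env bash
# Sketch of the ref selection done by the checkout expression above.
# EVENT_NAME / PR_HEAD_SHA / PUSH_SHA stand in for the github.* context.
set -euo pipefail

EVENT_NAME="${EVENT_NAME:-push}"
PR_HEAD_SHA="${PR_HEAD_SHA:-}"                 # set only on pull_request events
PUSH_SHA="${PUSH_SHA:-0123456789abcdef0123}"   # placeholder commit id

if [[ "$EVENT_NAME" == "pull_request" && -n "$PR_HEAD_SHA" ]]; then
  ref="$PR_HEAD_SHA"
else
  ref="$PUSH_SHA"
fi

echo "would check out: $ref"
# The workflow then runs `git clean -fxd` in that checkout:
#   -f force, -x include ignored files, -d remove untracked directories.
```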
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: conda-py3_7-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_8-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-10.15 + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: conda-py3_8-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + conda-py3_8-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_8-cpu-build + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
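The sccache step's `if:` expression gates on `push` events or same-repository pull requests because forked PRs do not receive repository secrets: without `MACOS_SCCACHE_S3_*`, the S3-backed compiler cache would be useless, so the install is skipped entirely and the build simply runs uncached. A hedged sketch of that predicate with stand-in variables in place of the `github.*` context:

```bash
#!/usr/bin/env bash
# Sketch of the "only for non-forked PRs, and pushes to trunk" gate above.
set -euo pipefail

EVENT_NAME="${EVENT_NAME:-pull_request}"
HEAD_REPO="${HEAD_REPO:-someuser/pytorch}"   # repo the PR branch lives in
BASE_REPO="${BASE_REPO:-pytorch/pytorch}"    # repo the workflow runs in

if [[ "$EVENT_NAME" == "push" || "$HEAD_REPO" == "$BASE_REPO" ]]; then
  echo "install sccache and export SCCACHE_BUCKET for the build"
else
  echo "forked PR: skip sccache, build without the compiler cache"
fi
```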
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: conda-py3_8-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_9-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-10.15 + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: conda-py3_9-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + conda-py3_9-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_9-cpu-build + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
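The build and upload halves of each configuration are linked purely by artifact name and `needs:`: the build job uploads an artifact such as `conda-py3_9-cpu`, and the matching upload job declares `needs: conda-py3_9-cpu-build` and downloads the artifact of exactly that name into `${{ runner.temp }}/artifacts`. From the names visible throughout this diff, the convention appears to be `<package type>-py<python with "." replaced by "_">-<desired cuda>`; a hedged sketch of that derivation:

```bash
#!/usr/bin/env bash
# Sketch of the artifact naming convention visible in these generated jobs
# (an inference from the names shown, not taken from the generator script).
set -euo pipefail

PACKAGE_TYPE="${PACKAGE_TYPE:-conda}"
DESIRED_PYTHON="${DESIRED_PYTHON:-3.10}"
DESIRED_CUDA="${DESIRED_CUDA:-cpu}"

artifact_name="${PACKAGE_TYPE}-py${DESIRED_PYTHON//./_}-${DESIRED_CUDA}"
echo "build job uploads:    ${artifact_name}"
echo "upload job downloads: ${artifact_name} into \${RUNNER_TEMP}/artifacts"
```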
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: conda-py3_9-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_10-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-10.15 + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
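The `Kill containers, clean up images` step that closes each upload job suppresses shellcheck SC2046 because `$(docker ps -q)` may expand to nothing. An alternative formulation that sidesteps the warning instead of silencing it, shown here only for comparison; the generated workflows keep the simpler `docker stop $(docker ps -q)` form:

```bash
#!/usr/bin/env bash
# Alternative container-cleanup sketch (not what the generated workflow uses).
set -euo pipefail

# xargs -r (GNU xargs, as on the Linux runners) runs `docker stop` only if
# there is at least one running container, so the empty case is a no-op.
docker ps -q | xargs -r docker stop || true

# Remove stopped containers, unused images, networks, and build cache so the
# self-hosted runner starts the next job from a clean slate.
docker system prune -af
```

Either way, running the cleanup under `if: always()` is what keeps a failed upload from leaving stale containers or a full disk behind on the shared runner.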
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: conda-py3_10-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + conda-py3_10-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cpu-build + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: conda-py3_10-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-macos-binary-conda.yml b/.github/workflows/generated-macos-binary-conda.yml deleted file mode 100644 index db148ed0e024..000000000000 --- a/.github/workflows/generated-macos-binary-conda.yml +++ /dev/null @@ -1,752 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/macos_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: macos-binary-conda - -on: -# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 - push: - # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build - branches: - - nightly - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - - 'ciflow/binaries/*' - - 'ciflow/binaries_conda/*' - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - ANACONDA_USER: pytorch - AWS_DEFAULT_REGION: us-east-1 - BUILD_ENVIRONMENT: macos-binary-conda - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - IN_CI: 1 - IS_GHA: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - PR_NUMBER: ${{ github.event.pull_request.number }} - SKIP_ALL_TESTS: 1 -concurrency: - group: macos-binary-conda-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - conda-py3_7-cpu-build: - runs-on: macos-10.15 - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.7" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: conda-py3_7-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - conda-py3_7-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_7-cpu-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" 
chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: conda-py3_7-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_8-cpu-build: - runs-on: macos-10.15 - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.8" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: conda-py3_8-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - conda-py3_8-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cpu-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" 
chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: conda-py3_8-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_9-cpu-build: - runs-on: macos-10.15 - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.9" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: conda-py3_9-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - conda-py3_9-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cpu-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" 
chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: conda-py3_9-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_10-cpu-build: - runs-on: macos-10.15 - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.10" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: conda-py3_10-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - conda-py3_10-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cpu-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v 
"${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: conda-py3_10-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml b/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml new file mode 100644 index 000000000000..eac3e4019cd3 --- /dev/null +++ b/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml @@ -0,0 +1,761 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/macos_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: macos-binary-libtorch-cxx11-abi + +on: +# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 + push: + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + branches: + - nightly + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/binaries/*' + - 'ciflow/binaries_libtorch/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BUILD_ENVIRONMENT: macos-binary-libtorch-cxx11-abi + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + SKIP_ALL_TESTS: 1 +concurrency: + group: macos-binary-libtorch-cxx11-abi-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + libtorch-cpu-shared-with-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-10.15 + # libtorch builds take a long time on github hosted runners + timeout-minutes: 720 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: libtorch-cpu-shared-with-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + libtorch-cpu-shared-with-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-with-deps-cxx11-abi-build + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
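Several steps in the build job above ("Populate binary env", "Install conda and dependencies") persist state for later steps by appending `KEY=value` lines to the file named by $GITHUB_ENV and by appending a directory to $GITHUB_PATH; both take effect in subsequent steps of the same job, not in the step that writes them. A minimal sketch of the idiom, with placeholder names rather than the workflow's real variables:

  # Inside one step of a GitHub Actions job (names here are placeholders):
  echo "MY_WORK_DIR=${RUNNER_TEMP}/work" >> "${GITHUB_ENV}"   # visible to later steps as $MY_WORK_DIR
  echo "${RUNNER_TEMP}/tools/bin" >> "${GITHUB_PATH}"         # prepended to PATH for later steps
  # The value is not yet set within the current step; export it if it is needed immediately.
  export MY_WORK_DIR="${RUNNER_TEMP}/work"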
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
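The recurring "Chown workspace" cleanup exists because the upload step runs a container as root with the workspace bind-mounted at /v, so any files it creates are root-owned on the self-hosted runner; a follow-up alpine container then chowns everything back to the invoking user. A minimal sketch of the problem and the fix, with an illustrative file name and a stock alpine tag rather than the pipeline's pinned image:

  # A root container writes into the bind-mounted workspace...
  docker run --rm -v "$(pwd)":/v -w /v alpine:3.15 sh -c 'touch root-owned-file'
  ls -l root-owned-file        # owned by root:root on the host

  # ...so chown it back to the current host user, mirroring the step above.
  docker run --rm -v "$(pwd)":/v -w /v alpine:3.15 chown -R "$(id -u):$(id -g)" .
  ls -l root-owned-file        # now owned by the invoking user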
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-shared-without-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-10.15 + # libtorch builds take a long time on github hosted runners + timeout-minutes: 720 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - 
name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: libtorch-cpu-shared-without-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + libtorch-cpu-shared-without-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-without-deps-cxx11-abi-build + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown 
workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-with-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-10.15 + # libtorch builds take a long time on github hosted runners + timeout-minutes: 720 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> 
"${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: libtorch-cpu-static-with-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + libtorch-cpu-static-with-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-with-deps-cxx11-abi-build + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: 
Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-without-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-10.15 + # libtorch builds take a long time on github hosted runners + timeout-minutes: 720 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: libtorch-cpu-static-without-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + libtorch-cpu-static-without-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-without-deps-cxx11-abi-build + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
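The "Chown workspace" step above pulls the alpine image through a small inline retry helper: `"$@"` re-runs the exact command it was given, and the two fallback attempts sleep one and two seconds before the helper gives up with the exit status of the last attempt. A standalone sketch of the same helper with a hypothetical flaky download standing in for the workflow's `docker pull`:

  # Same retry idiom as above: up to three attempts with a short back-off.
  retry () {
    "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
  }

  # Hypothetical usage; curl here stands in for `docker pull "${ALPINE_IMAGE}"`.
  retry curl -fsSL https://example.com/healthcheck -o /dev/null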
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
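The build jobs earlier in this workflow reuse persistent checkouts, so the "Clean PyTorch checkout" and "Clean pytorch/builder checkout" steps run `git clean -fxd` before building: `-f` forces removal, `-x` also deletes ignored files such as build outputs, and `-d` recurses into untracked directories. A short sketch of the effect on a throwaway repository (all paths below are hypothetical):

  # Demonstrate `git clean -fxd` on a scratch repo.
  git init -q /tmp/scratch-repo && cd /tmp/scratch-repo
  mkdir -p build && touch build/stale.o untracked.txt
  git clean -nxd    # dry run: lists what would be removed
  git clean -fxd    # -f force, -x include ignored files, -d include untracked directories
  ls -A             # only .git/ remains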
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-macos-binary-libtorch-cxx11-abi.yml b/.github/workflows/generated-macos-binary-libtorch-cxx11-abi.yml deleted file mode 100644 index 5f9ea6396f6c..000000000000 --- a/.github/workflows/generated-macos-binary-libtorch-cxx11-abi.yml +++ /dev/null @@ -1,788 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/macos_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: macos-binary-libtorch-cxx11-abi - -on: -# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 - push: - # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build - branches: - - nightly - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - - 'ciflow/binaries/*' - - 'ciflow/binaries_libtorch/*' - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - ANACONDA_USER: pytorch - AWS_DEFAULT_REGION: us-east-1 - BUILD_ENVIRONMENT: macos-binary-libtorch-cxx11-abi - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - IN_CI: 1 - IS_GHA: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - PR_NUMBER: ${{ github.event.pull_request.number }} - SKIP_ALL_TESTS: 1 -concurrency: - group: macos-binary-libtorch-cxx11-abi-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - libtorch-cpu-shared-with-deps-cxx11-abi-build: - runs-on: macos-10.15 - # libtorch builds take a long time on github hosted runners - timeout-minutes: 720 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: libtorch-cpu-shared-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - libtorch-cpu-shared-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-with-deps-cxx11-abi-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin 
"$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-shared-without-deps-cxx11-abi-build: - runs-on: macos-10.15 - # libtorch builds take a long time on github hosted runners - timeout-minutes: 720 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: libtorch-cpu-shared-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - libtorch-cpu-shared-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-without-deps-cxx11-abi-build - env: - PYTORCH_ROOT: ${{ 
github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have 
drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-with-deps-cxx11-abi-build: - runs-on: macos-10.15 - # libtorch builds take a long time on github hosted runners - timeout-minutes: 720 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: libtorch-cpu-static-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ 
env.PYTORCH_FINAL_PACKAGE_DIR }}" - libtorch-cpu-static-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-with-deps-cxx11-abi-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: libtorch-cpu-static-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-without-deps-cxx11-abi-build: - runs-on: macos-10.15 - # libtorch builds take a long time on github hosted runners - timeout-minutes: 720 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: libtorch-cpu-static-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - libtorch-cpu-static-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-without-deps-cxx11-abi-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin 
"$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: libtorch-cpu-static-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-macos-binary-libtorch-pre-cxx11-nightly.yml b/.github/workflows/generated-macos-binary-libtorch-pre-cxx11-nightly.yml new file mode 100644 index 000000000000..b943ea97a970 --- /dev/null +++ b/.github/workflows/generated-macos-binary-libtorch-pre-cxx11-nightly.yml @@ -0,0 +1,761 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/macos_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: macos-binary-libtorch-pre-cxx11 + +on: +# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 + push: + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + branches: + - nightly + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/binaries/*' + - 'ciflow/binaries_libtorch/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BUILD_ENVIRONMENT: macos-binary-libtorch-pre-cxx11 + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + SKIP_ALL_TESTS: 1 +concurrency: + group: macos-binary-libtorch-pre-cxx11-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + libtorch-cpu-shared-with-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-10.15 + # libtorch builds take a long time on github hosted runners + timeout-minutes: 720 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
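# Illustrative sketch only (not part of the generated workflow above), assuming a bash
# step in the same job: the "Populate binary env" steps that follow rely on the runner's
# environment files. Appending KEY=value pairs to the file named by $GITHUB_ENV exports
# those variables to every later step of the job, and appending a directory to
# $GITHUB_PATH prepends it to PATH for later steps. The paths shown are examples.
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
# A subsequent step in the same job can then use both without re-exporting anything:
mkdir -p "${PYTORCH_FINAL_PACKAGE_DIR}"   # variable was exported by the runner
command -v conda                          # resolved through the updated PATH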
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: libtorch-cpu-shared-with-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + libtorch-cpu-shared-with-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-with-deps-pre-cxx11-build + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
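# Standalone sketch of the inline retry() helper used in the "Chown workspace" and
# "Log in to ECR" steps of the upload jobs: it attempts the command up to three times,
# sleeping 1s and then 2s between attempts, and fails the step only if all three fail.
# The docker pull invocation mirrors the workflow; ALPINE_IMAGE comes from its env block.
retry () {
  "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
}
retry docker pull "${ALPINE_IMAGE}"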
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-shared-without-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-10.15 + # libtorch builds take a long time on github hosted runners + timeout-minutes: 720 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - 
name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: libtorch-cpu-shared-without-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + libtorch-cpu-shared-without-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-without-deps-pre-cxx11-build + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + 
if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-with-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-10.15 + # libtorch builds take a long time on github hosted runners + timeout-minutes: 720 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: 
Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: libtorch-cpu-static-with-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + libtorch-cpu-static-with-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-with-deps-pre-cxx11-build + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until 
ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-without-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-10.15 + # libtorch builds take a long time on github hosted runners + timeout-minutes: 720 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: libtorch-cpu-static-without-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + libtorch-cpu-static-without-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-without-deps-pre-cxx11-build + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
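The DRY_RUN and UPLOAD_CHANNEL steps above key entirely off the pushed ref: uploads are only armed for the nightly branch and for non-ciflow tags, and a release-candidate tag additionally routes the upload to the test channel. The RC detection is a plain bash glob match; in isolation it behaves like this (the tag values are illustrative, following the v1.11.0-rc1 convention noted in the workflow header):

for ref in v1.11.0-rc1 v1.11.0; do
  # *-rc[0-9]* matches anything containing "-rc" followed by a digit.
  if [[ ${ref} = *-rc[0-9]* ]]; then
    echo "${ref}: release candidate, UPLOAD_CHANNEL=test"
  else
    echo "${ref}: not an RC, default upload channel"
  fi
done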
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-macos-binary-libtorch-pre-cxx11.yml b/.github/workflows/generated-macos-binary-libtorch-pre-cxx11.yml deleted file mode 100644 index 0cac68d72912..000000000000 --- a/.github/workflows/generated-macos-binary-libtorch-pre-cxx11.yml +++ /dev/null @@ -1,788 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/macos_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: macos-binary-libtorch-pre-cxx11 - -on: -# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 - push: - # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build - branches: - - nightly - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - - 'ciflow/binaries/*' - - 'ciflow/binaries_libtorch/*' - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - ANACONDA_USER: pytorch - AWS_DEFAULT_REGION: us-east-1 - BUILD_ENVIRONMENT: macos-binary-libtorch-pre-cxx11 - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - IN_CI: 1 - IS_GHA: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - PR_NUMBER: ${{ github.event.pull_request.number }} - SKIP_ALL_TESTS: 1 -concurrency: - group: macos-binary-libtorch-pre-cxx11-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - libtorch-cpu-shared-with-deps-pre-cxx11-build: - runs-on: macos-10.15 - # libtorch builds take a long time on github hosted runners - timeout-minutes: 720 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: libtorch-cpu-shared-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - libtorch-cpu-shared-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-with-deps-pre-cxx11-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin 
"$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-shared-without-deps-pre-cxx11-build: - runs-on: macos-10.15 - # libtorch builds take a long time on github hosted runners - timeout-minutes: 720 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: libtorch-cpu-shared-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - libtorch-cpu-shared-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-without-deps-pre-cxx11-build - env: - PYTORCH_ROOT: ${{ 
github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have 
drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-with-deps-pre-cxx11-build: - runs-on: macos-10.15 - # libtorch builds take a long time on github hosted runners - timeout-minutes: 720 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: libtorch-cpu-static-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ 
env.PYTORCH_FINAL_PACKAGE_DIR }}" - libtorch-cpu-static-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-with-deps-pre-cxx11-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
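Every macOS build job opens with a "Populate binary env" step because, as the NOTE comments explain, runner.temp is only resolvable inside a job, not at workflow level; appending KEY=value lines to the file named by $GITHUB_ENV is the standard Actions mechanism for making such a value visible to all later steps of the same job. A minimal illustration of that mechanism (EXAMPLE_DIR is a made-up variable, not one these workflows set):

# Early step: persist a derived path for the rest of the job.
echo "EXAMPLE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"

# Any later step in the same job then sees it as an ordinary
# environment variable:
echo "artifacts will land in ${EXAMPLE_DIR}"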
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: libtorch-cpu-static-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-without-deps-pre-cxx11-build: - runs-on: macos-10.15 - # libtorch builds take a long time on github hosted runners - timeout-minutes: 720 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: libtorch-cpu-static-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - libtorch-cpu-static-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-without-deps-pre-cxx11-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin 
"$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: libtorch-cpu-static-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-macos-binary-wheel-nightly.yml b/.github/workflows/generated-macos-binary-wheel-nightly.yml new file mode 100644 index 000000000000..2dd93eea93ca --- /dev/null +++ b/.github/workflows/generated-macos-binary-wheel-nightly.yml @@ -0,0 +1,737 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/macos_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: macos-binary-wheel + +on: +# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 + push: + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + branches: + - nightly + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/binaries/*' + - 'ciflow/binaries_wheel/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BUILD_ENVIRONMENT: macos-binary-wheel + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + SKIP_ALL_TESTS: 1 +concurrency: + group: macos-binary-wheel-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + wheel-py3_7-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-10.15 + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: wheel-py3_7-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + wheel-py3_7-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_7-cpu-build + env: + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
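These macOS build jobs install a pinned sccache binary and point it at the ossci-compiler-cache-circleci-v2 S3 bucket, so compiler invocations inside binary_macos_build.sh go through a shared cache; the MACOS_SCCACHE_S3_* keys exported at job level are what give non-forked builds access to that bucket. A quick way to confirm the cache is actually in play on such a runner (a sketch for local debugging, not part of the generated workflow):

# The job drops the binary at /usr/local/bin/sccache.
which sccache && sccache --version
# After a build, the stats report cache hits versus compile requests.
sccache --show-stats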
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: wheel-py3_7-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + wheel-py3_8-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-10.15 + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: wheel-py3_8-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + wheel-py3_8-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_8-cpu-build + env: + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: wheel-py3_8-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + wheel-py3_9-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-10.15 + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: wheel-py3_9-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + wheel-py3_9-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_9-cpu-build + env: + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: wheel-py3_9-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + wheel-py3_10-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-10.15 + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: wheel-py3_10-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + wheel-py3_10-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_10-cpu-build + env: + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: wheel-py3_10-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-macos-binary-wheel.yml b/.github/workflows/generated-macos-binary-wheel.yml deleted file mode 100644 index 2a97b166dd73..000000000000 --- a/.github/workflows/generated-macos-binary-wheel.yml +++ /dev/null @@ -1,752 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/macos_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: macos-binary-wheel - -on: -# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 - push: - # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build - branches: - - nightly - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - - 'ciflow/binaries/*' - - 'ciflow/binaries_wheel/*' - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - ANACONDA_USER: pytorch - AWS_DEFAULT_REGION: us-east-1 - BUILD_ENVIRONMENT: macos-binary-wheel - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - IN_CI: 1 - IS_GHA: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - PR_NUMBER: ${{ github.event.pull_request.number }} - SKIP_ALL_TESTS: 1 -concurrency: - group: macos-binary-wheel-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - wheel-py3_7-cpu-build: - runs-on: macos-10.15 - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.7" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: wheel-py3_7-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_7-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_7-cpu-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" 
chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: wheel-py3_7-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_8-cpu-build: - runs-on: macos-10.15 - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.8" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: wheel-py3_8-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_8-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_8-cpu-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" 
chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: wheel-py3_8-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_9-cpu-build: - runs-on: macos-10.15 - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.9" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: wheel-py3_9-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_9-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_9-cpu-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" 
chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: wheel-py3_9-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_10-cpu-build: - runs-on: macos-10.15 - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.10" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: wheel-py3_10-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_10-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cpu-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v 
"${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: wheel-py3_10-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-parallelnative-linux-xenial-py3.7-gcc5.4.yml b/.github/workflows/generated-parallelnative-linux-xenial-py3.7-gcc5.4.yml deleted file mode 100644 index d9182993f0c1..000000000000 --- a/.github/workflows/generated-parallelnative-linux-xenial-py3.7-gcc5.4.yml +++ /dev/null @@ -1,539 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: parallelnative-linux-xenial-py3.7-gcc5.4 - -on: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cpu/*' - - 'ciflow/linux/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: parallelnative-linux-xenial-py3.7-gcc5.4 - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.7-gcc5.4 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: parallelnative-linux-xenial-py3.7-gcc5.4-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: parallelnative-linux-xenial-py3.7-gcc5.4-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v 
"$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.2xlarge - ENABLE_DISTRIBUTED_TEST: 1 - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 1 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: parallelnative-linux-xenial-py3.7-gcc5.4-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 
5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop building test 
binaries as part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - ${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: parallelnative-linux-xenial-py3.7-gcc5.4-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 hours 
or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-periodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7.yml b/.github/workflows/generated-periodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7.yml deleted file mode 100644 index 0c2df1244222..000000000000 --- a/.github/workflows/generated-periodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7.yml +++ /dev/null @@ -1,237 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: periodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7 - -on: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cuda/*' - - 'ciflow/libtorch/*' - - 'ciflow/linux/*' - - 'ciflow/scheduled/*' - schedule: - - cron: 45 4,10,16,22 * * * - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: periodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7 - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda11.5-cudnn8-py3-gcc7 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: periodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: periodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 
&& "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7.yml b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7.yml deleted file mode 100644 index 366395af1f20..000000000000 --- a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7.yml +++ /dev/null @@ -1,237 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7 - -on: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cuda/*' - - 'ciflow/libtorch/*' - - 'ciflow/linux/*' - - 'ciflow/scheduled/*' - schedule: - - cron: 45 0,4,8,12,16,20 * * * - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7 - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: 
$(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-periodic-linux-bionic-cuda11.5-py3.7-gcc7.yml b/.github/workflows/generated-periodic-linux-bionic-cuda11.5-py3.7-gcc7.yml deleted file mode 100644 index 85e1ca4101bd..000000000000 --- a/.github/workflows/generated-periodic-linux-bionic-cuda11.5-py3.7-gcc7.yml +++ /dev/null @@ -1,538 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: periodic-linux-bionic-cuda11.5-py3.7-gcc7 - -on: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cuda/*' - - 'ciflow/linux/*' - - 'ciflow/scheduled/*' - schedule: - - cron: 45 4,10,16,22 * * * - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: periodic-linux-bionic-cuda11.5-py3.7-gcc7 - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda11.5-cudnn8-py3-gcc7 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: periodic-linux-bionic-cuda11.5-py3.7-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: periodic-linux-bionic-cuda11.5-py3.7-gcc7-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo 
"instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.4xlarge.nvidia.gpu - ENABLE_DISTRIBUTED_TEST: 1 - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: periodic-linux-bionic-cuda11.5-py3.7-gcc7-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - 
AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop 
building test binaries as part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - ${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: periodic-linux-bionic-cuda11.5-py3.7-gcc7-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold 
runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml b/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml deleted file mode 100644 index 3c9c3c1199ab..000000000000 --- a/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml +++ /dev/null @@ -1,540 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck - -on: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cuda/*' - - 'ciflow/linux/*' - - 'ciflow/scheduled/*' - - 'ciflow/slow/*' - - 'ciflow/slow-gradcheck/*' - schedule: - - cron: 0 */4 * * * - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts 
get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.4xlarge.nvidia.gpu - ENABLE_DISTRIBUTED_TEST: '' - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: 
standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 360 minutes - timeout-minutes: 360 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # 
TODO: Stop building test binaries as part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - ${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 
test - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug.yml deleted file mode 100644 index 2e325fca8ad2..000000000000 --- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug.yml +++ /dev/null @@ -1,539 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug - -on: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cuda/*' - - 'ciflow/linux/*' - - 'ciflow/scheduled/*' - schedule: - - cron: 45 0,4,8,12,16,20 * * * - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 - DEBUG: 1 -concurrency: - group: periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || 
(sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.4xlarge.nvidia.gpu - ENABLE_DISTRIBUTED_TEST: 1 - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - 
AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop 
building test binaries as part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - ${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold 
runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml deleted file mode 100644 index 11d24eafb62d..000000000000 --- a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml +++ /dev/null @@ -1,321 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/windows_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: periodic-win-vs2019-cuda11.1-py3 - -on: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cuda/*' - - 'ciflow/scheduled/*' - - 'ciflow/win/*' - schedule: - - cron: 45 0,4,8,12,16,20 * * * - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: periodic-win-vs2019-cuda11.1-py3 - BUILD_WHEEL: 1 - MAX_JOBS: 8 - CUDA_VERSION: "11.1" - IN_CI: 1 - IS_GHA: 1 - INSTALL_WINDOWS_SDK: 1 - PYTHON_VERSION: "3.8" - PYTORCH_RETRY_TEST_CASES: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - SCCACHE_BUCKET: "ossci-compiler-cache" - VC_PRODUCT: "BuildTools" - VC_VERSION: "" - VS_VERSION: "16.8.6" - VC_YEAR: "2019" - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - no_proxy: localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TORCH_CUDA_ARCH_LIST: "7.0" - USE_CUDA: 1 - -concurrency: - group: periodic-win-vs2019-cuda11.1-py3-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - build: - runs-on: "windows.4xlarge" - timeout-minutes: 240 - env: - JOB_BASE_NAME: periodic-win-vs2019-cuda11.1-py3-build - http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" 
- echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Install Visual Studio 2019 toolchain - shell: powershell - run: | - .\.circleci\scripts\vs_install.ps1 - - name: Install Cuda - shell: bash - run: | - .circleci/scripts/windows_cuda_install.sh - - name: Install Cudnn - shell: bash - run: | - .circleci/scripts/windows_cudnn_install.sh - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - .jenkins/pytorch/win-build.sh - # Upload to github so that people can click and download artifacts - - name: Upload artifacts to s3 - uses: seemethere/upload-artifact-s3@v3 - with: - retention-days: 14 - if-no-files-found: error - name: ${{ env.BUILD_ENVIRONMENT }} - path: C:\${{ github.run_id }}\build-results - - name: Wait until all sessions have drained - shell: powershell - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - - name: Cleanup build-results and workspaces - if: always() - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf "${PYTORCH_FINAL_PACKAGE_DIR}" - rm -rf ./* - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: windows.8xlarge.nvidia.gpu - NUM_TEST_SHARDS: 2 - NUM_TEST_SHARDS_ON_PULL_REQUEST: 2 - PR_BODY: ${{ github.event.pull_request.body }} - NOGPU_RUNNER_TYPE: windows.4xlarge - ENABLE_FORCE_ON_CPU_TEST: '' - RUN_SMOKE_TESTS_ONLY_ON_PR: False - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - timeout-minutes: 240 - env: - JOB_BASE_NAME: periodic-win-vs2019-cuda11.1-py3-test - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - TEST_CONFIG: ${{ matrix.config }} - http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo 
"instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Install Visual Studio 2019 toolchain - shell: powershell - run: | - .\.circleci\scripts\vs_install.ps1 - - name: Install Cuda - if: ${{ matrix.config != 'force_on_cpu' }} - shell: bash - run: | - .circleci/scripts/windows_cuda_install.sh - - name: Install Cudnn - if: ${{ matrix.config != 'force_on_cpu' }} - shell: bash - run: | - .circleci/scripts/windows_cudnn_install.sh - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - path: C:\${{ github.run_id }}\build-results - - name: Check build-results folder - shell: powershell - run: | - tree /F C:\$Env:GITHUB_RUN_ID\build-results - # Needed for coverage in win-test.sh - - uses: actions/setup-python@v2 - name: Setup Python3 - with: - python-version: '3.x' - - name: Test - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - # Time out the test phase after 3.5 hours - timeout-minutes: 210 - run: | - .jenkins/pytorch/win-test.sh - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Wait until all sessions have drained - shell: powershell - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until 
we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: periodic-win-vs2019-cuda11.1-py3-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Cleanup workspace - if: always() - shell: bash - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf ./* diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.5-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.5-py3.yml deleted file mode 100644 index f89ea43911e2..000000000000 --- a/.github/workflows/generated-periodic-win-vs2019-cuda11.5-py3.yml +++ /dev/null @@ -1,321 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/windows_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: periodic-win-vs2019-cuda11.5-py3 - -on: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cuda/*' - - 'ciflow/scheduled/*' - - 'ciflow/win/*' - schedule: - - cron: 45 4,10,16,22 * * * - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: periodic-win-vs2019-cuda11.5-py3 - BUILD_WHEEL: 1 - MAX_JOBS: 8 - CUDA_VERSION: "11.5" - IN_CI: 1 - IS_GHA: 1 - INSTALL_WINDOWS_SDK: 1 - PYTHON_VERSION: "3.8" - PYTORCH_RETRY_TEST_CASES: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - SCCACHE_BUCKET: "ossci-compiler-cache" - VC_PRODUCT: "BuildTools" - VC_VERSION: "" - VS_VERSION: "16.8.6" - VC_YEAR: "2019" - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - no_proxy: localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TORCH_CUDA_ARCH_LIST: "7.0" - USE_CUDA: 1 - -concurrency: - group: periodic-win-vs2019-cuda11.5-py3-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - build: - runs-on: "windows.4xlarge" - timeout-minutes: 240 - env: - JOB_BASE_NAME: periodic-win-vs2019-cuda11.5-py3-build - http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - 
category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Install Visual Studio 2019 toolchain - shell: powershell - run: | - .\.circleci\scripts\vs_install.ps1 - - name: Install Cuda - shell: bash - run: | - .circleci/scripts/windows_cuda_install.sh - - name: Install Cudnn - shell: bash - run: | - .circleci/scripts/windows_cudnn_install.sh - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - .jenkins/pytorch/win-build.sh - # Upload to github so that people can click and download artifacts - - name: Upload artifacts to s3 - uses: seemethere/upload-artifact-s3@v3 - with: - retention-days: 14 - if-no-files-found: error - name: ${{ env.BUILD_ENVIRONMENT }} - path: C:\${{ github.run_id }}\build-results - - name: Wait until all sessions have drained - shell: powershell - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - - name: Cleanup build-results and workspaces - if: always() - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf "${PYTORCH_FINAL_PACKAGE_DIR}" - rm -rf ./* - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: windows.8xlarge.nvidia.gpu - NUM_TEST_SHARDS: 2 - NUM_TEST_SHARDS_ON_PULL_REQUEST: 2 - PR_BODY: ${{ github.event.pull_request.body }} - NOGPU_RUNNER_TYPE: windows.4xlarge - ENABLE_FORCE_ON_CPU_TEST: 1 - RUN_SMOKE_TESTS_ONLY_ON_PR: False - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - timeout-minutes: 240 - env: - JOB_BASE_NAME: periodic-win-vs2019-cuda11.5-py3-test - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - TEST_CONFIG: ${{ matrix.config }} - http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl 
-fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Install Visual Studio 2019 toolchain - shell: powershell - run: | - .\.circleci\scripts\vs_install.ps1 - - name: Install Cuda - if: ${{ matrix.config != 'force_on_cpu' }} - shell: bash - run: | - .circleci/scripts/windows_cuda_install.sh - - name: Install Cudnn - if: ${{ matrix.config != 'force_on_cpu' }} - shell: bash - run: | - .circleci/scripts/windows_cudnn_install.sh - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - path: C:\${{ github.run_id }}\build-results - - name: Check build-results folder - shell: powershell - run: | - tree /F C:\$Env:GITHUB_RUN_ID\build-results - # Needed for coverage in win-test.sh - - uses: actions/setup-python@v2 - name: Setup Python3 - with: - python-version: '3.x' - - name: Test - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - # Time out the test phase after 3.5 hours - timeout-minutes: 210 - run: | - .jenkins/pytorch/win-test.sh - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Wait until all sessions have drained - shell: powershell - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py 
- - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: periodic-win-vs2019-cuda11.5-py3-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Cleanup workspace - if: always() - shell: bash - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf ./* diff --git a/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build.yml b/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build.yml deleted file mode 100644 index bccd46728c31..000000000000 --- a/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build.yml +++ /dev/null @@ -1,507 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/android_ci_full_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build - -on: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/android/*' - - 'ciflow/cpu/*' - - 'ciflow/linux/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - # building and testing in a single job since bazel runs only small subset of tests - build-and-test: - runs-on: linux.2xlarge - env: - JOB_BASE_NAME: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build-build-and-test - NUM_TEST_SHARDS: 1 - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata 
ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - name: Output disk space left - run: | - sudo df -H - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build-arm-v7a - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - #!/bin/bash -eo pipefail - # Pull Docker image and run build - time docker pull "${DOCKER_IMAGE}" >/dev/null - echo "${DOCKER_IMAGE}" - export container_name - container_name=$(docker run \ - -e BUILD_ENVIRONMENT=pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v7a-build \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - git submodule sync && git submodule update -q --init --recursive --depth 1 --jobs 0 - docker cp "${GITHUB_WORKSPACE}/." "${container_name}:/var/lib/jenkins/workspace" - # shellcheck disable=SC1105 - ((echo "sudo chown -R jenkins . && .jenkins/pytorch/build.sh && find ${BUILD_ROOT} -type f -name "*.a" -or -name "*.o" -delete") | docker exec -u jenkins -i "${container_name}" bash) 2>&1 - - # Copy dist folder back - export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}-arm-v7a - docker cp "${container_name}:/var/lib/jenkins/workspace/dist" "${GITHUB_WORKSPACE}/." 
|| echo "Dist folder not found" - docker commit "${container_name}" "${COMMIT_DOCKER_IMAGE}" - time docker push "${COMMIT_DOCKER_IMAGE}" - - name: Build-arm-v8a - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - #!/bin/bash -eo pipefail - # Pull Docker image and run build - time docker pull "${DOCKER_IMAGE}" >/dev/null - echo "${DOCKER_IMAGE}" - export container_name - container_name=$(docker run \ - -e BUILD_ENVIRONMENT=pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v8a-build \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - git submodule sync && git submodule update -q --init --recursive --depth 1 --jobs 0 - docker cp "${GITHUB_WORKSPACE}/." "${container_name}:/var/lib/jenkins/workspace" - # shellcheck disable=SC1105 - ((echo "sudo chown -R jenkins . && .jenkins/pytorch/build.sh && find ${BUILD_ROOT} -type f -name "*.a" -or -name "*.o" -delete") | docker exec -u jenkins -i "${container_name}" bash) 2>&1 - - # Copy dist folder back - export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}-arm-v8a - docker cp "${container_name}:/var/lib/jenkins/workspace/dist" "${GITHUB_WORKSPACE}/." 
|| echo "Dist folder not found" - docker commit "${container_name}" "${COMMIT_DOCKER_IMAGE}" - time docker push "${COMMIT_DOCKER_IMAGE}" - - name: Build-x86_32 - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - #!/bin/bash -eo pipefail - # Pull Docker image and run build - time docker pull "${DOCKER_IMAGE}" >/dev/null - echo "${DOCKER_IMAGE}" - export container_name - container_name=$(docker run \ - -e BUILD_ENVIRONMENT=pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_32-build \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - git submodule sync && git submodule update -q --init --recursive --depth 1 --jobs 0 - docker cp "${GITHUB_WORKSPACE}/." "${container_name}:/var/lib/jenkins/workspace" - # shellcheck disable=SC1105 - ((echo "sudo chown -R jenkins . && .jenkins/pytorch/build.sh && find ${BUILD_ROOT} -type f -name "*.a" -or -name "*.o" -delete") | docker exec -u jenkins -i "${container_name}" bash) 2>&1 - - # Copy dist folder back - export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}-x86_32 - docker cp "${container_name}:/var/lib/jenkins/workspace/dist" "${GITHUB_WORKSPACE}/." 
|| echo "Dist folder not found" - docker commit "${container_name}" "${COMMIT_DOCKER_IMAGE}" - time docker push "${COMMIT_DOCKER_IMAGE}" - - name: Build-x86_64 - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - #!/bin/bash -eo pipefail - # Pull Docker image and run build - time docker pull "${DOCKER_IMAGE}" >/dev/null - echo "${DOCKER_IMAGE}" - export container_name - container_name=$(docker run \ - -e BUILD_ENVIRONMENT=pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_64-build \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - git submodule sync && git submodule update -q --init --recursive --depth 1 --jobs 0 - docker cp "${GITHUB_WORKSPACE}/." "${container_name}:/var/lib/jenkins/workspace" - # shellcheck disable=SC1105 - ((echo "sudo chown -R jenkins . && .jenkins/pytorch/build.sh && find ${BUILD_ROOT} -type f -name "*.a" -or -name "*.o" -delete") | docker exec -u jenkins -i "${container_name}" bash) 2>&1 - - # Copy dist folder back - export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}-x86_64 - docker cp "${container_name}:/var/lib/jenkins/workspace/dist" "${GITHUB_WORKSPACE}/." 
|| echo "Dist folder not found" - docker commit "${container_name}" "${COMMIT_DOCKER_IMAGE}" - time docker push "${COMMIT_DOCKER_IMAGE}" - - name: Build-Final-Artifcact - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - set -eux - - docker_image_libtorch_android_x86_32="${DOCKER_IMAGE}-x86_32" - docker_image_libtorch_android_x86_64="${DOCKER_IMAGE}-x86_64" - docker_image_libtorch_android_arm_v7a="${DOCKER_IMAGE}-arm-v7a" - docker_image_libtorch_android_arm_v8a="${DOCKER_IMAGE}-arm-v8a" - - echo "docker_image_commit: ${DOCKER_IMAGE}" - echo "docker_image_libtorch_android_x86_32: ${docker_image_libtorch_android_x86_32}" - echo "docker_image_libtorch_android_x86_64: ${docker_image_libtorch_android_x86_64}" - echo "docker_image_libtorch_android_arm_v7a: ${docker_image_libtorch_android_arm_v7a}" - echo "docker_image_libtorch_android_arm_v8a: ${docker_image_libtorch_android_arm_v8a}" - - # x86_32 - time docker pull "${docker_image_libtorch_android_x86_32}" >/dev/null - export id_x86_32 - id_x86_32=$(docker run -e GRADLE_OFFLINE=1 --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins "${docker_image_libtorch_android_x86_32}") - - # shellcheck disable=SC1105 - ((echo "sudo chown -R jenkins workspace") | docker exec -u jenkins -i "${id_x86_32}" bash) 2>&1 - - # arm-v7a - time docker pull "${docker_image_libtorch_android_arm_v7a}" >/dev/null - export id_arm_v7a - id_arm_v7a=$(docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins "${docker_image_libtorch_android_arm_v7a}") - - # shellcheck disable=SC1105 - ((echo "sudo chown -R jenkins workspace") | docker exec -u jenkins -i "${id_arm_v7a}" bash) 2>&1 - - mkdir -p "${GITHUB_WORKSPACE}/build_android_install_arm_v7a" - docker cp "${id_arm_v7a}:/var/lib/jenkins/workspace/build_android/install" "${GITHUB_WORKSPACE}/build_android_install_arm_v7a" - - # x86_64 - time docker pull "${docker_image_libtorch_android_x86_64}" >/dev/null - export id_x86_64 - id_x86_64=$(docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins "${docker_image_libtorch_android_x86_64}") - - # shellcheck disable=SC1105 - ((echo "sudo chown -R jenkins workspace") | docker exec -u jenkins -i "${id_x86_64}" bash) 2>&1 - - mkdir -p "${GITHUB_WORKSPACE}/build_android_install_x86_64" - docker cp "${id_x86_64}:/var/lib/jenkins/workspace/build_android/install" "${GITHUB_WORKSPACE}/build_android_install_x86_64" - - # arm-v8a - time docker pull "${docker_image_libtorch_android_arm_v8a}" >/dev/null - export id_arm_v8a - id_arm_v8a=$(docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins "${docker_image_libtorch_android_arm_v8a}") - - # shellcheck disable=SC1105 - ((echo "sudo chown -R jenkins workspace") | docker exec -u jenkins -i "$id_arm_v8a" bash) 2>&1 - - mkdir -p "${GITHUB_WORKSPACE}/build_android_install_arm_v8a" - docker cp "${id_arm_v8a}:/var/lib/jenkins/workspace/build_android/install" "${GITHUB_WORKSPACE}/build_android_install_arm_v8a" - - # Putting everything together - docker cp "${GITHUB_WORKSPACE}/build_android_install_arm_v7a" "${id_x86_32}:/var/lib/jenkins/workspace/build_android_install_arm_v7a" - docker cp "${GITHUB_WORKSPACE}/build_android_install_x86_64" "${id_x86_32}:/var/lib/jenkins/workspace/build_android_install_x86_64" - docker cp "${GITHUB_WORKSPACE}/build_android_install_arm_v8a" "${id_x86_32}:/var/lib/jenkins/workspace/build_android_install_arm_v8a" - - # run gradle buildRelease - # shellcheck disable=SC1105 - 
((echo "sudo chown -R jenkins workspace && cd workspace && ./.circleci/scripts/build_android_gradle.sh") | docker exec \ - -e BUILD_ENVIRONMENT="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build" \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --user jenkins \ - -u jenkins -i "${id_x86_32}" bash) 2>&1 - - mkdir -p "${GITHUB_WORKSPACE}/build_android_artifacts" - docker cp "${id_x86_32}:/var/lib/jenkins/workspace/android/artifacts.tgz" "${GITHUB_WORKSPACE}/build_android_artifacts/" - - output_image="${DOCKER_IMAGE}-android-x86_32-gradle" - docker commit "${id_x86_32}" "${output_image}" - time docker push "${output_image}" - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - # The artifact file is created inside docker container, which contains the result binaries. - # Now unpackage it into the project folder. The subsequent script will scan project folder - # to locate result binaries and report their sizes. - # If artifact file is not provided it assumes that the project folder has been mounted in - # the docker during build and already contains the result binaries, so this step can be skipped. - export ARTIFACTS=${GITHUB_WORKSPACE}/build_android_artifacts/artifacts.tgz - if [ -n "${ARTIFACTS}" ]; then - tar xf "${ARTIFACTS}" -C "${GITHUB_WORKSPACE}" - cd "${GITHUB_WORKSPACE}" - fi - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - ANDROID_BUILD_TYPE=prebuilt - export ANDROID_BUILD_TYPE - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba "android" || exit 0 - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Android Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - build_android_artifacts/artifacts.tgz - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit.yml b/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit.yml deleted file mode 100644 index 95924b65d8a2..000000000000 --- a/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit.yml +++ /dev/null @@ -1,274 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/android_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/android/*' - - 'ciflow/cpu/*' - - 'ciflow/linux/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - # building and testing in a single job since bazel runs only small subset of tests - build-and-test: - runs-on: linux.2xlarge - env: - JOB_BASE_NAME: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit-build-and-test - NUM_TEST_SHARDS: 1 - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password 
--region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - name: Output disk space left - run: | - sudo df -H - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Build - run: | - set -e - # Unlike other gradle jobs, it's not worth building libtorch in a separate CI job and share via docker, because: - # 1) Not shareable: it's custom selective build, which is different from default libtorch mobile build; - # 2) Not parallelizable by architecture: it only builds libtorch for one architecture; - - echo "DOCKER_IMAGE: ${DOCKER_IMAGE}" - time docker pull "${DOCKER_IMAGE}" >/dev/null - - export BUILD_LITE_INTERPRETER - BUILD_LITE_INTERPRETER="1" - if [[ "${BUILD_ENVIRONMENT}" == *"full-jit" ]]; then - BUILD_LITE_INTERPRETER="0" - fi - - git submodule sync && git submodule update -q --init --recursive --depth 1 --jobs 0 - # shellcheck disable=SC2016 - export id - id=$(docker run -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e PR_LABELS \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e BUILD_LITE_INTERPRETER \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "$(pwd):/var/lib/jenkins/workspace" \ - --cap-add=SYS_PTRACE \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --security-opt seccomp=unconfined \ - -t -d -w /var/lib/jenkins "${DOCKER_IMAGE}") - - # shellcheck disable=SC2016 - export COMMAND - # shellcheck disable=SC2016 - COMMAND='((echo "export GRADLE_OFFLINE=1" && echo "export BUILD_LITE_INTERPRETER=${BUILD_LITE_INTERPRETER}" && echo "sudo chown -R jenkins workspace && cd workspace && ./.circleci/scripts/build_android_gradle.sh") | docker exec -u jenkins -i "$id" bash) 2>&1' - echo "${COMMAND}" > ./command.sh && bash 
./command.sh - # Skip docker push as this job is purely for size analysis purpose. - # Result binaries are already in `/home/circleci/project/` as it's mounted instead of copied. - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - # The artifact file is created inside docker container, which contains the result binaries. - # Now unpackage it into the project folder. The subsequent script will scan project folder - # to locate result binaries and report their sizes. - # If artifact file is not provided it assumes that the project folder has been mounted in - # the docker during build and already contains the result binaries, so this step can be skipped. - export ARTIFACTS= - if [ -n "${ARTIFACTS}" ]; then - tar xf "${ARTIFACTS}" -C "${GITHUB_WORKSPACE}" - cd "${GITHUB_WORKSPACE}" - fi - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - ANDROID_BUILD_TYPE=custom-build-single - export ANDROID_BUILD_TYPE - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba "android" || exit 0 - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
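The Build step above relies on a pipe-into-exec pattern: the container is started detached, and the build commands are streamed into a single `docker exec ... bash` so that the exported variables and the gradle script run in one shell session. A stripped-down sketch of that pattern (the workflow passes many more -e flags; DOCKER_IMAGE and BUILD_LITE_INTERPRETER are assumed to be set already):

# Start a long-lived container to exec into (detached, workspace mounted).
id=$(docker run -t -d \
  --user jenkins \
  -v "$(pwd):/var/lib/jenkins/workspace" \
  -w /var/lib/jenkins \
  "${DOCKER_IMAGE}")

# Stream the commands into one interactive bash so the exports are visible
# to the gradle build script that follows them.
(
  echo "export GRADLE_OFFLINE=1"
  echo "export BUILD_LITE_INTERPRETER=${BUILD_LITE_INTERPRETER}"
  echo "sudo chown -R jenkins workspace && cd workspace && ./.circleci/scripts/build_android_gradle.sh"
) | docker exec -u jenkins -i "$id" bash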
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single.yml b/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single.yml deleted file mode 100644 index 7af766ba75aa..000000000000 --- a/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single.yml +++ /dev/null @@ -1,274 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/android_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/android/*' - - 'ciflow/cpu/*' - - 'ciflow/linux/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - # building and testing in a single job since bazel runs only small subset of tests - build-and-test: - runs-on: linux.2xlarge - env: - JOB_BASE_NAME: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-build-and-test - NUM_TEST_SHARDS: 1 - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - 
--password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - name: Output disk space left - run: | - sudo df -H - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Build - run: | - set -e - # Unlike other gradle jobs, it's not worth building libtorch in a separate CI job and share via docker, because: - # 1) Not shareable: it's custom selective build, which is different from default libtorch mobile build; - # 2) Not parallelizable by architecture: it only builds libtorch for one architecture; - - echo "DOCKER_IMAGE: ${DOCKER_IMAGE}" - time docker pull "${DOCKER_IMAGE}" >/dev/null - - export BUILD_LITE_INTERPRETER - BUILD_LITE_INTERPRETER="1" - if [[ "${BUILD_ENVIRONMENT}" == *"full-jit" ]]; then - BUILD_LITE_INTERPRETER="0" - fi - - git submodule sync && git submodule update -q --init --recursive --depth 1 --jobs 0 - # shellcheck disable=SC2016 - export id - id=$(docker run -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e PR_LABELS \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e BUILD_LITE_INTERPRETER \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "$(pwd):/var/lib/jenkins/workspace" \ - --cap-add=SYS_PTRACE \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --security-opt seccomp=unconfined \ - -t -d -w /var/lib/jenkins "${DOCKER_IMAGE}") - - # shellcheck disable=SC2016 - export COMMAND - # shellcheck disable=SC2016 - COMMAND='((echo "export GRADLE_OFFLINE=1" && echo "export BUILD_LITE_INTERPRETER=${BUILD_LITE_INTERPRETER}" && echo "sudo chown -R jenkins workspace && cd workspace && ./.circleci/scripts/build_android_gradle.sh") | docker exec -u jenkins -i "$id" bash) 2>&1' - echo "${COMMAND}" > ./command.sh && bash 
./command.sh - # Skip docker push as this job is purely for size analysis purpose. - # Result binaries are already in `/home/circleci/project/` as it's mounted instead of copied. - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - # The artifact file is created inside docker container, which contains the result binaries. - # Now unpackage it into the project folder. The subsequent script will scan project folder - # to locate result binaries and report their sizes. - # If artifact file is not provided it assumes that the project folder has been mounted in - # the docker during build and already contains the result binaries, so this step can be skipped. - export ARTIFACTS= - if [ -n "${ARTIFACTS}" ]; then - tar xf "${ARTIFACTS}" -C "${GITHUB_WORKSPACE}" - cd "${GITHUB_WORKSPACE}" - fi - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - ANDROID_BUILD_TYPE=custom-build-single - export ANDROID_BUILD_TYPE - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba "android" || exit 0 - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
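The "Check if image should be built" step in the two workflows above implements a simple rule: the docker tag is the tree hash of .circleci/docker, so a rebuild is only needed when that hash differs from the one at the merge-base, while a missing image with an unchanged hash is treated as an error. A condensed sketch of that check (it drops the on-base-branch special case, which falls back to the parent commit):

DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)

# If an image for this tree hash already exists, nothing to do.
if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >/dev/null 2>&1; then
  exit 0
fi

MERGE_BASE=$(git merge-base HEAD "${BASE_REVISION}")
PREVIOUS_DOCKER_TAG=$(git rev-parse "${MERGE_BASE}:.circleci/docker")

if [ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]; then
  # Hash unchanged but no image: the previous image was likely deleted.
  echo "ERROR: previous image is not available for the merge-base of this branch" >&2
  exit 1
fi

echo "::set-output name=rebuild::yes"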
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml deleted file mode 100644 index 06db1e07c519..000000000000 --- a/.github/workflows/generated-win-vs2019-cpu-py3.yml +++ /dev/null @@ -1,304 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/windows_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: win-vs2019-cpu-py3 - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cpu/*' - - 'ciflow/trunk/*' - - 'ciflow/win/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: win-vs2019-cpu-py3 - BUILD_WHEEL: 1 - MAX_JOBS: 8 - CUDA_VERSION: "cpu" - IN_CI: 1 - IS_GHA: 1 - INSTALL_WINDOWS_SDK: 1 - PYTHON_VERSION: "3.8" - PYTORCH_RETRY_TEST_CASES: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - SCCACHE_BUCKET: "ossci-compiler-cache" - VC_PRODUCT: "BuildTools" - VC_VERSION: "" - VS_VERSION: "16.8.6" - VC_YEAR: "2019" - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - no_proxy: localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - USE_CUDA: 0 - -concurrency: - group: win-vs2019-cpu-py3-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - build: - runs-on: "windows.4xlarge" - timeout-minutes: 240 - env: - JOB_BASE_NAME: win-vs2019-cpu-py3-build - http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Install Visual Studio 2019 toolchain - shell: powershell - run: | - .\.circleci\scripts\vs_install.ps1 - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - .jenkins/pytorch/win-build.sh - 
# Upload to github so that people can click and download artifacts - - name: Upload artifacts to s3 - uses: seemethere/upload-artifact-s3@v3 - with: - retention-days: 14 - if-no-files-found: error - name: ${{ env.BUILD_ENVIRONMENT }} - path: C:\${{ github.run_id }}\build-results - - name: Wait until all sessions have drained - shell: powershell - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - - name: Cleanup build-results and workspaces - if: always() - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf "${PYTORCH_FINAL_PACKAGE_DIR}" - rm -rf ./* - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: windows.4xlarge - NUM_TEST_SHARDS: 2 - NUM_TEST_SHARDS_ON_PULL_REQUEST: 2 - PR_BODY: ${{ github.event.pull_request.body }} - NOGPU_RUNNER_TYPE: windows.4xlarge - ENABLE_FORCE_ON_CPU_TEST: '' - RUN_SMOKE_TESTS_ONLY_ON_PR: False - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - timeout-minutes: 240 - env: - JOB_BASE_NAME: win-vs2019-cpu-py3-test - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - TEST_CONFIG: ${{ matrix.config }} - http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Install Visual Studio 2019 toolchain - shell: powershell - run: | - .\.circleci\scripts\vs_install.ps1 - - uses: 
seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - path: C:\${{ github.run_id }}\build-results - - name: Check build-results folder - shell: powershell - run: | - tree /F C:\$Env:GITHUB_RUN_ID\build-results - # Needed for coverage in win-test.sh - - uses: actions/setup-python@v2 - name: Setup Python3 - with: - python-version: '3.x' - - name: Test - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - # Time out the test phase after 3.5 hours - timeout-minutes: 210 - run: | - .jenkins/pytorch/win-test.sh - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Wait until all sessions have drained - shell: powershell - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: win-vs2019-cpu-py3-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Cleanup workspace - if: always() - shell: bash - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf ./* diff --git a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml deleted file mode 100644 index 8e84f9d53475..000000000000 --- 
a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml +++ /dev/null @@ -1,323 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/windows_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: win-vs2019-cuda11.3-py3 - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cuda/*' - - 'ciflow/trunk/*' - - 'ciflow/win/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: win-vs2019-cuda11.3-py3 - BUILD_WHEEL: 1 - MAX_JOBS: 8 - CUDA_VERSION: "11.3" - IN_CI: 1 - IS_GHA: 1 - INSTALL_WINDOWS_SDK: 1 - PYTHON_VERSION: "3.8" - PYTORCH_RETRY_TEST_CASES: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - SCCACHE_BUCKET: "ossci-compiler-cache" - VC_PRODUCT: "BuildTools" - VC_VERSION: "" - VS_VERSION: "16.8.6" - VC_YEAR: "2019" - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - no_proxy: localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TORCH_CUDA_ARCH_LIST: "7.0" - USE_CUDA: 1 - -concurrency: - group: win-vs2019-cuda11.3-py3-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - build: - runs-on: "windows.4xlarge" - timeout-minutes: 240 - env: - JOB_BASE_NAME: win-vs2019-cuda11.3-py3-build - http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Install Visual Studio 2019 toolchain - shell: powershell - run: | - .\.circleci\scripts\vs_install.ps1 - - name: Install Cuda - shell: bash - run: | - .circleci/scripts/windows_cuda_install.sh - - name: Install Cudnn - shell: bash - run: | - .circleci/scripts/windows_cudnn_install.sh - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - .jenkins/pytorch/win-build.sh - # Upload to github so that people can click and download artifacts - - name: Upload artifacts to s3 - uses: seemethere/upload-artifact-s3@v3 - with: - retention-days: 14 - if-no-files-found: error - 
name: ${{ env.BUILD_ENVIRONMENT }} - path: C:\${{ github.run_id }}\build-results - - name: Wait until all sessions have drained - shell: powershell - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - - name: Cleanup build-results and workspaces - if: always() - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf "${PYTORCH_FINAL_PACKAGE_DIR}" - rm -rf ./* - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: windows.8xlarge.nvidia.gpu - NUM_TEST_SHARDS: 2 - NUM_TEST_SHARDS_ON_PULL_REQUEST: 0 - PR_BODY: ${{ github.event.pull_request.body }} - NOGPU_RUNNER_TYPE: windows.4xlarge - ENABLE_FORCE_ON_CPU_TEST: 1 - RUN_SMOKE_TESTS_ONLY_ON_PR: True - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - timeout-minutes: 240 - env: - JOB_BASE_NAME: win-vs2019-cuda11.3-py3-test - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - TEST_CONFIG: ${{ matrix.config }} - http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Install Visual Studio 2019 toolchain - shell: powershell - run: | - .\.circleci\scripts\vs_install.ps1 - - name: Install Cuda - if: ${{ matrix.config != 'force_on_cpu' }} - shell: bash - run: | - .circleci/scripts/windows_cuda_install.sh - - name: Install Cudnn - if: ${{ matrix.config != 'force_on_cpu' }} - shell: 
bash - run: | - .circleci/scripts/windows_cudnn_install.sh - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - path: C:\${{ github.run_id }}\build-results - - name: Check build-results folder - shell: powershell - run: | - tree /F C:\$Env:GITHUB_RUN_ID\build-results - # Needed for coverage in win-test.sh - - uses: actions/setup-python@v2 - name: Setup Python3 - with: - python-version: '3.x' - - name: Test - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - # Time out the test phase after 3.5 hours - timeout-minutes: 210 - run: | - .jenkins/pytorch/win-test.sh - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Wait until all sessions have drained - shell: powershell - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: win-vs2019-cuda11.3-py3-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Cleanup workspace - if: always() - shell: bash - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf ./* diff --git a/.github/workflows/generated-windows-binary-conda-nightly.yml b/.github/workflows/generated-windows-binary-conda-nightly.yml new file mode 
100644 index 000000000000..32dc4f4eb945 --- /dev/null +++ b/.github/workflows/generated-windows-binary-conda-nightly.yml @@ -0,0 +1,3638 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: windows-binary-conda + +on: + push: + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + branches: + - nightly + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/binaries/*' + - 'ciflow/binaries_conda/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BUILD_ENVIRONMENT: windows-binary-conda + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_RETRY_TEST_CASES: 1 + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 1 +concurrency: + group: windows-binary-conda-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + conda-py3_7-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: conda-py3_7-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_7-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_7-cpu-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_7-cpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_7-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_7-cpu-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
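The retry helper that keeps reappearing in these jobs (ECR login, workspace chown, docker pull) is just a three-attempt wrapper with a short back-off, shown on its own here for reference:

retry () {
  # Run the command, retrying twice with a 1s and then a 2s pause.
  "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
}

# Usage, matching the Chown workspace step above:
retry docker pull "${ALPINE_IMAGE}"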
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_7-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
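The DRY_RUN / UPLOAD_CHANNEL gating above is expressed as GitHub Actions `if:` conditions; the following is only a rough bash rendering of the same logic for readability, not part of the workflow (GITHUB_EVENT_NAME, GITHUB_REF and GITHUB_REF_NAME are the standard Actions variables):

if [[ "${GITHUB_EVENT_NAME}" == "push" ]]; then
  # Nightly-branch pushes and non-ciflow tag pushes do a real (non-dry-run) upload.
  if [[ "${GITHUB_REF}" == "refs/heads/nightly" || ( "${GITHUB_REF}" == refs/tags/* && "${GITHUB_REF}" != refs/tags/ciflow/* ) ]]; then
    echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
  fi
  # Release-candidate tags (vX.Y.Z-rcN) additionally route the package to the "test" channel.
  if [[ "${GITHUB_REF}" == refs/tags/* && "${GITHUB_REF}" != refs/tags/ciflow/* && "${GITHUB_REF_NAME}" == *-rc[0-9]* ]]; then
    echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV"
  fi
fi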
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_7-cuda11_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: conda-py3_7-cuda11_3 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_7-cuda11_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_7-cuda11_3-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_7-cuda11_3 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_7-cuda11_3-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_7-cuda11_3-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
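The retry helper defined in the "Chown workspace" step above re-invokes whatever command it is given up to three times, sleeping one and then two seconds between failed attempts, and propagates the exit status of the last attempt. Written out long-hand (retry_verbose is an illustrative name, not part of the workflow):

    retry_verbose () {
      "$@" && return 0   # first attempt
      sleep 1
      "$@" && return 0   # second attempt after a short pause
      sleep 2
      "$@"               # final attempt; its exit status is returned
    }
    retry_verbose docker pull "${ALPINE_IMAGE}"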
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_7-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
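The "Set UPLOAD_CHANNEL" step above switches the upload to the test channel whenever the pushed tag carries an -rcN suffix. The same glob match can be exercised locally, e.g. with a hypothetical tag name:

    GITHUB_REF_NAME="v1.12.0-rc3"
    if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
      echo "release candidate -> upload to the test channel"
    fi
    # "v1.12.0" (no rc suffix) would not match, so UPLOAD_CHANNEL keeps its default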
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_7-cuda11_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: conda-py3_7-cuda11_6 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_7-cuda11_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_7-cuda11_6-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_7-cuda11_6 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_7-cuda11_6-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_7-cuda11_6-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_7-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
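In the "Upload binaries" step above, docker run flags of the form -e NAME with no value attached forward that variable from the step's environment into the container, whereas -e PKG_DIR=/artifacts sets an explicit value; that is how the AWS and Anaconda credentials reach binary_upload.sh without being baked into the image. A minimal stand-alone sketch of the same pattern (SOME_TOKEN is hypothetical):

    export SOME_TOKEN="abc123"
    # inside the container, env will show SOME_TOKEN=abc123 and PKG_DIR=/artifacts
    docker run --rm -e SOME_TOKEN -e PKG_DIR=/artifacts alpine:3.16 env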
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_8-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: conda-py3_8-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_8-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_8-cpu-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_8-cpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_8-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_8-cpu-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
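The "Clean PyTorch checkout" and "Clean pytorch/builder checkout" steps rely on git clean -fxd: -f forces the deletion, -x also removes files normally ignored via .gitignore (build artifacts, caches), and -d removes untracked directories as well. When reproducing this locally it can be worth previewing first:

    git clean -nxd   # dry run: list what would be removed
    git clean -fxd   # actually remove untracked and ignored files and directories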
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_8-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_8-cuda11_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
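The get_ec2_metadata helper in the "Display EC2 information" steps is a thin wrapper around the EC2 instance metadata service at 169.254.169.254; any documented metadata category can be read the same way, for example:

    # availability zone of the current runner (IMDSv1-style request)
    curl -fsSL "http://169.254.169.254/latest/meta-data/placement/availability-zone"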
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: conda-py3_8-cuda11_3 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_8-cuda11_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_8-cuda11_3-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_8-cuda11_3 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_8-cuda11_3-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_8-cuda11_3-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_8-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_8-cuda11_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
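The "Kill containers, clean up images" steps deliberately leave $(docker ps -q) unquoted so that multiple container IDs expand into separate arguments (hence the shellcheck SC2046 suppression), and append || true so an empty container list does not fail the job. A quote-safe equivalent, if one wanted to avoid the suppression, would be:

    # -r: do nothing when there are no running containers
    docker ps -q | xargs -r docker stop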
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: conda-py3_8-cuda11_6 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_8-cuda11_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_8-cuda11_6-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_8-cuda11_6 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_8-cuda11_6-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_8-cuda11_6-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_8-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_9-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: conda-py3_9-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_9-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_9-cpu-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_9-cpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_9-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_9-cpu-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_9-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
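The upload job above only disables `DRY_RUN` for pushes to the `nightly` branch or to release-candidate tags, and it routes RC tags to the `test` channel by matching an `-rc<digit>` suffix on the ref name. A small sketch of that tag check, assuming a `GITHUB_REF_NAME` such as `v1.12.0-rc3` (the version string and the `nightly` default are illustrative, not taken from the workflow):

    #!/usr/bin/env bash
    set -euo pipefail

    # Illustrative ref name; in the workflow this is provided by GitHub Actions.
    GITHUB_REF_NAME="${GITHUB_REF_NAME:-v1.12.0-rc3}"

    # Assumed default channel for this sketch when no RC suffix matches.
    UPLOAD_CHANNEL="nightly"
    if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
      # Release candidates go to the "test" channel, exactly as in the upload job.
      UPLOAD_CHANNEL="test"
    fi
    echo "UPLOAD_CHANNEL=${UPLOAD_CHANNEL}"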
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_9-cuda11_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
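For reference, the "Display EC2 information" step in each of these Windows jobs queries the EC2 instance-metadata endpoint at 169.254.169.254 to log the AMI, instance id, and instance type of the self-hosted runner. A self-contained sketch of that helper (IMDSv1-style, as used in the step; it only returns data when run on an EC2 instance):

    #!/usr/bin/env bash
    set -euo pipefail

    # Fetch a single metadata category from the EC2 instance-metadata service.
    get_ec2_metadata() {
      local category=$1
      curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
    }

    echo "ami-id: $(get_ec2_metadata ami-id)"
    echo "instance-id: $(get_ec2_metadata instance-id)"
    echo "instance-type: $(get_ec2_metadata instance-type)"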
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: conda-py3_9-cuda11_3 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_9-cuda11_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_9-cuda11_3-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_9-cuda11_3 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_9-cuda11_3-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_9-cuda11_3-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_9-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_9-cuda11_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
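As the comment above notes, the first "Populate binary env" step that follows appends `KEY=VALUE` lines to the file named by `$GITHUB_ENV`; that file is how one step exports variables to the later steps of the same job, since each step runs in a fresh shell. (As reproduced here, the third echo simply prints `WIN_PACKAGE_WORK_DIR` to the job log rather than appending it to the env file.) A minimal sketch of the mechanism, meant to run inside a GitHub Actions bash step where the runner provides `RUNNER_TEMP` and `GITHUB_ENV`:

    #!/usr/bin/env bash
    set -euo pipefail

    # Appending KEY=VALUE lines to the file named by $GITHUB_ENV exports those
    # variables to every subsequent step of the same job.
    echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
    echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
    # A later step can then read ${BINARY_ENV_FILE} and
    # ${PYTORCH_FINAL_PACKAGE_DIR} as ordinary environment variables.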
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: conda-py3_9-cuda11_6 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_9-cuda11_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_9-cuda11_6-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_9-cuda11_6 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_9-cuda11_6-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_9-cuda11_6-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_9-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
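The "Upload binaries" step above runs the upload inside a pinned miniconda3 container and forwards the credentials with bare `-e NAME` flags, so the secret values are taken from the runner's environment and never appear on the command line. A reduced sketch of that invocation pattern — the image name, paths, and final command are placeholders, not the workflow's actual `binary_upload.sh` call:

    #!/usr/bin/env bash
    set -euo pipefail

    # Placeholder values for this sketch; the workflow uses a pinned ECR
    # miniconda3 image and runs .circleci/scripts/binary_upload.sh inside it.
    IMAGE="python:3.9-slim"
    ARTIFACTS_DIR="${RUNNER_TEMP:-/tmp}/artifacts"

    # A bare "-e NAME" forwards NAME from the host environment into the
    # container, while "-e NAME=value" sets it explicitly; secrets therefore
    # never show up in the command line or the workflow log.
    docker run --rm -i \
      -e ANACONDA_API_TOKEN \
      -e AWS_ACCESS_KEY_ID \
      -e AWS_SECRET_ACCESS_KEY \
      -e PKG_DIR=/artifacts \
      -v "${ARTIFACTS_DIR}:/artifacts" \
      "${IMAGE}" \
      bash -c 'echo "upload script would run here"'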
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_10-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
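Each upload job ends with the "Kill containers, clean up images" step shown at the start of this hunk: it stops any containers still running and prunes all images so the shared self-hosted runner starts the next job from a clean slate. The same cleanup as a standalone sketch:

    #!/usr/bin/env bash
    # Intentionally not using "set -e": cleanup should not fail the job.

    # "docker ps -q" may expand to nothing, which is fine here.
    # shellcheck disable=SC2046
    docker stop $(docker ps -q) || true

    # Remove all stopped containers, unused networks, and all images.
    docker system prune -af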
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: conda-py3_10-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_10-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cpu-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_10-cpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_10-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cpu-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_10-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_10-cuda11_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: conda-py3_10-cuda11_3 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_10-cuda11_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cuda11_3-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_10-cuda11_3 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_10-cuda11_3-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cuda11_3-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_10-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_10-cuda11_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: conda-py3_10-cuda11_6 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_10-cuda11_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cuda11_6-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_10-cuda11_6 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_10-cuda11_6-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cuda11_6-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_10-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-windows-binary-libtorch-cxx11-abi.yml b/.github/workflows/generated-windows-binary-libtorch-cxx11-abi.yml deleted file mode 100644 index f1ff574a1f7e..000000000000 --- a/.github/workflows/generated-windows-binary-libtorch-cxx11-abi.yml +++ /dev/null @@ -1,4618 +0,0 @@ -# @generated DO NOT EDIT MANUALLY - -# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: windows-binary-libtorch-cxx11-abi - -on: - push: - # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build - branches: - - nightly - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - - 'ciflow/binaries/*' - - 'ciflow/binaries_libtorch/*' - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - ANACONDA_USER: pytorch - AWS_DEFAULT_REGION: us-east-1 - BUILD_ENVIRONMENT: windows-binary-libtorch-cxx11-abi - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - IN_CI: 1 - IS_GHA: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - PR_NUMBER: ${{ github.event.pull_request.number }} - PYTORCH_RETRY_TEST_CASES: 1 - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 -concurrency: - group: windows-binary-libtorch-cxx11-abi-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - libtorch-cpu-shared-with-deps-cxx11-abi-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
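The NOTE above refers to the ${GITHUB_ENV} mechanism: any KEY=VALUE line appended to that file in one step becomes an environment variable for every later step of the same job, which is why the RUNNER_TEMP-derived paths are populated per job rather than in the workflow-level env: block (as the note says, runner.temp is not available at the workflow level). A minimal sketch of the pattern, with a hypothetical MY_WORK_DIR standing in for the real variables:

    # Step 1 of a job: persist a value derived from the runner's temp dir.
    echo "MY_WORK_DIR=${RUNNER_TEMP}/work" >> "${GITHUB_ENV}"

    # Step 2 (any later step in the same job): the variable is now part of
    # the environment, so scripts can use it without re-deriving the path.
    mkdir -p "${MY_WORK_DIR}"
    echo "working in ${MY_WORK_DIR}"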
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cpu-shared-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cpu-shared-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-with-deps-cxx11-abi-build - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-with-deps-cxx11-abi - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cpu-shared-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-with-deps-cxx11-abi-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
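In the ECR login step above, the AWS account id is extracted from the JSON printed by `aws sts get-caller-identity` by grepping for the Account line and cutting on quotes. Shown only as a sketch, the same value can be obtained by letting the CLI do the filtering, which makes the intent of that pipeline explicit:

    #!/usr/bin/env bash
    set -euo pipefail
    # Equivalent to the grep/cut pipeline in the step above.
    AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
    aws ecr get-login-password --region "${AWS_DEFAULT_REGION:-us-east-1}" |
      docker login --username AWS --password-stdin \
        "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION:-us-east-1}.amazonaws.com"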
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
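One detail of the Upload binaries step above worth spelling out: every `-e NAME` flag passed to `docker run` without an `=value` copies NAME from the host environment into the container, so the credentials set in the step's env: block reach binary_upload.sh without ever appearing on the command line. A self-contained sketch of that behavior (SOME_TOKEN and the alpine image are placeholders, not part of the workflow):

    #!/usr/bin/env bash
    set -euo pipefail
    export SOME_TOKEN="value-that-stays-off-the-command-line"

    # `-e SOME_TOKEN` (no value) inherits the variable from this shell;
    # `-e PKG_DIR=/artifacts` sets an explicit value, as in the step above.
    docker run --rm \
      -e SOME_TOKEN \
      -e PKG_DIR=/artifacts \
      alpine:3.15 \
      sh -c 'echo "PKG_DIR=${PKG_DIR}; token length: ${#SOME_TOKEN}"'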
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-shared-without-deps-cxx11-abi-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cpu-shared-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cpu-shared-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-without-deps-cxx11-abi-build - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-without-deps-cxx11-abi - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cpu-shared-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-without-deps-cxx11-abi-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-with-deps-cxx11-abi-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cpu-static-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cpu-static-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-with-deps-cxx11-abi-build - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-static-with-deps-cxx11-abi - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cpu-static-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-with-deps-cxx11-abi-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-static-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-without-deps-cxx11-abi-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cpu-static-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cpu-static-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-without-deps-cxx11-abi-build - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-static-without-deps-cxx11-abi - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cpu-static-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-without-deps-cxx11-abi-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-static-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-shared-with-deps-cxx11-abi-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_1-shared-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_1-shared-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-shared-with-deps-cxx11-abi-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-shared-with-deps-cxx11-abi - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_1-shared-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-shared-with-deps-cxx11-abi-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-shared-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-shared-without-deps-cxx11-abi-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_1-shared-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_1-shared-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-shared-without-deps-cxx11-abi-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-shared-without-deps-cxx11-abi - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_1-shared-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-shared-without-deps-cxx11-abi-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
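The ECR login and image pull in the upload job above wrap flaky network calls in a small retry helper that re-runs the command after one-second and two-second back-offs. The same three-attempt pattern as a standalone sketch (the curl target is only an illustration of a transiently failing command):

# Three-attempt retry helper, as used around `aws ecr get-login-password` and `docker pull`.
set -euo pipefail

retry () {
  # Try once; on failure wait 1s and retry, then wait 2s and retry a final time.
  "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
}

retry curl -fsSL https://example.com/health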
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-shared-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
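The upload job above only turns DRY_RUN off for pushes to the nightly branch or to non-ciflow tags, and routes -rcN tags to the test channel. A bash-only sketch of those checks, assuming GITHUB_EVENT_NAME, GITHUB_REF, and GITHUB_REF_NAME are supplied by the runner; the workflow expresses the first two conditions as step-level if: expressions rather than shell:

# Illustrative folding of the DRY_RUN / UPLOAD_CHANNEL decisions into plain bash.
set -euo pipefail

if [[ "${GITHUB_EVENT_NAME}" == "push" ]]; then
  if [[ "${GITHUB_REF}" == "refs/heads/nightly" ]] || \
     { [[ "${GITHUB_REF}" == refs/tags/* ]] && [[ "${GITHUB_REF}" != refs/tags/ciflow/* ]]; }; then
    echo "DRY_RUN=disabled" >> "${GITHUB_ENV}"
  fi
  # Release-candidate tags (ending in -rc<N>) upload to the "test" channel instead of the default.
  if [[ "${GITHUB_REF}" == refs/tags/* ]] && [[ "${GITHUB_REF_NAME}" == *-rc[0-9]* ]]; then
    echo "UPLOAD_CHANNEL=test" >> "${GITHUB_ENV}"
  fi
fi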
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-static-with-deps-cxx11-abi-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
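The "Kill containers, clean up images" step above leaves the self-hosted Linux runner clean for the next job: `docker ps -q` may expand to nothing (hence the SC2046 suppression) and `|| true` keeps an empty stop from failing the step before the prune. A standalone sketch, with an `xargs -r` variant shown only as an illustrative alternative that avoids the unquoted expansion:

# Runner cleanup as in the step above.
# shellcheck disable=SC2046  # unquoted $(docker ps -q) is intentional; it may be empty
docker stop $(docker ps -q) || true
docker system prune -af

# Alternative that sidesteps SC2046 (not what the workflow uses):
docker ps -q | xargs -r docker stop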
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_1-static-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_1-static-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-static-with-deps-cxx11-abi-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-static-with-deps-cxx11-abi - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_1-static-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-static-with-deps-cxx11-abi-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
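Files written by the containers in the upload job are owned by root, so the "Chown workspace" step above mounts the checkout into a throwaway Alpine container and hands it back to the runner's uid and gid. A minimal sketch, assuming ALPINE_IMAGE is set elsewhere in the workflow (the fallback tag here is only a placeholder):

# Return ownership of the workspace to the runner user after containerized steps.
set -euo pipefail

ALPINE_IMAGE="${ALPINE_IMAGE:-alpine:3.15}"  # the workflow supplies its own image; this default is illustrative
docker pull "${ALPINE_IMAGE}"
docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" \
  chown -R "$(id -u):$(id -g)" .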
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-static-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
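The "Upload binaries" step above forwards credentials into the miniconda container with bare `-e NAME` flags, which copy each value from the host environment without placing it on the command line; on pull_request events the secrets resolve to empty strings, so the upload script effectively dry-runs. A reduced sketch of that invocation (image, mounts, and script path as in the step; the variables are assumed to be exported already):

# Forward credentials into the upload container by name only.
set -euo pipefail

docker run --rm -i \
  -e ANACONDA_API_TOKEN \
  -e AWS_ACCESS_KEY_ID \
  -e AWS_SECRET_ACCESS_KEY \
  -e DRY_RUN \
  -e PKG_DIR=/artifacts \
  -v "${RUNNER_TEMP}/artifacts:/artifacts" \
  -v "${GITHUB_WORKSPACE}:/v" \
  -w /v \
  308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \
  bash -c '.circleci/scripts/binary_upload.sh'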
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-static-without-deps-cxx11-abi-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_1-static-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_1-static-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-static-without-deps-cxx11-abi-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-static-without-deps-cxx11-abi - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_1-static-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-static-without-deps-cxx11-abi-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
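The "Log in to ECR" step above derives the account id from the caller identity and pipes a short-lived registry password into `docker login --password-stdin`, with AWS_RETRY_MODE and AWS_MAX_ATTEMPTS enabling the CLI's own retries on top of the shell-level retry helper. A sketch of the same login, assuming AWS_DEFAULT_REGION is already exported; `--query Account --output text` replaces the step's grep/cut pipeline purely for readability:

# ECR login before pulling the cleanup and upload images.
set -euo pipefail
export AWS_RETRY_MODE=standard AWS_MAX_ATTEMPTS=5

retry () {
  "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
}

AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
retry aws ecr get-login-password --region "${AWS_DEFAULT_REGION}" | docker login --username AWS \
  --password-stdin "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com"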
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-static-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
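The "Preserve github env variables for use in docker" step above snapshots every GITHUB_* variable into a per-run file under /tmp in KEY=value form. A short sketch of producing such a file and, as an assumed illustration of how it can be consumed (the workflow itself does not show this), replaying it into a container via --env-file:

# Snapshot the runner's GITHUB_* variables and, illustratively, replay them in a container.
set -euo pipefail

env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"

# Hypothetical consumption: --env-file accepts the same KEY=value lines
# (values containing newlines would need extra care).
docker run --rm --env-file "/tmp/github_env_${GITHUB_RUN_ID}" alpine:3.15 env | grep '^GITHUB'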
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-shared-with-deps-cxx11-abi-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_3-shared-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-shared-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-with-deps-cxx11-abi-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
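The "Populate binary env" steps above point binary_populate_env.sh at ${RUNNER_TEMP}/env through BINARY_ENV_FILE so that build settings computed once can be re-read by the later build and test scripts. Those scripts are not part of this diff; the sketch below only illustrates the general write-then-source pattern under that assumption, with stand-in values taken from the job's own env block:

# Hedged sketch of an env-file handoff via BINARY_ENV_FILE (the real scripts' contents are not shown here).
set -euo pipefail

BINARY_ENV_FILE="${RUNNER_TEMP}/env"

# Producer (stand-in for binary_populate_env.sh): record the settings once.
cat > "${BINARY_ENV_FILE}" <<EOF
export PACKAGE_TYPE=libtorch
export DESIRED_CUDA=cu113
export LIBTORCH_VARIANT=shared-with-deps
EOF

# Consumer (stand-in for the build/test scripts): load the recorded settings.
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE}"
echo "Building ${PACKAGE_TYPE} for ${DESIRED_CUDA} (${LIBTORCH_VARIANT})"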
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-with-deps-cxx11-abi - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-shared-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-with-deps-cxx11-abi-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-shared-without-deps-cxx11-abi-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_3-shared-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-shared-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-without-deps-cxx11-abi-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-without-deps-cxx11-abi - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-shared-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-without-deps-cxx11-abi-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-static-with-deps-cxx11-abi-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_3-static-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-static-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-with-deps-cxx11-abi-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-with-deps-cxx11-abi - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-static-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-with-deps-cxx11-abi-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-static-without-deps-cxx11-abi-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_3-static-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-static-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-without-deps-cxx11-abi-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-without-deps-cxx11-abi - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-static-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-without-deps-cxx11-abi-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-shared-with-deps-cxx11-abi-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_5-shared-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_5-shared-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-shared-with-deps-cxx11-abi-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-shared-with-deps-cxx11-abi - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_5-shared-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-shared-with-deps-cxx11-abi-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-shared-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-shared-without-deps-cxx11-abi-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_5-shared-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_5-shared-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-shared-without-deps-cxx11-abi-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-shared-without-deps-cxx11-abi - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_5-shared-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-shared-without-deps-cxx11-abi-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-shared-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-static-with-deps-cxx11-abi-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_5-static-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_5-static-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-static-with-deps-cxx11-abi-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-static-with-deps-cxx11-abi - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_5-static-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-static-with-deps-cxx11-abi-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-static-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-static-without-deps-cxx11-abi-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_5-static-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_5-static-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-static-without-deps-cxx11-abi-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-static-without-deps-cxx11-abi - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_5-static-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-static-without-deps-cxx11-abi-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-static-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-windows-binary-libtorch-debug-master.yml b/.github/workflows/generated-windows-binary-libtorch-debug-master.yml new file mode 100644 index 000000000000..04188e958fec --- /dev/null +++ b/.github/workflows/generated-windows-binary-libtorch-debug-master.yml @@ -0,0 +1,247 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: windows-binary-libtorch-debug + +on: + push: + branches: + - master + tags: + - 'ciflow/all/*' + - 'ciflow/trunk/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BUILD_ENVIRONMENT: windows-binary-libtorch-debug + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_RETRY_TEST_CASES: 1 + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 1 +concurrency: + group: windows-binary-libtorch-debug-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + libtorch-cpu-shared-with-deps-debug-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cpu-shared-with-deps-debug + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-shared-with-deps-debug-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-with-deps-debug-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ 
secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-debug + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 diff --git a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml new file mode 100644 index 000000000000..5983d1b4212e --- /dev/null +++ b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml @@ -0,0 +1,3782 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: windows-binary-libtorch-debug + +on: + push: + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + branches: + - nightly + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/binaries/*' + - 'ciflow/binaries_libtorch/*' + workflow_dispatch: + 
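Both workflow files added in this part of the diff start with "@generated DO NOT EDIT MANUALLY" and point at the Jinja template (.github/templates/windows_binary_build_workflow.yml.j2) and the generator script (.github/scripts/generate_ci_workflows.py). The diff does not show the generator's command line, so the invocation below is an assumption; the point is only that edits belong in the template, followed by regeneration:

    # Assumed regeneration flow -- adjust to the generator's actual CLI.
    cd pytorch                                         # repository root
    python .github/scripts/generate_ci_workflows.py    # rewrites .github/workflows/generated-*.yml from the templates
    git diff --stat .github/workflows/                 # review the regenerated workflows before committing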
+env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BUILD_ENVIRONMENT: windows-binary-libtorch-debug + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_RETRY_TEST_CASES: 1 + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 1 +concurrency: + group: windows-binary-libtorch-debug-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + libtorch-cpu-shared-with-deps-debug-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cpu-shared-with-deps-debug + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-shared-with-deps-debug-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-with-deps-debug-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive 
command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-debug + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-shared-with-deps-debug-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-with-deps-debug-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
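The "Populate binary env" steps repeated in the build and test jobs above rely on a standard GitHub Actions mechanism: lines of the form KEY=VALUE appended to the file named by $GITHUB_ENV become environment variables for every later step of the same job, which is how BINARY_ENV_FILE and PYTORCH_FINAL_PACKAGE_DIR reach binary_populate_env.sh and the build/test scripts. A minimal sketch of the mechanism, using the same paths as the jobs above:

    # In an early step of a job: export by appending to the $GITHUB_ENV file.
    echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
    # In any later step of the same job the variable is plain environment:
    mkdir -p "${PYTORCH_FINAL_PACKAGE_DIR}"
    ls -d "${PYTORCH_FINAL_PACKAGE_DIR}"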
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-debug + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-shared-without-deps-debug-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cpu-shared-without-deps-debug + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-shared-without-deps-debug-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-without-deps-debug-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a 
defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-without-deps-debug + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-shared-without-deps-debug-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-without-deps-debug-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-without-deps-debug + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
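From here on the file simply repeats the same build → test → upload triple for each remaining debug libtorch variant. The naming scheme the generator appears to follow for the CPU jobs can be summarized as below; this is an illustration only, and the authoritative list is the generated file itself:

    # Prints the CPU debug job names used in this workflow; the CUDA jobs swap
    # "cpu" for an arch tag such as "cuda11_3".
    for variant in shared-with-deps shared-without-deps static-with-deps static-without-deps; do
      for stage in build test upload; do
        echo "libtorch-cpu-${variant}-debug-${stage}"
      done
    done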
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-with-deps-debug-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cpu-static-with-deps-debug + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-static-with-deps-debug-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-with-deps-debug-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive 
command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-with-deps-debug + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-static-with-deps-debug-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-with-deps-debug-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
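The "Display EC2 information" step that opens each Windows build and test job reads the EC2 instance metadata service at 169.254.169.254 through the small get_ec2_metadata helper. Any other metadata category exposed by that endpoint can be read the same way; the example below is not part of the workflow, just the same pattern applied to one more well-known category:

    # Same curl pattern as get_ec2_metadata above, for the availability zone.
    curl -fsSL "http://169.254.169.254/latest/meta-data/placement/availability-zone"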
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-with-deps-debug + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
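Each generated workflow in this diff also declares a concurrency group near its top, keyed on the pull-request number (falling back to the commit SHA) plus whether the run was manually dispatched, with cancel-in-progress: true. A rough illustration of how that key resolves for a branch push, with hypothetical stand-ins for github.event.pull_request.number, github.sha and github.event_name:

    PR_NUMBER=""        # empty for branch and tag pushes
    SHA="0123abcd"      # stands in for github.sha
    EVENT_NAME="push"
    IS_DISPATCH=$([[ "${EVENT_NAME}" == "workflow_dispatch" ]] && echo true || echo false)
    echo "windows-binary-libtorch-debug-${PR_NUMBER:-${SHA}}-${IS_DISPATCH}"
    # -> windows-binary-libtorch-debug-0123abcd-false; a newer run with the same key cancels this one.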
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-without-deps-debug-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cpu-static-without-deps-debug + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-static-without-deps-debug-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-without-deps-debug-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a 
defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-without-deps-debug + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-static-without-deps-debug-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-without-deps-debug-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-without-deps-debug + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
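The remaining jobs in this file move the same debug libtorch matrix from CPU to CUDA: DESIRED_CUDA becomes cu113 with GPU_ARCH_VERSION 11.3, the test job runs on a GPU runner (windows.8xlarge.nvidia.gpu), and UPLOAD_SUBFOLDER, which the upload jobs set from DESIRED_CUDA, presumably groups the uploaded binaries per CUDA flavor rather than under cpu. The helper below is purely hypothetical and only illustrates the naming relationship visible in those env blocks; the real mapping lives in the generation script:

    # Hypothetical illustration: "cu113" <-> "11.3", "cu102" <-> "10.2".
    desired_cuda_to_version() {
      local digits="${1#cu}"            # strip the "cu" prefix, e.g. "113"
      echo "${digits%?}.${digits: -1}"  # dot before the last digit -> "11.3"
    }
    desired_cuda_to_version cu113       # prints 11.3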
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-shared-with-deps-debug-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cuda11_3-shared-with-deps-debug + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_3-shared-with-deps-debug-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-shared-with-deps-debug-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name 
"LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-shared-with-deps-debug + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_3-shared-with-deps-debug-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-shared-with-deps-debug-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v 
-w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-shared-with-deps-debug + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-shared-without-deps-debug-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
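The job that begins above, libtorch-cuda11_3-shared-without-deps-debug-build, opens another of the build → test → upload triplets that make up this workflow file: each configuration is chained with `needs:` and threads a single artifact name through all three jobs. Stripped down to only the parts that link the jobs (env blocks and step lists omitted), the shape is roughly:

      libtorch-cuda11_3-shared-without-deps-debug-build:
        runs-on: windows.4xlarge
        # builds, then uploads the artifact named libtorch-cuda11_3-shared-without-deps-debug
      libtorch-cuda11_3-shared-without-deps-debug-test:
        needs: libtorch-cuda11_3-shared-without-deps-debug-build
        runs-on: windows.8xlarge.nvidia.gpu
        # downloads the same artifact and runs binary_windows_test.sh
      libtorch-cuda11_3-shared-without-deps-debug-upload:
        needs: libtorch-cuda11_3-shared-without-deps-debug-test
        runs-on: linux.2xlarge
        # downloads the artifact again and hands it to binary_upload.sh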
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cuda11_3-shared-without-deps-debug + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_3-shared-without-deps-debug-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-shared-without-deps-debug-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name 
"LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-shared-without-deps-debug + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_3-shared-without-deps-debug-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-shared-without-deps-debug-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v 
"$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-shared-without-deps-debug + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-static-with-deps-debug-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cuda11_3-static-with-deps-debug + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_3-static-with-deps-debug-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-static-with-deps-debug-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name 
"LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-static-with-deps-debug + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_3-static-with-deps-debug-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-static-with-deps-debug-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v 
-w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-static-with-deps-debug + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-static-without-deps-debug-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cuda11_3-static-without-deps-debug + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_3-static-without-deps-debug-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-static-without-deps-debug-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name 
"LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-static-without-deps-debug + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_3-static-without-deps-debug-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-static-without-deps-debug-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v 
"$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-static-without-deps-debug + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-shared-with-deps-debug-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cuda11_6-shared-with-deps-debug + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_6-shared-with-deps-debug-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-shared-with-deps-debug-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name 
"LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-shared-with-deps-debug + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_6-shared-with-deps-debug-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-shared-with-deps-debug-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v 
-w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-shared-with-deps-debug + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-shared-without-deps-debug-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cuda11_6-shared-without-deps-debug + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_6-shared-without-deps-debug-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-shared-without-deps-debug-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name 
"LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-shared-without-deps-debug + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_6-shared-without-deps-debug-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-shared-without-deps-debug-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v 
"$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-shared-without-deps-debug + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-static-with-deps-debug-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cuda11_6-static-with-deps-debug + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_6-static-with-deps-debug-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-static-with-deps-debug-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name 
"LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-static-with-deps-debug + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_6-static-with-deps-debug-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-static-with-deps-debug-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v 
-w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-static-with-deps-debug + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-static-without-deps-debug-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cuda11_6-static-without-deps-debug + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_6-static-without-deps-debug-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-static-without-deps-debug-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name 
"LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-static-without-deps-debug + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_6-static-without-deps-debug-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-static-without-deps-debug-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v 
"$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-static-without-deps-debug + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-windows-binary-libtorch-pre-cxx11.yml b/.github/workflows/generated-windows-binary-libtorch-pre-cxx11.yml deleted file mode 100644 index e09c0f8052c1..000000000000 --- a/.github/workflows/generated-windows-binary-libtorch-pre-cxx11.yml +++ /dev/null @@ -1,4618 +0,0 @@ -# @generated DO NOT EDIT MANUALLY - -# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: windows-binary-libtorch-pre-cxx11 - -on: - push: - # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build - branches: - - nightly - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - - 'ciflow/binaries/*' - - 'ciflow/binaries_libtorch/*' - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - ANACONDA_USER: pytorch - AWS_DEFAULT_REGION: us-east-1 - BUILD_ENVIRONMENT: windows-binary-libtorch-pre-cxx11 - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - IN_CI: 1 - IS_GHA: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - PR_NUMBER: ${{ github.event.pull_request.number }} - PYTORCH_RETRY_TEST_CASES: 1 - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 -concurrency: - group: windows-binary-libtorch-pre-cxx11-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - libtorch-cpu-shared-with-deps-pre-cxx11-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cpu-shared-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cpu-shared-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-with-deps-pre-cxx11-build - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-with-deps-pre-cxx11 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cpu-shared-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-with-deps-pre-cxx11-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
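
Aside: the "Log in to ECR" step above scrapes the account id out of `aws sts get-caller-identity` with grep/cut and pipes a short-lived password into `docker login`. A condensed sketch of the same flow, using the equivalent `--query` form of the CLI (credentials and AWS_DEFAULT_REGION are assumed to already be in the environment):

    # Requires AWS credentials and AWS_DEFAULT_REGION in the environment.
    AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
    aws ecr get-login-password --region "${AWS_DEFAULT_REGION}" \
      | docker login --username AWS --password-stdin \
          "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com"
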
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
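
Aside: the "Preserve github env variables for use in docker" step snapshots every GITHUB_* variable to a file. A hedged sketch of how such a file could be fed back into a container; the `--env-file` usage is illustrative, not taken from this workflow, and it assumes single-line values:

    # Snapshot the runner's GITHUB_* variables (file name mirrors the step above).
    env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID:-local}"
    # Illustrative re-import into a throwaway container.
    docker run --rm --env-file "/tmp/github_env_${GITHUB_RUN_ID:-local}" \
      alpine:3.16 sh -c 'env | grep ^GITHUB | sort | head'
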
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-shared-without-deps-pre-cxx11-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cpu-shared-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cpu-shared-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-without-deps-pre-cxx11-build - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-without-deps-pre-cxx11 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cpu-shared-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-without-deps-pre-cxx11-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
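
Aside: the "Display EC2 information" steps query the instance metadata service directly. A trimmed sketch of the same helper (IMDSv1-style; it only returns anything meaningful on an EC2 host):

    # 169.254.169.254 is the EC2 instance metadata endpoint.
    get_ec2_metadata() {
      curl -fsSL "http://169.254.169.254/latest/meta-data/$1"
    }
    echo "instance-type: $(get_ec2_metadata instance-type)"
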
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
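
Aside: the recurring "Kill containers, clean up images" step (it closes every upload job in these workflows, and appears again just below) stops whatever is still running and prunes images so the self-hosted runner starts the next job clean. A minimal sketch of that pattern:

    # "docker ps -q" may expand to nothing, so the unquoted expansion is intentional
    # and "|| true" keeps the step from failing when no containers are running.
    docker stop $(docker ps -q) || true
    docker system prune -af   # remove stopped containers, unused networks, and images
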
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-with-deps-pre-cxx11-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
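The `Kill containers, clean up images` step at the top of the block above is what keeps the self-hosted upload runners reusable between jobs. Taken on its own, the cleanup amounts to the following two commands, copied from the step (the `|| true` covers the case where no containers are running):

# Stop every running container; "docker ps -q" may expand to nothing, which
# is why the command is allowed to fail.
# shellcheck disable=SC2046
docker stop $(docker ps -q) || true

# Remove all unused images, networks and build cache so the runner's disk
# does not fill up across workflow runs.
docker system prune -af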
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cpu-static-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cpu-static-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-with-deps-pre-cxx11-build - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-static-with-deps-pre-cxx11 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cpu-static-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-with-deps-pre-cxx11-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
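Every job above opens with a `Display EC2 information` step built around a one-line helper that queries the EC2 instance metadata service. As a standalone sketch (the helper and the three categories are taken directly from the step; it only produces output on an EC2 host, since 169.254.169.254 is the IMDS endpoint):

# Query a single category from the EC2 instance metadata service (IMDSv1).
# Outside EC2 the curl call simply fails because of -f.
get_ec2_metadata() {
  category=$1
  curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}

echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"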
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-static-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
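The `Upload binaries` step above relies on docker's pass-through form of `-e`: `-e NAME` with no value copies NAME from the step's environment into the container, which is how the AWS and Anaconda credentials reach binary_upload.sh without appearing on the command line, while `-e PKG_DIR=/artifacts` sets an explicit value. A tiny generic illustration of that behaviour (the alpine image and the DEMO_TOKEN variable are chosen only for this demo, not taken from the workflow):

# "-e DEMO_TOKEN" without "=value" forwards the variable from the host
# environment; "-e PKG_DIR=/artifacts" sets an explicit value, as in the
# workflow's upload step.
export DEMO_TOKEN="not-a-real-secret"
docker run --rm -e DEMO_TOKEN -e PKG_DIR=/artifacts alpine:3.16 \
  sh -c 'echo "token=${DEMO_TOKEN} pkg_dir=${PKG_DIR}"'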
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-without-deps-pre-cxx11-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cpu-static-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cpu-static-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-without-deps-pre-cxx11-build - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-static-without-deps-pre-cxx11 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cpu-static-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-without-deps-pre-cxx11-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
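The `Populate binary env` steps that recur in every job above use the standard GITHUB_ENV mechanism: appending `NAME=value` lines to the file named by "$GITHUB_ENV" makes those variables visible to all later steps of the same job. A minimal sketch of the mechanism outside of Actions (the temporary files stand in for the runner-provided ones; BINARY_ENV_FILE and PYTORCH_FINAL_PACKAGE_DIR are the names used by the workflow):

# Outside of GitHub Actions, fake the two runner-provided paths so the
# snippet is self-contained; on a real runner both are already set.
RUNNER_TEMP="${RUNNER_TEMP:-$(mktemp -d)}"
GITHUB_ENV="${GITHUB_ENV:-$(mktemp)}"

# Same shape as the workflow step: each appended line becomes an environment
# variable for every subsequent step of the job.
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"

cat "${GITHUB_ENV}"   # shows what later steps will receive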
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-static-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
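The `Preserve github env variables for use in docker` step above snapshots every GITHUB_* variable into a per-run file. This section never shows the consumer of that file, but a file in `NAME=value` form like this one can be handed to a container via docker's `--env-file` option; the sketch below only demonstrates producing and inspecting the snapshot (the /tmp path is the one used in the step, and GITHUB_RUN_ID is normally set by the runner):

# Snapshot every GITHUB_* variable into a per-run file, exactly as the step does.
GITHUB_RUN_ID="${GITHUB_RUN_ID:-local-demo}"   # provided by the runner in CI
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"

# Inspect the snapshot; a later docker invocation could consume it with
# "--env-file /tmp/github_env_${GITHUB_RUN_ID}" (illustrative usage, not a
# step shown in this section).
cat "/tmp/github_env_${GITHUB_RUN_ID}"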
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-shared-with-deps-pre-cxx11-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_1-shared-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_1-shared-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-shared-with-deps-pre-cxx11-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-shared-with-deps-pre-cxx11 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_1-shared-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-shared-with-deps-pre-cxx11-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-shared-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-shared-without-deps-pre-cxx11-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_1-shared-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_1-shared-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-shared-without-deps-pre-cxx11-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-shared-without-deps-pre-cxx11 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_1-shared-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-shared-without-deps-pre-cxx11-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-shared-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-static-with-deps-pre-cxx11-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_1-static-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_1-static-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-static-with-deps-pre-cxx11-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-static-with-deps-pre-cxx11 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_1-static-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-static-with-deps-pre-cxx11-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-static-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-static-without-deps-pre-cxx11-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_1-static-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_1-static-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-static-without-deps-pre-cxx11-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-static-without-deps-pre-cxx11 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_1-static-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-static-without-deps-pre-cxx11-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
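The "Populate binary env" steps in these jobs rely on the runner's GITHUB_ENV file: only lines appended to that file become environment variables for later steps in the same job, while a bare echo merely prints to the log. A minimal local sketch of that hand-off, with temp paths standing in for runner.temp:

```bash
#!/usr/bin/env bash
# Minimal local sketch of the GITHUB_ENV hand-off used by "Populate binary env".
# Inside GitHub Actions the runner re-reads this file between steps; here we
# source it ourselves to mimic a later step. Paths are stand-ins for runner.temp.
set -euo pipefail

RUNNER_TEMP="$(mktemp -d)"
GITHUB_ENV="${RUNNER_TEMP}/github_env"
touch "${GITHUB_ENV}"

# "Step 1": only lines appended to ${GITHUB_ENV} carry over to later steps;
# a bare echo would just log the value.
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"

# "Step 2": sees the appended values as environment variables.
set -a; . "${GITHUB_ENV}"; set +a
echo "final package dir: ${PYTORCH_FINAL_PACKAGE_DIR}"
```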
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-static-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
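The "Set DRY_RUN" and "Set UPLOAD_CHANNEL" steps above together decide whether an upload job really publishes anything and to which channel. A consolidated sketch of that gating; the defaults (DRY_RUN=enabled, UPLOAD_CHANNEL=nightly) are assumed workflow-level values that are set outside this diff:

```bash
#!/usr/bin/env bash
# Sketch of the upload gating in the *-upload jobs. Real uploads only happen on
# pushes to the nightly branch or to release tags (ciflow/* tags excluded), and
# tags with an -rcN suffix are routed to the "test" channel. The defaults below
# are assumptions for the sketch.
set -euo pipefail

event_name="${GITHUB_EVENT_NAME:-pull_request}"
ref="${GITHUB_REF:-refs/heads/main}"
ref_name="${GITHUB_REF_NAME:-${ref##*/}}"

DRY_RUN="enabled"
UPLOAD_CHANNEL="nightly"

if [[ "${event_name}" == "push" ]]; then
  if [[ "${ref}" == "refs/heads/nightly" ]]; then
    DRY_RUN="disabled"
  elif [[ "${ref}" == refs/tags/* && "${ref}" != refs/tags/ciflow/* ]]; then
    DRY_RUN="disabled"
    # e.g. a hypothetical tag v1.11.0-rc3 ends with an RC suffix
    if [[ "${ref_name}" == *-rc[0-9]* ]]; then
      UPLOAD_CHANNEL="test"
    fi
  fi
fi

echo "DRY_RUN=${DRY_RUN} UPLOAD_CHANNEL=${UPLOAD_CHANNEL}"
```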
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-shared-with-deps-pre-cxx11-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_3-shared-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-shared-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-with-deps-pre-cxx11-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-with-deps-pre-cxx11 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-shared-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-with-deps-pre-cxx11-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
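The upload jobs wrap their flaky network calls (the docker pull and the ECR login above) in a small retry helper. Pulled out on its own it looks like the sketch below; the curl target is a hypothetical example, any command line can be passed through:

```bash
#!/usr/bin/env bash
# The three-attempt retry helper used around docker pull / ECR login:
# run the command, and on failure retry after 1s and then 2s before giving up.
retry () {
  "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
}

# Hypothetical example target.
retry curl -fsSL https://example.com -o /dev/null
```

Because the fallbacks are chained with ||, the step only fails if all three attempts fail.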
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
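In the "Upload binaries" step above, each bare "-e NAME" flag forwards the job's current value of NAME into the container, while "-e PKG_DIR=/artifacts" overrides it for the container. A small sketch of the same behaviour:

```bash
#!/usr/bin/env bash
# Sketch of the env passthrough used by "Upload binaries": "-e NAME" with no
# value copies the caller's NAME into the container, "-e NAME=value" overrides.
set -euo pipefail

export UPLOAD_CHANNEL="test"          # forwarded as-is into the container
export PKG_DIR="/tmp/host-artifacts"  # overridden to /artifacts below

docker run --rm \
  -e UPLOAD_CHANNEL \
  -e PKG_DIR=/artifacts \
  alpine:3.16 \
  sh -c 'echo "channel=${UPLOAD_CHANNEL} pkg_dir=${PKG_DIR}"'
```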
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-shared-without-deps-pre-cxx11-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_3-shared-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-shared-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-without-deps-pre-cxx11-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-without-deps-pre-cxx11 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-shared-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-without-deps-pre-cxx11-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
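The "Chown workspace" step above exists because earlier containers may have written files into the workspace as root; the job bind-mounts the checkout into a throwaway Alpine container and chowns everything back to the runner user. A standalone sketch, using a public alpine tag as a stand-in for whatever ALPINE_IMAGE resolves to on these runners:

```bash
#!/usr/bin/env bash
# Sketch of the "Chown workspace" fix-up: files written as root by previous
# containers are handed back to the runner user by chowning the bind-mounted
# workspace from inside a throwaway container.
set -euo pipefail

# Stand-in; the workflow resolves ALPINE_IMAGE from its own environment.
ALPINE_IMAGE="${ALPINE_IMAGE:-alpine:3.16}"

docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" \
  chown -R "$(id -u):$(id -g)" .
```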
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
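The "Preserve github env variables for use in docker" step above dumps every GITHUB_* variable to a per-run file under /tmp. How that file is consumed is not shown in this diff; the --env-file usage in the sketch below is only an assumption about one way a later container could read it back:

```bash
#!/usr/bin/env bash
# Sketch of the "Preserve github env variables" step plus one hypothetical
# consumer. The dump itself matches the workflow step; feeding the file back
# in through --env-file is an illustration, not the workflow's actual consumer.
set -euo pipefail

GITHUB_RUN_ID="${GITHUB_RUN_ID:-local-test}"
export GITHUB_REPOSITORY="${GITHUB_REPOSITORY:-pytorch/pytorch}"

env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"

# Hypothetical consumer: a container picking the variables back up.
docker run --rm --env-file "/tmp/github_env_${GITHUB_RUN_ID}" alpine:3.16 \
  sh -c 'env | grep ^GITHUB'
```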
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-static-with-deps-pre-cxx11-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_3-static-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-static-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-with-deps-pre-cxx11-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-with-deps-pre-cxx11 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-static-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-with-deps-pre-cxx11-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
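Each of these self-hosted Linux upload jobs finishes with the "Kill containers, clean up images" step that follows. The "|| true" is there because "docker ps -q" can legitimately print nothing, which would make a bare "docker stop" fail; a slightly more explicit rendering of the same idiom:

```bash
#!/usr/bin/env bash
# Sketch of the runner clean-up idiom: stop whatever containers are still
# running (there may be none), then prune all unused images without prompting.
set -euo pipefail

running_containers="$(docker ps -q)"
if [[ -n "${running_containers}" ]]; then
  # shellcheck disable=SC2086  # word splitting of the id list is intended
  docker stop ${running_containers}
fi
docker system prune -af
```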
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-static-without-deps-pre-cxx11-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_3-static-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-static-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-without-deps-pre-cxx11-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-without-deps-pre-cxx11 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-static-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-without-deps-pre-cxx11-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
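The "Log in to ECR" step above derives the AWS account id by grepping the JSON from "aws sts get-caller-identity" and then pipes a short-lived registry password into "docker login --password-stdin". The sketch below keeps the same flow but uses the CLI's --query option for the account id instead of grep/cut; the region default is only an assumption for the sketch:

```bash
#!/usr/bin/env bash
# Sketch of the ECR login flow: resolve the account id, then pipe a temporary
# registry password straight into docker login (nothing is echoed to the log).
set -euo pipefail

: "${AWS_DEFAULT_REGION:=us-east-1}"   # assumption; the job gets this from its runner env

AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"
aws ecr get-login-password --region "${AWS_DEFAULT_REGION}" \
  | docker login --username AWS --password-stdin \
      "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com"
```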
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-shared-with-deps-pre-cxx11-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_5-shared-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_5-shared-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-shared-with-deps-pre-cxx11-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-shared-with-deps-pre-cxx11 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_5-shared-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-shared-with-deps-pre-cxx11-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-shared-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
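# Note: the two conditional steps above gate real uploads on the pushed ref. DRY_RUN is
# set to "disabled" (i.e. uploads actually run) only for pushes to the nightly branch or
# to non-ciflow tags, and UPLOAD_CHANNEL flips to "test" for release-candidate tags such
# as v1.11.0-rc1. A rough shell sketch of that second decision, assuming GITHUB_REF_NAME
# is populated as in the workflow (illustration only):

if [[ "${GITHUB_REF_NAME}" == *-rc[0-9]* ]]; then
  echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV"   # RC tags are routed to the test channel
fi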
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-shared-without-deps-pre-cxx11-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_5-shared-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_5-shared-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-shared-without-deps-pre-cxx11-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-shared-without-deps-pre-cxx11 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_5-shared-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-shared-without-deps-pre-cxx11-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
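# Note: get_ec2_metadata in the "Display EC2 information" steps queries the EC2 instance
# metadata service at 169.254.169.254 (IMDSv1-style, no session token). The same helper
# as a standalone sketch (illustrative only):

get_ec2_metadata() {
  # each category (ami-id, instance-id, instance-type, ...) is a separate metadata path
  category=$1
  curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "instance-type: $(get_ec2_metadata instance-type)"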
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-shared-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
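# Note: the "Upload binaries" step never bakes credentials into the image; each secret is
# exported on the runner and forwarded into the miniconda container with bare `-e NAME`
# flags, while the artifact directory is bind-mounted at /artifacts. A trimmed sketch of
# the invocation shape (some flags omitted for brevity):

docker run --rm -i \
  -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e ANACONDA_API_TOKEN \
  -e PKG_DIR=/artifacts \
  -v "${RUNNER_TEMP}/artifacts:/artifacts" \
  -v "${GITHUB_WORKSPACE}:/v" -w /v \
  308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \
  bash -c '.circleci/scripts/binary_upload.sh'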
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-static-with-deps-pre-cxx11-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_5-static-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_5-static-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-static-with-deps-pre-cxx11-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-static-with-deps-pre-cxx11 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_5-static-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-static-with-deps-pre-cxx11-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
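# Note: each *-build job publishes its output with seemethere/upload-artifact-s3 under a
# name such as libtorch-cuda11_5-static-with-deps-pre-cxx11, and the matching *-test and
# *-upload jobs fetch it back with download-artifact-s3 using the identical name.
# PYTORCH_FINAL_PACKAGE_DIR (set in "Populate binary env") is both the upload source and
# the download destination, so names and paths must stay in sync when variants are added.
# A quick sanity check of that contract on the runner might look like (illustrative only):

test -d "${PYTORCH_FINAL_PACKAGE_DIR}" && ls -lah "${PYTORCH_FINAL_PACKAGE_DIR}"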
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-static-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
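# Note: "Preserve github env variables for use in docker" snapshots every GITHUB_* variable
# into /tmp/github_env_${GITHUB_RUN_ID} in KEY=value form. How that file is consumed is not
# shown in this diff; one plausible use is feeding it to a later container via --env-file.
# The sketch below is an assumption for illustration only (alpine:3 is just a placeholder image):

env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
docker run --rm --env-file "/tmp/github_env_${GITHUB_RUN_ID}" alpine:3 env | grep '^GITHUB'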
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-static-without-deps-pre-cxx11-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_5-static-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_5-static-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-static-without-deps-pre-cxx11-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-static-without-deps-pre-cxx11 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_5-static-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-static-without-deps-pre-cxx11-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
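# Note: the "Wait until all sessions have drained" and "Hold runner" steps keep the
# instance alive while a debugging SSH session (enabled via add-github-ssh-key) is still
# attached, bounded by the step timeout. The scripts themselves are not part of this diff;
# a rough idea of the loop they implement, purely as an assumption for illustration:

while [ -n "$(who)" ]; do
  sleep 5   # an interactive session is still logged in; keep the runner alive
done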
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-static-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-windows-binary-libtorch-release-master.yml b/.github/workflows/generated-windows-binary-libtorch-release-master.yml new file mode 100644 index 000000000000..422cbb27cbb7 --- /dev/null +++ b/.github/workflows/generated-windows-binary-libtorch-release-master.yml @@ -0,0 +1,247 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: windows-binary-libtorch-release + +on: + push: + branches: + - master + tags: + - 'ciflow/all/*' + - 'ciflow/trunk/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BUILD_ENVIRONMENT: windows-binary-libtorch-release + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_RETRY_TEST_CASES: 1 + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 1 +concurrency: + group: windows-binary-libtorch-release-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + libtorch-cpu-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cpu-shared-with-deps-release + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-with-deps-release-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: 
${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-release + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 diff --git a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml new file mode 100644 index 000000000000..2ecfafae499f --- /dev/null +++ b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml @@ -0,0 +1,3782 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: windows-binary-libtorch-release + +on: + push: + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + branches: + - nightly + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/binaries/*' + - 'ciflow/binaries_libtorch/*' + 
workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BUILD_ENVIRONMENT: windows-binary-libtorch-release + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_RETRY_TEST_CASES: 1 + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 1 +concurrency: + group: windows-binary-libtorch-release-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + libtorch-cpu-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cpu-shared-with-deps-release + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-with-deps-release-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a 
defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-release + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-shared-with-deps-release-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-with-deps-release-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
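# Note: the checkout steps above pick the ref with the expression
# `github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha`,
# which is the GitHub Actions idiom for a ternary: the PR head SHA when the workflow was
# triggered by a pull request, otherwise the pushed SHA. The same decision in plain shell
# (illustration only; PR_HEAD_SHA is a hypothetical variable holding the PR head SHA):

if [ "${GITHUB_EVENT_NAME}" = "pull_request" ]; then
  CHECKOUT_SHA="${PR_HEAD_SHA}"
else
  CHECKOUT_SHA="${GITHUB_SHA}"
fi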
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-release + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-shared-without-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
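The "Display EC2 information" step above queries the instance metadata service. The sketch below mirrors that step almost verbatim (a `local` keyword is added); it is only meaningful when run on an EC2 instance, since the 169.254.169.254 endpoint is not reachable elsewhere:

    #!/usr/bin/env bash
    # Query the EC2 instance metadata service for a few identifiers, as the
    # "Display EC2 information" step does at the start of each Windows job.
    set -euo pipefail

    get_ec2_metadata() {
      # Pulled from the instance metadata endpoint, see
      # https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
      local category=$1
      curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
    }

    echo "ami-id: $(get_ec2_metadata ami-id)"
    echo "instance-id: $(get_ec2_metadata instance-id)"
    echo "instance-type: $(get_ec2_metadata instance-type)"
    echo "system info $(uname -a)"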
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cpu-shared-without-deps-release + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-shared-without-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-without-deps-release-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's 
just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-without-deps-release + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-shared-without-deps-release-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-without-deps-release-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
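The "Populate binary env" step above relies on the GitHub Actions convention that lines appended to the file named by $GITHUB_ENV become environment variables for all later steps in the same job. A hedged local sketch, with fallbacks that exist only so it can run outside a runner:

    #!/usr/bin/env bash
    # Sketch of the "Populate binary env" step: values appended to $GITHUB_ENV are
    # exported to subsequent steps of the same job by the Actions runner.
    set -euo pipefail

    RUNNER_TEMP="${RUNNER_TEMP:-$(mktemp -d)}"          # provided by the runner normally
    GITHUB_ENV="${GITHUB_ENV:-${RUNNER_TEMP}/github_env}"

    echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
    echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
    # Mirrors the workflow as written: this last line is only echoed to the log,
    # not appended to ${GITHUB_ENV}.
    echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"

    cat "${GITHUB_ENV}"

Note that in the step as written the WIN_PACKAGE_WORK_DIR line has no `>> "${GITHUB_ENV}"` redirection, so unlike the other two values it is not exported to later steps; whether that is intentional is not clear from this diff.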
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-without-deps-release + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
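The "Upload binaries" step above passes its configuration into the miniconda3 container with a mix of `-e NAME` and `-e NAME=value` flags. The distinction matters: `-e NAME` forwards the variable's value from the step's environment, while `-e NAME=value` sets an explicit value, which is why PKG_DIR is remapped to the in-container /artifacts mount while the credentials are forwarded as-is. A minimal demonstration (the alpine image and the UPLOAD_CHANNEL value are placeholders, not taken from the workflow):

    #!/usr/bin/env bash
    # Demonstrate docker's two -e forms as used by the "Upload binaries" step.
    set -euo pipefail

    export UPLOAD_CHANNEL=test
    docker run --rm \
      -e UPLOAD_CHANNEL \
      -e PKG_DIR=/artifacts \
      alpine:3.16 \
      sh -c 'echo "UPLOAD_CHANNEL=${UPLOAD_CHANNEL:-unset} PKG_DIR=${PKG_DIR:-unset}"'

Running this prints the forwarded channel value alongside the overridden container-side PKG_DIR path.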
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cpu-static-with-deps-release + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-static-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-with-deps-release-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a 
defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-with-deps-release + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-static-with-deps-release-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-with-deps-release-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
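The paired "Clean PyTorch checkout" and "Clean pytorch/builder checkout" steps above exist because these are persistent self-hosted runners: `git clean -fxd` removes untracked files (including ignored ones, via -x) and untracked directories (-d) left behind by a previous run. A small sketch assuming the same checkout layout, with pytorch/ and builder/ under the workspace root:

    #!/usr/bin/env bash
    # Clean leftover build artifacts from both checkouts, as the two
    # "Clean ... checkout" steps do for pytorch/ and builder/.
    set -euo pipefail

    for checkout in pytorch builder; do
      if [ -d "${checkout}/.git" ]; then
        git -C "${checkout}" clean -fxd
      fi
    done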
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-with-deps-release + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-without-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
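The CPU jobs above and the cuda11_3 jobs that follow are near-identical copies that differ only in DESIRED_CUDA and LIBTORCH_VARIANT; the job and artifact names follow the pattern libtorch-<cuda>-<variant>-<config>-{build,test,upload}. The loop below is purely illustrative (it is not the actual workflow generator) and only prints the names that appear in this file:

    #!/usr/bin/env bash
    # Illustrative sketch of the naming scheme behind the generated jobs: one
    # build/test/upload triple per (cuda, libtorch variant) pair, release config.
    set -euo pipefail

    config=release
    for cuda in cpu cuda11_3; do
      for variant in shared-with-deps shared-without-deps static-with-deps static-without-deps; do
        for stage in build test upload; do
          echo "libtorch-${cuda}-${variant}-${config}-${stage}"
        done
      done
    done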
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cpu-static-without-deps-release + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-static-without-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-without-deps-release-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's 
just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-without-deps-release + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-static-without-deps-release-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-without-deps-release-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-without-deps-release + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
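In the cuda11_3 jobs that begin here, the same CUDA toolchain is identified twice: GPU_ARCH_VERSION uses the dotted form ("11.3") while the legacy DESIRED_CUDA uses the compact form ("cu113"), with GPU_ARCH_TYPE distinguishing cpu from cuda builds. The sketch below only illustrates that mapping; it is an assumption about the relationship between the two values, not code taken from the workflow generator:

    #!/usr/bin/env bash
    # Illustrative mapping between GPU_ARCH_TYPE/GPU_ARCH_VERSION and the legacy
    # DESIRED_CUDA identifier used throughout these jobs.
    set -euo pipefail

    to_desired_cuda() {
      local arch_type=$1 arch_version=${2:-}
      if [[ "${arch_type}" == "cpu" ]]; then
        echo "cpu"
      else
        # 11.3 -> cu113
        echo "cu${arch_version//./}"
      fi
    }

    to_desired_cuda cpu          # prints: cpu
    to_desired_cuda cuda 11.3    # prints: cu113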
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cuda11_3-shared-with-deps-release + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_3-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-shared-with-deps-release-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name 
"LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-shared-with-deps-release + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_3-shared-with-deps-release-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-shared-with-deps-release-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v 
"$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-shared-with-deps-release + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-shared-without-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cuda11_3-shared-without-deps-release + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_3-shared-without-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-shared-without-deps-release-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path 
"HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-shared-without-deps-release + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_3-shared-without-deps-release-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-shared-without-deps-release-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets 
chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-shared-without-deps-release + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
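Editor's note: the two conditional steps above ("Set DRY_RUN (only for tagged pushes)" and "Set UPLOAD_CHANNEL (only for tagged pushes)") are what decide whether binary_upload.sh publishes anything and to which channel. A minimal bash sketch of that decision logic follows; the defaults (DRY_RUN=enabled, UPLOAD_CHANNEL=nightly) are assumptions standing in for workflow-level values not shown in this diff.

    #!/usr/bin/env bash
    # Sketch of the upload gating used by the *-upload jobs above.
    # The defaults below are assumptions, not values taken from the workflow.
    set -euo pipefail

    GITHUB_EVENT_NAME="${GITHUB_EVENT_NAME:-pull_request}"
    GITHUB_REF="${GITHUB_REF:-refs/heads/some-branch}"
    GITHUB_REF_NAME="${GITHUB_REF_NAME:-some-branch}"

    DRY_RUN=enabled          # assumed default: do not actually publish
    UPLOAD_CHANNEL=nightly   # assumed default channel

    if [[ "${GITHUB_EVENT_NAME}" == "push" ]]; then
      # Real uploads only for pushes to the nightly branch or non-ciflow tags.
      if [[ "${GITHUB_REF}" == "refs/heads/nightly" ]] || \
         { [[ "${GITHUB_REF}" == refs/tags/* ]] && [[ "${GITHUB_REF}" != refs/tags/ciflow/* ]]; }; then
        DRY_RUN=disabled
      fi
      # A tag whose name ends in -rcN is routed to the test channel.
      if [[ "${GITHUB_REF}" == refs/tags/* ]] && [[ "${GITHUB_REF}" != refs/tags/ciflow/* ]] && \
         [[ "${GITHUB_REF_NAME}" == *-rc[0-9]* ]]; then
        UPLOAD_CHANNEL=test
      fi
    fi

    echo "DRY_RUN=${DRY_RUN} UPLOAD_CHANNEL=${UPLOAD_CHANNEL}"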
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-static-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
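Editor's note: the NOTE comment above relies on the standard GitHub Actions mechanism of appending KEY=VALUE lines to the file named by $GITHUB_ENV, so that later steps in the same job see them as ordinary environment variables. A local simulation of that pattern, with RUNNER_TEMP and GITHUB_ENV stubbed out since they are normally supplied by the runner:

    #!/usr/bin/env bash
    # Local simulation of the $GITHUB_ENV hand-off used by "Populate binary env".
    set -euo pipefail
    RUNNER_TEMP="$(mktemp -d)"          # stub for the runner-provided temp dir
    GITHUB_ENV="${RUNNER_TEMP}/github_env"

    # "Step 1": append KEY=VALUE lines, one per variable.
    echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
    echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"

    # "Step 2": on a real runner these become env vars automatically; locally
    # we approximate that by sourcing the file with auto-export enabled.
    set -a; source "${GITHUB_ENV}"; set +a
    echo "artifacts will land in: ${PYTORCH_FINAL_PACKAGE_DIR}"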
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cuda11_3-static-with-deps-release + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_3-static-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-static-with-deps-release-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name 
"LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-static-with-deps-release + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_3-static-with-deps-release-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-static-with-deps-release-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v 
"$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-static-with-deps-release + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-static-without-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
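Editor's note: every job in this section follows the same naming scheme, deriving the artifact and job names from PACKAGE_TYPE, the CUDA version, LIBTORCH_VARIANT, and LIBTORCH_CONFIG (for example libtorch-cuda11_6-shared-with-deps-release). The sketch below is a reading aid that reassembles such a name from the fields in the env block above; it is not the actual workflow generator.

    #!/usr/bin/env bash
    # Illustrative only: rebuild the artifact/job name prefix from the env
    # fields shown in each job's env block.
    set -euo pipefail

    PACKAGE_TYPE=libtorch
    GPU_ARCH_TYPE=cuda
    GPU_ARCH_VERSION=11.6
    LIBTORCH_VARIANT=shared-with-deps
    LIBTORCH_CONFIG=release

    # 11.6 -> 11_6, matching the "cuda11_6" infix (cf. DESIRED_CUDA=cu116).
    cuda_tag="${GPU_ARCH_TYPE}${GPU_ARCH_VERSION//./_}"
    artifact="${PACKAGE_TYPE}-${cuda_tag}-${LIBTORCH_VARIANT}-${LIBTORCH_CONFIG}"

    echo "${artifact}"                                  # artifact name
    echo "${artifact}-build -> ${artifact}-test -> ${artifact}-upload"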
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cuda11_3-static-without-deps-release + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_3-static-without-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-static-without-deps-release-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path 
"HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-static-without-deps-release + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_3-static-without-deps-release-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-static-without-deps-release-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets 
chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-static-without-deps-release + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
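Editor's note: the "Upload binaries" step runs .circleci/scripts/binary_upload.sh inside a container and forwards only an explicit allow-list of variables: `-e NAME` copies the variable from the host environment, while `-e NAME=value` sets it explicitly for the container. A minimal, runnable sketch of that passthrough pattern; the alpine image and the env dump are stand-ins for the real upload image and script, and the values are placeholders rather than real credentials.

    #!/usr/bin/env bash
    # Sketch of the env passthrough used by "Upload binaries".
    set -euo pipefail

    export PACKAGE_TYPE=libtorch
    export UPLOAD_SUBFOLDER=cu116
    export DRY_RUN=enabled              # placeholder; see the gating sketch earlier
    workspace="$(pwd)"
    artifacts_dir="${RUNNER_TEMP:-/tmp}/artifacts"

    docker run --rm -i \
      -e PACKAGE_TYPE \
      -e UPLOAD_SUBFOLDER \
      -e DRY_RUN \
      -e PKG_DIR=/artifacts \
      -v "${artifacts_dir}:/artifacts" \
      -v "${workspace}:/v" \
      -w /v \
      alpine:3.18 \
      sh -c 'env | grep -E "PACKAGE_TYPE|UPLOAD_SUBFOLDER|DRY_RUN|PKG_DIR"'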
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
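Editor's note: the "Kill containers, clean up images" step above deliberately leaves `docker ps -q` unquoted (hence the SC2046 suppression) and appends `|| true` so an empty container list does not fail the step. An alternative sketch of the same cleanup that avoids the unquoted expansion entirely; this is just an equivalent phrasing, not what the workflow runs.

    #!/usr/bin/env bash
    # Stop any containers still running on the self-hosted runner and drop
    # unused images, without relying on unquoted command substitution.
    set -euo pipefail

    # `xargs -r` (GNU xargs) skips the docker call when no IDs are listed.
    docker ps -q | xargs -r docker stop

    # Remove stopped containers, dangling images, unused networks, etc.
    docker system prune -af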
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cuda11_6-shared-with-deps-release + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_6-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-shared-with-deps-release-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name 
"LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-shared-with-deps-release + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_6-shared-with-deps-release-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-shared-with-deps-release-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v 
"$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-shared-with-deps-release + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-shared-without-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cuda11_6-shared-without-deps-release + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_6-shared-without-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-shared-without-deps-release-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path 
"HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-shared-without-deps-release + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_6-shared-without-deps-release-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-shared-without-deps-release-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets 
chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-shared-without-deps-release + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-static-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cuda11_6-static-with-deps-release + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_6-static-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-static-with-deps-release-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name 
"LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-static-with-deps-release + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_6-static-with-deps-release-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-static-with-deps-release-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v 
"$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-static-with-deps-release + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-static-without-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
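The "Populate binary env" step that follows (and that opens every Windows job in these workflows) relies on the $GITHUB_ENV mechanism: any KEY=value line appended to that file is exported by the runner into all later steps of the same job, which is also why the preceding NOTE explains that these variables cannot be set at workflow level, where runner.temp is not available. A minimal sketch of the mechanism, using a made-up EXAMPLE_DIR in place of BINARY_ENV_FILE and PYTORCH_FINAL_PACKAGE_DIR; note that, as quoted above, the WIN_PACKAGE_WORK_DIR line is only echoed to the log rather than appended to ${GITHUB_ENV}.

name: github-env-sketch   # hypothetical workflow name, for illustration only
on: workflow_dispatch
jobs:
  demo:
    runs-on: ubuntu-latest
    steps:
      - name: Populate env
        shell: bash
        run: |
          # Lines written to $GITHUB_ENV become environment variables for every
          # subsequent step of this job (not for the current step itself).
          echo "EXAMPLE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
      - name: Use env
        shell: bash
        run: |
          mkdir -p "${EXAMPLE_DIR}"
          echo "artifacts will be staged in ${EXAMPLE_DIR}"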
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cuda11_6-static-without-deps-release + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_6-static-without-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-static-without-deps-release-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path 
"HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-static-without-deps-release + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_6-static-without-deps-release-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-static-without-deps-release-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets 
chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-static-without-deps-release + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
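The "Chown workspace" step above wraps the initial docker pull of the Alpine image in a tiny retry function that re-runs the command after 1- and then 2-second pauses, which papers over transient registry or network hiccups on the self-hosted runners. A slightly generalized sketch of the same idea, with the attempt count and backoff made explicit (the function body and variable names here are illustrative, not part of the generated workflow):

name: retry-helper-sketch   # hypothetical workflow name, for illustration only
on: workflow_dispatch
jobs:
  pull:
    runs-on: ubuntu-latest
    steps:
      - name: Pull image with retries
        shell: bash
        env:
          IMAGE: alpine:3.18   # stand-in for ${ALPINE_IMAGE}
        run: |
          # retry CMD...: run CMD, retrying with a growing pause between attempts
          retry () {
            local attempts=3 delay=1 i
            for ((i = 1; i <= attempts; i++)); do
              if "$@"; then
                return 0
              fi
              if (( i < attempts )); then
                echo "attempt ${i}/${attempts} failed, retrying in ${delay}s" >&2
                sleep "${delay}"
                delay=$((delay * 2))
              fi
            done
            return 1
          }
          retry docker pull "${IMAGE}"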
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-windows-binary-wheel-master.yml b/.github/workflows/generated-windows-binary-wheel-master.yml new file mode 100644 index 000000000000..befb73dd15c2 --- /dev/null +++ b/.github/workflows/generated-windows-binary-wheel-master.yml @@ -0,0 +1,241 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: windows-binary-wheel + +on: + push: + branches: + - master + tags: + - 'ciflow/all/*' + - 'ciflow/trunk/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BUILD_ENVIRONMENT: windows-binary-wheel + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_RETRY_TEST_CASES: 1 + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 1 +concurrency: + group: windows-binary-wheel-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + wheel-py3_7-cuda11_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
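The concurrency block in the generated-windows-binary-wheel-master.yml header above keys every run on the pull-request number (falling back to the commit SHA for branch pushes) plus a flag for manual dispatches, and cancels any in-flight run that shares the key, so a superseded commit does not keep hour-long Windows binary builds alive. A minimal sketch of the same pattern under an assumed workflow name:

name: concurrency-sketch   # hypothetical workflow name, for illustration only
on:
  push:
  pull_request:
  workflow_dispatch:
concurrency:
  # One active run per PR (or per commit on push); manual dispatches get their own key
  group: concurrency-sketch-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - run: echo "only the newest run for this concurrency key keeps running"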
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: wheel-py3_7-cuda11_3 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_7-cuda11_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_7-cuda11_3-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_7-cuda11_3 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 diff --git a/.github/workflows/generated-windows-binary-wheel-nightly.yml b/.github/workflows/generated-windows-binary-wheel-nightly.yml new file mode 100644 index 000000000000..12a8b5661f4e --- /dev/null +++ b/.github/workflows/generated-windows-binary-wheel-nightly.yml @@ -0,0 +1,3638 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: windows-binary-wheel + +on: + push: + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + branches: + - nightly + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/binaries/*' + - 'ciflow/binaries_wheel/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BUILD_ENVIRONMENT: windows-binary-wheel + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_RETRY_TEST_CASES: 1 + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 1 +concurrency: + group: windows-binary-wheel-${{ 
github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + wheel-py3_7-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
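Each binary job opens with a "Display EC2 information" step whose get_ec2_metadata helper queries the instance metadata service at 169.254.169.254 so the logs record which AMI and instance type the self-hosted runner was scheduled on. The sketch below isolates that helper; the commented IMDSv2 variant is my own addition for instances that require a session token and is not part of the generated step, which uses the plain endpoint:

name: ec2-metadata-sketch   # hypothetical workflow name, for illustration only
on: workflow_dispatch
jobs:
  show:
    runs-on: [self-hosted]   # only meaningful on an EC2-backed self-hosted runner
    steps:
      - name: Display EC2 information
        shell: bash
        run: |
          set -euo pipefail
          # Fetch one metadata category from the EC2 instance metadata service.
          get_ec2_metadata() {
            category=$1
            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
          echo "instance-type: $(get_ec2_metadata instance-type)"
          echo "system info $(uname -a)"
          # Assumption: if the instance enforces IMDSv2, a session token is needed first;
          # the generated workflows use the plain IMDSv1 endpoint shown above.
          # token=$(curl -fsSL -X PUT "http://169.254.169.254/latest/api/token" \
          #   -H "X-aws-ec2-metadata-token-ttl-seconds: 60")
          # curl -fsSL -H "X-aws-ec2-metadata-token: ${token}" \
          #   "http://169.254.169.254/latest/meta-data/instance-type"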
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: wheel-py3_7-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_7-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_7-cpu-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_7-cpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_7-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_7-cpu-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
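For every configuration, the jobs above form one pipeline: the build job stages its output in ${PYTORCH_FINAL_PACKAGE_DIR} and publishes it to S3 under a configuration-specific artifact name, the test job declares needs: on the build job and downloads that same artifact onto its runner, and the upload job downloads it once more on a Linux runner before publishing. The skeleton below keeps only that plumbing; the placeholder steps stand in for the real build/test/upload scripts, and the runner labels and S3 actions are the ones used in these workflows (so they assume pytorch's self-hosted fleet and its AWS credentials):

name: artifact-handoff-sketch   # hypothetical workflow name, for illustration only
on: workflow_dispatch
jobs:
  build:
    if: ${{ github.repository_owner == 'pytorch' }}
    runs-on: windows.4xlarge
    steps:
      - name: Build (placeholder)
        shell: bash
        run: |
          mkdir -p "${RUNNER_TEMP}/artifacts"
          echo "fake wheel" > "${RUNNER_TEMP}/artifacts/pkg.whl"
      - uses: seemethere/upload-artifact-s3@v4
        if: always()
        with:
          name: wheel-sketch
          retention-days: 14
          if-no-files-found: error
          path: "${{ runner.temp }}/artifacts"
  test:
    needs: build
    runs-on: windows.4xlarge
    steps:
      - uses: seemethere/download-artifact-s3@v3
        with:
          name: wheel-sketch
          path: "${{ runner.temp }}/artifacts"
      - name: Test (placeholder)
        shell: bash
        run: ls "${RUNNER_TEMP}/artifacts"
  upload:
    needs: test
    runs-on: linux.2xlarge
    steps:
      - uses: seemethere/download-artifact-s3@v3
        with:
          name: wheel-sketch
          path: "${{ runner.temp }}/artifacts"
      - name: Upload (placeholder)
        shell: bash
        run: echo "binary_upload.sh would run here against ${RUNNER_TEMP}/artifacts"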
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_7-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + wheel-py3_7-cuda11_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
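Two defensive PowerShell steps run near the top of every Windows job above: one sets the LongPathsEnabled registry value so the deeply nested CUDA build trees do not hit the 260-character path limit (the linked issue #73339 is the motivation given), and one excludes the workspace from Windows Defender real-time scanning as a best-effort speed-up, with -ErrorAction Ignore so a failure cannot break the build. A standalone sketch of the pair on a stock windows-2019 runner:

name: windows-prep-sketch   # hypothetical workflow name, for illustration only
on: workflow_dispatch
jobs:
  prep:
    runs-on: windows-2019
    steps:
      - name: Enable long paths on Windows
        shell: powershell
        run: |
          # NTFS long-path support, so deeply nested build trees do not fail on MAX_PATH
          Set-ItemProperty -Path "HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
      - name: Exclude workspace from Windows Defender scanning
        shell: powershell
        run: |
          # Best-effort speed-up; -ErrorAction Ignore keeps the job going if Defender refuses
          Add-MpPreference -ExclusionPath $(Get-Location).ToString() -ErrorAction Ignore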
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: wheel-py3_7-cuda11_3 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_7-cuda11_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_7-cuda11_3-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_7-cuda11_3 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_7-cuda11_3-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_7-cuda11_3-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_7-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
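The "Upload binaries" step above never runs the upload script on the runner itself: it forwards credentials and routing variables into a pinned miniconda3 container with -e, bind-mounts the downloaded artifacts at /artifacts and the fresh pytorch checkout at /v, and executes binary_upload.sh from there. The sketch below reduces that invocation to its shape; the python:3.10-slim image and the "ls /artifacts" command are placeholders for the real ECR image and script:

name: containerized-upload-sketch   # hypothetical workflow name, for illustration only
on: workflow_dispatch
jobs:
  upload:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Stage a placeholder artifact
        shell: bash
        run: |
          mkdir -p "${RUNNER_TEMP}/artifacts"
          echo "fake package" > "${RUNNER_TEMP}/artifacts/pkg.txt"
      - name: Upload binaries inside a container
        shell: bash
        env:
          PKG_DIR: ${{ runner.temp }}/artifacts
          DRY_RUN: enabled          # flipped to "disabled" only for nightly/tag pushes
          UPLOAD_CHANNEL: nightly
        run: |
          # Pass routing variables through with -e and bind-mount the staged artifacts;
          # "ls /artifacts" stands in for .circleci/scripts/binary_upload.sh here.
          docker run --rm -i \
            -e DRY_RUN \
            -e UPLOAD_CHANNEL \
            -e PKG_DIR=/artifacts \
            -v "${PKG_DIR}:/artifacts" \
            -v "${GITHUB_WORKSPACE}:/v" \
            -w /v \
            python:3.10-slim \
            bash -c 'ls /artifacts'

Re-exporting PKG_DIR as /artifacts inside the container means the script only ever sees the container-side path, regardless of where the runner staged the files on the host.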
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + wheel-py3_7-cuda11_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: wheel-py3_7-cuda11_6 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_7-cuda11_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_7-cuda11_6-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_7-cuda11_6 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_7-cuda11_6-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_7-cuda11_6-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
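Every Windows job above finishes with two if: always() steps: the first blocks for up to 120 minutes until any debugging SSH sessions opened through seemethere/add-github-ssh-key have disconnected, and the second force-kills whatever sessions remain, which matters mostly when a run is cancelled partway through. The sketch below shows that teardown in isolation; it assumes the checked-out repository is pytorch/pytorch, since the two PowerShell scripts live under its .github/scripts directory:

name: ssh-drain-teardown-sketch   # hypothetical workflow name, for illustration only
on: workflow_dispatch
jobs:
  build:
    runs-on: windows-2019
    steps:
      - uses: actions/checkout@v2
        with:
          path: pytorch
      - name: Build (placeholder)
        shell: bash
        run: echo "real build work happens here"
      - name: Wait until all sessions have drained
        shell: powershell
        working-directory: pytorch
        if: always()                # run even if the build failed or the run was cancelled
        timeout-minutes: 120
        run: |
          .github\scripts\wait_for_ssh_to_drain.ps1
      - name: Kill active ssh sessions if still around
        shell: powershell
        working-directory: pytorch
        if: always()
        run: |
          .github\scripts\kill_active_ssh_sessions.ps1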
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_7-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + wheel-py3_8-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
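The "Display EC2 information" step queries the instance metadata service at 169.254.169.254, which is only reachable from inside an EC2 instance. For reference, the same lookup with the token-based IMDSv2 flow (not what the workflow uses; shown only as a hardened variant):

    #!/usr/bin/env bash
    set -euo pipefail
    get_ec2_metadata_v2() {
      # Fetch a short-lived session token, then query the metadata category with it.
      local category=$1 token
      token=$(curl -fsSL -X PUT "http://169.254.169.254/latest/api/token" \
                -H "X-aws-ec2-metadata-token-ttl-seconds: 300")
      curl -fsSL -H "X-aws-ec2-metadata-token: ${token}" \
        "http://169.254.169.254/latest/meta-data/${category}"
    }
    echo "instance-type: $(get_ec2_metadata_v2 instance-type)"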
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: wheel-py3_8-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_8-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_8-cpu-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
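Both checkouts are scrubbed with git clean -fxd before they are reused, which matters on self-hosted runners where a previous build may have left outputs behind. The flags, spelled out:

    # Return a reused checkout to a pristine state:
    #   -f  force removal of untracked files
    #   -d  also remove untracked directories
    #   -x  also remove ignored files (build outputs, caches, *.pyc, ...)
    git -C pytorch clean -fxd
    git -C builder clean -fxd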
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_8-cpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_8-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_8-cpu-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
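The Chown workspace step above wraps docker pull in a small retry() helper that re-runs the command after 1 s and then 2 s, enough to ride out transient registry hiccups. A standalone sketch with the attempt count and back-off made explicit (the limits here are illustrative, not taken from the workflow):

    #!/usr/bin/env bash
    set -euo pipefail

    # Re-run "$@" until it succeeds or max_attempts is exhausted.
    retry() {
      local max_attempts=3 delay=1 attempt=1
      until "$@"; do
        if (( attempt >= max_attempts )); then
          echo "retry: giving up after ${attempt} attempts: $*" >&2
          return 1
        fi
        sleep "${delay}"
        delay=$(( delay * 2 ))
        attempt=$(( attempt + 1 ))
      done
    }

    retry docker pull "${ALPINE_IMAGE}"   # ALPINE_IMAGE is assumed to come from the runner environment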
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_8-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
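In the Upload binaries step the actual upload runs inside a pinned miniconda3 container; the value-less -e NAME flags copy the named variables from the step's environment into the container without putting secret values on the command line, and the artifact directory is bind-mounted at /artifacts. A trimmed sketch of the pattern (the variables must be exported in the calling shell, and the :ro mount is an illustration; the real step mounts it read-write):

    # Forward secrets by name only and hand the packages to the upload script.
    docker run --rm \
      -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e ANACONDA_API_TOKEN \
      -e DRY_RUN -e PACKAGE_TYPE -e UPLOAD_CHANNEL -e UPLOAD_SUBFOLDER \
      -e PKG_DIR=/artifacts \
      -v "${RUNNER_TEMP}/artifacts:/artifacts:ro" \
      -v "${GITHUB_WORKSPACE}:/v" -w /v \
      308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \
      bash -c '.circleci/scripts/binary_upload.sh'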
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + wheel-py3_8-cuda11_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
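Every upload job finishes by stopping leftover containers and pruning images so the shared self-hosted runner starts its next job clean; the || true keeps the step green when no containers are running, and the shellcheck directive acknowledges that $(docker ps -q) is intentionally left unquoted so it expands to one argument per container id. An equivalent form that avoids the unquoted expansion (an alternative, not what the workflow uses):

    # Stop whatever is still running, then drop unused containers, images and networks.
    docker ps -q | xargs --no-run-if-empty docker stop
    docker system prune -af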
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: wheel-py3_8-cuda11_3 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_8-cuda11_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_8-cuda11_3-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_8-cuda11_3 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_8-cuda11_3-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_8-cuda11_3-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
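The recurring Chown workspace step exists because earlier containerized steps can leave root-owned files in the bind-mounted checkout on a self-hosted runner; running a throwaway Alpine container that chowns the directory back to the invoking UID/GID lets the follow-on rm -rf of the workspace succeed without sudo. The core of it:

    # Reclaim ownership of files a previous container wrote as root.
    # ALPINE_IMAGE is provided by the runner environment in the jobs above.
    docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" \
      chown -R "$(id -u):$(id -g)" .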
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_8-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + wheel-py3_8-cuda11_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: wheel-py3_8-cuda11_6 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_8-cuda11_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_8-cuda11_6-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
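The Checkout PyTorch steps pin their ref with the expression github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha, which acts as an if/else (the middle operand is a non-empty SHA, so the usual falsy-fallthrough caveat of && ... || does not bite here). The same choice written out in bash, purely as an illustration (PR_HEAD_SHA stands in for github.event.pull_request.head.sha):

    # Build the PR head commit on pull_request events, the pushed commit otherwise.
    if [[ "${GITHUB_EVENT_NAME}" == "pull_request" ]]; then
      CHECKOUT_REF="${PR_HEAD_SHA}"
    else
      CHECKOUT_REF="${GITHUB_SHA}"
    fi
    git -C pytorch checkout "${CHECKOUT_REF}"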
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_8-cuda11_6 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_8-cuda11_6-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_8-cuda11_6-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_8-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + wheel-py3_9-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
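Each build/test/upload triple below follows the same template and exchanges exactly one artifact whose name encodes the package type, Python version, and accelerator (wheel-py3_9-cpu, wheel-py3_9-cuda11_3, ...). A hypothetical helper that reproduces the naming scheme from the per-job environment shown above; it illustrates the convention only and is not the generator that emits this workflow:

    # e.g. PACKAGE_TYPE=wheel DESIRED_PYTHON=3.9 DESIRED_CUDA=cu113 -> wheel-py3_9-cuda11_3
    artifact_name() {
      local py="${DESIRED_PYTHON//./_}"       # "3.9"   -> "3_9"
      local accel="${DESIRED_CUDA}"           # "cpu", "cu113", "cu116", ...
      if [[ "${accel}" == cu* ]]; then
        local ver="${accel#cu}"               # "113"
        accel="cuda${ver:0:2}_${ver:2}"       # "cuda11_3"
      fi
      echo "${PACKAGE_TYPE}-py${py}-${accel}"
    }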
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: wheel-py3_9-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_9-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_9-cpu-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_9-cpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_9-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_9-cpu-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_9-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + wheel-py3_9-cuda11_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
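The TODO in the env block above notes that DESIRED_CUDA (cpu, cu113, cu116) is a legacy duplicate of GPU_ARCH_TYPE / GPU_ARCH_VERSION (cuda / 11.3, 11.6). The mapping the comment implies, sketched in bash (the workflow itself simply hard-codes both values):

    # Illustrative only: GPU_ARCH_TYPE=cuda GPU_ARCH_VERSION=11.3 -> DESIRED_CUDA=cu113
    if [[ "${GPU_ARCH_TYPE}" == "cuda" ]]; then
      DESIRED_CUDA="cu${GPU_ARCH_VERSION//./}"
    else
      DESIRED_CUDA="cpu"
    fi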
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: wheel-py3_9-cuda11_3 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_9-cuda11_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_9-cuda11_3-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_9-cuda11_3 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_9-cuda11_3-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_9-cuda11_3-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_9-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + wheel-py3_9-cuda11_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: wheel-py3_9-cuda11_6 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_9-cuda11_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_9-cuda11_6-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_9-cuda11_6 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_9-cuda11_6-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_9-cuda11_6-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
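The `Chown workspace` steps above define a tiny inline `retry` helper before pulling the Alpine image. As a reference, the same idiom can be written as a reusable function; the sketch below is illustrative only, with the attempt count and 1s/2s delays simply mirroring the inline one-liner above.

    #!/usr/bin/env bash
    # Generic form of the inline retry helper used by the upload jobs above (illustrative).
    set -euo pipefail

    retry() {
      # Up to three attempts, sleeping 1s then 2s between them, mirroring:
      #   "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
      local attempt
      for attempt in 1 2 3; do
        "$@" && return 0
        [ "$attempt" -lt 3 ] && sleep "$attempt"
      done
      return 1
    }

    # Example: pull the helper image used by the chown step (image name taken from the workflow env).
    retry docker pull "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"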
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_9-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + wheel-py3_10-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: wheel-py3_10-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_10-cpu-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_10-cpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_10-cpu-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
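The `Populate binary env` steps above rely on the GitHub Actions convention that any `KEY=value` line appended to the file named by `$GITHUB_ENV` becomes an environment variable for the subsequent steps of the same job. The sketch below shows that mechanism in isolation; the fallback scratch file and the `cat` at the end are assumptions for running it outside of Actions.

    #!/usr/bin/env bash
    # Illustrates the $GITHUB_ENV mechanism used by the "Populate binary env" steps above.
    set -euo pipefail

    # Outside of Actions there is no GITHUB_ENV, so fall back to a scratch file for demonstration.
    GITHUB_ENV="${GITHUB_ENV:-/tmp/github_env_demo}"
    RUNNER_TEMP="${RUNNER_TEMP:-/tmp}"

    # Each appended KEY=value line is exported into the environment of later steps in the job.
    echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
    echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"

    cat "${GITHUB_ENV}"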
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_10-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
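The `Upload binaries` step just above passes secrets into the upload container with `docker run -e NAME` (no `=value`), so the values are copied from the job environment rather than appearing in the command line. A minimal illustration of that forwarding follows; the variable name and the public `alpine:3.16` image are throwaway stand-ins, not part of the workflow.

    #!/usr/bin/env bash
    # Demonstrates the "-e NAME" environment forwarding used by the Upload binaries steps (illustrative).
    set -euo pipefail

    export EXAMPLE_TOKEN="not-a-real-secret"   # hypothetical stand-in for ANACONDA_API_TOKEN etc.

    # With no value after -e, docker copies EXAMPLE_TOKEN from this shell's environment
    # into the container; the secret never appears in `docker run`'s argv.
    docker run --rm -e EXAMPLE_TOKEN alpine:3.16 sh -c 'echo "token length: ${#EXAMPLE_TOKEN}"'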
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + wheel-py3_10-cuda11_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: wheel-py3_10-cuda11_3 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-cuda11_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_10-cuda11_3-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_10-cuda11_3 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-cuda11_3-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_10-cuda11_3-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_10-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + wheel-py3_10-cuda11_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: wheel-py3_10-cuda11_6 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-cuda11_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_10-cuda11_6-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_10-cuda11_6 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-cuda11_6-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_10-cuda11_6-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_10-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
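The `Kill containers, clean up images` step that closes each upload job (it appears at the start of several hunks above and again just below) stops any leftover containers and prunes images so the self-hosted runner starts the next job clean. A standalone version of that cleanup is sketched here for reference.

    #!/usr/bin/env bash
    # Standalone form of the "Kill containers, clean up images" runner cleanup (illustrative).
    set -euo pipefail

    # "docker ps -q" may print nothing; word-splitting of the unquoted expansion is intentional,
    # which is why the workflow silences shellcheck SC2046 for the same command.
    # shellcheck disable=SC2046
    docker stop $(docker ps -q) || true

    # Remove all unused images, containers, networks and build cache.
    docker system prune -af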
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-windows-binary-wheel.yml b/.github/workflows/generated-windows-binary-wheel.yml deleted file mode 100644 index afce9a010bb8..000000000000 --- a/.github/workflows/generated-windows-binary-wheel.yml +++ /dev/null @@ -1,4426 +0,0 @@ -# @generated DO NOT EDIT MANUALLY - -# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: windows-binary-wheel - -on: - push: - # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build - branches: - - nightly - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - - 'ciflow/binaries/*' - - 'ciflow/binaries_wheel/*' - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - ANACONDA_USER: pytorch - AWS_DEFAULT_REGION: us-east-1 - BUILD_ENVIRONMENT: windows-binary-wheel - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - IN_CI: 1 - IS_GHA: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - PR_NUMBER: ${{ github.event.pull_request.number }} - PYTORCH_RETRY_TEST_CASES: 1 - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 -concurrency: - group: windows-binary-wheel-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - wheel-py3_7-cpu-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: wheel-py3_7-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_7-cpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_7-cpu-build - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_7-cpu - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_7-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_7-cpu-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
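The `Log in to ECR` step in the deleted workflow above pipes a short-lived password from the AWS CLI into `docker login` over stdin. A condensed sketch of that flow follows; it resolves the account id with the CLI's `--query` flag instead of the `grep | cut` parsing used in the step, and omits the retry wrapper for brevity.

    #!/usr/bin/env bash
    # Condensed version of the "Log in to ECR" step shown above (illustrative).
    set -euo pipefail

    AWS_DEFAULT_REGION="${AWS_DEFAULT_REGION:-us-east-1}"
    # Resolve the numeric account id of the current credentials.
    AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"

    # get-login-password prints a temporary token; docker login reads it on stdin.
    aws ecr get-login-password --region "$AWS_DEFAULT_REGION" \
      | docker login --username AWS --password-stdin \
          "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com"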
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_7-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_7-cuda11_1-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: wheel-py3_7-cuda11_1 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_7-cuda11_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_7-cuda11_1-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from 
instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_7-cuda11_1 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_7-cuda11_1-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_7-cuda11_1-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown 
workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_7-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
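The upload jobs above wrap their flaky network calls (the ECR login and the docker pull) in a small inline retry helper. A minimal standalone sketch of the same back-off pattern, assuming a placeholder image tag in place of the workflow's ${ALPINE_IMAGE}:

    #!/usr/bin/env bash
    set -euo pipefail

    # Retry a command up to three times with a short, growing back-off,
    # mirroring the inline helper defined in the steps above.
    retry () {
      "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
    }

    # Usage: wrap any command that can fail transiently.
    retry docker pull alpine:3.14   # placeholder image; the job pulls "${ALPINE_IMAGE}"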
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_7-cuda11_3-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: wheel-py3_7-cuda11_3 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_7-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_7-cuda11_3-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from 
instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_7-cuda11_3 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_7-cuda11_3-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_7-cuda11_3-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown 
workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_7-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
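The paired "Set DRY_RUN" and "Set UPLOAD_CHANNEL" steps in every upload job apply the same ref-matching rules. A condensed sketch of that decision logic as plain bash, assuming workflow-level defaults of DRY_RUN=enabled and UPLOAD_CHANNEL=nightly (those defaults are not visible in this hunk) and omitting the event_name == push guard:

    #!/usr/bin/env bash
    set -euo pipefail

    # Real uploads happen only for pushes to the nightly branch or to release
    # tags; ciflow/ trigger tags stay in dry-run mode.
    ref="${GITHUB_REF:-refs/heads/main}"        # default only for local experiments
    ref_name="${GITHUB_REF_NAME:-${ref##*/}}"

    DRY_RUN=enabled
    UPLOAD_CHANNEL=nightly
    if [[ "${ref}" == refs/heads/nightly ]] ||
       { [[ "${ref}" == refs/tags/* ]] && [[ "${ref}" != refs/tags/ciflow/* ]]; }; then
      DRY_RUN=disabled
    fi
    # Release-candidate tags (*-rc<N>) are routed to the "test" channel.
    if [[ "${ref_name}" == *-rc[0-9]* ]]; then
      UPLOAD_CHANNEL=test
    fi
    echo "DRY_RUN=${DRY_RUN} UPLOAD_CHANNEL=${UPLOAD_CHANNEL}"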
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_7-cuda11_5-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: wheel-py3_7-cuda11_5 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_7-cuda11_5-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_7-cuda11_5-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from 
instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_7-cuda11_5 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_7-cuda11_5-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_7-cuda11_5-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown 
workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_7-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
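The "Upload binaries" step hands credentials and packaging parameters to a one-shot container purely through -e flags, which is why the comment above expects them to be blank on pull_request events: nothing secret is baked into the image or left in the workspace. A stripped-down sketch of that pattern; the UPLOAD_IMAGE variable is a placeholder for the pinned ECR miniconda image the workflow uses:

    #!/usr/bin/env bash
    set -euo pipefail

    # Forward only the variables the upload script needs, mount the downloaded
    # artifacts at /artifacts, and run the script from the repo checkout at /v.
    docker run --rm -i \
      -e ANACONDA_API_TOKEN \
      -e AWS_ACCESS_KEY_ID \
      -e AWS_SECRET_ACCESS_KEY \
      -e DRY_RUN \
      -e PACKAGE_TYPE \
      -e PKG_DIR=/artifacts \
      -e UPLOAD_CHANNEL \
      -e UPLOAD_SUBFOLDER \
      -v "${RUNNER_TEMP}/artifacts:/artifacts" \
      -v "${GITHUB_WORKSPACE}:/v" \
      -w /v \
      "${UPLOAD_IMAGE:-registry.example.com/miniconda3:latest}" \
      bash -c '.circleci/scripts/binary_upload.sh'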
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_8-cpu-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: wheel-py3_8-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_8-cpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_8-cpu-build - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see 
https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_8-cpu - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_8-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_8-cpu-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - 
retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_8-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
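Each "Populate binary env" step persists paths for later steps by appending KEY=value lines to the file named by $GITHUB_ENV. As written in these jobs, the WIN_PACKAGE_WORK_DIR line appears to only echo to stdout (it has no >> "${GITHUB_ENV}" redirect), so it would not reach subsequent steps; a small sketch of the presumably intended pattern:

    #!/usr/bin/env bash
    set -euo pipefail

    # Lines appended to the $GITHUB_ENV file become environment variables for
    # every later step in the same job.
    {
      echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env"
      echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts"
      echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
    } >> "${GITHUB_ENV}"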
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_8-cuda11_1-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: wheel-py3_8-cuda11_1 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_8-cuda11_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_8-cuda11_1-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from 
instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_8-cuda11_1 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_8-cuda11_1-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_8-cuda11_1-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown 
workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_8-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
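The "Log in to ECR" step recovers the account ID by grepping the CLI's pretty-printed JSON. An equivalent sketch that leans on the CLI's own --query/--output flags instead of grep and cut; this is an editorial variant, not what the workflow runs:

    #!/usr/bin/env bash
    set -euo pipefail

    # Ask the CLI for just the Account field as plain text.
    AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"

    # Same login flow as the workflow step, minus the retry wrapper.
    aws ecr get-login-password --region "${AWS_DEFAULT_REGION}" |
      docker login --username AWS --password-stdin \
        "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com"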
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_8-cuda11_3-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: wheel-py3_8-cuda11_3 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_8-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_8-cuda11_3-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from 
instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_8-cuda11_3 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_8-cuda11_3-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_8-cuda11_3-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown 
workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_8-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
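The recurring "Kill containers, clean up images" step leaves $(docker ps -q) unquoted on purpose and silences SC2046, relying on "|| true" when the container list is empty. A sketch of an equivalent cleanup that handles the empty case explicitly instead; again a suggestion, not the workflow's own script:

    #!/usr/bin/env bash
    set -euo pipefail

    # Stop containers only if any are running, then prune unused images.
    running="$(docker ps -q)"
    if [[ -n "${running}" ]]; then
      # shellcheck disable=SC2086  # splitting the ID list into words is intended
      docker stop ${running} || true
    fi
    docker system prune -af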
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_8-cuda11_5-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: wheel-py3_8-cuda11_5 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_8-cuda11_5-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_8-cuda11_5-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from 
instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_8-cuda11_5 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_8-cuda11_5-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_8-cuda11_5-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown 
workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_8-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
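# --- Editor's annotation (not part of the diffed workflow) ---
# The jobs in this hunk follow one repeated triple per wheel configuration:
#   wheel-<py>-<arch>-build  -> builds the wheel on windows.4xlarge and stores it as an
#                               S3-backed artifact (seemethere/upload-artifact-s3)
#   wheel-<py>-<arch>-test   -> downloads that artifact and runs binary_windows_test.sh,
#                               on windows.8xlarge.nvidia.gpu for CUDA configs and
#                               windows.4xlarge for CPU configs
#   wheel-<py>-<arch>-upload -> runs on linux.2xlarge and pushes the artifact via
#                               .circleci/scripts/binary_upload.sh inside a miniconda3 container
# Only the env block (DESIRED_CUDA, GPU_ARCH_VERSION, DESIRED_PYTHON, ...) changes between
# copies; the remaining steps are identical generated boilerplate.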
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_9-cpu-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: wheel-py3_9-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_9-cpu-build - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see 
https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_9-cpu - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_9-cpu-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - 
retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_9-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
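# --- Editor's annotation (illustrative sketch, not part of the diffed workflow) ---
# The upload jobs inline the same fixed-backoff helper twice per job:
#   retry () { "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") }
# i.e. up to three attempts with 1 s and 2 s pauses. The generalized form below is only a
# sketch of that pattern; the function name and the RETRY_ATTEMPTS knob are the editor's
# assumptions, not variables the workflow defines.
retry_with_backoff() {
  local attempts=${RETRY_ATTEMPTS:-3}  # assumed knob; the workflow hard-codes 3 tries
  local delay=1
  local i
  for ((i = 1; i <= attempts; i++)); do
    "$@" && return 0                   # success: stop retrying
    (( i == attempts )) && return 1    # out of attempts: report failure
    sleep "${delay}"
    delay=$((delay * 2))               # 1 s, 2 s, 4 s, ...
  done
}
# e.g. retry_with_backoff docker pull "${ALPINE_IMAGE}"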
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_9-cuda11_1-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: wheel-py3_9-cuda11_1 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cuda11_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_9-cuda11_1-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from 
instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_9-cuda11_1 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cuda11_1-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_9-cuda11_1-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown 
workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_9-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
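# --- Editor's annotation (not part of the diffed workflow) ---
# In the "Populate binary env" steps of the build and test jobs above, only the lines
# redirected with >> "${GITHUB_ENV}" (BINARY_ENV_FILE, PYTORCH_FINAL_PACKAGE_DIR) are
# exported to later steps of the job; GitHub Actions reads that file between steps.
# A plain
#   echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
# only writes to the step log, so if WIN_PACKAGE_WORK_DIR is meant to persist it would
# also need the >> "${GITHUB_ENV}" redirection (whether that is intended cannot be told
# from the diff alone, so the hunk is left as-is).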
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_9-cuda11_3-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: wheel-py3_9-cuda11_3 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_9-cuda11_3-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from 
instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_9-cuda11_3 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cuda11_3-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_9-cuda11_3-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown 
workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_9-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_9-cuda11_5-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: wheel-py3_9-cuda11_5 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cuda11_5-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_9-cuda11_5-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from 
instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_9-cuda11_5 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cuda11_5-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_9-cuda11_5-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown 
workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_9-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
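# --- Editor's annotation (illustrative sketch, not part of the diffed workflow) ---
# The "Set DRY_RUN" / "Set UPLOAD_CHANNEL" steps above only fire on push events: a push to
# refs/heads/nightly or to a non-ciflow tag turns DRY_RUN off, and a non-ciflow tag whose
# name ends in -rc<N> routes the upload to the "test" channel. The same decision, condensed
# into one shell function for readability (the function name is the editor's; defaults for
# both variables come from elsewhere in the workflow and are not shown):
resolve_upload_mode() {
  local ref=${GITHUB_REF:?}            # e.g. refs/heads/nightly, refs/tags/v1.11.0-rc3
  local ref_name=${GITHUB_REF_NAME:?}  # e.g. nightly, v1.11.0-rc3
  local tagged=false
  if [[ ${ref} == refs/tags/* && ${ref} != refs/tags/ciflow/* ]]; then
    tagged=true
  fi
  if [[ ${ref} == refs/heads/nightly || ${tagged} == true ]]; then
    echo "DRY_RUN=disabled"
  fi
  if [[ ${tagged} == true && ${ref_name} == *-rc[0-9]* ]]; then
    echo "UPLOAD_CHANNEL=test"
  fi
}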
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_10-cpu-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: wheel-py3_10-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cpu-build - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see 
https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_10-cpu - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cpu-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - 
retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_10-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_10-cuda11_1-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: wheel-py3_10-cuda11_1 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda11_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cuda11_1-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled 
from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_10-cuda11_1 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda11_1-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cuda11_1-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: 
Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_10-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
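The ECR-login and workspace-chown steps above both define the same inline `retry` helper, which re-runs a flaky command up to three times with a short back-off before giving up. A minimal sketch of that pattern as a self-contained step is shown below; the step name and the pulled image are illustrative assumptions, not part of the workflow being diffed.

      - name: Pull helper image with retries (illustrative sketch)
        shell: bash
        run: |
          set -euo pipefail
          # Run the command; on failure retry after 1s, then once more after 2s.
          retry () {
            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
          }
          # ALPINE_IMAGE is assumed to be provided at the workflow level, as in the jobs above.
          retry docker pull "${ALPINE_IMAGE}"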
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_10-cuda11_3-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: wheel-py3_10-cuda11_3 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cuda11_3-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled 
from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_10-cuda11_3 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda11_3-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cuda11_3-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: 
Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_10-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
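The "Set DRY_RUN" and "Set UPLOAD_CHANNEL" steps above control whether, and where, the upload step really publishes: only pushes to the nightly branch or to non-ciflow tags disable the dry run, and tags carrying an -rcN suffix are redirected to the test channel. A rough sketch of the same decisions collapsed into one illustrative bash step (the step itself is not in the workflow above) would be:

      - name: Decide upload mode (illustrative sketch)
        shell: bash
        run: |
          # Mirrors the `if:` expressions above: real uploads only for pushes to
          # the nightly branch or to tags that are not ciflow/* tags.
          if [[ "${GITHUB_EVENT_NAME}" == "push" ]]; then
            if [[ "${GITHUB_REF}" == "refs/heads/nightly" ]] || { [[ "${GITHUB_REF}" == refs/tags/* ]] && [[ "${GITHUB_REF}" != refs/tags/ciflow/* ]]; }; then
              echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
            fi
          fi
          # Release candidates (e.g. a tag named like v1.x.y-rc3) go to the test channel.
          if [[ "${GITHUB_REF_NAME}" = *-rc[0-9]* ]]; then
            echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV"
          fi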
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_10-cuda11_5-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: wheel-py3_10-cuda11_5 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda11_5-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cuda11_5-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled 
from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_10-cuda11_5 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda11_5-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cuda11_5-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: 
Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_10-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
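Each wheel-py3_10-cuda* group above is the same three-stage chain: the build job publishes the wheel to S3 under a shared artifact name, the test job downloads it onto a GPU runner, and the upload job (gated on `github.repository_owner == 'pytorch'`) publishes it only after the test job succeeds. A stripped-down sketch of that skeleton, with placeholder job and artifact names, looks roughly like this:

  example-wheel-build:
    runs-on: windows.4xlarge
    steps:
      # ... build the wheel, then publish it under a shared artifact name
      - uses: seemethere/upload-artifact-s3@v3
        with:
          name: example-wheel-artifact
          path: "${{ runner.temp }}/artifacts"

  example-wheel-test:
    if: ${{ github.repository_owner == 'pytorch' }}
    needs: example-wheel-build              # only runs once the build succeeded
    runs-on: windows.8xlarge.nvidia.gpu
    steps:
      - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
        with:
          name: example-wheel-artifact
          path: "${{ runner.temp }}/artifacts"
      # ... install the downloaded wheel and run smoke tests

  example-wheel-upload:
    if: ${{ github.repository_owner == 'pytorch' }}
    needs: example-wheel-test               # upload only after tests pass
    runs-on: linux.2xlarge
    steps:
      - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
        with:
          name: example-wheel-artifact
          path: "${{ runner.temp }}/artifacts"
      # ... push the wheel to its release channel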
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index d98a81da5e9b..6876a2bfc36f 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -1,22 +1,79 @@ name: Lint on: + pull_request: push: branches: - master - pull_request: + - main + - release/* + workflow_dispatch: jobs: + lintrunner: + runs-on: linux.20_04.16x + steps: + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: 3.8 + architecture: x64 + + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + with: + submodules: false + + - name: Install lintrunner + run: pip install lintrunner==0.9.* + + - name: Initialize lint dependencies + run: lintrunner init + + - name: Do build steps necessary for linters + run: | + python3 -m tools.linter.clang_tidy.generate_build_files + python3 -m tools.generate_torch_version --is_debug=false + python3 -m tools.pyi.gen_pyi \ + --native-functions-path aten/src/ATen/native/native_functions.yaml \ + --tags-path aten/src/ATen/native/tags.yaml \ + --deprecated-functions-path "tools/autograd/deprecated.yaml" + + - name: Run lintrunner on all files + run: | + set +e + if ! lintrunner --verbose --force-color --all-files --tee-json=lint.json; then + echo "" + echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner\`.\e[0m" + echo -e "\e[1m\e[36mSee https://github.com/pytorch/pytorch/wiki/lintrunner for setup instructions.\e[0m" + exit 1 + fi + + - name: Store annotations + if: always() && github.event_name == 'pull_request' + # Don't show this as an error; the above step will have already failed. + continue-on-error: true + run: | + # Use jq to massage the JSON lint output into GitHub Actions workflow commands. + jq --raw-output \ + '"::\(if .severity == "advice" or .severity == "disabled" then "warning" else .severity end) file=\(.path),line=\(.line),col=\(.char),title=\(.code) \(.name)::" + (.description | gsub("\\n"; "%0A"))' \ + lint.json + quick-checks: - runs-on: ubuntu-18.04 + name: quick-checks + runs-on: linux.20_04.4x steps: - name: Setup Python uses: actions/setup-python@v2 with: python-version: 3.x architecture: x64 + # [see note: pytorch repo ref] - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + with: + submodules: false + fetch-depth: 1 - name: Clean PyTorch checkout run: | # Remove any artifacts from the previous checkouts @@ -24,70 +81,16 @@ jobs: - name: Install requirements id: requirements run: pip3 install -r requirements.txt --user - - name: Ensure consistent CircleCI YAML config - if: ${{ always() && steps.requirements.outcome == 'success' }} - run: cd .circleci && ./ensure-consistency.py - - name: Lint native_functions.yaml - if: ${{ always() && steps.requirements.outcome == 'success' }} - run: | - pip3 install ruamel.yaml==0.17.4 --user - .github/scripts/lint_native_functions.py - - name: Ensure correct trailing newlines - if: ${{ always() && steps.requirements.outcome == 'success' }} - run: | - (! git --no-pager grep -Il '' -- . 
':(exclude)**/contrib/**' ':(exclude)third_party' ':(exclude)**.expect' ':(exclude)**.ipynb' ':(exclude)tools/clang_format_hash' | tools/linter/trailing_newlines.py || (echo "The above files do not have correct trailing newlines; please normalize them"; false)) - - name: Ensure no trailing spaces - if: always() - run: | - (! git --no-pager grep -In '[[:blank:]]$' -- . ':(exclude)**/contrib/**' ':(exclude)**.diff' ':(exclude)third_party' || (echo "The above lines have trailing spaces; please remove them"; false)) - - name: Ensure no tabs - if: always() - run: | - (! git --no-pager grep -In $'\t' -- . ':(exclude)*.svg' ':(exclude)**Makefile' ':(exclude)**/contrib/**' ':(exclude)third_party' ':(exclude).gitattributes' ':(exclude).gitmodules' || (echo "The above lines have tabs; please convert them to spaces"; false)) - name: Ensure no non-breaking spaces if: always() run: | # NB: We use 'printf' below rather than '\u000a' since bash pre-4.2 # does not support the '\u000a' syntax (which is relevant for local linters) (! git --no-pager grep -In "$(printf '\xC2\xA0')" -- . || (echo "The above lines have non-breaking spaces (U+00A0); please convert them to spaces (U+0020)"; false)) - - name: Ensure canonical include - if: always() - run: | - (! git --no-pager grep -In $'#include "' -- ./c10 ./aten ./torch/csrc ':(exclude)aten/src/ATen/native/quantized/cpu/qnnpack/**' ':(exclude)torch/csrc/jit/serialization/mobile_bytecode_generated.h'|| (echo "The above lines have include with quotes; please convert them to #include "; false)) - name: Ensure no versionless Python shebangs if: always() run: | (! git --no-pager grep -In '#!.*python$' -- . || (echo "The above lines have versionless Python shebangs; please specify either python2 or python3"; false)) - - name: Ensure no unqualified noqa - if: always() - run: | - # shellcheck disable=SC2016 - (! git --no-pager grep -InP '# noqa(?!: [A-Z]+\d{3})' -- '**.py' '**.pyi' ':(exclude)caffe2' || (echo 'The above lines have unqualified `noqa`; please convert them to `noqa: XXXX`'; false)) - - name: Ensure no unqualified type ignore - if: always() - run: | - # shellcheck disable=SC2016 - (! git --no-pager grep -InP '# type:\s*ignore(?!\[)' -- '**.py' '**.pyi' ':(exclude)test/test_jit.py' || (echo 'The above lines have unqualified `type: ignore`; please convert them to `type: ignore[xxxx]`'; false)) - - name: Ensure GitHub PyPi dependencies are pinned - if: always() - run: | - (! git --no-pager grep --color=always -InP \ - '(pip|pip3|python -m pip|python3 -m pip|python3 -mpip|python -mpip) install ([a-z][\.a-z-0-9]*+(?!(=|.*\.whl))([[:blank:]]|))+' \ - -- .github \ - ':(exclude)**.rst' \ - ':(exclude)**.py' \ - ':(exclude)**.md' \ - ':(exclude)**.diff' \ - ':(exclude)third_party' || - (echo "The above lines have unpinned PyPi installs; please pin them to a specific version: e.g. 'thepackage==1.2'"; false)) - # note that this next step depends on a clean checkout; - # if you run it locally then it will likely to complain - # about all the generated files in torch/test - - name: Ensure C++ source files are not executable - if: always() - run: | - # shellcheck disable=SC2016 - (! find . \( -path ./third_party -o -path ./.git -o -path ./torch/bin -o -path ./build \) -prune -o -type f -executable -regextype posix-egrep -not -regex '.+(\.(bash|sh|py|so)|git-pre-commit|git-clang-format|gradlew)$' -print | grep . 
|| (echo 'The above files have executable permission; please remove their executable permission by using `chmod -x`'; false)) - name: C++ docs check if: ${{ always() && steps.requirements.outcome == 'success' }} run: | @@ -98,89 +101,22 @@ jobs: run: | set -eux python torch/testing/_check_kernel_launches.py |& tee "${GITHUB_WORKSPACE}"/cuda_kernel_launch_checks.txt - - name: Ensure no direct cub include - if: always() - run: | - (! git --no-pager grep -I -no $'#include commit-sha.txt - - name: Install dependencies - run: | - set -eux - pip3 install typing-extensions==3.10 --user # for tools/linter/translate_annotations.py - pip3 install -r requirements-flake8.txt --user - flake8 --version - - name: Run flake8 - run: | - set -eux - flake8 | tee "${GITHUB_WORKSPACE}"/flake8-output.txt - - name: Translate annotations - if: ${{ github.event_name == 'pull_request' }} - env: - HEAD_SHA: ${{ github.event.pull_request.head.sha }} - run: | - tools/linter/translate_annotations.py \ - --file="${GITHUB_WORKSPACE}"/flake8-output.txt \ - --regex='^(?P.*?):(?P\d+):(?P\d+): (?P\w+\d+) (?P.*)' \ - --commit="$HEAD_SHA" \ - > flake8-output/annotations.json - - name: Fail if there were any warnings - run: | - set -eu - # Re-output flake8 status so GitHub logs show it on the step that actually failed - cat "${GITHUB_WORKSPACE}"/flake8-output.txt - if [ -s "${GITHUB_WORKSPACE}"/flake8-output.txt ]; then - echo 'Please fix the above Flake8 warnings.' - false - fi - - name: Add annotations - # Don't run on forked pull requests - if: ${{ failure() && github.event.pull_request.head.repo.full_name == github.repository }} - uses: pytorch/add-annotations-github-action@master - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - check_name: 'flake8-py3' - linter_output_path: flake8-output/annotations.json - commit_sha: ${{ github.event.pull_request.head.sha }} - mode: json - - clang-tidy: - runs-on: linux.2xlarge - container: - # ubuntu20.04-cuda11.2-py3.8-tidy11 - image: ghcr.io/pytorch/cilint-clang-tidy:d8f0c777964d0dd8a147360de80aed1a13eb613a - steps: - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" + # [see note: pytorch repo ref] + # deep clone (fetch-depth 0) required, to allow us to use git log - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master with: - fetch-depth: 0 # to allow tools/linter/clang_tidy.py to do its thing - - name: Prepare output dir with HEAD commit SHA - env: - HEAD_SHA: ${{ github.event.pull_request.head.sha }} - run: | - cd "${GITHUB_WORKSPACE}" - mkdir clang-tidy-output - cd clang-tidy-output - echo "$HEAD_SHA" > commit-sha.txt - - name: Fetch PR diff - if: ${{ github.event_name == 'pull_request' }} - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - run: | - cd "${GITHUB_WORKSPACE}" - wget -O pr.diff "https://patch-diff.githubusercontent.com/raw/pytorch/pytorch/pull/$PR_NUMBER.diff" - - name: Generate build files - run: | - cd "${GITHUB_WORKSPACE}" - python3 -m tools.linter.clang_tidy.generate_build_files - - name: Run PR clang-tidy - if: ${{ github.event_name == 'pull_request' }} - run: | - cd "${GITHUB_WORKSPACE}" - - # The Docker image has our custom build, so we don't need to install it - python3 -m tools.linter.clang_tidy \ - --clang-tidy-exe "$(which clang-tidy)" \ - --diff-file pr.diff \ - --disable-progress-bar 2>&1 | tee "${GITHUB_WORKSPACE}"/clang-tidy-output.txt - - # Run clang-tidy on a smaller subset of the 
codebase on master until we - # make the repository clang-tidy clean - - name: Run master clang-tidy - run: | - cd "${GITHUB_WORKSPACE}" - - python3 -m tools.linter.clang_tidy \ - --paths \ - torch/csrc/fx \ - torch/csrc/utils \ - torch/csrc/generic \ - torch/csrc/deploy \ - torch/csrc/tensor \ - --clang-tidy-exe "$(which clang-tidy)" \ - --disable-progress-bar 2>&1 | tee -a "${GITHUB_WORKSPACE}"/clang-tidy-output.txt - - - name: Annotate output - if: ${{ github.event_name == 'pull_request' }} - env: - HEAD_SHA: ${{ github.event.pull_request.head.sha }} - run: | - cd "${GITHUB_WORKSPACE}" - sed --in-place 's/^\.\.\///g' clang-tidy-output.txt - tools/linter/translate_annotations.py \ - --file=clang-tidy-output.txt \ - --regex='^(?P.*?):(?P\d+):(?P\d+): (?P.*?) \[(?P.*)\]' \ - --commit="$HEAD_SHA" \ - > clang-tidy-output/annotations.json - - name: Check for warnings - run: | - cd "${GITHUB_WORKSPACE}" - set -eu - cat "${GITHUB_WORKSPACE}"/clang-tidy-output.txt - if grep -Fq "Warnings detected!" "${GITHUB_WORKSPACE}"/clang-tidy-output.txt; then - echo 'Please fix the above clang-tidy warnings.' - false - fi - - name: Add annotations - # Don't run on forked pull requests - if: ${{ failure() && github.event.pull_request.head.repo.full_name == github.repository }} - uses: pytorch/add-annotations-github-action@master - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - check_name: 'clang-tidy' - linter_output_path: clang-tidy/annotations.json - commit_sha: ${{ github.event.pull_request.head.sha }} - mode: json - - cmakelint: - runs-on: ubuntu-18.04 - steps: - - name: Setup Python - uses: actions/setup-python@v2 - with: - python-version: 3.x - architecture: x64 - - name: Fetch PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + submodules: false - name: Install dependencies + # mypy and boto3 versions copied from + # .circleci/docker/common/install_conda.sh run: | set -eux - pip3 install cmakelint==1.4.1 --user - cmakelint --version - - name: Run cmakelint - run: | - set -eux - git ls-files -z -- bootstrap '*.cmake' '*.cmake.in' '*CMakeLists.txt' | \ - grep -E -z -v '^(cmake/Modules/|cmake/Modules_CUDA_fix/|cmake/Caffe2Config.cmake.in|aten/src/ATen/ATenConfig.cmake.in|cmake/Caffe2ConfigVersion.cmake.in|cmake/TorchConfig.cmake.in|cmake/TorchConfigVersion.cmake.in|cmake/cmake_uninstall.cmake.in)' | \ - xargs -0 cmakelint --config=.cmakelintrc --spaces=2 --quiet - - mypy: - runs-on: ubuntu-18.04 + python3 -mpip install -r requirements.txt + python3 -mpip install boto3==1.16.34 + pip3 install typing-extensions==3.10 --user + pip3 install -r requirements-flake8.txt --user + python3 -mpip install -r requirements.txt --user + python3 -mpip install mypy==0.812 --user + make setup_lint + - name: Test tools + run: | + python3 -m unittest discover -vs tools/test -p 'test_*.py' + python3 -m unittest discover -vs .github/scripts -p 'test_*.py' + + test_collect_env: + if: ${{ github.repository == 'pytorch/pytorch' }} + name: Test collect_env + runs-on: linux.20_04.4x + strategy: + matrix: + with_torch: [with_torch, without_torch] steps: - name: Setup Python uses: actions/setup-python@v2 with: python-version: 3.8 architecture: x64 - - name: Fetch PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Install dependencies - run: | - set -eux - python3 -mpip install -r requirements.txt --user - python3 -mpip install numpy==1.20 --user # https://github.com/pytorch/pytorch/pull/60472 - python3 -mpip install expecttest==0.1.3 mypy==0.812 --user - # 
Needed to check tools/render_junit.py - python3 -mpip install junitparser==2.1.1 rich==10.9.0 --user - - name: Run autogen + # [see note: pytorch repo ref] + # deep clone (fetch-depth 0) required, to allow us to use git log + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + with: + submodules: false + fetch-depth: 1 + - name: Install torch + if: matrix.with_torch == 'with_torch' run: | - set -eux - time python3 -mtools.generate_torch_version --is_debug=false - time python3 -mtools.codegen.gen -s aten/src/ATen -d build/aten/src/ATen - time python3 -mtools.pyi.gen_pyi --native-functions-path aten/src/ATen/native/native_functions.yaml --deprecated-functions-path "tools/autograd/deprecated.yaml" - - name: Run mypy - env: - MYPY_FORCE_COLOR: 1 - TERM: xterm-color + # Doesn't really matter what torch version, we just need ANY torch installed + pip install 'torch==1.*' + - name: Run collect_env.py run: | - set -eux - STATUS= - for CONFIG in mypy*.ini; do - if ! python3 -mmypy --config="$CONFIG"; then - STATUS=fail - fi - done - if [ -n "$STATUS" ]; then - echo 'Please fix the above mypy warnings.' - false - fi + # All we need to see is that it passes + python3 torch/utils/collect_env.py concurrency: - group: lint-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml new file mode 100644 index 000000000000..3322b2097a17 --- /dev/null +++ b/.github/workflows/nightly.yml @@ -0,0 +1,33 @@ +name: nightly + +on: + schedule: + - cron: 0 0 * * * + push: + tags: + - ciflow/nightly/* + workflow_dispatch: + + +concurrency: + group: ${{ github.workflow }}--${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + docs-build: + name: docs build + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-xenial-py3.7-gcc5.4 + docker-image-name: pytorch-linux-xenial-py3.7-gcc5.4 + + docs-push: + name: docs push + uses: ./.github/workflows/_docs.yml + needs: docs-build + with: + build-environment: linux-xenial-py3.7-gcc5.4 + docker-image: ${{ needs.docs-build.outputs.docker-image }} + push: true + secrets: + GH_PYTORCHBOT_TOKEN: ${{ secrets.GH_PYTORCHBOT_TOKEN }} diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml new file mode 100644 index 000000000000..ad3908e5f5cd --- /dev/null +++ b/.github/workflows/periodic.yml @@ -0,0 +1,202 @@ +name: periodic + +on: + schedule: + - cron: 45 0,4,8,12,16,20 * * * + push: + tags: + - ciflow/periodic/* + - ciflow/all/* + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}--${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + linux-bionic-cuda11_6-py3_7-gcc7-build: + name: linux-bionic-cuda11.6-py3.7-gcc7 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-bionic-cuda11.6-py3.7-gcc7 + docker-image-name: pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7 + + linux-bionic-cuda11_6-py3_7-gcc7-test: + name: linux-bionic-cuda11.6-py3.7-gcc7 + uses: ./.github/workflows/_linux-test.yml + needs: linux-bionic-cuda11_6-py3_7-gcc7-build + with: + build-environment: linux-bionic-cuda11.6-py3.7-gcc7 + docker-image: ${{ 
needs.linux-bionic-cuda11_6-py3_7-gcc7-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 2, runner: "linux.4xlarge.nvidia.gpu" }, + { config: "default", shard: 2, num_shards: 2, runner: "linux.4xlarge.nvidia.gpu" }, + ]} + + libtorch-linux-bionic-cuda11_6-py3_7-gcc7-build: + name: libtorch-linux-bionic-cuda11.6-py3.7-gcc7 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: libtorch-linux-bionic-cuda11.6-py3.7-gcc7 + docker-image-name: pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7 + build-generates-artifacts: false + + linux-xenial-cuda10_2-py3-gcc7-slow-gradcheck-build: + name: linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck + docker-image-name: pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7 + + linux-xenial-cuda10_2-py3-gcc7-slow-gradcheck-test: + name: linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck + uses: ./.github/workflows/_linux-test.yml + needs: linux-xenial-cuda10_2-py3-gcc7-slow-gradcheck-build + with: + build-environment: linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck + docker-image: ${{ needs.linux-xenial-cuda10_2-py3-gcc7-slow-gradcheck-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 2, runner: "linux.4xlarge.nvidia.gpu" }, + { config: "default", shard: 2, num_shards: 2, runner: "linux.4xlarge.nvidia.gpu" }, + ]} + + linux-bionic-rocm5_1-py3_7-slow-build: + name: linux-bionic-rocm5.1-py3.7-slow + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-bionic-rocm5.1-py3.7 + docker-image-name: pytorch-linux-bionic-rocm5.1-py3.7 + + linux-bionic-rocm5_1-py3_7-slow-test: + name: linux-bionic-rocm5.1-py3.7-slow + uses: ./.github/workflows/_rocm-test.yml + needs: linux-bionic-rocm5_1-py3_7-slow-build + with: + build-environment: linux-bionic-rocm5.1-py3.7 + docker-image: ${{ needs.linux-bionic-rocm5_1-py3_7-slow-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "slow", shard: 1, num_shards: 1, runner: "linux.rocm.gpu" }, + ]} + secrets: + AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }} + AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }} + + linux-bionic-rocm5_1-py3_7-distributed-build: + name: linux-bionic-rocm5.1-py3.7-distributed + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-bionic-rocm5.1-py3.7 + docker-image-name: pytorch-linux-bionic-rocm5.1-py3.7 + + linux-bionic-rocm5_1-py3_7-distributed-test: + name: linux-bionic-rocm5.1-py3.7-distributed + uses: ./.github/workflows/_rocm-test.yml + needs: linux-bionic-rocm5_1-py3_7-distributed-build + with: + build-environment: linux-bionic-rocm5.1-py3.7 + docker-image: ${{ needs.linux-bionic-rocm5_1-py3_7-distributed-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "distributed", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" }, + { config: "distributed", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" }, + ]} + secrets: + AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }} + AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }} + + linux-xenial-cuda11_3-py3_7-gcc7-debug-build: + name: linux-xenial-cuda11.3-py3.7-gcc7-debug + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: 
linux-xenial-cuda11.3-py3.7-gcc7-debug + docker-image-name: pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7 + build-with-debug: true + + linux-xenial-cuda11_3-py3_7-gcc7-debug-test: + name: linux-xenial-cuda11.3-py3.7-gcc7-debug + uses: ./.github/workflows/_linux-test.yml + needs: linux-xenial-cuda11_3-py3_7-gcc7-debug-build + with: + build-environment: linux-xenial-cuda11.3-py3.7-gcc7-debug + docker-image: ${{ needs.linux-xenial-cuda11_3-py3_7-gcc7-debug-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 2, runner: "linux.4xlarge.nvidia.gpu" }, + { config: "default", shard: 2, num_shards: 2, runner: "linux.4xlarge.nvidia.gpu" }, + ]} + + win-vs2019-cuda11_6-py3-build: + name: win-vs2019-cuda11.6-py3 + uses: ./.github/workflows/_win-build.yml + with: + build-environment: win-vs2019-cuda11.6-py3 + cuda-version: "11.6" + + win-vs2019-cuda11_6-py3-test: + name: win-vs2019-cuda11.6-py3 + uses: ./.github/workflows/_win-test.yml + needs: win-vs2019-cuda11_6-py3-build + with: + build-environment: win-vs2019-cuda11.6-py3 + cuda-version: "11.6" + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 2, runner: "windows.8xlarge.nvidia.gpu" }, + { config: "default", shard: 2, num_shards: 2, runner: "windows.8xlarge.nvidia.gpu" }, + { config: "force_on_cpu", shard: 1, num_shards: 1, runner: "windows.4xlarge" }, + ]} + + ios-12-5-1-arm64: + name: ios-12-5-1-arm64 + uses: ./.github/workflows/_ios-build-test.yml + with: + build-environment: ios-12-5-1-arm64 + ios-platform: OS + ios-arch: arm64 + secrets: + IOS_CERT_KEY_2022: ${{ secrets.IOS_CERT_KEY_2022 }} + IOS_CERT_SECRET: ${{ secrets.IOS_CERT_SECRET}} + IOS_DEV_TEAM_ID: ${{ secrets.IOS_DEV_TEAM_ID}} + IOS_SIGN_KEY_2022: ${{ secrets.IOS_SIGN_KEY_2022 }} + + ios-12-5-1-arm64-coreml: + name: ios-12-5-1-arm64-coreml + uses: ./.github/workflows/_ios-build-test.yml + with: + build-environment: ios-12-5-1-arm64-coreml + ios-platform: OS + ios-arch: arm64 + secrets: + IOS_CERT_KEY_2022: ${{ secrets.IOS_CERT_KEY_2022 }} + IOS_CERT_SECRET: ${{ secrets.IOS_CERT_SECRET}} + IOS_DEV_TEAM_ID: ${{ secrets.IOS_DEV_TEAM_ID}} + IOS_SIGN_KEY_2022: ${{ secrets.IOS_SIGN_KEY_2022 }} + + ios-12-5-1-arm64-custom-ops: + name: ios-12-5-1-arm64-custom-ops + uses: ./.github/workflows/_ios-build-test.yml + with: + build-environment: ios-12-5-1-arm64-custom-ops + ios-platform: OS + ios-arch: arm64 + secrets: + IOS_CERT_KEY_2022: ${{ secrets.IOS_CERT_KEY_2022 }} + IOS_CERT_SECRET: ${{ secrets.IOS_CERT_SECRET}} + IOS_DEV_TEAM_ID: ${{ secrets.IOS_DEV_TEAM_ID}} + IOS_SIGN_KEY_2022: ${{ secrets.IOS_SIGN_KEY_2022 }} + + ios-12-5-1-arm64-metal: + name: ios-12-5-1-arm64-metal + uses: ./.github/workflows/_ios-build-test.yml + with: + build-environment: ios-12-5-1-arm64-metal + ios-platform: OS + ios-arch: arm64 + secrets: + IOS_CERT_KEY_2022: ${{ secrets.IOS_CERT_KEY_2022 }} + IOS_CERT_SECRET: ${{ secrets.IOS_CERT_SECRET}} + IOS_DEV_TEAM_ID: ${{ secrets.IOS_DEV_TEAM_ID}} + IOS_SIGN_KEY_2022: ${{ secrets.IOS_SIGN_KEY_2022 }} diff --git a/.github/workflows/pr-labels.yml b/.github/workflows/pr-labels.yml index af2acc1101e7..7313d0b8e968 100644 --- a/.github/workflows/pr-labels.yml +++ b/.github/workflows/pr-labels.yml @@ -4,6 +4,7 @@ on: push: branches: - master + - main jobs: is-properly-labeled: diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml new file mode 100644 index 000000000000..ebc936bcd5ed --- /dev/null +++ b/.github/workflows/pull.yml @@ -0,0 +1,325 @@ +name: pull + +on: + pull_request: + 
push: + branches: + - master + - main + - release/* + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + linux-xenial-py3_7-gcc5_4-build: + name: linux-xenial-py3.7-gcc5.4 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-xenial-py3.7-gcc5.4 + docker-image-name: pytorch-linux-xenial-py3.7-gcc5.4 + + linux-xenial-py3_7-gcc5_4-test: + name: linux-xenial-py3.7-gcc5.4 + uses: ./.github/workflows/_linux-test.yml + needs: linux-xenial-py3_7-gcc5_4-build + with: + build-environment: linux-xenial-py3.7-gcc5.4 + docker-image: ${{ needs.linux-xenial-py3_7-gcc5_4-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, + { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, + { config: "distributed", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, + { config: "docs_test", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, + { config: "backwards_compat", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, + { config: "jit_legacy", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, + ]} + + linux-docs: + name: linux-docs + uses: ./.github/workflows/_docs.yml + needs: linux-xenial-py3_7-gcc5_4-build + with: + build-environment: linux-xenial-py3.7-gcc5.4 + docker-image: ${{ needs.linux-xenial-py3_7-gcc5_4-build.outputs.docker-image }} + + linux-xenial-py3_7-gcc7-build: + name: linux-xenial-py3.7-gcc7 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-xenial-py3.7-gcc7 + docker-image-name: pytorch-linux-xenial-py3.7-gcc7 + + linux-xenial-py3_7-gcc7-test: + name: linux-xenial-py3.7-gcc7 + uses: ./.github/workflows/_linux-test.yml + needs: linux-xenial-py3_7-gcc7-build + with: + build-environment: linux-xenial-py3.7-gcc7 + docker-image: ${{ needs.linux-xenial-py3_7-gcc7-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, + { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, + ]} + + linux-xenial-py3_7-clang7-asan-build: + name: linux-xenial-py3.7-clang7-asan + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-xenial-py3.7-clang7-asan + docker-image-name: pytorch-linux-xenial-py3-clang7-asan + + linux-xenial-py3_7-clang7-asan-test: + name: linux-xenial-py3.7-clang7-asan + uses: ./.github/workflows/_linux-test.yml + needs: linux-xenial-py3_7-clang7-asan-build + with: + build-environment: linux-xenial-py3.7-clang7-asan + docker-image: ${{ needs.linux-xenial-py3_7-clang7-asan-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 5, runner: "linux.2xlarge" }, + { config: "default", shard: 2, num_shards: 5, runner: "linux.2xlarge" }, + { config: "default", shard: 3, num_shards: 5, runner: "linux.2xlarge" }, + { config: "default", shard: 4, num_shards: 5, runner: "linux.2xlarge" }, + { config: "default", shard: 5, num_shards: 5, runner: "linux.2xlarge" }, + ]} + + linux-xenial-py3_7-gcc7-no-ops: + name: linux-xenial-py3.7-gcc7-no-ops + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-xenial-py3.7-gcc7-no-ops + docker-image-name: pytorch-linux-xenial-py3.7-gcc7 + + linux-xenial-py3_7-clang7-onnx-build: + name: linux-xenial-py3.7-clang7-onnx + uses: ./.github/workflows/_linux-build.yml + 
with: + build-environment: linux-xenial-py3.7-clang7-onnx + docker-image-name: pytorch-linux-xenial-py3-clang7-onnx + + linux-xenial-py3_7-clang7-onnx-test: + name: linux-xenial-py3.7-clang7-onnx + uses: ./.github/workflows/_linux-test.yml + needs: linux-xenial-py3_7-clang7-onnx-build + with: + build-environment: linux-xenial-py3.7-clang7-onnx + docker-image: ${{ needs.linux-xenial-py3_7-clang7-onnx-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, + { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, + ]} + + linux-bionic-py3_7-clang9-build: + name: linux-bionic-py3.7-clang9 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-bionic-py3.7-clang9 + docker-image-name: pytorch-linux-bionic-py3.7-clang9 + + linux-bionic-py3_7-clang9-test: + name: linux-bionic-py3.7-clang9 + uses: ./.github/workflows/_linux-test.yml + needs: linux-bionic-py3_7-clang9-build + with: + build-environment: linux-bionic-py3.7-clang9 + docker-image: ${{ needs.linux-bionic-py3_7-clang9-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, + { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, + { config: "crossref", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, + { config: "crossref", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, + ]} + + linux-bionic-cuda11_3-py3_7-clang9-build: + name: linux-bionic-cuda11.3-py3.7-clang9 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-bionic-cuda11.3-py3.7-clang9 + docker-image-name: pytorch-linux-bionic-cuda11.3-cudnn8-py3-clang9 + + linux-vulkan-bionic-py3_7-clang9-build: + name: linux-vulkan-bionic-py3.7-clang9 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-vulkan-bionic-py3.7-clang9 + docker-image-name: pytorch-linux-bionic-py3.7-clang9 + + linux-vulkan-bionic-py3_7-clang9-test: + name: linux-vulkan-bionic-py3.7-clang9 + uses: ./.github/workflows/_linux-test.yml + needs: linux-vulkan-bionic-py3_7-clang9-build + with: + build-environment: linux-vulkan-bionic-py3.7-clang9 + docker-image: ${{ needs.linux-vulkan-bionic-py3_7-clang9-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, + ]} + + linux-xenial-cuda11_3-py3_7-gcc7-build: + name: linux-xenial-cuda11.3-py3.7-gcc7 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-xenial-cuda11.3-py3.7-gcc7 + docker-image-name: pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7 + + linux-xenial-cuda11_3-py3_7-gcc7-test: + name: linux-xenial-cuda11.3-py3.7-gcc7 + uses: ./.github/workflows/_linux-test.yml + needs: linux-xenial-cuda11_3-py3_7-gcc7-build + with: + build-environment: linux-xenial-cuda11.3-py3.7-gcc7 + docker-image: ${{ needs.linux-xenial-cuda11_3-py3_7-gcc7-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" }, + { config: "default", shard: 2, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" }, + { config: "default", shard: 3, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" }, + { config: "default", shard: 4, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" }, + { config: "distributed", shard: 1, num_shards: 2, runner: "linux.8xlarge.nvidia.gpu" }, + { config: "distributed", shard: 2, num_shards: 2, runner: 
"linux.8xlarge.nvidia.gpu" }, + ]} + + linux-bionic-rocm5_1-py3_7-build: + name: linux-bionic-rocm5.1-py3.7 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-bionic-rocm5.1-py3.7 + docker-image-name: pytorch-linux-bionic-rocm5.1-py3.7 + + linux-bionic-rocm5_1-py3_7-test: + name: linux-bionic-rocm5.1-py3.7 + uses: ./.github/workflows/_rocm-test.yml + needs: linux-bionic-rocm5_1-py3_7-build + with: + build-environment: linux-bionic-rocm5.1-py3.7 + docker-image: ${{ needs.linux-bionic-rocm5_1-py3_7-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" }, + { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" }, + ]} + secrets: + AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }} + AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }} + + linux-xenial-py3-clang5-mobile-build: + name: linux-xenial-py3-clang5-mobile-build + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-xenial-py3-clang5-mobile-build + docker-image-name: pytorch-linux-xenial-py3-clang5-asan + build-generates-artifacts: false + + linux-xenial-py3-clang5-mobile-custom-build-static: + name: linux-xenial-py3-clang5-mobile-custom-build-static + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-xenial-py3-clang5-mobile-custom-build-static + docker-image-name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c + build-generates-artifacts: false + + pytorch-xla-linux-bionic-py3_7-clang8-build: + name: pytorch-xla-linux-bionic-py3.7-clang8 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: pytorch-xla-linux-bionic-py3.7-clang8 + docker-image-name: xla_base + + pytorch-xla-linux-bionic-py3_7-clang8-test: + name: pytorch-xla-linux-bionic-py3.7-clang8 + uses: ./.github/workflows/_linux-test.yml + needs: pytorch-xla-linux-bionic-py3_7-clang8-build + with: + build-environment: pytorch-xla-linux-bionic-py3.7-clang8 + docker-image: ${{ needs.pytorch-xla-linux-bionic-py3_7-clang8-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "xla", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, + ]} + + win-vs2019-cpu-py3-build: + name: win-vs2019-cpu-py3 + uses: ./.github/workflows/_win-build.yml + with: + build-environment: win-vs2019-cpu-py3 + cuda-version: cpu + + win-vs2019-cpu-py3-test: + name: win-vs2019-cpu-py3 + uses: ./.github/workflows/_win-test.yml + needs: win-vs2019-cpu-py3-build + with: + build-environment: win-vs2019-cpu-py3 + cuda-version: cpu + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 2, runner: "windows.4xlarge" }, + { config: "default", shard: 2, num_shards: 2, runner: "windows.4xlarge" }, + ]} + + # please ensure that this and its corresponding job in trunk.yml are in sync + win-vs2019-cuda11_3-py3-build: + # don't run build twice on master + if: github.event_name == 'pull_request' + name: win-vs2019-cuda11.3-py3 + uses: ./.github/workflows/_win-build.yml + with: + build-environment: win-vs2019-cuda11.3-py3 + cuda-version: "11.3" + + linux-xenial-cuda11_3-py3_7-gcc7-bazel-test: + name: linux-xenial-cuda11.3-py3.7-gcc7-bazel-test + uses: ./.github/workflows/_bazel-build-test.yml + with: + build-environment: linux-xenial-cuda11.3-py3.7-gcc7-bazel-test + docker-image-name: pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7 + + pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single: + 
name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single + uses: ./.github/workflows/_android-build-test.yml + with: + build-environment: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single + docker-image-name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c + + pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit: + name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit + uses: ./.github/workflows/_android-build-test.yml + with: + build-environment: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit + docker-image-name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c + + linux-xenial-py3_7-gcc5_4-mobile-lightweight-dispatch-build: + name: linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build + docker-image-name: pytorch-linux-xenial-py3.7-gcc5.4 + build-generates-artifacts: false + + deploy-linux-xenial-cuda11_3-py3_7-gcc7-build: + name: deploy-linux-xenial-cuda11.3-py3.7-gcc7 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: deploy-linux-xenial-cuda11.3-py3.7-gcc7 + docker-image-name: pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7 + + deploy-linux-xenial-cuda11_3-py3_7-gcc7-test: + name: linux-xenial-cuda11.3-py3.7-gcc7 + uses: ./.github/workflows/_linux-test.yml + needs: deploy-linux-xenial-cuda11_3-py3_7-gcc7-build + with: + build-environment: deploy-linux-xenial-cuda11.3-py3.7-gcc7 + docker-image: ${{ needs.deploy-linux-xenial-cuda11_3-py3_7-gcc7-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "deploy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" }, + ]} diff --git a/.github/workflows/push_nightly_docker_ghcr.yml b/.github/workflows/push_nightly_docker_ghcr.yml index b11eebe3ffdf..ca30c9651ff8 100644 --- a/.github/workflows/push_nightly_docker_ghcr.yml +++ b/.github/workflows/push_nightly_docker_ghcr.yml @@ -1,22 +1,39 @@ -name: Build PyTorch nightly Docker image and push to GitHub Container Registry +name: docker-release-builds on: schedule: # Push the nightly docker daily at 1 PM UTC - cron: '0 13 * * *' + # Trigger when we modify something related to these images + pull_request: + paths: + - .github/scripts/build_publish_nightly_docker.sh + - .github/workflows/push_nightly_docker_ghcr.yml + - Dockerfile + - docker.Makefile # Have the ability to trigger this job manually using the API as well workflow_dispatch: jobs: - build-publish-docker: + docker-release-build: if: ${{ github.repository == 'pytorch/pytorch' }} runs-on: linux.2xlarge env: GHCR_PAT: ${{ secrets.GHCR_PAT }} + WITH_PUSH: ${{ github.event_name == 'schedule' }} steps: - - name: Checkout + - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - ref: master - - name: Build and upload nightly docker - run: | - bash .github/scripts/build_publish_nightly_docker.sh + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Build and upload nightly docker + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + bash .github/scripts/build_publish_nightly_docker.sh + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 
'workflow_dispatch' }} + cancel-in-progress: true diff --git a/.github/workflows/revert.yml b/.github/workflows/revert.yml index fa5451d96951..05e7e68ff454 100644 --- a/.github/workflows/revert.yml +++ b/.github/workflows/revert.yml @@ -6,6 +6,7 @@ on: jobs: do_revert: + name: try_revert_pr_${{ github.event.client_payload.pr_num }} runs-on: ubuntu-20.04 steps: - name: Setup Python @@ -27,6 +28,14 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.MERGEBOT_TOKEN }} PR_NUM: ${{ github.event.client_payload.pr_num }} + COMMENT_ID: ${{ github.event.client_payload.comment_id }} GH_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} run: | - python3 .github/scripts/trymerge.py --revert "${PR_NUM}" + set -ex + if [ -n "${COMMENT_ID}" ]; then + python3 .github/scripts/trymerge.py --revert --comment-id "${COMMENT_ID}" "${PR_NUM}" + else + python3 .github/scripts/trymerge.py --revert "${PR_NUM}" + fi + +concurrency: try-revert diff --git a/.github/workflows/run_android_tests.yml b/.github/workflows/run_android_tests.yml new file mode 100644 index 000000000000..85cef5623d7e --- /dev/null +++ b/.github/workflows/run_android_tests.yml @@ -0,0 +1,67 @@ +name: android-tests + +on: + push: + tags: + # Trigger on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/trunk/*' + - 'ciflow/android/*' + branches: + - master + - main + - release/* + workflow_dispatch: + +concurrency: + group: run-android-tests-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +defaults: + run: + shell: bash -e -l {0} + +jobs: + + build-and-test: + runs-on: ubuntu-latest + env: + JOB_BASE_NAME: ubuntu-latest-android-tests + steps: + - name: Setup miniconda + uses: conda-incubator/setup-miniconda@v2 + with: + auto-update-conda: true + python-version: 3.8 + activate-environment: build + + - name: Install dependencies + run: | + conda install -y \ + cffi \ + cmake \ + mkl \ + mkl-include \ + ninja \ + numpy \ + pyyaml \ + requests \ + setuptools \ + typing_extensions + + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + + - name: Build PyTorch Android + run: | + export ANDROID_NDK="${ANDROID_SDK_ROOT}/ndk-bundle" + echo "CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"}" >> "${GITHUB_ENV}" + ./scripts/build_pytorch_android.sh x86 + + - name: Run tests + uses: reactivecircus/android-emulator-runner@v2 + with: + api-level: 25 + script: ./android/run_tests.sh diff --git a/.github/workflows/run_torchbench.yml b/.github/workflows/run_torchbench.yml index 6533e43facf8..d84a32ca318e 100644 --- a/.github/workflows/run_torchbench.yml +++ b/.github/workflows/run_torchbench.yml @@ -36,10 +36,15 @@ jobs: # shellcheck disable=SC1091 . 
"${HOME}"/anaconda3/etc/profile.d/conda.sh conda activate pr-ci - conda install -y numpy requests ninja pyyaml mkl mkl-include setuptools cmake cffi typing_extensions \ - future six dataclasses pillow pytest tabulate gitpython git-lfs tqdm + # pin cmake version to 3.22 since 3.23 breaks pytorch build + # see details at: https://github.com/pytorch/pytorch/issues/74985 + conda install -y numpy requests ninja pyyaml mkl mkl-include setuptools cmake=3.22 cffi typing_extensions \ + future six dataclasses pillow pytest tabulate gitpython git-lfs tqdm psutil # install magma conda install -y -c pytorch "${MAGMA_VERSION}" + # install ffmpeg-4.4.1 + # torchvision doesn't compile on ffmpeg-5: https://github.com/pytorch/vision/issues/5616 + conda install -y ffmpeg=4.4.1 - name: Setup TorchBench branch run: | # shellcheck disable=SC1091 @@ -53,7 +58,7 @@ jobs: with: repository: pytorch/benchmark path: benchmark - lfs: true + lfs: false ref: ${{ env.TORCHBENCH_BRANCH }} - name: Run TorchBench run: | @@ -84,5 +89,5 @@ jobs: path: ~/.torchbench/bisection/pr${{ github.event.number }} concurrency: - group: run-torchbench-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml new file mode 100644 index 000000000000..fb29e397b970 --- /dev/null +++ b/.github/workflows/stale.yml @@ -0,0 +1,152 @@ +# A workflow that implements similar logic to actions/stale. +# +# Compared to actions/stale, it is implemented to make API requests proportional +# to the number of stale PRs, not the total number of issues in the repo. This +# is because PyTorch has a lot of issues/PRs, so the actions/stale runs into +# rate limits way too quickly. +# +# The behavior is: +# - If a PR is not labeled stale, after 60 days inactivity label the PR as stale and comment about it. +# - If a PR is labeled stale, after 30 days inactivity close the PR. +# - `high priority` and `no-stale` PRs are exempt. + +name: Close stale pull requests + +on: + schedule: + # Run hourly. + - cron: 30 * * * * + +jobs: + stale: + if: ${{ github.repository == 'pytorch/pytorch' }} + runs-on: ubuntu-latest + + steps: + - uses: actions/github-script@v6 + with: + script: | + // Do some dumb retries on requests. + const retries = 7; + const baseBackoff = 100; + const sleep = timeout => new Promise(resolve => setTimeout(resolve, timeout)); + github.hook.wrap('request', async (request, options) => { + for (let attempt = 1; attempt <= retries; attempt++) { + try { + return await request(options); + } catch (err) { + if (attempt < retries) { + core.warning(`Request getting retried. Attempt: ${attempt}`); + await sleep(baseBackoff * Math.pow(2, attempt)); + continue; + } + throw err; + } + } + }); + + const MAX_API_REQUESTS = 100; + + // If a PRs not labeled stale, label them stale after no update for 60 days. + const STALE_LABEL_THRESHOLD_MS = 1000 * 60 * 60 * 24 * 60; + // For PRs already labeled stale, close after not update for 30 days. + const STALE_CLOSE_THRESHOLD_MS = 1000 * 60 * 60 * 24 * 30; + + const STALE_MESSAGE = + "Looks like this PR hasn't been updated in a while so we're going to go ahead and mark this as `Stale`.
" + + "Feel free to remove the `Stale` label if you feel this was a mistake.
" + + "If you are unable to remove the `Stale` label please contact a maintainer in order to do so.
" + + "If you want the bot to never mark this PR stale again, add the `no-stale` label.
" + + "`Stale` pull requests will automatically be closed after 30 days of inactivity.
"; + + let numAPIRequests = 0; + let numProcessed = 0; + + async function processPull(pull) { + core.info(`[${pull.number}] URL: ${pull.html_url}`); + numProcessed += 1; + const labels = pull.labels.map((label) => label.name); + + // Skip if certain labels are present. + if (labels.includes("no-stale") || labels.includes("high priority")) { + core.info(`[${pull.number}] Skipping because PR has an exempting label.`); + return false; + } + + // Check if the PR is stale, according to our configured thresholds. + let staleThresholdMillis; + if (labels.includes("Stale")) { + core.info(`[${pull.number}] PR is labeled stale, checking whether we should close it.`); + staleThresholdMillis = STALE_CLOSE_THRESHOLD_MS; + } else { + core.info(`[${pull.number}] Checking whether to label PR as stale.`); + staleThresholdMillis = STALE_LABEL_THRESHOLD_MS; + } + + const millisSinceLastUpdated = + new Date().getTime() - new Date(pull.updated_at).getTime(); + + if (millisSinceLastUpdated < staleThresholdMillis) { + core.info(`[${pull.number}] Skipping because PR was updated recently`); + return false; + } + + // At this point, we know we should do something. + // For PRs already labeled stale, close them. + if (labels.includes("Stale")) { + core.info(`[${pull.number}] Closing PR.`); + numAPIRequests += 1; + await github.rest.issues.update({ + owner: "pytorch", + repo: "pytorch", + issue_number: pull.number, + state: "closed", + }); + } else { + // For PRs not labeled stale, label them stale. + core.info(`[${pull.number}] Labeling PR as stale.`); + + numAPIRequests += 1; + await github.rest.issues.createComment({ + owner: "pytorch", + repo: "pytorch", + issue_number: pull.number, + body: STALE_MESSAGE, + }); + + numAPIRequests += 1; + await github.rest.issues.addLabels({ + owner: "pytorch", + repo: "pytorch", + issue_number: pull.number, + labels: ["Stale"], + }); + } + } + + for await (const response of github.paginate.iterator( + github.rest.pulls.list, + { + owner: "pytorch", + repo: "pytorch", + state: "open", + sort: "created", + direction: "asc", + per_page: 100, + } + )) { + numAPIRequests += 1; + const pulls = response.data; + // Awaiting in a loop is intentional here. We want to serialize execution so + // that log groups are printed correctl + for (const pull of pulls) { + if (numAPIRequests > MAX_API_REQUESTS) { + core.warning("Max API requests exceeded, exiting."); + process.exit(0); + } + await core.group(`Processing PR #${pull.number}`, async () => { + await processPull(pull); + }); + } + } + core.info(`Processed ${numProcessed} PRs total.`); diff --git a/.github/workflows/stale_pull_requests.yml b/.github/workflows/stale_pull_requests.yml deleted file mode 100644 index fabb1c6b1a66..000000000000 --- a/.github/workflows/stale_pull_requests.yml +++ /dev/null @@ -1,36 +0,0 @@ -name: 'Close stale pull requests' -on: - schedule: - # TODO: Reduce frequency once we work through the backlog of pull requests - - cron: '0 * * * *' - workflow_dispatch: - -jobs: - stale: - if: ${{ github.repository == 'pytorch/pytorch' }} - runs-on: ubuntu-18.04 - steps: - - uses: actions/stale@v4.1.0 - with: - stale-pr-message: > - Looks like this PR hasn't been updated in a while so we're going to go ahead and mark this as `Stale`.
- Feel free to remove the `Stale` label if you feel this was a mistake.
- `Stale` pull requests will automatically be closed 30 days after being marked `Stale`
- exempt-pr-labels: "no-stale,open source,high priority" - days-before-stale: 60 - days-before-close: 90 - stale-open-source: - if: ${{ github.repository == 'pytorch/pytorch' }} - runs-on: ubuntu-18.04 - steps: - - uses: actions/stale@v4.1.0 - with: - stale-pr-message: > - Looks like this PR hasn't been updated in a while so we're going to go ahead and mark this as `Stale`.
- Feel free to remove the `Stale` label if you feel this was a mistake.
- If you are unable to remove the `Stale` label please contact a maintainer in order to do so.
- `Stale` pull requests will automatically be closed 30 days after being marked `Stale`
- exempt-pr-labels: "no-stale,high priority" - only-labels: "open source" - days-before-stale: 150 - days-before-close: 180 diff --git a/.github/workflows/test_tools.yml b/.github/workflows/test_tools.yml deleted file mode 100644 index ed8f5babdb8d..000000000000 --- a/.github/workflows/test_tools.yml +++ /dev/null @@ -1,38 +0,0 @@ -name: Test tools - -on: - push: - branches: - - master - pull_request: - -jobs: - test: - if: ${{ github.repository == 'pytorch/pytorch' }} - runs-on: ubuntu-18.04 - steps: - - name: Setup Python - uses: actions/setup-python@v2 - with: - python-version: 3.8 - architecture: x64 - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - fetch-depth: 0 # deep clone, to allow us to use git log - - name: Install dependencies - # mypy and boto3 versions copied from - # .circleci/docker/common/install_conda.sh - run: | - set -eux - python3 -mpip install -r requirements.txt - python3 -mpip install boto3==1.16.34 - make setup_lint - - name: Test tools - run: | - python3 -m unittest discover -vs tools/test -p 'test_*.py' - python3 -m unittest discover -vs .github/scripts -p 'test_*.py' - -concurrency: - group: test-tools-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml new file mode 100644 index 000000000000..3f210d3381fe --- /dev/null +++ b/.github/workflows/trunk.yml @@ -0,0 +1,230 @@ +name: trunk + +on: + push: + branches: + - master + - main + - release/* + tags: + - ciflow/trunk/* + - ciflow/all/* + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + parallelnative-linux-xenial-py3_7-gcc5_4-build: + name: parallelnative-linux-xenial-py3.7-gcc5.4 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: parallelnative-linux-xenial-py3.7-gcc5.4 + docker-image-name: pytorch-linux-xenial-py3.7-gcc5.4 + + parallelnative-linux-xenial-py3_7-gcc5_4-test: + name: parallelnative-linux-xenial-py3.7-gcc5.4 + uses: ./.github/workflows/_linux-test.yml + needs: parallelnative-linux-xenial-py3_7-gcc5_4-build + with: + build-environment: parallelnative-linux-xenial-py3.7-gcc5.4 + docker-image: ${{ needs.parallelnative-linux-xenial-py3_7-gcc5_4-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, + { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, + ]} + + # Build PyTorch with BUILD_CAFFE2=ON + caffe2-linux-xenial-py3_7-gcc5_4-build: + name: caffe2-linux-xenial-py3.7-gcc5.4 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: caffe2-linux-xenial-py3.7-gcc5.4 + docker-image-name: pytorch-linux-xenial-py3.7-gcc5.4 + + linux-bionic-cuda10_2-py3_9-gcc7-build: + name: linux-bionic-cuda10.2-py3.9-gcc7 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-bionic-cuda10.2-py3.9-gcc7 + docker-image-name: pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7 + + linux-bionic-cuda10_2-py3_9-gcc7-test: + name: linux-bionic-cuda10.2-py3.9-gcc7 + uses: ./.github/workflows/_linux-test.yml + needs: linux-bionic-cuda10_2-py3_9-gcc7-build + with: + build-environment: linux-bionic-cuda10.2-py3.9-gcc7 + docker-image: ${{ needs.linux-bionic-cuda10_2-py3_9-gcc7-build.outputs.docker-image }} + 
test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 2, runner: "linux.4xlarge.nvidia.gpu" }, + { config: "default", shard: 2, num_shards: 2, runner: "linux.4xlarge.nvidia.gpu" }, + { config: "slow", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" }, + { config: "nogpu_NO_AVX", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, + { config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, + { config: "jit_legacy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" }, + { config: "distributed", shard: 1, num_shards: 2, runner: "linux.8xlarge.nvidia.gpu" }, + { config: "distributed", shard: 2, num_shards: 2, runner: "linux.8xlarge.nvidia.gpu" }, + { config: "multigpu", shard: 1, num_shards: 1, runner: "linux.16xlarge.nvidia.gpu" }, + ]} + + libtorch-linux-xenial-cuda10_2-py3_7-gcc7-build: + name: libtorch-linux-xenial-cuda10.2-py3.7-gcc7 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: libtorch-linux-xenial-cuda10.2-py3.7-gcc7 + docker-image-name: pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7 + build-generates-artifacts: false + + libtorch-linux-xenial-cuda11_3-py3_7-gcc7-build: + name: libtorch-linux-xenial-cuda11.3-py3.7-gcc7 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: libtorch-linux-xenial-cuda11.3-py3.7-gcc7 + docker-image-name: pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7 + build-generates-artifacts: false + + # no-ops builds test USE_PER_OPERATOR_HEADERS=0 where ATen/ops is not generated + linux-xenial-cuda11_3-py3_7-gcc7-no-ops-build: + name: linux-xenial-cuda11.3-py3.7-gcc7-no-ops + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-xenial-cuda11.3-py3.7-gcc7-no-ops + docker-image-name: pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7 + + pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build: + name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build + uses: ./.github/workflows/_android-full-build-test.yml + with: + build-environment: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build + docker-image-name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c + secrets: + SONATYPE_NEXUS_USERNAME: ${{ secrets.SONATYPE_NEXUS_USERNAME }} + SONATYPE_NEXUS_PASSWORD: ${{ secrets.SONATYPE_NEXUS_PASSWORD }} + ANDROID_SIGN_KEY: ${{ secrets.ANDROID_SIGN_KEY }} + ANDROID_SIGN_PASS: ${{ secrets.ANDROID_SIGN_PASS }} + SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} + + linux-bionic-py3_7-clang9-slow-build: + name: linux-bionic-py3.7-clang9-slow + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-bionic-py3.7-clang9-slow + docker-image-name: pytorch-linux-bionic-py3.7-clang9 + + linux-bionic-py3_7-clang9-slow-test: + name: linux-bionic-py3.7-clang9-slow + uses: ./.github/workflows/_linux-test.yml + needs: linux-bionic-py3_7-clang9-slow-build + with: + build-environment: linux-bionic-py3.7-clang9-slow + docker-image: ${{ needs.linux-bionic-py3_7-clang9-slow-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "slow", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, + ]} + + ios-12-5-1-x86-64: + name: ios-12-5-1-x86-64 + uses: ./.github/workflows/_ios-build-test.yml + with: + build-environment: ios-12-5-1-x86-64 + ios-platform: SIMULATOR + ios-arch: x86_64 + secrets: + IOS_CERT_KEY_2022: ${{ secrets.IOS_CERT_KEY_2022 }} + IOS_CERT_SECRET: ${{ secrets.IOS_CERT_SECRET}} + IOS_DEV_TEAM_ID: ${{ secrets.IOS_DEV_TEAM_ID}} + IOS_SIGN_KEY_2022: ${{ secrets.IOS_SIGN_KEY_2022 }} + + 
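
The `secrets:` blocks used by the Android and iOS jobs above, and by the macOS jobs that follow, only work because the called workflow declares the same names under its `workflow_call` trigger. A minimal sketch of that declaration side (illustrative, not the actual `_ios-build-test.yml`):

on:
  workflow_call:
    inputs:
      build-environment:
        required: true
        type: string
    secrets:
      IOS_CERT_KEY_2022:
        required: true

jobs:
  build:
    runs-on: macos-12
    steps:
      - name: Use the forwarded secret
        env:
          CERT_KEY: ${{ secrets.IOS_CERT_KEY_2022 }}
        run: test -n "${CERT_KEY}" && echo "certificate key was forwarded by the caller"

A secret the caller does not list is simply not visible inside the called workflow, which keeps the credential surface of each template explicit.
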
ios-12-5-1-x86-64-coreml: + name: ios-12-5-1-x86-64-coreml + uses: ./.github/workflows/_ios-build-test.yml + with: + build-environment: ios-12-5-1-x86-64-coreml + ios-platform: SIMULATOR + ios-arch: x86_64 + secrets: + IOS_CERT_KEY_2022: ${{ secrets.IOS_CERT_KEY_2022 }} + IOS_CERT_SECRET: ${{ secrets.IOS_CERT_SECRET}} + IOS_DEV_TEAM_ID: ${{ secrets.IOS_DEV_TEAM_ID}} + IOS_SIGN_KEY_2022: ${{ secrets.IOS_SIGN_KEY_2022 }} + + macos-11-py3-x86-64-build: + name: macos-11-py3-x86-64 + uses: ./.github/workflows/_mac-build.yml + with: + build-environment: macos-11-py3-x86-64 + xcode-version: "13.3.1" + runner-type: macos-12 + build-generates-artifacts: true + secrets: + MACOS_SCCACHE_S3_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + MACOS_SCCACHE_S3_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + + macos-11-py3-x86-64-test: + name: macos-11-py3-x86-64 + uses: ./.github/workflows/_mac-test.yml + needs: macos-11-py3-x86-64-build + with: + build-environment: macos-11-py3-x86-64 + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 2, runner: "macos-12", xcode-version: "13.3.1" }, + { config: "default", shard: 2, num_shards: 2, runner: "macos-12", xcode-version: "13.3.1" }, + ]} + secrets: + AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }} + AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }} + + macos-10-15-py3-lite-interpreter-x86-64: + name: macos-10-15-py3-lite-interpreter-x86-64 + uses: ./.github/workflows/_mac-build.yml + with: + build-environment: macos-10-15-py3-lite-interpreter-x86-64 + xcode-version: "12" + runner-type: macos-10.15 + build-generates-artifacts: false + secrets: + MACOS_SCCACHE_S3_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + MACOS_SCCACHE_S3_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + + macos-10-15-py3-arm64: + name: macos-10-15-py3-arm64 + uses: ./.github/workflows/_mac-build.yml + with: + build-environment: macos-10-15-py3-arm64 + xcode-version: "13.3.1" + runner-type: macos-12 + build-generates-artifacts: false + secrets: + MACOS_SCCACHE_S3_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + MACOS_SCCACHE_S3_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + + # please ensure that this and its corresponding job in pull.yml are in sync + win-vs2019-cuda11_3-py3-build: + name: win-vs2019-cuda11.3-py3 + uses: ./.github/workflows/_win-build.yml + with: + build-environment: win-vs2019-cuda11.3-py3 + cuda-version: "11.3" + + win-vs2019-cuda11_3-py3-test: + name: win-vs2019-cuda11.3-py3 + uses: ./.github/workflows/_win-test.yml + needs: win-vs2019-cuda11_3-py3-build + with: + build-environment: win-vs2019-cuda11.3-py3 + cuda-version: "11.3" + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 5, runner: "windows.8xlarge.nvidia.gpu" }, + { config: "default", shard: 2, num_shards: 5, runner: "windows.8xlarge.nvidia.gpu" }, + { config: "default", shard: 3, num_shards: 5, runner: "windows.8xlarge.nvidia.gpu" }, + { config: "default", shard: 4, num_shards: 5, runner: "windows.8xlarge.nvidia.gpu" }, + { config: "default", shard: 5, num_shards: 5, runner: "windows.8xlarge.nvidia.gpu" }, + { config: "force_on_cpu", shard: 1, num_shards: 1, runner: "windows.4xlarge" }, + ]} diff --git a/.github/workflows/trymerge.yml b/.github/workflows/trymerge.yml index ae29ab82462a..d5092046ebad 100644 --- a/.github/workflows/trymerge.yml +++ 
b/.github/workflows/trymerge.yml @@ -6,7 +6,8 @@ on: jobs: do_merge: - runs-on: ubuntu-20.04 + name: try_merge_pr_${{ github.event.client_payload.pr_num }} + runs-on: linux.20_04.4x steps: - name: Setup Python uses: actions/setup-python@v2 @@ -28,5 +29,25 @@ jobs: GITHUB_TOKEN: ${{ secrets.MERGEBOT_TOKEN }} PR_NUM: ${{ github.event.client_payload.pr_num }} GH_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + FORCE: ${{ github.event.client_payload.force}} + ON_GREEN: ${{ github.event.client_payload.on_green}} + COMMENT_ID: ${{ github.event.client_payload.comment_id }} run: | - python3 .github/scripts/trymerge.py "${PR_NUM}" + set -ex + if [ -n "${FORCE}" ]; then + if [ -n "${COMMENT_ID}" ]; then + python3 .github/scripts/trymerge.py --force --comment-id "${COMMENT_ID}" "${PR_NUM}" + else + python3 .github/scripts/trymerge.py --force "${PR_NUM}" + fi + elif [ -n "${ON_GREEN}" ]; then + python3 .github/scripts/trymerge.py --on-green "${PR_NUM}" + elif [ -n "${COMMENT_ID}" ]; then + python3 .github/scripts/trymerge.py --comment-id "${COMMENT_ID}" "${PR_NUM}" + else + python3 .github/scripts/trymerge.py "${PR_NUM}" + fi + +# TODO: Separate merge on green merges from regular merges to not hold up try-merge workflows overall concurrency +# NOTE: force pushes are also put in their concurrency group to put them higher than regular merges +concurrency: try-merge-${{ github.event.client_payload.force}}-${{ github.event.client_payload.on_green }} diff --git a/.github/workflows/tryrebase.yml b/.github/workflows/tryrebase.yml new file mode 100644 index 000000000000..d45018c1ad6d --- /dev/null +++ b/.github/workflows/tryrebase.yml @@ -0,0 +1,34 @@ +name: Rebase PR + +on: + repository_dispatch: + types: [try-rebase] + +jobs: + do_rebase: + runs-on: ubuntu-20.04 + steps: + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: 3.8 + architecture: x64 + + - name: Checkout repo + uses: actions/checkout@v2 + with: + fetch-depth: 0 + token: ${{ secrets.MERGEBOT_TOKEN }} + + - name: Setup committer id + run: | + git config --global user.email "pytorchmergebot@users.noreply.github.com" + git config --global user.name "PyTorch MergeBot" + + - name: Rebase + env: + GITHUB_TOKEN: ${{ secrets.MERGEBOT_TOKEN }} + PR_NUM: ${{ github.event.client_payload.pr_num }} + GH_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + run: | + python3 .github/scripts/tryrebase.py "${PR_NUM}" diff --git a/.github/workflows/update_pytorch_labels.yml b/.github/workflows/update_pytorch_labels.yml index 82061efa3c3c..f19347070ece 100644 --- a/.github/workflows/update_pytorch_labels.yml +++ b/.github/workflows/update_pytorch_labels.yml @@ -17,8 +17,8 @@ jobs: uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - name: Update PyTorch labels list in S3 env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_SECRET_ACCESS_KEY }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }} run: | python3 -m pip install boto3==1.19.12 .github/scripts/export_pytorch_labels.py diff --git a/.github/workflows/update_s3_htmls.yml b/.github/workflows/update_s3_htmls.yml index 6a53d4d24595..5f3ff056c5a4 100644 --- a/.github/workflows/update_s3_htmls.yml +++ b/.github/workflows/update_s3_htmls.yml @@ -12,7 +12,7 @@ jobs: if: ${{ github.repository == 'pytorch/pytorch' 
}} strategy: matrix: - prefix: ["whl", "whl/test", "whl/nightly"] + prefix: ["whl", "whl/test", "whl/nightly", "whl/lts/1.8"] steps: - name: Run updater image env: @@ -20,4 +20,4 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_UPDATE_SECRET_ACCESS_KEY }} uses: docker://pytorch/manage_s3_html with: - args: ${{ matrix.prefix }} + args: --generate-pep503 ${{ matrix.prefix }} diff --git a/.github/workflows/upload-test-stats.yml b/.github/workflows/upload-test-stats.yml new file mode 100644 index 000000000000..bfed85e5131e --- /dev/null +++ b/.github/workflows/upload-test-stats.yml @@ -0,0 +1,35 @@ +name: Upload test stats + +on: + workflow_run: + workflows: [pull, trunk, periodic] + types: + - completed + +jobs: + upload-test-stats: + if: github.event.workflow_run.conclusion == 'success' || github.event.workflow_run.conclusion == 'failure' + runs-on: [self-hosted, linux.2xlarge] + + steps: + - name: Print workflow information + env: + TRIGGERING_WORKFLOW: ${{ toJSON(github.event.workflow_run) }} + run: echo "${TRIGGERING_WORKFLOW}" + + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + + - run: | + pip3 install requests==2.26 + pip3 install rockset==0.8.3 + pip3 install boto3==1.19.12 + pip3 install six==1.16.0 + + - name: Upload test stats + env: + ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + WORKFLOW_RUN_ID: ${{ github.event.workflow_run.id }} + WORKFLOW_RUN_ATTEMPT: ${{ github.event.workflow_run.run_attempt }} + run: python3 tools/stats/upload_test_stats.py --workflow-run-id "${WORKFLOW_RUN_ID}" --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}" diff --git a/.gitignore b/.gitignore index 71e9d56255e1..b62b84d9d0e8 100644 --- a/.gitignore +++ b/.gitignore @@ -35,6 +35,7 @@ aten/src/ATen/cuda/CUDAConfig.h benchmarks/.data caffe2/cpp_test/ dist/ +docs/build/ docs/cpp/src docs/src/**/* docs/cpp/build @@ -66,8 +67,11 @@ torch/_C/__init__.pyi torch/_C/_nn.pyi torch/_C/_VariableFunctions.pyi torch/_VF.pyi +torch/return_types.pyi torch/nn/functional.pyi +torch/utils/data/datapipes/datapipe.pyi torch/csrc/autograd/generated/* +torch/csrc/lazy/generated/*.[!m]* # Listed manually because some files in this directory are not generated torch/testing/_internal/generated/annotated_fn_args.py torch/testing/_internal/data/*.pt @@ -137,6 +141,7 @@ scripts/release_notes/*.json compile_commands.json *.egg-info/ docs/source/scripts/activation_images/ +docs/source/scripts/quantization_backend_configs/ ## General @@ -255,6 +260,9 @@ cmake-build-debug # # Below files are not deleted by "setup.py clean". 
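
The `workflow_run` trigger used by upload-test-stats.yml above fires after the named workflows finish, and its payload carries the metadata of the run that triggered it. A small illustrative consumer (hypothetical job, not part of this patch) showing the fields that upload_test_stats.py is handed:

on:
  workflow_run:
    workflows: [pull, trunk, periodic]
    types: [completed]

jobs:
  inspect:
    runs-on: ubuntu-latest
    steps:
      - name: Show the triggering run
        env:
          RUN_ID: ${{ github.event.workflow_run.id }}
          RUN_ATTEMPT: ${{ github.event.workflow_run.run_attempt }}
          CONCLUSION: ${{ github.event.workflow_run.conclusion }}
          HEAD_SHA: ${{ github.event.workflow_run.head_sha }}
        run: echo "run=${RUN_ID} attempt=${RUN_ATTEMPT} conclusion=${CONCLUSION} sha=${HEAD_SHA}"
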
+# Downloaded bazel +tools/bazel + # Visual Studio Code files .vs /.vscode/* @@ -304,10 +312,20 @@ bazel-* *.zip # core dump files -core.* +**/core.[1-9]* # Generated if you use the pre-commit script for clang-tidy pr.diff # coverage files */**/.coverage.* + +# buck generated files +.buckd/ +.lsp-buck-out/ +.lsp.buckd/ +buck-out/ + +# Downloaded libraries +third_party/ruy/ +third_party/glog/ diff --git a/.gitmodules b/.gitmodules index 9c9373ef7229..8d1ea6f02fa7 100644 --- a/.gitmodules +++ b/.gitmodules @@ -9,7 +9,7 @@ [submodule "third_party/eigen"] ignore = dirty path = third_party/eigen - url = https://github.com/eigenteam/eigen-git-mirror.git + url = https://gitlab.com/libeigen/eigen.git [submodule "third_party/googletest"] ignore = dirty path = third_party/googletest @@ -139,9 +139,6 @@ [submodule "third_party/pocketfft"] path = third_party/pocketfft url = https://github.com/mreineck/pocketfft -[submodule "third_party/breakpad"] - path = third_party/breakpad - url = https://github.com/driazati/breakpad.git [submodule "third_party/flatbuffers"] path = third_party/flatbuffers url = https://github.com/google/flatbuffers.git diff --git a/.jenkins/caffe2/common.sh b/.jenkins/caffe2/common.sh index 168e823ba2cc..087055536564 100644 --- a/.jenkins/caffe2/common.sh +++ b/.jenkins/caffe2/common.sh @@ -26,7 +26,7 @@ if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then fi fi -# /usr/local/caffe2 is where the cpp bits are installed to in in cmake-only +# /usr/local/caffe2 is where the cpp bits are installed to in cmake-only # builds. In +python builds the cpp tests are copied to /usr/local/caffe2 so # that the test code in .jenkins/test.sh is the same INSTALL_PREFIX="/usr/local/caffe2" diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh index fd626d09c3e2..e9d1feba7a50 100755 --- a/.jenkins/caffe2/test.sh +++ b/.jenkins/caffe2/test.sh @@ -45,8 +45,8 @@ fi ################################################################################ # C++ tests # ################################################################################ -# Don't run cpp tests a second time in the sharded ort_test2 job -if [[ "$BUILD_ENVIRONMENT" != *ort_test2* ]]; then +# Only run cpp tests in the first shard, don't run cpp tests a second time in the second shard +if [[ "${SHARD_NUMBER:-1}" == "1" ]]; then echo "Running C++ tests.." for test in $(find "$cpp_test_dir" -executable -type f); do case "$test" in @@ -134,19 +134,15 @@ if [[ $BUILD_ENVIRONMENT == *-rocm* ]]; then rocm_ignore_test+=("--ignore $caffe2_pypath/python/ideep/pool_op_test.py") fi -# NB: Warnings are disabled because they make it harder to see what -# the actual erroring test is echo "Running Python tests.." 
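
Because the `.gitmodules` change above repoints the eigen submodule at a new URL, existing checkouts have to resync their submodule remotes before updating. A short illustrative step (not part of this patch):

steps:
  - name: Resync submodules after a .gitmodules URL change
    run: |
      # 'sync' copies the URLs from .gitmodules into .git/config; 'update' then fetches from the new remote
      git submodule sync --recursive
      git submodule update --init --recursive
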
-if [[ "$BUILD_ENVIRONMENT" == *py3* ]]; then - # locale setting is required by click package with py3 - for loc in "en_US.utf8" "C.UTF-8"; do - if locale -a | grep "$loc" >/dev/null 2>&1; then - export LC_ALL="$loc" - export LANG="$loc" - break; - fi - done -fi +# locale setting is required by click package +for loc in "en_US.utf8" "C.UTF-8"; do + if locale -a | grep "$loc" >/dev/null 2>&1; then + export LC_ALL="$loc" + export LANG="$loc" + break; + fi +done # Some Caffe2 tests fail when run using AVX512 ISA, see https://github.com/pytorch/pytorch/issues/66111 export DNNL_MAX_CPU_ISA=AVX2 @@ -154,6 +150,8 @@ export DNNL_MAX_CPU_ISA=AVX2 # Should still run even in the absence of SHARD_NUMBER if [[ "${SHARD_NUMBER:-1}" == "1" ]]; then pip install --user pytest-sugar + # NB: Warnings are disabled because they make it harder to see what + # the actual erroring test is "$PYTHON" \ -m pytest \ -x \ @@ -170,18 +168,18 @@ if [[ "${SHARD_NUMBER:-1}" == "1" ]]; then "${EXTRA_TESTS[@]}" fi -##################### -# torchvision tests # -##################### +############## +# ONNX tests # +############## if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then # Check out torch/vision at 0.9.0-rc1 commit # This hash must match one in .jenkins/pytorch/test.sh pip install -q --user git+https://github.com/pytorch/vision.git@8a2dc6f22ac4389ccba8859aa1e1cb14f1ee53db - pip install -q --user ninja + pip install -q --user ninja flatbuffers==2.0 numpy==1.21.5 onnxruntime==1.11.0 + # numba requires numpy <= 1.20, onnxruntime requires numpy >= 1.21. + # We don't actually need it for our tests, but it's imported if it's present, so uninstall. + pip uninstall -q --yes numba # JIT C++ extensions require ninja, so put it into PATH. export PATH="/var/lib/jenkins/.local/bin:$PATH" - if [[ "$BUILD_ENVIRONMENT" == *py3* ]]; then - pip install -q --user flatbuffers==2.0 onnxruntime==1.9.0 - fi "$ROOT_DIR/scripts/onnx/test.sh" fi diff --git a/.jenkins/pytorch/build-asan.sh b/.jenkins/pytorch/build-asan.sh index 60d5e5e80807..b15ab65afa87 100755 --- a/.jenkins/pytorch/build-asan.sh +++ b/.jenkins/pytorch/build-asan.sh @@ -15,7 +15,7 @@ clang --version # detect_leaks=0: Python is very leaky, so we need suppress it # symbolize=1: Gives us much better errors when things go wrong -export ASAN_OPTIONS=detect_leaks=0:symbolize=1:detect_odr_violation=0 +export ASAN_OPTIONS=detect_leaks=0:detect_stack_use_after_return=1:symbolize=1:detect_odr_violation=0 if [ -n "$(which conda)" ]; then export CMAKE_PREFIX_PATH=/opt/conda fi @@ -35,7 +35,7 @@ fi # # TODO: Make the ASAN flags a centralized env var and unify with USE_ASAN option CC="clang" CXX="clang++" LDSHARED="clang --shared" \ - CFLAGS="-fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -shared-libasan -pthread" \ + CFLAGS="-fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -fsanitize-address-use-after-scope -shared-libasan -pthread" \ CXX_FLAGS="-pthread" \ USE_ASAN=1 USE_CUDA=0 USE_MKLDNN=0 \ python setup.py bdist_wheel diff --git a/.jenkins/pytorch/build-mobile.sh b/.jenkins/pytorch/build-mobile.sh index f79306f87032..48cfb4fba83a 100755 --- a/.jenkins/pytorch/build-mobile.sh +++ b/.jenkins/pytorch/build-mobile.sh @@ -26,6 +26,8 @@ retry pip install --pre torch torchvision \ # binary, and running forward pass with a real model. 
if [[ "$BUILD_ENVIRONMENT" == *-mobile-custom-build-static* ]]; then TEST_CUSTOM_BUILD_STATIC=1 test/mobile/custom_build/build.sh +elif [[ "$BUILD_ENVIRONMENT" == *-mobile-lightweight-dispatch* ]]; then + test/mobile/lightweight_dispatch/build.sh else TEST_DEFAULT_BUILD=1 test/mobile/custom_build/build.sh fi diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index 1fc4fecf2f82..8c74fc107603 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -20,7 +20,7 @@ if [[ "$BUILD_ENVIRONMENT" == *-mobile-*build* ]]; then exec "$(dirname "${BASH_SOURCE[0]}")/build-mobile.sh" "$@" fi -if [[ "$BUILD_ENVIRONMENT" == *linux-xenial-cuda11.3* || "$BUILD_ENVIRONMENT" == *linux-bionic-cuda11.5* ]]; then +if [[ "$BUILD_ENVIRONMENT" == *linux-xenial-cuda11.3* || "$BUILD_ENVIRONMENT" == *linux-bionic-cuda11.5* || "$BUILD_ENVIRONMENT" == *linux-bionic-cuda11.6* ]]; then # Enabling DEPLOY build (embedded torch python interpreter, experimental) # only on one config for now, can expand later export USE_DEPLOY=ON @@ -209,10 +209,13 @@ else if [[ "$BUILD_ENVIRONMENT" != *libtorch* ]]; then - # ppc64le build fails when WERROR=1 + # ppc64le, rocm builds fail when WERROR=1 + # XLA test build fails when WERROR=1 # set only when building other architectures - # only use for "python setup.py install" line - if [[ "$BUILD_ENVIRONMENT" != *ppc64le* && "$BUILD_ENVIRONMENT" != *rocm* ]]; then + # or building non-XLA tests. + if [[ "$BUILD_ENVIRONMENT" != *ppc64le* && + "$BUILD_ENVIRONMENT" != *rocm* && + "$BUILD_ENVIRONMENT" != *xla* ]]; then WERROR=1 python setup.py bdist_wheel else python setup.py bdist_wheel @@ -249,13 +252,11 @@ else fi sudo rm -rf original popd - - # exit before building custom test artifacts until we resolve cmake error: - # static library kineto_LIBRARY-NOTFOUND not found. - exit 0 fi CUSTOM_TEST_ARTIFACT_BUILD_DIR=${CUSTOM_TEST_ARTIFACT_BUILD_DIR:-${PWD}/../} + CUSTOM_TEST_USE_ROCM=$([[ "$BUILD_ENVIRONMENT" == *rocm* ]] && echo "ON" || echo "OFF") + CUSTOM_TEST_MODULE_PATH="${PWD}/cmake/public" mkdir -pv "${CUSTOM_TEST_ARTIFACT_BUILD_DIR}" # Build custom operator tests. 
@@ -265,7 +266,8 @@ else SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')" mkdir -p "$CUSTOM_OP_BUILD" pushd "$CUSTOM_OP_BUILD" - cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" + cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" \ + -DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM" make VERBOSE=1 popd assert_git_not_dirty @@ -277,7 +279,8 @@ else SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')" mkdir -p "$JIT_HOOK_BUILD" pushd "$JIT_HOOK_BUILD" - cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" + cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" \ + -DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM" make VERBOSE=1 popd assert_git_not_dirty @@ -288,7 +291,8 @@ else python --version mkdir -p "$CUSTOM_BACKEND_BUILD" pushd "$CUSTOM_BACKEND_BUILD" - cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" + cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" \ + -DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM" make VERBOSE=1 popd assert_git_not_dirty diff --git a/.jenkins/pytorch/codegen-test.sh b/.jenkins/pytorch/codegen-test.sh index 290baa7a3b3b..9f895bbdbcc4 100755 --- a/.jenkins/pytorch/codegen-test.sh +++ b/.jenkins/pytorch/codegen-test.sh @@ -26,7 +26,7 @@ set -x rm -rf "$OUT" # aten codegen -python -m tools.codegen.gen \ +python -m torchgen.gen \ -d "$OUT"/torch/share/ATen # torch codegen @@ -38,6 +38,7 @@ mkdir -p "$OUT"/pyi/torch/_C mkdir -p "$OUT"/pyi/torch/nn python -m tools.pyi.gen_pyi \ --native-functions-path aten/src/ATen/native/native_functions.yaml \ + --tags-path aten/src/ATen/native/tags.yaml \ --deprecated-functions-path tools/autograd/deprecated.yaml \ --out "$OUT"/pyi @@ -45,6 +46,7 @@ python -m tools.pyi.gen_pyi \ python -m tools.autograd.gen_autograd \ "$OUT"/torch/share/ATen/Declarations.yaml \ aten/src/ATen/native/native_functions.yaml \ + aten/src/ATen/native/tags.yaml \ "$OUT"/autograd \ tools/autograd @@ -52,5 +54,6 @@ python -m tools.autograd.gen_autograd \ mkdir -p "$OUT"/annotated_fn_args python -m tools.autograd.gen_annotated_fn_args \ aten/src/ATen/native/native_functions.yaml \ + aten/src/ATen/native/tags.yaml \ "$OUT"/annotated_fn_args \ tools/autograd diff --git a/.jenkins/pytorch/common.sh b/.jenkins/pytorch/common.sh index be5245bf19bc..a593db026005 100644 --- a/.jenkins/pytorch/common.sh +++ b/.jenkins/pytorch/common.sh @@ -8,20 +8,25 @@ set -ex # Save the SCRIPT_DIR absolute path in case later we chdir (as occurs in the gpu perf test) SCRIPT_DIR="$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )" +if [[ "${BUILD_ENVIRONMENT}" == *linux* ]]; then + # TODO: Remove this once nvidia package repos are back online + # Comment out nvidia repositories to prevent them from getting apt-get updated, see https://github.com/pytorch/pytorch/issues/74968 + # shellcheck disable=SC2046 + sudo sed -i 's/.*nvidia.*/# &/' $(find /etc/apt/ -type f -name "*.list") +fi + # Required environment variables: # $BUILD_ENVIRONMENT (should be set by your Docker image) # Figure out which Python to use for ROCm -if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]] && [[ 
"${BUILD_ENVIRONMENT}" =~ py((2|3)\.?[0-9]?\.?[0-9]?) ]]; then +if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then # HIP_PLATFORM is auto-detected by hipcc; unset to avoid build errors unset HIP_PLATFORM - PYTHON=$(which "python${BASH_REMATCH[1]}") - # non-interactive bashs do not expand aliases by default - shopt -s expand_aliases export PYTORCH_TEST_WITH_ROCM=1 - alias python='$PYTHON' # temporary to locate some kernel issues on the CI nodes export HSAKMT_DEBUG_LEVEL=4 + # improve rccl performance for distributed tests + export HSA_FORCE_FINE_GRAIN_PCIE=1 fi # This token is used by a parser on Jenkins logs for determining @@ -145,7 +150,8 @@ fi # export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} if [[ "${TEST_CONFIG:-}" == *xla* ]] || \ [[ "$BUILD_ENVIRONMENT" == *centos* ]] || \ - [[ "$BUILD_ENVIRONMENT" == *linux-bionic* ]]; then + [[ "$BUILD_ENVIRONMENT" == *linux-bionic* ]] || \ + [[ "$BUILD_ENVIRONMENT" == *linux-focal* ]]; then if ! which conda; then echo "Expected ${BUILD_ENVIRONMENT} to use conda, but 'which conda' returns empty" exit 1 diff --git a/.jenkins/pytorch/common_utils.sh b/.jenkins/pytorch/common_utils.sh index 54bd44d3ccc6..4169f6a2cb8c 100644 --- a/.jenkins/pytorch/common_utils.sh +++ b/.jenkins/pytorch/common_utils.sh @@ -60,19 +60,18 @@ function get_pr_change_files() { set -e } -function file_diff_from_base() { - # The fetch may fail on Docker hosts, this fetch is necessary for GHA - set +e - git fetch origin master --quiet - set -e - git diff --name-only "$(git merge-base origin/master HEAD)" > "$1" -} - function get_bazel() { - # download bazel version - wget https://ossci-linux.s3.amazonaws.com/bazel-4.2.1-linux-x86_64 -O tools/bazel - # verify content - echo '1a4f3a3ce292307bceeb44f459883859c793436d564b95319aacb8af1f20557c tools/bazel' | sha256sum --quiet -c + if [[ $(uname) == "Darwin" ]]; then + # download bazel version + curl https://github.com/bazelbuild/bazel/releases/download/4.2.1/bazel-4.2.1-darwin-x86_64 -Lo tools/bazel + # verify content + echo '74d93848f0c9d592e341e48341c53c87e3cb304a54a2a1ee9cff3df422f0b23c tools/bazel' | shasum -a 256 -c >/dev/null + else + # download bazel version + curl https://ossci-linux.s3.amazonaws.com/bazel-4.2.1-linux-x86_64 -o tools/bazel + # verify content + echo '1a4f3a3ce292307bceeb44f459883859c793436d564b95319aacb8af1f20557c tools/bazel' | shasum -a 256 -c >/dev/null + fi chmod +x tools/bazel } diff --git a/.jenkins/pytorch/macos-build.sh b/.jenkins/pytorch/macos-build.sh index 06e24936c196..ee35efc010c2 100755 --- a/.jenkins/pytorch/macos-build.sh +++ b/.jenkins/pytorch/macos-build.sh @@ -37,7 +37,7 @@ cross_compile_arm64() { } compile_x86_64() { - USE_DISTRIBUTED=1 python setup.py bdist_wheel + USE_DISTRIBUTED=1 USE_NNPACK=OFF python setup.py bdist_wheel } build_lite_interpreter() { diff --git a/.jenkins/pytorch/macos-test.sh b/.jenkins/pytorch/macos-test.sh index 78999637f7f9..858a0c1eab53 100755 --- a/.jenkins/pytorch/macos-test.sh +++ b/.jenkins/pytorch/macos-test.sh @@ -4,13 +4,13 @@ # shellcheck source=./macos-common.sh source "$(dirname "${BASH_SOURCE[0]}")/macos-common.sh" -export PYTORCH_TEST_SKIP_NOARCH=1 - conda install -y six -pip install -q hypothesis "expecttest==0.1.3" "librosa>=0.6.2,<0.9.0" "numba<=0.49.1" psutil "scipy==1.6.3" +pip install -q hypothesis "expecttest==0.1.3" "librosa>=0.6.2" "numba<=0.49.1" psutil "scipy==1.6.3" # TODO move this to docker -pip install unittest-xml-reporting pytest +# Pin unittest-xml-reporting to freeze printing test summary logic, related: 
https://github.com/pytorch/pytorch/issues/69014 +pip install "unittest-xml-reporting<=3.2.0,>=2.0.0" \ + pytest if [ -z "${IN_CI}" ]; then rm -rf "${WORKSPACE_DIR}"/miniconda3/lib/python3.6/site-packages/torch* diff --git a/.jenkins/pytorch/multigpu-test.sh b/.jenkins/pytorch/multigpu-test.sh index 2d119d09a70c..481619a8dc31 100755 --- a/.jenkins/pytorch/multigpu-test.sh +++ b/.jenkins/pytorch/multigpu-test.sh @@ -13,7 +13,8 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh" echo "Testing pytorch (distributed only)" if [ -n "${IN_CI}" ]; then # TODO move this to docker - pip_install unittest-xml-reporting + # Pin unittest-xml-reporting to freeze printing test summary logic, related: https://github.com/pytorch/pytorch/issues/69014 + pip_install "unittest-xml-reporting<=3.2.0,>=2.0.0" fi # Disabling tests to see if they solve timeout issues; see https://github.com/pytorch/pytorch/issues/70015 diff --git a/.jenkins/pytorch/short-perf-test-cpu.sh b/.jenkins/pytorch/short-perf-test-cpu.sh index f2e02b52974c..ff9ef7a84eee 100755 --- a/.jenkins/pytorch/short-perf-test-cpu.sh +++ b/.jenkins/pytorch/short-perf-test-cpu.sh @@ -17,14 +17,15 @@ pip install -q awscli # Set multipart_threshold to be sufficiently high, so that `aws s3 cp` is not a multipart read # More info at https://github.com/aws/aws-cli/issues/2321 aws configure set default.s3.multipart_threshold 5GB +UPSTREAM_DEFAULT_BRANCH="$(git remote show https://github.com/pytorch/pytorch.git | awk '/HEAD branch/ {print $NF}')" -if [[ "$COMMIT_SOURCE" == master ]]; then - # Get current master commit hash - MASTER_COMMIT_ID=$(git log --format="%H" -n 1) - export MASTER_COMMIT_ID +if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then + # Get current default branch commit hash + DEFAULT_BRANCH_COMMIT_ID=$(git log --format="%H" -n 1) + export DEFAULT_BRANCH_COMMIT_ID fi -# Find the master commit to test against +# Find the default branch commit to test against git remote add upstream https://github.com/pytorch/pytorch.git git fetch upstream IFS=$'\n' @@ -33,13 +34,13 @@ while IFS='' read -r commit_id; do LATEST_TESTED_COMMIT=${commit_id} break fi -done < <(git rev-list upstream/master) +done < <(git rev-list upstream/"$UPSTREAM_DEFAULT_BRANCH") aws s3 cp s3://ossci-perf-test/pytorch/cpu_runtime/"${LATEST_TESTED_COMMIT}".json cpu_runtime.json -if [[ "$COMMIT_SOURCE" == master ]]; then +if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then # Prepare new baseline file cp cpu_runtime.json new_cpu_runtime.json - python update_commit_hash.py new_cpu_runtime.json "${MASTER_COMMIT_ID}" + python update_commit_hash.py new_cpu_runtime.json "${DEFAULT_BRANCH_COMMIT_ID}" fi # Include tests @@ -54,7 +55,7 @@ fi # Run tests export TEST_MODE="compare_with_baseline" -if [[ "$COMMIT_SOURCE" == master ]]; then +if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then export TEST_MODE="compare_and_update" fi @@ -66,8 +67,8 @@ run_test test_cpu_speed_torch_tensor ${TEST_MODE} run_test test_cpu_speed_mini_sequence_labeler 20 ${TEST_MODE} run_test test_cpu_speed_mnist 20 ${TEST_MODE} -if [[ "$COMMIT_SOURCE" == master ]]; then - # This could cause race condition if we are testing the same master commit twice, +if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then + # This could cause race condition if we are testing the same default branch commit twice, # but the chance of them executing this line at the same time is low. 
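
The `UPSTREAM_DEFAULT_BRANCH` detection introduced above works because `git remote show <url>` prints a line of the form `HEAD branch: master`, and awk keeps only its last field. A standalone illustration (hypothetical step):

steps:
  - name: Resolve the upstream default branch
    run: |
      # queries the remote, finds the "HEAD branch: ..." line and prints just the branch name
      UPSTREAM_DEFAULT_BRANCH="$(git remote show https://github.com/pytorch/pytorch.git | awk '/HEAD branch/ {print $NF}')"
      echo "default branch is ${UPSTREAM_DEFAULT_BRANCH}"
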
- aws s3 cp new_cpu_runtime.json s3://ossci-perf-test/pytorch/cpu_runtime/"${MASTER_COMMIT_ID}".json --acl public-read + aws s3 cp new_cpu_runtime.json s3://ossci-perf-test/pytorch/cpu_runtime/"${DEFAULT_BRANCH_COMMIT_ID}".json --acl public-read fi diff --git a/.jenkins/pytorch/short-perf-test-gpu.sh b/.jenkins/pytorch/short-perf-test-gpu.sh index 4d8efee8dc20..bde8ca5c9dd3 100755 --- a/.jenkins/pytorch/short-perf-test-gpu.sh +++ b/.jenkins/pytorch/short-perf-test-gpu.sh @@ -17,14 +17,15 @@ pip install -q awscli --ignore-installed PyYAML # Set multipart_threshold to be sufficiently high, so that `aws s3 cp` is not a multipart read # More info at https://github.com/aws/aws-cli/issues/2321 aws configure set default.s3.multipart_threshold 5GB +UPSTREAM_DEFAULT_BRANCH="$(git remote show https://github.com/pytorch/pytorch.git | awk '/HEAD branch/ {print $NF}')" -if [[ "$COMMIT_SOURCE" == master ]]; then - # Get current master commit hash - MASTER_COMMIT_ID=$(git log --format="%H" -n 1) - export MASTER_COMMIT_ID +if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then + # Get current default branch commit hash + DEFAULT_BRANCH_COMMIT_ID=$(git log --format="%H" -n 1) + export DEFAULT_BRANCH_COMMIT_ID fi -# Find the master commit to test against +# Find the default branch commit to test against git remote add upstream https://github.com/pytorch/pytorch.git git fetch upstream IFS=$'\n' @@ -33,13 +34,13 @@ while IFS='' read -r commit_id; do LATEST_TESTED_COMMIT=${commit_id} break fi -done < <(git rev-list upstream/master) +done < <(git rev-list upstream/"$UPSTREAM_DEFAULT_BRANCH") aws s3 cp s3://ossci-perf-test/pytorch/gpu_runtime/"${LATEST_TESTED_COMMIT}".json gpu_runtime.json -if [[ "$COMMIT_SOURCE" == master ]]; then +if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then # Prepare new baseline file cp gpu_runtime.json new_gpu_runtime.json - python update_commit_hash.py new_gpu_runtime.json "${MASTER_COMMIT_ID}" + python update_commit_hash.py new_gpu_runtime.json "${DEFAULT_BRANCH_COMMIT_ID}" fi # Include tests @@ -55,7 +56,7 @@ fi . ./test_gpu_speed_mlstm.sh # Run tests -if [[ "$COMMIT_SOURCE" == master ]]; then +if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then run_test test_gpu_speed_mnist 20 compare_and_update run_test test_gpu_speed_word_language_model 20 compare_and_update run_test test_gpu_speed_cudnn_lstm 20 compare_and_update @@ -69,10 +70,10 @@ else run_test test_gpu_speed_mlstm 20 compare_with_baseline fi -if [[ "$COMMIT_SOURCE" == master ]]; then - # This could cause race condition if we are testing the same master commit twice, +if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then + # This could cause race condition if we are testing the same default branch commit twice, # but the chance of them executing this line at the same time is low. 
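
The perf scripts above walk `git rev-list upstream/<default branch>` newest-first until they find a commit whose baseline JSON is already stored in S3. A hedged sketch of that lookup; the `aws s3 ls` existence test is an assumption for illustration, not necessarily the check the real script performs:

steps:
  - name: Locate the newest upstream commit with a stored baseline
    run: |
      # upstream/master stands in for the detected default branch
      for commit_id in $(git rev-list upstream/master); do
        # a zero exit status from 'aws s3 ls' means the baseline object exists
        if aws s3 ls "s3://ossci-perf-test/pytorch/gpu_runtime/${commit_id}.json" >/dev/null 2>&1; then
          LATEST_TESTED_COMMIT=${commit_id}
          break
        fi
      done
      echo "comparing against baseline from ${LATEST_TESTED_COMMIT}"
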
- aws s3 cp new_gpu_runtime.json s3://ossci-perf-test/pytorch/gpu_runtime/"${MASTER_COMMIT_ID}".json --acl public-read + aws s3 cp new_gpu_runtime.json s3://ossci-perf-test/pytorch/gpu_runtime/"${DEFAULT_BRANCH_COMMIT_ID}".json --acl public-read fi popd diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 6544b0f2693d..75234f2ff446 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -40,6 +40,11 @@ PR_NUMBER=${PR_NUMBER:-${CIRCLE_PR_NUMBER:-}} if [[ $TEST_CONFIG == 'default' ]]; then export CUDA_VISIBLE_DEVICES=0 + export HIP_VISIBLE_DEVICES=0 +fi + +if [[ $TEST_CONFIG == 'distributed' ]] && [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then + export HIP_VISIBLE_DEVICES=0,1 fi if [[ "$BUILD_ENVIRONMENT" == *-slow-* || $TEST_CONFIG == 'slow' ]]; then @@ -48,11 +53,11 @@ if [[ "$BUILD_ENVIRONMENT" == *-slow-* || $TEST_CONFIG == 'slow' ]]; then fi if [[ "$BUILD_ENVIRONMENT" == *slow-gradcheck* ]]; then - export PYTORCH_TEST_WITH_SLOW_GRADCHECK=ON + export PYTORCH_TEST_WITH_SLOW_GRADCHECK=1 fi -if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then - # Used so that only cuda specific versions of tests are generated +if [[ "$BUILD_ENVIRONMENT" == *cuda* || "$BUILD_ENVIRONMENT" == *rocm* ]]; then + # Used so that only cuda/rocm specific versions of tests are generated # mainly used so that we're not spending extra cycles testing cpu # devices on expensive gpu machines export PYTORCH_TESTING_DEVICE_ONLY_FOR="cuda" @@ -62,10 +67,8 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then export BUILD_SPLIT_CUDA=ON fi -if [[ "$BUILD_ENVIRONMENT" == *noarch* ]]; then - export PYTORCH_TEST_SKIP_NOARCH=0 -else - export PYTORCH_TEST_SKIP_NOARCH=1 +if [[ "$BUILD_ENVIRONMENT" == *crossref* ]]; then + export PYTORCH_TEST_WITH_CROSSREF=1 fi if [[ -n "$PR_NUMBER" ]] && [[ -z "$CI_MASTER" || "$CI_MASTER" == "false" ]]; then @@ -77,6 +80,7 @@ fi if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then # Print GPU info + rocminfo rocminfo | grep -E 'Name:.*\sgfx|Marketing' # Manually set NUM_TEST_SHARDS since Jenkins doesn't do it @@ -100,7 +104,7 @@ fi # ASAN test is not working if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then # Suppress vptr violations arising from multiple copies of pybind11 - export ASAN_OPTIONS=detect_leaks=0:symbolize=1:strict_init_order=true:detect_odr_violation=0 + export ASAN_OPTIONS=detect_leaks=0:symbolize=1:detect_stack_use_after_return=1:strict_init_order=true:detect_odr_violation=0 export UBSAN_OPTIONS=print_stacktrace=1:suppressions=$PWD/ubsan.supp export PYTORCH_TEST_WITH_ASAN=1 export PYTORCH_TEST_WITH_UBSAN=1 @@ -274,6 +278,14 @@ test_libtorch() { else "$TORCH_BIN_DIR"/test_jit --gtest_filter='-*CUDA' --gtest_output=xml:$TEST_REPORTS_DIR/test_jit.xml fi + + # Run Lazy Tensor cpp tests + if [[ "$BUILD_ENVIRONMENT" == *cuda* && "$BUILD_ENVIRONMENT" != *nogpu* ]]; then + LTC_TS_CUDA=1 "$TORCH_BIN_DIR"/test_lazy --gtest_output=xml:$TEST_REPORTS_DIR/test_lazy.xml + else + "$TORCH_BIN_DIR"/test_lazy --gtest_output=xml:$TEST_REPORTS_DIR/test_lazy.xml + fi + python test/cpp/jit/tests_setup.py shutdown # Wait for background download to finish wait @@ -306,16 +318,16 @@ test_vulkan() { # test reporting process (in print_test_stats.py) to function as expected. 
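
Several of the test helpers changed above drive googletest binaries directly; the two flags that matter for CI are `--gtest_filter`, where a leading `-` excludes matching tests, and `--gtest_output=xml:`, which writes a JUnit-style report for the stats tooling. An illustrative invocation (the binary path is a placeholder for $TORCH_BIN_DIR):

steps:
  - name: Run a C++ test binary without CUDA cases
    run: |
      mkdir -p test/test-reports/cpp-example
      # everything matching *CUDA is skipped; results land in an XML file the reporting tooling can parse
      ./build/bin/test_jit --gtest_filter='-*CUDA' --gtest_output=xml:test/test-reports/cpp-example/test_jit.xml
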
TEST_REPORTS_DIR=test/test-reports/cpp-vulkan/test_vulkan mkdir -p $TEST_REPORTS_DIR - "$TORCH_TEST_DIR"/vulkan_test --gtest_output=xml:$TEST_REPORTS_DIR/vulkan_test.xml + "$TORCH_TEST_DIR"/vulkan_api_test --gtest_output=xml:$TEST_REPORTS_DIR/vulkan_test.xml fi } test_distributed() { echo "Testing distributed python tests" - time python test/run_test.py --distributed-tests --verbose + time python test/run_test.py --distributed-tests --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose assert_git_not_dirty - if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then + if [[ "$BUILD_ENVIRONMENT" == *cuda* && "$SHARD_NUMBER" == 1 ]]; then echo "Testing distributed C++ tests" ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_BIN_DIR" ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR" @@ -355,7 +367,7 @@ test_rpc() { } test_custom_backend() { - if [[ "$BUILD_ENVIRONMENT" != *rocm* ]] && [[ "$BUILD_ENVIRONMENT" != *asan* ]] ; then + if [[ "$BUILD_ENVIRONMENT" != *asan* ]] ; then echo "Testing custom backends" CUSTOM_BACKEND_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/custom-backend-build" pushd test/custom_backend @@ -372,7 +384,7 @@ test_custom_backend() { } test_custom_script_ops() { - if [[ "$BUILD_ENVIRONMENT" != *rocm* ]] && [[ "$BUILD_ENVIRONMENT" != *asan* ]] ; then + if [[ "$BUILD_ENVIRONMENT" != *asan* ]] ; then echo "Testing custom script operators" CUSTOM_OP_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/custom-op-build" pushd test/custom_operator @@ -388,7 +400,7 @@ test_custom_script_ops() { } test_jit_hooks() { - if [[ "$BUILD_ENVIRONMENT" != *rocm* ]] && [[ "$BUILD_ENVIRONMENT" != *asan* ]] ; then + if [[ "$BUILD_ENVIRONMENT" != *asan* ]] ; then echo "Testing jit hooks in cpp" HOOK_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/jit-hook-build" pushd test/jit_hooks @@ -441,6 +453,8 @@ test_xla() { # nightly version. test_forward_backward_compatibility() { set -x + # create a dummy ts model at this version + python test/create_dummy_torchscript_model.py /tmp/model_new.pt pushd test/forward_backward_compatibility python -m venv venv # shellcheck disable=SC1091 @@ -448,10 +462,21 @@ test_forward_backward_compatibility() { pip_install --pre torch -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html pip show torch python dump_all_function_schemas.py --filename nightly_schemas.txt + # FC: verify newmodel can be load with old code. + if ! python ../load_torchscript_model.py /tmp/model_new.pt; then + echo "FC check failed: new model cannot be load in old code" + return 1 + fi + python ../create_dummy_torchscript_model.py /tmp/model_old.pt deactivate rm -r venv pip show torch python check_forward_backward_compatibility.py --existing-schemas nightly_schemas.txt + # BC: verify old model can be load with new code + if ! python ../load_torchscript_model.py /tmp/model_old.pt; then + echo "BC check failed: old model cannot be load in new code" + return 1 + fi popd set +x assert_git_not_dirty @@ -518,7 +543,7 @@ test_torch_deploy() { ln -sf "$TORCH_LIB_DIR"/libshm* "$TORCH_BIN_DIR" ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR" "$TORCH_BIN_DIR"/test_deploy - "$TORCH_BIN_DIR"/test_api --gtest_filter='IMethodTest.*' + "$TORCH_BIN_DIR"/test_deploy_gpu assert_git_not_dirty } @@ -530,8 +555,9 @@ if ! 
[[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-baze (cd test && python -c "import torch; print(torch.__config__.show())") (cd test && python -c "import torch; print(torch.__config__.parallel_info())") fi - -if [[ "${BUILD_ENVIRONMENT}" == *backward* ]]; then +if [[ "${BUILD_ENVIRONMENT}" == *deploy* ]]; then + test_torch_deploy +elif [[ "${BUILD_ENVIRONMENT}" == *backward* ]]; then test_forward_backward_compatibility # Do NOT add tests after bc check tests, see its comment. elif [[ "${TEST_CONFIG}" == *xla* ]]; then @@ -543,15 +569,18 @@ elif [[ "${BUILD_ENVIRONMENT}" == *jit_legacy-test || "${JOB_BASE_NAME}" == *jit elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then # TODO: run some C++ tests echo "no-op at the moment" -elif [[ "${BUILD_ENVIRONMENT}" == *-test1 || "${JOB_BASE_NAME}" == *-test1 || ("${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1) ]]; then - if [[ "${BUILD_ENVIRONMENT}" == *linux-xenial-cuda11.1*-test1* ]]; then - test_torch_deploy +elif [[ "${BUILD_ENVIRONMENT}" == *distributed* || "${JOB_BASE_NAME}" == *distributed* ]]; then + test_distributed + # Only run RPC C++ tests on the first shard + if [[ "${SHARD_NUMBER}" == 1 ]]; then + test_rpc fi +elif [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then test_without_numpy install_torchvision test_python_shard 1 test_aten -elif [[ "${BUILD_ENVIRONMENT}" == *-test2 || "${JOB_BASE_NAME}" == *-test2 || ("${SHARD_NUMBER}" == 2 && $NUM_TEST_SHARDS -gt 1) ]]; then +elif [[ "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHARDS -gt 1 ]]; then install_torchvision test_python_shard 2 test_libtorch @@ -563,12 +592,12 @@ elif [[ "${SHARD_NUMBER}" -gt 2 ]]; then # Handle arbitrary number of shards test_python_shard "$SHARD_NUMBER" elif [[ "${BUILD_ENVIRONMENT}" == *vulkan* ]]; then - test_vulkan + # TODO: re-enable vulkan test + echo "no-op at the moment" elif [[ "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then test_bazel -elif [[ "${BUILD_ENVIRONMENT}" == *distributed* || "${JOB_BASE_NAME}" == *distributed* ]]; then - test_distributed - test_rpc +elif [[ "${BUILD_ENVIRONMENT}" == *-mobile-lightweight-dispatch* ]]; then + test_libtorch elif [[ "${TEST_CONFIG}" = docs_test ]]; then test_docs_test else diff --git a/.jenkins/pytorch/win-test-helpers/build_pytorch.bat b/.jenkins/pytorch/win-test-helpers/build_pytorch.bat index 4954dcf4f451..c3650856d478 100644 --- a/.jenkins/pytorch/win-test-helpers/build_pytorch.bat +++ b/.jenkins/pytorch/win-test-helpers/build_pytorch.bat @@ -28,10 +28,7 @@ call %INSTALLER_DIR%\install_sccache.bat if errorlevel 1 exit /b if not errorlevel 0 exit /b -call :retry %INSTALLER_DIR%\install_miniconda3.bat - -:retry -call %* || (powershell -nop -c "& {sleep 1}" && call %*) || (powershell -nop -c "& {sleep 2}" && call %*) +call %INSTALLER_DIR%\install_miniconda3.bat if errorlevel 1 exit /b if not errorlevel 0 exit /b @@ -92,6 +89,7 @@ if "%TORCH_CUDA_ARCH_LIST%" == "" set TORCH_CUDA_ARCH_LIST=5.2 :: The default sccache idle timeout is 600, which is too short and leads to intermittent build errors. set SCCACHE_IDLE_TIMEOUT=0 +set SCCACHE_IGNORE_SERVER_IO_ERROR=1 sccache --stop-server sccache --start-server sccache --zero-stats @@ -143,7 +141,7 @@ python setup.py install --cmake && sccache --show-stats && ( if "%BUILD_ENVIRONMENT%"=="" ( echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3` in Command Prompt before running Git Bash. 
) else ( - 7z a %TMP_DIR_WIN%\%IMAGE_COMMIT_TAG%.7z %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\caffe2 && copy /Y "%TMP_DIR_WIN%\%IMAGE_COMMIT_TAG%.7z" "%PYTORCH_FINAL_PACKAGE_DIR%\" + 7z a %TMP_DIR_WIN%\%IMAGE_COMMIT_TAG%.7z %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torchgen %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\caffe2 && copy /Y "%TMP_DIR_WIN%\%IMAGE_COMMIT_TAG%.7z" "%PYTORCH_FINAL_PACKAGE_DIR%\" if errorlevel 1 exit /b if not errorlevel 0 exit /b @@ -157,4 +155,5 @@ python setup.py install --cmake && sccache --show-stats && ( sccache --show-stats > stats.txt python -m tools.stats.upload_sccache_stats stats.txt +sccache --stop-server rm stats.txt diff --git a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat index 20b3b4db4c02..657848631245 100644 --- a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat +++ b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat @@ -22,7 +22,7 @@ if "%INSTALL_FRESH_CONDA%"=="1" ( call conda install -y -q python=%PYTHON_VERSION% numpy cffi pyyaml boto3 libuv if errorlevel 1 exit /b if not errorlevel 0 exit /b - call conda install -y -q -c conda-forge cmake + call conda install -y -q -c conda-forge cmake=3.22.3 if errorlevel 1 exit /b if not errorlevel 0 exit /b ) diff --git a/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat b/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat index b738b4e70b74..c7f3e1b6a614 100644 --- a/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat +++ b/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat @@ -34,7 +34,9 @@ popd :: The version is fixed to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136 ======= -pip install "ninja==1.10.0.post1" future "hypothesis==4.53.2" "expecttest==0.1.3" "librosa>=0.6.2,<0.9.0" psutil pillow unittest-xml-reporting pytest +:: Pin unittest-xml-reporting to freeze printing test summary logic, related: https://github.com/pytorch/pytorch/issues/69014 + +pip install "ninja==1.10.0.post1" future "hypothesis==4.53.2" "expecttest==0.1.3" "librosa>=0.6.2" "scipy==1.6.3" psutil pillow "unittest-xml-reporting<=3.2.0,>=2.0.0" pytest if errorlevel 1 exit /b if not errorlevel 0 exit /b diff --git a/.jenkins/pytorch/win-test-helpers/test_python.bat b/.jenkins/pytorch/win-test-helpers/test_python.bat deleted file mode 100644 index 2de7ac4c3bcd..000000000000 --- a/.jenkins/pytorch/win-test-helpers/test_python.bat +++ /dev/null @@ -1,20 +0,0 @@ -call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat -:: exit the batch once there's an error -if not errorlevel 0 ( - echo "setup pytorch env failed" - echo %errorlevel% - exit /b -) - -pushd test -if "%RUN_SMOKE_TESTS_ONLY%"=="1" ( - :: Download specified test cases to run - curl --retry 3 -k https://raw.githubusercontent.com/pytorch/test-infra/main/stats/windows_smoke_tests.csv --output .pytorch_specified_test_cases.csv - if ERRORLEVEL 1 exit /b 1 - - python run_test.py --exclude-jit-executor --verbose --run-specified-test-cases -) else ( - python run_test.py --exclude-jit-executor --verbose -) -popd -if ERRORLEVEL 1 exit /b 1 diff --git a/.jenkins/pytorch/win-test-helpers/test_python_first_shard.bat b/.jenkins/pytorch/win-test-helpers/test_python_first_shard.bat deleted file mode 100644 index 181259df7e35..000000000000 --- 
a/.jenkins/pytorch/win-test-helpers/test_python_first_shard.bat +++ /dev/null @@ -1,44 +0,0 @@ -call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat -:: exit the batch once there's an error -if not errorlevel 0 ( - echo "setup pytorch env failed" - echo %errorlevel% - exit /b -) - -pushd test - -set GFLAGS_EXE="C:\Program Files (x86)\Windows Kits\10\Debuggers\x64\gflags.exe" -if exist %GFLAGS_EXE% ( - echo Some smoke tests - %GFLAGS_EXE% /i python.exe +sls - python %SCRIPT_HELPERS_DIR%\run_python_nn_smoketests.py - if ERRORLEVEL 1 goto fail - - %GFLAGS_EXE% /i python.exe -sls - if ERRORLEVEL 1 goto fail -) - -echo Copying over test times file -copy /Y "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.pytorch-test-times.json" "%TEST_DIR_WIN%" - -echo Run nn tests - -if "%RUN_SMOKE_TESTS_ONLY%"=="1" ( - :: Download specified test cases to run - curl --retry 3 -k https://raw.githubusercontent.com/pytorch/test-infra/main/stats/windows_smoke_tests.csv --output .pytorch_specified_test_cases.csv - if ERRORLEVEL 1 goto fail - - python run_test.py --exclude-jit-executor --shard 1 2 --verbose --run-specified-test-cases -) else ( - python run_test.py --exclude-jit-executor --shard 1 2 --verbose -) -if ERRORLEVEL 1 goto fail - -popd - -:eof -exit /b 0 - -:fail -exit /b 1 diff --git a/.jenkins/pytorch/win-test-helpers/test_python_second_shard.bat b/.jenkins/pytorch/win-test-helpers/test_python_second_shard.bat deleted file mode 100644 index 56d115f64df7..000000000000 --- a/.jenkins/pytorch/win-test-helpers/test_python_second_shard.bat +++ /dev/null @@ -1,26 +0,0 @@ -call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat -:: exit the batch once there's an error -if not errorlevel 0 ( - echo "setup pytorch env failed" - echo %errorlevel% - exit /b -) - -echo Copying over test times file -copy /Y "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.pytorch-test-times.json" "%TEST_DIR_WIN%" - -pushd test - -if "%RUN_SMOKE_TESTS_ONLY%"=="1" ( - :: Download specified test cases to run - curl --retry 3 -k https://raw.githubusercontent.com/pytorch/test-infra/main/stats/windows_smoke_tests.csv --output .pytorch_specified_test_cases.csv - if ERRORLEVEL 1 exit /b 1 - - python run_test.py --exclude-jit-executor --shard 2 2 --verbose --run-specified-test-cases -) else ( - python run_test.py --exclude-jit-executor --shard 2 2 --verbose -) - -popd - -if ERRORLEVEL 1 exit /b 1 diff --git a/.jenkins/pytorch/win-test-helpers/test_python_shard.bat b/.jenkins/pytorch/win-test-helpers/test_python_shard.bat new file mode 100644 index 000000000000..ccc615f67f31 --- /dev/null +++ b/.jenkins/pytorch/win-test-helpers/test_python_shard.bat @@ -0,0 +1,37 @@ +call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat +:: exit the batch once there's an error +if not errorlevel 0 ( + echo "setup pytorch env failed" + echo %errorlevel% + exit /b +) + +pushd test + +set GFLAGS_EXE="C:\Program Files (x86)\Windows Kits\10\Debuggers\x64\gflags.exe" +if "%SHARD_NUMBER%" == "1" ( + if exist %GFLAGS_EXE% ( + echo Some smoke tests + %GFLAGS_EXE% /i python.exe +sls + python %SCRIPT_HELPERS_DIR%\run_python_nn_smoketests.py + if ERRORLEVEL 1 goto fail + + %GFLAGS_EXE% /i python.exe -sls + if ERRORLEVEL 1 goto fail + ) +) + +echo Copying over test times file +copy /Y "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.pytorch-test-times.json" "%TEST_DIR_WIN%" + +echo Run nn tests +python run_test.py --exclude-jit-executor --exclude-distributed-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose +if ERRORLEVEL 1 goto fail + +popd + +:eof +exit /b 0 + +:fail +exit /b 1 diff --git a/.jenkins/pytorch/win-test.sh 
b/.jenkins/pytorch/win-test.sh index 51c5700db0b8..7312ca7f23c6 100755 --- a/.jenkins/pytorch/win-test.sh +++ b/.jenkins/pytorch/win-test.sh @@ -26,7 +26,6 @@ export TEST_DIR_WIN export PYTORCH_FINAL_PACKAGE_DIR="${PYTORCH_FINAL_PACKAGE_DIR:-/c/users/circleci/workspace/build-results}" PYTORCH_FINAL_PACKAGE_DIR_WIN=$(cygpath -w "${PYTORCH_FINAL_PACKAGE_DIR}") export PYTORCH_FINAL_PACKAGE_DIR_WIN -export PYTORCH_TEST_SKIP_NOARCH=1 mkdir -p "$TMP_DIR"/build/torch @@ -49,8 +48,13 @@ fi if [[ "$TEST_CONFIG" = "force_on_cpu" ]]; then # run the full test suite for force_on_cpu test export USE_CUDA=0 -elif [[ "$TEST_CONFIG" == "smoke_tests" ]]; then - export RUN_SMOKE_TESTS_ONLY=1 +fi + +if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then + # Used so that only cuda/rocm specific versions of tests are generated + # mainly used so that we're not spending extra cycles testing cpu + # devices on expensive gpu machines + export PYTORCH_TESTING_DEVICE_ONLY_FOR="cuda" fi run_tests() { @@ -62,32 +66,20 @@ run_tests() { fi done + "$SCRIPT_HELPERS_DIR"/test_python_shard.bat if [[ ( -z "${JOB_BASE_NAME}" || "${JOB_BASE_NAME}" == *-test ) && $NUM_TEST_SHARDS -eq 1 ]]; then - "$SCRIPT_HELPERS_DIR"/test_python.bat - - if [[ -z ${RUN_SMOKE_TESTS_ONLY} ]]; then - "$SCRIPT_HELPERS_DIR"/test_custom_script_ops.bat - "$SCRIPT_HELPERS_DIR"/test_custom_backend.bat - "$SCRIPT_HELPERS_DIR"/test_libtorch.bat - fi + "$SCRIPT_HELPERS_DIR"/test_custom_script_ops.bat + "$SCRIPT_HELPERS_DIR"/test_custom_backend.bat + "$SCRIPT_HELPERS_DIR"/test_libtorch.bat else - if [[ "${JOB_BASE_NAME}" == *-test1 || ("${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1) ]]; then - "$SCRIPT_HELPERS_DIR"/test_python_first_shard.bat - - if [[ -z ${RUN_SMOKE_TESTS_ONLY} ]]; then - "$SCRIPT_HELPERS_DIR"/test_libtorch.bat - if [[ "${USE_CUDA}" == "1" ]]; then - "$SCRIPT_HELPERS_DIR"/test_python_jit_legacy.bat - fi - fi - - elif [[ "${JOB_BASE_NAME}" == *-test2 || ("${SHARD_NUMBER}" == 2 && $NUM_TEST_SHARDS -gt 1) ]]; then - "$SCRIPT_HELPERS_DIR"/test_python_second_shard.bat - - if [[ -z ${RUN_SMOKE_TESTS_ONLY} ]]; then - "$SCRIPT_HELPERS_DIR"/test_custom_backend.bat - "$SCRIPT_HELPERS_DIR"/test_custom_script_ops.bat + if [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then + "$SCRIPT_HELPERS_DIR"/test_libtorch.bat + if [[ "${USE_CUDA}" == "1" ]]; then + "$SCRIPT_HELPERS_DIR"/test_python_jit_legacy.bat fi + elif [[ "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHARDS -gt 1 ]]; then + "$SCRIPT_HELPERS_DIR"/test_custom_backend.bat + "$SCRIPT_HELPERS_DIR"/test_custom_script_ops.bat fi fi } diff --git a/.lintrunner.toml b/.lintrunner.toml index 7126745fca1a..295c516bc30b 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -20,7 +20,6 @@ exclude_patterns = [ command = [ 'python3', 'tools/linter/adapters/flake8_linter.py', - '--binary=flake8', '--', '@{{PATHSFILE}}' ] @@ -53,6 +52,9 @@ include_patterns = [ 'test/cpp/tensorexpr/**/*.h', 'test/cpp/tensorexpr/**/*.cpp', ] +exclude_patterns = [ + 'torch/csrc/jit/serialization/mobile_bytecode_generated.h', +] init_command = [ 'python3', 'tools/linter/adapters/s3_init.py', @@ -69,18 +71,21 @@ command = [ '--', '@{{PATHSFILE}}' ] +is_formatter = true [[linter]] code = 'MYPY' include_patterns = [ 'torch/**/*.py', + 'torch/**/*.pyi', 'caffe2/**/*.py', + 'caffe2/**/*.pyi', 'test/test_bundled_images.py', 'test/test_bundled_inputs.py', 'test/test_complex.py', 'test/test_datapipe.py', 'test/test_futures.py', - 'test/test_numpy_interop.py', + # 'test/test_numpy_interop.py', 'test/test_torch.py', 'test/test_type_hints.py', 
'test/test_type_info.py', @@ -90,11 +95,21 @@ exclude_patterns = [ 'torch/include/**', 'torch/csrc/**', 'torch/distributed/elastic/agent/server/api.py', + 'torch/testing/_internal/**', + 'torch/distributed/fsdp/fully_sharded_data_parallel.py', + # TODO(suo): these exclusions were added just to get lint clean on master. + # Follow up to do more target suppressions and remove them. + 'torch/distributed/fsdp/flatten_params_wrapper.py', + 'torch/ao/quantization/fx/convert.py', + 'torch/ao/quantization/_dbr/function_fusion.py', + 'test/test_datapipe.py', + 'caffe2/contrib/fakelowp/test/test_batchmatmul_nnpi_fp16.py', + 'test/test_numpy_interop.py', + 'torch/torch_version.py', ] command = [ 'python3', 'tools/linter/adapters/mypy_linter.py', - '--binary=mypy', '--config=mypy.ini', '--', '@{{PATHSFILE}}' @@ -105,9 +120,17 @@ init_command = [ '--dry-run={{DRYRUN}}', 'numpy==1.20', 'expecttest==0.1.3', - 'mypy==0.812', + 'mypy==0.950', + 'types-requests==2.27.25', + 'types-six==1.16.15', + 'types-PyYAML==6.0.7', + 'types-tabulate==0.8.8', + 'types-protobuf==3.19.18', + 'types-pkg-resources==0.1.3', + 'types-Jinja2==2.11.9', 'junitparser==2.1.1', 'rich==10.9.0', + 'pyyaml==6.0', ] [[linter]] @@ -121,10 +144,13 @@ include_patterns = [ 'torch/utils/benchmark/utils/timer.py', 'torch/utils/benchmark/utils/valgrind_wrapper/**/*.py', ] +exclude_patterns = [ + # (linbinyu) copied from internal repo + 'tools/code_analyzer/gen_operators_yaml.py', +] command = [ 'python3', 'tools/linter/adapters/mypy_linter.py', - '--binary=mypy', '--config=mypy-strict.ini', '--', '@{{PATHSFILE}}' @@ -133,11 +159,12 @@ command = [ [[linter]] code = 'CLANGTIDY' include_patterns = [ + 'torch/csrc/deploy/**/*.cpp', 'torch/csrc/fx/**/*.cpp', - 'torch/csrc/utils/**/*.cpp', 'torch/csrc/generic/**/*.cpp', - 'torch/csrc/deploy/**/*.cpp', + 'torch/csrc/onnx/**/*.cpp', 'torch/csrc/tensor/**/*.cpp', + 'torch/csrc/utils/**/*.cpp', ] exclude_patterns = [ # The negative filters below are to exclude files that include onnx_pb.h or @@ -191,7 +218,7 @@ exclude_patterns = ['test/test_jit.py'] command = [ 'python3', 'tools/linter/adapters/grep_linter.py', - '--pattern=# type:\s*ignore(?!\[)', + '--pattern=# type:\s*ignore([^\[]|$)', '--linter-name=TYPEIGNORE', '--error-name=unqualified type: ignore', """--error-description=\ @@ -209,7 +236,7 @@ exclude_patterns = ['caffe2/**'] command = [ 'python3', 'tools/linter/adapters/grep_linter.py', - '--pattern=# noqa(?!: [A-Z]+\d{3})', + '--pattern=# noqa([^:]|$)', '--linter-name=NOQA', '--error-name=unqualified noqa', """--error-description=\ @@ -245,6 +272,7 @@ init_command = [ '--dry-run={{DRYRUN}}', 'ruamel.yaml==0.17.4', ] +is_formatter = true [[linter]] code = 'NEWLINE' @@ -254,7 +282,10 @@ exclude_patterns=[ 'third_party/**', '**/*.expect', '**/*.ipynb', + '**/*.ptl', 'tools/clang_format_hash/**', + 'test/cpp/jit/upgrader_models/*.ptl', + 'test/cpp/jit/upgrader_models/*.ptl.ff', ] command = [ 'python3', @@ -262,6 +293,7 @@ command = [ '--', '@{{PATHSFILE}}', ] +is_formatter = true [[linter]] code = 'SPACES' @@ -270,6 +302,8 @@ exclude_patterns = [ '**/contrib/**', '**/*.diff', 'third_party/**', + 'test/cpp/jit/upgrader_models/*.ptl', + 'test/cpp/jit/upgrader_models/*.ptl.ff', ] command = [ 'python3', @@ -295,11 +329,14 @@ exclude_patterns = [ 'third_party/**', '**/.gitattributes', '**/.gitmodules', + 'test/cpp/jit/upgrader_models/*.ptl', + 'test/cpp/jit/upgrader_models/*.ptl.ff', + '.lintrunner.toml', ] command = [ 'python3', 'tools/linter/adapters/grep_linter.py', - '--pattern=\t', + '--pattern= 
', '--linter-name=TABS', '--error-name=saw some tabs', '--replace-pattern=s/\t/ /', @@ -319,6 +356,7 @@ include_patterns = [ ] exclude_patterns = [ 'aten/src/ATen/native/quantized/cpu/qnnpack/**', + 'torch/csrc/jit/serialization/mobile_bytecode_generated.h', ] command = [ 'python3', @@ -348,7 +386,7 @@ command = [ 'tools/linter/adapters/grep_linter.py', """--pattern=\ (pip|pip3|python -m pip|python3 -m pip|python3 -mpip|python -mpip) \ - install ([a-z][\\.a-z-0-9]*+(?!(=|.*\\.whl))([[:blank:]]|))+\ + install ([a-zA-Z0-9][A-Za-z0-9\\._\\-]+)([^/=<>~!]+)[A-Za-z0-9\\._\\-\\*\\+\\!]*$\ """, '--linter-name=PYPIDEP', '--error-name=unpinned PyPI install', @@ -455,3 +493,87 @@ init_command = [ '--dry-run={{DRYRUN}}', 'cmakelint==1.4.1', ] + +[[linter]] +code = 'SHELLCHECK' +include_patterns = [ + '.jenkins/pytorch/**/*.sh' +] +command = [ + 'python3', + 'tools/linter/adapters/shellcheck_linter.py', + '--', + '@{{PATHSFILE}}', +] +init_command = [ + 'python3', + 'tools/linter/adapters/pip_init.py', + '--dry-run={{DRYRUN}}', + 'shellcheck-py==0.7.2.1', +] + +[[linter]] +code = 'ACTIONLINT' +include_patterns = [ + '.github/workflows/*.yml', + '.github/workflows/*.yaml', + # actionlint does not support composite actions yet + # '.github/actions/**/*.yml', + # '.github/actions/**/*.yaml', +] +command = [ + 'python3', + 'tools/linter/adapters/actionlint_linter.py', + '--binary=.lintbin/actionlint', + '--', + '@{{PATHSFILE}}', +] +init_command = [ + 'python3', + 'tools/linter/adapters/s3_init.py', + '--config-json=tools/linter/adapters/s3_init_config.json', + '--linter=actionlint', + '--dry-run={{DRYRUN}}', + '--output-dir=.lintbin', + '--output-name=actionlint', +] + +[[linter]] +code = 'TESTOWNERS' +include_patterns = [ + 'test/**/test_*.py', + 'test/**/*_test.py', +] +exclude_patterns = [ + 'test/run_test.py', +] +command = [ + 'python3', + 'tools/linter/adapters/testowners_linter.py', + '--', + '@{{PATHSFILE}}', +] + +[[linter]] +code = 'BLACK' +include_patterns = [ + 'torchgen/**/*.py', + 'tools/**/*.py', + 'torch/onnx/**/*.py', + 'torch/_refs/**/*.py', + 'torch/_prims/**/*.py', + 'test/onnx/**/*.py', +] +command = [ + 'python3', + 'tools/linter/adapters/black_linter.py', + '--', + '@{{PATHSFILE}}' +] +init_command = [ + 'python3', + 'tools/linter/adapters/pip_init.py', + '--dry-run={{DRYRUN}}', + 'black==22.3.0', +] +is_formatter = true diff --git a/BUILD.bazel b/BUILD.bazel index 6590a7b1c3c4..d373a84f64d9 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -3,10 +3,14 @@ load("@pybind11_bazel//:build_defs.bzl", "pybind_extension") load("@rules_proto//proto:defs.bzl", "proto_library") load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_proto_library", "cc_test") load("//third_party:substitution.bzl", "header_template_rule") -load("//:tools/build_variables.bzl", "jit_core_sources", "libtorch_core_sources", "libtorch_cuda_sources", "libtorch_distributed_sources", "libtorch_extra_sources", "libtorch_nvfuser_generated_headers", "libtorch_nvfuser_runtime_sources", "libtorch_python_core_sources", "torch_cpp_srcs") +load("//:tools/bazel.bzl", "rules") +load("//:tools/build_variables.bzl", "jit_core_sources", "libtorch_core_sources", "libtorch_cuda_sources", "libtorch_distributed_sources", "libtorch_extra_sources", "libtorch_nvfuser_generated_headers", "libtorch_nvfuser_runtime_sources", "libtorch_python_core_sources", "torch_cpp_srcs", "lazy_tensor_ts_sources") load("//tools/rules:cu.bzl", "cu_library") load("//tools/config:defs.bzl", "if_cuda") -load("//:aten.bzl", "intern_build_aten_ops", 
"generate_aten") +load("//:aten.bzl", "intern_build_aten_ops", "generate_aten", "aten_ufunc_generated_cpu_sources", "aten_ufunc_generated_cpu_kernel_sources", "aten_ufunc_generated_cuda_sources") +load(":build.bzl", "define_targets", "GENERATED_AUTOGRAD_CPP", "GENERATED_AUTOGRAD_PYTHON") + +define_targets(rules = rules) COMMON_COPTS = [ "-DHAVE_MALLOC_USABLE_SIZE=1", @@ -25,17 +29,7 @@ COMMON_COPTS = [ "-DUSE_CUDNN", ]) -# TODO: refactor this into its own library (but how to make -# a binary based off of a module in a library?) -py_binary( - name = "gen", - srcs = ["tools/setup_helpers/gen.py"], - deps = [ - ":tools_codegen" - ], -) - -aten_generation_srcs = ["aten/src/ATen/native/native_functions.yaml"] + glob(["aten/src/ATen/templates/**"]) +aten_generation_srcs = ["aten/src/ATen/native/native_functions.yaml"] + ["aten/src/ATen/native/tags.yaml"] + glob(["aten/src/ATen/templates/**"]) generated_cpu_cpp = [ "aten/src/ATen/RegisterBackendSelect.cpp", @@ -46,6 +40,7 @@ generated_cpu_cpp = [ "aten/src/ATen/RegisterFunctionalization_3.cpp", # "aten/src/ATen/RegisterFunctionalizationEverything.cpp", "aten/src/ATen/RegisterMkldnnCPU.cpp", + "aten/src/ATen/RegisterNestedTensorCPU.cpp", "aten/src/ATen/RegisterQuantizedCPU.cpp", "aten/src/ATen/RegisterSparseCPU.cpp", "aten/src/ATen/RegisterSparseCsrCPU.cpp", @@ -60,6 +55,7 @@ generated_cpu_cpp = [ "aten/src/ATen/CompositeExplicitAutogradFunctions_inl.h", "aten/src/ATen/CompositeImplicitAutogradFunctions.h", "aten/src/ATen/CompositeImplicitAutogradFunctions_inl.h", + "aten/src/ATen/CompositeViewCopyKernels.cpp", "aten/src/ATen/FunctionalInverses.h", "aten/src/ATen/Functions.h", "aten/src/ATen/Functions.cpp", @@ -86,6 +82,7 @@ generated_cuda_cpp = [ "aten/src/ATen/CUDAFunctions.h", "aten/src/ATen/CUDAFunctions_inl.h", "aten/src/ATen/RegisterCUDA.cpp", + "aten/src/ATen/RegisterNestedTensorCUDA.cpp", "aten/src/ATen/RegisterQuantizedCUDA.cpp", "aten/src/ATen/RegisterSparseCUDA.cpp", "aten/src/ATen/RegisterSparseCsrCUDA.cpp", @@ -94,103 +91,21 @@ generated_cuda_cpp = [ generate_aten( name = "generated_aten_cpp", srcs = aten_generation_srcs, - outs = generated_cpu_cpp + generated_cuda_cpp + [ - "aten/src/ATen/Declarations.yaml", - ], - generator=":gen", -) - -py_library( - name = "tools_codegen", - srcs = glob(["tools/codegen/**/*.py"]), -) - -py_library( - name = "tools_autograd", - srcs = glob(["tools/autograd/*.py"]), - data = glob([ - "tools/autograd/*.yaml", - "tools/autograd/templates/*", - ]), - deps = [":tools_codegen"], -) - -py_library( - name = "tools_jit", - srcs = glob(["tools/jit/*.py"]), - data = glob(["tools/jit/templates/*"]), -) - -py_binary( - name = "generate_code", - srcs = ["tools/setup_helpers/generate_code.py"], - deps = [ - ":tools_autograd", - ":tools_jit", - ], -) - -libtorch_cpp_generated_sources = [ - "torch/csrc/autograd/generated/VariableType.h", - "torch/csrc/autograd/generated/VariableType_0.cpp", - "torch/csrc/autograd/generated/VariableType_1.cpp", - "torch/csrc/autograd/generated/VariableType_2.cpp", - "torch/csrc/autograd/generated/VariableType_3.cpp", - "torch/csrc/autograd/generated/VariableType_4.cpp", - # "torch/csrc/autograd/generated/VariableTypeEverything.cpp", - "torch/csrc/autograd/generated/TraceType_0.cpp", - "torch/csrc/autograd/generated/TraceType_1.cpp", - "torch/csrc/autograd/generated/TraceType_2.cpp", - "torch/csrc/autograd/generated/TraceType_3.cpp", - "torch/csrc/autograd/generated/TraceType_4.cpp", - # "torch/csrc/autograd/generated/TraceTypeEverything.cpp", - 
"torch/csrc/autograd/generated/ADInplaceOrViewType_0.cpp", - "torch/csrc/autograd/generated/ADInplaceOrViewType_1.cpp", - # "torch/csrc/autograd/generated/ADInplaceOrViewTypeEverything.cpp", - "torch/csrc/autograd/generated/Functions.h", - "torch/csrc/autograd/generated/Functions.cpp", - "torch/csrc/autograd/generated/variable_factories.h", -] - -libtorch_python_generated_sources = [ - "torch/csrc/autograd/generated/python_functions.h", - "torch/csrc/autograd/generated/python_functions_0.cpp", - "torch/csrc/autograd/generated/python_functions_1.cpp", - "torch/csrc/autograd/generated/python_functions_2.cpp", - "torch/csrc/autograd/generated/python_functions_3.cpp", - "torch/csrc/autograd/generated/python_functions_4.cpp", - "torch/csrc/autograd/generated/python_variable_methods.cpp", - "torch/csrc/autograd/generated/python_torch_functions_0.cpp", - "torch/csrc/autograd/generated/python_torch_functions_1.cpp", - "torch/csrc/autograd/generated/python_torch_functions_2.cpp", - "torch/csrc/autograd/generated/python_nn_functions.cpp", - "torch/csrc/autograd/generated/python_fft_functions.cpp", - "torch/csrc/autograd/generated/python_linalg_functions.cpp", - "torch/csrc/autograd/generated/python_sparse_functions.cpp", - "torch/csrc/autograd/generated/python_special_functions.cpp", - "torch/csrc/autograd/generated/python_return_types.cpp", -] - -genrule( - name = "all_generated_code", - srcs = [ - "aten/src/ATen/native/native_functions.yaml", - ], - outs = libtorch_cpp_generated_sources + libtorch_python_generated_sources, - cmd = "$(location :generate_code) --install_dir `dirname $(location torch/csrc/autograd/generated/variable_factories.h)`/../.. --native-functions-path $(location aten/src/ATen/native/native_functions.yaml) --nn-path aten/src", - tools = [":generate_code"], + outs = ( + generated_cpu_cpp + + generated_cuda_cpp + + aten_ufunc_generated_cpu_sources("aten/src/ATen/{}") + + aten_ufunc_generated_cpu_kernel_sources("aten/src/ATen/{}") + + aten_ufunc_generated_cuda_sources("aten/src/ATen/{}") + + ["aten/src/ATen/Declarations.yaml"] + ), + generator = "//torchgen:gen", ) filegroup( name = "cpp_generated_code", - data = [":all_generated_code"], - srcs = libtorch_cpp_generated_sources, -) - -filegroup( - name = "python_generated_code", - data = [":all_generated_code"], - srcs = libtorch_python_generated_sources, + data = [":generate-code"], + srcs = GENERATED_AUTOGRAD_CPP, ) exports_files( @@ -229,6 +144,11 @@ filegroup( srcs = glob(["aten/src/ATen/native/sparse/*.cpp"]), ) +filegroup( + name = "aten_native_nested_cpp", + srcs = glob(["aten/src/ATen/native/nested/*.cpp"]), +) + filegroup( name = "aten_native_quantized_cpp", srcs = glob( @@ -239,6 +159,11 @@ filegroup( ), ) +filegroup( + name = "aten_native_transformers_cpp", + srcs = glob(["aten/src/ATen/native/transformers/*.cpp"]), +) + filegroup( name = "aten_native_mkl_cpp", srcs = glob(["aten/src/ATen/native/mkl/*.cpp", "aten/src/ATen/mkl/*.cpp"]), @@ -287,7 +212,10 @@ filegroup( "aten/src/ATen/native/cuda/linalg/*.cpp", "aten/src/ATen/native/cudnn/*.cpp", "aten/src/ATen/native/miopen/*.cpp", + "aten/src/ATen/native/nested/cuda/*.cpp", + "aten/src/ATen/native/quantized/cudnn/*.cpp", "aten/src/ATen/native/sparse/cuda/*.cpp", + "aten/src/ATen/native/transformers/cuda/*.cpp", "aten/src/THC/*.cpp", ], ), @@ -299,9 +227,13 @@ filegroup( "aten/src/ATen/cuda/*.cu", "aten/src/ATen/cuda/detail/*.cu", "aten/src/ATen/native/cuda/*.cu", + "aten/src/ATen/native/nested/cuda/*.cu", "aten/src/ATen/native/quantized/cuda/*.cu", 
"aten/src/ATen/native/sparse/cuda/*.cu", - ]), + "aten/src/ATen/native/transformers/cuda/*.cu", + ]) + aten_ufunc_generated_cuda_sources("aten/src/ATen/{}"), + # It's a bit puzzling to me why it's not necessary to declare the + # target that generates these sources... ) header_template_rule( @@ -383,6 +315,7 @@ intern_build_aten_ops( "@fbgemm", "@mkl", ], + extra_impls = aten_ufunc_generated_cpu_kernel_sources("aten/src/ATen/{}"), ) cc_library( @@ -398,9 +331,11 @@ cc_library( ":aten_native_mkldnn_cpp", ":aten_native_quantized_cpp", ":aten_native_sparse_cpp", + ":aten_native_nested_cpp", + ":aten_native_transformers_cpp", ":aten_native_xnnpack", ":aten_src_ATen_config", - ] + generated_cpu_cpp, + ] + generated_cpu_cpp + aten_ufunc_generated_cpu_sources("aten/src/ATen/{}"), copts = ATEN_COPTS, data = if_cuda( [":libcaffe2_nvrtc.so"], @@ -1354,7 +1289,7 @@ cc_library( py_binary( name = "gen_op", srcs = ["caffe2/contrib/aten/gen_op.py"], - deps = [":tools_codegen"], + deps = ["//torchgen"], ) genrule( @@ -1622,19 +1557,6 @@ cc_library( ) # torch -py_binary( - name = "gen_version_header", - srcs = ["tools/setup_helpers/gen_version_header.py"], -) - -genrule( - name = "version_h", - srcs = ["torch/csrc/api/include/torch/version.h.in", "version.txt"], - outs = ["torch/csrc/api/include/torch/version.h"], - cmd = "$(location :gen_version_header) --template-path $(location torch/csrc/api/include/torch/version.h.in) --version-path $(location version.txt) --output-path $@", - tools = [':gen_version_header'], -) - py_binary( name = "stringify_file", srcs = ["torch/csrc/jit/codegen/cuda/tools/stringify_file.py"], @@ -1673,7 +1595,7 @@ cc_library( "torch/csrc/autograd/generated/variable_factories.h", "torch/csrc/autograd/generated/Functions.h", ] + torch_cuda_headers, - ) + [":cpp_generated_code", ":version_h"], + ) + GENERATED_AUTOGRAD_CPP + [":version_h"], includes = [ "torch/csrc", "torch/csrc/api/include", @@ -1718,8 +1640,7 @@ cc_library( "torch/csrc/cuda/nccl.cpp", "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu", ], - )) + libtorch_core_sources + libtorch_distributed_sources + torch_cpp_srcs + libtorch_extra_sources + jit_core_sources + [ - ":cpp_generated_code", + )) + libtorch_core_sources + libtorch_distributed_sources + torch_cpp_srcs + libtorch_extra_sources + jit_core_sources + lazy_tensor_ts_sources + GENERATED_AUTOGRAD_CPP + [ "torch/csrc/jit/serialization/flatbuffer_serializer.cpp", "torch/csrc/jit/mobile/flatbuffer_loader.cpp" ], @@ -1752,7 +1673,10 @@ cc_library( "**/*.h", "**/*.cuh", ]) + [ - ":cpp_generated_code", + # We need the filegroup here because the raw list causes Bazel + # to see duplicate files. It knows how to deduplicate with the + # filegroup. 
+ ":cpp_generated_code" ], includes = [ "torch/csrc/api/include", @@ -1768,7 +1692,7 @@ cc_library( cc_library( name = "torch_python", - srcs = libtorch_python_core_sources + [":python_generated_code"], + srcs = libtorch_python_core_sources + GENERATED_AUTOGRAD_PYTHON, deps = [ ":torch", ":shm", @@ -1880,6 +1804,9 @@ cc_test( "test/cpp/jit/*.h", "test/cpp/tensorexpr/*.cpp", "test/cpp/tensorexpr/*.h", + ], exclude=[ + # skip this since is not found in OSS build + "test/cpp/jit/test_exception.cpp", ]), linkstatic = True, tags = [ @@ -1898,6 +1825,11 @@ cc_test( srcs = glob([ "test/cpp/lazy/*.cpp", "test/cpp/lazy/*.h", + ], exclude=[ + # skip these since they depend on generated LazyIr.h which isn't available in bazel yet + "test/cpp/lazy/test_ir.cpp", + "test/cpp/lazy/test_lazy_ops.cpp", + "test/cpp/lazy/test_lazy_ops_util.cpp", ]), linkstatic = True, tags = [ @@ -1919,3 +1851,25 @@ test_suite( "//c10/test:tests", ], ) + +# An internal genrule that we are converging with refers to these file +# as if they are from this package, so we alias them for +# compatibility. + +[ + alias( + name = paths.basename(path), + actual = path, + ) + for path in [ + "aten/src/ATen/templates/DispatchKeyNativeFunctions.cpp", + "aten/src/ATen/templates/DispatchKeyNativeFunctions.h", + "aten/src/ATen/templates/LazyIr.h", + "aten/src/ATen/templates/RegisterDispatchKey.cpp", + "aten/src/ATen/native/native_functions.yaml", + "aten/src/ATen/native/tags.yaml", + "aten/src/ATen/native/ts_native_functions.yaml", + "torch/csrc/lazy/core/shape_inference.h", + "torch/csrc/lazy/ts_backend/ts_native_functions.cpp", + ] +] diff --git a/BUILD.buck b/BUILD.buck new file mode 100644 index 000000000000..ad8caff6ec4d --- /dev/null +++ b/BUILD.buck @@ -0,0 +1,620 @@ +load("//tools/build_defs:glob_defs.bzl", "subdir_glob") +load("//tools/build_defs:fb_xplat_genrule.bzl", "fb_xplat_genrule") +load( + "//tools:build_variables.bzl", + "aten_cpu_source_list", + "aten_native_source_list", + "core_sources_common", + "jit_core_headers", + "jit_core_sources", + "libtorch_profiler_sources", +) +load( + "//:pt_defs.oss.bzl", + "USED_PT_BACKENDS", + "build_aten_cpu", + "gen_aten_files", + "gen_aten_libtorch_files", + "get_aten_codegen_extra_params", + "get_pt_compiler_flags", + "get_pt_preprocessor_flags", + "pt_operator_library", + "get_pt_ops_deps", + "aten_ufunc_generated_all_cpu_sources", + "TEMPLATE_SOURCE_LIST", +) + +cxx_library( + name = "pthreadpool", + srcs = ['caffe2/utils/threadpool/pthreadpool.cc', 'caffe2/utils/threadpool/pthreadpool_impl.cc', 'caffe2/utils/threadpool/pthreadpool-cpp.cc', 'caffe2/utils/threadpool/thread_pool_guard.cpp', 'caffe2/utils/threadpool/ThreadPool.cc'], + deps = [':caffe2_headers', '//third_party:cpuinfo', '//third_party:glog', '//c10:c10', '//third_party:FXdiv'], + exported_deps = ['//third_party:pthreadpool'], + compiler_flags = ['-Wno-unused-function'], + preferred_linkage = "static", + exported_headers = subdir_glob([("", "caffe2/utils/threadpool/*.h")]), + exported_preprocessor_flags = ['-DUSE_PTHREADPOOL'], + header_namespace = "", + headers = [], + link_whole = True, + platform_preprocessor_flags = [['windows', ['-D_WINDOWS', '-D_WIN32', '-DWIN32', '-DNOMINMAX', '-D_CRT_SECURE_NO_WARNINGS', '-D_USE_MATH_DEFINES']], ['windows.*64$', ['-D_WIN64']]], + visibility = ['PUBLIC'], +) + +cxx_library( + name = "caffe2_headers", + deps = ['//c10:c10'], + exported_headers = subdir_glob( + [ + ("", "caffe2/**/*.h"), + ("", "binaries/**/*.h"), + ("modules", "**/*.h"), + ("aten/src", "ATen/core/**/*.h"), + ], + 
exclude = [ + "caffe2/fb/**/*.h", + "caffe2/mobile/contrib/libopencl-stub/**/*.h", + "caffe2/mobile/contrib/libvulkan-stub/**/*.h", + "caffe2/mobile/contrib/nnapi/**/*.h", + "caffe2/mobile/fb/binary/**/*.h", + "caffe2/mobile/fb/snpe_so/**/*.h", + "caffe2/mobile/fb/boltnn/bolt_lib/include/**/*.h", + "caffe2/mobile/contrib/snpe/**/*.h", + "caffe2/mobile/fb/qpl/jni/QuickPerformanceLogger.h", + "caffe2/share/fb/x3d/ldi/*.h", + "**/*.pb.h", + ], + ), + compiler_flags = ['-Os', '-fexceptions', '-frtti', '-Wno-shadow', '-Wno-unknown-pragmas', '-Wno-unused-variable', '-Wno-sign-compare', '-Icaffe2', '-Imodules', '-DEIGEN_NO_DEBUG', '-DCAFFE2_USE_LITE_PROTO', '-DCAFFE2_USE_GOOGLE_GLOG', '-DCAFFE2_RNN_NO_TEXT_FORMAT', '-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK=1', '-DCAFFE2_IS_XPLAT_BUILD', '-DSTRIP_ERROR_MESSAGES', '-DUSE_INTERNAL_PTHREADPOOL_IMPL', '-DCAFFE2_USE_HPTT'], + preferred_linkage = "static", + platform_preprocessor_flags = [['windows', ['-D_WINDOWS', '-D_WIN32', '-DWIN32', '-DNOMINMAX', '-D_CRT_SECURE_NO_WARNINGS', '-D_USE_MATH_DEFINES']], ['windows.*64$', ['-D_WIN64']]], + preprocessor_flags = ['-DUSE_INTERNAL_PTHREADPOOL_IMPL'], + visibility = ['PUBLIC'], +) + +cxx_library( + name = "common_core", + srcs = ['caffe2/core/common.cc'], + deps = [':caffe2_headers', '//c10:c10'], + exported_deps = [], + compiler_flags = ['-frtti', '-Os', '-Wno-unknown-pragmas', '-Wno-write-strings', '-Wno-unused-variable', '-Wno-unused-function', '-Wno-deprecated-declarations', '-Wno-shadow', '-Wno-global-constructors', '-Wno-missing-prototypes', '-std=gnu++17'], + preferred_linkage = "static", + header_namespace = "caffe2", + headers = [], + link_whole = True, + platform_preprocessor_flags = [['windows', ['-D_WINDOWS', '-D_WIN32', '-DWIN32', '-DNOMINMAX', '-D_CRT_SECURE_NO_WARNINGS', '-D_USE_MATH_DEFINES']], ['windows.*64$', ['-D_WIN64']]], + visibility = ['PUBLIC'], +) + +cxx_library( + name = "th_header", + header_namespace = "", + exported_headers = subdir_glob([ + # TH + ("aten/src", "TH/*.h"), + ("aten/src", "TH/*.hpp"), + ("aten/src", "TH/generic/*.h"), + ("aten/src", "TH/generic/*.hpp"), + ("aten/src", "TH/generic/simd/*.h"), + ("aten/src", "TH/vector/*.h"), + ("aten/src", "TH/generic/*.c"), + ("aten/src", "TH/generic/*.cpp"), + ("aten/src/TH", "*.h"), # for #include + # THNN + ("aten/src", "THNN/*.h"), + ("aten/src", "THNN/generic/*.h"), + ("aten/src", "THNN/generic/*.c"), + ]), +) + +cxx_library( + name = "aten_header", + header_namespace = "", + exported_headers = subdir_glob([ + # ATen Core + ("aten/src", "ATen/core/**/*.h"), + ("aten/src", "ATen/ops/*.h"), + # ATen Base + ("aten/src", "ATen/*.h"), + ("aten/src", "ATen/cpu/**/*.h"), + ("aten/src", "ATen/detail/*.h"), + ("aten/src", "ATen/quantized/*.h"), + ("aten/src", "ATen/vulkan/*.h"), + ("aten/src", "ATen/metal/*.h"), + ("aten/src", "ATen/mps/*.h"), + ("aten/src", "ATen/nnapi/*.h"), + # ATen Native + ("aten/src", "ATen/native/*.h"), + ("aten/src", "ATen/native/ao_sparse/quantized/cpu/*.h"), + ("aten/src", "ATen/native/cpu/**/*.h"), + ("aten/src", "ATen/native/sparse/*.h"), + ("aten/src", "ATen/native/mps/*.h"), + ("aten/src", "ATen/native/nested/*.h"), + ("aten/src", "ATen/native/quantized/*.h"), + ("aten/src", "ATen/native/quantized/cpu/*.h"), + ("aten/src", "ATen/native/transformers/*.h"), + ("aten/src", "ATen/native/ufunc/*.h"), + ("aten/src", "ATen/native/utils/*.h"), + ("aten/src", "ATen/native/vulkan/ops/*.h"), + ("aten/src", "ATen/native/xnnpack/*.h"), + # Remove the following after modifying codegen for mobile. 
+ ("aten/src", "ATen/mkl/*.h"), + ("aten/src", "ATen/native/mkl/*.h"), + ("aten/src", "ATen/native/mkldnn/*.h"), + ], exclude = ["aten/src/ATen/Config.h"]), + visibility = ["PUBLIC"], +) + +cxx_library( + name = "jit_core_headers", + header_namespace = "", + exported_headers = subdir_glob([("", x) for x in jit_core_headers]), +) + +cxx_library( + name = "generated_aten_config_header", + header_namespace = "ATen", + exported_headers = { + "Config.h": ":generate_aten_config[Config.h]", + }, +) + +cxx_library( + name = "torch_mobile_headers", + header_namespace = "", + exported_headers = subdir_glob( + [ + ("", "torch/csrc/jit/mobile/*.h"), + ], + ), + visibility = ["PUBLIC"], +) + +fb_xplat_genrule( + name = "generate_aten_config", + srcs = [ + "aten/src/ATen/Config.h.in", + ], + cmd = " ".join([ + "sed", + "-e 's/@AT_MKLDNN_ENABLED@/ATEN_MKLDNN_ENABLED_FBXPLAT/g'", + "-e 's/@AT_MKL_ENABLED@/ATEN_MKL_ENABLED_FBXPLAT/g'", + "-e 's/@AT_MKL_SEQUENTIAL@/ATEN_MKL_SEQUENTIAL_FBXPLAT/g'", + "-e 's/@AT_FFTW_ENABLED@/0/g'", + "-e 's/@AT_POCKETFFT_ENABLED@/0/g'", + "-e 's/@AT_NNPACK_ENABLED@/ATEN_NNPACK_ENABLED_FBXPLAT/g'", + "-e 's/@CAFFE2_STATIC_LINK_CUDA_INT@/CAFFE2_STATIC_LINK_CUDA_FBXPLAT/g'", + "-e 's/@AT_BUILD_WITH_BLAS@/USE_BLAS_FBXPLAT/g'", + "-e 's/@AT_PARALLEL_OPENMP@/AT_PARALLEL_OPENMP_FBXPLAT/g'", + "-e 's/@AT_PARALLEL_NATIVE@/AT_PARALLEL_NATIVE_FBXPLAT/g'", + "-e 's/@AT_PARALLEL_NATIVE_TBB@/AT_PARALLEL_NATIVE_TBB_FBXPLAT/g'", + "-e 's/@AT_BUILD_WITH_LAPACK@/USE_LAPACK_FBXPLAT/g'", + "-e 's/@AT_BLAS_F2C@/AT_BLAS_F2C_FBXPLAT/g'", + "-e 's/@AT_BLAS_USE_CBLAS_DOT@/AT_BLAS_USE_CBLAS_DOT_FBXPLAT/g'", + "aten/src/ATen/Config.h.in > $OUT/Config.h" + ]), + outs = { + "Config.h": ["Config.h"], + }, + default_outs = ["."], +) + +gen_aten_files( + name = "gen_aten", + extra_flags = get_aten_codegen_extra_params(USED_PT_BACKENDS), + visibility = ["PUBLIC"], +) + +ATEN_EXPORTED_HEADERS = { + "CPUFunctions.h": ":gen_aten[CPUFunctions.h]", + "CPUFunctions_inl.h": ":gen_aten[CPUFunctions_inl.h]", + "CompositeExplicitAutogradFunctions.h": ":gen_aten[CompositeExplicitAutogradFunctions.h]", + "CompositeExplicitAutogradFunctions_inl.h": ":gen_aten[CompositeExplicitAutogradFunctions_inl.h]", + "CompositeImplicitAutogradFunctions.h": ":gen_aten[CompositeImplicitAutogradFunctions.h]", + "CompositeImplicitAutogradFunctions_inl.h": ":gen_aten[CompositeImplicitAutogradFunctions_inl.h]", + "FunctionalInverses.h": ":gen_aten[FunctionalInverses.h]", + "Functions.h": ":gen_aten[Functions.h]", + "MethodOperators.h": ":gen_aten[MethodOperators.h]", + "NativeFunctions.h": ":gen_aten[NativeFunctions.h]", + "NativeMetaFunctions.h": ":gen_aten[NativeMetaFunctions.h]", + "Operators.h": ":gen_aten[Operators.h]", + "RedispatchFunctions.h": ":gen_aten[RedispatchFunctions.h]", + "core/TensorBody.h": ":gen_aten[core/TensorBody.h]", + "core/aten_interned_strings.h": ":gen_aten[core/aten_interned_strings.h]", +} + +cxx_library( + name = "generated_aten_headers_cpu", + header_namespace = "ATen", + exported_headers = ATEN_EXPORTED_HEADERS, +) + +filegroup( + name = "aten_src_path", + srcs = [ + "aten/src/ATen/native/native_functions.yaml", + "aten/src/ATen/native/tags.yaml", + ] + glob(["aten/src/ATen/templates/*"]), + visibility = [ + "PUBLIC", + ], +) + +build_aten_cpu( + name = "aten_cpu", + srcs = jit_core_sources + + aten_cpu_source_list + [ + # Generated + ":gen_aten[Functions.cpp]", + ":gen_aten[Operators_0.cpp]", + ":gen_aten[Operators_1.cpp]", + ":gen_aten[Operators_2.cpp]", + ":gen_aten[Operators_3.cpp]", + 
":gen_aten[Operators_4.cpp]", + ":gen_aten[core/ATenOpList.cpp]", + ":gen_aten[core/TensorMethods.cpp]", + ] + [ + # Needed by ATen/native/EmbeddingBag.cpp + "caffe2/perfkernels/embedding_lookup_idx.cc", + ], +) + +gen_aten_libtorch_files(name = "gen_aten_libtorch") + + +GENERATED_AUTOGRAD_H = { + "Functions.h": ":gen_aten_libtorch[autograd/generated/Functions.h]", + "VariableType.h": ":gen_aten_libtorch[autograd/generated/VariableType.h]", + "variable_factories.h": ":gen_aten_libtorch[autograd/generated/variable_factories.h]", + + # Don't build python bindings on mobile. + #"python_functions.h", +} + +cxx_library( + name = "generated-autograd-headers", + header_namespace = "torch/csrc/autograd/generated", + exported_headers = GENERATED_AUTOGRAD_H, + visibility = ["PUBLIC"], +) + +cxx_library( + name = "torch_mobile_observer", + srcs = [ + "torch/csrc/jit/mobile/observer.cpp", + #"torch/fb/observers/MobileObserverUtil.cpp", + ], + header_namespace = "", + exported_headers = subdir_glob( + [ + ("", "torch/csrc/jit/mobile/observer.h"), + #("", "torch/fb/observers/ObserverUtil.h"), + #("", "torch/fb/observers/MobileObserverUtil.h"), + ], + ), + visibility = ["PUBLIC"], + deps = [ + "//c10:c10", + ], +) + +python_library( + name = "aten_code_template", + srcs = subdir_glob([ + ("aten", "src/ATen/code_template.py"), + ]), + base_module = "", + visibility = ["PUBLIC"], +) + +fb_xplat_genrule( + name = "generate-version-header", + srcs = [ + "torch/csrc/api/include/torch/version.h.in", + "version.txt", + ], + cmd = "$(exe //tools/setup_helpers:gen-version-header) " + " ".join([ + "--template-path", + "torch/csrc/api/include/torch/version.h.in", + "--version-path", + "version.txt", + "--output-path", + "$OUT/version.h", + ]), + outs = { + "version.h": ["version.h"], + }, + default_outs = ["."], +) + +cxx_library( + name = "generated-version-header", + header_namespace = "torch", + exported_headers = { + "version.h": ":generate-version-header[version.h]", + }, +) + +cxx_library( + name = "torch_headers", + header_namespace = "", + exported_headers = subdir_glob( + [ + ("torch/csrc/api/include", "torch/**/*.h"), + ("", "torch/csrc/**/*.h"), + ("", "torch/csrc/generic/*.cpp"), + ("", "torch/script.h"), + ("", "torch/library.h"), + ("", "torch/custom_class.h"), + ("", "torch/custom_class_detail.h"), + # Add again due to namespace difference from aten_header. + ("", "aten/src/ATen/*.h"), + ("", "aten/src/ATen/quantized/*.h"), + ], + exclude = [ + # Don't need on mobile. 
+ "torch/csrc/Exceptions.h", + "torch/csrc/python_headers.h", + "torch/csrc/utils/auto_gil.h", + "torch/csrc/jit/serialization/mobile_bytecode_generated.h", + "torch/csrc/api/include/torch/version.h", + ], + ), + visibility = ["PUBLIC"], + deps = [ + ":generated-version-header", + ], +) + + +cxx_library( + name = "torch_common", + srcs = core_sources_common, + compiler_flags = get_pt_compiler_flags(), + exported_preprocessor_flags = get_pt_preprocessor_flags(), + link_whole = True, + visibility = ["PUBLIC"], + deps = [ + ":aten_cpu", + ":generated-autograd-headers", + ":torch_headers", + "//third_party:glog", + "//c10:c10", + ], +) + + +cxx_library( + name = "torch_mobile_deserialize_common", + srcs = [ + "torch/csrc/jit/mobile/parse_bytecode.cpp", + "torch/csrc/jit/mobile/parse_operators.cpp", + "torch/csrc/jit/mobile/upgrader_mobile.cpp", + "torch/csrc/jit/serialization/import_read.cpp", + "torch/csrc/jit/serialization/unpickler.cpp", + ], + header_namespace = "", + exported_headers = [ + "torch/csrc/jit/serialization/import_read.h", + "torch/csrc/jit/serialization/unpickler.h", + ], + compiler_flags = get_pt_compiler_flags(), + link_whole = True, + linker_flags = [ + "-Wl,--no-as-needed", + ], + visibility = ["PUBLIC"], + exported_deps = [ + ":aten_cpu", + ":caffe2_headers", + ":caffe2_serialize", + ":torch_common", + ":torch_headers", + ":torch_mobile_headers", + ":torch_mobile_module", + ":torch_mobile_observer", + "//third_party:glog", + "//c10:c10", + ], +) + +cxx_library( + name = "caffe2_serialize", + srcs = [ + "caffe2/serialize/file_adapter.cc", + "caffe2/serialize/inline_container.cc", + "caffe2/serialize/istream_adapter.cc", + "caffe2/serialize/read_adapter_interface.cc", + ], + visibility = ["PUBLIC"], + deps = [ + ":caffe2_headers", + "//third_party:glog", + "//c10:c10", + "//third_party:miniz", + ], +) + +cxx_library( + name = "torch_mobile_deserialize", + srcs = [ + "torch/csrc/jit/mobile/import.cpp", + ], + header_namespace = "", + exported_headers = [ + "torch/csrc/jit/mobile/import.h", + ], + compiler_flags = get_pt_compiler_flags(), + link_whole = True, + linker_flags = [ + "-Wl,--no-as-needed", + ], + visibility = ["PUBLIC"], + exported_deps = [ + ":aten_cpu", + ":caffe2_headers", + ":caffe2_serialize", + ":torch_common", + ":torch_headers", + ":torch_mobile_headers", + ":torch_mobile_module", + ":torch_mobile_observer", + "//third_party:glog", + "//c10:c10", + ":torch_mobile_deserialize_common", + ], +) + +cxx_library( + name = "torch_mobile_module", + srcs = [ + "torch/csrc/jit/mobile/function.cpp", + "torch/csrc/jit/mobile/interpreter.cpp", + "torch/csrc/jit/mobile/module.cpp", + ], + header_namespace = "", + exported_headers = [], + compiler_flags = get_pt_compiler_flags(), + link_whole = True, + linker_flags = [ + "-Wl,--no-as-needed", + ], + visibility = ["PUBLIC"], + exported_deps = [ + ":aten_cpu", + ":caffe2_headers", + ":torch_common", + ":torch_headers", + ":torch_mobile_headers", + ":torch_mobile_observer", + "//third_party:glog", + "//c10:c10", + ], +) + +cxx_library( + name = "torch_mobile_core", + srcs = [], + header_namespace = "", + exported_headers = [], + compiler_flags = get_pt_compiler_flags(), + exported_preprocessor_flags = get_pt_preprocessor_flags(), + link_whole = True, + linker_flags = [ + "-Wl,--no-as-needed", + # "-ldl", + ], + visibility = ["PUBLIC"], + deps = [ + ":generated-autograd-headers", + ":torch_mobile_observer", + ":torch_mobile_headers", + ], + exported_deps = [ + ":aten_cpu", + ":torch_common", + ":torch_mobile_deserialize", + 
], +) + +pt_operator_library( + name = "torch_mobile_ops_full_dev", + check_decl = False, + include_all_operators = True, +) + +cxx_library( + name = "torch_mobile_all_ops", + visibility = ["PUBLIC"], + deps = get_pt_ops_deps( + name = "pt_ops_full", + train = False, + deps = [ + ":torch_mobile_ops_full_dev", + ], + enable_flatbuffer = False, + ), +) + +python_library( + name = "gen_oplist_lib", + srcs = subdir_glob([ + ("tools/code_analyzer", "gen_oplist.py"), + ("tools/code_analyzer", "gen_op_registration_allowlist.py"), + ]), + base_module = "", + deps = [ + "//third_party:pyyaml", + "//tools/lite_interpreter:gen_selected_mobile_ops_header", + "//torchgen:torchgen", + ], +) + +python_binary( + name = "gen_oplist", + main_module = "gen_oplist", + visibility = ["PUBLIC"], + deps = [ + ":gen_oplist_lib", + ], +) + +python_library( + name = "gen_operators_yaml_lib", + srcs = subdir_glob([ + ("tools/code_analyzer", "gen_operators_yaml.py"), + ("tools/code_analyzer", "gen_op_registration_allowlist.py"), + ]), + base_module = "", + deps = [ + "//third_party:pyyaml", + "//torchgen:torchgen", + ], +) + +python_binary( + name = "gen_operators_yaml", + main_module = "gen_operators_yaml", + visibility = ["PUBLIC"], + deps = [ + ":gen_operators_yaml_lib", + ], +) + +cxx_binary( + name = 'ptmobile_benchmark', + srcs = [ + 'binaries/speed_benchmark_torch.cc', + ], + compiler_flags = [ + "-fexceptions", + "-frtti", + "-Wno-deprecated-declarations", + ], + preprocessor_flags = [ + "-DBUILD_LITE_INTERPRETER", + ], + platform_linker_flags = [ + ( + "^linux.*$", + [ + "-Wl,--no-as-needed", + "-ldl", + "-pthread", + ], + ), + ], + deps = [ + ":torch_mobile_core", + ":torch_mobile_all_ops", + "//c10:c10", + ], +) + +filegroup( + name = "templated_selective_build_srcs", + # NB: no glob here, there are generated targets in this list! 
+ srcs = glob(TEMPLATE_SOURCE_LIST) + aten_ufunc_generated_all_cpu_sources(":gen_aten[{}]"), + visibility = [ + "PUBLIC", + ], +) diff --git a/CMakeLists.txt b/CMakeLists.txt index a0ddb61781ea..eb0ce9c882e4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -94,30 +94,38 @@ if(APPLE) # RPATH stuff set(CMAKE_MACOSX_RPATH ON) if(NOT IOS) - # Determine if we can link against ML Compute - set(MLCOMPUTE_FOUND OFF) + # Determine if we can link against MPSGraph + set(MPS_FOUND OFF) execute_process( - COMMAND bash -c "xcrun --sdk macosx --show-sdk-path" - OUTPUT_VARIABLE _macosx_sdk_path + COMMAND bash -c "xcodebuild -sdk macosx -version SDKVersion" + RESULT_VARIABLE _exit_code + OUTPUT_VARIABLE _macosx_sdk_version OUTPUT_STRIP_TRAILING_WHITESPACE) - - set(_SDK_SEARCH_PATH "${_macosx_sdk_path}/System/Library/Frameworks/") - set(_FRAMEWORK_SEARCH_PATH "/System/Library/Frameworks/") - - find_library(_MLCompute_fwrk_path_ NAMES MLCompute PATHS ${_FRAMEWORK_SEARCH_PATH} NO_DEFAULT_PATH) - find_library(_MLCompute_sdk_path_ NAMES MLCompute PATHS ${_SDK_SEARCH_PATH} NO_DEFAULT_PATH) - - if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/mlc) - set(_MLC_FOLDER_EXISTS YES) + if(_exit_code EQUAL 0) + set(_MPS_supported_os_version OFF) + if(_macosx_sdk_version VERSION_GREATER_EQUAL 12.3) + set(_MPS_supported_os_version ON) + endif() + message(STATUS "sdk version: ${_macosx_sdk_version}, mps supported: ${_MPS_supported_os_version}") + execute_process( + COMMAND bash -c "xcrun --sdk macosx --show-sdk-path" + OUTPUT_VARIABLE _macosx_sdk_path + OUTPUT_STRIP_TRAILING_WHITESPACE) + set(_SDK_SEARCH_PATH "${_macosx_sdk_path}/System/Library/Frameworks/") + set(_FRAMEWORK_SEARCH_PATH "/System/Library/Frameworks/") + + find_library(_MPS_fwrk_path_ NAMES MetalPerformanceShadersGraph MetalPerformanceShaders PATHS ${_FRAMEWORK_SEARCH_PATH} NO_DEFAULT_PATH) + find_library(_MPS_sdk_path_ NAMES MetalPerformanceShadersGraph MetalPerformanceShaders PATHS ${_SDK_SEARCH_PATH} NO_DEFAULT_PATH) + + if(_MPS_supported_os_version AND _MPS_fwrk_path_ AND _MPS_sdk_path_) + set(MPS_FOUND ON) + message(STATUS "MPSGraph framework found") + else() + message(STATUS "MPSGraph framework not found") + endif() else() - set(_MLC_FOLDER_EXISTS NO) - endif() - - if(_MLCompute_fwrk_path_ AND _MLCompute_sdk_path_ AND _MLC_FOLDER_EXISTS) - set(MLCOMPUTE_FOUND ON) - message(STATUS "ML Compute framework found") - else() - message(STATUS "ML Compute framework not found") + message(STATUS "MPS: unable to get MacOS sdk version") + message(STATUS "MPSGraph framework not found") endif() endif() endif() @@ -189,6 +197,8 @@ option(USE_CUDA "Use CUDA" ON) cmake_dependent_option( BUILD_SPLIT_CUDA "Split torch_cuda library into torch_cuda_cu and torch_cuda_cpp" OFF "USE_CUDA AND NOT CUDA_SEPARABLE_COMPILATION" OFF) +cmake_dependent_option( + BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON "USE_CUDA AND LINUX AND BUILD_PYTHON" OFF) option(USE_FAST_NVCC "Use parallel NVCC build" OFF) option(USE_ROCM "Use ROCm" ON) option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF) @@ -202,15 +212,11 @@ cmake_dependent_option( BUILD_NVFUSER_BENCHMARK "Build C++ binaries for nvfuser benchmarks" ON "USE_CUDA;BUILD_TEST" OFF) cmake_dependent_option( - USE_WHOLE_CUDNN "Use whole-library linking for cuDNN" OFF - "USE_STATIC_CUDNN" OFF) -cmake_dependent_option( - USE_EXPERIMENTAL_CUDNN_V8_API "Use experimental cuDNN v8 API" OFF + USE_EXPERIMENTAL_CUDNN_V8_API "Use experimental cuDNN v8 API" ON "USE_CUDNN" OFF) option(USE_FBGEMM "Use FBGEMM (quantized 8-bit 
server operators)" ON) option(USE_KINETO "Use Kineto profiling library" ON) -option(USE_BREAKPAD "Use breakpad crash dump library" ON) -option(USE_CUPTI_SO "Use CUPTI as a shared library" OFF) +option(USE_CUPTI_SO "Use CUPTI as a shared library" ON) option(USE_FAKELOWP "Use FakeLowp operators" OFF) option(USE_FFMPEG "Use ffmpeg" OFF) option(USE_GFLAGS "Use GFLAGS" OFF) @@ -224,8 +230,8 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF) option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF) option(USE_NATIVE_ARCH "Use -march=native" OFF) cmake_dependent_option( - USE_MLCOMPUTE "Use ML Compute for macOS build" ON - "MLCOMPUTE_FOUND" OFF) + USE_MPS "Use MPS for macOS build" ON + "MPS_FOUND" OFF) cmake_dependent_option( USE_NCCL "Use NCCL" ON "USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF) @@ -273,17 +279,20 @@ if(NOT DEFINED USE_VULKAN) "ANDROID" OFF) endif() -if(IOS) - set(USE_BREAKPAD OFF) -endif() - option(USE_SLEEF_FOR_ARM_VEC256 "Use sleef for arm" OFF) option(USE_SOURCE_DEBUG_ON_MOBILE "Enable " ON) option(USE_LITE_INTERPRETER_PROFILER "Enable " ON) option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference" OFF) option(USE_VULKAN_RELAXED_PRECISION "Vulkan - Use relaxed precision math in the kernels (mediump)" OFF) option(USE_VULKAN_SHADERC_RUNTIME "Vulkan - Use runtime shader compilation as opposed to build-time (needs libshaderc)" OFF) -option(USE_XNNPACK "Use XNNPACK" ON) +# option USE_XNNPACK: try to enable xnnpack by default. +set(XNNPACK_MIN_CMAKE_VER 3.12) +cmake_dependent_option( + USE_XNNPACK "Use XNNPACK. Requires cmake >= ${XNNPACK_MIN_CMAKE_VER}." ON + "CMAKE_VERSION VERSION_GREATER_EQUAL ${XNNPACK_MIN_CMAKE_VER}" OFF) +if(NOT USE_XNNPACK AND CMAKE_VERSION VERSION_LESS ${XNNPACK_MIN_CMAKE_VER}) + message(WARNING "USE_XNNPACK is set to OFF. XNNPACK requires CMake version ${XNNPACK_MIN_CMAKE_VER} or greater.") +endif() option(USE_ZMQ "Use ZMQ" OFF) option(USE_ZSTD "Use ZSTD" OFF) # Ensure that an MKLDNN build is the default for x86 CPUs @@ -298,6 +307,7 @@ set(MKLDNN_ENABLE_CONCURRENT_EXEC ${USE_MKLDNN}) cmake_dependent_option( USE_MKLDNN_CBLAS "Use CBLAS in MKLDNN" OFF "USE_MKLDNN" OFF) +option(USE_STATIC_MKL "Prefer to link with MKL statically (Unix only)" OFF) option(USE_DISTRIBUTED "Use distributed" ON) cmake_dependent_option( USE_MPI "Use MPI for Caffe2. Only available if USE_DISTRIBUTED is on." ON @@ -306,12 +316,15 @@ cmake_dependent_option( USE_GLOO "Use Gloo. Only available if USE_DISTRIBUTED is on." ON "USE_DISTRIBUTED" OFF) cmake_dependent_option( - USE_GLOO_WITH_OPENSSL "Use Gloo with OpenSSL. Only available if USE_GLOO is on." OFF + USE_GLOO_WITH_OPENSSL "Use Gloo with OpenSSL. Only available if USE_GLOO is on." OFF "USE_GLOO AND LINUX AND NOT INTERN_BUILD_MOBILE" OFF) cmake_dependent_option( USE_C10D_GLOO "USE C10D GLOO" ON "USE_DISTRIBUTED;USE_GLOO" OFF) cmake_dependent_option( USE_C10D_NCCL "USE C10D NCCL" ON "USE_DISTRIBUTED;USE_NCCL" OFF) +cmake_dependent_option( + USE_NCCL_WITH_UCC "Enable UCC support for ProcessGroupNCCL. Only available if USE_C10D_NCCL is on." 
OFF + "USE_C10D_NCCL" OFF) cmake_dependent_option( USE_C10D_MPI "USE C10D MPI" ON "USE_DISTRIBUTED;USE_MPI" OFF) cmake_dependent_option( @@ -330,6 +343,9 @@ cmake_dependent_option(USE_CCACHE "Attempt using CCache to wrap the compilation" option(WERROR "Build with -Werror supported by the compiler" OFF) option(USE_COREML_DELEGATE "Use the CoreML backend through delegate APIs" OFF) option(USE_PER_OPERATOR_HEADERS "Whether ATen should generate separate headers for each operator" ON) +cmake_dependent_option( + BUILD_LAZY_TS_BACKEND "Build the lazy Torchscript backend, not compatible with mobile builds" ON + "NOT INTERN_BUILD_MOBILE" OFF) if(USE_CCACHE) @@ -429,8 +445,14 @@ else() endif() set(SELECTED_OP_LIST "" CACHE STRING "Path to the yaml file that contains the list of operators to include for custom build. Include all operators by default.") -set(STATIC_DISPATCH_BACKEND "" CACHE STRING - "Name of the backend for which static dispatch code is generated, e.g.: CPU.") +option( + STATIC_DISPATCH_BACKEND + "Name of the backend for which static dispatch code is generated, e.g.: CPU." + "") +option(USE_LIGHTWEIGHT_DISPATCH "Enable codegen unboxing for ATen ops, need to work with static dispatch in order to work properly." OFF) +if(USE_LIGHTWEIGHT_DISPATCH AND NOT STATIC_DISPATCH_BACKEND) + message(FATAL_ERROR "Need to enable static dispatch after enabling USE_LIGHTWEIGHT_DISPATCH.") +endif() option( TRACING_BASED "Master flag to build Lite Interpreter with tracing build option" @@ -538,6 +560,8 @@ endif(NOT MSVC) # purpose. if(ANDROID OR IOS OR DEFINED ENV{BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN}) set(INTERN_BUILD_MOBILE ON) + message(WARNING "INTERN_BUILD_MOBILE is on, disabling BUILD_LAZY_TS_BACKEND") + set(BUILD_LAZY_TS_BACKEND OFF) if(DEFINED ENV{BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN}) # C10_MOBILE is derived from Android/iOS toolchain macros in @@ -667,6 +691,8 @@ if(USE_FBGEMM AND ((CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND CMAKE_SIZEOF_VO set(USE_FBGEMM OFF) endif() +set(BUILD_ONEDNN_GRAPH OFF) + include(cmake/Dependencies.cmake) if(USE_CUDA AND (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 10.2) AND (CMAKE_HOST_SYSTEM_NAME MATCHES "Windows")) @@ -766,7 +792,6 @@ if(NOT MSVC) string(APPEND CMAKE_CXX_FLAGS " -Wno-type-limits") string(APPEND CMAKE_CXX_FLAGS " -Wno-array-bounds") string(APPEND CMAKE_CXX_FLAGS " -Wno-unknown-pragmas") - string(APPEND CMAKE_CXX_FLAGS " -Wno-sign-compare") string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-parameter") string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-function") string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-result") @@ -778,6 +803,10 @@ if(NOT MSVC) string(APPEND CMAKE_CXX_FLAGS " -Wno-range-loop-analysis") string(APPEND CMAKE_CXX_FLAGS " -Wno-pass-failed") endif() + if(CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 6.0.0)) + # Suppress issue: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43407 + string(APPEND CMAKE_CXX_FLAGS " -Wno-attributes") + endif() if(CMAKE_COMPILER_IS_GNUCXX AND NOT (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0.0)) string(APPEND CMAKE_CXX_FLAGS " -Wno-stringop-overflow") endif() @@ -864,6 +893,9 @@ if(NOT MSVC) if(HAS_WERROR_CAST_FUNCTION_TYPE) string(APPEND CMAKE_CXX_FLAGS " -Werror=cast-function-type") endif() + check_cxx_compiler_flag("-Werror=sign-compare" HAS_WERROR_SIGN_COMPARE) + # This doesn't work globally so we use the test on specific + # target_compile_options endif() if(USE_ASAN) @@ -918,8 +950,8 @@ if(USE_CPP_CODE_COVERAGE) endif() if(APPLE) - if(USE_MLCOMPUTE) - set(CMAKE_CXX_FLAGS 
"${CMAKE_CXX_FLAGS} -DUSE_MLCOMPUTE -fobjc-arc -framework MLCompute -framework Metal") + if(USE_MPS) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_MPS -fno-objc-arc -framework Foundation -weak_framework MetalPerformanceShaders -weak_framework MetalPerformanceShadersGraph -weak_framework Metal") endif() string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-private-field") string(APPEND CMAKE_CXX_FLAGS " -Wno-missing-braces") diff --git a/CODEOWNERS b/CODEOWNERS index 054bd8171311..7de2b0e66d9f 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -11,8 +11,10 @@ /torch/nn/ @albanD @jbschlosser /torch/optim/ @albanD /test/test_public_bindings.py @albanD +/test/allowlist_for_publicAPI.json @albanD @anjali411 /docs/source/conf.py @albanD -/aten/src/ATen/native/native_functions.yaml @ezyang +/aten/src/ATen/native/native_functions.yaml @bdhirsh +/aten/src/ATen/native/tags.yaml @anjali411 # Tensorpipe RPC Agent. /torch/csrc/distributed/rpc/tensorpipe_agent.cpp @jiayisuse @osalpekar @lw @beauby @@ -21,15 +23,15 @@ # Distributed package # This list is mostly if you'd like to be tagged as reviewer, feel free to add # or remove yourself from it. -/torch/csrc/distributed/ @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @mingzhe09088 @H-Huang @bowangbj -/torch/distributed/ @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @mingzhe09088 @H-Huang @bowangbj -/torch/nn/parallel/ @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @mingzhe09088 @H-Huang @bowangbj +/torch/csrc/distributed/ @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @mingzhe09088 @H-Huang @awgu +/torch/distributed/ @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @mingzhe09088 @H-Huang @awgu +/torch/nn/parallel/ @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @mingzhe09088 @H-Huang @awgu # Distributed tests # This list is mostly if you'd like to be tagged as reviewer, feel free to add # or remove yourself from it. 
-/test/distributed @mrshenli @pritamdamania87 @zhaojuanmao @rohan-varma @H-Huang @bowangbj -/torch/testing/_internal/distributed @mrshenli @pritamdamania87 @zhaojuanmao @rohan-varma @H-Huang @bowangbj +/test/distributed @mrshenli @pritamdamania87 @zhaojuanmao @rohan-varma @H-Huang @awgu +/torch/testing/_internal/distributed @mrshenli @pritamdamania87 @zhaojuanmao @rohan-varma @H-Huang @awgu # ONNX Export /torch/csrc/jit/passes/onnx.h @bowenbao @shubhambhokare1 @@ -39,7 +41,7 @@ /test/onnx/ @bowenbao @shubhambhokare1 # Docker -/.circleci/docker/ @jeffdaily @jithunnair-amd +/.circleci/docker/ @jeffdaily # Github Actions # This list is for people wanting to be notified every time there's a change @@ -47,9 +49,9 @@ /.github/ @seemethere @janeyx99 @atalman # Custom Test Infrastructure -/test/run_test.py @pytorch-dev-infra +/test/run_test.py @pytorch/pytorch-dev-infra /torch/testing/_internal/common_device_type.py @mruberry -/torch/testing/_internal/common_utils.py @pytorch-dev-infra +/torch/testing/_internal/common_utils.py @pytorch/pytorch-dev-infra # Parametrizations /torch/nn/utils/parametriz*.py @lezcano @@ -61,3 +63,13 @@ /aten/src/ATen/native/**/*LinearAlgebra* @lezcano @nikitaved @IvanYashchuk # tests /test/test_linalg.py @lezcano @nikitaved @IvanYashchuk + +# OpInfo-related files +/torch/testing/_internal/common_methods_invocations.py @mruberry @ngimel +/torch/testing/_internal/common_device_type.py @mruberry @ngimel +test/test_ops.py @mruberry @ngimel +test/test_ops_gradients.py @mruberry @ngimel +test/test_unary_ufuncs.py @mruberry @ngimel +test/test_binary_ufuncs.py @mruberry @ngimel +test/test_reductions.py @mruberry @ngimel +test/test_type_promotion.py @mruberry @ngimel diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 59b7ae8a488f..a09e03c01e44 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -342,6 +342,8 @@ The `expecttest` and `hypothesis` libraries must be installed to run the tests. an optional dependency, and `pytest` may help run tests more selectively. All these packages can be installed with `conda` or `pip`. +**Weird note:** In our CI (Continuous Integration) jobs, we actually run the tests from the `test` folder and **not** the root of the repo, since there are various dependencies we set up for CI that expects the tests to be run from the test folder. As such, there may be some inconsistencies between local testing and CI testing--if you observe an inconsistency, please [file an issue](https://github.com/pytorch/pytorch/issues/new/choose). + ### Better local unit tests with `pytest` We don't officially support `pytest`, but it works well with our @@ -512,7 +514,7 @@ missing file warnings but will still complete. For example, to work on `jit.rst` ```bash cd docs/source -ls | grep rst | grep -v index | grep -v jit | xargs rm +find . -type f | grep rst | grep -v index | grep -v jit | xargs rm # Make your changes, build the docs, etc. @@ -1098,8 +1100,7 @@ This internally invokes our driver script and closely mimics how clang-tidy is r ## Pre-commit tidy/linting hook -We use clang-tidy and flake8 (installed with flake8-bugbear, -flake8-comprehensions, flake8-pyi, and others) to perform additional +We use clang-tidy to perform additional formatting and semantic checking of code. 
We provide a pre-commit git hook for performing these checks, before a commit is created: @@ -1107,18 +1108,18 @@ performing these checks, before a commit is created: ln -s ../../tools/git-pre-commit .git/hooks/pre-commit ``` -You'll need to install an appropriately configured flake8; see -[Lint as you type](https://github.com/pytorch/pytorch/wiki/Lint-as-you-type) -for documentation on how to do this. - -If you haven't set up the pre-commit hook and have already committed files and +If you have already committed files and CI reports `flake8` errors, you can run the check locally in your PR branch with: ```bash flake8 $(git diff --name-only $(git merge-base --fork-point master)) ``` -fix the code so that no errors are reported when you re-run the above check again, +You'll need to install an appropriately configured flake8; see +[Lint as you type](https://github.com/pytorch/pytorch/wiki/Lint-as-you-type) +for documentation on how to do this. + +Fix the code so that no errors are reported when you re-run the above check again, and then commit the fix. ## Building PyTorch with ASAN @@ -1245,39 +1246,17 @@ Once you submit a PR or push a new commit to a branch that is in an active PR, CI jobs will be run automatically. Some of these may fail and you will need to find out why, by looking at the logs. -Fairly often, a CI failure might be unrelated to your changes. In this case, you +Fairly often, a CI failure might be unrelated to your changes. You can +confirm by going to our [HUD](hud.pytorch.org) and seeing if the CI job +is failing upstream already. In this case, you can usually ignore the failure. See [the following subsection](#which-commit-is-used-in-ci) for more details. Some failures might be related to specific hardware or environment -configurations. In this case, if the job is run by CircleCI, you can -ssh into the job's session to perform manual debugging using the -following steps: - -1. In the CircleCI page for the failed job, make sure you are logged in - and then click the `Rerun` actions dropdown button on the top right. - Click `Rerun Job with SSH`. - -2. When the job reruns, a new step will be added in the `STEPS` tab - labelled `Set up SSH`. Inside that tab will be an ssh command that - you can execute in a shell. - -3. Once you are connected through ssh, you may need to enter a docker - container. Run `docker ps` to check if there are any docker - containers running. Note that your CI job might be in the process - of initiating a docker container, which means it will not show up - yet. It is best to wait until the CI job reaches a step where it is - building pytorch or running pytorch tests. If the job does have a - docker container, run `docker exec -it IMAGE_ID /bin/bash` to - connect to it. - -4. Now you can find the pytorch working directory, which could be - `~/workspace` or `~/project`, and run commands locally to debug - the failure. - -For certain Windows failures, it may be useful to have a full [Remote -Desktop](https://docs.microsoft.com/en-us/windows-server/remote/remote-desktop-services/clients/remote-desktop-clients) connection. See detailed instructions [here](https://github.com/pytorch/pytorch/wiki/Debugging-Windows-with-Remote-Desktop-or-CDB-(CLI-windbg)-on-CircleCI) -for how to set that up after rerunning the job. +configurations. 
In this case, if you're a Meta employee, you can ssh into +the job's session to perform manual debugging following the instructions in +our [CI wiki](https://github.com/pytorch/pytorch/wiki/Debugging-using-with-ssh-for-Github-Actions). + ### Which commit is used in CI? diff --git a/Dockerfile b/Dockerfile index 57c5dae733da..a8dc7f141685 100644 --- a/Dockerfile +++ b/Dockerfile @@ -32,7 +32,7 @@ RUN curl -fsSL -v -o ~/miniconda.sh -O https://repo.anaconda.com/miniconda/Mini chmod +x ~/miniconda.sh && \ ~/miniconda.sh -b -p /opt/conda && \ rm ~/miniconda.sh && \ - /opt/conda/bin/conda install -y python=${PYTHON_VERSION} conda-build pyyaml numpy ipython&& \ + /opt/conda/bin/conda install -y python=${PYTHON_VERSION} conda-build pyyaml numpy ipython && \ /opt/conda/bin/conda clean -ya FROM dev-base as submodule-update @@ -51,7 +51,7 @@ RUN --mount=type=cache,target=/opt/ccache \ FROM conda as conda-installs ARG PYTHON_VERSION=3.8 -ARG CUDA_VERSION=11.1 +ARG CUDA_VERSION=11.3 ARG CUDA_CHANNEL=nvidia ARG INSTALL_CHANNEL=pytorch-nightly ENV CONDA_OVERRIDE_CUDA=${CUDA_VERSION} diff --git a/LICENSE b/LICENSE index 9cb8cbef5a9f..04f9ad110565 100644 --- a/LICENSE +++ b/LICENSE @@ -28,6 +28,10 @@ All rights reserved. All contributions by Kakao Brain: Copyright 2019-2020 Kakao Brain +All contributions by Cruise LLC: +Copyright (c) 2022 Cruise LLC. +All rights reserved. + All contributions from Caffe: Copyright(c) 2013, 2014, 2015, the respective contributors All rights reserved. diff --git a/Makefile b/Makefile index 3d18c2b46381..21745f42a887 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,6 @@ # This makefile does nothing but delegating the actual building to cmake. PYTHON = python3 +PIP = pip3 all: @mkdir -p build && cd build && cmake .. $(shell $(PYTHON) ./scripts/get_python_cmake_flags.py) && $(MAKE) @@ -15,110 +16,18 @@ ios: clean: # This will remove ALL build folders. @rm -r build*/ - @$(RM) -r $(SHELLCHECK_GHA_GENERATED_FOLDER) linecount: @cloc --read-lang-def=caffe.cloc caffe2 || \ echo "Cloc is not available on the machine. 
You can install cloc with " && \ echo " sudo apt-get install cloc" -SHELLCHECK_GHA_GENERATED_FOLDER=.shellcheck_generated_gha -shellcheck-gha: - @$(RM) -r $(SHELLCHECK_GHA_GENERATED_FOLDER) - tools/extract_scripts.py --out=$(SHELLCHECK_GHA_GENERATED_FOLDER) - tools/linter/run_shellcheck.sh $(SHELLCHECK_GHA_GENERATED_FOLDER) - -generate-gha-workflows: - .github/scripts/generate_ci_workflows.py - $(MAKE) shellcheck-gha - -shellcheck: - @$(PYTHON) tools/actions_local_runner.py \ - --file .github/workflows/lint.yml \ - --job 'shellcheck' \ - --step "Regenerate workflows" - @$(PYTHON) tools/actions_local_runner.py \ - --file .github/workflows/lint.yml \ - --job 'shellcheck' \ - --step "Assert that regenerating the workflows didn't change them" - @$(PYTHON) tools/actions_local_runner.py \ - --file .github/workflows/lint.yml \ - --job 'shellcheck' \ - --step 'Extract scripts from GitHub Actions workflows' - @$(PYTHON) tools/actions_local_runner.py \ - $(CHANGED_ONLY) \ - --job 'shellcheck' - setup_lint: - $(PYTHON) tools/actions_local_runner.py --file .github/workflows/lint.yml \ - --job 'flake8-py3' --step 'Install dependencies' --no-quiet - $(PYTHON) tools/actions_local_runner.py --file .github/workflows/lint.yml \ - --job 'cmakelint' --step 'Install dependencies' --no-quiet - $(PYTHON) tools/actions_local_runner.py --file .github/workflows/lint.yml \ - --job 'mypy' --step 'Install dependencies' --no-quiet - $(PYTHON) tools/actions_local_runner.py --file .github/workflows/lint.yml \ - --job 'shellcheck' --step 'Install Jinja2' --no-quiet - - @if [ "$$(uname)" = "Darwin" ]; then \ - if [ -z "$$(which brew)" ]; then \ - echo "'brew' is required to install ShellCheck, get it here: https://brew.sh "; \ - exit 1; \ - fi; \ - brew install shellcheck; \ - else \ - $(PYTHON) tools/actions_local_runner.py --file .github/workflows/lint.yml \ - --job 'shellcheck' --step 'Install ShellCheck' --no-quiet; \ - fi - $(PYTHON) -mpip install jinja2 --user - $(PYTHON) -mpip install -r tools/linter/clang_tidy/requirements.txt --user - $(PYTHON) -m tools.linter.install.clang_tidy - -quick_checks: -# TODO: This is broken when 'git config submodule.recurse' is 'true' since the -# lints will descend into third_party submodules - @$(PYTHON) tools/actions_local_runner.py \ - --file .github/workflows/lint.yml \ - --job 'quick-checks' \ - --step 'Ensure no trailing spaces' \ - --step 'Ensure no tabs' \ - --step 'Ensure no non-breaking spaces' \ - --step 'Ensure canonical include' \ - --step 'Ensure no versionless Python shebangs' \ - --step 'Ensure no unqualified noqa' \ - --step 'Ensure GitHub PyPi dependencies are pinned' \ - --step 'Ensure no unqualified type ignore' \ - --step 'Ensure no direct cub include' \ - --step 'Ensure correct trailing newlines' \ - --step 'Ensure no raw cuda api calls' - -flake8: - @$(PYTHON) tools/actions_local_runner.py \ - $(CHANGED_ONLY) \ - --job 'flake8-py3' - -mypy: - @$(PYTHON) tools/actions_local_runner.py \ - $(CHANGED_ONLY) \ - --job 'mypy' - -cmakelint: - @$(PYTHON) tools/actions_local_runner.py \ - --file .github/workflows/lint.yml \ - --job 'cmakelint' \ - --step 'Run cmakelint' - -clang-tidy: - @$(PYTHON) tools/actions_local_runner.py \ - $(CHANGED_ONLY) \ - --job 'clang-tidy' - -toc: - @$(PYTHON) tools/actions_local_runner.py \ - --file .github/workflows/lint.yml \ - --job 'toc' \ - --step "Regenerate ToCs and check that they didn't change" + $(PIP) install lintrunner + lintrunner init -lint: flake8 mypy quick_checks cmakelint shellcheck +lint: + lintrunner -quicklint: 
CHANGED_ONLY=--changed-only -quicklint: mypy flake8 quick_checks cmakelint shellcheck clang-tidy +quicklint: + lintrunner diff --git a/README.md b/README.md index 88a77f04b345..c5c362b80a6a 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,8 @@ PyTorch is a Python package that provides two high-level features: You can reuse your favorite Python packages such as NumPy, SciPy, and Cython to extend PyTorch when needed. +Our trunk health (Continuous Integration signals) can be found at [hud.pytorch.org](https://hud.pytorch.org/ci/pytorch/pytorch/master). + - [More About PyTorch](#more-about-pytorch) @@ -39,18 +41,6 @@ You can reuse your favorite Python packages such as NumPy, SciPy, and Cython to -| System | 3.7 | 3.8 | -| :---: | :---: | :--: | -| Linux CPU | [![Build Status](https://ci.pytorch.org/jenkins/job/pytorch-master/badge/icon)](https://ci.pytorch.org/jenkins/job/pytorch-master/) |
| -| Linux GPU | [![Build Status](https://ci.pytorch.org/jenkins/job/pytorch-master/badge/icon)](https://ci.pytorch.org/jenkins/job/pytorch-master/) |
| -| Windows CPU / GPU | [![Build Status](https://ci.pytorch.org/jenkins/job/pytorch-builds/job/pytorch-win-ws2016-cuda9-cudnn7-py3-trigger/badge/icon)](https://ci.pytorch.org/jenkins/job/pytorch-builds/job/pytorch-win-ws2016-cuda9-cudnn7-py3-trigger/) |
| -| Linux (ppc64le) CPU | [![Build Status](https://powerci.osuosl.org/job/pytorch-master-nightly-py3-linux-ppc64le/badge/icon)](https://powerci.osuosl.org/job/pytorch-master-nightly-py3-linux-ppc64le/) |
| -| Linux (ppc64le) GPU | [![Build Status](https://powerci.osuosl.org/job/pytorch-master-nightly-py3-linux-ppc64le-gpu/badge/icon)](https://powerci.osuosl.org/job/pytorch-master-nightly-py3-linux-ppc64le-gpu/) |
| -| Linux (aarch64) CPU | [![Build Status](http://openlabtesting.org:15000/badge?project=pytorch%2Fpytorch&job_name=pytorch-arm64-build-daily-master-py37)](https://status.openlabtesting.org/builds/builds?project=pytorch%2Fpytorch&job_name=pytorch-arm64-build-daily-master-py37) | [![Build Status](http://openlabtesting.org:15000/badge?project=pytorch%2Fpytorch&job_name=pytorch-arm64-build-daily-master-py38)](https://status.openlabtesting.org/builds/builds?project=pytorch%2Fpytorch&job_name=pytorch-arm64-build-daily-master-py38) | - -See also the [CI HUD at hud.pytorch.org](https://hud.pytorch.org/ci/pytorch/pytorch/master). - - ## More About PyTorch At a granular level, PyTorch is a library that consists of the following components: @@ -155,14 +145,9 @@ Commands to install binaries via Conda or pip wheels are on our website: [https: #### NVIDIA Jetson Platforms -Python wheels for NVIDIA's Jetson Nano, Jetson TX2, and Jetson AGX Xavier are available via the following URLs: - -- Stable binaries: - - Python 3.6: https://nvidia.box.com/v/torch-stable-cp36-jetson-jp42 -- Rolling weekly binaries: - - Python 3.6: https://nvidia.box.com/v/torch-weekly-cp36-jetson-jp42 +Python wheels for NVIDIA's Jetson Nano, Jetson TX2, and Jetson AGX Xavier are provided [here](https://forums.developer.nvidia.com/t/pytorch-for-jetson-version-1-10-now-available/72048) and the L4T container is published [here](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/l4t-pytorch) -They require JetPack 4.2 and above, and [@dusty-nv](https://github.com/dusty-nv) maintains them +They require JetPack 4.2 and above, and [@dusty-nv](https://github.com/dusty-nv) and [@ptrblck](https://github.com/ptrblck) are maintaining them. ### From Source @@ -178,16 +163,16 @@ If you want to compile with CUDA support, install - [Compiler](https://gist.github.com/ax3l/9489132) compatible with CUDA Note: You could refer to the [cuDNN Support Matrix](https://docs.nvidia.com/deeplearning/cudnn/pdf/cuDNN-Support-Matrix.pdf) for cuDNN versions with the various supported CUDA, CUDA driver and NVIDIA hardwares -If you want to disable CUDA support, export environment variable `USE_CUDA=0`. +If you want to disable CUDA support, export the environment variable `USE_CUDA=0`. Other potentially useful environment variables may be found in `setup.py`. If you are building for NVIDIA's Jetson platforms (Jetson Nano, TX1, TX2, AGX Xavier), Instructions to install PyTorch for Jetson Nano are [available here](https://devtalk.nvidia.com/default/topic/1049071/jetson-nano/pytorch-for-jetson-nano/) If you want to compile with ROCm support, install - [AMD ROCm](https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation-Guide.html) 4.0 and above installation -- ROCm is currently supported only for Linux system. +- ROCm is currently supported only for Linux systems. -If you want to disable ROCm support, export environment variable `USE_ROCM=0`. +If you want to disable ROCm support, export the environment variable `USE_ROCM=0`. Other potentially useful environment variables may be found in `setup.py`. #### Install Dependencies @@ -245,7 +230,7 @@ collect2: error: ld returned 1 exit status error: command 'g++' failed with exit status 1 ``` -This is caused by `ld` from Conda environment shadowing the system `ld`. You should use a newer version of Python that fixes this issue. The recommended Python version is 3.6.10+, 3.7.6+ and 3.8.1+. +This is caused by `ld` from Conda environment shadowing the system `ld`. 
You should use a newer version of Python that fixes this issue. The recommended Python versions are 3.7.6+ and 3.8.1+. On macOS ```bash @@ -299,7 +284,7 @@ You can refer to the [build_pytorch.bat](https://github.com/pytorch/pytorch/blob cmd :: Set the environment variables after you have downloaded and upzipped the mkl package, -:: else CMake would throw error as `Could NOT find OpenMP`. +:: else CMake would throw an error as `Could NOT find OpenMP`. set CMAKE_INCLUDE_PATH={Your directory}\mkl\include set LIB={Your directory}\mkl\lib;%LIB% diff --git a/RELEASE.md b/RELEASE.md index 8f967985a9cf..80b4bfefc122 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -3,13 +3,30 @@ - [General Overview](#general-overview) + - [Cutting a release branch preparations](#cutting-a-release-branch-preparations) - [Cutting release branches](#cutting-release-branches) + - [`pytorch/pytorch`](#pytorchpytorch) + - [`pytorch/builder` / PyTorch domain libraries](#pytorchbuilder--pytorch-domain-libraries) - [Making release branch specific changes](#making-release-branch-specific-changes) - [Getting CI signal on release branches:](#getting-ci-signal-on-release-branches) - [Drafting RCs (Release Candidates)](#drafting-rcs-release-candidates) - [Release Candidate Storage](#release-candidate-storage) - [Cherry Picking Fixes](#cherry-picking-fixes) - [Promoting RCs to Stable](#promoting-rcs-to-stable) + - [Additional Steps to prepare for release day](#additional-steps-to-prepare-for-release-day) + - [Modify release matrix](#modify-release-matrix) + - [Open Google Colab issue](#open-google-colab-issue) +- [Patch Releases](#patch-releases) + - [Patch Release Criteria](#patch-release-criteria) + - [Patch Release Process](#patch-release-process) + - [Triage](#triage) + - [Building a release schedule / cherry picking](#building-a-release-schedule--cherry-picking) + - [Building Binaries / Promotion to Stable](#building-binaries--promotion-to-stable) +- [Hardware / Software Support in Binary Build Matrix](#hardware--software-support-in-binary-build-matrix) + - [Python](#python) + - [TL;DR](#tldr) + - [Accelerator Software](#accelerator-software) + - [Special support cases](#special-support-cases) - [Special Topics](#special-topics) - [Updating submodules for a release](#updating-submodules-for-a-release) @@ -19,36 +36,60 @@ Releasing a new version of PyTorch generally entails 3 major steps: +0. Cutting a release branch preparations 1. Cutting a release branch and making release branch specific changes 2. Drafting RCs (Release Candidates), and merging cherry picks -3. Promoting RCs to stable +3. Promoting RCs to stable and performing release day tasks + +## Cutting a release branch preparations + +The following requirements need to be met prior to the final RC cut: + +* Resolve all outstanding issues in the milestones (for example [1.11.0](https://github.com/pytorch/pytorch/milestone/28)) before the first RC cut is completed. After the RC cut is completed, the following script should be executed from the builder repo in order to validate the presence of the fixes in the release branch: +``` python github_analyze.py --repo-path ~/local/pytorch --remote upstream --branch release/1.11 --milestone-id 26 --missing-in-branch ``` +* Validate that all new workflows have been created in the PyTorch and domain libraries included in the release. Validate it against all dimensions of the release matrix, including operating systems (Linux, macOS, Windows), Python versions, as well as CPU architectures (x86 and arm) and accelerator versions (CUDA, ROCm).
+* All the nightly jobs for pytorch and domain libraries should be green. Validate this using the following HUD links: + * [Pytorch](https://hud.pytorch.org/hud/pytorch/pytorch/nightly) + * [TorchVision](https://hud.pytorch.org/hud/pytorch/vision/nightly) + * [TorchAudio](https://hud.pytorch.org/hud/pytorch/audio/nightly) + * [TorchText](https://hud.pytorch.org/hud/pytorch/text/nightly) ## Cutting release branches +### `pytorch/pytorch` + Release branches are typically cut from the branch [`viable/strict`](https://github.com/pytorch/pytorch/tree/viable/strict) as to ensure that tests are passing on the release branch. -Release branches *should* be prefixed like so: -``` -release/{MAJOR}.{MINOR} -``` +There's a convenience script to create release branches from the current `viable/strict` (run from the root of `pytorch/pytorch`): -An example of this would look like: +```bash +DRY_RUN=disabled scripts/release/cut-release-branch.sh ``` -release/1.8 + +This script should create 2 branches: +* `release/{MAJOR}.{MINOR}` +* `orig/release/{MAJOR}.{MINOR}` + +### `pytorch/builder` / PyTorch domain libraries + +The convenience script can also be used for the domain libraries as well as `pytorch/builder`. + +> NOTE: RELEASE_VERSION only needs to be specified if version.txt is not available in the root directory + +```bash +DRY_RUN=disabled GIT_BRANCH_TO_CUT_FROM=main RELEASE_VERSION=1.11 scripts/release/cut-release-branch.sh ``` -Please make sure to create branch that pins divergent point of release branch from the main branch, i.e. `orig/release/{MAJOR}.{MINOR}` ### Making release branch specific changes These are examples of changes that should be made to release branches so that CI / tooling can function normally on them: -* Update target determinator to use release branch: - * Example: https://github.com/pytorch/pytorch/pull/40712 * Update backwards compatibility tests to use RC binaries instead of nightlies * Example: https://github.com/pytorch/pytorch/pull/40706 * A release branches should also be created in [`pytorch/xla`](https://github.com/pytorch/xla) and [`pytorch/builder`](https://github.com/pytorch/builder) repos and pinned in `pytorch/pytorch` - * Example: https://github.com/pytorch/pytorch/pull/65433 + * Example PR (CircleCI, to be removed): https://github.com/pytorch/pytorch/pull/65433 + * Example PR (GHA): https://github.com/pytorch/pytorch/pull/72739 These are examples of changes that should be made to the *default* branch after a release branch is cut @@ -56,6 +97,7 @@ These are examples of changes that should be made to the *default* branch after * Example: https://github.com/pytorch/pytorch/pull/65435 ### Getting CI signal on release branches: + Create a PR from `release/{MAJOR}.{MINOR}` to `orig/release/{MAJOR}.{MINOR}` in order to start CI testing for cherry-picks into release branch. Example: @@ -98,8 +140,11 @@ For fixes that are to go into a release after the release branch has been cut we An example of this would look like: * https://github.com/pytorch/pytorch/issues/51886 +Please also make sure to add a milestone target to the PR/issue, especially if it needs to be considered for inclusion into the dot release.
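To make the cherry-pick step above concrete, here is a minimal sketch (not the official tooling) of how a fix that has already landed on `master` might be cherry-picked onto a release branch and pushed for review; the release branch, remote names, working branch name, commit hash, and milestone below are placeholders.

```bash
# Minimal sketch of the cherry-pick workflow described above (placeholder names and hashes).
# Assumes the fix has already landed on master and that `upstream` points at pytorch/pytorch.
git fetch upstream
git checkout -b cherry-pick-my-fix upstream/release/1.11   # hypothetical release branch
git cherry-pick -x <sha-of-the-merged-fix>                 # -x records the original commit in the message
git push origin cherry-pick-my-fix
# Then open a PR against release/1.11 and attach the matching milestone (e.g. 1.11.1).
```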
+ **NOTE**: The cherry pick process is not an invitation to add new features, it is mainly there to fix regressions + ## Promoting RCs to Stable Promotion of RCs to stable is done with this script: @@ -113,6 +158,95 @@ Promotion should occur in two steps: **NOTE**: The promotion of wheels to PyPI can only be done once so take caution when attempting to promote wheels to PyPI, (see https://github.com/pypa/warehouse/issues/726 for a discussion on potential draft releases within PyPI) +## Additional Steps to prepare for release day + +The following should be prepared for the release day: + +### Modify release matrix + +The release matrix for the get started page needs to be modified. See the following [PR](https://github.com/pytorch/pytorch.github.io/pull/959) as a reference. + +After modifying published_versions.json you will need to regenerate the quick-start-module.js file. Run the following command: +``` +python3 scripts/gen_quick_start_module.py >assets/quick-start-module.js +``` +Please note: This PR needs to be merged on the release day and hence it should be absolutely free of any failures. To test this PR, open another test PR pointing to the release candidate location described above in [Release Candidate Storage](RELEASE.md#release-candidate-storage). + +### Open Google Colab issue + +This is normally done right after the release is completed. We would need to create a Google Colab issue; see the following [issue](https://github.com/googlecolab/colabtools/issues/2372). + +# Patch Releases + +A patch release is a maintenance release of PyTorch that includes fixes for regressions found in a previous minor release. Patch releases will typically bump the `patch` version from semver (i.e. `[major].[minor].[patch]`). + +## Patch Release Criteria + +Patch releases should be considered if a regression meets the following criteria: + +1. Does the regression break core functionality (stable / beta features) including functionality in first party domain libraries? + * First party domain libraries: + * [pytorch/vision](https://github.com/pytorch/vision) + * [pytorch/audio](https://github.com/pytorch/audio) + * [pytorch/text](https://github.com/pytorch/text) +2. Is there no viable workaround? + * Can the regression be solved simply, or is it insurmountable? + +> *NOTE*: Patch releases should only be considered when functionality is broken; documentation issues do not typically fall within this category + +## Patch Release Process + +### Triage + +> Main POC: Triage Reviewers + +1. Tag issues / pull requests that are candidates for a potential patch release with `triage review` + * ![adding triage review label](https://user-images.githubusercontent.com/1700823/132589089-a9210a14-6159-409d-95e5-f79067f6fa38.png) +2. Triage reviewers will then check if the regression / fix identified fits within the above mentioned [Patch Release Criteria](#patch-release-criteria) +3. Triage reviewers will then add the issue / pull request to the related milestone (i.e. `1.9.1`) if the regression is found to be within the [Patch Release Criteria](#patch-release-criteria) + * ![adding to milestone](https://user-images.githubusercontent.com/1700823/131175980-148ff38d-44c3-4611-8a1f-cd2fd1f4c49d.png) + +### Building a release schedule / cherry picking + +> Main POC: Patch Release Managers + +1.
After regressions / fixes have been triaged, Patch Release Managers will work together and build / announce a schedule for the patch release + * *NOTE*: Ideally this should be ~2-3 weeks after a regression has been identified to allow other regressions to be identified +2. Patch Release Managers will work with the authors of the regressions / fixes to cherry pick their change into the related release branch (i.e. `release/1.9` for `1.9.1`) + +### Building Binaries / Promotion to Stable + +> Main POC: Patch Release Managers + +1. Patch Release Managers will follow the process of [Drafting RCs (Release Candidates)](#drafting-rcs-release-candidates) +2. Patch Release Managers will follow the process of [Promoting RCs to Stable](#promoting-rcs-to-stable) + +# Hardware / Software Support in Binary Build Matrix + +PyTorch has a support matrix across a couple of different axes. This section should be used as a decision-making framework to drive hardware / software support decisions. + +## Python + +For versions of Python that we support, we follow the [NEP 29 policy](https://numpy.org/neps/nep-0029-deprecation_policy.html), which was originally drafted by numpy. + +### TL;DR + +* All minor versions of Python released 42 months prior to the project, and at minimum the two latest minor versions. + +* All minor versions of numpy released in the 24 months prior to the project, and at minimum the last three minor versions. + +## Accelerator Software + +For accelerator software like CUDA and ROCm, we will typically use the following criteria: +* Support the latest 2 minor versions + +### Special support cases + +In some instances support for a particular version of software will continue if a need is found. For example, our CUDA 11 binaries do not currently meet +the size restrictions for publishing on PyPI, so the default version that is published to PyPI is CUDA 10.2. + +These special support cases will be handled on a case-by-case basis and support may be continued if current PyTorch maintainers feel as though there may still be a +need to support these particular versions of software. + # Special Topics ## Updating submodules for a release diff --git a/WORKSPACE b/WORKSPACE index 95eee3bdd494..fb15aad66cb8 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -33,6 +33,13 @@ http_archive( ], ) +http_archive( + name = "google_benchmark", + sha256 = "6132883bc8c9b0df5375b16ab520fac1a85dc9e4cf5be59480448ece74b278d4", + strip_prefix = "benchmark-1.6.1/", + urls = ["https://github.com/google/benchmark/archive/refs/tags/v1.6.1.tar.gz"], +) + http_archive( name = "pybind11_bazel", strip_prefix = "pybind11_bazel-7f397b5d2cc2434bbd651e096548f7b40c128044", diff --git a/android/README.md b/android/README.md index 002409c52349..d1d6bcd6aa3b 100644 --- a/android/README.md +++ b/android/README.md @@ -14,9 +14,16 @@ repositories { jcenter() } +# lite interpreter build dependencies { - implementation 'org.pytorch:pytorch_android:1.6.0' - implementation 'org.pytorch:pytorch_android_torchvision:1.6.0' + implementation 'org.pytorch:pytorch_android_lite:1.10.0' + implementation 'org.pytorch:pytorch_android_torchvision_lite:1.10.0' +} + +# full jit build +dependencies { + implementation 'org.pytorch:pytorch_android:1.10.0' + implementation 'org.pytorch:pytorch_android_torchvision:1.10.0' } ``` @@ -32,6 +39,15 @@ repositories { } } +# lite interpreter build +dependencies { + ... + implementation 'org.pytorch:pytorch_android_lite:1.12.0-SNAPSHOT' + implementation 'org.pytorch:pytorch_android_torchvision_lite:1.12.0-SNAPSHOT' + ...
+} + +# full jit build dependencies { ... implementation 'org.pytorch:pytorch_android:1.12.0-SNAPSHOT' @@ -68,7 +84,7 @@ They are specified as environment variables: `ANDROID_HOME` - path to [Android SDK](https://developer.android.com/studio/command-line/sdkmanager.html) -`ANDROID_NDK` - path to [Android NDK](https://developer.android.com/studio/projects/install-ndk) +`ANDROID_NDK` - path to [Android NDK](https://developer.android.com/studio/projects/install-ndk). It's recommended to use NDK 21.x. `GRADLE_HOME` - path to [gradle](https://gradle.org/releases/) @@ -133,7 +149,7 @@ android { } dependencies { - extractForNativeBuild('org.pytorch:pytorch_android:1.6.0') + extractForNativeBuild('org.pytorch:pytorch_android:1.10.0') } task extractAARForNativeBuild { diff --git a/android/common.sh b/android/common.sh index ab1cb5ff43c7..1fee30bdc382 100644 --- a/android/common.sh +++ b/android/common.sh @@ -29,7 +29,8 @@ check_gradle() { } parse_abis_list() { - ABIS_LIST="x86" + # sync with https://github.com/pytorch/pytorch/blob/0ca0e02685a9d033ac4f04e2fa5c8ba6dbc5ae50/android/gradle.properties#L1 + ABIS_LIST="armeabi-v7a,arm64-v8a,x86,x86_64" CUSTOM_ABIS_LIST=false if [ $# -gt 0 ]; then ABIS_LIST=$1 diff --git a/android/pytorch_android/build.gradle b/android/pytorch_android/build.gradle index a65c0ffd436b..d10f6a305085 100644 --- a/android/pytorch_android/build.gradle +++ b/android/pytorch_android/build.gradle @@ -50,7 +50,17 @@ android { } androidTest { java { - exclude 'org/pytorch/PytorchHostTests.java' + if(System.env.BUILD_LITE_INTERPRETER == '0') { + println 'Build test for full jit (pytorch_jni)' + exclude 'org/pytorch/PytorchHostTests.java' + exclude 'org/pytorch/PytorchLiteInstrumentedTests.java' + exclude 'org/pytorch/suite/PytorchLiteInstrumentedTestSuite.java' + } else { + println 'Build test for lite interpreter (pytorch_jni_lite)' + exclude 'org/pytorch/PytorchHostTests.java' + exclude 'org/pytorch/PytorchInstrumentedTests.java' + exclude 'org/pytorch/suite/PytorchInstrumentedTestSuite.java' + } } } } diff --git a/android/pytorch_android/generate_test_torchscripts.py b/android/pytorch_android/generate_test_torchscripts.py index 8b41fefc246e..909f824fb26d 100644 --- a/android/pytorch_android/generate_test_torchscripts.py +++ b/android/pytorch_android/generate_test_torchscripts.py @@ -1,4 +1,6 @@ import torch +from torch import Tensor +from typing import Dict, List, Tuple, Optional OUTPUT_DIR = "src/androidTest/assets/" @@ -7,7 +9,8 @@ def scriptAndSave(module, fileName): script_module = torch.jit.script(module) print(script_module.graph) outputFileName = OUTPUT_DIR + fileName - script_module.save(outputFileName) + # note that the lite interpreter model can also be used in full JIT + script_module._save_for_lite_interpreter(outputFileName) print("Saved to " + outputFileName) print('=' * 80) diff --git a/android/pytorch_android/host/build.gradle b/android/pytorch_android/host/build.gradle index 0f795f08657e..088d1b5ca420 100644 --- a/android/pytorch_android/host/build.gradle +++ b/android/pytorch_android/host/build.gradle @@ -25,6 +25,7 @@ sourceSets { java { srcDir '../src/androidTest/java' exclude '**/PytorchInstrumented*' + exclude '**/PytorchLiteInstrumented*' } resources.srcDirs = ["../src/androidTest/assets"] } diff --git a/android/pytorch_android/src/androidTest/assets/activation_ops.ptl b/android/pytorch_android/src/androidTest/assets/activation_ops.ptl new file mode 100644 index 000000000000..179f426ae7cd Binary files /dev/null and 
b/android/pytorch_android/src/androidTest/assets/activation_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/android_api_module.ptl b/android/pytorch_android/src/androidTest/assets/android_api_module.ptl new file mode 100644 index 000000000000..df62dd862088 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/android_api_module.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/blas_lapack_ops.ptl b/android/pytorch_android/src/androidTest/assets/blas_lapack_ops.ptl new file mode 100644 index 000000000000..fea933ee644f Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/blas_lapack_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/comparison_ops.ptl b/android/pytorch_android/src/androidTest/assets/comparison_ops.ptl new file mode 100644 index 000000000000..01b1c153e751 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/comparison_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/convolution_ops.ptl b/android/pytorch_android/src/androidTest/assets/convolution_ops.ptl new file mode 100644 index 000000000000..db253a207a33 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/convolution_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/distance_function_ops.ptl b/android/pytorch_android/src/androidTest/assets/distance_function_ops.ptl new file mode 100644 index 000000000000..cc4d994f440a Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/distance_function_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/dropout_ops.ptl b/android/pytorch_android/src/androidTest/assets/dropout_ops.ptl new file mode 100644 index 000000000000..422c2f60e6be Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/dropout_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/dynamic_quant_ops.ptl b/android/pytorch_android/src/androidTest/assets/dynamic_quant_ops.ptl new file mode 100644 index 000000000000..0bbbce9671c3 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/dynamic_quant_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/fused_quant_ops.ptl b/android/pytorch_android/src/androidTest/assets/fused_quant_ops.ptl new file mode 100644 index 000000000000..9d2b3f9dde1a Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/fused_quant_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/general_quant_ops.ptl b/android/pytorch_android/src/androidTest/assets/general_quant_ops.ptl new file mode 100644 index 000000000000..7d4888e0bc81 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/general_quant_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/linear_ops.ptl b/android/pytorch_android/src/androidTest/assets/linear_ops.ptl new file mode 100644 index 000000000000..ca9066c03dc4 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/linear_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/loss_function_ops.ptl b/android/pytorch_android/src/androidTest/assets/loss_function_ops.ptl new file mode 100644 index 000000000000..4c0592e5485a Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/loss_function_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/mobilenet_v2.ptl 
b/android/pytorch_android/src/androidTest/assets/mobilenet_v2.ptl new file mode 100644 index 000000000000..9b8297a250d3 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/mobilenet_v2.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/nn_utils_ops.ptl b/android/pytorch_android/src/androidTest/assets/nn_utils_ops.ptl new file mode 100644 index 000000000000..5d008eab03b9 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/nn_utils_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/normalization_ops.ptl b/android/pytorch_android/src/androidTest/assets/normalization_ops.ptl new file mode 100644 index 000000000000..d85bd06c763b Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/normalization_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/other_math_ops.ptl b/android/pytorch_android/src/androidTest/assets/other_math_ops.ptl new file mode 100644 index 000000000000..7209c3b3bd1f Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/other_math_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/padding_ops.ptl b/android/pytorch_android/src/androidTest/assets/padding_ops.ptl new file mode 100644 index 000000000000..02e57ba20712 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/padding_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/pointwise_ops.ptl b/android/pytorch_android/src/androidTest/assets/pointwise_ops.ptl new file mode 100644 index 000000000000..948ed4832660 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/pointwise_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/pooling_ops.ptl b/android/pytorch_android/src/androidTest/assets/pooling_ops.ptl new file mode 100644 index 000000000000..df051163413f Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/pooling_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/recurrent_ops.ptl b/android/pytorch_android/src/androidTest/assets/recurrent_ops.ptl new file mode 100644 index 000000000000..245ceb454d53 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/recurrent_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/reduction_ops.ptl b/android/pytorch_android/src/androidTest/assets/reduction_ops.ptl new file mode 100644 index 000000000000..13771302c668 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/reduction_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/sampling_ops.ptl b/android/pytorch_android/src/androidTest/assets/sampling_ops.ptl new file mode 100644 index 000000000000..416be7cb1279 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/sampling_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/shuffle_ops.ptl b/android/pytorch_android/src/androidTest/assets/shuffle_ops.ptl new file mode 100644 index 000000000000..5e5520118764 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/shuffle_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/sparse_ops.ptl b/android/pytorch_android/src/androidTest/assets/sparse_ops.ptl new file mode 100644 index 000000000000..a16f68f8f95f Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/sparse_ops.ptl differ diff --git 
a/android/pytorch_android/src/androidTest/assets/spectral_ops.ptl b/android/pytorch_android/src/androidTest/assets/spectral_ops.ptl new file mode 100644 index 000000000000..9828dd2ba901 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/spectral_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/static_quant_ops.ptl b/android/pytorch_android/src/androidTest/assets/static_quant_ops.ptl new file mode 100644 index 000000000000..d0a0a254d1ef Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/static_quant_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/tensor_creation_ops.ptl b/android/pytorch_android/src/androidTest/assets/tensor_creation_ops.ptl new file mode 100644 index 000000000000..d897b43cd36c Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/tensor_creation_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/tensor_general_ops.ptl b/android/pytorch_android/src/androidTest/assets/tensor_general_ops.ptl new file mode 100644 index 000000000000..6f2855ea83ea Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/tensor_general_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/tensor_indexing_ops.ptl b/android/pytorch_android/src/androidTest/assets/tensor_indexing_ops.ptl new file mode 100644 index 000000000000..ac9cb8c4b94a Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/tensor_indexing_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/tensor_typing_ops.ptl b/android/pytorch_android/src/androidTest/assets/tensor_typing_ops.ptl new file mode 100644 index 000000000000..3e2f4d8cc689 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/tensor_typing_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/tensor_view_ops.ptl b/android/pytorch_android/src/androidTest/assets/tensor_view_ops.ptl new file mode 100644 index 000000000000..5e2dc8294842 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/tensor_view_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/test.pt b/android/pytorch_android/src/androidTest/assets/test.pt index 375ade9bc913..016b6d666a2a 100644 Binary files a/android/pytorch_android/src/androidTest/assets/test.pt and b/android/pytorch_android/src/androidTest/assets/test.pt differ diff --git a/android/pytorch_android/src/androidTest/assets/torchscript_builtin_ops.ptl b/android/pytorch_android/src/androidTest/assets/torchscript_builtin_ops.ptl new file mode 100644 index 000000000000..2d2532df2fd2 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/torchscript_builtin_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/torchscript_collection_ops.ptl b/android/pytorch_android/src/androidTest/assets/torchscript_collection_ops.ptl new file mode 100644 index 000000000000..ce434b3b4210 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/torchscript_collection_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/transformer_ops.ptl b/android/pytorch_android/src/androidTest/assets/transformer_ops.ptl new file mode 100644 index 000000000000..ebb2bd693604 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/transformer_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/vision_function_ops.ptl 
b/android/pytorch_android/src/androidTest/assets/vision_function_ops.ptl new file mode 100644 index 000000000000..c9c45655e2bc Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/vision_function_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchHostTests.java b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchHostTests.java index bc406dc9ae74..afdde74c5bde 100644 --- a/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchHostTests.java +++ b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchHostTests.java @@ -10,7 +10,11 @@ public class PytorchHostTests extends PytorchTestBase { @Override - protected String assetFilePath(String assetName) throws IOException { + protected Module loadModel(String path) throws IOException { + return Module.load(assetFilePath(path)); + } + + private String assetFilePath(String assetName) throws IOException { Path tempFile = Files.createTempFile("test", ".pt"); try (InputStream resource = Objects.requireNonNull(getClass().getClassLoader().getResourceAsStream("test.pt"))) { diff --git a/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchInstrumentedTests.java b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchInstrumentedTests.java index bae01e394025..20c30d1587c8 100644 --- a/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchInstrumentedTests.java +++ b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchInstrumentedTests.java @@ -14,7 +14,11 @@ public class PytorchInstrumentedTests extends PytorchTestBase { @Override - protected String assetFilePath(String assetName) throws IOException { + protected Module loadModel(String path) throws IOException { + return Module.load(assetFilePath(path)); + } + + private String assetFilePath(String assetName) throws IOException { final Context appContext = InstrumentationRegistry.getInstrumentation().getTargetContext(); File file = new File(appContext.getFilesDir(), assetName); if (file.exists() && file.length() > 0) { diff --git a/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchLiteInstrumentedTests.java b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchLiteInstrumentedTests.java new file mode 100644 index 000000000000..bc62270a6fa8 --- /dev/null +++ b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchLiteInstrumentedTests.java @@ -0,0 +1,42 @@ +package org.pytorch; + +import android.content.Context; +import androidx.test.InstrumentationRegistry; +import androidx.test.runner.AndroidJUnit4; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import org.junit.runner.RunWith; + +@RunWith(AndroidJUnit4.class) +public class PytorchLiteInstrumentedTests extends PytorchTestBase { + + @Override + protected Module loadModel(String path) throws IOException { + return LiteModuleLoader.load(assetFilePath(path)); + } + + private String assetFilePath(String assetName) throws IOException { + final Context appContext = InstrumentationRegistry.getInstrumentation().getTargetContext(); + File file = new File(appContext.getFilesDir(), assetName); + if (file.exists() && file.length() > 0) { + return file.getAbsolutePath(); + } + + try (InputStream is = appContext.getAssets().open(assetName)) { + try (OutputStream os = new FileOutputStream(file)) { + byte[] buffer = new byte[4 * 1024]; + int read; + while ((read = is.read(buffer)) != -1) { + 
os.write(buffer, 0, read); + } + os.flush(); + } + return file.getAbsolutePath(); + } catch (IOException e) { + throw e; + } + } +} diff --git a/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchTestBase.java b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchTestBase.java index 2817ae1bbd09..9abcbcbda8a6 100644 --- a/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchTestBase.java +++ b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchTestBase.java @@ -12,11 +12,11 @@ import org.junit.Test; public abstract class PytorchTestBase { - private static final String TEST_MODULE_ASSET_NAME = "test.pt"; + private static final String TEST_MODULE_ASSET_NAME = "android_api_module.ptl"; @Test public void testForwardNull() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); final IValue input = IValue.from(Tensor.fromBlob(Tensor.allocateByteBuffer(1), new long[] {1})); assertTrue(input.isTensor()); final IValue output = module.forward(input); @@ -25,7 +25,7 @@ public void testForwardNull() throws IOException { @Test public void testEqBool() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); for (boolean value : new boolean[] {false, true}) { final IValue input = IValue.from(value); assertTrue(input.isBool()); @@ -38,7 +38,7 @@ public void testEqBool() throws IOException { @Test public void testEqInt() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); for (long value : new long[] {Long.MIN_VALUE, -1024, -1, 0, 1, 1024, Long.MAX_VALUE}) { final IValue input = IValue.from(value); assertTrue(input.isLong()); @@ -51,7 +51,7 @@ public void testEqInt() throws IOException { @Test public void testEqFloat() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); double[] values = new double[] { -Double.MAX_VALUE, @@ -86,7 +86,7 @@ public void testEqTensor() throws IOException { } final Tensor inputTensor = Tensor.fromBlob(inputTensorData, inputTensorShape); - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); final IValue input = IValue.from(inputTensor); assertTrue(input.isTensor()); assertTrue(inputTensor == input.toTensor()); @@ -103,7 +103,7 @@ public void testEqTensor() throws IOException { @Test public void testEqDictIntKeyIntValue() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); final Map inputMap = new HashMap<>(); inputMap.put(Long.MIN_VALUE, IValue.from(-Long.MIN_VALUE)); @@ -127,7 +127,7 @@ public void testEqDictIntKeyIntValue() throws IOException { @Test public void testEqDictStrKeyIntValue() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); final Map inputMap = new HashMap<>(); inputMap.put("long_min_value", IValue.from(Long.MIN_VALUE)); @@ -151,7 +151,7 @@ public void testEqDictStrKeyIntValue() throws IOException { @Test public void testListIntSumReturnTuple() throws IOException { - final Module module = 
Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); for (int n : new int[] {0, 1, 128}) { long[] a = new long[n]; @@ -178,7 +178,7 @@ public void testListIntSumReturnTuple() throws IOException { @Test public void testOptionalIntIsNone() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); assertFalse(module.runMethod("optionalIntIsNone", IValue.from(1l)).toBool()); assertTrue(module.runMethod("optionalIntIsNone", IValue.optionalNull()).toBool()); @@ -186,7 +186,7 @@ public void testOptionalIntIsNone() throws IOException { @Test public void testIntEq0None() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); assertTrue(module.runMethod("intEq0None", IValue.from(0l)).isNull()); assertTrue(module.runMethod("intEq0None", IValue.from(1l)).toLong() == 1l); @@ -194,7 +194,7 @@ public void testIntEq0None() throws IOException { @Test(expected = IllegalArgumentException.class) public void testRunUndefinedMethod() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); module.runMethod("test_undefined_method_throws_exception"); } @@ -241,7 +241,7 @@ public void testTensorIllegalStateOnWrongType() { @Test public void testEqString() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); String[] values = new String[] { "smoketest", @@ -260,7 +260,7 @@ public void testEqString() throws IOException { @Test public void testStr3Concat() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); String[] values = new String[] { "smoketest", @@ -281,7 +281,7 @@ public void testStr3Concat() throws IOException { @Test public void testEmptyShape() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); final long someNumber = 43; final IValue input = IValue.from(Tensor.fromBlob(new long[] {someNumber}, new long[] {})); final IValue output = module.runMethod("newEmptyShapeWithItem", input); @@ -293,7 +293,7 @@ public void testEmptyShape() throws IOException { @Test public void testAliasWithOffset() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); final IValue output = module.runMethod("testAliasWithOffset"); assertTrue(output.isTensorList()); Tensor[] tensors = output.toTensorList(); @@ -303,7 +303,7 @@ public void testAliasWithOffset() throws IOException { @Test public void testNonContiguous() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); final IValue output = module.runMethod("testNonContiguous"); assertTrue(output.isTensor()); Tensor value = output.toTensor(); @@ -316,7 +316,7 @@ public void testChannelsLast() throws IOException { long[] inputShape = new long[] {1, 3, 2, 2}; long[] data = new long[] {1, 11, 101, 2, 12, 102, 3, 13, 103, 4, 14, 104}; Tensor inputNHWC = Tensor.fromBlob(data, inputShape, MemoryFormat.CHANNELS_LAST); - final Module 
module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); final IValue outputNCHW = module.runMethod("contiguous", IValue.from(inputNHWC)); assertIValueTensor( outputNCHW, @@ -334,7 +334,7 @@ public void testChannelsLast3d() throws IOException { long[] dataNHWDC = new long[] {1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16}; Tensor inputNHWDC = Tensor.fromBlob(dataNHWDC, shape, MemoryFormat.CHANNELS_LAST_3D); - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); final IValue outputNCHWD = module.runMethod("contiguous", IValue.from(inputNHWDC)); assertIValueTensor(outputNCHWD, MemoryFormat.CONTIGUOUS, shape, dataNCHWD); @@ -358,7 +358,7 @@ public void testChannelsLastConv2d() throws IOException { long[] dataWeightOHWI = new long[] {2, 0, 0, 0, 1, 0, 0, 0, -1}; Tensor wNHWC = Tensor.fromBlob(dataWeightOHWI, weightShape, MemoryFormat.CHANNELS_LAST); - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); final IValue outputNCHW = module.runMethod("conv2d", IValue.from(inputNCHW), IValue.from(wNCHW), IValue.from(false)); @@ -377,6 +377,186 @@ public void testChannelsLastConv2d() throws IOException { new long[] {2, 11, -101, 4, 12, -102, 6, 13, -103, 8, 14, -104}); } + @Test + public void testMobileNetV2() throws IOException { + try { + final Module module = loadModel("mobilenet_v2.ptl"); + final IValue inputs = module.runMethod("get_all_bundled_inputs"); + assertTrue(inputs.isList()); + final IValue input = inputs.toList()[0]; + assertTrue(input.isTuple()); + module.forward(input.toTuple()[0]); + assertTrue(true); + } catch (Exception ex) { + assertTrue("failed to run MobileNetV2 " + ex.getMessage(), false); + } + } + + @Test + public void testPointwiseOps() throws IOException { + runModel("pointwise_ops"); + } + + @Test + public void testReductionOps() throws IOException { + runModel("reduction_ops"); + } + + @Test + public void testComparisonOps() throws IOException { + runModel("comparison_ops"); + } + + @Test + public void testOtherMathOps() throws IOException { + runModel("other_math_ops"); + } + + @Test + public void testSpectralOps() throws IOException { + runModel("spectral_ops"); + } + + @Test + public void testBlasLapackOps() throws IOException { + runModel("blas_lapack_ops"); + } + + @Test + public void testSamplingOps() throws IOException { + runModel("sampling_ops"); + } + + @Test + public void testTensorOps() throws IOException { + runModel("tensor_general_ops"); + } + + @Test + public void testTensorCreationOps() throws IOException { + runModel("tensor_creation_ops"); + } + + @Test + public void testTensorIndexingOps() throws IOException { + runModel("tensor_indexing_ops"); + } + + @Test + public void testTensorTypingOps() throws IOException { + runModel("tensor_typing_ops"); + } + + @Test + public void testTensorViewOps() throws IOException { + runModel("tensor_view_ops"); + } + + @Test + public void testConvolutionOps() throws IOException { + runModel("convolution_ops"); + } + + @Test + public void testPoolingOps() throws IOException { + runModel("pooling_ops"); + } + + @Test + public void testPaddingOps() throws IOException { + runModel("padding_ops"); + } + + @Test + public void testActivationOps() throws IOException { + runModel("activation_ops"); + } + + @Test + public void testNormalizationOps() throws IOException { + runModel("normalization_ops"); 
+ } + + @Test + public void testRecurrentOps() throws IOException { + runModel("recurrent_ops"); + } + + @Test + public void testTransformerOps() throws IOException { + runModel("transformer_ops"); + } + + @Test + public void testLinearOps() throws IOException { + runModel("linear_ops"); + } + + @Test + public void testDropoutOps() throws IOException { + runModel("dropout_ops"); + } + + @Test + public void testSparseOps() throws IOException { + runModel("sparse_ops"); + } + + @Test + public void testDistanceFunctionOps() throws IOException { + runModel("distance_function_ops"); + } + + @Test + public void testLossFunctionOps() throws IOException { + runModel("loss_function_ops"); + } + + @Test + public void testVisionFunctionOps() throws IOException { + runModel("vision_function_ops"); + } + + @Test + public void testShuffleOps() throws IOException { + runModel("shuffle_ops"); + } + + @Test + public void testNNUtilsOps() throws IOException { + runModel("nn_utils_ops"); + } + + @Test + public void testQuantOps() throws IOException { + runModel("general_quant_ops"); + } + + @Test + public void testDynamicQuantOps() throws IOException { + runModel("dynamic_quant_ops"); + } + + @Test + public void testStaticQuantOps() throws IOException { + runModel("static_quant_ops"); + } + + @Test + public void testFusedQuantOps() throws IOException { + runModel("fused_quant_ops"); + } + + @Test + public void testTorchScriptBuiltinQuantOps() throws IOException { + runModel("torchscript_builtin_ops"); + } + + @Test + public void testTorchScriptCollectionQuantOps() throws IOException { + runModel("torchscript_collection_ops"); + } + static void assertIValueTensor( final IValue ivalue, final MemoryFormat memoryFormat, @@ -389,5 +569,15 @@ static void assertIValueTensor( assertArrayEquals(expectedData, t.getDataAsLongArray()); } - protected abstract String assetFilePath(String assetName) throws IOException; + void runModel(final String name) throws IOException { + final Module storage_module = loadModel(name + ".ptl"); + storage_module.forward(); + + // TODO enable this once the on-the-fly script is ready + // final Module on_the_fly_module = loadModel(name + "_temp.ptl"); + // on_the_fly_module.forward(); + assertTrue(true); + } + + protected abstract Module loadModel(String assetName) throws IOException; } diff --git a/android/pytorch_android/src/androidTest/java/org/pytorch/suite/PytorchLiteInstrumentedTestSuite.java b/android/pytorch_android/src/androidTest/java/org/pytorch/suite/PytorchLiteInstrumentedTestSuite.java new file mode 100644 index 000000000000..a494ffc663ff --- /dev/null +++ b/android/pytorch_android/src/androidTest/java/org/pytorch/suite/PytorchLiteInstrumentedTestSuite.java @@ -0,0 +1,9 @@ +package org.pytorch.suite; + +import org.junit.runner.RunWith; +import org.junit.runners.Suite; +import org.pytorch.PytorchLiteInstrumentedTests; + +@RunWith(Suite.class) +@Suite.SuiteClasses({PytorchLiteInstrumentedTests.class}) +public class PytorchLiteInstrumentedTestSuite {} diff --git a/android/pytorch_android/src/main/cpp/pytorch_jni_common.cpp b/android/pytorch_android/src/main/cpp/pytorch_jni_common.cpp index 8094f7bdc974..5ed0c9978e83 100644 --- a/android/pytorch_android/src/main/cpp/pytorch_jni_common.cpp +++ b/android/pytorch_android/src/main/cpp/pytorch_jni_common.cpp @@ -223,7 +223,8 @@ class TensorHybrid : public facebook::jni::HybridClass { } else { facebook::jni::throwNewJavaException( facebook::jni::gJavaLangIllegalArgumentException, - "at::Tensor scalar type is not supported on java 
side"); + "at::Tensor scalar type %s is not supported on java side", + c10::toString(scalarType)); } const auto& tensorShape = tensor.sizes(); diff --git a/android/pytorch_android/src/main/java/org/pytorch/Tensor.java b/android/pytorch_android/src/main/java/org/pytorch/Tensor.java index 7e0f6a41d868..83a7c021bf6a 100644 --- a/android/pytorch_android/src/main/java/org/pytorch/Tensor.java +++ b/android/pytorch_android/src/main/java/org/pytorch/Tensor.java @@ -23,7 +23,7 @@ * methods. * *

When constructing {@code Tensor} objects with {@code data} as an array, it is not specified - * whether this data is is copied or retained as a reference so it is recommended not to modify it + * whether this data is copied or retained as a reference so it is recommended not to modify it * after constructing. {@code data} passed as a {@link Buffer} is not copied, so it can be modified * between {@link Module} calls to avoid reallocation. Data retrieved from {@code Tensor} objects * may be copied or may be a reference to the {@code Tensor}'s internal data buffer. {@code shape} diff --git a/android/run_tests.sh b/android/run_tests.sh index a96177f072b7..839ee209c7b7 100755 --- a/android/run_tests.sh +++ b/android/run_tests.sh @@ -48,4 +48,9 @@ fi echo "Waiting for emulator boot completed" $ADB_PATH wait-for-device shell 'while [[ -z $(getprop sys.boot_completed) ]]; do sleep 1; done;' -$GRADLE_PATH -PABI_FILTERS=x86 -p $PYTORCH_ANDROID_DIR connectedAndroidTest +{ + $GRADLE_PATH -PABI_FILTERS=x86 -p $PYTORCH_ANDROID_DIR connectedAndroidTest +} || { + echo "::error::Check https://github.com/pytorch/pytorch/tree/master/test/mobile/model_test to see how to fix the failed mobile test" + exit 1 +} diff --git a/aten.bzl b/aten.bzl index eccdb4b4d0cd..c97f22284f10 100644 --- a/aten.bzl +++ b/aten.bzl @@ -1,5 +1,6 @@ load("@bazel_skylib//lib:paths.bzl", "paths") load("@rules_cc//cc:defs.bzl", "cc_library") +load("//:tools/build_variables.bzl", "aten_ufunc_headers") CPU_CAPABILITY_NAMES = ["DEFAULT", "AVX2"] CAPABILITY_COMPILER_FLAGS = { @@ -8,8 +9,9 @@ CAPABILITY_COMPILER_FLAGS = { } PREFIX = "aten/src/ATen/native/" +EXTRA_PREFIX = "aten/src/ATen/" -def intern_build_aten_ops(copts, deps): +def intern_build_aten_ops(copts, deps, extra_impls): for cpu_capability in CPU_CAPABILITY_NAMES: srcs = [] for impl in native.glob( @@ -28,6 +30,17 @@ def intern_build_aten_ops(copts, deps): ) srcs.append(out) + for impl in extra_impls: + name = impl.replace(EXTRA_PREFIX, "") + out = EXTRA_PREFIX + name + "." + cpu_capability + ".cpp" + native.genrule( + name = name + "_" + cpu_capability + "_cp", + srcs = [impl], + outs = [out], + cmd = "cp $< $@", + ) + srcs.append(out) + cc_library( name = "ATen_CPU_" + cpu_capability, srcs = srcs, @@ -81,3 +94,32 @@ generate_aten = rule( "srcs": attr.label_list(allow_files = True), }, ) + +# copy pasted from ufunc_defs.bzl, as ufuncs_defs.bzl cannot be included +# from BUILD.bazel because it has a directory relative load, and Bazel +# always load from workspace root. The "correct" fix would be to move +# build_variables.bzl to the top level but I don't have time to do this at +# the moment. 
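+#
+# Illustrative usage sketch (hypothetical gencode prefix and variable name) for
+# the helpers defined below, e.g. from a BUILD file:
+#
+#   load("//:aten.bzl", "aten_ufunc_generated_cpu_sources")
+#   ufunc_cpu_srcs = aten_ufunc_generated_cpu_sources("aten/src/ATen/{}")
+#
+# Each entry in aten_ufunc_headers (say "add.h") contributes a generated source
+# name such as "UfuncCPU_add.cpp", which is then formatted through the
+# caller-supplied gencode_pattern.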
+ +aten_ufunc_names = [ + paths.split_extension(paths.basename(h))[0] + for h in aten_ufunc_headers +] + +def aten_ufunc_generated_cpu_sources(gencode_pattern = "{}"): + return [gencode_pattern.format(name) for name in [ + "UfuncCPU_{}.cpp".format(n) + for n in aten_ufunc_names + ]] + +def aten_ufunc_generated_cpu_kernel_sources(gencode_pattern = "{}"): + return [gencode_pattern.format(name) for name in [ + "UfuncCPUKernel_{}.cpp".format(n) + for n in aten_ufunc_names + ]] + +def aten_ufunc_generated_cuda_sources(gencode_pattern = "{}"): + return [gencode_pattern.format(name) for name in [ + "UfuncCUDA_{}.cu".format(n) + for n in aten_ufunc_names + ]] diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt index 9344f7e9b870..9c3757f346cd 100644 --- a/aten/CMakeLists.txt +++ b/aten/CMakeLists.txt @@ -23,6 +23,7 @@ set(ATen_CPU_INCLUDE) set(ATen_THIRD_PARTY_INCLUDE) set(ATen_CUDA_CPP_SRCS) set(ATen_CUDA_CU_SRCS) +set(ATen_CUDA_LINALG_SRCS) set(ATen_CUDA_SRCS_W_SORT_BY_KEY) set(ATen_CUDA_TEST_SRCS) set(ATen_CUDA_INCLUDE) @@ -31,6 +32,7 @@ set(ATen_HIP_SRCS) set(ATen_HIP_SRCS_W_SORT_BY_KEY) set(ATen_HIP_TEST_SRCS) set(ATen_HIP_INCLUDE) +set(ATen_MPS_SRCS) set(ATen_VULKAN_TEST_SRCS) set(ATen_CPU_DEPENDENCY_LIBS) set(ATen_CUDA_DEPENDENCY_LIBS) @@ -99,9 +101,11 @@ set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) set(ATen_CORE_SRCS ${ATen_CORE_SRCS} PARENT_SCOPE) set(ATen_CUDA_CU_SRCS ${ATen_CUDA_CU_SRCS} PARENT_SCOPE) set(ATen_CUDA_CPP_SRCS ${ATen_CUDA_CPP_SRCS} PARENT_SCOPE) +set(ATen_CUDA_LINALG_SRCS ${ATen_CUDA_LINALG_SRCS} PARENT_SCOPE) set(ATen_CUDA_SRCS_W_SORT_BY_KEY ${ATen_CUDA_SRCS_W_SORT_BY_KEY} PARENT_SCOPE) set(ATen_CUDA_CU_SRCS_W_SORT_BY_KEY ${ATen_CUDA_CU_SRCS_W_SORT_BY_KEY} PARENT_SCOPE) set(ATen_HIP_SRCS ${ATen_HIP_SRCS} PARENT_SCOPE) +set(ATen_MPS_SRCS ${ATen_MPS_SRCS} PARENT_SCOPE) set(ATen_HIP_SRCS_W_SORT_BY_KEY ${ATen_HIP_SRCS_W_SORT_BY_KEY} PARENT_SCOPE) set(ATen_NVRTC_STUB_SRCS ${ATen_NVRTC_STUB_SRCS} PARENT_SCOPE) set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE) diff --git a/aten/src/ATen/BatchedTensorImpl.cpp b/aten/src/ATen/BatchedTensorImpl.cpp index 2b4898412aec..d5ab588de53d 100644 --- a/aten/src/ATen/BatchedTensorImpl.cpp +++ b/aten/src/ATen/BatchedTensorImpl.cpp @@ -17,7 +17,7 @@ BatchedTensorImpl::BatchedTensorImpl(Tensor value, BatchDims bdims) { TORCH_INTERNAL_ASSERT(value_.defined()); set_storage_access_should_throw(); - set_has_contiguity_policy(HasContiguityPolicy::CustomBehavior); + set_sizes_strides_policy(SizesStridesPolicy::CustomStrides); checkInvariants(); const auto public_dims = value_.dim() - bdims_.size(); @@ -77,6 +77,13 @@ void BatchedTensorImpl::checkInvariants() const { } // The following are publically exposed as methods of Tensor + +IntArrayRef BatchedTensorImpl::strides_custom() const { + return strides_default(); +} + +// TODO: implement proper contiguity on batched tensor, then put +// sizes_strides_policy back to Default bool BatchedTensorImpl::is_contiguous_custom(at::MemoryFormat memory_format) const { TORCH_CHECK(memory_format == MemoryFormat::Contiguous, "NYI: querying is_contiguous inside of vmap for memory_format ", diff --git a/aten/src/ATen/BatchedTensorImpl.h b/aten/src/ATen/BatchedTensorImpl.h index ce59fcd20947..0c025aa01b35 100644 --- a/aten/src/ATen/BatchedTensorImpl.h +++ b/aten/src/ATen/BatchedTensorImpl.h @@ -72,6 +72,8 @@ struct TORCH_API BatchedTensorImpl : public c10::TensorImpl { // bt.actualDim(2) -> Error int64_t actualDim(int64_t dim, bool wrap_dim = true) const; + // We have to override this because we opted 
into CustomStrides + IntArrayRef strides_custom() const override; // Override a bunch of methods inherited from TensorImpl to return error messages. bool is_contiguous_custom(at::MemoryFormat memory_format) const override; void set_size(int64_t dim, int64_t new_size) override; diff --git a/aten/src/ATen/BatchingRegistrations.cpp b/aten/src/ATen/BatchingRegistrations.cpp index 0eb0d697078e..b1b082a4f88a 100644 --- a/aten/src/ATen/BatchingRegistrations.cpp +++ b/aten/src/ATen/BatchingRegistrations.cpp @@ -181,6 +181,11 @@ Tensor expand_batching_rule(const Tensor& self, IntArrayRef size, bool implicit) return self_physical.getPhysicalToLogicalMap().apply(result); } +Tensor expand_batching_rule_symint(const Tensor& self, SymIntArrayRef psize, bool implicit) { + return expand_batching_rule(self, expectIntArrayRef(psize), implicit); +} + + std::vector chunk_batching_rule(const Tensor& self, int64_t chunks, int64_t dim) { auto self_physical = MultiBatchVmapTransform::logicalToPhysical(self); auto dim_physical = self_physical.getPhysicalDim(dim); @@ -1088,6 +1093,7 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) { m.impl("tensor_split.indices", tensor_split_indices_batching_rule); m.impl("diagonal", diagonal_batching_rule); m.impl("expand", expand_batching_rule); + m.impl("expand.SymInt", expand_batching_rule_symint); m.impl("expand_as", native::expand_as); // composite wrt autograd m.impl("movedim.intlist", movedim_batching_rule); m.impl("movedim.int", static_cast(native::movedim)); // composite wrt autograd @@ -1105,6 +1111,7 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) { m.impl("select.int", select_batching_rule); m.impl("slice.Tensor", slice_batching_rule); m.impl("split.Tensor", split_batching_rule); + m.impl("split.sizes", split_with_sizes_batching_rule); m.impl("split_with_sizes", split_with_sizes_batching_rule); m.impl("squeeze", squeeze_batching_rule); m.impl("squeeze.dim", squeeze_dim_batching_rule); diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index f259e345f96d..63a4cd76c2bb 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -100,10 +100,20 @@ file(GLOB native_ao_sparse_cpp "native/ao_sparse/cpu/*.cpp" "native/ao_sparse/quantized/*.cpp" "native/ao_sparse/quantized/cpu/*.cpp") +# MPS +file(GLOB mps_cpp "mps/*.cpp") +file(GLOB mps_mm "mps/*.mm") +file(GLOB mps_h "mps/*.h") +file(GLOB_RECURSE native_mps_cpp "native/mps/*.cpp") +file(GLOB_RECURSE native_mps_mm "native/mps/*.mm") +file(GLOB_RECURSE native_mps_h "native/mps/*.h") + file(GLOB native_sparse_cpp "native/sparse/*.cpp") file(GLOB native_quantized_cpp "native/quantized/*.cpp" "native/quantized/cpu/*.cpp") +file(GLOB native_nested_cpp "native/nested/*.cpp") +file(GLOB native_transformers_cpp "native/transformers/*.cpp") file(GLOB native_h "native/*.h") file(GLOB native_ao_sparse_h @@ -120,21 +130,30 @@ file(GLOB native_cuda_h "native/cuda/*.h" "native/cuda/*.cuh") file(GLOB native_cuda_linalg_cpp "native/cuda/linalg/*.cpp") file(GLOB native_hip_h "native/hip/*.h" "native/hip/*.cuh") file(GLOB native_cudnn_cpp "native/cudnn/*.cpp") +file(GLOB native_nested_cuda_cu "native/nested/cuda/*.cu") +file(GLOB native_nested_cuda_cpp "native/nested/cuda/*.cpp") file(GLOB native_sparse_cuda_cu "native/sparse/cuda/*.cu") file(GLOB native_sparse_cuda_cpp "native/sparse/cuda/*.cpp") file(GLOB native_quantized_cuda_cu "native/quantized/cuda/*.cu") file(GLOB native_quantized_cuda_cpp "native/quantized/cuda/*.cpp") file(GLOB native_quantized_cudnn_cpp "native/quantized/cudnn/*.cpp") +file(GLOB 
native_transformers_cuda_cu "native/transformers/cuda/*.cu") +file(GLOB native_transformers_cuda_cpp "native/transformers/cuda/*.cpp") file(GLOB native_hip_hip "native/hip/*.hip") file(GLOB native_hip_cpp "native/hip/*.cpp") file(GLOB native_hip_linalg_cpp "native/hip/linalg/*.cpp") file(GLOB native_miopen_cpp "native/miopen/*.cpp") file(GLOB native_cudnn_hip_cpp "native/cudnn/hip/*.cpp") +file(GLOB native_nested_hip_hip "native/nested/hip/*.hip") +file(GLOB native_nested_hip_cpp "native/nested/hip/*.cpp") file(GLOB native_sparse_hip_hip "native/sparse/hip/*.hip") file(GLOB native_sparse_hip_cpp "native/sparse/hip/*.cpp") file(GLOB native_quantized_hip_hip "native/quantized/hip/*.hip") file(GLOB native_quantized_hip_cpp "native/quantized/hip/*.cpp") +file(GLOB native_transformers_hip_hip "native/transformers/hip/*.hip") +file(GLOB native_transformers_hip_cpp "native/transformers/hip/*.cpp") +file(GLOB native_quantized_cudnn_hip_cpp "native/quantized/cudnn/hip/*.cpp") file(GLOB native_utils_cpp "native/utils/*.cpp") # XNNPACK @@ -155,13 +174,17 @@ if(BUILD_LITE_INTERPRETER) else() set( all_cpu_cpp ${base_cpp} ${ATen_CORE_SRCS} ${native_cpp} - ${native_ao_sparse_cpp} ${native_sparse_cpp} + ${native_ao_sparse_cpp} ${native_sparse_cpp} ${native_nested_cpp} ${native_quantized_cpp} ${native_mkl_cpp} ${native_mkldnn_cpp} + ${native_transformers_cpp} ${native_utils_cpp} ${native_xnnpack} ${generated_sources} ${core_generated_sources} ${ATen_CPU_SRCS} ${ATen_QUANTIZED_SRCS} ${ATen_NNAPI_SRCS} ${cpu_kernel_cpp} ) endif() +if(USE_LIGHTWEIGHT_DISPATCH) + set(all_cpu_cpp ${all_cpu_cpp} ${generated_unboxing_sources}) +endif() if(AT_MKL_ENABLED) set(all_cpu_cpp ${all_cpu_cpp} ${mkl_cpp}) endif() @@ -194,9 +217,10 @@ if(USE_CUDA) list(APPEND ATen_CUDA_CU_SRCS ${cuda_cu} ${native_cuda_cu} - ${native_cuda_linalg_cpp} + ${native_nested_cuda_cu} ${native_sparse_cuda_cu} ${native_quantized_cuda_cu} + ${native_transformers_cuda_cu} ${cuda_generated_sources} ) list(APPEND ATen_CUDA_CPP_SRCS @@ -204,10 +228,16 @@ if(USE_CUDA) ${native_cuda_cpp} ${native_cudnn_cpp} ${native_miopen_cpp} + ${native_nested_cuda_cpp} ${native_quantized_cuda_cpp} ${native_quantized_cudnn_cpp} ${native_sparse_cuda_cpp} + ${native_transformers_cuda_cpp} ) + set(ATen_CUDA_LINALG_SRCS ${native_cuda_linalg_cpp}) + if(NOT BUILD_LAZY_CUDA_LINALG) + list(APPEND ATen_CUDA_CU_SRCS ${native_cuda_linalg_cpp}) + endif() if(CAFFE2_USE_CUDNN) list(APPEND ATen_CUDA_CPP_SRCS ${cudnn_cpp}) endif() @@ -225,9 +255,9 @@ endif() if(USE_ROCM) list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/hip) - set(ATen_HIP_SRCS ${ATen_HIP_SRCS} ${hip_hip} ${native_hip_hip} ${native_sparse_hip_hip} ${native_quantized_hip_hip}) + set(ATen_HIP_SRCS ${ATen_HIP_SRCS} ${hip_hip} ${native_hip_hip} ${native_nested_hip_hip} ${native_sparse_hip_hip} ${native_quantized_hip_hip} ${native_transformers_hip_hip}) # TODO: Codegen separate files for HIP and use those (s/cuda_generated_sources/hip_generated_sources) - set(all_hip_cpp ${native_sparse_hip_cpp} ${native_quantized_hip_cpp} ${hip_cpp} ${native_hip_cpp} ${native_hip_linalg_cpp} ${cuda_generated_sources} ${ATen_HIP_SRCS}) + set(all_hip_cpp ${native_nested_hip_cpp} ${native_sparse_hip_cpp} ${native_quantized_hip_cpp} ${native_transformers_hip_cpp} ${native_quantized_cudnn_hip_cpp} ${hip_cpp} ${native_hip_cpp} ${native_hip_linalg_cpp} ${cuda_generated_sources} ${ATen_HIP_SRCS}) set(all_hip_cpp ${native_miopen_cpp} ${native_cudnn_hip_cpp} ${miopen_cpp} ${all_hip_cpp}) endif() @@ -392,16 +422,24 @@ if(USE_CUDA AND NOT 
USE_ROCM) ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcurand_static.a ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcublas_static.a ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcufft_static_nocallback.a - ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcusolver_static.a - ${CUDA_TOOLKIT_ROOT_DIR}/lib64/liblapack_static.a # needed for libcusolver_static ) + if(NOT BUILD_LAZY_CUDA_LINALG) + list(APPEND ATen_CUDA_DEPENDENCY_LIBS + ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcusolver_static.a + ${CUDA_TOOLKIT_ROOT_DIR}/lib64/liblapack_static.a # needed for libcusolver_static + ) + endif() else() list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${CUDA_LIBRARIES} ${CUDA_cusparse_LIBRARY} ${CUDA_curand_LIBRARY} - ${CUDA_cusolver_LIBRARY} ) + if(NOT BUILD_LAZY_CUDA_LINALG) + list(APPEND ATen_CUDA_DEPENDENCY_LIBS + ${CUDA_cusolver_LIBRARY} + ) + endif() endif() if(CAFFE2_USE_CUDNN) @@ -415,9 +453,9 @@ endif() if(USE_MAGMA) - if(USE_CUDA) + if(USE_CUDA AND NOT BUILD_LAZY_CUDA_LINALG) list(APPEND ATen_CUDA_DEPENDENCY_LIBS torch::magma) - endif(USE_CUDA) + endif(USE_CUDA AND NOT BUILD_LAZY_CUDA_LINALG) if(USE_ROCM) list(APPEND ATen_HIP_DEPENDENCY_LIBS torch::magma) endif(USE_ROCM) @@ -455,6 +493,10 @@ if(USE_CUDA) list(APPEND ATen_CUDA_DEPENDENCY_LIBS ATEN_CUDA_FILES_GEN_LIB) endif() +if(USE_MPS) + set(ATen_MPS_SRCS ${ATen_MPS_SRCS} ${mps_cpp} ${mps_mm} ${mps_h} ${native_mps_cpp} ${native_mps_mm} ${native_mps_h}) +endif() + if(USE_ROCM) set(ATen_HIP_SRCS ${all_hip_cpp}) # caffe2_nvrtc's stubs to driver APIs are useful for HIP. @@ -536,10 +578,12 @@ set(ATen_CORE_SRCS ${ATen_CORE_SRCS} PARENT_SCOPE) set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) set(ATen_CUDA_CU_SRCS ${ATen_CUDA_CU_SRCS} PARENT_SCOPE) set(ATen_CUDA_CPP_SRCS ${ATen_CUDA_CPP_SRCS} PARENT_SCOPE) +set(ATen_CUDA_LINALG_SRCS ${ATen_CUDA_LINALG_SRCS} PARENT_SCOPE) set(ATen_CUDA_SRCS_W_SORT_BY_KEY ${ATen_CUDA_SRCS_W_SORT_BY_KEY} PARENT_SCOPE) set(ATen_CUDA_CU_SRCS_W_SORT_BY_KEY ${ATen_CUDA_CU_SRCS_W_SORT_BY_KEY} PARENT_SCOPE) set(ATen_NVRTC_STUB_SRCS ${ATen_NVRTC_STUB_SRCS} PARENT_SCOPE) set(ATen_HIP_SRCS ${ATen_HIP_SRCS} PARENT_SCOPE) +set(ATen_MPS_SRCS ${ATen_MPS_SRCS} PARENT_SCOPE) set(ATen_QUANTIZED_SRCS ${ATen_QUANTIZED_SRCS} PARENT_SCOPE) set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE) set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE) diff --git a/aten/src/ATen/CPUApplyUtils.h b/aten/src/ATen/CPUApplyUtils.h index 3c0aa1c6bfc9..b0d15988b95c 100644 --- a/aten/src/ATen/CPUApplyUtils.h +++ b/aten/src/ATen/CPUApplyUtils.h @@ -69,6 +69,7 @@ struct strided_tensor_iter_fixed { strided_tensor_iter_fixed(strided_tensor_iter_fixed&&) = default; strided_tensor_iter_fixed(Tensor& tensor, bool sort_strides = false) : data_(tensor.data_ptr()) { + (void)sort_strides; // Suppress unused variable warning std::memset(counter_, 0, sizeof(int64_t) * N); if (tensor.dim() > 0) { std::memcpy( @@ -152,7 +153,7 @@ inline int64_t _max_dim_tensors(ArrayRef tensors) { return dim; } -inline void iterate(int64_t size){}; +inline void iterate(int64_t /*size*/){}; template inline void iterate(int64_t size, Arg& iter, Args&... iter_tail) { @@ -199,7 +200,7 @@ inline void iterate_overflow(Arg& iter, Args&... iter_tail) { iterate_overflow(iter_tail...); } -inline void forward(int64_t offset){}; +inline void forward(int64_t /*offset*/){}; template inline void forward(int64_t offset, Arg& iter, Args&... 
iter_tail) { diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index 98590b266be4..afbb2ee7200a 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -4,7 +4,10 @@ #include #include +#include +#include +#include #include #include #include @@ -138,11 +141,44 @@ void Context::setBenchmarkCuDNN(bool b) { } bool Context::allowTF32CuBLAS() const { - return allow_tf32_cublas; + static bool allow_tf32_cublas_override = c10::utils::check_env("TORCH_ALLOW_TF32_CUBLAS_OVERRIDE") == true; + return allow_tf32_cublas_override || float32_matmul_precision != at::Float32MatmulPrecision::HIGHEST; } void Context::setAllowTF32CuBLAS(bool b) { - allow_tf32_cublas = b; + float32_matmul_precision = b ? at::Float32MatmulPrecision::HIGH : at::Float32MatmulPrecision::HIGHEST; +} + +Float32MatmulPrecision Context::float32MatmulPrecision() const { + return float32_matmul_precision; +} + +void Context::setFloat32MatmulPrecision(Float32MatmulPrecision p) { + float32_matmul_precision = p; +} + +void Context::setFloat32MatmulPrecision(const std::string &s) { + auto match = [this](const std::string & s_) { + // TODO: consider if CuDNN field needs to also be set for potential future CuDNN ops like multi-headed attention + if (s_ == "highest") { + float32_matmul_precision = at::Float32MatmulPrecision::HIGHEST; + return true; + } else if (s_ == "high") { + float32_matmul_precision = at::Float32MatmulPrecision::HIGH; + return true; + } else if (s_ == "medium") { + float32_matmul_precision = at::Float32MatmulPrecision::MEDIUM; + return true; + } + return false; + }; + if (match(s)) { return; } + std::string sl; + std::transform(s.begin(), s.end(), sl.begin(), + [](unsigned char c) -> unsigned char { return std::tolower(c); }); + if (match(sl)) { return; } + TORCH_WARN(s, " is not one of 'highest', 'high', or 'medium'; the current" + "setFloat32MatmulPrecision call has no effect."); } at::LinalgBackend Context::linalgPreferredBackend() const { @@ -188,6 +224,22 @@ bool Context::hasMKLDNN() { #endif } +bool Context::hasMPS() { +#if defined(__APPLE__) +#if __is_target_os(macOS) + if (__builtin_available(macOS 12.3, *)) { + return c10::impl::hasDeviceGuardImpl(at::DeviceType::MPS); + } else { + return false; + } +#else + return false; +#endif +#else + return false; +#endif +} + bool Context::hasOpenMP() { #ifdef _OPENMP return true; @@ -236,6 +288,10 @@ const std::vector& Context::supportedQEngines() { engines.push_back(at::kNoQEngine); #endif // C10_MOBILE +#if AT_MKLDNN_ENABLED() + engines.push_back(at::kONEDNN); +#endif + #ifdef USE_FBGEMM if (fbgemm::fbgemmSupportedCPU()) { engines.push_back(at::kFBGEMM); @@ -293,6 +349,26 @@ bool NoTF32Guard::should_disable_tf32() { return override_allow_tf32_flag; } +#ifdef USE_ROCM +// Ops can query this flag to know they are in the backward pass. +// This information can be used, for example, to select implementations +// with different numerical or performance characteristics. +// See https://pytorch.org/docs/stable/notes/numerical_accuracy.html for details. 
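+// A minimal usage sketch (hypothetical call site): a ROCm-only code path that
+// wants backward-specific behaviour could check
+//
+//   #ifdef USE_ROCM
+//   if (at::ROCmBackwardPassGuard::is_backward_pass()) { /* backward-tuned path */ }
+//   #endif
+//
+// while whatever drives the backward pass constructs a ROCmBackwardPassGuard
+// for the duration of that pass so the thread-local flag is set.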
+thread_local bool ROCmBackwardPassGuard::is_backward_pass_; + +ROCmBackwardPassGuard::ROCmBackwardPassGuard() { + is_backward_pass_ = true; +} + +ROCmBackwardPassGuard::~ROCmBackwardPassGuard() { + is_backward_pass_ = false; +} + +bool ROCmBackwardPassGuard::is_backward_pass() { + return is_backward_pass_; +} +#endif + bool Context::areVmapFallbackWarningsEnabled() const { return display_vmap_fallback_warnings_; } diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 88cbc3ec0bb3..d4840c292643 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -22,6 +22,8 @@ namespace at { class Tensor; +enum class TORCH_API Float32MatmulPrecision {HIGHEST, HIGH, MEDIUM}; + class TORCH_API Context { public: Context(); @@ -80,15 +82,17 @@ class TORCH_API Context { static bool hasHIP() { return detail::getHIPHooks().hasHIP(); } + static bool hasIPU() { + return c10::impl::hasDeviceGuardImpl(at::DeviceType::IPU); + } static bool hasXLA() { return c10::impl::hasDeviceGuardImpl(at::DeviceType::XLA); } static bool hasLazy() { return c10::impl::hasDeviceGuardImpl(at::DeviceType::Lazy); } - static bool hasMLC() { - return c10::impl::hasDeviceGuardImpl(at::DeviceType::MLC); - } + static bool hasMPS(); + static bool hasORT() { return c10::impl::hasDeviceGuardImpl(at::DeviceType::ORT); } @@ -202,10 +206,13 @@ class TORCH_API Context { // https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility void alertCuBLASConfigNotDeterministic() const; + void setFloat32MatmulPrecision(const std::string & s); bool allowTF32CuDNN() const; void setAllowTF32CuDNN(bool); bool allowTF32CuBLAS() const; void setAllowTF32CuBLAS(bool); + Float32MatmulPrecision float32MatmulPrecision() const; + void setFloat32MatmulPrecision(Float32MatmulPrecision p); bool allowFP16ReductionCuBLAS() const; void setAllowFP16ReductionCuBLAS(bool); at::QEngine qEngine() const; @@ -243,8 +250,8 @@ class TORCH_API Context { bool _deterministic_algorithms = false; bool _deterministic_algorithms_warn_only = false; bool benchmark_cudnn = false; + Float32MatmulPrecision float32_matmul_precision = at::Float32MatmulPrecision::HIGHEST; bool allow_tf32_cudnn = true; - bool allow_tf32_cublas = true; bool allow_fp16_reduction_cublas = true; bool enabled_mkldnn = true; at::LinalgBackend linalg_preferred_backend = at::LinalgBackend::Default; @@ -287,6 +294,11 @@ static inline DeprecatedTypeProperties& HIP(ScalarType s) { Backend::HIP, s); } +static inline DeprecatedTypeProperties& MPS(ScalarType s) { + return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties( + Backend::MPS, s); +} + static inline bool hasCUDA() { return globalContext().hasCUDA(); } @@ -295,12 +307,16 @@ static inline bool hasHIP() { return globalContext().hasHIP(); } +static inline bool hasIPU() { + return globalContext().hasIPU(); +} + static inline bool hasXLA() { return globalContext().hasXLA(); } -static inline bool hasMLC() { - return globalContext().hasMLC(); +static inline bool hasMPS() { + return globalContext().hasMPS(); } static inline bool hasORT() { @@ -387,4 +403,14 @@ struct TORCH_API NoTF32Guard { bool changed = false; }; +#ifdef USE_ROCM +struct TORCH_API ROCmBackwardPassGuard { + ROCmBackwardPassGuard(); + ~ROCmBackwardPassGuard(); + static bool is_backward_pass(); +private: + static thread_local bool is_backward_pass_; +}; +#endif + } // namespace at diff --git a/aten/src/ATen/Dispatch.cpp b/aten/src/ATen/Dispatch.cpp index 297b25b54ead..73e54b319cb6 100644 --- a/aten/src/ATen/Dispatch.cpp +++ 
b/aten/src/ATen/Dispatch.cpp @@ -7,7 +7,7 @@ void record_kernel_function_dtype(std::string name) { RECORD_FUNCTION_WITH_SCOPE( at::RecordScope::KERNEL_FUNCTION_DTYPE, name, - {}); + c10::ArrayRef{}); } }} // namespace at::detail diff --git a/aten/src/ATen/Dispatch.h b/aten/src/ATen/Dispatch.h index 55e2036d62e2..05f31606c484 100644 --- a/aten/src/ATen/Dispatch.h +++ b/aten/src/ATen/Dispatch.h @@ -20,8 +20,8 @@ namespace at { * included in this file when code-gen is ready. */ inline constexpr bool should_include_kernel_dtype( - const char *kernel_tag_str, - at::ScalarType scalar_type + const char* /*kernel_tag_str*/, + at::ScalarType /*scalar_type*/ ) { return true; } @@ -416,6 +416,46 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} } \ }() +#define AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND3( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, TYPE, NAME, ...) \ + [&] { \ + const auto& the_type = TYPE; \ + /* don't use TYPE again in case it is an expensive or side-effect op */ \ + at::ScalarType _st = ::detail::scalar_type(the_type); \ + RECORD_KERNEL_FUNCTION_DTYPE(NAME, _st); \ + switch (_st) { \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + NAME, \ + at::ScalarType::ComplexDouble, \ + c10::complex, \ + __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + NAME, \ + at::ScalarType::ComplexFloat, \ + c10::complex, \ + __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + NAME, \ + SCALARTYPE1, \ + decltype(c10::impl::ScalarTypeToCPPType::t), \ + __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + NAME, \ + SCALARTYPE2, \ + decltype(c10::impl::ScalarTypeToCPPType::t), \ + __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + NAME, \ + SCALARTYPE3, \ + decltype(c10::impl::ScalarTypeToCPPType::t), \ + __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ + } \ + }() + #define AT_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \ [&] { \ const auto& the_type = TYPE; \ @@ -495,6 +535,33 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} } \ }() +#define AT_DISPATCH_COMPLEX_TYPES_AND(SCALARTYPE, TYPE, NAME, ...) \ + [&] { \ + const auto& the_type = TYPE; \ + /* don't use TYPE again in case it is an expensive or side-effect op */ \ + at::ScalarType _st = ::detail::scalar_type(the_type); \ + RECORD_KERNEL_FUNCTION_DTYPE(NAME, _st); \ + switch (_st) { \ + AT_PRIVATE_CASE_TYPE( \ + NAME, \ + at::ScalarType::ComplexFloat, \ + c10::complex, \ + __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + NAME, \ + at::ScalarType::ComplexDouble, \ + c10::complex, \ + __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + NAME, \ + SCALARTYPE, \ + decltype(c10::impl::ScalarTypeToCPPType::t), \ + __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ + } \ + }() + #define AT_DISPATCH_QINT_TYPES(TYPE, NAME, ...) \ [&] { \ const auto& the_type = TYPE; \ @@ -513,6 +580,22 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} } \ }() +#define AT_DISPATCH_QINT_BYTE_TYPES(TYPE, NAME, ...) 
\ + [&] { \ + const auto& the_type = TYPE; \ + /* don't use TYPE again in case it is an expensive or side-effect op */ \ + at::ScalarType _st = ::detail::scalar_type(the_type); \ + RECORD_KERNEL_FUNCTION_DTYPE(NAME, _st); \ + switch (_st) { \ + AT_QINT_PRIVATE_CASE_TYPE( \ + NAME, at::kQInt8, at::qint8, at::kChar, int8_t, __VA_ARGS__) \ + AT_QINT_PRIVATE_CASE_TYPE( \ + NAME, at::kQUInt8, at::quint8, at::kByte, uint8_t, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ + } \ + }() + #define AT_DISPATCH_QINT_AND_SUB_BYTE_TYPES(TYPE, NAME, ...) \ [&] { \ const auto& the_type = TYPE; \ @@ -753,6 +836,56 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} } \ }() +#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, TYPE, NAME, ...) \ + [&] { \ + const auto& the_type = TYPE; \ + /* don't use TYPE again in case it is an expensive or side-effect op*/ \ + at::ScalarType _st = ::detail::scalar_type(the_type); \ + RECORD_KERNEL_FUNCTION_DTYPE(NAME, _st); \ + switch (_st) { \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Char, int8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Int, int32_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Long, int64_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Short, int16_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + NAME, \ + at::ScalarType::ComplexFloat, \ + c10::complex, \ + __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + NAME, \ + at::ScalarType::ComplexDouble, \ + c10::complex, \ + __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + NAME, \ + SCALARTYPE1, \ + decltype(c10::impl::ScalarTypeToCPPType::t), \ + __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + NAME, \ + SCALARTYPE2, \ + decltype(c10::impl::ScalarTypeToCPPType::t), \ + __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + NAME, \ + SCALARTYPE3, \ + decltype(c10::impl::ScalarTypeToCPPType::t), \ + __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + NAME, \ + SCALARTYPE4, \ + decltype(c10::impl::ScalarTypeToCPPType::t), \ + __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ + } \ + }() + #define AT_DISPATCH_INDEX_TYPES(TYPE, NAME, ...) 
\ [&] { \ const auto& the_index_type = TYPE; \ diff --git a/aten/src/ATen/DynamicLibrary.cpp b/aten/src/ATen/DynamicLibrary.cpp index b6577241bcdb..f3287121b2e2 100644 --- a/aten/src/ATen/DynamicLibrary.cpp +++ b/aten/src/ATen/DynamicLibrary.cpp @@ -20,22 +20,22 @@ namespace at { static void* checkDL(void* x) { if (!x) { - AT_ERROR("Error in dlopen or dlsym: ", dlerror()); + TORCH_CHECK_WITH(DynamicLibraryError, false, "Error in dlopen or dlsym: ", dlerror()); } return x; } -DynamicLibrary::DynamicLibrary(const char* name, const char* alt_name) { +DynamicLibrary::DynamicLibrary(const char* name, const char* alt_name, bool leak_handle_): leak_handle(leak_handle_) { // NOLINTNEXTLINE(hicpp-signed-bitwise) handle = dlopen(name, RTLD_LOCAL | RTLD_NOW); if (!handle) { if (alt_name) { handle = dlopen(alt_name, RTLD_LOCAL | RTLD_NOW); if (!handle) { - AT_ERROR("Error in dlopen for library ", name, "and ", alt_name); + TORCH_CHECK_WITH(DynamicLibraryError, false, "Error in dlopen for library ", name, "and ", alt_name); } } else { - AT_ERROR("Error in dlopen: ", dlerror()); + TORCH_CHECK_WITH(DynamicLibraryError, false, "Error in dlopen: ", dlerror()); } } } @@ -46,8 +46,9 @@ void* DynamicLibrary::sym(const char* name) { } DynamicLibrary::~DynamicLibrary() { - if (!handle) + if (!handle || leak_handle) { return; + } dlclose(handle); } @@ -55,7 +56,7 @@ DynamicLibrary::~DynamicLibrary() { // Windows -DynamicLibrary::DynamicLibrary(const char* name, const char* alt_name) { +DynamicLibrary::DynamicLibrary(const char* name, const char* alt_name, bool leak_handle_): leak_handle(leak_handle_) { // NOLINTNEXTLINE(hicpp-signed-bitwise) HMODULE theModule; bool reload = true; @@ -83,7 +84,7 @@ DynamicLibrary::DynamicLibrary(const char* name, const char* alt_name) { FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, dw, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), buf, (sizeof(buf) / sizeof(char)), NULL); - AT_ERROR("error in LoadLibrary for ", name, ". WinError ", dw, ": ", buf); + TORCH_CHECK_WITH(DynamicLibraryError, false, "error in LoadLibrary for ", name, ". 
WinError ", dw, ": ", buf); } } @@ -91,13 +92,13 @@ void* DynamicLibrary::sym(const char* name) { AT_ASSERT(handle); FARPROC procAddress = GetProcAddress((HMODULE)handle, name); if (!procAddress) { - AT_ERROR("error in GetProcAddress"); + TORCH_CHECK_WITH(DynamicLibraryError, false, "error in GetProcAddress"); } return (void*)procAddress; } DynamicLibrary::~DynamicLibrary() { - if (!handle) { + if (!handle || leak_handle) { return; } FreeLibrary((HMODULE)handle); diff --git a/aten/src/ATen/DynamicLibrary.h b/aten/src/ATen/DynamicLibrary.h index 88bc0e201ced..8f65dd5b494f 100644 --- a/aten/src/ATen/DynamicLibrary.h +++ b/aten/src/ATen/DynamicLibrary.h @@ -1,20 +1,30 @@ #pragma once #include +#include #include +namespace c10 { + +class DynamicLibraryError : public Error { + using Error::Error; +}; + +} // namespace c10 + namespace at { struct DynamicLibrary { AT_DISALLOW_COPY_AND_ASSIGN(DynamicLibrary); - TORCH_API DynamicLibrary(const char* name, const char* alt_name = nullptr); + TORCH_API DynamicLibrary(const char* name, const char* alt_name = nullptr, bool leak_handle = false); TORCH_API void* sym(const char* name); TORCH_API ~DynamicLibrary(); private: + bool leak_handle; void* handle = nullptr; }; diff --git a/aten/src/ATen/EmptyTensor.cpp b/aten/src/ATen/EmptyTensor.cpp index 5e21a2f52d18..5a72a09d1841 100644 --- a/aten/src/ATen/EmptyTensor.cpp +++ b/aten/src/ATen/EmptyTensor.cpp @@ -2,31 +2,93 @@ #include #include #include +#include + +#include namespace at { namespace detail { - -static c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) { +namespace { +c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) { if (pin_memory) { return at::detail::getCUDAHooks().getPinnedMemoryAllocator(); } return c10::GetCPUAllocator(); } +constexpr uint64_t storage_max() { + // int64_t and size_t are used somewhat inconsistently throughout ATen. + // To be safe, storage size calculations must fit in both types. 
+ constexpr auto int64_max = static_cast( + std::numeric_limits::max()); + constexpr auto size_max = static_cast( + std::numeric_limits::max()); + return std::min(int64_max, size_max); +} + +} // namespace (anonymous) + +size_t computeStorageNbytesContiguous( + IntArrayRef sizes, + size_t itemsize_bytes, + size_t storage_offset + ) { + // Ignore overflow checks on mobile +#ifndef C10_MOBILE + uint64_t size = 1; + bool overflowed = c10::safe_multiplies_u64(sizes, &size); + overflowed |= c10::add_overflows(size, storage_offset, &size); + overflowed |= c10::mul_overflows(size, itemsize_bytes, &size); + overflowed |= size > storage_max(); + TORCH_CHECK(!overflowed, + "Storage size calculation overflowed with sizes=", sizes); + return static_cast(size); +#else + const auto numel = c10::multiply_integers(sizes); + return itemsize_bytes * (storage_offset + numel); +#endif +} + size_t computeStorageNbytes( IntArrayRef sizes, IntArrayRef strides, - size_t itemsize_bytes) { + size_t itemsize_bytes, + size_t storage_offset + ) { + // Ignore overflow checks on mobile +#ifndef C10_MOBILE // size of the underlying storage is 1 bigger than the offset // of the last element according to stride - size_t size = 1; + uint64_t size = storage_offset + 1; + bool overflowed = false; for (const auto i : c10::irange(sizes.size())) { - if(sizes[i] == 0) { + if (sizes[i] == 0) { return 0; } - size += strides[i]*(sizes[i]-1); + + uint64_t strided_size; + overflowed |= c10::mul_overflows(strides[i], sizes[i] - 1, &strided_size); + overflowed |= c10::add_overflows(size, strided_size, &size); } - return size * itemsize_bytes; + overflowed |= c10::mul_overflows(size, itemsize_bytes, &size); + overflowed |= size > storage_max(); + TORCH_CHECK(!overflowed, + "Storage size calculation overflowed with sizes=", + sizes, " and strides=", strides); + return static_cast(size); +#else + // size of the underlying storage is 1 bigger than the offset + // of the last element according to stride + uint64_t size = 1; + for (const auto i : c10::irange(sizes.size())) { + if (sizes[i] == 0) { + return 0; + } + + size += strides[i] * (sizes[i] - 1); + } + return itemsize_bytes * (storage_offset + size); +#endif } TensorBase empty_generic( @@ -37,9 +99,8 @@ TensorBase empty_generic( c10::optional memory_format_opt) { at::detail::check_size_nonnegative(size); - int64_t nelements = c10::multiply_integers(size); caffe2::TypeMeta dtype = scalarTypeToTypeMeta(scalar_type); - int64_t size_bytes = nelements * dtype.itemsize(); + size_t size_bytes = computeStorageNbytesContiguous(size, dtype.itemsize()); auto storage_impl = c10::make_intrusive( c10::StorageImpl::use_byte_size_t(), size_bytes, @@ -73,7 +134,7 @@ TensorBase empty_strided_generic( at::detail::check_size_nonnegative(size); caffe2::TypeMeta dtype = scalarTypeToTypeMeta(scalar_type); - int64_t size_bytes = computeStorageNbytes(size, stride, dtype.itemsize()); + size_t size_bytes = computeStorageNbytes(size, stride, dtype.itemsize()); auto storage_impl = c10::make_intrusive( c10::StorageImpl::use_byte_size_t(), size_bytes, @@ -176,13 +237,11 @@ struct MetaAllocator final : public at::Allocator { static MetaAllocator g_meta_alloc; -at::Allocator* GetMetaAllocator() { - return &g_meta_alloc; -} +REGISTER_ALLOCATOR(kMeta, &g_meta_alloc); TensorBase empty_meta(IntArrayRef size, ScalarType dtype, c10::optional memory_format_opt) { - auto *allocator = GetMetaAllocator(); + auto *allocator = GetAllocator(kMeta); constexpr c10::DispatchKeySet meta_dks(c10::DispatchKey::Meta); return 
at::detail::empty_generic( size, allocator, meta_dks, dtype, memory_format_opt); @@ -222,7 +281,7 @@ TensorBase empty_meta( TensorBase empty_strided_meta(IntArrayRef size, IntArrayRef stride, ScalarType dtype) { - auto *allocator = GetMetaAllocator(); + auto *allocator = GetAllocator(kMeta); constexpr c10::DispatchKeySet meta_dks(c10::DispatchKey::Meta); return at::detail::empty_strided_generic( size, stride, allocator, meta_dks, dtype); diff --git a/aten/src/ATen/EmptyTensor.h b/aten/src/ATen/EmptyTensor.h index a49b3e909d6e..895bcc8e1779 100644 --- a/aten/src/ATen/EmptyTensor.h +++ b/aten/src/ATen/EmptyTensor.h @@ -10,8 +10,11 @@ inline void check_size_nonnegative(IntArrayRef size) { } } +TORCH_API size_t computeStorageNbytesContiguous( + IntArrayRef sizes, size_t itemsize, size_t storage_offset=0); TORCH_API size_t computeStorageNbytes( - IntArrayRef sizes, IntArrayRef strides, size_t itemsize); + IntArrayRef sizes, IntArrayRef strides, + size_t itemsize, size_t storage_offset=0); TORCH_API TensorBase empty_generic( IntArrayRef size, diff --git a/aten/src/ATen/ExpandBase.h b/aten/src/ATen/ExpandBase.h new file mode 100644 index 000000000000..e0a24091da19 --- /dev/null +++ b/aten/src/ATen/ExpandBase.h @@ -0,0 +1,23 @@ +#include + +// Broadcasting utilities for working with TensorBase +namespace at { +namespace internal { +TORCH_API TensorBase expand_slow_path(const TensorBase &self, IntArrayRef size); +} // namespace internal + +inline c10::MaybeOwned expand_size(const TensorBase &self, IntArrayRef size) { + if (size.equals(self.sizes())) { + return c10::MaybeOwned::borrowed(self); + } + return c10::MaybeOwned::owned( + at::internal::expand_slow_path(self, size)); +} +c10::MaybeOwned expand_size(TensorBase &&self, IntArrayRef size) = delete; + +inline c10::MaybeOwned expand_inplace(const TensorBase &tensor, const TensorBase &to_expand) { + return expand_size(to_expand, tensor.sizes()); +} +c10::MaybeOwned expand_inplace(const TensorBase &tensor, TensorBase &&to_expand) = delete; + +} // namespace at diff --git a/aten/src/ATen/ExpandUtils.cpp b/aten/src/ATen/ExpandUtils.cpp index 35588ac62a29..a44005a2ef81 100644 --- a/aten/src/ATen/ExpandUtils.cpp +++ b/aten/src/ATen/ExpandUtils.cpp @@ -1,8 +1,15 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#include #include namespace at { +namespace internal { +TensorBase expand_slow_path(const TensorBase &self, IntArrayRef size) { + return OptionalTensorRef(self)->expand(size); +} +} namespace { // NOTE: are_expandable did a similar check, please keep them sync if change is needed diff --git a/aten/src/ATen/ExpandUtils.h b/aten/src/ATen/ExpandUtils.h index 55a392c8d9cc..a1b7c8a04602 100644 --- a/aten/src/ATen/ExpandUtils.h +++ b/aten/src/ATen/ExpandUtils.h @@ -1,5 +1,11 @@ #pragma once +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + #include #include #include @@ -293,7 +299,7 @@ inline std::vector expand_outplace(TensorList to_expand) { // Sums `tensor` repeatedly to produce a tensor of shape `shape`. // Precondition: is_expandable_to(shape, tensor.sizes()) must be true -static inline Tensor sum_to(Tensor tensor, const IntArrayRef shape) { +static inline Tensor sum_to(Tensor tensor, const IntArrayRef shape, bool always_return_non_view=false) { if (shape.size() == 0) { return tensor.sum(); } @@ -311,7 +317,13 @@ static inline Tensor sum_to(Tensor tensor, const IntArrayRef shape) { if (!reduce_dims.empty()) { tensor = tensor.sum(reduce_dims, /*keepdim=*/true); } - return leading_dims > 0 ? 
tensor.view(shape) : tensor; + if (always_return_non_view) { + // This is only actually used by the functionalization pass. + // We want to be able to guarantee that this function doesn't return a view of the input. + return leading_dims > 0 ? at::view_copy(tensor, shape) : tensor.clone(); + } else { + return leading_dims > 0 ? tensor.view(shape) : tensor; + } } // True if `shape` can be broadcasted to `desired` diff --git a/aten/src/ATen/FunctionalInverses.cpp b/aten/src/ATen/FunctionalInverses.cpp index 3e686701fa63..d4b2a08825b8 100644 --- a/aten/src/ATen/FunctionalInverses.cpp +++ b/aten/src/ATen/FunctionalInverses.cpp @@ -10,34 +10,46 @@ namespace functionalization { // We can't easily share it though, because (eventually) these functions // will all call `permute/unsqueeze_copy()` instead of `permute/unsqueeze`. -Tensor permute_inverse(const Tensor& self, IntArrayRef dims) { +Tensor permute_copy_inverse(const Tensor& self, IntArrayRef dims, bool reapply_views) { // invert the permutation auto ndims = dims.size(); std::vector dims_(ndims); for(const auto i : c10::irange(ndims)) { dims_[at::maybe_wrap_dim(dims[i], ndims)] = i; } - return self.permute(dims_); + if (reapply_views) { + return at::permute(self, dims_); + } else { + return at::permute_copy(self, dims_); + } } -Tensor unsqueeze_to(const Tensor & self, IntArrayRef sizes) { +Tensor unsqueeze_copy_to(const Tensor & self, IntArrayRef sizes, bool reapply_views) { auto result = self; int64_t nDims = sizes.size(); for(const auto dim : c10::irange(nDims)) { if (sizes[dim] == 1) { - result = result.unsqueeze(dim); + if (reapply_views) { + result = at::unsqueeze(result, dim); + } else { + result = at::unsqueeze_copy(result, dim); + } } } return result; } -Tensor unsqueeze_to(const Tensor & self, int64_t dim, IntArrayRef sizes) { +Tensor unsqueeze_copy_to(const Tensor & self, int64_t dim, IntArrayRef sizes, bool reapply_views) { dim = at::maybe_wrap_dim(dim, sizes.size()); // in NumPy it's not an error to unsqueeze a scalar, but we still need to avoided // unsqueezing in the backward. if (sizes.size() > 0 && sizes[dim] == 1) { - return self.unsqueeze(dim); + if (reapply_views) { + return at::unsqueeze(self, dim); + } else { + return at::unsqueeze_copy(self, dim); + } } return self; } @@ -73,72 +85,99 @@ Tensor unsqueeze_to(const Tensor & self, int64_t dim, IntArrayRef sizes) { // The codegen automatically generates the corresponding function declaration. // ---------------------------------------------------------- -Tensor FunctionalInverses::_fw_primal_inverse(const at::Tensor& base, const at::Tensor& mutated_view, int64_t level) { +Tensor FunctionalInverses::_fw_primal_copy_inverse(const at::Tensor& base, const at::Tensor& mutated_view, bool reapply_views, int64_t level) { TORCH_INTERNAL_ASSERT(false, "Attempted to call _fw_primal() during the functionalization pass. For now, this is not supported."); return Tensor(); } -Tensor FunctionalInverses::_make_dual_inverse(const at::Tensor& base, const at::Tensor& mutated_view, const at::Tensor& tangent, int64_t level) { +Tensor FunctionalInverses::_make_dual_copy_inverse(const at::Tensor& base, const at::Tensor& mutated_view, bool reapply_views, const at::Tensor& tangent, int64_t level) { TORCH_INTERNAL_ASSERT(false, "Attempted to call _make_dual() during the functionalization pass. 
For now, this is not supported."); return Tensor(); } -Tensor FunctionalInverses::view_as_real_inverse(const Tensor& base, const Tensor& mutated_view) { - return at::view_as_complex(mutated_view); +Tensor FunctionalInverses::view_as_real_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views) { + if (reapply_views) { + return at::view_as_complex(mutated_view); + } else { + return at::view_as_complex_copy(mutated_view); + } } -Tensor FunctionalInverses::view_as_complex_inverse(const Tensor& base, const Tensor& mutated_view) { - return at::view_as_real(mutated_view.resolve_conj()); +Tensor FunctionalInverses::view_as_complex_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views) { + if (reapply_views) { + return at::view_as_real(mutated_view.resolve_conj()); + } else { + return at::view_as_real_copy(mutated_view.resolve_conj()); + } } -Tensor FunctionalInverses::_conj_inverse(const Tensor& base, const Tensor& mutated_view) { - return mutated_view.conj(); +Tensor FunctionalInverses::_conj_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views) { + if (reapply_views) { + return at::_conj(mutated_view); + } else { + return at::_conj_copy(mutated_view); + } } -Tensor FunctionalInverses::_neg_view_inverse(const Tensor& base, const Tensor& mutated_view) { - return mutated_view.neg(); +Tensor FunctionalInverses::_neg_view_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views) { + if (reapply_views) { + return at::_neg_view(mutated_view); + } else { + return at::_neg_view_copy(mutated_view); + } } -Tensor FunctionalInverses::as_strided_inverse(const Tensor& base, const Tensor& mutated_view, at::IntArrayRef size, at::IntArrayRef stride, c10::optional storage_offset) { +Tensor FunctionalInverses::as_strided_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, at::IntArrayRef size, at::IntArrayRef stride, c10::optional storage_offset) { TORCH_INTERNAL_ASSERT(false, "as_strided has not been implemented in the functionalization pass yet"); return Tensor(); } -Tensor FunctionalInverses::diagonal_inverse(const Tensor& base, const Tensor& mutated_view, int64_t offset, int64_t dim1, int64_t dim2) { +Tensor FunctionalInverses::diagonal_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, int64_t offset, int64_t dim1, int64_t dim2) { + // Pessimism: we can't reapply views for slice_scatter. 
return base.diagonal_scatter(mutated_view, offset, dim1, dim2); } -Tensor FunctionalInverses::expand_inverse(const Tensor& base, const Tensor& mutated_view, at::IntArrayRef size, bool implicit) { - return at::sum_to(mutated_view, base.sizes()); +Tensor FunctionalInverses::expand_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, at::IntArrayRef size, bool implicit) { + return at::sum_to(mutated_view, base.sizes(),/*always_return_non_view=*/!reapply_views); } -Tensor FunctionalInverses::permute_inverse(const Tensor& base, const Tensor& mutated_view, at::IntArrayRef dims) { - return at::functionalization::permute_inverse(mutated_view, dims); +Tensor FunctionalInverses::expand_copy_SymInt_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, c10::SymIntArrayRef size, bool implicit) { + return at::sum_to(mutated_view, c10::expectIntArrayRef(base.sym_sizes()),/*always_return_non_view=*/!reapply_views); } -Tensor FunctionalInverses::_reshape_alias_inverse(const Tensor& base, const Tensor& mutated_view, at::IntArrayRef size, at::IntArrayRef stride) { +Tensor FunctionalInverses::permute_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, at::IntArrayRef dims) { + return at::functionalization::permute_copy_inverse(mutated_view, dims, reapply_views); +} + +Tensor FunctionalInverses::_reshape_alias_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, at::IntArrayRef size, at::IntArrayRef stride) { // Note that I'm directly calling reshape(), and ignoring the strides. // _reshape_alias() isn't available from user code, and is an implementation detail of reshape(). // Specifically, passing in the strides directly can get us into trouble in cases like: // b = a[0]; c = b.reshape(...); c.add_(1); print(a) // When we eventually run the _reshape_alias_inverse() call here, if we were to pass in both sizes and strides, // The call would fail because `mutated_view` doesn't have enough bytes of storage. - return mutated_view.reshape(base.sizes()); + if (reapply_views) { + return at::_reshape_alias(mutated_view, base.sizes(), base.strides()); + } else { + return at::_reshape_alias_copy(mutated_view, base.sizes(), base.strides()); + } } -Tensor FunctionalInverses::select_int_inverse(const Tensor& base, const Tensor& mutated_view, int64_t dim, int64_t index) { +Tensor FunctionalInverses::select_copy_int_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, int64_t dim, int64_t index) { + // Pessimism: we can't reapply views for slice_scatter. return base.select_scatter(mutated_view, dim, index); } -Tensor FunctionalInverses::detach_inverse(const Tensor& base, const Tensor& mutated_view) { +Tensor FunctionalInverses::detach_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views) { // the functionalization pass doesn't care about autograd metadata - as a view, I think detach() is just an identity function return mutated_view; } -Tensor FunctionalInverses::slice_Tensor_inverse(const Tensor& base, const Tensor& mutated_view, int64_t dim, c10::optional start, c10::optional end, int64_t step) { +Tensor FunctionalInverses::slice_copy_Tensor_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, int64_t dim, c10::optional start, c10::optional end, int64_t step) { + // Pessimism: we can't reapply views for slice_scatter. 
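+  // slice_scatter returns a fresh tensor with `mutated_view` written into the
+  // sliced region of `base`, so the result never aliases `base`.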
return base.slice_scatter(mutated_view, dim, start, end, step); } -Tensor FunctionalInverses::split_Tensor_inverse(const Tensor& base, const Tensor& mutated_view, int64_t mutated_view_idx, int64_t split_size, int64_t dim) { +Tensor FunctionalInverses::split_copy_Tensor_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, int64_t mutated_view_idx, int64_t split_size, int64_t dim) { // It would be nice if this logic could be re-used from autograd's split_backward(), but I don't think it can. // For functionalization, we have only have one of the tensors from the TensorList outputed by split(), and we want to layer i // on top of the base tensor. @@ -148,10 +187,11 @@ Tensor FunctionalInverses::split_Tensor_inverse(const Tensor& base, const Tensor auto start = mutated_view_idx * split_size; auto end = start + split_size; if (end > dim_size) end = dim_size; + // Pessimism: we can't reapply views for slice_scatter. return base.slice_scatter(mutated_view, dim, start, end, 1); } -Tensor FunctionalInverses::split_with_sizes_inverse(const Tensor& base, const Tensor& mutated_view, int64_t mutated_view_idx, at::IntArrayRef split_sizes, int64_t dim) { +Tensor FunctionalInverses::split_with_sizes_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, int64_t mutated_view_idx, at::IntArrayRef split_sizes, int64_t dim) { dim = at::maybe_wrap_dim(dim, base.sizes().size()); auto dim_size = base.size(dim); int64_t start = 0; @@ -160,84 +200,123 @@ Tensor FunctionalInverses::split_with_sizes_inverse(const Tensor& base, const Te } auto end = start + split_sizes[mutated_view_idx]; if (end > dim_size) end = dim_size; + // Pessimism: we can't reapply views for slice_scatter. return base.slice_scatter(mutated_view, dim, start, end, 1); } -Tensor FunctionalInverses::squeeze_inverse(const Tensor& base, const Tensor& mutated_view) { - return unsqueeze_to(mutated_view, base.sizes()); +Tensor FunctionalInverses::squeeze_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views) { + return unsqueeze_copy_to(mutated_view, base.sizes(), reapply_views); } -Tensor FunctionalInverses::squeeze_dim_inverse(const Tensor& base, const Tensor& mutated_view, int64_t dim) { - return unsqueeze_to(mutated_view, dim, base.sizes()); +Tensor FunctionalInverses::squeeze_copy_dim_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, int64_t dim) { + return unsqueeze_copy_to(mutated_view, dim, base.sizes(), reapply_views); } -Tensor FunctionalInverses::t_inverse(const Tensor& base, const Tensor& mutated_view) { - return mutated_view.t(); +Tensor FunctionalInverses::t_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views) { + if (reapply_views) { + return at::t(mutated_view); + } else { + return at::t_copy(mutated_view); + } } -Tensor FunctionalInverses::transpose_int_inverse(const Tensor& base, const Tensor& mutated_view, int64_t dim0, int64_t dim1) { - return mutated_view.transpose(dim0, dim1); +Tensor FunctionalInverses::transpose_copy_int_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, int64_t dim0, int64_t dim1) { + if (reapply_views) { + return transpose(mutated_view, dim0, dim1); + } else { + return transpose_copy(mutated_view, dim0, dim1); + } } -Tensor FunctionalInverses::unsqueeze_inverse(const Tensor& base, const Tensor& mutated_view, int64_t dim) { - return mutated_view.squeeze(dim); +Tensor FunctionalInverses::unsqueeze_copy_inverse(const Tensor& base, const Tensor& mutated_view, 
bool reapply_views, int64_t dim) { + if (reapply_views) { + return at::squeeze(mutated_view, dim); + } else { + return at::squeeze_copy(mutated_view, dim); + } } -Tensor FunctionalInverses::_indices_inverse(const Tensor& base, const Tensor& mutated_view) { +Tensor FunctionalInverses::_indices_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views) { TORCH_INTERNAL_ASSERT(false, "Attempted to call _indices() during the functionalization pass. For now, sparse tensors aren't supported during functionalization"); return Tensor(); } -Tensor FunctionalInverses::_values_inverse(const Tensor& base, const Tensor& mutated_view) { +Tensor FunctionalInverses::_values_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views) { TORCH_INTERNAL_ASSERT(false, "Attempted to call _values() during the functionalization pass. For now, sparse tensors aren't supported during functionalization"); return Tensor(); } -Tensor FunctionalInverses::indices_inverse(const Tensor& base, const Tensor& mutated_view) { +Tensor FunctionalInverses::indices_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views) { TORCH_INTERNAL_ASSERT(false, "Attempted to call indices() during the functionalization pass. For now, sparse tensors aren't supported during functionalization"); return Tensor(); } -Tensor FunctionalInverses::values_inverse(const Tensor& base, const Tensor& mutated_view) { +Tensor FunctionalInverses::values_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views) { TORCH_INTERNAL_ASSERT(false, "Attempted to call values() during the functionalization pass. For now, sparse tensors aren't supported during functionalization"); return Tensor(); } -Tensor FunctionalInverses::_sparse_broadcast_to_inverse(const Tensor& base, const Tensor& mutated_view, at::IntArrayRef size) { +Tensor FunctionalInverses::_sparse_broadcast_to_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, at::IntArrayRef size) { TORCH_INTERNAL_ASSERT(false, "Attempted to call _sparse_broadcast_to() during the functionalization pass. For now, sparse tensors aren't supported during functionalization"); return Tensor(); } -Tensor FunctionalInverses::crow_indices_inverse(const at::Tensor& base, const at::Tensor& mutated_view) { +Tensor FunctionalInverses::crow_indices_copy_inverse(const at::Tensor& base, const at::Tensor& mutated_view, bool reapply_views) { TORCH_INTERNAL_ASSERT(false, "Attempted to call crow_indices() during the functionalization pass. For now, sparse tensors aren't supported during functionalization"); return Tensor(); } -Tensor FunctionalInverses::col_indices_inverse(const at::Tensor& base, const at::Tensor& mutated_view) { +Tensor FunctionalInverses::col_indices_copy_inverse(const at::Tensor& base, const at::Tensor& mutated_view, bool reapply_views) { TORCH_INTERNAL_ASSERT(false, "Attempted to call col_indices() during the functionalization pass. For now, sparse tensors aren't supported during functionalization"); return Tensor(); } -Tensor FunctionalInverses::unbind_int_inverse(const Tensor& base, const Tensor& mutated_view, int64_t mutated_view_idx, int64_t dim) { +Tensor FunctionalInverses::ccol_indices_copy_inverse(const at::Tensor& base, const at::Tensor& mutated_view, bool reapply_views) { + TORCH_INTERNAL_ASSERT(false, "Attempted to call ccol_indices() during the functionalization pass. 
For now, sparse tensors aren't supported during functionalization"); + return Tensor(); +} + +Tensor FunctionalInverses::row_indices_copy_inverse(const at::Tensor& base, const at::Tensor& mutated_view, bool reapply_views) { + TORCH_INTERNAL_ASSERT(false, "Attempted to call row_indices() during the functionalization pass. For now, sparse tensors aren't supported during functionalization"); + return Tensor(); +} + +Tensor FunctionalInverses::unbind_copy_int_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, int64_t mutated_view_idx, int64_t dim) { dim = at::maybe_wrap_dim(dim, base.sizes().size()); + // Pessimism: we can't reapply views for select_scatter. return base.select_scatter(mutated_view, dim, mutated_view_idx); } -Tensor FunctionalInverses::view_inverse(const Tensor& base, const Tensor& mutated_view, at::IntArrayRef size) { - return mutated_view.view(base.sizes()); +Tensor FunctionalInverses::view_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, at::IntArrayRef size) { + if (reapply_views) { + return mutated_view.view(base.sizes()); + } else { + return at::view_copy(mutated_view, base.sizes()); + } } -Tensor FunctionalInverses::view_dtype_inverse(const Tensor& base, const Tensor& mutated_view, at::ScalarType dtype) { - return mutated_view.view(base.scalar_type()); +Tensor FunctionalInverses::view_copy_dtype_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, at::ScalarType dtype) { + if (reapply_views) { + return mutated_view.view(base.scalar_type()); + } else { + return at::view_copy(mutated_view, base.scalar_type()); + } } -Tensor FunctionalInverses::unfold_inverse(const Tensor& base, const Tensor& mutated_view, int64_t dimension, int64_t size, int64_t step) { +Tensor FunctionalInverses::unfold_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, int64_t dimension, int64_t size, int64_t step) { // I think autograd and the functionalization pass want the exact same thing here, but need to test to confirm. + // unfold_backward() is safe to use here because it is NOT a view op. + // (note: technically, "reapply_views" won't do anything here and we'll have an extra memory copy. + // We'd need to add an aliasing version of unfold_backward to fix that though). return unfold_backward(mutated_view, base.sizes(), dimension, size, step); } -Tensor FunctionalInverses::alias_inverse(const Tensor& base, const Tensor& mutated_view) { - return mutated_view; +Tensor FunctionalInverses::alias_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views) { + if (reapply_views) { + return at::alias(mutated_view); + } else { + return at::alias_copy(mutated_view); + } } } // functionalization diff --git a/aten/src/ATen/FunctionalStorageImpl.cpp b/aten/src/ATen/FunctionalStorageImpl.cpp index 52c9a3bb28cf..2fad6bfad606 100644 --- a/aten/src/ATen/FunctionalStorageImpl.cpp +++ b/aten/src/ATen/FunctionalStorageImpl.cpp @@ -75,16 +75,18 @@ const Tensor apply_update(const Alias::Update& update, const Tensor& base) { return t; } -void Alias::apply_updates() { +bool Alias::apply_updates() { // N.B:none of the tensors used in this function should be FunctionalTensorWrappers at this point. // The only reason we currently need the TLS exclude guard here is because of functorch's DynamicLayer stack. 
// It adds the Functionalize key into TLS before redispatching to the functionalization kernels, // which means that we need to explicitly exclude it here before doing any other work underneath the pass. at::AutoDispatchSkipFunctionalize guard; + bool any_updates = updates_.size() > 0; for (auto& update_data: updates_) { base_ = apply_update(update_data, base_); } updates_.clear(); + return any_updates; } FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& value) @@ -103,8 +105,8 @@ void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vec alias_.add_update(updated_val, view_metas); } -void FunctionalStorageImpl::apply_updates() { - alias_.apply_updates(); +bool FunctionalStorageImpl::apply_updates() { + return alias_.apply_updates(); } const Tensor& FunctionalStorageImpl::base() { diff --git a/aten/src/ATen/FunctionalStorageImpl.h b/aten/src/ATen/FunctionalStorageImpl.h index e8478a7ae903..2c8a1312cbe1 100644 --- a/aten/src/ATen/FunctionalStorageImpl.h +++ b/aten/src/ATen/FunctionalStorageImpl.h @@ -72,7 +72,7 @@ class Alias { const at::Tensor& base() const; size_t generation() const { return generation_; } void add_update(const at::Tensor& updated_val, const std::vector& metas); - void apply_updates(); + bool apply_updates(); private: // NB: base_ should always point to a tensor BELOW the current functionalization layer. // This is mainly to avoid reference cycles. @@ -96,7 +96,7 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl { explicit FunctionalStorageImpl(const Tensor& value); void add_update(const Tensor& updated_val, const std::vector& view_metas); - void apply_updates(); + bool apply_updates(); const Tensor& base(); size_t generation() const; diff --git a/aten/src/ATen/FunctionalTensorWrapper.cpp b/aten/src/ATen/FunctionalTensorWrapper.cpp index 5f99e3774798..2a0ca304baf5 100644 --- a/aten/src/ATen/FunctionalTensorWrapper.cpp +++ b/aten/src/ATen/FunctionalTensorWrapper.cpp @@ -17,12 +17,16 @@ void FunctionalTensorWrapper::set_constructor_metadata() { // For now I'm retroactively setting this in functorch, // but once Open Multiple Dispatch lands we should be able to calculate this in core. level_ = -1; - // shallow_copy_from overwrites the storage and dispatch keyset... - auto functional_storage = storage_; - shallow_copy_from(value_.getIntrusivePtr()); - storage_ = functional_storage; + // mirror all of the generic tensor metadata onto the wrapper + copy_generic_tensor_metadata(value_.getIntrusivePtr().get(), this); + refresh_numel(); + refresh_contiguous(); storage_access_should_throw_ = false; key_set_ = c10::DispatchKeySet(c10::DispatchKey::Functionalize) | value_.key_set(); + // All of the keys corresponding to functorch transforms should not be copied over. + // Functorch transforms all have their own wrapper tensors (e.g. BatchedTensorImpl) which expect + // to participate in the functorch transforms. + key_set_ = key_set_ - c10::functorch_transforms_ks; } FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& value) @@ -176,6 +180,9 @@ void FunctionalTensorWrapper::replace_(const Tensor& other) { // TODO: going to need to change this if we want nested functionalize() transforms. TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(other)); value_ = other; + // out= ops are allowed to resize the output tensors, mutating both the data and metadata of the tensor. + // We need to propagate that metadata mutation to the wrapper (new size). 
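  // A concrete case this covers (a sketch, assuming a and b are same-shaped functional tensors):
  //
  //   at::Tensor out = at::empty({0}, a.options());
  //   at::add_out(out, a, b);   // resizes `out` from {0} to a.sizes()
  //
  // replace_() then swaps the resized result into value_, so the wrapper's own sizes and
  // strides have to be refreshed to match, which the call below does.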
+ set_sizes_and_strides(value_.sizes(), value_.strides()); } @@ -183,8 +190,10 @@ void FunctionalTensorWrapper::sync_() { if (is_up_to_date()) { return; } - apply_updates(); - regenerate_from_base(); + auto any_updates = apply_updates(); + if (any_updates) { + regenerate_from_base(); + } } void FunctionalTensorWrapper::regenerate_from_base() { @@ -201,10 +210,10 @@ void FunctionalTensorWrapper::regenerate_from_base() { generation_ = storage_impl->generation(); } -void FunctionalTensorWrapper::apply_updates() { +bool FunctionalTensorWrapper::apply_updates() { // Apply all updates on alias_ auto storage_impl = functional_storage_impl(); - storage_impl->apply_updates(); + return storage_impl->apply_updates(); } const char* FunctionalTensorWrapper::tensorimpl_type_name() const { @@ -222,6 +231,12 @@ Tensor to_functional_tensor(const Tensor& tensor) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!isFunctionalTensor(tensor)); return at::detail::make_tensor(tensor); } +c10::optional to_functional_tensor(const c10::optional& tensor) { + if (tensor.has_value()) { + return c10::make_optional(to_functional_tensor(*tensor)); + } + return c10::nullopt; +} c10::List to_functional_tensor(const c10::List& t_list) { c10::List outputs; outputs.reserve(t_list.size()); @@ -230,6 +245,14 @@ c10::List to_functional_tensor(const c10::List& t_list) { } return outputs; } +c10::List> to_functional_tensor(const c10::List>& t_list) { + c10::List> outputs; + outputs.reserve(t_list.size()); + for (const auto i : c10::irange(t_list.size())) { + outputs.push_back(to_functional_tensor(t_list[i])); + } + return outputs; +} std::vector to_functional_tensor(const std::vector& t_list) { std::vector outputs(t_list.size()); for (const auto i : c10::irange(t_list.size())) { @@ -237,7 +260,7 @@ std::vector to_functional_tensor(const std::vector& t_list) { } return outputs; } -TensorList to_functional_tensor(const TensorList& t_list) { +std::vector to_functional_tensor(const TensorList& t_list) { std::vector outputs(t_list.size()); for (const auto i : c10::irange(t_list.size())) { outputs[i] = to_functional_tensor(t_list[i]); @@ -276,10 +299,10 @@ c10::List> from_functional_tensor(const c10::List from_functional_tensor(const TensorList& t_list) { std::vector outputs(t_list.size()); for (const auto i : c10::irange(t_list.size())) { - outputs.push_back(from_functional_tensor(t_list[i])); + outputs[i] = from_functional_tensor(t_list[i]); } return outputs; } @@ -322,6 +345,81 @@ void sync(const c10::List> t_list) { } } +void replace_(const Tensor& functional_tensor, const Tensor& other) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(isFunctionalTensor(functional_tensor)); + unsafeGetFunctionalWrapper(functional_tensor)->replace_(other); +} + +void replace_(const TensorList functional_tensor, TensorList other) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(functional_tensor.size() == other.size()); + for (const auto i : c10::irange(functional_tensor.size())) { + replace_(functional_tensor[i], other[i]); + } +} + + +void commit_update(const Tensor& functional_tensor) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(isFunctionalTensor(functional_tensor)); + unsafeGetFunctionalWrapper(functional_tensor)->commit_update(); +} + +void commit_update(const TensorList functional_tensor) { + for (const auto i : c10::irange(functional_tensor.size())) { + commit_update(functional_tensor[i]); + } +} + +bool isFunctionalTensor(const at::Tensor& tensor) { + return tensor.unsafeGetTensorImpl()->key_set().has(c10::DispatchKey::Functionalize); +} + +bool isFunctionalTensor(const c10::optional& t) 
{ + if (t.has_value()) { + return isFunctionalTensor(*t); + } else { + return false; + } +} + +bool isFunctionalTensor(const c10::List& t_list) { + if (t_list.size() == 0) return false; + bool any_functional = isFunctionalTensor(t_list[0]); + for (const auto i : c10::irange(1, t_list.size())) { + auto curr_functional = isFunctionalTensor(t_list[i]); + TORCH_INTERNAL_ASSERT( + curr_functional == any_functional, + "Functionalization encountered a list of tensors where some are functional", + "and some are not, which is not currently unsupported."); + } + return any_functional; +} + +bool isFunctionalTensor(const c10::List>& t_list) { + if (t_list.size() == 0) return false; + bool any_functional = isFunctionalTensor(t_list[0]); + for (const auto i : c10::irange(1, t_list.size())) { + auto curr_functional = isFunctionalTensor(t_list[i]); + TORCH_INTERNAL_ASSERT( + curr_functional == any_functional, + "Functionalization encountered a list of tensors where some are functional", + "and some are not, which is not currently unsupported."); + } + return any_functional; +} + +bool isFunctionalTensor(const c10::ArrayRef t_list) { + if (t_list.size() == 0) return false; + bool any_functional = isFunctionalTensor(t_list[0]); + for (const auto i : c10::irange(1, t_list.size())) { + auto curr_functional = isFunctionalTensor(t_list[i]); + TORCH_INTERNAL_ASSERT( + curr_functional == any_functional, + "Functionalization encountered a list of tensors where some are functional", + "and some are not, which is not currently unsupported."); + } + return any_functional; +} + Tensor create_functional_tensor_with_view_meta(const at::Tensor& view_to_wrap, const at::Tensor& base, functionalization::ViewMeta meta, int64_t out_idx) { TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(view_to_wrap)); TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(base)); @@ -373,6 +471,14 @@ void set_sizes_strides_offset(const std::vector& outs, const std::vector } } +thread_local bool _functionalizationReapplyViews; + +bool getFunctionalizationReapplyViewsTLS() { + return _functionalizationReapplyViews; +} +void setFunctionalizationReapplyViewsTLS(bool reapply_views) { + _functionalizationReapplyViews = reapply_views; +} } // namespace impl } // namespace functionalization diff --git a/aten/src/ATen/FunctionalTensorWrapper.h b/aten/src/ATen/FunctionalTensorWrapper.h index 1696b41f1543..d0bb8e0dcf11 100644 --- a/aten/src/ATen/FunctionalTensorWrapper.h +++ b/aten/src/ATen/FunctionalTensorWrapper.h @@ -37,7 +37,6 @@ namespace at { // // See Note [Functionalization: Alias Removal] for details on the aliasing machinery. // See Note [Functionalization: Mutation Removal] for details on mutation removal. - struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { explicit FunctionalTensorWrapper(const Tensor& value); // Additional constructor to create a FunctionalTensorWrapper directly from an underlying tensor that was created from a view. @@ -64,7 +63,8 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { // It can't just call sync_(), because the FunctionalTensorWrapper will look like it has no aliases and sync_ will be a noop. // We use the reference count on storage_ to determine if the wrapper is aliased, and by the time functorch // is ready to propagate updates to inputs, any intermediate views of the input created by the program will have been deallocated. 
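  // The usual caller pattern (see sync_() in FunctionalTensorWrapper.cpp) uses the returned
  // flag to skip regeneration when nothing actually changed:
  //
  //   if (apply_updates()) {
  //     regenerate_from_base();
  //   }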
- void apply_updates(); + // This function also returns whether or not the base actually had any updates to apply. + bool apply_updates(); // Takes the current state of value_ and snapshots it, sending it as a pending update to the alias. void commit_update(); // When any tensor is mutated, the tensor increments its alias's "generation". @@ -117,20 +117,24 @@ TORCH_API inline FunctionalTensorWrapper* unsafeGetFunctionalWrapper(const Tenso return functional_impl; } -TORCH_API inline bool isFunctionalTensor(const at::Tensor& tensor) { - return tensor.unsafeGetTensorImpl()->key_set().has(c10::DispatchKey::Functionalize); -} +TORCH_API bool isFunctionalTensor(const at::Tensor& tensor); +TORCH_API bool isFunctionalTensor(const c10::optional& t); +TORCH_API bool isFunctionalTensor(const c10::List& t_list); +TORCH_API bool isFunctionalTensor(const c10::List>& t_list); +TORCH_API bool isFunctionalTensor(const c10::ArrayRef t_list); TORCH_API Tensor to_functional_tensor(const Tensor& tensor); +TORCH_API c10::optional to_functional_tensor(const c10::optional& tensor); TORCH_API c10::List to_functional_tensor(const c10::List& t_list); +TORCH_API c10::List> to_functional_tensor(const c10::List>& t_list); TORCH_API std::vector to_functional_tensor(const std::vector& t_list); -TORCH_API TensorList to_functional_tensor(const TensorList& t_list); +TORCH_API std::vector to_functional_tensor(const TensorList& t_list); TORCH_API Tensor from_functional_tensor(const Tensor& tensor); TORCH_API c10::optional from_functional_tensor(const c10::optional& t); TORCH_API c10::List from_functional_tensor(const c10::List& t_list); TORCH_API c10::List> from_functional_tensor(const c10::List>& t_list); -TORCH_API TensorList from_functional_tensor(const TensorList& tensors); +TORCH_API std::vector from_functional_tensor(const TensorList& tensors); TORCH_API void sync(const at::Tensor& t); TORCH_API void sync(const c10::optional& t); @@ -138,6 +142,12 @@ TORCH_API void sync(const c10::List t_list); TORCH_API void sync(const at::TensorList t_list); TORCH_API void sync(const c10::List> t_list); +TORCH_API void replace_(const Tensor& functional_tensor, const Tensor& other); +TORCH_API void replace_(const TensorList functional_tensor, TensorList other); + +TORCH_API void commit_update(const Tensor& functional_tensor); +TORCH_API void commit_update(const TensorList functional_tensor); + Tensor create_functional_tensor_with_view_meta(const Tensor& view_to_wrap, const Tensor& base, functionalization::ViewMeta meta, int64_t out_idx = 0); std::vector create_functional_tensor_with_view_meta(const c10::List& view_to_wrap, const Tensor& base, functionalization::ViewMeta meta); std::vector create_functional_tensor_with_view_meta(const std::vector& view_to_wrap, const Tensor& base, functionalization::ViewMeta meta); @@ -147,6 +157,32 @@ void mutate_view_meta(const Tensor& self, functionalization::ViewMeta meta); void set_sizes_strides_offset(const Tensor& out, const Tensor& meta_out); void set_sizes_strides_offset(const std::vector& outs, const std::vector& meta_outs); + +// ~~~~~ TLS used in functionalization ~~~~~ + +TORCH_API bool getFunctionalizationReapplyViewsTLS(); +TORCH_API void setFunctionalizationReapplyViewsTLS(bool reapply_views); + +class TORCH_API FunctionalizationReapplyViewsGuard { + public: + FunctionalizationReapplyViewsGuard(bool reapply_views) { + prev_ = getFunctionalizationReapplyViewsTLS(); + setFunctionalizationReapplyViewsTLS(reapply_views); + } + + ~FunctionalizationReapplyViewsGuard() { + 
setFunctionalizationReapplyViewsTLS(prev_); + } + + FunctionalizationReapplyViewsGuard(const FunctionalizationReapplyViewsGuard&) = delete; + FunctionalizationReapplyViewsGuard operator=(const FunctionalizationReapplyViewsGuard&) = delete; + FunctionalizationReapplyViewsGuard(FunctionalizationReapplyViewsGuard&&) = delete; + FunctionalizationReapplyViewsGuard operator=(FunctionalizationReapplyViewsGuard&&) = delete; + + private: + bool prev_; +}; + } // namespace impl } // namespace functionalization } // namespace at diff --git a/aten/src/ATen/FunctionalizeFallbackKernel.cpp b/aten/src/ATen/FunctionalizeFallbackKernel.cpp index f130fc7cdbd4..a86a2db0521c 100644 --- a/aten/src/ATen/FunctionalizeFallbackKernel.cpp +++ b/aten/src/ATen/FunctionalizeFallbackKernel.cpp @@ -4,6 +4,12 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + namespace { void functionalizeFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatchKeySet, torch::jit::Stack* stack) { const auto& schema = op.schema(); @@ -12,23 +18,45 @@ namespace { const auto arguments_begin = stack->size() - num_arguments; auto arguments = torch::jit::last(stack, num_arguments); + auto any_functional_inputs = false; + auto any_tensor_inputs = false; for (uint64_t idx = 0; idx < num_arguments; ++idx) { const auto& ivalue = arguments[idx]; if (ivalue.isTensor()) { + any_tensor_inputs = true; auto t = ivalue.toTensor(); - at::functionalization::impl::sync(t); - auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(t)); - (*stack)[arguments_begin + idx] = t_new; + if (at::functionalization::impl::isFunctionalTensor(t)) { + any_functional_inputs = true; + at::functionalization::impl::sync(t); + auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(t)); + (*stack)[arguments_begin + idx] = t_new; + } } else if (ivalue.isTensorList()) { + any_tensor_inputs = true; auto tensors = ivalue.toTensorList(); - at::functionalization::impl::sync(tensors); - auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(tensors)); - (*stack)[arguments_begin + idx] = t_new; + if (at::functionalization::impl::isFunctionalTensor(tensors)) { + any_functional_inputs = true; + at::functionalization::impl::sync(tensors); + auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(tensors)); + (*stack)[arguments_begin + idx] = t_new; + } + } else if (ivalue.isOptionalTensorList()) { + any_tensor_inputs = true; + auto opt_tensors = ivalue.toOptionalTensorList(); + if (at::functionalization::impl::isFunctionalTensor(opt_tensors)) { + any_functional_inputs = true; + at::functionalization::impl::sync(opt_tensors); + auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(opt_tensors)); + (*stack)[arguments_begin + idx] = t_new; + } } } + // we should wrap the output if any inputs were wrapped, + // OR if we're hitting a factory function (with no tensor inputs) + auto should_wrap_outputs = !any_tensor_inputs || any_functional_inputs; { at::AutoDispatchSkipFunctionalize guard; - op.redispatchBoxed(dispatchKeySet & c10::after_func_keyset, stack); + op.callBoxed(stack); } const auto num_returns = schema.returns().size(); const auto returns_begin = stack->size() - num_returns; @@ -36,19 +64,32 @@ namespace { for (const auto idx : c10::irange(num_returns)) { const auto& ivalue = returns[idx]; - if (ivalue.isTensor()) { + if (ivalue.isTensor() && should_wrap_outputs) { auto t = ivalue.toTensor(); auto t_new = 
c10::IValue(at::functionalization::impl::to_functional_tensor(t)); (*stack)[returns_begin + idx] = t_new; - } else if (ivalue.isTensorList()) { + } else if (ivalue.isTensorList() && should_wrap_outputs) { auto tensors = ivalue.toTensorList(); auto t_new = c10::IValue(at::functionalization::impl::to_functional_tensor(tensors)); (*stack)[returns_begin + idx] = t_new; + } else if (ivalue.isOptionalTensorList() && should_wrap_outputs) { + auto opt_tensors = ivalue.toOptionalTensorList(); + auto t_new = c10::IValue(at::functionalization::impl::to_functional_tensor(opt_tensors)); + (*stack)[returns_begin + idx] = t_new; } } } } +at::Tensor lift_functionalize(const at::Tensor & self) { + TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(self)); + return at::functionalization::impl::to_functional_tensor(self); +} + TORCH_LIBRARY_IMPL(_, Functionalize, m) { m.fallback(torch::CppFunction::makeFromBoxedFunction<&functionalizeFallback>()); } + +TORCH_LIBRARY_IMPL(aten, Functionalize, m) { + m.impl("lift", TORCH_FN(lift_functionalize)); +} diff --git a/aten/src/ATen/NamedTensorUtils.cpp b/aten/src/ATen/NamedTensorUtils.cpp index 24a85b4ce708..0a527bde20c9 100644 --- a/aten/src/ATen/NamedTensorUtils.cpp +++ b/aten/src/ATen/NamedTensorUtils.cpp @@ -459,7 +459,7 @@ std::vector broadcast_to_outnames( return unify_from_right(reference_names, tensor_names); } -std::vector compute_cat_outnames(TensorList tensors) { +std::vector compute_cat_outnames(ITensorListRef tensors) { if (!at::has_names(tensors)) { return {}; } diff --git a/aten/src/ATen/NamedTensorUtils.h b/aten/src/ATen/NamedTensorUtils.h index 8cd3e238159b..b8d421c6a611 100644 --- a/aten/src/ATen/NamedTensorUtils.h +++ b/aten/src/ATen/NamedTensorUtils.h @@ -10,7 +10,7 @@ namespace at { using NameVector = SmallVector; -inline bool has_names(TensorList tensors) { +inline bool has_names(ITensorListRef tensors) { return std::any_of( tensors.begin(), tensors.end(), [](const Tensor& t) { return t.has_names(); }); } @@ -98,7 +98,7 @@ TORCH_API void propagate_names_for_reduction(const Tensor& result, const Tensor& TORCH_API void propagate_names_for_expand(const Tensor& result, const Tensor& self); -TORCH_API std::vector compute_cat_outnames(TensorList tensors); +TORCH_API std::vector compute_cat_outnames(ITensorListRef tensors); TORCH_API std::vector compute_broadcast_outnames( const Tensor& self, diff --git a/aten/src/ATen/NestedTensorImpl.cpp b/aten/src/ATen/NestedTensorImpl.cpp new file mode 100644 index 000000000000..1509bf4a2a04 --- /dev/null +++ b/aten/src/ATen/NestedTensorImpl.cpp @@ -0,0 +1,92 @@ +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { + +inline std::vector construct_opt_sizes(const at::Tensor& sizes) { + if (sizes.dim() == 0) { + return std::vector(); + } + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(sizes.dim() == 2); + std::vector result(1, sizes.sizes()[0]); + if (sizes.dim() > 0) { + size_t nested_dim = result.size(); + int64_t* sizes_ptr = sizes.data_ptr(); + result.resize(nested_dim + sizes.sizes()[1]); + int64_t sizes_size_0 = sizes.sizes()[0]; + int64_t sizes_size_1 = sizes.sizes()[1]; + for (const auto i : c10::irange(sizes_size_1)) { + result[nested_dim + i] = sizes_ptr[i]; + } + for (const auto j : c10::irange(sizes_size_1)) { + for (const auto i : c10::irange(sizes_size_0)) { + if (result[nested_dim + j] && + (result[nested_dim + j] != sizes_ptr[i * sizes.size(1) + j])) { + result[nested_dim + j] = -1; + } + } + } + } + return result; +} + 
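// Worked example of construct_opt_sizes() above (a sketch of the intended semantics, not
// exercised in this file): for a nested tensor with two components of sizes [2, 3] and [2, 5],
// nested_size_tensor is [[2, 3], [2, 5]] and the result is {2, 2, -1}:
//   index 0 -> number of components (2),
//   index 1 -> all components agree on size 2, so it stays 2,
//   index 2 -> the components disagree (3 vs 5), so the dimension is marked irregular with -1.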
+NestedTensorImpl::NestedTensorImpl( + at::Tensor buffer, + at::Tensor nested_size_tensor) + : TensorImpl( + (c10::DispatchKeySet(DispatchKey::NestedTensor) | + c10::DispatchKeySet(buffer.is_cuda() ? BackendComponent::CUDABit : BackendComponent::CPUBit)), + buffer.dtype(), + buffer.device()), + buffer_(std::move(buffer)), + nested_size_tensor_(std::move(nested_size_tensor)), + opt_sizes_(construct_opt_sizes(nested_size_tensor_)) +{ + TORCH_WARN_ONCE( + "The PyTorch API of nested tensors is in prototype stage and will change " + "in the near future."); + TORCH_INTERNAL_ASSERT(buffer_.is_cuda() || buffer_.is_cpu(), "NestedTensorImpl buffer must be either CUDA or CPU but got ", buffer_); + TORCH_INTERNAL_ASSERT(nested_size_tensor_.is_contiguous()); + int64_t size_dim = nested_size_tensor_.dim(); + TORCH_INTERNAL_ASSERT(size_dim == 0 || size_dim == 2); + remove_autograd_key(); + key_set_ = + key_set_ - c10::DispatchKeySet({c10::DispatchKey::ADInplaceOrView}); + refresh_dim(); + set_sizes_strides_policy(c10::TensorImpl::SizesStridesPolicy::CustomSizes); +} + +void NestedTensorImpl::refresh_dim() { + const auto my_dim = nested_size_tensor_.dim() ? nested_size_tensor_.sizes()[1] + 1 : 1; + sizes_and_strides_.resize(my_dim); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dim() == my_dim); +} + +int64_t NestedTensorImpl::dim_custom() const { + return dim_default(); +} +int64_t NestedTensorImpl::numel_custom() const { + TORCH_CHECK(false, "numel is disabled."); +} +bool NestedTensorImpl::is_contiguous_custom(MemoryFormat) const { + TORCH_CHECK(false, "is_contiguous is disabled."); +} +IntArrayRef NestedTensorImpl::sizes_custom() const { + TORCH_CHECK(false, "Internal error: NestedTensorImpl doesn't support sizes. Please file an issue on https://github.com/pytorch/nestedtensor"); +} + +IntArrayRef NestedTensorImpl::strides_custom() const { + TORCH_CHECK(false, "Internal error: NestedTensorImpl doesn't support strides. Please file an issue on https://github.com/pytorch/nestedtensor"); +} + +const char* NestedTensorImpl::tensorimpl_type_name() const { + return "NestedTensorImpl"; +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/NestedTensorImpl.h b/aten/src/ATen/NestedTensorImpl.h new file mode 100644 index 000000000000..7f29dd620b93 --- /dev/null +++ b/aten/src/ATen/NestedTensorImpl.h @@ -0,0 +1,86 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { + +struct TORCH_API NestedTensorImpl : public c10::TensorImpl { + explicit NestedTensorImpl(at::Tensor buffer, at::Tensor nested_size_tensor); + + // TODO: don't expose private implementation details like this; in + // particular, resizing this tensor will mess up our dim() and + // callers cannot fix it. + const Tensor& get_nested_size_tensor() const { + return nested_size_tensor_; + } + // Returns nullopt if the ith dimension is irregular. The ith dimension + // of a NestedTensor is regular if the unbound tensors match in + // size at the (i-1)th dimension. 
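  // For example, with two nested components of sizes [3, 4] and [3, 7]:
  //   opt_size(0) == 2              (number of components)
  //   opt_size(1) == 3              (the components agree)
  //   opt_size(2) == c10::nullopt   (the components disagree, i.e. the dimension is ragged)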
+ c10::optional opt_size(int64_t d) const { + d = at::maybe_wrap_dim(d, dim(), false); + if (opt_sizes_[d] == -1) { + return c10::nullopt; + } + return opt_sizes_[d]; + } + + const at::Tensor& get_buffer() const { + return buffer_; + } + + protected: + const char* tensorimpl_type_name() const override; + + // TODO: numel_custom and is_contiguous_custom can be profitably overridden + // with real implementations + int64_t numel_custom() const override; + bool is_contiguous_custom(MemoryFormat) const override; + IntArrayRef sizes_custom() const override; + IntArrayRef strides_custom() const override; + + // this one is real + int64_t dim_custom() const override; + + private: + // Must be called after any changes to our dim() to sync the state + // to TensorImpl. + void refresh_dim(); + + at::Tensor buffer_; + const at::Tensor nested_size_tensor_; + // NOTE: -1 here means the size is missing + std::vector opt_sizes_; +}; + +inline NestedTensorImpl* get_nested_tensor_impl_or_null(const at::Tensor& tensor) { + if (tensor.is_nested()) { + return static_cast(tensor.unsafeGetTensorImpl()); + } + return nullptr; +} + +inline NestedTensorImpl* get_nested_tensor_impl( + const at::Tensor& tensor) { + TORCH_CHECK( + tensor.is_nested(), + "get_nested_tensor_impl requires a NestedTensor."); + return static_cast( + tensor.unsafeGetTensorImpl()); +} + + +// TODO: real implementation once we support strides. +inline bool nested_tensor_impl_is_contiguous( + const NestedTensorImpl* nt, + at::MemoryFormat memory_format = MemoryFormat::Contiguous) { + return memory_format == MemoryFormat::Contiguous; +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/NumericUtils.h b/aten/src/ATen/NumericUtils.h index f9647389dc03..858aaf7a41f1 100644 --- a/aten/src/ATen/NumericUtils.h +++ b/aten/src/ATen/NumericUtils.h @@ -19,7 +19,7 @@ namespace at { template ::value, int>::type = 0> -inline C10_HOST_DEVICE bool _isnan(T val) { +inline C10_HOST_DEVICE bool _isnan(T /*val*/) { return false; } @@ -63,7 +63,7 @@ inline C10_HOST_DEVICE bool _isnan(at::BFloat16 val) { template ::value, int>::type = 0> -inline C10_HOST_DEVICE bool _isinf(T val) { +inline C10_HOST_DEVICE bool _isinf(T /*val*/) { return false; } diff --git a/aten/src/ATen/OpMathType.h b/aten/src/ATen/OpMathType.h index b58d4779ac7a..c25944b2074f 100644 --- a/aten/src/ATen/OpMathType.h +++ b/aten/src/ATen/OpMathType.h @@ -1,7 +1,9 @@ #pragma once +#include #include #include +#include namespace at { @@ -9,8 +11,26 @@ namespace at { template struct OpMathType { using type = scalar_t; }; template<> struct OpMathType { using type = float; }; template<> struct OpMathType { using type = float; }; +template<> struct OpMathType> { using type = c10::complex; }; template using opmath_type = typename OpMathType::type; +namespace { + +c10::ScalarType toOpMathType(const c10::ScalarType type) { + switch (type) { +#define DEFINE_CASE(scalar_t, TypeNum) \ + case ScalarType::TypeNum: \ + return CppTypeToScalarType>::value; + + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_CASE) +#undef DEFINE_CASE + + default: TORCH_INTERNAL_ASSERT(false, "Unrecognized ScalarType: ", type); + } +} + +} + } // namespace at diff --git a/aten/src/ATen/OpaqueTensorImpl.h b/aten/src/ATen/OpaqueTensorImpl.h index 2c337e4a787e..63e451244a52 100644 --- a/aten/src/ATen/OpaqueTensorImpl.h +++ b/aten/src/ATen/OpaqueTensorImpl.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include @@ -29,7 +30,7 @@ struct TORCH_API OpaqueTensorImpl : public TensorImpl { : TensorImpl(key_set, 
data_type, device), opaque_handle_(std::move(opaque_handle)) { set_storage_access_should_throw(); - set_has_contiguity_policy(HasContiguityPolicy::ContiguityNotSupported); + set_sizes_strides_policy(SizesStridesPolicy::CustomStrides); sizes_and_strides_.set_sizes(sizes); refresh_numel(); is_non_overlapping_and_dense_ = is_non_overlapping_and_dense; @@ -40,14 +41,6 @@ struct TORCH_API OpaqueTensorImpl : public TensorImpl { opaque_handle_ = {}; } - IntArrayRef strides() const override { - AT_ERROR("opaque tensors do not have strides"); - } - - int64_t stride(int64_t d) const override { - AT_ERROR("opaque tensors do not have strides"); - } - void set_size(int64_t dim, int64_t new_size) override { AT_ERROR("opaque tensors do not have set_size"); } diff --git a/aten/src/ATen/ParallelNativeTBB.h b/aten/src/ATen/ParallelNativeTBB.h index 01dda99990c8..a3675056f161 100644 --- a/aten/src/ATen/ParallelNativeTBB.h +++ b/aten/src/ATen/ParallelNativeTBB.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/aten/src/ATen/PythonTorchFunctionTLS.cpp b/aten/src/ATen/PythonTorchFunctionTLS.cpp new file mode 100644 index 000000000000..ae9f722de60a --- /dev/null +++ b/aten/src/ATen/PythonTorchFunctionTLS.cpp @@ -0,0 +1,38 @@ +#include +#include + +namespace at { +namespace impl { + +static thread_local PythonTorchFunctionTLS pythonTorchFunctionState; + +void PythonTorchFunctionTLS::set_mode(std::shared_ptr mode) { + pythonTorchFunctionState.mode_ = std::move(mode); +} + +const std::shared_ptr& PythonTorchFunctionTLS::get_mode() { + return pythonTorchFunctionState.mode_; +} + +void PythonTorchFunctionTLS::swap_mode(std::shared_ptr& mode) { + pythonTorchFunctionState.mode_.swap(mode); +} + +void PythonTorchFunctionTLS::set_disabled(bool disabled) { + pythonTorchFunctionState.disabled_ = disabled; +} + +bool PythonTorchFunctionTLS::is_disabled() { + return pythonTorchFunctionState.disabled_; +} + +void PythonTorchFunctionTLS::set_state(const PythonTorchFunctionTLS& state) { + pythonTorchFunctionState = state; +} + +const PythonTorchFunctionTLS& PythonTorchFunctionTLS::get_state() { + return pythonTorchFunctionState; +} + +} // namespace impl +} // namespace at diff --git a/aten/src/ATen/PythonTorchFunctionTLS.h b/aten/src/ATen/PythonTorchFunctionTLS.h new file mode 100644 index 000000000000..64256d2f7c21 --- /dev/null +++ b/aten/src/ATen/PythonTorchFunctionTLS.h @@ -0,0 +1,26 @@ +#pragma once + +#include +#include + +namespace at { +namespace impl { + +struct TORCH_API PythonTorchFunctionTLS { + static void set_disabled(bool); + static bool is_disabled(); + + static void set_mode(std::shared_ptr); + static const std::shared_ptr& get_mode(); + static void swap_mode(std::shared_ptr&); + + static void set_state(const PythonTorchFunctionTLS& state); + static const PythonTorchFunctionTLS& get_state(); + +private: + bool disabled_; + std::shared_ptr mode_; +}; + +} // namespace impl +} // namespace at diff --git a/aten/src/ATen/ScalarOps.cpp b/aten/src/ATen/ScalarOps.cpp index 8eb10266d78f..98a38023f9b4 100644 --- a/aten/src/ATen/ScalarOps.cpp +++ b/aten/src/ATen/ScalarOps.cpp @@ -15,8 +15,8 @@ inline void fill_inplace(Tensor& self, const Scalar& value_scalar) { namespace detail { Tensor& scalar_fill(Tensor& self, const Scalar& value) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( - kHalf, kBool, kBFloat16, self.scalar_type(), "fill_out", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( + kComplexHalf, kHalf, kBool, kBFloat16, self.scalar_type(), "fill_out", [&]() { fill_inplace(self, value); 
}); return self; diff --git a/aten/src/ATen/SparseCsrTensorImpl.cpp b/aten/src/ATen/SparseCsrTensorImpl.cpp index 2029189912e6..adae0c7c0238 100644 --- a/aten/src/ATen/SparseCsrTensorImpl.cpp +++ b/aten/src/ATen/SparseCsrTensorImpl.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -23,6 +24,7 @@ DeviceType SparseCsrTensorSetToDeviceType(DispatchKeySet key_set) { SparseCsrTensorImpl::SparseCsrTensorImpl( at::DispatchKeySet key_set, + at::Layout layout, const caffe2::TypeMeta data_type) : SparseCsrTensorImpl( key_set, @@ -44,6 +46,8 @@ SparseCsrTensorImpl::SparseCsrTensorImpl( at::initialTensorOptions() .device(SparseCsrTensorSetToDeviceType(key_set)) .dtype(data_type)) // values + , + layout ) {} SparseCsrTensorImpl::SparseCsrTensorImpl( @@ -51,30 +55,54 @@ SparseCsrTensorImpl::SparseCsrTensorImpl( const caffe2::TypeMeta data_type, at::Tensor crow_indices, at::Tensor col_indices, - at::Tensor values) + at::Tensor values, + at::Layout layout) : TensorImpl(key_set, data_type, values.device()), crow_indices_(std::move(crow_indices)), col_indices_(std::move(col_indices)), - values_(std::move(values)) { + values_(std::move(values)), + layout_(layout) { + // https://pytorch.org/blog/pytorch-feature-classification-changes/#beta + TORCH_WARN_ONCE("Sparse ", at::sparse_csr::layoutToString(layout_, /*upper=*/true), " tensor support is in beta state." + "If you miss a functionality in the sparse tensor support, please submit a feature request " + "to https://github.com/pytorch/pytorch/issues."); set_storage_access_should_throw(); + is_non_overlapping_and_dense_ = false; + set_sizes_strides_policy(SizesStridesPolicy::CustomStrides); + // TODO: If this check ever shows up as a bottleneck, which is unlikely given that + // comparing devices only involves comparing the type and index (two integers), we + // can move this to a DEBUG only assert. Until then this confirms and maintains a + // crucial invariance. 
+ TORCH_CHECK(values_.device() == crow_indices_.device(), "Values and crow_indices need to be on the same device."); + TORCH_CHECK(values_.device() == col_indices_.device(), "Values and col_indices need to be on the same device."); +} + +const char* SparseCsrTensorImpl::tensorimpl_type_name() const { + return "SparseCsrTensorImpl"; } void SparseCsrTensorImpl::resize_(int64_t nnz, IntArrayRef size) { - auto rows = size[0]; - auto cols = size[1]; + auto rows = size[size.size() - 2]; + auto cols = size[size.size() - 1]; auto old_crow_indices_size = crow_indices_.size(-1); - crow_indices_.resize_({rows + 1}); + + auto new_crow_indices_size = DimVector(size.slice(0, size.size() - 2)); + new_crow_indices_size.push_back(rows + 1); + crow_indices_.resize_(new_crow_indices_size); if (rows + 1 >= old_crow_indices_size) { crow_indices_.narrow(-1, old_crow_indices_size, rows + 1 - old_crow_indices_size).fill_(nnz); } else { crow_indices_.narrow(-1, rows, 1).fill_(std::min(nnz, rows*cols)); } - col_indices_.resize_({std::min(nnz, rows*cols)}); - values_.resize_({std::min(nnz, rows*cols)}); + auto col_indices_values_size = DimVector(size.slice(0, size.size() - 2)); + col_indices_values_size.push_back(std::min(nnz, rows*cols)); + col_indices_.resize_(col_indices_values_size); + values_.resize_(col_indices_values_size); sizes_and_strides_.set_sizes(size); } void SparseCsrTensorImpl::resize_as_sparse_csr_tensor_(const Tensor& src) { + set_layout(src.layout()); crow_indices_ = at::empty_like( src.crow_indices(), src.crow_indices().options(), @@ -112,5 +140,25 @@ void SparseCsrTensorImpl::set_member_tensors( sizes_and_strides_.set_sizes(size); refresh_numel(); + // TODO: If this check ever shows up as a bottleneck, which is unlikely given that + // comparing devices only involves comparing the type and index (two integers), we + // can move this to a DEBUG only assert. Until then this confirms and maintains a + // crucial invariance. 
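  // Example of what this invariant rules out (a sketch; crow_cpu/col_cpu are CPU index
  // tensors and values_cuda lives on a CUDA device):
  //
  //   auto bad = at::sparse_csr_tensor(crow_cpu, col_cpu, values_cuda, size, options);
  //
  // This is expected to throw rather than silently produce a tensor whose indices and
  // values are on different devices.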
+ TORCH_CHECK(values_.device() == crow_indices_.device(), "Values and crow_indices need to be on the same device."); + TORCH_CHECK(values_.device() == col_indices_.device(), "Values and col_indices need to be on the same device."); +} + +IntArrayRef SparseCsrTensorImpl::strides_custom() const { + TORCH_CHECK(false, "Sparse ", at::sparse_csr::layoutToString(layout_, /*upper=*/true), " tensors do not have strides"); +} +void SparseCsrTensorImpl::set_size(int64_t dim, int64_t new_size) { + TORCH_CHECK(false, "Sparse ", at::sparse_csr::layoutToString(layout_, /*upper=*/true), " tensors do not have set_size."); +} +void SparseCsrTensorImpl::set_stride(int64_t dim, int64_t new_stride) { + TORCH_CHECK(false, "Sparse ", at::sparse_csr::layoutToString(layout_, /*upper=*/true), " tensors do not have set_stride."); } +void SparseCsrTensorImpl::set_storage_offset(int64_t storage_offset) { + TORCH_CHECK(false, "Sparse ", at::sparse_csr::layoutToString(layout_, /*upper=*/true), " tensors do not have set_storage_offset."); +} + } // namespace at diff --git a/aten/src/ATen/SparseCsrTensorImpl.h b/aten/src/ATen/SparseCsrTensorImpl.h index 850e0a02a448..174ce53a2dad 100644 --- a/aten/src/ATen/SparseCsrTensorImpl.h +++ b/aten/src/ATen/SparseCsrTensorImpl.h @@ -28,9 +28,10 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl { Tensor crow_indices_; Tensor col_indices_; Tensor values_; + Layout layout_; public: - explicit SparseCsrTensorImpl(at::DispatchKeySet, const caffe2::TypeMeta); + explicit SparseCsrTensorImpl(at::DispatchKeySet, Layout layout, const caffe2::TypeMeta); void resize_(int64_t nnz, IntArrayRef size); void resize_as_sparse_csr_tensor_(const Tensor& src); @@ -40,10 +41,31 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl { const Tensor& values, IntArrayRef size); - const Tensor& crow_indices() const { return crow_indices_; } - const Tensor& col_indices() const { return col_indices_; } + const Tensor& compressed_indices() const { return crow_indices_; } + const Tensor& plain_indices() const { return col_indices_; } const Tensor& values() const { return values_; } - int nnz() { return values_.size(0); } + int nnz() { return col_indices_.size(-1); } + + protected: + IntArrayRef strides_custom() const override; + + public: + void set_size(int64_t dim, int64_t new_size) override; + void set_stride(int64_t dim, int64_t new_stride) override; + void set_storage_offset(int64_t storage_offset) override; + Layout layout_impl() const override { return layout_; } + void set_layout(Layout layout) { + switch (layout) { + case kSparseCsr: + case kSparseCsc: + case kSparseBsr: + case kSparseBsc: + layout_ = layout; + break; + default: + TORCH_CHECK(false, "unsupported layout ", layout); + } + } /** * Return a TensorImpl that is a shallow-copy of this TensorImpl. 
@@ -54,7 +76,7 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl { c10::intrusive_ptr shallow_copy_and_detach( const c10::VariableVersion& version_counter, bool allow_tensor_metadata_change) const override { - auto impl = c10::make_intrusive(key_set(), dtype()); + auto impl = c10::make_intrusive(key_set(), layout_impl(), dtype()); copy_tensor_metadata( /*src_impl=*/this, /*dest_impl=*/impl.get(), @@ -73,7 +95,7 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl { c10::intrusive_ptr shallow_copy_and_detach( c10::VariableVersion&& version_counter, bool allow_tensor_metadata_change) const override { - auto impl = c10::make_intrusive(key_set(), dtype()); + auto impl = c10::make_intrusive(key_set(), layout_impl(), dtype()); copy_tensor_metadata( /*src_impl=*/this, /*dest_impl=*/impl.get(), @@ -89,7 +111,10 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl { const caffe2::TypeMeta data_type, at::Tensor crow_indices, at::Tensor col_indices, - at::Tensor values); + at::Tensor values, + at::Layout layout); + + const char* tensorimpl_type_name() const override; /** * Copy the tensor metadata fields (e.g. sizes / strides / storage pointer / storage_offset) @@ -105,9 +130,10 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl { TensorImpl::copy_tensor_metadata(src_sparse_impl, dest_sparse_impl, version_counter, allow_tensor_metadata_change); // Sparse-specific fields - dest_sparse_impl->crow_indices_ = src_sparse_impl->crow_indices(); - dest_sparse_impl->col_indices_ = src_sparse_impl->col_indices(); + dest_sparse_impl->crow_indices_ = src_sparse_impl->compressed_indices(); + dest_sparse_impl->col_indices_ = src_sparse_impl->plain_indices(); dest_sparse_impl->values_ = src_sparse_impl->values(); + dest_sparse_impl->layout_ = src_sparse_impl->layout_impl(); } }; } // namespace at diff --git a/aten/src/ATen/SparseCsrTensorUtils.h b/aten/src/ATen/SparseCsrTensorUtils.h index 6dd328003ca8..dfc7ff881304 100644 --- a/aten/src/ATen/SparseCsrTensorUtils.h +++ b/aten/src/ATen/SparseCsrTensorUtils.h @@ -5,16 +5,151 @@ #include #include +#define AT_DISPATCH_ALL_SPARSE_COMPRESSED_LAYOUTS(LAYOUT, NAME, ...) 
\ + [&] { \ + const auto& the_layout = LAYOUT; \ + switch (the_layout) { \ + case kSparseCsr: \ + case kSparseCsc: \ + case kSparseBsr: \ + case kSparseBsc: \ + return __VA_ARGS__(); \ + default: \ + AT_ERROR(#NAME, " expected sparse compressed tensor layout but got ", the_layout); \ + } \ + } () + +#define AT_DISPATCH_ROW_SPARSE_COMPRESSED_LAYOUTS(LAYOUT, NAME, ROW_DIM_ACTION, COLUMN_DIM_ACTION) \ + [&]() { \ + const auto& the_layout = LAYOUT; \ + switch (the_layout) { \ + case kSparseCsr: \ + case kSparseBsr: \ + return (ROW_DIM_ACTION)(); \ + case kSparseCsc: \ + case kSparseBsc: \ + return (COLUMN_DIM_ACTION)(); \ + default: \ + AT_ERROR(#NAME, " expected sparse compressed tensor layout but got ", the_layout); \ + } \ + } () + +#define AT_DISPATCH_PLAIN_SPARSE_COMPRESSED_LAYOUTS(LAYOUT, NAME, NO_BLOCK_ACTION, BLOCK_ACTION) \ + [&]() { \ + const auto& the_layout = LAYOUT; \ + switch (the_layout) { \ + case kSparseCsr: \ + case kSparseCsc: \ + return (NO_BLOCK_ACTION)(); \ + case kSparseBsr: \ + case kSparseBsc: \ + return (BLOCK_ACTION)(); \ + default: \ + AT_ERROR(#NAME, " expected sparse compressed tensor layout but got ", the_layout); \ + } \ + } () + +#define AT_DISPATCH_SPARSE_ROW_COMPRESSED_LAYOUTS(LAYOUT, NAME, ROW_DIM_ACTION) \ + [&]() { \ + const auto& the_layout = LAYOUT; \ + switch (the_layout) { \ + case kSparseCsr: \ + case kSparseBsr: \ + return (ROW_DIM_ACTION)(); \ + default: \ + AT_ERROR(#NAME, " expected sparse row compressed tensor layout but got ", the_layout); \ + } \ + } () + +#define AT_DISPATCH_SPARSE_COL_COMPRESSED_LAYOUTS(LAYOUT, NAME, COL_DIM_ACTION) \ + [&]() { \ + const auto& the_layout = LAYOUT; \ + switch (the_layout) { \ + case kSparseCsc: \ + case kSparseBsc: \ + return (COL_DIM_ACTION)(); \ + default: \ + AT_ERROR(#NAME, " expected sparse column compressed tensor layout but got ", the_layout); \ + } \ + } () + +#define AT_DISPATCH_SPARSE_COMPRESSED_NONBLOCK_LAYOUTS(LAYOUT, NAME, ACTION) \ + [&]() { \ + const auto& the_layout = LAYOUT; \ + switch (the_layout) { \ + case kSparseCsr: \ + case kSparseCsc: \ + return (ACTION)(); \ + default: \ + AT_ERROR(#NAME, " expected sparse compressed (non-block) tensor layout but got ", the_layout); \ + } \ + } () + +#define AT_DISPATCH_SPARSE_COMPRESSED_BLOCK_LAYOUTS(LAYOUT, NAME, ACTION) \ + [&]() { \ + const auto& the_layout = LAYOUT; \ + switch (the_layout) { \ + case kSparseBsr: \ + case kSparseBsc: \ + return (ACTION)(); \ + default: \ + AT_ERROR(#NAME, " expected sparse compressed block tensor layout but got ", the_layout); \ + } \ + } () + namespace at { namespace sparse_csr { using SparseCsrTensor = Tensor; inline SparseCsrTensorImpl* get_sparse_csr_impl(const SparseCsrTensor& self) { - AT_ASSERTM( - self.is_sparse_csr(), - "_internal_get_SparseCsrTensorImpl: not a sparse CSR tensor"); + AT_DISPATCH_ALL_SPARSE_COMPRESSED_LAYOUTS(self.layout(), "get_sparse_csr_impl", [&] {}); return static_cast(self.unsafeGetTensorImpl()); } -} // namespace sparse + +inline std::string layoutToString(Layout layout, bool upper=false, bool lower=false) { + switch (layout) { + case kSparseCsr: return (upper ? "CSR" : (lower ? "csr" : "Csr")); + case kSparseCsc: return (upper ? "CSC" : (lower ? "csc" : "Csc")); + case kSparseBsr: return (upper ? "BSR" : (lower ? "bsr" : "Bsr")); + case kSparseBsc: return (upper ? "BSC" : (lower ? 
"bsc" : "Bsc")); + default: + TORCH_CHECK(false, "Not a sparse compressed layout:", layout); + return ""; + } +} + +inline bool isCompressedRow(Layout layout) { + return AT_DISPATCH_ROW_SPARSE_COMPRESSED_LAYOUTS(layout, "isCompressedRow", [&]{ return true; }, [&]{ return false; }); +} + +inline bool isCompressedColumn(Layout layout) { + return AT_DISPATCH_ROW_SPARSE_COMPRESSED_LAYOUTS(layout, "isCompressedColumn", [&]{ return false; }, [&]{ return true; }); +} + +inline std::string compressedIndicesName(Layout layout) { + return AT_DISPATCH_ROW_SPARSE_COMPRESSED_LAYOUTS(layout, "compressedIndicesName", [&]{ return "crow_indices"; }, [&]{ return "ccol_indices"; }); +} + +inline std::string plainIndicesName(Layout layout) { + return AT_DISPATCH_ROW_SPARSE_COMPRESSED_LAYOUTS(layout, "plainIndicesName", [&]{ return "col_indices"; }, [&]{ return "row_indices"; }); +} + +inline int rowDimension(Layout layout, IntArrayRef size) { + return size.size() - (isCompressedRow(layout) ? 2 : 1); +} + +inline int columnDimension(Layout layout, IntArrayRef size) { + return size.size() - (isCompressedColumn(layout) ? 2 : 1); +} + +inline int compressedDimension(Layout layout, IntArrayRef size) { + return size.size() - (isCompressedRow(layout) ? 2 : 1); +} + +inline int plainDimension(Layout layout, IntArrayRef size) { + return size.size() - (isCompressedRow(layout) ? 1 : 2); +} + +} // namespace sparse_csr } // namespace at diff --git a/aten/src/ATen/SparseTensorImpl.cpp b/aten/src/ATen/SparseTensorImpl.cpp index e144ffd479d6..61303866c450 100644 --- a/aten/src/ATen/SparseTensorImpl.cpp +++ b/aten/src/ATen/SparseTensorImpl.cpp @@ -51,7 +51,7 @@ SparseTensorImpl::SparseTensorImpl(at::DispatchKeySet key_set, const caffe2::Typ is_non_overlapping_and_dense_ = false; set_storage_access_should_throw(); - set_has_contiguity_policy(HasContiguityPolicy::ContiguityNotSupported); + set_sizes_strides_policy(SizesStridesPolicy::CustomStrides); } void SparseTensorImpl::release_resources() { @@ -60,12 +60,6 @@ void SparseTensorImpl::release_resources() { indices_.reset(); } -IntArrayRef SparseTensorImpl::strides() const { - AT_ERROR("sparse tensors do not have strides"); -} -int64_t SparseTensorImpl::stride(int64_t d) const { - AT_ERROR("sparse tensors do not have strides"); -} void SparseTensorImpl::set_size(int64_t dim, int64_t new_size) { AT_ERROR("sparse tensors do not have set_size"); } diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h index a52236d9369b..7381540ea3ff 100644 --- a/aten/src/ATen/SparseTensorImpl.h +++ b/aten/src/ATen/SparseTensorImpl.h @@ -54,8 +54,6 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { Tensor indices() const { return indices_; } Tensor values() const { return values_; } - IntArrayRef strides() const override; - int64_t stride(int64_t d) const override; void set_size(int64_t dim, int64_t new_size) override; void set_stride(int64_t dim, int64_t new_stride) override; void set_storage_offset(int64_t storage_offset) override; diff --git a/aten/src/ATen/SparseTensorUtils.cpp b/aten/src/ATen/SparseTensorUtils.cpp index d5811b933e7c..712e85e851be 100644 --- a/aten/src/ATen/SparseTensorUtils.cpp +++ b/aten/src/ATen/SparseTensorUtils.cpp @@ -30,7 +30,7 @@ Tensor flatten_indices(const Tensor& indices, IntArrayRef full_size, bool force_ } } else { std::vector indices_mult_cpu_vec; - indices_mult_cpu_vec.reserve(sparse_dim); + indices_mult_cpu_vec.resize(sparse_dim); int64_t mult = 1; for (int64_t i = sparse_dim - 1; i >= 0; i--) { indices_mult_cpu_vec[i] = 
mult; diff --git a/aten/src/ATen/Tensor.h b/aten/src/ATen/Tensor.h index 1dfb8bb4ffcb..0b3719cca3bf 100644 --- a/aten/src/ATen/Tensor.h +++ b/aten/src/ATen/Tensor.h @@ -1,3 +1,3 @@ #pragma once -#include +#include diff --git a/aten/src/ATen/TensorGeometry.cpp b/aten/src/ATen/TensorGeometry.cpp index 20ab6bb6690c..164a7b279129 100644 --- a/aten/src/ATen/TensorGeometry.cpp +++ b/aten/src/ATen/TensorGeometry.cpp @@ -1,10 +1,30 @@ #include -#include -#include +#include +#include namespace at { +// See TensorGeometry.h on why this is useful now that we cache is_contiguous. +bool geometry_is_contiguous(IntArrayRef sizes, IntArrayRef strides) { + assert(!overflows(sizes.size())); + auto dim = static_cast(sizes.size()); + int64_t expected_stride = 1; + bool contig_if_nonempty = true; + for (int64_t i = dim - 1; i >= 0; i--) { + if (sizes[i] == 0) { + return true; + } + if (contig_if_nonempty) { + if (sizes[i] != 1 && strides[i] != expected_stride) { + contig_if_nonempty = false; + } + expected_stride *= sizes[i]; + } + } + return contig_if_nonempty; +} + bool TensorGeometry::is_contiguous() const { if (numel_ == 0) { return true; diff --git a/aten/src/ATen/TensorGeometry.h b/aten/src/ATen/TensorGeometry.h index ad3e16da4a6a..7762cc94df61 100644 --- a/aten/src/ATen/TensorGeometry.h +++ b/aten/src/ATen/TensorGeometry.h @@ -1,10 +1,17 @@ #pragma once -#include -#include +#include +#include namespace at { +// Return if the tensor geometry represented by `sizes` and `strides` is contiguous +// Although we cache is_contiguous in tensor now, this is till useful because it +// allows checking if a particular geometry is contiguous without explicitly +// constructing a tensor, e.g., when you want to choose a kernel strategy based +// on whether a subgeometry is contiguous. 
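The use case described in the comment above looks roughly like the sketch below; the slicing logic and the function name are hypothetical, while `at::geometry_is_contiguous` and the `TensorBase` accessors are the real APIs from this patch:

```cpp
#include <ATen/TensorGeometry.h>
#include <ATen/core/TensorBase.h>

// Hypothetical kernel-strategy check: is the sub-geometry formed by the last
// two dimensions contiguous? No temporary tensor is constructed.
bool inner_matrix_is_contiguous(const at::TensorBase& t) {
  const auto sizes = t.sizes();
  const auto strides = t.strides();
  TORCH_CHECK(sizes.size() >= 2, "expected at least 2 dimensions");
  return at::geometry_is_contiguous(sizes.slice(sizes.size() - 2),
                                    strides.slice(strides.size() - 2));
}
```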
+TORCH_API bool geometry_is_contiguous(IntArrayRef sizes, IntArrayRef strides); + struct TORCH_API TensorGeometry { TensorGeometry() : storage_offset_(0) {} @@ -21,7 +28,7 @@ struct TORCH_API TensorGeometry { numel_ = expected_stride; } - explicit TensorGeometry(const Tensor& t) + explicit TensorGeometry(const TensorBase& t) : sizes_(t.sizes().vec()) , strides_(t.strides().vec()) , storage_offset_(t.storage_offset()) @@ -32,12 +39,12 @@ struct TORCH_API TensorGeometry { int64_t dim() const { return sizes_.size(); } int64_t size(int64_t dim) const { - dim = maybe_wrap_dim(dim, this->dim()); + dim = c10::maybe_wrap_dim(dim, this->dim()); return sizes_.at(static_cast(dim)); } IntArrayRef sizes() const { return IntArrayRef{ sizes_ }; } int64_t stride(int64_t dim) const { - dim = maybe_wrap_dim(dim, this->dim()); + dim = c10::maybe_wrap_dim(dim, this->dim()); return strides_.at(static_cast(dim)); } IntArrayRef strides() const { return IntArrayRef{ strides_ }; } diff --git a/aten/src/ATen/TensorIndexing.h b/aten/src/ATen/TensorIndexing.h index 71c9c3feb9e7..8352b510f609 100644 --- a/aten/src/ATen/TensorIndexing.h +++ b/aten/src/ATen/TensorIndexing.h @@ -217,7 +217,7 @@ static inline Tensor applySelect( int64_t dim, int64_t index, int64_t real_dim, - const at::Device& self_device, + const at::Device& /*self_device*/, const IntArrayRef& self_sizes) { TORCH_CHECK_INDEX( !(index == 0 && dim == 0 && self_sizes.size() == 0), @@ -272,7 +272,7 @@ static inline void recordTensorIndex(const Tensor& tensor, std::vector& (*dim_ptr)++; }; -static inline c10::List> typeConvertIndices(const Tensor& self, std::vector&& indices) { +static inline c10::List> typeConvertIndices(const Tensor& /*self*/, std::vector&& indices) { c10::List> converted_inds; converted_inds.reserve(indices.size()); for (const auto &i: indices){ diff --git a/aten/src/ATen/TensorIterator.cpp b/aten/src/ATen/TensorIterator.cpp index f978456754d9..907ec8c5c57d 100644 --- a/aten/src/ATen/TensorIterator.cpp +++ b/aten/src/ATen/TensorIterator.cpp @@ -1,3 +1,4 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #define TORCH_ASSERT_NO_OPERATORS #include #undef TORCH_ASSERT_NO_OPERATORS @@ -13,6 +14,13 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif + #include #include @@ -564,19 +572,19 @@ void TensorIteratorBase::allocate_or_resize_outputs() { // can just return contiguous output // it is faster because it avoids allocating 0 size tensor and // resizing and restriding it - set_output(i, tensor_shape, {}, original_options(op), names_); + set_output_raw_strided(i, tensor_shape, {}, original_options(op), names_); } else { auto tensor_stride = invert_perm(op.stride_bytes); for (const auto dim : c10::irange(ndim())) { tensor_stride[dim] /= element_size; } - set_output(i, tensor_shape, tensor_stride, original_options(op), names_); + set_output_raw_strided(i, tensor_shape, tensor_stride, original_options(op), names_); } op.current_dtype = op.target_dtype; } else if (op.tensor_base().defined()) { // Even if we don't resize, we still need to tell set_output about // the output, so that we properly set guard and propagate names - set_output(i, op.tensor_base().sizes(), {}, original_options(op), names_); + set_output_raw_strided(i, op.tensor_base().sizes(), {}, original_options(op), names_); } } } @@ -1326,7 +1334,7 @@ bool TensorIteratorBase::fast_set_up(const TensorIteratorConfig& config) { if (!op.tensor_base().defined()) { TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i); } - 
set_output(i, shape_, {}, original_options(op).memory_format(MemoryFormat::Contiguous), names_); + set_output_raw_strided(i, shape_, {}, original_options(op).memory_format(MemoryFormat::Contiguous), names_); } break; } @@ -1337,7 +1345,7 @@ bool TensorIteratorBase::fast_set_up(const TensorIteratorConfig& config) { if (!op.tensor_base().defined()) { TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i); } - set_output(i, shape_, {}, original_options(op).memory_format(MemoryFormat::ChannelsLast), names_); + set_output_raw_strided(i, shape_, {}, original_options(op).memory_format(MemoryFormat::ChannelsLast), names_); } break; } @@ -1354,7 +1362,7 @@ bool TensorIteratorBase::fast_set_up(const TensorIteratorConfig& config) { if (!op.tensor_base().defined()) { TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i); } - set_output(i, shape_, tensor_base(i_defined).strides(), original_options(op), names_); + set_output_raw_strided(i, shape_, tensor_base(i_defined).strides(), original_options(op), names_); } break; } @@ -1485,8 +1493,10 @@ void TensorIteratorBase::build(TensorIteratorConfig& config) { // Nothing beyond this point is important for meta functions, so it's fine to exit early here. // Extend the condition to ORT tesnors as ORT tensors also don't have storage. if (common_device_.type() == DeviceType::XLA || + common_device_.type() == DeviceType::IPU || common_device_.type() == DeviceType::Lazy || - common_device_.type() == DeviceType::ORT) return; + common_device_.type() == DeviceType::ORT || + common_device_.type() == DeviceType::HPU) return; for (auto& op : operands_) { TORCH_INTERNAL_ASSERT(op.tensor_base().defined()); @@ -1501,14 +1511,14 @@ void TensorIteratorBase::build(TensorIteratorConfig& config) { view_offsets_ = DimVector(ndim_offsets, 0); } -// This is the structured kernels implementation of set_output. It is +// This is the structured kernels' implementation of set_output. It is // NEVER actually called directly; instead, a subclass of TensorIteratorBase // will override set_output to actually do the operation, and then call // set_output on the TensorIteratorBase to setup TI's metadata. // The precondition for this function is that maybe_get_output() now // unconditionally returns a real Tensor (prior to output setting, // this function may return an undefined tensor.) -void TensorIteratorBase::set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) { +void TensorIteratorBase::set_output_raw_strided(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) { auto& op = operands_[output_idx]; TORCH_INTERNAL_ASSERT_DEBUG_ONLY(output_idx < num_outputs_); const auto& t = maybe_get_output(output_idx); @@ -1575,7 +1585,7 @@ void TensorIteratorBase::set_output(int64_t output_idx, IntArrayRef sizes, IntAr // This is the "traditional" implementation of set_output. On TensorIterator // instances, it is invoked directly from various call sites in this file. No // funny business. 
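To make the override relationship concrete after the rename: a subclass allocates the real output in its `set_output_raw_strided` and then calls the `TensorIteratorBase` version so TI's metadata stays in sync. The class below is an illustrative sketch, not part of the patch; the generated structured-kernel classes follow the same shape:

```cpp
#include <ATen/ATen.h>
#include <ATen/TensorIterator.h>

struct MyIterator final : public at::TensorIteratorBase {
  at::Tensor out_;

  const at::Tensor& maybe_get_output(int64_t /*output_idx*/) override {
    return out_;
  }

  void set_output_raw_strided(int64_t output_idx, at::IntArrayRef sizes,
                              at::IntArrayRef strides, at::TensorOptions options,
                              at::DimnameList names) override {
    // Materialize the real output first...
    if (!out_.defined()) {
      out_ = at::empty(sizes, options);
    }
    // ...then let TensorIteratorBase record it, so that maybe_get_output()
    // now returns a defined tensor (the precondition described above).
    at::TensorIteratorBase::set_output_raw_strided(
        output_idx, sizes, strides, options, names);
  }
};
```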
-void TensorIterator::set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) { +void TensorIterator::set_output_raw_strided(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) { // NB: intentionally no superclass call auto& op = operands_[output_idx]; TORCH_INTERNAL_ASSERT_DEBUG_ONLY(output_idx < num_outputs_); diff --git a/aten/src/ATen/TensorIterator.h b/aten/src/ATen/TensorIterator.h index 1c485e84f16d..d09f6e77e95a 100644 --- a/aten/src/ATen/TensorIterator.h +++ b/aten/src/ATen/TensorIterator.h @@ -415,7 +415,7 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { return true; } - void set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) override; + void set_output_raw_strided(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) override; #define TORCH_DISALLOW_TEMPORARIES_IMPL(methodname, maybestatic) \ maybestatic void methodname(TensorBase&& out, const TensorBase& a, const TensorBase& b) = delete; \ @@ -591,7 +591,7 @@ struct TORCH_API TensorIterator final : public TensorIteratorBase { #undef TORCH_DISALLOW_TEMPORARIES_IMPL const Tensor& maybe_get_output(int64_t output_idx) override; - void set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) override; + void set_output_raw_strided(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) override; }; class TORCH_API TensorIteratorConfig final { diff --git a/aten/src/ATen/TensorMeta.h b/aten/src/ATen/TensorMeta.h index 128bb67aa970..5608046db598 100644 --- a/aten/src/ATen/TensorMeta.h +++ b/aten/src/ATen/TensorMeta.h @@ -2,6 +2,7 @@ #include #include +#include #include C10_CLANG_DIAGNOSTIC_PUSH() @@ -62,7 +63,10 @@ namespace impl { // // A notable subclass of this interface is TensorIteratorBase. struct TORCH_API MetaBase { - virtual void set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) = 0; + virtual void set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) { + set_output_raw_strided(output_idx, sizes, strides, options, names); + } + virtual const Tensor& maybe_get_output(int64_t output_idx) = 0; void set_output(IntArrayRef sizes, TensorOptions options) { set_output(0, sizes, {}, options, {}); @@ -70,6 +74,59 @@ struct TORCH_API MetaBase { void set_output(int64_t output_idx, IntArrayRef sizes, TensorOptions options) { set_output(output_idx, sizes, {}, options, {}); } + + // See: https://github.com/pytorch/pytorch/issues/69813 + // Whenever defining the output properties in the META function of a structured + // kernel (what was usually done with `set_output`), use one of these 3 variants, + // instead. In order to decide which variant to use, check the following + // decision tree: + // + // - Can the kernel you are going to implement support output tensors + // with arbitrary strides? + // | + // -- YES: `set_output_raw_strided` + // | + // -- NO: Should the output tensor strides be contiguous? + // | + // -- YES: `set_output_contiguous` + // | + // -- NO: `set_output_strided` + // + // Use this function whenever the kernel requires specific strides for the output. 
+ // If `strides` does not match the given output strides, proxy outputs will be + // created and passed to the IMPL function. + virtual void set_output_strided( + int64_t output_idx, + IntArrayRef sizes, + IntArrayRef strides, + TensorOptions options, + DimnameList names = {}) { + TORCH_INTERNAL_ASSERT(false, "set_output_strided not implemented."); + } + + // Use this function whenever the kernel knows how to handle arbitrary strided outputs. + // This function has the same behavior as the old `set_output`: it will only + // re-stride if the given output was resized. + virtual void set_output_raw_strided( + int64_t output_idx, + IntArrayRef sizes, + IntArrayRef strides_hint, + TensorOptions options, + DimnameList names = {}) { + TORCH_INTERNAL_ASSERT(false, "set_output_strided not implemented."); + } + + // Use this function if the kernel requires contiguous strides. + // Alias for `set_output_strided`, but with contiguous strides. + void set_output_contiguous( + int64_t output_idx, + IntArrayRef sizes, + TensorOptions options, + DimnameList names = {}) { + auto strides = c10::contiguous_strides(sizes); + set_output_strided(output_idx, sizes, strides, options, names); + } + // Returns a reference to an undefined tensor if there is no presupplied // output const Tensor& maybe_get_output() { return maybe_get_output(0); } diff --git a/aten/src/ATen/TensorSubclassLikeUtils.h b/aten/src/ATen/TensorSubclassLikeUtils.h index 7f5517bc0811..e9f5e7d26e11 100644 --- a/aten/src/ATen/TensorSubclassLikeUtils.h +++ b/aten/src/ATen/TensorSubclassLikeUtils.h @@ -28,8 +28,7 @@ constexpr auto kFunctorchWrappedTensors = DispatchKeySet({ constexpr auto kTensorSubclassLike = kFunctorchWrappedTensors | DispatchKeySet({ DispatchKey::Batched, - DispatchKey::SparseCPU, - DispatchKey::SparseCUDA, + DispatchKey::Sparse, DispatchKey::SparseCsrCPU, DispatchKey::SparseCsrCUDA, DispatchKey::Meta, diff --git a/aten/src/ATen/TensorUtils.cpp b/aten/src/ATen/TensorUtils.cpp index 754c73bb6154..7fbddd7a3482 100644 --- a/aten/src/ATen/TensorUtils.cpp +++ b/aten/src/ATen/TensorUtils.cpp @@ -129,15 +129,15 @@ void checkAllSameNumel(CheckedFrom c, ArrayRef tensors) { } void checkSameGPU(CheckedFrom c, const TensorArg& t1, const TensorArg& t2) { - if (! (t1->is_cuda()) || ! (t2->is_cuda())) { + if (t1->is_cpu() || t2->is_cpu()) { std::ostringstream oss; - if (! t1->is_cuda()) { + if (t1->is_cpu()) { oss << "Tensor for " << t1 << " is on CPU, "; } - if (! t2->is_cuda()) { + if (t2->is_cpu()) { oss << "Tensor for " << t2 << " is on CPU, "; } - oss << "but expected " << ((!(t1->is_cuda() || t2->is_cuda())) ? "them" : "it") + oss << "but expected " << ((!t1->is_cpu() && !t2->is_cpu()) ? "them" : "it") << " to be on GPU (while checking arguments for " << c << ")"; AT_ERROR(oss.str()); } @@ -264,25 +264,6 @@ void * maybe_data_ptr(const TensorArg& tensor) { return tensor->defined() ? (void *)tensor->data_ptr() : nullptr; } -// See TensorUtils.h on why this is useful now that we cache is_contiguous. 
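Returning to the `set_output_*` decision tree added in TensorMeta.h above: in a structured kernel's META function it plays out roughly as follows. The operator names are invented and assume the usual structured-kernel codegen boilerplate exists for them; only the `set_output_raw_strided` and `set_output_contiguous` calls come from this patch.

```cpp
namespace at { namespace meta {

// Kernel whose IMPL can write through arbitrary output strides: keep the
// old `set_output` behavior (only re-stride the output if it was resized).
TORCH_META_FUNC(my_elementwise_op)(const Tensor& self) {
  set_output_raw_strided(0, self.sizes(), {}, self.options());
}

// Kernel whose IMPL assumes a contiguous output: if a user-supplied `out=`
// tensor has different strides, a contiguous proxy is created and the result
// is copied back after the IMPL runs.
TORCH_META_FUNC(my_reduction_op)(const Tensor& self) {
  set_output_contiguous(0, self.sizes(), self.options());
}

}} // namespace at::meta
```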
-bool geometry_is_contiguous(IntArrayRef sizes, IntArrayRef strides) { - int64_t dim = sizes.size(); - int64_t expected_stride = 1; - bool contig_if_nonempty = true; - for (int64_t i = dim - 1; i >= 0; i--) { - if (sizes[i] == 0) { - return true; - } - if (contig_if_nonempty) { - if (sizes[i] != 1 && strides[i] != expected_stride) { - contig_if_nonempty = false; - } - expected_stride *= sizes[i]; - } - } - return contig_if_nonempty; -} - void check_dim_size( const Tensor& tensor, int64_t dim, diff --git a/aten/src/ATen/TensorUtils.h b/aten/src/ATen/TensorUtils.h index f018c33f1aea..e8adf16ca183 100644 --- a/aten/src/ATen/TensorUtils.h +++ b/aten/src/ATen/TensorUtils.h @@ -138,13 +138,6 @@ TORCH_API void checkLayout(CheckedFrom c, at::ArrayRef tensors, at::Layo TORCH_API void* maybe_data_ptr(const Tensor& tensor); TORCH_API void* maybe_data_ptr(const TensorArg& tensor); -// Return if the tensor geometry represented by `sizes` and `strides` is contiguous -// Although we cache is_contiguous in tensor now, this is till useful because it -// allows checking if a particular geometry is contiguous without explicitly -// constructing a tensor, e.g., when you want to choose a kernel strategy based -// on whether a subgeometry is contiguous. -TORCH_API bool geometry_is_contiguous(IntArrayRef sizes, IntArrayRef strides); - TORCH_API void check_dim_size( const Tensor& tensor, int64_t dim, diff --git a/aten/src/ATen/ThreadLocalState.cpp b/aten/src/ATen/ThreadLocalState.cpp index 3e3d4d6a9573..8315ddad97b2 100644 --- a/aten/src/ATen/ThreadLocalState.cpp +++ b/aten/src/ATen/ThreadLocalState.cpp @@ -13,13 +13,13 @@ ThreadLocalState::ThreadLocalState() : dispatch_key_(c10::impl::tls_local_dispatch_key_set()), debug_info_(c10::ThreadLocalDebugInfo::current()), functorch_tls_(functorch::getCopyOfFuncTorchTLS()), - autograd_tls_(c10::AutogradState::get_tls_state()) { + autograd_tls_(c10::AutogradState::get_tls_state()), + python_torch_function_state_(at::impl::PythonTorchFunctionTLS::get_state()) { rf_tls_ = at::get_record_function_tls_(); saved_tensors_default_hooks_ = at::SavedTensorDefaultHooks::get_stack(); - bumped_record_all_functions_ = at::checkRecordAllFunctions(); - python_mode_state_ = at::impl::PythonModeTLS::get_state(); + torch_dispatch_mode_state_ = at::impl::TorchDispatchModeTLS::get_state(); } void ThreadLocalState::set_grad_mode(bool enabled) { @@ -33,7 +33,9 @@ void ThreadLocalState::setThreadLocalState( // restore the dispatch key set TLS at the same time. 
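For context, the `ThreadLocalState` / `ThreadLocalStateGuard` pair being extended in this hunk (it now also carries the torch_dispatch mode and `__torch_function__` TLS) is used to propagate thread-local state into worker threads. A minimal sketch of that pattern; `run_async` is a hypothetical helper:

```cpp
#include <ATen/ThreadLocalState.h>
#include <functional>
#include <thread>

void run_async(std::function<void()> fn) {
  // Snapshot the calling thread's TLS (grad mode, dispatch keys, torch
  // dispatch mode, __torch_function__ state, ...).
  at::ThreadLocalState state;
  std::thread worker([state, fn = std::move(fn)]() {
    // Restore it on the worker thread for the duration of fn(); the guard
    // puts the previous state back when it goes out of scope.
    at::ThreadLocalStateGuard guard(state);
    fn();
  });
  worker.join();
}
```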
c10::AutogradState::set_tls_state(state.autograd_tls_); - at::impl::PythonModeTLS::set_state(state.python_mode_state_); + at::impl::TorchDispatchModeTLS::set_state(state.torch_dispatch_mode_state_); + + at::impl::PythonTorchFunctionTLS::set_state(state.python_torch_function_state_); at::set_record_function_tls_(state.rf_tls_); diff --git a/aten/src/ATen/ThreadLocalState.h b/aten/src/ATen/ThreadLocalState.h index c5f14518f422..3818827d479b 100644 --- a/aten/src/ATen/ThreadLocalState.h +++ b/aten/src/ATen/ThreadLocalState.h @@ -9,7 +9,8 @@ #include #include -#include +#include +#include namespace at { @@ -53,14 +54,15 @@ class TORCH_API ThreadLocalState { // TLS for AutogradModes AutogradState autograd_tls_; - std::shared_ptr python_mode_state_; + // TLS for enable_torch_dispatch_mode + std::shared_ptr torch_dispatch_mode_state_; + + // TLS for __torch_function__ (mode and disable_torch_function) + at::impl::PythonTorchFunctionTLS python_torch_function_state_; // TLS for saved tensors default hooks std::stack> saved_tensors_default_hooks_; - // Whether pre-sampling RecordFunction optimization was enabled - bool bumped_record_all_functions_ = false; - friend class ThreadLocalStateGuard; }; @@ -68,21 +70,7 @@ class TORCH_API ThreadLocalState { class TORCH_API ThreadLocalStateGuard { public: explicit ThreadLocalStateGuard(const ThreadLocalState& state) - : prev_state_(ThreadLocalState()), - bumped_record_all_functions_(state.bumped_record_all_functions_) { - // Special handling of RecordFunction pre-sampling optimization: - // pre-samping is enabled (bumped) when there're non-sampled - // (or high-frequency) global or TLS callbacks. - // - // ThreadLocalStateGuard simply resets RecordFunction's TLS and - // hence its thread local callbacks. - // - // Checking if the pre-sampling was enabled and preserving it in the - // async task by calling bumpRecordAllFunctions() and the corresponding - // releaseRecordAllFunctions() - if (bumped_record_all_functions_) { - at::bumpRecordAllFunctions(); - } + : prev_state_(ThreadLocalState()) { // set the given state across the thread boundary ThreadLocalState::setThreadLocalState(state); } @@ -90,15 +78,10 @@ class TORCH_API ThreadLocalStateGuard { ~ThreadLocalStateGuard() { // restore previously set variables ThreadLocalState::setThreadLocalState(prev_state_); - if (bumped_record_all_functions_) { - at::releaseRecordAllFunctions(); - } } private: const ThreadLocalState prev_state_; - // Whether pre-sampling RecordFunction optimization was enabled - bool bumped_record_all_functions_ = false; }; template diff --git a/aten/src/ATen/Utils.h b/aten/src/ATen/Utils.h index 9160cbe2fedd..36b0785400ba 100644 --- a/aten/src/ATen/Utils.h +++ b/aten/src/ATen/Utils.h @@ -91,29 +91,6 @@ std::array check_intlist(ArrayRef list, const char * name, return res; } -/** - * Utility function to static cast input Generator* to - * the backend generator type (CPU/CUDAGeneratorImpl etc.) - */ -template -static inline T * check_generator(c10::optional gen) { - TORCH_CHECK(gen.has_value(), "Expected Generator but received nullopt"); - TORCH_CHECK(gen->defined(), "Generator with undefined implementation is not allowed"); - TORCH_CHECK(T::device_type() == gen->device().type(), "Expected a '", T::device_type(), "' device type for generator but found '", gen->device().type(), "'"); - return gen->get(); -} - -/** - * Utility function used in tensor implementations, which - * supplies the default generator to tensors, if an input generator - * is not supplied. 
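The two generator helpers being removed from Utils.h here (and re-added verbatim to `ATen/core/Generator.h` later in this diff) are used by RNG kernels roughly as sketched below. The kernel itself is hypothetical; `get_generator_or_default`, `CPUGeneratorImpl`, and the per-generator mutex convention are the existing pattern.

```cpp
#include <ATen/ATen.h>
#include <ATen/CPUGeneratorImpl.h>
#include <ATen/core/DistributionsHelper.h>
#include <mutex>

// Hypothetical CPU RNG kernel: fall back to the default CPU generator when
// the caller does not pass one, then draw uniforms under the generator lock.
at::Tensor my_uniform_like(const at::Tensor& self,
                           c10::optional<at::Generator> gen) {
  auto* generator = at::get_generator_or_default<at::CPUGeneratorImpl>(
      gen, at::detail::getDefaultCPUGenerator());
  auto out = at::empty_like(self, self.options().dtype(at::kFloat));
  // Generators are not thread safe; hold the generator's mutex while using it.
  std::lock_guard<std::mutex> lock(generator->mutex_);
  at::uniform_real_distribution<float> uniform(0.0f, 1.0f);
  float* data = out.data_ptr<float>();
  for (int64_t i = 0; i < out.numel(); ++i) {
    data[i] = uniform(generator);
  }
  return out;
}
```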
The input Generator* is also static casted to - * the backend generator type (CPU/CUDAGeneratorImpl etc.) - */ -template -static inline T* get_generator_or_default(const c10::optional& gen, const Generator& default_gen) { - return gen.has_value() && gen->defined() ? check_generator(gen) : check_generator(default_gen); -} - using at::detail::check_size_nonnegative; namespace detail { diff --git a/aten/src/ATen/Version.cpp b/aten/src/ATen/Version.cpp index e8cfa3e6b553..4b9da640fb76 100644 --- a/aten/src/ATen/Version.cpp +++ b/aten/src/ATen/Version.cpp @@ -205,7 +205,7 @@ std::string show_config() { // TODO: do HIP // TODO: do XLA - // TODO: do MLC + // TODO: do MPS return ss.str(); } diff --git a/aten/src/ATen/WrapDimUtils.h b/aten/src/ATen/WrapDimUtils.h index 24fe684c6dc6..bb3fdd484992 100644 --- a/aten/src/ATen/WrapDimUtils.h +++ b/aten/src/ATen/WrapDimUtils.h @@ -4,6 +4,7 @@ #include #include #include +#include namespace at { @@ -74,7 +75,7 @@ static inline int64_t legacy_cat_wrap_dim(int64_t dim, const std::vector&), lower_precision_fp) @@ -474,46 +494,18 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) { KERNEL_CPU(ADD_NS(addbmm), "addbmm", Tensor (const Tensor &, const Tensor &, const Tensor &, const Scalar&, const Scalar&), lower_precision_fp) KERNEL_CPU(ADD_NS(linear), "linear", Tensor (const Tensor &, const Tensor &, const c10::optional &), lower_precision_fp) KERNEL_CPU(ADD_NS(_convolution), "_convolution.deprecated", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, bool, IntArrayRef, int64_t, bool, bool, bool), lower_precision_fp) + KERNEL_CPU(ADD_NS(_convolution), "_convolution", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, bool, IntArrayRef, int64_t, bool, bool, bool, bool), lower_precision_fp) + KERNEL_CPU(ADD_NS(matmul), "matmul", Tensor (const Tensor &, const Tensor &), lower_precision_fp) + KERNEL_CPU(ADD_NS(conv_tbc), "conv_tbc", Tensor(const Tensor &, const Tensor &, const Tensor &, int64_t), lower_precision_fp) // fp32 cast policy KERNEL_CPU(ADD_NS(conv_transpose1d), "conv_transpose1d", Tensor (const Tensor &, const Tensor &, const c10::optional &, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, IntArrayRef), fp32) KERNEL_CPU(ADD_NS(conv_transpose2d), "conv_transpose2d.input", Tensor (const Tensor &, const Tensor &, const c10::optional &, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, IntArrayRef), fp32) KERNEL_CPU(ADD_NS(conv_transpose3d), "conv_transpose3d.input", Tensor (const Tensor &, const Tensor &, const c10::optional &, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, IntArrayRef), fp32) - KERNEL_CPU(ADD_NS(batch_norm), "batch_norm", Tensor (const Tensor &, const c10::optional &, const c10::optional &, const c10::optional &, const c10::optional &, bool, double, double, bool), fp32) - - KERNEL_CPU(ADD_NS(dropout), "dropout", Tensor (const Tensor &, double, bool), fp32) - KERNEL_CPU(ADD_NS(avg_pool1d), "avg_pool1d", Tensor (const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, bool, bool), fp32) - KERNEL_CPU(ADD_NS(avg_pool2d), "avg_pool2d", Tensor (const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, bool, bool, c10::optional), fp32) KERNEL_CPU(ADD_NS(avg_pool3d), "avg_pool3d", Tensor (const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, bool, bool, c10::optional), fp32) - KERNEL_CPU(ADD_NS(gelu), "gelu", Tensor (const Tensor &), fp32) - KERNEL_CPU(ADD_NS(upsample_nearest1d), "upsample_nearest1d", Tensor (const Tensor &, IntArrayRef, c10::optional), fp32) - 
KERNEL_CPU(ADD_NS(upsample_nearest1d), "upsample_nearest1d.vec", Tensor (const Tensor &, c10::optional, c10::optional>), fp32) - KERNEL_CPU(ADD_NS(_upsample_nearest_exact1d), "_upsample_nearest_exact1d", Tensor (const Tensor &, IntArrayRef, c10::optional), fp32) - KERNEL_CPU(ADD_NS(_upsample_nearest_exact1d), "_upsample_nearest_exact1d.vec", Tensor (const Tensor &, c10::optional, c10::optional>), fp32) - KERNEL_CPU(ADD_NS(upsample_nearest2d), "upsample_nearest2d", Tensor (const Tensor &, IntArrayRef, c10::optional, c10::optional), fp32) - KERNEL_CPU(ADD_NS(upsample_nearest2d), "upsample_nearest2d.vec", Tensor (const Tensor &, c10::optional, c10::optional>), fp32) - KERNEL_CPU(ADD_NS(_upsample_nearest_exact2d), "_upsample_nearest_exact2d", Tensor (const Tensor &, IntArrayRef, c10::optional, c10::optional), fp32) - KERNEL_CPU(ADD_NS(_upsample_nearest_exact2d), "_upsample_nearest_exact2d.vec", Tensor (const Tensor &, c10::optional, c10::optional>), fp32) - KERNEL_CPU(ADD_NS(upsample_nearest3d), "upsample_nearest3d", Tensor (const Tensor &, IntArrayRef, c10::optional, c10::optional, c10::optional), fp32) - KERNEL_CPU(ADD_NS(upsample_nearest3d), "upsample_nearest3d.vec", Tensor (const Tensor &, c10::optional, c10::optional>), fp32) - KERNEL_CPU(ADD_NS(_upsample_nearest_exact3d), "_upsample_nearest_exact3d", Tensor (const Tensor &, IntArrayRef, c10::optional, c10::optional, c10::optional), fp32) - KERNEL_CPU(ADD_NS(_upsample_nearest_exact3d), "_upsample_nearest_exact3d.vec", Tensor (const Tensor &, c10::optional, c10::optional>), fp32) - KERNEL_CPU(ADD_NS(upsample_linear1d), "upsample_linear1d", Tensor (const Tensor &, IntArrayRef, bool, c10::optional), fp32) - KERNEL_CPU(ADD_NS(upsample_linear1d), "upsample_linear1d.vec", Tensor (const Tensor &, c10::optional, bool, c10::optional>), fp32) - KERNEL_CPU(ADD_NS(upsample_bilinear2d), "upsample_bilinear2d", Tensor (const Tensor &, IntArrayRef, bool, c10::optional, c10::optional), fp32) - KERNEL_CPU(ADD_NS(upsample_bilinear2d), "upsample_bilinear2d.vec", Tensor (const Tensor &, c10::optional, bool, c10::optional>), fp32) - KERNEL_CPU(ADD_NS(upsample_trilinear3d), "upsample_trilinear3d", Tensor (const Tensor &, IntArrayRef, bool, c10::optional, c10::optional, c10::optional), fp32) - KERNEL_CPU(ADD_NS(upsample_trilinear3d), "upsample_trilinear3d.vec", Tensor (const Tensor &, c10::optional, bool, c10::optional>), fp32) - KERNEL_CPU(ADD_NS(binary_cross_entropy), "binary_cross_entropy", Tensor (const Tensor &, const Tensor &, const c10::optional&, int64_t), fp32) - KERNEL_CPU(ADD_NS(binary_cross_entropy_with_logits), "binary_cross_entropy_with_logits", Tensor (const Tensor &, const Tensor &, const c10::optional&, const c10::optional&, int64_t), fp32) - KERNEL_CPU(ADD_NS(instance_norm), "instance_norm", Tensor (const Tensor &, const c10::optional&, const c10::optional&, const c10::optional&, const c10::optional&, bool, double, double, bool), fp32) KERNEL_CPU(ADD_NS(grid_sampler), "grid_sampler", Tensor(const Tensor &, const Tensor &, int64_t, int64_t, bool), fp32) KERNEL_CPU(ADD_NS(polar), "polar", Tensor(const Tensor &, const Tensor &), fp32) - KERNEL_CPU(ADD_NS(multinomial), "multinomial", Tensor(const Tensor &, int64_t, bool, c10::optional), fp32) - KERNEL_CPU(ADD_NS(poisson), "poisson", Tensor(const Tensor &, c10::optional), fp32) - KERNEL_CPU(ADD_NS(fmod), "fmod.Tensor", Tensor(const Tensor &, const Tensor &), fp32) - KERNEL_CPU(ADD_NS(fmod), "fmod.Scalar", Tensor(const Tensor &, const Scalar &), fp32) KERNEL_CPU(ADD_NS(prod), "prod", Tensor(const 
Tensor &, c10::optional), fp32) KERNEL_CPU(ADD_NS(prod), "prod.dim_int", Tensor(const Tensor &, int64_t, bool, c10::optional), fp32) KERNEL_CPU(ADD_NS(prod), "prod.dim_Dimname", Tensor(const Tensor &, at::Dimname, bool, c10::optional), fp32) @@ -522,36 +514,22 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) { KERNEL_CPU(ADD_NS(nanquantile), "nanquantile", Tensor(const Tensor &, const Tensor &, c10::optional, bool, c10::string_view), fp32) KERNEL_CPU(ADD_NS(nanquantile), "nanquantile.scalar", Tensor(const Tensor &, double, c10::optional, bool, c10::string_view), fp32) KERNEL_CPU(ADD_NS(stft), "stft", Tensor(const Tensor &, int64_t, c10::optional, c10::optional, const c10::optional &, bool, c10::optional, c10::optional), fp32) + KERNEL_CPU(ADD_NS(stft), "stft.center", Tensor(const Tensor &, int64_t, c10::optional, c10::optional, const c10::optional &, bool, c10::string_view, bool, c10::optional, c10::optional), fp32) KERNEL_CPU(ADD_NS(cdist), "cdist", Tensor(const Tensor &, const Tensor &, double, c10::optional), fp32) - KERNEL_CPU(ADD_NS(cross), "cross", Tensor(const Tensor &, const Tensor &, c10::optional), fp32) - KERNEL_CPU(ADD_NS(cumprod), "cumprod", Tensor(const Tensor &, int64_t, c10::optional), fp32) - KERNEL_CPU(ADD_NS(cumprod), "cumprod.dimname", Tensor(const Tensor &, at::Dimname, c10::optional), fp32) - KERNEL_CPU(ADD_NS(cumsum), "cumsum", Tensor(const Tensor &, int64_t, c10::optional), fp32) - KERNEL_CPU(ADD_NS(cumsum), "cumsum.dimname", Tensor(const Tensor &, at::Dimname, c10::optional), fp32) - KERNEL_CPU(ADD_NS(diag), "diag", Tensor(const Tensor &, int64_t), fp32) - KERNEL_CPU(ADD_NS(diagflat), "diagflat", Tensor(const Tensor &, int64_t), fp32) - KERNEL_CPU(ADD_NS(histc), "histc", Tensor(const Tensor &, int64_t, const at::Scalar &, const at::Scalar &), fp32) - KERNEL_CPU(ADD_NS(logcumsumexp), "logcumsumexp", Tensor(const Tensor &, int64_t), fp32) - KERNEL_CPU(ADD_NS(searchsorted), "searchsorted.Tensor", Tensor(const Tensor &, const Tensor &, bool, bool, const c10::optional, const c10::optional &), fp32) - KERNEL_CPU(ADD_NS(searchsorted), "searchsorted.Scalar", Tensor(const Tensor &, const at::Scalar &, bool, bool, const c10::optional, const c10::optional &), fp32) + KERNEL_CPU(ADD_NS(grid_sampler_2d), "grid_sampler_2d", Tensor(const Tensor &, const Tensor &, int64_t, int64_t, bool), fp32) + KERNEL_CPU(ADD_NS(_grid_sampler_2d_cpu_fallback), "_grid_sampler_2d_cpu_fallback", Tensor(const Tensor &, const Tensor &, int64_t, int64_t, bool), fp32) + KERNEL_CPU(ADD_NS(grid_sampler_3d), "grid_sampler_3d", Tensor(const Tensor &, const Tensor &, int64_t, int64_t, bool), fp32) KERNEL_CPU(ADD_NS(trace), "trace", Tensor(const Tensor &), fp32) - KERNEL_CPU(ADD_NS(tril), "tril", Tensor(const Tensor &, int64_t), fp32) - KERNEL_CPU(ADD_NS(triu), "triu", Tensor(const Tensor &, int64_t), fp32) - KERNEL_CPU(ADD_NS(vander), "vander", Tensor(const Tensor &, c10::optional, bool), fp32) KERNEL_CPU(ADD_NS(view_as_complex), "view_as_complex", Tensor(const Tensor &), fp32) KERNEL_CPU(ADD_NS(cholesky), "cholesky", Tensor(const Tensor &, bool), fp32) KERNEL_CPU(ADD_NS(cholesky_inverse), "cholesky_inverse", Tensor(const Tensor &, bool), fp32) KERNEL_CPU(ADD_NS(cholesky_solve), "cholesky_solve", Tensor(const Tensor &, const Tensor &, bool), fp32) - KERNEL_CPU(ADD_NS(dot), "dot", Tensor(const Tensor &, const Tensor &), fp32) KERNEL_CPU(ADD_NS(inverse), "inverse", Tensor(const Tensor &), fp32) KERNEL_CPU(ADD_NS(lu_solve), "lu_solve", Tensor(const Tensor &, const Tensor &, const Tensor &), fp32) 
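Every line in this cast-policy table follows the same shape; a hypothetical new entry would look like the sketch below. The op name is invented, and the snippet only makes sense inside this translation unit (within the `TORCH_LIBRARY_IMPL(aten, AutocastCPU, m)` block, where `ADD_NS` and `KERNEL_CPU` are defined) and for an op that actually exists in `native_functions.yaml`:

```cpp
// Hypothetical: under CPU autocast, always run "my_op" in float32, casting
// lower-precision inputs up before redispatching. The last argument selects
// the cast policy (e.g. lower_precision_fp or fp32, as used in this file).
KERNEL_CPU(ADD_NS(my_op), "my_op",
           Tensor (const Tensor &, const Tensor &), fp32)
```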
KERNEL_CPU(ADD_NS(matrix_rank), "matrix_rank", Tensor(const Tensor &, bool), fp32) KERNEL_CPU(ADD_NS(orgqr), "orgqr", Tensor(const Tensor &, const Tensor &), fp32) KERNEL_CPU(ADD_NS(ormqr), "ormqr", Tensor(const Tensor &, const Tensor &, const Tensor &, bool, bool), fp32) KERNEL_CPU(ADD_NS(pinverse), "pinverse", Tensor(const Tensor &, double), fp32) - KERNEL_CPU(ADD_NS(vdot), "vdot", Tensor(const Tensor &, const Tensor &), fp32) - KERNEL_CPU(ADD_NS(im2col), "im2col", Tensor(const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef), fp32) - KERNEL_CPU(ADD_NS(col2im), "col2im", Tensor(const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef), fp32) KERNEL_CPU(ADD_NS(max_pool3d), "max_pool3d", Tensor(const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef, bool), fp32) KERNEL_CPU(ADD_NS(max_unpool2d), "max_unpool2d", Tensor(const Tensor &, const Tensor &, IntArrayRef), fp32) KERNEL_CPU(ADD_NS(max_unpool3d), "max_unpool3d", Tensor(const Tensor &, const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef), fp32) @@ -561,18 +539,6 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) { KERNEL_CPU(ADD_NS(replication_pad1d), "replication_pad1d", Tensor(const Tensor &, IntArrayRef), fp32) KERNEL_CPU(ADD_NS(replication_pad2d), "replication_pad2d", Tensor(const Tensor &, IntArrayRef), fp32) KERNEL_CPU(ADD_NS(replication_pad3d), "replication_pad3d", Tensor(const Tensor &, IntArrayRef), fp32) - KERNEL_CPU(ADD_NS(elu), "elu", Tensor(const Tensor &, const Scalar &, const Scalar &, const Scalar &), fp32) - KERNEL_CPU(ADD_NS(hardshrink), "hardshrink", Tensor(const Tensor &, const Scalar &), fp32) - KERNEL_CPU(ADD_NS(hardsigmoid), "hardsigmoid", Tensor(const Tensor &), fp32) - KERNEL_CPU(ADD_NS(hardswish), "hardswish", Tensor(const Tensor &), fp32) - KERNEL_CPU(ADD_NS(log_sigmoid), "log_sigmoid", Tensor(const Tensor &), fp32) - KERNEL_CPU(ADD_NS(prelu), "prelu", Tensor(const Tensor &, const Tensor &), fp32) - KERNEL_CPU(ADD_NS(selu), "selu", Tensor(const Tensor &), fp32) - KERNEL_CPU(ADD_NS(celu), "celu", Tensor(const Tensor &, const Scalar &), fp32) - KERNEL_CPU(ADD_NS(softplus), "softplus", Tensor(const Tensor &, const Scalar &, const Scalar &), fp32) - KERNEL_CPU(ADD_NS(softshrink), "softshrink", Tensor(const Tensor &, const Scalar &), fp32) - KERNEL_CPU(ADD_NS(group_norm), "group_norm", Tensor(const Tensor &, int64_t, const c10::optional &, const c10::optional &, double, bool), fp32) - KERNEL_CPU(ADD_NS(smooth_l1_loss), "smooth_l1_loss", Tensor (const Tensor &, const Tensor &, int64_t, double), fp32) KERNEL_CPU(ADD_NS(mse_loss), "mse_loss", Tensor(const Tensor &, const Tensor &, int64_t), fp32) KERNEL_CPU(ADD_NS(ctc_loss), "ctc_loss.IntList", Tensor(const Tensor &, const Tensor &, IntArrayRef, IntArrayRef, int64_t, int64_t, bool), fp32) KERNEL_CPU(ADD_NS(ctc_loss), "ctc_loss.Tensor", Tensor(const Tensor &, const Tensor &, const Tensor &, const Tensor &, int64_t, int64_t, bool), fp32) @@ -580,25 +546,26 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) { KERNEL_CPU(ADD_NS(multilabel_margin_loss), "multilabel_margin_loss", Tensor(const Tensor &, const Tensor &, int64_t), fp32) KERNEL_CPU(ADD_NS(fft_fft), "fft_fft", Tensor(const Tensor &, c10::optional, int64_t, c10::optional), fp32) KERNEL_CPU(ADD_NS(fft_ifft), "fft_ifft", Tensor(const Tensor &, c10::optional, int64_t, c10::optional), fp32) - KERNEL_CPU(ADD_NS(fft_fft2), "fft_fft2", Tensor(const Tensor &, c10::optional, at::IntArrayRef, c10::optional), fp32) - KERNEL_CPU(ADD_NS(fft_ifft2), "fft_ifft2", Tensor(const Tensor &, 
c10::optional, at::IntArrayRef, c10::optional), fp32) - KERNEL_CPU(ADD_NS(fft_fftn), "fft_fftn", Tensor(const Tensor &, c10::optional, c10::optional, c10::optional), fp32) - KERNEL_CPU(ADD_NS(fft_ifftn), "fft_ifftn", Tensor(const Tensor &, c10::optional, c10::optional, c10::optional), fp32) + KERNEL_CPU(ADD_NS(fft_fft2), "fft_fft2", Tensor(const Tensor &, at::OptionalIntArrayRef, at::IntArrayRef, c10::optional), fp32) + KERNEL_CPU(ADD_NS(fft_ifft2), "fft_ifft2", Tensor(const Tensor &, at::OptionalIntArrayRef, at::IntArrayRef, c10::optional), fp32) + KERNEL_CPU(ADD_NS(fft_fftn), "fft_fftn", Tensor(const Tensor &, at::OptionalIntArrayRef, at::OptionalIntArrayRef, c10::optional), fp32) + KERNEL_CPU(ADD_NS(fft_ifftn), "fft_ifftn", Tensor(const Tensor &, at::OptionalIntArrayRef, at::OptionalIntArrayRef, c10::optional), fp32) KERNEL_CPU(ADD_NS(fft_rfft), "fft_rfft", Tensor(const Tensor &, c10::optional, int64_t, c10::optional), fp32) KERNEL_CPU(ADD_NS(fft_irfft), "fft_irfft", Tensor(const Tensor &, c10::optional, int64_t, c10::optional), fp32) - KERNEL_CPU(ADD_NS(fft_rfft2), "fft_rfft2", Tensor(const Tensor &, c10::optional, at::IntArrayRef, c10::optional), fp32) - KERNEL_CPU(ADD_NS(fft_irfft2), "fft_irfft2", Tensor(const Tensor &, c10::optional, at::IntArrayRef, c10::optional), fp32) - KERNEL_CPU(ADD_NS(fft_rfftn), "fft_rfftn", Tensor(const Tensor &, c10::optional, c10::optional, c10::optional), fp32) - KERNEL_CPU(ADD_NS(fft_irfftn), "fft_irfftn", Tensor(const Tensor &, c10::optional, c10::optional, c10::optional), fp32) + KERNEL_CPU(ADD_NS(fft_rfft2), "fft_rfft2", Tensor(const Tensor &, at::OptionalIntArrayRef, at::IntArrayRef, c10::optional), fp32) + KERNEL_CPU(ADD_NS(fft_irfft2), "fft_irfft2", Tensor(const Tensor &, at::OptionalIntArrayRef, at::IntArrayRef, c10::optional), fp32) + KERNEL_CPU(ADD_NS(fft_rfftn), "fft_rfftn", Tensor(const Tensor &, at::OptionalIntArrayRef, at::OptionalIntArrayRef, c10::optional), fp32) + KERNEL_CPU(ADD_NS(fft_irfftn), "fft_irfftn", Tensor(const Tensor &, at::OptionalIntArrayRef, at::OptionalIntArrayRef, c10::optional), fp32) KERNEL_CPU(ADD_NS(fft_hfft), "fft_hfft", Tensor(const Tensor &, c10::optional, int64_t, c10::optional), fp32) KERNEL_CPU(ADD_NS(fft_ihfft), "fft_ihfft", Tensor(const Tensor &, c10::optional, int64_t, c10::optional), fp32) - KERNEL_CPU(ADD_NS(conv_tbc), "conv_tbc", Tensor(const Tensor &, const Tensor &, const Tensor &, int64_t), fp32) KERNEL_CPU(ADD_NS(linalg_matrix_norm), "linalg_matrix_norm", Tensor(const Tensor &, const at::Scalar &, at::IntArrayRef, bool, c10::optional), fp32) KERNEL_CPU(ADD_NS(linalg_matrix_norm), "linalg_matrix_norm.str_ord", Tensor(const Tensor &, c10::string_view, at::IntArrayRef, bool, c10::optional), fp32) KERNEL_CPU(ADD_NS(linalg_cond), "linalg_cond", Tensor(const Tensor &, const c10::optional &), fp32) KERNEL_CPU(ADD_NS(linalg_cond), "linalg_cond.p_str", Tensor(const Tensor &, c10::string_view), fp32) KERNEL_CPU(ADD_NS(linalg_matrix_rank), "linalg_matrix_rank", Tensor(const Tensor &, double, bool), fp32) KERNEL_CPU(ADD_NS(linalg_matrix_rank), "linalg_matrix_rank.tol_tensor", Tensor(const Tensor &, const Tensor &, bool), fp32) + KERNEL_CPU(ADD_NS(linalg_matrix_rank), "linalg_matrix_rank.atol_rtol_tensor", Tensor(const Tensor &, const c10::optional &, const c10::optional &, bool), fp32) + KERNEL_CPU(ADD_NS(linalg_matrix_rank), "linalg_matrix_rank.atol_rtol_float", Tensor(const Tensor &, c10::optional, c10::optional, bool), fp32) KERNEL_CPU(ADD_NS(linalg_solve), "linalg_solve", Tensor(const Tensor &, const Tensor 
&), fp32) KERNEL_CPU(ADD_NS(linalg_cholesky), "linalg_cholesky", Tensor(const Tensor &, bool), fp32) KERNEL_CPU(ADD_NS(linalg_svdvals), "linalg_svdvals", Tensor(const Tensor &), fp32) @@ -607,33 +574,8 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) { KERNEL_CPU(ADD_NS(linalg_inv), "linalg_inv", Tensor(const Tensor &), fp32) KERNEL_CPU(ADD_NS(linalg_householder_product), "linalg_householder_product", Tensor(const Tensor &, const Tensor &), fp32) KERNEL_CPU(ADD_NS(linalg_tensorinv), "linalg_tensorinv", Tensor(const Tensor &, int64_t), fp32) - KERNEL_CPU(ADD_NS(linalg_tensorsolve), "linalg_tensorsolve", Tensor(const Tensor &, const Tensor &, c10::optional), fp32) + KERNEL_CPU(ADD_NS(linalg_tensorsolve), "linalg_tensorsolve", Tensor(const Tensor &, const Tensor &, at::OptionalIntArrayRef), fp32) KERNEL_CPU(ADD_NS(fake_quantize_per_tensor_affine), "fake_quantize_per_tensor_affine", Tensor (const Tensor &, double, int64_t, int64_t, int64_t), fp32) - KERNEL_CPU(ADD_NS(glu), "glu", Tensor (const Tensor &, int64_t), fp32) - - m.impl(TORCH_SELECTIVE_NAME("aten::cummax"), - TORCH_FN((&WrapFunction (const Tensor &, int64_t), - std::tuple (const Tensor &, int64_t), - &ADD_NS(cummax)>::type::call))); - - m.impl(TORCH_SELECTIVE_NAME("aten::cummax.dimname"), - TORCH_FN((&WrapFunction (const Tensor &, at::Dimname), - std::tuple (const Tensor &, at::Dimname), - &ADD_NS(cummax)>::type::call))); - - m.impl(TORCH_SELECTIVE_NAME("aten::cummin"), - TORCH_FN((&WrapFunction (const Tensor &, int64_t), - std::tuple (const Tensor &, int64_t), - &ADD_NS(cummin)>::type::call))); - - m.impl(TORCH_SELECTIVE_NAME("aten::cummin.dimname"), - TORCH_FN((&WrapFunction (const Tensor &, at::Dimname), - std::tuple (const Tensor &, at::Dimname), - &ADD_NS(cummin)>::type::call))); m.impl(TORCH_SELECTIVE_NAME("aten::eig"), TORCH_FN((&WrapFunction (const Tensor &, bool, bool), &ADD_NS(_lu_with_info)>::type::call))); - m.impl(TORCH_SELECTIVE_NAME("aten::lu_unpack"), - TORCH_FN((&WrapFunction (const Tensor &, const Tensor &, bool, bool), - std::tuple (const Tensor &, const Tensor &, bool, bool), - &ADD_NS(lu_unpack)>::type::call))); m.impl(TORCH_SELECTIVE_NAME("aten::qr"), TORCH_FN((&WrapFunction (const Tensor &, bool), &ADD_NS(qr)>::type::call))); - m.impl(TORCH_SELECTIVE_NAME("aten::solve"), - TORCH_FN((&WrapFunction (const Tensor &, const Tensor &), - std::tuple (const Tensor &, const Tensor &), - &ADD_NS(solve)>::type::call))); - m.impl(TORCH_SELECTIVE_NAME("aten::svd"), TORCH_FN((&WrapFunction (const Tensor &, bool, bool), @@ -707,17 +638,6 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) { std::tuple (const Tensor &, IntArrayRef, IntArrayRef, const Tensor &), &ADD_NS(fractional_max_pool3d)>::type::call))); - m.impl(TORCH_SELECTIVE_NAME("aten::adaptive_max_pool1d"), - TORCH_FN((&WrapFunction (const Tensor &, IntArrayRef), - std::tuple (const Tensor &, IntArrayRef), - &ADD_NS(adaptive_max_pool1d)>::type::call))); - - m.impl(TORCH_SELECTIVE_NAME("aten::adaptive_max_pool2d"), - TORCH_FN((&WrapFunction (const Tensor &, IntArrayRef), - std::tuple (const Tensor &, IntArrayRef), - &ADD_NS(adaptive_max_pool2d)>::type::call))); m.impl(TORCH_SELECTIVE_NAME("aten::adaptive_max_pool3d"), TORCH_FN((&WrapFunction + +// Forward declarations of core ATen types used in dispatch functions +namespace c10 { + +template +class optional; +template +class List; +template +class IListRef; +class Stream; +class Scalar; +class SymInt; +class SymIntList; +struct Storage; +struct TensorOptions; +template +class ArrayRef; +template +class OptionalArrayRef; + +} // 
namespace c10 + +namespace at { + +class Tensor; +class OptionalTensorRef; +struct Dimname; +struct Generator; +using TensorList = c10::ArrayRef; +using ITensorListRef = c10::IListRef; +using IOptTensorListRef = c10::IListRef; +using DimnameList = c10::ArrayRef; +using IntArrayRef = c10::ArrayRef; +using OptionalIntArrayRef = c10::OptionalArrayRef; + +using c10::Stream; +using c10::Storage; +using c10::QScheme; +using c10::Scalar; +using c10::SymInt; +using c10::SymIntList; +using c10::TensorOptions; + +} // namespace at diff --git a/aten/src/ATen/core/ATen_pch.h b/aten/src/ATen/core/ATen_pch.h index 8e8d354d8fe8..10b5b53b933b 100644 --- a/aten/src/ATen/core/ATen_pch.h +++ b/aten/src/ATen/core/ATen_pch.h @@ -98,6 +98,8 @@ #include #include #include +#include +#include #include #include #include @@ -105,6 +107,7 @@ #include #include #include +#include #include #include @@ -153,13 +156,14 @@ #include #include #include +#include #include #include #include -#include #include #include +#include #include #include #include diff --git a/aten/src/ATen/core/DimVector.h b/aten/src/ATen/core/DimVector.h index 6e9e2c037a5f..576b9e142ebf 100644 --- a/aten/src/ATen/core/DimVector.h +++ b/aten/src/ATen/core/DimVector.h @@ -1,13 +1,13 @@ #pragma once - -#include -#include +#include namespace at { -constexpr size_t kDimVectorStaticSize = 5; +// Re-declaring 'DimVector' type and size inside 'at' namespace. +// This is done to avoid modifying every use into their 'c10' +// equivalent. -/// A container for sizes or strides -using DimVector = SmallVector; +using c10::kDimVectorStaticSize; +using c10::DimVector; } // namespace at diff --git a/aten/src/ATen/core/DistributionsHelper.h b/aten/src/ATen/core/DistributionsHelper.h index 6205fc4210f9..1ef6fb0f3c2e 100644 --- a/aten/src/ATen/core/DistributionsHelper.h +++ b/aten/src/ATen/core/DistributionsHelper.h @@ -158,7 +158,7 @@ template ::value || \ !has_member_set_next_##TYPE##_normal_sample::value \ ), int> = 0> \ -C10_HOST_DEVICE inline bool maybe_get_next_##TYPE##_normal_sample(RNG* generator, ret_type* ret) { \ +C10_HOST_DEVICE inline bool maybe_get_next_##TYPE##_normal_sample(RNG* /*generator*/, ret_type* /*ret*/) { \ return false; \ } \ \ @@ -174,7 +174,7 @@ template ::value \ ), int> = 0> \ -C10_HOST_DEVICE inline void maybe_set_next_##TYPE##_normal_sample(RNG* generator, ret_type cache) { \ +C10_HOST_DEVICE inline void maybe_set_next_##TYPE##_normal_sample(RNG* /*generator*/, ret_type /*cache*/) { \ } DISTRIBUTION_HELPER_GENERATE_NEXT_NORMAL_METHODS(double); diff --git a/aten/src/ATen/core/Formatting.cpp b/aten/src/ATen/core/Formatting.cpp index f3122daf2cc6..832059ed1980 100644 --- a/aten/src/ATen/core/Formatting.cpp +++ b/aten/src/ATen/core/Formatting.cpp @@ -12,6 +12,28 @@ namespace c10 { std::ostream& operator<<(std::ostream & out, Backend b) { return out << toString(b); } + +std::ostream& operator<<(std::ostream & out, Scalar s) { + if (s.isFloatingPoint()) { + return out << s.toDouble(); + } + if (s.isComplex()) { + return out << s.toComplexDouble(); + } + if (s.isBoolean()) { + return out << (s.toBool() ? 
"true" : "false"); + } + if (s.isIntegral(false)) { + return out << s.toLong(); + } + throw std::logic_error("Unknown type in Scalar"); +} + +std::string toString(Scalar s) { + std::stringstream out; + out << s; + return out.str(); +} } namespace at { diff --git a/aten/src/ATen/core/Formatting.h b/aten/src/ATen/core/Formatting.h index 55cfe7b3bdf7..6dcfc6c7b3cd 100644 --- a/aten/src/ATen/core/Formatting.h +++ b/aten/src/ATen/core/Formatting.h @@ -1,12 +1,15 @@ #pragma once -#include -#include #include +#include +#include +#include namespace c10 { TORCH_API std::ostream& operator<<(std::ostream& out, Backend b); +TORCH_API std::ostream& operator<<(std::ostream & out, Scalar s); +TORCH_API std::string toString(Scalar s); } namespace at { @@ -19,21 +22,4 @@ static inline std::ostream& operator<<(std::ostream & out, const Tensor & t) { return print(out,t,80); } TORCH_API void print(const Tensor & t, int64_t linesize=80); - -static inline std::ostream& operator<<(std::ostream & out, Scalar s) { - if (s.isFloatingPoint()) { - return out << s.toDouble(); - } - if (s.isComplex()) { - return out << s.toComplexDouble(); - } - if (s.isBoolean()) { - return out << (s.toBool() ? "true" : "false"); - } - if (s.isIntegral(false)) { - return out << s.toLong(); - } - throw std::logic_error("Unknown type in Scalar"); -} - } diff --git a/aten/src/ATen/core/Generator.h b/aten/src/ATen/core/Generator.h index 1e6e8d54fa72..60323f3d3a00 100644 --- a/aten/src/ATen/core/Generator.h +++ b/aten/src/ATen/core/Generator.h @@ -138,6 +138,29 @@ Generator make_generator(Args&&... args) { return Generator(c10::make_intrusive(std::forward(args)...)); } +/** + * Utility function to static cast input Generator* to + * the backend generator type (CPU/CUDAGeneratorImpl etc.) + */ +template +static inline T * check_generator(c10::optional gen) { + TORCH_CHECK(gen.has_value(), "Expected Generator but received nullopt"); + TORCH_CHECK(gen->defined(), "Generator with undefined implementation is not allowed"); + TORCH_CHECK(T::device_type() == gen->device().type(), "Expected a '", T::device_type(), "' device type for generator but found '", gen->device().type(), "'"); + return gen->get(); +} + +/** + * Utility function used in tensor implementations, which + * supplies the default generator to tensors, if an input generator + * is not supplied. The input Generator* is also static casted to + * the backend generator type (CPU/CUDAGeneratorImpl etc.) + */ +template +static inline T* get_generator_or_default(const c10::optional& gen, const Generator& default_gen) { + return gen.has_value() && gen->defined() ? check_generator(gen) : check_generator(default_gen); +} + namespace detail { /** diff --git a/aten/src/ATen/core/IListRef.h b/aten/src/ATen/core/IListRef.h new file mode 100644 index 000000000000..442bc7bfabf7 --- /dev/null +++ b/aten/src/ATen/core/IListRef.h @@ -0,0 +1,610 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include + +/* + * [Note: IListRef] + * Wrapper around different API containers (e.g. boxed and unboxed). + * + * What is it? + * =========== + * It is a tagged union of both boxed and unboxed API containers. + * Working implementations: + * + * - `IListRef` + * - `IListRef` + * + * Note that `IListRef` is a view type. Meaning that it won't own the + * tensors it holds. It's intended to be used only as argument parameters. + * Specifically, where these 2 worlds overlap. + * + * What is this for? 
+ * ================= + * Historically, PyTorch has maintained 2 different APIs: the unboxed + * (called from C++ API and Python eager mode) and boxed APIs (called + * from the TorchScript JIT, mobile interpreter, and boxed fallbacks). + * + * Calling unboxed kernels from the boxed "world" and vice-versa may + * result in non-negligible overhead. Lists are one of those types: + * + * - Boxed world: `c10::List` + * - Unboxed world: `c10::ArrayRef` + * + * In this context, `c10::IListRef` solves this problem by wrapping those + * 2 container types, so that we don't need to convert from one to + * the other. + * + * (see https://github.com/pytorch/pytorch/issues/66328) + * + * What does it do? + * ================ + * This container wraps around the different tagged containers + * (currently, only boxed and unboxed), without incurring in extra + * overhead for converting from one to another. It does so while + * exposing usual container methods, which dispatch to corresponding + * implementations. + * + * While it works with different container types, it introduces + * overhead for repeatedly calling member functions (since those will + * get dispatched, again). Therefore, you should only use it to iterate + * through the list up to one time. If you need to do more complex things, + * call `materialize()` first. + * + * Adding support for a new Tag + * ============================ + * Suppose we want to add a new tag: `Chest`. Here are the steps + * we would have to go through: + * + * 1. Add a line for it in the macro `TORCH_ILISTREF_FORALL_TAGS`. + * + * #define TORCH_ILISTREF_FORALL_TAGS(_, ...) \ + * ... + * _(Chest, ##__VA_ARGS__) + * + * 2. Add type aliases, union members, and constructors. + * + * template + * class IListRef { + * ... + * using chest_type = + * typename detail::IListRefTagImpl::list_type; + * ... + * IListRef(...) : tag_(IListRefTag::Chest) { + * ... + * } + * ... + * union Payload { + * ... + * chest_type chest; + * ... + * }; + * ... + * }; + * + * 3. Add a default implementation for it (in 'IListRef_inl.h'). It's + * preferable to make the default implementation work for `T = Tensor` + * (both `Unboxed` and `Boxed` do it). + * + * template + * class IListRefTagImplBase { + * public: + * using elem_type = ListElemT; + * using list_type = ChestContainer; + * + * static const list_type& unwrap(const IListRef& ilist) { ... } + * + * static typename list_type::const_iterator& unwrap( + * IListRefIterator& it) { ... } + * + * static const typename list_type::const_iterator& unwrap( + * const IListRefIterator& it) { ... } + * + * static IListRefConstRef iterator_get( + * const typename list_type::const_iterator& it) { ... } + * } + * + * 4. Add an specialization for each of the already supported types. + * Finally, for consistency, add them to the tracking list. + * (see [Note: IListRefTagImpl Specializations]) + * + * template <> + * class IListRefTagImpl + * : public IListRefTagImplBase {}; + * + * Adding support for a new Type + * ============================= + * Suppose we want to add support for a new type: `Matrix`. + * Here are the steps we would have to go through: + * + * 1. Add an specialization for each of the existing tags. + * For consistency, add them to the tracking list. + * (see [Note: IListRefTagImpl Specializations]) + * + * template <> + * class IListRefTagImpl + * : public IListRefTagImplBase {}; + * + * template <> + * class IListRefTagImpl + * : public IListRefTagImplBase {}; + * + * Common Problems + * =============== + * 1. 
One of `IListRef(Iterator)` methods are failing to compile. + * + * That may be happening because the container type you added + * is not compatible with the code written for that method. If + * that's true, then you might have to transform that code into + * a static method call (see `List::operator[]` method). + * + * 2. Can't make `IListRefIterator::operator*` return a const-reference. + * + * First, keep in mind that we assume that boxed containers will + * have to deal with `IValue` (e.g. `c10::List`). In this context, + * what may be happening is that `IValue` doesn't store internally + * your type `T`. Instead, it constructs a type new `T` everytime + * you try to get `T` for it (see `IListRef`). + */ + +namespace c10 { +template +class IListRef; + +/* + * Applies arbitrary macros to each `IListRefTag`. + */ +#define TORCH_ILISTREF_FORALL_TAGS(_, ...) \ + _(Unboxed, ##__VA_ARGS__) \ + _(Boxed, ##__VA_ARGS__) \ + _(Materialized, ##__VA_ARGS__) + +/* + * Defines a "switch-case" for `TAG`. Inside, it executes `BODY`, + * while bringing to scope: + * + * - `ImplT`: the implementation class for `TAG` + * - `this_`: the result of unwrapping `this` + */ +#define TORCH_ILISTREF_UNWRAP_CASE(TAG, BODY) \ + case c10::IListRefTag::TAG: { \ + using ImplT = c10::detail::IListRefTagImpl; \ + auto& this_ = ImplT::unwrap(*this); \ + BODY \ + } break; + +/* + * Dispatches the unwrap call, depending on `TAG`, followed by + * the execution of `BODY`. It aborts if `TAG` is not a `IListRefTag`. + * + * This macro is useful because it allows us to handle different + * types (that correspond to different tags) to be implemented + * only once. We can do it even when the implementation of the + * different tags aren't syntatically the same, by dispatching + * it to a function (e.g. `ImplT::(this_)`). + */ +#define TORCH_ILISTREF_UNWRAP(TAG, BODY) \ + switch (TAG) { \ + TORCH_ILISTREF_FORALL_TAGS(TORCH_ILISTREF_UNWRAP_CASE, BODY) \ + break; \ + default: \ + TORCH_INTERNAL_ASSERT(false, "invalid IListRef tag."); \ + } + +enum class IListRefTag { +#define DEFINE_TAG(tag, ...) tag, + TORCH_ILISTREF_FORALL_TAGS(DEFINE_TAG) +#undef DEFINE_TAG + None +}; + +namespace detail { +/* + * Type alias that specifies whether we return a reference or a copy of `T`. + * + * What is this for? + * ================= + * Since values in the boxed world are represented by an `IValue`, we also + * depend on whether it can be converted to a const-reference (`Tensor`) or + * has to create a new copy of `T` (`OptionalTensorRef`). + */ +template +using IListRefConstRef = typename ivalue_to_const_ref_overload_return::type; + +/* + * Interface that implements key functions for each `IListRefTag` type. + * + * What is this for? + * ================= + * Given an `IListRef(Iterator)`, some methods have to be implemented + * differently for each `TAG`. Therefore, the methods inside this class + * are used as dispatch targets for the different `IListRefTag` values. + * + * You should create an specialization of this class for each possible + * combination of `IListRefTag` type (except `None`) and element types + * (e.g. `Tensor`). + * + * What does it do? + * ================ + * 1. defines static methods to be used as dispatch targets by both + * `IListRef` and `IListRefIterator` (see the implementation of + * `IListRefTagImplBase`). + * + * 2. defines the `elem_type` and `list_type` aliases that will be + * used in the definition of `IListRef`. In general, we should do + * so by inheriting from `IListRefTagImplBase`. 
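Putting the usage guidance above into code: iterate an `IListRef` at most once directly, or `materialize()` it first when elements are needed repeatedly, and note that both boxed and unboxed callers can pass their native containers without a copy. A sketch with invented helper names; it assumes `c10::IListRef<at::Tensor>` is usable via the headers added in this patch (plus the `Tensor` specializations from `IListRef_inl.h`):

```cpp
#include <ATen/ATen.h>
#include <ATen/core/IListRef.h>
#include <ATen/core/List.h>

// Single pass: iterating the IListRef directly is fine (each iterator step
// goes through one tag dispatch).
int64_t total_numel(c10::IListRef<at::Tensor> tensors) {
  int64_t n = 0;
  for (const at::Tensor& t : tensors) {
    n += t.numel();
  }
  return n;
}

// Repeated / random access: materialize() once, trading a dynamic allocation
// for the per-access dispatch. Elements come back as reference wrappers.
at::Tensor first_and_last_sum(c10::IListRef<at::Tensor> tensors) {
  auto materialized = tensors.materialize();
  const at::Tensor& first = materialized.front();
  const at::Tensor& last = materialized.back();
  return first.sum() + last.sum();
}

void demo() {
  auto a = at::ones({2, 2});
  auto b = at::zeros({3});
  // Unboxed caller (C++ eager): ArrayRef-backed IListRef, no copy.
  std::vector<at::Tensor> vec = {a, b};
  total_numel(at::ArrayRef<at::Tensor>(vec));
  // Boxed caller (TorchScript / mobile): c10::List-backed IListRef, no copy.
  c10::List<at::Tensor> boxed({a, b});
  first_and_last_sum(boxed);
}
```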
+ * + * [Note: IListRefTagImpl Specialization] + * ====================================== + * For `IListRef(Iterator)`: + * - + * - + * - + * + * For `IListRef(Iterator)`: + * - + * - + * - + */ +template +class IListRefTagImpl {}; + +/* + * Base implementation of `IListRefTagImpl` methods. + * + * What is this for? + * ================= + * This should make adding specializations for new types easier. For + * example, one should be able to add a new type just by making its + * `IListRefTagImpl` specialization inherit from `IListRefTagImplBase`. + * + * You should create a partial specialization for this class only if + * you introduce a new `IListRefTag`. The idea being that there is one + * default implementation for each possible value of `IListRefTag`. + * + * What does it do? + * ================ + * 1. defines `elem_type` as an alias to `ListElemT`. + * + * 1. defines `list_type` as an alias to the default container type + * that will hold a collection of `elem_type`. The idea being that + * all types tagged as `TAG` will have `list_type` as its container, + * with different `elem_type`. + * + * 3. defines the default implementation for each of the methods that + * are supposed to be defined on `IListRefTagImpl` specializations. + * + * 4. inheriting from `IListRefTagImplBase` also means + * that the payload of the type `IListRef` will be of type `list_type` + * when it is tagged as `TAG`. + */ +template +class IListRefTagImplBase {}; + +/* + * Materialized container for `IListRef`. + * + * What is this for? + * ================= + * Container that groups `T` references together. This exchanges the + * overhead of every method call from `IListRef` for a dynamic allocation. + * + * You should use this container instead of `IListRef` if: + * + * - You are going to iterate the list more than once + * - You need to repeatedly access arbitrary elements (using `operator[]`) + * What does it do? + + * ================ + * Removes the reference (&) from the type, and wraps it into a + * `std::reference_wrapper`. If `IListRefConstRef` is not a + * reference type, then it's left unchanged. + */ +template +using _MaterializedIListRefElem = typename std::conditional< + std::is_reference::value, + typename std::reference_wrapper::type>, + T>::type; + +template +using MaterializedIListRef = std::vector<_MaterializedIListRefElem>>; + +} // namespace detail + +/* + * Iterator for `IListRef`. + * + * What is it? + * =========== + * Currently, a `std::bidirectional_iterator` that wraps the iterator + * types defined for each of the `IListRefTag`. + * + * One should be able to use it, as if it were the unwrapped + * iterators themselves. + + * What does it do? + * ================ + * Similarly to `IListRef`, this is a wrapper class. Specifically, it + * wraps each container's `const_iterator` type alias. So, for example, + * given that the container for `IListRefTag::Boxed` is `c10::List`, this + * iterator will wrap a `c10::List::const_iterator`. + * + * [Note: MSVC Iterator Debug] + * =========================== + * MSVC `vector::iterator` implementation (used in the boxed variant) + * makes it so this union's destructor, copy-constructor (assignment), and + * move-constructor (assignment) are implicitly deleted. + * + * Therefore, we need to explicitly define them as needed. Follows a list + * of places where these are needed and their reason: + * + * - `Payload` destructor: + * it is deleted only if the macro `_ITERATOR_DEBUG_LEVEL` is set to 2. 
+ * + * - `IListRefIterator` destructor: + * same as above. However, we need to explicitly call the variant + * destructor explicitly. + * + * - `IListRefIterator` copy-constructor: + * it is deleted only if the macro `_ITERATOR_DEBUG_LEVEL` is different + * than 0. + */ +template +class IListRefIterator : public std::iterator { + private: +#define DEFINE_FRIEND_CLASS(TAG, ...) \ + friend class detail::IListRefTagImpl; \ + friend class detail::IListRefTagImplBase< \ + IListRefTag::TAG, \ + T, \ + typename detail::IListRefTagImpl::elem_type>; + TORCH_ILISTREF_FORALL_TAGS(DEFINE_FRIEND_CLASS) +#undef DEFINE_FRIEND_CLASS + + public: + using unboxed_iterator_type = typename detail:: + IListRefTagImpl::list_type::const_iterator; + using boxed_iterator_type = typename detail:: + IListRefTagImpl::list_type::const_iterator; + using materialized_iterator_type = + typename detail::MaterializedIListRef::const_iterator; + + IListRefIterator() : tag_(IListRefTag::None) {} + +#if defined(_MSC_VER) && _ITERATOR_DEBUG_LEVEL != 0 + // See [Note: MSVC Iterator Debug] + IListRefIterator(const IListRefIterator& iterator) + : tag_(iterator.tag_) { + switch (tag_) { + case IListRefTag::Boxed: + payload_.boxed_iterator = iterator.payload_.boxed_iterator; + case IListRefTag::Unboxed: + payload_.unboxed_iterator = iterator.payload_.unboxed_iterator; + default: + TORCH_INTERNAL_ASSERT(false, "invalid IListRef tag."); + } + } +#endif + +#if defined(_MSC_VER) && _ITERATOR_DEBUG_LEVEL == 2 + // See [Note: MSVC Iterator Debug] + ~IListRefIterator() { + switch (tag_) { + case IListRefTag::Boxed: + payload_.boxed_iterator.~boxed_iterator_type(); + case IListRefTag::Unboxed: + payload_.unboxed_iterator.~unboxed_iterator_type(); + default: + TORCH_INTERNAL_ASSERT(false, "invalid IListRef tag."); + } + } +#endif + + IListRefIterator(boxed_iterator_type boxed) : tag_(IListRefTag::Boxed) { + payload_.boxed_iterator = boxed; + } + + IListRefIterator(unboxed_iterator_type unboxed) : tag_(IListRefTag::Unboxed) { + payload_.unboxed_iterator = unboxed; + } + + IListRefIterator(materialized_iterator_type materialized) : tag_(IListRefTag::Materialized) { + payload_.materialized_iterator = materialized; + } + + detail::IListRefConstRef operator*() const { + TORCH_ILISTREF_UNWRAP(tag_, { return ImplT::iterator_get(this_); }); + } + + IListRefIterator& operator++() { + TORCH_ILISTREF_UNWRAP(tag_, { ++this_; }); + return *this; + } + + IListRefIterator operator++(int) { + auto old = *this; + TORCH_ILISTREF_UNWRAP(tag_, { ++this_; }); + return old; + } + + IListRefIterator& operator--() { + TORCH_ILISTREF_UNWRAP(tag_, { --this_; }); + return *this; + } + + IListRefIterator operator--(int) { + auto old = *this; + TORCH_ILISTREF_UNWRAP(tag_, { --this_; }); + return old; + } + + bool operator==(const IListRefIterator& rhs) const { + if (tag_ != rhs.tag_) { + return false; + } + TORCH_ILISTREF_UNWRAP(tag_, { + auto& rhs_it = ImplT::unwrap(rhs); + return this_ == rhs_it; + }); + } + + bool operator!=(const IListRefIterator& rhs) const { + return !(*this == rhs); + } + + private: + union Payload { + boxed_iterator_type boxed_iterator; + unboxed_iterator_type unboxed_iterator; + materialized_iterator_type materialized_iterator; + void* _init_ptr; + Payload() : _init_ptr(nullptr) {} +#if defined(_MSC_VER) + // See [Note: MSVC Iterator Debug] + ~Payload() {} +#endif + }; + + Payload payload_; + IListRefTag tag_; +}; + +/* + * See [Note: IListRef] + */ +template +class IListRef { + private: +#define DEFINE_FRIEND_CLASS(TAG, ...) 
\ + friend class detail::IListRefTagImpl; \ + friend class detail::IListRefTagImplBase< \ + IListRefTag::TAG, \ + T, \ + typename detail::IListRefTagImpl::elem_type>; + TORCH_ILISTREF_FORALL_TAGS(DEFINE_FRIEND_CLASS) +#undef DEFINE_FRIEND_CLASS + + public: + using unboxed_type = + typename detail::IListRefTagImpl::list_type; + using boxed_type = + typename detail::IListRefTagImpl::list_type; + using materialized_type = + typename detail::MaterializedIListRef; + + using iterator = IListRefIterator; + using const_iterator = IListRefIterator; + using value_type = typename iterator::value_type; + + IListRef() : tag_(IListRefTag::None) {} + + IListRef(const boxed_type& boxed) : tag_(IListRefTag::Boxed) { + payload_.boxed = &boxed; + } + + IListRef(const unboxed_type& unboxed) : tag_(IListRefTag::Unboxed) { + payload_.unboxed = unboxed; + } + + IListRef(const std::initializer_list& list) : tag_(IListRefTag::Unboxed) { + payload_.unboxed = at::ArrayRef(list); + } + + template < + typename... UnboxedConstructorArgs, + typename = std::enable_if_t< + std::is_constructible::value>> + IListRef(UnboxedConstructorArgs&&... args) : tag_(IListRefTag::Unboxed) { + payload_.unboxed = unboxed_type(std::forward(args)...); + } + + IListRef(const materialized_type& materialized) : tag_(IListRefTag::Materialized) { + payload_.materialized = &materialized; + } + + size_t size() const { + TORCH_ILISTREF_UNWRAP(tag_, { return this_.size(); }); + } + + bool empty() const { + return size() == 0; + } + + iterator begin() const { + TORCH_ILISTREF_UNWRAP(tag_, { return this_.begin(); }); + } + + iterator end() const { + TORCH_ILISTREF_UNWRAP(tag_, { return this_.end(); }); + } + + detail::IListRefConstRef front() const { + TORCH_ILISTREF_UNWRAP(tag_, { return ImplT::front(this_); }); + } + + /* + * Materializes the `IListRef` into a `std::vector`. + * + * This should be used when one wishes to either: + * + * - iterate over the list more than once: each `IListRefIterator` + * member function call has to go through a switch, introducing + * non-negligible overhead + * + * - randomly access an arbitrary element using `operator[]`: + * same reason as above + */ + detail::MaterializedIListRef materialize() const { + if (isMaterialized()) { + return toMaterialized(); + } + + detail::MaterializedIListRef materialized; + materialized.reserve(size()); + for (const auto& t : *this) { + materialized.emplace_back(t); + } + return materialized; + } + +#define DEFINE_CHECK(TAG, ...) \ + bool is##TAG() const { \ + return tag_ == IListRefTag::TAG; \ + } + TORCH_ILISTREF_FORALL_TAGS(DEFINE_CHECK); +#undef DEFINE_CHECK + + bool isNone() const { + return tag_ == IListRefTag::None; + } + +#define DEFINE_CASTING(TAG, ...) 
\ + const typename detail::IListRefTagImpl::list_type& \ + to##TAG() const { \ + TORCH_INTERNAL_ASSERT(is##TAG()); \ + return detail::IListRefTagImpl::unwrap(*this); \ + } + TORCH_ILISTREF_FORALL_TAGS(DEFINE_CASTING); +#undef DEFINE_CASTING + + private: + union Payload { + const boxed_type* boxed; + unboxed_type unboxed; + const materialized_type* materialized; + Payload() : boxed(nullptr) {} + ~Payload() {} + }; + + Payload payload_; + IListRefTag tag_; +}; + +} // namespace c10 + +#include diff --git a/aten/src/ATen/core/IListRef_inl.h b/aten/src/ATen/core/IListRef_inl.h new file mode 100644 index 000000000000..a14bcfddae2d --- /dev/null +++ b/aten/src/ATen/core/IListRef_inl.h @@ -0,0 +1,201 @@ +#pragma once + +#include +#include + +namespace at { +class Tensor; +class OptionalTensorRef; +} + +namespace c10 { +namespace detail { + +/* + * Specializations of `IListRefTagImplBase` that implement the default + * implementation for `IListRefTag::Unboxed`. + */ +template +class IListRefTagImplBase { + public: + using elem_type = ListElemT; + using list_type = ArrayRef; + + /* + * These `unwrap` static methods unwraps the inner containers out + * of `IListRef` (and `IListRefIterator`). They are required when + * the macro `TORCH_ILISTREF_UNWRAP` is called. + */ + static const list_type& unwrap(const IListRef& ilist) { + return ilist.payload_.unboxed; + } + + static typename list_type::const_iterator& unwrap(IListRefIterator& it) { + return it.payload_.unboxed_iterator; + } + + static const typename list_type::const_iterator& unwrap( + const IListRefIterator& it) { + return it.payload_.unboxed_iterator; + } + + /* + * We have these function (besides the `unwrap`s above) because the + * implementation for both `IListRef::operator[]` and `IListRefIterator::operator*` + * weren't syntatically equal for the existing tags at the time + * (`Unboxed` and `Boxed`). + */ + static IListRefConstRef front(const list_type& lst) { + return lst.front(); + } + + static IListRefConstRef iterator_get( + const typename list_type::const_iterator& it) { + return *it; + } +}; + +/* + * Specializations of `IListRefTagImplBase` that implement the default + * implementation for `IListRefTag::Boxed`. + */ +template +class IListRefTagImplBase { + public: + using elem_type = ListElemT; + using list_type = List; + + static const list_type& unwrap(const IListRef& ilist) { + return *ilist.payload_.boxed; + } + + static typename list_type::const_iterator& unwrap(IListRefIterator& it) { + return it.payload_.boxed_iterator; + } + + static const typename list_type::const_iterator& unwrap( + const IListRefIterator& it) { + return it.payload_.boxed_iterator; + } + + static IListRefConstRef front(const list_type& lst) { + return lst[0]; + } + + static IListRefConstRef iterator_get( + const typename list_type::const_iterator& it) { + return (*it).get().toTensor(); + } +}; + +/* + * Specializations of `IListRefTagImplBase` that implement the default + * implementation for `IListRefTag::Materialized`. 
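Editorial aside: the materialized element type discussed earlier is just a type function over the per-element return type. When the accessor yields a const reference (the `Tensor` case), the materialized vector stores `std::reference_wrapper`s; when it yields a value (the `OptionalTensorRef` case), it stores plain copies. The sketch below reproduces that with standard-library tools only; `Payload` and `Handle` are made-up placeholder types, not PyTorch ones.

#include <functional>
#include <type_traits>
#include <vector>

struct Payload {};              // accessed by const reference (like const Tensor&)
struct Handle { int id = 0; };  // accessed by value (like an optional-ref type)

// Same shape as _MaterializedIListRefElem: wrap references, keep values as-is.
template <typename T>
using MaterializedElem = typename std::conditional<
    std::is_reference<T>::value,
    std::reference_wrapper<typename std::remove_reference<T>::type>,
    T>::type;

static_assert(
    std::is_same<MaterializedElem<const Payload&>,
                 std::reference_wrapper<const Payload>>::value,
    "const-reference elements are stored as reference_wrappers");
static_assert(
    std::is_same<MaterializedElem<Handle>, Handle>::value,
    "by-value elements are stored as plain copies");

// A materialized list is then just a vector of those elements.
using MaterializedPayloadList = std::vector<MaterializedElem<const Payload&>>;
using MaterializedHandleList  = std::vector<MaterializedElem<Handle>>;

int main() {
  Payload p;
  MaterializedPayloadList by_ref;
  by_ref.emplace_back(p);        // stores a reference, no copy of Payload

  MaterializedHandleList by_val;
  by_val.push_back(Handle{42});  // stores a copy of the small handle
  return by_ref.size() + by_val.size() == 2 ? 0 : 1;
}

This is why materializing trades one dynamic allocation for cheap repeated access: the expensive elements are held by reference, and only the cheap handle-like elements are copied.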
+ */ +template +class IListRefTagImplBase> { + public: + using elem_type = _MaterializedIListRefElem; + using list_type = MaterializedIListRef; + + static const list_type& unwrap(const IListRef& ilist) { + return *ilist.payload_.materialized; + } + + static typename list_type::const_iterator& unwrap(IListRefIterator& it) { + return it.payload_.materialized_iterator; + } + + static const typename list_type::const_iterator& unwrap( + const IListRefIterator& it) { + return it.payload_.materialized_iterator; + } + + static IListRefConstRef front(const list_type& lst) { + return lst[0]; + } + + static IListRefConstRef iterator_get( + const typename list_type::const_iterator& it) { + return *it; + } +}; + +/* + * [Note: ITensorListRef] + * Specializations necessary for `IListRef` type. + * + * Since the default implementations are usually done with supporting + * `Tensor` in mind, we only have to inherit from the base implementations. + */ +template <> +class IListRefTagImpl + : public IListRefTagImplBase {}; + +template <> +class IListRefTagImpl + : public IListRefTagImplBase {}; + +template <> +class IListRefTagImpl + : public IListRefTagImplBase< + IListRefTag::Materialized, + at::Tensor, + _MaterializedIListRefElem> {}; + +/* + * [Note: IOptTensorListRef] + * Specializations necessary for `IListRef` type. + * + * We can't get an `at::OptionalTensorRef` directly from an instance of + * `List>` (the type that corresponds to the boxed world). + * + * So, the default implementation won't help us. Thus, we have to implement + * this method ourselves. + */ +template <> +class IListRefTagImpl + : public IListRefTagImplBase {}; + +template <> +class IListRefTagImpl + : public IListRefTagImplBase> { + + public: + /* + * Given an instance of the types corresponding to the `Boxed` tag, we override + * the default implementation, so that we can return a `at::OptionalTensorRef`. + */ + static IListRefConstRef iterator_get( + const typename list_type::const_iterator& it) { + const auto& ivalue = (*it).get(); + if (!ivalue.isNone()) { + const auto& tensor = ivalue.toTensor(); + return (tensor.defined()) ? tensor : at::OptionalTensorRef{}; + } + return {}; + } +}; + +template <> +class IListRefTagImpl + : public IListRefTagImplBase< + IListRefTag::Materialized, + at::OptionalTensorRef, + _MaterializedIListRefElem> {}; + +} // namespace detail +} // namespace c10 + +namespace at { + +// [Note: ITensorListRef] +using ITensorListRef = c10::IListRef; +using ITensorListRefIterator = c10::IListRefIterator; +using MaterializedITensorListRef = c10::detail::MaterializedIListRef; +// [Note: IOptTensorListRef] +using IOptTensorListRef = c10::IListRef; +using IOptTensorListRefIterator = c10::IListRefIterator; +using MaterializedIOptTensorListRef = c10::detail::MaterializedIListRef; + +} // namespace at diff --git a/aten/src/ATen/core/IListRef_test.cpp b/aten/src/ATen/core/IListRef_test.cpp new file mode 100644 index 000000000000..1a609de74f80 --- /dev/null +++ b/aten/src/ATen/core/IListRef_test.cpp @@ -0,0 +1,254 @@ +#include +#include +#include +#include +#include +#include + +using namespace c10; + +static std::vector get_tensor_vector() { + std::vector tensors; + const size_t SIZE = 5; + for (size_t i = 0; i < SIZE; i++) { + tensors.emplace_back(at::empty({0})); + } + return tensors; +} + +static std::vector> get_boxed_opt_tensor_vector() { + std::vector> optional_tensors; + const size_t SIZE = 5; + for (size_t i = 0; i < SIZE * 2; i++) { + auto opt_tensor = (i % 2 == 0) ? 
optional(at::empty({0})) : nullopt; + optional_tensors.emplace_back(opt_tensor); + } + return optional_tensors; +} + +static std::vector get_unboxed_opt_tensor_vector() { + std::vector optional_tensors; + const size_t SIZE = 5; + for (size_t i = 0; i < SIZE * 2; i++) { + auto opt_tensor = (i % 2 == 0) ? at::OptionalTensorRef(at::empty({0})) + : at::OptionalTensorRef(); + optional_tensors.emplace_back(opt_tensor); + } + return optional_tensors; +} + +template +void check_elements_same(at::ITensorListRef list, const T& thing, int use_count) { + EXPECT_EQ(thing.size(), list.size()); + size_t i = 0; + for (const auto& t : list) { + const at::Tensor& other = thing[i]; + EXPECT_EQ(other.use_count(), use_count); + EXPECT_TRUE(other.is_same(t)); + i++; + } +} + +TEST(ITensorListRefTest, CtorEmpty_IsNone_Throws) { + at::ITensorListRef list; + EXPECT_TRUE(list.isNone()); + // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) + EXPECT_THROW(list.size(), c10::Error); +} + +TEST(ITensorListRefTest, CtorBoxed_IsBoxed) { + auto vec = get_tensor_vector(); + List boxed(vec); + at::ITensorListRef list(boxed); + EXPECT_TRUE(list.isBoxed()); +} + +TEST(ITensorListRefTest, CtorUnboxed_IsUnboxed) { + auto vec = get_tensor_vector(); + at::ArrayRef unboxed(vec); + at::ITensorListRef list(unboxed); + EXPECT_TRUE(list.isUnboxed()); +} + +TEST(ITensorListRefTest, CtorUnboxedIndirect_IsUnboxed) { + auto vec = get_tensor_vector(); + auto check_is_unboxed = [](at::ITensorListRef list) { + EXPECT_TRUE(list.isUnboxed()); + }; + check_is_unboxed(at::ITensorListRef{vec[0]}); + check_is_unboxed(at::ITensorListRef{vec.data(), vec.size()}); + check_is_unboxed(at::ITensorListRef{&*vec.begin(), &*vec.end()}); + check_is_unboxed(vec); + check_is_unboxed({vec[0], vec[1], vec[2]}); +} + +TEST(ITensorListRefTest, CtorTemp_IsUnboxed) { + auto check_is_unboxed = [](at::ITensorListRef list) { + EXPECT_TRUE(list.isUnboxed()); + }; + + auto vec = get_tensor_vector(); + check_is_unboxed({vec[0], vec[1]}); +} + +TEST(ITensorListRefTest, Boxed_GetConstRefTensor) { + auto vec = get_tensor_vector(); + // We need 'boxed' to be 'const' here (and some other tests below) + // because 'List::operator[]' returns a 'ListElementReference' + // instead of returning a 'Tensor'. On the other hand, + // 'List::operator[] const' returns a 'const Tensor &'. + const List boxed(vec); + at::ITensorListRef list(boxed); + static_assert( + std::is_same::value, + "Accessing elements from List through a ITensorListRef should be const references."); + EXPECT_TRUE(boxed[0].is_same(*list.begin())); + EXPECT_TRUE(boxed[1].is_same(*(++list.begin()))); +} + +TEST(ITensorListRefTest, Unboxed_GetConstRefTensor) { + auto vec = get_tensor_vector(); + at::ITensorListRef list(vec); + static_assert( + std::is_same::value, + "Accessing elements from ArrayRef through a ITensorListRef should be const references."); + EXPECT_TRUE(vec[0].is_same(*list.begin())); + EXPECT_TRUE(vec[1].is_same(*(++list.begin()))); +} + +TEST(ITensorListRefTest, Boxed_Equal) { + auto vec = get_tensor_vector(); + List boxed(vec); + check_elements_same(boxed, vec, /* use_count= */ 2); +} + +TEST(ITensorListRefTest, Unboxed_Equal) { + auto vec = get_tensor_vector(); + check_elements_same(at::ArrayRef(vec), vec, /* use_count= */ 1); +} + +TEST(ITensorListRefTest, UnboxedIndirect_Equal) { + // The 4 ref-count locations: + // 1. `vec` + // 2. `initializer_list` for `ITensorListRef` + // 3. `initializer_list` for `std::vector` + // 4. 
temporary `std::vector` + auto vec = get_tensor_vector(); + // Implicit constructors + check_elements_same(vec[0], std::vector{vec[0]}, /* use_count= */ 3); + check_elements_same({vec.data(), vec.size()}, vec, /* use_count= */ 1); + check_elements_same({&*vec.begin(), &*vec.end()}, vec, /* use_count= */ 1); + // Vector constructor + check_elements_same(vec, vec, /* use_count= */ 1); + // InitializerList constructor + check_elements_same({vec[0], vec[1], vec[2]}, std::vector{vec[0], vec[1], vec[2]}, /* use_count= */ 4); +} + +TEST(ITensorListRefTest, BoxedMaterialize_Equal) { + auto vec = get_tensor_vector(); + List boxed(vec); + at::ITensorListRef list(boxed); + auto materialized = list.materialize(); + check_elements_same(list, vec, 2); + check_elements_same(list, materialized, 2); + check_elements_same(materialized, vec, 2); +} + +TEST(ITensorListRefTest, UnboxedMaterialize_Equal) { + auto vec = get_tensor_vector(); + at::ArrayRef unboxed(vec); + at::ITensorListRef list(unboxed); + auto materialized = list.materialize(); + check_elements_same(list, vec, 1); + check_elements_same(list, materialized, 1); + check_elements_same(materialized, vec, 1); +} + +TEST(ITensorListRefIteratorTest, CtorEmpty_ThrowsError) { + at::ITensorListRefIterator it; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) + EXPECT_THROW(*it, c10::Error); +} + +TEST(ITensorListRefIteratorTest, Boxed_GetFirstElement) { + auto vec = get_tensor_vector(); + const List boxed(vec); + at::ITensorListRef list(boxed); + EXPECT_TRUE(boxed[0].is_same(*list.begin())); +} + +TEST(ITensorListRefIteratorTest, Unboxed_GetFirstElement) { + auto vec = get_tensor_vector(); + at::ITensorListRef list(vec); + EXPECT_TRUE(vec[0].is_same(*list.begin())); +} + +TEST(ITensorListRefIteratorTest, Boxed_Equality) { + auto vec = get_tensor_vector(); + List boxed(vec); + at::ITensorListRef list(boxed); + EXPECT_EQ(list.begin(), list.begin()); + EXPECT_NE(list.begin(), list.end()); + EXPECT_NE(list.end(), list.begin()); + EXPECT_EQ(list.end(), list.end()); +} + +TEST(ITensorListRefIteratorTest, Unboxed_Equality) { + auto vec = get_tensor_vector(); + at::ITensorListRef list(vec); + EXPECT_EQ(list.begin(), list.begin()); + EXPECT_NE(list.begin(), list.end()); + EXPECT_NE(list.end(), list.begin()); + EXPECT_EQ(list.end(), list.end()); +} + +TEST(ITensorListRefIteratorTest, Boxed_Iterate) { + auto vec = get_tensor_vector(); + const List boxed(vec); + at::ITensorListRef list(boxed); + size_t i = 0; + for (const auto& t : list) { + EXPECT_TRUE(boxed[i++].is_same(t)); + } + EXPECT_EQ(i, list.size()); +} + +TEST(ITensorListRefIteratorTest, Unboxed_Iterate) { + auto vec = get_tensor_vector(); + at::ITensorListRef list(vec); + size_t i = 0; + for (const auto& t : list) { + EXPECT_TRUE(vec[i++].is_same(t)); + } + EXPECT_EQ(i, list.size()); +} + +TEST(IOptTensorListRefTest, Boxed_Iterate) { + auto vec = get_boxed_opt_tensor_vector(); + const List> boxed(vec); + at::IOptTensorListRef list(boxed); + size_t i = 0; + for (const auto t : list) { + EXPECT_EQ(boxed[i].has_value(), t.has_value()); + if (t.has_value()) { + EXPECT_TRUE((*boxed[i]).is_same(*t)); + } + i++; + } + EXPECT_EQ(i, list.size()); +} + +TEST(IOptTensorListRefTest, Unboxed_Iterate) { + auto vec = get_unboxed_opt_tensor_vector(); + at::ArrayRef unboxed(vec); + at::IOptTensorListRef list(unboxed); + size_t i = 0; + for (const auto t : list) { + EXPECT_EQ(unboxed[i].has_value(), t.has_value()); + if (t.has_value()) { + EXPECT_TRUE((*unboxed[i]).is_same(*t)); + } + i++; + } + 
EXPECT_EQ(i, list.size()); +} diff --git a/aten/src/ATen/core/List.h b/aten/src/ATen/core/List.h index b042fab24f7d..0785a6941aff 100644 --- a/aten/src/ATen/core/List.h +++ b/aten/src/ATen/core/List.h @@ -78,6 +78,10 @@ class ListElementReference final { // assigning another ref to this assigns the underlying value ListElementReference& operator=(ListElementReference&& rhs) &&; + const IValue& get() const& { + return *iterator_; + } + friend void swap(ListElementReference&& lhs, ListElementReference&& rhs); private: @@ -235,6 +239,7 @@ class List final { using value_type = T; using size_type = typename c10::detail::ListImpl::list_type::size_type; using iterator = impl::ListIterator; + using const_iterator = impl::ListIterator; using reverse_iterator = impl::ListIterator; /** diff --git a/aten/src/ATen/core/PythonFallbackKernel.cpp b/aten/src/ATen/core/PythonFallbackKernel.cpp index 6b51aa53156f..f9f3d6ff7f83 100644 --- a/aten/src/ATen/core/PythonFallbackKernel.cpp +++ b/aten/src/ATen/core/PythonFallbackKernel.cpp @@ -1,14 +1,59 @@ -#include -#include -#include +#include +#include +#include namespace { +// This TLS is used to track the state of the dispatcher to be able to restore +// it when calling back into python. +// It has the following invariant: +// - It must be empty while python code is executed. +// - It should only be set once even for multiple dispatcher calls that do not come +// back to python. +// To achieve this, we ensure that the tls is empty by default and emptied again both when +// we call into user torch_dispatch or returning back to python after this call. + +thread_local c10::optional tls_on_entry; + +c10::impl::LocalDispatchKeySet safe_get_tls_on_entry() { + TORCH_CHECK(tls_on_entry.has_value(), "Accessing torch dispatch state outside of '__torch_dispatch__' " + "is not allowed."); + return tls_on_entry.value(); +} + +// All the keys below the Python key +constexpr c10::DispatchKeySet after_Python_keyset = c10::DispatchKeySet(c10::DispatchKeySet::FULL) ^ + (c10::DispatchKeySet(c10::DispatchKeySet::FULL_AFTER, c10::DispatchKey::Python) | + c10::DispatchKeySet(c10::DispatchKey::Python)); + + +// This guard assumes that tls_on_entry has a value. 
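Editorial aside: the `after_Python_keyset` expression is plain set algebra on a fixed-size bitset. A minimal sketch with a toy 8-bit key space (illustrative indices, not real `DispatchKey` values or the real `c10::DispatchKeySet` type) makes the source comment concrete: start from the full set and XOR away the reference key together with everything above it, leaving only the keys below it.

#include <bitset>
#include <cassert>
#include <cstddef>

constexpr std::size_t kNumKeys = 8;
using KeySet = std::bitset<kNumKeys>;

// All keys strictly above `key`: the analogue of the FULL_AFTER set as it is
// used in the expression above, per that expression's own comment.
KeySet full_after(std::size_t key) {
  KeySet s;
  for (std::size_t i = key + 1; i < kNumKeys; ++i) s.set(i);
  return s;
}

int main() {
  const KeySet full = KeySet{}.set();  // analogue of DispatchKeySet(FULL)
  const std::size_t python_key = 5;    // pretend this is the Python key's bit

  KeySet python_only;
  python_only.set(python_key);

  // full ^ (everything-above-Python | Python-itself) == everything strictly below Python.
  const KeySet below_python = full ^ (full_after(python_key) | python_only);

  for (std::size_t i = 0; i < kNumKeys; ++i) {
    assert(below_python.test(i) == (i < python_key));
  }
  return 0;
}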
+struct StashTLSOnEntryGuard { +public: + StashTLSOnEntryGuard(): saved_(tls_on_entry.value()) { + tls_on_entry = c10::nullopt; + } + + ~StashTLSOnEntryGuard() { + TORCH_INTERNAL_ASSERT(!tls_on_entry.has_value()); + tls_on_entry = saved_; + } + +private: + c10::impl::LocalDispatchKeySet saved_; +}; + void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { - // If Python Mode is active, use its PyInterpreter for dispatch - const auto& maybe_python_mode_state = at::impl::PythonModeTLS::get_state(); - if (maybe_python_mode_state) { - maybe_python_mode_state->pyinterpreter()->dispatch(op, stack, maybe_python_mode_state); + TORCH_INTERNAL_ASSERT(tls_on_entry.has_value()); + // c10::impl::ForceDispatchKeyGuard dispatcher_guard(tls_on_entry.value()); + // StashTLSOnEntryGuard stash_guard; + c10::impl::ExcludeDispatchKeyGuard guard(after_Python_keyset); + + + // If Torch Dispatch Mode is active, use its PyInterpreter for dispatch + const auto& maybe_torch_dispatch_mode_state = at::impl::TorchDispatchModeTLS::get_state(); + if (maybe_torch_dispatch_mode_state) { + maybe_torch_dispatch_mode_state->pyinterpreter()->dispatch(op, stack, maybe_torch_dispatch_mode_state); return; } @@ -42,8 +87,53 @@ void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { TORCH_INTERNAL_ASSERT(0, "Hit Python dispatch key but no arguments had PyInterpreter (no tensor args?)"); } +void pythonTLSSnapshotFallback(const c10::OperatorHandle &op, c10::DispatchKeySet dispatch_keys, torch::jit::Stack* stack) { + // It is ok for the tls to be already set here. + // It means that there are multiple calls into the dispatcher not originating from python code. + // The guard below will properly ignore such calls. + at::impl::MaybeSetTLSOnEntryGuard guard; + + op.redispatchBoxed(dispatch_keys & c10::DispatchKeySet(c10::DispatchKeySet::FULL_AFTER, c10::DispatchKey::PythonTLSSnapshot), stack); +} + + } // anonymous namespace +namespace at { +namespace impl { + +RestorePythonTLSSnapshot::RestorePythonTLSSnapshot() : saved_(safe_get_tls_on_entry()), guard_(safe_get_tls_on_entry()) { + tls_on_entry = c10::nullopt; +} + +RestorePythonTLSSnapshot::~RestorePythonTLSSnapshot() { + TORCH_INTERNAL_ASSERT(!tls_on_entry.has_value()); + tls_on_entry = saved_; +} + +MaybeSetTLSOnEntryGuard::MaybeSetTLSOnEntryGuard() { + if (tls_on_entry.has_value()) { + value_set_ = false; + } else { + value_set_ = true; + tls_on_entry = c10::impl::tls_local_dispatch_key_set(); + } +} +MaybeSetTLSOnEntryGuard::~MaybeSetTLSOnEntryGuard() { + if (value_set_) { + TORCH_INTERNAL_ASSERT(tls_on_entry.has_value()); + tls_on_entry = c10::nullopt; + } +} + + +} // namespace impl +} // namespace at + TORCH_LIBRARY_IMPL(_, Python, m) { m.fallback(torch::CppFunction::makeFromBoxedFunction<&pythonFallback>()); } + +TORCH_LIBRARY_IMPL(_, PythonTLSSnapshot, m) { + m.fallback(torch::CppFunction::makeFromBoxedFunction<&pythonTLSSnapshotFallback>()); +} diff --git a/aten/src/ATen/core/PythonFallbackKernel.h b/aten/src/ATen/core/PythonFallbackKernel.h new file mode 100644 index 000000000000..94cd4e81291a --- /dev/null +++ b/aten/src/ATen/core/PythonFallbackKernel.h @@ -0,0 +1,28 @@ +#pragma once + + +namespace at { +namespace impl { + +struct TORCH_API RestorePythonTLSSnapshot { + RestorePythonTLSSnapshot(); + ~RestorePythonTLSSnapshot(); + +private: + c10::impl::LocalDispatchKeySet saved_; + c10::impl::ForceDispatchKeyGuard guard_; +}; + + +// RAII guard to make working with the above TLS safer. 
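Editorial aside: the guards in this area share one RAII shape: capture the thread-local state in the constructor, clear or set it for the duration of the scope, and restore it in the destructor so the stated invariant (the TLS is empty while Python code runs) holds even on early returns or exceptions. A generic, standalone sketch of that shape (a toy `int` snapshot, not the PyTorch guards themselves):

#include <cassert>
#include <optional>

// Stand-in for the thread-local dispatcher snapshot.
thread_local std::optional<int> tls_snapshot;

// Stash-and-clear guard: saves the current TLS value, empties it for the
// duration of the scope, and restores it afterwards.
class StashGuard {
 public:
  StashGuard() : saved_(tls_snapshot) { tls_snapshot.reset(); }
  ~StashGuard() { tls_snapshot = saved_; }
  StashGuard(const StashGuard&) = delete;
  StashGuard& operator=(const StashGuard&) = delete;

 private:
  std::optional<int> saved_;
};

// Maybe-set guard: only takes ownership of the TLS slot if nobody set it yet,
// and only clears it again if it was the one that set it.
class MaybeSetGuard {
 public:
  explicit MaybeSetGuard(int value) {
    if (!tls_snapshot.has_value()) {
      tls_snapshot = value;
      owns_ = true;
    }
  }
  ~MaybeSetGuard() {
    if (owns_) tls_snapshot.reset();
  }

 private:
  bool owns_ = false;
};

int main() {
  {
    MaybeSetGuard outer(7);    // sets the TLS: it was empty
    assert(tls_snapshot == 7);
    {
      MaybeSetGuard inner(9);  // does nothing: the TLS is already set
      assert(tls_snapshot == 7);
      StashGuard stash;        // empties the TLS for this scope
      assert(!tls_snapshot.has_value());
    }                          // stash restores 7; inner leaves it alone
    assert(tls_snapshot == 7);
  }                            // outer clears the TLS again
  assert(!tls_snapshot.has_value());
  return 0;
}

Nesting is what makes the "only set once, even across multiple dispatcher calls" property work: inner guards detect that the slot is already populated and become no-ops.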
+struct TORCH_API MaybeSetTLSOnEntryGuard { +public: + MaybeSetTLSOnEntryGuard(); + ~MaybeSetTLSOnEntryGuard(); + +private: + bool value_set_; +}; + +} // namespace impl +} // namespace at diff --git a/aten/src/ATen/core/PythonModeTLS.cpp b/aten/src/ATen/core/PythonModeTLS.cpp deleted file mode 100644 index dd4b44bc5fed..000000000000 --- a/aten/src/ATen/core/PythonModeTLS.cpp +++ /dev/null @@ -1,26 +0,0 @@ -#include - -namespace at { namespace impl { - -thread_local std::shared_ptr pythonModeState; - -void PythonModeTLS::set_state(const std::shared_ptr& state) { - pythonModeState = state; - if (state) { - c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, true); - } else { - PythonModeTLS::reset_state(); - } -} - -const std::shared_ptr& PythonModeTLS::get_state() { - return pythonModeState; -} - -void PythonModeTLS::reset_state() { - pythonModeState.reset((TorchDispatchTypeObject*)nullptr); - c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, false); -} - -} // namespace impl -} // namespace at diff --git a/aten/src/ATen/core/PythonModeTLS.h b/aten/src/ATen/core/PythonModeTLS.h deleted file mode 100644 index be52b182c659..000000000000 --- a/aten/src/ATen/core/PythonModeTLS.h +++ /dev/null @@ -1,17 +0,0 @@ -#pragma once - -#include -#include -#include - -namespace at { -namespace impl { - -struct TORCH_API PythonModeTLS { - static void set_state(const std::shared_ptr& state); - static const std::shared_ptr& get_state(); - static void reset_state(); -}; - -} // namespace impl -} // namespace at diff --git a/aten/src/ATen/core/QuantizerBase.h b/aten/src/ATen/core/QuantizerBase.h index e11d8d6e049c..922ea8a38f50 100644 --- a/aten/src/ATen/core/QuantizerBase.h +++ b/aten/src/ATen/core/QuantizerBase.h @@ -55,7 +55,7 @@ struct TORCH_API Quantizer : public c10::intrusive_ptr_target { */ virtual QScheme qscheme() const = 0; - ScalarType scalar_type() { + ScalarType scalar_type() const { return scalar_type_; } @@ -77,7 +77,7 @@ struct TORCH_API Quantizer : public c10::intrusive_ptr_target { /** * Compare against `other` for equality. 
*/ - virtual bool equalTo(QuantizerPtr other) = 0; + virtual bool equalTo(QuantizerPtr other) const = 0; }; } // namespace at diff --git a/aten/src/ATen/core/Tensor.cpp b/aten/src/ATen/core/Tensor.cpp index 9f56923c1cdb..fa175165d2e1 100644 --- a/aten/src/ATen/core/Tensor.cpp +++ b/aten/src/ATen/core/Tensor.cpp @@ -4,6 +4,15 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#include +#include +#endif + #include namespace at { @@ -29,6 +38,18 @@ const TensorBase& TensorBase::zero_() const { return *this; } +TensorBase TensorBase::to( + at::TensorOptions options, + bool non_blocking, + bool copy, + c10::optional memory_format) const { + Tensor self(*this); + return at::_ops::to_dtype_layout::call( + self, optTypeMetaToScalarType(options.dtype_opt()), + options.layout_opt(), options.device_opt(), + options.pinned_memory_opt(), non_blocking, copy, memory_format); +} + void TensorBase::enforce_invariants() { if (impl_.get() == nullptr) { throw std::runtime_error("TensorImpl with nullptr is not supported"); diff --git a/aten/src/ATen/core/TensorAccessor.h b/aten/src/ATen/core/TensorAccessor.h index 9d65522b5d96..9c60f84a16b3 100644 --- a/aten/src/ATen/core/TensorAccessor.h +++ b/aten/src/ATen/core/TensorAccessor.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include diff --git a/aten/src/ATen/core/TensorBase.h b/aten/src/ATen/core/TensorBase.h index b05f74259dc2..37c1ed895782 100644 --- a/aten/src/ATen/core/TensorBase.h +++ b/aten/src/ATen/core/TensorBase.h @@ -17,6 +17,7 @@ #include #include +#include #include namespace c10 { @@ -43,7 +44,6 @@ inline bool variable_excluded_from_dispatch() { // Please read the comment in `VariableFallbackKernel.cpp` about the background of this change. return true; #else - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!c10::impl::tls_local_dispatch_key_set().excluded_.has(DispatchKey::Autograd)); return c10::impl::tls_local_dispatch_key_set().excluded_.isSupersetOf(c10::autograd_dispatch_keyset); #endif } @@ -142,6 +142,8 @@ class TORCH_API TensorBase { const TensorBase& fill_(const c10::Scalar& scalar) const; const TensorBase& zero_() const; + TensorBase to(at::TensorOptions options={}, bool non_blocking=false, bool copy=false, c10::optional memory_format=c10::nullopt) const; + bool is_complex() const { return at::isComplexType(this->scalar_type()); } @@ -155,15 +157,17 @@ class TORCH_API TensorBase { } int64_t size(int64_t dim) const { + const auto sizes = this->sizes(); + const auto ndim = static_cast(sizes.size()); // false is passed to maybe_wrap_dim so behavior is identical to array access (but with wrapping) - dim = c10::maybe_wrap_dim(dim, this->dim(), false); - return sizes()[dim]; + return sizes[c10::maybe_wrap_dim(dim, ndim, /*wrap_scalar=*/false)]; } int64_t stride(int64_t dim) const { + const auto strides = this->strides(); + const auto ndim = static_cast(strides.size()); // false is passed to maybe_wrap_dim so behavior is identical to array access (but with wrapping) - dim = c10::maybe_wrap_dim(dim, this->dim(), false); - return strides()[dim]; + return strides[c10::maybe_wrap_dim(dim, ndim, /*wrap_scalar=*/false)]; } TensorImpl * unsafeGetTensorImpl() const { @@ -216,6 +220,9 @@ class TORCH_API TensorBase { IntArrayRef sizes() const { return impl_->sizes(); } + c10::SymIntArrayRef sym_sizes() const { + return c10::SymIntArrayRef(reinterpret_cast(sizes().data()), sizes().size()); + } IntArrayRef strides() const { return impl_->strides(); } @@ -243,7 +250,7 @@ class TORCH_API TensorBase { bool 
channels_last_strides_exact_match = false) const { // Setting channels_last_strides_exact_match to true forces function to // check 0,1 - sized dimension strides. - if (!is_mkldnn() && !is_sparse()) { + if (layout() == at::kStrided) { if (impl_->is_strides_like_channels_last()) { if (!channels_last_strides_exact_match || get_channels_last_strides_2d(sizes()) == strides()) { @@ -369,6 +376,12 @@ class TORCH_API TensorBase { return impl_->is_cuda(); } + /// Returns if a `Tensor` has IPU backend. + bool is_ipu() const { + // NB: this is not a native function to avoid dispatching overhead. + return impl_->is_ipu(); + } + /// Returns if a `Tensor` has XPU backend. bool is_xpu() const { // NB: this is not a native function to avoid dispatching overhead. @@ -420,10 +433,10 @@ class TORCH_API TensorBase { return impl_->is_mkldnn(); } - /// Returns if a `Tensor` is mlc tensor. - bool is_mlc() const { + /// Returns if a `Tensor` is mps tensor. + bool is_mps() const { // NB: this is not a native function to avoid dispatching overhead. - return impl_->is_mlc(); + return impl_->is_mps(); } /// Returns if a `Tensor` is ort tensor. @@ -461,6 +474,11 @@ class TORCH_API TensorBase { return impl_->is_inference(); } + // Returns if a `Tensor` is a NestedTensor. + bool is_nested() const { + return impl_->is_nested(); + } + /// If a tensor is a quantized tensor, returns its quantizer /// TODO: it's not in native_functions.yaml yet as it's not exposed to python QuantizerPtr quantizer() const; @@ -865,7 +883,7 @@ struct MaybeOwnedTraits { return &borrow; } - static bool debugBorrowIsValid(const borrow_type& borrow) { + static bool debugBorrowIsValid(const borrow_type& /*borrow*/) { return true; } }; diff --git a/aten/src/ATen/core/TorchDispatchModeTLS.cpp b/aten/src/ATen/core/TorchDispatchModeTLS.cpp new file mode 100644 index 000000000000..6c35890eb8b6 --- /dev/null +++ b/aten/src/ATen/core/TorchDispatchModeTLS.cpp @@ -0,0 +1,29 @@ +#include +#include + +namespace at { namespace impl { + +thread_local std::shared_ptr torchDispatchModeState; + +void TorchDispatchModeTLS::set_state(std::shared_ptr state) { + if (state) { + c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, true); + c10::impl::tls_set_dispatch_key_included(DispatchKey::PythonTLSSnapshot, true); + } else { + TorchDispatchModeTLS::reset_state(); + } + torchDispatchModeState = std::move(state); +} + +const std::shared_ptr& TorchDispatchModeTLS::get_state() { + return torchDispatchModeState; +} + +void TorchDispatchModeTLS::reset_state() { + torchDispatchModeState.reset(); + c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, false); + c10::impl::tls_set_dispatch_key_included(DispatchKey::PythonTLSSnapshot, false); +} + +} // namespace impl +} // namespace at diff --git a/aten/src/ATen/core/TorchDispatchModeTLS.h b/aten/src/ATen/core/TorchDispatchModeTLS.h new file mode 100644 index 000000000000..adbf30844382 --- /dev/null +++ b/aten/src/ATen/core/TorchDispatchModeTLS.h @@ -0,0 +1,17 @@ +#pragma once + +#include +#include +#include + +namespace at { +namespace impl { + +struct TORCH_API TorchDispatchModeTLS { + static void set_state(std::shared_ptr state); + static const std::shared_ptr& get_state(); + static void reset_state(); +}; + +} // namespace impl +} // namespace at diff --git a/aten/src/ATen/core/VariableFallbackKernel.cpp b/aten/src/ATen/core/VariableFallbackKernel.cpp index ed67c4754fed..ebc54d8e7cba 100644 --- a/aten/src/ATen/core/VariableFallbackKernel.cpp +++ b/aten/src/ATen/core/VariableFallbackKernel.cpp @@ -56,7 
+56,7 @@ TORCH_LIBRARY_IMPL(_, AutogradLazy, m) { m.fallback(torch::CppFunction::makeFallthrough()); } -TORCH_LIBRARY_IMPL(_, AutogradMLC, m) { +TORCH_LIBRARY_IMPL(_, AutogradMPS, m) { m.fallback(torch::CppFunction::makeFallthrough()); } diff --git a/aten/src/ATen/core/boxing/KernelFunction_impl.h b/aten/src/ATen/core/boxing/KernelFunction_impl.h index c9611475255b..01537c2dc471 100644 --- a/aten/src/ATen/core/boxing/KernelFunction_impl.h +++ b/aten/src/ATen/core/boxing/KernelFunction_impl.h @@ -157,6 +157,7 @@ inline KernelFunction KernelFunction::makeFromUnboxedFunction(FuncPtr func_ptr) static_assert(FuncPtr::func_ptr() != nullptr, "Kernel function cannot be nullptr"); #if !defined(C10_MOBILE) + (void)func_ptr; // Suppress unused variable warning return makeFromUnboxedFunctor::type>( guts::make_unique_base::type>() ); diff --git a/aten/src/ATen/core/boxing/impl/boxing.h b/aten/src/ATen/core/boxing/impl/boxing.h index d9b14623dc54..4bbc5dd69dcf 100644 --- a/aten/src/ATen/core/boxing/impl/boxing.h +++ b/aten/src/ATen/core/boxing/impl/boxing.h @@ -200,7 +200,7 @@ struct BoxedKernelWrapper< // 3. in-place ops take a single non-const Tensor reference // as their first argument, and return it. // -// Note: all signatures matching this pattern are are assumed to be for such ops. +// Note: all signatures matching this pattern are assumed to be for such ops. // Because of this, the generated BoxedKernelWrapper specializations simply // return the in-place argument. // @@ -260,7 +260,7 @@ struct BoxedKernelWrapper< // 4. out of place ops that take a single non-const Tensor reference as their // final argument, and also return it. // -// Note: all signatures matching this pattern are are assumed to be for such ops. +// Note: all signatures matching this pattern are assumed to be for such ops. // This assumption permits the generated BoxedKernelWrapper specializations to simply // return out arguments. // @@ -300,7 +300,7 @@ struct BoxedKernelWrapper< // 5. out of place ops that take multiple non-const Tensor references as their // final arguments, and return them in a std::tuple. // -// Note: all signatures matching this pattern are are assumed to be for such ops. +// Note: all signatures matching this pattern are assumed to be for such ops. // This assumption permits the generated BoxedKernelWrapper specializations to simply // return the out arguments. // diff --git a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h index f48246c02fd6..2b2228bb944d 100644 --- a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h +++ b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -180,6 +181,13 @@ namespace impl { "You tried to register a kernel with an unsupported input type: ArrayRef. Please use List, List or Tensor instead."); }; + template + struct assert_is_valid_input_type, AllowDeprecatedTypes> + : assert_is_valid_input_type { + static_assert(!std::is_same::value, + "You tried to register a kernel with an unsupported input type: OptionalArrayRef. 
Please use List, List or Tensor instead."); + }; + template struct assert_is_valid_input_type, AllowDeprecatedTypes> : assert_is_valid_input_type { @@ -233,6 +241,10 @@ namespace impl { struct assert_is_valid_output_type, AllowDeprecatedTypes> : assert_is_valid_output_type {}; + template + struct assert_is_valid_output_type, AllowDeprecatedTypes> + : assert_is_valid_output_type {}; + template struct assert_is_valid_output_type, AllowDeprecatedTypes> : assert_is_valid_output_type { @@ -358,16 +370,33 @@ namespace impl { return ivalue_to_arg, AllowDeprecatedTypes>::call(v); } }; + template + struct ivalue_to_arg final { + static std::vector call(IValue& v) { + return ivalue_to_arg, AllowDeprecatedTypes>::call(v); + } + }; template struct ivalue_to_arg>, AllowDeprecatedTypes> final { // If an argument is optional>, convert the IValue to an optional> and pass that - // to the operator. OptionalArray is basically a optional> but impliticly convertible + // to the operator. OptionalArray is basically a optional> but implicitly convertible // to optional>. static OptionalArray call(IValue& v) { return ivalue_to_arg, AllowDeprecatedTypes>::call(v); } }; + template + struct ivalue_to_arg, AllowDeprecatedTypes> final { + // If an argument is OptionalArrayRef, convert the IValue to an + // optional> and pass that to the operator. OptionalArray + // is basically a optional> but implicitly convertible to + // OptionalArrayRef + static OptionalArray call(IValue& v) { + return ivalue_to_arg, AllowDeprecatedTypes>::call(v); + } + }; + // return_to_ivalue template struct return_to_ivalue final {}; diff --git a/aten/src/ATen/core/builtin_function.h b/aten/src/ATen/core/builtin_function.h index 3c6fd0c77cad..6f1e9e75ea3e 100644 --- a/aten/src/ATen/core/builtin_function.h +++ b/aten/src/ATen/core/builtin_function.h @@ -62,7 +62,7 @@ struct BuiltinOpFunction : public Function { return *this; } - bool call(Stack& stack, size_t, c10::function_ref) override { + bool call(Stack& stack, c10::optional, c10::function_ref) override { run(stack); return false; } diff --git a/aten/src/ATen/core/class_type.h b/aten/src/ATen/core/class_type.h index 3a019708cdda..67507c89bf1b 100644 --- a/aten/src/ATen/core/class_type.h +++ b/aten/src/ATen/core/class_type.h @@ -391,6 +391,7 @@ struct TORCH_API ClassType : public NamedType { std::vector unresolved_class_attributes = {}); std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + (void)printer; // Suppress unused variable warning const auto& n = name().value(); return n.qualifiedName(); } diff --git a/aten/src/ATen/core/custom_class.cpp b/aten/src/ATen/core/custom_class.cpp index f61766c0cef3..2bba7e6df62f 100644 --- a/aten/src/ATen/core/custom_class.cpp +++ b/aten/src/ATen/core/custom_class.cpp @@ -4,22 +4,38 @@ #include #include #include +#include #include #include namespace c10 { -ska::flat_hash_map& getCustomClassTypeMap() { +static ska::flat_hash_map& getCustomClassTypeMap() { static ska::flat_hash_map tmap; return tmap; } -std::unordered_map>& -getClassConverter() { - static std::unordered_map> - classConverter; - return classConverter; +c10::ClassTypePtr getCustomClassTypeImpl(const std::type_index &tindex) { + auto& tmap = c10::getCustomClassTypeMap(); + auto res = tmap.find(tindex); + if (C10_UNLIKELY(res == tmap.end())) { + // type_index is not guaranteed to be unique across shared libraries on some platforms + // For example see https://github.com/llvm-mirror/libcxx/blob/78d6a7767ed57b50122a161b91f59f19c9bd0d19/include/typeinfo#L133 + // 
Also, this is not the case if RTLD_LOCAL option is used, see + // https://github.com/pybind/pybind11/blob/f791dc8648e1f6ec33f402d679b6b116a76d4e1b/include/pybind11/detail/internals.h#L101-L106 + // Take a slow path of iterating over all registered types and compare their names + auto class_name = std::string(tindex.name()); + for(const auto &it: tmap) { + if (class_name == it.first.name()) { + // Do not modify existing type map here as this template is supposed to be called only once per type + // from getCustomClassTypeImpl() + return it.second; + } + } + TORCH_CHECK(false, "Can't find class id in custom class type map for ", tindex.name()); + } + return res->second; } } // namespace c10 @@ -29,7 +45,7 @@ namespace torch { namespace detail { void record_custom_class(std::string name) { - RECORD_FUNCTION_WITH_SCOPE(at::RecordScope::CUSTOM_CLASS, name, {}); + RECORD_FUNCTION_WITH_SCOPE(at::RecordScope::CUSTOM_CLASS, name, c10::ArrayRef{}); } } // namespace detail diff --git a/aten/src/ATen/core/custom_class.h b/aten/src/ATen/core/custom_class.h index 54d7bfecd762..ff9bda981b29 100644 --- a/aten/src/ATen/core/custom_class.h +++ b/aten/src/ATen/core/custom_class.h @@ -2,45 +2,17 @@ #include #include -#include #include #include #include -#include -#include namespace c10 { struct ClassType; using ClassTypePtr = std::shared_ptr; -TORCH_API ska::flat_hash_map& -getCustomClassTypeMap(); - -template -c10::ClassTypePtr getCustomClassTypeImpl() { - auto& tmap = c10::getCustomClassTypeMap(); - auto tindex = std::type_index(typeid(T)); - auto res = tmap.find(tindex); - if (C10_UNLIKELY(res == tmap.end())) { - // type_index is not guaranteed to be unique across shared libraries on some platforms - // For example see https://github.com/llvm-mirror/libcxx/blob/78d6a7767ed57b50122a161b91f59f19c9bd0d19/include/typeinfo#L133 - // Also, this is not the case if RTLD_LOCAL option is used, see - // https://github.com/pybind/pybind11/blob/f791dc8648e1f6ec33f402d679b6b116a76d4e1b/include/pybind11/detail/internals.h#L101-L106 - // Take a slow path of iterating over all registered types and compare their names - auto class_name = std::string(tindex.name()); - for(const auto &it: tmap) { - if (class_name == it.first.name()) { - // Do not modify existing type map here as this template is supposed to be called only once per type - // from getCustomClassTypeImpl() - return it.second; - } - } - TORCH_CHECK(false, "Can't find class id in custom class type map for ", tindex.name()); - } - return res->second; -} +TORCH_API c10::ClassTypePtr getCustomClassTypeImpl(const std::type_index &tindex); template const c10::ClassTypePtr& getCustomClassType() { @@ -48,10 +20,9 @@ const c10::ClassTypePtr& getCustomClassType() { // hash lookup can be a hot path, so just cache. // For the same reason, it's fine If this ends up getting duplicated across // DSO boundaries for whatever reason. 
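Editorial aside: the pattern here is a registry keyed by `std::type_index`, a per-type cache in a function-local static, and a slower by-name fallback for the case where `type_index` values differ across shared libraries. A hedged, self-contained sketch of that pattern (a toy string registry, not the c10 custom-class map):

#include <cassert>
#include <cstring>
#include <string>
#include <typeindex>
#include <typeinfo>
#include <unordered_map>

// Toy registry mapping C++ types to a registered name.
static std::unordered_map<std::type_index, std::string>& registry() {
  static std::unordered_map<std::type_index, std::string> map;
  return map;
}

// Slow path: exact type_index lookup, then a by-name scan as a fallback,
// mirroring the "type_index is not unique across DSOs" caveat above.
static const std::string& lookup(const std::type_index& tindex) {
  auto& map = registry();
  auto it = map.find(tindex);
  if (it != map.end()) return it->second;
  for (auto& entry : map) {
    if (std::strcmp(entry.first.name(), tindex.name()) == 0) {
      return entry.second;
    }
  }
  static const std::string missing = "<unregistered>";
  return missing;
}

// Fast path: each instantiation resolves the lookup once and caches the result
// in a function-local static, so repeated calls skip the hash lookup entirely.
template <typename T>
const std::string& registered_name() {
  static const std::string& cached = lookup(std::type_index(typeid(T)));
  return cached;
}

struct MyClass {};

int main() {
  registry().emplace(std::type_index(typeid(MyClass)), "my_namespace::MyClass");
  assert(registered_name<MyClass>() == "my_namespace::MyClass");
  assert(&registered_name<MyClass>() == &registered_name<MyClass>());  // cached once
  return 0;
}

Moving the slow path out of the header (as the diff does) keeps the template thin: only the cheap cached call is instantiated per type, and duplicated caches across DSOs stay harmless because they all resolve to the same registry entry.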
- static c10::ClassTypePtr cache = getCustomClassTypeImpl(); + static c10::ClassTypePtr cache = getCustomClassTypeImpl( + std::type_index(typeid(T))); return cache; } -TORCH_API std::unordered_map>& -getClassConverter(); } diff --git a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp index a930edc2db63..9180d0d19e64 100644 --- a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp +++ b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp @@ -6,11 +6,52 @@ namespace c10 { void DispatchKeyExtractor::setOperatorHasFallthroughForKey(DispatchKey k, bool has_fallthrough) { + // (1) update nonFallthroughKeys_ if (has_fallthrough) { nonFallthroughKeys_ = nonFallthroughKeys_.remove(k); } else { nonFallthroughKeys_ = nonFallthroughKeys_.add(k); } + // (2) update nonFallthroughKeysPerBackend_ + if (isPerBackendFunctionalityKey(toFunctionalityKey(k))) { + // This is a per-backend functionality key. + // We need to figure out what the current backend is, + // and only update the bitset for that backend. + // subtracting 1 because the first backend should have index 0 (CPU), + // But the enum starts with BackendComponent::InvalidBit. + auto backend_idx = static_cast(toBackendComponent(k)) - 1; + TORCH_INTERNAL_ASSERT(backend_idx >= 0 && static_cast(backend_idx) < nonFallthroughKeysPerBackend_.size()); + if (has_fallthrough) { + nonFallthroughKeysPerBackend_[backend_idx] = nonFallthroughKeysPerBackend_[backend_idx].remove(k); + } else { + nonFallthroughKeysPerBackend_[backend_idx] = nonFallthroughKeysPerBackend_[backend_idx].add(k); + } + + // Set requiresBitsetPerBackend_ accordingly + for (const auto i : c10::irange(nonFallthroughKeysPerBackend_.size() - 1)) { + if (nonFallthroughKeysPerBackend_[i] != nonFallthroughKeysPerBackend_[i+1]) { + requiresBitsetPerBackend_ = true; + return; + } + } + requiresBitsetPerBackend_ = false; + return; + } else { + // Otherwise, if a fallthrough is set for a functionality that isn't per backend, + // Then we update the fallthrough bitset for EVERY backend. + // TODO: we could probably optimize this by only lazily updating these values + // the first time that we see requiresBitsetPerBackend_ = true + // (which should almost never happen) + if (has_fallthrough) { + for (const auto i : c10::irange(nonFallthroughKeysPerBackend_.size())) { + nonFallthroughKeysPerBackend_[i] = nonFallthroughKeysPerBackend_[i].remove(k); + } + } else { + for (const auto i : c10::irange(nonFallthroughKeysPerBackend_.size())) { + nonFallthroughKeysPerBackend_[i] = nonFallthroughKeysPerBackend_[i].add(k); + } + } + } } std::string DispatchKeyExtractor::dumpState() const { diff --git a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h index 4d2e7d0d4bdc..d5345b28e714 100644 --- a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h +++ b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h @@ -74,7 +74,7 @@ namespace detail { } } } - void operator()(at::ArrayRef> xs) { + void operator()(at::ArrayRef>) { // Just checking that the handling of Tensor?[] didn't change. 
TORCH_INTERNAL_ASSERT(false); } @@ -89,7 +89,7 @@ namespace detail { } } template - void operator()(const T& x) { + void operator()(const T&) { // do nothing } }; @@ -156,14 +156,24 @@ struct TORCH_API DispatchKeyExtractor final { } }); // Keys that are fallthrough should be skipped - return impl::computeDispatchKeySet(ks, nonFallthroughKeys_); + if (requiresBitsetPerBackend_) { + auto backend_idx = ks.getBackendIndex(); + return impl::computeDispatchKeySet(ks, nonFallthroughKeysPerBackend_[backend_idx]); + } else { + return impl::computeDispatchKeySet(ks, nonFallthroughKeys_); + } } template DispatchKeySet getDispatchKeySetUnboxed(const Args&... args) const { auto ks = detail::multi_dispatch_key_set(args...); // Keys that are fallthrough should be skipped - return impl::computeDispatchKeySet(ks, nonFallthroughKeys_); + if (requiresBitsetPerBackend_) { + auto backend_idx = ks.getBackendIndex(); + return impl::computeDispatchKeySet(ks, nonFallthroughKeysPerBackend_[backend_idx]); + } else { + return impl::computeDispatchKeySet(ks, nonFallthroughKeys_); + } } void setOperatorHasFallthroughForKey(DispatchKey k, bool has_fallthrough); @@ -193,7 +203,12 @@ struct TORCH_API DispatchKeyExtractor final { explicit DispatchKeyExtractor(c10::utils::bitset dispatch_arg_indices_reverse) : dispatch_arg_indices_reverse_(dispatch_arg_indices_reverse) - , nonFallthroughKeys_(DispatchKeySet::FULL) {} + , nonFallthroughKeys_(DispatchKeySet::FULL) + , requiresBitsetPerBackend_(false) { + for (const auto i : c10::irange(nonFallthroughKeysPerBackend_.size())) { + nonFallthroughKeysPerBackend_[i] = DispatchKeySet::FULL; + } + } // this is a bitset that has ones for each argument index which has to be // considered for dispatch. This avoids having to iterate over the stack @@ -205,8 +220,14 @@ struct TORCH_API DispatchKeyExtractor final { // fallthrough c10::utils::bitset dispatch_arg_indices_reverse_; - // Set of keys for which the operator does NOT have fallthrough kernel. + // Set of functionality keys for which the operator does NOT have fallthrough kernel. DispatchKeySet nonFallthroughKeys_; + // Set of functionality keys for which the operator does NOT have fallthrough kernel, defined PER BACKEND. + // This is only needed if we know that the operator has a different set of fallthroughs defined for some backends. 
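Editorial aside: the per-backend fallthrough bookkeeping boils down to three pieces: one shared non-fallthrough mask for the common case, a per-backend copy of that mask, and a flag that flips only when some backend's copy diverges so dispatch takes the slower per-backend path. A toy sketch of that bookkeeping (the backend count and mask width are illustrative, not the real values):

#include <array>
#include <bitset>
#include <cassert>
#include <cstddef>

constexpr std::size_t kNumBackends = 4;  // illustrative
constexpr std::size_t kNumKeys = 16;     // illustrative functionality-key count
using KeyMask = std::bitset<kNumKeys>;

struct FallthroughState {
  KeyMask shared = KeyMask{}.set();                 // fast path: one mask for all backends
  std::array<KeyMask, kNumBackends> per_backend{};  // slow path: one mask per backend
  bool per_backend_differs = false;

  FallthroughState() { per_backend.fill(KeyMask{}.set()); }

  // Register a fallthrough for `key`, either for one backend or for all of them.
  void set_fallthrough(std::size_t key, int backend /* -1 == every backend */) {
    if (backend >= 0) {
      per_backend[static_cast<std::size_t>(backend)].reset(key);
      // Recompute the flag: any divergence forces the per-backend path.
      per_backend_differs = false;
      for (std::size_t i = 1; i < kNumBackends; ++i) {
        if (per_backend[i] != per_backend[0]) { per_backend_differs = true; break; }
      }
    } else {
      shared.reset(key);
      for (auto& mask : per_backend) mask.reset(key);
    }
  }

  // Dispatch-time mask selection mirrors the branch added to the extractor above.
  const KeyMask& mask_for(std::size_t backend) const {
    return per_backend_differs ? per_backend[backend] : shared;
  }
};

int main() {
  FallthroughState s;
  s.set_fallthrough(/*key=*/3, /*backend=*/-1);  // functionality-wide fallthrough
  assert(!s.per_backend_differs);
  assert(!s.mask_for(2).test(3));

  s.set_fallthrough(/*key=*/7, /*backend=*/1);   // backend-specific fallthrough
  assert(s.per_backend_differs);
  assert(!s.mask_for(1).test(7));
  assert(s.mask_for(0).test(7));                 // other backends still dispatch key 7
  return 0;
}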
+ std::array nonFallthroughKeysPerBackend_; + // Flag to tell us if we can use the single set of nonFallthroughKeys_ (fast path), + // or if we need to fall back to the slower path and check nonFallthroughKeysPerBackend_ + bool requiresBitsetPerBackend_; }; } diff --git a/aten/src/ATen/core/dispatch/Dispatcher.cpp b/aten/src/ATen/core/dispatch/Dispatcher.cpp index 3dccc4645a82..66be5a187027 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.cpp +++ b/aten/src/ATen/core/dispatch/Dispatcher.cpp @@ -267,14 +267,16 @@ void Dispatcher::cleanup(const OperatorHandle& op, const OperatorName& op_name) RegistrationHandleRAII Dispatcher::registerFallback(DispatchKey dispatchKey, KernelFunction kernel, std::string debug) { std::lock_guard lock(mutex_); + auto idx = getDispatchTableIndexForDispatchKey(dispatchKey); + TORCH_CHECK(idx >= 0 && static_cast(idx) < backendFallbackKernels_.size(), "idx=", idx); TORCH_CHECK( - !backendFallbackKernels_[static_cast(dispatchKey)].kernel.isValid(), + !backendFallbackKernels_[idx].kernel.isValid(), "Tried to register multiple backend fallbacks for the same dispatch key ", dispatchKey, "; previous registration ", - backendFallbackKernels_[static_cast(dispatchKey)].debug, ", new registration ", debug + backendFallbackKernels_[idx].debug, ", new registration ", debug ); // NB: inferred function schema is always nullptr for fallbacks, as fallbacks // cannot be unobxed - backendFallbackKernels_[static_cast(dispatchKey)] = impl::AnnotatedKernel(std::move(kernel), nullptr, std::move(debug)); + backendFallbackKernels_[idx] = impl::AnnotatedKernel(std::move(kernel), nullptr, std::move(debug)); for (auto& op : operators_) { op.op.updateFallback(*this, dispatchKey); @@ -288,7 +290,8 @@ RegistrationHandleRAII Dispatcher::registerFallback(DispatchKey dispatchKey, Ker void Dispatcher::deregisterFallback_(DispatchKey dispatchKey) { std::lock_guard lock(mutex_); - backendFallbackKernels_[static_cast(dispatchKey)] = {}; + auto idx = getDispatchTableIndexForDispatchKey(dispatchKey); + backendFallbackKernels_[idx] = {}; for (auto& op : operators_) { op.op.updateFallback(*this, dispatchKey); @@ -353,18 +356,18 @@ int64_t Dispatcher::sequenceNumberForRunningRecordFunction(DispatchKey dispatchK return seq_num; } -void Dispatcher::runRecordFunction(at::RecordFunction& guard, const OperatorHandle& op, DispatchKey dispatchKey, const torch::jit::Stack &stack) { - guard.before(op, stack, sequenceNumberForRunningRecordFunction(dispatchKey)); +void Dispatcher::runRecordFunction(at::RecordFunction& guard, at::RecordFunction::schema_ref_t schema_ref, DispatchKey dispatchKey, const torch::jit::Stack &stack) { + guard.before(schema_ref, c10::ArrayRef(stack.data(), stack.size()), sequenceNumberForRunningRecordFunction(dispatchKey)); } -void Dispatcher::runRecordFunction(at::RecordFunction& guard, const OperatorHandle& op, DispatchKey dispatchKey, torch::jit::Stack &&stack) { - guard.before(op, std::move(stack), sequenceNumberForRunningRecordFunction(dispatchKey)); +void Dispatcher::runRecordFunction(at::RecordFunction& guard, at::RecordFunction::schema_ref_t schema_ref, DispatchKey dispatchKey, torch::jit::Stack &&stack) { + guard.before(schema_ref, c10::ArrayRef(stack.data(), stack.size()), sequenceNumberForRunningRecordFunction(dispatchKey)); } -void Dispatcher::runRecordFunction(at::RecordFunction& guard, const OperatorHandle& op, DispatchKey dispatchKey) { +void Dispatcher::runRecordFunction(at::RecordFunction& guard, at::RecordFunction::schema_ref_t schema_ref, DispatchKey dispatchKey) { // 
Setting sequence number in the Autograd case to associate // the forward range with the coresponding Autograd's node - guard.before(op, sequenceNumberForRunningRecordFunction(dispatchKey)); + guard.before(schema_ref, sequenceNumberForRunningRecordFunction(dispatchKey)); } } diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h index 14ffa2f94c9c..c52e7822ec5c 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.h +++ b/aten/src/ATen/core/dispatch/Dispatcher.h @@ -152,7 +152,7 @@ class TORCH_API Dispatcher final { template - static Return callWithDispatchKeySlowPath(const TypedOperatorHandle& op, bool pre_sampled, DispatchKeySet dispatchKeySet, const KernelFunction& kernel, Args... args); + static Return callWithDispatchKeySlowPath(const TypedOperatorHandle& op, at::StepCallbacks& stepCallbacks, DispatchKeySet dispatchKeySet, const KernelFunction& kernel, Args... args); // Like call, but intended for use in a redispatch in kernels that have explicitly performed the DispatchKey update calculatulation. // This will take the DispatchKeySet completely as is and dispatch to the kernel of the corresponding highest priority key in the set. @@ -263,9 +263,9 @@ class TORCH_API Dispatcher final { Dispatcher(); static int64_t sequenceNumberForRunningRecordFunction(DispatchKey dispatchKey); - static void runRecordFunction(at::RecordFunction& guard, const OperatorHandle& op, DispatchKey dispatchKey); - static void runRecordFunction(at::RecordFunction& guard, const OperatorHandle& op, DispatchKey dispatchKey, torch::jit::Stack &&stack); - static void runRecordFunction(at::RecordFunction& guard, const OperatorHandle& op, DispatchKey dispatchKey, const torch::jit::Stack &stack); + static void runRecordFunction(at::RecordFunction& guard, at::RecordFunction::schema_ref_t schema_ref, DispatchKey dispatchKey); + static void runRecordFunction(at::RecordFunction& guard, at::RecordFunction::schema_ref_t schema_ref, DispatchKey dispatchKey, torch::jit::Stack &&stack); + static void runRecordFunction(at::RecordFunction& guard, at::RecordFunction::schema_ref_t schema_ref, DispatchKey dispatchKey, const torch::jit::Stack &stack); OperatorHandle findOrRegisterSchema_(FunctionSchema&& schema); OperatorHandle findOrRegisterName_(const OperatorName& op_name); @@ -291,7 +291,7 @@ class TORCH_API Dispatcher final { // Map from namespace to debug string (saying, e.g., where the library was defined) ska::flat_hash_map libraries_; - std::array(DispatchKey::NumDispatchKeys)> backendFallbackKernels_; + std::array backendFallbackKernels_; std::unique_ptr listeners_; std::mutex mutex_; @@ -494,33 +494,28 @@ struct CaptureKernelCall { // See [Note: Argument forwarding in the dispatcher] for why Args doesn't use && template -inline Return Dispatcher::callWithDispatchKeySlowPath(const TypedOperatorHandle& op, bool pre_sampled, DispatchKeySet dispatchKeySet, const KernelFunction& kernel, Args... 
args) { - // Check if we need to run callbacks registered with RecordFunction - // If true and callbacks need inputs, we box the arguments and pass - // them into the callbacks and also into the kernel call - - // Note: for perf reasons we wouldn't want to pass arguments into - // the function call or prematurely box them - at::RecordFunction guard(at::RecordScope::FUNCTION, pre_sampled); - if (C10_UNLIKELY(guard.isActive())) { - auto dispatchKey = dispatchKeySet.highestPriorityTypeId(); - if (op.operatorDef_->op.isObserved()) { - if (guard.needsInputs()) { - runRecordFunction(guard, op, dispatchKey, impl::boxArgs(args...)); - } else { - runRecordFunction(guard, op, dispatchKey); - } - if (C10_UNLIKELY(guard.needsOutputs())) { - // Calls the kernel and capture the output temporarily to pass to - // RecordFunction. - detail::CaptureKernelCall captureKernelCall( - kernel, op, dispatchKeySet, std::forward(args)...); - guard.setOutputs(captureKernelCall.getOutputs()); - // Releases the captured output to return to caller. - return std::move(captureKernelCall).release(); - } - } +inline Return Dispatcher::callWithDispatchKeySlowPath(const TypedOperatorHandle& op, at::StepCallbacks& stepCallbacks, DispatchKeySet dispatchKeySet, const KernelFunction& kernel, Args... args) { + // If callbacks need inputs, we box the arguments and pass them to the guard. + // Note: For perf reasons we wouldn't want to prematurely box the arguments. + at::RecordFunction guard(std::move(stepCallbacks)); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(op.operatorDef_->op.isObserved()); + auto dispatchKey = dispatchKeySet.highestPriorityTypeId(); + auto& schema = op.schema(); + auto schema_ref = std::reference_wrapper(schema); + guard.needsInputs() + ? runRecordFunction(guard, schema_ref, dispatchKey, impl::boxArgs(args...)) + : runRecordFunction(guard, schema_ref, dispatchKey); + + if (C10_UNLIKELY(guard.needsOutputs())) { + // Calls the kernel and capture the output temporarily to pass to + // RecordFunction. + detail::CaptureKernelCall captureKernelCall( + kernel, op, dispatchKeySet, std::forward(args)...); + guard.setOutputs(captureKernelCall.getOutputs()); + // Releases the captured output to return to caller. 
+ return std::move(captureKernelCall).release(); } + // keeping the guard alive while executing the kernel return kernel.template call(op, dispatchKeySet, std::forward(args)...); } @@ -531,18 +526,11 @@ C10_DISPATCHER_INLINE_UNLESS_MOBILE Return Dispatcher::call(const TypedOperatorH detail::unused_arg_(args...); // workaround for a false-positive warning about unused parameters in gcc 5 auto dispatchKeySet = op.operatorDef_->op.dispatchKeyExtractor() .template getDispatchKeySetUnboxed(args...); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!c10::isAliasDispatchKey(dispatchKeySet.highestPriorityTypeId())); - const KernelFunction& kernel = op.operatorDef_->op.lookup(dispatchKeySet.highestPriorityTypeId()); + const KernelFunction& kernel = op.operatorDef_->op.lookup(dispatchKeySet); #ifndef PYTORCH_DISABLE_PER_OP_PROFILING - // By default, when there're no high-frequency or non-sampled callbacks, - // RecordFunction is pre-sampled as a perf optimization; - // shouldRunRecordFunction checks whether RecordFunction should be executed, - // and sets pre_sampled boolean argument value to whether pre-sampling was used - - // this boolean is passed into RecordFunction to adjust the sampling rates of - // the callbacks - bool pre_sampled = false; - if (C10_UNLIKELY(at::shouldRunRecordFunction(&pre_sampled))) { - return callWithDispatchKeySlowPath(op, pre_sampled, dispatchKeySet, kernel, std::forward(args)...); + auto step_callbacks = at::getStepCallbacks(at::RecordScope::FUNCTION); + if (C10_UNLIKELY(!step_callbacks.empty() && op.operatorDef_->op.isObserved())) { + return callWithDispatchKeySlowPath(op, step_callbacks, dispatchKeySet, kernel, std::forward(args)...); } #endif // PYTORCH_DISABLE_PER_OP_PROFILING return kernel.template call(op, dispatchKeySet, std::forward(args)...); @@ -553,7 +541,7 @@ template inline Return Dispatcher::redispatch(const TypedOperatorHandle& op, DispatchKeySet currentDispatchKeySet, Args... args) const { detail::unused_arg_(args...); // workaround for a false-positive warning about unused parameters in gcc 5 // do not use RecordFunction on redispatch - const KernelFunction& kernel = op.operatorDef_->op.lookup(currentDispatchKeySet.highestPriorityTypeId()); + const KernelFunction& kernel = op.operatorDef_->op.lookup(currentDispatchKeySet); return kernel.template call(op, currentDispatchKeySet, std::forward(args)...); } @@ -561,27 +549,21 @@ inline void Dispatcher::callBoxed(const OperatorHandle& op, Stack* stack) const // note: this doesn't need the mutex because write operations on the list keep iterators intact. 
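The new gate in Dispatcher::call replaces pre-sampling: fetch the step callbacks once, and only enter the profiled path when the list is non-empty and the operator is observed, so an idle profiler costs one fetch plus one unlikely branch. A sketch of that gate shape with placeholder collectRegisteredCallbacks/operatorIsObserved stand-ins (not the ATen symbols):

#include <functional>
#include <vector>

// Stand-ins for at::StepCallbacks and the observed-op check; illustration only.
struct StepCallbacks {
  std::vector<std::function<void(const char*)>> starts;
  bool empty() const { return starts.empty(); }
};
StepCallbacks collectRegisteredCallbacks() { return {}; }  // placeholder
bool operatorIsObserved() { return true; }                 // placeholder

int kernel(int x) { return x * 2; }

int callLikeDispatch(int x) {
  auto step_callbacks = collectRegisteredCallbacks();
  // With nothing registered, no guard object is constructed at all.
  if (!step_callbacks.empty() && operatorIsObserved()) {
    // The real code moves the callbacks into a RecordFunction guard that stays
    // alive across the kernel call; here we just invoke them before the kernel.
    for (auto& cb : step_callbacks.starts) cb("my::op");
    return kernel(x);
  }
  return kernel(x);
}

int main() { return callLikeDispatch(21) == 42 ? 0 : 1; }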
const auto& entry = op.operatorDef_->op; auto dispatchKeySet = entry.dispatchKeyExtractor().getDispatchKeySetBoxed(stack); - const auto& kernel = entry.lookup(dispatchKeySet.highestPriorityTypeId()); + const auto& kernel = entry.lookup(dispatchKeySet); #ifndef PYTORCH_DISABLE_PER_OP_PROFILING - bool pre_sampled = false; - if (C10_UNLIKELY(at::shouldRunRecordFunction(&pre_sampled))) { - // using already existing stack to record function execution in observers - at::RecordFunction guard(at::RecordScope::FUNCTION, pre_sampled); - if (C10_UNLIKELY(guard.isActive())) { - auto dispatchKey = dispatchKeySet.highestPriorityTypeId(); - if (entry.isObserved()) { - if (guard.needsInputs()) { - runRecordFunction(guard, op, dispatchKey, *stack); - } else { - runRecordFunction(guard, op, dispatchKey); - } - } - } + auto step_callbacks = at::getStepCallbacks(at::RecordScope::FUNCTION); + if (C10_UNLIKELY(!step_callbacks.empty() && entry.isObserved())) { + at::RecordFunction guard(std::move(step_callbacks)); + auto dispatchKey = dispatchKeySet.highestPriorityTypeId(); + auto& schema = op.schema(); + auto schema_ref = std::reference_wrapper(schema); + guard.needsInputs() ? runRecordFunction(guard, schema_ref, dispatchKey, *stack) + : runRecordFunction(guard, schema_ref, dispatchKey); + // keeping the guard alive while executing the kernel kernel.callBoxed(op, dispatchKeySet, stack); - // track outputs - if (C10_UNLIKELY( - guard.isActive() && entry.isObserved() && guard.needsOutputs())) { + + if (C10_UNLIKELY(guard.needsOutputs())) { guard.setOutputs(*stack); } return; @@ -593,7 +575,7 @@ inline void Dispatcher::callBoxed(const OperatorHandle& op, Stack* stack) const inline void Dispatcher::redispatchBoxed(const OperatorHandle& op, DispatchKeySet dispatchKeySet, Stack* stack) const { // note: this doesn't need the mutex because write operations on the list keep iterators intact. const auto& entry = op.operatorDef_->op; - const auto& kernel = entry.lookup(dispatchKeySet.highestPriorityTypeId()); + const auto& kernel = entry.lookup(dispatchKeySet); return kernel.callBoxed(op, dispatchKeySet, stack); } diff --git a/aten/src/ATen/core/dispatch/ObservedOperators.cpp b/aten/src/ATen/core/dispatch/ObservedOperators.cpp index 1d1ed4c1926a..65545a221f9c 100644 --- a/aten/src/ATen/core/dispatch/ObservedOperators.cpp +++ b/aten/src/ATen/core/dispatch/ObservedOperators.cpp @@ -15,6 +15,7 @@ std::unordered_set& ObservedOperators::getUnobservedOperatorList() "aten::_version", "aten::is_complex", "profiler::_record_function_enter", + "profiler::_record_function_enter_new", "profiler::_record_function_exit", }; return not_observed_ops; diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp index d4d997fde69a..d5cc6d45933f 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp +++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp @@ -283,7 +283,10 @@ std::pair OperatorEntry::computeDispatchTab } // 3. Backend fallback - auto dispatch_ix = static_cast(dispatch_key); + auto dispatch_ix = getDispatchTableIndexForDispatchKey(dispatch_key); + if (dispatch_ix < 0) { + return {missingKernel(), "backend fallback not registered on mobile"}; + } if (dispatcher.backendFallbackKernels_[dispatch_ix].kernel.isValid()) { return {dispatcher.backendFallbackKernels_[dispatch_ix], "backend fallback"}; } @@ -299,7 +302,7 @@ std::pair OperatorEntry::computeDispatchTab // or alias keys and their associated keysets). 
// This function should be considered a private helper for updateDispatchTable_() void OperatorEntry::updateDispatchTableEntry_(const c10::Dispatcher& dispatcher, DispatchKey dispatch_key) { - const auto dispatch_ix = c10::getDispatchTableIndexForDispatchKey(dispatch_key); + const auto dispatch_ix = getDispatchTableIndexForDispatchKey(dispatch_key); if (C10_UNLIKELY(dispatch_ix == -1)) { return; } @@ -329,8 +332,12 @@ void OperatorEntry::updateDispatchTable_(const c10::Dispatcher& dispatcher, Disp } // Note [Refresh Runtime Autograd entries in dispatchTable_] // Registering to backend key might affect computed entry at its Autograd backend key due to (2.1) & (2.3). + // In theory, we should only have to check if the given runtime key has "dense" functionality, + // e.g. DispatchKey::CPU (which is composed of DispatchKey::Dense and BackendComponent::CPUBit). + // However, there are some backends that should be included in this set that don't have the dense key set. + // E.g. DispatchKey::Meta, DispatchKey::ORT. if (c10::isBackendDispatchKey(dispatch_key)) { - DispatchKey autograd_key = getAutogradKeyFromBackend(dispatch_key); + DispatchKey autograd_key = getAutogradKeyFromBackend(toBackendComponent(dispatch_key)); updateDispatchTableEntry_(dispatcher, autograd_key); } } @@ -357,8 +364,9 @@ void OperatorEntry::updateDispatchTableFull_(const c10::Dispatcher& dispatcher) // catchAll. After catchAllKernel_ is removed, Undefined now can get a kernel from either CompositeExplicitAutograd // or CompositeImplicitAutograd alias key so that we don't break the support. Ideally isIncludedInAlias(Undefined, CompositeImplicitAutograd) // should return true, it returns false because Undefined cannot be represented in a DispatchKeySet. - for (uint8_t iter = 0; iter != static_cast(DispatchKey::NumDispatchKeys); ++iter) { - updateDispatchTable_(dispatcher, static_cast(iter)); + updateDispatchTable_(dispatcher, DispatchKey::Undefined); + for (auto k : DispatchKeySet(DispatchKeySet::FULL)) { + updateDispatchTable_(dispatcher, k); } } @@ -371,9 +379,13 @@ void OperatorEntry::checkInvariants() const { for (const auto& kv : kernels_) { TORCH_INTERNAL_ASSERT(kv.second.size() > 0, dumpState()); } - for (uint8_t iter = 0; iter != static_cast(DispatchKey::NumDispatchKeys); ++iter) { - auto expected_k = computeDispatchTableEntry(c10::Dispatcher::singleton(), static_cast(iter)); - TORCH_INTERNAL_ASSERT(expected_k._equalsBoxedAndUnboxed(dispatchTable_[iter]), + for (auto k : DispatchKeySet(DispatchKeySet::FULL)) { + auto expected_k = computeDispatchTableEntry(c10::Dispatcher::singleton(), k); + auto idx = getDispatchTableIndexForDispatchKey(k); + if (C10_UNLIKELY(idx == -1)) { + continue; + } + TORCH_INTERNAL_ASSERT(expected_k._equalsBoxedAndUnboxed(dispatchTable_[idx]), "Canonical state\n~~~~~~~~~~~\n", dumpState(), "\n\n" "Computed table:\n~~~~~~~~~~~\n", dumpComputedTable()); } @@ -384,8 +396,9 @@ std::string OperatorEntry::listAllDispatchKeys() const { str << "["; bool has_kernels = false; - for (uint8_t iter = 0; iter != static_cast(DispatchKey::NumDispatchKeys); ++iter) { - if (!dispatchTable_[iter].isValid()) { + for (auto k : DispatchKeySet(DispatchKeySet::FULL)) { + auto iter = getDispatchTableIndexForDispatchKey(k); + if (iter == -1 || !dispatchTable_[iter].isValid()) { continue; } if (has_kernels) { @@ -443,8 +456,12 @@ void OperatorEntry::reportError(DispatchKey dispatchKey) const { // updateDispatchTableFull_ would update the dispatch table to be) std::string OperatorEntry::dumpComputedTable() const { 
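The loops above all move from indexing 0..NumDispatchKeys to iterating the runtime keys in DispatchKeySet(DispatchKeySet::FULL), treating Undefined separately (it cannot be represented in a keyset) and skipping keys whose table index comes back negative. A sketch of that iteration idiom; it assumes getDispatchTableIndexForDispatchKey is the c10 free function declared alongside DispatchKeySet, as the unqualified calls in these hunks suggest:

#include <c10/core/DispatchKeySet.h>

// Visit every runtime dispatch key the way the updated OperatorEntry code does.
// Undefined is handled by hand because a DispatchKeySet cannot hold it.
template <typename Visitor>
void forEachRuntimeKey(Visitor&& visit) {
  visit(c10::DispatchKey::Undefined);
  for (c10::DispatchKey k : c10::DispatchKeySet(c10::DispatchKeySet::FULL)) {
    // On mobile builds some keys have no dispatch-table slot; -1 marks those.
    if (c10::getDispatchTableIndexForDispatchKey(k) == -1) {
      continue;
    }
    visit(k);
  }
}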
std::ostringstream oss; - for (uint8_t i = 0; i < static_cast(DispatchKey::NumDispatchKeys); i++) { - auto k = static_cast(i); + // Need to handle Undefined separately, because its a runtime key that can't be represented + // in a DispatchKeySet. + std::vector runtime_keys = {DispatchKey::Undefined}; + for (auto k : DispatchKeySet(DispatchKeySet::FULL)) runtime_keys.push_back(k); + + for (auto k : runtime_keys) { auto kernel_prov = computeDispatchTableEntryWithDebug(c10::Dispatcher::singleton(), k); if (kernel_prov.first.kernel.isValid()) { oss << toString(k) << ": " diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.h b/aten/src/ATen/core/dispatch/OperatorEntry.h index d98bd6bc6904..c0f90808280a 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.h +++ b/aten/src/ATen/core/dispatch/OperatorEntry.h @@ -173,10 +173,10 @@ class TORCH_API OperatorEntry final { [[noreturn]] void reportError(DispatchKey dispatchKey) const; - const KernelFunction& lookup(DispatchKey k) const { - const auto idx = getDispatchTableIndexForDispatchKey(k); + const KernelFunction& lookup(DispatchKeySet ks) const { + const auto idx = ks.getDispatchTableIndexForDispatchKeySet(); if (C10_UNLIKELY(idx == -1)) { - reportError(k); + reportError(ks.highestPriorityTypeId()); } const auto& kernel = dispatchTable_[idx]; // A valid kernel *always* has a boxed kernel and *may* have an @@ -187,7 +187,7 @@ class TORCH_API OperatorEntry final { // in the common case. if (C10_UNLIKELY(!kernel.isValidUnboxed())) { if (!kernel.isValid()) { - reportError(k); + reportError(ks.highestPriorityTypeId()); } } return kernel; @@ -211,7 +211,7 @@ class TORCH_API OperatorEntry final { OperatorName name_; c10::optional schema_; - std::array dispatchTable_; + std::array dispatchTable_; DispatchKeyExtractor dispatchKeyExtractor_; // kernels_ stores all registered kernels for the corresponding dispatch key diff --git a/aten/src/ATen/core/dynamic_type.cpp b/aten/src/ATen/core/dynamic_type.cpp index 95050da593eb..5920d7c05f1f 100644 --- a/aten/src/ATen/core/dynamic_type.cpp +++ b/aten/src/ATen/core/dynamic_type.cpp @@ -123,6 +123,7 @@ DynamicType::DynamicType(const Type& other) : SharedType(DynamicType::Kind) { tag_ = Tag::T; \ break; FORALL_DYNAMIC_TYPES(CASE_TYPE) + FORALL_DYNAMIC_TYPES_FAKE(CASE_TYPE) #undef CASE_TYPE default: TORCH_INTERNAL_ASSERT(false, "Unsupported dynamic type: ", other.str()); @@ -210,6 +211,9 @@ TypeKind DynamicType::dynamicKind() const { case Tag::T: \ return TypeKind::T##Type; FORALL_DYNAMIC_TYPES(CASE_TYPE) + // FORALL_DYNAMIC_TYPES_FAKE is intentionally omitted here + // as these dynamic types map to the same tag, so they always + // resolve to integers #undef CASE_TYPE default: TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false); @@ -227,6 +231,8 @@ TypePtr DynamicType::fallback() const { return BoolType::get(); case Tag::Int: return IntType::get(); + case Tag::SymInt: + return SymIntType::get(); case Tag::Float: return FloatType::get(); case Tag::Complex: @@ -320,6 +326,8 @@ DynamicType::Ptr IValue::TagType::get(const c10::IValue& v) { return DynamicTypeTrait::getBaseType(); case Tag::Int: return DynamicTypeTrait::getBaseType(); + case Tag::SymInt: + return DynamicTypeTrait::getBaseType(); case Tag::Bool: return DynamicTypeTrait::getBaseType(); case Tag::String: @@ -368,7 +376,7 @@ ivalue::TupleTypeFactory::fallback(const Type& type) { for (const auto& elem : dyn.arguments().elems) { types.emplace_back(elem.ty); if (const auto& name = elem.label) { - fields.emplace_back(*elem.label); + fields.emplace_back(*name); } } if 
(const auto& name = dyn.name()) { @@ -381,6 +389,7 @@ ivalue::TupleTypeFactory::fallback(const Type& type) { #define DYNAMIC_TYPE_TAG_VALUE(NAME, _, __) \ constexpr bool DynamicTypeTrait::isBaseType; FORALL_DYNAMIC_TYPES(DYNAMIC_TYPE_TAG_VALUE) +FORALL_DYNAMIC_TYPES_FAKE(DYNAMIC_TYPE_TAG_VALUE) #undef DYNAMIC_TYPE_TAG_VALUE } // namespace c10 diff --git a/aten/src/ATen/core/dynamic_type.h b/aten/src/ATen/core/dynamic_type.h index d5551c9a5e51..a84644ddde04 100644 --- a/aten/src/ATen/core/dynamic_type.h +++ b/aten/src/ATen/core/dynamic_type.h @@ -16,6 +16,7 @@ constexpr DynamicTypeBits kDynamicAnyTypeBit = DYNAMIC_TYPE_BIT(30); constexpr DynamicTypeBits kDynamicNoneTypeBit = DYNAMIC_TYPE_BIT(1); constexpr DynamicTypeBits kDynamicIntTypeBit = DYNAMIC_TYPE_BIT(3); +constexpr DynamicTypeBits kDynamicSymIntTypeBit = DYNAMIC_TYPE_BIT(23); constexpr DynamicTypeBits kDynamicFloatTypeBit = DYNAMIC_TYPE_BIT(4); constexpr DynamicTypeBits kDynamicComplexTypeBit = DYNAMIC_TYPE_BIT(5); constexpr DynamicTypeBits kDynamicListTypeBit = DYNAMIC_TYPE_BIT(7); @@ -28,6 +29,7 @@ constexpr DynamicTypeBits kDynamicClassTypeBit = DYNAMIC_TYPE_BIT(10); _(Bool, DYNAMIC_TYPE_BIT(2), 1) \ _(Int, kDynamicIntTypeBit, 1) \ _(Float, kDynamicFloatTypeBit, 1) \ + _(SymInt, kDynamicSymIntTypeBit, 1) \ _(Complex, kDynamicComplexTypeBit, 1) \ _(Number, \ (kDynamicIntTypeBit | kDynamicFloatTypeBit | kDynamicComplexTypeBit), \ @@ -58,8 +60,14 @@ constexpr DynamicTypeBits kDynamicClassTypeBit = DYNAMIC_TYPE_BIT(10); _(Future, DYNAMIC_TYPE_BIT(22), 0) \ _(Any, 0xffffffff, 1) +#define FORALL_DYNAMIC_TYPES_FAKE(_) \ + _(ScalarType, kDynamicIntTypeBit, 1) \ + _(Layout, kDynamicIntTypeBit, 1) \ + _(MemoryFormat, kDynamicIntTypeBit, 1) + #define FORWARD_DECL_TYPE(NAME, _, __) struct NAME ## Type; FORALL_DYNAMIC_TYPES(FORWARD_DECL_TYPE) + FORALL_DYNAMIC_TYPES_FAKE(FORWARD_DECL_TYPE) #undef FORWARD_DECL_TYPE class DynamicType; @@ -136,6 +144,7 @@ class DynamicType : public SharedType { enum class Tag : DynamicTypeBits { #define DYNAMIC_TYPE_ITEM(NAME, VAL, _) NAME = VAL, FORALL_DYNAMIC_TYPES(DYNAMIC_TYPE_ITEM) + FORALL_DYNAMIC_TYPES_FAKE(DYNAMIC_TYPE_ITEM) #undef DYNAMIC_TYPE_ITEM }; @@ -159,7 +168,7 @@ class DynamicType : public SharedType { const Arguments& arguments() const { return arguments_; } - TypeKind dynamicKind() const; + TORCH_API TypeKind dynamicKind() const; // Should be used only on the server side to restore static type information. 
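The reason FORALL_DYNAMIC_TYPES_FAKE is excluded from the dynamicKind() switch is that all three of its entries expand to kDynamicIntTypeBit, so the generated enumerators are just additional names for the Int tag value and duplicate case labels would not compile. A reduced, self-contained sketch of the same X-macro effect:

#include <cstdint>
#include <iostream>

constexpr uint32_t kIntBit = 1u << 3;  // plays the role of kDynamicIntTypeBit

#define FORALL_REAL(_)  _(Int, kIntBit)
#define FORALL_FAKE(_)  _(ScalarType, kIntBit) _(Layout, kIntBit) _(MemoryFormat, kIntBit)

enum class Tag : uint32_t {
#define ITEM(NAME, VAL) NAME = VAL,
  FORALL_REAL(ITEM)
  FORALL_FAKE(ITEM)  // legal: several enumerators may share one value
#undef ITEM
};

int main() {
  // Every "fake" tag compares equal to Tag::Int, which is exactly why a switch
  // over Tag can only name one of them per case and why they resolve to ints.
  std::cout << (Tag::ScalarType == Tag::Int) << (Tag::Layout == Tag::Int)
            << (Tag::MemoryFormat == Tag::Int) << "\n";  // prints 111
}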
#ifndef C10_MOBILE @@ -223,6 +232,7 @@ C10_NOINLINE DynamicTypePtr makeBaseType(DynamicType::Tag tag); } \ }; // namespace c10 FORALL_DYNAMIC_TYPES(DYNAMIC_TYPE_TAG_VALUE) +FORALL_DYNAMIC_TYPES_FAKE(DYNAMIC_TYPE_TAG_VALUE) #undef DYNAMIC_TYPE_TAG_VALUE } // namespace c10 diff --git a/aten/src/ATen/core/enum_type.h b/aten/src/ATen/core/enum_type.h index 50e4f3b88ba2..720d5363799f 100644 --- a/aten/src/ATen/core/enum_type.h +++ b/aten/src/ATen/core/enum_type.h @@ -87,6 +87,7 @@ struct TORCH_API EnumType : public NamedType { std::string annotation_str_impl( TypePrinter printer = nullptr) const override { + (void)printer; // Suppress unused variable warning const auto& n = name().value(); return n.qualifiedName(); } diff --git a/aten/src/ATen/core/function.h b/aten/src/ATen/core/function.h index aa18e9a073df..76e417b8c5cf 100644 --- a/aten/src/ATen/core/function.h +++ b/aten/src/ATen/core/function.h @@ -29,7 +29,7 @@ using Kwargs = std::unordered_map; struct RecursiveMethodCallError : public std::exception {}; using TaskLauncher = std::function)>; -TORCH_API void preoptimizeGraph(std::shared_ptr& graph); +TORCH_API void preoptimizeGraph(std::shared_ptr& graph, bool disable_autocast=false); // A Function is a pure Graph with no implicit `self` object bound. // It contains schema information and the executor that manages the @@ -48,8 +48,9 @@ struct TORCH_API Function { virtual void run(Stack& stack) = 0; virtual c10::intrusive_ptr runAsync( - Stack& stack, + Stack& /*stack*/, TaskLauncher taskLauncher = at::launch) { + (void)taskLauncher; // Suppress unused variable warning TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false); return {}; } @@ -89,7 +90,7 @@ struct TORCH_API Function { // call() returns false. // Overload for server interpreter, a bailout size is needed for graph executor. - virtual bool call(Stack&, size_t, c10::function_ref) { + virtual bool call(Stack&, c10::optional, c10::function_ref) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false); return false; } diff --git a/aten/src/ATen/core/function_schema.h b/aten/src/ATen/core/function_schema.h index 328ab79e2e44..2b3d51ee5e2e 100644 --- a/aten/src/ATen/core/function_schema.h +++ b/aten/src/ATen/core/function_schema.h @@ -30,8 +30,19 @@ struct Argument { c10::optional default_value = c10::nullopt, bool kwarg_only = false, c10::optional alias_info = c10::nullopt) + : Argument(name, type, type, N, default_value, kwarg_only, alias_info) {} + + Argument( + std::string name, + TypePtr fake_type, + TypePtr real_type, + c10::optional N = c10::nullopt, + c10::optional default_value = c10::nullopt, + bool kwarg_only = false, + c10::optional alias_info = c10::nullopt) : name_(std::move(name)), - type_(type ? std::move(type) : TensorType::get()), + type_(fake_type ? std::move(fake_type) : TensorType::get()), + real_type_(real_type ? std::move(real_type) : TensorType::get()), N_(std::move(N)), default_value_(std::move(default_value)), alias_info_(alias_info ? std::make_unique(std::move(*alias_info)) : nullptr), @@ -46,6 +57,7 @@ struct Argument { Argument(const Argument& rhs) : name_(rhs.name_), type_(rhs.type_), + real_type_(rhs.real_type_), N_(rhs.N_), default_value_(rhs.default_value_), alias_info_(rhs.alias_info_ ? std::make_unique(*rhs.alias_info_) : nullptr), @@ -58,6 +70,7 @@ struct Argument { if (this != &rhs) { name_ = rhs.name_; type_ = rhs.type_; + real_type_ = rhs.real_type_; N_ = rhs.N_; default_value_ = rhs.default_value_; alias_info_ = rhs.alias_info_ ? 
std::make_unique(*rhs.alias_info_) : nullptr; @@ -73,6 +86,9 @@ struct Argument { const TypePtr& type() const { return type_; } + const TypePtr& real_type() const { + return real_type_; + } c10::optional N() const { return N_; } @@ -153,6 +169,7 @@ struct Argument { private: std::string name_; TypePtr type_; + TypePtr real_type_; // this is ScalarType, not int, e.g. // for list types, an optional statically known length for the list // e.g. for int[3]: type = ListType::ofInts(), N = 3 // If present, this will allow scalars to be broadcast to this length to diff --git a/aten/src/ATen/core/function_schema_inl.h b/aten/src/ATen/core/function_schema_inl.h index 5d58ee88a418..dc4fdaf10133 100644 --- a/aten/src/ATen/core/function_schema_inl.h +++ b/aten/src/ATen/core/function_schema_inl.h @@ -162,7 +162,7 @@ inline bool FunctionSchema::isBackwardCompatibleWith( } } - // we want to test both out and default args seperately + // we want to test both out and default args separately size_t old_out_start_idx = findFirstOutArg(old.arguments()); size_t new_out_start_idx = findFirstOutArg(arguments()); @@ -212,7 +212,7 @@ inline bool FunctionSchema::isForwardCompatibleWith( return false; } - // we want to test both out and default args seperately + // we want to test both out and default args separately size_t old_out_start_idx = findFirstOutArg(old.arguments()); size_t new_out_start_idx = findFirstOutArg(arguments()); diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h index 36fb0f91e4c8..10be63c2c1d9 100644 --- a/aten/src/ATen/core/interned_strings.h +++ b/aten/src/ATen/core/interned_strings.h @@ -43,8 +43,15 @@ namespace c10 { _(prim, FusionGroup) \ _(prim, CudaFusionGroup) \ _(prim, CudaFusionGuard) \ + _(prim, oneDNNFusionGroup) \ + _(prim, oneDNNFusionGuard) \ _(prim, FunctionalGraph) \ _(prim, add_optional) \ + _(prim, view_copy) \ + _(prim, reshape_copy) \ + _(prim, squeeze_copy) \ + _(prim, unsqueeze_copy) \ + _(prim, flatten_copy) \ _(prim, DifferentiableGraph) \ _(prim, TensorExprGroup) \ _(prim, TensorExprDynamicGroup) \ @@ -60,6 +67,8 @@ namespace c10 { _(prim, PadPacked) /* onnx */ \ _(prim, Placeholder) /* debug */ \ _(prim, Print) \ + _(prim, EmptyListLiteral) \ + _(prim, LegacyTypedConstructor) \ _(prim, PythonOp) \ _(prim, IgnoredPythonOp) \ _(prim, Reverse) \ @@ -92,6 +101,7 @@ namespace c10 { _(prim, With) \ _(prim, Enter) \ _(prim, Exit) \ + _(prim, IfThenElse) \ _(aten, Bool) \ _(aten, Int) \ _(aten, FloatImplicit) \ @@ -102,7 +112,6 @@ namespace c10 { _(aten, Complex) \ _(aten, str) \ _(aten, Delete) \ - _(aten, gelu_) \ _(prim, device) \ _(prim, dtype) \ _(prim, layout) \ @@ -220,6 +229,7 @@ namespace c10 { _(onnx, Gemm) \ _(onnx, LSTM) \ _(onnx, MatMul) \ + _(onnx, Min) \ _(onnx, Mul) \ _(onnx, Pow) \ _(onnx, RNN) \ @@ -241,7 +251,7 @@ namespace c10 { _(onnx, Less) \ _(onnx, LessOrEqual) \ _(onnx, Not) \ - _(onnx, ATen) \ + _(aten, ATen) \ _(onnx, Split) \ _(onnx, ConstantOfShape) \ _(onnx, Cast) \ @@ -270,6 +280,9 @@ namespace c10 { _(onnx, Range) \ _(onnx, Tile) \ _(onnx, Where) \ + _(onnx, Optional) \ + _(onnx, OptionalGetElement) \ + _(onnx, OptionalHasElement) \ FORALL_ATTR_BASE_SYMBOLS(_) \ _(attr, Subgraph) \ _(attr, ReverseSubgraph) \ @@ -297,6 +310,7 @@ namespace c10 { _(attr, transA) \ _(attr, transB) \ _(attr, name) \ + _(attr, module) \ _(attr, beg) \ _(attr, idx) \ _(attr, split) \ @@ -308,8 +322,10 @@ namespace c10 { _(attr, cache_id) \ _(attr, new_axis) \ _(attr, warn_id) \ + _(attr, output_layouts) \ _(attr, allowzero) \ - 
_(attr, seen_none) + _(attr, seen_none) \ + _(attr, overload_name) enum class _keys : unique_t { #define DEFINE_KEY(ns, s) ns##_##s, diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index 85117e345e30..eb977f09cbe6 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -91,6 +91,8 @@ c10::TypePtr IValue::TagType::get(const IValue& v) { return ComplexType::get(); case Tag::Int: return IntType::get(); + case Tag::SymInt: + return c10::SymIntType::get(); case Tag::Bool: return BoolType::get(); case Tag::String: @@ -271,8 +273,8 @@ bool operator==(const IValue& lhs, const IValue& rhs) { } bool IValue::ptrEqual(const IValue& lhs, const IValue& rhs) { - TORCH_INTERNAL_ASSERT(lhs.is_intrusive_ptr); - TORCH_INTERNAL_ASSERT(rhs.is_intrusive_ptr); + TORCH_INTERNAL_ASSERT(lhs.isIntrusivePtr()); + TORCH_INTERNAL_ASSERT(rhs.isIntrusivePtr()); return lhs.tag == rhs.tag && lhs.payload.u.as_intrusive_ptr == rhs.payload.u.as_intrusive_ptr; } @@ -298,6 +300,8 @@ IValue IValue::equals(const IValue& rhs) const { return rhs.isComplexDouble() && lhs.toComplexDouble() == rhs.toComplexDouble(); case Tag::Int: return rhs.isInt() && lhs.toInt() == rhs.toInt(); + case Tag::SymInt: + return rhs.isSymInt() && lhs.toSymInt() == rhs.toSymInt(); case Tag::Bool: return rhs.isBool() && lhs.toBool() == rhs.toBool(); case Tag::String: @@ -349,6 +353,8 @@ size_t IValue::hash(const IValue& v) { return c10::get_hash(v.payload.u.as_int); case Tag::Int: return c10::get_hash(v.payload.u.as_int); + case Tag::SymInt: + return c10::get_hash(v.payload.u.as_int); case Tag::String: return c10::get_hash(v.toStringRef()); case Tag::Tuple: @@ -398,8 +404,8 @@ bool IValue::is(const IValue& rhs) const { return rhs.isTensor() && lhs.toTensor().is_same(rhs.toTensor()); } - if (lhs.is_intrusive_ptr) { - return rhs.is_intrusive_ptr && ptrEqual(lhs, rhs); + if (lhs.isIntrusivePtr()) { + return rhs.isIntrusivePtr() && ptrEqual(lhs, rhs); } return lhs == rhs; } @@ -429,6 +435,15 @@ bool IValue::isTensorList() const { return isListOf(); } +bool IValue::isOptionalTensorList() const { + if (!isList()) { + return false; + } + const auto& ty = static_cast(payload.u.as_intrusive_ptr)->elementType; + const auto expected_ty = c10::getTypePtr>(); + return expected_ty == ty; +} + bool IValue::isIntList() const { return isListOf(); } @@ -567,6 +582,8 @@ std::ostream& IValue::repr( } case IValue::Tag::Int: return out << v.toInt(); + case IValue::Tag::SymInt: + return out << v.toSymInt(); case IValue::Tag::Bool: return out << (v.toBool() ? "True" : "False"); case IValue::Tag::Tuple: { @@ -753,6 +770,8 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) { return printComplex(out, v); } case IValue::Tag::Int: return out << v.toInt(); + case IValue::Tag::SymInt: + return out << v.toSymInt(); case IValue::Tag::Bool: return out << (v.toBool() ? 
"True" : "False"); case IValue::Tag::Tuple: { @@ -886,6 +905,7 @@ IValue IValue::deepcopy( case IValue::Tag::None: case IValue::Tag::Double: case IValue::Tag::Int: + case IValue::Tag::SymInt: case IValue::Tag::Bool: case IValue::Tag::Device: case IValue::Tag::Uninitialized: { @@ -1159,5 +1179,4 @@ TORCH_API intrusive_ptr collectAny( } return ctx->dstFuture; } - } // namespace c10 diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index 81867348450d..e9a0caecc5d6 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -92,12 +92,29 @@ struct OptionalArray { return *this; } + // Used when saving an argument for the backwards pass. + OptionalArray& operator=(c10::OptionalArrayRef ref) { + if (ref) { + list = std::vector(ref->begin(), ref->end()); + } else { + list = nullopt; + } + return *this; + } + operator c10::optional>() { if (!list) { return nullopt; } return *list; } + + operator c10::OptionalArrayRef() { + if (!list) { + return nullopt; + } + return *list; + } }; // Capsule is an internal implementation detail of custom C++ classes. We @@ -127,6 +144,7 @@ struct Capsule { _(Double) \ _(ComplexDouble) \ _(Int) \ + _(SymInt) \ _(Bool) \ _(Tuple) \ _(String) \ @@ -183,13 +201,13 @@ struct Capsule { /// \endrst struct TORCH_API IValue final { IValue(const IValue& rhs) - : IValue(rhs.payload, rhs.tag, rhs.is_intrusive_ptr) { - if (is_intrusive_ptr && payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton()) { + : IValue(rhs.payload, rhs.tag) { + if (isIntrusivePtr() && payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton()) { c10::raw::intrusive_ptr::incref(payload.u.as_intrusive_ptr); } } - IValue(IValue&& rhs) noexcept : tag(rhs.tag), is_intrusive_ptr(rhs.is_intrusive_ptr) { + IValue(IValue&& rhs) noexcept : tag(rhs.tag) { moveFrom(std::move(rhs)); } @@ -330,12 +348,12 @@ struct TORCH_API IValue final { return isAliasOf(this->toTensor(), rhs.toTensor()); } - if (!this->is_intrusive_ptr) { + if (!isIntrusivePtr()) { // Primitive types don't alias anything return false; } - AT_ASSERT(rhs.is_intrusive_ptr); + AT_ASSERT(rhs.isIntrusivePtr()); // Other types can be compared by their ptr value return this->payload.u.as_intrusive_ptr == rhs.payload.u.as_intrusive_ptr; @@ -347,7 +365,7 @@ struct TORCH_API IValue final { return payload.as_tensor.use_count(); } - if (!is_intrusive_ptr) { + if (!isIntrusivePtrLegacyBehavior()) { return 1; } @@ -380,7 +398,6 @@ struct TORCH_API IValue final { } else { std::swap(payload.u, rhs.payload.u); } - std::swap(is_intrusive_ptr, rhs.is_intrusive_ptr); std::swap(tag, rhs.tag); } @@ -388,7 +405,7 @@ struct TORCH_API IValue final { // While some of these accessors could be generated through templates, // we prefer to write them manually for clarity - IValue(at::TensorBase t) : tag(Tag::Tensor), is_intrusive_ptr(false) { + IValue(at::TensorBase t) : tag(Tag::Tensor) { new (&payload.as_tensor) at::Tensor(std::move(t)); } bool isTensor() const { @@ -407,12 +424,7 @@ struct TORCH_API IValue final { return payload.as_tensor.unsafeGetTensorImpl(); } - IValue(at::Storage s) : tag(Tag::Storage), is_intrusive_ptr(static_cast(s)) { - // Note: the undefined tensor is not refcounted, so while it - // is tagged as a tensor, is_intrusive_ptr is set to false. - // This is not an optional optimization: our incref call - // *will not* do the right thing when called on an - // undefined tensor. 
+ IValue(at::Storage s) : tag(Tag::Storage) { payload.u.as_intrusive_ptr = null_to_undefined_tensor(s.unsafeReleaseStorageImpl()); } bool isStorage() const { @@ -430,7 +442,7 @@ struct TORCH_API IValue final { /// @private [doxygen private] IValue(intrusive_ptr blob) - : tag(Tag::Blob), is_intrusive_ptr(true) { + : tag(Tag::Blob) { // TODO (after Tensor merge) If we pass in a Blob holding a Tensor, extract // and store it as a Tensor instead. payload.u.as_intrusive_ptr = null_to_undefined_tensor(blob.release()); @@ -497,7 +509,7 @@ struct TORCH_API IValue final { C10_NODISCARD ivalue::Tuple& toTupleRef() const; // Double - IValue(double d) : tag(Tag::Double), is_intrusive_ptr(false) { + IValue(double d) : tag(Tag::Double) { payload.u.as_double = d; } bool isDouble() const { @@ -539,10 +551,24 @@ struct TORCH_API IValue final { c10::intrusive_ptr toQuantizer() const&; // Int - IValue(int64_t i) : tag(Tag::Int), is_intrusive_ptr(false) { + IValue(int64_t i) : tag(Tag::Int) { payload.u.as_int = i; } + IValue(c10::SymInt i) : tag(Tag::SymInt) { + payload.u.as_int = i.data(); + } + + IValue(c10::SymIntArrayRef v); + + bool isSymInt() const { + return Tag::SymInt == tag; + } + + c10::SymInt toSymInt() const { + return c10::SymInt(payload.u.as_int); + } + // allow you to pass literals (3, 4) without ambiguity IValue(int32_t i) : IValue(static_cast(i)) {} @@ -556,7 +582,7 @@ struct TORCH_API IValue final { } // Bool - IValue(bool b) : tag(Tag::Bool), is_intrusive_ptr(false) { + IValue(bool b) : tag(Tag::Bool) { #if defined(__clang__) && defined(__x86_64__) // Initializing entire payload stops valgrind's from reporting // "jump or move depends on uninitialised value" in IValue copy constructor @@ -619,6 +645,12 @@ struct TORCH_API IValue final { c10::List toTensorList() const&; std::vector toTensorVector() const; + // OptionalTensorList + bool isOptionalTensorList() const; + c10::List> toOptionalTensorList() &&; + c10::List> toOptionalTensorList() const&; + std::vector> toOptionalTensorVector() const; + // GenericList IValue(c10::List v); bool isList() const { @@ -666,6 +698,8 @@ struct TORCH_API IValue final { template = nullptr> IValue(c10::optional v); + template = nullptr> + IValue(c10::OptionalArrayRef v); IValue(c10::nullopt_t); // ClassType @@ -698,7 +732,7 @@ struct TORCH_API IValue final { c10::intrusive_ptr toEnumHolder() const&; // None - IValue() : tag(Tag::None), is_intrusive_ptr(false) {} + IValue() : tag(Tag::None) {} bool isNone() const { return Tag::None == tag; } @@ -716,15 +750,17 @@ struct TORCH_API IValue final { // Scalar, which gets encoded as either an Int, a Double or a ComplexDouble IValue(const at::Scalar& s) : IValue() { if (s.isFloatingPoint()) { - *this = s.toDouble(); + tag = Tag::Double; + payload.u.as_double = s.toDouble(); } else if (s.isComplex()) { *this = s.toComplexDouble(); } else if (s.isBoolean()) { - *this = s.toBool(); - } else if (s.isIntegral(false)) { - *this = s.toLong(); + tag = Tag::Bool; + payload.u.as_bool = s.toBool(); } else { - TORCH_CHECK(false, "Unknown type in Scalar"); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(s.isIntegral(false), "Unknown type in Scalar"); + tag = Tag::Int; + payload.u.as_int = s.toLong(); } } @@ -745,7 +781,7 @@ struct TORCH_API IValue final { } // Device - IValue(c10::Device d) : tag(Tag::Device), is_intrusive_ptr(false) { + IValue(c10::Device d) : tag(Tag::Device) { payload.u.as_device.type = d.type(); payload.u.as_device.index = d.index(); } @@ -759,7 +795,7 @@ struct TORCH_API IValue final { //Stream IValue(c10::Stream 
stream) - : tag(Tag::Stream), is_intrusive_ptr(false) { + : tag(Tag::Stream) { payload.u.as_int = stream.pack(); } c10::Stream toStream() &&; @@ -788,7 +824,7 @@ struct TORCH_API IValue final { } // QScheme - IValue(at::QScheme qscheme) : tag(Tag::Int), is_intrusive_ptr(false) { + IValue(at::QScheme qscheme) : tag(Tag::Int) { payload.u.as_int = static_cast(qscheme); } @@ -804,12 +840,7 @@ struct TORCH_API IValue final { } // Generator - IValue(at::Generator g) : tag(Tag::Generator), is_intrusive_ptr(g.defined()) { - // Note: the undefined generator is not refcounted, so while it - // is tagged as a generator, is_intrusive_ptr is set to false. - // This is not an optional optimization: our incref call - // *will not* do the right thing when called on an - // undefined generator. + IValue(at::Generator g) : tag(Tag::Generator) { payload.u.as_intrusive_ptr = null_to_undefined_tensor(g.unsafeReleaseGeneratorImpl()); } bool isGenerator() const { @@ -881,7 +912,10 @@ struct TORCH_API IValue final { const IValue& v); bool isPtrType() const { - return (isTensor() && payload.as_tensor.defined()) || is_intrusive_ptr; + if (isTensor()) { + return payload.as_tensor.defined(); + } + return isIntrusivePtrLegacyBehavior(); } /// @private [doxygen private] @@ -989,7 +1023,7 @@ struct TORCH_API IValue final { // the "wrong" one of as_tensor and as_intrusive_ptr and 2) enable // the compiler to generate the same code for each case. It is // surprisingly difficult to get this right. - if (isTensor() || is_intrusive_ptr) { + if (isTensor() || isIntrusivePtr()) { c10::intrusive_ptr_target* p = isTensor() ? payload.as_tensor.unsafeGetTensorImpl() : payload.u.as_intrusive_ptr; c10::intrusive_ptr::reclaim(p); // No need to make this destructor call! @@ -1013,14 +1047,78 @@ struct TORCH_API IValue final { payload.u = rhs.payload.u; } tag = rhs.tag; - is_intrusive_ptr = rhs.is_intrusive_ptr; rhs.clearToNone(); } void clearToNone() noexcept { payload.u.as_int = 0; tag = Tag::None; - is_intrusive_ptr = false; + } + + bool isIntrusivePtr() const { + switch (tag) { + case Tag::None: + return false; + case Tag::Tensor: + return false; + case Tag::Storage: + return true; + case Tag::Generator: + return true; + case Tag::Double: + return false; + case Tag::ComplexDouble: + return true; + case Tag::Int: + return false; + case Tag::SymInt: + return false; + case Tag::Bool: + return false; + case Tag::Tuple: + return true; + case Tag::String: + return true; + case Tag::Blob: + return true; + case Tag::GenericList: + return true; + case Tag::GenericDict: + return true; + case Tag::Future: + return true; + case Tag::Device: + return false; + case Tag::Stream: + return false; + case Tag::Object: + return true; + case Tag::PyObject: + return true; + case Tag::Uninitialized: + return false; + case Tag::Capsule: + return true; + case Tag::RRef: + return true; + case Tag::Quantizer: + return true; + case Tag::Enum: + return true; + } + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false, "unexpected tag ", static_cast(tag)); + return false; + } + + // Storage and Generator were treated specially when + // is_intrusive_ptr was stored as explicit state. This getter + // preserves the old behavior for use with WeakIValue for now. 
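Since the refcounting decision is now a pure function of the tag, the only information the old boolean carried that the tag cannot recover is the undefined Storage/Generator case, which the legacy getter below keeps for WeakIValue. A standalone sketch of deriving such a flag from the tag instead of storing it:

#include <cassert>

enum class Tag { None, Int, Tensor, Storage, Tuple };

// Whether a value with this tag owns an intrusive_ptr: derived, never stored,
// so it cannot drift out of sync with the tag the way a separate flag could.
constexpr bool isIntrusive(Tag t) {
  switch (t) {
    case Tag::Storage:
    case Tag::Tuple:
      return true;
    case Tag::None:
    case Tag::Int:
    case Tag::Tensor:  // the tensor payload carries its own refcount, handled separately
      return false;
  }
  return false;
}

int main() {
  static_assert(isIntrusive(Tag::Tuple), "tuple payload is refcounted");
  static_assert(!isIntrusive(Tag::Int), "ints live inline in the payload");
  assert(isIntrusive(Tag::Storage));
}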
+ bool isIntrusivePtrLegacyBehavior() const { + if (tag == Tag::Storage || tag == Tag::Generator) { + return payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(); + } else { + return isIntrusivePtr(); + } } union Payload { @@ -1048,7 +1146,7 @@ struct TORCH_API IValue final { ~Payload() {} }; - IValue(const Payload& p, Tag t, bool i) : tag(t), is_intrusive_ptr(i) { + IValue(const Payload& p, Tag t) : tag(t) { if (isTensor()) { new (&payload.as_tensor) at::Tensor(p.as_tensor); } else { @@ -1063,7 +1161,6 @@ struct TORCH_API IValue final { Payload payload; Tag tag; - bool is_intrusive_ptr; friend struct WeakIValue; }; @@ -1080,7 +1177,7 @@ struct TORCH_API WeakIValue final { } WeakIValue(const IValue& rhs) : tag(rhs.tag), - is_intrusive_ptr(rhs.is_intrusive_ptr) { + is_intrusive_ptr(rhs.isIntrusivePtrLegacyBehavior()) { if (rhs.isTensor()) { payload.as_intrusive_ptr = rhs.unsafeToTensorImpl(); is_intrusive_ptr = true; @@ -1124,7 +1221,7 @@ struct TORCH_API WeakIValue final { if (!is_intrusive_ptr) { IValue::Payload newPayload; newPayload.u = payload; - return IValue(newPayload, tag, false); + return IValue(newPayload, tag); } if (IValue::Tag::Tensor == tag) { auto temp = c10::weak_intrusive_ptr::reclaim( @@ -1147,7 +1244,7 @@ struct TORCH_API WeakIValue final { if (!pl.u.as_intrusive_ptr) { return IValue(); } else { - return IValue(pl, tag, true); + return IValue(pl, tag); } } } diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h index 6c524da40ed2..7f87380e7267 100644 --- a/aten/src/ATen/core/ivalue_inl.h +++ b/aten/src/ATen/core/ivalue_inl.h @@ -1179,7 +1179,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target { continue; } c10::Device device = storage->device(); - if (!device.is_cpu()) { + if (!device.is_cpu() && !device.is_meta()) { TORCH_CHECK_VALUE( device.type() == impl.type(), "Expected all data ptrs to be on a device of type ", @@ -1235,7 +1235,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target { // We need devices to be sorted in order to use ensureIsSubsetOfDevices. 
static std::vector sortAndDeduplicateDevices( - const c10::impl::VirtualGuardImpl& impl, + const c10::impl::VirtualGuardImpl& /*impl*/, std::vector devices) { std::sort( devices.begin(), devices.end(), @@ -1584,6 +1584,7 @@ DEFINE_TO(at::MemoryFormat, toMemoryFormat) DEFINE_TO(at::QScheme, toQScheme) DEFINE_TO(at::Dimname, toDimname) DEFINE_TO(at::Generator, toGenerator) +DEFINE_TO(c10::SymInt, toSymInt) template struct _fake_type {}; @@ -1880,6 +1881,22 @@ inline std::vector IValue::toTensorVector() const { return createVectorFromList( static_cast(payload.u.as_intrusive_ptr)); } +inline c10::List> IValue::toOptionalTensorList() && { + AT_ASSERT(isOptionalTensorList(), "Expected OptionalTensorList but got ", tagKind()); + return c10::List>(moveToIntrusivePtr()); +} +inline c10::List> IValue::toOptionalTensorList() const& { + AT_ASSERT(isOptionalTensorList(), "Expected OptionalTensorList but got ", tagKind()); + return c10::List>(toIntrusivePtr()); +} +inline std::vector> IValue::toOptionalTensorVector() const { + AT_ASSERT(isOptionalTensorList(), "Expected OptionalTensorList but got ", tagKind()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(), + "called toOptionalTensorVector on null intrusive_ptr IValue"); + return createVectorFromList>( + static_cast(payload.u.as_intrusive_ptr)); +} inline c10::List IValue::toList() && { AT_ASSERT(isList(), "Expected GenericList but got ", tagKind()); return c10::List(moveToIntrusivePtr()); @@ -1922,7 +1939,7 @@ inline ivalue::Tuple& IValue::toTupleRef() const { } inline IValue::IValue(c10::intrusive_ptr v) - : tag(Tag::Tuple), is_intrusive_ptr(true) { + : tag(Tag::Tuple) { payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } template < @@ -1950,14 +1967,14 @@ inline IValue::IValue(std::tuple&& t) } inline IValue::IValue(c10::intrusive_ptr v) - : tag(Tag::String), is_intrusive_ptr(true) { + : tag(Tag::String) { payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } inline IValue::IValue(std::string v) : IValue(ivalue::ConstantString::create(std::move(v))) {} inline IValue::IValue(c10::impl::GenericList v) - : tag(Tag::GenericList), is_intrusive_ptr(true) { + : tag(Tag::GenericList) { payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.impl_.release()); } @@ -1973,6 +1990,7 @@ inline IValue::IValue(at::ArrayRef v) : IValue(c10::List()) { list.push_back(e); } } +inline IValue::IValue(c10::SymIntArrayRef v) : IValue(at::ArrayRef(v.data(), v.size())) {} template > inline IValue::IValue(const std::vector& v) : IValue(c10::List()) { auto list = to>(); @@ -1981,6 +1999,13 @@ inline IValue::IValue(const std::vector& v) : IValue(c10::List()) { list.push_back(e); } } +template > +inline IValue::IValue(c10::OptionalArrayRef v) : IValue() { + if (v.has_value()) { + *this = IValue(std::move(*v)); + } +} + template inline IValue::IValue(std::array v) : IValue(c10::List()) { auto list = to>(); @@ -1991,7 +2016,7 @@ inline IValue::IValue(std::array v) : IValue(c10::List()) { } inline IValue::IValue(c10::impl::GenericDict v) - : tag(Tag::GenericDict), is_intrusive_ptr(true) { + : tag(Tag::GenericDict) { payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.impl_.release()); } template @@ -2018,17 +2043,17 @@ inline IValue::IValue(c10::optional v) : IValue() { inline IValue::IValue(c10::nullopt_t) : IValue() {} inline IValue::IValue(c10::intrusive_ptr v) - : tag(Tag::Object), is_intrusive_ptr(true) { + : tag(Tag::Object) { payload.u.as_intrusive_ptr = 
null_to_undefined_tensor(v.release()); } inline IValue::IValue(c10::intrusive_ptr v) - : tag(Tag::PyObject), is_intrusive_ptr(true) { + : tag(Tag::PyObject) { payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } inline IValue::IValue(c10::intrusive_ptr v) - : tag(Tag::Enum), is_intrusive_ptr(true) { + : tag(Tag::Enum) { payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } @@ -2036,7 +2061,6 @@ inline IValue IValue::make_capsule( intrusive_ptr blob) { IValue iv; iv.tag = Tag::Capsule; - iv.is_intrusive_ptr = true; iv.payload.u.as_intrusive_ptr = null_to_undefined_tensor(blob.release()); return iv; } @@ -2059,27 +2083,26 @@ IValue::IValue(c10::intrusive_ptr custom_class) { ivalue_obj->setSlot(0, IValue::make_capsule(std::move(custom_class))); payload.u.as_intrusive_ptr = null_to_undefined_tensor(ivalue_obj.release()); tag = Tag::Object; - is_intrusive_ptr = true; } inline IValue::IValue(c10::intrusive_ptr v) - : tag(Tag::Future), is_intrusive_ptr(true) { + : tag(Tag::Future) { payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } inline IValue::IValue(c10::intrusive_ptr v) - : tag(Tag::RRef), is_intrusive_ptr(true) { + : tag(Tag::RRef) { payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } inline IValue::IValue(c10::intrusive_ptr v) - : tag(Tag::Quantizer), is_intrusive_ptr(true) { + : tag(Tag::Quantizer) { payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } template inline IValue::IValue(c10::complex c) - : tag(Tag::ComplexDouble), is_intrusive_ptr(true) { + : tag(Tag::ComplexDouble) { auto v = c10::make_intrusive(c); payload.u.as_intrusive_ptr = v.release(); } @@ -2150,7 +2173,7 @@ inline bool IValue::isSameIdentity(const IValue& rhs) const { // Str) return value equality // 2. If it is a tensor type, we need to take undefined tensor into account // 3. Undefined_tensor is None and vice versa should be true - // 4. If it is a reference type (i.e. is_intrusive_ptr), then is is True when + // 4. If it is a reference type (i.e. isIntrusivePtr()), then is True when // the pointed-to object is the same. // 5. False for all other comparisons. 
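The five numbered rules above reduce to value semantics for primitives and None, tensor-aware handling for Tensor, and pointer identity for everything refcounted. A brief, illustrative usage sketch of that contract:

#include <ATen/core/ivalue.h>

void same_identity_example() {
  c10::IValue a{1.5}, b{1.5};
  c10::IValue none_a, none_b;                   // default-constructed IValues are None
  bool prims = a.isSameIdentity(b);             // true: rule 1, primitives compare by value
  bool nones = none_a.isSameIdentity(none_b);   // true: None matches None
  // Refcounted payloads (strings, tuples, lists, objects) instead compare the
  // underlying pointer, per rule 4; equal contents are not enough.
  (void)prims;
  (void)nones;
}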
if (this->isNone() && rhs.isNone()) { @@ -2175,7 +2198,7 @@ inline bool IValue::isSameIdentity(const IValue& rhs) const { } else { // for objects holding in IValue, do shallow compare on pointer address to // testify the identity - return this->is_intrusive_ptr && rhs.is_intrusive_ptr && + return this->isIntrusivePtr() && rhs.isIntrusivePtr() && this->payload.u.as_intrusive_ptr == rhs.payload.u.as_intrusive_ptr; } } @@ -2192,7 +2215,7 @@ IValue from_(c10::intrusive_ptr x, std::false_type) { return IValue(std::move(x)); } template -IValue from_(T&& x, std::false_type) { +IValue from_(T&& /*x*/, std::false_type) { static_assert( guts::false_t::value, "You are calling from with a type that it doesn't support, and isn't a potential custom class (ie: is an intrusive_ptr)"); @@ -2221,7 +2244,7 @@ struct MaybeOwnedTraits { if (from.isTensor()) { return IValue(MaybeOwnedTraits::createBorrow(from.toTensor())); } else { - return IValue(from.payload, from.tag, from.is_intrusive_ptr); + return IValue(from.payload, from.tag); } } @@ -2232,7 +2255,7 @@ struct MaybeOwnedTraits { } else if (rhs.isTensor()) { lhs = IValue(MaybeOwnedTraits::createBorrow(rhs.toTensor())); } else { - lhs = IValue(rhs.payload, rhs.tag, rhs.is_intrusive_ptr); + lhs = IValue(rhs.payload, rhs.tag); } } diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index c04d48213bad..8dd9e15f7dd4 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -227,6 +227,9 @@ struct TORCH_API OptionalType : public UnionType { // common cast Optional[Tensor] for undefined tensor type static TypePtr ofTensor(); + // + // global singleton + static TypePtr get(TypePtr inner); private: explicit OptionalType(TypePtr contained); @@ -435,6 +438,17 @@ struct TORCH_API SymbolicShape { return dims_; } + c10::optional> symbolicDims() const { + if (!dims_) { + return c10::nullopt; + } + auto symbolic_dims = std::vector(); + for (const ShapeSymbol& s : *dims_) { + symbolic_dims.push_back(!s.is_static()); + } + return symbolic_dims; + } + // Checks whether the shape is fully defined/complete, ie. rank and sizes // of every dimension are known. bool isComplete() const { @@ -456,6 +470,14 @@ struct TORCH_API SymbolicShape { // result will be unranked. 
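symbolicDims() above simply inverts is_static() per dimension, giving callers a mask of which dimensions are symbolic, and the new operator== compares the underlying dims_. A short sketch; the optional<int64_t> constructor it uses is my assumption about the pre-existing SymbolicShape API, not something this diff adds:

#include <ATen/core/jit_type.h>
#include <cstdint>
#include <vector>

void symbolic_shape_example() {
  // Rank-2 shape: first dim statically 4, second dim unknown/symbolic.
  c10::SymbolicShape s(std::vector<c10::optional<int64_t>>{4, c10::nullopt});
  auto mask = s.symbolicDims();    // expected {false, true}: only the second dim is symbolic
  bool complete = s.isComplete();  // false: one dimension is not statically known
  bool same = (s == s);            // the new operator== compares dims_
  (void)mask;
  (void)complete;
  (void)same;
}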
SymbolicShape merge(const SymbolicShape& other) const; + friend bool operator==(const SymbolicShape& lhs, const SymbolicShape& rhs) { + return lhs.dims_ == rhs.dims_; + } + + friend bool operator!=(const SymbolicShape& lhs, const SymbolicShape& rhs) { + return !(lhs == rhs); + } + private: c10::optional> dims_; }; @@ -466,7 +488,7 @@ inline bool isComplete(const Stride& s) { } template -inline bool isComplete(const T& t) { +inline bool isComplete(const T& /*t*/) { return true; } } @@ -764,15 +786,36 @@ struct TORCH_API TensorType : public SharedType { static const TypeKind Kind = TypeKind::TensorType; - static std::vector contiguousStridesOf(at::IntArrayRef sizes) { - std::vector strides(sizes.size()); - if (sizes.empty()) // zero-dim case + static std::vector contiguousStridesOf( + at::IntArrayRef in_sizes, + at::MemoryFormat memory_format = MemoryFormat::Contiguous) { + auto contiguous_fn = [](const at::IntArrayRef& sizes, + const std::vector& dim_order) { + std::vector strides(sizes.size()); + if (sizes.empty()) // zero-dim case + return strides; + + strides[dim_order[0]] = 1; + for (size_t i = 1; i < dim_order.size(); i++) { + auto cur_dim = dim_order[i]; + auto pre_dim = dim_order[i - 1]; + strides[cur_dim] = strides[pre_dim] * sizes[pre_dim]; + } return strides; - strides.back() = 1; - for (size_t i = strides.size() - 1; i > 0; i--) { - strides[i - 1] = strides[i] * sizes[i]; + }; + + std::vector dim_order(in_sizes.size()); + if (memory_format == MemoryFormat::ChannelsLast) { + dim_order = {1, 3, 2, 0}; + } else if (memory_format == MemoryFormat::ChannelsLast3d) { + dim_order = {1, 4, 3, 2, 0}; + } else { + auto ndims = in_sizes.size(); + for (size_t i = 0; i < ndims; i++) { + dim_order[i] = ndims - i - 1; // Reverse + } } - return strides; + return contiguous_fn(in_sizes, dim_order); } private: @@ -840,6 +883,14 @@ struct TORCH_API ListType bool isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const override; + // global singleton + // Given an inner type T and an identifier, + // this function wil return the global singleton type pointer + // the type List. + // The extra "identifier" argument is needed beccause we have multiple container types + // that all re-use this function (List, array, etc.) + static TypePtr get(std::string identifier, TypePtr inner); + // common cast List[Tensor] static ListTypePtr ofTensors(); static ListTypePtr ofOptionalTensors(); @@ -866,7 +917,11 @@ struct TORCH_API DictType : public SharedType { static const TypeKind Kind = TypeKind::DictType; static DictTypePtr create(TypePtr key, TypePtr value) { - switch (key->kind()) { + auto kind = key->kind(); + if (auto dyn = key->castRaw()) { + kind = dyn->dynamicKind(); + } + switch (kind) { case TypeKind::AnyType: case TypeKind::IntType: case TypeKind::BoolType: @@ -924,6 +979,14 @@ struct TORCH_API DictType : public SharedType { return false; } + // global singleton + // Given an inner type T and an identifier, + // this function wil return the global singleton type pointer + // the type List. 
+ // The extra "identifier" argument is needed beccause we have multiple container types + // that all re-use this function (Dict and unordered_map) + static TypePtr get(std::string identifier, TypePtr key, TypePtr val); + private: DictType(TypePtr key, TypePtr value) : SharedType(TypeKind::DictType), @@ -1173,6 +1236,7 @@ struct TORCH_API NumberType : public Type { NumberType(TypeKind kind = TypeKind::NumberType) : Type(kind) {} std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + (void)printer; // Suppress unused variable warning return "number"; // technically not a valid python type, but // we need to use it when parsing back in annotations // for implicit conversions @@ -1200,6 +1264,7 @@ struct TORCH_API FloatType : public NumberType { private: FloatType() : NumberType(TypeKind::FloatType) {} std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + (void)printer; // Suppress unused variable warning return "float"; } }; @@ -1225,10 +1290,36 @@ struct TORCH_API ComplexType : public NumberType { private: ComplexType() : NumberType(TypeKind::ComplexType) {} std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + (void)printer; // Suppress unused variable warning return "complex"; } }; +// We need to introduce `SymIntType` to represent the `SymInt` type +// used in function schemas e.g. `aten::narrow_copy(... SymInt length) +// `SymInt` will be used to enable tracing arithmetic operations on +// dimension values. Please see [SymInt.h] for more information +struct SymIntType; +using SymIntTypePtr = SingletonTypePtr; +struct TORCH_API SymIntType : public Type { + bool equals(const Type& rhs) const override { + return rhs.kind() == kind(); + } + std::string str() const override { + return "SymInt"; + } + std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + // TODO: will become a Union[SymbolicIntNode|int] in the near future + return "int"; + } + static const TypeKind Kind = TypeKind::SymIntType; + // global singleton + static SymIntTypePtr get(); + + private: + SymIntType() : Type(TypeKind::SymIntType) {} +}; + struct IntType; using IntTypePtr = SingletonTypePtr; // This type represents a Python int number @@ -1250,6 +1341,7 @@ struct TORCH_API IntType : public NumberType { private: IntType() : NumberType(TypeKind::IntType) {} std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + (void)printer; // Suppress unused variable warning return "int"; } }; @@ -1284,6 +1376,7 @@ struct TORCH_API StringType : public Type { return annotation_str(); } std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + (void)printer; // Suppress unused variable warning return "str"; } static const TypeKind Kind = TypeKind::StringType; @@ -1304,6 +1397,7 @@ struct TORCH_API StorageType : public Type { return annotation_str(); } std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + (void)printer; // Suppress unused variable warning return "Storage"; } static const TypeKind Kind = TypeKind::StorageType; @@ -1339,6 +1433,7 @@ struct TORCH_API FunctionType : public NamedType { private: FunctionType(torch::jit::Function* function); std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + (void)printer; // Suppress unused variable warning const auto& n = name().value(); return n.qualifiedName(); } @@ -1686,6 +1781,13 @@ struct getTypePtr_ final { return IntType::get(); } }; + +template <> +struct getTypePtr_ 
final { + static decltype(auto) call() { + return SymIntType::get(); + } +}; template <> struct getTypePtr_ final { static decltype(auto) call() { @@ -1756,55 +1858,95 @@ struct getTypePtr_ final { template struct getTypePtr_> final { static const auto& call() { - static auto type = ListType::create(getTypePtr_::call()); + static auto inner_type = getTypePtr_::call(); + // The "per vector" static singleton needs to live in a .cpp file, + // otherwise we'll end up with one singleton instance per shared library. + static auto type = ListType::get("vector", inner_type); return type; } }; template struct getTypePtr_> final { static const auto& call() { - static auto type = ListType::create(getTypePtr_::call()); + static auto inner_type = getTypePtr_::call(); + // The "per ArrayRef" static singleton needs to live in a .cpp file, + // otherwise we'll end up with one singleton instance per shared library. + static auto type = ListType::get("ArrayRef", inner_type); + return type; + } +}; +template <> +struct getTypePtr_ final { + static const auto& call() { + static auto type = ListType::create(getTypePtr_::call()); return type; } }; template struct getTypePtr_> final { static const auto& call() { - static auto type = ListType::create(getTypePtr_::call()); + static auto inner_type = getTypePtr_::call(); + // The "per List" static singleton needs to live in a .cpp file, + // otherwise we'll end up with one singleton instance per shared library. + static auto type = ListType::get("List", inner_type); return type; } }; template struct getTypePtr_> final { static const auto& call() { - static auto type = ListType::create(getTypePtr_::call()); + static auto inner_type = getTypePtr_::call(); + // The "per array" static singleton needs to live in a .cpp file, + // otherwise we'll end up with one singleton instance per shared library. + // (Concatenating the length onto the end of the string because we want a unique + // type_ptr created for every std::array type). + static auto type = ListType::get(std::string("array") + std::to_string(N), inner_type); return type; } }; template struct getTypePtr_> final { static const auto& call() { - static auto type = - DictType::create(getTypePtr_::call(), getTypePtr_::call()); + static auto inner_key_type = getTypePtr_::call(); + static auto inner_val_type = getTypePtr_::call(); + // The "per unordered_map" static singleton needs to live in a .cpp file, + // otherwise we'll end up with one singleton instance per shared library. + static auto type = DictType::get("unordered_map", inner_key_type, inner_val_type); return type; } }; template struct getTypePtr_> final { static const auto& call() { - static auto type = - DictType::create(getTypePtr_::call(), getTypePtr_::call()); + static auto inner_key_type = getTypePtr_::call(); + static auto inner_val_type = getTypePtr_::call(); + // The "per Dict" static singleton needs to live in a .cpp file, + // otherwise we'll end up with one singleton instance per shared library. + static auto type = DictType::get("Dict", inner_key_type, inner_val_type); return type; } }; + template struct getTypePtr_> final { static const auto& call() { - static auto type = TypeFactory::create( - getTypePtr_::call()); + static auto inner_type = getTypePtr_::call(); + // The "per optional" static singleton needs to live in a .cpp file, + // otherwise we'll end up with one singleton instance per shared library. 
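The comment repeated through these specializations is the whole motivation: a function-local static inside a header template is materialized once per shared library that instantiates it, so the "singleton" TypePtr would not be unique process-wide; routing through a factory defined in a single .cpp (ListType::get, DictType::get, OptionalType::get) restores uniqueness. A reduced sketch of that pattern with a toy string-based registry, not the actual ListType implementation:

#include <map>
#include <memory>
#include <string>
#include <typeinfo>

// Would live in exactly one .cpp: one registry, hence one instance per
// identifier for the whole process, however many DSOs include the header.
// (A real registry would also need a mutex for thread safety.)
std::shared_ptr<std::string> getSingletonType(const std::string& identifier) {
  static std::map<std::string, std::shared_ptr<std::string>> registry;
  auto& slot = registry[identifier];
  if (!slot) slot = std::make_shared<std::string>(identifier);
  return slot;
}

// Would live in the header: the function-local static here is only a cache of
// the .cpp-owned singleton, so per-DSO copies of the cache all point at one object.
template <typename T>
const std::shared_ptr<std::string>& typePtrFor(const char* container) {
  static auto type =
      getSingletonType(std::string(container) + "<" + typeid(T).name() + ">");
  return type;
}

int main() {
  auto& a = typePtrFor<int>("List");
  auto& b = typePtrFor<int>("List");
  return a.get() == b.get() ? 0 : 1;  // same identifier and inner type, same instance
}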
+ static auto type = OptionalType::get(inner_type); return type; } }; + + +template<> +struct getTypePtr_ final { + static const auto& call() { + static auto type = OptionalType::create(getTypePtr_::call()); + return type; + } +}; + template struct getTypePtr_> final { static const auto& call() { @@ -1922,6 +2064,7 @@ struct TORCH_API InterfaceType : public NamedType { std::ostream* why_not); std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + (void)printer; // Suppress unused variable warning return name()->qualifiedName(); } @@ -1944,24 +2087,12 @@ bool equals(const Type& rhs) const override { EnumerationType() : Type(Kind) {} }; -struct LayoutType; -using LayoutTypePtr = SingletonTypePtr; -// This type represents a Generator -struct TORCH_API LayoutType : public EnumerationType { -std::string str() const override { -return "Layout"; -} -static const TypeKind Kind = TypeKind::LayoutType; -// global singleton -static LayoutTypePtr get(); - -private: -LayoutType() : EnumerationType() {} -}; +// WARNING: These enumeration types below DO NOT actually get parsed out +// from the logical schema strings, instead they are mapped as ints. To +// observe these types, use real_type() instead of type() on Argument struct ScalarTypeType; using ScalarTypeTypePtr = SingletonTypePtr; -// This type represents a Generator struct TORCH_API ScalarTypeType : public EnumerationType { std::string str() const override { return "ScalarType"; @@ -1974,6 +2105,34 @@ static ScalarTypeTypePtr get(); ScalarTypeType() : EnumerationType() {} }; +struct MemoryFormatType; +using MemoryFormatTypePtr = SingletonTypePtr; +struct TORCH_API MemoryFormatType : public EnumerationType { +std::string str() const override { +return "MemoryFormatType"; +} +static const TypeKind Kind = TypeKind::MemoryFormatType; +// global singleton +static MemoryFormatTypePtr get(); + +private: +MemoryFormatType() : EnumerationType() {} +}; + +struct LayoutType; +using LayoutTypePtr = SingletonTypePtr; +struct TORCH_API LayoutType : public EnumerationType { +std::string str() const override { +return "LayoutType"; +} +static const TypeKind Kind = TypeKind::LayoutType; +// global singleton +static LayoutTypePtr get(); + +private: +LayoutType() : EnumerationType() {} +}; + // the common supertype of all lists, // List[T] <: AnyList for all T struct AnyListType; diff --git a/aten/src/ATen/core/jit_type_base.h b/aten/src/ATen/core/jit_type_base.h index 99ef1be1dd9b..2e1c84db867b 100644 --- a/aten/src/ATen/core/jit_type_base.h +++ b/aten/src/ATen/core/jit_type_base.h @@ -6,6 +6,8 @@ #include #include +#include +#include #include #include #include @@ -43,11 +45,13 @@ namespace c10 { _(CapsuleType) \ _(InterfaceType) \ _(QSchemeType) \ - _(LayoutType) \ _(ScalarTypeType) \ + _(LayoutType) \ + _(MemoryFormatType) \ _(AnyListType) \ _(AnyTupleType) \ _(AnyClassType) \ + _(SymIntType) \ _(UnionType) \ _(DynamicType) @@ -94,8 +98,9 @@ TORCH_DECLARE_SINGLETON(DeviceObjType); TORCH_DECLARE_SINGLETON(StreamObjType); TORCH_DECLARE_SINGLETON(CapsuleType); TORCH_DECLARE_SINGLETON(PyObjectType); -TORCH_DECLARE_SINGLETON(LayoutType); TORCH_DECLARE_SINGLETON(ScalarTypeType); +TORCH_DECLARE_SINGLETON(LayoutType); +TORCH_DECLARE_SINGLETON(MemoryFormatType); TORCH_DECLARE_SINGLETON(AnyListType); TORCH_DECLARE_SINGLETON(AnyTupleType); TORCH_DECLARE_SINGLETON(AnyClassType); @@ -140,7 +145,7 @@ struct TORCH_API Type { protected: Type(TypeKind kind) : kind_(kind) {} - virtual std::string annotation_str_impl(TypePrinter printer) const { + 
virtual std::string annotation_str_impl(TypePrinter /*printer*/) const { return str(); } // a == b @@ -567,7 +572,7 @@ struct TORCH_API Type { // per-type constructor, you only need to override this if the // containedTypes() is not empty virtual TypePtr createWithContained( - std::vector contained_types) const { + std::vector /*contained_types*/) const { AT_ERROR( "type with contained types did not overload createWithContained: ", str()); diff --git a/aten/src/ATen/core/library.cpp b/aten/src/ATen/core/library.cpp index ba16a5bf10c1..ba608e98ad53 100644 --- a/aten/src/ATen/core/library.cpp +++ b/aten/src/ATen/core/library.cpp @@ -235,6 +235,9 @@ Library& Library::_fallback(CppFunction&& f) & { // Note if dispatch_key is DispatchKey::Undefined, it'll be ignored here since Undefined // isn't a runtime key, you shouldn't register anything to it at all. for (auto k : c10::getRuntimeDispatchKeySet(*dispatch_key)) { + // mobile doesn't use all dispatch keys, so skip any fallback registrations for the unused keys. + auto idx = getDispatchTableIndexForDispatchKey(k); + if (idx < 0) continue; registrars_.emplace_back( c10::Dispatcher::singleton().registerFallback( k, diff --git a/aten/src/ATen/core/op_registration/op_allowlist.h b/aten/src/ATen/core/op_registration/op_allowlist.h index 997fb937093b..6e77c5653881 100644 --- a/aten/src/ATen/core/op_registration/op_allowlist.h +++ b/aten/src/ATen/core/op_registration/op_allowlist.h @@ -185,7 +185,7 @@ constexpr bool op_allowlist_contains_name_in_schema(string_view allowlist, strin // and should be registered. When we turn this on, the list of valid // mobile dispatch keys is hard coded (but you need to make sure // that you have the correct set of dispatch keys for this). -constexpr bool dispatch_key_allowlist_check(DispatchKey k) { +constexpr bool dispatch_key_allowlist_check(DispatchKey /*k*/) { #ifdef C10_MOBILE return true; // Disabled for now: to be enabled later! diff --git a/aten/src/ATen/core/op_registration/op_registration_test.cpp b/aten/src/ATen/core/op_registration/op_registration_test.cpp index ba4c8052e372..05294c25548e 100644 --- a/aten/src/ATen/core/op_registration/op_registration_test.cpp +++ b/aten/src/ATen/core/op_registration/op_registration_test.cpp @@ -284,7 +284,8 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsInSameOpCallAndCall EXPECT_FALSE(called_kernel1); EXPECT_TRUE(called_kernel2); - for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) { + // Test for out of tree lazy backends- ::Lazy key is now registered to TS backend in tree + for (c10::DispatchKey key : {c10::DispatchKey::XLA}) { std::string expectMessage = expectedMessageForBackend(key); expectThrows([&] { callOp(*op, dummyTensor(key)); @@ -591,7 +592,7 @@ TEST(OperatorRegistrationTest, AutogradBackendOverridesAutogradKernel) { void LazyBackendsAutogradOverridesAutogradKernel(DispatchKey key) { auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options() - .kernel(c10::getAutogradKeyFromBackend(key)) + .kernel(c10::getAutogradKeyFromBackend(toBackendComponent(key))) .kernel(DispatchKey::Autograd)); auto op = Dispatcher::singleton().findSchema({"_test::dummy", ""}); @@ -613,14 +614,13 @@ void LazyBackendsAutogradOverridesAutogradKernel(DispatchKey key) { EXPECT_FALSE(called_nonautograd); } +// no longer test ::Lazy key here +// since it is now registered to TS backend in-tree and thus behaves differently, +// does not throw the expected 'could not run..' 
messages TEST(OperatorRegistrationTest, AutogradXLAOverridesAutogradKernel) { LazyBackendsAutogradOverridesAutogradKernel(DispatchKey::XLA); } -TEST(OperatorRegistrationTest, AutogradLazyOverridesAutogradKernel) { - LazyBackendsAutogradOverridesAutogradKernel(DispatchKey::Lazy); -} - void whenRegisterWithLazyBackendsAndCatchAll_AutogradLazyBackendsIsNotFilled(DispatchKey key) { { auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options() @@ -670,6 +670,17 @@ TEST(OperatorRegistrationTest, whenRegisterWithLazyKernelAndCatchAll_AutogradLaz whenRegisterWithLazyBackendsAndCatchAll_AutogradLazyBackendsIsNotFilled(DispatchKey::Lazy); } +TEST(OperatorRegistrationTest, whenregisteringwithinvalidoverloadname) { + expectThrows([] { + auto registrar = c10::RegisterOperators().op("_test::dummy.default", c10::RegisterOperators::options() + .kernel(DispatchKey::CPU, [] (const int64_t&) {})); + }, "default is not a legal overload name for aten operators"); + expectThrows([] { + auto registrar = c10::RegisterOperators().op("_test::dummy.__name__", c10::RegisterOperators::options() + .kernel(DispatchKey::CPU, [] (const int64_t&) {})); + }, "__name__ is not a legal overload name for aten operators"); +} + TEST(OperatorRegistrationTest, givenLambdaKernel_whenRegisteringWithMismatchingCppSignatures_thenFails) { expectThrows([] { auto registrar = c10::RegisterOperators().op("_test::dummy", c10::RegisterOperators::options() @@ -1243,6 +1254,16 @@ TEST(OperatorRegistrationTest, testAvailableArgTypes) { "(Dict(str, Dict(int, str)?[])[] a) -> Dict(str, Dict(int, str)?[])[]"); } +TEST(NewOperatorRegistrationTest, erroroutwithinvalidoverloadname) { + auto m = MAKE_TORCH_LIBRARY(_test); + expectThrows([&] { + m.def("dummy.default(Tensor self) -> Tensor"); + }, "default is not a legal overload name for aten operators"); + expectThrows([&] { + m.def("dummy.__name__(Tensor self) -> Tensor"); + }, "__name__ is not a legal overload name for aten operators"); +} + TEST(NewOperatorRegistrationTest, testBasics) { auto m = MAKE_TORCH_LIBRARY(_test); m.def("dummy(Tensor self) -> Tensor"); @@ -1770,22 +1791,22 @@ TEST(NewOperatorRegistrationTest, dispatchAutogradPrecedence) { TEST(NewOperatorRegistrationTest, throwsWhenRegisterToBackendMapsToAutogradOther) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - bool sparsecpu_called, math_called = false; + bool fpga_called, math_called = false; auto m = MAKE_TORCH_LIBRARY(test); - m.def("fn", torch::dispatch(c10::DispatchKey::SparseCPU, [&](const Tensor& x) { sparsecpu_called = true; return x; })); + m.def("fn", torch::dispatch(c10::DispatchKey::FPGA, [&](const Tensor& x) { fpga_called = true; return x; })); m.impl("fn", c10::DispatchKey::CompositeImplicitAutograd, [&](const Tensor& x) { math_called = true; return x; }); auto op = Dispatcher::singleton().findSchema({"test::fn", ""}); ASSERT_TRUE(op.has_value()); { - callOp(*op, dummyTensor(c10::DispatchKey::SparseCPU)); - ASSERT_TRUE(sparsecpu_called); + callOp(*op, dummyTensor(c10::DispatchKey::FPGA)); + ASSERT_TRUE(fpga_called); } { expectThrows([&] { - callOp(*op, dummyTensor(c10::DispatchKey::SparseCPU, /*requires_grad=*/true)); + callOp(*op, dummyTensor(c10::DispatchKey::FPGA, /*requires_grad=*/true)); }, "test::fn has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther."); } } @@ -1828,18 +1849,15 @@ TEST(NewOperatorRegistrationTest, dispatchMultipleTensors) { } { - // TODO(#43908): currently this will fallthrough 
AutogradPrivateUse1 then call catchall kernel - // at AutogradCPU, while backend extenders are indeed expecting to call PrivateUse1 kernel. - // This confusing behavior is caused by we registering fallthrough as backend fallback for - // Autograd keys. Note users could always work around this by registering the same kernel to - // AutogradPrivateUse1 as shown below until we support it. auto op = Dispatcher::singleton().findOp({"test::fn", ""}); ASSERT_TRUE(op.has_value()); catchall_called = false; + privateuse1_called = false; callOp(*op, dummyTensor(c10::DispatchKey::PrivateUse1, /*requires_grad=*/true), dummyTensor(c10::DispatchKey::CPU, /*requires_grad=*/true)); - ASSERT_TRUE(catchall_called); + ASSERT_FALSE(catchall_called); + ASSERT_TRUE(privateuse1_called); } m.impl("fn", c10::DispatchKey::AutogradPrivateUse1, [&](const Tensor& x, const Tensor& y) { privateuse1_called = true; return x; }); @@ -1855,6 +1873,27 @@ TEST(NewOperatorRegistrationTest, dispatchMultipleTensors) { } } +TEST(NewOperatorRegistrationTest, registerCompositeImplicitAutogradWithCPUKernel_andCallAutogradOtherKernel_callsComposite) { + bool math_called = false; + bool cpu_called = false; + auto m = MAKE_TORCH_LIBRARY(test); + m.def("fn(Tensor dummy) -> Tensor"); + m.impl("fn", c10::DispatchKey::CPU, [&](const Tensor& x) { cpu_called = true; return x; }); + m.impl("fn", c10::DispatchKey::CompositeImplicitAutograd, [&](const Tensor& x) { math_called = true; return x; }); + + auto op = Dispatcher::singleton().findSchema({"test::fn", ""}); + ASSERT_TRUE(op.has_value()); + + { + math_called = cpu_called = false; + // Meta should redispatch to the AutogradOther backend, + // which the composite kernel should be registered to. + callOp(*op, dummyTensor(c10::DispatchKey::Meta, /*requires_grad=*/true)); + ASSERT_TRUE(math_called); + ASSERT_FALSE(cpu_called); + } +} + TEST(NewOperatorRegistrationTest, dispatchMultiple) { bool cpu_called = false; bool cuda_called = false; diff --git a/aten/src/ATen/core/stack.h b/aten/src/ATen/core/stack.h index 35bb9964eb39..1695e5995ab6 100644 --- a/aten/src/ATen/core/stack.h +++ b/aten/src/ATen/core/stack.h @@ -188,7 +188,7 @@ struct TuplePacker { template struct TuplePacker<0, Args...> { - static void execute(Stack& stack, std::tuple&& t){}; + static void execute(Stack& /*stack*/, std::tuple&& /*t*/){}; }; template diff --git a/aten/src/ATen/core/tensor_type.cpp b/aten/src/ATen/core/tensor_type.cpp index cb7b6cc27667..87972825d291 100644 --- a/aten/src/ATen/core/tensor_type.cpp +++ b/aten/src/ATen/core/tensor_type.cpp @@ -3,6 +3,40 @@ namespace c10 { +namespace { + +// The idea is to only mark possible overlap across dimensions. We want to +// return false for expanded tensors and permuted tensors, for which dimensional +// collapsing is safe. 
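// A minimal standalone sketch of the overlap heuristic described above, using
// plain std::vector sizes/strides for illustration (the ATen function that
// follows operates on c10::IntArrayRef): sort dimensions by ascending stride,
// then flag a possible overlap whenever a dimension of size > 1 has a stride
// smaller than the extent (size * stride) of the previous, smaller-stride
// dimension.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

bool may_overlap(const std::vector<int64_t>& sizes,
                 const std::vector<int64_t>& strides) {
  std::vector<size_t> idx(sizes.size());
  std::iota(idx.begin(), idx.end(), 0);
  std::sort(idx.begin(), idx.end(),
            [&](size_t a, size_t b) { return strides[a] < strides[b]; });
  for (size_t i = 1; i < idx.size(); ++i) {
    if (sizes[idx[i]] != 1 &&
        strides[idx[i]] < sizes[idx[i - 1]] * strides[idx[i - 1]]) {
      return true;  // two dimensions may address the same memory
    }
  }
  return false;
}

int main() {
  std::cout << may_overlap({2, 3}, {3, 1}) << "\n";  // 0: contiguous 2x3
  std::cout << may_overlap({4, 3}, {0, 1}) << "\n";  // 0: expanded dim (stride 0) is safe
  std::cout << may_overlap({2, 3}, {1, 1}) << "\n";  // 1: rows alias each other
}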
+bool possible_cross_dimension_overlap(c10::IntArrayRef sizes, c10::IntArrayRef strides) { + int n_dim = static_cast(sizes.size()); + std::vector stride_indices(n_dim); + std::iota(stride_indices.rbegin(), stride_indices.rend(), 0); + + // sort indices going with ascending strides + for (int i = 1; i < n_dim; i++) { + auto c = i; + for (int j = i - 1; j >= 0; j--) { + if (strides[stride_indices[j]] > strides[stride_indices[c]]) { + std::swap(stride_indices[j], stride_indices[c]); + c = j; + } + } + } + + for (const auto i : c10::irange(1, n_dim)) { + if (i != 0) { + // we are being conservative on checking for memory overlap + if (sizes[stride_indices[i]] != 1 && strides[stride_indices[i]] < sizes[stride_indices[i-1]] * strides[stride_indices[i-1]]) { + return true; + } + } + } + return false; +} + +} + const TensorTypePtr& TensorType::get() { static auto value = TensorType::create( {}, {}, SymbolicShape(), VaryingShape{}, {}); @@ -115,6 +149,10 @@ VaryingShape TensorType::computeStrideProps( bool tensor_contiguity) { int n_dim = static_cast(sizes.size()); std::vector stride_indices(n_dim); + // default has_overlap to false as we only compute overlap when: + // 1. input sizes/strides fails format check; + // 2. tensor_contiguity are not set. + bool has_overlap = false; // Sorting strides in ascending order // Example: @@ -155,7 +193,7 @@ VaryingShape TensorType::computeStrideProps( } else if (strides[a] > strides[b]) { return 1; } else { // strides[a] == strides[b] - if (sizes[a] < sizes[b] || a > b ) { + if (sizes[a] > sizes[b]) { return 1; } } @@ -173,21 +211,35 @@ VaryingShape TensorType::computeStrideProps( } } } + // conveniently is_contiguous_strides/is_contiguous_strides only returns + // true when there's no memory overlap, so we only re-compute has_overlap + // in the last branch when both returns false + if (!tensor_contiguity) { + // trust tensor_contiguity and only computes overlap when it is not set + has_overlap = possible_cross_dimension_overlap(sizes, strides); + } } std::vector stride_properties; + + for (size_t i = 0; i < stride_indices.size(); i++) { bool contiguous_ = tensor_contiguity; if (!contiguous_) { - // innermost stride expected to be 1 - // TODO: turn contiguous_ into an enum CONTIGUOUS, NONCONTIGUOUS, - // BROADCASTED - if (i == 0) { - contiguous_ = strides[stride_indices[i]] == 1; + if (!has_overlap) { + // innermost stride expected to be 1 + // TODO: turn contiguous_ into an enum CONTIGUOUS, NONCONTIGUOUS, + // BROADCASTED + if (i == 0) { + contiguous_ = strides[stride_indices[i]] == 1; + } else { + contiguous_ = strides[stride_indices[i]] == 1 || + (strides[stride_indices[i]] != 0 && + strides[stride_indices[i]] == + strides[stride_indices[i - 1]] * sizes[stride_indices[i - 1]]); + } } else { - contiguous_ = strides[stride_indices[i]] == 1 || - (strides[stride_indices[i]] != 0 && - strides[stride_indices[i]] == - strides[stride_indices[i - 1]] * sizes[stride_indices[i - 1]]); + // leaving this assign statement for readability; + contiguous_ = false; } } stride_properties.emplace_back(stride_indices[i], contiguous_, strides[stride_indices[i]]); @@ -201,7 +253,7 @@ TensorTypePtr TensorType::create(const at::Tensor& t) { VaryingShape stride_indices; VaryingShape strides; VaryingShape sizes; - if (!t.is_mkldnn() && !t.is_sparse() && !t.is_sparse_csr()) { + if (t.layout() == at::kStrided) { sizes = VaryingShape{t.sizes().vec()}; strides = VaryingShape{t.strides().vec()}; return TensorType::create( diff --git a/aten/src/ATen/core/type.cpp 
b/aten/src/ATen/core/type.cpp index a3f0451dc61c..00e4ceffc156 100644 --- a/aten/src/ATen/core/type.cpp +++ b/aten/src/ATen/core/type.cpp @@ -11,6 +11,28 @@ #include #include +namespace std { +template<> +struct hash> { + size_t operator()(std::tuple const& t) const { + // This hashing is all hidden behind a static initializer so it + // doesn't have to be optimal + auto hash = std::hash()(std::get<0>(t)); + hash = at::hash_combine(hash, std::hash()(std::get<1>(t))); + hash = at::hash_combine(hash, std::hash()(std::get<2>(t))); + return hash; + } +}; +template<> +struct hash> { + size_t operator()(std::tuple const& t) const { + auto hash = std::hash()(std::get<0>(t)); + hash = at::hash_combine(hash, std::hash()(std::get<1>(t))); + return hash; + } +}; +} // namespace std + namespace c10 { static_assert( @@ -208,6 +230,10 @@ LayoutTypePtr LayoutType::get() { static LayoutTypePtr value(new LayoutType()); return value; } +MemoryFormatTypePtr MemoryFormatType::get() { +static MemoryFormatTypePtr value(new MemoryFormatType()); +return value; +} PyObjectTypePtr PyObjectType::get() { static PyObjectTypePtr value(new PyObjectType()); return value; @@ -237,6 +263,47 @@ ListTypePtr ListType::ofStrings() { return value; } +TypePtr OptionalType::get(TypePtr inner) { + static ska::flat_hash_map containerTypePtrs; + static std::mutex mutex; + // Perf from the lock is ok because this function is guarded behind + // a static initializer; it should only be called once per type. + std::lock_guard lock(mutex); + if (containerTypePtrs.find(inner) == containerTypePtrs.end()) { + TypePtr t = TypeFactory::create(inner); + containerTypePtrs.emplace(inner, std::move(t)); + } + return containerTypePtrs[inner]; +} + +TypePtr ListType::get(std::string identifier, TypePtr inner) { + static ska::flat_hash_map, TypePtr> containerTypePtrs; + static std::mutex mutex; + // Perf from the lock is ok because this function is guarded behind + // a static initializer; it should only be called once per type. + auto key = std::make_tuple(identifier, inner); + std::lock_guard lock(mutex); + if (containerTypePtrs.find(key) == containerTypePtrs.end()) { + TypePtr t = ListType::create(inner); + containerTypePtrs.emplace(key, std::move(t)); + } + return containerTypePtrs[key]; +} + +TypePtr DictType::get(std::string identifier, TypePtr key, TypePtr value) { + static ska::flat_hash_map, TypePtr> containerTypePtrs; + static std::mutex mutex; + // Perf from the lock is ok because this function is guarded behind + // a static initializer; it should only be called once per type. 
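// The ListType::get / DictType::get / OptionalType::get helpers above all
// follow the same shape. A generic sketch of that memoized-lookup pattern,
// with std::unordered_map and std::shared_ptr standing in for the real
// ska::flat_hash_map and c10::TypePtr:
#include <cstdint>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>

struct Type {};                        // stand-in for c10::Type
using TypePtr = std::shared_ptr<Type>;

TypePtr get_cached_list_type(const std::string& identifier, const TypePtr& inner) {
  static std::unordered_map<std::string, TypePtr> cache;
  static std::mutex mutex;
  // Key on the container identifier plus the element type so that, e.g.,
  // std::vector<T> and c10::List<T> each get their own cached instance.
  const std::string key =
      identifier + "#" + std::to_string(reinterpret_cast<std::uintptr_t>(inner.get()));
  std::lock_guard<std::mutex> lock(mutex);
  auto it = cache.find(key);
  if (it == cache.end()) {
    // In the real code this is ListType::create(inner) / DictType::create(k, v).
    it = cache.emplace(key, std::make_shared<Type>()).first;
  }
  return it->second;                   // every caller sees the same instance
}

int main() {
  auto inner = std::make_shared<Type>();
  // Both lookups return the same cached pointer.
  return get_cached_list_type("vector", inner) == get_cached_list_type("vector", inner) ? 0 : 1;
}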
+ auto map_key = std::make_tuple(identifier, key, value); + std::lock_guard lock(mutex); + if (containerTypePtrs.find(map_key) == containerTypePtrs.end()) { + TypePtr t = DictType::create(key, value); + containerTypePtrs.emplace(map_key, std::move(t)); + } + return containerTypePtrs[map_key]; +} + AnyListTypePtr AnyListType::get() { static AnyListTypePtr value(new AnyListType()); return value; @@ -257,6 +324,11 @@ AnyEnumTypePtr AnyEnumType::get() { return value; } +SymIntTypePtr SymIntType::get() { + static SymIntTypePtr value(new SymIntType()); + return value; +} + c10::optional unifyTypesImpl(const TypePtr& t1, const TypePtr& t2, bool default_to_union=false, TypePtr type_hint=nullptr) { // check direct subtyping relation if (t1->isSubtypeOf(*t2)) { diff --git a/aten/src/ATen/cpu/vec/functional_base.h b/aten/src/ATen/cpu/vec/functional_base.h index eb160577e869..44d39028b990 100644 --- a/aten/src/ATen/cpu/vec/functional_base.h +++ b/aten/src/ATen/cpu/vec/functional_base.h @@ -8,7 +8,7 @@ namespace at { namespace vec { -// TODO: Make this more efficient +// slow path template inline scalar_t vec_reduce_all( const Op& vec_fun, @@ -27,6 +27,62 @@ inline scalar_t vec_reduce_all( return acc_arr[0]; } +template +struct VecReduceAllSIMD { + static inline scalar_t apply(const Op& vec_fun, Vectorized acc_vec) { + return vec_reduce_all(vec_fun, acc_vec, Vectorized::size()); + } +}; + +#if defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) && !defined(C10_MOBILE) +#if defined(CPU_CAPABILITY_AVX2) +template +struct VecReduceAllSIMD { + static inline float apply(const Op& vec_fun, Vectorized acc_vec) { + using Vec = Vectorized; + Vec v = acc_vec; + // 128-bit shuffle + Vec v1 = _mm256_permute2f128_ps(v, v, 0x1); + v = vec_fun(v, v1); + // 64-bit shuffle + v1 = _mm256_shuffle_ps(v, v, 0x4E); + v = vec_fun(v, v1); + // 32-bit shuffle + v1 = _mm256_shuffle_ps(v, v, 0xB1); + v = vec_fun(v, v1); + return _mm256_cvtss_f32(v); + } +}; +#endif // defined(CPU_CAPABILITY_AVX2) +#if defined(CPU_CAPABILITY_AVX512) +template +struct VecReduceAllSIMD { + static inline float apply(const Op& vec_fun, Vectorized acc_vec) { + using Vec = Vectorized; + Vec v = acc_vec; + // 256-bit shuffle + Vec v1 = _mm512_shuffle_f32x4(v, v, 0x4E); + v = vec_fun(v, v1); + // 128-bit shuffle + v1 = _mm512_shuffle_f32x4(v, v, 0xB1); + v = vec_fun(v, v1); + // 64-bit shuffle + v1 = _mm512_shuffle_ps(v, v, 0x4E); + v = vec_fun(v, v1); + // 32-bit shuffle + v1 = _mm512_shuffle_ps(v, v, 0xB1); + v = vec_fun(v, v1); + return _mm512_cvtss_f32(v); + } +}; +#endif // defined(CPU_CAPABILITY_AVX512) +#endif // defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) && !defined(C10_MOBILE) + +template +inline scalar_t vec_reduce_all(const Op& vec_fun, Vectorized acc_vec) { + return VecReduceAllSIMD::apply(vec_fun, acc_vec); +} + template inline scalar_t reduce_all(const Op& vec_fun, const scalar_t* data, int64_t size) { using Vec = vec::Vectorized; @@ -42,7 +98,7 @@ inline scalar_t reduce_all(const Op& vec_fun, const scalar_t* data, int64_t size Vec data_vec = Vec::loadu(data + d, size - d); acc_vec = Vec::set(acc_vec, vec_fun(acc_vec, data_vec), size - d); } - return vec_reduce_all(vec_fun, acc_vec, Vec::size()); + return vec_reduce_all(vec_fun, acc_vec); } // similar to reduce_all, but reduces into two outputs @@ -70,8 +126,8 @@ inline std::pair reduce2_all(const Op1& vec_fun1, const Op2& acc_vec2 = Vec::set(acc_vec2, vec_fun2(acc_vec2, data_vec), size - d); } return std::pair( - vec_reduce_all(vec_fun1, acc_vec1, Vec::size()), - 
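// The AVX2/AVX512 VecReduceAllSIMD specializations above implement a
// log2-step horizontal reduction with lane shuffles. The same combining
// pattern in scalar form, for illustration only -- the real code folds full
// Vectorized<float> registers with _mm256/_mm512 shuffle intrinsics:
#include <algorithm>
#include <array>
#include <iostream>

template <typename Op>
float reduce_all_8(std::array<float, 8> v, const Op& op) {
  // 8 -> 4 -> 2 -> 1 active lanes, mirroring the 128-, 64- and 32-bit shuffles.
  for (int half = 4; half >= 1; half /= 2) {
    for (int i = 0; i < half; ++i) {
      v[i] = op(v[i], v[i + half]);
    }
  }
  return v[0];
}

int main() {
  std::array<float, 8> lanes = {1, 7, 3, 9, 5, 2, 8, 4};
  std::cout << reduce_all_8(lanes, [](float a, float b) { return a + b; }) << "\n";          // 39
  std::cout << reduce_all_8(lanes, [](float a, float b) { return std::max(a, b); }) << "\n"; // 9
}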
vec_reduce_all(vec_fun2, acc_vec2, Vec::size())); + vec_reduce_all(vec_fun1, acc_vec1), + vec_reduce_all(vec_fun2, acc_vec2)); } template @@ -95,7 +151,7 @@ inline scalar_t map_reduce_all( data_vec = map_fun(data_vec); acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d); } - return vec_reduce_all(red_fun, acc_vec, Vec::size()); + return vec_reduce_all(red_fun, acc_vec); } template @@ -126,7 +182,7 @@ inline scalar_t map2_reduce_all( data_vec = map_fun(data_vec, data2_vec); acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d); } - return vec_reduce_all(red_fun, acc_vec, Vec::size()); + return vec_reduce_all(red_fun, acc_vec); } template @@ -162,7 +218,7 @@ inline scalar_t map3_reduce_all( data_vec = map_fun(data_vec, data2_vec, data3_vec); acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d); } - return vec_reduce_all(red_fun, acc_vec, Vec::size()); + return vec_reduce_all(red_fun, acc_vec); } template diff --git a/aten/src/ATen/cpu/vec/functional_bfloat16.h b/aten/src/ATen/cpu/vec/functional_bfloat16.h index 9efa7004090b..acb77ccaa491 100644 --- a/aten/src/ATen/cpu/vec/functional_bfloat16.h +++ b/aten/src/ATen/cpu/vec/functional_bfloat16.h @@ -75,7 +75,7 @@ inline BFloat16 reduce_all(const Op& vec_fun, const BFloat16* data, int64_t size } } acc_fvec0 = vec_fun(acc_fvec0, acc_fvec1); - return vec_reduce_all(vec_fun, acc_fvec0, fVec::size()); + return vec_reduce_all(vec_fun, acc_fvec0); } template @@ -131,8 +131,8 @@ inline std::pair reduce2_all(const Op1& vec_fun1, const Op2& acc1_fvec0 = vec_fun1(acc1_fvec0, acc1_fvec1); acc2_fvec0 = vec_fun2(acc2_fvec0, acc2_fvec1); return std::pair( - vec_reduce_all(vec_fun1, acc1_fvec0, fVec::size()), - vec_reduce_all(vec_fun2, acc2_fvec0, fVec::size())); + vec_reduce_all(vec_fun1, acc1_fvec0), + vec_reduce_all(vec_fun2, acc2_fvec0)); } template @@ -187,7 +187,7 @@ inline BFloat16 map_reduce_all( } } acc_fvec0 = red_fun(acc_fvec0, acc_fvec1); - return vec_reduce_all(red_fun, acc_fvec0, fVec::size()); + return vec_reduce_all(red_fun, acc_fvec0); } template @@ -255,7 +255,7 @@ inline BFloat16 map2_reduce_all( } } acc_fvec0 = red_fun(acc_fvec0, acc_fvec1); - return vec_reduce_all(red_fun, acc_fvec0, fVec::size()); + return vec_reduce_all(red_fun, acc_fvec0); } template @@ -336,7 +336,7 @@ inline BFloat16 map3_reduce_all( } } acc_fvec0 = red_fun(acc_fvec0, acc_fvec1); - return vec_reduce_all(red_fun, acc_fvec0, fVec::size()); + return vec_reduce_all(red_fun, acc_fvec0); } template diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h index c64e3e589905..83060f686051 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h @@ -698,6 +698,23 @@ inline void convert(const BFloat16* src, BFloat16* dst, int64_t n) { } } +template <> +inline void convert(const BFloat16* src, float* dst, int64_t n) { + int64_t i; +#pragma unroll + for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { + auto vsrc = _mm256_loadu_si256(reinterpret_cast<__m256i*>((void*)(src + i))); + __m256 o1, o2; + cvtbf16_fp32(vsrc, o1, o2); + _mm256_storeu_ps(dst + i, o1); + _mm256_storeu_ps(dst + i + Vectorized::size(), o2); + } +#pragma unroll + for (; i < n; i++) { + dst[i] = static_cast(src[i]); + } +} + template <> Vectorized inline fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) { diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h index 
24c25c96137b..487233bc3c40 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h @@ -214,7 +214,7 @@ template <> class Vectorized> { return _mm256_sub_pd(pi_2, asin()); } Vectorized> atan() const; - Vectorized> atan2(const Vectorized> &b) const { + Vectorized> atan2(const Vectorized>&) const { AT_ERROR("not supported for complex numbers"); } Vectorized> erf() const { @@ -255,20 +255,20 @@ template <> class Vectorized> { Vectorized> floor() const { return _mm256_floor_pd(values); } - Vectorized> hypot(const Vectorized> &b) const { + Vectorized> hypot(const Vectorized> &) const { AT_ERROR("not supported for complex numbers"); } - Vectorized> igamma(const Vectorized> &x) const { + Vectorized> igamma(const Vectorized> &) const { AT_ERROR("not supported for complex numbers"); } - Vectorized> igammac(const Vectorized> &x) const { + Vectorized> igammac(const Vectorized> &) const { AT_ERROR("not supported for complex numbers"); } Vectorized> neg() const { auto zero = _mm256_setzero_pd(); return _mm256_sub_pd(zero, values); } - Vectorized> nextafter(const Vectorized> &b) const { + Vectorized> nextafter(const Vectorized> &) const { AT_ERROR("not supported for complex numbers"); } Vectorized> round() const { @@ -309,31 +309,31 @@ template <> class Vectorized> { Vectorized> operator!=(const Vectorized>& other) const { return _mm256_cmp_pd(values, other.values, _CMP_NEQ_UQ); } - Vectorized> operator<(const Vectorized>& other) const { + Vectorized> operator<(const Vectorized>&) const { TORCH_CHECK(false, "not supported for complex numbers"); } - Vectorized> operator<=(const Vectorized>& other) const { + Vectorized> operator<=(const Vectorized>&) const { TORCH_CHECK(false, "not supported for complex numbers"); } - Vectorized> operator>(const Vectorized>& other) const { + Vectorized> operator>(const Vectorized>&) const { TORCH_CHECK(false, "not supported for complex numbers"); } - Vectorized> operator>=(const Vectorized>& other) const { + Vectorized> operator>=(const Vectorized>&) const { TORCH_CHECK(false, "not supported for complex numbers"); } Vectorized> eq(const Vectorized>& other) const; Vectorized> ne(const Vectorized>& other) const; - Vectorized> lt(const Vectorized>& other) const { + Vectorized> lt(const Vectorized>&) const { TORCH_CHECK(false, "not supported for complex numbers"); } - Vectorized> le(const Vectorized>& other) const { + Vectorized> le(const Vectorized>&) const { TORCH_CHECK(false, "not supported for complex numbers"); } - Vectorized> gt(const Vectorized>& other) const { + Vectorized> gt(const Vectorized>&) const { TORCH_CHECK(false, "not supported for complex numbers"); } - Vectorized> ge(const Vectorized>& other) const { + Vectorized> ge(const Vectorized>&) const { TORCH_CHECK(false, "not supported for complex numbers"); } }; diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h index f917eb02da56..4093022a7e34 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h @@ -248,7 +248,7 @@ template <> class Vectorized> { return map(std::acos); } Vectorized> atan() const; - Vectorized> atan2(const Vectorized> &b) const { + Vectorized> atan2(const Vectorized>& /*b*/) const { AT_ERROR("not supported for complex numbers"); } Vectorized> erf() const { @@ -289,20 +289,20 @@ template <> class Vectorized> { Vectorized> floor() const { return _mm256_floor_ps(values); } - Vectorized> 
hypot(const Vectorized> &b) const { + Vectorized> hypot(const Vectorized>& /*b*/) const { AT_ERROR("not supported for complex numbers"); } - Vectorized> igamma(const Vectorized> &x) const { + Vectorized> igamma(const Vectorized>& /*x*/) const { AT_ERROR("not supported for complex numbers"); } - Vectorized> igammac(const Vectorized> &x) const { + Vectorized> igammac(const Vectorized>& /*x*/) const { AT_ERROR("not supported for complex numbers"); } Vectorized> neg() const { auto zero = _mm256_setzero_ps(); return _mm256_sub_ps(zero, values); } - Vectorized> nextafter(const Vectorized> &b) const { + Vectorized> nextafter(const Vectorized>& /*b*/) const { AT_ERROR("not supported for complex numbers"); } Vectorized> round() const { @@ -343,31 +343,31 @@ template <> class Vectorized> { Vectorized> operator!=(const Vectorized>& other) const { return _mm256_cmp_ps(values, other.values, _CMP_NEQ_UQ); } - Vectorized> operator<(const Vectorized>& other) const { + Vectorized> operator<(const Vectorized>& /*other*/) const { TORCH_CHECK(false, "not supported for complex numbers"); } - Vectorized> operator<=(const Vectorized>& other) const { + Vectorized> operator<=(const Vectorized>& /*other*/) const { TORCH_CHECK(false, "not supported for complex numbers"); } - Vectorized> operator>(const Vectorized>& other) const { + Vectorized> operator>(const Vectorized>& /*other*/) const { TORCH_CHECK(false, "not supported for complex numbers"); } - Vectorized> operator>=(const Vectorized>& other) const { + Vectorized> operator>=(const Vectorized>& /*other*/) const { TORCH_CHECK(false, "not supported for complex numbers"); } Vectorized> eq(const Vectorized>& other) const; Vectorized> ne(const Vectorized>& other) const; - Vectorized> lt(const Vectorized>& other) const { + Vectorized> lt(const Vectorized>& /*other*/) const { TORCH_CHECK(false, "not supported for complex numbers"); } - Vectorized> le(const Vectorized>& other) const { + Vectorized> le(const Vectorized>& /*other*/) const { TORCH_CHECK(false, "not supported for complex numbers"); } - Vectorized> gt(const Vectorized>& other) const { + Vectorized> gt(const Vectorized>& /*other*/) const { TORCH_CHECK(false, "not supported for complex numbers"); } - Vectorized> ge(const Vectorized>& other) const { + Vectorized> ge(const Vectorized>& /*other*/) const { TORCH_CHECK(false, "not supported for complex numbers"); } }; diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h index bba32942cc3a..6a1b84fc39a9 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h @@ -65,10 +65,10 @@ __m256i pack_saturate_and_clamp( template <> inline __m256i pack_saturate_and_clamp( - __m256i first, - __m256i second, - int32_t min_val, - int32_t max_val) { + __m256i /*first*/, + __m256i /*second*/, + int32_t /*min_val*/, + int32_t /*max_val*/) { // This function is for linkage only, will not be used AT_ERROR("pack_saturate_and_clamp is not supported"); } @@ -259,7 +259,7 @@ struct Vectorized : public Vectorizedqi { float_vec_return_type dequantize( Vectorized scale, - Vectorized zero_point, + Vectorized /*zero_point*/, Vectorized scale_zp_premul) const { __m256 float_vals = _mm256_cvtepi32_ps(vals); return {vec::fmadd(scale, Vectorized(float_vals), scale_zp_premul)}; @@ -269,7 +269,7 @@ struct Vectorized : public Vectorizedqi { const float_vec_return_type& rhs, float scale, int32_t zero_point, - float inverse_scale) { + float /*inverse_scale*/) { Vectorized retval; auto rhs_data = 
(__m256)rhs[0]; at::native::quantize_vec( @@ -442,7 +442,7 @@ struct Vectorized : public Vectorizedqi { public: float_vec_return_type dequantize( Vectorized scale, - Vectorized zero_point, + Vectorized /*zero_point*/, Vectorized scale_neg_zp_premul) const { __m128i int_val0 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 0)); __m128i int_val1 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 1)); @@ -467,7 +467,7 @@ struct Vectorized : public Vectorizedqi { static Vectorized quantize( const float_vec_return_type& rhs, - float scale, + float /*scale*/, int32_t zero_point, float inverse_scale) { auto* rhs_data = (float*)rhs.data(); @@ -605,7 +605,7 @@ struct Vectorized : public Vectorizedqi { public: float_vec_return_type dequantize( Vectorized scale, - Vectorized zero_point, + Vectorized /*zero_point*/, Vectorized scale_zp_premul) const { __m128i int_val0 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 0)); __m128i int_val1 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 1)); @@ -630,7 +630,7 @@ struct Vectorized : public Vectorizedqi { static Vectorized quantize( const float_vec_return_type& rhs, - float scale, + float /*scale*/, int32_t zero_point, float inverse_scale) { auto* rhs_data = (float*)rhs.data(); @@ -763,7 +763,7 @@ struct VectorizedQuantizedConverter { float_vec_return_type dequantize( Vectorized scale, Vectorized zero_point, - Vectorized scale_zp_premul) const { + Vectorized /*scale_zp_premul*/) const { float_vec_return_type rv; for (const auto i : c10::irange(float_num_vecs())) { float tmp_vals[8]; @@ -820,7 +820,7 @@ struct Vectorized : public VectorizedQuantizedConverter< const float_vec_return_type& rhs, float scale, int32_t zero_point, - float inverse_scale) { + float /*inverse_scale*/) { std::array qvals; std::array float_vals; @@ -952,7 +952,7 @@ struct Vectorized : public VectorizedQuantizedConverter< const float_vec_return_type& rhs, float scale, int32_t zero_point, - float inverse_scale) { + float /*inverse_scale*/) { std::array qvals; std::array float_vals; @@ -1072,7 +1072,7 @@ struct Vectorized : public VectorizedQuantizedConverter< const float_vec_return_type& rhs, float scale, int32_t zero_point, - float inverse_scale) { + float /*inverse_scale*/) { std::array qvals; std::array float_vals; diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h index c690682a4aa4..c0b34252b50b 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h @@ -800,6 +800,23 @@ inline void convert(const BFloat16* src, BFloat16* dst, int64_t n) { } } +template <> +inline void convert(const BFloat16* src, float* dst, int64_t n) { + int64_t i; +#pragma unroll + for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { + auto vsrc = _mm512_loadu_si512(reinterpret_cast<__m512i*>((void*)(src + i))); + __m512 o1, o2; + cvtbf16_fp32(vsrc, o1, o2); + _mm512_storeu_ps(dst + i, o1); + _mm512_storeu_ps(dst + i + Vectorized::size(), o2); + } +#pragma unroll + for (; i < n; i++) { + dst[i] = static_cast(src[i]); + } +} + template <> Vectorized inline fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) { diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h index 407cbbd7a392..3bf1010efd68 100644 --- a/aten/src/ATen/cpu/vec/vec_base.h +++ b/aten/src/ATen/cpu/vec/vec_base.h @@ -14,6 +14,7 @@ // See https://github.com/pytorch/pytorch/issues/37577 for an instance // of this bug in the past. 
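// The convert(const BFloat16*, float*, n) specializations above follow the
// usual SIMD conversion shape: a main loop over full vector widths plus a
// scalar tail for the remainder. A scalar mock-up of that loop structure,
// where kWidth merely stands in for the vector width and no intrinsics are used:
#include <cstdint>
#include <iostream>
#include <vector>

void convert_widened(const float* src, double* dst, std::int64_t n) {
  constexpr std::int64_t kWidth = 8;          // placeholder for Vectorized<...>::size()
  std::int64_t i = 0;
  for (; i <= n - kWidth; i += kWidth) {      // full-width chunks
    for (std::int64_t j = 0; j < kWidth; ++j) {
      dst[i + j] = static_cast<double>(src[i + j]);  // one SIMD load/convert/store in the real code
    }
  }
  for (; i < n; ++i) {                        // scalar tail for the last n % kWidth elements
    dst[i] = static_cast<double>(src[i]);
  }
}

int main() {
  std::vector<float> src(13, 1.5f);
  std::vector<double> dst(13, 0.0);
  convert_widened(src.data(), dst.data(), static_cast<std::int64_t>(src.size()));
  std::cout << dst[12] << "\n";  // 1.5 -- the tail elements are converted too
}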
+#include #include #include #include @@ -133,7 +134,7 @@ struct Vectorized { static constexpr size_type size() { return VECTOR_WIDTH / sizeof(T); } - Vectorized() : values{0} {} + Vectorized() : values{static_cast(0)} {} Vectorized(T val) { for (int i = 0; i != size(); i++) { values[i] = val; @@ -537,7 +538,7 @@ struct Vectorized { // 1 if the pred is true, otherwise 0. Vectorized vector; for (int i = 0; i != size(); ++ i) { - vector[i] = bool(op(values[i], other.values[i])); + vector[i] = static_cast(op(values[i], other.values[i])); } return vector; } diff --git a/aten/src/ATen/cpu/vml.h b/aten/src/ATen/cpu/vml.h index 92bf85ad2d2c..d8d0a1544ccd 100644 --- a/aten/src/ATen/cpu/vml.h +++ b/aten/src/ATen/cpu/vml.h @@ -12,7 +12,7 @@ // It implements various functions with a simple interface // For example it enables the user to call vsin(float* out, const float* in, // size) This functions takes a pointer to a contious output array of floats and -// a constant input array. It will then apply sin to each value in in the input +// a constant input array. It will then apply sin to each value in the input // array and write the result into the output array. out and in may point to the // same memory, i.e. this fully supports in-place operations. These functions // also implement their own parallelization, so take precautions when calling diff --git a/aten/src/ATen/cuda/Atomic.cuh b/aten/src/ATen/cuda/Atomic.cuh index cd002414687a..1189cc05de12 100644 --- a/aten/src/ATen/cuda/Atomic.cuh +++ b/aten/src/ATen/cuda/Atomic.cuh @@ -4,6 +4,8 @@ #include #include +#include + template struct AtomicFPOp; @@ -298,7 +300,7 @@ static inline __device__ void gpuAtomicAddNoReturn(at::BFloat16 *address, at::BF static inline __device__ void gpuAtomicAddNoReturn(double *address, double val) { gpuAtomicAdd(address, val); } /* Special case fp32 atomic. */ -#if defined(USE_ROCM) && defined(__gfx908__) +#if defined(USE_ROCM) static inline __device__ void gpuAtomicAddNoReturn(float *address, float val) { atomicAddNoRet(address, val); } #else static inline __device__ void gpuAtomicAddNoReturn(float *address, float val) { gpuAtomicAdd(address, val); } @@ -344,3 +346,109 @@ inline __device__ float gpuAtomicMul (float * address, float val) { return __int_as_float(old); } + +// Atomic maximum implementation. + +template +__host__ __device__ T safe_max(T a, T b) { + #if defined(__HIPCC__) + // TODO: remove this special case for HIP when issue is fixed: + // https://github.com/ROCm-Developer-Tools/HIP/issues/2209 + T max = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::max(a, b)); + #else + T max = at::_isnan(b) ? b : std::max(a, b); + #endif + + return max; +} + +inline __device__ at::Half gpuAtomicMax(at::Half * address, at::Half val) { + return AtomicFPOp()(address, val, + [](at::Half bsum, at::Half val) { + return safe_max(bsum, val); + }); +} + +inline __device__ at::BFloat16 gpuAtomicMax(at::BFloat16 * address, at::BFloat16 val) { + return AtomicFPOp()(address, val, + [](at::BFloat16 bsum, at::BFloat16 val) { + return safe_max(bsum, val); + }); +} + +inline __device__ double gpuAtomicMax(double * address, double val) { + return AtomicFPOp()(address, val, + [](double val, unsigned long long int assumed) { + return __double_as_longlong(safe_max(val, __longlong_as_double(assumed))); + }); +} + +// Dont use a templated function for this since the addition function defaults to the CUDA built-in. 
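// Host-side illustration of the safe_max semantics above (non-HIP branch):
// plain std::max drops a NaN passed as `b` because NaN comparisons are false,
// so the helper tests for NaN explicitly and propagates it instead.
#include <algorithm>
#include <cmath>
#include <iostream>
#include <limits>

float safe_max_sketch(float a, float b) {
  return std::isnan(b) ? b : std::max(a, b);
}

int main() {
  const float nan = std::numeric_limits<float>::quiet_NaN();
  std::cout << std::max(1.0f, nan) << "\n";        // 1   -- the NaN is silently lost
  std::cout << safe_max_sketch(1.0f, nan) << "\n"; // nan -- the NaN propagates
}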
+inline __device__ float gpuAtomicMax(float * address, float val) { + unsigned int* address_as_ull = (unsigned int*)address; + unsigned int old = *address_as_ull; + unsigned int assumed; + + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __float_as_int(safe_max(val, __int_as_float(assumed)))); + + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + + return __int_as_float(old); +} + +// Atomic minimum implementation. + +template +__host__ __device__ T safe_min(T a, T b) { + #if defined(__HIPCC__) + // TODO: remove this special case for HIP when issue is fixed: + // https://github.com/ROCm-Developer-Tools/HIP/issues/2209 + T min = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::min(a, b)); + #else + T min = at::_isnan(b) ? b : std::min(a, b); + #endif + + return min; +} + +inline __device__ at::Half gpuAtomicMin(at::Half * address, at::Half val) { + return AtomicFPOp()(address, val, + [](at::Half bsum, at::Half val) { + return safe_min(bsum, val); + }); +} + +inline __device__ at::BFloat16 gpuAtomicMin(at::BFloat16 * address, at::BFloat16 val) { + return AtomicFPOp()(address, val, + [](at::BFloat16 bsum, at::BFloat16 val) { + return safe_min(bsum, val); + }); +} + +inline __device__ double gpuAtomicMin(double * address, double val) { + return AtomicFPOp()(address, val, + [](double val, unsigned long long int assumed) { + return __double_as_longlong(safe_min(val, __longlong_as_double(assumed))); + }); +} + +// Dont use a templated function for this since the addition function defaults to the CUDA built-in. +inline __device__ float gpuAtomicMin(float * address, float val) { + unsigned int* address_as_ull = (unsigned int*)address; + unsigned int old = *address_as_ull; + unsigned int assumed; + + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __float_as_int(safe_min(val, __int_as_float(assumed)))); + + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + + return __int_as_float(old); +} diff --git a/aten/src/ATen/cuda/CUDAApplyUtils.cuh b/aten/src/ATen/cuda/CUDAApplyUtils.cuh index 44e24ab52b99..6a8ca194397d 100644 --- a/aten/src/ATen/cuda/CUDAApplyUtils.cuh +++ b/aten/src/ATen/cuda/CUDAApplyUtils.cuh @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include #include @@ -378,12 +378,14 @@ kernelPointwiseApply2(detail::TensorInfo a, template -inline bool CUDA_tensor_apply2(at::Tensor a, - at::Tensor b, +inline bool CUDA_tensor_apply2(at::TensorBase a, + at::TensorBase b, const Op op, TensorArgType aType = TensorArgType::ReadWrite, TensorArgType bType = TensorArgType::ReadOnly) { - checkDeviceType("CUDA_tensor_apply2", {a, b}, DeviceType::CUDA); + TORCH_CHECK(a.device().is_cuda() && b.device().is_cuda(), + "CUDA_tensor_apply2: Expected tensors to have CUDA DeviceType, but got " + "tensors with type ", a.device().type(), " and ", b.device().type()); int64_t totalElements = a.numel(); if (totalElements != b.numel()) { @@ -413,8 +415,8 @@ inline bool CUDA_tensor_apply2(at::Tensor a, This ensures that each element of the tensor is operated on once and only once. 
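// A portable CPU-side sketch of the compare-and-swap retry loop used by the
// float gpuAtomicMax/gpuAtomicMin overloads above: read the current value,
// compute the candidate, and retry until no other thread has changed the slot.
// The CUDA code does the same thing with atomicCAS on the float's integer bits.
#include <algorithm>
#include <atomic>
#include <iostream>
#include <thread>
#include <vector>

void atomic_max(std::atomic<float>& slot, float val) {
  float cur = slot.load();
  while (std::max(cur, val) != cur &&
         !slot.compare_exchange_weak(cur, std::max(cur, val))) {
    // compare_exchange_weak refreshes `cur` on failure; loop and try again.
  }
}

int main() {
  std::atomic<float> best{0.0f};
  std::vector<std::thread> workers;
  for (int t = 0; t < 4; ++t) {
    workers.emplace_back([&best, t] {
      for (int i = 0; i < 1000; ++i) {
        atomic_max(best, static_cast<float>(t * 1000 + i));
      }
    });
  }
  for (auto& w : workers) {
    w.join();
  }
  std::cout << best.load() << "\n";  // 3999
}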
*/ - Tensor oldA; - Tensor oldB; + TensorBase oldA; + TensorBase oldB; if (aType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(a)) { // Must perform in contiguous space @@ -524,8 +526,8 @@ inline bool CUDA_tensor_apply2(at::Tensor a, template -inline bool CUDA_tensor_apply2(at::Tensor a, - at::Tensor b, +inline bool CUDA_tensor_apply2(const at::TensorBase &a, + const at::TensorBase &b, const Op op, TensorArgType aType = TensorArgType::ReadWrite, TensorArgType bType = TensorArgType::ReadOnly) { diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index 34b0214a5614..e99017289d68 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -2,10 +2,23 @@ Provides the implementations of CUDA BLAS function templates. */ +#include #include #include -#include +#include #include +#include + +// cublasLT was introduced in CUDA 10.1 but we enable only for 11.1 that also +// added bf16 support +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(_MSC_VER) +#include +#endif + +#ifdef USE_ROCM +#define PYTORCH_ROCBLAS_VERSION_DECIMAL (ROCBLAS_VERSION_MAJOR * 100 + ROCBLAS_VERSION_MINOR) +#define USE_GEMM_FLAGS_FP16_ALT_IMPL (PYTORCH_ROCBLAS_VERSION_DECIMAL >= 242) +#endif #define CUDABLAS_POSINT_CHECK(FD, X) \ TORCH_CHECK( \ @@ -97,42 +110,6 @@ namespace at { namespace cuda { namespace blas { -C10_EXPORT const char* _cublasGetErrorEnum(cublasStatus_t error) { - if (error == CUBLAS_STATUS_SUCCESS) { - return "CUBLAS_STATUS_SUCCESS"; - } - if (error == CUBLAS_STATUS_NOT_INITIALIZED) { - return "CUBLAS_STATUS_NOT_INITIALIZED"; - } - if (error == CUBLAS_STATUS_ALLOC_FAILED) { - return "CUBLAS_STATUS_ALLOC_FAILED"; - } - if (error == CUBLAS_STATUS_INVALID_VALUE) { - return "CUBLAS_STATUS_INVALID_VALUE"; - } - if (error == CUBLAS_STATUS_ARCH_MISMATCH) { - return "CUBLAS_STATUS_ARCH_MISMATCH"; - } - if (error == CUBLAS_STATUS_MAPPING_ERROR) { - return "CUBLAS_STATUS_MAPPING_ERROR"; - } - if (error == CUBLAS_STATUS_EXECUTION_FAILED) { - return "CUBLAS_STATUS_EXECUTION_FAILED"; - } - if (error == CUBLAS_STATUS_INTERNAL_ERROR) { - return "CUBLAS_STATUS_INTERNAL_ERROR"; - } - if (error == CUBLAS_STATUS_NOT_SUPPORTED) { - return "CUBLAS_STATUS_NOT_SUPPORTED"; - } -#ifdef CUBLAS_STATUS_LICENSE_ERROR - if (error == CUBLAS_STATUS_LICENSE_ERROR) { - return "CUBLAS_STATUS_LICENSE_ERROR"; - } -#endif - return ""; -} - /* LEVEL 3 BLAS FUNCTIONS */ #ifndef USE_ROCM @@ -274,13 +251,17 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::Half)) { float falpha = alpha; float fbeta = beta; #ifdef USE_ROCM + int flag = 0; +#if USE_GEMM_FLAGS_FP16_ALT_IMPL + flag = at::ROCmBackwardPassGuard::is_backward_pass() ? 
rocblas_gemm_flags_fp16_alt_impl : 0; +#endif TORCH_CUDABLAS_CHECK(rocblas_gemm_strided_batched_ex(handle, opa, opb, (int)m, (int)n, (int)k, (void*)&falpha, a, rocblas_datatype_f16_r, (int)lda, stridea, b, rocblas_datatype_f16_r, (int)ldb, strideb, (void*)&fbeta, c, rocblas_datatype_f16_r, (int)ldc, stridec, c, rocblas_datatype_f16_r, (int)ldc, stridec, (int) num_batches, rocblas_datatype_f32_r, rocblas_gemm_algo_standard, - 0, 0)); + 0, flag)); #else #if defined(CUDA_VERSION) && CUDA_VERSION < 11000 // On CUDA versions prior to 11, users are required to set the math mode to CUBLAS_TENSOR_OP_MATH @@ -420,6 +401,10 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) { _cublasAdjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); GEMM_CHECK_ARGVALUES(at::Half); #ifdef USE_ROCM + int flag = 0; +#if USE_GEMM_FLAGS_FP16_ALT_IMPL + flag = at::ROCmBackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0; +#endif TORCH_CUDABLAS_CHECK(rocblas_gemm_ex( handle, opa, @@ -444,7 +429,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) { rocblas_datatype_f32_r, rocblas_gemm_algo_standard, 0, - 0)); + flag)); #else cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); if (prop->major >= 5) { @@ -576,6 +561,270 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { } #endif // defined(CUDA_VERSION) && CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(_MSC_VER) + +namespace { +// Following the pattern of CuSparseDescriptor +// Defined here for now because this is the only place cublas_lt interface is +// used but can be moved to a header once cublas_lt interface is used in +// multiple places. +template +struct CuBlasLtDeleter { + void operator()(T* x) { + if (x != nullptr) { + TORCH_CUDABLAS_CHECK(destructor(x)); + } + } +}; + +template +class CuBlasLtDescriptor { + public: + T* descriptor() const { + return descriptor_.get(); + } + T* descriptor() { + return descriptor_.get(); + } + + protected: + std::unique_ptr> descriptor_; +}; + +class CuBlasLtMatmulDescriptor : public CuBlasLtDescriptor< + cublasLtMatmulDescOpaque_t, + &cublasLtMatmulDescDestroy> { + public: + CuBlasLtMatmulDescriptor( + cublasComputeType_t compute_type, + cudaDataType_t scale_type) { + cublasLtMatmulDesc_t raw_descriptor = nullptr; + TORCH_CUDABLAS_CHECK( + cublasLtMatmulDescCreate(&raw_descriptor, compute_type, scale_type)); + descriptor_.reset(raw_descriptor); + } +}; + +class CuBlasLtMatrixLayout : public CuBlasLtDescriptor< + cublasLtMatrixLayoutOpaque_t, + &cublasLtMatrixLayoutDestroy> { + public: + CuBlasLtMatrixLayout( + cudaDataType_t type, + uint64_t rows, + uint64_t cols, + int64_t ld) { + cublasLtMatrixLayout_t raw_descriptor = nullptr; + TORCH_CUDABLAS_CHECK( + cublasLtMatrixLayoutCreate(&raw_descriptor, type, rows, cols, ld)); + descriptor_.reset(raw_descriptor); + } +}; + +class CuBlasLtMatmulPreference : public CuBlasLtDescriptor< + cublasLtMatmulPreferenceOpaque_t, + &cublasLtMatmulPreferenceDestroy> { + public: + CuBlasLtMatmulPreference() { + cublasLtMatmulPreference_t raw_descriptor = nullptr; + TORCH_CUDABLAS_CHECK(cublasLtMatmulPreferenceCreate(&raw_descriptor)); + descriptor_.reset(raw_descriptor); + } +}; +} // namespace + +template +void gemm_and_bias( + bool transpose_mat1, + bool transpose_mat2, + int64_t m, + int64_t n, + int64_t k, + at::opmath_type alpha_val, + const Dtype* mat1_ptr, + int64_t mat1_ld, + const Dtype* mat2_ptr, + int64_t mat2_ld, + const Dtype* bias, + Dtype* result_ptr, + int64_t result_ld, + GEMMAndBiasActivationEpilogue 
activation) { + using opmath_t = at::opmath_type; + opmath_t beta_val = 0; // bias is added in epilogue + + cudaDataType_t abcType = CUDA_R_32F; + cublasComputeType_t computeType = CUBLAS_COMPUTE_32F; + cudaDataType_t scaleType = CUDA_R_32F; + if (std::is_same::value) { + abcType = CUDA_R_64F; + computeType = CUBLAS_COMPUTE_64F; + scaleType = CUDA_R_64F; + } else if (std::is_same::value) { + if (at::globalContext().allowTF32CuBLAS()) { + computeType = CUBLAS_COMPUTE_32F_FAST_TF32; + } + abcType = CUDA_R_32F; + } else if (std::is_same::value) { + abcType = CUDA_R_16F; + } else if (std::is_same::value) { + abcType = CUDA_R_16BF; + } + + CuBlasLtMatmulDescriptor computeDesc(computeType, scaleType); + cublasOperation_t transa = transpose_mat1 ? CUBLAS_OP_T : CUBLAS_OP_N; + TORCH_CUDABLAS_CHECK(cublasLtMatmulDescSetAttribute( + computeDesc.descriptor(), + CUBLASLT_MATMUL_DESC_TRANSA, + &transa, + sizeof(transa))); + cublasOperation_t transb = transpose_mat2 ? CUBLAS_OP_T : CUBLAS_OP_N; + TORCH_CUDABLAS_CHECK(cublasLtMatmulDescSetAttribute( + computeDesc.descriptor(), + CUBLASLT_MATMUL_DESC_TRANSB, + &transb, + sizeof(transb))); + cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_BIAS; + if (activation == GEMMAndBiasActivationEpilogue::RELU) { + epilogue = CUBLASLT_EPILOGUE_RELU_BIAS; + } else if (activation == GEMMAndBiasActivationEpilogue::GELU) { +#if CUDA_VERSION >= 11040 + epilogue = CUBLASLT_EPILOGUE_GELU_BIAS; +#endif + } + TORCH_CUDABLAS_CHECK(cublasLtMatmulDescSetAttribute( + computeDesc.descriptor(), + CUBLASLT_MATMUL_DESC_EPILOGUE, + &epilogue, + sizeof(epilogue))); + TORCH_CUDABLAS_CHECK(cublasLtMatmulDescSetAttribute( + computeDesc.descriptor(), + CUBLASLT_MATMUL_DESC_BIAS_POINTER, + &bias, + sizeof(Dtype*))); + + CuBlasLtMatrixLayout Adesc( + abcType, transpose_mat1 ? k : m, transpose_mat1 ? m : k, mat1_ld); + CuBlasLtMatrixLayout Bdesc( + abcType, transpose_mat2 ? n : k, transpose_mat2 ? k : n, mat2_ld); + CuBlasLtMatrixLayout Cdesc(abcType, m, n, result_ld); + + CuBlasLtMatmulPreference preference; + // See https://github.com/pytorch/pytorch/issues/73328 for reasoning behind + // setting this to 1M. 
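// The CuBlasLtDeleter/CuBlasLtDescriptor helpers above are an instance of the
// usual RAII wrapper for C-style handle APIs. A self-contained sketch with a
// made-up API (foo_handle_t / foo_create / foo_destroy are placeholders, not a
// real library): baking the destroy function into the deleter type means the
// handle is released exactly once, on every exit path.
#include <memory>

struct foo_handle_t { int unused = 0; };
int foo_create(foo_handle_t** out) { *out = new foo_handle_t(); return 0; }
int foo_destroy(foo_handle_t* h) { delete h; return 0; }

template <typename T, int (*destroy)(T*)>
struct CDeleter {
  void operator()(T* p) const {
    if (p != nullptr) {
      (void)destroy(p);  // the real code routes the returned status through TORCH_CUDABLAS_CHECK
    }
  }
};

class FooDescriptor {
 public:
  FooDescriptor() {
    foo_handle_t* raw = nullptr;
    foo_create(&raw);  // status check elided in this sketch
    handle_.reset(raw);
  }
  foo_handle_t* descriptor() const { return handle_.get(); }

 private:
  std::unique_ptr<foo_handle_t, CDeleter<foo_handle_t, &foo_destroy>> handle_;
};

int main() {
  FooDescriptor desc;  // freed automatically when it goes out of scope
  return desc.descriptor() != nullptr ? 0 : 1;
}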
+ size_t workspaceSize = 1024 * 1024; + TORCH_CUDABLAS_CHECK(cublasLtMatmulPreferenceSetAttribute( + preference.descriptor(), + CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, + &workspaceSize, + sizeof(workspaceSize))); + + auto workspace = at::empty( + {static_cast(workspaceSize)}, + at::device({at::kCUDA, at::cuda::current_device()}).dtype(at::kByte)); + + cublasLtMatmulHeuristicResult_t heuristicResult = {}; + int returnedResult = 0; + cublasLtHandle_t ltHandle = + reinterpret_cast(at::cuda::getCurrentCUDABlasHandle()); + TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic( + ltHandle, + computeDesc.descriptor(), + Adesc.descriptor(), + Bdesc.descriptor(), + Cdesc.descriptor(), + Cdesc.descriptor(), + preference.descriptor(), + 1, + &heuristicResult, + &returnedResult)); + if (returnedResult == 0) { + TORCH_CUDABLAS_CHECK(CUBLAS_STATUS_NOT_SUPPORTED); + } + + TORCH_CUDABLAS_CHECK(cublasLtMatmul( + ltHandle, + computeDesc.descriptor(), + &alpha_val, + mat1_ptr, + Adesc.descriptor(), + mat2_ptr, + Bdesc.descriptor(), + &beta_val, + result_ptr, + Cdesc.descriptor(), + result_ptr, + Cdesc.descriptor(), + &heuristicResult.algo, + workspace.data_ptr(), + workspaceSize, + at::cuda::getCurrentCUDAStream())); +} + +template void gemm_and_bias( + bool transpose_mat1, + bool transpose_mat2, + int64_t m, + int64_t n, + int64_t k, + at::opmath_type alpha_val, + const double* mat1_ptr, + int64_t mat1_ld, + const double* mat2_ptr, + int64_t mat2_ld, + const double* bias, + double* result_ptr, + int64_t result_ld, + GEMMAndBiasActivationEpilogue activation); + +template void gemm_and_bias( + bool transpose_mat1, + bool transpose_mat2, + int64_t m, + int64_t n, + int64_t k, + at::opmath_type alpha_val, + const float* mat1_ptr, + int64_t mat1_ld, + const float* mat2_ptr, + int64_t mat2_ld, + const float* bias, + float* result_ptr, + int64_t result_ld, + GEMMAndBiasActivationEpilogue activation); + +template void gemm_and_bias( + bool transpose_mat1, + bool transpose_mat2, + int64_t m, + int64_t n, + int64_t k, + at::opmath_type alpha_val, + const at::Half* mat1_ptr, + int64_t mat1_ld, + const at::Half* mat2_ptr, + int64_t mat2_ld, + const at::Half* bias, + at::Half* result_ptr, + int64_t result_ld, + GEMMAndBiasActivationEpilogue activation); + +template void gemm_and_bias( + bool transpose_mat1, + bool transpose_mat2, + int64_t m, + int64_t n, + int64_t k, + at::opmath_type alpha_val, + const at::BFloat16* mat1_ptr, + int64_t mat1_ld, + const at::BFloat16* mat2_ptr, + int64_t mat2_ld, + const at::BFloat16* bias, + at::BFloat16* result_ptr, + int64_t result_ld, + GEMMAndBiasActivationEpilogue activation); +#endif // defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(_MSC_VER) + template <> void trsm(CUDABLAS_TRSM_ARGTYPES(float)) { TORCH_CUDABLAS_CHECK(cublasStrsm( diff --git a/aten/src/ATen/cuda/CUDABlas.h b/aten/src/ATen/cuda/CUDABlas.h index f5f437d8d63a..10e589ecd6c9 100644 --- a/aten/src/ATen/cuda/CUDABlas.h +++ b/aten/src/ATen/cuda/CUDABlas.h @@ -70,6 +70,33 @@ template <> void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)); #endif +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(_MSC_VER) +enum GEMMAndBiasActivationEpilogue { + None, + RELU, + GELU, +}; + +// NOTE: GELU activation is not supported prior to CUDA 11.4 and will +// do nothing if passed in that case. 
+template +void gemm_and_bias( + bool transpose_mat1, + bool transpose_mat2, + int64_t m, + int64_t n, + int64_t k, + at::opmath_type alpha_val, + const Dtype* mat1_ptr, + int64_t mat1_ld, + const Dtype* mat2_ptr, + int64_t mat2_ld, + const Dtype* bias, + Dtype* result_ptr, + int64_t result_ld, + GEMMAndBiasActivationEpilogue activation = GEMMAndBiasActivationEpilogue::None); +#endif + #define CUDABLAS_BGEMM_ARGTYPES(Dtype) \ char transa, char transb, int64_t m, int64_t n, int64_t k, at::opmath_type alpha, \ const Dtype *a, int64_t lda, int64_t stridea, \ diff --git a/aten/src/ATen/cuda/CUDAEvent.h b/aten/src/ATen/cuda/CUDAEvent.h index deaebd3583d6..f07daeb979b9 100644 --- a/aten/src/ATen/cuda/CUDAEvent.h +++ b/aten/src/ATen/cuda/CUDAEvent.h @@ -32,15 +32,11 @@ struct TORCH_CUDA_CPP_API CUDAEvent { CUDAEvent( DeviceIndex device_index, const cudaIpcEventHandle_t* handle) { - #if !defined(USE_ROCM) device_index_ = device_index; CUDAGuard guard(device_index_); AT_CUDA_CHECK(cudaIpcOpenEventHandle(&event_, *handle)); is_created_ = true; - #else - AT_ERROR("cuIpcOpenEventHandle with HIP is not supported"); - #endif } // Note: event destruction done on creating device to avoid creating a @@ -148,7 +144,6 @@ struct TORCH_CUDA_CPP_API CUDAEvent { // Note: cudaIpcGetEventHandle must be called on the same device as the event void ipc_handle(cudaIpcEventHandle_t * handle) { - #if !defined(USE_ROCM) if (!is_created_) { // this CUDAEvent object was initially constructed from flags but event_ // is not created yet. @@ -156,9 +151,6 @@ struct TORCH_CUDA_CPP_API CUDAEvent { } CUDAGuard guard(device_index_); AT_CUDA_CHECK(cudaIpcGetEventHandle(handle, event_)); - #else - AT_ERROR("cuIpcGetEventHandle with HIP is not supported"); - #endif } private: diff --git a/aten/src/ATen/cuda/CUDAGeneratorImpl.h b/aten/src/ATen/cuda/CUDAGeneratorImpl.h index 3fddd8556467..768f0b7549c2 100644 --- a/aten/src/ATen/cuda/CUDAGeneratorImpl.h +++ b/aten/src/ATen/cuda/CUDAGeneratorImpl.h @@ -1,9 +1,7 @@ #pragma once -#include #include #include -#include #include #include diff --git a/aten/src/ATen/cuda/CUDAGraph.cpp b/aten/src/ATen/cuda/CUDAGraph.cpp index b28c276037b7..c7734334f4e2 100644 --- a/aten/src/ATen/cuda/CUDAGraph.cpp +++ b/aten/src/ATen/cuda/CUDAGraph.cpp @@ -187,7 +187,7 @@ void CUDAGraph::replay() { // certain topologies to be corrupted (kernels elided, internal syncs // ignored) when replayed back to back without a sync in between. // The bug is fixed in CUDA 11.4+. 
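// Plain reference of what the fused gemm_and_bias epilogue computes for the
// RELU case: out = relu(alpha * mat1 @ mat2 + bias). Row-major indexing and a
// bias broadcast over the rows are assumptions made purely for readability;
// the cuBLASLt path above works on column-major data with explicit leading
// dimensions, and which output dimension the bias broadcasts over depends on
// how the caller lays out the matrices.
#include <algorithm>
#include <iostream>
#include <vector>

std::vector<float> gemm_bias_relu_ref(const std::vector<float>& mat1,  // m x k
                                      const std::vector<float>& mat2,  // k x n
                                      const std::vector<float>& bias,  // m
                                      int m, int n, int k, float alpha) {
  std::vector<float> out(m * n, 0.0f);
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      float acc = 0.0f;
      for (int p = 0; p < k; ++p) {
        acc += mat1[i * k + p] * mat2[p * n + j];
      }
      out[i * n + j] = std::max(alpha * acc + bias[i], 0.0f);  // bias add + ReLU fused in one epilogue
    }
  }
  return out;
}

int main() {
  // 1x2 @ 2x1 with a large negative bias: 1*1 + 2*2 - 10 = -5, clamped to 0 by ReLU.
  auto out = gemm_bias_relu_ref({1, 2}, {1, 2}, {-10}, 1, 1, 2, 1.0f);
  std::cout << out[0] << "\n";  // 0
}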
- cudaDeviceSynchronize(); + AT_CUDA_CHECK(cudaDeviceSynchronize()); } #else TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 and not yet supported on ROCM"); diff --git a/aten/src/ATen/cuda/CUDASparse.h b/aten/src/ATen/cuda/CUDASparse.h index fd88a7fc3ffd..ecb7127dfa32 100644 --- a/aten/src/ATen/cuda/CUDASparse.h +++ b/aten/src/ATen/cuda/CUDASparse.h @@ -34,8 +34,7 @@ // BSR triangular solve functions were added in hipSPARSE 1.11.2 (ROCm 4.5.0) #if defined(CUDART_VERSION) || \ - (defined(USE_ROCM) && (hipsparseVersionMajor >= 1) && \ - (hipsparseVersionMinor >= 11) && (hipsparseVersionPatch >= 2)) + (defined(USE_ROCM) && ROCM_VERSION >= 40500 ) #define AT_USE_HIPSPARSE_TRIANGULAR_SOLVE() 1 #else #define AT_USE_HIPSPARSE_TRIANGULAR_SOLVE() 0 diff --git a/aten/src/ATen/cuda/CUDASparseDescriptors.cpp b/aten/src/ATen/cuda/CUDASparseDescriptors.cpp index b71bcfdd6fe6..3065babf89b6 100644 --- a/aten/src/ATen/cuda/CUDASparseDescriptors.cpp +++ b/aten/src/ATen/cuda/CUDASparseDescriptors.cpp @@ -53,12 +53,12 @@ cusparseIndexType_t getCuSparseIndexType(const c10::ScalarType& scalar_type) { } } -CuSparseDnMatDescriptor::CuSparseDnMatDescriptor(const Tensor& input) { +CuSparseDnMatDescriptor::CuSparseDnMatDescriptor(const Tensor& input, int64_t batch_offset) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.layout() == kStrided); IntArrayRef input_strides = input.strides(); IntArrayRef input_sizes = input.sizes(); auto ndim = input.dim(); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ndim == 2 || ndim == 3); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ndim >= 2); auto rows = input_sizes[ndim - 2]; auto cols = input_sizes[ndim - 1]; @@ -79,7 +79,9 @@ CuSparseDnMatDescriptor::CuSparseDnMatDescriptor(const Tensor& input) { auto order = CUSPARSE_ORDER_COL; #endif - void* values_ptr = input.data_ptr(); + auto batch_stride = ndim > 2 && batch_offset >= 0 ? input_strides[ndim - 3] : 0; + void* values_ptr = static_cast(input.data_ptr()) + + batch_offset * batch_stride * input.itemsize(); cudaDataType value_type = ScalarTypeToCudaDataType(input.scalar_type()); check_supported_cuda_type(value_type); @@ -94,7 +96,7 @@ CuSparseDnMatDescriptor::CuSparseDnMatDescriptor(const Tensor& input) { value_type, order)); - if (ndim == 3) { + if (ndim >= 3 && batch_offset == -1) { int batch_count = at::native::cuda_int_cast(at::native::batchCount(input), "batch_count"); TORCH_CUDASPARSE_CHECK(cusparseDnMatSetStridedBatch( @@ -121,9 +123,9 @@ CuSparseDnVecDescriptor::CuSparseDnVecDescriptor(const Tensor& input) { descriptor_.reset(raw_descriptor); } -CuSparseSpMatCsrDescriptor::CuSparseSpMatCsrDescriptor(const Tensor& input) { +CuSparseSpMatCsrDescriptor::CuSparseSpMatCsrDescriptor(const Tensor& input, int64_t batch_offset) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.is_sparse_csr()); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.dim() == 2 || input.dim() == 3); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.dim() >= 2); IntArrayRef input_sizes = input.sizes(); auto ndim = input.dim(); @@ -144,16 +146,29 @@ CuSparseSpMatCsrDescriptor::CuSparseSpMatCsrDescriptor(const Tensor& input) { cudaDataType value_type = ScalarTypeToCudaDataType(input.scalar_type()); check_supported_cuda_type(value_type); + auto crow_indices_batch_stride = crow_indices.dim() >= 2 && batch_offset >= 0 + ? crow_indices.stride(-2) + : 0; + auto col_indices_batch_stride = + col_indices.dim() >= 2 && batch_offset >= 0 ? col_indices.stride(-2) : 0; + auto values_batch_stride = + values.dim() >= 2 && batch_offset >= 0 ? 
values.stride(-2) : 0; + cusparseSpMatDescr_t raw_descriptor; TORCH_CUDASPARSE_CHECK(cusparseCreateCsr( &raw_descriptor, // output descriptor rows, cols, nnz, - crow_indices - .data_ptr(), // row offsets of the sparse matrix, size = rows + 1 - col_indices.data_ptr(), // column indices of the sparse matrix, size = nnz - values.data_ptr(), // values of the sparse matrix, size = nnz + // row offsets of the sparse matrix, size = rows + 1 + static_cast(crow_indices.data_ptr()) + + batch_offset * crow_indices_batch_stride * crow_indices.itemsize(), + // column indices of the sparse matrix, size = nnz + static_cast(col_indices.data_ptr()) + + batch_offset * col_indices_batch_stride * col_indices.itemsize(), + // values of the sparse matrix, size = nnz + static_cast(values.data_ptr()) + + batch_offset * values_batch_stride * values.itemsize(), index_type, // data type of row offsets index index_type, // data type of col indices CUSPARSE_INDEX_BASE_ZERO, // base index of row offset and col indes @@ -161,7 +176,7 @@ CuSparseSpMatCsrDescriptor::CuSparseSpMatCsrDescriptor(const Tensor& input) { )); #if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - if (ndim == 3) { + if (ndim == 3 && batch_offset == -1) { int batch_count = at::native::cuda_int_cast(at::native::batchCount(input), "batch_count"); if (crow_indices.dim() >= 2 || values.dim() >= 2 || diff --git a/aten/src/ATen/cuda/CUDASparseDescriptors.h b/aten/src/ATen/cuda/CUDASparseDescriptors.h index 8c29f7022c5c..40078b65df64 100644 --- a/aten/src/ATen/cuda/CUDASparseDescriptors.h +++ b/aten/src/ATen/cuda/CUDASparseDescriptors.h @@ -99,7 +99,7 @@ cusparseIndexType_t getCuSparseIndexType(const c10::ScalarType& scalar_type); class TORCH_CUDA_CPP_API CuSparseDnMatDescriptor : public CuSparseDescriptor { public: - explicit CuSparseDnMatDescriptor(const Tensor& input); + explicit CuSparseDnMatDescriptor(const Tensor& input, int64_t batch_offset = -1); }; class TORCH_CUDA_CPP_API CuSparseDnVecDescriptor @@ -114,7 +114,7 @@ class TORCH_CUDA_CPP_API CuSparseSpMatDescriptor class TORCH_CUDA_CPP_API CuSparseSpMatCsrDescriptor : public CuSparseSpMatDescriptor { public: - explicit CuSparseSpMatCsrDescriptor(const Tensor& input); + explicit CuSparseSpMatCsrDescriptor(const Tensor& input, int64_t batch_offset = -1); #if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 std::tuple get_size() { diff --git a/aten/src/ATen/cuda/Exceptions.cpp b/aten/src/ATen/cuda/Exceptions.cpp new file mode 100644 index 000000000000..2821f94d2b7d --- /dev/null +++ b/aten/src/ATen/cuda/Exceptions.cpp @@ -0,0 +1,68 @@ +//NS: CUDACachingAllocator must be included before to get CUDART_VERSION definedi +#include + +#include + +namespace at { +namespace cuda { +namespace blas { + +C10_EXPORT const char* _cublasGetErrorEnum(cublasStatus_t error) { + if (error == CUBLAS_STATUS_SUCCESS) { + return "CUBLAS_STATUS_SUCCESS"; + } + if (error == CUBLAS_STATUS_NOT_INITIALIZED) { + return "CUBLAS_STATUS_NOT_INITIALIZED"; + } + if (error == CUBLAS_STATUS_ALLOC_FAILED) { + return "CUBLAS_STATUS_ALLOC_FAILED"; + } + if (error == CUBLAS_STATUS_INVALID_VALUE) { + return "CUBLAS_STATUS_INVALID_VALUE"; + } + if (error == CUBLAS_STATUS_ARCH_MISMATCH) { + return "CUBLAS_STATUS_ARCH_MISMATCH"; + } + if (error == CUBLAS_STATUS_MAPPING_ERROR) { + return "CUBLAS_STATUS_MAPPING_ERROR"; + } + if (error == CUBLAS_STATUS_EXECUTION_FAILED) { + return "CUBLAS_STATUS_EXECUTION_FAILED"; + } + if (error == CUBLAS_STATUS_INTERNAL_ERROR) { + return "CUBLAS_STATUS_INTERNAL_ERROR"; + } + if (error == 
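For readers less familiar with the three buffers handed to cusparseCreateCsr above, a small worked example of the CSR layout for a hypothetical 3x3 float matrix:

// Dense:             CSR (zero-based, matching CUSPARSE_INDEX_BASE_ZERO):
//   [[1, 0, 2],
//    [0, 0, 3],
//    [4, 5, 0]]
int   crow_indices[] = {0, 2, 3, 5};     // rows + 1 entries; row i spans [crow[i], crow[i+1])
int   col_indices[]  = {0, 2, 2, 0, 1};  // nnz entries: column of each stored value
float values[]       = {1.f, 2.f, 3.f, 4.f, 5.f};  // nnz entries: the stored values themselves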
CUBLAS_STATUS_NOT_SUPPORTED) { + return "CUBLAS_STATUS_NOT_SUPPORTED"; + } +#ifdef CUBLAS_STATUS_LICENSE_ERROR + if (error == CUBLAS_STATUS_LICENSE_ERROR) { + return "CUBLAS_STATUS_LICENSE_ERROR"; + } +#endif + return ""; +} + +} // namespace blas + +#ifdef CUDART_VERSION +namespace solver { + +C10_EXPORT const char* cusolverGetErrorMessage(cusolverStatus_t status) { + switch (status) { + case CUSOLVER_STATUS_SUCCESS: return "CUSOLVER_STATUS_SUCCES"; + case CUSOLVER_STATUS_NOT_INITIALIZED: return "CUSOLVER_STATUS_NOT_INITIALIZED"; + case CUSOLVER_STATUS_ALLOC_FAILED: return "CUSOLVER_STATUS_ALLOC_FAILED"; + case CUSOLVER_STATUS_INVALID_VALUE: return "CUSOLVER_STATUS_INVALID_VALUE"; + case CUSOLVER_STATUS_ARCH_MISMATCH: return "CUSOLVER_STATUS_ARCH_MISMATCH"; + case CUSOLVER_STATUS_EXECUTION_FAILED: return "CUSOLVER_STATUS_EXECUTION_FAILED"; + case CUSOLVER_STATUS_INTERNAL_ERROR: return "CUSOLVER_STATUS_INTERNAL_ERROR"; + case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + default: return "Unknown cusolver error number"; + } +} + +} // namespace solver +#endif + +}} // namespace at::cuda diff --git a/aten/src/ATen/cuda/ScanUtils.cuh b/aten/src/ATen/cuda/ScanUtils.cuh index 30e21b689efc..8b3ef2df76de 100644 --- a/aten/src/ATen/cuda/ScanUtils.cuh +++ b/aten/src/ATen/cuda/ScanUtils.cuh @@ -10,88 +10,6 @@ namespace at { namespace cuda { -// Extends the above Inclusive Scan to support segments. It has the same properties -// but also takes a flag array that indicates the starts of "segments", i.e. individual -// units to scan. For example, consider the following (+)-scan that is segmented: -// -// Input: [1, 3, 2, 4, 1, 2, 3, 2, 1, 4] -// Flags: [1, 0, 0, 1, 0, 1, 1, 0, 1, 0] -// Output: 1 4 6 4 5 2 3 5 1 5 -// -// So we see that each "flag" resets the scan to that index. -template -__device__ void segmentedInclusivePrefixScan(T *smem, bool *bmem, BinaryOp binop) { - // Reduce step ("upsweep") -#pragma unroll - for (int stride = 1; stride < Power2ScanSize; stride <<= 1) { - int index = (threadIdx.x + 1) * stride * 2 - 1; - if (index < Power2ScanSize) { - smem[index] = bmem[index] ? smem[index] : binop(smem[index], smem[index - stride]); - bmem[index] = bmem[index] | bmem[index - stride]; - } - __syncthreads(); - } - - // Post-reduce step ("downsweep") -#pragma unroll - for (int stride = Power2ScanSize / 4; stride > 0; stride >>= 1) { - int index = (threadIdx.x + 1) * stride * 2 - 1; - if ((index + stride) < Power2ScanSize) { - smem[index + stride] = bmem[index + stride] ? 
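The new Exceptions.cpp above gives cuBLAS status codes a printable name. A hedged sketch of how such a lookup is typically consumed by a check macro (MY_CUBLAS_CHECK is illustrative, not the actual TORCH_CUDABLAS_CHECK definition):

#include <ATen/cuda/Exceptions.h>
#include <c10/util/Exception.h>
#include <cublas_v2.h>

#define MY_CUBLAS_CHECK(EXPR)                                                   \
  do {                                                                          \
    cublasStatus_t my_cublas_status = (EXPR);                                   \
    TORCH_CHECK(my_cublas_status == CUBLAS_STATUS_SUCCESS,                      \
                "cuBLAS error: ",                                               \
                at::cuda::blas::_cublasGetErrorEnum(my_cublas_status));         \
  } while (0)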
smem[index + stride] : binop(smem[index + stride], smem[index]); - bmem[index + stride] = bmem[index + stride] | bmem[index]; - } - __syncthreads(); - } -} - -// Inclusive prefix sum using shared memory -template -__device__ void inclusivePrefixScan(T* smem, T in, T* out, BinaryFunction binop) { - // FIXME: this is a slow, simple implementation; need up/down sweep, - // prevent smem conflicts - smem[threadIdx.x] = in; - - __syncthreads(); - - for (int offset = 1; offset < blockDim.x; offset *= 2) { - T val = 0; - - if (threadIdx.x >= offset) { - val = binop(smem[threadIdx.x - offset], smem[threadIdx.x]); - } - - __syncthreads(); - if (threadIdx.x >= offset) { - smem[threadIdx.x] = val; - } - - __syncthreads(); - } - - *out = smem[threadIdx.x]; - - // Prevent write-after-read dependencies on smem usage above if necessary - if (KillWARDependency) { - __syncthreads(); - } -} - -// Exclusive prefix sum using shared memory -template -__device__ void exclusivePrefixScan(T* smem, T in, T* out, T* carry, BinaryFunction binop) { - // FIXME: crappy implementation - // We kill write-after-read dependencies separately below, hence the `false` - inclusivePrefixScan(smem, in, out, binop); - - *out -= in; - *carry = smem[blockDim.x - 1]; - - // Prevent write-after-read dependencies on smem usage above if necessary - if (KillWARDependency) { - __syncthreads(); - } -} - // Inclusive prefix sum for binary vars using intra-warp voting + // shared memory template diff --git a/aten/src/ATen/cuda/cub.cu b/aten/src/ATen/cuda/cub.cu index 6915a1c2b98f..bf3216eee6da 100644 --- a/aten/src/ATen/cuda/cub.cu +++ b/aten/src/ATen/cuda/cub.cu @@ -57,8 +57,8 @@ AT_INSTANTIATE_SORT_PAIRS(int64_t, 4) AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, AT_INSTANTIATE_SORT_PAIRS_8) -// BFloat16 is not supported by ROCm's radix sort -#if !AT_ROCM_ENABLED() +// BFloat16 Radix sort is supported from ROCm 4.5 onwards +#if !AT_ROCM_ENABLED() || (AT_ROCM_ENABLED() && ROCM_VERSION >= 40500) AT_INSTANTIATE_SORT_PAIRS(c10::BFloat16, 8) #endif diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh index 6ac9905f571e..abe2e9272014 100644 --- a/aten/src/ATen/cuda/cub.cuh +++ b/aten/src/ATen/cuda/cub.cuh @@ -6,6 +6,8 @@ #include #include +#include + #include #if USE_GLOBAL_CUB_WRAPPED_NAMESPACE() @@ -45,17 +47,23 @@ #ifdef USE_ROCM #define NO_ROCM(x) +#define ROCM_HIPCUB(x) ::hipcub #else #define NO_ROCM(x) x +#define ROCM_HIPCUB(x) x #endif -#if !defined(USE_ROCM) && !CUB_SUPPORTS_NV_BFLOAT16() +#if (!defined(USE_ROCM) && !CUB_SUPPORTS_NV_BFLOAT16()) || \ + (defined(USE_ROCM) && ROCM_VERSION >= 40500) +#if !defined(USE_ROCM) namespace at_cuda_detail { +#endif + // backport https://github.com/NVIDIA/cub/pull/306 for c10::BFloat16 template <> -struct cub::FpLimits +struct ROCM_HIPCUB(cub)::FpLimits { static __host__ __device__ __forceinline__ c10::BFloat16 Max() { unsigned short max_word = 0x7F7F; @@ -68,8 +76,14 @@ struct cub::FpLimits } }; -template <> struct cub::NumericTraits: cub::BaseTraits {}; -} +template <> +struct ROCM_HIPCUB(cub)::NumericTraits: + ROCM_HIPCUB(cub)::BaseTraits {}; + +#if !defined(USE_ROCM) +} // namespace at_cuda_detail +#endif + #endif #if !defined(USE_ROCM) @@ -93,13 +107,20 @@ struct cuda_type { using type = __half; }; -#if CUB_SUPPORTS_NV_BFLOAT16() +#if !defined(USE_ROCM) && CUB_SUPPORTS_NV_BFLOAT16() template<> struct cuda_type { using type = __nv_bfloat16; }; +#elif (defined(USE_ROCM) && ROCM_VERSION >= 40500) + +template<> +struct cuda_type { + using type = hip_bfloat16; +}; + #endif } // namespace 
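The block removed from ScanUtils.cuh above deletes the shared-memory prefix-scan helpers. For reference, a host-side C++17 illustration of the two scan flavors those helpers computed (the exclusive result is the inclusive one with each element's own contribution shifted out, which is exactly the `*out -= in` step in the deleted exclusivePrefixScan):

#include <numeric>
#include <vector>

void scan_demo() {
  std::vector<int> in{1, 3, 2, 4};
  std::vector<int> inc(in.size()), exc(in.size());
  std::inclusive_scan(in.begin(), in.end(), inc.begin());     // {1, 4, 6, 10}
  std::exclusive_scan(in.begin(), in.end(), exc.begin(), 0);  // {0, 1, 4, 6}
}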
detail @@ -142,6 +163,34 @@ inline void segmented_sort_pairs( } } +#if CUB_SUPPORTS_UNIQUE_BY_KEY() +template +inline void unique_by_key( + KeysInputIteratorT keys_in, ValuesInputIteratorT values_in, + KeysOutputIteratorT keys_out, ValuesOutputIteratorT values_out, + NumSelectedIteratorT num_selected, int64_t num_input_items) +{ + // TODO: use thrust::discard_iterator to handle null keys_out when https://github.com/NVIDIA/cub/issues/406 is fixed. + constexpr bool null_keys_out = std::is_same::value; + using KeyT = typename std::iterator_traits::value_type; + using RealKeysOutputIteratorT = typename std::conditional::type; + RealKeysOutputIteratorT keys_out_; + auto allocator = c10::cuda::CUDACachingAllocator::get(); + c10::DataPtr keys_out_owner; + c10::guts::if_constexpr( + [&](auto _) { + keys_out_owner = allocator->allocate(num_input_items * sizeof(KeyT)); + keys_out_ = static_cast(keys_out_owner.get()); + }, + [&](auto _) { + keys_out_ = keys_out; + } + ); + CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceSelect::UniqueByKey, + keys_in, values_in, keys_out_, values_out, num_selected, num_input_items, c10::cuda::getCurrentCUDAStream()); +} +#endif + namespace impl { template diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh index e464b19e57d5..a3d551673558 100644 --- a/aten/src/ATen/cuda/cub_definitions.cuh +++ b/aten/src/ATen/cuda/cub_definitions.cuh @@ -18,7 +18,7 @@ #define CUB_SUPPORTS_NV_BFLOAT16() false #endif -// cub sort support for CUB_WRAPPED_NAMESPACE is added to cub 1.13.1 in: +// cub support for CUB_WRAPPED_NAMESPACE is added to cub 1.13.1 in: // https://github.com/NVIDIA/cub/pull/326 // CUB_WRAPPED_NAMESPACE is defined globally in cmake/Dependencies.cmake // starting from CUDA 11.5 @@ -28,6 +28,14 @@ #define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() false #endif +// cub support for UniqueByKey is added to cub 1.16 in: +// https://github.com/NVIDIA/cub/pull/405 +#if CUB_VERSION >= 101600 +#define CUB_SUPPORTS_UNIQUE_BY_KEY() true +#else +#define CUB_SUPPORTS_UNIQUE_BY_KEY() false +#endif + // cub support for scan by key is added to cub 1.15 // in https://github.com/NVIDIA/cub/pull/376 #if CUB_VERSION >= 101500 diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 4efe2ec4c33f..93a23ec6a730 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -139,16 +139,22 @@ bool CUDAHooks::hasCuSOLVER() const { #endif } -#if !defined(USE_ROCM) +bool CUDAHooks::hasROCM() const { + // Currently, this is same as `compiledWithMIOpen`. + // But in future if there are ROCm builds without MIOpen, + // then `hasROCM` should return true while `compiledWithMIOpen` + // should return false + return AT_ROCM_ENABLED(); +} + #if defined(USE_DIRECT_NVRTC) static std::pair, at::cuda::NVRTC*> load_nvrtc() { return std::make_pair(nullptr, at::cuda::load_nvrtc()); } -#else +#elif !defined(USE_ROCM) static std::pair, at::cuda::NVRTC*> load_nvrtc() { return std::make_pair(nullptr, &at::cuda::detail::lazyNVRTC); } -#endif #else static std::pair, at::cuda::NVRTC*> load_nvrtc() { #if defined(_WIN32) @@ -293,10 +299,22 @@ std::string CUDAHooks::showConfig() const { cudaRuntimeGetVersion(&runtimeVersion); auto printCudaStyleVersion = [&](int v) { +#ifdef USE_ROCM + // HIP_VERSION value format was changed after ROCm v4.2 to include the patch number + if(v < 500) { + // If major=xx, minor=yy then format -> xxyy + oss << (v / 100) << "." 
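Stepping back to the unique_by_key wrapper added in cub.cuh above: DeviceSelect::UniqueByKey keeps the first (key, value) pair of every run of equal consecutive keys and reports how many pairs survive. A small host-side reference model of that behaviour (hypothetical helper, for illustration only):

#include <cstddef>
#include <utility>
#include <vector>

template <typename K, typename V>
std::pair<std::vector<K>, std::vector<V>> unique_by_key_reference(
    const std::vector<K>& keys, const std::vector<V>& vals) {
  std::vector<K> k_out;
  std::vector<V> v_out;
  for (size_t i = 0; i < keys.size(); ++i) {
    if (i == 0 || keys[i] != keys[i - 1]) {   // first element of each run
      k_out.push_back(keys[i]);
      v_out.push_back(vals[i]);
    }
  }
  return {k_out, v_out};
}
// keys {1,1,2,2,3}, vals {10,11,20,21,30}  ->  keys {1,2,3}, vals {10,20,30}, num_selected = 3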
<< (v % 10); + } + else { + // If major=xx, minor=yy & patch=zzzzz then format -> xxyyzzzzz + oss << (v / 10000000) << "." << (v / 100000 % 100) << "." << (v % 100000); + } +#else oss << (v / 1000) << "." << (v / 10 % 100); if (v % 10 != 0) { oss << "." << (v % 10); } +#endif }; #if !defined(USE_ROCM) diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.h b/aten/src/ATen/cuda/detail/CUDAHooks.h index a0d175df27c0..5aa2721170ed 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.h +++ b/aten/src/ATen/cuda/detail/CUDAHooks.h @@ -1,3 +1,5 @@ +#pragma once + #include #include @@ -27,6 +29,7 @@ struct CUDAHooks : public at::CUDAHooksInterface { bool hasMAGMA() const override; bool hasCuDNN() const override; bool hasCuSOLVER() const override; + bool hasROCM() const override; const at::cuda::NVRTC& nvrtc() const override; int64_t current_device() const override; bool hasPrimaryContext(int64_t device_index) const override; diff --git a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp index fe5a95525e7d..e720994e9249 100644 --- a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp +++ b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp @@ -166,6 +166,8 @@ CUDA_STUB1(cuModuleUnload, CUmodule); CUDA_STUB3(cuDevicePrimaryCtxGetState, CUdevice, unsigned int *, int *); CUDA_STUB4(cuLinkCreate, unsigned int, CUjit_option *, void **, CUlinkState *); CUDA_STUB3(cuLinkComplete, CUlinkState, void **, size_t *); +CUDA_STUB3(cuFuncSetAttribute, CUfunction, CUfunction_attribute, int); +CUDA_STUB3(cuFuncGetAttribute, int*, CUfunction_attribute, CUfunction); // Irregularly shaped functions CUresult CUDAAPI cuLaunchKernel(CUfunction f, diff --git a/aten/src/ATen/cuda/jiterator.cu b/aten/src/ATen/cuda/jiterator.cu new file mode 100644 index 000000000000..905dc75c14ae --- /dev/null +++ b/aten/src/ATen/cuda/jiterator.cu @@ -0,0 +1,345 @@ +#include + +#if AT_USE_JITERATOR() + +#include +#include +#include + +#include +#include +#include +namespace at { +namespace native { + +static inline void launch_jitted_vectorized_kernel_dynamic( + const std::string& name, TensorIteratorBase& iter, + DeviceIndex dev_idx, int64_t N, const std::string& f, void* data_ptr, + const std::vector& extra_args) { + TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); + // N is still int64_t for the computation, but it's always safe to cast result to int + const uint32_t grid = (N + block_work_size() - 1) / block_work_size(); + + const int vec_size = jitted_can_vectorize_up_to(iter); + bool vectorized = vec_size > 1; + + // Different kernels are compiled depending on what we're vectorizing up to (1, 2 or 4 elements) + // fn_ptr is set to the appropriate function based on the vec size and GPU used + // TODO: Memory use can probably be optimized by re-using kernels across GPUs with + // the same compute capability + + int nTensors = iter.ntensors(); + const at::ScalarType common_dtype = iter.common_dtype(); + std::string f_inputs_type_str = at::cuda::jit::typeName(common_dtype); + std::string compute_type_str = at::cuda::jit::typeName(toOpMathType(common_dtype)); + std::string result_type_str = at::cuda::jit::typeName(common_dtype); + c10::SmallVector extra_args_types = get_extra_args_typenames(extra_args); + + // The cache key includes all the parameters to generate_code + vec_size + dev_idx + std::stringstream ss; + ss << nTensors << f; + ss << f_inputs_type_str << compute_type_str << result_type_str; + ss << static_cast(at::cuda::jit::BinaryFuncVariant::NoScalar); + ss << extra_args_types; + ss << vec_size; +// DeviceIndex, 
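The ROCm branch added to printCudaStyleVersion above handles the HIP version encoding before and after its format change. A standalone restatement with worked values (the sample version numbers are illustrative):

#include <sstream>
#include <string>

static std::string format_hip_version(int v) {
  std::ostringstream oss;
  if (v < 500) {
    // old encoding: major*100 + minor (xxyy)
    oss << (v / 100) << "." << (v % 10);
  } else {
    // new encoding: major, minor, patch packed as xxyyzzzzz
    oss << (v / 10000000) << "." << (v / 100000 % 100) << "." << (v % 100000);
  }
  return oss.str();
}
// format_hip_version(402)      == "4.2"
// format_hip_version(40421300) == "4.4.21300"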
e.g. int8_t, is not treated as a number by the stream, cast to int as a workaround + ss << static_cast(dev_idx); + const std::string cache_key = ss.str(); + + static std::mutex _jiterator_mutex; + static std::unordered_map fns; + at::cuda::jit::NvrtcFunction* fn_ptr = &fns[cache_key]; + + if (!fn_ptr->function) { + const std::lock_guard lock{_jiterator_mutex}; + if (!fn_ptr->function) { // cache miss! + // Generates program + auto code = at::cuda::jit::generate_code(nTensors, f, name, + f_inputs_type_str, compute_type_str, result_type_str, + /*contiguous=*/true, /*dynamic_casting=*/false, + at::cuda::jit::BinaryFuncVariant::NoScalar, + extra_args_types, + vectorized, vec_size); + std::string kernel_name = vectorized ? name + "_vectorized" + std::to_string(vec_size) : name; + // Acquires the program + *fn_ptr = at::cuda::jit::jit_pwise_function(code, kernel_name); + } + } + + // size of `extra_args` is unknown at compile-time + auto extra_args_size = extra_args.size(); + + float scalar_val = 0; + + if (vectorized) { + // pack args for kernel launch + constexpr int kernel_args = 3; + auto args = std::make_unique(kernel_args + extra_args_size); + args[0] = static_cast(&N); + args[1] = data_ptr; + args[2] = static_cast(&scalar_val); + + for (const auto i : c10::irange(extra_args_size)) { + // since 3 slots are already filled in `args` + args[i + 3] = const_cast(extra_args[i].data_ptr()); + } + at::cuda::jit::launch_jitted_pwise_function(*fn_ptr, args.get(), {grid, 1u, 1u}, {num_threads(), 1u, 1u}); + } else { + TrivialOffsetCalculatorVariant input_offset_calculator(iter); + void* ic_ptr = input_offset_calculator.data_ptr(); + auto oc = TrivialOffsetCalculator<1>(); + auto l = memory::LoadWithoutCast(); + auto s = memory::StoreWithoutCast(); + + // pack args for kernel launch + constexpr int kernel_args = 7; + auto args = std::make_unique(kernel_args + extra_args_size); + args[0] = static_cast(&N); + args[1] = data_ptr; + args[2] = ic_ptr; + args[3] = static_cast(&oc); + args[4] = static_cast(&l); + args[5] = static_cast(&s); + args[6] = static_cast(&scalar_val); + + for (const auto i : c10::irange(extra_args_size)) { + // since 7 slots are already filled in `args` + args[i + 7] = const_cast(extra_args[i].data_ptr()); + } + + at::cuda::jit::launch_jitted_pwise_function(*fn_ptr, args.get(), {grid, 1u, 1u}, {num_threads(), 1u, 1u}); + } +} + +static inline void launch_jitted_unrolled_kernel_dynamic( + const std::string& name, TensorIteratorBase& iter, + DeviceIndex dev_idx, int64_t N, const std::string& f, void* data_ptr, + void* ic_ptr, void* oc_ptr, void* l_ptr, void* s_ptr, bool contiguous, bool dynamic_casting, + const std::vector& extra_args) { + + TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); + //casting result to int is always safe, intermediate is int64 and won't overflow + const uint32_t grid = (N + block_work_size() - 1) / block_work_size(); + + int nTensors = iter.ntensors(); + const at::ScalarType common_dtype = iter.common_dtype(); + std::string f_inputs_type_str = at::cuda::jit::typeName(common_dtype); + std::string compute_type_str = at::cuda::jit::typeName(toOpMathType(common_dtype)); + std::string result_type_str = at::cuda::jit::typeName(common_dtype); + c10::SmallVector extra_args_types = get_extra_args_typenames(extra_args); + + // The cache key includes all the parameters to generate_code + dev_idx + std::stringstream ss; + ss << nTensors << f; + ss << f_inputs_type_str << compute_type_str << result_type_str; + ss << contiguous << dynamic_casting; + ss << 
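The launchers above compile each distinct (kernel source, dtypes, vec size, device) combination once and cache the resulting NvrtcFunction behind a check / lock / re-check sequence. That pattern, reduced to a skeleton (CompiledKernel and the compile callback are illustrative stand-ins; the sketch mirrors the structure above rather than adding any stronger thread-safety guarantee):

#include <mutex>
#include <string>
#include <unordered_map>

struct CompiledKernel { void* function = nullptr; };

CompiledKernel& get_or_compile(const std::string& cache_key,
                               CompiledKernel (*compile)(const std::string&)) {
  static std::mutex mtx;
  static std::unordered_map<std::string, CompiledKernel> cache;
  CompiledKernel& entry = cache[cache_key];   // default-constructed on first use
  if (!entry.function) {
    const std::lock_guard<std::mutex> lock(mtx);
    if (!entry.function) {                    // re-check under the lock
      entry = compile(cache_key);             // expensive NVRTC compilation happens once
    }
  }
  return entry;
}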
static_cast(at::cuda::jit::BinaryFuncVariant::NoScalar); + ss << extra_args_types; + ss << dev_idx; + const std::string cache_key = ss.str(); + + static std::mutex _jiterator_mutex; + static std::unordered_map fns; + + at::cuda::jit::NvrtcFunction* fn_ptr = &fns[cache_key]; + if (!fn_ptr->function) { + const std::lock_guard lock{_jiterator_mutex}; + if (!fn_ptr->function) { + auto code = at::cuda::jit::generate_code(nTensors, f, name, + f_inputs_type_str, compute_type_str, result_type_str, + contiguous, dynamic_casting, + at::cuda::jit::BinaryFuncVariant::NoScalar, + extra_args_types); + *fn_ptr = at::cuda::jit::jit_pwise_function(code, name); + } + } + + float scalar_val = 0; + + // pack args for kernel launch + constexpr int kernel_args = 7; + auto extra_args_size = extra_args.size(); + auto args = std::make_unique(kernel_args + extra_args_size); + args[0] = static_cast(&N); + args[1] = data_ptr; + args[2] = ic_ptr; + args[3] = oc_ptr; + args[4] = l_ptr; + args[5] = s_ptr; + args[6] = static_cast(&scalar_val); + + for (const auto i : c10::irange(extra_args_size)) { + // since 7 slots are already filled in `args` + args[i + 7] = const_cast(extra_args[i].data_ptr()); + } + + at::cuda::jit::launch_jitted_pwise_function(*fn_ptr, args.get(), {grid, 1u, 1u}, {num_threads(), 1u, 1u}); +} + +void jitted_gpu_kernel_dynamic_impl( + const std::string& kernel_name, + TensorIteratorBase& iter, + const std::string& f, + const bool dynamic_casting, + const std::vector& extra_args) { + + TORCH_INTERNAL_ASSERT(iter.can_use_32bit_indexing()); + TORCH_INTERNAL_ASSERT(iter.noutputs() == 1); + TORCH_INTERNAL_ASSERT(iter.ninputs() <= 8); + + ArrayVariant data(iter); + void* data_ptr = data.data_ptr(); + + int64_t numel = iter.numel(); + bool contiguous = iter.is_contiguous(); + + // Decides which of 4 kernel types to launch + // Variations are: + // - Case 1: no dynamic casting and contiguous + // - Case 2: no dynamic casting and noncontiguous + // - Case 3: dynamic casting and contiguous + // - Case 4: dynamic casting and noncontiguous + // These cases align with the non-jitted CUDALoops.cuh cases in gpu_kernel_impl + + if (!dynamic_casting) { + if (contiguous) { + // Case 1: no dynamic casting and contiguous + launch_jitted_vectorized_kernel_dynamic(kernel_name, iter, + iter.device().index(), numel, f, data_ptr, extra_args); + return; + } + + // Case 2: no dynamic casting and noncontiguous + OffsetCalculatorVariant input_offset_calculator(iter); + void* ic_ptr = input_offset_calculator.data_ptr(); + auto output_offset_calculator = make_output_offset_calculator(iter); + void* oc_ptr = static_cast(&output_offset_calculator); + + auto loader = memory::LoadWithoutCast(); + auto storer = memory::StoreWithoutCast(); + void* l_ptr = static_cast(&loader); + void* s_ptr = static_cast(&storer); + + launch_jitted_unrolled_kernel_dynamic( + kernel_name, iter, iter.device().index(), numel, f, data_ptr, + ic_ptr, oc_ptr, l_ptr, s_ptr, contiguous, dynamic_casting, extra_args); + + return; + } + + // Cases 3 and 4 are handled below + // Both require construction of a storer (this asserts 1 output) and one or more loaders + + // Creates load casts from inputs (note offset indexing into the iterators 1...n tensors) + LoadWithCastVariant loader(iter); + void* l_ptr = loader.data_ptr(); + + // Creates store cast to output (the zeroth tensor in TensorIterator) + auto storer = memory::StoreWithCast(iter.dtype(0)); + void* s_ptr = static_cast(&storer); + + if (contiguous) { + // Case 3: dynamic casting and contiguous + 
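jitted_gpu_kernel_dynamic_impl above selects one of four launch paths from the two booleans (contiguous, dynamic_casting); only the contiguous, non-casting case can take the vectorized launcher, while the other three go through the unrolled one with the appropriate offset calculators and loaders/storers. An illustrative helper making that split explicit:

enum class JitKernelKind { Vectorized, Unrolled };

static JitKernelKind pick_kernel(bool contiguous, bool dynamic_casting) {
  return (contiguous && !dynamic_casting) ? JitKernelKind::Vectorized
                                          : JitKernelKind::Unrolled;
}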
TrivialOffsetCalculatorVariant input_offset_calculator(iter); + void* ic_ptr = input_offset_calculator.data_ptr(); + + auto output_offset_calculator = TrivialOffsetCalculator<1>(); + void* oc_ptr = static_cast(&output_offset_calculator); + + launch_jitted_unrolled_kernel_dynamic( + kernel_name, iter, iter.device().index(), numel, f, data_ptr, + ic_ptr, oc_ptr, l_ptr, s_ptr, contiguous, dynamic_casting, extra_args); + return; + } + + // Case 4: dynamic casting and noncontiguous + OffsetCalculatorVariant input_offset_calculator(iter); + void* ic_ptr = input_offset_calculator.data_ptr(); + + auto output_offset_calculator = make_output_offset_calculator(iter); + void* oc_ptr = static_cast(&output_offset_calculator); + + launch_jitted_unrolled_kernel_dynamic( + kernel_name, iter, iter.device().index(), numel, f, data_ptr, + ic_ptr, oc_ptr, l_ptr, s_ptr, contiguous, dynamic_casting, extra_args); +} + +// Entrypoint for dynamic version of jitted GPU kernels, which accepts dynamic number of inputs +// and arbitrary types of input and extra args. This dynamic version is needed for jiterator with python interface, +// since the kernel definition is unknown at the compilation time. +// Similarly, launch_jitted_vectorized_kernel_dynamic and launch_jitted_unrolled_kernel_dynamic are created +// to handle arbitrary functions defined in python user code. +// For templated version, see note [Jiterator] in JitLoops.cuh for more details +void jitted_gpu_kernel_dynamic( + const std::string& kernel_name, + TensorIteratorBase& iter, + const std::string& f, + const std::vector& extra_args) { + + // TODO: much of preamble is common to both jitted_gpu_kernel and gpu_kernel + // Maybe it could be refactored? + for (int arg = 0; arg < iter.ntensors(); arg++) { + TORCH_INTERNAL_ASSERT( + iter.device(arg).is_cuda(), + "argument ", arg, ": expected a CUDA device but found ", iter.device(arg)); + } + + if (iter.numel() == 0) { + return; + } + + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + jitted_gpu_kernel_dynamic(kernel_name, sub_iter, f, extra_args); + } + return; + } + + // Computes if dynamic casting is needed + // Dynamic casting is needed if an input's or output's dtype differs from the common dtype + bool needs_dynamic_casting = false; + const at::ScalarType common_dtype = iter.common_dtype(); + for (auto i = 0; i < iter.ntensors(); ++i) { + if (iter.dtype(i) != common_dtype) { + needs_dynamic_casting = true; + break; + } + } + + jitted_gpu_kernel_dynamic_impl(kernel_name, iter, f, needs_dynamic_casting, extra_args); +} + +} // namespace native + +namespace cuda { + +at::Tensor CompileAndLaunchKernel( + const std::string& code_string, + const std::string& kernel_name, + const std::vector& tensors, + const std::vector& extra_args) { + + Tensor output; + TensorIteratorConfig config; + config + .set_check_mem_overlap(true) + .allow_cpu_scalars(false) + .promote_inputs_to_common_dtype(true) + .cast_common_dtype_to_outputs(true) + .enforce_safe_casting_to_output(true) + .check_all_same_device(true) + .add_owned_output(output); + for (const auto& t: tensors){ + config.add_input(t); + } + TensorIterator iter = config.build(); + + CUDAGuard guard(iter.device()); + at::native::jitted_gpu_kernel_dynamic(kernel_name, iter, code_string, extra_args); + + return iter.output(); +} + +}} // namespace at::cuda + +#endif // AT_USE_JITERATOR() diff --git a/aten/src/ATen/cuda/jiterator.h b/aten/src/ATen/cuda/jiterator.h new file mode 100644 index 000000000000..aa831fd06505 --- 
/dev/null +++ b/aten/src/ATen/cuda/jiterator.h @@ -0,0 +1,35 @@ +#pragma once +#include + +#if AT_USE_JITERATOR() + +#include +#include + +#include +#include + +namespace at { +namespace cuda { + +TORCH_CUDA_CPP_API at::Tensor CompileAndLaunchKernel( + const std::string& code_string, + const std::string& kernel_name, + const std::vector& tensors, + const std::vector& extra_args); + +}} // namespace at::cuda + +#else + +namespace at { namespace cuda { +TORCH_CUDA_CPP_API at::Tensor CompileAndLaunchKernel( + const std::string& code_string, + const std::string& kernel_name, + const std::vector& tensors, + const std::vector& extra_args) { + TORCH_CHECK(false, "Jiterator is not supported on ROCm"); + } +}} // namespace at::cuda + +#endif // AT_USE_JITERATOR() diff --git a/aten/src/ATen/cuda/jiterator_impl.h b/aten/src/ATen/cuda/jiterator_impl.h new file mode 100644 index 000000000000..4fa179b41a66 --- /dev/null +++ b/aten/src/ATen/cuda/jiterator_impl.h @@ -0,0 +1,208 @@ +#pragma once +#include + +#if AT_USE_JITERATOR() + +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace at { +namespace native { + +constexpr int NUM_INPUTS = 8; + +#define AT_FOR_8_INPUTS(_) \ + _(1) \ + _(2) \ + _(3) \ + _(4) \ + _(5) \ + _(6) \ + _(7) \ + _(8) + +c10::SmallVector get_extra_args_typenames(const std::vector& extra_args) { + c10::SmallVector args_typenames(extra_args.size()); + for (auto i = 0; i < extra_args.size(); ++i) { + args_typenames[i] = at::cuda::jit::typeName(extra_args[i].type()); + } + return args_typenames; +} + +int can_vectorize_up_to(at::ScalarType type, char* pointer) { + switch(type) { +#define DEFINE_CASE(ctype, scalartype) \ + case ScalarType::scalartype : return memory::can_vectorize_up_to(pointer); + + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_CASE) +#undef DEFINE_CASE + + default: TORCH_INTERNAL_ASSERT(false, "Unrecognized ScalarType: ", type); + } +} + +// jitted version of the above +// See Note [Jiterator], this relies on the assumptions enumerated there +int jitted_can_vectorize_up_to(const TensorIteratorBase& iter) { + const at::ScalarType common_dtype = iter.common_dtype(); + const at::ScalarType result_dtype = common_dtype; + + // Deals with output + int result = can_vectorize_up_to(result_dtype, static_cast(iter.data_ptr(0))); + + // Incorporates input(s) + for (auto i = 1; i < iter.ntensors(); ++i) { + result = std::min(result, can_vectorize_up_to(common_dtype, static_cast(iter.data_ptr(i)))); + } + + return result; +} + +template +static std::unique_ptr> make_unique_input_offset_calculator(const TensorIteratorBase& iter) { + // array size can not be 0, this happens when N == 0 + constexpr int array_size = std::max(N, 1); + TORCH_INTERNAL_ASSERT(N == iter.ntensors() - iter.noutputs()); + std::array strides; + int64_t element_sizes[array_size]; + for (int i = 0; i < N; i++) { + strides[i] = iter.strides(i + iter.noutputs()).data(); + element_sizes[i] = iter.element_size(i + iter.noutputs()); + } + return std::make_unique>(iter.ndim(), iter.shape().data(), strides.data(), element_sizes); +} + +struct OffsetCalculatorVariant { +#define DEFINE_CASE(index) std::unique_ptr>, + using OffsetCalculatorTypes = c10::variant< + AT_FOR_8_INPUTS(DEFINE_CASE) + >; +#undef DEFINE_CASE + + OffsetCalculatorVariant(const TensorIteratorBase& iter) { + int arity = iter.ninputs(); + switch(arity) { +#define DEFINE_CASE(index) \ + case index : v = make_unique_input_offset_calculator(iter); break; + + AT_FOR_8_INPUTS(DEFINE_CASE) +#undef DEFINE_CASE + default: 
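A usage sketch of the entry point declared in jiterator.h above. The templated-function form of the kernel string follows the convention the Python jiterator uses to drive this path; the wrapper name and the exact element types of the argument vectors are assumptions, not code from the patch:

#include <ATen/ATen.h>
#include <ATen/cuda/jiterator.h>
#include <string>

at::Tensor jiterator_add_one(const at::Tensor& t) {
  const std::string code =
      "template <typename T> T add_one(T x) { return x + T(1); }";
  // One input tensor and no extra scalar arguments; the output tensor is
  // allocated by the TensorIterator that CompileAndLaunchKernel builds.
  return at::cuda::CompileAndLaunchKernel(code, "add_one", {t}, {});
}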
+ TORCH_CHECK(false, "OffsetCalculatorVariant is not implemented for ninputs = ", arity); + } + } + + void* data_ptr() { + return c10::visit([](auto & v){ return static_cast(v.get()); }, v); + } + + private: + OffsetCalculatorTypes v; +}; + +struct ArrayVariant { + // notice: This would produce c10::variant> +#define DEFINE_CASE(index) at::detail::Array, + using ArrayTypes = c10::variant< + AT_FOR_8_INPUTS(DEFINE_CASE) + >; +#undef DEFINE_CASE + + ArrayVariant(const TensorIteratorBase& iter) { + int arity = iter.ninputs(); + // This assumes that jiterator kernels only have 1 output + switch(arity) { +#define DEFINE_CASE(index) \ + case index: array = at::detail::Array{}; break; + + AT_FOR_8_INPUTS(DEFINE_CASE) +#undef DEFINE_CASE + + default: + TORCH_CHECK(false, "ArrayVariant is not implemented for ninputs = ", arity); + } + + c10::visit([&](auto& a) { + for (auto i = 0; i < arity + 1; ++i) { + a[i] = (char*)iter.data_ptr(i); + } + }, array); + } + + void* data_ptr() { + return c10::visit([](auto & a){ return static_cast(&a); }, array); + } + +private: + ArrayTypes array; +}; + +struct TrivialOffsetCalculatorVariant { +#define DEFINE_CASE(index) TrivialOffsetCalculator, + using TrivialOffsetCalculatorTypes = c10::variant< + AT_FOR_8_INPUTS(DEFINE_CASE) + >; +#undef DEFINE_CASE + + TrivialOffsetCalculatorVariant(const TensorIteratorBase& iter) { + int arity = iter.ninputs(); + switch(arity) { +#define DEFINE_CASE(index) \ + case index: v = TrivialOffsetCalculator(); break; + + AT_FOR_8_INPUTS(DEFINE_CASE) +#undef DEFINE_CASE + + default: + TORCH_CHECK(false, "TrivialOffsetCalculatorVariant is not implemented for ninputs = ", arity); + } + } + + void* data_ptr() { + return c10::visit([](auto & v){ return static_cast(&v); }, v); + } + +private: + TrivialOffsetCalculatorTypes v; +}; + +struct LoadWithCastVariant { +#define DEFINE_CASE(index) std::unique_ptr>, + using LoadWithCastPtr = c10::variant< + AT_FOR_8_INPUTS(DEFINE_CASE) + >; +#undef DEFINE_CASE + + LoadWithCastVariant(const TensorIteratorBase& iter) { + int arity = iter.ninputs(); + switch(arity) { +#define DEFINE_CASE(index) \ + case index: v = std::make_unique>(iter); break; + + AT_FOR_8_INPUTS(DEFINE_CASE) +#undef DEFINE_CASE + + default: + TORCH_CHECK(false, "LoadWithCastVariant is not implemented for ninputs = ", arity); + } + } + + void* data_ptr() { + return c10::visit([](auto & v){ return static_cast(v.get()); }, v); + } + +private: + LoadWithCastPtr v; +}; + +}} // namespace at::native + + +#endif // AT_USE_JITERATOR() diff --git a/aten/src/ATen/cuda/llvm_complex.cpp b/aten/src/ATen/cuda/llvm_complex.cpp index 00339bdac0fb..55e39e280272 100644 --- a/aten/src/ATen/cuda/llvm_complex.cpp +++ b/aten/src/ATen/cuda/llvm_complex.cpp @@ -477,6 +477,14 @@ operator!=(const _Tp& __x, const complex<_Tp>& __y) return !(__x == __y); } +template +inline constexpr +bool +operator&&(const complex<_Tp>& __x, const complex<_Tp>& __y) +{ + return (__x.real() || __x.imag()) && (__y.real() || __y.imag()); +} + // 26.3.7 values: template ::value, @@ -583,10 +591,41 @@ arg(_Tp __re) )ESCAPE"; +const std::string complex_half_body = R"ESCAPE( +namespace std { +template <> +struct alignas(2) complex { + at::Half real_; + at::Half imag_; + + // Constructors + complex() = default; + + // implicit casting to and from `complex`. 
+ // NOTE: computation of `complex` will occur in `complex` + __host__ __device__ inline complex(const std::complex& value) + : real_(value.real()), imag_(value.imag()) {} + + inline __host__ __device__ operator std::complex() const { + return {real_, imag_}; + } + + at::Half real() const {return real_;} + at::Half imag() const {return imag_;} + +}; +} +)ESCAPE"; + + const std::string &get_complex_body_string() { return complex_body; } +const std::string &get_complex_half_body_string() { + return complex_half_body; +} + const std::string complex_math = R"ESCAPE( namespace std { @@ -724,6 +763,16 @@ log10(const complex<_Tp>& __x) return log(__x) / log(_Tp(10)); } +// log2 + +template +inline +complex<_Tp> +log2(const complex<_Tp>& __x) +{ + return log(__x) / log(_Tp(2)); +} + // sqrt template diff --git a/aten/src/ATen/cuda/llvm_jit_strings.h b/aten/src/ATen/cuda/llvm_jit_strings.h index dcbecd4279bb..237bcdbb4ccb 100644 --- a/aten/src/ATen/cuda/llvm_jit_strings.h +++ b/aten/src/ATen/cuda/llvm_jit_strings.h @@ -9,6 +9,7 @@ namespace cuda { TORCH_CUDA_CPP_API const std::string &get_traits_string(); TORCH_CUDA_CPP_API const std::string &get_cmath_string(); TORCH_CUDA_CPP_API const std::string &get_complex_body_string(); +TORCH_CUDA_CPP_API const std::string &get_complex_half_body_string(); TORCH_CUDA_CPP_API const std::string &get_complex_math_string(); }} // namespace at diff --git a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h index 9a77b87713ef..5dbe49953cf1 100644 --- a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h +++ b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h @@ -55,7 +55,9 @@ namespace at { namespace cuda { _(cuDevicePrimaryCtxGetState) \ _(cuLinkCreate) \ _(cuLinkAddData) \ - _(cuLinkComplete) + _(cuLinkComplete) \ + _(cuFuncSetAttribute) \ + _(cuFuncGetAttribute) #if defined(CUDA_VERSION) && CUDA_VERSION >= 11010 #define AT_FORALL_NVRTC(_) \ diff --git a/aten/src/ATen/cudnn/Descriptors.cpp b/aten/src/ATen/cudnn/Descriptors.cpp index d915fda024de..f954bbf5623a 100644 --- a/aten/src/ATen/cudnn/Descriptors.cpp +++ b/aten/src/ATen/cudnn/Descriptors.cpp @@ -19,6 +19,13 @@ inline cudnnDataType_t getDataType(const at::Tensor& t) { } else if (scalar_type == at::kDouble) { return CUDNN_DATA_DOUBLE; } +#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 8200 + else if (scalar_type == at::kBFloat16) { + return CUDNN_DATA_BFLOAT16; + } else if (scalar_type == at::kQInt8) { + return CUDNN_DATA_INT8; + } +#endif throw std::runtime_error("TensorDescriptor only supports double, float and half tensors"); } @@ -73,6 +80,10 @@ std::string cudnnTypeToString(cudnnDataType_t dtype) { return "CUDNN_DATA_DOUBLE"; case CUDNN_DATA_HALF: return "CUDNN_DATA_HALF"; +#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 8200 + case CUDNN_DATA_BFLOAT16: + return "CUDNN_DATA_BFLOAT16"; +#endif case CUDNN_DATA_INT8: return "CUDNN_DATA_INT8"; case CUDNN_DATA_INT32: diff --git a/aten/src/ATen/cudnn/Descriptors.h b/aten/src/ATen/cudnn/Descriptors.h index c704826511eb..a7bcb5eb72ea 100644 --- a/aten/src/ATen/cudnn/Descriptors.h +++ b/aten/src/ATen/cudnn/Descriptors.h @@ -21,6 +21,9 @@ std::string cudnnTypeToString(cudnnDataType_t dtype); inline int dataSize(cudnnDataType_t dataType) { switch (dataType) { +#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 8200 + case CUDNN_DATA_BFLOAT16: +#endif case CUDNN_DATA_HALF: return 2; case CUDNN_DATA_FLOAT: return 4; default: return 8; diff --git a/aten/src/ATen/cudnn/Handle.cpp b/aten/src/ATen/cudnn/Handle.cpp index 2b1d90f4b3cf..a6eb8fd78154 100644 --- 
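The log2 overload added to the jitted complex math above relies on the change-of-base identity log2(z) = log(z) / log(2). The same identity on host std::complex, for comparison:

#include <cmath>
#include <complex>

std::complex<double> complex_log2(std::complex<double> z) {
  return std::log(z) / std::log(2.0);
}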
a/aten/src/ATen/cudnn/Handle.cpp +++ b/aten/src/ATen/cudnn/Handle.cpp @@ -9,7 +9,7 @@ void createCuDNNHandle(cudnnHandle_t *handle) { AT_CUDNN_CHECK(cudnnCreate(handle)); } -void destroyCuDNNHandle(cudnnHandle_t handle) { +void destroyCuDNNHandle(cudnnHandle_t /*handle*/) { // this is because of something dumb in the ordering of // destruction. Sometimes atexit, the cuda context (or something) // would already be destroyed by the time this gets destroyed. It diff --git a/aten/src/ATen/cudnn/Types.cpp b/aten/src/ATen/cudnn/Types.cpp index 857a7da05127..215d42fcd23f 100644 --- a/aten/src/ATen/cudnn/Types.cpp +++ b/aten/src/ATen/cudnn/Types.cpp @@ -5,12 +5,18 @@ namespace at { namespace native { cudnnDataType_t getCudnnDataTypeFromScalarType(const at::ScalarType dtype) { - if (dtype == at::kFloat) { + if (dtype == c10::kQInt8) { + return CUDNN_DATA_INT8; + } else if (dtype == at::kFloat) { return CUDNN_DATA_FLOAT; } else if (dtype == at::kDouble) { return CUDNN_DATA_DOUBLE; } else if (dtype == at::kHalf) { return CUDNN_DATA_HALF; + } +#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 8200 + else if (dtype == at::kBFloat16) { + return CUDNN_DATA_BFLOAT16; } else if (dtype == at::kInt) { return CUDNN_DATA_INT32; } else if (dtype == at::kByte) { @@ -18,6 +24,7 @@ cudnnDataType_t getCudnnDataTypeFromScalarType(const at::ScalarType dtype) { } else if (dtype == at::kChar) { return CUDNN_DATA_INT8; } +#endif std::string msg("getCudnnDataTypeFromScalarType() not supported for "); msg += toString(dtype); throw std::runtime_error(msg); diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index 0454c2f30a22..1303b9f8c8bf 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -75,14 +75,15 @@ struct TORCH_API CUDAHooksInterface { } virtual const Generator& getDefaultCUDAGenerator(DeviceIndex device_index = -1) const { + (void)device_index; // Suppress unused variable warning TORCH_CHECK(false, "Cannot get default CUDA generator without ATen_cuda library. ", CUDA_HELP); } - virtual Device getDeviceFromPtr(void* data) const { + virtual Device getDeviceFromPtr(void* /*data*/) const { TORCH_CHECK(false, "Cannot get device of pointer on CUDA without ATen_cuda library. ", CUDA_HELP); } - virtual bool isPinnedPtr(void* data) const { + virtual bool isPinnedPtr(void* /*data*/) const { return false; } @@ -106,6 +107,10 @@ struct TORCH_API CUDAHooksInterface { return false; } + virtual bool hasROCM() const { + return false; + } + virtual const at::cuda::NVRTC& nvrtc() const { TORCH_CHECK(false, "NVRTC requires CUDA. ", CUDA_HELP); } @@ -159,19 +164,19 @@ struct TORCH_API CUDAHooksInterface { "Cannot query batchnormMinEpsilonCuDNN() without ATen_cuda library. ", CUDA_HELP); } - virtual int64_t cuFFTGetPlanCacheMaxSize(int64_t device_index) const { + virtual int64_t cuFFTGetPlanCacheMaxSize(int64_t /*device_index*/) const { TORCH_CHECK(false, "Cannot access cuFFT plan cache without ATen_cuda library. ", CUDA_HELP); } - virtual void cuFFTSetPlanCacheMaxSize(int64_t device_index, int64_t max_size) const { + virtual void cuFFTSetPlanCacheMaxSize(int64_t /*device_index*/, int64_t /*max_size*/) const { TORCH_CHECK(false, "Cannot access cuFFT plan cache without ATen_cuda library. ", CUDA_HELP); } - virtual int64_t cuFFTGetPlanCacheSize(int64_t device_index) const { + virtual int64_t cuFFTGetPlanCacheSize(int64_t /*device_index*/) const { TORCH_CHECK(false, "Cannot access cuFFT plan cache without ATen_cuda library. 
", CUDA_HELP); } - virtual void cuFFTClearPlanCache(int64_t device_index) const { + virtual void cuFFTClearPlanCache(int64_t /*device_index*/) const { TORCH_CHECK(false, "Cannot access cuFFT plan cache without ATen_cuda library. ", CUDA_HELP); } @@ -179,7 +184,7 @@ struct TORCH_API CUDAHooksInterface { return 0; } - virtual void deviceSynchronize(int64_t device_index) const { + virtual void deviceSynchronize(int64_t /*device_index*/) const { TORCH_CHECK(false, "Cannot synchronize CUDA device without ATen_cuda library. ", CUDA_HELP); } }; diff --git a/aten/src/ATen/gen_vulkan_glsl.py b/aten/src/ATen/gen_vulkan_glsl.py index d90afbf6a019..b43dcb6cfeff 100644 --- a/aten/src/ATen/gen_vulkan_glsl.py +++ b/aten/src/ATen/gen_vulkan_glsl.py @@ -4,7 +4,7 @@ import glob import sys import os -from tools.codegen.code_template import CodeTemplate +from torchgen.code_template import CodeTemplate H_NAME = "glsl.h" CPP_NAME = "glsl.cpp" diff --git a/aten/src/ATen/gen_vulkan_spv.py b/aten/src/ATen/gen_vulkan_spv.py index eb3542410a20..0d0906ded60e 100644 --- a/aten/src/ATen/gen_vulkan_spv.py +++ b/aten/src/ATen/gen_vulkan_spv.py @@ -6,7 +6,7 @@ import os import sys import subprocess -from tools.codegen.code_template import CodeTemplate +from torchgen.code_template import CodeTemplate H_NAME = "spv.h" CPP_NAME = "spv.cpp" diff --git a/aten/src/ATen/jit_macros.h b/aten/src/ATen/jit_macros.h index e1542d5fb605..bfe49b51b80a 100644 --- a/aten/src/ATen/jit_macros.h +++ b/aten/src/ATen/jit_macros.h @@ -8,7 +8,6 @@ #define AT_USE_JITERATOR() true #define jiterator_stringify(...) std::string(#__VA_ARGS__); #else - // TODO: update this to become a static assertion #define AT_USE_JITERATOR() false - #define jiterator_stringify(...) std::string("Jiterator is disabled"); + #define jiterator_stringify(...) static_assert(false, "Jiterator is not supported on ROCm"); #endif // USE_ROCM diff --git a/aten/src/ATen/jiterator_macros.h b/aten/src/ATen/jiterator_macros.h new file mode 100644 index 000000000000..2769537346c8 --- /dev/null +++ b/aten/src/ATen/jiterator_macros.h @@ -0,0 +1,38 @@ +#pragma once +#include +#include + +#define JITERATOR_HOST_DEVICE C10_HOST_DEVICE +#if defined(_MSC_VER) && defined(__CUDACC__) +// NVRTC on Windows errors if __host__ __device__ attribute is +// present on kernel. +// error: attribute "__host__" does not apply here +// error: attribute "__device__" does not apply here +#define JITERATOR_HOST_DEVICE +#endif + +// jiterator_also_stringify_as macro is used to define code (for CPU/ROCm) +// and generate code string for `jiterator` (only when compiling for CUDA). +// Usage : +// jiterator_also_stringify_as( +// jiterator_code(template T identity(T x) { return x; }), +// identity_string); +// This will define the template `identity` as present in code and +// also define `std::string identity_string` with the code as the string +// if this is being compiled for CUDA. + +// `jiterator_code` macro is to deal with `,` in the kernel code. +// These `,`s confuse the preprocessor into thinking we are passing +// multiple arguments to the macro. +#define jiterator_code(...) __VA_ARGS__ +#if defined(__CUDACC__) + // CPU and CUDA case + #define stringify_code(...) 
#__VA_ARGS__ + #define jiterator_also_stringify_as(code, str_name) \ + code /* define the function */ \ + const std::string str_name = std::string(stringify_code(code)); +#else + // CPU only or CPU and ROCm case + // Only needs the function + #define jiterator_also_stringify_as(code, str_name) code +#endif diff --git a/aten/src/ATen/mkl/SparseBlas.cpp b/aten/src/ATen/mkl/SparseBlas.cpp index 67dcb30e5283..1ad464b8d3a3 100644 --- a/aten/src/ATen/mkl/SparseBlas.cpp +++ b/aten/src/ATen/mkl/SparseBlas.cpp @@ -253,6 +253,39 @@ void mm>(MKL_SPARSE_MM_ARGTYPES(c10::complex)) { ldc)); } +#if !defined(_WIN32) +template <> +void spmmd(MKL_SPARSE_SPMMD_ARGTYPES(float)) { + TORCH_MKLSPARSE_CHECK(mkl_sparse_s_spmmd( + operation, A, B, layout, C, ldc)); +} +template <> +void spmmd(MKL_SPARSE_SPMMD_ARGTYPES(double)) { + TORCH_MKLSPARSE_CHECK(mkl_sparse_d_spmmd( + operation, A, B, layout, C, ldc)); +} +template <> +void spmmd>(MKL_SPARSE_SPMMD_ARGTYPES(c10::complex)) { + TORCH_MKLSPARSE_CHECK(mkl_sparse_c_spmmd( + operation, + A, + B, + layout, + reinterpret_cast(C), + ldc)); +} +template <> +void spmmd>(MKL_SPARSE_SPMMD_ARGTYPES(c10::complex)) { + TORCH_MKLSPARSE_CHECK(mkl_sparse_z_spmmd( + operation, + A, + B, + layout, + reinterpret_cast(C), + ldc)); +} +#endif + template <> void trsv(MKL_SPARSE_TRSV_ARGTYPES(float)) { TORCH_MKLSPARSE_CHECK(mkl_sparse_s_trsv(operation, alpha, A, descr, x, y)); diff --git a/aten/src/ATen/mkl/SparseBlas.h b/aten/src/ATen/mkl/SparseBlas.h index 7281b6950611..20fb59a54ff9 100644 --- a/aten/src/ATen/mkl/SparseBlas.h +++ b/aten/src/ATen/mkl/SparseBlas.h @@ -157,6 +157,29 @@ void mm>(MKL_SPARSE_MM_ARGTYPES(c10::complex)); template <> void mm>(MKL_SPARSE_MM_ARGTYPES(c10::complex)); +#define MKL_SPARSE_SPMMD_ARGTYPES(scalar_t) \ + const sparse_operation_t operation, const sparse_matrix_t A, \ + const sparse_matrix_t B, const sparse_layout_t layout, scalar_t *C, \ + const MKL_INT ldc + +template +inline void spmmd(MKL_SPARSE_SPMMD_ARGTYPES(scalar_t)) { + TORCH_INTERNAL_ASSERT( + false, + "at::mkl::sparse::spmmd: not implemented for ", + typeid(scalar_t).name()); +} + +template <> +void spmmd(MKL_SPARSE_SPMMD_ARGTYPES(float)); +template <> +void spmmd(MKL_SPARSE_SPMMD_ARGTYPES(double)); +template <> +void spmmd>(MKL_SPARSE_SPMMD_ARGTYPES(c10::complex)); +template <> +void spmmd>( + MKL_SPARSE_SPMMD_ARGTYPES(c10::complex)); + #define MKL_SPARSE_TRSV_ARGTYPES(scalar_t) \ const sparse_operation_t operation, const scalar_t alpha, \ const sparse_matrix_t A, const struct matrix_descr descr, \ diff --git a/aten/src/ATen/mkl/SparseDescriptors.h b/aten/src/ATen/mkl/SparseDescriptors.h index 46d656898a8d..e0dfb158e356 100644 --- a/aten/src/ATen/mkl/SparseDescriptors.h +++ b/aten/src/ATen/mkl/SparseDescriptors.h @@ -76,7 +76,7 @@ class MklSparseCsrDescriptor : public MklSparseDescriptor { public: MklSparseCsrDescriptor(const Tensor& input) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.is_sparse_csr()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY((input.layout() == kSparseCsr || input.layout() == kSparseBsr)); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.dim() == 2); TORCH_INTERNAL_ASSERT_DEBUG_ONLY( @@ -100,8 +100,10 @@ class MklSparseCsrDescriptor sparse_matrix_t raw_descriptor; - // Assuming that the last two dimensions are block elements of the matrix - if (values.dim() == 3) { + if (input.layout() == kSparseBsr) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + values.dim() == 3 && crow_indices.dim() == 1 && + col_indices.dim() == 1); TORCH_CHECK( values.size(-1) == values.size(-2), "MKL Sparse doesn't support 
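Returning to jiterator_macros.h above: jiterator_also_stringify_as relies on ordinary preprocessor stringification so one copy of the source is both compiled and captured as an NVRTC input string. A minimal standalone demonstration of that trick (the macro and function names here are illustrative):

#include <string>

#define DEMO_STRINGIFY(...) #__VA_ARGS__

// `twice` is compiled normally, and the same tokens are also kept as text.
template <typename T> T twice(T x) { return x + x; }
static const std::string twice_string =
    DEMO_STRINGIFY(template <typename T> T twice(T x) { return x + x; });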
matrices with non-square blocks."); diff --git a/aten/src/ATen/mps/EmptyTensor.cpp b/aten/src/ATen/mps/EmptyTensor.cpp new file mode 100644 index 000000000000..fd2b0b0e536a --- /dev/null +++ b/aten/src/ATen/mps/EmptyTensor.cpp @@ -0,0 +1,119 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include +#include +#include +#include +#include + +#define MPS_ERROR_NOT_COMPILED "PyTorch code is not compiled with MPS enabled" +#define MPS_ERROR_RUNTIME_TOO_LOW \ + "The MPS backend is supported on MacOS 12.3+.", \ + "Current OS version can be queried using `sw_vers`" + +namespace at { namespace detail { +TensorBase empty_mps( + IntArrayRef size, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt, + c10::optional memory_format_opt) { +#if defined(__APPLE__) +#if __is_target_os(macOS) + if (__builtin_available(macOS 12.3, *)) { + auto device = device_or_default(device_opt); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device.type() == DeviceType::MPS); + + TORCH_CHECK_NOT_IMPLEMENTED( + layout_or_default(layout_opt) == Layout::Strided, + "strided tensors not supported yet"); + check_size_nonnegative(size); + + auto* allocator = at::mps::GetMPSAllocator(); + int64_t nelements = c10::multiply_integers(size); + auto dtype = dtype_or_default(dtype_opt); + auto dtype_meta = scalarTypeToTypeMeta(dtype); + int64_t size_bytes = nelements * dtype_meta.itemsize(); + auto storage_impl = c10::make_intrusive( + c10::StorageImpl::use_byte_size_t(), + size_bytes, + allocator->allocate(size_bytes), + allocator, + /*resizeable=*/true); + + auto tensor = + detail::make_tensor(storage_impl, DispatchKey::MPS, dtype_meta); + // Default TensorImpl has size [0] + if (size.size() != 1 || size[0] != 0) { + tensor.unsafeGetTensorImpl()->set_sizes_contiguous(size); + } + + auto memory_format = memory_format_opt.value_or(MemoryFormat::Contiguous); + tensor.unsafeGetTensorImpl()->empty_tensor_restride(memory_format); + return tensor; + } else { + TORCH_CHECK(false, MPS_ERROR_RUNTIME_TOO_LOW) + } +#else + TORCH_CHECK(false, MPS_ERROR_NOT_COMPILED) +#endif +#else + TORCH_CHECK(false, MPS_ERROR_NOT_COMPILED) +#endif +} + +TensorBase empty_mps( + IntArrayRef size, const TensorOptions &options) { + return at::detail::empty_mps( + size, + optTypeMetaToScalarType(options.dtype_opt()), + options.layout_opt(), + options.device_opt(), + options.pinned_memory_opt(), + options.memory_format_opt()); +} + +TensorBase empty_strided_mps( + IntArrayRef size, + IntArrayRef stride, + ScalarType dtype, + c10::optional device_opt) { +#if defined(__APPLE__) +#if __is_target_os(macOS) + if (__builtin_available(macOS 12.3, *)) { + auto device = device_or_default(device_opt); + TORCH_INTERNAL_ASSERT(device.is_mps()); + const DeviceGuard device_guard(device); + auto* allocator = at::mps::GetMPSAllocator(); + constexpr c10::DispatchKeySet mps_dks(c10::DispatchKey::MPS); + return at::detail::empty_strided_generic( + size, stride, allocator, mps_dks, dtype); + } else { + TORCH_CHECK(false, MPS_ERROR_RUNTIME_TOO_LOW) + } +#else + TORCH_CHECK(false, MPS_ERROR_NOT_COMPILED) +#endif +#else + TORCH_CHECK(false, MPS_ERROR_NOT_COMPILED) +#endif +} + +TensorBase empty_strided_mps( + IntArrayRef size, + IntArrayRef stride, + const TensorOptions &options) { + return at::native::empty_strided_mps( + size, + stride, + optTypeMetaToScalarType(options.dtype_opt()), + options.layout_opt(), + options.device_opt(), + options.pinned_memory_opt()); +} + +} // namespace detail +} // namespace at diff --git 
a/aten/src/ATen/mps/EmptyTensor.h b/aten/src/ATen/mps/EmptyTensor.h new file mode 100644 index 000000000000..fcdb7e152da9 --- /dev/null +++ b/aten/src/ATen/mps/EmptyTensor.h @@ -0,0 +1,31 @@ +// Copyright © 2022 Apple Inc. + +#pragma once +#include + +namespace at { +namespace detail { + +C10_EXPORT TensorBase empty_mps( + IntArrayRef size, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt, + c10::optional memory_format_opt); +C10_EXPORT TensorBase empty_mps( + IntArrayRef size, const TensorOptions &options); + +C10_EXPORT TensorBase empty_strided_mps( + IntArrayRef size, + IntArrayRef stride, + ScalarType dtype, + c10::optional device_opt); + +C10_EXPORT TensorBase empty_strided_mps( + IntArrayRef size, + IntArrayRef stride, + const TensorOptions &options); + +} // namespace detail +} // namespace at diff --git a/aten/src/ATen/mps/MPSAllocator.h b/aten/src/ATen/mps/MPSAllocator.h new file mode 100644 index 000000000000..7e3d3a653517 --- /dev/null +++ b/aten/src/ATen/mps/MPSAllocator.h @@ -0,0 +1,244 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#include +#include +#include +#endif + +// this implementation is based on CUDACachingAllocator. +// It utilizes Metal Heaps to improve the performance with buffer allocation. +// TODO: Unify the logic with CUDACachingAllocator and remove redundant code. +namespace at { +namespace mps { + +namespace HeapAllocator { + +#define MB(x) round_page(x * 1048576UL) + +static const size_t kMaxSmallAlloc = MB(1); // largest "small" allocation is 1 MiB +static const size_t kMinLargeAlloc = MB(10); // allocations between 1 and 10 MiB may use kLargeHeap +static const size_t kSmallHeap = MB(8); // "small" allocations are packed in 8 MiB heaps +static const size_t kLargeHeap = MB(32); // "large" allocations may be packed in 32 MiB heaps +static const size_t kRoundLarge = MB(2); // round up large allocations to 2 MiB + +// TODO: check the caching performance of write-combined mode +constexpr MTLResourceOptions kCPUCacheMode = MTLResourceOptionCPUCacheModeDefault; +constexpr MTLResourceOptions kPrivateResourceOptions = kCPUCacheMode | MTLResourceStorageModePrivate; +constexpr MTLResourceOptions kSharedResourceOptions = kCPUCacheMode | MTLResourceStorageModeShared; + +struct HeapBlock; + +struct BufferBlock +{ + id buffer; + size_t size; + bool in_use; + HeapBlock* heap; + id_t buf_id; + + BufferBlock(size_t Size, const id Buffer = nullptr, HeapBlock* Heap = nullptr, id_t BufID = 0) : + buffer(Buffer), size(Size), in_use(false), heap(Heap), buf_id(BufID) { } + + static bool Comparator(const BufferBlock* a, const BufferBlock* b) { + return (a->size != b->size) ? a->size < b->size : (uintptr_t)a->buffer < (uintptr_t)b->buffer; + } + static size_t alignUp(size_t Size, size_t Alignment) { + assert(((Alignment - 1) & Alignment) == 0); + return ((Size + Alignment - 1) & ~(Alignment - 1)); + } +}; +typedef bool (*BufferComparison)(const BufferBlock*, const BufferBlock*); + +struct BufferPool; + +struct HeapBlock +{ + id heap; + struct { size_t total, available; } size; + BufferPool* pool; + unsigned int n_buffers; + + HeapBlock(size_t Size, const id Heap = nullptr, BufferPool *Pool = nullptr) : + heap(Heap), size({.total = Size, .available = Size}), pool(Pool), n_buffers(0) { } + + static MTLResourceOptions getOptions(bool SharedStorage = false) { return SharedStorage ? 
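BufferBlock::alignUp in MPSAllocator.h above rounds a requested size up to a power-of-two alignment with the usual mask trick. A standalone restatement with worked values:

#include <cassert>
#include <cstddef>

// Rounds `size` up to the next multiple of `alignment` (a power of two).
static size_t align_up(size_t size, size_t alignment) {
  assert(((alignment - 1) & alignment) == 0);    // power-of-two check, as above
  return (size + alignment - 1) & ~(alignment - 1);
}
// align_up(1000, 256) == 1024;  align_up(4096, 4096) == 4096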
kSharedResourceOptions : kPrivateResourceOptions; } + + static id createMTLHeap(id device, size_t size, bool is_shared) { + id heap = nil; + MTLHeapDescriptor *d = [MTLHeapDescriptor new]; + if (d) { + if (size <= kMaxSmallAlloc) { + d.size = kSmallHeap; + } else if (size < kMinLargeAlloc) { + d.size = kLargeHeap; + } else { + d.size = kRoundLarge * ((size + kRoundLarge - 1) / kRoundLarge); + } + d.storageMode = is_shared ? MTLStorageModeShared : MTLStorageModePrivate; + d.cpuCacheMode = MTLCPUCacheModeDefaultCache; + // this automatically handles Metal buffer access synchronizations at the + // cost of slightly lower performance. + d.hazardTrackingMode = MTLHazardTrackingModeTracked; + d.resourceOptions = getOptions(is_shared) | (MTLHazardTrackingModeTracked << MTLResourceHazardTrackingModeShift); + d.type = MTLHeapTypeAutomatic; + heap = [device newHeapWithDescriptor: d]; + if (heap) { + [heap setPurgeableState:MTLPurgeableStateEmpty]; + } + [d release]; + } + return heap; + } + static bool Comparator(const HeapBlock* a, const HeapBlock* b) { + return a->size.available < b->size.available; + } + static NSUInteger heapAvailableSize(id heap, size_t Alignment = vm_page_size) { + return [heap maxAvailableSizeWithAlignment:Alignment]; + } + id newMTLBuffer(size_t length, bool is_shared) { + id buf = [heap newBufferWithLength:length options:getOptions(is_shared)]; + if (buf) { + size.available = heapAvailableSize(heap); + n_buffers++; + } + return buf; + } + void releaseMTLBuffer(id buffer) { + [buffer release]; + size.available = heapAvailableSize(heap); + n_buffers--; + } + void releaseMTLHeap() { + TORCH_INTERNAL_ASSERT(!n_buffers); // assert if heap isn't empty + [heap release]; + size.available = 0; + } +}; +typedef bool (*HeapComparison)(const HeapBlock*, const HeapBlock*); + +struct BufferPool +{ + BufferPool(const id Device, bool Small, bool Shared) : + device(Device), is_small(Small), is_shared(Shared), + heaps(HeapBlock::Comparator), buffers(BufferBlock::Comparator) { } + + const id device; + // small heaps have sizes of kSmallHeap, and large ones kLargeHeap + const bool is_small; + // private pools allocated on device memory; otherwise, shared between host/device + const bool is_shared; + // list of heaps ordered by their "available" (not total) memory size + std::set heaps; + // list of only "available" buffers in the pool (i.e., buffers not in-use) + std::set buffers; +}; + +struct AllocParams +{ + AllocParams(size_t Alloc_Size, size_t Requested_Size, BufferPool* Pool) : + search_key(Alloc_Size), pool(Pool), + buffer_block(nullptr), requested_size(Requested_Size) {} + size_t size() const { return search_key.size; } + + BufferBlock search_key; + BufferPool* pool; + BufferBlock* buffer_block; + size_t requested_size; +}; + +class MPSHeapAllocatorImpl +{ +public: + explicit MPSHeapAllocatorImpl() : + m_device(at::mps::MPSDevice::getInstance()->device()), + m_large_pool_shared(m_device, false, true), m_large_pool_private(m_device, false, false), + m_small_pool_shared(m_device, true , true), m_small_pool_private(m_device, true , false), + m_total_allocated_memory(0), m_max_buffer_size([m_device maxBufferLength]), + m_set_fraction(false), m_enable_debug_info(false) { } + + // interface exposed to at::Allocator + id Malloc(size_t size, bool sharedStorage); + void Free(void* ptr); + void EmptyCache(); + bool isSharedBuffer(void* ptr); + + inline id Device() const { return m_device; } + void enable_debug_info() { m_enable_debug_info = true; } + bool debug_info_enabled() const { return 
m_enable_debug_info; } + void set_shared_storage_mode(bool useSharedStorage); + +private: + const id m_device; + std::mutex m_mutex; + // allocated buffers by device pointer + ska::flat_hash_map m_allocated_buffers; + // unallocated cached buffers larger than 1 MB + BufferPool m_large_pool_shared, m_large_pool_private; + // unallocated cached buffers 1 MB or smaller + BufferPool m_small_pool_shared, m_small_pool_private; + // total memory allocated by HeapAllocator + size_t m_total_allocated_memory; + // max buffer size allowed by Metal + size_t m_max_buffer_size; + // sets a soft upper bound to limit the total allocations + bool m_set_fraction; + // use "PYTORCH_DEBUG_MPS_ALLOCATOR" env-var to enable debug info + bool m_enable_debug_info; + + HeapBlock* get_free_heap(AllocParams& p); + bool get_free_buffer(AllocParams& p); + BufferBlock* get_allocated_buffer_block(void* ptr); + bool alloc_buffer(AllocParams& p); + void free_buffer(BufferBlock* buffer_block); + void release_buffer(BufferBlock* buffer_block, bool remove_empty_heap = true); + void release_buffers(BufferPool& pool); + bool release_available_cached_buffers(const AllocParams& p); + bool release_cached_buffers(); + + BufferPool& get_pool(size_t Size, bool useShared) { + return Size <= kMaxSmallAlloc ? (useShared ? m_small_pool_shared : m_small_pool_private) : + (useShared ? m_large_pool_shared : m_large_pool_private); + } + + size_t get_allocation_size(size_t Length, bool useShared) { + MTLSizeAndAlign sizeAlign = [m_device heapBufferSizeAndAlignWithLength:Length + options:HeapBlock::getOptions(useShared)]; + return BufferBlock::alignUp(sizeAlign.size, sizeAlign.align); + } + // TODO: make this configurable + static size_t max_split_size() { return std::numeric_limits::max(); } + // maximum size of device memory available for allocation in current process + size_t max_available_size() const { return [m_device recommendedMaxWorkingSetSize] - [m_device currentAllocatedSize]; } + + // TODO: make a common function to do size unit conversions in PyTorch. + static std::string format_size(uint64_t size) { + std::ostringstream os; + os.precision(2); + os << std::fixed; + if (size <= 1024UL) { os << size << " bytes"; } + else if (size <= 1048576UL) { os << ((float) size / 1024.0) << " KB"; } + else if (size <= 1073741824UL) { os << ((float) size / 1048576.0) << " MB"; } + else { os << ((float) size / 1073741824.0) << " GB"; } + return os.str(); + } +}; + +} // namespace HeapAllocator + +} // namespace mps +} // namespace at diff --git a/aten/src/ATen/mps/MPSAllocator.mm b/aten/src/ATen/mps/MPSAllocator.mm new file mode 100644 index 000000000000..0c30af5c36b5 --- /dev/null +++ b/aten/src/ATen/mps/MPSAllocator.mm @@ -0,0 +1,351 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include +#include + +namespace at { +namespace mps { + +namespace HeapAllocator { + +HeapBlock* MPSHeapAllocatorImpl::get_free_heap(AllocParams& p) +{ + BufferPool *pool = p.pool; + HeapBlock *heapBlock = nullptr; + HeapBlock search_key(p.size()); + + auto it = pool->heaps.lower_bound(&search_key); + if (it == pool->heaps.end()) { + id heap = HeapBlock::createMTLHeap(pool->device, p.size(), pool->is_shared); + if (heap) { + size_t heap_size = HeapBlock::heapAvailableSize(heap); + heapBlock = new HeapBlock(heap_size, heap, pool); + + if (debug_info_enabled()) { + static unsigned int heap_counter = 0; + std::cerr << "\nAllocated " + << (pool->is_small ? "small " : "large ") + << (pool->is_shared ? 
"shared " : "private ") + << "heap of size " << format_size(heap_size) + << " (#heaps: " << (++heap_counter) + << ", free memory: " << format_size(max_available_size()) << ")\n"; + } + } + } else { + heapBlock = *it; + // remove and re-insert heap in the set later after a buffer is created. + // this ensures updating the order of heaps based on their new available sizes + pool->heaps.erase(it); + } + return heapBlock; +} + +bool MPSHeapAllocatorImpl::alloc_buffer(AllocParams& p) +{ + if (m_set_fraction && m_total_allocated_memory + p.size() > max_available_size()) + return false; + + HeapBlock *heap = get_free_heap(p); + if (!heap) + return false; // this will cause releasing pool buffers to free up memory + + id buffer = heap->newMTLBuffer(p.size(), p.pool->is_shared); + // this should never happen as the backing memory (i.e., heap) was allocated successfully. + TORCH_INTERNAL_ASSERT(buffer); + // insert heap after a buffer was created on it to update the order of heap's set + p.pool->heaps.insert(heap); + p.buffer_block = new BufferBlock(p.size(), buffer, heap, m_allocated_buffers.size() + 1); + m_allocated_buffers[p.buffer_block->buffer] = p.buffer_block; + m_total_allocated_memory += p.size(); + + if (debug_info_enabled()) { + std::cerr << "Allocated " + << (p.pool->is_shared ? "shared" : "private") + << " buffer #" << p.buffer_block->buf_id + << " with aligned size " << format_size(p.size()) + << " (requested size: " << format_size(p.requested_size) + << ", heap size: " << format_size(heap->size.available) + << ", total allocated: " << format_size(m_total_allocated_memory) << ")\n"; + } + return true; +} + +bool MPSHeapAllocatorImpl::get_free_buffer(AllocParams& p) +{ + BufferPool& pool = *p.pool; + auto it = pool.buffers.lower_bound(&p.search_key); + if (it == pool.buffers.end()) + return false; + // do not return an oversized buffer for a large request + // allow oversized buffer size to be rounded up but within a limit + if ((p.size() < max_split_size() && (*it)->size >= max_split_size()) || + ((p.size() >= max_split_size()) && ((*it)->size >= p.size() + kLargeHeap))) + return false; + + p.buffer_block = *it; + pool.buffers.erase(it); + if (debug_info_enabled()) { + std::cerr << "Reusing " + << (p.pool->is_shared ? "shared" : "private") + << " buffer #" << p.buffer_block->buf_id + << " with aligned size " << format_size(p.buffer_block->size) + << " (requested size: " << format_size(p.requested_size) << ")\n"; + } + return true; +} + +id MPSHeapAllocatorImpl::Malloc(size_t size, bool sharedStorage) +{ + TORCH_CHECK(size < m_max_buffer_size, "Invalid buffer size: ", format_size(size)); + + std::lock_guard lock(m_mutex); + __block id buf = nil; + + size_t alloc_size = get_allocation_size(size, sharedStorage); + auto& pool = get_pool(alloc_size, sharedStorage); + AllocParams params(alloc_size, size, &pool); + + bool block_found = + // Search pool + get_free_buffer(params) || + // Attempt allocate + alloc_buffer(params) || + // Free enough available cached blocks to satisfy alloc and retry alloc. + (release_available_cached_buffers(params) && alloc_buffer(params)) || + // Free all non-split cached buffers and retry alloc. 
+ (release_cached_buffers() && alloc_buffer(params)); + + BufferBlock* buffer_block = params.buffer_block; + TORCH_INTERNAL_ASSERT(block_found && buffer_block); + buffer_block->in_use = true; + return buffer_block->buffer; +} + +void MPSHeapAllocatorImpl::free_buffer(BufferBlock* buffer_block) +{ + TORCH_INTERNAL_ASSERT(buffer_block->in_use); + buffer_block->in_use = false; + BufferPool *pool = buffer_block->heap->pool; + // Makes sure the BufferBlock* isn't already present in the pool we're freeing it back into. + TORCH_INTERNAL_ASSERT(pool->buffers.insert(buffer_block).second); +} + +BufferBlock* MPSHeapAllocatorImpl::get_allocated_buffer_block(void* ptr) +{ + id buf = __builtin_bit_cast(id, ptr); + auto it = m_allocated_buffers.find(buf); + if (it == m_allocated_buffers.end()) + return nullptr; + + return it->second; +} + +bool MPSHeapAllocatorImpl::isSharedBuffer(void* ptr) +{ + std::lock_guard lock(m_mutex); + + BufferBlock *buffer_block = get_allocated_buffer_block(ptr); + // it's OK for the buffer_block to not exist yet + return buffer_block && buffer_block->heap->pool->is_shared; +} + +void MPSHeapAllocatorImpl::Free(void* ptr) +{ + std::lock_guard lock(m_mutex); + + BufferBlock *buffer_block = get_allocated_buffer_block(ptr); + TORCH_INTERNAL_ASSERT(buffer_block); + free_buffer(buffer_block); +} + +void MPSHeapAllocatorImpl::EmptyCache() +{ + std::lock_guard lock(m_mutex); + release_cached_buffers(); +} + +void MPSHeapAllocatorImpl::release_buffer(BufferBlock* buffer_block, bool remove_empty_heap) +{ + HeapBlock *heap = buffer_block->heap; + BufferPool *pool = heap->pool; + m_total_allocated_memory -= buffer_block->size; + m_allocated_buffers.erase(buffer_block->buffer); + pool->buffers.erase(buffer_block); + // will re-insert later to keep the heaps list sorted based on heap's new available size (if heap not empty) + pool->heaps.erase(heap); + heap->releaseMTLBuffer(buffer_block->buffer); + if (debug_info_enabled()) { + std::cerr << "Released buffer #" << buffer_block->buf_id + << " of size " << format_size(buffer_block->size) + << " (heap size: " << format_size(heap->size.available) + << ", total allocated: " << format_size(m_total_allocated_memory) << ")\n"; + + } + delete buffer_block; + + if (remove_empty_heap && heap->n_buffers == 0) { + heap->releaseMTLHeap(); + if (debug_info_enabled()) { + std::cerr << "Released heap of size " << format_size(heap->size.total) + << " (free memory: " << format_size(max_available_size()) << ")\n"; + } + delete heap; + } else { + pool->heaps.insert(heap); + } +} + +void MPSHeapAllocatorImpl::release_buffers(BufferPool& pool) +{ + auto it = pool.buffers.begin(); + while (it != pool.buffers.end()) { + BufferBlock* buffer_block = *it; + ++it; + release_buffer(buffer_block); + } +} + +bool MPSHeapAllocatorImpl::release_available_cached_buffers(const AllocParams& p) +{ + BufferPool& pool = *p.pool; + + if (max_split_size() == std::numeric_limits::max() || pool.buffers.empty()) + return false; + + BufferBlock key = p.search_key; + key.size = (key.size < max_split_size()) ? 
max_split_size() : key.size; + auto it = pool.buffers.lower_bound(&key); + if (it == pool.buffers.end()) { + size_t totalReleased = 0; + --it; + while ((totalReleased < key.size) && ((*it)->size >= max_split_size())) { + auto cur = it; + totalReleased += (*it)->size; + if (it != pool.buffers.begin()) { + --it; + release_buffer(*cur); + } else { + release_buffer(*cur); + break; + } + } + if (totalReleased < key.size) + return false; + } else { + release_buffer(*it); + } + return true; +} + +bool MPSHeapAllocatorImpl::release_cached_buffers() +{ + // Free all cached blocks to system allocator + release_buffers(m_large_pool_private); + release_buffers(m_large_pool_shared); + release_buffers(m_small_pool_private); + release_buffers(m_small_pool_shared); + return true; +} + +} // namespace HeapAllocator + +// Use "at::mps::GetMPSAllocator()" to acquire a handle to MPS Allocator +static HeapAllocator::MPSHeapAllocatorImpl s_allocatorImpl; + +// MPS allocator struct to be registered with Pytorch +struct TORCH_API MPSAllocator final : public at::Allocator { +public: + explicit MPSAllocator(bool useSharedStorage) : + m_has_unified_memory(s_allocatorImpl.Device().hasUnifiedMemory), m_use_shared_storage(useSharedStorage) + { + const bool enable_debug_info = isEnvVarEnabled("PYTORCH_DEBUG_MPS_ALLOCATOR"); + if (enable_debug_info) { + s_allocatorImpl.enable_debug_info(); + if (!m_use_shared_storage || m_has_unified_memory) { + std::cerr << "Initializing " + << (useSharedStorage ? "shared" : "private") + << " heap allocator on " + << (m_has_unified_memory ? "unified" : "discrete") + << " device memory of size " + << s_allocatorImpl.Device().recommendedMaxWorkingSetSize / 1048576UL << " MB\n"; + } + } + } + + ~MPSAllocator() override { + s_allocatorImpl.EmptyCache(); + } + + DataPtr allocate(const size_t nbytes) const override { + __block id buf = nbytes > 0 ? s_allocatorImpl.Malloc(nbytes, m_use_shared_storage) : nullptr; + return { buf, buf, &Delete, at::Device(at::DeviceType::MPS, 0)}; + } + + DeleterFnPtr raw_deleter() const override { return &Delete; } + bool is_shared(void* ptr) const { return s_allocatorImpl.isSharedBuffer(ptr); } + bool is_shared_storge_supported() const { return m_has_unified_memory; } + +private: + bool m_has_unified_memory; + // use shared buffers on unified memory + bool m_use_shared_storage; + + static void Delete(void* ptr) { if (ptr) s_allocatorImpl.Free(ptr); } + + static bool isEnvVarEnabled(const char *envvar) { + const char *e = getenv(envvar); + if (e) { + char *t = (char*) e; + long val = strtol(e, &t, 0); + return (t != e && val != 0); + } + return false; + } +}; + +static MPSAllocator s_mps_shared_alloc(true); +at::Allocator* getMPSSharedAllocator() +{ + if (s_mps_shared_alloc.is_shared_storge_supported()) + return &s_mps_shared_alloc; + + return nullptr; +} + +} // namespace mps + +namespace native { + +// torch.is_pinned() implementation +// Pinned memory will be helpful on Apple Silicon Macs with Unified memory as we +// will be able to use SharedStorageMode for MTLBuffer allocations. This will +// avoid extra copies on DataLoading operations. 
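[Editor's note -- not part of the patch.] A minimal sketch of how the shared-storage path described above can be exercised through the GetMPSAllocator() entry point declared in the MPSDevice.h header added in this diff. The function name pinned_buffer_example, the 4096-byte size, and the exact includes are assumptions for illustration only.

#include <ATen/mps/MPSDevice.h>  // declares at::mps::GetMPSAllocator() (added in this diff)

void pinned_buffer_example() {
  // Ask for the shared-storage allocator; per GetMPSAllocator() in this diff it
  // forwards to getMPSSharedAllocator(), which is nullptr without unified memory.
  auto* alloc = at::mps::GetMPSAllocator(/*useSharedAllocator=*/true);
  if (alloc == nullptr) {
    return;  // discrete GPU: MTLStorageModeShared buffers are not available
  }
  // The returned DataPtr is backed by an MTLBuffer in shared storage mode, so the
  // same memory is visible to CPU and GPU -- this is what lets _pin_memory_mps
  // below hand the storage to the shared allocator instead of copying.
  auto data_ptr = alloc->allocate(4096);
}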
+bool is_pinned_mps(const Tensor& self, c10::optional device) +{ + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_mps()); + return at::mps::s_mps_shared_alloc.is_shared(self.storage().data()); +} + +// torch.pin_memory() implementation +Tensor _pin_memory_mps(const Tensor& self, c10::optional device) +{ + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_mps()); + auto* shared_allocator = at::mps::getMPSSharedAllocator(); + TORCH_CHECK(shared_allocator, "unable to pin memory on a non-unified memory device"); + + const size_t storage_size = detail::computeStorageNbytes(self.sizes(), self.strides(), self.dtype().itemsize()); + std::cout << "Pinning memory of size " << storage_size / 1024UL << " KB\n"; + auto storage = Storage(Storage::use_byte_size_t(), storage_size, shared_allocator, false); + auto tensor = at::cpu::empty({0}, self.options()).set_(storage, 0, self.sizes(), self.strides()); + tensor.copy_(self); + return tensor; +} + +} // namespace native + +static mps::MPSAllocator s_mps_private_alloc(false); +REGISTER_ALLOCATOR(DeviceType::MPS, &s_mps_private_alloc); + +} // namespace at diff --git a/aten/src/ATen/mps/MPSDevice.h b/aten/src/ATen/mps/MPSDevice.h new file mode 100644 index 000000000000..a4a4b869b44c --- /dev/null +++ b/aten/src/ATen/mps/MPSDevice.h @@ -0,0 +1,62 @@ +// Copyright © 2022 Apple Inc. + +#pragma once +#include +#include +#include + + +#ifdef __OBJC__ +#include +#include +#include +typedef id MTLDevice_t; +#else +typedef void* MTLDevice; +typedef void* MTLDevice_t; +#endif + +using namespace std; + +namespace at { +namespace mps { + +//----------------------------------------------------------------- +// MPSDevice +// +// MPSDevice is a singleton class that returns the default device +//----------------------------------------------------------------- + +class TORCH_API MPSDevice { + public: + /** + * MPSDevice should not be cloneable. + */ + MPSDevice(MPSDevice& other) = delete; + /** + * MPSDevice should not be assignable. + */ + void operator=(const MPSDevice&) = delete; + /** + * Gets single instance of the Device. + */ + static MPSDevice* getInstance(); + /** + * Returns the single device. + */ + MTLDevice_t device() { + return _mtl_device; + } + + ~MPSDevice(); + + private: + static MPSDevice* _device; + MTLDevice_t _mtl_device; + MPSDevice(); +}; + +at::Allocator* GetMPSAllocator(bool useSharedAllocator = false); + +} // namespace mps +} // namespace at diff --git a/aten/src/ATen/mps/MPSDevice.mm b/aten/src/ATen/mps/MPSDevice.mm new file mode 100644 index 000000000000..8ade0a1f7817 --- /dev/null +++ b/aten/src/ATen/mps/MPSDevice.mm @@ -0,0 +1,62 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include +#include +#include + +namespace at { +namespace mps { + +static std::unique_ptr mps_device; +static std::once_flag mpsdev_init; + +MPSDevice* MPSDevice::getInstance() { + std::call_once(mpsdev_init, [] { + mps_device = std::unique_ptr(new MPSDevice()); + }); + return mps_device.get(); +} + +MPSDevice::~MPSDevice() { + [_mtl_device release]; + _mtl_device = nil; +} + +MPSDevice::MPSDevice() { + NSArray* devices = MTLCopyAllDevices(); + for (unsigned long i = 0 ; i < [devices count] ; i++) { + id device = devices[i]; + if(![device isLowPower]) { // exclude Intel GPUs + _mtl_device = device; + break; + } + } + assert(_mtl_device); +} + +at::Allocator* getMPSSharedAllocator(); +at::Allocator* GetMPSAllocator(bool useSharedAllocator) { + return useSharedAllocator ? 
getMPSSharedAllocator() : GetAllocator(DeviceType::MPS); +} + +} // namespace mps + +TORCH_LIBRARY_IMPL(aten, MPS, m) { + m.impl("bitwise_and.Tensor_out", torch::CppFunction::makeFromBoxedFunction<&native::cpu_fallback>()); + m.impl("embedding_renorm_", torch::CppFunction::makeFromBoxedFunction<&native::cpu_fallback>()); + m.impl("linalg_svd", torch::CppFunction::makeFromBoxedFunction<&native::cpu_fallback>()); + m.impl("linalg_svd.U", torch::CppFunction::makeFromBoxedFunction<&native::cpu_fallback>()); + m.impl("repeat_interleave.Tensor", torch::CppFunction::makeFromBoxedFunction<&native::cpu_fallback>()); + m.impl("repeat_interleave.self_Tensor", torch::CppFunction::makeFromBoxedFunction<&native::cpu_fallback>()); + m.impl("repeat_interleave.self_int", torch::CppFunction::makeFromBoxedFunction<&native::cpu_fallback>()); + m.impl("_fft_c2c", torch::CppFunction::makeFromBoxedFunction<&native::cpu_fallback>()); + m.impl("_fft_r2c", torch::CppFunction::makeFromBoxedFunction<&native::cpu_fallback>()); + m.impl("linalg_vector_norm", torch::CppFunction::makeFromBoxedFunction<&native::cpu_fallback>()); + m.impl("sgn.out", torch::CppFunction::makeFromBoxedFunction<&native::cpu_fallback>()); + m.impl("nonzero", torch::CppFunction::makeFromBoxedFunction<&native::cpu_fallback>()); + m.impl("masked_select", torch::CppFunction::makeFromBoxedFunction<&native::cpu_fallback>()); +} + +} // namespace at diff --git a/aten/src/ATen/mps/MPSGuardImpl.h b/aten/src/ATen/mps/MPSGuardImpl.h new file mode 100644 index 000000000000..27d32bf652e7 --- /dev/null +++ b/aten/src/ATen/mps/MPSGuardImpl.h @@ -0,0 +1,171 @@ +// Copyright © 2022 Apple Inc. + +#pragma once +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#include +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace mps { + +// TODO: Move the MPSGuardImpl to inherit from NoOpDeviceGuardImpl +// https://github.com/pytorch/pytorch/issues/77170 +struct TORCH_API MPSGuardImpl final : public c10::impl::DeviceGuardImplInterface { + static constexpr DeviceType static_type = DeviceType::MPS; + + // constructor + MPSGuardImpl() {} + explicit MPSGuardImpl(DeviceType t) { + TORCH_INTERNAL_ASSERT(t == DeviceType::MPS); + } + + // returns the type + DeviceType type() const override { + return DeviceType::MPS; + } + + Device exchangeDevice(Device d) const override { + return Device(DeviceType::MPS, 0); + } + + Device getDevice() const override { + return Device(DeviceType::MPS, 0); + } + + c10::optional uncheckedGetDevice() const noexcept { + return Device(DeviceType::MPS, 0); + } + + void setDevice(Device d) const override { + TORCH_INTERNAL_ASSERT(d.is_mps()); + } + + void uncheckedSetDevice(Device d) const noexcept override { + // TODO: Currently setting only device 0 + } + + Stream getStream(Device d) const noexcept override { + return Stream(Stream::DEFAULT, Device(DeviceType::MPS, 0)); + } + + Stream getDefaultStream(Device d) const override { + return Stream(Stream::DEFAULT, Device(DeviceType::MPS, 0)); + } + + // NB: These do NOT set the current device + Stream exchangeStream(Stream s) const noexcept override { + return Stream(Stream::DEFAULT, Device(DeviceType::MPS, 0)); + } + DeviceIndex deviceCount() const noexcept override { + if (at::hasMPS()) { + //TODO: extend it for multi-device case + return 1; + } else { + return 0; + } + } + + // Event-related functions + void createEvent( + mpsEvent_t* event, + const EventFlag flag) const; + + void destroyEvent( + void* 
event, + const DeviceIndex device_index) const noexcept override; + + void record( + void** event, + const Stream& stream, + const DeviceIndex device_index, + const EventFlag flag) const override; + + void block( + void* event, + const Stream& stream) const override; + + bool queryEvent(void* event) const override; + +}; + +/// A variant of OptionalDeviceGuard that is specialized for MPS. +struct OptionalMPSGuard { + explicit OptionalMPSGuard() : guard_() {} + + explicit OptionalMPSGuard(optional device_opt) + : guard_(device_opt) {} + + /// Set the current MPS device to the passed device index, if it is not + /// nullopt + explicit OptionalMPSGuard(optional device_index_opt) + : guard_(device_index_opt) {} + + // Copy is not allowed + OptionalMPSGuard(const OptionalMPSGuard&) = delete; + OptionalMPSGuard& operator=(const OptionalMPSGuard&) = delete; + OptionalMPSGuard(OptionalMPSGuard&& other) = delete; + OptionalMPSGuard& operator=(OptionalMPSGuard&& other) = delete; + + /// Sets the MPS device to the given device, initializing the guard if it + /// is not already initialized. Errors if the given device is not a MPS + /// device. + void set_device(Device device) { + guard_.set_device(device); + } + + /// Sets the MPS device to the given device, initializing the guard if it is + /// not already initialized. Errors if the given device is not a MPS device. + void reset_device(Device device) { + guard_.reset_device(device); + } + + /// Sets the MPS device to the given device index, initializing the guard if + /// it is not already initialized. + void set_index(DeviceIndex device_index) { + guard_.set_index(device_index); + } + + /// Returns the device that was set immediately prior to initialization of the + /// guard, or nullopt if the guard is uninitialized. + optional original_device() const { + return guard_.original_device(); + } + + /// Returns the most recent device that was set using this device guard, + /// either from construction, or via set_device, if the guard is initialized, + /// or nullopt if the guard is uninitialized. + optional current_device() const { + return guard_.current_device(); + } + + /// Restore the original MPS device, resetting this guard to uninitialized + /// state. + void reset() { + guard_.reset(); + } + + private: + c10::impl::InlineOptionalDeviceGuard guard_; +}; + + +C10_REGISTER_GUARD_IMPL(MPS, MPSGuardImpl); + +}} // namespace at::mps diff --git a/aten/src/ATen/mps/MPSGuardImpl.mm b/aten/src/ATen/mps/MPSGuardImpl.mm new file mode 100644 index 000000000000..c2987fdaa3e7 --- /dev/null +++ b/aten/src/ATen/mps/MPSGuardImpl.mm @@ -0,0 +1,60 @@ +// Copyright © 2022 Apple Inc. + +#include +#include + +namespace at { +namespace mps { + + void MPSGuardImpl::createEvent( + mpsEvent_t* event, + const EventFlag flag) const { + id mtl_device = MPSDevice::getInstance()->device(); + // when static casting we already create an _event object. 
+ auto mps_event = static_cast(*event); + } + + void MPSGuardImpl::destroyEvent( + void* event, + const DeviceIndex device_index) const noexcept { + if (!event) return; + auto mps_event = static_cast(event); + mps_event->~MPSEvent(); + + } + + void MPSGuardImpl::record( + void** event, + const Stream& stream, + const DeviceIndex device_index, + const EventFlag flag) const { + + TORCH_CHECK(device_index == -1 || device_index == stream.device_index(), + "Event device index ", + device_index, + " does not match recording stream's device index ", + stream.device_index(), + "."); + + auto mps_event = static_cast(*event); + MPSStream mps_stream{stream}; + mps_event->recordEvent(&mps_stream); + } + + void MPSGuardImpl::block( + void* event, + const Stream& stream) const { + + auto mps_event = static_cast(event); + MPSStream mps_stream{stream}; + + mps_event->waitForEvent(&mps_stream); + } + + bool MPSGuardImpl::queryEvent(void* event) const { + auto mps_event = static_cast(event); + return mps_event->queryEvent(); + } + +} +} diff --git a/aten/src/ATen/mps/MPSStream.h b/aten/src/ATen/mps/MPSStream.h new file mode 100644 index 000000000000..1c19c42b7d77 --- /dev/null +++ b/aten/src/ATen/mps/MPSStream.h @@ -0,0 +1,134 @@ +// Copyright © 2022 Apple Inc. + +#pragma once + +#include +#include + +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#include +#include +#include +typedef id MTLCommandQueue_t; +typedef id MTLCommandBuffer_t; +typedef id MTLSharedEvent_t; +typedef id MTLDevice_t; +#else +typedef void* MTLCommandQueue_t; +typedef void* MTLCommandQueue; +typedef void* MTLCommandBuffer_t; +typedef void* MTLCommandBuffer; +typedef void* MTLSharedEvent_t; +typedef void* dispatch_queue_t; +typedef void* MTLDevice_t; +#define nil NULL; +#endif + + +namespace at { +namespace mps { + +//----------------------------------------------------------------- +// MPSStream +//----------------------------------------------------------------- + +class TORCH_API MPSStream +{ +public: + enum Unchecked { UNCHECKED }; + /// Construct a MPSStream from a Stream. This construction is checked, + /// and will raise an error if the Stream is not, in fact, a MPS stream. + explicit MPSStream(Stream stream); + + ~MPSStream(); + MTLCommandQueue_t commandQueue() const { return _commandQueue; }; + dispatch_queue_t queue() const { return _serialQueue; } + + MTLCommandBuffer_t commandBuffer(); + void commit(bool flush); + void commitAndWait(); + void synchronize(); + + void flush(); + + /// Get the MPS device index that this stream is associated with. + c10::DeviceIndex device_index() const { return _stream.device_index(); } + + MTLCommandQueue_t stream() const { return _commandQueue; }; + + MTLDevice_t device() const { return [_commandQueue device];} + + /// Explicit conversion to Stream. + Stream unwrap() const { return _stream; } + +private: + Stream _stream; + MTLCommandQueue_t _commandQueue = nil; + MTLCommandBuffer_t _commandBuffer = nil; + void _flush(bool commitAndWait) const; + + dispatch_queue_t _serialQueue = nullptr; +}; + +/** + * Get the current MPS stream + */ +TORCH_API MPSStream* getCurrentMPSStream(); + +/** + * Get the default MPS stream + */ +TORCH_API MPSStream* getDefaultMPSStream(); + +//----------------------------------------------------------------- +// MPSStreamImpl +//----------------------------------------------------------------- + +class TORCH_API MPSStreamImpl +{ + public: + /** + * Gets single instance of the MPSStream. 
+ */ + static MPSStream* getInstance(); + + private: + static MPSStream* _stream; + MPSStreamImpl(); +}; + + +//----------------------------------------------------------------- +// MPSEvent +//----------------------------------------------------------------- + +struct TORCH_API MPSEvent +{ + MPSEvent(); + // MPSEvent(id device); + + ~MPSEvent(); + MTLSharedEvent_t event() const {return _event; } + + void recordEvent(MPSStream *stream); + void waitForEvent(MPSStream *queue); // waits on the cpu + bool queryEvent(); + uint64_t getCurrentValue() { return _currentValue; } + void setCurrentValue(uint64_t currValue) { _currentValue = currValue; } +private: + bool _isRecorded = false; + uint64_t _currentValue = 0; + MTLSharedEvent_t _event; +}; + +typedef MPSEvent* mpsEvent_t; + + +} // namespace mps +} // namespace at diff --git a/aten/src/ATen/mps/MPSStream.mm b/aten/src/ATen/mps/MPSStream.mm new file mode 100644 index 000000000000..7d1d346f1755 --- /dev/null +++ b/aten/src/ATen/mps/MPSStream.mm @@ -0,0 +1,139 @@ +// Copyright © 2022 Apple Inc. + +#include + +namespace at { +namespace mps { + +//----------------------------------------------------------------- +// MPSStream +//----------------------------------------------------------------- + +MPSStream::MPSStream(Stream stream) : _stream(stream) { + _commandQueue = [MPSDevice::getInstance()->device() newCommandQueue]; + TORCH_CHECK(_stream.device_type() == DeviceType::MPS); + _serialQueue = dispatch_queue_create("metal gpu stream", NULL); +} + +MPSStream::~MPSStream() { + [_commandQueue autorelease]; + _commandQueue = nil; + + assert(_commandBuffer == nil); +} + +id MPSStream::commandBuffer() { + if (!_commandBuffer) { + _commandBuffer = + [MPSCommandBuffer commandBufferFromCommandQueue:_commandQueue].retain; + } + + return _commandBuffer; +} + +void MPSStream::synchronize() { + dispatch_sync(queue(), ^() { + @autoreleasepool { + commandBuffer(); + commitAndWait(); + } + }); +} + +void MPSStream::commit(bool doFlush) { + if (doFlush) { + flush(); + } +} + +void MPSStream::commitAndWait() { + assert(_commandBuffer); + [_commandBuffer commit]; + [_commandBuffer waitUntilCompleted]; + [_commandBuffer release]; + _commandBuffer = nil; +} + +void MPSStream::flush() { + if (_commandBuffer) { + [_commandBuffer commit]; + [_commandBuffer release]; + _commandBuffer = nil; + } +} + +void MPSStream::_flush(bool commitAndWait) const { + assert(_commandBuffer); + [_commandBuffer commit]; + if (commitAndWait) { + [_commandBuffer waitUntilCompleted]; + } + [_commandBuffer release]; +} + +//----------------------------------------------------------------- +// MPSStreamImpl +//----------------------------------------------------------------- + +MPSStream* MPSStreamImpl::_stream = nullptr; + +MPSStream* MPSStreamImpl::getInstance() { + if (_stream == nullptr) { + _stream = + new MPSStream(Stream(Stream::UNSAFE, c10::Device(DeviceType::MPS), 0)); + } + return _stream; +} + +MPSStreamImpl::MPSStreamImpl() {} + +MPSStream* getCurrentMPSStream() { + return getDefaultMPSStream(); +} + +MPSStream* getDefaultMPSStream() { + return MPSStreamImpl::getInstance(); +} + +//----------------------------------------------------------------- +// MPSEvent +//----------------------------------------------------------------- + +MPSEvent::MPSEvent() { + _event = [MPSDevice::getInstance()->device() newSharedEvent]; +} + +MPSEvent::~MPSEvent() { + [_event release]; + _event = nil; +} + +void MPSEvent::recordEvent(MPSStream* stream) { + @autoreleasepool { + _isRecorded = true; + 
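  // [Editor's note -- not part of the patch] The dispatch_sync block that follows
  // runs on the stream's serial queue: it encodes a signal for _event at
  // _currentValue on the stream's current command buffer and commits it, so
  // queryEvent() reports completion once the GPU has executed past that point.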
dispatch_sync(stream->queue(), ^() { + @autoreleasepool { + id commandBuffer = stream->commandBuffer(); + [commandBuffer encodeSignalEvent:_event value:_currentValue]; + stream->commit(true); + } + }); + } +} + +void MPSEvent::waitForEvent(MPSStream* stream) { + dispatch_sync(stream->queue(), ^() { + @autoreleasepool { + id commandBuffer = stream->commandBuffer(); + [commandBuffer encodeWaitForEvent:_event value:_currentValue]; + stream->commit(false); + } + }); +} + +bool MPSEvent::queryEvent() { + return !_isRecorded || (_event.signaledValue >= _currentValue); +} + +} // namespace mps +} // namespace at diff --git a/aten/src/ATen/native/Activation.cpp b/aten/src/ATen/native/Activation.cpp index ff79939830c7..f40c4aa3e823 100644 --- a/aten/src/ATen/native/Activation.cpp +++ b/aten/src/ATen/native/Activation.cpp @@ -164,12 +164,12 @@ TORCH_META_FUNC(softshrink_backward) ( build_borrowing_binary_op(maybe_get_output(), grad, self); } -TORCH_META_FUNC(gelu) (const Tensor & self) { +TORCH_META_FUNC(gelu) (const Tensor & self, c10::string_view approximate) { build_unary_op(maybe_get_output(), self); } TORCH_META_FUNC(gelu_backward) ( - const Tensor& grad, const Tensor& self + const Tensor& grad, const Tensor& self, c10::string_view approximate ) { build_borrowing_binary_op(maybe_get_output(), grad, self); } @@ -202,6 +202,8 @@ DEFINE_DISPATCH(silu_stub); DEFINE_DISPATCH(silu_backward_stub); DEFINE_DISPATCH(mish_stub); DEFINE_DISPATCH(mish_backward_stub); +DEFINE_DISPATCH(prelu_cpu_stub); +DEFINE_DISPATCH(prelu_backward_cpu_stub); TORCH_IMPL_FUNC(elu_out) ( const Tensor& self, const Scalar& alpha, const Scalar& scale, const Scalar& input_scale, const Tensor& result @@ -324,50 +326,68 @@ bool use_mkldnn(const Tensor& input) { } TORCH_IMPL_FUNC(gelu_out_cpu) ( - const Tensor& self, const Tensor& result + const Tensor& self, c10::string_view approximate, const Tensor& result ) { +auto approximate_type = get_gelutype_enum(approximate); #if AT_MKLDNN_ENABLED() - if (use_mkldnn(self)) { + if (use_mkldnn(self) && (approximate_type == GeluType::None)) { const ideep::tensor& x = itensor_from_tensor(self); ideep::tensor y = itensor_from_tensor(result); ideep::eltwise_forward::compute( x, y, ideep::algorithm::eltwise_gelu_erf, ideep::prop_kind::forward_training, /*alpha*/ 0.0); } else { - GeluKernel(kCPU, *this); + GeluKernel(kCPU, *this, approximate_type); } #else - GeluKernel(kCPU, *this); + GeluKernel(kCPU, *this, approximate_type); #endif } TORCH_IMPL_FUNC(gelu_backward_out_cpu) ( - const Tensor& grad, const Tensor& self, const Tensor& grad_input + const Tensor& grad, const Tensor& self, c10::string_view approximate, const Tensor& grad_input ) { +auto approximate_type = get_gelutype_enum(approximate); #if AT_MKLDNN_ENABLED() - if (use_mkldnn(self)) { + if (use_mkldnn(self) && (approximate_type == GeluType::None)) { const ideep::tensor& x = itensor_from_tensor(self); ideep::tensor grady = itensor_from_tensor(grad); ideep::tensor gradx = itensor_from_tensor(grad_input); ideep::eltwise_backward::compute(x, grady, gradx, ideep::algorithm::eltwise_gelu_erf, /*alpha*/ 0.0); } else { - GeluBackwardKernel(kCPU, *this); + GeluBackwardKernel(kCPU, *this, approximate_type); } #else - GeluBackwardKernel(kCPU, *this); + GeluBackwardKernel(kCPU, *this, approximate_type); #endif } Tensor hardtanh(const Tensor& self, const Scalar& min, const Scalar& max) { - return at::clamp(self, min, max); + Tensor result = at::empty_like(self); + return at::hardtanh_out(result, self, min, max); } Tensor& hardtanh_out(const Tensor& 
self, const Scalar& min, const Scalar& max, Tensor& result) { - return at::clamp_out(result, self, min, max); + TORCH_CHECK(self.scalar_type() != at::kBool, + "Bool inputs not supported for hardtanh"); + //preserve legacy behavior of boundaries not causing type promotion + Scalar min_, max_; + if (at::isIntegralType(self.scalar_type(), /*include_bool*/false)) { + int64_t minval = min.toLong(); + int64_t maxval = max.toLong(); + TORCH_CHECK(self.dtype() != at::kByte || (minval >= 0 && + maxval >=0), "cannot do hardtanh on an unsigned type with negative limits"); + min_ = minval; + max_ = maxval; + } else { + min_ = min; + max_ = max; + } + return at::clamp_out(result, self, min_, max_); } Tensor& hardtanh_(Tensor& self, const Scalar& min, const Scalar& max) { - return at::clamp_(self, min, max); + return at::hardtanh_out(self, self, min, max); } Tensor& hardtanh_backward_out(const Tensor& grad_output, const Tensor& self, const Scalar& min, const Scalar& max, Tensor& grad_input) { @@ -421,10 +441,12 @@ Tensor hardswish_backward(const Tensor& grad_output, const Tensor& self) { } Tensor relu(const Tensor & self) { + TORCH_CHECK(self.scalar_type() != at::kBool, "Boolean inputs not supported for relu"); return at::clamp_min(self, 0); } Tensor & relu_(Tensor & self) { + TORCH_CHECK(self.scalar_type() != at::kBool, "Boolean inputs not supported for relu"); return at::clamp_min_(self, 0); } @@ -566,14 +588,13 @@ Tensor rrelu_with_noise_backward( const Scalar& upper, bool training, bool is_result) { - auto lower_tensor = scalar_to_tensor(lower); - auto upper_tensor = scalar_to_tensor(upper); - if (training && (upper_tensor - lower_tensor).item().to() > 1E-6) { - return grad_output.mul(noise); + if (training) { + return noise * grad_output; } else { - auto negative = (lower_tensor + upper_tensor) / 2; - Scalar negative_slope = negative.item(); - return at::leaky_relu_backward(grad_output, self_or_result, negative_slope, is_result); + auto l = lower.toDouble(); + auto u = upper.toDouble(); + auto mid = (l + u) / 2.; + return at::leaky_relu_backward(grad_output, self_or_result, mid, is_result); } } @@ -593,253 +614,119 @@ TORCH_IMPL_FUNC(threshold_backward_out)(const Tensor& grad, const Tensor& self, threshold_stub(device_type(), *this, threshold, 0); } -// ----------------------------------- -// prelu forward -// ----------------------------------- -template -void inline prelu_cpu_kernel_share_weights( - Tensor& result, - const Tensor& input, - const Tensor& weight) { - - int64_t input_numel = input.numel(); - auto result_data = result.data_ptr(); - auto input_data = input.data_ptr(); - auto weight_val = weight.data_ptr()[0]; - - at::parallel_for(0, input_numel, 1000, [&](int64_t start, int64_t end) { - for (const auto i : c10::irange(start, end)) { - scalar_t input_data_val = input_data[i]; - // to allow for compiler optimization, here splitting into two lines: - scalar_t r = (input_data_val > 0) ? 
scalar_t(1) : weight_val; - result_data[i] = r * input_data_val; - } - }); -} - -template -void inline prelu_cpu_kernel_multi_weights( - Tensor& result, - const Tensor& input, - const Tensor& weight, - int64_t input_dim0_size, - int64_t channel_size, - int64_t input_stride0, - int64_t input_stride1) { - - scalar_t* result_data = result.data_ptr(); - scalar_t* input_data = input.data_ptr(); - scalar_t* weight_data = weight.data_ptr(); - - auto loop = [&](int64_t start, int64_t end) { - for (const auto i : c10::irange(start, end)) { - int64_t offset = i * channel_size * input_stride1; - scalar_t* n_input_data = input_data + offset; - scalar_t* n_result_data = result_data + offset; - for (const auto j : c10::irange(channel_size)) { - for (const auto k : c10::irange(input_stride1)) { - // to allow for compiler optimization, here splitting into two lines: - scalar_t w = (n_input_data[k] > 0) ? scalar_t(1) : weight_data[j]; - n_result_data[k] = w * n_input_data[k]; - } - n_input_data += input_stride1; - n_result_data += input_stride1; - } - } - }; - if (input.numel() > 1000) { - at::parallel_for(0, input_dim0_size, 0, loop); - } else { - loop(0, input_dim0_size); - } -} - Tensor prelu_cpu(const Tensor& self, const Tensor& weight_) { - auto input = self.contiguous(); - auto weight = weight_.contiguous(); - - TORCH_CHECK(input.is_contiguous()); - TORCH_CHECK(weight.is_contiguous()); + int64_t weight_num = weight_.numel(); + Tensor result = at::empty_like(self, self.suggest_memory_format()); - int64_t weight_num = weight.numel(); - Tensor result = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - auto strides = input.strides(); - - // case1: shared weight for all channels - if (weight_num == 1) { - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "prelu_cpu", [&] { - prelu_cpu_kernel_share_weights(result, input, weight); - }); - } - else { // case2: multiple weights, one for each channel - int64_t input_ndim = input.dim(); + if (weight_num != 1) { + int64_t input_ndim = self.dim(); TORCH_CHECK(input_ndim > 0, "Not allow zero-dim input tensor."); int64_t channel_size = 1; // channel_size default to 1 - int64_t input_dim0_size = 1, input_stride0 = 1, input_stride1 = 1; - if (input_ndim > 1) { - channel_size = input.size(1); // channel is the 2nd dim of input - input_dim0_size = input.size(0); - input_stride0 = strides[0]; - input_stride1 = strides[1]; + channel_size = self.size(1); // channel is the 2nd dim of input } TORCH_CHECK(channel_size == weight_num, "Mismatch of parameter numbers and input channel size. 
Found parameter numbers = ", weight_num, " and channel size = ", channel_size, "."); - - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "prelu_cpu", [&] { - prelu_cpu_kernel_multi_weights( - result, - input, - weight, - input_dim0_size, - channel_size, - input_stride0, - input_stride1); - }); } - return result; -} -// ----------------------------------- -// prelu backward -// ----------------------------------- -template -void inline prelu_cpu_backward_kernel_share_weights( - const Tensor& input, - const Tensor& weight, - const Tensor& grad_out, - Tensor& input_grad, - Tensor& weight_grad) { - - int64_t input_numel = input.numel(); - auto input_data = input.data_ptr(); - auto weight_val = weight.data_ptr()[0]; - auto grad_out_data = grad_out.data_ptr(); - auto input_grad_data = input_grad.data_ptr(); - auto weight_grad_data = weight_grad.data_ptr(); - - scalar_t sum = at::parallel_reduce(0, input_numel, 1000, scalar_t(0), - [&](int64_t start, int64_t end, scalar_t ident) -> scalar_t { - scalar_t partial_sum = ident; - for (const auto i : c10::irange(start, end)) { - scalar_t input_data_val = input_data[i]; - scalar_t grad_out_data_val = grad_out_data[i]; - // to allow for compiler optimization, here splitting into two lines: - scalar_t w = (input_data_val > 0) ? scalar_t(1) : weight_val; - input_grad_data[i] = w * grad_out_data_val; - // to allow for compiler optimization, here splitting into two lines: - scalar_t mask = (input_data_val > 0) ? scalar_t(0) : scalar_t(1); - partial_sum += mask * input_data_val * grad_out_data_val; - } - return partial_sum; - }, std::plus()); - weight_grad_data[0] = sum; -} - -template -void inline prelu_cpu_backward_kernel_multi_weights( - const Tensor& input, - const Tensor& weight, - const Tensor& grad_out, - Tensor& input_grad, - Tensor& weight_grad_collector, - int64_t input_dim0_size, - int64_t channel_size, - int64_t input_stride0, - int64_t input_stride1) { - - auto input_data = input.data_ptr(); - auto weight_data = weight.data_ptr(); - auto grad_out_data = grad_out.data_ptr(); - auto input_grad_data = input_grad.data_ptr(); - auto weight_grad_collector_data = weight_grad_collector.data_ptr(); - - auto loop = [&](int64_t start, int64_t end) { - for (const auto i : c10::irange(start, end)) { - for (const auto j : c10::irange(channel_size)) { - for (const auto k : c10::irange(input_stride1)) { - int64_t pos = i * input_stride0 + j * input_stride1 + k; - scalar_t weight_data_val = weight_data[j]; - scalar_t input_data_val = input_data[pos]; - scalar_t grad_out_data_val = grad_out_data[pos]; - // to allow for compiler optimization, here splitting into two lines: - scalar_t w = (input_data_val > 0) ? scalar_t(1) : weight_data_val; - input_grad_data[pos] = w * grad_out_data_val; - // to allow for compiler optimization, here splitting into two lines: - scalar_t mask = (input_data_val > 0) ? scalar_t(0) : scalar_t(1); - weight_grad_collector_data[pos] = mask * input_data_val * grad_out_data_val; - } - } + const int64_t ndim = self.dim(); + // Helper to convert 1d tensors or scalar tensor to an nd tensor that broadcasts with input + // All elements go into the channel dimension + DimVector sizes(ndim, 1), strides(ndim, 0); + auto as_nd = [&](const Tensor& t) { + TORCH_INTERNAL_ASSERT(t.defined() && (t.dim() == 1 || t.dim() == 0)); + if (ndim >= 2) { + sizes[1] = t.dim() == 1 ? t.sizes()[0] : 1; + strides[1] = t.dim() == 1 ? 
t.strides()[0] : 0; + return t.as_strided(sizes, strides); } + return t.as_strided(sizes, strides); }; - if (input.numel() > 1000) { - at::parallel_for(0, input_dim0_size, 0, loop); + Tensor w; + if (self.scalar_type() == ScalarType::BFloat16) { + auto w_bf16 = at::empty(weight_.sizes(), weight_.options().dtype(ScalarType::BFloat16)); + w_bf16.copy_(weight_); + w = weight_.defined() ? as_nd(w_bf16) : + at::detail::scalar_tensor_static(1, self.scalar_type(), kCPU); } else { - loop(0, input_dim0_size); + w = weight_.defined() ? as_nd(weight_) : + at::detail::scalar_tensor_static(1, self.scalar_type(), kCPU); } + + auto iter = TensorIteratorConfig() + .add_output(result) + .add_input(self) + .add_input(w) + .build(); + prelu_cpu_stub(iter.device_type(), iter); + return result; } std::tuple prelu_backward_cpu(const Tensor& grad_out_, const Tensor& self, const Tensor& weight_) { - auto input = self.contiguous(); - auto grad_out = grad_out_.contiguous(); - auto weight = weight_.contiguous(); - - TORCH_CHECK(input.is_contiguous()); - TORCH_CHECK(grad_out.is_contiguous()); - TORCH_CHECK(weight.is_contiguous()); - - int64_t weight_num = weight.numel(); - auto strides = input.strides(); - auto dims = input.dim(); + int64_t weight_num = weight_.numel(); - Tensor input_grad = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - Tensor weight_grad = at::empty_like(weight, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - Tensor weight_grad_collector = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + Tensor input_grad = at::empty_like(self, self.suggest_memory_format()); + Tensor weight_grad = at::empty_like(weight_, at::MemoryFormat::Contiguous); + Tensor weight_grad_collector = at::empty_like(self, at::MemoryFormat::Contiguous); - // case1: shared parameter for all channels - if (weight_num == 1) { - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "prelu_backward_cpu", [&] { - prelu_cpu_backward_kernel_share_weights(input, weight, grad_out, input_grad, weight_grad); - }); - } - else { // case2: multiple parameters, one for each channel - int64_t input_ndim = input.dim(); + if (weight_num != 1) { + int64_t input_ndim = self.dim(); TORCH_CHECK(input_ndim > 0, "Not allow zero-dim input tensor."); int64_t channel_size = 1; // channel_size default to 1 - int64_t input_dim0_size = 1, input_stride0 = 1, input_stride1 = 1; - if (input_ndim > 1) { - channel_size = input.size(1); // channel is the 2nd dim of input - input_dim0_size = input.size(0); - input_stride0 = strides[0]; - input_stride1 = strides[1]; + channel_size = self.size(1); // channel is the 2nd dim of input } TORCH_CHECK(channel_size == weight_num, "Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num, " and channel size = ", channel_size, "."); + } - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "prelu_backward_cpu", [&] { - prelu_cpu_backward_kernel_multi_weights( - input, - weight, - grad_out, - input_grad, - weight_grad_collector, - input_dim0_size, - channel_size, - input_stride0, - input_stride1); - }); + const int64_t ndim = self.dim(); + // Helper to convert 1d tensor or scalar tensor to an nd tensor that broadcasts with input + // All elements go into the channel dimension + DimVector sizes(ndim, 1), strides(ndim, 0); + auto as_nd = [&](const Tensor& t) { + TORCH_INTERNAL_ASSERT(t.defined() && (t.dim() == 1 || t.dim() == 0)); + if (ndim >= 2) { + sizes[1] = t.dim() == 1 ? t.sizes()[0] : 1; + strides[1] = t.dim() == 1 ? 
t.strides()[0] : 0; + return t.as_strided(sizes, strides); + } + return t.as_strided(sizes, strides); + }; + Tensor w; + if (self.scalar_type() == ScalarType::BFloat16) { + auto w_bf16 = at::empty(weight_.sizes(), weight_.options().dtype(ScalarType::BFloat16)); + w_bf16.copy_(weight_); + w = weight_.defined() ? as_nd(w_bf16) : + at::detail::scalar_tensor_static(1, self.scalar_type(), kCPU); + } else { + w = weight_.defined() ? as_nd(weight_) : + at::detail::scalar_tensor_static(1, self.scalar_type(), kCPU); + } + + auto iter = TensorIteratorConfig() + .add_output(input_grad) + .add_output(weight_grad_collector) + .add_input(self) + .add_input(grad_out_) + .add_input(w) + .build(); + + prelu_backward_cpu_stub(iter.device_type(), iter); + + if (weight_num == 1) { + weight_grad.fill_(weight_grad_collector.sum()); + } else { // update weight_grad std::vector reduce_dims; + int64_t input_ndim = self.dim(); reduce_dims.push_back(0); - if (dims > 2) { - for (const auto i : c10::irange(2, dims)) { - reduce_dims.push_back(i); - } + if (input_ndim > 2) { + for(int64_t i = 2; i < input_ndim; i++) reduce_dims.push_back(i); } weight_grad = weight_grad_collector.sum(reduce_dims); } diff --git a/aten/src/ATen/native/Activation.h b/aten/src/ATen/native/Activation.h index 963dc4665fd1..ba2dbc0768e8 100644 --- a/aten/src/ATen/native/Activation.h +++ b/aten/src/ATen/native/Activation.h @@ -14,6 +14,23 @@ class TensorBase; namespace at { namespace native { +// These constants control the approximation behavior of gelu function. +enum GeluType { + None, // Baseline Gelu + Tanh, // Tahn Gelu Approximation + END +}; + +static GeluType get_gelutype_enum(const c10::string_view approximate) { + if (approximate == "none") { + return GeluType::None; + } else if (approximate == "tanh") { + return GeluType::Tanh; + } else { + TORCH_CHECK(false, "approximate argument must be either none or tanh."); + } +} + using structured_activation_fn = void (*)(TensorIteratorBase&); using structured_activation_backward_fn = void (*)(TensorIteratorBase&); @@ -35,6 +52,9 @@ using elu_backward_fn = void (*)(TensorIteratorBase&, const c10::Scalar&, const using leaky_relu_fn = void (*)(TensorIteratorBase&, const c10::Scalar&); using leaky_relu_backward_fn = void (*)(TensorIteratorBase&, const c10::Scalar&); using log_sigmoid_cpu_fn = void (*)(TensorBase&, TensorBase&, const TensorBase&); +using gelu_fn = void (*)(TensorIteratorBase&, GeluType); +using gelu_backward_fn = void (*)(TensorIteratorBase&, GeluType); +using glu_jvp_fn = void (*)(TensorIteratorBase&); DECLARE_DISPATCH(elu_fn, elu_stub); DECLARE_DISPATCH(elu_backward_fn, elu_backward_stub); @@ -43,8 +63,8 @@ DECLARE_DISPATCH(softplus_backward_fn, softplus_backward_stub); DECLARE_DISPATCH(log_sigmoid_cpu_fn, log_sigmoid_cpu_stub); DECLARE_DISPATCH(activation_backward_fn, log_sigmoid_backward_stub); DECLARE_DISPATCH(threshold_fn, threshold_stub); -DECLARE_DISPATCH(structured_activation_fn, GeluKernel); -DECLARE_DISPATCH(structured_activation_backward_fn, GeluBackwardKernel); +DECLARE_DISPATCH(gelu_fn, GeluKernel); +DECLARE_DISPATCH(gelu_backward_fn, GeluBackwardKernel); DECLARE_DISPATCH(hardtanh_backward_fn, hardtanh_backward_stub); DECLARE_DISPATCH(hardsigmoid_fn, hardsigmoid_stub); DECLARE_DISPATCH(hardsigmoid_backward_fn, hardsigmoid_backward_stub); @@ -57,10 +77,13 @@ DECLARE_DISPATCH(leaky_relu_fn, leaky_relu_stub); DECLARE_DISPATCH(leaky_relu_backward_fn, leaky_relu_backward_stub); DECLARE_DISPATCH(structured_activation_fn, glu_stub); DECLARE_DISPATCH(activation_backward_fn, 
glu_backward_stub); +DECLARE_DISPATCH(glu_jvp_fn, glu_jvp_stub); DECLARE_DISPATCH(structured_activation_fn, silu_stub); DECLARE_DISPATCH(structured_activation_backward_fn, silu_backward_stub); DECLARE_DISPATCH(structured_activation_fn, mish_stub); DECLARE_DISPATCH(activation_backward_fn, mish_backward_stub); +DECLARE_DISPATCH(activation_fn, prelu_cpu_stub); +DECLARE_DISPATCH(activation_backward_fn, prelu_backward_cpu_stub); } // namespace native diff --git a/aten/src/ATen/native/AdaptivePooling.h b/aten/src/ATen/native/AdaptivePooling.h index 87cf202c3cc5..68fb08a5f397 100644 --- a/aten/src/ATen/native/AdaptivePooling.h +++ b/aten/src/ATen/native/AdaptivePooling.h @@ -1,9 +1,12 @@ #pragma once -#include #include +#include -namespace at { namespace native { +namespace at { +class Tensor; + +namespace native { using adaptive_avg_pooling_fn = void(*)(Tensor& output, const Tensor& input, IntArrayRef output_size); using adaptive_avg_pooling_backward_fn = void(*)(Tensor& grad_input, const Tensor& grad_output); diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp index 225985d60485..7fa3c3e37f3e 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp @@ -21,12 +21,6 @@ // linear algebra function uses that routine #if AT_BUILD_WITH_LAPACK() -// gesv -extern "C" void zgesv_(int *n, int *nrhs, std::complex *a, int *lda, int *ipiv, std::complex *b, int *ldb, int *info); -extern "C" void cgesv_(int *n, int *nrhs, std::complex *a, int *lda, int *ipiv, std::complex *b, int *ldb, int *info); -extern "C" void dgesv_(int *n, int *nrhs, double *a, int *lda, int *ipiv, double *b, int *ldb, int *info); -extern "C" void sgesv_(int *n, int *nrhs, float *a, int *lda, int *ipiv, float *b, int *ldb, int *info); - // getrf extern "C" void zgetrf_(int *m, int *n, std::complex *a, int *lda, int *ipiv, int *info); extern "C" void cgetrf_(int *m, int *n, std::complex *a, int *lda, int *ipiv, int *info); @@ -57,6 +51,128 @@ extern "C" void cpotri_(char *uplo, int *n, std::complex *a, int *lda, in extern "C" void dpotri_(char *uplo, int *n, double *a, int *lda, int *info); extern "C" void spotri_(char *uplo, int *n, float *a, int *lda, int *info); +// sytrf +extern "C" void dsytrf_( + char* uplo, + int* n, + double* a, + int* lda, + int* ipiv, + double* work, + int* lwork, + int* info); +extern "C" void ssytrf_( + char* uplo, + int* n, + float* a, + int* lda, + int* ipiv, + float* work, + int* lwork, + int* info); +extern "C" void zsytrf_( + char* uplo, + int* n, + std::complex* a, + int* lda, + int* ipiv, + std::complex* work, + int* lwork, + int* info); +extern "C" void csytrf_( + char* uplo, + int* n, + std::complex* a, + int* lda, + int* ipiv, + std::complex* work, + int* lwork, + int* info); + +// hetrf +extern "C" void zhetrf_( + char* uplo, + int* n, + std::complex* a, + int* lda, + int* ipiv, + std::complex* work, + int* lwork, + int* info); +extern "C" void chetrf_( + char* uplo, + int* n, + std::complex* a, + int* lda, + int* ipiv, + std::complex* work, + int* lwork, + int* info); + +// sytrs +extern "C" void dsytrs_( + char* uplo, + int* n, + int* nrhs, + double* a, + int* lda, + int* ipiv, + double* b, + int* ldb, + int* info); +extern "C" void ssytrs_( + char* uplo, + int* n, + int* nrhs, + float* a, + int* lda, + int* ipiv, + float* b, + int* ldb, + int* info); +extern "C" void zsytrs_( + char* uplo, + int* n, + int* nrhs, + std::complex* a, + int* lda, + int* ipiv, + std::complex* b, + int* ldb, + int* 
info); +extern "C" void csytrs_( + char* uplo, + int* n, + int* nrhs, + std::complex* a, + int* lda, + int* ipiv, + std::complex* b, + int* ldb, + int* info); + +// hetrs +extern "C" void zhetrs_( + char* uplo, + int* n, + int* nrhs, + std::complex* a, + int* lda, + int* ipiv, + std::complex* b, + int* ldb, + int* info); +extern "C" void chetrs_( + char* uplo, + int* n, + int* nrhs, + std::complex* a, + int* lda, + int* ipiv, + std::complex* b, + int* ldb, + int* info); + // geqrf extern "C" void zgeqrf_(int *m, int *n, std::complex *a, int *lda, std::complex *tau, std::complex *work, int *lwork, int *info); extern "C" void cgeqrf_(int *m, int *n, std::complex *a, int *lda, std::complex *tau, std::complex *work, int *lwork, int *info); @@ -207,6 +323,70 @@ extern "C" void strsm_(char *side, char *uplo, char *trans, char *diag, int *n, namespace at { namespace meta { +TORCH_META_FUNC(linalg_ldl_factor_ex) +(const Tensor& self, bool hermitian, bool check_errors) { + at::native::squareCheckInputs(self, "torch.linalg.ldl_factor_ex"); + at::native::checkFloatingOrComplex(self, "torch.linalg.ldl_factor_ex"); + + auto ndim = self.dim(); + + // prefer column major strides + auto ld_strides = at::native::batched_matrix_contiguous_strides(self.sizes(), /*column_major=*/true); + set_output(0, self.sizes(), ld_strides, self.options(), {}); // LD + + auto pivots_shape = + IntArrayRef(self.sizes().data(), ndim - 1); // self.shape[:-1] + set_output( + 1, pivots_shape, {}, self.options().dtype(ScalarType::Int), {}); // pivots + + auto info_shape = + IntArrayRef(self.sizes().data(), ndim - 2); // self.shape[:-2] + set_output( + 2, info_shape, {}, self.options().dtype(ScalarType::Int), {}); // info +} + +TORCH_META_FUNC(linalg_ldl_solve) +(const Tensor& LD, + const Tensor& pivots, + const Tensor& B, + bool hermitian) { + at::native::squareCheckInputs(LD, "torch.linalg.ldl_solve"); + at::native::checkFloatingOrComplex(LD, "torch.linalg.ldl_solve"); + at::native::linearSolveCheckInputs(B, LD, "torch.linalg.ldl_solve"); + TORCH_CHECK( + B.dim() >= 2, + "torch.linalg.ldl_solve: Expected B to have at least 2 dimensions, but it has ", + B.dim(), + " dimensions instead"); + auto expected_pivots_shape = + IntArrayRef(LD.sizes().data(), LD.dim() - 1); // LD.shape[:-1] + TORCH_CHECK( + expected_pivots_shape.equals(pivots.sizes()), + "torch.linalg.ldl_solve: Expected LD.shape[:-1] and pivots.shape to be the same, but got pivots with shape ", + pivots.sizes(), + " instead"); + // pivots is allowed to be any integer type + // LAPACK we use is 32-bit interface while cuSOLVER uses 64-bit interface for integers + TORCH_CHECK( + at::isIntegralType(pivots.scalar_type(), /*includeBool=*/false), + "torch.linalg.ldl_solve: Expected pivots to be integers. 
Got ", + pivots.scalar_type()); + TORCH_CHECK( + LD.scalar_type() == B.scalar_type(), + "torch.linalg.ldl_solve: ", + "LD dtype", + LD.scalar_type(), + " does not match b dtype ", + B.scalar_type()); + + std::vector B_broadcast_size; + std::tie(B_broadcast_size, std::ignore) = at::native::_linalg_broadcast_batch_dims(B, LD); + + // prefer column major strides + auto result_strides = at::native::batched_matrix_contiguous_strides(B_broadcast_size, /*column_major=*/true); + set_output(0, B_broadcast_size, result_strides, B.options(), {}); +} + TORCH_META_FUNC(triangular_solve)(const Tensor& self, const Tensor& A, bool upper, bool transpose, bool unitriangular) { TORCH_CHECK(self.dim() >= 2, "torch.triangular_solve: Expected b to have at least 2 dimensions, but it has ", self.dim(), " dimensions instead"); @@ -220,13 +400,13 @@ TORCH_META_FUNC(triangular_solve)(const Tensor& self, const Tensor& A, bool uppe std::tie(self_broadcast_size, A_broadcast_size) = at::native::_linalg_broadcast_batch_dims(self, A); // make column major strides for BLAS - const auto solution_strides = at::native::contiguous_strides(self_broadcast_size, /*f-contig=*/true); + const auto solution_strides = at::native::batched_matrix_contiguous_strides(self_broadcast_size, /*f-contig=*/true); set_output(0, self_broadcast_size, solution_strides, self.options(), {}); // make column major strides for BLAS - auto clone_A_strides = at::native::contiguous_strides(A_broadcast_size, /*f_contig=*/true); + auto clone_A_strides = at::native::batched_matrix_contiguous_strides(A_broadcast_size, /*f_contig=*/true); set_output(1, A_broadcast_size, clone_A_strides, A.options(), {}); - } else if (A.layout() == Layout::SparseCsr) { + } else if (A.layout() == Layout::SparseCsr || A.layout() == Layout::SparseBsr) { // no broadcasting for non-strided layout set_output(0, self.sizes(), {}, self.options(), {}); // make row major strides for Sparse BLAS set_output(1, {0}, {}, self.options(), {}); // return 0-sized tensor @@ -243,7 +423,7 @@ TORCH_META_FUNC(linalg_lu_factor_ex)(const Tensor& A, bool pivot, bool check_err const auto n = sizes.cend()[-1]; // make column major strides for BLAS - auto LU_strides = at::native::contiguous_strides(sizes, /*f-contig*=*/true); + auto LU_strides = at::native::batched_matrix_contiguous_strides(sizes, /*f-contig*=*/true); set_output(0, sizes, LU_strides, A.options(), {}); // Set sizes to the size of pivots @@ -269,7 +449,7 @@ TORCH_META_FUNC(_linalg_svd)(const Tensor& A, // Prepare sizes for U if (compute_uv) { sizes.back() = full_matrices ? m : k; - auto U_strides = at::native::contiguous_strides(sizes, /*f-contig*=*/true); + auto U_strides = at::native::batched_matrix_contiguous_strides(sizes, /*f-contig*=*/true); set_output(0, sizes, U_strides, A.options(), {}); // Prepare sizes for Vh @@ -279,7 +459,7 @@ TORCH_META_FUNC(_linalg_svd)(const Tensor& A, // We need to distinguish the cuSOLVER case, as the cuSOLVER algorithms we use // expect F-contig matrices, but they compute V rather than Vh const bool use_cusolver = at::native::svd_uses_cusolver(A); - auto Vh_strides = at::native::contiguous_strides(sizes, /*f-contig*=*/!use_cusolver); + auto Vh_strides = at::native::batched_matrix_contiguous_strides(sizes, /*f-contig*=*/!use_cusolver); set_output(2, sizes, Vh_strides, A.options(), {}); } else { set_output(0, {0}, {}, A.options(), {}); @@ -289,8 +469,71 @@ TORCH_META_FUNC(_linalg_svd)(const Tensor& A, // Prepare sizes for S. S is always real, even when A is complex. 
sizes.pop_back(); sizes.end()[-1] = k; - set_output(1, sizes, {}, A.options().dtype(c10::toValueType(A.scalar_type())), {}); + set_output(1, sizes, {}, A.options().dtype(c10::toRealValueType(A.scalar_type())), {}); +} + +TORCH_META_FUNC(lu_unpack)(const Tensor& LU, const Tensor& pivots, bool unpack_data, bool unpack_pivots) { + TORCH_CHECK(LU.dim() >= 2, "torch.lu_unpack: Expected tensor with 2 or more dimensions. Got size: ", LU.sizes(), " instead"); + if (unpack_pivots) { + TORCH_CHECK(pivots.scalar_type() == at::kInt, + "torch.lu_unpack: LU_pivots is expected to be a contiguous tensor of torch.int32 dtype.\n" + "Note: this function is intended to be used with the output produced by torch.linalg.lu_factor"); + } + + auto sizes = LU.sizes().vec(); + const auto m = sizes.cend()[-2]; + const auto n = sizes.cend()[-1]; + const auto k = std::min(m, n); + + // P.shape[-2:] == (m, m) (or size zero if pivot == False) + sizes.end()[-1] = m; + if (unpack_pivots) { + set_output(0, sizes, LU.options()); + } else { + set_output(0, {0}, LU.options()); + } + + if (unpack_data) { + // L.shape[-2:] == (m, k) + sizes.end()[-1] = k; + set_output(1, sizes, LU.options()); + + // U.shape[-2:] == (k, n) + sizes.end()[-2] = k; + sizes.end()[-1] = n; + set_output(2, sizes, LU.options()); + } else { + set_output(1, {0}, LU.options()); + set_output(2, {0}, LU.options()); + } } + +TORCH_META_FUNC(linalg_lu)(const Tensor& A, bool pivot) { + TORCH_CHECK(A.dim() >= 2, "linalg.lu: Expected tensor with 2 or more dimensions. Got size: ", A.sizes(), " instead"); + + auto sizes = A.sizes().vec(); + const auto m = sizes.cend()[-2]; + const auto n = sizes.cend()[-1]; + const auto k = std::min(m, n); + + // P.shape[-2:] == (m, m) (or size zero if pivot == False) + sizes.end()[-1] = m; + if (pivot) { + set_output(0, sizes, A.options()); + } else { + set_output(0, {0}, A.options()); + } + + // L.shape[-2:] == (m, k) + sizes.end()[-1] = k; + set_output(1, sizes, A.options()); + + // U.shape[-2:] == (k, n) + sizes.end()[-2] = k; + sizes.end()[-1] = n; + set_output(2, sizes, A.options()); +} + } // namespace meta namespace native { @@ -298,8 +541,6 @@ namespace native { #if AT_BUILD_WITH_LAPACK() // Define the per-batch functions to be used in the main implementation of the batched // linear algebra operations -template -void lapackSolve(int n, int nrhs, scalar_t *a, int lda, int *ipiv, scalar_t *b, int ldb, int *info); template void lapackGetri(int n, scalar_t *a, int lda, int *ipiv, scalar_t *work, int lwork, int *info); @@ -310,22 +551,6 @@ void lapackCholeskySolve(char uplo, int n, int nrhs, scalar_t *a, int lda, scala template void lapackSymeig(char jobz, char uplo, int n, scalar_t *a, int lda, value_t *w, scalar_t *work, int lwork, value_t *rwork, int *info); -template<> void lapackSolve>(int n, int nrhs, c10::complex *a, int lda, int *ipiv, c10::complex *b, int ldb, int *info) { - zgesv_(&n, &nrhs, reinterpret_cast*>(a), &lda, ipiv, reinterpret_cast*>(b), &ldb, info); -} - -template<> void lapackSolve>(int n, int nrhs, c10::complex *a, int lda, int *ipiv, c10::complex *b, int ldb, int *info) { - cgesv_(&n, &nrhs, reinterpret_cast*>(a), &lda, ipiv, reinterpret_cast*>(b), &ldb, info); -} - -template<> void lapackSolve(int n, int nrhs, double *a, int lda, int *ipiv, double *b, int ldb, int *info) { - dgesv_(&n, &nrhs, a, &lda, ipiv, b, &ldb, info); -} - -template<> void lapackSolve(int n, int nrhs, float *a, int lda, int *ipiv, float *b, int ldb, int *info) { - sgesv_(&n, &nrhs, a, &lda, ipiv, b, &ldb, info); -} - template<> 
void lapackGetri>(int n, c10::complex *a, int lda, int *ipiv, c10::complex *work, int lwork, int *info) { zgetri_(&n, reinterpret_cast*>(a), &lda, ipiv, reinterpret_cast*>(work), &lwork, info); } @@ -552,6 +777,290 @@ template<> void lapackSvd(char jobz, int m, int n, float *a, int lda, sgesdd_(&jobz, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, &lwork, iwork, info); } +template <> +void lapackLdlSymmetric( + char uplo, + int n, + double* a, + int lda, + int* ipiv, + double* work, + int lwork, + int* info) { + dsytrf_(&uplo, &n, a, &lda, ipiv, work, &lwork, info); +} + +template <> +void lapackLdlSymmetric( + char uplo, + int n, + float* a, + int lda, + int* ipiv, + float* work, + int lwork, + int* info) { + ssytrf_(&uplo, &n, a, &lda, ipiv, work, &lwork, info); +} + +template <> +void lapackLdlSymmetric>( + char uplo, + int n, + c10::complex* a, + int lda, + int* ipiv, + c10::complex* work, + int lwork, + int* info) { + zsytrf_( + &uplo, + &n, + reinterpret_cast*>(a), + &lda, + ipiv, + reinterpret_cast*>(work), + &lwork, + info); +} + +template <> +void lapackLdlSymmetric>( + char uplo, + int n, + c10::complex* a, + int lda, + int* ipiv, + c10::complex* work, + int lwork, + int* info) { + csytrf_( + &uplo, + &n, + reinterpret_cast*>(a), + &lda, + ipiv, + reinterpret_cast*>(work), + &lwork, + info); +} + +template <> +void lapackLdlHermitian( + char uplo, + int n, + double* a, + int lda, + int* ipiv, + double* work, + int lwork, + int* info) { + dsytrf_(&uplo, &n, a, &lda, ipiv, work, &lwork, info); +} + +template <> +void lapackLdlHermitian( + char uplo, + int n, + float* a, + int lda, + int* ipiv, + float* work, + int lwork, + int* info) { + ssytrf_(&uplo, &n, a, &lda, ipiv, work, &lwork, info); +} + +template <> +void lapackLdlHermitian>( + char uplo, + int n, + c10::complex* a, + int lda, + int* ipiv, + c10::complex* work, + int lwork, + int* info) { + zhetrf_( + &uplo, + &n, + reinterpret_cast*>(a), + &lda, + ipiv, + reinterpret_cast*>(work), + &lwork, + info); +} + +template <> +void lapackLdlHermitian>( + char uplo, + int n, + c10::complex* a, + int lda, + int* ipiv, + c10::complex* work, + int lwork, + int* info) { + chetrf_( + &uplo, + &n, + reinterpret_cast*>(a), + &lda, + ipiv, + reinterpret_cast*>(work), + &lwork, + info); +} + +template <> +void lapackLdlSolveSymmetric( + char uplo, + int n, + int nrhs, + double* a, + int lda, + int* ipiv, + double* b, + int ldb, + int* info) { + dsytrs_(&uplo, &n, &nrhs, a, &lda, ipiv, b, &ldb, info); +} + +template <> +void lapackLdlSolveSymmetric( + char uplo, + int n, + int nrhs, + float* a, + int lda, + int* ipiv, + float* b, + int ldb, + int* info) { + ssytrs_(&uplo, &n, &nrhs, a, &lda, ipiv, b, &ldb, info); +} + +template <> +void lapackLdlSolveSymmetric>( + char uplo, + int n, + int nrhs, + c10::complex* a, + int lda, + int* ipiv, + c10::complex* b, + int ldb, + int* info) { + zsytrs_( + &uplo, + &n, + &nrhs, + reinterpret_cast*>(a), + &lda, + ipiv, + reinterpret_cast*>(b), + &ldb, + info); +} + +template <> +void lapackLdlSolveSymmetric>( + char uplo, + int n, + int nrhs, + c10::complex* a, + int lda, + int* ipiv, + c10::complex* b, + int ldb, + int* info) { + csytrs_( + &uplo, + &n, + &nrhs, + reinterpret_cast*>(a), + &lda, + ipiv, + reinterpret_cast*>(b), + &ldb, + info); +} + +template <> +void lapackLdlSolveHermitian( + char uplo, + int n, + int nrhs, + double* a, + int lda, + int* ipiv, + double* b, + int ldb, + int* info) { + dsytrs_(&uplo, &n, &nrhs, a, &lda, ipiv, b, &ldb, info); +} + +template <> +void 
lapackLdlSolveHermitian( + char uplo, + int n, + int nrhs, + float* a, + int lda, + int* ipiv, + float* b, + int ldb, + int* info) { + ssytrs_(&uplo, &n, &nrhs, a, &lda, ipiv, b, &ldb, info); +} + +template <> +void lapackLdlSolveHermitian>( + char uplo, + int n, + int nrhs, + c10::complex* a, + int lda, + int* ipiv, + c10::complex* b, + int ldb, + int* info) { + zhetrs_( + &uplo, + &n, + &nrhs, + reinterpret_cast*>(a), + &lda, + ipiv, + reinterpret_cast*>(b), + &ldb, + info); +} + +template <> +void lapackLdlSolveHermitian>( + char uplo, + int n, + int nrhs, + c10::complex* a, + int lda, + int* ipiv, + c10::complex* b, + int ldb, + int* info) { + chetrs_( + &uplo, + &n, + &nrhs, + reinterpret_cast*>(a), + &lda, + ipiv, + reinterpret_cast*>(b), + &ldb, + info); +} + template<> void lapackLuSolve>(char trans, int n, int nrhs, c10::complex *a, int lda, int *ipiv, c10::complex *b, int ldb, int *info) { zgetrs_(&trans, &n, &nrhs, reinterpret_cast*>(a), &lda, ipiv, reinterpret_cast*>(b), &ldb, info); } @@ -802,100 +1311,6 @@ bool _requires_fw_or_bw_grad(const Tensor& input) { // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ solve ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -/* -Computes the solution to a system of linear equations - A X = B, -where A is an n-by-n matrix and X and B are n-by-nrhs matrices. -Note that B is required to be a matrix, the usual, vector case, is obtained with nrhs = 1. -Above description is for non-batched input, the batched input is also supported. -This is an in-place routine, content of both A and b are overwritten. -'infos' is an int Tensor containing error codes for each matrix in the batched input. -For more information see LAPACK's documentation for GESV routine. -*/ -template -static void apply_solve(Tensor& b, Tensor& A, Tensor& infos) { -#if !AT_BUILD_WITH_LAPACK() - AT_ERROR("solve: LAPACK library not found in compilation"); -#else - auto A_data = A.data_ptr(); - auto b_data = b.data_ptr(); - auto A_mat_stride = matrixStride(A); - auto b_mat_stride = matrixStride(b); - auto batch_size = batchCount(A); - auto n = A.size(-2); - auto nrhs = b.size(-1); - auto lda = std::max(1, n); - - auto ipiv = at::empty({lda}, b.options().dtype(kInt)); - auto ipiv_data = ipiv.data_ptr(); - auto infos_data = infos.data_ptr(); - - for (const auto i : c10::irange(batch_size)) { - scalar_t* A_working_ptr = &A_data[i * A_mat_stride]; - scalar_t* b_working_ptr = &b_data[i * b_mat_stride]; - int* info_working_ptr = &infos_data[i]; - lapackSolve(n, nrhs, A_working_ptr, lda, ipiv_data, b_working_ptr, lda, info_working_ptr); - } -#endif -} - -std::tuple _solve_helper_cpu(const Tensor& self, const Tensor& A) { - auto self_working_copy = cloneBatchedColumnMajor(self); - auto A_working_copy = cloneBatchedColumnMajor(A); - // infos might not get filled for empty inputs therefore at::zeros is used instead of at::empty - auto infos = at::zeros({std::max(1, batchCount(self))}, self.options().dtype(kInt)); - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "solve_cpu", [&]{ - apply_solve(self_working_copy, A_working_copy, infos); - }); - at::_linalg_check_errors(infos, "solve_cpu", self.dim() == 2); - return std::tuple(self_working_copy, A_working_copy); -} - -// Supports arbitrary batch dimensions for self and A -std::tuple solve(const Tensor& self, const Tensor& A) { - TORCH_WARN_ONCE( - "torch.solve is deprecated in favor of torch.linalg.solve", - "and will be removed in a future PyTorch release.\n", - "torch.linalg.solve has its arguments reversed and does not return the LU factorization.\n", 
- "To get the LU factorization see torch.lu, which can be used with torch.lu_solve or torch.lu_unpack.\n", - "X = torch.solve(B, A).solution\n", - "should be replaced with\n", - "X = torch.linalg.solve(A, B)" - ); - TORCH_CHECK(self.dim() >= 2, - "B should have at least 2 dimensions, but has ", self.dim(), " dimensions instead"); - TORCH_CHECK(A.dim() >= 2, - "A should have at least 2 dimensions, but has ", A.dim(), " dimensions instead"); - Tensor self_broadcasted, A_broadcasted; - std::tie(self_broadcasted, A_broadcasted) = _linalg_broadcast_batch_dims(self, A, "solve"); - return at::_solve_helper(self_broadcasted, A_broadcasted); -} - -std::tuple solve_out(const Tensor& self, const Tensor& A, Tensor& solution, Tensor& lu) { - TORCH_WARN_ONCE( - "torch.solve is deprecated in favor of torch.linalg.solve", - "and will be removed in a future PyTorch release.\n", - "torch.linalg.solve has its arguments reversed and does not return the LU factorization.\n", - "To get the LU factorization see torch.lu, which can be used with torch.lu_solve or torch.lu_unpack.\n", - "X = torch.solve(B, A).solution\n", - "should be replaced with\n", - "X = torch.linalg.solve(A, B)" - ); - checkSameDevice("solve", solution, self, "solution"); - checkSameDevice("solve", lu, self, "lu"); - checkLinalgCompatibleDtype("solve", solution, self, "solution"); - checkLinalgCompatibleDtype("solve", lu, self, "lu"); - - Tensor solution_tmp, lu_tmp; - std::tie(solution_tmp, lu_tmp) = at::_solve_helper(self, A); - - at::native::resize_output(solution, solution_tmp.sizes()); - at::native::resize_output(lu, lu_tmp.sizes()); - solution.copy_(solution_tmp); - lu.copy_(lu_tmp); - return std::tuple(solution, lu); -} - // Solves a system of linear equations matmul(input, x) = other in-place // LAPACK/MAGMA error codes are saved in 'infos' tensor, they are not checked here static Tensor& linalg_solve_out_info(Tensor& result, Tensor& infos, const Tensor& input, const Tensor& other) { @@ -952,8 +1367,8 @@ static Tensor& linalg_solve_out_info(Tensor& result, Tensor& infos, const Tensor // _linalg_broadcast_batch_dims also includes linearSolveCheckInputs // it checks for squareness of 'input' and 'shape' compatibility of 'other' and 'input' - Tensor other_broadcasted, input_broadcasted; - std::tie(other_broadcasted, input_broadcasted) = _linalg_broadcast_batch_dims(other_, input, "linalg.solve"); + Tensor other_broadcasted; + std::tie(other_broadcasted, std::ignore) = _linalg_broadcast_batch_dims(other_, input, "linalg.solve"); auto squeezed_other_broadcasted = at::squeeze(other_broadcasted, -1); auto squeezed_result_shape = squeezed_other_broadcasted.sizes(); @@ -989,18 +1404,17 @@ static Tensor& linalg_solve_out_info(Tensor& result, Tensor& infos, const Tensor // lu_factor_stub+lu_solve_stub perform calculations in-place and 'result' must be a copy of 'other_broadcasted' result.copy_(other_broadcasted); - auto input_working_copy = cloneBatchedColumnMajor(input_broadcasted); - TORCH_INTERNAL_ASSERT(infos.scalar_type() == kInt); TORCH_INTERNAL_ASSERT(infos.device() == input.device()); - infos.resize_({std::max(1, batchCount(input_broadcasted))}); + infos.resize_({std::max(1, batchCount(input))}); // if input is empty infos might not get filled; make sure infos doesn't contain garbage then if (input.numel() == 0) { infos.fill_(0); } // compute the LU factorization of 'input_working_copy' - auto pivots_shape = IntArrayRef(input_broadcasted.sizes().data(), input_broadcasted.dim() - 2).vec(); // input_broadcasted.shape[:-2] + auto 
input_working_copy = cloneBatchedColumnMajor(input); + auto pivots_shape = IntArrayRef(input.sizes().data(), input.dim() - 2).vec(); // input.shape[:-2] pivots_shape.push_back(std::min(input.size(-2), input.size(-1))); Tensor pivots = at::empty(pivots_shape, input.options().dtype(kInt)); lu_factor_stub(input.device().type(), input_working_copy, pivots, infos, /*compute_pivots=*/true); @@ -1023,8 +1437,7 @@ Tensor& linalg_solve_out(const Tensor& input, const Tensor& other, Tensor& resul // Now check LAPACK/MAGMA error codes // _linalg_check_errors calls 'infos = infos.to(kCPU)' - bool vector_case = linalg_solve_is_vector_rhs(input, other); - at::_linalg_check_errors(infos, "linalg.solve", vector_case ? result.dim() == 1 : result.dim() == 2); + at::_linalg_check_errors(infos, "linalg.solve", input.dim() == 2); return result; } @@ -1109,7 +1522,7 @@ Tensor& _linalg_inv_out_helper_cpu(Tensor &result, Tensor& infos_lu, Tensor& inf return result; } -// Computes the inverse matrix of 'input', it is is saved to 'result' in-place +// Computes the inverse matrix of 'input', it is saved to 'result' in-place // LAPACK/MAGMA/cuSOLVER error codes are saved in 'infos' tensors, they are not checked here static Tensor& linalg_inv_out_info(Tensor& result, Tensor& infos_lu, Tensor& infos_getri, const Tensor& input) { squareCheckInputs(input, "linalg.inv"); @@ -1198,7 +1611,7 @@ static Tensor& linalg_inv_out_info(Tensor& result, Tensor& infos_lu, Tensor& inf return result; } -// Computes the inverse matrix of 'input', it is is saved to 'result' in-place +// Computes the inverse matrix of 'input', it is saved to 'result' in-place Tensor& linalg_inv_out(const Tensor &input, Tensor &result) { auto info_shape = IntArrayRef(input.sizes().cbegin(), input.sizes().cend() - 2); // input.shape[:-2] auto infos_lu = at::zeros({info_shape}, input.options().dtype(kInt)); @@ -1648,6 +2061,105 @@ std::tuple _lu_with_info(const Tensor& self, bool comput return at::linalg_lu_factor_ex(self, compute_pivots, false); } +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ linalg_lu ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +DEFINE_DISPATCH(unpack_pivots_stub); + +TORCH_IMPL_FUNC(linalg_lu_out)(const Tensor& A, + bool pivot, + const Tensor& P, + const Tensor& L, + const Tensor& U) { + const auto m = A.sizes().end()[-2]; + const auto n = A.sizes().end()[-1]; + + // A.shape[-2:] == (m, n) + // P.shape[-2:] == (m, m) + // L.shape[-2:] == (m, k) + // U.shape[-2:] == (k, n) + // with k = min(m, n) + + // Use L as it has the correct size + const bool use_L = m > n; + auto pivots = at::empty({0}, A.options().dtype(kInt)); + auto info = at::empty({0}, A.options().dtype(kInt)); + at::linalg_lu_factor_ex_out(const_cast(use_L ? L : U), + const_cast(pivots), + const_cast(info), + A, + pivot, + /*check_errors=*/false); + at::lu_unpack_out(const_cast(P), + const_cast(L), + const_cast(U), + use_L ? 
L : U, + pivots, + /*unpack_lu=*/true, + /*unpack_pivots=*/pivot); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ lu_unpack ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +TORCH_IMPL_FUNC(lu_unpack_out)(const Tensor& LU, + const Tensor& pivots, + bool unpack_lu, + bool unpack_pivots, + const Tensor& P, + const Tensor& L, + const Tensor& U) { + const auto m = LU.sizes().end()[-2]; + const auto n = LU.sizes().end()[-1]; + + // A.shape[-2:] == (m, n) + // P.shape[-2:] == (m, m) + // L.shape[-2:] == (m, k) + // U.shape[-2:] == (k, n) + // with k = min(m, n) + + if (unpack_lu) { + if (m > n || LU.is_same(L)) { + // The order of triu and tril is important as we may have LU.is_same(L) + at::triu_out(const_cast(U), m == n ? LU : LU.narrow(-2, 0, n), 0); + at::tril_out(const_cast(L), LU, -1); + L.diagonal(0, -2, -1).fill_(1.); + } else { + // The order of triu and tril is important as we may have LU.is_same(U) + at::tril_out(const_cast(L), m == n ? LU : LU.narrow(-1, 0, m), -1); + L.diagonal(0, -2, -1).fill_(1.); + at::triu_out(const_cast(U), LU, 0); + } + } + if (unpack_pivots) { + // lu_factor_ex returns an int32 1-based indexing, which is what we have in `pivots` + // We transform that to a proper permutation of the indices {0, ..., m-1} + const auto perm_sizes = IntArrayRef(P.sizes().data(), P.dim() - 1); + + // Fill `perm` with the identity permutation (perhaps batched) + const auto perm = at::arange(m, pivots.options().memory_format(at::MemoryFormat::Contiguous).dtype(kLong)) + .expand(perm_sizes) + .contiguous(); + + // Note that perm is of type kLong and pivots is a 1-indexed kInt. + // This is taken into account in the unpack_pivots kernel + auto iter = TensorIteratorConfig() + .set_check_mem_overlap(false) + .check_all_same_dtype(false) + .resize_outputs(false) + .declare_static_shape(pivots.sizes(), /*squash_dim=*/pivots.dim() - 1) + .add_output(perm) + .add_owned_input(pivots.contiguous()) + .build(); + + if (iter.numel() != 0) { + unpack_pivots_stub(pivots.device().type(), iter, std::min(m, n)); + } + + // Transform the permutation into a permutation matrix + P.zero_(); + P.scatter_(-2, perm.unsqueeze(-2), 1.); + } +} + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ triangular_solve ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEFINE_DISPATCH(triangular_solve_stub); @@ -2307,7 +2819,7 @@ void linalg_eigh_out_info( // eigenvalues are always real-valued // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores) - ScalarType real_dtype = toValueType(input.scalar_type()); + ScalarType real_dtype = toRealValueType(input.scalar_type()); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(values.scalar_type() == real_dtype); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.scalar_type() == vectors.scalar_type()); @@ -2354,7 +2866,7 @@ void linalg_eigh_out_info( std::tuple linalg_eigh(const Tensor& input, c10::string_view uplo) { squareCheckInputs(input, "linalg.eigh"); checkUplo(uplo); - ScalarType real_dtype = toValueType(input.scalar_type()); + ScalarType real_dtype = toRealValueType(input.scalar_type()); Tensor values = at::empty({0}, input.options().dtype(real_dtype)); Tensor vectors = at::empty({0}, input.options()); Tensor infos = at::zeros({std::max(1, batchCount(input))}, input.options().dtype(kInt)); @@ -2370,7 +2882,7 @@ std::tuple linalg_eigh_out(const Tensor& input, c10::string_vi checkLinalgCompatibleDtype("torch.linalg.eigh", eigvecs, input, "eigenvectors"); // eigenvalues are always real-valued here - ScalarType real_dtype = toValueType(input.scalar_type()); + ScalarType real_dtype = toRealValueType(input.scalar_type()); 
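// Illustrative usage sketch (not part of this patch) for the linalg_lu / lu_unpack
// composite implemented above, assuming the functional wrappers generated from the
// structured ops (at::linalg_lu, at::lu_unpack) are available:
//   at::Tensor A = at::randn({2, 4, 6});              // batch of two 4x6 matrices
//   at::Tensor P, L, U;
//   std::tie(P, L, U) = at::linalg_lu(A, /*pivot=*/true);
//   // P: (2, 4, 4), L: (2, 4, 4), U: (2, 4, 6); P.matmul(L).matmul(U) ~= A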
checkLinalgCompatibleDtype("torch.linalg.eigh", eigvals.scalar_type(), real_dtype, "eigenvalues"); Tensor eigvals_tmp, eigvecs_tmp; @@ -2393,14 +2905,14 @@ Tensor linalg_eigvalsh(const Tensor& input, c10::string_view uplo) { return values; } - ScalarType real_dtype = toValueType(input.scalar_type()); + ScalarType real_dtype = toRealValueType(input.scalar_type()); Tensor values = at::empty({0}, input.options().dtype(real_dtype)); values = at::linalg_eigvalsh_outf(input, uplo, values); return values; } Tensor& linalg_eigvalsh_out(const Tensor& input, c10::string_view uplo, Tensor& result) { - ScalarType real_dtype = toValueType(input.scalar_type()); + ScalarType real_dtype = toRealValueType(input.scalar_type()); checkLinalgCompatibleDtype("torch.linalg.eigvalsh", result.scalar_type(), real_dtype); squareCheckInputs(input, "linalg.eigvalsh"); @@ -2461,7 +2973,7 @@ static void apply_symeig(Tensor& self, Tensor& eigvals, bool eigenvectors, bool value_t* rwork_data = nullptr; if (isComplexType(at::typeMetaToScalarType(self.dtype()))) { int64_t lrwork = std::max(int64_t(1), 3 * n - 2); - ScalarType dtype = toValueType(typeMetaToScalarType(self.dtype())); + ScalarType dtype = toRealValueType(typeMetaToScalarType(self.dtype())); rwork = at::empty({lrwork}, self.options().dtype(dtype)); rwork_data = rwork.data_ptr(); } @@ -2489,7 +3001,7 @@ std::tuple _symeig_helper_cpu(const Tensor& self, bool eigenvect auto self_sizes = self.sizes().vec(); self_sizes.pop_back(); - ScalarType dtype = toValueType(typeMetaToScalarType(self.dtype())); + ScalarType dtype = toRealValueType(typeMetaToScalarType(self.dtype())); auto eigvals = at::empty(self_sizes, self.options().dtype(dtype)); if (self.numel() == 0) { @@ -2549,7 +3061,7 @@ std::tuple symeig_out(const Tensor& self, bool eigenvectors, b checkSameDevice("symeig", vecs, self, "eigenvectors"); checkLinalgCompatibleDtype("symeig", vecs, self, "eigenvectors"); // eigenvalues are always real-valued here - ScalarType real_dtype = toValueType(self.scalar_type()); + ScalarType real_dtype = toRealValueType(self.scalar_type()); checkLinalgCompatibleDtype("symeig", vals.scalar_type(), real_dtype, "eigenvalues"); Tensor vals_tmp, vecs_tmp; @@ -3068,7 +3580,11 @@ Tensor& linalg_svdvals_out(const Tensor& A, Tensor & S) { } Tensor linalg_svdvals(const Tensor& A) { - return std::get<1>(at::_linalg_svd(A, /*full_matrices=*/false, /*comptue_uv=*/_requires_fw_or_bw_grad(A))); + // NB: Why do we need isTensorSubclassLike check for linalg_svdvals but not linalg_eigvals? + // svdvals is decomposed at the vmap level in functorch so A can be a BatchedTensor wrapping + // a TensorWrapper requiring fw or bw grad. 
+ return std::get<1>(at::_linalg_svd(A, /*full_matrices=*/false, + /*comptue_uv=*/_requires_fw_or_bw_grad(A) || isTensorSubclassLike(A))); } std::tuple svd_out(const Tensor& self, bool some, bool compute_uv, Tensor& U, Tensor& S, Tensor& V) { @@ -3195,7 +3711,7 @@ static void linalg_lstsq_out_info( TORCH_INTERNAL_ASSERT(rank.scalar_type() == at::kLong); TORCH_INTERNAL_ASSERT(rank.device() == input.device()); - auto real_dtype = toValueType(input.scalar_type()); + auto real_dtype = toRealValueType(input.scalar_type()); TORCH_INTERNAL_ASSERT(singular_values.scalar_type() == real_dtype); TORCH_INTERNAL_ASSERT(singular_values.device() == input.device()); @@ -3393,7 +3909,7 @@ std::tuple linalg_lstsq_out( checkLinalgCompatibleDtype("torch.linalg.lstsq", solution, input, "solution"); // 'residuals' is expected to have real float dtype - ScalarType real_dtype = c10::toValueType(input.scalar_type()); + ScalarType real_dtype = c10::toRealValueType(input.scalar_type()); checkLinalgCompatibleDtype("torch.linalg.lstsq", residuals.scalar_type(), real_dtype, "solution"); // 'rank' is expected to have integer dtype @@ -3410,7 +3926,7 @@ std::tuple linalg_lstsq_out( // set default rcond value double rcond_value = rcond.has_value() ? rcond.value() - : _get_epsilon(c10::toValueType(input.scalar_type())) * std::max(input.size(-2), input.size(-1)); + : _get_epsilon(c10::toRealValueType(input.scalar_type())) * std::max(input.size(-2), input.size(-1)); auto infos = at::zeros({std::max(1, batchCount(input))}, input.options().dtype(kInt)); @@ -3524,9 +4040,9 @@ std::tuple linalg_lstsq( c10::optional rcond, c10::optional driver) { Tensor solution = at::empty({0}, input.options()); - Tensor residuals = at::empty({0}, input.options().dtype(toValueType(input.scalar_type()))); + Tensor residuals = at::empty({0}, input.options().dtype(toRealValueType(input.scalar_type()))); Tensor rank = at::empty({0}, input.options().dtype(at::kLong)); - Tensor singular_values = at::empty({0}, input.options().dtype(toValueType(input.scalar_type()))); + Tensor singular_values = at::empty({0}, input.options().dtype(toRealValueType(input.scalar_type()))); std::tie(solution, residuals, rank, singular_values) = at::linalg_lstsq_outf(input, other, rcond, driver, solution, residuals, rank, singular_values); return std::make_tuple(solution, residuals, rank, singular_values); @@ -3700,7 +4216,7 @@ Tensor _det_lu_based_helper_backward_helper( const Tensor& lu, const Tensor& pivs ) { - auto eps = at::native::_get_epsilon(c10::toValueType(self.scalar_type())); + auto eps = at::native::_get_epsilon(c10::toRealValueType(self.scalar_type())); auto n = self.size(-1); auto eps_tensor = at::tensor(eps, self.options()); auto condition_diagonal = [&](const Tensor& x) { @@ -3781,6 +4297,114 @@ Tensor _det_lu_based_helper_backward_helper( } } +DEFINE_DISPATCH(ldl_factor_stub); + +TORCH_IMPL_FUNC(linalg_ldl_factor_ex_out) +(const Tensor& self, + bool hermitian, + bool check_errors, + const Tensor& LD, + const Tensor& pivots, + const Tensor& info) { + // LAPACK workspace query segfalts if the input has 0 in batch dimensions. + if (self.numel() == 0) { + info.zero_(); + return; + } + + auto pivots_ = pivots.expect_contiguous(); + auto info_ = info.expect_contiguous(); + + auto LD_ = at::native::borrow_else_clone( + LD.mT().is_contiguous(), LD, self, /*row_major=*/false); + if (LD.mT().is_contiguous()) { + LD_->copy_(self); + } + + // We decided not to include upper flag in the API. 
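// Illustrative usage sketch (not part of this patch), using the
// at::linalg_ldl_factor_ex / at::linalg_ldl_solve entry points added in this change:
//   at::Tensor A = at::randn({3, 3}, at::kDouble);
//   A = A + A.mT();                            // make A symmetric
//   at::Tensor LD, pivots, info;
//   std::tie(LD, pivots, info) =
//       at::linalg_ldl_factor_ex(A, /*hermitian=*/false, /*check_errors=*/true);
//   at::Tensor B = at::randn({3, 2}, at::kDouble);
//   at::Tensor X = at::linalg_ldl_solve(LD, pivots, B, /*hermitian=*/false);
//   // For a nonsingular A, A.matmul(X) reconstructs B up to rounding error.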
+ // https://github.com/pytorch/pytorch/pull/69828#issuecomment-1015143819 + // We can revisit this decision later and remove upper completely + // also from low level functions or add it to the public API. + bool upper = false; + if (upper) { + LD_->triu_(); + } else { + LD_->tril_(); + } + + // call ldl_factor_stub that fills the result tensors + ldl_factor_stub( + self.device().type(), *LD_, *pivots_, *info_, upper, hermitian); + + if (!LD.is_same(*LD_)) { + LD.copy_(*LD_); + } + if (!info.is_same(*info_)) { + info.copy_(*info_); + } + if (!pivots.is_same(*pivots_)) { + pivots.copy_(*pivots_); + } + + if (check_errors) { + at::_linalg_check_errors( + info, "torch.linalg.ldl_factor_ex", self.dim() == 2); + } +} + +std::tuple linalg_ldl_factor_out( + const Tensor& self, + bool hermitian, + Tensor& LD, + Tensor& pivots) { + auto info = at::empty({0}, self.options().dtype(kInt)); + // We pass check_errors as we want to use lu_factor rather than lu_factor_ex + // in the errors + at::linalg_ldl_factor_ex_outf( + self, hermitian, /*check_errors=*/false, LD, pivots, info); + at::_linalg_check_errors(info, "torch.linalg.ldl_factor", self.dim() == 2); + return std::tie(LD, pivots); +} + +std::tuple linalg_ldl_factor( + const Tensor& self, + bool hermitian) { + Tensor LD, pivots, info; + std::tie(LD, pivots, info) = + at::linalg_ldl_factor_ex(self, hermitian, /*check_errors=*/false); + at::_linalg_check_errors(info, "torch.linalg.ldl_factor", self.dim() == 2); + return std::make_tuple(std::move(LD), std::move(pivots)); +} + +DEFINE_DISPATCH(ldl_solve_stub); + +TORCH_IMPL_FUNC(linalg_ldl_solve_out) +(const Tensor& LD, + const Tensor& pivots, + const Tensor& B, + bool hermitian, + const Tensor& result) { + if (LD.numel() == 0 || pivots.numel() == 0) { + return; + } + + auto pivots_ = pivots.expect_contiguous(); + + auto LD_ = at::native::borrow_else_clone( + LD.mT().is_contiguous(), LD, LD, /*row_major=*/false); + result.copy_(B); + auto result_ = at::native::borrow_else_clone( + result.mT().is_contiguous(), result, result, /*row_major=*/false); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(batchCount(result) == batchCount(*result_)); + + ldl_solve_stub( + B.device().type(), *LD_, *pivots_, *result_, false, hermitian); + + if (!result.is_same(*result_)) { + result.copy_(*result_); + } +} + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ solve_triangular ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ namespace { void checkIsMatrix(const Tensor& t, @@ -4037,4 +4661,28 @@ Tensor linalg_solve_triangular( return out; } +Tensor linalg_vander( + const Tensor& x, + c10::optional N) { + auto t = x.scalar_type(); + TORCH_CHECK(t == ScalarType::Float || + t == ScalarType::Double || + t == ScalarType::ComplexFloat || + t == ScalarType::ComplexDouble || + isIntegralType(t), + "linalg.vander supports floating point, complex, and integer tensors, but got ", t); + const auto x_ = x.dim() == 0 ? 
x.unsqueeze(-1) : x; + + auto shape = x_.sizes().vec(); + const auto n = N.value_or(shape.back()); + TORCH_CHECK(n > 1, "N must be greater than 1."); + + // Append cumprod of the oher 0...n-1 powers + shape.push_back(n - 1); + auto result = at::cumprod(x_.unsqueeze(-1).expand(shape), -1); + // The row of ones + shape.back() = 1LL; + auto ones = result.new_ones(shape); + return at::cat({ones, result}, /*dim=*/ -1); +} }} // namespace at::native diff --git a/aten/src/ATen/native/BatchLinearAlgebra.h b/aten/src/ATen/native/BatchLinearAlgebra.h index f2e4057ad0de..667a6ad793fa 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.h +++ b/aten/src/ATen/native/BatchLinearAlgebra.h @@ -1,11 +1,14 @@ #pragma once -#include +#include #include #include -#include -#include +#include +// Forward declare TI +namespace at { +struct TensorIterator; +} namespace at { namespace native { @@ -161,6 +164,52 @@ void lapackLuSolve(char trans, int n, int nrhs, scalar_t *a, int lda, int *ipiv, template void lapackLu(int m, int n, scalar_t *a, int lda, int *ipiv, int *info); +template +void lapackLdlHermitian( + char uplo, + int n, + scalar_t* a, + int lda, + int* ipiv, + scalar_t* work, + int lwork, + int* info); + +template +void lapackLdlSymmetric( + char uplo, + int n, + scalar_t* a, + int lda, + int* ipiv, + scalar_t* work, + int lwork, + int* info); + +template +void lapackLdlSolveHermitian( + char uplo, + int n, + int nrhs, + scalar_t* a, + int lda, + int* ipiv, + scalar_t* b, + int ldb, + int* info); + +template +void lapackLdlSolveSymmetric( + char uplo, + int n, + int nrhs, + scalar_t* a, + int lda, + int* ipiv, + scalar_t* b, + int ldb, + int* info); + template void lapackSvd(char jobz, int m, int n, scalar_t *a, int lda, value_t *s, scalar_t *u, int ldu, scalar_t *vt, int ldvt, scalar_t *work, int lwork, value_t *rwork, int *iwork, int *info); #endif @@ -228,6 +277,12 @@ using lu_factor_fn = void (*)( bool /*compute_pivots*/); DECLARE_DISPATCH(lu_factor_fn, lu_factor_stub); +using unpack_pivots_fn = void(*)( + TensorIterator& iter, + const int64_t dim_size +); +DECLARE_DISPATCH(unpack_pivots_fn, unpack_pivots_stub); + using lu_solve_fn = void (*)( const Tensor& /*b*/, const Tensor& /*lu*/, @@ -241,6 +296,14 @@ using lu_solve_trans_fn = void (*)( TransposeType /*trans*/); DECLARE_DISPATCH(lu_solve_trans_fn, lu_solve_trans_stub); +using ldl_factor_fn = void (*)( + const Tensor& /*LD*/, + const Tensor& /*pivots*/, + const Tensor& /*info*/, + bool /*upper*/, + bool /*hermitian*/); +DECLARE_DISPATCH(ldl_factor_fn, ldl_factor_stub); + using svd_fn = void (*)( const Tensor& /*A*/, const bool /*full_matrices*/, @@ -251,4 +314,11 @@ using svd_fn = void (*)( const Tensor& /*info*/); DECLARE_DISPATCH(svd_fn, svd_stub); +using ldl_solve_fn = void (*)( + const Tensor& /*LD*/, + const Tensor& /*pivots*/, + const Tensor& /*result*/, + bool /*upper*/, + bool /*hermitian*/); +DECLARE_DISPATCH(ldl_solve_fn, ldl_solve_stub); }} // namespace at::native diff --git a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp index 2bfac093f22c..b2c52afc4cc9 100644 --- a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp @@ -149,7 +149,7 @@ void apply_eig(const Tensor& self, bool eigenvectors, Tensor& vals_, Tensor& vec Tensor rwork; value_t* rwork_data = nullptr; if (self.is_complex()) { - ScalarType real_dtype = toValueType(typeMetaToScalarType(self.dtype())); + ScalarType real_dtype = 
toRealValueType(typeMetaToScalarType(self.dtype())); rwork = at::empty({n*2}, self.options().dtype(real_dtype)); rwork_data = rwork.data_ptr(); } @@ -242,7 +242,7 @@ void apply_linalg_eig(Tensor& values, Tensor& vectors, Tensor& input, Tensor& in Tensor rwork; value_t* rwork_data = nullptr; if (input.is_complex()) { - ScalarType real_dtype = toValueType(input.scalar_type()); + ScalarType real_dtype = toRealValueType(input.scalar_type()); rwork = at::empty({lda * 2}, input.options().dtype(real_dtype)); rwork_data = rwork.data_ptr(); } @@ -647,7 +647,7 @@ void apply_lstsq(const Tensor& A, Tensor& B, Tensor& rank, Tensor& singular_valu default: rwork_len = std::max(1, rwork_opt); } - rwork = at::empty({rwork_len}, A.options().dtype(c10::toValueType(A.scalar_type()))); + rwork = at::empty({rwork_len}, A.options().dtype(c10::toRealValueType(A.scalar_type()))); rwork_data = rwork.data_ptr(); } @@ -833,6 +833,137 @@ void triangular_solve_kernel(const Tensor& A, const Tensor& B, bool left, bool u }); } +template +void apply_ldl_factor( + const Tensor& A, + const Tensor& pivots, + const Tensor& info, + bool upper, + bool hermitian) { +#if !AT_BUILD_WITH_LAPACK() + TORCH_CHECK( + false, + "Calling torch.linalg.ldl_factor on a CPU tensor requires compiling ", + "PyTorch with LAPACK. Please use PyTorch built with LAPACK support."); +#else + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(batchCount(A) > 0); + auto batch_size = batchCount(A); + auto n = A.size(-2); + auto leading_dim = A.stride(-1); + auto uplo = upper ? 'U' : 'L'; + + auto a_stride = A.dim() > 2 ? A.stride(-3) : 0; + auto pivots_stride = pivots.dim() > 1 ? pivots.stride(-2) : 0; + + auto a_data = A.data_ptr(); + auto pivots_data = pivots.data_ptr(); + auto info_data = info.data_ptr(); + + auto ldl_func = + hermitian ? lapackLdlHermitian : lapackLdlSymmetric; + + scalar_t wkopt; + ldl_func(uplo, n, a_data, leading_dim, pivots_data, &wkopt, -1, info_data); + using value_t = typename c10::scalar_value_type::type; + int lwork = std::max(1, real_impl(wkopt)); + Tensor work = at::empty({lwork}, A.dtype()); + auto work_data = work.data_ptr(); + + for (const auto i : c10::irange(batch_size)) { + scalar_t* a_working_ptr = &a_data[i * a_stride]; + auto* pivots_working_ptr = &pivots_data[i * pivots_stride]; + auto* info_working_ptr = &info_data[i]; + ldl_func( + uplo, + n, + a_working_ptr, + leading_dim, + pivots_working_ptr, + work_data, + lwork, + info_working_ptr); + } +#endif +} + +void ldl_factor_kernel( + const Tensor& LD, + const Tensor& pivots, + const Tensor& info, + bool upper, + bool hermitian) { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES( + LD.scalar_type(), "ldl_factor_kernel_cpu", [&] { + apply_ldl_factor(LD, pivots, info, upper, hermitian); + }); +} + +template +void apply_ldl_solve( + const Tensor& A, + const Tensor& pivots, + const Tensor& B, + bool upper, + bool hermitian) { +#if !AT_BUILD_WITH_LAPACK() + TORCH_CHECK( + false, + "Calling torch.linalg.ldl_factor on a CPU tensor requires compiling ", + "PyTorch with LAPACK. Please use PyTorch built with LAPACK support."); +#else + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(batchCount(A) > 0); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(batchCount(pivots.unsqueeze(-1)) > 0); + auto batch_size = batchCount(B); + auto n = A.size(-2); + auto nrhs = B.size(-1); + auto lda = A.stride(-1); + auto ldb = B.stride(-1); + auto uplo = upper ? 'U' : 'L'; + + auto a_stride = A.dim() > 2 ? A.stride(-3) : 0; + auto b_stride = B.dim() > 2 ? B.stride(-3) : 0; + auto pivots_stride = pivots.dim() > 1 ? 
pivots.stride(-2) : 0; + + auto a_data = A.data_ptr(); + auto b_data = B.data_ptr(); + auto pivots_ = pivots.to(kInt); + auto pivots_data = pivots_.data_ptr(); + + auto ldl_solve_func = hermitian ? lapackLdlSolveHermitian + : lapackLdlSolveSymmetric; + + int info = 0; + for (const auto i : c10::irange(batch_size)) { + scalar_t* a_working_ptr = &a_data[i * a_stride]; + scalar_t* b_working_ptr = &b_data[i * b_stride]; + auto* pivots_working_ptr = &pivots_data[i * pivots_stride]; + ldl_solve_func( + uplo, + n, + nrhs, + a_working_ptr, + lda, + pivots_working_ptr, + b_working_ptr, + ldb, + &info); + } + TORCH_INTERNAL_ASSERT(info == 0); +#endif +} + +void ldl_solve_kernel( + const Tensor& LD, + const Tensor& pivots, + const Tensor& result, + bool upper, + bool hermitian) { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES( + LD.scalar_type(), "ldl_solve_kernel_cpu", [&] { + apply_ldl_solve(LD, pivots, result, upper, hermitian); + }); +} + /* Computes the LU decomposition of a m×n matrix or batch of matrices in 'input' tensor. This is an in-place routine, content of 'input', 'pivots', and 'infos' is overwritten. @@ -851,7 +982,7 @@ void apply_lu_factor(const Tensor& input, const Tensor& pivots, const Tensor& in #if !AT_BUILD_WITH_LAPACK() TORCH_CHECK( false, - "Calling torch.lu on a CPU tensor requires compiling ", + "Calling torch.linalg.lu_factor on a CPU tensor requires compiling ", "PyTorch with LAPACK. Please use PyTorch built with LAPACK support."); #else TORCH_CHECK(compute_pivots, "linalg.lu_factor: LU without pivoting is not implemented on the CPU"); @@ -908,8 +1039,8 @@ void apply_lu_solve(const Tensor& b, const Tensor& lu, const Tensor& pivots, Tra const auto trans = to_blas(transpose); auto pivots_data = pivots.data_ptr(); auto b_stride = matrixStride(b); - auto lu_stride = matrixStride(lu); - auto pivots_stride = pivots.size(-1); + auto lu_stride = lu.dim() > 2 ? lu.stride(-3) : 0; + auto pivots_stride = pivots.dim() > 1 ? pivots.stride(-2) : 0; auto batch_size = batchCount(b); auto n = lu.size(-2); @@ -917,10 +1048,19 @@ void apply_lu_solve(const Tensor& b, const Tensor& lu, const Tensor& pivots, Tra auto leading_dimension = std::max(1, n); int info = 0; + + // lu and pivots tensors can be broadcast to b + // here we construct a helper indexing tensor to linearly index into lu and pivots + IntArrayRef lu_batch_shape(lu.sizes().data(), lu.dim() - 2); + IntArrayRef b_batch_shape(b.sizes().data(), b.dim() - 2); + BroadcastLinearIndices lu_index( + batchCount(lu), lu_batch_shape, b_batch_shape); + for (const auto i : c10::irange(batch_size)) { + int64_t lu_index_i = lu_index(i); scalar_t* b_working_ptr = &b_data[i * b_stride]; - scalar_t* lu_working_ptr = &lu_data[i * lu_stride]; - int* pivots_working_ptr = &pivots_data[i * pivots_stride]; + scalar_t* lu_working_ptr = &lu_data[lu_index_i * lu_stride]; + int* pivots_working_ptr = &pivots_data[lu_index_i * pivots_stride]; lapackLuSolve(trans, n, nrhs, lu_working_ptr, leading_dimension, pivots_working_ptr, b_working_ptr, leading_dimension, &info); @@ -1021,6 +1161,32 @@ void svd_kernel(const Tensor& A, }); } +void unpack_pivots_cpu_kernel(TensorIterator& iter, const int64_t dim_size) { + auto loop = [&](char* const* const data, const int64_t* const strides, const int64_t nelems) { + auto* perm_ptr = data[0]; + const auto* pivots_ptr = data[1]; + + for (const auto elem : c10::irange(nelems)) { + (void)elem; //Suppress unused variable warning + // WARNING: linalg.lu_factor returns int32 pivots, + // this behavior could change in the future. 
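// Illustrative aside (not part of this patch): the swap loop below turns LAPACK's
// 1-based interchange pivots into a 0-based permutation by replaying the recorded
// row swaps on an identity permutation. Worked example for a 3x3 factorization
// with pivots = {3, 3, 3}:
//   perm = {0, 1, 2}
//   i = 0: swap(perm[0], perm[2]) -> {2, 1, 0}
//   i = 1: swap(perm[1], perm[2]) -> {2, 0, 1}
//   i = 2: swap(perm[2], perm[2]) -> {2, 0, 1}   (final permutation)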
+ const auto perm_data = reinterpret_cast(perm_ptr); + const auto pivots_data = reinterpret_cast(pivots_ptr); + + for (const auto i : c10::irange(dim_size)) { + std::swap( + perm_data[i], + perm_data[pivots_data[i] - 1] + ); + } + + perm_ptr += strides[0]; + pivots_ptr += strides[1]; + } + }; + + iter.for_each(loop); +} } // anonymous namespace REGISTER_ARCH_DISPATCH(cholesky_stub, DEFAULT, &cholesky_kernel); @@ -1089,6 +1255,18 @@ REGISTER_AVX2_DISPATCH(lu_factor_stub, &lu_factor_kernel); REGISTER_VSX_DISPATCH(lu_factor_stub, &lu_factor_kernel); REGISTER_ZVECTOR_DISPATCH(lu_factor_stub, &lu_factor_kernel); +REGISTER_ARCH_DISPATCH(ldl_factor_stub, DEFAULT, &ldl_factor_kernel); +REGISTER_AVX512_DISPATCH(ldl_factor_stub, &ldl_factor_kernel); +REGISTER_AVX2_DISPATCH(ldl_factor_stub, &ldl_factor_kernel); +REGISTER_VSX_DISPATCH(ldl_factor_stub, &ldl_factor_kernel); +REGISTER_ZVECTOR_DISPATCH(ldl_factor_stub, &ldl_factor_kernel); + +REGISTER_ARCH_DISPATCH(ldl_solve_stub, DEFAULT, &ldl_solve_kernel); +REGISTER_AVX512_DISPATCH(ldl_solve_stub, &ldl_solve_kernel); +REGISTER_AVX2_DISPATCH(ldl_solve_stub, &ldl_solve_kernel); +REGISTER_VSX_DISPATCH(ldl_solve_stub, &ldl_solve_kernel); +REGISTER_ZVECTOR_DISPATCH(ldl_solve_stub, &ldl_solve_kernel); + REGISTER_ARCH_DISPATCH(lu_solve_trans_stub, DEFAULT, &lu_solve_trans_kernel); REGISTER_AVX512_DISPATCH(lu_solve_trans_stub, &lu_solve_trans_kernel); REGISTER_AVX2_DISPATCH(lu_solve_trans_stub, &lu_solve_trans_kernel); @@ -1106,4 +1284,10 @@ REGISTER_AVX512_DISPATCH(svd_stub, &svd_kernel); REGISTER_AVX2_DISPATCH(svd_stub, &svd_kernel); REGISTER_VSX_DISPATCH(svd_stub, &svd_kernel); REGISTER_ZVECTOR_DISPATCH(svd_stub, &svd_kernel); + +REGISTER_ARCH_DISPATCH(unpack_pivots_stub, DEFAULT, &unpack_pivots_cpu_kernel); +REGISTER_AVX512_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel); +REGISTER_AVX2_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel); +REGISTER_VSX_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel); +REGISTER_ZVECTOR_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel); }} // namespace at::native diff --git a/aten/src/ATen/native/BinaryOps.cpp b/aten/src/ATen/native/BinaryOps.cpp index bdd6c87403e3..e6ba1dc47428 100644 --- a/aten/src/ATen/native/BinaryOps.cpp +++ b/aten/src/ATen/native/BinaryOps.cpp @@ -21,10 +21,11 @@ namespace native { static void check_convert(const Scalar& scalar, ScalarType scalarType) { // Validate that is possible to convert scalar to tensor dtype without // overflow - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( at::ScalarType::Bool, at::ScalarType::BFloat16, at::ScalarType::Half, + at::ScalarType::ComplexHalf, scalarType, "check_convert", [&] { scalar.to(); }); @@ -232,10 +233,9 @@ CREATE_COMPARISON_SCALAR_TENSOR_META_FUNC(ge); namespace native { -DEFINE_DISPATCH(add_stub); DEFINE_DISPATCH(add_clamp_stub); -DEFINE_DISPATCH(sub_stub); DEFINE_DISPATCH(mul_stub); +DEFINE_DISPATCH(sub_stub); DEFINE_DISPATCH(div_true_stub); DEFINE_DISPATCH(div_floor_stub); DEFINE_DISPATCH(div_trunc_stub); @@ -277,17 +277,10 @@ DEFINE_DISPATCH(xlogy_stub); DEFINE_DISPATCH(xlog1py_stub); DEFINE_DISPATCH(zeta_stub); -TORCH_IMPL_FUNC(add_out) ( - const Tensor& self, const Tensor& other, const Scalar& alpha, const Tensor& result -) { - add_stub(device_type(), *this, alpha); - TORCH_INTERNAL_ASSERT(result.scalar_type() == output().dtype()); -} - TORCH_IMPL_FUNC(sub_out) ( const Tensor& self, const Tensor& other, const Scalar& alpha, const Tensor& result ) { - sub_stub(device_type(), *this, alpha); 
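// Illustrative aside (not part of this patch): the replacement line below routes
// subtraction through the add kernel with a negated alpha, using the identity
// self - alpha * other == self + (-alpha) * other, e.g.:
//   at::Tensor a = at::full({}, 5.0);
//   at::Tensor b = at::full({}, 2.0);
//   at::sub(a, b, /*alpha=*/3.0);   // -1.0, the same value as at::add(a, b, -3.0)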
+ add_stub(device_type(), *this, -alpha); TORCH_INTERNAL_ASSERT(result.scalar_type() == output().dtype()); } @@ -626,6 +619,11 @@ Tensor& mul_(Tensor& self, const Scalar& other) { return at::mul_out(self, wrapped_scalar_tensor(other), self); // redispatch! } +Tensor& mul__scalar_sparse_csr(Tensor& self, const Scalar& other) { + self.values().mul_(other); + return self; +} + Device correct_out_device(const Tensor& self, const Tensor& other) { if (self.device() == at::kCPU){ return other.device(); @@ -643,8 +641,6 @@ Tensor mul_zerotensor(const Tensor& self, const Tensor& other) { } Tensor div_zerotensor(const Tensor& self, const Tensor& other) { - TORCH_INTERNAL_ASSERT(self._is_zerotensor() || other._is_zerotensor()); - auto out_device = correct_out_device(self, other); // hack to use the TensorIterator to get the correct broadcasting and type promotion logic auto device_ = Device(DeviceType::Meta); @@ -672,7 +668,7 @@ Tensor div_zerotensor(const Tensor& self, const Tensor& other) { } } -Tensor add_zerotensor(const Tensor& self, const Tensor& other, const Scalar& alpha) { +Tensor maybe_add_maybe_sub(const Tensor& self, const Tensor& other, const Scalar& alpha) { auto out_device = correct_out_device(self, other); // hack to use the TensorIterator to get the correct broadcasting and type promotion logic auto device_ = Device(DeviceType::Meta); @@ -694,6 +690,33 @@ Tensor add_zerotensor(const Tensor& self, const Tensor& other, const Scalar& alp return get_out_like(self); } } +Tensor add_zerotensor(const Tensor& self, const Tensor& other, const Scalar& alpha) { + return maybe_add_maybe_sub(self, other, alpha); +} + +Tensor sub_zerotensor(const Tensor& self, const Tensor& other, const Scalar& alpha) { + return maybe_add_maybe_sub(self, other, -alpha); +} + +Tensor linalg_cross_zerotensor( + const Tensor& input, + const Tensor& other, + const int64_t dim) +{ + auto out_device = correct_out_device(input, other); + // hack to use the TensorIterator to get the correct broadcasting and type + // promotion logic (see add_zerotensor) + auto device = Device(DeviceType::Meta); + auto meta_out = at::redispatch::linalg_cross( + c10::DispatchKeySet(at::DispatchKey::Meta), + input.to(device), + other.to(device), + dim); + + return at::_efficientzerotensor( + meta_out.sizes(), + meta_out.options().device(out_device)); +} // multiply, alias for mul Tensor& multiply_out(const Tensor& self, const Tensor& other, Tensor& result) { @@ -791,6 +814,10 @@ Tensor bitwise_and(const Tensor& self, const Scalar& other) { return at::bitwise_and(self, wrapped_scalar_tensor(other)); } +Tensor bitwise_and(const Scalar& self, const Tensor& other) { + return at::bitwise_and(wrapped_scalar_tensor(self), other); +} + Tensor& bitwise_and_(Tensor& self, const Scalar& other) { return self.bitwise_and_(wrapped_scalar_tensor(other)); } @@ -820,6 +847,10 @@ Tensor bitwise_or(const Tensor& self, const Scalar& other) { return at::bitwise_or(self, wrapped_scalar_tensor(other)); } +Tensor bitwise_or(const Scalar& self, const Tensor& other) { + return at::bitwise_or(wrapped_scalar_tensor(self), other); +} + Tensor& bitwise_or_(Tensor& self, const Scalar& other) { return self.bitwise_or_(wrapped_scalar_tensor(other)); } @@ -849,6 +880,10 @@ Tensor bitwise_xor(const Tensor& self, const Scalar& other) { return at::bitwise_xor(self, wrapped_scalar_tensor(other)); } +Tensor bitwise_xor(const Scalar& self, const Tensor& other) { + return at::bitwise_xor(wrapped_scalar_tensor(self), other); +} + Tensor& bitwise_xor_(Tensor& self, const Scalar& 
other) { return self.bitwise_xor_(wrapped_scalar_tensor(other)); } @@ -879,7 +914,7 @@ Tensor __lshift__(const Tensor& self, const Tensor& other) { Tensor __lshift__(const Tensor& self, const Scalar& other) { Tensor result; - auto wrapper = wrapped_scalar_tensor(other).toType(self.scalar_type()); + auto wrapper = wrapped_scalar_tensor(other); auto iter = TensorIterator::binary_op(result, self, wrapper); lshift_stub(iter.device_type(), iter); return iter.output(); @@ -892,7 +927,7 @@ Tensor& __ilshift__(Tensor& self, const Tensor& other) { } Tensor& __ilshift__(Tensor& self, const Scalar& other) { - auto wrapper = wrapped_scalar_tensor(other).toType(self.scalar_type()); + auto wrapper = wrapped_scalar_tensor(other); auto iter = TensorIterator::binary_op(self, self, wrapper); lshift_stub(iter.device_type(), iter); return self; @@ -903,19 +938,19 @@ TORCH_IMPL_FUNC(bitwise_left_shift_out) (const Tensor& self, const Tensor& other } Tensor& bitwise_left_shift_out(const Tensor& self, const Scalar& other, Tensor& result) { - return at::bitwise_left_shift_out(result, self, wrapped_scalar_tensor(other).toType(self.scalar_type())); + return at::bitwise_left_shift_out(result, self, wrapped_scalar_tensor(other)); } Tensor bitwise_left_shift(const Tensor& self, const Scalar& other) { - return at::bitwise_left_shift(self, wrapped_scalar_tensor(other).toType(self.scalar_type())); + return at::bitwise_left_shift(self, wrapped_scalar_tensor(other)); } Tensor& bitwise_left_shift_(Tensor& self, const Scalar& other) { - return at::bitwise_left_shift_out(self, self, wrapped_scalar_tensor(other).toType(self.scalar_type())); + return at::bitwise_left_shift_out(self, self, wrapped_scalar_tensor(other)); } Tensor bitwise_left_shift(const Scalar& self, const Tensor& other) { - return at::bitwise_left_shift(wrapped_scalar_tensor(self).toType(other.scalar_type()), other); + return at::bitwise_left_shift(wrapped_scalar_tensor(self), other); } Tensor __rshift__(const Tensor& self, const Tensor& other) { @@ -927,7 +962,7 @@ Tensor __rshift__(const Tensor& self, const Tensor& other) { Tensor __rshift__(const Tensor& self, const Scalar& other) { Tensor result; - auto wrapper = wrapped_scalar_tensor(other).toType(self.scalar_type()); + auto wrapper = wrapped_scalar_tensor(other); auto iter = TensorIterator::binary_op(result, self, wrapper); rshift_stub(iter.device_type(), iter); return iter.output(); @@ -940,7 +975,7 @@ Tensor& __irshift__(Tensor& self, const Tensor& other) { } Tensor& __irshift__(Tensor& self, const Scalar& other) { - auto wrapper = wrapped_scalar_tensor(other).toType(self.scalar_type()); + auto wrapper = wrapped_scalar_tensor(other); auto iter = TensorIterator::binary_op(self, self, wrapper); rshift_stub(iter.device_type(), iter); return self; @@ -951,19 +986,19 @@ TORCH_IMPL_FUNC(bitwise_right_shift_out) (const Tensor& self, const Tensor& othe } Tensor& bitwise_right_shift_out(const Tensor& self, const Scalar& other, Tensor& result) { - return at::bitwise_right_shift_out(result, self, wrapped_scalar_tensor(other).toType(self.scalar_type())); + return at::bitwise_right_shift_out(result, self, wrapped_scalar_tensor(other)); } Tensor bitwise_right_shift(const Tensor& self, const Scalar& other) { - return at::bitwise_right_shift(self, wrapped_scalar_tensor(other).toType(self.scalar_type())); + return at::bitwise_right_shift(self, wrapped_scalar_tensor(other)); } Tensor& bitwise_right_shift_(Tensor& self, const Scalar& other) { - return at::bitwise_right_shift_out(self, self, 
wrapped_scalar_tensor(other).toType(self.scalar_type())); + return at::bitwise_right_shift_out(self, self, wrapped_scalar_tensor(other)); } Tensor bitwise_right_shift(const Scalar& self, const Tensor& other) { - return at::bitwise_right_shift(wrapped_scalar_tensor(self).toType(other.scalar_type()), other); + return at::bitwise_right_shift(wrapped_scalar_tensor(self), other); } template diff --git a/aten/src/ATen/native/BinaryOps.h b/aten/src/ATen/native/BinaryOps.h index aea2a125bb07..f34f210c4e48 100644 --- a/aten/src/ATen/native/BinaryOps.h +++ b/aten/src/ATen/native/BinaryOps.h @@ -50,7 +50,9 @@ using binary_fn = void(*)(TensorIterator&); using binary_clamp_fn_alpha = void(*)(TensorIterator&, const Scalar& alpha, const Scalar& min_val, const Scalar& max_val); +// NB: codegenned DECLARE_DISPATCH(structured_binary_fn_alpha, add_stub); + DECLARE_DISPATCH(binary_clamp_fn_alpha, add_clamp_stub); DECLARE_DISPATCH(structured_binary_fn_alpha, sub_stub); DECLARE_DISPATCH(structured_binary_fn, mul_stub); @@ -84,7 +86,7 @@ DECLARE_DISPATCH(binary_fn_double, huber_stub); DECLARE_DISPATCH(structured_binary_fn, sigmoid_backward_stub); DECLARE_DISPATCH(binary_fn_alpha, logit_backward_stub); DECLARE_DISPATCH(structured_binary_fn, tanh_backward_stub); -DECLARE_DISPATCH(binary_fn, mse_stub); +DECLARE_DISPATCH(structured_binary_fn, mse_stub); DECLARE_DISPATCH(structured_binary_fn, fmod_stub); DECLARE_DISPATCH(structured_binary_fn, logaddexp_stub); DECLARE_DISPATCH(structured_binary_fn, logaddexp2_stub); diff --git a/aten/src/ATen/native/Blas.cpp b/aten/src/ATen/native/Blas.cpp index 04a12cb4e400..26c3804219e0 100644 --- a/aten/src/ATen/native/Blas.cpp +++ b/aten/src/ATen/native/Blas.cpp @@ -165,7 +165,7 @@ Tensor dot(const Tensor &self, const Tensor &other){ return r; } - return AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "dot", [&] { + return AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(at::ScalarType::BFloat16, self.scalar_type(), "dot", [&] { Tensor result = at::empty({}, self.options()); result.fill_(dot_impl(self.numel(), self.data_ptr(), self.stride(0), other.data_ptr(), other.stride(0))); return result; diff --git a/aten/src/ATen/native/Bucketization.cpp b/aten/src/ATen/native/Bucketization.cpp index 728222090542..15d30c137d5b 100644 --- a/aten/src/ATen/native/Bucketization.cpp +++ b/aten/src/ATen/native/Bucketization.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include diff --git a/aten/src/ATen/native/BucketizationUtils.h b/aten/src/ATen/native/BucketizationUtils.h index 7122723cf1ed..e23fa1267807 100644 --- a/aten/src/ATen/native/BucketizationUtils.h +++ b/aten/src/ATen/native/BucketizationUtils.h @@ -1,7 +1,14 @@ #pragma once -#include +#include #include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif namespace at { namespace native { diff --git a/aten/src/ATen/native/CPUBlas.h b/aten/src/ATen/native/CPUBlas.h index a789f58140db..80248fb23392 100644 --- a/aten/src/ATen/native/CPUBlas.h +++ b/aten/src/ATen/native/CPUBlas.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include #include #include #include diff --git a/aten/src/ATen/native/ComplexHelper.h b/aten/src/ATen/native/ComplexHelper.h index e9efd4b7c88d..88668d13145c 100644 --- a/aten/src/ATen/native/ComplexHelper.h +++ b/aten/src/ATen/native/ComplexHelper.h @@ -40,7 +40,7 @@ Tensor _view_as_real_physical(const Tensor& self) { new_sizes.back() = 2; auto new_strides = computeStrideForViewAsReal(self.strides()); auto new_storage_offset = 2 
* self.storage_offset(); - const auto float_type = c10::toValueType(self.scalar_type()); + const auto float_type = c10::toRealValueType(self.scalar_type()); auto real_tensor = view_tensor(self, float_type, new_storage_offset, new_sizes, new_strides); return real_tensor; } diff --git a/aten/src/ATen/native/ConstantPadNd.cpp b/aten/src/ATen/native/ConstantPadNd.cpp deleted file mode 100644 index f7a2d76ed522..000000000000 --- a/aten/src/ATen/native/ConstantPadNd.cpp +++ /dev/null @@ -1,87 +0,0 @@ -#include - -#include - -namespace at { namespace native { - -Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, const Scalar& value) { - TORCH_CHECK(pad.size() % 2 == 0, "Length of pad must be even but instead it equals ", - pad.size()); - - auto input_sizes = self.sizes(); - auto l_inp = self.dim(); - - auto l_pad = pad.size() / 2; - auto l_diff = l_inp - l_pad; - TORCH_CHECK(l_inp >= (int64_t)l_pad, "Length of pad should be no more than twice the number of " - "dimensions of the input. Pad length is ", pad.size(), "while the input has ", - l_inp, "dimensions."); - - std::vector new_shape; - - bool all_pads_non_positive = true; - - auto c_input = self; - for (const auto i : c10::irange(l_diff, l_inp)) { - auto pad_idx = 2 * (l_inp - i - 1); - if (pad[pad_idx] < 0) { - c_input = c_input.narrow(i, -pad[pad_idx], c_input.size(i) + pad[pad_idx]); - } else if (pad[pad_idx] != 0) { - all_pads_non_positive = false; - } - if (pad[pad_idx + 1] < 0) { - c_input = c_input.narrow(i, 0, c_input.size(i) + pad[pad_idx + 1]); - } else if (pad[pad_idx + 1] != 0) { - all_pads_non_positive = false; - } - } - - // if none of the pads are positive we can optimize and just return the result - // of calling .narrow() on the input - if (all_pads_non_positive) { - return c_input.clone(); - } - - - for (size_t i = 0; i < (size_t)l_diff; i ++) { - new_shape.emplace_back(input_sizes[i]); - } - - for (const auto i : c10::irange((size_t)l_pad)) { - auto pad_idx = pad.size() - ((i + 1) * 2); - auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1]; - TORCH_CHECK(new_dim > 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ", - pad[pad_idx], " and ", pad[pad_idx + 1], " resulted in a negative output size, " - "which is invalid. 
Check dimension ", l_diff + i, " of your input."); - new_shape.emplace_back(new_dim); - } - - at::Tensor output; - const auto memory_format = self.suggest_memory_format(); - if (self.is_quantized()) { - const auto qscheme = self.qscheme(); - TORCH_CHECK(qscheme == kPerTensorAffine || qscheme == kPerTensorSymmetric, - "Only per-tensor padding is supported."); - output = at::_empty_affine_quantized( - new_shape, self.options().memory_format(memory_format), - self.q_scale(), self.q_zero_point(), c10::nullopt); - } else { - output = at::empty(new_shape, self.options().memory_format(memory_format)); - } - output.fill_(value); - - auto c_output = output; - for (const auto i : c10::irange(l_diff, l_inp)) { - auto pad_idx = 2 * (l_inp - i - 1); - if (pad[pad_idx] > 0) { - c_output = c_output.narrow(i, pad[pad_idx], c_output.size(i) - pad[pad_idx]); - } - if (pad[pad_idx + 1] > 0) { - c_output = c_output.narrow(i, 0, c_output.size(i) - pad[pad_idx + 1]); - } - } - c_output.copy_(c_input); - return output; -} - -}} // namespace at::native diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h index 74b87e76b39f..8493deba7b33 100644 --- a/aten/src/ATen/native/ConvUtils.h +++ b/aten/src/ATen/native/ConvUtils.h @@ -1,5 +1,6 @@ #pragma once #include +#include #include #include #include @@ -19,6 +20,10 @@ using cudnn_convolution_backward_fn = std::tuple(*)( const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, int64_t, bool, bool, bool, std::array); DECLARE_DISPATCH(cudnn_convolution_backward_fn, cudnn_convolution_backward_stub); +using mps_convolution_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, int64_t, std::array); +DECLARE_DISPATCH(mps_convolution_backward_fn, mps_convolution_backward_stub); using cudnn_convolution_transpose_backward_fn = std::tuple(*)( const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, int64_t, bool, bool, bool, std::array); @@ -56,6 +61,25 @@ using slow_conv_transpose3d_backward_fn = std::tuple); DECLARE_DISPATCH(slow_conv_transpose3d_backward_fn, slow_conv_transpose3d_backward_stub); +namespace { + static bool cudnnv8_heuristic_mode_b = c10::utils::check_env("TORCH_CUDNN_USE_HEURISTIC_MODE_B") == true; +} + +static inline bool cudnnv8_enabled_check_debug() { + static bool cudnnv8_flag = c10::utils::check_env("TORCH_CUDNN_V8_API_ENABLED") == true; + static bool cudnnv8_debug = c10::utils::check_env("TORCH_CUDNN_V8_API_DEBUG") == true; + static uint8_t cudnnv8_debugcount = 0; + if (cudnnv8_debug == 1 && cudnnv8_debugcount < 10) { + TORCH_WARN("TORCH_CUDNN_V8_DEBUG ON, V8_FLAG: ", cudnnv8_flag, " TORCH_CUDNN_USE_HEURISTIC_MODE B: ", cudnnv8_heuristic_mode_b); + cudnnv8_debugcount++; + } + return cudnnv8_flag == 1; +} + +static inline bool cudnnv8_use_heur_mode_b() { + return cudnnv8_heuristic_mode_b; +} + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) struct ConvParams { std::vector stride; @@ -85,7 +109,8 @@ struct ConvParams { bool use_mkldnn(const at::Tensor& input, const at::Tensor& weight) const; bool use_nnpack(const at::Tensor& input, const at::Tensor& weight) const; bool use_xnnpack(const at::Tensor& input, const at::Tensor& weight, - const c10::optional bias_sizes_opt) const; + const at::OptionalIntArrayRef bias_sizes_opt) const; + bool use_mps(const at::Tensor& input, const at::Tensor& weight) const; bool 
is_depthwise(const at::Tensor& input, const at::Tensor& weight) const; }; @@ -109,7 +134,9 @@ enum class ConvBackend { SlowTranspose2d, SlowTranspose3d, Winograd3x3Depthwise, - Xnnpack2d + Xnnpack2d, + Mps, + MpsTranspose, }; // Function to select the convolution backend based on the inputs and params. @@ -120,7 +147,7 @@ enum class ConvBackend { TORCH_API ConvBackend select_conv_backend( const Tensor& input, const Tensor& weight, - const c10::optional bias_sizes_opt, + const at::OptionalIntArrayRef bias_sizes_opt, const bool need_backward, const ConvParams& params); @@ -147,6 +174,69 @@ constexpr int weight_input_channels_dim = 1; // Often written as 2 + max_dim (extra dims for batch size and channels) constexpr int max_dim = 3; +// --------------------------------------------------------------------- +// +// Checking +// +// --------------------------------------------------------------------- + +// Used on pad, stride and dilation +static void check_args(CheckedFrom c, IntArrayRef args, size_t expected_size, const char* arg_name) +{ + TORCH_CHECK(args.size() <= expected_size, + "Too many ", arg_name, " values (", args.size(), ") supplied, expecting ", + expected_size, " (while checking arguments for ", c, ")"); + TORCH_CHECK(args.size() >= expected_size, + "Not enough ", arg_name, " values (", args.size(), ") supplied, expecting ", + expected_size, " (while checking arguments for ", c, ")"); + + auto num_negative_values = std::count_if(args.begin(), args.end(), [](int x){return x < 0;}); + if (num_negative_values > 0){ + std::stringstream ss; + ss << arg_name << " should be greater than zero but got ("; + std::copy(args.begin(), args.end() - 1, std::ostream_iterator(ss,", ")); + ss << args.back() << ")" << " (while checking arguments for " << c << ")"; + AT_ERROR(ss.str()); + } +} + + +// NOTE [ Convolution checks ] +// +// NB: For many call sites, it is not strictly necessary to check all of +// these relationships (for example, for forward convolution, we compute +// the size of output ourselves, so we don't actually need to check +// output. However, writing a single function that does everything +// means we get to reuse it for both forwards and all backwards +// variants, even when the set of "real" inputs varies. The magic of +// relational computing! +// +// (There is one downside, which is that it is slightly harder to write +// error messages which are able to distinguish between real inputs +// (which the user can change) and computed inputs (which the user can +// only indirectly affect). It would be an interesting exercise to +// come up with a general framework to handle such situations.) 
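As context for the shape checks above (and the conv_output_size / conv_input_size note that follows), here is a minimal standalone illustration of the size arithmetic involved; the helper names and values are illustrative only, not the ATen functions. The forward formula loses information through its floor division, which is why recovering an input size for the transposed direction needs an explicit output_padding.

#include <cstdint>
#include <cstdio>

// Illustrative stand-ins for the 1-D size relations behind the shape checks.
int64_t output_size_1d(int64_t in, int64_t kernel, int64_t stride,
                       int64_t pad, int64_t dilation) {
  // floor((in + 2*pad - dilation*(kernel-1) - 1) / stride) + 1
  return (in + 2 * pad - dilation * (kernel - 1) - 1) / stride + 1;
}

int64_t input_size_1d(int64_t out, int64_t kernel, int64_t stride,
                      int64_t pad, int64_t dilation, int64_t output_padding) {
  // Inverse of the formula above, up to the information lost by the floor;
  // output_padding selects which of the candidate input sizes is meant.
  return (out - 1) * stride - 2 * pad + dilation * (kernel - 1) + 1 + output_padding;
}

int main() {
  // Inputs of size 7 and 8 both map to an output of size 4 with
  // kernel=3, stride=2, pad=1, dilation=1 ...
  std::printf("%lld %lld\n",
              (long long)output_size_1d(7, 3, 2, 1, 1),
              (long long)output_size_1d(8, 3, 2, 1, 1));  // prints: 4 4
  // ... so the inverse needs output_padding to disambiguate.
  std::printf("%lld %lld\n",
              (long long)input_size_1d(4, 3, 2, 1, 1, /*output_padding=*/0),   // 7
              (long long)input_size_1d(4, 3, 2, 1, 1, /*output_padding=*/1));  // 8
  return 0;
}
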
+static void convolution_shape_check( + CheckedFrom c, + const TensorGeometryArg& input, const TensorGeometryArg& weight, const TensorGeometryArg& output, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups) +{ + check_args(c, padding, input->dim() - 2, "padding"); + check_args(c, stride, padding.size(), "stride"); + check_args(c, dilation, padding.size(), "dilation"); + + // Input + checkDimRange(c, input, 3, 6 /* exclusive */); + checkSize(c, input, input_channels_dim, weight->size(1) * groups); + + // Weight + checkSameDim(c, input, weight); + + // TODO: check that output->size() matches output_sizes + // TODO: check that weight matches output->sizes() + checkSameDim(c, input, output); +} + // NB: conv_output_size and conv_input_size are not bijections, // as conv_output_size loses information; this is why conv_input_size // takes an extra output_padding argument to resolve the ambiguity. @@ -270,4 +360,42 @@ static inline bool miopen_conv_use_channels_last(const at::Tensor& input, const return can_use_miopen_channels_last_2d || can_use_miopen_channels_last_3d; } +static inline bool mkldnn_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) { + + // disable NHWC for float64 input. + if (input.scalar_type() == at::kDouble || + weight.scalar_type() == at::kDouble) { + return false; + } + + // disable NHWC for MkldnnCPU tensor. + if (input.is_mkldnn() || weight.is_mkldnn()) { + return false; + } + + auto input_memory_format = input.suggest_memory_format(); + auto weight_memory_format = weight.suggest_memory_format(); + + bool can_use_mkldnn_channels_last_2d = + (input_memory_format == at::MemoryFormat::ChannelsLast) || + (weight_memory_format == at::MemoryFormat::ChannelsLast); + + // TODO: add channels last 3d support + bool can_use_mkldnn_channels_last_3d = false; + + return can_use_mkldnn_channels_last_2d || can_use_mkldnn_channels_last_3d; +} + +static inline bool thnn_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) { + + auto input_memory_format = input.suggest_memory_format(); + auto weight_memory_format = weight.suggest_memory_format(); + + bool can_use_thnn_channels_last_2d = input.device().is_cpu() && ( + (input_memory_format == at::MemoryFormat::ChannelsLast) || ( + weight_memory_format == at::MemoryFormat::ChannelsLast)); + + return can_use_thnn_channels_last_2d; +} + }} // namespace at::native diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index 5a3275239716..a6127a53577b 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -19,6 +19,10 @@ #include #endif +#if AT_MKLDNN_ENABLED() +#include +#endif + constexpr int MIOPEN_DIM_MAX = 5; namespace at { namespace native { @@ -190,8 +194,8 @@ auto ConvParams::use_cudnn(const at::Tensor& input, const at::Tensor& weight) co if (!input.is_cuda() || !cudnn_enabled) { return false; } - if (input.scalar_type() == at::kBFloat16 || weight.scalar_type() == at::kBFloat16) { - return false; + if (input.scalar_type() == at::kBFloat16 || weight.scalar_type() == at::kBFloat16) { + return at::native::cudnnv8_enabled_check_debug(); } if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous) { // bypass dilation checks for channels_last convolution @@ -209,6 +213,22 @@ auto ConvParams::use_cudnn(const at::Tensor& input, const at::Tensor& weight) co #endif } +auto ConvParams::use_mps( const at::Tensor& input, const at::Tensor& weight) const -> bool { + // These checks 
need to be expanded. Currently we have very limited set of + // checks for MPS. +#ifdef USE_MPS + if (needs_64bit_indexing_no_split(input, weight)) { + return false; + } + if (!input.is_mps()) { + return false; + } + return true; +#else + return false; +#endif +} + auto ConvParams::use_miopen(const at::Tensor& input, const at::Tensor& weight, bool bias_defined) const -> bool { if (needs_64bit_indexing_no_split(input, weight)) { return false; @@ -228,6 +248,9 @@ auto ConvParams::use_mkldnn(const at::Tensor& input, const at::Tensor& weight) c if (!at::globalContext().userEnabledMkldnn()) { return false; } + if (input.device().is_cpu() && input.scalar_type() == kBFloat16 && mkldnn_bf16_device_check()) { + return true; + } return (input.is_mkldnn()) || // input is mkldnn Tensor (input.device().is_cpu() && input.scalar_type() == kFloat && // only on CPU Float Tensors @@ -267,7 +290,7 @@ auto ConvParams::use_nnpack(const at::Tensor& input, const at::Tensor& weight) c auto ConvParams::use_xnnpack( const at::Tensor& input, const at::Tensor& weight, - const c10::optional bias_sizes_opt) const -> bool { + const at::OptionalIntArrayRef bias_sizes_opt) const -> bool { #if defined(C10_MOBILE) if (!transposed) { return (input.size(1) == groups) && @@ -629,6 +652,25 @@ static void check_input_same_type_as_parameters( check_input_same_type_as_parameters(input, weight, /*bias=*/ Tensor()); } +static void check_input_same_type_as_parameters( + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + const ConvBackend backend) { + if (backend == ConvBackend::Mkldnn) { + TORCH_CHECK(input.options().type_equal(weight.options()) + || (input.is_mkldnn() && weight.device().is_cpu() && weight.scalar_type() == kFloat), + "Input type (", input.toString(), ") and weight type (", weight.toString(), + ") should be the same or input should be a MKLDNN tensor and weight is a dense tensor"); + TORCH_CHECK(!bias.defined() || (input.options().type_equal(bias.options())) + || (input.is_mkldnn() && bias.device().is_cpu() && bias.scalar_type() == kFloat), + "Input type (", input.toString(), ") and bias type (", bias.toString(), + ") should be the same or input should be a MKLDNN tensor and bias is a dense tensor"); + } else { + check_input_same_type_as_parameters(input, weight, bias); + } +} + static auto view4d(const at::Tensor& tensor) -> at::Tensor { TORCH_CHECK(tensor.ndimension() == 3, "expected 3D tensor, got tensor with ", tensor.ndimension(), @@ -643,15 +685,97 @@ static auto view3d(const at::Tensor& tensor) -> at::Tensor { return tensor.squeeze(2); } - static at::Tensor subtensor(at::Tensor& tensor, int dim, int groups, int g) { if (!tensor.defined()) { return at::Tensor(); } + const auto memory_format = tensor.suggest_memory_format(); int64_t n = tensor.sizes()[dim] / groups; - return tensor.narrow(dim, n * g, n).contiguous(); + return tensor.narrow(dim, n * g, n).contiguous(memory_format); +} + +namespace { + +std::pair complex_to_real(const Tensor& inp) { + auto inp_view_as_complex = at::view_as_real(inp); + auto dim_i = inp_view_as_complex.dim() - 1; + auto i_r = inp_view_as_complex.select(dim_i, 0); + auto i_i = inp_view_as_complex.select(dim_i, 1); + return std::make_pair(i_r, i_i); +} + +at::Tensor complex_convolution( + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + IntArrayRef output_padding, + int64_t groups) { + check_input_same_type_as_parameters(input, weight, bias); + Tensor i_r, i_i, w_r, w_i; + 
std::tie(i_r, i_i) = complex_to_real(input.resolve_conj()); + std::tie(w_r, w_i) = complex_to_real(weight.resolve_conj()); + + // [NOTE] Complex Convolution + // conv(W, x, b) = conv(Wr, xr, br) - conv(Wi, xi, 0) + i(conv(Wi, xr, bi) + conv(Wr, xi, 0)) + // where W, x and b are all complex inputs. + // With Gauss Trick: + // a = conv(Wr, xr, br), + // b = conv(Wi, xi, 0), + // c = conv(Wr + Wi, xr + xi, bi + br) + // conv(W, x, b) = a - b + i(c - a - b) + Tensor a, b, c; + if (!bias.defined()) { + a = at::convolution(i_r, w_r, bias, stride, padding, dilation, false, output_padding, groups); + b = at::convolution(i_i, w_i, bias, stride, padding, dilation, false, output_padding, groups); + c = at::convolution(i_r + i_i, w_r + w_i, bias, stride, padding, dilation, false, output_padding, groups); + } else { + Tensor b_r, b_i; + std::tie(b_r, b_i) = complex_to_real(bias.resolve_conj()); + a = at::convolution(i_r, w_r, b_r, stride, padding, dilation, false, output_padding, groups); + b = at::convolution(i_i, w_i, Tensor(), stride, padding, dilation, false, output_padding, groups); + c = at::convolution(i_r + i_i, w_r + w_i, b_r + b_i, stride, padding, dilation, false, output_padding, groups); + } + + auto i = c10::Scalar(c10::complex(0, 1)); + return a - b + i * (c - a - b); } +at::Tensor complex_convolution_mode( + const at::Tensor& input, + const at::Tensor& weight, + const c10::optional& bias_opt, + at::IntArrayRef stride, + c10::string_view padding, + at::IntArrayRef dilation, + int64_t groups) { + auto bias = bias_opt.value_or(Tensor()); + check_input_same_type_as_parameters(input, weight, bias); + Tensor i_r, i_i, w_r, w_i; + std::tie(i_r, i_i) = complex_to_real(input.resolve_conj()); + std::tie(w_r, w_i) = complex_to_real(weight.resolve_conj()); + + // See [NOTE] Complex Convolution + Tensor a, b, c; + if (!bias.defined()) { + a = at::_convolution_mode(i_r, w_r, bias, stride, padding, dilation, groups); + b = at::_convolution_mode(i_i, w_i, bias, stride, padding, dilation, groups); + c = at::_convolution_mode(i_r + i_i, w_r + w_i, bias, stride, padding, dilation, groups); + } else { + Tensor b_r, b_i; + std::tie(b_r, b_i) = complex_to_real(bias.resolve_conj()); + a = at::_convolution_mode(i_r, w_r, b_r, stride, padding, dilation, groups); + b = at::_convolution_mode(i_i, w_i, Tensor(), stride, padding, dilation, groups); + c = at::_convolution_mode(i_r + i_i, w_r + w_i, b_r + b_i, stride, padding, dilation, groups); + } + + auto i = c10::Scalar(c10::complex(0, 1)); + return a - b + i * (c - a - b); +} + +} // namespace at::Tensor conv1d( const Tensor& input_, const Tensor& weight, const c10::optional& bias_opt, @@ -663,7 +787,12 @@ at::Tensor conv1d( Tensor input; bool is_batched; std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 1, "conv1d"); - auto output = at::convolution(input, weight, bias, stride, padding, dilation, false, {0}, groups); + Tensor output; + if (at::isComplexType(input_.scalar_type())) { + output = complex_convolution(input, weight, bias, stride, padding, dilation, {0}, groups); + } else { + output = at::convolution(input, weight, bias, stride, padding, dilation, false, {0}, groups); + } return is_batched ? 
output : output.squeeze(0); } @@ -677,7 +806,12 @@ at::Tensor conv2d( Tensor input; bool is_batched; std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 2, "conv2d"); - auto output = at::convolution(input, weight, bias, stride, padding, dilation, false, {{0, 0}}, groups); + Tensor output; + if (at::isComplexType(input_.scalar_type())) { + output = complex_convolution(input, weight, bias, stride, padding, dilation, {{0, 0}}, groups); + } else { + output = at::convolution(input, weight, bias, stride, padding, dilation, false, {{0, 0}}, groups); + } return is_batched ? output : output.squeeze(0); } @@ -691,7 +825,12 @@ at::Tensor conv3d( Tensor input; bool is_batched; std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 3, "conv3d"); - auto output = at::convolution(input, weight, bias, stride, padding, dilation, false, {{0, 0, 0}}, groups); + Tensor output; + if (at::isComplexType(input_.scalar_type())) { + output = complex_convolution(input, weight, bias, stride, padding, dilation, {{0, 0, 0}}, groups); + } else { + output = at::convolution(input, weight, bias, stride, padding, dilation, false, {{0, 0, 0}}, groups); + } return is_batched ? output : output.squeeze(0); } @@ -787,8 +926,12 @@ at::Tensor conv1d( Tensor input; bool is_batched; std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 1, "conv1d"); - auto output = at::_convolution_mode( - input, weight, bias, stride, std::move(padding), dilation, groups); + Tensor output; + if (at::isComplexType(input_.scalar_type())) { + output = complex_convolution_mode(input, weight, bias, stride, std::move(padding), dilation, groups); + } else { + output = at::_convolution_mode(input, weight, bias, stride, std::move(padding), dilation, groups); + } return is_batched ? output : output.squeeze(0); } @@ -799,8 +942,12 @@ at::Tensor conv2d( Tensor input; bool is_batched; std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 2, "conv2d"); - auto output = at::_convolution_mode( - input, weight, bias, stride, std::move(padding), dilation, groups); + Tensor output; + if (at::isComplexType(input_.scalar_type())) { + output = complex_convolution_mode(input, weight, bias, stride, std::move(padding), dilation, groups); + } else { + output = at::_convolution_mode(input, weight, bias, stride, std::move(padding), dilation, groups); + } return is_batched ? output : output.squeeze(0); } @@ -811,8 +958,12 @@ at::Tensor conv3d( Tensor input; bool is_batched; std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 3, "conv3d"); - auto output = at::_convolution_mode( - input, weight, bias, stride, std::move(padding), dilation, groups); + Tensor output; + if (at::isComplexType(input_.scalar_type())) { + output = complex_convolution_mode(input, weight, bias, stride, std::move(padding), dilation, groups); + } else { + output = at::_convolution_mode(input, weight, bias, stride, std::move(padding), dilation, groups); + } return is_batched ? 
output : output.squeeze(0); } @@ -933,7 +1084,7 @@ ConvBackend select_conv_backend( ConvBackend select_conv_backend( const Tensor& input, const Tensor& weight, - const c10::optional bias_sizes_opt, + const at::OptionalIntArrayRef bias_sizes_opt, const bool need_backward, const ConvParams& params) { @@ -1018,6 +1169,12 @@ ConvBackend select_conv_backend( // unsupported } } + } else if (params.use_mps(input, weight)) { + if (params.transposed) { + return ConvBackend::MpsTranspose; + } else { + return ConvBackend::Mps; + } } else { // Only reach here when input is backend with out-of-source implementation. return ConvBackend::Overrideable; @@ -1078,18 +1235,41 @@ static inline std::vector calc_output_size( static inline at::MemoryFormat determine_backend_memory_format( const Tensor& input, - const Tensor& weight) { + const Tensor& weight, + const ConvBackend backend) { at::MemoryFormat backend_memory_format = at::MemoryFormat::Contiguous; auto k = weight.ndimension(); #if !defined(C10_MOBILE) // See Note [Mobile check segfaults] - if (detail::getCUDAHooks().compiledWithCuDNN()) { - backend_memory_format = cudnn_conv_suggest_memory_format(input, weight); - } - if (detail::getCUDAHooks().compiledWithMIOpen() && miopen_conv_use_channels_last(input, weight)) { - TORCH_INTERNAL_ASSERT((k == 4 || k == 5), - "Expected 4D or 5D input for miopen memory format selection in determine_backend_memory_format()"); - backend_memory_format = (k == 5) ? at::MemoryFormat::Contiguous /*at::MemoryFormat::ChannelsLast3d*/ : at::MemoryFormat::ChannelsLast; + switch(backend) { + case ConvBackend::Cudnn: + case ConvBackend::CudnnTranspose: + if (detail::getCUDAHooks().compiledWithCuDNN()) { + backend_memory_format = cudnn_conv_suggest_memory_format(input, weight); + } + break; + case ConvBackend::Miopen: + case ConvBackend::MiopenDepthwise: + case ConvBackend::MiopenTranspose: + if (detail::getCUDAHooks().compiledWithMIOpen() && miopen_conv_use_channels_last(input, weight)) { + TORCH_INTERNAL_ASSERT((k == 4 || k == 5), + "Expected 4D or 5D input for miopen memory format selection in determine_backend_memory_format()"); + backend_memory_format = (k == 5) ? at::MemoryFormat::Contiguous /*at::MemoryFormat::ChannelsLast3d*/ : at::MemoryFormat::ChannelsLast; + } + break; + case ConvBackend::Mkldnn: + if (mkldnn_conv_use_channels_last(input, weight)) { + backend_memory_format = (k == 5) ? at::MemoryFormat::Contiguous /*at::MemoryFormat::ChannelsLast3d*/ : at::MemoryFormat::ChannelsLast; + } + break; + case ConvBackend::Slow2d: + case ConvBackend::SlowDilated2d: + if (thnn_conv_use_channels_last(input, weight)) { + backend_memory_format = at::MemoryFormat::ChannelsLast; + } + break; + default: + backend_memory_format = at::MemoryFormat::Contiguous; } #endif return backend_memory_format; @@ -1142,7 +1322,7 @@ at::Tensor _convolution( bool need_backward = GradMode::is_enabled() && (input.requires_grad() || weight.requires_grad() || (bias.defined() && bias.requires_grad())); ConvBackend backend = select_conv_backend(input, weight, bias_sizes_opt, need_backward, params); - at::MemoryFormat backend_memory_format = determine_backend_memory_format(input, weight); + at::MemoryFormat backend_memory_format = determine_backend_memory_format(input, weight, backend); // Call the backend. 
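The complex conv1d/conv2d/conv3d paths added above route through complex_convolution, whose [NOTE] Complex Convolution describes the Gauss trick (three real convolutions instead of four). Because a real convolution is bilinear in weight and input and linear in the bias, the identity can be sanity-checked with plain scalar arithmetic; the sketch below is only that algebraic check with made-up values, not the ATen code.

#include <cassert>
#include <complex>
#include <cstdio>

int main() {
  using C = std::complex<double>;
  // Arbitrary example values standing in for W, x and b.
  C W(1.5, -2.0), x(0.75, 3.0), b(-0.5, 1.25);
  double Wr = W.real(), Wi = W.imag();
  double xr = x.real(), xi = x.imag();
  double br = b.real(), bi = b.imag();

  // Gauss trick from the note: a = conv(Wr, xr, br), b = conv(Wi, xi, 0),
  // c = conv(Wr + Wi, xr + xi, br + bi); result = a - b + i(c - a - b).
  double a = Wr * xr + br;
  double g = Wi * xi;  // the note's "b"; renamed to avoid clashing with the bias
  double c = (Wr + Wi) * (xr + xi) + (br + bi);
  C gauss(a - g, c - a - g);

  C direct = W * x + b;  // reference complex result
  std::printf("gauss  = (%g, %g)\n", gauss.real(), gauss.imag());
  std::printf("direct = (%g, %g)\n", direct.real(), direct.imag());
  assert(std::abs(gauss - direct) < 1e-12);
  return 0;
}
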
Tensor output; @@ -1203,18 +1383,11 @@ at::Tensor _convolution( break; case ConvBackend::Mkldnn: #if AT_MKLDNN_ENABLED() - TORCH_CHECK(input.options().type_equal(weight.options()) - || (input.is_mkldnn() && weight.device().is_cpu() && weight.scalar_type() == kFloat), - "Input type (", input.toString(), ") and weight type (", weight.toString(), - ") should be the same or input should be a MKLDNN tensor and weight is a dense tensor"); - TORCH_CHECK(!bias.defined() || (input.options().type_equal(bias.options())) - || (input.is_mkldnn() && bias.device().is_cpu() && bias.scalar_type() == kFloat), - "Input type (", input.toString(), ") and bias type (", bias.toString(), - ") should be the same or input should be a MKLDNN tensor and bias is a dense tensor"); + check_input_same_type_as_parameters(input, weight, bias, backend); if (!input.is_mkldnn()) { // need to ensure contiguous for non-mkldnn tensors - input = input.contiguous(); - weight = weight.contiguous(); + input = input.contiguous(backend_memory_format); + weight = weight.contiguous(backend_memory_format); bias = bias.defined() ? bias.contiguous() : bias; } output = at::mkldnn_convolution( @@ -1255,11 +1428,12 @@ at::Tensor _convolution( case ConvBackend::SlowDilated3d: case ConvBackend::SlowTranspose2d: case ConvBackend::SlowTranspose3d: + input = input.contiguous(backend_memory_format); + weight = weight.contiguous(backend_memory_format); if (params.groups == 1) { - output = _convolution_nogroup_backend(input.contiguous(), weight, bias, backend, params); + output = _convolution_nogroup_backend(input, weight, bias, backend, params); } else { std::vector outputs(params.groups); - input = input.contiguous(); for (const auto g : c10::irange(params.groups)) { auto input_g = subtensor(input, 1, params.groups, g); auto weight_g = subtensor(weight, 0, params.groups, g); @@ -1269,6 +1443,41 @@ at::Tensor _convolution( output = at::cat(outputs, 1); } break; + case ConvBackend::Mps: +#ifdef USE_MPS + TORCH_CHECK(input.options().type_equal(weight.options()), + "Input type (", input.toString(), ") and weight type (", weight.toString(), + ") should be the same"); + TORCH_CHECK(!bias.defined() || (input.options().type_equal(bias.options())), + "Input type (", input.toString(), ") and bias type (", bias.toString(), + ") should be the same"); + + output = at::_mps_convolution(input.contiguous(), weight, bias.defined() ? 
bias.contiguous() : bias, + params.padding, params.stride, params.dilation, + params.groups); +#else + TORCH_INTERNAL_ASSERT(false, "MPS backend was selected in PyTorch without support"); +#endif + break; + case ConvBackend::MpsTranspose: +#ifdef USE_MPS + TORCH_CHECK(input.options().type_equal(weight.options()), + "Input type (", input.toString(), ") and weight type (", weight.toString(), + ") should be the same"); + TORCH_CHECK(!bias.defined() || (input.options().type_equal(bias.options())), + "Input type (", input.toString(), ") and bias type (", bias.toString(), + ") should be the same"); + output = at::_mps_convolution_transpose( + input.contiguous(backend_memory_format), weight, + params.padding, params.output_padding, + params.stride, params.dilation, params.groups); + if (bias.defined()) { + output.add_(reshape_bias(input.dim(), bias)); + } +#else + TORCH_INTERNAL_ASSERT(false, "MPS backend was selected in PyTorch without support"); +#endif + break; } if (k == 3 && !input.is_mkldnn()) { @@ -1565,7 +1774,7 @@ std::tuple _convolution_backward_nogroup_bac // output_mask: 3-dim boolean array specifying which gradients to compute in input, weight, bias order std::tuple convolution_backward( const Tensor& grad_output_, const Tensor& input_, const Tensor& weight_, - const c10::optional bias_sizes_opt, + const at::OptionalIntArrayRef bias_sizes_opt, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool transposed, IntArrayRef output_padding, int64_t groups, std::array output_mask) { auto grad_output = grad_output_; @@ -1617,7 +1826,7 @@ std::tuple convolution_backward( // Select appropriate backend to use. ConvBackend backend = select_conv_backend(input, weight, bias_sizes_opt, /*need_backward=*/ true, params); - at::MemoryFormat backend_memory_format = determine_backend_memory_format(input, weight); + at::MemoryFormat backend_memory_format = determine_backend_memory_format(input, weight, backend); // Call the backend. Tensor backend_grad_input, backend_grad_weight, backend_grad_bias; @@ -1651,6 +1860,33 @@ std::tuple convolution_backward( input_weight_output_mask); break; } + case ConvBackend::Mps: + { +#ifdef USE_MPS + check_input_same_type_as_parameters(input, weight); + std::tie(backend_grad_input, backend_grad_weight, backend_grad_bias) = + at::mps_convolution_backward(input, grad_output, weight, params.padding, + params.stride, params.dilation, params.groups, output_mask); +#else + TORCH_INTERNAL_ASSERT(false, "MPS backend was selected in PyTorch without support"); +#endif + break; + } + case ConvBackend::MpsTranspose: + { +#ifdef USE_MPS + check_input_same_type_as_parameters(input, weight); + std::array input_weight_output_mask = {output_mask[0], output_mask[1]}; + std::tie(backend_grad_input, backend_grad_weight) = at::mps_convolution_transpose_backward( + // Only make input contiguous when it is necessary for the backwards computation + output_mask[1] ? 
input.contiguous(backend_memory_format) : input, + grad_output, weight, params.padding, params.output_padding, + params.stride, params.dilation, params.groups, input_weight_output_mask); +#else + TORCH_INTERNAL_ASSERT(false, "MPS backend was selected in PyTorch without support"); +#endif + break; + } case ConvBackend::CudnnTranspose: { check_input_same_type_as_parameters(input, weight); @@ -1725,8 +1961,8 @@ std::tuple convolution_backward( TORCH_CHECK(!weight.is_mkldnn(), "The MKLDNN backend does not support weight as an MKLDNN tensor during training"); if (!input.is_mkldnn()) { - input = input.contiguous(); - weight = weight.contiguous(); + input = input.contiguous(backend_memory_format); + weight = weight.contiguous(backend_memory_format); } std::tie(backend_grad_input, backend_grad_weight, backend_grad_bias) = mkldnn_convolution_backward_stub(input.device().type(), input, grad_output, weight, params.padding, @@ -1753,7 +1989,8 @@ std::tuple convolution_backward( case ConvBackend::SlowTranspose2d: case ConvBackend::SlowTranspose3d: { - input = input.contiguous(); + input = input.contiguous(backend_memory_format); + weight = weight.contiguous(backend_memory_format); if (params.groups == 1) { std::tie(backend_grad_input, backend_grad_weight, backend_grad_bias) = _convolution_backward_nogroup_backend( diff --git a/aten/src/ATen/native/ConvolutionMM2d.cpp b/aten/src/ATen/native/ConvolutionMM2d.cpp index 30fb04b13615..1837a0d838ea 100644 --- a/aten/src/ATen/native/ConvolutionMM2d.cpp +++ b/aten/src/ATen/native/ConvolutionMM2d.cpp @@ -26,26 +26,31 @@ static Tensor compute_columns2d( const int64_t pad_width = padding[1]; const int64_t stride_height = stride[0]; const int64_t stride_width = stride[1]; - const int64_t dim_planes = 1; - const int64_t dim_height = 2; - const int64_t dim_width = 3; - const int64_t n_input_plane = input.size(dim_planes); - const int64_t input_height = input.size(dim_height); - const int64_t input_width = input.size(dim_width); - const int64_t output_height = - (input_height + 2 * pad_height - kernel_height) / stride_height + 1; - const int64_t output_width = - (input_width + 2 * pad_width - kernel_width) / stride_width + 1; const int64_t batch_size = input.size(0); + const int64_t n_input_plane = input.size(1); + const int64_t input_height = input.size(2); + const int64_t input_width = input.size(3); + const int64_t output_height = (input_height + 2 * pad_height - kernel_height) / stride_height + 1; + const int64_t output_width = (input_width + 2 * pad_width - kernel_width) / stride_width + 1; + + bool is_channels_last = input.suggest_memory_format() == at::MemoryFormat::ChannelsLast; Tensor columns; if ((kernel_height == 1) && (stride_height == 1) && (pad_height == 0) && (kernel_width == 1) && (stride_width == 1) && (pad_width == 0)) { // Columns are just a view on the input for the 1x1 kernel special case. - columns = input.view({batch_size, n_input_plane, output_height * output_width}).detach(); + if (is_channels_last) { + columns = input.as_strided({batch_size, output_height * output_width, n_input_plane}, + {output_height * output_width * n_input_plane, n_input_plane, 1}).detach(); + } else { + columns = input.view({batch_size, n_input_plane, output_height * output_width}).detach(); + } } else { - columns = at::empty({batch_size, n_input_plane * kernel_height * kernel_width, - output_height * output_width}, input.options()); + int64_t row = is_channels_last ? 
+ output_height * output_width : n_input_plane * kernel_height * kernel_width; + int64_t col = is_channels_last ? + kernel_height * kernel_width * n_input_plane : output_height * output_width; + columns = at::empty({batch_size, row, col}, input.options()); AT_DISPATCH_ALL_TYPES_AND(kBFloat16, input.scalar_type(), "slow_conv2d_cpu", [&]{ auto input_a = input.accessor(); auto columns_a = columns.accessor(); @@ -69,7 +74,8 @@ static Tensor compute_columns2d( input_height, input_width, output_height, - output_width); + output_width, + is_channels_last); } }); }); @@ -189,12 +195,15 @@ static inline void slow_conv2d_shape_check( } } -static Tensor view_weight_2d(const Tensor& weight_) { - Tensor weight = weight_.contiguous(); +static inline Tensor view_weight_2d(const Tensor& weight_, + at::MemoryFormat memory_format = at::MemoryFormat::Contiguous) { + Tensor weight = weight_.contiguous(memory_format); if (weight.dim() == 4) { const int64_t s1 = weight.size(0); const int64_t s2 = weight.size(1) * weight.size(2) * weight.size(3); - return weight.view({s1, s2}); + return memory_format == at::MemoryFormat::ChannelsLast + ? weight.as_strided({s1, s2}, {s2, 1}) // CL: view as {oc, kh*kw*ic} + : weight.view({s1, s2}); // CF: view as {oc, ic*kh*kw} } else { return weight; } @@ -218,29 +227,50 @@ static void slow_conv2d_update_output_frame( int64_t input_width, int64_t n_output_plane, int64_t output_height, - int64_t output_width) { + int64_t output_width, + bool is_channels_last) { const int beta = has_bias ? 1 : 0; // Compute out = weight * input // Note gemm expects fortran order, so all 3 matrices are transposed. // Swapping argument order cancels this, since C == AB <=> T(C) == T(B)T(A) - const int64_t m = output_height * output_width; - const int64_t n = n_output_plane; - const int64_t k = n_input_plane * kernel_height * kernel_width; - - const int64_t lda = m; - const int64_t ldb = k; - const int64_t ldc = m; - - at::native::cpublas::gemm( - TransposeType::NoTranspose, - TransposeType::NoTranspose, - m, n, k, - static_cast(1), - finput.data(), lda, - weight.data(), ldb, - static_cast(beta), - output.data(), ldc); + if (is_channels_last) { + const int64_t m = n_output_plane; + const int64_t n = output_height * output_width; + const int64_t k = n_input_plane * kernel_height * kernel_width; + + const int64_t lda = k; + const int64_t ldb = k; + const int64_t ldc = m; + + at::native::cpublas::gemm( + TransposeType::Transpose, + TransposeType::NoTranspose, + m, n, k, + static_cast(1), + weight.data(), lda, + finput.data(), ldb, + static_cast(beta), + output.data(), ldc); + } else { + const int64_t m = output_height * output_width; + const int64_t n = n_output_plane; + const int64_t k = n_input_plane * kernel_height * kernel_width; + + const int64_t lda = m; + const int64_t ldb = k; + const int64_t ldc = m; + + at::native::cpublas::gemm( + TransposeType::NoTranspose, + TransposeType::NoTranspose, + m, n, k, + static_cast(1), + finput.data(), lda, + weight.data(), ldb, + static_cast(beta), + output.data(), ldc); + } } template @@ -254,27 +284,48 @@ void slow_conv2d_backward_update_grad_input_frame( int64_t stride_height, int64_t stride_width, int64_t pad_height, - int64_t pad_width) { + int64_t pad_width, + bool is_channels_last) { // Compute fgrad_input = weight.T * grad_output.reshape({grad_output.shape(0), -1}) // Note gemm expects fortran order, so all 3 matrices are transposed. 
// Swapping argument order cancels this, since C == AB <=> T(C) == T(B)T(A) - const int64_t m = grad_output.size(1) * grad_output.size(2); - const int64_t n = weight.size(1); - const int64_t k = weight.size(0); - - const int64_t lda = m; - const int64_t ldb = n; - const int64_t ldc = m; - - at::native::cpublas::gemm( - TransposeType::NoTranspose, - TransposeType::Transpose, - m, n, k, - static_cast(1), - grad_output.data(), lda, - weight.data(), ldb, - static_cast(0), - fgrad_input, ldc); + if (is_channels_last) { + const int64_t m = weight.size(1); + const int64_t n = grad_output.size(1) * grad_output.size(2); + const int64_t k = weight.size(0); + + const int64_t lda = m; + const int64_t ldb = k; + const int64_t ldc = m; + + at::native::cpublas::gemm( + TransposeType::NoTranspose, + TransposeType::NoTranspose, + m, n, k, + static_cast(1), + weight.data(), lda, + grad_output.data(), ldb, + static_cast(0), + fgrad_input, ldc); + } else { + const int64_t m = grad_output.size(1) * grad_output.size(2); + const int64_t n = weight.size(1); + const int64_t k = weight.size(0); + + const int64_t lda = m; + const int64_t ldb = n; + const int64_t ldc = m; + + at::native::cpublas::gemm( + TransposeType::NoTranspose, + TransposeType::Transpose, + m, n, k, + static_cast(1), + grad_output.data(), lda, + weight.data(), ldb, + static_cast(0), + fgrad_input, ldc); + } unfolded2d_acc_stub( kCPU, @@ -291,7 +342,8 @@ void slow_conv2d_backward_update_grad_input_frame( grad_input.size(1), grad_input.size(2), grad_output.size(1), - grad_output.size(2)); + grad_output.size(2), + is_channels_last); } void slow_conv2d_backward_out_cpu_template( @@ -309,7 +361,10 @@ void slow_conv2d_backward_out_cpu_template( const int64_t stride_height = stride[0]; const int64_t stride_width = stride[1]; - const Tensor weight = view_weight_2d(weight_); + bool use_channels_last = thnn_conv_use_channels_last(input_, weight_); + auto memory_format = use_channels_last ? at::MemoryFormat::ChannelsLast : at::MemoryFormat::Contiguous; + + const Tensor weight = view_weight_2d(weight_, memory_format); slow_conv2d_shape_check( input_, grad_output_, @@ -323,27 +378,21 @@ void slow_conv2d_backward_out_cpu_template( pad_width, false); - const Tensor input = input_.contiguous(); + const Tensor input = input_.contiguous(memory_format); // Compute shape of columnized data excluding batch dim. 
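The gemm calls in these frames all lean on the identity stated in the comments: a column-major (Fortran-order) gemm sees every row-major buffer as its transpose, and C = AB <=> T(C) = T(B)T(A), so swapping the operand order produces the row-major product without any copies. Below is a small standalone check of that identity against a naive column-major gemm; it is not cpublas::gemm, just a toy with illustrative names.

#include <cassert>
#include <vector>

// Naive column-major gemm: C(MxN) = A(MxK) * B(KxN), element (i,j) at i + j*ld.
void gemm_colmajor(int M, int N, int K,
                   const double* A, int lda,
                   const double* B, int ldb,
                   double* C, int ldc) {
  for (int j = 0; j < N; ++j)
    for (int i = 0; i < M; ++i) {
      double acc = 0;
      for (int k = 0; k < K; ++k)
        acc += A[i + k * lda] * B[k + j * ldb];
      C[i + j * ldc] = acc;
    }
}

int main() {
  const int m = 2, k = 3, n = 4;
  // Row-major A (m x k) and B (k x n), element (i,j) at i*cols + j.
  std::vector<double> A = {1, 2, 3,
                           4, 5, 6};
  std::vector<double> B = {1, 0, 2, -1,
                           3, 1, 0,  2,
                          -2, 4, 1,  0};
  std::vector<double> C(m * n), ref(m * n);

  // Reference: direct row-major triple loop.
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j)
      for (int p = 0; p < k; ++p)
        ref[i * n + j] += A[i * k + p] * B[p * n + j];

  // Row-major buffers reinterpreted as column-major are transposed, so compute
  // T(C) = T(B) * T(A) by swapping the operands; reading the result buffer back
  // as row-major gives exactly C = A * B.
  gemm_colmajor(/*M=*/n, /*N=*/m, /*K=*/k,
                B.data(), /*lda=*/n,
                A.data(), /*ldb=*/k,
                C.data(), /*ldc=*/n);

  for (int i = 0; i < m * n; ++i)
    assert(C[i] == ref[i]);
  return 0;
}
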
- const int64_t dim_planes = 1; - const int64_t dim_height = 2; - const int64_t dim_width = 3; - const int64_t n_input_plane = input.size(dim_planes); - const int64_t input_height = input.size(dim_height); - const int64_t input_width = input.size(dim_width); - const int64_t output_height = - (input_height + 2 * pad_height - kernel_height) / stride_height + 1; - const int64_t output_width = - (input_width + 2 * pad_width - kernel_width) / stride_width + 1; - const int64_t fgrad_input_size = - n_input_plane * kernel_height * kernel_width * output_height * output_width; - - const Tensor grad_output = grad_output_.contiguous(); - grad_input.resize_as_(input); - grad_input.zero_(); - TORCH_CHECK(grad_input.is_contiguous(), "slow_conv2d: grad_input must be contiguous"); const int64_t batch_size = input.size(0); + const int64_t n_input_plane = input.size(1); + const int64_t input_height = input.size(2); + const int64_t input_width = input.size(3); + const int64_t output_height = (input_height + 2 * pad_height - kernel_height) / stride_height + 1; + const int64_t output_width = (input_width + 2 * pad_width - kernel_width) / stride_width + 1; + const int64_t fgrad_input_size = n_input_plane * kernel_height * kernel_width * output_height * output_width; + + const Tensor grad_output = grad_output_.contiguous(memory_format); + grad_input.resize_as_(input, memory_format); + grad_input.zero_(); + TORCH_CHECK(grad_input.is_contiguous(memory_format), "slow_conv2d: grad_input must be contiguous"); AT_DISPATCH_FLOATING_TYPES_AND( kBFloat16, input.scalar_type(), "slow_conv2d_cpu_grad_input", [&] { @@ -366,7 +415,8 @@ void slow_conv2d_backward_out_cpu_template( stride_height, stride_width, pad_height, - pad_width); + pad_width, + use_channels_last); } }); }); @@ -376,27 +426,48 @@ template void slow_conv2d_backward_weight_frame( TensorAccessor grad_weight, TensorAccessor grad_output, - TensorAccessor finput) { + TensorAccessor finput, + bool is_channels_last) { // Compute grad_weight += grad_output.reshape({grad_output.shape(0), -1}) * finput.T // Note gemm expects fortran order, so all 3 matrices are transposed. 
// Swapping argument order cancels this, since C == AB <=> T(C) == T(B)T(A) - const int64_t m = finput.size(0); - const int64_t n = grad_output.size(0); - const int64_t k = grad_output.size(1) * grad_output.size(2); - - const int64_t lda = k; - const int64_t ldb = k; - const int64_t ldc = m; - - at::native::cpublas::gemm( - TransposeType::Transpose, - TransposeType::NoTranspose, - m, n, k, - static_cast(1), - finput.data(), lda, - grad_output.data(), ldb, - static_cast(1), - grad_weight.data(), ldc); + if (is_channels_last) { + const int64_t m = finput.size(1); + const int64_t n = grad_output.size(0); + const int64_t k = grad_output.size(1) * grad_output.size(2); + + const int64_t lda = m; + const int64_t ldb = n; + const int64_t ldc = m; + + at::native::cpublas::gemm( + TransposeType::NoTranspose, + TransposeType::Transpose, + m, n, k, + static_cast(1), + finput.data(), lda, + grad_output.data(), ldb, + static_cast(1), + grad_weight.data(), ldc); + } else { + const int64_t m = finput.size(0); + const int64_t n = grad_output.size(0); + const int64_t k = grad_output.size(1) * grad_output.size(2); + + const int64_t lda = k; + const int64_t ldb = k; + const int64_t ldc = m; + + at::native::cpublas::gemm( + TransposeType::Transpose, + TransposeType::NoTranspose, + m, n, k, + static_cast(1), + finput.data(), lda, + grad_output.data(), ldb, + static_cast(1), + grad_weight.data(), ldc); + } } static void slow_conv2d_backward_weight_out_cpu_template( @@ -406,9 +477,6 @@ static void slow_conv2d_backward_weight_out_cpu_template( IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding) { - CheckedFrom c = "slow_conv2d_backward_parameters_cpu"; - auto grad_weight_arg = TensorArg(grad_weight, "grad_weight_arg", 0); - const int64_t kernel_height = kernel_size[0]; const int64_t kernel_width = kernel_size[1]; const int64_t pad_height = padding[0]; @@ -416,9 +484,11 @@ static void slow_conv2d_backward_weight_out_cpu_template( const int64_t stride_height = stride[0]; const int64_t stride_width = stride[1]; - Tensor grad_weight_2d; - checkContiguous(c, grad_weight_arg); - grad_weight_2d = view_weight_2d(grad_weight); + bool use_channels_last = thnn_conv_use_channels_last(input, grad_weight); + auto memory_format = use_channels_last ? at::MemoryFormat::ChannelsLast : at::MemoryFormat::Contiguous; + + TORCH_CHECK(grad_weight.is_contiguous(memory_format), "slow_conv2d: grad_weight must be contiguous"); + Tensor grad_weight_2d = view_weight_2d(grad_weight, memory_format); slow_conv2d_shape_check( input, @@ -433,7 +503,7 @@ static void slow_conv2d_backward_weight_out_cpu_template( pad_width, true); - auto grad_output = grad_output_.contiguous(); + auto grad_output = grad_output_.contiguous(memory_format); Tensor finput = compute_columns2d(input, padding, stride, kernel_size); const int64_t batch_size = input.size(0); @@ -449,7 +519,7 @@ static void slow_conv2d_backward_weight_out_cpu_template( auto finput_t = finput_a[t]; slow_conv2d_backward_weight_frame( - grad_weight_2d_a, grad_output_t, finput_t); + grad_weight_2d_a, grad_output_t, finput_t, use_channels_last); } }); } @@ -474,7 +544,10 @@ Tensor& slow_conv2d_forward_out_cpu( const int64_t stride_height = stride[0]; const int64_t stride_width = stride[1]; - const Tensor weight_2d = view_weight_2d(weight_); + bool use_channels_last = thnn_conv_use_channels_last(self, weight_); + auto memory_format = use_channels_last ? 
at::MemoryFormat::ChannelsLast : at::MemoryFormat::Contiguous; + + const Tensor weight_2d = view_weight_2d(weight_, memory_format); slow_conv2d_shape_check( self, @@ -489,28 +562,21 @@ Tensor& slow_conv2d_forward_out_cpu( pad_width, false); - const Tensor input = self.contiguous(); - const int64_t dim_planes = 1; - const int64_t dim_height = 2; - const int64_t dim_width = 3; - - const int64_t n_input_plane = input.size(dim_planes); - const int64_t input_height = input.size(dim_height); - const int64_t input_width = input.size(dim_width); - const int64_t n_output_plane = weight_2d.size(0); - const int64_t output_height = - (input_height + 2 * pad_height - kernel_height) / stride_height + 1; - const int64_t output_width = - (input_width + 2 * pad_width - kernel_width) / stride_width + 1; - + const Tensor input = self.contiguous(memory_format); const int64_t batch_size = input.size(0); + const int64_t n_input_plane = input.size(1); + const int64_t input_height = input.size(2); + const int64_t input_width = input.size(3); + const int64_t n_output_plane = weight_2d.size(0); + const int64_t output_height = (input_height + 2 * pad_height - kernel_height) / stride_height + 1; + const int64_t output_width = (input_width + 2 * pad_width - kernel_width) / stride_width + 1; Tensor finput = compute_columns2d(input, padding, stride, kernel_size); - output.resize_({batch_size, n_output_plane, output_height, output_width}); + output.resize_({batch_size, n_output_plane, output_height, output_width}, memory_format); if (bias.defined()) { output.copy_(bias.reshape({-1, 1, 1})); } - TORCH_CHECK(output.is_contiguous(), "slow_conv2d output tensor must be contiguous"); + TORCH_CHECK(output.is_contiguous(memory_format), "slow_conv2d output tensor must be contiguous"); AT_DISPATCH_ALL_TYPES_AND(kBFloat16, input.scalar_type(), "slow_conv2d_cpu", [&]{ auto input_a = input.accessor(); @@ -540,7 +606,8 @@ Tensor& slow_conv2d_forward_out_cpu( input_width, n_output_plane, output_height, - output_width); + output_width, + use_channels_last); } }); }); @@ -596,9 +663,8 @@ std::tuple slow_conv2d_backward_out_cpu( at::sum_out(grad_bias, grad_output, IntArrayRef{0, 2, 3}); } - if (grad_weight.defined()) { - grad_weight.resize_(weight.sizes()); + grad_weight.resize_(weight.sizes(), weight.suggest_memory_format()); grad_weight.zero_(); slow_conv2d_backward_weight_out_cpu_template( grad_weight, diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp index caf2dfe7773f..46c0d48d8a7b 100644 --- a/aten/src/ATen/native/Copy.cpp +++ b/aten/src/ATen/native/Copy.cpp @@ -2,9 +2,11 @@ #include #include +#include #include #include #include +#include #include #include #include @@ -52,7 +54,7 @@ void copy_same_type_transpose_(Tensor& self, const Tensor& src) { // The code below is implemented with the assumption that sizes are equal TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self.sizes().equals(src.sizes())); - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, kBFloat16, self.scalar_type(), "copy_", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(kHalf, kBool, kBFloat16, kComplexHalf, self.scalar_type(), "copy_", [&] { scalar_t* sp = src.data_ptr(); scalar_t* rp = self.data_ptr(); scalar_t* bp = buf.data_ptr(); @@ -97,7 +99,7 @@ void copy_same_type_transpose_(Tensor& self, const Tensor& src) { // (e.g. XLA) may be supported by overriding copy_ and _copy_from. 
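The channels-last handling added to compute_columns2d, view_weight_2d and the slow_conv2d frames above comes down to which dimension is innermost in memory: contiguous NCHW keeps spatial width innermost, while ChannelsLast (NHWC) keeps the channel innermost, which is why the columns buffer and the 2-D weight view swap their row/column roles. A minimal sketch of the two offset layouts, independent of ATen (the helper names are illustrative):

#include <cassert>
#include <cstdint>

// Flat offset of element (n, c, h, w) in a contiguous (NCHW) buffer.
int64_t offset_contiguous(int64_t n, int64_t c, int64_t h, int64_t w,
                          int64_t C, int64_t H, int64_t W) {
  return ((n * C + c) * H + h) * W + w;
}

// Flat offset of the same element in a channels-last (NHWC) buffer.
int64_t offset_channels_last(int64_t n, int64_t c, int64_t h, int64_t w,
                             int64_t C, int64_t H, int64_t W) {
  return ((n * H + h) * W + w) * C + c;
}

int main() {
  const int64_t C = 8, H = 5, W = 5;
  // Stepping along w is the unit stride for NCHW ...
  assert(offset_contiguous(0, 3, 2, 1, C, H, W) + 1 ==
         offset_contiguous(0, 3, 2, 2, C, H, W));
  // ... while stepping along c is the unit stride for channels-last, which is
  // what lets the 1x1 / channels-last paths treat each spatial position as one
  // contiguous run of C values.
  assert(offset_channels_last(0, 3, 2, 1, C, H, W) + 1 ==
         offset_channels_last(0, 4, 2, 1, C, H, W));
  return 0;
}
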
bool is_supported_device(Device device) { DeviceType device_type = device.type(); - return device_type == kCPU || device_type == kCUDA || device_type == kHIP || device_type == kVulkan || device_type == kMetal; + return device_type == kCPU || device_type == kCUDA || device_type == kHIP || device_type == kVulkan || device_type == kMetal || device_type == kMPS; } } // namespace @@ -184,7 +186,7 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking) } if (self.is_quantized() && !src.is_quantized()) { - return quantized_copy_from_float_cpu_(self, src); + return quantized_copy_from_float_(self, src); } if (self.is_quantized() && src.is_quantized()) { @@ -210,6 +212,7 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking) return at::metal::metal_copy_(self, src); } + auto iter = TensorIteratorConfig() .add_output(self) .add_input(src) @@ -227,6 +230,8 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking) device_type = kCUDA; } else if (iter.device_type(1) == kHIP) { device_type = kHIP; + } else if (iter.device_type(1) == kMPS) { + device_type = kMPS; } // TODO: if we need to, we can also enable this path for quantized tensor @@ -235,6 +240,12 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking) return self; } +#ifdef USE_MPS + if (self.device().type() == at::kMPS || src.device().type() == at::kMPS) { + return at::native::mps::mps_copy_(self, src, non_blocking); + } +#endif + if(!self.is_complex() && src.is_complex()) { TORCH_WARN_ONCE("Casting complex values to real discards the imaginary part"); } @@ -242,6 +253,24 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking) return self; } +Tensor copy(const Tensor& self, const Tensor& src, bool non_blocking) { + // copy() is the "functional" form of copy_(). It exists so we can properly functionalize copy_(), but: + // (1) It isn't exposed to the frontend (no python bindings) + // (2) It isn't exposed to the backend (it's a composite, that decomposes into to() and expand_as() calls. + // Note: This implementation doesn't currently preserve the strides of `self`. + // That might be fine for functorch (which already doesn't preserve strides in vmap), + // but it's worth looking into whether or not this implementation will be problematic for LazyTensor/XLA. + auto intermediate = src.to(self, non_blocking); + // Unfortunately, copy()'s decomposition involves view ops. + // To preserve the functionalization pass semantics of "maybe reapply views", + // we need to manually do that here. + if (at::functionalization::impl::getFunctionalizationReapplyViewsTLS()) { + return intermediate.expand(self.sizes()); + } else { + return at::expand_copy(intermediate, self.sizes()); + } +} + Tensor& copy_(Tensor& self, const Tensor& src, bool non_blocking) { auto maybe_outnames = namedinference::compute_broadcast_outnames(self, src); { @@ -258,7 +287,7 @@ Tensor& copy_(Tensor& self, const Tensor& src, bool non_blocking) { return self; } -void copy_ignoring_overlaps(const Tensor &dst, const Tensor &src) { +void copy_ignoring_overlaps(const TensorBase &dst, const TensorBase &src) { // Called when we are copying into an overlapping index `dst`, but we don't // care which writer wins. Hacky but it works. This is only used by // CUDA_tensor_apply2 in case that there are write overlaps. 
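Several of the rewritten ops further down (cosine_similarity and the tensor/tensor overloads of normal) size their result by broadcasting the two input shapes via at::infer_size. As a rough standalone sketch of that broadcasting rule (align trailing dimensions; a size of 1 stretches; anything else must match), not ATen's implementation:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <stdexcept>
#include <vector>

// Illustrative restatement of the usual broadcast rule used by infer_size.
std::vector<int64_t> broadcast_shapes(const std::vector<int64_t>& a,
                                      const std::vector<int64_t>& b) {
  const size_t ndim = std::max(a.size(), b.size());
  std::vector<int64_t> out(ndim);
  for (size_t i = 0; i < ndim; ++i) {
    // Align from the trailing dimension; missing leading dims act like 1.
    int64_t da = i < a.size() ? a[a.size() - 1 - i] : 1;
    int64_t db = i < b.size() ? b[b.size() - 1 - i] : 1;
    if (da != db && da != 1 && db != 1)
      throw std::invalid_argument("shapes are not broadcastable");
    out[ndim - 1 - i] = std::max(da, db);
  }
  return out;
}

int main() {
  // e.g. a (3, 1) mean with a (4,) std broadcasts to a (3, 4) result, which is
  // the shape the rewritten normal(Tensor, Tensor) overloads resize to.
  auto shape = broadcast_shapes({3, 1}, {4});
  assert((shape == std::vector<int64_t>{3, 4}));
  return 0;
}
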
diff --git a/aten/src/ATen/native/Copy.h b/aten/src/ATen/native/Copy.h index 6f688a73e84c..14abb32fa5ad 100644 --- a/aten/src/ATen/native/Copy.h +++ b/aten/src/ATen/native/Copy.h @@ -6,6 +6,7 @@ namespace at { class Tensor; struct TensorIterator; +class TensorBase; namespace native { @@ -13,7 +14,7 @@ using copy_fn = void (*)(TensorIterator&, bool non_blocking); DECLARE_DISPATCH(copy_fn, copy_stub); -TORCH_API void copy_ignoring_overlaps(const Tensor &dst, const Tensor &src); +TORCH_API void copy_ignoring_overlaps(const TensorBase &dst, const TensorBase &src); } // namespace native } // namespace at diff --git a/aten/src/ATen/native/Cross.h b/aten/src/ATen/native/Cross.h index 30001fc6b8a2..9daee7f2d6c4 100644 --- a/aten/src/ATen/native/Cross.h +++ b/aten/src/ATen/native/Cross.h @@ -1,9 +1,11 @@ #pragma once -#include #include -namespace at { namespace native { +namespace at { +class Tensor; + +namespace native { using cross_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const int64_t d); diff --git a/aten/src/ATen/native/DilatedConvolutionUtils.h b/aten/src/ATen/native/DilatedConvolutionUtils.h index 2d4815799b10..51b30a9bc77a 100644 --- a/aten/src/ATen/native/DilatedConvolutionUtils.h +++ b/aten/src/ATen/native/DilatedConvolutionUtils.h @@ -4,7 +4,7 @@ #include #include -#include +#include #include #define TORCH_CHECK_DIM_SIZE(T, DIM, DIM_SIZE, SIZE) \ diff --git a/aten/src/ATen/native/DispatchStub.h b/aten/src/ATen/native/DispatchStub.h index 02fb12928090..bd04b4df9a95 100644 --- a/aten/src/ATen/native/DispatchStub.h +++ b/aten/src/ATen/native/DispatchStub.h @@ -197,8 +197,8 @@ struct RegisterHIPDispatch { stub.set_cuda_dispatch_ptr(value); } }; -} // anonymous namespace +} // anonymous namespace // Compiler will complain if you put things like std::tuple in // the `fn` argument of DECLARE_DISPATCH. Some possible workarounds, e.g., // adding parentheses and using helper struct to get rid of the parentheses, do diff --git a/aten/src/ATen/native/Distance.cpp b/aten/src/ATen/native/Distance.cpp index 1bbb9cb8426a..8d23e10b1719 100644 --- a/aten/src/ATen/native/Distance.cpp +++ b/aten/src/ATen/native/Distance.cpp @@ -239,19 +239,72 @@ Tensor _pdist_backward(const Tensor& grad, const Tensor& self, const double p, c return result; } -Tensor cosine_similarity(const Tensor& x1, const Tensor& x2, int64_t dim, double eps) { - auto common_size = at::infer_size_dimvector(x1.sizes(), x2.sizes()); - auto commonDtype = at::result_type(x1, x2); +Tensor cosine_similarity(const Tensor& x1_, const Tensor& x2_, int64_t dim, double eps) { + /* + * cosine_similarity(x1, x2) = / (||x1|| * ||x2||) + * + * The current implementation is an improvement over the previous version. + * + * Previous implementation: + * 1. Compute num = , + * 2. Compute denom = ||x1|| * ||x2||, + * 3. Compute denom = max(denom, eps) to avoid division by zero, + * 4. Return num / denom. + * + * Previous implementation has the following issues: + * 1. Chance of losing precision in when ||x1|| and ||x2|| are large. + * 2. Chance of losing precision in ||x1|| * ||x2|| when ||x1|| and ||x2|| are large. + * 3. Losing precision may cause |cosing_similarity(x1, x2)| > 1.0. + * + * Current implementation: + * 1. Compute x1_normalized = x1 / max(||x1||, eps), + * x2_normalized = x2 / max(||x2||, eps), + * 2. Return . + * + * The current implementation improves over the previous one by: + * 1. Making sure that and ||x1|| * ||x2|| are not computed explicitly, + * hence avoiding floating point overflows. + * 2. 
Both methods might have issues with computing ||x1|| and ||x2||, but for + * the current method this is the only source of the floating point imprecision. + * 3. Makes sure |cosing_similarity(x1, x2)| <= 1.0. + * + */ + auto commonDtype = at::result_type(x1_, x2_); TORCH_CHECK(at::isFloatingType(commonDtype), "expected common dtype to be floating point, yet common dtype is ", commonDtype); - Tensor x1_ = x1.to(commonDtype).expand(common_size); - Tensor x2_ = x2.to(commonDtype).expand(common_size); - // Follow scipy impl to improve numerical precision - // Use x / sqrt(x * x) instead of x / (sqrt(x) * sqrt(x)) - Tensor w12 = at::sum(x1_ * x2_, dim); - Tensor w1 = at::sum(x1_ * x1_, dim); - Tensor w2 = at::sum(x2_ * x2_, dim); - Tensor n12 = (w1 * w2).clamp_min_(eps * eps).sqrt_(); - return w12.div_(n12); + + auto common_size = at::infer_size_dimvector(x1_.sizes(), x2_.sizes()); + auto x1 = x1_.to(commonDtype).expand(common_size); + auto x2 = x2_.to(commonDtype).expand(common_size); + + auto x1_squared_norm = at::pow(x1, 2).sum(dim, /*keepdim=*/true); + auto x2_squared_norm = at::pow(x2, 2).sum(dim, /*keepdim=*/true); + + { + at::NoGradGuard guard; + x1_squared_norm.clamp_min_(eps * eps); + x2_squared_norm.clamp_min_(eps * eps); + } + + auto x1_norm = x1_squared_norm.sqrt_(); + auto x2_norm = x2_squared_norm.sqrt_(); + + auto x1_normalized = x1.div(x1_norm); + auto x2_normalized = x2.div(x2_norm); + + Tensor cos_sim_value = at::sum(x1_normalized * x2_normalized, dim); + + // The code above is resistant to over +/-1 overshoots. + // However, if this happens and if it is critical, uncommenting + // the lines below will solve the issue. + // We keep these lines commented as to reduce the number of kernel + // launches for better runtime performance. + //{ + // at::NoGradGuard guard; + // cos_sim_value.clamp_min_(-1.0); + // cos_sim_value.clamp_max_(1.0); + //} + + return cos_sim_value; } }} // namespace at::native diff --git a/aten/src/ATen/native/Distance.h b/aten/src/ATen/native/Distance.h index f8ea4741207b..c2d881ae66f6 100644 --- a/aten/src/ATen/native/Distance.h +++ b/aten/src/ATen/native/Distance.h @@ -1,9 +1,11 @@ #pragma once -#include #include -namespace at { namespace native { +namespace at { +class Tensor; + +namespace native { using pdist_forward_fn = void(*)(Tensor&, const Tensor&, const double p); using pdist_backward_fn = void(*)(Tensor&, const Tensor&, const Tensor&, const double p, const Tensor&); diff --git a/aten/src/ATen/native/DistributionTemplates.h b/aten/src/ATen/native/DistributionTemplates.h index c8a8a6ed8a50..907dffc6f736 100644 --- a/aten/src/ATen/native/DistributionTemplates.h +++ b/aten/src/ATen/native/DistributionTemplates.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -157,50 +158,22 @@ at::Tensor& random_from_to_impl(at::Tensor& self, int64_t from, c10::optional(), \ + "normal expects all elements of std >= 0.0"); \ + } while (0) + +#define CHECK_NORMAL_STD(std) \ + TORCH_CHECK(std >= 0.0, "normal expects std >= 0.0, but found std ", std); template class normal_kernel, typename RNG> Tensor& normal_impl_(Tensor& self, double mean, double std, c10::optional gen) { - TORCH_CHECK(std >= 0.0, "normal_ expects std >= 0.0, but found std=", std); + CHECK_NORMAL_STD(std); if (self.is_complex()) { auto float_tensor = at::view_as_real(self); // variance for normal distribution of the real and imaginary values @@ -214,6 +187,10 @@ Tensor& normal_impl_(Tensor& self, double mean, double std, c10::optional class normal_kernel, typename 
RNG> Tensor& normal_out_impl(Tensor& output, const Tensor& mean, double std, c10::optional gen) { + CHECK_NORMAL_STD(std); + auto std_tensor = at::empty_like(output, MemoryFormat::Contiguous); + auto shape = at::infer_size(mean.sizes(), std_tensor.sizes()); + at::native::resize_output(output, shape); normal_impl_(output, 0, std, gen); output.add_(mean); return output; @@ -221,12 +198,11 @@ Tensor& normal_out_impl(Tensor& output, const Tensor& mean, double std, c10::opt template class normal_kernel, typename RNG> Tensor& normal_out_impl(Tensor& output, double mean, const Tensor& std, c10::optional gen) { - TORCH_CHECK(!std.is_complex(), "normal expects standard deviation to be non-complex"); - TORCH_CHECK( - std.min().ge(0).item(), - "normal expects all elements of std >= 0.0"); - normal_impl_(output, 0, 1, gen); + CHECK_NORMAL_TENSOR_STD(std); auto mean_tensor = at::full({}, mean, output.options()); + auto shape = at::infer_size(mean_tensor.sizes(), std.sizes()); + at::native::resize_output(output, shape); + normal_impl_(output, 0, 1, gen); // CUDA NB: addcmul_out copies the tensor to be added into the output. // Please look at aten/src/THC/generic/THCTensorMathPointwise.cu // The previous function here was addcmul_out(output, mean_tensor, output, std, 1); @@ -238,28 +214,22 @@ Tensor& normal_out_impl(Tensor& output, double mean, const Tensor& std, c10::opt template class normal_kernel, typename RNG> Tensor& normal_out_impl(Tensor& output, const Tensor& mean, const Tensor& std, c10::optional gen) { - TORCH_CHECK(!std.is_complex(), "normal expects standard deviation to be non-complex"); - TORCH_CHECK( - std.numel() == 0 || std.min().ge(0).item(), - "normal expects all elements of std >= 0.0"); - bool is_deprecated_th_impl = resize_output_for_normal(output, mean, std); + CHECK_NORMAL_TENSOR_STD(std); + auto shape = at::infer_size(mean.sizes(), std.sizes()); + at::native::resize_output(output, shape); normal_impl_(output, 0, 1, gen); // CUDA NB: addcmul_out copies the tensor to be added into the output. // Please look at aten/src/THC/generic/THCTensorMathPointwise.cu // The previous function here was addcmul_out(output, mean, output, std, 1); // The third argument is not a constant reference and hence the samples in output are overwritten. 
// Consequently, the computation performed is mean + mean * std instead of mean + output * std - if (is_deprecated_th_impl) { - output.mul_(std.reshape(mean.sizes())).add_(mean); - } - else { - output.mul_(std).add_(mean); - } + output.mul_(std).add_(mean); return output; } template class normal_kernel, typename RNG> Tensor normal_impl(const Tensor& mean, double std, c10::optional gen) { + CHECK_NORMAL_STD(std); Tensor ret = at::empty_like(mean, MemoryFormat::Contiguous); normal_out_impl(ret, mean, std, gen); return ret; @@ -267,6 +237,7 @@ Tensor normal_impl(const Tensor& mean, double std, c10::optional gen) template class normal_kernel, typename RNG> Tensor normal_impl(double mean, const Tensor& std, c10::optional gen) { + CHECK_NORMAL_TENSOR_STD(std); Tensor ret = at::empty_like(std, MemoryFormat::Contiguous); normal_out_impl(ret, mean, std, gen); return ret; @@ -274,7 +245,9 @@ Tensor normal_impl(double mean, const Tensor& std, c10::optional gen) template class normal_kernel, typename RNG> Tensor normal_impl(const Tensor& mean, const Tensor& std, c10::optional gen) { - Tensor ret = at::empty({0}, mean.options(), MemoryFormat::Contiguous); + CHECK_NORMAL_TENSOR_STD(std); + auto shape = at::infer_size(mean.sizes(), std.sizes()); + Tensor ret = at::empty(shape, mean.options(), MemoryFormat::Contiguous); normal_out_impl(ret, mean, std, gen); return ret; } diff --git a/aten/src/ATen/native/Distributions.cpp b/aten/src/ATen/native/Distributions.cpp index b4063af9931f..b23a18a8376a 100644 --- a/aten/src/ATen/native/Distributions.cpp +++ b/aten/src/ATen/native/Distributions.cpp @@ -257,39 +257,77 @@ struct NormalStub { } }; +template +struct NormalMeta { + // No-op! + void operator()(Tensor& self, double mean, double std, c10::optional gen) { + } +}; + +// inplace Tensor& normal_(Tensor& self, double mean, double std, c10::optional gen) { return at::native::templates::normal_impl_(self, mean, std, gen); } Tensor& normal_meta_(Tensor& self, double mean, double std, c10::optional gen) { - TORCH_CHECK(std > 0.0, "normal_ expects std > 0.0, but found std=", std); // TODO: dedupe - return self; + return at::native::templates::normal_impl_(self, mean, std, gen); } +// out tensor float Tensor& normal_out(const Tensor& mean, double std, c10::optional gen, Tensor& output) { return at::native::templates::normal_out_impl(output, mean, std, gen); } +Tensor& normal_out_meta(const Tensor& mean, double std, c10::optional gen, Tensor& output) { + return at::native::templates::normal_out_impl(output, mean, std, gen); +} + +// out float tensor Tensor& normal_out(double mean, const Tensor& std, c10::optional gen, Tensor& output) { return at::native::templates::normal_out_impl(output, mean, std, gen); } +Tensor& normal_out_meta(double mean, const Tensor& std, c10::optional gen, Tensor& output) { + return at::native::templates::normal_out_impl(output, mean, std, gen); + +} + +// out tensor tensor Tensor& normal_out(const Tensor& mean, const Tensor& std, c10::optional gen, Tensor& output) { return at::native::templates::normal_out_impl(output, mean, std, gen); } +Tensor& normal_out_meta(const Tensor& mean, const Tensor& std, c10::optional gen, Tensor& output) { + return at::native::templates::normal_out_impl(output, mean, std, gen); +} + +// functional tensor float Tensor normal(const Tensor& mean, double std, c10::optional gen) { return at::native::templates::normal_impl(mean, std, gen); } +Tensor normal_meta(const Tensor& mean, double std, c10::optional gen) { + return at::native::templates::normal_impl(mean, 
std, gen); +} + +// functional float tensor Tensor normal(double mean, const Tensor& std, c10::optional gen) { return at::native::templates::normal_impl(mean, std, gen); } +Tensor normal_meta(double mean, const Tensor& std, c10::optional gen) { + return at::native::templates::normal_impl(mean, std, gen); +} + +// functional tensor tensor Tensor normal(const Tensor& mean, const Tensor& std, c10::optional gen) { return at::native::templates::normal_impl(mean, std, gen); } +Tensor normal_meta(const Tensor& mean, const Tensor& std, c10::optional gen) { + return at::native::templates::normal_impl(mean, std, gen); +} + // ==================================================== Random ======================================================== template @@ -411,7 +449,7 @@ Tensor _s_poisson_cpu(const Tensor& lambda, c10::optional gen) { .add_output(ret) .add_input(lambda) .build(); - AT_DISPATCH_FLOATING_TYPES(ret.scalar_type(), "poisson_cpu", [&] { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, ret.scalar_type(), "poisson_cpu", [&] { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); // See Note [Acquire lock when using random generators] std::lock_guard lock(generator->mutex_); diff --git a/aten/src/ATen/native/Distributions.h b/aten/src/ATen/native/Distributions.h index ebfaf4631369..2c334157eba9 100644 --- a/aten/src/ATen/native/Distributions.h +++ b/aten/src/ATen/native/Distributions.h @@ -1,7 +1,5 @@ #pragma once -#include -#include #include #include #include diff --git a/aten/src/ATen/native/Dropout.cpp b/aten/src/ATen/native/Dropout.cpp index fb11bc8d8cbb..36e1b92ad1bd 100644 --- a/aten/src/ATen/native/Dropout.cpp +++ b/aten/src/ATen/native/Dropout.cpp @@ -99,11 +99,11 @@ native_dropout_cpu(const Tensor& input, double p, c10::optional train) { double p1m = 1. - p; // Check for probability of zero to avoid divide by zero and NaN results double scale = p1m == 0 ? 0. : 1. 
/ p1m; - mask = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + mask = at::empty_like(input, input.options().dtype(c10::CppTypeToScalarType::value)); mask.bernoulli_(p1m); output = input.mul(mask).mul_(scale); } else { - mask = at::ones_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + mask = at::ones_like(input, input.options().dtype(c10::CppTypeToScalarType::value)); output = input.clone(); } return std::make_tuple(output, mask); diff --git a/aten/src/ATen/native/EmbeddingBag.cpp b/aten/src/ATen/native/EmbeddingBag.cpp index e6f88f556c82..6d8cea26f52e 100644 --- a/aten/src/ATen/native/EmbeddingBag.cpp +++ b/aten/src/ATen/native/EmbeddingBag.cpp @@ -10,6 +10,7 @@ #ifdef USE_FBGEMM #include +#include #else #include #endif @@ -60,14 +61,14 @@ std::pair promoteIndicesAndOffsets( // is only applicable if special conditions are met template bool is_fast_path_index_select(const Tensor& src, Tensor& output, index_t padding_idx) { - return src.scalar_type() == kFloat && src.strides()[1] == 1 && output.strides()[1] == 1 && padding_idx < static_cast(0); + return (src.scalar_type() == kFloat || src.scalar_type() == kHalf) && src.strides()[1] == 1 && output.strides()[1] == 1 && padding_idx < static_cast(0); } // Determines if we can use a fast implementation for index_select_scale_add, // which is only applicable if special conditions are met template bool is_fast_path_index_select_scale(const Tensor& src, const Tensor& scale, Tensor& output, index_t padding_idx) { - return src.scalar_type() == kFloat && src.strides()[1] == 1 && output.strides()[1] == 1 && scale.strides()[0] == 1 && padding_idx < static_cast(0); + return (src.scalar_type() == kFloat || src.scalar_type() == kHalf) && src.strides()[1] == 1 && output.strides()[1] == 1 && scale.strides()[0] == 1 && padding_idx < static_cast(0); } template @@ -81,7 +82,7 @@ bool is_fast_path(const Tensor& src, const c10::optional& scale, Tensor& // index_add (using add_indices as the index), without creating an intermediary // tensor to hold the selected embeddings template -typename std::enable_if::value, void>::type +typename std::enable_if::value && !std::is_same::value, void>::type index_select_add(const Tensor &select_indices, const Tensor &add_indices, const Tensor &src, @@ -89,19 +90,20 @@ index_select_add(const Tensor &select_indices, const Tensor& /*offsets*/, bool /*include_last_offset*/, Tensor &bag_size, - index_t padding_idx) { + index_t padding_idx, + _EmbeddingBagKernelCache* /* fbgemm_kernel_cache */) { TORCH_CHECK(select_indices.numel() == add_indices.numel()); auto* add_indices_data = add_indices.data_ptr(); auto* select_indices_data = select_indices.data_ptr(); auto* src_data = src.data_ptr(); auto* output_data = output.data_ptr(); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - index_t* bag_size_data; + index_t* bag_size_data = nullptr; if (bag_size.defined()) { bag_size_data = bag_size.data_ptr(); } auto numel = add_indices.numel(); - int64_t ddim = src.sizes()[1]; + int64_t ddim = src.size(1); auto vocab_size = src.size(0); auto src_stride0 = src.strides()[0]; auto src_stride1 = src.strides()[1]; @@ -157,6 +159,157 @@ void fbgemm_spmdm_report_error_( } } // namespace +template +typename std::enable_if::value, void>::type +index_select_add(const Tensor &select_indices, + const Tensor &add_indices, + const Tensor &src, + Tensor &output, + const Tensor& offsets, + bool include_last_offset, + Tensor &bag_size, + index_t padding_idx, + _EmbeddingBagKernelCache* fbgemm_kernel_cache) { + int64_t ddim = src.size(1); + auto* 
select_indices_data = select_indices.data_ptr(); + auto* output_data = output.data_ptr(); + + if (is_fast_path_index_select(src, output, padding_idx)) { + auto src_contig = src.contiguous(); + auto* src_data = src_contig.data_ptr(); + int64_t output_size = offsets.numel() - 1; + auto* offsets_data = offsets.data_ptr(); + std::vector offsets_include_last; + + if (include_last_offset) { + output_size = offsets.numel() - 1; + } else { + output_size = offsets.numel(); + offsets_include_last.resize(offsets.numel() + 1); + if (offsets.numel() > 0) { + std::memcpy( + offsets_include_last.data(), + offsets.data_ptr(), + sizeof(index_t) * offsets.numel()); + } + offsets_include_last[offsets.numel()] = select_indices.numel(); + offsets_data = offsets_include_last.data(); + } + +#ifdef USE_FBGEMM + using float16 = uint16_t; + auto kernel_fp16_index_t = fbgemm_kernel_cache ? + fbgemm_kernel_cache->getCallback(ddim) : + fbgemm::GenerateEmbeddingSpMDM( + /* block_size */ddim, + /* has_weight */false, + /* normalize_by_lengths */false, + /* prefetch */16, + /* is_weight_positional */false, + /* use_offsets */true + ); +#else + // Initialize the intermediate output buffer to be 0. + Tensor output_fp32 = at::zeros({output_size, ddim}, output.options().dtype(at::kFloat)); + auto* output_data_fp32 = output_fp32.data_ptr(); +#endif + at::parallel_for( + 0, output_size, 1, [&](index_t start_idx, index_t end_idx) { +#ifdef USE_FBGEMM + bool success = kernel_fp16_index_t( + /* output_size */end_idx - start_idx, + /* index_size */offsets_data[end_idx] - offsets_data[start_idx], + /* data_size */src.size(0), + /* input */reinterpret_cast(src_data), + /* indices */select_indices_data + offsets_data[start_idx], + /* offsets_or_lengths */offsets_data + start_idx, + /* weights */nullptr, + /* output */reinterpret_cast(output_data + start_idx * ddim)); + if (!success) { + fbgemm_spmdm_report_error_( + end_idx - start_idx, + offsets_data[end_idx] - offsets_data[start_idx], + src.size(0), + offsets_data + start_idx, + select_indices_data + offsets_data[start_idx]); + } +#else + caffe2::EmbeddingLookupIdx( + /*block_size=*/ddim, + /*output_size=*/end_idx - start_idx, + /*index_size=*/offsets_data[end_idx] - offsets_data[start_idx], + /*data_size=*/src.size(0), + /*input=*/src_data, + /*indices=*/select_indices_data + offsets_data[start_idx], + /*offsets=*/offsets_data + start_idx, + /*weights=*/nullptr, + /*scale_bias=*/nullptr, + /*normalize_by_lengths=*/false, + /*out=*/output_data_fp32 + start_idx * ddim); + for (const auto i : c10::irange(output_size)) { + // Convert FP32 intermediate buffer result back to FP16 for output dtype + for (const auto d : c10::irange(ddim)) { + (output_data + i * ddim)[d] = static_cast((output_data_fp32 + ddim * i)[d]); + } + } +#endif + }); + + } else { + TORCH_CHECK(select_indices.numel() == add_indices.numel()); + auto* src_data = src.data_ptr(); + auto* add_indices_data = add_indices.data_ptr(); + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + index_t* bag_size_data = nullptr; + if (bag_size.defined()) { + bag_size_data = bag_size.data_ptr(); + } + auto vocab_size = src.size(0); + auto src_stride0 = src.strides()[0]; + auto src_stride1 = src.strides()[1]; + auto output_stride0 = output.strides()[0]; + auto output_stride1 = output.strides()[1]; + auto numel = add_indices.numel(); + + Tensor src_fp32 = at::empty({ddim}, src.options().dtype(at::kFloat)); + auto* src_data_fp32 = src_fp32.data_ptr(); + + // Initialize the intermediate output buffer to be 0. 
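+      // Slow path for Half inputs: each selected row is staged in src_fp32,
+      // accumulated into a float32 scratch output with cpublas::axpy, and the
+      // result is cast back to Half once every index has been processed.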
+ Tensor output_fp32 = at::zeros({output.size(0), ddim}, output.options().dtype(at::kFloat)); + auto* output_data_fp32 = output_fp32.data_ptr(); + + for (const auto i : c10::irange(numel)) { + // We can skip indices equal to padding_idx so they are not included in + // the reduction + auto idx = select_indices_data[i]; + TORCH_CHECK( + idx >= 0 && idx < vocab_size, + "embedding_bag: Expected idx >= 0 && idx < num_embeddings but found idx to be ", + idx); + if (idx != padding_idx) { + // Copy src_data + src_stride0 * idx to src_data_fp32 + for (const auto d : c10::irange(ddim)) { + src_data_fp32[d] = static_cast((src_data + src_stride0 * idx)[d * src_stride1]); + } + at::native::cpublas::axpy(ddim, 1, + src_data_fp32, 1, + output_data_fp32 + ddim * add_indices_data[i], 1); + + } else if (bag_size.defined()) { + // Decrement bag_size to reflect that the index is padded + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + bag_size_data[add_indices_data[i]]--; + } + } + for (const auto i : c10::irange(output.size(0))) { + // Convert FP32 intermediate buffer result back to FP16 for output dtype + for (const auto d : c10::irange(ddim)) { + (output_data + output_stride0 * i)[d * output_stride1] = static_cast((output_data_fp32 + ddim * i)[d]); + } + } + } +} + template typename std::enable_if::value, void>::type index_select_add(const Tensor &select_indices, @@ -166,8 +319,9 @@ index_select_add(const Tensor &select_indices, const Tensor& offsets, bool include_last_offset, Tensor &bag_size, - index_t padding_idx) { - int64_t ddim = src.sizes()[1]; + index_t padding_idx, + _EmbeddingBagKernelCache* fbgemm_kernel_cache) { + int64_t ddim = src.size(1); auto* select_indices_data = select_indices.data_ptr(); auto* output_data = output.data_ptr(); @@ -195,6 +349,8 @@ index_select_add(const Tensor &select_indices, #ifdef USE_FBGEMM auto kernel_fp32_index_t = + fbgemm_kernel_cache ? 
+ fbgemm_kernel_cache->getCallback(ddim) : fbgemm::GenerateEmbeddingSpMDM( /* block_size */ddim, /* has_weight */false, @@ -210,7 +366,7 @@ index_select_add(const Tensor &select_indices, bool success = kernel_fp32_index_t( /* output_size */end_idx - start_idx, /* index_size */offsets_data[end_idx] - offsets_data[start_idx], - /* data_size */src.sizes()[0], + /* data_size */src.size(0), /* input */src_data, /* indices */select_indices_data + offsets_data[start_idx], /* offsets_or_lengths */offsets_data + start_idx, @@ -220,7 +376,7 @@ index_select_add(const Tensor &select_indices, fbgemm_spmdm_report_error_( end_idx - start_idx, offsets_data[end_idx] - offsets_data[start_idx], - src.sizes()[0], + src.size(0), offsets_data + start_idx, select_indices_data + offsets_data[start_idx]); } @@ -229,7 +385,7 @@ index_select_add(const Tensor &select_indices, /*block_size=*/ddim, /*output_size=*/end_idx - start_idx, /*index_size=*/offsets_data[end_idx] - offsets_data[start_idx], - /*data_size=*/src.sizes()[0], + /*data_size=*/src.size(0), /*input=*/src_data, /*indices=*/select_indices_data + offsets_data[start_idx], /*offsets=*/offsets_data + start_idx, @@ -244,7 +400,7 @@ index_select_add(const Tensor &select_indices, auto* src_data = src.data_ptr(); auto* add_indices_data = add_indices.data_ptr(); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - index_t* bag_size_data; + index_t* bag_size_data = nullptr; if (bag_size.defined()) { bag_size_data = bag_size.data_ptr(); } @@ -284,7 +440,7 @@ index_select_add(const Tensor &select_indices, // mul (scaling by per_sample_weights) // index_add (using add_indices as the index) template -static typename std::enable_if::value, void>::type +static typename std::enable_if::value && !std::is_same::value, void>::type index_select_scale_add(const Tensor &select_indices, const Tensor &add_indices, const Tensor &scale, @@ -293,14 +449,15 @@ index_select_scale_add(const Tensor &select_indices, const Tensor& /*offsets*/, bool /*include_last_offset*/, Tensor &bag_size, - index_t padding_idx) { + index_t padding_idx, + _EmbeddingBagKernelCache* /* fbgemm_kernel_cache */) { AT_ASSERT(select_indices.numel() == add_indices.numel()); auto* add_indices_data = add_indices.data_ptr(); auto* select_indices_data = select_indices.data_ptr(); auto* src_data = src.data_ptr(); auto* output_data = output.data_ptr(); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - index_t* bag_size_data; + index_t* bag_size_data = nullptr; if (bag_size.defined()) { bag_size_data = bag_size.data_ptr(); } @@ -338,6 +495,161 @@ index_select_scale_add(const Tensor &select_indices, } } +template +typename std::enable_if::value, void>::type +index_select_scale_add(const Tensor &select_indices, + const Tensor &add_indices, + const Tensor &scale, + const Tensor &src, + Tensor &output, + const Tensor& offsets, + bool include_last_offset, + Tensor &bag_size, + index_t padding_idx, + _EmbeddingBagKernelCache* fbgemm_kernel_cache) { + int64_t ddim = src.size(1); + auto* scale_data = scale.data_ptr(); + auto* select_indices_data = select_indices.data_ptr(); + auto* output_data = output.data_ptr(); + + if (is_fast_path_index_select_scale(src, scale, output, padding_idx)) { + auto src_contig = src.contiguous(); + auto* src_data = src_contig.data_ptr(); + int64_t output_size = offsets.numel() - 1; + auto* offsets_data = offsets.data_ptr(); + std::vector offsets_include_last; + + if (include_last_offset) { + output_size = offsets.numel() - 1; + } else { + output_size = offsets.numel(); + 
offsets_include_last.resize(offsets.numel() + 1); + std::memcpy( + offsets_include_last.data(), + offsets.data_ptr(), + sizeof(index_t) * offsets.numel()); + offsets_include_last[offsets.numel()] = select_indices.numel(); + offsets_data = offsets_include_last.data(); + } + + Tensor scale_fp32 = at::empty(scale.sizes(), scale.options().dtype(at::kFloat)); + auto* scale_data_fp32 = scale_fp32.data_ptr(); + +#ifdef USE_FBGEMM + using float16 = uint16_t; + fbgemm::Float16ToFloat_simd(reinterpret_cast(scale_data), scale_data_fp32, scale_fp32.numel()); + auto kernel_fp16_index_t = + fbgemm_kernel_cache ? + fbgemm_kernel_cache->getCallback(ddim) : + fbgemm::GenerateEmbeddingSpMDM( + /* block_size */ddim, + /* has_weight */true, + /* normalize_by_lengths */false, + /* prefetch */16, + /* is_weight_positional */false, + /* use_offsets */true + ); +#else + // Initialize the intermediate output buffer to be 0. + Tensor output_fp32 = at::zeros({output_size, ddim}, output.options().dtype(at::kFloat)); + auto* output_data_fp32 = output_fp32.data_ptr(); + for (const auto i : c10::irange(scale.numel())) { + scale_data_fp32[i] = static_cast(scale_data[i]); + } +#endif + at::parallel_for( + 0, output_size, 1, [&](index_t start_idx, index_t end_idx) { +#ifdef USE_FBGEMM + bool success = kernel_fp16_index_t( + /* output_size */end_idx - start_idx, + /* index_size */offsets_data[end_idx] - offsets_data[start_idx], + /* data_size */src.size(0), + /* input */reinterpret_cast(src_data), + /* indices */select_indices_data + offsets_data[start_idx], + /* offsets_or_lengths */offsets_data + start_idx, + /* weights */scale_data_fp32 + offsets_data[start_idx], + /* output */reinterpret_cast(output_data + start_idx * ddim)); + if (!success) { + fbgemm_spmdm_report_error_( + end_idx - start_idx, + offsets_data[end_idx] - offsets_data[start_idx], + src.size(0), + offsets_data + start_idx, + select_indices_data + offsets_data[start_idx]); + } +#else + caffe2::EmbeddingLookupIdx( + /*block_size=*/ddim, + /*output_size=*/end_idx - start_idx, + /*index_size=*/offsets_data[end_idx] - offsets_data[start_idx], + /*data_size=*/src.size(0), + /*input=*/src_data, + /*indices=*/select_indices_data + offsets_data[start_idx], + /*offsets=*/offsets_data + start_idx, + /*weights=*/scale_data_fp32 + offsets_data[start_idx], + /*scale_bias=*/nullptr, + /*normalize_by_lengths=*/false, + /*out=*/output_data_fp32 + start_idx * ddim); + for (const auto i : c10::irange(output_size)) { + // Convert FP32 intermediate buffer result back to FP16 for output dtype + for (const auto d : c10::irange(ddim)) { + (output_data + i * ddim)[d] = static_cast((output_data_fp32 + ddim * i)[d]); + } + } +#endif + }); + } else { + AT_ASSERT(select_indices.numel() == add_indices.numel()); + auto* src_data = src.data_ptr(); + auto* add_indices_data = add_indices.data_ptr(); + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + index_t* bag_size_data = nullptr; + if (bag_size.defined()) { + bag_size_data = bag_size.data_ptr(); + } + auto vocab_size = src.size(0); + auto src_stride0 = src.strides()[0]; + auto src_stride1 = src.strides()[1]; + auto output_stride0 = output.strides()[0]; + auto output_stride1 = output.strides()[1]; + auto scale_stride = scale.strides()[0]; + auto numel = add_indices.numel(); + + // Initialize the intermediate output buffer to be 0. 
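+      // As in index_select_add, the Half slow path accumulates in float32:
+      // rows are upcast on the fly, scaled by their per-sample weight, summed
+      // into a float32 scratch output, and cast back to Half at the end.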
+ Tensor output_fp32 = at::zeros({output.size(0), ddim}, output.options().dtype(at::kFloat)); + auto* output_data_fp32 = output_fp32.data_ptr(); + + for (const auto i : c10::irange(numel)) { + // We can skip indices equal to padding_idx so they are not included in + // the reduction + auto idx = select_indices_data[i]; + TORCH_CHECK( + idx >= 0 && idx < vocab_size, + "embedding_bag: Expected idx >= 0 && idx < num_embeddings but found idx to be ", + idx); + if (idx != padding_idx) { + + auto* src_base = src_data + src_stride0 * idx; + auto* output_base_fp32 = output_data_fp32 + ddim * add_indices_data[i]; + auto scale = scale_data[i * scale_stride]; + for (const auto j : c10::irange(ddim)) { + output_base_fp32[j] += static_cast(src_base[j * src_stride1]) * static_cast(scale); + } + } else if (bag_size.defined()) { + // Decrement bag_size to reflect that the index is padded + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + bag_size_data[add_indices_data[i]]--; + } + } + for (const auto i : c10::irange(output.size(0))) { + // Convert FP32 intermediate buffer result back to FP16 for output dtype + for (const auto d : c10::irange(ddim)) { + (output_data + output_stride0 * i)[d * output_stride1] = static_cast((output_data_fp32 + ddim * i)[d]); + } + } + } +} + template typename std::enable_if::value, void>::type index_select_scale_add(const Tensor &select_indices, @@ -348,8 +660,9 @@ index_select_scale_add(const Tensor &select_indices, const Tensor& offsets, bool include_last_offset, Tensor &bag_size, - index_t padding_idx) { - int64_t ddim = src.sizes()[1]; + index_t padding_idx, + _EmbeddingBagKernelCache* fbgemm_kernel_cache) { + int64_t ddim = src.size(1); auto* scale_data = scale.data_ptr(); auto* select_indices_data = select_indices.data_ptr(); auto* output_data = output.data_ptr(); @@ -376,6 +689,8 @@ index_select_scale_add(const Tensor &select_indices, #ifdef USE_FBGEMM auto kernel_fp32_index_t = + fbgemm_kernel_cache ? 
+ fbgemm_kernel_cache->getCallback(ddim) : fbgemm::GenerateEmbeddingSpMDM( /* block_size */ddim, /* has_weight */true, @@ -391,7 +706,7 @@ index_select_scale_add(const Tensor &select_indices, bool success = kernel_fp32_index_t( /* output_size */end_idx - start_idx, /* index_size */offsets_data[end_idx] - offsets_data[start_idx], - /* data_size */src.sizes()[0], + /* data_size */src.size(0), /* input */src_data, /* indices */select_indices_data + offsets_data[start_idx], /* offsets_or_lengths */offsets_data + start_idx, @@ -401,7 +716,7 @@ index_select_scale_add(const Tensor &select_indices, fbgemm_spmdm_report_error_( end_idx - start_idx, offsets_data[end_idx] - offsets_data[start_idx], - src.sizes()[0], + src.size(0), offsets_data + start_idx, select_indices_data + offsets_data[start_idx]); } @@ -410,7 +725,7 @@ index_select_scale_add(const Tensor &select_indices, /*block_size=*/ddim, /*output_size=*/end_idx - start_idx, /*index_size=*/offsets_data[end_idx] - offsets_data[start_idx], - /*data_size=*/src.sizes()[0], + /*data_size=*/src.size(0), /*input=*/src_data, /*indices=*/select_indices_data + offsets_data[start_idx], /*offsets=*/offsets_data + start_idx, @@ -425,7 +740,7 @@ index_select_scale_add(const Tensor &select_indices, auto* src_data = src.data_ptr(); auto* add_indices_data = add_indices.data_ptr(); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - index_t* bag_size_data; + index_t* bag_size_data = nullptr; if (bag_size.defined()) { bag_size_data = bag_size.data_ptr(); } @@ -477,17 +792,17 @@ void check_arguments( checkScalarTypes("embedding_bag", offsets_arg, {kLong, kInt}); checkSameType("embedding_bag", indices_arg, offsets_arg); auto weight_arg = TensorArg(weight, "weight", 1); - checkScalarTypes("embedding_bag", weight_arg, {kFloat, kDouble}); + checkScalarTypes("embedding_bag", weight_arg, {kHalf, kFloat, kDouble}); AT_DISPATCH_INDEX_TYPES(offsets.scalar_type(), "_embedding_bag_cpu_impl", [&]() { - if (offsets.sizes()[0] > 0) { + if (offsets.size(0) > 0) { index_t offset_0 = offsets.data_ptr()[0]; - index_t offset_n = offsets.data_ptr()[offsets.sizes()[0]-1]; + index_t offset_n = offsets.data_ptr()[offsets.size(0)-1]; TORCH_CHECK(offset_0 == 0, "offsets[0] has to be 0, i.e., the first sequence " "in the mini-batch has to start from position 0. " "However, got ", offsets[0]); - TORCH_CHECK(offset_n <= indices.sizes()[0], "offsets[-1] can not " - "be greater than input's length ", indices.sizes()[0], " but got offsets[-1] of ", + TORCH_CHECK(offset_n <= indices.size(0), "offsets[-1] can not " + "be greater than input's length ", indices.size(0), " but got offsets[-1] of ", offset_n); } }); @@ -504,7 +819,7 @@ void check_arguments( if (include_last_offset) { TORCH_CHECK( - offsets.sizes()[0] >= 1, + offsets.size(0) >= 1, "include_last_offset: number of offset should be at least 1"); } } @@ -517,16 +832,16 @@ void make_bag_size_out( const bool include_last_offset, const bool requires_grad) { if (requires_grad || mode == MODE_MEAN || mode == MODE_MAX) { - auto num_bags = offsets.sizes()[0] - (include_last_offset ? 1 : 0); + auto num_bags = offsets.size(0) - (include_last_offset ? 
1 : 0); at::native::resize_(bag_size_out, {num_bags}, c10::nullopt); // Compute this for MODE_MEAN and MODE_MAX (latter needed for backwards) if (num_bags != 1) { - bag_size_out.slice(0, 0, bag_size_out.sizes()[0] - 1, 1) = + bag_size_out.slice(0, 0, bag_size_out.size(0) - 1, 1) = offsets.slice(0, 1, num_bags, 1) - offsets.slice(0, 0, num_bags - 1, 1); } if (num_bags > 0) { - bag_size_out[-1] = indices.sizes()[0] - offsets[num_bags - 1]; + bag_size_out[-1] = indices.size(0) - offsets[num_bags - 1]; } } else { at::native::resize_(bag_size_out, offsets.sizes(), c10::nullopt); @@ -541,7 +856,7 @@ void make_max_indices_out( const Tensor& bag_size, const int64_t mode, bool include_last_offset) { - int64_t numBags = offsets.sizes()[0]; + int64_t numBags = offsets.size(0); if (mode == MODE_MAX) { if (include_last_offset) { TORCH_CHECK( @@ -569,13 +884,11 @@ void make_offset2bag_out( bool fast_path_sum = is_fast_path(weight, per_sample_weights, output, padding_idx); if (mode == MODE_MEAN || mode == MODE_MAX || !fast_path_sum) { - at::native::resize_(offset2bag, {indices.sizes()[0] + 1}, c10::nullopt); + at::native::resize_(offset2bag, {indices.size(0) + 1}, c10::nullopt); at::native::zero_(offset2bag); - } - if (mode == MODE_MEAN || mode == MODE_MAX || !fast_path_sum) { make_offset2bag(offsets, offset2bag); - at::native::resize_(offset2bag, {indices.sizes()[0]}, c10::nullopt); + at::native::resize_(offset2bag, {indices.size(0)}, c10::nullopt); // only initialize output in slow path at::native::zero_(output); } @@ -647,7 +960,7 @@ static Tensor apply_bag_size_backward( template void embedding_bag_cpu_max_out( - Tensor& max_indices, + Tensor* max_indices, const Tensor& weight, const Tensor& indices, const Tensor& offset2bag, @@ -662,8 +975,12 @@ void embedding_bag_cpu_max_out( auto* indices_data = indices.data_ptr(); auto* offset2bag_data = offset2bag.data_ptr(); - auto* max_indices_data = max_indices.data_ptr(); - auto max_indices_stride = max_indices.strides()[0]; + index_t* max_indices_data = nullptr; + int64_t max_indices_stride = 0; + if (max_indices) { + max_indices_data = max_indices->data_ptr(); + max_indices_stride = max_indices->strides()[0]; + } auto* weight_data = weight.data_ptr(); auto* output_data = output.data_ptr(); @@ -690,7 +1007,9 @@ void embedding_bag_cpu_max_out( if (is_first_for_bag || (weight_item > current_item)) { current_item = weight_item; - max_indices_data[max_indices_stride * bag + dim] = word_idx; + if (max_indices_data) { + max_indices_data[max_indices_stride * bag + dim] = word_idx; + } } } if (is_first_for_bag) { @@ -705,22 +1024,22 @@ void embedding_bag_cpu_max_out( } void _embedding_bag_cpu_impl_out(Tensor& output, Tensor& offset2bag, - Tensor& bag_size, Tensor& max_indices, + Tensor& bag_size, Tensor* max_indices, const Tensor &weight, const Tensor &indices, const Tensor &offsets, const int64_t mode, const c10::optional& per_sample_weights, - bool include_last_offset, int64_t padding_idx) { + bool include_last_offset, int64_t padding_idx, _EmbeddingBagKernelCache* fbgemm_kernel_cache) { if (mode == MODE_MEAN || mode == MODE_SUM) { - AT_DISPATCH_FLOATING_TYPES(weight.scalar_type(), "embedding_bag_no_grad_cpu_out", - [&indices, &offset2bag, &per_sample_weights, &weight, &output, &offsets, &include_last_offset, &mode, &bag_size, &padding_idx]() { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, weight.scalar_type(), "embedding_bag_no_grad_cpu_out", + [&indices, &offset2bag, &per_sample_weights, &weight, &output, &offsets, 
&include_last_offset, &mode, &bag_size, &padding_idx, &fbgemm_kernel_cache]() { AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_bag_no_grad_cpu_out", - [&indices, &offset2bag, &per_sample_weights, &weight, &output, &offsets, &include_last_offset, &mode, &bag_size, &padding_idx]() { + [&indices, &offset2bag, &per_sample_weights, &weight, &output, &offsets, &include_last_offset, &mode, &bag_size, &padding_idx, &fbgemm_kernel_cache]() { if (per_sample_weights.has_value() && per_sample_weights.value().defined()) { TORCH_INTERNAL_ASSERT(mode == MODE_SUM); index_select_scale_add( - indices, offset2bag, per_sample_weights.value(), weight, output, offsets, include_last_offset, bag_size, padding_idx); + indices, offset2bag, per_sample_weights.value(), weight, output, offsets, include_last_offset, bag_size, padding_idx, fbgemm_kernel_cache); } else { - index_select_add(indices, offset2bag, weight, output, offsets, include_last_offset, bag_size, padding_idx); + index_select_add(indices, offset2bag, weight, output, offsets, include_last_offset, bag_size, padding_idx, fbgemm_kernel_cache); } }); }); @@ -729,7 +1048,9 @@ void _embedding_bag_cpu_impl_out(Tensor& output, Tensor& offset2bag, // make bag_size output deterministic at::native::zero_(bag_size); } - max_indices.copy_(bag_size); + if (max_indices) { + max_indices->copy_(bag_size); + } } else { // MODE_MAX AT_DISPATCH_FLOATING_TYPES_AND_HALF( weight.scalar_type(), "embedding_bag_cpu_max_out", [&]() { @@ -756,7 +1077,7 @@ std::tuple _embedding_bag_cpu_impl( check_arguments(weight, indices, offsets, mode, per_sample_weights, include_last_offset); Tensor output = at::empty( - {include_last_offset ? offsets.sizes()[0] - 1 : offsets.sizes()[0], + {include_last_offset ? offsets.size(0) - 1 : offsets.size(0), weight.sizes()[1]}, weight.options()); @@ -767,7 +1088,7 @@ std::tuple _embedding_bag_cpu_impl( Tensor max_indices = make_max_indices(weight, indices, offsets, bag_size, mode, include_last_offset); _embedding_bag_cpu_impl_out(output, offset2bag, - bag_size, max_indices, + bag_size, &max_indices, weight, indices, offsets, mode, per_sample_weights, include_last_offset, padding_idx); @@ -866,6 +1187,63 @@ _embedding_bag_cpu(const Tensor &weight, const Tensor &indices, /*requires_grad=*/true); } +void _embedding_bag_cpu_out( + at::Tensor& output, + at::Tensor& offset2bag, + at::Tensor& bag_size, + at::Tensor* p_max_indices, + const at::Tensor& weight, + const at::Tensor& indices, + const at::Tensor& offsets, + const bool /* scale_grad_by_freq */, + const int64_t mode, + const bool /* sparse */, + const c10::optional& per_sample_weights, + const bool include_last_offset, + const c10::optional& padding_idx, + _EmbeddingBagKernelCache* fbgemm_kernel_cache) { + at::native::check_arguments( + weight, indices, offsets, mode, per_sample_weights, include_last_offset); + + at::native::make_offset2bag_out( + offset2bag, + output, + weight, + indices, + offsets, + mode, + per_sample_weights, + padding_idx.value_or(-1)); + + at::native::make_bag_size_out( + bag_size, offsets, indices, mode, include_last_offset, false); + + if (p_max_indices) { + at::native::make_max_indices_out( + *p_max_indices, + weight, + indices, + offsets, + bag_size, + mode, + include_last_offset); + } + + at::native::_embedding_bag_cpu_impl_out( + output, + offset2bag, + bag_size, + p_max_indices, + weight, + indices, + offsets, + mode, + per_sample_weights, + include_last_offset, + padding_idx.value_or(-1), + fbgemm_kernel_cache); +} + // Assumes all input tensors are contiguous. 
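// If the forward ran the fast sum path, offset2bag is left empty; the backward
// rebuilds it from offsets (see below) before computing gradients.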
// See NOTE [ embedding_bag Native Functions ] in native_functions.yaml for details Tensor _embedding_bag_backward(const Tensor &grad, const Tensor &indices_, @@ -894,10 +1272,10 @@ Tensor _embedding_bag_backward(const Tensor &grad, const Tensor &indices_, Tensor offset2bag_; if (indices.numel() != 0 && offset2bag.numel() == 0) { offset2bag_ = at::zeros( - {indices.sizes()[0] + 1}, offsets.options()); // offset2bag = [0 0 0 0 0] + {indices.size(0) + 1}, offsets.options()); // offset2bag = [0 0 0 0 0] make_offset2bag(offsets, offset2bag_); - offset2bag_.resize_({indices.sizes()[0]}); + offset2bag_.resize_({indices.size(0)}); } else { auto offset2bag_arg = TensorArg(offset2bag, "offset2bag", 1); checkScalarTypes("embedding_bag", offset2bag_arg, {kLong, kInt}); @@ -1081,7 +1459,7 @@ Tensor _embedding_bag_dense_backward_cpu(const Tensor &grad_, const Tensor &indi // for more details. auto grad = grad_.contiguous(); auto grad_arg = TensorArg(grad, "grad_", 1); - checkScalarTypes("embedding_bag", grad_arg, {kFloat, kDouble}); + checkScalarTypes("embedding_bag", grad_arg, {kHalf, kFloat, kDouble}); if (mode == MODE_MAX) { return _embedding_bag_dense_backward_cpu_max( @@ -1092,12 +1470,24 @@ Tensor _embedding_bag_dense_backward_cpu(const Tensor &grad_, const Tensor &indi auto index_grad_weight = at::zeros({num_weights, grad.sizes()[1]}, grad.options()); - AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "embedding_bag_backward", [&] { - _embedding_bag_dense_backward_cpu_sum_mean( - grad, indices_, offset2bag__, bag_size_, num_weights, - scale_grad_by_freq, mode, per_sample_weights_, index_grad_weight, - padding_idx); - }); + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + grad.scalar_type(), + "embedding_bag_backward", + [&] { + _embedding_bag_dense_backward_cpu_sum_mean( + grad, + indices_, + offset2bag__, + bag_size_, + num_weights, + scale_grad_by_freq, + mode, + per_sample_weights_, + index_grad_weight, + padding_idx); + }); return index_grad_weight; } @@ -1120,7 +1510,7 @@ Tensor _embedding_bag_per_sample_weights_backward_cpu_template( Tensor indices, offsets; std::tie(indices, offsets) = promoteIndicesAndOffsets(indices_, offsets_); AT_ASSERT(indices.dim() == 1); - auto num_samples = indices.sizes()[0]; + auto num_samples = indices.size(0); AT_ASSERT(weight.dim() == 2); AT_ASSERT(weight.sizes()[1] == embedding_features); @@ -1134,11 +1524,11 @@ Tensor _embedding_bag_per_sample_weights_backward_cpu_template( Tensor offset2bag_; if (indices.numel() != 0 && offset2bag.numel() == 0) { offset2bag_ = at::zeros( - {indices.sizes()[0] + 1}, offset2bag.options()); // offset2bag = [0 0 0 0 0] + {indices.size(0) + 1}, offset2bag.options()); // offset2bag = [0 0 0 0 0] make_offset2bag(offsets, offset2bag_); - at::native::resize_(offset2bag_, {indices.sizes()[0]}, c10::nullopt); + at::native::resize_(offset2bag_, {indices.size(0)}, c10::nullopt); } else { auto offset2bag_arg = TensorArg(offset2bag, "offset2bag", 1); checkScalarTypes("embedding_bag", offset2bag_arg, {kLong, kInt}); @@ -1194,12 +1584,16 @@ Tensor _embedding_bag_per_sample_weights_backward_cpu( const Tensor& offset2bag, int64_t mode, int64_t padding_idx) { - return AT_DISPATCH_FLOATING_TYPES( - grad.scalar_type(), "_embedding_bag_per_sample_weights_backward_cpu", [&]() { - return _embedding_bag_per_sample_weights_backward_cpu_template( - grad, weight, indices, offsets, offset2bag, mode, padding_idx); - } - ); + return AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + 
grad.scalar_type(), + "_embedding_bag_per_sample_weights_backward_cpu", + [&]() { + return _embedding_bag_per_sample_weights_backward_cpu_template< + scalar_t>( + grad, weight, indices, offsets, offset2bag, mode, padding_idx); + }); } Tensor _embedding_bag_sparse_backward( @@ -1229,6 +1623,5 @@ Tensor _embedding_bag_sparse_backward( return native::embedding_backward(index_grad, indices, num_weights, padding_idx, scale_grad_by_freq, true); } - } } // namespace at::native diff --git a/aten/src/ATen/native/EmbeddingBag.h b/aten/src/ATen/native/EmbeddingBag.h index e0ce5f01b384..6600c661d46a 100644 --- a/aten/src/ATen/native/EmbeddingBag.h +++ b/aten/src/ATen/native/EmbeddingBag.h @@ -1,4 +1,9 @@ #include +#include + +#ifdef USE_FBGEMM +#include +#endif namespace at { namespace native { @@ -38,12 +43,98 @@ void make_offset2bag_out( const c10::optional& per_sample_weights, const int64_t padding_idx = -1); +#ifdef USE_FBGEMM + +template +struct _CallbackAndBlockSize { + using TCallback = typename fbgemm::EmbeddingSpMDMKernelSignature::Type; + + int64_t blockSize = -1; + TCallback callback = nullptr; + + static TCallback generateCallback(int64_t block_size) { + return fbgemm::GenerateEmbeddingSpMDM( + block_size, + has_weight, + /* normalize_by_lengths */false, + /* prefetch */16, + /* is_weight_positional */false, + /* use_offsets */true); + } + + _CallbackAndBlockSize() = default; + + explicit _CallbackAndBlockSize(c10::optional maybe_block_size) + : blockSize(maybe_block_size.value_or(-1)) + , callback(maybe_block_size.has_value() ? generateCallback(maybe_block_size.value()) : nullptr) + {} +}; + +template +struct _EmbeddingBagKernelCacheImpl : private StorageMixins... { + + _EmbeddingBagKernelCacheImpl() = default; + // use each of the mixins to store corresponding kernel and block size + explicit _EmbeddingBagKernelCacheImpl(c10::optional maybe_block_size) + : StorageMixins(maybe_block_size)... 
+ {} + + // this method is thread safe (call sites may call from different threads) + template + typename _CallbackAndBlockSize::TCallback + getCallback(int64_t block_size) const { + // if the cache doesn't store the kernel for the incoming block size + // (so it is different from the one stored in corresponding mixin) + // regenerate the kernel (not writing it into the cache so we avoid locks) + if (block_size != _CallbackAndBlockSize::blockSize) { + return _CallbackAndBlockSize::generateCallback(block_size); + } + // else retrieve the cached kernel from the corresponding mixin + return _CallbackAndBlockSize::callback; + } +}; + +// instantiate the cache with the list of storage mixins +// for each of the 8 _EmbeddingBagKernelCache* usages in the EmbeddingBag.cpp impl file +using _EmbeddingBagKernelCache = _EmbeddingBagKernelCacheImpl< + _CallbackAndBlockSize, + _CallbackAndBlockSize, + _CallbackAndBlockSize, + _CallbackAndBlockSize, + _CallbackAndBlockSize, + _CallbackAndBlockSize, + _CallbackAndBlockSize, + _CallbackAndBlockSize>; +#else +struct _EmbeddingBagKernelCache { + explicit _EmbeddingBagKernelCache(c10::optional /* maybe_block_size */) {} +}; +#endif + void _embedding_bag_cpu_impl_out(Tensor& output, Tensor& offset2bag, - Tensor& bag_size, Tensor& max_indices, + Tensor& bag_size, Tensor* max_indices, const Tensor &weight, const Tensor &indices, const Tensor &offsets, const int64_t mode = 0, const c10::optional& per_sample_weights = c10::nullopt, bool include_last_offset = false, - int64_t padding_idx = -1); + int64_t padding_idx = -1, + _EmbeddingBagKernelCache* fbgemm_kernel_cache = nullptr); + +void _embedding_bag_cpu_out( + at::Tensor& output, + at::Tensor& offset2bag, + at::Tensor& bag_size, + at::Tensor* p_max_indices, + const at::Tensor& weight, + const at::Tensor& indices, + const at::Tensor& offsets, + const bool scale_grad_by_freq, + const int64_t mode, + const bool sparse, + const c10::optional& per_sample_weights, + const bool include_last_offset, + const c10::optional& padding_idx, + _EmbeddingBagKernelCache* fbgemm_kernel_cache = nullptr); + } // native } // at diff --git a/aten/src/ATen/native/Fill.cpp b/aten/src/ATen/native/Fill.cpp index ca48deab83ae..63fc611961cc 100644 --- a/aten/src/ATen/native/Fill.cpp +++ b/aten/src/ATen/native/Fill.cpp @@ -61,6 +61,14 @@ Tensor& fill_meta_(Tensor& self, const Tensor& value) { return self; } +Tensor fill(const Tensor& self, const Scalar& value) { + return at::empty_like(self).fill_(value); +} + +Tensor fill(const Tensor& self, const Tensor& value) { + return at::empty_like(self).fill_(value); +} + DEFINE_DISPATCH(fill_stub); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ fill_diagonal ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/aten/src/ATen/native/Fill.h b/aten/src/ATen/native/Fill.h index e1903a379a0c..f6de9580ae7c 100644 --- a/aten/src/ATen/native/Fill.h +++ b/aten/src/ATen/native/Fill.h @@ -2,13 +2,19 @@ #pragma once -#include #include -#include -namespace at { namespace native { +namespace c10 { +class Scalar; +} -DECLARE_DISPATCH(void(*)(TensorIterator&, const Scalar&), fill_stub); +namespace at { +class Tensor; +struct TensorIterator; + +namespace native { + +DECLARE_DISPATCH(void(*)(TensorIterator&, const c10::Scalar&), fill_stub); Tensor& fill_out(Tensor& self, const Scalar& value); diff --git a/aten/src/ATen/native/ForeachUtils.h b/aten/src/ATen/native/ForeachUtils.h index 8855fd313a56..033052f401f6 100644 --- a/aten/src/ATen/native/ForeachUtils.h +++ b/aten/src/ATen/native/ForeachUtils.h @@ -126,19 +126,11 @@ bool 
check_fast_path_restrictions( bool can_use_fast_route(ArrayRef tensorLists, ArrayRef scalarList = {}, bool does_op_promote_integer_inputs_to_float = false) { -#if defined(USE_ROCM) - return false; -#else return check_fast_path_restrictions(tensorLists, scalarList, does_op_promote_integer_inputs_to_float); -#endif } bool can_use_fast_route(TensorList tensors1, TensorList tensors2, bool does_op_promote_integer_inputs_to_float = false) { -#if defined(USE_ROCM) - return false; -#else return can_use_fast_route({tensors1, tensors2}, {}, does_op_promote_integer_inputs_to_float); -#endif } } diff --git a/aten/src/ATen/native/FractionalMaxPool2d.cpp b/aten/src/ATen/native/FractionalMaxPool2d.cpp index b4ea2ec186f2..bb25be4a02e5 100644 --- a/aten/src/ATen/native/FractionalMaxPool2d.cpp +++ b/aten/src/ATen/native/FractionalMaxPool2d.cpp @@ -134,8 +134,9 @@ static std::vector fractional_max_pool2d_generate_intervals( static_cast((i + sample) * alpha) - static_cast(sample * alpha); } } - sequence[outputSize - 1] = inputSize - poolSize; - + if (outputSize > 0) { + sequence[outputSize - 1] = inputSize - poolSize; + } return sequence; } diff --git a/aten/src/ATen/native/FractionalMaxPool3d.cpp b/aten/src/ATen/native/FractionalMaxPool3d.cpp index 757ce7c05691..8bcb53847271 100644 --- a/aten/src/ATen/native/FractionalMaxPool3d.cpp +++ b/aten/src/ATen/native/FractionalMaxPool3d.cpp @@ -106,8 +106,9 @@ static std::vector generate_intervals( static_cast((i + sample) * alpha) - static_cast(sample * alpha); } } - sequence[outputSize - 1] = inputSize - poolSize; - + if (outputSize > 0) { + sequence[outputSize - 1] = inputSize - poolSize; + } return sequence; } @@ -238,7 +239,6 @@ TORCH_IMPL_FUNC(fractional_max_pool3d_out_cpu)( int64_t inputW, const at::Tensor& output, const at::Tensor& indices) { - /* get contiguous input */ auto input = input_.contiguous(); diff --git a/aten/src/ATen/native/FunctionOfAMatrixUtils.cpp b/aten/src/ATen/native/FunctionOfAMatrixUtils.cpp index 154f1bf43be6..d31789051104 100644 --- a/aten/src/ATen/native/FunctionOfAMatrixUtils.cpp +++ b/aten/src/ATen/native/FunctionOfAMatrixUtils.cpp @@ -1,5 +1,17 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif + namespace at { namespace native { DEFINE_DISPATCH(_compute_linear_combination_stub); diff --git a/aten/src/ATen/native/FunctionOfAMatrixUtils.h b/aten/src/ATen/native/FunctionOfAMatrixUtils.h index 330efa0923f9..68b26ed13811 100644 --- a/aten/src/ATen/native/FunctionOfAMatrixUtils.h +++ b/aten/src/ATen/native/FunctionOfAMatrixUtils.h @@ -1,11 +1,12 @@ #pragma once -#include -#include #include -#include +#include -namespace at { namespace native { +namespace at { +struct TensorIterator; + +namespace native { using _compute_linear_combination_fn = void(*)( TensorIterator& iter, diff --git a/aten/src/ATen/native/GatedLinearUnit.cpp b/aten/src/ATen/native/GatedLinearUnit.cpp index c585caa71a01..b7b20e1c32f1 100644 --- a/aten/src/ATen/native/GatedLinearUnit.cpp +++ b/aten/src/ATen/native/GatedLinearUnit.cpp @@ -30,6 +30,8 @@ namespace native { DEFINE_DISPATCH(glu_stub); // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(glu_backward_stub); +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +DEFINE_DISPATCH(glu_jvp_stub); TORCH_IMPL_FUNC(glu_out) (const Tensor& self, int64_t dim, const Tensor& out) { glu_stub(device_type(), *this); @@ -69,5 +71,72 @@ Tensor 
glu_backward_cpu(const Tensor& grad_output, const Tensor& input, int64_t return glu_backward_cpu_out(grad_output, input, dim, grad_input); } +Tensor glu_jvp( + const Tensor& glu, + const Tensor& x, + const Tensor& dx, + int64_t dim +) { + dim = maybe_wrap_dim(dim, x.dim()); + const auto glu_size = glu.size(dim); + const auto b = x.narrow(dim, glu_size, glu_size); + const auto da = dx.narrow(dim, 0, glu_size); + const auto db = dx.narrow(dim, glu_size, glu_size); + auto dglu = at::empty_like(glu); + auto iter = at::TensorIteratorConfig() + .add_output(dglu) + .add_input(glu) + .add_input(b) + .add_input(da) + .add_input(db) + .build(); + glu_jvp_stub(iter.device_type(), iter); + return dglu; +} + +Tensor glu_backward_jvp( + const Tensor& grad_x, + const Tensor& grad_glu, + const Tensor& x, + const Tensor& dgrad_glu, + const Tensor& dx, + int64_t dim +) { + dim = maybe_wrap_dim(dim, x.dim()); + const auto glu_size = grad_glu.size(dim); + const auto a = x.narrow(dim, 0, glu_size); + const auto b = x.narrow(dim, glu_size, glu_size); + const auto da = dx.narrow(dim, 0, glu_size); + const auto db = dx.narrow(dim, glu_size, glu_size); + // grad_x_a = grad_glu * sigmoid(b) + const auto grad_x_a = grad_x.narrow(dim, 0, glu_size); + // grad_x_b = grad_x_a * a * (1 - sigmoid(b)) + const auto grad_x_b = grad_x.narrow(dim, glu_size, glu_size); + + const auto sig_b = at::sigmoid(b); + // TODO: use glu from forward. + // TODO: fuse kernels. + const auto glu = a * sig_b; + const auto db_neg_sig_b = db - db * sig_b; + + // dgrad_x_a = d(grad_glu * sigmoid(b)) + // = dgrad_glu * sigmoid(b) + grad_glu * sigmoid(b) * (1 - sigmoid(b)) * db + // = dgrad_glu * sig_b + grad_x_a * (db - db * sig_b) + // = dgrad_glu * sig_b + grad_x_a * db_neg_sig_b + const auto dgrad_x_a = dgrad_glu * sig_b + grad_x_a * db_neg_sig_b; + + // dgrad_x_b = d(grad_glu * sigmoid(b) * a * (1 - sigmoid(b)) + // = d(grad_glu * sigmoid(b)) * a * (1 - sigmoid(b)) + // + grad_glu * sigmoid(b) * da * (1 - sigmoid(b)) + // - grad_glu * sigmoid(b) * a * sigmoid(b) * (1 - sigmoid(b)) * db + // = dgrad_x_a * a * (1 - sigmoid(b)) + // + (grad_glu * sigmoid(b)) * (da * (1 - sigmoid(b)) - a * sigmoid(b) * (1 - sigmoid(b)) * db) + // = dgrad_x_a * (a - glu) + grad_x_a * (da - da * sig_b - glu * db_neg_sig_b + const auto dgrad_x_b = dgrad_x_a * (a - glu) + grad_x_a * (da - da * sig_b - glu * db_neg_sig_b); + + return at::cat({dgrad_x_a, dgrad_x_b}, dim); +} + + } // at::native } // at diff --git a/aten/src/ATen/native/GridSampler.cpp b/aten/src/ATen/native/GridSampler.cpp index 99b3d933bd89..8b0440610226 100644 --- a/aten/src/ATen/native/GridSampler.cpp +++ b/aten/src/ATen/native/GridSampler.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -23,6 +24,12 @@ namespace { GridSamplerInterpolation interpolation_mode, GridSamplerPadding padding_mode, bool align_corners) { + // See NOTE [ grid_sampler Native Functions ]. + // Add checks here in case this is called instead of grid_sampler. + check_grid_sampler_common(input, grid); + check_grid_sampler_3d( + input, grid, static_cast(interpolation_mode)); + int64_t N = input.size(0); int64_t C = input.size(1); int64_t inp_D = input.size(2); @@ -178,8 +185,21 @@ namespace { const Tensor& input, const Tensor& grid, GridSamplerInterpolation interpolation_mode, GridSamplerPadding padding_mode, - bool align_corners) { - auto grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + bool align_corners, std::array output_mask) { + // See NOTE [ grid_sampler Native Functions ]. 
+ // Add checks here in case this is called instead of grid_sampler. + check_grid_sampler_common(input, grid); + check_grid_sampler_3d( + input, grid, static_cast(interpolation_mode)); + + auto input_requires_grad = output_mask[0]; + Tensor grad_input = ([&]() { + if (input_requires_grad) { + return at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } else { + return Tensor(); + } + })(); auto grad_grid = at::empty_like(grid, LEGACY_CONTIGUOUS_MEMORY_FORMAT); // If interpolation mode is Nearest, then grad_grid is not filled in the // loop below. @@ -209,17 +229,27 @@ namespace { int64_t gOut_sD = grad_output.stride(2); int64_t gOut_sH = grad_output.stride(3); int64_t gOut_sW = grad_output.stride(4); - int64_t gInp_sN = grad_input.stride(0); - int64_t gInp_sC = grad_input.stride(1); - int64_t gInp_sD = grad_input.stride(2); - int64_t gInp_sH = grad_input.stride(3); - int64_t gInp_sW = grad_input.stride(4); + int64_t gInp_sN = 0; + int64_t gInp_sC = 0; + int64_t gInp_sD = 0; + int64_t gInp_sH = 0; + int64_t gInp_sW = 0; + if (input_requires_grad) { + gInp_sN = grad_input.stride(0); + gInp_sC = grad_input.stride(1); + gInp_sD = grad_input.stride(2); + gInp_sH = grad_input.stride(3); + gInp_sW = grad_input.stride(4); + } int64_t gGrid_sN = grad_grid.stride(0); int64_t gGrid_sW = grad_grid.stride(3); scalar_t *inp_ptr = input.data_ptr(); scalar_t *grid_ptr = grid.data_ptr(); scalar_t *gOut_ptr = grad_output.data_ptr(); - scalar_t *gInp_ptr = grad_input.data_ptr(); + scalar_t *gInp_ptr = nullptr; + if (input_requires_grad) { + gInp_ptr = grad_input.data_ptr(); + } scalar_t *gGrid_ptr = grad_grid.data_ptr(); // loop over each output pixel at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) { @@ -290,22 +320,23 @@ namespace { scalar_t gix = static_cast(0), giy = static_cast(0), giz = static_cast(0); scalar_t *gOut_ptr_NCDHW = gOut_ptr + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; - scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; scalar_t *inp_ptr_NC = inp_ptr_N; + scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; // calculate bilinear weighted pixel value and set output pixel for (int64_t c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC += inp_sC) { scalar_t gOut = *gOut_ptr_NCDHW; // calculate and set grad_input - safe_add_3d(gInp_ptr_NC, iz_tnw, iy_tnw, ix_tnw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tnw * gOut); - safe_add_3d(gInp_ptr_NC, iz_tne, iy_tne, ix_tne, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tne * gOut); - safe_add_3d(gInp_ptr_NC, iz_tsw, iy_tsw, ix_tsw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tsw * gOut); - safe_add_3d(gInp_ptr_NC, iz_tse, iy_tse, ix_tse, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tse * gOut); - safe_add_3d(gInp_ptr_NC, iz_bnw, iy_bnw, ix_bnw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bnw * gOut); - safe_add_3d(gInp_ptr_NC, iz_bne, iy_bne, ix_bne, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bne * gOut); - safe_add_3d(gInp_ptr_NC, iz_bsw, iy_bsw, ix_bsw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bsw * gOut); - safe_add_3d(gInp_ptr_NC, iz_bse, iy_bse, ix_bse, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bse * gOut); - + if (input_requires_grad) { + safe_add_3d(gInp_ptr_NC, iz_tnw, iy_tnw, ix_tnw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tnw * gOut); + safe_add_3d(gInp_ptr_NC, iz_tne, iy_tne, ix_tne, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tne * gOut); + safe_add_3d(gInp_ptr_NC, iz_tsw, iy_tsw, ix_tsw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tsw * 
gOut); + safe_add_3d(gInp_ptr_NC, iz_tse, iy_tse, ix_tse, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tse * gOut); + safe_add_3d(gInp_ptr_NC, iz_bnw, iy_bnw, ix_bnw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bnw * gOut); + safe_add_3d(gInp_ptr_NC, iz_bne, iy_bne, ix_bne, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bne * gOut); + safe_add_3d(gInp_ptr_NC, iz_bsw, iy_bsw, ix_bsw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bsw * gOut); + safe_add_3d(gInp_ptr_NC, iz_bse, iy_bse, ix_bse, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bse * gOut); + } // calculate grad_grid if (within_bounds_3d(iz_tnw, iy_tnw, ix_tnw, inp_D, inp_H, inp_W)) { scalar_t tnw_val = inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW]; @@ -368,11 +399,13 @@ namespace { // assign nearest neighor pixel value to output pixel scalar_t *gOut_ptr_NCDHW = gOut_ptr + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; - scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; - for (int64_t c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, gInp_ptr_NC += gInp_sC) { - // calculate and set grad_input - safe_add_3d(gInp_ptr_NC, iz_nearest, iy_nearest, ix_nearest, - gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, *gOut_ptr_NCDHW); + if (input_requires_grad) { + scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; + for (int64_t c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, gInp_ptr_NC += gInp_sC) { + // calculate and set grad_input + safe_add_3d(gInp_ptr_NC, iz_nearest, iy_nearest, ix_nearest, + gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, *gOut_ptr_NCDHW); + } } } } @@ -391,6 +424,11 @@ Tensor _grid_sampler_2d_cpu_quantized( int64_t interpolation_mode_, int64_t padding_mode_, bool align_corners) { + // See NOTE [ grid_sampler Native Functions ]. + // Add checks here in case this is called instead of grid_sampler. + check_grid_sampler_common(input, grid); + check_grid_sampler_2d(input, grid); + auto interpolation_mode = static_cast(interpolation_mode_); /* Bilinear interpolation is supported using the fact that we can perform @@ -495,6 +533,11 @@ Tensor _grid_sampler_2d_cpu_fallback(const Tensor& input, const Tensor& grid, int64_t interpolation_mode_, int64_t padding_mode_, bool align_corners) { + // See NOTE [ grid_sampler Native Functions ]. + // Add checks here in case this is called instead of grid_sampler. + check_grid_sampler_common(input, grid); + check_grid_sampler_2d(input, grid); + auto interpolation_mode = static_cast(interpolation_mode_); auto padding_mode = static_cast(padding_mode_); using scalar_t = float; @@ -643,6 +686,11 @@ _grid_sampler_2d_cpu_fallback_backward(const Tensor& grad_output, int64_t interpolation_mode_, int64_t padding_mode_, bool align_corners) { + // See NOTE [ grid_sampler Native Functions ]. + // Add checks here in case this is called instead of grid_sampler. + check_grid_sampler_common(input, grid); + check_grid_sampler_2d(input, grid); + const auto interpolation_mode = static_cast(interpolation_mode_); const auto padding_mode = static_cast(padding_mode_); using scalar_t = float; @@ -836,10 +884,14 @@ _grid_sampler_2d_cpu_fallback_backward(const Tensor& grad_output, return std::make_tuple(grad_input, grad_grid); } -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. Tensor grid_sampler_2d_cpu(const Tensor& input, const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners) { + // See NOTE [ grid_sampler Native Functions ]. + // Add checks here in case this is called instead of grid_sampler. 
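+  // The same pair of checks is also added to the quantized and fallback 2d
+  // paths above, so each entry point validates its inputs even when it is
+  // called directly rather than through at::grid_sampler.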
+ check_grid_sampler_common(input, grid); + check_grid_sampler_2d(input, grid); + if (input.scalar_type() == kQUInt8) { return native::_grid_sampler_2d_cpu_quantized( input, grid, interpolation_mode, padding_mode, align_corners); @@ -864,17 +916,26 @@ Tensor grid_sampler_2d_cpu(const Tensor& input, const Tensor& grid, } } - return grid_sampler_2d_cpu_kernel( - kCPU, input, grid, interpolation_mode, padding_mode, align_corners); + auto in_size = input.sizes(); + auto grid_size = grid.sizes(); + auto output = at::empty( + {in_size[0], in_size[1], grid_size[1], grid_size[2]}, input.options()); + grid_sampler_2d_cpu_kernel( + kCPU, output, input, grid, interpolation_mode, padding_mode, align_corners); + return output; } DEFINE_DISPATCH(grid_sampler_2d_cpu_kernel); -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. Tensor grid_sampler_3d_cpu(const Tensor& input, const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners) { + // See NOTE [ grid_sampler Native Functions ]. + // Add checks here in case this is called instead of grid_sampler. + check_grid_sampler_common(input, grid); + check_grid_sampler_3d(input, grid, interpolation_mode); + return AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "grid_sampler3d_cpu", [&] { return grid_sampler_3d_cpu_impl( input, grid, static_cast(interpolation_mode), @@ -882,11 +943,14 @@ Tensor grid_sampler_3d_cpu(const Tensor& input, const Tensor& grid, }); } -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. std::tuple grid_sampler_2d_backward_cpu(const Tensor& grad_output, const Tensor& input, const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, std::array output_mask) { + // See NOTE [ grid_sampler Native Functions ]. + // Add checks here in case this is called instead of grid_sampler. + check_grid_sampler_common(input, grid); + check_grid_sampler_2d(input, grid); // AVX gather instructions use signed 32-bit offsets to gather float values. // Check for possible overflow and fallback to scalar implementation @@ -911,80 +975,64 @@ grid_sampler_2d_backward_cpu(const Tensor& grad_output, const Tensor& input, con } } - return grid_sampler_2d_backward_cpu_kernel( - kCPU, grad_output, input, grid, interpolation_mode, padding_mode, align_corners, output_mask); + auto input_requires_grad = output_mask[0]; + Tensor grad_input = ([&]() { + if (input_requires_grad) { + return at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } else { + return Tensor(); + } + })(); + auto grad_grid = at::empty_like(grid, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + grid_sampler_2d_backward_cpu_kernel( + kCPU, grad_input, grad_grid, grad_output, input, grid, + interpolation_mode, padding_mode, align_corners, output_mask); + return std::make_tuple(std::move(grad_input), std::move(grad_grid)); } DEFINE_DISPATCH(grid_sampler_2d_backward_cpu_kernel); -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. std::tuple grid_sampler_3d_backward_cpu(const Tensor& grad_output, const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode, bool align_corners) { + int64_t interpolation_mode, int64_t padding_mode, bool align_corners, + std::array output_mask) { + // See NOTE [ grid_sampler Native Functions ]. + // Add checks here in case this is called instead of grid_sampler. 
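+  // output_mask[0] says whether grad_input is needed; when it is false,
+  // grid_sampler_3d_backward_cpu_impl returns an undefined grad_input and
+  // skips the safe_add_3d accumulation for it.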
+ check_grid_sampler_common(input, grid); + check_grid_sampler_3d(input, grid, interpolation_mode); + return AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "grid_sampler_3d_backward_cpu", [&] { return grid_sampler_3d_backward_cpu_impl( grad_output, input, grid, static_cast(interpolation_mode), - static_cast(padding_mode), align_corners); + static_cast(padding_mode), + align_corners, output_mask); }); } -Tensor grid_sampler(const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode, - bool align_corners) { - TORCH_CHECK( - input.defined() && grid.defined(), - "grid_sampler(): expected input and grid to not be undefined, but input " - "is ", input, " and grid is ", grid); - auto input_opt = input.options(); - auto grid_opt = grid.options(); - TORCH_CHECK( - input_opt.device() == grid_opt.device(), - "grid_sampler(): expected input and grid to be on same device, but input " - "is on ", input_opt.device(), " and grid is on ", grid_opt.device()); - TORCH_CHECK( - input_opt.layout() == kStrided && grid_opt.layout() == kStrided, - "grid_sampler(): expected input and grid to have torch.strided layout, but " - "input has ", input_opt.layout(), " and grid has ", grid_opt.layout()); - TORCH_CHECK( - (input.dim() == 4 || input.dim() == 5) && input.dim() == grid.dim(), - "grid_sampler(): expected 4D or 5D input and grid with same number of " - "dimensions, but got input with sizes ", input.sizes(), - " and grid with sizes ", grid.sizes()); - TORCH_CHECK( - input.size(0) == grid.size(0), - "grid_sampler(): expected grid and input to have same batch size, but got " - "input with sizes ", input.sizes(), " and grid with sizes ", grid.sizes()); - TORCH_CHECK( - grid.size(-1) == input.dim() - 2, - "grid_sampler(): expected grid to have size ", input.dim() - 2, " in last " - "dimension, but got grid with sizes ", grid.sizes()); - TORCH_CHECK( - !(input.dim() == 5 && static_cast(interpolation_mode) == GridSamplerInterpolation::Bicubic), - "grid_sampler(): bicubic interpolation only supports 4D input" - ); - for (const auto i : c10::irange(2, input.dim())) { - TORCH_CHECK(input.size(i) > 0, - "grid_sampler(): expected input to have non-empty spatial dimensions, " - "but input has sizes ", input.sizes(), " with dimension ", i, " being " - "empty"); - } - // cudnn does not support inputs larger than 1024 - if (at::native::cudnn_is_acceptable(input) && - at::native::cudnn_is_acceptable(grid) && - at::native::canUse32BitIndexMath(input) && - at::native::canUse32BitIndexMath(grid) && - static_cast(interpolation_mode) == GridSamplerInterpolation::Bilinear && - static_cast(padding_mode) == GridSamplerPadding::Zeros && - align_corners && - input.dim() == 4 && - input.size(1) <= 1024) { +// See NOTE [ grid_sampler Native Functions ]. 
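+// The shape/device validation that used to be inlined here now runs inside the
+// 2d/3d kernels via the check_grid_sampler_* helpers, so this entry point only
+// chooses between the cudnn, 2d and 3d implementations. Illustrative usage:
+//   auto input = at::rand({1, 3, 8, 8});
+//   auto grid  = at::rand({1, 4, 4, 2}) * 2 - 1;  // sampling locations in [-1, 1)
+//   auto out = at::grid_sampler(input, grid, /*interpolation_mode=*/0,
+//                               /*padding_mode=*/0, /*align_corners=*/false);
+//   // out has shape {1, 3, 4, 4}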
+Tensor grid_sampler( + const Tensor& input, + const Tensor& grid, + int64_t interpolation_mode, + int64_t padding_mode, + bool align_corners +) { + if (cond_cudnn_grid_sampler(input, grid) && + static_cast(interpolation_mode) == + GridSamplerInterpolation::Bilinear && + static_cast(padding_mode) == + GridSamplerPadding::Zeros && + align_corners) { return cudnn_grid_sampler(input, grid); } + if (input.dim() == 4) { - return at::grid_sampler_2d(input, grid, interpolation_mode, padding_mode, align_corners); + return at::grid_sampler_2d( + input, grid, interpolation_mode, padding_mode, align_corners); } else { - return at::grid_sampler_3d(input, grid, interpolation_mode, padding_mode, align_corners); + return at::grid_sampler_3d( + input, grid, interpolation_mode, padding_mode, align_corners); } } diff --git a/aten/src/ATen/native/GridSampler.h b/aten/src/ATen/native/GridSampler.h index effc322c0d3a..f4a735032430 100644 --- a/aten/src/ATen/native/GridSampler.h +++ b/aten/src/ATen/native/GridSampler.h @@ -1,16 +1,13 @@ #pragma once -#include -#include +#include +#include +#include +#include -namespace at { namespace native { - -namespace detail { +#include - enum class GridSamplerInterpolation {Bilinear, Nearest, Bicubic}; - enum class GridSamplerPadding {Zeros, Border, Reflection}; - -} // namespace detail +namespace at { namespace native { using detail::GridSamplerInterpolation; using detail::GridSamplerPadding; diff --git a/aten/src/ATen/native/GridSamplerUtils.h b/aten/src/ATen/native/GridSamplerUtils.h new file mode 100644 index 000000000000..0b6f29de8c42 --- /dev/null +++ b/aten/src/ATen/native/GridSamplerUtils.h @@ -0,0 +1,109 @@ +#pragma once + +// See NOTE: [Tensor vs. TensorBase] +// https://github.com/pytorch/pytorch/pull/66979 +#include +#include +#include + +namespace at { namespace native { + +namespace detail { + +enum class GridSamplerInterpolation {Bilinear, Nearest, Bicubic}; +enum class GridSamplerPadding {Zeros, Border, Reflection}; + +} // namespace detail + +using detail::GridSamplerInterpolation; +using detail::GridSamplerPadding; + +namespace { + +// See NOTE [ grid_sampler Native Functions ]. 
+void check_grid_sampler_common(
+  const TensorBase& input,
+  const TensorBase& grid
+) {
+  auto input_opt = input.options();
+  auto grid_opt = grid.options();
+
+  TORCH_CHECK(
+    input.defined(),
+    "grid_sampler(): expected input to not be undefined");
+  TORCH_CHECK(
+    grid.defined(),
+    "grid_sampler(): expected grid to not be undefined");
+  TORCH_CHECK(
+    input_opt.device() == grid_opt.device(),
+    "grid_sampler(): expected input and grid to be on same device, but input "
+    "is on ", input_opt.device(), " and grid is on ", grid_opt.device());
+  TORCH_CHECK(
+    input_opt.layout() == kStrided && grid_opt.layout() == kStrided,
+    "grid_sampler(): expected input and grid to have torch.strided layout, but "
+    "input has ", input_opt.layout(), " and grid has ", grid_opt.layout());
+  TORCH_CHECK(
+    input.size(0) == grid.size(0),
+    "grid_sampler(): expected grid and input to have same batch size, but got "
+    "input with sizes ", input.sizes(), " and grid with sizes ", grid.sizes());
+  TORCH_CHECK(
+    grid.size(-1) == input.dim() - 2,
+    "grid_sampler(): expected grid to have size ", input.dim() - 2, " in last "
+    "dimension, but got grid with sizes ", grid.sizes());
+
+  for (const auto i : c10::irange(2, input.dim())) {
+    TORCH_CHECK(input.size(i) > 0,
+      "grid_sampler(): expected input to have non-empty spatial dimensions, "
+      "but input has sizes ", input.sizes(), " with dimension ", i, " being "
+      "empty");
+  }
+}
+
+// See NOTE [ grid_sampler Native Functions ].
+void check_grid_sampler_2d(
+  const TensorBase& input,
+  const TensorBase& grid
+) {
+  TORCH_CHECK(
+    input.dim() == 4 && input.dim() == grid.dim(),
+    "grid_sampler(): expected 4D input and grid with same number of "
+    "dimensions, but got input with sizes ", input.sizes(),
+    " and grid with sizes ", grid.sizes());
+}
+
+// See NOTE [ grid_sampler Native Functions ].
+void check_grid_sampler_3d(
+  const TensorBase& input,
+  const TensorBase& grid,
+  int64_t interpolation_mode
+) {
+  TORCH_CHECK(
+    input.dim() == 5 && input.dim() == grid.dim(),
+    "grid_sampler(): expected 5D input and grid with same number of "
+    "dimensions, but got input with sizes ", input.sizes(),
+    " and grid with sizes ", grid.sizes());
+  TORCH_CHECK(
+    !(input.dim() == 5 &&
+      static_cast<GridSamplerInterpolation>(interpolation_mode) ==
+        GridSamplerInterpolation::Bicubic),
+    "grid_sampler(): bicubic interpolation only supports 4D input");
+}
+
+// See NOTE [ grid_sampler Native Functions ].
+// cudnn does not support inputs larger than 1024.
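// Illustrative sketch, not part of this patch: at::grid_sampler only takes the
// cuDNN path when the predicate below holds and, in addition, interpolation is
// bilinear, padding is zeros and align_corners is true (see grid_sampler above).
// Assuming a CUDA build, an eligible call could look like:
//   auto input = at::rand({2, 512, 16, 16}, at::kCUDA);        // 4D, channels <= 1024
//   auto grid  = at::rand({2, 16, 16, 2}, at::kCUDA) * 2 - 1;  // N, H_out, W_out, 2
//   auto out   = at::grid_sampler(input, grid, /*interpolation_mode=*/0,
//                                 /*padding_mode=*/0, /*align_corners=*/true);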
+bool cond_cudnn_grid_sampler( + const TensorBase& input, + const TensorBase& grid +) { + return ( + at::native::cudnn_is_acceptable(input) && + at::native::cudnn_is_acceptable(grid) && + at::native::canUse32BitIndexMath(input) && + at::native::canUse32BitIndexMath(grid) && + input.dim() == 4 && + input.size(1) <= 1024); +} + +} // anonymous namespace + +}} // namespace at::native diff --git a/aten/src/ATen/native/Histogram.cpp b/aten/src/ATen/native/Histogram.cpp index abd1ae32ded1..c3a007f2c2dc 100644 --- a/aten/src/ATen/native/Histogram.cpp +++ b/aten/src/ATen/native/Histogram.cpp @@ -407,4 +407,28 @@ Tensor histogram_histc_cpu(const Tensor& self, int64_t bin_ct, return histogram_histc_cpu_out(self, bin_ct, min, max, hist); } +std::tuple> histogramdd( + const Tensor &self, TensorList bins, c10::optional> /*range*/, + const c10::optional &weight, bool density) { + auto hist = at::_histogramdd_from_bin_tensors(self, bins, weight, density); + return std::tuple>{ + std::move(hist), bins.vec()}; +} + +std::tuple> histogramdd( + const Tensor &self, IntArrayRef bins, c10::optional> range, + const c10::optional &weight, bool density) { + auto bin_edges = at::_histogramdd_bin_edges(self, bins, range, weight, density); + auto hist = at::_histogramdd_from_bin_cts(self, bins, range, weight, density); + return std::tuple>{ + std::move(hist), std::move(bin_edges)}; +} + +std::tuple> histogramdd( + const Tensor &self, int64_t bins, c10::optional> range, + const c10::optional &weight, bool density) { + DimVector bins_v(self.size(-1), bins); + return at::native::histogramdd(self, bins_v, range, weight, density); +} + }} // namespace at::native diff --git a/aten/src/ATen/native/Histogram.h b/aten/src/ATen/native/Histogram.h index 02dbe4723b15..9df0aafafc18 100644 --- a/aten/src/ATen/native/Histogram.h +++ b/aten/src/ATen/native/Histogram.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/aten/src/ATen/native/Itertools.cpp b/aten/src/ATen/native/Itertools.cpp index d1117b8c1d4d..265b05054b0a 100644 --- a/aten/src/ATen/native/Itertools.cpp +++ b/aten/src/ATen/native/Itertools.cpp @@ -11,7 +11,7 @@ Tensor _triu_mask(int64_t n, int64_t dims, bool diagonal, TensorOptions opt) { // get a mask that has value 1 whose indices satisfies i < j < k < ... // or i <= j <= k <= ... 
(depending on diagonal) Tensor range = at::arange(n, opt.dtype(kLong)); - std::vector index_grids = at::meshgrid(std::vector(dims, range)); + std::vector index_grids = at::meshgrid(std::vector(dims, range), "ij"); Tensor mask = at::full(index_grids[0].sizes(), true, opt.dtype(kBool)); if(diagonal) { for(int64_t i = 0; i < dims - 1; i++) { @@ -46,9 +46,12 @@ Tensor cartesian_prod(TensorList tensors) { Tensor combinations(const Tensor& self, int64_t r, bool with_replacement) { TORCH_CHECK(self.dim() == 1, "Expect a 1D vector, but got shape ", self.sizes()); - TORCH_CHECK(r > 0, "Expect a positive number, but got ", r); + TORCH_CHECK(r >= 0, "Expect a non-negative number, but got ", r); + if (r == 0) { + return at::empty({0}, self.options()); + } int64_t num_elements = self.numel(); - std::vector grids = at::meshgrid(std::vector(r, self)); + std::vector grids = at::meshgrid(std::vector(r, self), "ij"); Tensor mask = _triu_mask(num_elements, r, with_replacement, self.options()); for(Tensor &t : grids) { t = t.masked_select(mask); diff --git a/aten/src/ATen/native/Lerp.cpp b/aten/src/ATen/native/Lerp.cpp index 4e8dbbccdff7..bfac91a881ae 100644 --- a/aten/src/ATen/native/Lerp.cpp +++ b/aten/src/ATen/native/Lerp.cpp @@ -18,7 +18,7 @@ TORCH_META_FUNC(lerp_Tensor)( } TORCH_META_FUNC(lerp_Scalar)( - const Tensor& self, const Tensor& end, const Scalar& weight) { + const Tensor& self, const Tensor& end, const Scalar& /*weight*/) { TORCH_CHECK(self.dtype() == end.dtype(), "expected dtype ", self.dtype(), " for `end` but got dtype ", end.dtype()); build_binary_op(maybe_get_output(), self, end); @@ -29,12 +29,12 @@ TORCH_META_FUNC(lerp_Scalar)( namespace native { TORCH_IMPL_FUNC(lerp_Tensor)( - const Tensor& self, const Tensor& end, const Tensor& weight, const Tensor &out) { + const Tensor& /*self*/, const Tensor& /*end*/, const Tensor& weight, const Tensor& /*out*/) { lerp_kernel_tensor_weight(device_type(), *this); } TORCH_IMPL_FUNC(lerp_Scalar)( - const Tensor& self, const Tensor& end, const Scalar& weight, const Tensor &out) { + const Tensor& /*self*/, const Tensor& /*end*/, const Scalar& weight, const Tensor& /*out*/) { lerp_kernel_scalar_weight(device_type(), *this, weight); } diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index 3a4a8e1fd7f2..127a2cdc1037 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -25,6 +26,9 @@ Tensor linear(const Tensor& input, const Tensor& weight, const c10::optionaldefined() && input.is_contiguous()) { + // Also hit the fused path for contiguous 3D input. 
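  // Illustrative shape walk-through, not part of this patch: for a contiguous
  // [B, T, in_features] input the bias add and the GEMM below fuse into a single
  // addmm by flattening the leading dimensions, conceptually:
  //   input [B, T, in]  -> view -> [B * T, in]
  //   at::addmm(bias, flattened_input, weight.t()) -> [B * T, out] -> view -> [B, T, out]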
+ const auto input_sizes = input.sizes(); + const auto result = at::addmm(*bias, input.view({input_sizes[0] * input_sizes[1], input_sizes[2]}), weight.t()); + return result.view({input_sizes[0], input_sizes[1], result.size(1)}); + } auto output = at::matmul(input, weight.t()); if (bias->defined()) { - output.add_(*bias); + // for composite compliance use out-of-place version of `add` + if (isTensorSubclassLike(*bias)) { + output = at::add(output, *bias); + } else { + output.add_(*bias); + } } return output; } diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index aed94f107051..c7ed0850e778 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -28,16 +28,41 @@ #include namespace at { + +namespace detail { + void check_linalg_norm_dtype(optional opt_dtype, ScalarType self_dtype, const char* const name) { + if (opt_dtype.has_value()) { + auto dtype = opt_dtype.value(); + TORCH_CHECK(isFloatingType(dtype) || isComplexType(dtype), name, ": dtype should" + " be floating point or complex, but got ", dtype); + TORCH_CHECK(isComplexType(self_dtype) == isComplexType(dtype), + name, ": dtype should be ", isComplexType(self_dtype) ? "complex" : "real", + " for ", isComplexType(self_dtype) ? "complex" : "real", " inputs, but got ", dtype); + TORCH_CHECK(promoteTypes(self_dtype, dtype) == dtype, + name, ": the dtype of the input ", "(", self_dtype, ") should be convertible ", + "without narrowing to the specified dtype (", dtype, ")."); + } + } +} + namespace meta { -TORCH_META_FUNC(addmm)(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha) { - TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor"); - TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor"); - TORCH_CHECK( - mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (", - mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")"); - auto names = at::namedinference::propagate_names_for_addmm(mat1, mat2, self); +#define ADDMM_META() \ + TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor"); \ + TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor"); \ + TORCH_CHECK( \ + mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (", \ + mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")"); \ + \ + auto names = at::namedinference::propagate_names_for_addmm(mat1, mat2, self); \ set_output(0, {mat1.sizes()[0], mat2.sizes()[1]}, {}, self.options(), names); + +TORCH_META_FUNC(addmm)(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha) { + ADDMM_META(); +} + +TORCH_META_FUNC(_addmm_activation)(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha, bool use_gelu) { + ADDMM_META(); } TORCH_META_FUNC(mm)(const Tensor & self, const Tensor & mat2) { @@ -51,6 +76,39 @@ TORCH_META_FUNC(mm)(const Tensor & self, const Tensor & mat2) { set_output(0, {self.sizes()[0], mat2.sizes()[1]}, {}, self.options(), names); } +TORCH_META_FUNC(linalg_vector_norm)(const Tensor& self, const Scalar& scalar_ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { + at::native::checkFloatingOrComplex(self, "linalg.vector_norm"); + + auto dim = opt_dim.value_or(IntArrayRef{}); + // Casting a large integer 
to a double will just introduce an error for + // values larger than 10^53 (same for negative numbers), so that's fine. + auto ord = scalar_ord.toDouble(); + + // For more context, see issue 52783 + // If the tensor is empty and norm < 0 || norm == infty + // - We cannot reduce the whole tensor + // - We cannot reduce over an empty dimension + if (self.numel() == 0 && (ord < 0. || ord == INFINITY)) { + TORCH_CHECK(opt_dim.has_value(), + "linalg.vector_norm cannot compute the ", scalar_ord, " norm on an empty ", + "tensor because the operation does not have an identity"); + for (auto dim_num : dim) { + TORCH_CHECK(self.size(dim_num) != 0, + "linalg.vector_norm cannot compute the ", scalar_ord, " norm on an empty ", + "dimension because the operation does not have an identity"); + } + } + + at::detail::check_linalg_norm_dtype(opt_dtype, self.scalar_type(), "linalg.vector_norm"); + + auto mask = at::native::make_dim_mask(dim, self.dim()); + auto shape = at::native::shape_from_dim_mask(self, std::move(mask), keepdim); + auto options = self.options() + .dtype(toRealValueType(opt_dtype.value_or(self.scalar_type()))); + + set_output(shape, options); +} + template void common_checks_baddbmm_bmm(Meta& meta, const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, bool is_bmm, const c10::optional& self_baddbmm = nullopt) { TORCH_CHECK(batch1.dim() == 3, "batch1 must be a 3D tensor"); @@ -111,7 +169,6 @@ TORCH_META_FUNC(baddbmm)(const Tensor& self, const Tensor& batch1, const Tensor& namespace native { DEFINE_DISPATCH(addr_stub); -DEFINE_DISPATCH(linalg_vector_norm_stub); // As P is a permutation matrix // det(P) = 1 if it's an even permutation and det(P) = -1 if it's an odd permutation @@ -209,7 +266,7 @@ std::tuple linalg_slogdet_out(const Tensor& input, Tensor& sig checkSameDevice("linalg.slogdet", sign, input, "sign"); checkSameDevice("linalg.slogdet", logabsdet, input, "logabsdet"); checkLinalgCompatibleDtype("linalg.slogdet", sign, input, "sign"); - ScalarType real_dtype = toValueType(input.scalar_type()); + ScalarType real_dtype = toRealValueType(input.scalar_type()); // logabsdet is always real-valued here checkLinalgCompatibleDtype("linalg.slogdet", logabsdet.scalar_type(), real_dtype, "logabsdet"); @@ -248,7 +305,7 @@ std::tuple get_atol_rtol( rtol = rtol_opt.value(); checkNotComplexTolerance(rtol, function_name, "rtol"); } else { - ScalarType real_dtype = toValueType(input.scalar_type()); + ScalarType real_dtype = toRealValueType(input.scalar_type()); auto default_rtol = at::full({}, _get_epsilon(real_dtype) * std::max(input.size(-1), input.size(-2)), options); rtol = atol_opt.has_value() ? at::where(atol_opt.value() > 0, at::zeros({}, options), default_rtol) @@ -266,7 +323,7 @@ std::tuple get_atol_rtol( if (rtol_opt.has_value()) { rtol = rtol_opt.value(); } else { - ScalarType real_dtype = toValueType(input.scalar_type()); + ScalarType real_dtype = toRealValueType(input.scalar_type()); auto default_rtol = _get_epsilon(real_dtype) * std::max(input.size(-1), input.size(-2)); rtol = (atol_opt.has_value() && atol_opt.value() > 0.0) ? 
0.0 @@ -416,6 +473,7 @@ Tensor linalg_matrix_power_impl( const Tensor& self, int64_t n, c10::optional _out) { + NoTF32Guard disable_tf32; auto out = _out.value_or(Tensor()); squareCheckInputs(self, "linalg.matrix_power"); @@ -1125,6 +1183,19 @@ static void addmm_impl_cpu_( return; } + // Some paths in the code below do not handle multiplications of the form [a, 0] x [0, b] + if (m1_sizes[1] == 0) { + if (beta.toComplexDouble() == 0.0) { + result.zero_(); + } else { + if (!self.is_same(result)) { + result.copy_(self); + } + result.mul_(beta); + } + return; + } + if (beta.toComplexDouble() != 0.0 && !self.is_same(result)) { result.copy_(self); } @@ -1201,7 +1272,7 @@ static void addmm_impl_cpu_( TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!c.is_conj()); // Apply BLAS routine - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, result.scalar_type(), "addmm_impl_cpu_", [&]{ at::native::cpublas::gemm( @@ -1289,6 +1360,19 @@ TORCH_IMPL_FUNC(addmm_out_cpu)(const Tensor& self, const Tensor& mat1, const Ten } } +TORCH_IMPL_FUNC(addmm_activation_out_cpu)(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha, bool use_gelu, const Tensor &result) { + auto b_self = expand_size(self, {mat1.sizes()[0], mat2.sizes()[1]}, "addmm_out"); + { + at::NoNamesGuard guard; + addmm_impl_cpu_(const_cast(result), *b_self, mat1, mat2, beta, alpha); + if (use_gelu) { + at::gelu_(const_cast(result)); + } else { + at::relu_(const_cast(result)); + } + } +} + TORCH_IMPL_FUNC(mm_out_cpu)(const Tensor & self, const Tensor & mat2, const Tensor & result) { { at::NoNamesGuard guard; @@ -1393,20 +1477,6 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens // is_bmm_out: true for bmm_out, false for baddbmm_ // self_or_result is "self" for baddbmm_ and "result" for bmm_out Tensor& self_or_result = const_cast(self_or_result_); - CheckedFrom c = (is_bmm_out ? 
"bmm" : "baddbmm"); - - auto checkOnCPU = [](const Tensor& t, CheckedFrom c) { - TORCH_CHECK( - !t.is_cuda(), - "Expect tensor to have CPU backend, but got tensor with ", - toString(t.options().backend()), - " Backend (while checking arguments for ", - c); - }; - - checkOnCPU(self_or_result, c); - checkOnCPU(batch1, c); - checkOnCPU(batch2, c); const auto batch1_sizes = batch1.sizes(); const auto batch2_sizes = batch2.sizes(); @@ -1443,16 +1513,15 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens if (contraction_size * res_rows * res_cols < 400) { if (is_bmm_out) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, batch1.scalar_type(), "bmm", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, batch1.scalar_type(), "bmm", [&] { baddbmm_cpu_kernel(self_or_result, batch1, batch2, beta, alpha); }); } else { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, batch1.scalar_type(), "baddbmm", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, batch1.scalar_type(), "baddbmm", [&] { baddbmm_cpu_kernel(self_or_result, batch1, batch2, beta, alpha); }); } } else if (at::hasMKL() && (( - self_or_result.scalar_type() != kHalf && self_or_result.scalar_type() != kBFloat16 && at::native::is_floating_point(self_or_result)) || at::native::is_complex(self_or_result)) @@ -1582,124 +1651,164 @@ Tensor& vdot_out(const Tensor& self, const Tensor& other, Tensor& result) { return result.fill_(self.vdot(other)); } +bool should_fold(const Tensor& tensor1, const int64_t dim_tensor2) { + const auto dim_tensor1 = tensor1.dim(); + if (dim_tensor1 >= 3 && (dim_tensor2 == 1 || dim_tensor2 == 2)) { + const auto t1_sizes_ptr = tensor1.sizes().cbegin(); + const auto t1_strides = tensor1.strides(); + if (dim_tensor1 == 3 && dim_tensor2 == 2 && + t1_strides.back() != 1 && + t1_strides.front() == t1_sizes_ptr[1] * t1_sizes_ptr[2]) { + // First dim is slowest moving, and then the following two dims are + // transposed. This can happen for example by permute(0, 2, 1). + // First 2 dims could be folded to use mm but would require permutation + // with actual data movement, which can be instead handled by BMM with each + // GEMM transposed. + // This can be generalized to a tensor with dim X + Y + Z where X, Y, and Z + // dims are contiguous, Y dims and Z dims are transposed, and X, Y, Z > 0. + // For example, this can happen by permute(0, 1, 5, 2, 3, 4), where X = 2, + // Y = 3, and Z = 1. + return false; + } else { + return true; + } + } else { + return false; + } +} + /* Matrix product of two Tensors. The behavior depends on the dimensionality of the Tensors as follows: -- If both Tensors are 1-dimensional, the dot product (scalar) is returned. -- If both arguments are 2-dimensional, the matrix-matrix product is returned. -- If the first argument is 1-dimensional and the second argument is 2-dimensional, - a 1 is prepended to its dimension for the purpose of the matrix multiply. - After the matrix multiply, the prepended dimension is removed. -- If the first argument is 2-dimensional and the second argument is 1-dimensional, - the matrix-vector product is returned. -- If both arguments are at least 1-dimensional and at least one argument is - N-dimensional (where N > 2), then a batched matrix multiply is returned. If the first - argument is 1-dimensional, a 1 is prepended to its dimension for the purpose of the - batched matrix multiply and removed after. 
If the second argument is 1-dimensional, a - 1 is appended to its dimension for the purpose of the batched matrix multiple and removed after. - The non-matrix (i.e. batch) dimensions are broadcasted (and thus - must be broadcastable). For example, if tensor1 is a (j x 1 x n x m) Tensor - and tensor2 is a (k x m x p) Tensor, the returned tensor will be an (j x k x n x p) Tensor. +- If both Tensors are 1-dimensional, (1d) the dot product (scalar) is returned. +- If the arguments are 2D - 1D or 1D - 2D, the matrix-vector product is returned. +- If both arguments are 2D, the matrix-matrix product is returned. +- If one of the arguments is ND with N >= 3 and the other is 1D or 2D, and some + conditions on the strides apply (see should_fold) we fold the first N-1 dimensions + of the ND argument to form a matrix, call mm or mv, reshape it back to ND and return it +- Otherwise, we return bmm, after broadcasting and folding the batched dimensions if + there's more than one */ -Tensor matmul( - c10::optional out_opt, +Tensor _matmul_impl( + Tensor& out, const Tensor& tensor1, const Tensor& tensor2) { NoNamesGuard guard; - auto dim_tensor1 = tensor1.dim(); - auto dim_tensor2 = tensor2.dim(); - auto has_out = out_opt.has_value(); - Tensor out = out_opt.value_or(Tensor()); + const auto dim_tensor1 = tensor1.dim(); + const auto dim_tensor2 = tensor2.dim(); + + // This is checked up here to simplify the logic below + // Note that the strings are just evaluated on failure, so almost always we just evaluate + // the condition and move on + TORCH_CHECK(dim_tensor1 != 0 && dim_tensor2 != 0, + "both arguments to matmul need to be at least 1D, but they are ", + dim_tensor1, "D and ", dim_tensor2, "D"); + + + const bool has_out = out.defined(); if (dim_tensor1 == 1 && dim_tensor2 == 1) { - return has_out ? at::native::dot_out(tensor1, tensor2, out) : tensor1.dot(tensor2); + return has_out ? at::dot_out(out, tensor1, tensor2) : tensor1.dot(tensor2); } else if (dim_tensor1 == 2 && dim_tensor2 == 1) { return has_out ? at::mv_out(out, tensor1, tensor2) : tensor1.mv(tensor2); } else if (dim_tensor1 == 1 && dim_tensor2 == 2) { - return has_out ? at::mm_out(out, tensor1.unsqueeze(0), tensor2).squeeze_(0) - : tensor1.unsqueeze(0).mm(tensor2).squeeze_(0); + return has_out ? at::mv_out(out, tensor2.t(), tensor1) : tensor2.t().mv(tensor1); } else if (dim_tensor1 == 2 && dim_tensor2 == 2) { return has_out ? at::mm_out(out, tensor1, tensor2) : tensor1.mm(tensor2); - } else if (dim_tensor1 >= 3 && (dim_tensor2 == 1 || dim_tensor2 == 2)) { - // optimization: use mm instead of bmm by folding tensor1's batch into - // its leading matrix dimension. - - Tensor t2 = dim_tensor2 == 1 ? tensor2.unsqueeze(-1) : tensor2; - auto size1 = tensor1.sizes(); - auto size2 = t2.sizes(); - std::vector output_size; - output_size.insert(output_size.end(), size1.begin(), size1.end() - 1); - if (dim_tensor2 > 1) { - output_size.push_back(size2[dim_tensor2 - 1]); - } - - // fold the batch into the first dimension - // Why not tensor1.view(-1, size1[size1.size() -1])? + } else if (should_fold(tensor1, dim_tensor2) || should_fold(tensor2, dim_tensor1)) { + // dim_tensor1 >=3 && (dim_tensor2 == 1 || dim_tensor2 == 2) || + // dim_tensor2 >=3 && (dim_tensor1 == 1 || dim_tensor1 == 2) + // and some condition on the strides is fulfilled + + // optimization: use mm instead of bmm by folding the batch of the larger tensor + // into its leading matrix dimension + const auto transpose = dim_tensor2 > dim_tensor1; + const auto t1 = transpose ? 
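  // Illustrative sketch, not part of this patch: the fold turns an ND-by-2D
  // product into a single mm. For a contiguous [B, M, K] @ [K, N] this is
  // conceptually:
  //   auto t1  = at::randn({8, 5, 3});             // [B, M, K]
  //   auto t2  = at::randn({3, 7});                // [K, N]
  //   auto out = t1.reshape({8 * 5, 3}).mm(t2)     // [B*M, N]
  //                .view({8, 5, 7});               // same result as at::matmul(t1, t2)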
MaybeOwned::owned(tensor2.mT()) + : MaybeOwned::borrowed(tensor1); + const auto t2 = !transpose ? MaybeOwned::borrowed(tensor2) + : dim_tensor1 == 2 + ? MaybeOwned::owned(tensor1.t()) + : MaybeOwned::borrowed(tensor1); + // Invariant: t1->dim() >= 3 && (t2->dim() == 1 || t2->dim() == 2) + // and *t1 and *t2 are matmul-compatible + + // Why not t1->view(-1, sizes_1.back())? // If the last dim is 0, then view(-1, 0) won't work because the -1 becomes ambiguous. // This can happen in e.g. [3, 5, 0] @ [0, 0]. - // So we manually compute the folding as a result. - const auto dim1_size = c10::multiply_integers(size1.begin(), size1.end() - 1); - auto t1 = tensor1.expect_contiguous()->view({dim1_size, size1[size1.size() - 1]}); - Tensor output = has_out ? at::_unsafe_view(at::mm_out(out, t1, t2), output_size) - : at::_unsafe_view(t1.mm(t2), output_size); - return has_out ? out.set_(output) : output; - } else if ((dim_tensor1 == 1 || dim_tensor1 == 2) && dim_tensor2 >= 3) { - // optimization: transpose the inner dimensions of the arguments, call - // matmul on the swapped arguments, then transpose the inner dimensions - // of the result. - const int64_t n = dim_tensor1 == 2 ? tensor1.size(-2) : 1; - const int64_t m = tensor1.size(-1); - const int64_t p = tensor2.size(-1); - - const Tensor t2_T = tensor2.transpose(-1, -2); - const Tensor t1_T = dim_tensor1 == 2 ? tensor1.t() : tensor1.reshape({n, m}).t(); - const Tensor res_T = matmul(out_opt, t2_T, t1_T); - - if (dim_tensor1 == 2) { - Tensor res = res_T.transpose(-1, -2).contiguous(); - return has_out ? out.set_(res) : res; + const auto sizes_1 = t1->sizes(); + auto output_shape = DimVector(sizes_1.begin(), sizes_1.end() - 1); + const auto folded_dim1 = c10::multiply_integers(output_shape); + + // Readjust output_shape if we are multiplying by a matrix + const auto t2_is_matrix = t2->dim() == 2; + if (t2_is_matrix) { + output_shape.push_back(t2->sizes()[1]); } - else { - std::vector shape = tensor2.sizes().slice(0, dim_tensor2 - 2).vec(); - shape.push_back(p); - - Tensor res = res_T.reshape(shape).contiguous(); - return has_out ? out.set_(res) : res; + const auto t1_folded = t1->reshape({folded_dim1, sizes_1.back()}); + if (!has_out) { + if (t2_is_matrix) { + // FIXME This path always does an unnecessary copy when transpose == true as the returned + // result from BLAS is already C-transposed + const auto output = at::_unsafe_view(t1_folded.mm(*t2), output_shape); + return transpose ? output.mT().contiguous() : output; + } else { + return at::_unsafe_view(t1_folded.mv(*t2), output_shape); + } + } else { + // Resize output into the correct shape + const auto transpose_out = transpose && t2_is_matrix; + if (transpose_out) { + // Swap last two elements of output_shape + std::iter_swap(output_shape.end() - 2, output_shape.end() - 1); + at::native::resize_output(out, output_shape); + std::iter_swap(output_shape.end() - 2, output_shape.end() - 1); + } else { + at::native::resize_output(out, output_shape); + } + const auto out_ = transpose_out ? c10::MaybeOwned::owned(out.mT()) + : c10::MaybeOwned::borrowed(out); + + // We then reshape the output to the expected shape and call mm/mv + // and transpose back if necessary + auto reshaped_out = t2_is_matrix ? 
out_->reshape({folded_dim1, t2->sizes().back()}) + : out_->reshape({folded_dim1}); + if (t2_is_matrix) { + at::mm_out(reshaped_out, t1_folded, *t2); + } else { + at::mv_out(reshaped_out, t1_folded, *t2); + } + if (!reshaped_out.is_alias_of(out)) { + out_->copy_(reshaped_out.view_as(*out_)); + } + return out; } - } else if ((dim_tensor1 >= 1 && dim_tensor2 >= 1) && (dim_tensor1 >= 3 || dim_tensor2 >= 3)) { - // We are multiplying b1 x n x m1 by x2 x m2 x p (where b1 can be a list); - // we track m1 vs m2 separately even though they must match for nicer error messages - int64_t n = dim_tensor1 > 1 ? tensor1.size(-2) : 1; - int64_t m1 = tensor1.size(-1); - IntArrayRef batch_tensor1(tensor1.sizes().data(), std::max(dim_tensor1 - 2, 0)); - int64_t m2 = dim_tensor2 > 1 ? tensor2.size(-2) : 1; - int64_t p = tensor2.size(-1); - IntArrayRef batch_tensor2(tensor2.sizes().data(), std::max(dim_tensor2 - 2, 0)); - - // expand the batch portion (i.e. cut off matrix dimensions and expand rest) - std::vector expand_batch_portion = infer_size(batch_tensor1, batch_tensor2); - - std::vector tensor1_expand_size(expand_batch_portion); - tensor1_expand_size.insert(tensor1_expand_size.end(), {n, m1}); - - std::vector tensor2_expand_size(expand_batch_portion); - tensor2_expand_size.insert(tensor2_expand_size.end(), {m2, p}); - - const int64_t expand_batch_product = - c10::multiply_integers(expand_batch_portion); - - std::vector tensor1_bmm_view({expand_batch_product}); - tensor1_bmm_view.insert(tensor1_bmm_view.end(), {n, m1}); - - std::vector tensor2_bmm_view({expand_batch_product}); - tensor2_bmm_view.insert(tensor2_bmm_view.end(), {m2, p}); + } else { + // dim_tensor1 >= 3 || dim_tensor2 >= 3 + // We track m1 vs m2 separately even though they must match for nicer error messages + const int64_t n = dim_tensor1 > 1 ? tensor1.sizes().cend()[-2] : 1LL; + const int64_t m1 = tensor1.sizes().back(); + const IntArrayRef batch_tensor1(tensor1.sizes().data(), + std::max(dim_tensor1 - 2, 0LL)); + const int64_t m2 = dim_tensor2 > 1 ? tensor2.sizes().cend()[-2] : tensor2.sizes().back(); + const int64_t p = dim_tensor2 > 1 ? tensor2.sizes().back() : 1LL; + const IntArrayRef batch_tensor2(tensor2.sizes().data(), + std::max(dim_tensor2 - 2, 0LL)); + auto output_shape = infer_size_dimvector(batch_tensor1, batch_tensor2); + + const auto tensor1_expand_size = [&output_shape, n, m1]{ DimVector ret(output_shape); + ret.append({n, m1}); + return ret; }(); + const auto tensor2_expand_size = [&output_shape, m2, p]{ DimVector ret(output_shape); + ret.append({m2, p}); + return ret; }(); + + const int64_t expand_batch_product = c10::multiply_integers(output_shape); // flatten expanded batches - Tensor tensor1_expanded = tensor1.expand(tensor1_expand_size).reshape(tensor1_bmm_view); - Tensor tensor2_expanded = tensor2.expand(tensor2_expand_size).reshape(tensor2_bmm_view); - - // reshape batches back into result - std::vector output_shape(expand_batch_portion); + const auto tensor1_expanded = tensor1.expand(tensor1_expand_size) + .reshape({expand_batch_product, n, m1}); + const auto tensor2_expanded = tensor2.expand(tensor2_expand_size) + .reshape({expand_batch_product, m2, p}); if (dim_tensor1 > 1) { output_shape.push_back(n); } @@ -1707,37 +1816,42 @@ Tensor matmul( output_shape.push_back(p); } - Tensor output = has_out ? at::_unsafe_view(at::bmm_out(out, tensor1_expanded, tensor2_expanded), output_shape) - : at::_unsafe_view(tensor1_expanded.bmm(tensor2_expanded), output_shape); - - return has_out ? 
out.set_(output) : output; + if (!has_out) { + return at::_unsafe_view(tensor1_expanded.bmm(tensor2_expanded), output_shape); + } else { + at::native::resize_output(out, output_shape); + auto reshaped_out = out.reshape({expand_batch_product, n, p}); + at::bmm_out(reshaped_out, tensor1_expanded, tensor2_expanded); + if (!reshaped_out.is_alias_of(out)) { + out.copy_(reshaped_out.view_as(out)); + } + return out; + } } - - AT_ERROR("both arguments to matmul need to be at least 1D, but they are ", - dim_tensor1, "D and ", dim_tensor2, "D"); } Tensor matmul(const Tensor & tensor1, const Tensor & tensor2) { auto maybe_outnames = namedinference::compute_matmul_outnames(tensor1, tensor2); - auto result = at::native::matmul(c10::nullopt, tensor1, tensor2); + at::Tensor unused; + auto result = at::native::_matmul_impl(unused, tensor1, tensor2); namedinference::propagate_names_if_nonempty(result, maybe_outnames); return result; } Tensor& matmul_out(const Tensor & tensor1, const Tensor & tensor2, Tensor &result) { auto maybe_outnames = namedinference::compute_matmul_outnames(tensor1, tensor2); - at::native::matmul(c10::optional(result), tensor1, tensor2); + at::native::_matmul_impl(result, tensor1, tensor2); namedinference::propagate_names_if_nonempty(result, maybe_outnames); return result; } // torch.linalg.matmul, alias for torch.matmul Tensor linalg_matmul(const Tensor & tensor1, const Tensor & tensor2) { - return at::native::matmul(tensor1, tensor2); + return at::matmul(tensor1, tensor2); } Tensor& linalg_matmul_out(const Tensor & tensor1, const Tensor & tensor2, Tensor &result) { - return at::native::matmul_out(tensor1, tensor2, result); + return at::matmul_out(result, tensor1, tensor2); } // torch.linalg.diagonal, alias for torch.diagonal with dim1=-2, dim2=-1 as defaults @@ -1798,8 +1912,10 @@ void _fill_matrix_powers(Tensor& buffer, const Tensor& a, int num_matrices) { // fill a^2 if (2 <= num_matrices - 1) { - at::native::matmul( - buffer.select(0, 2), // out for a^2 + // out for a^2 + auto view_out = buffer.select(0, 2); + _matmul_impl( + view_out, buffer.select(0, 1), buffer.select(0, 1) ); @@ -1807,8 +1923,10 @@ void _fill_matrix_powers(Tensor& buffer, const Tensor& a, int num_matrices) { // fill a^3 if (3 <= num_matrices - 1) { - at::native::matmul( - buffer.select(0, 3), // out for a^3 + // out for a^3 + auto view_out = buffer.select(0, 3); + _matmul_impl( + view_out, buffer.select(0, 1), buffer.select(0, 2) ); @@ -1816,8 +1934,10 @@ void _fill_matrix_powers(Tensor& buffer, const Tensor& a, int num_matrices) { // fill a^6 if (4 <= num_matrices - 1) { - at::native::matmul( - buffer.select(0, 4), + // out for a^6 + auto view_out = buffer.select(0, 4); + _matmul_impl( + view_out, buffer.select(0, 3), buffer.select(0, 3) ); @@ -1847,7 +1967,7 @@ inline Tensor _blob_to_Tensor( // we also insert a fake dimension so that the result could directly // be used in _compute_linear_combination auto tensor = at::from_blob((void*)blob.begin(), blob.size(), - c10::toValueType(in.scalar_type())).unsqueeze(0); + c10::toRealValueType(in.scalar_type())).unsqueeze(0); return _move_memory_if_cuda_input(tensor, in); } @@ -1875,9 +1995,10 @@ Tensor compute_T4(const Tensor& A) { // 3 for {I, A, A^2} _fill_matrix_powers(As, A, 3); - at::native::matmul( - // output for A^2 * (I / 2 + A / 6 + A^2 / 24) - As.select(0, 3), + // output for A^2 * (I / 2 + A / 6 + A^2 / 24) + auto view_out = As.select(0, 3); + _matmul_impl( + view_out, // contains A^2 As.select(0, 2), // computes (I / 2 + A / 6 + A^2 / 24) @@ -1909,10 
+2030,11 @@ Tensor compute_T8(const Tensor& A) { // 3 for {I, A, A^2} _fill_matrix_powers(As, A, 3); + // output for A4 + auto view_out = As.select(0, 3); // A4 = A2 * (x1 * A + x2 * A2) - at::native::matmul( - // output for A4 - As.select(0, 3), + _matmul_impl( + view_out, // As.select(0, 2) = A^2 As.select(0, 2), at::native::_compute_linear_combination( @@ -1922,10 +2044,11 @@ Tensor compute_T8(const Tensor& A) { ) ); + // output for A8 + view_out = As.select(0, 4); // A8 = (x3 * A2 + A4) * (x4 * I + x5 * A + x6 * A2 + x7 * A4) - at::native::matmul( - // output for A8 - As.select(0, 4), + _matmul_impl( + view_out, // x3 * A2 + A4 at::native::_compute_linear_combination( As.narrow(0, 2, 2), @@ -1980,7 +2103,7 @@ Tensor compute_T12(const Tensor& A) { reinterpret_cast(&b), {num_prods, num_prods}, {num_prods, 1}, - c10::toValueType(A.scalar_type()) + c10::toRealValueType(A.scalar_type()) ); bs = _move_memory_if_cuda_input(bs, A); @@ -1989,17 +2112,17 @@ Tensor compute_T12(const Tensor& A) { auto Bs = at::native::_compute_linear_combination(As, bs); + // output for A6 + auto view_out = As.select(0, 0); // compute A6 - Bs.select(0, 2).add_(at::native::matmul( - // tmp buffer for this matrix product - As.select(0, 0), + Bs.select(0, 2).add_(_matmul_impl( + view_out, Bs.select(0, 3), Bs.select(0, 3) )); - return Bs.select(0,0).add_(at::native::matmul( - // tmp buffer for this matrix product - As.select(0, 0), + return Bs.select(0, 0).add_(_matmul_impl( + view_out, Bs.select(0, 1).add_(Bs.select(0, 2)), Bs.select(0, 2) )); @@ -2052,7 +2175,7 @@ Tensor compute_T18(const Tensor& A) { reinterpret_cast(&b), {num_prods, num_prods}, {num_prods, 1}, - c10::toValueType(A.scalar_type()) + c10::toRealValueType(A.scalar_type()) ); bs = _move_memory_if_cuda_input(bs, A); @@ -2061,17 +2184,17 @@ Tensor compute_T18(const Tensor& A) { auto Bs = at::native::_compute_linear_combination(As, bs); + // tmp buffer for this matrix product + auto view_out = As.select(0, 0); // compute A9 - Bs.select(0, 3).add_(at::native::matmul( - // tmp buffer for this matrix product - As.select(0, 0), + Bs.select(0, 3).add_(_matmul_impl( + view_out, Bs.select(0, 0), Bs.select(0, 4)) ); - return Bs.select(0, 1).add_(at::native::matmul( - // tmp buffer for this matrix product - As.select(0, 0), + return Bs.select(0, 1).add_(_matmul_impl( + view_out, Bs.select(0, 2).add_(Bs.select(0, 3)), Bs.select(0, 3) )); @@ -2280,6 +2403,218 @@ Tensor matrix_exp_backward(const Tensor& self, const Tensor& grad) { ); } +TORCH_IMPL_FUNC(linalg_vector_norm_out)(const Tensor& self, const Scalar& scalar_ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype, const Tensor& result) { + // Casting a large integer to a double will just introduce an error for + // values larger than 10^53 (same for negative numbers), so that's fine. + auto ord = scalar_ord.toDouble(); + auto dim = opt_dim.value_or(IntArrayRef{}); + // No need to handle opt_dtype explicitly as it is already encoded in the dtype of result + + // Issue arising from the difference between vectorized and non-vectorized implementation on CPU + Tensor self_; + if (self.device().type() == c10::kCPU && + isComplexType(self.scalar_type()) && + std::abs(ord) == INFINITY) { + // TODO: This at::abs() call is used so that the at::abs() call in the + // backward function produces an identical result for complex inputs. + // However, it would be ideal if we could incorporate this into + // linalg_vector_norm_stub. 
See issue: + // https://github.com/pytorch/pytorch/issues/52648 + auto in_dtype = opt_dtype.value_or(self.scalar_type()); + self_ = self.to(in_dtype).abs(); + } else { + self_ = self; + } + + auto iter = make_reduction("vector_norm", const_cast(result), self_, dim, keepdim, result.scalar_type()); + norm_stub(iter.device_type(), iter, ord); +} + +void _linalg_matrix_norm_checks(const Tensor& A, IntArrayRef dim, optional opt_dtype) { + at::native::checkFloatingOrComplex(A, "linalg.matrix_norm"); + TORCH_CHECK(A.dim() >= 2, + "linalg.matrix_norm: input tensor must be a matrix or a batch of matrices"); + + // dim + TORCH_CHECK(dim.size() == 2, "linalg.matrix_norm: dim must be a 2-tuple of ints"); + TORCH_CHECK(dim[0] != dim[1], "Expected dims to be different, got (", dim[0], ", ", dim[1], ") instead"); + + // dtype + at::detail::check_linalg_norm_dtype(opt_dtype, A.scalar_type(), "linalg.matrix_norm"); +} + +Tensor linalg_matrix_norm( + const Tensor& A, + const Scalar& scalar_ord, + IntArrayRef dim, + bool keepdim, + optional opt_dtype) { + _linalg_matrix_norm_checks(A, dim, opt_dtype); + + auto ord = scalar_ord.toDouble(); + auto abs_ord = std::abs(ord); + TORCH_CHECK(abs_ord == 2. || abs_ord == 1. || abs_ord == INFINITY, "linalg.matrix_norm: Order ", ord, " not supported."); + + auto dim_ = dim.vec(); + maybe_wrap_dims(dim_, A.dim()); + + auto max_min = [ord, keepdim](const Tensor& A, int64_t dim) { return ord > 0 ? A.amax(dim, keepdim) : A.amin(dim, keepdim); }; + if (abs_ord == 2.) { + // Move dims to the end + auto permutation = create_dim_backshift_permutation(dim_[0], dim_[1], A.dim()); + + auto A_ = opt_dtype.has_value() ? A.to(*opt_dtype) : A; + auto result = max_min(at::linalg_svdvals(A_.permute(permutation)), -1); + if (keepdim) { + auto permutation_reverse = create_reverse_permutation(permutation); + result = result.unsqueeze(-1).permute(permutation_reverse); + } + return result; + } else { // 1, -1, inf, -inf + // The infty norm is like the 1 norm on the transposed matrix + if (abs_ord == INFINITY) { + std::swap(dim_[0], dim_[1]); + } + + // If the first reduction removes one dim from the front (dim_[0] < dim_[1]), after this + // reduction dim_[1] will be off by one + if (!keepdim && (dim_[0] < dim_[1])) { + dim_[1]--; + } + return max_min(at::linalg_vector_norm(A, 1., {dim_[0]}, keepdim, opt_dtype), dim_[1]); + } +} + +Tensor& linalg_matrix_norm_out( + const Tensor& A, + const Scalar& ord, + IntArrayRef dim, + bool keepdim, + optional opt_dtype, + Tensor& result) { + checkSameDevice("linalg.matrix_norm", A, result); + auto out = at::linalg_matrix_norm(A, ord, dim, keepdim, opt_dtype); + TORCH_CHECK(out.scalar_type() == result.scalar_type(), + "linalg.matrix_norm expected out tensor dtype ", out.scalar_type(), + " but got: ", result.scalar_type()); + at::native::resize_output(result, out.sizes()); + result.copy_(out); + return result; +} + +// fro / nuc +Tensor linalg_matrix_norm( + const Tensor& A, + c10::string_view ord, + IntArrayRef dim, + bool keepdim, + optional opt_dtype) { + _linalg_matrix_norm_checks(A, dim, opt_dtype); + TORCH_CHECK(ord == "fro" || ord == "nuc", "linalg.matrix_norm: Order ", ord, " not supported."); + + auto A_ = opt_dtype.has_value() ? 
A.to(*opt_dtype) : A; + using Int = IntArrayRef::value_type; + + if (ord == "fro") { + return at::linalg_vector_norm(A_, 2, dim, keepdim); + } else { // nuc + auto dim_ = dim.vec(); + maybe_wrap_dims(dim_, A_.dim()); + // Move dims to the end + auto permutation = create_dim_backshift_permutation(dim_[0], dim_[1], A_.dim()); + auto result = at::linalg_svdvals(A_.permute(permutation)).sum(-1, keepdim); + if (keepdim) { + auto permutation_reverse = create_reverse_permutation(permutation); + result = result.unsqueeze(-1).permute(permutation_reverse); + } + return result; + } +} + +Tensor& linalg_matrix_norm_out( + const Tensor& A, + c10::string_view ord, + IntArrayRef dim, + bool keepdim, + optional opt_dtype, + Tensor& result) { + checkSameDevice("linalg.matrix_norm", A, result); + auto out = at::linalg_matrix_norm(A, ord, dim, keepdim, opt_dtype); + TORCH_CHECK(out.scalar_type() == result.scalar_type(), + "linalg.matrix_norm expected out tensor dtype ", out.scalar_type(), + " but got: ", result.scalar_type()); + at::native::resize_output(result, out.sizes()); + result.copy_(out); + return result; +} + +// Numerical or None norms +Tensor linalg_norm(const Tensor& X, const optional& opt_ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { + if (opt_dim.has_value()) { + TORCH_CHECK(opt_dim->size() == 1 || opt_dim ->size() == 2, "linalg.norm: If ", + "dim is specified, it must be of length 1 or 2. Got ", *opt_dim); + } else { + if (opt_ord.has_value()) { + TORCH_CHECK(X.dim() == 1 || X.dim() == 2, "linalg.norm: If ", + "dim is not specified but ord is, the input must be 1D or 2D. Got ", X.dim(), "D."); + } + } + + // If ord=None, we'll always use the 2-norm or frob norm (which are the same) so we go through + // vector_norm + if (opt_ord.has_value() && + ((opt_dim.has_value() && opt_dim->size() == 2) || + (!opt_dim.has_value() && X.dim() == 2))) { + using Int = IntArrayRef::value_type; + auto dim = opt_dim.has_value() ? opt_dim.value().vec() : std::vector{0, 1}; + return at::linalg_matrix_norm(X, *opt_ord, dim, keepdim, opt_dtype); + } else { + auto scalar_ord = opt_ord.value_or(Scalar(2.)); + return at::linalg_vector_norm(X, scalar_ord, opt_dim, keepdim, opt_dtype); + } +} + +Tensor& linalg_norm_out(const Tensor& X, const optional& opt_ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype, Tensor& result) { + checkSameDevice("linalg.norm", X, result); + auto out = at::linalg_norm(X, opt_ord, opt_dim, keepdim, opt_dtype); + TORCH_CHECK(out.scalar_type() == result.scalar_type(), + "linalg.norm expected out tensor dtype ", out.scalar_type(), + " but got: ", result.scalar_type()); + at::native::resize_output(result, out.sizes()); + result.copy_(out); + return result; +} + +// Frobenius and nuclear norms +Tensor linalg_norm(const Tensor& X, c10::string_view ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { + if (opt_dim.has_value()) { + TORCH_CHECK(opt_dim->size() == 1 || opt_dim ->size() == 2, "linalg.norm: If ", + "dim is specified, it mut be of length 1 or 2. Got ", *opt_dim); + } else { + TORCH_CHECK(X.dim() == 1 || X.dim() == 2, "linalg.norm: If ", + "dim is not specified but ord is, the input must be 1D or 2D. Got ", X.dim(), "D."); + } + using Int = IntArrayRef::value_type; + auto dim = opt_dim.has_value() ? 
opt_dim.value().vec() : std::vector{0, 1}; + return at::linalg_matrix_norm(X, ord, dim, keepdim, opt_dtype); +} + +Tensor& linalg_norm_out(const Tensor& X, c10::string_view ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype, Tensor& result) { + checkSameDevice("linalg.norm", X, result); + auto out = at::linalg_norm(X, ord, opt_dim, keepdim, opt_dtype); + TORCH_CHECK(out.scalar_type() == result.scalar_type(), + "linalg.norm expected out tensor dtype ", out.scalar_type(), + " but got: ", result.scalar_type()); + at::native::resize_output(result, out.sizes()); + result.copy_(out); + return result; +} + +//////////////////////////////////////////////////////////////////////////////// +// Frobenius Norm // +// Just used in linalg.norm. It should not be removed. // +//////////////////////////////////////////////////////////////////////////////// + Tensor frobenius_norm(const Tensor& self) { return at::norm(self); } @@ -2287,7 +2622,7 @@ Tensor frobenius_norm(const Tensor& self) { Tensor frobenius_norm(const Tensor& self, IntArrayRef dim, bool keepdim) { // NOTE: As frobenius_norm_out is currently implemented, it will always produce a // strided tensor result, even if the input is sparse. - auto options = self.options().layout(c10::Layout::Strided).dtype(toValueType(self.scalar_type())); + auto options = self.options().layout(c10::Layout::Strided).dtype(toRealValueType(self.scalar_type())); Tensor result = at::empty({0}, options); return at::native::frobenius_norm_out(self, dim, keepdim, result); } @@ -2322,6 +2657,11 @@ Tensor &frobenius_norm_out(const Tensor& self, return result; } +//////////////////////////////////////////////////////////////////////////////// +// Nuclear Norm // +// Just used in linalg.norm. It should not be removed. // +//////////////////////////////////////////////////////////////////////////////// + Tensor nuclear_norm(const Tensor& self, bool keepdim) { TORCH_CHECK( self.dim() == 2, @@ -2338,12 +2678,8 @@ Tensor &nuclear_norm_out(const Tensor& self, bool keepdim, Tensor& result) { return at::native::nuclear_norm_out(self, IntArrayRef({0, 1}), keepdim, result); } -Tensor nuclear_norm(const Tensor& self, IntArrayRef dim, bool keepdim) { - Tensor result = at::empty({0}, self.options().dtype(toValueType(self.scalar_type()))); - return at::native::nuclear_norm_out(self, dim, keepdim, result); -} - -Tensor& nuclear_norm_out(const Tensor& self, IntArrayRef dim, bool keepdim, Tensor& result) { +namespace { +Tensor nuclear_norm_impl(const Tensor& self, IntArrayRef dim, bool keepdim) { TORCH_CHECK(dim.size() == 2, "nuclear norm requires a 'dim' argument of size 2"); auto dim_ = dim.vec(); maybe_wrap_dims(dim_, self.dim()); @@ -2356,323 +2692,25 @@ Tensor& nuclear_norm_out(const Tensor& self, IntArrayRef dim, bool keepdim, Tens auto permutation_reverse = create_reverse_permutation(permutation); result_ = result_.permute(permutation_reverse); } - at::native::resize_output(result, result_.sizes()); - result.copy_(result_); - return result; -} - -// Creates a vector of length ndim with values equal to its indices -// (e.g. 
[0, 1, 2, ..., ndim-1]) -static std::vector make_dim_list(int64_t ndim) { - std::vector dim_list(ndim); - for (const auto ind : c10::irange(ndim)) { - dim_list[ind] = ind; - } - return dim_list; -} - -// Checks for valid arguments to linalg_norm when type(ord) == str -static void check_str_ord_valid(const c10::string_view str_ord, optional opt_dim, int64_t ndim) { - TORCH_CHECK((str_ord == "nuc") || (str_ord == "fro"), "Invalid norm order: ", str_ord); - bool dims_valid = (ndim == 2 && !opt_dim.has_value()) || (opt_dim.has_value() && opt_dim.value().size() == 2); - TORCH_CHECK(dims_valid, "order \"", str_ord, - "\" can only be used if either len(dim) == 2 or (self.dim() == 2 and dim is None)"); + return result_; } +} // anonymous namespace -// Performs second dimension reduction for matrix norms -static Tensor _norm_min_max(Tensor& self, double ord, int64_t dim, bool keepdim) { - if (ord > 0) { - return self.amax(dim, keepdim); - } else { - return self.amin(dim, keepdim); - } +Tensor nuclear_norm(const Tensor& self, IntArrayRef dim, bool keepdim) { + return nuclear_norm_impl(self, dim, keepdim).to(toRealValueType(self.scalar_type())); } -// Performs matrix norm -static Tensor& _linalg_norm_matrix_out(Tensor& result, const Tensor &self, const optional& opt_ord, - IntArrayRef dim, bool keepdim, optional opt_dtype) { - Tensor result_; - auto ord = opt_ord.value_or(2.0).toDouble(); - TORCH_CHECK(self.layout() == Layout::Strided, - "matrix norm only supports strided layout, got: ", self.layout()); - - TORCH_CHECK(dim.size() == 2, "_linalg_norm_matrix: 'dim' must either specify 2 dimensions. ", - "Got 'dim' specifying ", dim.size(), " dims"); - auto dim_ = dim.vec(); - maybe_wrap_dims(dim_, self.dim()); - TORCH_CHECK(dim_[0] != dim_[1], - "Expected dims to be different, got (", dim[0], ", ", dim[1], ") instead"); - - ScalarType scalarType = opt_dtype.has_value() ? opt_dtype.value() : self.scalar_type(); - TORCH_CHECK( - at::isFloatingType(scalarType) || at::isComplexType(scalarType), - "Can only calculate the mean of floating and complex types. Got ", - toString(scalarType), " instead."); - - Tensor self_; - if (opt_dtype.has_value()) { - self_ = self.to(scalarType); - } else { - self_ = self; - } - - if (std::abs(ord) == 2) { - // Need to shift the reduction dims to the back, because at::linalg_svdvals will only operate on - // the last 2 dimensions - auto permutation = create_dim_backshift_permutation(dim_[0], dim_[1], self.dim()); - auto permutation_reverse = create_reverse_permutation(permutation); - - result_ = at::linalg_svdvals(self_.permute(permutation)); - result_ = _norm_min_max(result_, ord, result_.dim() - 1, keepdim); - - if (keepdim) { - result_ = result_.unsqueeze(-1).permute(permutation_reverse); - } - } else { - // abs(p) == infinity and abs(p) == 1 will perform identical reductions, except - // that the order of the two dims is swapped. So we can swap the dims if - // abs(p) == infinity to simplify the rest of the operation's logic. - if (std::abs(ord) == INFINITY) { - std::swap(dim_[0], dim_[1]); - } - // If the dim of the second reduction is greater than that of the first reduction - // and we are not keeping the dims, then the fact that the output of the first - // reduction will have one fewer dimension means that the second reduction dim - // will be off by one, so we need to correct that. 
- if ((dim_[1] > dim_[0]) && !keepdim) { - dim_[1]--; - } - if (std::abs(ord) == 1 || std::abs(ord) == INFINITY) { - result_ = self_.abs().sum(dim_[0], keepdim); - result_ = _norm_min_max(result_, ord, dim_[1], keepdim); - } else { - TORCH_CHECK(false, "Order ", ord, " not supported for matrix norm"); - } - } +Tensor& nuclear_norm_out(const Tensor& self, IntArrayRef dim, bool keepdim, Tensor& result) { + auto result_ = nuclear_norm_impl(self, dim, keepdim); at::native::resize_output(result, result_.sizes()); result.copy_(result_); return result; } -static Tensor& linalg_norm_out_impl(Tensor& result, const Tensor& self, const optional& opt_num_ord, optional opt_str_ord, optional opt_dim, bool keepdim, optional opt_dtype) { - // Callers must give the ord argument as either a number, a string, or neither. - // Since the user-facing API has no direct control over how this function is called, this is an internal assert. - TORCH_INTERNAL_ASSERT(!(opt_num_ord.has_value() && opt_str_ord.has_value())); - if (opt_dtype.has_value()) { - auto dtype = opt_dtype.value(); - TORCH_CHECK(dtype == result.scalar_type(), "provided dtype must match dtype of result, but got", - "dtype = ", dtype, ", out.dtype = ", result.scalar_type()); - } - int64_t ndim = self.dim(); - if (opt_str_ord.has_value()) { - // 'ord' is string - auto str_ord = opt_str_ord.value(); - check_str_ord_valid(str_ord, opt_dim, ndim); - Tensor self_ = opt_dtype.has_value() ? self.to(opt_dtype.value()) : self; - if (str_ord == "fro") { - at::frobenius_norm_out(result, self_, opt_dim.value_or(IntArrayRef({0, 1})), keepdim); - } else if (str_ord == "nuc") { - if (opt_dim.has_value()) { - at::nuclear_norm_out(result, self_, opt_dim.value(), keepdim); - } else { - at::nuclear_norm_out(result, self_, keepdim); - } - } - } else { - // 'ord' is int or None - std::vector dim_ = opt_dim.has_value() ? 
opt_dim.value().vec() : make_dim_list(ndim); - if (!opt_num_ord.has_value() || dim_.size() == 1) { - Tensor result_ = at::linalg_vector_norm( - self, opt_num_ord.value_or(2), opt_dim, keepdim, opt_dtype); - // TODO: Resize and copy should be avoided with - // https://github.com/pytorch/pytorch/issues/52712 - at::native::resize_output(result, result_.sizes()); - result.copy_(result_); - } else if (dim_.size() == 2) { - _linalg_norm_matrix_out(result, self, opt_num_ord.value(), dim_, keepdim, opt_dtype); - } else { - TORCH_CHECK(false, "'dim' must specify 1 or 2 dimensions when order is numerical and input is " - "not 1-D or 2-D"); - } - } - return result; -} - -static Tensor& linalg_vector_norm_impl(const Tensor& self, const Scalar& scalar_ord, optional opt_dim, bool keepdim, optional opt_dtype, Tensor& result) { - // Casting a large integer to a double will introduce some error, but for - // practical purposes, it won't matter since a large order will usually - // give an infinite result - auto ord = scalar_ord.toDouble(); - - TORCH_CHECK(self.device().type() == DeviceType::CPU || self.device().type() == DeviceType::CUDA, - "linalg.vector_norm only supports CPU and CUDA device types, but got: ", - self.device().type()); - TORCH_CHECK(self.layout() == Layout::Strided, - "linalg.vector_norm only supports strided layout, but got: ", self.layout()); - - if (opt_dtype.has_value() && isComplexType(self.scalar_type())) { - TORCH_CHECK(isComplexType(opt_dtype.value()), - "linalg.vector_norm expected complex 'dtype', since input is complex, ", - "but got ", opt_dtype.value()); - } - - checkFloatingOrComplex(self, "linalg.vector_norm"); - ScalarType in_dtype = opt_dtype.value_or(self.scalar_type()); - - IntArrayRef dim = opt_dim.value_or(IntArrayRef{}); - - if (self.numel() == 0) { - // TODO: The question about how to handle negative orders when the input - // is empty has not been settled yet. For now, we raise an error. Issue: - // https://github.com/pytorch/pytorch/issues/52783 - TORCH_CHECK(ord >= 0, - "linalg.vector_norm of negative order cannot be performed on an empty tensor"); - - // For NumPy compatibility, we can only perform order infinity reduction - // (max/min) on a tensor with zero elements if the dimensions to reduce are - // nonzero. Otherwise, throw an error. - if (ord == INFINITY) { - bool has_identity = true; - - if (dim.size() == 0) { - has_identity = false; - } else { - for (int64_t dim_num : dim) { - if (self.size(dim_num) == 0) { - has_identity = false; - break; - } - } - } - TORCH_CHECK(has_identity, - "linalg.vector_norm cannot compute the infinity norm on an empty ", - "dimension because the operation does not have an identity"); - } - } - Tensor self_; - if (self.device().type() == c10::kCPU && isComplexType(self.scalar_type()) && std::abs(ord) == INFINITY) { - // TODO: This at::abs() call is used so that the at::abs() call in the - // backward function produces an identical result for complex inputs. - // However, it would be ideal if we could incorporate this into - // linalg_vector_norm_stub. 
See issue: - // https://github.com/pytorch/pytorch/issues/52648 - self_ = self.to(in_dtype).abs(); - in_dtype = toValueType(in_dtype); - } else { - self_ = self; - } - ScalarType out_dtype = opt_dtype.value_or(toValueType(self.scalar_type())); - TORCH_CHECK(!result.defined() || out_dtype == result.scalar_type(), - "linalg.vector_norm expected out tensor dtype ", out_dtype, - " but got: ", result.scalar_type()); - // omit in_dtype in the following call, to avoid make_reduction explicitly casting input to out_dtype - auto iter = isComplexType(self.scalar_type()) ? - make_reduction("vector_norm", result, self_, dim, keepdim, in_dtype, out_dtype) : - make_reduction("vector_norm", result, self_, dim, keepdim, out_dtype); - - linalg_vector_norm_stub(iter.device_type(), iter, ord); - return result; -} - -Tensor linalg_vector_norm(const Tensor& self, const Scalar& ord, optional opt_dim, bool keepdim, optional opt_dtype) { - ScalarType out_dtype = opt_dtype.value_or(toValueType(self.scalar_type())); - Tensor result = create_reduction_result(self, opt_dim.value_or(IntArrayRef{}), keepdim, out_dtype); - return at::native::linalg_vector_norm_impl(self, ord, opt_dim, keepdim, opt_dtype, result); -} +//////////////////////////////////////////////////////////////////////////////// +// linalg.cond // +//////////////////////////////////////////////////////////////////////////////// -Tensor& linalg_vector_norm_out(const Tensor& self, const Scalar& ord, optional opt_dim, bool keepdim, optional opt_dtype, Tensor& result) { - return at::native::linalg_vector_norm_impl(self, ord, opt_dim, keepdim, opt_dtype, result); -} - -namespace { - -// Only performs checks not performed by linalg.norm -void check_linalg_matrix_norm_args( - const Tensor& self, - IntArrayRef dim, - optional dtype) { - TORCH_CHECK( - self.ndimension() >= 2, - "linalg.matrix_norm(): input tensor must be a matrix or batch of matrices"); - ScalarType in_dtype = dtype.value_or(self.scalar_type()); - TORCH_CHECK( - in_dtype == kFloat || in_dtype == kDouble || in_dtype == kComplexFloat || - in_dtype == kComplexDouble, - "linalg.matrix_norm(): only supports the float, double, cfloat and cdouble dtypes, but got: ", - toString(in_dtype)); - TORCH_CHECK( - dim.size() == 2, "linalg.matrix_norm(): dim must be a 2-tuple of ints"); -} - -} // namespace - -Tensor linalg_matrix_norm( - const Tensor& self, - const Scalar& ord, - IntArrayRef dim, - bool keepdim, - optional dtype) { - check_linalg_matrix_norm_args(self, dim, dtype); - return at::native::linalg_norm(self, ord, dim, keepdim, dtype); -} - -Tensor& linalg_matrix_norm_out( - const Tensor& self, - const Scalar& ord, - IntArrayRef dim, - bool keepdim, - optional dtype, - Tensor& result) { - check_linalg_matrix_norm_args(self, dim, dtype); - return at::native::linalg_norm_out(self, ord, dim, keepdim, dtype, result); -} - -Tensor linalg_matrix_norm( - const Tensor& self, - c10::string_view ord, - IntArrayRef dim, - bool keepdim, - optional dtype) { - check_linalg_matrix_norm_args(self, dim, dtype); - return at::native::linalg_norm(self, ord, dim, keepdim, dtype); -} - -Tensor& linalg_matrix_norm_out( - const Tensor& self, - c10::string_view ord, - IntArrayRef dim, - bool keepdim, - optional dtype, - Tensor& result) { - check_linalg_matrix_norm_args(self, dim, dtype); - return at::native::linalg_norm_out(self, ord, dim, keepdim, dtype, result); -} - -// Numerical or None norms -Tensor linalg_norm(const Tensor& self, const optional& opt_ord, optional opt_dim, bool keepdim, optional opt_dtype) { - auto 
options = TensorOptions().dtype(opt_dtype.has_value() ? opt_dtype.value() : toValueType(self.scalar_type())).device(self.device()); - Tensor result = at::empty({0}, options); - return at::native::linalg_norm_out( - self, opt_ord, opt_dim, keepdim, opt_dtype, result); -} - -// Frobenius and nuclear norms -Tensor linalg_norm(const Tensor& self, c10::string_view ord, optional opt_dim, bool keepdim, optional opt_dtype) { - auto options = TensorOptions().dtype(opt_dtype.has_value() ? opt_dtype.value() : toValueType(self.scalar_type())).device(self.device()); - Tensor result = at::empty({0}, options); - return at::native::linalg_norm_out( - self, ord, opt_dim, keepdim, opt_dtype, result); -} - -// Numerical or None norms -Tensor& linalg_norm_out(const Tensor& self, const optional& opt_ord, optional opt_dim, bool keepdim, optional opt_dtype, Tensor& result) { - return linalg_norm_out_impl(result, self, opt_ord, c10::nullopt, opt_dim, keepdim, opt_dtype); -} - -// Frobenius and nuclear norms -Tensor& linalg_norm_out(const Tensor& self, c10::string_view ord, optional opt_dim, bool keepdim, optional opt_dtype, Tensor& result) { - return linalg_norm_out_impl(result, self, c10::nullopt, ord, opt_dim, keepdim, opt_dtype); -} // This function helps to dispatch norm computations depending on 'ord' of variant type Tensor _linalg_cond_helper(const Tensor& self, c10::variant ord_variant) { @@ -2694,7 +2732,7 @@ Tensor _linalg_cond_helper(const Tensor& self, c10::variant& opt_ord) { // NumPy doesn't define the condition number for 0x0 matrices, we return 0.0 for such input if (self.numel() == 0) { - auto real_dtype = toValueType(typeMetaToScalarType(self.dtype())); + auto real_dtype = toRealValueType(typeMetaToScalarType(self.dtype())); return _linalg_cond_empty_matrix(self, real_dtype); } @@ -2757,7 +2795,7 @@ Tensor linalg_cond(const Tensor& self, const optional& opt_ord) { Tensor& linalg_cond_out(const Tensor& self, const optional& opt_ord, Tensor& result) { checkSameDevice("linalg.cond", result, self); - ScalarType real_dtype = toValueType(self.scalar_type()); + ScalarType real_dtype = toRealValueType(self.scalar_type()); checkLinalgCompatibleDtype("linalg.cond", result.scalar_type(), real_dtype); Tensor result_tmp = at::linalg_cond(self, opt_ord); @@ -2791,7 +2829,7 @@ Tensor linalg_cond(const Tensor& self, c10::string_view ord) { // TODO: implement _out variant avoiding copy and using already allocated storage directly Tensor& linalg_cond_out(const Tensor& self, c10::string_view ord, Tensor& result) { checkSameDevice("linalg.cond", result, self); - ScalarType real_dtype = toValueType(self.scalar_type()); + ScalarType real_dtype = toRealValueType(self.scalar_type()); checkLinalgCompatibleDtype("linalg.cond", result.scalar_type(), real_dtype); Tensor result_tmp = at::linalg_cond(self, ord); @@ -2849,7 +2887,7 @@ Tensor& linalg_tensorinv_out(const Tensor& self, int64_t ind, Tensor& result) { return result; } -Tensor linalg_tensorsolve(const Tensor& self, const Tensor& other, optional dims) { +Tensor linalg_tensorsolve(const Tensor& self, const Tensor& other, OptionalIntArrayRef dims) { /* The idea is to reduce the problem to 2D matrix solve. Step 1. (optional) `self` is permuted with `dims` such that dimensions from `dims` are moved to the right. 
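The reduction described in the comment above can be sketched against the public ATen API roughly as follows. This is an illustrative sketch only, not the code being modified in this hunk: the helper name `tensorsolve_reference` is made up, error checking is omitted, and the optional `dims` permutation (Step 1) is assumed to have already been applied, e.g. with `at::movedim`, before the reshape.

#include <ATen/ATen.h>

// Rough sketch of the tensorsolve reduction: flatten the leading b-shaped
// dimensions of `a` and its trailing x-shaped dimensions into one axis each,
// solve the resulting square 2-D system, then restore the x shape.
at::Tensor tensorsolve_reference(const at::Tensor& a, const at::Tensor& b) {
  const int64_t n = b.numel();                // prod(b.shape) == prod(x.shape)
  at::Tensor a2d = a.reshape({n, -1});        // (prod(b.shape), prod(x.shape))
  at::Tensor b1d = b.reshape({n});
  at::Tensor x = at::linalg_solve(a2d, b1d);  // ordinary 2-D matrix solve
  return x.reshape(a.sizes().slice(b.dim()).vec());  // trailing dims of `a`
}

The signature change in this hunk (`OptionalIntArrayRef dims`) only affects how the optional `dims` argument is spelled; the reduction itself is untouched.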
@@ -2887,7 +2925,7 @@ Tensor linalg_tensorsolve(const Tensor& self, const Tensor& other, optional dims, Tensor& result) { +Tensor& linalg_tensorsolve_out(const Tensor& self, const Tensor& other, OptionalIntArrayRef dims, Tensor& result) { checkSameDevice("tensorsolve", result, self); checkLinalgCompatibleDtype("tensorsolve", result, self); @@ -2946,142 +2984,6 @@ struct KronImpl final { }; } -DEFINE_DISPATCH(unpack_pivots_stub); - -std::tuple lu_unpack( - const Tensor& LU_data, - const Tensor& LU_pivots, - bool unpack_data, - bool unpack_pivots - ) { - TORCH_CHECK(LU_pivots.is_contiguous() && (LU_pivots.scalar_type() == at::kInt), - "lu_unpack: LU_pivots is expected to be a contiguous tensor of torch.int32 dtype." - "Note: this function is intended to be used with the output produced by torch{.linalg}.lu"); - - // trivial case - if (!unpack_data && !unpack_pivots) { - return std::make_tuple(Tensor(), Tensor(), Tensor()); - } - - Tensor L, U; - // In the generalized LU factorization, the following shape relations hold: - // A.shape[-2:] == (m, n), - // P.shape[-2:] == (m, m), - // U.shape[-2:] == (m, k), - // L.shape[-2:] == (k, n), - // where k = min(m, n) - int64_t m = LU_data.size(-2); - int64_t n = LU_data.size(-1); - int64_t k = std::min(m, n); - - if (unpack_data) { - U = LU_data.triu(); - if (m != k) { - U = U.narrow(-2, 0, k); - } - - L = LU_data.tril(); - if (k != n) { - L = L.narrow(-1, 0, k); - } - L.diagonal(/*offset=*/0, /*dim1=*/-2, /*dim2=*/-1).fill_(1); - } - - if (!unpack_pivots) { - return std::make_tuple(Tensor(), L, U); - } - - auto unpacked_pivots_sizes = LU_pivots.sizes().vec(); - unpacked_pivots_sizes[LU_pivots.dim() - 1] = m; - auto unpacked_pivots = at::empty( - unpacked_pivots_sizes, - LU_pivots.options().memory_format(at::MemoryFormat::Contiguous) - ); - - // Fill `unpacked_pivots` with identity permutation - auto id_perm = at::arange(m, LU_pivots.options()); - unpacked_pivots.copy_(id_perm); - - // WARNING: we assume that unchanged LAPACK pivots are provided. - // Since LAPACK relies on the FORTRAN's 1-based indexing, - // we subtract 1 to convert the pivots to the C-style 0-based indexing. - // This behaviour could change in the future. - auto LU_pivots_zero_idx = LU_pivots - 1; - - auto iter = TensorIteratorConfig() - .set_check_mem_overlap(false) - .check_all_same_dtype(false) - .resize_outputs(false) - .declare_static_shape(LU_pivots.sizes(), /*squash_dim=*/LU_pivots.dim() - 1) - .add_output(unpacked_pivots) - .add_input(LU_pivots_zero_idx) - .build(); - // } - - unpack_pivots_stub( - LU_pivots.device().type(), - iter, - LU_pivots.size(-1) - ); - - // The permutation matrix is converted to LU_data.dtype - // because `matmul` does not work with integer matrices. - unpacked_pivots_sizes.push_back(m); - auto permutation_matrix = at::zeros( - unpacked_pivots_sizes, - LU_data.options().memory_format(at::MemoryFormat::Contiguous) - ); - - // now that we know the final permutation, - // scatter 1s at proper locations. 
- permutation_matrix.scatter_( - -2, - unpacked_pivots.unsqueeze(-2).to(at::kLong), - at::ones({1}, permutation_matrix.options()).expand(permutation_matrix.sizes()) - ); - - return std::make_tuple(permutation_matrix, L, U); -} - -using TupleTensorRefs3 = std::tuple; - -TupleTensorRefs3 lu_unpack_out( - const Tensor& LU_data, - const Tensor& LU_pivots, - bool unpack_data, - bool unpack_pivots, - Tensor& P, - Tensor& L, - Tensor& U - ) { - Tensor P_tmp, L_tmp, U_tmp; - std::tie(P_tmp, L_tmp, U_tmp) = at::lu_unpack(LU_data, LU_pivots, unpack_data, unpack_pivots); - - if (unpack_pivots) { - checkSameDevice("lu_unpack", P, LU_data, "P"); - // Note that lu_unpack returns P such that P.dtype == LU_data.dtype, - // because otherwise we cannot use P in matric products (no int -> float promotion) - checkLinalgCompatibleDtype("lu_unpack", P, LU_data, "L"); - - at::native::resize_output(P, P_tmp.sizes()); - P.copy_(P_tmp); - } - - if (unpack_data) { - checkSameDevice("lu_unpack", L, LU_data, "L"); - checkSameDevice("lu_unpack", U, LU_data, "U"); - checkLinalgCompatibleDtype("lu_unpack", L, LU_data, "L"); - checkLinalgCompatibleDtype("lu_unpack", U, LU_data, "U"); - - at::native::resize_output(L, L_tmp.sizes()); - at::native::resize_output(U, U_tmp.sizes()); - L.copy_(L_tmp); - U.copy_(U_tmp); - } - - return TupleTensorRefs3(P, L, U); -} - /* Calculates the Kronecker product between two Tensors. */ diff --git a/aten/src/ATen/native/LinearAlgebra.h b/aten/src/ATen/native/LinearAlgebra.h index 050fe7dedc7b..304fbb8e6847 100644 --- a/aten/src/ATen/native/LinearAlgebra.h +++ b/aten/src/ATen/native/LinearAlgebra.h @@ -15,15 +15,4 @@ namespace at { namespace native { using addr_fn = void (*)(TensorIterator &, const Scalar& beta, const Scalar& alpha); DECLARE_DISPATCH(addr_fn, addr_stub); - -using linalg_vector_norm_fn = void(*)(TensorIterator &, Scalar); -DECLARE_DISPATCH(linalg_vector_norm_fn, linalg_vector_norm_stub); - -using unpack_pivots_fn = void(*)( - TensorIterator& iter, - int64_t dim_size -); -DECLARE_DISPATCH(unpack_pivots_fn, unpack_pivots_stub); - - }} // namespace at::native diff --git a/aten/src/ATen/native/LinearAlgebraUtils.h b/aten/src/ATen/native/LinearAlgebraUtils.h index 555cbb001ef2..9301d090080a 100644 --- a/aten/src/ATen/native/LinearAlgebraUtils.h +++ b/aten/src/ATen/native/LinearAlgebraUtils.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -25,23 +26,6 @@ namespace at { namespace native { -// Used as an interface between the different BLAS-like libraries -enum class TransposeType { - NoTranspose, - Transpose, - ConjTranspose, -}; - -// Transforms TransposeType into the BLAS / LAPACK format -static char to_blas(TransposeType trans) { - switch (trans) { - case TransposeType::Transpose: return 'T'; - case TransposeType::NoTranspose: return 'N'; - case TransposeType::ConjTranspose: return 'C'; - } - TORCH_INTERNAL_ASSERT(false, "Invalid transpose type"); -} - static inline c10::MaybeOwned expect_resolved_conj(const Tensor& tensor) { if (tensor.is_conj()) { return c10::MaybeOwned::owned(tensor.resolve_conj()); @@ -50,46 +34,23 @@ static inline c10::MaybeOwned expect_resolved_conj(const Tensor& tensor) } } -template -static inline Vec contiguous_strides_template(const IntArrayRef sizes, const bool f_contig=false) { - static_assert(std::is_same::value, - "Incompatible integral type of sizes and strides"); - // f_contig chooses between the strides of a batch of Fortran (F-contiguous) and C-contiguous matrices - using Int = IntArrayRef::value_type; - 
constexpr auto one = Int{1}; - const auto n = sizes.size(); - if (n == 0) { - return Vec{}; - } else if (n == 1) { - // Use initializer-list to initialize the vector - return Vec{one}; - } - // Now we have a matrix or batch of matrices - auto strides = Vec(n); - const auto last_idx = n - 1; - const auto snd_last_idx = n - 2; - // We'll fill the first two strides afterwards, otherwise the first step - // in the for loop is wrong - strides[snd_last_idx] = std::max(sizes[last_idx], one); - for (int i = snd_last_idx - 1; i >= 0; --i) { - strides[i] = strides[i + 1] * std::max(sizes[i + 1], one); - } - strides[last_idx] = f_contig ? std::max(sizes[snd_last_idx], one) : one; - if (f_contig) { - // We filled the wrong stride before so we correct it - strides[snd_last_idx] = one; +static inline DimVector batched_matrix_contiguous_strides( + const IntArrayRef sizes, + const bool f_contig = false) { + // f_contig chooses between the strides of a batch of Fortran (F-contiguous) + // and C-contiguous matrices + auto strides = c10::contiguous_strides(sizes); + auto dim = strides.size(); + + if (f_contig && dim >= 2) { + // Fix the strides of the last two dimensions, so that we return + // C-contiguous batches of F-contiguous matrices. + strides[dim - 1] = std::max(sizes[dim - 2], static_cast(1)); + strides[dim - 2] = 1; } return strides; } -static inline DimVector contiguous_strides(const IntArrayRef sizes, const bool f_contig=false) { - return contiguous_strides_template(sizes, f_contig); -} - -static inline std::vector contiguous_strides_vec(const IntArrayRef sizes, const bool f_contig=false) { - return contiguous_strides_template>(sizes, f_contig); -} - /* * Clones a Tensor so that the following conditions hold: * If we think of a Tensor of having size (B, M, N), where B is any number @@ -131,13 +92,13 @@ static inline c10::MaybeOwned borrow_else_clone(const bool cond, const T * broadcasted shape. */ static inline Tensor copyBatchedColumnMajor(const Tensor& src, int64_t nrows = -1, - c10::optional desired_batch_sizes = c10::nullopt) { + at::OptionalIntArrayRef desired_batch_sizes = c10::nullopt) { nrows = (nrows == -1) ? src.size(-2) : nrows; auto copy_sizes = desired_batch_sizes.has_value() ? 
desired_batch_sizes.value().vec() : IntArrayRef(src.sizes().data(), src.dim() - 2).vec(); copy_sizes.insert(copy_sizes.end(), {nrows, src.size(-1)}); - const auto copy_strides = contiguous_strides(copy_sizes, /*f-contig*/true); + const auto copy_strides = batched_matrix_contiguous_strides(copy_sizes, /*f-contig*/true); auto copy = at::empty_strided(copy_sizes, copy_strides, src.options()); copy.narrow(-2, 0, src.size(-2)).copy_(src); return copy; @@ -213,7 +174,7 @@ void batch_iterator_with_broadcasting(const Tensor& a, const Tensor& b, const fu auto a_broadcasts_over_b = (a_batch_sizes != b_batch_sizes); Tensor a_buffer, a_was_accessed, a_buffer_3d; std::function check_if_copy_needed_for_a - = [](int64_t a_curr_linear_batch_idx){}; + = [](int64_t /*a_curr_linear_batch_idx*/){}; if (a_broadcasts_over_b) { a_buffer = at::empty_strided(a.sizes(), a.strides(), a.options()) .copy_(a); @@ -467,14 +428,14 @@ static inline std::tuple _parse_qr_mode(c10::string_view mode) { } // Function to compute sizes, strides and the extra columns for the Q matrix in the QR Decomposition -static inline std::tuple, - std::vector, - int64_t> _compute_geometry_for_Q(const Tensor& input, bool reduced) { +static inline std::tuple _compute_geometry_for_Q( + const Tensor& input, + bool reduced) { int64_t m = input.size(-2), n = input.size(-1); int64_t n_columns_q; // We need to compute the required size of Q based on the `reduced` option - auto q_sizes = input.sizes().vec(); + DimVector q_sizes(input.sizes()); if (!reduced && m > n) { q_sizes[input.dim() - 1] = m; n_columns_q = m; @@ -482,7 +443,7 @@ static inline std::tuple, q_sizes[input.dim() - 1] = n; n_columns_q = std::min(m, n); } - auto q_strides = contiguous_strides_vec(q_sizes, /*f-contig*/true); + auto q_strides = batched_matrix_contiguous_strides(q_sizes, /*f-contig*/true); return std::make_tuple(q_sizes, q_strides, n_columns_q); } @@ -623,11 +584,49 @@ static inline bool linalg_solve_is_vector_rhs(const Tensor& input, const Tensor& return vector_case; } +/* + Computes linear indices for a tensor with original_shape to access its elements like it was a materialized broadcast tensor. +*/ +static inline Tensor get_linear_indices(int64_t numel, IntArrayRef original_shape, IntArrayRef broadcast_shape) { + TensorOptions options = at::TensorOptions().dtype(at::kLong).device(at::kCPU); + return at::arange(numel, options).view(original_shape).broadcast_to(broadcast_shape).contiguous(); +} + +class BroadcastLinearIndices { + private: + Tensor linear_indices_; + bool is_broadcasting_; + + public: + BroadcastLinearIndices( + int64_t numel, + IntArrayRef original_shape, + IntArrayRef broadcast_shape) { + // The assumption is that the broadcast_shape is a materialized broadcast + // shape of the original_shape. We need to compute the linear indices + // compatible with the original_shape to access the elements in the original + // tensor corresponding to the broadcast tensor. + is_broadcasting_ = !original_shape.equals(broadcast_shape); + if (is_broadcasting_) { + linear_indices_ = + get_linear_indices(numel, original_shape, broadcast_shape); + } + } + int64_t operator()(int64_t broadcast_linear_index) { + return is_broadcasting_ + ? 
linear_indices_.data_ptr()[broadcast_linear_index] + : broadcast_linear_index; + } +}; + static inline bool is_blas_compatible_column_major_order(const Tensor& input) { IntArrayRef input_strides = input.strides(); IntArrayRef input_sizes = input.sizes(); auto ndim = input.dim(); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ndim == 2 || ndim == 3); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ndim >= 2); + if (ndim > 3) { + return input.transpose(-2, -1).is_contiguous(); + } auto leading_dimension = input_strides[ndim - 1]; auto rows = input_sizes[ndim - 2]; bool batch_stride_compatible = true; @@ -645,7 +644,10 @@ static inline bool is_blas_compatible_row_major_order(const Tensor& input) { IntArrayRef input_strides = input.strides(); IntArrayRef input_sizes = input.sizes(); auto ndim = input.dim(); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ndim == 2 || ndim == 3); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ndim >= 2); + if (ndim > 3) { + return input.is_contiguous(); + } auto leading_dimension = input_strides[ndim - 2]; auto cols = input_sizes[ndim - 1]; bool batch_stride_compatible = true; diff --git a/aten/src/ATen/native/Loss.cpp b/aten/src/ATen/native/Loss.cpp index 1812e61febce..5358b83bdf22 100644 --- a/aten/src/ATen/native/Loss.cpp +++ b/aten/src/ATen/native/Loss.cpp @@ -8,6 +8,7 @@ #include #include #include +#include constexpr float EPSILON = 1e-12; @@ -39,6 +40,17 @@ TORCH_META_FUNC(smooth_l1_loss) maybe_get_output().resize_({}); } +TORCH_META_FUNC(mse_loss) +(const Tensor& input, const Tensor& target, const int64_t reduction) { + build_borrowing_binary_op(maybe_get_output(), input, target); + if (reduction == Reduction::None) { + return; + } + + TORCH_INTERNAL_ASSERT(reduction == Reduction::Mean || reduction == Reduction::Sum); + maybe_get_output().resize_({}); +} + } // namespace meta namespace native { @@ -70,6 +82,22 @@ TORCH_IMPL_FUNC(smooth_l1_loss_out) } } +TORCH_IMPL_FUNC(mse_loss_out) +(const Tensor& input, const Tensor& target, int64_t reduction, const Tensor& result) { + if (reduction != Reduction::None) { + Tensor loss; + auto iter = TensorIterator::borrowing_binary_op(loss, input, target); + mse_stub(iter.device_type(), iter); + if (reduction == Reduction::Mean) { + at::mean_out(const_cast(result), iter.output(), IntArrayRef{}); + } else { + at::sum_out(const_cast(result), iter.output(), IntArrayRef{}); + } + } else { + mse_stub(device_type(), *this); + } +} + Tensor cosine_embedding_loss(const Tensor& input1, const Tensor& input2, const Tensor& target, double margin, int64_t reduction) { auto targ_dim = target.dim(); TORCH_CHECK( @@ -310,30 +338,47 @@ Tensor binary_cross_entropy_with_logits(const Tensor& input, const Tensor& targe return apply_loss_reduction(loss, reduction); } -Tensor binary_cross_entropy_with_logits_backward(const Tensor& grad, const Tensor& input, const Tensor& target, const c10::optional& weight_opt, const c10::optional& pos_weight_opt, int64_t reduction) { +Tensor binary_cross_entropy_with_logits_backward( + const Tensor& grad, + const Tensor& input, + const Tensor& target, + const c10::optional& weight_opt, + const c10::optional& pos_weight_opt, + int64_t reduction) { // See [Note: hacky wrapper removal for optional tensor] - c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); + c10::MaybeOwned weight_maybe_owned = + at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; - const Tensor& pos_weight = c10::value_or_else(pos_weight_opt, [] {return Tensor();}); - - Tensor grad_input; - if (pos_weight.defined()) { - // 
pos_weight need to be broadcasted, thus mul(target) is not inplace. - auto t = pos_weight.mul(target); - grad_input = t.add(1).sub_(target).mul_(input.sigmoid()).sub_(t).mul_(grad); + const Tensor& pos_weight = + c10::value_or_else(pos_weight_opt, [] { return Tensor(); }); + + Tensor grad_input; + auto hasSubclassTensors = at::areAnyTensorSubclassLike({grad, input, target}); + + // If there are subclassed tensors use the out of place version + if (pos_weight.defined()) { + // pos_weight might need to be broadcasted, thus mul(target) is not inplace. + auto t = pos_weight.mul(target); + grad_input = hasSubclassTensors + ? t.add(1).sub(target).mul(input.sigmoid()).sub(t).mul(grad) + : t.add(1).sub_(target).mul_(input.sigmoid()).sub_(t).mul_(grad); + } else { + grad_input = hasSubclassTensors ? (input.sigmoid() - target).mul(grad) + : (input.sigmoid() - target).mul_(grad); + } + if (weight.defined()) { + if (at::areAnyTensorSubclassLike({grad_input, weight})) { + grad_input = grad_input.mul(weight); } else { - grad_input = (input.sigmoid() - target).mul_(grad); - } - - if (weight.defined()) { - grad_input.mul_(weight); + grad_input.mul_(weight); } + } - if (reduction == at::Reduction::Mean) { - return grad_input / input.numel(); - } + if (reduction == at::Reduction::Mean) { + return grad_input / input.numel(); + } - return grad_input; + return grad_input; } Tensor poisson_nll_loss(const Tensor& input, const Tensor& target, const bool log_input, const bool full, const double eps, const int64_t reduction) @@ -454,30 +499,6 @@ Tensor& huber_loss_backward_out(const Tensor& grad_output, const Tensor& input, return grad_input; } -Tensor mse_loss(const Tensor& input, const Tensor& target, int64_t reduction) { - Tensor loss; - auto iter = TensorIterator::borrowing_binary_op(loss, input, target); - mse_stub(iter.device_type(), iter); - return apply_loss_reduction(iter.output(), reduction); -} - -Tensor& mse_loss_out(const Tensor& input, const Tensor& target, int64_t reduction, Tensor&result) { - if (reduction != Reduction::None) { - Tensor loss; - auto iter = TensorIterator::borrowing_binary_op(loss, input, target); - mse_stub(iter.device_type(), iter); - if (reduction == Reduction::Mean) { - at::mean_out(result, iter.output(), 0); - } else { - at::sum_out(result, iter.output(), 0); - } - } else { - auto iter = TensorIterator::borrowing_binary_op(result, input, target); - mse_stub(iter.device_type(), iter); - } - return result; -} - Tensor mse_loss_backward(const Tensor& grad_output, const Tensor& input, const Tensor& target, int64_t reduction) { Tensor grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); return at::mse_loss_backward_out(grad_input, grad_output, input, target, reduction); @@ -497,7 +518,7 @@ Tensor& mse_loss_backward_out(const Tensor& grad_output, } Tensor l1_loss(const Tensor& input, const Tensor& target, int64_t reduction) { - const auto float_type = c10::toValueType(input.scalar_type()); + const auto float_type = c10::toRealValueType(input.scalar_type()); Tensor result = at::empty({0}, input.options().dtype(float_type)); return at::l1_loss_out(result, input, target, reduction); } diff --git a/aten/src/ATen/native/LossNLL.cpp b/aten/src/ATen/native/LossNLL.cpp index ed733411ff53..212f28bca23e 100644 --- a/aten/src/ATen/native/LossNLL.cpp +++ b/aten/src/ATen/native/LossNLL.cpp @@ -463,7 +463,8 @@ Tensor cross_entropy_loss_prob_target( const Tensor& weight, int64_t reduction, double label_smoothing) { - const auto n_classes = self.size(1); + const auto class_dim = 
self.dim() == 1 ? 0 : 1; + const auto n_classes = self.size(class_dim); TORCH_CHECK( !weight.defined() || (weight.dim() == 1 && weight.numel() == n_classes), "cross_entropy: weight tensor should be defined either for all ", @@ -472,7 +473,7 @@ Tensor cross_entropy_loss_prob_target( " but got weight tensor of shape: ", weight.sizes()); - auto input = at::log_softmax(self, 1, self.scalar_type()); + auto input = at::log_softmax(self, class_dim, self.scalar_type()); Tensor target; if (label_smoothing > 0.0) { @@ -484,29 +485,40 @@ Tensor cross_entropy_loss_prob_target( if (weight.defined()) { // Expand weight to the correct number of dims for broadcasting with input / target - auto weight_broadcast_shape = SmallBuffer(input.dim()); - std::fill(weight_broadcast_shape.begin(), weight_broadcast_shape.end(), 1); - weight_broadcast_shape[1] = weight.size(0); - Tensor weight_ = weight.view(weight_broadcast_shape); + Tensor weight_ = weight; + if (input.dim() > 1) { + auto weight_broadcast_shape = SmallBuffer(input.dim()); + std::fill(weight_broadcast_shape.begin(), weight_broadcast_shape.end(), 1); + weight_broadcast_shape[1] = weight.size(0); + weight_ = weight.view(weight_broadcast_shape); + } switch (reduction) { case Reduction::Mean: - return -(input * target * weight_).sum() / (input.numel() / input.size(1)); + if (input.numel()==0){ + return -(input * target * weight_).sum().fill_(std::numeric_limits::quiet_NaN()); + } else { + return -(input * target * weight_).sum() / (input.numel() / n_classes); + } case Reduction::Sum: return -(input * target * weight_).sum(); case Reduction::None: - return -(input * target * weight_).sum(1); + return -(input * target * weight_).sum(class_dim); default: TORCH_CHECK(false, "Invalid reduction type encountered in cross_entropy: ", reduction); } } else { switch (reduction) { case Reduction::Mean: - return -(input * target).sum() / (input.numel() / input.size(1)); + if (input.numel()==0){ + return -(input * target).sum().fill_(std::numeric_limits::quiet_NaN()); + } else { + return -(input * target).sum() / (input.numel() / n_classes); + } case Reduction::Sum: return -(input * target).sum(); case Reduction::None: - return -(input * target).sum(1); + return -(input * target).sum(class_dim); default: TORCH_CHECK(false, "Invalid reduction type encountered in cross_entropy: ", reduction); } diff --git a/aten/src/ATen/native/Math.h b/aten/src/ATen/native/Math.h index 09255e065879..ee10d00f9b5c 100644 --- a/aten/src/ATen/native/Math.h +++ b/aten/src/ATen/native/Math.h @@ -12,6 +12,7 @@ #include #include #include +#include C10_CLANG_DIAGNOSTIC_PUSH() #if C10_CLANG_HAS_WARNING("-Wimplicit-float-conversion") @@ -67,6 +68,83 @@ Output was modified to be inf or -inf when input is 1 or -1. */ POSSIBILITY OF SUCH DAMAGE. */ +namespace { +/* + * This function is derived from the implementation of the i0e function in the + * Cephes Math Library. See note [3-Clause BSD License for the Cephes Math + * Library]. + * + * Computes an approximation of the exponentially scaled zeroth order modified + * Bessel function of the first kind. The approximation is actually two + * (sub)approximations, both using a Chebyshev polynomial expansion. One + * approximates the function over [0, 8], and the other over (8, infinity). This + * function takes the absolute value of all inputs to convert them into the + * domain of the approximation. 
+ */ +jiterator_also_stringify_as(jiterator_code( + template + JITERATOR_HOST_DEVICE T chbevl(T x, const T array[], const int len) { + T b0, b1, b2; + + b0 = array[0]; + b1 = 0; + + for (int i = 1; i < len; ++i) { + b2 = b1; + b1 = b0; + b0 = x * b1 - b2 + array[i]; + } + + return T{0.5} * (b0 - b2); + } + + template + JITERATOR_HOST_DEVICE T calc_i0e(T _x) { + T x = fabs(_x); + + if (x <= T{8.0}) { + static const T coefficients[] = { + -4.41534164647933937950E-18, 3.33079451882223809783E-17, + -2.43127984654795469359E-16, 1.71539128555513303061E-15, + -1.16853328779934516808E-14, 7.67618549860493561688E-14, + -4.85644678311192946090E-13, 2.95505266312963983461E-12, + -1.72682629144155570723E-11, 9.67580903537323691224E-11, + -5.18979560163526290666E-10, 2.65982372468238665035E-9, + -1.30002500998624804212E-8, 6.04699502254191894932E-8, + -2.67079385394061173391E-7, 1.11738753912010371815E-6, + -4.41673835845875056359E-6, 1.64484480707288970893E-5, + -5.75419501008210370398E-5, 1.88502885095841655729E-4, + -5.76375574538582365885E-4, 1.63947561694133579842E-3, + -4.32430999505057594430E-3, 1.05464603945949983183E-2, + -2.37374148058994688156E-2, 4.93052842396707084878E-2, + -9.49010970480476444210E-2, 1.71620901522208775349E-1, + -3.04682672343198398683E-1, 6.76795274409476084995E-1}; + + T y = (x / T{2.0}) - T{2.0}; + return chbevl(y, coefficients, int{30}); + } + + // x > 8 + static const T coefficients[] = { + -7.23318048787475395456E-18, -4.83050448594418207126E-18, + 4.46562142029675999901E-17, 3.46122286769746109310E-17, + -2.82762398051658348494E-16, -3.42548561967721913462E-16, + 1.77256013305652638360E-15, 3.81168066935262242075E-15, + -9.55484669882830764870E-15, -4.15056934728722208663E-14, + 1.54008621752140982691E-14, 3.85277838274214270114E-13, + 7.18012445138366623367E-13, -1.79417853150680611778E-12, + -1.32158118404477131188E-11, -3.14991652796324136454E-11, + 1.18891471078464383424E-11, 4.94060238822496958910E-10, + 3.39623202570838634515E-9, 2.26666899049817806459E-8, + 2.04891858946906374183E-7, 2.89137052083475648297E-6, + 6.88975834691682398426E-5, 3.36911647825569408990E-3, + 8.04490411014108831608E-1}; + + return chbevl(T{32.0} / x - T{2.0}, coefficients, int{25}) / sqrt(x); + }), + i0e_string); // i0e_string +} + #define CENTRAL_RANGE 0.7 template @@ -1385,37 +1463,6 @@ calc_i0(T _x) { // Upcast bfloat16 input to float for numerical accuracy purposes static inline c10::BFloat16 calc_i0(c10::BFloat16 a) { return calc_i0(static_cast(a)); } -/* - * This function is derived from the implementation of the i0e function in the Cephes Math Library. - * See note [3-Clause BSD License for the Cephes Math Library]. - * - * Computes an approximation of the exponentially scaled zeroth order modified Bessel function of the first kind. - * The approximation is actually two (sub)approximations, both using a Chebyshev polynomial expansion. - * One approximates the function over [0, 8], and the other over (8, infinity). This function takes the absolute value - * of all inputs to convert them into the domain of the approximation. 
- */ -template -static inline typename std::enable_if::value, T>::type -calc_i0e(T _x) { - T x = std::abs(_x); - - if (x <= T{8.0}) { - auto coeff_pair = chebyshev_coefficients_i0e_A(); - auto A = std::get<0>(coeff_pair); - auto len = std::get<1>(coeff_pair); - T y = (x / T{2.0}) - T{2.0}; - return chbevl(y, A, len); - } - - auto coeff_pair = chebyshev_coefficients_i0e_B(); - auto B = std::get<0>(coeff_pair); - auto len = std::get<1>(coeff_pair); - return chbevl(T{32.0} / x - T{2.0}, B, len) / std::sqrt(x); -} - -// Upcast bfloat16 input to float for numerical accuracy purposes -static inline c10::BFloat16 calc_i0e(c10::BFloat16 a) { return calc_i0e(static_cast(a)); } - /* * This function is derived from the implementation of the i1 function in the Cephes Math Library. * See note [3-Clause BSD License for the Cephes Math Library]. @@ -2113,4 +2160,21 @@ calc_erfcx(T x) } } +/* + * Logarithm of Gaussian cumulative distribution function. + + * This implementation of log_ndtr and its helper functions + * follow SciPy's implementation + * See NOTICE for the licenses. + */ +template +static inline C10_HOST_DEVICE T calc_log_ndtr(T x) { + T t = x * M_SQRT1_2; + if (x < T{-1.0}) { + return std::log(calc_erfcx(-t) / 2) - t * t; + } else { + return std::log1p(-std::erfc(t) / 2); + } +} + C10_CLANG_DIAGNOSTIC_POP() diff --git a/aten/src/ATen/native/MaxPooling.h b/aten/src/ATen/native/MaxPooling.h index c429c8e667bc..e133ad5939c8 100644 --- a/aten/src/ATen/native/MaxPooling.h +++ b/aten/src/ATen/native/MaxPooling.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/aten/src/ATen/native/MaxUnpooling.cpp b/aten/src/ATen/native/MaxUnpooling.cpp index 6d395d9078c6..27d4e1a93c81 100644 --- a/aten/src/ATen/native/MaxUnpooling.cpp +++ b/aten/src/ATen/native/MaxUnpooling.cpp @@ -185,132 +185,8 @@ Tensor max_unpooling3d_forward_cpu( return output; } -Tensor& max_unpooling2d_backward_out_cpu(const Tensor& grad_output_, - const Tensor& self, - const Tensor& indices_, - IntArrayRef output_size, - Tensor& grad_input) { - TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous"); - int64_t oheight = output_size[0]; - int64_t owidth = output_size[1]; - int64_t ndim = self.ndimension(); - int64_t dimh = ndim == 3 ? 1 : 2; - int64_t dimw = ndim == 3 ? 2 : 3; - - TORCH_CHECK( - indices_.scalar_type() == at::ScalarType::Long, - "elements in indices should be type int64 but got type: ", indices_.scalar_type()); - TORCH_CHECK( - self.sizes() == indices_.sizes(), - "Expected shape of indices to be same as that of the input tensor (", - self.sizes(), ") but got indices tensor with shape: ", indices_.sizes()); - TORCH_CHECK(output_size.size() == 2, "Output size must be 2 but got: ", output_size.size()); - - auto memory_format = self.suggest_memory_format(); - auto grad_output = grad_output_.contiguous(memory_format); - auto indices = indices_.contiguous(memory_format); - - grad_input.resize_(self.sizes(), memory_format); - grad_input.zero_(); - - if (owidth != grad_output.size(dimw) || oheight != grad_output.size(dimh)) { - AT_ERROR( - "Inconsistent gradOutput size. 
output height = ", - oheight, - ", output width = ", - owidth, - ", gradOutput: ", - grad_output.size(dimh), - "x", - grad_output.size(dimw)); - } - - if (grad_input.numel() != 0) { - max_unpool2d_backward_kernel(kCPU, grad_input, grad_output, indices); - } - - return grad_input; -} - -Tensor max_unpooling2d_backward_cpu( - const Tensor& grad_output, - const Tensor& self, - const Tensor& indices, - IntArrayRef output_size) { - auto grad_input = at::empty({0}, self.options()); - max_unpooling2d_backward_out_cpu( - grad_output, self, indices, output_size, grad_input); - return grad_input; -} - -Tensor& max_unpooling3d_backward_out_cpu( - const Tensor& grad_output_, - const Tensor& self, - const Tensor& indices_, - IntArrayRef output_size, - IntArrayRef stride, - IntArrayRef padding, - Tensor& grad_input) { - TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous"); - int64_t oT = output_size[0]; - int64_t oH = output_size[1]; - int64_t oW = output_size[2]; - int64_t ndim = self.ndimension(); - int64_t dimt = ndim == 4 ? 1 : 2; - int64_t dimh = ndim == 4 ? 2 : 3; - int64_t dimw = ndim == 4 ? 3 : 4; - - max_unpooling3d_shape_check( - self, grad_output_, indices_, output_size, stride, padding, "max_unpooling3d_backward_out_cpu()"); - - /* get contiguous gradOutput */ - auto grad_output = grad_output_.contiguous(); - auto indices = indices_.contiguous(); - - /* resize */ - grad_input.resize_as_(self); - grad_input.zero_(); - - if (oW != grad_output.size(dimw) || oH != grad_output.size(dimh) || oT != grad_output.size(dimt)) { - AT_ERROR( - "Inconsistent gradOutput size. output depth = ", - oT, - ", output height = ", - oH, - ", output width = ", - oW, - ", gradOutput: ", - grad_output.size(dimt), - "x", - grad_output.size(dimh), - "x", - grad_output.size(dimw)); - } - - if (grad_input.numel() != 0) { - max_unpool3d_backward_kernel(kCPU, grad_input, grad_output, indices); - } - - return grad_input; -} - -Tensor max_unpooling3d_backward_cpu( - const Tensor& grad_output, - const Tensor& self, - const Tensor& indices, - IntArrayRef output_size, - IntArrayRef stride, - IntArrayRef padding) { - auto grad_input = at::empty({0}, self.options()); - at::native::max_unpooling3d_backward_out_cpu( - grad_output, self, indices, output_size, stride, padding, grad_input); - return grad_input; -} - DEFINE_DISPATCH(max_unpool2d_kernel); -DEFINE_DISPATCH(max_unpool2d_backward_kernel); DEFINE_DISPATCH(max_unpool3d_kernel); -DEFINE_DISPATCH(max_unpool3d_backward_kernel); } // namespace native } // namespace at diff --git a/aten/src/ATen/native/NaiveDilatedConvolution.cpp b/aten/src/ATen/native/NaiveDilatedConvolution.cpp index 68eaa372b7ee..fa7b30f5977e 100644 --- a/aten/src/ATen/native/NaiveDilatedConvolution.cpp +++ b/aten/src/ATen/native/NaiveDilatedConvolution.cpp @@ -24,7 +24,8 @@ void hvol2col( const IntArrayRef stride_size, const IntArrayRef pad_size, const IntArrayRef dilation_size, - Dtype* data_col) { + Dtype* data_col, + bool is_channels_last = false) { if (dim == 3) { vol2col( data_hvol, @@ -65,7 +66,8 @@ void hvol2col( stride_size[1], dilation_size[0], dilation_size[1], - data_col); + data_col, + is_channels_last); } } @@ -80,7 +82,8 @@ void col2hvol( const IntArrayRef stride_size, const IntArrayRef pad_size, const IntArrayRef dilation_size, - Dtype* data_hvol) { + Dtype* data_hvol, + bool is_channels_last = false) { if (dim == 3) { col2vol( data_col, @@ -121,7 +124,8 @@ void col2hvol( stride_size[1], dilation_size[0], dilation_size[1], - data_hvol); + data_hvol, + is_channels_last); } } 
@@ -167,7 +171,8 @@ void slow_conv_dilated_all_cpu_template( IntArrayRef kernel_size, IntArrayRef stride_size, IntArrayRef pad_size, - IntArrayRef dilation_size) { + IntArrayRef dilation_size, + bool is_channels_last = false) { slow_conv_dilated_location_check(input, weight, bias, grad_output); auto options = input.options(); // The rear part of input tensor sizes: @@ -183,7 +188,11 @@ void slow_conv_dilated_all_cpu_template( if (output.defined() || grad_weight.defined() || grad_input.defined()) { const int64_t m = c10::multiply_integers(kernel_size); const int64_t n = c10::multiply_integers(output_size); - columns.resize_({nInputPlane * m, n}); + if (is_channels_last) { + columns.resize_({n, m * nInputPlane}); + } else { + columns.resize_({nInputPlane * m, n}); + } } // Initialize if (grad_weight.defined()) { @@ -200,7 +209,8 @@ void slow_conv_dilated_all_cpu_template( std::vector dims(dim); std::iota(dims.begin(), dims.end(), 1); - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Long, input.scalar_type(), "slow_conv_dilated<>", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Long, at::ScalarType::BFloat16, input.scalar_type(), "slow_conv_dilated<>", [&] { // For each elt in batch, do: for (const auto elt : c10::irange(batchSize)) { // Matrix multiply per output: @@ -246,7 +256,8 @@ void slow_conv_dilated_all_cpu_template( stride_size, pad_size, dilation_size, - columns.data_ptr()); + columns.data_ptr(), + is_channels_last); /* Compute: @@ -265,25 +276,47 @@ void slow_conv_dilated_all_cpu_template( gemm assumes column-major matrices: + channels last: + output_n^T = weight *columns^T + output_n^T + C = alpha * op(A) * op(B) + beta * C + op(A) = 't', op(B) = 'n', alpha=1, beta=1 + + channels first: output_n^T = columns^T * weight^T + output_n^T C = alpha * op(A) * op(B) + beta * C op(A) = 'n', op(B) = 'n', alpha=1, beta=1 */ - cpublas::gemm( - /*transa=*/TransposeType::NoTranspose, - /*transb=*/TransposeType::NoTranspose, - /* m=*/columns.size(1), - /* n=*/nOutputPlane, - /* k=*/columns.size(0), - /* alpha=*/1, - /* A=*/columns.data_ptr(), - /* lda=*/columns.size(1), - /* B=*/weight.data_ptr(), - /* ldb=*/columns.size(0), - /* beta=*/1, - /* C=*/output_n.data_ptr(), - /* ldc=*/columns.size(1)); - + if (is_channels_last) { + cpublas::gemm( + /*transa=*/TransposeType::Transpose, + /*transb=*/TransposeType::NoTranspose, + /* m=*/nOutputPlane, + /* n=*/columns.size(0), + /* k=*/columns.size(1), + /* alpha=*/static_cast(1), + /* A=*/weight.data_ptr(), + /* lda=*/columns.size(1), + /* B=*/columns.data_ptr(), + /* lda=*/columns.size(1), + /* beta=*/static_cast(1), + /* C=*/output_n.data_ptr(), + /* ldc=*/nOutputPlane); + } else { + cpublas::gemm( + /*transa=*/TransposeType::NoTranspose, + /*transb=*/TransposeType::NoTranspose, + /* m=*/columns.size(1), + /* n=*/nOutputPlane, + /* k=*/columns.size(0), + /* alpha=*/static_cast(1), + /* A=*/columns.data_ptr(), + /* lda=*/columns.size(1), + /* B=*/weight.data_ptr(), + /* ldb=*/columns.size(0), + /* beta=*/static_cast(1), + /* C=*/output_n.data_ptr(), + /* ldc=*/columns.size(1)); + } } else { // All gradients grad_output_n = grad_output.select(0, elt); @@ -309,24 +342,47 @@ void slow_conv_dilated_all_cpu_template( gemm assumes column-major matrices: + channels last: + columns^T = weight^T * grad_output_n^T + C = alpha * op(A) * op(B) + beta * C + op(A) = 'n', op(B) = 'n', alpha=1, beta=0 + + channels first: columns^T = grad_output_n^T * weight C = alpha * op(A) * op(B) + beta * C op(A) = 'n', op(B) = 't', alpha=1, beta=0 */ - cpublas::gemm( - 
/*transa=*/TransposeType::NoTranspose, - /*transb=*/TransposeType::Transpose, - /* m=*/columns.size(1), - /* n=*/columns.size(0), - /* k=*/nOutputPlane, - /* alpha=*/1, - /* A=*/grad_output_n.data_ptr(), - /* lda=*/columns.size(1), - /* B=*/weight.data_ptr(), - /* ldb=*/columns.size(0), - /* beta=*/0, - /* C=*/columns.data_ptr(), - /* ldc=*/columns.size(1)); + if (is_channels_last) { + cpublas::gemm( + /*transa=*/TransposeType::NoTranspose, + /*transb=*/TransposeType::NoTranspose, + /* m=*/columns.size(1), + /* n=*/columns.size(0), + /* k=*/nOutputPlane, + /* alpha=*/static_cast(1), + /* A=*/weight.data_ptr(), + /* lda=*/columns.size(1), + /* B=*/grad_output_n.data_ptr(), + /* ldb=*/nOutputPlane, + /* beta=*/static_cast(0), + /* C=*/columns.data_ptr(), + /* ldc=*/columns.size(1)); + } else { + cpublas::gemm( + /*transa=*/TransposeType::NoTranspose, + /*transb=*/TransposeType::Transpose, + /* m=*/columns.size(1), + /* n=*/columns.size(0), + /* k=*/nOutputPlane, + /* alpha=*/static_cast(1), + /* A=*/grad_output_n.data_ptr(), + /* lda=*/columns.size(1), + /* B=*/weight.data_ptr(), + /* ldb=*/columns.size(0), + /* beta=*/static_cast(0), + /* C=*/columns.data_ptr(), + /* ldc=*/columns.size(1)); + } // Unpack columns back into input: Tensor grad_input_n = grad_input.select(0, elt); @@ -339,7 +395,8 @@ void slow_conv_dilated_all_cpu_template( stride_size, pad_size, dilation_size, - grad_input_n.data_ptr()); + grad_input_n.data_ptr(), + is_channels_last); } // Gradient of weight: @@ -354,7 +411,8 @@ void slow_conv_dilated_all_cpu_template( stride_size, pad_size, dilation_size, - columns.data_ptr()); + columns.data_ptr(), + is_channels_last); scalar_t scale = 1; // TODO: expose as argument? /* Compute: @@ -374,24 +432,47 @@ void slow_conv_dilated_all_cpu_template( gemm assumes column-major matrices: - grad_weight^T = scale * columns * grad_output_n^T + - grad_weight^T C = alpha * op(A) * op(B) + beta * C op(A) = 't', - op(B) = 'n', alpha=scale, beta=1 + channels last: + grad_weight^T = scale * columns^T * grad_output_n + grad_weight^T + C = alpha * op(A) * op(B) + beta * C + op(A) = 'n', op(B) = 't', alpha=scale, beta=1 + + channels first: + grad_weight^T = scale * columns * grad_output_n^T + grad_weight^T + C = alpha * op(A) * op(B) + beta * C + op(A) = 't', op(B) = 'n', alpha=scale, beta=1 */ - cpublas::gemm( - /*transa=*/TransposeType::Transpose, - /*transb=*/TransposeType::NoTranspose, - /* m=*/columns.size(0), - /* n=*/nOutputPlane, - /* k=*/columns.size(1), - /* alpha=*/scale, - /* A=*/columns.data_ptr(), - /* lda=*/columns.size(1), - /* B=*/grad_output_n.data_ptr(), - /* ldb=*/columns.size(1), - /* beta=*/1, - /* C=*/grad_weight.data_ptr(), - /* ldc=*/columns.size(0)); + if (is_channels_last) { + cpublas::gemm( + /*transa=*/TransposeType::NoTranspose, + /*transb=*/TransposeType::Transpose, + /* m=*/columns.size(1), + /* n=*/nOutputPlane, + /* k=*/columns.size(0), + /* alpha=*/static_cast(scale), + /* A=*/columns.data_ptr(), + /* lda=*/columns.size(1), + /* B=*/grad_output_n.data_ptr(), + /* ldb=*/nOutputPlane, + /* beta=*/static_cast(1), + /* C=*/grad_weight.data_ptr(), + /* ldc=*/columns.size(1)); + } else { + cpublas::gemm( + /*transa=*/TransposeType::Transpose, + /*transb=*/TransposeType::NoTranspose, + /* m=*/columns.size(0), + /* n=*/nOutputPlane, + /* k=*/columns.size(1), + /* alpha=*/static_cast(scale), + /* A=*/columns.data_ptr(), + /* lda=*/columns.size(1), + /* B=*/grad_output_n.data_ptr(), + /* ldb=*/columns.size(1), + /* beta=*/static_cast(1), + /* C=*/grad_weight.data_ptr(), + 
/* ldc=*/columns.size(0)); + } } // Gradient of bias: @@ -441,6 +522,9 @@ Tensor slow_conv_dilated2d_cpu( c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; + bool use_channels_last = thnn_conv_use_channels_last(input, weight); + auto memory_format = use_channels_last ? at::MemoryFormat::ChannelsLast : at::MemoryFormat::Contiguous; + Tensor undefined; internal::slow_conv_dilated_shape_check<2>( input, @@ -459,10 +543,10 @@ Tensor slow_conv_dilated2d_cpu( // template function assumes batched tensors. unsqueeze(0) will // insert batch dimension without affecting the original tensor. const Tensor input_ = - (is_batch ? input.contiguous() : input.contiguous().unsqueeze(0)); - const Tensor weight_ = weight.contiguous(); + (is_batch ? input.contiguous(memory_format) : input.contiguous().unsqueeze(0)); + const Tensor weight_ = weight.contiguous(memory_format); const Tensor bias_ = (bias.defined() ? bias.contiguous() : undefined); - Tensor output = at::empty(output_size, options); + Tensor output = at::empty(output_size, options.memory_format(memory_format)); Tensor output_ = (is_batch ? output : output.unsqueeze(0)); slow_conv_dilated_all_cpu_template<2>( @@ -477,7 +561,8 @@ Tensor slow_conv_dilated2d_cpu( kernel_size, stride_size, pad_size, - dilation_size); + dilation_size, + use_channels_last); return output; } @@ -541,6 +626,9 @@ std::tuple slow_conv_dilated2d_backward_cpu( IntArrayRef pad_size, IntArrayRef dilation_size, const std::array output_mask) { + bool use_channels_last = thnn_conv_use_channels_last(input, weight); + auto memory_format = use_channels_last ? at::MemoryFormat::ChannelsLast : at::MemoryFormat::Contiguous; + Tensor undefined; internal::slow_conv_dilated_shape_check<2>( input, @@ -556,16 +644,16 @@ std::tuple slow_conv_dilated2d_backward_cpu( // template function assumes batched tensors. unsqueeze(0) will // insert batch dimension without affecting the original tensor. const Tensor grad_output_ = - (is_batch ? grad_output.contiguous() + (is_batch ? grad_output.contiguous(memory_format) : grad_output.contiguous().unsqueeze(0)); const Tensor input_ = - (is_batch ? input.contiguous() : input.contiguous().unsqueeze(0)); - const Tensor weight_ = weight.contiguous(); + (is_batch ? input.contiguous(memory_format) : input.contiguous().unsqueeze(0)); + const Tensor weight_ = weight.contiguous(memory_format); // compute only gradients for which the corresponding output_mask is true: Tensor grad_input = - (output_mask[0] ? at::empty(input.sizes(), options) : undefined); + (output_mask[0] ? at::empty(input.sizes(), options.memory_format(memory_format)) : undefined); Tensor grad_weight = - (output_mask[1] ? at::empty(weight.sizes(), options) : undefined); + (output_mask[1] ? at::empty(weight.sizes(), options.memory_format(memory_format)) : undefined); Tensor grad_bias = (output_mask[2] ? 
at::empty(weight.size(0), options) : undefined); Tensor grad_input_ = @@ -583,7 +671,8 @@ std::tuple slow_conv_dilated2d_backward_cpu( kernel_size, stride_size, pad_size, - dilation_size); + dilation_size, + use_channels_last); return std::tie(grad_input, grad_weight, grad_bias); } diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index fdce903c0806..7cdf38192708 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -26,7 +27,7 @@ TORCH_META_FUNC(renorm)(const Tensor& self, const Scalar& p, int64_t dim, const TORCH_CHECK(maxnorm.toDouble() >= 0.0, "renorm: expected maxnorm to be >= 0 but got ", maxnorm.toDouble()); const auto ndim = self.dim(); - TORCH_CHECK(ndim > 1, "renorm: input needs at least 2 dimensions, got ", ndim, "dimensions"); + TORCH_CHECK(ndim > 1, "renorm: input needs at least 2 dimensions, got ", ndim, " dimensions"); set_output(self.sizes(), self.options()); } @@ -82,7 +83,7 @@ static inline MemoryFormat suggest_memory_format_contig(const Tensor& t) { return t.is_contiguous() ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast; } -template +template std::tuple batch_norm_cpu_transform_input_template( const Tensor& input, const Tensor& weight, const Tensor& bias, const Tensor& save_mean /* optional */, const Tensor& save_invstd /* optional */, @@ -122,10 +123,12 @@ std::tuple batch_norm_cpu_transform_input_template( return 1 / at::sqrt(running_var + eps); } }()); + const bool mixed_type = !std::is_same::value; + const auto dtype = mixed_type ? kFloat : input.scalar_type(); auto w = weight.defined() ? as_nd(weight) : - at::detail::scalar_tensor_static(1, input.scalar_type(), kCPU); + at::detail::scalar_tensor_static(1, dtype, kCPU); auto b = bias.defined() ? as_nd(bias) : - at::detail::scalar_tensor_static(0, input.scalar_type(), kCPU); + at::detail::scalar_tensor_static(0, dtype, kCPU); Tensor output = at::empty_like(input, input.suggest_memory_format()); auto iter = TensorIteratorConfig() @@ -135,15 +138,17 @@ std::tuple batch_norm_cpu_transform_input_template( .add_input(invstd) .add_input(w) .add_input(b) + .check_all_same_dtype(false) + .promote_inputs_to_common_dtype(false) .build(); - cpu_kernel(iter, [=](scalar_t input, scalar_t mean, scalar_t invstd, scalar_t weight, scalar_t bias) { + cpu_kernel(iter, [=](scalar_t input, param_t mean, param_t invstd, param_t weight, param_t bias) { return ((input - mean) * invstd) * weight + bias; }); return std::make_tuple(output, save_mean, save_invstd); } -template class VarTransform> +template class VarTransform> std::tuple batch_norm_cpu_update_stats_template( const Tensor& input, const Tensor& running_mean, const Tensor& running_var, double momentum, double eps) { @@ -161,20 +166,26 @@ std::tuple batch_norm_cpu_update_stats_template( reduce_dims[i - 1] = i; } - Tensor save_mean = at::mean(input, /*dims=*/reduce_dims); - Tensor save_var_transform = at::empty({n_input}, input.options()); - auto save_mean_a = save_mean.accessor(); - auto save_var_transform_a = save_var_transform.accessor(); + bool all_contiguous = is_contiguous(input); + const bool mixed_type = !std::is_same::value; + const auto dtype = mixed_type ? 
kFloat : input.scalar_type(); - auto running_mean_a = conditional_accessor_1d(running_mean); - auto running_var_a = conditional_accessor_1d(running_var); + // For contiguous case, leave 'mean' computation to kernel + Tensor save_mean = all_contiguous + ? at::empty({n_input}, input.options().dtype(dtype)) + : at::mean(input, /*dim=*/reduce_dims, /*keepdim=*/false, dtype); + Tensor save_var_transform = at::empty({n_input}, input.options().dtype(dtype)); + auto save_mean_a = save_mean.accessor(); + auto save_var_transform_a = save_var_transform.accessor(); + + auto running_mean_a = conditional_accessor_1d(running_mean); + auto running_var_a = conditional_accessor_1d(running_var); - bool all_contiguous = is_contiguous(input); if (all_contiguous) { - auto _mean = at::empty({n_input}, input.options()); - auto _var_sum = at::empty({n_input}, input.options()); - auto _mean_a = _mean.accessor(); - auto _var_sum_a = _var_sum.accessor(); + auto _mean = at::empty({n_input}, input.options().dtype(dtype)); + auto _var_sum = at::empty({n_input}, input.options().dtype(dtype)); + auto _mean_a = _mean.accessor(); + auto _var_sum_a = _var_sum.accessor(); batch_norm_cpu_collect_stats_stub(kCPU, _mean, _var_sum, input); @@ -203,6 +214,8 @@ std::tuple batch_norm_cpu_update_stats_template( .add_input(input) .resize_outputs(false) .declare_static_shape(input.sizes(), /*squash_dims=*/1) + .check_all_same_dtype(false) + .promote_inputs_to_common_dtype(false) .build(); parallel_for(0, n_input, 1, [&](int64_t b_begin, int64_t b_end) { @@ -230,7 +243,7 @@ std::tuple batch_norm_cpu_update_stats_template( return std::make_tuple(save_mean, save_var_transform); } -template +template std::tuple batch_norm_backward_cpu_template( const Tensor& grad_out_, const Tensor& input, const Tensor& weight, const Tensor& running_mean, const Tensor& running_var, const Tensor& save_mean, const Tensor& save_invstd, @@ -238,6 +251,9 @@ std::tuple batch_norm_backward_cpu_template( using accscalar_t = at::acc_type; + const bool mixed_type = !std::is_same::value; + const auto dtype = mixed_type ? 
kFloat : input.scalar_type(); + Tensor grad_input; Tensor grad_weight; Tensor grad_bias; @@ -245,10 +261,10 @@ std::tuple batch_norm_backward_cpu_template( grad_input = at::empty_like(input, input.suggest_memory_format()); } if (grad_input_mask[1]) { - grad_weight = at::empty_like(weight, at::MemoryFormat::Contiguous); + grad_weight = at::empty({input.size(1)}, input.options().dtype(dtype)); } if (grad_input_mask[2]) { - grad_bias = at::empty({input.size(1)}, input.options()); + grad_bias = at::empty({input.size(1)}, input.options().dtype(dtype)); } // since we are directly manipulating pointers in contiguous path, @@ -266,18 +282,18 @@ std::tuple batch_norm_backward_cpu_template( return std::make_tuple(grad_input, grad_weight, grad_bias); } - auto weight_a = conditional_accessor_1d(weight); - auto grad_weight_a = conditional_accessor_1d(grad_weight); - auto grad_bias_a = conditional_accessor_1d(grad_bias); + auto weight_a = conditional_accessor_1d(weight); + auto grad_weight_a = conditional_accessor_1d(grad_weight); + auto grad_bias_a = conditional_accessor_1d(grad_bias); int64_t n_input = input.size(1); int64_t n = input.numel() / n_input; - auto save_mean_a = conditional_accessor_1d(save_mean); - auto save_invstd_a = conditional_accessor_1d(save_invstd); + auto save_mean_a = conditional_accessor_1d(save_mean); + auto save_invstd_a = conditional_accessor_1d(save_invstd); - auto running_mean_a = conditional_accessor_1d(running_mean); - auto running_var_a = conditional_accessor_1d(running_var); + auto running_mean_a = conditional_accessor_1d(running_mean); + auto running_var_a = conditional_accessor_1d(running_var); const int64_t ndim = input.dim(); @@ -332,9 +348,9 @@ std::tuple batch_norm_backward_cpu_template( TensorIterator binary_iter_local(binary_iter); for (const auto f : c10::irange(b_begin, b_end)) { - scalar_t w = weight.defined() ? weight_a[f] : 1; + param_t w = weight.defined() ? 
weight_a[f] : param_t(1); - scalar_t mean, invstd; + param_t mean, invstd; if (train) { mean = save_mean_a[f]; invstd = save_invstd_a[f]; @@ -557,7 +573,6 @@ std::tuple _batch_norm_impl_index_backward( } // backward in inference mode is not supported in cudnn, fallback to native - // TODO: verify the same thing in miopen if (impl_index == 0 || (!train)) { return at::native_batch_norm_backward(grad_output, input, weight, running_mean, running_var, save_mean, save_var_transform, train, epsilon, output_mask); } else if (impl_index == 1) { @@ -627,9 +642,15 @@ std::tuple batch_norm_update_stats_cpu( const Tensor& running_mean = *running_mean_maybe_owned; const Tensor& running_var = c10::value_or_else(running_var_opt, [] {return Tensor();}); - return AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "batch_norm_update_stats_cpu", [&] { - return batch_norm_cpu_update_stats_template(self, running_mean, running_var, momentum, 0); - }); + const bool mixed_type = is_mixed_type(self, running_mean, running_var); + return AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, self.scalar_type(), "batch_norm_update_stats_cpu", [&] { + if (mixed_type) { + check_mixed_data_type(self, running_mean, running_var); + return batch_norm_cpu_update_stats_template(self, running_mean, running_var, momentum, 0); + } else { + return batch_norm_cpu_update_stats_template(self, running_mean, running_var, momentum, 0); + } + }); } std::tuple batch_norm_cpu(const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, @@ -643,16 +664,29 @@ std::tuple batch_norm_cpu(const Tensor& self, const c10: checkBackend("batch_norm_cpu", {self, weight, bias, running_mean, running_var}, Backend::CPU); - return AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "batch_norm", [&] { + const bool mixed_type = is_mixed_type(self, weight, bias, running_mean, running_var); + return AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, self.scalar_type(), "batch_norm", [&] { + if (mixed_type) { + check_mixed_data_type(self, weight, bias, running_mean, running_var); + if (!train) { + auto save_mean = at::empty({0}, self.options().dtype(kFloat)); + auto save_var = at::empty({0}, self.options().dtype(kFloat)); + return batch_norm_cpu_transform_input_template(self, weight, bias, save_mean, save_var, running_mean, running_var, train, eps); + } else { + auto save_stats = batch_norm_cpu_update_stats_template(self, running_mean, running_var, momentum, eps); + return batch_norm_cpu_transform_input_template(self, weight, bias, std::get<0>(save_stats), std::get<1>(save_stats), running_mean, running_var, train, eps); + } + } else { if (!train) { auto save_mean = at::empty({0}, self.options()); auto save_var = at::empty({0}, self.options()); - return batch_norm_cpu_transform_input_template(self, weight, bias, save_mean, save_var, running_mean, running_var, train, eps); + return batch_norm_cpu_transform_input_template(self, weight, bias, save_mean, save_var, running_mean, running_var, train, eps); } else { - auto save_stats = batch_norm_cpu_update_stats_template(self, running_mean, running_var, momentum, eps); - return batch_norm_cpu_transform_input_template(self, weight, bias, std::get<0>(save_stats), std::get<1>(save_stats), running_mean, running_var, train, eps); + auto save_stats = batch_norm_cpu_update_stats_template(self, running_mean, running_var, momentum, eps); + return batch_norm_cpu_transform_input_template(self, weight, bias, std::get<0>(save_stats), 
std::get<1>(save_stats), running_mean, running_var, train, eps); } - }); + } + }); } std::tuple batch_norm_backward_cpu(const Tensor& grad_out, const Tensor& self, const c10::optional& weight_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, const c10::optional& save_mean_opt, const c10::optional& save_invstd_opt, @@ -665,9 +699,15 @@ std::tuple batch_norm_backward_cpu(const Tensor& grad_ou const Tensor& save_mean = c10::value_or_else(save_mean_opt, [] {return Tensor();}); const Tensor& save_invstd = c10::value_or_else(save_invstd_opt, [] {return Tensor();}); - return AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "batch_norm_backward_cpu", [&] { - return batch_norm_backward_cpu_template(grad_out, self, weight, running_mean, running_var, save_mean, save_invstd, train, eps, grad_input_mask); - }); + const bool mixed_type = is_mixed_type(self, weight, running_mean, running_var, save_mean, save_invstd); + return AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, self.scalar_type(), "batch_norm_backward_cpu", [&] { + if (mixed_type) { + check_mixed_data_type(self, weight, running_mean, running_var, save_mean, save_invstd); + return batch_norm_backward_cpu_template(grad_out, self, weight, running_mean, running_var, save_mean, save_invstd, train, eps, grad_input_mask); + } else { + return batch_norm_backward_cpu_template(grad_out, self, weight, running_mean, running_var, save_mean, save_invstd, train, eps, grad_input_mask); + } + }); } TORCH_IMPL_FUNC(renorm_out)(const Tensor& self, const Scalar& p, int64_t dim, @@ -692,7 +732,7 @@ TORCH_IMPL_FUNC(renorm_out)(const Tensor& self, const Scalar& p, int64_t dim, /*keepdim=*/true); } - auto factor = (acc_type == c10::toValueType(dtype)) ? + auto factor = (acc_type == c10::toRealValueType(dtype)) ? norm : at::empty(norm.sizes(), self.options()); auto iter = TensorIteratorConfig() .add_output(factor) diff --git a/aten/src/ATen/native/PadNd.cpp b/aten/src/ATen/native/PadNd.cpp new file mode 100644 index 000000000000..9510b17de002 --- /dev/null +++ b/aten/src/ATen/native/PadNd.cpp @@ -0,0 +1,213 @@ +#include +#include + +#include + +namespace at { namespace native { + +Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, const Scalar& value) { + TORCH_CHECK(pad.size() % 2 == 0, "Length of pad must be even but instead it equals ", + pad.size()); + + auto input_sizes = self.sizes(); + auto l_inp = self.dim(); + + auto l_pad = pad.size() / 2; + auto l_diff = l_inp - l_pad; + TORCH_CHECK(l_inp >= (int64_t)l_pad, "Length of pad should be no more than twice the number of " + "dimensions of the input. 
Pad length is ", pad.size(), "while the input has ", + l_inp, "dimensions."); + + std::vector new_shape; + + bool all_pads_non_positive = true; + + auto c_input = self; + for (const auto i : c10::irange(l_diff, l_inp)) { + auto pad_idx = 2 * (l_inp - i - 1); + if (pad[pad_idx] < 0) { + c_input = c_input.narrow(i, -pad[pad_idx], c_input.size(i) + pad[pad_idx]); + } else if (pad[pad_idx] != 0) { + all_pads_non_positive = false; + } + if (pad[pad_idx + 1] < 0) { + c_input = c_input.narrow(i, 0, c_input.size(i) + pad[pad_idx + 1]); + } else if (pad[pad_idx + 1] != 0) { + all_pads_non_positive = false; + } + } + + // if none of the pads are positive we can optimize and just return the result + // of calling .narrow() on the input + if (all_pads_non_positive) { + return c_input.clone(); + } + + + for (size_t i = 0; i < (size_t)l_diff; i ++) { + new_shape.emplace_back(input_sizes[i]); + } + + for (const auto i : c10::irange((size_t)l_pad)) { + auto pad_idx = pad.size() - ((i + 1) * 2); + auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1]; + TORCH_CHECK(new_dim > 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ", + pad[pad_idx], " and ", pad[pad_idx + 1], " resulted in a negative output size, " + "which is invalid. Check dimension ", l_diff + i, " of your input."); + new_shape.emplace_back(new_dim); + } + + at::Tensor output; + const auto memory_format = self.suggest_memory_format(); + if (self.is_quantized()) { + const auto qscheme = self.qscheme(); + TORCH_CHECK(qscheme == kPerTensorAffine || qscheme == kPerTensorSymmetric, + "Only per-tensor padding is supported."); + output = at::_empty_affine_quantized( + new_shape, self.options().memory_format(memory_format), + self.q_scale(), self.q_zero_point(), c10::nullopt); + } else { + output = at::empty(new_shape, self.options().memory_format(memory_format)); + } + output.fill_(value); + + auto c_output = output; + for (const auto i : c10::irange(l_diff, l_inp)) { + auto pad_idx = 2 * (l_inp - i - 1); + if (pad[pad_idx] > 0) { + c_output = c_output.narrow(i, pad[pad_idx], c_output.size(i) - pad[pad_idx]); + } + if (pad[pad_idx + 1] > 0) { + c_output = c_output.narrow(i, 0, c_output.size(i) - pad[pad_idx + 1]); + } + } + c_output.copy_(c_input); + return output; +} + +Tensor _pad_circular(const Tensor &self, IntArrayRef padding) { + const auto in_shape = self.sizes(); + const auto ndim = static_cast(in_shape.size()) - 2; + TORCH_CHECK(padding.size() + 4 == in_shape.size() * 2, + "Invalid padding size, expected ", ndim * 2, " but got ", padding.size()); + + DimVector out_shape(in_shape.size()); + out_shape[0] = in_shape[0]; + out_shape[1] = in_shape[1]; + + // Get shape of padded tensor + for (const auto i : c10::irange(ndim)) { + const auto pad_l = padding[2 * (ndim - i - 1) + 0]; + const auto pad_r = padding[2 * (ndim - i - 1) + 1]; + const auto size = in_shape[2 + i]; + out_shape[2 + i] = size + pad_l + pad_r; + + TORCH_CHECK( + pad_l <= size && pad_r <= size, + "Padding value causes wrapping around more than once."); + TORCH_CHECK( + out_shape[2 + i] >= 0, + "Negative padding value is resulting in an empty dimension"); + } + + auto out = self.new_empty(out_shape, self.options()); + + // Put original array into the padded array + Tensor out_slice = out; + Tensor in_slice = self; + constexpr int64_t zero = 0; + for (const auto i : c10::irange(ndim)) { + const auto dim = ndim - i + 1; + const auto pad_l = padding[2*i + 0]; + const auto pad_r = padding[2*i + 1]; + out_slice = out_slice.slice(dim, 
std::max(pad_l, zero), out_shape[dim] - std::max(pad_r, zero)); + in_slice = in_slice.slice(dim, std::max(-pad_l, zero), in_shape[dim] - std::max(-pad_r, zero)); + } + out_slice.copy_(in_slice); + + // The following steps first pad the beginning of the tensor (left side), + // and then pad the end of the tensor (right side). + // Note: Corners will be written more than once when ndim > 1. + // + // Only in cases where padding values are > 0 are when additional copying + // is required. + for (const auto i : c10::irange(ndim)) { + const auto dim = ndim - i + 1; + const auto pad_l = padding[2*i + 0]; + const auto pad_r = padding[2*i + 1]; + + if (pad_l > 0) { + out_slice = out.slice(dim, 0, pad_l); + in_slice = out.slice(dim, + out_shape[dim] - pad_l - std::max(pad_r, zero), + out_shape[dim] - std::max(pad_r, zero)); + out_slice.copy_(in_slice); + } + + if (pad_r > 0) { + out_slice = out.slice(dim, out_shape[dim] - pad_r, out_shape[dim]); + in_slice = out.slice(dim, std::max(pad_l, zero), std::max(pad_l, zero) + pad_r); + out_slice.copy_(in_slice); + } + } + + return out; +} + +Tensor _pad_enum(const Tensor &self, IntArrayRef pad, int64_t mode_int, c10::optional value) { + const auto input_dim = self.dim(); + TORCH_CHECK(pad.size() % 2 == 0, "Padding length must be divisible by 2"); + TORCH_CHECK(static_cast(pad.size()) <= input_dim * 2, "Padding length too large"); + auto mode = static_cast(mode_int); + + if (mode == at::padding_mode::constant) { + return at::constant_pad_nd(self, pad, value.value_or(0.0)); + } + TORCH_CHECK(!value.has_value() || *value == 0, + "Padding mode \"", padding_mode_string(mode), + "\" doesn't take in value argument"); + + if (pad.size() == 2 && (input_dim == 2 || input_dim == 3)) { + switch (mode) { + case at::padding_mode::reflect: return at::reflection_pad1d(self, pad); + case at::padding_mode::replicate: return at::replication_pad1d(self, pad); + case at::padding_mode::circular: return at::_pad_circular(self, pad); + default: {} + } + } else if(pad.size() == 4 && (input_dim == 3 || input_dim == 4)) { + switch (mode) { + case at::padding_mode::reflect: return at::reflection_pad2d(self, pad); + case at::padding_mode::replicate: return at::replication_pad2d(self, pad); + case at::padding_mode::circular: return at::_pad_circular(self, pad); + default: {} + } + } else if (pad.size() == 6 && (input_dim == 4 || input_dim == 5)) { + switch (mode) { + case at::padding_mode::reflect: return at::reflection_pad3d(self, pad); + case at::padding_mode::replicate: return at::replication_pad3d(self, pad); + case at::padding_mode::circular: return at::_pad_circular(self, pad); + default: {} + } + } + C10_THROW_ERROR(NotImplementedError, + "Only 2D, 3D, 4D, 5D padding with non-constant padding are supported for now"); +} + +Tensor pad(const Tensor &self, IntArrayRef pad, c10::string_view mode, c10::optional value) { + const auto mode_enum = [&] { + if (mode == "reflect") { + return at::padding_mode::reflect; + } else if (mode == "constant") { + return at::padding_mode::constant; + } else if (mode == "replicate") { + return at::padding_mode::replicate; + } else if (mode == "circular") { + return at::padding_mode::circular; + } + C10_THROW_ERROR(NotImplementedError, + c10::str("Unrecognised padding mode ", mode)); + }(); + return at::native::_pad_enum(self, pad, static_cast(mode_enum), value); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/PadNd.h b/aten/src/ATen/native/PadNd.h new file mode 100644 index 000000000000..37f59acb8a4c --- /dev/null +++ 
b/aten/src/ATen/native/PadNd.h @@ -0,0 +1,22 @@ +#pragma once + +namespace at { + +enum class padding_mode { + reflect, + replicate, + circular, + constant, +}; + +static inline c10::string_view padding_mode_string(padding_mode m) { + switch (m) { + case padding_mode::reflect: return "reflect"; + case padding_mode::replicate: return "replicate"; + case padding_mode::circular: return "circular"; + case padding_mode::constant: return "constant"; + } + TORCH_CHECK(false, "Invalid padding mode (", static_cast(m), ")"); +} + +} // namespace at diff --git a/aten/src/ATen/native/PixelShuffle.cpp b/aten/src/ATen/native/PixelShuffle.cpp index fc8e3c80cefc..41547a10f5fd 100644 --- a/aten/src/ATen/native/PixelShuffle.cpp +++ b/aten/src/ATen/native/PixelShuffle.cpp @@ -3,21 +3,83 @@ #include #include -#include -#include -#include +#include namespace at { namespace native { -Tensor pixel_shuffle(const Tensor& self, int64_t upscale_factor) { +static inline void check_pixel_shuffle_shapes(const Tensor& self, int64_t upscale_factor) { TORCH_CHECK(self.dim() >= 3, "pixel_shuffle expects input to have at least 3 dimensions, but got input with ", self.dim(), " dimension(s)"); - TORCH_CHECK( - upscale_factor > 0, - "pixel_shuffle expects a positive upscale_factor, but got ", - upscale_factor); + TORCH_CHECK(upscale_factor > 0, + "pixel_shuffle expects a positive upscale_factor, but got ", + upscale_factor); + int64_t c = self.size(-3); + int64_t upscale_factor_squared = upscale_factor * upscale_factor; + TORCH_CHECK(c % upscale_factor_squared == 0, + "pixel_shuffle expects its input's 'channel' dimension to be divisible by the square of " + "upscale_factor, but input.size(-3)=", c, " is not divisible by ", upscale_factor_squared); +} + +static inline void check_pixel_unshuffle_shapes(const Tensor& self, int64_t downscale_factor) { + TORCH_CHECK(self.dim() >= 3, + "pixel_unshuffle expects input to have at least 3 dimensions, but got input with ", + self.dim(), " dimension(s)"); + TORCH_CHECK(downscale_factor > 0, + "pixel_unshuffle expects a positive downscale_factor, but got ", + downscale_factor); + int64_t h = self.size(-2); + int64_t w = self.size(-1); + TORCH_CHECK(h % downscale_factor == 0, + "pixel_unshuffle expects height to be divisible by downscale_factor, but input.size(-2)=", h, + " is not divisible by ", downscale_factor); + TORCH_CHECK(w % downscale_factor == 0, + "pixel_unshuffle expects width to be divisible by downscale_factor, but input.size(-1)=", w, + " is not divisible by ", downscale_factor); +} + +Tensor pixel_shuffle_cpu(const Tensor& self, int64_t upscale_factor) { + check_pixel_shuffle_shapes(self, upscale_factor); + + // Format: (B1, ..., Bn), C, H, W + std::vector output_sizes(self.sizes().begin(), self.sizes().end() - 3); + output_sizes.insert(output_sizes.end(), + {self.size(-3) / upscale_factor / upscale_factor, + self.size(-2) * upscale_factor, + self.size(-1) * upscale_factor}); + + auto output = at::empty({0}, self.options()); + auto memory_format = self.suggest_memory_format(); + output.resize_(output_sizes, memory_format); + auto input = self.contiguous(memory_format); + + pixel_shuffle_kernel(kCPU, output, input, upscale_factor); + return output; +} + +Tensor pixel_unshuffle_cpu(const Tensor& self, int64_t downscale_factor) { + check_pixel_unshuffle_shapes(self, downscale_factor); + + // Format: (B1, ..., Bn), C, H, W + std::vector output_sizes(self.sizes().begin(), self.sizes().end() - 3); + output_sizes.insert(output_sizes.end(), + {self.size(-3) * downscale_factor * 
downscale_factor, + self.size(-2) / downscale_factor, + self.size(-1) / downscale_factor}); + + auto output = at::empty({0}, self.options()); + auto memory_format = self.suggest_memory_format(); + output.resize_(output_sizes, memory_format); + auto input = self.contiguous(memory_format); + + pixel_unshuffle_kernel(kCPU, output, input, downscale_factor); + return output; +} + +Tensor math_pixel_shuffle(const Tensor& self, int64_t upscale_factor) { + check_pixel_shuffle_shapes(self, upscale_factor); + // Format: (B1, ..., Bn), C, H, W int64_t c = self.size(-3); int64_t h = self.size(-2); @@ -26,9 +88,6 @@ Tensor pixel_shuffle(const Tensor& self, int64_t upscale_factor) { const auto self_sizes_batch_end = self.sizes().end() - NUM_NON_BATCH_DIMS; int64_t upscale_factor_squared = upscale_factor * upscale_factor; - TORCH_CHECK(c % upscale_factor_squared == 0, - "pixel_shuffle expects its input's 'channel' dimension to be divisible by the square of " - "upscale_factor, but input.size(-3)=", c, " is not divisible by ", upscale_factor_squared); int64_t oc = c / upscale_factor_squared; int64_t oh = h * upscale_factor; int64_t ow = w * upscale_factor; @@ -54,18 +113,13 @@ Tensor pixel_shuffle(const Tensor& self, int64_t upscale_factor) { // and (w, upscale_factor) -> a single dim (ow). std::vector final_shape(self.sizes().begin(), self_sizes_batch_end); final_shape.insert(final_shape.end(), {oc, oh, ow}); + return input_permuted.reshape(final_shape); } +Tensor math_pixel_unshuffle(const Tensor& self, int64_t downscale_factor) { + check_pixel_unshuffle_shapes(self, downscale_factor); -Tensor pixel_unshuffle(const Tensor& self, int64_t downscale_factor) { - TORCH_CHECK(self.dim() >= 3, - "pixel_unshuffle expects input to have at least 3 dimensions, but got input with ", - self.dim(), " dimension(s)"); - TORCH_CHECK( - downscale_factor > 0, - "pixel_unshuffle expects a positive downscale_factor, but got ", - downscale_factor); // Format: (B1, ..., Bn), C, H, W int64_t c = self.size(-3); int64_t h = self.size(-2); @@ -73,12 +127,6 @@ Tensor pixel_unshuffle(const Tensor& self, int64_t downscale_factor) { constexpr auto NUM_NON_BATCH_DIMS = 3; const auto self_sizes_batch_end = self.sizes().end() - NUM_NON_BATCH_DIMS; - TORCH_CHECK(h % downscale_factor == 0, - "pixel_unshuffle expects height to be divisible by downscale_factor, but input.size(-2)=", h, - " is not divisible by ", downscale_factor) - TORCH_CHECK(w % downscale_factor == 0, - "pixel_unshuffle expects width to be divisible by downscale_factor, but input.size(-1)=", w, - " is not divisible by ", downscale_factor) int64_t downscale_factor_squared = downscale_factor * downscale_factor; int64_t oc = c * downscale_factor_squared; int64_t oh = h / downscale_factor; @@ -105,7 +153,11 @@ Tensor pixel_unshuffle(const Tensor& self, int64_t downscale_factor) { // resulting in height=oh and width=ow. 
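The pixel-shuffle changes above split the shape validation into check_pixel_shuffle_shapes / check_pixel_unshuffle_shapes, route the CPU path through the new pixel_shuffle_kernel / pixel_unshuffle_kernel dispatch stubs, and keep the permute-and-reshape composite as math_pixel_shuffle / math_pixel_unshuffle. A minimal sketch of the operator semantics these kernels implement, written against the public Python API rather than the ATen internals:

```python
import torch
import torch.nn.functional as F

# pixel_shuffle with upscale factor r maps (N, C*r*r, H, W) -> (N, C, H*r, W*r)
x = torch.arange(1 * 8 * 3 * 3, dtype=torch.float32).reshape(1, 8, 3, 3)
y = F.pixel_shuffle(x, upscale_factor=2)
assert y.shape == (1, 2, 6, 6)

# pixel_unshuffle is the inverse mapping: (N, C, H*r, W*r) -> (N, C*r*r, H, W)
z = F.pixel_unshuffle(y, downscale_factor=2)
assert z.shape == x.shape and torch.equal(z, x)

# The channel dimension must be divisible by r*r, mirroring check_pixel_shuffle_shapes
try:
    F.pixel_shuffle(torch.randn(1, 6, 3, 3), upscale_factor=2)
except RuntimeError as e:
    print("shape check:", e)
```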
std::vector final_shape(self.sizes().begin(), self_sizes_batch_end); final_shape.insert(final_shape.end(), {oc, oh, ow}); + return input_permuted.reshape(final_shape); } +DEFINE_DISPATCH(pixel_shuffle_kernel); +DEFINE_DISPATCH(pixel_unshuffle_kernel); + }} // namespace at::native diff --git a/aten/src/ATen/native/Pool.h b/aten/src/ATen/native/Pool.h index 503cf8907884..0f3885524a79 100644 --- a/aten/src/ATen/native/Pool.h +++ b/aten/src/ATen/native/Pool.h @@ -1,6 +1,6 @@ -#include -#include +#include #include +#include #include #include diff --git a/aten/src/ATen/native/Pooling.cpp b/aten/src/ATen/native/Pooling.cpp index 0526f5b2b8e4..724c53fdd0c0 100644 --- a/aten/src/ATen/native/Pooling.cpp +++ b/aten/src/ATen/native/Pooling.cpp @@ -122,7 +122,12 @@ Tensor max_pool2d( return at::mkldnn_max_pool2d( self, kernel_size, stride, padding, dilation, ceil_mode); } - +#ifdef USE_MPS + if (self.is_mps()) { + return at::_mps_max_pool2d( + self, kernel_size, stride, padding, dilation, ceil_mode); + } +#endif #if defined(C10_MOBILE) if(xnnpack::use_max_pool2d(self, kernel_size, padding, stride, dilation, ceil_mode)) { diff --git a/aten/src/ATen/native/QuantizedLinear.cpp b/aten/src/ATen/native/QuantizedLinear.cpp index e3030f71d165..fcd8f6335b58 100644 --- a/aten/src/ATen/native/QuantizedLinear.cpp +++ b/aten/src/ATen/native/QuantizedLinear.cpp @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include @@ -53,6 +53,9 @@ Tensor fbgemm_linear_int8_weight_fp32_activation( // fallback path and rather fail loudly if we cannot run FBGEMM. TORCH_CHECK(fbgemm::fbgemmSupportedCPU(), "Your CPU doesn't support FBGEMM."); + TORCH_WARN_ONCE("fbgemm_linear_int8_weight_fp32_activation is deprecated " + "and will be removed in a future PyTorch release.") + const Tensor input_contig = input.contiguous(); const float* input_ptr = input_contig.data_ptr(); @@ -179,11 +182,6 @@ Tensor fbgemm_linear_int8_weight( const Scalar& weight_scale, const Scalar& weight_zero_point, const Tensor& bias) { - // Replace after https://github.com/pytorch/pytorch/issues/24354 is fixed - // TORCH_WARN( - // "fbgemm_linear_int8_weight will be deprecated soon." - // "Please use fbgemm_linear_int8_weight_fp32_activation instead."); - return at::native::fbgemm_linear_int8_weight_fp32_activation( input, weight, @@ -219,6 +217,9 @@ void CalcColOffsetsTranspose( std::tuple fbgemm_linear_quantize_weight( const Tensor& weight) { + TORCH_WARN_ONCE("fbgemm_linear_quantize_weight is deprecated " + "and will be removed in a future PyTorch release.") + // We make a strong guarantee that models using these operators will have the // same numerics across different machines. Therefore, we do not provide a // fallback path and rather fail loudly if we cannot run FBGEMM. @@ -284,6 +285,9 @@ std::tuple fbgemm_linear_quantize_weight( } Tensor fbgemm_pack_quantized_matrix(const Tensor& weight) { + TORCH_WARN_ONCE("fbgemm_pack_quantized_matrix is deprecated " + "and will be removed in a future PyTorch release.") + // We make a strong guarantee that models using these operators will have the // same numerics across different machines. Therefore, we do not provide a // fallback path and rather fail loudly if we cannot run FBGEMM. 
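The fbgemm_linear_* and fbgemm_pack_* entry points above now emit TORCH_WARN_ONCE deprecation notices. As a hedged illustration of the usual replacement for the int8-weight / fp32-activation linear path (standard dynamic quantization, not an API introduced by this patch):

```python
import torch
import torch.nn as nn

# Float model whose Linear layers should run with int8 weights and fp32 activations,
# the same scheme the deprecated fbgemm_linear_int8_weight_fp32_activation targeted.
model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 4)).eval()

# quantize_dynamic swaps nn.Linear for a dynamically quantized equivalent.
qmodel = torch.quantization.quantize_dynamic(model, {nn.Linear}, dtype=torch.qint8)

x = torch.randn(2, 16)
print(qmodel(x).shape)  # torch.Size([2, 4]); weights stored as qint8, activations fp32
```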
@@ -366,6 +370,9 @@ void HandleWeightsSaturation(int64_t N, float* weight) { } // namespace Tensor fbgemm_pack_gemm_matrix_fp16(const Tensor& weight) { + TORCH_WARN_ONCE("fbgemm_pack_gemm_matrix_fp16 is deprecated " + "and will be removed in a future PyTorch release.") + // We make a strong guarantee that models using these operators will have the // same numerics across different machines. Therefore, we do not provide a // fallback path and rather fail loudly if we cannot run FBGEMM. @@ -398,6 +405,9 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation( const Tensor& input, const Tensor& packed_weight, const Tensor& bias) { + TORCH_WARN_ONCE("fbgemm_linear_fp16_weight_fp32_activation is deprecated " + "and will be removed in a future PyTorch release.") + // We make a strong guarantee that models using these operators will have the // same numerics across different machines. Therefore, we do not provide a // fallback path and rather fail loudly if we cannot run FBGEMM. @@ -443,10 +453,6 @@ Tensor fbgemm_linear_fp16_weight( const Tensor& input, const Tensor& packed_weight, const Tensor& bias) { - // Replace after https://github.com/pytorch/pytorch/issues/24354 is fixed - // TORCH_WARN( - // "fbgemm_linear_fp16_weight will be deprecated soon." - // "Please use fbgemm_linear_fp16_weight_fp32_activation instead."); return at::native::fbgemm_linear_fp16_weight_fp32_activation( input, packed_weight, bias); } @@ -461,6 +467,9 @@ Tensor fbgemm_linear_int8_weight_fp32_activation( const Scalar& /*weight_scale*/, const Scalar& /*weight_zero_point*/, const Tensor& /*bias*/) { + TORCH_WARN_ONCE("fbgemm_linear_int8_weight_fp32_activation is deprecated " + "and will be removed in a future PyTorch release.") + // We make a strong guarantee that models using these operators will have the // same numerics across different machines. Therefore, we do not provide a // fallback path and rather fail loudly if we cannot run FBGEMM. @@ -476,10 +485,8 @@ Tensor fbgemm_linear_int8_weight( const Scalar& /*weight_scale*/, const Scalar& /*weight_zero_point*/, const Tensor& /*bias*/) { - // Replace after https://github.com/pytorch/pytorch/issues/24354 is fixed - // TORCH_WARN( - // "fbgemm_linear_int8_weight will be deprecated soon." - // "Please use fbgemm_linear_int8_weight_fp32_activation instead."); + TORCH_WARN_ONCE("fbgemm_linear_int8_weight is deprecated " + "and will be removed in a future PyTorch release.") // We make a strong guarantee that models using these operators will have the // same numerics across different machines. Therefore, we do not provide a @@ -490,6 +497,9 @@ Tensor fbgemm_linear_int8_weight( std::tuple fbgemm_linear_quantize_weight( const Tensor& /*weight*/) { + TORCH_WARN_ONCE("fbgemm_linear_quantize_weight is deprecated " + "and will be removed in a future PyTorch release.") + // We make a strong guarantee that models using these operators will have the // same numerics across different machines. Therefore, we do not provide a // fallback path and rather fail loudly if we cannot run FBGEMM. @@ -498,6 +508,9 @@ std::tuple fbgemm_linear_quantize_weight( } Tensor fbgemm_pack_quantized_matrix(const Tensor& /*input*/) { + TORCH_WARN_ONCE("fbgemm_pack_quantized_matrix is deprecated " + "and will be removed in a future PyTorch release.") + // We make a strong guarantee that models using these operators will have the // same numerics across different machines. Therefore, we do not provide a // fallback path and rather fail loudly if we cannot run FBGEMM. 
@@ -509,10 +522,8 @@ Tensor fbgemm_pack_quantized_matrix( const Tensor& /*input*/, int64_t /*K*/, int64_t /*N*/) { - // Replace after https://github.com/pytorch/pytorch/issues/24354 is fixed - // TORCH_WARN( - // "fbgemm_pack_quantized_matrix(weight, K, N) will be deprecated soon." - // "Please use fbgemm_pack_quantized_matrix(weight) instead."); + TORCH_WARN_ONCE("fbgemm_pack_quantized_matrix is deprecated " + "and will be removed in a future PyTorch release.") // We make a strong guarantee that models using these operators will have the // same numerics across different machines. Therefore, we do not provide a @@ -522,6 +533,9 @@ Tensor fbgemm_pack_quantized_matrix( } Tensor fbgemm_pack_gemm_matrix_fp16(const Tensor& weight) { + TORCH_WARN_ONCE("fbgemm_pack_gemm_matrix_fp16 is deprecated " + "and will be removed in a future PyTorch release.") + // We make a strong guarantee that models using these operators will have the // same numerics across different machines. Therefore, we do not provide a // fallback path and rather fail loudly if we cannot run FBGEMM. @@ -533,6 +547,9 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation( const Tensor& input, const Tensor& packed_weight, const Tensor& bias) { + TORCH_WARN_ONCE("fbgemm_linear_fp16_weight_fp32_activation is deprecated " + "and will be removed in a future PyTorch release.") + // We make a strong guarantee that models using these operators will have the // same numerics across different machines. Therefore, we do not provide a // fallback path and rather fail loudly if we cannot run FBGEMM. @@ -544,10 +561,8 @@ Tensor fbgemm_linear_fp16_weight( const Tensor& input, const Tensor& packed_weight, const Tensor& bias) { - // Replace after https://github.com/pytorch/pytorch/issues/24354 is fixed - // TORCH_WARN( - // "fbgemm_linear_fp16_weight will be deprecated soon." - // "Please use fbgemm_linear_fp16_weight_fp32_activation instead."); + TORCH_WARN_ONCE("fbgemm_linear_fp16_weight is deprecated " + "and will be removed in a future PyTorch release.") // We make a strong guarantee that models using these operators will have the // same numerics across different machines. Therefore, we do not provide a @@ -556,10 +571,6 @@ Tensor fbgemm_linear_fp16_weight( false, "This PyTorch installation was not built with FBGEMM operators"); } -bool fbgemm_is_cpu_supported() { - return false; -} - #endif // USE_FBGEMM } // namespace native diff --git a/aten/src/ATen/native/README.md b/aten/src/ATen/native/README.md index a2b50e3ee467..3c10afef14fa 100644 --- a/aten/src/ATen/native/README.md +++ b/aten/src/ATen/native/README.md @@ -291,7 +291,7 @@ If two backends have the same dispatch function, you can write `CPU, CUDA: func` to reuse the same function name in both cases. Available backend options can be found by searching `dispatch_keys` in -[codegen](https://github.com/pytorch/pytorch/blob/master/tools/codegen/gen.py). +[codegen](https://github.com/pytorch/pytorch/blob/master/torchgen/gen.py). 
There are also two special "generic" backends: - `CompositeExplicitAutograd` (previously known as `DefaultBackend`): diff --git a/aten/src/ATen/native/RNN.cpp b/aten/src/ATen/native/RNN.cpp index 8793f4e5d7b4..38696432b257 100644 --- a/aten/src/ATen/native/RNN.cpp +++ b/aten/src/ATen/native/RNN.cpp @@ -3,8 +3,7 @@ #include #include #include -#include -#include +#include #include #include #include @@ -22,7 +21,6 @@ bool use_miopen(const at::Tensor& input, const double dropout_state) { bool is_miopen_acceptable = ((input.scalar_type() == at::kFloat)|| (input.scalar_type() == at::kHalf)) && (detail::getCUDAHooks().compiledWithMIOpen()) && (input.is_cuda()) && - (dropout_state == 0.0) && (at::globalContext().userEnabledCuDNN()); return is_miopen_acceptable; } @@ -579,88 +577,6 @@ static std::vector gather_params(TensorList params, bool has_biases, return result; } -// These gather_* functions are kept solely for the purposes of backward -// compatbility in the legacy quantized_{lstm,gru} APIs - -static c10::List> gather_quantized_params( - c10::List params) { - static at::Tensor undefined; - std::vector> result; - TORCH_CHECK(params.size() % 12 == 0, "got an incorrect number of quantized RNN parameters"); - for (size_t i = 0; i < params.size(); i += 12) { - result.emplace_back(c10::make_intrusive( - static_cast(params[i]), - static_cast(params[i + 1]), - static_cast(params[i + 2]), - static_cast(params[i + 3]), - static_cast(params[i + 4]), - static_cast(params[i + 5]), - static_cast(params[i + 6]), - static_cast(params[i + 7]), - static_cast(params[i + 8]).item(), - static_cast(params[i + 9]).item(), - static_cast(params[i + 10]).item(), - static_cast(params[i + 11]).item())); - } - return c10::List>(result); -} - -static c10::List> -gather_quantized_params_dynamic(c10::List params) { - static at::Tensor undefined; - std::vector> result; - for (size_t i = 0; i < params.size(); i += 2) { - auto packed_struct_ih = - cpp_custom_type_hack::cast>( - static_cast(params[i])); - auto packed_struct_hh = - cpp_custom_type_hack::cast>( - static_cast(params[i + 1])); - - auto bias_ih = packed_struct_ih->bias().value_or(undefined); - auto bias_hh = packed_struct_hh->bias().value_or(undefined); - result.emplace_back(c10::make_intrusive( - std::move(packed_struct_ih), - std::move(packed_struct_hh), - std::move(bias_ih), - std::move(bias_hh))); - } - return c10::List>(result); -} - -static c10::List> -gather_quantized_params_fp16(c10::List params) { - static at::Tensor undefined; - std::vector> result; - TORCH_CHECK(params.size() % 4 == 0, - "incorrect number of quantized RNN parameters FP16"); - for (size_t i = 0; i < params.size(); i += 4) { - c10::intrusive_ptr packed_struct_ih = - cpp_custom_type_hack::cast>( - static_cast(params[i])); - c10::intrusive_ptr packed_struct_hh = - cpp_custom_type_hack::cast>( - static_cast(params[i + 1])); - - // NB: we install the bias from the gathered parameters here because - // in the "new world", the fp16 linear apply() method always expects - // the bias to be present in the packed struct. In the "old world", - // we called `fbgemm_linear_fp16_weight_fp32_activation`, which took - // the bias explicitly and ignored the bias in the packed struct. To - // reconcile serialized models that behavied in the old style, we - // put the bias into the appropriate packed structures here. 
- // - // Hopefully we can remove this in the future when we eliminate - // the old style altogether - packed_struct_ih->set_bias(params[i + 2]); - packed_struct_hh->set_bias(params[i + 3]); - - result.emplace_back(c10::make_intrusive( - std::move(packed_struct_ih), std::move(packed_struct_hh))); - } - return c10::List>(result); -} - //////////////////////////////////////////////////////////////////////////////// // HIDDEN STATE FUNCTIONS // @@ -1201,6 +1117,18 @@ bool _use_cudnn_rnn_flatten_weight() { return detail::getCUDAHooks().compiledWithCuDNN(); } +// NB: This a (composite) wrapper for _thnn_fused_lstm_cell_backward_impl. +// It duplicates the outputs of this function so the non-composite verison doesn't have to. +// The point is so that we avoid triggering TensorImpl use count asserts in debug mode +std::tuple _thnn_fused_lstm_cell_backward( const c10::optional& grad_hy_opt, const c10::optional& grad_cy_opt, + const Tensor& cx, const Tensor& cy, + const Tensor& workspace, bool has_bias) { + TORCH_INTERNAL_ASSERT(!GradMode::is_enabled()); + auto ret = at::_thnn_fused_lstm_cell_backward_impl(grad_hy_opt, grad_cy_opt, cx, cy, workspace, has_bias); + return std::make_tuple(std::get<0>(ret), std::get<0>(ret), std::get<1>(ret), std::get<2>(ret), std::get<2>(ret)); +} + + //////////////////////////////////////////////////////////////////////////////// // PUBLIC FUNCTIONS //////////////////////////////////////////////////////////////////////////////// @@ -1411,21 +1339,11 @@ std::tuple quantized_gru_input_legacy( bool train, bool bidirectional, bool batch_first) { - TORCH_WARN_ONCE( + TORCH_CHECK( + false, "torch.quantized_gru with List[Tensor] for parameters is " - "deprecated and may be removed! Please re-export your model " + "no longer supported. Please re-export your model " "using the newer definitions in torch.jit.quantized"); - auto params = gather_quantized_params(std::move(_params)); - return quantized_gru_input( - _input, - hx, - std::move(params), - has_biases, - num_layers, - dropout_p, - train, - bidirectional, - batch_first); } std::tuple quantized_gru_data_legacy( @@ -1438,21 +1356,11 @@ std::tuple quantized_gru_data_legacy( double dropout_p, bool train, bool bidirectional) { - TORCH_WARN_ONCE( + TORCH_CHECK( + false, "torch.quantized_gru with List[Tensor] for parameters is " - "deprecated and may be removed! Please re-export your model " + "no longer supported. 
Please re-export your model " "using the newer definitions in torch.jit.quantized"); - auto params = gather_quantized_params(std::move(_params)); - return quantized_gru_data( - data, - batch_sizes, - hx, - std::move(params), - has_biases, - num_layers, - dropout_p, - train, - bidirectional); } using tanf_cell_type = SimpleCell; @@ -1480,6 +1388,14 @@ std::tuple lstm( num_layers, dropout_p, train, bidirectional, batch_first); return std::make_tuple(std::move(output), std::move(hy), std::move(cy)); } +#ifdef USE_MPS + if (_input.is_mps() && !bidirectional) { + std::tuple output = at::_lstm_mps(_input, hx, _params, has_biases, + num_layers, dropout_p, train, bidirectional, batch_first); + std::tuple return_values = std::make_tuple(std::get<0>(output), std::get<1>(output), std::get<2>(output)); + return return_values; + } +#endif // if cells are of different size, that means projections are used bool has_projections = (hx[0].size(2) != hx[1].size(2)); if (use_miopen(_input, dropout_p)) { @@ -1768,34 +1684,11 @@ std::tuple quantized_lstm_input_legacy( bool batch_first, c10::optional dtype, bool use_dynamic) { - TORCH_WARN_ONCE( + TORCH_CHECK( + false, "torch.quantized_lstm with List[Tensor] for parameters is " - "deprecated and may be removed! Please re-export your model " + "no longer supported. Please re-export your model " "using the newer definitions in torch.jit.quantized"); - c10::List> params; - auto result_dtype = dtype.has_value() ? dtype.value() : at::kChar; - if (result_dtype == at::kChar || result_dtype == at::kQInt8) { - if (use_dynamic) { - params = gather_quantized_params_dynamic(std::move(_params_)); - } else { - params = gather_quantized_params(std::move(_params_)); - } - } else { - params = gather_quantized_params_fp16(std::move(_params_)); - } - return quantized_lstm_input( - _input, - std::move(hx_), - std::move(params), - has_biases, - num_layers, - dropout_p, - train, - bidirectional, - batch_first, - // NOLINTNEXTLINE(performance-move-const-arg) - std::move(dtype), - use_dynamic); } std::tuple quantized_lstm_data( @@ -1857,34 +1750,11 @@ std::tuple quantized_lstm_data_legacy( bool bidirectional, c10::optional dtype, bool use_dynamic) { - TORCH_WARN_ONCE( + TORCH_CHECK( + false, "torch.quantized_lstm with List[Tensor] for parameters is " - "deprecated and may be removed! Please re-export your model " + "no longer supported. Please re-export your model " "using the newer definitions in torch.jit.quantized"); - c10::List> params; - auto result_dtype = dtype.has_value() ? 
dtype.value() : at::kChar; - if (result_dtype == at::kChar || result_dtype == at::kQInt8) { - if (use_dynamic) { - params = gather_quantized_params_dynamic(std::move(_params_)); - } else { - params = gather_quantized_params(std::move(_params_)); - } - } else { - params = gather_quantized_params_fp16(std::move(_params_)); - } - return quantized_lstm_data( - data, - batch_sizes, - std::move(hx_), - std::move(params), - has_biases, - num_layers, - dropout_p, - train, - bidirectional, - // NOLINTNEXTLINE(performance-move-const-arg) - std::move(dtype), - use_dynamic); } #define DEFINE_QUANTIZED_RNN_CELL(name, hx_type, cell_type, return_type, prepare_hx_fn) \ @@ -1982,7 +1852,7 @@ DEFINE_QUANTIZED_RNN_CELL_DYNAMIC(quantized_rnn_tanh_cell_dynamic, simple_hx_typ namespace { -static auto ensure_linear_params_registered = register_linear_params(); +static C10_UNUSED auto ensure_linear_params_registered = register_linear_params(); static auto cell_params_base_registry = torch::selective_class_("rnn", TORCH_SELECTIVE_CLASS("CellParamsBase")) diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp index 38eafedbeebf..d5ee3a3e9103 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -200,16 +201,26 @@ TORCH_META_FUNC2(prod, dim_int) resize_reduction(*this, self, dim, keepdim, out_dtype); } -void check_floating_or_complex_dtype(const char* name, ScalarType dtype) { - TORCH_CHECK( - at::isFloatingType(dtype) || at::isComplexType(dtype), - name, "(): input dtype should be either floating point or complex dtypes. " - "Got ", toString(dtype), " instead."); -} - TORCH_META_FUNC2(mean, dim) (const Tensor& self, IntArrayRef dim, bool keepdim, optional opt_dtype) { - check_floating_or_complex_dtype("mean", self.scalar_type()); + auto in_dtype = at::native::get_dtype_from_self(self, opt_dtype, true); + + if (!at::isFloatingType(in_dtype) && !at::isComplexType(in_dtype)) { + std::string what = "Input"; + std::string dtype = toString(self.scalar_type()); + + if (opt_dtype.has_value()) { + what = "Optional"; + dtype = toString(opt_dtype.value()); + } + + TORCH_CHECK( + false, + "mean(): could not infer output dtype. ", + what, " dtype must be either a floating point or complex dtype. ", + "Got: ", dtype); + } + auto out_dtype = infer_dtype_from_optional(self, dim, keepdim, opt_dtype, maybe_get_output()); resize_reduction(*this, self, dim, keepdim, out_dtype); } @@ -221,15 +232,17 @@ ScalarType get_result_or_self_value_dtype( if (result.defined()) { return result.scalar_type(); } else { - return dtype.value_or(toValueType(self.scalar_type())); + return dtype.value_or(toRealValueType(self.scalar_type())); } } - - TORCH_META_FUNC2(norm, ScalarOpt_dim) (const Tensor& self, const OptionalScalarRef p, IntArrayRef dim, bool keepdim) { - check_floating_or_complex_dtype("norm", self.scalar_type()); + TORCH_CHECK( + at::isFloatingType(self.scalar_type()) || at::isComplexType(self.scalar_type()), + "norm(): input dtype should be either floating point or complex. 
" + "Got ", self.scalar_type(), " instead."); + auto out_dtype = get_result_or_self_value_dtype(self, maybe_get_output(), c10::nullopt); resize_reduction(*this, self, dim, keepdim, out_dtype); } @@ -240,7 +253,11 @@ TORCH_META_FUNC2(norm, ScalarOpt_dim_dtype) IntArrayRef dim, bool keepdim, ScalarType dtype) { - check_floating_or_complex_dtype("norm", dtype); + TORCH_CHECK( + at::isFloatingType(dtype) || at::isComplexType(dtype), + "norm(): the desired output dtype should be either floating point or complex. " + "Got ", dtype, " instead."); + auto out_dtype = get_result_or_self_value_dtype(self, maybe_get_output(), dtype); resize_reduction(*this, self, dim, keepdim, out_dtype); } @@ -266,6 +283,34 @@ TORCH_META_FUNC(aminmax) this->set_output(1, shape, options); } +TORCH_META_FUNC(amax) +(const Tensor& self, IntArrayRef dim, bool keepdim) { + auto maybe_result = maybe_get_output(); + if (maybe_result.defined()) { + TORCH_CHECK(self.scalar_type() == maybe_result.scalar_type(), "Expected the dtype for input and out to match, but got ", + self.scalar_type(), " for input's dtype and ", maybe_result.scalar_type(), " for out's dtype."); + } + if (self.numel() == 0) { + at::native::zero_numel_check_dims(self, dim, "amax()"); + } + const ScalarType& out_dtype = maybe_result.defined() ? maybe_result.scalar_type() : self.scalar_type(); + resize_reduction(*this, self, dim, keepdim, out_dtype); +} + +TORCH_META_FUNC(amin) +(const Tensor& self, IntArrayRef dim, bool keepdim) { + auto maybe_result = maybe_get_output(); + if (maybe_result.defined()) { + TORCH_CHECK(self.scalar_type() == maybe_result.scalar_type(), "Expected the dtype for input and out to match, but got ", + self.scalar_type(), " for input's dtype and ", maybe_result.scalar_type(), " for out's dtype."); + } + if (self.numel() == 0) { + at::native::zero_numel_check_dims(self, dim, "amin()"); + } + const ScalarType& out_dtype = maybe_result.defined() ? maybe_result.scalar_type() : self.scalar_type(); + resize_reduction(*this, self, dim, keepdim, out_dtype); +} + } // namespace meta namespace native { @@ -830,7 +875,7 @@ Tensor& diff_out(const Tensor& self, int64_t n, int64_t dim, const c10::optional } } -void pre_check_gradient(const Tensor& self, c10::optional spacing_size, c10::optional dim, int64_t edge_order) { +void pre_check_gradient(const Tensor& self, c10::optional spacing_size, at::OptionalIntArrayRef dim, int64_t edge_order) { // Helper for gradient function to make sure input data satisfies prerequisites TORCH_CHECK(self.scalar_type() != ScalarType::Byte, "torch.gradient does not support uint8 input."); if (spacing_size.has_value() && !dim.has_value()) { @@ -932,7 +977,7 @@ std::vector gradient_dim_preprocess(const Tensor& self, c10::optional gradient(const Tensor& self, TensorList coordinates, IntArrayRef dim, int64_t edge_order) { pre_check_gradient(self, c10::optional(coordinates.size()), - c10::optional(dim), + at::OptionalIntArrayRef(dim), edge_order); return gradient_helper(self, coordinates, dim, edge_order); } @@ -941,7 +986,7 @@ std::vector gradient(const Tensor& self, TensorList coordinates, c10::op const auto processed_dim = gradient_dim_preprocess(self, dim); pre_check_gradient(self, c10::optional(coordinates.size()), - dim.has_value() ? c10::optional(processed_dim) : c10::nullopt, + dim.has_value() ? 
at::OptionalIntArrayRef(processed_dim) : c10::nullopt, edge_order); return gradient_helper(self, coordinates, processed_dim, edge_order); } @@ -949,7 +994,7 @@ std::vector gradient(const Tensor& self, TensorList coordinates, c10::op std::vector gradient(const Tensor& self, c10::ArrayRef spacing, IntArrayRef dim, int64_t edge_order) { pre_check_gradient(self, c10::optional(spacing.size()), - c10::optional(dim), + at::OptionalIntArrayRef(dim), edge_order); return gradient_helper_float(self, spacing, dim, edge_order); } @@ -958,7 +1003,7 @@ std::vector gradient(const Tensor& self, ArrayRef spacing, c10:: const auto processed_dim = gradient_dim_preprocess(self, dim); pre_check_gradient(self, c10::optional(spacing.size()), - dim.has_value() ? c10::optional(processed_dim) : c10::nullopt, + dim.has_value() ? at::OptionalIntArrayRef(processed_dim) : c10::nullopt, edge_order); return gradient_helper_float(self, spacing, processed_dim, edge_order); } @@ -969,7 +1014,7 @@ std::vector gradient(const Tensor& self, const Scalar& unit_size, IntArr std::vector spacing(dim.size(), unit_size); pre_check_gradient(self, c10::optional(spacing.size()), - c10::optional(dim), + at::OptionalIntArrayRef(dim), edge_order); return gradient_helper_float(self, spacing, dim, edge_order); } @@ -983,7 +1028,7 @@ std::vector gradient(const Tensor& self, const c10::optional& un unit_size.has_value() ? unit_size.value() : 1.0) ; pre_check_gradient(self, unit_size.has_value() ? c10::optional(spacing.size()) : c10::nullopt, - dim.has_value() ? c10::optional(processed_dim) : c10::nullopt, + dim.has_value() ? at::OptionalIntArrayRef(processed_dim) : c10::nullopt, edge_order); return gradient_helper_float(self, spacing, processed_dim, edge_order); } @@ -992,7 +1037,7 @@ std::vector gradient(const Tensor& self, IntArrayRef dim, int64_t edge_o std::vector spacing(dim.size(), 1.0) ; pre_check_gradient(self, c10::optional(spacing.size()), - c10::optional(dim), + at::OptionalIntArrayRef(dim), edge_order); return gradient_helper_float(self, spacing, dim, edge_order); } @@ -1054,10 +1099,6 @@ Tensor& nansum_out(const Tensor& self, IntArrayRef dim, return result; } -Tensor nansum(const Tensor &self, c10::optional dtype) { - return at::native::nansum(self, std::vector{}, false, dtype); -} - Tensor nansum(const Tensor& self, IntArrayRef dim, bool keepdim, c10::optional opt_dtype) { ScalarType dtype = get_dtype_from_self(self, opt_dtype, true); Tensor result = create_reduction_result(self, dim, keepdim, dtype); @@ -1262,22 +1303,29 @@ Tensor& logsumexp_out(const Tensor& self, IntArrayRef dims, bool keepdim, Tensor result.scalar_type()); { NoNamesGuard guard; - logsumexp_out_impl(result, self, dims, keepdim); + if (at::isIntegralType(self.scalar_type(), /*includeBool=*/true)) { + // for integral inputs, promote input to default floating type. 
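With the dtype checks added to the mean() meta function above, integral inputs are rejected with an explicit message unless a floating point or complex dtype is supplied, and norm() gains analogous checks for its input and requested output dtypes. A short sketch of the user-visible behavior, assuming current torch semantics:

```python
import torch

x = torch.arange(4)                    # int64 input

# Averaging an integral tensor needs an explicit floating point (or complex) dtype
try:
    x.mean()
except RuntimeError as e:
    print("rejected:", e)              # "mean(): could not infer output dtype ..."

print(x.mean(dtype=torch.float32))     # tensor(1.5000)
print(x.float().mean())                # tensor(1.5000)
```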
+ auto default_dtype = at::typeMetaToScalarType(c10::get_default_dtype()); + logsumexp_out_impl(result, self.to(default_dtype), dims, keepdim); + } else { + logsumexp_out_impl(result, self, dims, keepdim); + } } namedinference::propagate_names_for_reduction(result, self, dims, keepdim); return result; } Tensor logsumexp(const Tensor& self, IntArrayRef dims, bool keepdim) { - Tensor result; - auto default_dtype = at::typeMetaToScalarType(c10::get_default_dtype()); + TensorOptions result_options; if (at::isIntegralType(self.scalar_type(), /*includeBool=*/true)) { - result = at::empty({0}, self.options().dtype(default_dtype)); - return at::native::logsumexp_out(self.to(default_dtype), dims, keepdim, result); + // even for integral inputs, result is floating dtype + auto default_dtype = at::typeMetaToScalarType(c10::get_default_dtype()); + result_options = self.options().dtype(default_dtype); } else { - result = at::empty({0}, self.options()); - return at::native::logsumexp_out(self, dims, keepdim, result); + result_options = self.options(); } + auto result = at::empty({0}, result_options); + return at::native::logsumexp_out(self, dims, keepdim, result); } Tensor logsumexp(const Tensor& self, DimnameList dims, bool keepdim) { @@ -1415,42 +1463,20 @@ TORCH_IMPL_FUNC(any_all_out)(const Tensor& self, const Tensor& result) { allany_impl<0>(self, result, {}, false, or_stub); } -Tensor &amin_out(const Tensor& self, IntArrayRef dim, bool keepdim, Tensor& result) { - TORCH_CHECK(self.scalar_type() == result.scalar_type(), "Expected the dtype for input and out to match, but got ", - self.scalar_type(), " for input's dtype and ", result.scalar_type(), " for out's dtype."); - if (self.numel() == 0) { - zero_numel_check_dims(self, dim, "amin()"); - } - - auto iter = make_reduction("amin", result, self, dim, keepdim, self.scalar_type()); +TORCH_IMPL_FUNC(amin_out) (const Tensor& self, IntArrayRef dim, bool keepdim, const Tensor& result) { + auto iter = + meta::make_reduction(self, result, dim, keepdim, self.scalar_type()); if (iter.numel() != 0) { min_values_stub(iter.device_type(), iter); } - return result; } -Tensor amin(const Tensor& self, IntArrayRef dim, bool keepdim) { - Tensor result = at::empty({0}, self.options()); - return at::amin_out(result, self, dim, keepdim); -} - -Tensor &amax_out(const Tensor& self, IntArrayRef dim, bool keepdim, Tensor& result) { - TORCH_CHECK(self.scalar_type() == result.scalar_type(), "Expected the dtype for input and out to match, but got ", - self.scalar_type(), " for input's dtype and ", result.scalar_type(), " for out's dtype."); - if (self.numel() == 0) { - zero_numel_check_dims(self, dim, "amax()"); - } - - auto iter = make_reduction("amax", result, self, dim, keepdim, self.scalar_type()); +TORCH_IMPL_FUNC(amax_out) (const Tensor& self, IntArrayRef dim, bool keepdim, const Tensor& result) { + auto iter = + meta::make_reduction(self, result, dim, keepdim, self.scalar_type()); if (iter.numel() != 0) { max_values_stub(iter.device_type(), iter); } - return result; -} - -Tensor amax(const Tensor& self, IntArrayRef dim, bool keepdim) { - Tensor result = at::empty({0}, self.options()); - return at::amax_out(result, self, dim, keepdim); } template @@ -1556,7 +1582,7 @@ static double std_var_all_cpu(const Tensor& self, int64_t correction, bool take_ static Tensor& std_var_out( const char* fname, Tensor& result, const Tensor& self, - c10::optional dim, c10::optional correction_opt, + at::OptionalIntArrayRef dim, c10::optional correction_opt, bool keepdim, bool take_sqrt) { 
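The reduction changes above also make logsumexp promote integral inputs to the default floating dtype in both the functional and out= paths, and turn amax / amin into structured kernels whose meta functions check that an explicit out tensor matches the input dtype. An illustrative sketch, assuming current torch semantics:

```python
import torch

# logsumexp on an integral tensor computes in, and returns, the default float dtype
x = torch.arange(4)                        # int64
print(torch.logsumexp(x, dim=0).dtype)     # torch.float32 under the default dtype

# amax / amin require an out tensor to have the same dtype as the input
a = torch.randn(2, 3)
print(torch.amax(a, dim=1))                # per-row maxima, shape (2,)
try:
    torch.amax(a, dim=1, out=torch.empty(2, dtype=torch.float64))
except RuntimeError as e:
    print("dtype mismatch:", e)
```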
TORCH_CHECK(self.device().is_cpu() || self.device().is_cuda(), "std and var only supports tensors on a CPU or CUDA device, but got: ", @@ -1569,7 +1595,7 @@ static Tensor& std_var_out( if (at::isComplexType(self.scalar_type())) { // For complex, calculate variance of real and imaginary components // seperately then add to get overall variance. - ScalarType dtype = c10::toValueType(get_dtype_from_result(result, {})); + ScalarType dtype = c10::toRealValueType(get_dtype_from_result(result, {})); Tensor real_in = at::real(self); Tensor real_out = at::empty({0}, self.options().dtype(dtype)); std_var_out( @@ -1624,7 +1650,7 @@ static Tensor& std_var_out( static std::tuple std_var_mean_out( const char* fname, Tensor& result1, Tensor& result2, const Tensor& self, - c10::optional dim, c10::optional correction_opt, + at::OptionalIntArrayRef dim, c10::optional correction_opt, bool keepdim, bool take_sqrt) { AT_ASSERT(result1.defined() && result2.defined()); TORCH_CHECK(self.device().is_cpu() || self.is_cuda(), @@ -1634,7 +1660,7 @@ static std::tuple std_var_mean_out( fname, " only supports strided layout, got: ", self.layout()); TORCH_CHECK(at::isFloatingType(self.scalar_type()) || at::isComplexType(self.scalar_type()), fname, " only support floating point and complex dtypes"); - TORCH_CHECK(result1.scalar_type() == c10::toValueType(result2.scalar_type()), + TORCH_CHECK(result1.scalar_type() == c10::toRealValueType(result2.scalar_type()), fname, " expected result1 to be real and match the precision of result2. Got ", result1.scalar_type(), " and ", result2.scalar_type(), "."); @@ -1642,7 +1668,7 @@ static std::tuple std_var_mean_out( // For complex, calculate for real and imaginary components seperately then combine as: // variance = var_real + var_imag // mean = mean_real + j * mean_imag - ScalarType dtype = c10::toValueType(get_dtype_from_result(result1, {})); + ScalarType dtype = c10::toRealValueType(get_dtype_from_result(result1, {})); Tensor real_in = at::real(self); Tensor real_out_var = at::empty({0}, self.options().dtype(dtype)); Tensor real_out_mean = at::empty({0}, self.options().dtype(dtype)); @@ -1695,13 +1721,13 @@ static std::tuple std_var_mean_out( std::tuple var_mean( const Tensor& self, IntArrayRef dim, bool unbiased, bool keepdim) { - return at::var_mean(self, /*dim=*/c10::optional(dim), + return at::var_mean(self, /*dim=*/at::OptionalIntArrayRef(dim), /*correction=*/int64_t{unbiased ? 1 : 0}, keepdim); } std::tuple std_mean( const Tensor& self, IntArrayRef dim, bool unbiased, bool keepdim) { - return at::std_mean(self, /*dim=*/c10::optional(dim), + return at::std_mean(self, /*dim=*/at::OptionalIntArrayRef(dim), /*correction=*/int64_t{unbiased ? 
1 : 0}, keepdim); } @@ -1724,11 +1750,11 @@ std::tuple var_mean_out( static TensorOptions options_to_value_type(TensorOptions opts) { auto scalar_type = typeMetaToScalarType(opts.dtype()); - return opts.dtype(c10::toValueType(scalar_type)); + return opts.dtype(c10::toRealValueType(scalar_type)); } std::tuple var_mean( - const Tensor& self, c10::optional dim, + const Tensor& self, at::OptionalIntArrayRef dim, c10::optional correction, bool keepdim) { Tensor result1 = at::empty({0}, options_to_value_type(self.options())); Tensor result2 = at::empty({0}, self.options()); @@ -1737,7 +1763,7 @@ std::tuple var_mean( } std::tuple std_mean( - const Tensor& self, c10::optional dim, + const Tensor& self, at::OptionalIntArrayRef dim, c10::optional correction, bool keepdim) { Tensor result1 = at::empty({0}, options_to_value_type(self.options())); Tensor result2 = at::empty({0}, self.options()); @@ -1751,12 +1777,12 @@ Tensor var(const Tensor& self, bool unbiased) { } Tensor var(const Tensor& self, IntArrayRef dim, bool unbiased, bool keepdim) { - return at::var(self, /*dim=*/c10::optional(dim), + return at::var(self, /*dim=*/at::OptionalIntArrayRef(dim), /*correction=*/int64_t{unbiased ? 1 : 0}, keepdim); } Tensor& var_out(const Tensor& self, IntArrayRef dim, bool unbiased, bool keepdim, Tensor& result) { - return at::var_out(result, self, /*dim=*/c10::optional(dim), + return at::var_out(result, self, /*dim=*/at::OptionalIntArrayRef(dim), /*correction=*/int64_t{unbiased ? 1 : 0}, keepdim); } @@ -1766,35 +1792,35 @@ Tensor std(const Tensor& self, bool unbiased) { } Tensor std(const Tensor& self, IntArrayRef dim, bool unbiased, bool keepdim) { - return at::std(self, /*dim=*/c10::optional(dim), + return at::std(self, /*dim=*/at::OptionalIntArrayRef(dim), /*correction=*/int64_t{unbiased ? 1 : 0}, keepdim); } Tensor& std_out(const Tensor& self, IntArrayRef dim, bool unbiased, bool keepdim, Tensor& result) { - return at::std_out(result, self, /*dim=*/c10::optional(dim), + return at::std_out(result, self, /*dim=*/at::OptionalIntArrayRef(dim), /*correction=*/int64_t{unbiased ? 1 : 0}, keepdim); } -Tensor std(const Tensor& self, c10::optional dim, +Tensor std(const Tensor& self, at::OptionalIntArrayRef dim, c10::optional correction, bool keepdim) { Tensor result = at::empty({0}, options_to_value_type(self.options())); return std_var_out("std", result, self, dim, correction, keepdim, true); } Tensor& std_out( - const Tensor& self, c10::optional dim, + const Tensor& self, at::OptionalIntArrayRef dim, c10::optional correction, bool keepdim, Tensor& result) { return std_var_out("std", result, self, dim, correction, keepdim, true); } Tensor& var_out( - const Tensor& self, c10::optional dim, + const Tensor& self, at::OptionalIntArrayRef dim, c10::optional correction, bool keepdim, Tensor& result) { return std_var_out("var", result, self, dim, correction, keepdim, false); } Tensor var( - const Tensor& self, c10::optional dim, + const Tensor& self, at::OptionalIntArrayRef dim, c10::optional correction, bool keepdim) { Tensor result = at::empty({0}, options_to_value_type(self.options())); return std_var_out("var", result, self, dim, correction, keepdim, false); @@ -1971,12 +1997,25 @@ bool cpu_equal(const Tensor& self, const Tensor& other) { // backward function for those operators; it propagates the grad to the // specific value locations referred to at `indices`. 
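The std / var refactor above keeps the legacy unbiased overloads as thin wrappers that map unbiased to correction (correction = 1 when unbiased, 0 otherwise) and pass the reduction dims as at::OptionalIntArrayRef. A minimal sketch of that equivalence at the Python level:

```python
import torch

x = torch.randn(5, 3)
n = x.size(0)

# unbiased=True corresponds to correction=1 (Bessel's correction): divide by n - 1
manual_unbiased = ((x - x.mean(dim=0)) ** 2).sum(dim=0) / (n - 1)
print(torch.allclose(torch.var(x, dim=0, unbiased=True), manual_unbiased))

# unbiased=False corresponds to correction=0: divide by n
manual_biased = ((x - x.mean(dim=0)) ** 2).sum(dim=0) / n
print(torch.allclose(torch.var(x, dim=0, unbiased=False), manual_biased))

# var_mean / std_mean return the statistic together with the mean in one call
var, mean = torch.var_mean(x, dim=0, unbiased=True)
std, _ = torch.std_mean(x, dim=0, unbiased=True)
print(torch.allclose(var, manual_unbiased), torch.allclose(std ** 2, var))
```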
Tensor value_selecting_reduction_backward(const Tensor& grad, int64_t dim, const Tensor& indices, IntArrayRef sizes, bool keepdim) { + auto inplace_scatter_if_not_tensor_subclass = + [&](const Tensor& grad_out, const Tensor& indices_) { + auto grad_in = at::zeros(sizes, grad_out.options()); + if (areAnyTensorSubclassLike({grad, indices})) { + return grad_in.scatter(dim, indices_, grad_out); + } + return grad_in.scatter_(dim, indices_, grad_out); + }; + if (!keepdim && sizes.size() > 0) { auto grad_ = grad.unsqueeze(dim); auto indices_ = indices.unsqueeze(dim); - return at::zeros(sizes, grad_.options()).scatter_(dim, indices_, grad_); + return inplace_scatter_if_not_tensor_subclass(grad_, indices_); } - return at::zeros(sizes, grad.options()).scatter_(dim, indices, grad); + return inplace_scatter_if_not_tensor_subclass(grad, indices); +} + +Tensor sum_csr(const Tensor &self, c10::optional dtype) { + return self.values().sum(dtype); } } // namespace native diff --git a/aten/src/ATen/native/ReduceOpsUtils.h b/aten/src/ATen/native/ReduceOpsUtils.h index 3c3bff454178..aa0ed5462db2 100644 --- a/aten/src/ATen/native/ReduceOpsUtils.h +++ b/aten/src/ATen/native/ReduceOpsUtils.h @@ -1,13 +1,21 @@ #pragma once #include -#include +#include #include #include #include #include +#include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif + namespace at { namespace native { // Maximum and minimum possible scalar values, including infinities @@ -59,7 +67,7 @@ inline bool _dimreduce_return_trivial(const Tensor &result, const Tensor &self, } inline bool _dimreduce_return_trivial_no_ident(Tensor &result, const Tensor &self, - int64_t dim, bool keepdim, const char *fn_name) { + int64_t /*dim*/, bool /*keepdim*/, const char* /*fn_name*/) { if (self.numel() == 1 && self.ndimension() == 0) { result.resize_({}); result.fill_(self); @@ -128,7 +136,7 @@ inline DimVector shape_from_dim_mask(const Tensor& self, DimMask mask, bool keep static void resize_reduction_result( Tensor& result, const Tensor& self, DimMask mask, bool keepdim, - ScalarType dtype) + ScalarType /*dtype*/) { auto shape = shape_from_dim_mask(self, mask, keepdim); TORCH_CHECK(result.defined(), "Cannot create a new tensor inside a reduction op. You likely tried to call an operator with an out argument but the out argument was an undefined tensor."); @@ -160,7 +168,7 @@ static Tensor review_reduce_result(const Tensor& result, int ndim, DimMask mask, static TensorIterator make_reduction( const char* name, Tensor& result, const Tensor& self, - c10::optional dim_opt, + at::OptionalIntArrayRef dim_opt, bool keepdim, ScalarType in_dtype, ScalarType out_dtype) { // check that result type and dtype match if provided TORCH_CHECK( @@ -185,20 +193,22 @@ static TensorIterator make_reduction( static C10_UNUSED TensorIterator make_reduction( const char* name, Tensor& result, const Tensor& self, - c10::optional dim, bool keepdim, ScalarType out_dtype) { + at::OptionalIntArrayRef dim, bool keepdim, ScalarType out_dtype) { // special case for type promotion in mixed precision, improves computational // efficiency. // not generalize this to common mismatched input/output types to avoid cross // product of templated kernel launches. const bool gpu_lowp_to_f32 = ( self.is_cuda() && (self.scalar_type() == kHalf || self.scalar_type() == kBFloat16) && out_dtype == kFloat); - auto in_dtype = gpu_lowp_to_f32 ? self.scalar_type() : out_dtype; + auto in_dtype = gpu_lowp_to_f32 ? self.scalar_type() + : self.is_complex() ? 
c10::toComplexType(out_dtype) + : out_dtype; return make_reduction(name, result, self, dim, keepdim, in_dtype, out_dtype); } static TensorIterator make_reduction( const char* name, Tensor& result1, Tensor& result2, const Tensor& self, - c10::optional dim_opt, bool keepdim, ScalarType dtype1, + at::OptionalIntArrayRef dim_opt, bool keepdim, ScalarType dtype1, ScalarType dtype2) { // check that result type and dtype match if provided TORCH_CHECK( @@ -235,7 +245,7 @@ static TensorIterator make_reduction( static C10_UNUSED TensorIterator make_reduction( const char* name, Tensor& result1, Tensor& result2, const Tensor& self, - c10::optional dim, bool keepdim, ScalarType dtype) { + at::OptionalIntArrayRef dim, bool keepdim, ScalarType dtype) { return make_reduction(name, result1, result2, self, dim, keepdim, dtype, dtype); } @@ -250,7 +260,11 @@ static void zero_numel_check_dims(const Tensor& self, const int64_t dim, const c } } -static C10_UNUSED void zero_numel_check_dims(const Tensor& self, const IntArrayRef dim, const char *fn_name) { +static void zero_numel_check_dims(const Tensor& self, const IntArrayRef dim, const char *fn_name) { + TORCH_CHECK( + !dim.empty(), + fn_name, ": Expected reduction dim to be specified for input.numel() == 0. ", + "Specify the reduction dim with the 'dim' argument."); for (const int64_t d : dim) { zero_numel_check_dims(self, d, fn_name); } @@ -357,7 +371,7 @@ static TensorIterator make_reduction( IntArrayRef dims, bool keepdim, ScalarType dtype1, - ScalarType dtype2) { + ScalarType /*dtype2*/) { int64_t ndim = self.dim(); auto mask = at::native::make_dim_mask(dims, ndim); auto viewed_result1 = at::native::review_reduce_result(result1, ndim, mask, keepdim); diff --git a/aten/src/ATen/native/ReflectionPad.cpp b/aten/src/ATen/native/ReflectionPad.cpp index 81eba80af1dd..d90d00e9ab40 100644 --- a/aten/src/ATen/native/ReflectionPad.cpp +++ b/aten/src/ATen/native/ReflectionPad.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include namespace at { @@ -266,76 +267,43 @@ inline void reflection_pad1d_out_loop( void reflection_pad1d_out_template( const Tensor& output, const Tensor& input_, IntArrayRef padding) { - int64_t dim_plane = 0; - int64_t dim_w = 1; - int64_t nbatch = 1; - // allow dim=0 only in the batch dimension. - TORCH_CHECK( - (input_.ndimension() == 2 && input_.size(1) != 0) || - (input_.ndimension() == 3 && input_.size(1) != 0 && input_.size(2) != 0), - "2D or 3D (batch mode) tensor expected for input, but got: ", input_); - - if (input_.ndimension() == 3) { - nbatch = input_.size(0); - dim_w++; - dim_plane++; - } - - /* sizes */ - auto pad_l = padding[0]; - auto pad_r = padding[1]; - - int64_t nplane = input_.size(dim_plane); - int64_t input_w = input_.size(dim_w); - int64_t output_w = input_w + pad_l + pad_r; - - TORCH_CHECK(pad_l < input_w && pad_r < input_w, "Argument #4: Padding size " - "should be less than the corresponding input dimension, but got: padding (", - pad_l, ", ", pad_r, ") at dimension ", dim_w, " of input ", input_.sizes()); - - TORCH_CHECK(output_w >= 1 , 2, - "input (W: ", input_w, ")is too small. 
Calculated output W: ", output_w); - /* get contiguous input */ Tensor input = input_.contiguous(); - /* resize output */ if (input.ndimension() == 2) { - output.resize_({nplane, output_w}); if (input.is_quantized()) { AT_DISPATCH_QINT_TYPES(input.scalar_type(), "qreflection_pad1d", [&]() { reflection_pad1d_out_frame( input.data_ptr(), output.data_ptr(), - nplane, - input_w, output_w, - pad_l); + input.size(0), + input.size(1), output.size(-1), + padding[0]); }); } else { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(input.scalar_type(), "reflection_pad1d", [&] { reflection_pad1d_out_frame( input.data_ptr(), output.data_ptr(), - nplane, - input_w, output_w, - pad_l); + input.size(0), + input.size(1), output.size(-1), + padding[0]); }); } } else { - output.resize_({nbatch, nplane, output_w}); if (input.is_quantized()) { AT_DISPATCH_QINT_TYPES(input.scalar_type(), "qreflection_pad1d", [&]() { reflection_pad1d_out_loop( input.data_ptr(), output.data_ptr(), - nbatch, nplane, - input_w, output_w, - pad_l); + output.size(0), input.size(1), + input.size(2), output.size(-1), + padding[0]); }); } else { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(input.scalar_type(), "reflection_pad1d", [&] { reflection_pad1d_out_loop( input.data_ptr(), output.data_ptr(), - nbatch, nplane, - input_w, output_w, - pad_l); + output.size(0), input.size(1), + input.size(2), output.size(-1), + padding[0]); }); } } @@ -854,25 +822,18 @@ static void reflection_pad3d_backward_out_loop( } // namespace +// TODO: I tihnk this function should be removed since we implement it with +// TORCH_IMPL_FUNC below Tensor& reflection_pad1d_out_cpu(const Tensor& input, IntArrayRef padding, Tensor& output) { reflection_pad1d_out_template(output, input, padding); return output; } -Tensor reflection_pad1d_cpu(const Tensor& input, IntArrayRef padding) { - Tensor output; - if (input.is_quantized()) { - if (input.qscheme() == kPerTensorAffine) { - output = at::_empty_affine_quantized({0}, input.options(), - input.q_scale(), - input.q_zero_point()); - } else { - TORCH_CHECK(false, "Only per tensor quantization is supported"); - } - } else { - output = at::empty({0}, input.options()); - } +Tensor& reflection_pad1d_out_quantized_cpu(const Tensor& input, IntArrayRef padding, + Tensor& output) { + TORCH_CHECK(input.qscheme() == kPerTensorAffine, "Only per tensor quantization is supported"); + set_quantizer_(output, make_per_tensor_affine_quantizer(input.q_scale(), input.q_zero_point(), input.scalar_type())); reflection_pad1d_out_template(output, input, padding); return output; } @@ -940,18 +901,16 @@ Tensor& reflection_pad2d_out_cpu(const Tensor& input, IntArrayRef padding, } Tensor reflection_pad2d_cpu(const Tensor& input, IntArrayRef padding) { - Tensor output; - if (input.is_quantized()) { - if (input.qscheme() == kPerTensorAffine) { - output = at::_empty_affine_quantized({0}, input.options(), + Tensor output = at::empty({0}, input.options()); + reflection_pad2d_out_template(output, input, padding); + return output; +} + +Tensor reflection_pad2d_quantized_cpu(const Tensor& input, IntArrayRef padding) { + TORCH_CHECK(input.qscheme() == kPerTensorAffine, "Only per tensor quantization is supported"); + Tensor output = at::_empty_affine_quantized({0}, input.options(), input.q_scale(), input.q_zero_point()); - } else { - TORCH_CHECK(false, "Only per tensor quantization is supported"); - } - } else { - output = at::empty({0}, input.options()); - } reflection_pad2d_out_template(output, input, padding); return output; } @@ -1007,7 +966,7 @@ 
TORCH_IMPL_FUNC(reflection_pad3d_out_cpu) if (batch_mode) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1( - kHalf, input.scalar_type(), "replication_pad3d_cpu", [&] { + kHalf, input.scalar_type(), "reflection_pad3d_cpu", [&] { auto input_data = input.data_ptr(); auto output_data = output.data_ptr(); auto nbatch = input.size(0); @@ -1028,7 +987,7 @@ TORCH_IMPL_FUNC(reflection_pad3d_out_cpu) }); } else { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1( - kHalf, input.scalar_type(), "replication_pad3d_cpu", [&] { + kHalf, input.scalar_type(), "reflection_pad3d_cpu", [&] { auto input_data = input.data_ptr(); auto output_data = output.data_ptr(); reflection_pad3d_out_frame( @@ -1085,7 +1044,7 @@ TORCH_IMPL_FUNC(reflection_pad3d_backward_out_cpu)(const Tensor& grad_output, if (batch_mode) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1( - kHalf, input.scalar_type(), "replication_pad3d_backward_cpu", [&] { + kHalf, input.scalar_type(), "reflection_pad3d_backward_cpu", [&] { reflection_pad3d_backward_out_loop( grad_input.data_ptr(), grad_output_.data_ptr(), @@ -1103,7 +1062,7 @@ TORCH_IMPL_FUNC(reflection_pad3d_backward_out_cpu)(const Tensor& grad_output, }); } else { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1( - kHalf, input.scalar_type(), "replication_pad3d_backward_cpu", [&] { + kHalf, input.scalar_type(), "reflection_pad3d_backward_cpu", [&] { reflection_pad3d_backward_out_frame( grad_input.data_ptr(), grad_output_.data_ptr(), diff --git a/aten/src/ATen/native/Repeat.h b/aten/src/ATen/native/Repeat.h index 9751f2ec8be7..dadbfb0c2374 100644 --- a/aten/src/ATen/native/Repeat.h +++ b/aten/src/ATen/native/Repeat.h @@ -1,6 +1,14 @@ #pragma once -#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif namespace at { namespace native { diff --git a/aten/src/ATen/native/ReplicationPadding.cpp b/aten/src/ATen/native/ReplicationPadding.cpp index d89150cee267..36339aae8445 100644 --- a/aten/src/ATen/native/ReplicationPadding.cpp +++ b/aten/src/ATen/native/ReplicationPadding.cpp @@ -230,7 +230,7 @@ static void replication_pad1d_out_frame( long nslices, long iwidth, long owidth, - int pad_l, int pad_r) + int pad_l) { int iStartX = std::max(0, -pad_l); int oStartX = std::max(0, pad_l); @@ -263,14 +263,14 @@ static void replication_pad1d_out_batch( long nslices, long iwidth, long owidth, - int pad_l, int pad_r, + int pad_l, int nbatch) { at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { for (const auto p : c10::irange(start, end)) { scalar_t *input_p = input_data+p*nslices*iwidth; scalar_t *output_p = output_data+p*nslices*owidth; - replication_pad1d_out_frame(input_p, output_p, nslices, iwidth, owidth, pad_l, pad_r); + replication_pad1d_out_frame(input_p, output_p, nslices, iwidth, owidth, pad_l); } }); } @@ -281,7 +281,7 @@ static void replication_pad1d_backward_out_frame( long nslices, long iwidth, long owidth, - int pad_l, int pad_r) + int pad_l) { int iStartX = std::max(0, -pad_l); int oStartX = std::max(0, pad_l); @@ -322,7 +322,7 @@ static void replication_pad1d_backward_out_batch( scalar_t *ginput_p = ginput_data + p * nslices * iwidth; scalar_t *goutput_p = goutput_data + p * nslices * owidth; replication_pad1d_backward_out_frame(ginput_p, goutput_p, - nslices, iwidth, owidth, pad_l, pad_r); + nslices, iwidth, owidth, pad_l); } }); } @@ -334,7 +334,7 @@ static void replication_pad2d_out_frame( int64_t iwidth, int64_t iheight, int64_t owidth, int64_t oheight, int pad_l, int pad_r, - int pad_t, int pad_b) + int pad_t) { int iStartX = 
std::max(0, -pad_l); int iStartY = std::max(0, -pad_t); @@ -381,7 +381,7 @@ static void replication_pad2d_out_batch( int64_t iwidth, int64_t iheight, int64_t owidth, int64_t oheight, int pad_l, int pad_r, - int pad_t, int pad_b, + int pad_t, int nbatch) { at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { @@ -389,7 +389,7 @@ static void replication_pad2d_out_batch( scalar_t *input_p = input_data+p*nslices*iwidth*iheight; scalar_t *output_p = output_data+p*nslices*owidth*oheight; replication_pad2d_out_frame(input_p, output_p, nslices, - iwidth, iheight, owidth, oheight, pad_l, pad_r, pad_t, pad_b); + iwidth, iheight, owidth, oheight, pad_l, pad_r, pad_t); } }); } @@ -811,7 +811,6 @@ TORCH_IMPL_FUNC(replication_pad1d_out_cpu) ( constexpr int64_t dimslices = -2; int64_t pad_l = paddingSize[0]; - int64_t pad_r = paddingSize[1]; /* get contiguous input */ auto input = input_.contiguous(); @@ -837,7 +836,7 @@ TORCH_IMPL_FUNC(replication_pad1d_out_cpu) ( nslices, iwidth, owidth, - pad_l, pad_r); + pad_l); } ); } @@ -852,7 +851,7 @@ TORCH_IMPL_FUNC(replication_pad1d_out_cpu) ( nslices, iwidth, owidth, - pad_l, pad_r, + pad_l, nbatch); } ); @@ -907,7 +906,7 @@ TORCH_IMPL_FUNC(replication_pad1d_backward_out_cpu) ( nslices, iwidth, owidth, - pad_l, pad_r); + pad_l); } ); } @@ -969,7 +968,7 @@ TORCH_IMPL_FUNC(replication_pad2d_out_cpu) ( iwidth, iheight, owidth, oheight, pad_l, pad_r, - pad_t, pad_b); + pad_t); } ); } @@ -983,7 +982,7 @@ TORCH_IMPL_FUNC(replication_pad2d_out_cpu) ( iwidth, iheight, owidth, oheight, pad_l, pad_r, - pad_t, pad_b, + pad_t, nbatch); } ); diff --git a/aten/src/ATen/native/Resize.cpp b/aten/src/ATen/native/Resize.cpp index f05a18b7806d..08286f3983cc 100644 --- a/aten/src/ATen/native/Resize.cpp +++ b/aten/src/ATen/native/Resize.cpp @@ -17,7 +17,7 @@ bool resize_output_check(const Tensor& output, IntArrayRef shape) { TORCH_WARN( "An output with one or more elements was resized since it had ", "shape ", output.sizes(), ", which does not match the required ", - "output shape ", shape, ".", + "output shape ", shape, ". ", "This behavior is deprecated, and in a future PyTorch release outputs ", "will not be resized unless they have zero elements. 
You can explicitly ", "reuse an out tensor t by resizing it, inplace, to zero elements with ", @@ -45,6 +45,12 @@ bool resize_output(const Tensor& output, IntArrayRef shape) { } } +const Tensor& _resize_output_(const Tensor& self, IntArrayRef shape, c10::Device device) { + TORCH_CHECK(self.device() == device, "out Tensor doesn't have the correct device set"); + at::native::resize_output(self, shape); + return self; +} + void resize_bytes_cpu(StorageImpl* storage, size_t size_bytes) { TORCH_CHECK(storage->resizable(), "Trying to resize storage that is not resizable"); diff --git a/aten/src/ATen/native/Resize.h b/aten/src/ATen/native/Resize.h index 3540ef8b21ac..c6fe2b3d2146 100644 --- a/aten/src/ATen/native/Resize.h +++ b/aten/src/ATen/native/Resize.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -30,22 +31,16 @@ TORCH_API bool resize_output_check(const Tensor& output, IntArrayRef shape); TORCH_API void resize_bytes_cpu(StorageImpl* storage, size_t size_bytes); -static inline void maybe_resize_storage_cpu(TensorImpl* self, uint64_t new_size) { +static inline void maybe_resize_storage_cpu(TensorImpl* self, size_t new_size_bytes) { // It does not make sense to try to resize a storage // to hold 0 elements, and this can break // if storage_offset is positive but // new_size is 0, so just bail in that case // (same comment is in cuda/Resize.h) - if (new_size == 0) { + if (self->numel() == 0) { return; } - const auto new_size_bytes_i = - (new_size + self->storage_offset()) * self->dtype().itemsize(); - TORCH_CHECK(!overflows(new_size_bytes_i), "Requested storage size (", - new_size_bytes_i, ") cannot be represented as a size_t"); - const auto new_size_bytes = static_cast(new_size_bytes_i); - const Storage& storage = self->unsafe_storage(); if (!storage) { auto new_storage = c10::make_intrusive( @@ -62,21 +57,25 @@ static inline void maybe_resize_storage_cpu(TensorImpl* self, uint64_t new_size) inline TensorImpl* resize_impl_cpu_( TensorImpl* self, IntArrayRef size, - c10::optional stride, + at::OptionalIntArrayRef stride, bool resize_storage = true) { - if (self->sizes() == size && (!stride || self->strides() == stride)) { + if (self->sizes() == size && (!stride || self->strides() == stride.value())) { return self; } - int64_t storage_size = 1; + const auto itemsize = self->dtype().itemsize(); + const auto storage_offset = self->storage_offset(); + size_t storage_size = 1; if (stride) { self->set_sizes_and_strides(size, *stride); - // NB: storage size can be different from numel. 
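The storage-size computation just below replaces raw element counts with at::detail::computeStorageNbytes, which folds the strides, the storage offset and the item size into a byte count. A minimal sketch of that arithmetic, with an illustrative helper name and simplified zero-size handling rather than the actual ATen implementation:

#include <cstddef>
#include <cstdint>
#include <vector>

// Largest reachable element is storage_offset + sum((size[d] - 1) * stride[d]),
// so the storage must hold that index plus one, times the element size.
static size_t sketch_storage_nbytes(const std::vector<int64_t>& sizes,
                                    const std::vector<int64_t>& strides,
                                    size_t itemsize,
                                    int64_t storage_offset) {
  int64_t max_index = storage_offset;
  for (size_t d = 0; d < sizes.size(); ++d) {
    if (sizes[d] == 0) {
      return 0;  // an empty tensor needs no storage at all
    }
    max_index += (sizes[d] - 1) * strides[d];
  }
  return static_cast<size_t>(max_index + 1) * itemsize;
}

// e.g. a float tensor viewed with sizes {2, 3}, strides {10, 1} and offset 4
// needs (4 + 1 * 10 + 2 * 1 + 1) * 4 = 68 bytes of backing storage.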
- storage_size = storage_size_for(size, *stride); + storage_size = at::detail::computeStorageNbytes( + size, *stride, itemsize, storage_offset); } else { self->set_sizes_contiguous(size); - storage_size = self->numel(); + storage_size = at::detail::computeStorageNbytesContiguous( + size, itemsize, storage_offset); } + if (resize_storage) { maybe_resize_storage_cpu(self, storage_size); } @@ -158,6 +157,12 @@ inline void setStrided( IntArrayRef stride, int64_t storage_offset) { TORCH_CHECK(size.size() == stride.size(), "mismatch in length of strides and shape"); + for (auto val : stride) { + TORCH_CHECK(val >= 0, + "as_strided: Negative strides are not supported at the moment, " + "got strides: ", stride); + } + auto* self_ = self.unsafeGetTensorImpl(); checkInBoundsForStorage( size, stride, storage_offset, self_->dtype(), self_->storage()); @@ -170,11 +175,6 @@ inline void setStrided( if (self_->sizes() == size && self_->strides() == stride) { return; } - for (auto val : stride) { - TORCH_CHECK(val >= 0, - "as_strided: Negative strides are not supported at the moment, " - "got strides: ", stride); - } self_->set_sizes_and_strides(size, stride); } diff --git a/aten/src/ATen/native/Scalar.cpp b/aten/src/ATen/native/Scalar.cpp index aecfffadb020..7342c4806d44 100644 --- a/aten/src/ATen/native/Scalar.cpp +++ b/aten/src/ATen/native/Scalar.cpp @@ -20,8 +20,8 @@ Scalar item(const Tensor& self) { Scalar _local_scalar_dense_cpu(const Tensor& self) { Scalar r; - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( - at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, self.scalar_type(), "_local_scalar_dense_cpu", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( + kComplexHalf, kHalf, kBool, kBFloat16, self.scalar_type(), "_local_scalar_dense_cpu", [&] { scalar_t value = *self.data_ptr(); r = Scalar(value); }); diff --git a/aten/src/ATen/native/ScatterGatherChecks.h b/aten/src/ATen/native/ScatterGatherChecks.h index 1b71eb40975d..92e1edeb5fe0 100644 --- a/aten/src/ATen/native/ScatterGatherChecks.h +++ b/aten/src/ATen/native/ScatterGatherChecks.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include #include #include diff --git a/aten/src/ATen/native/SegmentReduce.h b/aten/src/ATen/native/SegmentReduce.h index 11a399ae77a1..1e5b87eefb6d 100644 --- a/aten/src/ATen/native/SegmentReduce.h +++ b/aten/src/ATen/native/SegmentReduce.h @@ -1,10 +1,12 @@ #pragma once -#include #include +#include #include namespace at { +class Tensor; + namespace native { enum SegmentReductionType { MAX, MEAN, MIN, SUM }; diff --git a/aten/src/ATen/native/SharedReduceOps.h b/aten/src/ATen/native/SharedReduceOps.h index cd58d4f48ee8..0519bfa57e61 100644 --- a/aten/src/ATen/native/SharedReduceOps.h +++ b/aten/src/ATen/native/SharedReduceOps.h @@ -344,17 +344,17 @@ template struct AbsSwitch {}; template -inline C10_DEVICE acc_t abs_if_complex(scalar_t data, AbsSwitch s) { +inline C10_DEVICE acc_t abs_if_complex(scalar_t data, AbsSwitch) { return static_cast(data); } template -inline C10_DEVICE acc_t abs_if_complex(std::complex data, AbsSwitch s) { +inline C10_DEVICE acc_t abs_if_complex(std::complex data, AbsSwitch) { return static_cast(std::abs(data)); } template -inline C10_DEVICE acc_t abs_if_complex(c10::complex data, AbsSwitch s) { +inline C10_DEVICE acc_t abs_if_complex(c10::complex data, AbsSwitch) { return static_cast(std::abs(data)); } diff --git a/aten/src/ATen/native/SoftMax.cpp b/aten/src/ATen/native/SoftMax.cpp index b4635365e432..6d9f1324eb28 100644 --- a/aten/src/ATen/native/SoftMax.cpp +++ 
b/aten/src/ATen/native/SoftMax.cpp @@ -9,6 +9,7 @@ #include #include +#include #include namespace at { @@ -148,7 +149,7 @@ void host_softmax( int64_t grain_size = std::min(internal::GRAIN_SIZE / dim_size, (int64_t)1); parallel_for( 0, outer_size * inner_size, grain_size, - [&](int64_t begin, int64_t end) { + [&](int64_t begin, int64_t end) __ubsan_ignore_float_divide_by_zero__ { for (const auto i : c10::irange(begin, end)) { int64_t outer_idx = i / inner_size; int64_t inner_idx = i % inner_size; @@ -170,7 +171,7 @@ void host_softmax( } } else { for (const auto d : c10::irange(0, dim_size)) { - if (mask_data[d * dim_stride]) { + if (!mask_data[d * dim_stride]) { max_input = is_meaningful_max ? std::max(max_input, input_data[d * dim_stride]) : input_data[d * dim_stride]; @@ -183,7 +184,7 @@ void host_softmax( acc_type tmpsum = 0; for (const auto d : c10::irange(dim_size)) { scalar_t z{}; - if (!MaskedSoftMax || mask_data[d * dim_stride]) { + if (!MaskedSoftMax || !mask_data[d * dim_stride]) { z = std::exp(input_data[d * dim_stride] - max_input); } else { z = 0; @@ -196,6 +197,8 @@ void host_softmax( if (LogSoftMax) { tmpsum = std::log(tmpsum); + } else if (tmpsum == 0) { + tmpsum = std::numeric_limits::quiet_NaN(); } else { tmpsum = 1 / tmpsum; } @@ -214,12 +217,13 @@ void host_softmax( }); } -template +template void host_softmax_backward( const Tensor& gI, const Tensor& grad, const Tensor& output, - int64_t dim) { + int64_t dim, + bool* mask = nullptr) { int64_t outer_size = 1; int64_t dim_size = grad.size(dim); @@ -235,6 +239,7 @@ void host_softmax_backward( scalar_t* gradInput_data_base = gI.data_ptr(); scalar_t* output_data_base = output.data_ptr(); scalar_t* gradOutput_data_base = grad.data_ptr(); + bool* mask_data_base = mask; int64_t grain_size = std::min(internal::GRAIN_SIZE / dim_size, (int64_t)1); parallel_for( 0, outer_size * inner_size, grain_size, [&](int64_t begin, int64_t end) { @@ -247,19 +252,28 @@ void host_softmax_backward( output_data_base + outer_idx * outer_stride + inner_idx; const scalar_t* gradOutput_data = gradOutput_data_base + outer_idx * outer_stride + inner_idx; + bool* mask_data = nullptr; + if (MaskedSoftMax) { + mask_data = mask_data_base + outer_idx * outer_stride + inner_idx; + } acc_type sum = 0; for (const auto d : c10::irange(dim_size)) { - if (LogSoftMax) { - sum += gradOutput_data[d * dim_stride]; - } else { - sum += - gradOutput_data[d * dim_stride] * output_data[d * dim_stride]; + if (!MaskedSoftMax || !mask_data[d * dim_stride]) { + if (LogSoftMax) { + sum += gradOutput_data[d * dim_stride]; + } else { + sum += + gradOutput_data[d * dim_stride] * output_data[d * dim_stride]; + } } } for (const auto d : c10::irange(dim_size)) { - if (LogSoftMax) { + if (MaskedSoftMax && mask_data[d * dim_stride]) { + gradInput_data[d * dim_stride] = 0; + } + else if (LogSoftMax) { gradInput_data[d * dim_stride] = gradOutput_data[d * dim_stride] - std::exp(output_data[d * dim_stride]) * sum; } else { @@ -360,7 +374,10 @@ TORCH_IMPL_FUNC(softmax_backward_cpu_out) } else { AT_DISPATCH_FLOATING_TYPES_AND( at::ScalarType::BFloat16, grad.scalar_type(), "softmax_backward", [&] { - host_softmax_backward(grad_input, grad_, output, dim_); + host_softmax_backward< + scalar_t, + false /* LogSoftMax */, + false /* MaskedSoftmax */>(grad_input, grad_, output, dim_); }); } } @@ -389,7 +406,10 @@ TORCH_IMPL_FUNC(log_softmax_backward_cpu_out) ( grad.scalar_type(), "log_softmax_backward", [&] { - host_softmax_backward(grad_input, grad_, output_, dim_); + host_softmax_backward< + 
scalar_t, + true /* LogSoftMax */, + false /* MaskedSoftMax */>(grad_input, grad_, output_, dim_); }); } } @@ -418,6 +438,43 @@ Tensor softmax(const Tensor& input_, const int64_t dim_, c10::optional dtype, + Tensor& output_) { + Tensor output_temp; + if (input_.is_cuda() && input_.scalar_type() == ScalarType::Half && + dtype == ScalarType::Float) { + if (!output_.is_contiguous()) { + auto options = + TensorOptions().dtype(output_.dtype()).device(output_.device()); + output_temp = at::empty(output_.sizes(), options); + at::_softmax_out(output_temp, input_, dim_, true); + } else { + at::_softmax_out(output_, input_, dim_, true); + } + } else { + Tensor converted = + dtype.has_value() ? input_.toType(dtype.value()) : input_; + if (!output_.is_contiguous()) { + auto options = + TensorOptions().dtype(output_.dtype()).device(output_.device()); + output_temp = at::empty(output_.sizes(), options); + at::_softmax_out(output_temp, converted, dim_, false); + } else { + at::_softmax_out(output_, converted, dim_, false); + } + } + + if (!output_.is_contiguous()) { + output_.resize_(output_temp.sizes()); + output_.copy_(output_temp); + } + + return output_; +} + // special_softmax, alias for softmax Tensor special_softmax(const Tensor& input_, const int64_t dim_, c10::optional dtype) { return at::softmax(input_, dim_, dtype); @@ -446,6 +503,43 @@ Tensor log_softmax(const Tensor& input_, const int64_t dim_, c10::optional dtype, + Tensor& output_) { + Tensor output_temp; + if (input_.is_cuda() && input_.scalar_type() == ScalarType::Half && + dtype == ScalarType::Float) { + if (!output_.is_contiguous()) { + auto options = + TensorOptions().dtype(output_.dtype()).device(output_.device()); + output_temp = at::empty(output_.sizes(), options); + at::_log_softmax_out(output_temp, input_, dim_, true); + } else { + at::_log_softmax_out(output_, input_, dim_, true); + } + } else { + Tensor converted = + dtype.has_value() ? input_.toType(dtype.value()) : input_; + if (!output_.is_contiguous()) { + auto options = + TensorOptions().dtype(output_.dtype()).device(output_.device()); + output_temp = at::empty(output_.sizes(), options); + at::_log_softmax_out(output_temp, converted, dim_, false); + } else { + at::_log_softmax_out(output_, converted, dim_, false); + } + } + + if (!output_.is_contiguous()) { + output_.resize_(output_temp.sizes()); + output_.copy_(output_temp); + } + + return output_; +} + Tensor special_log_softmax(const Tensor& input, const int64_t dim, c10::optional dtype) { return at::log_softmax(input, dim, dtype); } @@ -466,23 +560,64 @@ Tensor log_softmax(const Tensor& self, Dimname dim, optional dtype) return at::log_softmax(self, dimname_to_position(self, dim), dtype); } -Tensor masked_softmax_cpu(const Tensor& input, const Tensor& mask) { - Tensor output = at::empty_like(input, input.options()); +Tensor masked_softmax_cpu(const Tensor& input_, const Tensor& mask_, const c10::optional dim_) { TORCH_CHECK( - input.sizes() == mask.sizes(), "Mask shape should match input shape"); - TORCH_CHECK(mask.is_contiguous(), "Mask should always be contiguous"); + input_.sizes() == mask_.sizes(), "Mask shape should match input shape"); TORCH_CHECK( - mask.scalar_type() == ScalarType::Bool, + mask_.scalar_type() == ScalarType::Bool, "Mask should be a boolean tensor"); + + Tensor output = at::empty_like(input_, input_.options()); + auto input = input_.contiguous(); + auto mask = mask_.contiguous(); + int64_t dim = dim_.has_value() ? 
dim_.value() : input.dim() - 1; + dim = maybe_wrap_dim(dim, input_.dim()); + + if (input.dim() == 0) { + input = input.view(1); + } + AT_DISPATCH_FLOATING_TYPES_AND( - at::ScalarType::BFloat16, input.scalar_type(), "log_softmax", [&] { + at::ScalarType::BFloat16, input.scalar_type(), "masked_softmax", [&] { host_softmax< scalar_t, false /* LogSoftMax */, true /* MaskedSoftMax */>( - output, input, input.dim() - 1, mask.data_ptr()); + output, input, dim, mask.data_ptr()); }); return output; } + +Tensor masked_softmax_backward_cpu( + const Tensor& grad_, + const Tensor& output_, + const Tensor& mask_, + const c10::optional dim_) { + TORCH_CHECK( + grad_.sizes() == mask_.sizes(), "Mask shape should match grad shape"); + TORCH_CHECK( + mask_.scalar_type() == ScalarType::Bool, + "Mask should be a boolean tensor"); + auto grad = grad_.contiguous(); + auto output = output_.contiguous(); + auto mask = mask_.contiguous(); + + int64_t dim = dim_.has_value() ? dim_.value() : output.dim() - 1; + dim = maybe_wrap_dim(dim, grad.dim()); + + grad = grad.dim() == 0 ? grad.view(1) : grad; + output = output.dim() == 0 ? output.view(1) : output; + mask = mask.dim() == 0 ? mask.view(1) : mask; + + Tensor grad_input = at::empty_like(grad, grad.options()); + AT_DISPATCH_FLOATING_TYPES_AND( + at::ScalarType::BFloat16, grad.scalar_type(), "masked_softmax_backward", [&] { + host_softmax_backward< + scalar_t, + false /* LogSoftMax */, + true /* MaskedSoftmax */>(grad_input, grad, output, dim, mask.data_ptr()); + }); + return grad_input; +} } } diff --git a/aten/src/ATen/native/Sorting.cpp b/aten/src/ATen/native/Sorting.cpp index ae88547a8aa9..e99fd75467b6 100644 --- a/aten/src/ATen/native/Sorting.cpp +++ b/aten/src/ATen/native/Sorting.cpp @@ -14,30 +14,46 @@ namespace at { namespace meta { + using namespace native; - TORCH_META_FUNC(topk) ( - const Tensor& self, - int64_t k, - int64_t dim_, - bool largest, - bool sorted) { - int64_t dim = maybe_wrap_dim(dim_, self.dim(), /*wrap_scalar=*/true); - TORCH_CHECK( - k >= 0 && k <= (self.dim() > 0 ? self.size(dim) : 1), - "selected index k out of range"); - int64_t sliceSize = self.dim() == 0 ? 1 : self.size(dim); - TORCH_CHECK(k >= 0 && k <= sliceSize, "k not in range for dimension"); - - // Build the output size, which is the dim being selected set to - // size k - DimVector topKSize(self.sizes().vec()); - if (topKSize.size() > 0) { - topKSize[dim] = k; - } - set_output(0, topKSize, self.options()); - set_output(1, topKSize, self.options().dtype(at::kLong)); +TORCH_META_FUNC(topk) +(const Tensor& self, int64_t k, int64_t dim_, bool largest, bool sorted) { + int64_t dim = maybe_wrap_dim(dim_, self.dim(), /*wrap_scalar=*/true); + TORCH_CHECK( + k >= 0 && k <= (self.dim() > 0 ? self.size(dim) : 1), + "selected index k out of range"); + int64_t sliceSize = self.dim() == 0 ? 
1 : self.size(dim); + TORCH_CHECK(k >= 0 && k <= sliceSize, "k not in range for dimension"); + + // Build the output size, which is the dim being selected set to + // size k + DimVector topKSize(self.sizes().vec()); + if (topKSize.size() > 0) { + topKSize[dim] = k; } + set_output(0, topKSize, self.options()); + set_output(1, topKSize, self.options().dtype(at::kLong)); +} + +TORCH_META_FUNC2(sort, stable) +(const Tensor& self, c10::optional stable, int64_t dim, bool descending) { + TORCH_INTERNAL_ASSERT( + stable.has_value(), + "sort(): c10::optional for stable has to have value."); + maybe_wrap_dim(dim, self.dim()); + + // See issue: https://github.com/pytorch/pytorch/issues/65863 + // Strides should be dense, so as not to allocate too much memory. + // We either use 'self' strides, or infer dense strides from them. + std::vector strides = (self.is_non_overlapping_and_dense()) + ? self.strides().vec() + : at::infer_dense_strides(self.sizes(), self.strides()); + + set_output(0, self.sizes(), strides, self.options(), {}); + set_output(1, self.sizes(), strides, self.options().dtype(kLong), {}); +} + } // namespace meta namespace native { @@ -45,6 +61,19 @@ namespace native { DEFINE_DISPATCH(sort_stub); DEFINE_DISPATCH(topk_stub); +void _fill_indices(const TensorBase &indices, int64_t dim) { + auto ndim = indices.dim(); + assert(0 <= dim && dim < ndim); + auto dim_size = indices.size(dim); + auto idx_dim = at::arange(0, dim_size, indices.options().dtype(at::kLong)); + auto idx_dim_sizes = std::vector(ndim, 1); + auto idx_dim_strides = std::vector(ndim, 0); + idx_dim_sizes[dim] = dim_size; + idx_dim_strides[dim] = 1; + auto idx_dim_restrided = idx_dim.as_strided(idx_dim_sizes, idx_dim_strides); + OptionalTensorRef(indices)->copy_(idx_dim_restrided); +} + namespace { /* Note from TH: @@ -86,7 +115,7 @@ void quick_select_template( } // Use median of three for pivot choice - P = (L + R) >> 1; + P = L + (R - L) / 2; swap_fn(P, L + 1); if (gt_or_nan(arr[L + 1], arr[R])) { swap_fn(L + 1, R); @@ -852,52 +881,37 @@ Tensor nanmedian_cpu(const Tensor& self) { return median_impl(self, /*ignore_nan=*/true); } -std::tuple sort_out_cpu_stable(const Tensor& self, - c10::optional stable, - int64_t dim, - bool descending, - Tensor& values, - Tensor& indices) { - values.resize_(self.sizes()).copy_(self); - indices.resize_(self.sizes()); - +TORCH_IMPL_FUNC(sort_stable_out) +(const Tensor& self, + c10::optional stable, + int64_t dim, + bool descending, + const Tensor& values, + const Tensor& indices) { + values.copy_(self); // check if self is scalar if (self.dim() == 0 && self.numel() == 1) { indices.zero_(); - return std::forward_as_tuple(values, indices); + } else { + dim = maybe_wrap_dim(dim, self.dim()); + sort_stub(self.device().type(), self, values, indices, dim, descending, stable.value()); } - - TORCH_INTERNAL_ASSERT(stable.has_value(), "sort_out(): c10::optional for stable has to have value."); - sort_stub(kCPU, values, indices, dim, descending, stable.value()); - - return std::forward_as_tuple(values, indices); } -std::tuple sort_out_cpu(const Tensor& self, +std::tuple sort_out( + const Tensor& self, int64_t dim, bool descending, Tensor& values, Tensor& indices) { - return at::native::sort_out_cpu_stable( - self, /*stable=*/false, dim, descending, values, indices); -} - -std::tuple sort_cpu_stable( - const Tensor& self, - c10::optional stable, - int64_t dim, - bool descending) { - TORCH_CHECK(!self.is_complex(), "sort(): input tensor must be of non-complex type"); - Tensor values = at::empty({0}, 
self.options()); - Tensor indices = at::empty({0}, self.options().dtype(kLong)); - return at::native::sort_out_cpu_stable(self, stable, dim, descending, values, indices); + return at::sort_out(values, indices, self, false, dim, descending); } -std::tuple sort_cpu( +std::tuple sort( const Tensor& self, int64_t dim, bool descending) { - return sort_cpu_stable(self, /*stable=*/false, dim, descending); + return at::sort(self, false, dim, descending); } Tensor& msort_out(const Tensor& self, Tensor& values) { diff --git a/aten/src/ATen/native/Sorting.h b/aten/src/ATen/native/Sorting.h index edfc583a50bf..627ee4521150 100644 --- a/aten/src/ATen/native/Sorting.h +++ b/aten/src/ATen/native/Sorting.h @@ -1,7 +1,11 @@ #pragma once -#include #include +#include + +namespace at { +class TensorBase; +} namespace at { namespace native { @@ -14,11 +18,13 @@ enum class QUANTILE_INTERPOLATION_MODE : uint8_t { NEAREST }; -using sort_fn = void(*)(Tensor& values, Tensor& indices, int64_t dim, bool descending, bool stable); -using topk_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, int64_t, int64_t, bool, bool); +using sort_fn = void(*)(const TensorBase&, const TensorBase&, const TensorBase&, int64_t, bool, bool); +using topk_fn = void(*)(const TensorBase&, const TensorBase&, const TensorBase&, int64_t, int64_t, bool, bool); DECLARE_DISPATCH(sort_fn, sort_stub); DECLARE_DISPATCH(topk_fn, topk_stub); +void _fill_indices(const TensorBase &indices, int64_t dim); + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/SortingUtils.h b/aten/src/ATen/native/SortingUtils.h index f3d8805a3526..f6065927eba4 100644 --- a/aten/src/ATen/native/SortingUtils.h +++ b/aten/src/ATen/native/SortingUtils.h @@ -86,92 +86,5 @@ inline void _allocate_or_resize_output_with_indices( } } - -#ifdef CPU_CAPABILITY -inline namespace CPU_CAPABILITY { -#else -inline namespace DEFAULT { -#endif - -// Core topk loop, shared between CPU and QuantizedCPU -template -void topk_impl_loop( - const int64_t mode_values_stride, - const int64_t mode_indices_stride, - const int64_t tmp_values_stride, - const int64_t k, - const int64_t dim_size, - const bool largest, - const bool sorted, - char** data, const int64_t* strides, const int64_t n) { - - using elem_t = std::pair; - std::vector queue(dim_size); - for (const auto i : c10::irange(n)) { - TensorAccessor mode_values( - reinterpret_cast(data[0] + i * strides[0]), - &k, &mode_values_stride); - TensorAccessor mode_indices( - reinterpret_cast(data[1] + i * strides[1]), - &k, &mode_indices_stride); - TensorAccessor tmp_values( - reinterpret_cast(data[2] + i * strides[2]), - &dim_size, &tmp_values_stride); - - auto n = dim_size; - auto use_partial_sort = k * 64 <= n; - - for (const auto j : c10::irange(n)) { - queue[j].first = tmp_values[j]; - queue[j].second = j; - } - - // we want nan to be sorted as top for numpy compatibility - if (use_partial_sort) { - if (largest) { - std::partial_sort(queue.begin(), queue.begin() + k, queue.end(), - [](const elem_t& x, const elem_t& y) -> bool { - return ((_isnan(x.first) && !_isnan(y.first)) || (x.first > y.first)); - }); - } else { - std::partial_sort(queue.begin(), queue.begin() + k, queue.end(), - [](const elem_t& x, const elem_t& y) -> bool { - return ((!_isnan(x.first) && _isnan(y.first)) || (x.first < y.first)); - }); - } - } else { - if (largest) { - std::nth_element(queue.begin(), queue.begin() + k - 1, queue.end(), - [](const elem_t& x, const elem_t& y) -> bool { - return ((_isnan(x.first) && !_isnan(y.first)) || (x.first > 
y.first)); - }); - if (sorted) { - std::sort(queue.begin(), queue.begin() + k - 1, - [](const elem_t& x, const elem_t& y) -> bool { - return ((_isnan(x.first) && !_isnan(y.first)) || (x.first > y.first)); - }); - } - } else { - std::nth_element(queue.begin(), queue.begin() + k -1, queue.end(), - [](const elem_t& x, const elem_t& y) -> bool { - return ((!_isnan(x.first) && _isnan(y.first)) || (x.first < y.first)); - }); - if (sorted) { - std::sort(queue.begin(), queue.begin() + k -1, - [](const elem_t& x, const elem_t& y) -> bool { - return ((!_isnan(x.first) && _isnan(y.first)) || (x.first < y.first)); - }); - } - } - } - - for (const auto j : c10::irange(k)) { - mode_values[j] = queue[j].first; - mode_indices[j] = queue[j].second; - } - } -} - -} // namespace CPU_CAPABILITY } // namespace native } // namespace at diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index 2f5789a8f387..9c0ebed7551a 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -18,7 +19,7 @@ namespace { // * Integers are promoted to the default floating type // * If require_complex=True, all types are promoted to complex // * Raises an error for half-precision dtypes to allow future support -ScalarType promote_type_fft(ScalarType type, bool require_complex) { +ScalarType promote_type_fft(ScalarType type, bool require_complex, Device device) { if (at::isComplexType(type)) { return type; } @@ -27,7 +28,11 @@ ScalarType promote_type_fft(ScalarType type, bool require_complex) { type = c10::typeMetaToScalarType(c10::get_default_dtype()); } - TORCH_CHECK(type == kFloat || type == kDouble, "Unsupported dtype ", type); + if (device.is_cuda() && !at::detail::getCUDAHooks().hasROCM()) { + TORCH_CHECK(type == kHalf || type == kFloat || type == kDouble, "Unsupported dtype ", type); + } else { + TORCH_CHECK(type == kFloat || type == kDouble, "Unsupported dtype ", type); + } if (!require_complex) { return type; @@ -35,6 +40,7 @@ ScalarType promote_type_fft(ScalarType type, bool require_complex) { // Promote to complex switch (type) { + case kHalf: return kComplexHalf; case kFloat: return kComplexFloat; case kDouble: return kComplexDouble; default: TORCH_INTERNAL_ASSERT(false, "Unhandled dtype"); @@ -44,7 +50,7 @@ ScalarType promote_type_fft(ScalarType type, bool require_complex) { // Promote a tensor's dtype according to promote_type_fft Tensor promote_tensor_fft(const Tensor& t, bool require_complex=false) { auto cur_type = t.scalar_type(); - auto new_type = promote_type_fft(cur_type, require_complex); + auto new_type = promote_type_fft(cur_type, require_complex, t.device()); return (cur_type == new_type) ? t : t.to(new_type); } @@ -218,7 +224,7 @@ struct ShapeAndDims { // Wraps dimensions and applies defaulting behavior. // Also checks transform dims are unique and transform shape is non-empty. 
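canonicalize_fft_shape_and_dim_args below supplies the defaulting behavior for the n-dimensional transforms once they accept at::OptionalIntArrayRef: omitting dim transforms every dimension, and s crops or zero-pads each transformed one. A small usage sketch, assuming the usual generated at::fft_* C++ bindings and their default arguments:

#include <ATen/ATen.h>
#include <vector>

void fft_shape_and_dim_sketch() {
  at::Tensor x = at::randn({4, 6});

  // No s or dim: transform both dimensions, output shape stays {4, 6}.
  at::Tensor full = at::fft_fftn(x);

  // Transform only the last dimension and zero-pad it to length 8 -> {4, 8}.
  std::vector<int64_t> s{8};
  std::vector<int64_t> dim{-1};
  at::Tensor padded = at::fft_fftn(x, s, dim);

  // For real input the one-sided transform halves the last transformed
  // dimension: at::fft_rfftn on {4, 6} input has shape {4, 6 / 2 + 1} = {4, 4}.
  at::Tensor onesided = at::fft_rfftn(x);
  (void)full; (void)padded; (void)onesided;
}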
ShapeAndDims canonicalize_fft_shape_and_dim_args( - Tensor input, c10::optional shape, c10::optional dim) { + Tensor input, at::OptionalIntArrayRef shape, at::OptionalIntArrayRef dim) { const int64_t input_dim = input.dim(); const IntArrayRef input_sizes = input.sizes(); ShapeAndDims ret; @@ -371,8 +377,8 @@ Tensor& fft_ihfft_out(const Tensor& self, c10::optional n, return out; } -Tensor fft_fftn(const Tensor& self, c10::optional s, - c10::optional dim, +Tensor fft_fftn(const Tensor& self, at::OptionalIntArrayRef s, + at::OptionalIntArrayRef dim, c10::optional norm) { auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); // TODO: For real input, perform rfftn then mirror with conjugate symmetry @@ -381,8 +387,8 @@ Tensor fft_fftn(const Tensor& self, c10::optional s, } Tensor& fft_fftn_out(const Tensor& self, - c10::optional s, - c10::optional dim, + at::OptionalIntArrayRef s, + at::OptionalIntArrayRef dim, c10::optional norm, Tensor& out) { auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); // TODO: For real input, perform rfftn then mirror with conjugate symmetry @@ -391,8 +397,8 @@ Tensor& fft_fftn_out(const Tensor& self, return out; } -Tensor fft_ifftn(const Tensor& self, c10::optional s, - c10::optional dim, +Tensor fft_ifftn(const Tensor& self, at::OptionalIntArrayRef s, + at::OptionalIntArrayRef dim, c10::optional norm) { auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); Tensor input = promote_tensor_fft(self, /*require_complex=*/true); @@ -400,8 +406,8 @@ Tensor fft_ifftn(const Tensor& self, c10::optional s, } Tensor& fft_ifftn_out(const Tensor& self, - c10::optional s, - c10::optional dim, + at::OptionalIntArrayRef s, + at::OptionalIntArrayRef dim, c10::optional norm, Tensor& out) { auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); Tensor input = promote_tensor_fft(self, /*require_complex=*/true); @@ -410,8 +416,8 @@ Tensor& fft_ifftn_out(const Tensor& self, } static Tensor fft_rfftn_impl(Tensor out, const Tensor& self, - c10::optional s, - c10::optional dim, + at::OptionalIntArrayRef s, + at::OptionalIntArrayRef dim, const c10::optional& norm_str) { TORCH_CHECK(!self.is_complex(), "rfftn expects a real-valued input tensor, but got ", self.scalar_type()); auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); @@ -423,15 +429,15 @@ static Tensor fft_rfftn_impl(Tensor out, const Tensor& self, return fft_r2c_maybe_out(fname, out, x, desc.dim, norm, /*onesided=*/true); } -Tensor fft_rfftn(const Tensor& self, c10::optional s, - c10::optional dim, +Tensor fft_rfftn(const Tensor& self, at::OptionalIntArrayRef s, + at::OptionalIntArrayRef dim, c10::optional norm_str) { return fft_rfftn_impl({}, self, s, dim, norm_str); } Tensor& fft_rfftn_out(const Tensor& self, - c10::optional s, - c10::optional dim, + at::OptionalIntArrayRef s, + at::OptionalIntArrayRef dim, c10::optional norm_str, Tensor& out) { fft_rfftn_impl(out, self, s, dim, norm_str); return out; @@ -439,12 +445,13 @@ Tensor& fft_rfftn_out(const Tensor& self, ShapeAndDims canonicalize_fft_c2r_shape_and_dim_args( c10::string_view fname, const Tensor& self, - const c10::optional& s, - const c10::optional& dims, + const at::OptionalIntArrayRef& s, + const at::OptionalIntArrayRef& dims, int64_t& last_dim_size) { auto desc = canonicalize_fft_shape_and_dim_args(self, s, dims); TORCH_CHECK(desc.shape.size() > 0, fname, " must transform at least one axis"); + // Expected output size of the hermitian-symmetric dimension last_dim_size = [&] { // Fixup default shape handling in the last 
dimension, if (!s.has_value() || (s->back() == -1)) { @@ -453,15 +460,16 @@ ShapeAndDims canonicalize_fft_c2r_shape_and_dim_args( } return desc.shape.back(); }(); - auto ld = last_dim_size / 2 + 1; - desc.shape.back() = ld; - TORCH_CHECK(ld >= 1, "Invalid number of data points (", last_dim_size, ") specified"); + TORCH_CHECK(last_dim_size >= 1, "Invalid number of data points (", last_dim_size, ") specified"); + + // Expected input size of the complex-hermitian data + desc.shape.back() = last_dim_size / 2 + 1; return desc; } static Tensor fft_irfftn_impl(Tensor out, const Tensor& self, - c10::optional s, - c10::optional dim, + at::OptionalIntArrayRef s, + at::OptionalIntArrayRef dim, const c10::optional& norm_str) { int64_t last_dim_size = 0; auto desc = canonicalize_fft_c2r_shape_and_dim_args( @@ -474,15 +482,15 @@ static Tensor fft_irfftn_impl(Tensor out, const Tensor& self, } Tensor fft_irfftn(const Tensor& self, - c10::optional s, - c10::optional dim, + at::OptionalIntArrayRef s, + at::OptionalIntArrayRef dim, c10::optional norm_str) { return fft_irfftn_impl({}, self, s, dim, norm_str); } Tensor& fft_irfftn_out(const Tensor& self, - c10::optional s, - c10::optional dim, + at::OptionalIntArrayRef s, + at::OptionalIntArrayRef dim, c10::optional norm_str, Tensor& out) { fft_irfftn_impl(out, self, s, dim, norm_str); return out; @@ -490,8 +498,8 @@ Tensor& fft_irfftn_out(const Tensor& self, static Tensor fft_hfftn_impl( const Tensor& self, - c10::optional s, - c10::optional dim, + at::OptionalIntArrayRef s, + at::OptionalIntArrayRef dim, c10::optional norm_str, const Tensor& out) { constexpr c10::string_view fname = "hfftn"; @@ -518,16 +526,16 @@ static Tensor fft_hfftn_impl( Tensor fft_hfftn( const Tensor& self, - c10::optional s, - c10::optional dim, + at::OptionalIntArrayRef s, + at::OptionalIntArrayRef dim, c10::optional norm) { return fft_hfftn_impl(self, s, dim, norm, {}); } const Tensor& fft_hfftn_out( const Tensor& self, - c10::optional s, - c10::optional dim, c10::optional norm, + at::OptionalIntArrayRef s, + at::OptionalIntArrayRef dim, c10::optional norm, const Tensor& out) { fft_hfftn_impl(self, s, dim, norm, out); return out; @@ -535,8 +543,8 @@ const Tensor& fft_hfftn_out( static Tensor fft_ihfftn_impl( const Tensor& self, - const c10::optional& s, - const c10::optional& dim, + const at::OptionalIntArrayRef& s, + const at::OptionalIntArrayRef& dim, const c10::optional& norm_str, const Tensor& out) { constexpr c10::string_view fname = "ihfftn"; @@ -560,80 +568,80 @@ static Tensor fft_ihfftn_impl( Tensor fft_ihfftn( const Tensor& self, - c10::optional s, - c10::optional dim, + at::OptionalIntArrayRef s, + at::OptionalIntArrayRef dim, c10::optional norm) { return fft_ihfftn_impl(self, s, dim, norm, {}); } const Tensor& fft_ihfftn_out( const Tensor& self, - c10::optional s, - c10::optional dim, + at::OptionalIntArrayRef s, + at::OptionalIntArrayRef dim, c10::optional norm, const Tensor& out) { fft_ihfftn_impl(self, s, dim, norm, out); return out; } -Tensor fft_fft2(const Tensor& self, c10::optional s, +Tensor fft_fft2(const Tensor& self, at::OptionalIntArrayRef s, IntArrayRef dim, c10::optional norm) { return native::fft_fftn(self, s, dim, std::move(norm)); } -Tensor& fft_fft2_out(const Tensor& self, c10::optional s, +Tensor& fft_fft2_out(const Tensor& self, at::OptionalIntArrayRef s, IntArrayRef dim, c10::optional norm, Tensor& out) { return native::fft_fftn_out(self, s, dim, std::move(norm), out); } -Tensor fft_ifft2(const Tensor& self, c10::optional s, +Tensor fft_ifft2(const 
Tensor& self, at::OptionalIntArrayRef s, IntArrayRef dim, c10::optional norm) { return native::fft_ifftn(self, s, dim, std::move(norm)); } -Tensor& fft_ifft2_out(const Tensor& self, c10::optional s, +Tensor& fft_ifft2_out(const Tensor& self, at::OptionalIntArrayRef s, IntArrayRef dim, c10::optional norm, Tensor& out) { return native::fft_ifftn_out(self, s, dim, std::move(norm), out); } -Tensor fft_rfft2(const Tensor& self, c10::optional s, +Tensor fft_rfft2(const Tensor& self, at::OptionalIntArrayRef s, IntArrayRef dim, c10::optional norm) { return native::fft_rfftn(self, s, dim, std::move(norm)); } -Tensor& fft_rfft2_out(const Tensor& self, c10::optional s, +Tensor& fft_rfft2_out(const Tensor& self, at::OptionalIntArrayRef s, IntArrayRef dim, c10::optional norm, Tensor& out) { return native::fft_rfftn_out(self, s, dim, std::move(norm), out); } -Tensor fft_irfft2(const Tensor& self, c10::optional s, +Tensor fft_irfft2(const Tensor& self, at::OptionalIntArrayRef s, IntArrayRef dim, c10::optional norm) { return native::fft_irfftn(self, s, dim, std::move(norm)); } -Tensor& fft_irfft2_out(const Tensor& self, c10::optional s, +Tensor& fft_irfft2_out(const Tensor& self, at::OptionalIntArrayRef s, IntArrayRef dim, c10::optional norm, Tensor& out) { return native::fft_irfftn_out(self, s, dim, std::move(norm), out); } const Tensor& fft_hfft2_out( - const Tensor& self, c10::optional s, IntArrayRef dim, + const Tensor& self, at::OptionalIntArrayRef s, IntArrayRef dim, c10::optional norm, const Tensor& out) { return native::fft_hfftn_out(self, s, dim, std::move(norm), out); } -Tensor fft_hfft2(const Tensor& self, c10::optional s, +Tensor fft_hfft2(const Tensor& self, at::OptionalIntArrayRef s, IntArrayRef dim, c10::optional norm) { return native::fft_hfftn(self, s, dim, std::move(norm)); } const Tensor& fft_ihfft2_out( - const Tensor& self, c10::optional s, IntArrayRef dim, + const Tensor& self, at::OptionalIntArrayRef s, IntArrayRef dim, c10::optional norm, const Tensor& out) { return native::fft_ihfftn_out(self, s, dim, std::move(norm), out); } -Tensor fft_ihfft2(const Tensor& self, c10::optional s, +Tensor fft_ihfft2(const Tensor& self, at::OptionalIntArrayRef s, IntArrayRef dim, c10::optional norm) { return native::fft_ihfftn(self, s, dim, std::move(norm)); } @@ -684,7 +692,7 @@ Tensor fft_rfftfreq(int64_t n, double d, // If an array dim is specified, wraps them according to self.dim(). // Otherwise returns a vector of all dims. -DimVector default_alldims(const Tensor& self, c10::optional dim_opt) { +DimVector default_alldims(const Tensor& self, at::OptionalIntArrayRef dim_opt) { DimVector dim; if (dim_opt) { IntArrayRef dim_unwrapped = *dim_opt; @@ -699,7 +707,7 @@ DimVector default_alldims(const Tensor& self, c10::optional dim_opt return dim; } -Tensor fft_fftshift(const Tensor& x, c10::optional dim_opt) { +Tensor fft_fftshift(const Tensor& x, at::OptionalIntArrayRef dim_opt) { auto dim = default_alldims(x, dim_opt); IntArrayRef x_sizes = x.sizes(); @@ -711,7 +719,7 @@ Tensor fft_fftshift(const Tensor& x, c10::optional dim_opt) { return at::roll(x, shift, dim); } -Tensor fft_ifftshift(const Tensor& x, c10::optional dim_opt) { +Tensor fft_ifftshift(const Tensor& x, at::OptionalIntArrayRef dim_opt) { auto dim = default_alldims(x, dim_opt); IntArrayRef x_sizes = x.sizes(); @@ -756,14 +764,11 @@ static Stream& write_opt(Stream& SS, const optional& value) { * * This is modeled after librosa but with support for complex time-domain * signals and complex windows. 
- * - * NOTE: librosa's center and pad_mode arguments are currently only implemented - * in python because it uses torch.nn.functional.pad which is python-only. */ Tensor stft(const Tensor& self, const int64_t n_fft, const optional hop_lengthOpt, const optional win_lengthOpt, const c10::optional& window_opt, - const bool normalized, const optional onesidedOpt, - const optional return_complexOpt) { + const bool center, c10::string_view mode, const bool normalized, + const optional onesidedOpt, const optional return_complexOpt) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned window_maybe_owned = at::borrow_from_optional_tensor(window_opt); const Tensor& window = *window_maybe_owned; @@ -821,6 +826,19 @@ Tensor stft(const Tensor& self, const int64_t n_fft, const optional hop if (self.dim() == 1) { input = input.unsqueeze(0); } + + if (center) { + const auto input_shape = input.sizes(); + const auto input_dim = input_shape.size(); + const auto extra_dims = std::max(size_t{3}, input_dim) - input_dim; + const auto pad_amount = n_fft / 2; + + DimVector extended_shape(extra_dims, 1); + extended_shape.append(input_shape.begin(), input_shape.end()); + input = at::pad(input.view(extended_shape), {pad_amount, pad_amount}, mode); + input = input.view(IntArrayRef(input.sizes()).slice(extra_dims)); + } + int64_t batch = input.size(0); int64_t len = input.size(1); if (n_fft <= 0 || n_fft > len) { @@ -894,6 +912,17 @@ Tensor stft(const Tensor& self, const int64_t n_fft, const optional hop } } +Tensor stft( + const Tensor& self, const int64_t n_fft, const optional hop_lengthOpt, + const optional win_lengthOpt, const c10::optional& window_opt, + const bool normalized, + const optional onesidedOpt, const optional return_complexOpt) { + return at::stft( + self, n_fft, hop_lengthOpt, win_lengthOpt, window_opt, + /*center=*/false, /*mode=*/"constant", normalized, onesidedOpt, + return_complexOpt); +} + // Create complex tensor from the old style of real tensor with size=(..., 2) // This is to support istft in the transition to requiring complex input. 
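The new center handling above pads the signal by n_fft / 2 on each side with the requested pad mode before framing, so every sample, including the first and last, can sit at the center of some frame. A minimal sketch of that padding step, reusing the at::pad call from this hunk with made-up sizes:

#include <ATen/ATen.h>

void stft_center_padding_sketch() {
  const int64_t n_fft = 8;
  at::Tensor signal = at::randn({1, 1, 16});  // (batch, channel, time)

  // Same pad amount and call shape as the center branch above.
  const int64_t pad_amount = n_fft / 2;
  at::Tensor padded = at::pad(signal, {pad_amount, pad_amount}, "reflect");
  // padded.size(-1) == 16 + 2 * (n_fft / 2) == 24; frame t is then centered
  // at sample t * hop_length of the original signal.
  (void)padded;
}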
// NOTE: This may return a view of the input tensor, or might clone if necessary @@ -1087,14 +1116,6 @@ Tensor istft(const Tensor& self, const int64_t n_fft, const optional ho #undef REPR } -Tensor stft(const Tensor& self, const int64_t n_fft, const optional hop_lengthOpt, - const optional win_lengthOpt, const Tensor& window, - const bool normalized, const optional onesidedOpt) { - return at::native::stft( - self, n_fft, hop_lengthOpt, win_lengthOpt, window, normalized, onesidedOpt, - /*return_complex=*/c10::nullopt); -} - Tensor istft(const Tensor& self, const int64_t n_fft, const optional hop_lengthOpt, const optional win_lengthOpt, const Tensor& window, const bool center, const bool normalized, const optional onesidedOpt, diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp index 340bc5a822ad..613f6bb2bd70 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp +++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp @@ -62,6 +62,8 @@ #include #include #include +#include +#include #include #include @@ -74,13 +76,29 @@ namespace at { namespace meta { -native::SCATTER_GATHER_OP get_operator_enum(const c10::string_view reduce) { - if (reduce == "add") { - return native::SCATTER_GATHER_OP::REDUCE_ADD; - } else if (reduce == "multiply") { - return native::SCATTER_GATHER_OP::REDUCE_MULTIPLY; +native::SCATTER_GATHER_OP get_operator_enum(const c10::string_view reduce, bool use_new_options = false) { + if (use_new_options) { + if (reduce == "sum") { + return native::SCATTER_GATHER_OP::REDUCE_ADD; + } else if (reduce == "prod") { + return native::SCATTER_GATHER_OP::REDUCE_MULTIPLY; + } else if (reduce == "mean") { + return native::SCATTER_GATHER_OP::REDUCE_MEAN; + } else if (reduce == "amax") { + return native::SCATTER_GATHER_OP::REDUCE_MAXIMUM; + } else if (reduce == "amin") { + return native::SCATTER_GATHER_OP::REDUCE_MINIMUM; + } else { + TORCH_CHECK(false, "reduce argument must be either sum, prod, mean, amax or amin."); + } } else { - TORCH_CHECK(false, "reduce argument must be either add or multiply."); + if (reduce == "add") { + return native::SCATTER_GATHER_OP::REDUCE_ADD; + } else if (reduce == "multiply") { + return native::SCATTER_GATHER_OP::REDUCE_MULTIPLY; + } else { + TORCH_CHECK(false, "reduce argument must be either add or multiply.") + } } } @@ -113,7 +131,7 @@ TORCH_META_FUNC(gather) at::native::gather_shape_check(self, wrapped_dim, index); } -template +template void scatter_meta_impl( Meta& meta, const Tensor& self, @@ -137,7 +155,7 @@ void scatter_meta_impl( meta.set_output(self.sizes(), self.options()); if (reduce.has_value()) { // Check if we have a valid reduce operator. 
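With use_new_options set, get_operator_enum above accepts the reduction names "sum", "prod", "mean", "amax" and "amin", which feed the scatter_reduce.two overload declared just below. A hedged usage sketch; the exact C++ binding signature is assumed from that schema rather than spelled out in this hunk:

#include <torch/torch.h>

void scatter_reduce_two_sketch() {
  torch::Tensor self  = torch::zeros({3}, torch::kDouble);
  torch::Tensor src   = torch::tensor({1.0, 2.0, 3.0, 4.0}, torch::kDouble);
  torch::Tensor index = torch::tensor({0, 0, 2, 2}, torch::kLong);

  // include_self=false first writes the reduction identity (-inf for "amax")
  // into the scattered slots, so the result is {2, 0, 4}: slots 0 and 2 see
  // only src, and the untouched slot 1 keeps its original 0.
  torch::Tensor out =
      self.scatter_reduce(/*dim=*/0, index, src, "amax", /*include_self=*/false);
  (void)out;
}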
- get_operator_enum(reduce.value()); + get_operator_enum(reduce.value(), use_new_options); } } @@ -174,6 +192,17 @@ TORCH_META_FUNC(scatter_add) scatter_meta_impl(*this, self, dim, index, src, "add"); } +TORCH_META_FUNC2(scatter_reduce, two) +(const Tensor& self, + int64_t dim, + const Tensor& index, + const Tensor& src, + const c10::string_view reduce, + bool include_self) { + (void) include_self; + scatter_meta_impl(*this, self, dim, index, src, reduce); +} + TORCH_PRECOMPUTE_META_FUNC(index_copy) (const Tensor& self, int64_t dim, const Tensor& index, const Tensor& source) { dim = maybe_wrap_dim(dim, self.dim()); @@ -233,28 +262,33 @@ TORCH_PRECOMPUTE_META_FUNC(index_copy) return TORCH_PRECOMPUTE_STRUCT(index_copy)().set_dim(dim); } -TORCH_PRECOMPUTE_META_FUNC(index_add) -(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& source, const Scalar& alpha) { - dim = maybe_wrap_dim(dim, self.dim()); +template +void index_func_meta_impl( + Meta& meta, + const Tensor& self, + int64_t dim, + const Tensor& index, + const Tensor& source, + c10::string_view func) { auto numel = index.numel(); - TORCH_CHECK_INDEX(index.dim() <= 1, "index_add_(): Index is supposed to be a vector, but got dim: ", + TORCH_CHECK_INDEX(index.dim() <= 1, func, "_(): Index is supposed to be a vector, but got dim: ", index.dim(), " with type: ", index.scalar_type(), " and size: ", index.sizes()); TORCH_CHECK(index.scalar_type() == ScalarType::Long || index.scalar_type() == ScalarType::Int, - "index_add_(): Expected dtype int32/int64 for index but got: ", index.scalar_type()); + func, "_(): Expected dtype int32/int64 for index but got: ", index.scalar_type()); TORCH_CHECK(self.scalar_type() == source.scalar_type(), - "index_add_(): self (", self.scalar_type(), ") and source (", source.scalar_type(), + func, "_(): self (", self.scalar_type(), ") and source (", source.scalar_type(), ") must have the same scalar type"); TORCH_CHECK(dim == 0 || dim < source.dim(), - "index_add_(): Indexing dim ", dim, " is out of bounds of the source tensor with dim ", + func, "_(): Indexing dim ", dim, " is out of bounds of the source tensor with dim ", source.dim()); TORCH_CHECK(numel == (source.dim() == 0 ? 
1 : source.size(dim)), - "index_add_(): Number of indices (", numel, ") should be equal to source.size(dim): (", + func, "_(): Number of indices (", numel, ") should be equal to source.size(dim): (", source.size(dim), "), for dim: ", dim); - auto& result = maybe_get_output(0); + auto& result = meta.maybe_get_output(0); bool is_defined = result.defined(); - set_output(self.sizes(), self.options()); + meta.set_output(self.sizes(), self.options()); if (is_defined) { at::assert_no_internal_overlap(result); at::assert_no_overlap(result, index); @@ -269,10 +303,30 @@ TORCH_PRECOMPUTE_META_FUNC(index_add) auto sourceSlice = source.select(dim, 0); auto iter = TensorIterator::borrowing_binary_op(selfSlice, selfSlice, sourceSlice); } +} +TORCH_PRECOMPUTE_META_FUNC(index_add) +(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& source, const Scalar& alpha) { + dim = maybe_wrap_dim(dim, self.dim()); + index_func_meta_impl(*this, self, dim, index, source, "index_add"); return TORCH_PRECOMPUTE_STRUCT(index_add)().set_dim(dim); } +TORCH_PRECOMPUTE_META_FUNC(index_reduce) +(const Tensor& self, + int64_t dim, + const Tensor& index, + const Tensor& source, + const c10::string_view reduce, + bool include_self) { + (void)include_self; + TORCH_CHECK(reduce == "prod" || reduce == "mean" || reduce == "amax" || reduce == "amin", + "index_reduce(): Expected reduce to be one of prod, mean, amax or amin but got ", reduce, "."); + dim = maybe_wrap_dim(dim, self.dim()); + index_func_meta_impl(*this, self, dim, index, source, "index_reduce"); + return TORCH_PRECOMPUTE_STRUCT(index_reduce)().set_dim(dim); +} + } // namespace meta namespace native { @@ -296,6 +350,7 @@ DEFINE_DISPATCH(scatter_fill_stub); DEFINE_DISPATCH(scatter_add_stub); DEFINE_DISPATCH(scatter_reduce_stub); DEFINE_DISPATCH(scatter_scalar_reduce_stub); +DEFINE_DISPATCH(scatter_reduce_two_stub); static bool all_strides_match(TensorList tensors) { TORCH_CHECK(tensors.size() >= 1); @@ -759,6 +814,7 @@ TORCH_IMPL_FUNC(index_copy_out) result_dim_stride); } +// Not calling into index_reduce_func_impl because of a different dtype dispatch TORCH_IMPL_FUNC(index_add_cpu_out) (const Tensor& self, int64_t dim, const Tensor& index, const Tensor& source, const Scalar& alpha, const Tensor& result) { if (!result.is_same(self)) result.copy_(self); @@ -825,6 +881,164 @@ TORCH_IMPL_FUNC(index_add_cpu_out) } } +void index_reduce_func_impl( + const Tensor& self, + int64_t dim, + const Tensor& index, + const Tensor& source, + bool include_self, + const Tensor& result, + const SCATTER_GATHER_OP& op) { + if (!result.is_same(self)) result.copy_(self); + if (!include_self) { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, at::ScalarType::BFloat16, + self.scalar_type(), "index_reduce_func_exclude_input_init", [&] { + scalar_t init_val; + switch (op) { + case SCATTER_GATHER_OP::REDUCE_MULTIPLY: + init_val = (scalar_t)1; + break; + case SCATTER_GATHER_OP::REDUCE_MAXIMUM: + init_val = std::numeric_limits::has_infinity ? -std::numeric_limits::infinity() + : std::numeric_limits::lowest(); + break; + case SCATTER_GATHER_OP::REDUCE_MINIMUM: + init_val = std::numeric_limits::has_infinity ? 
std::numeric_limits::infinity() + : std::numeric_limits::max(); + break; + default: + init_val = (scalar_t)0; + break; + } + // index_fill_ requires index to be a LongTensor + result.index_fill_(dim, index.to(at::ScalarType::Long), init_val); + }); + } + + auto numel = index.numel(); + + auto index_contig = index.contiguous(); + + if (result.dim() > 1) { + // Equivalent to: + // for (const auto i : c10::irange(numel)) { + // auto selfSlice = self.select(dim, index_data[i]); + // auto sourceSlice = source.select(dim, i); + // selfSlice.op_(sourceSlice); + // } + // But much faster as this reuses the iterator from the binary op + if (numel == 0) { + return; + } + auto selfSlice = result.select(dim, 0); + auto sourceSlice = source.select(dim, 0); + auto self_stride_bytes = result.stride(dim) * elementSize(result.scalar_type()); + auto source_stride_bytes = source.stride(dim) * elementSize(source.scalar_type()); + auto self_dim_size = result.size(dim); + auto iter = TensorIterator::borrowing_binary_op(selfSlice, selfSlice, sourceSlice); + + AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_func_cpu_", [&] () { + auto index_data = index_contig.data_ptr(); + for (const auto i : c10::irange(numel)) { + auto self_i = index_data[i]; + TORCH_CHECK_INDEX((self_i >= 0) && (self_i < self_dim_size), "index out of range in self"); + auto self_data = static_cast(selfSlice.data_ptr()) + self_i * self_stride_bytes; + auto source_data = static_cast(sourceSlice.data_ptr()) + i * source_stride_bytes; + iter.unsafe_replace_operand(0, self_data); + iter.unsafe_replace_operand(1, self_data); + iter.unsafe_replace_operand(2, source_data); + + switch (op) { + case SCATTER_GATHER_OP::REDUCE_MULTIPLY : + mul_stub(iter.device_type(), iter); + break; + case SCATTER_GATHER_OP::REDUCE_MINIMUM : + minimum_stub(iter.device_type(), iter); + break; + case SCATTER_GATHER_OP::REDUCE_MAXIMUM : + maximum_stub(iter.device_type(), iter); + break; + default : + add_stub(iter.device_type(), iter, 1); + break; + } + } + }); + + if (op == SCATTER_GATHER_OP::REDUCE_MEAN) { + auto counts = include_self ? at::ones_like(result) : at::zeros_like(result); + counts.index_add_(dim, index, at::ones_like(source)); + counts.masked_fill_(counts == 0, 1); + result.div_(counts); + } + } + else { + TORCH_CHECK(source.dim() <= 1, "source.dim() (", source.dim(), ") must one or zero for given self.dim() (", self.dim(), ")"); + auto counts = include_self ? at::ones_like(result) : at::zeros_like(result); + // explicitly capture all required variables to work around windows build + // TODO: fix this when windows can correctly capture variables in nested lambda + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, + result.scalar_type(), "index_func_", [&result, &source, &dim, &index_contig, &numel, &op, &counts] { + auto result_stride = result.dim() == 0 ? 1 : result.stride(dim); + auto source_stride = source.dim() == 0 ? 1 : source.stride(dim); + auto counts_stride = counts.dim() == 0 ? 1 : counts.stride(dim); + // TODO: Maybe TensorAccessor can be used here? 
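The scalar path that follows folds every source element into result[index[i]] and, for the mean reduction, keeps a per-slot count so the accumulated sums can be divided at the end. A plain-C++ sketch of that loop with illustrative container types (the real kernel below works on raw strided pointers):

#include <algorithm>
#include <cstdint>
#include <vector>

void index_mean_sketch(std::vector<double>& result,
                       const std::vector<int64_t>& index,
                       const std::vector<double>& source,
                       bool include_self) {
  std::vector<double> counts(result.size(), include_self ? 1.0 : 0.0);
  if (!include_self) {
    for (int64_t i : index) {
      result[i] = 0.0;  // drop the original value from the average
    }
  }
  for (size_t i = 0; i < index.size(); ++i) {
    result[index[i]] += source[i];
    counts[index[i]] += 1.0;
  }
  for (size_t j = 0; j < result.size(); ++j) {
    result[j] /= std::max(counts[j], 1.0);  // untouched slots keep their value
  }
}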
+ auto* result_ptr = result.data_ptr(); + auto* source_ptr = source.data_ptr(); + auto counts_ptr = counts.data_ptr(); + AT_DISPATCH_INDEX_TYPES(index_contig.scalar_type(), "index_func_cpu_", + [&index_contig, &numel, &result, &result_ptr, &result_stride, &source_ptr, &source_stride, &op, &counts_ptr, &counts_stride] { + auto index_data = index_contig.data_ptr(); + for (const auto i : c10::irange(numel)) { + auto self_i = index_data[i]; + TORCH_CHECK_INDEX((self_i >= 0) && (self_i < result.numel()), "index out of range in self"); + scalar_t *self_ip = result_ptr + self_i * result_stride; + scalar_t *count_ip; + scalar_t val; + switch (op) { + case SCATTER_GATHER_OP::REDUCE_MEAN : + *self_ip += *(source_ptr + i * source_stride); + count_ip = counts_ptr + self_i * counts_stride; + *count_ip += 1; + break; + case SCATTER_GATHER_OP::REDUCE_MULTIPLY : + *self_ip *= *(source_ptr + i * source_stride); + break; + case SCATTER_GATHER_OP::REDUCE_MINIMUM : + val = *(source_ptr + i * source_stride); + *self_ip = at::_isnan(val) ? val : std::min(*self_ip, val); + break; + case SCATTER_GATHER_OP::REDUCE_MAXIMUM : + val = *(source_ptr + i * source_stride); + *self_ip = at::_isnan(val) ? val : std::max(*self_ip, val); + break; + default: + break; + } + } + }); + }); + if (op == SCATTER_GATHER_OP::REDUCE_MEAN) { + counts.masked_fill_(counts == 0, 1); + result.div_(counts); + } + } +} + +TORCH_IMPL_FUNC(index_reduce_cpu_out) +(const Tensor& self, + int64_t dim, + const Tensor& index, + const Tensor& source, + const c10::string_view reduce, + bool include_input, + const Tensor& result) { + TORCH_WARN_ONCE("index_reduce() is in beta and the API may change at any time."); + auto op = meta::get_operator_enum(reduce, true); + index_reduce_func_impl(self, dim, index, source, include_input, result, op); +} + // Check that indices fall within dimension array size // Avoid redispatch call to min/max template @@ -880,9 +1094,6 @@ Tensor & index_select_out_cpu_dim1_( for (const auto i : c10::irange(N)) { auto idx = idxs[i]; - if (idx < 0) { - idx = idx + src_indexing_axis_dim; - } dst_floats[i] = src_floats[idx]; } } @@ -892,10 +1103,6 @@ Tensor & index_select_out_cpu_dim1_( for (const auto batch : c10::irange(outer_dims_product)) { for (const auto i : c10::irange(N)) { auto idx = idxs[i]; - if (idx < 0) { - idx = idx + src_indexing_axis_dim; - } - auto src = src_base + batch * src_batch_bytesize + idx * block_bytesize; auto dst = out + batch * gathered_batch_bytesize + i * block_bytesize; memcpy(dst, src, block_bytesize); @@ -1071,7 +1278,12 @@ Tensor index_select_quantized_cpu_(const Tensor & self, int64_t dim, const Tenso } Tensor index_select_backward(const Tensor& grad, IntArrayRef self_sizes, int64_t dim, const Tensor& index) { - return at::zeros(self_sizes, grad.options()).index_add_(dim, index, grad); + // for composite compliance, use out-of-place variant of + // `index_add` if index tensor is a Tensor Subclass. 
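One detail of the REDUCE_MEAN handling in this hunk worth spelling out: the accumulated sums are divided by per-slot counts, and zero counts are clamped to one so slots that no index touched are left unchanged. A tiny sketch of that finalization step, with vectors standing in for tensors:

#include <cstdio>
#include <vector>

// Finalize a "mean" reduction: divide accumulated sums by per-slot counts,
// treating a count of zero as one (counts.masked_fill_(counts == 0, 1)).
void finalize_mean(std::vector<double>& sums, const std::vector<long>& counts) {
  for (size_t i = 0; i < sums.size(); ++i) {
    const long n = counts[i] == 0 ? 1 : counts[i];
    sums[i] /= n;
  }
}

int main() {
  // Two sources landed in slot 0, none in slot 1, one in slot 2.
  std::vector<double> sums = {6.0, 5.0, 4.0};
  std::vector<long> counts = {2, 0, 1};
  finalize_mean(sums, counts);
  std::printf("%g %g %g\n", sums[0], sums[1], sums[2]);  // 3 5 4
}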
+ if (isTensorSubclassLike(index)) { + return grad.new_zeros(self_sizes, grad.options()).index_add(dim, index, grad); + } + return grad.new_zeros(self_sizes, grad.options()).index_add_(dim, index, grad); } Tensor & index_fill_(Tensor & self, int64_t dim, const Tensor & index, const Scalar& source) { @@ -1173,10 +1385,49 @@ Tensor gather_backward(const Tensor& grad, const Tensor& self, int64_t dim, cons if (sparse_grad) { return at::_gather_sparse_backward(self, dim, index, grad); } - return grad.new_zeros(self.sizes()).scatter_add_(dim, index, grad); + auto result = grad.new_zeros(self.sizes()); + // for composite compliance, use out-of-place variant of + // `scatter_add` if index tensor is a Tensor Subclass. + if (isTensorSubclassLike(index)) { + return result.scatter_add(dim, index, grad); + } + result.scatter_add_(dim, index, grad); + return result; +} + +static void scatter_reduce_exclude_self_helper( + const Tensor& self, + int64_t dim, + const Tensor& index, + const SCATTER_GATHER_OP& op) { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( + at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, + self.scalar_type(), "scatter_reduce_exclude_input_init", [&] { + scalar_t init_val; + switch (op) { + case SCATTER_GATHER_OP::REDUCE_ADD: + init_val = (scalar_t)0; + break; + case SCATTER_GATHER_OP::REDUCE_MULTIPLY: + init_val = (scalar_t)1; + break; + case SCATTER_GATHER_OP::REDUCE_MAXIMUM: + init_val = std::numeric_limits::has_infinity ? -std::numeric_limits::infinity() + : std::numeric_limits::lowest(); + break; + case SCATTER_GATHER_OP::REDUCE_MINIMUM: + init_val = std::numeric_limits::has_infinity ? std::numeric_limits::infinity() + : std::numeric_limits::max(); + break; + case SCATTER_GATHER_OP::REDUCE_MEAN: + init_val = (scalar_t)0; + break; + } + self.scatter_(dim, index, init_val); + }); } -template +template void scatter_impl( const Tensor& self, int64_t dim, @@ -1185,7 +1436,8 @@ void scatter_impl( const Tensor& out, ReduceStub& reduce_stub, FillStub& fill_stub, - const c10::optional reduce = nullopt) { + const c10::optional reduce = nullopt, + bool reduce_includes_self = true) { dim = at::maybe_wrap_dim(dim, self.dim()); auto mut_out = const_cast(out); @@ -1197,7 +1449,11 @@ void scatter_impl( if (index.numel() == 0) return; if (reduce.has_value()) { - auto op = meta::get_operator_enum(reduce.value()); + auto op = meta::get_operator_enum(reduce.value(), use_new_options); + if (!reduce_includes_self) { + // scatter inits for reduction to appropriate indices (used by scatter_reduce.two) + scatter_reduce_exclude_self_helper(mut_out, dim, index, op); + } reduce_stub(self.device().type(), mut_out, dim, index, src, op); } else { fill_stub(self.device().type(), mut_out, dim, index, src); @@ -1282,113 +1538,35 @@ TORCH_IMPL_FUNC(scatter_add) } } -Tensor scatter_reduce_two_cpu(const Tensor& self, - int64_t dim, - const Tensor& index, - const c10::string_view reduce, - const c10::optional output_size) { - - // TODO: Add documentation. - - - TORCH_CHECK(dim >= -self.dim() && dim < self.dim(), - "Expected `dim` to be in range ", -self.dim(), " to ", self.dim() - 1, " (got ", dim, ")"); - - dim = dim < 0 ? dim + self.dim() : dim; - - auto sizes = self.sizes().vec(); - if (output_size.has_value()) { - sizes[dim] = output_size.value(); - } else { - sizes[dim] = index.numel() > 0 ? 
index.max().item() + 1: 0; - } - Tensor out = at::empty(sizes, self.options()); - - TORCH_CHECK(self.dim() == index.dim(), - "Shape mismatch between `self` (got ", self.sizes(), ") and `index` (got ", index.sizes(), ")"); - for (const auto i : c10::irange(self.dim())) { - TORCH_CHECK(self.size(i) == index.size(i), - "Shape mismatch between `self` (got ", self.sizes(), ") and `index` (got ", index.sizes(), ")"); - } - - TORCH_CHECK(reduce == "sum" || reduce == "prod" || reduce == "mean" || reduce == "amax" || reduce =="amin", - "`reduce` argument must be one of ('sum', 'prod', 'mean', 'amax', 'amin'"); - - if (self.numel() == 0) { - return out.zero_(); - } - - AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, self.scalar_type(), "scatter_reduce", [&] { - if (reduce == "prod") { - out.fill_((scalar_t)1); - } else if (reduce == "amax") { - out.fill_(std::numeric_limits::lowest()); - } else if (reduce == "amin") { - out.fill_(std::numeric_limits::max()); +TORCH_IMPL_FUNC(scatter_reduce_two) +(const Tensor& self, + int64_t dim, + const Tensor& index, + const Tensor& src, + const c10::string_view reduce, + bool include_self, + const Tensor& out) { + // See issue https://github.com/pytorch/pytorch/issues/74770 + TORCH_WARN_ONCE("scatter_reduce() is in beta and the API may change at any time."); + + scatter_impl(self, dim, index, src, out, + scatter_reduce_two_stub, + scatter_stub, + reduce, + include_self); + + if (meta::get_operator_enum(reduce, true) == SCATTER_GATHER_OP::REDUCE_MEAN) { + auto ones = at::ones_like(src); + auto count = include_self ? at::ones_like(out) : at::zeros_like(out); + count.scatter_add_(dim, index, ones); + count.masked_fill_(count == 0, 1); + + if (out.is_floating_point() || out.is_complex()) { + out.div_(count); } else { - out.fill_((scalar_t)0); - } - - - auto self_cont = self.contiguous(); - auto index_cont = index.contiguous(); - auto self_data = self_cont.data_ptr(); - auto index_data = index_cont.data_ptr(); - bool out_is_contiguous = out.is_contiguous(); - auto out_cont = out.contiguous(); - auto out_cont_data = out_cont.data_ptr(); - - auto counts = at::zeros_like(out_cont); - auto counts_data = counts.data_ptr(); - - - int64_t offset1 = 1, offset2 = 1; - for (const auto d : c10::irange(dim)) { - offset1 *= self.size(d); + out.div_(count, "floor"); } - for (int64_t d = dim + 1; d < self.dim(); d++) { - offset2 *= self.size(d); - } - - scalar_t value; - int64_t dim_index; - for (const auto i : c10::irange(offset1)) { - for (const auto j : c10::irange(self.size(dim))) { - for (const auto k : c10::irange(offset2)) { - value = self_data[i * self_cont.stride(dim) * self_cont.size(dim) + j * self_cont.stride(dim) + k]; - dim_index = index_data[i * index_cont.stride(dim) * index_cont.size(dim) + j * index_cont.stride(dim) + k]; - TORCH_CHECK(dim_index >= 0 && dim_index < out.size(dim), - "Expected `index` values to be in range ", 0, " to ", out.size(dim), " (got ", dim_index, ")"); - int64_t ind = i * out_cont.stride(dim) * out_cont.size(dim) + dim_index * out_cont.stride(dim) + k; - if (reduce == "sum") { - out_cont_data[ind] += value; - } else if (reduce == "prod") { - out_cont_data[ind] *= value; - } else if (reduce == "mean") { - auto n = counts_data[ind]; - out_cont_data[ind] = (out_cont_data[ind] * n + value) / (n + 1); - counts_data[ind] += 1; - } else if (reduce == "amax") { - out_cont_data[ind] = std::max(out_cont_data[ind], value); - } else { - out_cont_data[ind] = std::min(out_cont_data[ind], value); - } - } - } - } - - if (reduce == "amin" || reduce == "amax") 
{ - auto val = (reduce == "amin") ? std::numeric_limits::max() : std::numeric_limits::lowest(); - out_cont.masked_fill_(out_cont == val, (scalar_t)0); - } - - if (!out_is_contiguous) { - out.copy_(out_cont); - } - - }); - - return out; + } } Tensor masked_scatter(const Tensor & self, const Tensor & mask, const Tensor & source) { @@ -1566,7 +1744,14 @@ Tensor masked_select_backward(const Tensor& grad, const Tensor& input, const Ten // implicitly handles broadcasting). auto result = at::zeros_like( input.expand(at::infer_size(input.sizes(), mask.sizes())), at::MemoryFormat::Preserve); - return result.masked_scatter_(mask, grad); + + // for composite compliance, use out-of-place variant + // of `masked_scatter`. + if (areAnyTensorSubclassLike({grad, mask})) { + return result.masked_scatter(mask, grad); + } + result.masked_scatter_(mask, grad); + return result; } namespace { diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.h b/aten/src/ATen/native/TensorAdvancedIndexing.h index 56012881ac68..a0c282d550e4 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.h +++ b/aten/src/ATen/native/TensorAdvancedIndexing.h @@ -2,16 +2,17 @@ // Indexing tensors by tensors -#include +#include +#include #include namespace at { - struct TensorIterator; +struct TensorIterator; } namespace at { namespace native { -enum class SCATTER_GATHER_OP: uint8_t {REDUCE_ADD, REDUCE_MULTIPLY}; +enum class SCATTER_GATHER_OP: uint8_t {REDUCE_ADD, REDUCE_MULTIPLY, REDUCE_MAXIMUM, REDUCE_MINIMUM, REDUCE_MEAN}; using index_put_with_sort_fn = void(*)(Tensor &, const c10::List> &, const Tensor &, bool accumulate, bool unsafe); @@ -23,6 +24,8 @@ using scatter_reduce_fn = void(*)(const Tensor& self, const int64_t dim, const T const Tensor& src, const SCATTER_GATHER_OP& reduce); using scatter_scalar_reduce_fn = void(*)(const Tensor& self, const int64_t dim, const Tensor& index, const Scalar& value, const SCATTER_GATHER_OP& reduce); +using scatter_reduce_two_fn = void(*)(const Tensor& self, const int64_t dim, const Tensor& index, + const Tensor& src, const SCATTER_GATHER_OP& reduce); DECLARE_DISPATCH(index_put_with_sort_fn, index_put_with_sort_stub); @@ -32,6 +35,7 @@ DECLARE_DISPATCH(scatter_fill_fn, scatter_fill_stub); DECLARE_DISPATCH(scatter_add_fn, scatter_add_stub); DECLARE_DISPATCH(scatter_reduce_fn, scatter_reduce_stub); DECLARE_DISPATCH(scatter_scalar_reduce_fn, scatter_scalar_reduce_stub); +DECLARE_DISPATCH(scatter_reduce_two_fn, scatter_reduce_two_stub); TORCH_API Tensor& index_out(Tensor& result, const Tensor & self, const c10::List>& indices); diff --git a/aten/src/ATen/native/TensorCompare.cpp b/aten/src/ATen/native/TensorCompare.cpp index 13283d244d67..2e723fdae538 100644 --- a/aten/src/ATen/native/TensorCompare.cpp +++ b/aten/src/ATen/native/TensorCompare.cpp @@ -5,10 +5,13 @@ #include #include #include +#include #include #include +#include #include #include +#include namespace at { namespace meta { @@ -29,12 +32,117 @@ const OptionalScalarRef max) { if (!min && !max) { TORCH_CHECK(false, "torch.clamp: At least one of 'min' or 'max' must not be None"); } + //Manual type promotion, since scalars have to participate in it + ScalarType result_type = self.scalar_type(); + TORCH_CHECK(!isComplexType(result_type), "clamp is not supported for complex types"); + //Floating is the highest supported + if (!isFloatingType(result_type)) { + at::native::ResultTypeState state = {}; + state = at::native::update_result_type_state(self, state); + + if (min) { + state = at::native::update_result_type_state(min.get(), state); 
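The promotion rule the clamp meta function enforces can be illustrated with a toy three-level dtype lattice (the enum, promote helper, and exception below are stand-ins, not ATen's promotion machinery): scalar bounds may promote the result type of a fresh output, but an in-place clamp whose output aliases self must not be silently upcast.

#include <stdexcept>

// Toy dtype lattice for illustration: Long < Float < Double.
enum class DType { Long, Float, Double };

DType promote(DType a, DType b) { return a > b ? a : b; }

// Mirrors the guard above: promotion is fine for a fresh output, but an
// in-place clamp (output is the same tensor as self) may not be cast up.
DType clamp_result_dtype(DType self, DType scalar_bound, bool out_is_self) {
  const DType result = promote(self, scalar_bound);
  if (out_is_self && result != self) {
    throw std::runtime_error("result type can't be cast to the desired output type");
  }
  return result;
}

int main() {
  // An integer tensor clamped with a float bound promotes out-of-place...
  clamp_result_dtype(DType::Long, DType::Float, /*out_is_self=*/false);
  // ...but the in-place variant refuses the silent upcast.
  try {
    clamp_result_dtype(DType::Long, DType::Float, /*out_is_self=*/true);
  } catch (const std::runtime_error&) { /* expected */ }
}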
+ } + if (max) { + state = at::native::update_result_type_state(max.get(), state); + } + result_type = at::native::result_type(state); + //disallow type promoting inplace op + TORCH_CHECK((result_type == self.scalar_type()) || + (!(maybe_get_output().defined()) || !(maybe_get_output().is_same(self))), + "result type ", result_type, " can't be cast to the desired output type ", + self.dtype()); + } + //make sure scalars weren't complex + TORCH_CHECK(!isComplexType(result_type), "clamp is not supported for complex types"); + build_unary_op(maybe_get_output(), self.to(result_type)); +} + +TORCH_META_FUNC2(clamp, Tensor) ( +const Tensor& self, +const OptionalTensorRef min, +const OptionalTensorRef max) { + TORCH_CHECK(min || max, "torch.clamp: At least one of 'min' or 'max' must not be None"); + TORCH_CHECK(!isComplexType(self.scalar_type()), "clamp is not supported for complex types"); + #define CLAMP_CONFIG() \ + TensorIteratorConfig() \ + .set_check_mem_overlap(true) \ + .add_output(maybe_get_output()) \ + .add_input(self) \ + .promote_inputs_to_common_dtype(true) \ + .cast_common_dtype_to_outputs(true) \ + .enforce_safe_casting_to_output(true) + + if (min && max) { + build(CLAMP_CONFIG().add_input(*min).add_input(*max)); + } else if (min) { + build(CLAMP_CONFIG().add_input(*min)); + } else if (max) { + build(CLAMP_CONFIG().add_input(*max)); + } +} + + +TORCH_META_FUNC(clamp_max) ( + const Tensor& self, + const Scalar& max +) { + //we could wrap max into tensor and send to tensor overload, + //but relu is implemented via clamp_min, so for perf an uniformity reasons + //do a faster but correct thing + ScalarType result_type = self.scalar_type(); + TORCH_CHECK(!isComplexType(result_type), "clamp is not supported for complex types"); + TORCH_CHECK(!max.isComplex(), "clamp is not supported for complex types"); + //Floating is the highest supported + if (!isFloatingType(result_type)) { + auto result_type = at::native::result_type(self, max); + TORCH_CHECK((result_type == self.scalar_type()) || + (!(maybe_get_output().defined()) || !(maybe_get_output().is_same(self))), + "result type ", result_type, " can't be cast to the desired output type ", + self.dtype()); + build_unary_op(maybe_get_output(), self.to(result_type)); + } else { + build_borrowing_unary_op(maybe_get_output(), self); + } +} + +TORCH_META_FUNC2(clamp_max, Tensor) ( + const Tensor& self, + const Tensor& max +) { + build_borrowing_binary_op(maybe_get_output(), self, max); +} + + +TORCH_META_FUNC(clamp_min) ( + const Tensor& self, + const Scalar& min +) { + ScalarType result_type = self.scalar_type(); + TORCH_CHECK(!isComplexType(result_type), "clamp is not supported for complex types"); + TORCH_CHECK(!min.isComplex(), "clamp is not supported for complex types"); + //Floating is the highest supported + if (!isFloatingType(result_type)) { + auto result_type = at::native::result_type(self, min); + TORCH_CHECK((result_type == self.scalar_type() || + !(maybe_get_output().defined()) || !(maybe_get_output().is_same(self))), + "result type ", result_type, " can't be cast to the desired output type ", + self.dtype()); + build_unary_op(maybe_get_output(), self.to(result_type)); + } else { + build_borrowing_unary_op(maybe_get_output(), self); + } +} - build_borrowing_unary_op(maybe_get_output(), self); +TORCH_META_FUNC2(clamp_min, Tensor) ( + const Tensor& self, + const Tensor& min +) { + build_borrowing_binary_op(maybe_get_output(), self, min); } TORCH_META_FUNC2(isin, Tensor_Tensor) ( - const Tensor& elements, const Tensor& test_elements, 
bool assume_unique, bool invert + const Tensor& elements, const Tensor& test_elements, bool /*assume_unique*/, bool /*invert*/ ) { check_for_unsupported_isin_dtype(elements.scalar_type()); check_for_unsupported_isin_dtype(test_elements.scalar_type()); @@ -42,7 +150,7 @@ TORCH_META_FUNC2(isin, Tensor_Tensor) ( } TORCH_META_FUNC2(isin, Tensor_Scalar) ( - const Tensor& elements, const c10::Scalar& test_elements, bool assume_unique, bool invert + const Tensor& elements, const c10::Scalar& test_elements, bool /*assume_unique*/, bool /*invert*/ ) { check_for_unsupported_isin_dtype(elements.scalar_type()); check_for_unsupported_isin_dtype(test_elements.type()); @@ -50,7 +158,7 @@ TORCH_META_FUNC2(isin, Tensor_Scalar) ( } TORCH_META_FUNC2(isin, Scalar_Tensor) ( - const c10::Scalar& elements, const Tensor& test_elements, bool assume_unique, bool invert + const c10::Scalar& elements, const Tensor& test_elements, bool /*assume_unique*/, bool /*invert*/ ) { check_for_unsupported_isin_dtype(elements.type()); check_for_unsupported_isin_dtype(test_elements.scalar_type()); @@ -105,8 +213,6 @@ DEFINE_DISPATCH(isposinf_stub); // NOLINT(cppcoreguidelines-avoid-non-const-glob DEFINE_DISPATCH(isneginf_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(mode_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(clamp_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) -DEFINE_DISPATCH(clamp_min_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) -DEFINE_DISPATCH(clamp_max_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(clamp_scalar_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(clamp_min_scalar_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(clamp_max_scalar_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) @@ -220,7 +326,7 @@ Tensor isfinite(const Tensor& self) { // Note: a complex value is finite iff both parts are finite if (self.is_complex()) { - return at::isfinite(self.abs()); + return at::isfinite(at::real(self)).__iand__(at::isfinite(at::imag(self))); } return AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, self.scalar_type(), "isfinite", [&]() { @@ -232,47 +338,6 @@ void _assert_async_cpu(const Tensor& self) { TORCH_CHECK(native::is_nonzero(self), "Expected Tensor with single nonzero value, but got zero"); } -namespace { - -// DO NOT USE THIS -- it's just an implementation detail of wrapped_scalar tensor below. -at::Tensor scalar_to_tensor_default_dtype( - const Scalar& s, - const Device device = at::kCPU) { - if (s.isFloatingPoint()) { - return at::scalar_tensor( - s, at::device(device).dtype(at::get_default_dtype())); - } else if (s.isBoolean()) { - return at::scalar_tensor(s, at::device(device).dtype(at::kBool)); - } else if (s.isComplex()) { - return at::scalar_tensor( - s, at::device(device).dtype(at::get_default_complex_dtype())); - } else { - TORCH_INTERNAL_ASSERT(s.isIntegral(false)); - return at::scalar_tensor(s, at::device(device).dtype(at::kLong)); - } -} - -// TLDR: Don't call `wrapped_scalar_tensor_default_dtype` -- this function is only necessary to support the partial -// type-promotion that torch.where supports. Once torch.where fully supports type promotion, we -// won't need this function. 
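On the isfinite change in this hunk: a complex value is treated as finite exactly when both its real and imaginary parts are finite, so no abs() of the value is involved. A self-contained check of that rule using std::complex:

#include <cmath>
#include <complex>
#include <cstdio>
#include <limits>

// A complex number is finite iff both components are finite; a NaN or Inf in
// either part makes it non-finite.
bool isfinite_complex(std::complex<double> z) {
  return std::isfinite(z.real()) && std::isfinite(z.imag());
}

int main() {
  const double inf = std::numeric_limits<double>::infinity();
  const double nan = std::numeric_limits<double>::quiet_NaN();
  std::printf("%d %d %d\n",
              isfinite_complex({1.0, 2.0}),   // 1
              isfinite_complex({1.0, inf}),   // 0
              isfinite_complex({nan, 2.0}));  // 0
}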
-// -// Longer explanation: -// `wrapped_scalar_tensor_default_dtype` is a bit of a hack because torch.where doesn't support type promotion, but -// does support `torch.where(tensor, scalar1, scalar2)` with default scalar types. The trickiness is we -// usually convert double scalars to doubles, and `set_wrapped_number` defines type promotion priority -// as being below tensor types rather than as the default dtype (perhaps we should?). This wouldn't matter -// if we just supported type normal type promotion on torch.where, however. -Tensor wrapped_scalar_tensor_default_dtype( - const Scalar& scalar, - Device device) { - at::Tensor tensor; - tensor = scalar_to_tensor_default_dtype(scalar, device); - tensor.unsafeGetTensorImpl()->set_wrapped_number(true); - return tensor; -} - -} // anonymous namespace - // Sorting-based algorithm for isin(); used when the number of test elements is large. static void isin_sorting( const Tensor& elements, @@ -295,7 +360,7 @@ static void isin_sorting( // 2. Stable sort all elements, maintaining order indices to reverse the // operation. Stable sort is necessary to keep elements before test // elements within the sorted list. - Tensor all_elements = at::_cat({elements_flat, test_elements_flat}); + Tensor all_elements = at::cat({elements_flat, test_elements_flat}); Tensor sorted_elements, sorted_order; std::tie (sorted_elements, sorted_order) = all_elements.sort( /*stable=*/ true, /*dim=*/ 0, /*descending=*/ false); @@ -323,35 +388,58 @@ static void isin_sorting( } } -Tensor where(const Tensor& condition, const Tensor& self, const Tensor& other) { - TORCH_CHECK(condition.device() == self.device() && self.device() == other.device(), - "Expected condition, x and y to be on the same device, but condition is on ", - condition.device(), " and x and y are on ", self.device(), " and ", other.device(), - " respectively"); - +Tensor& where_self_out(const Tensor& condition, const Tensor& self, const Tensor& other, Tensor& out) { + Tensor self_, other_; + if (self.dtype() != other.dtype()) { + auto result_type = at::native::result_type(self, other); + self_ = self.to(result_type); + other_ = other.to(result_type); + } else { + self_ = self; + other_ = other; + } if (condition.scalar_type() == ScalarType::Byte) { TORCH_WARN_ONCE("where received a uint8 condition tensor. This behavior is deprecated and will be removed in a future version of PyTorch. Use a boolean condition instead."); -} else { + } else { TORCH_CHECK(condition.scalar_type() == ScalarType::Bool, "where expected condition to be a boolean tensor, but got a tensor with dtype ", condition.scalar_type()); + } + Tensor cond_bool = condition.scalar_type() == ScalarType::Byte ? 
condition.to(ScalarType::Bool) : condition; + auto iter = at::TensorIteratorConfig() + .check_all_same_dtype(false) + .add_output(out) + .add_input(cond_bool) + .add_input(self_) + .add_input(other_) + .build(); + where_kernel(iter.device_type(), iter); + return out; } - c10::MaybeOwned b_condition, b_self, b_other; - std::tie(b_condition, b_self, b_other) = expand_outplace(condition, self, other, "where"); - return at::_s_where(*b_condition, *b_self, *b_other); +Tensor where(const Tensor& condition, const Tensor& self, const Tensor& other) { + auto result_type = at::native::result_type(self, other); + Tensor ret = at::empty({0}, self.options().dtype(result_type)); + at::native::where_self_out(condition, self, other, ret); + return ret; } Tensor where(const Tensor& condition, const Scalar& self, const Tensor& other) { - return at::where(condition, wrapped_scalar_tensor(self, other.device()), other); + auto result_type = at::native::result_type(other, self); + auto self_converted = at::scalar_tensor(self, other.options().dtype(result_type)); + auto other_converted = other.to(result_type); + return at::where(condition, self_converted, other_converted); } Tensor where(const Tensor& condition, const Tensor& self, const Scalar& other) { - return at::where(condition, self, wrapped_scalar_tensor(other, self.device())); + auto result_type = at::native::result_type(self, other); + auto other_converted = at::scalar_tensor(other, self.options().dtype(result_type)); + auto self_converted = self.to(result_type); + return at::where(condition, self_converted, other_converted); } Tensor where(const Tensor& condition, const Scalar& self, const Scalar& other) { - const auto device = condition.device(); - const Tensor& other_t = wrapped_scalar_tensor_default_dtype(other, device); - const Tensor& self_t = wrapped_scalar_tensor_default_dtype(self, device); + auto result_type = at::native::result_type(self, other); + const Tensor& other_t = at::scalar_tensor(other, condition.options().dtype(result_type)); + const Tensor& self_t = at::scalar_tensor(self, condition.options().dtype(result_type)); return at::where(condition, self_t, other_t); } @@ -359,22 +447,6 @@ std::vector where(const Tensor& condition) { return condition.nonzero_numpy(); } -Tensor _s_where(const Tensor& condition, const Tensor& self, const Tensor& other) { - TORCH_CHECK(self.dtype() == other.dtype(), "expected scalar type ", self.dtype(), " but found ", other.dtype()); - Tensor ret = at::empty(self.sizes(), self.options()); - // - Tensor cond_bool = condition.scalar_type() == ScalarType::Byte ? 
condition.to(ScalarType::Bool) : condition; - auto iter = at::TensorIteratorConfig() - .check_all_same_dtype(false) - .add_output(ret) - .add_input(cond_bool) - .add_input(self) - .add_input(other) - .build(); - where_kernel(iter.device_type(), iter); - return ret; -} - std::tuple mode(const Tensor& self, int64_t dim, bool keepdim) { Tensor values = at::empty({0}, self.options()); Tensor indices = at::empty({0}, self.options().dtype(kLong)); @@ -485,13 +557,18 @@ std::tuple _aminmax(const Tensor& self, int64_t dim, bool keepdi TORCH_IMPL_FUNC(clamp_out) ( - const Tensor& self, + const Tensor& /*self*/, const OptionalScalarRef min, const OptionalScalarRef max, const Tensor& result) { using at::native::detail::ClampLimits; if (min && max) { - clamp_scalar_stub(device_type(), *this, min.get(), max.get()); + if (min.get().toDouble() != min.get().toDouble() || + max.get().toDouble() != max.get().toDouble()) { + at::fill_(const_cast(result), std::numeric_limits::quiet_NaN()); + } else { + clamp_scalar_stub(device_type(), *this, min.get(), max.get()); + } } else if (max) { clamp_max_scalar_stub(device_type(), *this, max.get()); } else if (min) { @@ -499,112 +576,47 @@ TORCH_IMPL_FUNC(clamp_out) } } -Tensor& clamp_out(const Tensor& self, const c10::optional& min, - const c10::optional& max, Tensor& result) { +TORCH_IMPL_FUNC(clamp_Tensor_out) +(const Tensor& self, const OptionalTensorRef min, + const OptionalTensorRef max, const Tensor&) { if (min && max) { - TORCH_CHECK(self.layout() == Layout::Strided, - "torch.clamp only supports strided layout, got: ", self.layout()); - auto iter = TensorIteratorConfig() - .set_check_mem_overlap(true) - .add_output(result) - .add_input(self) - .add_input(*min) - .add_input(*max) - .promote_inputs_to_common_dtype(true) - .cast_common_dtype_to_outputs(true) - .enforce_safe_casting_to_output(true) - .build(); - clamp_stub(iter.device_type(), iter); - } else if (max) { - at::clamp_max_outf(self, *max, result); + clamp_stub(device_type(), *this); } else if (min) { - at::clamp_min_outf(self, *min, result); - } else { - TORCH_CHECK(false, "torch.clamp: At least one of 'min' or 'max' must not be None"); + maximum_stub(device_type(), *this); + } else if (max) { + minimum_stub(device_type(), *this); } - return result; -} - -Tensor clamp(const Tensor& self, const c10::optional& min, const c10::optional& max) { - Tensor result = at::empty({0}, self.options()); - return at::clamp_outf(self, min, max, result); } -Tensor clamp(const Tensor& self, const c10::optional& min, const c10::optional& max) { - Tensor result = at::empty({0}, self.options()); - return at::clamp_outf(self, min, max, result); -} - -Tensor& clamp_(Tensor& self, const c10::optional& min, const c10::optional& max) { - return at::clamp_outf(self, min, max, self); -} - -Tensor& clamp_(Tensor& self, const c10::optional& min, const c10::optional& max) { - return at::clamp_outf(self, min, max, self); -} - -Tensor& clamp_max_out(const Tensor& self, const Scalar& max, Tensor& result) { - auto iter = TensorIterator::unary_op(result, self); - clamp_max_scalar_stub(iter.device_type(), iter, max); - return result; -} - -Tensor& clamp_max_out(const Tensor& self, const Tensor& max, Tensor& result) { - TORCH_CHECK(self.layout() == Layout::Strided, - "torch.clamp only supports strided layout, got: ", self.layout()); - auto iter = TensorIterator::borrowing_binary_op(result, self, max); - clamp_max_stub(iter.device_type(), iter); - return result; -} - -Tensor clamp_max(const Tensor& self, const Scalar& max) { - Tensor 
result = at::empty({0}, self.options()); - return at::clamp_max_outf(self, max, result); -} - -Tensor clamp_max(const Tensor& self, const Tensor& max) { - Tensor result = at::empty({0}, self.options()); - return at::clamp_max_outf(self, max, result); -} - -Tensor& clamp_max_(Tensor& self, const Scalar& max) { - return at::clamp_max_outf(self, max, self); -} - -Tensor& clamp_max_(Tensor& self, const Tensor& max) { - return at::clamp_max_outf(self, max, self); -} - -Tensor& clamp_min_out(const Tensor& self, const Scalar& min, Tensor& result) { - auto iter = TensorIterator::unary_op(result, self); - clamp_min_scalar_stub(iter.device_type(), iter, min); - return result; -} - -Tensor& clamp_min_out(const Tensor& self, const Tensor& min, Tensor& result) { - TORCH_CHECK(self.layout() == Layout::Strided, - "torch.clamp only supports strided layout, got: ", self.layout()); - auto iter = TensorIterator::borrowing_binary_op(result, self, min); - clamp_min_stub(iter.device_type(), iter); - return result; -} - -Tensor clamp_min(const Tensor& self, const Scalar& min) { - Tensor result = at::empty({0}, self.options()); - return at::clamp_min_outf(self, min, result); +TORCH_IMPL_FUNC(clamp_max_out) +(const Tensor& self, const Scalar& max, const Tensor& result) { + if (max.toDouble() != max.toDouble()) { +//TODO this is not great, building TI again is expensive, but I can't use +//fill_stub because fill is not structured +//this is a corner case anyway + at::fill_(const_cast(result), wrapped_scalar_tensor(max)); + } else { + clamp_max_scalar_stub(device_type(), *this, max); + } } -Tensor clamp_min(const Tensor& self, const Tensor& min) { - Tensor result = at::empty({0}, self.options()); - return at::clamp_min_outf(self, min, result); +TORCH_IMPL_FUNC(clamp_max_Tensor_out) +(const Tensor& self, const Tensor& max, const Tensor& result) { + minimum_stub(device_type(), *this); } -Tensor& clamp_min_(Tensor& self, const Scalar& min) { - return at::clamp_min_outf(self, min, self); +TORCH_IMPL_FUNC(clamp_min_out) +(const Tensor& self, const Scalar& min, const Tensor& result) { + if (min.toDouble() != min.toDouble()) { + at::fill_(const_cast(result), min); + } else { + clamp_min_scalar_stub(device_type(), *this, min); + } } -Tensor& clamp_min_(Tensor& self, const Tensor& min) { - return at::clamp_min_outf(self, min, self); +TORCH_IMPL_FUNC(clamp_min_Tensor_out) +(const Tensor& self, const Tensor& min, const Tensor& result) { + maximum_stub(device_type(), *this); } // Implements the "clip" alias for clamp @@ -646,13 +658,13 @@ std::tuple max(const Tensor& self, Dimname dim, bool keepdim) { std::tuple max_out(const Tensor& self, Dimname dim, bool keepdim, Tensor& max, Tensor& max_indices) { return at::max_out(max, max_indices, self, dimname_to_position(self, dim), keepdim); } -Tensor argmax(const Tensor& self, Dimname dim, bool keepdim) { +Tensor argmax(const Tensor& /*self*/, Dimname /*dim*/, bool /*keepdim*/) { reportNYIDimnameOverload("argmax"); } -Tensor argmin(const Tensor& self, Dimname dim, bool keepdim) { +Tensor argmin(const Tensor& /*self*/, Dimname /*dim*/, bool /*keepdim*/) { reportNYIDimnameOverload("argmin"); } -Tensor argsort(const Tensor& self, Dimname dim, bool keepdim) { +Tensor argsort(const Tensor& /*self*/, Dimname /*dim*/, bool /*keepdim*/) { reportNYIDimnameOverload("argsort"); } std::tuple mode(const Tensor& self, Dimname dim, bool keepdim) { diff --git a/aten/src/ATen/native/TensorCompare.h b/aten/src/ATen/native/TensorCompare.h index e81f96b0e24a..f35cd68d4806 100644 --- 
a/aten/src/ATen/native/TensorCompare.h +++ b/aten/src/ATen/native/TensorCompare.h @@ -32,10 +32,8 @@ DECLARE_DISPATCH(is_infinity_op_fn, isneginf_stub); using mode_fn = void (*)(Tensor&, Tensor&, const Tensor&, int64_t, bool); DECLARE_DISPATCH(mode_fn, mode_stub); -using clamp_fn = void (*)(TensorIterator &); -DECLARE_DISPATCH(clamp_fn, clamp_stub); -DECLARE_DISPATCH(clamp_fn, clamp_min_stub); -DECLARE_DISPATCH(clamp_fn, clamp_max_stub); +using clamp_tensor_fn = void (*)(TensorIteratorBase &); +DECLARE_DISPATCH(clamp_tensor_fn, clamp_stub); namespace detail { enum class ClampLimits {Min, Max, MinMax}; diff --git a/aten/src/ATen/native/TensorConversions.cpp b/aten/src/ATen/native/TensorConversions.cpp index 71690c4bf2d1..05691d2998df 100644 --- a/aten/src/ATen/native/TensorConversions.cpp +++ b/aten/src/ATen/native/TensorConversions.cpp @@ -2,8 +2,10 @@ #include #include #include +#include #include +#include namespace at { namespace native { @@ -51,34 +53,99 @@ Tensor _to_copy( // memory_format is handled separately due to MemoryFormat::Preserve logic options = self.options().merge_in(options).memory_format(c10::nullopt); auto memory_format = optional_memory_format.value_or(MemoryFormat::Preserve); + // TODO: Use the dispatcher for this. + // Currently there are unenumerated extensibility issues preventing this. + if (self.is_sparse_csr()) { + TORCH_CHECK( + memory_format == MemoryFormat::Preserve, + "sparse_csr only supports memory format Preserve, but got ", + memory_format, + " instead."); + + auto new_values = at::native::to( + self.values(), + dtype, + c10::kStrided, // values are strided + device, + pin_memory, + non_blocking, + true, // force copy since we're in _to_copy + memory_format); + + auto new_crow_indices = at::native::to( + self.crow_indices(), + self.crow_indices().scalar_type(), // indices are integral + c10::kStrided, // indices are strided + device, + pin_memory, + non_blocking, + true, // force copy since we're in _to_copy + memory_format); + + auto new_col_indices = at::native::to( + self.col_indices(), + self.col_indices().scalar_type(), // indices are integral + c10::kStrided, // indices are strided + device, + pin_memory, + non_blocking, + true, // force copy since we're in _to_copy + memory_format); + + return at::native::_sparse_csr_tensor_unsafe( + new_crow_indices, + new_col_indices, + new_values, + self.sizes(), + new_values.scalar_type(), + self.layout(), + new_values.device()); + } bool pin_out = (non_blocking && self.is_cuda() && options.device().is_cpu() && (options.layout() == c10::kStrided)); if (memory_format == MemoryFormat::Preserve) { - if (self.is_non_overlapping_and_dense() && options.device().supports_as_strided()) { - Tensor r; - if (self.is_quantized()) { - r = at::empty_quantized(self.sizes(), self, options); - at::QuantizerPtr quantizer = r.quantizer(); - r.copy_(self, non_blocking); - set_quantizer_(r, quantizer); + if (options.device().supports_as_strided()) { + if (self.is_non_overlapping_and_dense()) { + Tensor r; + if (self.is_quantized()) { + r = at::empty_quantized(self.sizes(), self, options); + at::QuantizerPtr quantizer = r.quantizer(); + r.copy_(self, non_blocking); + set_quantizer_(r, quantizer); + } else { + r = at::empty_strided( + self.sizes(), + self.strides(), + options.pinned_memory(pin_out)); + r.copy_(self, non_blocking); + } + return r; + } else if (!self.is_quantized() && self.layout() == kStrided) { + Tensor r; + auto strides = infer_dense_strides(self.sizes(), self.strides()); + r = at::empty_strided( + self.sizes(), + 
strides, + options.pinned_memory(pin_out)); + r.copy_(self, non_blocking); + return r; } else { - r = at::empty_strided( - self.sizes(), - self.strides(), - options.pinned_memory(pin_out)); - r.copy_(self, non_blocking); + memory_format = self.suggest_memory_format(); } - return r; } else { memory_format = self.suggest_memory_format(); } } // See Note [Explicit nullopt MemoryFormat argument] - auto r = at::empty(self.sizes(), - options.memory_format(memory_format).pinned_memory(pin_out), - c10::nullopt); + // TODO: empty_quantized does not work here. It raises an exception in CheckMemoryFormat.h prior to + // empty_affine_quantizd/_empty_per_channel_affine_quantized calls + // at::empty also does not work here because there is no proper at::empty support for quantized tensors + // as it would return a quantized tensor with an UnknownQuantizer + auto r = self.is_quantized() ? at::empty_like(self, memory_format) + : at::empty(self.sizes(), + options.memory_format(memory_format).pinned_memory(pin_out), c10::nullopt); r.copy_(self, non_blocking); return r; } @@ -240,11 +307,14 @@ Tensor to_dense_backward(const Tensor& grad, const Tensor& input_) { if (input_.layout() == c10::kSparse) { auto input = input_.coalesce(); return grad.sparse_mask(input); - } else if (input_.layout() == c10::kMkldnn) { + } + if (input_.layout() == c10::kMkldnn) { return grad.to_mkldnn(input_.scalar_type()); - } else { - AT_ERROR("Unsupported input layout: ", input_.layout()); } + if (input_.layout() == c10::kStrided) { + return grad.to_dense(); + } + AT_ERROR("to_dense_backward: Unsupported input layout: ", input_.layout()); } Tensor to_mkldnn_backward(const Tensor& grad, const Tensor& input_) { @@ -252,6 +322,44 @@ Tensor to_mkldnn_backward(const Tensor& grad, const Tensor& input_) { return grad.to_dense(input_.scalar_type()); } +Tensor to_dense(const Tensor& tensor, c10::optional dtype) { + if (tensor.layout() == c10::kSparse) { + return tensor._to_dense(dtype); + } + if (tensor.layout() == c10::kSparseCsr || tensor.layout() == c10::kSparseCsc) { + return tensor._to_dense(dtype); + } + if (tensor.layout() == c10::kMkldnn) { + return tensor._to_dense(dtype); + } + TORCH_CHECK(tensor.layout() == c10::kStrided, "to_dense does not support layout ", tensor.layout()); + if (dtype) { + return tensor.to(*dtype); + } + return tensor; +} + +Tensor sparse_to_dense( + const Tensor& self, + c10::optional dtype) { + TORCH_CHECK( + !dtype.has_value(), "dtype argument is not supported by sparse_to_dense"); + Tensor dst = at::zeros(self.sizes(), self.options().layout(kStrided)); + return dst.add_(self); +} + +Tensor sparse_compressed_to_dense( + const Tensor& self, + c10::optional dtype) { + TORCH_CHECK( + !dtype.has_value(), "dtype argument is not supported by sparse_csr_to_dense"); + if (self.layout() == kSparseCsr) { + Tensor dst = at::zeros(self.sizes(), self.options().layout(kStrided)); + return dst.add_(self); + } + return self.to_sparse().to_dense(); +} + // Computes the strides for view_dtype output when the view dtype is // smaller than the original dtype inline DimVector compute_strides_for_view_dtype_downsize(IntArrayRef old_strides, int64_t size_ratio, ScalarType old_dtype, ScalarType new_dtype) { @@ -371,4 +479,502 @@ Tensor view_dtype(const Tensor& self, ScalarType dtype) { return new_tensor; } +// Sparse layout conversions Start + +Tensor dense_to_sparse_csr(const Tensor& self) { + return self.to_sparse().to_sparse_csr(); +} + +Tensor dense_to_sparse_csc(const Tensor& self) { + return 
self.to_sparse().to_sparse_csc(); +} + +Tensor dense_to_sparse_bsr(const Tensor& self, IntArrayRef blocksize) { + AT_ERROR( + "Conversion from ", self.layout(), " to SparseBsr is currently not supported."); + return self; +} + +Tensor dense_to_sparse_bsc(const Tensor& self, IntArrayRef blocksize) { + AT_ERROR( + "Conversion from ", self.layout(), " to SparseBsc is currently not supported."); + return self; +} + +Tensor sparse_compressed_to_sparse_csr(const Tensor& self) { + if (self.layout() == kSparseCsc) { + TORCH_CHECK( + self.dim() == 2, + "Expected self to be of dimension 2, but got ", + self.dim(), + "."); + auto sizes = self.sizes(); + auto ccol_indices = self.ccol_indices(); + auto row_indices = self.row_indices(); + auto values = self.values(); + + // convert CSC indices to COO indices and swap its rows + const bool out_int32 = ccol_indices.scalar_type() == ScalarType::Int; + Tensor indices_transposed = _convert_indices_from_csr_to_coo( + ccol_indices, row_indices, out_int32, true); + + // sort transposed indices + auto indices_scalar = + at::sparse::flatten_indices(indices_transposed, {sizes[0], sizes[1]}); + auto indicesPermutation = std::get<1>(indices_scalar.sort(0)); + auto indices_transposed_sorted = + indices_transposed.index_select(1, indicesPermutation); + + // construct a CSR tensor + auto new_row_indices = indices_transposed_sorted.select(0, 0); + auto new_col_indices = indices_transposed_sorted.select(0, 1); + auto new_values = values.index_select(0, indicesPermutation); + Tensor new_crow_indices = + _convert_indices_from_coo_to_csr(new_row_indices, sizes[0], out_int32); + + return _sparse_csr_tensor_unsafe( + new_crow_indices, + new_col_indices, + new_values, + {sizes[0], sizes[1]}, + new_values.scalar_type(), + c10::kSparseCsr, + new_values.device()); + } + if (self.layout() == kSparseCsr) { + // Just returning self doesn't work + // RuntimeError: t.use_count() <= 1 INTERNAL ASSERT FAILED at + // "../torch/csrc/autograd/autograd_not_implemented_fallback.cpp":152, + // please report a bug to PyTorch. 
aten::to_sparse_csr + return at::native::_sparse_csr_tensor_unsafe( + self.crow_indices(), + self.col_indices(), + self.values(), + self.sizes(), + self.scalar_type(), + c10::kSparseCsr, + self.device()); + } + AT_ERROR( + "sparse_compressed_to_sparse_csr expected SparseCsr or SparseCsc layout but got ", + self.layout()); +} + +Tensor coo_to_sparse_csr(const Tensor& self) { + TORCH_CHECK( + self.dim() == 2, + "Only 2D tensors can be converted to the SparseCsr layout but got shape: ", + self.sizes()); + auto coalesced_self = self.coalesce(); + auto row_indices = coalesced_self.indices()[0]; + bool out_int32 = (row_indices.scalar_type() == at::kInt); + auto crow_indices = at::_convert_indices_from_coo_to_csr( + row_indices, self.size(0), out_int32); + return at::native::_sparse_csr_tensor_unsafe( + crow_indices, + coalesced_self.indices()[1].contiguous(), + coalesced_self.values(), + coalesced_self.sizes(), + coalesced_self.scalar_type(), + c10::kSparseCsr, + coalesced_self.device()); +} + +Tensor coo_to_sparse_csc(const Tensor& self) { + TORCH_CHECK( + self.dim() == 2, + "Only 2D tensors can be converted to the SparseCsc layout but got shape: ", + self.sizes()); + auto coalesced_self = self.transpose(0, 1).coalesce().to_sparse_csr(); + return at::native::_sparse_csc_tensor_unsafe( + coalesced_self.crow_indices(), + coalesced_self.col_indices(), + coalesced_self.values(), + self.sizes(), + coalesced_self.scalar_type(), + c10::kSparseCsc, + coalesced_self.device()); +} + +Tensor coo_to_sparse_bsr(const Tensor& self, IntArrayRef blocksize) { + AT_ERROR( + "Conversion from ", self.layout(), " to SparseBsr is currently not supported."); + return self; +} + +Tensor coo_to_sparse_bsc(const Tensor& self, IntArrayRef blocksize) { + AT_ERROR( + "Conversion from ", self.layout(), " to SparseBsc is currently not supported."); + return self; +} + +namespace { +template +void convert_indices_from_coo_to_csr_cpu( + const Tensor& result, + const Tensor& input, + const int64_t size) { + int64_t numel = input.numel(); + const input_t* data_in = input.data_ptr(); + output_t* data_out = result.data_ptr(); + + if (numel == 0) { + result.zero_(); + return; + } + + for (int64_t i = 0; i <= data_in[0]; i++) + data_out[i] = static_cast(0); + + at::parallel_for( + 0, numel - 1, at::internal::GRAIN_SIZE, [&](int64_t start, int64_t end) { + input_t curr_value = data_in[start], next_value; + for (const auto i : c10::irange(start, end)) { + next_value = data_in[i + 1]; + for (; curr_value < next_value; curr_value++) + data_out[curr_value + 1] = static_cast(i + 1); + } + }); + for (int64_t i = data_in[numel - 1] + 1; i < size + 1; i++) { + data_out[i] = static_cast(numel); + } +} + +template +void convert_indices_from_csr_to_coo_cpu( + const Tensor& indices, + const Tensor& crow_indices, + const Tensor& col_indices, + const bool transpose = false) { + int64_t nrows = crow_indices.numel() - 1; + if (nrows == 0) { + indices.zero_(); + return; + } + auto crow_indices_ = crow_indices.expect_contiguous(); + const input_t* crow_indices_data_in = crow_indices_->data_ptr(); + TORCH_INTERNAL_ASSERT(indices.is_contiguous()); + auto row0 = indices.select(0, transpose ? 1 : 0); + auto row1 = indices.select(0, transpose ? 
0 : 1); + output_t* data_out = row0.data_ptr(); + row1.copy_(*col_indices.expect_contiguous()); + at::parallel_for( + 0, nrows, at::internal::GRAIN_SIZE, [&](int64_t start, int64_t end) { + for (const auto i : c10::irange(start, end)) { + std::fill( + &data_out[crow_indices_data_in[i]], + &data_out[crow_indices_data_in[i + 1]], + static_cast(i)); + } + }); +} +} // namespace + +TORCH_IMPL_FUNC(_convert_indices_from_coo_to_csr_structured_cpu) +(const Tensor& input, + const int64_t size, + const bool out_int32, + const Tensor& result) { + if (out_int32) { + AT_DISPATCH_INTEGRAL_TYPES( + input.scalar_type(), "convert_indices_from_coo_to_csr_cpu", [&] { + convert_indices_from_coo_to_csr_cpu( + result, input, size); + }); + } else { + AT_DISPATCH_INTEGRAL_TYPES( + input.scalar_type(), "convert_indices_from_coo_to_csr_cpu", [&] { + convert_indices_from_coo_to_csr_cpu( + result, input, size); + }); + } +} + +TORCH_IMPL_FUNC(_convert_indices_from_csr_to_coo_structured_cpu) +(const Tensor& crow_indices, + const Tensor& col_indices, + const bool out_int32, + const bool transpose, + const Tensor& result) { + if (out_int32) { + AT_DISPATCH_INTEGRAL_TYPES( + crow_indices.scalar_type(), "convert_indices_from_csr_to_coo_cpu", [&] { + convert_indices_from_csr_to_coo_cpu( + result, crow_indices, col_indices, transpose); + }); + } else { + AT_DISPATCH_INTEGRAL_TYPES( + crow_indices.scalar_type(), "convert_indices_from_csr_to_coo_cpu", [&] { + convert_indices_from_csr_to_coo_cpu( + result, crow_indices, col_indices, transpose); + }); + } +} + +/* + * Based on + * https://github.com/scipy/scipy/blob/8a64c938ddf1ae4c02a08d2c5e38daeb8d061d38/scipy/sparse/sparsetools/csr.h + */ +template +void _csr_to_block_csr_cpu_kernel( + const I n_row, + const I n_col, + const I R, + const I C, + const I* input_crow_indices, + const I* input_col_indices, + const T* input_values, + I* result_crow_indices, + I* result_col_indices, + T* result_values) { + // All blocks are possible, that is, may be allocated if a single non-zero + // value lives within them. Otherwise they're not. + + // Allocate pointers for all possible column blocks plus 1 + std::vector blocks(n_col / C + 1, (T*)0); + + assert(n_row % R == 0); + assert(n_col % C == 0); + + // Major assumptions + // 1. Blocks must be square + + // Number of blocks along rows + I n_brow = n_row / R; + // Number of blocks along columns + // I n_bcol = n_col / C; + + // Number of elements per block + I RC = R * C; + // Number of blocks overall + I n_blks = 0; + + result_crow_indices[0] = 0; + + // Iterate over blocks along rows + for (I block_i = 0; block_i < n_brow; block_i++) { + // Iterate over rows within block + for (I r = 0; r < R; r++) { + I i = R * block_i + r; // row index + for (I jj = input_crow_indices[i]; jj < input_crow_indices[i + 1]; jj++) { + I j = input_col_indices[jj]; // column index + + // Block corresponding to column index + I block_j = j / C; + // Column within block + I c = j % C; + + if (blocks[block_j] == 0) { + blocks[block_j] = result_values + RC * n_blks; + result_col_indices[n_blks] = block_j; + n_blks++; + } + + // Specific blocks entries should not be visited more than once. + // Scipy code does an addition here. Why? 
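The kernel above builds CSR row pointers by scanning the sorted COO row indices in parallel. An equivalent, purely serial construction by counting and prefix-summing is sketched below (hypothetical names, no dtype dispatch); either way, row r's entries end up in the half-open range [crow[r], crow[r+1]).

#include <cstdio>
#include <vector>

// Build compressed row pointers from sorted COO row indices.
// `rows` must be sorted ascending; n_rows is the number of matrix rows.
std::vector<long> coo_rows_to_crow(const std::vector<long>& rows, long n_rows) {
  std::vector<long> crow(n_rows + 1, 0);
  for (long r : rows) crow[r + 1] += 1;                       // nonzeros per row
  for (long i = 0; i < n_rows; ++i) crow[i + 1] += crow[i];   // prefix sum
  return crow;
}

int main() {
  // Nonzeros in rows 0, 0, 1, 3 of a 4-row matrix.
  auto crow = coo_rows_to_crow({0, 0, 1, 3}, 4);
  for (long v : crow) std::printf("%ld ", v);  // 0 2 3 3 4
  std::printf("\n");
}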
+ *(blocks[block_j] + C * r + c) = input_values[jj]; + } + } + + for (I jj = input_crow_indices[R * block_i]; + jj < input_crow_indices[R * (block_i + 1)]; + jj++) { + blocks[input_col_indices[jj] / C] = 0; + } + + result_crow_indices[block_i + 1] = n_blks; + } +} + +/* + * Based on + * https://github.com/scipy/scipy/blob/8a64c938ddf1ae4c02a08d2c5e38daeb8d061d38/scipy/sparse/sparsetools/csr.h + */ +template +I csr_count_blocks( + const I n_row, + const I n_col, + const I R, + const I C, + const I Ap[], + const I Aj[]) { + std::vector mask(n_col / C + 1, -1); + I n_blks = 0; + for (I i = 0; i < n_row; i++) { + I bi = i / R; + for (I jj = Ap[i]; jj < Ap[i + 1]; jj++) { + I bj = Aj[jj] / C; + if (mask[bj] != bi) { + mask[bj] = bi; + n_blks++; + } + } + } + return n_blks; +} + +Tensor _csr_to_block_csr_cpu(const Tensor& self, IntArrayRef blocksize) { + TORCH_CHECK( + blocksize[0] == blocksize[1], + "blocks must be square. ", + "Got (", + blocksize[0], + ", ", + blocksize[1], + ") instead."); + TORCH_CHECK( + self.size(0) % blocksize[0] == 0 && self.size(1) % blocksize[1] == 0, + "Block sparse CSR Tensors must have a size that is an ", + "integral multiple of their block size. ", + "Got Tensor of size (", + self.size(0), + ", ", + self.size(1), + ") with block size (", + blocksize[0], + ", ", + blocksize[1], + ") instead."); + Tensor input_values = self.values().contiguous(); + Tensor input_crow_indices = self.crow_indices().contiguous(); + Tensor input_col_indices = self.col_indices().contiguous(); + + // First we determine the number of blocks needed. For each given block, if it + // contains a non-zero element we will allocate values and indices for it. + int64_t num_blocks; + int64_t n_row = self.size(0); + int64_t n_col = self.size(1); + AT_DISPATCH_INDEX_TYPES( + input_crow_indices.scalar_type(), "_csr_to_block_csr_cpu", [&] { + num_blocks = csr_count_blocks( + n_row, + n_col, + blocksize[0], + blocksize[1], + input_crow_indices.data_ptr(), + input_col_indices.data_ptr()); + }); + + Tensor result_values = + input_values.new_zeros({num_blocks, blocksize[0], blocksize[1]}); + Tensor result_crow_indices = + input_crow_indices.new_empty({(n_row / blocksize[0]) + 1}); + Tensor result_col_indices = input_col_indices.new_empty({num_blocks}); + + // Next we copy over non-zero elements into the allocated blocks. 
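The block-counting pass shown above (csr_count_blocks) can be exercised on its own. The sketch below repeats the same mask trick with plain vectors on a 4x4 CSR matrix and 2x2 blocks, so each (block-row, block-column) pair is counted exactly once.

#include <cstdio>
#include <vector>

// Count how many RxC blocks of an n_row x n_col CSR matrix hold at least one
// stored element: mask[bj] remembers the last block-row that touched block
// column bj, so repeated hits within a block-row are not double counted.
long count_blocks(long n_row, long n_col, long R, long C,
                  const std::vector<long>& crow,
                  const std::vector<long>& col) {
  std::vector<long> mask(n_col / C + 1, -1);
  long n_blocks = 0;
  for (long i = 0; i < n_row; ++i) {
    const long bi = i / R;  // block-row of row i
    for (long jj = crow[i]; jj < crow[i + 1]; ++jj) {
      const long bj = col[jj] / C;  // block-column of this nonzero
      if (mask[bj] != bi) {
        mask[bj] = bi;
        ++n_blocks;
      }
    }
  }
  return n_blocks;
}

int main() {
  // 4x4 matrix with nonzeros at (0,0), (0,1), (1,0), (3,3): two 2x2 blocks.
  std::vector<long> crow = {0, 2, 3, 3, 4};
  std::vector<long> col = {0, 1, 0, 3};
  std::printf("%ld\n", count_blocks(4, 4, 2, 2, crow, col));  // 2
}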
+ AT_DISPATCH_INDEX_TYPES( + input_crow_indices.scalar_type(), "_csr_to_block_csr_cpu", [&] { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES( + input_values.scalar_type(), "_csr_to_block_csr_cpu", [&] { + _csr_to_block_csr_cpu_kernel( + n_row, + n_col, + blocksize[0], + blocksize[1], + input_crow_indices.data_ptr(), + input_col_indices.data_ptr(), + input_values.data_ptr(), + result_crow_indices.data_ptr(), + result_col_indices.data_ptr(), + result_values.data_ptr()); + }); + }); + return at::native::_sparse_bsr_tensor_unsafe( + result_crow_indices, + result_col_indices, + result_values, + self.sizes(), + result_values.scalar_type(), + c10::kSparseBsr, + result_values.device()); +} + +Tensor sparse_compressed_to_sparse_bsr(const Tensor& self, IntArrayRef blocksize) { + TORCH_CHECK( + self.is_sparse_csr(), + "Can only convert CSR to SparseBsr, but got ", + self.layout(), + " instead."); + Tensor self_values = self.values(); + Tensor self_crow_indices = self.crow_indices(); + Tensor self_col_indices = self.col_indices(); + Tensor cpu_result = _csr_to_block_csr_cpu( + _sparse_csr_tensor_unsafe( + self_crow_indices.cpu(), + self_col_indices.cpu(), + self_values.cpu(), + self.sizes(), + self_values.scalar_type(), + self.layout(), + self_values.device()), + blocksize); + Tensor result_values = cpu_result.values().to(self_values.options()); + Tensor result_crow_indices = + cpu_result.crow_indices().to(self_crow_indices.options()); + Tensor result_col_indices = + cpu_result.col_indices().to(self_col_indices.options()); + return at::native::_sparse_bsr_tensor_unsafe( + result_crow_indices, + result_col_indices, + result_values, + self.sizes(), + result_values.scalar_type(), + c10::kSparseBsr, + result_values.device()); +} + +Tensor sparse_compressed_to_sparse_bsc(const Tensor& self, IntArrayRef blocksize) { + AT_ERROR( + "Conversion from ", self.layout(), " to SparseBsc is currently not supported."); + return self; +} + +Tensor sparse_compressed_to_sparse_csc(const Tensor& self) { + if (self.layout() == kSparseCsc) { + // Based on to_sparse_csr just returning self doesn't work + return _sparse_csc_tensor_unsafe( + self.ccol_indices(), + self.row_indices(), + self.values(), + self.sizes(), + self.scalar_type(), + c10::kSparseCsc, + self.device()); + } + AT_ERROR( + "Conversion from ", self.layout(), " to SparseCsc is currently not supported."); +} + +Tensor sparse_compressed_to_sparse(const Tensor& self, int64_t sparse_dim) { + TORCH_CHECK(sparse_dim > 0, "sparse_dim must be >0"); + TORCH_CHECK(sparse_dim <= 2, + "sparse_dim must be less than or equal to 2"); + // TODO: implement coo.to_sparse(sparse_dim) and then use + // return self.to_sparse().to_sparse(sparse_dim); + TORCH_CHECK( + sparse_dim == 2, "sparse dim 1 is not supported by sparse_csr_to_dense"); + if (self.layout() == kSparseCsc) { + Tensor indices = at::_convert_indices_from_csr_to_coo( + self.ccol_indices(), self.row_indices(), false, true); + return at::native::_sparse_coo_tensor_unsafe( + indices, self.values(), self.sizes()) + ._coalesced_(true); + } + if (self.layout() == kSparseCsr) { + Tensor indices = at::_convert_indices_from_csr_to_coo( + self.crow_indices(), self.col_indices(), false, false); + return at::native::_sparse_coo_tensor_unsafe( + indices, self.values(), self.sizes()) + ._coalesced_(true); + } + AT_ERROR( + "sparse_compressed_to_sparse expected SparseCsr or SparseCsc layout but got ", + self.layout()); +} + +Tensor sparse_compressed_to_sparse(const Tensor& self) { + return sparse_compressed_to_sparse(self, 2); +} + +// 
Sparse layout conversions End }} // namespace at::native diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index 458a694411e4..4494ff16eb6b 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -110,9 +110,9 @@ Tensor _dim_arange(const Tensor& like, int64_t dim) { // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ complex / polar ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ void complex_check_floating(const Tensor& a, const Tensor& b) { - TORCH_CHECK((a.scalar_type() == kFloat || a.scalar_type() == kDouble) && - (b.scalar_type() == kFloat || b.scalar_type() == kDouble), - "Expected both inputs to be Float or Double tensors but got ", + TORCH_CHECK((a.scalar_type() == kFloat || a.scalar_type() == kDouble || a.scalar_type() == kHalf) && + (b.scalar_type() == kFloat || b.scalar_type() == kDouble || b.scalar_type() == kHalf), + "Expected both inputs to be Half, Float or Double tensors but got ", a.scalar_type(), " and ", b.scalar_type()); } @@ -932,7 +932,7 @@ Tensor& randperm_out_cpu(int64_t n, c10::optional generator, Tensor& auto gen = get_generator_or_default(generator, detail::getDefaultCPUGenerator()); // See Note [Acquire lock when using random generators] std::lock_guard lock(gen->mutex_); - AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, result.scalar_type(), "randperm", [&]() -> void { + AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, result.scalar_type(), "randperm", [&]() -> void { randperm_cpu(result, n, gen); }); @@ -1344,6 +1344,11 @@ Tensor kaiser_window( TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); window_function_checks("kaiser_window", options, window_length); + // short-circuit for `meta`. + if (device == kMeta) { + return at::empty({window_length}, options); + } + if (window_length == 0) { return at::empty({0}, options); } diff --git a/aten/src/ATen/native/TensorFactories.h b/aten/src/ATen/native/TensorFactories.h index 9ef00d619675..35e058df4b3a 100644 --- a/aten/src/ATen/native/TensorFactories.h +++ b/aten/src/ATen/native/TensorFactories.h @@ -1,11 +1,17 @@ #pragma once -#include +#include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + namespace at { namespace native { // Different combinations of row, col, and offset can lead to two cases: // @@ -29,6 +35,10 @@ namespace at { namespace native { // In this case, we first calculate the size of top trapezoid, and then // calculate the size of the bottom rectangle. inline int64_t get_tril_size(int64_t row, int64_t col, int64_t offset) { + // If either dimension is 0 then the there is no tril + if (row == 0 || col == 0) { + return 0; + } // number of elements in the first row of the tril auto m_first_row = offset > 0 ? 
std::min(col, 1 + offset) : // upper bounded by col @@ -95,7 +105,7 @@ struct ZeroTensorAllocator final : public at::Allocator { static void deleter(void* const pointer) { TORCH_INTERNAL_ASSERT(!pointer); } - DataPtr allocate(const size_t nbytes) const override { + DataPtr allocate(const size_t /*nbytes*/) const override { return {nullptr, nullptr, &deleter, device_}; } DeleterFnPtr raw_deleter() const override { diff --git a/aten/src/ATen/native/TensorProperties.cpp b/aten/src/ATen/native/TensorProperties.cpp index 63d928749e09..fd72abc580b4 100644 --- a/aten/src/ATen/native/TensorProperties.cpp +++ b/aten/src/ATen/native/TensorProperties.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include @@ -31,7 +31,7 @@ int64_t stride(const Tensor& self, Dimname dim) { return self.strides()[pos_dim]; } -bool cudnn_is_acceptable(const Tensor& self) { +bool cudnn_is_acceptable(const TensorBase& self) { if (!globalContext().userEnabledCuDNN()) return false; if (!self.is_cuda()) return false; auto st = self.scalar_type(); @@ -48,6 +48,10 @@ bool cudnn_is_acceptable(const Tensor& self) { return true; } +bool cudnn_is_acceptable(const Tensor& self) { + return cudnn_is_acceptable(static_cast(self)); +} + Tensor & detach_(Tensor & self) { // this just exists to give us a hook in VariableType and an entry in Declarations.yaml //AT_ERROR("detach_ is not implemented for Tensor"); diff --git a/aten/src/ATen/native/TensorProperties.h b/aten/src/ATen/native/TensorProperties.h new file mode 100644 index 000000000000..fe6e8395c178 --- /dev/null +++ b/aten/src/ATen/native/TensorProperties.h @@ -0,0 +1,12 @@ +#pragma once + +// See NOTE: [Tensor vs. TensorBase] +namespace at { +class TensorBase; +} + +namespace at { namespace native { + +TORCH_API bool cudnn_is_acceptable(const TensorBase& self); + +}} // namespace at::native diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index 3999805fee14..9d05610f4fdb 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -28,6 +29,139 @@ #include namespace at { +namespace meta { +inline void cat_check_no_zero_dim(const MaterializedITensorListRef& tensors) { + size_t i = 0; + for (const Tensor& t : tensors) { + TORCH_CHECK( + t.dim() > 0, + "zero-dimensional tensor (at position ", i, ") cannot be concatenated"); + i++; + } +} + +inline c10::MemoryFormat cat_compute_output_memory_format(const MaterializedITensorListRef& inputs) { + c10::optional format = c10::nullopt; + for (const Tensor& t : inputs) { + auto f = t.suggest_memory_format(); + if (f == c10::MemoryFormat::Contiguous) { + return f; + } + if (format.has_value() && format.value() != f) { + return c10::MemoryFormat::Contiguous; + } + format = f; + } + return format.value(); +} + +TORCH_PRECOMPUTE_META_FUNC(cat)(ITensorListRef tensors, int64_t dim) { + // previously, size [0] tensors were the only possible empty tensors; thus, it wasn't possible + // to cat empty tensors unless all the other tensors were 1-dimensional, so we allowed these tensors + // to be "skipped". We maintain this behavior for backwards compatibility, but only for this specific + // size (i.e. other empty sizes are not skipped). + auto materialized = tensors.materialize(); + + cat_check_no_zero_dim(materialized); + dim = at::legacy_cat_wrap_dim(dim, tensors); + + // Checking names before the actual dimensions. 
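  // Illustrative note (a minimal sketch; variable names below are hypothetical): the
  // dtype promotion computed further down via at::native::result_type(tensors)
  // behaves roughly like
  //   auto a = at::ones({2, 3}, at::kFloat);
  //   auto b = at::ones({4, 3}, at::kDouble);
  //   at::cat({a, b}, 0).scalar_type();   // expected to be at::kDouble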
+ auto maybe_outnames = namedinference::compute_cat_outnames(tensors); + + TORCH_CHECK( + materialized.size() > 0, "torch.cat(): expected a non-empty list of Tensors"); + + // Look for the first valid tensor. + size_t valid = materialized.size(); + for (const auto i : c10::irange(materialized.size())) { + if (!at::native::cat_should_skip_tensor(materialized[i].get())) { + valid = i; + break; + } + } + + bool all_contiguous = true; + bool all_same_dtype = true; + bool all_same_sizes_and_stride = true; + auto memory_format = cat_compute_output_memory_format(materialized); + + // Compute what the output dtype should be: + const auto& result = maybe_get_output(); + auto is_out_defined = result.defined(); + auto out_dtype = at::native::result_type(tensors); + + // If the output tensor is defined, we need to take it into account + // when computing the actual output dtype and the flags. + if (is_out_defined) { + // Check for type promotion, if the output tensor is defined. + TORCH_CHECK( + canCast(out_dtype, result.scalar_type()), + "torch.cat(): input types can't be cast to the desired output type ", + result.scalar_type()); + out_dtype = result.scalar_type(); + all_contiguous = result.is_contiguous(memory_format); + } + + // Fallback 'set_output' parameters. + // (in case we don't find a valid tensor) + DimVector sizes {0}; + TensorOptions options = materialized[0].get().options() + .dtype(out_dtype) + .memory_format(memory_format); + + // If we found a valid tensor, check whether the input tensors + // are compatible, i.e. we can execute `cat` on them. + bool found_valid_tensor = valid < materialized.size(); + if (found_valid_tensor) { + TORCH_CHECK( + dim <= materialized[valid].get().dim(), "torch.cat(): dimension ", dim, "out of range"); + + // Compute the output tensor size. + // It should have the same shape as any other valid tensor, + // except in the dimension 'dim'. + size_t size_at_dim = 0; + for (const auto i : c10::irange(materialized.size())) { + const Tensor& t = materialized[i]; + if (!at::native::cat_should_skip_tensor(t)) { + at::native::check_cat_shape_except_dim(materialized[valid], t, dim, i); + size_at_dim += t.size(dim); + all_contiguous = all_contiguous && t.is_contiguous(memory_format); + all_same_dtype = all_same_dtype && out_dtype == t.scalar_type(); + all_same_sizes_and_stride = all_same_sizes_and_stride && + t.sizes() == materialized[valid].get().sizes() && + t.strides() == materialized[valid].get().strides(); + } else { + all_contiguous = false; + } + } + + // Actually set the output. + sizes = materialized[valid].get().sizes().vec(); + sizes[dim] = size_at_dim; + options = materialized[valid].get().options() + .dtype(out_dtype) + .memory_format(memory_format); + } + + set_output(0, sizes, {}, options, maybe_outnames); + // Checks for overlaps between the inputs and the output tensor. 
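  // Illustrative sketch of what these checks reject (names are hypothetical, not from
  // this change):
  //   auto buf = at::ones({4, 2});
  //   auto chunk = buf.narrow(0, 0, 2);      // a view sharing storage with buf
  //   at::cat_out(buf, {chunk, chunk}, 0);   // expected to throw: out= aliases an input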
+ if (is_out_defined && found_valid_tensor) { + at::assert_no_internal_overlap(result); + for (const Tensor& t : materialized) { + at::assert_no_overlap(result, t); + } + } + + return TORCH_PRECOMPUTE_STRUCT(cat)() + .set_dim(dim) + .set_valid(valid) + .set_all_contiguous(all_contiguous) + .set_all_same_dtype(all_same_dtype) + .set_all_same_sizes_and_stride(all_same_sizes_and_stride) + .set_memory_format(memory_format); +} +} // namespace meta + namespace native { DEFINE_DISPATCH(cat_serial_stub); @@ -59,12 +193,19 @@ Tensor& set_storage_cpu_(Tensor& result, Storage storage, int64_t storage_offset checkSetStorage(result, storage, storage_offset, size, stride); result.unsafeGetTensorImpl()->set_storage_offset(storage_offset); - c10::optional stride_opt = stride.data() != nullptr ? - c10::optional(stride) : c10::nullopt; - at::native::resize_impl_cpu_(result.unsafeGetTensorImpl(), size, stride_opt); + at::OptionalIntArrayRef stride_opt = stride.data() != nullptr ? + at::OptionalIntArrayRef(stride) : c10::nullopt; + // We can re-use this kernel for the meta device. + // We just need to make sure we don't actually try to resize the (null) storage. + at::native::resize_impl_cpu_(result.unsafeGetTensorImpl(), size, stride_opt, /*resize_storage=*/!result.is_meta()); return result; } +Tensor& set_(Tensor& result, const Tensor& storage, int64_t storage_offset, IntArrayRef size, IntArrayRef stride) { + TORCH_CHECK(storage.is_contiguous(), "passed in tensor to be used as storage must be contiguous"); + return result.set_(storage.storage(), storage_offset + storage.storage_offset(), size, stride); +} + Tensor& set_tensor_(Tensor& result, const Tensor& source) { if (result.unsafeGetTensorImpl() != source.unsafeGetTensorImpl()) { return result.set_(source.storage(), source.storage_offset(), source.sizes(), source.strides()); @@ -87,6 +228,19 @@ Tensor& set_cpu_(Tensor& result) { return result; } +// We can't re-use the cpu kernel here because we don't want to use the cpu allocator. +Tensor& set_meta_(Tensor& result) { + caffe2::TypeMeta dtype = result.dtype(); + Storage storage( + Storage::use_byte_size_t(), + 0, + c10::GetAllocator(kMeta), + true); + result.set_(storage, 0, {0}, {}); + TORCH_INTERNAL_ASSERT(dtype == result.dtype()); + return result; +} + Tensor sparse_broadcast_to(const Tensor& self, IntArrayRef size) { TORCH_CHECK(self.is_sparse(), "input must be sparse tensor"); int64_t sparse_extra_ndim = size.size() - self.dim(); @@ -171,132 +325,49 @@ std::vector broadcast_tensors(TensorList tensors) { return expand_outplace(tensors); } -static bool should_skip(const Tensor& t) { - return t.numel() == 0 && t.dim() == 1; -} - -Tensor & _cat_out_cpu(TensorList tensors, int64_t dim, Tensor& result) { - check_cat_no_zero_dim(tensors); - dim = legacy_cat_wrap_dim(dim, tensors); - // previously, size [0] tensors were the only possible empty tensors; thus, it wasn't possible - // to cat empty tensors unless all the other tensors were 1-dimensional, so we allowed these tensors - // to be "skipped". We maintain this behavior for backwards compatibility, but only for this specific - // size (i.e. other empty sizes are not skipped). - - bool allContiguous = true; - - // Inputs cannot alias the output tensor - for (const auto i : c10::irange(tensors.size())) { - auto lap = at::get_overlap_status(result, tensors[i]); - TORCH_CHECK(lap != at::MemOverlapStatus::PARTIAL && - lap != at::MemOverlapStatus::FULL, 0, - "unsupported operation: the input tensors cannot refer to any of the " - "output memory locations. 
Found overlap in input tensor ", i); - } - at::assert_no_internal_overlap(result); - - const Tensor* pnotSkippedTensor = [](const TensorList &tensors) -> const Tensor* { - for (auto const &tensor : tensors) { - if (should_skip(tensor)) { - continue; - } - // we've found a non-empty tensor - return &tensor; - } - return nullptr; - }(tensors); - - if (!pnotSkippedTensor) { - // FIXME: warn if this is the case -- see comment about skipped - // tensors at top of function. - return result; - } - const Tensor& notSkippedTensor = *pnotSkippedTensor; - - TORCH_CHECK(tensors.size() > 0, "torch.cat(): expected a non-empty list of Tensors"); - TORCH_CHECK(dim <= notSkippedTensor.dim(), "torch.cat(): dimension ", dim, "out of range"); - - // when the input tensors are of the same size and strides, - // reuse the same iterator for all input tensors - bool reuse_iterator = true; - bool no_type_promotion = true; - // Check the type of the result - no_type_promotion = result.dtype() == notSkippedTensor.dtype(); - - // compute size of the result in the cat dimension - int64_t cat_dim_size = 0; - auto first_tensor_mem_format = tensors[0].suggest_memory_format(); - for (const auto i : c10::irange(tensors.size())) { - auto const &tensor = tensors[i]; - if (should_skip(tensor)) { - // don't use fast path for empty tensor - allContiguous = false; - continue; - } - check_cat_shape_except_dim(notSkippedTensor, tensor, dim, i); - cat_dim_size += tensor.sizes()[dim]; - - if (!tensor.is_contiguous(first_tensor_mem_format)) { - allContiguous = false; - } - - if (tensor.sizes() != notSkippedTensor.sizes() || - tensor.strides() != notSkippedTensor.strides()) { - reuse_iterator = false; - } - if (tensor.dtype() != notSkippedTensor.dtype()) { - no_type_promotion = false; - } - } - // compute the size of the result - auto result_size = notSkippedTensor.sizes().vec(); - result_size[dim] = cat_dim_size; - - // skip resizing if size of result is same as expected - // raise a warning while resizing if output has one or more elements - // See https://github.com/pytorch/pytorch/pull/62560#discussion_r687363362 - // for understanding why at::native::resize_output is not called directly. 
- // if (at::native::resize_output_check(result, result_size)) { - // TODO: restore the above, see https://github.com/pytorch/pytorch/issues/64709 - - if (result.sizes() != result_size) { - result.resize_(result_size, first_tensor_mem_format); - } - +TORCH_IMPL_FUNC(cat_out_cpu) +(ITensorListRef tensors, + int64_t dim, + int64_t valid, + bool all_contiguous, + bool all_same_dtype, + bool all_same_sizes_and_stride, + MemoryFormat memory_format, + const Tensor& result) { if (result.numel() == 0) { - return result; + return; } + auto materialized = tensors.materialize(); + // fast path for single thread when both inputs and result are contiguous and not empty - allContiguous = allContiguous && result.is_contiguous(first_tensor_mem_format); bool use_serial_kernel = result.numel() < at::internal::GRAIN_SIZE || at::get_num_threads() == 1; - ScalarType dtype = notSkippedTensor.scalar_type(); + ScalarType dtype = materialized[valid].get().scalar_type(); bool serial_dtype = (dtype == ScalarType::Double || dtype == ScalarType::Float || dtype == ScalarType::BFloat16); - if (use_serial_kernel && allContiguous && no_type_promotion && serial_dtype) { - cat_serial_stub(kCPU, result, tensors, dim); - return result; + if (use_serial_kernel && all_contiguous && all_same_dtype && serial_dtype) { + cat_serial_stub(kCPU, result, materialized, dim); + return; } int64_t offset = 0; - if (reuse_iterator && - result.is_contiguous(first_tensor_mem_format) && - no_type_promotion) { - const auto& source_slice = notSkippedTensor; + if (all_same_sizes_and_stride && result.is_contiguous(memory_format) && + all_same_dtype) { + const Tensor& source_slice = materialized[valid]; auto slice_dim_size = source_slice.sizes()[dim]; auto result_slice = result.narrow(dim, 0, slice_dim_size); auto result_slice_data = result_slice.data_ptr(); auto result_stride_bytes = result.stride(dim) * elementSize(result.scalar_type()); auto iter = TensorIteratorConfig() - .set_check_mem_overlap(false) // Already checked above + .set_check_mem_overlap(false) .resize_outputs(false) .add_output(result_slice) .add_input(source_slice) .enforce_safe_casting_to_output(true) .build(); - for (auto const &tensor : tensors) { - if (should_skip(tensor)) { + for (const Tensor& tensor : materialized) { + if (cat_should_skip_tensor(tensor)) { continue; } auto source_data = static_cast(tensor.data_ptr()); @@ -307,8 +378,8 @@ Tensor & _cat_out_cpu(TensorList tensors, int64_t dim, Tensor& result) { offset += slice_dim_size; } } else { - for (auto const &tensor: tensors) { - if (should_skip(tensor)) { + for (const Tensor& tensor: materialized) { + if (cat_should_skip_tensor(tensor)) { continue; } auto slice_dim_size = tensor.sizes()[dim]; @@ -327,24 +398,6 @@ Tensor & _cat_out_cpu(TensorList tensors, int64_t dim, Tensor& result) { offset += slice_dim_size; } } - - return result; -} - -Tensor _cat_cpu(TensorList tensors, int64_t dim) { - ScalarType high_type = result_type(tensors); - Tensor result = at::empty({0}, tensors[0].options().dtype(high_type)); - return native::_cat_out_cpu(tensors, dim, result); -} - -Tensor & cat_out(TensorList tensors, int64_t dim, Tensor & result) { - auto maybe_outnames = namedinference::compute_cat_outnames(tensors); - { - NoNamesGuard guard; - at::_cat_out(result, tensors, dim); - } - namedinference::propagate_names_if_nonempty(result, maybe_outnames); - return result; } Tensor& cat_out(TensorList tensors, Dimname dim, Tensor& result) { @@ -404,7 +457,7 @@ static void check_cat_sparse_dims(Tensor const &t, ", but tensor at position 
", pos, " has ", t.sparse_dim(), ", ", t.dense_dim(), "."); } -static Tensor cat_sparse(TensorList tensors, int64_t dim) { +static Tensor cat_sparse_impl(TensorList tensors, int64_t dim) { std::vector indices; std::vector values; int64_t wrapped = maybe_wrap_dim(dim, tensors[0].dim()); @@ -501,15 +554,15 @@ static Tensor cat_sparse(TensorList tensors, int64_t dim) { t._values().options().layout_opt(), t._values().options().device_opt(), t._values().options().pinned_memory_opt()); - vals_pieces.push_back(native::cat({z1, t._values(), z2}, values_dim)); + vals_pieces.push_back(at::cat({z1, t._values(), z2}, values_dim)); idxs_pieces.push_back(t._indices()); } auto sizes_copy = sizes.vec(); sizes_copy[wrapped] = total_size; // This can create an uncoalesced tensor return native::sparse_coo_tensor( - native::cat(idxs_pieces, 1), - native::cat(vals_pieces), + at::cat(idxs_pieces, 1), + at::cat(vals_pieces), sizes_copy, optTypeMetaToScalarType(tensors[0].options().dtype_opt()), tensors[0].options().layout_opt(), @@ -518,18 +571,9 @@ static Tensor cat_sparse(TensorList tensors, int64_t dim) { } } -Tensor cat(TensorList tensors, int64_t dim) { - if (tensors.size() > 0 && - tensors[0].is_sparse()) { - return cat_sparse(tensors, dim); - } - +Tensor cat_sparse(TensorList tensors, int64_t dim) { auto maybe_outnames = namedinference::compute_cat_outnames(tensors); - Tensor result; - { - NoNamesGuard guard; - result = at::_cat(tensors, dim); - } + auto result = cat_sparse_impl(tensors, at::legacy_cat_wrap_dim(dim, tensors)); namedinference::propagate_names_if_nonempty(result, maybe_outnames); return result; } @@ -798,6 +842,11 @@ Tensor diag_embed(const Tensor& self, int64_t offset, int64_t dim1_, int64_t dim return result; } +Tensor expand_symint(const Tensor& self, c10::SymIntArrayRef packed_size, bool implicit) { + auto size = expectIntArrayRef(packed_size); + return expand(self, size, implicit); +} + Tensor expand(const Tensor& self, IntArrayRef size, bool /*unused*/) { TORCH_CHECK(size.size() >= (size_t)self.dim(), "expand(", self.toString(), "{", self.sizes(), "}, size=", size, @@ -877,6 +926,19 @@ const Tensor &as_strided_(const Tensor& self, IntArrayRef size, IntArrayRef stri return self; } +Tensor narrow_copy_symint(const Tensor& self, int64_t dim, int64_t start, SymInt sym_length) { + return narrow_copy(self, dim, start, sym_length.expect_int()); +} + +Tensor narrow_copy_dense(const Tensor& self, int64_t dim, int64_t start, int64_t length) { + return self.narrow(dim, start, length).clone(at::MemoryFormat::Contiguous); +} + +Tensor narrow_copy_dense_cpu(const Tensor& self, int64_t dim, int64_t start, int64_t length){ + auto output = at::empty_like(self); + return narrow_copy_dense_cpu_out(self, dim, start, length, output); +} + Tensor narrow_copy_sparse(const Tensor& self, int64_t dim, int64_t start, int64_t length) { int64_t allDim = self.dim(); int64_t end = start+length; @@ -914,6 +976,7 @@ Tensor narrow_copy_sparse(const Tensor& self, int64_t dim, int64_t start, int64_ Tensor& narrow_copy_dense_cpu_out( const Tensor& self, int64_t dim, int64_t start, int64_t length, Tensor& output ) { + TORCH_CHECK(self.dim() > 0, "narrow() cannot be applied to a 0-dim tensor."); TORCH_CHECK(self.dtype() == output.dtype()); @@ -991,15 +1054,6 @@ Tensor& narrow_copy_dense_cpu_out( return output; } -Tensor narrow_copy_dense(const Tensor& self, int64_t dim, int64_t start, int64_t length){ - return self.narrow(dim, start, length).clone(at::MemoryFormat::Contiguous); -} - -Tensor narrow_copy_dense_cpu(const 
Tensor& self, int64_t dim, int64_t start, int64_t length){ - auto output = at::empty_like(self); - return narrow_copy_dense_cpu_out(self, dim, start, length, output); -} - Tensor narrow(const Tensor& self, int64_t dim, int64_t start, int64_t length) { TORCH_CHECK(self.dim() > 0, "narrow() cannot be applied to a 0-dim tensor."); auto cur_size = self.size(dim); @@ -1159,7 +1213,7 @@ Tensor reshape(const Tensor& self, IntArrayRef proposed_shape) { // // We need to do the checks here instead of in `native_functions.yaml` // to preserve backwards compatibility. - if (!self.is_xla() && !self.is_lazy()) { + if (!self.is_xla() && !self.is_lazy() && !self.is_ipu()) { return self._reshape_alias(shape, stride.value()); } else { return self.view(shape); @@ -1302,7 +1356,7 @@ Tensor select_backward(const Tensor& grad, IntArrayRef input_sizes, int64_t dim, return grad_input; } -Tensor index_select_sparse(const Tensor& self, int64_t dim, const Tensor& index) { +Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& index) { /* Algorithm: index - a 1-D tensor of indicies with shape (n,) @@ -1315,81 +1369,627 @@ Tensor index_select_sparse(const Tensor& self, int64_t dim, const Tensor& index) new_values - shape is (new_nnz,) + dense_shape if dim < len(sparse_shape): - for i, idx in enumerate(index): - for j, jdx in enumerate(indices[dim]): - if idx == jdx: - icol = indices[:dim][j] + (i,) + indices[dim+1:][j] - new_indices.add_column(icol) - new_values.add_row(values[j]) + # Find new_indices[dim] of the output sparse tensor and + # indices at which to select values/indices. + # The CPP code uses (binary/in a count table) search to find matches and may + # swap the loop order for better algorithmic complexity. + new_dim_indices = [] + selected_dim_indices = [] + # This is a brute-force algorithms to convey the main idea. + # The CPP code below is more efficient but more complicated. 
+ for i, i_idx in enumerate(indices[dim]): + for j, j_idx in enumerate(index): + if i_idx == j_idx: + new_dim_indices.append(j) + selected_dim_indices.append(i) + new_indices = indices.index_select(1, selected_dim_indices) + new_values = values.index_select(0, selected_dim_indices) + new_indices[dim] = new_dim_indices else: new_indices = indices - new_values[k] = values[k].index_select(dim - len(sparse_shape), index) for k in range(nnz) + new_values = values.index_select(dim - sparse_dim + 1, index); */ - auto ndim = self.dim(); - if (ndim == 0) { - TORCH_CHECK_INDEX(false, "index_select() cannot be applied to a 0-dim tensor."); - } - if (!(index.dim() == 1 && index.dtype() == at::kLong)) { - TORCH_CHECK_INDEX(false, "index_select() argument index must be 1-D long-tensor."); - } + const auto ndim = self.dim(); + TORCH_CHECK_INDEX(ndim, "index_select() cannot be applied to a 0-dim tensor."); + TORCH_CHECK_INDEX( + index.dim() == 1 && index.dtype() == at::kLong && index.options().layout() == at::kStrided, + "index_select() argument index must be 1-D strided (non-sparse) long-tensor."); dim = maybe_wrap_dim(dim, ndim); - auto size = self.size(dim); - auto sparse_dim = self.sparse_dim(); - auto dense_dim = self.dense_dim(); - auto indices = self._indices(); - auto values = self._values(); - auto nnz = values.size(0); - auto new_sizes = self.sizes().vec(); - new_sizes[dim] = index.size(0); + const auto size = self.size(dim); + const auto sparse_dim = self.sparse_dim(); + const auto dense_dim = self.dense_dim(); + const auto indices = self._indices(); + const auto values = self._values(); + const auto nnz = values.size(0); + const auto index_len = index.size(0); + auto res_sizes = self.sizes().vec(); + res_sizes[dim] = index_len; + + // Equivalent to t.index_select(dim, idx), but vanilla index_select is not parallel, + // so we use gather instead. + // We use this method to select relevant indices/values + // from the intersection between indices[dim] and the index. 
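  // A minimal worked sketch of the lambda below: for t of shape (s0, s1, s2), dim == 1
  // and idx of length k, idx is reshaped to (1, k, 1) and expanded to (s0, k, s2), so
  //   out[i][j][l] == t[i][idx[j]][l]
  // which matches t.index_select(1, idx) while going through the gather kernel.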
+ const auto index_select = [](const Tensor& t, int64_t dim, const Tensor& idx) -> Tensor { + const auto idx_len = idx.numel(); + auto out_shape = t.sizes().vec(); + out_shape[dim] = idx_len; + auto idx_shape = std::vector(t.dim(), 1); + idx_shape[dim] = idx_len; + return t.gather(dim, idx.view(idx_shape).expand(out_shape)); + }; + // If indexing into sparse dimensions if (dim < sparse_dim) { + // short-circuit if index is empty + if (!index_len) { + auto res_indices = index_select(indices, 1, index); + res_indices[dim] = index; + const auto res_values = index_select(values, 0, index); - auto cpu_dim_indices = indices[dim].to(c10::kCPU).contiguous(); - int64_t* cpu_dim_indices_ptr = cpu_dim_indices.data_ptr(); - auto cpu_index = index.to(c10::kCPU).contiguous(); - int64_t* cpu_index_ptr = cpu_index.data_ptr(); - std::vector zindices; - std::vector iindices; - int64_t new_nnz = 0; - for (const auto i : c10::irange(new_sizes[dim])) { - int64_t idx = cpu_index_ptr[i]; - if (idx < -size || idx >= size) { - TORCH_CHECK_INDEX(false, "index_select(): index contains ", idx, " that is out of range for tensor of size ", - self.sizes(), " at dimension ", dim); + return _sparse_coo_tensor_with_dims_and_tensors( + sparse_dim, dense_dim, res_sizes, res_indices, res_values, self.options()); + } + + const auto nneg_index = [&index, index_len, &self, size, dim]() -> Tensor { + const auto index_contiguous = index.contiguous(); + auto nneg_index = at::empty_like(index_contiguous); + // nneg_index = (index < 0) * (index + size) + (index >= 0) * index + auto* ptr_index = index_contiguous.data_ptr(); + auto* ptr_nneg_index = nneg_index.data_ptr(); + at::parallel_for(0, index_len, at::internal::GRAIN_SIZE, [&](int64_t start, int64_t end) { + const auto* src = ptr_index + start; + auto* dst = ptr_nneg_index + start; + for (C10_UNUSED const auto _ : c10::irange(start, end)) { + auto idx = *src++; + if (idx < -size || idx >= size) { + // Mark self and dim as used if code is compiled with STRIP_ERROR_MESSAGES + (void)dim; + (void)self; + TORCH_CHECK_INDEX(false, + "index_select(): index contains ", idx, " that is out of range for tensor of size ", + self.sizes(), " at dimension ", dim + ); + } + if (idx < 0) { + idx += size; + } + *dst++ = idx; + } + }); + + return nneg_index; + }(); + + const auto dim_indices = indices[dim].contiguous(); + + // If nnz is smaller than size, then either indices[dim] or index gets sorted, + // then this is followed by a binary search to find interesections. + const auto get_selected_indices_small_nnz_large_size = [&]() -> std::tuple { + const auto grain_size = at::internal::GRAIN_SIZE; + const auto n_threads_nnz = std::max( + 1, std::min((nnz + grain_size - 1) / grain_size, at::get_num_threads()) + ); + const auto n_threads_index = std::max( + 1, std::min((index_len + grain_size - 1) / grain_size, at::get_num_threads()) + ); + const auto search_in_dim_indices + // if either dim_indices or index requires sorting, we compare + // the cost of sort + binary search, which is comparing + // (len(dim_indices) + len(index)) * log(len(index)) to + // (len(dim_indices) + len(index)) * log(len(dim_indices)). + // That simplifies to comparing len(dim_indices) to len(index). + // Additionally, we take into consideration potential parallel + // speedup. 
+ = (nnz / n_threads_nnz <= index_len / n_threads_index) + // if self is coalesced and dim is 0, then we compare + // index_len * log(len(dim_indices)), which is binary search into dim_indices, + // to (len(index_len) + len(dim_indices)) * log(index_len). + // Additionally, we take into consideration potential parallel + // speedup. + || (self.is_coalesced() && dim == 0 + && (index_len * std::log2(nnz) / n_threads_index + <= (nnz / n_threads_nnz + index_len) * std::log2(index_len))) + ? true : false; + + // src is a source of indices to binary search in sorted + Tensor sorted, sorted_idx, src; + std::tie(sorted, sorted_idx, src) = [ + &dim_indices, &nneg_index, &self, + search_in_dim_indices, dim, nnz + ](void) -> std::tuple { + // sort dim_indices to binary search into it + if (search_in_dim_indices) { + // dim_indices is already sorted if self is coalesced and dim == 0 + if (self.is_coalesced() && dim == 0) { + return std::make_tuple(dim_indices, at::arange(nnz, dim_indices.options()), nneg_index); + } + else { + Tensor sorted_dim_indices, sorted_dim_indices_idx; + std::tie(sorted_dim_indices, sorted_dim_indices_idx) = dim_indices.sort(); + return std::make_tuple(sorted_dim_indices, sorted_dim_indices_idx, nneg_index); + } + } + // sort nneg_index to binary search into it + else { + Tensor sorted_nneg_index, sorted_nneg_index_idx; + std::tie(sorted_nneg_index, sorted_nneg_index_idx) = nneg_index.sort(); + return std::make_tuple(sorted_nneg_index, sorted_nneg_index_idx, dim_indices); + } + }(); + + const auto src_grain_size = at::internal::GRAIN_SIZE; + const auto src_len = src.numel(); + const auto n_threads_src = std::max( + // 1 <= n_threads_src <= std::min(ceil(src.numel() / src_grain_size), max_threads) + 1, std::min((src_len + src_grain_size - 1) / src_grain_size, at::get_num_threads()) + ); + const auto chunk_size_src = (src_len + n_threads_src - 1) / n_threads_src; + + const std::vector src_n_threads_shape = { + n_threads_src, (src_len + n_threads_src - 1) / n_threads_src + }; + + // src_int_idx and sorted_int_idx store "i" and "j" indices indicating + // intersections such that src_int_idx[i] == sorted_int_idx[j]. + // These intersections are found with binary search and in parallel. + auto src_int_idx = at::empty(src_n_threads_shape, src.options()); + auto sorted_int_idx = at::empty_like(src_int_idx); + // For each element "i" from src, int_counts define how many + // elements there are in sorted, i.e. "j" indices, corresponding + // to "i", i.e.: + // |{j : src_int_idx[i] == sorted_int_idx[j]}| for each i in src_int_idx. 
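  // A worked example of the binary search in the block below: with sorted == {1, 3, 3, 7}
  // and a src value of 3, lower_bound points at position 1 and upper_bound at position 3,
  // so count == 2 and j == 1; a src value of 5 finds *lower_bound == 7 != 5 and the slot
  // is skipped.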
+ auto int_counts = at::zeros_like(src_int_idx); + + // fill in src_int_idx, sorted_int_idx, int_counts + { + const auto sorted_len = sorted.numel(); + const auto* ptr_sorted = sorted.data_ptr(); + const auto* ptr_sorted_start = ptr_sorted; + const auto* ptr_sorted_end = ptr_sorted + sorted_len; + + at::parallel_for(0, n_threads_src, 1, [&](int64_t tid, C10_UNUSED int64_t _) { + const auto start = tid * chunk_size_src; + const auto end = std::min(start + chunk_size_src, src_len); + auto* ptr_tid_src_int_idx = src_int_idx.select(0, tid).data_ptr(); + auto* ptr_tid_sorted_int_idx = sorted_int_idx.select(0, tid).data_ptr(); + auto* ptr_tid_int_counts = int_counts.select(0, tid).data_ptr(); + const auto* ptr_src = src.data_ptr() + start; + + for (const auto i : c10::irange(start, end)) { + const auto src_val = *ptr_src++; + const auto src_val_lb = std::lower_bound(ptr_sorted_start, ptr_sorted_end, src_val); + // We cannot just use *src_val_lb != src_val because when + // src_val_lb == ptr_sorted_end, dereferencing past-the-end value + // is not well-defined. + if (src_val_lb == ptr_sorted_end || *src_val_lb != src_val) { + ++ptr_tid_src_int_idx; + ++ptr_tid_sorted_int_idx; + ++ptr_tid_int_counts; + continue; + } + const auto src_val_ub = std::upper_bound(ptr_sorted_start, ptr_sorted_end, src_val); + + const int64_t count = src_val_ub - src_val_lb; + const int64_t j = src_val_lb - ptr_sorted_start; + + *ptr_tid_src_int_idx++ = i; + *ptr_tid_sorted_int_idx++ = j; + *ptr_tid_int_counts++ = count; + } + }); } - if (idx < 0) { - idx += size; + + const auto compressed_int_counts = int_counts.sum(-1); + const auto res_len = compressed_int_counts.sum().item(); + + // Short-circuit if empty intersection + if (!res_len) { + auto empty_idx = at::empty({0}, src.options()); + return std::make_tuple(empty_idx, empty_idx); + } + + // Now that we know "i", "j" and the counts, we "unflatten" + // them into two arrays of intersection indices such that + // selected_src = repeat_interleave(src_int_idx, int_counts), + // and selected_sorted is obtained as follows: + // offsets = int_counts.cumsum(0).sub_(int_counts) + // for ii, (j, c) in enumerate(zip(sorted_int_idx, int_counts)): + // out_slice = slice(offsets[ii], offsets[ii] + c) + // src_slice = slice(j, j + c) + // selected_sorted[out_slice] = sorted_int_idx[src_slice] + auto selected_sorted = at::empty({res_len}, sorted.options()); + auto selected_src = at::empty({res_len}, src.options()); + + // fill in selected_sorted, selected_src + { + auto* ptr_selected_sorted = selected_sorted.data_ptr(); + auto* ptr_selected_src = selected_src.data_ptr(); + + const auto thread_offsets = compressed_int_counts.cumsum(0).sub_(compressed_int_counts); + const auto* ptr_sorted_idx = sorted_idx.data_ptr(); + at::parallel_for(0, n_threads_src, 1, [&](int64_t tid, C10_UNUSED int64_t _) { + const auto start = tid * chunk_size_src; + const auto end = std::min(start + chunk_size_src, src_len); + const auto tid_offset = thread_offsets.data_ptr()[tid]; + const auto* ptr_tid_src_int_idx = src_int_idx.select(0, tid).data_ptr(); + const auto* ptr_tid_sorted_int_idx = sorted_int_idx.select(0, tid).data_ptr(); + const auto* ptr_tid_int_counts = int_counts.select(0, tid).data_ptr(); + auto* ptr_tid_selected_sorted = ptr_selected_sorted + tid_offset; + auto* ptr_tid_selected_src = ptr_selected_src + tid_offset; + + for (C10_UNUSED const auto _ : c10::irange(start, end)) { + const auto count = *ptr_tid_int_counts++; + const auto i = *ptr_tid_src_int_idx++; + const auto j = 
*ptr_tid_sorted_int_idx++; + if (!count) continue; + + std::fill_n(ptr_tid_selected_src, count, i); + std::copy_n(ptr_sorted_idx + j, count, ptr_tid_selected_sorted); + + ptr_tid_selected_sorted += count; + ptr_tid_selected_src += count; + } + }); } - for (const auto j : c10::irange(nnz)) { - int64_t jdx = cpu_dim_indices_ptr[j]; - if (idx == jdx) { - new_nnz++; - iindices.push_back(i); - zindices.push_back(j); + + return search_in_dim_indices + ? std::make_tuple(selected_sorted, selected_src) + : std::make_tuple(selected_src, selected_sorted); + }; + + // Converts a 1d sorted idx to a compressed 1d compressed idx, + // aka crow in the CSR format. Useful to get a count table in + // a parallelized and no-sync manner. + // TODO: this function is equivalent to _convert_indices_from_coo_to_csr. + // The mentioned function is not public yet. + const auto sorted_idx_to_cidx = []( + const Tensor& idx, + int64_t len, + bool run_in_parallel = true) -> Tensor { + auto cidx = at::empty({len + 1}, idx.options()); + + const auto* ptr_idx = idx.data_ptr(); + auto* ptr_cidx = cidx.data_ptr(); + + const auto idx_len = idx.numel(); + + std::fill_n(ptr_cidx, ptr_idx[0] + 1, 0); + std::fill_n(ptr_cidx + ptr_idx[idx_len - 1] + 1, len - ptr_idx[idx_len - 1], idx_len); + + const auto grain_size = run_in_parallel ? at::internal::GRAIN_SIZE : idx_len; + at::parallel_for(0, idx_len, grain_size, [&](int64_t start, int64_t end) { + auto* ptr_curr_cidx = ptr_cidx + ptr_idx[start] + 1; + for (int64_t i = start; i < std::min(end, idx_len - 1); ++i) { + const auto diff = ptr_idx[i + 1] - ptr_idx[i]; + std::fill_n(ptr_curr_cidx, diff, i + 1); + ptr_curr_cidx += diff; + } + }); + + return cidx; + }; + + // If nnz is (much) larger than size, then both indices[dim] and index get sorted + // with a count sort (faster, and no huge nnz-sized chunk memory allocations). + // The element-wise product between the count tables gives us all the intersections. + const auto get_selected_indices_large_nnz_small_size = [&]() -> std::tuple { + const auto get_counts = [&sorted_idx_to_cidx]( + // Writes into counts (must be preallocated and zero) + // and allows to use external buffers. 
+ Tensor& counts, + const Tensor& t, + int64_t bins, + bool is_sorted = false, + bool run_in_parallel = true) -> void { + if (is_sorted) { + const auto cidx = sorted_idx_to_cidx(t, bins, run_in_parallel); + at::sub_out(counts, cidx.slice(0, 1, bins + 1), cidx.slice(0, 0, bins)); + } + else { + auto* ptr_counts = counts.data_ptr(); + const auto* ptr_vals = t.data_ptr(); + for (C10_UNUSED const auto _ : c10::irange(t.numel())) { + ++ptr_counts[*ptr_vals++]; + } } + }; + + const auto counts_per_thread = [&get_counts, size]( + const Tensor& idx, + bool is_sorted = false, + int64_t grain_size = at::internal::GRAIN_SIZE + ) -> Tensor { + const auto idx_len = idx.numel(); + // 1 <= n_threads <= min(ceil(len / grain_size), max_threads) + const auto n_threads = std::max( + 1, std::min((idx_len + grain_size - 1) / grain_size, at::get_num_threads()) + ); + const auto chunk_size = (idx_len + n_threads - 1) / n_threads; + const auto run_in_parallel = (n_threads == 1); + + auto counts_per_thread = at::zeros({n_threads, size}, idx.options()); + at::parallel_for(0, n_threads, 1, [&](int64_t tid, C10_UNUSED int64_t _) { + const auto start = tid * chunk_size; + const auto end = std::min(start + chunk_size, idx_len); + const auto tid_idx = idx.slice(0, start, end); + auto tid_counts = counts_per_thread.select(0, tid); + get_counts(tid_counts, tid_idx, /*bins=*/size, + /*is_sorted=*/is_sorted, /*run_in_parallel=*/run_in_parallel); + }); + + return counts_per_thread; + }; + + auto dim_indices_counts_per_thread = counts_per_thread( + dim_indices, + /*is_sorted=*/self.is_coalesced() && dim == 0 + /*grain_size = at::internal::GRAIN_SIZE*/ + ); + auto dim_indices_offset_counts_per_thread = dim_indices_counts_per_thread.cumsum(0); + + auto index_counts_per_thread = counts_per_thread( + nneg_index, + /*is_sorted=*/false + /*grain_size = at::internal::GRAIN_SIZE*/ + ); + auto index_offset_counts_per_thread = index_counts_per_thread.cumsum(0); + + const auto index_counts = index_offset_counts_per_thread.select(0, -1); + const auto dim_indices_counts = dim_indices_offset_counts_per_thread.select(0, -1); + const auto intersection_counts = index_counts.mul(dim_indices_counts); + const auto res_len = intersection_counts.sum().item(); + // Short-circuit if empty intersection + if (!res_len) { + auto empty_idx = at::empty({0}, index.options()); + return std::make_tuple(empty_idx, empty_idx); } - } - auto zIndices = at::from_blob(zindices.data(), {new_nnz}, at::kLong).to(indices.device()); - auto new_indices = indices.index_select(1, zIndices); - new_indices[dim] = at::from_blob(iindices.data(), {new_nnz}, at::kLong).to(indices.device()); - auto new_values = values.index_select(0, zIndices); - return _sparse_coo_tensor_with_dims_and_tensors( - sparse_dim, dense_dim, new_sizes, new_indices, new_values, self.options()); + const auto intersection_offsets = intersection_counts.cumsum(0); + + const auto search_in_dim_indices = [&]() -> bool { + const auto grain_size = at::internal::GRAIN_SIZE; + const auto n_threads_index = std::max( + 1, std::min((index_len + grain_size - 1) / grain_size, at::get_num_threads()) + ); + const auto n_threads_dim_indices = std::max( + 1, std::min((nnz + grain_size - 1) / grain_size, at::get_num_threads()) + ); + + const auto index_max_copy_work_per_thread = + index_counts_per_thread.mul(dim_indices_counts).sum(-1).max().item(); + const auto dim_indices_max_copy_work_per_thread + = dim_indices_counts_per_thread.mul(index_counts).sum(-1).max().item(); + + const auto index_max_work_per_thread = 
index_max_copy_work_per_thread * index_len / n_threads_index; + const auto dim_indices_max_work_per_thread = dim_indices_max_copy_work_per_thread * nnz / n_threads_dim_indices; + return index_max_work_per_thread <= dim_indices_max_work_per_thread + ? true + : false; + }(); + + Tensor idx, idx_counts_per_thread, idx_offset_counts_per_thread; + Tensor src, src_counts_per_thread, src_offset_counts_per_thread; + std::tie( + idx, idx_counts_per_thread, idx_offset_counts_per_thread, + src, src_counts_per_thread, src_offset_counts_per_thread + ) = [&]() { + return search_in_dim_indices + ? std::make_tuple( + nneg_index, index_counts_per_thread, index_offset_counts_per_thread, + dim_indices, dim_indices_counts_per_thread, dim_indices_offset_counts_per_thread + ) + : std::make_tuple( + dim_indices, dim_indices_counts_per_thread, dim_indices_counts_per_thread.cumsum(0), + nneg_index, index_counts_per_thread, index_counts_per_thread.cumsum(0) + ); + }(); + + const auto idx_counts = idx_offset_counts_per_thread.select(0, -1); + const auto src_counts = src_offset_counts_per_thread.select(0, -1); + + Tensor src_idx, src_idx_offsets; + std::tie(src_idx, src_idx_offsets) = [&]( + int64_t grain_size = at::internal::GRAIN_SIZE + ) -> std::tuple { + const auto src_intersection_counts = src_counts.mul(idx_counts > 0); + const auto src_intersection_offsets = src_intersection_counts.cumsum(0); + const auto src_idx_len = src_intersection_offsets.data_ptr()[size - 1]; + auto src_idx = at::empty({src_idx_len}, src.options()); + + const auto* ptr_src = src.data_ptr(); + const auto* ptr_intersection_counts = intersection_counts.data_ptr(); + const auto* ptr_src_intersection_counts = src_intersection_counts.data_ptr(); + const auto* ptr_src_intersection_offsets = src_intersection_offsets.data_ptr(); + auto* ptr_src_idx = src_idx.data_ptr(); + + const auto src_len = src.numel(); + const auto n_threads_src = std::max( + 1, std::min((src_len + grain_size - 1) / grain_size, at::get_num_threads()) + ); + const auto chunk_size = (src_len + n_threads_src - 1) / n_threads_src; + at::parallel_for(0, n_threads_src, 1, [&](int64_t tid, C10_UNUSED int64_t _) { + const auto start = tid * chunk_size; + const auto end = std::min(start + chunk_size, src_len); + auto* ptr_src_tid = ptr_src + start; + const auto* ptr_src_counts_per_thread + = src_counts_per_thread.select(0, tid).data_ptr(); + const auto* ptr_src_offset_counts_per_thread + = src_offset_counts_per_thread.select(0, tid).data_ptr(); + auto tid_counts = at::zeros({size}, src.options()); + auto* ptr_tid_counts = tid_counts.data_ptr(); + + for (const auto i : c10::irange(start, end)) { + const auto idx_val = *ptr_src_tid++; + // skip idx value if not in the intersection + if (!ptr_intersection_counts[idx_val]) continue; + const auto idx_val_offset + = ptr_src_intersection_offsets[idx_val] + - ptr_src_intersection_counts[idx_val]; + const auto idx_val_tid_offset + = ptr_src_offset_counts_per_thread[idx_val] + - ptr_src_counts_per_thread[idx_val]; + auto& idx_val_local_tid_count = ptr_tid_counts[idx_val]; + ptr_src_idx[idx_val_offset + idx_val_tid_offset + idx_val_local_tid_count] = i; + ++idx_val_local_tid_count; + } + }); + + const auto src_idx_offsets = src_intersection_offsets.sub_(src_intersection_counts); + + return std::make_tuple(src_idx, src_idx_offsets); + }(); + + Tensor idx_selected, src_selected; + std::tie(idx_selected, src_selected) = [&]( + int64_t grain_size = at::internal::GRAIN_SIZE + ) -> std::tuple { + const auto thread_offset = [&]() { + // we do not need 
idx_counts_per_thread anymore, + // so it is safe to do in-place intersection. + auto counts_per_thread = idx_counts_per_thread.mul_(src_counts).sum(-1); + return counts_per_thread.cumsum(0).sub_(counts_per_thread); + }(); + const auto* ptr_thread_offset = thread_offset.data_ptr(); + + auto idx_selected = at::empty({res_len}, idx.options()); + auto src_selected = at::empty({res_len}, src.options()); + + const auto* ptr_idx = idx.data_ptr(); + const auto* ptr_src_counts = src_counts.data_ptr(); + const auto* ptr_intersection_counts = intersection_counts.data_ptr(); + const auto* ptr_src_idx = src_idx.data_ptr(); + const auto* ptr_src_idx_offsets = src_idx_offsets.data_ptr(); + auto* ptr_idx_selected = idx_selected.data_ptr(); + auto* ptr_src_selected = src_selected.data_ptr(); + + const auto idx_len = idx.numel(); + const auto n_threads_idx = std::max( + 1, std::min((idx_len + grain_size - 1) / grain_size, at::get_num_threads()) + ); + const auto chunk_size = (idx_len + n_threads_idx - 1) / n_threads_idx; + at::parallel_for(0, n_threads_idx, 1, [&](int64_t tid, C10_UNUSED int64_t _) { + const auto start = tid * chunk_size; + const auto end = std::min(start + chunk_size, idx_len); + const auto tid_offset = ptr_thread_offset[tid]; + const auto* ptr_idx_tid = ptr_idx + start; + auto* ptr_idx_selected_tid = ptr_idx_selected + tid_offset; + auto* ptr_src_selected_tid = ptr_src_selected + tid_offset; + + for (const auto i : c10::irange(start, end)) { + const auto idx_val = *ptr_idx_tid++; + // skip if idx_val is not in the intersection + if (!ptr_intersection_counts[idx_val]) continue; + const auto count = ptr_src_counts[idx_val]; + const auto j = ptr_src_idx_offsets[idx_val]; + std::fill_n(ptr_idx_selected_tid, count, i); + std::copy_n(ptr_src_idx + j, count, ptr_src_selected_tid); + ptr_idx_selected_tid += count; + ptr_src_selected_tid += count; + } + }); + + return std::make_tuple(idx_selected, src_selected); + }(); + + return search_in_dim_indices + ? std::make_tuple(src_selected, idx_selected) + : std::make_tuple(idx_selected, src_selected); + }; + + const auto make_output = [&]( + const Tensor& selected_dim_indices, + const Tensor& res_dim_indices) -> Tensor { + auto res_indices = index_select(indices, 1, selected_dim_indices); + res_indices[dim] = res_dim_indices; + const auto res_values = index_select(values, 0, selected_dim_indices); - } else { + return _sparse_coo_tensor_with_dims_and_tensors( + sparse_dim, dense_dim, res_sizes, res_indices, res_values, self.options()); + }; + + // Brute-force solution for small values of nnz and index_len + const auto get_result_small_nnz_small_index = [&]() + -> Tensor { + const auto dim_indices_in_inner_loop = nnz >= index_len; + Tensor outer, inner; + std::tie(outer, inner) = [&]() -> std::tuple { + if (dim_indices_in_inner_loop) { + return std::make_tuple(nneg_index, dim_indices); + } + else { + return std::make_tuple(dim_indices, nneg_index); + } + }(); + + const auto* ptr_outer = outer.data_ptr(); + const auto* ptr_inner = inner.data_ptr(); + // NOTE: if very critical, replace std::vector with + // a data structure that operates on stack up to some limit. 
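  // A small trace of the nested scan below (values chosen only for illustration): with
  // outer == {5, 7, 5} and inner == {7, 5}, the matches are (i=0, j=1), (i=1, j=0) and
  // (i=2, j=1), so res_len == 3, outer_selected_idx == {0, 1, 2} and
  // inner_selected_idx == {1, 0, 1}.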
+ auto outer_selected_idx = std::vector(); + auto inner_selected_idx = std::vector(); + int64_t res_len = 0; + for (const auto i : c10::irange(outer.numel())) { + for (const auto j : c10::irange(inner.numel())) { + if (ptr_outer[i] == ptr_inner[j]) { + ++res_len; + outer_selected_idx.push_back(i); + inner_selected_idx.push_back(j); + } + } + } - auto vsize = values.sizes().vec(); - vsize[dim + 1 - sparse_dim] = index.size(0); - auto new_values = at::empty(vsize, values.options()); - for (const auto k : c10::irange(nnz)) { - new_values[k] = values[k].index_select(dim - sparse_dim, index); + const auto outer_selected_idx_tensor = at::from_blob( + outer_selected_idx.data(), {res_len}, at::kLong + ); + const auto inner_selected_idx_tensor = at::from_blob( + inner_selected_idx.data(), {res_len}, at::kLong + ); + + return dim_indices_in_inner_loop + ? make_output(inner_selected_idx_tensor, outer_selected_idx_tensor) + : make_output(outer_selected_idx_tensor, inner_selected_idx_tensor); + }; + + constexpr int64_t BRUTE_FORCE_SIZE_LIMIT = 2 << 14; // 16384 + // NOTE: such a condition to avoid overflows in (nnz * index_len) + if (nnz <= BRUTE_FORCE_SIZE_LIMIT && index_len <= BRUTE_FORCE_SIZE_LIMIT + && (nnz * index_len) <= BRUTE_FORCE_SIZE_LIMIT) { + return get_result_small_nnz_small_index(); } - return _sparse_coo_tensor_with_dims_and_tensors( - sparse_dim, dense_dim, new_sizes, indices, new_values, self.options()); + else { + Tensor selected_dim_indices; + Tensor res_dim_indices; + + // A more precise decision could be of the form: + // `nnz < C(nnz, size) * size`, but it requires heavy benchmarking. + // We choose `nnz < size`, which measures theoretical complexity + // and does not rely on runtime performance. + // TODO: perform this analysis and find better C(nnz, size). + if (nnz <= size) { + std::tie(selected_dim_indices, res_dim_indices) = get_selected_indices_small_nnz_large_size(); + } + else { + std::tie(selected_dim_indices, res_dim_indices) = get_selected_indices_large_nnz_small_size(); + } + return make_output(selected_dim_indices, res_dim_indices); + } + } + // If indexing into dense dimensions + else { + // It is sufficient to just perform `index_select` on values + // if `dim` refers to dense dimensions. + const auto res_values = index_select(values, dim - sparse_dim + 1, index); + + return _sparse_coo_tensor_with_dims_and_tensors( + sparse_dim, dense_dim, res_sizes, indices, res_values, self.options()); } } +Tensor index_select_sparse_cuda(const Tensor& self, int64_t dim, const Tensor& index) { + auto res = index_select_sparse_cpu(self.to(at::kCPU), dim, index.to(at::kCPU)); + return res.to(self.device()); +} + Tensor slice( const Tensor& self, int64_t dim, @@ -1453,21 +2053,9 @@ Tensor slice_backward(const Tensor& grad, IntArrayRef input_sizes, int64_t dim, } std::vector split(const Tensor& self, int64_t split_size, int64_t dim) { - TORCH_CHECK(self.dim() != 0, "split expects at least a 1-dimensional tensor"); - TORCH_CHECK(split_size >= 0, "split expects split_size be non-negative, but got split_size=", split_size); - int64_t dim_size = self.size(dim); - TORCH_CHECK(split_size > 0 || dim_size == 0, - "split_size can only be 0 if dimension size is 0, " - "but got dimension size of ", dim_size); - // if split_size is 0 and dimension size is 0, there is 1 split. - int64_t num_splits = 1; - if (split_size != 0) { - // ensuring num_splits is at least 1 makes consistent the case where split_size > dim_size - // (returns a single split). We might want to error here, but keep it for BC. 
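  // A worked example of the arithmetic this refactor preserves: for dim_size == 10 and
  // split_size == 3, num_splits == max(ceil(10 / 3), 1) == 4 and
  // last_split_size == 3 - (3 * 4 - 10) == 1, i.e. chunks of sizes {3, 3, 3, 1}.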
- num_splits = std::max((dim_size + split_size - 1) / split_size, 1); - } + const auto num_splits = get_num_splits(self, split_size, dim); std::vector splits(num_splits); - int64_t last_split_size = split_size - (split_size * num_splits - dim_size); + int64_t last_split_size = split_size - (split_size * num_splits - self.size(dim)); for (const auto i : c10::irange(num_splits)) { auto length = i < num_splits - 1 ? split_size : last_split_size; @@ -1476,6 +2064,10 @@ std::vector split(const Tensor& self, int64_t split_size, int64_t dim) { return splits; } +std::vector split(const Tensor& self, IntArrayRef sizes, int64_t dim) { + return at::split_with_sizes(self, sizes, dim); +} + std::vector unsafe_split(const Tensor& self, int64_t split_size, int64_t dim) { auto result = at::native::split(self, split_size, dim); for (auto& t : result) { @@ -2111,7 +2703,7 @@ Tensor unsqueeze_sparse(Tensor const &self, int64_t dim) { auto sizes = self.sizes().vec(); sizes.insert(sizes.begin() + dim, 1); if (dim <= sparse_dim) { - auto new_indices = native::cat( + auto new_indices = at::cat( {indices.narrow(0, 0, dim), native::zeros( {1, indices.size(1)}, @@ -2218,7 +2810,7 @@ Tensor flatten(const Tensor& self, DimnameList dims, Dimname out_dim) { } Tensor ravel(const Tensor& self) { - return self.reshape(-1); + return self.contiguous().view(-1); } static inline void handle_unflatten_exception(const std::runtime_error &e, @@ -2545,7 +3137,7 @@ Tensor diag(const Tensor& self, int64_t dimension) { } Tensor& diag_cpu_out(const Tensor& self, int64_t dimension, Tensor &result) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(at::ScalarType::Bool, self.scalar_type(), "diag", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kBool, self.scalar_type(), "diag", [&] { apply_diag(result, self, dimension); }); return result; @@ -2736,5 +3328,265 @@ at::Tensor diagonal_scatter(const at::Tensor& self, const at::Tensor& src, int64 return output; } +// The default implementation of lift is a no-op. +// If TLS is set appropriately (for wrapper-tensor keys like Functionalize or functorch transforms), +// then we'll dispatch to one of their implementations, which will properly lift the tensor into a wrapper. 
+at::Tensor lift(const at::Tensor& self) { + return self; +} + +at::Tensor& _fw_primal_copy_out(const at::Tensor & self, int64_t level, at::Tensor & out) { + auto tmp = self._fw_primal(level); + out.copy_(tmp); + return out; +} + + +at::Tensor& _make_dual_copy_out(const at::Tensor & primal, const at::Tensor & tangent, int64_t level, at::Tensor & out) { + auto tmp = at::_make_dual(primal, tangent, level); + out.copy_(tmp); + return out; +} + + +at::Tensor& view_as_real_copy_out(const at::Tensor & self, at::Tensor & out) { + auto tmp = at::view_as_real(self); + out.copy_(tmp); + return out; +} + + +at::Tensor& view_as_complex_copy_out(const at::Tensor & self, at::Tensor & out) { + auto tmp = at::view_as_complex(self); + out.copy_(tmp); + return out; +} + + +at::Tensor& _conj_copy_out(const at::Tensor & self, at::Tensor & out) { + auto tmp = self._conj(); + out.copy_(tmp); + return out; +} + + +at::Tensor& _neg_view_copy_out(const at::Tensor & self, at::Tensor & out) { + auto tmp = self._neg_view(); + out.copy_(tmp); + return out; +} + + +at::Tensor& as_strided_copy_out(const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride, c10::optional storage_offset, at::Tensor & out) { + auto tmp = self.as_strided(size, stride, storage_offset); + out.copy_(tmp); + return out; +} + + +at::Tensor& _sparse_broadcast_to_copy_out(const at::Tensor & self, at::IntArrayRef size, at::Tensor & out) { + auto tmp = at::_sparse_broadcast_to(self, size); + out.copy_(tmp); + return out; +} + + +at::Tensor& diagonal_copy_out(const at::Tensor & self, int64_t offset, int64_t dim1, int64_t dim2, at::Tensor & out) { + auto tmp = self.diagonal(offset, dim1, dim2); + out.copy_(tmp); + return out; +} + + +at::Tensor& expand_copy_SymInt_out(const at::Tensor & self, c10::SymIntArrayRef size, bool implicit, at::Tensor & out) { + auto tmp = self.expand(size, implicit); + out.copy_(tmp); + return out; +} + + +at::Tensor& expand_copy_out(const at::Tensor & self, at::IntArrayRef size, bool implicit, at::Tensor & out) { + auto tmp = self.expand(size, implicit); + out.copy_(tmp); + return out; +} + + +at::Tensor& narrow_copy_out(const at::Tensor & self, int64_t dim, int64_t start, int64_t length, at::Tensor & out) { + auto tmp = self.narrow(dim, start, length); + out.copy_(tmp); + return out; +} + + +at::Tensor& permute_copy_out(const at::Tensor & self, at::IntArrayRef dims, at::Tensor & out) { + auto tmp = self.permute(dims); + out.copy_(tmp); + return out; +} + + +at::Tensor& _reshape_alias_copy_out(const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride, at::Tensor & out) { + auto tmp = self._reshape_alias(size, stride); + out.copy_(tmp); + return out; +} + + +at::Tensor& select_copy_int_out(const at::Tensor & self, int64_t dim, int64_t index, at::Tensor & out) { + auto tmp = self.select(dim, index); + out.copy_(tmp); + return out; +} + + +at::Tensor& detach_copy_out(const at::Tensor & self, at::Tensor & out) { + auto tmp = self.detach(); + out.copy_(tmp); + return out; +} + + +at::Tensor& slice_copy_Tensor_out(const at::Tensor & self, int64_t dim, c10::optional start, c10::optional end, int64_t step, at::Tensor & out) { + auto tmp = self.slice(dim, start, end, step); + out.copy_(tmp); + return out; +} + + +void split_copy_Tensor_out(const at::Tensor & self, int64_t split_size, int64_t dim, at::TensorList out) { + auto tmp = self.split(split_size, dim); + + TORCH_CHECK(out.size() == tmp.size(), "split_copy_Tensor_out() expected an out= argument of size ", tmp.size(), ", got size ", out.size()); + for 
(const auto i : c10::irange(out.size())) { + out[i].copy_(tmp[i]); + } +} + + +void split_with_sizes_copy_out(const at::Tensor & self, at::IntArrayRef split_sizes, int64_t dim, at::TensorList out) { + auto tmp = self.split_with_sizes(split_sizes, dim); + + TORCH_CHECK(out.size() == tmp.size(), "split_with_sizes_copy_out() expected an out= argument of size ", tmp.size(), ", got size ", out.size()); + for (const auto i : c10::irange(out.size())) { + out[i].copy_(tmp[i]); + } +} + + +at::Tensor& squeeze_copy_out(const at::Tensor & self, at::Tensor & out) { + auto tmp = self.squeeze(); + out.copy_(tmp); + return out; +} + + +at::Tensor& squeeze_copy_dim_out(const at::Tensor & self, int64_t dim, at::Tensor & out) { + auto tmp = self.squeeze(dim); + out.copy_(tmp); + return out; +} + + +at::Tensor& t_copy_out(const at::Tensor & self, at::Tensor & out) { + auto tmp = self.t(); + out.copy_(tmp); + return out; +} + + +at::Tensor& transpose_copy_int_out(const at::Tensor & self, int64_t dim0, int64_t dim1, at::Tensor & out) { + auto tmp = self.transpose(dim0, dim1); + out.copy_(tmp); + return out; +} + + +at::Tensor& unsqueeze_copy_out(const at::Tensor & self, int64_t dim, at::Tensor & out) { + auto tmp = self.unsqueeze(dim); + out.copy_(tmp); + return out; +} + + +at::Tensor& _indices_copy_out(const at::Tensor & self, at::Tensor & out) { + auto tmp = self._indices(); + out.copy_(tmp); + return out; +} + + +at::Tensor& _values_copy_out(const at::Tensor & self, at::Tensor & out) { + auto tmp = self._values(); + out.copy_(tmp); + return out; +} + + +at::Tensor& indices_copy_out(const at::Tensor & self, at::Tensor & out) { + auto tmp = self.indices(); + out.copy_(tmp); + return out; +} + + +at::Tensor& values_copy_out(const at::Tensor & self, at::Tensor & out) { + auto tmp = self.values(); + out.copy_(tmp); + return out; +} + + +at::Tensor& crow_indices_copy_out(const at::Tensor & self, at::Tensor & out) { + auto tmp = self.crow_indices(); + out.copy_(tmp); + return out; +} + + +at::Tensor& col_indices_copy_out(const at::Tensor & self, at::Tensor & out) { + auto tmp = self.col_indices(); + out.copy_(tmp); + return out; +} + + +void unbind_copy_int_out(const at::Tensor & self, int64_t dim, at::TensorList out) { + auto tmp = self.unbind(dim); + + TORCH_CHECK(out.size() == tmp.size(), "unbind_copy_int_out() expected an out= argument of size ", tmp.size(), ", got size ", out.size()); + for (const auto i : c10::irange(out.size())) { + out[i].copy_(tmp[i]); + } +} + + +at::Tensor& view_copy_out(const at::Tensor & self, at::IntArrayRef size, at::Tensor & out) { + auto tmp = self.view(size); + out.copy_(tmp); + return out; +} + + +at::Tensor& view_copy_dtype_out(const at::Tensor & self, at::ScalarType dtype, at::Tensor & out) { + auto tmp = self.view(dtype); + out.copy_(tmp); + return out; +} + + +at::Tensor& unfold_copy_out(const at::Tensor & self, int64_t dimension, int64_t size, int64_t step, at::Tensor & out) { + auto tmp = self.unfold(dimension, size, step); + out.copy_(tmp); + return out; +} + + +at::Tensor& alias_copy_out(const at::Tensor & self, at::Tensor & out) { + auto tmp = self.alias(); + out.copy_(tmp); + return out; +} + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/TensorShape.h b/aten/src/ATen/native/TensorShape.h index 19245faff368..bb296b5ae5bc 100644 --- a/aten/src/ATen/native/TensorShape.h +++ b/aten/src/ATen/native/TensorShape.h @@ -1,8 +1,12 @@ -#include +#pragma once +#include #include namespace at { namespace native { +inline bool cat_should_skip_tensor(const 
Tensor& t) { + return t.numel() == 0 && t.dim() == 1; +} // Check to see if the shape of tensors is compatible // for being concatenated along a given dimension. @@ -30,4 +34,28 @@ inline void check_cat_no_zero_dim(at::ArrayRef tensors) { } } +inline int64_t get_num_splits(const Tensor& self, int64_t split_size, int64_t dim) { + TORCH_CHECK(self.dim() != 0, "split expects at least a 1-dimensional tensor"); + TORCH_CHECK(split_size >= 0, "split expects split_size be non-negative, but got split_size=", split_size); + int64_t dim_size = self.size(dim); + TORCH_CHECK(split_size > 0 || dim_size == 0, + "split_size can only be 0 if dimension size is 0, " + "but got dimension size of ", dim_size); + // if split_size is 0 and dimension size is 0, there is 1 split. + int64_t num_splits = 1; + if (split_size != 0) { + // ensuring num_splits is at least 1 makes consistent the case where split_size > dim_size + // (returns a single split). We might want to error here, but keep it for BC. + num_splits = std::max((dim_size + split_size - 1) / split_size, 1); + } + return num_splits; +} + +/// +/// For more information, see +/// https://pytorch.org/docs/master/generated/torch.Tensor.unfold.html#torch.Tensor.unfold +/// + +Tensor unfold(const Tensor& self, int64_t dimension, int64_t size, int64_t step); + }} // namespace at::native diff --git a/aten/src/ATen/native/TensorTransformations.cpp b/aten/src/ATen/native/TensorTransformations.cpp index 5e5f9c91179e..f0e2c0f02caa 100644 --- a/aten/src/ATen/native/TensorTransformations.cpp +++ b/aten/src/ATen/native/TensorTransformations.cpp @@ -1,6 +1,7 @@ #include #include // for flip_stub +#include #include #include #include @@ -211,6 +212,10 @@ std::vector atleast_3d(TensorList tensors) { return result; } +Tensor chalf(const Tensor& self, c10::optional memory_format) { + return self.to(kComplexHalf, false, false, memory_format); +} + DEFINE_DISPATCH(flip_stub); }} // namespace at::native diff --git a/aten/src/ATen/native/TensorTransformations.h b/aten/src/ATen/native/TensorTransformations.h index 03ee31e696aa..4909ebe84bb0 100644 --- a/aten/src/ATen/native/TensorTransformations.h +++ b/aten/src/ATen/native/TensorTransformations.h @@ -1,4 +1,10 @@ -#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif #include diff --git a/aten/src/ATen/native/TestOps.cpp b/aten/src/ATen/native/TestOps.cpp index 065850261920..9a3a5b10cb26 100644 --- a/aten/src/ATen/native/TestOps.cpp +++ b/aten/src/ATen/native/TestOps.cpp @@ -13,7 +13,7 @@ namespace native { /// Else, return a new tensor containing the elementwise sums. 
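[Editor's note, not part of the diff] The split-count rule in get_num_splits() above is a ceiling division clamped to at least one split, so split_size > dim_size still yields a single short chunk instead of an error. A minimal standalone sketch (function name is illustrative only):

#include <algorithm>
#include <cstdint>
#include <iostream>

// Sketch of the ceil-div + clamp rule used by get_num_splits(); split_size == 0
// is only legal when the dimension itself has size 0, giving exactly one empty split.
int64_t num_splits_for(int64_t dim_size, int64_t split_size) {
  if (split_size == 0) {
    return 1;
  }
  return std::max((dim_size + split_size - 1) / split_size, int64_t{1});
}

int main() {
  std::cout << num_splits_for(10, 3) << '\n';  // 4 -> chunks of 3, 3, 3, 1
  std::cout << num_splits_for(10, 12) << '\n'; // 1 -> a single chunk of size 10 (kept for BC)
  std::cout << num_splits_for(0, 0) << '\n';   // 1 -> one empty split
}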
Tensor _test_optional_intlist( const Tensor& values, - c10::optional addends) { + at::OptionalIntArrayRef addends) { if (!addends) { return values; } diff --git a/aten/src/ATen/native/TopKImpl.h b/aten/src/ATen/native/TopKImpl.h new file mode 100644 index 000000000000..69d5c70236b8 --- /dev/null +++ b/aten/src/ATen/native/TopKImpl.h @@ -0,0 +1,95 @@ +#pragma once +#include +#include + +namespace at { +namespace native { + +#ifdef CPU_CAPABILITY +inline namespace CPU_CAPABILITY { +#else +inline namespace DEFAULT { +#endif + +// Core topk loop, shared between CPU and QuantizedCPU +template +void topk_impl_loop( + const int64_t mode_values_stride, + const int64_t mode_indices_stride, + const int64_t tmp_values_stride, + const int64_t k, + const int64_t dim_size, + const bool largest, + const bool sorted, + char** data, const int64_t* strides, const int64_t n) { + + using elem_t = std::pair; + std::vector queue(dim_size); + for (const auto i : c10::irange(n)) { + TensorAccessor mode_values( + reinterpret_cast(data[0] + i * strides[0]), + &k, &mode_values_stride); + TensorAccessor mode_indices( + reinterpret_cast(data[1] + i * strides[1]), + &k, &mode_indices_stride); + TensorAccessor tmp_values( + reinterpret_cast(data[2] + i * strides[2]), + &dim_size, &tmp_values_stride); + + auto n = dim_size; + auto use_partial_sort = k * 64 <= n; + + for (const auto j : c10::irange(n)) { + queue[j].first = tmp_values[j]; + queue[j].second = j; + } + + // we want nan to be sorted as top for numpy compatibility + if (use_partial_sort) { + if (largest) { + std::partial_sort(queue.begin(), queue.begin() + k, queue.end(), + [](const elem_t& x, const elem_t& y) -> bool { + return ((_isnan(x.first) && !_isnan(y.first)) || (x.first > y.first)); + }); + } else { + std::partial_sort(queue.begin(), queue.begin() + k, queue.end(), + [](const elem_t& x, const elem_t& y) -> bool { + return ((!_isnan(x.first) && _isnan(y.first)) || (x.first < y.first)); + }); + } + } else { + if (largest) { + std::nth_element(queue.begin(), queue.begin() + k - 1, queue.end(), + [](const elem_t& x, const elem_t& y) -> bool { + return ((_isnan(x.first) && !_isnan(y.first)) || (x.first > y.first)); + }); + if (sorted) { + std::sort(queue.begin(), queue.begin() + k - 1, + [](const elem_t& x, const elem_t& y) -> bool { + return ((_isnan(x.first) && !_isnan(y.first)) || (x.first > y.first)); + }); + } + } else { + std::nth_element(queue.begin(), queue.begin() + k -1, queue.end(), + [](const elem_t& x, const elem_t& y) -> bool { + return ((!_isnan(x.first) && _isnan(y.first)) || (x.first < y.first)); + }); + if (sorted) { + std::sort(queue.begin(), queue.begin() + k -1, + [](const elem_t& x, const elem_t& y) -> bool { + return ((!_isnan(x.first) && _isnan(y.first)) || (x.first < y.first)); + }); + } + } + } + + for (const auto j : c10::irange(k)) { + mode_values[j] = queue[j].first; + mode_indices[j] = queue[j].second; + } + } +} + +} // namespace CPU_CAPABILITY +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/TransposeType.h b/aten/src/ATen/native/TransposeType.h new file mode 100644 index 000000000000..5353394a9dde --- /dev/null +++ b/aten/src/ATen/native/TransposeType.h @@ -0,0 +1,24 @@ +#pragma once +#include + +namespace at { +namespace native { + +// Used as an interface between the different BLAS-like libraries +enum class TransposeType { + NoTranspose, + Transpose, + ConjTranspose, +}; + +// Transforms TransposeType into the BLAS / LAPACK format +static char to_blas(TransposeType trans) { + switch (trans) { + 
case TransposeType::Transpose: return 'T'; + case TransposeType::NoTranspose: return 'N'; + case TransposeType::ConjTranspose: return 'C'; + } + TORCH_INTERNAL_ASSERT(false, "Invalid transpose type"); +} + +}} // at::native diff --git a/aten/src/ATen/native/TriangularOps.cpp b/aten/src/ATen/native/TriangularOps.cpp index b6a8a690bd28..b00a4a176918 100644 --- a/aten/src/ATen/native/TriangularOps.cpp +++ b/aten/src/ATen/native/TriangularOps.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include namespace at { @@ -174,7 +175,13 @@ Tensor trace_backward(const Tensor& grad, IntArrayRef sizes) { auto grad_input = at::zeros(sizes[0] * sizes[1], grad.options()); auto indices = at::arange(0, grad_input.numel(), sizes[1] + 1, grad.options().dtype(at::kLong)); - grad_input.index_fill_(0, indices, grad); + // for composite compliance, use out-of-place variant of + // `index_fill` if grad tensor is a Tensor Subclass. + if (isTensorSubclassLike(grad)) { + grad_input = grad_input.index_fill(0, indices, grad); + } else { + grad_input.index_fill_(0, indices, grad); + } return grad_input.view(sizes); } diff --git a/aten/src/ATen/native/TriangularOpsUtils.h b/aten/src/ATen/native/TriangularOpsUtils.h index 13c2d33c6c18..c5bce42ed3fd 100644 --- a/aten/src/ATen/native/TriangularOpsUtils.h +++ b/aten/src/ATen/native/TriangularOpsUtils.h @@ -26,7 +26,7 @@ static inline std::tuple checkTrilTriuBatchContiguous(const Tensor // Complete contiguity is the most desired property, which is why // we return true if the tensor is contiguous if (tensor.is_contiguous()) { - auto default_strides_for_size = contiguous_strides_vec(tensor.sizes()); + auto default_strides_for_size = batched_matrix_contiguous_strides(tensor.sizes()); if (tensor.strides() == default_strides_for_size) { return std::make_tuple(true, tensor); } else { diff --git a/aten/src/ATen/native/TypeProperties.cpp b/aten/src/ATen/native/TypeProperties.cpp index a49e2a582658..9577e7c9dc58 100644 --- a/aten/src/ATen/native/TypeProperties.cpp +++ b/aten/src/ATen/native/TypeProperties.cpp @@ -133,7 +133,7 @@ ScalarType result_type(const ResultTypeState& in_state) { return combine_categories(in_state.dimResult, combine_categories(in_state.zeroResult, in_state.wrappedResult)); } -ScalarType result_type(TensorList tensors) { +ScalarType result_type(ITensorListRef tensors) { ResultTypeState state = {}; for (const Tensor& tensor : tensors) { state = update_result_type_state(tensor, state); diff --git a/aten/src/ATen/native/TypeProperties.h b/aten/src/ATen/native/TypeProperties.h index 85ffed1ee07f..b0f18c594882 100644 --- a/aten/src/ATen/native/TypeProperties.h +++ b/aten/src/ATen/native/TypeProperties.h @@ -1,6 +1,7 @@ #pragma once -#include +#include +#include namespace at { namespace native { @@ -11,8 +12,9 @@ struct ResultTypeState { }; TORCH_API ResultTypeState update_result_type_state(const Tensor& tensor, const ResultTypeState& in_state); +TORCH_API ResultTypeState update_result_type_state(const Scalar& scalar, const ResultTypeState& in_state); TORCH_API ScalarType result_type(const ResultTypeState& state); -TORCH_API ScalarType result_type(TensorList tensors); +TORCH_API ScalarType result_type(ITensorListRef tensors); }} diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index e8cfeba2df02..085fcbcca975 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -67,6 +67,7 @@ CREATE_UNARY_FLOAT_META_FUNC(special_i0e) CREATE_UNARY_FLOAT_META_FUNC(special_i1) 
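[Editor's note, not part of the diff] As a reference for the selection strategy in topk_impl_loop (TopKImpl.h above): NaN compares as largest for NumPy compatibility, and the k * 64 <= n heuristic switches between partial_sort for small k and nth_element plus an optional prefix sort otherwise. A sketch on a plain std::vector (largest=true, sorted=true; the name topk_largest is illustrative):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <utility>
#include <vector>

// Editor's sketch of the top-k selection used by topk_impl_loop; assumes 1 <= k <= v.size().
std::vector<std::pair<float, int64_t>> topk_largest(const std::vector<float>& v, int64_t k) {
  using elem_t = std::pair<float, int64_t>;
  std::vector<elem_t> queue(v.size());
  for (int64_t j = 0; j < static_cast<int64_t>(v.size()); ++j) {
    queue[j] = {v[j], j};
  }
  // NaN sorts to the top, matching the comparators in the kernel.
  auto gt = [](const elem_t& x, const elem_t& y) {
    return (std::isnan(x.first) && !std::isnan(y.first)) || (x.first > y.first);
  };
  const int64_t n = static_cast<int64_t>(queue.size());
  if (k * 64 <= n) {
    // k is small relative to n: a partial sort already leaves the first k elements ordered.
    std::partial_sort(queue.begin(), queue.begin() + k, queue.end(), gt);
  } else {
    // k is large: nth_element is cheaper; sort only the prefix when ordered output is needed.
    std::nth_element(queue.begin(), queue.begin() + k - 1, queue.end(), gt);
    std::sort(queue.begin(), queue.begin() + k - 1, gt);
  }
  queue.resize(k);
  return queue;
}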
CREATE_UNARY_FLOAT_META_FUNC(special_i1e) CREATE_UNARY_FLOAT_META_FUNC(special_ndtri) +CREATE_UNARY_FLOAT_META_FUNC(special_log_ndtr) CREATE_UNARY_FLOAT_META_FUNC(sqrt) CREATE_UNARY_FLOAT_META_FUNC(tan) CREATE_UNARY_FLOAT_META_FUNC(tanh) @@ -184,6 +185,7 @@ CREATE_UNARY_TORCH_IMPL_FUNC(special_i0e_out, special_i0e_stub) CREATE_UNARY_TORCH_IMPL_FUNC(special_i1e_out, special_i1e_stub) CREATE_UNARY_TORCH_IMPL_FUNC(special_i1_out, special_i1_stub) CREATE_UNARY_TORCH_IMPL_FUNC(special_ndtri_out, special_ndtri_stub) +CREATE_UNARY_TORCH_IMPL_FUNC(special_log_ndtr_out, special_log_ndtr_stub) CREATE_UNARY_TORCH_IMPL_FUNC(sqrt_out, sqrt_stub) CREATE_UNARY_TORCH_IMPL_FUNC(tan_out, tan_stub) CREATE_UNARY_TORCH_IMPL_FUNC(tanh_out, tanh_stub) @@ -250,7 +252,7 @@ template static inline Tensor& unary_op_impl_with_complex_to_float_out(Tensor& result, const Tensor& self, Stub& stub, bool promotes_integer_to_float) { if (self.is_complex() && !result.is_complex()) { // Checks if the corresponding float type can be cast to the desired dtype - const auto float_type = c10::toValueType(self.scalar_type()); + const auto float_type = c10::toRealValueType(self.scalar_type()); TORCH_CHECK(canCast(float_type, result.scalar_type()), "result type ", float_type, " can't be cast to the desired output type ", result.scalar_type()); @@ -288,8 +290,8 @@ static inline Tensor unary_op_impl(const Tensor& self, OutImpl& out_impl) { template static inline Tensor unary_op_impl_with_complex_to_float(const Tensor& self, OutImpl& out_impl) { if (self.is_complex()) { - const auto float_type = c10::toValueType(self.scalar_type()); - Tensor result = at::empty({0}, self.options().dtype(float_type)); + const auto float_type = c10::toRealValueType(self.scalar_type()); + Tensor result = at::empty_like(self, self.options().dtype(float_type)); return out_impl(result, self); } @@ -385,7 +387,7 @@ Tensor& angle_out(const Tensor& self, Tensor& result) { } Tensor angle(const Tensor& self) { if (self.is_complex()) { - const auto float_type = c10::toValueType(self.scalar_type()); + const auto float_type = c10::toRealValueType(self.scalar_type()); Tensor result = at::empty({0}, self.options().dtype(float_type)); return at::angle_out(result, self); } @@ -538,7 +540,7 @@ Tensor special_sinc(const Tensor& self) { return self.sinc(); } namespace { inline Tensor calc_ndtr(const Tensor& self) { - auto x_sqrt_2 = self / std::sqrt(2.); + auto x_sqrt_2 = self * M_SQRT1_2; return (1 + at::erf(x_sqrt_2)) * 0.5; } @@ -806,8 +808,6 @@ Tensor& special_gammaln_out(const Tensor& self, Tensor& result) { return at::lga DEFINE_DISPATCH(abs_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(angle_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) -DEFINE_DISPATCH(real_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) -DEFINE_DISPATCH(imag_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(conj_physical_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(acos_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(acosh_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) @@ -841,6 +841,7 @@ DEFINE_DISPATCH(log1p_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global- DEFINE_DISPATCH(log2_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(logical_not_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(special_ndtri_stub); 
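[Editor's note, not part of the diff] calc_ndtr above evaluates the standard normal CDF via Phi(x) = 0.5 * (1 + erf(x / sqrt(2))), multiplying by M_SQRT1_2 instead of dividing by sqrt(2). A scalar reference:

#include <cmath>

// Editor's sketch: the kernel above applies the same identity elementwise with at::erf.
inline double ndtr_ref(double x) {
  constexpr double kInvSqrt2 = 0.70710678118654752440;  // 1 / sqrt(2), i.e. M_SQRT1_2
  return 0.5 * (1.0 + std::erf(x * kInvSqrt2));
}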
// NOLINT(cppcoreguidelines-avoid-non-const-global-variables) +DEFINE_DISPATCH(special_log_ndtr_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(neg_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(nan_to_num_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(polygamma_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) diff --git a/aten/src/ATen/native/UnaryOps.h b/aten/src/ATen/native/UnaryOps.h index 47224d51fc35..3c205cb9a878 100644 --- a/aten/src/ATen/native/UnaryOps.h +++ b/aten/src/ATen/native/UnaryOps.h @@ -6,7 +6,7 @@ namespace at { class Tensor; -struct TensorIterator; +class TensorBase; struct TensorIteratorBase; } @@ -17,8 +17,6 @@ using unary_fn_with_scalar = void(*)(TensorIteratorBase&, const Scalar& a); DECLARE_DISPATCH(unary_fn, abs_stub); DECLARE_DISPATCH(unary_fn, angle_stub); -DECLARE_DISPATCH(unary_fn, real_stub); -DECLARE_DISPATCH(unary_fn, imag_stub); DECLARE_DISPATCH(unary_fn, conj_physical_stub); DECLARE_DISPATCH(unary_fn, acos_stub); DECLARE_DISPATCH(unary_fn, acosh_stub); @@ -52,6 +50,7 @@ DECLARE_DISPATCH(unary_fn, log10_stub); DECLARE_DISPATCH(unary_fn, log1p_stub); DECLARE_DISPATCH(unary_fn, log2_stub); DECLARE_DISPATCH(unary_fn, special_ndtri_stub); +DECLARE_DISPATCH(unary_fn, special_log_ndtr_stub); DECLARE_DISPATCH(unary_fn, neg_stub); DECLARE_DISPATCH(unary_fn, reciprocal_stub); @@ -73,14 +72,14 @@ DECLARE_DISPATCH(unary_fn, trunc_stub); DECLARE_DISPATCH(unary_fn, lgamma_stub); // NB: these are actually defined in Distribution -DECLARE_DISPATCH(void(*)(Tensor&, const Tensor&, c10::optional), bernoulli_tensor_stub); -DECLARE_DISPATCH(void(*)(Tensor&, const double, c10::optional), bernoulli_scalar_stub); +DECLARE_DISPATCH(void(*)(const TensorBase&, const TensorBase&, c10::optional), bernoulli_tensor_stub); +DECLARE_DISPATCH(void(*)(const TensorBase&, const double, c10::optional), bernoulli_scalar_stub); DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, c10::optional), cauchy_stub); DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, c10::optional), exponential_stub); DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, c10::optional), geometric_stub); DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, c10::optional), log_normal_stub); DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, c10::optional), uniform_stub); -DECLARE_DISPATCH(void(*)(Tensor&, const double, const double, c10::optional), normal_stub); +DECLARE_DISPATCH(void(*)(const TensorBase&, const double, const double, c10::optional), normal_stub); DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const uint64_t, const int64_t, c10::optional), random_from_to_stub); DECLARE_DISPATCH(void(*)(TensorIteratorBase&, c10::optional), random_full_64_bits_range_stub); DECLARE_DISPATCH(void(*)(TensorIteratorBase&, c10::optional), random_stub); diff --git a/aten/src/ATen/native/Unfold2d.h b/aten/src/ATen/native/Unfold2d.h index bfee0bc782f4..2ea27e0caded 100644 --- a/aten/src/ATen/native/Unfold2d.h +++ b/aten/src/ATen/native/Unfold2d.h @@ -1,7 +1,8 @@ #pragma once -#include #include +#include +#include namespace at { namespace native { @@ -19,7 +20,8 @@ using unfold2d_fn = void (*)( int64_t input_height, int64_t input_width, int64_t output_height, - int64_t output_width + int64_t output_width, + bool is_channels_last ); DECLARE_DISPATCH(unfold2d_fn, unfolded2d_copy_stub); diff --git 
a/aten/src/ATen/native/UnfoldBackward.cpp b/aten/src/ATen/native/UnfoldBackward.cpp index f1509c9dd837..10bee80cea23 100644 --- a/aten/src/ATen/native/UnfoldBackward.cpp +++ b/aten/src/ATen/native/UnfoldBackward.cpp @@ -1,5 +1,14 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif + namespace at { namespace native { DEFINE_DISPATCH(unfold_backward_stub); diff --git a/aten/src/ATen/native/UnfoldBackward.h b/aten/src/ATen/native/UnfoldBackward.h index 8e33d64aa5f6..1f6c8fa1b289 100644 --- a/aten/src/ATen/native/UnfoldBackward.h +++ b/aten/src/ATen/native/UnfoldBackward.h @@ -1,11 +1,17 @@ #pragma once -#include +#include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + namespace at { namespace native { using unfold_backward_fn = void (*)( @@ -106,8 +112,8 @@ static C10_UNUSED TensorIterator _make_unfold_backward_iter_over_grad_in( Tensor& grad_out, const Tensor& grad_in, int64_t dim, - int64_t size, - int64_t step + int64_t /*size*/, + int64_t /*step*/ ) { dim = maybe_wrap_dim(dim, grad_out.dim()); // last dim stores the folds diff --git a/aten/src/ATen/native/Unique.cpp b/aten/src/ATen/native/Unique.cpp index ce911f1763b6..dc066d99d46d 100644 --- a/aten/src/ATen/native/Unique.cpp +++ b/aten/src/ATen/native/Unique.cpp @@ -182,7 +182,7 @@ std::tuple _unique_dim_cpu_template( TORCH_CHECK( num_zero_dims == 1, "Number of zero sized dimensions is more than one, so unique cannot be applied ") - Tensor output = at::empty({0}, self.options()); + Tensor output = at::empty(sizes, self.options()); Tensor inverse_indices = at::empty({0}, self.options().dtype(kLong)); Tensor counts = at::empty({0}, self.options().dtype(kLong)); diff --git a/aten/src/ATen/native/UpSample.cpp b/aten/src/ATen/native/UpSample.cpp index bcc8891de8dc..db75b7e99fdb 100644 --- a/aten/src/ATen/native/UpSample.cpp +++ b/aten/src/ATen/native/UpSample.cpp @@ -9,7 +9,7 @@ namespace upsample { TORCH_API c10::SmallVector compute_output_size( c10::IntArrayRef input_size, // Full input tensor size. - c10::optional output_size, + at::OptionalIntArrayRef output_size, c10::optional> scale_factors) { const auto spatial_dimensions = static_cast(input_size.size()) - 2; if (output_size) { diff --git a/aten/src/ATen/native/UpSample.h b/aten/src/ATen/native/UpSample.h index 5bc3a434f428..6b248352de6a 100644 --- a/aten/src/ATen/native/UpSample.h +++ b/aten/src/ATen/native/UpSample.h @@ -2,7 +2,7 @@ #include -#include +#include #include #include @@ -51,7 +51,7 @@ namespace upsample { TORCH_API c10::SmallVector compute_output_size( c10::IntArrayRef input_size, // Full input tensor size. - c10::optional output_size, + at::OptionalIntArrayRef output_size, c10::optional> scale_factors); inline c10::optional get_scale_value(c10::optional> scales, int idx) { @@ -328,6 +328,39 @@ static inline int64_t nearest_neighbor_exact_compute_source_index( return src_index; } +static inline int64_t nearest_idx( + int64_t output_index, + int64_t input_size, + int64_t output_size, + c10::optional scales) { + // This method specificly treats cases: output_size == input_size or + // output_size == 2 * input_size, that we would like to get rid of + // We keep this method for BC and consider as deprecated. 
+ // See nearest_exact_idx as replacement + if (output_size == input_size) { + // scale_factor = 1, simply copy + return output_index; + } else if (output_size == 2 * input_size) { + // scale_factor = 2, shift input index + return output_index >> 1; + } else { + float scale = compute_scales_value(scales, input_size, output_size); + return nearest_neighbor_compute_source_index(scale, output_index, input_size); + } +} + +static inline int64_t nearest_exact_idx( + int64_t output_index, + int64_t input_size, + int64_t output_size, + c10::optional scales) { + float scale = compute_scales_value(scales, input_size, output_size); + return nearest_neighbor_exact_compute_source_index(scale, output_index, input_size); +} + +// Define a typedef to dispatch to nearest_idx or nearest_exact_idx +typedef int64_t (*nearest_idx_fn_t)(int64_t, int64_t, int64_t, c10::optional); + template static scalar_t upsample_get_value_bounded( scalar_t* data, diff --git a/aten/src/ATen/native/UpSampleBicubic2d.cpp b/aten/src/ATen/native/UpSampleBicubic2d.cpp index 95d9f91bcb80..7cda89c61264 100644 --- a/aten/src/ATen/native/UpSampleBicubic2d.cpp +++ b/aten/src/ATen/native/UpSampleBicubic2d.cpp @@ -188,7 +188,7 @@ static void upsample_bicubic2d_backward_kernel( auto grad_output = grad_output_.contiguous(); - AT_DISPATCH_FLOATING_TYPES_AND_HALF( + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, grad_output.scalar_type(), "upsample_bicubic2d_backward", [&] { scalar_t* idata = grad_input.data_ptr(); scalar_t* odata = grad_output.data_ptr(); @@ -264,7 +264,7 @@ using at::native::upsample::get_scale_value; Tensor upsample_bicubic2d( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, bool align_corners, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); @@ -275,7 +275,7 @@ Tensor upsample_bicubic2d( Tensor upsample_bicubic2d_backward( const Tensor& grad_output, - c10::optional output_size, + at::OptionalIntArrayRef output_size, IntArrayRef input_size, bool align_corners, c10::optional> scale_factors) { @@ -287,7 +287,7 @@ Tensor upsample_bicubic2d_backward( Tensor _upsample_bicubic2d_aa( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, bool align_corners, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); @@ -298,7 +298,7 @@ Tensor _upsample_bicubic2d_aa( Tensor _upsample_bicubic2d_aa_backward( const Tensor& grad_output, - c10::optional output_size, + at::OptionalIntArrayRef output_size, IntArrayRef input_size, bool align_corners, c10::optional> scale_factors) { diff --git a/aten/src/ATen/native/UpSampleBilinear2d.cpp b/aten/src/ATen/native/UpSampleBilinear2d.cpp index f73bb50c9ff4..2a228a86ac71 100644 --- a/aten/src/ATen/native/UpSampleBilinear2d.cpp +++ b/aten/src/ATen/native/UpSampleBilinear2d.cpp @@ -145,7 +145,7 @@ using at::native::upsample::get_scale_value; Tensor upsample_bilinear2d( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, bool align_corners, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); @@ -156,7 +156,7 @@ Tensor upsample_bilinear2d( Tensor upsample_bilinear2d_backward( const Tensor& grad_output, - c10::optional output_size, + at::OptionalIntArrayRef output_size, IntArrayRef input_size, bool align_corners, c10::optional> scale_factors) { @@ -168,7 +168,7 @@ Tensor upsample_bilinear2d_backward( 
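[Editor's note, not part of the diff] The legacy nearest_idx rule above keeps two special-cased ratios for backward compatibility and otherwise truncates output_index * scale. A standalone sketch, where scale stands in for compute_scales_value() when no explicit scale factor is passed (input_size / output_size):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

// Editor's sketch of the deprecated nearest-neighbor index mapping kept for BC.
int64_t nearest_src_idx(int64_t out_idx, int64_t in_size, int64_t out_size) {
  if (out_size == in_size) {
    return out_idx;        // scale_factor == 1: plain copy
  }
  if (out_size == 2 * in_size) {
    return out_idx >> 1;   // scale_factor == 2: shift the output index
  }
  const float scale = static_cast<float>(in_size) / static_cast<float>(out_size);
  const auto src = static_cast<int64_t>(std::floor(out_idx * scale));
  return std::min(src, in_size - 1);
}

int main() {
  // Upsampling 3 -> 5: output indices 0..4 read from inputs 0, 0, 1, 1, 2.
  for (int64_t i = 0; i < 5; ++i) {
    std::cout << nearest_src_idx(i, 3, 5) << ' ';
  }
}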
Tensor _upsample_bilinear2d_aa( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, bool align_corners, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); @@ -179,7 +179,7 @@ Tensor _upsample_bilinear2d_aa( Tensor _upsample_bilinear2d_aa_backward( const Tensor& grad_output, - c10::optional output_size, + at::OptionalIntArrayRef output_size, IntArrayRef input_size, bool align_corners, c10::optional> scale_factors) { diff --git a/aten/src/ATen/native/UpSampleLinear1d.cpp b/aten/src/ATen/native/UpSampleLinear1d.cpp index 371a53dc8900..687cad5c879b 100644 --- a/aten/src/ATen/native/UpSampleLinear1d.cpp +++ b/aten/src/ATen/native/UpSampleLinear1d.cpp @@ -79,7 +79,7 @@ using at::native::upsample::get_scale_value; Tensor upsample_linear1d( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, bool align_corners, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); @@ -89,7 +89,7 @@ Tensor upsample_linear1d( Tensor upsample_linear1d_backward( const Tensor& grad_output, - c10::optional output_size, + at::OptionalIntArrayRef output_size, IntArrayRef input_size, bool align_corners, c10::optional> scale_factors) { diff --git a/aten/src/ATen/native/UpSampleNearest1d.cpp b/aten/src/ATen/native/UpSampleNearest1d.cpp index 52fa7bcc5c9a..b9bc5b3c5b96 100644 --- a/aten/src/ATen/native/UpSampleNearest1d.cpp +++ b/aten/src/ATen/native/UpSampleNearest1d.cpp @@ -109,7 +109,7 @@ using at::native::upsample::get_scale_value; Tensor upsample_nearest1d( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_w = get_scale_value(scale_factors, 0); @@ -118,7 +118,7 @@ Tensor upsample_nearest1d( Tensor _upsample_nearest_exact1d( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_w = get_scale_value(scale_factors, 0); @@ -127,7 +127,7 @@ Tensor _upsample_nearest_exact1d( Tensor upsample_nearest1d_backward( const Tensor& grad_output, - c10::optional output_size, + at::OptionalIntArrayRef output_size, IntArrayRef input_size, c10::optional> scale_factors) { auto osize = compute_output_size(input_size, output_size, scale_factors); @@ -137,7 +137,7 @@ Tensor upsample_nearest1d_backward( Tensor _upsample_nearest_exact1d_backward( const Tensor& grad_output, - c10::optional output_size, + at::OptionalIntArrayRef output_size, IntArrayRef input_size, c10::optional> scale_factors) { auto osize = compute_output_size(input_size, output_size, scale_factors); diff --git a/aten/src/ATen/native/UpSampleNearest2d.cpp b/aten/src/ATen/native/UpSampleNearest2d.cpp index 864121fb0afa..1f9a9eafd4f6 100644 --- a/aten/src/ATen/native/UpSampleNearest2d.cpp +++ b/aten/src/ATen/native/UpSampleNearest2d.cpp @@ -134,7 +134,7 @@ using at::native::upsample::get_scale_value; Tensor upsample_nearest2d( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_h = get_scale_value(scale_factors, 0); @@ -144,7 +144,7 @@ Tensor upsample_nearest2d( Tensor _upsample_nearest_exact2d( const Tensor& input, - c10::optional 
output_size, + at::OptionalIntArrayRef output_size, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_h = get_scale_value(scale_factors, 0); @@ -154,7 +154,7 @@ Tensor _upsample_nearest_exact2d( Tensor upsample_nearest2d_backward( const Tensor& grad_output, - c10::optional output_size, + at::OptionalIntArrayRef output_size, IntArrayRef input_size, c10::optional> scale_factors) { auto osize = compute_output_size(input_size, output_size, scale_factors); @@ -165,7 +165,7 @@ Tensor upsample_nearest2d_backward( Tensor _upsample_nearest_exact2d_backward( const Tensor& grad_output, - c10::optional output_size, + at::OptionalIntArrayRef output_size, IntArrayRef input_size, c10::optional> scale_factors) { auto osize = compute_output_size(input_size, output_size, scale_factors); diff --git a/aten/src/ATen/native/UpSampleNearest3d.cpp b/aten/src/ATen/native/UpSampleNearest3d.cpp index c659a86cd81f..ff559f3e09c0 100644 --- a/aten/src/ATen/native/UpSampleNearest3d.cpp +++ b/aten/src/ATen/native/UpSampleNearest3d.cpp @@ -149,7 +149,7 @@ using at::native::upsample::get_scale_value; Tensor upsample_nearest3d_cpu( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_d = get_scale_value(scale_factors, 0); @@ -160,7 +160,7 @@ Tensor upsample_nearest3d_cpu( Tensor _upsample_nearest_exact3d_cpu( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_d = get_scale_value(scale_factors, 0); @@ -172,7 +172,7 @@ Tensor _upsample_nearest_exact3d_cpu( // when structured kernels can handle QuantizedCPU, update these overloads to be CompositeExplicitAutograd Tensor upsample_nearest3d_backward_cpu( const Tensor& grad_output, - c10::optional output_size, + at::OptionalIntArrayRef output_size, IntArrayRef input_size, c10::optional> scale_factors) { auto osize = compute_output_size(input_size, output_size, scale_factors); @@ -184,7 +184,7 @@ Tensor upsample_nearest3d_backward_cpu( Tensor _upsample_nearest_exact3d_backward_cpu( const Tensor& grad_output, - c10::optional output_size, + at::OptionalIntArrayRef output_size, IntArrayRef input_size, c10::optional> scale_factors) { auto osize = compute_output_size(input_size, output_size, scale_factors); diff --git a/aten/src/ATen/native/UpSampleTrilinear3d.cpp b/aten/src/ATen/native/UpSampleTrilinear3d.cpp index 75a77a76c623..52cb2e00df46 100644 --- a/aten/src/ATen/native/UpSampleTrilinear3d.cpp +++ b/aten/src/ATen/native/UpSampleTrilinear3d.cpp @@ -51,7 +51,7 @@ TORCH_META_FUNC(upsample_trilinear3d_backward) ( " but got grad_output.size(", i, ") = ", grad_output.size(i)); } - set_output(input_size, grad_output.options()); + set_output(input_size, grad_output.options().memory_format(grad_output.suggest_memory_format())); } } // namespace meta @@ -90,7 +90,7 @@ using at::native::upsample::get_scale_value; Tensor upsample_trilinear3d( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, bool align_corners, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); @@ -102,7 +102,7 @@ Tensor upsample_trilinear3d( Tensor upsample_trilinear3d_backward( const Tensor& grad_output, - c10::optional output_size, + 
at::OptionalIntArrayRef output_size, IntArrayRef input_size, bool align_corners, c10::optional> scale_factors) { diff --git a/aten/src/ATen/native/WeightNorm.cpp b/aten/src/ATen/native/WeightNorm.cpp index d1bc46809c53..b2229bdbf0d2 100644 --- a/aten/src/ATen/native/WeightNorm.cpp +++ b/aten/src/ATen/native/WeightNorm.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include @@ -10,6 +11,9 @@ namespace at { namespace native { +DEFINE_DISPATCH(weight_norm_stub); +DEFINE_DISPATCH(weight_norm_backward_stub); + // Staying faithful to the Python for now for clarity, look for optimizations later // (e.g., single return statement for RVO) Tensor norm_except_dim(const Tensor & v, int64_t pow, int64_t dim) @@ -32,6 +36,38 @@ Tensor norm_except_dim(const Tensor & v, int64_t pow, int64_t dim) } } +std::tuple weight_norm_cpu( + const Tensor& v, + const Tensor& g, + int64_t dim) { + auto w = at::empty_like(v, at::MemoryFormat::Contiguous); + + // align with cuda behavior, keep norm in 'Float' when g is 'BFloat16' + const auto dtype = g.scalar_type() == at::ScalarType::BFloat16 ? + at::ScalarType::Float : g.scalar_type(); + auto norm = at::empty_strided(g.sizes(), g.strides(), g.options().dtype(dtype)); + weight_norm_stub(kCPU, w, norm, v, g, dim); + + return std::tuple{w, norm}; +} + +std::tuple weight_norm_backward_cpu( + const Tensor& grad_w, + const Tensor& saved_v, + const Tensor& saved_g, + const Tensor& saved_norm, + int64_t dim) { + TORCH_CHECK(saved_v.is_contiguous(), "saved_v must be contiguous"); + TORCH_CHECK(saved_g.is_contiguous(), "saved_g must be contiguous"); + TORCH_CHECK(saved_norm.is_contiguous(), "saved_norm must be contiguous"); + + auto grad_v = at::empty_like(saved_v, at::MemoryFormat::Contiguous); + auto grad_g = at::empty_like(saved_g, at::MemoryFormat::Contiguous); + weight_norm_backward_stub(kCPU, grad_v, grad_g, grad_w, saved_v, saved_g, saved_norm, dim); + + return std::tuple{grad_v, grad_g}; +} + Tensor _weight_norm (const Tensor & v_in, const Tensor & g_in, @@ -46,12 +82,12 @@ Tensor _weight_norm auto v = v_in.contiguous(); auto g = g_in.contiguous(); - bool can_use_fused = v.is_cuda() && (dim == 0 || dim == v.dim() - 1); + bool can_use_fused = (dim == 0) || (dim == v.dim() - 1); if (can_use_fused) { // weight_norm does not have a derivative defined for it, so this will route back through // VariableType.cpp, and construct a WeightNormFusedBackward object in the autograd graph. - return std::get<0>(at::_weight_norm_cuda_interface(v, g, dim)); + return std::get<0>(at::_weight_norm_interface(v, g, dim)); } else { // Double-differentiable primitive ops // at::native::norm_except_dim would probably be fine as well. @@ -59,7 +95,7 @@ Tensor _weight_norm } } -// Differentiable backward path, an alternative to weight_norm_cuda_backward, to be used +// Differentiable backward path, an alternative to weight_norm_backward, to be used // when backward is itself creating a graph. // The GradMode::is_enabled() check must be performed within Functions.cpp; that's why we // define a separate function here, instead of inlining it in weight_norm_cuda_backward. 
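[Editor's note, not part of the diff] The fused weight_norm_cpu path added above produces the same weight as the composite fallback a few lines earlier: w = v * g / ||v||, with the L2 norm reduced over every dimension except dim. A minimal reference using the double-differentiable composite ops:

#include <ATen/ATen.h>

// Editor's sketch: what the fused kernel computes for the weight output, expressed
// with the composite ops used by the non-fused path.
at::Tensor weight_norm_reference(const at::Tensor& v, const at::Tensor& g, int64_t dim) {
  // norm_except_dim(v, 2, dim) keeps `dim` and reduces the L2 norm over all other dims.
  return v * (g / at::norm_except_dim(v, 2, dim));
}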
diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.cpp index 0212f2688b52..07fc3d245fe2 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.cpp +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.cpp @@ -71,7 +71,7 @@ int register_linear_params() { } namespace { -static auto linear_params = register_linear_params(); +static C10_UNUSED auto linear_params = register_linear_params(); } // namespace }} // namespace ao::sparse diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp index e0fb55427a77..187ed4fd1404 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp @@ -2,7 +2,6 @@ #include #include -#include #include #include #include diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_unpack.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_unpack.cpp index a0a389f818c4..ec6e160b16c3 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_unpack.cpp +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_unpack.cpp @@ -1,7 +1,6 @@ #include #include -#include #include #include #include diff --git a/aten/src/ATen/native/attention.cpp b/aten/src/ATen/native/attention.cpp deleted file mode 100644 index 599f0f866e2b..000000000000 --- a/aten/src/ATen/native/attention.cpp +++ /dev/null @@ -1,240 +0,0 @@ -#include - -#include -#include -#include -#include -#include -#include - -namespace at { - -namespace native { - -namespace { - -Tensor gemm_nt(const Tensor& a, const Tensor& b) { - auto a_ = a.view({a.size(0) * a.size(1), a.size(2)}); - auto b_ = b.transpose(1, 0); - auto c_ = at::native::matmul(a_, b_); - return c_.view({a.size(0), a.size(1), b.size(0)}); -} - -// compute q = (q + q_bias) / sqrt(dim_per_head), k = k + k_bias, v = v + v_bias -std::tuple transform_bias_rescale_qkv( - const Tensor& qkv, - const Tensor& qkv_bias, - const int64_t num_head) { - auto B = qkv.size(0); - auto T = qkv.size(1); - auto _3D = qkv.size(2); - auto D = _3D / 3; - TORCH_CHECK(D % num_head == 0); - const auto dim_per_head = D / num_head; - auto q_k_v = at::empty({3, B, num_head, T, dim_per_head}, qkv.options()); - - AT_DISPATCH_FLOATING_TYPES_AND2( - ScalarType::Half, - ScalarType::BFloat16, - qkv.scalar_type(), - "transform_bias_rescale_qkv", - [&] { - scalar_t* qkv_data = qkv.data_ptr(); - scalar_t* qkv_bias_data = qkv_bias.data_ptr(); - scalar_t* q_k_v_data = q_k_v.data_ptr(); - const scalar_t sqrt_dim_per_head = std::sqrt(static_cast(dim_per_head)); - - int64_t grain_size = - std::min(internal::GRAIN_SIZE / (3 * dim_per_head), (int64_t)1); - parallel_for( - 0, B * num_head * T, grain_size, [&](int64_t begin, int64_t end) { - for (auto i : c10::irange(begin, end)) { - auto t = i % T; - i /= T; - auto nh = i % num_head; - i /= num_head; - auto b = i; - using Vec = vec::Vectorized; - auto V = vec::Vectorized::size(); - // TODO: handle epilogue - for (auto dh = 0; dh < dim_per_head / V; dh += V) { - auto d = nh * dim_per_head + dh; - // load - auto q_bias_data = Vec::loadu(&qkv_bias_data[d + 0 * D]); - auto k_bias_data = Vec::loadu(&qkv_bias_data[d + 1 * D]); - auto v_bias_data = Vec::loadu(&qkv_bias_data[d + 2 * D]); - - auto q_data = - Vec::loadu(&qkv_data[b * _3D * T + t * _3D + d + 0 * D]) + - q_bias_data; - auto k_data = - Vec::loadu(&qkv_data[b * _3D * T + t * _3D + d + 1 * D]) + 
- k_bias_data; - auto v_data = - Vec::loadu(&qkv_data[b * _3D * T + t * _3D + d + 2 * D]) + - v_bias_data; - - q_data = q_data / Vec(sqrt_dim_per_head); - - q_data.store(&q_k_v_data - [0 * B * num_head * T * dim_per_head + - b * num_head * T * dim_per_head + - num_head * T * dim_per_head + - t * dim_per_head + dh]); - k_data.store(&q_k_v_data - [1 * B * num_head * T * dim_per_head + - b * num_head * T * dim_per_head + - num_head * T * dim_per_head + - t * dim_per_head + dh]); - v_data.store(&q_k_v_data - [2 * B * num_head * T * dim_per_head + - b * num_head * T * dim_per_head + - num_head * T * dim_per_head + - t * dim_per_head + dh]); - } - } - }); - }); - auto q_k_v_s = - at::native::split(q_k_v.view({3 * B, num_head, T, dim_per_head}), B, 0); - return std::make_tuple(q_k_v_s[0], q_k_v_s[1], q_k_v_s[2]); -} - -Tensor bmm_nt(const Tensor& a, const Tensor& b) { - auto a_ = a.view({a.size(0) * a.size(1), a.size(2), a.size(3)}); - auto b_ = b.view({b.size(0) * b.size(1), b.size(2), b.size(3)}); - auto bt_ = b_.transpose(2, 1); - // TODO: are these a single call to cublas batched matmul? - auto c_ = at::matmul(a_, bt_); - return c_.view({a.size(0), a.size(1), a.size(2), b.size(2)}); -} - -void masked_softmax_dropout( - const Tensor& attn_scores, - const c10::optional& attn_mask) { - auto B = attn_scores.size(0); - auto num_heads = attn_scores.size(1); - auto T = attn_scores.size(2); - if (attn_mask) { - TORCH_CHECK(attn_mask->is_contiguous()); - } - AT_DISPATCH_FLOATING_TYPES_AND2( - ScalarType::Half, - ScalarType::BFloat16, - attn_scores.scalar_type(), - "masked_softmax_dropout", - [&] { - using accscalar_t = acc_type; - // TODO: proper implementation with masking. - scalar_t* attn_scores_data = attn_scores.data_ptr(); - int64_t grain_size = std::min(internal::GRAIN_SIZE / T, (int64_t)1); - parallel_for( - 0, B * num_heads * T, grain_size, [&](int64_t begin, int64_t end) { - for (const auto i : c10::irange(begin, end)) { - using Vec = vec::Vectorized; - auto V = vec::Vectorized::size(); - - scalar_t* input_data = attn_scores_data + i * T; - auto max_input = Vec(std::numeric_limits::lowest()); - // TODO: handle epilogue - for (auto t = 0; t < T; t += V) { - auto v = Vec::loadu(&input_data[t]); - max_input = vec::maximum(max_input, v); - } - - auto hmax = std::numeric_limits::lowest(); - for (auto i = 0; i < V; ++i) { - hmax = std::max(max_input[i], hmax); - } - accscalar_t hsum = 0; - for (auto t = 0; t < T; t += V) { - auto v = Vec::loadu(&input_data[t]); - // TODO: vectorize in accscalar_t? - for (auto i = 0; i < V; ++i) { - hsum += std::exp(static_cast(v[i]) - hmax); - } - } - auto inv_denominator = 1.0 / hsum; - for (auto t = 0; t < T; t += V) { - Vec v = Vec::loadu(&input_data[t]); - - // TODO: vectorize in accscalar_t? - // TODO this faster solution does not work on Android build - /* - for (auto i = 0; i < V; ++i) { - v[i] = static_cast(std::exp(static_cast(v[i]) - hmax) * inv_denominator); - } - v.store(&input_data[t]); - */ - for (auto i = 0; i < V; ++i) { - input_data[t + i] = static_cast(std::exp(static_cast(v[i]) - hmax) * inv_denominator); - } - } - } - }); - }); -} - -Tensor bmm_nn(const Tensor& a, const Tensor& b) { - auto a_ = a.view({a.size(0) * a.size(1), a.size(2), a.size(3)}); - auto b_ = b.view({b.size(0) * b.size(1), b.size(2), b.size(3)}); - // TODO: are these a single call to cublas batched matmul? 
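[Editor's note, not part of the diff] The file being deleted here fused a standard multi-head self-attention forward. For reference, the same computation can be written with composite ATen ops; shapes follow the comments in this file (query [B, T, D], qkv_weight [3D, D]), and mask/dropout are omitted as in the TODOs above. This is an editor's sketch, not the replacement code:

#include <ATen/ATen.h>
#include <cmath>

// Editor's sketch of the multi-head self-attention forward that the fused path decomposes.
at::Tensor mhsa_reference(const at::Tensor& query,
                          const at::Tensor& qkv_weight,
                          const at::Tensor& qkv_bias,
                          const at::Tensor& proj_weight,
                          const at::Tensor& proj_bias,
                          int64_t num_head) {
  const auto B = query.size(0), T = query.size(1), D = query.size(2);
  const auto dim_per_head = D / num_head;
  // [B, T, 3D] projected, then reshaped to [3, B, num_head, T, dim_per_head].
  auto qkv = at::linear(query, qkv_weight, qkv_bias)
                 .view({B, T, 3, num_head, dim_per_head})
                 .permute({2, 0, 3, 1, 4});
  auto q = qkv[0] / std::sqrt(static_cast<double>(dim_per_head));  // rescale after bias, as above
  auto k = qkv[1];
  auto v = qkv[2];
  auto attn = at::softmax(at::matmul(q, k.transpose(-2, -1)), -1);  // [B, H, T, T]
  auto ctx = at::matmul(attn, v)                                    // [B, H, T, dim_per_head]
                 .transpose(1, 2).contiguous().view({B, T, D});     // [B, T, D]
  return at::linear(ctx, proj_weight, proj_bias);
}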
- auto c_ = at::matmul(a_, b_); - return c_.view({a.size(0), a.size(1), a.size(2), b.size(3)}); -} - -Tensor transform_0213(const Tensor& a) { - // TODO: check perf vs dedicated kernel. - return a.permute({0, 2, 1, 3}) - .contiguous() - .view({a.size(0), a.size(2), a.size(1) * a.size(3)}); -} - -Tensor gemm_nt_bias(const Tensor& a, const Tensor& b, const Tensor& c) { - auto a_ = a.view({a.size(0) * a.size(1), a.size(2)}); - auto r_ = at::native::linear(a_, b, c); - return r_.view({a.size(0), a.size(1), r_.size(1)}); -} - -} // namespace - -Tensor multi_head_self_attention_cpu( - const Tensor& query, - const Tensor& qkv_weight, - const Tensor& qkv_bias, - const Tensor& proj_weight, - const Tensor& proj_bias, - const int64_t num_head, - const c10::optional& mask) { - // query shape: [B, T, D] - // qkv_weight shape: [3 * D, D] - - // shape: [B, T, 3 x D] - auto qkv = gemm_nt(query, qkv_weight); - - // shape: 3 x [B, num_head, T, dim_per_head] - auto q_k_v = transform_bias_rescale_qkv(qkv, qkv_bias, num_head); - auto q = std::get<0>(q_k_v); - auto k = std::get<1>(q_k_v); - auto v = std::get<2>(q_k_v); - - // shape: [B, num_head, T, T] - auto qkt = bmm_nt(q, k); - - // shape: [B, num_head, T, T] - masked_softmax_dropout(qkt, mask); - - // shape: [B, num_head, T, dim_per_head] - auto attn_ctx = bmm_nn(qkt, v); - - // shape: [B, T, D] - auto attn = transform_0213(attn_ctx); - - // shape: [B, T, D] - auto proj = gemm_nt_bias(attn, proj_weight, proj_bias); - - return proj; -} - -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/batch_norm.h b/aten/src/ATen/native/batch_norm.h index 4c25b08aa684..b729dfe199b0 100644 --- a/aten/src/ATen/native/batch_norm.h +++ b/aten/src/ATen/native/batch_norm.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include namespace at { diff --git a/aten/src/ATen/native/cpu/Activation.cpp b/aten/src/ATen/native/cpu/Activation.cpp index b192d0c4d707..14fef621b10f 100644 --- a/aten/src/ATen/native/cpu/Activation.cpp +++ b/aten/src/ATen/native/cpu/Activation.cpp @@ -24,41 +24,106 @@ namespace { template inline void _vec_log_sigmoid(TensorBase &output, TensorBase &buffer, const TensorBase &input) { - using Vec = Vectorized; - scalar_t* output_data = output.data_ptr(); - scalar_t* buffer_data = buffer.data_ptr(); - scalar_t* input_data = input.data_ptr(); - parallel_for(0, input.numel(), 1, [&] (int64_t begin, int64_t end) { - int64_t size = end - begin; - int64_t d = 0; - for (; d < size - (size % Vec::size()); d += Vec::size()) { - Vec data_vec = Vec::loadu(input_data + begin+ d); - Vec min_vec = vec::minimum(data_vec, Vec(scalar_t(0))); - Vec buffer_vec = data_vec.abs().neg().exp(); - Vec output_vec = min_vec - buffer_vec.log1p(); - buffer_vec.store(buffer_data + begin + d); - output_vec.store(output_data + begin + d); - } - if (size - d > 0) { - Vec data_vec = Vec::loadu(input_data + begin + d, size - d); - Vec min_vec = vec::minimum(data_vec, Vec(scalar_t(0))); - Vec buffer_vec = data_vec.abs().neg().exp(); - Vec output_vec = min_vec - buffer_vec.log1p(); - buffer_vec.store(buffer_data + begin + d, size - d); - output_vec.store(output_data + begin + d, size - d); - } - }); + if (input.scalar_type() == kBFloat16) { + using Vec = Vectorized; + BFloat16* output_data = output.data_ptr(); + BFloat16* buffer_data = buffer.data_ptr(); + BFloat16* input_data = input.data_ptr(); + parallel_for(0, input.numel(), 1, [&] (int64_t begin, int64_t end) { + int64_t size = end - begin; + int64_t d = 0; + for (; d < size - (size % Vec::size()); d += Vec::size()) 
{ + Vec data_vec = Vec::loadu(input_data + begin+ d); + Vectorized data_vec0, data_vec1; + std::tie(data_vec0, data_vec1) = convert_bfloat16_float(data_vec); + Vectorized min_vec = minimum(data_vec0, Vectorized(float(0))); + Vectorized buffer_vec0 = data_vec0.abs().neg().exp(); + Vectorized output_vec0 = min_vec - buffer_vec0.log1p(); + min_vec = minimum(data_vec1, Vectorized(float(0))); + Vectorized buffer_vec1 = data_vec1.abs().neg().exp(); + Vectorized output_vec1 = min_vec - buffer_vec1.log1p(); + convert_float_bfloat16(buffer_vec0, buffer_vec1).store(buffer_data + begin + d); + convert_float_bfloat16(output_vec0, output_vec1).store(output_data + begin + d); + } + if (size - d > 0) { + Vec data_vec = Vec::loadu(input_data + begin + d, size - d); + Vectorized data_vec0, data_vec1; + std::tie(data_vec0, data_vec1) = convert_bfloat16_float(data_vec); + Vectorized min_vec = minimum(data_vec0, Vectorized(float(0))); + Vectorized buffer_vec0 = data_vec0.abs().neg().exp(); + Vectorized output_vec0 = min_vec - buffer_vec0.log1p(); + min_vec = minimum(data_vec1, Vectorized(float(0))); + Vectorized buffer_vec1 = data_vec1.abs().neg().exp(); + Vectorized output_vec1 = min_vec - buffer_vec1.log1p(); + convert_float_bfloat16(buffer_vec0, buffer_vec1).store(buffer_data + begin + d, size - d); + convert_float_bfloat16(output_vec0, output_vec1).store(output_data + begin + d, size - d); + } + }); + } else { + using Vec = Vectorized; + scalar_t* output_data = output.data_ptr(); + scalar_t* buffer_data = buffer.data_ptr(); + scalar_t* input_data = input.data_ptr(); + parallel_for(0, input.numel(), 1, [&] (int64_t begin, int64_t end) { + int64_t size = end - begin; + int64_t d = 0; + for (; d < size - (size % Vec::size()); d += Vec::size()) { + Vec data_vec = Vec::loadu(input_data + begin+ d); + Vec min_vec = vec::minimum(data_vec, Vec(scalar_t(0))); + Vec buffer_vec = data_vec.abs().neg().exp(); + Vec output_vec = min_vec - buffer_vec.log1p(); + buffer_vec.store(buffer_data + begin + d); + output_vec.store(output_data + begin + d); + } + if (size - d > 0) { + Vec data_vec = Vec::loadu(input_data + begin + d, size - d); + Vec min_vec = vec::minimum(data_vec, Vec(scalar_t(0))); + Vec buffer_vec = data_vec.abs().neg().exp(); + Vec output_vec = min_vec - buffer_vec.log1p(); + buffer_vec.store(buffer_data + begin + d, size - d); + output_vec.store(output_data + begin + d, size - d); + } + }); + } } -static void log_sigmoid_cpu_kernel( - TensorBase &output, TensorBase &buffer, const TensorBase &input) { - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "log_sigmoid_cpu", [&] { +static void log_sigmoid_cpu_kernel(TensorBase &output, TensorBase &buffer, const TensorBase &input) { + AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16, input.scalar_type(), "log_sigmoid_cpu", [&] { _vec_log_sigmoid(output, buffer, input); }); } static void log_sigmoid_backward_cpu_kernel(TensorIterator& iter) { - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "log_sigmoid_backward_cpu", [&]() { + if (iter.dtype() == kBFloat16) { + using Vec = Vectorized; + auto zero_val = float(0); + auto zero_vec = Vectorized(zero_val); + auto one_val = float(1); + auto one_vec = Vectorized(one_val); + cpu_kernel_vec(iter, + [=](BFloat16 a, BFloat16 b, BFloat16 c) -> BFloat16 { + auto in_negative = float(a) < float(0); + auto max_deriv = in_negative ? float(1) : float(0); + auto sign = in_negative ? 
float(1) : -float(1); + return (max_deriv - sign * (float(b) / (float(1) + b))) * float(c); + }, + [=](Vec a, Vec b, Vec c) -> Vec { + Vectorized a0, a1, b0, b1, c0, c1; + std::tie(a0, a1) = convert_bfloat16_float(a); + std::tie(b0, b1) = convert_bfloat16_float(b); + std::tie(c0, c1) = convert_bfloat16_float(c); + auto mask = a0 < zero_vec; + auto max_deriv_vec = Vectorized::blendv(zero_vec, one_vec, mask); + auto sign_vec = Vectorized::blendv(one_vec.neg(), one_vec, mask); + a0 = (max_deriv_vec - sign_vec * (b0 / (one_vec + b0))) * c0; + mask = a1 < zero_vec; + max_deriv_vec = Vectorized::blendv(zero_vec, one_vec, mask); + sign_vec = Vectorized::blendv(one_vec.neg(), one_vec, mask); + a1 = (max_deriv_vec - sign_vec * (b1 / (one_vec + b1))) * c1; + return convert_float_bfloat16(a0, a1); + }); + } else { + AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "log_sigmoid_backward_cpu", [&]() { using Vec = Vectorized; auto zero_val = scalar_t(0); auto zero_vec = Vec(zero_val); @@ -78,6 +143,7 @@ static void log_sigmoid_backward_cpu_kernel(TensorIterator& iter) { return (max_deriv_vec - sign_vec * (b / (one_vec + b))) * c; }); }); + } } static void threshold_kernel( @@ -102,71 +168,142 @@ static void threshold_kernel( } void elu_kernel(TensorIteratorBase& it, const Scalar& alpha, const Scalar& scale, const Scalar& input_scale) { - AT_DISPATCH_FLOATING_TYPES(it.dtype(), "elu_cpu", [&]() { - using Vec = Vectorized; - auto negcoef = alpha.to() * scale.to(); - auto poscoef = scale.to(); - auto negiptcoef = input_scale.to(); - const Vec negcoef_vec(negcoef); - const Vec negiptcoef_vec(negiptcoef); - const Vec poscoef_vec(poscoef); - const Vec one_vec(static_cast(1)); - const Vec zero_vec(static_cast(0)); + if (it.common_dtype() == kBFloat16) { + auto negcoef = alpha.to() * scale.to(); + auto poscoef = scale.to(); + auto negiptcoef = input_scale.to(); + const Vectorized negcoef_vec(negcoef); + const Vectorized negiptcoef_vec(negiptcoef); + const Vectorized poscoef_vec(poscoef); + const Vectorized one_vec(static_cast(1)); + const Vectorized zero_vec(static_cast(0)); cpu_kernel_vec( - it, - [negcoef, negiptcoef, poscoef](scalar_t a) -> scalar_t { - return a <= scalar_t(0) ? (std::exp(a * negiptcoef) - scalar_t(1)) * negcoef : a * poscoef; - }, - [&negcoef_vec, &negiptcoef_vec, &poscoef_vec, &one_vec, &zero_vec](Vec a) -> Vec { - auto cmp = (a > zero_vec); - if (!cmp.zero_mask()) { // only a * poscoef (which is very quick) needs to be computed - return a * poscoef_vec; - } else { - return Vec::blendv(((a * negiptcoef_vec).exp() - one_vec) * negcoef_vec, a * poscoef_vec, cmp); - } - }); - }); + it, + [negcoef, negiptcoef, poscoef](BFloat16 a) -> BFloat16 { + return float(a) <= float(0) ? 
(std::exp(float(a) * negiptcoef) - float(1)) * negcoef : float(a) * poscoef; + }, + [&negcoef_vec, &negiptcoef_vec, &poscoef_vec, &one_vec, &zero_vec](Vectorized a) -> Vectorized { + Vectorized a0, a1; + std::tie(a0, a1) = convert_bfloat16_float(a); + auto cmp0 = (a0 > zero_vec); + auto cmp1 = (a1 > zero_vec); + if (!cmp0.zero_mask() && !cmp1.zero_mask()) { // only a * poscoef (which is very quick) needs to be computed + return convert_float_bfloat16(a0 * poscoef_vec, a1 * poscoef_vec); + } else { + auto res0 = Vectorized::blendv(((a0 * negiptcoef_vec).exp() - one_vec) * negcoef_vec, a0 * poscoef_vec, cmp0); + auto res1 = Vectorized::blendv(((a1 * negiptcoef_vec).exp() - one_vec) * negcoef_vec, a1 * poscoef_vec, cmp1); + return convert_float_bfloat16(res0, res1); + } + } + ); + } else { + AT_DISPATCH_FLOATING_TYPES(it.dtype(), "elu_cpu", [&]() { + using Vec = Vectorized; + auto negcoef = alpha.to() * scale.to(); + auto poscoef = scale.to(); + auto negiptcoef = input_scale.to(); + const Vec negcoef_vec(negcoef); + const Vec negiptcoef_vec(negiptcoef); + const Vec poscoef_vec(poscoef); + const Vec one_vec(static_cast(1)); + const Vec zero_vec(static_cast(0)); + cpu_kernel_vec( + it, + [negcoef, negiptcoef, poscoef](scalar_t a) -> scalar_t { + return a <= scalar_t(0) ? (std::exp(a * negiptcoef) - scalar_t(1)) * negcoef : a * poscoef; + }, + [&negcoef_vec, &negiptcoef_vec, &poscoef_vec, &one_vec, &zero_vec](Vec a) -> Vec { + auto cmp = (a > zero_vec); + if (!cmp.zero_mask()) { // only a * poscoef (which is very quick) needs to be computed + return a * poscoef_vec; + } else { + return Vec::blendv(((a * negiptcoef_vec).exp() - one_vec) * negcoef_vec, a * poscoef_vec, cmp); + } + }); + }); + } } void elu_backward_kernel(TensorIteratorBase& it, const Scalar& alpha, const Scalar& scale, const Scalar& input_scale, bool is_result) { - AT_DISPATCH_FLOATING_TYPES(it.dtype(), "elu_backward_cpu", [&]() { - using Vec = Vectorized; - auto negcoef = alpha.to() * scale.to(); - auto poscoef = scale.to(); - auto negiptcoef = input_scale.to(); - const Vec negcoef_vec(negcoef); - const Vec negiptcoef_vec(negiptcoef); - const Vec poscoef_vec(poscoef); - const Vec zero_vec(static_cast(0)); + if (it.common_dtype() == kBFloat16) { + auto negcoef = alpha.to() * scale.to(); + auto poscoef = scale.to(); + auto negiptcoef = input_scale.to(); + const Vectorized negcoef_vec(negcoef); + const Vectorized negiptcoef_vec(negiptcoef); + const Vectorized poscoef_vec(poscoef); + const Vectorized zero_vec(static_cast(0)); cpu_kernel_vec( it, - [negcoef, negiptcoef, poscoef, is_result](scalar_t a, scalar_t b) -> scalar_t { + [negcoef, negiptcoef, poscoef, is_result](BFloat16 a, BFloat16 b) -> BFloat16 { if (is_result) { - return b <= scalar_t(0) ? a * negiptcoef * (b + negcoef) : a * poscoef; + return float(b) <= float(0) ? float(a) * negiptcoef * (float(b) + negcoef) : float(a) * poscoef; } else { - return b <= scalar_t(0) ? a * negiptcoef * negcoef * std::exp(b * negiptcoef): a * poscoef; + return float(b) <= float(0) ? 
float(a) * negiptcoef * negcoef * std::exp(float(b) * negiptcoef): float(a) * poscoef; } }, - [&negcoef_vec, &negiptcoef_vec, &poscoef_vec, &zero_vec, is_result](Vec a, Vec b) -> Vec { - auto cmp = (b > zero_vec); + [&negcoef_vec, &negiptcoef_vec, &poscoef_vec, &zero_vec, is_result](Vectorized a, Vectorized b) -> Vectorized { + Vectorized a0, a1; + std::tie(a0, a1) = convert_bfloat16_float(a); + Vectorized b0, b1; + std::tie(b0, b1) = convert_bfloat16_float(b); + auto cmp0 = (b0 > zero_vec); + auto cmp1 = (b1 > zero_vec); if (is_result) { - if (!cmp.zero_mask()) { // only a * poscoef (which is very quick) needs to be computed - return a * poscoef_vec; + if (!cmp0.zero_mask() && !cmp1.zero_mask()) { // only a * poscoef (which is very quick) needs to be computed + return convert_float_bfloat16(a0 * poscoef_vec, a1 * poscoef_vec); } else { - return Vec::blendv(a * negiptcoef_vec * (b + negcoef_vec), a * poscoef_vec, cmp); + auto res0 = Vectorized::blendv(a0 * negiptcoef_vec * (b0 + negcoef_vec), a0 * poscoef_vec, cmp0); + auto res1 = Vectorized::blendv(a1 * negiptcoef_vec * (b1 + negcoef_vec), a1 * poscoef_vec, cmp1); + return convert_float_bfloat16(res0, res1); } } else { - return Vec::blendv(a * negiptcoef_vec * negcoef_vec * (b * negiptcoef_vec).exp(), a * poscoef_vec, cmp); + auto res0 = Vectorized::blendv(a0 * negiptcoef_vec * negcoef_vec * (b0 * negiptcoef_vec).exp(), a0 * poscoef_vec, cmp0); + auto res1 = Vectorized::blendv(a1 * negiptcoef_vec * negcoef_vec * (b1 * negiptcoef_vec).exp(), a1 * poscoef_vec, cmp1); + return convert_float_bfloat16(res0, res1); } } - ); - }); + ); + } else { + AT_DISPATCH_FLOATING_TYPES(it.dtype(), "elu_backward_cpu", [&]() { + using Vec = Vectorized; + auto negcoef = alpha.to() * scale.to(); + auto poscoef = scale.to(); + auto negiptcoef = input_scale.to(); + const Vec negcoef_vec(negcoef); + const Vec negiptcoef_vec(negiptcoef); + const Vec poscoef_vec(poscoef); + const Vec zero_vec(static_cast(0)); + cpu_kernel_vec( + it, + [negcoef, negiptcoef, poscoef, is_result](scalar_t a, scalar_t b) -> scalar_t { + if (is_result) { + return b <= scalar_t(0) ? a * negiptcoef * (b + negcoef) : a * poscoef; + } else { + return b <= scalar_t(0) ? a * negiptcoef * negcoef * std::exp(b * negiptcoef): a * poscoef; + } + }, + [&negcoef_vec, &negiptcoef_vec, &poscoef_vec, &zero_vec, is_result](Vec a, Vec b) -> Vec { + auto cmp = (b > zero_vec); + if (is_result) { + if (!cmp.zero_mask()) { // only a * poscoef (which is very quick) needs to be computed + return a * poscoef_vec; + } else { + return Vec::blendv(a * negiptcoef_vec * (b + negcoef_vec), a * poscoef_vec, cmp); + } + } else { + return Vec::blendv(a * negiptcoef_vec * negcoef_vec * (b * negiptcoef_vec).exp(), a * poscoef_vec, cmp); + } + } + ); + }); + } } // TODO(yangxm): Add another fast kernel using formula // y = 0.5x * (1 + tanh(sqrt(2/Pi) * (x + 0.044715x^3))) // and the fast tanh impl from Eigen. -void GeluKernelImpl(TensorIteratorBase& it) { +void GeluKernelImpl(TensorIteratorBase& it, GeluType approximate) { auto grain_size = at::internal::GRAIN_SIZE; // Numbers based on benchmarking. 
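[Editor's note, not part of the diff] The BFloat16 branches added to log_sigmoid, elu, and the other kernels above all follow one pattern: load a Vectorized<BFloat16>, widen it into two Vectorized<float> lanes, do the math in float, and narrow back on store. A minimal sketch of that pattern for a toy elementwise op (the op and function name are illustrative; convert_bfloat16_float / convert_float_bfloat16 are the ATen vec helpers used in the diff):

#include <ATen/cpu/vec/vec.h>
#include <c10/util/BFloat16.h>
#include <tuple>

// Editor's sketch of the widen-compute-narrow pattern, applied to y = x * x + 1.
inline at::vec::Vectorized<c10::BFloat16> toy_op(at::vec::Vectorized<c10::BFloat16> x) {
  using fVec = at::vec::Vectorized<float>;
  fVec x0, x1;
  std::tie(x0, x1) = at::vec::convert_bfloat16_float(x);  // widen: one bf16 vec -> two float vecs
  const fVec one(1.0f);
  fVec y0 = x0 * x0 + one;                                // compute in float for accuracy
  fVec y1 = x1 * x1 + one;
  return at::vec::convert_float_bfloat16(y0, y1);         // narrow back to bf16 on the way out
}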
// Benchmark: benchmarks/operator_benchmarks/pt/gelu_test.py @@ -187,57 +324,165 @@ void GeluKernelImpl(TensorIteratorBase& it) { if (it.numel() > GELU_MIN_ELEMENTS_FOR_MULTI_THREADING) { grain_size = it.numel() / at::get_num_threads(); } - AT_DISPATCH_FLOATING_TYPES_AND( - ScalarType::BFloat16, it.dtype(), "GeluKernelImpl", [&]() { - using Vec = vec::Vectorized; - const Vec kAlphaVec(scalar_t(M_SQRT1_2)); - const Vec kOneVec(scalar_t(1)); - const Vec kPointFiveVec(scalar_t(0.5)); - cpu_kernel_vec( - it, - [](scalar_t x) { - const scalar_t kAlpha = scalar_t(M_SQRT1_2); - return x * scalar_t(0.5) * (scalar_t(1) + std::erf(x * kAlpha)); - }, - [&](Vec x_vec) { - return x_vec * kPointFiveVec * - (kOneVec + (x_vec * kAlphaVec).erf()); - }, - grain_size); - }); + if (approximate == GeluType::Tanh) { + AT_DISPATCH_FLOATING_TYPES_AND( + ScalarType::BFloat16, it.dtype(), "GeluKernelImpl", [&]() { + using Vec = vec::Vectorized; + const Vec kBetaVec(scalar_t(M_SQRT2 * M_2_SQRTPI * 0.5)); + const Vec kKappaVec(scalar_t(0.044715)); + const Vec kOneVec(scalar_t(1)); + const Vec kPointFiveVec(scalar_t(0.5)); + cpu_kernel_vec( + it, + [](scalar_t x) { + const scalar_t kBeta = M_SQRT2 * M_2_SQRTPI * 0.5; + const scalar_t kKappa = 0.044715; + auto x_cube = x * x * x; + auto inner = kBeta * (x + kKappa * x_cube); + return scalar_t(0.5) * x * (scalar_t(1) + std::tanh(inner)); + }, + [&](Vec x_vec) { + auto x_cube = x_vec * x_vec * x_vec; + auto inner_vec = kBetaVec * (x_vec + kKappaVec * x_cube); + return kPointFiveVec * x_vec * (kOneVec + inner_vec.tanh()); + }, + grain_size); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND( + ScalarType::BFloat16, it.dtype(), "GeluKernelImpl", [&]() { + using Vec = vec::Vectorized; + const Vec kAlphaVec(scalar_t(M_SQRT1_2)); + const Vec kOneVec(scalar_t(1)); + const Vec kPointFiveVec(scalar_t(0.5)); + cpu_kernel_vec( + it, + [](scalar_t x) { + const scalar_t kAlpha = scalar_t(M_SQRT1_2); + return x * scalar_t(0.5) * (scalar_t(1) + std::erf(x * kAlpha)); + }, + [&](Vec x_vec) { + return x_vec * kPointFiveVec * + (kOneVec + (x_vec * kAlphaVec).erf()); + }, + grain_size); + }); + } } -void GeluBackwardKernelImpl(TensorIteratorBase& it) { - AT_DISPATCH_FLOATING_TYPES_AND( - ScalarType::BFloat16, it.dtype(), "GeluBackwardKernelImpl", [&]() { - using Vec = vec::Vectorized; - const Vec kAlphaVec(scalar_t(M_SQRT1_2)); - const Vec kBetaVec(scalar_t(M_2_SQRTPI * M_SQRT1_2 * 0.5)); - const Vec kOneVec(scalar_t(1)); - const Vec kPointFiveVec(scalar_t(0.5)); - const Vec kMinusPointFiveVec(scalar_t(-0.5)); - cpu_kernel_vec( - it, - [](scalar_t dy, scalar_t x) { - const scalar_t kAlpha = scalar_t(M_SQRT1_2); - const scalar_t kBeta = M_2_SQRTPI * M_SQRT1_2 * scalar_t(0.5); - const scalar_t cdf = - scalar_t(0.5) * (scalar_t(1) + std::erf(x * kAlpha)); - const scalar_t pdf = kBeta * std::exp(x * x * scalar_t(-0.5)); - return dy * (cdf + x * pdf); - }, - [&](Vec dy_vec, Vec x_vec) { - const Vec cdf_vec = - kPointFiveVec * (kOneVec + (x_vec * kAlphaVec).erf()); - const Vec pdf_vec = - kBetaVec * (x_vec * x_vec * kMinusPointFiveVec).exp(); - return dy_vec * (cdf_vec + x_vec * pdf_vec); - }); - }); +void GeluBackwardKernelImpl(TensorIteratorBase& it, GeluType approximate) { + if (approximate == GeluType::Tanh) { + AT_DISPATCH_FLOATING_TYPES_AND( + ScalarType::BFloat16, it.dtype(), "GeluBackwardKernelImpl", [&]() { + using Vec = vec::Vectorized; + const Vec kBetaVec(scalar_t(M_SQRT2 * M_2_SQRTPI * 0.5)); + const Vec kKappaVec(scalar_t(0.044715)); + const Vec kOneVec(scalar_t(1)); + const Vec 
kThreeVec(scalar_t(3)); + const Vec kPointFiveVec(scalar_t(0.5)); + cpu_kernel_vec( + it, + [](scalar_t dy, scalar_t x) { + const scalar_t kBeta = M_SQRT2 * M_2_SQRTPI * 0.5; + const scalar_t kKappa = 0.044715; + auto x_sq = x * x; + auto x_cube = x_sq * x; + auto inner = kBeta * (x + kKappa * x_cube); + auto tanh_inner = std::tanh(inner); + + auto left = scalar_t(0.5) * x; + auto right = scalar_t(1) + tanh_inner; + + auto left_derivative = scalar_t(0.5) * right; + + auto tanh_derivative = scalar_t(1) - tanh_inner * tanh_inner; + auto inner_derivative = + kBeta * (scalar_t(1) + scalar_t(3) * kKappa * x_sq); + auto right_derivative = left * tanh_derivative * inner_derivative; + + return dy * (left_derivative + right_derivative); + }, + [&](Vec dy_vec, Vec x_vec) { + auto x_sq = x_vec * x_vec; + auto x_cube = x_vec * x_vec * x_vec; + auto inner_vec = + kBetaVec * (x_vec + kKappaVec * x_cube); + auto tanh_inner_vec = inner_vec.tanh(); + + auto left_vec = kPointFiveVec * x_vec; + auto right_vec = kOneVec + tanh_inner_vec; + + auto left_derivative_vec = kPointFiveVec * right_vec; + + auto tanh_derivative_vec = + kOneVec - tanh_inner_vec * tanh_inner_vec; + auto inner_derivative_vec = + kBetaVec * (kOneVec + kThreeVec * kKappaVec * x_sq); + auto right_derivative_vec = + left_vec * tanh_derivative_vec * inner_derivative_vec; + + return dy_vec * (left_derivative_vec + right_derivative_vec); + }); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND( + ScalarType::BFloat16, it.dtype(), "GeluBackwardKernelImpl", [&]() { + using Vec = vec::Vectorized; + const Vec kAlphaVec(scalar_t(M_SQRT1_2)); + const Vec kBetaVec(scalar_t(M_2_SQRTPI * M_SQRT1_2 * 0.5)); + const Vec kOneVec(scalar_t(1)); + const Vec kPointFiveVec(scalar_t(0.5)); + const Vec kMinusPointFiveVec(scalar_t(-0.5)); + cpu_kernel_vec( + it, + [](scalar_t dy, scalar_t x) { + const scalar_t kAlpha = scalar_t(M_SQRT1_2); + const scalar_t kBeta = M_2_SQRTPI * M_SQRT1_2 * scalar_t(0.5); + const scalar_t cdf = + scalar_t(0.5) * (scalar_t(1) + std::erf(x * kAlpha)); + const scalar_t pdf = kBeta * std::exp(x * x * scalar_t(-0.5)); + return dy * (cdf + x * pdf); + }, + [&](Vec dy_vec, Vec x_vec) { + const Vec cdf_vec = + kPointFiveVec * (kOneVec + (x_vec * kAlphaVec).erf()); + const Vec pdf_vec = + kBetaVec * (x_vec * x_vec * kMinusPointFiveVec).exp(); + return dy_vec * (cdf_vec + x_vec * pdf_vec); + }); + }); + } } void hardsigmoid_kernel(TensorIteratorBase& iter) { - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "hardsigmoid_cpu", [&] { + if (iter.dtype() == kBFloat16) { + const float zero(0.0f); + const float three(3.0f); + const float six(6.0f); + using Vec = vec::Vectorized; + const Vec kZeroVec(zero); + const Vec kThreeVec(three); + const Vec kSixVec(six); + cpu_kernel_vec( + iter, + [&](BFloat16 self_val) -> BFloat16 { + return std::min(std::max(float(self_val) + three, zero), six) / six; + }, + [&](vec::Vectorized self_val) -> vec::Vectorized { + Vectorized self_val0, self_val1; + std::tie(self_val0, self_val1) = convert_bfloat16_float(self_val); + self_val0 = minimum( + maximum(self_val0 + kThreeVec, kZeroVec), + kSixVec + ) / kSixVec; + self_val1 = minimum( + maximum(self_val1 + kThreeVec, kZeroVec), + kSixVec + ) / kSixVec; + return convert_float_bfloat16(self_val0, self_val1); + }); + } else { + AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "hardsigmoid_cpu", [&] { const scalar_t zero(0.0f); const scalar_t three(3.0f); const scalar_t six(6.0f); @@ -257,10 +502,37 @@ void hardsigmoid_kernel(TensorIteratorBase& iter) { ) / kSixVec; }); }); + } } void 
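The tanh-GELU backward above splits the product rule into a "left" and "right" derivative; with t = tanh(inner) it evaluates dy * (0.5*(1 + t) + 0.5*x*(1 - t^2)*kBeta*(1 + 3*kKappa*x^2)). A compact scalar restatement (illustrative only):

#include <cmath>

double gelu_tanh_grad_ref(double dy, double x) {
  const double kBeta  = M_SQRT2 * M_2_SQRTPI * 0.5;
  const double kKappa = 0.044715;
  const double inner  = kBeta * (x + kKappa * x * x * x);
  const double t      = std::tanh(inner);
  const double left_derivative  = 0.5 * (1.0 + t);                         // d/dx of 0.5*x times (1 + t)
  const double right_derivative = 0.5 * x * (1.0 - t * t)                  // d/dx of tanh(inner)
                                  * kBeta * (1.0 + 3.0 * kKappa * x * x);  // chain rule through inner
  return dy * (left_derivative + right_derivative);
}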
hardsigmoid_backward_kernel(TensorIteratorBase& iter) { - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "hardsigmoid_backward", [&] { + if (iter.dtype() == kBFloat16) { + const float zero(0.0f); + const float three(3.0f); + const float neg_three(-3.0f); + const float one_sixth(1.0f / 6.0f); + using Vec = Vectorized; + Vec kZeroVec(0.0f); + Vec kOneSixthVec(1.0f / 6.0f); + cpu_kernel_vec( + iter, + [=](BFloat16 grad_val, BFloat16 self_val) -> BFloat16 { + return (float(self_val) > neg_three && float(self_val) < three) + ? float(grad_val) * one_sixth + : zero; + }, + [=](Vectorized grad_val, Vectorized self_val) -> Vectorized { + Vec self_val0, self_val1, grad_val0, grad_val1; + std::tie(self_val0, self_val1) = convert_bfloat16_float(self_val); + std::tie(grad_val0, grad_val1) = convert_bfloat16_float(grad_val); + Vec gradNonZeroMask = (self_val0 > neg_three) & (self_val0 < three); + self_val0 = Vec::blendv(kZeroVec, grad_val0 * kOneSixthVec, gradNonZeroMask); + gradNonZeroMask = (self_val1 > neg_three) & (self_val1 < three); + self_val1 = Vec::blendv(kZeroVec, grad_val1 * kOneSixthVec, gradNonZeroMask); + return convert_float_bfloat16(self_val0, self_val1); + }); + } else { + AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "hardsigmoid_backward", [&] { const scalar_t zero(0.0f); const scalar_t three(3.0f); const scalar_t neg_three(-3.0f); @@ -280,10 +552,11 @@ void hardsigmoid_backward_kernel(TensorIteratorBase& iter) { return Vec::blendv(kZeroVec, grad_val * kOneSixthVec, gradNonZeroMask); }); }); + } } void hardshrink_kernel(TensorIteratorBase& iter, const Scalar& lambd) { - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "hardshrink_cpu", [&] { + AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16, iter.dtype(), "hardshrink_cpu", [&] { auto lambd_val = lambd.to(); cpu_kernel_vec( iter, @@ -298,16 +571,43 @@ void hardshrink_kernel(TensorIteratorBase& iter, const Scalar& lambd) { } void softshrink_kernel(TensorIteratorBase& iter, const Scalar& lambd) { - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "softshrink_cpu", [&]() { + if (iter.dtype() == kBFloat16) { + auto lambd_val = lambd.to(); + auto lambdVec = Vectorized(lambd_val); + cpu_kernel_vec( + iter, + [=](BFloat16 a) -> BFloat16 { + return float(a) > lambd_val ? a - lambd_val : (float(a) < -lambd_val ? a + lambd_val : float(0)); + }, + [=](Vectorized self_val) { + Vectorized self_val0, self_val1; + Vectorized self_val_t0, self_val_t1; + std::tie(self_val0, self_val1) = convert_bfloat16_float(self_val); + self_val_t0 = convert_float_bfloat16((self_val0 > lambdVec) & (self_val0 - lambdVec), (self_val1 > lambdVec) & (self_val1 - lambdVec)); + self_val_t1 = convert_float_bfloat16((self_val0 < -lambd_val) & (self_val0 + lambdVec), (self_val1 < -lambd_val) & (self_val1 + lambdVec)); + return (self_val_t0 | self_val_t1); + }); + } else { + AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "softshrink_cpu", [&]() { auto lambd_val = lambd.to(); - cpu_kernel(iter, [=](scalar_t a) -> scalar_t { - return a > lambd_val ? a - lambd_val : (a < -lambd_val ? a + lambd_val : scalar_t(0)); - }); + auto lambdVec = Vectorized(lambd_val); + cpu_kernel_vec( + iter, + [=](scalar_t a) -> scalar_t { + return a > lambd_val ? a - lambd_val : (a < -lambd_val ? 
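The hardsigmoid forward/backward pair handled in the BFloat16 branches above reduces to the following scalar reference (illustrative only); the hardshrink/softshrink kernels further down lift BFloat16 to float in the same way:

#include <algorithm>

float hardsigmoid_ref(float x) {
  return std::min(std::max(x + 3.0f, 0.0f), 6.0f) / 6.0f;   // clamp(x + 3, 0, 6) / 6
}

float hardsigmoid_grad_ref(float dy, float x) {
  return (x > -3.0f && x < 3.0f) ? dy * (1.0f / 6.0f) : 0.0f;
}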
a + lambd_val : scalar_t(0)); + }, + [=](Vectorized self_val) { + Vectorized self_val_t0, self_val_t1; + self_val_t0 = (self_val > lambdVec) & (self_val - lambdVec); + self_val_t1 = (self_val < -lambd_val) & (self_val + lambdVec); + return (self_val_t0 | self_val_t1); + }); }); + } } void shrink_backward_kernel(TensorIteratorBase& iter, const Scalar& lambd) { - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "shrink_backward_cpu", [&] { + AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16, iter.dtype(), "shrink_backward_cpu", [&] { auto lambd_val = lambd.to(); cpu_kernel_vec( iter, @@ -337,7 +637,35 @@ void hardtanh_backward_kernel(TensorIterator& iter, const Scalar& min, const Sca } void hardswish_kernel(TensorIterator& iter) { - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "hardswish_cpu", [&]() { + if (iter.dtype() == kBFloat16) { + const float zero(0.0f); + const float three(3.0f); + const float six(6.0f); + using Vec = vec::Vectorized; + const Vec kZeroVec(zero); + const Vec kThreeVec(three); + const Vec kSixVec(six); + cpu_kernel_vec( + iter, + [&](BFloat16 x) -> BFloat16 { + return float(x) * std::min(std::max(float(x) + three, zero), six) / six; + }, + [&](vec::Vectorized x_vec) { + Vectorized x_vec0, x_vec1; + std::tie(x_vec0, x_vec1) = convert_bfloat16_float(x_vec); + x_vec0 = x_vec0 * minimum( + maximum(x_vec0 + kThreeVec, kZeroVec), + kSixVec + ) / kSixVec; + x_vec1 = x_vec1 * minimum( + maximum(x_vec1 + kThreeVec, kZeroVec), + kSixVec + ) / kSixVec; + return convert_float_bfloat16(x_vec0, x_vec1); + } + ); + } else { + AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "hardswish_cpu", [&]() { const scalar_t zero(0.0f); const scalar_t three(3.0f); const scalar_t six(6.0f); @@ -358,10 +686,58 @@ void hardswish_kernel(TensorIterator& iter) { } ); }); + } } void hardswish_backward_kernel(TensorIterator& iter) { - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "hardswish_backward_cpu", [&]() { + if (iter.dtype() == kBFloat16) { + const float zero(0.0f); + const float three(3.0f); + const float neg_three(-3.0f); + const float one_half(0.5f); + using Vec = vec::Vectorized; + const Vec kZeroVec(zero); + const Vec kThreeVec(three); + const Vec kNegThreeVec(neg_three); + const Vec kOneHalfVec(one_half); + cpu_kernel_vec( + iter, + [&](BFloat16 grad_val, BFloat16 self_val) -> BFloat16 { + if (float(self_val) < neg_three) { + return zero; + } else if (float(self_val) <= three) { + return float(grad_val) * ((float(self_val) / three) + one_half); + } else { + return grad_val; + } + }, + [&](vec::Vectorized grad_val, vec::Vectorized self_val) { + Vectorized self_val0, self_val1, grad_val0, grad_val1; + std::tie(self_val0, self_val1) = convert_bfloat16_float(self_val); + std::tie(grad_val0, grad_val1) = convert_bfloat16_float(grad_val); + self_val0 = Vec::blendv( + Vec::blendv( + grad_val0 * ((self_val0 / kThreeVec) + kOneHalfVec), + grad_val0, + self_val0 >= kThreeVec + ), + kZeroVec, + self_val0 < kNegThreeVec + ); + self_val1 = Vec::blendv( + Vec::blendv( + grad_val1 * ((self_val1 / kThreeVec) + kOneHalfVec), + grad_val1, + self_val1 >= kThreeVec + ), + kZeroVec, + self_val1 < kNegThreeVec + ); + return convert_float_bfloat16(self_val0, self_val1); + } + ); + } else { + AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "hardswish_backward_cpu", [&]() { const scalar_t zero(0.0f); const scalar_t three(3.0f); const scalar_t neg_three(-3.0f); @@ -395,6 +771,7 @@ void hardswish_backward_kernel(TensorIterator& iter) { } ); }); + } } static void leaky_relu_kernel(TensorIteratorBase& iter, const Scalar& negval_) { @@ -475,7 +852,28 @@ static 
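The blendv cascade in the hardswish forward/backward branches above implements this piecewise function; a scalar reference (illustrative only):

#include <algorithm>

float hardswish_ref(float x) {
  return x * std::min(std::max(x + 3.0f, 0.0f), 6.0f) / 6.0f;   // x * relu6(x + 3) / 6
}

float hardswish_grad_ref(float dy, float x) {
  if (x < -3.0f) return 0.0f;                     // flat region
  if (x <= 3.0f) return dy * (x / 3.0f + 0.5f);   // quadratic region of the forward
  return dy;                                      // identity region
}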
void leaky_relu_backward_kernel(TensorIteratorBase& iter, const Scalar& n } void softplus_kernel(TensorIteratorBase& iter, const Scalar& beta_, const Scalar& threshold_) { - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "softplus_cpu", [&]() { + if (iter.dtype() == kBFloat16) { + using Vec = Vectorized; + auto beta = beta_.to(); + auto threshold = threshold_.to(); + const Vec beta_vec(beta); + const Vec threshold_vec(threshold); + cpu_kernel_vec( + iter, + [beta, threshold](BFloat16 a) -> BFloat16 { + return (float(a) * beta) > threshold ? a + : static_cast((std::log1p(std::exp(float(a) * beta))) / beta); + }, + [beta_vec, threshold_vec](Vectorized a) -> Vectorized { + Vectorized a0, a1; + std::tie(a0, a1) = convert_bfloat16_float(a); + a0 = Vec::blendv((a0 * beta_vec).exp().log1p() / beta_vec, a0, (a0 * beta_vec) > threshold_vec); + a1 = Vec::blendv((a1 * beta_vec).exp().log1p() / beta_vec, a1, (a1 * beta_vec) > threshold_vec); + return convert_float_bfloat16(a0, a1); + } + ); + } else { + AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "softplus_cpu", [&]() { using Vec = Vectorized; auto beta = beta_.to(); auto threshold = threshold_.to(); @@ -492,10 +890,36 @@ void softplus_kernel(TensorIteratorBase& iter, const Scalar& beta_, const Scalar } ); }); + } } void softplus_backward_kernel(TensorIteratorBase& iter, const Scalar& beta_, const Scalar& threshold_) { - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "softplus_backward_cpu", [&]() { + if (iter.dtype() == kBFloat16) { + using Vec = Vectorized; + auto beta = beta_.to(); + auto threshold = threshold_.to(); + const Vec beta_vec(beta); + const Vec threshold_vec(threshold); + const Vec one_vec(static_cast(1.0)); + cpu_kernel_vec( + iter, + [beta, threshold](BFloat16 a, BFloat16 b) -> BFloat16 { + float z = std::exp(float(b) * beta); + return (float(b) * beta) > threshold ? 
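The softplus branches above (forward here, backward continuing just below) implement log1p(exp(beta*x))/beta with a large-argument short-circuit; a scalar reference of both that matches the kernel's threshold logic (illustrative only):

#include <cmath>

float softplus_ref(float x, float beta, float threshold) {
  // Once beta*x exceeds the threshold, log1p(exp(beta*x))/beta == x to float precision.
  return (x * beta) > threshold ? x : std::log1p(std::exp(x * beta)) / beta;
}

float softplus_grad_ref(float dy, float x, float beta, float threshold) {
  const float z = std::exp(x * beta);
  return (x * beta) > threshold ? dy : dy * z / (z + 1.0f);
}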
a : static_cast(float(a) * z / (z + float(1.))); + }, + [beta_vec, one_vec, threshold_vec](Vectorized a, Vectorized b) -> Vectorized { + Vectorized a0, a1, b0, b1; + std::tie(a0, a1) = convert_bfloat16_float(a); + std::tie(b0, b1) = convert_bfloat16_float(b); + Vec z = (b0 * beta_vec).exp(); + a0 = Vec::blendv(a0 * z / (z + one_vec), a0, (b0 * beta_vec) > threshold_vec); + z = (b1 * beta_vec).exp(); + a1 = Vec::blendv(a1 * z / (z + one_vec), a1, (b1 * beta_vec) > threshold_vec); + return convert_float_bfloat16(a0, a1); + } + ); + } else { + AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "softplus_backward_cpu", [&]() { using Vec = Vectorized; auto beta = beta_.to(); auto threshold = threshold_.to(); @@ -514,10 +938,29 @@ void softplus_backward_kernel(TensorIteratorBase& iter, const Scalar& beta_, con } ); }); + } } void glu_kernel(TensorIteratorBase& iter) { - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "glu_cpu", [&] { + if (iter.dtype() == kBFloat16) { + const float float_one_val(1); + const Vectorized float_one_vec(float_one_val); + cpu_kernel_vec( + iter, + [float_one_val](BFloat16 a, BFloat16 b) -> BFloat16 { + return float(a) * (float_one_val / (float_one_val + std::exp(- float(b)))); + }, + [float_one_vec](Vectorized a, Vectorized b) -> Vectorized { + Vectorized a0, a1, b0, b1; + std::tie(a0, a1) = convert_bfloat16_float(a); + std::tie(b0, b1) = convert_bfloat16_float(b); + a0 = a0 * (float_one_vec / (float_one_vec + b0.neg().exp())); + a1 = a1 * (float_one_vec / (float_one_vec + b1.neg().exp())); + return convert_float_bfloat16(a0, a1); + } + ); + } else { + AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "glu_cpu", [&] { using Vec = Vectorized; const scalar_t one_val(1); const Vec one_vec(one_val); @@ -531,25 +974,65 @@ void glu_kernel(TensorIteratorBase& iter) { } ); }); + } } -void glu_backward_kernel(TensorIterator& iter) { - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "glu_backward_cpu", [&] { +void glu_jvp_kernel(TensorIteratorBase& iter) { + AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "glu_jvp_cpu", [&] { using Vec = Vectorized; - const scalar_t one_val(1); - const Vec one_vec(one_val); + const scalar_t one(1); + const Vec ones(one); cpu_kernel_vec( iter, - [one_val](scalar_t a, scalar_t b, scalar_t c) -> scalar_t { - return (one_val - a) * a * b * c; + [one](scalar_t res, scalar_t b, scalar_t da, scalar_t db) -> scalar_t { + const auto sig_b = one / (one + std::exp(-b)); + return da * sig_b + res * (db - sig_b * db); }, - [one_vec](Vec a, Vec b, Vec c) -> Vec { - return (one_vec - a) * a * b * c; + [ones](Vec res, Vec b, Vec da, Vec db) -> Vec { + const auto sig_b = ones / (ones + b.neg().exp()); + return da * sig_b + res * (db - sig_b * db); } ); }); } +void glu_backward_kernel(TensorIterator& iter) { + if (iter.dtype() == kBFloat16) { + const float float_one_val(1); + const Vectorized float_one_vec(float_one_val); + cpu_kernel_vec( + iter, + [float_one_val](BFloat16 a, BFloat16 b, BFloat16 c) -> BFloat16 { + return (float_one_val - float(a)) * float(a) * float(b) * float(c); + }, + [float_one_vec](Vectorized a, Vectorized b, Vectorized c) -> Vectorized { + Vectorized a0, a1, b0, b1, c0, c1; + std::tie(a0, a1) = convert_bfloat16_float(a); + std::tie(b0, b1) = convert_bfloat16_float(b); + std::tie(c0, c1) = convert_bfloat16_float(c); + a0 = (float_one_vec - a0) * a0 * b0 * c0; + a1 = (float_one_vec - a1) * a1 * b1 * c1; + return convert_float_bfloat16(a0, a1); + } + ); + } else { + AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "glu_backward_cpu", [&] { + using Vec = Vectorized; + const scalar_t 
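The GLU kernels above all build on the same sigmoid gate; a scalar reference for the forward and for the newly added glu_jvp kernel (argument names mirror the lambdas in the patch; illustrative only):

#include <cmath>

float glu_ref(float a, float b) {
  return a * (1.0f / (1.0f + std::exp(-b)));   // a * sigmoid(b)
}

// Forward-mode derivative used by glu_jvp_kernel: res is the primal output a*sigmoid(b),
// da and db are the tangents of the two input halves.
float glu_jvp_ref(float res, float b, float da, float db) {
  const float sig_b = 1.0f / (1.0f + std::exp(-b));
  return da * sig_b + res * (db - sig_b * db);
}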
one_val(1); + const Vec one_vec(one_val); + cpu_kernel_vec( + iter, + [one_val](scalar_t a, scalar_t b, scalar_t c) -> scalar_t { + return (one_val - a) * a * b * c; + }, + [one_vec](Vec a, Vec b, Vec c) -> Vec { + return (one_vec - a) * a * b * c; + } + ); + }); + } +} + void silu_kernel(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1( kBFloat16, iter.dtype(), "silu_cpu", [&]() { @@ -618,6 +1101,65 @@ void mish_backward_kernel(TensorIterator& iter) { }); } +void prelu_cpu_kernel(TensorIterator& iter) { + if (iter.common_dtype() == kBFloat16) { + auto zero_vec = Vectorized((float)(0)); + auto one_vec = Vectorized((float)(1)); + cpu_kernel_vec( + iter, + [=](BFloat16 input, BFloat16 weight) -> BFloat16 { + return (float(input) > float(0)) ? float(input) : float(weight) * float(input); + }, + [=](Vectorized input, Vectorized weight) -> Vectorized { + Vectorized input0, input1; + Vectorized weight0, weight1; + std::tie(input0, input1) = convert_bfloat16_float(input); + std::tie(weight0, weight1) = convert_bfloat16_float(weight); + + auto res0 = input0 * (Vectorized::blendv(weight0, one_vec, input0 > zero_vec)); + auto res1 = input1 * (Vectorized::blendv(weight1, one_vec, input1 > zero_vec)); + return convert_float_bfloat16(res0, res1); + }); + } else { + AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "prelu_cpu", [&]() { + using Vec = Vectorized; + auto zero_vec = Vec((scalar_t)(0)); + auto one_vec = Vec((scalar_t)(1)); + cpu_kernel_vec( + iter, + [=](scalar_t input, scalar_t weight) { + return (input > scalar_t(0)) ? input : weight * input; + }, + [=](Vec input, Vec weight) { + auto r = Vec::blendv(weight, one_vec, input > zero_vec); + return input * r; + }); + }); + } +} + +void prelu_backward_cpu_kernel(TensorIterator& iter) { + if (iter.common_dtype() == kBFloat16) { + cpu_kernel_multiple_outputs( + iter, + [=](BFloat16 input, BFloat16 grad_out, BFloat16 weight) -> std::tuple { + float input_grad = (float(input) > float(0)) ? float(grad_out) : float(weight) * float(grad_out); + float weight_grad_collector = (float(input) > float(0)) ? float(0) : float(input) * float(grad_out); + return std::tuple(input_grad, weight_grad_collector); + }); + } else { + AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "prelu_backward_cpu", [&]() { + cpu_kernel_multiple_outputs( + iter, + [=](scalar_t input, scalar_t grad_out, scalar_t weight) -> std::tuple { + scalar_t input_grad = (input > scalar_t(0)) ? grad_out : weight * grad_out; + scalar_t weight_grad_collector = (input > scalar_t(0)) ? 
scalar_t(0) : input * grad_out; + return std::tuple(input_grad, weight_grad_collector); + }); + }); + } +} + } // namespace REGISTER_DISPATCH(log_sigmoid_cpu_stub, &log_sigmoid_cpu_kernel); @@ -641,10 +1183,13 @@ REGISTER_DISPATCH(softplus_stub, &softplus_kernel); REGISTER_DISPATCH(softplus_backward_stub, &softplus_backward_kernel); REGISTER_DISPATCH(glu_stub, &glu_kernel); REGISTER_DISPATCH(glu_backward_stub, &glu_backward_kernel); +REGISTER_DISPATCH(glu_jvp_stub, &glu_jvp_kernel); REGISTER_DISPATCH(silu_stub, &silu_kernel); REGISTER_DISPATCH(silu_backward_stub, &silu_backward_kernel); REGISTER_DISPATCH(mish_stub, &mish_kernel); REGISTER_DISPATCH(mish_backward_stub, &mish_backward_kernel); +REGISTER_DISPATCH(prelu_cpu_stub, &prelu_cpu_kernel); +REGISTER_DISPATCH(prelu_backward_cpu_stub, &prelu_backward_cpu_kernel); } // namespace native } // namespace at diff --git a/aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp b/aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp index dd131a1e2a89..b121e2390258 100644 --- a/aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp +++ b/aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp @@ -1,4 +1,5 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include diff --git a/aten/src/ATen/native/cpu/AdaptiveMaxPoolKernel.cpp b/aten/src/ATen/native/cpu/AdaptiveMaxPoolKernel.cpp index 1de76289bf32..3f4038685da4 100644 --- a/aten/src/ATen/native/cpu/AdaptiveMaxPoolKernel.cpp +++ b/aten/src/ATen/native/cpu/AdaptiveMaxPoolKernel.cpp @@ -1,4 +1,5 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include diff --git a/aten/src/ATen/native/cpu/AvgPoolKernel.cpp b/aten/src/ATen/native/cpu/AvgPoolKernel.cpp index e0b8551a0a55..df51715e1632 100644 --- a/aten/src/ATen/native/cpu/AvgPoolKernel.cpp +++ b/aten/src/ATen/native/cpu/AvgPoolKernel.cpp @@ -1,5 +1,5 @@ -#include - +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include #include diff --git a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp index f2410947de16..22c82237637f 100644 --- a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp @@ -21,27 +21,6 @@ namespace { using namespace vec; -// Note: Undefined behavior when performing addition is intentionally -// ignored. 
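To summarize the PReLU CPU kernels registered above in Activation.cpp: the per-element forward and backward reduce to the following scalar reference (illustrative only; the backward returns both the input gradient and the per-element weight-gradient contribution that is reduced later):

#include <tuple>

float prelu_ref(float x, float w) {
  return x > 0.0f ? x : w * x;
}

std::tuple<float, float> prelu_grad_ref(float x, float dy, float w) {
  const float input_grad            = x > 0.0f ? dy : w * dy;
  const float weight_grad_collector = x > 0.0f ? 0.0f : x * dy;
  return std::make_tuple(input_grad, weight_grad_collector);
}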
-void add_kernel(TensorIteratorBase& iter, const Scalar& alpha_scalar) { - if (iter.dtype() == ScalarType::Bool) { - using scalar_t = bool; - auto alpha = alpha_scalar.to(); - cpu_kernel(iter, - [=](scalar_t a, scalar_t b) __ubsan_ignore_undefined__ -> scalar_t { return a + alpha * b; }); - } else { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, iter.dtype(), "add_cpu/sub_cpu", [&]() { - auto alpha = alpha_scalar.to(); - auto alpha_vec = Vectorized(alpha); - cpu_kernel_vec(iter, - [=](scalar_t a, scalar_t b) __ubsan_ignore_undefined__ -> scalar_t { return a + alpha * b; }, - [=](Vectorized a, Vectorized b) __ubsan_ignore_undefined__ { - return vec::fmadd(b, alpha_vec, a); - }); - }); - } -} - void add_clamp_kernel(TensorIterator& iter, const Scalar& alpha_scalar, const Scalar& min_val, const Scalar& max_val) { AT_DISPATCH_ALL_TYPES(iter.dtype(), "add_clamp_cpu", [&]() { auto alpha = alpha_scalar.to(); @@ -64,7 +43,7 @@ void add_clamp_kernel(TensorIterator& iter, const Scalar& alpha_scalar, const Sc } void atan2_kernel(TensorIteratorBase& iter) { - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "atan2_cpu", [&]() { + AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16, iter.dtype(), "atan2_cpu", [&]() { cpu_kernel_vec(iter, [=](scalar_t a, scalar_t b) -> scalar_t { return std::atan2(a, b); }, @@ -74,15 +53,17 @@ void atan2_kernel(TensorIteratorBase& iter) { }); } -// Note: Undefined behavior when performing subtraction is intentionally -// ignored. -void sub_kernel(TensorIteratorBase& iter, const Scalar& alpha_scalar) __ubsan_ignore_undefined__ { - add_kernel(iter, -alpha_scalar); -} - void mul_kernel(TensorIteratorBase& iter) { if (iter.dtype() == ScalarType::Bool) { cpu_kernel(iter, [=](bool a, bool b) -> bool { return a && b; }); + } else if (iter.dtype() == kComplexHalf) { + cpu_kernel( + iter, + [=](c10::complex a, + c10::complex b) -> c10::complex { + using comp_t = c10::complex; + return comp_t{a} * comp_t{b}; + }); } else { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, iter.dtype(), "mul_cpu", [&]() { cpu_kernel_vec(iter, @@ -331,26 +312,12 @@ void bitwise_xor_kernel(TensorIteratorBase& iter) { } void lshift_kernel(TensorIteratorBase& iter) { - if (iter.dtype() == ScalarType::Float || iter.dtype() == ScalarType::Double) { - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "lshift_cpu", [&]() { - auto base_vec = Vectorized((scalar_t)(2)); - cpu_kernel_vec( - iter, - [=](scalar_t a, scalar_t b) -> scalar_t { - return a * std::pow((scalar_t)(2), b); - }, - [=](Vectorized a, Vectorized b) { - return a * base_vec.pow(b); - }); - }); - } else { - AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "lshift_cpu", [&]() { - cpu_kernel(iter, - [](scalar_t a, scalar_t b) -> scalar_t { - return static_cast>(a) << b; - }); + AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "lshift_cpu", [&]() { + cpu_kernel(iter, + [](scalar_t a, scalar_t b) -> scalar_t { + return static_cast>(a) << b; }); - } + }); } void logical_and_kernel(TensorIterator& iter) { @@ -411,26 +378,12 @@ void logical_xor_kernel(TensorIterator& iter) { } void rshift_kernel(TensorIteratorBase& iter) { - if (iter.dtype() == ScalarType::Float || iter.dtype() == ScalarType::Double) { - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "rshift_cpu", [&]() { - auto base_vec = Vectorized((scalar_t)(2)); - cpu_kernel_vec( - iter, - [=](scalar_t a, scalar_t b) -> scalar_t { - return a / std::pow((scalar_t)(2), b); - }, - [=](Vectorized a, Vectorized b) { - return a / base_vec.pow(b); + AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "rshift_cpu", [&]() { + cpu_kernel(iter, + 
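Two behavioural notes on the BinaryOpsKernel changes above: mul_kernel gains a ComplexHalf path that widens to c10::complex<float> for the multiply, and lshift/rshift drop their floating-point emulation so they now dispatch only over integral types. A sketch of the widening multiply, mirroring the lambda in the hunk (the standalone helper name is illustrative):

#include <c10/util/Half.h>
#include <c10/util/complex.h>

c10::complex<c10::Half> mul_complex_half(c10::complex<c10::Half> a,
                                         c10::complex<c10::Half> b) {
  using comp_t = c10::complex<float>;
  return comp_t{a} * comp_t{b};   // compute in complex<float>, narrow on return
}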
[](scalar_t a, scalar_t b) -> scalar_t { + return a >> b; }); - }); - } else { - AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "rshift_cpu", [&]() { - cpu_kernel(iter, - [](scalar_t a, scalar_t b) -> scalar_t { - return a >> b; - }); - }); - } + }); } void lt_kernel(TensorIteratorBase& iter) { @@ -528,18 +481,18 @@ void ge_kernel(TensorIteratorBase& iter) { void eq_kernel(TensorIteratorBase& iter) { // See Note [special-case bool outputs] if (iter.dtype() == ScalarType::Bool) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, iter.common_dtype(), "eq_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(kComplexHalf, kBool, kBFloat16, kHalf, iter.common_dtype(), "eq_cpu", [&]() { cpu_kernel(iter, [](scalar_t a, scalar_t b) -> bool { return a == b; }); }); } else { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, iter.common_dtype(), "eq_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kComplexHalf, kBFloat16, kHalf, iter.common_dtype(), "eq_cpu", [&]() { cpu_kernel_vec( iter, [](scalar_t a, scalar_t b) -> scalar_t { - return a == b; + return static_cast(a == b); }, [](Vectorized a, Vectorized b) -> Vectorized { return a.eq(b); @@ -652,8 +605,33 @@ void fmin_kernel(TensorIteratorBase& iter) { } void smooth_l1_kernel(TensorIteratorBase& iter, double beta) { - AT_DISPATCH_FLOATING_TYPES_AND2( - kBFloat16, kHalf, iter.dtype(), "smooth_l1_cpu", [&]() { + if (iter.dtype() == kBFloat16) { + const float beta_val(beta); + const Vectorized beta_val_vec(beta_val); + const Vectorized point_five_vec(static_cast(0.5)); + cpu_kernel_vec( + iter, + [&beta_val](BFloat16 a, BFloat16 b) -> BFloat16 { + auto z = std::abs(float(a) - float(b)); + return z < beta_val + ? static_cast(0.5) * z * z / beta_val + : z - static_cast(0.5) * beta_val; + }, + [&beta_val_vec, &point_five_vec](Vectorized a, Vectorized b) { + Vectorized a0, a1, b0, b1; + std::tie(a0, a1) = convert_bfloat16_float(a); + std::tie(b0, b1) = convert_bfloat16_float(b); + auto z = (a0 - b0).abs(); + a0 = Vectorized::blendv( + point_five_vec * z * z / beta_val_vec, z - point_five_vec * beta_val_vec, z >= beta_val_vec); + z = (a1 - b1).abs(); + a1 = Vectorized::blendv( + point_five_vec * z * z / beta_val_vec, z - point_five_vec * beta_val_vec, z >= beta_val_vec); + return convert_float_bfloat16(a0, a1); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND( + kHalf, iter.dtype(), "smooth_l1_cpu", [&]() { using Vec = Vectorized; const scalar_t beta_val(beta); const Vec beta_val_vec(beta_val); @@ -672,6 +650,7 @@ void smooth_l1_kernel(TensorIteratorBase& iter, double beta) { point_five_vec * z * z / beta_val_vec, z - point_five_vec * beta_val_vec, z >= beta_val_vec); }); }); + } } void huber_kernel(TensorIterator& iter, double delta) { @@ -836,7 +815,7 @@ void tanh_backward_kernel(TensorIteratorBase& iter) { } } -void mse_kernel(TensorIterator& iter) { +void mse_kernel(TensorIteratorBase& iter) { if (iter.dtype() == ScalarType::Half) { TORCH_WARN_ONCE("Applying the CPU mse kernel on half-type tensors. 
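The smooth-L1 kernel above (and its new BFloat16 fast path) computes the usual beta-smoothed loss; a scalar reference matching the kernel's lambdas (illustrative only):

#include <cmath>

float smooth_l1_ref(float a, float b, float beta) {
  const float z = std::fabs(a - b);
  return z < beta ? 0.5f * z * z / beta   // quadratic region near zero
                  : z - 0.5f * beta;      // linear region
}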
" "This may be slower than using float or double-type tensors."); @@ -864,7 +843,7 @@ void fmod_kernel(TensorIteratorBase& iter) { }); }); } else { - AT_DISPATCH_FLOATING_TYPES_AND(kHalf, iter.common_dtype(), "fmod_cpu", [&]() { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "fmod_cpu", [&]() { cpu_kernel_vec( iter, [](scalar_t x, scalar_t d) -> scalar_t { @@ -1133,9 +1112,7 @@ void zeta_kernel(TensorIteratorBase& iter) { } // namespace -REGISTER_DISPATCH(add_stub, &add_kernel); REGISTER_DISPATCH(add_clamp_stub, &add_clamp_kernel); -REGISTER_DISPATCH(sub_stub, &sub_kernel); REGISTER_DISPATCH(mul_stub, &mul_kernel); REGISTER_DISPATCH(div_true_stub, &div_true_kernel); REGISTER_DISPATCH(div_trunc_stub, &div_trunc_kernel); diff --git a/aten/src/ATen/native/cpu/BlasKernel.cpp b/aten/src/ATen/native/cpu/BlasKernel.cpp index c5c938818d0d..7b60e9a45cba 100644 --- a/aten/src/ATen/native/cpu/BlasKernel.cpp +++ b/aten/src/ATen/native/cpu/BlasKernel.cpp @@ -1,3 +1,4 @@ +#define TORCH_ASSERT_NO_OPERATORS #include #include #include @@ -190,19 +191,28 @@ void cpublas_gemm_impl( } void cpublas_axpy_impl(at::ScalarType type, int64_t n, const Scalar& _a, const void *_x, int64_t incx, void *_y, int64_t incy){ - AT_DISPATCH_ALL_TYPES_AND_COMPLEX(type, "cpublas_axpy_impl", - [&] { - auto a = _a.to(); - auto x = static_cast(_x); - auto y = static_cast(_y); + if (type == at::kBool) { + auto a = _a.to(); + auto x = static_cast(_x); + auto y = static_cast(_y); int64_t i; for(i = 0; i < n; i++) - y[i*incy] += a*x[i*incx]; - }); + y[i*incy] |= a & x[i*incx]; + } else { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(at::kHalf, at::kBFloat16, type, "cpublas_axpy_impl", + [&] { + auto a = _a.to(); + auto x = static_cast(_x); + auto y = static_cast(_y); + int64_t i; + for(i = 0; i < n; i++) + y[i*incy] += a*x[i*incx]; + }); + } } void cpublas_copy_impl(at::ScalarType type, int64_t n, const void *_x, int64_t incx, void *_y, int64_t incy){ - AT_DISPATCH_ALL_TYPES_AND_COMPLEX(type, "cpublas_copy_impl", + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::kHalf, at::kBFloat16, at::kBool, type, "cpublas_copy_impl", [&] { auto x = static_cast(_x); auto y = static_cast(_y); diff --git a/aten/src/ATen/native/cpu/CatKernel.cpp b/aten/src/ATen/native/cpu/CatKernel.cpp index f9ddc5ef329c..c4fa1bb05405 100644 --- a/aten/src/ATen/native/cpu/CatKernel.cpp +++ b/aten/src/ATen/native/cpu/CatKernel.cpp @@ -1,4 +1,5 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include @@ -20,15 +21,15 @@ struct InputMeta { }; template -void cat_serial_kernel_impl(Tensor& result, TensorList tensors, int64_t dim) { +void cat_serial_kernel_impl(const Tensor& result, const MaterializedITensorListRef& tensors, int64_t dim) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( dim >= 0 && dim < result.dim(), "dim out of range in cat_serial_kernel_impl"); int64_t outer = result.numel() / (result.sizes()[dim] * result.strides()[dim]); scalar_t* result_data = result.data_ptr(); - int64_t ninputs = tensors.size(); + int64_t ninputs = static_cast(tensors.size()); std::vector inputs; inputs.reserve(ninputs); - for (auto const &tensor : tensors) { + for (const Tensor& tensor : tensors) { inputs.emplace_back(tensor, dim, result.strides()[dim]); } @@ -54,7 +55,7 @@ void cat_serial_kernel_impl(Tensor& result, TensorList tensors, int64_t dim) { } } -void cat_serial_kernel(Tensor& result, TensorList tensors, int64_t dim) { +void cat_serial_kernel(const Tensor& result, const MaterializedITensorListRef& tensors, int64_t dim) { 
AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, result.scalar_type(), "cat_serial_kernel", [&]() { cat_serial_kernel_impl(result, tensors, dim); }); diff --git a/aten/src/ATen/native/cpu/CatKernel.h b/aten/src/ATen/native/cpu/CatKernel.h index 6b9d40c6d630..aedb4aec4f57 100644 --- a/aten/src/ATen/native/cpu/CatKernel.h +++ b/aten/src/ATen/native/cpu/CatKernel.h @@ -1,11 +1,12 @@ #pragma once -#include +#include #include +#include namespace at { namespace native { -using cat_serial_fn = void(*)(Tensor &, TensorList, int64_t); +using cat_serial_fn = void(*)(const Tensor &, const MaterializedITensorListRef&, int64_t); DECLARE_DISPATCH(cat_serial_fn, cat_serial_stub); }} // namespace at::native diff --git a/aten/src/ATen/native/cpu/ComplexKernel.cpp b/aten/src/ATen/native/cpu/ComplexKernel.cpp index 6b78645db1d5..99dc6134537e 100644 --- a/aten/src/ATen/native/cpu/ComplexKernel.cpp +++ b/aten/src/ATen/native/cpu/ComplexKernel.cpp @@ -1,3 +1,4 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include #include @@ -8,7 +9,7 @@ namespace native { namespace { void complex_kernel(TensorIterator& iter) { - AT_DISPATCH_FLOATING_TYPES(iter.input_dtype(), "complex_cpu", [&]() { + AT_DISPATCH_FLOATING_TYPES_AND(kHalf, iter.input_dtype(), "complex_cpu", [&]() { cpu_kernel(iter, [=](scalar_t a, scalar_t b) -> c10::complex { return c10::complex(a, b); }); diff --git a/aten/src/ATen/native/cpu/CopyKernel.cpp b/aten/src/ATen/native/cpu/CopyKernel.cpp index 0b9992890c67..40a0c20b5ca8 100644 --- a/aten/src/ATen/native/cpu/CopyKernel.cpp +++ b/aten/src/ATen/native/cpu/CopyKernel.cpp @@ -72,7 +72,7 @@ void copy_same_dtype(TensorIteratorBase &iter, bool requires_conj, bool requires } } -void copy_kernel(TensorIterator& iter, bool non_blocking) { +void copy_kernel(TensorIterator& iter, bool /*non_blocking*/) { ScalarType dtype = iter.dtype(0); const bool requires_conj = ( isComplexType(dtype) && (iter.tensor_base(0).is_conj() != iter.tensor_base(1).is_conj())); @@ -81,9 +81,9 @@ void copy_kernel(TensorIterator& iter, bool non_blocking) { if (dtype == iter.dtype(1)) { copy_same_dtype(iter, requires_conj, requires_neg); } else { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(ScalarType::Half, ScalarType::Bool, ScalarType::BFloat16, dtype, "copy_", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(ScalarType::ComplexHalf, ScalarType::Half, ScalarType::Bool, ScalarType::BFloat16, dtype, "copy_", [&] { using dest_t = scalar_t; - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(ScalarType::Half, ScalarType::Bool, ScalarType::BFloat16, iter.dtype(1), "copy_", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(ScalarType::ComplexHalf, ScalarType::Half, ScalarType::Bool, ScalarType::BFloat16, iter.dtype(1), "copy_", [&] { // Note (@zasdfgbnm): // // The code below can not be simplified as diff --git a/aten/src/ATen/native/cpu/CrossKernel.cpp b/aten/src/ATen/native/cpu/CrossKernel.cpp index 99a4402d51ee..1511d17fce78 100644 --- a/aten/src/ATen/native/cpu/CrossKernel.cpp +++ b/aten/src/ATen/native/cpu/CrossKernel.cpp @@ -1,3 +1,4 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include @@ -5,8 +6,10 @@ #include #include +#include #include #include +#include #include #include namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp b/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp index 9ab2e860d895..9fb24db673d5 100644 --- a/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp +++ b/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp @@ -1,8 +1,16 @@ +#define 
TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include +#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif + #ifdef __ARM_NEON__ #include #endif diff --git a/aten/src/ATen/native/cpu/DepthwiseConvKernel.h b/aten/src/ATen/native/cpu/DepthwiseConvKernel.h index 7ef848032af3..56956b443386 100644 --- a/aten/src/ATen/native/cpu/DepthwiseConvKernel.h +++ b/aten/src/ATen/native/cpu/DepthwiseConvKernel.h @@ -1,6 +1,5 @@ #pragma once -#include #include /* @@ -8,6 +7,8 @@ */ namespace at { +class Tensor; + namespace native { using convolution_depthwise3x3_winograd_fn = diff --git a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp index 2058ca482ea0..98404005c551 100644 --- a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp @@ -1,11 +1,12 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include -#include #include +#include #include #include +#include #include #include @@ -91,7 +92,7 @@ struct Dist { struct zdist_calc { static inline data_t map(const data_t& diff, const data_t& p) { return min(ceil(abs(diff)), 1); } static inline data_t red(const data_t& agg, const data_t& up) { return agg + up; } - static inline scalar_t finish(const scalar_t agg, const scalar_t p) { return agg; } + static inline scalar_t finish(const scalar_t agg, const scalar_t /*p*/) { return agg; } }; // One norm @@ -99,8 +100,8 @@ struct Dist { struct odist_calc { static inline data_t map(const data_t& diff, const data_t& p) { return diff; } static inline data_t red(const data_t& agg, const data_t& up) { return agg + up; } - static inline scalar_t finish(const scalar_t agg, const scalar_t p) { return agg; } - static inline Vec backward(const Vec& diff, const scalar_t grad, const scalar_t dist, const Vec& p) { return Vec(grad) * sign(diff); } + static inline scalar_t finish(const scalar_t agg, const scalar_t /*p*/) { return agg; } + static inline Vec backward(const Vec& diff, const scalar_t grad, const scalar_t /*dist*/, const Vec& /*p*/) { return Vec(grad) * sign(diff); } }; // Special general pnorm derivative if p is less than two diff --git a/aten/src/ATen/native/cpu/DistributionKernels.cpp b/aten/src/ATen/native/cpu/DistributionKernels.cpp index f6803e5a3994..4363cc9d62e3 100644 --- a/aten/src/ATen/native/cpu/DistributionKernels.cpp +++ b/aten/src/ATen/native/cpu/DistributionKernels.cpp @@ -1,13 +1,19 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include #include #include #include -#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + #include #include #include @@ -25,22 +31,22 @@ static void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, templates::cpu::cauchy_kernel(iter, median, sigma, generator); } -void bernoulli_tensor_kernel(Tensor& self, const Tensor& p_, c10::optional gen) { +void bernoulli_tensor_kernel(const TensorBase &self, const TensorBase &p_, c10::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::bernoulli_kernel(self, p_, generator); } -void bernoulli_scalar_kernel_default(Tensor& self, double p, c10::optional gen) { +void bernoulli_scalar_kernel_default(const TensorBase &self, double p, c10::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::bernoulli_kernel(self, p, generator); } #if !AT_MKL_ENABLED() -void 
bernoulli_scalar_kernel(Tensor& self, double p, c10::optional gen) { +void bernoulli_scalar_kernel(const TensorBase &self, double p, c10::optional gen) { bernoulli_scalar_kernel_default(self, p, gen); } #else -void bernoulli_scalar_kernel(Tensor &self, double p, c10::optional gen) { +void bernoulli_scalar_kernel(const TensorBase &self, double p, c10::optional gen) { if (cpuinfo_initialize() && cpuinfo_vendor_intel == cpuinfo_get_processor(0)->core->vendor) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); int64_t seed; @@ -87,7 +93,7 @@ void bernoulli_scalar_kernel(Tensor &self, double p, c10::optional ge // copy_ if using buffer and non contiguous if (!contig) { - self.copy_(tmp_int_tensor); + OptionalTensorRef(self)->copy_(tmp_int_tensor); } }); } else { @@ -117,7 +123,7 @@ void uniform_kernel(TensorIteratorBase& iter, double from, double to, c10::optio templates::cpu::uniform_kernel(iter, from, to, generator); } -void normal_kernel(Tensor& self, double mean, double std, c10::optional gen) { +void normal_kernel(const TensorBase &self, double mean, double std, c10::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::normal_kernel(self, mean, std, generator); } diff --git a/aten/src/ATen/native/cpu/DistributionTemplates.h b/aten/src/ATen/native/cpu/DistributionTemplates.h index 6c017e15c461..37c799803eaf 100644 --- a/aten/src/ATen/native/cpu/DistributionTemplates.h +++ b/aten/src/ATen/native/cpu/DistributionTemplates.h @@ -1,7 +1,8 @@ #pragma once -#include #include +#include +#include #include #include #include @@ -105,7 +106,7 @@ static void normal_fill_16_AVX2(float *data, } template -void normal_fill_AVX2(Tensor& self, const float mean, const float std, RNG generator) { +void normal_fill_AVX2(const TensorBase &self, const float mean, const float std, RNG generator) { float *data = self.data_ptr(); auto size = self.numel(); std::lock_guard lock(generator->mutex_); @@ -148,7 +149,7 @@ static void normal_fill_16(scalar_t *data, const scalar_t mean, const scalar_t s } template -void normal_fill(Tensor& self, const scalar_t mean, const scalar_t std, RNG generator) { +void normal_fill(const TensorBase &self, const scalar_t mean, const scalar_t std, RNG generator) { scalar_t *data = self.data_ptr(); auto size = self.numel(); std::lock_guard lock(generator->mutex_); @@ -172,7 +173,7 @@ void normal_fill(Tensor& self, const scalar_t mean, const scalar_t std, RNG gene } template -void normal_kernel(Tensor& self, double mean, double std, RNG generator) { +void normal_kernel(const TensorBase &self, double mean, double std, RNG generator) { auto size = self.numel(); if (self.scalar_type() == ScalarType::Float && size >= 16 && self.is_contiguous()) { #ifdef CPU_CAPABILITY_AVX2 @@ -308,25 +309,25 @@ struct ExponentialKernel { // ================================================== Bernoulli ======================================================= template -void bernoulli_kernel(Tensor& self, const Tensor& p_, RNG generator) { +void bernoulli_kernel(const TensorBase &self, const TensorBase &p_, RNG generator) { AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Bool, at::ScalarType::BFloat16, self.scalar_type(), "bernoulli_tensor_cpu_self_", [&] { // See Note [Acquire lock when using random generators] std::lock_guard lock(generator->mutex_); using self_t = scalar_t; auto p_cpu = p_.to(kCPU); - c10::MaybeOwned p = expand_inplace(self, p_cpu); + auto p = expand_inplace(self, p_cpu); auto iter = 
TensorIteratorConfig() .add_output(self) .add_input(*p) .check_all_same_dtype(false) .build(); - if (p_.scalar_type() == kDouble) { + if (p->scalar_type() == kDouble) { cpu_serial_kernel(iter, [&](const double p_val) -> self_t { at::bernoulli_distribution bernoulli(p_val); return static_cast(bernoulli(generator)); }); } else { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, p_.scalar_type(), "bernoulli_tensor_cpu_p_", [&] { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, p->scalar_type(), "bernoulli_tensor_cpu_p_", [&] { using p_t = scalar_t; cpu_serial_kernel(iter, [&](const p_t p_val) -> self_t { at::bernoulli_distribution bernoulli(p_val); @@ -338,7 +339,7 @@ void bernoulli_kernel(Tensor& self, const Tensor& p_, RNG generator) { } template -void bernoulli_kernel(Tensor& self, double p, RNG generator) { +void bernoulli_kernel(const TensorBase &self, double p, RNG generator) { AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Bool, at::ScalarType::BFloat16, self.scalar_type(), "bernoulli_scalar_cpu_", [&] { // See Note [Acquire lock when using random generators] std::lock_guard lock(generator->mutex_); @@ -352,10 +353,10 @@ void bernoulli_kernel(Tensor& self, double p, RNG generator) { template struct BernoulliKernel { - void operator()(Tensor& self, double p, c10::optional gen) { + void operator()(const TensorBase &self, double p, c10::optional gen) { bernoulli_kernel(self, p, check_generator(gen)); } - void operator()(Tensor& self, const Tensor& p_, c10::optional gen) { + void operator()(const TensorBase &self, const TensorBase &p_, c10::optional gen) { bernoulli_kernel(self, p_, check_generator(gen)); } }; diff --git a/aten/src/ATen/native/cpu/FillKernel.cpp b/aten/src/ATen/native/cpu/FillKernel.cpp index 3685f2e179ce..c023013052d9 100644 --- a/aten/src/ATen/native/cpu/FillKernel.cpp +++ b/aten/src/ATen/native/cpu/FillKernel.cpp @@ -1,3 +1,4 @@ +#define TORCH_ASSERT_NO_OPERATORS #include #include #include @@ -6,6 +7,7 @@ #include #include +#include namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/cpu/FunctionOfAMatrixUtilsKernel.cpp b/aten/src/ATen/native/cpu/FunctionOfAMatrixUtilsKernel.cpp index 2e0cc33c3f51..0f4d4b607717 100644 --- a/aten/src/ATen/native/cpu/FunctionOfAMatrixUtilsKernel.cpp +++ b/aten/src/ATen/native/cpu/FunctionOfAMatrixUtilsKernel.cpp @@ -1,6 +1,8 @@ +#define TORCH_ASSERT_NO_OPERATORS #include -#include +#include +#include #include #if (defined(_WIN32) || defined(_WIN64)) diff --git a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp index 4e89a499d233..47b20b2ca4c1 100644 --- a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp +++ b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp @@ -1,11 +1,12 @@ -#include -#include -#include -#include -#include +#define TORCH_ASSERT_NO_OPERATORS #include #include -#include +#include +#include +#include +#include +#include +#include #include #include @@ -664,6 +665,7 @@ struct ApplyGridSample* gInp_slice_ptr, TensorAccessor& gGrid_slice, const TensorAccessor& gOut_slice, - const TensorAccessor& inp_slice, + const TensorAccessor& /*inp_slice*/, int64_t offset, const Vec& grid_x, const Vec& grid_y, int64_t len) const { if (input_requires_grad) { @@ -1146,13 +1148,12 @@ static inline void grid_sample_2d_grid_slice_iterator( // and backward. // See NOTE [ Grid Sample CPU Kernels ] for details. 
-Tensor grid_sampler_2d_cpu_kernel_impl(const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, - int64_t padding_mode, bool align_corners) { +void grid_sampler_2d_cpu_kernel_impl( + const TensorBase &output, const TensorBase &input, const TensorBase &grid, + int64_t interpolation_mode, int64_t padding_mode, bool align_corners) { auto N = input.size(0); auto H = grid.size(1); auto W = grid.size(2); - auto output = at::empty({N, input.size(1), H, W}, input.options()); auto spatial_size = H * W; auto grain_size = spatial_size == 0 ? (N + 1) : at::divup(at::internal::GRAIN_SIZE, spatial_size * 4 /* 2d * 2 tensors*/); @@ -1207,18 +1208,18 @@ Tensor grid_sampler_2d_cpu_kernel_impl(const Tensor& input, const Tensor& grid, }); #undef HANDLE_CASE #undef HANDLE_INTERP - - return output; } -std::tuple -grid_sampler_2d_backward_cpu_kernel_impl(const Tensor& grad_output_, - const Tensor& input, - const Tensor& grid, - int64_t interpolation_mode, - int64_t padding_mode, - bool align_corners, - std::array output_mask) { +void grid_sampler_2d_backward_cpu_kernel_impl( + const TensorBase &grad_input, + const TensorBase &grad_grid, + const TensorBase &grad_output_, + const TensorBase &input, + const TensorBase &grid, + int64_t interpolation_mode, + int64_t padding_mode, + bool align_corners, + std::array output_mask) { // grad_output should be contiguous most of time. Ensuring that it is // contiguous can greatly simplify this code. auto grad_output = grad_output_.contiguous(); @@ -1228,11 +1229,6 @@ grid_sampler_2d_backward_cpu_kernel_impl(const Tensor& grad_output_, // is always computed.) auto input_requires_grad = output_mask[0]; - Tensor grad_input; - if (input_requires_grad) { - grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - } - auto grad_grid = at::empty_like(grid, LEGACY_CONTIGUOUS_MEMORY_FORMAT); auto N = input.size(0); auto spatial_size = grid.size(1) * grid.size(2); auto grain_size = spatial_size == 0 ? 
(N + 1) @@ -1315,8 +1311,6 @@ grid_sampler_2d_backward_cpu_kernel_impl(const Tensor& grad_output_, }); #undef HANDLE_CASE #undef HANDLE_INTERP - - return std::make_tuple(grad_input, grad_grid); } } diff --git a/aten/src/ATen/native/cpu/GridSamplerKernel.h b/aten/src/ATen/native/cpu/GridSamplerKernel.h index aa4a24736dac..b1830fcd3911 100644 --- a/aten/src/ATen/native/cpu/GridSamplerKernel.h +++ b/aten/src/ATen/native/cpu/GridSamplerKernel.h @@ -1,17 +1,33 @@ #pragma once -#include -#include -#include #include -#include -#include +#include +#include + +namespace at { +class TensorBase; +} namespace at { namespace native { -using forward_2d_fn = Tensor(*)(const Tensor &, const Tensor &, int64_t, int64_t, bool); -using backward_2d_fn = std::tuple(*)(const Tensor &, const Tensor &, const Tensor &, int64_t, int64_t, bool, std::array); +using forward_2d_fn = void (*) ( + const TensorBase &output, + const TensorBase &input, + const TensorBase &grid, + int64_t interpolation_mode, + int64_t padding_mode, + bool align_corners); +using backward_2d_fn = void (*) ( + const TensorBase &grad_input, + const TensorBase &grad_grid, + const TensorBase &grad_output, + const TensorBase &input, + const TensorBase &grid, + int64_t interpolation_mode, + int64_t padding_mode, + bool align_corners, + std::array output_mask); DECLARE_DISPATCH(forward_2d_fn, grid_sampler_2d_cpu_kernel); DECLARE_DISPATCH(backward_2d_fn, grid_sampler_2d_backward_cpu_kernel); diff --git a/aten/src/ATen/native/cpu/HistogramKernel.cpp b/aten/src/ATen/native/cpu/HistogramKernel.cpp index 583f3679c0aa..6d6b4a749fb2 100644 --- a/aten/src/ATen/native/cpu/HistogramKernel.cpp +++ b/aten/src/ATen/native/cpu/HistogramKernel.cpp @@ -1,16 +1,23 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include +#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#endif + #include -#include #include -#include #include -#include namespace at { namespace native { @@ -219,7 +226,7 @@ void histogramdd_out_cpu_template(const Tensor& self, const c10::optional( hist, bin_edges_contig, reshaped_input, reshaped_weight); }); diff --git a/aten/src/ATen/native/cpu/IndexKernel.cpp b/aten/src/ATen/native/cpu/IndexKernel.cpp index 242241b97988..7b7ab6c72802 100644 --- a/aten/src/ATen/native/cpu/IndexKernel.cpp +++ b/aten/src/ATen/native/cpu/IndexKernel.cpp @@ -103,7 +103,7 @@ void cpu_index_kernel(TensorIterator& iter, IntArrayRef index_size, IntArrayRef } void index_kernel(TensorIterator& iter, IntArrayRef index_size, IntArrayRef index_stride) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(ScalarType::Half, ScalarType::Bool, ScalarType::BFloat16, + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(kComplexHalf, kHalf, kBool, kBFloat16, iter.dtype(), "index_cpu", [&] { cpu_index_kernel(iter, index_size, index_stride, [](char* dst, char* src, int64_t offset) { *(scalar_t*)dst = *(scalar_t*)(src + offset); @@ -234,7 +234,7 @@ void take_kernel( void index_put_kernel(TensorIterator& iter, IntArrayRef index_size, IntArrayRef index_stride, bool accumulate) { // NOTE: duplicate indices are only supported if accumulate is true. 
- AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(ScalarType::Half, ScalarType::Bool, ScalarType::BFloat16, + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(kComplexHalf, kHalf, kBool, kBFloat16, iter.dtype(), "index_put", [&] { // See Note [Enabling Deterministic Operations] // Parallel cpu_index_kernel with accumulation is nondeterministic, so we @@ -409,7 +409,7 @@ void cpu_masked_fill_kernel(TensorIterator& iter, scalar_t value) { } void masked_fill_kernel(TensorIterator& iter, const Scalar& value) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(ScalarType::Bool, ScalarType::BFloat16, ScalarType::Half, + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(kComplexHalf, kBool, kBFloat16, kHalf, iter.dtype(), "masked_fill", [&] { scalar_t scalar_val = value.to(); auto mask_dtype = iter.input_dtype(0); diff --git a/aten/src/ATen/native/cpu/IsContiguous.h b/aten/src/ATen/native/cpu/IsContiguous.h index 971717bae4be..192177cc9bcf 100644 --- a/aten/src/ATen/native/cpu/IsContiguous.h +++ b/aten/src/ATen/native/cpu/IsContiguous.h @@ -25,7 +25,7 @@ struct IsContiguous<0, 0, traits, s> { // will be called when there is no output template struct IsContiguous<0, -1, traits, s> { - static bool eval(const int64_t* strides) { + static bool eval(const int64_t* /*strides*/) { return true; } }; diff --git a/aten/src/ATen/native/cpu/LinearAlgebraKernel.cpp b/aten/src/ATen/native/cpu/LinearAlgebraKernel.cpp index 0bb92a158aa2..d67769dead45 100644 --- a/aten/src/ATen/native/cpu/LinearAlgebraKernel.cpp +++ b/aten/src/ATen/native/cpu/LinearAlgebraKernel.cpp @@ -1,5 +1,6 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#include #include #include #include @@ -20,7 +21,7 @@ void addr_kernel(TensorIterator &iter, // nans and infs in self should not propagate. if (beta_val == false) { cpu_kernel(iter, - [=](scalar_t self_val, + [=](scalar_t /*self_val*/, scalar_t vec1_val, scalar_t vec2_val) __ubsan_ignore_undefined__ -> scalar_t { return alpha_val && vec1_val && vec2_val; @@ -53,12 +54,12 @@ void addr_kernel(TensorIterator &iter, // nans and infs in self should not propagate. if (beta_val == zero_val) { cpu_kernel_vec(iter, - [=](scalar_t self_val, + [=](scalar_t /*self_val*/, scalar_t vec1_val, scalar_t vec2_val) __ubsan_ignore_undefined__ -> scalar_t { return alpha_val * vec1_val * vec2_val; }, - [=](Vec self_vec, + [=](Vec /*self_vec*/, Vec vec1_vec, Vec vec2_vec) __ubsan_ignore_undefined__ { return alpha_vec * vec1_vec * vec2_vec; @@ -82,86 +83,7 @@ void addr_kernel(TensorIterator &iter, ); } -template ::type> -void linalg_vector_norm_kernel_cpu_impl(TensorIterator& iter, Scalar ord) { - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - double ord_val; - if (ord.isFloatingPoint()) { - ord_val = ord.to(); - } else { - TORCH_CHECK(false, "linalg.vector_norm expects ord to be float"); - } - // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) - acc_t init_val = (ord_val == -INFINITY) ? std::numeric_limits::infinity() : static_cast(0); - if (iter.numel() == 0) { - iter.output().fill_((ord_val < 0) ? 
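For context on the addr_kernel hunk above: addr computes out = beta*self + alpha*outer(vec1, vec2), and the beta == 0 branches deliberately never read self so NaNs/Infs in it cannot propagate. A per-element scalar reference (illustrative only):

float addr_elem_ref(float self_val, float v1, float v2, float alpha, float beta) {
  if (beta == 0.0f) {
    return alpha * v1 * v2;                // self is ignored entirely
  }
  return beta * self_val + alpha * v1 * v2;
}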
INFINITY : 0); - return; - } - if (ord_val == 0) { - binary_kernel_reduce(iter, NormZeroOps(), init_val); - } else if (ord_val == 1) { - binary_kernel_reduce(iter, NormOneOps(), init_val); - } else if (ord_val == 2) { - binary_kernel_reduce(iter, NormTwoOps(), init_val); - } else if (ord_val == INFINITY) { - binary_kernel_reduce(iter, AbsMaxOps(), init_val); - // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) - } else if (ord_val == -INFINITY) { - binary_kernel_reduce(iter, AbsMinOps(), init_val); - } else { - binary_kernel_reduce(iter, NormOps { static_cast(ord_val) }, init_val); - } - // For complex outputs, the above kernels do not touch the imaginary values, - // so we must zero them out - if (isComplexType(iter.output().scalar_type())) { - at::imag(iter.output()).zero_(); - } -} - -static void linalg_vector_norm_kernel_cpu(TensorIterator& iter, Scalar ord) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.input_dtype(), "linalg_vector_norm_cpu", [&] { - linalg_vector_norm_kernel_cpu_impl(iter, ord); - }); -} - -void unpack_pivots_cpu_kernel( - TensorIterator& iter, - int64_t dim_size -) { - if (iter.numel() == 0) { - return; - } - - auto loop = [&](char** data, const int64_t* strides, int64_t nelems) { - auto* unpacked_pivots_ptr = data[0]; - const auto* pivots_ptr = data[1]; - - for (const auto elem : c10::irange(nelems)) { - (void)elem; //Suppress unused variable warning - // WARNING: torch.lu returns int32 pivots, - // this behavior could change in the future. - auto* unpacked_pivots_data = reinterpret_cast(unpacked_pivots_ptr); - auto* pivots_data = reinterpret_cast(pivots_ptr); - - for (const auto i : c10::irange(dim_size)) { - std::swap( - unpacked_pivots_data[i], - unpacked_pivots_data[pivots_data[i]] - ); - } - - unpacked_pivots_ptr += strides[0]; - pivots_ptr += strides[1]; - } - }; - - iter.for_each(loop); -} - } // anonymous namespace REGISTER_DISPATCH(addr_stub, &addr_kernel); -REGISTER_DISPATCH(linalg_vector_norm_stub, &linalg_vector_norm_kernel_cpu); -REGISTER_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel); - }} // namespace at::native diff --git a/aten/src/ATen/native/cpu/Loops.h b/aten/src/ATen/native/cpu/Loops.h index f704240481fe..2facc434d341 100644 --- a/aten/src/ATen/native/cpu/Loops.h +++ b/aten/src/ATen/native/cpu/Loops.h @@ -231,7 +231,7 @@ vectorized_loop(char** C10_RESTRICT data_, int64_t n, int64_t S, func_t&& op, ve template static inline void unroll_contiguous_scalar_checks( - const int64_t* strides, + const int64_t* /*strides*/, std::index_sequence<>, cb_t&& cb) { cb(0); diff --git a/aten/src/ATen/native/cpu/MaxPoolKernel.cpp b/aten/src/ATen/native/cpu/MaxPoolKernel.cpp index e81601b987e1..a2e7736a4a82 100644 --- a/aten/src/ATen/native/cpu/MaxPoolKernel.cpp +++ b/aten/src/ATen/native/cpu/MaxPoolKernel.cpp @@ -1,4 +1,6 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include #include diff --git a/aten/src/ATen/native/cpu/MaxPooling.cpp b/aten/src/ATen/native/cpu/MaxPooling.cpp index d70b6ef6e70d..06d0fe501426 100644 --- a/aten/src/ATen/native/cpu/MaxPooling.cpp +++ b/aten/src/ATen/native/cpu/MaxPooling.cpp @@ -1,4 +1,6 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include #include #include @@ -30,13 +32,13 @@ void max_pool1d_impl( Tensor& output, const Tensor& input, const PoolingParams1D& p) { - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "max_pool1d_impl", [&] { + AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, 
input.scalar_type(), "max_pool1d_impl", [&] { const Tensor in = input.contiguous(); scalar_t* const OP = output.data_ptr(); const scalar_t* const IP = in.data_ptr(); // Value used for padding - constexpr scalar_t FILL = std::numeric_limits::has_infinity + scalar_t FILL = std::numeric_limits::has_infinity ? -std::numeric_limits::infinity() : std::numeric_limits::lowest(); diff --git a/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp b/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp index d08531ddf32a..566f13591603 100644 --- a/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp +++ b/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp @@ -1,9 +1,9 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include +#include #include #include -#include #include #include @@ -233,68 +233,6 @@ void cpu_max_unpool_backward( } } -template -void cpu_max_unpool_backward_channels_last( - Tensor& grad_input_, - const Tensor& grad_output, - const Tensor& indices) { - TORCH_CHECK(grad_output.ndimension() == 4, - "max_unpool2d backward with channels last format supports tensors with 4 dims."); - auto memory_format = at::MemoryFormat::ChannelsLast; - auto grad_input = grad_input_.contiguous(memory_format); - - auto grad_input_data = grad_input.data_ptr(); - auto grad_output_data = grad_output.data_ptr(); - auto indices_data = indices.data_ptr(); - - int64_t nbatch = grad_input.size(0); - int64_t channels = grad_input.size(1); - int64_t input_height = grad_input.size(2); - int64_t input_width = grad_input.size(3); - int64_t output_height = grad_output.size(2); - int64_t output_width = grad_output.size(3); - int64_t input_image_size = input_height * input_width; - int64_t output_image_size = output_height * output_width; - - c10::optional optional_error_index; - - // parallel on dim N, H, W - at::parallel_for(0, nbatch * input_image_size, 0, [&](int64_t begin, int64_t end) { - int64_t n = 0; - int64_t ip = 0; - data_index_init(begin, n, nbatch, ip, input_image_size); - - for (const auto i : c10::irange(begin, end)) { - scalar_t* grad_output_ptr = grad_output_data + n * output_image_size * channels; - scalar_t* grad_input_ptr = grad_input_data + i * channels; - int64_t* indices_ptr = indices_data + i * channels; - - for (const auto c : c10::irange(channels)) { - int64_t maxp = indices_ptr[c]; - if (maxp < 0 || maxp >= output_image_size) { - optional_error_index = maxp; - std::atomic_thread_fence(std::memory_order_release); - } else { - grad_input_ptr[c] = grad_output_ptr[maxp * channels + c]; - } - } - - // move on to next input index - data_index_step(n, nbatch, ip, input_image_size); - } - }); - - if (optional_error_index) { - AT_ERROR("invalid max index ", optional_error_index.value(), - ", owidth= ", output_width, - ", oheight= ", output_height); - } - - if (!grad_input_.is_contiguous(memory_format)) { - grad_input_.copy_(grad_input); - } -} - void max_unpool2d_kernel_impl( Tensor& output, const Tensor& input, @@ -326,42 +264,9 @@ void max_unpool3d_kernel_impl( }); } -void max_unpool2d_backward_kernel_impl( - Tensor& grad_input, - const Tensor& grad_output, - const Tensor& indices) { - switch(grad_output.suggest_memory_format()) { - case at::MemoryFormat::Contiguous: { - AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "max_unpool2d_backward", [&] { - cpu_max_unpool_backward(grad_input, grad_output, indices); - }); - break; - } - case at::MemoryFormat::ChannelsLast: { - AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "max_unpool2d_backward_channels_last", [&] { - cpu_max_unpool_backward_channels_last(grad_input, 
grad_output, indices); - }); - break; - } - default: - TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); - } -} - -void max_unpool3d_backward_kernel_impl( - Tensor& grad_input, - const Tensor& grad_output, - const Tensor& indices) { - AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "max_unpool3d_backward", [&] { - cpu_max_unpool_backward(grad_input, grad_output, indices); - }); -} - } // anonymous namespace REGISTER_DISPATCH(max_unpool2d_kernel, &max_unpool2d_kernel_impl); -REGISTER_DISPATCH(max_unpool2d_backward_kernel, &max_unpool2d_backward_kernel_impl); REGISTER_DISPATCH(max_unpool3d_kernel, &max_unpool3d_kernel_impl); -REGISTER_DISPATCH(max_unpool3d_backward_kernel, &max_unpool3d_backward_kernel_impl); }} // at::native diff --git a/aten/src/ATen/native/cpu/MaxUnpoolKernel.h b/aten/src/ATen/native/cpu/MaxUnpoolKernel.h index 00fbeb64213d..1c6507909ca4 100644 --- a/aten/src/ATen/native/cpu/MaxUnpoolKernel.h +++ b/aten/src/ATen/native/cpu/MaxUnpoolKernel.h @@ -1,16 +1,14 @@ -#include -#include +#pragma once #include -#pragma once +namespace at { +class Tensor; -namespace at { namespace native { +namespace native { using max_unpooling_fn = void(*)(Tensor&, const Tensor&, const Tensor&); DECLARE_DISPATCH(max_unpooling_fn, max_unpool2d_kernel); -DECLARE_DISPATCH(max_unpooling_fn, max_unpool2d_backward_kernel); DECLARE_DISPATCH(max_unpooling_fn, max_unpool3d_kernel); -DECLARE_DISPATCH(max_unpooling_fn, max_unpool3d_backward_kernel); }} // at::native diff --git a/aten/src/ATen/native/cpu/MultinomialKernel.cpp b/aten/src/ATen/native/cpu/MultinomialKernel.cpp index f181572f51af..feda5fe7b3ba 100644 --- a/aten/src/ATen/native/cpu/MultinomialKernel.cpp +++ b/aten/src/ATen/native/cpu/MultinomialKernel.cpp @@ -1,13 +1,20 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include +#include #include #include -#include -#include #include +#include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + namespace at { namespace native { namespace { @@ -18,7 +25,8 @@ void multinomial_with_replacement_apply( const Tensor& self, const int64_t n_sample, c10::optional generator) { - auto gen = get_generator_or_default(generator, detail::getDefaultCPUGenerator()); + auto gen = get_generator_or_default( + generator, detail::getDefaultCPUGenerator()); // See Note [Acquire lock when using random generators] std::lock_guard lock(gen->mutex_); @@ -28,9 +36,9 @@ void multinomial_with_replacement_apply( /* cumulative probability distribution vector */ Tensor cum_dist = at::empty({n_categories}, self.options()); - const scalar_t * const self_ptr = self.data_ptr(); - scalar_t * const cum_dist_ptr = cum_dist.data_ptr(); - int64_t * const result_ptr = result.data_ptr(); + const scalar_t* const self_ptr = self.data_ptr(); + scalar_t* const cum_dist_ptr = cum_dist.data_ptr(); + int64_t* const result_ptr = result.data_ptr(); auto self_stride_0 = self.dim() > 1 ? self.stride(-2) : 0; auto self_stride_1 = self.stride(-1); @@ -47,22 +55,28 @@ void multinomial_with_replacement_apply( scalar_t val; for (const auto j : c10::irange(n_categories)) { val = self_ptr[i * self_stride_0 + j * self_stride_1]; - TORCH_CHECK(val >= 0, "invalid multinomial distribution (encountering probability entry < 0)"); + TORCH_CHECK( + val >= 0, + "invalid multinomial distribution (encountering probability entry < 0)"); // NB: std::isfinite doesn't bode well with libc++ for half datatypes, // so we manually cast it to a double and perform the check. 
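A side note on the max_pool1d change in MaxPooling.cpp above: once BFloat16 is added to the dispatch, the padding value can no longer be constexpr, presumably because BFloat16's numeric_limits members are not usable in a constant expression; the selection logic itself is unchanged. A minimal standalone sketch of that selection (plain C++, not the ATen kernel):

#include <cstdio>
#include <limits>

// Padding value for a max reduction: -inf when the type has one (it can never
// win the max), otherwise the lowest finite value.
template <typename scalar_t>
scalar_t max_pool_fill_value() {
  return std::numeric_limits<scalar_t>::has_infinity
      ? -std::numeric_limits<scalar_t>::infinity()
      : std::numeric_limits<scalar_t>::lowest();
}

int main() {
  std::printf("float fill: %f\n", static_cast<double>(max_pool_fill_value<float>())); // -inf
  std::printf("int   fill: %d\n", max_pool_fill_value<int>());                        // INT_MIN
}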
#if defined(_LIBCPP_VERSION) - TORCH_CHECK(std::isfinite(static_cast(val)), - "invalid multinomial distribution (encountering probability entry = infinity or NaN)"); + TORCH_CHECK( + std::isfinite(static_cast(val)), + "invalid multinomial distribution (encountering probability entry = infinity or NaN)"); #else - TORCH_CHECK(std::isfinite(val), - "invalid multinomial distribution (encountering probability entry = infinity or NaN)"); + TORCH_CHECK( + std::isfinite(val), + "invalid multinomial distribution (encountering probability entry = infinity or NaN)"); #endif sum += val; cum_dist_ptr[j * cum_dist_stride_0] = sum; } - TORCH_CHECK(sum > 0, "invalid multinomial distribution (sum of probabilities <= 0)"); + TORCH_CHECK( + sum > 0, + "invalid multinomial distribution (sum of probabilities <= 0)"); /* normalize cumulative probability distribution so that last val is 1 i.e. doesn't assume original self row sums to one */ @@ -89,20 +103,124 @@ void multinomial_with_replacement_apply( /* Make sure the last cumulative distribution bucket sums to 1 */ cum_dist_ptr[(n_categories - 1) * cum_dist_stride_0] = 1; - while(right_pointer - left_pointer > 0) { + while (right_pointer - left_pointer > 0) { mid_pointer = left_pointer + (right_pointer - left_pointer) / 2; cum_prob = cum_dist_ptr[mid_pointer * cum_dist_stride_0]; if (cum_prob < uniform_sample) { left_pointer = mid_pointer + 1; + } else { + right_pointer = mid_pointer; } - else { + } + sample_idx = left_pointer; + + /* store in result tensor (will be incremented for lua compat by wrapper) + */ + result_ptr[i * result_dist_stride_0 + j * result_dist_stride_1] = + sample_idx; + } + } +} + +template <> +void multinomial_with_replacement_apply( + Tensor& result, + const Tensor& self, + const int64_t n_sample, + c10::optional generator) { + auto gen = get_generator_or_default( + generator, detail::getDefaultCPUGenerator()); + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen->mutex_); + + int64_t n_categories = self.size(-1); + int64_t n_dist = self.dim() > 1 ? self.size(-2) : 1; + + /* cumulative probability distribution vector */ + Tensor cum_dist = at::empty({n_categories}, self.options().dtype(kFloat)); + + const BFloat16* const self_ptr = self.data_ptr(); + float* const cum_dist_ptr = cum_dist.data_ptr(); + int64_t* const result_ptr = result.data_ptr(); + + auto self_stride_0 = self.dim() > 1 ? self.stride(-2) : 0; + auto self_stride_1 = self.stride(-1); + + auto cum_dist_stride_0 = cum_dist.stride(0); + + auto result_dist_stride_0 = result.dim() > 1 ? result.stride(-2) : 0; + auto result_dist_stride_1 = result.stride(-1); + + for (const auto i : c10::irange(n_dist)) { + /* Get normalized cumulative distribution from prob distribution */ + float sum = 0; + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) + float val; + for (const auto j : c10::irange(n_categories)) { + val = self_ptr[i * self_stride_0 + j * self_stride_1]; + TORCH_CHECK( + val >= 0, + "invalid multinomial distribution (encountering probability entry < 0)"); +// NB: std::isfinite doesn't bode well with libc++ for half datatypes, +// so we manually cast it to a double and perform the check. 
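The sampling loop in MultinomialKernel.cpp above is inverse-CDF sampling: build a normalized cumulative distribution, draw u ~ Uniform(0, 1), then binary-search for the first bucket whose cumulative mass reaches u. A standalone sketch of that step, using std::mt19937 in place of ATen's generator (the helper name sample_index is hypothetical):

#include <cstdio>
#include <random>
#include <vector>

int sample_index(const std::vector<float>& probs, std::mt19937& rng) {
  // Build and normalize the cumulative distribution so the last bucket is 1.
  std::vector<float> cum(probs.size());
  float sum = 0.f;
  for (size_t i = 0; i < probs.size(); ++i) { sum += probs[i]; cum[i] = sum; }
  for (auto& c : cum) c /= sum;
  cum.back() = 1.f;  // guard against rounding error, as the kernel does

  std::uniform_real_distribution<double> uniform(0.0, 1.0);
  const double u = uniform(rng);

  // Binary search for the first index with cum[idx] >= u.
  int left = 0, right = static_cast<int>(cum.size());
  while (right - left > 0) {
    const int mid = left + (right - left) / 2;
    if (cum[mid] < u) left = mid + 1; else right = mid;
  }
  return left;
}

int main() {
  std::mt19937 rng(0);
  std::vector<int> counts(3, 0);
  for (int i = 0; i < 10000; ++i) counts[sample_index({0.1f, 0.3f, 0.6f}, rng)]++;
  std::printf("%d %d %d\n", counts[0], counts[1], counts[2]);  // roughly 1000 / 3000 / 6000
}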
+#if defined(_LIBCPP_VERSION) + TORCH_CHECK( + std::isfinite(static_cast(val)), + "invalid multinomial distribution (encountering probability entry = infinity or NaN)"); +#else + TORCH_CHECK( + std::isfinite(val), + "invalid multinomial distribution (encountering probability entry = infinity or NaN)"); +#endif + + sum += val; + cum_dist_ptr[j * cum_dist_stride_0] = sum; + } + + TORCH_CHECK( + sum > 0, + "invalid multinomial distribution (sum of probabilities <= 0)"); + + /* normalize cumulative probability distribution so that last val is 1 + i.e. doesn't assume original self row sums to one */ + if ((sum > 0) || ((sum < 1.00001) && (sum > 0.99999))) { + for (const auto j : c10::irange(n_categories)) { + cum_dist_ptr[j * cum_dist_stride_0] /= sum; + } + } + + for (const auto j : c10::irange(n_sample)) { + /* sample a probability mass from a uniform distribution */ + at::uniform_real_distribution uniform(0, 1); + double uniform_sample = uniform(gen); + /* Do a binary search for the slot in which the prob falls + ie cum_dist[row][slot-1] < uniform_prob < cum_distr[row][slot] */ + int left_pointer = 0; + int right_pointer = n_categories; + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + int mid_pointer; + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) + float cum_prob; + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + int sample_idx; + /* Make sure the last cumulative distribution bucket sums to 1 */ + cum_dist_ptr[(n_categories - 1) * cum_dist_stride_0] = 1; + + while (right_pointer - left_pointer > 0) { + mid_pointer = left_pointer + (right_pointer - left_pointer) / 2; + cum_prob = cum_dist_ptr[mid_pointer * cum_dist_stride_0]; + if (cum_prob < uniform_sample) { + left_pointer = mid_pointer + 1; + } else { right_pointer = mid_pointer; } } sample_idx = left_pointer; - /* store in result tensor (will be incremented for lua compat by wrapper) */ - result_ptr[i * result_dist_stride_0 + j * result_dist_stride_1] = sample_idx; + /* store in result tensor (will be incremented for lua compat by wrapper) + */ + result_ptr[i * result_dist_stride_0 + j * result_dist_stride_1] = + sample_idx; } } } @@ -112,14 +230,16 @@ static void multinomial_with_replacement_kernel_impl( const Tensor& self, const int64_t n_sample, c10::optional gen) { - AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.scalar_type(), "multinomial", [&] { - multinomial_with_replacement_apply(result, self, n_sample, gen); - }); -} + AT_DISPATCH_FLOATING_TYPES_AND2( + kHalf, kBFloat16, self.scalar_type(), "multinomial", [&] { + multinomial_with_replacement_apply( + result, self, n_sample, gen); + }); } +} // namespace REGISTER_DISPATCH( multinomial_with_replacement_stub, &multinomial_with_replacement_kernel_impl); -} -} +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cpu/PixelShuffleKernel.cpp b/aten/src/ATen/native/cpu/PixelShuffleKernel.cpp new file mode 100644 index 000000000000..aedd845fee89 --- /dev/null +++ b/aten/src/ATen/native/cpu/PixelShuffleKernel.cpp @@ -0,0 +1,251 @@ +#include +#include +#include +#include +#include +#include +#include + +namespace at { namespace native { + +namespace { + +template +void cpu_pixel_shuffle( + Tensor& output, + const Tensor& input, + int64_t upscale_factor) { + auto input_data = input.data_ptr(); + auto output_data = output.data_ptr(); + + // [(B1...Bn), C, H, W] => [N, C, H, W] + int64_t channels = input.size(-3); + int64_t height = input.size(-2); + int64_t width = input.size(-1); + int64_t sub_channels = channels / (upscale_factor * 
upscale_factor); + int64_t numel = input.numel(); + int64_t nbatch = numel / (channels * height * width); + int64_t S = upscale_factor; + + // input strides + int64_t stride_n = channels * height * width; + int64_t stride_c = S * S * height * width; + int64_t stride_s1 = S * height * width; + int64_t stride_s2 = height * width; + int64_t stride_h = width; + + // input tensor shape of [n, c, s1, s2, h, w] + // output tensor shape of [n, c, h, s1, w, s2] + at::parallel_for(0, numel, 0, [&](int64_t begin, int64_t end) { + int64_t n{0}, c{0}, h{0}, s1{0}, w{0}, s2{0}; + data_index_init(begin, n, nbatch, c, sub_channels, h, height, s1, S, w, width, s2, S); + + for (const auto i : c10::irange(begin, end)) { + int64_t input_offset = n * stride_n + c * stride_c + s1 * stride_s1 + + s2 * stride_s2 + h * stride_h + w; + output_data[i] = input_data[input_offset]; + + data_index_step(n, nbatch, c, sub_channels, h, height, s1, S, w, width, s2, S); + } + }); +} + +template +void cpu_pixel_shuffle_channels_last( + Tensor& output, + const Tensor& input, + int64_t upscale_factor) { + TORCH_CHECK(input.ndimension() == 4, + "pixel shuffle with channels last format supports tensors with 4 dims"); + auto input_data = input.data_ptr(); + auto output_data = output.data_ptr(); + + int64_t nbatch = input.size(0); + int64_t channels = input.size(1); + int64_t height = input.size(2); + int64_t width = input.size(3); + int64_t sub_channels = channels / (upscale_factor * upscale_factor); + int64_t S = upscale_factor; + + // input tensor shape of [n, h, w, c, s1, s2] + // output tensor shape of [n, h, s1, w, s2, c] + using Vec = vec::Vectorized; + at::parallel_for(0, nbatch * height, 0, [&](int64_t begin, int64_t end) { + // temp buffer holding each channel lane + std::unique_ptr buffer(new scalar_t[channels]); + scalar_t* buffer_ptr = buffer.get(); + + int64_t n{0}, h{0}; + data_index_init(begin, n, nbatch, h, height); + for (const auto i : c10::irange(begin, end)) { + for (const auto w : c10::irange(width)) { + scalar_t* input_ptr = input_data + n * height * width * channels + h * width * channels + w * channels; + + // step 1: transpose each channel lane + // from: [c, s1*s2] + // to: [s1*s2, c] + utils::transpose(sub_channels, S * S, input_ptr, S * S, buffer_ptr, sub_channels); + + // step 2: copy from temp buffer to output + for (const auto s1 : c10::irange(S)) { + scalar_t* x_ptr = buffer_ptr + s1 * S * sub_channels; + scalar_t* y_ptr = output_data + i * width * channels + s1 * width * S * sub_channels + w * S * sub_channels; + + int64_t size = S * sub_channels; + int64_t d = 0; + for (; d < size - (size % Vec::size()); d += Vec::size()) { + Vec data_vec = Vec::loadu(x_ptr + d); + data_vec.store(y_ptr + d); + } + for (; d < size; d++) { + y_ptr[d] = x_ptr[d]; + } + } + } + + data_index_step(n, nbatch, h, height); + } + }); +} + +template +void cpu_pixel_unshuffle( + Tensor& output, + const Tensor& input, + int64_t downscale_factor) { + auto input_data = input.data_ptr(); + auto output_data = output.data_ptr(); + + // [(B1...Bn), C, H, W] => [N, C, H, W] + int64_t sub_channels = input.size(-3); + int64_t height = input.size(-2) / downscale_factor; + int64_t width = input.size(-1) / downscale_factor; + int64_t channels = sub_channels * downscale_factor * downscale_factor; + int64_t numel = input.numel(); + int64_t nbatch = numel / (channels * height * width); + int64_t S = downscale_factor; + + // input strides + int64_t stride_n = channels * height * width; + int64_t stride_c = height * S * width * S; + int64_t 
stride_h = S * width * S; + int64_t stride_s1 = width * S; + int64_t stride_w = S; + int64_t stride_s2 = 1; + + // input tensor shape of [n, c, h, s1, w, s2] + // output tensor shape of [n, c, s1, s2, h, w] + at::parallel_for(0, numel, 0, [&](int64_t begin, int64_t end) { + int64_t n{0}, c{0}, s1{0}, s2{0}, h{0}, w{0}; + data_index_init(begin, n, nbatch, c, sub_channels, s1, S, s2, S, h, height, w, width); + + for (const auto i : c10::irange(begin, end)) { + int64_t input_offset = n * stride_n + c * stride_c + h * stride_h + + s1 * stride_s1 + w * stride_w + s2 * stride_s2; + output_data[i] = input_data[input_offset]; + + data_index_step(n, nbatch, c, sub_channels, s1, S, s2, S, h, height, w, width); + } + }); +} + +template +void cpu_pixel_unshuffle_channels_last( + Tensor& output, + const Tensor& input, + int64_t downscale_factor) { + TORCH_CHECK(input.ndimension() == 4, + "pixel unshuffle with channels last format supports tensors with 4 dims"); + auto input_data = input.data_ptr(); + auto output_data = output.data_ptr(); + + int64_t nbatch = input.size(0); + int64_t sub_channels = input.size(1); + int64_t height = input.size(2) / downscale_factor; + int64_t width = input.size(3) / downscale_factor; + int64_t channels = sub_channels * downscale_factor * downscale_factor; + int64_t numel = input.numel(); + int64_t S = downscale_factor; + + // input strides + int64_t stride_n = height * width * channels; + int64_t stride_h = S * width * S * sub_channels; + int64_t stride_s1 = width * S * sub_channels; + int64_t stride_w = S * sub_channels; + int64_t stride_s2 = sub_channels; + int64_t stride_c = 1; + + // input tensor shape of [n, h, s1, w, s2, c] + // output tensor shape of [n, h, w, c, s1, s2] + at::parallel_for(0, numel, 0, [&](int64_t begin, int64_t end) { + int64_t n{0}, h{0}, w{0}, c{0}, s1{0}, s2{0}; + data_index_init(begin, n, nbatch, h, height, w, width, c, sub_channels, s1, S, s2, S); + + for (const auto i : c10::irange(begin, end)) { + int64_t input_offset = n * stride_n + h * stride_h + s1 * stride_s1 + + w * stride_w + s2 * stride_s2 + c * stride_c; + output_data[i] = input_data[input_offset]; + + data_index_step(n, nbatch, h, height, w, width, c, sub_channels, s1, S, s2, S); + } + }); +} + +void pixel_shuffle_kernel_impl( + Tensor& output, + const Tensor& input, + int64_t upscale_factor) { + switch (input.suggest_memory_format()) { + case at::MemoryFormat::Contiguous: { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(ScalarType::Bool, ScalarType::BFloat16, ScalarType::Half, + input.scalar_type(), "pixel_shuffle", [&] { + cpu_pixel_shuffle(output, input, upscale_factor); + }); + break; + } + case at::MemoryFormat::ChannelsLast: { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(ScalarType::Bool, ScalarType::BFloat16, ScalarType::Half, + input.scalar_type(), "pixel_shuffle_channels_last", [&] { + cpu_pixel_shuffle_channels_last(output, input, upscale_factor); + }); + break; + } + default: + TORCH_CHECK(false, "Unsupported memory format. 
Supports only ChannelsLast, Contiguous"); + } +} + +void pixel_unshuffle_kernel_impl( + Tensor& output, + const Tensor& input, + int64_t downscale_factor) { + switch (input.suggest_memory_format()) { + case at::MemoryFormat::Contiguous: { + // input tensor shape of [N, C, Hr, Wr] + // output tensor shape of [N, Crr, H, W] + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(ScalarType::Bool, ScalarType::BFloat16, ScalarType::Half, + input.scalar_type(), "pixel_unshuffle", [&] { + cpu_pixel_unshuffle(output, input, downscale_factor); + }); + break; + } + case at::MemoryFormat::ChannelsLast: { + // input tensor shape of [N, Hr, Wr, C] + // output tensor shape of [N, H, W, Crr] + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(ScalarType::Bool, ScalarType::BFloat16, ScalarType::Half, + input.scalar_type(), "pixel_unshuffle_channels_last", [&] { + cpu_pixel_unshuffle_channels_last(output, input, downscale_factor); + }); + break; + } + default: + TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); + } +} + +} // anonymous namespace + +REGISTER_DISPATCH(pixel_shuffle_kernel, &pixel_shuffle_kernel_impl); +REGISTER_DISPATCH(pixel_unshuffle_kernel, &pixel_unshuffle_kernel_impl); + +}} // at::native diff --git a/aten/src/ATen/native/cpu/PixelShuffleKernel.h b/aten/src/ATen/native/cpu/PixelShuffleKernel.h new file mode 100644 index 000000000000..f7234edf0e60 --- /dev/null +++ b/aten/src/ATen/native/cpu/PixelShuffleKernel.h @@ -0,0 +1,13 @@ +#include +#include +#include + +#pragma once + +namespace at { namespace native { + +using pixel_shuffle_fn = void(*)(Tensor&, const Tensor&, int64_t); +DECLARE_DISPATCH(pixel_shuffle_fn, pixel_shuffle_kernel); +DECLARE_DISPATCH(pixel_shuffle_fn, pixel_unshuffle_kernel); + +}} // at::native diff --git a/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp b/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp index 549384055f20..d3be310e2802 100644 --- a/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp @@ -92,7 +92,55 @@ static void addcdiv_cpu_kernel(TensorIteratorBase& iter, const Scalar& value) { static void smooth_l1_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, double beta) { ScalarType dtype = iter.dtype(0); - AT_DISPATCH_ALL_TYPES(dtype, "smooth_l1_backward_cpu_out", [&] { + if (dtype == kBFloat16) { + auto norm_val = norm.to(); + float beta_val(beta); + auto norm_val_vec = Vectorized(norm_val); + auto beta_val_vec = Vectorized(beta_val); + const auto neg_1_vec = Vectorized(-1); + const auto zero_vec = Vectorized(0); + const auto pos_1_vec = Vectorized(1); + cpu_kernel_vec(iter, + [=](BFloat16 input, BFloat16 target, BFloat16 grad_output) -> BFloat16 { + const auto x = float(input) - float(target); + if (x <= -beta){ + return -norm_val * float(grad_output); + }else if (x >= beta){ + return norm_val * float(grad_output); + }else{ + return norm_val * x * float(grad_output) / beta; + } + }, + [norm_val_vec, beta_val_vec, neg_1_vec, zero_vec, pos_1_vec]( + Vectorized input, Vectorized target, Vectorized grad_output) -> Vectorized { + // using two blendv calls to simulate the 3 cases + // 1 if x >= beta + // -1 if x <= -beta + // x / beta if |x| < beta + Vectorized input0, input1, target0, target1, grad_output0, grad_output1; + std::tie(input0, input1) = convert_bfloat16_float(input); + std::tie(target0, target1) = convert_bfloat16_float(target); + std::tie(grad_output0, grad_output1) = convert_bfloat16_float(grad_output); + auto x = input0 - target0; + auto pos_or_neg_1_vec = 
Vectorized::blendv( + neg_1_vec, pos_1_vec, x > zero_vec); + auto x_abs = x.abs(); + auto output = Vectorized::blendv( + x / beta_val_vec, pos_or_neg_1_vec, x_abs >= beta_val_vec); + input0 = norm_val_vec * output * grad_output0; + + x = input1 - target1; + pos_or_neg_1_vec = Vectorized::blendv( + neg_1_vec, pos_1_vec, x > zero_vec); + x_abs = x.abs(); + output = Vectorized::blendv( + x / beta_val_vec, pos_or_neg_1_vec, x_abs >= beta_val_vec); + input1 = norm_val_vec * output * grad_output1; + return convert_float_bfloat16(input0, input1); + } + ); + } else { + AT_DISPATCH_ALL_TYPES(dtype, "smooth_l1_backward_cpu_out", [&] { auto norm_val = norm.to(); scalar_t beta_val(beta); auto norm_val_vec = Vectorized(norm_val); @@ -126,6 +174,7 @@ static void smooth_l1_backward_cpu_kernel(TensorIterator& iter, const Scalar& no } ); }); + } } static void huber_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, double delta) { diff --git a/aten/src/ATen/native/cpu/PowKernel.cpp b/aten/src/ATen/native/cpu/PowKernel.cpp index a13c4bca88ae..bade9772f697 100644 --- a/aten/src/ATen/native/cpu/PowKernel.cpp +++ b/aten/src/ATen/native/cpu/PowKernel.cpp @@ -69,7 +69,7 @@ void pow_tensor_scalar_optimized_kernel(TensorIteratorBase& iter, const exp_scal ); } else if (exp == -2.0) { cpu_kernel_vec(iter, - [](scalar_t base) -> scalar_t { + [](scalar_t base) __ubsan_ignore_float_divide_by_zero__ -> scalar_t { return static_cast(1.0) / (base * base); }, [](Vec base) -> Vec { return (base * base).reciprocal(); } ); diff --git a/aten/src/ATen/native/cpu/Reduce.h b/aten/src/ATen/native/cpu/Reduce.h index 083f9cf19b16..58f39f156677 100644 --- a/aten/src/ATen/native/cpu/Reduce.h +++ b/aten/src/ATen/native/cpu/Reduce.h @@ -133,7 +133,7 @@ static void set_results(const res_t result, const TensorIteratorBase &iter, cons template static inline typename std::enable_if::type -for_each_in_tuple(const std::tuple& t, const TensorIteratorBase &iter, const int num_outputs) { +for_each_in_tuple(const std::tuple& /*t*/, const TensorIteratorBase& /*iter*/, const int /*num_outputs*/) { return i; } diff --git a/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp index 90bac8aab63f..c3d8ba7d136d 100644 --- a/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp @@ -1,11 +1,12 @@ -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include #include #include -#include -#include -#include +#include #include #include @@ -30,7 +31,7 @@ inline void reduce_all_impl_vec( auto input_data = input.data_ptr(); // NOTE: parallel_reduce not support bool type scalar_t result = at::parallel_reduce(0, input_numel, internal::GRAIN_SIZE, ident_v, - [&](int64_t start, int64_t end, const scalar_t ident) -> scalar_t { + [&](int64_t start, int64_t end, const scalar_t /*ident*/) -> scalar_t { scalar_t partial_out = vec::reduce_all( [=](Vec x, Vec y) { return vop(x, y); }, input_data + start, diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp index 67d8036f701c..52e18faf737d 100644 --- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp @@ -1,18 +1,23 @@ -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#include #include #include #include #include -#include #include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + #include #include #include 
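For the new PixelShuffleKernel.cpp above, a plain-loop reference of the contiguous pixel_shuffle mapping may help: the input [N, C*r*r, H, W] is viewed as [N, C, r, r, H, W] and permuted to [N, C, H, r, W, r]. This is only an illustrative sketch (no parallel_for, no vectorization, no channels-last path):

#include <cstdint>
#include <cstdio>
#include <vector>

// output[n][c][h*r + s1][w*r + s2] = input[n][c*r*r + s1*r + s2][h][w]
std::vector<float> pixel_shuffle_ref(const std::vector<float>& in,
                                     int64_t N, int64_t C, int64_t H, int64_t W, int64_t r) {
  std::vector<float> out(in.size());  // [N, C, H*r, W*r] has the same numel
  for (int64_t n = 0; n < N; ++n)
    for (int64_t c = 0; c < C; ++c)
      for (int64_t h = 0; h < H; ++h)
        for (int64_t s1 = 0; s1 < r; ++s1)
          for (int64_t w = 0; w < W; ++w)
            for (int64_t s2 = 0; s2 < r; ++s2) {
              const int64_t in_idx  = ((n * C * r * r + c * r * r + s1 * r + s2) * H + h) * W + w;
              const int64_t out_idx = ((n * C + c) * (H * r) + h * r + s1) * (W * r) + w * r + s2;
              out[out_idx] = in[in_idx];
            }
  return out;
}

int main() {
  // 1x(1*2*2)x1x1 input: the four channels become one 2x2 spatial block.
  auto out = pixel_shuffle_ref({10.f, 11.f, 12.f, 13.f}, 1, 1, 1, 1, 2);
  for (float v : out) std::printf("%.0f ", v);  // 10 11 12 13
}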
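And a scalar reference for the BFloat16 smooth_l1 backward path added in PointwiseOpsKernel.cpp above: with x = input - target, the gradient is -1 for x <= -beta, +1 for x >= beta, and x/beta in between, scaled by norm and by the incoming grad_output. Sketch only; the vectorized kernel computes the same thing with two blendv selections per float lane:

#include <cstdio>

float smooth_l1_backward_ref(float input, float target, float grad_output,
                             float norm, float beta) {
  const float x = input - target;
  if (x <= -beta) return -norm * grad_output;  // saturated, negative side
  if (x >=  beta) return  norm * grad_output;  // saturated, positive side
  return norm * x * grad_output / beta;        // linear region
}

int main() {
  // With beta = 1 the gradient is linear inside (-1, 1) and saturates outside.
  std::printf("%f\n", smooth_l1_backward_ref(0.25f, 0.0f, 1.0f, 1.0f, 1.0f));  // 0.25
  std::printf("%f\n", smooth_l1_backward_ref(3.0f,  0.0f, 1.0f, 1.0f, 1.0f));  // 1.0
}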
@@ -74,7 +79,7 @@ static void cumsum_cpu_kernel(const Tensor& result, const Tensor& self, int64_t auto wrap_dim = maybe_wrap_dim(dim, self.dim()); int64_t self_dim_size = ensure_nonempty_size(self, wrap_dim); - AT_DISPATCH_ALL_TYPES_AND_COMPLEX(self.scalar_type(), "cumsum_out_cpu", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, self.scalar_type(), "cumsum_out_cpu", [&] { cpu_cum_base_kernel(result, self, wrap_dim, [&] ( scalar_t* result_data, auto result_dim_stride, const scalar_t* self_data, auto self_dim_stride, scalar_t init_val) { @@ -93,7 +98,7 @@ static void cumprod_cpu_kernel(const Tensor& result, const Tensor& self, int64_t auto wrap_dim = maybe_wrap_dim(dim, self.dim()); int64_t self_dim_size = ensure_nonempty_size(self, wrap_dim); - AT_DISPATCH_ALL_TYPES_AND_COMPLEX(self.scalar_type(), "cumprod_out_cpu", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, self.scalar_type(), "cumprod_out_cpu", [&] { cpu_cum_base_kernel(result, self, wrap_dim, [&] ( scalar_t* result_data, auto result_dim_stride, const scalar_t* self_data, auto self_dim_stride, scalar_t init_val) { @@ -112,18 +117,19 @@ static void logcumsumexp_cpu_kernel(Tensor& result, const Tensor& self, int64_t auto wrap_dim = maybe_wrap_dim(dim, self.dim()); int64_t self_dim_size = ensure_nonempty_size(self, wrap_dim); - AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "logcumsumexp_out_cpu", [&] { + AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16, self.scalar_type(), "logcumsumexp_out_cpu", [&] { cpu_cum_base_kernel(result, self, wrap_dim, [&] ( scalar_t* result_data, auto result_dim_stride, const scalar_t* self_data, auto self_dim_stride, scalar_t init_val) { - scalar_t cum_number = (at::acc_type)init_val; + using accscalar_t = at::acc_type; + auto cum_number = (accscalar_t)init_val; for (const auto i : c10::irange(self_dim_size)) { - scalar_t x = self_data[i * self_dim_stride]; + accscalar_t x = self_data[i * self_dim_stride]; // Reference : https://www.tensorflow.org/api_docs/python/tf/math/cumulative_logsumexp - auto log_add_exp = [](scalar_t x, scalar_t y) -> scalar_t { - scalar_t min = std::isnan(y) ? y : std::min(x,y); //std::min returns first arg if one of the args is nan - scalar_t max = std::isnan(y) ? y : std::max(x,y); //std::max returns first arg if one of the args is nan + auto log_add_exp = [](accscalar_t x, accscalar_t y) -> accscalar_t { + accscalar_t min = std::isnan(y) ? y : std::min(x,y); //std::min returns first arg if one of the args is nan + accscalar_t max = std::isnan(y) ? y : std::max(x,y); //std::max returns first arg if one of the args is nan if (min != max || std::isfinite(min)) { // nan will be propagated here return std::log1p(std::exp(min - max)) + max; @@ -218,6 +224,10 @@ static void norm_kernel_tensor_iterator_impl( } else { AT_ERROR("norm_kernel_tensor_iterator_impl expects norm to be integer or float"); } + if (iter.numel() == 0) { + iter.output().fill_((val < 0) ? 
INFINITY : 0); + return; + } bool use_fast_path = is_reduce_lastdim(iter) && iter.dtype(0) == iter.input_dtype() && (iter.input_dtype() == kFloat || iter.input_dtype() == kBFloat16); @@ -297,7 +307,7 @@ static void norm_kernel_tensor_iterator_impl( binary_kernel_reduce( iter, AbsMinOps(), - std::numeric_limits::max() + std::numeric_limits::infinity() ); }); } else { diff --git a/aten/src/ATen/native/cpu/RenormKernel.cpp b/aten/src/ATen/native/cpu/RenormKernel.cpp index 532dea3e59ab..0a9a0d0df352 100644 --- a/aten/src/ATen/native/cpu/RenormKernel.cpp +++ b/aten/src/ATen/native/cpu/RenormKernel.cpp @@ -1,5 +1,6 @@ +#define TORCH_ASSERT_NO_OPERATORS #include -#include +#include #include #include diff --git a/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp b/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp index 45c86ebdd181..d43f107a5502 100644 --- a/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp +++ b/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp @@ -1,7 +1,11 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include #include +#include +#include +#include #include #include @@ -32,6 +36,33 @@ class ReduceAdd { }; static ReduceAdd reduce_add; +class ReduceMean { +public: + template + constexpr void operator() (scalar_t * self_data, scalar_t * src_data) const { + *self_data += *src_data; + } +}; +static ReduceMean reduce_mean; + +class ReduceMaximum { +public: + template + constexpr void operator() (scalar_t * self_data, scalar_t * src_data) const { + *self_data = at::_isnan(*src_data) ? *src_data : std::max(*self_data, *src_data); + } +}; +static ReduceMaximum reduce_maximum; + +class ReduceMinimum { +public: + template + constexpr void operator() (scalar_t * self_data, scalar_t * src_data) const { + *self_data = at::_isnan(*src_data) ? *src_data : std::min(*self_data, *src_data); + } +}; +static ReduceMinimum reduce_minimum; + class TensorAssign { public: template @@ -280,6 +311,273 @@ struct cpu_scatter_gather_base_kernel { } ); } + + void operator()(const Tensor& self, int64_t dim, + const Tensor& index, const Tensor& src, + const std::string& method_name, ReduceMean& kernel_func) { + + auto iter = TensorIteratorConfig() + .check_all_same_dtype(false) + .resize_outputs(false) + // NOLINTNEXTLINE(bugprone-argument-comment) + .declare_static_shape(index.sizes(), /*squash_dim=*/dim) + .add_output(self) + .add_input(src) + .add_input(index) + .build(); + + auto self_dim_stride = ensure_nonempty_stride(self, dim); + auto self_dim_size = ensure_nonempty_size(self, dim); + + auto index_dim_stride = ensure_nonempty_stride(index, dim); + auto index_dim_size = ensure_nonempty_size(index, dim); + + auto src_dim_stride = ensure_nonempty_stride(src, dim); + auto src_dim_size = ensure_nonempty_size(src, dim); + + auto index_upper_bound = is_scatter_like ? 
self_dim_size : src_dim_size; + + int64_t grain_size = std::max((int64_t) 1, at::internal::GRAIN_SIZE / index_dim_size); + + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( + ScalarType::Half, ScalarType::BFloat16, iter.dtype(), + "scatter_gather_tensor_cpu_reduce_mean", [&] { + constexpr auto SELF_ITER_STRIDE_IDX = 0; + constexpr auto INDEX_ITER_STRIDE_IDX = 2; + constexpr auto SRC_ITER_STRIDE_IDX = 1; + auto loop = [&](char** data, const int64_t* strides, int64_t n) { + auto* self_data_bytes = data[SELF_ITER_STRIDE_IDX]; + auto* index_data_bytes = data[INDEX_ITER_STRIDE_IDX]; + auto* src_data_bytes = data[SRC_ITER_STRIDE_IDX]; + // we change the order of TensorIterator-dim loop + // vs dim-TensorIterator loop order depending on + // whether dim is the last dimension + if (dim== self.dim() - 1) { + for (const auto nelem : c10::irange(n)) { + (void)nelem; //Suppress unused variable warning + // dim loop is a separate code block + // for better performance + _cpu_scatter_gather_dim_loop()( + (scalar_t*)self_data_bytes, self_dim_stride, + (int64_t*)index_data_bytes, index_dim_stride, + (scalar_t*)src_data_bytes, src_dim_stride, + dim, index_dim_size, index_upper_bound, + kernel_func + ); + + self_data_bytes += strides[SELF_ITER_STRIDE_IDX]; + index_data_bytes += strides[INDEX_ITER_STRIDE_IDX]; + src_data_bytes += strides[SRC_ITER_STRIDE_IDX]; + } + } + else { + for (const auto i : c10::irange(index_dim_size)) { + auto* self_data = self_data_bytes; + auto* index_data = (char*)((int64_t*)index_data_bytes + i * index_dim_stride); + auto* src_data = src_data_bytes; + for (const auto nelem : c10::irange(n)) { + (void)nelem; //Suppress unused variable warning + int64_t idx_dim = *(int64_t*)index_data; + // we are not putting idx_dim in the error message because it disables + // loop optimization in clang-7 + TORCH_CHECK(idx_dim >= 0 && idx_dim < index_upper_bound, + "index ", *(int64_t*)index_data, + " is out of bounds for dimension ", dim, + " with size ", index_upper_bound); + + kernel_func( + (scalar_t*)self_data + (is_scatter_like ? idx_dim : i) * self_dim_stride, + (scalar_t*)src_data + (is_scatter_like ? i : idx_dim) * src_dim_stride); + + self_data += strides[SELF_ITER_STRIDE_IDX]; + index_data += strides[INDEX_ITER_STRIDE_IDX]; + src_data += strides[SRC_ITER_STRIDE_IDX]; + } + } + } + }; + iter.for_each(loop, grain_size); + } + ); + } + + void operator()(const Tensor& self, int64_t dim, + const Tensor& index, const Tensor& src, + const std::string& method_name, ReduceMaximum& kernel_func) { + + auto iter = TensorIteratorConfig() + .check_all_same_dtype(false) + .resize_outputs(false) + // NOLINTNEXTLINE(bugprone-argument-comment) + .declare_static_shape(index.sizes(), /*squash_dim=*/dim) + .add_output(self) + .add_input(src) + .add_input(index) + .build(); + + auto self_dim_stride = ensure_nonempty_stride(self, dim); + auto self_dim_size = ensure_nonempty_size(self, dim); + + auto index_dim_stride = ensure_nonempty_stride(index, dim); + auto index_dim_size = ensure_nonempty_size(index, dim); + + auto src_dim_stride = ensure_nonempty_stride(src, dim); + auto src_dim_size = ensure_nonempty_size(src, dim); + + auto index_upper_bound = is_scatter_like ? 
self_dim_size : src_dim_size; + + int64_t grain_size = std::max((int64_t) 1, at::internal::GRAIN_SIZE / index_dim_size); + + AT_DISPATCH_ALL_TYPES_AND3( + ScalarType::Bool, ScalarType::Half, ScalarType::BFloat16, iter.dtype(), + "scatter_gather_tensor_cpu_reduce_amax", [&] { + constexpr auto SELF_ITER_STRIDE_IDX = 0; + constexpr auto INDEX_ITER_STRIDE_IDX = 2; + constexpr auto SRC_ITER_STRIDE_IDX = 1; + auto loop = [&](char** data, const int64_t* strides, int64_t n) { + auto* self_data_bytes = data[SELF_ITER_STRIDE_IDX]; + auto* index_data_bytes = data[INDEX_ITER_STRIDE_IDX]; + auto* src_data_bytes = data[SRC_ITER_STRIDE_IDX]; + // we change the order of TensorIterator-dim loop + // vs dim-TensorIterator loop order depending on + // whether dim is the last dimension + if (dim== self.dim() - 1) { + for (const auto nelem : c10::irange(n)) { + (void)nelem; //Suppress unused variable warning + // dim loop is a separate code block + // for better performance + _cpu_scatter_gather_dim_loop()( + (scalar_t*)self_data_bytes, self_dim_stride, + (int64_t*)index_data_bytes, index_dim_stride, + (scalar_t*)src_data_bytes, src_dim_stride, + dim, index_dim_size, index_upper_bound, + kernel_func + ); + + self_data_bytes += strides[SELF_ITER_STRIDE_IDX]; + index_data_bytes += strides[INDEX_ITER_STRIDE_IDX]; + src_data_bytes += strides[SRC_ITER_STRIDE_IDX]; + } + } + else { + for (const auto i : c10::irange(index_dim_size)) { + auto* self_data = self_data_bytes; + auto* index_data = (char*)((int64_t*)index_data_bytes + i * index_dim_stride); + auto* src_data = src_data_bytes; + for (const auto nelem : c10::irange(n)) { + (void)nelem; //Suppress unused variable warning + int64_t idx_dim = *(int64_t*)index_data; + // we are not putting idx_dim in the error message because it disables + // loop optimization in clang-7 + TORCH_CHECK(idx_dim >= 0 && idx_dim < index_upper_bound, + "index ", *(int64_t*)index_data, + " is out of bounds for dimension ", dim, + " with size ", index_upper_bound); + + kernel_func( + (scalar_t*)self_data + (is_scatter_like ? idx_dim : i) * self_dim_stride, + (scalar_t*)src_data + (is_scatter_like ? i : idx_dim) * src_dim_stride); + + self_data += strides[SELF_ITER_STRIDE_IDX]; + index_data += strides[INDEX_ITER_STRIDE_IDX]; + src_data += strides[SRC_ITER_STRIDE_IDX]; + } + } + } + }; + iter.for_each(loop, grain_size); + } + ); + } + + void operator()(const Tensor& self, int64_t dim, + const Tensor& index, const Tensor& src, + const std::string& method_name, ReduceMinimum& kernel_func) { + + auto iter = TensorIteratorConfig() + .check_all_same_dtype(false) + .resize_outputs(false) + // NOLINTNEXTLINE(bugprone-argument-comment) + .declare_static_shape(index.sizes(), /*squash_dim=*/dim) + .add_output(self) + .add_input(src) + .add_input(index) + .build(); + + auto self_dim_stride = ensure_nonempty_stride(self, dim); + auto self_dim_size = ensure_nonempty_size(self, dim); + + auto index_dim_stride = ensure_nonempty_stride(index, dim); + auto index_dim_size = ensure_nonempty_size(index, dim); + + auto src_dim_stride = ensure_nonempty_stride(src, dim); + auto src_dim_size = ensure_nonempty_size(src, dim); + + auto index_upper_bound = is_scatter_like ? 
self_dim_size : src_dim_size; + + int64_t grain_size = std::max((int64_t) 1, at::internal::GRAIN_SIZE / index_dim_size); + + AT_DISPATCH_ALL_TYPES_AND3( + ScalarType::Bool, ScalarType::Half, ScalarType::BFloat16, iter.dtype(), + "scatter_gather_tensor_cpu_reduce_amin", [&] { + constexpr auto SELF_ITER_STRIDE_IDX = 0; + constexpr auto INDEX_ITER_STRIDE_IDX = 2; + constexpr auto SRC_ITER_STRIDE_IDX = 1; + auto loop = [&](char** data, const int64_t* strides, int64_t n) { + auto* self_data_bytes = data[SELF_ITER_STRIDE_IDX]; + auto* index_data_bytes = data[INDEX_ITER_STRIDE_IDX]; + auto* src_data_bytes = data[SRC_ITER_STRIDE_IDX]; + // we change the order of TensorIterator-dim loop + // vs dim-TensorIterator loop order depending on + // whether dim is the last dimension + if (dim== self.dim() - 1) { + for (const auto nelem : c10::irange(n)) { + (void)nelem; //Suppress unused variable warning + // dim loop is a separate code block + // for better performance + _cpu_scatter_gather_dim_loop()( + (scalar_t*)self_data_bytes, self_dim_stride, + (int64_t*)index_data_bytes, index_dim_stride, + (scalar_t*)src_data_bytes, src_dim_stride, + dim, index_dim_size, index_upper_bound, + kernel_func + ); + + self_data_bytes += strides[SELF_ITER_STRIDE_IDX]; + index_data_bytes += strides[INDEX_ITER_STRIDE_IDX]; + src_data_bytes += strides[SRC_ITER_STRIDE_IDX]; + } + } + else { + for (const auto i : c10::irange(index_dim_size)) { + auto* self_data = self_data_bytes; + auto* index_data = (char*)((int64_t*)index_data_bytes + i * index_dim_stride); + auto* src_data = src_data_bytes; + for (const auto nelem : c10::irange(n)) { + (void)nelem; //Suppress unused variable warning + int64_t idx_dim = *(int64_t*)index_data; + // we are not putting idx_dim in the error message because it disables + // loop optimization in clang-7 + TORCH_CHECK(idx_dim >= 0 && idx_dim < index_upper_bound, + "index ", *(int64_t*)index_data, + " is out of bounds for dimension ", dim, + " with size ", index_upper_bound); + + kernel_func( + (scalar_t*)self_data + (is_scatter_like ? idx_dim : i) * self_dim_stride, + (scalar_t*)src_data + (is_scatter_like ? 
i : idx_dim) * src_dim_stride); + + self_data += strides[SELF_ITER_STRIDE_IDX]; + index_data += strides[INDEX_ITER_STRIDE_IDX]; + src_data += strides[SRC_ITER_STRIDE_IDX]; + } + } + } + }; + iter.for_each(loop, grain_size); + } + ); + } }; void gather_cpu_kernel(const Tensor& result, const Tensor& self, int64_t dim, const Tensor& index) { @@ -316,6 +614,34 @@ void scatter_reduce_cpu_kernel(const Tensor& self, const int64_t dim, const Tens cpu_scatter_gather_base_kernel<>()(self, dim, index, src, "scatter_reduce_multiply_", reduce_multiply); break; + default : + break; + } +} + +void scatter_reduce_two_cpu_kernel(const Tensor& self, const int64_t dim, const Tensor& index, + const Tensor& src, const SCATTER_GATHER_OP& reduce) { + switch (reduce) { + case SCATTER_GATHER_OP::REDUCE_ADD : + cpu_scatter_gather_base_kernel<>()(self, dim, index, src, + "scatter_reduce_sum_", reduce_add); + break; + case SCATTER_GATHER_OP::REDUCE_MULTIPLY : + cpu_scatter_gather_base_kernel<>()(self, dim, index, src, + "scatter_reduce_prod_", reduce_multiply); + break; + case SCATTER_GATHER_OP::REDUCE_MAXIMUM : + cpu_scatter_gather_base_kernel<>()(self, dim, index, src, + "scatter_reduce_amax_", reduce_maximum); + break; + case SCATTER_GATHER_OP::REDUCE_MINIMUM : + cpu_scatter_gather_base_kernel<>()(self, dim, index, src, + "scatter_reduce_amin_", reduce_minimum); + break; + case SCATTER_GATHER_OP::REDUCE_MEAN : + cpu_scatter_gather_base_kernel<>()(self, dim, index, src, + "scatter_reduce_mean_", reduce_mean); + break; } } @@ -330,6 +656,8 @@ void scatter_scalar_reduce_cpu_kernel(const Tensor& self, const int64_t dim, con cpu_scatter_gather_base_kernel<>()(self, dim, index, value, "scatter_scalar_reduce_multiply_", reduce_multiply); break; + default: + break; } } @@ -341,5 +669,6 @@ REGISTER_DISPATCH(scatter_fill_stub, &scatter_fill_cpu_kernel); REGISTER_DISPATCH(scatter_add_stub, &scatter_add_cpu_kernel); REGISTER_DISPATCH(scatter_reduce_stub, &scatter_reduce_cpu_kernel); REGISTER_DISPATCH(scatter_scalar_reduce_stub, &scatter_scalar_reduce_cpu_kernel); +REGISTER_DISPATCH(scatter_reduce_two_stub, &scatter_reduce_two_cpu_kernel); }} // namespace at::native diff --git a/aten/src/ATen/native/cpu/SerialStackImpl.h b/aten/src/ATen/native/cpu/SerialStackImpl.h index 3f509b0c6306..682161372009 100644 --- a/aten/src/ATen/native/cpu/SerialStackImpl.h +++ b/aten/src/ATen/native/cpu/SerialStackImpl.h @@ -1,10 +1,11 @@ // Copyright 2004-present Facebook. All Rights Reserved. #pragma once -#include +#include #include #include +#include #include #include #include @@ -111,7 +112,7 @@ bool can_use_native_serial_stack_impl(Tensor& result, TensorListType tensors, in // or there is only one thread. Note that we aren't checking result.numel() here because // it may not have been resized and we want to defer that cost till later. 
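The logcumsumexp change in ReduceOpsKernel.cpp above runs the recurrence in the accumulation type (float for BFloat16 inputs) instead of the storage type. A sketch of the underlying numerically stable log-add-exp step, with the same NaN handling as the kernel's lambda:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <limits>
#include <vector>

double log_add_exp(double x, double y) {
  // std::min/std::max would silently drop a NaN in the second argument.
  const double mn = std::isnan(y) ? y : std::min(x, y);
  const double mx = std::isnan(y) ? y : std::max(x, y);
  if (mn != mx || std::isfinite(mn)) {
    return std::log1p(std::exp(mn - mx)) + mx;  // NaN propagates here
  }
  return x;  // both operands are the same infinity
}

std::vector<double> logcumsumexp_ref(const std::vector<double>& v) {
  std::vector<double> out(v.size());
  double acc = -std::numeric_limits<double>::infinity();
  for (size_t i = 0; i < v.size(); ++i) { acc = log_add_exp(acc, v[i]); out[i] = acc; }
  return out;
}

int main() {
  auto r = logcumsumexp_ref({0.0, 0.0, 0.0});
  std::printf("%f\n", r.back());  // log(3) ~= 1.098612
}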
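The ReduceMaximum/ReduceMinimum functors added in ScatterGatherKernel.cpp above let a NaN coming from src win the comparison instead of being dropped by std::max/std::min. A minimal 1-D sketch of the amax scatter semantics (scatter_amax_1d is a hypothetical helper, not an ATen entry point, and it ignores bounds checking and parallelism):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// self[index[i]] = amax(self[index[i]], src[i]), with NaN in src propagating.
void scatter_amax_1d(std::vector<float>& self,
                     const std::vector<int64_t>& index,
                     const std::vector<float>& src) {
  for (size_t i = 0; i < index.size(); ++i) {
    float& dst = self[index[i]];
    dst = std::isnan(src[i]) ? src[i] : std::max(dst, src[i]);
  }
}

int main() {
  std::vector<float> self = {0.f, 0.f};
  scatter_amax_1d(self, {0, 0, 1}, {3.f, 2.f, -1.f});
  std::printf("%f %f\n", self[0], self[1]);  // 3.0 0.0
}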
int64_t numel_in_stack = first_tensor.numel() * tensors.size(); - return numel_in_stack < at::internal::GRAIN_SIZE && at::get_num_threads() == 1; + return numel_in_stack < at::internal::GRAIN_SIZE || at::get_num_threads() == 1; } template diff --git a/aten/src/ATen/native/cpu/SoftMaxKernel.cpp b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp index 50a9b2350b1c..908d4fc60b7b 100644 --- a/aten/src/ATen/native/cpu/SoftMaxKernel.cpp +++ b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp @@ -1,3 +1,4 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include @@ -6,12 +7,13 @@ #include #include +#include +#include #include #include #include #include -#include // [Note AVX-SSE transitions] In general we avoid calls into cmath for code // compiled with AVX/AVX2 This is because of SSE-AVX transitions and a bug in // Glibc2.23 See https://bugs.launchpad.net/ubuntu/+source/glibc/+bug/1663280 @@ -41,11 +43,11 @@ inline void _vec_log_softmax_lastdim( outer_size, grain_size, [&](int64_t begin, int64_t end) { + // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays) + scalar_t tmp_sum_scalar[CHUNK_SIZE]; + // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays) + scalar_t max_input_arr[CHUNK_SIZE]; for (int64_t ii = begin; ii < end; ii += CHUNK_SIZE) { - // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays) - scalar_t tmp_sum_scalar[CHUNK_SIZE]; - // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays) - scalar_t max_input_arr[CHUNK_SIZE]; int64_t loop_end = CHUNK_SIZE; if (ii + CHUNK_SIZE > end) loop_end = end - ii; @@ -102,38 +104,97 @@ inline void _vec_softmax_lastdim( scalar_t* output_data_base, int64_t outer_size, int64_t dim_size) { - using Vec = vec::Vectorized>; - int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size); - if (grain_size < 1) - grain_size = 1; + using Vec = vec::Vectorized; + int64_t grain_size = std::max(internal::GRAIN_SIZE / (16 * dim_size), (int64_t)1); + parallel_for(0, outer_size, grain_size, [&](int64_t begin, int64_t end) { + for (const auto i : c10::irange(begin, end)) { + scalar_t* input_data = input_data_base + i * dim_size; + scalar_t* output_data = output_data_base + i * dim_size; + scalar_t max_input = vec::reduce_all( + [](Vec& x, Vec& y) { return vec::maximum(x, y); }, + input_data, + dim_size); + vec::map( + [max_input](Vec x) { return (x - Vec(max_input)).exp(); }, + output_data, + input_data, + dim_size); + scalar_t tmp_sum = vec::reduce_all( + [](Vec x, Vec y) { return x + y; }, output_data, dim_size); + tmp_sum = 1 / tmp_sum; + vec::map( + [tmp_sum](Vec x) { return x * Vec(tmp_sum); }, + output_data, + output_data, + dim_size); + } + }); +} - parallel_for( - 0, - outer_size, - grain_size, - [&](int64_t begin, int64_t end) { - for (const auto i : c10::irange(begin, end)) { - scalar_t* input_data = input_data_base + i * dim_size; - scalar_t* output_data = output_data_base + i * dim_size; - scalar_t max_input = vec::reduce_all( - [](Vec& x, Vec& y) { return vec::maximum(x, y); }, - input_data, - dim_size); - vec::map( - [max_input](Vec x) { return (x - Vec(max_input)).exp(); }, - output_data, - input_data, - dim_size); - scalar_t tmp_sum = vec::reduce_all( - [](Vec x, Vec y) { return x + y; }, output_data, dim_size); - tmp_sum = 1 / tmp_sum; - vec::map( - [tmp_sum](Vec x) { return x * Vec(tmp_sum); }, - output_data, - output_data, - dim_size); - } - }); +template <> +inline void _vec_softmax_lastdim( + BFloat16* input_data_base, + BFloat16* output_data_base, + int64_t 
outer_size, + int64_t dim_size) { + using bVec = vec::Vectorized; + using fVec = vec::Vectorized; + int64_t grain_size = std::max(internal::GRAIN_SIZE / (16 * dim_size), (int64_t)1); + parallel_for(0, outer_size, grain_size, [&](int64_t begin, int64_t end) { + // thread local temp buffer. + std::unique_ptr buffer(new float[dim_size]); + float* buffer_data = buffer.get(); + + for (const auto i : c10::irange(begin, end)) { + BFloat16* input_data = input_data_base + i * dim_size; + BFloat16* output_data = output_data_base + i * dim_size; + // reduce to max and cache float input data + fVec max_fvec = fVec(-std::numeric_limits::infinity()); + int64_t d0 = 0; + for (; d0 < dim_size - (dim_size % bVec::size()); d0 += bVec::size()) { + bVec data_bvec = bVec::loadu(input_data + d0); + fVec data_fvec0, data_fvec1; + std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec); + max_fvec = vec::maximum(max_fvec, data_fvec0); + max_fvec = vec::maximum(max_fvec, data_fvec1); + data_fvec0.store(buffer_data + d0); + data_fvec1.store(buffer_data + d0 + fVec::size()); + } + float max_val = vec::vec_reduce_all([](fVec& x, fVec& y) { return vec::maximum(x, y); }, max_fvec); + for (; d0 < dim_size; d0++) { + float data_val = input_data[d0]; + max_val = std::max(max_val, data_val); + buffer_data[d0] = data_val; + } + + // map (x - max).exp() and reduce to sum + fVec sum_fvec = fVec(float(0)); + int64_t d1 = 0; + for (; d1 < dim_size - (dim_size % fVec::size()); d1 += fVec::size()) { + fVec data_fvec = (fVec::loadu(buffer_data + d1) - fVec(max_val)).exp(); + sum_fvec += data_fvec; + data_fvec.store(buffer_data + d1); + } + float sum_val = vec::vec_reduce_all([](fVec& x, fVec& y) { return x + y; }, sum_fvec); + for (; d1 < dim_size; d1++) { + float data_val = std::exp(buffer_data[d1] - max_val); + sum_val += data_val; + buffer_data[d1] = data_val; + } + + sum_val = 1 / sum_val; + int64_t d2 = 0; + for (; d2 < dim_size - (dim_size % bVec::size()); d2 += bVec::size()) { + fVec out_fvec0 = fVec::loadu(buffer_data + d2) * fVec(sum_val); + fVec out_fvec1 = fVec::loadu(buffer_data + d2 + fVec::size()) * fVec(sum_val); + bVec out_bvec = convert_float_bfloat16(out_fvec0, out_fvec1); + out_bvec.store(output_data + d2); + } + for (; d2 < dim_size; d2++) { + output_data[d2] = BFloat16(buffer_data[d2] * sum_val); + } + } + }); } template diff --git a/aten/src/ATen/native/cpu/SoftmaxKernel.h b/aten/src/ATen/native/cpu/SoftmaxKernel.h index a393c08056e2..f9af73903454 100644 --- a/aten/src/ATen/native/cpu/SoftmaxKernel.h +++ b/aten/src/ATen/native/cpu/SoftmaxKernel.h @@ -1,9 +1,11 @@ #pragma once -#include #include +#include namespace at { +class Tensor; + namespace native { using forward_fn = void (*)(const Tensor&, const Tensor&); diff --git a/aten/src/ATen/native/cpu/SortingKernel.cpp b/aten/src/ATen/native/cpu/SortingKernel.cpp index 8eab924407d1..b756c6c46a7e 100644 --- a/aten/src/ATen/native/cpu/SortingKernel.cpp +++ b/aten/src/ATen/native/cpu/SortingKernel.cpp @@ -1,42 +1,27 @@ -#include +#define TORCH_ASSERT_NO_OPERATORS +#include +#include #include #include #include -#include +#include #include #include -#include -#include +#include +#include #include namespace at { namespace native { namespace { -void _fill_indices(Tensor& indices, int64_t dim) { - auto dim_size = indices.size(dim); - auto idx_dim = at::arange(0, dim_size, indices.options().dtype(at::kLong)); - auto idx_dim_sizes = std::vector(indices.dim(), 1); - auto idx_dim_strides = std::vector(indices.dim(), 0); - idx_dim_sizes[dim] = dim_size; - 
idx_dim_strides[dim] = 1; - auto idx_dim_restrided = idx_dim.as_strided(idx_dim_sizes, idx_dim_strides); - indices.copy_(idx_dim_restrided); -} - template void _dim_apply( - Tensor& values, - Tensor& indices, + const TensorBase &values, + const TensorBase &indices, int64_t dim, const std::string& method_name, const func_t& f) { - dim = maybe_wrap_dim(dim, values.dim()); - TORCH_CHECK( - dim >= 0 && dim < values.dim(), - method_name, "(): invalid dimension parameter ", dim - ); - auto iter = TensorIteratorConfig() .check_all_same_dtype(false) .resize_outputs(false) @@ -56,6 +41,10 @@ void _dim_apply( auto* values_data_bytes = data[0]; auto* indices_data_bytes = data[1]; + if(values_data_bytes==nullptr || indices_data_bytes==nullptr){ + return; + } + for (const auto i : c10::irange(n)) { (void)i; //Suppress unused variable warning f( @@ -95,8 +84,9 @@ struct KeyValueCompDesc { }; static void sort_kernel( - Tensor& values, - Tensor& indices, + const TensorBase& self, + const TensorBase& values, + const TensorBase& indices, int64_t dim, bool descending, bool stable) { @@ -143,9 +133,9 @@ static void sort_kernel( } static void topk_kernel( - const Tensor& values, - const Tensor& indices, - const Tensor& self, + const TensorBase &values, + const TensorBase &indices, + const TensorBase &self, int64_t k, int64_t dim, bool largest, diff --git a/aten/src/ATen/native/cpu/StackKernel.cpp b/aten/src/ATen/native/cpu/StackKernel.cpp index 8a6615c0d277..6e9248149d8a 100644 --- a/aten/src/ATen/native/cpu/StackKernel.cpp +++ b/aten/src/ATen/native/cpu/StackKernel.cpp @@ -1,6 +1,6 @@ // Copyright 2004-present Facebook. All Rights Reserved. - -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include diff --git a/aten/src/ATen/native/cpu/StackKernel.h b/aten/src/ATen/native/cpu/StackKernel.h index abb72f9dba7f..4e9a45e4dd12 100644 --- a/aten/src/ATen/native/cpu/StackKernel.h +++ b/aten/src/ATen/native/cpu/StackKernel.h @@ -1,7 +1,7 @@ // Copyright 2004-present Facebook. All Rights Reserved. 
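The BFloat16 specialization of _vec_softmax_lastdim in SoftMaxKernel.cpp above follows a common pattern: load BFloat16, widen to float into a per-thread buffer, run the max / exp-sum / normalize passes in float, and convert back only on the final store. A scalar sketch of the same three passes, with float standing in for the widened accumulation type:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <limits>
#include <vector>

// Row-wise softmax over the last dimension: subtract the row max for stability,
// exponentiate, then normalize by the sum.
void softmax_lastdim_ref(const std::vector<float>& in, std::vector<float>& out,
                         int64_t rows, int64_t dim_size) {
  for (int64_t r = 0; r < rows; ++r) {
    const float* x = in.data() + r * dim_size;
    float* y = out.data() + r * dim_size;
    float mx = -std::numeric_limits<float>::infinity();
    for (int64_t d = 0; d < dim_size; ++d) mx = std::max(mx, x[d]);                      // pass 1: max
    float sum = 0.f;
    for (int64_t d = 0; d < dim_size; ++d) { y[d] = std::exp(x[d] - mx); sum += y[d]; }  // pass 2: exp + sum
    const float inv = 1.f / sum;
    for (int64_t d = 0; d < dim_size; ++d) y[d] *= inv;                                  // pass 3: normalize
  }
}

int main() {
  std::vector<float> in = {1.f, 2.f, 3.f}, out(3);
  softmax_lastdim_ref(in, out, 1, 3);
  std::printf("%f %f %f\n", out[0], out[1], out[2]);  // ~0.09 0.24 0.67
}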
#pragma once -#include +#include #include namespace at { namespace native { diff --git a/aten/src/ATen/native/cpu/TensorCompareKernel.cpp b/aten/src/ATen/native/cpu/TensorCompareKernel.cpp index 914f03a2d81c..27fa214fba1c 100644 --- a/aten/src/ATen/native/cpu/TensorCompareKernel.cpp +++ b/aten/src/ATen/native/cpu/TensorCompareKernel.cpp @@ -1,3 +1,5 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include @@ -10,13 +12,20 @@ #include #include #include +#include +#include #include #include -#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + namespace at { namespace native { namespace { template @@ -322,14 +331,18 @@ static void isin_default_kernel_cpu( }); } -static void clamp_kernel_impl(TensorIterator& iter) { +static void clamp_kernel_impl(TensorIteratorBase& iter) { AT_DISPATCH_ALL_TYPES_AND(kBFloat16, iter.common_dtype(), "clamp_cpu", [&]() { cpu_kernel_vec(iter, [](scalar_t a, scalar_t min, scalar_t max) -> scalar_t { - return std::min(std::max(a, min), max); + if (min != min || max != max) { + return std::numeric_limits::quiet_NaN(); + } else { + return std::min(std::max(a, min), max); + } }, [](Vectorized a, Vectorized min, Vectorized max) { - return vec::clamp(a, min, max); + return vec::minimum(vec::maximum(a, min), max); }); }); } @@ -350,18 +363,6 @@ static void clamp_scalar_kernel_impl(TensorIteratorBase& iter, const Scalar& min }); } -static void clamp_max_kernel_impl(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND(kBFloat16, iter.common_dtype(), "clamp_max_cpu", [&]() { - cpu_kernel_vec(iter, - [](scalar_t a, scalar_t max) -> scalar_t { - return std::min(a, max); - }, - [](Vectorized a, Vectorized max) { - return vec::clamp_max(a, max); - }); - }); -} - static void clamp_max_scalar_kernel_impl(TensorIteratorBase& iter, Scalar max_) { AT_DISPATCH_ALL_TYPES_AND(kBFloat16, iter.common_dtype(), "clamp_max_scalar_cpu", [&]() { const auto max = max_.to(); @@ -376,18 +377,6 @@ static void clamp_max_scalar_kernel_impl(TensorIteratorBase& iter, Scalar max_) }); } -static void clamp_min_kernel_impl(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND(kBFloat16, iter.common_dtype(), "clamp_min_cpu", [&]() { - cpu_kernel_vec(iter, - [](scalar_t a, scalar_t min) -> scalar_t { - return std::max(a, min); - }, - [](Vectorized a, Vectorized min) { - return vec::clamp_min(a, min); - }); - }); -} - static void clamp_min_scalar_kernel_impl(TensorIteratorBase& iter, Scalar min_) { AT_DISPATCH_ALL_TYPES_AND(kBFloat16, iter.common_dtype(), "clamp_min_cpu", [&]() { const auto min = min_.to(); @@ -412,8 +401,6 @@ REGISTER_DISPATCH(isposinf_stub, &isposinf_kernel_impl); REGISTER_DISPATCH(isneginf_stub, &isneginf_kernel_impl); REGISTER_DISPATCH(mode_stub, &mode_kernel_impl); REGISTER_DISPATCH(clamp_stub, &clamp_kernel_impl); -REGISTER_DISPATCH(clamp_min_stub, &clamp_min_kernel_impl); -REGISTER_DISPATCH(clamp_max_stub, &clamp_max_kernel_impl); REGISTER_DISPATCH(clamp_scalar_stub, &clamp_scalar_kernel_impl); REGISTER_DISPATCH(clamp_min_scalar_stub, &clamp_min_scalar_kernel_impl); REGISTER_DISPATCH(clamp_max_scalar_stub, &clamp_max_scalar_kernel_impl); diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index 8d862615cc5d..5e61823e1d25 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -174,12 +175,19 @@ void logit_kernel(TensorIteratorBase& iter, 
const Scalar& eps_scalar) { } static void abs_kernel(TensorIteratorBase& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, iter.dtype(), "abs_cpu", [&]() { - cpu_kernel_vec( - iter, - [=](scalar_t a) -> scalar_t { return abs_impl(a); }, - [=](Vectorized a) { return a.abs(); }); - }); + auto dtype = iter.dtype(); + if (dtype == kComplexHalf) { + using scalar_t = c10::complex; + using opmath_t = at::opmath_type; + cpu_kernel(iter, [=](scalar_t a) -> scalar_t { return abs_impl(opmath_t{a}); }); + } else { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, iter.dtype(), "abs_cpu", [&]() { + cpu_kernel_vec( + iter, + [=](scalar_t a) -> scalar_t { return abs_impl(a); }, + [=](Vectorized a) { return a.abs(); }); + }); + } } static void angle_kernel(TensorIteratorBase& iter) { @@ -191,28 +199,10 @@ static void angle_kernel(TensorIteratorBase& iter) { }); } -static void real_kernel(TensorIteratorBase& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, iter.dtype(), "real_cpu", [&]() { - cpu_kernel_vec( - iter, - [=](scalar_t a) -> scalar_t { return real_impl(a); }, - [=](Vectorized a) { return a.real(); }); - }); -} - -static void imag_kernel(TensorIteratorBase& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, iter.dtype(), "imag_cpu", [&]() { - cpu_kernel_vec( - iter, - [=](scalar_t a) -> scalar_t { return imag_impl(a); }, - [=](Vectorized a) { return a.imag(); }); - }); -} - // NB: Ignores the negative bit on tensors void conj_kernel(TensorIteratorBase& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( - kBool, kBFloat16, kHalf, iter.common_dtype(), "conj_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( + kBool, kBFloat16, kHalf, kComplexHalf, iter.common_dtype(), "conj_cpu", [&]() { cpu_kernel_vec( iter, [=](scalar_t a) -> scalar_t { return conj_impl(a); }, @@ -275,7 +265,7 @@ void reciprocal_kernel(TensorIteratorBase& iter) { // NB: Ignores the negative bit on tensors void neg_kernel(TensorIteratorBase& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, iter.dtype(), "neg_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kComplexHalf, kBFloat16, kHalf, iter.dtype(), "neg_cpu", [&]() { cpu_kernel_vec( iter, [=](scalar_t a) -> scalar_t { return -a; }, @@ -312,13 +302,21 @@ static void signbit_kernel(TensorIteratorBase& iter){ }); } -static void sgn_kernel(TensorIteratorBase& iter){ - AT_DISPATCH_COMPLEX_TYPES(iter.dtype(), "sgn_cpu", [&]() { - cpu_kernel_vec( - iter, - [=](scalar_t a) -> scalar_t { return sgn_impl(a); }, - [=](Vectorized a) { return a.sgn(); }); - }); +static void sgn_kernel(TensorIteratorBase& iter) { + auto dtype = iter.dtype(); + if (dtype == kComplexHalf) { + using scalar_t = c10::complex; + using opmath_t = at::opmath_type; + cpu_kernel( + iter, [=](scalar_t a) -> scalar_t { return sgn_impl(opmath_t{a}); }); + } else { + AT_DISPATCH_COMPLEX_TYPES(dtype, "sgn_cpu", [&]() { + cpu_kernel_vec( + iter, + [=](scalar_t a) -> scalar_t { return sgn_impl(a); }, + [=](Vectorized a) { return a.sgn(); }); + }); + } } static void sinc_kernel(TensorIteratorBase& iter) { @@ -504,6 +502,13 @@ static void ndtri_kernel(TensorIteratorBase& iter) { }); } +static void log_ndtr_kernel(TensorIteratorBase& iter) { + TORCH_INTERNAL_ASSERT(iter.ntensors() == 2); + AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "log_ndtr_cpu", [&]() { + cpu_kernel(iter, [](scalar_t x) { return calc_log_ndtr(x); }); + }); +} + static void i0e_kernel(TensorIteratorBase& iter) { TORCH_INTERNAL_ASSERT(iter.ntensors() == 2); 
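On the clamp change in TensorCompareKernel.cpp above: the scalar path now returns NaN whenever either bound is NaN (the min != min || max != max test), and the vector path swaps vec::clamp for minimum(maximum(a, min), max). A scalar sketch of the new bound handling only; input NaN is left to the min/max chain exactly as in the kernel:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <limits>

// clamp(a, lo, hi) that returns NaN whenever either bound is NaN.
float clamp_nan_aware(float a, float lo, float hi) {
  if (std::isnan(lo) || std::isnan(hi)) {
    return std::numeric_limits<float>::quiet_NaN();
  }
  return std::min(std::max(a, lo), hi);
}

int main() {
  const float nan = std::numeric_limits<float>::quiet_NaN();
  std::printf("%f\n", clamp_nan_aware(5.f, 0.f, 1.f));  // 1.0
  std::printf("%f\n", clamp_nan_aware(5.f, 0.f, nan));  // nan
}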
AT_DISPATCH_FLOATING_TYPES_AND( @@ -614,8 +619,6 @@ REGISTER_DISPATCH(sigmoid_stub, &CPU_CAPABILITY::sigmoid_kernel); REGISTER_DISPATCH(logit_stub, &CPU_CAPABILITY::logit_kernel); REGISTER_DISPATCH(abs_stub, &CPU_CAPABILITY::abs_kernel); REGISTER_DISPATCH(angle_stub, &CPU_CAPABILITY::angle_kernel); -REGISTER_DISPATCH(real_stub, &CPU_CAPABILITY::real_kernel); -REGISTER_DISPATCH(imag_stub, &CPU_CAPABILITY::imag_kernel); REGISTER_DISPATCH(conj_physical_stub, &CPU_CAPABILITY::conj_kernel); REGISTER_DISPATCH(exp2_stub, &CPU_CAPABILITY::exp2_kernel); REGISTER_DISPATCH(bitwise_not_stub, &CPU_CAPABILITY::bitwise_not_kernel); @@ -641,6 +644,7 @@ REGISTER_DISPATCH(special_entr_stub, &CPU_CAPABILITY::entr_kernel); REGISTER_DISPATCH(frexp_stub, &CPU_CAPABILITY::frexp_kernel); REGISTER_DISPATCH(special_i0e_stub, &CPU_CAPABILITY::i0e_kernel); REGISTER_DISPATCH(special_ndtri_stub, &CPU_CAPABILITY::ndtri_kernel); +REGISTER_DISPATCH(special_log_ndtr_stub, &CPU_CAPABILITY::log_ndtr_kernel); REGISTER_DISPATCH(special_i1_stub, &CPU_CAPABILITY::i1_kernel); REGISTER_DISPATCH(special_i1e_stub, &CPU_CAPABILITY::i1e_kernel); REGISTER_DISPATCH(special_erfcx_stub, &CPU_CAPABILITY::erfcx_kernel); diff --git a/aten/src/ATen/native/cpu/Unfold2d.cpp b/aten/src/ATen/native/cpu/Unfold2d.cpp index cc3a6b68d43e..9bfa9ac8c6ab 100644 --- a/aten/src/ATen/native/cpu/Unfold2d.cpp +++ b/aten/src/ATen/native/cpu/Unfold2d.cpp @@ -1,8 +1,11 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include #include #include #include #include #include +#include #include namespace at { @@ -116,6 +119,61 @@ static void unfolded2d_acc( }); } +template +static void unfolded2d_acc_channels_last( + scalar_t* finput_data, + scalar_t* input_data, + int64_t kH, + int64_t kW, + int64_t dH, + int64_t dW, + int64_t padH, + int64_t padW, + int64_t n_input_plane, + int64_t input_height, + int64_t input_width, + int64_t output_height, + int64_t output_width) { + + for (int64_t y = 0; y < output_height; y++) { + for (int64_t x = 0; x < output_width; x++) { + scalar_t* src = finput_data + y * output_width * kH * kW * n_input_plane + x * kH * kW * n_input_plane; + scalar_t* dst = input_data; + + if (padW > 0 || padH > 0) { + for (int64_t kh = 0; kh < kH; kh++) { + for (int64_t kw = 0; kw < kW; kw++) { + int64_t iy = y * dH - padH + kh; + int64_t ix = x * dW - padW + kw; + if (iy < 0 || iy >= input_height || ix < 0 || ix >= input_width) { + } else { + scalar_t* dst_slice = dst + iy * input_width * n_input_plane + ix * n_input_plane; + scalar_t* src_slice = src + kh * kW * n_input_plane + kw * n_input_plane; + cadd(dst_slice, + dst_slice, + src_slice, + n_input_plane); + } + } + } + } else { + for (int64_t kh = 0; kh < kH; kh++) { + for (int64_t kw = 0; kw < kW; kw++) { + int64_t iy = y * dH + kh; + int64_t ix = x * dW + kw; + scalar_t* dst_slice = dst + iy * input_width * n_input_plane + ix * n_input_plane; + scalar_t* src_slice = src + kh * kW * n_input_plane + kw * n_input_plane; + cadd(dst_slice, + dst_slice, + src_slice, + n_input_plane); + } + } + } + } + } +} + /* note: due to write issues, this one cannot be parallelized as well as * unfolded2d_copy */ void unfolded2d_acc_kernel( @@ -132,28 +190,41 @@ void unfolded2d_acc_kernel( int64_t input_height, int64_t input_width, int64_t output_height, - int64_t output_width) { + int64_t output_width, + bool is_channels_last) { // This function assumes that // output_height*dH does not overflow a int64_t // output_width*dW does not overflow a int64_t - AT_DISPATCH_FLOATING_TYPES_AND( - at::ScalarType::BFloat16, dtype, 
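// [Editorial sketch, not part of the patch] The new unfolded2d_acc_channels_last above
// indexes NHWC memory as (iy * input_width + ix) * n_input_plane, so each kernel tap
// touches a contiguous run of channels, and taps that land in the padding region are
// simply skipped. Minimal stand-in (acc_tap is a hypothetical helper name):
#include <cstdint>
#include <cstdio>
#include <vector>

void acc_tap(std::vector<float>& input, const float* src,
             int64_t iy, int64_t ix,
             int64_t input_height, int64_t input_width, int64_t channels) {
  if (iy < 0 || iy >= input_height || ix < 0 || ix >= input_width) {
    return;  // tap falls in the zero padding: nothing to accumulate
  }
  float* dst = input.data() + (iy * input_width + ix) * channels;
  for (int64_t c = 0; c < channels; ++c) {
    dst[c] += src[c];  // the kernel uses a vectorized cadd() here
  }
}

int main() {
  const int64_t H = 4, W = 4, C = 3;
  std::vector<float> input(H * W * C, 0.f);
  std::vector<float> col(C, 1.f);
  acc_tap(input, col.data(), 1, 2, H, W, C);   // in range: accumulates
  acc_tap(input, col.data(), -1, 2, H, W, C);  // padding: ignored
  std::printf("%g\n", input[(1 * W + 2) * C]);  // 1
}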
"unfolded2d_acc", [&] { - unfolded2d_acc( - static_cast(finput_data), - static_cast(input_data), - kH, - kW, - dH, - dW, - padH, - padW, - n_input_plane, - input_height, - input_width, - output_height, - output_width); + if (is_channels_last) { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, dtype, "unfolded2d_acc_channels_last", [&] { + unfolded2d_acc_channels_last( + static_cast(finput_data), + static_cast(input_data), + kH, kW, + dH, dW, + padH, padW, + n_input_plane, + input_height, + input_width, + output_height, + output_width); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, dtype, "unfolded2d_acc", [&] { + unfolded2d_acc( + static_cast(finput_data), + static_cast(input_data), + kH, kW, + dH, dW, + padH, padW, + n_input_plane, + input_height, + input_width, + output_height, + output_width); }); + } } template @@ -263,6 +334,64 @@ static void unfolded2d_copy( }); } +template +static void unfolded2d_copy_channels_last( + scalar_t* input_data, + scalar_t* finput_data, + int64_t kH, + int64_t kW, + int64_t dH, + int64_t dW, + int64_t padH, + int64_t padW, + int64_t n_input_plane, + int64_t input_height, + int64_t input_width, + int64_t output_height, + int64_t output_width) { + at::parallel_for(0, output_height * output_width, 0, [&](int64_t start, int64_t end) { + int64_t y = 0; + int64_t x = 0; + data_index_init(start, y, output_height, x, output_width); + + for (const auto k : c10::irange(start, end)) { + (void)k; // Suppress unused variable warning + scalar_t* dst = finput_data + y * output_width * kH * kW * n_input_plane + x * kH * kW * n_input_plane; + scalar_t* src = input_data; + + if (padW > 0 || padH > 0) { + for (int64_t kh = 0; kh < kH; kh++) { + for (int64_t kw = 0; kw < kW; kw++) { + int64_t iy = y * dH - padH + kh; + int64_t ix = x * dW - padW + kw; + if (iy < 0 || iy >= input_height || ix < 0 || ix >= input_width) { + memset(dst + kh * kW * n_input_plane + kw * n_input_plane, + 0, + sizeof(scalar_t) * n_input_plane); + } else { + memcpy(dst + kh * kW * n_input_plane + kw * n_input_plane, + src + iy * input_width * n_input_plane + ix * n_input_plane, + sizeof(scalar_t) * n_input_plane); + } + } + } + } else { + for (int64_t kh = 0; kh < kH; kh++) { + for (int64_t kw = 0; kw < kW; kw++) { + int64_t iy = y * dH + kh; + int64_t ix = x * dW + kw; + memcpy(dst + kh * kW * n_input_plane + kw * n_input_plane, + src + iy * input_width * n_input_plane + ix * n_input_plane, + sizeof(scalar_t) * n_input_plane); + } + } + } + // move on to next output index + data_index_step(y, output_height, x, output_width); + } + }); +} + void unfolded2d_copy_kernel( ScalarType dtype, void *finput_data, @@ -277,30 +406,43 @@ void unfolded2d_copy_kernel( int64_t input_height, int64_t input_width, int64_t output_height, - int64_t output_width) { + int64_t output_width, + bool is_channels_last) { // This function assumes that // kH*kW does not overflow an int // n_input_plane*kH*kW does not overflow a int64_t // output_height*dH does not overflow a int64_t // output_width*dW does not overflow a int64_t - AT_DISPATCH_ALL_TYPES_AND( - at::ScalarType::BFloat16, dtype, "unfolded2d_copy", [&] { - unfolded2d_copy( - static_cast(input_data), - static_cast(finput_data), - kH, - kW, - dH, - dW, - padH, - padW, + if (is_channels_last) { + AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::BFloat16, dtype, "unfolded2d_copy_channels_last", [&] { + unfolded2d_copy_channels_last( + static_cast(input_data), + static_cast(finput_data), + kH, kW, + dH, dW, + padH, padW, n_input_plane, 
input_height, input_width, output_height, output_width); - }); + }); + } else { + AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::BFloat16, dtype, "unfolded2d_copy", [&] { + unfolded2d_copy( + static_cast(input_data), + static_cast(finput_data), + kH, kW, + dH, dW, + padH, padW, + n_input_plane, + input_height, + input_width, + output_height, + output_width); + }); + } } } // namespace diff --git a/aten/src/ATen/native/cpu/UnfoldBackwardKernel.cpp b/aten/src/ATen/native/cpu/UnfoldBackwardKernel.cpp index b226b68bbca1..8cfe6674906e 100644 --- a/aten/src/ATen/native/cpu/UnfoldBackwardKernel.cpp +++ b/aten/src/ATen/native/cpu/UnfoldBackwardKernel.cpp @@ -1,3 +1,5 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include #include diff --git a/aten/src/ATen/native/cpu/UpSampleKernel.cpp b/aten/src/ATen/native/cpu/UpSampleKernel.cpp index 88bdbd71d1ee..cfc931862372 100644 --- a/aten/src/ATen/native/cpu/UpSampleKernel.cpp +++ b/aten/src/ATen/native/cpu/UpSampleKernel.cpp @@ -1,51 +1,28 @@ -#include - +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include -#include #include +#include #include +#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#include +#endif + namespace at { namespace native { namespace { using scale_t = std::vector>; -static inline int64_t nearest_idx( - int64_t output_index, - int64_t input_size, - int64_t output_size, - c10::optional scales) { - // This method specificly treats cases: output_size == input_size or - // output_size == 2 * input_size, that we would like to get rid of - // We keep this method for BC and consider as deprecated. - // See nearest_exact_idx as replacement - if (output_size == input_size) { - // scale_factor = 1, simply copy - return output_index; - } else if (output_size == 2 * input_size) { - // scale_factor = 2, shift input index - return output_index >> 1; - } else { - float scale = compute_scales_value(scales, input_size, output_size); - return nearest_neighbor_compute_source_index(scale, output_index, input_size); - } -} - -static inline int64_t nearest_exact_idx( - int64_t output_index, - int64_t input_size, - int64_t output_size, - c10::optional scales) { - float scale = compute_scales_value(scales, input_size, output_size); - return nearest_neighbor_exact_compute_source_index(scale, output_index, input_size); -} - -// Define a typedef to dispatch to nearest_idx or nearest_exact_idx -typedef int64_t (*nearest_idx_fn_t)(int64_t, int64_t, int64_t, c10::optional); - // Helper structs and methods for cpu_upsample_linear // // Interpolation methods that used below are separable, and as such we can compute the interpolation @@ -147,7 +124,6 @@ template static inline scalar_t interpolate_aa_single_dim_zero_strides( char* src, char** data, - int64_t i, const index_t ids_stride) { const index_t ids_min = *(index_t*)&data[0][0]; const index_t ids_size = *(index_t*)&data[1][0]; @@ -259,7 +235,7 @@ struct CheckAlmostAllZeroStrides { template struct CheckAlmostAllZeroStrides<0, non_zero_stride_dim, scalar_t, index_t, interp_size> { - static inline bool eval(const int64_t* strides) { + static inline bool eval(const int64_t* /*strides*/) { return true; } }; @@ -293,7 +269,7 @@ static inline void basic_loop_aa_single_dim_zero_strides( for (const auto i : c10::irange(n)) { *(scalar_t*)&dst[i * strides[0]] = interpolate_aa_single_dim_zero_strides( - src + i * strides[1], &data[2], i, ids_stride); + src + i * strides[1], &data[2], ids_stride); } } @@ -452,6 +428,16 @@ void 
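// [Editorial sketch, not part of the patch] The nearest_idx helper deleted above (the
// kernels now take it from a shared header) maps an output coordinate back to its
// nearest-neighbor source coordinate, with fast paths for the 1x and 2x cases it keeps
// for backward compatibility. Hedged standalone version of that mapping:
#include <algorithm>
#include <cstdint>
#include <cstdio>

int64_t nearest_idx(int64_t out_i, int64_t in_size, int64_t out_size) {
  if (out_size == in_size) {
    return out_i;                    // scale factor 1: plain copy
  } else if (out_size == 2 * in_size) {
    return out_i >> 1;               // scale factor 2: halve the index
  }
  float scale = static_cast<float>(in_size) / out_size;
  return std::min(static_cast<int64_t>(out_i * scale), in_size - 1);
}

int main() {
  for (int64_t i = 0; i < 6; ++i) {
    std::printf("%lld -> %lld\n", (long long)i, (long long)nearest_idx(i, 4, 6));
  }
}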
cpu_upsample_nearest_channels_last( } } +template +inline VecType interpolate(const scalar_t* t, accscalar_t w) { + return VecType::loadu(t) * VecType(w); +} + +template +inline VecType interpolate(const scalar_t* t, accscalar_t w, Args... args) { + return VecType::loadu(t) * VecType(w) + interpolate(args...); +} + template void cpu_upsample_linear_channels_last( const Tensor& output_, @@ -485,6 +471,7 @@ void cpu_upsample_linear_channels_last( TORCH_CHECK(channels > 0, "expected input and output channels greater than 0 but got ", channels); int64_t output_slice_size = output_depth * output_height * output_width * channels; + using accscalar_t = at::acc_type; using Vec = vec::Vectorized; auto loop2d = [&](int64_t begin, int64_t end) { const scalar_t height_scale = area_pixel_compute_scale( @@ -514,23 +501,19 @@ void cpu_upsample_linear_channels_last( scalar_t* i01 = input_indexr(n, ih0, iw1); scalar_t* i10 = input_indexr(n, ih1, iw0); scalar_t* i11 = input_indexr(n, ih1, iw1); + accscalar_t w00 = h0lambda * w0lambda; + accscalar_t w01 = h0lambda * w1lambda; + accscalar_t w10 = h1lambda * w0lambda; + accscalar_t w11 = h1lambda * w1lambda; int64_t size = channels; int64_t d = 0; for (; d < size - (size % Vec::size()); d += Vec::size()) { - Vec out_vec = - Vec(h0lambda * w0lambda) * Vec::loadu(i00 + d) + /* h0 * w0 * i00 */ - Vec(h0lambda * w1lambda) * Vec::loadu(i01 + d) + /* h0 * w1 * i01 */ - Vec(h1lambda * w0lambda) * Vec::loadu(i10 + d) + /* h1 * w0 * i10 */ - Vec(h1lambda * w1lambda) * Vec::loadu(i11 + d); /* h1 * w1 * i11 */ + auto out_vec = interpolate(i00 + d, w00, i01 + d, w01, i10 + d, w10, i11 + d, w11); out_vec.store(out + d); } for (; d < size; d++) { - out[d] = - h0lambda * w0lambda * i00[d] + /* h0 * w0 * i00 */ - h0lambda * w1lambda * i01[d] + /* h0 * w1 * i01 */ - h1lambda * w0lambda * i10[d] + /* h1 * w0 * i10 */ - h1lambda * w1lambda * i11[d]; /* h1 * w1 * i11 */ + out[d] = i00[d] * w00 + i01[d] * w01 + i10[d] * w10 + i11[d] * w11; } } } @@ -576,31 +559,27 @@ void cpu_upsample_linear_channels_last( scalar_t* i101 = input_indexr(n, id1, ih0, iw1); scalar_t* i110 = input_indexr(n, id1, ih1, iw0); scalar_t* i111 = input_indexr(n, id1, ih1, iw1); + accscalar_t w000 = d0lambda * h0lambda * w0lambda; + accscalar_t w001 = d0lambda * h0lambda * w1lambda; + accscalar_t w010 = d0lambda * h1lambda * w0lambda; + accscalar_t w011 = d0lambda * h1lambda * w1lambda; + accscalar_t w100 = d1lambda * h0lambda * w0lambda; + accscalar_t w101 = d1lambda * h0lambda * w1lambda; + accscalar_t w110 = d1lambda * h1lambda * w0lambda; + accscalar_t w111 = d1lambda * h1lambda * w1lambda; int64_t size = channels; int64_t d = 0; for (; d < size - (size % Vec::size()); d += Vec::size()) { - Vec out_vec = - Vec(d0lambda * h0lambda * w0lambda) * Vec::loadu(i000 + d) + /* d0 * h0 * w0 * i000 */ - Vec(d0lambda * h0lambda * w1lambda) * Vec::loadu(i001 + d) + /* d0 * h0 * w1 * i001 */ - Vec(d0lambda * h1lambda * w0lambda) * Vec::loadu(i010 + d) + /* d0 * h1 * w0 * i010 */ - Vec(d0lambda * h1lambda * w1lambda) * Vec::loadu(i011 + d) + /* d0 * h1 * w1 * i011 */ - Vec(d1lambda * h0lambda * w0lambda) * Vec::loadu(i100 + d) + /* d1 * h0 * w0 * i100 */ - Vec(d1lambda * h0lambda * w1lambda) * Vec::loadu(i101 + d) + /* d1 * h0 * w1 * i101 */ - Vec(d1lambda * h1lambda * w0lambda) * Vec::loadu(i110 + d) + /* d1 * h1 * w0 * i110 */ - Vec(d1lambda * h1lambda * w1lambda) * Vec::loadu(i111 + d); /* d1 * h1 * w1 * i111 */ + auto out_vec = interpolate( + i000 + d, w000, i001 + d, w001, i010 + d, w010, i011 + d, w011, + i100 + 
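// [Editorial sketch, not part of the patch] The new variadic interpolate() above folds
// (pointer, weight) pairs into loadu(t0)*w0 + loadu(t1)*w1 + ..., replacing the long
// hand-written bilinear/trilinear expressions. Scalar stand-in (a plain dereference
// plays the role of the vectorized load):
#include <cstdio>

template <typename T>
T interpolate(const T* t, T w) {
  return *t * w;                          // last tap
}

template <typename T, typename... Args>
T interpolate(const T* t, T w, Args... rest) {
  return *t * w + interpolate(rest...);   // fold the remaining taps
}

int main() {
  float i00 = 1.f, i01 = 2.f, i10 = 3.f, i11 = 4.f;
  // bilinear blend: the four weights come from the h/w lambdas and sum to 1
  float out = interpolate(&i00, 0.25f, &i01, 0.25f, &i10, 0.25f, &i11, 0.25f);
  std::printf("%g\n", out);  // 2.5
}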
d, w100, i101 + d, w101, i110 + d, w110, i111 + d, w111); out_vec.store(out + d); } for (; d < size; d++) { out[d] = - d0lambda * h0lambda * w0lambda * i000[d] + /* d0 * h0 * w0 * i000 */ - d0lambda * h0lambda * w1lambda * i001[d] + /* d0 * h0 * w1 * i001 */ - d0lambda * h1lambda * w0lambda * i010[d] + /* d0 * h1 * w0 * i010 */ - d0lambda * h1lambda * w1lambda * i011[d] + /* d0 * h1 * w1 * i011 */ - d1lambda * h0lambda * w0lambda * i100[d] + /* d1 * h0 * w0 * i100 */ - d1lambda * h0lambda * w1lambda * i101[d] + /* d1 * h0 * w1 * i101 */ - d1lambda * h1lambda * w0lambda * i110[d] + /* d1 * h1 * w0 * i110 */ - d1lambda * h1lambda * w1lambda * i111[d]; /* d1 * h1 * w1 * i111 */ + i000[d] * w000 + i001[d] * w001 + i010[d] * w010 + i011[d] * w011 + + i100[d] * w100 + i101[d] * w101 + i110[d] * w110 + i111[d] * w111; } } } @@ -675,7 +654,7 @@ struct HelperInterpBase { template static inline std::vector _compute_indices_weights_aa( int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims, - int64_t reshape_dim, bool align_corners, scalar_t scale, + int64_t reshape_dim, scalar_t scale, int interp_size, aa_filter_fn_t aa_filter_fn ) { @@ -786,8 +765,8 @@ struct HelperInterpNearest : public HelperInterpBase { HelperInterpNearest::init_indices_weights( scalar_type, output, output_size, ndims, reshape_dim, HelperInterpNearest::interp_size); - AT_DISPATCH_FLOATING_TYPES( - scalar_type, "compute_indices_weights_nearest", [&] { + AT_DISPATCH_FLOATING_TYPES_AND( + ScalarType::BFloat16, scalar_type, "compute_indices_weights_nearest", [&] { scalar_t scale = area_pixel_compute_scale(input_size, output_size, align_corners, opt_scale); @@ -887,8 +866,8 @@ struct HelperInterpLinear : public HelperInterpBase { HelperInterpLinear::init_indices_weights( scalar_type, output, output_size, ndims, reshape_dim, HelperInterpLinear::interp_size); - AT_DISPATCH_FLOATING_TYPES( - scalar_type, "compute_indices_weights_linear", [&] { + AT_DISPATCH_FLOATING_TYPES_AND( + ScalarType::BFloat16, scalar_type, "compute_indices_weights_linear", [&] { scalar_t scale = area_pixel_compute_scale(input_size, output_size, align_corners, opt_scale); @@ -956,7 +935,6 @@ struct HelperInterpLinear : public HelperInterpBase { stride, ndims, reshape_dim, - align_corners, scale, interp_size, &HelperInterpLinear::aa_filter); @@ -990,8 +968,8 @@ struct HelperInterpCubic : public HelperInterpBase { HelperInterpCubic::init_indices_weights( scalar_type, output, output_size, ndims, reshape_dim, HelperInterpCubic::interp_size); - AT_DISPATCH_FLOATING_TYPES( - scalar_type, "compute_indices_weights_cubic", [&] { + AT_DISPATCH_FLOATING_TYPES_AND( + ScalarType::BFloat16, scalar_type, "compute_indices_weights_cubic", [&] { scalar_t scale = area_pixel_compute_scale(input_size, output_size, align_corners, opt_scale); @@ -1068,7 +1046,6 @@ struct HelperInterpCubic : public HelperInterpBase { stride, ndims, reshape_dim, - align_corners, scale, interp_size, &HelperInterpCubic::aa_filter); @@ -1114,7 +1091,7 @@ void upsample_generic_Nd_kernel_impl( constexpr int interp_size = F::interp_size; auto input_scalar_type = input.scalar_type(); - if (interp_size == 1 && input_scalar_type == at::ScalarType::Byte) { + if ((interp_size == 1 && input_scalar_type == at::ScalarType::Byte)) { // nearest also supports uint8 tensor, but we have to use float // with compute_indices_weights input_scalar_type = at::ScalarType::Float; @@ -1147,14 +1124,14 @@ void upsample_generic_Nd_kernel_impl( if (interp_size > 1) { // Nearest also supports uint8 tensor, so need to 
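// [Editorial sketch, not part of the patch] The index/weight helpers above start from
// area_pixel_compute_scale, which picks the input/output ratio according to
// align_corners (endpoints mapped onto endpoints) or not (pixel centers aligned).
// Hedged standalone version of that choice, ignoring the optional explicit scale:
#include <cstdint>
#include <cstdio>

double area_pixel_scale(int64_t in_size, int64_t out_size, bool align_corners) {
  if (align_corners) {
    return out_size > 1 ? static_cast<double>(in_size - 1) / (out_size - 1) : 0.0;
  }
  return out_size > 0 ? static_cast<double>(in_size) / out_size : 0.0;
}

int main() {
  std::printf("align_corners: %g, centers: %g\n",
              area_pixel_scale(4, 8, true), area_pixel_scale(4, 8, false));
  // align_corners: 0.428571, centers: 0.5
}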
handle it separately - AT_DISPATCH_FLOATING_TYPES( - iter.dtype(), "upsample_generic_Nd", [&] { + AT_DISPATCH_FLOATING_TYPES_AND( + at::ScalarType::BFloat16, iter.dtype(), "upsample_generic_Nd", [&] { // MSVC can not catch constexpr int interp_size here constexpr int mode = F::interp_size; cpu_upsample_generic(iter); }); } else { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Byte, + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Byte, at::ScalarType::BFloat16, iter.dtype(), "upsample_generic_Nd", [&] { constexpr int mode = F::interp_size; cpu_upsample_generic(iter); @@ -1295,7 +1272,8 @@ void upsample_nearest2d_kernel_impl( c10::optional scales_h, c10::optional scales_w) { if (input.is_contiguous(at::MemoryFormat::ChannelsLast)) { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Byte, input.scalar_type(), "upsample_nearest2d_channels_last", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Byte, at::ScalarType::BFloat16, + input.scalar_type(), "upsample_nearest2d_channels_last", [&] { cpu_upsample_nearest_channels_last(output, input, {scales_h, scales_w}); }); } else { @@ -1326,7 +1304,8 @@ void upsample_nearest3d_kernel_impl( c10::optional scales_h, c10::optional scales_w) { if (input.is_contiguous(at::MemoryFormat::ChannelsLast3d)) { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Byte, input.scalar_type(), "upsample_nearest3d_channels_last", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Byte, at::ScalarType::BFloat16, + input.scalar_type(), "upsample_nearest3d_channels_last", [&] { cpu_upsample_nearest_channels_last(output, input, {scales_d, scales_h, scales_w}); }); } else { @@ -1369,7 +1348,7 @@ void upsample_bilinear2d_kernel_impl( // Temporarily dispatch to original channels last implementation if (input.is_contiguous(at::MemoryFormat::ChannelsLast)) { - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "upsample_bilinear2d_channels_last", [&] { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, input.scalar_type(), "upsample_bilinear2d_channels_last", [&] { cpu_upsample_linear_channels_last(output, input, align_corners, {scales_h, scales_w}); }); } else { @@ -1397,7 +1376,7 @@ void upsample_trilinear3d_kernel_impl( c10::optional scales_h, c10::optional scales_w) { if (input.is_contiguous(at::MemoryFormat::ChannelsLast3d)) { - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "upsample_trilinear3d_channels_last", [&] { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, input.scalar_type(), "upsample_trilinear3d_channels_last", [&] { cpu_upsample_linear_channels_last(output, input, align_corners, {scales_d, scales_h, scales_w}); }); } else { @@ -1427,156 +1406,6 @@ void upsample_bicubic2d_aa_kernel_impl( output, input, align_corners, {scales_h, scales_w}); } -template -void cpu_upsample_nearest_backward( - const Tensor& grad_input_, - const Tensor& grad_output_, - const scale_type& scales) { - TORCH_CHECK(grad_input_.dtype() == grad_output_.dtype(), "expected dtype ", grad_output_.dtype(), - " for `grad_input` but got dtype ", grad_input_.dtype()); - - auto grad_output = grad_output_.contiguous(); - auto grad_input = grad_input_.contiguous(); - - auto grad_output_data = grad_output.data_ptr(); - auto grad_input_data = grad_input.data_ptr(); - auto input_sizes = grad_input.sizes().vec(); - auto output_sizes = grad_output.sizes().vec(); - auto ndim = input_sizes.size(); - - // treat nbatch and channels as one dimension - int64_t channels = input_sizes[0] * input_sizes[1]; - int64_t input_depth = (ndim == 5) ? 
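// [Editorial sketch, not part of the patch] The cpu_upsample_nearest_backward block being
// removed here (it reappears in UpSampleMoreKernel.cpp further down) is a scatter-add:
// each output gradient is accumulated into the one input cell it was copied from in the
// forward pass. Minimal 1-D illustration:
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

int64_t nearest_src(int64_t ow, int64_t in_w, int64_t out_w) {
  double scale = static_cast<double>(in_w) / out_w;
  return std::min(static_cast<int64_t>(ow * scale), in_w - 1);
}

int main() {
  const int64_t in_w = 3, out_w = 6;
  std::vector<double> grad_out(out_w, 1.0), grad_in(in_w, 0.0);
  for (int64_t ow = 0; ow < out_w; ++ow) {
    grad_in[nearest_src(ow, in_w, out_w)] += grad_out[ow];  // scatter-add
  }
  std::printf("%g %g %g\n", grad_in[0], grad_in[1], grad_in[2]);  // 2 2 2
}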
input_sizes[2] : 1; - int64_t output_depth = (ndim == 5) ? output_sizes[2] : 1; - int64_t input_height = (ndim >= 4) ? input_sizes[ndim - 2] : 1; - int64_t output_height = (ndim >= 4) ? output_sizes[ndim - 2] : 1; - int64_t input_width = input_sizes[ndim - 1]; - int64_t output_width = output_sizes[ndim - 1]; - - int64_t output_slice_size = output_depth * output_height * output_width; - int64_t input_slice_size = input_depth * input_height * input_width; - - auto loop1d = [&](int64_t begin, int64_t end) { - for (const auto c : c10::irange(begin, end)) { - for (const auto ow : c10::irange(output_width)) { - int64_t iw = nearest_idx_fn(ow, input_width, output_width, scales[0]); - int64_t output_offset = c * output_slice_size + ow; - int64_t input_offset = c * input_slice_size + iw; - grad_input_data[input_offset] += grad_output_data[output_offset]; - } - } - }; - - auto loop2d = [&](int64_t begin, int64_t end) { - for (const auto c : c10::irange(begin, end)) { - for (const auto oh : c10::irange(output_height)) { - int64_t ih = nearest_idx_fn(oh, input_height, output_height, scales[0]); - for (const auto ow : c10::irange(output_width)) { - int64_t iw = nearest_idx_fn(ow, input_width, output_width, scales[1]); - int64_t output_offset = c * output_slice_size + oh * output_width + ow; - int64_t input_offset = c * input_slice_size + ih * input_width + iw; - grad_input_data[input_offset] += grad_output_data[output_offset]; - } - } - } - }; - - auto loop3d = [&](int64_t begin, int64_t end) { - for (const auto c : c10::irange(begin, end)) { - for (const auto od : c10::irange(output_depth)) { - int64_t id = nearest_idx_fn(od, input_depth, output_depth, scales[0]); - for (const auto oh : c10::irange(output_height)) { - int64_t ih = nearest_idx_fn(oh, input_height, output_height, scales[1]); - for (const auto ow : c10::irange(output_width)) { - int64_t iw = nearest_idx_fn(ow, input_width, output_width, scales[2]); - int64_t output_offset = c * output_slice_size + - od * output_height * output_width + oh * output_width + ow; - int64_t input_offset = c * input_slice_size + - id * input_height * input_width + ih * input_width + iw; - grad_input_data[input_offset] += grad_output_data[output_offset]; - } - } - } - } - }; - - if (ndim == 3) { - // upsample nearest 1d - at::parallel_for(0, channels, at::internal::GRAIN_SIZE / output_slice_size, loop1d); - } else if (ndim == 4) { - // upsample nearest 2d - at::parallel_for(0, channels, at::internal::GRAIN_SIZE / output_slice_size , loop2d); - } else { - // upsample nearest 3d - TORCH_INTERNAL_ASSERT(ndim == 5); - at::parallel_for(0, channels, at::internal::GRAIN_SIZE / output_slice_size, loop3d); - } - - if (!grad_input_.is_contiguous()) { - grad_input_.copy_(grad_input); - } -} - -void upsample_nearest1d_backward_kernel_impl( - const Tensor& grad_input, - const Tensor& grad_output, - c10::optional scales_w) { - AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "upsample_nearest1d_backward", [&] { - cpu_upsample_nearest_backward(grad_input, grad_output, {scales_w}); - }); -} - -void _upsample_nearest_exact1d_backward_kernel_impl( - const Tensor& grad_input, - const Tensor& grad_output, - c10::optional scales_w) { - AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "_upsample_nearest_exact1d_backward", [&] { - cpu_upsample_nearest_backward(grad_input, grad_output, {scales_w}); - }); -} - -void upsample_nearest2d_backward_kernel_impl( - const Tensor& grad_input, - const Tensor& grad_output, - c10::optional scales_h, - c10::optional scales_w) { - 
AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "upsample_nearest2d_backward", [&] { - cpu_upsample_nearest_backward(grad_input, grad_output, {scales_h, scales_w}); - }); -} - -void _upsample_nearest_exact2d_backward_kernel_impl( - const Tensor& grad_input, - const Tensor& grad_output, - c10::optional scales_h, - c10::optional scales_w) { - AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "_upsample_nearest_exact2d_backward", [&] { - cpu_upsample_nearest_backward(grad_input, grad_output, {scales_h, scales_w}); - }); -} - -void upsample_nearest3d_backward_kernel_impl( - const Tensor& grad_input, - const Tensor& grad_output, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w) { - AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "upsample_nearest3d_backward", [&] { - cpu_upsample_nearest_backward(grad_input, grad_output, {scales_d, scales_h, scales_w}); - }); -} - -void _upsample_nearest_exact3d_backward_kernel_impl( - const Tensor& grad_input, - const Tensor& grad_output, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w) { - AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "_upsample_nearest_exact3d_backward", [&] { - cpu_upsample_nearest_backward(grad_input, grad_output, {scales_d, scales_h, scales_w}); - }); -} - template < typename scalar_t, typename scale_type, @@ -1726,12 +1555,6 @@ REGISTER_DISPATCH(upsample_nearest2d_kernel, &upsample_nearest2d_kernel_impl); REGISTER_DISPATCH(_upsample_nearest_exact2d_kernel, &_upsample_nearest_exact2d_kernel_impl); REGISTER_DISPATCH(upsample_nearest3d_kernel, &upsample_nearest3d_kernel_impl); REGISTER_DISPATCH(_upsample_nearest_exact3d_kernel, &_upsample_nearest_exact3d_kernel_impl); -REGISTER_DISPATCH(upsample_nearest1d_backward_kernel, &upsample_nearest1d_backward_kernel_impl); -REGISTER_DISPATCH(_upsample_nearest_exact1d_backward_kernel, &_upsample_nearest_exact1d_backward_kernel_impl); -REGISTER_DISPATCH(upsample_nearest2d_backward_kernel, &upsample_nearest2d_backward_kernel_impl); -REGISTER_DISPATCH(_upsample_nearest_exact2d_backward_kernel, &_upsample_nearest_exact2d_backward_kernel_impl); -REGISTER_DISPATCH(upsample_nearest3d_backward_kernel, &upsample_nearest3d_backward_kernel_impl); -REGISTER_DISPATCH(_upsample_nearest_exact3d_backward_kernel, &_upsample_nearest_exact3d_backward_kernel_impl); REGISTER_DISPATCH(upsample_linear1d_kernel, &upsample_linear1d_kernel_impl); REGISTER_DISPATCH(upsample_bilinear2d_kernel, &upsample_bilinear2d_kernel_impl); diff --git a/aten/src/ATen/native/cpu/UpSampleMoreKernel.cpp b/aten/src/ATen/native/cpu/UpSampleMoreKernel.cpp index 22ab12bad12a..a26cef72bb10 100644 --- a/aten/src/ATen/native/cpu/UpSampleMoreKernel.cpp +++ b/aten/src/ATen/native/cpu/UpSampleMoreKernel.cpp @@ -1,13 +1,13 @@ -// NOLINTNEXTLINE(modernize-deprecated-headers) -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include +#include #include #include #include -#include +#include #include +#include namespace at { namespace native { @@ -15,6 +15,260 @@ namespace { using scale_t = std::vector>; +template +void cpu_upsample_nearest_backward( + const Tensor& grad_input_, + const Tensor& grad_output_, + const scale_type& scales) { + TORCH_CHECK(grad_input_.dtype() == grad_output_.dtype(), "expected dtype ", grad_output_.dtype(), + " for `grad_input` but got dtype ", grad_input_.dtype()); + + auto grad_output = grad_output_.contiguous(); + auto grad_input = grad_input_.contiguous(); + + auto grad_output_data = grad_output.data_ptr(); + auto grad_input_data = 
grad_input.data_ptr(); + auto input_sizes = grad_input.sizes().vec(); + auto output_sizes = grad_output.sizes().vec(); + auto ndim = input_sizes.size(); + + // treat nbatch and channels as one dimension + int64_t channels = input_sizes[0] * input_sizes[1]; + int64_t input_depth = (ndim == 5) ? input_sizes[2] : 1; + int64_t output_depth = (ndim == 5) ? output_sizes[2] : 1; + int64_t input_height = (ndim >= 4) ? input_sizes[ndim - 2] : 1; + int64_t output_height = (ndim >= 4) ? output_sizes[ndim - 2] : 1; + int64_t input_width = input_sizes[ndim - 1]; + int64_t output_width = output_sizes[ndim - 1]; + + int64_t output_slice_size = output_depth * output_height * output_width; + int64_t input_slice_size = input_depth * input_height * input_width; + + auto loop1d = [&](int64_t begin, int64_t end) { + for (const auto c : c10::irange(begin, end)) { + for (const auto ow : c10::irange(output_width)) { + int64_t iw = nearest_idx_fn(ow, input_width, output_width, scales[0]); + int64_t output_offset = c * output_slice_size + ow; + int64_t input_offset = c * input_slice_size + iw; + grad_input_data[input_offset] += grad_output_data[output_offset]; + } + } + }; + + auto loop2d = [&](int64_t begin, int64_t end) { + for (const auto c : c10::irange(begin, end)) { + for (const auto oh : c10::irange(output_height)) { + int64_t ih = nearest_idx_fn(oh, input_height, output_height, scales[0]); + for (const auto ow : c10::irange(output_width)) { + int64_t iw = nearest_idx_fn(ow, input_width, output_width, scales[1]); + int64_t output_offset = c * output_slice_size + oh * output_width + ow; + int64_t input_offset = c * input_slice_size + ih * input_width + iw; + grad_input_data[input_offset] += grad_output_data[output_offset]; + } + } + } + }; + + auto loop3d = [&](int64_t begin, int64_t end) { + for (const auto c : c10::irange(begin, end)) { + for (const auto od : c10::irange(output_depth)) { + int64_t id = nearest_idx_fn(od, input_depth, output_depth, scales[0]); + for (const auto oh : c10::irange(output_height)) { + int64_t ih = nearest_idx_fn(oh, input_height, output_height, scales[1]); + for (const auto ow : c10::irange(output_width)) { + int64_t iw = nearest_idx_fn(ow, input_width, output_width, scales[2]); + int64_t output_offset = c * output_slice_size + + od * output_height * output_width + oh * output_width + ow; + int64_t input_offset = c * input_slice_size + + id * input_height * input_width + ih * input_width + iw; + grad_input_data[input_offset] += grad_output_data[output_offset]; + } + } + } + } + }; + + if (ndim == 3) { + // upsample nearest 1d + at::parallel_for(0, channels, at::internal::GRAIN_SIZE / output_slice_size, loop1d); + } else if (ndim == 4) { + // upsample nearest 2d + at::parallel_for(0, channels, at::internal::GRAIN_SIZE / output_slice_size , loop2d); + } else { + // upsample nearest 3d + TORCH_INTERNAL_ASSERT(ndim == 5); + at::parallel_for(0, channels, at::internal::GRAIN_SIZE / output_slice_size, loop3d); + } + + if (!grad_input_.is_contiguous()) { + grad_input_.copy_(grad_input); + } +} + +template +void cpu_upsample_nearest_backward_channels_last( + const Tensor& grad_input_, + const Tensor& grad_output_, + const scale_type& scales) { + TORCH_CHECK(grad_input_.dtype() == grad_output_.dtype(), "expected dtype ", grad_output_.dtype(), + " for `grad_input` but got dtype ", grad_input_.dtype()); + + auto ndim = grad_output_.ndimension(); + TORCH_CHECK(ndim >=4 && ndim <= 5, "Upsample with NHWC format supports tensors with 4 or 5 dims.") + + auto channels_last_memory_format = ndim == 
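// [Editorial sketch, not part of the patch] The acc lambda above is the usual
// "vector body + scalar tail" accumulate: full vector-width chunks go through
// loadu/add/store, and the leftover elements are handled one by one. Plain-loop
// stand-in (kLanes is a made-up constant standing in for the vector width):
#include <cstdint>
#include <cstdio>

void acc(float* gin, const float* gout, int64_t size) {
  constexpr int64_t kLanes = 8;
  int64_t d = 0;
  for (; d < size - (size % kLanes); d += kLanes) {
    for (int64_t k = 0; k < kLanes; ++k) {  // one vector loadu/add/store in the kernel
      gin[d + k] += gout[d + k];
    }
  }
  for (; d < size; ++d) {                   // scalar tail
    gin[d] += gout[d];
  }
}

int main() {
  float gin[11] = {0.f}, gout[11];
  for (int i = 0; i < 11; ++i) gout[i] = 1.f;
  acc(gin, gout, 11);
  std::printf("%g %g\n", gin[0], gin[10]);  // 1 1
}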
4 ? at::MemoryFormat::ChannelsLast : at::MemoryFormat::ChannelsLast3d; + auto grad_output = grad_output_.contiguous(channels_last_memory_format); + auto grad_input = grad_input_.contiguous(channels_last_memory_format); + + auto grad_output_data = grad_output.data_ptr(); + auto grad_input_data = grad_input.data_ptr(); + + auto input_sizes = grad_input.sizes().vec(); + auto output_sizes = grad_output.sizes().vec(); + + int64_t num_batches = input_sizes[0]; + int64_t channels = input_sizes[1]; + int64_t input_depth = (ndim == 5) ? input_sizes[2] : 1; + int64_t output_depth = (ndim == 5) ? output_sizes[2] : 1; + int64_t input_height = (ndim >= 4) ? input_sizes[ndim - 2] : 1; + int64_t output_height = (ndim >= 4) ? output_sizes[ndim - 2] : 1; + int64_t input_width = input_sizes[ndim - 1]; + int64_t output_width = output_sizes[ndim - 1]; + + using Vec = vec::Vectorized; + auto acc = [](scalar_t* gin, scalar_t* gout, int64_t size) { + int64_t d = 0; + for (; d < size - (size % Vec::size()); d += Vec::size()) { + Vec gin_vec = Vec::loadu(gin + d) + Vec::loadu(gout + d); + gin_vec.store(gin + d); + } + for (; d < size; d++) { + gin[d] += gout[d]; + } + }; + + auto loop2d = [&](int64_t begin, int64_t end) { + for (const auto n : c10::irange(begin, end)) { + for (const auto oh : c10::irange(output_height)) { + int64_t ih = nearest_idx_fn(oh, input_height, output_height, scales[0]); + for (const auto ow : c10::irange(output_width)) { + int64_t iw = nearest_idx_fn(ow, input_width, output_width, scales[1]); + scalar_t* grad_output_ptr = grad_output_data + + (n * output_height * output_width + oh * output_width + ow) * channels; + scalar_t* grad_input_ptr = grad_input_data + + (n * input_height * input_width + ih * input_width + iw) * channels; + acc(grad_input_ptr, grad_output_ptr, channels); + } + } + } + }; + + auto loop3d = [&](int64_t begin, int64_t end) { + for (const auto n : c10::irange(begin, end)) { + for (int64_t od = 0; od < output_depth; od++) { + int64_t id = nearest_idx_fn(od, input_depth, output_depth, scales[0]); + for (int64_t oh = 0; oh < output_height; oh++) { + int64_t ih = nearest_idx_fn(oh, input_height, output_height, scales[1]); + for (int64_t ow = 0; ow < output_width; ow++) { + int64_t iw = nearest_idx_fn(ow, input_width, output_width, scales[2]); + scalar_t* grad_output_ptr = grad_output_data + + (n * output_depth * output_height * output_width + + od * output_height * output_width + oh * output_width + ow) * channels; + scalar_t* grad_input_ptr = grad_input_data + + (n * input_depth * input_height * input_width + + id * input_height * input_width + ih * input_width + iw) * channels; + acc(grad_input_ptr, grad_output_ptr, channels); + } + } + } + } + }; + + if (ndim == 4) { + // upsample nearest 2d + at::parallel_for(0, num_batches, 0, loop2d); + } else { + // upsample nearest 3d + TORCH_INTERNAL_ASSERT(ndim == 5); + at::parallel_for(0, num_batches, 0, loop3d); + } + + if (!grad_input_.is_contiguous(channels_last_memory_format)) { + grad_input_.copy_(grad_input); + } +} + +void upsample_nearest1d_backward_kernel_impl( + const Tensor& grad_input, + const Tensor& grad_output, + c10::optional scales_w) { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "upsample_nearest1d_backward", [&] { + cpu_upsample_nearest_backward(grad_input, grad_output, {scales_w}); + }); +} + +void _upsample_nearest_exact1d_backward_kernel_impl( + const Tensor& grad_input, + const Tensor& grad_output, + c10::optional scales_w) { + 
AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "_upsample_nearest_exact1d_backward", [&] { + cpu_upsample_nearest_backward(grad_input, grad_output, {scales_w}); + }); +} + +void upsample_nearest2d_backward_kernel_impl( + const Tensor& grad_input, + const Tensor& grad_output, + c10::optional scales_h, + c10::optional scales_w) { + if (grad_output.is_contiguous(at::MemoryFormat::ChannelsLast)) { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "upsample_nearest2d_backward_cl", [&] { + cpu_upsample_nearest_backward_channels_last(grad_input, grad_output, {scales_h, scales_w}); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "upsample_nearest2d_backward", [&] { + cpu_upsample_nearest_backward(grad_input, grad_output, {scales_h, scales_w}); + }); + } +} + +void _upsample_nearest_exact2d_backward_kernel_impl( + const Tensor& grad_input, + const Tensor& grad_output, + c10::optional scales_h, + c10::optional scales_w) { + if (grad_output.is_contiguous(at::MemoryFormat::ChannelsLast)) { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "_upsample_nearest_exact2d_backward_cl", [&] { + cpu_upsample_nearest_backward_channels_last(grad_input, grad_output, {scales_h, scales_w}); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "_upsample_nearest_exact2d_backward", [&] { + cpu_upsample_nearest_backward(grad_input, grad_output, {scales_h, scales_w}); + }); + } +} + +void upsample_nearest3d_backward_kernel_impl( + const Tensor& grad_input, + const Tensor& grad_output, + c10::optional scales_d, + c10::optional scales_h, + c10::optional scales_w) { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "upsample_nearest3d_backward", [&] { + cpu_upsample_nearest_backward(grad_input, grad_output, {scales_d, scales_h, scales_w}); + }); +} + +void _upsample_nearest_exact3d_backward_kernel_impl( + const Tensor& grad_input, + const Tensor& grad_output, + c10::optional scales_d, + c10::optional scales_h, + c10::optional scales_w) { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "_upsample_nearest_exact3d_backward", [&] { + cpu_upsample_nearest_backward(grad_input, grad_output, {scales_d, scales_h, scales_w}); + }); +} template void cpu_upsample_linear_backward( @@ -156,12 +410,143 @@ void cpu_upsample_linear_backward( } } +template +void cpu_upsample_linear_backward_channels_last( + const Tensor& grad_input_, + const Tensor& grad_output_, + bool align_corners, + const scale_type& scales) { + TORCH_CHECK(grad_input_.dtype() == grad_output_.dtype(), "expected dtype ", grad_output_.dtype(), + " for `grad_input` but got dtype ", grad_input_.dtype()); + + auto ndim = grad_output_.ndimension(); + TORCH_CHECK(ndim >=4 && ndim <= 5, "Upsample with NHWC format supports tensors with 4 or 5 dims.") + + auto channels_last_memory_format = ndim == 4 ? 
at::MemoryFormat::ChannelsLast : at::MemoryFormat::ChannelsLast3d; + auto grad_output = grad_output_.contiguous(channels_last_memory_format); + auto grad_input = grad_input_.contiguous(channels_last_memory_format); + + auto grad_output_data = grad_output.data_ptr(); + auto grad_input_data = grad_input.data_ptr(); + + auto input_sizes = grad_input.sizes().vec(); + auto output_sizes = grad_output.sizes().vec(); + + int64_t num_batches = input_sizes[0]; + int64_t channels = input_sizes[1]; + int64_t input_depth = (ndim == 5) ? input_sizes[2] : 1; + int64_t output_depth = (ndim == 5) ? output_sizes[2] : 1; + int64_t input_height = (ndim >= 4) ? input_sizes[ndim - 2] : 1; + int64_t output_height = (ndim >= 4) ? output_sizes[ndim - 2] : 1; + int64_t input_width = input_sizes[ndim - 1]; + int64_t output_width = output_sizes[ndim - 1]; + + using accscalar_t = at::acc_type; + using Vec = vec::Vectorized; + auto acc = [](scalar_t* gin, scalar_t* gout, accscalar_t w, int64_t size) { + int64_t d = 0; + for (; d < size - (size % Vec::size()); d += Vec::size()) { + Vec gin_vec = Vec::loadu(gin + d) + Vec(w) * Vec::loadu(gout + d); + gin_vec.store(gin + d); + } + for (; d < size; d++) { + gin[d] += w * gout[d]; + } + }; + + auto loop2d = [&](int64_t begin, int64_t end) { + const scalar_t height_scale = area_pixel_compute_scale( + input_height, output_height, align_corners, scales[0]); + const scalar_t width_scale = area_pixel_compute_scale( + input_width, output_width, align_corners, scales[1]); + + auto input_indexr = [=](int64_t n, int64_t h, int64_t w){ + return grad_input_data + (n * input_height * input_width + h * input_width + w) * channels; + }; + + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + int64_t ih0, ih1, iw0, iw1; + scalar_t h0lambda, h1lambda, w0lambda, w1lambda; + for (const auto n : c10::irange(begin, end)) { + for (const auto oh : c10::irange(output_height)) { + compute_source_index_and_lambda( + ih0, ih1, h0lambda, h1lambda, height_scale, oh, input_height, output_height, align_corners); + for (const auto ow : c10::irange(output_width)) { + compute_source_index_and_lambda( + iw0, iw1, w0lambda, w1lambda, width_scale, ow, input_width, output_width, align_corners); + scalar_t* grad_output_ptr = grad_output_data + + (n * output_height * output_width + oh * output_width + ow) * channels; + acc(input_indexr(n, ih0, iw0), grad_output_ptr, h0lambda * w0lambda, channels); /* i00 */ + acc(input_indexr(n, ih0, iw1), grad_output_ptr, h0lambda * w1lambda, channels); /* i01 */ + acc(input_indexr(n, ih1, iw0), grad_output_ptr, h1lambda * w0lambda, channels); /* i10 */ + acc(input_indexr(n, ih1, iw1), grad_output_ptr, h1lambda * w1lambda, channels); /* i11 */ + } + } + } + }; + + auto loop3d = [&](int64_t begin, int64_t end) { + const scalar_t depth_scale = area_pixel_compute_scale( + input_depth, output_depth, align_corners, scales[0]); + const scalar_t height_scale = area_pixel_compute_scale( + input_height, output_height, align_corners, scales[1]); + const scalar_t width_scale = area_pixel_compute_scale( + input_width, output_width, align_corners, scales[2]); + + auto input_indexr = [=](int64_t n, int64_t d, int64_t h, int64_t w) { + return grad_input_data + (n * input_depth * input_height * input_width + + d * input_height * input_width + h * input_width + w) * channels; + }; + + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + int64_t id0, id1, ih0, ih1, iw0, iw1; + scalar_t d0lambda, d1lambda, h0lambda, h1lambda, w0lambda, w1lambda; + for (const auto n : c10::irange(begin, end)) { 
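// [Editorial sketch, not part of the patch] compute_source_index_and_lambda, called per
// output coordinate in the loops above, yields the two neighbouring input indices and
// their blend weights (the lambdas sum to 1). Hedged standalone version, ignoring the
// same-size fast path of the real helper:
#include <algorithm>
#include <cstdint>
#include <cstdio>

void source_index_and_lambda(int64_t& i0, int64_t& i1, double& l0, double& l1,
                             double scale, int64_t out_i, int64_t in_size,
                             bool align_corners) {
  double real = align_corners ? scale * out_i
                              : std::max(scale * (out_i + 0.5) - 0.5, 0.0);
  i0 = static_cast<int64_t>(real);
  i1 = std::min(i0 + 1, in_size - 1);
  l1 = real - i0;   // weight of the upper neighbour
  l0 = 1.0 - l1;    // weight of the lower neighbour
}

int main() {
  int64_t i0 = 0, i1 = 0;
  double l0 = 0.0, l1 = 0.0;
  source_index_and_lambda(i0, i1, l0, l1, /*scale=*/0.5, /*out_i=*/3, /*in_size=*/4, false);
  std::printf("i0=%lld i1=%lld l0=%g l1=%g\n", (long long)i0, (long long)i1, l0, l1);
  // i0=1 i1=2 l0=0.75 l1=0.25
}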
+ for (const auto od : c10::irange(output_depth)) { + compute_source_index_and_lambda( + id0, id1, d0lambda, d1lambda, depth_scale, od, input_depth, output_depth, align_corners); + for (const auto oh : c10::irange(output_height)) { + compute_source_index_and_lambda( + ih0, ih1, h0lambda, h1lambda, height_scale, oh, input_height, output_height, align_corners); + for (const auto ow : c10::irange(output_width)) { + compute_source_index_and_lambda( + iw0, iw1, w0lambda, w1lambda, width_scale, ow, input_width, output_width, align_corners); + scalar_t* grad_output_ptr = grad_output_data + (n * output_depth * output_height * output_width + + od * output_height * output_width + oh * output_width + ow) * channels; + acc(input_indexr(n, id0, ih0, iw0), grad_output_ptr, d0lambda * h0lambda * w0lambda, channels); /* i000 */ + acc(input_indexr(n, id0, ih0, iw1), grad_output_ptr, d0lambda * h0lambda * w1lambda, channels); /* i001 */ + acc(input_indexr(n, id0, ih1, iw0), grad_output_ptr, d0lambda * h1lambda * w0lambda, channels); /* i010 */ + acc(input_indexr(n, id0, ih1, iw1), grad_output_ptr, d0lambda * h1lambda * w1lambda, channels); /* i011 */ + acc(input_indexr(n, id1, ih0, iw0), grad_output_ptr, d1lambda * h0lambda * w0lambda, channels); /* i100 */ + acc(input_indexr(n, id1, ih0, iw1), grad_output_ptr, d1lambda * h0lambda * w1lambda, channels); /* i101 */ + acc(input_indexr(n, id1, ih1, iw0), grad_output_ptr, d1lambda * h1lambda * w0lambda, channels); /* i110 */ + acc(input_indexr(n, id1, ih1, iw1), grad_output_ptr, d1lambda * h1lambda * w1lambda, channels); /* i111 */ + } + } + } + } + }; + + if (ndim == 4) { + // upsample bilinear 2d + at::parallel_for(0, num_batches, 0, loop2d); + } else { + // upsample trilinear 3d + TORCH_INTERNAL_ASSERT(ndim == 5); + at::parallel_for(0, num_batches, 0, loop3d); + } + + if (!grad_input_.is_contiguous(channels_last_memory_format)) { + grad_input_.copy_(grad_input); + } +} + void upsample_linear1d_backward_kernel_impl( const Tensor& grad_input, const Tensor& grad_output, bool align_corners, c10::optional scales_w) { - AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "upsample_linear1d_backward", [&] { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "upsample_linear1d_backward", [&] { cpu_upsample_linear_backward(grad_input, grad_output, align_corners, {scales_w}); }); } @@ -172,9 +557,15 @@ void upsample_bilinear2d_backward_kernel_impl( bool align_corners, c10::optional scales_h, c10::optional scales_w) { - AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "upsample_bilinear2d_backward", [&] { - cpu_upsample_linear_backward(grad_input, grad_output, align_corners, {scales_h, scales_w}); - }); + if (grad_output.is_contiguous(at::MemoryFormat::ChannelsLast)) { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "upsample_bilinear2d_backward_channels_last", [&] { + cpu_upsample_linear_backward_channels_last(grad_input, grad_output, align_corners, {scales_h, scales_w}); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "upsample_bilinear2d_backward", [&] { + cpu_upsample_linear_backward(grad_input, grad_output, align_corners, {scales_h, scales_w}); + }); + } } void upsample_trilinear3d_backward_kernel_impl( @@ -184,13 +575,26 @@ void upsample_trilinear3d_backward_kernel_impl( c10::optional scales_d, c10::optional scales_h, c10::optional scales_w) { - AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "upsample_trilinear3d_backward", 
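// [Editorial sketch, not part of the patch] weight_norm_first_dim_kernel above fuses the
// norm reduction and the rescale: norm[i] = ||v_i|| over the N trailing elements, then
// w_i = v_i * (g[i] / norm[i]). Scalar reference version of the same math, accumulating
// in a wider type in the spirit of the accscalar_t reduction:
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

void weight_norm_first_dim(std::vector<float>& w, std::vector<float>& norm,
                           const std::vector<float>& v, const std::vector<float>& g,
                           int64_t M, int64_t N) {
  for (int64_t i = 0; i < M; ++i) {
    double sum = 0.0;
    for (int64_t j = 0; j < N; ++j) {
      sum += static_cast<double>(v[i * N + j]) * v[i * N + j];
    }
    norm[i] = static_cast<float>(std::sqrt(sum));
    float a = g[i] / norm[i];                       // g / ||v_i||
    for (int64_t j = 0; j < N; ++j) {
      w[i * N + j] = v[i * N + j] * a;
    }
  }
}

int main() {
  std::vector<float> v{3.f, 4.f}, g{10.f}, w(2), norm(1);
  weight_norm_first_dim(w, norm, v, g, 1, 2);
  std::printf("norm=%g w=(%g, %g)\n", norm[0], w[0], w[1]);  // norm=5 w=(6, 8)
}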
[&] { - cpu_upsample_linear_backward(grad_input, grad_output, align_corners, {scales_d, scales_h, scales_w}); - }); + if (grad_output.is_contiguous(at::MemoryFormat::ChannelsLast3d)) { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "upsample_trilinear3d_backward_channels_last", [&] { + cpu_upsample_linear_backward_channels_last(grad_input, grad_output, align_corners, {scales_d, scales_h, scales_w}); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "upsample_trilinear3d_backward", [&] { + cpu_upsample_linear_backward(grad_input, grad_output, align_corners, {scales_d, scales_h, scales_w}); + }); + } } } // anonymous namespace +REGISTER_DISPATCH(upsample_nearest1d_backward_kernel, &upsample_nearest1d_backward_kernel_impl); +REGISTER_DISPATCH(_upsample_nearest_exact1d_backward_kernel, &_upsample_nearest_exact1d_backward_kernel_impl); +REGISTER_DISPATCH(upsample_nearest2d_backward_kernel, &upsample_nearest2d_backward_kernel_impl); +REGISTER_DISPATCH(_upsample_nearest_exact2d_backward_kernel, &_upsample_nearest_exact2d_backward_kernel_impl); +REGISTER_DISPATCH(upsample_nearest3d_backward_kernel, &upsample_nearest3d_backward_kernel_impl); +REGISTER_DISPATCH(_upsample_nearest_exact3d_backward_kernel, &_upsample_nearest_exact3d_backward_kernel_impl); + REGISTER_DISPATCH(upsample_linear1d_backward_kernel, &upsample_linear1d_backward_kernel_impl); REGISTER_DISPATCH(upsample_bilinear2d_backward_kernel, &upsample_bilinear2d_backward_kernel_impl); REGISTER_DISPATCH(upsample_trilinear3d_backward_kernel, &upsample_trilinear3d_backward_kernel_impl); diff --git a/aten/src/ATen/native/cpu/WeightNormKernel.cpp b/aten/src/ATen/native/cpu/WeightNormKernel.cpp new file mode 100644 index 000000000000..dfec0a49aeb1 --- /dev/null +++ b/aten/src/ATen/native/cpu/WeightNormKernel.cpp @@ -0,0 +1,437 @@ +#include + +#include +#include +#include +#include +#include +#include + +namespace at { namespace native { + +namespace { + +template +void weight_norm_first_dim_kernel( + Tensor& w, + Tensor& norm, + const Tensor& v, + const Tensor& g, + int64_t M, int64_t N) { + const auto v_data = v.data_ptr(); + const auto g_data = g.data_ptr(); + auto w_data = w.data_ptr(); + auto norm_data = norm.data_ptr(); + + using Vec = vec::Vectorized; + at::parallel_for(0, M, 1, [&](int64_t begin, int64_t end) { + for (const auto i : c10::irange(begin, end)) { + accscalar_t norm_val = vec::map_reduce_all( + [](Vec x) { return x * x; }, + [](Vec x, Vec y) { return x + y; }, + v_data + i * N, + N); + norm_val = std::sqrt(norm_val); + norm_data[i] = norm_val; + + accscalar_t a = g_data[i] / norm_val; + vec::map( + [a](Vec x) { return x * Vec(a); }, + w_data + i * N, + v_data + i * N, + N); + } + }); +} + +template +inline void sum_norm_per_row( + scalar_t* out_ptr, + const scalar_t* v_ptr, + int64_t size) { + using Vec = vec::Vectorized; + vec::map2( + [](Vec out, Vec v) { return out + v * v; }, + out_ptr, + out_ptr, + v_ptr, + size); +} + +inline void sum_norm_per_row( + float* out_ptr, + const BFloat16* v_ptr, + int64_t size) { + using bVec = vec::Vectorized; + using fVec = vec::Vectorized; + int64_t d = 0; + for (; d < size - (size % bVec::size()); d += bVec::size()) { + bVec v_bvec = bVec::loadu(v_ptr + d); + fVec v_fvec0, v_fvec1; + std::tie(v_fvec0, v_fvec1) = convert_bfloat16_float(v_bvec); + + fVec out_fvec0 = fVec::loadu(out_ptr + d) + v_fvec0 * v_fvec0; + fVec out_fvec1 = fVec::loadu(out_ptr + d + fVec::size()) + v_fvec1 * v_fvec1; + 
out_fvec0.store(out_ptr + d); + out_fvec1.store(out_ptr + d + fVec::size()); + } + for(; d < size; ++d) { + float v_val = float(v_ptr[d]); + out_ptr[d] += v_val * v_val; + } +} + +template +inline void apply_norm_per_row( + scalar_t* w_ptr, + const scalar_t* v_ptr, + const scalar_t* a_ptr, + int64_t size) { + using Vec = vec::Vectorized; + vec::map2( + [](Vec v, Vec a) { return v * a; }, + w_ptr, + v_ptr, + a_ptr, + size); +} + +inline void apply_norm_per_row( + BFloat16* w_ptr, + const BFloat16* v_ptr, + const float* a_ptr, + int64_t size) { + using bVec = vec::Vectorized; + using fVec = vec::Vectorized; + int64_t d = 0; + for (; d < size - (size % bVec::size()); d += bVec::size()) { + bVec v_bvec = bVec::loadu(v_ptr + d); + fVec v_fvec0, v_fvec1; + std::tie(v_fvec0, v_fvec1) = convert_bfloat16_float(v_bvec); + + fVec w_fvec0 = fVec::loadu(a_ptr + d) * v_fvec0; + fVec w_fvec1 = fVec::loadu(a_ptr + d + fVec::size()) * v_fvec1; + bVec w_bvec = convert_float_bfloat16(w_fvec0, w_fvec1); + w_bvec.store(w_ptr + d); + } + for(; d < size; ++d) { + w_ptr[d] = float(v_ptr[d]) * a_ptr[d]; + } +} + +template +void weight_norm_last_dim_kernel( + Tensor& w, + Tensor& norm, + const Tensor& v, + const Tensor& g, + int64_t M, int64_t N) { + const auto v_data = v.data_ptr(); + const auto g_data = g.data_ptr(); + auto w_data = w.data_ptr(); + auto norm_data = norm.data_ptr(); + + int num_threads = at::get_num_threads(); + Tensor buffer = at::empty({num_threads, N}, norm.options()).zero_(); + auto buffer_data = buffer.data_ptr(); + + // vertical parallel reduction + at::parallel_for(0, M, 1, [&](int64_t begin, int64_t end) { + int tid = at::get_thread_num(); + TORCH_CHECK(tid < num_threads, "expect thread id smaller than ", num_threads, ", got thread id ", tid); + auto buffer_ptr = buffer_data + tid * N; + for (const auto i : c10::irange(begin, end)) { + sum_norm_per_row(buffer_ptr, v_data + i * N, N); + } + }); + + for (const auto j : c10::irange(N)) { + accscalar_t sum = 0; + for (const auto t : c10::irange(num_threads)) { + sum += buffer_data[t * N + j]; + } + norm_data[j] = std::sqrt(sum); + } + + // reuse the first row of buffer to store g / norm + vec::convert(g_data, buffer_data, N); + using Vec = vec::Vectorized; + vec::map2( + [](Vec g, Vec norm) { return g / norm; }, + buffer_data, + buffer_data, + norm_data, + N); + + // apply w = v * (g/norm) + at::parallel_for(0, M, 1, [&](int64_t begin, int64_t end) { + for (const auto i : c10::irange(begin, end)) { + apply_norm_per_row(w_data + i * N, v_data + i * N, buffer_data, N); + } + }); +} + +template +void weight_norm_backward_first_dim_kernel( + Tensor& grad_v, + Tensor& grad_g, + const Tensor& grad_w, + const Tensor& saved_v, + const Tensor& saved_g, + const Tensor& saved_norm, + int64_t M, int64_t N) { + const auto grad_w_data = grad_w.data_ptr(); + const auto saved_v_data = saved_v.data_ptr(); + const auto saved_g_data = saved_g.data_ptr(); + const auto saved_norm_data = saved_norm.data_ptr(); + auto grad_v_data = grad_v.data_ptr(); + auto grad_g_data = grad_g.data_ptr(); + + using Vec = vec::Vectorized; + at::parallel_for(0, M, 1, [&](int64_t begin, int64_t end) { + for (const auto i : c10::irange(begin, end)) { + accscalar_t per_dim_sum_val = vec::map2_reduce_all( + [](Vec grad_w, Vec saved_v) { return grad_w * saved_v; }, + [](Vec x, Vec y) { return x + y; }, + grad_w_data + i * N, + saved_v_data + i * N, + N); + + accscalar_t saved_norm_val = saved_norm_data[i]; + accscalar_t saved_g_val = accscalar_t(saved_g_data[i]); + accscalar_t grad_g_val 
= per_dim_sum_val / saved_norm_val; + + // grad_g = sum / norm + // grad_v = (g / norm) * (grad_w - v * (sum / norm^2)) + // let a = g /norm + // b = a * grad_g / norm + // grad_v = a * grad_w - b * v + grad_g_data[i] = scalar_t(grad_g_val); + accscalar_t a = saved_g_val / saved_norm_val; + accscalar_t b = a * grad_g_val / saved_norm_val; + + vec::map2( + [a, b](Vec grad_w, Vec v) { return Vec(a) * grad_w - Vec(b) * v; }, + grad_v_data + i * N, + grad_w_data + i * N, + saved_v_data + i * N, + N); + } + }); +} + +template +inline void sum_product_per_row( + scalar_t* out_ptr, + const scalar_t* grad_w_ptr, + const scalar_t* v_ptr, + int64_t size) { + using Vec = vec::Vectorized; + vec::map3( + [](Vec out, Vec grad_w, Vec v) { return out + grad_w * v; }, + out_ptr, + out_ptr, + grad_w_ptr, + v_ptr, + size); +} + +inline void sum_product_per_row( + float* out_ptr, + const BFloat16* grad_w_ptr, + const BFloat16* v_ptr, + int64_t size) { + using bVec = vec::Vectorized; + using fVec = vec::Vectorized; + int64_t d = 0; + for (; d < size - (size % bVec::size()); d += bVec::size()) { + bVec grad_w_bvec = bVec::loadu(grad_w_ptr + d); + fVec grad_w_fvec0, grad_w_fvec1; + std::tie(grad_w_fvec0, grad_w_fvec1) = convert_bfloat16_float(grad_w_bvec); + bVec v_bvec = bVec::loadu(v_ptr + d); + fVec v_fvec0, v_fvec1; + std::tie(v_fvec0, v_fvec1) = convert_bfloat16_float(v_bvec); + + fVec out_fvec0 = fVec::loadu(out_ptr + d) + grad_w_fvec0 * v_fvec0; + fVec out_fvec1 = fVec::loadu(out_ptr + d + fVec::size()) + grad_w_fvec1 * v_fvec1; + out_fvec0.store(out_ptr + d); + out_fvec1.store(out_ptr + d + fVec::size()); + } + for(; d < size; ++d) { + float grad_w_val = float(grad_w_ptr[d]); + float v_val = float(v_ptr[d]); + out_ptr[d] += grad_w_val * v_val; + } +} + +template +inline void apply_per_row_backward( + scalar_t* grad_v_ptr, + const scalar_t* grad_w_ptr, + const scalar_t* v_ptr, + const scalar_t* a_ptr, + const scalar_t* b_ptr, + int64_t size) { + using Vec = vec::Vectorized; + vec::map4( + [](Vec grad_w, Vec v, Vec a, Vec b) { return a * grad_w - b * v; }, + grad_v_ptr, + grad_w_ptr, + v_ptr, + a_ptr, + b_ptr, + size); +} + +inline void apply_per_row_backward( + BFloat16* grad_v_ptr, + const BFloat16* grad_w_ptr, + const BFloat16* v_ptr, + const float* a_ptr, + const float* b_ptr, + int64_t size) { + using bVec = vec::Vectorized; + using fVec = vec::Vectorized; + int64_t d = 0; + for (; d < size - (size % bVec::size()); d += bVec::size()) { + bVec grad_w_bvec = bVec::loadu(grad_w_ptr + d); + fVec grad_w_fvec0, grad_w_fvec1; + std::tie(grad_w_fvec0, grad_w_fvec1) = convert_bfloat16_float(grad_w_bvec); + bVec v_bvec = bVec::loadu(v_ptr + d); + fVec v_fvec0, v_fvec1; + std::tie(v_fvec0, v_fvec1) = convert_bfloat16_float(v_bvec); + + fVec grad_v_fvec0 = fVec::loadu(a_ptr + d) * grad_w_fvec0 - fVec::loadu(b_ptr + d) * v_fvec0; + fVec grad_v_fvec1 = fVec::loadu(a_ptr + d + fVec::size()) * grad_w_fvec1 + - fVec::loadu(b_ptr + d + fVec::size()) * v_fvec1; + bVec grad_v_bvec = convert_float_bfloat16(grad_v_fvec0, grad_v_fvec1); + grad_v_bvec.store(grad_v_ptr + d); + } + for(; d < size; ++d) { + grad_v_ptr[d] = float(grad_w_ptr[d]) * a_ptr[d] - float(v_ptr[d]) * b_ptr[d]; + } +} + +template +void weight_norm_backward_last_dim_kernel( + Tensor& grad_v, + Tensor& grad_g, + const Tensor& grad_w, + const Tensor& saved_v, + const Tensor& saved_g, + const Tensor& saved_norm, + int64_t M, int64_t N) { + const auto grad_w_data = grad_w.data_ptr(); + const auto saved_v_data = saved_v.data_ptr(); + const auto saved_g_data = 
saved_g.data_ptr(); + const auto saved_norm_data = saved_norm.data_ptr(); + auto grad_v_data = grad_v.data_ptr(); + auto grad_g_data = grad_g.data_ptr(); + + int num_threads = at::get_num_threads(); + Tensor buffer = at::empty({num_threads, N}, saved_norm.options()).zero_(); + auto buffer_data = buffer.data_ptr(); + + // vertical parallel reduction + at::parallel_for(0, M, 1, [&](int64_t begin, int64_t end) { + int tid = at::get_thread_num(); + TORCH_CHECK(tid < num_threads, "expect thread id smaller than ", num_threads, ", got thread id ", tid); + auto buffer_ptr = buffer_data + tid * N; + for (const auto i : c10::irange(begin, end)) { + sum_product_per_row(buffer_ptr, grad_w_data + i * N, saved_v_data + i * N, N); + } + }); + + // store result on the first row of buffer + for (const auto j : c10::irange(N)) { + accscalar_t sum = 0; + for (const auto t : c10::irange(num_threads)) { + sum += buffer_data[t * N + j]; + } + buffer_data[j] = sum; + } + + accscalar_t* per_dim_sum = buffer_data; + accscalar_t* a = buffer_data + N; + accscalar_t* b = buffer_data + 2 * N; + + // a = g /norm + // b = a * grad_g / norm + for (const auto j : c10::irange(N)) { + accscalar_t saved_norm_val = saved_norm_data[j]; + accscalar_t saved_g_val = accscalar_t(saved_g_data[j]); + accscalar_t grad_g_val = per_dim_sum[j] / saved_norm_val; + grad_g_data[j] = scalar_t(grad_g_val); + + a[j] = saved_g_val / saved_norm_val; + b[j] = a[j] * grad_g_val / saved_norm_val; + } + + // apply grad_v = a * grad_w - b * v + at::parallel_for(0, M, 1, [&](int64_t begin, int64_t end) { + for (const auto i : c10::irange(begin, end)) { + apply_per_row_backward( + grad_v_data + i * N, + grad_w_data + i * N, + saved_v_data + i * N, + a, + b, + N); + } + }); +} + +void weight_norm_kernel( + Tensor& w, + Tensor& norm, + const Tensor& v, + const Tensor& g, + int64_t dim) { + TORCH_INTERNAL_ASSERT(dim == 0 || dim == v.dim() - 1, + "fused kernels can only be applied for first or last dim"); + AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, v.scalar_type(), + "weight_norm_kernel", [&]() { + using accscalar_t = vec::vec_scalar_t; + if (dim == 0) { + int64_t M = v.size(0); + int64_t N = v.numel() / M; + weight_norm_first_dim_kernel(w, norm, v, g, M, N); + } else { + int64_t N = v.size(-1); + int64_t M = v.numel() / N; + weight_norm_last_dim_kernel(w, norm, v, g, M, N); + } + }); +} + +void weight_norm_backward_kernel( + Tensor& grad_v, + Tensor& grad_g, + const Tensor& grad_w, + const Tensor& saved_v, + const Tensor& saved_g, + const Tensor& saved_norm, + int64_t dim) { + TORCH_INTERNAL_ASSERT(dim == 0 || dim == saved_v.dim() - 1, + "fused kernels can only be applied for first or last dim"); + AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, saved_v.scalar_type(), + "weight_norm_backward_kernel", [&]() { + using accscalar_t = vec::vec_scalar_t; + if (dim == 0) { + int64_t M = saved_v.size(0); + int64_t N = saved_v.numel() / M; + weight_norm_backward_first_dim_kernel(grad_v, grad_g, grad_w, saved_v, saved_g, saved_norm, M, N); + } else { + int64_t N = saved_v.size(-1); + int64_t M = saved_v.numel() / N; + weight_norm_backward_last_dim_kernel(grad_v, grad_g, grad_w, saved_v, saved_g, saved_norm, M, N); + } + }); +} + +} // anonymous namespace + +REGISTER_DISPATCH(weight_norm_stub, &weight_norm_kernel); +REGISTER_DISPATCH(weight_norm_backward_stub, &weight_norm_backward_kernel); + +}} // at::native diff --git a/aten/src/ATen/native/cpu/WeightNormKernel.h b/aten/src/ATen/native/cpu/WeightNormKernel.h new file mode 100644 index 
000000000000..1f5ad65b52d9 --- /dev/null +++ b/aten/src/ATen/native/cpu/WeightNormKernel.h @@ -0,0 +1,15 @@ +#pragma once + +#include +#include + +namespace at { namespace native { + +using weight_norm_fn = void(*)(Tensor&, Tensor&, const Tensor&, const Tensor&, int64_t); +using weight_norm_backward_fn = void(*)( + Tensor&, Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, int64_t); + +DECLARE_DISPATCH(weight_norm_fn, weight_norm_stub); +DECLARE_DISPATCH(weight_norm_backward_fn, weight_norm_backward_stub); + +}} // namespace at::native diff --git a/aten/src/ATen/native/cpu/batch_norm_kernel.cpp b/aten/src/ATen/native/cpu/batch_norm_kernel.cpp index 302edc1e1d0a..ad277a278fa2 100644 --- a/aten/src/ATen/native/cpu/batch_norm_kernel.cpp +++ b/aten/src/ATen/native/cpu/batch_norm_kernel.cpp @@ -1,36 +1,44 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include +#include #include -#include #include #include #include #include #include +#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif + namespace at { namespace native { namespace { using namespace vec; -template +template void batch_norm_cpu_collect_linear_and_constant_terms( - scalar_t* alpha, scalar_t* beta, int64_t n_channel, + accscalar_t* alpha, accscalar_t* beta, int64_t n_channel, const Tensor& weight /* optional */, const Tensor& bias /* optional */, const Tensor& save_mean, const Tensor& save_invstd, const Tensor& running_mean, const Tensor& running_var, bool train, double eps) { - const scalar_t* weight_data = weight.defined() ? weight.data_ptr() : nullptr; - const scalar_t* bias_data = bias.defined() ? bias.data_ptr() : nullptr; + const param_t* weight_data = weight.defined() ? weight.data_ptr() : nullptr; + const param_t* bias_data = bias.defined() ? bias.data_ptr() : nullptr; - auto save_mean_a = conditional_accessor_1d(save_mean); - auto save_invstd_a = conditional_accessor_1d(save_invstd); - auto running_mean_a = conditional_accessor_1d(running_mean); - auto running_var_a = conditional_accessor_1d(running_var); + auto save_mean_a = conditional_accessor_1d(save_mean); + auto save_invstd_a = conditional_accessor_1d(save_invstd); + auto running_mean_a = conditional_accessor_1d(running_mean); + auto running_var_a = conditional_accessor_1d(running_var); /// Collect the linear and constant terms regarding the input. /// output(n, c, h, w) @@ -44,16 +52,16 @@ void batch_norm_cpu_collect_linear_and_constant_terms( /// Note that this is only a good idea if (input_size >> c), in degenerate /// cases where image_size == 1 && batch_size == 1, it is slow. for (const auto c : c10::irange(n_channel)) { - scalar_t mean, invstd; + accscalar_t mean, invstd; if (train) { mean = save_mean_a[c]; invstd = save_invstd_a[c]; } else { mean = running_mean_a[c]; - invstd = 1 / std::sqrt(running_var_a[c] + static_cast(eps)); + invstd = 1 / std::sqrt(running_var_a[c] + static_cast(eps)); } - scalar_t weight_v = weight_data ? weight_data[c] : 1; - scalar_t bias_v = bias_data ? bias_data[c] : 0; + param_t weight_v = weight_data ? weight_data[c] : param_t(1); + param_t bias_v = bias_data ? 
bias_data[c] : param_t(0); alpha[c] = invstd * weight_v; beta[c] = bias_v - mean * alpha[c]; } @@ -75,7 +83,7 @@ void batch_norm_cpu_contiguous_impl(Tensor& output, const Tensor& input, scalar_t* alpha_data = alpha.data_ptr(); scalar_t* beta_data = beta.data_ptr(); - batch_norm_cpu_collect_linear_and_constant_terms( + batch_norm_cpu_collect_linear_and_constant_terms( alpha_data, beta_data, n_channel, weight, bias, save_mean, save_invstd, running_mean, running_var, train, eps); @@ -84,62 +92,37 @@ void batch_norm_cpu_contiguous_impl(Tensor& output, const Tensor& input, // Apply the linear terms to the input, // output(n, c, h, w) = input(n, c, h, w) * alpha(c) + beta(c) - if (image_size != 1) { - const int64_t loop_size = image_size - (image_size % Vec::size()); - at::parallel_for(0, n_batch * n_channel, 1, [&](int64_t begin, int64_t end) { - int64_t n = 0; - int64_t c = 0; - data_index_init(begin, n, n_batch, c, n_channel); + const int64_t loop_size = image_size - (image_size % Vec::size()); + at::parallel_for(0, n_batch * n_channel, 1, [&](int64_t begin, int64_t end) { + int64_t n = 0; + int64_t c = 0; + data_index_init(begin, n, n_batch, c, n_channel); - for (const auto i : c10::irange(begin, end)) { - const Vec alpha_vec(alpha_data[c]); - const Vec beta_vec(beta_data[c]); - int64_t offset = i * image_size; - int64_t d = 0; - for (; d < loop_size; d += Vec::size()) { - Vec data_vec = Vec::loadu(input_data + offset + d); - Vec output_vec = data_vec * alpha_vec + beta_vec; - output_vec.store(output_data + offset + d); - } - if (image_size - d > 0) { - Vec data_vec = Vec::loadu(input_data + offset + d, image_size - d); - Vec output_vec = data_vec * alpha_vec + beta_vec; - output_vec.store(output_data + offset + d, image_size - d); - } - // move on to next index - data_index_step(n, n_batch, c, n_channel); + for (const auto i : c10::irange(begin, end)) { + const Vec alpha_vec(alpha_data[c]); + const Vec beta_vec(beta_data[c]); + int64_t offset = i * image_size; + int64_t d = 0; + for (; d < loop_size; d += Vec::size()) { + Vec data_vec = Vec::loadu(input_data + offset + d); + Vec output_vec = data_vec * alpha_vec + beta_vec; + output_vec.store(output_data + offset + d); } - }); - } else { - // image_size == 1 - const int64_t loop_size = n_channel - (n_channel % Vec::size()); - at::parallel_for(0, n_batch, 1, [&](int64_t begin, int64_t end) { - for (const auto n : c10::irange(begin, end)) { - int64_t offset = n * n_channel; - int64_t d = 0; - for (; d < loop_size; d += Vec::size()) { - Vec alpha_vec = Vec::loadu(alpha_data + d); - Vec beta_vec = Vec::loadu(beta_data + d); - Vec data_vec = Vec::loadu(input_data + offset + d); - Vec output_vec = data_vec * alpha_vec + beta_vec; - output_vec.store(output_data + offset + d); - } - if (n_channel - d > 0) { - Vec alpha_vec = Vec::loadu(alpha_data + d, n_channel - d); - Vec beta_vec = Vec::loadu(beta_data + d, n_channel - d); - Vec data_vec = Vec::loadu(input_data + offset + d, n_channel - d); - Vec output_vec = data_vec * alpha_vec + beta_vec; - output_vec.store(output_data + offset + d, n_channel - d); - } + if (image_size - d > 0) { + Vec data_vec = Vec::loadu(input_data + offset + d, image_size - d); + Vec output_vec = data_vec * alpha_vec + beta_vec; + output_vec.store(output_data + offset + d, image_size - d); } - }); - } + // move on to next index + data_index_step(n, n_batch, c, n_channel); + } + }); } template void batch_norm_cpu_channels_last_impl(Tensor& output, const Tensor& input, const Tensor& weight, const Tensor& bias, const Tensor& 
save_mean, const Tensor& save_invstd, - const Tensor& running_mean, const Tensor& runnning_var, bool train, double eps) { + const Tensor& running_mean, const Tensor& running_var, bool train, double eps) { using Vec = Vectorized; int64_t n_batch = input.size(0); @@ -151,9 +134,9 @@ void batch_norm_cpu_channels_last_impl(Tensor& output, const Tensor& input, scalar_t* alpha_data = alpha.data_ptr(); scalar_t* beta_data = beta.data_ptr(); - batch_norm_cpu_collect_linear_and_constant_terms( + batch_norm_cpu_collect_linear_and_constant_terms( alpha_data, beta_data, n_channel, weight, bias, - save_mean, save_invstd, running_mean, runnning_var, train, eps); + save_mean, save_invstd, running_mean, running_var, train, eps); scalar_t* output_data = output.data_ptr(); const scalar_t* input_data = input.data_ptr(); @@ -609,16 +592,660 @@ void batch_norm_cpu_backward_channels_last_impl(Tensor& grad_input, Tensor& grad } } +/// bfloat16 kernels +template<> +void batch_norm_cpu_contiguous_impl(Tensor& output, const Tensor& input, + const Tensor& weight, const Tensor& bias, const Tensor& save_mean, const Tensor& save_invstd, + const Tensor& running_mean, const Tensor& running_var, bool train, double eps) { + + using bVec = Vectorized; + using fVec = Vectorized; + int64_t n_batch = input.size(0); + int64_t n_channel = input.size(1); + int64_t image_size = input.numel() / n_batch / n_channel; + + // use float as acc type + Tensor alpha = at::empty({n_channel}, input.options().dtype(kFloat)); + Tensor beta = at::empty({n_channel}, input.options().dtype(kFloat)); + float* alpha_data = alpha.data_ptr(); + float* beta_data = beta.data_ptr(); + + const bool mixed_type = is_mixed_type(input, weight, bias, save_mean, save_invstd, running_mean, running_var); + if (mixed_type) { + batch_norm_cpu_collect_linear_and_constant_terms( + alpha_data, beta_data, n_channel, weight, bias, + save_mean, save_invstd, running_mean, running_var, train, eps); + } else { + batch_norm_cpu_collect_linear_and_constant_terms( + alpha_data, beta_data, n_channel, weight, bias, + save_mean, save_invstd, running_mean, running_var, train, eps); + } + + BFloat16* output_data = output.data_ptr(); + const BFloat16* input_data = input.data_ptr(); + + const int64_t loop_size = image_size - (image_size % bVec::size()); + at::parallel_for(0, n_batch * n_channel, 1, [&](int64_t begin, int64_t end) { + int64_t n = 0; + int64_t c = 0; + data_index_init(begin, n, n_batch, c, n_channel); + + for (const auto i : c10::irange(begin, end)) { + const BFloat16* input_ptr = input_data + i * image_size; + BFloat16* output_ptr = output_data + i * image_size; + const float alpha_val = alpha_data[c]; + const float beta_val = beta_data[c]; + const fVec alpha_fvec(alpha_val); + const fVec beta_fvec(beta_val); + int64_t d = 0; + for (; d < loop_size; d += bVec::size()) { + bVec data_bvec = bVec::loadu(input_ptr + d); + fVec data_fvec0, data_fvec1; + std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec); + + fVec out_fvec0 = data_fvec0 * alpha_fvec + beta_fvec; + fVec out_fvec1 = data_fvec1 * alpha_fvec + beta_fvec; + bVec out_bvec = convert_float_bfloat16(out_fvec0, out_fvec1); + out_bvec.store(output_ptr + d); + } + for (; d < image_size; d++) { + output_ptr[d] = BFloat16(float(input_ptr[d]) * alpha_val + beta_val); + } + // move on to next index + data_index_step(n, n_batch, c, n_channel); + } + }); +} + +template <> +void batch_norm_cpu_channels_last_impl(Tensor& output, const Tensor& input, + const Tensor& weight, const Tensor& bias, const Tensor& 
save_mean, const Tensor& save_invstd, + const Tensor& running_mean, const Tensor& running_var, bool train, double eps) { + + using bVec = Vectorized; + using fVec = Vectorized; + int64_t n_batch = input.size(0); + int64_t n_channel = input.size(1); + int64_t image_size = input.numel() / n_batch / n_channel; + + Tensor alpha = at::empty({n_channel}, input.options().dtype(kFloat)); + Tensor beta = at::empty({n_channel}, input.options().dtype(kFloat)); + float* alpha_data = alpha.data_ptr(); + float* beta_data = beta.data_ptr(); + + const bool mixed_type = is_mixed_type(input, weight, bias, save_mean, save_invstd, running_mean, running_var); + if (mixed_type) { + batch_norm_cpu_collect_linear_and_constant_terms( + alpha_data, beta_data, n_channel, weight, bias, + save_mean, save_invstd, running_mean, running_var, train, eps); + } else { + batch_norm_cpu_collect_linear_and_constant_terms( + alpha_data, beta_data, n_channel, weight, bias, + save_mean, save_invstd, running_mean, running_var, train, eps); + } + + BFloat16* output_data = output.data_ptr(); + const BFloat16* input_data = input.data_ptr(); + + const int64_t loop_size = n_channel - (n_channel % bVec::size()); + at::parallel_for(0, n_batch * image_size, 1, [&](int64_t begin, int64_t end) { + for (const auto i : c10::irange(begin, end)) { + const BFloat16* input_ptr = input_data + i * n_channel; + BFloat16* output_ptr = output_data + i * n_channel; + int64_t d = 0; + for (; d < loop_size; d += bVec::size()) { + fVec alpha_fvec0 = fVec::loadu(alpha_data + d); + fVec alpha_fvec1 = fVec::loadu(alpha_data + d + fVec::size()); + fVec beta_fvec0 = fVec::loadu(beta_data + d); + fVec beta_fvec1 = fVec::loadu(beta_data + d + fVec::size()); + bVec data_bvec = bVec::loadu(input_ptr + d); + fVec data_fvec0, data_fvec1; + std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec); + + fVec out_fvec0 = data_fvec0 * alpha_fvec0 + beta_fvec0; + fVec out_fvec1 = data_fvec1 * alpha_fvec1 + beta_fvec1; + bVec out_bvec = convert_float_bfloat16(out_fvec0, out_fvec1); + out_bvec.store(output_ptr + d); + } + for (; d < n_channel; d++) { + output_ptr[d] = BFloat16(float(input_ptr[d]) * alpha_data[d] + beta_data[d]); + } + } + }); +} + +template +inline void batch_norm_cpu_collect_stats_contiguous_internal( + Tensor& mean, Tensor& var_sum, const Tensor& input) { + + using bVec = Vectorized; + using fVec = Vectorized; + int64_t n_batch = input.size(0); + int64_t n_channel = input.size(1); + int64_t image_size = input.numel() / n_batch / n_channel; + int64_t N = input.numel() / n_channel; + + const BFloat16* input_data = input.data_ptr(); + param_t* mean_data = mean.data_ptr(); + param_t* var_sum_data = var_sum.data_ptr(); + + at::parallel_for(0, n_channel, 1, [&](int64_t begin, int64_t end) { + for (const auto c : c10::irange(begin, end)) { + float sum_val = float(0); + fVec sum_fvec = fVec(float(0)); + for (int64_t n = 0; n < n_batch; n++) { + const BFloat16* input_ptr = input_data + n * n_channel * image_size + c * image_size; + int64_t d = 0; + for (; d < image_size - (image_size % bVec::size()); d += bVec::size()) { + bVec data_bvec = bVec::loadu(input_ptr + d); + fVec data_fvec0, data_fvec1; + std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec); + sum_fvec += data_fvec0; + sum_fvec += data_fvec1; + } + for (; d < image_size; d++) { + sum_val += float(input_ptr[d]); + } + } + // TODO: use fast version + sum_val += vec_reduce_all([](fVec& x, fVec& y) { return x + y; }, sum_fvec, fVec::size()); + float mean_val = sum_val / N; + 
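Throughout these bfloat16 kernels the accumulator never stays in bfloat16: each Vectorized<BFloat16> load is widened into two Vectorized<float> lanes with convert_bfloat16_float, the running sums live in float, and a single horizontal reduce happens only at the end. A minimal scalar sketch of the same numerical idea outside ATen, with bfloat16 modeled as the upper 16 bits of a float32 and all names purely illustrative:

#include <cstdint>
#include <cstring>

// bfloat16 is the upper 16 bits of an IEEE-754 float32, so widening is a shift.
static float bf16_to_float(uint16_t x) {
  uint32_t bits = static_cast<uint32_t>(x) << 16;
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

// Two-pass mean / sum-of-squared-deviations over a bfloat16 buffer, with all
// accumulation done in float (the SIMD widening of the kernel above, minus SIMD).
static void bf16_mean_varsum(const uint16_t* data, int64_t n,
                             float& mean, float& var_sum) {
  float sum = 0.f;
  for (int64_t i = 0; i < n; ++i) sum += bf16_to_float(data[i]);
  mean = sum / static_cast<float>(n);
  var_sum = 0.f;
  for (int64_t i = 0; i < n; ++i) {
    float d = bf16_to_float(data[i]) - mean;
    var_sum += d * d;
  }
}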
mean_data[c] = param_t(mean_val); + + float var_val = float(0); + fVec var_fvec = fVec(float(0)); + fVec mean_fvec = fVec(mean_val); + for (int64_t n = 0; n < n_batch; n++) { + const BFloat16* input_ptr = input_data + n * n_channel * image_size + c * image_size; + int64_t d = 0; + for (; d < image_size - (image_size % bVec::size()); d += bVec::size()) { + bVec data_bvec = bVec::loadu(input_ptr + d); + fVec data_fvec0, data_fvec1; + std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec); + var_fvec += (data_fvec0 - mean_fvec) * (data_fvec0 - mean_fvec); + var_fvec += (data_fvec1 - mean_fvec) * (data_fvec1 - mean_fvec); + } + for (; d < image_size; d++) { + float data_val = input_ptr[d]; + var_val += (data_val - mean_val) * (data_val - mean_val); + } + } + // TODO: use fast version + var_val += vec_reduce_all([](fVec& x, fVec& y) { return x + y; }, var_fvec, fVec::size()); + var_sum_data[c] = param_t(var_val); + } + }); +} + +template <> +void batch_norm_cpu_collect_stats_contiguous_impl( + Tensor& mean, Tensor& var_sum, const Tensor& input) { + const bool mixed_type = is_mixed_type(input, mean, var_sum); + if (mixed_type) { + batch_norm_cpu_collect_stats_contiguous_internal(mean, var_sum, input); + } else { + batch_norm_cpu_collect_stats_contiguous_internal(mean, var_sum, input); + } +} + +static inline std::tuple, Vectorized> load2f(const BFloat16* ptr) { + return convert_bfloat16_float(Vectorized::loadu(ptr)); +} + +static inline std::tuple, Vectorized> load2f(const float* ptr) { + using Vec = Vectorized; + return std::make_tuple(Vec::loadu(ptr), Vec::loadu(ptr + Vec::size())); +} + +template +inline void batch_norm_cpu_collect_stats_channels_last_internal( + Tensor& mean, Tensor& var_sum, const Tensor& input) { + + using bVec = Vectorized; + using fVec = Vectorized; + int64_t n_channel = input.size(1); + int64_t N = input.numel() / n_channel; + + const BFloat16* input_data = input.data_ptr(); + param_t* mean_data = mean.data_ptr(); + param_t* var_sum_data = var_sum.data_ptr(); + + int num_threads = at::get_num_threads(); + Tensor buffer = at::empty({num_threads, n_channel}, input.options().dtype(kFloat)).zero_(); + float* buffer_data = buffer.data_ptr(); + + at::parallel_for(0, N, 1, [&](int64_t begin, int64_t end) { + int tid = at::get_thread_num(); + TORCH_CHECK(tid < num_threads, "expect thread id smaller than ", num_threads, ", got thread id ", tid); + float* buffer_ptr = buffer_data + tid * n_channel; + for (const auto i : c10::irange(begin, end)) { + const BFloat16* input_ptr = input_data + i * n_channel; + int64_t d = 0; + for (; d < n_channel - (n_channel % bVec::size()); d += bVec::size()) { + bVec data_bvec = bVec::loadu(input_ptr + d); + fVec data_fvec0, data_fvec1; + std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec); + fVec sum_fvec0 = fVec::loadu(buffer_ptr + d) + data_fvec0; + fVec sum_fvec1 = fVec::loadu(buffer_ptr + d + fVec::size()) + data_fvec1; + sum_fvec0.store(buffer_ptr + d); + sum_fvec1.store(buffer_ptr + d + fVec::size()); + } + for (; d < n_channel; d++) { + buffer_ptr[d] += input_ptr[d]; + } + } + }); + + for (const auto c : c10::irange(n_channel)) { + float sum = 0; + for (const auto t : c10::irange(num_threads)) { + sum += buffer_data[t * n_channel + c]; + } + mean_data[c] = param_t(sum / N); + } + + buffer.zero_(); + at::parallel_for(0, N, 1, [&](int64_t begin, int64_t end) { + int tid = at::get_thread_num(); + TORCH_CHECK(tid < num_threads, "expect thread id smaller than ", num_threads, ", got thread id ", tid); + float* 
buffer_ptr = buffer_data + tid * n_channel; + for (const auto i : c10::irange(begin, end)) { + const BFloat16* input_ptr = input_data + i * n_channel; + int64_t d = 0; + for (; d < n_channel - (n_channel % bVec::size()); d += bVec::size()) { + bVec data_bvec = bVec::loadu(input_ptr + d); + fVec data_fvec0, data_fvec1; + std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec); + fVec mean_fvec0, mean_fvec1; + std::tie(mean_fvec0, mean_fvec1) = load2f(mean_data + d); + fVec var_fvec0 = fVec::loadu(buffer_ptr + d); + fVec var_fvec1 = fVec::loadu(buffer_ptr + d + fVec::size()); + var_fvec0 += (data_fvec0 - mean_fvec0) * (data_fvec0 - mean_fvec0); + var_fvec1 += (data_fvec1 - mean_fvec1) * (data_fvec1 - mean_fvec1); + var_fvec0.store(buffer_ptr + d); + var_fvec1.store(buffer_ptr + d + fVec::size()); + } + for (; d < n_channel; d++) { + float data_val = float(input_ptr[d]); + float mean_val = float(mean_data[d]); + buffer_ptr[d] += (data_val - mean_val) * (data_val - mean_val); + } + } + }); + + for (const auto c : c10::irange(n_channel)) { + float _var_sum = 0; + for (const auto t : c10::irange(num_threads)) { + _var_sum += buffer_data[t * n_channel + c]; + } + var_sum_data[c] = param_t(_var_sum); + } +} + +template <> +void batch_norm_cpu_collect_stats_channels_last_impl( + Tensor& mean, Tensor& var_sum, const Tensor& input) { + const bool mixed_type = is_mixed_type(input, mean, var_sum); + if (mixed_type) { + batch_norm_cpu_collect_stats_channels_last_internal(mean, var_sum, input); + } else { + batch_norm_cpu_collect_stats_channels_last_internal(mean, var_sum, input); + } +} + +template +void batch_norm_cpu_backward_contiguous_internal(Tensor& grad_input, Tensor& grad_weight, Tensor& grad_bias, + const Tensor& grad_output, const Tensor& input, const Tensor& weight, + const Tensor& running_mean, const Tensor& running_var, const Tensor& save_mean, const Tensor& save_invstd, + bool train, double eps) { + + using bVec = Vectorized; + using fVec = Vectorized; + int64_t n_batch = input.size(0); + int64_t n_channel = input.size(1); + int64_t image_size = input.numel() / n_batch / n_channel; + int64_t N = input.numel() / n_channel; + + const BFloat16* grad_output_data = grad_output.data_ptr(); + const BFloat16* input_data = input.data_ptr(); + + BFloat16* grad_input_data = grad_input.defined() ? grad_input.data_ptr() : nullptr; + param_t* grad_weight_data = grad_weight.defined() ? grad_weight.data_ptr() : nullptr; + param_t* grad_bias_data = grad_bias.defined() ? grad_bias.data_ptr() : nullptr; + const bool grad_input_null = grad_input_data == nullptr; + const bool grad_weight_null = grad_weight_data == nullptr; + const bool grad_bias_null = grad_bias_data == nullptr; + + auto weight_a = conditional_accessor_1d(weight); + auto save_mean_a = conditional_accessor_1d(save_mean); + auto save_invstd_a = conditional_accessor_1d(save_invstd); + auto running_mean_a = conditional_accessor_1d(running_mean); + auto running_var_a = conditional_accessor_1d(running_var); + + // parallel dim reduce on 'channel' + at::parallel_for(0, n_channel, 1, [&](int64_t begin, int64_t end) { + for (const auto c : c10::irange(begin, end)) { + float w = weight.defined() ? float(weight_a[c]) : 1; + + float mean, invstd; + if (train) { + mean = save_mean_a[c]; + invstd = save_invstd_a[c]; + } else { + mean = running_mean_a[c]; + invstd = 1 / std::sqrt(running_var_a[c] + eps); + } + + // compute 1) sum; 2) dot product of Q(X) and dY. 
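The two per-channel reductions introduced in the comment above, sum = sum(dY) and dotp = sum((x - mean) * dY), are computed just below and are all the backward pass needs: grad_bias = sum, grad_weight = dotp * invstd, and in training mode grad_input folds both terms back in (in eval mode it collapses to dy * invstd * w). A scalar reference for a single channel, with illustrative names and plain float math:

#include <cstdint>
#include <vector>

// Scalar reference for one channel of the training-mode batch-norm backward:
//   sum  = sum(dy)                  -> grad_bias
//   dotp = sum((x - mean) * dy)     -> grad_weight = dotp * invstd
//   k    = dotp * invstd^2 / N
//   dx   = (dy - sum / N - (x - mean) * k) * invstd * w
// x and dy hold the N elements of one channel; w is the channel's weight
// (gamma), taken as 1 when no weight tensor is defined.
void batch_norm_backward_channel_ref(
    const std::vector<float>& x, const std::vector<float>& dy,
    float mean, float invstd, float w,
    std::vector<float>& dx, float& grad_weight, float& grad_bias) {
  const int64_t N = static_cast<int64_t>(x.size());
  float sum = 0.f, dotp = 0.f;
  for (int64_t i = 0; i < N; ++i) {
    sum += dy[i];
    dotp += (x[i] - mean) * dy[i];
  }
  grad_bias = sum;
  grad_weight = dotp * invstd;
  const float k = dotp * invstd * invstd / static_cast<float>(N);
  const float grad_mean = sum / static_cast<float>(N);
  dx.resize(x.size());
  for (int64_t i = 0; i < N; ++i) {
    dx[i] = (dy[i] - grad_mean - (x[i] - mean) * k) * invstd * w;
  }
}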
+ float sum{0}, dotp{0}; + fVec sum_fvec{0}, dotp_fvec{0}; + for (const auto n : c10::irange(n_batch)) { + const BFloat16* x_ptr = input_data + n * n_channel * image_size + c * image_size; + const BFloat16* dy_ptr = grad_output_data + n * n_channel * image_size + c * image_size; + + int64_t d = 0; + for (; d < image_size - (image_size % bVec::size()); d += bVec::size()) { + bVec dy_bvec = bVec::loadu(dy_ptr + d); + fVec dy_fvec0, dy_fvec1; + std::tie(dy_fvec0, dy_fvec1) = convert_bfloat16_float(dy_bvec); + sum_fvec += dy_fvec0; + sum_fvec += dy_fvec1; + + bVec x_bvec = bVec::loadu(x_ptr + d); + fVec x_fvec0, x_fvec1; + std::tie(x_fvec0, x_fvec1) = convert_bfloat16_float(x_bvec); + dotp_fvec += (x_fvec0 - fVec(mean)) * dy_fvec0; + dotp_fvec += (x_fvec1 - fVec(mean)) * dy_fvec1; + } + for (; d < image_size; d++) { + sum += float(dy_ptr[d]); + dotp += (float(x_ptr[d]) - mean) * float(dy_ptr[d]); + } + } + // TODO: use fast version + sum += vec_reduce_all([](fVec& x, fVec& y) { return x + y; }, sum_fvec, fVec::size()); + dotp += vec_reduce_all([](fVec& x, fVec& y) { return x + y; }, dotp_fvec, fVec::size()); + + if (!grad_input_null) { + if (train) { + float k = (float) dotp * invstd * invstd / N; + float grad_mean = sum / N; + for (const auto n : c10::irange(n_batch)) { + const BFloat16* x_ptr = input_data + n * n_channel * image_size + c * image_size; + BFloat16* dx_ptr = grad_input_data + n * n_channel * image_size + c * image_size; + const BFloat16* dy_ptr = grad_output_data + n * n_channel * image_size + c * image_size; + vec::map2( + [=](fVec x, fVec dy) { + fVec dx = (x - fVec(mean)) * fVec(k); + return (dy - fVec(grad_mean) - dx) * fVec(invstd) * fVec(w); + }, + dx_ptr, x_ptr, dy_ptr, image_size); + } + } else { // evaluation mode + for (const auto n : c10::irange(n_batch)) { + BFloat16* dx_ptr = grad_input_data + n * n_channel * image_size + c * image_size; + const BFloat16* dy_ptr = grad_output_data + n * n_channel * image_size + c * image_size; + vec::map( + [=](fVec dy) { return dy * fVec(invstd) * fVec(w); }, + dx_ptr, dy_ptr, image_size); + } + } + } + + if (!grad_weight_null) { + grad_weight_data[c] = param_t(dotp * invstd); + } + + if (!grad_bias_null) { + grad_bias_data[c] = param_t(sum); + } + } + }); +} + +template <> +void batch_norm_cpu_backward_contiguous_impl(Tensor& grad_input, Tensor& grad_weight, Tensor& grad_bias, + const Tensor& grad_output, const Tensor& input, const Tensor& weight, + const Tensor& running_mean, const Tensor& running_var, const Tensor& save_mean, const Tensor& save_invstd, + bool train, double eps) { + const bool mixed_type = is_mixed_type(input, weight, running_mean, running_var, save_mean, save_invstd); + if (mixed_type) { + batch_norm_cpu_backward_contiguous_internal(grad_input, grad_weight, grad_bias, + grad_output, input, weight, running_mean, running_var, save_mean, save_invstd, train, eps); + } else { + batch_norm_cpu_backward_contiguous_internal(grad_input, grad_weight, grad_bias, + grad_output, input, weight, running_mean, running_var, save_mean, save_invstd, train, eps); + } +} + +template +void batch_norm_cpu_backward_channels_last_internal(Tensor& grad_input, Tensor& grad_weight, Tensor& grad_bias, + const Tensor& grad_output, const Tensor& input, const Tensor& weight, + const Tensor& running_mean, const Tensor& running_var, const Tensor& save_mean, const Tensor& save_invstd, + bool train, double eps) { + + using bVec = Vectorized; + using fVec = Vectorized; + int64_t n_channel = input.size(1); + int64_t N = input.numel() / n_channel; + 
+ const BFloat16* grad_output_data = grad_output.data_ptr(); + const BFloat16* input_data = input.data_ptr(); + + BFloat16* grad_input_data = grad_input.defined() ? grad_input.data_ptr() : nullptr; + param_t* grad_weight_data = grad_weight.defined() ? grad_weight.data_ptr() : nullptr; + param_t* grad_bias_data = grad_bias.defined() ? grad_bias.data_ptr() : nullptr; + + auto weight_a = conditional_accessor_1d(weight); + auto save_mean_a = conditional_accessor_1d(save_mean); + auto save_invstd_a = conditional_accessor_1d(save_invstd); + auto running_mean_a = conditional_accessor_1d(running_mean); + auto running_var_a = conditional_accessor_1d(running_var); + + // use float as acc type + bool weight_defined = weight.defined(); + Tensor weight_f = at::empty({n_channel}, input.options().dtype(kFloat)); + Tensor mean = at::empty({n_channel}, input.options().dtype(kFloat)); + Tensor invstd = at::empty({n_channel}, input.options().dtype(kFloat)); + float* weight_data = weight_f.data_ptr(); + float* mean_data = mean.data_ptr(); + float* invstd_data = invstd.data_ptr(); + + for (const auto c : c10::irange(n_channel)) { + weight_data[c] = weight_defined ? float(weight_a[c]) : 1; + + if (train) { + mean_data[c] = save_mean_a[c]; + invstd_data[c] = save_invstd_a[c]; + } else { + mean_data[c] = running_mean_a[c]; + invstd_data[c] = 1 / std::sqrt(running_var_a[c] + eps); + } + } + + int num_threads = at::get_num_threads(); + Tensor buffer = at::empty({2, num_threads, n_channel}, input.options().dtype(kFloat)).zero_(); + float* sum_data = buffer.data_ptr(); + float* dotp_data = sum_data + num_threads * n_channel; + + at::parallel_for(0, N, 1, [&](int64_t begin, int64_t end) { + int tid = at::get_thread_num(); + TORCH_CHECK(tid < num_threads, "expect thread id smaller than ", num_threads, ", got thread id ", tid); + float* sum_ptr = sum_data + tid * n_channel; + float* dotp_ptr = dotp_data + tid * n_channel; + for (const auto i : c10::irange(begin, end)) { + const BFloat16* x_ptr = input_data + i * n_channel; + const BFloat16* dy_ptr = grad_output_data + i * n_channel; + + int64_t d = 0; + for(; d < n_channel - (n_channel % bVec::size()); d += bVec::size()) { + bVec dy_bvec = bVec::loadu(dy_ptr + d); + fVec dy_fvec0, dy_fvec1; + std::tie(dy_fvec0, dy_fvec1) = convert_bfloat16_float(dy_bvec); + fVec sum_fvec0 = dy_fvec0 + fVec::loadu(sum_ptr + d); + fVec sum_fvec1 = dy_fvec1 + fVec::loadu(sum_ptr + d + fVec::size()); + sum_fvec0.store(sum_ptr + d); + sum_fvec1.store(sum_ptr + d + fVec::size()); + + bVec x_bvec = bVec::loadu(x_ptr + d); + fVec x_fvec0, x_fvec1; + std::tie(x_fvec0, x_fvec1) = convert_bfloat16_float(x_bvec); + fVec mean_fvec0 = fVec::loadu(mean_data + d); + fVec mean_fvec1 = fVec::loadu(mean_data + d + fVec::size()); + fVec dotp_fvec0 = fVec::loadu(dotp_ptr + d); + fVec dotp_fvec1 = fVec::loadu(dotp_ptr + d + fVec::size()); + dotp_fvec0 += (x_fvec0 - mean_fvec0) * dy_fvec0; + dotp_fvec1 += (x_fvec1 - mean_fvec1) * dy_fvec1; + dotp_fvec0.store(dotp_ptr + d); + dotp_fvec1.store(dotp_ptr + d + fVec::size()); + } + for (; d < n_channel; d++) { + float dy_val = dy_ptr[d]; + float x_val = x_ptr[d]; + float mean_val = mean_data[d]; + sum_ptr[d] += dy_val; + dotp_ptr[d] += (x_val - mean_val) * dy_val; + } + } + }); + + at::parallel_for(0, n_channel, 1, [&](int64_t begin, int64_t end) { + for (const auto c : c10::irange(begin, end)) { + // store the final result of sum and dotp in the 1st lane of immediate buffer, + // so that we won't need to allocate anther buffer to store the temp values. 
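The reduction pattern used here, and again by the channels-last stats kernel and the weight-norm last-dim backward above, is: give every thread its own row of a {num_threads, C} scratch buffer, accumulate partial per-channel sums without atomics, then reduce vertically across threads and keep the totals in row 0. A standalone sketch of that pattern, with plain OpenMP standing in for at::parallel_for purely as an assumption of this sketch:

#include <cstdint>
#include <vector>
#include <omp.h>

// Each thread accumulates per-channel partial sums into its own row of a
// {num_threads, C} scratch buffer (no atomics needed), then the rows are
// reduced vertically and the totals kept in row 0.
std::vector<float> columnwise_sum(const float* x, int64_t rows, int64_t C) {
  const int T = omp_get_max_threads();
  std::vector<float> scratch(static_cast<size_t>(T) * C, 0.f);

  #pragma omp parallel for
  for (int64_t i = 0; i < rows; ++i) {
    float* partial = scratch.data() + static_cast<size_t>(omp_get_thread_num()) * C;
    const float* row = x + i * C;
    for (int64_t c = 0; c < C; ++c) {
      partial[c] += row[c];
    }
  }

  // Vertical reduction across threads; row 0 already holds thread 0's part.
  for (int64_t c = 0; c < C; ++c) {
    for (int t = 1; t < T; ++t) {
      scratch[c] += scratch[static_cast<size_t>(t) * C + c];
    }
  }
  return std::vector<float>(scratch.begin(), scratch.begin() + C);
}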
+ float _sum = 0; + for (const auto t : c10::irange(num_threads)) { + _sum += sum_data[t * n_channel + c]; + } + sum_data[/* 0 * n_channel + */c] = _sum; + + float _dotp = 0; + for (const auto t : c10::irange(num_threads)) { + _dotp += dotp_data[t * n_channel + c]; + } + dotp_data[/* 0 * n_channel + */c] = _dotp; + } + }); + + // compute grad_input + if (grad_input.defined()) { + at::parallel_for(0, N, 1, [&](int64_t begin, int64_t end) { + for (const auto i : c10::irange(begin, end)) { + BFloat16* dx_ptr = grad_input_data + i * n_channel; + const BFloat16* x_ptr = input_data + i * n_channel; + const BFloat16* dy_ptr = grad_output_data + i * n_channel; + if (train) { + int64_t d = 0; + for (; d < n_channel - (n_channel % bVec::size()); d += bVec::size()) { + bVec x_bvec = bVec::loadu(x_ptr + d); + fVec x_fvec0, x_fvec1; + std::tie(x_fvec0, x_fvec1) = convert_bfloat16_float(x_bvec); + fVec mean_fvec0 = fVec::loadu(mean_data + d); + fVec mean_fvec1 = fVec::loadu(mean_data + d + fVec::size()); + fVec dotp_fvec0 = fVec::loadu(dotp_data + d); + fVec dotp_fvec1 = fVec::loadu(dotp_data + d + fVec::size()); + fVec invstd_fvec0 = fVec::loadu(invstd_data + d); + fVec invstd_fvec1 = fVec::loadu(invstd_data + d + fVec::size()); + fVec k_fvec0 = dotp_fvec0 * invstd_fvec0 * invstd_fvec0 / fVec(N); + fVec k_fvec1 = dotp_fvec1 * invstd_fvec1 * invstd_fvec1 / fVec(N); + fVec dx_fvec0 = (x_fvec0 - mean_fvec0) * k_fvec0; + fVec dx_fvec1 = (x_fvec1 - mean_fvec1) * k_fvec1; + bVec dy_bvec = bVec::loadu(dy_ptr + d); + fVec dy_fvec0, dy_fvec1; + std::tie(dy_fvec0, dy_fvec1) = convert_bfloat16_float(dy_bvec); + fVec grad_mean_fvec0 = fVec::loadu(sum_data + d) / fVec(N); + fVec grad_mean_fvec1 = fVec::loadu(sum_data + d + fVec::size()) / fVec(N); + fVec w_fvec0 = fVec::loadu(weight_data + d); + fVec w_fvec1 = fVec::loadu(weight_data + d + fVec::size()); + dx_fvec0 = (dy_fvec0 - grad_mean_fvec0 - dx_fvec0) * invstd_fvec0 * w_fvec0; + dx_fvec1 = (dy_fvec1 - grad_mean_fvec1 - dx_fvec1) * invstd_fvec1 * w_fvec1; + bVec dx_bvec = convert_float_bfloat16(dx_fvec0, dx_fvec1); + dx_bvec.store(dx_ptr + d); + } + for (; d < n_channel; d++) { + float x_val = x_ptr[d]; + float mean_val = mean_data[d]; + float dotp_val = dotp_data[d]; + float invstd_val = invstd_data[d]; + float k_val = dotp_val * invstd_val * invstd_val / N; + float dx_val = (x_val - mean_val) * k_val; + float dy_val = dy_ptr[d]; + float grad_mean_val = sum_data[d] / N; + float w_val = weight_data[d]; + dx_val = (dy_val - grad_mean_val - dx_val) * invstd_val * w_val; + dx_ptr[d] = BFloat16(dx_val); + } + } else { // evaluation mode + int64_t d = 0; + for (; d < n_channel - (n_channel % bVec::size()); d += bVec::size()) { + bVec dy_bvec = bVec::loadu(dy_ptr + d); + fVec dy_fvec0, dy_fvec1; + std::tie(dy_fvec0, dy_fvec1) = convert_bfloat16_float(dy_bvec); + fVec invstd_fvec0 = fVec::loadu(invstd_data + d); + fVec invstd_fvec1 = fVec::loadu(invstd_data + d + fVec::size()); + fVec w_fvec0 = fVec::loadu(weight_data + d); + fVec w_fvec1 = fVec::loadu(weight_data + d + fVec::size()); + fVec dx_fvec0 = dy_fvec0 * invstd_fvec0 * w_fvec0; + fVec dx_fvec1 = dy_fvec1 * invstd_fvec1 * w_fvec1; + bVec dx_bvec = convert_float_bfloat16(dx_fvec0, dx_fvec1); + dx_bvec.store(dx_ptr + d); + } + for (; d < n_channel; d++) { + float dy_val = dy_ptr[d]; + float invstd_val = invstd_data[d]; + float w_val = weight_data[d]; + float dx_val = dy_val * invstd_val * w_val; + dx_ptr[d] = BFloat16(dx_val); + } + } + } + }); + } + + if (grad_weight.defined()) { + for (const auto c : 
c10::irange(n_channel)) { + grad_weight_data[c] = param_t(dotp_data[c] * invstd_data[c]); + } + } + + if (grad_bias.defined()) { + for (const auto c : c10::irange(n_channel)) { + grad_bias_data[c] = param_t(sum_data[c]); + } + } +} + +template <> +void batch_norm_cpu_backward_channels_last_impl(Tensor& grad_input, Tensor& grad_weight, Tensor& grad_bias, + const Tensor& grad_output, const Tensor& input, const Tensor& weight, + const Tensor& running_mean, const Tensor& running_var, const Tensor& save_mean, const Tensor& save_invstd, + bool train, double eps) { + const bool mixed_type = is_mixed_type(input, weight, running_mean, running_var, save_mean, save_invstd); + if (mixed_type) { + batch_norm_cpu_backward_channels_last_internal(grad_input, grad_weight, grad_bias, + grad_output, input, weight, running_mean, running_var, save_mean, save_invstd, train, eps); + } else { + batch_norm_cpu_backward_channels_last_internal(grad_input, grad_weight, grad_bias, + grad_output, input, weight, running_mean, running_var, save_mean, save_invstd, train, eps); + } +} + void batch_norm_cpu_kernel(Tensor& output, const Tensor& input, const Tensor& weight, const Tensor& bias, const Tensor& save_mean, const Tensor& save_invstd, const Tensor& running_mean, const Tensor& running_var, bool train, double eps) { - if (input.is_contiguous()) { - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_contiguous", [&] { - batch_norm_cpu_contiguous_impl(output, input, weight, bias, - save_mean, save_invstd, running_mean, running_var, train, eps); + int64_t image_size = input.numel() / input.size(0) / input.size(1); + if (input.is_contiguous()) { // NC11 is also channels last + AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, input.scalar_type(), "batch_norm_cpu_contiguous", [&] { + if (image_size == 1) { + batch_norm_cpu_channels_last_impl(output, input, weight, bias, + save_mean, save_invstd, running_mean, running_var, train, eps); + } else { + batch_norm_cpu_contiguous_impl(output, input, weight, bias, + save_mean, save_invstd, running_mean, running_var, train, eps); + } }); } else if (input.is_contiguous(at::MemoryFormat::ChannelsLast)) { - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_channels_last", [&] { + AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, input.scalar_type(), "batch_norm_cpu_channels_last", [&] { batch_norm_cpu_channels_last_impl(output, input, weight, bias, save_mean, save_invstd, running_mean, running_var, train, eps); }); @@ -631,7 +1258,7 @@ void batch_norm_cpu_collect_stats_kernel( Tensor& mean, Tensor& var_sum, const Tensor& input) { int64_t image_size = input.numel() / input.size(0) / input.size(1); if (input.is_contiguous()) { - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_collect_stats_contiguous", [&] { + AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, input.scalar_type(), "batch_norm_cpu_collect_stats_contiguous", [&] { if (image_size == 1) { // NC11 is also channels last batch_norm_cpu_collect_stats_channels_last_impl(mean, var_sum, input); } else { @@ -639,7 +1266,7 @@ void batch_norm_cpu_collect_stats_kernel( } }); } else if (input.is_contiguous(at::MemoryFormat::ChannelsLast)) { - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_collect_stats_channels_last", [&] { + AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, input.scalar_type(), "batch_norm_cpu_collect_stats_channels_last", [&] { batch_norm_cpu_collect_stats_channels_last_impl(mean, var_sum, input); }); } else { @@ -653,7 +1280,7 @@ void 
batch_norm_cpu_backward_kernel(Tensor& grad_input, Tensor& grad_weight, Ten bool train, double eps) { int64_t image_size = input.numel() / input.size(0) / input.size(1); if (input.is_contiguous()) { - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_backward_contiguous", [&] { + AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, input.scalar_type(), "batch_norm_cpu_backward_contiguous", [&] { if (image_size == 1) { // NC11 is also channels last batch_norm_cpu_backward_channels_last_impl(grad_input, grad_weight, grad_bias, grad_output, input, weight, running_mean, running_var, save_mean, save_invstd, train, eps); @@ -663,7 +1290,7 @@ void batch_norm_cpu_backward_kernel(Tensor& grad_input, Tensor& grad_weight, Ten } }); } else if (input.is_contiguous(at::MemoryFormat::ChannelsLast)) { - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_backward_channels_last", [&] { + AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, input.scalar_type(), "batch_norm_cpu_backward_channels_last", [&] { batch_norm_cpu_backward_channels_last_impl(grad_input, grad_weight, grad_bias, grad_output, input, weight, running_mean, running_var, save_mean, save_invstd, train, eps); }); diff --git a/aten/src/ATen/native/cpu/group_norm_kernel.cpp b/aten/src/ATen/native/cpu/group_norm_kernel.cpp index 6f98b58a3c0e..ff84f9b60784 100644 --- a/aten/src/ATen/native/cpu/group_norm_kernel.cpp +++ b/aten/src/ATen/native/cpu/group_norm_kernel.cpp @@ -1,16 +1,24 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include #include #include -#include -#include +#include #include #include +#include +#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + namespace at { namespace native { @@ -75,6 +83,78 @@ void GroupNormKernelImplInternal( }); } +template +std::tuple ColumnwiseMoments( + const T* X_data, + int64_t HxW, + int64_t C, + int64_t D) { + using Vec = vec::Vectorized; + constexpr int64_t K = Vec::size(); + const int64_t inner_size = D / K * K; + Vec acc0_vec{0}, acc1_vec{0}; + for (const auto m : c10::irange(HxW)) { + const T* X_ptr = X_data + m * C; + int64_t d = 0; + for (; d < inner_size; d += K) { + Vec x_vec = Vec::loadu(X_ptr + d); + acc0_vec += x_vec; + acc1_vec += x_vec * x_vec; + } + if (D - d > 0) { + Vec x_vec = Vec::loadu(X_ptr + d, D - d); + acc0_vec += x_vec; + acc1_vec += x_vec * x_vec; + } + } + // TODO: use fast path + T mean_val = vec::vec_reduce_all([](Vec& x, Vec& y) { return x + y; }, acc0_vec, Vec::size()); + T rstd_val = vec::vec_reduce_all([](Vec& x, Vec& y) { return x + y; }, acc1_vec, Vec::size()); + return std::tuple(mean_val, rstd_val); +} + +template +std::tuple ColumnwiseMoments( + const BFloat16* X_data, + int64_t HxW, + int64_t C, + int64_t D) { + using bVec = vec::Vectorized; + using fVec = vec::Vectorized; + constexpr int64_t K = bVec::size(); + const int64_t inner_size = D / K * K; + fVec acc0_fvec{0}, acc1_fvec{0}, zero{0}; + for (const auto m : c10::irange(HxW)) { + const BFloat16* X_ptr = X_data + m * C; + int64_t d = 0; + for (; d < inner_size; d += K) { + bVec x_bvec = bVec::loadu(X_ptr + d); + fVec x_fvec0, x_fvec1; + std::tie(x_fvec0, x_fvec1) = convert_bfloat16_float(x_bvec); + acc0_fvec += x_fvec0 + x_fvec1; + acc1_fvec += x_fvec0 * x_fvec0 + x_fvec1 * x_fvec1; + } + if (D - d > 0) { + bVec x_bvec = bVec::loadu(X_ptr + d, D - d); + fVec x_fvec0, x_fvec1; + std::tie(x_fvec0, x_fvec1) = convert_bfloat16_float(x_bvec); + if (D - d > fVec::size()) { + x_fvec1 = fVec::set(zero, x_fvec1, D - d - fVec::size()); + 
acc0_fvec += x_fvec0 + x_fvec1; + acc1_fvec += x_fvec0 * x_fvec0 + x_fvec1 * x_fvec1; + } else { + x_fvec0 = fVec::set(zero, x_fvec0, D - d); + acc0_fvec += x_fvec0; + acc1_fvec += x_fvec0 * x_fvec0; + } + } + } + // TODO: use fast path + float mean_val = vec::vec_reduce_all([](fVec& x, fVec& y) { return x + y; }, acc0_fvec, fVec::size()); + float rstd_val = vec::vec_reduce_all([](fVec& x, fVec& y) { return x + y; }, acc1_fvec, fVec::size()); + return std::tuple(mean_val, rstd_val); +} + template void GroupNormKernelImplChannelsLastInternal( const Tensor& X, @@ -99,110 +179,204 @@ void GroupNormKernelImplChannelsLastInternal( T* Y_data = Y.data_ptr(); T* mean_data = mean.data_ptr(); T* rstd_data = rstd.data_ptr(); - const T s = T(1) / static_cast(D * HxW); + + using T_ACC = vec::vec_scalar_t; + using Vec = vec::Vectorized; + + const T s = T_ACC(1) / static_cast(D * HxW); const bool gamma_null = (gamma_data == nullptr); const bool beta_null = beta_data == nullptr; - // temp buffer holding x and x2 - Tensor buffer = at::empty({N, 2 * C}, X.options()).zero_(); - T* buffer_data = buffer.data_ptr(); + // NB: About algorithm choosen: + // + // On channels last, GroupNorm has a input shape of {N, H, W, GD}, + // Mean and rstd are collected per each n and g, which involves reduction + // on non-adjacent dimensions. We can parallel in the following 2 impls: + // + // impl-1: parallel on N * G. Only need one omp session but memory access + // per thread is non-contiguous. + // + // impl-2: parallel on N * HxW. Memory access per thread is contiguous, + // but requires help of extra temp buffer of size {T, N, 2C}. + // + // Generally impl-2 has better performance when HxW is large enough, so that + // data per thread {NHWC / T} is much larger then temp buffer per thread {2NC} + // + constexpr int64_t feature_map_threshold = 1024; + if (HxW < feature_map_threshold) { + // impl-1: parallel on N * G. + // + // for each plain of HxW, scale and bias is calculated only once + Tensor buffer = at::empty({N * G, 2 * D}, X.options()); + T* buffer_data = buffer.data_ptr(); - using Vec = vec::Vectorized; - at::parallel_for(0, N, 1, [&](int64_t start, int64_t end) { - constexpr int64_t K = Vec::size(); - const int64_t inner_size = C / K * K; - for (const auto n : c10::irange(start, end)) { - T* mean_ptr = buffer_data + n * 2 * C; - T* rstd_ptr = mean_ptr + C; - for (const auto i : c10::irange(HxW)) { - const T* X_ptr = X_data + n * HxW * C + i * C; - for (int64_t j = 0; j < inner_size; j += K) { - const Vec x_vec = Vec::loadu(X_ptr + j); - Vec mean_vec = Vec::loadu(mean_ptr + j) + x_vec; - Vec rstd_vec = Vec::loadu(rstd_ptr + j) + x_vec * x_vec; - mean_vec.store(mean_ptr + j); - rstd_vec.store(rstd_ptr + j); + at::parallel_for(0, N * G, 1, [&](int64_t begin, int64_t end) { + int64_t n{0}, g{0}; + data_index_init(begin, n, N, g, G); + for (const auto i : c10::irange(begin, end)) { + // step-1: for each n and g, collect sum of x and x2 + // + // Note that using vec::map_reduce_all here is simpler to write + // but it is slower since horizontal reduce from vec to scalar is slow. + // So it is better to reduce with a vec across all HxW plain, + // and do a horizontal add just once for each {n, g}. 
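ColumnwiseMoments only returns the raw (sum, sum of squares) pair for one {n, g} block; the code right below turns that pair into mean and rstd. In isolation the conversion is the E[x^2] - E[x]^2 form of the variance, clamped at zero to guard against round-off, followed by 1 / sqrt(var + eps). A small sketch with illustrative names:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <utility>

// Turn the accumulated (sum(x), sum(x * x)) of one {n, g} block into the
// mean and rstd stored by the kernel: var = E[x^2] - E[x]^2, clamped at 0,
// then rstd = 1 / sqrt(var + eps).
std::pair<float, float> moments_to_mean_rstd(
    float sum, float sum_sq, int64_t count, float eps) {
  const float s = 1.f / static_cast<float>(count);
  const float mean = sum * s;
  const float var = std::max(sum_sq * s - mean * mean, 0.f);
  const float rstd = 1.f / std::sqrt(var + eps);
  return {mean, rstd};
}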
+ // + T_ACC mean_val, rstd_val; + std::tie(mean_val, rstd_val) = ColumnwiseMoments( + X_data + n * HxW * C + g * D, + HxW, + C, + D); + + mean_val *= s; + rstd_val = std::max(rstd_val * s - mean_val * mean_val, T_ACC(0)); + rstd_val = T_ACC(1) / std::sqrt(rstd_val + eps); + mean_data[i] = mean_val; + rstd_data[i] = rstd_val; + + // step-2: calculate scale and bias + T* scale_ptr = buffer_data + i * 2 * D; + T* bias_ptr = scale_ptr + D; + for (const auto d : c10::irange(D)) { + const int64_t c = g * D + d; + scale_ptr[d] = rstd_val * (gamma_null ? T(1) : gamma_data[c]); + bias_ptr[d] = -scale_ptr[d] * mean_val + (beta_null ? T(0) : beta_data[c]); } - for (const auto j : c10::irange(inner_size, C)) { - mean_ptr[j] += X_ptr[j]; - rstd_ptr[j] += X_ptr[j] * X_ptr[j]; + + // step-3: apply scale and bias + for (const auto m : c10::irange(HxW)) { + const T* X_ptr = X_data + n * HxW * C + m * C + g * D; + T* Y_ptr = Y_data + n * HxW * C + m * C + g * D; + vec::map3( + [](Vec x, Vec scale, Vec bias) { return x * scale + bias; }, + Y_ptr, + X_ptr, + scale_ptr, + bias_ptr, + D); } + + data_index_step(n, N, g, G); + } + }); + } else { + // impl-2: parallel on N * HxW. + // + // temp buffer holding x and x2 + int num_threads = at::get_num_threads(); + Tensor buffer = at::empty({num_threads, N, 2 * C}, X.options()).zero_(); + T* buffer_data = buffer.data_ptr(); + + // step-1: accumulate on dimension of C + // + // In order to improve multi-core performance when N=1, + // we parallel on the all the outer dimensions of N and HxW, + // leaving the most inner dimension C for vectorization. + // + // Note that parallel on {N, HxW, G} is not feasible for some common configs, + // e.g. say input shape is {1, 32, h, w} and G = 8, + // this will give D = 4 which is unable to take full SIMD length. + // + // To avoid thread conflict, we make use of a temp buffer of {T, N, 2C}, + // firstly, reduce from {N, HxW, C} to {T, N, 2C} + // + at::parallel_for(0, N * HxW, 1, [&](int64_t begin, int64_t end) { + int tid = at::get_thread_num(); + T* buffer_ptr = buffer_data + tid * N * 2 * C; + + int64_t n{0}, m{0}; + data_index_init(begin, n, N, m, HxW); + for (const auto i : c10::irange(begin, end)) { + T* mean_ptr = buffer_ptr + n * 2 * C; + T* rstd_ptr = mean_ptr + C; + const T* X_ptr = X_data + i * C; + + vec::map2( + [](Vec x, Vec y) { return x + y; }, + mean_ptr, + X_ptr, + mean_ptr, + C); + + vec::map2( + [](Vec x, Vec y) { return x * x + y; }, + rstd_ptr, + X_ptr, + rstd_ptr, + C); + + data_index_step(n, N, m, HxW); } + }); + // step-2: compute mean and rstd + for (const auto n : c10::irange(N)) { for (const auto g : c10::irange(G)) { - T mean_val = T(0); - T rstd_val = T(0); + T_ACC mean_val{0}, rstd_val{0}; for (const auto d : c10::irange(D)) { - mean_val += mean_ptr[g * D + d]; - rstd_val += rstd_ptr[g * D + d]; + for (const auto t : c10::irange(num_threads)) { + T* buffer_ptr = buffer_data + t * N * 2 * C + n * 2 * C; + mean_val += buffer_ptr[g * D + d]; + rstd_val += buffer_ptr[g * D + d + C]; + } } mean_val *= s; - rstd_val = std::max(rstd_val * s - mean_val * mean_val, T(0)); - rstd_val = T(1) / std::sqrt(rstd_val + eps); + rstd_val = std::max(rstd_val * s - mean_val * mean_val, T_ACC(0)); + rstd_val = T_ACC(1) / std::sqrt(rstd_val + eps); + mean_data[n * G + g] = T(mean_val); + rstd_data[n * G + g] = T(rstd_val); + } + } - // continue to use the temp buffer for mean and rstd value, - // so that we can vectorize the following math on entire C dimension. 
+ // step-3: compute scale and bias + // + // mean/rstd have shape of {N, G}, gamma/beta have shape of {G, D}. + // And scale/bias have shape of {N, C} so that we can directly vectorize on + // dimension of C in the final step. + // + // We could fuse step 3 and 4 into a single session but this way is better: + // a. D might be too small for vectorization; + // b. Avoid duplicate caculation of scale/bias, each HxW plain share the same scale/bias + // + for (const auto n : c10::irange(N)) { + for (const auto g : c10::irange(G)) { + T* scale_ptr = buffer_data + n * 2 * C; + T* bias_ptr = scale_ptr + C; + T mean_val = mean_data[n * G + g]; + T rstd_val = rstd_data[n * G + g]; for (const auto d : c10::irange(D)) { - mean_ptr[g * D + d] = mean_val; - rstd_ptr[g * D + d] = rstd_val; + const int64_t c = g * D + d; + scale_ptr[c] = rstd_val * (gamma_null ? T(1) : gamma_data[c]); + bias_ptr[c] = -scale_ptr[c] * mean_val + (beta_null ? T(0) : beta_data[c]); } - - mean_data[n * G + g] = mean_val; - rstd_data[n * G + g] = rstd_val; } + } - // expand gamma_null and beta_null to reduce if-else on critial path. - if (!gamma_null && !beta_null) { - for (const auto i : c10::irange(HxW)) { - const T* X_ptr = X_data + n * HxW * C + i * C; - T* Y_ptr = Y_data + n * HxW * C + i * C; - for (int64_t j = 0; j < inner_size; j += K) { - Vec scale_vec = Vec::loadu(rstd_ptr + j) * Vec::loadu(gamma_data + j); - Vec bias_vec = Vec::loadu(beta_data + j) - scale_vec * Vec::loadu(mean_ptr + j); - Vec y_vec = scale_vec * Vec::loadu(X_ptr + j) + bias_vec; - y_vec.store(Y_ptr + j); - } - for (const auto j : c10::irange(inner_size, C)) { - T scale = rstd_ptr[j] * gamma_data[j]; - T bias = -scale * mean_ptr[j] + beta_data[j]; - Y_ptr[j] = scale * X_ptr[j] + bias; - } - } - } else if (gamma_null && beta_null) { - for (const auto i : c10::irange(HxW)) { - const T* X_ptr = X_data + n * HxW * C + i * C; - T* Y_ptr = Y_data + n * HxW * C + i * C; - for (int64_t j = 0; j < inner_size; j += K) { - Vec scale_vec = Vec::loadu(rstd_ptr + j); - Vec y_vec = scale_vec * Vec::loadu(X_ptr + j) - scale_vec * Vec::loadu(mean_ptr + j); - y_vec.store(Y_ptr + j); - } - for (const auto j : c10::irange(inner_size, C)) { - T scale = rstd_ptr[j]; - Y_ptr[j] = scale * X_ptr[j] -scale * mean_ptr[j]; - } - } - } else { - for (const auto i : c10::irange(HxW)) { - const T* X_ptr = X_data + n * HxW * C + i * C; - T* Y_ptr = Y_data + n * HxW * C + i * C; - for (int64_t j = 0; j < inner_size; j += K) { - Vec gamma_vec = gamma_null ? Vec(1) : Vec::loadu(gamma_data + j); - Vec beta_vec = beta_null ? Vec(0) : Vec::loadu(beta_data + j); - Vec scale_vec = Vec::loadu(rstd_ptr + j) * gamma_vec; - Vec bias_vec = beta_vec - scale_vec * Vec::loadu(mean_ptr + j); - Vec y_vec = scale_vec * Vec::loadu(X_ptr + j) + bias_vec; - y_vec.store(Y_ptr + j); - } - for (const auto j : c10::irange(inner_size, C)) { - T scale = rstd_ptr[j] * (gamma_null ? T(1) : gamma_data[j]); - T bias = -scale * mean_ptr[j] + (beta_null ? T(0) : beta_data[j]); - Y_ptr[j] = scale * X_ptr[j] + bias; - } - } + // step-4: apply scale and bias + // + // Parallel on on the all the outer dimensions of N and HxW + // and vectorize on C. 
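Steps 3 and 4 boil down to folding per-group mean/rstd and per-channel gamma/beta into one per-channel affine pair, so the apply loop is a single fused multiply-add per element. A scalar sketch for one sample in channels-last layout, with illustrative names; the gamma == nullptr and beta == nullptr defaults of 1 and 0 handled by the kernel are omitted here:

#include <cstdint>
#include <vector>

// Fold per-group mean/rstd and per-channel gamma/beta into scale/bias (step-3),
// then apply y = x * scale + bias at every spatial position (step-4).
void group_norm_apply_ref(
    const float* x, float* y, int64_t HxW, int64_t C, int64_t D,
    const float* mean,    // per group, size G = C / D
    const float* rstd,    // per group, size G
    const float* gamma,   // per channel, size C
    const float* beta) {  // per channel, size C
  std::vector<float> scale(C), bias(C);
  for (int64_t c = 0; c < C; ++c) {
    const int64_t g = c / D;
    scale[c] = rstd[g] * gamma[c];
    bias[c] = -scale[c] * mean[g] + beta[c];
  }
  // Channels-last: element (m, c) lives at x[m * C + c].
  for (int64_t m = 0; m < HxW; ++m) {
    const float* x_row = x + m * C;
    float* y_row = y + m * C;
    for (int64_t c = 0; c < C; ++c) {
      y_row[c] = x_row[c] * scale[c] + bias[c];
    }
  }
}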
+ // + at::parallel_for(0, N * HxW, 1, [&](int64_t begin, int64_t end) { + int64_t n{0}, m{0}; + data_index_init(begin, n, N, m, HxW); + for (const auto i : c10::irange(begin, end)) { + const T* X_ptr = X_data + i * C; + T* Y_ptr = Y_data + i * C; + T* scale_ptr = buffer_data + n * 2 * C; + T* bias_ptr = scale_ptr + C; + vec::map3( + [](Vec x, Vec scale, Vec bias) { return x * scale + bias; }, + Y_ptr, + X_ptr, + scale_ptr, + bias_ptr, + C); + + data_index_step(n, N, m, HxW); } - } - }); + }); + } } void GroupNormKernelImpl( @@ -219,21 +393,22 @@ void GroupNormKernelImpl( Tensor& rstd) { switch (X.suggest_memory_format()) { case at::MemoryFormat::Contiguous: { - AT_DISPATCH_FLOATING_TYPES(X.scalar_type(), "GroupNormKernelImpl", [&]() { + AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, X.scalar_type(), "GroupNormKernelImpl", [&]() { GroupNormKernelImplInternal( X, gamma, beta, N, C, HxW, group, static_cast(eps), Y, mean, rstd); }); break; } - case at::MemoryFormat::ChannelsLast: { - AT_DISPATCH_FLOATING_TYPES(X.scalar_type(), "GroupNormKernelImpl", [&]() { + case at::MemoryFormat::ChannelsLast: + case at::MemoryFormat::ChannelsLast3d: { + AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, X.scalar_type(), "GroupNormKernelImpl", [&]() { GroupNormKernelImplChannelsLastInternal( X, gamma, beta, N, C, HxW, group, static_cast(eps), Y, mean, rstd); }); break; } default: - TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); + TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, ChannelsLast3d, Contiguous"); } } @@ -457,8 +632,8 @@ void GroupNormBackwardKernelImpl( Tensor& dX, Tensor& dgamma, Tensor& dbeta) { - AT_DISPATCH_FLOATING_TYPES( - X.scalar_type(), "GroupNormBackwardKernelImpl", [&]() { + AT_DISPATCH_FLOATING_TYPES_AND( + ScalarType::BFloat16, X.scalar_type(), "GroupNormBackwardKernelImpl", [&]() { GroupNormBackwardKernelImplInternal( dY, X, mean, rstd, gamma, N, C, HxW, group, dX, dgamma, dbeta); }); diff --git a/aten/src/ATen/native/cpu/layer_norm_kernel.cpp b/aten/src/ATen/native/cpu/layer_norm_kernel.cpp index 887b7a1dcdc9..e32b930bb592 100644 --- a/aten/src/ATen/native/cpu/layer_norm_kernel.cpp +++ b/aten/src/ATen/native/cpu/layer_norm_kernel.cpp @@ -1,77 +1,183 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include #include -#include -#include +#include #include #include #include #include +#include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + namespace at { namespace native { namespace { -template -void LayerNormKernelImplInternal( - const Tensor& X, - const Tensor& gamma, - const Tensor& beta, - int64_t M, - int64_t N, - T eps, - Tensor* Y, - Tensor* mean, - Tensor* rstd) { - using T_ACC = vec::vec_scalar_t; - using Vec = vec::Vectorized; - DCHECK_EQ(X.numel(), M * N); - DCHECK(!gamma.defined() || gamma.numel() == N); - DCHECK(!beta.defined() || beta.numel() == N); - const T* X_data = X.data_ptr(); - const T* gamma_data = gamma.defined() ? gamma.data_ptr() : nullptr; - const T* beta_data = beta.defined() ? 
beta.data_ptr() : nullptr; - T* Y_data = Y->data_ptr(); - T* mean_data = mean->data_ptr(); - T* rstd_data = rstd->data_ptr(); - const bool gamma_null = gamma_data == nullptr; - const bool beta_null = beta_data == nullptr; - at::parallel_for(0, M, 1, [&](int64_t start, int64_t end) { - for (const auto i : c10::irange(start, end)) { - const T* X_ptr = X_data + i * N; - T* Y_ptr = Y_data + i * N; - T mean_val; - T rstd_val; - std::tie(mean_val, rstd_val) = utils::RowwiseMoments(X_ptr, N); - rstd_val = T(1) / std::sqrt(rstd_val + eps); - const T_ACC scale = rstd_val; - const T_ACC bias = -rstd_val * mean_val; - if (gamma_null || beta_null) { - for (const auto j : c10::irange(N)) { - const T gamma_v = gamma_null ? T(1) : gamma_data[j]; - const T beta_v = beta_null ? T(0) : beta_data[j]; - Y_ptr[j] = (X_ptr[j] * scale + bias) * gamma_v + beta_v; +template +struct LayerNormKernelImplInternal { + constexpr static void apply( + const Tensor& X, + const Tensor& gamma, + const Tensor& beta, + int64_t M, + int64_t N, + T eps, + Tensor* Y, + Tensor* mean, + Tensor* rstd) { + using Vec = vec::Vectorized; + const T* X_data = X.data_ptr(); + const T* gamma_data = gamma.defined() ? gamma.data_ptr() : nullptr; + const T* beta_data = beta.defined() ? beta.data_ptr() : nullptr; + T* Y_data = Y->data_ptr(); + T* mean_data = mean ? mean->data_ptr() : nullptr; + T* rstd_data = rstd ? rstd->data_ptr() : nullptr; + const bool gamma_null = gamma_data == nullptr; + const bool beta_null = beta_data == nullptr; + const bool mean_null = mean_data == nullptr; + const bool rstd_null = rstd_data == nullptr; + at::parallel_for(0, M, 1, [&](int64_t start, int64_t end) { + for (const auto i : c10::irange(start, end)) { + const T* X_ptr = X_data + i * N; + T* Y_ptr = Y_data + i * N; + T mean_val; + T rstd_val; + std::tie(mean_val, rstd_val) = utils::RowwiseMoments(X_ptr, N); + rstd_val = T(1) / std::sqrt(rstd_val + eps); + const T scale = rstd_val; + const T bias = -rstd_val * mean_val; + if (gamma_null || beta_null) { + for (const auto j : c10::irange(N)) { + const T gamma_v = gamma_null ? T(1) : gamma_data[j]; + const T beta_v = beta_null ? T(0) : beta_data[j]; + Y_ptr[j] = (X_ptr[j] * scale + bias) * gamma_v + beta_v; + } + } else { + vec::map3( + [scale, bias](Vec x, Vec gamma, Vec beta) { + return (x * Vec(scale) + Vec(bias)) * gamma + beta; + }, + Y_ptr, + X_ptr, + gamma_data, + beta_data, + N); + } + if (!mean_null) { + mean_data[i] = mean_val; + } + if (!rstd_null) { + rstd_data[i] = rstd_val; } - } else { - vec::map3( - [scale, bias](Vec x, Vec gamma, Vec beta) { - return (x * Vec(scale) + Vec(bias)) * gamma + beta; - }, - Y_ptr, - X_ptr, - gamma_data, - beta_data, - N); } - mean_data[i] = mean_val; - rstd_data[i] = rstd_val; - } - }); -} + }); + } +}; + +template +struct LayerNormKernelImplInternal { + constexpr static void apply( + const Tensor& X, + const Tensor& gamma, + const Tensor& beta, + int64_t M, + int64_t N, + BFloat16 eps, + Tensor* Y, + Tensor* mean, + Tensor* rstd) { + using bVec = vec::Vectorized; + using fVec = vec::Vectorized; + const BFloat16* X_data = X.data_ptr(); + const T_ACC* gamma_data = gamma.defined() ? gamma.data_ptr() : nullptr; + const T_ACC* beta_data = beta.defined() ? beta.data_ptr() : nullptr; + BFloat16* Y_data = Y->data_ptr(); + T_ACC* mean_data = mean ? mean->data_ptr() : nullptr; + T_ACC* rstd_data = rstd ? 
rstd->data_ptr() : nullptr; + const bool gamma_null = gamma_data == nullptr; + const bool beta_null = beta_data == nullptr; + const bool mean_null = mean_data == nullptr; + const bool rstd_null = rstd_data == nullptr; + + // pre convert `gamma` and `beta` to float when they are both defined + const bool pre_convert_gamma_beta = !gamma_null && !beta_null; + + at::parallel_for(0, M, 1, [&](int64_t start, int64_t end) { + // temp buffer holding input, gamma/beta (if defined) in float + // + // pre convert input slice to float has 2 benefits: + // a. Welford algorithm involves more arithmetic operations, + // this will reduce rounding error and improve performance. + // b. The input slice (float) can be reused when updating + // corresponding output slice. + // + int64_t buffer_size = pre_convert_gamma_beta ? 3 * N : N; + std::unique_ptr buffer(new float[buffer_size]); + float* input_buffer_ptr = buffer.get(); + float* gamma_buffer_ptr = nullptr; + float* beta_buffer_ptr = nullptr; + if (pre_convert_gamma_beta) { + gamma_buffer_ptr = buffer.get() + N; + beta_buffer_ptr = buffer.get() + 2 * N; + vec::convert(gamma_data, gamma_buffer_ptr, N); + vec::convert(beta_data, beta_buffer_ptr, N); + } + + for (const auto i : c10::irange(start, end)) { + const BFloat16* X_ptr = X_data + i * N; + BFloat16* Y_ptr = Y_data + i * N; + vec::convert(X_ptr, input_buffer_ptr, N); + + float mean_val; + float rstd_val; + std::tie(mean_val, rstd_val) = utils::RowwiseMoments(input_buffer_ptr, N); + rstd_val = float(1) / std::sqrt(rstd_val + eps); + const float scale = rstd_val; + const float bias = -rstd_val * mean_val; + if (gamma_null || beta_null) { + for (const auto j : c10::irange(N)) { + const float gamma_v = gamma_null ? float(1) : float(gamma_data[j]); + const float beta_v = beta_null ? 
float(0) : float(beta_data[j]); + Y_ptr[j] = (input_buffer_ptr[j] * scale + bias) * gamma_v + beta_v; + } + } else { + int64_t d = 0; + for (; d < N - (N % bVec::size()); d += bVec::size()) { + fVec x_fvec0 = fVec::loadu(input_buffer_ptr + d); + fVec x_fvec1 = fVec::loadu(input_buffer_ptr + d + fVec::size()); + fVec gamma_fvec0 = fVec::loadu(gamma_buffer_ptr + d); + fVec gamma_fvec1 = fVec::loadu(gamma_buffer_ptr + d + fVec::size()); + fVec beta_fvec0 = fVec::loadu(beta_buffer_ptr + d); + fVec beta_fvec1 = fVec::loadu(beta_buffer_ptr + d + fVec::size()); + fVec y_fvec0 = (x_fvec0 * fVec(scale) + fVec(bias)) * gamma_fvec0 + beta_fvec0; + fVec y_fvec1 = (x_fvec1 * fVec(scale) + fVec(bias)) * gamma_fvec1 + beta_fvec1; + bVec y_bvec = convert_float_bfloat16(y_fvec0, y_fvec1); + y_bvec.store(Y_ptr + d); + } + for (; d < N; d++) { + Y_ptr[d] = (input_buffer_ptr[d] * scale + bias) * gamma_data[d] + beta_data[d]; + } + } + if (!mean_null) { + mean_data[i] = T_ACC(mean_val); + } + if (!rstd_null) { + rstd_data[i] = T_ACC(rstd_val); + } + } + }); + } +}; void LayerNormKernelImpl( const Tensor& X, @@ -83,10 +189,21 @@ void LayerNormKernelImpl( Tensor* Y, Tensor* mean, Tensor* rstd) { + DCHECK_EQ(X.numel(), M * N); + DCHECK(!gamma.defined() || gamma.numel() == N); + DCHECK(!beta.defined() || beta.numel() == N); AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, X.scalar_type(), "LayerNormKernelImpl", [&]() { - LayerNormKernelImplInternal( - X, gamma, beta, M, N, static_cast(eps), Y, mean, rstd); + using accscalar_t = at::acc_type; + const bool mixed_type = is_mixed_type(X, gamma, beta); + if (mixed_type) { + check_mixed_data_type(X, gamma, beta); + LayerNormKernelImplInternal::apply( + X, gamma, beta, M, N, static_cast(eps), Y, mean, rstd); + } else { + LayerNormKernelImplInternal::apply( + X, gamma, beta, M, N, static_cast(eps), Y, mean, rstd); + } }); } diff --git a/aten/src/ATen/native/cpu/mixed_data_type.h b/aten/src/ATen/native/cpu/mixed_data_type.h new file mode 100644 index 000000000000..6964dd5fa71d --- /dev/null +++ b/aten/src/ATen/native/cpu/mixed_data_type.h @@ -0,0 +1,41 @@ +#pragma once + +#include + +namespace at { namespace native { + +inline ScalarType first_type() { + return ScalarType::Undefined; +} + +template +inline ScalarType first_type(const Tensor& arg, const Args&... parameters) { + return arg.defined() ? arg.scalar_type() : first_type(parameters...); +} + +template +inline bool is_mixed_type(const Tensor& input, const Args&... parameters) { + const auto parameter_type = first_type(parameters...); + return ((parameter_type != ScalarType::Undefined) && + (parameter_type != input.scalar_type())); +} + +// currently on CPU, mixed data type is only supported +// when input is 'BFloat16' and parameters are 'Float' +inline void check_mixed_data_type(const Tensor& input) { + TORCH_CHECK(input.scalar_type() == ScalarType::BFloat16, + "mixed dtype (CPU): expect input to have scalar type of BFloat16"); +} + +template +inline void check_mixed_data_type(const Tensor& input, const Tensor& parameter, const Args&... parameters) { + TORCH_CHECK(!parameter.defined() || parameter.scalar_type() == ScalarType::Float, + "mixed dtype (CPU): expect parameter to have scalar type of Float"); + check_mixed_data_type(input, parameters...); +} + +inline ScalarType param_scalar_type(const Tensor& t, bool is_mixed_type) { + return is_mixed_type ? 
ScalarType::Float : t.scalar_type(); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/cpu/utils.h b/aten/src/ATen/native/cpu/utils.h index 136479e2a0d4..5c607f06b3a5 100644 --- a/aten/src/ATen/native/cpu/utils.h +++ b/aten/src/ATen/native/cpu/utils.h @@ -37,6 +37,30 @@ inline bool data_index_step(T& x, const T& X, Args&&... args) { return false; } +// Helper struct for bfloat16 vectorization +// Useful when you need float as immediate dtype or accumulate dtype +using namespace vec; +struct Vec2 { + Vectorized val0, val1; + Vec2(Vectorized v0, Vectorized v1) : val0(v0), val1(v1) {} + Vec2(float v) : val0(v), val1(v) {} + static Vec2 loadu(const BFloat16* ptr) { + Vectorized v0, v1; + std::tie(v0, v1) = convert_bfloat16_float(Vectorized::loadu(ptr)); + return {v0, v1}; + } + void store(BFloat16* ptr) const { + Vectorized val = convert_float_bfloat16(val0, val1); + val.store(ptr); + } +}; +inline Vec2 operator+(const Vec2& a, const Vec2& b) { return {a.val0 + b.val0, a.val1 + b.val1}; } +inline Vec2 operator*(const Vec2& a, const Vec2& b) { return {a.val0 * b.val0, a.val1 * b.val1}; } + +template struct VectorizedType { using type = Vectorized; }; +template <> struct VectorizedType { using type = Vec2; }; +template using VecType = typename VectorizedType::type; + } // namespace namespace utils { diff --git a/aten/src/ATen/native/cpu/zmath.h b/aten/src/ATen/native/cpu/zmath.h index 0017b6a16d81..3f3971e6e76e 100644 --- a/aten/src/ATen/native/cpu/zmath.h +++ b/aten/src/ATen/native/cpu/zmath.h @@ -94,7 +94,7 @@ constexpr double real_impl , double> (c10::complex } template -constexpr VALUE_TYPE imag_impl (SCALAR_TYPE z) { +constexpr VALUE_TYPE imag_impl (SCALAR_TYPE /*z*/) { return 0; } @@ -123,6 +123,11 @@ inline TYPE conj_impl (TYPE z) { return z; //No-Op } +template<> +inline c10::complex conj_impl > (c10::complex z) { + return c10::complex{z.real(), -z.imag()}; +} + template<> inline c10::complex conj_impl > (c10::complex z) { return c10::complex(z.real(), -z.imag()); diff --git a/aten/src/ATen/native/cuda/AbsKernel.cu b/aten/src/ATen/native/cuda/AbsKernel.cu index 3bfc2621d930..19b12354cc48 100644 --- a/aten/src/ATen/native/cuda/AbsKernel.cu +++ b/aten/src/ATen/native/cuda/AbsKernel.cu @@ -1,6 +1,7 @@ #define TORCH_ASSERT_NO_OPERATORS #include #include +#include #include #include #include @@ -14,12 +15,37 @@ struct AbsFunctor { } }; +const char abs_name[] = "abs_kernel"; void abs_kernel_cuda(TensorIteratorBase& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool, iter.dtype(), "abs_cuda", [&]() { - gpu_kernel(iter, AbsFunctor()); - }); + auto dtype = iter.dtype(); + if (at::isComplexType(dtype)) { +#if AT_USE_JITERATOR() + static const auto abs_string = jiterator_stringify( + template T abs_kernel(T x) { return std::abs(x); }); + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "abs_cuda", [&]() { + jitted_gpu_kernel< + /*name=*/abs_name, + /*return_dtype=*/scalar_t, + /*common_dtype=*/scalar_t, + /*arity=*/1>(iter, abs_string); + }); +#else + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "abs_cuda", [&]() { + using opmath_t = at::opmath_type; + gpu_kernel(iter, AbsFunctor()); + }); +#endif + } else { + AT_DISPATCH_ALL_TYPES_AND3( + ScalarType::Half, + ScalarType::BFloat16, + ScalarType::Bool, + iter.dtype(), + "abs_cuda", + [&]() { gpu_kernel(iter, AbsFunctor()); }); + } } -REGISTER_DISPATCH(abs_stub, &abs_kernel_cuda); + REGISTER_DISPATCH(abs_stub, &abs_kernel_cuda); }} // namespace at::native diff --git 
a/aten/src/ATen/native/cuda/Activation.cpp b/aten/src/ATen/native/cuda/Activation.cpp index 2dfe0a862ea4..55b397ca77f4 100644 --- a/aten/src/ATen/native/cuda/Activation.cpp +++ b/aten/src/ATen/native/cuda/Activation.cpp @@ -1,9 +1,27 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include +#include +#include +#include +#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#endif + namespace at { namespace native { // ----------------------------------- @@ -156,15 +174,15 @@ std::tuple prelu_backward_cuda(const Tensor& grad_out_, const Te } TORCH_IMPL_FUNC(gelu_out_cuda) ( - const Tensor& /*self*/, const Tensor& /*result*/ - ) { - GeluCUDAKernelImpl(*this); + const Tensor& /*self*/, c10::string_view approximate, const Tensor& /*result*/ +) { + GeluCUDAKernelImpl(*this, get_gelutype_enum(approximate)); } TORCH_IMPL_FUNC(gelu_backward_out_cuda) ( - const Tensor& /*grad*/, const Tensor& /*self*/, const Tensor& /*grad_input*/ - ) { - GeluBackwardCUDAKernelImpl(*this); + const Tensor& /*grad*/, const Tensor& /*self*/, c10::string_view approximate, const Tensor& /*grad_input*/ +) { + GeluBackwardCUDAKernelImpl(*this, get_gelutype_enum(approximate)); } }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/Activation.cu b/aten/src/ATen/native/cuda/Activation.cu index 168e142dd291..4f8e9b005552 100644 --- a/aten/src/ATen/native/cuda/Activation.cu +++ b/aten/src/ATen/native/cuda/Activation.cu @@ -35,6 +35,31 @@ void glu_kernel(TensorIteratorBase& iter) { }); } +// ----------------------------------- +// glu forward ad +// ----------------------------------- +void glu_jvp_kernel(TensorIteratorBase& iter) { + AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "glu_cuda", [&]() { + using acc_t = at::acc_type; + gpu_kernel(iter, [] GPU_LAMBDA ( + scalar_t res_, + scalar_t b_, + scalar_t da_, + scalar_t db_) -> scalar_t { + const acc_t res = res_; + const acc_t b = b_; + const acc_t da = da_; + const acc_t db = db_; + const acc_t one = acc_t(1); + + const acc_t sig_b = one / (one + std::exp(-b)); + return ( + da * sig_b + res * (db - sig_b * db) + ); + }); + }); +} + // ----------------------------------- // glu backward // ----------------------------------- @@ -107,11 +132,12 @@ void launch_glu_backward_kernel(const TensorIteratorBase& iter, void launch_log_sigmoid_forward_kernel(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_TYPES_AND(kHalf, iter.common_dtype(), "log_sigmoid_forward_cuda", [&] { - using acc_t = acc_type; + using opmath_t = at::opmath_type; + gpu_kernel(iter, [] GPU_LAMBDA (scalar_t in_) -> scalar_t { - const acc_t in = in_; - const auto min = std::min(acc_t(0), in); + const opmath_t in = in_; + const auto min = std::min(opmath_t(0), in); const auto z = std::exp(-std::abs(in)); return min - std::log1p(z); }); @@ -125,17 +151,17 @@ void launch_log_sigmoid_forward_kernel(TensorIteratorBase& iter) { void log_sigmoid_backward_kernel(TensorIterator& iter) { AT_DISPATCH_FLOATING_TYPES_AND(kHalf, iter.common_dtype(), "log_sigmoid_backward_cuda", [&] { - using acc_t = acc_type; + using opmath_t = at::opmath_type; gpu_kernel(iter, [] GPU_LAMBDA (scalar_t in_, scalar_t grad_out_) -> scalar_t { - const acc_t in = in_; - const acc_t grad_out = grad_out_; + const opmath_t in = in_; + const opmath_t grad_out = grad_out_; - auto in_negative = in < acc_t(0); - auto max_deriv = in_negative ? acc_t(1) : acc_t(0); - auto sign = in_negative ? 
acc_t(1) : -acc_t(1); + auto in_negative = in < opmath_t(0); + auto max_deriv = in_negative ? opmath_t(1) : opmath_t(0); + auto sign = in_negative ? opmath_t(1) : -opmath_t(1); const auto z = std::exp(-std::abs(in)); - return grad_out * (max_deriv - sign * (z / (acc_t(1) + z))); + return grad_out * (max_deriv - sign * (z / (opmath_t(1) + z))); }); }); } @@ -368,54 +394,101 @@ static void threshold_kernel_cuda(TensorIteratorBase& iter, const Scalar& thresh void elu_kernel(TensorIteratorBase& iter, const Scalar& alpha, const Scalar& scale, const Scalar& input_scale) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "elu_cuda", [&]() { - auto negcoef = alpha.to() * scale.to(); - auto poscoef = scale.to(); - auto negiptcoef = input_scale.to(); + using opmath_t = at::opmath_type; + auto negcoef = alpha.to() * scale.to(); + auto poscoef = scale.to(); + auto negiptcoef = input_scale.to(); gpu_kernel(iter, [negcoef, poscoef, negiptcoef]GPU_LAMBDA(scalar_t a) -> scalar_t { - return a > scalar_t(0) ? a * poscoef : (static_cast(std::exp(a * negiptcoef)) - scalar_t(1.)) * negcoef; + opmath_t aop = static_cast(a); + return aop > 0 ? aop * poscoef : std::expm1(aop * negiptcoef) * negcoef; }); }); } void elu_backward_kernel(TensorIteratorBase& iter, const Scalar& alpha, const Scalar& scale, const Scalar& input_scale, bool is_result) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "elu_backward_cuda", [&]() { - auto negcoef = alpha.to() * scale.to(); - auto poscoef = scale.to(); - auto negiptcoef = input_scale.to(); + using opmath_t = at::opmath_type; + auto negcoef = alpha.to() * scale.to(); + auto poscoef = scale.to(); + auto negiptcoef = input_scale.to(); gpu_kernel(iter, [negcoef, poscoef, negiptcoef, is_result]GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { + opmath_t aop = static_cast(a); + opmath_t bop = static_cast(b); + if (is_result) { - return b <= scalar_t(0) ? a * negiptcoef * (b + negcoef) : a * poscoef; + return bop <= 0 ? aop * negiptcoef * (bop + negcoef) : aop * poscoef; } else { - return b <= scalar_t(0) ? a * negiptcoef * negcoef * (static_cast(std::exp(b * negiptcoef))) : a * poscoef; + return bop <= 0 ? 
aop * negiptcoef * negcoef * std::exp(bop * negiptcoef) : aop * poscoef; } }); }); } -void GeluCUDAKernelImpl(TensorIteratorBase& it) { - AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, it.dtype(), "GeluCUDAKernelImpl", [&]() { - using T_ACC = acc_type; - gpu_kernel(it, [] GPU_LAMBDA(scalar_t x) -> scalar_t { - return static_cast(x) * - c10::cuda::compat::normcdf(static_cast(x)); +void GeluCUDAKernelImpl(TensorIteratorBase& it, GeluType approximate) { + if (approximate == GeluType::Tanh) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, it.dtype(), "GeluCUDAKernelImpl", [&]() { + gpu_kernel(it, [] GPU_LAMBDA(scalar_t x) -> scalar_t { + using opmath_t = at::opmath_type; + constexpr opmath_t kBeta = M_SQRT2 * M_2_SQRTPI * opmath_t(0.5); + constexpr opmath_t kKappa = 0.044715; + auto x_cube = static_cast(x) * static_cast(x) * static_cast(x); + auto inner = kBeta * (static_cast(x) + kKappa * x_cube); + return opmath_t(0.5) * static_cast(x) * (opmath_t(1) + c10::cuda::compat::tanh(inner)); + }); }); - }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, it.dtype(), "GeluCUDAKernelImpl", [&]() { + gpu_kernel(it, [] GPU_LAMBDA(scalar_t x) -> scalar_t { + using opmath_t = at::opmath_type; + constexpr opmath_t kAlpha = M_SQRT1_2; + return static_cast(x) * opmath_t(0.5) * (opmath_t(1) + ::erf(static_cast(x) * kAlpha)); + }); + }); + } } -void GeluBackwardCUDAKernelImpl(TensorIteratorBase& it) { - AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, - it.dtype(), "GeluBackwardCUDAKernelImpl", [&]() { - using T_ACC = acc_type; - gpu_kernel(it, [] GPU_LAMBDA(scalar_t dy, scalar_t x) -> scalar_t { - constexpr T_ACC kBeta = M_2_SQRTPI * M_SQRT1_2 * T_ACC(0.5); - const T_ACC cdf = c10::cuda::compat::normcdf(static_cast(x)); - const T_ACC pdf = - c10::cuda::compat::exp( - T_ACC(-0.5) * static_cast(x) * static_cast(x)) * - kBeta; - return static_cast(dy) * (cdf + static_cast(x) * pdf); +void GeluBackwardCUDAKernelImpl(TensorIteratorBase& it, GeluType approximate) { + if (approximate == GeluType::Tanh) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, + it.dtype(), "GeluBackwardCUDAKernelImpl", [&]() { + gpu_kernel(it, [] GPU_LAMBDA(scalar_t dy, scalar_t x) -> scalar_t { + using opmath_t = at::opmath_type; + constexpr opmath_t kBeta = M_SQRT2 * M_2_SQRTPI * opmath_t(0.5); + constexpr opmath_t kKappa = 0.044715; + auto x_sq = static_cast(x) * static_cast(x); + auto x_cube = x_sq * static_cast(x); + auto inner = kBeta * (static_cast(x) + kKappa * x_cube); + auto tanh_inner = c10::cuda::compat::tanh(inner); + + auto left = opmath_t(0.5) * static_cast(x); + auto right = opmath_t(1) + tanh_inner; + + auto left_derivative = 0.5 * right; + + auto tanh_derivative = opmath_t(1) - tanh_inner * tanh_inner; + auto inner_derivative = kBeta * (opmath_t(1) + opmath_t(3) * kKappa * x_sq); + auto right_derivative = left * tanh_derivative * inner_derivative; + + return static_cast(dy) * (left_derivative + right_derivative); }); }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, + it.dtype(), "GeluBackwardCUDAKernelImpl", [&]() { + gpu_kernel(it, [] GPU_LAMBDA(scalar_t dy, scalar_t x) -> scalar_t { + using opmath_t = at::opmath_type; + constexpr opmath_t kBeta = M_2_SQRTPI * M_SQRT1_2 * opmath_t(0.5); + constexpr opmath_t kAlpha = M_SQRT1_2; + const opmath_t cdf = + opmath_t(0.5) * (opmath_t(1) + 
::erf(static_cast(x) * kAlpha)); + const opmath_t pdf = + c10::cuda::compat::exp( + opmath_t(-0.5) * static_cast(x) * static_cast(x)) * + kBeta; + return static_cast(dy) * (cdf + static_cast(x) * pdf); + }); + }); + } } namespace { @@ -594,6 +667,7 @@ REGISTER_DISPATCH(shrink_backward_stub, &shrink_backward_kernel); REGISTER_DISPATCH(elu_stub, &elu_kernel); REGISTER_DISPATCH(elu_backward_stub, &elu_backward_kernel); REGISTER_DISPATCH(glu_stub, &glu_kernel); +REGISTER_DISPATCH(glu_jvp_stub, &glu_jvp_kernel); REGISTER_DISPATCH(leaky_relu_stub, &leaky_relu_kernel); REGISTER_DISPATCH(leaky_relu_backward_stub, &leaky_relu_backward_kernel); REGISTER_DISPATCH(hardswish_stub, &hardswish_kernel); diff --git a/aten/src/ATen/native/cuda/Activation.h b/aten/src/ATen/native/cuda/Activation.h index 5e798316c9bc..5fc52ff257ce 100644 --- a/aten/src/ATen/native/cuda/Activation.h +++ b/aten/src/ATen/native/cuda/Activation.h @@ -1,4 +1,5 @@ - +#pragma once +#include #include namespace at { @@ -24,7 +25,7 @@ void launch_prelu_cuda_backward_kernel_multi_weights( const TensorBase &input, const TensorBase &weight, const TensorBase &grad_out, const TensorBase &input_grad, const TensorBase &weight_grad_collector); -void GeluCUDAKernelImpl(TensorIteratorBase& it); -void GeluBackwardCUDAKernelImpl(TensorIteratorBase& it); +void GeluCUDAKernelImpl(TensorIteratorBase& it, GeluType approximate); +void GeluBackwardCUDAKernelImpl(TensorIteratorBase& it, GeluType approximate); }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu b/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu index ebb2e1691107..55b0d3322e04 100644 --- a/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu +++ b/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu @@ -1,13 +1,24 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include +#include #include #include -#include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + #include #include #include diff --git a/aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu b/aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu index 4581fa3bf7eb..ec71b37015fb 100644 --- a/aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu +++ b/aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu @@ -1,12 +1,23 @@ -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include +#include #include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + #include #include #include diff --git a/aten/src/ATen/native/cuda/AdaptiveMaxPooling2d.cu b/aten/src/ATen/native/cuda/AdaptiveMaxPooling2d.cu index d1d3c079b0ad..5b46fb9c34a5 100644 --- a/aten/src/ATen/native/cuda/AdaptiveMaxPooling2d.cu +++ b/aten/src/ATen/native/cuda/AdaptiveMaxPooling2d.cu @@ -1,13 +1,23 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include #include -#include +#include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#endif + #include #include #include diff --git a/aten/src/ATen/native/cuda/AdaptiveMaxPooling3d.cu b/aten/src/ATen/native/cuda/AdaptiveMaxPooling3d.cu index c2aa9adeee5b..baafc6c56d46 100644 --- a/aten/src/ATen/native/cuda/AdaptiveMaxPooling3d.cu +++ b/aten/src/ATen/native/cuda/AdaptiveMaxPooling3d.cu @@ -1,13 +1,23 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS 
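// --- illustrative sketch ---------------------------------------------------------------
// The include rewrites applied to these pooling kernels all follow one pattern: assert
// that only method operators are used, then pull in per-operator headers when
// AT_PER_OPERATOR_HEADERS is defined and fall back to the monolithic headers otherwise.
// The header names are stripped in this rendering, so the pattern is sketched here; the
// <ATen/ops/...> names are assumptions for illustration and differ per file.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/Dispatch.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>        // monolithic aggregate headers
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/empty.h>                        // assumed: only the ops this file calls
#include <ATen/ops/adaptive_max_pool3d_native.h>   // assumed
#endif
// ---------------------------------------------------------------------------------------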
+#include #include #include #include -#include +#include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#endif + #include #include #include diff --git a/aten/src/ATen/native/cuda/AmpKernels.cu b/aten/src/ATen/native/cuda/AmpKernels.cu index c89d8a09e8d1..276f320bb199 100644 --- a/aten/src/ATen/native/cuda/AmpKernels.cu +++ b/aten/src/ATen/native/cuda/AmpKernels.cu @@ -1,8 +1,9 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #define _USE_MATH_DEFINES #include -#include +#include #include #include #include diff --git a/aten/src/ATen/native/cuda/AveragePool2d.cu b/aten/src/ATen/native/cuda/AveragePool2d.cu index 883228ecc45d..55632014a0de 100644 --- a/aten/src/ATen/native/cuda/AveragePool2d.cu +++ b/aten/src/ATen/native/cuda/AveragePool2d.cu @@ -1,6 +1,8 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include +#include #include #include #include @@ -8,6 +10,14 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif + namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/cuda/AveragePool3d.cu b/aten/src/ATen/native/cuda/AveragePool3d.cu index 29ba71d6acd5..ce395a4ad044 100644 --- a/aten/src/ATen/native/cuda/AveragePool3d.cu +++ b/aten/src/ATen/native/cuda/AveragePool3d.cu @@ -1,5 +1,8 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include +#include #include #include #include @@ -9,6 +12,14 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif + namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/BinaryAddSubKernel.cu b/aten/src/ATen/native/cuda/BinaryAddSubKernel.cu deleted file mode 100644 index 56d6b0acd728..000000000000 --- a/aten/src/ATen/native/cuda/BinaryAddSubKernel.cu +++ /dev/null @@ -1,37 +0,0 @@ -#define TORCH_ASSERT_NO_OPERATORS -#include -#include -#include -#include -#include -#include - -// NOTE: CUDA on Windows requires that the enclosing function -// of a __device__ lambda not have internal linkage. 
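// --- illustrative sketch ---------------------------------------------------------------
// A recurring change in these kernels is switching the intermediate math type from
// acc_type<scalar_t, true> to at::opmath_type<scalar_t>, which selects float for
// Half/BFloat16 inputs and the input type otherwise: promote on load, compute in opmath
// precision, narrow on store. The deleted add/sub kernel below already has this shape;
// here is a minimal sketch of the pattern with an invented functor and kernel name
// (assumes the usual ATen/OpMathType.h and ATen/native/cuda/Loops.cuh includes).
template <typename scalar_t>
struct ScaleShiftFunctor {
  using opmath_t = at::opmath_type<scalar_t>;
  opmath_t scale_, shift_;
  ScaleShiftFunctor(opmath_t scale, opmath_t shift) : scale_(scale), shift_(shift) {}
  __device__ __forceinline__ scalar_t operator()(scalar_t x) const {
    // promote, do the arithmetic in opmath precision, narrow on the way out
    return static_cast<scalar_t>(static_cast<opmath_t>(x) * scale_ + shift_);
  }
};

void scale_shift_kernel_cuda(TensorIteratorBase& iter, const Scalar& scale, const Scalar& shift) {
  AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "scale_shift_cuda", [&]() {
    using opmath_t = at::opmath_type<scalar_t>;
    gpu_kernel(iter, ScaleShiftFunctor<scalar_t>(scale.to<opmath_t>(), shift.to<opmath_t>()));
  });
}
// ---------------------------------------------------------------------------------------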
- -namespace at { namespace native { - -template -struct AddFunctor { - AddFunctor(T alpha) : alpha_(alpha) {} - T alpha_; - __device__ __forceinline__ T operator()(T a, T b) const __ubsan_ignore_undefined__ { - return a + b * alpha_; - } -}; - -void add_kernel_cuda(TensorIteratorBase& iter, const Scalar& alpha_scalar) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, kBFloat16, iter.common_dtype(), "add_cuda/sub_cuda", [&]() { - using opmath_t = at::opmath_type; - opmath_gpu_kernel_with_scalars(iter, AddFunctor(alpha_scalar.to())); - }); -} - -static void sub_kernel_cuda(TensorIteratorBase& iter, const Scalar& alpha_scalar) { - add_kernel_cuda(iter, -alpha_scalar); -} - -REGISTER_DISPATCH(add_stub, &add_kernel_cuda); -REGISTER_DISPATCH(sub_stub, &sub_kernel_cuda); - -}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu index c1bc53594a20..3a8ab02e3a54 100644 --- a/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu +++ b/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu @@ -2,6 +2,7 @@ #include #include #include +#include #include #include @@ -10,13 +11,39 @@ namespace at { namespace native { +const char logical_and_name[] = "logical_and_kernel"; void logical_and_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, ScalarType::BFloat16, - iter.common_dtype(), "logical_and_cuda", [&]() { - gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { - return a && b; + auto dtype = iter.common_dtype(); + if (at::isComplexType(dtype)) { +#if AT_USE_JITERATOR() + static const auto logical_and_string = jiterator_stringify( + template + T logical_and_kernel(T a, T b) { + return a && b; + } + ); // logical_and_string + AT_DISPATCH_COMPLEX_TYPES(dtype, "logical_and_cuda", [&]() { + jitted_gpu_kernel< + /*name=*/ logical_and_name, + /*return_dtype=*/ scalar_t, + /*common_dtype=*/ scalar_t, + /*arity=*/ 2>(iter, logical_and_string); + }); // logical_and_string +#else + AT_DISPATCH_COMPLEX_TYPES(dtype, "logical_and_cuda", [&]() { + gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { + return a && b; + }); }); - }); +#endif + } else { + AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBool, ScalarType::BFloat16, + dtype, "logical_and_cuda", [&]() { + gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { + return a && b; + }); + }); + } } void logical_or_kernel_cuda(TensorIterator& iter) { diff --git a/aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu index f4b618ec283f..844388e61094 100644 --- a/aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu +++ b/aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu @@ -1,3 +1,4 @@ +#define TORCH_ASSERT_NO_OPERATORS #include #include @@ -7,6 +8,7 @@ #include #include #include +#include // NOTE: CUDA on Windows requires that the enclosing function // of a __device__ lambda not have internal linkage. 
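// --- illustrative sketch ---------------------------------------------------------------
// The complex-dtype paths in this file (sigmoid_backward, tanh_backward), like the
// earlier abs, logical_and, div and mul changes, all use the same jiterator recipe: keep
// the kernel body as a string, JIT-compile it at runtime when AT_USE_JITERATOR() is
// enabled, and fall back to a plain gpu_kernel otherwise. A minimal sketch of the recipe
// with an invented unary "square" kernel (the real kernels follow in the hunks below):
const char square_name[] = "square_kernel";
void square_kernel_cuda(TensorIteratorBase& iter) {
  auto dtype = iter.common_dtype();
#if AT_USE_JITERATOR()
  static const auto square_string = jiterator_stringify(
      template <typename T> T square_kernel(T x) { return x * x; });
  AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "square_cuda", [&]() {
    jitted_gpu_kernel<
        /*name=*/square_name,       // must match the function name inside the string
        /*return_dtype=*/scalar_t,
        /*common_dtype=*/scalar_t,
        /*arity=*/1>(iter, square_string);
  });
#else
  AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "square_cuda", [&]() {
    gpu_kernel(iter, [] GPU_LAMBDA(scalar_t x) -> scalar_t {
      using opmath_t = at::opmath_type<scalar_t>;  // complex<float> for complex<Half>
      const opmath_t xop = static_cast<opmath_t>(x);
      return static_cast<scalar_t>(xop * xop);
    });
  });
#endif
}
// ---------------------------------------------------------------------------------------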
@@ -14,15 +16,37 @@ namespace at { namespace native { +const char sigmoid_backward_name[] = "sigmoid_backward"; void sigmoid_backward_kernel_cuda(TensorIteratorBase& iter) { - if(isComplexType(iter.dtype())) { - AT_DISPATCH_COMPLEX_TYPES(iter.dtype(), "sigmoid_backward_cuda", [&]() { + auto dtype = iter.dtype(); + if(isComplexType(dtype)) { +#if AT_USE_JITERATOR() + static const auto sigmoid_backward_string = jiterator_stringify( + template + T sigmoid_backward(T a, T b) { + return a * std::conj((T{1.} - b) * b); + } + ); // sigmoid_backward_string + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "sigmoid_backward_cuda", [&]() { + jitted_gpu_kernel< + /*name=*/ sigmoid_backward_name, + /*return_dtype=*/ scalar_t, + /*common_dtype=*/ scalar_t, + /*arity=*/ 2>(iter, sigmoid_backward_string); + }); +#else + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "sigmoid_backward_cuda", [&]() { gpu_kernel(iter, [] GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { - return a * std::conj((scalar_t{1.} - b) * b); + using comp_t = at::opmath_type; + const auto one = comp_t{1.}; + const auto comp_b = static_cast(b); + const auto comp_a = static_cast(a); + return static_cast(comp_a * std::conj((one - comp_b) * comp_b)); }); }); +#endif } else { - AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "sigmoid_backward_cuda", [&]() { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, dtype, "sigmoid_backward_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { return a * (scalar_t(1.) - b) * b; }); @@ -63,15 +87,37 @@ void logit_backward_kernel_cuda(TensorIteratorBase& iter, const Scalar& eps_scal }); } +const char tanh_backward_name[] = "tanh_backward"; void tanh_backward_kernel_cuda(TensorIteratorBase& iter) { - if(isComplexType(iter.dtype())) { - AT_DISPATCH_COMPLEX_TYPES(iter.dtype(), "tanh_backward_complex_cuda", [&]() { + auto dtype = iter.dtype(); + if(isComplexType(dtype)) { +#if AT_USE_JITERATOR() + static const auto tanh_backward_string = jiterator_stringify( + template + T tanh_backward(T a, T b) { + return a * std::conj(T{1.} - b * b); + } + ); // tanh_backward_string + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "tanh_backward_complex_cuda", [&]() { + jitted_gpu_kernel< + /*name=*/ tanh_backward_name, + /*return_dtype=*/ scalar_t, + /*common_dtype=*/ scalar_t, + /*arity=*/ 2>(iter, tanh_backward_string); + }); +#else + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "tanh_backward_complex_cuda", [&]() { gpu_kernel(iter, [] GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { - return a * std::conj(scalar_t{1.} - b * b); + using comp_t = at::opmath_type; + const auto one = comp_t{1.}; + const auto comp_b = static_cast(b); + const auto comp_a = static_cast(a); + return static_cast(comp_a * std::conj(one - comp_b * comp_b)); }); }); +#endif } else { - AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "tanh_backward_cuda", [&]() { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, dtype, "tanh_backward_cuda", [&]() { gpu_kernel(iter, [] GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { return a * (scalar_t{1.} - b * b); }); diff --git a/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu index f72ddfa4bfe2..703436a1d495 100644 --- a/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu +++ b/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu @@ -32,7 +32,7 @@ void 
huber_kernel_cuda(TensorIterator& iter, double delta) { }); } -void mse_kernel_cuda(TensorIterator& iter) { +void mse_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "mse_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { auto diff = a - b; diff --git a/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu b/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu index aef5600c640e..bb34c8f85366 100644 --- a/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu +++ b/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -38,13 +39,30 @@ struct MulFunctor { } }; - +const char div_name[] = "div_kernel"; void div_true_kernel_cuda(TensorIteratorBase& iter) { + auto common_dtype = iter.common_dtype(); + if (iter.common_dtype() == kComplexHalf) { + using scalar_t = c10::complex; + #if AT_USE_JITERATOR() + static const auto div_string = jiterator_stringify( + template + T div_kernel(T a, T b) { + return a / b; + } + ); + opmath_jitted_gpu_kernel_with_scalars(iter, div_string); + #else + using opmath_t = at::opmath_type; + opmath_gpu_kernel_with_scalars(iter, DivFunctor()); + #endif + return; + } if (iter.is_cpu_scalar(2)) { // optimization for floating-point types: if the second operand is a CPU // scalar, compute a * reciprocal(b). Note that this may lose one bit of // precision compared to computing the division. - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "div_true_cuda", [&]() { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, common_dtype, "div_true_cuda", [&]() { using opmath_t = at::opmath_type; auto inv_b = opmath_t(1.0) / iter.scalar_value(2); iter.remove_operand(2); @@ -52,7 +70,7 @@ void div_true_kernel_cuda(TensorIteratorBase& iter) { MulFunctor(), inv_b)); }); } else { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "div_true_cuda", [&]() { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, common_dtype, "div_true_cuda", [&]() { DivFunctor f; gpu_kernel_with_scalars(iter, f); }); @@ -171,11 +189,29 @@ void div_floor_kernel_cuda(TensorIteratorBase& iter) { } } +const char mul_name[] = "mul_kernel"; void mul_kernel_cuda(TensorIteratorBase& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBFloat16, kBool, iter.common_dtype(), "mul_cuda", [&]() { - using opmath_t = at::opmath_type; - opmath_gpu_kernel_with_scalars(iter, MulFunctor()); - }); + auto common_dtype = iter.common_dtype(); + if (common_dtype == kComplexHalf) { + using scalar_t = c10::complex; + #if AT_USE_JITERATOR() + static const auto mul_string = jiterator_stringify( + template + T mul_kernel(T a, T b) { + return a * b; + } + ); + opmath_jitted_gpu_kernel_with_scalars(iter, mul_string); + #else + using opmath_t = at::opmath_type; + opmath_gpu_kernel_with_scalars(iter, MulFunctor()); + #endif + } else { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBFloat16, kBool, iter.common_dtype(), "mul_cuda", [&]() { + using opmath_t = at::opmath_type; + opmath_gpu_kernel_with_scalars(iter, MulFunctor()); + }); + } } REGISTER_DISPATCH(div_true_stub, &div_true_kernel_cuda); diff --git a/aten/src/ATen/native/cuda/BinaryShiftOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryShiftOpsKernels.cu index 7f22ace666f2..d6bd145c4f50 100644 --- a/aten/src/ATen/native/cuda/BinaryShiftOpsKernels.cu +++ b/aten/src/ATen/native/cuda/BinaryShiftOpsKernels.cu @@ -12,47 
+12,21 @@ namespace at { namespace native { void lshift_kernel_cuda(TensorIteratorBase& iter) { - if (iter.dtype() == ScalarType::Float || - iter.dtype() == ScalarType::Double || - iter.dtype() == ScalarType::Half || - iter.dtype() == ScalarType::BFloat16) { - AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.dtype(), "lshift_cuda", [&]() { - gpu_kernel_with_scalars( - iter, - []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { - return a * std::pow(static_cast(2), b); - }); + AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "lshift_cuda", [&]() { + gpu_kernel_with_scalars(iter, + []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { + return static_cast>(a) << b; }); - } else { - AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "lshift_cuda", [&]() { - gpu_kernel_with_scalars(iter, - []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { - return static_cast>(a) << b; - }); - }); - } + }); } void rshift_kernel_cuda(TensorIteratorBase& iter) { - if (iter.dtype() == ScalarType::Float || - iter.dtype() == ScalarType::Double || - iter.dtype() == ScalarType::Half || - iter.dtype() == ScalarType::BFloat16) { - AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.dtype(), "rshift_cuda", [&]() { - gpu_kernel_with_scalars( - iter, - []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { - return a / std::pow(static_cast(2), b); - }); - }); - } else { - AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "rshift_cuda", [&]() { - gpu_kernel_with_scalars(iter, - []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { - return a >> b; - }); + AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "rshift_cuda", [&]() { + gpu_kernel_with_scalars(iter, + []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { + return a >> b; }); - } + }); } REGISTER_DISPATCH(lshift_stub, &lshift_kernel_cuda); diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp index 2317f072b8cc..3ca9814175c5 100644 --- a/aten/src/ATen/native/cuda/Blas.cpp +++ b/aten/src/ATen/native/cuda/Blas.cpp @@ -1,9 +1,35 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include +#include +#include +#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + namespace at { namespace native { namespace { @@ -90,7 +116,29 @@ c10::MaybeOwned prepare_batch_matrix_for_cublas(const Tensor& tensor, bo namespace { -Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha) { +enum class Activation { + None, + RELU, + GELU, +}; + +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(_MSC_VER) +cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activation a) { + switch (a) { + case Activation::None: + return cuda::blas::GEMMAndBiasActivationEpilogue::None; + case Activation::RELU: + return cuda::blas::GEMMAndBiasActivationEpilogue::RELU; + case Activation::GELU: + return cuda::blas::GEMMAndBiasActivationEpilogue::GELU; + default: + TORCH_CHECK(false); + return cuda::blas::GEMMAndBiasActivationEpilogue::None; + } +} +#endif + +Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha, Activation activation=Activation::None) { // Make sure to keep addmm_cuda below in sync with this code; it // preflights a check to 
try to avoid actually needing to call // expand(). @@ -102,9 +150,36 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma IntArrayRef mat1_sizes = mat1.sizes(); IntArrayRef mat2_sizes = mat2.sizes(); IntArrayRef self__sizes; + bool useLtInterface = false; + at::ScalarType scalar_type = self.scalar_type(); c10::MaybeOwned self_; if (&result != &self) { - self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm"); +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11040 && !defined(_MSC_VER) + // Strangely, if mat2 has only 1 row or column, we get + // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic. + // self.dim() == 1 && result.dim() == 2 && self.sizes()[0] == mat2_sizes[1] + // is to use lt interface only when self is bias. + // for cuda 11.4, cublasLtMatmul is activated + // the last two conditions is to skip 16b transA and non-trans-B having + // leading dim >> rows when they are sliced from a large tensor + // see fbcode/caffe2/test/test_linalg.py:test_corner_cases_of_cublasltmatmul + useLtInterface = beta.toComplexDouble() == 1.0 && self.dim() == 1 && + result.dim() == 2 && self.sizes()[0] == mat2_sizes[1] && + self.is_contiguous() && + (scalar_type == at::ScalarType::Double || + scalar_type == at::ScalarType::Float || + scalar_type == at::ScalarType::Half || + scalar_type == at::ScalarType::BFloat16) && + mat2_sizes[0] > 1 && mat2_sizes[1] > 1 && + mat2_sizes[0] < 65535 && mat2_sizes[1] < 65535 && + mat1_sizes[0] < 65535 && mat1_sizes[1] < 65535 && + // avoid leaing dim >> rows bugs + ((mat1.strides()[0]==1 && mat1.strides()[1]==mat1_sizes[0]) || (mat1.strides()[1] == 1 && mat1.strides()[0] == mat1_sizes[1]) || (scalar_type != at::ScalarType::Half && scalar_type != at::ScalarType::BFloat16)) && + ((mat2.strides()[0]==1 && mat2.strides()[1]==mat2_sizes[0]) || (mat2.strides()[1] == 1 && mat2.strides()[0] == mat2_sizes[1]) || (scalar_type != at::ScalarType::Half && scalar_type != at::ScalarType::BFloat16)); +#endif + if (!useLtInterface) { + self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm"); + } self__sizes = self_->sizes(); } else { self_ = c10::MaybeOwned::borrowed(self); @@ -115,8 +190,8 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma } if (&result != &self) { - at::native::resize_output(result, self__sizes); - if (beta.toComplexDouble() != 0.0) { + at::native::resize_output(result, {mat1_sizes[0], mat2_sizes[1]}); + if (beta.toComplexDouble() != 0.0 && !useLtInterface) { at::native::copy_(result, *self_); } } @@ -147,7 +222,6 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma int64_t mat1_ld = mat1_->stride((transpose_mat1 == transpose_result) ? 1 : 0); int64_t mat2_ld = mat2_->stride((transpose_mat2 == transpose_result) ? 1 : 0); int64_t result_ld = result_->stride(transpose_result ? 0 : 1); - at::ScalarType scalar_type = self_->scalar_type(); if (mat1.numel() == 0) { // By definition, when beta==0, values in self should be ignored. 
nans and infs @@ -170,24 +244,92 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!result_->is_conj()); - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, scalar_type, "addmm_cuda", [&] { - using opmath_t = at::opmath_type; - opmath_t alpha_val = alpha.to(); - opmath_t beta_val = beta.to(); - scalar_t* mat1_ptr = mat1_->data_ptr(); - scalar_t* mat2_ptr = mat2_->data_ptr(); - scalar_t* result_ptr = result_->data_ptr(); - at::cuda::blas::gemm( - transpose_mat1 ? mat1_->is_conj() ? 'c' : 't' : 'n', - transpose_mat2 ? mat2_->is_conj() ? 'c' : 't' : 'n', - m, n, k, - alpha_val, - mat1_ptr, mat1_ld, - mat2_ptr, mat2_ld, - beta_val, - result_ptr, result_ld - ); - }); +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(_MSC_VER) + if (useLtInterface) { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + scalar_type, + "addmm_cuda_lt", + [&] { + at::cuda::blas::gemm_and_bias( + transpose_mat1, + transpose_mat2, + m, + n, + k, + alpha.to>(), + mat1_->data_ptr(), + mat1_ld, + mat2_->data_ptr(), + mat2_ld, + self.data_ptr(), + result_->data_ptr(), + result_ld, +#if 0 + activation_to_gemm_and_blas_arg(activation) +#else + // GELU is not supported (and does not compile!) prior + // to CUDA 11.4. Have observed accuracy issues with + // GELU epilogue in 11.4; disabling the GELU epilogue + // path until we confirm which version it's working in. + activation != Activation::GELU + ? activation_to_gemm_and_blas_arg(activation) + : cuda::blas::GEMMAndBiasActivationEpilogue::None +#endif + ); + }); + } else +#endif + { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + scalar_type, + "addmm_cuda", + [&] { + using opmath_t = at::opmath_type; + opmath_t alpha_val = alpha.to(); + opmath_t beta_val = beta.to(); + scalar_t* mat1_ptr = mat1_->data_ptr(); + scalar_t* mat2_ptr = mat2_->data_ptr(); + scalar_t* result_ptr = result_->data_ptr(); + at::cuda::blas::gemm( + transpose_mat1 ? mat1_->is_conj() ? 'c' : 't' : 'n', + transpose_mat2 ? mat2_->is_conj() ? 'c' : 't' : 'n', + m, + n, + k, + alpha_val, + mat1_ptr, + mat1_ld, + mat2_ptr, + mat2_ld, + beta_val, + result_ptr, + result_ld); + }); + switch (activation) { + case Activation::RELU: + at::relu_(const_cast(*result_)); + break; + case Activation::GELU: + at::gelu_(const_cast(*result_)); + break; + default: break; + } + } + +// Preprocessor gate here needs to match the inverse of the check +// gating activation_to_gemm_and_blas_arg above; here we are manually +// performing a post-GELU because we weren't able to use the GELU +// epilogue above. +#if !0 + if (useLtInterface && activation == Activation::GELU) { + at::gelu_(const_cast(*result_)); + } +#endif + if (!result.is_same(*result_)) { result.copy_(*result_); } @@ -271,6 +413,10 @@ TORCH_IMPL_FUNC(addmm_out_cuda)(const Tensor& self, const Tensor& mat1, const Te addmm_out_cuda_impl(const_cast(result), self, mat1, mat2, beta, alpha); } +TORCH_IMPL_FUNC(addmm_activation_out_cuda)(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha, bool use_gelu, const Tensor& result) { + addmm_out_cuda_impl(const_cast(result), self, mat1, mat2, beta, alpha, use_gelu ? 
Activation::GELU : Activation::RELU); +} + TORCH_IMPL_FUNC(mm_out_cuda)(const Tensor& self, const Tensor& mat2, const Tensor& result) { addmm_out_cuda_impl(const_cast(result), result, self, mat2, 0, 1); } @@ -457,7 +603,8 @@ TORCH_IMPL_FUNC(addmv_out_cuda)(const Tensor &self, const Tensor &mat, const Ten // Check for contiguity of `vec` and update `vec_stride` accordingly const auto vec_contiguous = vec_stride == 0 ? vec.contiguous() : vec; - vec_stride = vec_contiguous.stride(0); + // A vector can be contiguous and have a stride of zero if it has it is of length 1 + vec_stride = std::max(vec_contiguous.stride(0), 1LL); AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, mat.scalar_type(), "addmv_impl_cuda", [&] { auto beta = beta_.to(); diff --git a/aten/src/ATen/native/cuda/Bucketization.cu b/aten/src/ATen/native/cuda/Bucketization.cu index 81f81aa315ce..2a3d5730d786 100644 --- a/aten/src/ATen/native/cuda/Bucketization.cu +++ b/aten/src/ATen/native/cuda/Bucketization.cu @@ -1,10 +1,21 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/CUDAJitLoops.cuh b/aten/src/ATen/native/cuda/CUDAJitLoops.cuh index b5b1cd5c63bc..ae0797f38e8c 100644 --- a/aten/src/ATen/native/cuda/CUDAJitLoops.cuh +++ b/aten/src/ATen/native/cuda/CUDAJitLoops.cuh @@ -71,7 +71,8 @@ static inline void launch_jitted_unrolled_kernel( std::tuple extra_args) { TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); - const int64_t grid = (N + block_work_size() - 1) / block_work_size(); + //casting result to int is always safe, intermediate is int64 and won't overflow + const uint32_t grid = (N + block_work_size() - 1) / block_work_size(); static std::mutex _jiterator_mutex; static std::vector fns(c10::cuda::device_count()); @@ -114,9 +115,8 @@ static inline void launch_jitted_unrolled_kernel( // since 7 slots are already filled in `args` args[i + 7] = extra_args_array[i]; } - - at::cuda::jit::launch_jitted_pwise_function(*fn_ptr, args, grid, num_threads()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); + at::cuda::jit::launch_jitted_pwise_function(*fn_ptr, args, {grid, 1u, 1u}, + {num_threads(), 1u, 1u}); } template< @@ -129,7 +129,8 @@ template< static inline void launch_jitted_vectorized_kernel(DeviceIndex dev_idx, int64_t N, const std::string& f, array_t data, at::opmath_type scalar_val, std::tuple extra_args) { TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); - const int64_t grid = (N + block_work_size() - 1) / block_work_size(); + // N is still int64_t for the computation, but it's always safe to cast result to int + const uint32_t grid = (N + block_work_size() - 1) / block_work_size(); const int vec_size = memory::jitted_can_vectorize_up_to(data); // Different kernels are compiled depending on what we're vectorizing up to (1, 2 or 4 elements) @@ -195,9 +196,7 @@ at::opmath_type scalar_val, std::tuple extra_args) { // since 3 slots are already filled in `args` args[i + 3] = extra_args_array[i]; } - - at::cuda::jit::launch_jitted_pwise_function(*fn_ptr, args, grid, num_threads()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); + at::cuda::jit::launch_jitted_pwise_function(*fn_ptr, args, {grid, 1u, 1u}, {num_threads(), 1u, 1u}); } else { auto ic = TrivialOffsetCalculator(); auto oc = TrivialOffsetCalculator<1>(); @@ -219,14 +218,25 @@ 
at::opmath_type scalar_val, std::tuple extra_args) { // since 7 slots are already filled in `args` args[i + 7] = extra_args_array[i]; } - at::cuda::jit::launch_jitted_pwise_function(*fn_ptr, args, grid, num_threads()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); + + at::cuda::jit::launch_jitted_pwise_function(*fn_ptr, args, {grid, 1u, 1u}, {num_threads(), 1u, 1u}); } } -template -void jitted_gpu_kernel_impl(TensorIteratorBase& iter, const std::string& f, const bool dynamic_casting, compute_type scalar_val, std::tuple extra_args) { +template < + char const* name, + typename result_type, + typename f_inputs_type, + int arity, + at::cuda::jit::BinaryFuncVariant scalar_pos = + at::cuda::jit::BinaryFuncVariant::NoScalar, + typename... Args> +void jitted_gpu_kernel_impl( + TensorIteratorBase& iter, + const std::string& f, + const bool dynamic_casting, + at::opmath_type scalar_val, + std::tuple extra_args) { TORCH_INTERNAL_ASSERT(iter.can_use_32bit_indexing()); TORCH_INTERNAL_ASSERT(iter.ninputs() == arity); TORCH_INTERNAL_ASSERT(iter.noutputs() == 1); @@ -251,7 +261,7 @@ void jitted_gpu_kernel_impl(TensorIteratorBase& iter, const std::string& f, cons if (!dynamic_casting) { if (contiguous) { // Case 1: no dynamic casting and contiguous - launch_jitted_vectorized_kernel( + launch_jitted_vectorized_kernel( iter.device().index(), numel, f, data, scalar_val, extra_args); return; } @@ -261,7 +271,7 @@ void jitted_gpu_kernel_impl(TensorIteratorBase& iter, const std::string& f, cons auto output_offset_calculator = make_output_offset_calculator(iter); auto loader = memory::LoadWithoutCast(); auto storer = memory::StoreWithoutCast(); - launch_jitted_unrolled_kernel( + launch_jitted_unrolled_kernel( iter.device().index(), numel, f, data, input_offset_calculator, output_offset_calculator, loader, storer, contiguous, scalar_val, extra_args); return; @@ -284,7 +294,7 @@ void jitted_gpu_kernel_impl(TensorIteratorBase& iter, const std::string& f, cons // Case 3: dynamic casting and contiguous auto input_offset_calculator = TrivialOffsetCalculator(); auto output_offset_calculator = TrivialOffsetCalculator<1>(); - launch_jitted_unrolled_kernel( + launch_jitted_unrolled_kernel( iter.device().index(), numel, f, data, input_offset_calculator, output_offset_calculator, loader, storer, contiguous, scalar_val, extra_args); return; @@ -293,7 +303,7 @@ void jitted_gpu_kernel_impl(TensorIteratorBase& iter, const std::string& f, cons // Case 4: dynamic casting and noncontiguous auto input_offset_calculator = make_input_offset_calculator(iter); auto output_offset_calculator = make_output_offset_calculator(iter); - launch_jitted_unrolled_kernel( + launch_jitted_unrolled_kernel( iter.device().index(), numel, f, data, input_offset_calculator, output_offset_calculator, loader, storer, contiguous, scalar_val, extra_args); } diff --git a/aten/src/ATen/native/cuda/CUDAScalar.cu b/aten/src/ATen/native/cuda/CUDAScalar.cu index 242778faa14f..4f2b092573e3 100644 --- a/aten/src/ATen/native/cuda/CUDAScalar.cu +++ b/aten/src/ATen/native/cuda/CUDAScalar.cu @@ -1,5 +1,12 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS #include +#else +#include +#endif #include @@ -8,8 +15,8 @@ namespace native { Scalar _local_scalar_dense_cuda(const Tensor& self) { Scalar r; - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( - at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, self.scalar_type(), "_local_scalar_dense_cuda", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( + kComplexHalf, 
kHalf, kBool, kBFloat16, self.scalar_type(), "_local_scalar_dense_cuda", [&] { scalar_t value; cudaStream_t stream = at::cuda::getCurrentCUDAStream(); at::cuda::memcpy_and_sync(&value, self.data_ptr(), sizeof(scalar_t), cudaMemcpyDeviceToHost, stream); diff --git a/aten/src/ATen/native/cuda/Col2Im.cu b/aten/src/ATen/native/cuda/Col2Im.cu index f7a63428a56f..5cb825a2e70b 100644 --- a/aten/src/ATen/native/cuda/Col2Im.cu +++ b/aten/src/ATen/native/cuda/Col2Im.cu @@ -1,6 +1,7 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include -#include +#include #include #include #include @@ -10,6 +11,16 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/cuda/CompareEQKernel.cu b/aten/src/ATen/native/cuda/CompareEQKernel.cu index 9a82205e2e47..88a22f1fc2b5 100644 --- a/aten/src/ATen/native/cuda/CompareEQKernel.cu +++ b/aten/src/ATen/native/cuda/CompareEQKernel.cu @@ -29,7 +29,7 @@ struct CompareEqFunctor{ } void eq_kernel_cuda(TensorIteratorBase& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBFloat16, kBool, iter.common_dtype(), "eq_cuda", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(kComplexHalf, kHalf, kBFloat16, kBool, iter.common_dtype(), "eq_cuda", [&]() { gpu_kernel_with_scalars(iter, CompareEqFunctor(EqOpType::EQ)); }); } diff --git a/aten/src/ATen/native/cuda/ComplexKernel.cu b/aten/src/ATen/native/cuda/ComplexKernel.cu index 32e60b9b2885..8738c0ab4c8e 100644 --- a/aten/src/ATen/native/cuda/ComplexKernel.cu +++ b/aten/src/ATen/native/cuda/ComplexKernel.cu @@ -1,3 +1,4 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include #include @@ -11,7 +12,7 @@ namespace native { namespace { void complex_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_FLOATING_TYPES(iter.input_dtype(0), "complex_cuda", [&]() { + AT_DISPATCH_FLOATING_TYPES_AND(kHalf, iter.input_dtype(0), "complex_cuda", [&]() { gpu_kernel( iter, [] GPU_LAMBDA(scalar_t a, scalar_t b) -> c10::complex { return c10::complex(a, b); diff --git a/aten/src/ATen/native/cuda/ConvolutionMM2d.cu b/aten/src/ATen/native/cuda/ConvolutionMM2d.cu index c6144546a992..3d76bcfd30a6 100644 --- a/aten/src/ATen/native/cuda/ConvolutionMM2d.cu +++ b/aten/src/ATen/native/cuda/ConvolutionMM2d.cu @@ -1,12 +1,23 @@ -#include - -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include +#include +#include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/cuda/Copy.cu b/aten/src/ATen/native/cuda/Copy.cu index a8720c7c967e..57f04d481fc5 100644 --- a/aten/src/ATen/native/cuda/Copy.cu +++ b/aten/src/ATen/native/cuda/Copy.cu @@ -1,4 +1,5 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include #include @@ -10,6 +11,12 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + namespace at { namespace native { @@ -24,8 +31,8 @@ void direct_copy_kernel_cuda(TensorIteratorBase &iter) { gpu_kernel(iter, [] GPU_LAMBDA(scalar_t x) { return x; }); }); } else { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( - kHalf, kBool, kBFloat16, dtype, "copy_", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( + kHalf, kBool, kBFloat16, kComplexHalf, dtype, "copy_", [&] { gpu_kernel(iter, [] GPU_LAMBDA(scalar_t x) { return x; }); }); } diff --git 
a/aten/src/ATen/native/cuda/CrossKernel.cu b/aten/src/ATen/native/cuda/CrossKernel.cu index e573d6594160..62310347799f 100644 --- a/aten/src/ATen/native/cuda/CrossKernel.cu +++ b/aten/src/ATen/native/cuda/CrossKernel.cu @@ -1,7 +1,9 @@ -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#include #include +#include +#include namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/CuFFTPlanCache.h b/aten/src/ATen/native/cuda/CuFFTPlanCache.h index ad7ca2ac5a14..9897fbeb51e7 100644 --- a/aten/src/ATen/native/cuda/CuFFTPlanCache.h +++ b/aten/src/ATen/native/cuda/CuFFTPlanCache.h @@ -1,5 +1,5 @@ -#include #include +#include #include #include #include diff --git a/aten/src/ATen/native/cuda/CuFFTUtils.h b/aten/src/ATen/native/cuda/CuFFTUtils.h index 09d561736472..4b02f914d7e2 100644 --- a/aten/src/ATen/native/cuda/CuFFTUtils.h +++ b/aten/src/ATen/native/cuda/CuFFTUtils.h @@ -1,6 +1,5 @@ #pragma once -#include #include #include diff --git a/aten/src/ATen/native/cuda/DepthwiseConv2d.cu b/aten/src/ATen/native/cuda/DepthwiseConv2d.cu index ac32bfafe1a9..8f0f9b99903a 100644 --- a/aten/src/ATen/native/cuda/DepthwiseConv2d.cu +++ b/aten/src/ATen/native/cuda/DepthwiseConv2d.cu @@ -1,4 +1,6 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include #include #include @@ -7,7 +9,14 @@ #include #include #include -#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif namespace at { namespace native { @@ -442,7 +451,7 @@ void conv_depthwise2d_backward_out( int getGradParamsNumThreads(int batchSize) { //warp per item in a batch, up to a maximum constexpr int MAX_BLOCK_SIZE = 256; - return std::min(batchSize * C10_WARP_SIZE, MAX_BLOCK_SIZE); + return std::min(batchSize * at::cuda::warp_size(), MAX_BLOCK_SIZE); } void conv_depthwise2d_grad_weight_out( @@ -498,8 +507,9 @@ void conv_depthwise2d_grad_weight_out( const auto input_a = input.packed_accessor32(); const auto grad_weight_a = grad_weight.packed_accessor32(); using acc_t = at::acc_type; - TORCH_INTERNAL_ASSERT(block.x % C10_WARP_SIZE == 0); - int smem = (block.x / C10_WARP_SIZE) * sizeof(acc_t); + int warp_size = at::cuda::warp_size(); + TORCH_INTERNAL_ASSERT(block.x % warp_size == 0); + int smem = (block.x / warp_size) * sizeof(acc_t); conv_depthwise2d_grad_weight_kernel<<>>( grad_output_a, input_a, grad_weight_a, batchSize, inputChannels, outputChannels, depthwiseMultiplier, width, height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH); diff --git a/aten/src/ATen/native/cuda/DepthwiseConv3d.cu b/aten/src/ATen/native/cuda/DepthwiseConv3d.cu index 8fbe14b797a7..5859be064bed 100644 --- a/aten/src/ATen/native/cuda/DepthwiseConv3d.cu +++ b/aten/src/ATen/native/cuda/DepthwiseConv3d.cu @@ -1,10 +1,20 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif + #include #include #include @@ -596,9 +606,10 @@ std::tuple _depthwise_3d_backward_cuda_out( TORCH_CHECK(padding[i] * 2 + input.size(i + 2) <= int_max, "Padded input tensor is too large."); } - TORCH_CHECK(grad_output_.size(0) * grad_output_.size(2) < int_max - block / C10_WARP_SIZE && - grad_output_.size(3) <= int_max - C10_WARP_SIZE && - grad_output_.size(4) <= int_max - C10_WARP_SIZE, + int64_t warp_size = at::cuda::warp_size(); + TORCH_CHECK(grad_output_.size(0) * grad_output_.size(2) < int_max - block / warp_size && 
+ grad_output_.size(3) <= int_max - warp_size && + grad_output_.size(4) <= int_max - warp_size, "Output size is too large."); DWCONV3D_BACKWARD_WEIGHT_DISPATCH_SPECIALIZATION(1, 1) diff --git a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu index e651ab80f47b..05a201147241 100644 --- a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu +++ b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu @@ -1,6 +1,8 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include +#include #include #include #include @@ -12,6 +14,13 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif + namespace at { namespace native { namespace { @@ -128,8 +137,8 @@ __global__ void max_pool_forward_nhwc(const scalar_t* bottom_data, const int nba hstart += dilation_h; while(wstart < 0) wstart += dilation_w; - for (int ih = hstart; ih < hend; ih++) { - for (int iw = wstart; iw < wend; iw++) { + for (int ih = hstart; ih < hend; ih += dilation_h) { + for (int iw = wstart; iw < wend; iw += dilation_w) { int cached_index = threadIdx.x; const scalar_t *ptr_input = bottom_data + ih * in_stride_h + iw * in_stride_w; for(int c = channel_offset; c < channels; c+= blockDim.x*kernel_stride_C) { diff --git a/aten/src/ATen/native/cuda/DilatedMaxPool3d.cu b/aten/src/ATen/native/cuda/DilatedMaxPool3d.cu index 67f5f41b9232..12817d5f66ea 100644 --- a/aten/src/ATen/native/cuda/DilatedMaxPool3d.cu +++ b/aten/src/ATen/native/cuda/DilatedMaxPool3d.cu @@ -1,5 +1,8 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include +#include #include #include #include @@ -11,6 +14,15 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/DistanceKernel.cu b/aten/src/ATen/native/cuda/DistanceKernel.cu index b9cd1b31461e..a9130bd3e808 100644 --- a/aten/src/ATen/native/cuda/DistanceKernel.cu +++ b/aten/src/ATen/native/cuda/DistanceKernel.cu @@ -1,4 +1,6 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include #include #include @@ -6,6 +8,13 @@ #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif + #include namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/DistributionBernoulli.cu b/aten/src/ATen/native/cuda/DistributionBernoulli.cu index 8c9c59e7861e..a7967122db9c 100644 --- a/aten/src/ATen/native/cuda/DistributionBernoulli.cu +++ b/aten/src/ATen/native/cuda/DistributionBernoulli.cu @@ -1,6 +1,5 @@ +#define TORCH_ASSERT_NO_OPERATORS #include -#include -#include #include #include #include @@ -24,12 +23,12 @@ namespace at { namespace native { -void bernoulli_tensor_kernel(Tensor& self, const Tensor& p_, c10::optional gen_) { +void bernoulli_tensor_kernel(const TensorBase &self, const TensorBase &p_, c10::optional gen_) { auto generator = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::bernoulli_kernel(self, p_, generator); } -void bernoulli_scalar_kernel(Tensor& self, double p, c10::optional gen) { +void bernoulli_scalar_kernel(const TensorBase &self, double p, c10::optional gen) { auto iter = TensorIterator::borrowing_nullary_op(self); auto generator = get_generator_or_default(gen, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::bernoulli_kernel(iter, p, generator); diff --git a/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu 
b/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu index 0b4849d1a449..27f316bc82b4 100644 --- a/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu @@ -1,27 +1,8 @@ -#include -#include -#include -#include -#include +#define TORCH_ASSERT_NO_OPERATORS #include #include #include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - namespace at { namespace native { void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, c10::optional gen) { diff --git a/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu b/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu index 3ff39c3907a2..4dac756a2aaf 100644 --- a/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu @@ -1,27 +1,8 @@ -#include -#include -#include -#include -#include +#define TORCH_ASSERT_NO_OPERATORS #include #include #include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - namespace at { namespace native { void exponential_kernel(TensorIteratorBase& iter, double lambda, c10::optional gen) { diff --git a/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu b/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu index 35ddcc65330b..4bfe6cb692b5 100644 --- a/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu @@ -1,27 +1,8 @@ -#include -#include -#include -#include -#include +#define TORCH_ASSERT_NO_OPERATORS #include #include #include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - namespace at { namespace native { void geometric_kernel(TensorIteratorBase& iter, double p_, c10::optional gen) { diff --git a/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu b/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu index 155759b18f57..f7b094ed6252 100644 --- a/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu @@ -1,27 +1,8 @@ -#include -#include -#include -#include -#include +#define TORCH_ASSERT_NO_OPERATORS #include #include #include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - namespace at { namespace native { void log_normal_kernel(TensorIteratorBase& iter, double mean, double std, c10::optional gen) { diff --git a/aten/src/ATen/native/cuda/DistributionNormal.cu b/aten/src/ATen/native/cuda/DistributionNormal.cu index 025c70c42601..28330dbd69aa 100644 --- a/aten/src/ATen/native/cuda/DistributionNormal.cu +++ b/aten/src/ATen/native/cuda/DistributionNormal.cu @@ -1,30 +1,11 @@ -#include -#include -#include -#include -#include -#include +#define TORCH_ASSERT_NO_OPERATORS #include +#include #include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - namespace at { namespace native { -void normal_kernel(Tensor& self, double mean, double std, c10::optional gen) { +void normal_kernel(const TensorBase &self, double mean, double std, c10::optional gen) { auto generator = get_generator_or_default(gen, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::normal_kernel(self, mean, std, generator); } diff --git 
a/aten/src/ATen/native/cuda/DistributionRandomKernel.cu b/aten/src/ATen/native/cuda/DistributionRandomKernel.cu index 5da90a820a14..0607e4fa804e 100644 --- a/aten/src/ATen/native/cuda/DistributionRandomKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionRandomKernel.cu @@ -1,27 +1,8 @@ -#include -#include -#include -#include -#include +#define TORCH_ASSERT_NO_OPERATORS #include #include #include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - namespace at { namespace native { void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, c10::optional gen_) { diff --git a/aten/src/ATen/native/cuda/DistributionTemplates.h b/aten/src/ATen/native/cuda/DistributionTemplates.h index 54324cbbaf54..6a096b42f719 100644 --- a/aten/src/ATen/native/cuda/DistributionTemplates.h +++ b/aten/src/ATen/native/cuda/DistributionTemplates.h @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include #include @@ -231,7 +231,7 @@ __global__ void distribution_binary_elementwise_kernel( } template -void distribution_binary_kernel(TensorIterator &iter, PhiloxCudaState philox_args, const func_t &f) { +void distribution_binary_kernel(TensorIteratorBase &iter, PhiloxCudaState philox_args, const func_t &f) { static_assert(std::is_same::template arg<0>::type, curandStatePhilox4_32_10_t&>::value, "the first argument of functor must be curandStatePhilox4_32_10_t"); using input_t_1 = typename function_traits::template arg<1>::type; using input_t_2 = typename function_traits::template arg<2>::type; @@ -430,7 +430,7 @@ void normal_and_transform(TensorIteratorBase& iter, RNG gen, transform_t transfo // ==================================================== Normal ======================================================== template -void normal_kernel(Tensor& self, double mean_, double std_, RNG gen) { +void normal_kernel(const TensorBase &self, double mean_, double std_, RNG gen) { auto iter = TensorIterator::borrowing_nullary_op(self); AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "normal_kernel_cuda", [&] { using accscalar_t = at::acc_type; @@ -446,7 +446,7 @@ void normal_kernel(Tensor& self, double mean_, double std_, RNG gen) { template struct NormalKernel { - void operator()(Tensor& self, double mean, double std, c10::optional gen) { + void operator()(const TensorBase &self, double mean, double std, c10::optional gen) { normal_kernel(self, mean, std, check_generator(gen)); } }; @@ -574,7 +574,7 @@ struct CauchyKernel { template void bernoulli_tensor_cuda_kernel( - at::Tensor& ret, const at::Tensor& p, + const TensorBase &ret, const at::TensorBase &p, PhiloxCudaState philox_args) { auto functor = [philox_args] __device__( int n, scalar_t& v1, scalar_t& v2, scalar_t& v3, scalar_t& v4, @@ -618,7 +618,7 @@ void bernoulli_tensor_cuda_kernel( } template -void bernoulli_kernel(Tensor& self, const Tensor& p_, RNG gen) { +void bernoulli_kernel(const TensorBase &self, const TensorBase &p_, RNG gen) { PhiloxCudaState rng_engine_inputs; { // See Note [Acquire lock when using random generators] @@ -626,14 +626,10 @@ void bernoulli_kernel(Tensor& self, const Tensor& p_, RNG gen) { rng_engine_inputs = gen->philox_cuda_state(10); } TORCH_CHECK(at::isFloatingType(p_.scalar_type()), "expected probabilities tensor to have floating type, got ", p_.scalar_type()); - auto p_CUDA = p_.to(kCUDA); - //cast probabilities tensor to double for double `self` tensor, and to `float` for everything 
else - if (self.dtype() == at::kDouble) { - p_CUDA = p_CUDA.to(at::kDouble); - } else { - p_CUDA = p_CUDA.to(at::kFloat); - } - c10::MaybeOwned p = expand_inplace(self, p_CUDA); + // cast probabilities tensor to double for double `self` tensor, and to `float` for everything else + const auto p_type = self.dtype() == at::kDouble ? at::kDouble : at::kFloat; + auto p_cuda = p_.to(TensorOptions().device(self.device()).dtype(p_type)); + auto p = expand_inplace(self, p_cuda); AT_DISPATCH_ALL_TYPES_AND3( at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, self.scalar_type(), "bernoulli_tensor_cuda_self_", [&] { if (std::is_same::value) { @@ -662,7 +658,7 @@ struct BernoulliKernel { void operator()(TensorIteratorBase& iter, double p, c10::optional gen) { bernoulli_kernel(iter, p, check_generator(gen)); } - void operator()(Tensor& self, const Tensor& p_, c10::optional gen) { + void operator()(const TensorBase &self, const TensorBase &p_, c10::optional gen) { bernoulli_kernel(self, p_, check_generator(gen)); } }; diff --git a/aten/src/ATen/native/cuda/DistributionUniform.cu b/aten/src/ATen/native/cuda/DistributionUniform.cu index 04bc172ed23d..a848f0fd48f5 100644 --- a/aten/src/ATen/native/cuda/DistributionUniform.cu +++ b/aten/src/ATen/native/cuda/DistributionUniform.cu @@ -1,9 +1,7 @@ -#include +#define TORCH_ASSERT_NO_OPERATORS #include #include #include -#include -#include namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/Distributions.cpp b/aten/src/ATen/native/cuda/Distributions.cpp new file mode 100644 index 000000000000..fc885d867445 --- /dev/null +++ b/aten/src/ATen/native/cuda/Distributions.cpp @@ -0,0 +1,84 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#endif + +namespace at { namespace native { + +Tensor _s_poisson_cuda(const Tensor& lambda, c10::optional gen_) { + auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); + Tensor ret = at::empty(lambda.sizes(), lambda.options()); + launch_poisson_cuda_kernel(ret, lambda, gen); + return ret; +} + +Tensor _s_binomial_cuda(const Tensor& count, const Tensor& prob, c10::optional gen_) { + auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); + Tensor ret = at::empty(count.sizes(), count.options()); + at::TensorIterator iter = at::TensorIteratorConfig() + .add_output(ret) + .add_input(count) + .add_input(prob) + .build(); + launch_binomial_cuda_kernel(iter, gen); + return ret; +} + +Tensor _s_gamma_cuda(const Tensor& alpha, c10::optional gen_) { + auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); + Tensor ret = at::empty(alpha.sizes(), alpha.options()); + launch_gamma_kernel(ret, alpha, gen); + return ret; +} + +Tensor _s_dirichlet_cuda(const Tensor& alpha, c10::optional gen_) { + auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); + Tensor ret = at::empty(alpha.sizes(), alpha.options()); + launch_gamma_kernel(ret, alpha, gen); + auto gamma_sum = ret.sum(/*dim=*/-1, /*keepdim=*/true); + at::TensorIterator iter = at::TensorIteratorConfig() + .add_output(ret) + .add_input(ret) + .add_input(gamma_sum) + .build(); + launch_dirichlet_kernel(iter); + return ret; +} + +Tensor _standard_gamma_grad_cuda(const Tensor& self, const Tensor& output) { + Tensor ret = at::empty(self.sizes(), self.options()); + TensorIterator iter = 
at::TensorIteratorConfig() + .add_output(ret) + .add_input(self) + .add_input(output) + .build(); + launch_standard_gamma_grad_kernel(iter); + return ret; +} + +Tensor _dirichlet_grad_cuda(const Tensor& x, const Tensor& alpha, const Tensor& total) { + Tensor ret = at::empty(x.sizes(), x.options()); + TensorIterator iter = at::TensorIteratorConfig() + .add_output(ret) + .add_input(x) + .add_input(alpha) + .add_input(total) + .build(); + launch_dirichlet_grad_kernel(iter); + return ret; +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/Distributions.cu b/aten/src/ATen/native/cuda/Distributions.cu index d7ab78c18129..717ad4d985d4 100644 --- a/aten/src/ATen/native/cuda/Distributions.cu +++ b/aten/src/ATen/native/cuda/Distributions.cu @@ -1,6 +1,6 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include #include -#include -#include #include #include #include @@ -42,8 +42,8 @@ namespace { template void poisson_cuda_kernel( - at::Tensor& ret, - const at::Tensor& lambda, + const at::TensorBase &ret, + const at::TensorBase &lambda, at::PhiloxCudaState philox_args) { auto functor = [philox_args] __device__( scalar_t & ret_val, const scalar_t& lambda) { @@ -74,19 +74,12 @@ struct curand_uniform_wrapper { template void binomial_cuda_kernel( - at::Tensor& ret, - const at::Tensor& count, - const at::Tensor& prob, + at::TensorIteratorBase &iter, at::PhiloxCudaState philox_args) { using accscalar_t = at::acc_type; - at::TensorIterator iter = at::TensorIteratorConfig() - .add_output(ret) - .add_input(count) - .add_input(prob) - .build(); at::native::distribution_binary_kernel(iter, philox_args, - [philox_args] GPU_LAMBDA (curandStatePhilox4_32_10_t& state, scalar_t count, scalar_t prob) { + [] GPU_LAMBDA (curandStatePhilox4_32_10_t& state, scalar_t count, scalar_t prob) { #if defined(__CUDA_ARCH__) || defined(USE_ROCM) auto uniform_lambda = curand_uniform_wrapper(state); BaseSampler standard_uniform(uniform_lambda); @@ -101,8 +94,8 @@ void binomial_cuda_kernel( template void gamma_cuda_kernel( - at::Tensor& ret, - const at::Tensor& alpha, + const at::TensorBase &ret, + const at::TensorBase &alpha, at::PhiloxCudaState philox_args) { using accscalar_t = at::acc_type; auto functor = [philox_args] __device__( @@ -132,18 +125,16 @@ void gamma_cuda_kernel( /*min_blocks_per_sm==*/2>(ret, alpha, functor); } -template -void dirichlet_scalar_cuda_kernel( - at::Tensor& ret, - const at::Tensor& gamma) { - auto gamma_sum = gamma.sum(-1, true); - at::TensorIterator iter = at::TensorIteratorConfig() - .add_output(ret) - .add_input(gamma) - .add_input(gamma_sum) - .build(); - at::native::gpu_kernel(iter, - [] GPU_LAMBDA (scalar_t gamma, scalar_t gamma_sum) { +} // namespace + +namespace at { namespace native { + +void launch_dirichlet_kernel(at::TensorIteratorBase &iter) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, + iter.input_dtype(), "dirichlet_cuda", [&] { + at::native::gpu_kernel( + iter, + [] GPU_LAMBDA (scalar_t gamma, scalar_t gamma_sum) { auto ret_val = gamma / gamma_sum; auto min_value = std::numeric_limits::min(); auto max_value = 1 - std::numeric_limits::epsilon(); @@ -151,107 +142,66 @@ void dirichlet_scalar_cuda_kernel( ret_val = (max_value < ret_val) ? 
max_value : ret_val; return ret_val; }); + }); } -} // namespace - -namespace at { namespace native { - -Tensor _s_poisson_cuda(const Tensor& lambda, c10::optional gen_) { - auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); +void launch_poisson_cuda_kernel( + const TensorBase &ret, const TensorBase &lambda, CUDAGeneratorImpl *gen) { PhiloxCudaState rng_engine_inputs; { // See Note [Acquire lock when using random generators] std::lock_guard lock(gen->mutex_); rng_engine_inputs = gen->philox_cuda_state(20); } - Tensor ret = at::empty(lambda.sizes(), lambda.options()); AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, ret.scalar_type(), "poisson_cuda", [&] { poisson_cuda_kernel(ret, lambda, rng_engine_inputs); }); - return ret; } -Tensor _s_binomial_cuda(const Tensor& count, const Tensor& prob, c10::optional gen_) { - auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); +void launch_binomial_cuda_kernel( + TensorIteratorBase &iter, CUDAGeneratorImpl *gen) { PhiloxCudaState rng_engine_inputs; { // See Note [Acquire lock when using random generators] std::lock_guard lock(gen->mutex_); rng_engine_inputs = gen->philox_cuda_state(42); } - Tensor ret = at::empty(count.sizes(), count.options()); - AT_DISPATCH_FLOATING_TYPES_AND_HALF(ret.scalar_type(), "binomial_cuda", [&] { - binomial_cuda_kernel(ret, count, prob, rng_engine_inputs); + AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.input_dtype(), "binomial_cuda", [&] { + binomial_cuda_kernel(iter, rng_engine_inputs); }); - return ret; } -Tensor _s_gamma_cuda(const Tensor& alpha, c10::optional gen_) { - auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); +void launch_gamma_kernel( + const TensorBase &ret, const TensorBase &alpha, CUDAGeneratorImpl *gen) { PhiloxCudaState rng_engine_inputs; { // See Note [Acquire lock when using random generators] std::lock_guard lock(gen->mutex_); rng_engine_inputs = gen->philox_cuda_state(10); } - Tensor ret = at::empty(alpha.sizes(), alpha.options()); AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, ret.scalar_type(), "gamma_cuda", [&] { gamma_cuda_kernel(ret, alpha, rng_engine_inputs); }); - return ret; } -Tensor _s_dirichlet_cuda(const Tensor& alpha, c10::optional gen_) { - auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); - PhiloxCudaState rng_engine_inputs; - { - // See Note [Acquire lock when using random generators] - std::lock_guard lock(gen->mutex_); - rng_engine_inputs = gen->philox_cuda_state(10); - } - Tensor ret = at::empty(alpha.sizes(), alpha.options()); - AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, ret.scalar_type(), "dirichlet", [&] { - Tensor gamma = at::empty(alpha.sizes(), alpha.options()); - gamma_cuda_kernel(gamma, alpha, rng_engine_inputs); - dirichlet_scalar_cuda_kernel(ret, gamma); - }); - return ret; -} - -Tensor _standard_gamma_grad_cuda(const Tensor& self, const Tensor& output) { - Tensor ret = at::empty(self.sizes(), self.options()); - TensorIterator iter = at::TensorIteratorConfig() - .add_output(ret) - .add_input(self) - .add_input(output) - .build(); - AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.common_dtype(), "_standard_gamma_grad_cuda", [&] { +void launch_standard_gamma_grad_kernel(TensorIteratorBase &iter) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.input_dtype(), 
"_standard_gamma_grad_cuda", [&] { using accscalar_t = at::acc_type; gpu_kernel(iter, [] GPU_LAMBDA (scalar_t self_val, scalar_t output_val) { return standard_gamma_grad_one(self_val, output_val); }); }); - return ret; } -Tensor _dirichlet_grad_cuda(const Tensor& x, const Tensor& alpha, const Tensor& total) { - Tensor ret = at::empty(x.sizes(), x.options()); - TensorIterator iter = at::TensorIteratorConfig() - .add_output(ret) - .add_input(x) - .add_input(alpha) - .add_input(total) - .build(); - AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "_dirichlet_grad_cuda", [&] { +void launch_dirichlet_grad_kernel(TensorIteratorBase &iter) { + AT_DISPATCH_FLOATING_TYPES(iter.input_dtype(), "_dirichlet_grad_cuda", [&] { using accscalar_t = at::acc_type; - gpu_kernel(iter, + at::native::gpu_kernel(iter, [] GPU_LAMBDA (scalar_t x_val, scalar_t alpha_val, scalar_t total_val) -> scalar_t { return dirichlet_grad_one(x_val, alpha_val, total_val); }); }); - return ret; } }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/Distributions.h b/aten/src/ATen/native/cuda/Distributions.h new file mode 100644 index 000000000000..1a34fdfdf314 --- /dev/null +++ b/aten/src/ATen/native/cuda/Distributions.h @@ -0,0 +1,25 @@ +#pragma once + +namespace at { +struct CUDAGeneratorImpl; +struct TensorIteratorBase; +class TensorBase; + +namespace native { + +void launch_poisson_cuda_kernel( + const TensorBase &ret, const TensorBase &lambda, CUDAGeneratorImpl *gen); + +void launch_gamma_kernel( + const TensorBase &ret, const TensorBase &alpha, CUDAGeneratorImpl *gen); + +void launch_binomial_cuda_kernel( + TensorIteratorBase &iter, CUDAGeneratorImpl *gen); + +void launch_dirichlet_kernel(TensorIteratorBase &iter); + +void launch_standard_gamma_grad_kernel(TensorIteratorBase &iter); + +void launch_dirichlet_grad_kernel(TensorIteratorBase &iter); + +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/Dropout.cu b/aten/src/ATen/native/cuda/Dropout.cu index 528a43646b9b..6ec054aa6050 100644 --- a/aten/src/ATen/native/cuda/Dropout.cu +++ b/aten/src/ATen/native/cuda/Dropout.cu @@ -1,6 +1,9 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include +#include +#include #include #include #include @@ -11,6 +14,17 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#endif + namespace at{ namespace native{ diff --git a/aten/src/ATen/native/cuda/Embedding.cu b/aten/src/ATen/native/cuda/Embedding.cu index edf7e31d5621..8a241cabcd2d 100644 --- a/aten/src/ATen/native/cuda/Embedding.cu +++ b/aten/src/ATen/native/cuda/Embedding.cu @@ -1,5 +1,7 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include +#include #include #include #include @@ -11,11 +13,24 @@ #include #include #include +#include #if CUB_SUPPORTS_SCAN_BY_KEY() #include #endif +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#endif + namespace at { namespace native { namespace { @@ -249,8 +264,9 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice auto indices_contig = indices.contiguous(); auto grad_weight = at::zeros({num_weights, grad_.size(-1)}, grad_.options()); int64_t stride = grad_weight.stride(0); - dim3 grid(ceil_div(stride, (int64_t)C10_WARP_SIZE)); - dim3 block(C10_WARP_SIZE, BLOCKDIMY); + int warp_size = at::cuda::warp_size(); + dim3 grid(ceil_div(stride, (int64_t)warp_size)); + dim3 block(warp_size, BLOCKDIMY); 
AT_DISPATCH_FLOATING_TYPES_AND2( at::ScalarType::Half, at::ScalarType::BFloat16, @@ -263,7 +279,7 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice embedding_backward_feature_kernel <<>> (indices_contig.data_ptr(), grad.data_ptr(), @@ -352,18 +368,18 @@ Tensor & embedding_renorm_cuda_(Tensor & self, const Tensor & indices, num_indices ); - constexpr int num_threads = 128; - static_assert(num_threads % C10_WARP_SIZE == 0 && - num_threads <= cuda_utils::kCUDABlockReduceMaxThreads, + int warp_size = at::cuda::warp_size(); + TORCH_INTERNAL_ASSERT(num_threads() % warp_size == 0 && + num_threads() <= cuda_utils::kCUDABlockReduceMaxThreads, "BlockReduceSum requires all warps be active"); int64_t *num_unique_indices_ptr = num_unique_indices.data_ptr(); dim3 grid = unique_indices.numel(); - dim3 block = num_threads; + dim3 block = num_threads(); int dim = self.stride(0); AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "embedding_renorm_cuda_", [&] { using accscalar_t = acc_type; - renorm_kernel<<>>( + renorm_kernel<<>>( self.data_ptr(), unique_indices.data_ptr(), static_cast(max_norm), diff --git a/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu b/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu index afb2f25cc346..1a2c7627fc73 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu +++ b/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu @@ -1,15 +1,26 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include -#include +#include +#include +#include #include -#include #include -#include - #include +#if CUB_SUPPORTS_UNIQUE_BY_KEY() +#include +#endif + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif + namespace at { namespace native { @@ -35,7 +46,8 @@ int64_t ceil_div(int64_t x, int64_t y) { template __global__ void krn_partials_per_segment(index_t *ret, const index_t *segment_offsets, - int64_t num_of_segments, int64_t numel) { + int64_t *num_of_segments_ptr, int64_t numel) { + int64_t num_of_segments = *num_of_segments_ptr; const int id = blockIdx.x * blockDim.x + threadIdx.x; if(id < num_of_segments) { const int64_t idx_start = segment_offsets[id]; @@ -52,7 +64,8 @@ void krn_partial_segment_offset( const index_t *partials_per_segment, const index_t *partials_per_segment_offset, const index_t *segment_offsets, - int64_t num_of_segments) { + int64_t *num_of_segments_ptr) { + int64_t num_of_segments = *num_of_segments_ptr; const int id = blockIdx.x * blockDim.x + threadIdx.x; if(id < num_of_segments) { index_t idx = partials_per_segment_offset[id]; @@ -71,10 +84,11 @@ __global__ void compute_grad_weight_bags( index_t *offset2bag, index_t *count, ptrdiff_t numel, int64_t stride, int mode_mean, const index_t *bag_size, scalar_t* per_sample_weights, int64_t per_sample_weights_stride, - index_t* segment_offsets, int64_t num_of_segments, + index_t* segment_offsets, int64_t *num_of_segments_ptr, acc_type *grad_weight_per_segment, const int64_t stride_warped) { + int64_t num_of_segments = *num_of_segments_ptr; const int gid = blockIdx.x * blockDim.x + threadIdx.x; const int id = gid / stride_warped; const int startFeature = gid % stride_warped; @@ -115,10 +129,11 @@ __global__ void compute_grad_weight( ptrdiff_t numel, int64_t stride, index_t* segment_offsets, - int64_t num_of_segments, + int64_t *num_of_segments_ptr, acc_type *grad_weight_per_segment, const int64_t stride_warped) { + int64_t num_of_segments = *num_of_segments_ptr; using 
accscalar_t = acc_type; const int gid = blockIdx.x * blockDim.x + threadIdx.x; const int id = gid / stride_warped; @@ -145,12 +160,14 @@ __global__ void compute_grad_weight( template __global__ void sum_and_scatter( index_t *input, scalar_t *gradWeight, int64_t stride, - index_t* segment_offsets, int64_t num_of_segments, + index_t* segment_offsets, int64_t *num_of_segments_ptr, const acc_type *grad_weight_per_segment, - const index_t *segment_sizes_offsets, int64_t num_of_partial_segments, + const index_t *segment_sizes_offsets, int64_t *num_of_partial_segments_ptr, const int64_t padding_idx, const int64_t stride_warped) { + int64_t num_of_segments = *num_of_segments_ptr; + int64_t num_of_partial_segments = *num_of_partial_segments_ptr; const int gid = blockIdx.x * blockDim.x + threadIdx.x; const int id = gid / stride_warped; const int startFeature = gid % stride_warped; @@ -173,10 +190,23 @@ __global__ void sum_and_scatter( } } +template +__global__ void compute_num_of_partial_segments(index_t *partials_per_segment, index_t *partials_per_segment_offset, int64_t *num_of_segments_ptr, int64_t *output) { + int64_t num_of_segments = *num_of_segments_ptr; + *output = partials_per_segment[num_of_segments-1] + + partials_per_segment_offset[num_of_segments-1]; +} + +__global__ void write_num_of_segments_for_legacy_thrust_path(int64_t *num_of_segments_ptr, int64_t num_of_segments) { + *num_of_segments_ptr = num_of_segments; +} + } // anon namespace +#if !CUB_SUPPORTS_UNIQUE_BY_KEY() template int64_t embedding_backward_cuda_kernel_unique_by_key(const Tensor &sorted_indices, Tensor &segment_offsets); +#endif Tensor embedding_backward_cuda_kernel( const Tensor &grad, @@ -200,19 +230,35 @@ Tensor embedding_backward_cuda_kernel( // spawn a warp per index. In this context, a segment is a number of rows that should // be summarized. 
// Unit: index in `sorted_indices` and `orig_indices` + auto segment_offsets = at::empty({numel}, orig_indices.options()); + auto num_of_segments_tensor = at::empty({}, grad.options().dtype(kLong)); + int64_t *num_of_segments_ptr = num_of_segments_tensor.data_ptr(); +#if !CUB_SUPPORTS_UNIQUE_BY_KEY() AT_DISPATCH_INDEX_TYPES(orig_indices.scalar_type(), "embedding_backward_cuda_kernel", [&] () { - auto segment_offsets = at::empty({numel}, orig_indices.options()); int64_t num_of_segments = embedding_backward_cuda_kernel_unique_by_key(sorted_indices, segment_offsets); + write_num_of_segments_for_legacy_thrust_path<<<1, 1, 0, c10::cuda::getCurrentCUDAStream()>>>(num_of_segments_ptr, num_of_segments); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); +#else + AT_DISPATCH_INDEX_TYPES(orig_indices.scalar_type(), "embedding_backward_cuda_kernel", [&] () { + auto num_of_segments_tensor = at::empty({}, grad.options().dtype(kLong)); + cuda::cub::unique_by_key( + sorted_indices.data_ptr(), thrust::make_counting_iterator(0), + nullptr, segment_offsets.data_ptr(), + num_of_segments_ptr, sorted_indices.numel()); + }); +#endif + AT_DISPATCH_INDEX_TYPES(orig_indices.scalar_type(), "embedding_backward_cuda_kernel", [&] () { // We split the segments up into sizes of `NROWS_PER_THREAD` // Compute the number partial-segments per segment (some partial-segments // may not be the full `NROWS_PER_THREAD` number of rows) - auto partials_per_segment = at::empty({num_of_segments}, orig_indices.options()); + auto partials_per_segment = at::empty({numel}, orig_indices.options()); { - krn_partials_per_segment<<>> ( + krn_partials_per_segment<<>> ( partials_per_segment.data_ptr(), segment_offsets.data_ptr(), - num_of_segments, + num_of_segments_ptr, numel); C10_CUDA_KERNEL_LAUNCH_CHECK(); } @@ -221,32 +267,38 @@ Tensor embedding_backward_cuda_kernel( // of each partial-segment in `sorted_indices`, we need to compute the // start position of each _segment_ in `partial_segment_offset`. 
// Unit: index in `partial_segment_offset` - auto partials_per_segment_offset = at::empty({num_of_segments}, orig_indices.options()); + auto partials_per_segment_offset = at::empty({numel}, orig_indices.options()); cuda::cub::exclusive_sum( partials_per_segment.data_ptr(), partials_per_segment_offset.data_ptr(), - num_of_segments); + numel); // The total number of partial-segments is the sum of `partials_per_segment_offset` - const int num_of_partial_segments = partials_per_segment[num_of_segments-1].item() + - partials_per_segment_offset[num_of_segments-1].item(); + auto num_of_partial_segments_tensor = at::empty({}, grad.options().dtype(kLong)); + int64_t *num_of_partial_segments_ptr = num_of_partial_segments_tensor.data_ptr(); + compute_num_of_partial_segments<<<1, 1, 0, c10::cuda::getCurrentCUDAStream()>>>( + partials_per_segment.data_ptr(), + partials_per_segment_offset.data_ptr(), + num_of_segments_ptr, num_of_partial_segments_ptr); + C10_CUDA_KERNEL_LAUNCH_CHECK(); // Now we can compute the start position of each partial-segment // Unit: index in `sorted_indices` and `orig_indices` - auto partial_segment_offset = at::empty({num_of_partial_segments}, orig_indices.options()); + auto partial_segment_offset = at::empty({numel}, orig_indices.options()); { - krn_partial_segment_offset<<>> ( + krn_partial_segment_offset<<>> ( partial_segment_offset.data_ptr(), partials_per_segment.data_ptr(), partials_per_segment_offset.data_ptr(), segment_offsets.data_ptr(), - num_of_segments); + num_of_segments_ptr); C10_CUDA_KERNEL_LAUNCH_CHECK(); } - const int stride_warped = ceil_div(stride, C10_WARP_SIZE)*C10_WARP_SIZE; + const int warp_size = at::cuda::warp_size(); + const int stride_warped = ceil_div(stride, warp_size)*warp_size; const int block = std::min(stride_warped, MAX_BLOCK_SIZE); - const int grid = ceil_div(num_of_partial_segments*stride_warped, block); + const int grid = ceil_div(numel*stride_warped, block); AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, grad.scalar_type(), "embedding_bag_backward_cuda_compute_grad_weight", [&] { @@ -259,7 +311,7 @@ Tensor embedding_backward_cuda_kernel( } else { op = grad.options(); } - auto grad_weight_per_segment = at::empty({num_of_partial_segments, stride}, op); + auto grad_weight_per_segment = at::empty({numel, stride}, op); // Compute the sum of each partial-segment and handle bags if (offset2bag.defined()) { compute_grad_weight_bags<<>>( @@ -271,7 +323,7 @@ Tensor embedding_backward_cuda_kernel( per_sample_weights.defined() ? per_sample_weights.data_ptr() : NULL, per_sample_weights.defined() ? per_sample_weights.stride(0) : 0, partial_segment_offset.data_ptr(), - num_of_partial_segments, grad_weight_per_segment.data_ptr(), + num_of_partial_segments_ptr, grad_weight_per_segment.data_ptr(), stride_warped); C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { @@ -281,7 +333,7 @@ Tensor embedding_backward_cuda_kernel( count.defined() ? count.data_ptr() : nullptr, numel, stride, partial_segment_offset.data_ptr(), - num_of_partial_segments, + num_of_partial_segments_ptr, grad_weight_per_segment.data_ptr(), stride_warped); C10_CUDA_KERNEL_LAUNCH_CHECK(); @@ -289,15 +341,15 @@ Tensor embedding_backward_cuda_kernel( // Finally, we sum all the partial-sums and scatter them // into `grad_weight`. 
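[Editor's note] The embedding-backward rewrite above no longer copies num_of_segments or num_of_partial_segments back to the host (the removed .item<index_t>() reads); the counts stay in device memory, each kernel receives an int64_t* and dereferences it, and host-side allocations and grids are sized by the upper bound numel instead. A standalone sketch of that pattern, with illustrative names only (this is not the shipped kernel code):

#include <cuda_runtime.h>
#include <cstdint>

// Single-thread helper: total partial segments = last count + last exclusive prefix sum,
// computed on the device so the host never has to read it.
__global__ void compute_total(const int64_t *per_segment,
                              const int64_t *per_segment_offset,
                              const int64_t *num_segments_ptr,
                              int64_t *total_out) {
  const int64_t n = *num_segments_ptr;
  *total_out = per_segment[n - 1] + per_segment_offset[n - 1];
}

// Worker kernel: the grid is sized for the worst case known on the host;
// threads past the device-resident total simply bail out, so no sync is needed.
__global__ void process_partial_segments(const int64_t *total_ptr, float *out) {
  const int64_t total = *total_ptr;
  const int64_t i = blockIdx.x * (int64_t)blockDim.x + threadIdx.x;
  if (i < total) {
    out[i] = 0.0f;  // placeholder for per-partial-segment work
  }
}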
- const int grid2 = ceil_div(num_of_segments*stride_warped, block); + const int grid2 = ceil_div(numel*stride_warped, block); sum_and_scatter<<>>( sorted_indices.data_ptr(), grad_weight.data_ptr(), stride, segment_offsets.data_ptr(), - num_of_segments, grad_weight_per_segment.data_ptr(), + num_of_segments_ptr, grad_weight_per_segment.data_ptr(), partials_per_segment_offset.data_ptr(), - num_of_partial_segments, + num_of_partial_segments_ptr, padding_idx, stride_warped); C10_CUDA_KERNEL_LAUNCH_CHECK(); diff --git a/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cuh b/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cuh index 7b8fc9576e21..0d8d45c1defb 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cuh +++ b/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cuh @@ -1,10 +1,8 @@ -#include +#pragma once +#include #include #include #include -#include - -#pragma once namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu index 4c842f2c7bcd..7ac3a7151b79 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBag.cu +++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu @@ -1,12 +1,26 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include +#include #include #include #include #include -#include -#include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#endif #include #include @@ -53,7 +67,7 @@ __global__ void EmbeddingBag_updateOutputKernel_max( index_t *offset2bag, int64_t numIndices, int64_t numBags, int64_t featureSize, int64_t weight_stride0, int64_t weight_stride1, index_t *bag_size, index_t *max_indices, - index_t padding_idx, int64_t vocab_size) { + index_t padding_idx) { // the strategy here is that each bag x feature is handled by a single thread @@ -74,7 +88,6 @@ __global__ void EmbeddingBag_updateOutputKernel_max( int64_t bag_size_ = 0; int64_t maxWord = -1; for (int64_t emb = begin; emb < end; emb++) { - CUDA_KERNEL_ASSERT(input[emb] >= 0 && input[emb] < vocab_size); bool pad = (input[emb] == padding_idx); const int64_t weightRow = input[emb] * weight_stride0; scalar_t weightValue = weightFeat[weightRow]; @@ -104,7 +117,7 @@ __global__ void EmbeddingBag_updateOutputKernel_sum_mean( int64_t featureSize, int64_t weight_stride0, int64_t weight_stride1, int mode, index_t *bag_size, scalar_t* per_sample_weights, int64_t per_sample_weights_stride, - index_t padding_idx, int64_t vocab_size) { + index_t padding_idx) { // the strategy here is that each bag x feature is handled by a single thread @@ -125,7 +138,6 @@ __global__ void EmbeddingBag_updateOutputKernel_sum_mean( accscalar_t weightFeatSum = 0; int64_t bag_size_ = 0; for (int64_t emb = begin; emb < end; emb++) { - CUDA_KERNEL_ASSERT(input[emb] >= 0 && input[emb] < vocab_size); bool pad = (input[emb] == padding_idx); const int64_t weightRow = input[emb] * weight_stride0; scalar_t weightValue = weightFeat[weightRow]; @@ -350,7 +362,6 @@ _embedding_bag_cuda(const Tensor &weight, const Tensor &indices_, numBags -= 1; } int64_t featureSize = weight.size(1); - int64_t vocabSize = weight.size(0); auto bag_size = at::empty(offsets.sizes(), indices.options()); auto offset2bag = @@ -384,7 +395,7 @@ _embedding_bag_cuda(const Tensor &weight, const Tensor &indices_, offset2bag.data_ptr(), numIndices, numBags, featureSize, weight.stride(0), weight.stride(1), bag_size.data_ptr(), max_indices.data_ptr(), - padding_idx, vocabSize); + padding_idx); 
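[Editor's note] The EmbeddingBag hunks above drop the vocab_size parameter and the in-kernel CUDA_KERNEL_ASSERT bounds check from the update-output kernels, leaving the padding_idx handling in place. A schematic standalone version of a sum/mean bag accumulation under the documented padding_idx behaviour (padded entries contribute neither to the reduction nor to the bag size); it assumes one block per bag, one thread per feature with feature_size <= blockDim.x, and a contiguous weight matrix — all names are illustrative, this is not the shipped kernel:

#include <cuda_runtime.h>
#include <cstdint>

__global__ void bag_sum_mean_sketch(const int64_t *input, const int64_t *offsets,
                                    const float *weight, float *output,
                                    int64_t *bag_size, int64_t num_bags,
                                    int64_t num_indices, int64_t feature_size,
                                    int64_t padding_idx, bool mean) {
  const int64_t bag = blockIdx.x;      // launch with grid.x == num_bags
  const int64_t feat = threadIdx.x;    // launch with block.x >= feature_size
  if (bag >= num_bags || feat >= feature_size) return;
  const int64_t begin = offsets[bag];
  const int64_t end = (bag + 1 == num_bags) ? num_indices : offsets[bag + 1];
  float acc = 0.0f;
  int64_t count = 0;
  for (int64_t i = begin; i < end; ++i) {
    if (input[i] == padding_idx) continue;            // padded entries are skipped
    acc += weight[input[i] * feature_size + feat];
    ++count;
  }
  if (mean && count > 0) acc /= count;                // mean divides by the real bag size
  output[bag * feature_size + feat] = acc;
  if (feat == 0) bag_size[bag] = count;
}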
C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { EmbeddingBag_updateOutputKernel_sum_mean<<>>( @@ -394,7 +405,7 @@ _embedding_bag_cuda(const Tensor &weight, const Tensor &indices_, weight.stride(0), weight.stride(1), mode, bag_size.data_ptr(), per_sample_weights.defined() ? per_sample_weights.data_ptr() : NULL, per_sample_weights.defined() ? per_sample_weights.stride(0) : 0, - padding_idx, vocabSize); + padding_idx); C10_CUDA_KERNEL_LAUNCH_CHECK(); } }); @@ -515,7 +526,7 @@ Tensor _embedding_bag_per_sample_weights_backward_cuda( AT_ASSERT(weight.size(1) == embedding_features); const int threads_per_block = 512; - const int warps_per_block = threads_per_block / C10_WARP_SIZE; + const int warps_per_block = threads_per_block / at::cuda::warp_size(); dim3 block(threads_per_block); dim3 grid((num_samples + warps_per_block - 1) / warps_per_block); diff --git a/aten/src/ATen/native/cuda/Equal.cpp b/aten/src/ATen/native/cuda/Equal.cpp index 401571b2f1f2..ab8c9adef4e4 100644 --- a/aten/src/ATen/native/cuda/Equal.cpp +++ b/aten/src/ATen/native/cuda/Equal.cpp @@ -1,6 +1,14 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS #include #include -#include +#else +#include +#include +#endif namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/FillKernel.cu b/aten/src/ATen/native/cuda/FillKernel.cu index 76497ee7188a..facceccf8028 100644 --- a/aten/src/ATen/native/cuda/FillKernel.cu +++ b/aten/src/ATen/native/cuda/FillKernel.cu @@ -1,8 +1,10 @@ +#define TORCH_ASSERT_NO_OPERATORS #include #include #include #include #include +#include namespace at { namespace native { @@ -17,7 +19,7 @@ struct FillFunctor { }; void fill_kernel_cuda(TensorIterator& iter, const Scalar& value) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::Bool, at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "fill_cuda", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(kComplexHalf, kBool, kHalf, kBFloat16, iter.dtype(), "fill_cuda", [&]() { gpu_kernel(iter, FillFunctor(value.to())); }); } diff --git a/aten/src/ATen/native/cuda/ForeachReduceOp.cu b/aten/src/ATen/native/cuda/ForeachReduceOp.cu index 0d6848324252..05fb1f6a087d 100644 --- a/aten/src/ATen/native/cuda/ForeachReduceOp.cu +++ b/aten/src/ATen/native/cuda/ForeachReduceOp.cu @@ -1,6 +1,7 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include +#include #include #include #include @@ -24,13 +25,13 @@ namespace native { template struct LpNormFunctor { static_assert(NormType == 1 || NormType == 2, "foreach_norm supports only L1 and L2 norm"); + using opmath_t = typename at::opmath_type; __device__ __forceinline__ void operator() ( int chunk_size, TensorListMetadata& tl, - T* output_per_tensor, + opmath_t* output_per_tensor, const int max_chunks_per_tensor ) { - using opmath_t = typename at::opmath_type; int tensor_loc = tl.block_to_tensor[blockIdx.x]; int chunk_idx = tl.block_to_chunk[blockIdx.x]; int n = tl.numel_for_tensor[tensor_loc]; @@ -82,16 +83,15 @@ struct LpNormFunctor { } }; -template +template> __global__ void lpnorm_cleanup( - T* output_per_tensor, + opmath_t* output_per_tensor, T* ret_per_tensor, int max_chunks_per_tensor) { - using opmath_t = typename at::opmath_type; __shared__ opmath_t vals[512]; - T* output_this_tensor = output_per_tensor + blockIdx.x*max_chunks_per_tensor; - T val = 0; + opmath_t* output_this_tensor = output_per_tensor + blockIdx.x*max_chunks_per_tensor; + opmath_t val = 0; for (int i = threadIdx.x; i < max_chunks_per_tensor; i += blockDim.x) { val += 
output_this_tensor[i]; } @@ -134,7 +134,7 @@ std::vector foreach_tensor_norm_cuda(TensorList tensors, const Scalar& o } } const auto options = tensors[0].options(); - auto output_per_tensor = at::zeros({ntensors*max_chunks_per_tensor}, options); + auto output_per_tensor = at::zeros({ntensors*max_chunks_per_tensor}, options.dtype(toOpMathType(tensors[0].scalar_type()))); auto ret_per_tensor = at::empty({ntensors}, options); auto tensor_lists = std::vector>{tensors.vec()}; @@ -145,13 +145,13 @@ std::vector foreach_tensor_norm_cuda(TensorList tensors, const Scalar& o multi_tensor_apply<1>( tensor_lists, LpNormFunctor(), - output_per_tensor.data_ptr(), + output_per_tensor.data_ptr(), max_chunks_per_tensor); C10_CUDA_KERNEL_LAUNCH_CHECK(); const at::cuda::OptionalCUDAGuard device_guard(device_of(output_per_tensor)); auto stream = at::cuda::getCurrentCUDAStream(); lpnorm_cleanup<<>>( - output_per_tensor.data_ptr(), + output_per_tensor.data_ptr(), ret_per_tensor.data_ptr(), max_chunks_per_tensor); C10_CUDA_KERNEL_LAUNCH_CHECK(); @@ -163,13 +163,13 @@ std::vector foreach_tensor_norm_cuda(TensorList tensors, const Scalar& o multi_tensor_apply<1>( tensor_lists, LpNormFunctor(), - output_per_tensor.data_ptr(), + output_per_tensor.data_ptr(), max_chunks_per_tensor); C10_CUDA_KERNEL_LAUNCH_CHECK(); const at::cuda::OptionalCUDAGuard device_guard(device_of(output_per_tensor)); auto stream = at::cuda::getCurrentCUDAStream(); lpnorm_cleanup<<>>( - output_per_tensor.data_ptr(), + output_per_tensor.data_ptr(), ret_per_tensor.data_ptr(), max_chunks_per_tensor); C10_CUDA_KERNEL_LAUNCH_CHECK(); diff --git a/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu b/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu index aa898d50a2ce..46ea4eadf1fe 100644 --- a/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu +++ b/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu @@ -1,16 +1,24 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include +#include #include #include #include #include #include -#include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif + #include #include #include diff --git a/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu b/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu index 34b238410bb5..92a77dc00af5 100644 --- a/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu +++ b/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu @@ -1,17 +1,27 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include +#include #include #include #include #include #include #include -#include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#endif + #include #include #include diff --git a/aten/src/ATen/native/cuda/FunctionOfAMatrixUtilsKernel.cu b/aten/src/ATen/native/cuda/FunctionOfAMatrixUtilsKernel.cu index 0c758c9cc10b..7c04ce4da351 100644 --- a/aten/src/ATen/native/cuda/FunctionOfAMatrixUtilsKernel.cu +++ b/aten/src/ATen/native/cuda/FunctionOfAMatrixUtilsKernel.cu @@ -1,5 +1,7 @@ +#define TORCH_ASSERT_NO_OPERATORS #include +#include #include #include #include diff --git a/aten/src/ATen/native/cuda/GridSampler.cpp b/aten/src/ATen/native/cuda/GridSampler.cpp new file mode 100644 index 000000000000..aefe6f822270 --- /dev/null +++ b/aten/src/ATen/native/cuda/GridSampler.cpp @@ -0,0 +1,83 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include 
+#include +#include +#endif + +namespace at { +namespace native { + +Tensor grid_sampler_2d_cuda(const Tensor& input, const Tensor& grid, + int64_t interpolation_mode, int64_t padding_mode, + bool align_corners) { + auto in_size = input.sizes(); + auto grid_size = grid.sizes(); + auto output = at::empty( + {in_size[0], in_size[1], grid_size[1], grid_size[2]}, input.options()); + launch_grid_sampler_2d_forward_kernel( + output, input, grid, interpolation_mode, padding_mode, align_corners); + return output; +} + +Tensor grid_sampler_3d_cuda(const Tensor& input, const Tensor& grid, + int64_t interpolation_mode, int64_t padding_mode, + bool align_corners) { + auto in_size = input.sizes(); + auto grid_size = grid.sizes(); + auto output = at::empty( + {in_size[0], in_size[1], grid_size[1], grid_size[2], grid_size[3]}, + input.options()); + launch_grid_sampler_3d_forward_kernel( + output, input, grid, interpolation_mode, padding_mode, align_corners); + return output; +} + +std::tuple +grid_sampler_2d_backward_cuda(const Tensor& grad_output, const Tensor& input, + const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode, + bool align_corners, std::array output_mask) { + auto input_requires_grad = output_mask[0]; + Tensor grad_input = ([&]() { + if (input_requires_grad) { + return at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } else { + return Tensor(); + } + })(); + auto grad_grid = at::empty_like(grid, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + launch_grid_sampler_2d_backward_kernel( + grad_input, grad_grid, grad_output, input, + grid, interpolation_mode, padding_mode, align_corners, output_mask); + return std::make_tuple(grad_input, grad_grid); +} + +std::tuple +grid_sampler_3d_backward_cuda(const Tensor& grad_output, const Tensor& input, + const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode, + bool align_corners, std::array output_mask) { + auto input_requires_grad = output_mask[0]; + Tensor grad_input = ([&]() { + if (input_requires_grad) { + return at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } else { + return Tensor(); + } + })(); + auto grad_grid = at::empty_like(grid, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + launch_grid_sampler_3d_backward_kernel( + grad_input, grad_grid, grad_output, input, + grid, interpolation_mode, padding_mode, align_corners, output_mask); + return std::make_tuple(grad_input, grad_grid); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/GridSampler.cu b/aten/src/ATen/native/cuda/GridSampler.cu index b358853c997f..bfc3d86b8ab9 100644 --- a/aten/src/ATen/native/cuda/GridSampler.cu +++ b/aten/src/ATen/native/cuda/GridSampler.cu @@ -1,10 +1,14 @@ -#include +#define TORCH_ASSERT_NO_OPERATORS +#include +#include #include #include #include #include #include #include +#include +#include #include namespace at { namespace native { @@ -509,12 +513,13 @@ namespace { TensorInfo grad_output, TensorInfo input, TensorInfo grid, - TensorInfo grad_input, // initialized to zeros + TensorInfo grad_input, // initialized to zeros (or unused if input_requires_grad is false) TensorInfo grad_grid, // initialized to empty const GridSamplerInterpolation interpolation_mode, const GridSamplerPadding padding_mode, bool align_corners, - const index_t grad_input_memory_span) { + const index_t grad_input_memory_span, + const bool input_requires_grad) { index_t C = input.sizes[1]; index_t inp_D = input.sizes[2]; @@ -538,11 +543,19 @@ namespace { index_t gOut_sD = grad_output.strides[2]; index_t gOut_sH = grad_output.strides[3]; index_t 
gOut_sW = grad_output.strides[4]; - index_t gInp_sN = grad_input.strides[0]; - index_t gInp_sC = grad_input.strides[1]; - index_t gInp_sD = grad_input.strides[2]; - index_t gInp_sH = grad_input.strides[3]; - index_t gInp_sW = grad_input.strides[4]; + // gInp_* (and NC_offset below) are not really needed if input_requires_grad is false. + int64_t gInp_sN = 0; + int64_t gInp_sC = 0; + int64_t gInp_sD = 0; + int64_t gInp_sH = 0; + int64_t gInp_sW = 0; + if (input_requires_grad) { + gInp_sN = grad_input.strides[0]; + gInp_sC = grad_input.strides[1]; + gInp_sD = grad_input.strides[2]; + gInp_sH = grad_input.strides[3]; + gInp_sW = grad_input.strides[4]; + } index_t gGrid_sW = grad_grid.strides[3]; CUDA_KERNEL_LOOP_TYPE(index, nthreads, index_t) { @@ -611,30 +624,34 @@ namespace { scalar_t gix = static_cast(0), giy = static_cast(0), giz = static_cast(0); scalar_t *gOut_ptr_NCDHW = grad_output.data + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; - index_t NC_offset = n * gInp_sN; + index_t NC_offset; + if (input_requires_grad) { + NC_offset = n * gInp_sN; + } scalar_t *inp_ptr_NC = input.data + n * inp_sN; // calculate bilinear weighted pixel value and set output pixel for (index_t c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, NC_offset += gInp_sC, inp_ptr_NC += inp_sC) { scalar_t gOut = *gOut_ptr_NCDHW; // calculate and set grad_input. See Note [Passing pointer and offset to fastAtomicAdd]. - safe_add_3d(grad_input.data, iz_tnw, iy_tnw, ix_tnw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tnw * gOut, - NC_offset, grad_input_memory_span); - safe_add_3d(grad_input.data, iz_tne, iy_tne, ix_tne, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tne * gOut, - NC_offset, grad_input_memory_span); - safe_add_3d(grad_input.data, iz_tsw, iy_tsw, ix_tsw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tsw * gOut, - NC_offset, grad_input_memory_span); - safe_add_3d(grad_input.data, iz_tse, iy_tse, ix_tse, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tse * gOut, - NC_offset, grad_input_memory_span); - safe_add_3d(grad_input.data, iz_bnw, iy_bnw, ix_bnw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bnw * gOut, - NC_offset, grad_input_memory_span); - safe_add_3d(grad_input.data, iz_bne, iy_bne, ix_bne, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bne * gOut, - NC_offset, grad_input_memory_span); - safe_add_3d(grad_input.data, iz_bsw, iy_bsw, ix_bsw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bsw * gOut, - NC_offset, grad_input_memory_span); - safe_add_3d(grad_input.data, iz_bse, iy_bse, ix_bse, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bse * gOut, - NC_offset, grad_input_memory_span); - + if (input_requires_grad) { + safe_add_3d(grad_input.data, iz_tnw, iy_tnw, ix_tnw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tnw * gOut, + NC_offset, grad_input_memory_span); + safe_add_3d(grad_input.data, iz_tne, iy_tne, ix_tne, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tne * gOut, + NC_offset, grad_input_memory_span); + safe_add_3d(grad_input.data, iz_tsw, iy_tsw, ix_tsw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tsw * gOut, + NC_offset, grad_input_memory_span); + safe_add_3d(grad_input.data, iz_tse, iy_tse, ix_tse, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tse * gOut, + NC_offset, grad_input_memory_span); + safe_add_3d(grad_input.data, iz_bnw, iy_bnw, ix_bnw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bnw * gOut, + NC_offset, grad_input_memory_span); + safe_add_3d(grad_input.data, iz_bne, iy_bne, ix_bne, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bne * gOut, + 
NC_offset, grad_input_memory_span); + safe_add_3d(grad_input.data, iz_bsw, iy_bsw, ix_bsw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bsw * gOut, + NC_offset, grad_input_memory_span); + safe_add_3d(grad_input.data, iz_bse, iy_bse, ix_bse, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bse * gOut, + NC_offset, grad_input_memory_span); + } // calculate grad_grid if (within_bounds_3d(iz_tnw, iy_tnw, ix_tnw, inp_D, inp_H, inp_W)) { scalar_t tnw_val = inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW]; @@ -695,20 +712,21 @@ namespace { gGrid_ptr_NDHW[1] = giy_mult * giy; gGrid_ptr_NDHW[2] = giz_mult * giz; } else if (interpolation_mode == GridSamplerInterpolation::Nearest) { - auto ix_nearest = static_cast(::round(ix)); - auto iy_nearest = static_cast(::round(iy)); - auto iz_nearest = static_cast(::round(iz)); + if (input_requires_grad) { + auto ix_nearest = static_cast(::round(ix)); + auto iy_nearest = static_cast(::round(iy)); + auto iz_nearest = static_cast(::round(iz)); - // assign nearest neighor pixel value to output pixel - scalar_t *gOut_ptr_NCDHW = grad_output.data + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; - index_t NC_offset = n * gInp_sN; - for (index_t c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, NC_offset += gInp_sC) { - // calculate and set grad_input. See Note [Passing pointer and offset to fastAtomicAdd]. - safe_add_3d(grad_input.data, iz_nearest, iy_nearest, ix_nearest, - gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, *gOut_ptr_NCDHW, - NC_offset, grad_input_memory_span); + // assign nearest neighor pixel value to output pixel + scalar_t *gOut_ptr_NCDHW = grad_output.data + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; + index_t NC_offset = n * gInp_sN; + for (index_t c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, NC_offset += gInp_sC) { + // calculate and set grad_input. See Note [Passing pointer and offset to fastAtomicAdd]. + safe_add_3d(grad_input.data, iz_nearest, iy_nearest, ix_nearest, + gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, *gOut_ptr_NCDHW, + NC_offset, grad_input_memory_span); + } } - // assuming grad_grid is contiguous // thus we can // 1. use index with gGrid_sW to directly compute gGrid_ptr_NDHW @@ -722,15 +740,17 @@ namespace { } } // namespace -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -Tensor grid_sampler_2d_cuda(const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode, - bool align_corners) { +void launch_grid_sampler_2d_forward_kernel( + const TensorBase &output, const TensorBase &input, const TensorBase &grid, + int64_t interpolation_mode, int64_t padding_mode, bool align_corners) { + // See NOTE [ grid_sampler Native Functions ]. + // Add checks here in case this is called instead of grid_sampler. + check_grid_sampler_common(input, grid); + check_grid_sampler_2d(input, grid); + auto N = input.size(0); - auto C = input.size(1); auto H = grid.size(1); auto W = grid.size(2); - auto output = at::empty({N, C, H, W}, input.options()); int64_t count = N * H * W; if (count > 0) { AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "grid_sampler_2d_cuda", [&] { @@ -760,18 +780,20 @@ Tensor grid_sampler_2d_cuda(const Tensor& input, const Tensor& grid, } }); } - return output; } -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. 
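[Editor's note] In the 3d backward kernel above, every write into grad_input (the safe_add_3d atomics and the nearest-neighbour branch) is now gated on an input_requires_grad flag, and the gInp_* strides default to zero when the flag is off, so callers that only need grad_grid can skip allocating grad_input entirely. A schematic standalone kernel showing just that gating — it is not the grid-sampler arithmetic:

#include <cuda_runtime.h>

// grad_grid is always produced; grad_input (which may be undefined/null on the
// caller side) is touched only when input_requires_grad is set.
__global__ void backward_gating_sketch(const float *grad_output, const float *input,
                                       float *grad_input, float *grad_grid,
                                       int n, bool input_requires_grad) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= n) return;
  if (input_requires_grad) {
    atomicAdd(&grad_input[i], grad_output[i]);  // stands in for the safe_add_3d scatters
  }
  grad_grid[i] = grad_output[i] * input[i];     // gradient w.r.t. the grid is unconditional
}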
-Tensor grid_sampler_3d_cuda(const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode, - bool align_corners) { +void launch_grid_sampler_3d_forward_kernel( + const TensorBase &output, const TensorBase &input, const TensorBase &grid, + int64_t interpolation_mode, int64_t padding_mode, bool align_corners) { + // See NOTE [ grid_sampler Native Functions ]. + // Add checks here in case this is called instead of grid_sampler. + check_grid_sampler_common(input, grid); + check_grid_sampler_3d(input, grid, interpolation_mode); + auto N = input.size(0); auto D = grid.size(1); auto H = grid.size(2); auto W = grid.size(3); - auto output = at::empty({N, input.size(1), D, H, W}, input.options()); int64_t count = N * D * H * W; if (count > 0) { AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "grid_sampler_3d_cuda", [&] { @@ -801,15 +823,18 @@ Tensor grid_sampler_3d_cuda(const Tensor& input, const Tensor& grid, } }); } - return output; } -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -std::tuple -grid_sampler_2d_backward_cuda(const Tensor& grad_output, const Tensor& input, - const Tensor& grid, int64_t interpolation_mode, - int64_t padding_mode, bool align_corners, - std::array output_mask) { +void launch_grid_sampler_2d_backward_kernel( + const TensorBase &grad_input, const TensorBase &grad_grid, + const TensorBase &grad_output, const TensorBase &input, + const TensorBase &grid, int64_t interpolation_mode, int64_t padding_mode, + bool align_corners, std::array output_mask) { + // See NOTE [ grid_sampler Native Functions ]. + // Add checks here in case this is called instead of grid_sampler. + check_grid_sampler_common(input, grid); + check_grid_sampler_2d(input, grid); + // See Note [Writing Nondeterministic Operations] // Nondeterministic because of atomicAdd usage globalContext().alertNotDeterministic("grid_sampler_2d_backward_cuda"); @@ -822,11 +847,6 @@ grid_sampler_2d_backward_cuda(const Tensor& grad_output, const Tensor& input, // is always computed.) auto input_requires_grad = output_mask[0]; - Tensor grad_input; - if (input_requires_grad) { - grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - } - auto grad_grid = at::empty_like(grid, LEGACY_CONTIGUOUS_MEMORY_FORMAT); int64_t count = N * H * W; if (count > 0) { AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "grid_sampler_2d_backward_cuda", [&] { @@ -864,14 +884,18 @@ grid_sampler_2d_backward_cuda(const Tensor& grad_output, const Tensor& input, } }); } - return std::make_tuple(grad_input, grad_grid); } -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -std::tuple -grid_sampler_3d_backward_cuda(const Tensor& grad_output, const Tensor& input, - const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode, - bool align_corners) { +void launch_grid_sampler_3d_backward_kernel( + const TensorBase &grad_input, const TensorBase &grad_grid, + const TensorBase& grad_output, const TensorBase& input, + const TensorBase& grid, int64_t interpolation_mode, int64_t padding_mode, + bool align_corners, std::array output_mask) { + // See NOTE [ grid_sampler Native Functions ]. + // Add checks here in case this is called instead of grid_sampler. 
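[Editor's note] These launch_* entry points (declared in the new GridSampler.h and called from the thin wrappers in the new GridSampler.cpp earlier in this diff) take TensorBase and never allocate: the .cpp side creates outputs with at::empty / at::zeros_like / at::empty_like and the .cu side only checks shapes and launches kernels, which is what lets the .cu translation unit build with TORCH_ASSERT_NO_OPERATORS. A minimal sketch of the same split for a hypothetical operator my_op (the example namespace and names are illustrative, not PyTorch APIs):

// --- my_op.h (sketch): the launcher only needs TensorBase, no operator headers ---
#include <ATen/core/TensorBase.h>

namespace example {
void launch_my_op_kernel(const at::TensorBase &out, const at::TensorBase &in);
}

// --- my_op.cpp (sketch): the only translation unit that uses Tensor factories ---
#include <ATen/ATen.h>

namespace example {
at::Tensor my_op_cuda(const at::Tensor &in) {
  at::Tensor out = at::empty(in.sizes(), in.options());  // allocation stays in the .cpp
  launch_my_op_kernel(out, in);                           // the .cu side only fills `out`
  return out;
}
}  // namespace example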
+ check_grid_sampler_common(input, grid); + check_grid_sampler_3d(input, grid, interpolation_mode); + // See Note [Writing Nondeterministic Operations] // Nondeterministic because of atomicAdd usage globalContext().alertNotDeterministic("grid_sampler_3d_backward_cuda"); @@ -879,9 +903,8 @@ grid_sampler_3d_backward_cuda(const Tensor& grad_output, const Tensor& input, auto D = grid.size(1); auto H = grid.size(2); auto W = grid.size(3); - auto grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - auto grad_grid = at::empty_like(grid, LEGACY_CONTIGUOUS_MEMORY_FORMAT); int64_t count = N * D * H * W; + auto input_requires_grad = output_mask[0]; if (count > 0) { AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "grid_sampler_3d_backward_cuda", [&] { if (canUse32BitIndexMath(input) && canUse32BitIndexMath(grid) && @@ -892,12 +915,13 @@ grid_sampler_3d_backward_cuda(const Tensor& grad_output, const Tensor& input, getTensorInfo(grad_output), getTensorInfo(input), getTensorInfo(grid), - getTensorInfo(grad_input), + input_requires_grad ? getTensorInfo(grad_input) : TensorInfo(), getTensorInfo(grad_grid), static_cast(interpolation_mode), static_cast(padding_mode), align_corners, - /*grad_input_memory_span =*/static_cast(grad_input.numel())); + /*grad_input_memory_span =*/input_requires_grad ? static_cast(grad_input.numel()) : 0, + input_requires_grad); C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { grid_sampler_3d_backward_kernel @@ -906,17 +930,17 @@ grid_sampler_3d_backward_cuda(const Tensor& grad_output, const Tensor& input, getTensorInfo(grad_output), getTensorInfo(input), getTensorInfo(grid), - getTensorInfo(grad_input), + input_requires_grad ? getTensorInfo(grad_input) : TensorInfo(), getTensorInfo(grad_grid), static_cast(interpolation_mode), static_cast(padding_mode), align_corners, - /*grad_input_memory_span =*/grad_input.numel()); + /*grad_input_memory_span =*/input_requires_grad ? 
grad_input.numel() : 0, + input_requires_grad); C10_CUDA_KERNEL_LAUNCH_CHECK(); } }); } - return std::make_tuple(grad_input, grad_grid); } }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/GridSampler.cuh b/aten/src/ATen/native/cuda/GridSampler.cuh index 2fdf3bd54912..a0e3b16c3a43 100644 --- a/aten/src/ATen/native/cuda/GridSampler.cuh +++ b/aten/src/ATen/native/cuda/GridSampler.cuh @@ -1,16 +1,9 @@ -#include -#include +#pragma once #include +#include namespace at { namespace native { -namespace detail { - - enum class GridSamplerInterpolation {Bilinear, Nearest, Bicubic}; - enum class GridSamplerPadding {Zeros, Border, Reflection}; - -} // namespace detail - using detail::GridSamplerInterpolation; using detail::GridSamplerPadding; diff --git a/aten/src/ATen/native/cuda/GridSampler.h b/aten/src/ATen/native/cuda/GridSampler.h new file mode 100644 index 000000000000..aace9c30b0a7 --- /dev/null +++ b/aten/src/ATen/native/cuda/GridSampler.h @@ -0,0 +1,32 @@ +#pragma once +#include +#include + +namespace at { +class TensorBase; +} + +namespace at { +namespace native { + +void launch_grid_sampler_2d_forward_kernel( + const TensorBase &output, const TensorBase &input, const TensorBase &grid, + int64_t interpolation_mode, int64_t padding_mode, bool align_corners); + +void launch_grid_sampler_3d_forward_kernel( + const TensorBase &output, const TensorBase &input, const TensorBase &grid, + int64_t interpolation_mode, int64_t padding_mode, bool align_corners); + +void launch_grid_sampler_2d_backward_kernel( + const TensorBase &grad_input, const TensorBase &grad_grid, + const TensorBase &grad_output, const TensorBase &input, + const TensorBase &grid, int64_t interpolation_mode, int64_t padding_mode, + bool align_corners, std::array output_mask); + +void launch_grid_sampler_3d_backward_kernel( + const TensorBase &grad_input, const TensorBase &grad_grid, + const TensorBase &grad_output, const TensorBase &input, + const TensorBase &grid, int64_t interpolation_mode, int64_t padding_mode, + bool align_corners, std::array output_mask); + +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/Im2Col.cu b/aten/src/ATen/native/cuda/Im2Col.cu index 053418423adf..89b2a1879b4b 100644 --- a/aten/src/ATen/native/cuda/Im2Col.cu +++ b/aten/src/ATen/native/cuda/Im2Col.cu @@ -1,6 +1,7 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include -#include +#include #include #include #include @@ -10,6 +11,16 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/cuda/IndexKernel.cpp b/aten/src/ATen/native/cuda/IndexKernel.cpp index b85baf097559..478c96fa6084 100644 --- a/aten/src/ATen/native/cuda/IndexKernel.cpp +++ b/aten/src/ATen/native/cuda/IndexKernel.cpp @@ -1,10 +1,21 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include // For at::native::index_out +#include +#include #include -#include #include #include + +#ifndef AT_PER_OPERATOR_HEADERS +#include #include +#else +#include +#include +#include +#endif + namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/IndexKernel.cu b/aten/src/ATen/native/cuda/IndexKernel.cu index eac807d0aa9b..a40bf35205e7 100644 --- a/aten/src/ATen/native/cuda/IndexKernel.cu +++ b/aten/src/ATen/native/cuda/IndexKernel.cu @@ -192,7 +192,7 @@ void index_put_kernel_impl(TensorIterator& iter, IntArrayRef index_size, IntArra } static void index_kernel(TensorIterator& 
iter, IntArrayRef index_size, IntArrayRef index_stride) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, iter.dtype(), "index_cuda", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(kComplexHalf, kHalf, kBool, kBFloat16, iter.dtype(), "index_cuda", [&] { using dtype = OpaqueType; index_kernel_impl(iter, index_size, index_stride); }); @@ -233,7 +233,7 @@ static void index_copy_kernel( static void index_put_kernel(TensorIterator& iter, IntArrayRef index_size, IntArrayRef index_stride, bool accumulate) { TORCH_CHECK(!accumulate, "index_put does not support accumulate=true"); - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, iter.dtype(), "index_put", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(kComplexHalf, kHalf, kBool, kBFloat16, iter.dtype(), "index_put", [&] { using dtype = OpaqueType; index_put_kernel_impl(iter, index_size, index_stride); }); diff --git a/aten/src/ATen/native/cuda/Indexing.cu b/aten/src/ATen/native/cuda/Indexing.cu index 9ea21b2171e9..5fc9e4f5521e 100644 --- a/aten/src/ATen/native/cuda/Indexing.cu +++ b/aten/src/ATen/native/cuda/Indexing.cu @@ -1,19 +1,36 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include +#include -#include +#include #include -#include +#include #include #include +#include #include #include #include -#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + #include #include #include @@ -34,7 +51,7 @@ __global__ void indexing_backward_kernel( //stride_before is the stride of the dimension immediately preceding first indexed dimension //if indexing starts from the 0th dimension, stride_before does not matter because blockIdx.z will be 0 in this case //outer_dim is number of elements in the first unindexed dimensions - using accscalar_t = at::acc_type; + using opmath_t = at::opmath_type; // Each warp is responsible for an input into the LookupTable. 
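
The accscalar_t-to-opmath_t switch in this indexing backward kernel boils down to accumulating low-precision values in a wider type. A minimal sketch with a hypothetical helper name; at::opmath_type comes from ATen/OpMathType.h and resolves to float for Half and BFloat16:

    #include <ATen/OpMathType.h>

    template <typename scalar_t>
    __device__ void accumulate_feature(scalar_t* weight_row, const scalar_t* grad_row, int n) {
      using opmath_t = at::opmath_type<scalar_t>;  // float for Half and BFloat16
      for (int i = 0; i < n; ++i) {
        opmath_t acc = static_cast<opmath_t>(weight_row[i]);
        acc += static_cast<opmath_t>(grad_row[i]);   // accumulate in the wide type
        weight_row[i] = static_cast<scalar_t>(acc);  // cast back only on store
      }
    }
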
// If the preceding input has the same destination index as this input, then the warp @@ -61,19 +78,19 @@ __global__ void indexing_backward_kernel( } const int64_t weight_row = ((int64_t) sorted_indices[idx]) * stride + z * stride_before; const int64_t grad_row = ((int64_t) indices[idx]) * stride + z * numel * stride; - const accscalar_t scale = (accscalar_t)1.0; + const opmath_t scale = (opmath_t)1.0; - accscalar_t gradient[SZ]; - accscalar_t weight[SZ]; + opmath_t gradient[SZ]; + opmath_t weight[SZ]; while (start_feature < stride) { #pragma unroll for (int ii = 0; ii < SZ; ii++) { int64_t feature_dim = start_feature + ii * C10_WARP_SIZE; if (feature_dim < stride) { - gradient[ii] = static_cast(grad_output[grad_row + feature_dim]); + gradient[ii] = static_cast(grad_output[grad_row + feature_dim]); if (accumulate) { - weight[ii] = static_cast(grad_weight[weight_row + feature_dim]); + weight[ii] = static_cast(grad_weight[weight_row + feature_dim]); } } } @@ -109,6 +126,49 @@ __global__ void indexing_backward_kernel( namespace at { namespace native { +namespace { + +class ReduceMultiply { +public: + template + constexpr C10_DEVICE void operator() (scalar_t* self_data_start, int64_t index, int64_t numel, const scalar_t * src_data) const { + (void)numel; // suppress unused warning + gpuAtomicMul(self_data_start + index, *src_data); + } +}; +static ReduceMultiply reduce_multiply; + +class ReduceAdd { +public: + template + constexpr C10_DEVICE void operator() (scalar_t* self_data_start, int64_t index, int64_t numel, const scalar_t * src_data) const { + fastAtomicAdd(self_data_start, index, numel, *src_data, true); + } +}; +static ReduceAdd reduce_add; + +class ReduceMinimum { +public: + template + constexpr C10_DEVICE void operator() (scalar_t* self_data_start, int64_t index, int64_t numel, const scalar_t * src_data) const { + (void)numel; // suppress unused warning + gpuAtomicMin(self_data_start + index, *src_data); + } +}; +static ReduceMinimum reduce_minimum; + +class ReduceMaximum { +public: + template + constexpr C10_DEVICE void operator() (scalar_t* self_data_start, int64_t index, int64_t numel, const scalar_t * src_data) const { + (void)numel; // suppress unused warning + gpuAtomicMax(self_data_start + index, *src_data); + } +}; +static ReduceMaximum reduce_maximum; + +} + static Tensor wrapIndexOnce(const Tensor & index, int64_t dim, int64_t dim_size, bool check_range=true) { //we don't need to check range in backward - if there were out of bounds indices forward should already have errored out if (index.numel() != 0 && check_range) { @@ -209,13 +269,12 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List (size_t)self.dim()) { TORCH_CHECK_INDEX(false, "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); } - if (!self.is_contiguous()) { - self = self.contiguous(); - } + bool self_contiguous = self.is_contiguous(); + auto self_ = self_contiguous ? 
self : self.contiguous(); Tensor linearIndex, src, expandedValue = value; int64_t nElemBefore, strideBefore, sliceSize; std::vector inversePerm; - std::tie(linearIndex, src, nElemBefore, strideBefore, sliceSize, inversePerm) = makeLinearIndex(self, indices, !unsafe); + std::tie(linearIndex, src, nElemBefore, strideBefore, sliceSize, inversePerm) = makeLinearIndex(self_, indices, !unsafe); int64_t num_indices = linearIndex.numel(); if (expandedValue.numel() < num_indices * nElemBefore * sliceSize) { @@ -255,7 +314,7 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List(), sorted_indices.data_ptr(), range.data_ptr(), orig_indices.data_ptr(), @@ -268,12 +327,13 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List(at::cuda::getCurrentDeviceProperties()->maxGridSize[1], ceil_div(sliceSize, (int64_t) (C10_WARP_SIZE*UNROLL))), + std::min(at::cuda::getCurrentDeviceProperties()->maxGridSize[1], ceil_div(sliceSize, (int64_t) (warp_size*UNROLL))), std::min(std::max(1,nElemBefore), at::cuda::getCurrentDeviceProperties()->maxGridSize[2])); - dim3 block(C10_WARP_SIZE, indices_per_block); + dim3 block(warp_size, indices_per_block); - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(kComplexHalf, kHalf, kBool, kBFloat16, expandedValue.scalar_type(), "indexing_backward", [&] { indexing_backward_kernel<<>>( sorted_indices.data_ptr(), @@ -290,6 +350,8 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List -__global__ void indexAddSmallIndex(cuda::detail::TensorInfo dst, - cuda::detail::TensorInfo src, - cuda::detail::TensorInfo indices, - int dstAddDim, - int srcAddDim, - IndexType innerSize, - int64_t dstAddDimSize, - T alpha) { +template +__global__ void indexFuncSmallIndex(cuda::detail::TensorInfo dst, + cuda::detail::TensorInfo src, + cuda::detail::TensorInfo indices, + int dstAddDim, + int srcAddDim, + IndexType innerSize, + int64_t dstAddDimSize, + int64_t dstNumel, + const func_t& op, + T alpha) { // In order to avoid reloading the index that we are copying, load // it once to handle all of the points that are being selected, so // it can be reused as much as possible. This kernel is chosen when @@ -385,8 +450,10 @@ __global__ void indexAddSmallIndex(cuda::detail::TensorInfo dst, cuda::detail::IndexToOffset::get(linearIndex, src); srcOffset += srcIndex * src.strides[srcAddDim]; - gpuAtomicAddNoReturn(&dst.data[dstOffset], src.data[srcOffset] * alpha); + T val = src.data[srcOffset] * alpha; + op(dst.data, dstOffset, dstNumel, &val); } + } } @@ -394,19 +461,21 @@ __global__ void indexAddSmallIndex(cuda::detail::TensorInfo dst, // if there are a large number of indices. // This kernel in fact works for all choices of problem size, but if // the number of indices chosen is small, then the -// indexAddSmallIndex kernel is a better choice to reduce memory +// indexFuncSmallIndex kernel is a better choice to reduce memory // accesses. 
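
The ReduceAdd/ReduceMultiply/ReduceMinimum/ReduceMaximum functors defined above are threaded through the renamed indexFuncSmallIndex/indexFuncLargeIndex kernels so one kernel body serves every reduction. A hedged, self-contained sketch of that pattern (simplified names, float only, plain atomicAdd instead of the fastAtomicAdd/gpuAtomic* wrappers used by the real code):

    #include <cstdint>

    struct ReduceAddSketch {
      __device__ void operator()(float* dst, int64_t index, int64_t /*numel*/,
                                 const float* src) const {
        atomicAdd(dst + index, *src);  // the real code routes through fastAtomicAdd / gpuAtomic*
      }
    };

    template <typename func_t>
    __global__ void index_func_sketch(float* dst, const float* src, const int64_t* indices,
                                      int64_t n, int64_t dst_numel, func_t op) {
      int64_t i = blockIdx.x * (int64_t)blockDim.x + threadIdx.x;
      if (i < n) {
        float val = src[i];
        op(dst, indices[i], dst_numel, &val);  // the functor decides how to combine
      }
    }
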
template -__global__ void indexAddLargeIndex(cuda::detail::TensorInfo dst, - cuda::detail::TensorInfo src, - cuda::detail::TensorInfo indices, - int dstAddDim, - int srcAddDim, - IndexType totalSize, - IndexType innerSize, - int64_t dstAddDimSize, - T alpha) { + bool IndexIsMajor, typename func_t> +__global__ void indexFuncLargeIndex(cuda::detail::TensorInfo dst, + cuda::detail::TensorInfo src, + cuda::detail::TensorInfo indices, + int dstAddDim, + int srcAddDim, + IndexType totalSize, + IndexType innerSize, + int64_t dstAddDimSize, + int64_t dstNumel, + const func_t& op, + T alpha) { // We stride over the output including the indexed dimension // (totalSize), and calculate the destination index point based on that for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; @@ -435,7 +504,8 @@ __global__ void indexAddLargeIndex(cuda::detail::TensorInfo dst, cuda::detail::IndexToOffset::get(elementInSlice, src); srcOffset += srcIndex * src.strides[srcAddDim]; - gpuAtomicAddNoReturn(&dst.data[dstOffset], src.data[srcOffset] * alpha); + T val = src.data[srcOffset] * alpha; + op(dst.data, dstOffset, dstNumel, &val); } } @@ -505,6 +575,7 @@ void index_add_cuda_impl(const Tensor& self, int64_t dim, const Tensor& index, c ptrdiff_t sourceTotalSize = source.numel(); int64_t selfAddDimSize = self_.size(dim); ptrdiff_t numIndex = index.numel(); + int64_t selfNumel = self_.numel(); if (sliceSize == 0) { return; @@ -514,22 +585,23 @@ void index_add_cuda_impl(const Tensor& self, int64_t dim, const Tensor& index, c int mpc = at::cuda::getCurrentDeviceProperties()->multiProcessorCount; -#define SMALL_INDEX(TENSOR_TYPE, INDICES_TYPE, TYPE, SELF_DIM, SOURCE_DIM, IDX_DIM) \ - indexAddSmallIndex \ - <<>>( \ - selfInfo, sourceInfo, indexInfo, \ - selfAddDim, sourceAddDim, sliceSize, selfAddDimSize, alpha_value); \ +#define SMALL_INDEX(TENSOR_TYPE, INDICES_TYPE, TYPE, SELF_DIM, SOURCE_DIM, IDX_DIM) \ + indexFuncSmallIndex \ + <<>>( \ + selfInfo, sourceInfo, indexInfo, \ + selfAddDim, sourceAddDim, sliceSize, selfAddDimSize, \ + selfNumel, reduce_add, alpha_value); \ C10_CUDA_KERNEL_LAUNCH_CHECK(); #define LARGE_INDEX(TENSOR_TYPE, INDICES_TYPE, TYPE, \ SELF_DIM, SOURCE_DIM, IDX_DIM, IDX_IS_MAJOR) \ - indexAddLargeIndex \ + indexFuncLargeIndex \ <<>>( \ selfInfo, sourceInfo, indexInfo, \ selfAddDim, sourceAddDim, sourceTotalSize, \ (IDX_IS_MAJOR) ? sliceSize : numIndex, \ - selfAddDimSize, alpha_value); \ + selfAddDimSize, selfNumel, reduce_add, alpha_value); \ C10_CUDA_KERNEL_LAUNCH_CHECK(); dim3 smallIndexGrid(std::min(ceil_div(sliceSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); @@ -619,11 +691,211 @@ void index_add_cuda_impl(const Tensor& self, int64_t dim, const Tensor& index, c #undef LARGE_INDEX } +template +void index_reduce_func_cuda_impl( + const Tensor& self, + int64_t dim, + const Tensor& index, + const Tensor& source, + bool include_self, + const SCATTER_GATHER_OP& reduce, + const func_t& reduce_func, + const Tensor& result) { + globalContext().alertNotDeterministic("index_reduce_cuda"); + + if (!result.is_same(self)) result.copy_(self); + + // Scalars are treated as 1-d tensor + Tensor self_ = (result.dim() == 0) ? result.view(1) : result; + Tensor source_ = (source.dim() == 0) ? 
source.view(1) : source; + + TORCH_CHECK(result.dim() <= MAX_TENSORINFO_DIMS, "tensor has too many (>", MAX_TENSORINFO_DIMS, ") dims"); + TORCH_CHECK(source.dim() <= MAX_TENSORINFO_DIMS, "tensor has too many (>", MAX_TENSORINFO_DIMS, ") dims" ); + TORCH_CHECK(index.dim() <= MAX_TENSORINFO_DIMS, "tensor has too many (>", MAX_TENSORINFO_DIMS, ") dims"); + + if (!include_self) { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, at::ScalarType::BFloat16, + self.scalar_type(), "index_reduce_func_cuda_exclude_input_init", [&] { + scalar_t init_val; + switch (reduce) { + case SCATTER_GATHER_OP::REDUCE_MULTIPLY: + init_val = (scalar_t)1; + break; + case SCATTER_GATHER_OP::REDUCE_MAXIMUM: + init_val = std::numeric_limits::has_infinity ? -std::numeric_limits::infinity() + : std::numeric_limits::lowest(); + break; + case SCATTER_GATHER_OP::REDUCE_MINIMUM: + init_val = std::numeric_limits::has_infinity ? std::numeric_limits::infinity() + : std::numeric_limits::max(); + break; + default: + init_val = (scalar_t)0; + break; + } + // index_fill_ requires index to be a LongTensor + self_.index_fill_(dim, index.to(at::ScalarType::Long), init_val); + }); + } + + // The `source` is partitioned into two parts: + // -the size of each slice we are indexing, which is the + // total size of the tensor ignoring dimension `dim`; + // -the number of index we are choosing, which is the total size + // of the tensor `index`. + ptrdiff_t sliceSize = getSliceSize(self_, dim, index, source_); + ptrdiff_t sourceTotalSize = source.numel(); + int64_t selfReduceDimSize = self_.size(dim); + ptrdiff_t numIndex = index.numel(); + int64_t selfNumel = self_.numel(); + + if (sliceSize == 0) { + return; + } + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + bool indContig = index.is_contiguous(); + + int mpc = at::cuda::getCurrentDeviceProperties()->multiProcessorCount; + +#define SMALL_INDEX(TENSOR_TYPE, INDICES_TYPE, TYPE, SELF_DIM, SOURCE_DIM, IDX_DIM) \ + indexFuncSmallIndex \ + <<>>( \ + selfInfo, sourceInfo, indexInfo, \ + selfReduceDim, sourceReduceDim, sliceSize, selfReduceDimSize, \ + selfNumel, reduce_func, alpha_value); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); + +#define LARGE_INDEX(TENSOR_TYPE, INDICES_TYPE, TYPE, \ + SELF_DIM, SOURCE_DIM, IDX_DIM, IDX_IS_MAJOR) \ + indexFuncLargeIndex \ + <<>>( \ + selfInfo, sourceInfo, indexInfo, \ + selfReduceDim, sourceReduceDim, sourceTotalSize, \ + (IDX_IS_MAJOR) ? 
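
The init-value switch above fills the indexed slots with the reduction's identity element when include_self is false. A hedged restatement as a standalone helper (hypothetical name, same case analysis as the switch):

    #include <limits>

    template <typename scalar_t>
    scalar_t reduction_identity(bool is_prod, bool is_amax, bool is_amin) {
      if (is_prod) return scalar_t(1);
      if (is_amax)
        return std::numeric_limits<scalar_t>::has_infinity
            ? -std::numeric_limits<scalar_t>::infinity()
            : std::numeric_limits<scalar_t>::lowest();
      if (is_amin)
        return std::numeric_limits<scalar_t>::has_infinity
            ? std::numeric_limits<scalar_t>::infinity()
            : std::numeric_limits<scalar_t>::max();
      return scalar_t(0);  // mean falls through to 0, matching the default case
    }
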
sliceSize : numIndex, \ + selfReduceDimSize, selfNumel, reduce_func, alpha_value); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + dim3 smallIndexGrid(std::min(ceil_div(sliceSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); + dim3 smallIndexBlock(std::min(sliceSize, (ptrdiff_t)128)); + + dim3 largeIndexGrid(std::min(ceil_div(sourceTotalSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); + dim3 largeIndexBlock(std::min(sourceTotalSize, (ptrdiff_t)128)); + + if (cuda::detail::canUse32BitIndexMath(result) && + cuda::detail::canUse32BitIndexMath(source) && + cuda::detail::canUse32BitIndexMath(index)) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, result.scalar_type(), "index_reduce", [&] { + cuda::detail::TensorInfo selfInfo = + cuda::detail::getTensorInfo(self_); + int selfReduceDim = selfInfo.collapseDims(dim); + selfInfo.reduceDim(selfReduceDim); + auto alpha_value = (scalar_t) 1; + AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_reduce_cuda", [&] () { + auto sourceInfo = + cuda::detail::getTensorInfo(source_); + int sourceReduceDim = sourceInfo.collapseDims(dim); + sourceInfo.reduceDim(sourceReduceDim); + + auto indexInfo = + cuda::detail::getTensorInfo(index); + indexInfo.collapseDims(); + + // A reasonable choice for when to have each thread iterate over + // index to choose + if (numIndex <= 16) { + if (selfInfo.dims == 1 && sourceInfo.dims == 1 && indContig) { + SMALL_INDEX(scalar_t, index_t, unsigned int, 1, 1, -2); + } else if (selfInfo.dims == 2 && sourceInfo.dims == 2 && indContig) { + SMALL_INDEX(scalar_t, index_t, unsigned int, 2, 2, -2); + } else if (selfInfo.dims == 3 && sourceInfo.dims == 3 && indContig) { + SMALL_INDEX(scalar_t, index_t, unsigned int, 3, 3, -2); + } else { + SMALL_INDEX(scalar_t, index_t, unsigned int, -1, -1, -1); + } + } else { + bool indexIsMajor = indexShouldBeMajor(selfInfo, selfReduceDim); + + if (selfInfo.dims == 1 && sourceInfo.dims == 1 && indContig) { + LARGE_INDEX(scalar_t, index_t, unsigned int, 1, 1, -2, true); + } else if (selfInfo.dims == 2 && sourceInfo.dims == 2 && indContig) { + if (indexIsMajor) { + LARGE_INDEX(scalar_t, index_t, unsigned int, 2, 2, -2, true); + } else { + LARGE_INDEX(scalar_t, index_t, unsigned int, 2, 2, -2, false); + } + } else if (selfInfo.dims == 3 && sourceInfo.dims == 3 && indContig) { + if (indexIsMajor) { + LARGE_INDEX(scalar_t, index_t, unsigned int, 3, 3, -2, true); + } else { + LARGE_INDEX(scalar_t, index_t, unsigned int, 3, 3, -2, false); + } + } else { + LARGE_INDEX(scalar_t, index_t, unsigned int, -1, -1, -1, true); + } + } + }); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "index_reduce", [&] { + cuda::detail::TensorInfo selfInfo = + cuda::detail::getTensorInfo(self_); + int selfReduceDim = selfInfo.collapseDims(dim); + selfInfo.reduceDim(selfReduceDim); + auto alpha_value = (scalar_t) 1; + + cuda::detail::TensorInfo sourceInfo = + cuda::detail::getTensorInfo(source_); + int sourceReduceDim = sourceInfo.collapseDims(dim); + sourceInfo.reduceDim(sourceReduceDim); + + AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_reduce_cuda", [&] () { + cuda::detail::TensorInfo indexInfo = + cuda::detail::getTensorInfo(index); + indexInfo.collapseDims(); + + LARGE_INDEX(scalar_t, index_t, uint64_t, -1, -1, -1, true); + }); + }); + } + +#undef SMALL_INDEX +#undef LARGE_INDEX +} + TORCH_IMPL_FUNC(index_add_cuda_out) (const Tensor& self, int64_t dim, const Tensor& index, const Tensor& source, const Scalar& alpha, const 
Tensor& result) { index_add_cuda_impl(self, dim, index, source, alpha, result); } +TORCH_IMPL_FUNC(index_reduce_cuda_out) +(const Tensor& self, + int64_t dim, + const Tensor& index, + const Tensor& source, + const c10::string_view reduce, + bool include_self, + const Tensor& result) { + TORCH_WARN_ONCE("index_reduce() is in beta and the API may change at any time."); + + if (reduce == "prod") { + index_reduce_func_cuda_impl(self, dim, index, source, include_self, SCATTER_GATHER_OP::REDUCE_MULTIPLY, reduce_multiply, result); + } else if (reduce == "mean") { + index_reduce_func_cuda_impl(self, dim, index, source, include_self, SCATTER_GATHER_OP::REDUCE_MEAN, reduce_add, result); + auto counts = include_self ? at::ones_like(result) : at::zeros_like(result); + counts.index_add_(dim, index, at::ones_like(source)); + counts.masked_fill_(counts == 0, 1); + result.div_(counts); + } else if (reduce == "amax") { + index_reduce_func_cuda_impl(self, dim, index, source, include_self, SCATTER_GATHER_OP::REDUCE_MAXIMUM, reduce_maximum, result); + } else if (reduce == "amin") { + index_reduce_func_cuda_impl(self, dim, index, source, include_self, SCATTER_GATHER_OP::REDUCE_MINIMUM, reduce_minimum, result); + } else { + TORCH_CHECK(false, "reduce argument must be either prod, mean, amax or amin, got ", reduce, "."); + } +} + namespace { // We prefer this kernel to avoid reloading index points if the number // of indices is a small number. @@ -905,15 +1177,16 @@ Tensor& index_select_out_cuda( } Tensor index_select_cuda(const Tensor& self, int64_t dim, const Tensor& index) { - Tensor out; - if (self.is_quantized()){ - TORCH_CHECK( - self.qscheme() == kPerTensorAffine, - "Only per_tensor quantized quantized tensors are supported by index_select.") - out = at::empty_quantized({0}, self); - } else { - out = at::empty({0}, self.options()); - } + Tensor out = at::empty({0}, self.options()); + at::native::index_select_out_cuda(self, dim, index, out); + return out; +} + +Tensor index_select_quantized_cuda(const Tensor& self, int64_t dim, const Tensor& index) { + TORCH_CHECK( + self.qscheme() == kPerTensorAffine, + "Only per_tensor quantized quantized tensors are supported by index_select.") + Tensor out = at::empty_quantized({0}, self); at::native::index_select_out_cuda(self, dim, index, out); return out; } @@ -922,8 +1195,8 @@ namespace { template void masked_fill_kernel(TensorIterator& iter, const Scalar& value) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( - kBool, kHalf, kBFloat16, iter.common_dtype(), "masked_fill_", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( + kBool, kHalf, kBFloat16, kComplexHalf, iter.common_dtype(), "masked_fill_", [&]() { const auto value_ = value.to(); gpu_kernel( iter, [value_] GPU_LAMBDA(scalar_t self, mask_t mask) -> scalar_t { diff --git a/aten/src/ATen/native/cuda/JitLoops.cuh b/aten/src/ATen/native/cuda/JitLoops.cuh index 6284feba2d56..bb37a6acc2e1 100644 --- a/aten/src/ATen/native/cuda/JitLoops.cuh +++ b/aten/src/ATen/native/cuda/JitLoops.cuh @@ -132,7 +132,7 @@ void jitted_gpu_kernel( /*f_inputs_type=*/f_inputs_type, arity, at::cuda::jit::BinaryFuncVariant::NoScalar>( - iter, f, needs_dynamic_casting, /*scalar_val=*/0, extra_args); + iter, f, needs_dynamic_casting, /*scalar_val=*/scalar_val, extra_args); } else if (scalar_pos == at::cuda::jit::BinaryFuncVariant::RhsScalar) { jitted_gpu_kernel_impl< /*name*/ name, diff --git a/aten/src/ATen/native/cuda/LegacyThrustHelpers.cu b/aten/src/ATen/native/cuda/LegacyThrustHelpers.cu index f8ac9d3ed8f6..b080a6e5eac2 100644 --- 
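
The "mean" branch above is assembled from ordinary ops: sum via index_add_, then divide by per-slot counts, clamping untouched slots to 1 so nothing divides by zero. A hedged sketch of that decomposition with a hypothetical helper name, using only documented ATen calls:

    #include <ATen/ATen.h>

    at::Tensor index_reduce_mean_sketch(const at::Tensor& self, int64_t dim,
                                        const at::Tensor& index, const at::Tensor& source,
                                        bool include_self) {
      auto result = self.clone();
      if (!include_self) {
        result.index_fill_(dim, index.to(at::kLong), 0);  // excluded slots start at the identity
      }
      result.index_add_(dim, index, source);
      auto counts = include_self ? at::ones_like(result) : at::zeros_like(result);
      counts.index_add_(dim, index, at::ones_like(source));
      counts.masked_fill_(counts == 0, 1);  // avoid 0/0 for slots no index touched
      return result.div_(counts);
    }
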
a/aten/src/ATen/native/cuda/LegacyThrustHelpers.cu +++ b/aten/src/ATen/native/cuda/LegacyThrustHelpers.cu @@ -1,7 +1,14 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + #include #include #include diff --git a/aten/src/ATen/native/cuda/Lerp.cu b/aten/src/ATen/native/cuda/Lerp.cu index ed57a2700c48..ac1f2ba379b5 100644 --- a/aten/src/ATen/native/cuda/Lerp.cu +++ b/aten/src/ATen/native/cuda/Lerp.cu @@ -3,6 +3,7 @@ #include #include #include +#include namespace at { namespace native { @@ -13,17 +14,23 @@ void lerp_tensor_kernel(at::TensorIteratorBase& iter) { at::ScalarType::Half, at::ScalarType::BFloat16, iter.common_dtype(), "lerp_cuda", [&] { + using opmath_t = at::opmath_type; at::native::gpu_kernel( iter, [] GPU_LAMBDA( scalar_t self_val, scalar_t end_val, scalar_t weight_val) -> scalar_t { - return (std::abs(weight_val) < 0.5) - ? self_val + weight_val * (end_val - self_val) - : end_val - - (end_val - self_val) * - (static_cast(1) - weight_val); + opmath_t self_val_f = self_val; + opmath_t end_val_f = end_val; + opmath_t weight_val_f = weight_val; + // Conditional for better numeric. This has been discussed in + // https://github.com/pytorch/pytorch/pull/18871 + return (std::abs(weight_val_f) < 0.5) + ? self_val_f + weight_val_f * (end_val_f - self_val_f) + : end_val_f - + (end_val_f - self_val_f) * + (opmath_t{1} - weight_val_f); }); }); } @@ -33,13 +40,18 @@ void lerp_scalar_kernel(at::TensorIteratorBase& iter, const c10::Scalar& weight) at::ScalarType::Half, at::ScalarType::BFloat16, iter.common_dtype(), "lerp_cuda", [&]{ - auto weight_val = weight.to(); + using opmath_t = at::opmath_type; + auto weight_val = weight.to(); at::native::gpu_kernel( iter, [=] GPU_LAMBDA(scalar_t self_val, scalar_t end_val) { + opmath_t self_val_f = self_val; + opmath_t end_val_f = end_val; + // Conditional for better numeric. This has been discussed in + // https://github.com/pytorch/pytorch/pull/18871 return (std::abs(weight_val) < 0.5) - ? self_val + weight_val * (end_val - self_val) - : end_val - - (end_val - self_val) * (static_cast(1) - weight_val); + ? 
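
The two-branch formula used in these lerp kernels is worth restating on its own: for |w| < 0.5 it uses a + w*(b - a), otherwise b - (b - a)*(1 - w), which loses less precision as w approaches 1 (the PR referenced in the comments discusses this). A hedged, standalone float version:

    #include <cmath>

    float lerp_ref(float a, float b, float w) {
      return (std::fabs(w) < 0.5f) ? a + w * (b - a)
                                   : b - (b - a) * (1.0f - w);
    }
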
self_val_f + weight_val * (end_val_f - self_val_f) + : end_val_f - + (end_val_f - self_val_f) * (opmath_t{1} - weight_val); }); }); } diff --git a/aten/src/ATen/native/cuda/LinearAlgebra.cu b/aten/src/ATen/native/cuda/LinearAlgebra.cu index f2360261e865..24590e0647b5 100644 --- a/aten/src/ATen/native/cuda/LinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/LinearAlgebra.cu @@ -1,7 +1,7 @@ -#define TORCH_ASSERT_NO_OPERATORS #include #include #include +#include #include #include #include @@ -100,56 +100,38 @@ static void _launch_kernel(int total_n_elems, func_t f) { C10_CUDA_KERNEL_LAUNCH_CHECK(); } -void _unpack_pivots_internal_kernel( - TensorIterator& iter, - int64_t dim_size -) { - if (iter.numel() == 0) { - return; - } - +void unpack_pivots_cuda_kernel(TensorIterator& iter, const int64_t dim_size) { if (!iter.can_use_32bit_indexing()) { for (auto& sub_iter : iter.with_32bit_indexing()) { - _unpack_pivots_internal_kernel(sub_iter, dim_size); + unpack_pivots_cuda_kernel(sub_iter, dim_size); } return; } - auto offset_calculator = make_offset_calculator<2>(iter); + const auto offset_calculator = make_offset_calculator<2>(iter); - char* unpacked_pivots_ptr = reinterpret_cast(iter.data_ptr(0)); - const char* const __restrict__ pivots_ptr = reinterpret_cast(iter.data_ptr(1)); + const auto perm_ptr = reinterpret_cast(iter.data_ptr(0)); + const auto pivots_ptr = reinterpret_cast(iter.data_ptr(1)); - auto loop = [=]C10_DEVICE(int i) { - auto offsets = offset_calculator.get(i); + auto loop = [=]C10_DEVICE(const int idx) { + const auto offsets = offset_calculator.get(idx); - auto* unpacked_pivots_data = reinterpret_cast( - unpacked_pivots_ptr + offsets[0]); - const auto* const __restrict__ pivots_data = reinterpret_cast( - pivots_ptr + offsets[1]); + int64_t* const __restrict__ perm_data = reinterpret_cast(perm_ptr + offsets[0]); + const int32_t* const __restrict__ pivots_data = reinterpret_cast(pivots_ptr + offsets[1]); // QUESTION: can we mix 64bit offsets with 32bit Iterator indexing? 
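
The device loop just below applies LAPACK-style pivots, which are 1-based row swaps; applying them in order turns an identity permutation into the LU row permutation. A hedged host-side analogue (plain C++, hypothetical name) of that loop:

    #include <cstdint>
    #include <utility>

    void unpack_pivots_ref(int64_t* perm, const int32_t* pivots, int64_t dim_size) {
      for (int64_t i = 0; i < dim_size; ++i) {
        std::swap(perm[i], perm[pivots[i] - 1]);  // pivots are 1-based, hence the -1
      }
    }
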
for (int64_t i = 0; i < dim_size; ++i) { thrust::swap( - unpacked_pivots_data[i], - unpacked_pivots_data[pivots_data[i]] + perm_data[i], + perm_data[pivots_data[i] - 1] ); } }; _launch_kernel(iter.numel(), loop); } - -void unpack_pivots_cuda_kernel( - TensorIterator& iter, - int64_t dim_size -) { - _unpack_pivots_internal_kernel(iter, dim_size); -} - } // anonymous namespace -REGISTER_DISPATCH(addr_stub, &addr_kernel_cuda); REGISTER_DISPATCH(unpack_pivots_stub, &unpack_pivots_cuda_kernel); - +REGISTER_DISPATCH(addr_stub, &addr_kernel_cuda); }} diff --git a/aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp b/aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp new file mode 100644 index 000000000000..a7606e93047f --- /dev/null +++ b/aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp @@ -0,0 +1,235 @@ +// LinearAlgebraStubs.cpp +// Mostly a no-op unless BUILD_LAZY_CUDA_LINALG is defined +// In that case load library is dynamically loaded when first linalg call is made +// This helps reduce size of GPU memory context if linear algebra functions are not used +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(BUILD_LAZY_CUDA_LINALG) +#include + +#if AT_MAGMA_ENABLED() +#include + +namespace { +struct MagmaInitializer { + MagmaInitializer() { + ::at::cuda::detail::set_magma_init_fn([]{ }); + }; +} initializer; +} // namespace (anonymous) +#endif +#endif +namespace at { +namespace native { +#if defined(BUILD_LAZY_CUDA_LINALG) +namespace { +cuda::detail::LinalgDispatch disp = {_symeig_helper_cuda, + _linalg_qr_helper_cuda, + _cholesky_solve_helper_cuda, + legacy_lstsq_cuda, + _linalg_inv_out_helper_cuda}; + +at::DynamicLibrary& getTorchLinalgLibrary() { + static at::DynamicLibrary lib("libtorch_cuda_linalg.so", nullptr, true); + return lib; +} + +// Lazy dispatches do nothing but load linalg library and call the stub +// Loading the library should override the registration of those with the proper implementation +// getTorchLinalgLibrary() throws an exception if library is not found, +// which makes it unnecessary to have an explicit error checking +// But make sure that this function is called only once, to avoid infinite recursion +void loadLazyTorchLinalgLibrary() { + static int invoke_count = 0; + getTorchLinalgLibrary(); + TORCH_CHECK(invoke_count++ == 0, "lazy wrapper should be called at most once"); +} + +void lazy_cholesky_kernel(const Tensor& input, const Tensor& info, bool upper) { + loadLazyTorchLinalgLibrary(); + cholesky_stub(DeviceType::CUDA, input, info, upper); +} + +Tensor& lazy_cholesky_inverse_kernel(Tensor &result, Tensor& infos, bool upper) { + loadLazyTorchLinalgLibrary(); + return cholesky_inverse_stub(DeviceType::CUDA, result, infos, upper); +} + +void lazy_lu_factor(const Tensor& input, const Tensor& pivots, const Tensor& infos, bool compute_pivots) { + loadLazyTorchLinalgLibrary(); + lu_factor_stub(DeviceType::CUDA, input, pivots, infos, compute_pivots); +} + +void lazy_triangular_solve_kernel(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) { + loadLazyTorchLinalgLibrary(); + triangular_solve_stub(DeviceType::CUDA, A, B, left, upper, transpose, unitriangular); +} + +Tensor& lazy_orgqr_kernel(Tensor& result, const Tensor& tau) { + loadLazyTorchLinalgLibrary(); + return orgqr_stub(DeviceType::CUDA, result, tau); +} + +void lazy_ormqr_kernel(const Tensor& input, const Tensor& tau, const Tensor& other, bool left, bool transpose) { + 
loadLazyTorchLinalgLibrary(); + ormqr_stub(DeviceType::CUDA, input, tau, other, left, transpose); +} + +void lazy_geqrf_kernel(const Tensor& input, const Tensor& tau) { + loadLazyTorchLinalgLibrary(); + geqrf_stub(DeviceType::CUDA, input, tau); +} + +void lazy_linalg_eigh_kernel(const Tensor& eigenvalues, const Tensor& eigenvectors, const Tensor& infos, bool upper, bool compute_eigenvectors) { + loadLazyTorchLinalgLibrary(); + linalg_eigh_stub(DeviceType::CUDA, eigenvalues, eigenvectors, infos, upper, compute_eigenvectors); +} + +std::tuple lazy_eig_kernel(const Tensor& self, bool& eigenvectors) { + loadLazyTorchLinalgLibrary(); + return eig_stub(DeviceType::CUDA, self, eigenvectors); +} + +void lazy_linalg_eig_kernel(Tensor& eigenvalues, Tensor& eigenvectors, Tensor& infos, const Tensor& input, bool compute_eigenvectors) { + getTorchLinalgLibrary(); + linalg_eig_stub(DeviceType::CUDA, eigenvalues, eigenvectors, infos, input, compute_eigenvectors); +} + +void lazy_svd_kernel(const Tensor& A, + const bool full_matrices, + const bool compute_uv, + const Tensor& U, + const Tensor& S, + const Tensor& Vh, + const Tensor& info) { + getTorchLinalgLibrary(); + svd_stub(DeviceType::CUDA, A, full_matrices, compute_uv, U, S, Vh, info); +} + +void lazy_lu_solve_trans(const Tensor& b, const Tensor& lu, const Tensor& pivots, TransposeType trans) { + getTorchLinalgLibrary(); + lu_solve_trans_stub(DeviceType::CUDA, b, lu, pivots, trans); +} + +void lazy_lu_solve(const Tensor& b, const Tensor& lu, const Tensor& pivots) { + getTorchLinalgLibrary(); + lu_solve_stub(DeviceType::CUDA, b, lu, pivots); +} + +void lazy_lstsq_kernel(const Tensor& a, Tensor& b, Tensor& rank, Tensor& singular_values, Tensor& infos, double rcond, std::string driver_name) { + getTorchLinalgLibrary(); + lstsq_stub(DeviceType::CUDA, a, b, rank, singular_values, infos, rcond, driver_name); +} + +void lazy_ldl_factor( + const Tensor& LD, + const Tensor& pivots, + const Tensor& info, + bool upper, + bool hermitian) { + loadLazyTorchLinalgLibrary(); + ldl_factor_stub(DeviceType::CUDA, LD, pivots, info, upper, hermitian); +} + +void lazy_ldl_solve( + const Tensor& LD, + const Tensor& pivots, + const Tensor& B, + bool upper, + bool hermitian) { + loadLazyTorchLinalgLibrary(); + ldl_solve_stub(DeviceType::CUDA, LD, pivots, B, upper, hermitian); +} + +REGISTER_CUDA_DISPATCH(cholesky_stub, &lazy_cholesky_kernel) +REGISTER_CUDA_DISPATCH(cholesky_inverse_stub, &lazy_cholesky_inverse_kernel); +REGISTER_CUDA_DISPATCH(lu_factor_stub, &lazy_lu_factor); +REGISTER_CUDA_DISPATCH(ldl_factor_stub, &lazy_ldl_factor); +REGISTER_CUDA_DISPATCH(ldl_solve_stub, &lazy_ldl_solve); +REGISTER_CUDA_DISPATCH(triangular_solve_stub, &lazy_triangular_solve_kernel); +REGISTER_CUDA_DISPATCH(orgqr_stub, &lazy_orgqr_kernel); +REGISTER_CUDA_DISPATCH(ormqr_stub, &lazy_ormqr_kernel); +REGISTER_CUDA_DISPATCH(geqrf_stub, &lazy_geqrf_kernel); +REGISTER_CUDA_DISPATCH(linalg_eigh_stub, &lazy_linalg_eigh_kernel); +REGISTER_CUDA_DISPATCH(eig_stub, &lazy_eig_kernel); +REGISTER_CUDA_DISPATCH(linalg_eig_stub, &lazy_linalg_eig_kernel); +REGISTER_CUDA_DISPATCH(svd_stub, &lazy_svd_kernel) +REGISTER_CUDA_DISPATCH(lu_solve_trans_stub, &lazy_lu_solve_trans); +REGISTER_CUDA_DISPATCH(lu_solve_stub, &lazy_lu_solve); +REGISTER_CUDA_DISPATCH(lstsq_stub, &lazy_lstsq_kernel); +} // anonymous namespace + +// Old style dispatches +// torch_cuda_linalg dynamic library should have a global constructor +// that calls regiserLinaglDispatch so in order ot lazy bind +// old style dispatch all one have to do 
is to load library and call disp.func_name +// Protect from infinite recursion by initializing dispatch to self and checking +// that values are different after linalg library were loaded + +namespace cuda { +namespace detail { +void registerLinalgDispatch(const LinalgDispatch& disp_) { + disp = disp_; +} +}} //namespace cuda::detail + +Tensor& _linalg_inv_out_helper_cuda(Tensor &result, Tensor& infos_lu, Tensor& infos_getri) { + getTorchLinalgLibrary(); + TORCH_CHECK(disp.inv_out_helper != _linalg_inv_out_helper_cuda, "Can't find _linalg_inv_out_helper_cuda"); + return disp.inv_out_helper(result, infos_lu, infos_getri); +} + +std::tuple legacy_lstsq_cuda(const Tensor &B, const Tensor &A) { + getTorchLinalgLibrary(); + TORCH_CHECK(disp.legacy_lstsq != legacy_lstsq_cuda, "Can't find legacy_lstsq_cuda"); + return disp.legacy_lstsq(B, A); +} + +Tensor _cholesky_solve_helper_cuda(const Tensor& self, const Tensor& A, bool upper) { + getTorchLinalgLibrary(); + TORCH_CHECK(disp.cholesky_solve_helper != _cholesky_solve_helper_cuda, "Can't find _cholesky_solve_helper_cuda"); + return disp.cholesky_solve_helper(self, A, upper); +} + +std::tuple _linalg_qr_helper_cuda(const Tensor& input, c10::string_view mode) { + getTorchLinalgLibrary(); + TORCH_CHECK(disp.qr_helper != _linalg_qr_helper_cuda, "Can't find _linalg_qr_helper_cuda"); + return disp.qr_helper(input, mode); +} + +std::tuple _symeig_helper_cuda(const Tensor& self, bool eigenvectors, bool upper) { + getTorchLinalgLibrary(); + TORCH_CHECK(disp.symeig_helper != _symeig_helper_cuda, "Can't find _symeig_helper_cuda"); + return disp.symeig_helper(self, eigenvectors, upper); +} + +#endif /*defined(BUILD_LAZY_CUDA_LINALG)*/ + +std::tuple legacy_lstsq_out_cuda( + const Tensor& B, const Tensor& A, Tensor& B_out, Tensor& A_out) { + const auto dtype = A.scalar_type(); + TORCH_CHECK(B.scalar_type() == dtype, "exepected A and B dtypes to match but found ", + A.scalar_type(), " and ", B.scalar_type()); + TORCH_CHECK(A_out.scalar_type() == dtype, "A_out to have scalar type ", dtype, + " but found", A_out.scalar_type()); + TORCH_CHECK(B_out.scalar_type() == dtype, "A_out to have scalar type ", dtype, + " but found", B_out.scalar_type()); + Tensor A_tmp, B_tmp; + std::tie(B_tmp, A_tmp) = native::legacy_lstsq_cuda(B, A); + resize_output(A_out, A_tmp.sizes()); + A_out.copy_(A_tmp); + resize_output(B_out, B_tmp.sizes()); + B_out.copy_(B_tmp); + return std::tuple(B_out, A_out); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/Loss.cu b/aten/src/ATen/native/cuda/Loss.cu index 6afc89592799..1f885ff6fe0b 100644 --- a/aten/src/ATen/native/cuda/Loss.cu +++ b/aten/src/ATen/native/cuda/Loss.cu @@ -1,14 +1,28 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include -#include #include #include #include -#include +#include +#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#endif + constexpr float EPSILON = 1e-12; namespace { diff --git a/aten/src/ATen/native/cuda/LossCTC.cu b/aten/src/ATen/native/cuda/LossCTC.cu index 65508b1a956b..4e406f7cd4de 100644 --- a/aten/src/ATen/native/cuda/LossCTC.cu +++ b/aten/src/ATen/native/cuda/LossCTC.cu @@ -7,15 +7,32 @@ // Graves et al call the probabilities y, we use log_probs (also calling them inputs) // A few optimizations (similar to those here, but also some I didn't take) are described in // 2. 
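
The recursion guard described here works because the dispatch entry initially points at the lazy wrapper itself; loading the real backend must overwrite it, and the wrapper checks that this actually happened before forwarding, so a broken install fails loudly instead of recursing forever. A hedged, library-free sketch (all names hypothetical; the real code uses at::DynamicLibrary and the stub/REGISTER_CUDA_DISPATCH machinery):

    #include <stdexcept>

    using qr_fn = int (*)(int);

    int lazy_qr(int x);                          // lazy wrapper, defined below
    static qr_fn qr_impl = &lazy_qr;             // dispatch entry starts at the wrapper

    void register_qr(qr_fn f) { qr_impl = f; }   // the real backend calls this on load
    void load_linalg_library() { /* dlopen("libtorch_cuda_linalg.so") in the real code */ }

    int lazy_qr(int x) {
      load_linalg_library();
      if (qr_impl == &lazy_qr)                   // library failed to re-register the kernel
        throw std::runtime_error("linalg backend did not register an implementation");
      return qr_impl(x);
    }
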
Minmin Sun: http://on-demand.gputechconf.com/gtc/2016/presentation/s6383-minmin-sun-speech-recognition.pdf - +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include #include -#include +#include #include +#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + #include #include diff --git a/aten/src/ATen/native/cuda/Math.cuh b/aten/src/ATen/native/cuda/Math.cuh index e063ec7f42fb..cbd562f542c5 100644 --- a/aten/src/ATen/native/cuda/Math.cuh +++ b/aten/src/ATen/native/cuda/Math.cuh @@ -7,108 +7,6 @@ namespace at { namespace native { - -// TODO: these functions are unconditionally available because kaiser window depends on them -// TODO: jiterate kaiser window and make them only available when not jiterating -// NOTE: jiterating kaiser window requires extending the jiterator's scalar support -/* - * For licensing information and documentation, please refer to the the cpu implementation located in "ATen/native/Math.h". - */ -template -static inline C10_HOST_DEVICE scalar_t -chbevl(scalar_t _x, const scalar_t array[], size_t len) { - static_assert(!std::is_same() && !std::is_same(), "don't instantiate with low precision type"); - - scalar_t b0, b1, b2; - - b0 = array[0]; - b1 = 0; - - for (size_t i = 1; i < len; ++i) { - b2 = b1; - b1 = b0; - b0 = _x * b1 - b2 + array[i]; - } - - return (0.5 * (b0 - b2)); -} - -/* - * For licensing information and documentation, please refer to the the cpu implementation located in "ATen/native/Math.h". - */ -template -C10_HOST_DEVICE inline std::tuple chebyshev_coefficients_i0e_A() { - /* Chebyshev coefficients for exp(-x) I0(x) - * in the interval [0,8]. - * - * lim(x->0){ exp(-x) I0(x) } = 1. - */ - static const T coefficients[] = { - -4.41534164647933937950E-18, 3.33079451882223809783E-17, - -2.43127984654795469359E-16, 1.71539128555513303061E-15, - -1.16853328779934516808E-14, 7.67618549860493561688E-14, - -4.85644678311192946090E-13, 2.95505266312963983461E-12, - -1.72682629144155570723E-11, 9.67580903537323691224E-11, - -5.18979560163526290666E-10, 2.65982372468238665035E-9, - -1.30002500998624804212E-8, 6.04699502254191894932E-8, - -2.67079385394061173391E-7, 1.11738753912010371815E-6, - -4.41673835845875056359E-6, 1.64484480707288970893E-5, - -5.75419501008210370398E-5, 1.88502885095841655729E-4, - -5.76375574538582365885E-4, 1.63947561694133579842E-3, - -4.32430999505057594430E-3, 1.05464603945949983183E-2, - -2.37374148058994688156E-2, 4.93052842396707084878E-2, - -9.49010970480476444210E-2, 1.71620901522208775349E-1, - -3.04682672343198398683E-1, 6.76795274409476084995E-1}; - - return std::make_tuple(coefficients, 30); -} - -template -C10_HOST_DEVICE inline std::tuple chebyshev_coefficients_i0e_B() { - /* Chebyshev coefficients for exp(-x) sqrt(x) I0(x) - * in the inverted interval [8,infinity]. - * - * lim(x->inf){ exp(-x) sqrt(x) I0(x) } = 1/sqrt(2pi). 
- */ - static const T coefficients[] = { - -7.23318048787475395456E-18, -4.83050448594418207126E-18, - 4.46562142029675999901E-17, 3.46122286769746109310E-17, - -2.82762398051658348494E-16, -3.42548561967721913462E-16, - 1.77256013305652638360E-15, 3.81168066935262242075E-15, - -9.55484669882830764870E-15, -4.15056934728722208663E-14, - 1.54008621752140982691E-14, 3.85277838274214270114E-13, - 7.18012445138366623367E-13, -1.79417853150680611778E-12, - -1.32158118404477131188E-11, -3.14991652796324136454E-11, - 1.18891471078464383424E-11, 4.94060238822496958910E-10, - 3.39623202570838634515E-9, 2.26666899049817806459E-8, - 2.04891858946906374183E-7, 2.89137052083475648297E-6, - 6.88975834691682398426E-5, 3.36911647825569408990E-3, - 8.04490411014108831608E-1}; - - return std::make_tuple(coefficients, 25); -} - -template -static inline C10_HOST_DEVICE scalar_t calc_i0(scalar_t _x) { - static_assert(!std::is_same() && !std::is_same(), "don't instantiate with low precision type"); - // Upcast input for numerical accuracy purposes - // Needed for accurate results if input is bfloat16 or float16 - scalar_t x = ::abs(_x); - - if (x <= scalar_t{8.0}) { - auto coeff_pair = chebyshev_coefficients_i0e_A(); - auto A = std::get<0>(coeff_pair); - auto len = std::get<1>(coeff_pair); - scalar_t y = (x / scalar_t{2.0}) - scalar_t{2.0}; - return (::exp(x) * chbevl(y, A, len)); - } - - auto coeff_pair = chebyshev_coefficients_i0e_B(); - auto B = std::get<0>(coeff_pair); - auto len = std::get<1>(coeff_pair); - return (::exp(x) * chbevl(scalar_t{32.0} / x - scalar_t{2.0}, B, len) / ::sqrt(x)); -} - // See note [Jiterator] // TODO: elaborate in this comment on the structure of math.cuh #if AT_USE_JITERATOR() @@ -276,6 +174,19 @@ const auto ndtri_string = jiterator_stringify( } ); // ndtri_string +const auto log_ndtr_string = jiterator_stringify( + template + T log_ndtr(T x) { + constexpr T SQRT1_2{0.707106781186547524400844362104849039}; // 1/sqrt(2) + T t = x * SQRT1_2; + if (x < T{-1.0}) { + return log(erfcx(-t) / 2) - t * t; + } else { + return log1p(-erfc(t) / 2); + } + } +); // log_ndtr_string + const auto gcd_string = jiterator_stringify( template T gcd(const T a_in, const T b_in) { @@ -555,6 +466,8 @@ const auto entr_string = jiterator_stringify( } ); // entr_string +// NOTE: `kaiser_window_string` depends on `i0_string` +// for its implementation. 
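
For reference, the kaiser_window helper added further down evaluates w(a) = I0(beta * sqrt(1 - (a/alpha - 1)^2)) / I0(beta), with inv_alpha = 1/alpha and inv_i0_beta = 1/I0(beta) precomputed by the caller. A hedged host-side restatement (hypothetical name; std::cyl_bessel_i stands in for the file's i0 helper and is C++17, not available in every standard library):

    #include <algorithm>
    #include <cmath>

    double kaiser_window_ref(double a, double inv_alpha, double beta, double inv_i0_beta) {
      double x = a * inv_alpha - 1.0;
      double y = std::max(0.0, 1.0 - x * x);           // clamp rounding error at the edges
      return std::cyl_bessel_i(0.0, beta * std::sqrt(y)) * inv_i0_beta;
    }
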
const auto i0_string = jiterator_stringify( template T chbevl(T x, const T array[], const int len) { @@ -629,69 +542,6 @@ const auto i0_string = jiterator_stringify( } ); // i0_string -const auto i0e_string = jiterator_stringify( - template - T chbevl(T x, const T array[], const int len) { - T b0, b1, b2; - - b0 = array[0]; - b1 = 0; - - for (int i = 1; i < len; ++i) { - b2 = b1; - b1 = b0; - b0 = x * b1 - b2 + array[i]; - } - - return T{0.5} * (b0 - b2); - } - - template - T i0e(T _x) { - T x = fabs(_x); - - if (x <= T{8.0}) { - T coefficients[] = { - -4.41534164647933937950E-18, 3.33079451882223809783E-17, - -2.43127984654795469359E-16, 1.71539128555513303061E-15, - -1.16853328779934516808E-14, 7.67618549860493561688E-14, - -4.85644678311192946090E-13, 2.95505266312963983461E-12, - -1.72682629144155570723E-11, 9.67580903537323691224E-11, - -5.18979560163526290666E-10, 2.65982372468238665035E-9, - -1.30002500998624804212E-8, 6.04699502254191894932E-8, - -2.67079385394061173391E-7, 1.11738753912010371815E-6, - -4.41673835845875056359E-6, 1.64484480707288970893E-5, - -5.75419501008210370398E-5, 1.88502885095841655729E-4, - -5.76375574538582365885E-4, 1.63947561694133579842E-3, - -4.32430999505057594430E-3, 1.05464603945949983183E-2, - -2.37374148058994688156E-2, 4.93052842396707084878E-2, - -9.49010970480476444210E-2, 1.71620901522208775349E-1, - -3.04682672343198398683E-1, 6.76795274409476084995E-1}; - - T y = (x / T{2.0}) - T{2.0}; - return chbevl(y, coefficients, int{30}); - } - - // x > 8 - T coefficients[] = { - -7.23318048787475395456E-18, -4.83050448594418207126E-18, - 4.46562142029675999901E-17, 3.46122286769746109310E-17, - -2.82762398051658348494E-16, -3.42548561967721913462E-16, - 1.77256013305652638360E-15, 3.81168066935262242075E-15, - -9.55484669882830764870E-15, -4.15056934728722208663E-14, - 1.54008621752140982691E-14, 3.85277838274214270114E-13, - 7.18012445138366623367E-13, -1.79417853150680611778E-12, - -1.32158118404477131188E-11, -3.14991652796324136454E-11, - 1.18891471078464383424E-11, 4.94060238822496958910E-10, - 3.39623202570838634515E-9, 2.26666899049817806459E-8, - 2.04891858946906374183E-7, 2.89137052083475648297E-6, - 6.88975834691682398426E-5, 3.36911647825569408990E-3, - 8.04490411014108831608E-1}; - - return chbevl(T{32.0} / x - T{2.0}, coefficients, int{25}) / sqrt(x); - } -); // i0e_string - const auto i1_string = jiterator_stringify( template T chbevl(const T x, const T array[], const int len) { @@ -881,6 +731,15 @@ const auto i1e_string = jiterator_stringify( } ); // i1e_string +const auto kaiser_window_string = i0_string + jiterator_stringify( + template + T kaiser_window(T a, T inv_alpha, T beta, T inv_i0_beta) { + T x = a * inv_alpha - T{1}; + T y = max(T{0}, T{1} - x * x); + return i0(beta * sqrt(y)) * inv_i0_beta; + } +); // kaiser_window_string + const auto sinc_string = jiterator_stringify( template T sinc(T a) { @@ -1509,22 +1368,102 @@ static inline C10_HOST_DEVICE scalar_t calc_trigamma(scalar_t in) { return static_cast(sign * result); } +/* + * For licensing information and documentation, please refer to the the cpu implementation located in "ATen/native/Math.h". 
+ */ template -static inline C10_HOST_DEVICE scalar_t calc_i0e(scalar_t _x) { +static inline C10_HOST_DEVICE scalar_t +chbevl(scalar_t _x, const scalar_t array[], size_t len) { static_assert(!std::is_same() && !std::is_same(), "don't instantiate with low precision type"); + + scalar_t b0, b1, b2; + + b0 = array[0]; + b1 = 0; + + for (size_t i = 1; i < len; ++i) { + b2 = b1; + b1 = b0; + b0 = _x * b1 - b2 + array[i]; + } + + return (0.5 * (b0 - b2)); +} + +/* + * For licensing information and documentation, please refer to the the cpu implementation located in "ATen/native/Math.h". + */ +template +C10_HOST_DEVICE inline std::tuple chebyshev_coefficients_i0e_A() { + /* Chebyshev coefficients for exp(-x) I0(x) + * in the interval [0,8]. + * + * lim(x->0){ exp(-x) I0(x) } = 1. + */ + static const T coefficients[] = { + -4.41534164647933937950E-18, 3.33079451882223809783E-17, + -2.43127984654795469359E-16, 1.71539128555513303061E-15, + -1.16853328779934516808E-14, 7.67618549860493561688E-14, + -4.85644678311192946090E-13, 2.95505266312963983461E-12, + -1.72682629144155570723E-11, 9.67580903537323691224E-11, + -5.18979560163526290666E-10, 2.65982372468238665035E-9, + -1.30002500998624804212E-8, 6.04699502254191894932E-8, + -2.67079385394061173391E-7, 1.11738753912010371815E-6, + -4.41673835845875056359E-6, 1.64484480707288970893E-5, + -5.75419501008210370398E-5, 1.88502885095841655729E-4, + -5.76375574538582365885E-4, 1.63947561694133579842E-3, + -4.32430999505057594430E-3, 1.05464603945949983183E-2, + -2.37374148058994688156E-2, 4.93052842396707084878E-2, + -9.49010970480476444210E-2, 1.71620901522208775349E-1, + -3.04682672343198398683E-1, 6.76795274409476084995E-1}; + + return std::make_tuple(coefficients, 30); +} + +template +C10_HOST_DEVICE inline std::tuple chebyshev_coefficients_i0e_B() { + /* Chebyshev coefficients for exp(-x) sqrt(x) I0(x) + * in the inverted interval [8,infinity]. + * + * lim(x->inf){ exp(-x) sqrt(x) I0(x) } = 1/sqrt(2pi). 
+ */ + static const T coefficients[] = { + -7.23318048787475395456E-18, -4.83050448594418207126E-18, + 4.46562142029675999901E-17, 3.46122286769746109310E-17, + -2.82762398051658348494E-16, -3.42548561967721913462E-16, + 1.77256013305652638360E-15, 3.81168066935262242075E-15, + -9.55484669882830764870E-15, -4.15056934728722208663E-14, + 1.54008621752140982691E-14, 3.85277838274214270114E-13, + 7.18012445138366623367E-13, -1.79417853150680611778E-12, + -1.32158118404477131188E-11, -3.14991652796324136454E-11, + 1.18891471078464383424E-11, 4.94060238822496958910E-10, + 3.39623202570838634515E-9, 2.26666899049817806459E-8, + 2.04891858946906374183E-7, 2.89137052083475648297E-6, + 6.88975834691682398426E-5, 3.36911647825569408990E-3, + 8.04490411014108831608E-1}; + + return std::make_tuple(coefficients, 25); +} + +template +static inline C10_HOST_DEVICE scalar_t calc_i0(scalar_t _x) { + static_assert(!std::is_same() && !std::is_same(), "don't instantiate with low precision type"); + // Upcast input for numerical accuracy purposes + // Needed for accurate results if input is bfloat16 or float16 scalar_t x = ::abs(_x); + if (x <= scalar_t{8.0}) { auto coeff_pair = chebyshev_coefficients_i0e_A(); auto A = std::get<0>(coeff_pair); auto len = std::get<1>(coeff_pair); scalar_t y = (x / scalar_t{2.0}) - scalar_t{2.0}; - return (chbevl(y, A, len)); + return (::exp(x) * chbevl(y, A, len)); } auto coeff_pair = chebyshev_coefficients_i0e_B(); auto B = std::get<0>(coeff_pair); auto len = std::get<1>(coeff_pair); - return (chbevl(scalar_t{32.0} / x - scalar_t{2.0}, B, len) / ::sqrt(x)); + return (::exp(x) * chbevl(scalar_t{32.0} / x - scalar_t{2.0}, B, len) / ::sqrt(x)); } template diff --git a/aten/src/ATen/native/cuda/MaxUnpooling.cu b/aten/src/ATen/native/cuda/MaxUnpooling.cu index 73db29deb4aa..bb9fce986d2f 100644 --- a/aten/src/ATen/native/cuda/MaxUnpooling.cu +++ b/aten/src/ATen/native/cuda/MaxUnpooling.cu @@ -1,11 +1,23 @@ -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include + +#include +#include +#endif + namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/MemoryAccess.cuh b/aten/src/ATen/native/cuda/MemoryAccess.cuh index e0b37802e875..17b02346611a 100644 --- a/aten/src/ATen/native/cuda/MemoryAccess.cuh +++ b/aten/src/ATen/native/cuda/MemoryAccess.cuh @@ -116,6 +116,15 @@ struct LoadWithCast { } } + LoadWithCast(const TensorIteratorBase& iter) { + assert(iter.ninputs() == N); + #pragma unroll + for (auto i = 0; i < N; ++i) { + this->dtypes[i] = iter.dtype(i + 1); + element_sizes[i] = c10::elementSize(iter.dtype(i + 1)); + } + } + template __device__ scalar_t load(char *base_ptr, uint32_t offset, int arg) { void *ptr = base_ptr + element_sizes[arg] * offset; diff --git a/aten/src/ATen/native/cuda/MiscUtils.h b/aten/src/ATen/native/cuda/MiscUtils.h index 39305f41e641..e616a7d1fcfb 100644 --- a/aten/src/ATen/native/cuda/MiscUtils.h +++ b/aten/src/ATen/native/cuda/MiscUtils.h @@ -4,89 +4,9 @@ #include #include -#if AT_MAGMA_ENABLED() -#include -#include -#endif - namespace at { namespace native { -#if AT_MAGMA_ENABLED() - -// RAII for a MAGMA Queue -struct MAGMAQueue { - - // Default constructor without a device will cause - // destroying a queue which has not been initialized. 
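
The chbevl helper relocated above evaluates a Chebyshev series with the usual three-term, Clenshaw-style recurrence. A hedged restatement as a host-side template (hypothetical name, identical arithmetic):

    template <typename T>
    T chbevl_ref(T x, const T* coeffs, int len) {
      T b0 = coeffs[0], b1 = T(0), b2 = T(0);
      for (int i = 1; i < len; ++i) {
        b2 = b1;
        b1 = b0;
        b0 = x * b1 - b2 + coeffs[i];  // three-term recurrence over the coefficients
      }
      return T(0.5) * (b0 - b2);
    }
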
- MAGMAQueue() = delete; - - // Constructor - explicit MAGMAQueue(int64_t device_id) { - cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - // Magma operations is numerically sensitive, so TF32 should be off - // regardless of the global flag. - TORCH_CUDABLAS_CHECK(cublasGetMathMode(handle, &original_math_mode)); - TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); -#endif - magma_queue_create_from_cuda( - device_id, - at::cuda::getCurrentCUDAStream(), - handle, - at::cuda::getCurrentCUDASparseHandle(), - &magma_queue_); - } - - // Getter - magma_queue_t get_queue() const { return magma_queue_; } - - // Destructor - ~MAGMAQueue() { -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - // We've manually set the math mode to CUBLAS_DEFAULT_MATH, now we - // should restore the original math mode back - cublasHandle_t handle = magma_queue_get_cublas_handle(magma_queue_); - cublasSetMathMode(handle, original_math_mode); -#endif - magma_queue_destroy(magma_queue_); - } - - private: - magma_queue_t magma_queue_; -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - cublasMath_t original_math_mode; -#endif -}; - -static inline magma_int_t magma_int_cast(int64_t value, const char* varname) { - auto result = static_cast(value); - if (static_cast(result) != value) { - AT_ERROR("magma: The value of ", varname, "(", (long long)value, - ") is too large to fit into a magma_int_t (", sizeof(magma_int_t), " bytes)"); - } - return result; -} - -// MAGMA functions that don't take a magma_queue_t aren't stream safe -// Work around this by synchronizing with the default stream -struct MagmaStreamSyncGuard { - MagmaStreamSyncGuard() { - auto stream = at::cuda::getCurrentCUDAStream(); - if (stream != at::cuda::getDefaultCUDAStream()) { - at::cuda::stream_synchronize(stream); - } - } - - ~MagmaStreamSyncGuard() noexcept(false) { - auto default_stream = at::cuda::getDefaultCUDAStream(); - if (at::cuda::getCurrentCUDAStream() != default_stream) { - at::cuda::stream_synchronize(default_stream); - } - } -}; -#endif - static inline int cuda_int_cast(int64_t value, const char* varname) { auto result = static_cast(value); TORCH_CHECK(static_cast(result) == value, diff --git a/aten/src/ATen/native/cuda/MultiLabelMarginCriterion.cu b/aten/src/ATen/native/cuda/MultiLabelMarginCriterion.cu index 88c88ce0ad80..7f61d9a0b5b0 100644 --- a/aten/src/ATen/native/cuda/MultiLabelMarginCriterion.cu +++ b/aten/src/ATen/native/cuda/MultiLabelMarginCriterion.cu @@ -1,12 +1,22 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include -#include -#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#include +#else +#include +#include +#include +#include +#endif + namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/MultiMarginLoss.cu b/aten/src/ATen/native/cuda/MultiMarginLoss.cu index fcf0a6a2356a..15e6d1e9dc0c 100644 --- a/aten/src/ATen/native/cuda/MultiMarginLoss.cu +++ b/aten/src/ATen/native/cuda/MultiMarginLoss.cu @@ -1,9 +1,21 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include +#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + namespace at { namespace native { namespace { @@ -114,7 +126,7 @@ __global__ void MultiMarginLoss_backward_kernel( } } -void multi_margin_loss_shape_check( +void multi_margin_loss_shape_check(int &nframe, const Tensor &input, 
const Tensor &target) { auto in_sizes = input.sizes(); auto dims = in_sizes.size(); @@ -124,7 +136,7 @@ void multi_margin_loss_shape_check( "Expected non-empty vector or matrix with optional 0-dim batch size, but got: ", in_sizes); - int64_t nframe = dims <= 1 ? 1 : in_sizes[0]; + nframe = dims <= 1 ? 1 : in_sizes[0]; TORCH_CHECK( target.dim() <= 1 && target.numel() == nframe, "inconsistent target size, expected ", nframe, " but got ", @@ -138,16 +150,16 @@ Tensor& multi_margin_loss_cuda_out( const c10::optional &weights_, int64_t reduction, Tensor& out_) { auto p = p_.toLong(); TORCH_CHECK(p == 1 || p == 2, "multi_margin_loss: Invalid p, expected 1 or 2 but got ", p); - multi_margin_loss_shape_check(input_, target_); - if (reduction == at::Reduction::None) { - resize_output(out_, target_.sizes()); - } else if (input_.dim() == 2) { - resize_output(out_, {input_.sizes()[0]}); + int nframe; + multi_margin_loss_shape_check(nframe, input_, target_); + + // produce a scalar output for 1d input + if (reduction == Reduction::None && target_.dim() > 0) { + resize_output(out_, {nframe}); } else { resize_output(out_, {}); } - if (input_.numel() == 0) { return out_; } @@ -166,7 +178,6 @@ Tensor& multi_margin_loss_cuda_out( AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "multi_margin_loss_cuda", [&] { const scalar_t margin = margin_.to(); if (input.dim() <= 1) { - int nframe = 1; TORCH_CHECK(target.dim() <= 1 && target.numel() == nframe, "inconsistent target size"); dim3 blocks(1); dim3 threads(MULTIMARGIN_THREADS); @@ -196,7 +207,6 @@ Tensor& multi_margin_loss_cuda_out( } else { auto in_sizes = input.sizes(); TORCH_INTERNAL_ASSERT(in_sizes.size() == 2); - int nframe = in_sizes[0]; // allow zero-dim target for 2D input. TORCH_CHECK(in_sizes[1] != 0 && target.dim() <= 1 && target.numel() == nframe, "inconsistent target size"); @@ -248,7 +258,7 @@ Tensor& multi_margin_loss_cuda_out( margin); C10_CUDA_KERNEL_LAUNCH_CHECK(); } - at::sum_out(out, tmp_output, /*dims=*/IntArrayRef{}); + at::sum_out(out, tmp_output, IntArrayRef{}); } } }); @@ -262,7 +272,7 @@ Tensor& multi_margin_loss_cuda_out( Tensor multi_margin_loss_cuda( const Tensor &input, const Tensor &target, const Scalar &p, const Scalar &margin, const c10::optional &weights, int64_t reduction) { - auto out = at::empty({}, input.options()); + auto out = at::empty({0}, input.options()); multi_margin_loss_cuda_out(input, target, p, margin, weights, reduction, out); return out; } @@ -274,7 +284,8 @@ Tensor& multi_margin_loss_cuda_backward_out( auto p = p_.toLong(); TORCH_CHECK(p == 1 || p == 2, "multi_margin_loss_backward: Invalid p, expected 1 or 2 but got ", p); - multi_margin_loss_shape_check(input_, target_); + int nframe; + multi_margin_loss_shape_check(nframe, input_, target_); resize_output(grad_input_, input_.sizes()); if (input_.numel() == 0) { @@ -331,7 +342,6 @@ Tensor& multi_margin_loss_cuda_backward_out( } else { auto in_sizes = input.sizes(); TORCH_INTERNAL_ASSERT(in_sizes.size() == 2); - int nframe = in_sizes[0]; TORCH_CHECK((in_sizes[1] != 0) && (target.dim() <= 1) && (target.numel() == nframe), "inconsistent target size"); dim3 blocks(in_sizes[0]); diff --git a/aten/src/ATen/native/cuda/MultinomialKernel.cu b/aten/src/ATen/native/cuda/MultinomialKernel.cu index f9404fab0193..de8e8404ac2d 100644 --- a/aten/src/ATen/native/cuda/MultinomialKernel.cu +++ b/aten/src/ATen/native/cuda/MultinomialKernel.cu @@ -1,8 +1,9 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include -#include 
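
The include churn in this and the surrounding files follows one pattern: with per-operator headers enabled, a translation unit includes only the op headers it actually calls instead of the monolithic ATen/Functions.h, which trims rebuild times. A hedged sketch of its shape; the concrete op headers below are illustrative, not the ones elided in the hunks above:

    #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
    #include <ATen/core/Tensor.h>

    #ifndef AT_PER_OPERATOR_HEADERS
    #include <ATen/Functions.h>
    #include <ATen/NativeFunctions.h>
    #else
    #include <ATen/ops/empty.h>
    #include <ATen/ops/zeros_like.h>
    #endif
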
-#include +#include +#include #include #include #include @@ -11,6 +12,16 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + #include #include #include @@ -74,12 +85,13 @@ void renormRows(Tensor& t) { const int64_t maxThreads = std::min( props->maxThreadsPerBlock, cuda_utils::kCUDABlockReduceMaxThreads); + int warp_size = at::cuda::warp_size(); dim3 grid(rows < numSM * 4 ? rows : numSM * 4); - dim3 block(std::min(maxThreads, C10_WARP_SIZE * ceil_div(cols, int64_t{C10_WARP_SIZE}))); + dim3 block(std::min(maxThreads, warp_size * ceil_div(cols, int64_t{warp_size}))); AT_DISPATCH_FLOATING_TYPES_AND_HALF(t.scalar_type(), "renormRows_cuda", [&] { renormRowsL1 - <<>>(t.data_ptr(), rows, cols); C10_CUDA_KERNEL_LAUNCH_CHECK(); @@ -335,8 +347,9 @@ void multinomial_with_replacement_kernel_impl( int maxThreads = props->maxThreadsPerBlock; int maxShared = props->sharedMemPerBlock; - int requiredWarps = at::ceil_div(numCategories, C10_WARP_SIZE); - int requiredThreads = std::min(maxThreads, requiredWarps * C10_WARP_SIZE); + int warp_size = at::cuda::warp_size(); + int requiredWarps = at::ceil_div(numCategories, warp_size); + int requiredThreads = std::min(maxThreads, requiredWarps * warp_size); int requiredShared = requiredThreads * sizeof(accscalar_t); if (n_sample == 1 && maxShared >= requiredShared) { diff --git a/aten/src/ATen/native/cuda/NLLLoss2d.cu b/aten/src/ATen/native/cuda/NLLLoss2d.cu index 79cec9f8da3e..2246c836f3dc 100644 --- a/aten/src/ATen/native/cuda/NLLLoss2d.cu +++ b/aten/src/ATen/native/cuda/NLLLoss2d.cu @@ -1,7 +1,7 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include -#include #include #include #include @@ -12,6 +12,16 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu b/aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu index a04d118b7502..75b4e3357540 100644 --- a/aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu +++ b/aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu @@ -1,6 +1,9 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include + +#include #include -#include +#include #include #include #include @@ -9,7 +12,16 @@ #include #include -#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/NaiveConvolutionTranspose3d.cu b/aten/src/ATen/native/cuda/NaiveConvolutionTranspose3d.cu index 1198555d144e..d34de0f156bd 100644 --- a/aten/src/ATen/native/cuda/NaiveConvolutionTranspose3d.cu +++ b/aten/src/ATen/native/cuda/NaiveConvolutionTranspose3d.cu @@ -1,6 +1,7 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include -#include +#include #include #include @@ -10,6 +11,17 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#endif + namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/cuda/NaiveDilatedConvolution.cu b/aten/src/ATen/native/cuda/NaiveDilatedConvolution.cu index 2c2c11f22467..6c2942b05de3 100644 --- a/aten/src/ATen/native/cuda/NaiveDilatedConvolution.cu +++ b/aten/src/ATen/native/cuda/NaiveDilatedConvolution.cu @@ -1,12 +1,25 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include 
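
The C10_WARP_SIZE-to-at::cuda::warp_size() change above queries the warp width at runtime, since it differs between CUDA (32) and ROCm (64) builds. A hedged sketch of the block-sizing arithmetic with a hypothetical helper name:

    #include <algorithm>
    #include <ATen/ceil_div.h>
    #include <ATen/cuda/CUDAContext.h>

    dim3 pick_reduce_block(int64_t cols, int64_t max_threads) {
      const int64_t warp_size = at::cuda::warp_size();          // runtime query, not a constant
      const int64_t warps = at::ceil_div(cols, warp_size);
      return dim3(std::min(max_threads, warps * warp_size));    // whole warps, capped at the limit
    }
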
+#include +#include #include #include #include #include -#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#endif + #include namespace at { diff --git a/aten/src/ATen/native/cuda/Nonzero.cu b/aten/src/ATen/native/cuda/Nonzero.cu index dcacf98a8007..0e524b7b81fd 100644 --- a/aten/src/ATen/native/cuda/Nonzero.cu +++ b/aten/src/ATen/native/cuda/Nonzero.cu @@ -1,4 +1,6 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include #include #include @@ -6,6 +8,13 @@ #include //for MAX_DIMS #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif + namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/Normalization.cu b/aten/src/ATen/native/cuda/Normalization.cu index 2f9484770ad4..e7b2372a18da 100644 --- a/aten/src/ATen/native/cuda/Normalization.cu +++ b/aten/src/ATen/native/cuda/Normalization.cu @@ -1,3 +1,4 @@ +// #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include #include @@ -7,6 +8,30 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +// TODO: Doesn't exist in this branch +#if 0 +#include +#else +#include +#endif + namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/cuda/Normalization.cuh b/aten/src/ATen/native/cuda/Normalization.cuh index 266d5f19206d..a9b11e76db68 100644 --- a/aten/src/ATen/native/cuda/Normalization.cuh +++ b/aten/src/ATen/native/cuda/Normalization.cuh @@ -1,6 +1,7 @@ #pragma once -#include +#include +#include #include #include #include @@ -9,6 +10,14 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#include +#endif + namespace at { namespace native { // The maximum number of threads in a block @@ -846,9 +855,10 @@ std::tuple batch_norm_backward_reduce_cuda_templ auto feature_size = input_reshaped.size(2); auto stream = at::cuda::getCurrentCUDAStream(); - int block_y = std::min(lastPow2(batch_size), MAX_BLOCK_SIZE/C10_WARP_SIZE); + int warp_size = at::cuda::warp_size(); + int block_y = std::min(lastPow2(batch_size), MAX_BLOCK_SIZE/warp_size); // We want block_x to be at least a warp width - int block_x = std::min(std::max(getNumThreads(feature_size), C10_WARP_SIZE), MAX_BLOCK_SIZE/block_y); + int block_x = std::min(std::max(getNumThreads(feature_size), warp_size), MAX_BLOCK_SIZE/block_y); const dim3 block(block_x, block_y); const dim3 grid(n_input); diff --git a/aten/src/ATen/native/cuda/PersistentSoftmax.cuh b/aten/src/ATen/native/cuda/PersistentSoftmax.cuh index 6fbbe1f3be47..4f308d0847dc 100644 --- a/aten/src/ATen/native/cuda/PersistentSoftmax.cuh +++ b/aten/src/ATen/native/cuda/PersistentSoftmax.cuh @@ -126,7 +126,7 @@ __global__ void softmax_warp_forward(output_t *dst, const input_t *src, int batc if (!is_transformer_mask) { idx += i*element_count; } - if (mask[idx]) { + if (!mask[idx]) { max_value[i] = (is_meaningful_max && max_value[i] > elements[i][it]) ? 
max_value[i] : elements[i][it]; is_meaningful_max = true; } @@ -160,13 +160,18 @@ __global__ void softmax_warp_forward(output_t *dst, const input_t *src, int batc idx += i*element_count; } - if (mask[idx]) { + if (!mask[idx]) { if (is_log_softmax) { sum[i] += std::exp(elements[i][it] - max_value[i]); } else { elements[i][it] = std::exp(elements[i][it] - max_value[i]); sum[i] += elements[i][it]; } + } else { + if (!is_log_softmax) { + // Masked values are treated as -infinity, and std::exp(-infinity) is 0. + elements[i][it] = 0; + } } } } @@ -183,18 +188,10 @@ __global__ void softmax_warp_forward(output_t *dst, const input_t *src, int batc for (int it = 0; it < WARP_ITERATIONS; ++it) { int element_index = local_idx + it * WARP_SIZE; if (element_index < element_count) { - if (is_masked) { - int idx = it*WARP_SIZE; - if (!is_transformer_mask) { - idx += i*element_count; - } - if (!mask[idx]) { - dst[i*element_count+it*WARP_SIZE] = 0; - continue; - } - } if (is_log_softmax) { dst[i*element_count+it*WARP_SIZE] = elements[i][it] - max_value[i] - sum[i]; + } else if (sum[i] == 0) { + dst[i*element_count+it*WARP_SIZE] = std::numeric_limits::quiet_NaN(); } else { dst[i*element_count+it*WARP_SIZE] = elements[i][it] / sum[i]; } @@ -205,8 +202,8 @@ __global__ void softmax_warp_forward(output_t *dst, const input_t *src, int batc } } -template -__global__ void softmax_warp_backward(output_t *gradInput, const input_t *grad, const input_t *output, int batch_size, int stride, int element_count) +template +__global__ void softmax_warp_backward(output_t *gradInput, const input_t *grad, const input_t *output, int batch_size, int stride, int element_count, const bool *mask = nullptr) { // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and warp_size of method warp_softmax_backward_kernel. 
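// A host-side reference (sketch only) of the masked-softmax semantics the
// kernel above now implements: mask[i] == true means "drop this element",
// masked positions behave like -infinity (their exp contributes 0), and a row
// that is entirely masked ends up with a zero normalizer, so the non-log
// softmax writes NaN.
#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

std::vector<float> masked_softmax_row(const std::vector<float>& x,
                                      const std::vector<bool>& mask) {
  float max_val = -std::numeric_limits<float>::infinity();
  for (size_t i = 0; i < x.size(); ++i) {
    if (!mask[i]) max_val = std::max(max_val, x[i]);
  }
  float sum = 0.f;
  std::vector<float> out(x.size(), 0.f);
  for (size_t i = 0; i < x.size(); ++i) {
    out[i] = mask[i] ? 0.f : std::exp(x[i] - max_val);  // exp(-inf) == 0 for masked slots
    sum += out[i];
  }
  for (float& v : out) {
    v = (sum == 0.f) ? std::numeric_limits<float>::quiet_NaN() : v / sum;
  }
  return out;
}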
constexpr int next_power_of_two = 1 << log2_elements; @@ -230,6 +227,9 @@ __global__ void softmax_warp_backward(output_t *gradInput, const input_t *grad, grad += thread_offset; output += thread_offset; gradInput += thread_offset; + if (is_masked) { + mask += thread_offset; + } // The nested loops over WARP_BATCH and then WARP_ITERATIONS can be simplified to one loop, // but I think doing so would obfuscate the logic of the algorithm, thus I chose to keep @@ -253,13 +253,14 @@ __global__ void softmax_warp_backward(output_t *gradInput, const input_t *grad, } } - acc_t sum[WARP_BATCH]; + acc_t sum[WARP_BATCH] { 0.0f }; #pragma unroll for (int i = 0; i < WARP_BATCH; ++i) { - sum[i] = grad_reg[i][0]; #pragma unroll - for (int it = 1; it < WARP_ITERATIONS; ++it) { - sum[i] += grad_reg[i][it]; + for (int it = 0; it < WARP_ITERATIONS; ++it) { + if (!is_masked || !mask[i*element_count+it*WARP_SIZE]) { + sum[i] += grad_reg[i][it]; + } } } warp_reduce(sum); @@ -273,8 +274,11 @@ __global__ void softmax_warp_backward(output_t *gradInput, const input_t *grad, for (int it = 0; it < WARP_ITERATIONS; ++it) { int element_index = local_idx + it * WARP_SIZE; if (element_index < element_count) { + if (is_masked && mask[i*element_count+it*WARP_SIZE]) { + gradInput[i*element_count+it*WARP_SIZE] = 0; + } // compute gradients - if (is_log_softmax) { + else if (is_log_softmax) { gradInput[i*element_count+it*WARP_SIZE] = (grad_reg[i][it] - std::exp(output_reg[i][it]) * sum[i]); } else { gradInput[i*element_count+it*WARP_SIZE] = (grad_reg[i][it] - output_reg[i][it] * sum[i]); @@ -297,7 +301,8 @@ void dispatch_softmax_forward(output_t *dst, const input_t *src, int softmax_ele const int next_power_of_two = 1 << log2_elements; // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_forward. - int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + int warp_size = at::cuda::warp_size(); + warp_size = (next_power_of_two < warp_size) ? next_power_of_two : warp_size; // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_forward. int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1; @@ -335,8 +340,8 @@ void dispatch_softmax_forward(output_t *dst, const input_t *src, int softmax_ele } } -template -void dispatch_softmax_backward(output_t *grad_input, const input_t *grad, const input_t *output, int softmax_elements, int softmax_elements_stride, int batch_count) +template +void dispatch_softmax_backward(output_t *grad_input, const input_t *grad, const input_t *output, int softmax_elements, int softmax_elements_stride, int batch_count, const bool *mask = nullptr) { TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 1024 ); if (softmax_elements == 0) { @@ -346,7 +351,8 @@ void dispatch_softmax_backward(output_t *grad_input, const input_t *grad, const const int next_power_of_two = 1 << log2_elements; // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_backward. - int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + int warp_size = at::cuda::warp_size(); + warp_size = (next_power_of_two < warp_size) ? next_power_of_two : warp_size; // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_backward. int batches_per_warp = (next_power_of_two <= 128) ? 
2 : 1; @@ -361,10 +367,10 @@ void dispatch_softmax_backward(output_t *grad_input, const input_t *grad, const // Launch code would be more elegant if C++ supported FOR CONSTEXPR switch (log2_elements) { #define LAUNCH_SOFTMAX_WARP_BACKWARD(L2E) case L2E: \ - softmax_warp_backward \ + softmax_warp_backward \ <<>> \ (grad_input, grad, output, batch_count, softmax_elements_stride, \ - softmax_elements); \ + softmax_elements, mask); \ C10_CUDA_KERNEL_LAUNCH_CHECK(); \ break; diff --git a/aten/src/ATen/native/cuda/PointwiseOpsKernel.cu b/aten/src/ATen/native/cuda/PointwiseOpsKernel.cu index 5e42326056c1..b1c4a2ae4b41 100644 --- a/aten/src/ATen/native/cuda/PointwiseOpsKernel.cu +++ b/aten/src/ATen/native/cuda/PointwiseOpsKernel.cu @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -10,28 +11,88 @@ namespace at { namespace native { +const char addcmul_name[] = "addcmul"; void addcmul_cuda_kernel(TensorIteratorBase& iter, const Scalar& value) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, iter.dtype(), "addcmul_cuda", [&]() { - // note(mkozuki): If scalar_t is fp16 or bfloat16, cast scalar to float - // and do math in fp32 for better accuracy. - using accscalar_t = at::acc_type; - auto alpha = value.to(); - gpu_kernel(iter, [alpha]GPU_LAMBDA(scalar_t a, scalar_t b, scalar_t c) -> scalar_t { - return a + alpha * (static_cast(b) * static_cast(c)); + auto dtype = iter.dtype(); + if (at::isComplexType(dtype)) { + #if AT_USE_JITERATOR() + AT_DISPATCH_COMPLEX_TYPES(dtype, "addcmul_cuda", [&]() { + auto alpha = value.to(); + static const auto addcmul_string = jiterator_stringify( + template T addcmul(T a, T b, T c, T alpha) { return a + alpha * (b * c); }); + jitted_gpu_kernel< + /*name=*/addcmul_name, + /*return_dtype=*/scalar_t, + /*common_dtype=*/scalar_t, + /*arity=*/3>( + iter, + addcmul_string, + /*scalar_pos=*/at::cuda::jit::BinaryFuncVariant::NoScalar, + /*scalar_val=*/0, + /*extra_args=*/std::make_tuple(alpha)); + }); + #else + AT_DISPATCH_COMPLEX_TYPES(dtype, "addcmul_cuda", [&]() { + auto alpha = value.to(); + gpu_kernel(iter, [alpha]GPU_LAMBDA(scalar_t a, scalar_t b, scalar_t c) -> scalar_t { + return a + alpha * b * c; + }); + }); + #endif + } else { + AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, dtype, "addcmul_cuda", [&]() { + // note(mkozuki): If scalar_t is fp16 or bfloat16, cast scalar to float + // and do math in fp32 for better accuracy. + using accscalar_t = at::acc_type; + auto alpha = value.to(); + gpu_kernel(iter, [alpha]GPU_LAMBDA(scalar_t a, scalar_t b, scalar_t c) -> scalar_t { + return a + alpha * (static_cast(b) * static_cast(c)); + }); }); - }); + } } +// return a + alpha * (b / static_cast(c)); +const char addcdiv_name[] = "addcdiv"; void addcdiv_cuda_kernel(TensorIteratorBase& iter, const Scalar& value) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, iter.dtype(), "addcdiv_cuda", [&]() { - // note(mkozuki): If scalar_t is fp16 or bfloat16, cast scalar to float - // and do math in fp32 for better accuracy. 
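// The note above ("cast scalar to float and do math in fp32 for better
// accuracy") is the usual accumulate-in-a-wider-type trick for fp16/bf16.
// A minimal CUDA sketch of addcmul for __half data, independent of
// TensorIterator; the kernel name and raw-pointer interface are illustrative.
#include <cuda_fp16.h>

__global__ void addcmul_half(const __half* a, const __half* b, const __half* c,
                             __half* out, float alpha, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // Promote every operand to float, combine in fp32, round back to half once.
    const float r = __half2float(a[i]) + alpha * (__half2float(b[i]) * __half2float(c[i]));
    out[i] = __float2half(r);
  }
}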
- using accscalar_t = at::acc_type; - auto alpha = value.to(); - gpu_kernel(iter, [alpha]GPU_LAMBDA(scalar_t a, scalar_t b, scalar_t c) -> scalar_t { - return a + alpha * (b / static_cast(c)); + auto dtype = iter.dtype(); + if (at::isComplexType(dtype)) { + #if AT_USE_JITERATOR() + AT_DISPATCH_COMPLEX_TYPES(dtype, "addcdiv_cuda", [&]() { + auto alpha = value.to(); + static const auto addcdiv_string = + jiterator_stringify(template T addcdiv( + T a, T b, T c, T alpha) { return a + alpha * (b / c); }); + jitted_gpu_kernel< + /*name=*/addcdiv_name, + /*return_dtype=*/scalar_t, + /*common_dtype=*/scalar_t, + /*arity=*/3>( + iter, + addcdiv_string, + /*scalar_pos=*/at::cuda::jit::BinaryFuncVariant::NoScalar, + /*scalar_val=*/0, + /*extra_args=*/std::make_tuple(alpha)); + }); + #else + AT_DISPATCH_COMPLEX_TYPES(dtype, "addcdiv_cuda", [&]() { + auto alpha = value.to(); + gpu_kernel(iter, [alpha]GPU_LAMBDA(scalar_t a, scalar_t b, scalar_t c) -> scalar_t { + return a + alpha * (b / c); + }); + }); + #endif + } else { + AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, dtype, "addcdiv_cuda", [&]() { + // note(mkozuki): If scalar_t is fp16 or bfloat16, cast scalar to float + // and do math in fp32 for better accuracy. + using accscalar_t = at::acc_type; + auto alpha = value.to(); + gpu_kernel(iter, [alpha]GPU_LAMBDA(scalar_t a, scalar_t b, scalar_t c) -> scalar_t { + return a + alpha * (b / static_cast(c)); + }); }); - }); + } } void smooth_l1_backward_cuda_kernel(TensorIterator& iter, const Scalar& norm, double beta) { diff --git a/aten/src/ATen/native/cuda/RNN.cu b/aten/src/ATen/native/cuda/RNN.cu index 659ddc28c497..ed34bc78fba2 100644 --- a/aten/src/ATen/native/cuda/RNN.cu +++ b/aten/src/ATen/native/cuda/RNN.cu @@ -1,11 +1,24 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include +#include #include -#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#endif + namespace at { namespace native { namespace { @@ -546,7 +559,7 @@ void checkLSTMBackwardSizes(const TensorArg& grad_hy, const TensorArg& grad_cy, checkNumel(c, workspace, exp_size[0] * exp_size[1] * 4); } -std::tuple _thnn_fused_lstm_cell_backward_cuda( const c10::optional& grad_hy_opt, const c10::optional& grad_cy_opt, +std::tuple _thnn_fused_lstm_cell_backward_impl_cuda( const c10::optional& grad_hy_opt, const c10::optional& grad_cy_opt, const Tensor& cx, const Tensor& cy, const Tensor& workspace, bool has_bias) { // See [Note: hacky wrapper removal for optional tensor] @@ -555,7 +568,7 @@ std::tuple _thnn_fused_lstm_cell_backwar const Tensor& grad_cy = c10::value_or_else(grad_cy_opt, [] {return Tensor();}); if (!grad_hy.defined() && !grad_cy.defined()) { - return std::tuple(); + return std::tuple(); } checkLSTMBackwardSizes({grad_hy, "grad_hy", 1}, {grad_cy, "grad_cy", 2}, {cx, "cx", 3}, {cy, "cy", 4}, @@ -572,7 +585,7 @@ std::tuple _thnn_fused_lstm_cell_backwar }); auto grad_bias = has_bias ? 
grad_gates.sum(0, /*keepdim=*/false) : at::Tensor{}; - return std::make_tuple(grad_gates, grad_gates, grad_cx, grad_bias, grad_bias); + return std::make_tuple(grad_gates, grad_cx, grad_bias); } static constexpr int64_t GRU_WORKSPACE_MULTIPLIER = 5; diff --git a/aten/src/ATen/native/cuda/Randperm.cu b/aten/src/ATen/native/cuda/Randperm.cu index f0c41f5be444..b3c679f77724 100644 --- a/aten/src/ATen/native/cuda/Randperm.cu +++ b/aten/src/ATen/native/cuda/Randperm.cu @@ -1,9 +1,21 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + #include namespace at { diff --git a/aten/src/ATen/native/cuda/RangeFactories.cu b/aten/src/ATen/native/cuda/RangeFactories.cu index 027806ed4216..55981ac1ad8e 100644 --- a/aten/src/ATen/native/cuda/RangeFactories.cu +++ b/aten/src/ATen/native/cuda/RangeFactories.cu @@ -1,6 +1,6 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include -#include #include #include #include @@ -8,20 +8,39 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#endif + #define GPU_LAMBDA __device__ __host__ namespace { -constexpr int num_threads = C10_WARP_SIZE * 2; +#if defined(USE_ROCM) +constexpr int num_threads() { + return 128; +} +#else +constexpr int num_threads() { + return C10_WARP_SIZE * 2; +} +#endif constexpr int thread_work_size = 1; -constexpr int block_work_size = thread_work_size * num_threads; +constexpr int block_work_size = thread_work_size * num_threads(); template -C10_LAUNCH_BOUNDS_1(num_threads) +C10_LAUNCH_BOUNDS_1(num_threads()) __global__ void elementwise_kernel_with_index(index_t N, func_t f, typename function_traits::result_type *data) { #pragma unroll for (int i = 0; i < thread_work_size; i++) { - index_t idx = block_work_size * blockIdx.x + num_threads * i + threadIdx.x; + index_t idx = block_work_size * blockIdx.x + num_threads() * i + threadIdx.x; if (idx < N) { data[idx] = f(idx); } @@ -38,10 +57,10 @@ void gpu_kernel_with_index(at::Tensor &output, func_t f) { auto stream = at::cuda::getCurrentCUDAStream(); using scalar_t = typename function_traits::result_type; if (N <= std::numeric_limits::max()) { - elementwise_kernel_with_index<<>>(N, f, output.data_ptr()); + elementwise_kernel_with_index<<>>(N, f, output.data_ptr()); C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { - elementwise_kernel_with_index<<>>(N, f, output.data_ptr()); + elementwise_kernel_with_index<<>>(N, f, output.data_ptr()); C10_CUDA_KERNEL_LAUNCH_CHECK(); } } diff --git a/aten/src/ATen/native/cuda/RecordStream.cu b/aten/src/ATen/native/cuda/RecordStream.cu index d48561df00e5..c4cb74bdc68f 100644 --- a/aten/src/ATen/native/cuda/RecordStream.cu +++ b/aten/src/ATen/native/cuda/RecordStream.cu @@ -1,5 +1,13 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + namespace at { namespace native { void record_stream_cuda(Tensor& self, c10::Stream stream) { c10::cuda::CUDACachingAllocator::recordStream(self.storage().data_ptr(), at::cuda::CUDAStream::unpack(stream.pack())); diff --git a/aten/src/ATen/native/cuda/Reduce.cu b/aten/src/ATen/native/cuda/Reduce.cu index 103a386ff0c9..2de32f6d4a35 100644 --- a/aten/src/ATen/native/cuda/Reduce.cu +++ b/aten/src/ATen/native/cuda/Reduce.cu @@ -1,3 +1,4 @@ +#define TORCH_ASSERT_NO_OPERATORS #include #include 
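// RangeFactories above turns the block size into a constexpr function so ROCm
// can use a fixed 128-thread block while CUDA keeps two warps.  A standalone
// sketch of the same pattern around a simple arange-style kernel; everything
// except the USE_ROCM switch is illustrative.
#if defined(USE_ROCM)
constexpr int num_threads() { return 128; }
#else
constexpr int num_threads() { return 2 * 32; }  // two 32-wide warps per block
#endif

template <typename index_t>
__global__ void __launch_bounds__(num_threads())
arange_kernel(index_t n, float start, float step, float* data) {
  const index_t idx = static_cast<index_t>(blockIdx.x) * num_threads() + threadIdx.x;
  if (idx < n) {
    data[idx] = start + static_cast<float>(idx) * step;
  }
}

// Launched as:
//   arange_kernel<<<(n + num_threads() - 1) / num_threads(), num_threads(), 0, stream>>>(
//       n, start, step, data);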
diff --git a/aten/src/ATen/native/cuda/Reduce.cuh b/aten/src/ATen/native/cuda/Reduce.cuh index 5ee3757d5937..57fa55fbec7d 100644 --- a/aten/src/ATen/native/cuda/Reduce.cuh +++ b/aten/src/ATen/native/cuda/Reduce.cuh @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -17,6 +18,9 @@ #include #include +#include +#include + namespace at { namespace native { using at::detail::Array; @@ -272,6 +276,65 @@ func_wrapper_t func_wrapper(const func_t& op) { return func_wrapper_t { op }; } +template +struct ReduceJitOp { +//ReduceJitOp is almost like ReduceOp, but it doesn't have ops functor that specifies reduction operations +//Maybe we can find a way to unify ReduceOp and ReduceJitOp + using InputCalculator = OffsetCalculator<1, uint32_t>; + using OutputCalculator = OffsetCalculator<2, uint32_t>; + //TODO for now arg_t is always opmath_t of the input, later we'll need to change it + using arg_t = at::opmath_type; + + static constexpr int input_vec_size = ReduceConfig::input_vec_size; + //TODO - ReduceJitOp will probably need to be changed for reductions that need full functor, + //not just wrapper + arg_t ident; + ReduceConfig config; + InputCalculator input_calc; + OutputCalculator output_calc; + const void* src; + const char* dst[2]; //it accepts at most two destinations + // acc_buf used for accumulation among sub Tensor Iterator when accumulation on + // output is not permissible + void* acc_buf; + // cta_buf used for accumulation between blocks during global reduction + void* cta_buf; + int* semaphores; + int64_t base_idx; + bool accumulate; + bool final_output; + int noutputs; + + ReduceJitOp( + ReduceConfig config, + InputCalculator input_calc, + OutputCalculator output_calc, + const void* src, + char* dst0, + optional dst1, + void* acc_buf, + void* cta_buf, + int* semaphores, + arg_t ident, + int noutputs, + int64_t base_idx) + : ident(ident), + config(config), + input_calc(input_calc), + output_calc(output_calc), + src(src), + acc_buf(acc_buf), + cta_buf(cta_buf), + semaphores(semaphores), + base_idx(base_idx), + noutputs(noutputs) { + dst[0] = dst0; + if (dst1.has_value()) { + dst[1] = dst1.value(); + } + } +}; + template struct ReduceOp { using traits = function_traits; @@ -284,8 +347,6 @@ struct ReduceOp { std::is_convertible::value && std::is_convertible::value; - static constexpr float acc_buffer_multiplier = (float)sizeof(arg_t) / sizeof(out_scalar_t); - static constexpr int input_vec_size = ReduceConfig::input_vec_size; ops_t ops; @@ -837,6 +898,47 @@ static void launch_reduce_kernel(const ReduceConfig& config, const R& reduction) } } +template +static void launch_jitted_reduce_kernel(DeviceIndex idx, const ReduceConfig& config, +R& reduction, const std::string& func) { + constexpr int max_threads = mnt_wrapper::MAX_NUM_THREADS; + dim3 block = config.block(); + dim3 grid = config.grid(); + + static std::mutex _jiterator_mutex; + static std::vector> fns(c10::cuda::device_count()); + int shared_memory = config.shared_memory_size(); + at::cuda::jit::NvrtcFunction* fn_ptr; + switch(config.output_vec_size) { + case 4: + fn_ptr = &fns[idx][0]; + break; + case 2: + fn_ptr = &fns[idx][1]; + break; + default: + fn_ptr = &fns[idx][2]; + } + if (!fn_ptr->function) { + std::string f_inputs_type_str = at::cuda::jit::typeName(); + std::string accum_type_str = at::cuda::jit::typeName>(); + std::string result_type_str = at::cuda::jit::typeName(); + int max_threads_codegen = max_threads/config.output_vec_size; + auto code = at::cuda::jit::generate_reduction_code(1, 
func, name, vt0, + f_inputs_type_str, accum_type_str, result_type_str, + true, false, config.output_vec_size, max_threads_codegen); + + *fn_ptr = at::cuda::jit::jit_pwise_function(code, "reduction_"+std::string(name)); + + } + constexpr int kernel_args = 1; + void* args[kernel_args]; + args[0] = static_cast(&reduction); + at::cuda::jit::launch_jitted_pwise_function(*fn_ptr, args, grid, block, shared_memory); +} + + class AccumulationBuffer { public: AccumulationBuffer() {} @@ -874,7 +976,7 @@ class AccumulationBuffer { }; template -int get_output_vec_size(TensorIterator &iter) { +int get_output_vec_size(const TensorIterator &iter) { int vec_size = 4; auto update_vec_size = [&vec_size](uint64_t n) { while(n % vec_size != 0) { @@ -898,61 +1000,8 @@ int get_output_vec_size(TensorIterator &iter) { return vec_size; } -template -inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t ident=0, - AccumulationBuffer* acc_buf_ptr=nullptr, int64_t base_idx=0) { - AT_ASSERT(iter.numel() > 0 && iter.ntensors() - iter.noutputs() == 1 && iter.noutputs() >= 1); - - using traits = function_traits; - using arg_t = typename traits::template arg<0>::type; - static constexpr bool can_accumulate_in_output = - std::is_convertible::value; - - bool can_use_32bit_indexing = iter.can_use_32bit_indexing(); - std::unique_ptr owned_buf_ptr; - - // The acc_buf_ptr is a shared pointer. It is create at the first entrance and - // reused by all recursive function calls. - if (acc_buf_ptr == NULL) { - // acc_buf_ptr holds buffer used for accumulation among multiple sub_iter - // when accumulation in output is not possible. - if (!can_accumulate_in_output && !can_use_32bit_indexing) { - int64_t output_memory_size = iter.element_size(0); - for (int dim = 0; dim < iter.ndim(); dim++) { - output_memory_size = std::max(output_memory_size, iter.shape()[dim] * iter.strides(0)[dim]); - } - output_memory_size /= iter.element_size(0); //iter.strides is in bytes - owned_buf_ptr.reset(new AccumulationBuffer(sizeof(arg_t), - sizeof(out_scalar_t), - (char*) iter.data_ptr(0), - output_memory_size * sizeof(arg_t))); - } else { - owned_buf_ptr.reset(new AccumulationBuffer()); - } - acc_buf_ptr = owned_buf_ptr.get(); - } - - if (!can_use_32bit_indexing) { - for (auto& sub_iter : iter.with_32bit_indexing()) { - int64_t sub_iter_base_idx = sub_iter.view_offsets()[0]; - - gpu_reduce_kernel(sub_iter, ops, ident, - acc_buf_ptr, sub_iter_base_idx); - } - return; - } - - const char* in_data = (char*)iter.data_ptr(iter.ntensors() - 1); - char* out_data = (char*)iter.data_ptr(0); - const auto noutputs = iter.noutputs(); - optional out_data_extra; - if (noutputs > 1) { - out_data_extra = (char*)iter.data_ptr(1); - } else { - out_data_extra = nullopt; - } - char* acc_data = acc_buf_ptr->get_acc_slice(out_data); - +template +ReduceConfig setReduceConfig(const TensorIterator& iter){ // Start by assuming that each thread handles a single output and all // the inputs for that output. 
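// get_output_vec_size above (now taking the iterator by const reference) picks
// how wide the vectorized output stores may be.  A simplified standalone
// sketch of the rule: start at 4 and halve until every relevant extent divides
// evenly; the parameter names are illustrative stand-ins for what the real
// function reads from the TensorIterator.
#include <algorithm>
#include <cstdint>

int pick_output_vec_size(uint64_t num_output_elements,
                         uint64_t fastest_output_stride,
                         uint64_t base_offset_in_elements) {
  int vec_size = 4;
  auto shrink_until_divisible = [&vec_size](uint64_t n) {
    while (n % vec_size != 0) {
      vec_size /= 2;
    }
  };
  shrink_until_divisible(num_output_elements);
  shrink_until_divisible(fastest_output_stride);
  shrink_until_divisible(base_offset_in_elements);
  return std::max(vec_size, 1);
}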
int64_t num_outputs = iter.num_output_elements(); @@ -1080,7 +1129,64 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id config.input_mult[2] = config.split_input(config.ctas_per_output); } } + return config; +}; + +template +inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t ident=0, + AccumulationBuffer* acc_buf_ptr=nullptr, int64_t base_idx=0) { + AT_ASSERT(iter.numel() > 0 && iter.ntensors() - iter.noutputs() == 1 && iter.noutputs() >= 1); + + using traits = function_traits; + using arg_t = typename traits::template arg<0>::type; + static constexpr bool can_accumulate_in_output = + std::is_convertible::value; + + bool can_use_32bit_indexing = iter.can_use_32bit_indexing(); + std::unique_ptr owned_buf_ptr; + // The acc_buf_ptr is a shared pointer. It is create at the first entrance and + // reused by all recursive function calls. + if (acc_buf_ptr == NULL) { + // acc_buf_ptr holds buffer used for accumulation among multiple sub_iter + // when accumulation in output is not possible. + if (!can_accumulate_in_output && !can_use_32bit_indexing) { + int64_t output_memory_size = iter.element_size(0); + for (int dim = 0; dim < iter.ndim(); dim++) { + output_memory_size = std::max(output_memory_size, iter.shape()[dim] * iter.strides(0)[dim]); + } + output_memory_size /= iter.element_size(0); //iter.strides is in bytes + owned_buf_ptr.reset(new AccumulationBuffer(sizeof(arg_t), + sizeof(out_scalar_t), + (char*) iter.data_ptr(0), + output_memory_size * sizeof(arg_t))); + } else { + owned_buf_ptr.reset(new AccumulationBuffer()); + } + acc_buf_ptr = owned_buf_ptr.get(); + } + + if (!can_use_32bit_indexing) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + int64_t sub_iter_base_idx = sub_iter.view_offsets()[0]; + + gpu_reduce_kernel(sub_iter, ops, ident, + acc_buf_ptr, sub_iter_base_idx); + } + return; + } + + const char* in_data = (char*)iter.data_ptr(iter.ntensors() - 1); + char* out_data = (char*)iter.data_ptr(0); + const auto noutputs = iter.noutputs(); + optional out_data_extra; + if (noutputs > 1) { + out_data_extra = (char*)iter.data_ptr(1); + } else { + out_data_extra = nullopt; + } + char* acc_data = acc_buf_ptr->get_acc_slice(out_data); + ReduceConfig config = setReduceConfig(iter); at::DataPtr buffer; at::DataPtr semaphores; if (config.should_global_reduce()) { @@ -1115,4 +1221,101 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id launch_reduce_kernel::MAX_NUM_THREADS>(config, reduce); } +//TODO this is 100 lines of almost-copy-paste, because we have to have different template args for this function +//try unifying with gpu_reduce_kernel +template +inline void jitted_gpu_reduce_kernel(TensorIterator& iter, const std::string& func, ident_t ident=0, + AccumulationBuffer* acc_buf_ptr=nullptr, int64_t base_idx=0) { + AT_ASSERT(iter.numel() > 0 && iter.ntensors() - iter.noutputs() == 1 && iter.noutputs() >= 1); + + //TODO - this will be different for more complicated reductions, but for now reductions using + //func_wrapper all have arg_t = opmath + using arg_t = at::opmath_type; + static constexpr bool can_accumulate_in_output = + std::is_convertible::value; + static_assert(can_accumulate_in_output == true, "unsupported arg_t for jitted reduction"); + + bool can_use_32bit_indexing = iter.can_use_32bit_indexing(); + std::unique_ptr owned_buf_ptr; + + // The acc_buf_ptr is a shared pointer. It is create at the first entrance and + // reused by all recursive function calls. 
+ if (acc_buf_ptr == NULL) { + // acc_buf_ptr holds buffer used for accumulation among multiple sub_iter + // when accumulation in output is not possible. + if (!can_accumulate_in_output && !can_use_32bit_indexing) { + int64_t output_memory_size = iter.element_size(0); + for (int dim = 0; dim < iter.ndim(); dim++) { + output_memory_size = std::max(output_memory_size, iter.shape()[dim] * iter.strides(0)[dim]); + } + output_memory_size /= iter.element_size(0); //iter.strides is in bytes + owned_buf_ptr.reset(new AccumulationBuffer(sizeof(out_scalar_t), //TODO + sizeof(out_scalar_t), + (char*) iter.data_ptr(0), + output_memory_size * sizeof(out_scalar_t))); //TODO + } else { + owned_buf_ptr.reset(new AccumulationBuffer()); + } + acc_buf_ptr = owned_buf_ptr.get(); + } + + if (!can_use_32bit_indexing) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + int64_t sub_iter_base_idx = sub_iter.view_offsets()[0]; + + jitted_gpu_reduce_kernel(sub_iter, func, ident, + acc_buf_ptr, sub_iter_base_idx); + } + return; + } + + //TODO - for now we support a single input, we may be able to relax this constraint + const char* in_data = (char*)iter.data_ptr(iter.ntensors() - 1); + char* out_data = (char*)iter.data_ptr(0); + const auto noutputs = iter.noutputs(); + optional out_data_extra; + if (noutputs > 1) { + out_data_extra = (char*)iter.data_ptr(1); + } else { + out_data_extra = nullopt; + } + char* acc_data = acc_buf_ptr->get_acc_slice(out_data); + + ReduceConfig config = setReduceConfig(iter); + + at::DataPtr buffer; + at::DataPtr semaphores; + if (config.should_global_reduce()) { + auto& allocator = *c10::cuda::CUDACachingAllocator::get(); + buffer = allocator.allocate(config.global_memory_size()); + semaphores = allocator.allocate(config.semaphore_size()); + + auto stream = at::cuda::getCurrentCUDAStream(); + AT_CUDA_CHECK(cudaMemsetAsync(semaphores.get(), 0, config.semaphore_size(), stream)); + } + + AT_ASSERT(can_use_32bit_indexing); + auto output_calc = make_output_calculator(iter); + auto input_calc = make_input_calculator(iter); + auto reduce = ReduceJitOp( + config, + input_calc, + output_calc, + in_data, + out_data, + out_data_extra, + acc_data, + buffer.get(), + (int*)semaphores.get(), + ident, + noutputs, + base_idx); + reduce.accumulate = iter.should_accumulate(); + reduce.final_output = iter.is_final_output(); + + launch_jitted_reduce_kernel(iter.device().index(), + config, reduce, func); +} + }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/ReduceOps.cpp b/aten/src/ATen/native/cuda/ReduceOps.cpp index ec1cbd3b64fb..ab878f82e3a0 100644 --- a/aten/src/ATen/native/cuda/ReduceOps.cpp +++ b/aten/src/ATen/native/cuda/ReduceOps.cpp @@ -1,13 +1,29 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include #include #include #include #include +#include +#include +#include +#include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#endif + namespace at { namespace native { namespace { @@ -33,12 +49,6 @@ void norm_kernel_cuda(TensorIterator& iter, const Scalar& val) { } -void linalg_vector_norm_kernel_cuda(TensorIterator& iter, Scalar ord) { - TORCH_CHECK(ord.isFloatingPoint(), "linalg.vector_norm expects ord to be float"); - norm_kernel_cuda(iter, ord); -} - - void min_kernel_impl(const Tensor& result, const Tensor& indice, const Tensor& self, int64_t dim, bool keepdim) { auto iter = meta::make_reduction(self, result, indice, dim, keepdim, self.scalar_type(), kLong); min_launch_kernel(iter); 
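// When a reduction needs more than one block per output ("global reduce"), the
// code above allocates a scratch buffer plus per-output int semaphores and
// zeroes the semaphores asynchronously on the launch stream before the kernel
// runs.  A bare CUDA sketch of that setup (the real code goes through the
// CUDA caching allocator); the function name is illustrative.
#include <cuda_runtime.h>

cudaError_t alloc_global_reduce_scratch(void** cta_buffer, int** semaphores,
                                        size_t buffer_bytes, size_t semaphore_bytes,
                                        cudaStream_t stream) {
  cudaError_t err = cudaMalloc(cta_buffer, buffer_bytes);
  if (err != cudaSuccess) return err;
  err = cudaMalloc(reinterpret_cast<void**>(semaphores), semaphore_bytes);
  if (err != cudaSuccess) return err;
  // The reduction kernel assumes every semaphore starts at zero.
  return cudaMemsetAsync(*semaphores, 0, semaphore_bytes, stream);
}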
@@ -86,6 +96,5 @@ REGISTER_CUDA_DISPATCH(aminmax_allreduce_stub, &aminmax_allreduce_kernel_impl); REGISTER_CUDA_DISPATCH(aminmax_stub, &aminmax_kernel_impl); REGISTER_CUDA_DISPATCH(norm_stub, &norm_kernel_cuda); -REGISTER_CUDA_DISPATCH(linalg_vector_norm_stub, &linalg_vector_norm_kernel_cuda); }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/ReduceSumProdKernel.cu b/aten/src/ATen/native/cuda/ReduceSumProdKernel.cu index bf81ed5b7940..be1d7c515a3e 100644 --- a/aten/src/ATen/native/cuda/ReduceSumProdKernel.cu +++ b/aten/src/ATen/native/cuda/ReduceSumProdKernel.cu @@ -5,6 +5,8 @@ #include #include #include +#include +#include namespace at { namespace native { @@ -18,6 +20,35 @@ struct sum_functor { } }; +// jiterated specialization for `complex` +const char sum_name[] = "sum"; +template <> +struct sum_functor> { +// jiterator reduction fails on windows +// Ref: https://github.com/pytorch/pytorch/issues/77305 +#if AT_USE_JITERATOR() && !defined(_MSC_VER) + void operator()(TensorIterator& iter) { + using scalar_t = c10::complex; + std::string func = jiterator_stringify( + arg_t combine(arg_t a, arg_t b) { + return a + b; + } + ); + jitted_gpu_reduce_kernel( + iter, func, 0.); + } +#else + void operator()(TensorIterator& iter) { + using scalar_t = c10::complex; + using acc_t = at::opmath_type; + gpu_reduce_kernel( + iter, func_wrapper([] GPU_LAMBDA(acc_t a, acc_t b) -> acc_t { + return a + b; + }), acc_t{0.}); + } +#endif +}; + template struct nansum_functor { void operator()(TensorIterator& iter) { @@ -26,14 +57,30 @@ struct nansum_functor { } }; +const char prod_name[] = "prod"; + template struct prod_functor { + // jiterator reduction fails on windows + // Ref: https://github.com/pytorch/pytorch/issues/77305 + #if AT_USE_JITERATOR() && !defined(_MSC_VER) + void operator()(TensorIterator& iter) { + std::string func = jiterator_stringify( + arg_t combine(arg_t a, arg_t b) { + return a * b; + } + ); + jitted_gpu_reduce_kernel( + iter, func, 1.); + } + #else void operator()(TensorIterator& iter) { gpu_reduce_kernel( iter, func_wrapper([] GPU_LAMBDA(acc_t a, acc_t b) -> acc_t { return a * b; - }), 1); + }), 1.); } + #endif }; // Workaround for the error: '*' in boolean context, suggest '&&' instead [-Werror=int-in-bool-context] @@ -47,6 +94,31 @@ struct prod_functor { } }; +// jiterated specialization for `complex` +template <> +struct prod_functor> { +// jiterator reduction fails on windows +// Ref: https://github.com/pytorch/pytorch/issues/77305 +#if AT_USE_JITERATOR() && !defined(_MSC_VER) + void operator()(TensorIterator& iter) { + using scalar_t = c10::complex; + std::string func = + jiterator_stringify(arg_t combine(arg_t a, arg_t b) { return a * b; }); + jitted_gpu_reduce_kernel(iter, func, 1.); + } +#else + void operator()(TensorIterator& iter) { + using scalar_t = c10::complex; + using acc_t = at::opmath_type; + gpu_reduce_kernel( + iter, + func_wrapper( + [] GPU_LAMBDA(acc_t a, acc_t b) -> acc_t { return a * b; }), + acc_t{1.}); + } +#endif +}; + // The function `reduce_dispatch` below dispatches to the kernel based // on the type of `iter`. It takes care of the common logic // for handling Half-Precision floating types. 
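// sum_functor / prod_functor above describe a reduction purely by a combine
// function and its identity (0 for sum, 1 for prod), whether the combine body
// is jiterated from a string or compiled as a lambda.  A trivial host-side
// sketch of that shape:
#include <vector>

template <typename acc_t, typename combine_t>
acc_t reduce_all(const std::vector<acc_t>& values, combine_t combine, acc_t identity) {
  acc_t acc = identity;
  for (const acc_t& v : values) {
    acc = combine(acc, v);
  }
  return acc;
}

// Usage: float p = reduce_all<float>(xs, [](float a, float b) { return a * b; }, 1.0f);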
@@ -79,8 +151,8 @@ static void reduce_dispatch(TensorIterator& iter, GeneralDispatcher op) { static void sum_kernel_cuda(TensorIterator& iter){ auto general_dispatcher = [](TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND( - ScalarType::Bool, iter.dtype(), "sum_cuda", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( + kBool, kComplexHalf, iter.dtype(), "sum_cuda", [&]() { sum_functor{}(iter); }); }; @@ -100,7 +172,7 @@ static void nansum_kernel_cuda(TensorIterator& iter) { static void prod_kernel_cuda(TensorIterator& iter) { auto general_dispatcher = [](TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(ScalarType::Bool, iter.dtype(), "prod_cuda", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kComplexHalf, kBool, iter.dtype(), "prod_cuda", [&]() { prod_functor{}(iter); }); }; diff --git a/aten/src/ATen/native/cuda/ReflectionPad.cu b/aten/src/ATen/native/cuda/ReflectionPad.cu index e497bae885f0..33f71368ca10 100644 --- a/aten/src/ATen/native/cuda/ReflectionPad.cu +++ b/aten/src/ATen/native/cuda/ReflectionPad.cu @@ -1,12 +1,27 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include +#include #include #include #include -#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#endif + #include namespace at { diff --git a/aten/src/ATen/native/cuda/Repeat.cu b/aten/src/ATen/native/cuda/Repeat.cu index 43d6602ea8e2..1b29dac6690f 100644 --- a/aten/src/ATen/native/cuda/Repeat.cu +++ b/aten/src/ATen/native/cuda/Repeat.cu @@ -1,7 +1,15 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + template __global__ static void compute_cuda_kernel( index_t* repeat_ptr, @@ -33,7 +41,7 @@ static void compute_cuda( int64_t size, int64_t result_size) { int64_t block = 512; - int64_t warps_per_block = block / C10_WARP_SIZE; + int64_t warps_per_block = block / at::cuda::warp_size(); int64_t grid = std::min((size + warps_per_block - 1) / warps_per_block, 2048L); diff --git a/aten/src/ATen/native/cuda/ReplicationPadding.cu b/aten/src/ATen/native/cuda/ReplicationPadding.cu index 754161c62097..d967ffd0354d 100644 --- a/aten/src/ATen/native/cuda/ReplicationPadding.cu +++ b/aten/src/ATen/native/cuda/ReplicationPadding.cu @@ -1,13 +1,26 @@ -#include +#include #include +#include #include #include #include -#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#endif + #include #include #include diff --git a/aten/src/ATen/native/cuda/Resize.cpp b/aten/src/ATen/native/cuda/Resize.cpp index c4167ec56e67..43e1cb951574 100644 --- a/aten/src/ATen/native/cuda/Resize.cpp +++ b/aten/src/ATen/native/cuda/Resize.cpp @@ -1,10 +1,16 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include #include -#include -#include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/Resize.h b/aten/src/ATen/native/cuda/Resize.h index 33ab263693dc..569b145fa61d 100644 --- a/aten/src/ATen/native/cuda/Resize.h +++ b/aten/src/ATen/native/cuda/Resize.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -9,19 +9,15 @@ namespace at { namespace native { TORCH_CUDA_CPP_API void resize_bytes_cuda(StorageImpl* storage, size_t 
size_bytes); -static inline void maybe_resize_storage_cuda(TensorImpl* self, uint64_t new_size) { +static inline void maybe_resize_storage_cuda(TensorImpl* self, size_t new_size_bytes) { // It does not make sense to try to resize a storage // to hold 0 elements, and this can break // if storage_offset is positive but // new_size is 0, so just bail in that case // (same comment is in Resize.h) - if (new_size == 0) { + if (self->numel() == 0) { return; } - auto new_size_bytes_i = (new_size + self->storage_offset()) * self->dtype().itemsize(); - TORCH_CHECK(!overflows(new_size_bytes_i), "Requested storage size (", - new_size_bytes_i, ") cannot be represented as a size_t"); - const auto new_size_bytes = static_cast(new_size_bytes_i); const Storage &storage = self->unsafe_storage(); TORCH_CHECK(storage, "Tensor: invalid null storage"); @@ -33,7 +29,7 @@ static inline void maybe_resize_storage_cuda(TensorImpl* self, uint64_t new_size inline TensorImpl* resize_impl_cuda_( TensorImpl* self, IntArrayRef size, - c10::optional stride, + at::OptionalIntArrayRef stride, bool device_guard = true) { if (self->sizes() == size && (!stride || self->strides() == stride)) { return self; @@ -45,14 +41,17 @@ inline TensorImpl* resize_impl_cuda_( guard.set_index(self->storage().device().index()); } - int64_t storage_size = 1; + const auto itemsize = self->dtype().itemsize(); + const auto storage_offset = self->storage_offset(); + size_t storage_size = 1; if (stride) { self->set_sizes_and_strides(size, *stride); - // NB: storage size can be different from numel. - storage_size = storage_size_for(size, *stride); + storage_size = at::detail::computeStorageNbytes( + size, *stride, itemsize, storage_offset); } else { self->set_sizes_contiguous(size); - storage_size = self->numel(); + storage_size = at::detail::computeStorageNbytesContiguous( + size, itemsize, storage_offset); } maybe_resize_storage_cuda(self, storage_size); diff --git a/aten/src/ATen/native/cuda/RreluWithNoise.cu b/aten/src/ATen/native/cuda/RreluWithNoise.cu index 048118cf7925..3b2435d3dae4 100644 --- a/aten/src/ATen/native/cuda/RreluWithNoise.cu +++ b/aten/src/ATen/native/cuda/RreluWithNoise.cu @@ -1,6 +1,18 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#endif + namespace at { namespace native { @@ -132,6 +144,12 @@ Tensor& rrelu_with_noise_out_cuda(const Tensor& self, bool training, c10::optional generator, Tensor& output) { + at::native::resize_output(output, self.sizes()); + + if (self.numel() == 0) { + return output; + } + TensorArg self_arg{self, "self", 1}, noise_arg{noise, "noise", 2}, output_arg{output, "output", 3}; checkAllSameGPU("rrelu_with_noise_out_cuda", {self_arg, noise_arg, output_arg}); diff --git a/aten/src/ATen/native/cuda/ScanKernels.cpp b/aten/src/ATen/native/cuda/ScanKernels.cpp index f88faa1fcac9..8ba8b742af77 100644 --- a/aten/src/ATen/native/cuda/ScanKernels.cpp +++ b/aten/src/ATen/native/cuda/ScanKernels.cpp @@ -1,10 +1,21 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include -#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#endif + namespace at { namespace native { static c10::MaybeOwned contiguous_out_arg(const Tensor &tensor) { diff --git a/aten/src/ATen/native/cuda/ScanKernels.h b/aten/src/ATen/native/cuda/ScanKernels.h index a502847f6307..28e65372511b 100644 --- 
a/aten/src/ATen/native/cuda/ScanKernels.h +++ b/aten/src/ATen/native/cuda/ScanKernels.h @@ -1,3 +1,4 @@ +#pragma once #include namespace at { diff --git a/aten/src/ATen/native/cuda/ScatterGatherKernel.cu b/aten/src/ATen/native/cuda/ScatterGatherKernel.cu index 4ec12e166634..8461aa4cd8e3 100644 --- a/aten/src/ATen/native/cuda/ScatterGatherKernel.cu +++ b/aten/src/ATen/native/cuda/ScatterGatherKernel.cu @@ -1,6 +1,7 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include +#include #include #include @@ -9,6 +10,7 @@ #include #include +#include #include #include #include @@ -19,8 +21,9 @@ namespace at { namespace native { class ReduceMultiply { public: template - constexpr C10_DEVICE void operator() (scalar_t * self_data, const scalar_t * src_data) const { - gpuAtomicMul(self_data, *src_data); + constexpr C10_DEVICE void operator() (scalar_t* self_data_start, int64_t index, int64_t numel, const scalar_t * src_data) const { + (void)numel; // suppress unused warning + gpuAtomicMul(self_data_start + index, *src_data); } }; static ReduceMultiply reduce_multiply; @@ -28,17 +31,47 @@ static ReduceMultiply reduce_multiply; class ReduceAdd { public: template - constexpr C10_DEVICE void operator() (scalar_t * self_data, const scalar_t * src_data) const { - gpuAtomicAddNoReturn(self_data, *src_data); + constexpr C10_DEVICE void operator() (scalar_t* self_data_start, int64_t index, int64_t numel, const scalar_t * src_data) const { + fastAtomicAdd(self_data_start, index, numel, *src_data, true); } }; static ReduceAdd reduce_add; +class ReduceMean { +public: + template + constexpr C10_DEVICE void operator() (scalar_t* self_data_start, int64_t index, int64_t numel, const scalar_t * src_data) const { + fastAtomicAdd(self_data_start, index, numel, *src_data, true); + } +}; +static ReduceMean reduce_mean; + +class ReduceMinimum { +public: + template + constexpr C10_DEVICE void operator() (scalar_t* self_data_start, int64_t index, int64_t numel, const scalar_t * src_data) const { + (void)numel; // suppress unused warning + gpuAtomicMin(self_data_start + index, *src_data); + } +}; +static ReduceMinimum reduce_minimum; + +class ReduceMaximum { +public: + template + constexpr C10_DEVICE void operator() (scalar_t* self_data_start, int64_t index, int64_t numel, const scalar_t * src_data) const { + (void)numel; // suppress unused warning + gpuAtomicMax(self_data_start + index, *src_data); + } +}; +static ReduceMaximum reduce_maximum; + class TensorAssign { public: template - constexpr C10_DEVICE void operator() (scalar_t * self_data, const scalar_t * src_data) const { - *self_data = *src_data; + constexpr C10_DEVICE void operator() (scalar_t* self_data_start, int64_t index, int64_t numel, const scalar_t * src_data) const { + (void)numel; // suppress unused warning + *(self_data_start + index) = *src_data; } }; static TensorAssign tensor_assign; @@ -87,12 +120,13 @@ struct _cuda_scatter_gather_internal_kernel { TensorIterator& iter, int64_t index_size, int64_t index_stride, + int64_t numel, // Do not use `const` qualifier here as it may cause issue in cuda 11.6.x. 
See #75434, #75545 const func_t& f ) { if (!iter.can_use_32bit_indexing()) { for (auto& sub_iter : iter.with_32bit_indexing()) { _cuda_scatter_gather_internal_kernel()( - sub_iter, index_size, index_stride, f + sub_iter, index_size, index_stride, numel, f ); } return; @@ -110,14 +144,12 @@ struct _cuda_scatter_gather_internal_kernel { CUDA_KERNEL_ASSERT(idx_dim >= 0 && idx_dim < index_size && "index out of bounds"); - char* self_data = self_ptr + offsets[0]; - char* src_data = src_ptr + offsets[1]; - f( - (scalar_t*)self_data + (is_scatter_like ? idx_dim * index_stride : 0), - (scalar_t*)src_data + (is_scatter_like ? 0 : idx_dim * index_stride) + (scalar_t*)(self_ptr + offsets[0]), + is_scatter_like ? idx_dim * index_stride : 0, + numel, + (scalar_t*)(src_ptr + offsets[1]) + (is_scatter_like ? 0 : idx_dim * index_stride) ); - }; _launch_scatter_gather_kernel(iter.numel(), loop); @@ -126,12 +158,11 @@ struct _cuda_scatter_gather_internal_kernel { template struct cuda_scatter_gather_base_kernel { - template void operator()( const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src, const std::string& method_name, - const func_t& f + const ReduceAdd& f ) { at::assert_no_internal_overlap(self); @@ -179,7 +210,7 @@ struct cuda_scatter_gather_base_kernel { OpaqueType, scalar_t>::type; _cuda_scatter_gather_internal_kernel()( - iter, index_size, index_stride, f + iter, index_size, index_stride, self.numel(), f ); } ); @@ -189,7 +220,66 @@ struct cuda_scatter_gather_base_kernel { const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src, const std::string& method_name, - const ReduceMultiply& f + const TensorAssign& f + ) { + at::assert_no_internal_overlap(self); + + auto index_sizes = ensure_nonempty_vec(index.sizes().vec()); + auto self_strides = ensure_nonempty_vec(self.strides().vec()); + auto src_strides = ensure_nonempty_vec(src.strides().vec()); + + // restride self and src such that + // self.shape = src.shape = index.shape + // + // restride stride[dim] such that + // if (is_scatter_like) self.stride[dim] = 0 + // else src.stride[dim] = 0 + auto self_restrided = is_scatter_like ? + restride_dim(self, dim, index_sizes) + : self.as_strided(index_sizes, self_strides); + auto src_restrided = is_scatter_like ? + src.as_strided(index_sizes, src_strides) + : restride_dim(src, dim, index_sizes); + + auto iter = TensorIteratorConfig() + .set_check_mem_overlap(false) + .check_all_same_dtype(false) + .resize_outputs(false) + .add_output(self_restrided) + .add_input(src_restrided) + .add_input(index) + .build(); + + auto self_dim_stride = ensure_nonempty_stride(self, dim); + auto self_dim_size = ensure_nonempty_size(self, dim); + + auto src_dim_stride = ensure_nonempty_stride(src, dim); + auto src_dim_size = ensure_nonempty_size(src, dim); + + auto index_size = is_scatter_like ? self_dim_size : src_dim_size; + auto index_stride = is_scatter_like ? 
self_dim_stride : src_dim_stride; + + + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( + at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, + iter.dtype(), + "cuda_scatter_gather_base_kernel_func", [&] { + using dtype = typename std::conditional, scalar_t>::type; + + _cuda_scatter_gather_internal_kernel()( + iter, index_size, index_stride, self.numel(), f + ); + } + ); + } + + template + void operator()( + const Tensor& self, int64_t dim, + const Tensor& index, const Tensor& src, + const std::string& method_name, + const func_t& f ) { at::assert_no_internal_overlap(self); @@ -232,12 +322,12 @@ struct cuda_scatter_gather_base_kernel { AT_DISPATCH_FLOATING_TYPES_AND2( at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), - "cuda_scatter_gather_base_kernel_reduce_multiply", [&] { + "cuda_scatter_gather_base_kernel_func", [&] { using dtype = typename std::conditional, scalar_t>::type; _cuda_scatter_gather_internal_kernel()( - iter, index_size, index_stride, f + iter, index_size, index_stride, self.numel(), f ); } ); @@ -252,12 +342,13 @@ struct _cuda_scatter_fill_internal_kernel { scalar_t src_val, int64_t index_size, int64_t index_stride, + int64_t numel, // Do not use `const` qualifier here as it may cause issue in cuda 11.6.x. See #75434, #75545 const func_t& f ) { if (!iter.can_use_32bit_indexing()) { for (auto& sub_iter : iter.with_32bit_indexing()) { _cuda_scatter_fill_internal_kernel()( - sub_iter, src_val, index_size, index_stride, f + sub_iter, src_val, index_size, index_stride, numel, f ); } return; @@ -275,13 +366,12 @@ struct _cuda_scatter_fill_internal_kernel { && "index out of bounds" ); - char* self_data = self_ptr + offsets[0]; - f( - (scalar_t*)self_data + idx_dim * index_stride, + (scalar_t*)(self_ptr + offsets[0]), + idx_dim * index_stride, + numel, (scalar_t*)&src_val ); - }; _launch_scatter_gather_kernel(iter.numel(), loop); @@ -328,7 +418,7 @@ struct cuda_scatter_fill_base_kernel { auto src_val = *(dtype*)&src_scalar_val; _cuda_scatter_fill_internal_kernel()( - iter, src_val, index_size, index_stride, f + iter, src_val, index_size, index_stride, self.numel(), f ); } ); @@ -371,7 +461,7 @@ struct cuda_scatter_fill_base_kernel { auto src_val = *(dtype*)&src_scalar_val; _cuda_scatter_fill_internal_kernel()( - iter, src_val, index_size, index_stride, f + iter, src_val, index_size, index_stride, self.numel(), f ); } ); @@ -416,6 +506,35 @@ void scatter_reduce_cuda_kernel(const Tensor& self, const int64_t dim, const Ten cuda_scatter_gather_base_kernel()(self, dim, index, src, "scatter_reduce_cuda_multiply_", reduce_multiply); break; + default : + break; + } +} + +void scatter_reduce_two_cuda_kernel(const Tensor& self, const int64_t dim, const Tensor& index, + const Tensor& src, const SCATTER_GATHER_OP& reduce) { + globalContext().alertNotDeterministic("scatter_reduce_cuda"); + switch (reduce) { + case SCATTER_GATHER_OP::REDUCE_ADD : + cuda_scatter_gather_base_kernel()(self, dim, index, src, + "scatter_reduce_cuda_sum_", reduce_add); + break; + case SCATTER_GATHER_OP::REDUCE_MULTIPLY : + cuda_scatter_gather_base_kernel()(self, dim, index, src, + "scatter_reduce_cuda_prod_", reduce_multiply); + break; + case SCATTER_GATHER_OP::REDUCE_MAXIMUM : + cuda_scatter_gather_base_kernel()(self, dim, index, src, + "scatter_reduce_cuda_amax_", reduce_maximum); + break; + case SCATTER_GATHER_OP::REDUCE_MINIMUM : + cuda_scatter_gather_base_kernel()(self, dim, index, src, + "scatter_reduce_cuda_amin_", reduce_minimum); + break; + case SCATTER_GATHER_OP::REDUCE_MEAN : + 
cuda_scatter_gather_base_kernel()(self, dim, index, src, + "scatter_reduce_cuda_mean_", reduce_mean); + break; } } @@ -430,6 +549,8 @@ void scatter_scalar_reduce_cuda_kernel(const Tensor& self, const int64_t dim, co cuda_scatter_fill_base_kernel()(self, dim, index, value, "scatter_fill_cuda_multiply_", reduce_multiply); break; + default : + break; } } @@ -440,5 +561,6 @@ REGISTER_DISPATCH(scatter_fill_stub, &scatter_fill_cuda_kernel); REGISTER_DISPATCH(scatter_add_stub, &scatter_add_cuda_kernel); REGISTER_DISPATCH(scatter_reduce_stub, &scatter_reduce_cuda_kernel); REGISTER_DISPATCH(scatter_scalar_reduce_stub, &scatter_scalar_reduce_cuda_kernel); +REGISTER_DISPATCH(scatter_reduce_two_stub, &scatter_reduce_two_cuda_kernel); }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/SegmentReduce.cu b/aten/src/ATen/native/cuda/SegmentReduce.cu index 6a5a768ae0d8..862de29c76cb 100644 --- a/aten/src/ATen/native/cuda/SegmentReduce.cu +++ b/aten/src/ATen/native/cuda/SegmentReduce.cu @@ -1,12 +1,20 @@ - +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include +#include +#include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif + namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/Shape.cu b/aten/src/ATen/native/cuda/Shape.cu index 17eb91973075..08605cf4ed1b 100644 --- a/aten/src/ATen/native/cuda/Shape.cu +++ b/aten/src/ATen/native/cuda/Shape.cu @@ -1,4 +1,5 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include #include @@ -9,14 +10,21 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#endif + namespace at { namespace native { -#if defined(USE_ROCM) -constexpr int CAT_ARRAY_BATCH_SIZE = 1024; -#else constexpr int CAT_ARRAY_BATCH_SIZE = 128; -#endif constexpr int CAT_ARRAY_MAX_INPUT_DIMS = 4; namespace { @@ -83,45 +91,6 @@ struct TensorSizeStride { */ -// Use pinned memory and and pass the struct by pointer on ROCm -template -struct CatArrInputTensor { - T* input; - IndexType offset; - IndexType dimSize; - IndexType nElements; -}; - -template -C10_LAUNCH_BOUNDS_1(512) -__global__ void HIP_CatArrayBatchedCopy( - T* output, - CatArrInputTensor* inputs, - TensorSizeStride os, - const int concatDim, - IndexType dimStride) { - - IndexType tid = blockIdx.x * blockDim.x + threadIdx.x; - IndexType nElements = inputs[blockIdx.y].nElements; - - if(tid >= nElements) return; - - T* data = inputs[blockIdx.y].input; - IndexType offset = inputs[blockIdx.y].offset; - IndexType dimSize = inputs[blockIdx.y].dimSize; - IndexType dataOffset = offset * dimStride; - - IndexType stride = gridDim.x * blockDim.x; - - while( tid < nElements){ - IndexType elementOffset = CatArrIndexToOffset::compute( - os.tensorSize, os.tensorStride, dimSize, concatDim, tid); - output[dataOffset + elementOffset] = data[tid]; - - tid += stride; - } -} - // pass meta data directly through kernel argument instead of pin memory // In contiguous case, we will not need stride_size, setting it as 1 as placeholder // to pass compile. @@ -171,129 +140,8 @@ __global__ void CatArrayBatchedCopy( } } -template -void hip_parallel_cat(Tensor &out, const TensorList &inputs, int64_t dimension, - int nDims, c10::MemoryFormat memory_format) { - // First, let's set up our kernel parameters. We start with a raw pointer to - // the storage for the output Tensor. 
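// The new scatter_reduce_two kernel above folds src[i] into out[index[i]] with
// one atomic per reduction kind ("amax" -> gpuAtomicMax, "sum"/"mean" ->
// fastAtomicAdd, and so on).  A bare CUDA sketch of the amax case for int
// data, where the hardware atomicMax applies directly (floating point needs a
// compare-and-swap loop, which the gpuAtomicMax wrapper hides).  Indices are
// assumed to be in range.
#include <cstdint>

__global__ void scatter_amax_int(int* out, const int* src, const int64_t* index, int64_t n) {
  const int64_t i = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (i < n) {
    atomicMax(out + index[i], src[i]);
  }
}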
- scalar_t *data = out.data_ptr(); - - // Kernel Parameter - long tensorMetadataSize = - sizeof(CatArrInputTensor) * CAT_ARRAY_BATCH_SIZE; - auto d_inputs_storage = at::empty( - {tensorMetadataSize}, out.options().dtype(at::kByte)); - auto d_inputs = static_cast *>( - d_inputs_storage.data_ptr()); - - TensorSizeStride outputParam; - - // Next, let's initialize the size, stride arrays for the output Tensor. - if (memory_format == c10::MemoryFormat::Contiguous) { - for (int i = 0; i < nDims; ++i) { - outputParam.tensorSize[i] = at::native::size(out, i); - outputParam.tensorStride[i] = out.stride(i); - } - } else if (memory_format == c10::MemoryFormat::ChannelsLast || memory_format == c10::MemoryFormat::ChannelsLast3d) { - // permute the semantics of dims from NCHW to NHWC so that the input - // tensor is now contiguous - outputParam.tensorSize[0] = at::native::size(out, 0); - outputParam.tensorStride[0] = out.stride(0); - for (int i = 1; i < nDims - 1; ++i) { - outputParam.tensorSize[i] = at::native::size(out, i + 1); - outputParam.tensorStride[i] = out.stride(i + 1); - } - outputParam.tensorSize[nDims - 1] = at::native::size(out, 1); - outputParam.tensorStride[nDims - 1] = out.stride(1); - } else { - TORCH_CHECK(false, "unsupported memory format"); - } - - at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream(); - - // Now we loop - int batchCounter = 0; - int64_t offset = 0; - for (int i = 0; i < inputs.size() ; i += CAT_ARRAY_BATCH_SIZE) { - // Re-allocate stackInputs every iteration to avoid read-after-write hazard - { - auto stackInputs_storage = at::empty({tensorMetadataSize}, - out.options().dtype(at::kByte).device(at::kCPU).pinned_memory(true)); - auto stackInputs = - static_cast *>( - stackInputs_storage.data_ptr()); - for (batchCounter = 0; - batchCounter < CAT_ARRAY_BATCH_SIZE && - (i+batchCounter) < inputs.size(); - ++batchCounter) { - int64_t dimSize = 0; - // There is a legacy case where a 1-D empty tensor can be concat with - // high-dimensional tensor - if (inputs[i+batchCounter].numel() > 0) { - dimSize = at::native::size(inputs[i+batchCounter], dimension); - } - - stackInputs[batchCounter].input = - inputs[i+batchCounter].data_ptr(); - stackInputs[batchCounter].offset = offset; - stackInputs[batchCounter].dimSize = dimSize; - stackInputs[batchCounter].nElements = inputs[i+batchCounter].numel(); - - // update offset - offset += dimSize; - } - at::native::copy_(d_inputs_storage, stackInputs_storage, - /* non_blocking= */ true); - } - - // Next, let's consider how we set our kernel launch parameters. - // We borrow from THCApply, which the kernel's internal indexing - // is based on. - dim3 applyBlock = dim3(32*16); - - //Get grid where x dim fills half gpu and y dim is number of tensors. - //This will have cating two tensors fill the entire grid, but prevent - //many threads from needlessly load meta data if their sizes is small. 
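// The surviving CUDA path (parallel_cat / CatArrayBatchedCopy below) packs
// pointers and offsets for up to CAT_ARRAY_BATCH_SIZE inputs into one struct
// passed by value as a kernel argument, so a single launch copies a whole
// batch and the ROCm-only pinned-memory variant above could be dropped.  A
// simplified sketch for the easy case of concatenating contiguous tensors
// along dim 0, where each input is a contiguous slab of the output; names are
// illustrative, and the struct stays well under the 4 KiB kernel-argument limit.
#include <cstdint>

constexpr int kCatBatch = 128;  // mirrors CAT_ARRAY_BATCH_SIZE

template <typename T>
struct CatBatchMeta {
  const T* input[kCatBatch];
  int64_t offset[kCatBatch];   // start of this input along the cat dimension
  int64_t numel[kCatBatch];    // number of elements to copy from this input
};

template <typename T>
__global__ void cat_batched_copy(T* out, CatBatchMeta<T> meta, int64_t outer_stride) {
  const int b = blockIdx.y;                             // which input of the batch
  const T* in = meta.input[b];
  const int64_t n = meta.numel[b];
  const int64_t base = meta.offset[b] * outer_stride;   // slab start in the output
  for (int64_t i = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
       i < n; i += static_cast<int64_t>(gridDim.x) * blockDim.x) {
    out[base + i] = in[i];
  }
}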
- dim3 catGrid; - getCatGrid(batchCounter, catGrid); - - if (memory_format != c10::MemoryFormat::Contiguous) { - switch (dimension) { - case 0: - break; - case 1: - dimension = nDims - dimension; - break; - default: - dimension--; - } - } - // Template Declarations for dim = 1, 2, 3, 4 -#define HANDLE_CASE(DIMS) \ - HIP_CatArrayBatchedCopy<<<\ - catGrid, applyBlock, 0, stream.stream()>>>(\ - data, d_inputs, outputParam, dimension, outputParam.tensorStride[dimension]); \ - C10_CUDA_KERNEL_LAUNCH_CHECK(); - switch (nDims) { - case 1: - HANDLE_CASE(1); - break; - case 2: - HANDLE_CASE(2); - break; - case 3: - HANDLE_CASE(3); - break; - case 4: - HANDLE_CASE(4); - break; - } -#undef HANDLE_CASE - } -} - template -void parallel_cat(Tensor &out, const TensorList &inputs, int64_t dimension, +void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, int64_t dimension, int nDims, c10::MemoryFormat memory_format) { // First, let's set up our kernel parameters. We start with a raw pointer to // the storage for the output Tensor. @@ -304,19 +152,19 @@ void parallel_cat(Tensor &out, const TensorList &inputs, int64_t dimension, // Next, let's initialize the size, stride arrays for the output Tensor. if (memory_format == c10::MemoryFormat::Contiguous) { for (int i = 0; i < nDims; ++i) { - outputParam.tensorSize[i] = at::native::size(out, i); + outputParam.tensorSize[i] = out.size(i); outputParam.tensorStride[i] = out.stride(i); } } else if (memory_format == c10::MemoryFormat::ChannelsLast || memory_format == c10::MemoryFormat::ChannelsLast3d) { // permute the semantics of dims from NCHW to NHWC so that the input // tensor is now contiguous - outputParam.tensorSize[0] = at::native::size(out, 0); + outputParam.tensorSize[0] = out.size(0); outputParam.tensorStride[0] = out.stride(0); for (int i = 1; i < nDims - 1; ++i) { - outputParam.tensorSize[i] = at::native::size(out, i + 1); + outputParam.tensorSize[i] = out.size(i + 1); outputParam.tensorStride[i] = out.stride(i + 1); } - outputParam.tensorSize[nDims - 1] = at::native::size(out, 1); + outputParam.tensorSize[nDims - 1] = out.size(1); outputParam.tensorStride[nDims - 1] = out.stride(1); } else { TORCH_CHECK(false, "unsupported memory format"); @@ -335,16 +183,16 @@ void parallel_cat(Tensor &out, const TensorList &inputs, int64_t dimension, int64_t dimSize = 0; // There is a legacy case where a 1-D empty tensor can be concat with // high-dimensional tensor - if (inputs[i+batchCounter].numel() > 0) { - dimSize = at::native::size(inputs[i+batchCounter], dimension); + if (inputs[i+batchCounter].get().numel() > 0) { + dimSize = inputs[i+batchCounter].get().size(dimension); } - catMetaData.input[batchCounter] = inputs[i+batchCounter].data_ptr(); + catMetaData.input[batchCounter] = inputs[i+batchCounter].get().data_ptr(); catMetaData.offset[batchCounter] = offset; catMetaData.dimSize[batchCounter] = dimSize; - catMetaData.nElements[batchCounter] = inputs[i+batchCounter].numel(); + catMetaData.nElements[batchCounter] = inputs[i+batchCounter].get().numel(); if (stride_size > 1) { - auto strides = inputs[i+batchCounter].strides(); - auto sizes = inputs[i+batchCounter].sizes(); + auto strides = inputs[i+batchCounter].get().strides(); + auto sizes = inputs[i+batchCounter].get().sizes(); for(int j = 0; j < nDims; j++){ catMetaData.tensorStride[batchCounter].tensorSize[j] = sizes[j]; catMetaData.tensorStride[batchCounter].tensorStride[j] = strides[j]; @@ -403,125 +251,20 @@ void parallel_cat(Tensor &out, const TensorList &inputs, int64_t 
dimension, } } // namespace -Tensor cat_cuda(TensorList inputs, int64_t dimension) { - ScalarType high_type = result_type(inputs); - Tensor out = at::empty({0}, inputs.front().options().dtype(high_type)); - at::native::cat_out_cuda(inputs, dimension, out); - return out; -} - -inline c10::MemoryFormat compute_output_memory_format(const TensorList &inputs) { - c10::optional format = c10::nullopt; - for (auto &t : inputs) { - auto f = t.suggest_memory_format(); - if (!format.has_value()) { - format = f; - continue; - } - if (format.value() == f) { - continue; - } - bool contiguous = (format.value() == c10::MemoryFormat::Contiguous || f == c10::MemoryFormat::Contiguous || format.value() != f); - if (contiguous) { - return c10::MemoryFormat::Contiguous; - } - } - return format.value(); -} - -Tensor& cat_out_cuda(TensorList inputs, int64_t dimension, Tensor& out) { - check_cat_no_zero_dim(inputs); - dimension = legacy_cat_wrap_dim(dimension, inputs); - - // previously, size [0] tensors were the only possible empty tensors; thus, it - // wasn't possible to cat empty tensors unless all the other tensors were - // 1-dimensional, so we allowed these tensors to be "skipped". We maintain - // this behavior for backwards compatibility, but only for this specific size - // (i.e. other empty sizes are not skipped). - // FIXME: warn if this is the case - auto should_skip = [](const Tensor &t) { - return t.dim() == 1 && at::native::size(t, 0) == 0; - }; - - const Tensor *notSkippedTensor = NULL; // non-owning reference - int nDims = 0; - - // Check for type promotion - TORCH_CHECK(canCast(result_type(inputs), out.scalar_type()), "torch.cat(): input types ", - " can't be cast to the desired output type ", - out.scalar_type()); - - // Inputs cannot alias the output tensor - for (int i = 0; i < inputs.size(); i++) { - auto lap = at::get_overlap_status(out, inputs[i]); - TORCH_CHECK(lap != at::MemOverlapStatus::PARTIAL && - lap != at::MemOverlapStatus::FULL, - "torch.cat(): unsupported operation: the input tensors cannot refer to any " - "of the output memory locations. Found overlap in input " - "tensor ", i); - } - at::assert_no_internal_overlap(out); - - for (int i = 0; i < inputs.size(); i++) { - if (should_skip(inputs[i])) { - continue; - } - nDims = inputs[i].dim(); - notSkippedTensor = &inputs[i]; - break; +TORCH_IMPL_FUNC(cat_out_cuda) +(ITensorListRef tensors, + int64_t dim, + int64_t valid, + bool all_contiguous, + bool all_same_dtype, + bool all_same_sizes_and_stride, + MemoryFormat memory_format, + const Tensor& result) { + if (result.numel() == 0) { + return; } - // If all inputs are empty tensors, return an empty tensor - if (notSkippedTensor == NULL) { - return out; - } - - TORCH_CHECK(inputs.size() > 0, "torch.cat(): invalid number of inputs ", inputs.size()); - TORCH_CHECK(dimension >= 0, "torch.cat(): invalid dimension ", dimension); - - for (const Tensor& t: inputs) { - TORCH_CHECK(t.device() == notSkippedTensor->device(), - "torch.cat(): all input tensors must be on the same device. 
Received ", - t.device(), " and ", notSkippedTensor->device()); - } - - TORCH_CHECK( - out.device() == notSkippedTensor->device(), - "torch.cat(): all input tensors and out must be on the same device, but inputs are on ", - notSkippedTensor->device(), " and out is on ", out.device()); - - c10::MemoryFormat memory_format = compute_output_memory_format(inputs); - - std::vector size(notSkippedTensor->sizes().vec()); - - // Compute size of the result in the cat dimension - int64_t cat_dim_size = 0; - for (int i = 0; i < inputs.size(); i++) { - const Tensor &tensor = inputs[i]; - if (should_skip(tensor)) { - continue; - } - check_cat_shape_except_dim(*notSkippedTensor, tensor, dimension, i); - cat_dim_size += at::native::size(tensor, dimension); - } - - // Compute the size of the result - size[dimension] = cat_dim_size; - - // skip resizing if size of result is same as expected - // raise a warning while resizing if output has one or more elements - // See https://github.com/pytorch/pytorch/pull/62560#discussion_r687363362 - // for understanding why at::native::resize_output is not called directly. - // if (at::native::resize_output_check(out, size)) { - // TODO: restore the above, see https://github.com/pytorch/pytorch/issues/64709 - - if (out.sizes() != size) { - out.resize_(size, memory_format); - } - - if (out.numel() == 0) { - return out; - } + auto materialized = tensors.materialize(); // We parallelize the copy if all 6 conditions pass: // @@ -531,76 +274,51 @@ Tensor& cat_out_cuda(TensorList inputs, int64_t dimension, Tensor& out) { // 4. All input tensors are contiguous (output tensor may be non-contig) // 5. All input tensors can use 32-bit indexing - const bool all32BitIndexable = std::all_of(inputs.begin(), inputs.end(), + const bool all32BitIndexable = std::all_of(materialized.begin(), materialized.end(), [] (const Tensor& t) { return at::cuda::detail::canUse32BitIndexMath(t); }); - const bool allContiguous = std::all_of(inputs.begin(), inputs.end(), - [=](const Tensor& t) { - return !t.defined() || t.is_contiguous(memory_format); - }); - ScalarType firstType = inputs[0].scalar_type(); - bool allSameType = std::all_of(inputs.begin(), inputs.end(), - [firstType](const Tensor& t) { - return t.scalar_type() == firstType; - }); - allSameType = allSameType && (out.scalar_type() == firstType); -#if defined(USE_ROCM) - if (inputs.size() > 1 && - out.dim() <= CAT_ARRAY_MAX_INPUT_DIMS && - at::cuda::detail::canUse32BitIndexMath(out) && - allContiguous && - all32BitIndexable && - allSameType) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( - at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, - out.scalar_type(), "cat_cuda", [&]() { - hip_parallel_cat(out, inputs, dimension, nDims, memory_format); - }); -#else + int nDims = materialized[valid].get().dim(); + // We support the contiguous inputs and non-contiguous input (<=4 dims) in different ways // For contiguous input, we don't need to pass stride meta data to cuda kernel through constant // memory. Therefore, we could pass more inputs to cuda threads. // For non-contiguous, we reduce the number of inputs passed to cuda kernel due to the limitation // of constant memory. 
- if (inputs.size() > 1 && - out.dim() <= CAT_ARRAY_MAX_INPUT_DIMS && - at::cuda::detail::canUse32BitIndexMath(out) && - allContiguous && + if (materialized.size() > 1 && + result.dim() <= CAT_ARRAY_MAX_INPUT_DIMS && + at::cuda::detail::canUse32BitIndexMath(result) && + all_contiguous && all32BitIndexable && - allSameType) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( - at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, - out.scalar_type(), "cat_cuda", [&]() { - parallel_cat(out, inputs, dimension, nDims, memory_format); + all_same_dtype) { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( + kComplexHalf, kHalf, kBool, kBFloat16, + result.scalar_type(), "cat_cuda", [&]() { + parallel_cat(result, materialized, dim, nDims, memory_format); }); - } else if (inputs.size() > 1 && - out.dim() <= CAT_ARRAY_MAX_INPUT_DIMS && - at::cuda::detail::canUse32BitIndexMath(out) && + } else if (materialized.size() > 1 && + result.dim() <= CAT_ARRAY_MAX_INPUT_DIMS && + at::cuda::detail::canUse32BitIndexMath(result) && nDims <= CAT_ARRAY_MAX_INPUT_DIMS && all32BitIndexable && - allSameType && + all_same_dtype && memory_format == c10::MemoryFormat::Contiguous) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( - at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, - out.scalar_type(), "cat_cuda", [&]() { - parallel_cat(out, inputs, dimension, nDims, memory_format); + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( + kComplexHalf, kHalf, kBool, kBFloat16, + result.scalar_type(), "cat_cuda", [&]() { + parallel_cat(result, materialized, dim, nDims, memory_format); }); -#endif } else { int64_t offset = 0; - for (int j = 0; j < inputs.size(); j++) - { - if (should_skip(inputs[j])) continue; - int64_t dimSize = at::native::size(inputs[j], dimension); - Tensor nt = at::narrow(out, dimension, offset, dimSize); - copy_(nt, inputs[j]); + for (const Tensor& t : materialized) { + if (cat_should_skip_tensor(t)) continue; + int64_t dimSize = t.size(dim); + Tensor nt = at::narrow(result, dim, offset, dimSize); + copy_(nt, t); offset += dimSize; } } - - return out; } } // namespace native diff --git a/aten/src/ATen/native/cuda/SoftMax.cu b/aten/src/ATen/native/cuda/SoftMax.cu index 181fbb994c3f..b16dad4b9156 100644 --- a/aten/src/ATen/native/cuda/SoftMax.cu +++ b/aten/src/ATen/native/cuda/SoftMax.cu @@ -1,7 +1,9 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include +#include #include -#include +#include #include #include @@ -13,6 +15,19 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#endif + namespace at { namespace native { @@ -153,7 +168,7 @@ inline dim3 SoftMax_getBlockSize(int ILP, uint64_t dim_size) { while (block_size < (max_block_size)) block_size *= 2; // Launch at least a single warp - the kernel assumes that. 
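// Illustrative sketch (not part of this patch): replacing the compile-time C10_WARP_SIZE with
// at::cuda::warp_size() matters on ROCm, where the wavefront width is 64 rather than 32 and is
// only reliably known at run time. Querying it through the plain runtime API, with hypothetical
// helpers (runtime_warp_size, block_size_for; device 0 as the default):

#include <cuda_runtime.h>
#include <algorithm>

int runtime_warp_size(int device = 0) {
  int warp = 32;                                        // sensible fallback
  cudaDeviceGetAttribute(&warp, cudaDevAttrWarpSize, device);
  return warp;
}

// e.g. round a requested block size up to at least one full warp, as the softmax code does:
int block_size_for(int wanted, int device = 0) {
  return std::max(wanted, runtime_warp_size(device));
}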
- block_size = std::max(block_size, static_cast(C10_WARP_SIZE)); + block_size = std::max(block_size, static_cast(at::cuda::warp_size())); return dim3(block_size); } @@ -816,7 +831,7 @@ void host_softmax_backward(const Tensor &grad_, const Tensor &output_, int64_t d int64_t remaining = outer_size; int64_t chunk_size = (1<<30) / dim_size; while(remaining > 0) { - dispatch_softmax_backward( + dispatch_softmax_backward( gI_ptr, grad_ptr, output_ptr, dim_size, dim_size, std::min(remaining, chunk_size)); gI_ptr += chunk_size * dim_size; grad_ptr += chunk_size * dim_size; @@ -840,7 +855,7 @@ void host_softmax_backward(const Tensor &grad_, const Tensor &output_, int64_t d int64_t remaining = outer_size; int64_t chunk_size = (1<<30) / dim_size; while(remaining > 0) { - dispatch_softmax_backward( + dispatch_softmax_backward( gI_ptr, grad_ptr, output_ptr, dim_size, dim_size, std::min(remaining, chunk_size)); gI_ptr += chunk_size * dim_size; grad_ptr += chunk_size * dim_size; @@ -914,7 +929,7 @@ TORCH_IMPL_FUNC(log_softmax_backward_cuda_out) ( input_dtype == ScalarType::Half), "expected input and grad types to match, or input to be at::Half and grad to be at::Float"); } - host_softmax_backward(grad, output, dim, half_to_float, grad_input); + host_softmax_backward(grad, output, dim, half_to_float, grad_input); } TORCH_IMPL_FUNC(softmax_cuda_out) ( @@ -939,34 +954,52 @@ TORCH_IMPL_FUNC(softmax_backward_cuda_out) "expected input and grad types to match, or input to be at::Half and grad to be at::Float"); } Tensor tmp = grad * output; - host_softmax_backward(tmp, output, dim, half_to_float, grad_input); + host_softmax_backward(tmp, output, dim, half_to_float, grad_input); } -Tensor masked_softmax_cuda(const Tensor& input, const Tensor& mask) { - TORCH_CHECK(mask.scalar_type() == ScalarType::Bool, "Mask should be a boolean tensor"); - bool is_transformer_mask = (input.dim() == 4 && mask.dim() == 2 && input.size(0) == mask.size(0) && input.size(2) == mask.size(1) && input.size(3) == mask.size(1)); - TORCH_CHECK(mask.sizes() == input.sizes() || is_transformer_mask, "Mask shape should match input"); - // Always do masked softmax on last dim - int softmax_elements = input.size(input.dim() - 1); - // Persistent softmax only support softmax_elements <= 1024, - // Therefore once softmax_elements > 1024, we need to fallback to vanilla masked_softmax - Tensor output = at::empty_like(input, input.options()); - // Fallback to a slower masked softmax solution - if (softmax_elements > 1024 || softmax_elements * input.element_size() > 4096 || !mask.is_contiguous()) { - AT_DISPATCH_FLOATING_TYPES_AND2( - ScalarType::Half, - ScalarType::BFloat16, - input.scalar_type(), - "masked_softmax", - [&] { - Tensor mask_not = mask.logical_not(); - output = at::softmax(input.masked_fill(mask_not, -std::numeric_limits::infinity()), -1); - }); - return output; - } - int batch_count = input.numel() / softmax_elements; - int chunk_size = input.numel() / input.size(0); - if (is_transformer_mask) { +Tensor masked_softmax_cuda(const Tensor& input_, const Tensor& mask_, const c10::optional dim_) { + Tensor output = at::empty_like(input_, input_.options()); + TORCH_CHECK(mask_.scalar_type() == ScalarType::Bool, "Mask should be a boolean tensor"); + + // If input is [B, H, T, T] and mask is [B, T] + // we have special fast kernel + bool is_BxT_mask = (input_.dim() == 4 && mask_.dim() == 2 && input_.size(0) == mask_.size(0) && input_.size(2) == mask_.size(1) && input_.size(3) == mask_.size(1)); + + // If input is [B, H, T, T] and mask is [T, 
T] + // expand mask to [B, H, T, T] and treat it like regular mask + // TODO We should have special fast kernel for TxT mask as well + bool is_TxT_mask = input_.dim() == 4 && mask_.dim() == 2 && input_.size(3) == mask_.size(1) && input_.size(2) == mask_.size(0) && mask_.size(0) == mask_.size(1); + TORCH_CHECK(mask_.sizes() == input_.sizes() || is_BxT_mask || is_TxT_mask, "Mask shape should match input"); + + auto input = input_.dim() == 0 ? input_.view(1) : input_; + auto mask = mask_.dim() == 0 ? mask_.view(1) : mask_; + if (is_TxT_mask) { + mask = mask.expand(input.sizes()); + } + int64_t dim = dim_.has_value() ? dim_.value() : input.dim() - 1; + + int softmax_elements = input.size(dim); + // Persistent softmax is only supported when all of the conditions are held: + // 1) softmax_elements <= 1024 + // 2) softmax_elements * input.element_size() <= 4096 + // 3) mask.is_contiguous() + // 4) dim == input.dim() - 1 + // Otherwise, we fallback to vanilla softmax (where we do not support transformer_mask since converting the mask is expensive) + if (softmax_elements > 1024 || softmax_elements * input.element_size() > 4096 || !mask.is_contiguous() || dim < input.dim()-1) { + TORCH_CHECK(mask.sizes() == input.sizes(), "Mask shape should match input shape; transformer_mask is not supported in the fallback case."); + AT_DISPATCH_FLOATING_TYPES_AND2( + ScalarType::Half, + ScalarType::BFloat16, + input.scalar_type(), + "masked_softmax", + [&] { + output = at::softmax(input.masked_fill(mask, -std::numeric_limits::infinity()), dim); + }); + return output; + } + int batch_count = input.numel() / softmax_elements; + int chunk_size = input.numel() / input.size(0); + if (is_BxT_mask) { // Only support when num_heads is even in transformer TORCH_CHECK(input.size(1) % 2 == 0, "Only support when num_heads is even in transformer"); AT_DISPATCH_FLOATING_TYPES_AND2( @@ -988,7 +1021,7 @@ Tensor masked_softmax_cuda(const Tensor& input, const Tensor& mask) { ); }); - } else { + } else { AT_DISPATCH_FLOATING_TYPES_AND2( ScalarType::Half, ScalarType::BFloat16, @@ -1005,8 +1038,71 @@ Tensor masked_softmax_cuda(const Tensor& input, const Tensor& mask) { mask.data_ptr() ); }); - } - return output; + } + return output; +} + +Tensor masked_softmax_backward_cuda( + const Tensor& grad_, + const Tensor& output_, + const Tensor& mask_, + const c10::optional dim_) { + Tensor grad_input = at::empty_like(grad_, grad_.options()); + if (grad_.numel() == 0) { + return grad_input; + } + + auto grad = grad_.contiguous(); + auto output = output_.contiguous(); + auto mask = mask_.contiguous(); + int64_t dim = dim_.has_value() ? dim_.value() : output.dim() - 1; + + grad = grad.dim() == 0 ? grad.view(1) : grad; + mask = mask.dim() == 0 ? mask.view(1) : mask; + output = output.dim() == 0 ? 
output.view(1) : output; + + TORCH_CHECK(dim >=0 && dim < grad.dim(), "dim must be non-negative and less than input dimensions"); + TORCH_CHECK(grad.sizes() == mask.sizes(), "Mask shape should match grad shape"); + TORCH_CHECK(mask.scalar_type() == ScalarType::Bool, "Mask should be a boolean tensor"); + + int softmax_elements = output.size(dim); + int64_t batch_count = grad.numel() / softmax_elements; + + if (softmax_elements > 1024 || softmax_elements * grad.element_size() > 4096 || dim < grad.dim()-1) { + AT_DISPATCH_FLOATING_TYPES_AND2( + ScalarType::Half, + ScalarType::BFloat16, + grad_input.scalar_type(), + "masked_softmax_backward", + [&] { + grad_input = at::_softmax_backward_data( + grad, + output.masked_fill(mask, 0), + dim, + grad.scalar_type() + ); + }); + } else { + grad = grad * output; + AT_DISPATCH_FLOATING_TYPES_AND2( + ScalarType::Half, + ScalarType::BFloat16, + grad_input.scalar_type(), + "masked_softmax_backward", + [&] { + using accscalar_t = acc_type; + dispatch_softmax_backward( + grad_input.data_ptr(), // gI_ptr + grad.data_ptr(), // grad_ptr + output.data_ptr(), // output_ptr + softmax_elements, // softmax_elements + softmax_elements, // softmax_elements_stride + batch_count, // batch_count + mask.data_ptr() /* not masked */ + ); + }); + } + return grad_input; } } } diff --git a/aten/src/ATen/native/cuda/Sort.cpp b/aten/src/ATen/native/cuda/Sort.cpp index 8bb7d93bfdb5..efef65f9f2e1 100644 --- a/aten/src/ATen/native/cuda/Sort.cpp +++ b/aten/src/ATen/native/cuda/Sort.cpp @@ -1,11 +1,24 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#include #include -#include -#include #include +#include #include +#include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#endif + #include namespace at { namespace native { @@ -26,7 +39,7 @@ bool should_use_small_sort(const TensorBase &self, int64_t dim) { std::vector infer_dense_strides_dim_last(const Tensor & self, int64_t dim); -void fillSliceWithIndex(Tensor& t,int dim) { +void fillSliceWithIndex(const Tensor& t, int dim) { if (t.numel()) { auto sizes = DimVector(t.dim(), 1); sizes[dim] = t.sizes()[dim]; @@ -39,18 +52,28 @@ void fillSliceWithIndex(Tensor& t,int dim) { // We perform a segmented sort in cub with inputs that have // more than 1024/2048 elements along the selected dimension. // Otherwise, we do an inplace bitonic sort (see sortKeyValueInplace). -std::tuple sort_out_stable_cuda(const Tensor & self, c10::optional stable, int64_t dim, bool descending, Tensor & values, Tensor & indices) { +void sort_cuda_kernel( + const TensorBase& self_base, + const TensorBase& values_base, + const TensorBase& indices_base, + int64_t dim, + bool descending, + bool stable) { // this algorithm is always stable - TORCH_INTERNAL_ASSERT(stable.has_value(), "sort_out(): c10::optional for stable has to have value."); - TensorArg self_arg{self, "self", 1}, values_arg{values, "values", 2}, indices_arg{indices, "indices", 3}; - checkAllSameGPU(__func__, {self_arg, values_arg, indices_arg}); - bool is_non_overlapping_and_dense = self.is_non_overlapping_and_dense(); - int64_t ndim = self.dim(); - dim = maybe_wrap_dim(dim, ndim); - int64_t nsort = self.sizes()[dim]; + // Macro for converting `TensorBase` -> `Tensor` without + // reference count bumps. +#define TOTENSOR(BASE, VAR) \ + OptionalTensorRef opt_##BASE(BASE); \ + const Tensor& VAR = *opt_##BASE; + + // Converting TensorBase into Tensor. + // We will need Tensor's methods from this point onwards. 
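// Illustrative sketch (not part of this patch): the fallback branch of
// masked_softmax_backward_cuda above is the usual softmax backward formula
//   gI[i] = y[i] * (dy[i] - sum_j dy[j] * y[j])
// evaluated with the masked positions of y zeroed out (output.masked_fill(mask, 0)).
// A single-row CPU reference with a hypothetical name (masked_softmax_backward_row):

#include <cstddef>
#include <vector>

std::vector<float> masked_softmax_backward_row(const std::vector<float>& dy,
                                               const std::vector<float>& y,
                                               const std::vector<bool>& mask) {
  std::vector<float> yz(y.size()), gI(y.size());
  for (size_t i = 0; i < y.size(); ++i) yz[i] = mask[i] ? 0.f : y[i];  // masked_fill(mask, 0)
  float dot = 0.f;
  for (size_t i = 0; i < y.size(); ++i) dot += dy[i] * yz[i];          // sum_j dy[j] * y[j]
  for (size_t i = 0; i < y.size(); ++i) gI[i] = yz[i] * (dy[i] - dot);
  return gI;
}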
+ TOTENSOR(self_base, self); + TOTENSOR(values_base, values); + TOTENSOR(indices_base, indices); - TORCH_CHECK(nsort <= std::numeric_limits::max(), + TORCH_CHECK(self.sizes()[dim] <= std::numeric_limits::max(), "The dimension being sorted can not have more than INT_MAX elements."); const auto self_dtype = self.dtype(); @@ -60,37 +83,9 @@ std::tuple sort_out_stable_cuda(const Tensor & self, c10::opt TORCH_CHECK(self_dtype != ScalarType::ComplexFloat && self_dtype != ScalarType::ComplexDouble, "Sort currently does not support complex dtypes on CUDA."); - if (ndim == 0) { - if (!values.defined()) { - values = self.clone(); - } else { - values.resize_as_(self); - values.copy_(self); - } - if (!indices.defined()) { - indices = at::zeros({}, self.options().dtype(kLong)); - } else { - indices.resize_as_(self); - indices.zero_(); - } - return std::forward_as_tuple(values, indices); - } - // use inplace algorithm for smaller input sizes without stable=True - if (should_use_small_sort(self, dim) && !stable.value()) { + if (should_use_small_sort(self, dim) && !stable) { // from thc: sorted->values, indices->indices, input->self - - if (!values.defined()) { - values = at::empty_like(self); - } - if (!indices.defined()) { - indices = at::empty_like(self, self.options().dtype(kLong)); - } - - // Make sure sufficient output space is allocated - auto self_size = self.sizes(); - at::native::resize_output(values, self_size); - at::native::resize_output(indices, self_size); fillSliceWithIndex(indices, dim); // We sort k/v pairs in-place; copy unsorted input to output @@ -99,12 +94,12 @@ std::tuple sort_out_stable_cuda(const Tensor & self, c10::opt // Sort using our in-place k/v kernel that supports arbitrary // layout sortKeyValueInplace(values, indices, dim, descending); - return std::forward_as_tuple(values, indices); + return; } Tensor self_; bool newself = false; - if (is_non_overlapping_and_dense && self.stride(dim) == 1) { + if (self.is_non_overlapping_and_dense() && self.stride(dim) == 1) { self_ = self; } else { auto new_strides_unsort = infer_dense_strides_dim_last(self, dim); @@ -114,19 +109,6 @@ std::tuple sort_out_stable_cuda(const Tensor & self, c10::opt } c10::MaybeOwned values_tmp, indices_tmp; - if (!values.defined()) { - if (is_non_overlapping_and_dense) { - values = at::empty_strided(self.sizes(), self.strides(), self.options()); - } else { - auto strides = at::infer_dense_strides(self.sizes(), self.strides()); - values = at::empty_strided(self.sizes(), strides, self.options()); - } - } else { - TORCH_CHECK(self_.scalar_type() == values.scalar_type(), - "Unexpected dtype for values, expect ", self_.scalar_type(), ", got ", values.scalar_type()); - values.resize_as_(self); - } - if (values.strides() == self_.strides() && (newself || get_overlap_status(self, values) == MemOverlapStatus::NO)) { values_tmp = c10::MaybeOwned::borrowed(values); } else { @@ -134,18 +116,6 @@ std::tuple sort_out_stable_cuda(const Tensor & self, c10::opt at::empty_strided(self_.sizes(), self_.strides(), self_.options())); } - if (!indices.defined()) { - if (is_non_overlapping_and_dense) { - indices = at::empty_strided(self.sizes(), self.strides(), self.options().dtype(kLong)); - } else { - auto strides = at::infer_dense_strides(self.sizes(), self.strides()); - indices = at::empty_strided(self.sizes(), strides, self.options().dtype(kLong)); - } - } else { - TORCH_CHECK(kLong == indices.scalar_type(), - "Unexpected dtype for values, expect torch.long, got ", indices.scalar_type()); - indices.resize_as_(self); - } if 
(indices.strides() != self_.strides()) { indices_tmp = c10::MaybeOwned::owned( at::empty_strided(self_.sizes(), self_.strides(), self_.options().dtype(kLong))); @@ -161,20 +131,11 @@ std::tuple sort_out_stable_cuda(const Tensor & self, c10::opt if (!indices_tmp->is_same(indices)) { indices.copy_(*indices_tmp); } - return std::forward_as_tuple(values, indices); } -std::tuple sort_out_cuda(const Tensor & self, int64_t dim, bool descending, Tensor & values, Tensor & indices) { - return sort_out_stable_cuda(self, /*stable=*/false, dim, descending, values, indices); -} - -std::tuple sort_stable_cuda(const Tensor & self, c10::optional stable, int64_t dim, bool descending) { - Tensor values, indices; - return sort_out_stable_cuda(self, stable, dim, descending, values, indices); -} - -std::tuple sort_cuda(const Tensor & self, int64_t dim, bool descending) { - return sort_stable_cuda(self, /*stable=*/false, dim, descending); -} +// TODO: we should handle this accordingly when we start using REGISTER_HIP_DISPATCH, +// since REGISTER_DISPATCH won't work in this cpp file. +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +REGISTER_CUDA_DISPATCH(sort_stub, &sort_cuda_kernel); }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/Sort.cu b/aten/src/ATen/native/cuda/Sort.cu index 15c89f7b76e2..5c08ddf59782 100644 --- a/aten/src/ATen/native/cuda/Sort.cu +++ b/aten/src/ATen/native/cuda/Sort.cu @@ -11,6 +11,7 @@ #include #include +#include namespace at { namespace native { @@ -231,6 +232,7 @@ __global__ void sort_postprocess_kernel(const scalar_t *in, scalar_t *out, int64 } +C10_LAUNCH_BOUNDS_1(at::cuda::detail::CUDA_NUM_THREADS) __global__ void fill_index_and_segment_kernel( int2 *data, int numel, at::cuda::detail::IntDivider nsort_divider) { CUDA_KERNEL_LOOP(idx, numel) { @@ -241,6 +243,7 @@ __global__ void fill_index_and_segment_kernel( } } +C10_LAUNCH_BOUNDS_1(at::cuda::detail::CUDA_NUM_THREADS) __global__ void fill_reverse_indices_kernel( int64_t *data, int numel, at::cuda::detail::IntDivider nsort_divider) { CUDA_KERNEL_LOOP(idx, numel) { @@ -248,6 +251,31 @@ __global__ void fill_reverse_indices_kernel( } } +template +inline void segmented_sort_large_segments( + const int64_t nsegments, const int64_t nsort, const int64_t n, const bool descending, + const scalar_t * self_ptr, scalar_t * values_ptr, int64_t * indices_ptr + ) { + using namespace at::cuda::detail; + auto allocator = at::cuda::getCUDADeviceAllocator(); + auto stream = at::cuda::getCurrentCUDAStream(); + dim3 block = CUDA_NUM_THREADS; + dim3 grid = GET_BLOCKS(nsort); + c10::DeviceArray indices(*allocator, nsort); + at::cuda::detail::IntDivider nsort_divider(nsort); + fill_reverse_indices_kernel<<>>( + indices.get(), nsort, nsort_divider); + const int64_t *initial_indices = indices.get(); + + for (auto i: c10::irange(nsegments)){ + at::cuda::cub::radix_sort_pairs( + self_ptr, values_ptr, initial_indices, indices_ptr, + nsort, descending); + indices_ptr += nsort; + self_ptr += nsort; + values_ptr += nsort; + } +} template inline void segmented_sort_pairs_by_full_sort( @@ -325,14 +353,14 @@ void launch_stable_sort_kernel( TORCH_CHECK(nbatch > 0, "Cannot sort dimension of length ", nsort); int64_t *indices_ptr = indices.data_ptr(); -#if defined(USE_ROCM) - constexpr bool is_rocm = true; +#if (defined(USE_ROCM) && ROCM_VERSION < 40500) + constexpr bool is_rocm_bf16_sort_unsupported = true; #else - constexpr bool is_rocm = false; + constexpr bool is_rocm_bf16_sort_unsupported = false; #endif 
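// Illustrative sketch (not part of this patch): segmented_sort_large_segments above runs one
// full radix sort per segment, on the theory that a single huge segment already occupies the
// whole GPU. The at::cuda::cub::radix_sort_pairs call is essentially a wrapper over
// cub::DeviceRadixSort; a standalone equivalent for one float-key / int64 index segment, with
// a hypothetical name (sort_one_segment) and simplified workspace handling:

#include <cub/cub.cuh>
#include <cuda_runtime.h>
#include <cstdint>

void sort_one_segment(const float* keys_in, float* keys_out,
                      const int64_t* vals_in, int64_t* vals_out,
                      int num_items, cudaStream_t stream) {
  void* tmp = nullptr;
  size_t tmp_bytes = 0;
  // First call with tmp == nullptr only queries the required workspace size.
  cub::DeviceRadixSort::SortPairs(tmp, tmp_bytes, keys_in, keys_out,
                                  vals_in, vals_out, num_items,
                                  /*begin_bit=*/0, /*end_bit=*/int(sizeof(float) * 8), stream);
  cudaMalloc(&tmp, tmp_bytes);
  cub::DeviceRadixSort::SortPairs(tmp, tmp_bytes, keys_in, keys_out,
                                  vals_in, vals_out, num_items,
                                  /*begin_bit=*/0, /*end_bit=*/int(sizeof(float) * 8), stream);
  cudaFree(tmp);
  // Descending order goes through cub::DeviceRadixSort::SortPairsDescending instead.
}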
AT_DISPATCH_ALL_TYPES_AND3(kBool, kHalf, kBFloat16, self.scalar_type(), "sort", [&]{ - c10::guts::if_constexpr::value)>([&](auto _){ + c10::guts::if_constexpr::value)>([&](auto _){ const scalar_t *self_ptr = self.data_ptr(); scalar_t *values_ptr = values.data_ptr(); int64_t remaining = _(numel); @@ -340,7 +368,11 @@ void launch_stable_sort_kernel( int64_t n = std::min(remaining, nbatch); int64_t nsegments = n / nsort; - if (nsegments < 128) { + if (nsegments == 1 || nsort >= 1000000) { //rough heuristics where even a single sort occupies GPU + segmented_sort_large_segments( + nsegments, nsort, n, descending, + self_ptr, values_ptr, indices_ptr); + } else if (nsegments < 128) { segmented_sort_pairs_by_full_sort(nsegments, nsort, n, descending, self_ptr, values_ptr, indices_ptr); } else { @@ -353,7 +385,7 @@ void launch_stable_sort_kernel( values_ptr += n; indices_ptr += n; } - }, [&](auto _){ TORCH_CHECK(_(false), "BFloat16 is not supported on ROCm"); }); + }, [&](auto _){ TORCH_CHECK(_(false), "BFloat16 is not supported on ROCm < 4.5"); }); }); } diff --git a/aten/src/ATen/native/cuda/SortImpl.cu b/aten/src/ATen/native/cuda/SortImpl.cu index a806c4a13874..c6e29262046e 100644 --- a/aten/src/ATen/native/cuda/SortImpl.cu +++ b/aten/src/ATen/native/cuda/SortImpl.cu @@ -1,4 +1,6 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/Sorting.cpp b/aten/src/ATen/native/cuda/Sorting.cpp index fc526497812d..97b8df55416e 100644 --- a/aten/src/ATen/native/cuda/Sorting.cpp +++ b/aten/src/ATen/native/cuda/Sorting.cpp @@ -1,12 +1,27 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include -#include +#include +#include +#include +#include #include +#include #include #include + #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#endif namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/Sorting.cu b/aten/src/ATen/native/cuda/Sorting.cu index d72788c1b97c..52fa2710596d 100644 --- a/aten/src/ATen/native/cuda/Sorting.cu +++ b/aten/src/ATen/native/cuda/Sorting.cu @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -189,7 +190,7 @@ struct KthValueLauncher { } dim3 block(std::min( - round_up(slice_size, (int64_t)C10_WARP_SIZE), (int64_t)1024)); + round_up(slice_size, (int64_t)at::cuda::warp_size()), (int64_t)1024)); auto stream = at::cuda::getCurrentCUDAStream(); gatherKthValue<<>>( self_info, @@ -228,7 +229,7 @@ struct MedianLauncher { } dim3 block(std::min( - round_up(slice_size, (int64_t)C10_WARP_SIZE), (int64_t)1024)); + round_up(slice_size, (int64_t)at::cuda::warp_size()), (int64_t)1024)); auto stream = at::cuda::getCurrentCUDAStream(); gatherMedian<<>>( values_info, diff --git a/aten/src/ATen/native/cuda/SparseMM.cu b/aten/src/ATen/native/cuda/SparseMM.cu index 0cc3fe3806a0..922efa5f4fcb 100644 --- a/aten/src/ATen/native/cuda/SparseMM.cu +++ b/aten/src/ATen/native/cuda/SparseMM.cu @@ -1,7 +1,13 @@ -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + namespace at { namespace native { // sparse, sparse, sparse, dense, real, real -> sparse Tensor& _sspaddmm_out_only_sparse_cuda(const Tensor& self, diff --git a/aten/src/ATen/native/cuda/SpectralOps.cpp b/aten/src/ATen/native/cuda/SpectralOps.cpp index 95fef7d09150..b418e8ffc8ab 100644 --- 
a/aten/src/ATen/native/cuda/SpectralOps.cpp +++ b/aten/src/ATen/native/cuda/SpectralOps.cpp @@ -1,19 +1,28 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include #include -#include -#include -#include -#include +#include +#include #include #include -#include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#endif + #include #include @@ -248,7 +257,7 @@ static const Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_ out.resize_(batched_out_sizes, MemoryFormat::Contiguous); // Create the transform plan (either from cache or locally) - const auto value_type = c10::toValueType(input.scalar_type()); + const auto value_type = c10::toRealValueType(input.scalar_type()); auto fft_type = GetCuFFTTransformType(input.is_complex(), out.is_complex()); CuFFTParams Params(input.strides(), out.strides(), signal_size, fft_type, value_type); CuFFTParamsLRUCache& plan_cache = cufft_get_plan_cache(input.device().index()); @@ -445,7 +454,7 @@ Tensor _fft_c2r_cufft(const Tensor& self, IntArrayRef dim, int64_t normalization DimVector out_sizes(in_sizes.begin(), in_sizes.end()); out_sizes[dim.back()] = lastdim; - auto output = at::empty(out_sizes, self.options().dtype(c10::toValueType(self.scalar_type()))); + auto output = at::empty(out_sizes, self.options().dtype(c10::toRealValueType(self.scalar_type()))); if (use_optimized_cufft_path(dim)) { Tensor temp; diff --git a/aten/src/ATen/native/cuda/SpectralOps.cu b/aten/src/ATen/native/cuda/SpectralOps.cu index 4a91f58e61ec..2f5c13006578 100644 --- a/aten/src/ATen/native/cuda/SpectralOps.cu +++ b/aten/src/ATen/native/cuda/SpectralOps.cu @@ -1,19 +1,11 @@ -#include +#define TORCH_ASSERT_NO_OPERATORS #include #include #include -#include -#include #include #include #include -#include -#include #include -#include -#include -#include - #include #include @@ -21,8 +13,6 @@ namespace at { namespace native { -using namespace at::native::detail; - // Offset calculator for indexing in Hermitian mirrored order. 
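// Illustrative sketch (not part of this patch): the Hermitian mirrored-order offset calculator
// around this point maps linear index i to (n - i) % n in mirrored dims, which reflects the
// symmetry of the spectrum of a real signal, X[(n - i) % n] == conj(X[i]). Filling the mirrored
// half of a 1-D spectrum from the computed half looks like this on the CPU; the CUDA kernel
// applies the same per-element mapping in parallel (hypothetical name fill_conjugate_symmetry_1d):

#include <complex>
#include <vector>

void fill_conjugate_symmetry_1d(std::vector<std::complex<float>>& X) {
  const size_t n = X.size();
  // Elements 0..n/2 are assumed already filled by an R2C transform;
  // the remaining entries are their mirrored conjugates.
  for (size_t i = n / 2 + 1; i < n; ++i) {
    X[i] = std::conj(X[(n - i) % n]);
  }
}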
// In mirrored dims, maps linear index i to (n - i) % n template @@ -116,17 +106,17 @@ void _fft_fill_with_conjugate_symmetry_cuda_( signal_half_sizes, out_strides, mirror_dims, element_size); const auto numel = c10::multiply_integers(signal_half_sizes); - AT_DISPATCH_COMPLEX_TYPES(dtype, "_fft_fill_with_conjugate_symmetry", [&] { - using namespace cuda::detail; - _fft_conjugate_copy_kernel<<< - GET_BLOCKS(numel), CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream()>>>( - numel, - static_cast(out_data), - static_cast(in_data), - input_offset_calculator, - output_offset_calculator); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - }); + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "_fft_fill_with_conjugate_symmetry", [&] { + using namespace cuda::detail; + _fft_conjugate_copy_kernel<<< + GET_BLOCKS(numel), CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream()>>>( + numel, + static_cast(out_data), + static_cast(in_data), + input_offset_calculator, + output_offset_calculator); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); } REGISTER_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_conjugate_symmetry_cuda_); diff --git a/aten/src/ATen/native/cuda/SummaryOps.cu b/aten/src/ATen/native/cuda/SummaryOps.cu index 958ad88183e8..5476682d7c4d 100644 --- a/aten/src/ATen/native/cuda/SummaryOps.cu +++ b/aten/src/ATen/native/cuda/SummaryOps.cu @@ -1,10 +1,23 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include +#include +#include #include #include -#include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + namespace at { namespace cuda { #define THRESH_NUMBER_BINS_FOR_MULTI_BLOCK_MEM 100 @@ -19,16 +32,22 @@ namespace cuda { */ enum class CUDAHistogramMemoryType { SHARED, MULTI_BLOCK, GLOBAL }; namespace { - template - __device__ static IndexType getBin(input_t bVal, input_t minvalue, input_t maxvalue, int64_t nbins) { - IndexType bin = (int)((bVal - minvalue) * nbins / (maxvalue - minvalue)); - // (only applicable for histc) - // while each bin is inclusive at the lower end and exclusive at the higher, i.e. [start, end) - // the last bin is inclusive at both, i.e. [start, end], in order to include maxvalue if exists - // therefore when bin == nbins, adjust bin to the last bin - if (bin == nbins) bin -= 1; - return bin; - } +template +__device__ static IndexType getBin( + input_t bVal, + at::acc_type minvalue, + at::acc_type maxvalue, + int64_t nbins) { + IndexType bin = (int)(((bVal - minvalue)) * nbins / (maxvalue - minvalue)); + // (only applicable for histc) + // while each bin is inclusive at the lower end and exclusive at the higher, + // i.e. [start, end) the last bin is inclusive at both, i.e. 
[start, end], in + // order to include maxvalue if exists therefore when bin == nbins, adjust bin + // to the last bin + if (bin == nbins) + bin -= 1; + return bin; +} } /* @@ -49,8 +68,8 @@ __global__ void kernelHistogram1D( detail::TensorInfo p, /* partial output */ detail::TensorInfo b, /* input */ int64_t nbins, - input_t minvalue, - input_t maxvalue, + at::acc_type minvalue, + at::acc_type maxvalue, IndexType totalElements, Op getOp) { extern __shared__ unsigned char my_smem[]; @@ -72,7 +91,8 @@ __global__ void kernelHistogram1D( const auto bVal = b.data[bOffset]; if (bVal >= minvalue && bVal <= maxvalue) { // Use value at `b` as an offset of `smem` - const IndexType bin = getBin(bVal, minvalue, maxvalue, nbins); + const IndexType bin = + getBin(bVal, minvalue, maxvalue, nbins); gpuAtomicAddNoReturn(&smem[bin], getOp(linearIndex)); } } @@ -98,7 +118,8 @@ __global__ void kernelHistogram1D( const auto bVal = b.data[bOffset]; if (bVal >= minvalue && bVal <= maxvalue) { // Use value at `b` as an offset of `p` - const IndexType bin = getBin(bVal, minvalue, maxvalue, nbins); + const IndexType bin = + getBin(bVal, minvalue, maxvalue, nbins); const IndexType pIdx = p.strides[0] * blockIdx.x + bin; const IndexType pOffset = detail::IndexToOffset::get(pIdx, p); @@ -129,7 +150,8 @@ __global__ void kernelHistogram1D( const auto bVal = b.data[bOffset]; if (bVal >= minvalue && bVal <= maxvalue) { // Use value at `b` as an offset of `a` - const IndexType bin = getBin(bVal, minvalue, maxvalue, nbins); + const IndexType bin = + getBin(bVal, minvalue, maxvalue, nbins); const IndexType aOffset = detail::IndexToOffset::get(bin, a); gpuAtomicAddNoReturn(&a.data[aOffset], getOp(linearIndex)); @@ -138,13 +160,23 @@ __global__ void kernelHistogram1D( } } -#define HANDLE_CASE(MEMORY_TYPE, WEIGHTS_OP, SHARED_MEM) \ - kernelHistogram1D \ - <<>>( \ - aInfo, pInfo, bInfo, nbins, minvalue, maxvalue, totalElements, WEIGHTS_OP); \ +#define HANDLE_CASE(MEMORY_TYPE, WEIGHTS_OP, SHARED_MEM) \ + kernelHistogram1D< \ + output_t, \ + input_t, \ + IndexType, \ + 1, \ + 2, \ + -1, \ + MEMORY_TYPE><<>>( \ + aInfo, \ + pInfo, \ + bInfo, \ + nbins, \ + minvalue, \ + maxvalue, \ + totalElements, \ + WEIGHTS_OP); \ C10_CUDA_KERNEL_LAUNCH_CHECK(); #define HANDLE_SWITCH_CASE(mType, getOp) \ @@ -193,8 +225,8 @@ bool CUDA_tensor_histogram( at::Tensor b, /* input */ at::Tensor c, /* weights(optional) */ int64_t nbins, - input_t minvalue, - input_t maxvalue, + at::acc_type minvalue, + at::acc_type maxvalue, TensorArgType aType = TensorArgType::ReadWrite, TensorArgType bType = TensorArgType::ReadOnly, TensorArgType cType = TensorArgType::ReadOnly) { @@ -299,9 +331,14 @@ Tensor _bincount_cuda_template( AT_ERROR("input and weights should have the same length"); } - const int64_t nbins = std::max(*self.max().cpu().data_ptr() + (int64_t)1, minlength); - const input_t minvalue = 0; - const input_t maxvalue = nbins; + const int64_t nbins = + std::max(self.max().item() + (int64_t)1, minlength); + + // we are using acc_type for the bounds, in particular int64_t for integers + // in order to avoid overflows (e.g. 
using 256 bins for dtype uint8) + using bounds_t = at::acc_type; + const bounds_t minvalue = 0; + const bounds_t maxvalue = nbins; // alloc output counter on GPU Tensor output; if (has_weights) { @@ -311,7 +348,7 @@ Tensor _bincount_cuda_template( weights.options().layout_opt(), weights.options().device_opt(), weights.options().pinned_memory_opt()); - auto ret = cuda::CUDA_tensor_histogram( + cuda::CUDA_tensor_histogram( output, self, weights, nbins, minvalue, maxvalue); } else { output = native::zeros( @@ -320,7 +357,7 @@ Tensor _bincount_cuda_template( c10::nullopt /* layout */, DeviceType::CUDA, c10::nullopt /* pin_memory */); - auto ret = cuda::CUDA_tensor_histogram( + cuda::CUDA_tensor_histogram( output, self, weights, nbins, minvalue, maxvalue); } return output; @@ -331,8 +368,8 @@ template Tensor _histc_cuda_template( const Tensor& self, int64_t nbins, - input_t min, - input_t max) { + at::acc_type min, + at::acc_type max) { if (nbins <= 0) { AT_ERROR("bins must be > 0"); } @@ -374,8 +411,8 @@ Tensor _histc_cuda_template( #endif TORCH_CHECK(minvalue < maxvalue, "max must be larger than min"); - auto ret = cuda::CUDA_tensor_histogram( - output, self, Tensor(), nbins, minvalue, maxvalue); + cuda::CUDA_tensor_histogram( + output, self, Tensor(), nbins, minvalue, maxvalue); return output; } } // namespace @@ -412,7 +449,9 @@ Tensor _histc_cuda( // Nondeterministic because of atomicAdd usage globalContext().alertNotDeterministic("_histc_cuda"); return AT_DISPATCH_ALL_TYPES(self.scalar_type(), "histc", [&] { - return _histc_cuda_template(self, nbins, min.to(), max.to()); + using bounds_t = at::acc_type; + return _histc_cuda_template( + self, nbins, min.to(), max.to()); }); } diff --git a/aten/src/ATen/native/cuda/TensorCompare.cpp b/aten/src/ATen/native/cuda/TensorCompare.cpp index 5d2c84fdaca5..b99df69f3b2a 100644 --- a/aten/src/ATen/native/cuda/TensorCompare.cpp +++ b/aten/src/ATen/native/cuda/TensorCompare.cpp @@ -1,4 +1,5 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/TensorCompare.cu b/aten/src/ATen/native/cuda/TensorCompare.cu index a786488cabef..f81c90c56517 100644 --- a/aten/src/ATen/native/cuda/TensorCompare.cu +++ b/aten/src/ATen/native/cuda/TensorCompare.cu @@ -39,12 +39,16 @@ void isneginf_kernel_impl(TensorIteratorBase &iter) { }); } -void clamp_kernel_impl(TensorIterator& iter) { +void clamp_kernel_impl(TensorIteratorBase& iter) { AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "clamp_cuda", [&] { gpu_kernel(iter, []GPU_LAMBDA(scalar_t v, scalar_t lower, scalar_t upper) -> scalar_t { // Propagate nan, which doesn't propagate automatically for ROCm if (at::_isnan(v)) { return v; + } if (at::_isnan(lower)) { + return lower; + } if (at::_isnan(upper)) { + return upper; } else { return ::min(::max(v, lower), upper); } @@ -82,50 +86,10 @@ void clamp_min_scalar_kernel_impl(TensorIteratorBase& iter, Scalar min) { launch_clamp_scalar(iter, min, min, at::native::detail::ClampLimits::Min); } -void clamp_min_kernel_impl(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "clamp_min_cuda", [&] { - if (iter.is_cpu_scalar(2)){ - Scalar min = iter.scalar_value(2); - iter.remove_operand(2); - clamp_min_scalar_kernel_impl(iter, min); - } else { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t v, scalar_t lower) -> scalar_t { - // Propagate nan, which doesn't propagate automatically for ROCm - if (_isnan(v)) { - return v; - } else { - return 
::max(v, lower); - } - }); - } - }); -} - - void clamp_max_scalar_kernel_impl(TensorIteratorBase& iter, Scalar max) { launch_clamp_scalar(iter, max, max, at::native::detail::ClampLimits::Max); } -void clamp_max_kernel_impl(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "clamp_max_cuda", [&] { - if (iter.is_cpu_scalar(2)){ - Scalar max = iter.scalar_value(2); - iter.remove_operand(2); - clamp_max_scalar_kernel_impl(iter, max); - } else { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t v, scalar_t upper) -> scalar_t { - // Propagate nan, which doesn't propagate automatically for ROCm - if (_isnan(v)) { - return v; - } else { - return ::min(v, upper); - } - }); - } - }); -} - - } // anonymous namespace @@ -133,8 +97,6 @@ REGISTER_DISPATCH(where_kernel, &where_kernel_impl); REGISTER_DISPATCH(isposinf_stub, &isposinf_kernel_impl); REGISTER_DISPATCH(isneginf_stub, &isneginf_kernel_impl); REGISTER_DISPATCH(clamp_stub, &clamp_kernel_impl); -REGISTER_DISPATCH(clamp_min_stub, &clamp_min_kernel_impl); -REGISTER_DISPATCH(clamp_max_stub, &clamp_max_kernel_impl); REGISTER_DISPATCH(clamp_scalar_stub, &clamp_scalar_kernel_impl); REGISTER_DISPATCH(clamp_min_scalar_stub, &clamp_min_scalar_kernel_impl); REGISTER_DISPATCH(clamp_max_scalar_stub, &clamp_max_scalar_kernel_impl); diff --git a/aten/src/ATen/native/cuda/TensorFactories.cu b/aten/src/ATen/native/cuda/TensorFactories.cu index 29bd7adce5a0..f442c9c9f4e1 100644 --- a/aten/src/ATen/native/cuda/TensorFactories.cu +++ b/aten/src/ATen/native/cuda/TensorFactories.cu @@ -1,14 +1,29 @@ -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include #include +#include #include #include #include -#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#endif + #include #include #include diff --git a/aten/src/ATen/native/cuda/TensorModeKernel.cpp b/aten/src/ATen/native/cuda/TensorModeKernel.cpp index 73ae5f3199b9..c04693bb72e2 100644 --- a/aten/src/ATen/native/cuda/TensorModeKernel.cpp +++ b/aten/src/ATen/native/cuda/TensorModeKernel.cpp @@ -1,5 +1,5 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include #include #include #include diff --git a/aten/src/ATen/native/cuda/TensorModeKernel.cu b/aten/src/ATen/native/cuda/TensorModeKernel.cu index 40a8e19eb445..ce76987e94e0 100644 --- a/aten/src/ATen/native/cuda/TensorModeKernel.cu +++ b/aten/src/ATen/native/cuda/TensorModeKernel.cu @@ -3,7 +3,6 @@ #include #include #include -#include #include #include #include @@ -142,7 +141,8 @@ void handle_fused_mode( int64_t slice_size, int64_t slices) { constexpr int num_threads = size / 2; - static_assert(num_threads % C10_WARP_SIZE == 0 && + int warp_size = at::cuda::warp_size(); + TORCH_INTERNAL_ASSERT(num_threads % warp_size == 0 && num_threads <= cuda_utils::kCUDABlockReduceMaxThreads, ""); const auto memsize = (sizeof(scalar_t) * size) + (2 * size * sizeof(unsigned int)); @@ -191,15 +191,9 @@ void fused_mode( case 16: case 8: case 4: - case 2: { - if (ceilPowerOf2 > 2 * C10_WARP_SIZE) { - handle_fused_mode<128, scalar_t>( - grid, self, ti_values, ti_indices, slice_size, slices); - } else { - handle_fused_mode<2 * C10_WARP_SIZE, scalar_t>( - grid, self, ti_values, ti_indices, slice_size, slices); - } - } + case 2: + handle_fused_mode<128, scalar_t>( + grid, self, ti_values, ti_indices, slice_size, slices); break; case 1: default: diff --git a/aten/src/ATen/native/cuda/TensorShapeCUDA.cpp 
b/aten/src/ATen/native/cuda/TensorShapeCUDA.cpp index cc1c523dc1a3..0bb7eb410acf 100644 --- a/aten/src/ATen/native/cuda/TensorShapeCUDA.cpp +++ b/aten/src/ATen/native/cuda/TensorShapeCUDA.cpp @@ -1,9 +1,15 @@ - -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + namespace at { namespace native { @@ -27,8 +33,8 @@ Tensor& set_storage_cuda_(Tensor& result, Storage storage, int64_t storage_offse checkSetStorage(result, storage, storage_offset, size, stride); result.unsafeGetTensorImpl()->set_storage_offset(storage_offset); - c10::optional stride_opt = stride.data() != nullptr ? - c10::optional(stride) : c10::nullopt; + at::OptionalIntArrayRef stride_opt = stride.data() != nullptr ? + at::OptionalIntArrayRef(stride) : c10::nullopt; at::native::resize_impl_cuda_(result.unsafeGetTensorImpl(), size, stride_opt); return result; } diff --git a/aten/src/ATen/native/cuda/TensorTopK.cpp b/aten/src/ATen/native/cuda/TensorTopK.cpp index 392b3ce25ce2..66cda4f38023 100644 --- a/aten/src/ATen/native/cuda/TensorTopK.cpp +++ b/aten/src/ATen/native/cuda/TensorTopK.cpp @@ -1,12 +1,26 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include -#include + +#include +#include +#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#include +#else +#include +#include +#include +#endif + namespace at { namespace native { +// TODO: remove this when CUDA <11.6 is no longer supported void topk_out_with_sort( const Tensor& self, int64_t k, int64_t dim, bool largest, @@ -14,12 +28,15 @@ void topk_out_with_sort( const Tensor& indices ) { Tensor sorted_values, sorted_indices; - std::tie(sorted_values, sorted_indices) = at::native::sort_cuda(self, dim, largest); + std::tie(sorted_values, sorted_indices) = at::cuda::sort(self, /* stable= */false, dim, largest); values.copy_(sorted_values.narrow(dim, 0, k)); indices.copy_(sorted_indices.narrow(dim, 0, k)); } +// TODO: remove this when CUDA <11.6 is no longer supported +bool disable_sort_for_topk(); bool should_use_sort(const Tensor& self, int64_t dim) { + if (disable_sort_for_topk()) return false; // This heuristics is based on the experiment in https://github.com/pytorch/pytorch/pull/68632 if (self.dim() == 0) return false; if (self.dtype() == kBool) return false; // Bool is not support by topk @@ -71,7 +88,7 @@ TORCH_IMPL_FUNC(topk_out_cuda) Tensor sortedIndices = at::empty_like(indices); Tensor sortedValues = at::empty_like(values); - sort_out_cuda(values, dim, largest, sortedValues, sortedIndices); + at::cuda::sort_outf(values, /* stable= */ false, dim, largest, sortedValues, sortedIndices); indices.copy_(indices.gather(dim, sortedIndices)); values.copy_(sortedValues); } diff --git a/aten/src/ATen/native/cuda/TensorTopK.cu b/aten/src/ATen/native/cuda/TensorTopK.cu index 7980619a7864..a4763e2d6f0d 100644 --- a/aten/src/ATen/native/cuda/TensorTopK.cu +++ b/aten/src/ATen/native/cuda/TensorTopK.cu @@ -13,6 +13,7 @@ #include #include #include +#include #include @@ -20,6 +21,12 @@ using namespace at::native; namespace at { namespace native { + +// TODO: remove this when CUDA <11.6 is no longer supported +bool disable_sort_for_topk() { + return CUB_SUPPORTS_SCAN_BY_KEY(); +} + namespace sbtopk { // single_block_topk template @@ -189,7 +196,8 @@ void launch( dim3 grid; TORCH_INTERNAL_ASSERT(getGridFromTiles(numInputSlices, grid), "Too many slices for topk"); - dim3 block(std::min(at::ceil_div((int64_t)inputSliceSize, 
(int64_t)C10_WARP_SIZE) * (int64_t)C10_WARP_SIZE, (int64_t)1024)); + int warp_size = at::cuda::warp_size(); + dim3 block(std::min(at::ceil_div((int64_t)inputSliceSize, (int64_t)warp_size) * (int64_t)warp_size, (int64_t)1024)); gatherTopK<<>>( input, inputSliceSize, @@ -208,6 +216,15 @@ void launch( namespace mbtopk { // multi_block_topk +// Assumptions: +// The number of elements can be larger than UINT32_MAX, but +// the number of total blocks can not be larger than UINT32_MAX. +// So we can not have more than UINT32_MAX slices. The actual limit +// for number of slices could be a few fold smaller than UINT32_MAX, +// because we could be using multiple blocks per slice. +// Further more, the size of each input slice is also assumped to be +// smaller than UINT32_MAX + constexpr int BLOCK_THREADS = 256; // Over what radix we are selecting values @@ -215,6 +232,8 @@ constexpr int RADIX_BITS = 8; constexpr int RADIX_DIGITS = 1 << RADIX_BITS; // 2 ^ RADIX_BITS constexpr int RADIX_MASK = (RADIX_DIGITS - 1); static_assert(RADIX_DIGITS <= BLOCK_THREADS, "radixFindKthValues kernel requires RADIX_DIGITS <= BLOCK_THREADS"); +constexpr int MIN_ITEMS_PER_THREAD = 4; +constexpr int MAX_ITEMS_PER_THREAD = 64; template __global__ void fill(T* x, T value, IndexType size) { @@ -230,42 +249,44 @@ template C10_LAUNCH_BOUNDS_1(BLOCK_THREADS) __global__ void radixFindKthValues( at::cuda::detail::TensorInfo input, - IndexType slice_size, - IndexType* ks_to_find, // size: num_slices + uint32_t slice_size, + uint32_t* ks_to_find, // size: num_slices - IndexType num_slices, + uint32_t num_slices, IndexType withinSliceStride, int current_bit, int items_per_thread, - IndexType blocks_per_slice, + uint32_t blocks_per_slice, Bitwise desiredMask, // outputs uint32_t* semaphores, // size: num_slices Bitwise* desires, // size: num_slices - IndexType* counts, // size: num_slices * blocks_per_slice * radix_digits + short* counts, // size: num_slices * blocks_per_slice * radix_digits T* kthValues // size: num_slices, only write when current_bit reaches 0 ) { int items_per_block = items_per_thread * BLOCK_THREADS; int tidx = threadIdx.x; - IndexType block_idx = getLinearBlockId(); - IndexType slice_idx = block_idx / blocks_per_slice; - IndexType blk_idx_in_slice = block_idx % blocks_per_slice; + uint32_t block_idx = getLinearBlockId(); + uint32_t slice_idx = block_idx / blocks_per_slice; + uint32_t blk_idx_in_slice = block_idx % blocks_per_slice; if (slice_idx >= num_slices) { return; } Bitwise desired = desires[slice_idx]; - IndexType k_to_find = ks_to_find[slice_idx]; + uint32_t k_to_find = ks_to_find[slice_idx]; IndexType slice_start_index = at::cuda::detail::IndexToOffset::get(slice_idx, input); T* data = &input.data[slice_start_index]; - typedef cub::BlockScan BlockScan; + typedef cub::BlockScan BlockScan; + static_assert(MAX_ITEMS_PER_THREAD * BLOCK_THREADS < std::numeric_limits::max(), + "blockwise counter too large"); union __align__(16) TempStorage { uint32_t digit_counters[RADIX_DIGITS]; - IndexType digit_count_cumsum[RADIX_DIGITS]; // only used if this it the last block for this slice + uint32_t digit_count_cumsum[RADIX_DIGITS]; // only used if this it the last block for this slice typename BlockScan::TempStorage scan_storage; }; __shared__ TempStorage temp_storage; @@ -299,18 +320,19 @@ __global__ void radixFindKthValues( // load digit counter to register, one digit per thread static_assert(RADIX_DIGITS <= BLOCK_THREADS, "this kernel requires RADIX_DIGITS <= BLOCK_THREADS"); - IndexType digit_count = 0; + 
uint32_t digit_count = 0; if (tidx < RADIX_DIGITS) { digit_count = temp_storage.digit_counters[tidx]; } + // We always write out counts regardless if blocks_per_slice == 1 because + // it will be used to compute offsets for `gatherTopK`. + if (tidx < RADIX_DIGITS) { + counts[block_idx * RADIX_DIGITS + tidx] = digit_count; + } // if blocks_per_slice == 1, there is no need to do cross-block reduction - // in this case counts saved at registers instead of global memory + // in this case we use counts saved at registers directly if (blocks_per_slice > 1) { - - if (tidx < RADIX_DIGITS) { - counts[block_idx * RADIX_DIGITS + tidx] = digit_count; - } __threadfence(); // make sure writes are globally visible __syncthreads(); // make sure all writes are finished before update semaphores } @@ -341,7 +363,7 @@ __global__ void radixFindKthValues( } // compute the block-wide inclusive prefix sum - IndexType digit_count_cumsum; + uint32_t digit_count_cumsum; BlockScan(temp_storage.scan_storage).InclusiveSum(digit_count, digit_count_cumsum); __syncthreads(); // every thread also need the perfix_sum of it's left value for comparison, so save a copy in shared mem @@ -351,14 +373,14 @@ __global__ void radixFindKthValues( __syncthreads(); if (tidx < RADIX_DIGITS) { - IndexType digit_count_cumsum_left = (tidx == 0) ? 0 : temp_storage.digit_count_cumsum[tidx - 1]; + uint32_t digit_count_cumsum_left = (tidx == 0) ? 0 : temp_storage.digit_count_cumsum[tidx - 1]; // if not the last pass: update desired and ks_to_find // if last pass: write out the kth value if (digit_count_cumsum_left < k_to_find && k_to_find <= digit_count_cumsum) { desired = at::cuda::Bitfield::setBitfield(desired, tidx, current_bit, RADIX_BITS); + desires[slice_idx] = desired; if (current_bit > 0) { - desires[slice_idx] = desired; ks_to_find[slice_idx] = k_to_find - digit_count_cumsum_left; } else { kthValues[slice_idx] = TopKTypeConfig::deconvert(desired); @@ -370,7 +392,199 @@ __global__ void radixFindKthValues( if (tidx == 0) { semaphores[slice_idx] = 0; } -}; +} + +#if CUB_SUPPORTS_SCAN_BY_KEY() +// Assumption: k can not be larger than UINT32_MAX +template +C10_LAUNCH_BOUNDS_1(RADIX_DIGITS) // one thread per digit +__global__ void computeBlockwiseWithinKCounts( + Bitwise* desires, // size: num_slices + short* counts, // size: num_slices * blocks_per_slice * radix_digits + uint32_t blocks_per_slice, + int current_bit, + bool largest, + // outputs: + uint32_t* withinKCounts // size: num_slices * blocks_per_slice == num_blocks +) { + // This kernel should be launched with the same number of blocks as the `radixFindKthValues` kernel. + int tidx = threadIdx.x; + uint32_t block_idx = getLinearBlockId(); + uint32_t slice_idx = block_idx / blocks_per_slice; + + Bitwise desired = doLdg(desires + slice_idx); + Bitwise desired_digit = at::cuda::Bitfield::getBitfield(desired, current_bit, RADIX_BITS); + + // if largest, then only threads that has tidx > desired_digit are active + // if !largest, then only threads that has tidx < desired_digit are active + // each active thread will read the count for its corresponding, and + // do warp reduction followed by shared memory reduction to get the total count + // non-active thread should not load, and non-active warp should not do reduction. 
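// Illustrative sketch (not part of this patch): the reduction described above (warp shuffles,
// one partial per warp stashed in shared memory, then a final reduction by the first warp) is
// the standard two-level block reduction. A self-contained CUDA version, assuming a warp size
// of 32 and blockDim.x a multiple of 32, with hypothetical names (block_sum, partials):

__device__ unsigned int block_sum(unsigned int val) {
  const int lane = threadIdx.x % 32;
  const int warp = threadIdx.x / 32;
  // Level 1: tree reduction inside each warp via shuffles.
  for (int offset = 16; offset > 0; offset /= 2) {
    val += __shfl_down_sync(0xffffffff, val, offset);
  }
  __shared__ unsigned int partials[32];            // enough for blockDim.x <= 1024
  if (lane == 0) {
    partials[warp] = val;                          // lane 0 now holds its warp's total
  }
  __syncthreads();
  // Level 2: the first warp reduces the per-warp partials.
  const int num_warps = (blockDim.x + 31) / 32;
  val = (threadIdx.x < num_warps) ? partials[threadIdx.x] : 0;
  if (warp == 0) {
    for (int offset = 16; offset > 0; offset /= 2) {
      val += __shfl_down_sync(0xffffffff, val, offset);
    }
  }
  return val;                                      // only thread 0 holds the full block sum
}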
+ bool warp_is_active, thread_is_active; + int warp = tidx / C10_WARP_SIZE; + if (largest) { + int end_of_warp = warp * C10_WARP_SIZE + C10_WARP_SIZE - 1; + warp_is_active = end_of_warp > desired_digit; + thread_is_active = tidx > desired_digit; + } else { + int start_of_warp = warp * C10_WARP_SIZE; + warp_is_active = start_of_warp < desired_digit; + thread_is_active = tidx < desired_digit; + } + uint32_t count = 0; + if (warp_is_active) { + if (thread_is_active) { + count = doLdg(counts + block_idx * RADIX_DIGITS + tidx); + } + for (int offset = C10_WARP_SIZE / 2; offset > 0; offset /= 2) { + count += WARP_SHFL_DOWN(count, offset); + } + } + + constexpr int num_warps = RADIX_DIGITS / C10_WARP_SIZE; + __shared__ uint32_t warp_counts[num_warps]; + if (tidx % C10_WARP_SIZE == 0) { + warp_counts[warp] = count; + } + __syncthreads(); + static_assert(RADIX_DIGITS < C10_WARP_SIZE * C10_WARP_SIZE, + "Assuming only 1 warp is needed for final reduction"); + if (warp != 0) { + return; + } + count = 0; + if (tidx < num_warps) { + count = warp_counts[tidx]; + } + for (int offset = num_warps / 2; offset > 0; offset /= 2) { + count += WARP_SHFL_DOWN(count, offset); + } + if (tidx == 0) { + withinKCounts[block_idx] += count; + } +} + +// Assumption: slice_size can not be larger than UINT32_MAX +template +__global__ void computeBlockwiseKthCounts( + Bitwise* desires, // size: num_slices + short* counts, // size: num_slices * blocks_per_slice * radix_digits + uint32_t num_blocks, // the number of blocks used by `radixFindKthValues` kernel + uint32_t blocks_per_slice, + // outputs: + uint32_t* kthCounts // size: num_slices * blocks_per_slice == num_blocks +) { + CUDA_KERNEL_LOOP_TYPE(idx, num_blocks, uint32_t) { + uint32_t slice_idx = idx / blocks_per_slice; + Bitwise desired = doLdg(desires + slice_idx); + Bitwise desired_digit = at::cuda::Bitfield::getBitfield(desired, 0, RADIX_BITS); + kthCounts[idx] = doLdg(counts + idx * RADIX_DIGITS + desired_digit); + } +} + +template +C10_LAUNCH_BOUNDS_1(BLOCK_THREADS) +__global__ void gatherTopK(at::cuda::detail::TensorInfo input, + IndexType inputSliceSize, + IndexType outputSliceSize, // aka `k` + bool largest, + + uint32_t numInputSlices, + IndexType inputWithinSliceStride, + + at::cuda::detail::TensorInfo topK, + IndexType topKWithinSliceStride, + + at::cuda::detail::TensorInfo indices, + IndexType indicesWithinSliceStride, + + uint32_t items_per_thread, + uint32_t blocks_per_slice, + + T *kthValues, + uint32_t* withinKCounts, + uint32_t* kthCounts) { + + uint32_t items_per_block = items_per_thread * BLOCK_THREADS; + uint32_t tidx = threadIdx.x; + uint32_t block_idx = getLinearBlockId(); + uint32_t slice_idx = block_idx / blocks_per_slice; + uint32_t blk_idx_in_slice = block_idx % blocks_per_slice; + + items_per_thread = (blk_idx_in_slice + 1 < blocks_per_slice) + ? 
items_per_thread + : at::ceil_div((int64_t)(inputSliceSize - blk_idx_in_slice * items_per_block), (int64_t)BLOCK_THREADS); + + // Find the start offset for our slice + IndexType sliceStartIndex = + at::cuda::detail::IndexToOffset::get(slice_idx, input); + IndexType topKSliceStartIndex = + at::cuda::detail::IndexToOffset::get(slice_idx, topK); + IndexType indicesSliceStartIndex = + at::cuda::detail::IndexToOffset::get(slice_idx, indices); + + T* inputSliceStart = &input.data[sliceStartIndex]; + T* topKSliceStart = &topK.data[topKSliceStartIndex]; + int64_t* indicesSliceStart = &indices.data[indicesSliceStartIndex]; + + // Find the k-th highest element in our input + T kthValue = kthValues[slice_idx]; + const auto kthValueConverted = at::native::TopKTypeConfig::convert(kthValue); + + // Find the start index in output tensor of this block + uint32_t startWithinK = 0; + if (blk_idx_in_slice > 0) { + startWithinK = withinKCounts[block_idx - 1]; + } + uint32_t startKth = withinKCounts[slice_idx * blocks_per_slice + blocks_per_slice - 1]; + if (blk_idx_in_slice > 0) { + startKth += kthCounts[block_idx - 1]; + } + + // Read input, select topk out and write + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage temp_storage; + for (int i = 0; i < items_per_thread; ++i) { + // Find the start offset for this slice + IndexType idx = blk_idx_in_slice * items_per_block + i * BLOCK_THREADS + tidx; + T val; + int withinK = 0; + int kth = 0; + if (idx < inputSliceSize) { + val = doLdg(inputSliceStart + idx * inputWithinSliceStride); + const auto valConverted = at::native::TopKTypeConfig::convert(val); + withinK = (largest ? valConverted > kthValueConverted : valConverted < kthValueConverted); + kth = (valConverted == kthValueConverted); + } + + uint32_t withinKIndex; + uint32_t numWithinK; + BlockScan(temp_storage).ExclusiveSum(withinK, withinKIndex, numWithinK); + __syncthreads(); + if (withinK) { + uint32_t offset = withinKIndex + startWithinK; + topKSliceStart[offset * topKWithinSliceStride] = val; + indicesSliceStart[offset * indicesWithinSliceStride] = idx; + } + startWithinK += numWithinK; + + if (startKth < outputSliceSize) { + uint32_t kthIndex; + uint32_t numKth; + BlockScan(temp_storage).ExclusiveSum(kth, kthIndex, numKth); + __syncthreads(); + if (kth) { + uint32_t offset = kthIndex + startKth; + if (offset < outputSliceSize) { + topKSliceStart[offset * topKWithinSliceStride] = val; + indicesSliceStart[offset * indicesWithinSliceStride] = idx; + } + } + startKth += numKth; + } + } +} +#endif int get_items_per_thread(uint64_t num_slices, uint64_t slice_size) { // occupancy of this kernel is limited by registers per threads @@ -391,10 +605,19 @@ int get_items_per_thread(uint64_t num_slices, uint64_t slice_size) { #endif int blocks_per_mp = std::min(regs_per_mp / REGS_PER_BLOCK, max_blocks_per_mp); int64_t items_per_thread = at::ceil_div((int64_t)(slice_size * num_slices), (int64_t)(mpc * blocks_per_mp * BLOCK_THREADS)); - items_per_thread = std::max(4, std::min((int)items_per_thread, 64)); // clamp to (4, 64) + items_per_thread = std::max(MIN_ITEMS_PER_THREAD, std::min((int)items_per_thread, MAX_ITEMS_PER_THREAD)); // clamp to (4, 64) return items_per_thread; } +class BlockIdxToKey { + uint32_t blocks_per_slice; +public: + BlockIdxToKey(uint32_t blocks_per_slice): blocks_per_slice(blocks_per_slice) {} + __device__ __forceinline__ uint32_t operator()(uint32_t blk) const { + return blk / blocks_per_slice; + } +}; + template void launch( at::cuda::detail::TensorInfo input, 
@@ -402,7 +625,7 @@ void launch( IndexType outputSliceSize, // aka `k` bool largest, - IndexType numInputSlices, + uint32_t numInputSlices, IndexType inputWithinSliceStride, at::cuda::detail::TensorInfo topK, @@ -410,14 +633,15 @@ void launch( at::cuda::detail::TensorInfo indices, IndexType indicesWithinSliceStride) { + auto stream = c10::cuda::getCurrentCUDAStream(); // configure items_per_thread based on device architecture and input size int items_per_thread = get_items_per_thread(numInputSlices, inputSliceSize); int items_per_block = items_per_thread * BLOCK_THREADS; using Bitwise = typename TopKTypeConfig::RadixType; - int64_t blocks_per_slice = at::ceil_div((int64_t)inputSliceSize, (int64_t)items_per_block); - int64_t num_blocks = numInputSlices * blocks_per_slice; + uint32_t blocks_per_slice = at::ceil_div((int64_t)inputSliceSize, (int64_t)items_per_block); + uint32_t num_blocks = numInputSlices * blocks_per_slice; // temporary storage auto& allocator = *c10::cuda::CUDACachingAllocator::get(); @@ -428,20 +652,31 @@ void launch( TORCH_CHECK(blocks_per_slice <= std::numeric_limits::max(), "blocks_per_slice larger than uint32 maximum is not supported"); auto semaphores_buffer = allocator.allocate(numInputSlices * sizeof(uint32_t)); uint32_t* semaphores = reinterpret_cast(semaphores_buffer.get()); - AT_CUDA_CHECK(cudaMemsetAsync(semaphores, 0, numInputSlices * sizeof(uint32_t), c10::cuda::getCurrentCUDAStream())); + AT_CUDA_CHECK(cudaMemsetAsync(semaphores, 0, numInputSlices * sizeof(uint32_t), stream)); - auto ks_to_find_buffer = allocator.allocate(numInputSlices * sizeof(IndexType)); - IndexType* ks_to_find = reinterpret_cast(ks_to_find_buffer.get()); - IndexType k_to_find = largest ? inputSliceSize - outputSliceSize + 1: outputSliceSize; - fill<<>>( + auto ks_to_find_buffer = allocator.allocate(numInputSlices * sizeof(uint32_t)); + uint32_t* ks_to_find = reinterpret_cast(ks_to_find_buffer.get()); + uint32_t k_to_find = largest ? 
inputSliceSize - outputSliceSize + 1: outputSliceSize; + fill<<>>( ks_to_find, k_to_find, numInputSlices); C10_CUDA_KERNEL_LAUNCH_CHECK(); auto desired_buffer = allocator.allocate(numInputSlices * sizeof(Bitwise)); Bitwise* desired = reinterpret_cast(desired_buffer.get()); - auto counts_buffer = allocator.allocate(num_blocks * RADIX_DIGITS * sizeof(IndexType)); - IndexType* counts = reinterpret_cast(counts_buffer.get()); + auto counts_buffer = allocator.allocate(num_blocks * RADIX_DIGITS * sizeof(short)); + short* counts = reinterpret_cast(counts_buffer.get()); + static_assert(MAX_ITEMS_PER_THREAD * BLOCK_THREADS < std::numeric_limits::max(), + "blockwise counter too large"); + +#if CUB_SUPPORTS_SCAN_BY_KEY() + auto withinKCounts_buffer = allocator.allocate(num_blocks * sizeof(uint32_t)); + uint32_t* withinKCounts = reinterpret_cast(withinKCounts_buffer.get()); + AT_CUDA_CHECK(cudaMemsetAsync(withinKCounts, 0, num_blocks * sizeof(uint32_t), stream)); + + auto kthCounts_buffer = allocator.allocate(num_blocks * sizeof(uint32_t)); + uint32_t* kthCounts = reinterpret_cast(kthCounts_buffer.get()); +#endif Bitwise desiredMask = 0; dim3 grid; @@ -450,7 +685,7 @@ void launch( // iterate radix bits for multiple passes for (int current_bit = sizeof(T) * 8 - RADIX_BITS; current_bit >= 0; current_bit -= RADIX_BITS) { - radixFindKthValues<<>>( + radixFindKthValues<<>>( input, inputSliceSize, ks_to_find, @@ -465,15 +700,38 @@ void launch( counts, kthValues); C10_CUDA_KERNEL_LAUNCH_CHECK(); +#if CUB_SUPPORTS_SCAN_BY_KEY() + computeBlockwiseWithinKCounts<<>>( + desired, counts, blocks_per_slice, current_bit, largest, withinKCounts); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +#endif desiredMask = at::cuda::Bitfield::setBitfield(desiredMask, RADIX_MASK, current_bit, RADIX_BITS); } +#if CUB_SUPPORTS_SCAN_BY_KEY() + computeBlockwiseKthCounts<<>>( + desired, counts, num_blocks, blocks_per_slice, kthCounts); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + // Do a prefix scan of withinKCounts and kthCounts using slice_idx as keys to get the starting index of each block + using counting_iter_t = cub::CountingInputIterator; + using slice_idx_iter_t = cub::TransformInputIterator; + slice_idx_iter_t slice_idx_iter(counting_iter_t(0), BlockIdxToKey(blocks_per_slice)); + at::cuda::cub::inclusive_sum_by_key(slice_idx_iter, withinKCounts, withinKCounts, num_blocks); + at::cuda::cub::inclusive_sum_by_key(slice_idx_iter, kthCounts, kthCounts, num_blocks); + // copy topk values to output tensor + gatherTopK<<>>( + input, inputSliceSize, outputSliceSize, largest, numInputSlices, inputWithinSliceStride, + topK, topKWithinSliceStride, indices, indicesWithinSliceStride, items_per_thread, + blocks_per_slice, kthValues, withinKCounts, kthCounts); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +#else // Find topk values based on kth values { dim3 grid; TORCH_INTERNAL_ASSERT(getGridFromTiles(numInputSlices, grid), "Too many slices for topk"); - dim3 block(std::min(at::ceil_div((int64_t)inputSliceSize, (int64_t)C10_WARP_SIZE) * (int64_t)C10_WARP_SIZE, (int64_t)1024)); - sbtopk::gatherTopK<<>>( + int warp_size = at::cuda::warp_size(); + dim3 block(std::min(at::ceil_div((int64_t)inputSliceSize, (int64_t)warp_size) * (int64_t)warp_size, (int64_t)1024)); + sbtopk::gatherTopK<<>>( input, inputSliceSize, outputSliceSize, @@ -487,15 +745,29 @@ void launch( kthValues); C10_CUDA_KERNEL_LAUNCH_CHECK(); } +#endif } } // namespace mbtopk bool should_use_multiblock(int64_t num_slices, int64_t slice_size) { + if (num_slices > std::numeric_limits::max() || + slice_size > 
std::numeric_limits::max()) return false; +#if CUB_SUPPORTS_SCAN_BY_KEY() + // This heuristics is based on the experiment in https://github.com/pytorch/pytorch/pull/74267 + return (num_slices <= 20 && slice_size >= 20000) || + (num_slices > 20 && num_slices <= 40 && slice_size >= 10000) || + (num_slices > 40 && num_slices <= 80 && slice_size >= 8000) || + (num_slices > 80 && num_slices < 200 && slice_size >= 5000) || + (num_slices >= 200 && num_slices < 800 && slice_size >= 3000) || + (num_slices >= 800 && num_slices <= 4000 && slice_size >= 800) || + (num_slices > 4000 && slice_size >= 400); +#else // This heuristics is based on the experiment in https://github.com/pytorch/pytorch/pull/71081 return (num_slices <= 400 && slice_size >= 5000) || - (num_slices >= 400 && num_slices < 4000 && slice_size >= 1000) || + (num_slices > 400 && num_slices < 4000 && slice_size >= 1000) || (num_slices >= 4000 && slice_size >= 300); +#endif } void launch_gather_topk_kernel( diff --git a/aten/src/ATen/native/cuda/TensorTransformations.cu b/aten/src/ATen/native/cuda/TensorTransformations.cu index d46a5613df78..335d746294d0 100644 --- a/aten/src/ATen/native/cuda/TensorTransformations.cu +++ b/aten/src/ATen/native/cuda/TensorTransformations.cu @@ -1,11 +1,20 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#include #include -#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif + #include #include diff --git a/aten/src/ATen/native/cuda/TriangularOps.cu b/aten/src/ATen/native/cuda/TriangularOps.cu index 3a0f8fb1e4d1..2d7bf30309dc 100644 --- a/aten/src/ATen/native/cuda/TriangularOps.cu +++ b/aten/src/ATen/native/cuda/TriangularOps.cu @@ -1,15 +1,20 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include #include #include #include -#include #include #ifndef AT_PER_OPERATOR_HEADERS #include +#include #else #include +#include +#include +#include +#include #endif #include diff --git a/aten/src/ATen/native/cuda/UnaryComplexKernels.cu b/aten/src/ATen/native/cuda/UnaryComplexKernels.cu index 07be6bb96556..0589c3ba4f0d 100644 --- a/aten/src/ATen/native/cuda/UnaryComplexKernels.cu +++ b/aten/src/ATen/native/cuda/UnaryComplexKernels.cu @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -20,53 +21,41 @@ __host__ __device__ static inline scalar_t angle_wrapper(scalar_t v) { template __host__ __device__ static inline c10::complex angle_wrapper(c10::complex v) { - return std::arg(v); + return c10::complex{std::arg(v), 0}; } +const char angle_name[] = "angle_kernel"; void angle_kernel_cuda(TensorIteratorBase& iter) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.common_dtype(), "angle_cuda", [&]() { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { - return angle_wrapper(a); + auto dtype = iter.common_dtype(); + if (at::isComplexType(dtype)) { +#if AT_USE_JITERATOR() + static const auto angle_string = jiterator_stringify( + template + T angle_kernel(T v) { + return T{std::arg(v)}; + } + ); // angle string + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "angle_cuda", [&]() { + jitted_gpu_kernel< + /*name=*/ angle_name, + /*return_dtype=*/ scalar_t, + /*common_dtype=*/ scalar_t, + /*arity=*/ 1>(iter, angle_string); }); - }); -} - -// We manually overload real because std::real does not work types other than c10::complex. 
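// The complex-dtype support added across the unary kernels below follows one
// recurring shape, condensed here with a placeholder identity op. The my_op_*
// names are hypothetical, and the sketch assumes the same ATen/native CUDA headers
// and at::native namespace as the surrounding files: stringify the device functor
// so it can be JIT-compiled, route it through jitted_gpu_kernel when the jiterator
// is available, and otherwise fall back to a plain gpu_kernel lambda.
const char my_op_name[] = "my_op_kernel";
void my_op_kernel_cuda(TensorIteratorBase& iter) {
  auto common_dtype = iter.common_dtype();
  if (at::isComplexType(common_dtype)) {
#if AT_USE_JITERATOR()
    static const auto my_op_string = jiterator_stringify(
        template <typename T> T my_op_kernel(T x) { return x; });
    AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, common_dtype, "my_op_cuda", [&]() {
      jitted_gpu_kernel<
          /*name=*/my_op_name,
          /*return_dtype=*/scalar_t,
          /*common_dtype=*/scalar_t,
          /*arity=*/1>(iter, my_op_string);
    });
#else
    AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, common_dtype, "my_op_cuda", [&]() {
      gpu_kernel(iter, [] GPU_LAMBDA(scalar_t a) -> scalar_t { return a; });
    });
#endif
  } else {
    AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16,
                                    common_dtype, "my_op_cuda", [&]() {
      gpu_kernel(iter, [] GPU_LAMBDA(scalar_t a) -> scalar_t { return a; });
    });
  }
}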
-template -__host__ __device__ static inline scalar_t real_wrapper(scalar_t v) { - return v; -} - -template -__host__ __device__ static inline c10::complex real_wrapper(c10::complex v) { - return v.real(); -} - -void real_kernel_cuda(TensorIteratorBase& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX(iter.dtype(), "real_cuda", [&]() { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { - return real_wrapper(a); +#else + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "angle_cuda", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + return angle_wrapper(a); + }); }); - }); -} - -// We manually overload imag because std::imag does not work types other than c10::complex. -template -__host__ __device__ static inline scalar_t imag_wrapper(scalar_t v) { - return 0; -} - -template -__host__ __device__ static inline c10::complex imag_wrapper(c10::complex v) { - return v.imag(); -} - -void imag_kernel_cuda(TensorIteratorBase& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX(iter.dtype(), "imag_cuda", [&]() { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { - return imag_wrapper(a); +#endif + } else { + AT_DISPATCH_FLOATING_TYPES(dtype, "angle_cuda", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + return angle_wrapper(a); + }); }); - }); + } } // We manually overload conj because std::conj does not work types other than c10::complex. @@ -81,18 +70,35 @@ __host__ __device__ static inline c10::complex conj_wrapper(c10::complex v } // NB: Ignores the negative bit on tensors +const char conj_name[] = "conj_kernel"; void conj_kernel_cuda(TensorIteratorBase& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( + auto common_dtype = iter.common_dtype(); + if (common_dtype == kComplexHalf) { + using scalar_t = c10::complex; + #if AT_USE_JITERATOR() + static const auto conj_string = jiterator_stringify( + template + T conj_kernel(T z) { + return std::conj(z); + } + ); + jitted_gpu_kernel(iter, conj_string); + #else + gpu_kernel(iter, [] GPU_LAMBDA(scalar_t a) -> scalar_t { + return conj_wrapper(a); + }); + #endif + } else { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( kBool, kBFloat16, kHalf, iter.common_dtype(), "conj_cuda", [&]() { gpu_kernel(iter, [] GPU_LAMBDA(scalar_t a) -> scalar_t { return conj_wrapper(a); }); - }); + }); + } } REGISTER_DISPATCH(angle_stub, &angle_kernel_cuda); -REGISTER_DISPATCH(real_stub, &real_kernel_cuda); -REGISTER_DISPATCH(imag_stub, &imag_kernel_cuda); REGISTER_DISPATCH(conj_physical_stub, &conj_kernel_cuda); }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/UnaryLogKernels.cu b/aten/src/ATen/native/cuda/UnaryLogKernels.cu index 47f88383de42..c0187284b98b 100644 --- a/aten/src/ATen/native/cuda/UnaryLogKernels.cu +++ b/aten/src/ATen/native/cuda/UnaryLogKernels.cu @@ -4,26 +4,73 @@ #include #include #include +#include +#include #include #include #include namespace at { namespace native { +const char log_name[] = "log_kernel"; void log_kernel_cuda(TensorIteratorBase& iter) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "log_cuda", [&]() { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { - return ::log(a); + auto common_dtype = iter.common_dtype(); + if (at::isComplexType(common_dtype)) { +#if AT_USE_JITERATOR() + static const auto log_string = jiterator_stringify( + template T log_kernel(T x) { return std::log(x); }); + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, common_dtype, "log_cuda", [&]() { + jitted_gpu_kernel< + /*name=*/log_name, + 
/*return_dtype=*/scalar_t, + /*common_dtype=*/scalar_t, + /*arity=*/1>(iter, log_string); }); - }); +#else + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, iter.common_dtype(), "log_cuda", [&]() { + gpu_kernel( + iter, [] GPU_LAMBDA(scalar_t a) -> scalar_t { + using opmath_t = at::opmath_type; + return ::log(static_cast(a)); + }); + }); +#endif + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "log_cuda", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + return ::log(a); + }); + }); + } } +const char log10_name[] = "log10_kernel"; void log10_kernel_cuda(TensorIteratorBase& iter) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "log10_cuda", [&]() { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { - return ::log10(a); + auto common_dtype = iter.common_dtype(); + if (at::isComplexType(common_dtype)) { +#if AT_USE_JITERATOR() + static const auto log10_string = jiterator_stringify( + template T log10_kernel(T x) { return std::log10(x); }); + AT_DISPATCH_COMPLEX_TYPES(common_dtype, "log10_cuda", [&]() { + jitted_gpu_kernel< + /*name=*/log10_name, + /*return_dtype=*/scalar_t, + /*common_dtype=*/scalar_t, + /*arity=*/1>(iter, log10_string); }); - }); +#else + AT_DISPATCH_COMPLEX_TYPES(iter.common_dtype(), "log10_cuda", [&]() { + gpu_kernel( + iter, [] GPU_LAMBDA(scalar_t a) -> scalar_t { return ::log10(a); }); + }); +#endif + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "log10_cuda", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + return ::log10(a); + }); + }); + } } void log1p_kernel_cuda(TensorIteratorBase& iter) { @@ -34,12 +81,33 @@ void log1p_kernel_cuda(TensorIteratorBase& iter) { }); } +const char log2_name[] = "log2_kernel"; void log2_kernel_cuda(TensorIteratorBase& iter) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "log2_cuda", [&]() { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { - return ::log2(a); + auto common_dtype = iter.common_dtype(); + if (at::isComplexType(common_dtype)) { +#if AT_USE_JITERATOR() + static const auto log2_string = jiterator_stringify( + template T log2_kernel(T x) { return std::log2(x); }); + AT_DISPATCH_COMPLEX_TYPES(common_dtype, "log2_cuda", [&]() { + jitted_gpu_kernel< + /*name=*/log2_name, + /*return_dtype=*/scalar_t, + /*common_dtype=*/scalar_t, + /*arity=*/1>(iter, log2_string); }); - }); +#else + AT_DISPATCH_COMPLEX_TYPES(iter.common_dtype(), "log2_cuda", [&]() { + gpu_kernel( + iter, [] GPU_LAMBDA(scalar_t a) -> scalar_t { return ::log2(a); }); + }); +#endif + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "log2_cuda", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + return ::log2(a); + }); + }); + } } REGISTER_DISPATCH(log_stub, &log_kernel_cuda); diff --git a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu index 671ce1d6cbcd..85c3fb7a1005 100644 --- a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu +++ b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu @@ -8,6 +8,8 @@ #include #include #include +#include +#include #include #include #include @@ -32,12 +34,38 @@ void bitwise_not_kernel_cuda(TensorIteratorBase& iter) { } } +const char exp_name[] = "exp_kernel"; void exp_kernel_cuda(TensorIteratorBase& iter) { - 
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.common_dtype(), "exp_cuda", [&]() { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { - return std::exp(a); + auto common_dtype = iter.common_dtype(); + if (at::isComplexType(common_dtype)) { + #if AT_USE_JITERATOR() + static const auto exp_string = jiterator_stringify( + template + T exp_kernel(T x) { + return std::exp(x); + }); // exp_string + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, common_dtype, "exp_cuda", [&]() { + jitted_gpu_kernel< + /*name=*/exp_name, + /*return_dtype=*/scalar_t, + /*common_dtype=*/scalar_t, + /*arity=*/1>(iter, exp_string); + }); + #else + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, common_dtype, "exp_cuda", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + using opmath_t = at::opmath_type; + return std::exp(static_cast(a)); + }); + }); + #endif + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, common_dtype, "exp_cuda", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + return std::exp(a); + }); }); - }); + } } void expm1_kernel_cuda(TensorIteratorBase& iter) { @@ -53,19 +81,45 @@ void expm1_kernel_cuda(TensorIteratorBase& iter) { // We manually overload rsqrt because std::rsqrt does not work with complex types. template -__host__ __device__ static inline scalar_t rsqrt_wrapper(scalar_t v) { +C10_HOST_DEVICE static inline scalar_t rsqrt_wrapper(scalar_t v) { return ::rsqrt(v); } template -__host__ __device__ static inline c10::complex rsqrt_wrapper(c10::complex v) { +C10_HOST_DEVICE static inline c10::complex rsqrt_wrapper(c10::complex v) { const c10::complex one = c10::complex(1.0, 0); // std::sqrt for c10::complex is overloaded in c10/util/complex_math.h return one / ::sqrt(v); } +const char rsqrt_name[] = "rsqrt_kernel"; void rsqrt_kernel_cuda(TensorIteratorBase& iter) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( + auto common_dtype = iter.common_dtype(); + if (at::isComplexType(common_dtype)) { + #if AT_USE_JITERATOR() + static const auto rsqrt_string = jiterator_stringify( + template + T rsqrt_kernel(T x) { + const T one = T{1}; + return one / std::sqrt(x); + }); // rsqrt_string + AT_DISPATCH_COMPLEX_TYPES(common_dtype, "rsqrt_cuda", [&]() { + jitted_gpu_kernel< + /*name=*/rsqrt_name, + /*return_dtype=*/scalar_t, + /*common_dtype=*/scalar_t, + /*arity=*/1>(iter, rsqrt_string); + }); + #else + AT_DISPATCH_COMPLEX_TYPES(common_dtype, "rsqrt_cuda", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + // In CUDA, ::rsqrt is overloaded for float and at::Half here is implicitly cast to float. 
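// For the complex path, rsqrt is simply 1 / sqrt(z) (see rsqrt_wrapper and the
// rsqrt_string above). A tiny host-side sanity check of that identity, independent
// of the CUDA code: (1/sqrt(z))^2 * z should come back to 1.
#include <cassert>
#include <cmath>
#include <complex>

int main() {
  std::complex<double> z{3.0, 4.0};
  std::complex<double> r = std::complex<double>{1.0, 0.0} / std::sqrt(z);
  std::complex<double> p = r * r * z;
  assert(std::abs(p.real() - 1.0) < 1e-12);
  assert(std::abs(p.imag()) < 1e-12);
  return 0;
}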
+ return rsqrt_wrapper(a); + }); + }); + #endif + } else { + AT_DISPATCH_FLOATING_TYPES_AND2( ScalarType::BFloat16, ScalarType::Half, iter.common_dtype(), "rsqrt_cuda", [&]() { @@ -74,14 +128,40 @@ void rsqrt_kernel_cuda(TensorIteratorBase& iter) { return rsqrt_wrapper(a); }); }); + } } +const char sqrt_name[] = "sqrt_kernel"; void sqrt_kernel_cuda(TensorIteratorBase& iter) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "sqrt_cuda", [&]() { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { - return ::sqrt(a); + auto common_dtype = iter.common_dtype(); + if (at::isComplexType(common_dtype)) { + #if AT_USE_JITERATOR() + static const auto sqrt_string = jiterator_stringify( + template + T sqrt_kernel(T x) { + return std::sqrt(x); + }); // sqrt_string + AT_DISPATCH_COMPLEX_TYPES(common_dtype, "sqrt_cuda", [&]() { + jitted_gpu_kernel< + /*name=*/sqrt_name, + /*return_dtype=*/scalar_t, + /*common_dtype=*/scalar_t, + /*arity=*/1>(iter, sqrt_string); + }); + #else + AT_DISPATCH_COMPLEX_TYPES(common_dtype, "sqrt_cuda", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + return std::sqrt(a); + }); + }); + #endif + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, common_dtype, "sqrt_cuda", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + return std::sqrt(a); + }); }); - }); + } } void clamp_kernel_cuda(TensorIteratorBase& iter, const Scalar& min_value, const Scalar& max_value) { diff --git a/aten/src/ATen/native/cuda/UnarySignKernels.cu b/aten/src/ATen/native/cuda/UnarySignKernels.cu index b88dc6597bdd..170ae6566b75 100644 --- a/aten/src/ATen/native/cuda/UnarySignKernels.cu +++ b/aten/src/ATen/native/cuda/UnarySignKernels.cu @@ -1,12 +1,14 @@ #define TORCH_ASSERT_NO_OPERATORS #include #include +#include #include #include #include #include #include #include +#include #include @@ -23,12 +25,38 @@ void logical_not_kernel_cuda(TensorIteratorBase& iter) { } // NB: Ignores the negative bit on tensors +const char neg_name[] = "neg_kernel"; void neg_kernel_cuda(TensorIteratorBase& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "neg_cuda", [&]() { + auto dtype = iter.dtype(); + if (at::isComplexType(dtype)) { +#if AT_USE_JITERATOR() + static const auto neg_string = jiterator_stringify( + template + T neg_kernel(T a) { + return -a; + } + ); // neg_string + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "neg_cuda", [&]() { + jitted_gpu_kernel< + /*name=*/ neg_name, + /*return_dtype=*/ scalar_t, + /*common_dtype=*/ scalar_t, + /*arity=*/ 1>(iter, neg_string); + }); +#else + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "neg_cuda", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + return -a; + }); + }); +#endif + } else { + AT_DISPATCH_ALL_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, dtype, "neg_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return -a; }); }); + } } void sign_kernel_cuda(TensorIteratorBase& iter){ @@ -52,7 +80,7 @@ void signbit_kernel_cuda(TensorIteratorBase& iter){ } template -__host__ __device__ static inline c10::complex sgn_wrapper(c10::complex z) { +C10_HOST_DEVICE static inline c10::complex sgn_wrapper(c10::complex z) { if (z == c10::complex(0, 0)) { return c10::complex(0, 0); } else { @@ -60,13 +88,38 @@ __host__ __device__ static inline c10::complex sgn_wrapper(c10::complex z) } } +const char sgn_name[] = "sgn_kernel"; void 
sgn_kernel_cuda(TensorIteratorBase& iter){ - AT_DISPATCH_COMPLEX_TYPES(iter.dtype(), "sgn_cuda", [&]() { + auto dtype = iter.dtype(); + #if AT_USE_JITERATOR() + static const auto sgn_string = jiterator_stringify( + template + T sgn_kernel(T z) { + const T zero = T(0); + if (z == zero) { + return zero; + } else { + return z / std::abs(z); + } + } + ); // sgn_string + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "sgn_cuda", [&]() { + jitted_gpu_kernel< + /*name=*/ sgn_name, + /*return_dtype=*/ scalar_t, + /*common_dtype=*/ scalar_t, + /*arity=*/ 1>(iter, sgn_string); + }); + #else + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "sgn_cuda", [&]() { + using opmath_t = at::opmath_type; gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { - return sgn_wrapper(a); + return sgn_wrapper(opmath_t{a}); }); }); + #endif } + REGISTER_DISPATCH(logical_not_stub, &logical_not_kernel_cuda); REGISTER_DISPATCH(neg_stub, &neg_kernel_cuda); REGISTER_DISPATCH(sign_stub, &sign_kernel_cuda); diff --git a/aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu b/aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu index 71a355347022..0cb0d9f238cf 100644 --- a/aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu +++ b/aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu @@ -63,7 +63,7 @@ void i0_kernel_cuda(TensorIteratorBase& iter) { } // See note [Jiterator] -const char i0e_name[] = "i0e"; +const char i0e_name[] = "calc_i0e"; void i0e_kernel_cuda(TensorIteratorBase& iter) { #if AT_USE_JITERATOR() AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "i0e_cuda", [&]() { @@ -120,12 +120,41 @@ void i1e_kernel_cuda(TensorIteratorBase& iter) { #endif } +const char sigmoid_name[] = "sigmoid"; void sigmoid_kernel_cuda(TensorIteratorBase& iter) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.common_dtype(), "sigmoid_cuda", [&]() { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { - return static_cast(1) / (static_cast(1) + std::exp(-a)); + auto common_dtype = iter.common_dtype(); + if (at::isComplexType(common_dtype)) { + // only jiterate for complex-dtype + #if AT_USE_JITERATOR() + static const auto sigmoid_string = jiterator_stringify( + template + T sigmoid(T x) { + return T{1} / (T{1} + std::exp(-x)); + } + ); // sigmoid_string + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, common_dtype, "sigmoid_cuda", [&]() { + jitted_gpu_kernel< + /*name=*/sigmoid_name, + /*return_dtype=*/scalar_t, + /*common_dtype=*/scalar_t, + /*arity=*/1>(iter, sigmoid_string); + }); + #else + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, common_dtype, "sigmoid_cuda", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + using opmath_t = at::opmath_type; + const auto one = opmath_t{1}; + return static_cast(one / (one + std::exp(-opmath_t{a}))); + }); + }); + #endif + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, common_dtype, "sigmoid_cuda", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + return scalar_t{1} / (scalar_t{1} + std::exp(-a)); + }); }); - }); + } } const char sinc_name[] = "sinc"; @@ -202,6 +231,23 @@ void ndtri_kernel_cuda(TensorIteratorBase& iter) { #endif } +const char log_ndtr_name[] = "log_ndtr"; +void log_ndtr_kernel_cuda(TensorIteratorBase& iter) { + #if AT_USE_JITERATOR() + AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "log_ndtr_cuda", [&]() { + jitted_gpu_kernel(iter, log_ndtr_string); + }); + #else + 
AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "log_ndtr_cuda", [&]() { + gpu_kernel( + iter, [] GPU_LAMBDA(scalar_t a) -> scalar_t { return calc_log_ndtr(a); }); + }); + #endif +} + void erf_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.common_dtype(), "erf_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { @@ -264,18 +310,38 @@ void erfcx_kernel_cuda(TensorIteratorBase& iter) { #endif } +const char kaiser_window_name[] = "kaiser_window"; void kaiser_window_kernel_cuda(TensorIteratorBase& iter, int64_t window_length, double beta_){ - AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.dtype(), "kaiser_window_cuda", [&](){ - using opmath_t = at::opmath_type; - const opmath_t inv_alpha = static_cast(2.0 / (window_length - 1)); - const opmath_t beta = static_cast(beta_); - const opmath_t inv_i0_beta = 1.0 / calc_i0(beta); - gpu_kernel(iter, [=]GPU_LAMBDA(scalar_t a) -> scalar_t { - opmath_t x = static_cast(a) * inv_alpha - 1; - opmath_t y = std::max(0, 1 - x * x); - return calc_i0(beta * ::sqrt(y)) * inv_i0_beta; + #if AT_USE_JITERATOR() + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.dtype(), "kaiser_window_cuda", [&](){ + using opmath_t = at::opmath_type; + const opmath_t inv_alpha = static_cast(2.0 / (window_length - 1)); + const opmath_t beta = static_cast(beta_); + const opmath_t inv_i0_beta = 1.0 / calc_i0(beta); + jitted_gpu_kernel< + /*name=*/kaiser_window_name, + /*return_dtype=*/scalar_t, + /*common_dtype=*/scalar_t, + /*arity=*/1>( + iter, + kaiser_window_string, + /*scalar_pos=*/at::cuda::jit::BinaryFuncVariant::NoScalar, + /*scalar_val=*/0, + /*extra_args=*/std::make_tuple(inv_alpha, beta, inv_i0_beta)); }); - }); + #else + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.dtype(), "kaiser_window_cuda", [&](){ + using opmath_t = at::opmath_type; + const opmath_t inv_alpha = static_cast(2.0 / (window_length - 1)); + const opmath_t beta = static_cast(beta_); + const opmath_t inv_i0_beta = 1.0 / calc_i0(beta); + gpu_kernel(iter, [=]GPU_LAMBDA(scalar_t a) -> scalar_t { + opmath_t x = static_cast(a) * inv_alpha - 1; + opmath_t y = std::max(0, 1 - x * x); + return calc_i0(beta * ::sqrt(y)) * inv_i0_beta; + }); + }); + #endif } const char entr_name[] = "entr"; @@ -322,6 +388,7 @@ REGISTER_DISPATCH(erfinv_stub, &erfinv_kernel_cuda); REGISTER_DISPATCH(kaiser_window_stub, &kaiser_window_kernel_cuda); REGISTER_DISPATCH(special_entr_stub, &entr_kernel_cuda); REGISTER_DISPATCH(special_ndtri_stub, &ndtri_kernel_cuda); +REGISTER_DISPATCH(special_log_ndtr_stub, &log_ndtr_kernel_cuda); REGISTER_DISPATCH(special_erfcx_stub, &erfcx_kernel_cuda); } // namespace native diff --git a/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu b/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu index 8b43900e9271..90f5238d0180 100644 --- a/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu +++ b/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu @@ -1,3 +1,4 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include diff --git a/aten/src/ATen/native/cuda/Unique.cu b/aten/src/ATen/native/cuda/Unique.cu index d268ca1c4903..746bba7a66c5 100644 --- a/aten/src/ATen/native/cuda/Unique.cu +++ b/aten/src/ATen/native/cuda/Unique.cu @@ -1,8 +1,22 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#include +#include 
+#include +#include +#include +#endif + #include #include #include @@ -103,7 +117,7 @@ std::tuple unique_dim_cuda_template( TORCH_CHECK( num_zero_dims == 1, "Number of zero sized dimensions is more than one, so unique cannot be applied ") - Tensor output = at::empty({0}, self.options()); + Tensor output = at::empty(sizes, self.options()); Tensor inverse_indices = at::empty({0}, self.options().dtype(kLong)); Tensor counts = at::empty({0}, self.options().dtype(kLong)); diff --git a/aten/src/ATen/native/cuda/UniqueCub.cu b/aten/src/ATen/native/cuda/UniqueCub.cu index bda84bdda4e1..cc19b96a7797 100644 --- a/aten/src/ATen/native/cuda/UniqueCub.cu +++ b/aten/src/ATen/native/cuda/UniqueCub.cu @@ -1,3 +1,4 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include @@ -5,6 +6,13 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif + namespace at { namespace native { namespace internal { diff --git a/aten/src/ATen/native/cuda/UniqueCub.cuh b/aten/src/ATen/native/cuda/UniqueCub.cuh index 1bb96e3f5ebd..6e1cccc2e175 100644 --- a/aten/src/ATen/native/cuda/UniqueCub.cuh +++ b/aten/src/ATen/native/cuda/UniqueCub.cuh @@ -1,4 +1,4 @@ -#include +#include namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/UpSample.cuh b/aten/src/ATen/native/cuda/UpSample.cuh index b609b42a4d9e..09e460640df8 100644 --- a/aten/src/ATen/native/cuda/UpSample.cuh +++ b/aten/src/ATen/native/cuda/UpSample.cuh @@ -1,7 +1,12 @@ -#include -#include +#pragma once +#include #include +#include +#include +#include +#include + #include namespace at { @@ -11,7 +16,7 @@ namespace upsample { // TODO: Remove duplicate declaration. TORCH_API c10::SmallVector compute_output_size( c10::IntArrayRef input_size, // Full input tensor size. 
- c10::optional output_size, + at::OptionalIntArrayRef output_size, c10::optional> scale_factors); } // namespace upsample diff --git a/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu b/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu index 29dec1735f23..1214955b06d4 100644 --- a/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu +++ b/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu @@ -1,12 +1,21 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include -#include +#include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif + namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu b/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu index d5153838139f..d76e2783207f 100644 --- a/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu +++ b/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu @@ -1,9 +1,10 @@ // Adapted from interp.cpp from Caffe util by Pauline Luc // Originally developed by George Papandreou -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include -#include +#include +#include #include #include #include @@ -12,6 +13,20 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#endif + namespace at { namespace native { namespace { @@ -456,7 +471,6 @@ C10_LAUNCH_BOUNDS_1(256) // 256 performs better then 1024 __global__ void upsample_gen2d_aa_out_frame( const accscalar_t height_scale, const accscalar_t width_scale, - const bool align_corners, const PackedTensorAccessor64 idata, PackedTensorAccessor64 odata, const InterpFilter & interp_filter) { @@ -550,7 +564,6 @@ C10_LAUNCH_BOUNDS_1(256) // 256 performs better then 1024 __global__ void upsample_gen2d_aa_backward_out_frame( const accscalar_t height_scale, const accscalar_t width_scale, - const bool align_corners, PackedTensorAccessor64 idata, const PackedTensorAccessor64 odata, const InterpFilter & interp_filter) { @@ -672,8 +685,6 @@ static void upsample_gen2d_aa_out_cuda_template( int output_height = output_size[0]; int output_width = output_size[1]; - int nbatch = input.size(0); - int channels = input.size(1); int input_height = input.size(2); int input_width = input.size(3); @@ -735,7 +746,7 @@ static void upsample_gen2d_aa_out_cuda_template( <<>>(height_scale, width_scale, align_corners, idata, odata, interp_filter); + stream>>>(height_scale, width_scale, idata, odata, interp_filter); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); @@ -766,8 +777,6 @@ static void upsample_gen2d_aa_backward_out_cuda_template( int output_height = output_size[0]; int output_width = output_size[1]; - int nbatch = input_size[0]; - int channels = input_size[1]; int input_height = input_size[2]; int input_width = input_size[3]; @@ -819,7 +828,7 @@ static void upsample_gen2d_aa_backward_out_cuda_template( <<>>(height_scale, width_scale, align_corners, idata, odata, interp_filter); + stream>>>(height_scale, width_scale, idata, odata, interp_filter); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } diff --git a/aten/src/ATen/native/cuda/UpSampleLinear1d.cu b/aten/src/ATen/native/cuda/UpSampleLinear1d.cu index c23887cb79a6..af9edca2280e 100644 --- a/aten/src/ATen/native/cuda/UpSampleLinear1d.cu +++ b/aten/src/ATen/native/cuda/UpSampleLinear1d.cu @@ -1,15 +1,24 @@ // Adapted from interp.cpp from Caffe util by Pauline Luc // Originally developed by George Papandreou -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS 
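// The include changes repeated across these UpSample*/Unique*/WeightNorm files all
// follow the same recipe, sketched here for a hypothetical operator "foo" (the
// ATen/ops header names below are assumptions, since the per-operator includes are
// not spelled out in this excerpt): define TORCH_ASSERT_ONLY_METHOD_OPERATORS so
// the translation unit flags any accidental dependence on the aggregated operator
// headers in per-operator-header builds, then include either the monolithic
// Functions/NativeFunctions headers or the narrow per-operator ones depending on
// AT_PER_OPERATOR_HEADERS.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/foo.h>          // assumed per-operator header name
#include <ATen/ops/foo_native.h>   // assumed native-impl header name
#endif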
+#include #include #include -#include +#include #include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif + namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/cuda/UpSampleNearest1d.cu b/aten/src/ATen/native/cuda/UpSampleNearest1d.cu index 52b7b1d70947..decdfca30d78 100644 --- a/aten/src/ATen/native/cuda/UpSampleNearest1d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest1d.cu @@ -1,12 +1,23 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include -#include +#include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/cuda/UpSampleNearest2d.cu b/aten/src/ATen/native/cuda/UpSampleNearest2d.cu index 7b2a58c764bb..8aa4f68aeda6 100644 --- a/aten/src/ATen/native/cuda/UpSampleNearest2d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest2d.cu @@ -1,7 +1,8 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include -#include +#include #include #include #include @@ -10,6 +11,17 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#endif + namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/cuda/UpSampleNearest3d.cu b/aten/src/ATen/native/cuda/UpSampleNearest3d.cu index 3b12614c10d5..1a4afa012d78 100644 --- a/aten/src/ATen/native/cuda/UpSampleNearest3d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest3d.cu @@ -1,11 +1,28 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include + +#include #include #include -#include +#include #include #include #include -#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif namespace at { namespace native { @@ -322,7 +339,7 @@ using at::native::upsample_cuda::get_scale_value; Tensor upsample_nearest3d_cuda( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_d = get_scale_value(scale_factors, 0); @@ -333,7 +350,7 @@ Tensor upsample_nearest3d_cuda( Tensor _upsample_nearest_exact3d_cuda( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_d = get_scale_value(scale_factors, 0); @@ -345,7 +362,7 @@ Tensor _upsample_nearest_exact3d_cuda( // when structured kernels can handle QuantizedCPU, update these overloads to be CompositeExplicitAutograd Tensor upsample_nearest3d_backward_cuda( const Tensor& grad_output, - c10::optional output_size, + at::OptionalIntArrayRef output_size, IntArrayRef input_size, c10::optional> scale_factors) { auto osize = compute_output_size(input_size, output_size, scale_factors); @@ -357,7 +374,7 @@ Tensor upsample_nearest3d_backward_cuda( Tensor _upsample_nearest_exact3d_backward_cuda( const Tensor& grad_output, - c10::optional output_size, + at::OptionalIntArrayRef output_size, IntArrayRef input_size, c10::optional> scale_factors) { auto osize = compute_output_size(input_size, output_size, scale_factors); diff --git a/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu 
b/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu index a3623d2eb0f8..b19bf4858ac6 100644 --- a/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu +++ b/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu @@ -1,9 +1,10 @@ // Adapted from interp.cpp from Caffe util by Pauline Luc // Originally developed by George Papandreou -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include -#include +#include +#include #include #include #include @@ -12,6 +13,14 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif + namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/cuda/WeightNorm.cu b/aten/src/ATen/native/cuda/WeightNorm.cu index e9136ca61388..e25a1b40775d 100644 --- a/aten/src/ATen/native/cuda/WeightNorm.cu +++ b/aten/src/ATen/native/cuda/WeightNorm.cu @@ -1,11 +1,24 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include +#include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + + namespace at { namespace native { namespace { @@ -413,7 +426,7 @@ std::tuple weight_norm_cuda return std::tuple{w, norms}; } -std::tuple weight_norm_cuda_backward +std::tuple weight_norm_backward_cuda (const Tensor & grad_w, const Tensor & saved_v, const Tensor & saved_g, diff --git a/aten/src/ATen/native/cuda/attention.cu b/aten/src/ATen/native/cuda/attention.cu deleted file mode 100644 index 8dad56fac0e6..000000000000 --- a/aten/src/ATen/native/cuda/attention.cu +++ /dev/null @@ -1,253 +0,0 @@ -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include - -namespace at { - -namespace native { - -namespace { - -Tensor gemm_nt(const Tensor& a, const Tensor& b) { - auto a_ = a.view({a.size(0) * a.size(1), a.size(2)}); - auto b_ = b.transpose(1, 0); - auto c_ = at::native::matmul(a_, b_); - return c_.view({a.size(0), a.size(1), b.size(0)}); -} - -template -__global__ void transform_bias_rescale_qkv_kernel( - // [B, T, 3 * D] - const PackedTensorAccessor64 qkv, - // [3 * D] - const PackedTensorAccessor64 qkv_bias, - // [3, B, NH, T, DH] - PackedTensorAccessor64 q_k_v) { - // warp per DH. - // so launch B * NH * T warps. - auto NH = q_k_v.size(2); - auto T = q_k_v.size(3); - auto DH = q_k_v.size(4); - - auto t = blockIdx.x % T; - auto b = blockIdx.x / T; - - auto D = NH * DH; - constexpr int VEC = 4; - const scalar_t sqrt_dim_per_head = std::sqrt(static_cast(DH)); - using LoadT = memory::aligned_vector; - - // FIXME: assert ((D % VEC) == 0) - - for (int32_t d_v = threadIdx.x; d_v < D / VEC; d_v += blockDim.x) { - auto d = d_v * VEC; - auto nh = d / DH; - auto dh = d % DH; - scalar_t qkv_bias_q[VEC]; - scalar_t qkv_bias_k[VEC]; - scalar_t qkv_bias_v[VEC]; - scalar_t qkv_q[VEC]; - scalar_t qkv_k[VEC]; - scalar_t qkv_v[VEC]; - - *reinterpret_cast(&qkv_bias_q) = - *reinterpret_cast(&qkv_bias[d + 0 * D]); - *reinterpret_cast(&qkv_bias_k) = - *reinterpret_cast(&qkv_bias[d + 1 * D]); - *reinterpret_cast(&qkv_bias_v) = - *reinterpret_cast(&qkv_bias[d + 2 * D]); - - *reinterpret_cast(&qkv_q) = - *reinterpret_cast(&qkv[b][t][d + 0 * D]); - *reinterpret_cast(&qkv_k) = - *reinterpret_cast(&qkv[b][t][d + 1 * D]); - *reinterpret_cast(&qkv_v) = - *reinterpret_cast(&qkv[b][t][d + 2 * D]); - -#pragma unroll - // TODO: specialize for float2half2/half2float2? 
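// The loop below (part of the attention.cu file being deleted) adds the biases and
// divides q by sqrt(dim_per_head) in one pass. For reference, scaling q before the
// q.k dot product is equivalent to scaling the attention logit afterwards; a toy
// host-side check of that identity with arbitrary values:
#include <cassert>
#include <cmath>

int main() {
  const double q[2] = {1.0, 2.0}, k[2] = {3.0, 4.0};
  const double s = std::sqrt(2.0);  // sqrt(dim_per_head) with dim_per_head == 2
  double scaled_then_dot = (q[0] / s) * k[0] + (q[1] / s) * k[1];
  double dot_then_scaled = (q[0] * k[0] + q[1] * k[1]) / s;
  assert(std::abs(scaled_then_dot - dot_then_scaled) < 1e-12);
  return 0;
}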
- for (auto ii = 0; ii < VEC; ++ii) { - qkv_q[ii] = static_cast( - (static_cast(qkv_q[ii]) + - static_cast(qkv_bias_q[ii])) / - static_cast(sqrt_dim_per_head)); - qkv_k[ii] = static_cast( - (static_cast(qkv_k[ii]) + - static_cast(qkv_bias_k[ii]))); - qkv_v[ii] = static_cast( - (static_cast(qkv_v[ii]) + - static_cast(qkv_bias_v[ii]))); - } - *reinterpret_cast(&q_k_v[0][b][nh][t][dh]) = - *reinterpret_cast(&qkv_q); - *reinterpret_cast(&q_k_v[1][b][nh][t][dh]) = - *reinterpret_cast(&qkv_k); - *reinterpret_cast(&q_k_v[2][b][nh][t][dh]) = - *reinterpret_cast(&qkv_v); - } -} - -// compute q = (q + q_bias) / sqrt(dim_per_head), k = k + k_bias, v = v + v_bias -std::tuple transform_bias_rescale_qkv( - const Tensor& qkv, - const Tensor& qkv_bias, - const int64_t num_head) { - auto B = qkv.size(0); - auto T = qkv.size(1); - auto _3D = qkv.size(2); - auto D = _3D / 3; - TORCH_CHECK(D % num_head == 0); - const auto dim_per_head = D / num_head; - auto q_k_v = at::empty({3, B, num_head, T, dim_per_head}, qkv.options()); - AT_DISPATCH_FLOATING_TYPES_AND2( - ScalarType::Half, - ScalarType::BFloat16, - qkv.scalar_type(), - "transform_bias_rescale_qkv", - [&] { - using accscalar_t = acc_type; - auto threads = std::min(1024, D / 4); - auto blocks = B * T; - transform_bias_rescale_qkv_kernel - <<>>( - qkv.packed_accessor64(), - qkv_bias.packed_accessor64(), - q_k_v.packed_accessor64()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - }); - auto q_k_v_s = - at::native::split(q_k_v.view({3 * B, num_head, T, dim_per_head}), B, 0); - return std::make_tuple(q_k_v_s[0], q_k_v_s[1], q_k_v_s[2]); -} - -Tensor bmm_nt(const Tensor& a, const Tensor& b) { - auto a_ = a.view({a.size(0) * a.size(1), a.size(2), a.size(3)}); - auto b_ = b.view({b.size(0) * b.size(1), b.size(2), b.size(3)}); - auto bt_ = b_.transpose(2, 1); - // TODO: are these a single call to cublas batched matmul? - auto c_ = at::matmul(a_, bt_); - return c_.view({a.size(0), a.size(1), a.size(2), b.size(2)}); -} - -template -__inline__ __device__ T WarpReduceMax(T val) { -#pragma unroll - for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) { - val = std::max(val, WARP_SHFL_DOWN(val, offset)); - } - return val; -} - -template -__inline__ __device__ T WarpReduceSum(T val) { -#pragma unroll - for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) { - val += WARP_SHFL_DOWN(val, offset); - } - return val; -} - -void masked_softmax_dropout( - const Tensor& attn_scores, - const c10::optional& attn_mask) { - auto B = attn_scores.size(0); - auto num_heads = attn_scores.size(1); - auto T = attn_scores.size(2); - if (attn_mask) { - TORCH_CHECK(attn_mask->is_contiguous()); - } - AT_DISPATCH_FLOATING_TYPES_AND2( - ScalarType::Half, - ScalarType::BFloat16, - attn_scores.scalar_type(), - "masked_softmax_dropout", - [&] { - using accscalar_t = acc_type; - // TODO: proper implementation with masking. - dispatch_softmax_forward( - attn_scores.data_ptr(), - attn_scores.data_ptr(), - T, - T, - B * num_heads * T - ); - }); -} - -Tensor bmm_nn(const Tensor& a, const Tensor& b) { - auto a_ = a.view({a.size(0) * a.size(1), a.size(2), a.size(3)}); - auto b_ = b.view({b.size(0) * b.size(1), b.size(2), b.size(3)}); - // TODO: are these a single call to cublas batched matmul? - auto c_ = at::matmul(a_, b_); - return c_.view({a.size(0), a.size(1), a.size(2), b.size(3)}); -} - -Tensor transform_0213(const Tensor& a) { - // TODO: check perf vs dedicated kernel. 
- return a.permute({0, 2, 1, 3}) - .contiguous() - .view({a.size(0), a.size(2), a.size(1) * a.size(3)}); -} - -Tensor gemm_nt_bias(const Tensor& a, const Tensor& b, const Tensor& c) { - auto a_ = a.view({a.size(0) * a.size(1), a.size(2)}); - auto r_ = at::native::linear(a_, b, c); - return r_.view({a.size(0), a.size(1), r_.size(1)}); -} - -} // namespace - -Tensor multi_head_self_attention_cuda( - const Tensor& query, - const Tensor& qkv_weight, - const Tensor& qkv_bias, - const Tensor& proj_weight, - const Tensor& proj_bias, - const int64_t num_head, - const c10::optional& mask) { - // query shape: [B, T, D] - // qkv_weight shape: [3 * D, D] - - // shape: [B, T, 3 x D] - auto qkv = gemm_nt(query, qkv_weight); - - // shape: 3 x [B, num_head, T, dim_per_head] - auto q_k_v = transform_bias_rescale_qkv(qkv, qkv_bias, num_head); - auto q = std::get<0>(q_k_v); - auto k = std::get<1>(q_k_v); - auto v = std::get<2>(q_k_v); - - // shape: [B, num_head, T, T] - auto qkt = bmm_nt(q, k); - - // shape: [B, num_head, T, T] - masked_softmax_dropout(qkt, mask); - - // shape: [B, num_head, T, dim_per_head] - auto attn_ctx = bmm_nn(qkt, v); - - // shape: [B, T, D] - auto attn = transform_0213(attn_ctx); - - // shape: [B, T, D] - auto proj = gemm_nt_bias(attn, proj_weight, proj_bias); - - return proj; -} - -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/cuda/group_norm_kernel.cu b/aten/src/ATen/native/cuda/group_norm_kernel.cu index 8abbae013a59..53ce77fa37b1 100644 --- a/aten/src/ATen/native/cuda/group_norm_kernel.cu +++ b/aten/src/ATen/native/cuda/group_norm_kernel.cu @@ -1,13 +1,13 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include #include -#include +#include #include #include -#include #include #include #include @@ -15,6 +15,12 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + namespace at { namespace native { @@ -573,7 +579,7 @@ void GroupNormKernelImplInternal( cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream(); const int64_t num_threads = D * HxW < cuda_utils::kCUDABlockReduceNumThreads - ? C10_WARP_SIZE + ? at::cuda::warp_size() : cuda_utils::kCUDABlockReduceNumThreads; RowwiseMomentsCUDAKernel<<>>( D * HxW, eps, X_data, mean_data, rstd_data); @@ -694,7 +700,7 @@ void GroupNorm1dBackward( T_ACC* c2_data = c2.data_ptr(); T_ACC* c3_data = c3.data_ptr(); const int64_t num_threads = (C / G) < cuda_utils::kCUDABlockReduceNumThreads - ? C10_WARP_SIZE + ? at::cuda::warp_size() : cuda_utils::kCUDABlockReduceNumThreads; Compute1dBackwardFusedParamsCUDAKernel <<>>( @@ -841,8 +847,9 @@ void GroupNormBackwardKernelImplInternal( return; } + int warp_size = at::cuda::warp_size(); int64_t num_threads = HxW < cuda_utils::kCUDABlockReduceNumThreads - ? C10_WARP_SIZE + ? warp_size : cuda_utils::kCUDABlockReduceNumThreads; ComputeInternalGradientsCUDAKernel<<>>( HxW, dY_data, X_data, ds_data, db_data); @@ -868,7 +875,7 @@ void GroupNormBackwardKernelImplInternal( } num_threads = (C / G) < cuda_utils::kCUDABlockReduceNumThreads - ? C10_WARP_SIZE + ? 
warp_size : cuda_utils::kCUDABlockReduceNumThreads; ComputeBackwardFusedParamsCUDAKernel <<>>( diff --git a/aten/src/ATen/native/cuda/im2col.cuh b/aten/src/ATen/native/cuda/im2col.cuh index 9c692e1e6c9e..6398230e5d5a 100644 --- a/aten/src/ATen/native/cuda/im2col.cuh +++ b/aten/src/ATen/native/cuda/im2col.cuh @@ -1,9 +1,5 @@ #pragma once -#include -#include -#include - #include #include #include diff --git a/aten/src/ATen/native/cuda/jit_utils.cpp b/aten/src/ATen/native/cuda/jit_utils.cpp index c8010a6e9b0a..0b6dcd3787a4 100644 --- a/aten/src/ATen/native/cuda/jit_utils.cpp +++ b/aten/src/ATen/native/cuda/jit_utils.cpp @@ -1,3 +1,4 @@ +#define TORCH_ASSERT_NO_OPERATORS #include #include #include @@ -6,10 +7,10 @@ #include #include #include -#include #include #include #include +#include #include #include @@ -82,7 +83,7 @@ const std::string jit_common_types = R"ESCAPE( _(void, QInt32) /* 14 */ \ _(at::BFloat16, BFloat16) /* 15 */ \ - #define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(_) \ + #define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_QINT(_) \ _(uint8_t, Byte) \ _(int8_t, Char) \ _(int16_t, Short) \ @@ -91,6 +92,7 @@ const std::string jit_common_types = R"ESCAPE( _(at::Half, Half) \ _(float, Float) \ _(double, Double) \ + _(std::complex, ComplexHalf) \ _(std::complex, ComplexFloat) \ _(std::complex, ComplexDouble) \ _(bool, Bool) \ @@ -118,11 +120,17 @@ const std::string jit_common_types = R"ESCAPE( Array() = default; Array(const Array&) = default; Array& operator=(const Array&) = default; + __device__ Array(T x) { + for (int i = 0; i < size; i++) { + data[i] = x; + } + } }; ${half_string} ${bfloat16_string} ${complex_body_string} + ${complex_half_body_string} ${complex_math_string} @@ -249,6 +257,29 @@ const std::string dynamic_cast_support_literal = R"ESCAPE( } }; + template <> + struct static_cast_with_inter_type, at::BFloat16> { + static inline std::complex apply(at::BFloat16 src) { + return static_cast>(float{src}); + } + }; + + template <> + struct static_cast_with_inter_type, at::Half> { + static inline std::complex apply(at::Half src) { + return static_cast>(float{src}); + } + }; + + template <> + struct static_cast_with_inter_type< + std::complex, + std::complex> { + static inline std::complex apply(std::complex src) { + return static_cast>(static_cast>(src)); + } + }; + // Fetch a value with dynamic type src_type from ptr, and cast it to static type dest_t. 
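// A self-contained analogue of the fetch_and_cast helper defined just below: a
// runtime type tag selects how the raw bytes are read, and the loaded value is
// then cast to the statically known destination type. The enum and the three
// cases are illustrative; the generated code switches over the full ScalarType
// list, now including std::complex<at::Half>.
#include <cstdint>

enum class Tag : uint8_t { Float, Double, Long };

template <typename dest_t>
dest_t fetch_and_cast_sketch(Tag src_type, const void* ptr) {
  switch (src_type) {
    case Tag::Float:  return static_cast<dest_t>(*static_cast<const float*>(ptr));
    case Tag::Double: return static_cast<dest_t>(*static_cast<const double*>(ptr));
    case Tag::Long:   return static_cast<dest_t>(*static_cast<const int64_t*>(ptr));
  }
  return dest_t{};
}

// e.g.  double d = 2.5;  float f = fetch_and_cast_sketch<float>(Tag::Double, &d);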
#define FETCH_AND_CAST_CASE(type, scalartype) \ case ScalarType::scalartype: \ @@ -256,7 +287,7 @@ const std::string dynamic_cast_support_literal = R"ESCAPE( template __device__ inline dest_t fetch_and_cast(const ScalarType src_type, const void *ptr) { switch (src_type) { - AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(FETCH_AND_CAST_CASE) + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_QINT(FETCH_AND_CAST_CASE) default: ERROR_UNSUPPORTED_CAST } @@ -271,7 +302,7 @@ const std::string dynamic_cast_support_literal = R"ESCAPE( template __device__ inline void cast_and_store(const ScalarType dest_type, void *ptr, src_t value) { switch (dest_type) { - AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(CAST_AND_STORE_CASE) + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_QINT(CAST_AND_STORE_CASE) default:; } ERROR_UNSUPPORTED_CAST @@ -322,10 +353,7 @@ const std::string no_dynamic_cast_support_literal = R"ESCAPE( )ESCAPE"; -const std::string jit_code_template = R"ESCAPE( - - ${dynamic_casting_string} - +const std::string offset_calc_template = R"ESCAPE( template struct DivMod { T div; @@ -409,6 +437,14 @@ const std::string jit_code_template = R"ESCAPE( ${index_type} strides_[25][NARGS]; }; + +)ESCAPE"; + +const std::string jit_code_template = R"ESCAPE( + + ${dynamic_casting_string} + + ${functor} // TODO: setup grid-stride loop @@ -709,7 +745,10 @@ std::string generate_code( functor_args << "arg0[j], scalar_val"; } env.s("args", functor_args.str()); - if (f_inputs_type == "at::Half" || result_type == "at::Half" || dynamic_casting) { + if (f_inputs_type == "at::Half" || result_type == "at::Half" || + f_inputs_type == "std::complex" || + result_type == "std::complex" || dynamic_casting) { + // complex depends on complex and Half dtypes. env.s("half_string", jiterator_half_support_literal); } else { env.s("half_string", ""); @@ -722,7 +761,9 @@ std::string generate_code( // the definition of complex math functions is only needed when the compute type is complex // but the definition of std::complex is needed for dynamic casting even if the compute type is not complex if (f_inputs_type == "std::complex" || result_type == "std::complex" || - f_inputs_type == "std::complex" || result_type == "std::complex") { + f_inputs_type == "std::complex" || result_type == "std::complex" || + f_inputs_type == "std::complex" || result_type == "std::complex") { + // complex depends on complex and Half dtypes. env.s("traits_string", get_traits_string()); env.s("complex_body_string", get_complex_body_string()); env.s("complex_math_string", get_complex_math_string()); @@ -735,6 +776,15 @@ std::string generate_code( env.s("complex_body_string", ""); env.s("complex_math_string", ""); } + if (f_inputs_type == "std::complex" || + result_type == "std::complex" || dynamic_casting) { + // dynamic_casting requires the definition of all types + // include complex + // Look at the definition of `StoreWithCast` and `LoadWithCast`. 
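// --- Editor's illustration (not part of the patch) ---
// complex<Half> support is only pulled in here because dynamic casting may
// have to load or store ComplexHalf values; the casts themselves never do
// half-precision complex arithmetic. As in the static_cast_with_inter_type
// specializations earlier in this literal, the value is widened to float,
// wrapped in complex<float>, and only then narrowed per component. A host-side
// analogue (to_complex_half is a hypothetical helper; it assumes <complex>
// and the ATen Half header are available and leans on the generic
// std::complex template):
static inline std::complex<at::Half> to_complex_half(at::Half src) {
  const std::complex<float> widened(static_cast<float>(src), 0.0f);  // widen first
  return std::complex<at::Half>(at::Half(widened.real()),            // narrow per component
                                at::Half(widened.imag()));
}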
+ env.s("complex_half_body_string", get_complex_half_body_string()); + } else { + env.s("complex_half_body_string", ""); + } if (!vectorized) { if (!dynamic_casting) { @@ -769,7 +819,7 @@ std::string generate_code( << ">(out[j], data[0], output_offsets[0]);\n"; env.s("store_outputs", store_outputs.str()); - static auto cuda_template = at::jit::CodeTemplate(jit_common_types + jit_code_template); + static auto cuda_template = at::jit::CodeTemplate(jit_common_types + offset_calc_template + jit_code_template); const auto code = cuda_template.format(env); return code; } @@ -808,6 +858,134 @@ std::string generate_code( return code; } +// Creates directories recursively +bool _r_mkdir(const std::string& dir) { + // Check if current dir exists + const char* p_dir = dir.c_str(); + const bool dir_exists = (access(p_dir, F_OK) == 0); + if (dir_exists) { + return true; + } + + // Try to create current directory +#ifdef _WIN32 + int ret = _mkdir(dir.c_str()); +#else + int ret = mkdir(dir.c_str(), S_IRWXU | S_IRWXG | S_IRWXO); +#endif + // Success + if (ret == 0) { + return true; + } + + // Find folder separator and check if we are at the top + auto pos = dir.find_last_of("/\\"); + if (pos == std::string::npos) { + return false; + } + + // Try to create parent directory + if (!(_r_mkdir(dir.substr(0, pos)))) { + return false; + } + + // Try to create complete path again +#ifdef _WIN32 + ret = _mkdir(dir.c_str()); +#else + ret = mkdir(dir.c_str(), S_IRWXU | S_IRWXG | S_IRWXO); +#endif + return ret == 0; +} + +// Creates directories recursively assuming that base exists +bool r_mkdir_with_base(std::string& base, std::string& dir){ + const char* p_base = base.c_str(); + const bool base_exists = (access(p_base, F_OK) == 0); + if (!base_exists) { + return false; + } + + // remove trailing '/' or '\\' + if ((base[base.size()-1]=='/') || base[base.size()-1]=='\\') { + base.pop_back(); + } + if ((dir[dir.size()-1]=='/') || dir[dir.size()-1]=='\\') { + dir.pop_back(); + } + + return _r_mkdir(base+dir); + +} + +std::string load_code_template(const std::string& path) { + std::ifstream ifs{path}; + std::string s{ + std::istreambuf_iterator(ifs), + std::istreambuf_iterator()}; + return s; +} + +std::string generate_reduction_code( + int nOutputs, + const std::string& func, + const std::string& name, + const int vt0, + const std::string& f_inputs_type, + const std::string& reduction_accum_type, + const std::string& result_type, + bool contiguous, + bool vectorized, + int vec_size, + int max_threads_codegen) { + at::jit::TemplateEnv env; + env.s("index_type", "unsigned int"); + env.s("scalar_type", f_inputs_type); + env.s("result_type", result_type); + env.s("reduction_accum_type", reduction_accum_type); + env.s("vt0", std::to_string(vt0)); + env.s("name", name); + env.s("max_threads_lb", std::to_string(max_threads_codegen)); + // reductions don't support dynamic casting, so the only way to get nonstandard types + // is through input + if (f_inputs_type == "at::Half" || f_inputs_type == "std::complex") { + // complex depends on complex and Half dtypes. + env.s("half_string", jiterator_half_support_literal); + } else { + env.s("half_string", ""); + } + if (f_inputs_type == "at::BFloat16") { + env.s("bfloat16_string", jiterator_bfloat16_support_literal); + } else { + env.s("bfloat16_string", ""); + } + if (f_inputs_type == "std::complex" || + f_inputs_type == "std::complex" || + f_inputs_type == "std::complex" ) { + // complex depends on complex and Half dtypes. 
+ env.s("traits_string", get_traits_string()); + env.s("complex_body_string", get_complex_body_string()); + env.s("complex_math_string", get_complex_math_string()); + env.s("complex", std::to_string(1)); + } else { + env.s("traits_string", ""); + env.s("complex_body_string", ""); + env.s("complex_math_string", ""); + env.s("complex", std::to_string(0)); + } + if (f_inputs_type == "std::complex") { + env.s("complex_half_body_string", get_complex_half_body_string()); + } else { + env.s("complex_half_body_string", ""); + } + env.s("cmath_string", get_cmath_string()); + env.s("functor", func); + env.s("output_vec_size", std::to_string(vec_size)); + static auto cuda_template = at::jit::CodeTemplate( + jit_common_types + offset_calc_template + get_reduction_template()); + const auto code = cuda_template.format(env); + return code; +} // Acquires (possibly creating) the kernel cache directory c10::optional get_cache_dir() { @@ -822,6 +1000,8 @@ c10::optional get_cache_dir() { // Cache path comes from PYTORCH_KERNEL_CACHE_PATH, then TEMP (Windows) or XDG_CACHE_HOME (Linux), then HOME environment variables std::string cache_dir; char* ptkcp = std::getenv("PYTORCH_KERNEL_CACHE_PATH"); + // Create kernel_cache_dir if needed as we do not want to create the base directory passed by the user + std::string kernels_cache_dir = ""; if (ptkcp != nullptr) { cache_dir = std::string(ptkcp); } else { @@ -832,7 +1012,8 @@ c10::optional get_cache_dir() { ptkcp = std::getenv("XDG_CACHE_HOME"); #endif if (ptkcp != nullptr) { - cache_dir = std::string(ptkcp) + "/torch/kernels"; + kernels_cache_dir = "/torch/kernels"; + cache_dir = std::string(ptkcp) + kernels_cache_dir; } else { // Falls back to HOME/.cache ptkcp = std::getenv("HOME"); @@ -841,7 +1022,8 @@ c10::optional get_cache_dir() { " This disables kernel caching."); return {}; } else { - cache_dir = std::string(ptkcp) + "/.cache/torch/kernels"; + kernels_cache_dir = "/.cache/torch/kernels"; + cache_dir = std::string(ptkcp) + kernels_cache_dir; } } } @@ -850,11 +1032,8 @@ c10::optional get_cache_dir() { const char* p_cache_dir = cache_dir.c_str(); const bool cache_dir_exists = (access(p_cache_dir, F_OK) == 0); if (!cache_dir_exists) { -#ifdef _WIN32 - if (_mkdir(p_cache_dir) != 0) { -#else - if (mkdir(p_cache_dir, S_IRWXU | S_IRWXG | S_IRWXO) != 0) { -#endif + std::string s_ptkcp = std::string(ptkcp); + if (!r_mkdir_with_base(s_ptkcp, kernels_cache_dir)) { TORCH_WARN_ONCE("Specified kernel cache directory could not be created! 
This disables kernel caching.", " Specified directory is ", cache_dir, ".", " This warning will appear only once per process."); @@ -886,9 +1065,7 @@ c10::optional get_cache_dir() { NvrtcFunction jit_pwise_function( const std::string& code, const std::string& kernel_name) { - initializeCudaContext(); - // Acquires CUDA and nvrtc versions and whether we're compiling to ptx or SASS const cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); int cuda_major = 0, cuda_minor = 0, nvrtc_major = 0, nvrtc_minor = 0; @@ -983,7 +1160,7 @@ NvrtcFunction jit_pwise_function( AT_CUDA_NVRTC_CHECK(nvrtc.nvrtcGetProgramLog(program, log.data())); std::stringstream cu; cu << log.data(); - throw std::runtime_error(cu.str() + code); + throw std::runtime_error(code + cu.str()); } size_t ptx_size = 0; @@ -1049,24 +1226,26 @@ NvrtcFunction jit_pwise_function( void launch_jitted_pwise_function( NvrtcFunction function, void* args[], - const int nBlocks, - const int kBlockSize) { + const dim3 nBlocks, + const dim3 kBlockSize, + const int smem) { initializeCudaContext(); const auto& nvrtc = at::globalContext().getNVRTC(); // Launches kernel on current stream auto stream = at::cuda::getCurrentCUDAStream(); AT_CUDA_DRIVER_CHECK(nvrtc.cuLaunchKernel( function.function, - nBlocks, - 1, - 1, - kBlockSize, - 1, - 1, - 0, + nBlocks.x, + nBlocks.y, + nBlocks.z, + kBlockSize.x, + kBlockSize.y, + kBlockSize.z, + smem, stream, args, nullptr)); } + }}} // at::cuda::jit diff --git a/aten/src/ATen/native/cuda/jit_utils.h b/aten/src/ATen/native/cuda/jit_utils.h index 908ffabfea2f..2af015bbb7fe 100644 --- a/aten/src/ATen/native/cuda/jit_utils.h +++ b/aten/src/ATen/native/cuda/jit_utils.h @@ -8,6 +8,7 @@ #include #include #include +#include namespace at { namespace cuda { namespace jit { @@ -32,6 +33,19 @@ std::string generate_code( bool vectorized=false, int vec_size=0); +std::string generate_reduction_code( + int nOutputs, + const std::string& func, + const std::string& name, + const int vt0, + const std::string& f_inputs_type, + const std::string& reduction_accum_type, + const std::string& result_type, + bool contiguous, + bool vectorized, + int vec_size, + int max_threads_codegen); + NvrtcFunction jit_pwise_function( const std::string& code, const std::string& kernel_name); @@ -39,8 +53,9 @@ NvrtcFunction jit_pwise_function( void launch_jitted_pwise_function( NvrtcFunction function, void* args[], - const int nBlocks, - const int kBlockSize); + const dim3 nBlocks, + const dim3 kBlockSize, + const int smem=0); template struct delayed_false : std::false_type { @@ -53,7 +68,7 @@ struct delayed_false : std::false_type { template inline std::string typeName() { // we can't use static_assert(false) directly as the - // program will be not compile even if the template is not + // program will be not compiled even if the template is not // instantiated, so we use `delayed_false` // to make sure compiler doesn't eagerly raise // fail this assertion. 
@@ -71,16 +86,18 @@ AT_FORALL_SCALAR_TYPES(TYPE_NAME_FN) // JIT uses std::complex directly, because nvRTC compile programs // with -default-device, so there is no such issue like: // "std::sin(complex) is __host__ only" +template <> inline std::string typeName(){ + return "bool"; +} +template <> inline std::string typeName>(){ + return "std::complex"; +} template <> inline std::string typeName>(){ return "std::complex"; } template <> inline std::string typeName>(){ return "std::complex"; } -template <> inline std::string typeName>(){ - TORCH_INTERNAL_ASSERT(false, "torch.complex32 is not supported"); - return "std::complex"; -} template <> inline std::string typeName(){ return "at::Half"; } @@ -88,4 +105,20 @@ template <> inline std::string typeName(){ return "at::BFloat16"; } +#define TYPE_NAME_CASE(ctype, scalartype) \ + case ScalarType::scalartype: return std::string(#ctype); +inline std::string typeName(ScalarType t) { + switch (t) { + AT_FORALL_SCALAR_TYPES(TYPE_NAME_CASE) + case ScalarType::Bool : return "bool"; + case ScalarType::Half : return "at::Half"; + case ScalarType::BFloat16 : return "at::BFloat16"; + case ScalarType::ComplexFloat : return "std::complex"; + case ScalarType::ComplexDouble : return "std::complex"; + default: + TORCH_CHECK(false, "invalid type for jiterator"); + } +} +#undef TYPE_NAME_CASE + }}} // namespace at::cuda::jit diff --git a/aten/src/ATen/native/cuda/layer_norm_kernel.cu b/aten/src/ATen/native/cuda/layer_norm_kernel.cu index 940ff7d06819..faa0fd2d4b98 100644 --- a/aten/src/ATen/native/cuda/layer_norm_kernel.cu +++ b/aten/src/ATen/native/cuda/layer_norm_kernel.cu @@ -1,16 +1,28 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include #include -#include +#include #include #include -#include #include #include #include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#endif #include @@ -636,8 +648,8 @@ void launch_vectorized_layer_norm_kernel( ) { //constexpr int alignment = 16; //currently unused to make sure float and half results are bw accurate auto stream = at::cuda::getCurrentCUDAStream().stream(); - const int num_threads = 128; - const dim3 threads(C10_WARP_SIZE,num_threads/C10_WARP_SIZE,1); + const int warp_size = at::cuda::warp_size(); + const dim3 threads(warp_size, num_threads() / warp_size, 1); const dim3 blocks(M); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(threads.y % 2 == 0 || threads.y == 1); int nshared = threads.y > 1 ? threads.y * 3/2 *sizeof(T_ACC) : 0; @@ -739,10 +751,10 @@ void LayerNormBackwardKernelImplInternal( T* dX_data = dX->defined() ? 
dX->template data_ptr() : nullptr; cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream(); if (dX_data != nullptr) { - const int num_threads = 128; + const int warp_size = at::cuda::warp_size(); const dim3 blocks(M); - int nshared = (num_threads/C10_WARP_SIZE) * sizeof(T_ACC); - layer_norm_grad_input_kernel<<>>(dY_data, + int nshared = (num_threads()/warp_size) * sizeof(T_ACC); + layer_norm_grad_input_kernel<<>>(dY_data, X_data, mean_data, rstd_data, gamma_data, dX_data, N); C10_CUDA_KERNEL_LAUNCH_CHECK(); } @@ -933,6 +945,7 @@ std::tuple layer_norm_backward_cuda( return std::make_tuple(std::move(dX), std::move(dgamma), std::move(dbeta)); } +REGISTER_DISPATCH(LayerNormKernel, &LayerNormKernelImpl); } // namespace native } // namespace at diff --git a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp index 9910859d8b86..7eee90a1b227 100644 --- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp @@ -1,7 +1,9 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include #include #include -#include #include #include @@ -9,12 +11,30 @@ #include #include -#include #include #include #include +#include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + #if AT_MAGMA_ENABLED() #include #include @@ -25,11 +45,22 @@ const bool use_magma_ = true; namespace { struct MagmaInitializer { MagmaInitializer() { +#if defined(BUILD_LAZY_CUDA_LINALG) + magma_init(); +#else ::at::cuda::detail::set_magma_init_fn([]{ magma_init(); }); - }; +#endif + } } initializer; } // namespace (anonymous) +#define AT_MAGMA_VERSION MAGMA_VERSION_MAJOR*100 + MAGMA_VERSION_MINOR*10 + MAGMA_VERSION_MICRO + +// Check that MAGMA never releases MAGMA_VERSION_MINOR >= 10 or MAGMA_VERSION_MICRO >= 10 +#if MAGMA_VERSION_MINOR >= 10 || MAGMA_VERSION_MICRO >= 10 +#error "MAGMA release minor or micro version >= 10, please correct AT_MAGMA_VERSION" +#endif + #else const bool use_magma_ = false; @@ -37,18 +68,28 @@ const bool use_magma_ = false; namespace at { namespace native { +#if defined(BUILD_LAZY_CUDA_LINALG) +// All registrations with PyTorch runtime should be done dynamically +// so if library is lazy loaded it must not export anything, otherwise +// it can result in symbol clashes +namespace lazy_linalg { +#endif #if AT_MAGMA_ENABLED() -template -void magmaSolve( - magma_int_t n, magma_int_t nrhs, scalar_t* dA, magma_int_t ldda, - magma_int_t* ipiv, scalar_t* dB, magma_int_t lddb, magma_int_t* info); -template -void magmaSolveBatched( - magma_int_t n, magma_int_t nrhs, scalar_t** dA_array, magma_int_t ldda, - magma_int_t** dipiv_array, scalar_t** dB_array, magma_int_t lddb, - magma_int_t* dinfo_array, magma_int_t batch_count, const MAGMAQueue& magma_queue); +template +void magmaLdlHermitian( + magma_uplo_t uplo, + magma_int_t n, + scalar_t* dA, + magma_int_t ldda, + magma_int_t* ipiv, + magma_int_t* info) { + TORCH_CHECK( + false, + "LDL decomposition is not available.", + "Please rebuild with MAGMA 2.5.4+."); +} template void magmaLu( @@ -163,85 +204,63 @@ void magmaGels( scalar_t* dA, magma_int_t ldda, scalar_t* dB, magma_int_t lddb, scalar_t* hwork, magma_int_t lwork, magma_int_t* info); -template<> -void magmaSolve( - magma_int_t n, magma_int_t nrhs, double* dA, magma_int_t ldda, - magma_int_t* ipiv, double* dB, magma_int_t lddb, 
magma_int_t* info) { - MagmaStreamSyncGuard guard; - magma_dgesv_gpu(n, nrhs, dA, ldda, ipiv, dB, lddb, info); - AT_CUDA_CHECK(cudaGetLastError()); -} +#if AT_MAGMA_VERSION >= 254 -template<> -void magmaSolve( - magma_int_t n, magma_int_t nrhs, float* dA, magma_int_t ldda, - magma_int_t* ipiv, float* dB, magma_int_t lddb, magma_int_t* info) { +template <> +void magmaLdlHermitian( + magma_uplo_t uplo, + magma_int_t n, + double* dA, + magma_int_t ldda, + magma_int_t* ipiv, + magma_int_t* info) { MagmaStreamSyncGuard guard; - magma_sgesv_gpu(n, nrhs, dA, ldda, ipiv, dB, lddb, info); + magma_dsytrf_gpu(uplo, n, dA, ldda, ipiv, info); AT_CUDA_CHECK(cudaGetLastError()); } -template<> -void magmaSolve>( - magma_int_t n, magma_int_t nrhs, c10::complex* dA, magma_int_t ldda, - magma_int_t* ipiv, c10::complex* dB, magma_int_t lddb, magma_int_t* info) { +template <> +void magmaLdlHermitian( + magma_uplo_t uplo, + magma_int_t n, + float* dA, + magma_int_t ldda, + magma_int_t* ipiv, + magma_int_t* info) { MagmaStreamSyncGuard guard; - magma_zgesv_gpu(n, nrhs, - reinterpret_cast(dA), ldda, ipiv, - reinterpret_cast(dB), lddb, info); + magma_ssytrf_gpu(uplo, n, dA, ldda, ipiv, info); AT_CUDA_CHECK(cudaGetLastError()); } -template<> -void magmaSolve>( - magma_int_t n, magma_int_t nrhs, c10::complex* dA, magma_int_t ldda, - magma_int_t* ipiv, c10::complex* dB, magma_int_t lddb, magma_int_t* info) { +template <> +void magmaLdlHermitian>( + magma_uplo_t uplo, + magma_int_t n, + c10::complex* dA, + magma_int_t ldda, + magma_int_t* ipiv, + magma_int_t* info) { MagmaStreamSyncGuard guard; - magma_cgesv_gpu(n, nrhs, - reinterpret_cast(dA), ldda, ipiv, - reinterpret_cast(dB), lddb, info); - AT_CUDA_CHECK(cudaGetLastError()); -} - -template<> -void magmaSolveBatched( - magma_int_t n, magma_int_t nrhs, double** dA_array, magma_int_t ldda, - magma_int_t** dipiv_array, double** dB_array, magma_int_t lddb, - magma_int_t* dinfo_array, magma_int_t batch_count, const MAGMAQueue& magma_queue) { - magma_dgesv_batched(n, nrhs, dA_array, ldda, dipiv_array, dB_array, lddb, dinfo_array, batch_count, magma_queue.get_queue()); - AT_CUDA_CHECK(cudaGetLastError()); -} - -template<> -void magmaSolveBatched( - magma_int_t n, magma_int_t nrhs, float** dA_array, magma_int_t ldda, - magma_int_t** dipiv_array, float** dB_array, magma_int_t lddb, - magma_int_t* dinfo_array, magma_int_t batch_count, const MAGMAQueue& magma_queue) { - magma_sgesv_batched(n, nrhs, dA_array, ldda, dipiv_array, dB_array, lddb, dinfo_array, batch_count, magma_queue.get_queue()); + magma_zhetrf_gpu( + uplo, n, reinterpret_cast(dA), ldda, ipiv, info); AT_CUDA_CHECK(cudaGetLastError()); } -template<> -void magmaSolveBatched>( - magma_int_t n, magma_int_t nrhs, c10::complex** dA_array, magma_int_t ldda, - magma_int_t** dipiv_array, c10::complex** dB_array, magma_int_t lddb, - magma_int_t* dinfo_array, magma_int_t batch_count, const MAGMAQueue& magma_queue) { - magma_zgesv_batched(n, nrhs, - reinterpret_cast(dA_array), ldda, dipiv_array, - reinterpret_cast(dB_array), lddb, dinfo_array, batch_count, magma_queue.get_queue()); +template <> +void magmaLdlHermitian>( + magma_uplo_t uplo, + magma_int_t n, + c10::complex* dA, + magma_int_t ldda, + magma_int_t* ipiv, + magma_int_t* info) { + MagmaStreamSyncGuard guard; + magma_chetrf_gpu( + uplo, n, reinterpret_cast(dA), ldda, ipiv, info); AT_CUDA_CHECK(cudaGetLastError()); } -template<> -void magmaSolveBatched>( - magma_int_t n, magma_int_t nrhs, c10::complex** dA_array, magma_int_t ldda, - magma_int_t** dipiv_array, 
c10::complex** dB_array, magma_int_t lddb, - magma_int_t* dinfo_array, magma_int_t batch_count, const MAGMAQueue& magma_queue) { - magma_cgesv_batched(n, nrhs, - reinterpret_cast(dA_array), ldda, dipiv_array, - reinterpret_cast(dB_array), lddb, dinfo_array, batch_count, magma_queue.get_queue()); - AT_CUDA_CHECK(cudaGetLastError()); -} +#endif // AT_MAGMA_VERSION >= 254 template<> void magmaLu( @@ -1249,95 +1268,127 @@ magma_trans_t to_magma(TransposeType trans) { auto storage_##name = pin_memory(size); \ name = static_cast(storage_##name.data()); -// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ solve ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +namespace { template -static void apply_solve(Tensor& b, Tensor& A, Tensor& infos_out) { +void apply_ldl_factor_magma( + const Tensor& A, + const Tensor& pivots, + const Tensor& info, + bool upper) { #if !AT_MAGMA_ENABLED() -AT_ERROR("solve: MAGMA library not found in " - "compilation. Please rebuild with MAGMA."); + TORCH_CHECK( + false, + "torch.linalg.ldl_factor: MAGMA library not found in " + "compilation. Please rebuild with MAGMA."); #else - auto A_data = A.data_ptr(); - auto b_data = b.data_ptr(); + auto batch_size = batchCount(A); magma_int_t n = magma_int_cast(A.size(-2), "A.size(-2)"); - magma_int_t nrhs = magma_int_cast(b.size(-1), "b.size(-1)"); - magma_int_t lda = std::max(magma_int_t{1}, n); - - if (b.dim() == 2) { - auto ipiv = at::empty({n}, at::kInt); - // magmaSolve requires infos tensor to live on CPU - Tensor infos = at::empty(infos_out.sizes(), infos_out.options().device(kCPU)); - magmaSolve(n, nrhs, A_data, lda, ipiv.data_ptr(), - b_data, lda, infos.data_ptr()); - infos_out.copy_(infos); - } else { - auto infos_data = infos_out.data_ptr(); - auto A_mat_stride = matrixStride(A); - auto b_mat_stride = matrixStride(b); - magma_int_t batch_size = magma_int_cast(batchCount(A), "batchCount"); - - magma_int_t* ipiv_data; - magma_int_t** ipiv_array; - scalar_t** A_array; - scalar_t** b_array; - - ALLOCATE_ARRAY(ipiv_data, magma_int_t, batch_size * n); - ALLOCATE_ARRAY(ipiv_array, magma_int_t*, batch_size); - ALLOCATE_ARRAY(A_array, scalar_t*, batch_size); - ALLOCATE_ARRAY(b_array, scalar_t*, batch_size); - - // Set up the created arrays - for (int64_t i = 0; i < batch_size; i++) { - A_array[i] = &A_data[i * A_mat_stride]; - b_array[i] = &b_data[i * b_mat_stride]; - ipiv_array[i] = &ipiv_data[i * n]; - } - - MAGMAQueue magma_queue(b.get_device()); + magma_int_t leading_dim = magma_int_cast(A.stride(-1), "A.stride(-1)"); + magma_uplo_t uplo = upper ? MagmaUpper : MagmaLower; - constexpr int64_t batch_limit = 65535; - // Compute as many batches of 65535 possible - // The number of "mini"-batches are floor(batch_size / batch_limit) - // and these cover floor(batch_size / batch_limit) * batch_limit matrix solves - int64_t mini_batches = batch_size / batch_limit, mini_idx; - for (mini_idx = 0; mini_idx < mini_batches * batch_limit; mini_idx += batch_limit) { - scalar_t** A_array_cur = &A_array[mini_idx]; - scalar_t** b_array_cur = &b_array[mini_idx]; - magma_int_t** ipiv_array_cur = &ipiv_array[mini_idx]; - magma_int_t* info_array_cur = &infos_data[mini_idx]; + auto a_stride = A.dim() > 2 ? A.stride(-3) : 0; + auto pivots_stride = pivots.dim() > 1 ? 
pivots.stride(-2) : 0; - magmaSolveBatched( - n, nrhs, A_array_cur, lda, ipiv_array_cur, b_array_cur, lda, - info_array_cur, batch_limit, magma_queue); - } + auto a_data = A.data_ptr(); + Tensor pivots_cpu = + at::empty_like(pivots, pivots.options().device(kCPU).pinned_memory(true)); + auto pivots_data = pivots_cpu.data_ptr(); + Tensor info_cpu = + at::empty_like(info, info.options().device(kCPU).pinned_memory(true)); + auto info_data = info_cpu.data_ptr(); + + for (const auto i : c10::irange(batch_size)) { + scalar_t* a_working_ptr = &a_data[i * a_stride]; + magma_int_t* pivots_working_ptr = &pivots_data[i * pivots_stride]; + magma_int_t* info_working_ptr = &info_data[i]; + magmaLdlHermitian( + uplo, + n, + a_working_ptr, + leading_dim, + pivots_working_ptr, + info_working_ptr); + } + pivots.copy_(pivots_cpu); + info.copy_(info_cpu); +#endif +} - // Compute whatever is left = batch_size - floor(batch_size / batch_limit) * batch_limit - // which concisely is equal to batch_size % batch_limit - if (batch_size % batch_limit != 0) { - magmaSolveBatched( - n, nrhs, &A_array[mini_idx], lda, &ipiv_array[mini_idx], &b_array[mini_idx], lda, - &infos_data[mini_idx], batch_size % batch_limit, magma_queue); - } +void ldl_factor_magma( + const Tensor& LD, + const Tensor& pivots, + const Tensor& info, + bool upper, + bool hermitian) { + if (LD.is_complex()) { + TORCH_CHECK( + hermitian, + "torch.linalg.ldl_factor: complex tensors with hermitian=False flag are not supported with MAGMA backend. ", + "Currently preferred backend is ", + at::globalContext().linalgPreferredBackend(), + ", please set 'default' or 'cusolver' backend with torch.backends.cuda.preferred_linalg_library"); } + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES( + LD.scalar_type(), "ldl_factor_magma", [&] { + apply_ldl_factor_magma(LD, pivots, info, upper); + }); +} + +void ldl_factor_kernel( + const Tensor& LD, + const Tensor& pivots, + const Tensor& info, + bool upper, + bool hermitian) { + auto preferred_backend = at::globalContext().linalgPreferredBackend(); + switch (preferred_backend) { + case at::LinalgBackend::Cusolver: + return ldl_factor_cusolver( + LD, pivots, info, upper, hermitian); + case at::LinalgBackend::Magma: + return ldl_factor_magma(LD, pivots, info, upper, hermitian); + default: + // By default use cusolver if available and magma otherwise. 
+ // If cusolver and magma 2.5.4+ are both available and hermitian=true, + // call magma for complex inputs +#ifdef USE_CUSOLVER +#if AT_MAGMA_ENABLED() && (AT_MAGMA_VERSION >= 254) + if (LD.is_complex() && hermitian) { + return ldl_factor_magma( + LD, pivots, info, upper, hermitian); + } +#endif + return ldl_factor_cusolver( + LD, pivots, info, upper, hermitian); +#else + return ldl_factor_magma(LD, pivots, info, upper, hermitian); #endif + } } -std::tuple _solve_helper_cuda(const Tensor& self, const Tensor& A) { - auto self_working_copy = cloneBatchedColumnMajor(self); - auto A_working_copy = cloneBatchedColumnMajor(A); - // infos might not get filled for empty inputs therefore at::zeros is used instead of at::empty - auto infos = at::zeros({std::max(1, batchCount(self))}, self.options().dtype(kInt)); - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "solve_cuda", [&]{ - apply_solve(self_working_copy, A_working_copy, infos); - }); - if (self.dim() > 2) { - batchCheckErrors(infos, "solve_cuda"); - } else { - singleCheckErrors(infos.item().toInt(), "solve_cuda"); +void ldl_solve_kernel( + const Tensor& LD, + const Tensor& pivots, + const Tensor& B, + bool upper, + bool hermitian) { + // TODO: It should be possible to add the MAGMA backend for this function when using MAGMA 2.6.0 + // https://bitbucket.org/icl/magma/src/c703d112dcf19eb8c73676cef10888aa2ef73457/ReleaseNotes#lines-48 + if (LD.is_complex()) { + TORCH_CHECK( + !hermitian, + "torch.linalg.ldl_solve: complex tensors with hermitian=True flag are not supported on CUDA."); } - return std::tuple(self_working_copy, A_working_copy); + + ldl_solve_cusolver(LD, pivots, B, upper); } +} // anonymous namespace + +REGISTER_CUDA_DISPATCH(ldl_factor_stub, &ldl_factor_kernel) +REGISTER_CUDA_DISPATCH(ldl_solve_stub, &ldl_solve_kernel) + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ inverse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /* @@ -2275,7 +2326,7 @@ std::tuple linalg_qr_helper_magma(const Tensor& self, c10::strin std::tie(compute_q, reduced) = _parse_qr_mode(mode); // Setup input geometry and inputs for apply_qr - std::vector q_sizes, q_strides; + DimVector q_sizes, q_strides; int64_t n_columns_q; std::tie(q_sizes, q_strides, n_columns_q) = _compute_geometry_for_Q(self, reduced); Tensor q_working_copy, r_working_copy; @@ -2417,7 +2468,7 @@ std::tuple _symeig_helper_cuda(const Tensor& self, bool eigenvec Tensor infos = at::zeros({std::max(1, batchCount(self))}, self.options().dtype(kInt).device(at::kCPU)); auto eigvals_shape = IntArrayRef(self.sizes().data(), self.dim()-1); // self.shape[:-1] - ScalarType real_dtype = toValueType(self.scalar_type()); + ScalarType real_dtype = toRealValueType(self.scalar_type()); // magmaSyevd uses a hybrid CPU-GPU algorithm to compute the eigenvalues and eigenvectors. // The driver routine magma_(d/s)syev_gpu accepts a tensor on the CPU for eigvalenvalues. 
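// --- Editor's illustration (not part of the patch) ---
// The ldl_factor_kernel registered above follows the usual linalg backend
// selection pattern: honour an explicit user preference, otherwise use
// cuSOLVER when it is compiled in and fall back to MAGMA. Schematically
// (dispatch_example, do_with_cusolver and do_with_magma are placeholders):
void do_with_cusolver(const at::Tensor& t);  // placeholder backend entry points
void do_with_magma(const at::Tensor& t);
void dispatch_example(const at::Tensor& t) {
  switch (at::globalContext().linalgPreferredBackend()) {
    case at::LinalgBackend::Cusolver:
      return do_with_cusolver(t);
    case at::LinalgBackend::Magma:
      return do_with_magma(t);
    default:
#ifdef USE_CUSOLVER
      return do_with_cusolver(t);  // preferred default on CUDA builds
#else
      return do_with_magma(t);     // otherwise MAGMA, which errors out if absent
#endif
  }
}
// From Python the same choice can be steered with
// torch.backends.cuda.preferred_linalg_library('default' | 'cusolver' | 'magma').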
@@ -2635,7 +2686,7 @@ TORCH_CHECK(false, "Calling torch.linalg.eig on a CUDA tensor requires compiling Tensor rwork; value_t* rwork_data = nullptr; if (input.is_complex()) { - ScalarType real_dtype = toValueType(input.scalar_type()); + ScalarType real_dtype = toRealValueType(input.scalar_type()); rwork = at::empty({lda * 2}, input.options().dtype(real_dtype)); rwork_data = rwork.data_ptr(); } @@ -2851,19 +2902,27 @@ static void apply_lu_solve_looped_magma(const Tensor& b, const Tensor& lu, const auto pivots_data = pivots_cpu.data_ptr(); auto b_stride = matrixStride(b); - auto lu_stride = matrixStride(lu); - auto pivots_stride = pivots_cpu.size(-1); + auto lu_stride = lu.dim() > 2 ? lu.stride(-3) : 0; + auto pivots_stride = pivots_cpu.dim() > 1 ? pivots_cpu.stride(-2) : 0; auto batch_size = batchCount(b); magma_int_t n = magma_int_cast(lu.size(-2), "n"); magma_int_t nrhs = magma_int_cast(b.size(-1), "nrhs"); auto leading_dimension = std::max(1, n); + // lu and pivots tensors can be broadcast to b + // here we construct a helper indexing tensor to linearly index into lu and pivots + IntArrayRef lu_batch_shape(lu.sizes().data(), lu.dim() - 2); + IntArrayRef b_batch_shape(b.sizes().data(), b.dim() - 2); + BroadcastLinearIndices lu_index( + batchCount(lu), lu_batch_shape, b_batch_shape); + int info = 0; for (decltype(batch_size) i = 0; i < batch_size; i++) { + int64_t lu_index_i = lu_index(i); scalar_t* b_working_ptr = &b_data[i * b_stride]; - scalar_t* lu_working_ptr = &lu_data[i * lu_stride]; - int* pivots_working_ptr = &pivots_data[i * pivots_stride]; + scalar_t* lu_working_ptr = &lu_data[lu_index_i * lu_stride]; + int* pivots_working_ptr = &pivots_data[lu_index_i * pivots_stride]; magmaLuSolve(n, nrhs, lu_working_ptr, leading_dimension, pivots_working_ptr, b_working_ptr, leading_dimension, &info, trans); @@ -2896,6 +2955,8 @@ static void apply_lu_solve_batched_magma(const Tensor& b, const Tensor& lu, cons "Calling torch.lu_solve on a CUDA tensor requires compiling ", "PyTorch with MAGMA. 
Please rebuild with MAGMA."); #else + TORCH_INTERNAL_ASSERT(batchCount(b) == batchCount(lu), "batch_size of b and lu must be the same"); + TORCH_INTERNAL_ASSERT(batchCount(lu) == batchCount(pivots.unsqueeze(-1)), "batch_size of lu and pivots must be the same"); auto trans = to_magma(transpose); auto b_data = b.data_ptr(); auto lu_data = lu.data_ptr(); @@ -2962,9 +3023,36 @@ static void lu_solve_looped_magma(const Tensor& b, const Tensor& lu, const Tenso }); } +namespace { + +c10::MaybeOwned maybe_expand_lu(const Tensor& b, const Tensor& lu) { + if (batchCount(b) != batchCount(lu)) { + IntArrayRef b_batch_size(b.sizes().data(), b.dim() - 2); + DimVector expand_size(b_batch_size); + expand_size.insert(expand_size.end(), {lu.size(-2), lu.size(-1)}); + return c10::MaybeOwned::owned( + cloneBatchedColumnMajor(lu.expand(expand_size))); + } else { + return c10::MaybeOwned::borrowed(lu); + } +} + +c10::MaybeOwned maybe_expand_pivots(const Tensor& b,const Tensor& pivots) { + if (batchCount(b) != batchCount(pivots.unsqueeze(-1))) { + IntArrayRef b_batch_size(b.sizes().data(), b.dim() - 2); + DimVector expand_size(b_batch_size); + expand_size.insert(expand_size.end(), {pivots.size(-1)}); + return c10::MaybeOwned::owned( + pivots.expand(expand_size).clone(at::MemoryFormat::Contiguous)); + } else { + return c10::MaybeOwned::borrowed(pivots); + } +} + +} // anonymous namespace static void lu_solve_trans_dispatch(const Tensor& b, const Tensor& lu, const Tensor& pivots, TransposeType trans) { - auto batch_size = batchCount(lu); + auto batch_size = batchCount(b); auto m = lu.size(-2); auto b2 = b.size(-1); bool over_magma_dim_limit = b2 > 1024; // magma implementation of LU solve cannot handle a b tensor with last dim > 1024 (https://bitbucket.org/icl/magma/issues/19/dgesv_batched-dgetrs_batched-fails-for) @@ -2980,11 +3068,15 @@ static void lu_solve_trans_dispatch(const Tensor& b, const Tensor& lu, const Ten #endif // ifdef USE_CUSOLVER #ifdef CUDART_VERSION else if ((batch_size > 2 && m <= 128) || (batch_size > 8 && over_magma_dim_limit)) { - lu_solve_batched_cublas(b, lu, pivots, trans); + c10::MaybeOwned lu_ = maybe_expand_lu(b, lu); + c10::MaybeOwned pivots_ = maybe_expand_pivots(b, pivots); + lu_solve_batched_cublas(b, *lu_, *pivots_, trans); } #endif // ifdef CUDART_VERSION else { - lu_solve_batched_magma(b, lu, pivots, trans); + c10::MaybeOwned lu_ = maybe_expand_lu(b, lu); + c10::MaybeOwned pivots_ = maybe_expand_pivots(b, pivots); + lu_solve_batched_magma(b, *lu_, *pivots_, trans); } } @@ -3159,27 +3251,20 @@ void lstsq_kernel(const Tensor& a, Tensor& b, Tensor& /*rank*/, Tensor& /*singul "Please rebuild with cuSOLVER."); #endif } else { // m >= n -#if !AT_MAGMA_ENABLED() - // MAGMA is not available we can either use cuBLAS or cuSOLVER here +#if !AT_ROCM_ENABLED() + // On CUDA platform we use either cuBLAS or cuSOLVER here // the batched vs looped dispatch is implemented based on the following performance results // https://github.com/pytorch/pytorch/pull/54725#issuecomment-832234456 if (m <= 256 && batchCount(b) >= std::max(2, m / 16)) { - // if CUDART_VERSION is defined then cuBLAS is available - #ifdef CUDART_VERSION gels_batched_cublas(a, b, infos); - #else - // this would either call cuSOLVER or MAGMA, - // if MAGMA is called a runtime error is thrown about not finding MAGMA in compilation - gels_looped(a, b, infos); - #endif // CUDART_VERSION } else { gels_looped(a, b, infos); } #else - // if both MAGMA and cuSOLVER are available this would call cuSOLVER - // MAGMA is called if cuSOLVER is 
not available - gels_looped(a, b, infos); -#endif // AT_MAGMA_ENABLED() + // On ROCm platform we can only use MAGMA here + // If MAGMA is not available, an error will be thrown + gels_magma(a, b, infos); +#endif // !AT_ROCM_ENABLED() } } @@ -3244,25 +3329,21 @@ std::tuple legacy_lstsq_cuda(const Tensor &B, const Tensor &A) { #endif // AT_MAGMA_ENABLED() } -std::tuple legacy_lstsq_out_cuda( - const Tensor& B, const Tensor& A, Tensor& B_out, Tensor& A_out) { - const auto dtype = A.scalar_type(); - TORCH_CHECK(B.scalar_type() == dtype, "exepected A and B dtypes to match but found ", - A.scalar_type(), " and ", B.scalar_type()); - TORCH_CHECK(A_out.scalar_type() == dtype, "A_out to have scalar type ", dtype, - " but found", A_out.scalar_type()); - TORCH_CHECK(B_out.scalar_type() == dtype, "A_out to have scalar type ", dtype, - " but found", B_out.scalar_type()); - Tensor A_tmp, B_tmp; - std::tie(B_tmp, A_tmp) = native::legacy_lstsq_cuda(B, A); - resize_output(A_out, A_tmp.sizes()); - A_out.copy_(A_tmp); - resize_output(B_out, B_tmp.sizes()); - B_out.copy_(B_tmp); - return std::tuple(B_out, A_out); -} +#if defined(BUILD_LAZY_CUDA_LINALG) +struct DispatchInitializer { + DispatchInitializer() { + cuda::detail::LinalgDispatch disp{ _symeig_helper_cuda, + _linalg_qr_helper_cuda, + _cholesky_solve_helper_cuda, + legacy_lstsq_cuda, + _linalg_inv_out_helper_cuda}; + cuda::detail::registerLinalgDispatch(disp); + }; +} initializer; +} // namespace lazy_linalg +#endif }} // namespace at::native #undef ALLOCATE_ARRAY diff --git a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp index efbd987a3814..c73a14f73b71 100644 --- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp +++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp @@ -1,8 +1,8 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include #include -#include +#include #include #include #include @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -95,6 +96,8 @@ static void apply_lu_solve_batched_cublas(const Tensor& b, const Tensor& lu, con #ifndef CUDART_VERSION TORCH_CHECK(false, "lu_solve: cuBLAS backend for lu_solve is not available.") #else + TORCH_INTERNAL_ASSERT(batchCount(b) == batchCount(lu), "batch_size of b and lu must be the same"); + TORCH_INTERNAL_ASSERT(batchCount(lu) == batchCount(pivots.unsqueeze(-1)), "batch_size of lu and pivots must be the same"); const auto trans = to_cublas(transpose); auto pivots_data = pivots.data_ptr(); @@ -122,6 +125,181 @@ void lu_solve_batched_cublas(const Tensor& b, const Tensor& lu, const Tensor& pi }); } +namespace { + +template +void apply_ldl_factor_cusolver( + const Tensor& A, + const Tensor& pivots, + const Tensor& info, + bool upper) { +#ifndef USE_CUSOLVER + TORCH_CHECK( + false, + "Calling torch.linalg.ldl_factor on a CUDA tensor requires compiling ", + "PyTorch with cuSOLVER. Please use PyTorch built with cuSOLVER support."); +#else + auto batch_size = batchCount(A); + auto n = cuda_int_cast(A.size(-2), "A.size(-2)"); + auto lda = cuda_int_cast(A.stride(-1), "A.stride(-1)"); + auto uplo = upper ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER; + + auto a_stride = A.dim() > 2 ? A.stride(-3) : 0; + auto pivots_stride = pivots.dim() > 1 ? 
pivots.stride(-2) : 0; + + auto a_data = A.data_ptr(); + auto pivots_data = pivots.data_ptr(); + auto info_data = info.data_ptr(); + + auto handle = at::cuda::getCurrentCUDASolverDnHandle(); + + int lwork = 0; + at::cuda::solver::sytrf_bufferSize(handle, n, a_data, lda, &lwork); + auto& allocator = *::c10::cuda::CUDACachingAllocator::get(); + auto work = allocator.allocate(sizeof(scalar_t) * lwork); + + for (const auto i : c10::irange(batch_size)) { + auto* a_working_ptr = &a_data[i * a_stride]; + auto* pivots_working_ptr = &pivots_data[i * pivots_stride]; + auto* info_working_ptr = &info_data[i]; + at::cuda::solver::sytrf( + handle, + uplo, + n, + a_working_ptr, + lda, + pivots_working_ptr, + reinterpret_cast(work.get()), + lwork, + info_working_ptr); + } +#endif +} + +template +void apply_ldl_solve_cusolver( + const Tensor& A, + const Tensor& pivots, + const Tensor& B, + bool upper) { +#if !(defined(CUDART_VERSION) && defined(CUSOLVER_VERSION) && \ + CUSOLVER_VERSION >= 11102) + TORCH_CHECK( + false, + "Calling torch.linalg.ldl_solve on a CUDA tensor requires compiling ", + "PyTorch with cuSOLVER. Please use PyTorch built with cuSOLVER 11.1.2+ (CUDA 11.3.1+) support."); +#else + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(batchCount(A) > 0); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(batchCount(pivots.unsqueeze(-1)) > 0); + auto batch_size = batchCount(B); + auto n = A.size(-2); + auto nrhs = B.size(-1); + auto lda = A.stride(-1); + auto ldb = B.stride(-1); + auto uplo = upper ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER; + + auto a_stride = A.dim() > 2 ? A.stride(-3) : 0; + auto b_stride = B.dim() > 2 ? B.stride(-3) : 0; + auto pivots_stride = pivots.dim() > 1 ? pivots.stride(-2) : 0; + + auto a_data = A.data_ptr(); + auto b_data = B.data_ptr(); + + auto pivots_ = pivots.to(kLong); + auto pivots_data = pivots_.data_ptr(); + + auto handle = at::cuda::getCurrentCUDASolverDnHandle(); + auto datatype = at::cuda::solver::get_cusolver_datatype(); + size_t worksize_device = 0; + size_t worksize_host = 0; + + TORCH_CUSOLVER_CHECK(cusolverDnXsytrs_bufferSize( + handle, + uplo, + n, + nrhs, + datatype, + a_data, + lda, + pivots_data, + datatype, + b_data, + ldb, + &worksize_device, + &worksize_host)); + + // allocate workspace storage + auto& device_allocator = *at::cuda::getCUDADeviceAllocator(); + auto workdata_device = device_allocator.allocate(worksize_device); + void* workdata_device_ptr = workdata_device.get(); + + auto& host_allocator = *at::getCPUAllocator(); + auto workdata_host = host_allocator.allocate(worksize_host); + void* workdata_host_ptr = workdata_host.get(); + + Tensor info = at::zeros({}, A.options().dtype(at::kInt)); + for (const auto i : c10::irange(batch_size)) { + auto* a_working_ptr = &a_data[i * a_stride]; + auto* b_working_ptr = &b_data[i * b_stride]; + auto* pivots_working_ptr = &pivots_data[i * pivots_stride]; + TORCH_CUSOLVER_CHECK(cusolverDnXsytrs( + handle, + uplo, + n, + nrhs, + datatype, + a_working_ptr, + lda, + pivots_working_ptr, + datatype, + b_working_ptr, + ldb, + workdata_device_ptr, + worksize_device, + workdata_host_ptr, + worksize_host, + info.data_ptr())); + } + + // info from sytrs only reports if the i-th parameter is wrong + // so we don't need to check it all the time + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(info.item().toInt() == 0); +#endif +} + +} // anonymous namespace + +void ldl_factor_cusolver( + const Tensor& LD, + const Tensor& pivots, + const Tensor& info, + bool upper, + bool hermitian) { + if (LD.is_complex()) { + TORCH_CHECK( + !hermitian, + 
"torch.linalg.ldl_factor: complex tensors with hermitian=True flag are not supported with cuSOLVER backend. ", + "Currently preferred backend is ", + at::globalContext().linalgPreferredBackend(), + ", please set 'default' or 'magma' backend with torch.backends.cuda.preferred_linalg_library"); + } + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES( + LD.scalar_type(), "ldl_factor_looped_cusolver", [&] { + apply_ldl_factor_cusolver(LD, pivots, info, upper); + }); +} + +void ldl_solve_cusolver( + const Tensor& LD, + const Tensor& pivots, + const Tensor& B, + bool upper) { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES( + LD.scalar_type(), "ldl_solve_looped_cusolver", [&] { + apply_ldl_solve_cusolver(LD, pivots, B, upper); + }); +} + template static void apply_triangular_solve(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) { cublasFillMode_t uplo = upper ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER; @@ -1445,26 +1623,34 @@ void lu_solve_looped_cusolver(const Tensor& b, const Tensor& lu, const Tensor& p const auto trans = to_cublas(transpose); int n = cuda_int_cast(lu.size(-2), "n"); int nrhs = cuda_int_cast(b.size(-1), "nrhs"); - auto batch_size = batchCount(lu); + auto batch_size = batchCount(b); auto info = at::zeros({1}, lu.options().dtype(kInt)); auto info_data = info.data_ptr(); auto b_data = b.data_ptr(); auto lu_data = lu.data_ptr(); auto pivots_data = pivots.data_ptr(); - auto pivots_stride = pivots.size(-1); - auto lu_stride = matrixStride(lu); + auto pivots_stride = pivots.dim() > 1 ? pivots.stride(-2) : 0; + auto lu_stride = lu.dim() > 2 ? lu.stride(-3) : 0; auto b_stride = matrixStride(b); int leading_dimension = cuda_int_cast(std::max(1, n), "leading_dimension"); + // lu and pivots tensors can be broadcast to b + // here we construct a helper indexing tensor to linearly index into lu and pivots + IntArrayRef lu_batch_shape(lu.sizes().data(), lu.dim() - 2); + IntArrayRef b_batch_shape(b.sizes().data(), b.dim() - 2); + BroadcastLinearIndices lu_index( + batchCount(lu), lu_batch_shape, b_batch_shape); + auto handle = at::cuda::getCurrentCUDASolverDnHandle(); for (auto batch = decltype(batch_size){0}; batch < batch_size; ++batch) { + int64_t lu_index_i = lu_index(batch); at::cuda::solver::getrs( handle, n, nrhs, - lu_data + batch * lu_stride, + lu_data + lu_index_i * lu_stride, leading_dimension, - pivots_data + batch * pivots_stride, + pivots_data + lu_index_i * pivots_stride, b_data + batch * b_stride, leading_dimension, info_data, diff --git a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.h b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.h index 14da99f83d36..8979a23580db 100644 --- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.h +++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.h @@ -1,10 +1,11 @@ #pragma once +#include #include #include #include -#include +#include #include #if defined(CUDART_VERSION) && defined(CUSOLVER_VERSION) @@ -39,6 +40,17 @@ void triangular_solve_cublas(const Tensor& A, const Tensor& B, bool left, bool u void triangular_solve_batched_cublas(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular); void gels_batched_cublas(const Tensor& a, Tensor& b, Tensor& infos); void lu_solve_batched_cublas(const Tensor& b, const Tensor& lu, const Tensor& pivots, TransposeType transpose); +void ldl_factor_cusolver( + const Tensor& LD, + const Tensor& pivots, + const Tensor& info, + bool upper, + bool hermitian); +void ldl_solve_cusolver( + 
const Tensor& LD, + const Tensor& pivots, + const Tensor& B, + bool upper); #ifdef USE_CUSOLVER @@ -65,4 +77,19 @@ void lu_factor_looped_cusolver(const Tensor& self, const Tensor& pivots, const T #endif // USE_CUSOLVER +#if defined(BUILD_LAZY_CUDA_LINALG) +namespace cuda { namespace detail { +// This is only used for an old-style dispatches +// Please do not add any new entires to it +struct LinalgDispatch { + std::tuple (*symeig_helper)(const Tensor& self, bool eigenvectors, bool upper); + std::tuple (*qr_helper)(const Tensor& input, c10::string_view mode); + Tensor (*cholesky_solve_helper)(const Tensor& self, const Tensor& A, bool upper); + std::tuple (*legacy_lstsq)(const Tensor &B, const Tensor &A); + Tensor& (*inv_out_helper)(Tensor &result, Tensor& infos_lu, Tensor& infos_getri); +}; +C10_EXPORT void registerLinalgDispatch(const LinalgDispatch&); +}} // namespace cuda::detail +#endif + }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/linalg/CUDASolver.cpp b/aten/src/ATen/native/cuda/linalg/CUDASolver.cpp index 036cdd329e35..85141f820e5f 100644 --- a/aten/src/ATen/native/cuda/linalg/CUDASolver.cpp +++ b/aten/src/ATen/native/cuda/linalg/CUDASolver.cpp @@ -10,20 +10,6 @@ namespace at { namespace cuda { namespace solver { -C10_EXPORT const char* cusolverGetErrorMessage(cusolverStatus_t status) { - switch (status) { - case CUSOLVER_STATUS_SUCCESS: return "CUSOLVER_STATUS_SUCCES"; - case CUSOLVER_STATUS_NOT_INITIALIZED: return "CUSOLVER_STATUS_NOT_INITIALIZED"; - case CUSOLVER_STATUS_ALLOC_FAILED: return "CUSOLVER_STATUS_ALLOC_FAILED"; - case CUSOLVER_STATUS_INVALID_VALUE: return "CUSOLVER_STATUS_INVALID_VALUE"; - case CUSOLVER_STATUS_ARCH_MISMATCH: return "CUSOLVER_STATUS_ARCH_MISMATCH"; - case CUSOLVER_STATUS_EXECUTION_FAILED: return "CUSOLVER_STATUS_EXECUTION_FAILED"; - case CUSOLVER_STATUS_INTERNAL_ERROR: return "CUSOLVER_STATUS_INTERNAL_ERROR"; - case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; - default: return "Unknown cusolver error number"; - } -} - template <> void getrf( cusolverDnHandle_t handle, int m, int n, double* dA, int ldda, int* ipiv, int* info) { @@ -162,6 +148,71 @@ void getrs>( info)); } +template <> +void sytrf_bufferSize(CUDASOLVER_SYTRF_BUFFER_ARGTYPES(double)) { + TORCH_CUSOLVER_CHECK(cusolverDnDsytrf_bufferSize(handle, n, A, lda, lwork)); +} + +template <> +void sytrf_bufferSize(CUDASOLVER_SYTRF_BUFFER_ARGTYPES(float)) { + TORCH_CUSOLVER_CHECK(cusolverDnSsytrf_bufferSize(handle, n, A, lda, lwork)); +} + +template <> +void sytrf_bufferSize>( + CUDASOLVER_SYTRF_BUFFER_ARGTYPES(c10::complex)) { + TORCH_CUSOLVER_CHECK(cusolverDnZsytrf_bufferSize( + handle, n, reinterpret_cast(A), lda, lwork)); +} + +template <> +void sytrf_bufferSize>( + CUDASOLVER_SYTRF_BUFFER_ARGTYPES(c10::complex)) { + TORCH_CUSOLVER_CHECK(cusolverDnCsytrf_bufferSize( + handle, n, reinterpret_cast(A), lda, lwork)); +} + +template <> +void sytrf(CUDASOLVER_SYTRF_ARGTYPES(double)) { + TORCH_CUSOLVER_CHECK( + cusolverDnDsytrf(handle, uplo, n, A, lda, ipiv, work, lwork, devInfo)); +} + +template <> +void sytrf(CUDASOLVER_SYTRF_ARGTYPES(float)) { + TORCH_CUSOLVER_CHECK( + cusolverDnSsytrf(handle, uplo, n, A, lda, ipiv, work, lwork, devInfo)); +} + +template <> +void sytrf>( + CUDASOLVER_SYTRF_ARGTYPES(c10::complex)) { + TORCH_CUSOLVER_CHECK(cusolverDnZsytrf( + handle, + uplo, + n, + reinterpret_cast(A), + lda, + ipiv, + reinterpret_cast(work), + lwork, + devInfo)); +} + +template <> +void sytrf>( + 
CUDASOLVER_SYTRF_ARGTYPES(c10::complex)) { + TORCH_CUSOLVER_CHECK(cusolverDnCsytrf( + handle, + uplo, + n, + reinterpret_cast(A), + lda, + ipiv, + reinterpret_cast(work), + lwork, + devInfo)); +} template<> void gesvd_buffersize(CUDASOLVER_GESVD_BUFFERSIZE_ARGTYPES()) { diff --git a/aten/src/ATen/native/cuda/linalg/CUDASolver.h b/aten/src/ATen/native/cuda/linalg/CUDASolver.h index bd8c5cc11064..4a2cd9680c77 100644 --- a/aten/src/ATen/native/cuda/linalg/CUDASolver.h +++ b/aten/src/ATen/native/cuda/linalg/CUDASolver.h @@ -46,6 +46,47 @@ void getrs>(CUDASOLVER_GETRS_ARGTYPES(c10::complex) template<> void getrs>(CUDASOLVER_GETRS_ARGTYPES(c10::complex)); +#define CUDASOLVER_SYTRF_BUFFER_ARGTYPES(Dtype) \ + cusolverDnHandle_t handle, int n, Dtype *A, int lda, int *lwork + +template +void sytrf_bufferSize(CUDASOLVER_SYTRF_BUFFER_ARGTYPES(Dtype)) { + TORCH_CHECK( + false, + "at::cuda::solver::sytrf_bufferSize: not implemented for ", + typeid(Dtype).name()); +} +template <> +void sytrf_bufferSize(CUDASOLVER_SYTRF_BUFFER_ARGTYPES(float)); +template <> +void sytrf_bufferSize(CUDASOLVER_SYTRF_BUFFER_ARGTYPES(double)); +template <> +void sytrf_bufferSize>( + CUDASOLVER_SYTRF_BUFFER_ARGTYPES(c10::complex)); +template <> +void sytrf_bufferSize>( + CUDASOLVER_SYTRF_BUFFER_ARGTYPES(c10::complex)); + +#define CUDASOLVER_SYTRF_ARGTYPES(Dtype) \ + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, Dtype *A, int lda, \ + int *ipiv, Dtype *work, int lwork, int *devInfo + +template +void sytrf(CUDASOLVER_SYTRF_ARGTYPES(Dtype)) { + TORCH_CHECK( + false, + "at::cuda::solver::sytrf: not implemented for ", + typeid(Dtype).name()); +} +template <> +void sytrf(CUDASOLVER_SYTRF_ARGTYPES(float)); +template <> +void sytrf(CUDASOLVER_SYTRF_ARGTYPES(double)); +template <> +void sytrf>( + CUDASOLVER_SYTRF_ARGTYPES(c10::complex)); +template <> +void sytrf>(CUDASOLVER_SYTRF_ARGTYPES(c10::complex)); #define CUDASOLVER_GESVD_BUFFERSIZE_ARGTYPES() \ cusolverDnHandle_t handle, int m, int n, int *lwork diff --git a/aten/src/ATen/native/cuda/linalg/CusolverDnHandlePool.cpp b/aten/src/ATen/native/cuda/linalg/CusolverDnHandlePool.cpp index e64d8eeb1030..599c86d334d4 100644 --- a/aten/src/ATen/native/cuda/linalg/CusolverDnHandlePool.cpp +++ b/aten/src/ATen/native/cuda/linalg/CusolverDnHandlePool.cpp @@ -11,6 +11,7 @@ void createCusolverDnHandle(cusolverDnHandle_t *handle) { } void destroyCusolverDnHandle(cusolverDnHandle_t handle) { + (void)handle; // Suppress unused variable warning // this is because of something dumb in the ordering of // destruction. Sometimes atexit, the cuda context (or something) // would already be destroyed by the time this gets destroyed. It diff --git a/aten/src/ATen/native/cuda/linalg/MagmaUtils.h b/aten/src/ATen/native/cuda/linalg/MagmaUtils.h new file mode 100644 index 000000000000..a58cfd9bef9f --- /dev/null +++ b/aten/src/ATen/native/cuda/linalg/MagmaUtils.h @@ -0,0 +1,88 @@ +#pragma once +#include + +#if AT_MAGMA_ENABLED() +#include +#include +#endif + +namespace at { +namespace native { + +#if AT_MAGMA_ENABLED() + +// RAII for a MAGMA Queue +struct MAGMAQueue { + + // Default constructor without a device will cause + // destroying a queue which has not been initialized. + MAGMAQueue() = delete; + + // Constructor + explicit MAGMAQueue(int64_t device_id) { + cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 + // Magma operations is numerically sensitive, so TF32 should be off + // regardless of the global flag. 
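// --- Editor's illustration (not part of the patch) ---
// The constructor/destructor pair of MAGMAQueue implements a save/restore
// guard around the cuBLAS math mode, so TF32 stays disabled only for the
// lifetime of the queue. An equivalent standalone RAII helper
// (CublasMathModeGuard is a hypothetical name):
struct CublasMathModeGuard {
  CublasMathModeGuard(cublasHandle_t handle, cublasMath_t mode) : handle_(handle) {
    TORCH_CUDABLAS_CHECK(cublasGetMathMode(handle_, &saved_));  // remember caller's mode
    TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle_, mode));     // e.g. CUBLAS_DEFAULT_MATH
  }
  ~CublasMathModeGuard() {
    cublasSetMathMode(handle_, saved_);  // restore on scope exit
  }
  cublasHandle_t handle_;
  cublasMath_t saved_;
};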
+ TORCH_CUDABLAS_CHECK(cublasGetMathMode(handle, &original_math_mode)); + TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); +#endif + magma_queue_create_from_cuda( + device_id, + at::cuda::getCurrentCUDAStream(), + handle, + at::cuda::getCurrentCUDASparseHandle(), + &magma_queue_); + } + + // Getter + magma_queue_t get_queue() const { return magma_queue_; } + + // Destructor + ~MAGMAQueue() { +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 + // We've manually set the math mode to CUBLAS_DEFAULT_MATH, now we + // should restore the original math mode back + cublasHandle_t handle = magma_queue_get_cublas_handle(magma_queue_); + cublasSetMathMode(handle, original_math_mode); +#endif + magma_queue_destroy(magma_queue_); + } + + private: + magma_queue_t magma_queue_; +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 + cublasMath_t original_math_mode; +#endif +}; + +static inline magma_int_t magma_int_cast(int64_t value, const char* varname) { + auto result = static_cast(value); + if (static_cast(result) != value) { + AT_ERROR("magma: The value of ", varname, "(", (long long)value, + ") is too large to fit into a magma_int_t (", sizeof(magma_int_t), " bytes)"); + } + return result; +} + +// MAGMA functions that don't take a magma_queue_t aren't stream safe +// Work around this by synchronizing with the default stream +struct MagmaStreamSyncGuard { + MagmaStreamSyncGuard() { + auto stream = at::cuda::getCurrentCUDAStream(); + if (stream != at::cuda::getDefaultCUDAStream()) { + at::cuda::stream_synchronize(stream); + } + } + + ~MagmaStreamSyncGuard() noexcept(false) { + auto default_stream = at::cuda::getDefaultCUDAStream(); + if (at::cuda::getCurrentCUDAStream() != default_stream) { + at::cuda::stream_synchronize(default_stream); + } + } +}; +#endif + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cuda/reduction_template.cuh b/aten/src/ATen/native/cuda/reduction_template.cuh new file mode 100644 index 000000000000..4d9d559d8ec8 --- /dev/null +++ b/aten/src/ATen/native/cuda/reduction_template.cuh @@ -0,0 +1,664 @@ +namespace at { +namespace cuda { +//windows doesn't like large string literals, so split in two +const std::string reduction_template_0 = R"ESCAPE( + #define C10_HOST_DEVICE __host__ __device__ + #define C10_DEVICE __device__ + + template + __device__ __forceinline__ T WARP_SHFL_DOWN(T value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff) + { + return __shfl_down_sync(mask, value, delta, width); + } + + + #if ${complex} + template + __device__ __forceinline__ std::complex WARP_SHFL_DOWN(std::complex value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff) + { + return std::complex( + __shfl_down_sync(mask, value.real(), delta, width), + __shfl_down_sync(mask, value.imag(), delta, width)); + } + #endif + + // aligned vector generates vectorized load/store on CUDA + template + struct alignas(sizeof(scalar_t) * vec_size) aligned_vector { + scalar_t val[vec_size]; + }; + + + C10_HOST_DEVICE static void reduce_fraction(size_t &numerator, size_t &denominator) { + // get GCD of num and denom using Euclid's algorithm. + // Can replace this with std::gcd if we ever support c++17. 
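// --- Editor's illustration (not part of the generated source) ---
// With C++17 the Euclid loop below collapses to std::gcd from <numeric>;
// for example reduce_fraction(6, 8) leaves numerator = 3 and denominator = 4.
// (reduce_fraction_cxx17 is a hypothetical host-side equivalent.)
void reduce_fraction_cxx17(size_t& numerator, size_t& denominator) {
  const size_t g = std::gcd(numerator, denominator);
  numerator /= g;
  denominator /= g;
}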
+ size_t a = denominator; + size_t b = numerator; + while (b != 0) { + a %= b; + // swap(a,b) + size_t tmp = a; + a = b; + b = tmp; + } + + // a is now the GCD + numerator /= a; + denominator /= a; + } + + + + + struct ReduceConfig { + //has to match host-side ReduceConfig in the eager code + static constexpr int BLOCK_X = 0; + static constexpr int BLOCK_Y = 1; + static constexpr int CTA = 2; + + static constexpr int input_vec_size = 4; + int element_size_bytes; + int num_inputs; + int num_outputs; + int step_input = 1; + int step_output = 1; + int ctas_per_output = 1; + int input_mult[3] = {0, 0, 0}; + int output_mult[2] = {0, 0}; + + int block_width; + int block_height; + int num_threads; + + bool vectorize_input = false; + int output_vec_size = 1; + + C10_HOST_DEVICE bool should_block_x_reduce() const { + return input_mult[BLOCK_X] != 0; + } + + C10_HOST_DEVICE bool should_block_y_reduce() const { + return input_mult[BLOCK_Y] != 0; + } + + C10_HOST_DEVICE bool should_global_reduce() const { + return input_mult[CTA] != 0; + } + + C10_DEVICE bool should_store(int output_idx) const { + return output_idx < num_outputs && + (!should_block_x_reduce() || threadIdx.x == 0) && + (!should_block_y_reduce() || threadIdx.y == 0); + } + + C10_DEVICE bool should_reduce_tail() const { + return (!should_block_y_reduce() || threadIdx.y == 0) && + (!should_global_reduce() || blockIdx.y == 0); + } + + C10_HOST_DEVICE int input_idx() const { + int lane = threadIdx.x; + int warp = threadIdx.y; + int cta2 = blockIdx.y; + return (lane * input_mult[BLOCK_X] + + warp * input_mult[BLOCK_Y] + + cta2 * input_mult[CTA]); + } + + template + C10_HOST_DEVICE int output_idx() const { + int lane = threadIdx.x; + int warp = threadIdx.y; + int cta1 = blockIdx.x; + return (lane * output_mult[BLOCK_X] + + warp * output_mult[BLOCK_Y] + + cta1 * step_output) * output_vec_size; + } + + C10_DEVICE int shared_memory_offset(int offset) const { + return threadIdx.x + (threadIdx.y + offset) * blockDim.x; + } + + C10_DEVICE int staging_memory_offset(int cta2) const { + int offset = cta2 + blockIdx.x * gridDim.y; + if (!should_block_x_reduce()) { + offset = threadIdx.x + offset * blockDim.x; + } + return offset; + } + + + }; + + +//TODO this will need to be different for more generic reduction functions +namespace reducer { + + using scalar_t = ${scalar_type}; + using arg_t = ${reduction_accum_type}; + using out_scalar_t = ${result_type}; + + + inline __device__ ${functor} + + inline __device__ out_scalar_t project(arg_t arg) { + return (out_scalar_t) arg; + } + + inline __device__ arg_t warp_shfl_down(arg_t arg, int offset) { + return WARP_SHFL_DOWN(arg, offset); + } + + inline __device__ arg_t translate_idx(arg_t acc, int64_t /*idx*/) { + return acc; + } + + // wrap a normal reduction that ignores the index + inline __device__ arg_t reduce(arg_t acc, arg_t val, int64_t idx) { + return combine(acc, val); + } +} + + +struct ReduceJitOp { + using scalar_t = ${scalar_type}; + using arg_t = ${reduction_accum_type}; + using out_scalar_t = ${result_type}; + + using InputCalculator = OffsetCalculator<1>; + using OutputCalculator = OffsetCalculator<2>; + +// static constexpr bool can_accumulate_in_output = +// std::is_convertible::value +// && std::is_convertible::value; + + static constexpr int input_vec_size = ReduceConfig::input_vec_size; + + arg_t ident; + ReduceConfig config; + InputCalculator input_calc; + OutputCalculator output_calc; + const void* src; + const char* dst[2]; //it accepts at most two destinations + // acc_buf used for 
accumulation among sub Tensor Iterator when accumulation on + // output is not permissible + void* acc_buf; + // cta_buf used for accumulation between blocks during global reduction + void* cta_buf; + int* semaphores; + int64_t base_idx; + bool accumulate; + bool final_output; + int noutputs; + + + C10_DEVICE void run() const { + extern __shared__ char shared_memory[]; + uint32_t output_idx = config.output_idx<${output_vec_size}>(); + uint32_t input_idx = config.input_idx(); + auto base_offsets1 = output_calc.get(output_idx)[1]; + + using arg_vec_t = Array; + arg_vec_t value; + + if (output_idx < config.num_outputs && input_idx < config.num_inputs) { + const scalar_t* input_slice = (const scalar_t*)((const char*)src + base_offsets1); + + value = thread_reduce<${output_vec_size}>(input_slice); + } + + if (config.should_block_y_reduce()) { + value = block_y_reduce<${output_vec_size}>(value, shared_memory); + } + if (config.should_block_x_reduce()) { + value = block_x_reduce<${output_vec_size}>(value, shared_memory); + } + + using out_ptr_vec_t = Array; + using offset_vec_t = Array; + offset_vec_t base_offsets; + out_ptr_vec_t out; + + #pragma unroll + for (int i = 0; i < ${output_vec_size}; i++) { + base_offsets[i] = output_calc.get(output_idx + i)[0]; + out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]); + } + + arg_vec_t* acc = nullptr; + if (acc_buf != nullptr) { + size_t numerator = sizeof(arg_t); + size_t denominator = sizeof(out_scalar_t); + reduce_fraction(numerator, denominator); + acc = (arg_vec_t*)((char*)acc_buf + (base_offsets[0] * numerator / denominator)); + } + + if (config.should_global_reduce()) { + value = global_reduce<${output_vec_size}>(value, acc, shared_memory); + } else if (config.should_store(output_idx)) { + if (accumulate) { + #pragma unroll + for (int i = 0; i < ${output_vec_size}; i++) { + value[i] = reducer::translate_idx(value[i], base_idx); + } + } + + if (acc == nullptr) { + if (accumulate) { + value = accumulate_in_output<${output_vec_size}>(out, value); + } + if (final_output) { + set_results_to_output<${output_vec_size}>(value, base_offsets); + } else { + #pragma unroll + for (int i = 0; i < ${output_vec_size}; i++) { + *(out[i]) = get_accumulated_output(out[i], value[i]); + } + } + } else { + if (accumulate) { + #pragma unroll + for (int i = 0; i < ${output_vec_size}; i++) { + value[i] = reducer::combine((*acc)[i], value[i]); + } + } + if (final_output) { + set_results_to_output<${output_vec_size}>(value, base_offsets); + } else { + *acc = value; + } + } + } + } + + template + C10_DEVICE Array thread_reduce(const scalar_t* data) const { + if (config.vectorize_input) { + assert(output_vec_size == 1); + // reduce at the header of input_slice where memory is not aligned, + // so that thread_reduce will have an aligned memory to work on. 
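    // Illustrative sketch (not part of the patch): the vectorized path below first
    // measures how far (in elements) the input pointer sits past the previous
    // aligned address and peels that many scalars off before doing aligned vector
    // loads. A minimal host-side version of that shift computation, with
    // hypothetical names (head_shift, buf):
    //
    //   #include <cstdint>
    //   #include <cstdio>
    //
    //   int head_shift(const void* p, int align_bytes, int elem_bytes) {
    //     // number of scalar elements between p and the previous aligned address
    //     return static_cast<int>(reinterpret_cast<std::uintptr_t>(p) % align_bytes) / elem_bytes;
    //   }
    //
    //   int main() {
    //     alignas(16) float buf[8] = {};
    //     // buf + 2 is 8 bytes past a 16-byte boundary -> 2 scalar head elements
    //     std::printf("%d\n", head_shift(buf + 2, 16, (int)sizeof(float)));  // prints 2
    //     return 0;
    //   }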
+ return {input_vectorized_thread_reduce_impl(data)}; + } else { + uint32_t element_stride = input_calc.strides_[0][0] / sizeof(scalar_t); + bool is_contiguous = (input_calc.dims == 1 && element_stride == 1); + if (is_contiguous) { + return thread_reduce_impl(data, [](uint32_t idx) { return idx; }); + } else if (input_calc.dims == 1) { + return thread_reduce_impl(data, [&](uint32_t idx) { return idx * element_stride; }); + } else { + return thread_reduce_impl(data, [&](uint32_t idx) { return input_calc.get(idx)[0] / sizeof(scalar_t); }); + } + } + } + + C10_DEVICE arg_t input_vectorized_thread_reduce_impl(const scalar_t* data) const { + uint32_t end = config.num_inputs; + + // Handle the head of input slice where data is not aligned + arg_t value = ident; + constexpr int align_bytes = alignof(aligned_vector); + constexpr int align_elements = align_bytes / sizeof(scalar_t); + int shift = ((int64_t)data) % align_bytes / sizeof(scalar_t); + if (shift > 0) { + data -= shift; + end += shift; + if(threadIdx.x >= shift && threadIdx.x < align_elements && config.should_reduce_tail()){ + value = reducer::reduce(value, data[threadIdx.x], threadIdx.x - shift); + } + end -= align_elements; + data += align_elements; + shift = align_elements - shift; + } + + // Do the vectorized reduction + using load_t = aligned_vector; + + uint32_t idx = config.input_idx(); + const uint32_t stride = config.step_input; + + // Multiple accumulators to remove dependency between unrolled loops. + arg_t value_list[input_vec_size]; + value_list[0] = value; + + #pragma unroll + for (int i = 1; i < input_vec_size; i++) { + value_list[i] = ident; + } + + scalar_t values[input_vec_size]; + + load_t *values_vector = reinterpret_cast(&values[0]); + + while (idx * input_vec_size + input_vec_size - 1 < end) { + *values_vector = reinterpret_cast(data)[idx]; + #pragma unroll + for (uint32_t i = 0; i < input_vec_size; i++) { + value_list[i] = reducer::reduce(value_list[i], values[i], shift + idx * input_vec_size + i); + } + idx += stride; + } + + // tail + uint32_t tail_start = end - end % input_vec_size; + if (config.should_reduce_tail()) { + int idx = tail_start + threadIdx.x; + if (idx < end) { + value_list[0] = reducer::reduce(value_list[0], data[idx], idx + shift); + } + } + + // combine accumulators + #pragma unroll + for (int i = 1; i < input_vec_size; i++) { + value_list[0] = reducer::combine(value_list[0], value_list[i]); + } + return value_list[0]; + } + + template + C10_DEVICE Array thread_reduce_impl(const scalar_t* data_, offset_calc_t calc) const { + uint32_t idx = config.input_idx(); + const uint32_t end = config.num_inputs; + const uint32_t stride = config.step_input; + const int vt0=${vt0}; + + using arg_vec_t = Array; + using load_t = aligned_vector; + const load_t* data = reinterpret_cast(data_); + + // Multiple accumulators to remove dependency between unrolled loops. 
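      // Illustrative sketch (not part of the patch): the loop below keeps vt0
      // independent partial accumulators and folds them together at the end in the
      // "combine accumulators" step. A standalone CPU version of the same idiom,
      // with hypothetical names (strided_sum, kAcc):
      //
      //   #include <cstddef>
      //
      //   float strided_sum(const float* data, std::size_t n) {
      //     constexpr int kAcc = 4;                    // plays the role of vt0
      //     float acc[kAcc] = {0.f, 0.f, 0.f, 0.f};    // independent partial sums
      //     std::size_t i = 0;
      //     for (; i + kAcc <= n; i += kAcc) {
      //       for (int j = 0; j < kAcc; ++j) {
      //         acc[j] += data[i + j];                 // no serial dependency across j
      //       }
      //     }
      //     for (; i < n; ++i) { acc[0] += data[i]; }             // tail
      //     for (int j = 1; j < kAcc; ++j) { acc[0] += acc[j]; }  // combine
      //     return acc[0];
      //   }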
+ arg_vec_t value_list[vt0]; + + #pragma unroll + for (int i = 0; i < vt0; i++) { + #pragma unroll + for (int j = 0; j < output_vec_size; j++) { + value_list[i][j] = ident; + } + } + + load_t values[vt0]; + + while (idx + (vt0 - 1) * stride < end) { + #pragma unroll + for (uint32_t i = 0; i < vt0; i++) { + values[i] = data[calc(idx + i * stride) / output_vec_size]; + } + #pragma unroll + for (uint32_t i = 0; i < vt0; i++) { + #pragma unroll + for (uint32_t j = 0; j < output_vec_size; j++) { + value_list[i][j] = reducer::reduce(value_list[i][j], values[i].val[j], idx + i * stride); + } + } + idx += stride * vt0; + } + + // tail + int idx_ = idx; + #pragma unroll + for (uint32_t i = 0; i < vt0; i++) { + if (idx >= end) { + break; + } + values[i] = data[calc(idx) / output_vec_size]; + idx += stride; + } + idx = idx_; + #pragma unroll + for (uint32_t i = 0; i < vt0; i++) { + if (idx >= end) { + break; + } + #pragma unroll + for (uint32_t j = 0; j < output_vec_size; j++) { + value_list[i][j] = reducer::reduce(value_list[i][j], values[i].val[j], idx); + } + idx += stride; + } + + // combine accumulators + #pragma unroll + for (int i = 1; i < vt0; i++) { + #pragma unroll + for (uint32_t j = 0; j < output_vec_size; j++) { + value_list[0][j] = reducer::combine(value_list[0][j], value_list[i][j]); + } + } + return value_list[0]; + } + template + C10_DEVICE Array block_x_reduce(Array value, char* shared_memory) const { + using args_vec_t = Array; + int dim_x = blockDim.x; + args_vec_t* shared = (args_vec_t*)shared_memory; + if (dim_x > warpSize) { + int address_base = threadIdx.x + threadIdx.y*blockDim.x; + shared[address_base] = value; + for (int offset = dim_x/2; offset >= warpSize; offset >>= 1) { + __syncthreads(); + if (threadIdx.x < offset && threadIdx.x + offset < blockDim.x) { + args_vec_t other = shared[address_base + offset]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine(value[i], other[i]); + } + shared[address_base] = value; + } + } + dim_x = warpSize; + } + + __syncthreads(); + + for (int offset = 1; offset < dim_x; offset <<= 1) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + arg_t other = reducer::warp_shfl_down(value[i], offset); + value[i] = reducer::combine(value[i], other); + } + } + return value; + } + + template + C10_DEVICE Array block_y_reduce(Array value, char* shared_memory) const { + using args_vec_t = Array; + args_vec_t* shared = (args_vec_t*)shared_memory; + shared[config.shared_memory_offset(0)] = value; + for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) { + __syncthreads(); + if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) { + args_vec_t other = shared[config.shared_memory_offset(offset)]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine(value[i], other[i]); + } + shared[config.shared_memory_offset(0)] = value; + } + } + return value; + } + )ESCAPE"; + + const std::string reduction_template_1 = R"ESCAPE( + + C10_DEVICE bool mark_block_finished() const { + __shared__ bool is_last_block_done_shared; + + __syncthreads(); + if (threadIdx.x == 0 && threadIdx.y == 0) { + int prev_blocks_finished = atomicAdd(&semaphores[blockIdx.x], 1); + is_last_block_done_shared = (prev_blocks_finished == gridDim.y - 1); + } + + __syncthreads(); + + return is_last_block_done_shared; + } + + template + C10_DEVICE Array accumulate_in_output( + Array out, + Array value + ) const { + Array ret; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + 
ret[i] = reducer::combine(*(out[i]), value[i]); + } + return ret; + } + + + C10_DEVICE out_scalar_t get_accumulated_output( + out_scalar_t* out, arg_t value + ) const { + assert(!final_output); + return (out_scalar_t)value; + } + + template + C10_DEVICE void set_results(const T x, const uint32_t base_offset) const { + assert(noutputs == 1); + auto res = (out_scalar_t*)((char*)dst[0] + base_offset); + *res = x; + } + +//TODO - multi-output reduction - we won't be able to use thrust::pair +//just explicitly specify typed output reads/writes +//Currently implemented for max of two outputs +// template +// C10_DEVICE void set_results(const thrust::pair x, const index_t base_offset) const { +// if (noutputs >= 1) { +// auto res0 = (T1*)((char*)dst[0] + base_offset); +// *res0 = x.first; +// } +// if (noutputs >= 2) { +// // base offset is computed assuming element size being sizeof(T1), so we need to make a +// // correction to obtain the correct base offset +// auto res1 = (T2*) ((char *) dst[1] + base_offset / sizeof(T1) * sizeof(T2)); +// *res1 = x.second; +// } +// } + + template + C10_DEVICE void set_results_to_output(Array value, Array base_offset) const { + assert(final_output); + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + set_results(reducer::project(value[i]), base_offset[i]); + } + } + + template + C10_DEVICE Array global_reduce(Array value, Array *acc, char* shared_memory) const { + using arg_vec_t = Array; + using out_ptr_vec_t = Array; + using offset_vec_t = Array; + + arg_vec_t* reduce_buffer = (arg_vec_t*)cta_buf; + uint32_t output_idx = config.output_idx(); + offset_vec_t base_offsets; + out_ptr_vec_t out; + + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + base_offsets[i] = output_calc.get(output_idx + i)[0]; + out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]); + } + + bool should_store = config.should_store(output_idx); + if (should_store) { + uint32_t offset = config.staging_memory_offset(blockIdx.y); + reduce_buffer[offset] = value; + } + + __threadfence(); // make sure writes are globally visible + __syncthreads(); // if multiple warps in this block wrote to staging, make sure they're all done + bool is_last_block_done = mark_block_finished(); + + if (is_last_block_done) { + value = ident; + if (config.should_block_x_reduce()) { + uint32_t input_offset = threadIdx.x + threadIdx.y * blockDim.x; + uint32_t step = blockDim.x * blockDim.y; + for (; input_offset < config.ctas_per_output; input_offset += step) { + uint32_t idx = config.staging_memory_offset(input_offset); + arg_vec_t next = reduce_buffer[idx]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine(value[i], next[i]); + } + } + } else { + uint32_t input_offset = threadIdx.y; + uint32_t step = blockDim.y; + for (; input_offset < config.ctas_per_output; input_offset += step) { + uint32_t idx = config.staging_memory_offset(input_offset); + arg_vec_t next = reduce_buffer[idx]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine(value[i], next[i]); + } + } + } + value = block_y_reduce(value, shared_memory); + if (config.should_block_x_reduce()) { + value = block_x_reduce(value, shared_memory); + } + if (should_store) { + if (accumulate) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::translate_idx(value[i], base_idx); + } + } + + if (acc == nullptr) { + if (accumulate) { + value = accumulate_in_output(out, value); + } + if (final_output) { + 
set_results_to_output(value, base_offsets); + } else { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + *(out[i]) = get_accumulated_output(out[i], value[i]); + } + } + } else { + if (accumulate) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine((*acc)[i], value[i]); + } + } + if (final_output) { + set_results_to_output(value, base_offsets); + } else { + *acc = value; + } + } + } + } + + return value; + } +}; + +extern "C" +__launch_bounds__(${max_threads_lb}, 4) +__global__ void reduction_${name}_kernel(ReduceJitOp r){ + r.run(); +} +)ESCAPE"; + +const std::string reduction_template = reduction_template_0 + reduction_template_1; + + +const std::string &get_reduction_template() { + return reduction_template; +} + +}} diff --git a/aten/src/ATen/native/cuda/thread_constants.h b/aten/src/ATen/native/cuda/thread_constants.h index 464c6fe9fe2e..651053d663e4 100644 --- a/aten/src/ATen/native/cuda/thread_constants.h +++ b/aten/src/ATen/native/cuda/thread_constants.h @@ -13,7 +13,7 @@ constexpr int num_threads() { return 256; } #else -constexpr int num_threads() { +constexpr uint32_t num_threads() { return C10_WARP_SIZE * 4; } #endif diff --git a/aten/src/ATen/native/cuda/vol2col.cuh b/aten/src/ATen/native/cuda/vol2col.cuh index 17459f382816..7ab719bc819e 100644 --- a/aten/src/ATen/native/cuda/vol2col.cuh +++ b/aten/src/ATen/native/cuda/vol2col.cuh @@ -1,9 +1,5 @@ #pragma once -#include -#include -#include - #include #include #include diff --git a/aten/src/ATen/native/cudnn/ConvShared.cpp b/aten/src/ATen/native/cudnn/ConvShared.cpp index de45a3a2dd40..6968548b0e0e 100644 --- a/aten/src/ATen/native/cudnn/ConvShared.cpp +++ b/aten/src/ATen/native/cudnn/ConvShared.cpp @@ -152,69 +152,6 @@ std::string repro_from_args(const ConvolutionParams& params) { return ss.str(); } -// --------------------------------------------------------------------- -// -// Checking -// -// --------------------------------------------------------------------- - -// Used on pad, stride and dilation -static void check_args(CheckedFrom c, IntArrayRef args, size_t expected_size, const char* arg_name) -{ - TORCH_CHECK(args.size() <= expected_size, - "Too many ", arg_name, " values (", args.size(), ") supplied, expecting ", - expected_size, " (while checking arguments for ", c, ")"); - TORCH_CHECK(args.size() >= expected_size, - "Not enough ", arg_name, " values (", args.size(), ") supplied, expecting ", - expected_size, " (while checking arguments for ", c, ")"); - - auto num_negative_values = std::count_if(args.begin(), args.end(), [](int x){return x < 0;}); - if (num_negative_values > 0){ - std::stringstream ss; - ss << arg_name << " should be greater than zero but got ("; - std::copy(args.begin(), args.end() - 1, std::ostream_iterator(ss,", ")); - ss << args.back() << ")" << " (while checking arguments for " << c << ")"; - AT_ERROR(ss.str()); - } -} - - -// NOTE [ Convolution checks ] -// -// NB: For many call sites, it is not strictly necessary to check all of -// these relationships (for example, for forward convolution, we compute -// the size of output ourselves, so we don't actually need to check -// output. However, writing a single function that does everything -// means we get to reuse it for both forwards and all backwards -// variants, even when the set of "real" inputs varies. The magic of -// relational computing! 
-// -// (There is one downside, which is that it is slightly harder to write -// error messages which are able to distinguish between real inputs -// (which the user can change) and computed inputs (which the user can -// only indirectly affect). It would be an interesting exercise to -// come up with a general framework to handle such situations.) -static void convolution_shape_check( - CheckedFrom c, - const TensorGeometryArg& input, const TensorGeometryArg& weight, const TensorGeometryArg& output, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups) -{ - check_args(c, padding, input->dim() - 2, "padding"); - check_args(c, stride, padding.size(), "stride"); - check_args(c, dilation, padding.size(), "dilation"); - - // Input - checkDimRange(c, input, 3, 6 /* exclusive */); - checkSize(c, input, input_channels_dim, weight->size(1) * groups); - - // Weight - checkSameDim(c, input, weight); - - // TODO: check that output->size() matches output_sizes - // TODO: check that weight matches output->sizes() - checkSameDim(c, input, output); -} - // --------------------------------------------------------------------- // // Convolution forward / Transposed convolution backward @@ -494,6 +431,7 @@ Tensor cudnn_convolution_relu( } auto& ctx = at::globalContext(); + bool benchmark = ctx.benchmarkCuDNN(); bool allow_tf32 = ctx.allowTF32CuDNN(); auto _bias = bias_t.has_value() ? bias_t.value() @@ -516,7 +454,7 @@ Tensor cudnn_convolution_relu( padding, dilation, groups, - false, // benchmark + benchmark, // benchmark false, // deterministic allow_tf32 // allow_tf32 ); @@ -532,7 +470,7 @@ Tensor cudnn_convolution_relu( padding, dilation, groups, - false, // benchmark + benchmark, // benchmark false, // deterministic allow_tf32 // allow_tf32 ); @@ -554,6 +492,11 @@ Tensor cudnn_convolution_add_relu( auto memory_format = cudnn_conv_suggest_memory_format(input_t, weight_t); const Tensor input = input_t.contiguous(memory_format); const Tensor weight = weight_t.contiguous(memory_format); + Tensor z = z_t; + if (z.suggest_memory_format() != memory_format) { + z = z.to(memory_format); + } + z = z.contiguous(memory_format); // FuseFrozenConvAddRelu performs some tensor shape checking Tensor output_t = at::detail::empty_cuda( @@ -566,6 +509,7 @@ Tensor cudnn_convolution_add_relu( auto& ctx = at::globalContext(); bool allow_tf32 = ctx.allowTF32CuDNN(); + bool benchmark = ctx.benchmarkCuDNN(); auto _alpha = alpha.has_value() ? alpha.value().to() : 1.0; auto _bias = bias_t.has_value() ? bias_t.value() @@ -581,14 +525,14 @@ Tensor cudnn_convolution_add_relu( output_t, input, weight, - z_t, + z, _alpha, _bias, stride, padding, dilation, groups, - false, // benchmark + benchmark, false, // deterministic allow_tf32 // allow_tf32 ); @@ -597,14 +541,14 @@ Tensor cudnn_convolution_add_relu( output_t, input, weight, - z_t, + z, _alpha, _bias, stride, padding, dilation, groups, - false, // benchmark + benchmark, false, // deterministic allow_tf32 // allow_tf32 ); diff --git a/aten/src/ATen/native/cudnn/ConvShared.h b/aten/src/ATen/native/cudnn/ConvShared.h index c3b5ef74ff8f..9ee5bfb3f9e6 100644 --- a/aten/src/ATen/native/cudnn/ConvShared.h +++ b/aten/src/ATen/native/cudnn/ConvShared.h @@ -105,4 +105,48 @@ void raw_cudnn_convolution_add_relu_fallback_out( bool benchmark, bool deterministic, bool allow_tf32); + + +#if AT_CUDNN_ENABLED() +#include + +#if HAS_CUDNN_V8() +// v7 functions are preserved here to allow for runtime switching to v7 +// (e.g., TORCH_CUDNN_V8_API_ENABLED=0). 
+// Note that v7 forward/backward out can have different behavior from the v8 +// versions, as v7 explicitly splits large tensors as a 32-bit indexing +// workaround whereas v8 expects cuDNN to handle large tensors. +void raw_cudnn_convolution_forward_out_v7( + const Tensor& output, const Tensor& input, const Tensor& weight, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic, bool allow_tf32); + +void raw_cudnn_convolution_backward_input_out_v7( + const at::Tensor& grad_input, + const at::Tensor& grad_output, + const at::Tensor& weight, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic, bool allow_tf32); + +void raw_cudnn_convolution_backward_weight_out_v7( + const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic, bool allow_tf32); + +void raw_cudnn_convolution_add_relu_out_v7( + const Tensor& output, + const Tensor& input, + const Tensor& weight, + const Tensor& z, + float alpha, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32); +#endif +#endif }} diff --git a/aten/src/ATen/native/cudnn/Conv_v7.cpp b/aten/src/ATen/native/cudnn/Conv_v7.cpp index 502b32a5b446..a2ff4839a40c 100644 --- a/aten/src/ATen/native/cudnn/Conv_v7.cpp +++ b/aten/src/ATen/native/cudnn/Conv_v7.cpp @@ -619,8 +619,6 @@ if (args.params.dataType == CUDNN_DATA_FLOAT) { // // --------------------------------------------------------------------- -#if !HAS_CUDNN_V8() - void raw_cudnn_convolution_forward_out_32bit( const Tensor& output, const Tensor& input, const Tensor& weight, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, @@ -666,15 +664,18 @@ void raw_cudnn_convolution_forward_out_32bit( ); } + +#if !HAS_CUDNN_V8() void raw_cudnn_convolution_forward_out( +#else +void raw_cudnn_convolution_forward_out_v7( +#endif const Tensor& output, const Tensor& input, const Tensor& weight, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic, bool allow_tf32) { split_batch_dim_to_32bit_out(output, input, weight, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32, 1024 * 1024 * 256, raw_cudnn_convolution_forward_out_32bit); } -#endif // !HAS_CUDNN_V8() - // --------------------------------------------------------------------- // // Convolution backward / Transposed convolution forward @@ -726,7 +727,11 @@ void raw_cudnn_convolution_backward_input_out_32bit( ); } +#if !HAS_CUDNN_V8() void raw_cudnn_convolution_backward_input_out( +#else +void raw_cudnn_convolution_backward_input_out_v7( +#endif const at::Tensor& grad_input, const at::Tensor& grad_output, const at::Tensor& weight, @@ -785,7 +790,11 @@ void raw_cudnn_convolution_backward_weight_out_32bit( ); } +#if !HAS_CUDNN_V8() void raw_cudnn_convolution_backward_weight_out( +#else +void raw_cudnn_convolution_backward_weight_out_v7( +#endif const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic, bool allow_tf32) { @@ -808,6 +817,9 @@ void raw_cudnn_convolution_backward_weight_out( int64_t split_size = std::max(1024 * 1024 * 512 / max_inner_size, 1L); int64_t 
num_splits = (n + split_size - 1) / split_size; if (split_size * max_inner_size < int_max) { + const auto kAccType = (grad_weight.scalar_type() == kHalf || grad_weight.scalar_type() == kBFloat16) + ? kFloat : grad_weight.scalar_type(); + Tensor grad_weight_accumulator = at::zeros(grad_weight.sizes(), grad_weight.options().dtype(kAccType)); for (const auto i : c10::irange(num_splits)) { int64_t start = split_size * i; int64_t split_size_ = std::min(split_size, n - start); @@ -815,8 +827,9 @@ void raw_cudnn_convolution_backward_weight_out( Tensor grad_output_ = grad_output.narrow(0, start, split_size_); Tensor grad_weight_ = at::empty_like(grad_weight); raw_cudnn_convolution_backward_weight_out_32bit(grad_weight_, grad_output_, input_, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); - grad_weight.add_(grad_weight_); + grad_weight_accumulator.add_(grad_weight_); } + grad_weight.copy_(grad_weight_accumulator); return; } // If control flow reaches here, this means even splitting N is not enough, then things starts to become complicated: @@ -833,7 +846,12 @@ void raw_cudnn_convolution_backward_weight_out( TORCH_INTERNAL_ASSERT(false, "This case should not be dispatched to cuDNN."); } +#if !HAS_CUDNN_V8() void raw_cudnn_convolution_add_relu_out( +#else +void raw_cudnn_convolution_add_relu_out_v7( +#endif + const Tensor& output, const Tensor& input, const Tensor& weight, diff --git a/aten/src/ATen/native/cudnn/Conv_v8.cpp b/aten/src/ATen/native/cudnn/Conv_v8.cpp index 9ba1775988b9..24c5f3c2e3d6 100644 --- a/aten/src/ATen/native/cudnn/Conv_v8.cpp +++ b/aten/src/ATen/native/cudnn/Conv_v8.cpp @@ -8,6 +8,8 @@ #include #include +#include +#include #include #include #include @@ -17,34 +19,57 @@ #include #include +#include +#include +#include + +#include #include -namespace at { namespace native{ +namespace at { namespace native { namespace { +// TODO: remove duplicate code in Conv_v7.cpp +constexpr size_t operator "" _TiB(unsigned long long n) { + return size_t(n) << 40; +} + uint8_t getAlignment(const Tensor &t) { // alignment are in bytes uint8_t alignment = 1; - uint64_t address = reinterpret_cast(t.data_ptr()); - while (address % alignment == 0 && alignment < 16) alignment *= 2; + uintptr_t address = reinterpret_cast(t.data_ptr()); + for (; alignment < 64; alignment *= 2) { + if (address % (alignment * 2)) { + return alignment; + } + } return alignment; } -cudnn_frontend::Tensor getTensorDescriptor(const Tensor &t, int64_t id, uint8_t alignment) { - auto shape = t.sizes(); +cudnn_frontend::Tensor getTensorDescriptorWithTypeVirtual(const Tensor &t, const int64_t id, const uint8_t alignment, const cudnnDataType_t dataType, const bool _virtual) { + auto sizes = t.sizes(); auto strides = t.strides(); - return cudnn_frontend::TensorBuilder() - .setDim(shape.size(), shape.data()) + auto r = cudnn_frontend::TensorBuilder() + .setDim(sizes.size(), sizes.data()) .setStrides(strides.size(), strides.data()) .setId(id) .setAlignment(alignment) - .setDataType(getCudnnDataType(t)) + .setDataType(dataType) + .setVirtual(_virtual) .build(); + return r; +} + +cudnn_frontend::Tensor getTensorDescriptor(const Tensor &t, const int64_t id, const uint8_t alignment) { + return getTensorDescriptorWithTypeVirtual(t, id, alignment, getCudnnDataType(t), false); } -cudnn_frontend::ConvDesc_v8 getConvDescriptor(cudnnDataType_t dataType, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation) { +cudnn_frontend::ConvDesc_v8 getConvDescriptor(cudnnDataType_t dataType, IntArrayRef padding, 
IntArrayRef stride, IntArrayRef dilation, const at::ScalarType scalar_type) { uint64_t convDim = stride.size(); + if (scalar_type == kBFloat16 || scalar_type == kHalf) { + dataType = CUDNN_DATA_FLOAT; + } return cudnn_frontend::ConvDescBuilder() .setDataType(dataType) .setMathMode(CUDNN_CROSS_CORRELATION) @@ -63,11 +88,12 @@ void filterEngineConfigs( { auto filter = [=](cudnnBackendDescriptor_t c) { if (deterministic) { - if (cudnn_frontend::hasNumericalNote(c)) return true; + if (cudnn_frontend::hasNumericalNote(c)) {return true;} } - if (scalar_type == kFloat || !allow_tf32) { - if (cudnn_frontend::hasNumericalNote(c)) return true; - if (cudnn_frontend::hasNumericalNote(c)) return true; + if (cudnn_frontend::hasNumericalNote(c)) {return true;} + if (scalar_type == kFloat) { + // TODO: check under which conditions this is OK + if (!allow_tf32 && cudnn_frontend::hasNumericalNote(c)) {return true;} } return false; }; @@ -76,99 +102,545 @@ void filterEngineConfigs( struct CacheKey { ConvolutionParams params; - uint8_t input_alignment; - uint8_t weight_alignment; - uint8_t output_alignment; + cudnnBackendDescriptorType_t operation; + uint8_t x_alignment; + uint8_t w_alignment; + uint8_t y_alignment; }; -// FIXME: make this thread-safe by reusing the benchmark cache in Conv_v7.cpp -std::unordered_map, ParamsEqual> engine_cache; +struct CacheKeyFused { + ConvolutionParams params; + // No op here because it is assumed to be a forward conv op + uint8_t x_alignment; + uint8_t w_alignment; + uint8_t y_alignment; + uint8_t z_alignment; + uint8_t b_alignment; + // TODO: does it make sense to have this in the key? but alpha is a graph-level param... + float alpha; +}; -} +template +struct BenchmarkCache { +std::mutex mutex; +std::unordered_map, ParamsEqual> engine_cache; -void raw_cudnn_convolution_forward_out( - const Tensor& output, const Tensor& input, const Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) -{ - TORCH_CHECK(!benchmark, "not supported yet"); - if (output.numel() == 0) { - return; +// TODO: is this thread safe if cache is updated? is pointer stale? 
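// Illustrative sketch (not part of the patch) of the guarded-cache pattern this
// struct implements; PlanCache and its members are hypothetical names. Every
// lookup and insertion takes the mutex, and the pointer returned by find() stays
// usable as long as entries are only ever inserted, since std::unordered_map
// insertions do not invalidate references to existing elements.
//
//   #include <mutex>
//   #include <string>
//   #include <unordered_map>
//
//   struct PlanCache {
//     std::mutex mutex;
//     std::unordered_map<std::string, int> cache;
//
//     int* find(const std::string& key) {
//       std::lock_guard<std::mutex> guard(mutex);
//       auto it = cache.find(key);
//       return it == cache.end() ? nullptr : &it->second;
//     }
//
//     void emplace(const std::string& key, int value) {
//       std::lock_guard<std::mutex> guard(mutex);
//       cache.emplace(key, value);
//     }
//   };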
+cudnn_frontend::ExecutionPlan* find(const KeyType& key) { + std::lock_guard guard(mutex); + auto it = engine_cache.find(key); + if (it == engine_cache.end()) { + return nullptr; } + // TODO: probably want ExecutionPlan copy constructor or better way to return + return &(it->second); +} - cudnnHandle_t handle = getCudnnHandle(); +void emplace(const KeyType& key, T& results) { + std::lock_guard guard(mutex); + engine_cache.emplace(key, std::move(results)); +} - CacheKey key; - setConvolutionParams(&key.params, input, weight, padding, stride, dilation, groups, deterministic, allow_tf32); - key.input_alignment = getAlignment(input); - key.output_alignment = getAlignment(output); - key.weight_alignment = getAlignment(weight); - - auto run = [&](cudnn_frontend::ManagedOpaqueDescriptor cfg) { - auto plan = cudnn_frontend::ExecutionPlanBuilder() - .setHandle(handle) - .setEngineConfig(cfg) - .build(); - - auto workspace_size = plan.getWorkspaceSize(); - auto workspace = at::empty({workspace_size}, input.options().dtype(kByte)); - void *data_ptrs[] = {input.data_ptr(), output.data_ptr(), weight.data_ptr()}; - // std::cout << plan.describe() << " requires workspace " << workspace_size << std::endl; - int64_t uids[] = {'x', 'y', 'w'}; - auto variantPack = cudnn_frontend::VariantPackBuilder() - .setWorkspacePointer(workspace.data_ptr()) - .setDataPointers(3, data_ptrs) - .setUids(3, uids) - .build(); - AT_CUDNN_CHECK(cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc())); - }; +}; - auto search = engine_cache.find(key); - if (search != engine_cache.end()) { - run(search->second); - return; - } +BenchmarkCache benchmark_cache; +BenchmarkCache benchmark_cache_fused; + +} // namespace + +// NB: This (and the fused version) can't be a constructor, because then CacheKey +// would not be a POD anymore. 
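// Illustrative sketch (not part of the patch) of why setCacheKey below zeroes the
// whole key before filling its fields: if keys are hashed and compared as raw
// bytes (as a ParamsEqual-style comparator would do), uninitialized padding bytes
// could make two logically identical keys compare or hash differently. DemoKey
// and make_key are hypothetical names.
//
//   #include <cstdint>
//   #include <cstring>
//
//   struct DemoKey {
//     char tag;            // 1 byte, followed by padding
//     std::int64_t size;   // the padding in between is part of the byte image
//   };
//
//   void make_key(DemoKey& key, char tag, std::int64_t size) {
//     std::memset(&key, 0, sizeof(key));  // make padding bytes deterministic
//     key.tag = tag;
//     key.size = size;
//   }
//
//   bool same_bytes(const DemoKey& a, const DemoKey& b) {
//     return std::memcmp(&a, &b, sizeof(DemoKey)) == 0;  // byte-wise equality
//   }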
+void setCacheKey(CacheKey& key, const cudnnBackendDescriptorType_t operation, const Tensor& y, const Tensor& x, const Tensor& w, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, int64_t groups, bool deterministic, bool allow_tf32) { + memset(&key, 0, sizeof(key)); + setConvolutionParams(&key.params, x, w, padding, stride, dilation, groups, deterministic, allow_tf32); + key.operation = operation; + key.x_alignment = getAlignment(x); + key.y_alignment = getAlignment(y); + key.w_alignment = getAlignment(w); +} + +void setCacheKeyFused(CacheKeyFused& key, const Tensor& y, const Tensor& x, const Tensor& w, const Tensor& z, const Tensor& b, const float alpha, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, int64_t groups, bool deterministic, bool allow_tf32) { + memset(&key, 0, sizeof(key)); + setConvolutionParams(&key.params, x, w, padding, stride, dilation, groups, deterministic, allow_tf32); + key.x_alignment = getAlignment(x); + key.y_alignment = getAlignment(y); + key.w_alignment = getAlignment(w); + key.z_alignment = getAlignment(z); + key.b_alignment = getAlignment(b); + key.alpha = alpha; +} - auto op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR) - .setxDesc(getTensorDescriptor(input, 'x', key.input_alignment)) - .setyDesc(getTensorDescriptor(output, 'y', key.output_alignment)) - .setwDesc(getTensorDescriptor(weight, 'w', key.weight_alignment)) - .setcDesc(getConvDescriptor(key.params.dataType, padding, stride, dilation)) +void run_conv_plan(cudnnHandle_t handle, const Tensor& x, const Tensor& y, const Tensor& w, const cudnn_frontend::ExecutionPlan& plan) { + c10::DeviceGuard g(x.options().device()); + auto workspace_size = plan.getWorkspaceSize(); + auto workspace_ptr = c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size); + void *data_ptrs[] = {x.data_ptr(), y.data_ptr(), w.data_ptr()}; + int64_t uids[] = {'x', 'y', 'w'}; + auto variantPack = cudnn_frontend::VariantPackBuilder() + .setWorkspacePointer(workspace_size ? workspace_ptr.get() : nullptr) + .setDataPointers(3, data_ptrs) + .setUids(3, uids) .build(); - // std::cout << op.describe() << std::endl; + AT_CUDNN_CHECK(cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc())); +} - std::array ops = {&op}; +void run_conv_plan_fused(cudnnHandle_t handle, const Tensor& x, const Tensor& y, const Tensor& w, const Tensor& z, const Tensor& b, const cudnn_frontend::ExecutionPlan& plan) { + c10::DeviceGuard g(x.options().device()); + auto workspace_size = plan.getWorkspaceSize(); + auto workspace_ptr = c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size); + void *data_ptrs[] = {x.data_ptr(), y.data_ptr(), w.data_ptr(), z.data_ptr(), b.data_ptr()}; + int64_t uids[] = {'x', 'y', 'w', 'z', 'b'}; + auto variantPack = cudnn_frontend::VariantPackBuilder() + .setWorkspacePointer(workspace_size ? 
workspace_ptr.get() : nullptr) + .setDataPointers(5, data_ptrs) + .setUids(5, uids) + .build(); + AT_CUDNN_CHECK(cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc())); +} +auto build_opgraph(const cudnnHandle_t handle, const cudnnBackendDescriptorType_t desc, const Tensor& x, const Tensor& y, const Tensor& w, const CacheKey& key, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation) { + auto op = cudnn_frontend::OperationBuilder(desc) + .setxDesc(getTensorDescriptor(x, 'x', key.x_alignment)) + .setyDesc(getTensorDescriptor(y, 'y', key.y_alignment)) + .setwDesc(getTensorDescriptor(w, 'w', key.w_alignment)) + .setcDesc(getConvDescriptor(key.params.dataType, padding, stride, dilation, x.scalar_type())) + .build(); + std::array ops = {&op}; auto opGraph = cudnn_frontend::OperationGraphBuilder() .setHandle(handle) - .setOperationGraph(1, ops.data()) + .setOperationGraph(ops.size(), ops.data()) + .build(); + return opGraph; +} + +auto build_opgraph_fused(const cudnnHandle_t handle, const Tensor & x, const Tensor& y, const Tensor& w, const Tensor& z, const Tensor& b, const float alpha, const CacheKeyFused& key, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation) { + // need computation to be done in FLOAT type regardless of reduced precision input + const auto precision = CUDNN_DATA_FLOAT; + auto addDesc = cudnn_frontend::PointWiseDescBuilder() + .setMode(CUDNN_POINTWISE_ADD) + .setMathPrecision(precision) + .build(); + auto addBiasDesc = cudnn_frontend::PointWiseDescBuilder() + .setMode(CUDNN_POINTWISE_ADD) + .setMathPrecision(precision) + .build(); + auto actDesc = cudnn_frontend::PointWiseDescBuilder() + .setMode(CUDNN_POINTWISE_RELU_FWD) + .setMathPrecision(precision) + .build(); + auto convDesc = getConvDescriptor(key.params.dataType, padding, stride, dilation, x.scalar_type()); + const float alpha1 = 1.0; + const float alpha2 = alpha; + auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR) + .setxDesc(getTensorDescriptor(x, 'x', key.x_alignment)) + // virtual output of conv + .setyDesc(getTensorDescriptorWithTypeVirtual(y, 'C', key.y_alignment, precision, true)) + .setwDesc(getTensorDescriptor(w, 'w', key.w_alignment)) + .setAlpha(alpha1) + .setcDesc(convDesc) + .build(); + auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(conv_op.getOutputTensor()) + .setbDesc(getTensorDescriptor(z, 'z', key.z_alignment)) + // another virtual output (of add) + .setyDesc(getTensorDescriptorWithTypeVirtual(y, 'A', key.y_alignment, precision, true)) + .setpwDesc(addDesc) + .setAlpha(alpha1) + .setAlpha2(alpha2) + .build(); + auto add_bias_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(add_op.getOutputTensor()) + .setbDesc(getTensorDescriptor(b, 'b', key.b_alignment)) + // another virtual output (of add bias) + .setyDesc(getTensorDescriptorWithTypeVirtual(y, 'B', key.y_alignment, precision, true)) + .setpwDesc(addBiasDesc) + .build(); + auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(add_bias_op.getOutputTensor()) + // final output is in original datatype + .setyDesc(getTensorDescriptor(y, 'y', key.y_alignment)) + .setpwDesc(actDesc) + .build(); + std::array ops = {&conv_op, &add_op, &add_bias_op, &act_op}; + auto opGraph = cudnn_frontend::OperationGraphBuilder() + .setHandle(handle) + .setOperationGraph(ops.size(), 
ops.data()) + .build(); + return opGraph; +} + +auto get_generator_sources(const cudnnBackendDescriptorType_t& desc, const Tensor& x, const bool deterministic, const bool allow_tf32, const cudnnBackendHeurMode_t heur_mode) { + // Method for engine config generator based on heuristics + auto heurgen_method = [/*&desc,*/ &x, deterministic, allow_tf32, heur_mode](cudnn_frontend::OperationGraph &opGraph) -> cudnn_frontend::EngineConfigList { + auto heuristics = cudnn_frontend::EngineHeuristicsBuilder() + .setOperationGraph(opGraph) + .setHeurMode(heur_mode) + .build(); + auto &engine_configs = heuristics.getEngineConfig(heuristics.getEngineConfigCount()); + cudnn_frontend::EngineConfigList filtered_configs; + filterEngineConfigs(engine_configs, filtered_configs, deterministic, allow_tf32, x.scalar_type()); + return filtered_configs; + }; + // Method for engine config generator based on fallback list + auto fallback_method = [&desc, &x, deterministic, allow_tf32](cudnn_frontend::OperationGraph &opGraph) -> cudnn_frontend::EngineConfigList { + auto fallback = cudnn_frontend::EngineFallbackListBuilder() + .setOperationGraph(opGraph) + .setOperation(desc) + .build(); + auto &fallback_list = fallback.getFallbackList(); + cudnn_frontend::EngineConfigList filtered_configs; + filterEngineConfigs(fallback_list, filtered_configs, deterministic, allow_tf32, x.scalar_type()); + return filtered_configs; + }; + std::array sources = {heurgen_method, fallback_method}; + return sources; +} + +size_t get_available_workspace() { + int device; + C10_CUDA_CHECK(cudaGetDevice(&device)); + size_t max_block_size = 0; + size_t tmp_bytes = 0; // Only used for filling pointer parameters that aren't used later + c10::cuda::CUDACachingAllocator::cacheInfo(device, &tmp_bytes, &max_block_size); + return max_block_size; +} + +void generate_and_filter_plans(const cudnnHandle_t handle, cudnn_frontend::OperationGraph& opGraph, cudnn_frontend::EngineConfigGenerator& generator, const Tensor& x, cudnn_frontend::executionPlans_t& valid_plans, at::DataPtr& workspace_ptr, unsigned int max_plans = 0) { + auto initial_predicate_function = [&](cudnn_frontend::ExecutionPlan const& plan) -> bool { + return false; + }; + auto plans = generator.cudnnGetPlan(handle, opGraph, initial_predicate_function); + size_t max_block_size = get_available_workspace(); + size_t max_workspace_size = 0u; + std::for_each(plans.begin(), plans.end(), [&] (cudnn_frontend::ExecutionPlan& plan) { + size_t curr_workspace_size = plan.getWorkspaceSize(); + if (curr_workspace_size <= max_block_size) { + if (curr_workspace_size > max_workspace_size) { + max_workspace_size = plan.getWorkspaceSize(); + } + valid_plans.emplace_back(std::move(plan)); + } + }); + TORCH_CHECK_WITH(CUDAOutOfMemoryError, max_workspace_size < 1_TiB, "Not enough memory for workspace!"); + bool remove_invalid = false; + while (max_workspace_size) { + try { + workspace_ptr = c10::cuda::CUDACachingAllocator::get()->allocate(max_workspace_size); + break; + } catch (c10::CUDAOutOfMemoryError &e) { + max_workspace_size /= 2; + cudaGetLastError(); // clear CUDA error + remove_invalid = true; + } + } + if (remove_invalid) { + cudnn_frontend::executionPlans_t new_valid_plans; + unsigned int plan_count = 0; + for (auto &plan : valid_plans) { + if (plan.getWorkspaceSize() <= max_workspace_size) { + new_valid_plans.emplace_back(std::move(plan)); + plan_count++; + } + if (max_plans && plan_count >= max_plans) { + break; + } + } + valid_plans = std::move(new_valid_plans); + } +} + +auto 
get_plans_from_find(const cudnnHandle_t handle, const cudnnBackendDescriptorType_t desc, const Tensor& x, const Tensor& y, const Tensor& w, const CacheKey& key, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, const bool deterministic, const bool allow_tf32) { + auto opGraph = build_opgraph(handle, desc, x, y, w, key, padding, stride, dilation); + void *data_ptrs[] = {x.data_ptr(), y.data_ptr(), w.data_ptr()}; + int64_t uids[] = {'x', 'y', 'w'}; + // We don't care about getting the best ordering of algos if we're roing to run all of them + auto sources = get_generator_sources(desc, x, deterministic, allow_tf32, CUDNN_HEUR_MODE_INSTANT); + cudnn_frontend::EngineConfigGenerator generator(sources.size(), sources.data()); + cudnn_frontend::executionPlans_t valid_plans; + c10::DeviceGuard g(x.options().device()); + at::DataPtr workspace_ptr; + generate_and_filter_plans(handle, opGraph, generator, x, valid_plans, workspace_ptr); + auto variantPack = cudnn_frontend::VariantPackBuilder() + .setDataPointers(3, data_ptrs) + .setUids(3, uids) + .setWorkspacePointer(workspace_ptr ? workspace_ptr.get() : nullptr) .build(); - // std::cout << opGraph.describe() << std::endl; - auto heuristics = cudnn_frontend::EngineHeuristicsBuilder() - .setOperationGraph(opGraph) - .setHeurMode(CUDNN_HEUR_MODE_INSTANT) + auto plans = cudnn_frontend::time_sorted_plan(handle, std::move(valid_plans), variantPack); + + cudnn_frontend::executionPlans_t sorted_plans; + for (auto& plan : plans) { + sorted_plans.emplace_back(std::move(plan)); + } + return sorted_plans; +} + +auto get_plans_from_find_fused(const cudnnHandle_t handle, + const Tensor& x, const Tensor& y, const Tensor& w, const Tensor& z, const Tensor& b, + const float alpha, const CacheKeyFused& key, + const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, + const bool deterministic, const bool allow_tf32) { + auto opGraph = build_opgraph_fused(handle, x, y, w, z, b, alpha, key, padding, stride, dilation); + void *data_ptrs[] = {x.data_ptr(), y.data_ptr(), w.data_ptr(), z.data_ptr(), b.data_ptr()}; + int64_t uids[] = {'x', 'y', 'w', 'z', 'b'}; + + auto sources = get_generator_sources(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, x, deterministic, allow_tf32, CUDNN_HEUR_MODE_INSTANT); + cudnn_frontend::EngineConfigGenerator generator(sources.size(), sources.data()); + cudnn_frontend::executionPlans_t valid_plans; + c10::DeviceGuard g(x.options().device()); + at::DataPtr workspace_ptr; + generate_and_filter_plans(handle, opGraph, generator, x, valid_plans, workspace_ptr); + auto variantPack = cudnn_frontend::VariantPackBuilder() + .setDataPointers(5, data_ptrs) + .setUids(5, uids) + .setWorkspacePointer(workspace_ptr ? 
workspace_ptr.get() : nullptr) .build(); - auto fallback = cudnn_frontend::EngineFallbackListBuilder() - .setOperationGraph(opGraph) - .setOperation(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR) - .build(); - auto& engine_configs = heuristics.getEngineConfig(heuristics.getEngineConfigCount()); - auto& fallback_list = fallback.getFallbackList(); + auto plans = cudnn_frontend::time_sorted_plan(handle, std::move(valid_plans), variantPack); + + cudnn_frontend::executionPlans_t sorted_plans; + for (auto& plan : plans) { + sorted_plans.emplace_back(std::move(plan)); + } + return sorted_plans; +} + + +// We only get configs from this stage to avoid building unnecessary plans that are never executed +auto get_configs_from_heuristics(const cudnnHandle_t handle, const cudnnBackendDescriptorType_t desc, const Tensor& x, const Tensor& y, const Tensor& w, const CacheKey& key, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, const bool deterministic, const bool allow_tf32) { + auto opGraph = build_opgraph(handle, desc, x, y, w, key, padding, stride, dilation); + auto heuristic_mode = at::native::cudnnv8_use_heur_mode_b() ? CUDNN_HEUR_MODE_B : CUDNN_HEUR_MODE_INSTANT; + auto sources = get_generator_sources(desc, x, deterministic, allow_tf32, heuristic_mode); + + cudnn_frontend::EngineConfigGenerator generator(sources.size(), sources.data()); + auto configs = generator.generate_engine_config(opGraph); + return configs; +} + +auto get_configs_from_heuristics_fused(const cudnnHandle_t handle, const Tensor& x, const Tensor& y, const Tensor& w, const Tensor& z, const Tensor& b, const float alpha, const CacheKeyFused& key, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, const bool deterministic, const bool allow_tf32) { + auto opGraph = build_opgraph_fused(handle, x, y, w, z, b, alpha, key, padding, stride, dilation); + auto heuristic_mode = at::native::cudnnv8_use_heur_mode_b() ? 
CUDNN_HEUR_MODE_B : CUDNN_HEUR_MODE_INSTANT; + auto sources = get_generator_sources(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, x, deterministic, allow_tf32, heuristic_mode); + + cudnn_frontend::EngineConfigGenerator generator(sources.size(), sources.data()); + auto configs = generator.generate_engine_config(opGraph); + return configs; +} + +void try_plans(cudnn_frontend::executionPlans_t& plans, const CacheKey& key, const cudnnHandle_t handle, const Tensor& x, const Tensor& y, const Tensor& w) { + for (auto & plan : plans) { + try { + run_conv_plan(handle, x, y, w, plan); + benchmark_cache.emplace(key, plan); + return; + } catch (cudnn_frontend::cudnnException &e) {} catch (CuDNNError &e) {} + catch (c10::CUDAOutOfMemoryError &e) { + cudaGetLastError(); // clear CUDA error + } + } + TORCH_CHECK(false, "FIND was unable to find an engine to execute this computation"); +} + +void try_plans_fused(cudnn_frontend::executionPlans_t& plans, const CacheKeyFused& key, const cudnnHandle_t handle, const Tensor& x, const Tensor& y, const Tensor& w, const Tensor& z, const Tensor& b) { + for (auto & plan : plans) { + try { + run_conv_plan_fused(handle, x, y, w, z, b, plan); + benchmark_cache_fused.emplace(key, plan); + return; + } catch (cudnn_frontend::cudnnException &e) {} catch (CuDNNError &e) {} + catch (c10::CUDAOutOfMemoryError &e) { + cudaGetLastError(); // clear CUDA error + } + } + TORCH_CHECK(false, "FIND was unable to find an engine to execute this computation"); +} - cudnn_frontend::EngineConfigList filtered_configs; - filterEngineConfigs(engine_configs, filtered_configs, deterministic, allow_tf32, input.scalar_type()); - filterEngineConfigs(fallback_list, filtered_configs, deterministic, allow_tf32, input.scalar_type()); +void try_configs(cudnn_frontend::EngineConfigList& configs, const CacheKey& key, const cudnnHandle_t handle, const Tensor& x, const Tensor& y, const Tensor& w) { + for (auto & config : configs) { + try { + auto plan = cudnn_frontend::ExecutionPlanBuilder() + .setHandle(handle) + .setEngineConfig(config) + .build(); + run_conv_plan(handle, x, y, w, plan); + benchmark_cache.emplace(key, plan); + return; + } catch (cudnn_frontend::cudnnException &e) {} catch(CuDNNError &e) {} + catch (c10::CUDAOutOfMemoryError &e) { + cudaGetLastError(); // clear CUDA error + } + } + TORCH_CHECK(false, "GET was unable to find an engine to execute this computation"); +} - for (auto &cfg : filtered_configs) { +void try_configs_fused(cudnn_frontend::EngineConfigList& configs, const CacheKeyFused& key, const cudnnHandle_t handle, const Tensor& x, const Tensor& y, const Tensor& w, const Tensor& z, const Tensor& b) { + for (auto & config : configs) { try { - run(cfg); - engine_cache[key] = cfg; + auto plan = cudnn_frontend::ExecutionPlanBuilder() + .setHandle(handle) + .setEngineConfig(config) + .build(); + run_conv_plan_fused(handle, x, y, w, z, b, plan); + benchmark_cache_fused.emplace(key, plan); return; } catch (cudnn_frontend::cudnnException &e) {} catch(CuDNNError &e) {} + catch (c10::CUDAOutOfMemoryError &e) { + cudaGetLastError(); // clear CUDA error + } + } + TORCH_CHECK(false, "GET was unable to find an engine to execute this computation"); +} + +void run_single_conv(const cudnnBackendDescriptorType_t operation, + const Tensor& x, const Tensor& y, const Tensor& w, + const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, const int64_t groups, + const bool benchmark, const bool deterministic, const bool allow_tf32) { + cudnnHandle_t handle = 
getCudnnHandle(); + + CacheKey key; + setCacheKey(key, operation, y, x, w, padding, stride, dilation, groups, deterministic, allow_tf32); + // TODO: is this thread safe if cache is updated? is pointer stale? + auto search = benchmark_cache.find(key); + if (search) { + try { + run_conv_plan(handle, x, y, w, *search); + return; + } catch(c10::CUDAOutOfMemoryError &e) { + cudaGetLastError(); // clear CUDA error + } + } + + if (!benchmark) { + cudnn_frontend::EngineConfigList configs = get_configs_from_heuristics(handle, operation, + x, y, w, key, + padding, stride, dilation, + deterministic, allow_tf32); + try_configs(configs, key, handle, x, y, w); + } else { + cudnn_frontend::executionPlans_t plans = get_plans_from_find(handle, operation, + x, y, w, key, + padding, stride, dilation, + deterministic, allow_tf32); + // Replicate v7 behavior: clear cached blocks as benchmark incurs + // significant memory consumptiont that is not needed after this step + c10::cuda::CUDACachingAllocator::emptyCache(); + try_plans(plans, key, handle, x, y, w); + } +} + +void run_fused_conv(const Tensor& x, const Tensor& y, const Tensor& w, const Tensor& z, const Tensor& b, + float alpha, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, + int64_t groups, const bool benchmark, const bool deterministic, const bool allow_tf32) { + cudnnHandle_t handle = getCudnnHandle(); + + CacheKeyFused key; + setCacheKeyFused(key, y, x, w, z, b, alpha, padding, stride, dilation, groups, deterministic, allow_tf32); + auto search = benchmark_cache_fused.find(key); + if (search) { + try { + run_conv_plan_fused(handle, x, y, w, z, b, *search); + return; + } catch(c10::CUDAOutOfMemoryError &e) { + cudaGetLastError(); // clear CUDA error + } + } + + if (!benchmark) { + cudnn_frontend::EngineConfigList configs = get_configs_from_heuristics_fused(handle, + x, y, w, z, b, alpha, key, + padding, stride, dilation, + deterministic, allow_tf32); + try_configs_fused(configs, key, handle, x, y, w, z, b); + } else { + cudnn_frontend::executionPlans_t plans = get_plans_from_find_fused(handle, + x, y, w, z, b, alpha, key, + padding, stride, dilation, + deterministic, allow_tf32); + try_plans_fused(plans, key, handle, x, y, w, z, b); + } +} + +void raw_cudnn_convolution_forward_out( + const Tensor& output, const Tensor& input, const Tensor& weight, + const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, const int64_t groups, + const bool benchmark, const bool deterministic, const bool allow_tf32) +{ + if (output.numel() == 0) { return; } + if (at::native::cudnnv8_enabled_check_debug()) { + run_single_conv(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, + input, output, weight, padding, stride, dilation, groups, + benchmark, deterministic, allow_tf32); + } else { + raw_cudnn_convolution_forward_out_v7( + output, input, weight, + padding, stride, dilation, groups, + benchmark, deterministic, allow_tf32); + } +} + +void raw_cudnn_convolution_backward_input_out( + const at::Tensor& grad_input, + const at::Tensor& grad_output, + const at::Tensor& weight, + const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, const int64_t groups, + const bool benchmark, const bool deterministic, const bool allow_tf32) { + if (grad_input.numel() == 0) { return; } + if (at::native::cudnnv8_enabled_check_debug()) { + run_single_conv(CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR, + grad_input, grad_output, weight, padding, stride, dilation, groups, + benchmark, deterministic, 
allow_tf32); + } else { + raw_cudnn_convolution_backward_input_out_v7( + grad_input, + grad_output, + weight, + padding, stride, dilation, groups, + benchmark, deterministic, allow_tf32); + } +} + +void raw_cudnn_convolution_backward_weight_out( + const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, + const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, const int64_t groups, + const bool benchmark, const bool deterministic, const bool allow_tf32) { + if (grad_weight.numel() == 0) { return; } + if (at::native::cudnnv8_enabled_check_debug()) { + run_single_conv(CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR, + input, grad_output, grad_weight, padding, stride, dilation, groups, + benchmark, deterministic, allow_tf32); + } else { + raw_cudnn_convolution_backward_weight_out_v7( + grad_weight, grad_output, input, + padding, stride, dilation, groups, + benchmark, deterministic, allow_tf32); + } +} + +void raw_cudnn_convolution_add_relu_out( + const Tensor& output, + const Tensor& input, + const Tensor& weight, + const Tensor& z, + float alpha, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { + if (output.numel() == 0) { return; } + if (at::native::cudnnv8_enabled_check_debug()) { + auto bias_ = bias.view({1, bias.numel(), 1, 1}); + run_fused_conv(input, output, weight, z, bias_, + alpha, stride, padding, dilation, + groups, benchmark, deterministic, allow_tf32); + } else { + raw_cudnn_convolution_add_relu_out_v7(output, input, weight, z, + alpha, bias, stride, padding, dilation, + groups, benchmark, deterministic, allow_tf32); } - TORCH_CHECK(false, "Unable to find an engine to execute this computation"); } }} // at::native diff --git a/aten/src/ATen/native/cudnn/GridSampler.cpp b/aten/src/ATen/native/cudnn/GridSampler.cpp index 38bde06aa6cc..b22d25cbff97 100644 --- a/aten/src/ATen/native/cudnn/GridSampler.cpp +++ b/aten/src/ATen/native/cudnn/GridSampler.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #if !AT_CUDNN_ENABLED() @@ -67,6 +68,13 @@ void checkGridSize(CheckedFrom c, TensorArg grid, TensorArg input) Tensor cudnn_grid_sampler_forward( const Tensor& input_t, const Tensor& grid_t) { + // See NOTE [ grid_sampler Native Functions ]. + // Add checks here in case this is called instead of grid_sampler. + check_grid_sampler_common(input_t, grid_t); + TORCH_CHECK( + cond_cudnn_grid_sampler(input_t, grid_t), + "Invalid arguments to cudnn_grid_sampler_forward"); + auto input_contig = contiguousIfZeroInStrides(input_t); auto grid_contig = grid_t.contiguous(); TensorArg input{ input_contig, "input", 1 }, @@ -106,6 +114,13 @@ std::tuple cudnn_grid_sampler_backward( const Tensor& input_t, const Tensor& grid_t, const Tensor& grad_output_t) { + // See NOTE [ grid_sampler Native Functions ]. + // Add checks here in case this is called instead of grid_sampler. 
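  // Illustrative sketch (not part of the patch): the checks added here follow a
  // validate-before-dispatch pattern, where a backend entry point that can be
  // called directly re-runs the same argument validation as the generic
  // dispatcher. A minimal version with hypothetical names:
  //
  //   #include <stdexcept>
  //
  //   bool backend_supports(int dim) { return dim == 4; }  // stand-in predicate
  //
  //   void backend_grid_sampler(int dim) {
  //     if (!backend_supports(dim)) {
  //       throw std::invalid_argument("Invalid arguments to backend_grid_sampler");
  //     }
  //     // ... safe to launch the backend kernel here ...
  //   }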
+ check_grid_sampler_common(input_t, grid_t); + TORCH_CHECK( + cond_cudnn_grid_sampler(input_t, grid_t), + "Invalid arguments to cudnn_grid_sampler_backward"); + auto input_contig = contiguousIfZeroInStrides(input_t); auto grid_contig = grid_t.contiguous(); auto grad_output_contig = contiguousIfZeroInStrides(grad_output_t); diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index a80fc4fe0335..29430b38e74e 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -753,19 +753,61 @@ namespace { } } - cudnnRNNAlgo_t get_algo(const RNNDescriptorParams& rnn, const TensorDescriptorListParams& tensors, const Tensor input) { + inline bool use_rnn_persist_small_h(const RNNDescriptorParams& rnn, + const TensorDescriptorListParams& tensors, + bool forward) { +#if CUDNN_VERSION >= 8201 // 8.2.1 + cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); + if (prop->major < 6) return false; + + if (forward) { + if (rnn.mode == CUDNN_RNN_RELU || rnn.mode == CUDNN_RNN_TANH) { + return rnn.hidden_size <= 384; + } + if (rnn.mode == CUDNN_LSTM || rnn.mode == CUDNN_GRU) { + return rnn.hidden_size <= 192; + } + } else /* backward */ { + if (rnn.mode == CUDNN_RNN_RELU || rnn.mode == CUDNN_RNN_TANH) { + return rnn.hidden_size <= 256; + } + if (rnn.mode == CUDNN_LSTM || rnn.mode == CUDNN_GRU) { + return rnn.hidden_size <= 128; + } + } + + return false; +#else + return false; +#endif + } + + cudnnRNNAlgo_t get_algo(const RNNDescriptorParams& rnn, const TensorDescriptorListParams& tensors, const Tensor input, bool forward) { // LSTM with projections only works with standard algorithm if (rnn.proj_size != 0) { return CUDNN_RNN_ALGO_STANDARD; } - if (getCudnnDataType(input) == CUDNN_DATA_HALF && - !tensors.is_input_packed()) { - if (use_persist_common_heuristics(rnn, tensors) && - use_persist_device_heuristics(rnn, tensors)) { - return CUDNN_RNN_ALGO_PERSIST_STATIC; + // Persistent algos typically don't work for packed inputs with sequence lengths that vary + // across batch elements, and will return CUDNN_STATUS_NOT_SUPPORTED if attempted. 
See + // https://docs.nvidia.com/deeplearning/cudnn/developer-guide/index.html#features-of-rnn-functions + if (!tensors.is_input_packed()) { + auto cudnnDataType = getCudnnDataType(input); +#if CUDNN_VERSION >= 8201 // 8.2.1 + if (cudnnDataType != CUDNN_DATA_DOUBLE) { + if (use_rnn_persist_small_h(rnn, tensors, forward)) { + return CUDNN_RNN_ALGO_PERSIST_STATIC_SMALL_H; + } + } +#endif + if (cudnnDataType == CUDNN_DATA_HALF) { + if (use_persist_common_heuristics(rnn, tensors) && + use_persist_device_heuristics(rnn, tensors)) { + return CUDNN_RNN_ALGO_PERSIST_STATIC; + } } } + return CUDNN_RNN_ALGO_STANDARD; } @@ -970,7 +1012,7 @@ std::tuple _cudnn_rnn( auto y = output; auto handle = getCudnnHandle(); - cudnnRNNAlgo_t algo = get_algo(fn.rnn, fn.tensors, input); + cudnnRNNAlgo_t algo = get_algo(fn.rnn, fn.tensors, input, true); fn.rnn.set_algo(algo); RNNDescriptors descs(fn, handle, x, y, hx, cx); @@ -1131,7 +1173,7 @@ std::tuple _cudnn_rnn_backward_input( TORCH_CHECK(dhy.is_cuda() && dy.is_cuda() && (!dcy.defined() || dcy.is_cuda()), "Gradients aren't CUDA tensors"); - cudnnRNNAlgo_t algo = get_algo(fn.rnn, fn.tensors, input); + cudnnRNNAlgo_t algo = get_algo(fn.rnn, fn.tensors, input, false); fn.rnn.set_algo(algo); RNNDescriptors descs(fn, handle, x, y, hx, cx); @@ -1234,7 +1276,7 @@ std::vector _cudnn_rnn_backward_weight( const auto& y = output; auto dw = at::zeros(weight_buf.sizes(), weight_buf.options()); - cudnnRNNAlgo_t algo = get_algo(fn.rnn, fn.tensors, input); + cudnnRNNAlgo_t algo = get_algo(fn.rnn, fn.tensors, input, false); fn.rnn.set_algo(algo); RNNDescriptors descs(fn, handle, x, y, hx, cx); diff --git a/aten/src/ATen/native/group_norm.cpp b/aten/src/ATen/native/group_norm.cpp index 5533780a4547..db1d82f84fef 100644 --- a/aten/src/ATen/native/group_norm.cpp +++ b/aten/src/ATen/native/group_norm.cpp @@ -16,6 +16,39 @@ namespace at { namespace native { +void check_group_norm_inputs( + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + int64_t C, + int64_t num_groups) { + TORCH_CHECK( + num_groups > 0, + "Expected num groups to be greater than 0, got ", num_groups); + TORCH_CHECK( + C % num_groups == 0, + "Expected number of channels in input to be divisible by ", + "num_groups, but got input of shape ", + input.sizes(), + " and " + "num_groups=", + num_groups); + TORCH_CHECK( + !weight.defined() || (weight.dim() == 1 && weight.numel() == C), + "Expected weight to be a vector of size equal to the number of ", + "channels in input, but got weight of shape ", + weight.sizes(), + " and input of shape ", + input.sizes()); + TORCH_CHECK( + !bias.defined() || (bias.dim() == 1 && bias.numel() == C), + "Expected bias to be a vector of size equal to the number of ", + "channels in input, but got bias of shape ", + weight.sizes(), + " and input of shape ", + input.sizes()); +} + std::tuple native_group_norm( const Tensor& X, const c10::optional& gamma_opt /* optional */, @@ -31,6 +64,9 @@ std::tuple native_group_norm( const Tensor& gamma = *gamma_maybe_owned; const Tensor& beta = c10::value_or_else(beta_opt, [] { return Tensor(); }); + // repeated check so expanded weights can call native_group_norm directly but + // save mean and variance from forward + check_group_norm_inputs(X, gamma, beta, C, group); auto memory_format = X.device().is_cpu() ? 
X.suggest_memory_format() : at::MemoryFormat::Contiguous; @@ -128,28 +164,7 @@ Tensor group_norm( const int64_t N = input.size(0); const int64_t C = input.size(1); - TORCH_CHECK( - C % num_groups == 0, - "Expected number of channels in input to be divisible by ", - "num_groups, but got input of shape ", - input.sizes(), - " and " - "num_groups=", - num_groups); - TORCH_CHECK( - !weight.defined() || (weight.dim() == 1 && weight.numel() == C), - "Expected weight to be a vector of size equal to the number of ", - "channels in input, but got weight of shape ", - weight.sizes(), - " and input of shape ", - input.sizes()); - TORCH_CHECK( - !bias.defined() || (bias.dim() == 1 && bias.numel() == C), - "Expected bias to be a vector of size equal to the number of ", - "channels in input, but got bias of shape ", - weight.sizes(), - " and input of shape ", - input.sizes()); + check_group_norm_inputs(input, weight, bias, C, num_groups); const auto input_shape = input.sizes(); const int64_t HxW = diff --git a/aten/src/ATen/native/group_norm.h b/aten/src/ATen/native/group_norm.h index 58fc0867b1ac..1673df9253ee 100644 --- a/aten/src/ATen/native/group_norm.h +++ b/aten/src/ATen/native/group_norm.h @@ -1,9 +1,11 @@ #pragma once -#include #include +#include namespace at { +class Tensor; + namespace native { using forward_fn = void (*)( diff --git a/aten/src/ATen/native/im2col.h b/aten/src/ATen/native/im2col.h index 854052145d54..c3daed3d4ffc 100644 --- a/aten/src/ATen/native/im2col.h +++ b/aten/src/ATen/native/im2col.h @@ -3,6 +3,8 @@ #include #include #include +#include +#include #include #include @@ -26,27 +28,59 @@ static void im2col( const int64_t stride_w, const int64_t dilation_h, const int64_t dilation_w, - T* data_col) { + T* data_col, + bool is_channels_last = false) { const int64_t height_col = output_height; const int64_t width_col = output_width; const int64_t channels_col = channels * kernel_h * kernel_w; - for (const auto c_col : c10::irange(channels_col)) { - int64_t w_offset = c_col % kernel_w; - int64_t h_offset = (c_col / kernel_w) % kernel_h; - int64_t c_im = c_col / kernel_h / kernel_w; + if (is_channels_last) { + at::parallel_for(0, height_col * width_col, 0, [&](int64_t begin, int64_t end) { + int64_t h_col{0}, w_col{0}; + data_index_init(begin, h_col, height_col, w_col, width_col); - for (const auto h_col : c10::irange(height_col)) { - int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h; + for (const auto i_col : c10::irange(begin, end)) { + for (const auto h_offset : c10::irange(kernel_h)) { + int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h; + for (const auto w_offset : c10::irange(kernel_w)) { + int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w; - for (const auto w_col : c10::irange(width_col)) { - int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w; - data_col[(c_col * height_col + h_col) * width_col + w_col] = - (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) - ? 
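The new check_group_norm_inputs above centralizes the group-norm shape constraints: a positive group count, a channel count divisible by the group count, and optional 1-D affine parameters of length C. A tiny standalone illustration of the same constraints with concrete numbers (plain C++, no ATen; names are illustrative):

```cpp
#include <cstdio>
#include <stdexcept>

// The same rules check_group_norm_inputs enforces, reduced to integers.
void check_group_norm_shape(long C, long num_groups, long weight_numel, long bias_numel) {
  if (num_groups <= 0) throw std::invalid_argument("num_groups must be > 0");
  if (C % num_groups != 0) throw std::invalid_argument("C must be divisible by num_groups");
  if (weight_numel != 0 && weight_numel != C) throw std::invalid_argument("weight must have C elements");
  if (bias_numel != 0 && bias_numel != C) throw std::invalid_argument("bias must have C elements");
}

int main() {
  check_group_norm_shape(/*C=*/6, /*num_groups=*/3, 6, 6);      // ok: 2 channels per group
  try {
    check_group_norm_shape(/*C=*/6, /*num_groups=*/4, 6, 6);    // rejected: 6 % 4 != 0
  } catch (const std::exception& e) {
    std::printf("rejected: %s\n", e.what());
  }
}
```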
data_im[(c_im * height + h_im) * width + w_im] - : static_cast<T>(0); + const T* slice_im = data_im + (h_im * width + w_im) * channels; + T* slice_col = data_col + (i_col * kernel_h * kernel_w + h_offset * kernel_w + w_offset) * channels; + + if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + std::copy_n(slice_im, channels, slice_col); + } else { + std::fill_n(slice_col, channels, T(0)); + } + } + } + + // move to the next index + data_index_step(h_col, height_col, w_col, width_col); } - } + }); + } else { + at::parallel_for(0, channels_col, 0, [&](int64_t begin, int64_t end) { + int64_t c_im{0}, h_offset{0}, w_offset{0}; + data_index_init(begin, c_im, channels, h_offset, kernel_h, w_offset, kernel_w); + + for (const auto c_col : c10::irange(begin, end)) { + for (const auto h_col : c10::irange(height_col)) { + int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h; + for (const auto w_col : c10::irange(width_col)) { + int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w; + data_col[(c_col * height_col + h_col) * width_col + w_col] = + (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) + ? data_im[(c_im * height + h_im) * width + w_im] + : static_cast<T>(0); + } + } + + // move to the next index + data_index_step(c_im, channels, h_offset, kernel_h, w_offset, kernel_w); + } + }); } } @@ -66,27 +100,48 @@ static void col2im( const int64_t stride_w, const int64_t dilation_h, const int64_t dilation_w, - T* data_im) { + T* data_im, + bool is_channels_last = false) { std::fill_n(data_im, height * width * channels, T(0)); const int64_t height_col = output_height; const int64_t width_col = output_width; const int64_t channels_col = channels * kernel_h * kernel_w; - for (const auto c_col : c10::irange(channels_col)) { - int64_t w_offset = c_col % kernel_w; - int64_t h_offset = (c_col / kernel_w) % kernel_h; - int64_t c_im = c_col / kernel_h / kernel_w; - + if (is_channels_last) { for (const auto h_col : c10::irange(height_col)) { - int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h; - for (const auto w_col : c10::irange(width_col)) { - int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w; + for (const auto h_offset : c10::irange(kernel_h)) { + int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h; + for (const auto w_offset : c10::irange(kernel_w)) { + int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w; + + T* slice_im = data_im + (h_im * width + w_im) * channels; + const T* slice_col = data_col + ((h_col * width_col + w_col) * kernel_h * kernel_w + + h_offset * kernel_w + w_offset) * channels; + + if (h_im >= 0 && h_im < height && w_im >= 0 && w_im < width) { + std::transform(slice_col, slice_col + channels, slice_im, slice_im, std::plus<T>()); + } + } + } + } + } } else { + for (const auto c_col : c10::irange(channels_col)) { + int64_t w_offset = c_col % kernel_w; + int64_t h_offset = (c_col / kernel_w) % kernel_h; + int64_t c_im = c_col / kernel_h / kernel_w; + + for (const auto h_col : c10::irange(height_col)) { + int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h; + for (const auto w_col : c10::irange(width_col)) { + int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w; - if (h_im >= 0 && h_im < height && w_im >= 0 && w_im < width) - data_im[(c_im * height + h_im) * width + w_im] += - data_col[(c_col * height_col +
h_col) * width_col + w_col]; + } } } } diff --git a/aten/src/ATen/native/im2col_shape_check.h b/aten/src/ATen/native/im2col_shape_check.h index 84de7aa4c4f5..45fc96ea8443 100644 --- a/aten/src/ATen/native/im2col_shape_check.h +++ b/aten/src/ATen/native/im2col_shape_check.h @@ -1,4 +1,5 @@ -#include +#pragma once +#include #include namespace at { diff --git a/aten/src/ATen/native/layer_norm.cpp b/aten/src/ATen/native/layer_norm.cpp index c6b9b6d5c26a..16da001d3a16 100644 --- a/aten/src/ATen/native/layer_norm.cpp +++ b/aten/src/ATen/native/layer_norm.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -18,7 +19,7 @@ namespace at { namespace native { -void layer_norm_cpu_out( +void layer_norm_with_mean_rstd_out( at::Tensor& out, at::Tensor& mean, at::Tensor& rstd, @@ -50,6 +51,20 @@ void layer_norm_cpu_out( rstd = rstd.view(stat_shape); } +void layer_norm_cpu_out( + at::Tensor& out, + const at::Tensor& input, + const Tensor& gamma, + const Tensor& beta, + double eps, + int64_t M, + int64_t N) { + if (M <= 0) { + return; + } + LayerNormKernel(kCPU, input, gamma, beta, M, N, eps, &out, /*mean=*/nullptr, /*rstd=*/nullptr); +} + std::tuple layer_norm_cpu( const Tensor& input, IntArrayRef normalized_shape, const c10::optional& weight_opt /* optional */, const c10::optional& bias_opt /* optional */, @@ -60,6 +75,7 @@ std::tuple layer_norm_cpu( c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; + bool mixed_type = is_mixed_type(input, weight, bias); auto M_N = _check_layer_norm_inputs(input, normalized_shape, weight, bias); auto M = M_N.first; @@ -75,10 +91,11 @@ std::tuple layer_norm_cpu( c10::nullopt /* device */, c10::nullopt /* pin_memory */, at::MemoryFormat::Contiguous); - Tensor mean = at::empty({M}, X->options()); - Tensor rstd = at::empty({M}, X->options()); + const auto dtype = param_scalar_type(input, mixed_type); + Tensor mean = at::empty({M}, X->options().dtype(dtype)); + Tensor rstd = at::empty({M}, X->options().dtype(dtype)); - layer_norm_cpu_out(Y, mean, rstd, *X, normalized_shape, *gamma, *beta, eps, M, N); + layer_norm_with_mean_rstd_out(Y, mean, rstd, *X, normalized_shape, *gamma, *beta, eps, M, N); return std::make_tuple(std::move(Y), std::move(mean), std::move(rstd)); } diff --git a/aten/src/ATen/native/layer_norm.h b/aten/src/ATen/native/layer_norm.h index f4ef0351095a..629bc9ab3906 100644 --- a/aten/src/ATen/native/layer_norm.h +++ b/aten/src/ATen/native/layer_norm.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -65,10 +65,7 @@ C10_ALWAYS_INLINE std::pair _check_layer_norm_inputs( void layer_norm_cpu_out( at::Tensor& out, - at::Tensor& mean, - at::Tensor& rstd, const at::Tensor& input, - IntArrayRef normalized_shape, const Tensor& gamma, const Tensor& beta, double eps, diff --git a/aten/src/ATen/native/metal/MetalContext.h b/aten/src/ATen/native/metal/MetalContext.h index ca58eb9a433a..7954c129dbdb 100644 --- a/aten/src/ATen/native/metal/MetalContext.h +++ b/aten/src/ATen/native/metal/MetalContext.h @@ -3,8 +3,7 @@ #import #include -API_AVAILABLE(ios(10.0), macos(10.13)) -// TODO[T79947194]: Convert this class to C++ +API_AVAILABLE(ios(11.0), macos(10.13)) @interface MetalContext : NSObject @property(nonatomic, strong, readonly) id device; @property(nonatomic, strong, readonly) id commandQueue; diff --git a/aten/src/ATen/native/metal/MetalNeuronType.h b/aten/src/ATen/native/metal/MetalNeuronType.h index 8ae5b3a8b341..b59d163c4ae8 100644 --- 
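Both parallelized im2col paths above walk a flattened index range and recover the loop counters with data_index_init/data_index_step, so each worker thread can start mid-range without replaying the outer loops. Those helpers are ATen internals; the sketch below is a simplified standalone analogue of that mixed-radix decomposition for the (c_im, h_offset, w_offset) case and only approximates their behavior:

```cpp
#include <cstdint>
#include <cstdio>

// Decompose a flat index into (c, kh, kw) with kw the fastest-moving dimension,
// matching the iteration order of the contiguous im2col path above.
void index_init(int64_t flat, int64_t& c, int64_t /*C*/, int64_t& kh, int64_t KH,
                int64_t& kw, int64_t KW) {
  kw = flat % KW; flat /= KW;
  kh = flat % KH; flat /= KH;
  c  = flat;  // slowest dimension absorbs whatever remains
}

// Advance the counters by one, carrying like a mixed-radix odometer.
void index_step(int64_t& c, int64_t /*C*/, int64_t& kh, int64_t KH, int64_t& kw, int64_t KW) {
  if (++kw == KW) { kw = 0; if (++kh == KH) { kh = 0; ++c; } }
}

int main() {
  int64_t C = 2, KH = 2, KW = 3;
  int64_t c, kh, kw;
  index_init(/*flat=*/7, c, C, kh, KH, kw, KW);  // 7 == (c*KH + kh)*KW + kw == (1*2 + 0)*3 + 1
  std::printf("start: c=%lld kh=%lld kw=%lld\n", (long long)c, (long long)kh, (long long)kw);
  for (int i = 0; i < 3; ++i) {
    index_step(c, C, kh, KH, kw, KW);
    std::printf("step:  c=%lld kh=%lld kw=%lld\n", (long long)c, (long long)kh, (long long)kw);
  }
}
```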
a/aten/src/ATen/native/metal/MetalNeuronType.h +++ b/aten/src/ATen/native/metal/MetalNeuronType.h @@ -37,7 +37,7 @@ static inline NeuronType neuronType( } } -static inline MPSCNNNeuron* neuronType(NeuronType type) { +static inline MPSCNNNeuron* neuron(NeuronType type) { if (type == NeuronType::Relu) { return [MPSCNNNeuronOp relu]; } else if (type == NeuronType::Sigmoid) { @@ -45,16 +45,27 @@ static inline MPSCNNNeuron* neuronType(NeuronType type) { } else if (type == NeuronType::Tanh) { return [MPSCNNNeuronOp tanh]; } else if (type == NeuronType::HardSigmoid) { - if (@available(iOS 11.0, *)) { - return [MPSCNNNeuronOp hardSigmoid]; - } else { - return nil; - } + return [MPSCNNNeuronOp hardSigmoid]; } else { return nil; } } +API_AVAILABLE(ios(11.3), macos(10.13), macCatalyst(13.0)) +static inline MPSNNNeuronDescriptor* neuronDescriptor(NeuronType type) { + if (type == NeuronType::Relu) { + return [MPSCNNNeuronOpDescriptor reluDescriptor]; + } else if (type == NeuronType::Sigmoid) { + return [MPSCNNNeuronOpDescriptor sigmoidDescriptor]; + } else if (type == NeuronType::Tanh) { + return [MPSCNNNeuronOpDescriptor tanhDescriptor]; + } else if (type == NeuronType::HardSigmoid) { + return [MPSCNNNeuronOpDescriptor hardSigmoidDescriptor]; + } else { + return [MPSNNNeuronDescriptor cnnNeuronDescriptorWithType:MPSCNNNeuronTypeNone]; + } +} + } } } diff --git a/aten/src/ATen/native/metal/MetalShaders.h b/aten/src/ATen/native/metal/MetalShaders.h index 0ee703f2ee26..edd7ba46d086 100644 --- a/aten/src/ATen/native/metal/MetalShaders.h +++ b/aten/src/ATen/native/metal/MetalShaders.h @@ -464,31 +464,6 @@ kernel void reflection_pad2d(texture2d_array in_arr[[texture } } -constant bool resize_is_arr = (ushort_arg_4 > 1 || ushort_arg_5 > 4); -constant bool resize_is_tex = !resize_is_arr; -kernel void resize_nearest(texture2d_array in_arr[[texture(0), function_constant(resize_is_arr)]], - texture2d in_tex[[texture(0), function_constant(resize_is_tex)]], - texture2d_array out_arr[[texture(1), function_constant(resize_is_arr)]], - texture2d out_tex[[texture(1), function_constant(resize_is_tex)]], - ushort3 gid[[thread_position_in_grid]]) { - const ushort oH = ushort_arg_0; - const ushort oW = ushort_arg_1; - if (gid.x >= oW || gid.y >= oH) { - return; - } - const float height_scale = float(ushort_arg_2) / 10000; - const float width_scale = float(ushort_arg_3) / 10000; - constexpr sampler s(coord::pixel, address::clamp_to_edge, filter::nearest); - const int in_y = (int)(gid.y / height_scale); - const int in_x = (int)(gid.x / width_scale); - if(resize_is_arr) { - out_arr.write(in_arr.sample(s, float2(in_x, in_y), gid.z), gid.xy, gid.z); - } else { - out_tex.write(in_tex.sample(s, float2(in_x, in_y)), gid.xy); - } -} - - constant bool reshape_out_is_arr = (ushort_arg_3 > 1 || ushort_arg_2 > 4); constant bool reshape_out_is_tex = !reshape_out_is_arr; constant bool reshape_in_is_arr = (ushort_arg_7 > 1 || ushort_arg_6 > 4); diff --git a/aten/src/ATen/native/metal/MetalTensorImpl.h b/aten/src/ATen/native/metal/MetalTensorImpl.h index 865e466a8de7..799f7ef3bd11 100644 --- a/aten/src/ATen/native/metal/MetalTensorImpl.h +++ b/aten/src/ATen/native/metal/MetalTensorImpl.h @@ -23,11 +23,11 @@ struct TORCH_API MetalTensorImpl : public OpaqueTensorImpl { opaque_handle, sizes), strides_(strides.vec()) { - TensorImpl::set_has_contiguity_policy( - TensorImpl::HasContiguityPolicy::CustomBehavior); } - IntArrayRef strides() const override { + // TODO: manually storing strides here is dumb + + IntArrayRef strides_custom() const 
override { return strides_; } @@ -35,11 +35,6 @@ struct TORCH_API MetalTensorImpl : public OpaqueTensorImpl { return true; } - int64_t stride(int64_t d) const override { - d = at::maybe_wrap_dim(d, this->dim(), false); - return strides_[d]; - } - private: const char* tensorimpl_type_name() const override { return "MetalTensorImpl"; diff --git a/aten/src/ATen/native/metal/MetalTensorImplStorage.mm b/aten/src/ATen/native/metal/MetalTensorImplStorage.mm index cd73ba4eddb3..f614429eefdd 100644 --- a/aten/src/ATen/native/metal/MetalTensorImplStorage.mm +++ b/aten/src/ATen/native/metal/MetalTensorImplStorage.mm @@ -10,7 +10,7 @@ namespace native { namespace metal { -class API_AVAILABLE(ios(10.0), macos(10.13)) MetalTensorImplStorage::Impl { +class API_AVAILABLE(ios(11.0), macos(10.13)) MetalTensorImplStorage::Impl { public: Impl(const std::vector& sizes, const std::vector& strides) : _sizes(sizes), @@ -93,7 +93,7 @@ void copy_data_to_host(float* host) { impl()->copy_data_to_host(hostData); } -API_AVAILABLE(ios(10.0)) +API_AVAILABLE(ios(11.0)) MPSImageWrapper* MetalTensorImplStorage::texture() const { return impl()->texture(); } diff --git a/aten/src/ATen/native/metal/mpscnn/MPSCNNConvOp.h b/aten/src/ATen/native/metal/mpscnn/MPSCNNConvOp.h index 517b00061f61..d26e358a3523 100644 --- a/aten/src/ATen/native/metal/mpscnn/MPSCNNConvOp.h +++ b/aten/src/ATen/native/metal/mpscnn/MPSCNNConvOp.h @@ -3,7 +3,7 @@ #import #import -API_AVAILABLE(ios(10.0), macos(10.13)) +API_AVAILABLE(ios(11.0), macos(10.13)) @interface MPSCNNConvDataSource : NSObject @property(nonatomic, assign) void* weights; @property(nonatomic, assign) float* bias; @@ -15,7 +15,7 @@ API_AVAILABLE(ios(10.0), macos(10.13)) @end using namespace at::native::metal; -API_AVAILABLE(ios(10.0), macos(10.13)) +API_AVAILABLE(ios(11.0), macos(10.13)) @interface MPSCNNConvOp : NSObject + (MPSCNNConvOp*)conv2d:(const Conv2DParams&)params weights:(float*)w diff --git a/aten/src/ATen/native/metal/mpscnn/MPSCNNConvOp.mm b/aten/src/ATen/native/metal/mpscnn/MPSCNNConvOp.mm index 83fd0d3c6c6d..adf9e1b75c2d 100644 --- a/aten/src/ATen/native/metal/mpscnn/MPSCNNConvOp.mm +++ b/aten/src/ATen/native/metal/mpscnn/MPSCNNConvOp.mm @@ -1,7 +1,7 @@ -#import #import #import #import +#import #include @@ -14,7 +14,7 @@ @implementation MPSCNNConvDataSource { - (id)initWithWeights:(void*)weights Bias:(float*)bias Desc:(MPSCNNConvolutionDescriptor*)desc - API_AVAILABLE(ios(10.0), macos(10.13)) { + API_AVAILABLE(ios(11.0), macos(10.13)) { self = [super init]; if (self) { _weights = (float*)weights; @@ -36,7 +36,7 @@ - (float* _Nullable)biasTerms { return _bias; } -- (MPSDataType)dataType API_AVAILABLE(ios(10.0), macos(10.13)) { +- (MPSDataType)dataType API_AVAILABLE(ios(11.0), macos(10.13)) { return MPSDataTypeFloat32; } @@ -71,7 +71,7 @@ @implementation MPSCNNConvOp { + (MPSCNNConvOp*)conv2d:(const Conv2DParams&)params weights:(float*)w bias:(float*)b - neuronFilter:(NeuronType)t API_AVAILABLE(ios(10.0), macos(10.13)) { + neuronFilter:(NeuronType)t API_AVAILABLE(ios(11.0), macos(10.13)) { using namespace at::native::metal::mpscnn; TORCH_CHECK( params.DX == params.DY == 1, "Dilated convolution is not supported yet."); @@ -79,7 +79,7 @@ + (MPSCNNConvOp*)conv2d:(const Conv2DParams&)params const int64_t iC = params.C; const int64_t kH = params.KH; const int64_t kW = params.KW; - MPSCNNNeuron* neuron = neuronType(t); + MPSCNNNeuron* neuron = at::native::metal::neuron(t); MPSCNNConvolutionDescriptor* desc = nil; if (params.isDepthwise()) { if (@available(iOS 11.0, *)) { @@ -87,9 
+87,14 @@ + (MPSCNNConvOp*)conv2d:(const Conv2DParams&)params cnnConvolutionDescriptorWithKernelWidth:kW kernelHeight:kH inputFeatureChannels:iC - outputFeatureChannels:oC - neuronFilter:neuron]; + outputFeatureChannels:oC]; + desc.groups = 1; +#if TARGET_OS_MACCATALYST + desc.fusedNeuronDescriptor = at::native::metal::neuronDescriptor(t); +#else + desc.neuron = neuron; +#endif } else { TORCH_CHECK( false, @@ -103,13 +108,23 @@ + (MPSCNNConvOp*)conv2d:(const Conv2DParams&)params channels in each group to be multiple of 4 for \ group > 1."); } - desc = [MPSCNNConvolutionDescriptor - cnnConvolutionDescriptorWithKernelWidth:kW - kernelHeight:kH - inputFeatureChannels:iC - outputFeatureChannels:oC - neuronFilter:neuron]; - desc.groups = params.G; + if (@available(iOS 11.0, *)) { + desc = [MPSCNNConvolutionDescriptor + cnnConvolutionDescriptorWithKernelWidth:kW + kernelHeight:kH + inputFeatureChannels:iC + outputFeatureChannels:oC]; + desc.groups = params.G; +#if TARGET_OS_MACCATALYST + desc.fusedNeuronDescriptor = at::native::metal::neuronDescriptor(t); +#else + desc.neuron = neuron; +#endif + } else { + TORCH_CHECK( + false, + "MPSCNNConvolutionDescriptor is only available on iOS 11.0 and above"); + } } desc.strideInPixelsX = params.SX; desc.strideInPixelsY = params.SY; @@ -124,15 +139,8 @@ + (MPSCNNConvOp*)conv2d:(const Conv2DParams&)params weights:dataSource]; } else { -#if TARGET_OS_IPHONE - // Fallback on earlier versions - conv = [[MPSCNNConvolution alloc] - initWithDevice:[MetalContext sharedInstance].device - convolutionDescriptor:desc - kernelWeights:w - biasTerms:b - flags:MPSCNNConvolutionFlagsNone]; -#endif + TORCH_CHECK( + false, "MPSCNNConvolution is only available on iOS 11.0 and above"); } [conv setEdgeMode:MPSImageEdgeModeZero]; MPSOffset offset; diff --git a/aten/src/ATen/native/metal/mpscnn/MPSCNNFullyConnectedOp.h b/aten/src/ATen/native/metal/mpscnn/MPSCNNFullyConnectedOp.h index 91c01ce227f5..297b180f59c4 100644 --- a/aten/src/ATen/native/metal/mpscnn/MPSCNNFullyConnectedOp.h +++ b/aten/src/ATen/native/metal/mpscnn/MPSCNNFullyConnectedOp.h @@ -4,7 +4,7 @@ #import using namespace at::native::metal; -API_AVAILABLE(ios(10.0), macos(10.13)) +API_AVAILABLE(ios(11.0), macos(10.13)) @interface MPSCNNFullyConnectedOp : NSObject + (MPSCNNFullyConnectedOp*)linear:(const Conv2DParams&)params weights:(float*)w diff --git a/aten/src/ATen/native/metal/mpscnn/MPSCNNFullyConnectedOp.mm b/aten/src/ATen/native/metal/mpscnn/MPSCNNFullyConnectedOp.mm index 7e4d5974bbb8..353095a8f52f 100644 --- a/aten/src/ATen/native/metal/mpscnn/MPSCNNFullyConnectedOp.mm +++ b/aten/src/ATen/native/metal/mpscnn/MPSCNNFullyConnectedOp.mm @@ -10,14 +10,18 @@ + (MPSCNNFullyConnectedOp*)linear:(const Conv2DParams&)params weights:(float*)w bias:(float*)b neuronFilter:(NeuronType)t - API_AVAILABLE(ios(10.0), macos(10.13)) { - MPSCNNNeuron* neuron = neuronType(t); + API_AVAILABLE(ios(11.0), macos(10.13)) { + MPSCNNNeuron* neuron = at::native::metal::neuron(t); MPSCNNConvolutionDescriptor* desc = [MPSCNNConvolutionDescriptor cnnConvolutionDescriptorWithKernelWidth:params.KW kernelHeight:params.KH inputFeatureChannels:params.IC - outputFeatureChannels:params.OC - neuronFilter:neuron]; + outputFeatureChannels:params.OC]; +#if TARGET_OS_MACCATALYST + desc.fusedNeuronDescriptor = at::native::metal::neuronDescriptor(t); +#else + desc.neuron = neuron; +#endif desc.strideInPixelsX = 1; desc.strideInPixelsY = 1; @@ -31,14 +35,9 @@ + (MPSCNNFullyConnectedOp*)linear:(const Conv2DParams&)params initWithDevice:[MetalContext 
sharedInstance].device weights:ds]; } else { -#if TARGET_OS_IPHONE - fc = [[MPSCNNFullyConnected alloc] - initWithDevice:[MetalContext sharedInstance].device - convolutionDescriptor:desc - kernelWeights:w - biasTerms:b - flags:MPSCNNConvolutionFlagsNone]; -#endif + TORCH_CHECK( + false, + "MPSCNNFullyConnectedOp is only available on iOS 11.0 and above"); } [fc setClipRect:MTLRegionMake3D(0, 0, 0, 1, 1, params.N)]; [fc setOffset:{.x = static_cast(params.W / 2), diff --git a/aten/src/ATen/native/metal/mpscnn/MPSCNNNeuronOp.h b/aten/src/ATen/native/metal/mpscnn/MPSCNNNeuronOp.h index 2e2dee8b022c..e1a9b2617bd3 100644 --- a/aten/src/ATen/native/metal/mpscnn/MPSCNNNeuronOp.h +++ b/aten/src/ATen/native/metal/mpscnn/MPSCNNNeuronOp.h @@ -8,3 +8,13 @@ + (MPSCNNNeuronTanH*)tanh; @end + +API_AVAILABLE(ios(11.3), macos(10.13), macCatalyst(13.0)) +@interface MPSCNNNeuronOpDescriptor : NSObject + ++ (MPSNNNeuronDescriptor*)hardSigmoidDescriptor; ++ (MPSNNNeuronDescriptor*)reluDescriptor; ++ (MPSNNNeuronDescriptor*)sigmoidDescriptor; ++ (MPSNNNeuronDescriptor*)tanhDescriptor; + +@end diff --git a/aten/src/ATen/native/metal/mpscnn/MPSCNNNeuronOp.mm b/aten/src/ATen/native/metal/mpscnn/MPSCNNNeuronOp.mm index 5e208731c88c..1b322f9a97e9 100644 --- a/aten/src/ATen/native/metal/mpscnn/MPSCNNNeuronOp.mm +++ b/aten/src/ATen/native/metal/mpscnn/MPSCNNNeuronOp.mm @@ -4,6 +4,10 @@ @implementation MPSCNNNeuronOp + (MPSCNNNeuronHardSigmoid*)hardSigmoid API_AVAILABLE(ios(11.0), macos(10.13)) { +// Remove this once we support iOS 11.3 +#if TARGET_OS_MACCATALYST + return nil; +#else static dispatch_once_t onceToken; static MPSCNNNeuronHardSigmoid* neuron = nil; dispatch_once(&onceToken, ^{ @@ -13,9 +17,14 @@ + (MPSCNNNeuronHardSigmoid*)hardSigmoid API_AVAILABLE(ios(11.0), macos(10.13)) { b:0.5]; }); return neuron; +#endif } + (MPSCNNNeuronReLU*)relu { +// Remove this once we support iOS 11.3 +#if TARGET_OS_MACCATALYST + return nil; +#else static MPSCNNNeuronReLU* relu = nil; static dispatch_once_t onceToken; dispatch_once(&onceToken, ^{ @@ -24,9 +33,14 @@ + (MPSCNNNeuronReLU*)relu { a:0]; }); return relu; +#endif } + (MPSCNNNeuronSigmoid*)sigmoid { +// Remove this once we support iOS 11.3 +#if TARGET_OS_MACCATALYST + return nil; +#else static dispatch_once_t onceToken; static MPSCNNNeuronSigmoid* sigmoid = nil; dispatch_once(&onceToken, ^{ @@ -34,9 +48,14 @@ + (MPSCNNNeuronSigmoid*)sigmoid { initWithDevice:[MetalContext sharedInstance].device]; }); return sigmoid; +#endif } + (MPSCNNNeuronTanH*)tanh { +// Remove this once we support iOS 11.3 +#if TARGET_OS_MACCATALYST + return nil; +#else static dispatch_once_t onceToken; static MPSCNNNeuronTanH* tanh = nil; dispatch_once(&onceToken, ^{ @@ -46,6 +65,57 @@ + (MPSCNNNeuronTanH*)tanh { b:1]; }); return tanh; +#endif +} + +@end + +API_AVAILABLE(ios(11.3), macos(10.13), macCatalyst(13.0)) +@implementation MPSCNNNeuronOpDescriptor + ++ (MPSNNNeuronDescriptor*)hardSigmoidDescriptor { + static dispatch_once_t onceToken; + static MPSNNNeuronDescriptor* neuronDesc = nil; + dispatch_once(&onceToken, ^{ + neuronDesc = [MPSNNNeuronDescriptor + cnnNeuronDescriptorWithType:MPSCNNNeuronTypeHardSigmoid + a:1.0 / 6.0 + b:0.5]; + }); + return neuronDesc; +} + ++ (MPSNNNeuronDescriptor*)reluDescriptor { + static dispatch_once_t onceToken; + static MPSNNNeuronDescriptor* neuronDesc = nil; + dispatch_once(&onceToken, ^{ + neuronDesc = + [MPSNNNeuronDescriptor cnnNeuronDescriptorWithType:MPSCNNNeuronTypeReLU + a:0]; + }); + return neuronDesc; +} + ++ (MPSNNNeuronDescriptor*)sigmoidDescriptor 
{ + static dispatch_once_t onceToken; + static MPSNNNeuronDescriptor* neuronDesc = nil; + dispatch_once(&onceToken, ^{ + neuronDesc = [MPSNNNeuronDescriptor + cnnNeuronDescriptorWithType:MPSCNNNeuronTypeSigmoid]; + }); + return neuronDesc; +} + ++ (MPSNNNeuronDescriptor*)tanhDescriptor { + static dispatch_once_t onceToken; + static MPSNNNeuronDescriptor* neuronDesc = nil; + dispatch_once(&onceToken, ^{ + neuronDesc = + [MPSNNNeuronDescriptor cnnNeuronDescriptorWithType:MPSCNNNeuronTypeTanH + a:1.0 + b:1.0]; + }); + return neuronDesc; } @end diff --git a/aten/src/ATen/native/metal/mpscnn/MPSCNNUtils.h b/aten/src/ATen/native/metal/mpscnn/MPSCNNUtils.h index 809518ef1a80..13264d097e92 100644 --- a/aten/src/ATen/native/metal/mpscnn/MPSCNNUtils.h +++ b/aten/src/ATen/native/metal/mpscnn/MPSCNNUtils.h @@ -2,6 +2,24 @@ #import #include +// This is a utility macro that can be used to throw an exception when a Metal +// API function produces a NSError. The exception will contain a message with +// useful info extracted from the NSError. +#define METAL_THROW_IF_ERROR(error, preamble) \ + do { \ + if C10_LIKELY(error) { \ + throw c10::Error( \ + {__func__, __FILE__, static_cast(__LINE__)}, \ + c10::str( \ + preamble, \ + " Error details: ", \ + " Localized_description: ", error.localizedDescription.UTF8String, \ + " Domain: ", error.domain.UTF8String, \ + " Code: ", error.code, \ + " User Info: ", error.userInfo.description.UTF8String)); \ + } \ + } while (false) + namespace at { namespace native { namespace metal { @@ -13,12 +31,12 @@ struct LaunchParams { MTLSize threadsPerGrid; // iOS 11.0 }; -API_AVAILABLE(ios(10.0), macos(10.13)) +API_AVAILABLE(ios(11.0), macos(10.13)) LaunchParams spatialPointwiseKernelLaunchParams( id pipeline, MPSImage* im); -API_AVAILABLE(ios(10.0), macos(10.13)) +API_AVAILABLE(ios(11.0), macos(10.13)) LaunchParams spatialPointwiseKernelLaunchParams( id pipeline, NSUInteger numberOfImages, @@ -26,7 +44,7 @@ LaunchParams spatialPointwiseKernelLaunchParams( NSUInteger height, NSUInteger width); -API_AVAILABLE(ios(10.0), macos(10.13)) +API_AVAILABLE(ios(11.0), macos(10.13)) static inline std::string kernelFor( MPSImage* image, const std::string& arrayKernel, diff --git a/aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.h b/aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.h index bba2a525429a..33de62301ef5 100644 --- a/aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.h +++ b/aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.h @@ -9,7 +9,7 @@ namespace at { namespace native { namespace metal { -class API_AVAILABLE(ios(10.0), macos(10.13)) MPSImageWrapper { +class API_AVAILABLE(ios(11.0), macos(10.13)) MPSImageWrapper { public: MPSImageWrapper(IntArrayRef sizes); ~MPSImageWrapper(); diff --git a/aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.mm b/aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.mm index 287f94dde778..d5a9632d26c9 100644 --- a/aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.mm +++ b/aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.mm @@ -32,12 +32,13 @@ - (void)beginSynchronization { } - (void)endSynchronization:(NSError*)error { + // if something went wrong during command buffer execution if (error) { if (_imageWrapper) { _imageWrapper->release(); } - // throw exceptions if we failed to flush the command buffer - TORCH_CHECK(error); + // throw an exception with error details + METAL_THROW_IF_ERROR(error, "Command buffer execution failed!"); } } diff --git a/aten/src/ATen/native/metal/ops/MetalAddmm.mm b/aten/src/ATen/native/metal/ops/MetalAddmm.mm index 
94e1add60b30..e0c196ac68b3 100644 --- a/aten/src/ATen/native/metal/ops/MetalAddmm.mm +++ b/aten/src/ATen/native/metal/ops/MetalAddmm.mm @@ -16,7 +16,7 @@ namespace native { namespace metal { -API_AVAILABLE(ios(10.0), macos(10.13)) +API_AVAILABLE(ios(11.0), macos(10.13)) Tensor addmm( const Tensor& bias, const Tensor& input, diff --git a/aten/src/ATen/native/metal/ops/MetalConcat.mm b/aten/src/ATen/native/metal/ops/MetalConcat.mm index 14b4ce7dbfc1..c43bf055fa2e 100644 --- a/aten/src/ATen/native/metal/ops/MetalConcat.mm +++ b/aten/src/ATen/native/metal/ops/MetalConcat.mm @@ -203,7 +203,7 @@ Tensor cat(const TensorList tensors, int64_t dim) { } TORCH_LIBRARY_IMPL(aten, Metal, m) { - m.impl(TORCH_SELECTIVE_NAME("aten::_cat"), TORCH_FN(cat)); + m.impl(TORCH_SELECTIVE_NAME("aten::cat"), TORCH_FN(cat)); } } diff --git a/aten/src/ATen/native/metal/ops/MetalNeurons.mm b/aten/src/ATen/native/metal/ops/MetalNeurons.mm index 03a5de0851ad..ca925d9b841b 100644 --- a/aten/src/ATen/native/metal/ops/MetalNeurons.mm +++ b/aten/src/ATen/native/metal/ops/MetalNeurons.mm @@ -51,19 +51,19 @@ Tensor neuronKernel(const Tensor& input, MPSCNNNeuron* neuron) { return input; } -API_AVAILABLE(ios(10.0), macos(10.13)) +API_AVAILABLE(ios(11.0), macos(10.13)) Tensor relu(const Tensor& input) { TORCH_CHECK(input.is_metal()); return neuronKernel(input, [MPSCNNNeuronOp relu]); } -API_AVAILABLE(ios(10.0), macos(10.13)) +API_AVAILABLE(ios(11.0), macos(10.13)) Tensor& relu_(Tensor& input) { TORCH_CHECK(input.is_metal()); return neuronKernel_(input, [MPSCNNNeuronOp relu]); } -API_AVAILABLE(ios(10.0), macos(10.13)) +API_AVAILABLE(ios(11.0), macos(10.13)) Tensor sigmoid(const Tensor& input) { return neuronKernel(input, [MPSCNNNeuronOp sigmoid]); } @@ -74,7 +74,7 @@ Tensor sigmoid(const Tensor& input) { return neuronKernel_(input, [MPSCNNNeuronOp hardSigmoid]); } -API_AVAILABLE(ios(10.0), macos(10.13)) +API_AVAILABLE(ios(11.0), macos(10.13)) Tensor tanh(const Tensor& input) { TORCH_CHECK(input.is_metal()); return neuronKernel(input, [MPSCNNNeuronOp tanh]); @@ -85,9 +85,7 @@ Tensor tanh(const Tensor& input) { m.impl(TORCH_SELECTIVE_NAME("aten::relu"), TORCH_FN(relu)); m.impl(TORCH_SELECTIVE_NAME("aten::relu_"), TORCH_FN(relu_)); m.impl(TORCH_SELECTIVE_NAME("aten::sigmoid"), TORCH_FN(sigmoid)); - if (@available(iOS 11.0, *)) { - m.impl(TORCH_SELECTIVE_NAME("aten::hardsigmoid_"), TORCH_FN(hardsigmoid_)); - } + m.impl(TORCH_SELECTIVE_NAME("aten::hardsigmoid_"), TORCH_FN(hardsigmoid_)); }; } diff --git a/aten/src/ATen/native/metal/ops/MetalPadding.mm b/aten/src/ATen/native/metal/ops/MetalPadding.mm index ca62cfc6de65..4edd4a04bbde 100644 --- a/aten/src/ATen/native/metal/ops/MetalPadding.mm +++ b/aten/src/ATen/native/metal/ops/MetalPadding.mm @@ -13,7 +13,7 @@ namespace native { namespace metal { -// API_AVAILABLE(ios(10.0), macos(10.13)) +API_AVAILABLE(ios(11.0), macos(10.13)) Tensor reflection_pad2d(const Tensor& input, IntArrayRef padding) { TORCH_CHECK(input.is_metal()); diff --git a/aten/src/ATen/native/metal/ops/MetalPooling.mm b/aten/src/ATen/native/metal/ops/MetalPooling.mm index 056602d381b8..5e3b9110756e 100644 --- a/aten/src/ATen/native/metal/ops/MetalPooling.mm +++ b/aten/src/ATen/native/metal/ops/MetalPooling.mm @@ -15,7 +15,7 @@ namespace native { namespace metal { -API_AVAILABLE(ios(10.0), macos(10.13)) +API_AVAILABLE(ios(11.0), macos(10.13)) Tensor max_pool2d( const Tensor& input, IntArrayRef kernel_size, @@ -70,7 +70,7 @@ Tensor max_pool2d( return output; } -API_AVAILABLE(ios(10.0), macos(10.13)) 
+API_AVAILABLE(ios(11.0), macos(10.13)) Tensor adaptive_avg_pool2d(const Tensor& input, IntArrayRef output_size) { // averages across the width and height, and outputs a 1x1xC image. TORCH_CHECK(output_size[0] == 1 && output_size[1] == 1); diff --git a/aten/src/ATen/native/metal/ops/MetalReshape.mm b/aten/src/ATen/native/metal/ops/MetalReshape.mm index ed74014a169e..37842ee3be59 100644 --- a/aten/src/ATen/native/metal/ops/MetalReshape.mm +++ b/aten/src/ATen/native/metal/ops/MetalReshape.mm @@ -15,7 +15,7 @@ namespace native { namespace metal { -API_AVAILABLE(ios(10.0), macos(10.13)) +API_AVAILABLE(ios(11.0), macos(10.13)) Tensor view(const Tensor& input, IntArrayRef size) { TORCH_CHECK(input.is_metal()); auto inferred_size = at::infer_size(size, input.numel()); diff --git a/aten/src/ATen/native/metal/ops/MetalUpsamplingNearest.mm b/aten/src/ATen/native/metal/ops/MetalUpsamplingNearest.mm index b3fb27f7619a..39524569bae5 100644 --- a/aten/src/ATen/native/metal/ops/MetalUpsamplingNearest.mm +++ b/aten/src/ATen/native/metal/ops/MetalUpsamplingNearest.mm @@ -17,7 +17,7 @@ Tensor upsample_nearest2d_vec( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, c10::optional> scale_factors) { TORCH_CHECK(input.is_metal()); auto osize = @@ -58,28 +58,9 @@ Tensor upsample_nearest2d_vec( sourceImage:X destinationImage:Y]; } else { - NSUInteger sh = scale_h.value() * 10000; - NSUInteger sw = scale_w.value() * 10000; - id state = - [[MetalContext sharedInstance] specializedPipelineState:"resize_nearest" - Constants:@[ - @(output_height), - @(output_width), - @(sh), - @(sw), - @(nbatch), - @(channels), - ]]; - id encoder = - [commandBuffer.buffer computeCommandEncoder]; - [encoder setComputePipelineState:state]; - [encoder setTexture:[X texture] atIndex:0]; - [encoder setTexture:[Y texture] atIndex:1]; - const auto& launchParams = - mpscnn::spatialPointwiseKernelLaunchParams(state, Y); - [encoder dispatchThreadgroups:launchParams.threadgroupsPerGrid - threadsPerThreadgroup:launchParams.threadsPerThreadgroup]; - [encoder endEncoding]; + TORCH_CHECK( + false, + "MPSCNNUpsamplingNearest is only available on iOS 11.0 and above"); } auto output = makeTensor(std::move(mt), input.options()); return output; diff --git a/aten/src/ATen/native/miopen/Conv_miopen.cpp b/aten/src/ATen/native/miopen/Conv_miopen.cpp index 8fc00d850113..fc4587db9c34 100644 --- a/aten/src/ATen/native/miopen/Conv_miopen.cpp +++ b/aten/src/ATen/native/miopen/Conv_miopen.cpp @@ -135,52 +135,6 @@ Tensor narrowGroup(const Tensor& t, int dim, int group_idx, int64_t groups) { return t.narrow(dim, group_idx * group_size, group_size); } -// --------------------------------------------------------------------- -// -// Checking -// -// --------------------------------------------------------------------- - -// Used on pad, stride and dilation -static void check_args(CheckedFrom c, IntArrayRef args, size_t expected_size, const char* arg_name) -{ - TORCH_CHECK(args.size() <= expected_size, - "Too many ", arg_name, " values (", args.size(), ") supplied, expecting ", - expected_size, " (while checking arguments for ", c, ")"); - TORCH_CHECK(args.size() >= expected_size, - "Not enough ", arg_name, " values (", args.size(), ") supplied, expecting ", - expected_size, " (while checking arguments for ", c, ")"); - - auto num_negative_values = std::count_if(args.begin(), args.end(), [](int x){return x < 0;}); - if (num_negative_values > 0){ - std::stringstream ss; - ss << arg_name << " should be greater than zero but got ("; 
- std::copy(args.begin(), args.end() - 1, std::ostream_iterator(ss,", ")); - ss << args.back() << ")" << " (while checking arguments for " << c << ")"; - AT_ERROR(ss.str()); - } -} - -// see NOTE [ Convolution checks] in src/Aten/native/cudnn/Conv.cpp -static void convolution_shape_check( - CheckedFrom c, - const TensorGeometryArg& input, const TensorGeometryArg& weight, const TensorGeometryArg& output, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups) -{ - check_args(c, padding, input->dim() - 2, "padding"); - check_args(c, stride, padding.size(), "stride"); - check_args(c, dilation, padding.size(), "dilation"); - - // Input - checkDimRange(c, input, 3, 6 /* exclusive */); - checkSize(c, input, input_channels_dim, weight->size(1) * groups); - - // Weight - checkSameDim(c, input, weight); - - checkSameDim(c, input, output); -} - // This POD struct is used to let us easily compute hashes of the // parameters struct ConvolutionParams diff --git a/aten/src/ATen/native/miopen/RNN_miopen.cpp b/aten/src/ATen/native/miopen/RNN_miopen.cpp index 9f53c1186ab3..b5a63dd803d1 100644 --- a/aten/src/ATen/native/miopen/RNN_miopen.cpp +++ b/aten/src/ATen/native/miopen/RNN_miopen.cpp @@ -352,7 +352,7 @@ std::pair, size_t> get_parameters(miopenHandle_t handle, con param_size /= elem_size; if(linear_id == 0 || linear_id == num_linear_layers / 2) { - const auto size = { static_cast(param_size * num_linear_layers / 2), 1L}; + std::initializer_list size = { static_cast(param_size * num_linear_layers / 2), 1L}; Tensor param = at::empty({0}, weight_buf.options()).set_(weight_buf.storage(), offset, size); params.emplace_back(std::move(param)); layer_params_count++; @@ -386,7 +386,7 @@ std::pair, size_t> get_parameters(miopenHandle_t handle, con bias_size /= elem_size; if(linear_id == 0 || linear_id == num_linear_layers / 2) { - const auto size = { static_cast(bias_size * num_linear_layers / 2), 1L}; + std::initializer_list size = { static_cast(bias_size * num_linear_layers / 2), 1L}; Tensor param = at::empty({0}, weight_buf.options()).set_(weight_buf.storage(), offset, size); params.emplace_back(std::move(param)); layer_params_count++; diff --git a/aten/src/ATen/native/mkl/LinearAlgebra.h b/aten/src/ATen/native/mkl/LinearAlgebra.h index d5e4518e70bf..a536c193524e 100644 --- a/aten/src/ATen/native/mkl/LinearAlgebra.h +++ b/aten/src/ATen/native/mkl/LinearAlgebra.h @@ -1,4 +1,5 @@ #include +#include namespace at { namespace native { diff --git a/aten/src/ATen/native/mkl/SparseBlasImpl.cpp b/aten/src/ATen/native/mkl/SparseBlasImpl.cpp index 35d583c64733..40557d478b15 100644 --- a/aten/src/ATen/native/mkl/SparseBlasImpl.cpp +++ b/aten/src/ATen/native/mkl/SparseBlasImpl.cpp @@ -206,6 +206,58 @@ void addmm_dense_result( #endif } +/* + Computes a sparse matrix-sparse matrix product with dense result defined as + C <- alpha*(A*B) + beta*C + + Args: + * `A` - Sparse Tensor storing m x k matrix. + * `B` - Sparse Tensor storing k x n matrix. + * `C` - [in] Dense Tensor storing matrix of size m x n. + [out] result of the operation. +*/ +void addmm_sparse_input_dense_result( + const Tensor& A, + const Tensor& B, + const Scalar& beta, + const Scalar& alpha, + const Tensor& C) { +#if !AT_USE_MKL_SPARSE() + TORCH_CHECK( + false, + "Calling addmm on a sparse CPU tensor requires Linux platform. 
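A note on the sparse-times-sparse-into-dense addmm path introduced here: as the sketch below illustrates, the beta == 0 branch a little further on (C.zero_() instead of C.mul_(beta)) exists because scaling by a literal zero would still propagate NaN and Inf already present in C under IEEE 754 arithmetic:

```cpp
#include <cstdio>
#include <limits>

int main() {
  double inf = std::numeric_limits<double>::infinity();
  double nan = std::numeric_limits<double>::quiet_NaN();
  // Multiplying by beta == 0 does NOT erase non-finite values (prints nan; sign may vary)...
  std::printf("0 * inf = %f, 0 * nan = %f\n", 0.0 * inf, 0.0 * nan);
  // ...which is why the kernel discards the old contents outright when beta == 0.
  double c = inf;
  c = 0.0;  // the zero_() equivalent: the stale value is simply overwritten
  std::printf("after zeroing: %f\n", c);
}
```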
", + "Please use PyTorch built with MKL on Linux."); +#else + // MKL function computes C <- A*B + // So we need a temporary matrix to store the result + // and then add it to C + auto C_ = at::empty(C.sizes(), C.options()); + auto order = SPARSE_LAYOUT_ROW_MAJOR; + auto ldc = C_.stride(-2); + + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES( + C.scalar_type(), "addmm_sparse_input_dense_result", [&] { + auto mkl_A = at::mkl::sparse::MklSparseCsrDescriptor(A); + auto mkl_B = at::mkl::sparse::MklSparseCsrDescriptor(B); + at::mkl::sparse::spmmd( + SPARSE_OPERATION_NON_TRANSPOSE, + mkl_A.descriptor(), + mkl_B.descriptor(), + order, + C_.data_ptr(), + ldc); + }); + + // If beta is zero NaN and Inf should not be propagated to the result + if (beta.toComplexDouble() == 0.) { + C.zero_(); + } else { + C.mul_(beta); + } + C.add_(C_, alpha); +#endif +} + /* Computes a sparse matrix-sparse matrix product defined as C <- alpha*(A*B) + beta*C @@ -288,14 +340,22 @@ void addmm_out_sparse_csr( const Scalar& alpha, const Tensor& result) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(mat1.dim() == 2 && mat2.dim() == 2 && result.dim() == 2); - if (mat2.layout() == kStrided && result.layout() == kStrided) { + if ((mat1.layout() == kSparseCsr || mat1.layout() == kSparseBsr) && + mat2.layout() == kStrided && result.layout() == kStrided) { return addmm_dense_result(mat1, mat2, beta, alpha, result); - } else if (mat2.is_sparse_csr() && result.is_sparse_csr()) { + } + if (mat1.layout() == kStrided && mat2.is_sparse_csr() && result.layout() == kStrided) { + // TODO: We can use MKL's transposition flags once we have CSC support. + return addmm_dense_result(mat2.transpose(0, 1), mat1.transpose(0, 1), beta, alpha, result.transpose(0, 1)); + } + if (mat1.is_sparse_csr() && mat2.is_sparse_csr() && result.layout() == kStrided) { + return addmm_sparse_input_dense_result(mat1, mat2, beta, alpha, result); + } + if (mat1.is_sparse_csr() && mat2.is_sparse_csr() && result.is_sparse_csr()) { return addmm_sparse_result(mat1, mat2, beta, alpha, result); - } else { - TORCH_INTERNAL_ASSERT( - false, "addmm: Received unexpected tensor layouts as input."); } + TORCH_CHECK(false, "addmm: computation on CPU is not implemented for ", + result.layout(), " + ", mat1.layout(), " @ ", mat2.layout()); } /* diff --git a/aten/src/ATen/native/mkl/SpectralOps.cpp b/aten/src/ATen/native/mkl/SpectralOps.cpp index bcf8afe2a373..470c3a48e5e0 100644 --- a/aten/src/ATen/native/mkl/SpectralOps.cpp +++ b/aten/src/ATen/native/mkl/SpectralOps.cpp @@ -250,7 +250,7 @@ Tensor _fft_c2r_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, auto in_sizes = self.sizes(); DimVector out_sizes(in_sizes.begin(), in_sizes.end()); out_sizes[dim.back()] = last_dim_size; - auto out = at::empty(out_sizes, self.options().dtype(c10::toValueType(self.scalar_type()))); + auto out = at::empty(out_sizes, self.options().dtype(c10::toRealValueType(self.scalar_type()))); pocketfft::shape_t axes(dim.begin(), dim.end()); if (self.scalar_type() == kComplexFloat) { pocketfft::c2r(shape_from_tensor(out), stride_from_tensor(self), stride_from_tensor(out), axes, false, @@ -347,7 +347,7 @@ static DftiDescriptor _plan_mkl_fft( // precision const DFTI_CONFIG_VALUE prec = [&]{ - switch (c10::toValueType(dtype)) { + switch (c10::toRealValueType(dtype)) { case ScalarType::Float: return DFTI_SINGLE; case ScalarType::Double: return DFTI_DOUBLE; default: TORCH_CHECK(false, "MKL FFT doesn't support tensors of type: ", dtype); @@ -466,7 +466,7 @@ static Tensor& _exec_fft(Tensor& out, const Tensor& self, 
IntArrayRef out_sizes, batched_out_sizes[i + 1] = out_sizes[dim[i]]; } - const auto value_type = c10::toValueType(input.scalar_type()); + const auto value_type = c10::toRealValueType(input.scalar_type()); out.resize_(batched_out_sizes, MemoryFormat::Contiguous); auto descriptor = _plan_mkl_fft( @@ -523,7 +523,7 @@ Tensor _fft_c2r_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, auto in_sizes = input.sizes(); DimVector out_sizes(in_sizes.begin(), in_sizes.end()); out_sizes[dim.back()] = last_dim_size; - auto out = at::empty(out_sizes, self.options().dtype(c10::toValueType(self.scalar_type()))); + auto out = at::empty(out_sizes, self.options().dtype(c10::toRealValueType(self.scalar_type()))); return _exec_fft(out, input, out_sizes, dim, normalization, /*forward=*/false); } diff --git a/aten/src/ATen/native/mkldnn/Conv.cpp b/aten/src/ATen/native/mkldnn/Conv.cpp index fb41dcdd6215..a2489e42e185 100644 --- a/aten/src/ATen/native/mkldnn/Conv.cpp +++ b/aten/src/ATen/native/mkldnn/Conv.cpp @@ -43,27 +43,78 @@ REGISTER_NO_CPU_DISPATCH(mkldnn_convolution_backward_stub); namespace at { namespace native { -ideep::tensor _mkldnn_convolution( - const ideep::tensor& x, - const ideep::tensor& w, - const c10::optional& b, +#define MKLDNNTensor(itensor, options) \ + new_with_itensor_mkldnn( \ + std::move(itensor), \ + optTypeMetaToScalarType(options.dtype_opt()), \ + options.device_opt()) + +// Note [MKLDNN Convolution Memory Formats] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// MKLDNN has 3 types of memory formats in convolution: +// +// In case memory format passed from PyTorch (aka. user layout) +// differs from the internal layout which MKLDNN used, a `reorder` is needed; +// otherwise when user layout is identical to internal layout, +// MKLDNN uses a memory `view` upon an existing CPU tensor. +// +// 1. NCHW (CPU tensor, contiguous) +// input reorder: NCHW(user) -> Blocked(internal) +// weight reorder: OIHW(user) -> Blocked(internal) +// output reorder: Blocked(internal) -> NCHW(user) +// +// 2. NHWC: (CPU tensor, channels last) +// input view: NHWC(user) -> NHWC(internal) +// weight reorder: OHWI(user) -> Blocked(internal) +// output view: NHWC(internal) -> NHWC(user) +// +// 3. Blocked (MKLDNN tensor): +// By explicitly converting a tensor to mkldnn, e.g. `x.to_mkldnn()`, +// blocked format will propagate between layers. Input, output will be in blocked format. +// +// For inference case, weight can be prepacked into blocked format by +// (so as to save weight reoder overhead): +// model = torch.utils.mkldnn.to_mkldnn(model) +// +// For training case, grad_output can be CPU tensor or MKLDNN tensor, +// but weight/bias and grad_weight/grad_bias are always CPU tensor. 
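The memory-format note above comes down to stride patterns: channels-last data can be handed to oneDNN as an in-place view, while contiguous NCHW data must be reordered. A small standalone sketch of the two stride layouts for an {N, C, H, W} shape (assuming PyTorch's usual contiguous and channels_last conventions; no ATen types):

```cpp
#include <array>
#include <cstdio>

// Strides (in elements) for a 4-D tensor of logical shape {N, C, H, W}.
std::array<long, 4> nchw_strides(long N, long C, long H, long W) {
  // contiguous: W fastest, then H, C, N
  return {C * H * W, H * W, W, 1};
}

std::array<long, 4> nhwc_strides(long N, long C, long H, long W) {
  // channels last: C fastest, then W, H, N -- still reported in NCHW order
  return {H * W * C, 1, W * C, C};
}

int main() {
  long N = 2, C = 3, H = 4, W = 5;
  auto a = nchw_strides(N, C, H, W);
  auto b = nhwc_strides(N, C, H, W);
  std::printf("NCHW strides: %ld %ld %ld %ld\n", a[0], a[1], a[2], a[3]);  // 60 20 5 1
  std::printf("NHWC strides: %ld %ld %ld %ld\n", b[0], b[1], b[2], b[3]);  // 60 1 15 3
}
```

Because the channels-last element order already matches oneDNN's nhwc layout, the kernel can wrap the CPU buffer directly; NCHW input pays a reorder into a blocked layout on the way in and back out.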
+// + +Tensor mkldnn_convolution( + const Tensor& input, + const Tensor& weight, const c10::optional& bias_opt, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups) { + // See [Note: hacky wrapper removal for optional tensor] + c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); + const Tensor& bias = *bias_maybe_owned; + + if (input.scalar_type() == ScalarType::BFloat16) { + TORCH_CHECK(mkldnn_bf16_device_check(), + "mkldnn_convolution: bf16 path needs the cpu support avx512bw, avx512vl and avx512dq"); + } + + bool is_channels_last = input.suggest_memory_format() == at::MemoryFormat::ChannelsLast; - auto kernel_size = w.get_dims(); + auto output_sizes = conv_output_size(input.sizes(), weight.sizes(), padding, stride, dilation); + auto output = at::empty({0}, input.options()); - std::vector input_size = x.get_dims(); - std::vector output_sizes = - conv_output_size(input_size, kernel_size, padding, stride, dilation); + const ideep::tensor x = itensor_from_tensor(input); + const ideep::tensor w = itensor_from_tensor(weight); ideep::tensor y; - if (b.has_value()) { + if (is_channels_last) { + output.resize_(output_sizes, input.suggest_memory_format()); + y = itensor_from_tensor(output); + } + if (bias.defined()) { + const ideep::tensor b = itensor_from_tensor(bias); ideep::convolution_forward::compute( x, w, - b.value(), + b, {output_sizes.cbegin(), output_sizes.cend()}, y, {stride.begin(), stride.end()}, @@ -83,47 +134,14 @@ ideep::tensor _mkldnn_convolution( {padding.begin(), padding.end()}, groups); } - return y; -} - -Tensor mkldnn_convolution( - const Tensor& input, - const Tensor& weight, const c10::optional& bias_opt, - IntArrayRef padding, - IntArrayRef stride, - IntArrayRef dilation, - int64_t groups) { - // See [Note: hacky wrapper removal for optional tensor] - c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); - const Tensor& bias = *bias_maybe_owned; - - if (input.scalar_type() == ScalarType::BFloat16) { - TORCH_CHECK(mkldnn_bf16_device_check(), - "mkldnn_convolution: bf16 path needs the cpu support avx512bw, avx512vl and avx512dq"); - } - const ideep::tensor mkldnn_input = itensor_from_tensor(input); - const ideep::tensor mkldnn_weight = itensor_from_tensor(weight); - c10::optional mkldnn_bias{c10::nullopt}; - if (bias.defined()) { - mkldnn_bias = itensor_from_tensor(bias); - } - - ideep::tensor mkldnn_output = _mkldnn_convolution( - mkldnn_input, - mkldnn_weight, - mkldnn_bias, - padding, - stride, - dilation, - groups); if (input.is_mkldnn()) { - return new_with_itensor_mkldnn(std::move(mkldnn_output), optTypeMetaToScalarType(input.options().dtype_opt()), - input.options().device_opt()); + return MKLDNNTensor(y, input.options()); + } else if (!is_channels_last) { + return mkldnn_to_dense(MKLDNNTensor(y, input.options())); } else { - return mkldnn_to_dense( - new_with_itensor_mkldnn(std::move(mkldnn_output), optTypeMetaToScalarType(input.options().dtype_opt()), - input.options().device_opt())); + TORCH_INTERNAL_ASSERT(y.get_desc().is_nhwc()); + return output; } } @@ -131,17 +149,22 @@ Tensor mkldnn_convolution_backward_input( IntArrayRef input_size, const Tensor& grad_output, const Tensor& weight, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool bias_defined) { - // for training case, grad_output can be cpu tensor or MKLDNN tensor, - // but weight and bias always cpu tensor. 
- auto mkldnn_grad_output = itensor_from_tensor(grad_output); - auto mkldnn_weight = itensor_view_from_dense(weight); + bool is_channels_last = grad_output.suggest_memory_format() == at::MemoryFormat::ChannelsLast; + auto grad_input = at::empty({0}, grad_output.options()); - ideep::tensor mkldnn_grad_input; + auto grad_y = itensor_from_tensor(grad_output); + auto w = itensor_view_from_dense(weight); + + ideep::tensor grad_x; + if (is_channels_last) { + grad_input.resize_(input_size, grad_output.suggest_memory_format()); + grad_x = itensor_from_tensor(grad_input); + } ideep::convolution_backward_data::compute( - mkldnn_grad_output, - mkldnn_weight, + grad_y, + w, input_size.vec(), - mkldnn_grad_input, + grad_x, stride.vec(), dilation.vec(), padding.vec(), @@ -149,14 +172,12 @@ Tensor mkldnn_convolution_backward_input( groups); if (grad_output.is_mkldnn()) { - return new_with_itensor_mkldnn(std::move(mkldnn_grad_input), - optTypeMetaToScalarType(grad_output.options().dtype_opt()), - grad_output.options().device_opt()); - + return MKLDNNTensor(grad_x, grad_output.options()); + } else if (!is_channels_last){ + return mkldnn_to_dense(MKLDNNTensor(grad_x, grad_output.options())); } else { - return mkldnn_to_dense(new_with_itensor_mkldnn(std::move(mkldnn_grad_input), - optTypeMetaToScalarType(grad_output.options().dtype_opt()), - grad_output.options().device_opt())); + TORCH_INTERNAL_ASSERT(grad_x.get_desc().is_nhwc()); + return grad_input; } } @@ -164,19 +185,19 @@ std::tuple mkldnn_convolution_backward_weights( IntArrayRef weight_size, const Tensor& grad_output, const Tensor& input, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool bias_defined) { - // for training case, grad_output and input can be cpu tensor or MKLDNN tensor, - // but weight and bias are always cpu tensor. - const ideep::tensor mkldnn_grad_output = itensor_from_tensor(grad_output); - const ideep::tensor mkldnn_input = itensor_from_tensor(input); + bool is_channels_last = grad_output.suggest_memory_format() == at::MemoryFormat::ChannelsLast; - ideep::tensor mkldnn_grad_weight, mkldnn_grad_bias; + const ideep::tensor grad_y = itensor_from_tensor(grad_output); + const ideep::tensor x = itensor_from_tensor(input); + + ideep::tensor grad_w, grad_b; if (bias_defined) { ideep::convolution_backward_weights::compute( - mkldnn_input, - mkldnn_grad_output, + x, + grad_y, weight_size.vec(), - mkldnn_grad_weight, - mkldnn_grad_bias, + grad_w, + grad_b, stride.vec(), dilation.vec(), padding.vec(), @@ -184,10 +205,10 @@ std::tuple mkldnn_convolution_backward_weights( groups); } else { ideep::convolution_backward_weights::compute( - mkldnn_input, - mkldnn_grad_output, + x, + grad_y, weight_size.vec(), - mkldnn_grad_weight, + grad_w, stride.vec(), dilation.vec(), padding.vec(), @@ -195,20 +216,23 @@ std::tuple mkldnn_convolution_backward_weights( groups); } - return std::make_tuple( - mkldnn_to_dense(new_with_itensor_mkldnn(std::move(mkldnn_grad_weight), - optTypeMetaToScalarType(grad_output.options().dtype_opt()), - grad_output.options().device_opt())), - mkldnn_to_dense(new_with_itensor_mkldnn(std::move(mkldnn_grad_bias), - optTypeMetaToScalarType(grad_output.options().dtype_opt()), - grad_output.options().device_opt()))); + if (!is_channels_last) { + return std::make_tuple( + mkldnn_to_dense(MKLDNNTensor(grad_w, grad_output.options())), + bias_defined ? 
mkldnn_to_dense(MKLDNNTensor(grad_b, grad_output.options())) : Tensor()); + } else { + return std::make_tuple( + mkldnn_to_dense(MKLDNNTensor(grad_w, grad_output.options())).to(at::MemoryFormat::ChannelsLast), + bias_defined ? mkldnn_to_dense(MKLDNNTensor(grad_b, grad_output.options())) : Tensor()); + } } std::tuple mkldnn_convolution_backward( const Tensor& input, const Tensor& grad_output_t, const Tensor& weight, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, std::array output_mask) { - Tensor grad_output = grad_output_t.is_mkldnn() ? grad_output_t : grad_output_t.contiguous(); + auto memory_format = input.suggest_memory_format(); + Tensor grad_output = grad_output_t.is_mkldnn() ? grad_output_t : grad_output_t.contiguous(memory_format); Tensor grad_input, grad_weight, grad_bias; if (output_mask[0]) { diff --git a/aten/src/ATen/native/mkldnn/Gelu.cpp b/aten/src/ATen/native/mkldnn/Gelu.cpp index fa78cd1c3a96..1d2a67251513 100644 --- a/aten/src/ATen/native/mkldnn/Gelu.cpp +++ b/aten/src/ATen/native/mkldnn/Gelu.cpp @@ -1,17 +1,17 @@ #include #include #include - +#include #if !AT_MKLDNN_ENABLED() namespace at { namespace native { -Tensor mkldnn_gelu(const Tensor& input) { +Tensor mkldnn_gelu(const Tensor& input, c10::string_view approximate) { TORCH_CHECK(false, "mkldnn_gelu: ATen not compiled with MKLDNN support"); } -Tensor mkldnn_gelu_backward(const Tensor& grad_output, const Tensor& input) { +Tensor mkldnn_gelu_backward(const Tensor& grad_output, const Tensor& input, c10::string_view approximate) { TORCH_CHECK(false, "mkldnn_gelu_backward: ATen not compiled with MKLDNN support"); } @@ -24,11 +24,13 @@ Tensor mkldnn_gelu_backward(const Tensor& grad_output, const Tensor& input) { namespace at { namespace native { -Tensor mkldnn_gelu(const Tensor& input) { +Tensor mkldnn_gelu(const Tensor& input, c10::string_view approximate) { if (input.scalar_type() == ScalarType::BFloat16) { TORCH_CHECK(mkldnn_bf16_device_check(), "mkldnn_gelu: bf16 path needs the cpu support avx512bw, avx512vl and avx512dq"); } + TORCH_CHECK(get_gelutype_enum(approximate) == GeluType::None, + "mkldnn_gelu: fast, approximate gelu is not supported"); const ideep::tensor& x = itensor_from_tensor(input); ideep::tensor y; ideep::eltwise_forward::compute( @@ -37,7 +39,9 @@ Tensor mkldnn_gelu(const Tensor& input) { input.options().device_opt()); } -Tensor mkldnn_gelu_backward(const Tensor& grad_output, const Tensor& input) { +Tensor mkldnn_gelu_backward(const Tensor& grad_output, const Tensor& input, c10::string_view approximate) { + TORCH_CHECK(get_gelutype_enum(approximate) == GeluType::None, + "mkldnn_gelu_backward: fast, approximate gelu is not supported"); const ideep::tensor& x = itensor_from_tensor(input); ideep::tensor grady = itensor_from_tensor(grad_output); ideep::tensor gradx; diff --git a/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp b/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp index cfbbf5c6fa19..fbfb329a5e93 100644 --- a/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp +++ b/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp @@ -30,7 +30,7 @@ Tensor mkldnn_to_dense(const Tensor& mkldnn_tensor, c10::optional dt : stensor.to_public(cpu_tensor.template data_ptr(), ideep::tensor::data_type::bf16); cpu_tensor.as_strided_(dims, pub_tensor.get_strides()); - return cpu_tensor; + return cpu_tensor.contiguous(); } Tensor dense_to_mkldnn(const Tensor& cpu_tensor, c10::optional dtype) { @@ -43,7 +43,7 @@ Tensor dense_to_mkldnn(const Tensor& cpu_tensor, c10::optional dtype 
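The mkldnn_gelu change above threads the new approximate argument through and accepts only the exact ("none") variant. For reference, the two formulas the flag distinguishes are the erf-based definition and the common tanh approximation; the standalone comparison below uses the standard textbook formulas rather than anything taken from this diff:

```cpp
#include <cmath>
#include <cstdio>

// Exact GELU: x * Phi(x), with Phi the standard normal CDF.
double gelu_exact(double x) {
  return 0.5 * x * (1.0 + std::erf(x / std::sqrt(2.0)));
}

// Tanh approximation: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
double gelu_tanh(double x) {
  const double pi = std::acos(-1.0);
  const double k = std::sqrt(2.0 / pi);
  return 0.5 * x * (1.0 + std::tanh(k * (x + 0.044715 * x * x * x)));
}

int main() {
  const double xs[] = {-2.0, -0.5, 0.0, 0.5, 2.0};
  for (double x : xs) {
    std::printf("x=%5.2f exact=%.6f tanh=%.6f\n", x, gelu_exact(x), gelu_tanh(x));
  }
}
```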
"dense_to_mkldnn expects float or bfloat16 tensor input"); TORCH_CHECK(cpu_tensor.dim() <= 5, "Can't convert cpu tensor with the number of dimensions > 5"); - // TODO: consider to convert non-contiguous tensor to `ideep::tensor` directly. + // NOTE: forbid direct convert from non-contiguous (or channels last) to `ideep::tensor`. auto cpu_tensor_cont = cpu_tensor.contiguous(); auto data_type = dtype.has_value() ? dtype.value() : cpu_tensor.scalar_type(); TORCH_CHECK(data_type == ScalarType::Float || data_type == ScalarType::BFloat16, diff --git a/aten/src/ATen/native/mkldnn/Prelu.cpp b/aten/src/ATen/native/mkldnn/Prelu.cpp new file mode 100644 index 000000000000..acc78211d83c --- /dev/null +++ b/aten/src/ATen/native/mkldnn/Prelu.cpp @@ -0,0 +1,79 @@ +#include +#include +#include + + +#if !AT_MKLDNN_ENABLED() + +namespace at { namespace native { + +Tensor mkldnn_prelu(const Tensor& input, const Tensor& weight) { + TORCH_CHECK(false, "mkldnn_prelu: ATen not compiled with MKLDNN support"); +} + +std::tuple mkldnn_prelu_backward(const Tensor& grad_output, const Tensor& input, const Tensor& weight) { + TORCH_CHECK(false, "mkldnn_prelu_backward: ATen not compiled with MKLDNN support"); +} + +}} + +#else // AT_MKLDNN_EBABLED + +#include +#include + +namespace at { namespace native { + +Tensor mkldnn_prelu(const Tensor& input, const Tensor& weight) { + if (input.scalar_type() == ScalarType::BFloat16) { + TORCH_CHECK(mkldnn_bf16_device_check(), + "mkldnn_relu: bf16 path needs the cpu support avx512bw, avx512vl and avx512dq"); + } + + int64_t weight_num = weight.numel(); + if (weight_num != 1) { + int64_t channel_size = input.dim() > 1 ? input.size(1) : 1; + TORCH_CHECK(channel_size == weight_num, + "Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num, + " and channel size = ", channel_size, "."); + } + const ideep::tensor& x = itensor_from_mkldnn(input); + const ideep::tensor& w = itensor_from_tensor(weight); + + ideep::tensor y; + ideep::prelu_forward::compute( + x, w, y, ideep::prop_kind::forward_training); + return new_with_itensor_mkldnn(std::move(y), optTypeMetaToScalarType(input.options().dtype_opt()), + input.options().device_opt()); +} + +std::tuple mkldnn_prelu_backward(const Tensor& grad_output, const Tensor& input, const Tensor& weight) { + const ideep::tensor& x = itensor_from_mkldnn(input); + const ideep::tensor& w = itensor_from_tensor(weight); + const ideep::tensor grady = itensor_from_mkldnn(grad_output); + ideep::tensor gradx; + ideep::tensor gradw; + + ideep::prelu_backward::compute( + x, w, grady, gradx, gradw, ideep::prop_kind::backward); + if (weight.is_mkldnn()) { + return std::make_tuple( + new_with_itensor_mkldnn(std::move(gradx), + optTypeMetaToScalarType(grad_output.options().dtype_opt()), + grad_output.options().device_opt()), + new_with_itensor_mkldnn(std::move(gradw), + optTypeMetaToScalarType(weight.options().dtype_opt()), + weight.options().device_opt())); + } else { + return std::make_tuple( + new_with_itensor_mkldnn(std::move(gradx), + optTypeMetaToScalarType(grad_output.options().dtype_opt()), + grad_output.options().device_opt()), + mkldnn_to_dense(new_with_itensor_mkldnn(std::move(gradw), + optTypeMetaToScalarType(weight.options().dtype_opt()), + weight.options().device_opt()))); + } +} +}} + +#endif // AT_MKLDNN_EBABLED diff --git a/aten/src/ATen/native/mps/Copy.h b/aten/src/ATen/native/mps/Copy.h new file mode 100644 index 000000000000..1a4465e73538 --- /dev/null +++ b/aten/src/ATen/native/mps/Copy.h @@ -0,0 +1,28 @@ +// 
Copyright © 2022 Apple Inc. + +#pragma once +#include + +#include +#include +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#include +#include +#endif + +namespace at { +namespace native { +namespace mps { + +at::Tensor& mps_copy_(at::Tensor& dst, const at::Tensor& src, bool non_blocking); +void copy_blit_mps(void* dst, const void* src, size_t size); + +} // namespace mps +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/OperationUtils.h b/aten/src/ATen/native/mps/OperationUtils.h new file mode 100644 index 000000000000..26cae7238b70 --- /dev/null +++ b/aten/src/ATen/native/mps/OperationUtils.h @@ -0,0 +1,202 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#endif + +using namespace at::mps; + +namespace at { +namespace native { +namespace mps { + +struct TORCH_CUDA_CPP_API MPSGeneratorImpl : public c10::GeneratorImpl { + MPSGeneratorImpl(DeviceIndex device_index = -1); + ~MPSGeneratorImpl() = default; + + void set_current_seed(uint64_t seed) override; + uint64_t current_seed() const override; + uint64_t seed() override; + void set_state(const c10::TensorImpl& new_state) override; + c10::intrusive_ptr get_state() const override; + static DeviceType device_type(); + +private: + MPSGeneratorImpl* clone_impl() const override; + uint64_t seed_ = default_rng_seed_val; +}; + +const Generator& getDefaultMPSGenerator(); + +void runMPSGraph( + MPSStream* mpsStream, + MPSGraph* mpsGraph, + NSDictionary* feeds, + NSDictionary* results); + +MPSDataType getMPSDataType(ScalarType scalar_type); +MPSDataType getMPSScalarType(ScalarType scalar_type); +std::string getMPSTypeString(ScalarType scalar_type); +std::string getMPSShapeString(MPSShape* shape); +std::string getTensorsStringKey(const TensorList& tensors); +double getMPSScalarValue(const Tensor& t); +std::string getArrayRefString(const IntArrayRef s); +std::string getStridedKey(const Tensor& self, const IntArrayRef sz, + const IntArrayRef strides, int64_t offset); +id gatherViewTensor(const at::Tensor& src, id s); + +MPSShape* getMPSShape(const Tensor& t); +MPSShape* getMPSShape(IntArrayRef sizes); +MPSShape* getMPSShape(c10::MaybeOwned t); + +class Placeholder { + public: + Placeholder() : _placeholder(nullptr), _value(nullptr) {} + Placeholder(MPSGraphTensor* mpsGraphTensor) : _placeholder(mpsGraphTensor), _value(nullptr) {} + Placeholder(MPSGraphTensor* mpsGraphTensor, const Tensor& self, MPSShape *mpsShape = nullptr); + MPSGraphTensor* getMPSGraphTensor() { + return _placeholder; + } + MPSGraphTensorData* getMPSGraphTensorData() { + return _value; + } + bool isIntermediate() { + return _value == nullptr; + } + + private: + MPSGraphTensor* _placeholder; + MPSGraphTensorData* _value; +}; + +void resize_tensor(Tensor* output); +MPSGraphTensor* trunc_tensor(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor); +MPSGraphTensorData *getMPSGraphTensorData(MPSGraph* mpsGraph, + MPSStream* mpsStream, + const Tensor& tensor); + +MPSGraph* make_mps_graph(); +void printTensorNDArray(const Tensor& t); + +MPSGraphTensor* mpsGraphUnrankedPlaceHolder(MPSGraph *mpsGraph, MPSDataType dataType); +MPSGraphTensor* mpsGraphRankedPlaceHolder(MPSGraph *mpsGraph, MPSDataType dataType, MPSShape* mpsShape); +MPSGraphTensor* mpsGraphRankedPlaceHolder(MPSGraph *mpsGraph, const Tensor& tensor); +MPSGraphTensor* mpsGraphConstantFloatPlaceHolder(MPSGraph *mpsGraph, const double value, MPSShape* mpsShape); +MPSGraphTensor* 
mpsGraphConstantPlaceHolder(MPSGraph *mpsGraph, const double value, MPSShape* mpsShape, MPSDataType dataType); + +string get_mem_format_string(c10::MemoryFormat memory_format); + +using MPSCacheKey = int64_t; + +// derive this class to cache a graph and its inputs/ouputs +// can be used to store any NSObject +struct MPSCachedGraph +{ + MPSCachedGraph(NSObject *object) : _object([object retain]) {} + virtual ~MPSCachedGraph() { + [_object release]; + _object = nullptr; + } + MPSGraph *graph() const { return (MPSGraph *)_object; } + NSObject *object() const { return _object; } +private: + NSObject *_object = nullptr; +}; + +// TODO: Improve the overall design of MPSGraphCache. +// https://github.com/pytorch/pytorch/issues/77176 +// Cache holding various keys mapped to graphs + +struct MPSGraphCache +{ + typedef MPSCachedGraph * (^CreateCachedGraphBlock)(); + + struct CacheEntry { + CacheEntry(std::string key, MPSCachedGraph *cachedGraph) : cachedGraph_(cachedGraph), key_(key) {} + MPSCachedGraph* cachedGraph_ = nullptr; + std::string key_ = nullptr; + }; + + public: + + static MPSGraphCache* getInstance() { + if(_instance_cache == nullptr) { + _instance_cache = new MPSGraphCache(); + } + return _instance_cache; + } + + ~MPSGraphCache() { + dispatch_release(serialQueue_); + + for (auto i : cache_) { + delete i.second.cachedGraph_; + } + } + + // Disallow the copy constructor and operator= functions + MPSGraphCache(const MPSGraphCache&) = delete; + void operator=(const MPSGraphCache&) = delete; + + MPSCachedGraph* CreateCachedGraph(const std::string& key, CreateCachedGraphBlock createCacheBlock) { + + __block MPSCachedGraph * result = nil; + + MPSCacheKey hash = std::hash{}(key); + + dispatch_sync(serialQueue_, ^() { + + // verify the cached entry doesn't already exist + if (cache_.count(hash) != 0) { + auto& entry = cache_.at(hash); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(key == entry.key_, "Key collision in the MPS cached graph!\n"); + result = entry.cachedGraph_; + } + else { + result = createCacheBlock(); + CacheEntry entry(key, result); + cache_.emplace(hash, entry); + } + }); + return result; + } + + MPSCachedGraph* LookUp(const std::string& key) const { + + __block MPSCachedGraph* result = nullptr; + + MPSCacheKey hash = std::hash{}(key); + + dispatch_sync(serialQueue_, ^() { + + if (cache_.count(hash) != 0) { + auto& entry = cache_.at(hash); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(key == entry.key_, "Key collision in the MPS cached graph!\n"); + result = entry.cachedGraph_; + } + }); + return result; + } + private: + MPSGraphCache() { + serialQueue_ = dispatch_queue_create("cache queue", DISPATCH_QUEUE_SERIAL); + } + + static MPSGraphCache* _instance_cache; + std::unordered_map cache_; + dispatch_queue_t serialQueue_ = nullptr; + +}; + +} // namespace mps +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/OperationUtils.mm b/aten/src/ATen/native/mps/OperationUtils.mm new file mode 100644 index 000000000000..ea0d153d0ecc --- /dev/null +++ b/aten/src/ATen/native/mps/OperationUtils.mm @@ -0,0 +1,447 @@ +// Copyright © 2022 Apple Inc. 
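The MPSGraphCache declared above stores one compiled graph per key string: the key is hashed to an MPSCacheKey, all reads and writes are funneled through dispatch_sync on a private serial queue, and a hash collision is caught by comparing the stored key. A rough standard-C++ analogue of that lookup-or-build flow follows; the types and names below are invented for illustration, the real cache is the Objective-C++ class shown above.

#include <cstdint>
#include <functional>
#include <mutex>
#include <string>
#include <unordered_map>

// Stand-in for MPSCachedGraph; the real class retains an NSObject*.
struct CachedGraphStub { std::string key; };

class GraphCacheSketch {
 public:
  // Mirrors CreateCachedGraph: hash the key, reuse on a hit, build on a miss.
  // A mutex plays the role of the serial dispatch queue.
  CachedGraphStub* getOrCreate(const std::string& key,
                               const std::function<CachedGraphStub*()>& build) {
    const uint64_t hash = std::hash<std::string>{}(key);
    std::lock_guard<std::mutex> guard(mutex_);
    auto it = cache_.find(hash);
    if (it != cache_.end()) {
      return it->second;            // cache hit: the compiled graph is reused
    }
    CachedGraphStub* graph = build();
    cache_.emplace(hash, graph);
    return graph;
  }

 private:
  std::mutex mutex_;
  std::unordered_map<uint64_t, CachedGraphStub*> cache_;
};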
+ +#include + +namespace at { +namespace native { +namespace mps { + +uint64_t MPSGeneratorImpl::seed() { + auto random = c10::detail::getNonDeterministicRandom(true); + this->set_current_seed(random); + return random; +} +uint64_t MPSGeneratorImpl::current_seed() const { + return seed_; +} + +void MPSGeneratorImpl::set_current_seed(uint64_t seed) { + seed_ = seed; +} + +MPSGeneratorImpl::MPSGeneratorImpl(DeviceIndex device_index) + : c10::GeneratorImpl{Device(DeviceType::MPS, device_index), + DispatchKeySet(c10::DispatchKey::MPS)} { +} + +const Generator& getDefaultMPSGenerator() { + auto gen = make_generator(0); + gen.seed(); + return gen; +} +DeviceType MPSGeneratorImpl::device_type() { + return DeviceType::MPS; +} +c10::intrusive_ptr MPSGeneratorImpl::get_state() const { + static const size_t seed_size = sizeof(uint64_t); + static const size_t offset_size = sizeof(int64_t); + static const size_t total_size = seed_size + offset_size; + + auto state_tensor = at::detail::empty_cpu({(int64_t)total_size}, ScalarType::Byte, c10::nullopt, c10::nullopt, c10::nullopt, c10::nullopt); + auto rng_state = state_tensor.data_ptr(); + + return state_tensor.getIntrusivePtr(); +} + +void MPSGeneratorImpl::set_state(const c10::TensorImpl& new_state) { + static const size_t seed_size = sizeof(uint64_t); + static const size_t offset_size = sizeof(int64_t); + static const size_t total_size = seed_size + offset_size; + + detail::check_rng_state(new_state); + + auto new_state_size = new_state.numel(); + + uint64_t input_seed; + auto new_rng_state = new_state.data(); + memcpy(&input_seed, new_rng_state, seed_size); + this->set_current_seed(input_seed); +} + +MPSGeneratorImpl* MPSGeneratorImpl::clone_impl() const { + auto gen = new MPSGeneratorImpl(0); + gen->set_current_seed(this->seed_); + return gen; +} + +std::string getStridedKey(const Tensor& self, const IntArrayRef sz, + const IntArrayRef strides, int64_t offset) { + // TODO: move storage_offset to a PlaceholderTensor and strides to a + // tensor too, to avoid too many cache entries. 
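// (Annotation, not part of the patch.) The key returned below concatenates the
// raw storage pointer (as a decimal number), the sizes, the strides, and the
// storage offset, e.g. roughly "140211223040:2,3,:1,2,:4" for a 2x3 view with
// strides (1, 2) at offset 4; two views of the same storage therefore only
// share a cached gather graph when their geometry matches exactly.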
+ return std::to_string((uintptr_t)self.storage().data()) + + ":" + mps::getArrayRefString(sz) + + ":" + mps::getArrayRefString(strides) + + ":" + std::to_string(offset); +} + +void runMPSGraph( + MPSStream* mpsStream, + MPSGraph* mpsGraph, + NSDictionary* feeds, + NSDictionary* results) { + + dispatch_sync(mpsStream->queue(), ^() { + @autoreleasepool { + mpsStream->commit(true); + id commandQueue = mpsStream->commandQueue(); + MPSGraphExecutionDescriptor *executionDescriptor = [[MPSGraphExecutionDescriptor new] autorelease]; + + executionDescriptor.completionHandler = ^(NSDictionary * resultsDictionary, + NSError * _Nullable error) { + }; + + [mpsGraph runAsyncWithMTLCommandQueue:commandQueue + feeds:feeds + targetOperations:nil + resultsDictionary:results + executionDescriptor:executionDescriptor]; + + } + }); +} + +MPSDataType getMPSDataType(ScalarType scalar_type) { + switch (scalar_type) { + case ScalarType::Float: + return MPSDataTypeFloat32; + case ScalarType::Half: + return MPSDataTypeFloat16; + case ScalarType::Int: + return MPSDataTypeInt32; + case ScalarType::Long: + return MPSDataTypeInt64; + case ScalarType::Short: + return MPSDataTypeInt16; + case ScalarType::Byte: + return MPSDataTypeInt8; + case ScalarType::Bool: + return MPSDataTypeBool; + default: + TORCH_CHECK_TYPE(false, "Trying to convert ", scalar_type, " to the MPS backend but there is no mapping for it.") + } +} + +MPSDataType getMPSScalarType(ScalarType scalar_type) { + switch (scalar_type) { + // This is an intentional fallthrough supporting Double for Scalar + // types as they are casted to Float32 currently. + case ScalarType::Double: + case ScalarType::Float: + return MPSDataTypeFloat32; + case ScalarType::Half: + return MPSDataTypeFloat16; + case ScalarType::Int: + return MPSDataTypeInt32; + case ScalarType::Long: + return MPSDataTypeInt64; + case ScalarType::Short: + return MPSDataTypeInt16; + case ScalarType::Byte: + return MPSDataTypeInt8; + case ScalarType::Bool: + return MPSDataTypeBool; + default: + TORCH_INTERNAL_ASSERT(false, "Trying to convert ", scalar_type, " to the MPS backend but there is no mapping for it.") + } +} + +std::string getMPSTypeString(ScalarType scalar_type) { + switch (scalar_type) { + case ScalarType::Double: + case ScalarType::Float: + return "MPSDataTypeFloat32"; + case ScalarType::Half: + return "MPSDataTypeFloat16"; + case ScalarType::Int: + return "MPSDataTypeInt32"; + case ScalarType::Long: + return "MPSDataTypeInt64"; + case ScalarType::Short: + return "MPSDataTypeInt16"; + case ScalarType::Byte: + return "MPSDataTypeInt8"; + case ScalarType::Bool: + return "MPSDataTypeBool"; + default: + return "Undefined"; + } +} + +std::string getMPSShapeString(MPSShape* shape) { + std::string str; + for(NSNumber *elem in shape) { + str += std::to_string(elem.unsignedLongValue) + ","; + } + return str; +} + +std::string getArrayRefString(const IntArrayRef s) { + std::stringstream ss; + std::copy(s.begin(), s.end(), std::ostream_iterator(ss, ",")); + return ss.str(); +} + +std::string getTensorsStringKey(const TensorList& tensors) { + std::string str; + // The key format per tensor would look like ":MPSDataTypeFloat32[1,1,1,10]:" + for (const Tensor& tensor: tensors) { + str += ":"; + if (tensor.defined()) { + str += getMPSTypeString(tensor.scalar_type()) + "["; + // if tensor is a scalar + if (tensor.dim() == 0) { + str += std::to_string(getMPSScalarValue(tensor)); + } else { + const NSString* ns_shape_key = [[getMPSShape(tensor) valueForKey:@"description"] componentsJoinedByString:@","]; 
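// (Annotation, not part of the patch.) The KVC call above maps each NSNumber in
// the MPSShape to its "description" string and joins them with commas, so a
// tensor of shape {1, 1, 1, 10} contributes "1,1,1,10" inside the brackets that
// the surrounding code appends to the cache key.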
+ str += std::string(ns_shape_key.UTF8String); + } + str += "]"; + } else { + str += "Undefined"; + } + } + return str; +} + +double getMPSScalarValue(const Tensor& t) { + assert (t.dim() == 0); // only applicable for scalar types + auto other_value = t.item(); + return other_value.to(); +} + +MPSShape* getMPSShape(const Tensor& t) { + const int sz = t.dim(); + const int sz_ = (sz > 0) ? sz : 1; + + NSNumber* numbers[sz_]; + + for (int i = 0; i < sz_; i++) + { + NSInteger sz_i = (i < sz) ? t.size(i) : 1; + + NSNumber* number = [NSNumber numberWithInt:sz_i]; + numbers[i] = number; + } + return [NSArray arrayWithObjects:numbers count:sz_]; +} + +MPSShape* getMPSShape(c10::MaybeOwned t) { + const Tensor& t_ = *t; + return getMPSShape(t_); +} + +MPSShape* getMPSShape(IntArrayRef sizes) { + const int sz = sizes.size(); + const int sz_ = (sz > 0) ? sz : 1; + + NSNumber* numbers[sz_]; + + for (int i = 0; i < sz_; i++) + { + NSInteger sz_i = (i < sz) ? sizes[i] : 1; + + NSNumber* number = [NSNumber numberWithInt:sz_i]; + numbers[i] = number; + } + return [NSArray arrayWithObjects:numbers count:sz_]; +} + +void printTensorNDArray(const Tensor& t) { + if (!t.is_mps()) return; + if(t.numel() == 0) + { + std::cout << "Empty tensor" << std::endl; + return; + } + // Get shape and data type + auto selfShape = getMPSShape(t); + auto selfDType = getMPSDataType(t.scalar_type()); + + // Initialize data + id selfBuf = __builtin_bit_cast(id, t.storage().data()); + MPSGraphTensorData* tdata = [[MPSGraphTensorData alloc] initWithMTLBuffer:selfBuf + shape:selfShape + dataType:selfDType]; + [tdata printNDArray]; +} + +id gatherViewTensor(const at::Tensor& src, id sourceBuffer) { + assert (!src.is_contiguous()); + id device = MPSDevice::getInstance()->device(); + MPSStream* stream = getCurrentMPSStream(); + @autoreleasepool { + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; + IntArrayRef size_; + IntArrayRef stride_; + int64_t storage_offset_; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + string key = getStridedKey(src, src.sizes(), src.strides(), src.storage_offset()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if (cachedGraph) { + @autoreleasepool { + MPSGraphTensor* inputTensor = cachedGraph->inputTensor_; + auto output = at::native::empty_mps( + src.sizes(), + src.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + MPSGraphTensorData* inputTensorData = [[MPSGraphTensorData alloc] initWithMTLBuffer: sourceBuffer + shape: [inputTensor shape] + dataType: [inputTensor dataType]]; + id resultBuffer = __builtin_bit_cast(id, output.storage().data()); + MPSGraphTensorData* outputTensorData = [[MPSGraphTensorData alloc] initWithMTLBuffer: resultBuffer + shape: getMPSShape(src.sizes()) + dataType: getMPSDataType(src.scalar_type())]; + NSDictionary* feeds = @{ + inputTensor : inputTensorData + }; + + NSDictionary* results = @{ + cachedGraph->outputTensor_ : outputTensorData + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); +#if _DEBUG + NSLog(@"%@", [cachedGraph->graph() debugDescription]); + TORCH_WARN("We have a non-contiguous tensor in copy_from_mps with key ", key); + + //// Update the Blit sourceBuffer to the result of this operation + printTensorNDArray(output); +#endif + return resultBuffer; + } + } + } + return nil; +} + + + +Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor, const Tensor& self, 
MPSShape *mpsShape) +{ + TORCH_CHECK(self.is_mps(), "Placeholder storage has not been allocated on MPS device!"); + // extract the pointer to MTLBuffer from the Tensor's storage + id selfBuf = __builtin_bit_cast(id, self.storage().data()); + const size_t buf_size = [selfBuf length]; + + // tensor.numel() could be zero, but tensor is valid as long as the buffer size is non-zero. + // if buf_size is zero in here, it's not a user error. It could be a missing check for + // tensor.numel() == 0 in our internal implementations of ops. + TORCH_INTERNAL_ASSERT(buf_size > 0, "Placeholder tensor is empty!"); + + TORCH_CHECK(self.storage().nbytes() <= buf_size, "Placeholder buffer size (", buf_size, + ") is not large enough to contain the Tensor storage of size ", self.storage().nbytes()); + + const MPSDataType mpsDataType = getMPSDataType(self.scalar_type()); + if (!mpsShape) + mpsShape = getMPSShape(self); + + _value = [[MPSGraphTensorData alloc] initWithMTLBuffer:selfBuf + shape:mpsShape + dataType:mpsDataType]; + TORCH_INTERNAL_ASSERT(_value); + _placeholder = mpsGraphTensor; +} + +MPSGraphTensorData *getMPSGraphTensorData(MPSGraph* mpsGraph, + MPSStream* mpsStream, + const Tensor& tensor) { + auto mpsShape = getMPSShape(tensor); + auto dataType = getMPSDataType(tensor.scalar_type()); + + MPSGraphTensorData *result = nil; + if (tensor.numel() > 0) { + id buf = __builtin_bit_cast(id, tensor.storage().data()); + result = [[[MPSGraphTensorData alloc] initWithMTLBuffer:buf + shape:mpsShape + dataType:dataType] + autorelease]; + } else { + // create empty NDArray + MPSNDArrayDescriptor *desc = [MPSNDArrayDescriptor descriptorWithDataType:dataType + shape:mpsShape]; + MPSNDArray *emptyArray = [[[MPSNDArray alloc] + initWithDevice:mpsStream->device() descriptor:desc] autorelease]; + result = [[[MPSGraphTensorData alloc] initWithMPSNDArray:emptyArray] autorelease]; + } + assert(result); + return result; +} + +void resize_tensor(Tensor* output) { + output->resize_(output->sizes()); +} + +MPSGraph* make_mps_graph() { + MPSGraph* mpsGraph = [[MPSGraph new] autorelease]; + mpsGraph.options = MPSGraphOptionsNone; + return mpsGraph; +} + +MPSGraphTensor* mpsGraphConstantFloatPlaceHolder(MPSGraph *mpsGraph, const double value, MPSShape* mpsShape) { + // "value" is always double, so is the Placeholder's type (we only support Float32). + return [mpsGraph constantWithScalar:value + shape:mpsShape + dataType:MPSDataTypeFloat32]; +} + +MPSGraphTensor* mpsGraphConstantPlaceHolder(MPSGraph *mpsGraph, const double value, MPSShape* mpsShape, MPSDataType dataType) { + // Bool is not handled by constantWithScalar + MPSGraphTensor* constPlaceHolder = [mpsGraph constantWithScalar:value + shape:mpsShape + dataType:(dataType == MPSDataTypeBool ? 
MPSDataTypeFloat32 : dataType)]; + if (dataType == MPSDataTypeBool) + return [mpsGraph castTensor:constPlaceHolder toType:MPSDataTypeBool name:@"ConstantBoolTensor"]; + + return constPlaceHolder; +} + +MPSGraphTensor* mpsGraphUnrankedPlaceHolder(MPSGraph *mpsGraph, MPSDataType dataType) { + return [mpsGraph placeholderWithShape:nil + dataType:dataType + name:nil]; +} + +MPSGraphTensor* mpsGraphRankedPlaceHolder(MPSGraph *mpsGraph, MPSDataType dataType, MPSShape* mpsShape) { + return [mpsGraph placeholderWithShape:mpsShape + dataType:dataType + name:nil]; +} + +MPSGraphTensor* mpsGraphRankedPlaceHolder(MPSGraph *mpsGraph, const Tensor& tensor) { + return [mpsGraph placeholderWithShape:getMPSShape(tensor) + dataType:getMPSDataType(tensor.scalar_type()) + name:nil]; +} + + +string get_mem_format_string(c10::MemoryFormat memory_format) { + string mem_format_key; + switch(memory_format) { + case at::MemoryFormat::Contiguous: + mem_format_key = "Contiguous"; + break; + case at::MemoryFormat::ChannelsLast: + mem_format_key = "ChannelsLast"; + break; + default: + assert(0 && "Invalid memory format\n"); + } + + return mem_format_key; +} + +MPSGraphCache* MPSGraphCache::_instance_cache = nullptr; + +} // namespace mps +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/TensorFactory.cpp b/aten/src/ATen/native/mps/TensorFactory.cpp new file mode 100644 index 000000000000..78899fc8fa3c --- /dev/null +++ b/aten/src/ATen/native/mps/TensorFactory.cpp @@ -0,0 +1,136 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include +#include +#include +#include +#include +#include +namespace at { namespace native { + +static inline void maybe_resize_storage_mps(TensorImpl* self, uint64_t new_size) { + if (new_size == 0) { + return; + } + + auto storage = self->storage().unsafeGetStorageImpl(); + if (!storage) { + TORCH_CHECK(false, "Tensor: invalid null storage"); + } + uint64_t new_size_bytes = (new_size + self->storage_offset()) * self->dtype().itemsize(); + if (new_size_bytes > self->storage().nbytes()) { + if (new_size_bytes == 0) { + storage->set_data_ptr_noswap(at::DataPtr(nullptr, at::Device(at::DeviceType::MPS, 0))); + storage->set_nbytes(0); + } else { + at::DataPtr new_data = storage->allocator()->allocate(new_size_bytes); + size_t copy_capacity = std::min(new_size_bytes, storage->nbytes()); + if (storage->data() && copy_capacity > 0) { + at::native::mps::copy_blit_mps(new_data.get(), storage->data(), copy_capacity); + } + // Destructively overwrite data_ptr + storage->set_data_ptr_noswap(std::move(new_data)); + storage->set_nbytes(new_size_bytes); + } + } +} + +inline TensorImpl* resize_impl_mps_( + TensorImpl* self, + IntArrayRef size, + c10::optional stride, + bool device_guard = true) { + if (self->sizes() == size && (!stride || self->strides() == stride)) { + return self; + } + + int64_t storage_size = 1; + if (stride) { + self->set_sizes_and_strides(size, *stride); + // NB: storage size can be different from numel. 
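// (Annotation, not part of the patch.) With explicit strides the storage
// requirement is governed by the furthest reachable element, roughly
// 1 + sum_i (size[i] - 1) * stride[i], which storage_size_for computes below;
// for overlapping or padded strides this can differ from numel() in either
// direction.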
+ storage_size = storage_size_for(size, *stride); + } else { + self->set_sizes_contiguous(size); + storage_size = self->numel(); + } + maybe_resize_storage_mps(self, storage_size); + + return self; +} + +Tensor empty_mps( + IntArrayRef size, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt, + c10::optional memory_format_opt) { + + return at::detail::empty_mps(size, dtype_opt, layout_opt, device_opt, pin_memory_opt, memory_format_opt); +} + +Tensor empty_strided_mps( + IntArrayRef size, + IntArrayRef stride, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt) { + check_size_nonnegative(size); + // empty memory formatempty + auto t = at::native::empty_mps( + {0}, + dtype_opt, + layout_opt, + device_opt, + pin_memory_opt); + resize_impl_mps_(t.unsafeGetTensorImpl(), size, stride); + return t; +} + +const Tensor& resize_mps_( + const Tensor& self, + IntArrayRef size, + c10::optional optional_memory_format) { + if (self.has_names()) { + return resize_named_tensor_(self, size, optional_memory_format); + } + auto* self_ = self.unsafeGetTensorImpl(); + resize_impl_mps_(self_, size, /*strides=*/c10::nullopt); + if (optional_memory_format.has_value()) { + auto memory_format = + optional_memory_format.value(); + TORCH_CHECK( + memory_format != MemoryFormat::Preserve, + "Unsupported memory format", + memory_format); + self_->empty_tensor_restride(memory_format); + } + return self; +} + +Tensor& set_mps_(Tensor& result) { + caffe2::TypeMeta dtype = result.dtype(); + Storage storage( + Storage::use_byte_size_t(), + 0, + at::mps::GetMPSAllocator(), + true); + result.set_(storage, 0, {0}, {}); + TORCH_INTERNAL_ASSERT(dtype == result.dtype()); + return result; +} + +Tensor& set_storage_mps_(Tensor& result, Storage storage, int64_t storage_offset, IntArrayRef size, IntArrayRef stride) { + checkSetStorage(result, storage, storage_offset, size, stride); + //std::cout << "set storage_mps " << storage_offset << " stride " << stride << std::endl; + result.unsafeGetTensorImpl()->set_storage_offset(storage_offset); + c10::optional stride_opt = stride.data() != nullptr ? + c10::optional(stride) : c10::nullopt; + at::native::resize_impl_mps_(result.unsafeGetTensorImpl(), size, stride_opt); + return result; +} +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/TensorFactory.h b/aten/src/ATen/native/mps/TensorFactory.h new file mode 100644 index 000000000000..cb7931deb6bc --- /dev/null +++ b/aten/src/ATen/native/mps/TensorFactory.h @@ -0,0 +1,17 @@ +// Copyright © 2022 Apple Inc. + +#define AT_DISPATCH_MPS_TYPES(TYPE, NAME, ...) 
\ + [&] { \ + const auto& the_type = TYPE; \ + at::ScalarType _st = ::detail::scalar_type(the_type); \ + RECORD_KERNEL_FUNCTION_DTYPE(NAME, _st); \ + switch (_st) { \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Int, int32_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Long, int64_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Short, int16_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Half, at::Half, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ + } \ + }() diff --git a/aten/src/ATen/native/mps/operations/Activation.mm b/aten/src/ATen/native/mps/operations/Activation.mm new file mode 100644 index 000000000000..b0a1fe4bbcea --- /dev/null +++ b/aten/src/ATen/native/mps/operations/Activation.mm @@ -0,0 +1,1570 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#endif + +using namespace at::mps; + +namespace at { +namespace native { + +Tensor relu_mps(const Tensor& self) { + using namespace mps; + Tensor output = at::empty_like(self); + resize_tensor(&output); + TORCH_CHECK(output.is_mps()); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + + @autoreleasepool { + string key = "relu" + getTensorsStringKey({self}); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + // passing selector of reLUWithTensor on the mpsGraph object + MPSGraphTensor* outputTensor = [mpsGraph reLUWithTensor:inputTensor + name:nil]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } + + return output; +} + +Tensor & relu_mps_(Tensor & self) { + using namespace mps; + // Inplace relu + Tensor &output = self; + TORCH_CHECK(output.is_mps()); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + + @autoreleasepool { + string key = "relu_" + getTensorsStringKey({self}); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ 
MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + // passing selector of reLUWithTensor on the mpsGraph object + MPSGraphTensor* outputTensor = [mpsGraph reLUWithTensor:inputTensor + name:nil]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } + + return output; +} + +TORCH_IMPL_FUNC(leaky_relu_out_mps) ( + const Tensor& self, const Scalar& negative_slope, const Tensor& output) { + using namespace mps; + TORCH_CHECK(output.is_mps()); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream *stream = getCurrentMPSStream(); + + @autoreleasepool { + + string key = "leaky_relu" + getTensorsStringKey({self}) + ":" + to_string(negative_slope.to()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + + MPSGraphTensor* negSlopeTensor = [mpsGraph constantWithScalar:negative_slope.to() + shape:@[@1] + dataType:getMPSDataType(self.scalar_type())]; + MPSGraphTensor* negSlopeMulXTensor = [mpsGraph multiplicationWithPrimaryTensor:inputTensor + secondaryTensor:negSlopeTensor + name:nil]; + MPSGraphTensor* outputTensor = [mpsGraph maximumWithPrimaryTensor:negSlopeMulXTensor + secondaryTensor:inputTensor + name:nil]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } + +} + +TORCH_IMPL_FUNC(leaky_relu_backward_out_mps) ( + const Tensor& grad_output, + const Tensor& self, + const Scalar& negative_slope, + bool self_is_result, + const Tensor& output ) { + + using namespace mps; + TORCH_CHECK(output.is_mps()); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : 
MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* gradOutputTensor_ = nil; + MPSGraphTensor* gradInputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream *stream = getCurrentMPSStream(); + + @autoreleasepool { + + string key = "leaky_relu_backward" + getTensorsStringKey({self, grad_output}) + ":" + to_string(negative_slope.to()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + MPSGraphTensor* gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output); + + MPSGraphTensor* negSlopeTensor = [mpsGraph constantWithScalar:negative_slope.to() + shape:@[@1] + dataType:getMPSScalarType(self.scalar_type())]; + MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0f + shape:@[@1] + dataType:getMPSScalarType(self.scalar_type())]; + MPSGraphTensor* predicateTensor = [mpsGraph greaterThanWithPrimaryTensor:inputTensor + secondaryTensor:zeroTensor + name:nil]; + MPSGraphTensor* gradientsMulNegSlopeTensor = [mpsGraph multiplicationWithPrimaryTensor:gradOutputTensor + secondaryTensor:negSlopeTensor + name:nil]; + MPSGraphTensor* gradInputTensor = [mpsGraph selectWithPredicateTensor:predicateTensor + truePredicateTensor:gradOutputTensor + falsePredicateTensor:gradientsMulNegSlopeTensor + name:nil]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->gradOutputTensor_ = gradOutputTensor; + newCachedGraph->gradInputTensor_ = gradInputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output); + Placeholder outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, output); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } +} + + +TORCH_IMPL_FUNC(log_softmax_mps_out) ( + const Tensor &self, + const int64_t dim, + const bool half_to_float, + const Tensor &out) { + using namespace mps; + + if (self.numel() == 0) { + return; + } + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + string key = "log_softmax_mps_out" + getTensorsStringKey({self}) + ":" + to_string(dim); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph* newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + 
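// (Annotation, not part of the patch.) The nodes built below compute
// log_softmax directly as log(softmax(x, dim)): a softMax op over the requested
// axis followed by a logarithm op, constructed once and then reused through the
// graph cache under the key assembled above.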
MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + + MPSGraphTensor* softmaxTensor = [mpsGraph softMaxWithTensor:inputTensor + axis:dim + name:nil]; + MPSGraphTensor* outputTensor = [mpsGraph logarithmWithTensor:softmaxTensor + name:nil]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + +} + +TORCH_IMPL_FUNC(log_softmax_backward_mps_out) ( + const Tensor& grad_output, + const Tensor& output, + int64_t dim, + ScalarType input_dtype, + const Tensor& out) { + using namespace mps; + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* outputTensor_ = nil; + MPSGraphTensor* gradOutputTensor_ = nil; + MPSGraphTensor* gradInputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + string key = "log_softmax_backward_mps_out:" + getMPSTypeString(grad_output.scalar_type()) + ":" + to_string(dim); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph* newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* gradOutputTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(grad_output.scalar_type())); + MPSGraphTensor* outputTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(output.scalar_type())); + + MPSGraphTensor* expTensor = [mpsGraph exponentWithTensor:outputTensor + name:nil]; + MPSGraphTensor* sumTensor = [mpsGraph reductionSumWithTensor:gradOutputTensor + axis:dim + name:nil]; + MPSGraphTensor* multiplicationTensor = [mpsGraph multiplicationWithPrimaryTensor:expTensor + secondaryTensor:sumTensor + name:nil]; + MPSGraphTensor* resultTensor = [mpsGraph subtractionWithPrimaryTensor:gradOutputTensor + secondaryTensor:multiplicationTensor + name:nil]; + + newCachedGraph->gradOutputTensor_ = gradOutputTensor; + newCachedGraph->outputTensor_ = outputTensor; + newCachedGraph->gradInputTensor_ = resultTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder gradPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + Placeholder resultPlaceholder = Placeholder(cachedGraph->gradInputTensor_, out); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + gradPlaceholder.getMPSGraphTensor() : gradPlaceholder.getMPSGraphTensorData(), + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + resultPlaceholder.getMPSGraphTensor() : resultPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, 
cachedGraph->graph(), feeds, results); + } + +} + +TORCH_IMPL_FUNC(sigmoid_out_mps)( + const Tensor& self, + const Tensor& output) { + using namespace mps; + TORCH_CHECK(output.is_mps()); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + + @autoreleasepool { + string key = "sigmoid_out_mps" + getTensorsStringKey({self}); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + // Initialize graph + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + + MPSGraphTensor* outputTensor = [mpsGraph sigmoidWithTensor:inputTensor + name:nil]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } + +} + +TORCH_IMPL_FUNC(sigmoid_backward_out_mps)( + const Tensor& grad_output, + const Tensor& output, + const Tensor& grad_input) { + using namespace mps; + TORCH_CHECK(grad_input.is_mps()); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *gradOutputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + MPSGraphTensor *gradInputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + + @autoreleasepool { + string key = "sigmoid_backward_out_mps:" + getMPSTypeString(grad_output.scalar_type()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* gradOutputTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(grad_output.scalar_type())); + MPSGraphTensor* outputTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(output.scalar_type())); + + MPSGraphTensor* unitTensor = [mpsGraph constantWithScalar:1.0 + shape:@[@1] + dataType:getMPSDataType(grad_output.scalar_type())]; + MPSGraphTensor* oneMinusSigmoidTensor = [mpsGraph subtractionWithPrimaryTensor:unitTensor + secondaryTensor:outputTensor + name:nil]; + MPSGraphTensor* timesTensor = [mpsGraph multiplicationWithPrimaryTensor:oneMinusSigmoidTensor + secondaryTensor:outputTensor + name:nil]; + MPSGraphTensor* gradInputTensor = [mpsGraph multiplicationWithPrimaryTensor:gradOutputTensor + secondaryTensor:timesTensor + name:nil]; + + 
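// (Annotation, not part of the patch.) The three nodes above encode the sigmoid
// derivative in terms of the forward output: grad_input = grad_output * output
// * (1 - output), so the backward graph never needs the original input tensor.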
newCachedGraph->gradOutputTensor_ = gradOutputTensor; + newCachedGraph->outputTensor_ = outputTensor; + newCachedGraph->gradInputTensor_ = gradInputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } + +} + +TORCH_IMPL_FUNC(tanh_backward_out_mps)( + const Tensor& grad_output, + const Tensor& output, + const Tensor& grad_input) { + using namespace mps; + TORCH_CHECK(grad_input.is_mps()); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *gradOutputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + MPSGraphTensor *gradInputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + + @autoreleasepool { + string key = "tanh_backward_out_mps:" + getMPSTypeString(grad_output.scalar_type()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* gradOutputTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(grad_output.scalar_type())); + MPSGraphTensor* outputTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(output.scalar_type())); + + MPSGraphTensor* unitTensor = [mpsGraph constantWithScalar:1.0 + shape:@[@1] + dataType:getMPSDataType(grad_output.scalar_type())]; + MPSGraphTensor* tanh2Tensor = [mpsGraph squareWithTensor:outputTensor + name:nil]; + MPSGraphTensor* oneMinusTanh2Tensor = [mpsGraph subtractionWithPrimaryTensor:unitTensor + secondaryTensor:tanh2Tensor + name:nil]; + MPSGraphTensor* gradInputTensor = [mpsGraph multiplicationWithPrimaryTensor:gradOutputTensor + secondaryTensor:oneMinusTanh2Tensor + name:nil]; + + newCachedGraph->gradOutputTensor_ = gradOutputTensor; + newCachedGraph->outputTensor_ = outputTensor; + newCachedGraph->gradInputTensor_ = gradInputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + gradInputPlaceholder.getMPSGraphTensor() : 
gradInputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } +} + +TORCH_IMPL_FUNC(threshold_out_mps)( + const Tensor& self, + const Scalar& threshold, + const Scalar& value, + const Tensor& result) { + using namespace mps; + TORCH_CHECK(self.is_mps()); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + + @autoreleasepool { + string key = "threshold_out_mps" + getTensorsStringKey({self}) + ":" + + to_string(threshold.to()) + ":" + + to_string(value.to()); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor *inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + + MPSGraphTensor *thresholdTensor = [mpsGraph constantWithScalar: threshold.to() + shape: @[@1] + dataType: getMPSDataType(self.scalar_type())]; + + MPSGraphTensor *valueTensor = [mpsGraph constantWithScalar: value.to() + shape: @[@1] + dataType: getMPSDataType(self.scalar_type())]; + + // x > threshold + MPSGraphTensor *predicateTensor = [mpsGraph greaterThanWithPrimaryTensor: inputTensor + secondaryTensor: thresholdTensor + name: nil]; + + // result = (self > threshold) ? self : value + MPSGraphTensor *outputTensor = [mpsGraph selectWithPredicateTensor: predicateTensor + truePredicateTensor: inputTensor + falsePredicateTensor: valueTensor + name: nil]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } +} + +TORCH_IMPL_FUNC(threshold_backward_out_mps)( + const Tensor& grad, + const Tensor& self, + const Scalar& threshold, + const Tensor& gradInput) { + using namespace mps; + TORCH_CHECK(self.is_mps()); + TORCH_CHECK(grad.is_mps()); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *gradTensor_ = nil; + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *gradInputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + + @autoreleasepool { + string key = "threshold_backward_out_mps" + getTensorsStringKey({self, grad}) + ":" + + to_string(threshold.to()); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new 
CachedGraph(mpsGraph); + + MPSGraphTensor *inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + MPSGraphTensor *gradTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad); + + MPSGraphTensor *thresholdTensor = [mpsGraph constantWithScalar: threshold.to() + shape: @[@1] + dataType: getMPSDataType(self.scalar_type())]; + + MPSGraphTensor *zeroTensor = [mpsGraph constantWithScalar: 0.0 + dataType: inputTensor.dataType]; + + // x > threshold + MPSGraphTensor *predicateTensor = [mpsGraph greaterThanWithPrimaryTensor: inputTensor + secondaryTensor: thresholdTensor + name: nil]; + + // result = (self > threshold) ? grad : zeroTensor + MPSGraphTensor *gradInputTensor = [mpsGraph selectWithPredicateTensor: predicateTensor + truePredicateTensor: gradTensor + falsePredicateTensor: zeroTensor + name: nil]; + + newCachedGraph->gradTensor_ = gradTensor; + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->gradInputTensor_ = gradInputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder gradPlaceholder = Placeholder(cachedGraph->gradTensor_, grad); + Placeholder outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, gradInput); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + gradPlaceholder.getMPSGraphTensor() : gradPlaceholder.getMPSGraphTensorData(), + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } +} + +MPSGraphTensor* normcdf (MPSGraph* mpsGraph, MPSGraphTensor *inputTensor) { + // (1.0f + erf(x*SQRT1_2)) * 0.5f * x; + const float SQRT1_2 = 0.707106781186547524400844362104849039f; + MPSGraphTensor *sqrt1_2 = [mpsGraph constantWithScalar:SQRT1_2 + shape:@[@1] + dataType:MPSDataTypeFloat32]; + MPSGraphTensor *onef = [mpsGraph constantWithScalar:1.0f + shape:@[@1] + dataType:MPSDataTypeFloat32]; + MPSGraphTensor *halff = [mpsGraph constantWithScalar:0.5f + shape:@[@1] + dataType:MPSDataTypeFloat32]; + + MPSGraphTensor *erfTensor = [mpsGraph multiplicationWithPrimaryTensor: inputTensor + secondaryTensor: sqrt1_2 + name : nil]; + erfTensor = [mpsGraph erfWithTensor: erfTensor name : nil]; + erfTensor = [mpsGraph additionWithPrimaryTensor: erfTensor + secondaryTensor: onef + name : nil]; + erfTensor = [mpsGraph multiplicationWithPrimaryTensor: erfTensor + secondaryTensor: halff + name : nil]; + + return erfTensor; +} + +TORCH_IMPL_FUNC(gelu_out_mps) ( + const Tensor& self, c10::string_view approximate, const Tensor& output + ) { + using namespace mps; + TORCH_CHECK(output.is_mps()); + + // Empty output + if(output.numel() == 0) + return; + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + + @autoreleasepool { + string key = "gelu_out_mps" + getTensorsStringKey({self}); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new 
CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, + getMPSDataType(self.scalar_type()), + getMPSShape(self)); + + MPSGraphTensor* outputTensor = normcdf(mpsGraph, inputTensor); + outputTensor = [mpsGraph multiplicationWithPrimaryTensor:outputTensor + secondaryTensor:inputTensor + name:nil]; + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } + +} + +TORCH_IMPL_FUNC(gelu_backward_out_mps) ( + const Tensor& grad, const Tensor& self, c10::string_view approximate, const Tensor& grad_input + ) { + using namespace mps; + constexpr float kBeta = M_2_SQRTPI * M_SQRT1_2 * (0.5); + + // Empty output + if(grad_input.numel() == 0) + return; + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *gradTensor_ = nil; + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + + @autoreleasepool { + string key = "gelu_backward_out_mps" + getTensorsStringKey({self, grad}); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* gradTensor = mpsGraphRankedPlaceHolder(mpsGraph, + getMPSDataType(grad.scalar_type()), + getMPSShape(grad)); + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, + getMPSDataType(self.scalar_type()), + getMPSShape(self)); + MPSGraphTensor* cdf = normcdf(mpsGraph, inputTensor); + MPSGraphTensor *halff = [mpsGraph constantWithScalar:-0.5f + shape:@[@1] + dataType:MPSDataTypeFloat32]; + MPSGraphTensor *betaf = [mpsGraph constantWithScalar:kBeta + shape:@[@1] + dataType:MPSDataTypeFloat32]; + MPSGraphTensor *pdfMul = [mpsGraph squareWithTensor : inputTensor + name : nil]; + pdfMul = [mpsGraph multiplicationWithPrimaryTensor : pdfMul + secondaryTensor : halff + name : nil]; + pdfMul = [mpsGraph exponentWithTensor : pdfMul + name : nil]; + MPSGraphTensor* pdf = [mpsGraph multiplicationWithPrimaryTensor : pdfMul + secondaryTensor : betaf + name : nil]; + pdf = [mpsGraph multiplicationWithPrimaryTensor : inputTensor + secondaryTensor : pdf + name : nil]; + pdf = [mpsGraph additionWithPrimaryTensor : pdf + secondaryTensor : cdf + name : nil]; + MPSGraphTensor* outputTensor = [mpsGraph multiplicationWithPrimaryTensor : gradTensor + secondaryTensor : pdf + name : nil]; + + newCachedGraph->gradTensor_ = gradTensor; + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder gradPlaceholder = 
Placeholder(cachedGraph->gradTensor_, grad); + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, grad_input); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + gradPlaceholder.getMPSGraphTensor() : gradPlaceholder.getMPSGraphTensorData(), + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } + + +} + +void elu_variants_out_mps ( + const Tensor& self, + const Scalar& alpha, + const Scalar& scale, + const Scalar& input_scale, + const Tensor& result, + string func_name) { + + using namespace mps; + TORCH_CHECK(self.is_mps()); + + // Empty output + if(result.numel() == 0) + return; + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + + @autoreleasepool { + string key = func_name + ":" + getTensorsStringKey({self}) + ":" + + to_string(alpha.to()) + ":" + + to_string(scale.to()) + ":" + + to_string(input_scale.to()); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor *inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + + // scale * (max(0, x) + min(0, alpha * (exp(input_scale * x) - 1) )) + + MPSGraphTensor* alphaTensor = [mpsGraph constantWithScalar:alpha.to() + shape:@[@1] + dataType:getMPSDataType(self.scalar_type())]; + + MPSGraphTensor* inputScaleTensor = [mpsGraph constantWithScalar:input_scale.to() + shape:@[@1] + dataType:getMPSDataType(self.scalar_type())]; + + MPSGraphTensor* scaleTensor = [mpsGraph constantWithScalar:scale.to() + shape:@[@1] + dataType:getMPSDataType(self.scalar_type())]; + MPSGraphTensor* unitTensor = [mpsGraph constantWithScalar:1.0f + shape:@[@1] + dataType:getMPSDataType(self.scalar_type())]; + MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0f + shape:@[@1] + dataType:getMPSDataType(self.scalar_type())]; + + MPSGraphTensor* scaledInputTensor = [mpsGraph multiplicationWithPrimaryTensor:inputTensor + secondaryTensor:inputScaleTensor + name:nil]; + MPSGraphTensor* exponentTensor = [mpsGraph exponentWithTensor:scaledInputTensor + name:nil]; + MPSGraphTensor* exponentMinusOneTensor = [mpsGraph subtractionWithPrimaryTensor:exponentTensor + secondaryTensor:unitTensor + name:nil]; + MPSGraphTensor* alphaTimesTensor = [mpsGraph multiplicationWithPrimaryTensor:exponentMinusOneTensor + secondaryTensor:alphaTensor + name:nil]; + MPSGraphTensor* predicateTensor = [mpsGraph greaterThanWithPrimaryTensor:inputTensor + secondaryTensor:zeroTensor + name:nil]; + MPSGraphTensor* fusedOutput = [mpsGraph selectWithPredicateTensor:predicateTensor + truePredicateTensor:inputTensor + falsePredicateTensor:alphaTimesTensor + name:nil]; + MPSGraphTensor* outputTensor = [mpsGraph multiplicationWithPrimaryTensor:fusedOutput + secondaryTensor:scaleTensor + name:nil]; + + newCachedGraph->inputTensor_ = 
inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + +} + +// scale * (max(0, x) + min(0, alpha * (exp(input_scale * x) - 1) )) +TORCH_IMPL_FUNC(elu_out_mps) ( + const Tensor& self, + const Scalar& alpha, + const Scalar& scale, + const Scalar& input_scale, + const Tensor& result) { + + elu_variants_out_mps(self, alpha, scale, input_scale, result, "elu_out_mps"); +} + +TORCH_IMPL_FUNC(elu_backward_out_mps) ( + const Tensor& grad_output, + const Scalar& alpha, + const Scalar& scale, + const Scalar& input_scale, + bool is_result, + const Tensor& self_or_result, + const Tensor& grad_input +) { + + using namespace mps; + TORCH_CHECK(grad_output.is_mps()); + + // Empty output + if(grad_input.numel() == 0) + return; + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *gradOutputTensor_ = nil; + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *resultTensor_ = nil; + MPSGraphTensor *gradInputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + + @autoreleasepool { + string key = "elu_backward_out_mps:" + getTensorsStringKey({grad_output}) + ":" + + to_string(alpha.to()) + ":" + + to_string(scale.to()) + ":" + + to_string(input_scale.to()) + ":" + + to_string(is_result); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output); + + MPSGraphTensor* inputTensor = nil; + MPSGraphTensor* resultTensor = nil; + + MPSGraphTensor* lessThanZeroGradTensor = nil; + + if(is_result) { + resultTensor = mpsGraphRankedPlaceHolder(mpsGraph, self_or_result); + MPSGraphTensor* alphaTensor = [mpsGraph constantWithScalar:alpha.to() + shape:@[@1] + dataType:getMPSDataType(grad_output.scalar_type())]; + MPSGraphTensor* resultPlusAlphaTensor = [mpsGraph additionWithPrimaryTensor:resultTensor + secondaryTensor:alphaTensor + name:nil]; + auto constMul = scale.to() * input_scale.to(); + MPSGraphTensor* constMulTensor = [mpsGraph constantWithScalar:constMul + shape:@[@1] + dataType:getMPSDataType(grad_output.scalar_type())]; + lessThanZeroGradTensor = [mpsGraph multiplicationWithPrimaryTensor:resultPlusAlphaTensor + secondaryTensor:constMulTensor + name:nil]; + } + else { + inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self_or_result); + MPSGraphTensor* inputScaleTensor = [mpsGraph constantWithScalar:input_scale.to() + shape:@[@1] + dataType:getMPSDataType(grad_output.scalar_type())]; + MPSGraphTensor* scaledInputTensor = [mpsGraph multiplicationWithPrimaryTensor:inputTensor + secondaryTensor:inputScaleTensor + name:nil]; + 
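// Illustrative note (not part of the original diff): for x <= 0 the ELU
// derivative being assembled in this branch is
//   d/dx [ scale * alpha * (exp(input_scale * x) - 1) ]
//     = scale * alpha * input_scale * exp(input_scale * x),
// so the statements below multiply exp(input_scale * x) by the constant
// scale * input_scale * alpha. The is_result branch above instead derives the
// same term from the saved forward output rather than recomputing the
// exponential from the input.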
MPSGraphTensor* expTensor = [mpsGraph exponentWithTensor:scaledInputTensor + name:nil]; + auto constMul = scale.to() * input_scale.to() * alpha.to(); + MPSGraphTensor* constMulTensor = [mpsGraph constantWithScalar:constMul + shape:@[@1] + dataType:getMPSDataType(grad_output.scalar_type())]; + lessThanZeroGradTensor = [mpsGraph multiplicationWithPrimaryTensor:expTensor + secondaryTensor:constMulTensor + name:nil]; + } + + MPSGraphTensor* scaleTensor = [mpsGraph constantWithScalar:scale.to() + shape:@[@1] + dataType:getMPSDataType(grad_output.scalar_type())]; + MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0f + shape:@[@1] + dataType:getMPSDataType(grad_output.scalar_type())]; + MPSGraphTensor* predicateTensor = [mpsGraph greaterThanWithPrimaryTensor:inputTensor + secondaryTensor:zeroTensor + name:nil]; + MPSGraphTensor* gradTensor = [mpsGraph selectWithPredicateTensor:predicateTensor + truePredicateTensor:scaleTensor + falsePredicateTensor:lessThanZeroGradTensor + name:nil]; + MPSGraphTensor* gradInputTensor = [mpsGraph multiplicationWithPrimaryTensor:gradTensor + secondaryTensor:gradOutputTensor + name:nil]; + + newCachedGraph->gradOutputTensor_ = gradOutputTensor; + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->resultTensor_ = resultTensor; + newCachedGraph->gradInputTensor_ = gradInputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output); + Placeholder selfPlaceholder = Placeholder(); + Placeholder resultPlaceholder = Placeholder(); + if(is_result) + resultPlaceholder = Placeholder(cachedGraph->resultTensor_, self_or_result); + else + selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self_or_result); + Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = nil; + + if(is_result) + feeds = @{ + gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), + resultPlaceholder.getMPSGraphTensor() : resultPlaceholder.getMPSGraphTensorData() + }; + else + feeds = @{ + gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + +} + +TORCH_IMPL_FUNC(silu_out_mps) ( + const Tensor& self, + const Tensor& result) { + + using namespace mps; + TORCH_CHECK(self.is_mps()); + + // Empty output + if(result.numel() == 0) + return; + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + + @autoreleasepool { + string key = "silu_out_mps:" + getTensorsStringKey({self}); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor *inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, 
self); + + MPSGraphTensor* unitTensor = [mpsGraph constantWithScalar:1.0 + shape:@[@1] + dataType:getMPSDataType(self.scalar_type())]; + MPSGraphTensor* negativeInput = [mpsGraph negativeWithTensor:inputTensor + name:nil]; + MPSGraphTensor* expNegativeTensor = [mpsGraph exponentWithTensor:negativeInput + name:nil]; + MPSGraphTensor* expPlusOneTensor = [mpsGraph additionWithPrimaryTensor:expNegativeTensor + secondaryTensor:unitTensor + name:nil]; + MPSGraphTensor* outputTensor = [mpsGraph divisionWithPrimaryTensor:inputTensor + secondaryTensor:expPlusOneTensor + name:nil]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + +} + +TORCH_IMPL_FUNC(silu_backward_out_mps) ( + const Tensor& grad_output, + const Tensor& self, + const Tensor& grad_input) { + + using namespace mps; + TORCH_CHECK(grad_output.is_mps()); + + // Empty output + if(grad_input.numel() == 0) + return; + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *gradOutputTensor_ = nil; + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *gradInputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + + @autoreleasepool { + string key = "silu_out_backward_mps:" + getTensorsStringKey({grad_output}); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor *inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + MPSGraphTensor *gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output); + + MPSGraphTensor* unitTensor = [mpsGraph constantWithScalar:1.0 + shape:@[@1] + dataType:getMPSDataType(grad_output.scalar_type())]; + MPSGraphTensor* negativeInput = [mpsGraph negativeWithTensor:inputTensor + name:nil]; + MPSGraphTensor* expNegativeTensor = [mpsGraph exponentWithTensor:negativeInput + name:nil]; + MPSGraphTensor* expPlusOneTensor = [mpsGraph additionWithPrimaryTensor:expNegativeTensor + secondaryTensor:unitTensor + name:nil]; + MPSGraphTensor* sigmoidTensor = [mpsGraph reciprocalWithTensor:expPlusOneTensor + name:nil]; + MPSGraphTensor* oneMinusSigmoid = [mpsGraph subtractionWithPrimaryTensor:unitTensor + secondaryTensor:sigmoidTensor + name:nil]; + MPSGraphTensor* inputTimesDiff = [mpsGraph multiplicationWithPrimaryTensor:inputTensor + secondaryTensor:oneMinusSigmoid + name:nil]; + MPSGraphTensor* onePlusTensor = [mpsGraph additionWithPrimaryTensor:unitTensor + secondaryTensor:inputTimesDiff + name:nil]; + MPSGraphTensor* gradTensor = [mpsGraph multiplicationWithPrimaryTensor:sigmoidTensor + secondaryTensor:onePlusTensor + name:nil]; + MPSGraphTensor* 
gradInputTensor = [mpsGraph multiplicationWithPrimaryTensor:gradTensor + secondaryTensor:gradOutputTensor + name:nil]; + + newCachedGraph->gradOutputTensor_ = gradOutputTensor; + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->gradInputTensor_ = gradInputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output); + Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), + gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + +} + +// ------------------------------------------------- +// Hardtanh backward + +Tensor hardtanh_backward_mps + (const Tensor& grad_output, + const Tensor& self, + const Scalar& min, + const Scalar& max) { + + Tensor grad_input = at::native::empty_mps( + grad_output.sizes(), + grad_output.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + grad_input = hardtanh_backward_out_mps(grad_output, self, min, max, grad_input); + return grad_input; +} + +// Hardtanh backward +Tensor& hardtanh_backward_out_mps + (const Tensor& grad_output, + const Tensor& self, + const Scalar& min, + const Scalar& max, + Tensor& grad_input) { + + using namespace mps; + TORCH_CHECK(grad_output.is_mps()); + + // Empty output + if(grad_input.numel() == 0) + return grad_input; + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *gradOutputTensor_ = nil; + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *gradInputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + + @autoreleasepool { + string key = "hardtanh_backward_out_mps:" + getTensorsStringKey({grad_output}) + ":" + + to_string(min.to()) + ":" + + to_string(max.to()); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output); + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + + // TODO: Compute gradient + MPSGraphTensor* unitTensor = [mpsGraph constantWithScalar:1.0f + shape:@[@1] + dataType:getMPSDataType(grad_output.scalar_type())]; + MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0f + shape:@[@1] + dataType:getMPSDataType(grad_output.scalar_type())]; + MPSGraphTensor* minTensor = [mpsGraph constantWithScalar:min.to() + shape:@[@1] + dataType:getMPSDataType(grad_output.scalar_type())]; + MPSGraphTensor* maxTensor = [mpsGraph constantWithScalar:max.to() + shape:@[@1] + dataType:getMPSDataType(grad_output.scalar_type())]; + MPSGraphTensor* greaterThanMaxPredicateTensor = [mpsGraph greaterThanWithPrimaryTensor:inputTensor + 
secondaryTensor:maxTensor + name:nil]; + MPSGraphTensor* lesserThanMinPredicateTensor = [mpsGraph lessThanWithPrimaryTensor:inputTensor + secondaryTensor:minTensor + name:nil]; + MPSGraphTensor* greaterThanMaxGradTensor = [mpsGraph selectWithPredicateTensor:greaterThanMaxPredicateTensor + truePredicateTensor:zeroTensor + falsePredicateTensor:unitTensor + name:nil]; + MPSGraphTensor* lesserThanMinGradTensor = [mpsGraph selectWithPredicateTensor:lesserThanMinPredicateTensor + truePredicateTensor:zeroTensor + falsePredicateTensor:unitTensor + name:nil]; + MPSGraphTensor* gradTensor = [mpsGraph multiplicationWithPrimaryTensor:greaterThanMaxGradTensor + secondaryTensor:lesserThanMinGradTensor + name:nil]; + MPSGraphTensor* gradInputTensor = [mpsGraph multiplicationWithPrimaryTensor:gradTensor + secondaryTensor:gradOutputTensor + name:nil]; + + newCachedGraph->gradOutputTensor_ = gradOutputTensor; + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->gradInputTensor_ = gradInputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output); + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + return grad_input; +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/AdaptiveAveragePooling.mm b/aten/src/ATen/native/mps/operations/AdaptiveAveragePooling.mm new file mode 100644 index 000000000000..c82818318e9e --- /dev/null +++ b/aten/src/ATen/native/mps/operations/AdaptiveAveragePooling.mm @@ -0,0 +1,154 @@ +// Copyright © 2022 Apple Inc. 
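// Illustrative sketch (not part of the original diff) of the parameter
// derivation used by set_kernel_params in this file: the adaptive pool is
// lowered to a plain avg_pool2d with
//   stride = isize / osize   and   kernel = isize - (osize - 1) * stride,
// which reproduces adaptive averaging exactly when isize is divisible by
// osize (e.g. 8 -> 4 gives stride 2, kernel 2).
#include <cassert>
#include <cstdint>

static void adaptive_avg_pool_params_example() {
  const int64_t isizeH = 8, osizeH = 4;                     // hypothetical sizes
  const int64_t strideH = isizeH / osizeH;                  // 2
  const int64_t kernelH = isizeH - (osizeH - 1) * strideH;  // 8 - 3*2 = 2
  assert(strideH == 2 && kernelH == 2);
}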
+ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { + + +void set_kernel_params + (int64_t isizeH, int64_t isizeW, + int64_t osizeH, int64_t osizeW, + int64_t &strideH, int64_t &strideW, + int64_t &kernel_sizeH, int64_t &kernel_sizeW) { + + strideH = (int64_t) (isizeH / osizeH); + strideW = (int64_t) (isizeW / osizeW); + + kernel_sizeH = isizeH - (osizeH-1) * strideH; + kernel_sizeW = isizeW - (osizeW-1) * strideW; +} + +Tensor& adaptive_avg_pool2d_out_mps + (const Tensor& input, + IntArrayRef output_size, + Tensor& output) { + + for (int64_t i = 1; i < input.ndimension(); i++) { + TORCH_CHECK(input.size(i) > 0, + "adaptive_avg_pool2d(): Expected input to have non-zero size for non-batch dimensions, " + "but input has sizes ", input.sizes(), " with dimension ", i, " being " + "empty"); + } + + int64_t isizeH = input.size(-2); + int64_t isizeW = input.size(-1); + + int64_t osizeH = output_size[0]; + int64_t osizeW = output_size[1]; + + if(input.suggest_memory_format() == at::MemoryFormat::ChannelsLast) + TORCH_CHECK(input.ndimension() == 4, + "adaptive_avg_pool2d(): Expected 4D tensor, but got ", + input.sizes()) + + switch (input.suggest_memory_format()) { + case at::MemoryFormat::Contiguous: + case at::MemoryFormat::ChannelsLast: + break; + default: + TORCH_CHECK( + false, + "Unsupported memory format. Supports only ChannelsLast, Contiguous") + } + + int64_t strideH; + int64_t strideW; + int64_t kernel_sizeH; + int64_t kernel_sizeW; + + set_kernel_params(isizeH, isizeW, + osizeH, osizeW, + strideH, strideW, + kernel_sizeH, kernel_sizeW); + + output = at::avg_pool2d(input, + IntArrayRef({kernel_sizeH, kernel_sizeW}), + IntArrayRef({strideH, strideW}), + IntArrayRef({0, 0}), + false, + true, + c10::nullopt); + return output; +} + +Tensor adaptive_avg_pool2d_mps + (at::Tensor const& input, + IntArrayRef output_size) { + + IntArrayRef output_shape; + + auto osizeH = output_size[0]; + auto osizeW = output_size[1]; + + std::vector out_dims = {}; + + if(input.ndimension() == 4) { + auto sizeB = input.size(0); + auto sizeD = input.size(1); + + out_dims.push_back(sizeB); + out_dims.push_back(sizeD); + out_dims.push_back(osizeH); + out_dims.push_back(osizeW); + output_shape = IntArrayRef(out_dims); + } + else { + auto sizeD = input.size(0); + out_dims.push_back(sizeD); + out_dims.push_back(osizeH); + out_dims.push_back(osizeW); + output_shape = IntArrayRef(out_dims); + } + + const auto memory_format = input.suggest_memory_format(); + Tensor output = at::native::empty_mps( + output_shape, + input.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + memory_format); + return adaptive_avg_pool2d_out_mps(input, output_size, output); + +} + +Tensor adaptive_avg_pool2d_backward_mps + (const Tensor& gradOutput, + const Tensor& input) { + + int64_t isizeH = input.size(-2); + int64_t isizeW = input.size(-1); + int64_t osizeH = gradOutput.size(-2); + int64_t osizeW = gradOutput.size(-1); + + int64_t strideH, strideW, kernel_sizeH, kernel_sizeW; + + set_kernel_params(isizeH, isizeW, + osizeH, osizeW, + strideH, strideW, + kernel_sizeH, kernel_sizeW); + auto gradInput = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + if (gradInput.numel() != 0) + gradInput = at::avg_pool2d_backward(gradOutput, + input, + IntArrayRef({kernel_sizeH, kernel_sizeW}), + IntArrayRef({strideH, strideW}), + IntArrayRef({0, 0}), + false, + true, + c10::nullopt); + + return gradInput; + +} + +} +} diff --git 
a/aten/src/ATen/native/mps/operations/BinaryOps.mm b/aten/src/ATen/native/mps/operations/BinaryOps.mm new file mode 100644 index 000000000000..1a3e4155dac8 --- /dev/null +++ b/aten/src/ATen/native/mps/operations/BinaryOps.mm @@ -0,0 +1,332 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +namespace mps { + +struct BinaryOpCachedGraph : public MPSCachedGraph +{ + BinaryOpCachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *primaryTensor = nil, *secondaryTensor = nil, *outputTensor = nil; +}; + +typedef MPSGraphTensor* (^BinaryOpBlock)(MPSGraph*, MPSGraphTensor*, MPSGraphTensor*); +#define BinaryOpFn() MPSGraphTensor* (MPSGraph* mpsGraph, MPSGraphTensor* primary, MPSGraphTensor* secondary) + +void binaryOpTensor(const Tensor& self_t, const Tensor& other_t, const Tensor& output, std::string op_name, BinaryOpBlock binaryBlock) +{ + // it's possible to receive empty tensors here + if (self_t.numel() == 0 || other_t.numel() == 0) { + return; + } + + const bool is_self_scalar = self_t.dim() == 0; + const bool is_other_scalar = other_t.dim() == 0; + Tensor self = is_self_scalar ? self_t : self_t.contiguous(at::MemoryFormat::Contiguous); + Tensor other = is_other_scalar ? other_t : other_t.contiguous(at::MemoryFormat::Contiguous); + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + @autoreleasepool { + string key = op_name + getTensorsStringKey({self, other}); + BinaryOpCachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph* () { + BinaryOpCachedGraph *newCachedGraph = nil; + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new BinaryOpCachedGraph(mpsGraph); + newCachedGraph->primaryTensor = !is_self_scalar ? mpsGraphRankedPlaceHolder(mpsGraph, self) : + mpsGraphConstantPlaceHolder(mpsGraph, getMPSScalarValue(self), getMPSShape(other), + // if other is scalar too, then use self's data type here and let the other + // have the same data type as self in the secondaryTensor + getMPSDataType((!is_other_scalar ? other : self).scalar_type())); + + newCachedGraph->secondaryTensor = !is_other_scalar ? mpsGraphRankedPlaceHolder(mpsGraph, other) : + mpsGraphConstantPlaceHolder(mpsGraph, getMPSScalarValue(other), getMPSShape(self), + // regardless of self's data type, the scondaryTensor's type must match it. 
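// Illustrative note (not part of the original diff): 0-dim scalar operands
// are folded into the graph as constant placeholders shaped like the other
// operand, which is why the feeds dictionary assembled below only receives
// placeholders for the non-scalar inputs.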
+ getMPSDataType(self.scalar_type())); + newCachedGraph->outputTensor = binaryBlock(mpsGraph, newCachedGraph->primaryTensor, newCachedGraph->secondaryTensor); + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + NSMutableDictionary *feeds = [[NSMutableDictionary new] autorelease]; + if (!is_self_scalar) { + Placeholder selfPlaceholder = Placeholder(cachedGraph->primaryTensor, self); + feeds[selfPlaceholder.getMPSGraphTensor()] = selfPlaceholder.getMPSGraphTensorData(); + } + if (!is_other_scalar) { + Placeholder otherPlaceholder = Placeholder(cachedGraph->secondaryTensor, other); + feeds[otherPlaceholder.getMPSGraphTensor()] = otherPlaceholder.getMPSGraphTensorData(); + } + + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, output); + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + } +} + +void binaryOpScalar(const Tensor& self, const Scalar& other, const Tensor& output, std::string op_name, BinaryOpBlock binaryBlock) +{ + binaryOpTensor(self, wrapped_scalar_tensor(other), output, op_name, binaryBlock); +} + +void div_mode_template(const Tensor& self, const Tensor& other, + c10::optional rounding_mode, + const Tensor& output, const string op_name) +{ + BinaryOpBlock div_mode_op_block = ^BinaryOpFn() { + MPSGraphTensor* divTensor = [mpsGraph divisionWithPrimaryTensor:primary + secondaryTensor:secondary + name:nil]; + if (!rounding_mode.has_value()) { + return divTensor; + } else if (*rounding_mode == "trunc") { + return trunc_tensor(mpsGraph, divTensor); + } else if (*rounding_mode == "floor") { + return [mpsGraph floorWithTensor:divTensor name:nil]; + } + assert(0 && "Invalid rounding mode\n"); + return nullptr; + }; + binaryOpTensor(self, other, output, op_name + "_out_mps:" + (rounding_mode.has_value() ? 
c10::str(*rounding_mode) : ""), div_mode_op_block); +} + +void add_sub_template(const Tensor& self, const Tensor& other, const Scalar& alpha, const Tensor& output, std::string op_name) +{ + BinaryOpBlock add_sub_op_block = ^BinaryOpFn() { + double alpha_val = alpha.toDouble(); + MPSGraphTensor* secondaryTensor = secondary; + + // if alpha is 1.0, then we don't bother adding another multiply to graph + if (alpha_val != 1.0) { + MPSGraphTensor* alphaTensor = mpsGraphConstantPlaceHolder(mpsGraph, alpha_val, getMPSShape(other), getMPSDataType(other.scalar_type())); + secondaryTensor = [mpsGraph multiplicationWithPrimaryTensor:secondary + secondaryTensor:alphaTensor + name:nil]; + } + if (op_name == "add") + return [mpsGraph additionWithPrimaryTensor:primary + secondaryTensor:secondaryTensor + name:nil]; + else + return [mpsGraph subtractionWithPrimaryTensor:primary + secondaryTensor:secondaryTensor + name:nil]; + }; + binaryOpTensor(self, other, output, op_name + "_out_mps:" + std::to_string(alpha.toDouble()), add_sub_op_block); +} + +} // namespace mps + +#define CREATE_MPS_BINARY_OP_FUNC(func_out, func_stub, other_type) \ +TORCH_IMPL_FUNC(func_out) (const Tensor& self, const other_type& other, const Tensor& output) { \ + mps::binaryOp##other_type(self, other, output, #func_stub, \ + ^BinaryOpFn() { \ + return [mpsGraph func_stub##WithPrimaryTensor:primary \ + secondaryTensor:secondary \ + name:nil]; }); \ +} + +// Boolean Ops require casting output to "MPSDataTypeBool" +#define CREATE_MPS_BOOLEAN_OP_FUNC(func_out, func_stub, other_type) \ +TORCH_IMPL_FUNC(func_out) (const Tensor& self, const other_type& other, const Tensor& output) { \ + mps::binaryOp##other_type(self, other, output, #func_stub, \ + ^BinaryOpFn() { \ + MPSGraphTensor* outputTensor = [mpsGraph func_stub##WithPrimaryTensor:primary \ + secondaryTensor:secondary \ + name:nil]; \ + return [mpsGraph castTensor:outputTensor toType:MPSDataTypeBool name:@"boolOut"]; }); \ +} + +// Boolean Binary Ops +CREATE_MPS_BOOLEAN_OP_FUNC(eq_scalar_out_mps, equal, Scalar); +CREATE_MPS_BOOLEAN_OP_FUNC(eq_tensor_out_mps, equal, Tensor); +CREATE_MPS_BOOLEAN_OP_FUNC(ne_scalar_out_mps, notEqual, Scalar); +CREATE_MPS_BOOLEAN_OP_FUNC(ne_tensor_out_mps, notEqual, Tensor); +CREATE_MPS_BOOLEAN_OP_FUNC(le_scalar_out_mps, lessThanOrEqualTo, Scalar); +CREATE_MPS_BOOLEAN_OP_FUNC(le_tensor_out_mps, lessThanOrEqualTo, Tensor); +CREATE_MPS_BOOLEAN_OP_FUNC(lt_scalar_out_mps, lessThan, Scalar); +CREATE_MPS_BOOLEAN_OP_FUNC(lt_tensor_out_mps, lessThan, Tensor); +CREATE_MPS_BOOLEAN_OP_FUNC(ge_scalar_out_mps, greaterThanOrEqualTo, Scalar); +CREATE_MPS_BOOLEAN_OP_FUNC(ge_tensor_out_mps, greaterThanOrEqualTo, Tensor); +CREATE_MPS_BOOLEAN_OP_FUNC(gt_scalar_out_mps, greaterThan, Scalar); +CREATE_MPS_BOOLEAN_OP_FUNC(gt_tensor_out_mps, greaterThan, Tensor); + +// Arithmetic Binary Ops +CREATE_MPS_BINARY_OP_FUNC(minimum_out_mps, minimum, Tensor); +CREATE_MPS_BINARY_OP_FUNC(maximum_out_mps, maximum, Tensor); +CREATE_MPS_BINARY_OP_FUNC(mul_out_mps, multiplication, Tensor); +CREATE_MPS_BINARY_OP_FUNC(pow_tensor_scalar_out_mps, power, Scalar); +CREATE_MPS_BINARY_OP_FUNC(pow_tensor_tensor_out_mps, power, Tensor); +CREATE_MPS_BINARY_OP_FUNC(atan2_mps_out, atan2, Tensor); + + +TORCH_IMPL_FUNC(div_out_mode_mps) (const Tensor& self, const Tensor& other, c10::optional rounding_mode, const Tensor& output) { + mps::div_mode_template(self, other, rounding_mode, output, "div_mode"); +} + +TORCH_IMPL_FUNC(div_out_mps) (const Tensor& self, const Tensor& other, const Tensor& output) { + 
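// Illustrative note (not part of the original diff): true division reuses the
// rounding-mode template with c10::nullopt, so the trunc_tensor / floor
// post-processing inside div_mode_template is skipped and the raw quotient is
// returned.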
mps::div_mode_template(self, other, c10::nullopt, output, "div"); +} + +TORCH_IMPL_FUNC(add_out_mps) (const Tensor& self, const Tensor& other, const Scalar& alpha, const Tensor& output) { + mps::add_sub_template(self, other, alpha, output, "add"); +} + +TORCH_IMPL_FUNC(sub_out_mps) (const Tensor& self, const Tensor& other, const Scalar& alpha, const Tensor& output) { + mps::add_sub_template(self, other, alpha, output, "sub"); +} + + +TORCH_IMPL_FUNC(logaddexp_out_mps) (const Tensor& self, const Tensor& other, const Tensor& output) +{ + using namespace mps; + MPSStream* stream = getCurrentMPSStream(); + + if (&output != &self) { + output.resize_(self.sizes());; + } + + // Derive from MPSCachedGraph + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *otherTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + string key = "log_base_e_out_mps:" + getTensorsStringKey({self, other}); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + MPSGraphTensor* xTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + MPSGraphTensor* yTensor = mpsGraphRankedPlaceHolder(mpsGraph, other); + MPSGraphTensor* ePowXTensor = [mpsGraph exponentWithTensor:xTensor + name:nil]; + MPSGraphTensor* ePowYTensor = [mpsGraph exponentWithTensor:yTensor + name:nil]; + MPSGraphTensor* sumTensor = [mpsGraph additionWithPrimaryTensor:ePowXTensor + secondaryTensor:ePowYTensor + name:nil]; + MPSGraphTensor* outputTensor = [mpsGraph logarithmWithTensor:sumTensor + name:nil]; + + newCachedGraph->inputTensor_ = xTensor; + newCachedGraph->otherTensor_ = yTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder otherPlaceholder = Placeholder(cachedGraph->otherTensor_, other); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), + otherPlaceholder.getMPSGraphTensor() : otherPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + } + +TORCH_IMPL_FUNC(logaddexp2_out_mps) (const Tensor& self, const Tensor& other, const Tensor& output) +{ + using namespace mps; + MPSStream* stream = getCurrentMPSStream(); + + if (&output != &self) { + output.resize_(self.sizes());; + } + + // Derive from MPSCachedGraph + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *otherTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + string key = "log_base_two_out_mps:" + getTensorsStringKey({self, other}); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = 
cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + MPSGraphTensor* xTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + MPSGraphTensor* yTensor = mpsGraphRankedPlaceHolder(mpsGraph, other); + MPSGraphTensor* twoPowXTensor = [mpsGraph exponentBase2WithTensor:xTensor + name:nil]; + MPSGraphTensor* twoPowYTensor = [mpsGraph exponentBase2WithTensor:yTensor + name:nil]; + MPSGraphTensor* sumTensor = [mpsGraph additionWithPrimaryTensor:twoPowXTensor + secondaryTensor:twoPowYTensor + name:nil]; + MPSGraphTensor* outputTensor = [mpsGraph logarithmBase2WithTensor:sumTensor + name:nil]; + + newCachedGraph->inputTensor_ = xTensor; + newCachedGraph->otherTensor_ = yTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder otherPlaceholder = Placeholder(cachedGraph->otherTensor_, other); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), + otherPlaceholder.getMPSGraphTensor() : otherPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } +} +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/Blas.mm b/aten/src/ATen/native/mps/operations/Blas.mm new file mode 100644 index 000000000000..7ab34ac31401 --- /dev/null +++ b/aten/src/ATen/native/mps/operations/Blas.mm @@ -0,0 +1,196 @@ +// Copyright © 2022 Apple Inc. 
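// Hedged reference (not part of the original diff) for the two kernels in
// this file: dot_mps reduces an elementwise product with reductionSum, and
// addmv_out_mps_impl assembles result = beta * self + alpha * (mat @ vec),
// skipping the beta * self term entirely when beta == 0.
#include <cstddef>
#include <vector>

static std::vector<double> addmv_reference(
    const std::vector<std::vector<double>>& mat,  // hypothetical row-major matrix
    const std::vector<double>& vec,
    const std::vector<double>& self,
    double beta, double alpha) {
  std::vector<double> out(mat.size(), 0.0);
  for (std::size_t i = 0; i < mat.size(); ++i) {
    double acc = 0.0;
    for (std::size_t j = 0; j < vec.size(); ++j) {
      acc += mat[i][j] * vec[j];                  // (mat @ vec)[i]
    }
    out[i] = alpha * acc + (beta != 0.0 ? beta * self[i] : 0.0);
  }
  return out;
}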
+ +#include +#include +#include +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#endif + + +namespace at { +namespace native { + + +Tensor dot_mps( + const Tensor &self, + const Tensor &other) +{ + using namespace mps; + auto output = at::native::empty_mps({}, self.scalar_type(), c10::nullopt, kMPS, c10::nullopt, c10::nullopt); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* selfTensor_ = nil; + MPSGraphTensor* otherTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; + }; + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + string key = "dot_mps" + getTensorsStringKey({self, other}); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + + mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ mps::MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool{ + MPSGraph *mpsGraph = mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor *selfTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, self); + MPSGraphTensor *otherTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, other); + + MPSGraphTensor *dot = [mpsGraph multiplicationWithPrimaryTensor: selfTensor + secondaryTensor: otherTensor + name: @"multiplication"]; + + MPSGraphTensor *dotProductTensor = [mpsGraph reductionSumWithTensor: dot + axes: nil + name: @"dotProduct"]; + newCachedGraph->selfTensor_ = selfTensor; + newCachedGraph->otherTensor_ = otherTensor; + newCachedGraph->outputTensor_ = dotProductTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->selfTensor_, self); + Placeholder otherPlaceholder = Placeholder(cachedGraph->otherTensor_, other); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), + otherPlaceholder.getMPSGraphTensor() : otherPlaceholder.getMPSGraphTensorData(), + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + return output; +} + +Tensor& addmv_out_mps_impl( + const Tensor &self, + const Tensor &mat, + const Tensor &vec, + const Scalar& beta_, + const Scalar& alpha_, + Tensor& result) +{ + using namespace mps; + + TORCH_CHECK(mat.is_mps()); + TORCH_CHECK(vec.is_mps()); + TORCH_CHECK(result.is_mps()); + TORCH_CHECK(self.is_mps()); + + c10::MaybeOwned self_ = expand_size(self, {mat.size(0)}); + auto betaval = beta_.toComplexDouble(); + + struct CachedGraph : public mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *selfTensor_ = nil; + MPSGraphTensor *matMulVecTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + mps::MPSGraphCache *cache_ = mps::MPSGraphCache::getInstance(); + + MPSStream *stream = at::mps::getCurrentMPSStream(); + Tensor matMulVec = mm(mat, vec.unsqueeze(1)).squeeze(1); + + @autoreleasepool { + string key = "addmv_out_mps_impl" + getTensorsStringKey({self, matMulVec}) + + ":" + to_string(beta_.toDouble()) + + ":" + to_string(alpha_.toDouble()); + CachedGraph* cachedGraph = nil; + if(!cachedGraph) { + + mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ 
mps::MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool{ + MPSGraph *mpsGraph = mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor *matMulVecTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, matMulVec); + MPSGraphTensor *selfTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, self); + + // Intermediates for beta and alpha + MPSGraphTensor* alphaTensor = [mpsGraph constantWithScalar: alpha_.toDouble() + dataType: getMPSScalarType(mat.scalar_type())]; + + // Intermediates for multiplying by beta and alpha + MPSGraphTensor* productTimesAlphaTensor = [mpsGraph multiplicationWithPrimaryTensor:matMulVecTensor + secondaryTensor:alphaTensor + name:@"MM/alpha*(mat@vec)"]; + newCachedGraph->outputTensor_ = productTimesAlphaTensor; + + if (betaval != 0.0) + { + MPSGraphTensor* betaTensor = [mpsGraph constantWithScalar: beta_.toDouble() + dataType: getMPSScalarType(self.scalar_type())]; + + MPSGraphTensor* selfTimesBetaTensor = [mpsGraph multiplicationWithPrimaryTensor: selfTensor + secondaryTensor: betaTensor + name: @"MM/beta*input"]; + + MPSGraphTensor* outputTensor = [mpsGraph additionWithPrimaryTensor: productTimesAlphaTensor + secondaryTensor: selfTimesBetaTensor + name: @"MM/beta*input + alpha*(mat@vec)"]; + + newCachedGraph->outputTensor_ = outputTensor; + } + + newCachedGraph->selfTensor_ = selfTensor; + newCachedGraph->matMulVecTensor_ = matMulVecTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder matMulVecPlaceholder = Placeholder(cachedGraph->matMulVecTensor_, matMulVec); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result); + + NSMutableDictionary* feeds =[NSMutableDictionary dictionary]; + feeds[matMulVecPlaceholder.getMPSGraphTensor()] = matMulVecPlaceholder.getMPSGraphTensorData(); + if (betaval != 0.0) + { + Placeholder selfPlaceholder = Placeholder(cachedGraph->selfTensor_, self); + feeds[selfPlaceholder.getMPSGraphTensor()] = selfPlaceholder.getMPSGraphTensorData(); + } + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + return result; +} + +TORCH_IMPL_FUNC(addmv_out_mps)(const Tensor &self, const Tensor &mat, const Tensor &vec, const Scalar& beta_, const Scalar& alpha_, const Tensor& result) { + addmv_out_mps_impl(self, mat, vec, beta_, alpha_, const_cast(result)); +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/ConstantOps.mm b/aten/src/ATen/native/mps/operations/ConstantOps.mm new file mode 100644 index 000000000000..09e962b94f78 --- /dev/null +++ b/aten/src/ATen/native/mps/operations/ConstantOps.mm @@ -0,0 +1,94 @@ +// Copyright © 2022 Apple Inc. 
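// Illustrative sketch (not part of the original diff): the fill kernels below
// build a graph whose only nodes are a constant of the requested value/shape
// and an identity, then bind the output placeholder to self so the result is
// written straight into self's storage -- the MPS counterpart of this trivial
// CPU loop.
#include <cstddef>

static void fill_reference(float* data, std::size_t n, float value) {
  for (std::size_t i = 0; i < n; ++i) {
    data[i] = value;  // every element takes the scalar fill value
  }
}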
+ +#include +#include +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#endif + +using namespace at::mps; + +namespace at { +namespace native { + +Tensor& fill_scalar_mps_impl(Tensor& self, const Scalar& value) { + using namespace mps; + + if (self.numel() == 0) { + return self; + } + + MPSStream* stream = getCurrentMPSStream(); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* outputTensor_ = nil; + }; + + MPSGraphCache *cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + + MPSShape* input_shape = getMPSShape(self); + NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; + + string key = "fill_scalar_mps_impl:" + getMPSTypeString(self.scalar_type()) + + ":" + string([ns_shape_key UTF8String]) + + ":" + to_string(value.toDouble()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool{ + MPSGraph *mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = [mpsGraph constantWithScalar:value.toDouble() + shape:input_shape + dataType:getMPSScalarType(self.scalar_type())]; + MPSGraphTensor* outputTensor = [mpsGraph identityWithTensor:inputTensor + name:nil]; + + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, self); + + NSDictionary* feeds = nil; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + return self; +} + +Tensor& zero_mps_(Tensor& self) { + return at::native::fill_scalar_mps_impl(self, 0.0f); +} + +Tensor& fill_scalar_mps(Tensor& self, const Scalar& value) { + return at::native::fill_scalar_mps_impl(self, value); +} + +Tensor& fill_tensor_mps_(Tensor& self, const Tensor& value) { + TORCH_CHECK(value.dim() == 0, "fill_ only supports 0-dimension value tensor but got tensor with ", value.dim(), " dimensions."); + return at::native::fill_scalar_mps_impl(self, value.item()); +} +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/Convolution.mm b/aten/src/ATen/native/mps/operations/Convolution.mm new file mode 100644 index 000000000000..40327536b564 --- /dev/null +++ b/aten/src/ATen/native/mps/operations/Convolution.mm @@ -0,0 +1,508 @@ +// Copyright © 2022 Apple Inc. 
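// Assumed output-size relation (sketch, not part of the original diff) behind
// the conv_output_size call used by _mps_convolution below; this is the usual
// cross-correlation sizing formula, stated here only for orientation:
//   out = floor((in + 2*pad - dilation*(kernel - 1) - 1) / stride) + 1
#include <cstdint>

static int64_t conv_output_dim(int64_t in, int64_t kernel, int64_t pad,
                               int64_t stride, int64_t dilation) {
  return (in + 2 * pad - dilation * (kernel - 1) - 1) / stride + 1;
}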
+ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { + +// Create convolution descriptor +void fill_conv_desc(MPSGraphConvolution2DOpDescriptor* descriptor_, + NSUInteger strideInX, NSUInteger strideInY, + NSUInteger dilationRateInX, NSUInteger dilationRateInY, + NSUInteger paddingHorizontal, NSUInteger paddingVertical, + c10::MemoryFormat memory_format, NSUInteger groups) { + descriptor_.strideInX = strideInX; + descriptor_.strideInY = strideInY; + descriptor_.dilationRateInX = dilationRateInX; + descriptor_.dilationRateInY = dilationRateInY; + + // TODO: Program the padding style + descriptor_.paddingStyle = MPSGraphPaddingStyleExplicit; + + descriptor_.paddingLeft = paddingHorizontal; + descriptor_.paddingRight = paddingHorizontal; + descriptor_.paddingTop = paddingVertical; + descriptor_.paddingBottom = paddingVertical; + + descriptor_.dataLayout = (memory_format == at::MemoryFormat::Contiguous) ? + MPSGraphTensorNamedDataLayoutNCHW : MPSGraphTensorNamedDataLayoutNHWC; + descriptor_.weightsLayout = (memory_format == at::MemoryFormat::Contiguous) ? + MPSGraphTensorNamedDataLayoutOIHW : MPSGraphTensorNamedDataLayoutHWIO; + descriptor_.groups = groups; +} + +Tensor _mps_convolution( + const Tensor& input_t, + const Tensor& weight_t, + const c10::optional& bias_opt, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups) { + namespace native_mps = at::native::mps; + CheckedFrom c = "mps_convolution"; + TensorArg input { input_t, "input", 1 }, + weight { weight_t, "weight", 2 }; + checkAllSameType(c, {input, weight}); + checkAllSameGPU(c, {input, weight}); + + bool bias_defined; + + if(bias_opt == c10::nullopt) + bias_defined = false; + else + bias_defined = bias_opt->defined(); + + auto memory_format = input_t.suggest_memory_format(); + auto output_t = at::empty( + conv_output_size(input->sizes(), weight->sizes(), + padding, stride, dilation), + input->scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + memory_format); + + if (output_t.numel() == 0) { + return output_t; + } + TensorArg output{ output_t, "result", 0 }; + + convolution_shape_check(c, input, weight, output, padding, stride, dilation, groups); + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* biasTensor_ = nil; + MPSGraphTensor* weightTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + + IntArrayRef bias_shape; + if(bias_defined) + bias_shape = bias_opt.value().sizes(); + + string mem_format_key; + switch(memory_format) { + case at::MemoryFormat::Contiguous: + mem_format_key = "Contiguous"; + break; + case at::MemoryFormat::ChannelsLast: + mem_format_key = "ChannelsLast"; + break; + default: + assert(0 && "Check should have been done earlier\n"); + } + + string bias_shape_key; + if(bias_defined) + bias_shape_key = to_string(bias_shape[0]); + else + bias_shape_key = "nobias"; + + string key = "mps_convolution:" + to_string(stride[0]) + ":" + to_string(stride[1]) + ":" + + to_string(dilation[0]) + ":" + to_string(dilation[1]) + ":" + + to_string(padding[0]) + ":" + to_string(padding[1]) + ":" + + to_string(groups) + ":" + mem_format_key + + mps::getTensorsStringKey({input_t, weight_t}) + ":" + + 
to_string(bias_defined) + ":" + bias_shape_key; + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphConvolution2DOpDescriptor *descriptor_ = [MPSGraphConvolution2DOpDescriptor new]; + fill_conv_desc(descriptor_, stride[0], stride[1], + dilation[0], dilation[1], + padding[1], padding[0], + memory_format, groups); + + MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_t); + MPSGraphTensor* weightTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, weight_t); + MPSGraphTensor* biasTensor = nil; + if(bias_defined) + biasTensor = native_mps::mpsGraphUnrankedPlaceHolder(mpsGraph, native_mps::getMPSDataType((bias_opt.value()).scalar_type())); + + MPSGraphTensor* outputTensor = [mpsGraph convolution2DWithSourceTensor:inputTensor + weightsTensor:weightTensor + descriptor:descriptor_ + name:nil]; + + if(bias_defined) { + outputTensor = [mpsGraph additionWithPrimaryTensor:outputTensor + secondaryTensor:biasTensor + name:nil]; + } + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->weightTensor_ = weightTensor; + newCachedGraph->biasTensor_ = biasTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t); + auto weightsPlaceholder = native_mps::Placeholder(cachedGraph->weightTensor_, weight_t); + auto biasPlaceholder = native_mps::Placeholder(); + // Reshape the bias to be broadcastable with output of conv2d + if(bias_defined) + biasPlaceholder = native_mps::Placeholder(cachedGraph->biasTensor_, (bias_opt.value()).view({1, bias_shape[0], 1, 1})); + auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, *output); + + NSMutableDictionary* feeds = [[NSMutableDictionary alloc] initWithCapacity: 3]; + feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData(); + feeds[weightsPlaceholder.getMPSGraphTensor()] = weightsPlaceholder.getMPSGraphTensorData(); + if(bias_defined) { + feeds[biasPlaceholder.getMPSGraphTensor()] = biasPlaceholder.getMPSGraphTensorData(); + } + + NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + return *output; +} + +Tensor mps_convolution_backward_input( + IntArrayRef input_size, const Tensor& grad_output_t, const Tensor& weight_t, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool bias_defined) { + namespace native_mps = at::native::mps; + using namespace mps; + CheckedFrom c = "mps_convolution_backward_input"; + TensorArg grad_output{ grad_output_t, "grad_output", 1 }, + weight{ weight_t, "weight", 2 }; + checkAllSameType(c, {grad_output, weight}); + checkAllSameGPU(c, {grad_output, weight}); + auto memory_format = grad_output_t.suggest_memory_format(); + auto grad_input_t = at::empty( + input_size, + grad_output->scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + memory_format); + + // Avoid "grad_input" when this is being used as transposed convolution + TensorArg grad_input{ grad_input_t, "result", 0 }; + 
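// Descriptive note (not part of the original diff): after the shape check,
// the cached graph below calls
// convolution2DDataGradientWithIncomingGradientTensor with the forward
// descriptor and the requested input_size, so grad_input is produced directly
// at the original input shape from grad_output and the forward weights.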
convolution_shape_check(c, grad_input, weight, grad_output, padding, stride, dilation, groups); + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* gradOutputTensor_ = nil; + MPSGraphTensor* weightTensor_ = nil; + MPSGraphTensor* gradInputTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + // Add backward with input + @autoreleasepool { + + MPSStream* stream = getCurrentMPSStream(); + + string mem_format_key; + switch(memory_format) { + case at::MemoryFormat::Contiguous: + mem_format_key = "Contiguous"; + break; + case at::MemoryFormat::ChannelsLast: + mem_format_key = "ChannelsLast"; + break; + default: + assert(0 && "Check should have been done earlier\n"); + } + + MPSShape* mps_input_shape = getMPSShape(input_size); + + NSString* ns_shape_key = [[mps_input_shape valueForKey:@"description"] componentsJoinedByString:@","]; + + string key = "mps_convolution_backward_input:" + to_string(stride[0]) + ":" + to_string(stride[1]) + ":" + + to_string(dilation[0]) + ":" + to_string(dilation[1]) + ":" + + to_string(padding[0]) + ":" + to_string(padding[1]) + ":" + + to_string(groups) + ":" + mem_format_key + + getTensorsStringKey({grad_output_t, weight_t}) + ":" + + string([ns_shape_key UTF8String]); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph* newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphConvolution2DOpDescriptor *descriptor_ = [MPSGraphConvolution2DOpDescriptor new]; + fill_conv_desc(descriptor_, stride[0], stride[1], + dilation[0], dilation[1], + padding[1], padding[0], + memory_format, groups); + + MPSGraphTensor* gradOutputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, grad_output_t); + MPSGraphTensor* weightTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, weight_t); + + MPSGraphTensor* gradInputTensor = [mpsGraph convolution2DDataGradientWithIncomingGradientTensor:gradOutputTensor + weightsTensor:weightTensor + outputShape:mps_input_shape + forwardConvolutionDescriptor:descriptor_ + name:nil]; + + newCachedGraph->gradOutputTensor_ = gradOutputTensor; + newCachedGraph->weightTensor_ = weightTensor; + newCachedGraph->gradInputTensor_ = gradInputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t); + auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_t); + auto outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, *grad_input); + + NSDictionary *feeds = @{ + gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), + weightsPlaceholder.getMPSGraphTensor() : weightsPlaceholder.getMPSGraphTensorData(), + }; + + NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + return *grad_input; +} + +Tensor mps_convolution_backward_weights( + IntArrayRef weight_size, const Tensor& grad_output_t, const Tensor& input_t, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool bias_defined) { + 
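// Descriptive note (not part of the original diff): this helper correlates
// grad_output with the forward input via
// convolution2DWeightsGradientWithIncomingGradientTensor to produce a tensor
// of shape weight_size; bias_defined is accepted in the signature but no bias
// gradient is computed in this function.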
namespace native_mps = at::native::mps; + using namespace mps; + CheckedFrom c = "mps_convolution_backward_weights"; + auto memory_format = input_t.suggest_memory_format(); + + // For uniformity with everything else, although it seems grad_weight + // would be unambiguous too. + TensorArg grad_output{ grad_output_t, "grad_output", 1 }; + TensorArg input{ input_t, "input", 2}; + + checkAllSameType(c, {grad_output, input}); + checkAllSameGPU(c, {grad_output, input}); + + auto grad_weight_t = at::empty(weight_size, grad_output_t.options(), memory_format); + TensorArg grad_weight{ grad_weight_t, "result", 0 }; + + convolution_shape_check(c, input, grad_weight, grad_output, padding, stride, dilation, groups); + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* gradOutputTensor_ = nil; + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* gradWeightTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + @autoreleasepool { + + MPSStream* stream = getCurrentMPSStream(); + + string mem_format_key; + switch(memory_format) { + case at::MemoryFormat::Contiguous: + mem_format_key = "Contiguous"; + break; + case at::MemoryFormat::ChannelsLast: + mem_format_key = "ChannelsLast"; + break; + default: + assert(0 && "Check should have been done earlier\n"); + } + + MPSShape* mps_weight_shape = getMPSShape(weight_size); + + NSString* ns_shape_key = [[mps_weight_shape valueForKey:@"description"] componentsJoinedByString:@","]; + + string key = "mps_convolution_backward_weights:" + to_string(stride[0]) + ":" + to_string(stride[1]) + ":" + + to_string(dilation[0]) + ":" + to_string(dilation[1]) + ":" + + to_string(padding[0]) + ":" + to_string(padding[1]) + ":" + + to_string(groups) + ":" + mem_format_key + + getTensorsStringKey({grad_output_t, input_t}) + ":" + + string([ns_shape_key UTF8String]); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph* newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphConvolution2DOpDescriptor *descriptor_ = [MPSGraphConvolution2DOpDescriptor new]; + fill_conv_desc(descriptor_, stride[0], stride[1], + dilation[0], dilation[1], + padding[1], padding[0], + memory_format, groups); + + MPSGraphTensor* gradOutputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, grad_output_t); + MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_t); + + MPSGraphTensor* gradWeightTensor = [mpsGraph convolution2DWeightsGradientWithIncomingGradientTensor:gradOutputTensor + sourceTensor:inputTensor + outputShape:mps_weight_shape + forwardConvolutionDescriptor:descriptor_ + name:nil]; + + newCachedGraph->gradOutputTensor_ = gradOutputTensor; + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->gradWeightTensor_ = gradWeightTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t); + auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t); + auto outputPlaceholder = Placeholder(cachedGraph->gradWeightTensor_, grad_weight_t); + + NSDictionary *feeds = @{ + 
gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + }; + + NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + return grad_weight_t; +} + +std::tuple mps_convolution_backward( + const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + std::array output_mask) { + + Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format()); + + Tensor grad_input, grad_weight, grad_bias; + if (input.numel() == 0) { + if (output_mask[0]) { + grad_input = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } + if (output_mask[1]) { + grad_weight = at::zeros_like(weight, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } + } else { + if (output_mask[0]) { + grad_input = mps_convolution_backward_input(input.sizes(), grad_output, weight, padding, stride, dilation, groups, output_mask[2]); + } + if (output_mask[1]) { + grad_weight = mps_convolution_backward_weights(weight.sizes(), grad_output, input, padding, stride, dilation, groups, output_mask[2]); + } + } + + return std::tuple{grad_input, grad_weight, grad_bias}; +} + +Tensor mps_convolution_transpose_forward( + const Tensor& grad_output, const Tensor& weight, + IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups) +{ + auto input_size = conv_input_size(grad_output.sizes(), weight.sizes(), + padding, output_padding, stride, dilation, groups); + return mps_convolution_backward_input(input_size, grad_output, weight, + padding, stride, dilation, groups, false); +} + +Tensor _mps_convolution_transpose( + const Tensor& input_t, const Tensor& weight_t, + IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, + int64_t groups) { + + auto output_t = mps_convolution_transpose_forward( + input_t, weight_t, padding, output_padding, stride, dilation, groups); + return output_t; + +} + +Tensor mps_convolution_transpose_backward_input( + const Tensor& grad_output_t, const Tensor& weight_t, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, + int64_t groups) +{ + return at::_mps_convolution( + grad_output_t, weight_t, c10::nullopt, padding, stride, dilation, groups); +} + +Tensor mps_convolution_transpose_backward_weight( + IntArrayRef weight_size, + const Tensor& grad_output_t, + const Tensor& input_t, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups) +{ + return mps_convolution_backward_weights( + weight_size, input_t, grad_output_t, + padding, stride, dilation, groups, false); +} + + +std::tuple mps_convolution_transpose_backward( + const Tensor& input, const Tensor& grad_output_t, const Tensor& weight, + IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + std::array output_mask) { + + Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format()); + + Tensor grad_input, grad_weight; + if (output_mask[0]) { + grad_input = mps_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups); + } + if (output_mask[1]) { + grad_weight = mps_convolution_transpose_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups); + } + + return 
std::tuple{grad_input, grad_weight}; +} + + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/Copy.mm b/aten/src/ATen/native/mps/operations/Copy.mm new file mode 100644 index 000000000000..ec7dce287a2e --- /dev/null +++ b/aten/src/ATen/native/mps/operations/Copy.mm @@ -0,0 +1,408 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { + +MPSGraphTensor* chainViewOperation(MPSGraph* mpsGraph, IntArrayRef size, + IntArrayRef stride, int64_t storage_offset, + MPSGraphTensor* inputTensor, const Tensor& self) { + MPSGraphTensor *outputTensor = nil; + @autoreleasepool { + int32_t* sizeArray = new int32_t[size.size()](); + for (int i = 0; i < size.size(); i++) { + sizeArray[i] = size[i]; + } + NSData* shapeData = [NSData dataWithBytes : sizeArray + length : size.size()*sizeof(int32_t)]; + + MPSGraphTensor* shapeTensor = [mpsGraph constantWithData : shapeData + shape : @[[NSNumber numberWithUnsignedInteger: size.size()]] + dataType : MPSDataTypeInt32]; + MPSGraphTensor* storageOffsetTensor = [mpsGraph constantWithScalar : storage_offset + dataType : MPSDataTypeInt32]; + MPSGraphTensor* strideTensor = [mpsGraph constantWithScalar : stride[self.dim()-1] + dataType : MPSDataTypeInt32]; + MPSGraphTensor* rangeTensor = [mpsGraph coordinateAlongAxis:-1 + withShapeTensor : shapeTensor + name : nil]; + MPSGraphTensor* indexTensor = [mpsGraph multiplicationWithPrimaryTensor : rangeTensor + secondaryTensor : strideTensor + name : nil]; + MPSGraphTensor* indicesTensor = indexTensor; + // create stride Tensors for each rank of the input tensor + for (int i = 1; i < self.dim(); i++) { + strideTensor = [mpsGraph constantWithScalar : stride[self.dim() - i - 1] + dataType : MPSDataTypeInt32]; + MPSGraphTensor* rangeTensor = [mpsGraph coordinateAlongAxis: (-i-1) + withShapeTensor : shapeTensor + name : nil]; + MPSGraphTensor* indexTensor = [mpsGraph multiplicationWithPrimaryTensor : rangeTensor + secondaryTensor : strideTensor + name : nil]; + indicesTensor = [mpsGraph additionWithPrimaryTensor : indexTensor + secondaryTensor : indicesTensor + name : nil]; + } + indicesTensor = [mpsGraph additionWithPrimaryTensor : indicesTensor + secondaryTensor : storageOffsetTensor + name : nil]; + MPSGraphTensor *reshapedInputTensor = [mpsGraph reshapeTensor:inputTensor + withShape:@[@-1] + name:nil]; + MPSGraphTensor *reshapedIndicesTensor = [mpsGraph reshapeTensor:indicesTensor + withShape:@[@-1] + name:nil]; + // Call gather to coalesce the needed values. Result will be of same shape as flattened indices tensor + MPSGraphTensor *gatheredTensor = [mpsGraph gatherWithUpdatesTensor:reshapedInputTensor + indicesTensor:reshapedIndicesTensor + axis:0 + batchDimensions:0 + name:nil]; + + delete[] sizeArray; + // Reshape the data to desired size + outputTensor = [mpsGraph reshapeTensor:gatheredTensor + withShapeTensor:shapeTensor + name:nil]; + } + return outputTensor; +} + + +// There are few cases we need to consider: +// Here nodes are the Tensors and the edges are the operations performed on the +// Tensor. As a result of the operation performed we can have result as View +// Tensor (View T) or a Non view tensor (NonView T). The difference is if its +// mapped by the same underlying storage ptr or a new MTLBuffer was allocated. 
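chainViewOperation above lowers an arbitrary strided view into a flat gather: for every output coordinate it accumulates coordinate × stride per dimension, adds the storage offset, and gathers from the flattened input. Below is a minimal CPU sketch of the same indexing rule, assuming a float storage and last-dimension-fastest coordinate order; it is illustrative only, not the MPSGraph path.

```cpp
// CPU reference for the view-gather rule: the element at view coordinates
// (c0, ..., ck-1) lives at flat offset storage_offset + sum_d c_d * stride_d.
#include <cstdint>
#include <vector>

std::vector<float> gather_strided_view(const std::vector<float>& storage,
                                       const std::vector<int64_t>& size,
                                       const std::vector<int64_t>& stride,
                                       int64_t storage_offset) {
  int64_t numel = 1;
  for (int64_t s : size) numel *= s;
  std::vector<float> out(static_cast<size_t>(numel));
  std::vector<int64_t> coord(size.size(), 0);
  for (int64_t linear = 0; linear < numel; ++linear) {
    int64_t offset = storage_offset;
    for (size_t d = 0; d < size.size(); ++d) offset += coord[d] * stride[d];
    out[static_cast<size_t>(linear)] = storage[static_cast<size_t>(offset)];
    // advance the multi-dimensional coordinate (last dimension fastest)
    for (int64_t d = static_cast<int64_t>(size.size()) - 1; d >= 0; --d) {
      if (++coord[d] < size[d]) break;
      coord[d] = 0;
    }
  }
  return out;
}
```

The comment and diagram that follow in the original source describe when several such view graphs end up chained on the same underlying storage.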
+// T = Tensor +// ---------- +// | Orig T | +// ---------- +// / | \ +// View T View T NonView T +// / / \ | +// View T / \ | +// | / \ | +// | / \ | +// | / \ | +// NonView T NonView T +// +// +Tensor as_strided_tensorimpl_mps(const Tensor& self, IntArrayRef size, + IntArrayRef stride, + optional storage_offset_) { + using namespace mps; + // Use the size and stride to create a unique key + auto result = detail::make_tensor( + c10::TensorImpl::VIEW, Storage(self.storage()), self.key_set(), self.dtype()); + auto storage_offset = storage_offset_.value_or(self.storage_offset()); + setStrided(result, size, stride, storage_offset); + + // 0 sizes won't result in any change in the shape of the Tensor so we can + // skip it. Also if the memory is contiguous we don't need to do + // gather-scatter operations using graph. + if (size.size() > 0 && (!result.is_contiguous())) { + + // If self itself was a view tensor, that means we need to chain the graphs + // else we will create a new entry in the cache + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; + IntArrayRef size_; + IntArrayRef stride_; + int64_t storage_offset_; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + string lookup_key = mps::getStridedKey(self, self.sizes(), self.strides(), + self.storage_offset()); +#if _DEBUG + std::cout << "Lookup key " << lookup_key << std::endl; +#endif + CachedGraph* cachedGraph = static_cast(cache_->LookUp(lookup_key)); + + if(!cachedGraph) { + string insert_key = mps::getStridedKey(self,size, stride, storage_offset); +#if _DEBUG + std::cout << "Insert key " << insert_key << std::endl; +#endif + CachedGraph* insertCachedGraph = static_cast(cache_->LookUp(insert_key)); + if (!insertCachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(insert_key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + // Self is the input tensor we are creating view of + MPSGraphTensor* inputTensor = [mpsGraph placeholderWithShape : getMPSShape(self) + dataType : getMPSDataType(self.scalar_type()) + name : nil]; + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = chainViewOperation(mpsGraph, size, + stride, + storage_offset, + inputTensor, + self); + newCachedGraph->size_ = size; + newCachedGraph->stride_ = stride; + newCachedGraph->storage_offset_ = storage_offset; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + } else { + // Else part takes care of the chaining where multiple view operations + // were implemented on the same underlying data storage ptr + cachedGraph->outputTensor_ = chainViewOperation(cachedGraph->graph(), + size, stride, storage_offset, + cachedGraph->outputTensor_, self); + cachedGraph->size_ = size; + cachedGraph->stride_ = stride; + cachedGraph->storage_offset_ = storage_offset; + } + } + } + return result; +} + +namespace mps { + +void* pageAlignedBlockPtr( + const void* ptr, + NSUInteger size, + NSUInteger* alignedBlockSize) { + uintptr_t address = (uintptr_t)ptr; + uintptr_t alignedAddress = address & ~(PAGE_SIZE - 1); + uintptr_t alignedEnd = ((address + size) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1); + uint64_t alignedLength = alignedEnd - alignedAddress; + + assert(address >= alignedAddress); + assert(address + size <= 
alignedAddress + alignedLength); + + *alignedBlockSize = alignedLength; + return (void*)alignedAddress; +} + +static at::Tensor& copy_from_mps_(at::Tensor& self, const at::Tensor& src, + bool non_blocking) { + + using namespace mps; + id device = MPSDevice::getInstance()->device(); + MPSStream* stream = getCurrentMPSStream(); + uint64_t size = src.nbytes(); + if (size == 0) return self; + void* host_dst = self.data_ptr(); + + // MTLContext* context = static_cast(device->device_handle); + auto storage_byte_offset = src.storage_offset() * src.itemsize(); + id sourceBuffer = __builtin_bit_cast(id, src.storage().data()); + + if (!src.is_contiguous()) { + id gatherTensor = gatherViewTensor(src, sourceBuffer); + if (gatherTensor) { + sourceBuffer = gatherTensor; + storage_byte_offset = 0; + } + } + + if (sourceBuffer == nil) return self; + NSUInteger destOffset = 0; + + @autoreleasepool { + MTLResourceOptions options = MTLResourceOptionCPUCacheModeDefault | MTLResourceStorageModeShared; + NSUInteger alignedLength = 0; + + void* alignedPtr = pageAlignedBlockPtr(host_dst, (NSUInteger)size, &alignedLength); + id destBuffer = [device newBufferWithBytesNoCopy:alignedPtr + length:alignedLength + options:options + deallocator:nil]; + destOffset = uintptr_t(host_dst) - uintptr_t(alignedPtr); + // 4 bytes alignment required on macos for blits. + TORCH_CHECK(destOffset % 4 == 0, "Unaligned blit request"); + + dispatch_sync(stream->queue(), ^() { + @autoreleasepool { + id commandBuffer = stream->commandBuffer(); + id blitEncoder = + [commandBuffer blitCommandEncoder]; + + [blitEncoder copyFromBuffer:sourceBuffer + sourceOffset:(NSUInteger)storage_byte_offset + toBuffer:destBuffer + destinationOffset:(NSUInteger)destOffset + size:(NSUInteger)size]; + [blitEncoder endEncoding]; + + if (non_blocking) { + stream->commit(true); + } else { + stream->commitAndWait(); + } + [destBuffer release]; + } + }); + } + + return self; +} + +static at::Tensor& copy_to_mps_(at::Tensor& self, const at::Tensor& src, + bool non_blocking) { + MPSStream* stream = getCurrentMPSStream(); + const void* host_src = src.data_ptr(); + uint64_t size = src.nbytes(); + + id device = MPSDevice::getInstance()->device(); + auto dst_byte_offset = self.storage_offset() * self.itemsize(); + id destBuffer = __builtin_bit_cast(id, self.storage().data()); + + NSUInteger sourceOffset = 0; + @autoreleasepool { + MTLResourceOptions options = MTLResourceOptionCPUCacheModeDefault | MTLResourceStorageModeShared; + NSUInteger alignedLength = 0; + + void* alignedPtr = pageAlignedBlockPtr(host_src, (NSUInteger)size, &alignedLength); + id sourceBuffer = [device newBufferWithBytesNoCopy:alignedPtr + length:alignedLength + options:options + deallocator:nil]; + sourceOffset = uintptr_t(host_src) - uintptr_t(alignedPtr); + + dispatch_sync(stream->queue(), ^() { + @autoreleasepool { + id commandBuffer = stream->commandBuffer(); + id blitEncoder = + [commandBuffer blitCommandEncoder]; + + [blitEncoder copyFromBuffer:sourceBuffer + sourceOffset:(NSUInteger)sourceOffset + toBuffer:destBuffer + destinationOffset:(NSUInteger)dst_byte_offset + size:(NSUInteger)size]; + [blitEncoder endEncoding]; + if (non_blocking) { + stream->commit(true); + } else { + stream->commitAndWait(); + } + } + }); + [sourceBuffer release]; + } + + return self; +} + +void copy_blit_mps(void* dst, const void* src, size_t size) { + MPSStream* stream = getCurrentMPSStream(); + id sourceBuffer = (id)(src); + id destBuffer = (id)(dst); + dispatch_sync(stream->queue(), ^() { + @autoreleasepool { + 
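+      // Encode a raw buffer-to-buffer blit of `size` bytes on the current MPS stream's
+      // command buffer; endEncoding closes the encoder and commitAndWait blocks until
+      // the GPU has finished, so callers can rely on the bytes being in place on return.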
id commandBuffer = stream->commandBuffer(); + id blitEncoder = + [commandBuffer blitCommandEncoder]; + + [blitEncoder copyFromBuffer:sourceBuffer + sourceOffset:0 + toBuffer:destBuffer + destinationOffset:0 + size:size]; + [blitEncoder endEncoding]; + stream->commitAndWait(); + } + }); +} + + +static at::Tensor& copy_kernel_mps(at::Tensor& dst, const at::Tensor& src, + bool non_blocking) { + MPSStream* stream = getCurrentMPSStream(); + uint64_t size = src.nbytes(); + + auto src_byte_offset = src.storage_offset() * src.itemsize(); + id sourceBuffer = __builtin_bit_cast(id, src.storage().data()); + + auto dst_byte_offset = dst.storage_offset() * dst.itemsize(); + id destBuffer = __builtin_bit_cast(id, dst.storage().data()); + + dispatch_sync(stream->queue(), ^() { + @autoreleasepool { + id commandBuffer = stream->commandBuffer(); + id blitEncoder = [commandBuffer blitCommandEncoder]; + + [blitEncoder copyFromBuffer:sourceBuffer + sourceOffset:src_byte_offset + toBuffer:destBuffer + destinationOffset:dst_byte_offset + size:size]; + [blitEncoder endEncoding]; + if (non_blocking) { + stream->commit(true); + } else { + stream->commitAndWait(); + } + } + }); + return dst; +} + +at::Tensor& mps_copy_(at::Tensor& dst, const at::Tensor& src, bool non_blocking) +{ + TORCH_CHECK(dst.defined(), "dst is undefined"); + TORCH_CHECK(src.defined(), "src is undefined"); + + if (src.numel() == 0 || dst.is_same(src)) { + return dst; + } + if (dst.numel() == 0) { + dst.resize_as_(src); + } + + if (src.device().type() == at::kMPS && dst.device().type() == at::kCPU) { + return copy_from_mps_(dst, src, non_blocking); + } + if (src.device().type() == at::kCPU && dst.device().type() == at::kMPS) { + return copy_to_mps_(dst, src, non_blocking); + } + + if (src.device().type() == at::kMPS && dst.device().type() == at::kMPS) { + return copy_kernel_mps(dst, src, non_blocking); + } + TORCH_INTERNAL_ASSERT( + src.device().type() == DeviceType::MPS, + "mps_copy_ is implemented only for *->MPS; MPS->*"); + return dst; +} +} // namespace mps + +Tensor _copy_from_and_resize_mps(const at::Tensor& self, const at::Tensor& dst) +{ + return mps::mps_copy_(const_cast(dst), self, false); +} + +Tensor _copy_from_mps(const at::Tensor& self, const at::Tensor& dst, bool non_blocking) +{ + return mps::mps_copy_(const_cast(dst), self, non_blocking); +} +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/Distributions.mm b/aten/src/ATen/native/mps/operations/Distributions.mm new file mode 100644 index 000000000000..0ff2a443d6ab --- /dev/null +++ b/aten/src/ATen/native/mps/operations/Distributions.mm @@ -0,0 +1,459 @@ +// Copyright © 2022 Apple Inc. 
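The host↔device copies in Copy.mm above wrap the CPU pointer in a no-copy MTLBuffer, which requires a page-aligned base address and length; pageAlignedBlockPtr rounds the pointer down and the length up, and the blit later re-applies the pointer's offset inside that block. A self-contained sketch of that arithmetic follows; the names are illustrative and the page size is only an example (commonly 16 KiB on Apple Silicon).

```cpp
// Align a host range [ptr, ptr + size) to page boundaries, assuming page_size
// is a power of two, and remember where the original pointer sits in the block.
#include <cassert>
#include <cstdint>

struct AlignedBlock {
  uintptr_t base;    // page-aligned start address
  uintptr_t length;  // page-aligned length covering the whole range
  uintptr_t offset;  // offset of the original pointer inside the block
};

AlignedBlock page_align(uintptr_t ptr, uintptr_t size, uintptr_t page_size /* e.g. 16384 */) {
  const uintptr_t base = ptr & ~(page_size - 1);                            // round down
  const uintptr_t end  = (ptr + size + page_size - 1) & ~(page_size - 1);   // round up
  AlignedBlock b{base, end - base, ptr - base};
  assert(b.base <= ptr && ptr + size <= b.base + b.length);
  return b;
}
```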
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +namespace at { +namespace native { +namespace templates { + +} + +Tensor& uniform_mps_(Tensor& input, double from, double to, c10::optional gen_) +{ + using namespace mps; + + if (input.numel() == 0) { + return input; + } + double delta = (to - from); + AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "check_uniform_bounds", [&] { + const auto dtype = input.dtype(); + const auto min = static_cast(std::numeric_limits::lowest()); + const auto max = static_cast(std::numeric_limits::max()); + TORCH_CHECK(from <= to, "uniform_ expects to return a [from, to) range, but found from=", from, " > to=", to); + TORCH_CHECK((to - from) <= std::numeric_limits::max(), + "uniform_ expects to-from <= std::numeric_limits<", toString(input.scalar_type()), + ">::max(), but found to=", to, " and from=", from, + " which result in to-from to exceed the limit"); + from = std::min(std::max(from, min), max); + to = std::max(std::min(to, max), min); + }); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + uint64_t seed_ = c10::detail::getNonDeterministicRandom(true); + + @autoreleasepool { + MPSShape* input_shape = getMPSShape(input); + string key = "uniform_mps_" + getTensorsStringKey(input) + ":" + to_string(from) + ":" + to_string(to) + ":" + to_string(seed_); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + // TODO: right now taking the default seed. 
Extend it to be extracted from the + // MPSGenerator + MPSGraphTensor* randomTensor = [mpsGraph randomUniformTensorWithShape:input_shape + seed:seed_ + name:nil]; + MPSGraphTensor* deltaTensor = [mpsGraph constantWithScalar:delta + shape:input_shape + dataType:MPSDataTypeFloat32]; + MPSGraphTensor* fromTensor = [mpsGraph constantWithScalar:from + shape:input_shape + dataType:MPSDataTypeFloat32]; + MPSGraphTensor* mulTensor = [mpsGraph multiplicationWithPrimaryTensor:randomTensor + secondaryTensor:deltaTensor + name:nil]; + MPSGraphTensor* outputTensor = [mpsGraph additionWithPrimaryTensor:mulTensor + secondaryTensor:fromTensor + name:nil]; + newCachedGraph->outputTensor_ = outputTensor; + + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, input); + NSDictionary *feeds = nil; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } + + return input; +} + +Tensor& normal_mps_(Tensor& self, double mean, double std, c10::optional gen) { + if (self.numel() == 0) + return self; + TORCH_CHECK(std >= 0.0, "normal_mps_ expects std >= 0.0, but found std=", std); + + Tensor mean_t = empty_mps( + self.sizes(), + self.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + mean_t.fill_(mean); + + Tensor std_t = empty_mps( + self.sizes(), + self.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + std_t.fill_(std); + + return normal_mps_out(mean_t, std_t, gen, self); +} + +Tensor& normal_mps_out(const Tensor& mean, double std, c10::optional gen, Tensor& output) { + TORCH_CHECK(std >= 0.0, "normal_mps_out expects std >= 0.0, but found std=", std); + + Tensor std_t = empty_mps( + output.sizes(), + output.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + std_t.fill_(std); + + return normal_mps_out(mean, std_t, gen, output); + +} + +Tensor& normal_mps_out(double mean, const Tensor& std, c10::optional gen, Tensor& output) { + TORCH_CHECK( + std.min().ge(0).item(), + "normal expects all elements of std >= 0.0"); + + + Tensor mean_t = empty_mps( + output.sizes(), + output.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + mean_t.fill_(mean); + + return normal_mps_out(mean_t, std, gen, output); + +} + +Tensor& normal_mps_out(const Tensor& mean, const Tensor& std, c10::optional gen, Tensor& output) { + TORCH_CHECK(!std.is_complex(), "normal expects standard deviation to be non-complex"); + TORCH_CHECK(std.numel() == 0 || std.min().ge(0).item(), "normal expects all elements of std >= 0.0"); + // Check that mean and std have same number of elements + TORCH_CHECK(mean.numel() == std.numel(), "normal_mps_out: mean and std must have same number of elements") + + using namespace mps; + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* outputTensor_ = nil; + MPSGraphTensor* meanTensor_ = nil; + MPSGraphTensor* stdTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + uint64_t seed_ = c10::detail::getNonDeterministicRandom(true); + + @autoreleasepool { + MPSShape* input_shape = getMPSShape(output); + string key = "normal_mps_out:" + getMPSShapeString(input_shape) + ":" + getMPSTypeString(output.scalar_type()); + CachedGraph* cachedGraph = 
static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphRandomOpDescriptor* desc = [[MPSGraphRandomOpDescriptor new] autorelease]; + desc.distribution = MPSGraphRandomDistributionNormal; + desc.dataType = getMPSDataType(output.scalar_type()); + desc.mean = 0.0; + desc.standardDeviation = 1.0; + + MPSGraphTensor* meanTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(output.scalar_type()), input_shape); + MPSGraphTensor* stdTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(output.scalar_type()), input_shape); + + // TODO: right now taking the default seed. Extend it to be extracted from the + // MPSGenerator + MPSGraphTensor* randomTensor = [mpsGraph randomTensorWithShape:input_shape + descriptor:desc + name:nil]; + MPSGraphTensor* scaleTensor = [mpsGraph multiplicationWithPrimaryTensor:randomTensor + secondaryTensor:stdTensor + name:nil]; + MPSGraphTensor* outputTensor = [mpsGraph additionWithPrimaryTensor:scaleTensor + secondaryTensor:meanTensor + name:nil]; + newCachedGraph->meanTensor_ = meanTensor; + newCachedGraph->stdTensor_ = stdTensor; + newCachedGraph->outputTensor_ = outputTensor; + + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto meanPlaceholder = Placeholder(cachedGraph->meanTensor_, mean); + auto stdPlaceholder = Placeholder(cachedGraph->stdTensor_, std); + auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + NSDictionary *feeds = @{ + meanPlaceholder.getMPSGraphTensor() : meanPlaceholder.getMPSGraphTensorData(), + stdPlaceholder.getMPSGraphTensor() : stdPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } + + return output; +} + +Tensor& bernoulli_out_mps(const Tensor& p_, c10::optional gen, Tensor& result) { + result.resize_(p_.sizes()); + return bernoulli_mps_(result, p_, gen); +} + +Tensor& bernoulli_mps_(Tensor& self, double p, c10::optional gen) { + TORCH_CHECK(0 <= p && p <= 1, "bernoulli_mps_ expects p to be in [0, 1], but got p=", p); + Tensor p_t = empty_mps( + self.sizes(), + self.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + p_t.fill_(p); + + return bernoulli_mps_(self, p_t, gen); +} + +Tensor& bernoulli_mps_(Tensor& self, const Tensor& p_, c10::optional gen) { + TORCH_CHECK(self.is_same_size(p_), "bernoulli_mps_: probability and self tensor should be of the same shape") + + using namespace mps; + + MPSStream* stream = getCurrentMPSStream(); + uint64_t seed_ = c10::detail::getNonDeterministicRandom(true); + + @autoreleasepool { + MPSShape* input_shape = getMPSShape(self); + + auto mps_dtype = getMPSDataType(p_.scalar_type()); + + MPSGraph* mpsGraph = make_mps_graph(); + + MPSGraphTensor* probTensor = mpsGraphRankedPlaceHolder(mpsGraph, mps_dtype, input_shape); + + // TODO: right now taking the default seed. 
Extend it to be extracted from the + // MPSGenerator + MPSGraphTensor* randomTensor = [mpsGraph randomUniformTensorWithShape:input_shape + seed:seed_ + name:nil]; + MPSGraphTensor* outputTensor = [mpsGraph lessThanWithPrimaryTensor:randomTensor + secondaryTensor:probTensor + name:nil]; + + auto probPlaceholder = Placeholder(probTensor, p_); + auto outputPlaceholder = Placeholder(outputTensor, self); + NSDictionary *feeds = @{ + probPlaceholder.getMPSGraphTensor() : probPlaceholder.getMPSGraphTensorData(), + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, mpsGraph, feeds, results); + } + + return self; + +} + +// Taken from ATen/native/DistributionTemplates.h +#define CHECK_OUT_OF_BOUNDS(var, name, min, max, dtype) \ + TORCH_CHECK(var >= min && var <= max, name , " is out of bounds for ", dtype); \ + +#define WARN_OUT_OF_BOUNDS(var, name, digits, dtype) \ + if (var < -(1LL << digits) || var > (1LL << digits)) { \ + TORCH_WARN(name , " is out of bounds [-(2^", digits, "), 2^", digits, "]. ", \ + "Due to precision limitations ", dtype, " can support discrete uniform distribution only within this range. ", \ + "This warning will become an error in version 1.7 release, please fix the code in advance"); \ + } + +// Modified from ATen/native/DistributionTemplates.h +static void check_from_to_in_range(int64_t from, int64_t to_inc, ScalarType scalar_type) { + const auto dtype = scalarTypeToTypeMeta(scalar_type); + if (isFloatingType(scalar_type)) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, scalar_type, "check_random_fp_bounds", [&] { + const auto min = static_cast(std::numeric_limits::lowest()); + const auto max = static_cast(std::numeric_limits::max()); + CHECK_OUT_OF_BOUNDS(from, "from", min, max, dtype); + CHECK_OUT_OF_BOUNDS(to_inc, "to - 1", min, max, dtype); + + constexpr auto digits = std::numeric_limits::digits; + WARN_OUT_OF_BOUNDS(from, "from", digits, dtype); + WARN_OUT_OF_BOUNDS(to_inc, "to - 1", digits, dtype); + }); + } else if (isIntegralType(scalar_type, /*includeBool=*/true)) { + AT_DISPATCH_INTEGRAL_TYPES_AND(at::ScalarType::Bool, scalar_type, "check_random_integral_bounds", [&]() { + const auto min = static_cast(std::numeric_limits::lowest()); + const auto max = static_cast(std::numeric_limits::max()); + CHECK_OUT_OF_BOUNDS(from, "from", min, max, dtype); + CHECK_OUT_OF_BOUNDS(to_inc, "to - 1", min, max, dtype); + }); + } else { + TORCH_CHECK(false, "check_random_bounds handles only integral, floating-point and boolean types"); + } +} + + +// random_.from +Tensor& random_mps_ + (Tensor& self, + int64_t from, + optional to_opt, + c10::optional gen) { + + using namespace mps; + + MPSStream* stream = getCurrentMPSStream(); + uint64_t seed_ = c10::detail::getNonDeterministicRandom(true); + + auto input_dtype = self.scalar_type(); + + int64_t to; + + if(to_opt.has_value()) { + // [from, to) + to = *to_opt; + TORCH_CHECK(from < to, "random_mps_ expects 'from' to be less than 'to', but got from=", from, " >= to=", to); + if (isFloatingType(input_dtype)) { + // TODO: what is "random_update_from_to"? 
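+        // Note: "random_update_from_to" is just the label string handed to the AT_DISPATCH_*
+        // macro below (it only appears in dispatch error messages), mirroring the labels used
+        // by the CPU/CUDA paths in ATen's DistributionTemplates.h. update_from/update_to adjust
+        // the integer bounds so they remain valid once cast to the floating-point dtype, after
+        // which the [from, to) check is repeated.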
+ AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, input_dtype, "random_update_from_to", [&] { + from = templates::update_from(from); + to = templates::update_to(to); + TORCH_CHECK(from < to, "random_mps_ expects 'from' casted to dtype to be less than 'to' casted to dtype, but got from=", from, " >= to=", to); + }); + check_from_to_in_range(from, to - 1, input_dtype); + } + } + else if (from != std::numeric_limits::lowest()) { + // [from, std::numeric_limits::max()] + to = 0; + if(isFloatingType(input_dtype)) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, input_dtype, "random_from_to_range_calc", [&] { + constexpr int64_t scalar_t_max = static_cast(1) << std::numeric_limits::digits; + to = scalar_t_max > std::numeric_limits::max() ? std::numeric_limits::max() : static_cast(scalar_t_max); + from = templates::update_from(from); + TORCH_CHECK(from < to, "random_mps_ expects 'from' casted to dtype to be less than or equal to 'to' casted to dtype, but got from=", from, " > to=", to); + }); + } + else if(isIntegralType(input_dtype, /*includeBool=*/true)) { + AT_DISPATCH_INTEGRAL_TYPES_AND(at::ScalarType::Bool, input_dtype, "random_from_to_range_calc", [&] { + if (std::is_same::value) { + to = static_cast(true); + } else { + to = static_cast(std::numeric_limits::max()); + } + }); + } + else { + TORCH_CHECK(false, "random_mps_ handles only integral, floating-point and boolean types"); + } + check_from_to_in_range(from, to, input_dtype); + } + else { + // [std::numeric_limits::lowest(), std::numeric_limits::max()] + // range = 2^64 + + // TODO - how to implement this? + TORCH_CHECK(false, "random_mps_ currently does not handle the lowest() -> max() range"); + + } + + @autoreleasepool { + MPSShape* input_shape = getMPSShape(self); + + MPSGraph* mpsGraph = make_mps_graph(); + + MPSGraphRandomOpDescriptor* descriptor = [MPSGraphRandomOpDescriptor descriptorWithDistribution:MPSGraphRandomDistributionUniform + dataType:MPSDataTypeInt32]; + descriptor.minInteger = from; + descriptor.maxInteger = to - 1; + + // TODO: right now taking the default seed. Extend it to be extracted from the + // MPSGenerator + MPSGraphTensor* randomTensor = [mpsGraph randomTensorWithShape:input_shape + descriptor:descriptor + seed:seed_ + name:nil]; + + MPSGraphTensor* outputTensor = nil; + + if(input_dtype != ScalarType::Int) + outputTensor = [mpsGraph castTensor:randomTensor + toType:getMPSDataType(input_dtype) + name:@"outputTensor"]; + else + outputTensor = randomTensor; + + auto outputPlaceholder = Placeholder(outputTensor, self); + NSDictionary *feeds = nil; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, mpsGraph, feeds, results); + } + + return self; + +} + +Tensor& random_mps_ + (Tensor& self, + int64_t to, + c10::optional gen) { + + return random_mps_(self, 0, to, gen); +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/Indexing.mm b/aten/src/ATen/native/mps/operations/Indexing.mm new file mode 100644 index 000000000000..c313f0624b98 --- /dev/null +++ b/aten/src/ATen/native/mps/operations/Indexing.mm @@ -0,0 +1,330 @@ +// Copyright © 2022 Apple Inc. 
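The Distributions.mm kernels above all derive their outputs from MPSGraph's base random tensors with a few elementwise ops: uniform_ scales and shifts a [0, 1) sample by (to − from) and from, normal_ scales and shifts a standard-normal sample by std and mean, and bernoulli_ thresholds a [0, 1) sample against the probability tensor. The CPU stand-ins below illustrate those three constructions only; they use <random> rather than MPSGraph.

```cpp
// Scalar reference versions of the uniform_/normal_/bernoulli_ graph math.
#include <random>

float uniform_from_to(std::mt19937& gen, float from, float to) {
  std::uniform_real_distribution<float> u01(0.0f, 1.0f);
  return from + u01(gen) * (to - from);   // rand * delta + from
}

float normal_mean_std(std::mt19937& gen, float mean, float std_dev) {
  std::normal_distribution<float> n01(0.0f, 1.0f);
  return mean + std_dev * n01(gen);       // randn * std + mean
}

bool bernoulli_p(std::mt19937& gen, float p) {
  std::uniform_real_distribution<float> u01(0.0f, 1.0f);
  return u01(gen) < p;                    // uniform sample < probability
}
```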
+ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#endif + +namespace at { +namespace native { + +Tensor index_select_mps(const Tensor & self, + int64_t dim, + const Tensor & index) { + IntArrayRef input_shape = self.sizes(); + auto num_input_dims = input_shape.size(); + + IntArrayRef index_shape = index.sizes(); + auto num_indices = index.numel(); + TORCH_CHECK_INDEX(index.dim() <= 1, "index_select(): Index is supposed to be a vector"); + + dim = maybe_wrap_dim(dim, self.dim()); + int64_t* shape_data = (int64_t*)malloc(num_input_dims * sizeof(int64_t)); + + // Calculate new shape + for(int i = 0; i < num_input_dims; i++) { + if(i == dim) + shape_data[i] = num_indices; + else + shape_data[i] = input_shape[i]; + } + + IntArrayRef output_shape = IntArrayRef(shape_data, num_input_dims); + + Tensor result = at::native::empty_mps( + output_shape, + self.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + + free(shape_data); + + index_select_out_mps(self, dim, index, result); + return result; +} + +Tensor& index_select_out_mps(const Tensor & self, + int64_t dim, + const Tensor & index, + Tensor & output) { + + using namespace mps; + MPSStream* stream = getCurrentMPSStream(); + dim = maybe_wrap_dim(dim, self.dim()); + // Checks + TORCH_CHECK_INDEX(index.dim() <= 1, "index_select(): Index is supposed to be a vector"); + TORCH_CHECK(index.scalar_type() == ScalarType::Long || index.scalar_type() == ScalarType::Int, "index_select(): Expected dtype int32 or int64 for index"); + TORCH_CHECK(self.scalar_type() == output.scalar_type(), + "index_select(): self and output must have the same scalar type"); + TORCH_CHECK(dim == 0 || dim < self.dim(), + "index_select(): Indexing dim ", dim, " is out of bounds of tensor"); + + // Derive from MPSCachedGraph + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* indexTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + + string key = "index_select_out_mps" + getTensorsStringKey({self, index}) + ":" + std::to_string(dim); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + MPSGraphTensor* indexTensor = mpsGraphRankedPlaceHolder(mpsGraph, index); + + MPSGraphTensor* outputTensor = [mpsGraph gatherWithUpdatesTensor:inputTensor + indicesTensor:indexTensor + axis:dim + batchDimensions:0 + name:nil]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->indexTensor_ = indexTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder indexPlaceholder = Placeholder(cachedGraph->indexTensor_, index); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : 
selfPlaceholder.getMPSGraphTensorData(), + indexPlaceholder.getMPSGraphTensor() : indexPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + return output; + +} + +Tensor & masked_fill__mps(Tensor& self, const Tensor & mask, const Scalar& value) { + using namespace mps; + TORCH_CHECK(self.device() == mask.device(), "expected self and mask to be on the same device, but got mask on ", + mask.device(), " and self on ", self.device()); + TORCH_CHECK(mask.scalar_type() == kByte || mask.scalar_type() == kBool, + "expected mask dtype to be Bool but got ", mask.scalar_type()); + auto maybe_outnames = namedinference::broadcast_to_outnames(self, mask, "masked_fill_"); + + c10::MaybeOwned b_mask = expand_inplace(self, mask, "masked_fill_"); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *maskTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + @autoreleasepool { + string key = "masked_fill" + getTensorsStringKey({self, mask}) + ":" + std::to_string(value.toDouble()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + MPSGraphTensor* maskTensor = mpsGraphRankedPlaceHolder(mpsGraph, mask); + MPSDataType valueType = getMPSScalarType(value.type()); + + // constantWithScalar doesn't like Bool constants getting created so + // mapping them to int8 + if (valueType == MPSDataTypeBool) { + valueType = MPSDataTypeInt8; + } + MPSGraphTensor* valueTensor = [mpsGraph constantWithScalar:value.to() + dataType:valueType]; + valueTensor = [mpsGraph castTensor:valueTensor + toType:getMPSDataType(self.scalar_type()) + name : @"castTensorEq"]; + + MPSGraphTensor* outputTensor = [mpsGraph selectWithPredicateTensor:maskTensor + truePredicateTensor:valueTensor + falsePredicateTensor:inputTensor + name:nil]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->maskTensor_ = maskTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder maskPlaceholder = Placeholder(cachedGraph->maskTensor_, mask); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, self); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), + maskPlaceholder.getMPSGraphTensor() : maskPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } + namedinference::propagate_names_if_nonempty(self, maybe_outnames); + return self; +} + +Tensor embedding_dense_backward_mps( + const Tensor & grad_, const Tensor & indices, int64_t num_weights, + 
int64_t padding_idx, bool scale_grad_by_freq) +{ + // TODO: implement padding_idx & scale_grad_by_freq. + namespace native_mps = at::native::mps; + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *incomingGradTensor_ = nil; + MPSGraphTensor *indicesTensor_ = nil; + MPSGraphTensor *outgoingGradTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + IntArrayRef incoming_gradient_shape = grad_.sizes(); + int64_t num_incoming_gradient_dims = incoming_gradient_shape.size(); + + IntArrayRef indices_shape = indices.sizes(); + int64_t num_indices_dims = indices_shape.size(); + + int64_t* outgoing_gradient_shape = (int64_t *) malloc(sizeof(int64_t) * 2); + int64_t D = incoming_gradient_shape[num_incoming_gradient_dims - 1]; + outgoing_gradient_shape[0] = num_weights; + outgoing_gradient_shape[1] = D; + int64_t num_outgoing_gradient_dims = 2; + Tensor outgoing_gradient = at::native::empty_mps( + IntArrayRef(outgoing_gradient_shape, num_outgoing_gradient_dims), + grad_.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + + if (outgoing_gradient.numel() == 0) { + return outgoing_gradient; + } + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + string key = "edb_mps:" + native_mps::getMPSTypeString(grad_.scalar_type()) + ":indices" + std::to_string(num_indices_dims) + ":num_weights" + std::to_string(num_weights) + ":padding_idx" + std::to_string(padding_idx) + ":scaled" + std::to_string(scale_grad_by_freq); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + // Initialize once if configuration not found in cache + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* incomingGradTensor = native_mps::mpsGraphUnrankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(grad_.scalar_type())); + + MPSGraphTensor* indicesTensor = native_mps::mpsGraphUnrankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(indices.scalar_type())); + + MPSGraphTensor *reshapedIndicesTensor = [mpsGraph expandDimsOfTensor:indicesTensor + axes:@[@-1] + name:nil]; + + MPSGraphTensor *outgoingGradTensor; + outgoingGradTensor = [mpsGraph scatterNDWithUpdatesTensor:incomingGradTensor + indicesTensor:reshapedIndicesTensor + shape:native_mps::getMPSShape(IntArrayRef(outgoing_gradient_shape, num_outgoing_gradient_dims)) + batchDimensions:0 + mode:MPSGraphScatterModeAdd + name:@"edb"]; + + newCachedGraph->incomingGradTensor_ = incomingGradTensor; + newCachedGraph->indicesTensor_ = indicesTensor; + newCachedGraph->outgoingGradTensor_ = outgoingGradTensor; + + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + auto incomingGradPlaceholder = native_mps::Placeholder(cachedGraph->incomingGradTensor_, grad_); + auto indicesPlaceholder = native_mps::Placeholder(cachedGraph->indicesTensor_, indices); + auto outgoingGradPlaceholder = native_mps::Placeholder(cachedGraph->outgoingGradTensor_, outgoing_gradient); + + NSDictionary *feeds = @{ + incomingGradPlaceholder.getMPSGraphTensor() : incomingGradPlaceholder.getMPSGraphTensorData(), + indicesPlaceholder.getMPSGraphTensor() : indicesPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary *results = @{ + 
outgoingGradPlaceholder.getMPSGraphTensor() : outgoingGradPlaceholder.getMPSGraphTensorData() + }; + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + free(outgoing_gradient_shape); + return outgoing_gradient; +} + +Tensor & masked_fill__mps(Tensor& self, const Tensor & mask, const Tensor & value) { + TORCH_CHECK(value.dim() == 0, "masked_fill_ only supports a 0-dimensional value tensor, but got tensor " + "with ", value.dim(), " dimension(s)."); + return masked_fill__mps(self, mask, value.item()); +} + +} +} diff --git a/aten/src/ATen/native/mps/operations/Linear.mm b/aten/src/ATen/native/mps/operations/Linear.mm new file mode 100644 index 000000000000..d9cad62ee27c --- /dev/null +++ b/aten/src/ATen/native/mps/operations/Linear.mm @@ -0,0 +1,358 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#endif + +using namespace at::mps; + +namespace at { +namespace native { + +Tensor _mps_linear( + const Tensor& input, + const Tensor& weight, + const c10::optional& bias_opt) { + // wT = transpose(weight); + // y=x*wT+b + + using namespace mps; + + // See [Note: hacky wrapper removal for optional tensor] + auto bias = bias_opt.has_value() + ? c10::MaybeOwned::borrowed(*bias_opt) + : c10::MaybeOwned::owned(c10::in_place); + + auto input_size = input.sizes(); + std::vector output_size(input_size.begin(), input_size.end() - 1); + output_size.push_back(weight.size(0)); + Tensor output = at::native::empty_mps(output_size, + input.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + input.suggest_memory_format()); + + TORCH_CHECK(output.is_mps()); + + MPSStream *stream = getCurrentMPSStream(); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* weightTensor_ = nil; + MPSGraphTensor* biasTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + bool is_bias_defined = bias->defined(); + + @autoreleasepool { + + MPSShape* wt_shape = getMPSShape(weight); + string wt_key = string([[[wt_shape valueForKey:@"description"] componentsJoinedByString:@","] UTF8String]); + MPSShape* bias_shape = nil; + string bias_key = "nobias"; + if(is_bias_defined) { + bias_key = "bias"; + } + + string key = "mps_linear" + getTensorsStringKey({input, weight}) + ":" + bias_key; + + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + @autoreleasepool { + + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input); + MPSGraphTensor* weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight); + MPSGraphTensor* biasTensor = nil; + + if(is_bias_defined) { + biasTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType((*bias).scalar_type())); + } + + MPSGraphTensor* weightTransposeTensor = [mpsGraph transposeTensor:weightTensor + dimension:-1 + withDimension:-2 + name:nil]; + + MPSGraphTensor* outputTensor = nil; + + if (!is_bias_defined) + { + outputTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:inputTensor + secondaryTensor:weightTransposeTensor + name:nil]; + } + else + { + MPSGraphTensor* xMulWTTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:inputTensor + 
secondaryTensor:weightTransposeTensor + name:nil]; + outputTensor = [mpsGraph additionWithPrimaryTensor:xMulWTTensor + secondaryTensor:biasTensor + name:nil]; + } + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->weightTensor_ = weightTensor; + newCachedGraph->biasTensor_ = biasTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input); + Placeholder weightPlaceholder = Placeholder(cachedGraph->weightTensor_, weight); + Placeholder biasPlaceholder = Placeholder(); + if(is_bias_defined) + biasPlaceholder = Placeholder(cachedGraph->biasTensor_, *bias); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + NSMutableDictionary* feeds =[NSMutableDictionary dictionary]; + feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData(); + feeds[weightPlaceholder.getMPSGraphTensor()] = weightPlaceholder.getMPSGraphTensorData(); + if (is_bias_defined) + feeds[biasPlaceholder.getMPSGraphTensor()] = biasPlaceholder.getMPSGraphTensorData(); + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + return output; +} + +Tensor _mps_linear_backward_input( + IntArrayRef input_size, + const Tensor & grad_output, + const Tensor & weight) +{ + TORCH_CHECK(grad_output.is_mps(), + "mps_linear_backward: grad_output needs to be mps layout"); + TORCH_CHECK(weight.device().is_mps() && weight.scalar_type() == kFloat, + "mps_linear_backward: weight needs to be a dense tensor"); + + const Tensor weight_reshaped = weight.is_contiguous() ? 
weight : weight.contiguous(); + + struct CachedGraph : public mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *weightTensor_ = nil; + MPSGraphTensor *gradOutputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + Tensor output = at::native::empty_mps(input_size, + grad_output.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + grad_output.suggest_memory_format()); + TORCH_CHECK(output.is_mps()); + + mps::MPSGraphCache *cache_ = mps::MPSGraphCache::getInstance(); + + MPSStream *stream= getCurrentMPSStream(); + + @autoreleasepool { + + string key = "mps_linear_backward_input" + mps::getTensorsStringKey({grad_output, weight_reshaped}); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ mps::MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph *mpsGraph = mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor *weightTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, weight_reshaped); + MPSGraphTensor *gradOutputTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, grad_output); + + MPSGraphTensor *outputTensor = + [mpsGraph matrixMultiplicationWithPrimaryTensor: gradOutputTensor + secondaryTensor: weightTensor + name: nil]; + + newCachedGraph->weightTensor_ = weightTensor; + newCachedGraph->gradOutputTensor_ = gradOutputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + mps::Placeholder weightPlaceholder = mps::Placeholder(cachedGraph->weightTensor_, weight_reshaped); + mps::Placeholder gradOutputPlaceholder = mps::Placeholder(cachedGraph->gradOutputTensor_, grad_output); + mps::Placeholder outputPlaceholder = mps::Placeholder(cachedGraph->outputTensor_, output); + + NSDictionary* feeds = @{ + weightPlaceholder.getMPSGraphTensor() : weightPlaceholder.getMPSGraphTensorData(), + gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + return output; + } +} + +std::tuple _mps_linear_backward_weights( + const Tensor& grad_output, const Tensor& input, const Tensor& weight, bool bias_defined) +{ + TORCH_CHECK(grad_output.is_mps() && input.is_mps(), + "_mps_linear_backward: grad_output and input needs to be mps layout"); + + struct CachedGraph : public mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *weightTensor_ = nil; + MPSGraphTensor *gradOutputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + MPSGraphTensor *biasTensor_ = nil; + }; + + auto grad_output_reshaped = grad_output.dim() > 2 ? + grad_output.reshape({-1, grad_output.size(grad_output.dim() - 1)}) : grad_output; + auto input_reshaped = input.dim() > 2 ? 
input.reshape({-1, input.size(input.dim() - 1)}) : input; + + TORCH_CHECK(grad_output_reshaped.is_mps()); + TORCH_CHECK(input_reshaped.is_mps()); + + Tensor output = at::native::empty_mps({grad_output_reshaped.size(1), input_reshaped.size(1)}, + grad_output.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + grad_output.suggest_memory_format()); + Tensor bias = at::native::empty_mps({grad_output_reshaped.size(1)}, + grad_output.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + grad_output.suggest_memory_format()); + TORCH_CHECK(output.is_mps()); + TORCH_CHECK(bias.is_mps()); + + mps::MPSGraphCache *cache_ = mps::MPSGraphCache::getInstance(); + + MPSStream *stream= getCurrentMPSStream(); + + @autoreleasepool { + + string key = "mps_linear_backward_weights:" + to_string(bias_defined) + ":" + + mps::getTensorsStringKey({input_reshaped, weight, grad_output_reshaped}); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ mps::MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph *mpsGraph = mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor *inputTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, input_reshaped); + MPSGraphTensor *weightTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, weight); + MPSGraphTensor *gradOutputTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, grad_output_reshaped); + + MPSGraphTensor *gradOutputTransposeTensor = + [mpsGraph transposeTensor: gradOutputTensor + dimension: -1 + withDimension: -2 + name: nil]; + + // grad_weight + MPSGraphTensor *outputTensor = + [mpsGraph matrixMultiplicationWithPrimaryTensor: gradOutputTransposeTensor + secondaryTensor: inputTensor + name: nil]; + MPSGraphTensor *biasTensor = nil; + if (bias_defined) + { + // grad_bias + biasTensor = [mpsGraph reductionSumWithTensor: gradOutputTensor + axis: 0 + name: nil]; + + } + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->weightTensor_ = weightTensor; + newCachedGraph->gradOutputTensor_ = gradOutputTensor; + newCachedGraph->outputTensor_ = outputTensor; + newCachedGraph->biasTensor_ = biasTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + mps::Placeholder inputPlaceholder = mps::Placeholder(cachedGraph->inputTensor_, input_reshaped); + mps::Placeholder weightPlaceholder = mps::Placeholder(cachedGraph->weightTensor_, weight); + mps::Placeholder gradOutputPlaceholder = mps::Placeholder(cachedGraph->gradOutputTensor_, grad_output_reshaped); + mps::Placeholder outputPlaceholder = mps::Placeholder(cachedGraph->outputTensor_, output); + mps::Placeholder biasPlaceholder = mps::Placeholder(cachedGraph->biasTensor_, bias); + + NSDictionary* feeds = @{ + gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + weightPlaceholder.getMPSGraphTensor() : weightPlaceholder.getMPSGraphTensorData() + }; + + NSMutableDictionary* results = [NSMutableDictionary dictionary]; + results[outputPlaceholder.getMPSGraphTensor()] = outputPlaceholder.getMPSGraphTensorData(); + if (bias_defined) + results[biasPlaceholder.getMPSGraphTensor()] = biasPlaceholder.getMPSGraphTensorData(); + + mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + return std::tuple{ output, bias }; + } +} + + +std::tuple mps_linear_backward( + const Tensor& input, const 
Tensor& grad_output, + const Tensor& weight, std::array output_mask) { + Tensor grad_input, grad_weight, grad_bias; + if (output_mask[0]) { + grad_input = at::_mps_linear_backward_input(input.sizes(), grad_output, weight); + } + if (output_mask[1] || output_mask[2]) { + std::tie(grad_weight, grad_bias) = at::_mps_linear_backward_weights(grad_output, input, weight, output_mask[2]); + } + return std::tuple{grad_input, grad_weight, grad_bias}; +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/LinearAlgebra.mm b/aten/src/ATen/native/mps/operations/LinearAlgebra.mm new file mode 100644 index 000000000000..3b02567a2236 --- /dev/null +++ b/aten/src/ATen/native/mps/operations/LinearAlgebra.mm @@ -0,0 +1,598 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#endif + + +namespace at { +namespace native { + +/* + * Helper functions to be used for mm/addmm for detecting the Transpositions + * when doing Batched GEMM operations. + */ + +static Tensor prepare_batch_matrix_by_transposing(const Tensor& tensor, + bool& transpose_tensor, + int64_t& ld_tensor, + bool transpose_result, + int64_t m, int64_t n) { + IntArrayRef tensor_strides = tensor.strides(); + Tensor tensor_; + int fast_dim = transpose_result ? 2 : 1; + int leading_dim = transpose_result ? 1 : 2; + + if (tensor_strides[fast_dim] == 1 && + (tensor_strides[leading_dim] >= std::max(1, m))) { + transpose_tensor = false; + tensor_ = tensor; + ld_tensor = tensor_strides[leading_dim]; + } else if ((tensor_strides[leading_dim] == 1) && + (tensor_strides[fast_dim] >= std::max(1, n))) { + transpose_tensor = true; + tensor_ = tensor; + ld_tensor = tensor_strides[fast_dim]; + } else { + transpose_tensor = !transpose_result; + // gemm call requires leading dimension and stride parameters to be non-zero + bool is_stride_non_zero = tensor.stride(1) != 0 && tensor.stride(2) != 0; + if (tensor.is_contiguous() && is_stride_non_zero) { + tensor_ = tensor; + } else { + tensor_ = tensor.clone(at::MemoryFormat::Contiguous); + } + ld_tensor = tensor_.stride(1); + } + + return tensor_; +} + +/* + * Helper functions to be used for mm/addmm for detecting the Transpositions + * when doing GEMM operations. 
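+ * prepare_matrices_for_broadcasting below only decides *whether* a transpose is
+ * needed: if mat1 and mat2 arrive with identical 2-D sizes, mat2 is flagged for
+ * transposition so the inner dimensions line up, and for addmm a bias with
+ * non-zero beta whose shape matches the transposed product flags the beta*bias
+ * term for transposition instead. The resulting flags are consumed by
+ * mm_out_mps_impl / addmm_out_mps_impl when they build the MPSGraph
+ * matrixMultiplication nodes.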
+ */ +void prepare_matrices_for_broadcasting( + const Tensor * bias, + const Tensor & self, + const Tensor & other, + const Scalar * beta, + bool * transpose_mat1_times_mat2, + bool & transpose_mat1, + bool & transpose_mat2) { + TORCH_CHECK(self.dim() == 2 && other.dim() == 2, "tensors must be 2-D"); + if (bias && beta->toDouble() != 0.0f) { + TORCH_CHECK(bias->dim() == 2, "tensors must be 2-D"); + } + + std::pair mat1_sizes; + std::pair mat2_sizes; + + mat1_sizes = std::make_pair(self.sizes()[0], self.sizes()[1]); + mat2_sizes = std::make_pair(other.sizes()[0], other.sizes()[1]); + + if (mat1_sizes == mat2_sizes) { + transpose_mat2 = true; + std::swap(mat2_sizes.first, mat2_sizes.second); + } + if (bias && beta && transpose_mat1_times_mat2) { + if (beta->toDouble() != 0.0f && mat1_sizes.first == bias->sizes()[1] && mat2_sizes.second == bias->sizes()[0]) + *transpose_mat1_times_mat2 = true; + } +} + +enum LinearAlgebraOpType { + ADDBMM_OP_TYPE, + BADDBMM_OP_TYPE +}; + +Tensor& mm_out_mps_impl( + const Tensor& self, + const Tensor& other, + Tensor& output) { + using namespace mps; + TORCH_CHECK(self.dim() == 2 && other.dim() == 2, "tensors must be 2-D"); + + TensorArg args[]{{output, "out", 0}, {self, "mat1", 1}, {other, "mat2", 2}}; + checkAllSameGPU("mm", args); + + TORCH_CHECK(output.is_mps()); + + // Transpose inputs if needed + IntArrayRef output_sizes = output.sizes(); + if ((output_sizes[0] == 0) || (output_sizes[1] == 0)) { + return output; + } + + struct CachedGraph : public mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *selfTensor_ = nil; + MPSGraphTensor *otherTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSStream* stream = getCurrentMPSStream(); + + bool transpose_mat1 = false; + bool transpose_mat2 = false; + + prepare_matrices_for_broadcasting(NULL, self, other, NULL, NULL, transpose_mat1, transpose_mat2); + + mps::MPSGraphCache *cache_ = mps::MPSGraphCache::getInstance(); + + @autoreleasepool { + + string key = "mm_out_mps_impl" + getTensorsStringKey({self, other}) + + ":" + to_string(transpose_mat1) + ":" + to_string(transpose_mat2); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + + mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ mps::MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool{ + MPSGraph *mpsGraph = mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor *selfTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, self); + MPSGraphTensor *otherTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, other); + + MPSGraphTensor* t1 = nil; + MPSGraphTensor* t2 = nil; + + if(transpose_mat1) + t1 = [mpsGraph transposeTensor:selfTensor + dimension:-1 + withDimension:-2 + name:nil]; + else + t1 = selfTensor; + + if(transpose_mat2) + t2 = [mpsGraph transposeTensor:otherTensor + dimension:-1 + withDimension:-2 + name:nil]; + else + t2 = otherTensor; + + MPSGraphTensor* outputTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:t1 + secondaryTensor:t2 + name:nil]; + + newCachedGraph->selfTensor_ = selfTensor; + newCachedGraph->otherTensor_ = otherTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + Placeholder selfPlaceholder = Placeholder(cachedGraph->selfTensor_, self); + Placeholder otherPlaceholder = Placeholder(cachedGraph->otherTensor_, other); + Placeholder outputPlaceholder = 
Placeholder(cachedGraph->outputTensor_, output); + + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), + otherPlaceholder.getMPSGraphTensor() : otherPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + return output; +} + +Tensor& addmm_out_mps_impl( + const Tensor& bias, + const Tensor& self, // input + const Tensor& other, // weight + const Scalar& beta, + const Scalar& alpha, + Tensor& output) { + using namespace mps; + + TORCH_CHECK(output.is_mps()); + TORCH_CHECK(self.dim() == 2 && other.dim() == 2, "tensors must be 2-D"); + + TensorArg args[]{{output, "out", 0}, {bias, "self", 1}, {self, "mat1", 2}, {other, "mat2", 3}}; + checkAllSameGPU(__func__, args); + + IntArrayRef mat1_sizes = self.sizes(); + IntArrayRef mat2_sizes = other.sizes(); + IntArrayRef bias_sizes; + c10::MaybeOwned bias_; + if (&output != &bias) { + bias_ = expand_size(bias, {mat1_sizes[0], mat2_sizes[1]}, "addmm"); + bias_sizes = bias_->sizes(); + } else { + bias_ = c10::MaybeOwned::borrowed(bias); + bias_sizes = bias_->sizes(); + TORCH_CHECK(output.dim() == 2, "tensors must be 2-D"); + TORCH_CHECK(bias_sizes[0] == mat1_sizes[0], "self_ dim 0 must match mat1 dim 0"); + TORCH_CHECK(bias_sizes[1] == mat2_sizes[1], "self_ dim 1 must match mat2 dim 1"); + } + + if (&output != &self) { + output.resize_(bias_sizes); + if (beta.toComplexDouble() != 0.0) { + at::native::copy_(output, *bias_); + } + } + IntArrayRef output_sizes = output.sizes(); + if ((output_sizes[0] == 0) || (output_sizes[1] == 0)) { + return output; + } + + MPSStream* stream = getCurrentMPSStream(); + + MPSGraph* mpsGraph = make_mps_graph(); + + bool transpose_mat1_times_mat2 = false; + bool transpose_mat1 = false; + bool transpose_mat2 = false; + + prepare_matrices_for_broadcasting(&bias, self, other, &beta, &transpose_mat1_times_mat2, transpose_mat1, transpose_mat2); + + struct CachedGraph : public mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *selfTensor_ = nil; + MPSGraphTensor *otherTensor_ = nil; + MPSGraphTensor *biasTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + mps::MPSGraphCache *cache_ = mps::MPSGraphCache::getInstance(); + + @autoreleasepool { + string key = "addmm_out_mps_impl" + getTensorsStringKey({self, other, bias}) + + ":" + to_string(transpose_mat1) + ":" + to_string(transpose_mat2) + + ":" + to_string(beta.toDouble()) + + ":" + to_string(alpha.toDouble()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + + mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ mps::MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool{ + MPSGraph *mpsGraph = mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor *selfTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, self); + MPSGraphTensor *otherTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, other); + MPSGraphTensor *biasTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, bias); + + MPSGraphTensor* t1 = nil; + MPSGraphTensor* t2 = nil; + + if(transpose_mat1) + t1 = [mpsGraph transposeTensor:selfTensor + dimension:-1 + withDimension:-2 + name:nil]; + else + t1 = selfTensor; + + if(transpose_mat2) + t2 = [mpsGraph transposeTensor:otherTensor + dimension:-1 + withDimension:-2 + name:nil]; + 
else + t2 = otherTensor; + + + // TODO: Use alpha and beta here with fill_.Scalar and mul + // Intermediate as placeholder + MPSGraphTensor* productTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:t1 + secondaryTensor:t2 + name:@"MM/(mat1@mat2)"]; + + // Intermediates for beta and alpha + MPSGraphTensor* betaTensor = [mpsGraph constantWithScalar:beta.toDouble() + dataType:getMPSScalarType(bias.scalar_type())]; + MPSGraphTensor* alphaTensor = [mpsGraph constantWithScalar:alpha.toDouble() + dataType:getMPSScalarType(self.scalar_type())]; + + // Intermediates for multiplying by beta and alpha + MPSGraphTensor* productTimesAlphaTensor = [mpsGraph multiplicationWithPrimaryTensor:productTensor + secondaryTensor:alphaTensor + name:@"MM/alpha*(mat1@mat2)"]; + MPSGraphTensor* biasTimesBetaTensor = [mpsGraph multiplicationWithPrimaryTensor:biasTensor + secondaryTensor:betaTensor + name:@"MM/beta*input"]; + + if (transpose_mat1_times_mat2) + biasTimesBetaTensor = [mpsGraph transposeTensor: biasTimesBetaTensor + dimension: -1 + withDimension: -2 + name: nil]; + + MPSGraphTensor* outputTensor = [mpsGraph additionWithPrimaryTensor:productTimesAlphaTensor + secondaryTensor:biasTimesBetaTensor + name:@"MM/beta*input + alpha*(mat1@mat2)"]; + + newCachedGraph->selfTensor_ = selfTensor; + newCachedGraph->otherTensor_ = otherTensor; + newCachedGraph->biasTensor_ = biasTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + Placeholder selfPlaceholder = Placeholder(cachedGraph->selfTensor_, self); + Placeholder otherPlaceholder = Placeholder(cachedGraph->otherTensor_, other); + Placeholder biasPlaceholder = Placeholder(cachedGraph->biasTensor_, bias); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), + otherPlaceholder.getMPSGraphTensor() : otherPlaceholder.getMPSGraphTensorData(), + biasPlaceholder.getMPSGraphTensor() : biasPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + return output; +} + + +Tensor& bmm_out_mps_impl( + const Tensor & batch1, + const Tensor & batch2, + Tensor & result) { + using namespace mps; + + if (batch1.numel() == 0 || batch2.numel() == 0) { + return result; + } + + MPSStream* stream = getCurrentMPSStream(); + + struct CachedGraph : public mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *batch1Tensor_ = nil; + MPSGraphTensor *batch2Tensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + mps::MPSGraphCache *cache_ = mps::MPSGraphCache::getInstance(); + + @autoreleasepool { + string key = "bmm_out_mps_impl" + getTensorsStringKey({batch1, batch2}); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + + mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ mps::MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool{ + MPSGraph *mpsGraph = mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor *batch1Tensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, batch1); + MPSGraphTensor *batch2Tensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, batch2); + + MPSGraphTensor* productTensor = [mpsGraph 
matrixMultiplicationWithPrimaryTensor:batch1Tensor + secondaryTensor:batch2Tensor + name:@"MM/(batch1@batch2)"]; + + newCachedGraph->batch1Tensor_ = batch1Tensor; + newCachedGraph->batch2Tensor_ = batch2Tensor; + newCachedGraph->outputTensor_ = productTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + Placeholder batch1Placeholder = Placeholder(cachedGraph->batch1Tensor_, batch1); + Placeholder batch2Placeholder = Placeholder(cachedGraph->batch2Tensor_, batch2); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result); + + NSDictionary* feeds = @{ + batch1Placeholder.getMPSGraphTensor() : batch1Placeholder.getMPSGraphTensorData(), + batch2Placeholder.getMPSGraphTensor() : batch2Placeholder.getMPSGraphTensorData(), + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + return result; +} + +Tensor& addbmm_or_baddbmm_out_mps_impl( + const Tensor & input, + const Tensor & batch1, + const Tensor & batch2, + const Scalar & beta, + const Scalar & alpha, + Tensor & result, + LinearAlgebraOpType opType) { + using namespace mps; + + TORCH_CHECK(input.is_mps()); + TORCH_CHECK(batch1.is_mps()); + TORCH_CHECK(batch2.is_mps()); + TORCH_CHECK(result.is_mps()); + + TORCH_CHECK(batch1.dim() == 3, "batch1 must be a 3D tensor"); + TORCH_CHECK(batch2.dim() == 3, "batch2 must be a 3D tensor"); + TORCH_CHECK(batch1.size(0) == batch2.size(0), + "batch1 and batch2 must have same number of batches, got ", + batch1.size(0), " and ", batch2.size(0)); + TORCH_CHECK(batch1.size(2) == batch2.size(1), + "Incompatible matrix sizes for bmm (", + batch1.size(1), "x", batch1.size(2), " and ", + batch2.size(1), "x", batch2.size(2), ")"); + + const int64_t dim1 = batch1.size(1); + const int64_t dim2 = batch2.size(2); + TORCH_CHECK(input.size(0) == dim1 && input.size(1) == dim2, + "input tensor does not match matmul output shape"); + + if (opType == ADDBMM_OP_TYPE) + { + result.resize_as_(input); + + const int64_t num_batches = batch1.size(0); + + if (num_batches == 0) { + result.zero_(); + return result; + } + } + + MPSStream* stream = getCurrentMPSStream(); + + struct CachedGraph : public mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *batch1Tensor_ = nil; + MPSGraphTensor *batch2Tensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + mps::MPSGraphCache *cache_ = mps::MPSGraphCache::getInstance(); + + @autoreleasepool { + string key = (opType == ADDBMM_OP_TYPE) ? 
("addbmm_out_mps_impl") : ("baddbmm_out_mps_impl"); + key += getTensorsStringKey({batch1, batch2, input}) + + ":" + to_string(beta.toDouble()) + + ":" + to_string(alpha.toDouble()); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + + mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ mps::MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool{ + MPSGraph *mpsGraph = mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor *inputTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, input); + MPSGraphTensor *batch1Tensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, batch1); + MPSGraphTensor *batch2Tensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, batch2); + + // Intermediates for beta and alpha + MPSGraphTensor* betaTensor = [mpsGraph constantWithScalar: beta.toDouble() + dataType: getMPSScalarType(input.scalar_type())]; + MPSGraphTensor* alphaTensor = [mpsGraph constantWithScalar: alpha.toDouble() + dataType: getMPSScalarType(batch1.scalar_type())]; + + MPSGraphTensor* productTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:batch1Tensor + secondaryTensor:batch2Tensor + name:@"(batch1@batch2)"]; + + MPSGraphTensor* reductionSumTensor = productTensor; + if (opType == ADDBMM_OP_TYPE) { + reductionSumTensor = [mpsGraph reductionSumWithTensor: productTensor + axis: 0 + name: @"reductionSum(batch1@batch2)"]; + } + + // Intermediates for multiplying by beta and alpha + MPSGraphTensor* reductionSumTimesAlphaTensor = [mpsGraph multiplicationWithPrimaryTensor: reductionSumTensor + secondaryTensor: alphaTensor + name: @"alpha*(batch1@batch2)"]; + MPSGraphTensor* biasTimesBetaTensor = [mpsGraph multiplicationWithPrimaryTensor: inputTensor + secondaryTensor: betaTensor + name: @"beta*input"]; + + MPSGraphTensor* outputTensor = [mpsGraph additionWithPrimaryTensor:reductionSumTimesAlphaTensor + secondaryTensor:biasTimesBetaTensor + name:@"beta*input + alpha*(batch1@batch2)"]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->batch1Tensor_ = batch1Tensor; + newCachedGraph->batch2Tensor_ = batch2Tensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input); + Placeholder batch1Placeholder = Placeholder(cachedGraph->batch1Tensor_, batch1); + Placeholder batch2Placeholder = Placeholder(cachedGraph->batch2Tensor_, batch2); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result); + + NSDictionary* feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + batch1Placeholder.getMPSGraphTensor() : batch1Placeholder.getMPSGraphTensorData(), + batch2Placeholder.getMPSGraphTensor() : batch2Placeholder.getMPSGraphTensorData(), + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + return result; +} + +TORCH_IMPL_FUNC(mm_out_mps)(const Tensor& self, const Tensor& mat2, const Tensor& result) { + mm_out_mps_impl(self, mat2, const_cast(result)); +} + +TORCH_IMPL_FUNC(addmm_out_mps)(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha, const Tensor& result) { + addmm_out_mps_impl(self, mat1, mat2, beta, alpha, const_cast(result)); +} + +TORCH_IMPL_FUNC(bmm_out_mps) (const Tensor & batch1, 
const Tensor & batch2, const Tensor & result) { + bmm_out_mps_impl(batch1, batch2, const_cast(result)); +} + +TORCH_IMPL_FUNC(baddbmm_out_mps) (const Tensor & self, const Tensor & batch1, const Tensor & batch2, const Scalar& beta, const Scalar& alpha, const Tensor& result) { + addbmm_or_baddbmm_out_mps_impl(self, batch1, batch2, beta, alpha, const_cast(result), BADDBMM_OP_TYPE); +} + +Tensor& addbmm_out_mps(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, Tensor& result) { + auto b_self = expand_size(self, {batch1.size(1), batch2.size(2)}, "addbmm_out"); + + addbmm_or_baddbmm_out_mps_impl(*b_self, batch1, batch2, beta, alpha, result, ADDBMM_OP_TYPE); + return result; +} + +Tensor addbmm_mps(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha) { + Tensor result = at::empty({0}, self.options()); + return addbmm_out_mps(self, batch1, batch2, beta, alpha, result); +} + +Tensor &addbmm_mps_(Tensor& self, const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha) { + return addbmm_out_mps(self, batch1, batch2, beta, alpha, self); +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/LossOps.mm b/aten/src/ATen/native/mps/operations/LossOps.mm new file mode 100644 index 000000000000..35202fd70a5f --- /dev/null +++ b/aten/src/ATen/native/mps/operations/LossOps.mm @@ -0,0 +1,1379 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#endif + +namespace at { +namespace native { +namespace mps { + +string reductionToString(int64_t reduction) +{ + switch(reduction) { + case Reduction::Mean: return "Mean"; + case Reduction::Sum: return "Sum"; + default: return "None"; + } +} + +MPSGraphTensor* reduceTensor(MPSGraphTensor *tensor, int64_t reduction, MPSGraph *mpsGraph, NSUInteger axesCount) +{ + NSMutableArray *axes = [NSMutableArray arrayWithCapacity:axesCount]; + for (NSUInteger i = 0; i < axesCount; i++) axes[i] = @(i); + + switch(reduction) { + case Reduction::Mean: + return [mpsGraph meanOfTensor: tensor axes: axes name: @"reductionMeanTensor"]; + case Reduction::Sum: + return [mpsGraph reductionSumWithTensor: tensor axes: axes name: @"reductionSumTensor"]; + default: + assert(reduction == Reduction::None); + return tensor; + } +} + +// MSELoss +void mse_loss_out_impl(const Tensor& input, const Tensor& target, + int64_t reduction, const Tensor& output, const string op_name) +{ +} + +Tensor& mse_loss_backward_out_impl(const Tensor& grad_output, const Tensor& input, const Tensor& target, + int64_t reduction, Tensor& grad_input, const string op_name) +{ + TORCH_CHECK(target.is_same_size(input), op_name + ": target and input tensors must have identical shapes") + auto norm = reduction == Reduction::Mean ? 2. 
/ static_cast(input.numel()) : 2.; + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor = nil, *targetTensor = nil; + MPSGraphTensor *gradInputTensor = nil, *gradOutputTensor = nil; + }; + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + string key = op_name + reductionToString(reduction) + ":" + + to_string(grad_input.sizes()[1]) + + getTensorsStringKey({input, target, grad_output}); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + cachedGraph = static_cast(cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input); + newCachedGraph->targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, target); + newCachedGraph->gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output); + + MPSGraphTensor *normTensor = [mpsGraph constantWithScalar: norm + dataType: MPSDataTypeFloat32]; + MPSGraphTensor *diffTensor = [mpsGraph subtractionWithPrimaryTensor: newCachedGraph->inputTensor + secondaryTensor: newCachedGraph->targetTensor + name: nil]; + MPSGraphTensor *diffGradientTensor = [mpsGraph multiplicationWithPrimaryTensor: diffTensor + secondaryTensor: newCachedGraph->gradOutputTensor + name: nil]; + newCachedGraph->gradInputTensor = [mpsGraph multiplicationWithPrimaryTensor: diffGradientTensor + secondaryTensor: normTensor + name: nil]; + } + return newCachedGraph; + })); + } + Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor, input); + Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor, target); + Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor, grad_input); + Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor, grad_output); + + NSDictionary* feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + targetPlaceholder.getMPSGraphTensor() : targetPlaceholder.getMPSGraphTensorData(), + gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + gradInputPlaceholder.getMPSGraphTensor() :gradInputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + } + + return grad_input; +} + +// namespace to localize the CachedGraph struct for Binary Cross Entropy +namespace BCELoss +{ + +struct CachedGraph : public MPSCachedGraph +{ + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor = nil, *targetTensor = nil; + // gradOutput only used on backward pass + MPSGraphTensor *weightTensor = nil, *gradOutputTensor = nil; + // lossTensor used for forward, and gradInputTensor for backward pass + union { MPSGraphTensor *lossTensor = nil; MPSGraphTensor *gradInputTensor; }; +}; + +MPSGraphTensor* bce_forward_mps(CachedGraph *bceGraph) +{ + MPSGraph *mpsGraph = bceGraph->graph(); + + // Forward BCE: L = -w (y ln(x) + (1-y) ln(1-x)) + MPSGraphTensor *one = [mpsGraph constantWithScalar: 1.0 + dataType: MPSDataTypeFloat32]; + // -100 is the hard limit value defined in BCELoss Spec. 
to clamp the log + MPSGraphTensor *neg100 = [mpsGraph constantWithScalar: -100.0 + dataType: MPSDataTypeFloat32]; + // 1 - x + MPSGraphTensor *one_Input = [mpsGraph subtractionWithPrimaryTensor: one + secondaryTensor: bceGraph->inputTensor + name: nil]; + // log(x) + MPSGraphTensor *logInput = [mpsGraph logarithmWithTensor: bceGraph->inputTensor + name: nil]; + // max(log(x), -100) + MPSGraphTensor *clampedLogInput = [mpsGraph maximumWithPrimaryTensor: logInput + secondaryTensor: neg100 + name: nil]; + // log(1 - x) + MPSGraphTensor *log1_Input = [mpsGraph logarithmWithTensor: one_Input + name: nil]; + // max(log(1 - x), -100) + MPSGraphTensor *clampedLog1_Input = [mpsGraph maximumWithPrimaryTensor: log1_Input + secondaryTensor: neg100 + name: nil]; + // (y - 1) resulted from -(1 - y) + MPSGraphTensor *target_1 = [mpsGraph subtractionWithPrimaryTensor: bceGraph->targetTensor + secondaryTensor: one + name: nil]; + // (y - 1) * max(log(1 - x), -100) + MPSGraphTensor *target_1TimesLog1_Input = [mpsGraph multiplicationWithPrimaryTensor: target_1 + secondaryTensor: clampedLog1_Input + name: nil]; + // y * max(log(x), -100) + MPSGraphTensor *targetTimesLogInput = [mpsGraph multiplicationWithPrimaryTensor: bceGraph->targetTensor + secondaryTensor: clampedLogInput + name: nil]; + // ((y - 1) * max(log(1 - x), -100)) - (y * max(log(x), -100)) + MPSGraphTensor *bceLoss = [mpsGraph subtractionWithPrimaryTensor: target_1TimesLog1_Input + secondaryTensor: targetTimesLogInput + name: nil]; + return bceLoss; +} + +MPSGraphTensor* bce_backward_mps(CachedGraph *bceGraph) +{ + MPSGraph *mpsGraph = bceGraph->graph(); + + // Backward BCE: d(L)/d(x) = -w (y - x) / (x - x^2) + MPSGraphTensor *one = [mpsGraph constantWithScalar: 1.0 + dataType: MPSDataTypeFloat32]; + // epsilon used to clamp the grad input denominator + MPSGraphTensor *epsilon = [mpsGraph constantWithScalar: 1e-12 + dataType: MPSDataTypeFloat32]; + // 1 - x + MPSGraphTensor *one_Input = [mpsGraph subtractionWithPrimaryTensor: one + secondaryTensor: bceGraph->inputTensor + name: nil]; + // x * (1 - x) + MPSGraphTensor *inputTimes1_Input = [mpsGraph multiplicationWithPrimaryTensor: bceGraph->inputTensor + secondaryTensor: one_Input + name: nil]; + // max(x * (1 - x), epsilon) + MPSGraphTensor *gradInputDenominator = [mpsGraph maximumWithPrimaryTensor: inputTimes1_Input + secondaryTensor: epsilon + name: nil]; + // (x - y) + MPSGraphTensor *input_target = [mpsGraph subtractionWithPrimaryTensor: bceGraph->inputTensor + secondaryTensor: bceGraph->targetTensor + name: nil]; + // (x - y) / max(x * (1 - x), epsilon) + MPSGraphTensor *inputDivGradInputDenom = [mpsGraph divisionWithPrimaryTensor: input_target + secondaryTensor: gradInputDenominator + name: nil]; + // gradOutput * (((x - y) / max(x * (1 - x), epsilon))) + MPSGraphTensor *gradInput = [mpsGraph multiplicationWithPrimaryTensor: bceGraph->gradOutputTensor + secondaryTensor: inputDivGradInputDenom + name: nil]; + return gradInput; +} + +// Binary Cross Enropy (Forward/Backward BCELoss) +// NOTE: "loss" tensor would be "grad_input" if it's a backward pass +Tensor& bce_loss_out_impl(const Tensor& input, const Tensor& target, + const c10::optional& weight_opt, int64_t reduction, Tensor& loss, + const c10::optional& grad_output_opt, const string op_name) +{ + // TODO: add sanity check for the elements of input tensor to be within [0..1] + TORCH_CHECK(target.is_same_size(input), op_name + ": target and input tensors must have identical shapes") + + c10::MaybeOwned weight_maybe_owned = 
at::borrow_from_optional_tensor(weight_opt); + c10::MaybeOwned grad_output_maybe_owned = at::borrow_from_optional_tensor(grad_output_opt); + const Tensor& weight = *weight_maybe_owned; + const Tensor& grad_output = *grad_output_maybe_owned; + + loss.resize_((reduction == Reduction::None || grad_output.defined()) ? target.sizes() : IntArrayRef({})); + TORCH_CHECK(loss.is_mps()); + + Tensor loss_squeezed = at::squeeze(loss); + Tensor input_squeezed = at::squeeze(input); + Tensor target_squeezed = at::squeeze(target); + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + string key = op_name + reductionToString(reduction) + getTensorsStringKey({input_squeezed, target_squeezed, weight}); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + cachedGraph = static_cast(cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_squeezed); + newCachedGraph->targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, target_squeezed); + + MPSGraphTensor *bceLossUnweighted = nil; + // if grad_output is defined, then it's a backward pass + if (grad_output.defined()) { + newCachedGraph->gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output); + bceLossUnweighted = bce_backward_mps(newCachedGraph); + } else { + bceLossUnweighted = bce_forward_mps(newCachedGraph); + } + + MPSGraphTensor *bceLoss = bceLossUnweighted; + if (weight.defined()) { + newCachedGraph->weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight); + bceLoss = [mpsGraph multiplicationWithPrimaryTensor: bceLossUnweighted + secondaryTensor: newCachedGraph->weightTensor + name: nil]; + } + + if (grad_output.defined()) { + if (reduction == at::Reduction::Mean) { + MPSGraphTensor *inputNumel = [mpsGraph constantWithScalar: static_cast(input.numel()) + dataType: MPSDataTypeFloat32]; + newCachedGraph->gradInputTensor = [mpsGraph divisionWithPrimaryTensor: bceLoss + secondaryTensor: inputNumel + name: nil]; + } else { + newCachedGraph->gradInputTensor = bceLoss; + } + } else { + newCachedGraph->lossTensor = reduceTensor(bceLoss, reduction, mpsGraph, input.sizes().size()); + } + } + return newCachedGraph; + })); + } + Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor, input_squeezed); + Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor, target_squeezed); + Placeholder lossPlaceholder = Placeholder(cachedGraph->lossTensor, loss_squeezed); + + NSMutableDictionary *feeds = [[NSMutableDictionary new] autorelease]; + feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData(); + feeds[targetPlaceholder.getMPSGraphTensor()] = targetPlaceholder.getMPSGraphTensorData(); + if (weight.defined()) { + Placeholder weightPlaceholder = Placeholder(cachedGraph->weightTensor, weight); + feeds[weightPlaceholder.getMPSGraphTensor()] = weightPlaceholder.getMPSGraphTensorData(); + } + if (grad_output.defined()) { + Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor, grad_output); + feeds[gradOutputPlaceholder.getMPSGraphTensor()] = gradOutputPlaceholder.getMPSGraphTensorData(); + } + + NSDictionary* results = @{ + lossPlaceholder.getMPSGraphTensor() : lossPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + } + + return loss; 
+} + +} // namespace BCELoss + +// NLLLoss +void nllnd_loss_backward_impl( +Tensor& grad_input, +const Tensor& grad_output, +const Tensor& input, +const Tensor& target, +const Tensor& weight, +int64_t reduction, +int64_t ignore_index, +const Tensor& total_weight, +bool is2D) +{ + // Empty output + if(grad_input.numel() == 0) + return; + + MPSStream* stream = getCurrentMPSStream(); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* targetTensor_ = nil; + MPSGraphTensor* weightTensor_ = nil; + MPSGraphTensor* totalWeightTensor_ = nil; + MPSGraphTensor* gradInputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + + auto numClasses = grad_input.sizes()[1]; + bool isWeightsArrayValid = (weight.numel() > 0); + + MPSShape* input_shape = getMPSShape(input); + MPSShape* target_shape = getMPSShape(target); + MPSShape* weight_shape = getMPSShape(weight); + MPSShape* total_weight_shape = getMPSShape(total_weight); + + NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; + + string key = "nllnd_loss_backward_impl:" + to_string(numClasses) + ":" + + to_string(ignore_index) + ":" + + to_string(isWeightsArrayValid) + ":" + + reductionToString(reduction) + ":" + + [ns_shape_key UTF8String] + ":" + + getMPSTypeString(input.scalar_type()) + ":" + + getMPSTypeString(target.scalar_type()) + ":" + + getMPSTypeString(weight.scalar_type()) + ":" + + getMPSTypeString(total_weight.scalar_type()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), input_shape); + MPSGraphTensor* targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(target.scalar_type()), target_shape); + MPSGraphTensor* weightTensor = nil; + if(isWeightsArrayValid) + weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(weight.scalar_type()), weight_shape); + MPSGraphTensor* totalWeightTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(total_weight.scalar_type()), total_weight_shape); + + MPSGraphTensor *udpatedTargetTensor = targetTensor; + + // Replace ignored_index with length depth + 1 so that oneHotAPI ignores it + if(ignore_index != -100) + { + MPSGraphTensor *mpsGraphIndexTensor = [mpsGraph constantWithScalar: ignore_index + dataType: MPSDataTypeInt64]; + MPSGraphTensor *mpsGraphDepthPlusOneTensor = [mpsGraph constantWithScalar: (numClasses + 1) + dataType: MPSDataTypeInt64]; + + // Equal tensor + MPSGraphTensor* mpsGraphIsEqualTensor = [mpsGraph equalWithPrimaryTensor: targetTensor + secondaryTensor: mpsGraphIndexTensor + name: @"isEqualTensor"]; + + udpatedTargetTensor = [mpsGraph selectWithPredicateTensor: mpsGraphIsEqualTensor + truePredicateTensor: mpsGraphDepthPlusOneTensor + falsePredicateTensor: targetTensor + name: @"predicateTensor"]; + } + + float onValue = -1.0f; + + MPSGraphTensor *oneHotTensor; + + oneHotTensor = [mpsGraph oneHotWithIndicesTensor:udpatedTargetTensor + depth:numClasses + axis:1 + dataType:inputTensor.dataType + onValue:onValue + offValue:0.0f + name:nil]; + + if(isWeightsArrayValid) + { + 
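          // Scale each class's one-hot gradient contribution by its class weight, matching the
          // weighted NLL definition where the per-element loss is -weight[target[n]] * x[n, target[n]].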
oneHotTensor = [mpsGraph multiplicationWithPrimaryTensor:oneHotTensor + secondaryTensor:weightTensor + name:@"scaleByWeightTensor"]; + } + + if(reduction == Reduction::Mean) + { + oneHotTensor = [mpsGraph divisionNoNaNWithPrimaryTensor:oneHotTensor + secondaryTensor:totalWeightTensor + name:@"divisionTensor"]; + } + + MPSGraphTensor* gradInputTensor = oneHotTensor; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->targetTensor_ = targetTensor; + newCachedGraph->weightTensor_ = weightTensor; + newCachedGraph->totalWeightTensor_ = totalWeightTensor; + newCachedGraph->gradInputTensor_ = gradInputTensor; + + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input); + auto targetPlaceholder = Placeholder(cachedGraph->targetTensor_, target); + Placeholder weightPlaceholder = Placeholder(); + if(isWeightsArrayValid) + weightPlaceholder = Placeholder(cachedGraph->weightTensor_, weight); + auto totalWeightPlaceholder = Placeholder(cachedGraph->totalWeightTensor_, total_weight); + auto gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); + + NSMutableDictionary* feeds = [[NSMutableDictionary alloc] initWithCapacity: 4]; + feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData(); + feeds[targetPlaceholder.getMPSGraphTensor()] = targetPlaceholder.getMPSGraphTensorData(); + feeds[totalWeightPlaceholder.getMPSGraphTensor()] = totalWeightPlaceholder.getMPSGraphTensorData(); + + if(isWeightsArrayValid) + feeds[weightPlaceholder.getMPSGraphTensor()] = weightPlaceholder.getMPSGraphTensorData(); + + NSDictionary* results = @{ + gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + return; +} + +void nllnd_loss_forward_impl +(Tensor& output, + Tensor& total_weight, + const Tensor& input, + const Tensor& target, + const Tensor& weight, + int64_t reduction, + int64_t ignore_index, + bool is2D) +{ + std::vector reshapedTarget(target.sizes().begin(), target.sizes().end()); + reshapedTarget.push_back(1); + + Tensor batchSizeTensor = at::empty_like(input).resize_(IntArrayRef(1)); + float batchVal = 1.0f; + for(size_t i = 0; i < reshapedTarget.size(); ++i) + batchVal *= reshapedTarget[i]; + batchSizeTensor[0] = batchVal; + + if(reduction == Reduction::None) + output.resize_(target.sizes()); + if(reduction == Reduction::Sum) + output.resize_({}); + if(reduction == Reduction::Mean) + output.resize_({}); + + TORCH_CHECK(output.is_mps()); + + // Empty output + if(output.numel() == 0) + return; + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* targetTensor_ = nil; + MPSGraphTensor* weightTensor_ = nil; + MPSGraphTensor* batchSizeTensor_ = nil; + MPSGraphTensor* totalWeightTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + + @autoreleasepool { + + bool isWeightsArrayValid = (weight.numel() > 0); + + MPSShape* input_shape = getMPSShape(input); + MPSShape* target_shape = getMPSShape(target); + MPSShape* weight_shape = getMPSShape(weight); + + NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; + + // TODO: Make the key + string key = "nllnd_loss_forward_impl:" + 
to_string(ignore_index) + ":" + + to_string(isWeightsArrayValid) + ":" + + reductionToString(reduction) + ":" + + [ns_shape_key UTF8String] + ":" + + getMPSTypeString(input.scalar_type()) + ":" + + getMPSTypeString(target.scalar_type()) + ":" + + getMPSTypeString(weight.scalar_type()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), input_shape); + MPSGraphTensor* targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(target.scalar_type()), target_shape); + MPSGraphTensor* weightTensor = nil; + if(isWeightsArrayValid) + weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(weight.scalar_type()), weight_shape); + MPSGraphTensor* mps_batchSizeTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(batchSizeTensor.scalar_type())); + + MPSGraphTensor* mpsGraphBatchSizeTensor = mps_batchSizeTensor; + + // The transposes are needed to get the class dimension (dim 1) to the inner most dim for gather op. + // The transpose become nop in the 2D case. + MPSGraphTensor* mpsTransposeTensor = inputTensor; + int classDim = 1; + int lastDim = input.sizes().size()-1; + mpsTransposeTensor = [mpsGraph transposeTensor:inputTensor + dimension:classDim + withDimension:lastDim + name:nil]; + for(int i = 0; i < lastDim - 2; ++i) + { + mpsTransposeTensor = [mpsGraph transposeTensor:mpsTransposeTensor + dimension:classDim+i + withDimension:classDim+i+1 name:nil]; + } + + + MPSGraphTensor* mpsGatherTensor = [mpsGraph gatherWithUpdatesTensor:mpsTransposeTensor + indicesTensor:targetTensor + axis:lastDim + batchDimensions:lastDim + name:@"gatherTensor"]; + + bool isIgnoreIndexValid = (ignore_index != -100); + MPSGraphTensor* weightGatherTensor; + + if(isWeightsArrayValid) + { + weightGatherTensor = [mpsGraph gatherWithUpdatesTensor:weightTensor + indicesTensor:targetTensor + axis:0 + batchDimensions:0 + name:@"weightGatherTensor"]; + MPSGraphTensor *mpsGatherCopyTensor = [mpsGraph identityWithTensor:mpsGatherTensor + name:@"identityTensor"]; + mpsGatherTensor = [mpsGraph multiplicationWithPrimaryTensor:weightGatherTensor + secondaryTensor:mpsGatherCopyTensor + name:@"scaledLossTensor"]; + } + + // Both these cases need recomputation of denominator when reductionMode == mean + if(isIgnoreIndexValid || isWeightsArrayValid) + { + // Setup tensors + MPSGraphTensor *mpsGraphZeroTensor = [mpsGraph constantWithScalar:0.0 + dataType:mpsGatherTensor.dataType]; + MPSGraphTensor *mpsGraphOneTensor = [mpsGraph constantWithScalar:1.0 + dataType:mpsGatherTensor.dataType]; + // @TODO: Remove this identity call with ToT StarSky MPSGraph + MPSGraphTensor *mpsGraphOneTensorCopy = [mpsGraph identityWithTensor:mpsGraphOneTensor + name:@"IdentityHackTensor"]; + + MPSGraphTensor *mpsGraphIsEqualTensor; + + if(isIgnoreIndexValid) + { + MPSGraphTensor *mpsGraphIndexTensor = [mpsGraph constantWithScalar:ignore_index + dataType:MPSDataTypeInt64]; + // Equal tensor + mpsGraphIsEqualTensor = [mpsGraph equalWithPrimaryTensor:targetTensor + secondaryTensor:mpsGraphIndexTensor + name:@"isEqualTensor"]; + // Zero out loss + MPSGraphTensor *mpsGatherCopyTensor = [mpsGraph identityWithTensor:mpsGatherTensor + name:@"identityTensor"]; + 
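          // Where target == ignore_index, substitute zero for the gathered loss value so that
          // ignored elements contribute neither to the loss nor to the mean-reduction denominator below.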
mpsGatherTensor = [mpsGraph selectWithPredicateTensor:mpsGraphIsEqualTensor + truePredicateTensor:mpsGraphZeroTensor + falsePredicateTensor:mpsGatherCopyTensor + name:@"predicateTensor"]; + } + + if(isWeightsArrayValid) + { + mpsGraphOneTensorCopy = weightGatherTensor; + if(!isIgnoreIndexValid) + { + mpsGraphIsEqualTensor = [mpsGraph constantWithScalar: 0.0 + shape: targetTensor.shape + dataType: targetTensor.dataType]; + } + } + + // Compute new batch size + MPSGraphTensor* mpsSelectOneTensor = [mpsGraph selectWithPredicateTensor:mpsGraphIsEqualTensor + truePredicateTensor:mpsGraphZeroTensor + falsePredicateTensor:mpsGraphOneTensorCopy + name:@"predicateOneTensor"]; + mpsGraphBatchSizeTensor = [mpsGraph reductionSumWithTensor:mpsSelectOneTensor + axes:nil + name:@"batchSizeReductionTensor"]; + } + + MPSGraphTensor *mpsGraphNegTensor = [mpsGraph negativeWithTensor:mpsGatherTensor + name:@"negativeTensor"]; + + MPSGraphTensor* mpsGraphReducedTensor = mpsGraphNegTensor; + + if(!(reduction == Reduction::None)) + { + mpsGraphReducedTensor = [mpsGraph reductionSumWithTensor:mpsGraphNegTensor + axes:nil + name:@"reductionSumTensor"]; + if(reduction == Reduction::Mean) + { + mpsGraphReducedTensor = [mpsGraph divisionNoNaNWithPrimaryTensor:mpsGraphReducedTensor + secondaryTensor:mpsGraphBatchSizeTensor + name:@"divisionTensor"]; + } + } + + mpsGraphReducedTensor = [mpsGraph reshapeTensor:mpsGraphReducedTensor + withShape:getMPSShape(output) + name:nil]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->targetTensor_ = targetTensor; + newCachedGraph->weightTensor_ = weightTensor; + newCachedGraph->batchSizeTensor_ = mps_batchSizeTensor; + newCachedGraph->totalWeightTensor_ = mpsGraphBatchSizeTensor; + newCachedGraph->outputTensor_ = mpsGraphReducedTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, input); + Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor_, target); + Placeholder weightPlaceholder = Placeholder(); + if(isWeightsArrayValid) + weightPlaceholder = Placeholder(cachedGraph->weightTensor_, weight); + Placeholder batchSizePlaceholder = Placeholder(cachedGraph->batchSizeTensor_, batchSizeTensor); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + Placeholder totalWeightsPlaceholder = Placeholder(cachedGraph->totalWeightTensor_, total_weight); + + // Create dictionary of inputs and outputs + NSMutableDictionary* feeds = [[NSMutableDictionary alloc] initWithCapacity: 4]; + feeds[selfPlaceholder.getMPSGraphTensor()] = selfPlaceholder.getMPSGraphTensorData(); + feeds[targetPlaceholder.getMPSGraphTensor()] = targetPlaceholder.getMPSGraphTensorData(); + feeds[batchSizePlaceholder.getMPSGraphTensor()] = batchSizePlaceholder.getMPSGraphTensorData(); + + if(isWeightsArrayValid) + feeds[weightPlaceholder.getMPSGraphTensor()] = weightPlaceholder.getMPSGraphTensorData(); + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(), + totalWeightsPlaceholder.getMPSGraphTensor() : totalWeightsPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } + + return; +} + +void smooth_l1_loss_impl( + const Tensor &input, + const Tensor &target, + const int64_t reduction, + double beta, + const Tensor &output, + MPSShape *mpsInputShape, + MPSShape *mpsOutputShape) +{ + struct CachedGraph : public MPSCachedGraph + { + 
CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *targetTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSGraphCache *cache_ = MPSGraphCache::getInstance(); + + MPSStream *stream= getCurrentMPSStream(); + + @autoreleasepool { + MPSShape* input_shape = getMPSShape(input); + NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; + + string key = "smooth_l1_loss_impl:" + reductionToString(reduction) + ":" + + [ns_shape_key UTF8String] + ":" + + to_string(beta) + ":" + + getMPSTypeString(input.scalar_type()) + ":" + + getMPSTypeString(target.scalar_type()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + // smooth_l1_loss_mps: + // ln = 0.5 * ( xn - yn ) ^ 2 / beta, if |xn - yn| < beta + // = | xn - yn | - 0.5 * beta, otherwise + + @autoreleasepool { + // Initialize graph + MPSGraph *mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor *inputTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type())); + MPSGraphTensor *targetTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(target.scalar_type())); + + // Setup tensors + MPSGraphTensor *mpsGraphZeroTensor = [mpsGraph constantWithScalar: 0.0 + dataType: inputTensor.dataType]; + MPSGraphTensor *mpsGraphOneTensor = [mpsGraph constantWithScalar: 1.0 + dataType: inputTensor.dataType]; + MPSGraphTensor *mpsGraphHalfTensor = [mpsGraph constantWithScalar: 0.5 + dataType: MPSDataTypeFloat32]; + MPSGraphTensor *betaTensor = [mpsGraph constantWithScalar: beta + dataType: MPSDataTypeFloat32]; + // 0.5 * beta + MPSGraphTensor *halfTensorMulBetaTensor = [mpsGraph constantWithScalar: beta * 0.5 + dataType: MPSDataTypeFloat32]; + // Calculating first part of the equation: + // ln = 0.5(xn - yn)^2/beta, if |xn - yn| < beta + + // xn - yn + MPSGraphTensor *diffTensor = [mpsGraph subtractionWithPrimaryTensor: inputTensor + secondaryTensor: targetTensor + name: nil]; + + // | xn - yn | + MPSGraphTensor *diffAbsTensor = [mpsGraph absoluteWithTensor: diffTensor + name: nil]; + + // | xn - yn | < beta + MPSGraphTensor *diffAbsLessThanBetaTensor = [mpsGraph lessThanWithPrimaryTensor: diffAbsTensor + secondaryTensor: betaTensor + name: nil]; + + // ( xn - yn ) ^ 2 + MPSGraphTensor *diffSquare = [mpsGraph squareWithTensor: diffTensor + name: nil]; + + // 0.5 * ( xn - yn ) ^ 2 + MPSGraphTensor *diffSquareMulHalfTensor = [mpsGraph multiplicationWithPrimaryTensor: diffSquare + secondaryTensor: mpsGraphHalfTensor + name: nil]; + + // 0.5 * ( xn - yn ) ^ 2 / beta + MPSGraphTensor *loss1Temp = [mpsGraph divisionWithPrimaryTensor: diffSquareMulHalfTensor + secondaryTensor: betaTensor + name: nil]; + + // Calculating second part of the equation: + // | xn - yn | - 0.5 * beta, if | xn - yn | >= beta + + // | xn - yn | - 0.5 * beta + MPSGraphTensor *loss2Temp = [mpsGraph subtractionWithPrimaryTensor: diffAbsTensor + secondaryTensor: halfTensorMulBetaTensor + name: nil]; + + MPSGraphTensor *lossTensor = [mpsGraph selectWithPredicateTensor: diffAbsLessThanBetaTensor + truePredicateTensor: loss1Temp + falsePredicateTensor: loss2Temp + name: @"lossTensor"]; + + MPSGraphTensor *outputTensor = reduceTensor(lossTensor, reduction, mpsGraph, 1); + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->targetTensor_ = 
targetTensor; + newCachedGraph->outputTensor_ = outputTensor; + + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input, mpsInputShape); + Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor_, target, mpsInputShape); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output, mpsOutputShape); + + NSDictionary* feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + targetPlaceholder.getMPSGraphTensor() : targetPlaceholder .getMPSGraphTensorData() + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } +} + +void smooth_l1_loss_template( + const Tensor &input, + const Tensor &target, + const int64_t reduction, + double beta, + const Tensor &output) +{ + TORCH_CHECK(beta >= 0, "smooth_l1_loss does not support negative values for beta."); + TORCH_CHECK(input.is_mps()); + TORCH_CHECK(target.is_mps()); + + MPSShape *mpsInputShape = nil; + MPSShape *mpsOutputShape = nil; + + // Determine the shape of the output + // If the reduction is 'mean' or 'sum', the output shape is a scalar, + // otherwise, the output shape is the same shape as input + if (reduction == Reduction::Mean || reduction == Reduction::Sum) + { + // Output: scalar, if reduction is 'mean' or 'sum' + IntArrayRef input_shape = input.sizes(); + int64_t num_input_dims = input_shape.size(); + NSMutableArray *apparent_input_shape = [NSMutableArray arrayWithCapacity:1]; + int64_t num_in_elements = 1; + for(int i = 0; i < num_input_dims; i++) { + num_in_elements *= input_shape[i]; + } + apparent_input_shape[0] = [NSNumber numberWithInt:num_in_elements]; + + // Output is a single value in case reduction is set to mean or sum + NSMutableArray *apparent_out_shape = [NSMutableArray arrayWithCapacity:1]; + apparent_out_shape[0] = @1; + mpsInputShape = apparent_input_shape; + mpsOutputShape = apparent_out_shape; + } + else + { + // Output: If reduction is 'none', then (N, *); same shape as the input + assert(reduction == Reduction::None); + mpsInputShape = getMPSShape(input); + mpsOutputShape = mpsInputShape; + //resize_tensor(&output); + } + TORCH_CHECK(output.is_mps()); + + smooth_l1_loss_impl( + input, + target, + reduction, + beta, + output, + mpsInputShape, + mpsOutputShape); +} + +void smooth_l1_loss_backward_impl( + const Tensor& grad_output, + const Tensor& input, + const Tensor& target, + int64_t reduction, + double beta, + Tensor& grad_input) +{ + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *targetTensor_ = nil; + MPSGraphTensor *gradInputTensor_ = nil; + }; + + MPSGraphCache *cache_ = MPSGraphCache::getInstance(); + + MPSStream *stream= getCurrentMPSStream(); + + @autoreleasepool { + + auto numClasses = grad_input.sizes()[1]; + MPSShape* input_shape = getMPSShape(input); + NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; + + string key = "smooth_l1_loss_backward_impl:" + to_string(numClasses) + ":" + + reductionToString(reduction) + ":" + + [ns_shape_key UTF8String] + ":" + + to_string(beta) + ":" + + getMPSTypeString(input.scalar_type()) + ":" + + getMPSTypeString(target.scalar_type()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + 
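    // The cached graph built below computes the elementwise SmoothL1 derivative, assuming the usual
    // piecewise definition:
    //   d(ln)/d(xn) = (xn - yn) / beta     if |xn - yn| < beta
    //               = sign(xn - yn)        otherwise (realized as (xn - yn) / |xn - yn|)
    // and divides by input.numel() when reduction == Mean.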
if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + auto numElements = input.numel(); + + MPSGraph *mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor *inputTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type())); + MPSGraphTensor *targetTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(target.scalar_type())); + + MPSGraphTensor *betaTensor = [mpsGraph constantWithScalar: beta + dataType: MPSDataTypeFloat32]; + + MPSGraphTensor *numelTensor = [mpsGraph constantWithScalar: numElements + dataType: MPSDataTypeFloat32]; + + // xn - yn + MPSGraphTensor *diffTensor = [mpsGraph subtractionWithPrimaryTensor: inputTensor + secondaryTensor: targetTensor + name: nil]; + + // | xn - yn | + MPSGraphTensor *diffAbsTensor = [mpsGraph absoluteWithTensor: diffTensor + name: nil]; + + // | xn - yn | < beta + MPSGraphTensor *diffAbsLessThanBetaTensor = [mpsGraph lessThanWithPrimaryTensor: diffAbsTensor + secondaryTensor: betaTensor + name: nil]; + + // ( xn - yn ) / beta + MPSGraphTensor *truePredicateTensor = [mpsGraph divisionWithPrimaryTensor: diffTensor + secondaryTensor: betaTensor + name: nil]; + + // ( x - y ) / | x - y | + MPSGraphTensor *falsePredicateTensor = [mpsGraph divisionWithPrimaryTensor: diffTensor + secondaryTensor: diffAbsTensor + name: nil]; + + MPSGraphTensor *lossTensor = [mpsGraph selectWithPredicateTensor: diffAbsLessThanBetaTensor + truePredicateTensor: truePredicateTensor + falsePredicateTensor: falsePredicateTensor + name: @"lossTensor"]; + + MPSGraphTensor *outputTensor = lossTensor; + if (reduction == Reduction::Mean) + { + outputTensor = [mpsGraph divisionWithPrimaryTensor: lossTensor + secondaryTensor: numelTensor + name: nil]; + } + + MPSGraphTensor *gradInputTensor = outputTensor; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->targetTensor_ = targetTensor; + newCachedGraph->gradInputTensor_ = gradInputTensor; + + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input); + Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor_, target); + Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); + + NSDictionary* feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + targetPlaceholder.getMPSGraphTensor() : targetPlaceholder .getMPSGraphTensorData() + }; + NSDictionary* results = @{ + gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } +} + +void smooth_l1_loss_backward_template( + const Tensor& grad_output, + const Tensor& input, + const Tensor& target, + int64_t reduction, + double beta, + Tensor& grad_input) +{ + TORCH_CHECK(beta >= 0, "smooth_l1_loss_backward does not support negative values for beta."); + TORCH_CHECK(input.is_mps()); + TORCH_CHECK(target.is_mps()); + + smooth_l1_loss_backward_impl( + grad_output, input, target, reduction, beta, grad_input + ); +} + +} // namespace mps + +// APIs exposed to at::native scope + +// MSELoss +TORCH_IMPL_FUNC(mse_loss_out_mps) ( + const Tensor& input, const Tensor& target, int64_t reduction, const Tensor& output) { + string op_name = __func__; + using namespace mps; + TORCH_CHECK(target.is_same_size(input), 
op_name + ": target and input tensors must have identical shapes") + TORCH_CHECK(output.is_mps()); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor = nil; + MPSGraphTensor* targetTensor = nil; + MPSGraphTensor* outputTensor = nil; + }; + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + string key = op_name + reductionToString(reduction) + getTensorsStringKey({input, target}); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + cachedGraph = static_cast(cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input); + newCachedGraph->targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, target); + + MPSGraphTensor *diffTensor = [mpsGraph subtractionWithPrimaryTensor: newCachedGraph->inputTensor + secondaryTensor: newCachedGraph->targetTensor + name: nil]; + MPSGraphTensor *diffSquareTensor = [mpsGraph squareWithTensor: diffTensor + name: nil]; + newCachedGraph->outputTensor = reduceTensor(diffSquareTensor, reduction, mpsGraph, input.sizes().size()); + } + return newCachedGraph; + })); + } + Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor, input); + Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor, target); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, output); + + NSDictionary* feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + targetPlaceholder.getMPSGraphTensor() : targetPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + } +} + +Tensor& mse_loss_backward_out_mps(const Tensor& grad_output, const Tensor& input, + const Tensor& target, int64_t reduction, Tensor& grad_input) +{ + return mps::mse_loss_backward_out_impl(grad_output, input, target, reduction, grad_input, __func__); +} + +Tensor mse_loss_backward_mps(const Tensor& grad_output, const Tensor& input, + const Tensor& target, int64_t reduction) +{ + Tensor grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + return mps::mse_loss_backward_out_impl(grad_output, input, target, reduction, grad_input, __func__); +} + +// BCELoss +Tensor& binary_cross_entropy_out_mps(const Tensor& input, const Tensor& target, + const c10::optional& weight_opt, int64_t reduction, Tensor& loss) +{ + return mps::BCELoss::bce_loss_out_impl(input, target, weight_opt, reduction, loss, c10::nullopt, __func__); +} + +Tensor binary_cross_entropy_mps(const Tensor& input, const Tensor& target, + const c10::optional& weight_opt, int64_t reduction) +{ + Tensor loss = at::empty_like(input); + return mps::BCELoss::bce_loss_out_impl(input, target, weight_opt, reduction, loss, c10::nullopt, __func__); +} + +Tensor& binary_cross_entropy_backward_out_mps(const Tensor& grad_output, const Tensor& input, + const Tensor& target, const c10::optional& weight_opt, + int64_t reduction, Tensor& grad_input) +{ + return mps::BCELoss::bce_loss_out_impl(input, target, weight_opt, reduction, grad_input, grad_output, __func__); +} + +Tensor binary_cross_entropy_backward_mps(const Tensor& grad_output, const Tensor& 
input, const Tensor& target, + const c10::optional& weight_opt, int64_t reduction) +{ + Tensor grad_input = at::empty_like(input); + return mps::BCELoss::bce_loss_out_impl(input, target, weight_opt, reduction, grad_input, grad_output, __func__); +} + +// SmoothL1Loss +TORCH_IMPL_FUNC(smooth_l1_loss_out_mps)( + const Tensor& input, + const Tensor& target, + int64_t reduction, + double beta, + const Tensor& result) { + mps::smooth_l1_loss_template( + input, target, reduction, beta, result); +} + +Tensor& smooth_l1_loss_backward_out_mps( + const Tensor& grad_output, + const Tensor& input, + const Tensor& target, + int64_t reduction, + double beta, + Tensor& grad_input) { + mps::smooth_l1_loss_backward_template( + grad_output, input, target, reduction, beta, grad_input); + return grad_input; +} + +// NLLLoss +TORCH_IMPL_FUNC(nll_loss_backward_out_mps) +(const Tensor& grad_output, + const Tensor& self, + const Tensor& target, + OptionalTensorRef weight_opt, + int64_t reduction, + int64_t ignore_index, + const Tensor& total_weight, + const Tensor& grad_input +) +{ + const Tensor& weight = weight_opt.getTensorRef(); + + mps::nllnd_loss_backward_impl((Tensor &)grad_input, + grad_output, + self, + target, + weight, + reduction, + ignore_index, + total_weight, + false); + return; +} + +TORCH_IMPL_FUNC(nll_loss_forward_out_mps) +(const Tensor& self, + const Tensor& target, + const OptionalTensorRef weight_opt, + int64_t reduction, + int64_t ignore_index, + const Tensor& output, + const Tensor& total_weight) { + + const Tensor& weight = weight_opt.getTensorRef(); + + mps::nllnd_loss_forward_impl((Tensor &)output, + (Tensor &)total_weight, + self, + target, + weight, + reduction, + ignore_index, + false); + + return; +} + +inline void check_inputs_nll_loss2d( + const Tensor& input, + const Tensor& target, + const Tensor& weight) { + TORCH_CHECK( + target.dim() == 3, + "only batches of spatial targets supported (3D tensors)" + " but got targets of dimension: ", + target.dim()); + TORCH_CHECK( + input.dim() == 4, + "only batches of spatial inputs supported (4D tensors), " + "but got input of dimension: ", + input.dim()); + TORCH_CHECK( + !weight.defined() || weight.numel() == input.size(1), + "weight tensor should be defined either for all or no classes"); + + const int64_t input0 = input.size(0); + const int64_t input2 = input.size(2); + const int64_t input3 = input.size(3); + const int64_t target0 = target.size(0); + const int64_t target1 = target.size(1); + const int64_t target2 = target.size(2); + TORCH_CHECK( + input0 == target0 && input2 == target1 && input3 == target2, + "size mismatch (got input: ", + input.sizes(), + " , target: ", + target.sizes()); +} + + +void nll_loss2d_forward_out_mps_template( + Tensor& output, + Tensor& total_weight, + const Tensor& input, + const Tensor& target, + const Tensor& weight, + int64_t reduction, + int64_t ignore_index) { + check_inputs_nll_loss2d(input, target, weight); + total_weight.resize_({}); + + mps::nllnd_loss_forward_impl(output, + total_weight, + input, + target, + weight, + reduction, + ignore_index, + true); + + return; +} + +std::tuple nll_loss2d_forward_out_mps(const Tensor& self, + const Tensor& target, const c10::optional& weight_opt, + int64_t reduction, + int64_t ignore_index, + Tensor& output, + Tensor& total_weight) { + // See [Note: hacky wrapper removal for optional tensor] + c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); + const Tensor& weight = *weight_maybe_owned; + + 
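  // Delegate to the shared 2D template, which validates the 4D input / 3D target shapes and
  // forwards to the generic N-D implementation with is2D = true.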
nll_loss2d_forward_out_mps_template( + output, total_weight, self, target, weight, reduction, ignore_index); + return std::tuple(output, total_weight); +} + +std::tuple nll_loss2d_forward_mps( + const Tensor& self, + const Tensor& target, const c10::optional& weight_opt, + int64_t reduction, + int64_t ignore_index) { + // See [Note: hacky wrapper removal for optional tensor] + c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); + const Tensor& weight = *weight_maybe_owned; + + auto output = at::empty({0}, self.options()); + auto total_weight = at::empty({0}, self.options()); + at::native::nll_loss2d_forward_out_mps( + self, target, weight, reduction, ignore_index, output, total_weight); + return std::make_tuple(output, total_weight); +} + +void nll_loss2d_backward_out_mps_template( + Tensor& grad_input, + const Tensor& grad_output, + const Tensor& input, + const Tensor& target, + const Tensor& weight, + int64_t reduction, + int64_t ignore_index, + const Tensor& total_weight) { + check_inputs_nll_loss2d(input, target, weight); + grad_input.resize_as_(input); + grad_input.zero_(); + TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous"); + TORCH_CHECK( + total_weight.numel() == 1, + "expected total_weight to be a single element tensor, got: ", + total_weight.sizes(), + " (", + total_weight.numel(), + " elements)"); + + mps::nllnd_loss_backward_impl(grad_input, + grad_output, + input, + target, + weight, + reduction, + ignore_index, + total_weight, + true); + + return; +} + +Tensor& nll_loss2d_backward_out_mps(const Tensor& grad_output, + const Tensor& self, + const Tensor& target, const c10::optional& weight_opt, + int64_t reduction, + int64_t ignore_index, + const Tensor& total_weight, + Tensor& grad_input) { + // See [Note: hacky wrapper removal for optional tensor] + c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); + const Tensor& weight = *weight_maybe_owned; + + nll_loss2d_backward_out_mps_template( + grad_input, + grad_output, + self, + target, + weight, + reduction, + ignore_index, + total_weight); + return grad_input; +} + +Tensor nll_loss2d_backward_mps( + const Tensor& grad_output, + const Tensor& self, + const Tensor& target, const c10::optional& weight_opt, + int64_t reduction, + int64_t ignore_index, + const Tensor& total_weight) { + + // See [Note: hacky wrapper removal for optional tensor] + c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); + const Tensor& weight = *weight_maybe_owned; + + auto grad_input = at::zeros_like(self); + nll_loss2d_backward_out_mps( + grad_output, + self, + target, + weight, + reduction, + ignore_index, + total_weight, + grad_input); + return grad_input; +} + + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/Normalization.mm b/aten/src/ATen/native/mps/operations/Normalization.mm new file mode 100644 index 000000000000..dc85e13a9f29 --- /dev/null +++ b/aten/src/ATen/native/mps/operations/Normalization.mm @@ -0,0 +1,804 @@ +// Copyright © 2022 Apple Inc. 
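// Overview (a hedged summary of the code below, not authoritative): the forward
// kernel returns (output, save_mean, save_var), where save_var holds the *biased*
// batch variance without epsilon — other backends typically save the inverse
// standard deviation instead, related by invstd = 1 / sqrt(var + eps).
// When training with running statistics, the buffers are updated in place as
//
//   unbiased_var = batch_var * N / (N - 1)                   // Bessel correction
//   running_mean = momentum * batch_mean   + (1 - momentum) * running_mean
//   running_var  = momentum * unbiased_var + (1 - momentum) * running_var
//
// with N the number of elements per channel, matching the graph built in
// batch_norm_mps_out below.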
+ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { + +void get_shapes(MPSShape* input_shape_readonly, + NSMutableArray* &input_shape, + NSMutableArray* &new_mean_shape, + NSMutableArray* &axes, + int num_input_dims, c10::MemoryFormat memory_format, + bool isBackward) { + // Modify the shape + if(memory_format == at::MemoryFormat::Contiguous) { + for(int i = 0; i < num_input_dims; i++) + input_shape[i] = input_shape_readonly[i]; + } + else { // ChannelsLast + auto num_channels = input_shape_readonly[1]; + input_shape[0] = input_shape_readonly[0]; + for(int i = 1; i < num_input_dims-1; i++) + input_shape[i] = input_shape_readonly[i+1]; + input_shape[num_input_dims-1] = num_channels; + } + + // Mean shape should remain unchanged in backward + if(memory_format == at::MemoryFormat::Contiguous || isBackward) { + new_mean_shape[0] = @1; + new_mean_shape[1] = input_shape_readonly[1]; + for(int i = 2; i < num_input_dims; i++) + new_mean_shape[i] = @1; + } + else if(memory_format == at::MemoryFormat::ChannelsLast) { + for(int i = 0; i < num_input_dims-1; i++) + new_mean_shape[i] = @1; + new_mean_shape[num_input_dims-1] = input_shape[num_input_dims-1]; + } + + // Set axes of reduction + if(memory_format == at::MemoryFormat::Contiguous || isBackward) { + axes[0] = @0; + for(int i = 2; i < num_input_dims; i++) + axes[i-1] = [NSNumber numberWithInt:i]; + } + else { + for(int i = 0; i < num_input_dims-1; i++) + axes[i] = [NSNumber numberWithInt:i]; + } +} + +// Inverse standard deviation now becomes variance (without epsilon) +std::tuple batch_norm_mps_out + (const Tensor& self, + const c10::optional& weight_opt, + const c10::optional& bias_opt, + const c10::optional& running_mean_opt, + const c10::optional& running_var_opt, + bool train, double momentum, double epsilon, + Tensor& output, + Tensor& save_mean, + Tensor& save_var) { + + namespace native_mps = at::native::mps; + + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* weightTensor_ = nil; + MPSGraphTensor* biasTensor_ = nil; + MPSGraphTensor* runningMeanTensor_ = nil; + MPSGraphTensor* runningVarTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; + MPSGraphTensor* saveMeanTensor_ = nil; + MPSGraphTensor* saveVarTensor_ = nil; + MPSGraphTensor* runningMeanInplaceUpdate_ = nil; + MPSGraphTensor* runningVarInplaceUpdate_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + auto stream = at::mps::getCurrentMPSStream(); + + const bool has_running_mean = (running_mean_opt.has_value() && running_mean_opt->defined()); + const bool has_running_var = (running_var_opt.has_value() && running_var_opt->defined()); + TORCH_CHECK(has_running_mean == has_running_var); + + const bool has_weight = (weight_opt.has_value() && weight_opt->defined()); + const bool has_bias = (bias_opt.has_value() && bias_opt->defined()); + + const auto memory_format = self.suggest_memory_format(); + + if (output.numel() == 0) { + return std::tuple(output, save_mean, save_var);; + } + + @autoreleasepool { + + string mem_format_key; + switch(memory_format) { + case at::MemoryFormat::Contiguous: + mem_format_key = "Contiguous"; + break; + case at::MemoryFormat::ChannelsLast: + mem_format_key = "ChannelsLast"; + break; + default: + assert(0 && "Check should have been done earlier\n"); + } + + // Number of elements in one channel, needed for bessel 
correction term + const int64_t N = self.numel() / save_mean.numel(); + MPSShape* input_shape_readonly = mps::getMPSShape(self); + int num_input_dims = [input_shape_readonly count]; + // Input shape changes based on memory format + NSMutableArray* input_shape = [NSMutableArray arrayWithCapacity:num_input_dims]; + // Shape which can be broadcasted with input + NSMutableArray* new_mean_shape = [NSMutableArray arrayWithCapacity:num_input_dims]; + // Reduction axes + NSMutableArray* axes = [NSMutableArray arrayWithCapacity:(num_input_dims-1)]; + + get_shapes(input_shape_readonly, input_shape, new_mean_shape, axes, num_input_dims, memory_format, false); + + NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; + + string key = "batch_norm_mps_out:" + mem_format_key + ":" + std::to_string(epsilon) + ":" + + std::to_string(momentum) + ":" + std::to_string(train) + ":" + + std::to_string(has_running_mean) + ":" + + std::to_string(has_weight) + ":" + std::to_string(has_bias) + ":" + + [ns_shape_key UTF8String] + ":" + native_mps::getMPSTypeString(self.scalar_type()); + auto input_mps_dtype = native_mps::getMPSDataType(self.scalar_type()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + // Dim where channels are located + int channelsDim; + if(memory_format == at::MemoryFormat::Contiguous) + channelsDim = 1; + else + channelsDim = num_input_dims - 1; + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_mps_dtype, input_shape); + MPSGraphTensor* weightTensor = nil; + // Should have shape of mean + if(has_weight) + weightTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(weight_opt.value().scalar_type()), new_mean_shape); + MPSGraphTensor* biasTensor = nil; + if(has_bias) + biasTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(bias_opt.value().scalar_type()), new_mean_shape); + MPSGraphTensor* runningMeanTensor = nil; + MPSGraphTensor* runningVarTensor = nil; + if(has_running_mean) { + runningMeanTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(running_mean_opt.value().scalar_type()), new_mean_shape); + runningVarTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(running_var_opt.value().scalar_type()), new_mean_shape); + } + + // Mean and inv std tensors to be saved and returned + MPSGraphTensor* saveMeanTensor = nil; + MPSGraphTensor* saveVarTensor = nil; + + // Running stats inplace update + MPSGraphTensor* runningMeanInplaceUpdate = nil; + MPSGraphTensor* runningVarInplaceUpdate = nil; + + MPSGraphTensor* updatedRunningMeanTensor = nil; + MPSGraphTensor* updatedRunningVarTensor = nil; + + /* + If train: + If has_running_mean: + Update the running stats to be stored into save_mean and save_var, + AND to be used in current batchnorm computation + Else: + Just calculate the var using batch variance + If not train: + Check if running mean exists (maybe do this check before making graph) + Copy the running mean into the mean to be saved + Calculate the save_var directly from the running variance + + Compute the batch norm output and stats to be saved + */ + + if(train) { + // 
Compute mean and variance of the current batch + MPSGraphTensor* batchMeanTensor = [mpsGraph meanOfTensor:inputTensor + axes:axes + name:nil]; + MPSGraphTensor* batchVarianceTensor = [mpsGraph varianceOfTensor:inputTensor + axes:axes + name:nil]; + if(has_running_mean) { + // TODO: This is not the formula used in PyTorch, is this OK? Seems more robust + // float besselCorrectionTerm = float(N) / std::max(N - 1.0f, 1.0f); + float besselCorrectionTerm = float(N) / float(N - 1.0f); + MPSGraphTensor* besselConstantTensor = [mpsGraph constantWithScalar:(double)besselCorrectionTerm + shape:@[@1] + dataType:input_mps_dtype]; + MPSGraphTensor* unbiasedVarianceTensor = [mpsGraph multiplicationWithPrimaryTensor:batchVarianceTensor + secondaryTensor:besselConstantTensor + name:nil]; + MPSGraphTensor* momentumTensor = [mpsGraph constantWithScalar:(double)momentum + shape:@[@1] + dataType:input_mps_dtype]; + MPSGraphTensor* oneMinusMomentum = [mpsGraph constantWithScalar:(double)(1.0 - momentum) + shape:@[@1] + dataType:input_mps_dtype]; + // Compute updated running mean + MPSGraphTensor* scaledBatchMean = [mpsGraph multiplicationWithPrimaryTensor:batchMeanTensor + secondaryTensor:momentumTensor + name:nil]; + MPSGraphTensor* scaledRunningMean = [mpsGraph multiplicationWithPrimaryTensor:runningMeanTensor + secondaryTensor:oneMinusMomentum + name:nil]; + updatedRunningMeanTensor = [mpsGraph additionWithPrimaryTensor:scaledBatchMean + secondaryTensor:scaledRunningMean + name:nil]; + // Compute updated running var + MPSGraphTensor* scaledCorrectedBatchVar = [mpsGraph multiplicationWithPrimaryTensor:unbiasedVarianceTensor + secondaryTensor:momentumTensor + name:nil]; + MPSGraphTensor* scaledRunningVar = [mpsGraph multiplicationWithPrimaryTensor:runningVarTensor + secondaryTensor:oneMinusMomentum + name:nil]; + updatedRunningVarTensor = [mpsGraph additionWithPrimaryTensor:scaledCorrectedBatchVar + secondaryTensor:scaledRunningVar + name:nil]; + // Update saved mean and inverse std tensor + saveMeanTensor = batchMeanTensor; + saveVarTensor = batchVarianceTensor; + } + else { + saveMeanTensor = batchMeanTensor; + saveVarTensor = batchVarianceTensor; + } + } + else { // Test + TORCH_CHECK(has_running_mean); + saveMeanTensor = [mpsGraph identityWithTensor:runningMeanTensor + name:nil]; + saveVarTensor = [mpsGraph identityWithTensor:runningVarTensor + name:nil]; + } + + // Compute output of batch norm + MPSGraphTensor* outputTensor = [mpsGraph normalizationWithTensor:inputTensor + meanTensor:saveMeanTensor + varianceTensor:saveVarTensor + gammaTensor:weightTensor + betaTensor:biasTensor + epsilon:(float)epsilon + name:nil]; + + // Reshape saved mean and var to fit output + saveMeanTensor = [mpsGraph reshapeTensor:saveMeanTensor + withShape:@[new_mean_shape[channelsDim]] + name:nil]; + saveVarTensor = [mpsGraph reshapeTensor:saveVarTensor + withShape:@[new_mean_shape[channelsDim]] + name:nil]; + + if(train && has_running_mean) { + // Running stats inplace update + runningMeanInplaceUpdate = [mpsGraph reshapeTensor:updatedRunningMeanTensor + withShape:@[input_shape[channelsDim]] + name:nil]; + runningVarInplaceUpdate = [mpsGraph reshapeTensor:updatedRunningVarTensor + withShape:@[input_shape[channelsDim]] + name:nil]; + } + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->weightTensor_ = weightTensor; + newCachedGraph->biasTensor_ = biasTensor; + newCachedGraph->runningMeanTensor_ = runningMeanTensor; + newCachedGraph->runningVarTensor_ = runningVarTensor; + newCachedGraph->outputTensor_ = 
outputTensor; + newCachedGraph->saveMeanTensor_ = saveMeanTensor; + newCachedGraph->saveVarTensor_ = saveVarTensor; + newCachedGraph->runningMeanInplaceUpdate_ = runningMeanInplaceUpdate; + newCachedGraph->runningVarInplaceUpdate_ = runningVarInplaceUpdate; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, self, input_shape); + auto weightPlaceholder = native_mps::Placeholder(); + if(has_weight) + weightPlaceholder = native_mps::Placeholder(cachedGraph->weightTensor_, weight_opt.value(), new_mean_shape); + auto biasPlaceholder = native_mps::Placeholder(); + if(has_bias) + biasPlaceholder = native_mps::Placeholder(cachedGraph->biasTensor_, bias_opt.value(), new_mean_shape); + auto runningMeanPlaceholder = native_mps::Placeholder(); + auto runningVarPlaceholder = native_mps::Placeholder(); + if(has_running_mean) { + runningMeanPlaceholder = native_mps::Placeholder(cachedGraph->runningMeanTensor_, running_mean_opt.value(), new_mean_shape); + runningVarPlaceholder = native_mps::Placeholder(cachedGraph->runningVarTensor_, running_var_opt.value(), new_mean_shape); + } + + auto runningMeanInplaceUpdatePlaceholder = native_mps::Placeholder(); + auto runningVarInplaceUpdatePlaceholder = native_mps::Placeholder(); + + if(train && has_running_mean) { + runningMeanInplaceUpdatePlaceholder = native_mps::Placeholder(cachedGraph->runningMeanInplaceUpdate_, running_mean_opt.value()); + runningVarInplaceUpdatePlaceholder = native_mps::Placeholder(cachedGraph->runningVarInplaceUpdate_, running_var_opt.value()); + } + + auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output, input_shape); + auto saveMeanPlaceholder = native_mps::Placeholder(cachedGraph->saveMeanTensor_, save_mean); + auto saveVarPlaceholder = native_mps::Placeholder(cachedGraph->saveVarTensor_, save_var); + + NSMutableDictionary *feeds = [[NSMutableDictionary new] autorelease]; + feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData(); + if(has_weight) + feeds[weightPlaceholder.getMPSGraphTensor()] = weightPlaceholder.getMPSGraphTensorData(); + if(has_bias) + feeds[biasPlaceholder.getMPSGraphTensor()] = biasPlaceholder.getMPSGraphTensorData(); + if(has_running_mean) { + feeds[runningMeanPlaceholder.getMPSGraphTensor()] = runningMeanPlaceholder.getMPSGraphTensorData(); + feeds[runningVarPlaceholder.getMPSGraphTensor()] = runningVarPlaceholder.getMPSGraphTensorData(); + } + + NSMutableDictionary *results = [[NSMutableDictionary new] autorelease]; + results[outputPlaceholder.getMPSGraphTensor()] = outputPlaceholder.getMPSGraphTensorData(); + results[saveMeanPlaceholder.getMPSGraphTensor()] = saveMeanPlaceholder.getMPSGraphTensorData(); + results[saveVarPlaceholder.getMPSGraphTensor()] = saveVarPlaceholder.getMPSGraphTensorData(); + + // If train and has_running_mean, add updated running mean to the output + if(train && has_running_mean) { + results[runningMeanInplaceUpdatePlaceholder.getMPSGraphTensor()] = runningMeanInplaceUpdatePlaceholder.getMPSGraphTensorData(); + results[runningVarInplaceUpdatePlaceholder.getMPSGraphTensor()] = runningVarInplaceUpdatePlaceholder.getMPSGraphTensorData(); + } + + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } + + return std::tuple(output, save_mean, save_var); +} + +std::tuple batch_norm_mps + (const Tensor& self, + const c10::optional& weight_opt, + const c10::optional& bias_opt, + const c10::optional& 
running_mean_opt, + const c10::optional& running_var_opt, + bool train, + double momentum, + double epsilon) { + + const auto memory_format = self.suggest_memory_format(); + + auto output = at::native::empty_mps( + self.sizes(), + self.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + memory_format); + + int64_t n_input = self.size(1); + + auto save_mean = at::native::empty_mps( + {n_input}, + self.scalar_type(), + // TODO: Accumulate type? + // at::toAccumulateType(self.scalar_type(), /*is_cuda=*/false), + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + auto save_var = at::native::empty_mps( + {n_input}, + self.scalar_type(), + // TODO: Accumulate type? + // at::toAccumulateType(self.scalar_type(), /*is_cuda=*/false), + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + + at::native::batch_norm_mps_out( + self, + weight_opt, + bias_opt, + running_mean_opt, + running_var_opt, + train, + momentum, + epsilon, + output, + save_mean, + save_var); + return std::make_tuple(output, save_mean, save_var); +} + +string get_mem_string(c10::MemoryFormat memory_format) { + string mem_format_key; + switch(memory_format) { + case at::MemoryFormat::Contiguous: + mem_format_key = "Contiguous"; + break; + case at::MemoryFormat::ChannelsLast: + mem_format_key = "ChannelsLast"; + break; + default: + assert(0 && "Invalid memory format\n"); + } + + return mem_format_key; +} + +// Batch norm backward +std::tuple batch_norm_backward_mps + (const Tensor& grad_out, + const Tensor& input, + const c10::optional& weight_opt, + const c10::optional& running_mean_opt, + const c10::optional& running_var_opt, + const c10::optional& save_mean_opt, + const c10::optional& save_var_opt, + bool train, + double epsilon, + std::array grad_input_mask) { + + Tensor grad_input; + Tensor grad_weight; + Tensor grad_bias; + + const auto memory_format = input.suggest_memory_format(); + + if (grad_input_mask[0]) { + grad_input = at::native::empty_mps(input.sizes(), + input.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + memory_format); + } + // Assuming that if grad_input_mask of weight is 1, then the weight is available + if (grad_input_mask[1]) { + grad_weight = at::native::empty_mps(weight_opt.value().sizes(), + weight_opt.value().scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + at::MemoryFormat::Contiguous); + } + if (grad_input_mask[2]) { + grad_bias = at::native::empty_mps(weight_opt.value().sizes(), + weight_opt.value().scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + at::MemoryFormat::Contiguous); + } + + namespace native_mps = at::native::mps; + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* gradOutputTensor_ = nil; + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* weightTensor_ = nil; + MPSGraphTensor* runningMeanTensor_ = nil; + MPSGraphTensor* runningVarTensor_ = nil; + MPSGraphTensor* saveMeanTensor_ = nil; + MPSGraphTensor* saveVarTensor_ = nil; + MPSGraphTensor* gradInputTensor_ = nil; + MPSGraphTensor* gradWeightTensor_ = nil; + MPSGraphTensor* gradBiasTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + auto stream = at::mps::getCurrentMPSStream(); + + const bool has_running_mean = (running_mean_opt.has_value() && running_mean_opt->defined()); + const bool has_running_var = (running_var_opt.has_value() && running_var_opt->defined()); + TORCH_CHECK(has_running_mean == has_running_var); + const bool 
has_save_mean = (save_mean_opt.has_value() && save_mean_opt->defined()); + const bool has_save_var = (save_var_opt.has_value() && save_var_opt->defined()); + TORCH_CHECK(has_save_mean == has_save_var); + + const bool has_weight = (weight_opt.has_value() && weight_opt->defined()); + + if (grad_input.numel() == 0) { + return std::make_tuple(grad_input, grad_weight, grad_bias); + } + + @autoreleasepool { + + string mem_format_key; + switch(memory_format) { + case at::MemoryFormat::Contiguous: + mem_format_key = "Contiguous"; + break; + case at::MemoryFormat::ChannelsLast: + mem_format_key = "ChannelsLast"; + break; + default: + assert(0 && "Check should have been done earlier\n"); + } + + MPSShape* input_shape_readonly = mps::getMPSShape(input); + int num_input_dims = [input_shape_readonly count]; + NSMutableArray* input_shape = [NSMutableArray arrayWithCapacity:num_input_dims]; + // Broadcast with input + NSMutableArray* new_mean_shape = [NSMutableArray arrayWithCapacity:num_input_dims]; + // Reduction axes + NSMutableArray* axes = [NSMutableArray arrayWithCapacity:(num_input_dims-1)]; + + get_shapes(input_shape_readonly, input_shape, new_mean_shape, axes, num_input_dims, memory_format, true); + + NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; + + string key = "batch_norm_backward_mps:" + mem_format_key + ":" + std::to_string(epsilon) + ":" + + std::to_string(train) + ":" + + std::to_string(has_running_mean) + ":" + + std::to_string(has_weight) + ":" + + [ns_shape_key UTF8String] + ":" + native_mps::getMPSTypeString(input.scalar_type()); + auto input_mps_dtype = native_mps::getMPSDataType(input.scalar_type()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + // NCHW - Channels dim is 1 + int channelsDim = 1; + + MPSGraphTensor* inputTensorOriginal = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_mps_dtype, input_shape); + // Shape is the ORIGINAL NCHW shape + MPSGraphTensor* gradOutputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(grad_out.scalar_type()), input_shape_readonly); + MPSGraphTensor* weightTensor = nil; + if(has_weight) + weightTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(weight_opt.value().scalar_type()), new_mean_shape); + MPSGraphTensor* runningMeanTensor = nil; + MPSGraphTensor* runningVarTensor = nil; + if(has_running_mean) { + runningMeanTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(running_mean_opt.value().scalar_type()), new_mean_shape); + runningVarTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(running_var_opt.value().scalar_type()), new_mean_shape); + } + + // Mean and inv std tensors to be saved and returned + MPSGraphTensor* saveMeanTensor = nil; + MPSGraphTensor* saveVarTensor = nil; + if(has_save_mean) { + saveMeanTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(save_mean_opt.value().scalar_type()), new_mean_shape); + saveVarTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(save_var_opt.value().scalar_type()), new_mean_shape); + } + + MPSGraphTensor* gradInputTensor = nil; + 
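          // The eval-mode (train == false) branch below hand-rolls the gradients
          // from the running statistics; as a hedged reference, the formulas it
          // implements are the standard fixed-statistics batch-norm gradients:
          //
          //   x_hat       = (x - running_mean) * rsqrt(running_var + eps)
          //   grad_bias   = sum(grad_out)           over the reduction axes
          //   grad_weight = sum(grad_out * x_hat)   over the reduction axes
          //   grad_input  = grad_out * weight * rsqrt(running_var + eps)
          //                 (the weight factor is dropped when no weight is given)
          //
          // The training branch instead delegates to MPSGraph's normalization
          // gradient ops using the saved batch mean and variance.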
MPSGraphTensor* gradWeightTensor = nil; + MPSGraphTensor* gradBiasTensor = nil; + MPSGraphTensor* inputTensor = nil; + + if(memory_format == at::MemoryFormat::Contiguous) + inputTensor = inputTensorOriginal; + else { + // Reshape/transpose the input as needed + auto N = input_shape[0]; + auto H = input_shape[1]; + auto W = input_shape[2]; + auto C = input_shape[3]; + + inputTensor = [mpsGraph reshapeTensor:inputTensorOriginal + withShape:@[N, ([NSNumber numberWithInt:[H intValue]* [W intValue]]), C] + name:nil]; + inputTensor = [mpsGraph transposeTensor:inputTensor + dimension:1 + withDimension:2 + name:nil]; + inputTensor = [mpsGraph reshapeTensor:inputTensor + withShape:@[N, C, H, W] + name:nil]; + } + + if(train) { + // Use save_mean and save_var + if(grad_input_mask[1]) { + gradWeightTensor = [mpsGraph normalizationGammaGradientWithIncomingGradientTensor:gradOutputTensor + sourceTensor:inputTensor + meanTensor:saveMeanTensor + varianceTensor:saveVarTensor + reductionAxes:axes + epsilon:(float)epsilon + name:nil]; + } + if(grad_input_mask[2]) { + gradBiasTensor = [mpsGraph normalizationBetaGradientWithIncomingGradientTensor:gradOutputTensor + sourceTensor:inputTensor + reductionAxes:axes + name:nil]; + } + if(grad_input_mask[0]) { + gradInputTensor = [mpsGraph normalizationGradientWithIncomingGradientTensor:gradOutputTensor + sourceTensor:inputTensor + meanTensor:saveMeanTensor + varianceTensor:saveVarTensor + gammaTensor:weightTensor + gammaGradientTensor:gradWeightTensor + betaGradientTensor:gradBiasTensor + reductionAxes:axes + epsilon:(float) epsilon + name:nil]; + } + } + else { + // Use running mean and running var + MPSGraphTensor* rsqrtTensor = nil; + MPSGraphTensor* epsilonTensor = nil; + if(grad_input_mask[1]) { + epsilonTensor = [mpsGraph constantWithScalar:(float)epsilon + shape:@[@1] + dataType:input_mps_dtype]; + MPSGraphTensor* xMinusMean = [mpsGraph subtractionWithPrimaryTensor:inputTensor + secondaryTensor:runningMeanTensor + name:nil]; + MPSGraphTensor* varianceEpsTensor = [mpsGraph additionWithPrimaryTensor:runningVarTensor + secondaryTensor:epsilonTensor + name:nil]; + rsqrtTensor = [mpsGraph reverseSquareRootWithTensor:varianceEpsTensor + name:nil]; + MPSGraphTensor* bnForwardTensor = [mpsGraph multiplicationWithPrimaryTensor:xMinusMean + secondaryTensor:rsqrtTensor + name:nil]; + MPSGraphTensor* gradBnMulTensor = [mpsGraph multiplicationWithPrimaryTensor:bnForwardTensor + secondaryTensor:gradOutputTensor + name:nil]; + gradWeightTensor = [mpsGraph reductionSumWithTensor:gradBnMulTensor + axes:axes + name:nil]; + } + if(grad_input_mask[2]) { + gradBiasTensor = [mpsGraph normalizationBetaGradientWithIncomingGradientTensor:gradOutputTensor + sourceTensor:inputTensor + reductionAxes:axes + name:nil]; + } + if(grad_input_mask[0]) { + + MPSGraphTensor* unitTensor = [mpsGraph constantWithScalar:1.0 + shape:input_shape_readonly + dataType:input_mps_dtype]; + if(!epsilonTensor) + epsilonTensor = [mpsGraph constantWithScalar:(float)epsilon + shape:@[@1] + dataType:input_mps_dtype]; + if(!rsqrtTensor) { + MPSGraphTensor* varianceEpsTensor = [mpsGraph additionWithPrimaryTensor:runningVarTensor + secondaryTensor:epsilonTensor + name:nil]; + rsqrtTensor = [mpsGraph reverseSquareRootWithTensor:varianceEpsTensor + name:nil]; + } + + gradInputTensor = [mpsGraph multiplicationWithPrimaryTensor:unitTensor + secondaryTensor:rsqrtTensor + name:nil]; + if(has_weight) + gradInputTensor = [mpsGraph multiplicationWithPrimaryTensor:gradInputTensor + secondaryTensor:weightTensor + name:nil]; 
+ gradInputTensor = [mpsGraph multiplicationWithPrimaryTensor:gradInputTensor + secondaryTensor:gradOutputTensor + name:nil]; + } + } + + if(grad_input_mask[1]) { + gradWeightTensor = [mpsGraph reshapeTensor:gradWeightTensor + withShape:@[input_shape_readonly[channelsDim]] + name:nil]; + } + if(grad_input_mask[2]) { + gradBiasTensor = [mpsGraph reshapeTensor:gradBiasTensor + withShape:@[input_shape_readonly[channelsDim]] + name:nil]; + } + + MPSGraphTensor* gradInputTensorFinal = nil; + + if(memory_format == at::MemoryFormat::Contiguous) + gradInputTensorFinal = gradInputTensor; + else { + // Reshape/transpose the input as needed + auto N = input_shape[0]; + auto H = input_shape[1]; + auto W = input_shape[2]; + auto C = input_shape[3]; + + gradInputTensorFinal = [mpsGraph reshapeTensor:gradInputTensor + withShape:@[N, C, ([NSNumber numberWithInt:[H intValue]* [W intValue]])] + name:nil]; + gradInputTensorFinal = [mpsGraph transposeTensor:gradInputTensorFinal + dimension:1 + withDimension:2 + name:nil]; + gradInputTensorFinal = [mpsGraph reshapeTensor:gradInputTensorFinal + withShape:@[N, H, W, C] + name:nil]; + } + + newCachedGraph->gradOutputTensor_ = gradOutputTensor; + newCachedGraph->inputTensor_ = inputTensorOriginal; + newCachedGraph->weightTensor_ = weightTensor; + newCachedGraph->runningMeanTensor_ = runningMeanTensor; + newCachedGraph->runningVarTensor_ = runningVarTensor; + newCachedGraph->saveMeanTensor_ = saveMeanTensor; + newCachedGraph->saveVarTensor_ = saveVarTensor; + newCachedGraph->gradInputTensor_ = gradInputTensorFinal; + newCachedGraph->gradWeightTensor_ = gradWeightTensor; + newCachedGraph->gradBiasTensor_ = gradBiasTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input, input_shape); + auto gradOutputPlaceholder = native_mps::Placeholder(cachedGraph->gradOutputTensor_, grad_out, input_shape_readonly); + auto weightPlaceholder = native_mps::Placeholder(); + if(has_weight) + weightPlaceholder = native_mps::Placeholder(cachedGraph->weightTensor_, weight_opt.value(), new_mean_shape); + auto runningMeanPlaceholder = native_mps::Placeholder(); + auto runningVarPlaceholder = native_mps::Placeholder(); + if(has_running_mean) { + runningMeanPlaceholder = native_mps::Placeholder(cachedGraph->runningMeanTensor_, running_mean_opt.value(), new_mean_shape); + runningVarPlaceholder = native_mps::Placeholder(cachedGraph->runningVarTensor_, running_var_opt.value(), new_mean_shape); + } + auto saveMeanPlaceholder = native_mps::Placeholder(); + auto saveVarPlaceholder = native_mps::Placeholder(); + if(has_save_mean) { + saveMeanPlaceholder = native_mps::Placeholder(cachedGraph->saveMeanTensor_, save_mean_opt.value(), new_mean_shape); + saveVarPlaceholder = native_mps::Placeholder(cachedGraph->saveVarTensor_, save_var_opt.value(), new_mean_shape); + } + + auto gradInputPlaceholder = native_mps::Placeholder(); + if(grad_input_mask[0]) + gradInputPlaceholder = native_mps::Placeholder(cachedGraph->gradInputTensor_, grad_input, input_shape); + auto gradWeightPlaceholder = native_mps::Placeholder(); + if(grad_input_mask[1]) + gradWeightPlaceholder = native_mps::Placeholder(cachedGraph->gradWeightTensor_, grad_weight); + auto gradBiasPlaceholder = native_mps::Placeholder();; + if(grad_input_mask[2]) + gradBiasPlaceholder = native_mps::Placeholder(cachedGraph->gradBiasTensor_, grad_bias); + + NSMutableDictionary *feeds = [[NSMutableDictionary new] autorelease]; + 
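    // Descriptive note: feeds are bound only for the tensors that actually exist
    // (weight, running stats, saved stats), and results below are bound only for
    // the gradients requested via grad_input_mask, so the graph is asked to
    // produce just the outputs the caller needs.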
feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData(); + feeds[gradOutputPlaceholder.getMPSGraphTensor()] = gradOutputPlaceholder.getMPSGraphTensorData(); + if(has_weight) + feeds[weightPlaceholder.getMPSGraphTensor()] = weightPlaceholder.getMPSGraphTensorData(); + if(has_running_mean) { + feeds[runningMeanPlaceholder.getMPSGraphTensor()] = runningMeanPlaceholder.getMPSGraphTensorData(); + feeds[runningVarPlaceholder.getMPSGraphTensor()] = runningVarPlaceholder.getMPSGraphTensorData(); + } + if(has_save_mean) { + feeds[saveMeanPlaceholder.getMPSGraphTensor()] = saveMeanPlaceholder.getMPSGraphTensorData(); + feeds[saveVarPlaceholder.getMPSGraphTensor()] = saveVarPlaceholder.getMPSGraphTensorData(); + } + + NSMutableDictionary *results = [[NSMutableDictionary new] autorelease]; + if(grad_input_mask[0]) + results[gradInputPlaceholder.getMPSGraphTensor()] = gradInputPlaceholder.getMPSGraphTensorData(); + if(grad_input_mask[1]) + results[gradWeightPlaceholder.getMPSGraphTensor()] = gradWeightPlaceholder.getMPSGraphTensorData(); + if(grad_input_mask[2]) + results[gradBiasPlaceholder.getMPSGraphTensor()] = gradBiasPlaceholder.getMPSGraphTensorData(); + + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } + + return std::make_tuple(grad_input, grad_weight, grad_bias); + +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/PointwiseOps.mm b/aten/src/ATen/native/mps/operations/PointwiseOps.mm new file mode 100644 index 000000000000..569cad0fbfb0 --- /dev/null +++ b/aten/src/ATen/native/mps/operations/PointwiseOps.mm @@ -0,0 +1,123 @@ +// Copyright © 2022 Apple Inc. + +#include + +namespace at { +namespace native { +// scope the MPS's internal methods to not expose them to at::native +namespace mps { + +Tensor& addc_mul_div_out_mps(const Tensor& self, + const Tensor& tensor1, + const Tensor& tensor2, + const Scalar& value_opt, // default value = 1.0 + Tensor& output, + const bool is_div, + const string op_name) +{ + using scalar_t = double; + scalar_t value_scalar = value_opt.to(); + if (&output != &self) { + output.resize_(output.sizes()); + } + TORCH_CHECK(output.is_mps()); + + // Derive from MPSCachedGraph + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor = nil, *outputTensor = nil; + MPSGraphTensor *firstTensor = nil, *secondTensor = nil; + }; + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + string key = op_name + to_string(value_scalar) + + getTensorsStringKey({self, tensor1, tensor2})+ ":" + + getMPSTypeString(value_opt.type()); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph* newCachedGraph = nil; + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + newCachedGraph->firstTensor = mpsGraphRankedPlaceHolder(mpsGraph, tensor1); + newCachedGraph->secondTensor = mpsGraphRankedPlaceHolder(mpsGraph, tensor2); + + // the tensor to be optionally multiplied by value_scalar + MPSGraphTensor *multiplicandTensor = nil; + if (is_div) { + multiplicandTensor = [mpsGraph divisionWithPrimaryTensor:newCachedGraph->firstTensor + secondaryTensor:newCachedGraph->secondTensor + name:nil]; + } else { + multiplicandTensor = 
[mpsGraph multiplicationWithPrimaryTensor:newCachedGraph->firstTensor + secondaryTensor:newCachedGraph->secondTensor + name:nil]; + } + // the tensor to be added to input_tensor + MPSGraphTensor *addendTensor = multiplicandTensor; + // if value_scalar is 1.0, then we don't bother adding another multiply to graph + if (value_scalar != 1.0) { + MPSGraphTensor* valueTensor = [mpsGraph constantWithScalar:value_scalar + dataType:getMPSScalarType(value_opt.type())]; + addendTensor = [mpsGraph multiplicationWithPrimaryTensor:multiplicandTensor + secondaryTensor:valueTensor + name:nil]; + } + newCachedGraph->outputTensor = [mpsGraph additionWithPrimaryTensor:newCachedGraph->inputTensor + secondaryTensor:addendTensor + name:nil]; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + // Inputs as placeholders + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor, self); + Placeholder tensor1Placeholder = Placeholder(cachedGraph->firstTensor, tensor1); + Placeholder tensor2Placeholder = Placeholder(cachedGraph->secondTensor, tensor2); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, output); + + // Create dictionary of inputs and outputs + // Utility to dump out graph : [mpsGraph dump]; + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), + tensor1Placeholder.getMPSGraphTensor() : tensor1Placeholder.getMPSGraphTensorData(), + tensor2Placeholder.getMPSGraphTensor() : tensor2Placeholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + } + + return output; +} + +} // namespace mps + +// APIs exposed to at::native scope +TORCH_IMPL_FUNC(addcmul_out_mps) +(const Tensor& self, const Tensor& tensor1, const Tensor& tensor2, const Scalar& value, const Tensor& output) +{ + mps::addc_mul_div_out_mps(self, tensor1, tensor2, value, const_cast(output), false, "addcmul_out_mps"); +} + +TORCH_IMPL_FUNC(addcdiv_out_mps) +(const Tensor& self, const Tensor& tensor1, const Tensor& tensor2, const Scalar& value, const Tensor& output) +{ + mps::addc_mul_div_out_mps(self, tensor1, tensor2, value, const_cast(output), true, "addcdiv_out_mps"); +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/Pooling.mm b/aten/src/ATen/native/mps/operations/Pooling.mm new file mode 100644 index 000000000000..77a284963d6e --- /dev/null +++ b/aten/src/ATen/native/mps/operations/Pooling.mm @@ -0,0 +1,891 @@ +// Copyright © 2022 Apple Inc. 
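// Shape reference (a hedged summary, matching pooling_output_shape as used below):
//
//   out = floor((in + 2*pad - dilation*(kernel - 1) - 1) / stride) + 1
//
// with ceil() in place of floor() when ceil_mode is set, subject to the usual
// constraint that the last pooling window must start inside the (padded) input.
// Worked example: in = 5, kernel = 3, pad = 0, stride = 2, dilation = 1 gives
// out = floor((5 - 2 - 1) / 2) + 1 = 2, i.e. windows at offsets 0 and 2.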
+ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { + +// Create pooling descriptor +void fill_pool_desc(MPSGraphPooling2DOpDescriptor* desc, + NSUInteger kW, NSUInteger kH, + NSUInteger dW, NSUInteger dH, + NSUInteger dilationW, NSUInteger dilationH, + NSUInteger padW, NSUInteger padH, + bool ceil_mode, c10::MemoryFormat memory_format) { + desc.kernelWidth = kW; + desc.kernelHeight = kH; + desc.strideInX = dW; + desc.strideInY = dH; + desc.dilationRateInX = dilationW; + desc.dilationRateInY = dilationH; + desc.paddingLeft = padW; + desc.paddingRight = padW; + desc.paddingTop = padH; + desc.paddingBottom = padH; + desc.ceilMode = ceil_mode; + desc.paddingStyle = MPSGraphPaddingStyleExplicit; + switch(memory_format) { + case at::MemoryFormat::Contiguous: + desc.dataLayout = MPSGraphTensorNamedDataLayoutNCHW; + break; + case at::MemoryFormat::ChannelsLast: + desc.dataLayout = MPSGraphTensorNamedDataLayoutNHWC; + break; + default: + assert(0 && "Check should have been done earlier\n"); + } +} + +Tensor _mps_max_pool2d( + const Tensor& input_t, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode) { + + // #20866, #22032: Guarantee this for the official C++ API? + TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2, + "max_pool2d: kernel_size must either be a single int, or a tuple of two ints") + const int kH = safe_downcast(kernel_size[0]); + const int kW = kernel_size.size() == 1 ? kH : safe_downcast(kernel_size[1]); + + // NB: stride default is not expressible as an integer constant, so we accept + // empty stride for this case + TORCH_CHECK(stride.size() == 0 || stride.size() == 1 || stride.size() == 2, + "max_pool2d: stride must either be omitted, a single int, or a tuple of two ints") + const int dH = stride.empty() ? kH : safe_downcast(stride[0]); + const int dW = stride.empty() ? kW : + stride.size() == 1 ? dH : safe_downcast(stride[1]); + + TORCH_CHECK(padding.size() == 1 || padding.size() == 2, + "max_pool2d: padding must be either be a single int, or a tuple of two ints"); + const int padH = safe_downcast(padding[0]); + const int padW = padding.size() == 1 ? padH : safe_downcast(padding[1]); + + TORCH_CHECK(dilation.size() == 1 || dilation.size() == 2, + "max_pool2d: dilation must be either a single int, or a tuple of two ints"); + const int dilationH = safe_downcast(dilation[0]); + const int dilationW = dilation.size() == 1 ? dilationH : safe_downcast(dilation[1]); + + const auto memory_format = input_t.suggest_memory_format(); + if (memory_format == at::MemoryFormat::ChannelsLast) { + TORCH_CHECK(input_t.ndimension() == 4, + "non-empty 4D (batch mode) tensor expected for input with channels_last layout"); + } else if (memory_format == at::MemoryFormat::Contiguous) { + TORCH_CHECK((input_t.ndimension() == 3 || input_t.ndimension() == 4), + "non-empty 3D or 4D (batch mode) tensor expected for input"); + } else { + TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); + } + + /* sizes */ + const int64_t nbatch = input_t.ndimension() == 4 ? 
input_t.size(-4) : 1; + const int64_t nInputPlane = input_t.size(-3); + const int64_t inputHeight = input_t.size(-2); + const int64_t inputWidth = input_t.size(-1); + + const int64_t outputHeight = pooling_output_shape(inputHeight, kH, padH, dH, dilationH, ceil_mode); + const int64_t outputWidth = pooling_output_shape(inputWidth, kW, padW, dW, dilationW, ceil_mode); + + pool2d_shape_check( + input_t, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + nInputPlane, + inputHeight, inputWidth, + outputHeight, outputWidth, memory_format); + + namespace native_mps = at::native::mps; + CheckedFrom c = "mps_max_pool2d"; + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + Tensor output_t; + + if (input_t.ndimension() == 3) { + output_t = at::native::empty_mps( + {nInputPlane, outputHeight, outputWidth}, + input_t.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + memory_format); + } else { + output_t = at::native::empty_mps( + {nbatch, nInputPlane, outputHeight, outputWidth}, + input_t.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + memory_format); + } + + if (output_t.numel() == 0) { + return output_t; + } + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + + string mem_format_key; + switch(memory_format) { + case at::MemoryFormat::Contiguous: + mem_format_key = "Contiguous"; + break; + case at::MemoryFormat::ChannelsLast: + mem_format_key = "ChannelsLast"; + break; + default: + assert(0 && "Check should have been done earlier\n"); + } + + string key = "mps_max_pool2d:" + to_string(kW) + ":" + to_string(kH) + ":" + + to_string(dW) + ":" + to_string(dH) + ":" + + to_string(dilationW) + ":" + to_string(dilationH) + ":" + + to_string(padW) + ":" + to_string(padH) + ":" + + to_string(ceil_mode) + ":" + mem_format_key + + mps::getTensorsStringKey({input_t}); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphPooling2DOpDescriptor* desc = [[MPSGraphPooling2DOpDescriptor new] autorelease]; + fill_pool_desc(desc, kW, kH, dW, dH, dilationW, dilationH, padW, padH, ceil_mode, memory_format); + + MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_t); + MPSGraphTensor* outputTensor = [mpsGraph maxPooling2DWithSourceTensor:inputTensor + descriptor:desc + name:nil]; + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t); + auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t); + + NSDictionary *feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + }; + + NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + return output_t; +} + +Tensor 
mps_max_pool2d_backward( + const Tensor& grad_output, + const Tensor& input_t, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode) { + + // #20866, #22032: Guarantee this for the official C++ API? + TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2, + "max_pool2d: kernel_size must either be a single int, or a tuple of two ints") + const int kH = safe_downcast(kernel_size[0]); + const int kW = kernel_size.size() == 1 ? kH : safe_downcast(kernel_size[1]); + + // NB: stride default is not expressible as an integer constant, so we accept + // empty stride for this case + TORCH_CHECK(stride.size() == 0 || stride.size() == 1 || stride.size() == 2, + "max_pool2d: stride must either be omitted, a single int, or a tuple of two ints") + const int dH = stride.empty() ? kH : safe_downcast(stride[0]); + const int dW = stride.empty() ? kW : + stride.size() == 1 ? dH : safe_downcast(stride[1]); + + TORCH_CHECK(padding.size() == 1 || padding.size() == 2, + "max_pool2d: padding must be either be a single int, or a tuple of two ints"); + const int padH = safe_downcast(padding[0]); + const int padW = padding.size() == 1 ? padH : safe_downcast(padding[1]); + + TORCH_CHECK(dilation.size() == 1 || dilation.size() == 2, + "max_pool2d: dilation must be either a single int, or a tuple of two ints"); + const int dilationH = safe_downcast(dilation[0]); + const int dilationW = dilation.size() == 1 ? dilationH : safe_downcast(dilation[1]); + + const auto memory_format = input_t.suggest_memory_format(); + if (memory_format == at::MemoryFormat::ChannelsLast) { + TORCH_CHECK(input_t.ndimension() == 4, + "non-empty 4D (batch mode) tensor expected for input with channels_last layout"); + } else if (memory_format == at::MemoryFormat::Contiguous) { + TORCH_CHECK((input_t.ndimension() == 3 || input_t.ndimension() == 4), + "non-empty 3D or 4D (batch mode) tensor expected for input"); + } else { + TORCH_CHECK(false, "Unsupported memory format. 
Supports only ChannelsLast, Contiguous"); + } + + namespace native_mps = at::native::mps; + CheckedFrom c = "mps_max_pool2d_backward"; + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *gradOutputTensor_ = nil; + MPSGraphTensor *gradInputTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + Tensor grad_input; + grad_input = at::native::empty_mps( + input_t.sizes(), + input_t.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + memory_format); + + if (grad_input.numel() == 0) { + return grad_input; + } + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + + string mem_format_key; + switch(memory_format) { + case at::MemoryFormat::Contiguous: + mem_format_key = "Contiguous"; + break; + case at::MemoryFormat::ChannelsLast: + mem_format_key = "ChannelsLast"; + break; + default: + assert(0 && "Check should have been done earlier\n"); + } + + string key = "mps_max_pool2d_backward:" + to_string(kW) + ":" + to_string(kH) + ":" + + to_string(dW) + ":" + to_string(dH) + ":" + + to_string(dilationW) + ":" + to_string(dilationH) + ":" + + to_string(padW) + ":" + to_string(padH) + ":" + + to_string(ceil_mode) + ":" + mem_format_key + + mps::getTensorsStringKey({input_t, grad_output}); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphPooling2DOpDescriptor* desc = [[MPSGraphPooling2DOpDescriptor new] autorelease]; + fill_pool_desc(desc, kW, kH, dW, dH, dilationW, dilationH, padW, padH, ceil_mode, memory_format); + + MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_t); + MPSGraphTensor* gradOutputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, grad_output); + MPSGraphTensor* gradInputTensor = [mpsGraph maxPooling2DGradientWithGradientTensor:gradOutputTensor + sourceTensor:inputTensor + descriptor:desc + name:nil]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->gradOutputTensor_ = gradOutputTensor; + newCachedGraph->gradInputTensor_ = gradInputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t); + auto gradOutputPlaceholder = native_mps::Placeholder(cachedGraph->gradOutputTensor_, grad_output); + auto gradInputPlaceholder = native_mps::Placeholder(cachedGraph->gradInputTensor_, grad_input); + + NSDictionary *feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary *results = @{ + gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData() + }; + + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + return grad_input; +} + +TORCH_IMPL_FUNC(max_pool2d_with_indices_out_mps)( + const Tensor& input_t, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode, + const Tensor& output_t, + const Tensor& indices) { + + // 
#20866, #22032: Guarantee this for the official C++ API? + TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2, + "max_pool2d: kernel_size must either be a single int, or a tuple of two ints") + const int kH = safe_downcast(kernel_size[0]); + const int kW = kernel_size.size() == 1 ? kH : safe_downcast(kernel_size[1]); + + // NB: stride default is not expressible as an integer constant, so we accept + // empty stride for this case + TORCH_CHECK(stride.size() == 0 || stride.size() == 1 || stride.size() == 2, + "max_pool2d: stride must either be omitted, a single int, or a tuple of two ints") + const int dH = stride.empty() ? kH : safe_downcast(stride[0]); + const int dW = stride.empty() ? kW : + stride.size() == 1 ? dH : safe_downcast(stride[1]); + + TORCH_CHECK(padding.size() == 1 || padding.size() == 2, + "max_pool2d: padding must be either be a single int, or a tuple of two ints"); + const int padH = safe_downcast(padding[0]); + const int padW = padding.size() == 1 ? padH : safe_downcast(padding[1]); + + TORCH_CHECK(dilation.size() == 1 || dilation.size() == 2, + "max_pool2d: dilation must be either a single int, or a tuple of two ints"); + const int dilationH = safe_downcast(dilation[0]); + const int dilationW = dilation.size() == 1 ? dilationH : safe_downcast(dilation[1]); + + const auto memory_format = input_t.suggest_memory_format(); + if (memory_format == at::MemoryFormat::ChannelsLast) { + TORCH_CHECK(input_t.ndimension() == 4, + "non-empty 4D (batch mode) tensor expected for input with channels_last layout"); + } else if (memory_format == at::MemoryFormat::Contiguous) { + TORCH_CHECK((input_t.ndimension() == 3 || input_t.ndimension() == 4), + "non-empty 3D or 4D (batch mode) tensor expected for input"); + } else { + TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); + } + + /* sizes */ + const int64_t nbatch = input_t.ndimension() == 4 ? 
input_t.size(-4) : 1; + const int64_t nInputPlane = input_t.size(-3); + const int64_t inputHeight = input_t.size(-2); + const int64_t inputWidth = input_t.size(-1); + + const int64_t outputHeight = pooling_output_shape(inputHeight, kH, padH, dH, dilationH, ceil_mode); + const int64_t outputWidth = pooling_output_shape(inputWidth, kW, padW, dW, dilationW, ceil_mode); + + pool2d_shape_check( + input_t, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + nInputPlane, + inputHeight, inputWidth, + outputHeight, outputWidth, memory_format); + + namespace native_mps = at::native::mps; + CheckedFrom c = "max_pool2d_with_indices_out_mps"; + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; + MPSGraphTensor* indicesTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + if (output_t.numel() == 0) { + return; + } + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + + string mem_format_key; + switch(memory_format) { + case at::MemoryFormat::Contiguous: + mem_format_key = "Contiguous"; + break; + case at::MemoryFormat::ChannelsLast: + mem_format_key = "ChannelsLast"; + break; + default: + assert(0 && "Check should have been done earlier\n"); + } + + string key = "max_pool2d_with_indices_out_mps:" + to_string(kW) + ":" + to_string(kH) + ":" + + to_string(dW) + ":" + to_string(dH) + ":" + + to_string(dilationW) + ":" + to_string(dilationH) + ":" + + to_string(padW) + ":" + to_string(padH) + ":" + + to_string(ceil_mode) + ":" + mem_format_key + + mps::getTensorsStringKey({input_t}) + ":" + + native_mps::getMPSTypeString(indices.scalar_type()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphPooling2DOpDescriptor* desc = [[MPSGraphPooling2DOpDescriptor new] autorelease]; + fill_pool_desc(desc, kW, kH, dW, dH, dilationW, dilationH, padW, padH, ceil_mode, memory_format); + desc.returnIndicesMode = MPSGraphPoolingReturnIndicesGlobalFlatten2D; + desc.returnIndicesDataType = MPSDataTypeInt32; + + MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_t); + NSArray* poolOutputs = [mpsGraph maxPooling2DReturnIndicesWithSourceTensor:inputTensor + descriptor:desc + name:nil]; + + MPSGraphTensor* indicesTensor = poolOutputs[1]; + if(mps::getMPSDataType(indices.scalar_type()) == MPSDataTypeInt64) { + indicesTensor = [mpsGraph castTensor:indicesTensor + toType:MPSDataTypeInt64 + name:@"castToI64"]; + } + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = poolOutputs[0]; + newCachedGraph->indicesTensor_ = indicesTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t); + auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t); + auto indicesPlaceholder = native_mps::Placeholder(cachedGraph->indicesTensor_, indices); + + NSDictionary *feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + }; + + 
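    // Descriptive note: the graph above requests flattened Int32 indices from
    // MPSGraph (MPSGraphPoolingReturnIndicesGlobalFlatten2D) and casts them to
    // Int64 when the ATen `indices` output is int64, which is the index dtype
    // max_pool2d_with_indices exposes to callers.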
NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(), + indicesPlaceholder.getMPSGraphTensor() : indicesPlaceholder.getMPSGraphTensorData() + }; + + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + +} + +TORCH_IMPL_FUNC(max_pool2d_with_indices_backward_out_mps) +(const Tensor& grad_output, +const Tensor& input_t, +IntArrayRef kernel_size, +IntArrayRef stride, +IntArrayRef padding, +IntArrayRef dilation, +bool ceil_mode, +const Tensor& indices, +const Tensor& grad_input) { + + // #20866, #22032: Guarantee this for the official C++ API? + TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2, + "max_pool2d: kernel_size must either be a single int, or a tuple of two ints") + const int kH = safe_downcast(kernel_size[0]); + const int kW = kernel_size.size() == 1 ? kH : safe_downcast(kernel_size[1]); + + // NB: stride default is not expressible as an integer constant, so we accept + // empty stride for this case + TORCH_CHECK(stride.size() == 0 || stride.size() == 1 || stride.size() == 2, + "max_pool2d: stride must either be omitted, a single int, or a tuple of two ints") + const int dH = stride.empty() ? kH : safe_downcast(stride[0]); + const int dW = stride.empty() ? kW : + stride.size() == 1 ? dH : safe_downcast(stride[1]); + + TORCH_CHECK(padding.size() == 1 || padding.size() == 2, + "max_pool2d: padding must be either be a single int, or a tuple of two ints"); + const int padH = safe_downcast(padding[0]); + const int padW = padding.size() == 1 ? padH : safe_downcast(padding[1]); + + TORCH_CHECK(dilation.size() == 1 || dilation.size() == 2, + "max_pool2d: dilation must be either a single int, or a tuple of two ints"); + const int dilationH = safe_downcast(dilation[0]); + const int dilationW = dilation.size() == 1 ? dilationH : safe_downcast(dilation[1]); + + const auto memory_format = input_t.suggest_memory_format(); + if (memory_format == at::MemoryFormat::ChannelsLast) { + TORCH_CHECK(input_t.ndimension() == 4, + "non-empty 4D (batch mode) tensor expected for input with channels_last layout"); + } else if (memory_format == at::MemoryFormat::Contiguous) { + TORCH_CHECK((input_t.ndimension() == 3 || input_t.ndimension() == 4), + "non-empty 3D or 4D (batch mode) tensor expected for input"); + } else { + TORCH_CHECK(false, "Unsupported memory format. 
Supports only ChannelsLast, Contiguous"); + } + + namespace native_mps = at::native::mps; + CheckedFrom c = "max_pool2d_with_indices_backward_out_mps"; + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *gradOutputTensor_ = nil; + MPSGraphTensor *gradInputTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + if (grad_input.numel() == 0) { + return; + } + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + + string mem_format_key; + switch(memory_format) { + case at::MemoryFormat::Contiguous: + mem_format_key = "Contiguous"; + break; + case at::MemoryFormat::ChannelsLast: + mem_format_key = "ChannelsLast"; + break; + default: + assert(0 && "Check should have been done earlier\n"); + } + + string key = "max_pool2d_with_indices_backward_out_mps:" + to_string(kW) + ":" + to_string(kH) + ":" + + to_string(dW) + ":" + to_string(dH) + ":" + + to_string(dilationW) + ":" + to_string(dilationH) + ":" + + to_string(padW) + ":" + to_string(padH) + ":" + + to_string(ceil_mode) + ":" + mem_format_key + + mps::getTensorsStringKey({input_t, grad_output}); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphPooling2DOpDescriptor* desc = [[MPSGraphPooling2DOpDescriptor new] autorelease]; + fill_pool_desc(desc, kW, kH, dW, dH, dilationW, dilationH, padW, padH, ceil_mode, memory_format); + + MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_t); + MPSGraphTensor* gradOutputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, grad_output); + MPSGraphTensor* gradInputTensor = [mpsGraph maxPooling2DGradientWithGradientTensor:gradOutputTensor + sourceTensor:inputTensor + descriptor:desc + name:nil]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->gradOutputTensor_ = gradOutputTensor; + newCachedGraph->gradInputTensor_ = gradInputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t); + auto gradOutputPlaceholder = native_mps::Placeholder(cachedGraph->gradOutputTensor_, grad_output); + auto gradInputPlaceholder = native_mps::Placeholder(cachedGraph->gradInputTensor_, grad_input); + + NSDictionary *feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary *results = @{ + gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData() + }; + + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } +} + +TORCH_IMPL_FUNC(avg_pool2d_out_mps) ( + const Tensor& input_, + int64_t kH_, + int64_t kW_, + int64_t dH_, + int64_t dW_, + int64_t padH_, + int64_t padW_, + bool ceil_mode, + bool count_include_pad, + c10::optional divisor_override, + const Tensor& output) { + namespace native_mps = at::native::mps; + + TensorArg output_arg{ output, "output", 1 }; + TensorArg input_arg{ input_, "input_", 2 }; + + 
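// For reference, a sketch of the operator semantics (backend-independent): each
// avg_pool2d output element is the window sum divided by
//   divisor_override.value()            when divisor_override is set,
//   kH * kW                             when count_include_pad is true,
//   the count of in-bounds positions    otherwise.
// divisor_override_value is also folded into the graph cache key below so that
// differently-configured calls do not reuse the same cached graph.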
checkAllSameGPU("avg_pool2d_out_cuda", {output_arg, input_arg}); + + const int kH = safe_downcast(kH_); + const int kW = safe_downcast(kW_); + + const int dH = safe_downcast(dH_); + const int dW = safe_downcast(dW_); + + const int padH = safe_downcast(padH_); + const int padW = safe_downcast(padW_); + + /* sizes */ + const int64_t nbatch = input_.ndimension() == 4 ? input_.size(-4) : 1; + const int64_t nInputPlane = input_.size(-3); + const int64_t inputHeight = input_.size(-2); + const int64_t inputWidth = input_.size(-1); + + int64_t outputWidth = pooling_output_shape(inputWidth, kW, padW, dW, 1, ceil_mode); + int64_t outputHeight = pooling_output_shape(inputHeight, kH, padH, dH, 1, ceil_mode); + const auto memory_format = input_.suggest_memory_format(); + + Tensor input = input_.contiguous(memory_format); + + const int32_t count = safe_downcast(output.numel()); + + bool use_divisor = divisor_override.has_value(); + const auto divisor_override_value = use_divisor ? divisor_override.value() : 0; + + if (count != 0) { + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; + MPSGraphTensor* indicesTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + string mem_format_key; + switch(memory_format) { + case at::MemoryFormat::Contiguous: + mem_format_key = "Contiguous"; + break; + case at::MemoryFormat::ChannelsLast: + mem_format_key = "ChannelsLast"; + break; + default: + assert(0 && "Check should have been done earlier\n"); + } + + string key = "mps_avg_pool2d:" + to_string(kW) + ":" + to_string(kH) + ":" + + to_string(dW) + ":" + to_string(dH) + ":" + + to_string(padW) + ":" + to_string(padH) + ":" + + to_string(ceil_mode) + ":" + mem_format_key + ":" + + to_string(divisor_override_value) + + mps::getTensorsStringKey({input}); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphPooling2DOpDescriptor* desc = [[MPSGraphPooling2DOpDescriptor new] autorelease]; + fill_pool_desc(desc, kW, kH, dW, dH, 1, 1, padW, padH, ceil_mode, memory_format); + + MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input); + MPSGraphTensor* outputTensor = [mpsGraph avgPooling2DWithSourceTensor:inputTensor + descriptor:desc + name:nil]; + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input); + auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output); + + NSDictionary *feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + }; + + NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + } +} + +TORCH_IMPL_FUNC(avg_pool2d_backward_out_mps) ( + const Tensor& gradOutput_, + 
const Tensor& input_, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + bool ceil_mode, + bool count_include_pad, + c10::optional divisor_override, + const Tensor& gradInput +) { + TensorArg gradInput_arg{ gradInput, "gradInput", 1 }; + TensorArg gradOutput_arg{ gradOutput_, "gradOutput_", 2 }; + TensorArg input_arg{ input_, "input_", 3 }; + + checkAllSameGPU("avg_pool2d_backward_out_cuda", + {gradInput_arg, gradOutput_arg, input_arg}); + namespace native_mps = at::native::mps; + + const int kH = safe_downcast(kernel_size[0]); + const int kW = kernel_size.size() == 1 ? kH : safe_downcast(kernel_size[1]); + + const int dH = stride.empty() ? kH : safe_downcast(stride[0]); + const int dW = stride.empty() ? kW : + stride.size() == 1 ? dH : safe_downcast(stride[1]); + + const int padH = safe_downcast(padding[0]); + const int padW = padding.size() == 1 ? padH : safe_downcast(padding[1]); + + const auto memory_format = input_.suggest_memory_format(); + const Tensor input = input_.contiguous(memory_format); + const Tensor gradOutput = gradOutput_.contiguous(memory_format); + + const int64_t nbatch = input.ndimension() == 4 ? input.size(-4) : 1; + const int64_t nInputPlane = input.size(-3); + const int64_t inputHeight = input.size(-2); + const int64_t inputWidth = input.size(-1); + + const int64_t outputHeight = pooling_output_shape(inputHeight, kH, padH, dH, 1, ceil_mode); + const int64_t outputWidth = pooling_output_shape(inputWidth, kW, padW, dW, 1, ceil_mode); + + + const int32_t count = safe_downcast(input.numel()); + if (count == 0) { + return; + } + bool use_divisor = divisor_override.has_value(); + const auto divisor_override_value = use_divisor ? divisor_override.value() : 0; + + namespace native_mps = at::native::mps; + CheckedFrom c = "avg_pool2d_backward_out_mps"; + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *gradOutputTensor_ = nil; + MPSGraphTensor *gradInputTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + if (gradInput.numel() == 0) { + return; + } + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + + string mem_format_key; + switch(memory_format) { + case at::MemoryFormat::Contiguous: + mem_format_key = "Contiguous"; + break; + case at::MemoryFormat::ChannelsLast: + mem_format_key = "ChannelsLast"; + break; + default: + assert(0 && "Check should have been done earlier\n"); + } + + string key = "avg_pool2d_backward_out_mps:" + to_string(kW) + ":" + to_string(kH) + ":" + + to_string(dW) + ":" + to_string(dH) + ":" + + to_string(outputWidth) + ":" + to_string(outputHeight) + ":" + + to_string(padW) + ":" + to_string(padH) + ":" + + to_string(ceil_mode) + ":" + mem_format_key + + mps::getTensorsStringKey({input, gradOutput}); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphPooling2DOpDescriptor* desc = [[MPSGraphPooling2DOpDescriptor new] autorelease]; + fill_pool_desc(desc, kW, kH, dW, dH, 1, 1, padW, padH, ceil_mode, memory_format); + + MPSGraphTensor* inputTensor = 
native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input); + MPSGraphTensor* gradOutputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, gradOutput); + MPSGraphTensor *gradInputTensor = [mpsGraph avgPooling2DGradientWithGradientTensor:gradOutputTensor + sourceTensor:inputTensor + descriptor : desc + name:nil]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->gradOutputTensor_ = gradOutputTensor; + newCachedGraph->gradInputTensor_ = gradInputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input); + auto gradOutputPlaceholder = native_mps::Placeholder(cachedGraph->gradOutputTensor_, gradOutput); + auto gradInputPlaceholder = native_mps::Placeholder(cachedGraph->gradInputTensor_, gradInput); + + NSDictionary *feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary *results = @{ + gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData() + }; + + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/RangeFactories.mm b/aten/src/ATen/native/mps/operations/RangeFactories.mm new file mode 100644 index 000000000000..d7307b9b39e9 --- /dev/null +++ b/aten/src/ATen/native/mps/operations/RangeFactories.mm @@ -0,0 +1,66 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { + + +Tensor& arange_mps_out(const Scalar& start, const Scalar& end, const Scalar& step, Tensor& result) { + AT_DISPATCH_MPS_TYPES(result.scalar_type(), "arange_mps", [&]() { + using accscalar_t = at::acc_type; + auto xstart = start.to(); + auto xend = end.to(); + auto xstep = step.to(); + + double size_d; + if (std::is_same::value) { + size_d = std::ceil(static_cast(end.to() - start.to()) + / step.to()); + } else { + size_d = std::ceil(static_cast(end.to() - start.to()) + / step.to()); + } + + TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero"); + TORCH_CHECK(std::isfinite(static_cast(xstart)) && + std::isfinite(static_cast(xend)), + "unsupported range: ", xstart, " -> ", xend); + TORCH_CHECK(((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)), + "upper bound and larger bound inconsistent with step sign"); + + TORCH_CHECK(size_d >= 0 && size_d <= static_cast(std::numeric_limits::max()), + "invalid size, possible overflow?"); + int64_t size = static_cast(size_d); + int64_t numel = result.numel(); + + if (numel != size) { + if(numel > 0){ + TORCH_WARN("The number of elements in the out tensor of shape ", result.sizes(), + " is ", numel, " which does not match the computed number of elements ", size, + ". Note that this may occur as a result of rounding error. " + "The out tensor will be resized to a tensor of shape (", size, ",)."); + } + result.resize_({size}); + } + bool is_contiguous = result.is_contiguous(); + Tensor r = !is_contiguous ? at::empty_like(result, LEGACY_CONTIGUOUS_MEMORY_FORMAT) : result; + + //TODO: Add arange Metal kernel. 
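// For reference, a sketch of what the missing kernel would have to produce,
// mirroring the size computation above:
//   size = ceil((end - start) / step);   r[i] = xstart + i * xstep,  0 <= i < size
// The copy below only handles the non-contiguous layout case.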
+ + if(!is_contiguous) { + result.copy_(r); + } + }); + + return result; +} +}} // namespace at::native diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm new file mode 100644 index 000000000000..7e840c3f4dd9 --- /dev/null +++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm @@ -0,0 +1,1587 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { + +using namespace std; + +enum StdVarType { + STANDARD_VARIANCE, + STANDARD_DEVIATION +}; + +void set_apparent_shapes(NSMutableArray * &apparent_out_shape, + NSMutableArray * &apparent_in_shape, + int64_t num_reduce_dims, + int64_t num_input_dims, + int64_t num_output_dims, + IntArrayRef& input_shape, + NSMutableArray * &axes) { + + if(num_reduce_dims == 0) { + /* Output shape becomes a one + * Input shape becomes flattened + * Because 0 reduce dims means all dims are reduced + */ + apparent_in_shape = [NSMutableArray arrayWithCapacity:1]; + int64_t num_in_elements = 1; + for(int i = 0; i < num_input_dims; i++) { + num_in_elements *= input_shape[i]; + } + apparent_in_shape[0] = [NSNumber numberWithInt:num_in_elements]; + + apparent_out_shape = [NSMutableArray arrayWithCapacity:1]; + apparent_out_shape[0] = @1; + } + + else { + // num_output_dims in this case is number of input dims + apparent_out_shape = [NSMutableArray arrayWithCapacity:num_output_dims]; + for(int i = 0; i < num_output_dims; i++) { + int64_t current_input_dim = input_shape[i]; + + // If the current dim is to be reduced + bool is_reduce_dim = false; + + for(int j = 0; j < num_reduce_dims; j++) { + if(i == [axes[j] intValue]) { + is_reduce_dim = true; + break; + } + } + + if(is_reduce_dim) { + apparent_out_shape[i] = @1; + } + else { + apparent_out_shape[i] = [NSNumber numberWithInt:current_input_dim]; + } + } + } + +} + +// Helper function to set the axes of reduction +void set_axes(NSMutableArray * &axes, + int64_t num_reduce_dims, + IntArrayRef& dim, + int64_t num_input_dims) { + if(num_reduce_dims == 0) { + axes = [NSMutableArray arrayWithCapacity:1]; + axes[0] = @0; + } + else { + axes = [NSMutableArray arrayWithCapacity:num_reduce_dims]; + for(int i = 0; i < num_reduce_dims; i++) { + axes[i] = [NSNumber numberWithInt:maybe_wrap_dim(dim[i], num_input_dims)]; + } + } +} + +void reduction_out_mps + (const Tensor& input_t, + IntArrayRef dim, + bool keepdim, + c10::optional dtype, + const Tensor& output_t, + string reduction_type, + string func_name) { + + IntArrayRef input_shape = input_t.sizes(); + + for(int i = 0; i < dim.size(); i++) { + auto wrap_dim = maybe_wrap_dim(dim[i], input_shape.size()); + TORCH_CHECK(wrap_dim < input_shape.size(), + func_name+": reduction dim must be in the range of input shape") + } + + namespace native_mps = at::native::mps; + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + int64_t num_input_dims = input_shape.size(); + int64_t num_reduce_dims = dim.size(); + int64_t num_output_dims; + + // For output shape calculation, assume that keepdim is true + num_output_dims = num_input_dims; + NSMutableArray *apparent_output_shape = nil; + NSMutableArray *apparent_input_shape = nil; + + // Reduction axes + NSMutableArray 
*axes; + set_axes(axes, num_reduce_dims, dim, input_shape.size()); + + set_apparent_shapes(apparent_output_shape, + apparent_input_shape, + num_reduce_dims, + num_input_dims, + num_output_dims, + input_shape, + axes); + + if (output_t.numel() == 0 || input_t.numel() == 0) { + return; + } + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + + // TODO: Make this key proper + NSString* ns_key = [[axes valueForKey:@"description"] componentsJoinedByString:@","]; + string key = func_name+":" + string([ns_key UTF8String]) + ":" + native_mps::getMPSTypeString(input_t.scalar_type()) + ":" + native_mps::getMPSTypeString(output_t.scalar_type()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = native_mps::mpsGraphUnrankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(input_t.scalar_type())); + + MPSGraphTensor* castInputTensor = nil; + + if(input_t.scalar_type() != ScalarType::Float && input_t.scalar_type() != ScalarType::Int) + castInputTensor = [mpsGraph castTensor:inputTensor + toType:MPSDataTypeFloat32 + name:@"castInputTensor"]; + else + castInputTensor = inputTensor; + + MPSGraphTensor* castOutputTensor = nil; + + if(reduction_type == "sum") + castOutputTensor = [mpsGraph reductionSumWithTensor:castInputTensor + axes:axes + name:nil]; + else if(reduction_type == "prod") + castOutputTensor = [mpsGraph reductionProductWithTensor:castInputTensor + axes:axes + name:nil]; + else if(reduction_type == "mean") + castOutputTensor = [mpsGraph meanOfTensor:inputTensor + axes:axes + name:nil]; + + MPSGraphTensor* outputTensor = nil; + + if(input_t.scalar_type() != ScalarType::Float) + outputTensor = [mpsGraph castTensor:castOutputTensor + toType:(native_mps::getMPSDataType(output_t.scalar_type())) + name:@"outputTensor"]; + else + outputTensor = castOutputTensor; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = native_mps::Placeholder(); + + if(apparent_input_shape) + inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t, apparent_input_shape); + else + inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t); + auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t, apparent_output_shape); + NSDictionary *feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + }; + + NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + +} + +TORCH_IMPL_FUNC(sum_out_mps) + (const Tensor& input_t, + IntArrayRef dim, + bool keepdim, + c10::optional dtype, + const Tensor& output_t) { + + reduction_out_mps(input_t, dim, keepdim, dtype, output_t, "sum", "sum_out_mps"); +} + +TORCH_IMPL_FUNC(prod_out_mps) + (const Tensor& input_t, + int64_t dim, + bool keepdim, + c10::optional dtype, + const Tensor& output_t) { + + int64_t dims[1] = {dim}; + + reduction_out_mps(input_t, IntArrayRef(dims, 1), keepdim, dtype, output_t, "prod", "prod_out_mps"); +} + +// Taken 
from ReduceOps.cpp +inline ScalarType get_dtype_from_self( + const Tensor& self, + const optional& dtype, + bool promote_integers) { + if (dtype.has_value()) { + return dtype.value(); + } + ScalarType src_type = self.scalar_type(); + if (promote_integers && at::isIntegralType(src_type, /*includeBool=*/true)) { + return kLong; + } + return src_type; +} + +Tensor prod_mps(const Tensor &self, c10::optional opt_dtype) { + + auto num_dims = self.dim(); + + int64_t dims[num_dims]; + + for(int i = 0; i < num_dims; i++) + dims[i] = i; + + Tensor output_t = at::native::empty_mps( + {}, + get_dtype_from_self(self, opt_dtype, true), + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + + reduction_out_mps(self, IntArrayRef(dims, num_dims), false, opt_dtype, const_cast(output_t), "prod", "prod_mps"); + + return output_t; +} + +TORCH_IMPL_FUNC(mean_out_mps) + (const Tensor& input_t, + IntArrayRef dim, + bool keepdim, + c10::optional dtype, + const Tensor& output_t) { + + reduction_out_mps(input_t, dim, keepdim, dtype, output_t, "mean", "mean_out_mps"); +} + +TORCH_IMPL_FUNC(argmax_out_mps) + (const Tensor& input_t, + c10::optional dim, + bool keepdim, + const Tensor& output_t) { + + namespace native_mps = at::native::mps; + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + int64_t dim_; + + if (dim.has_value()) { + dim_ = maybe_wrap_dim(dim.value(), input_t.dim()); + native::zero_numel_check_dims(input_t, dim_, "argmax()"); + } else { + TORCH_CHECK_INDEX( + input_t.numel() != 0, + "argmax()", ": Expected reduction dim to be specified for input.numel() == 0."); + // Since input will be flattened, take argmax along 0'th dimension + dim_ = 0; + } + + // Calculate the output shape according to keepdim=True + // If there is no dim argument, the input shape is flattened + IntArrayRef input_shape = input_t.sizes(); + int64_t num_input_dims = input_shape.size(); + NSMutableArray *apparent_in_shape = nil; + NSMutableArray *apparent_out_shape = nil; + + if(dim.has_value()) { + apparent_out_shape = [NSMutableArray arrayWithCapacity:num_input_dims]; + for(int i = 0; i < num_input_dims; i++) { + if(dim_ == i) + apparent_out_shape[i] = @1; + else + apparent_out_shape[i] = [NSNumber numberWithInt:input_shape[i]]; + } + } + else { + apparent_in_shape = [NSMutableArray arrayWithCapacity:1]; + int64_t num_in_elements = 1; + for(int i = 0; i < num_input_dims; i++) { + num_in_elements *= input_shape[i]; + } + apparent_in_shape[0] = [NSNumber numberWithInt:num_in_elements]; + + apparent_out_shape = [NSMutableArray arrayWithCapacity:1]; + apparent_out_shape[0] = @1; + } + + if (output_t.numel() == 0) { + return; + } + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + string key = "argmax_out_mps:" + to_string(dim_) + ":" + native_mps::getMPSTypeString(input_t.scalar_type()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = native_mps::mpsGraphUnrankedPlaceHolder(mpsGraph, 
native_mps::getMPSDataType(input_t.scalar_type())); + + MPSGraphTensor* castInputTensor = nil; + + if(input_t.scalar_type() != ScalarType::Float && + input_t.scalar_type() != ScalarType::Int && + input_t.scalar_type() != ScalarType::Half) + castInputTensor = [mpsGraph castTensor:inputTensor + toType:MPSDataTypeFloat32 + name:@"castInputTensor"]; + else + castInputTensor = inputTensor; + + MPSGraphTensor* argmaxOutTensor = [mpsGraph reductionArgMaximumWithTensor:castInputTensor + axis:(NSInteger)dim_ + name:@"argmax_out"]; + MPSGraphTensor* outputTensor = [mpsGraph castTensor:argmaxOutTensor + toType:MPSDataTypeInt64 + name:@"cast_out"]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + native_mps::Placeholder inputPlaceholder = native_mps::Placeholder(); + if(apparent_in_shape) + inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t, apparent_in_shape); + else + inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t); + + auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t, apparent_out_shape); + + NSDictionary *feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + }; + + NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } + +} + +TORCH_IMPL_FUNC(norm_out_mps) +(const Tensor& input_t, + const OptionalScalarRef opt_p, + IntArrayRef dim, + bool keepdim, + const Tensor& output_t) +{ + if (input_t.numel() == 0) + return; + IntArrayRef input_shape = input_t.sizes(); + + for(int i = 0; i < dim.size(); i++) { + auto wrap_dim = maybe_wrap_dim(dim[i], input_shape.size()); + TORCH_CHECK(wrap_dim < input_shape.size(), + "norm_out_mps: reduction dim must be in the range of input shape") + } + namespace native_mps = at::native::mps; + CheckedFrom c = "norm_out_mps"; + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + auto p = opt_p.has_value() ? opt_p.get().to() : Scalar(2.0).to(); + auto reciprocal_p = 1 / p; + bool pIsZero = (p == 0.0); + bool pIsPosInf = (p == numeric_limits::infinity()); + bool pIsNegInf = (p == -numeric_limits::infinity()); + + int64_t num_input_dims = input_shape.size(); + int64_t num_reduce_dims = dim.size(); + int64_t num_output_dims; + + // For output shape calculation, assume that keepdim is true + num_output_dims = num_input_dims; + NSMutableArray *apparent_output_shape = nil; + NSMutableArray *apparent_input_shape = nil; + + // Reduction axes + NSMutableArray *axes; + set_axes(axes, num_reduce_dims, dim, input_shape.size()); + + set_apparent_shapes(apparent_output_shape, + apparent_input_shape, + num_reduce_dims, + num_input_dims, + num_output_dims, + input_shape, + axes); + if (output_t.numel() == 0) { + return; + } + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + NSString* ns_key = [[axes valueForKey:@"description"] componentsJoinedByString:@","]; + string keepdim_info = (keepdim) ? 
"keepdim=1" : "keepdim=0"; + string key = string("norm_out_mps:") + [ns_key UTF8String] + ":" + native_mps::getMPSTypeString(input_t.scalar_type()) + ":p" + to_string(p) + ":" + keepdim_info; + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = native_mps::mpsGraphUnrankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(input_t.scalar_type())); + + MPSGraphTensor *outputTensor; + + if (pIsZero) + { + MPSGraphTensor *absoluteTensor = [mpsGraph absoluteWithTensor:inputTensor + name:nil]; + MPSGraphTensor *powerValTensor = [mpsGraph constantWithScalar:p + dataType:native_mps::getMPSDataType(input_t.scalar_type())]; + MPSGraphTensor *powerTensor = [mpsGraph powerWithPrimaryTensor:absoluteTensor + secondaryTensor:powerValTensor + name:nil]; + outputTensor = [mpsGraph reductionSumWithTensor:powerTensor + axes:axes + name:nil]; + } + else if (pIsPosInf) + { + MPSGraphTensor *absoluteTensor = [mpsGraph absoluteWithTensor:inputTensor + name:nil]; + outputTensor = [mpsGraph reductionMaximumWithTensor:absoluteTensor + axes:axes + name:nil]; + } + else if (pIsNegInf) + { + MPSGraphTensor *absoluteTensor = [mpsGraph absoluteWithTensor:inputTensor + name:nil]; + outputTensor = [mpsGraph reductionMinimumWithTensor:absoluteTensor + axes:axes + name:nil]; + } + else + { + MPSGraphTensor *absoluteTensor = [mpsGraph absoluteWithTensor:inputTensor + name:nil]; + + MPSGraphTensor *powerValTensor = [mpsGraph constantWithScalar:p + dataType:native_mps::getMPSDataType(input_t.scalar_type())]; + + MPSGraphTensor *reciprocalPowerValTensor = [mpsGraph constantWithScalar:reciprocal_p + dataType:native_mps::getMPSDataType(input_t.scalar_type())]; + + MPSGraphTensor *powerTensor = [mpsGraph powerWithPrimaryTensor:absoluteTensor + secondaryTensor:powerValTensor + name:nil]; + + MPSGraphTensor *reductionSumTensor = [mpsGraph reductionSumWithTensor:powerTensor + axes:axes + name:nil]; + + outputTensor = [mpsGraph powerWithPrimaryTensor:reductionSumTensor + secondaryTensor:reciprocalPowerValTensor + name:nil]; + } + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = native_mps::Placeholder(); + + if(apparent_input_shape) + inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t, apparent_input_shape); + else + inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t); + + auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t, apparent_output_shape); + + + NSDictionary *feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + }; + + NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } +} + +Tensor std_var_common_impl_mps( + const Tensor & input_t, + at::OptionalIntArrayRef dim, + c10::optional correction, + bool keepdim, + StdVarType stdVarType) +{ + namespace native_mps = at::native::mps; + + IntArrayRef input_shape = input_t.sizes(); + int64_t num_input_dims = input_shape.size(); 
+ + bool use_dim = dim.has_value(); + IntArrayRef dim_value = use_dim ? dim.value() : NULL; + + if (use_dim) + { + string errMessage = (stdVarType == STANDARD_DEVIATION) ? "std_mps" : "var_mps"; + errMessage += ": reduction dim must be in the range of input shape"; + for(int i = 0; i < dim_value.size(); i++) { + auto wrap_dim = maybe_wrap_dim(dim_value[i], input_shape.size()); + TORCH_CHECK(wrap_dim < input_shape.size(), errMessage.c_str()) + } + } + + bool use_correction = correction.has_value(); + const auto correction_value = use_correction ? correction.value() : false; + int64_t correction_n = 1; + + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + int64_t num_output_dims = 0; + NSMutableArray *axes = nil; + NSMutableArray *apparent_output_shape = nil; + NSMutableArray *apparent_input_shape = nil; + int64_t* output_shape = nil; + + if ((!keepdim && !use_dim) || (!keepdim && use_dim && dim_value.size() <= 0)) + { + // Flatten the input tensor to reduce it to one value + apparent_input_shape = [NSMutableArray arrayWithCapacity:1]; + int64_t num_in_elements = 1; + for(int i = 0; i < num_input_dims; i++) { + num_in_elements *= input_shape[i]; + } + apparent_input_shape[0] = [NSNumber numberWithInt:num_in_elements]; + + // Output is a single value + apparent_output_shape = [NSMutableArray arrayWithCapacity:1]; + apparent_output_shape[0] = @1; + + num_output_dims = 0; + + correction_n = num_in_elements; + + // Reduction axes + axes = [NSMutableArray arrayWithCapacity:1]; + axes[0] = @0; + + } + else if (!keepdim && use_dim && dim_value.size() > 0) + { + int64_t num_reduce_dims = dim_value.size(); + num_output_dims = num_input_dims; + + set_axes(axes, num_reduce_dims, dim_value, num_input_dims); + set_apparent_shapes(apparent_output_shape, + apparent_input_shape, + num_reduce_dims, + num_input_dims, + num_output_dims, + input_shape, + axes); + + num_output_dims = (num_input_dims >= num_reduce_dims) ? 
(num_input_dims - num_reduce_dims) : 0; //num_input_dims; + output_shape = (int64_t *)malloc(num_output_dims * sizeof(int64_t)); + + unsigned int curr_i = 0; + for (int i = 0; i < num_input_dims; i++) + { + bool found = false; + for (int j = 0; j < num_reduce_dims; j++) + { + if (i == dim_value[j]) + { + found = true; + break; + } + } + if (found) continue; + output_shape[curr_i] = input_shape[i]; + curr_i += 1; + } + + for(int i = 0; i < num_reduce_dims; i++) + { + correction_n *= input_shape[dim_value[i]]; + } + // (3, 4, 5) --> (3, 5) + } + else if ((keepdim && !use_dim) || (keepdim && use_dim && dim_value.size() <= 0)) + { + num_output_dims = 0; + int64_t num_reduce_dims = 0; + set_axes(axes, num_reduce_dims, dim_value, input_shape.size()); + set_apparent_shapes(apparent_output_shape, + apparent_input_shape, + num_reduce_dims, + num_input_dims, + num_output_dims, + input_shape, + axes); + num_output_dims = num_input_dims; + output_shape = (int64_t *)malloc(num_output_dims * sizeof(int64_t)); + for (int i = 0; i < num_input_dims; i++) + { + output_shape[i] = (int64_t) 1; + correction_n *= input_shape[i]; + } + // scalar --> vector case [[1.0034567]] + } + else if (keepdim && use_dim && dim_value.size() > 0) + { + int64_t num_reduce_dims = dim_value.size(); + num_output_dims = num_input_dims; + + set_axes(axes, num_reduce_dims, dim_value, num_input_dims); + set_apparent_shapes(apparent_output_shape, + apparent_input_shape, + num_reduce_dims, + num_input_dims, + num_output_dims, + input_shape, + axes); + + num_output_dims = num_input_dims;//(num_input_dims >= num_reduce_dims) ? (num_input_dims - num_reduce_dims) : 0; + output_shape = (int64_t *)malloc(num_output_dims * sizeof(int64_t)); + + for(int i = 0; i < num_reduce_dims; i++) + { + correction_n *= input_shape[dim_value[i]]; + } + + for (int i = 0; i < num_input_dims; i++) + { + output_shape[i] = [apparent_output_shape[i] longValue]; + } + } + + Tensor output_t = at::native::empty_mps( + IntArrayRef(output_shape, num_output_dims), + input_t.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + + if (output_t.numel() == 0 || input_t.numel() == 0) + { + return output_t; + } + + double bessel_correction = ((double) correction_n) / ((double) (correction_n-1)); + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + string op_key = (stdVarType == STANDARD_DEVIATION) ? "std_mps" : "var_mps"; + NSString* ns_key = [[axes valueForKey:@"description"] componentsJoinedByString:@","]; + string bessel_corrected = (use_correction && correction_value) ? "unbiased " : "biased "; + string use_dim_info = (use_dim) ? "use_dim=1:" + to_string(dim_value.size()) : "use_dim=0"; + string keepdim_info = (keepdim) ? 
"keepdim=1" : "keepdim=0"; + string key = op_key + use_dim_info + ":" + keepdim_info + ":" + string([ns_key UTF8String]) + ":" + native_mps::getMPSTypeString(input_t.scalar_type()) + ":" + bessel_corrected; + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + // Initialize once if configuration not found in cache + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor *inputTensor = native_mps::mpsGraphUnrankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(input_t.scalar_type())); + MPSGraphTensor *outputVarTensor = [mpsGraph varianceOfTensor:inputTensor + axes:axes + name:nil]; + MPSGraphTensor *outputTensor; + + if (use_correction && correction_value) + { + MPSGraphTensor *besselTensor= [mpsGraph constantWithScalar:bessel_correction + dataType:MPSDataTypeFloat32]; + MPSGraphTensor *correctedTensor = [mpsGraph multiplicationWithPrimaryTensor: outputVarTensor + secondaryTensor: besselTensor + name: nil]; + outputTensor = (stdVarType == STANDARD_DEVIATION) ? + [mpsGraph squareRootWithTensor:correctedTensor name:nil] : correctedTensor; + } + else + { + outputTensor = (stdVarType == STANDARD_DEVIATION) ? + [mpsGraph squareRootWithTensor:outputVarTensor name:nil] : outputVarTensor; + } + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + auto inputPlaceholder = native_mps::Placeholder(); + + if(apparent_input_shape) + { + inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t, apparent_input_shape); + } + else + { + inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t); + } + auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t, apparent_output_shape); + + NSDictionary *feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + }; + + NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + free(output_shape); + return output_t; +} + +Tensor var_mps( + const Tensor & input_t, + at::OptionalIntArrayRef dim, + c10::optional correction, + bool keepdim) +{ + return std_var_common_impl_mps(input_t, dim, correction, keepdim, STANDARD_VARIANCE); +} + +Tensor std_mps( + const Tensor & input_t, + at::OptionalIntArrayRef dim, + c10::optional correction, + bool keepdim) +{ + return std_var_common_impl_mps(input_t, dim, correction, keepdim, STANDARD_DEVIATION); +} + +TORCH_IMPL_FUNC(any_out_mps) + (const Tensor& input_t, + int64_t dim, + bool keepdim, + const Tensor& output_t) +{ + namespace native_mps = at::native::mps; + + if (output_t.numel() == 0 || input_t.numel() == 0) { + return; + } + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + int64_t dim_ = maybe_wrap_dim(dim, input_t.dim()); + native::zero_numel_check_dims(input_t, dim_, "any()"); + + // Calculate the output shape according to 
keepdim=True + // If there is no dim argument, the input shape is flattened + IntArrayRef input_shape = input_t.sizes(); + int64_t num_input_dims = input_shape.size(); + NSMutableArray *apparent_out_shape = nil; + apparent_out_shape = [NSMutableArray arrayWithCapacity:num_input_dims]; + for(int i = 0; i < num_input_dims; i++) { + if(dim_ == i) + apparent_out_shape[i] = @1; + else + apparent_out_shape[i] = [NSNumber numberWithInt:input_shape[i]]; + } + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + MPSShape* input_t_shape = native_mps::getMPSShape(input_t); + string key = string("any_out_mps:") + native_mps::getMPSShapeString(input_t_shape) + ":" + to_string(dim_) + ":" + native_mps::getMPSTypeString(input_t.scalar_type()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* outputTensor; + MPSDataType input_type = native_mps::getMPSDataType(input_t.scalar_type()); + MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_type, input_t_shape); + + if (input_type != MPSDataTypeInt32 && + input_type != MPSDataTypeFloat32 && + input_type != MPSDataTypeFloat16 ) + { + MPSGraphTensor* inputCastedTensor = [mpsGraph castTensor:inputTensor + toType:MPSDataTypeInt32 + name:@"any_all"]; + MPSGraphTensor* outputCastedTensor = [mpsGraph reductionOrWithTensor:inputCastedTensor + axis:dim_ + name:nil]; + outputTensor = [mpsGraph castTensor:outputCastedTensor + toType:MPSDataTypeBool + name:@"any"]; + } + else + { + MPSGraphTensor* outputUncastedTensor = [mpsGraph reductionOrWithTensor:inputTensor + axis:dim_ + name:nil]; + outputTensor = [mpsGraph castTensor:outputUncastedTensor + toType:MPSDataTypeBool + name:@"any"]; + } + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t); + auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t, apparent_out_shape); + NSDictionary *feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + }; + + NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(), + }; + + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); +} +} + +TORCH_IMPL_FUNC(any_all_out_mps)(const Tensor& input_t, const Tensor& output_t) +{ + namespace native_mps = at::native::mps; + if (output_t.numel() == 0 || input_t.numel() == 0) { + return; + } + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + MPSShape* input_t_shape = native_mps::getMPSShape(input_t); + string key = string("any_all_out_mps:") + native_mps::getMPSShapeString(input_t_shape) +":" + native_mps::getMPSTypeString(input_t.scalar_type()); + CachedGraph* cachedGraph = 
static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* outputTensor; + MPSDataType input_type = native_mps::getMPSDataType(input_t.scalar_type()); + MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_type, input_t_shape); + + if (input_type != MPSDataTypeInt32 && + input_type != MPSDataTypeFloat32 && + input_type != MPSDataTypeFloat16 ) + { + MPSGraphTensor* inputCastedTensor = [mpsGraph castTensor:inputTensor + toType:MPSDataTypeInt32 + name:@"any_all"]; + MPSGraphTensor* outputCastedTensor = [mpsGraph reductionOrWithTensor:inputCastedTensor + axes:nil + name:nil]; + outputTensor = [mpsGraph castTensor:outputCastedTensor + toType:MPSDataTypeBool + name:@"any_all"]; + } + else + { + MPSGraphTensor* outputUncastedTensor = [mpsGraph reductionOrWithTensor:inputTensor + axes:nil + name:nil]; + outputTensor = [mpsGraph castTensor:outputUncastedTensor + toType:MPSDataTypeBool + name:@"any_all"]; + } + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t); + auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t); + NSDictionary *feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + }; + + NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(), + }; + + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } +} + +TORCH_IMPL_FUNC(all_out_mps) + (const Tensor& input_t, + int64_t dim, + bool keepdim, + const Tensor& output_t) +{ + namespace native_mps = at::native::mps; + + if (output_t.numel() == 0 || input_t.numel() == 0) { + return; + } + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + int64_t dim_ = maybe_wrap_dim(dim, input_t.dim()); + native::zero_numel_check_dims(input_t, dim_, "all()"); + + // Calculate the output shape according to keepdim=True + // If there is no dim argument, the input shape is flattened + IntArrayRef input_shape = input_t.sizes(); + int64_t num_input_dims = input_shape.size(); + NSMutableArray *apparent_out_shape = nil; + apparent_out_shape = [NSMutableArray arrayWithCapacity:num_input_dims]; + for(int i = 0; i < num_input_dims; i++) { + if(dim_ == i) + apparent_out_shape[i] = @1; + else + apparent_out_shape[i] = [NSNumber numberWithInt:input_shape[i]]; + } + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + MPSShape* input_t_shape = native_mps::getMPSShape(input_t); + string key = string("all_out_mps:") + native_mps::getMPSShapeString(input_t_shape) + ":" + to_string(dim_) + ":" + native_mps::getMPSTypeString(input_t.scalar_type()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ 
native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* outputTensor; + MPSDataType input_type = native_mps::getMPSDataType(input_t.scalar_type()); + MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_type, input_t_shape); + + if (input_type != MPSDataTypeInt32 && + input_type != MPSDataTypeFloat32 && + input_type != MPSDataTypeFloat16 ) + { + MPSGraphTensor* inputCastedTensor = [mpsGraph castTensor:inputTensor + toType:MPSDataTypeInt32 + name:@"all_all"]; + MPSGraphTensor* outputCastedTensor = [mpsGraph reductionAndWithTensor:inputCastedTensor + axis:dim_ + name:nil]; + outputTensor = [mpsGraph castTensor:outputCastedTensor + toType:MPSDataTypeBool + name:@"all"]; + } + else + { + MPSGraphTensor* outputUncastedTensor = [mpsGraph reductionAndWithTensor:inputTensor + axis:dim_ + name:nil]; + outputTensor = [mpsGraph castTensor:outputUncastedTensor + toType:MPSDataTypeBool + name:@"all"]; + } + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t); + auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t, apparent_out_shape); + NSDictionary *feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + }; + + NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(), + }; + + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } +} + +TORCH_IMPL_FUNC(all_all_out_mps)(const Tensor& input_t, const Tensor& output_t) +{ + namespace native_mps = at::native::mps; + if (output_t.numel() == 0 || input_t.numel() == 0) { + return; + } + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + MPSShape* input_t_shape = native_mps::getMPSShape(input_t); + string key = string("all_all_out_mps:") + native_mps::getMPSShapeString(input_t_shape) +":" + native_mps::getMPSTypeString(input_t.scalar_type()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* outputTensor; + MPSDataType input_type = native_mps::getMPSDataType(input_t.scalar_type()); + MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_type, input_t_shape); + + if (input_type != MPSDataTypeInt32 && + input_type != MPSDataTypeFloat32 && + input_type != MPSDataTypeFloat16 ) + { + MPSGraphTensor* inputCastedTensor = [mpsGraph castTensor:inputTensor + toType:MPSDataTypeInt32 + name:@"all_all"]; + MPSGraphTensor* outputCastedTensor = [mpsGraph reductionAndWithTensor:inputCastedTensor + axes:nil + name:nil]; + outputTensor = [mpsGraph 
castTensor:outputCastedTensor + toType:MPSDataTypeBool + name:@"all_all"]; + } + else + { + MPSGraphTensor* outputUncastedTensor = [mpsGraph reductionAndWithTensor:inputTensor + axes:nil + name:nil]; + outputTensor = [mpsGraph castTensor:outputUncastedTensor + toType:MPSDataTypeBool + name:@"all_all"]; + } + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t); + auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t); + NSDictionary *feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + }; + + NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(), + }; + + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } +} + +//----------------------------------------------------------------------- +// Min and max functions + +Tensor min_max_mps + (const Tensor& input_t, + string reduction_type, + string func_name) { + + namespace native_mps = at::native::mps; + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + IntArrayRef input_shape = input_t.sizes(); + int64_t num_input_dims = input_shape.size(); + + // Flatten the input tensor to reduce it to one value + NSMutableArray *apparent_input_shape = [NSMutableArray arrayWithCapacity:1]; + int64_t num_in_elements = 1; + for(int i = 0; i < num_input_dims; i++) { + num_in_elements *= input_shape[i]; + } + apparent_input_shape[0] = [NSNumber numberWithInt:num_in_elements]; + + Tensor output_t = at::native::empty_mps({}, input_t.scalar_type(), c10::nullopt, kMPS, c10::nullopt, c10::nullopt); + + if (output_t.numel() == 0 || num_in_elements == 0) { + return output_t; + } + + @autoreleasepool { + string key = func_name + mps::getTensorsStringKey(input_t); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + // Initialize once if configuration not found in cache + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = native_mps::mpsGraphUnrankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(input_t.scalar_type())); + + MPSGraphTensor* outputTensor = nil; + + if(reduction_type == "max") + outputTensor = [mpsGraph reductionMaximumWithTensor:inputTensor + axes:@[@0] + name:nil]; + else if(reduction_type == "min") + outputTensor = [mpsGraph reductionMinimumWithTensor:inputTensor + axes:@[@0] + name:nil]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t, apparent_input_shape); + auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t, @[@1]); + + NSDictionary *feeds = @{ + inputPlaceholder.getMPSGraphTensor() : 
inputPlaceholder.getMPSGraphTensorData(), + }; + + NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + native_mps::runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + } + + return output_t; +} + +// Max entire tensor into scalar result +Tensor max_mps(const Tensor& input_t) { + + return min_max_mps(input_t, "max", "max_mps"); +} + +// Min entire tensor into scalar result +Tensor min_mps(const Tensor& input_t) { + + return min_max_mps(input_t, "min", "min_mps"); +} + +void min_max_out_mps + (const Tensor& input_t, + int64_t dim, + bool keepdim, + const Tensor& output_t, + const Tensor& indices_t, + string reduction_type, + string func_name) { + + namespace native_mps = at::native::mps; + + if (output_t.numel() == 0) { + return; + } + if (input_t.numel() == 1 && input_t.dim() == 0) { + output_t.fill_(input_t); + indices_t.fill_(0); + return; + } + + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + MPSGraphTensor *indicesTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + int64_t dim_ = maybe_wrap_dim(dim, input_t.dim()); + + // Calculate the output shape according to keepdim=True + // If there is no dim argument, the input shape is flattened + IntArrayRef input_shape = input_t.sizes(); + int64_t num_input_dims = input_shape.size(); + NSMutableArray *apparent_out_shape = nil; + + apparent_out_shape = [NSMutableArray arrayWithCapacity:num_input_dims]; + for(int i = 0; i < num_input_dims; i++) { + if(dim_ == i) + apparent_out_shape[i] = @1; + else + apparent_out_shape[i] = [NSNumber numberWithInt:input_shape[i]]; + } + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + string key = func_name + ":" + to_string(dim_) + ":" + native_mps::getMPSTypeString(input_t.scalar_type()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = native_mps::mpsGraphUnrankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(input_t.scalar_type())); + MPSGraphTensor* outputTensor = nil; + if(reduction_type == "max") + outputTensor = [mpsGraph reductionMaximumWithTensor:inputTensor + axis:(NSInteger)dim_ + name:nil]; + else if(reduction_type == "min") + outputTensor = [mpsGraph reductionMinimumWithTensor:inputTensor + axis:(NSInteger)dim_ + name:nil]; + + MPSGraphTensor* castInputTensor = nil; + + if(input_t.scalar_type() != ScalarType::Float && + input_t.scalar_type() != ScalarType::Int && + input_t.scalar_type() != ScalarType::Half) + castInputTensor = [mpsGraph castTensor:inputTensor + toType:MPSDataTypeFloat32 + name:@"castInputTensor"]; + else + castInputTensor = inputTensor; + + MPSGraphTensor* argreduceOutTensor = nil; + if(reduction_type == "max") + argreduceOutTensor = [mpsGraph reductionArgMaximumWithTensor:castInputTensor + axis:(NSInteger)dim_ + name:@"argmax_out"]; + else if(reduction_type == "min") + argreduceOutTensor = [mpsGraph reductionArgMinimumWithTensor:castInputTensor + axis:(NSInteger)dim_ + name:@"argmax_out"]; + + 
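// For reference, a hypothetical end-to-end use of this reduction path from the
// C++ API (the snippet's tensor shapes and names are illustrative only):
//   at::Tensor x = at::rand({3, 4}, at::device(at::kMPS));
//   auto [values, indices] = at::max(x, /*dim=*/1, /*keepdim=*/false);
//   // values keeps x's dtype; indices is kLong, hence the Int64 cast right below.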
MPSGraphTensor *indicesTensor = [mpsGraph castTensor:argreduceOutTensor + toType:MPSDataTypeInt64 + name:@"cast_out"]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + newCachedGraph->indicesTensor_ = indicesTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t); + auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t, apparent_out_shape); + auto indicesPlaceholder = native_mps::Placeholder(cachedGraph->indicesTensor_, indices_t, apparent_out_shape); + + NSDictionary *feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + }; + + NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(), + indicesPlaceholder.getMPSGraphTensor() : indicesPlaceholder.getMPSGraphTensorData() + }; + + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } + +} + +// Max out with dim +TORCH_IMPL_FUNC(max_out_mps) + (const Tensor& input_t, + int64_t dim, + bool keepdim, + const Tensor& output_t, + const Tensor& indices_t) { + + int64_t dim_ = maybe_wrap_dim(dim, input_t.dim()); + native::zero_numel_check_dims(input_t, dim_, "max()"); + + min_max_out_mps(input_t, dim, keepdim, output_t, indices_t, "max", "max_out_mps"); +} + +// Min out with dim +TORCH_IMPL_FUNC(min_out_mps) + (const Tensor& input_t, + int64_t dim, + bool keepdim, + const Tensor& output_t, + const Tensor& indices_t) { + + int64_t dim_ = maybe_wrap_dim(dim, input_t.dim()); + native::zero_numel_check_dims(input_t, dim_, "min()"); + + min_max_out_mps(input_t, dim, keepdim, output_t, indices_t, "min", "min_out_mps"); +} + +// Min/Max with dim +std::tuple min_max_mps + (const Tensor& input_t, + int64_t dim, + bool keepdim, + string reduction_type, + string func_name) { + + namespace native_mps = at::native::mps; + + int64_t dim_ = maybe_wrap_dim(dim, input_t.dim()); + native::zero_numel_check_dims(input_t, dim_, "max()"); + + // Calculate the output shape according to keepdim=True + // If there is no dim argument, the input shape is flattened + IntArrayRef input_shape = input_t.sizes(); + int64_t num_input_dims = input_shape.size(); + NSMutableArray *apparent_out_shape = nil; + // Use this if keepdim is false + int64_t num_output_dims = num_input_dims - 1; + + int64_t* malloc_apparent_out_shape = (int64_t *)malloc(num_input_dims * sizeof(int64_t)); + int64_t* malloc_out_shape = (int64_t *)malloc(num_output_dims * sizeof(int64_t)); + + apparent_out_shape = [NSMutableArray arrayWithCapacity:num_input_dims]; + // Counter for shape when keepdim is false + int out_i = 0; + for(int i = 0; i < num_input_dims; i++) { + if(dim_ == i) { + apparent_out_shape[i] = @1; + malloc_apparent_out_shape[i] = 1; + } + else { + apparent_out_shape[i] = [NSNumber numberWithInt:input_shape[i]]; + malloc_apparent_out_shape[i] = input_shape[i]; + malloc_out_shape[out_i] = input_shape[i]; + out_i++; + } + } + + Tensor output_t; + Tensor indices_t; + if(!keepdim) { + output_t = at::native::empty_mps( + IntArrayRef(malloc_out_shape, num_output_dims), + input_t.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + indices_t = at::native::empty_mps( + IntArrayRef(malloc_out_shape, num_output_dims), + ScalarType::Long, + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + } + else { + output_t = at::native::empty_mps( + 
IntArrayRef(malloc_apparent_out_shape, num_input_dims), + input_t.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + indices_t = at::native::empty_mps( + IntArrayRef(malloc_apparent_out_shape, num_input_dims), + ScalarType::Long, + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + } + + if (output_t.numel() == 0 || input_t.numel() == 0) { + free(malloc_out_shape); + free(malloc_apparent_out_shape); + return std::tuple{output_t, indices_t}; + } + + min_max_out_mps(input_t, dim, keepdim, output_t, indices_t, reduction_type, func_name); + + free(malloc_out_shape); + free(malloc_apparent_out_shape); + return std::tuple{output_t, indices_t}; +} + +// Max with dim +std::tuple max_mps + (const Tensor& input_t, + int64_t dim, + bool keepdim) { + + return min_max_mps(input_t, dim, keepdim, "max", "max_mps"); +} + +// Min with dim +std::tuple min_mps + (const Tensor& input_t, + int64_t dim, + bool keepdim) { + + return min_max_mps(input_t, dim, keepdim, "min", "min_mps"); +} + +} + +} diff --git a/aten/src/ATen/native/mps/operations/Repeat.mm b/aten/src/ATen/native/mps/operations/Repeat.mm new file mode 100644 index 000000000000..a7708f1a327c --- /dev/null +++ b/aten/src/ATen/native/mps/operations/Repeat.mm @@ -0,0 +1,173 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include + +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#endif + +namespace at { +namespace native { + +Tensor permute_mps(const Tensor& self, IntArrayRef dims) { + auto nDims = self.dim(); + TORCH_CHECK(dims.size() == (size_t)nDims, + "number of dims don't match in permute"); + auto oldSizes = self.sizes(); + auto oldStrides = self.strides(); + DimVector newSizes(nDims); + DimVector newStrides(nDims); + std::vector seen(nDims); + for (const auto i : c10::irange(nDims)) { + auto dim = maybe_wrap_dim(dims[i], nDims); + TORCH_CHECK(!seen[dim], + "repeated dim in permute"); + seen[dim] = true; + newSizes[i] = oldSizes[dim]; + newStrides[i] = oldStrides[dim]; + } + return self.as_strided(newSizes, newStrides); +} + +void set_apparent_shapes(NSMutableArray * input_shape, + NSMutableArray * &apparent_input_shape, + int64_t num_input_dims, + IntArrayRef repeats, + NSMutableArray * &repeats_shape, + int64_t num_repeat_dims) { + + + // Set repeats_shape + + repeats_shape = [NSMutableArray arrayWithCapacity:num_repeat_dims]; + + for(int i = 0; i < num_repeat_dims; i++) + repeats_shape[i] = [NSNumber numberWithInt:repeats[i]]; + + // If no extension of the shape is needed + if(num_repeat_dims == num_input_dims) { + apparent_input_shape = input_shape; + } + // num_repeat_dims > num_input_dims + else { + apparent_input_shape = [NSMutableArray arrayWithCapacity:num_repeat_dims]; + + for(int i = 0; i < num_repeat_dims - num_input_dims; i++) + apparent_input_shape[i] = @1; + + for(int i = num_repeat_dims - num_input_dims; i < num_repeat_dims; i++) + apparent_input_shape[i] = input_shape[i + num_input_dims - num_repeat_dims]; + } + +} + +Tensor repeat_mps(const Tensor& self, IntArrayRef repeats) { + + using namespace mps; + + TORCH_CHECK(repeats.size() >= (size_t)self.dim(), + "Number of dimensions of repeat dims can not be smaller than number of dimensions of tensor"); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + NSMutableArray *apparent_input_shape = nil; + NSMutableArray 
*repeats_shape = nil; + + auto input_shape = getMPSShape(self); + auto num_input_dims = [input_shape count]; + auto num_repeat_dims = repeats.size(); + + set_apparent_shapes(input_shape, + apparent_input_shape, + num_input_dims, + repeats, + repeats_shape, + num_repeat_dims); + + // Set output shape + int64_t output_shape[num_repeat_dims]; + bool zero_tensor = false; + for(int i = 0; i < num_repeat_dims; i++) { + output_shape[i] = repeats[i] * [apparent_input_shape[i] intValue]; + if(output_shape[i] == 0) + zero_tensor = true; + } + + Tensor output = at::native::empty_mps( + IntArrayRef(output_shape, num_repeat_dims), + self.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + + // Empty output + if(zero_tensor) + return output; + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + + NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; + NSString* ns_repeats_key = [[repeats_shape valueForKey:@"description"] componentsJoinedByString:@","]; + + string key = "repeat_mps:" + getMPSTypeString(self.scalar_type()) + + ":" + string([ns_shape_key UTF8String]) + + ":" + string([ns_repeats_key UTF8String]); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(self.scalar_type()), apparent_input_shape); + MPSGraphTensor* outputTensor = [mpsGraph tileTensor:inputTensor + withMultiplier:repeats_shape + name:nil]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, apparent_input_shape); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + return output; + +} + +} +} diff --git a/aten/src/ATen/native/mps/operations/RnnOps.mm b/aten/src/ATen/native/mps/operations/RnnOps.mm new file mode 100644 index 000000000000..a219d3f8172c --- /dev/null +++ b/aten/src/ATen/native/mps/operations/RnnOps.mm @@ -0,0 +1,510 @@ +// Copyright © 2022 Apple Inc. 
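Before the RNN kernels, one note on the repeat_mps kernel in Repeat.mm above: its output shape is the element-wise product of the repeat counts with the input shape left-padded by 1s to the same rank. That shape arithmetic can be restated as a standalone sketch (illustrative only; `repeat_output_shape` is a hypothetical helper, not part of this diff):

#include <cstdint>
#include <vector>

std::vector<int64_t> repeat_output_shape(std::vector<int64_t> sizes,
                                         const std::vector<int64_t>& repeats) {
  // Precondition (checked by repeat_mps): repeats has at least as many
  // dimensions as the input tensor.
  sizes.insert(sizes.begin(), repeats.size() - sizes.size(), 1);
  std::vector<int64_t> out(repeats.size());
  for (size_t i = 0; i < repeats.size(); ++i) {
    out[i] = sizes[i] * repeats[i];   // a zero repeat produces an empty output
  }
  return out;
}
// e.g. sizes = {2, 3}, repeats = {4, 2, 3}  ->  output shape {4, 4, 9}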
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#import +#include + +namespace at { +namespace native { + +std::vector getTensorShape(MPSGraphTensor* mpsTensor) { + std::vector output_dimensions = {}; + auto dims = mpsTensor.shape; + for (int i = 0; i<[dims count];i++){ + output_dimensions.push_back([dims[i] intValue]); + } + return output_dimensions; +} + +std::tuple _lstm_mps(const Tensor& input, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { + using namespace mps; + std::vector kernel_weights; + std::vector recurrent_kernel_weights; + std::vector biases; + std::vector recurrent_biases; + for (size_t i = 0; i < num_layers; i+=1) { + kernel_weights.push_back(params[i*4]); + recurrent_kernel_weights.push_back(params[i*4+1]); + biases.push_back(params[i*4+2]); + recurrent_biases.push_back(params[i*4+3]); + } + + struct CachedGraph : public MPSCachedGraph { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + std::vector inputTensors_; + std::vector outputTensors_; + NSMutableArray *kernelWeightsList_ = nil; + NSMutableArray *recurrentKernelWeightsList_ = nil; + NSMutableArray *biasList_ = nil; + NSMutableArray *recurrentBiasList_ = nil; + std::vector outputCellStateFwdVector_; + std::vector outputZStateVector_; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + int timesteps = (batch_first ? input.size(1) : input.size(0)); + + @autoreleasepool { + string key = "lstm_" + getTensorsStringKey({input, hx[0], hx[1]}) + getMPSTypeString(input.scalar_type()) + "_num_layers_" + std::to_string(num_layers); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + NSMutableArray *kernelWeightsList = [[NSMutableArray alloc] initWithCapacity:params.size()]; + NSMutableArray *recurrentKernelWeightsList = [[NSMutableArray alloc] initWithCapacity:params.size()]; + NSMutableArray *kernelBiasList = [[NSMutableArray alloc] initWithCapacity:params.size()]; + NSMutableArray *recurrentBiasList = [[NSMutableArray alloc] initWithCapacity:params.size()]; + + for (size_t i = 0; i < num_layers; i += 1) { + [kernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(kernel_weights[i]))]; + [recurrentKernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_kernel_weights[i]))]; + [kernelBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(biases[i]))]; + [recurrentBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_biases[i]))]; + } + + MPSGraphLSTMDescriptor * opDesc = [MPSGraphLSTMDescriptor descriptor]; + opDesc.training = true; + opDesc.bidirectional = bidirectional; + opDesc.produceCell = true; + + MPSShape* inputShape = getMPSShape(input); + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(input)); + MPSGraphTensor* stateTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(hx[0])); + MPSGraphTensor* cellStateTensor = 
mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(hx[1])); + std::vector inputTensors = {inputTensor, stateTensor, cellStateTensor,}; + + if(batch_first) { + inputTensor = [mpsGraph transposeTensor:inputTensor + dimension:0 + withDimension:1 + name:nil]; + } + + MPSGraphTensor* inputTensor_ = inputTensor; + MPSGraphTensor* stateTensor_ = [mpsGraph sliceTensor:stateTensor + dimension:0 + start:0 + length:1 + name:nil]; + MPSGraphTensor* cellStateTensor_ = [mpsGraph sliceTensor:cellStateTensor + dimension:0 + start:0 + length:1 + name:nil]; + NSArray* outputs = nil; + NSMutableArray* outputStateArray = [[NSMutableArray alloc] initWithCapacity:num_layers]; + NSMutableArray* outputCellStateArray = [[NSMutableArray alloc] initWithCapacity:num_layers]; + NSMutableArray* outputZStateArray = [[NSMutableArray alloc] initWithCapacity:num_layers]; + NSMutableArray* outputCellStateFwdArray = [[NSMutableArray alloc] initWithCapacity:num_layers]; + for(int i = 0; i < num_layers; i++) { + MPSGraphTensor* biasTensor = [mpsGraph additionWithPrimaryTensor:kernelBiasList[i] + secondaryTensor:recurrentBiasList[i] + name:nil]; + outputs = [mpsGraph LSTMWithSourceTensor:inputTensor_ + recurrentWeight:recurrentKernelWeightsList[i] + inputWeight:kernelWeightsList[i] + bias:biasTensor + initState:stateTensor_ + initCell:cellStateTensor_ + descriptor:opDesc + name:nil]; + + + stateTensor_ = [mpsGraph sliceTensor:stateTensor + dimension:0 + start:i + length:1 + name:nil]; + cellStateTensor_ = [mpsGraph sliceTensor:cellStateTensor + dimension:0 + start:i + length:1 + name:nil]; + inputTensor_ = [outputs objectAtIndex:0]; + if(dropout_p>0.0 && train && (i!=num_layers-1)) { + inputTensor_ = [mpsGraph dropoutTensor:inputTensor_ + rate:dropout_p + name:nil]; + + } + + [outputStateArray addObject:[mpsGraph sliceTensor:[outputs objectAtIndex:0] dimension:0 start:-1 length:1 name:nil]]; + [outputCellStateArray addObject:[mpsGraph sliceTensor:[outputs objectAtIndex:1] dimension:0 start:-1 length:1 name:nil]]; + [outputCellStateFwdArray addObject: [mpsGraph expandDimsOfTensor:[outputs objectAtIndex:1] + axis:0 + name:nil]]; + [outputZStateArray addObject: [mpsGraph expandDimsOfTensor:[outputs objectAtIndex:2] + axis:0 + name:nil]]; + } + + MPSGraphTensor* outputStates = [mpsGraph concatTensors:outputStateArray + dimension:0 + name:nil]; + MPSGraphTensor* outputCellStates = [mpsGraph concatTensors:outputCellStateArray + dimension:0 + name:nil]; + MPSGraphTensor* outputZStates = [mpsGraph concatTensors:outputZStateArray + dimension:0 + name:nil]; + MPSGraphTensor* outputCellStatesFwd = [mpsGraph concatTensors:outputCellStateFwdArray + dimension:0 + name:nil]; + + std::vector outputTensors = {[outputs objectAtIndex:0], outputStates, outputCellStates, outputZStates, outputCellStatesFwd}; + newCachedGraph->inputTensors_ = inputTensors; + newCachedGraph->outputTensors_ = outputTensors; + newCachedGraph->kernelWeightsList_ = kernelWeightsList; + newCachedGraph->recurrentKernelWeightsList_ = recurrentKernelWeightsList; + newCachedGraph->biasList_ = kernelBiasList; + newCachedGraph->recurrentBiasList_ = recurrentBiasList; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + NSMutableArray *kernelWeightsList = cachedGraph->kernelWeightsList_; + NSMutableArray *recurrentKernelWeightsList = cachedGraph->recurrentKernelWeightsList_; + NSMutableArray *biasList = cachedGraph->biasList_; + NSMutableArray *recurrentBiasList = cachedGraph->recurrentBiasList_; + + Placeholder 
kernelWeight; + Placeholder recurrentKernelWeight; + Placeholder bias; + Placeholder recurrentBias; + NSMutableDictionary *feeds = [[NSMutableDictionary alloc] init]; + for (size_t i = 0; i < num_layers; i+=1) { + kernelWeight = Placeholder([kernelWeightsList objectAtIndex:i], kernel_weights[i]); + recurrentKernelWeight = Placeholder([recurrentKernelWeightsList objectAtIndex:i], recurrent_kernel_weights[i]); + bias = Placeholder([biasList objectAtIndex:i], biases[i]); + recurrentBias = Placeholder([recurrentBiasList objectAtIndex:i], recurrent_biases[i]); + [feeds setObject:kernelWeight.getMPSGraphTensorData() forKey:kernelWeight.getMPSGraphTensor()]; + [feeds setObject:recurrentKernelWeight.getMPSGraphTensorData() forKey:recurrentKernelWeight.getMPSGraphTensor()]; + [feeds setObject:bias.getMPSGraphTensorData() forKey:bias.getMPSGraphTensor()]; + [feeds setObject:recurrentBias.getMPSGraphTensorData() forKey:recurrentBias.getMPSGraphTensor()]; + + } + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensors_[0], input); + Placeholder selfState = Placeholder(cachedGraph->inputTensors_[1], hx[0]); + Placeholder selfCellState = Placeholder(cachedGraph->inputTensors_[2], hx[1]); + [feeds setObject:selfPlaceholder.getMPSGraphTensorData() forKey:selfPlaceholder.getMPSGraphTensor()]; + [feeds setObject:selfState.getMPSGraphTensorData() forKey:selfState.getMPSGraphTensor()]; + [feeds setObject:selfCellState.getMPSGraphTensorData() forKey:selfCellState.getMPSGraphTensor()]; + + + auto dims = getTensorShape(cachedGraph->outputTensors_[0]); + Tensor output = at::empty(IntArrayRef(dims), input.options()); + Tensor hy = at::empty_like(hx[0], input.options()); + Tensor cy = at::empty_like(hx[1], input.options()); + Tensor zState = at::empty(IntArrayRef(getTensorShape(cachedGraph->outputTensors_[3])), input.options()); + Tensor cellStateFwd = at::empty(IntArrayRef(getTensorShape(cachedGraph->outputTensors_[4])), input.options()); + + Placeholder outputPlaceholder0 = Placeholder(cachedGraph->outputTensors_[0], output); + Placeholder outputPlaceholder1 = Placeholder(cachedGraph->outputTensors_[1], hy); + Placeholder outputPlaceholder2 = Placeholder(cachedGraph->outputTensors_[2], cy); + Placeholder outputPlaceholder3 = Placeholder(cachedGraph->outputTensors_[3], zState); + Placeholder outputPlaceholder4 = Placeholder(cachedGraph->outputTensors_[4], cellStateFwd); + + NSDictionary* results = @{ + outputPlaceholder0.getMPSGraphTensor() : outputPlaceholder0.getMPSGraphTensorData(), + outputPlaceholder1.getMPSGraphTensor() : outputPlaceholder1.getMPSGraphTensorData(), + outputPlaceholder2.getMPSGraphTensor() : outputPlaceholder2.getMPSGraphTensorData(), + outputPlaceholder3.getMPSGraphTensor() : outputPlaceholder3.getMPSGraphTensorData(), + outputPlaceholder4.getMPSGraphTensor() : outputPlaceholder4.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + return std::make_tuple(output, hy, cy, zState, cellStateFwd); + } +} + +std::tuple, std::vector> lstm_mps_backward(const Tensor& grad_y, const c10::optional& grad_hy_opt, const c10::optional& grad_cy_opt, const Tensor& z_state, const Tensor& cell_state_fwd, const Tensor& input, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { + using namespace mps; + const Tensor& grad_hy_r = c10::value_or_else(grad_hy_opt, [] {return Tensor();}); + const Tensor& grad_cy_r = c10::value_or_else(grad_cy_opt, [] {return Tensor();}); + auto 
grad_hy = grad_hy_r.defined() ? grad_hy_r : at::zeros_like(hx[0], input.options()); + auto grad_cy = grad_cy_r.defined() ? grad_cy_r : at::zeros_like(hx[1], input.options()); + + std::vector kernel_weights; + std::vector recurrent_kernel_weights; + std::vector biases; + std::vector recurrent_biases; + for (size_t i = 0; i < num_layers; i+=1) { + kernel_weights.push_back(params[i*4]); + recurrent_kernel_weights.push_back(params[i*4+1]); + biases.push_back(params[i*4+2]); + recurrent_biases.push_back(params[i*4+3]); + } + + struct CachedGraph : public MPSCachedGraph { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + std::vector inputTensors_; + std::vector outputTensors_; + NSMutableArray *kernelWeightsList_ = nil; + NSMutableArray *recurrentKernelWeightsList_ = nil; + NSMutableArray *biasList_ = nil; + NSMutableArray *recurrentBiasList_ = nil; + NSMutableArray *gradOutput_ = nil; + NSMutableArray *gradRecWeights_ = nil; + NSMutableArray *gradWeights_ = nil; + NSMutableArray *gradBias_ = nil; + NSMutableArray *gradState_ = nil; + NSMutableArray *gradCellState_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + // Get stream + MPSStream* stream = getCurrentMPSStream(); + @autoreleasepool { + + string key = "lstm_backward_" + getTensorsStringKey({input, z_state, cell_state_fwd, grad_y, grad_cy, grad_hy})+ getMPSTypeString(input.scalar_type()) + "_num_layers_" + std::to_string(num_layers); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + NSMutableArray *kernelWeightsList = [[NSMutableArray alloc] initWithCapacity:params.size()]; + NSMutableArray *recurrentKernelWeightsList = [[NSMutableArray alloc] initWithCapacity:params.size()]; + NSMutableArray *kernelBiasList = [[NSMutableArray alloc] initWithCapacity:params.size()]; + NSMutableArray *recurrentBiasList = [[NSMutableArray alloc] initWithCapacity:params.size()]; + + for (size_t i = 0; i < num_layers; i += 1) { + [kernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(kernel_weights[i]))]; + [recurrentKernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_kernel_weights[i]))]; + [kernelBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(biases[i]))]; + [recurrentBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_biases[i]))]; + } + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(input)); + MPSGraphTensor* stateTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(hx[0])); + MPSGraphTensor* cellStateTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(hx[1])); + MPSGraphTensor* zStateTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(z_state)); + MPSGraphTensor* gradientTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(grad_y.scalar_type()), getMPSShape(grad_y)); + MPSGraphTensor* gradientCyTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(grad_cy.scalar_type()), getMPSShape(grad_cy)); + 
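Both the forward and backward LSTM builders above unpack `params` with the same stride-4 layout per layer (input weight, recurrent weight, input bias, recurrent bias). A minimal sketch of that indexing, assuming biases are present (`layer_param_offsets` is a hypothetical name, not part of this diff):

#include <array>
#include <cstdint>

// Flat indices of {W_ih, W_hh, b_ih, b_hh} within `params` for a given layer.
std::array<int64_t, 4> layer_param_offsets(int64_t layer) {
  return { layer * 4 + 0,    // kernel (input-to-hidden) weight
           layer * 4 + 1,    // recurrent (hidden-to-hidden) weight
           layer * 4 + 2,    // kernel bias
           layer * 4 + 3 };  // recurrent bias
}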
MPSGraphTensor* gradientHyTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(grad_hy.scalar_type()), getMPSShape(grad_hy)); + MPSGraphTensor* cellStateFwdTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(cell_state_fwd.scalar_type()), getMPSShape(cell_state_fwd)); + + std::vector inputs = {inputTensor, stateTensor, cellStateTensor, gradientTensor, zStateTensor, cellStateFwdTensor, gradientHyTensor, gradientCyTensor}; + newCachedGraph->recurrentKernelWeightsList_ = recurrentKernelWeightsList; + newCachedGraph->kernelWeightsList_ = kernelWeightsList; + newCachedGraph->biasList_ = kernelBiasList; + newCachedGraph->recurrentBiasList_ = recurrentBiasList; + newCachedGraph->inputTensors_ = inputs; + + MPSGraphLSTMDescriptor * opDesc = [MPSGraphLSTMDescriptor descriptor]; + opDesc.training = true; //train; + opDesc.bidirectional = bidirectional; + opDesc.produceCell = true; + + MPSGraphTensor* gradientTensor_ = gradientTensor; + + NSArray* outputs = nil; + + NSMutableArray* gradOutputArray = [[NSMutableArray alloc] initWithCapacity:num_layers]; + NSMutableArray* gradRecWeightsArray = [[NSMutableArray alloc] initWithCapacity:num_layers]; + NSMutableArray* gradWeightsArray = [[NSMutableArray alloc] initWithCapacity:num_layers]; + NSMutableArray* gradBiasArray = [[NSMutableArray alloc] initWithCapacity:num_layers]; + NSMutableArray* gradRecBiasArray = [[NSMutableArray alloc] initWithCapacity:num_layers]; + NSMutableArray* gradStateArray = [[NSMutableArray alloc] initWithCapacity:num_layers]; + NSMutableArray* gradCellStateArray = [[NSMutableArray alloc] initWithCapacity:num_layers]; + + for (int i = num_layers - 1; i >= 0; i--) { + MPSGraphTensor* zState = [mpsGraph sliceTensor:zStateTensor + dimension:0 + start:i + length:1 + name:nil]; + zState = [mpsGraph squeezeTensor:zState + axis:0 + name:nil]; + MPSGraphTensor* cellStateFwd = [mpsGraph sliceTensor:cellStateFwdTensor + dimension:0 + start:i + length:1 + name:nil]; + cellStateFwd = [mpsGraph squeezeTensor:cellStateFwd + axis:0 + name:nil]; + MPSGraphTensor* biasTensor = [mpsGraph additionWithPrimaryTensor:kernelBiasList[i] + secondaryTensor:recurrentBiasList[i] + name:nil]; + + MPSGraphTensor* stateTensor_ = [mpsGraph sliceTensor:stateTensor + dimension:0 + start:i + length:1 + name:nil]; + MPSGraphTensor* cellStateTensor_ = [mpsGraph sliceTensor:cellStateTensor + dimension:0 + start:i + length:1 + name:nil]; + MPSGraphTensor* gradientHyTensor_ = [mpsGraph sliceTensor:gradientHyTensor + dimension:0 + start:i + length:1 + name:nil]; + + MPSGraphTensor* gradientCyTensor_ = [mpsGraph sliceTensor:gradientCyTensor + dimension:0 + start:i + length:1 + name:nil]; + + outputs = [mpsGraph LSTMGradientsWithSourceTensor: inputTensor + recurrentWeight: recurrentKernelWeightsList[i] + sourceGradient: gradientTensor_ + zState: zState + cellOutputFwd: cellStateFwd + stateGradient: gradientHyTensor_ + cellGradient: gradientCyTensor_ + inputWeight: kernelWeightsList[i] + bias: biasTensor + initState: stateTensor_ + initCell: cellStateTensor_ + mask: nil + peephole: nil + descriptor: opDesc + name: nil]; + + + gradientTensor_ = [outputs objectAtIndex:0]; + [gradOutputArray addObject:[outputs objectAtIndex:0]]; + [gradRecWeightsArray addObject:[outputs objectAtIndex:1]]; + [gradWeightsArray addObject:[outputs objectAtIndex:2]]; + [gradBiasArray addObject:[outputs objectAtIndex:3]]; + [gradStateArray addObject:[outputs objectAtIndex:4]]; + [gradCellStateArray addObject:[outputs objectAtIndex:5]]; + } + std::vector outputTensors = {[outputs 
objectAtIndex:0],[outputs objectAtIndex:1],[outputs objectAtIndex:2],[outputs objectAtIndex:3], [outputs objectAtIndex:4], [outputs objectAtIndex:5]}; + newCachedGraph->outputTensors_ = outputTensors; + newCachedGraph->gradOutput_ = gradOutputArray; + newCachedGraph->gradRecWeights_ = gradRecWeightsArray; + newCachedGraph->gradWeights_ = gradWeightsArray; + newCachedGraph->gradBias_ = gradBiasArray; + newCachedGraph->gradState_ = gradStateArray; + newCachedGraph->gradCellState_ = gradCellStateArray; + + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensors_[0], input); + Placeholder statePlaceholder = Placeholder(cachedGraph->inputTensors_[1], hx[0]); + Placeholder cellStatePlaceholder = Placeholder(cachedGraph->inputTensors_[2], hx[1]); + Placeholder gradientPlaceholder = Placeholder(cachedGraph->inputTensors_[3], grad_y); + Placeholder zStatePlaceholder = Placeholder(cachedGraph->inputTensors_[4], z_state); + Placeholder cellStateFwdPlaceholder = Placeholder(cachedGraph->inputTensors_[5], cell_state_fwd); + Placeholder gradientHyPlaceholder = Placeholder(cachedGraph->inputTensors_[6], grad_hy); + Placeholder gradientCyPlaceholder = Placeholder(cachedGraph->inputTensors_[7], grad_cy); + + NSMutableDictionary *feeds = [[NSMutableDictionary alloc] init]; + [feeds setObject:gradientPlaceholder.getMPSGraphTensorData() forKey:gradientPlaceholder.getMPSGraphTensor()]; + [feeds setObject:gradientHyPlaceholder.getMPSGraphTensorData() forKey:gradientHyPlaceholder.getMPSGraphTensor()]; + [feeds setObject:gradientCyPlaceholder.getMPSGraphTensorData() forKey:gradientCyPlaceholder.getMPSGraphTensor()]; + [feeds setObject:inputPlaceholder.getMPSGraphTensorData() forKey:inputPlaceholder.getMPSGraphTensor()]; + [feeds setObject:statePlaceholder.getMPSGraphTensorData() forKey: statePlaceholder.getMPSGraphTensor()]; + [feeds setObject:cellStatePlaceholder.getMPSGraphTensorData() forKey:cellStatePlaceholder.getMPSGraphTensor()]; + [feeds setObject:zStatePlaceholder.getMPSGraphTensorData() forKey:zStatePlaceholder.getMPSGraphTensor()]; + [feeds setObject:cellStateFwdPlaceholder.getMPSGraphTensorData() forKey:cellStateFwdPlaceholder.getMPSGraphTensor()]; + + NSMutableArray *kernelWeightsList = cachedGraph->kernelWeightsList_; + NSMutableArray *recurrentKernelWeightsList = cachedGraph->recurrentKernelWeightsList_; + NSMutableArray *biasList = cachedGraph->biasList_; + NSMutableArray *recurrentBiasList = cachedGraph->recurrentBiasList_; + Placeholder kernelWeight; + Placeholder recurrentKernelWeight; + Placeholder bias; + Placeholder recurrentBias; + for (size_t i = 0; i < num_layers; i+=1) { + kernelWeight = Placeholder([kernelWeightsList objectAtIndex:i], kernel_weights[i]); + recurrentKernelWeight = Placeholder([recurrentKernelWeightsList objectAtIndex:i], recurrent_kernel_weights[i]); + bias = Placeholder([biasList objectAtIndex:i], biases[i]); + recurrentBias = Placeholder([recurrentBiasList objectAtIndex:i], recurrent_biases[i]); + [feeds setObject:kernelWeight.getMPSGraphTensorData() forKey:kernelWeight.getMPSGraphTensor()]; + [feeds setObject:recurrentKernelWeight.getMPSGraphTensorData() forKey:recurrentKernelWeight.getMPSGraphTensor()]; + [feeds setObject:bias.getMPSGraphTensorData() forKey:bias.getMPSGraphTensor()]; + [feeds setObject:recurrentBias.getMPSGraphTensorData() forKey:recurrentBias.getMPSGraphTensor()]; + } + + Tensor output = at::empty_like(input); + Tensor grad_rec_weights = 
at::empty_like(recurrent_kernel_weights[0]); + Tensor grad_weights = at::empty_like(kernel_weights[0]); + Tensor grad_bias = at::empty_like(biases[0]); + Tensor grad_state = at::empty_like(hx[0]); + Tensor grad_cell_state = at::empty_like(hx[1]); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensors_[0], output); + Placeholder gradRecWeightsPlaceholder = Placeholder(cachedGraph->outputTensors_[1], grad_rec_weights); + Placeholder gradWeightsPlaceholder = Placeholder(cachedGraph->outputTensors_[2], grad_weights); + Placeholder gradBiasPlaceholder = Placeholder(cachedGraph->outputTensors_[3], grad_bias); + Placeholder gradStatePlaceholder = Placeholder(cachedGraph->outputTensors_[4], grad_state); + Placeholder gradCellStatePlaceholder = Placeholder(cachedGraph->outputTensors_[5], grad_cell_state); + + std::vector grad_hx = {grad_state, grad_cell_state}; + + NSMutableDictionary *results = [[NSMutableDictionary alloc] init]; + NSMutableArray *gradOutputArray = cachedGraph->gradOutput_; + NSMutableArray *gradRecWeightsArray = cachedGraph->gradRecWeights_; + NSMutableArray *gradWeightsArray = cachedGraph->gradWeights_; + NSMutableArray *gradBiasArray = cachedGraph->gradBias_; + NSMutableArray *gradStateArray = cachedGraph->gradState_; + NSMutableArray *gradCellStateArray = cachedGraph->gradCellState_; + Placeholder gradOutPlaceholder; + + std::vector weights; + for (int i = 0; i < num_layers; i++) { + Tensor output = at::empty_like(input); + Tensor grad_rec_weights = at::empty_like(recurrent_kernel_weights[i]); + Tensor grad_weights = at::empty_like(kernel_weights[i]); + Tensor grad_bias = at::empty_like(biases[i]); + Tensor grad_state = at::empty_like(hx[0]); + Tensor grad_cell_state = at::empty_like(hx[1]); + weights.push_back(grad_weights); + weights.push_back(grad_rec_weights); + weights.push_back(grad_bias); + weights.push_back(grad_bias); + gradOutPlaceholder = Placeholder([gradOutputArray objectAtIndex:i], output); + gradRecWeightsPlaceholder = Placeholder([gradRecWeightsArray objectAtIndex:i], grad_rec_weights); + gradWeightsPlaceholder = Placeholder([gradWeightsArray objectAtIndex:i], grad_weights); + gradBiasPlaceholder = Placeholder([gradBiasArray objectAtIndex:i], grad_bias); + gradStatePlaceholder = Placeholder([gradStateArray objectAtIndex:i], grad_state); + gradCellStatePlaceholder = Placeholder([gradCellStateArray objectAtIndex:i], grad_cell_state); + + [results setObject:gradOutPlaceholder.getMPSGraphTensorData() forKey:gradOutPlaceholder.getMPSGraphTensor()]; + [results setObject:gradRecWeightsPlaceholder.getMPSGraphTensorData() forKey:gradRecWeightsPlaceholder.getMPSGraphTensor()]; + [results setObject:gradBiasPlaceholder.getMPSGraphTensorData() forKey:gradBiasPlaceholder.getMPSGraphTensor()]; + [results setObject:gradStatePlaceholder.getMPSGraphTensorData() forKey:gradStatePlaceholder.getMPSGraphTensor()]; + [results setObject:gradCellStatePlaceholder.getMPSGraphTensorData() forKey:gradCellStatePlaceholder.getMPSGraphTensor()]; + [results setObject:gradWeightsPlaceholder.getMPSGraphTensorData() forKey:gradWeightsPlaceholder.getMPSGraphTensor()]; + } + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + return std::tuple, std::vector> (output, grad_hx, weights); + + } +} +}}//at::native diff --git a/aten/src/ATen/native/mps/operations/Scalar.mm b/aten/src/ATen/native/mps/operations/Scalar.mm new file mode 100644 index 000000000000..2a5d7fd700c4 --- /dev/null +++ b/aten/src/ATen/native/mps/operations/Scalar.mm @@ -0,0 +1,39 @@ +// Copyright © 
2022 Apple Inc. + +#include +#include +#include +#include + +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#endif + +using namespace at::mps; + +namespace at { +namespace native { + +Scalar _local_scalar_dense_mps(const Tensor& self) { + Scalar r; + + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( + at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, self.scalar_type(), "_local_scalar_dense_mps", [&] { + Tensor output = at::empty_like(self, kCPU); + + Tensor cpu_output = mps::mps_copy_(output, self, false); + scalar_t value = *cpu_output.data_ptr(); + r = Scalar(value); + }); + + return r; +} + + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/ScatterGather.mm b/aten/src/ATen/native/mps/operations/ScatterGather.mm new file mode 100644 index 000000000000..a8d73d5fc42a --- /dev/null +++ b/aten/src/ATen/native/mps/operations/ScatterGather.mm @@ -0,0 +1,500 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include + +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#endif + +namespace at { +namespace native { + +TORCH_IMPL_FUNC(gather_out_mps) +(const Tensor & self, + int64_t dim, + const Tensor & index, + bool sparse_grad, + const Tensor & output) { + + using namespace mps; + MPSStream* stream = getCurrentMPSStream(); + + dim = at::maybe_wrap_dim(dim, self.dim()); + + TORCH_CHECK(!sparse_grad, "sparse_grad not supported in MPS yet") + + TORCH_CHECK(index.scalar_type() == ScalarType::Long || index.scalar_type() == ScalarType::Int, "index_select(): Expected dtype int32 or int64 for index"); + TORCH_CHECK(self.scalar_type() == output.scalar_type(), + "gather(): self and output must have the same scalar type"); + TORCH_CHECK(dim >= 0 && dim < self.dim(), + "gather(): Indexing dim ", dim, " is out of bounds of tensor"); + + + // Derive from MPSCachedGraph + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* indexTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + + MPSShape* input_shape = getMPSShape(self); + NSString* ns_input_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; + MPSShape* index_shape = getMPSShape(index); + NSString* ns_index_shape_key = [[index_shape valueForKey:@"description"] componentsJoinedByString:@","]; + + int num_input_dims = [input_shape count]; + int num_index_dims = [index_shape count]; + + TORCH_CHECK(num_input_dims == num_index_dims, "Input and index must have same rank") + + // Determine if we need to slice into the input tensor + bool needSlice = false; + + for(int i = 0; i < num_input_dims; i++) { + TORCH_CHECK(i == dim || [index_shape[i] intValue] <= [input_shape[i] intValue], "Index dim must not exceed input dim except at gathering axis") + if(i != dim && [index_shape[i] intValue] < [input_shape[i] intValue]) + needSlice = true; + } + + string key = "gather_out_mps:" + getMPSTypeString(self.scalar_type()) + ":" + + getMPSTypeString(index.scalar_type()) + ":" + + std::to_string(dim) + ":" + + [ns_input_shape_key UTF8String] + ":" + + [ns_index_shape_key UTF8String]; + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = 
make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(self.scalar_type()), input_shape); + MPSGraphTensor* indexTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(index.scalar_type()), index_shape); + + MPSGraphTensor* getInput = nil; + + // Slice into the input tensor IF NEEDED + if(needSlice) { + NSMutableArray *starts = [NSMutableArray arrayWithCapacity:num_input_dims]; + NSMutableArray *ends = [NSMutableArray arrayWithCapacity:num_input_dims]; + NSMutableArray *strides = [NSMutableArray arrayWithCapacity:num_input_dims]; + + for(int i = 0; i < num_input_dims; i++) { + // All strides are 1 + strides[i] = @1; + // All starts are 0 + starts[i] = @0; + if(i != dim) + ends[i] = index_shape[i]; + else + ends[i] = input_shape[i]; + } + + getInput = [mpsGraph sliceTensor:inputTensor + starts:starts + ends:ends + strides:strides + name:nil]; + + } + else + getInput = inputTensor; + + MPSGraphTensor* castIndexTensor = [mpsGraph castTensor:indexTensor + toType:getMPSDataType(ScalarType::Int) + name:(NSString * _Nonnull)nil]; + + MPSGraphTensor* outputTensor = [mpsGraph gatherAlongAxisWithUpdatesTensor:getInput + indicesTensor:castIndexTensor + axis:(NSInteger)dim + name:nil]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->indexTensor_ = indexTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, input_shape); + Placeholder indexPlaceholder = Placeholder(cachedGraph->indexTensor_, index, index_shape); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), + indexPlaceholder.getMPSGraphTensor() : indexPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + +} + +void scatter_mps_general +(const Tensor& self, + int64_t dim, + const Tensor& index, + const Tensor& src, + const Tensor& output, + string func_name, + const c10::string_view reduce) { + + using namespace mps; + MPSStream* stream = getCurrentMPSStream(); + + dim = at::maybe_wrap_dim(dim, self.dim()); + + TORCH_CHECK(index.scalar_type() == ScalarType::Long || index.scalar_type() == ScalarType::Int, "index_select(): Expected dtype int32 or int64 for index"); + TORCH_CHECK(self.scalar_type() == output.scalar_type() && output.scalar_type() == src.scalar_type(), + "scatter(): self, src and output must have the same scalar type"); + TORCH_CHECK(dim >= 0 && dim < self.dim(), + "scatter(): Indexing dim ", dim, " is out of bounds of tensor"); + + + // Derive from MPSCachedGraph + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* indexTensor_ = nil; + MPSGraphTensor* srcTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + + MPSShape* input_shape = getMPSShape(self); + NSString* ns_input_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; + MPSShape* index_shape = getMPSShape(index); + NSString* ns_index_shape_key = [[index_shape 
valueForKey:@"description"] componentsJoinedByString:@","]; + MPSShape* src_shape = getMPSShape(src); + NSString* ns_src_shape_key = [[src_shape valueForKey:@"description"] componentsJoinedByString:@","]; + + int num_input_dims = [input_shape count]; + int num_index_dims = [index_shape count]; + int num_src_dims = [src_shape count]; + + TORCH_CHECK(num_input_dims == num_index_dims && num_index_dims == num_src_dims, "Input, index and src must have same rank") + + // Do we need to slice into the src tensor? + bool needSlice = false; + bool inputNeedSlice = false; + + for(int i = 0; i < num_input_dims; i++) { + TORCH_CHECK(i == dim || [index_shape[i] intValue] <= [input_shape[i] intValue], "Index dim must not exceed input dim except at gathering axis") + TORCH_CHECK([index_shape[i] intValue] <= [src_shape[i] intValue], "Index dim must not exceed input dim except at gathering axis") + if([index_shape[i] intValue] < [src_shape[i] intValue]) + needSlice = true; + if(i != dim && [index_shape[i] intValue] < [input_shape[i] intValue]) + inputNeedSlice = true; + } + + TORCH_CHECK(reduce != "mean", "Scatter reduce mean mode not yet supported in MPS") + + string reduce_key; + + if(reduce == "set") + reduce_key = "set"; + else if(reduce == "sum") + reduce_key = "sum"; + else if(reduce == "add") + reduce_key = "add"; + else if(reduce == "prod") + reduce_key = "prod"; + else if(reduce == "multiply") + reduce_key = "multiply"; + else if(reduce == "amax") + reduce_key = "amax"; + else if(reduce == "amin") + reduce_key = "amin"; + + string key = func_name + ":" + getMPSTypeString(self.scalar_type()) + ":" + + getMPSTypeString(index.scalar_type()) + ":" + + std::to_string(dim) + ":" + + [ns_input_shape_key UTF8String] + ":" + + [ns_index_shape_key UTF8String] + ":" + + [ns_src_shape_key UTF8String] + ":" + + reduce_key; + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(self.scalar_type()), input_shape); + MPSGraphTensor* indexTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(index.scalar_type()), index_shape); + MPSGraphTensor* srcTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(src.scalar_type()), src_shape); + + MPSGraphTensor* getSrc = nil; + MPSGraphTensor* getInput = nil; + + // Slice into the src tensor IF NEEDED + if(needSlice) { + NSMutableArray *starts = [NSMutableArray arrayWithCapacity:num_input_dims]; + NSMutableArray *ends = [NSMutableArray arrayWithCapacity:num_input_dims]; + NSMutableArray *strides = [NSMutableArray arrayWithCapacity:num_input_dims]; + + for(int i = 0; i < num_input_dims; i++) { + // All strides are 1 + strides[i] = @1; + // All starts are 0 + starts[i] = @0; + ends[i] = index_shape[i]; + } + + getSrc = [mpsGraph sliceTensor:srcTensor + starts:starts + ends:ends + strides:strides + name:nil]; + + } + else + getSrc = srcTensor; + + // Use in case input needs to be smaller to get scatter + NSMutableArray* scatterInputShape = nil; + + // Slice into the input tensor IF NEEDED + if(inputNeedSlice) { + NSMutableArray *starts = [NSMutableArray arrayWithCapacity:num_input_dims]; + NSMutableArray *ends = [NSMutableArray arrayWithCapacity:num_input_dims]; + NSMutableArray *strides = [NSMutableArray 
arrayWithCapacity:num_input_dims]; + + scatterInputShape = [NSMutableArray arrayWithCapacity:num_input_dims]; + + for(int i = 0; i < num_input_dims; i++) { + // All strides are 1 + strides[i] = @1; + // All starts are 0 + starts[i] = @0; + if(i != dim) { + ends[i] = index_shape[i]; + scatterInputShape[i] = index_shape[i]; + } + else { + ends[i] = input_shape[i]; + scatterInputShape[i] = input_shape[i]; + } + } + + getInput = [mpsGraph sliceTensor:inputTensor + starts:starts + ends:ends + strides:strides + name:nil]; + + } + else { + getInput = inputTensor; + scatterInputShape = input_shape; + } + + MPSGraphTensor* outputTensor = nil; + + MPSGraphTensor* castIndexTensor = [mpsGraph castTensor:indexTensor + toType:getMPSDataType(ScalarType::Int) + name:(NSString * _Nonnull)nil]; + + MPSGraphScatterMode scatter_mode; + + if(reduce_key == "set") + scatter_mode = MPSGraphScatterModeSet; + else if(reduce_key == "sum" || reduce_key == "add") + scatter_mode = MPSGraphScatterModeAdd; + else if(reduce_key == "prod" || reduce_key == "multiply") + scatter_mode = MPSGraphScatterModeMul; + else if(reduce_key == "amax") + scatter_mode = MPSGraphScatterModeMax; + else if(reduce_key == "amin") + scatter_mode = MPSGraphScatterModeMin; + + if(!inputNeedSlice) { + outputTensor = [mpsGraph scatterAlongAxisWithDataTensor:getInput + updatesTensor:getSrc + indicesTensor:castIndexTensor + axis:(NSInteger)dim + mode:scatter_mode + name:nil]; + } + else { + // Scatter this into the input with set mode + MPSGraphTensor* scatterTensor = [mpsGraph scatterAlongAxisWithDataTensor:getInput + updatesTensor:getSrc + indicesTensor:castIndexTensor + axis:(NSInteger)dim + mode:scatter_mode + name:nil]; + + // Make an array of scatter indices tensors + NSMutableArray* indicesTensors = [NSMutableArray arrayWithCapacity:num_input_dims]; + + // 1. Concatenate the coord tensors + // 2. Flatten the values + // 3. 
Scatter into input with add mode + + int shape_data[num_input_dims]; + + for(int i = 0; i < num_input_dims; i++) { + shape_data[i] = {[scatterInputShape[i] intValue]}; + } + + MPSGraphTensor* scatterInputShapeTensor = [mpsGraph constantWithData:[NSData dataWithBytes:shape_data length:num_input_dims * sizeof(int)] + shape:@[[NSNumber numberWithInt:num_input_dims]] + dataType:MPSDataTypeInt32]; + + for(int i = 0; i < num_input_dims; i++) { + MPSGraphTensor* axisTensor = [mpsGraph constantWithScalar:i + dataType:MPSDataTypeInt32]; + MPSGraphTensor* scatter_currentIndexTensor = [mpsGraph getCoordinateValueWithShapeTensor:scatterInputShapeTensor + axisTensor:axisTensor + name:nil]; + scatter_currentIndexTensor = [mpsGraph reshapeTensor:scatter_currentIndexTensor + withShape:@[@-1, @1] + name:nil]; + indicesTensors[i] = scatter_currentIndexTensor; + } + + MPSGraphTensor* scatter_fullIndexTensor = [mpsGraph concatTensors:indicesTensors + dimension:(NSInteger)1 + name:nil]; + + MPSGraphTensor* flatValuesTensor = [mpsGraph reshapeTensor:scatterTensor + withShape:@[@-1] + name:nil]; + + outputTensor = [mpsGraph scatterNDWithDataTensor:inputTensor + updatesTensor:flatValuesTensor + indicesTensor:scatter_fullIndexTensor + batchDimensions:0 + mode:MPSGraphScatterModeSet + name:nil]; + } + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->srcTensor_ = srcTensor; + newCachedGraph->indexTensor_ = indexTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, input_shape); + Placeholder srcPlaceholder = Placeholder(cachedGraph->srcTensor_, src, src_shape); + Placeholder indexPlaceholder = Placeholder(cachedGraph->indexTensor_, index, index_shape); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), + srcPlaceholder.getMPSGraphTensor() : srcPlaceholder.getMPSGraphTensorData(), + indexPlaceholder.getMPSGraphTensor() : indexPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + +} + +TORCH_IMPL_FUNC(scatter_src_out_mps) +(const Tensor& self, + int64_t dim, + const Tensor& index, + const Tensor& src, + const Tensor& output) { + + scatter_mps_general(self, dim, index, src, output, "scatter_src_out_mps", "set"); + +} + +TORCH_IMPL_FUNC(scatter_value_out_mps) +(const Tensor& self, + int64_t dim, + const Tensor& index, + const Scalar& value, + const Tensor& output) { + + Tensor src = at::native::empty_mps(index.sizes(), + self.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + self.suggest_memory_format()); + src.fill_(value); + scatter_mps_general(self, dim, index, const_cast(src), output, "scatter_value_out_mps", "set"); + +} + +TORCH_IMPL_FUNC(scatter_reduce_out_mps) +(const Tensor& self, + int64_t dim, + const Tensor& index, + const Tensor& src, + const c10::string_view reduce, + const Tensor& output) { + + scatter_mps_general(self, dim, index, src, output, "scatter_reduce_out_mps", reduce); + +} + +TORCH_IMPL_FUNC(scatter_value_reduce_out_mps) +(const Tensor& self, + int64_t dim, + const Tensor& index, + const Scalar& value, + const c10::string_view reduce, + const Tensor& output) { + + Tensor src = 
at::native::empty_mps(index.sizes(), + self.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + self.suggest_memory_format()); + src.fill_(value); + scatter_mps_general(self, dim, index, const_cast(src), output, "scatter_value_reduce_out_mps", reduce); + +} + +TORCH_IMPL_FUNC(scatter_add_mps_out) +(const Tensor& self, + int64_t dim, + const Tensor& index, + const Tensor& src, + const Tensor& output) { + + scatter_mps_general(self, dim, index, src, output, "scatter_add_mps_out", "add"); +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/Shape.mm b/aten/src/ATen/native/mps/operations/Shape.mm new file mode 100644 index 000000000000..edef852027fb --- /dev/null +++ b/aten/src/ATen/native/mps/operations/Shape.mm @@ -0,0 +1,918 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +namespace mps { + +// Pad operations (1D/2D/3D forward and backward) +Tensor& pad_out_template(Tensor &output, const Tensor &input_, IntArrayRef padding, + const c10::optional& grad_output_opt, + MPSGraphPaddingMode mode, const string op_name) +{ + const int padding_size = (int) padding.size(); + const int padding_dim = padding_size / 2; // either 1D, 2D, or 3D + + TORCH_CHECK(padding_size == 2 || padding_size == 4 || padding_size == 6, + "invalid padding argument of size ", padding_size); + + const Tensor& grad_output_ = *(at::borrow_from_optional_tensor(grad_output_opt)); + const bool is_backward_pass = grad_output_.defined(); + + int dim_w = padding_dim, dim_h = padding_dim - 1, dim_d = padding_dim - 2, dim_slices = 0; + int64_t nbatch = 1, ndims = input_.ndimension(); + + if (!is_backward_pass) { + bool valid_dims = input_.size(1) != 0 && input_.size(padding_dim) != 0; + TORCH_CHECK((ndims == 1 + padding_dim && valid_dims) || + (ndims == 2 + padding_dim && valid_dims && input_.size(1 + padding_dim) != 0), + "3D or 4D (batch mode) tensor expected for input, but got: ", input_); + } + + if (ndims == 2 + padding_dim) { + nbatch = input_.size(0); + dim_w++; + dim_h++; + dim_d++; + dim_slices++; + } + + int64_t pad_l = padding[0]; + int64_t pad_r = padding[1]; + int64_t pad_t = padding_dim > 1 ? padding[2] : 0; + int64_t pad_b = padding_dim > 1 ? padding[3] : 0; + int64_t pad_front = padding_dim > 2 ? padding[4] : 0; + int64_t pad_back = padding_dim > 2 ? padding[5] : 0; + + int64_t nplane = input_.size(dim_slices); + int64_t input_w = input_.size(dim_w); + int64_t output_w = input_w + pad_l + pad_r; + int64_t input_h = padding_dim > 1 ? input_.size(dim_h) : 0; + int64_t output_h = padding_dim > 1 ? input_h + pad_t + pad_b : 0; + int64_t input_d = padding_dim > 2 ? input_.size(dim_d) : 0; + int64_t output_d = padding_dim > 2 ? 
input_d + pad_front + pad_back : 0; + + Tensor grad_output, input = input_; + + if (!is_backward_pass) { + TORCH_CHECK(pad_l < input_w && pad_r < input_w, + "Argument #4: Padding size should be less than the corresponding " + "input dimension, but got: padding (", pad_l, ", ", pad_r, + ") at dimension ", dim_w, " of input ", ndims); + + if (padding_dim > 1) { + TORCH_CHECK(pad_t < input_h && pad_b < input_h, + "Argument #6: Padding size should be less than the corresponding " + "input dimension, but got: padding (", pad_t, ", ", pad_b, + ") at dimension ", dim_h, " of input ", ndims); + } + TORCH_CHECK(output_w >= 1 || output_h >= padding_dim - 1, + "input (H: ", input_h, ", W: ", input_w, ") is too small. Calculated " + "output H: ", output_h, " W: ", output_w); + + if (ndims == 1 + padding_dim) { + if (padding_dim == 3) + output.resize_({nplane, output_d, output_h, output_w}); + else if (padding_dim == 2) + output.resize_({nplane, output_h, output_w}); + else + output.resize_({nplane, output_w}); + } else { + if (padding_dim == 3) + output.resize_({nbatch, nplane, output_d, output_h, output_w}); + else if (padding_dim == 2) + output.resize_({nbatch, nplane, output_h, output_w}); + else + output.resize_({nbatch, nplane, output_w}); + } + if (output.numel() == 0 || input_.numel() == 0) + return output; + input = input_.contiguous(); + } else { + TORCH_CHECK(output_w == grad_output_.size(dim_w), + "gradOutput width unexpected. Expected: ", output_w, ", Got: ", grad_output_.size(dim_w)); + if (padding_dim > 1) { + TORCH_CHECK(output_h == grad_output_.size(dim_h), + "gradOutput height unexpected. Expected: ", output_h, ", Got: ", grad_output_.size(dim_h)); + } + grad_output = grad_output_.contiguous(); + } + + const int64_t input_dim = input.dim(); + MPSShape *leftPadding = nullptr, *rightPadding = nullptr; + if (padding_dim == 3) { + leftPadding = [NSArray arrayWithObjects:(const NSNumber*[]){ @(0), @(0), @(pad_front), @(pad_t), @(pad_l) } count:input_dim]; + rightPadding = [NSArray arrayWithObjects:(const NSNumber*[]){ @(0), @(0), @(pad_back), @(pad_b), @(pad_r) } count:input_dim]; + } else if (padding_dim == 2) { + leftPadding = [NSArray arrayWithObjects:(const NSNumber*[]){ @(0), @(0), @(pad_t), @(pad_l) } count:input_dim]; + rightPadding = [NSArray arrayWithObjects:(const NSNumber*[]){ @(0), @(0), @(pad_b), @(pad_r) } count:input_dim]; + } else if (padding_dim == 1) { + leftPadding = [NSArray arrayWithObjects:(const NSNumber*[]){ @(0), @(0), @(pad_l) } count:input_dim]; + rightPadding = [NSArray arrayWithObjects:(const NSNumber*[]){ @(0), @(0), @(pad_r) } count:input_dim]; + } + + struct CachedGraph : public MPSCachedGraph { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) { } + MPSGraphTensor *inputTensor = nil, *outputTensor = nil; + MPSGraphTensor *gradOutputTensor = nil; + }; + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + string key = op_name + getTensorsStringKey({input, grad_output}) + + ":L" + to_string(pad_l) + ":R" + to_string(pad_r) + + ":T" + to_string(pad_t) + ":B" + to_string(pad_b) + + ":F" + to_string(pad_front) + ":K" + to_string(pad_back); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + cachedGraph = static_cast(cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input); + if 
(!is_backward_pass) { + newCachedGraph->outputTensor = [mpsGraph padTensor:newCachedGraph->inputTensor + withPaddingMode:mode + leftPadding:leftPadding + rightPadding:rightPadding + constantValue:0 + name:nil]; + } else { + newCachedGraph->gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output); + newCachedGraph->outputTensor = [mpsGraph padGradientWithIncomingGradientTensor:newCachedGraph->gradOutputTensor + sourceTensor:newCachedGraph->inputTensor + paddingMode:mode + leftPadding:leftPadding + rightPadding:rightPadding + name:nil]; + } + } + return newCachedGraph; + })); + } + Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor, input); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, output); + + NSMutableDictionary *feeds = [[NSMutableDictionary new] autorelease]; + feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData(); + if (is_backward_pass) { + Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor, grad_output); + feeds[gradOutputPlaceholder.getMPSGraphTensor()] = gradOutputPlaceholder.getMPSGraphTensorData(); + } + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + } + return output; +} +} // namespace mps + +// 1D Reflection and Replication Padding +TORCH_IMPL_FUNC(reflection_pad1d_out_mps) +(const Tensor& input, IntArrayRef padding, const Tensor& output) +{ + mps::pad_out_template(const_cast(output), input, padding, c10::nullopt, MPSGraphPaddingModeReflect, "reflection_pad1d_out_mps"); +} + +TORCH_IMPL_FUNC(reflection_pad1d_backward_out_mps) +(const Tensor& grad_output, const Tensor& input, IntArrayRef padding, const Tensor& grad_input) +{ + grad_input.resize_as_(input).zero_(); + mps::pad_out_template(const_cast(grad_input), input, padding, grad_output, MPSGraphPaddingModeReflect, "reflection_pad1d_backward_out_mps"); +} + +TORCH_IMPL_FUNC(replication_pad1d_out_mps) +(const Tensor& input, IntArrayRef padding, const Tensor& output) +{ + mps::pad_out_template(const_cast(output), input, padding, c10::nullopt, MPSGraphPaddingModeClampToEdge, "replication_pad1d_out_mps"); +} + +TORCH_IMPL_FUNC(replication_pad1d_backward_out_mps) +(const Tensor& grad_output, const Tensor& input, IntArrayRef padding, const Tensor& grad_input) +{ + grad_input.resize_as_(input).zero_(); + mps::pad_out_template(const_cast(grad_input), input, padding, grad_output, MPSGraphPaddingModeClampToEdge, "replication_pad1d_backward_out_mps"); +} + +// 2D Reflection and Replication Padding +Tensor& reflection_pad2d_out_mps(const Tensor& input, IntArrayRef padding, Tensor& output) +{ + return mps::pad_out_template(output, input, padding, c10::nullopt, MPSGraphPaddingModeReflect, __func__); +} + +Tensor reflection_pad2d_mps(const Tensor& input, IntArrayRef padding) +{ + Tensor output = at::empty({0}, input.options()); + return mps::pad_out_template(output, input, padding, c10::nullopt, MPSGraphPaddingModeReflect, __func__); +} + +Tensor& reflection_pad2d_backward_out_mps(const Tensor& grad_output, const Tensor& input, IntArrayRef padding, Tensor& grad_input) +{ + grad_input.resize_as_(input).zero_(); + return mps::pad_out_template(grad_input, input, padding, grad_output, MPSGraphPaddingModeReflect, __func__); +} + +Tensor reflection_pad2d_backward_mps(const Tensor& grad_output, const Tensor& input, IntArrayRef padding) +{ + auto grad_input = at::zeros_like(input, 
LEGACY_CONTIGUOUS_MEMORY_FORMAT); + return mps::pad_out_template(grad_input, input, padding, grad_output, MPSGraphPaddingModeReflect, __func__); +} + +TORCH_IMPL_FUNC(replication_pad2d_out_mps) +(const Tensor& input, IntArrayRef padding, const Tensor& output) +{ + mps::pad_out_template(const_cast(output), input, padding, c10::nullopt, MPSGraphPaddingModeClampToEdge, "replication_pad2d_out_mps"); +} + +Tensor& replication_pad2d_backward_out_mps(const Tensor& grad_output, const Tensor& input, IntArrayRef padding, Tensor& grad_input) +{ + grad_input.resize_as_(input).zero_(); + return mps::pad_out_template(grad_input, input, padding, grad_output, MPSGraphPaddingModeClampToEdge, __func__); +} + +Tensor replication_pad2d_backward_mps(const Tensor& grad_output, const Tensor& input, IntArrayRef padding) +{ + auto grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + return mps::pad_out_template(grad_input, input, padding, grad_output, MPSGraphPaddingModeClampToEdge, __func__); +} + +// 3D Reflection and Replication Padding +TORCH_IMPL_FUNC(reflection_pad3d_out_mps) +(const Tensor& input, IntArrayRef padding, const Tensor& output) +{ + mps::pad_out_template(const_cast(output), input, padding, c10::nullopt, MPSGraphPaddingModeReflect, "reflection_pad3d_out_mps"); +} + +TORCH_IMPL_FUNC(reflection_pad3d_backward_out_mps) +(const Tensor& grad_output, const Tensor& input, IntArrayRef padding, const Tensor& grad_input) +{ + grad_input.resize_as_(input).zero_(); + mps::pad_out_template(const_cast(grad_input), input, padding, grad_output, MPSGraphPaddingModeReflect, "reflection_pad3d_backward_out_mps"); +} + +TORCH_IMPL_FUNC(replication_pad3d_out_mps) +(const Tensor& input, IntArrayRef padding, const Tensor& output) +{ + mps::pad_out_template(const_cast(output), input, padding, c10::nullopt, MPSGraphPaddingModeClampToEdge, "replication_pad3d_out_mps"); +} + +Tensor& replication_pad3d_backward_out_mps(const Tensor& grad_output, const Tensor& input, IntArrayRef padding, Tensor& grad_input) +{ + grad_input.resize_as_(input).zero_(); + return mps::pad_out_template(grad_input, input, padding, grad_output, MPSGraphPaddingModeClampToEdge, __func__); +} + +Tensor replication_pad3d_backward_mps(const Tensor& grad_output, const Tensor& input, IntArrayRef padding) +{ + auto grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + return mps::pad_out_template(grad_input, input, padding, grad_output, MPSGraphPaddingModeClampToEdge, __func__); +} + +// topk +TORCH_IMPL_FUNC(topk_out_mps) + (const Tensor& self, + int64_t k, + int64_t dim_, + bool largest, + bool sorted, + const Tensor& values, + const Tensor& indices) +{ + using namespace mps; + int64_t dim = maybe_wrap_dim(dim_, self.dim(), /*wrap_scalar=*/true); + TORCH_CHECK( + k >= 0 && k <= (self.dim() > 0 ? self.size(dim) : 1), + "selected index k out of range"); + + if (self.dim() == 0 && self.numel() == 1) + { + values.copy_(self); + indices.zero_(); + return; + } + MPSStream* stream = getCurrentMPSStream(); + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *selfTensor = nil, *valuesTensor = nil, *indicesTensor = nil; + }; + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + // MPSGraph topK is always sorted. 
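The branches below cover what MPSGraph's topK does not handle directly: it returns the largest elements, sorted, along the last axis only, so a non-trailing dim is transposed in and out of the graph, and largest=False is emulated by negating the input before and the values after. A hedged usage sketch from the Python side (not part of the patch); it assumes a build where the MPS backend is available and otherwise simply runs on CPU:

import torch

device = "mps" if torch.backends.mps.is_available() else "cpu"
x = torch.tensor([[3., 1., 4.],
                  [1., 5., 9.]], device=device)

# largest=True along the last dim maps directly onto MPSGraph topK.
vals, idx = torch.topk(x, k=2, dim=-1, largest=True)

# largest=False and/or a non-trailing dim exercise the negate/transpose
# branches built below.
small_vals, small_idx = torch.topk(x, k=2, dim=0, largest=False)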
+ @autoreleasepool + { + // Input as placeholders + MPSShape* input_shape = getMPSShape(self); + NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; + string key = string("topk:") + [ns_shape_key UTF8String] + ":" + + getMPSTypeString(self.scalar_type()) + + ":k" + to_string(k) + ":dim" + to_string(dim_) + + ":largest" + to_string(largest); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) + { + cachedGraph = static_cast(cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + @autoreleasepool + { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + newCachedGraph->selfTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(self.scalar_type()), input_shape); + if ((dim_ != -1 && dim_ != self.dim() - 1) && (!largest)) + { + // transpose and negate + MPSGraphTensor *transposedInput = [mpsGraph transposeTensor: newCachedGraph->selfTensor + dimension: (NSUInteger)self.dim()-1 + withDimension: (NSUInteger)dim_ + name: nil]; + MPSGraphTensor * identity = [mpsGraph identityWithTensor: transposedInput + name: nil]; + MPSGraphTensor * negatedTransposedInput = [mpsGraph negativeWithTensor:identity + name: nil]; + NSArray * outputMPSGraphTensors = [mpsGraph + topKWithSourceTensor:negatedTransposedInput + k:((NSUInteger) k) + name:nil]; + MPSGraphTensor *valuesNegatedTransposed = outputMPSGraphTensors[0]; + MPSGraphTensor *indicesTransposed = outputMPSGraphTensors[1]; + MPSGraphTensor *valuesNegated = [mpsGraph transposeTensor: valuesNegatedTransposed + dimension: (NSUInteger)self.dim()-1 + withDimension: (NSUInteger)dim_ + name: nil]; + newCachedGraph->valuesTensor = [mpsGraph negativeWithTensor:valuesNegated + name: nil]; + newCachedGraph->indicesTensor = [mpsGraph transposeTensor: indicesTransposed + dimension: (NSUInteger)self.dim()-1 + withDimension: (NSUInteger)dim_ + name: nil]; + } + else if (dim_ != -1 && dim_ != self.dim() - 1) + { + MPSGraphTensor *transposedInput = [mpsGraph transposeTensor: newCachedGraph->selfTensor + dimension: (NSUInteger)self.dim()-1 + withDimension: (NSUInteger)dim_ + name: nil]; + MPSGraphTensor * identity = [mpsGraph identityWithTensor: transposedInput + name: nil]; + NSArray * outputMPSGraphTensors = [mpsGraph + topKWithSourceTensor:identity + k:((NSUInteger) k) + name:nil]; + MPSGraphTensor *valuesTransposed = outputMPSGraphTensors[0]; + MPSGraphTensor *indicesTransposed = outputMPSGraphTensors[1]; + newCachedGraph->valuesTensor = [mpsGraph transposeTensor:valuesTransposed + dimension: (NSUInteger)self.dim()-1 + withDimension: (NSUInteger)dim_ + name: nil]; + newCachedGraph->indicesTensor = [mpsGraph transposeTensor: indicesTransposed + dimension: (NSUInteger)self.dim()-1 + withDimension: (NSUInteger)dim_ + name: nil]; + } + else if (!largest) + { + // only negate + MPSGraphTensor *negatedInput = [mpsGraph negativeWithTensor:newCachedGraph->selfTensor + name: nil]; + NSArray * outputMPSGraphTensors = [mpsGraph + topKWithSourceTensor:negatedInput + k:((NSUInteger) k) + name:nil]; + MPSGraphTensor *valuesNegated = outputMPSGraphTensors[0]; + newCachedGraph->valuesTensor = [mpsGraph negativeWithTensor:valuesNegated + name: nil]; + newCachedGraph->indicesTensor = outputMPSGraphTensors[1]; + } + else + { + NSArray * outputMPSGraphTensors = [mpsGraph + topKWithSourceTensor:newCachedGraph->selfTensor + k:((NSUInteger) k) + name:nil]; + newCachedGraph->valuesTensor = outputMPSGraphTensors[0]; + 
newCachedGraph->indicesTensor = outputMPSGraphTensors[1]; + } + + } + return newCachedGraph; + })); + } + Placeholder inputPlaceholder = Placeholder(cachedGraph->selfTensor, self); + // Outputs as placeholders + Placeholder valuesPlaceholder = Placeholder(cachedGraph->valuesTensor, values); + Placeholder indicesPlaceholder = Placeholder(cachedGraph->indicesTensor, indices); + // Create dictionary of inputs and outputs + NSDictionary* feeds = nil; + feeds = @{ + inputPlaceholder.getMPSGraphTensor() : + inputPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + valuesPlaceholder.getMPSGraphTensor() : + valuesPlaceholder.getMPSGraphTensorData(), + indicesPlaceholder.getMPSGraphTensor() : + indicesPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } +} + +void check_shape_except_dim(const Tensor &first, const Tensor &second, + int dimension, int index) +{ + int first_dims = first.dim(); + int second_dims = second.dim(); + TORCH_CHECK(first_dims == second_dims, + "Tensors must have same number of dimensions: got ", first_dims, + " and ", second_dims); + for (int dim = 0; dim < first_dims; dim++) { + if (dim == dimension) { + continue; + } + int64_t first_dim_size = at::native::size(first, dim); + int64_t second_dim_size = at::native::size(second, dim); + TORCH_CHECK(first_dim_size == second_dim_size, + "Sizes of tensors must match except in dimension ", dim, ". Got ", + static_cast(first_dim_size), " and ", + static_cast(second_dim_size), " (The offending index is ", + index, ")"); + } +} + +inline c10::MemoryFormat compute_output_memory_format(const TensorList &inputs) { + c10::optional format = c10::nullopt; + for (auto &t : inputs) { + auto f = t.suggest_memory_format(); + if (!format.has_value()) { + format = f; + continue; + } + if (format.value() == f) { + continue; + } + bool contiguous = (format.value() == c10::MemoryFormat::Contiguous || f == c10::MemoryFormat::Contiguous || format.value() != f); + if (contiguous) { + return c10::MemoryFormat::Contiguous; + } + } + return format.value(); +} + +//Tensor cat_mps(TensorList inputs, int64_t dimension) { + //ScalarType high_type = result_type(inputs); + //Tensor out = at::empty({0}, inputs.front().options().dtype(high_type)); + //at::native::cat_out_mps(inputs, dimension, out); + //return out; +//} + +TORCH_IMPL_FUNC(cat_out_mps) + (ITensorListRef inputs, + int64_t dimension, + int64_t valid, + bool all_contiguous, + bool all_same_dtype, + bool all_same_sizes_and_stride, + MemoryFormat memory_format, + const Tensor& out) { + using namespace mps; + if (out.numel() == 0) { + return; + } + + auto materialized_inputs = inputs.materialize(); + + int idx = 0; + for(const Tensor& t : materialized_inputs) { + TORCH_CHECK(t.dim() > 0, + "zero-dimensional tensor (at position ", idx, ") cannot be concatenated"); + idx++; + } + + dimension = legacy_cat_wrap_dim(dimension, inputs); + + // previously, size [0] tensors were the only possible empty tensors; thus, it + // wasn't possible to cat empty tensors unless all the other tensors were + // 1-dimensional, so we allowed these tensors to be "skipped". We maintain + // this behavior for backwards compatibility, but only for this specific size + // (i.e. other empty sizes are not skipped). 
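To make the legacy skip described in the comment above concrete, here is an illustrative sketch (not part of the patch): a 1-D size-[0] tensor is ignored, while any other empty shape still has to match in the non-concatenated dimensions. The MPS device is assumed to be available; otherwise the example falls back to CPU:

import torch

device = "mps" if torch.backends.mps.is_available() else "cpu"
a = torch.randn(2, 3, device=device)
legacy_empty = torch.empty(0, device=device)   # 1-D, size [0]: skipped

out = torch.cat([a, legacy_empty, a], dim=0)   # OK, shape (4, 3)

# Other empty shapes are not skipped and must match outside the cat dim:
# torch.cat([a, torch.empty(0, 4, device=device)], dim=0)  # raises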
+ // FIXME: warn if this is the case + auto should_skip = [](const Tensor& t) { + return t.dim() == 1 && at::native::size(t, 0) == 0; + }; + + const Tensor* notSkippedTensor = NULL; // non-owning reference + int nDims = 0; + + // Check for type promotion + TORCH_CHECK( + canCast(result_type(inputs), out.scalar_type()), + "torch.cat(): input types ", + " can't be cast to the desired output type ", + out.scalar_type()); + + // Inputs cannot alias the output tensor + idx = 0; + for(const Tensor& t : materialized_inputs) { + auto lap = at::get_overlap_status(out, t); + TORCH_CHECK( + lap != at::MemOverlapStatus::PARTIAL && + lap != at::MemOverlapStatus::FULL, + "torch.cat(): unsupported operation: the input tensors cannot refer to any " + "of the output memory locations. Found overlap in input " + "tensor ", + idx); + idx++; + } + at::assert_no_internal_overlap(out); + + for(const Tensor& t : materialized_inputs) { + if (should_skip(t)) { + continue; + } + nDims = t.dim(); + // TODO: Is this OK? + notSkippedTensor = &t; + } + + // If all inputs are empty tensors, return an empty tensor + if (notSkippedTensor == NULL) { + return; + } + + TORCH_CHECK( + inputs.size() > 0, + "torch.cat(): invalid number of inputs ", + inputs.size()); + TORCH_CHECK(dimension >= 0, "torch.cat(): invalid dimension ", dimension); + + for (const Tensor& t : inputs) { + TORCH_CHECK( + t.device() == notSkippedTensor->device(), + "torch.cat(): all input tensors must be on the same device. Received ", + t.device(), + " and ", + notSkippedTensor->device()); + } + + TORCH_CHECK( + out.device() == notSkippedTensor->device(), + "torch.cat(): all input tensors and out must be on the same device, but inputs are on ", + notSkippedTensor->device(), + " and out is on ", + out.device()); + + // TODO: memory_format is now an argument? 
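The dtype and device checks above mirror the usual torch.cat error behaviour; a brief, illustrative-only sketch (MPS assumed available, otherwise the tensors live on CPU):

import torch

device = "mps" if torch.backends.mps.is_available() else "cpu"
f16 = torch.ones(2, device=device, dtype=torch.float16)
f32 = torch.ones(2, device=device, dtype=torch.float32)

print(torch.cat([f16, f32]).dtype)   # inputs are promoted -> torch.float32

# An `out` tensor the promoted type cannot be cast to trips the canCast() check:
bad_out = torch.empty(4, device=device, dtype=torch.int64)
# torch.cat([f16, f32], out=bad_out)   # -> RuntimeError

# Mixing devices trips the same-device check when the inputs live on "mps":
# torch.cat([f32, torch.ones(2)])      # mps input + cpu input -> RuntimeError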
+ // // TODO: Factor out `compute_output_memory_format` + // c10::MemoryFormat memory_format = compute_output_memory_format(inputs); + + std::vector size(notSkippedTensor->sizes().vec()); + + // Compute size of the result in the cat dimension + int64_t cat_dim_size = 0; + idx = 0; + for(const Tensor& tensor : materialized_inputs) { + if (should_skip(tensor)) { + continue; + } + // TODO: Factor out `check_shape_except_dim` + check_shape_except_dim(*notSkippedTensor, tensor, dimension, idx); + cat_dim_size += at::native::size(tensor, dimension); + idx++; + } + + // Compute the size of the result + size[dimension] = cat_dim_size; + + // skip resizing if size of result is same as expected + if (out.sizes() != size) { + out.resize_(size, memory_format); + } + + if (out.numel() == 0) { + return; + } + + // Get stream + MPSStream* stream = getCurrentMPSStream(); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + // TODO: Free this when no longer needed globally + MPSGraphTensor** inputMPSGraphTensors_ = nil; + MPSGraphTensor* outputTensor_ = nil; + }; + + MPSGraphCache *cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + string key = "cat_out_mps:" + getMPSTypeString(result_type(inputs)) + + ":" + to_string(inputs.size()) + + ":" + to_string(dimension); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + // Initialize graph + MPSGraph *mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + // Create placeholders + MPSGraphTensor* inputMPSGraphTensors[inputs.size()]; + + for(int i = 0; i < inputs.size(); i++) + inputMPSGraphTensors[i] = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(result_type(inputs))); + + auto inputTensorsArray = [NSArray arrayWithObjects:inputMPSGraphTensors + count:inputs.size()]; + // Use concatTensors to concatenate + MPSGraphTensor* outputTensor = [mpsGraph concatTensors:inputTensorsArray + dimension:dimension // Maybe convert this from int64_t -> int32 + name:nil]; + + newCachedGraph->inputMPSGraphTensors_ = (MPSGraphTensor**)malloc(inputs.size() * sizeof(MPSGraphTensor*)); + + for(int i = 0; i < inputs.size(); i++) + newCachedGraph->inputMPSGraphTensors_[i] = inputMPSGraphTensors[i]; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + std::vector inputPlaceholders; + int i = 0; + for(const Tensor& tensor : materialized_inputs) { + Placeholder currentInputPlaceholder = Placeholder(cachedGraph->inputMPSGraphTensors_[i], tensor); + inputPlaceholders.push_back(currentInputPlaceholder); + i++; + } + + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out); + + NSMutableDictionary *feeds = [[NSMutableDictionary new] autorelease]; + for (int i = 0; i < inputs.size(); i++) { + feeds[(inputPlaceholders[i]).getMPSGraphTensor()] = (inputPlaceholders[i]).getMPSGraphTensorData(); + } + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + +} + +void upsample_backward_out_mps(const Tensor& grad_output, + IntArrayRef output_size, + IntArrayRef input_size, + c10::optional scales_h, + c10::optional scales_w, + const Tensor& grad_input, + MPSGraphResizeMode 
requested_mode, + bool requested_align_corners + ) +{ + using namespace mps; + int64_t input_dims = input_size.size(); + + TORCH_CHECK((input_dims == 4), + "NCHW tensor expected for input"); + + struct CachedGraph : public MPSCachedGraph { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *gradInputTensor = nil, *gradOutputTensor = nil; + }; + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + /* sizes */ + int64_t output_height = output_size[0]; + int64_t output_width = output_size[1]; + + int64_t input_n = input_size[0]; + int64_t input_c = input_size[1]; + int64_t input_height = input_size[2]; + int64_t input_width = input_size[3]; + + @autoreleasepool { + MPSShape* output_shape = getMPSShape(grad_output); + string key = string("upsample_backward:") + mps::getMPSShapeString(output_shape) + ":" + + getMPSTypeString(grad_output.scalar_type()) + + ":oh" + to_string(output_height) + ":ow" + to_string(output_width) + + ":ih" + to_string(input_height) + ":iw" + to_string(input_width) + + ":mode" + to_string(requested_mode); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + cachedGraph = static_cast(cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + newCachedGraph->gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(grad_input.scalar_type()), output_shape); + MPSGraphTensor * shapeTensor = [mpsGraph constantWithScalar:0 + shape:@[[NSNumber numberWithLong: input_n], + [NSNumber numberWithLong: input_c], + [NSNumber numberWithLong:input_height], + [NSNumber numberWithLong:input_width]] + dataType:getMPSDataType(grad_output.scalar_type())]; + + newCachedGraph->gradInputTensor = [mpsGraph resizeWithGradientTensor: newCachedGraph->gradOutputTensor + input: shapeTensor + mode: requested_mode + centerResult: true + alignCorners: requested_align_corners + layout: MPSGraphTensorNamedDataLayoutNCHW + name: nil]; + + } + return newCachedGraph; + })); + } + Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor, grad_output); + Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor, grad_input); + + NSDictionary* feeds = @{ + gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), + }; + NSDictionary* results = @{ + gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData() + }; + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + } +} + +TORCH_IMPL_FUNC(_upsample_nearest_exact2d_backward_out_mps) ( + const Tensor& grad_output, + IntArrayRef output_size, + IntArrayRef input_size, + c10::optional scales_h, + c10::optional scales_w, + const Tensor& grad_input) +{ + upsample_backward_out_mps(grad_output, output_size, input_size, scales_h, scales_w, grad_input, MPSGraphResizeNearest, false); +} + +TORCH_IMPL_FUNC(upsample_nearest2d_backward_out_mps) ( + const Tensor& grad_output, + IntArrayRef output_size, + IntArrayRef input_size, + c10::optional scales_h, + c10::optional scales_w, + const Tensor& grad_input) +{ + upsample_backward_out_mps(grad_output, output_size, input_size, scales_h, scales_w, grad_input, MPSGraphResizeNearest, false); +} + +TORCH_IMPL_FUNC(upsample_bilinear2d_backward_out_mps) ( + const Tensor& grad_output, + IntArrayRef output_size, + IntArrayRef input_size, + bool align_corners, + c10::optional scales_h, + 
c10::optional scales_w, + const Tensor& grad_input) +{ + upsample_backward_out_mps(grad_output, output_size, input_size, scales_h, scales_w, grad_input, MPSGraphResizeBilinear, align_corners); +} + +void upsample_out_mps(const Tensor& input, + IntArrayRef output_size, + c10::optional scales_h, + c10::optional scales_w, + const Tensor& output, + MPSGraphResizeMode requested_mode, + bool requested_align_corners) +{ + // Get stream + using namespace mps; + struct CachedGraph : public MPSCachedGraph { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor = nil, *outputTensor = nil; + }; + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + /* sizes */ + int64_t output_height = output_size[0]; + int64_t output_width = output_size[1]; + @autoreleasepool { + MPSShape* input_shape = getMPSShape(input); + NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; + string key = string("upsample_2d:") + mps::getMPSShapeString(input_shape) + ":" + + getMPSTypeString(input.scalar_type()) + + ":h" + to_string(output_height) + ":w" + to_string(output_width) + + ":mode" + to_string(requested_mode); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + cachedGraph = static_cast(cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), input_shape); + newCachedGraph->outputTensor = [mpsGraph resizeTensor:newCachedGraph->inputTensor + size:@[ @(output_height), @(output_width)] + mode:requested_mode + centerResult: true + alignCorners: requested_align_corners + layout: MPSGraphTensorNamedDataLayoutNCHW + name:nil]; + } + return newCachedGraph; + })); + } + Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor, input); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, output); + + NSDictionary* feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + } +} + +TORCH_IMPL_FUNC(_upsample_nearest_exact2d_out_mps) ( + const Tensor& input, + IntArrayRef output_size, + c10::optional scales_h, + c10::optional scales_w, + const Tensor& output) +{ + // Note: this differs from the CPU implementation in the way + // ties are resolved wrt to nearest mostly in cases where the scale + // is not an integer. 
+ // Example: + // For upsampling from (2, 5) to (2, 16) + // MPS: + // tensor([[[[0., 0., 0., 0., 1., 1., 1., 2., 2., 2., 3., 3., 3., 4., 4., 4.], + // [5., 5., 5., 5., 6., 6., 6., 7., 7., 7., 8., 8., 8., 9., 9., 9.]]]]) + // CPU: + // tensor([[[[0., 0., 0., 1., 1., 1., 2., 2., 2., 2., 3., 3., 3., 4., 4., 4.], + // [5., 5., 5., 6., 6., 6., 7., 7., 7., 7., 8., 8., 8., 9., 9., 9.]]]]) + using namespace mps; + upsample_out_mps(input, output_size, scales_h, scales_w, output, MPSGraphResizeNearest, false); +} + + +TORCH_IMPL_FUNC(upsample_nearest2d_out_mps) ( + const Tensor& input, + IntArrayRef output_size, + c10::optional scales_h, + c10::optional scales_w, + const Tensor& output) +{ + // Note: this differs from the CPU implementation in the way + // ties are resolved wrt to nearest mostly in cases where the scale + // is not an integer. + // Example: + // For upsampling from (2, 5) to (2, 16) + // MPS: + // tensor([[[[0., 0., 0., 0., 1., 1., 1., 2., 2., 2., 3., 3., 3., 4., 4., 4.], + // [5., 5., 5., 5., 6., 6., 6., 7., 7., 7., 8., 8., 8., 9., 9., 9.]]]]) + // CPU: + // tensor([[[[0., 0., 0., 1., 1., 1., 2., 2., 2., 2., 3., 3., 3., 4., 4., 4.], + // [5., 5., 5., 6., 6., 6., 7., 7., 7., 7., 8., 8., 8., 9., 9., 9.]]]]) + using namespace mps; + upsample_out_mps(input, output_size, scales_h, scales_w, output, MPSGraphResizeNearest, false); +} + +TORCH_IMPL_FUNC(upsample_bilinear2d_out_mps) ( + const Tensor& input, + IntArrayRef output_size, + bool align_corners, + c10::optional scales_h, + c10::optional scales_w, + const Tensor& output) +{ + using namespace mps; + upsample_out_mps(input, output_size, scales_h, scales_w, output, MPSGraphResizeBilinear, align_corners); +} +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/SoftMax.mm b/aten/src/ATen/native/mps/operations/SoftMax.mm new file mode 100644 index 000000000000..4246a37671e9 --- /dev/null +++ b/aten/src/ATen/native/mps/operations/SoftMax.mm @@ -0,0 +1,278 @@ +// Copyright © 2022 Apple Inc. 
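Rounding out Shape.mm before the softmax kernels: the tie-breaking difference documented in the nearest-upsample comments above can be reproduced from Python. A hedged sketch (not part of the patch); the MPS call only runs on an MPS-enabled build and may pick different source indices for the tied positions, exactly as the comment shows:

import torch
import torch.nn.functional as F

x = torch.arange(10.).reshape(1, 1, 2, 5)
print(F.interpolate(x, size=(2, 16), mode="nearest"))            # CPU reference

if torch.backends.mps.is_available():
    print(F.interpolate(x.to("mps"), size=(2, 16), mode="nearest").cpu())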
+ +#include +#include +#include + +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#endif + +namespace at { +namespace native { + +void get_shapes(MPSShape* input_shape_readonly, + NSMutableArray* &input_shape, + int num_input_dims, c10::MemoryFormat memory_format) { + // Modify the shape + if(memory_format == at::MemoryFormat::Contiguous) { + for(int i = 0; i < num_input_dims; i++) + input_shape[i] = input_shape_readonly[i]; + } + else { // ChannelsLast + auto num_channels = input_shape_readonly[1]; + input_shape[0] = input_shape_readonly[0]; + for(int i = 1; i < num_input_dims-1; i++) + input_shape[i] = input_shape_readonly[i+1]; + input_shape[num_input_dims-1] = num_channels; + } +} + +// Note - Currently only supported for 4D image tensors + +TORCH_IMPL_FUNC(softmax_mps_out) +(const Tensor& input_, + const int64_t dim, + const bool half_to_float, + const Tensor& output) { + + TORCH_CHECK(!half_to_float, "softmax with half to float conversion is not supported on MPS"); + + if (input_.numel() == 0) { + return; + } + + Tensor input; + if (input_.dim() == 0) { + input = input_.view(1); + } + else + input = input_; + + int64_t dim_ = maybe_wrap_dim(dim, input.dim()); + TORCH_CHECK( + dim_ >= 0 && dim_ < input.dim(), + "Softmax:dim must be non-negative and less than input dimensions"); + + const auto memory_format = input.suggest_memory_format(); + // TORCH_CHECK(input.suggest_memory_format() == output.suggest_memory_format(), "Input and output memory format should match") + + using namespace mps; + MPSStream* stream = getCurrentMPSStream(); + + // Derive from MPSCachedGraph + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + + string mem_format_key = get_mem_format_string(memory_format); + MPSShape* input_shape_readonly = mps::getMPSShape(input); + int num_input_dims = [input_shape_readonly count]; + // Check - Channels last implies 4d + TORCH_CHECK(memory_format != at::MemoryFormat::ChannelsLast || num_input_dims == 4, "ChannelsLast implies 4d tensor") + // Input shape changes based on memory format + NSMutableArray* input_shape = [NSMutableArray arrayWithCapacity:num_input_dims]; + + get_shapes(input_shape_readonly, input_shape, num_input_dims, memory_format); + + // Change dim + if(memory_format == at::MemoryFormat::ChannelsLast && dim_ > 0) { + switch(dim_) { + case 1: + dim_ = 3; + break; + case 2: + dim_ = 1; + break; + case 3: + dim_ = 2; + break; + default: + assert(0 && "Invalid dim\n"); + } + } + + NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; + + string key = "softmax_mps_out:" + mem_format_key + ":" + getMPSTypeString(input.scalar_type()) + ":" + + [ns_shape_key UTF8String] + ":" + std::to_string(dim_); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), input_shape); + + // passing selector of softMaxWithTensor on the mpsGraph object + MPSGraphTensor* outputTensor = [mpsGraph softMaxWithTensor:inputTensor + 
axis:(NSInteger)dim_ + name:nil]; + + // Output needs to be contiguous format + if(memory_format == at::MemoryFormat::ChannelsLast) { + auto N = input_shape[0]; + auto H = input_shape[1]; + auto W = input_shape[2]; + auto C = input_shape[3]; + + outputTensor = [mpsGraph reshapeTensor:outputTensor + withShape:@[N, ([NSNumber numberWithInt:[H intValue]* [W intValue]]), C] + name:nil]; + outputTensor = [mpsGraph transposeTensor:outputTensor + dimension:1 + withDimension:2 + name:nil]; + outputTensor = [mpsGraph reshapeTensor:outputTensor + withShape:@[N, C, H, W] + name:nil]; + + } + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input, input_shape); + // This must be the Contiguous shape + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + NSDictionary* feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + +} + +TORCH_IMPL_FUNC(softmax_backward_mps_out) +(const Tensor& grad_, + const Tensor& output_, + int64_t dim, + ScalarType input_dtype, + const Tensor& grad_input) { + + if (output_.numel() == 0) { + return; + } + + Tensor grad; + if (grad_.dim() == 0) { + grad = grad_.view(1); + } + else + grad = grad_; + + Tensor output; + if (output_.dim() == 0) { + output = output_.view(1); + } + else + output = output_; + + int64_t dim_ = maybe_wrap_dim(dim, grad.dim()); + TORCH_CHECK( + dim_ >= 0 && dim_ < grad.dim(), + "Grad:dim must be non-negative and less than input dimensions"); + + using namespace mps; + MPSStream* stream = getCurrentMPSStream(); + + // Derive from MPSCachedGraph + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* softmaxTensor_ = nil; + MPSGraphTensor* gradOutputTensor_ = nil; + MPSGraphTensor* gradInputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + + MPSShape* grad_shape = mps::getMPSShape(grad); + int num_grad_dims = [grad_shape count]; + + NSString* ns_shape_key = [[grad_shape valueForKey:@"description"] componentsJoinedByString:@","]; + + string key = "softmax_backward_mps_out:" + getMPSTypeString(output.scalar_type()) + ":" + + [ns_shape_key UTF8String] + ":" + std::to_string(dim_); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* softmaxTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(output.scalar_type()), grad_shape); + MPSGraphTensor* gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(grad.scalar_type()), grad_shape); + + MPSGraphTensor* mulTensor = [mpsGraph multiplicationWithPrimaryTensor:softmaxTensor + secondaryTensor:gradOutputTensor + name:nil]; + MPSGraphTensor* mulSumTensor = [mpsGraph reductionSumWithTensor:mulTensor + axis:(NSInteger)dim_ + name:nil]; + MPSGraphTensor* gradSubTensor = [mpsGraph subtractionWithPrimaryTensor:gradOutputTensor + 
secondaryTensor:mulSumTensor + name:nil]; + MPSGraphTensor* gradInputTensor = [mpsGraph multiplicationWithPrimaryTensor:softmaxTensor + secondaryTensor:gradSubTensor + name:nil]; + + newCachedGraph->softmaxTensor_ = softmaxTensor; + newCachedGraph->gradOutputTensor_ = gradOutputTensor; + newCachedGraph->gradInputTensor_ = gradInputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder softmaxPlaceholder = Placeholder(cachedGraph->softmaxTensor_, output, grad_shape); + Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad, grad_shape); + Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); + + NSDictionary* feeds = @{ + softmaxPlaceholder.getMPSGraphTensor() : softmaxPlaceholder.getMPSGraphTensorData(), + gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/TensorCompare.mm b/aten/src/ATen/native/mps/operations/TensorCompare.mm new file mode 100644 index 000000000000..a6c267290312 --- /dev/null +++ b/aten/src/ATen/native/mps/operations/TensorCompare.mm @@ -0,0 +1,344 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include + +namespace at { +namespace native { +namespace mps { + +struct CachedGraph : public MPSCachedGraph +{ + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor = nil, *outputTensor = nil; + MPSGraphTensor *minTensor = nil, *maxTensor = nil; +}; + +void clamp_mps_graph(CachedGraph* cachedGraph, const Tensor& input_tensor) +{ + MPSGraph *mpsGraph = cachedGraph->graph(); + + cachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_tensor); + + if (cachedGraph->minTensor && cachedGraph->maxTensor) { + cachedGraph->outputTensor = [mpsGraph clampWithTensor:cachedGraph->inputTensor + minValueTensor:cachedGraph->minTensor + maxValueTensor:cachedGraph->maxTensor + name:nil]; + } else if (cachedGraph->maxTensor) { + cachedGraph->outputTensor = [mpsGraph minimumWithPrimaryTensor:cachedGraph->inputTensor + secondaryTensor:cachedGraph->maxTensor + name:nil]; + } else if (cachedGraph->minTensor) { + cachedGraph->outputTensor = [mpsGraph maximumWithPrimaryTensor:cachedGraph->inputTensor + secondaryTensor:cachedGraph->minTensor + name:nil]; + } +} + +void clamp_tensor_out_mps(const Tensor& input_t, + const OptionalTensorRef min_opt, + const OptionalTensorRef max_opt, + const Tensor& output_t, + string op_name) +{ + const bool has_min = (min_opt.has_value() && min_opt->defined()); + const bool has_max = (max_opt.has_value() && max_opt->defined()); + + TORCH_CHECK(has_min || has_max, op_name + ": either min, max or both tensors must be defined") + if (has_min) + TORCH_CHECK(min_opt->is_same_size(input_t), op_name + ": min and input tensors must be of the same shape") + if (has_max) + TORCH_CHECK(max_opt->is_same_size(input_t), op_name + ": max and input tensors must be of the same shape") + + if (output_t.numel() == 0) + return; + + @autoreleasepool { + // the optional min/max refs could affect how we build the cached graph + string key = op_name + (has_min ? "_min" : "") + (has_max ? 
"_max" : "") + + "_tensor" + getTensorsStringKey({input_t}); + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if (!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + if (has_min) + newCachedGraph->minTensor = mpsGraphRankedPlaceHolder(mpsGraph, *min_opt); + if (has_max) + newCachedGraph->maxTensor = mpsGraphRankedPlaceHolder(mpsGraph, *max_opt); + + clamp_mps_graph(newCachedGraph, input_t); + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = Placeholder(cachedGraph->inputTensor, input_t); + auto outputPlaceholder = Placeholder(cachedGraph->outputTensor, output_t); + + NSMutableDictionary *feeds = [[NSMutableDictionary new] autorelease]; + feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData(); + if (has_min) { + auto minPlaceholder = Placeholder(cachedGraph->minTensor, *min_opt); + feeds[minPlaceholder.getMPSGraphTensor()] = minPlaceholder.getMPSGraphTensorData(); + } + if (has_max) { + auto maxPlaceholder = Placeholder(cachedGraph->maxTensor, *max_opt); + feeds[maxPlaceholder.getMPSGraphTensor()] = maxPlaceholder.getMPSGraphTensorData(); + } + + NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + } +} + +void clamp_scalar_out_mps(const Tensor& input_t, + const OptionalScalarRef min_opt, + const OptionalScalarRef max_opt, + const Tensor& output_t, + string op_name) +{ + using scalar_t = double; + + const bool has_min = (min_opt.has_value()); + const bool has_max = (max_opt.has_value()); + TORCH_CHECK(has_min || has_max, op_name + ": either min, max or both scalars must be defined") + + scalar_t min_scalar = std::numeric_limits::infinity(); + scalar_t max_scalar = -std::numeric_limits::infinity(); + + if (has_min) + min_scalar = min_opt.get().to(); + if (has_max) + max_scalar = max_opt.get().to(); + + if (output_t.numel() == 0) + return ; + + @autoreleasepool { + // the optional min/max refs could affect how we build the cached graph + string key = op_name + (has_min ? ("_min:" + to_string(min_scalar)) : "") + (has_max ? 
("_max:" + to_string(max_scalar)) : "") + + "_scalar:" + getTensorsStringKey({input_t}); + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if (!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + if (has_min) + newCachedGraph->minTensor = [mpsGraph constantWithScalar:min_scalar + shape:(mps::getMPSShape(input_t)) + dataType:(mps::getMPSScalarType(input_t.scalar_type())) ]; + if (has_max) + newCachedGraph->maxTensor = [mpsGraph constantWithScalar:max_scalar + shape:(mps::getMPSShape(input_t)) + dataType:(mps::getMPSScalarType(input_t.scalar_type())) ]; + + clamp_mps_graph(newCachedGraph, input_t); + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = Placeholder(cachedGraph->inputTensor , input_t); + auto outputPlaceholder = Placeholder(cachedGraph->outputTensor, output_t); + + NSDictionary *feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + }; + NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + } +} + +} // namespace mps + +// APIs exposed to at::native scope +TORCH_IMPL_FUNC(clamp_Tensor_out_mps) +(const Tensor& input_t, const OptionalTensorRef min, const OptionalTensorRef max, const Tensor& output_t) +{ + mps::clamp_tensor_out_mps(input_t, min, max, output_t, __func__); +} + +TORCH_IMPL_FUNC(clamp_out_mps) +(const Tensor& input_t, const OptionalScalarRef min, const OptionalScalarRef max, const Tensor& output_t) +{ + mps::clamp_scalar_out_mps(input_t, min, max, const_cast(output_t), "clamp_out_mps"); +} + +TORCH_IMPL_FUNC(clamp_min_Tensor_out_mps) +(const Tensor& input_t, const Tensor& min, const Tensor& output_t) +{ + mps::clamp_tensor_out_mps(input_t, min, at::OptionalTensorRef(), output_t, __func__); +} + +TORCH_IMPL_FUNC(clamp_min_out_mps) +(const Tensor& input_t, const Scalar& min, const Tensor& output_t) +{ + mps::clamp_scalar_out_mps(input_t, min, at::OptionalScalarRef(), output_t, __func__); +} + +TORCH_IMPL_FUNC(clamp_max_Tensor_out_mps) +(const Tensor& input_t, const Tensor& max, const Tensor& output_t) +{ + mps::clamp_tensor_out_mps(input_t, at::OptionalTensorRef(), max, output_t, __func__); +} + +TORCH_IMPL_FUNC(clamp_max_out_mps) +(const Tensor& input_t, const Scalar& max, const Tensor& output_t) +{ + mps::clamp_scalar_out_mps(input_t, at::OptionalScalarRef(), max, output_t, __func__); +} + +Tensor& where_self_out_mps(const Tensor& condition, + const Tensor& self, + const Tensor& other, + Tensor& out) { + TORCH_CHECK(self.dtype() == other.dtype(), "expected scalar type ", self.dtype(), " but found ", other.dtype()); + + if (condition.scalar_type() == ScalarType::Byte) { + TORCH_WARN_ONCE("where received a uint8 condition tensor. This behavior is deprecated and will be removed in a future version of PyTorch. Use a boolean condition instead."); + } else { + TORCH_CHECK(condition.scalar_type() == ScalarType::Bool, "where expected condition to be a boolean tensor, but got a tensor with dtype ", condition.scalar_type()); + } + Tensor cond_bool = condition.scalar_type() == ScalarType::Byte ? 
condition.to(ScalarType::Bool) : condition; + + using namespace mps; + MPSStream* stream = getCurrentMPSStream(); + + // Empty output + if(out.numel() == 0) + return out; + + // Derive from MPSCachedGraph + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* conditionTensor_ = nil; + MPSGraphTensor* selfTensor_ = nil; + MPSGraphTensor* otherTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + + MPSShape* input_shape = getMPSShape(self); + + string key = "where_self_out_mps:" + getTensorsStringKey({cond_bool, self, other}); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* conditionTensor = mpsGraphRankedPlaceHolder(mpsGraph, cond_bool); + MPSGraphTensor* selfTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + MPSGraphTensor* otherTensor = mpsGraphRankedPlaceHolder(mpsGraph, other); + + MPSGraphTensor* outputTensor = [mpsGraph selectWithPredicateTensor:conditionTensor + truePredicateTensor:selfTensor + falsePredicateTensor:otherTensor + name:nil]; + + newCachedGraph->conditionTensor_ = conditionTensor; + newCachedGraph->selfTensor_ = selfTensor; + newCachedGraph->otherTensor_ = otherTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder conditionPlaceholder = Placeholder(cachedGraph->conditionTensor_, cond_bool); + Placeholder selfPlaceholder = Placeholder(cachedGraph->selfTensor_, self); + Placeholder otherPlaceholder = Placeholder(cachedGraph->otherTensor_, other); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out); + + NSDictionary* feeds = @{ + conditionPlaceholder.getMPSGraphTensor() : conditionPlaceholder.getMPSGraphTensorData(), + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), + otherPlaceholder.getMPSGraphTensor() : otherPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } + + return out; +} + +Tensor where_mps(const Tensor& condition, + const Tensor& self, + const Tensor& other) { + + auto cond_shape = condition.sizes(); + auto self_shape = self.sizes(); + auto other_shape = other.sizes(); + + bool cond_zero_shape = (condition.dim() == 0); + bool self_zero_shape = (self.dim() == 0); + bool other_zero_shape = (other.dim() == 0); + + auto max_dim = std::max(condition.dim(), std::max(self.dim(), other.dim())); + + auto sum_dims = condition.dim() + self.dim() + other.dim(); + + TORCH_CHECK(max_dim == 0 || !(sum_dims % max_dim), "All inputs of where should have same/compatible number of dims") + + int64_t out_arr[max_dim]; + + // Broadcasted output shape + for(int i = 0; i < max_dim; i++) { + + int64_t cond_num = cond_zero_shape ? 0 : condition.size(i); + int64_t self_num = self_zero_shape ? 0 : self.size(i); + int64_t other_num = other_zero_shape ? 
0 : other.size(i); + + out_arr[i] = std::max(cond_num, std::max(self_num, other_num)); + } + + Tensor ret = empty_mps(IntArrayRef(out_arr, max_dim), + self.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + self.suggest_memory_format()); + return where_self_out_mps(condition, self, other, ret); + +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/TriangularOps.mm b/aten/src/ATen/native/mps/operations/TriangularOps.mm new file mode 100644 index 000000000000..6a29d080cb6c --- /dev/null +++ b/aten/src/ATen/native/mps/operations/TriangularOps.mm @@ -0,0 +1,370 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include + +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#endif + +namespace at { +namespace native { + +TORCH_IMPL_FUNC(triu_mps_out) +(const Tensor& self, + int64_t k, + const Tensor &output) { + + using namespace mps; + MPSStream* stream = getCurrentMPSStream(); + + // Derive from MPSCachedGraph + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + string key = "triu_mps_out" + mps::getTensorsStringKey({self}) + ":" + std::to_string(k); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + MPSGraphTensor* outputTensor = nil; + + MPSGraphTensor* minusOneTensor = [mpsGraph constantWithScalar:-1 + dataType:MPSDataTypeInt32]; + + if(k > 0) { + MPSGraphTensor* diagMinusOneTensor = [mpsGraph constantWithScalar:(k-1) + dataType:MPSDataTypeInt32]; + MPSGraphTensor* complementTensor = [mpsGraph bandPartWithTensor:inputTensor + numLowerTensor:minusOneTensor + numUpperTensor:diagMinusOneTensor + name:nil]; + outputTensor = [mpsGraph subtractionWithPrimaryTensor:inputTensor + secondaryTensor:complementTensor + name:nil]; + } + else { + MPSGraphTensor* minusDiagTensor = [mpsGraph constantWithScalar:(-k) + dataType:MPSDataTypeInt32]; + outputTensor = [mpsGraph bandPartWithTensor:inputTensor + numLowerTensor:minusDiagTensor + numUpperTensor:minusOneTensor + name:nil]; + } + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + +} + +TORCH_IMPL_FUNC(tril_mps_out) +(const Tensor& self, + int64_t k, + const Tensor &output) { + + using namespace mps; + MPSStream* stream = getCurrentMPSStream(); + + // Derive from MPSCachedGraph + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor 
*outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + string key = "tril_mps_out" + mps::getTensorsStringKey({self}) + ":" + std::to_string(k); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + MPSGraphTensor* outputTensor = nil; + + MPSGraphTensor* minusOneTensor = [mpsGraph constantWithScalar:-1 + dataType:MPSDataTypeInt32]; + + if(k >= 0) { + MPSGraphTensor* diagTensor = [mpsGraph constantWithScalar:k + dataType:MPSDataTypeInt32]; + outputTensor = [mpsGraph bandPartWithTensor:inputTensor + numLowerTensor:minusOneTensor + numUpperTensor:diagTensor + name:nil]; + } + else { + MPSGraphTensor* negDiagMinusOneTensor = [mpsGraph constantWithScalar:(-k-1) + dataType:MPSDataTypeInt32]; + MPSGraphTensor* complementTensor = [mpsGraph bandPartWithTensor:inputTensor + numLowerTensor:negDiagMinusOneTensor + numUpperTensor:minusOneTensor + name:nil]; + outputTensor = [mpsGraph subtractionWithPrimaryTensor:inputTensor + secondaryTensor:complementTensor + name:nil]; + } + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + +} + +Tensor& diag_mps_out(const Tensor& self, + int64_t diagonal, + Tensor &output) { + + // Do checks, resize output + IntArrayRef input_size = self.sizes(); + auto num_input_dims = input_size.size(); + // Input can only be 1D or 2D + TORCH_CHECK(num_input_dims == 1 || num_input_dims == 2, + "diag_mps_out: Input tensor must be 1D or 2D") + + if(num_input_dims == 1) { + auto n = input_size[0]; + if(diagonal > 0) + n += diagonal; + else if(diagonal < 0) + n -= diagonal; + + output.resize_({n, n}); + } + else if(num_input_dims == 2) { + auto num_diag_elements = std::min(input_size[0], input_size[1]); + if(diagonal > 0) { + TORCH_CHECK(input_size[1] - diagonal > 0, "Matrix not big enough for requested diagonal") + num_diag_elements = std::min(input_size[0], input_size[1] - diagonal); + } + else if(diagonal < 0) { + TORCH_CHECK(input_size[0] + diagonal > 0, "Matrix not big enough for requested diagonal") + num_diag_elements = std::min(input_size[0] + diagonal, input_size[1]); + } + + output.resize_({num_diag_elements}); + } + + using namespace mps; + MPSStream* stream = getCurrentMPSStream(); + + // Derive from MPSCachedGraph + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + + MPSShape* input_shape = getMPSShape(self); + MPSShape* output_shape = getMPSShape(output); + NSNumber* 
num_input_cols = nil; + NSNumber* num_output_cols = nil; + NSMutableArray* flat_input_shape = nil; + NSMutableArray* flat_output_shape = nil; + if(num_input_dims == 1) { + num_output_cols = output_shape[1]; + flat_output_shape = [NSMutableArray arrayWithCapacity:1]; + flat_output_shape[0] = [NSNumber numberWithInt:[output_shape[0] intValue] * [output_shape[1] intValue]]; + } + else if(num_input_dims == 2) { + num_input_cols = input_shape[1]; + flat_input_shape = [NSMutableArray arrayWithCapacity:1]; + flat_input_shape[0] = [NSNumber numberWithInt:[input_shape[0] intValue] * [input_shape[1] intValue]]; + } + NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; + string key = "diag_mps_out:" + getMPSTypeString(self.scalar_type()) + ":" + std::to_string(diagonal) + + ":" + string([ns_shape_key UTF8String]); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + // TODO: Accept this as the flat version in 2D case + MPSGraphTensor* inputTensor = nil; + if(num_input_dims == 1) + inputTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(self.scalar_type())); + else + inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(self.scalar_type()), flat_input_shape); + + MPSGraphTensor* outputTensor = nil; + + MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0 + dataType:MPSDataTypeInt32]; + MPSGraphTensor* numDiagElementsRange = nil; + MPSGraphTensor* diagOffset = nil; + MPSGraphTensor* rowMultiplier = nil; + MPSGraphTensor* rowIndices = nil; + MPSGraphTensor* colIndices = nil; + MPSGraphTensor* indicesTensor = nil; + + if(num_input_dims == 1) { + int shape_data[1] = {[input_shape[0] intValue]}; + MPSGraphTensor* inputShapeTensor = [mpsGraph constantWithData:[NSData dataWithBytes:shape_data length:sizeof(int)] + shape:@[@1] + dataType:MPSDataTypeInt32]; + numDiagElementsRange = [mpsGraph getCoordinateValueWithShapeTensor:inputShapeTensor + axisTensor:zeroTensor + name:nil]; + diagOffset = [mpsGraph constantWithScalar:diagonal + dataType:MPSDataTypeInt32]; + rowMultiplier = [mpsGraph constantWithScalar:[num_output_cols intValue] + dataType:MPSDataTypeInt32]; + } + else { + int shape_data[1] = {[output_shape[0] intValue]}; + MPSGraphTensor* outputShapeTensor = [mpsGraph constantWithData:[NSData dataWithBytes:shape_data length:sizeof(int)] + shape:@[@1] + dataType:MPSDataTypeInt32]; + numDiagElementsRange = [mpsGraph getCoordinateValueWithShapeTensor:outputShapeTensor + axisTensor:zeroTensor + name:nil]; + diagOffset = [mpsGraph constantWithScalar:diagonal + dataType:MPSDataTypeInt32]; + rowMultiplier = [mpsGraph constantWithScalar:[num_input_cols intValue] + dataType:MPSDataTypeInt32]; + } + + if(diagonal >= 0) { + rowIndices = numDiagElementsRange; + colIndices = [mpsGraph additionWithPrimaryTensor:numDiagElementsRange + secondaryTensor:diagOffset + name:nil]; + } + else { + rowIndices = [mpsGraph subtractionWithPrimaryTensor:numDiagElementsRange + secondaryTensor:diagOffset + name:nil];; + colIndices = numDiagElementsRange; + } + + indicesTensor = [mpsGraph multiplicationWithPrimaryTensor:rowIndices + secondaryTensor:rowMultiplier + name:nil]; + indicesTensor = [mpsGraph additionWithPrimaryTensor:indicesTensor + secondaryTensor:colIndices + name:nil]; + + 
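+ // For a 1-D input, the diagonal values are scattered into the flattened
+ // (n x n) output at linear indices rowIndices * num_output_cols + colIndices
+ // and the result is reshaped back to (n, n); for a 2-D input, the requested
+ // diagonal is gathered from the flattened input at linear indices
+ // rowIndices * num_input_cols + colIndices.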
if(num_input_dims == 1) { + // TODO: Scatter mode doesn't matter, so what should I set it to be? + outputTensor = [mpsGraph scatterWithUpdatesTensor:inputTensor + indicesTensor:indicesTensor + shape:flat_output_shape + axis:0 + mode:MPSGraphScatterModeAdd + name:nil]; + outputTensor = [mpsGraph reshapeTensor:outputTensor + withShape:output_shape + name:nil]; + } + else if(num_input_dims == 2) { + outputTensor = [mpsGraph gatherWithUpdatesTensor:inputTensor + indicesTensor:indicesTensor + axis:0 + batchDimensions:0 + name:nil]; + } + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(); + if(num_input_dims == 1) + selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + else + selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, flat_input_shape); + + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + return output; +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm new file mode 100644 index 000000000000..528b1643ff6c --- /dev/null +++ b/aten/src/ATen/native/mps/operations/UnaryOps.mm @@ -0,0 +1,174 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +namespace mps { + +typedef MPSGraphTensor* (^UnaryOpBlock)(MPSGraph*, MPSGraphTensor*); + +void unary_op(const Tensor& self_t, const Tensor& output, std::string op_name, UnaryOpBlock unaryBlock) +{ + Tensor self = self_t.contiguous(at::MemoryFormat::Contiguous); + if (!output.is_same_size(self)) { + output.resize_(self.sizes()); + } + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor = nil, *outputTensor = nil; + }; + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + @autoreleasepool { + string key = op_name + getTensorsStringKey({self}); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph* () { + CachedGraph *newCachedGraph = nil; + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + newCachedGraph->outputTensor = unaryBlock(mpsGraph, newCachedGraph->inputTensor); + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor, self); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, output); + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + } +} + +MPSGraphTensor* trunc_tensor(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) +{ + 
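The tril_mps_out kernel at the top of this hunk lowers torch.tril(x, k) onto MPSGraph's bandPartWithTensor: for k >= 0 it keeps every sub-diagonal plus k super-diagonals directly, and for k < 0 it builds the complementary band (numLower = -k-1, numUpper = -1, i.e. unbounded above) and subtracts it from the input. Below is a minimal eager-mode sketch of that identity; the hand-rolled band_part helper and its "negative bound means no limit" convention are my reading of the MPSGraph op, not taken from the kernel.

```python
# Sketch of the band-part identity used by tril_mps_out above.
import torch

def band_part(x, num_lower, num_upper):
    # Keep entries with col - row in [-num_lower, num_upper];
    # a negative bound means "no limit on that side" (assumed bandPart semantics).
    rows = torch.arange(x.size(-2)).unsqueeze(-1)
    cols = torch.arange(x.size(-1)).unsqueeze(0)
    diff = cols - rows
    keep = torch.ones_like(diff, dtype=torch.bool)
    if num_lower >= 0:
        keep &= diff >= -num_lower
    if num_upper >= 0:
        keep &= diff <= num_upper
    return torch.where(keep, x, torch.zeros_like(x))

x = torch.randn(5, 5)
for k in range(-4, 5):
    if k >= 0:
        ref = band_part(x, -1, k)            # everything below, k diagonals above
    else:
        ref = x - band_part(x, -k - 1, -1)   # subtract the complementary band
    assert torch.equal(ref, torch.tril(x, diagonal=k)), k
```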
MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0 + dataType:inputTensor.dataType]; + MPSGraphTensor* predicateTensor = [mpsGraph lessThanWithPrimaryTensor:inputTensor + secondaryTensor:zeroTensor + name:nil]; + return [mpsGraph selectWithPredicateTensor:predicateTensor + truePredicateTensor:[mpsGraph ceilWithTensor :inputTensor name:nil] + falsePredicateTensor:[mpsGraph floorWithTensor:inputTensor name:nil] + name:nil]; +}; + +} // namespace mps + +TORCH_IMPL_FUNC(trunc_out_mps) (const Tensor& self, const Tensor& output) { + mps::unary_op(self, output, "trunc_out_mps", + ^ MPSGraphTensor* (MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) + { return mps::trunc_tensor(mpsGraph, inputTensor); }); +} + +#define CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(func_out, func_stub) \ +TORCH_IMPL_FUNC(func_out) (const Tensor& self, const Tensor& output) { \ + mps::unary_op(self, output, #func_out, \ + ^ MPSGraphTensor* (MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) \ + { return [mpsGraph func_stub##WithTensor:inputTensor name:nil]; }); \ +} + +#define CREATE_MPS_UNARY_TORCH_IMPL_FUNC(func_out, func_stub) \ +Tensor& func_out(const Tensor& self, Tensor& output) { \ + mps::unary_op(self, output, #func_out, \ + ^ MPSGraphTensor* (MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) \ + { return [mpsGraph func_stub##WithTensor:inputTensor name:nil]; }); \ + return output; \ +} + + +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(exp_out_mps, exponent) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(exp2_out_mps, exponentBase2) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(reciprocal_out_mps, reciprocal) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(sqrt_out_mps, squareRoot) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(rsqrt_out_mps, reverseSquareRoot) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(sign_out_mps, sign) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(neg_out_mps, negative) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(log_out_mps, logarithm) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(log10_out_mps, logarithmBase10) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(log2_out_mps, logarithmBase2) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(ceil_out_mps, ceil) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(floor_out_mps, floor) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(round_out_mps, round) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(erf_out_mps, erf) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(sin_out_mps, sin) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(cos_out_mps, cos) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(tan_out_mps, tan) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(asin_out_mps, asin) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(acos_out_mps, acos) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(atan_out_mps, atan) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(sinh_out_mps, sinh) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(cosh_out_mps, cosh) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(tanh_out_mps, tanh) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(asinh_out_mps, asinh) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(acosh_out_mps, acosh) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(atanh_out_mps, atanh) + +CREATE_MPS_UNARY_TORCH_IMPL_FUNC(abs_out_mps, absolute) + +TORCH_IMPL_FUNC(log1p_out_mps) (const Tensor& self, const Tensor& output) +{ + using namespace mps; + if (!output.is_same_size(self)) { + output.resize_(self.sizes()); + } + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor = nil, 
*outputTensor = nil; + }; + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + @autoreleasepool { + string key = string("log1p_out_mps") + getTensorsStringKey({self}); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph* () { + CachedGraph *newCachedGraph = nil; + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + MPSGraphTensor* oneTensor = [mpsGraph constantWithScalar:1.0 + shape:getMPSShape(self) + dataType:mps::getMPSDataType(self.scalar_type())]; + MPSGraphTensor* addedTensor = [mpsGraph additionWithPrimaryTensor:newCachedGraph->inputTensor + secondaryTensor:oneTensor + name:nil]; + newCachedGraph->outputTensor = [mpsGraph logarithmWithTensor:addedTensor + name:nil]; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor, self); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, output); + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + } +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 8c333efd3bf7..d6b5adf593a6 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -145,6 +145,7 @@ - func: rename_(Tensor(a!) self, Dimname[]? names) -> Tensor(a!) variants: method + tags: inplace_view - func: rename(Tensor(a) self, Dimname[]? names) -> Tensor(a) variants: method @@ -274,6 +275,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: abs_out + MPS: abs_out_mps SparseCPU, SparseCUDA: abs_sparse_out SparseCsrCPU, SparseCsrCUDA: abs_sparse_csr_out @@ -328,12 +330,12 @@ - func: view_as_real(Tensor(a) self) -> Tensor(a) variants: function dispatch: - CPU, CUDA: view_as_real + CPU, CUDA, MPS, Meta: view_as_real - func: view_as_complex(Tensor(a) self) -> Tensor(a) variants: function dispatch: - CPU, CUDA: view_as_complex + CPU, CUDA, Meta: view_as_complex - func: sgn(Tensor self) -> Tensor variants: function, method @@ -357,6 +359,9 @@ SparseCPU, SparseCUDA: sgn_sparse_out SparseCsrCPU, SparseCsrCUDA: sgn_sparse_csr_out +- func: chalf(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor + variants: method + - func: real(Tensor(a) self) -> Tensor(a) device_check: NoCheck # TensorIterator variants: function @@ -422,6 +427,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: acos_out + MPS: acos_out_mps # arccos, alias of acos - func: arccos(Tensor self) -> Tensor @@ -448,6 +454,7 @@ SparseCsrCPU, SparseCsrCUDA: add_sparse_csr MkldnnCPU: mkldnn_add ZeroTensor: add_zerotensor + NestedTensorCPU, NestedTensorCUDA: NestedTensor_add_Tensor - func: add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) 
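Most kernels in the new UnaryOps.mm above map one-to-one onto MPSGraph ops through the CREATE_MPS_*_TORCH_IMPL_FUNC macros; the two hand-written cases are trunc, composed as a select between ceil (negative inputs) and floor (everything else), and log1p, composed as log(1 + x) with a constant one-tensor. A quick eager-mode check of both compositions (plain Python, not the Metal path):

```python
# Check the two compositions used in UnaryOps.mm:
# trunc(x) == where(x < 0, ceil(x), floor(x)) and log1p(x) ~= log(1 + x).
import torch

x = torch.randn(10) * 5
trunc_ref = torch.where(x < 0, torch.ceil(x), torch.floor(x))
assert torch.equal(trunc_ref, torch.trunc(x))

y = torch.rand(10) + 0.1          # keep inputs away from zero for this check
log1p_ref = torch.log(1 + y)
assert torch.allclose(log1p_ref, torch.log1p(y))
```

Worth keeping in mind: composing log(1 + x) gives up the extra accuracy torch.log1p normally provides for very small inputs.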
device_check: NoCheck # TensorIterator @@ -457,18 +464,22 @@ SparseCPU, SparseCUDA: add_sparse_ SparseCsrCPU, SparseCsrCUDA: add_sparse_csr_ MkldnnCPU: mkldnn_add_ + NestedTensorCPU, NestedTensorCUDA: NestedTensor_add__Tensor - func: add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase + ufunc_inner_loop: + Generic: add (AllAndComplex, BFloat16, Half, ComplexHalf) + ScalarOnly: add (Bool) dispatch: - CPU, CUDA: add_out SparseCPU: add_out_sparse_cpu SparseCUDA: add_out_sparse_cuda SparseCsrCPU: add_out_sparse_csr_cpu SparseCsrCUDA: add_out_sparse_csr_cuda MkldnnCPU: mkldnn_add_out + MPS: add_out_mps - func: _add_relu.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor variants: function @@ -521,6 +532,7 @@ dispatch: CPU: addmv_out_cpu CUDA: addmv_out_cuda + MPS: addmv_out_mps SparseCsrCPU: addmv_out_sparse_csr SparseCsrCUDA: addmv_out_sparse_csr_cuda @@ -560,6 +572,7 @@ - dim -> int dim dispatch: CPU, CUDA: all_out + MPS: all_out_mps - func: all.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor device_check: NoCheck # TensorIterator @@ -583,6 +596,7 @@ - dim -> int dim dispatch: CPU, CUDA: any_out + MPS: any_out_mps - func: any.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor device_check: NoCheck # TensorIterator @@ -603,6 +617,7 @@ dispatch: CPU, Meta: arange_out CUDA: arange_cuda_out + MPS: arange_mps_out # This function is a temporary hack to allow tracing of arange like constructs with dynamic # bounds on arange. Normal arange is not traceable because it does not take any tensor inputs; @@ -620,6 +635,7 @@ structured: True dispatch: CPU, CUDA: argmax_out + MPS: argmax_out_mps - func: argmin(Tensor self, int? dim=None, bool keepdim=False) -> Tensor structured_delegate: argmin.out @@ -644,6 +660,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: acosh_out + MPS: acosh_out_mps # arccosh, alias for acosh - func: arccosh(Tensor self) -> Tensor @@ -673,6 +690,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: asinh_out + MPS: asinh_out_mps SparseCPU, SparseCUDA: asinh_sparse_out SparseCsrCPU, SparseCsrCUDA: asinh_sparse_csr_out @@ -705,6 +723,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: atanh_out + MPS: atanh_out_mps SparseCPU, SparseCUDA: atanh_sparse_out SparseCsrCPU, SparseCsrCUDA: atanh_sparse_csr_out @@ -721,6 +740,7 @@ variants: function, method dispatch: ZeroTensor, CPU, CUDA, Meta: as_strided_tensorimpl + MPS: as_strided_tensorimpl_mps QuantizedCPU, QuantizedCUDA: as_strided_qtensorimpl device_check: NoCheck device_guard: False @@ -756,6 +776,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: asin_out + MPS: asin_out_mps SparseCPU, SparseCUDA: asin_sparse_out SparseCsrCPU, SparseCsrCUDA: asin_sparse_csr_out @@ -790,6 +811,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: atan_out + MPS: atan_out_mps SparseCPU, SparseCUDA: atan_sparse_out SparseCsrCPU, SparseCsrCUDA: atan_sparse_csr_out @@ -833,6 +855,7 @@ dispatch: CPU: baddbmm_out_cpu CUDA: baddbmm_out_cuda + MPS: baddbmm_out_mps SparseCsrCUDA: baddbmm_out_sparse_csr_cuda - func: bartlett_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -861,18 +884,21 @@ variants: function dispatch: CPU, CUDA: bernoulli_out + MPS: bernoulli_out_mps - func: bernoulli_.Tensor(Tensor(a!) self, Tensor p, *, Generator? 
generator=None) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: CPU, CUDA: bernoulli_ + MPS: bernoulli_mps_ - func: bernoulli_.float(Tensor(a!) self, float p=0.5, *, Generator? generator=None) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: CPU, CUDA: bernoulli_ + MPS: bernoulli_mps_ # This out-of-place version isn't used explicitly, but needed by jit. # There is no default valid on `p` here because it would introduce ambiguity @@ -890,6 +916,7 @@ dispatch: CPU: binary_cross_entropy_cpu CUDA: binary_cross_entropy_cuda + MPS: binary_cross_entropy_mps - func: binary_cross_entropy.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator @@ -898,6 +925,7 @@ dispatch: CPU: binary_cross_entropy_out_cpu CUDA: binary_cross_entropy_out_cuda + MPS: binary_cross_entropy_out_mps - func: binary_cross_entropy_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean) -> Tensor python_module: nn @@ -905,6 +933,7 @@ dispatch: CPU: binary_cross_entropy_backward_cpu CUDA: binary_cross_entropy_backward_cuda + MPS: binary_cross_entropy_backward_mps - func: binary_cross_entropy_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -912,6 +941,7 @@ dispatch: CPU: binary_cross_entropy_backward_out_cpu CUDA: binary_cross_entropy_backward_out_cuda + MPS: binary_cross_entropy_backward_out_mps - func: binary_cross_entropy_with_logits(Tensor self, Tensor target, Tensor? weight=None, Tensor? pos_weight=None, int reduction=Mean) -> Tensor device_check: NoCheck # TensorIterator @@ -1061,6 +1091,7 @@ dispatch: CPU: bmm_out_cpu CUDA: bmm_out_cuda + MPS: bmm_out_mps SparseCPU: bmm_out_sparse_cpu SparseCUDA: bmm_out_sparse_cuda SparseCsrCUDA: bmm_out_sparse_csr_cuda @@ -1078,12 +1109,20 @@ SparseCPU, SparseCUDA: sparse_broadcast_to - func: cat(Tensor[] tensors, int dim=0) -> Tensor + structured_delegate: cat.out dispatch: - CompositeExplicitAutograd: cat + SparseCPU, SparseCUDA: cat_sparse + QuantizedCPU: cat_quantized_cpu - func: cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!) + structured: True + precomputed: + - dim -> int dim, int valid, bool all_contiguous, bool all_same_dtype, bool all_same_sizes_and_stride, MemoryFormat memory_format dispatch: - CompositeExplicitAutograd: cat_out + CPU: cat_out_cpu + CUDA: cat_out_cuda + MPS: cat_out_mps + QuantizedCPU: cat_out_quantized_cpu - func: cat.names(Tensor[] tensors, Dimname dim) -> Tensor @@ -1125,6 +1164,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: ceil_out + MPS: ceil_out_mps SparseCPU, SparseCUDA: ceil_sparse_out SparseCsrCPU, SparseCsrCUDA: ceil_sparse_csr_out @@ -1164,8 +1204,7 @@ - func: clamp.Tensor(Tensor self, Tensor? min=None, Tensor? max=None) -> Tensor variants: function, method - dispatch: - CPU, CUDA: clamp + structured_delegate: clamp.Tensor_out - func: clamp_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!) device_check: NoCheck # TensorIterator @@ -1177,8 +1216,7 @@ - func: clamp_.Tensor(Tensor(a!) self, Tensor? min=None, Tensor? max=None) -> Tensor(a!) variants: function, method - dispatch: - CompositeExplicitAutograd: clamp_ + structured_delegate: clamp.Tensor_out - func: clamp.out(Tensor self, Scalar? min=None, Scalar? max=None, *, Tensor(a!) out) -> Tensor(a!) 
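The clamp, clamp_min and clamp_max entries in this hunk move from per-variant dispatch to structured kernels, so the functional, in-place and out= spellings of each op are all served by one .out kernel, which is also where the new MPS registrations (clamp_out_mps, clamp_Tensor_out_mps, and friends) plug in. A small usage sketch of the three spellings that now share a kernel (run on CPU here; on an MPS-enabled build the same calls route to the *_mps kernels):

```python
# The functional, in-place and out= spellings below all funnel into the
# single structured clamp.Tensor_out kernel declared in this hunk.
import torch

x = torch.linspace(-2, 2, steps=9)
lo, hi = torch.tensor(-1.0), torch.tensor(1.0)

a = torch.clamp(x, min=lo, max=hi)            # functional -> clamp.Tensor
b = x.clone()
b.clamp_(min=lo, max=hi)                      # in-place   -> clamp_.Tensor
c = torch.empty_like(x)
torch.clamp(x, min=lo, max=hi, out=c)         # explicit   -> clamp.Tensor_out

assert torch.equal(a, b) and torch.equal(a, c)
```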
device_check: NoCheck # TensorIterator @@ -1187,73 +1225,83 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: clamp_out + MPS: clamp_out_mps - func: clamp.Tensor_out(Tensor self, Tensor? min=None, Tensor? max=None, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: clamp_out + CPU, CUDA: clamp_Tensor_out + MPS: clamp_Tensor_out_mps - func: clamp_max(Tensor self, Scalar max) -> Tensor device_check: NoCheck # TensorIterator variants: function, method - dispatch: - CompositeExplicitAutograd: clamp_max + structured_delegate: clamp_max.out - func: clamp_max.Tensor(Tensor self, Tensor max) -> Tensor variants: function, method - dispatch: - CompositeExplicitAutograd: clamp_max + structured_delegate: clamp_max.Tensor_out - func: clamp_max_(Tensor(a!) self, Scalar max) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function, method - dispatch: - CompositeExplicitAutograd: clamp_max_ + structured_delegate: clamp_max.out - func: clamp_max_.Tensor(Tensor(a!) self, Tensor max) -> Tensor(a!) variants: function, method - dispatch: - CompositeExplicitAutograd: clamp_max_ + structured_delegate: clamp_max.Tensor_out - func: clamp_max.out(Tensor self, Scalar max, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: clamp_max_out + MPS: clamp_max_out_mps - func: clamp_max.Tensor_out(Tensor self, Tensor max, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: clamp_max_out + CPU, CUDA: clamp_max_Tensor_out + MPS: clamp_max_Tensor_out_mps - func: clamp_min(Tensor self, Scalar min) -> Tensor device_check: NoCheck # TensorIterator variants: function, method - dispatch: - CompositeExplicitAutograd: clamp_min + structured_delegate: clamp_min.out - func: clamp_min.Tensor(Tensor self, Tensor min) -> Tensor variants: function, method - dispatch: - CompositeExplicitAutograd: clamp_min + structured_delegate: clamp_min.Tensor_out - func: clamp_min_(Tensor(a!) self, Scalar min) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function, method - dispatch: - CompositeExplicitAutograd: clamp_min_ + structured_delegate: clamp_min.out - func: clamp_min_.Tensor(Tensor(a!) self, Tensor min) -> Tensor(a!) variants: function, method - dispatch: - CompositeExplicitAutograd: clamp_min_ + structured_delegate: clamp_min.Tensor_out - func: clamp_min.out(Tensor self, Scalar min, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: clamp_min_out + MPS: clamp_min_out_mps - func: clamp_min.Tensor_out(Tensor self, Tensor min, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: clamp_min_out + CPU, CUDA: clamp_min_Tensor_out + MPS: clamp_min_Tensor_out_mps # clip is an alias for clamp - func: clip(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor @@ -1360,23 +1408,28 @@ - func: conv_transpose3d.input(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int groups=1, int[3] dilation=1) -> Tensor +- func: copy(Tensor self, Tensor src, bool non_blocking=False) -> Tensor + variants: function + - func: copy_(Tensor(a!) 
self, Tensor src, bool non_blocking=False) -> Tensor(a!) variants: method device_check: NoCheck device_guard: False dispatch: MkldnnCPU: copy_mkldnn_ - SparseCPU, SparseCUDA, SparseHIP: copy_sparse_wrapper_ + SparseCPU, SparseCUDA: copy_sparse_wrapper_ CompositeExplicitAutograd: copy_ - SparseCsrCPU, SparseCsrCUDA: copy_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA: copy_sparse_compressed_ - func: _copy_from(Tensor self, Tensor dst, bool non_blocking=False) -> Tensor - dispatch: {} + dispatch: + MPS: _copy_from_mps # We need this to be able to properly copy from a CPU to an XLA tensor with different sizes. # See https://github.com/pytorch/xla/issues/2881 - func: _copy_from_and_resize(Tensor self, Tensor dst) -> Tensor - dispatch: {} + dispatch: + MPS: _copy_from_and_resize_mps - func: cos(Tensor self) -> Tensor device_check: NoCheck # TensorIterator @@ -1394,6 +1447,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: cos_out + MPS: cos_out_mps - func: cosh(Tensor self) -> Tensor device_check: NoCheck # TensorIterator @@ -1411,6 +1465,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: cosh_out + MPS: cosh_out_mps - func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor @@ -1457,6 +1512,14 @@ dispatch: CUDA: cudnn_convolution_transpose +- func: _mps_convolution_transpose(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups) -> Tensor + dispatch: + MPS: _mps_convolution_transpose + +- func: mps_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool[2] output_mask) -> (Tensor, Tensor) + dispatch: + MPS: mps_convolution_transpose_backward + - func: cudnn_convolution_relu(Tensor self, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor dispatch: CUDA: cudnn_convolution_relu @@ -1679,6 +1742,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: div_out + MPS: div_out_mps SparseCPU, SparseCUDA: div_out_sparse_zerodim - func: div.Tensor_mode(Tensor self, Tensor other, *, str? rounding_mode) -> Tensor @@ -1701,6 +1765,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: div_out_mode + MPS: div_out_mode_mps SparseCPU, SparseCUDA: div_out_sparse_zerodim # For C++ only, until we have conversion from C++ numbers to Tensor @@ -1780,6 +1845,7 @@ dispatch: CPU: dot CUDA: dot_cuda + MPS: dot_mps - func: dot.out(Tensor self, Tensor tensor, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -1800,6 +1866,7 @@ - func: embedding(Tensor weight, Tensor indices, int padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor dispatch: CompositeExplicitAutograd: embedding + NestedTensorCPU, NestedTensorCUDA: NestedTensor_embedding - func: embedding_backward(Tensor grad, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq, bool sparse) -> Tensor @@ -1807,6 +1874,7 @@ dispatch: CPU: embedding_dense_backward_cpu CUDA: embedding_dense_backward_cuda + MPS: embedding_dense_backward_mps - func: embedding_renorm_(Tensor(a!) self, Tensor indices, float max_norm, float norm_type) -> Tensor(a!) 
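_copy_from and _copy_from_and_resize previously had empty dispatch tables and existed only as hooks for out-of-tree backends; registering MPS kernels for them here is what lets ordinary cross-device copies work against the new "mps" device. A guarded sketch of the round trip they end up servicing (only runs on a macOS build where the backend is available):

```python
# Round-tripping a tensor between CPU and the MPS device exercises the
# _copy_from / _copy_from_and_resize kernels registered in this hunk.
import torch

if torch.backends.mps.is_available():        # only on an MPS-enabled build
    x = torch.randn(4, 4)
    y = x.to("mps")                          # CPU -> MPS copy
    z = y.cpu()                              # MPS -> CPU copy
    assert torch.equal(x, z)
else:
    print("MPS backend not available; skipping the device round trip")
```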
dispatch: @@ -1872,10 +1940,12 @@ dispatch: CPU: empty_cpu CUDA: empty_cuda + MPS: empty_mps Meta: empty_meta MkldnnCPU: empty_mkldnn SparseCPU, SparseCUDA: empty_sparse - SparseCsrCPU, SparseCsrCUDA: empty_sparse_csr + SparseCsrCPU, SparseCsrCUDA: empty_sparse_compressed + QuantizedCPU, QuantizedCUDA: empty_unknown_quantized # We do not make new_empty a composite that calls into new_empty_strided, as the strided version # is significantly more difficult to implement by different backends @@ -1920,9 +1990,19 @@ dispatch: CPU, Meta: resize_ CUDA: resize_cuda_ + MPS: resize_mps_ QuantizedCPU: quantized_resize_cpu_ SparseCsrCPU, SparseCsrCUDA: resize_sparse_csr_ +# This is a utility function to enable users to resize out tensor while registering kernels for out variants. +# Eventually, we can consider exposing `resize_output` as a public API to ship it with python op registration +# to make it easy to register out variants for ops. +- func: _resize_output_(Tensor(a!) self, int[] size, Device device) -> Tensor(a!) + use_const_ref_for_mutable_tensors: True + variants: function + dispatch: + Meta: _resize_output_ + - func: empty_quantized(int[] size, Tensor qtensor, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor category_override: factory variants: function @@ -1946,7 +2026,9 @@ dispatch: CPU: empty_strided_cpu CUDA: empty_strided_cuda + MPS: empty_strided_mps Meta: empty_strided_meta + QuantizedCPU, QuantizedCUDA: empty_strided_unknown_quantized - func: erf(Tensor self) -> Tensor device_check: NoCheck # TensorIterator @@ -1970,6 +2052,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: erf_out + MPS: erf_out_mps SparseCPU, SparseCUDA: erf_sparse_out SparseCsrCPU, SparseCsrCUDA: erf_sparse_csr_out @@ -2006,6 +2089,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: exp_out + MPS: exp_out_mps - func: exp2(Tensor self) -> Tensor structured_delegate: exp2.out @@ -2020,6 +2104,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: exp2_out + MPS: exp2_out_mps - func: expm1(Tensor self) -> Tensor device_check: NoCheck # TensorIterator @@ -2046,6 +2131,13 @@ SparseCPU, SparseCUDA: expm1_sparse_out SparseCsrCPU, SparseCsrCUDA: expm1_sparse_csr_out +- func: expand.SymInt(Tensor(a) self, SymInt[] size, *, bool implicit=False) -> Tensor(a) + variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: expand_symint + - func: expand(Tensor(a) self, int[] size, *, bool implicit=False) -> Tensor(a) variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. device_check: NoCheck @@ -2090,19 +2182,32 @@ - func: unflatten.Dimname(Tensor(a) self, Dimname dim, int[] sizes, Dimname[] names) -> Tensor(a) variants: method +- func: fill.Scalar(Tensor self, Scalar value) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: fill + +- func: fill.Tensor(Tensor self, Tensor value) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: fill + - func: fill_.Scalar(Tensor(a!) self, Scalar value) -> Tensor(a!) 
device_check: NoCheck # TensorIterator variants: function, method dispatch: CPU, CUDA: fill_ + MPS: fill_scalar_mps QuantizedCPU, QuantizedCUDA: fill_quantized_ Meta: fill_meta_ + SparseCsrCPU, SparseCsrCUDA: fill_sparse_csr_ - func: fill_.Tensor(Tensor(a!) self, Tensor value) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function, method dispatch: CPU, CUDA: fill_ + MPS: fill_tensor_mps_ QuantizedCPU, QuantizedCUDA: fill_quantized_ Meta: fill_meta_ @@ -2130,6 +2235,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: floor_out + MPS: floor_out_mps SparseCPU, SparseCUDA: floor_sparse_out SparseCsrCPU, SparseCsrCUDA: floor_sparse_csr_out @@ -2221,10 +2327,12 @@ variants: function, method # NOTE [ grid_sampler Native Functions ] -# `grid_sampler` does all the shape checking and then dispatches to one of -# `cudnn_grid_sampler`, `grid_sampler_2d`, or `grid_sampler_3d`, each of which -# has the corresponding backward defined as native functions as well. Therefore, -# in these functions and their backwards, no more shape checking is done. +# `grid_sampler` is _supposed to_ do all the shape checking and then dispatch to +# one of `cudnn_grid_sampler`, `grid_sampler_2d`, or `grid_sampler_3d`, each of +# which has the corresponding backward defined as native functions as well. +# However, we do shape checking everywhere for now since each of the mentioned +# functions can be called directly, which will lead to crashes otherwise. +# See https://github.com/pytorch/pytorch/issues/73187 for more information. # # There is also _grid_sampler_2d_backward_cpu_fallback which is an # implementation detail of grid_sampler_2d and is only exposed here for testing @@ -2262,7 +2370,10 @@ CPU: grid_sampler_3d_cpu CUDA: grid_sampler_3d_cuda -- func: grid_sampler_3d_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> (Tensor, Tensor) +# `grid_sampler_3d_backward` takes in `output_mask` to optimize performance for +# the case where `input` doesn't require gradient. Gradient for `grid` is always +# computed (only `output_mask[0]` is checked by the implementations). +- func: grid_sampler_3d_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, bool[2] output_mask) -> (Tensor, Tensor) dispatch: CPU: grid_sampler_3d_backward_cpu CUDA: grid_sampler_3d_backward_cuda @@ -2451,7 +2562,7 @@ device_check: NoCheck device_guard: False dispatch: - CPU, CUDA: isnan + CPU, CUDA, MPS: isnan SparseCPU, SparseCUDA: isnan_sparse SparseCsrCPU, SparseCsrCUDA: isnan_sparse_csr @@ -2549,11 +2660,6 @@ CUDA: layer_norm_cuda CompositeImplicitAutograd: math_native_layer_norm -- func: _native_multi_head_self_attention(Tensor query, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, int num_head, Tensor? mask=None) -> Tensor - dispatch: - CPU: multi_head_self_attention_cpu - CUDA: multi_head_self_attention_cuda - - func: native_layer_norm_backward(Tensor grad_out, Tensor input, int[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask) -> (Tensor, Tensor, Tensor) dispatch: CPU: layer_norm_backward_cpu @@ -2582,6 +2688,14 @@ - func: linear.out(Tensor input, Tensor weight, Tensor? bias=None, *, Tensor(a!) out) -> Tensor(a!) 
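grid_sampler_3d_backward now takes an output_mask so the kernel can skip the input gradient when only grid requires grad; per the comment in the hunk, the grid gradient is always computed and only output_mask[0] is consulted. An illustrative autograd setup for the case the mask is meant to optimize (the actual mask value is decided by the derivative formula, not by user code):

```python
# When only `grid` requires grad, the backward can skip the input gradient.
import torch
import torch.nn.functional as F

inp = torch.randn(1, 2, 4, 4, 4)                         # no grad needed
grid = (torch.rand(1, 3, 3, 3, 3) * 2 - 1).requires_grad_()

out = F.grid_sample(inp, grid, align_corners=False)      # 5-D -> grid_sampler_3d
out.sum().backward()

assert inp.grad is None and grid.grad is not None
```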
python_module: nn +# TODO: Add this function to MPS dispatch key so that we avoid declaring it in +# native_functions.yaml +# https://github.com/pytorch/pytorch/issues/77394 +- func: _mps_linear(Tensor self, Tensor weight, Tensor? bias=None) -> Tensor + python_module: nn + dispatch: + MPS: _mps_linear + - func: mkldnn_linear(Tensor self, Tensor weight, Tensor? bias=None) -> Tensor python_module: nn dispatch: @@ -2599,6 +2713,18 @@ dispatch: MkldnnCPU: mkldnn_linear_backward +- func: _mps_linear_backward_input(int[] input_size, Tensor grad_output, Tensor weight) -> Tensor + dispatch: + MPS: _mps_linear_backward_input + +- func: _mps_linear_backward_weights(Tensor grad_output, Tensor input, Tensor weight, bool bias_defined) -> (Tensor, Tensor) + dispatch: + MPS: _mps_linear_backward_weights + +- func: mps_linear_backward(Tensor self, Tensor grad_output, Tensor weight, bool[3] output_mask) -> (Tensor, Tensor, Tensor) + dispatch: + MPS: mps_linear_backward + - func: fbgemm_linear_int8_weight_fp32_activation(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor - func: fbgemm_linear_int8_weight(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor @@ -2646,6 +2772,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: log_out + MPS: log_out_mps - func: log10(Tensor self) -> Tensor device_check: NoCheck # TensorIterator @@ -2665,6 +2792,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: log10_out + MPS: log10_out_mps - func: log1p(Tensor self) -> Tensor device_check: NoCheck # TensorIterator @@ -2688,6 +2816,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: log1p_out + MPS: log1p_out_mps SparseCPU, SparseCUDA: log1p_sparse_out SparseCsrCPU, SparseCsrCUDA: log1p_sparse_csr_out @@ -2707,12 +2836,14 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: log2_out + MPS: log2_out_mps - func: logaddexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: logaddexp_out + MPS: logaddexp_out_mps - func: logaddexp(Tensor self, Tensor other) -> Tensor variants: method, function @@ -2725,6 +2856,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: logaddexp2_out + MPS: logaddexp2_out_mps - func: logaddexp2(Tensor self, Tensor other) -> Tensor variants: method, function @@ -2798,6 +2930,11 @@ - func: log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor variants: function, method +- func: log_softmax.int_out(Tensor self, int dim, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: log_softmax_out + - func: log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? 
dtype=None) -> Tensor variants: function, method @@ -2809,6 +2946,7 @@ dispatch: CPU: log_softmax_cpu_out CUDA: log_softmax_cuda_out + MPS: log_softmax_mps_out - func: _log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor structured_delegate: _log_softmax_backward_data.out @@ -2818,6 +2956,7 @@ dispatch: CPU: log_softmax_backward_cpu_out CUDA: log_softmax_backward_cuda_out + MPS: log_softmax_backward_mps_out - func: _logcumsumexp(Tensor self, int dim) -> Tensor dispatch: @@ -2929,6 +3068,7 @@ - dim -> int dim dispatch: CPU, CUDA: max_out + MPS: max_out_mps - func: max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) device_check: NoCheck # TensorIterator @@ -2944,10 +3084,10 @@ - func: amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor variants: function, method - dispatch: - CompositeExplicitAutograd: amax + structured_delegate: amax.out - func: amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + structured: True dispatch: CPU, CUDA: amax_out @@ -2958,6 +3098,17 @@ - func: max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor +# TODO: Add this function to MPS dispatch key so that we avoid declaring it in +# native_functions.yaml +# https://github.com/pytorch/pytorch/issues/77394 +- func: _mps_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor + dispatch: + MPS: _mps_max_pool2d + +- func: mps_max_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor + dispatch: + MPS: mps_max_pool2d_backward + - func: mkldnn_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor dispatch: MkldnnCPU: mkldnn_max_pool2d @@ -2981,6 +3132,7 @@ - func: quantized_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor dispatch: QuantizedCPU: quantized_max_pool2d + QuantizedCUDA: quantized_max_pool2d_cudnn - func: max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor @@ -3004,6 +3156,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: mean_out + MPS: mean_out_mps QuantizedCPU: mean_out_quantized_cpu - func: mean.names_dim(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor @@ -3076,6 +3229,7 @@ - dim -> int dim dispatch: CPU, CUDA: min_out + MPS: min_out_mps - func: min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) device_check: NoCheck # TensorIterator @@ -3086,13 +3240,24 @@ - func: amin(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor variants: function, method - dispatch: - CompositeExplicitAutograd: amin + structured_delegate: amin.out - func: amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + structured: True dispatch: CPU, CUDA: amin_out +# TODO: Add this function to MPS dispatch key so that we avoid declaring it in +# native_functions.yaml +# https://github.com/pytorch/pytorch/issues/77394 +- func: _mps_convolution(Tensor self, Tensor weight, Tensor? 
bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor + dispatch: + MPS: _mps_convolution + +- func: mps_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) + dispatch: + MPS: mps_convolution_backward + - func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor dispatch: CompositeExplicitAutograd: mkldnn_convolution @@ -3137,10 +3302,12 @@ dispatch: CPU: mm_out_cpu CUDA: mm_out_cuda + MPS: mm_out_mps SparseCPU, SparseCUDA: _sparse_mm_out SparseCsrCPU, SparseCsrCUDA: _sparse_csr_mm_out - func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor + python_module: sparse - func: _sparse_sparse_matmul(Tensor self, Tensor other) -> Tensor dispatch: @@ -3172,8 +3339,10 @@ variants: function, method dispatch: SparseCPU, SparseCUDA: mul_sparse + SparseCsrCPU, SparseCsrCUDA: mul_sparse_csr MkldnnCPU: mkldnn_mul ZeroTensor: mul_zerotensor + NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul_Tensor - func: mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) device_check: NoCheck # TensorIterator @@ -3181,7 +3350,9 @@ variants: method dispatch: SparseCPU, SparseCUDA: mul_sparse_ + SparseCsrCPU, SparseCsrCUDA: mul_sparse_csr_ MkldnnCPU: mkldnn_mul_ + NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul__Tensor - func: mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator @@ -3189,8 +3360,10 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: mul_out + MPS: mul_out_mps SparseCPU: mul_out_sparse_cpu SparseCUDA: mul_out_sparse_cuda + SparseCsrCPU, SparseCsrCUDA: mul_out_sparse_csr MkldnnCPU: mkldnn_mul_out # For C++ only, until we have conversion from C++ numbers to Tensor @@ -3199,12 +3372,14 @@ variants: function, method dispatch: CompositeExplicitAutograd: mul + SparseCsrCPU, SparseCsrCUDA: mul_scalar_sparse_csr - func: mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: CompositeExplicitAutograd: mul_ + SparseCsrCPU, SparseCsrCUDA: mul__scalar_sparse_csr # multiply, alias for mul - func: multiply.Tensor(Tensor self, Tensor other) -> Tensor @@ -3253,6 +3428,12 @@ CPU: narrow_copy_dense_cpu SparseCPU, SparseCUDA: narrow_copy_sparse CompositeExplicitAutograd: narrow_copy_dense + tags: view_copy + +- func: narrow_copy.SymInt(Tensor self, int dim, int start, SymInt length) -> Tensor + variants: function, method + dispatch: + CompositeExplicitAutograd: narrow_copy_symint - func: narrow_copy.out(Tensor self, int dim, int start, int length, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -3272,11 +3453,13 @@ dispatch: CPU: batch_norm_cpu CUDA: batch_norm_cuda + MPS: batch_norm_mps MkldnnCPU: mkldnn_batch_norm - func: native_batch_norm.out(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) 
save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!)) dispatch: CUDA: batch_norm_cuda_out + MPS: batch_norm_mps_out - func: batch_norm_stats(Tensor input, float eps) -> (Tensor, Tensor) dispatch: @@ -3303,6 +3486,7 @@ dispatch: CPU: batch_norm_backward_cpu CUDA: batch_norm_backward_cuda + MPS: batch_norm_backward_mps MkldnnCPU: mkldnn_batch_norm_backward - func: batch_norm_backward_reduce(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, bool input_g, bool weight_g, bool bias_g) -> (Tensor, Tensor, Tensor, Tensor) @@ -3370,6 +3554,7 @@ variants: function, method dispatch: CompositeExplicitAutograd: permute + MPS: permute_mps - func: movedim.intlist(Tensor(a) self, int[] source, int[] destination) -> Tensor(a) variants: function, method @@ -3410,8 +3595,14 @@ variants: function, method - func: pixel_shuffle(Tensor self, int upscale_factor) -> Tensor + dispatch: + CPU: pixel_shuffle_cpu + CompositeExplicitAutograd: math_pixel_shuffle - func: pixel_unshuffle(Tensor self, int downscale_factor) -> Tensor + dispatch: + CPU: pixel_unshuffle_cpu + CompositeExplicitAutograd: math_pixel_unshuffle - func: channel_shuffle(Tensor self, int groups) -> Tensor dispatch: @@ -3427,6 +3618,7 @@ variants: method dispatch: CUDA: is_pinned_cuda + MPS: is_pinned_mps CompositeExplicitAutograd: is_pinned_default # TODO: add a copy kwarg that guarantees that the tensor is put into fresh @@ -3438,6 +3630,7 @@ - func: _pin_memory(Tensor self, Device? device=None) -> Tensor dispatch: CUDA: _pin_memory_cuda + MPS: _pin_memory_mps - func: pinverse(Tensor self, float rcond=1e-15) -> Tensor variants: function, method @@ -3573,6 +3766,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: reciprocal_out + MPS: reciprocal_out_mps - func: neg(Tensor self) -> Tensor device_check: NoCheck # TensorIterator @@ -3596,6 +3790,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: neg_out + MPS: neg_out_mps SparseCPU, SparseCUDA: neg_out_sparse SparseCsrCPU, SparseCsrCUDA: neg_sparse_csr_out @@ -3612,6 +3807,7 @@ variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. dispatch: CompositeExplicitAutograd: repeat + MPS: repeat_mps - func: repeat_interleave.Tensor(Tensor repeats, *, int? output_size=None) -> Tensor variants: function @@ -3638,7 +3834,7 @@ device_check: NoCheck device_guard: False dispatch: - CPU, CUDA, Meta, QuantizedCPU, QuantizedCUDA, ZeroTensor: _reshape_alias + CPU, CUDA, Meta, QuantizedCPU, QuantizedCUDA, ZeroTensor, MPS: _reshape_alias # We don't need to support mkldnn since this is handled explicitly by the reshape operator. - func: _mkldnn_reshape(Tensor self, int[] shape) -> Tensor @@ -3675,6 +3871,7 @@ dispatch: CPU: round_out CUDA: round_out + MPS: round_out_mps SparseCPU, SparseCUDA: round_sparse_out SparseCsrCPU, SparseCsrCUDA: round_sparse_csr_out @@ -3707,16 +3904,20 @@ variants: function, method dispatch: CPU, CUDA: relu + MPS: relu_mps MkldnnCPU: mkldnn_relu QuantizedCPU: relu_quantized_cpu + NestedTensorCPU, NestedTensorCUDA: NestedTensor_relu - func: relu_(Tensor(a!) self) -> Tensor(a!) 
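pixel_shuffle and pixel_unshuffle gain dedicated CPU kernels above, with math_pixel_shuffle / math_pixel_unshuffle kept as the composite fallback for other backends. One standard way to spell that fallback decomposition (my own phrasing, checked against the built-in op, not copied from math_pixel_shuffle):

```python
# Reference decomposition of pixel_shuffle: reshape -> permute -> reshape.
import torch

def pixel_shuffle_ref(x, r):
    n, c, h, w = x.shape
    assert c % (r * r) == 0
    x = x.reshape(n, c // (r * r), r, r, h, w)
    x = x.permute(0, 1, 4, 2, 5, 3)            # (n, c', h, r, w, r)
    return x.reshape(n, c // (r * r), h * r, w * r)

x = torch.randn(2, 8, 3, 5)
assert torch.equal(pixel_shuffle_ref(x, 2), torch.pixel_shuffle(x, 2))
```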
device_check: NoCheck # TensorIterator variants: function, method dispatch: CPU, CUDA: relu_ + MPS: relu_mps_ MkldnnCPU: mkldnn_relu_ QuantizedCPU: relu_quantized_cpu_ + NestedTensorCPU, NestedTensorCUDA: NestedTensor_relu_ - func: relu6(Tensor self) -> Tensor python_module: nn @@ -3727,16 +3928,18 @@ - func: prelu(Tensor self, Tensor weight) -> Tensor variants: function, method dispatch: + MkldnnCPU: mkldnn_prelu CPU: prelu_cpu CUDA: prelu_cuda - func: prelu_backward(Tensor grad_output, Tensor self, Tensor weight) -> (Tensor, Tensor) variants: function, method dispatch: + MkldnnCPU: mkldnn_prelu_backward CPU: prelu_backward_cpu CUDA: prelu_backward_cuda -- func: gelu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +- func: gelu.out(Tensor self, *, str approximate='none', Tensor(a!) out) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator @@ -3744,24 +3947,34 @@ dispatch: CPU: gelu_out_cpu CUDA: gelu_out_cuda + MPS: gelu_out_mps + +- func: gelu_(Tensor(a!) self, *, str approximate='none') -> Tensor(a!) + structured_delegate: gelu.out + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + NestedTensorCPU, NestedTensorCUDA: NestedTensor_gelu_ -- func: gelu(Tensor self) -> Tensor +- func: gelu(Tensor self, *, str approximate='none') -> Tensor structured_delegate: gelu.out device_check: NoCheck # TensorIterator python_module: nn dispatch: MkldnnCPU: mkldnn_gelu QuantizedCPU: gelu_quantized_cpu + NestedTensorCPU, NestedTensorCUDA: NestedTensor_gelu -- func: gelu_backward.grad_input(Tensor grad, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) +- func: gelu_backward.grad_input(Tensor grad_output, Tensor self, *, str approximate='none', Tensor(a!) grad_input) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase python_module: nn dispatch: CPU: gelu_backward_out_cpu CUDA: gelu_backward_out_cuda + MPS: gelu_backward_out_mps -- func: gelu_backward(Tensor grad, Tensor self) -> Tensor +- func: gelu_backward(Tensor grad_output, Tensor self, *, str approximate='none') -> Tensor structured_delegate: gelu_backward.grad_input python_module: nn dispatch: @@ -3811,6 +4024,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: rsqrt_out + MPS: rsqrt_out_mps - func: select.Dimname(Tensor(a) self, Dimname dim, int index) -> Tensor(a) variants: function, method @@ -3823,6 +4037,7 @@ device_guard: False dispatch: CompositeExplicitAutograd: select + SparseCsrCPU, SparseCsrCUDA: select_sparse_csr - func: select_backward(Tensor grad_output, int[] input_sizes, int dim, int index) -> Tensor variants: function @@ -3865,6 +4080,7 @@ python_module: nn dispatch: CPU, CUDA: silu_out + MPS: silu_out_mps - func: silu_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) structured: True @@ -3872,6 +4088,7 @@ python_module: nn dispatch: CPU, CUDA: silu_backward_out + MPS: silu_backward_out_mps - func: silu_backward(Tensor grad_output, Tensor self) -> Tensor structured_delegate: silu_backward.grad_input @@ -3925,6 +4142,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: sigmoid_out + MPS: sigmoid_out_mps - func: logit(Tensor self, float? 
eps=None) -> Tensor variants: function, method @@ -3962,6 +4180,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: sin_out + MPS: sin_out_mps SparseCsrCPU, SparseCsrCUDA: sin_sparse_csr_out SparseCPU, SparseCUDA: sin_sparse_out @@ -4001,6 +4220,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: sinh_out + MPS: sinh_out_mps SparseCPU, SparseCUDA: sinh_sparse_out SparseCsrCPU, SparseCsrCUDA: sinh_sparse_csr_out @@ -4087,6 +4307,11 @@ - func: softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor variants: function, method +- func: softmax.int_out(Tensor self, int dim, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: softmax_out + - func: softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor variants: function, method @@ -4100,6 +4325,7 @@ dispatch: CPU: softmax_cpu_out CUDA: softmax_cuda_out + MPS: softmax_mps_out - func: _softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor structured_delegate: _softmax_backward_data.out @@ -4109,6 +4335,7 @@ dispatch: CPU: softmax_backward_cpu_out CUDA: softmax_backward_cuda_out + MPS: softmax_backward_mps_out - func: unsafe_split.Tensor(Tensor self, int split_size, int dim=0) -> Tensor[] variants: function, method @@ -4124,6 +4351,10 @@ dispatch: CompositeExplicitAutograd: split +- func: split.sizes(Tensor(a -> *) self, int[] split_size, int dim=0) -> Tensor(a)[] + variants: function, method + device_guard: False + - func: unsafe_split_with_sizes(Tensor self, int[] split_sizes, int dim=0) -> Tensor[] variants: function, method device_check: NoCheck @@ -4161,7 +4392,7 @@ device_check: NoCheck device_guard: False dispatch: - CPU, CUDA: squeeze + CompositeExplicitAutograd: squeeze QuantizedCPU, QuantizedCUDA: squeeze_quantized - func: squeeze.dim(Tensor(a) self, int dim) -> Tensor(a) @@ -4169,7 +4400,7 @@ device_check: NoCheck device_guard: False dispatch: - CPU, CUDA: squeeze + CompositeExplicitAutograd: squeeze QuantizedCPU, QuantizedCUDA: squeeze_quantized - func: squeeze.dimname(Tensor(a) self, Dimname dim) -> Tensor(a) @@ -4239,12 +4470,13 @@ - func: dstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) -# The signature is designed to be consistent with librosa except that it is -# missing the `pad_mode` and `center` arguments, which are taken care of at -# `torch.functional.py`. They shall be moved here once we have mapping between -# Python strings and C++ Enum in codegen. +# Overload without center & pad mode, needed for forward-compatibility - func: stft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor variants: function, method + cpp_no_default_args: ['hop_length', 'win_length', 'window', 'normalized'] + +- func: stft.center(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, str pad_mode="reflect", bool normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor + variants: function, method - func: istft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, bool normalized=False, bool? onesided=None, int? 
length=None, bool return_complex=False) -> Tensor variants: function, method @@ -4265,6 +4497,7 @@ variants: function, method dispatch: CompositeExplicitAutograd: sum + SparseCsrCPU, SparseCsrCUDA: sum_csr - func: sum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor structured_delegate: sum.IntList_out @@ -4280,21 +4513,17 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: sum_out + MPS: sum_out_mps - func: sum.DimnameList_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator -- func: nansum(Tensor self, *, ScalarType? dtype=None) -> Tensor +- func: nansum(Tensor self, int[1] dim=[], bool keepdim=False, *, ScalarType? dtype=None) -> Tensor variants: function, method dispatch: CPU, CUDA: nansum -- func: nansum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - variants: function, method - dispatch: - CPU, CUDA: nansum - -- func: nansum.IntList_out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) +- func: nansum.out(Tensor self, int[1] dim=[], bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) dispatch: CPU, CUDA: nansum_out @@ -4325,6 +4554,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: sqrt_out + MPS: sqrt_out_mps SparseCPU, SparseCUDA: sqrt_sparse_out SparseCsrCPU, SparseCsrCUDA: sqrt_sparse_csr_out @@ -4337,8 +4567,6 @@ variants: function, method - func: square.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: square_out - func: std(Tensor self, bool unbiased=True) -> Tensor device_check: NoCheck # TensorIterator @@ -4353,6 +4581,7 @@ variants: function, method dispatch: CPU, CUDA: std + MPS: std_mps - func: std_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) device_check: NoCheck # TensorIterator @@ -4404,6 +4633,7 @@ variants: function, method dispatch: CPU, CUDA: prod + MPS: prod_mps - func: prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor structured_delegate: prod.int_out @@ -4415,6 +4645,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: prod_out + MPS: prod_out_mps - func: prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor device_check: NoCheck # TensorIterator @@ -4460,6 +4691,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: tan_out + MPS: tan_out_mps SparseCPU, SparseCUDA: tan_sparse_out SparseCsrCPU, SparseCsrCUDA: tan_sparse_csr_out @@ -4488,6 +4720,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: tanh_out + MPS: tanh_out_mps SparseCPU, SparseCUDA: tanh_sparse_out SparseCsrCPU, SparseCsrCUDA: tanh_sparse_csr_out @@ -4518,12 +4751,14 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: threshold_out + MPS: threshold_out_mps - func: threshold_backward.grad_input(Tensor grad_output, Tensor self, Scalar threshold, *, Tensor(a!) grad_input) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: threshold_backward_out + MPS: threshold_backward_out_mps - func: threshold_backward(Tensor grad_output, Tensor self, Scalar threshold) -> Tensor variants: function @@ -4602,6 +4837,28 @@ - func: trapz.dx(Tensor y, *, float dx=1, int dim=-1) -> Tensor +# Fused implementation detail for transformers. Adds in-projection bias to QKV and divides Q by sqrt(D/num_heads). 
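The comment that follows describes _transform_bias_rescale_qkv as a fused transformer helper: add the in-projection bias to the packed QKV tensor and divide Q by sqrt(D / num_heads). Below is a rough eager-mode reading of that description; the (batch, seq, 3 * embed) layout and the head split are assumptions made for illustration, not the kernel's documented contract.

```python
# Rough eager-mode reading of the fused QKV transform described below:
# add the in-projection bias, then scale Q by 1/sqrt(head_dim).
# Shapes and the head split are illustrative assumptions.
import math
import torch

def transform_bias_rescale_qkv_ref(qkv, qkv_bias, num_heads):
    b, t, three_e = qkv.shape                  # assumed (batch, seq, 3 * embed)
    e = three_e // 3
    head_dim = e // num_heads
    q, k, v = (qkv + qkv_bias).split(e, dim=-1)
    q = q / math.sqrt(head_dim)                # divide Q by sqrt(D / num_heads)
    def heads(x):                              # (b, t, e) -> (b, heads, t, head_dim)
        return x.reshape(b, t, num_heads, head_dim).transpose(1, 2)
    return heads(q), heads(k), heads(v)

q, k, v = transform_bias_rescale_qkv_ref(torch.randn(2, 5, 3 * 8),
                                          torch.zeros(3 * 8), num_heads=2)
assert q.shape == (2, 2, 5, 4)
```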
+- func: _transform_bias_rescale_qkv(Tensor qkv, Tensor qkv_bias, int num_heads) -> (Tensor, Tensor, Tensor) + dispatch: + CPU, NestedTensorCPU: transform_bias_rescale_qkv_cpu + CUDA, NestedTensorCUDA: transform_bias_rescale_qkv_cuda + +- func: _nested_tensor_from_mask(Tensor t, Tensor mask) -> Tensor + dispatch: + CPU, CUDA: NestedTensor_nested_tensor_from_mask + +- func: _nested_from_padded(Tensor padded, Tensor cpu_nested_shape_example, bool fuse_transform_0213=False) -> Tensor + device_check: NoCheck # cpu_nested_shape_example will always be on CPU + dispatch: + CPU: nested_from_padded_generic + CUDA: nested_from_padded_cuda + +# _nested_from_padded is not usable from Python, so +# _nested_from_padded_and_nested_example is available for testing. +- func: _nested_from_padded_and_nested_example(Tensor padded, Tensor nt_example) -> Tensor + dispatch: + NestedTensorCPU, NestedTensorCUDA: NestedTensor_from_padded_and_nested_example + - func: _trilinear(Tensor i1, Tensor i2, Tensor i3, int[] expand1, int[] expand2, int[] expand3, int[] sumdim, int unroll_dim=1) -> Tensor dispatch: CompositeExplicitAutograd: _trilinear @@ -4632,6 +4889,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: trunc_out + MPS: trunc_out_mps SparseCPU, SparseCUDA: trunc_sparse_out SparseCsrCPU, SparseCsrCUDA: trunc_sparse_csr_out @@ -4693,7 +4951,7 @@ device_check: NoCheck device_guard: False dispatch: - CPU, CUDA: unsqueeze + CompositeExplicitAutograd: unsqueeze SparseCPU, SparseCUDA: unsqueeze_sparse QuantizedCPU, QuantizedCUDA: unsqueeze_quantized @@ -4720,6 +4978,7 @@ variants: function, method dispatch: CPU, CUDA: var + MPS: var_mps - func: var.out(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator @@ -4771,12 +5030,18 @@ device_check: NoCheck device_guard: False -# we define both of these because 'where' does the broadcast and '_s_where' doesn't; -# this allows us to implicitly calculate the broadcast derivative, while only dealing with the -# _s_where derivative. - func: where.self(Tensor condition, Tensor self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: function, method + dispatch: + CPU, CUDA: where + MPS: where_mps + +- func: where.self_out(Tensor condition, Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
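where.self now dispatches straight to CPU/CUDA/MPS kernels, and the _s_where helper that used to receive pre-broadcast inputs is deleted a little further below, so broadcasting is handled in one place. A small reminder of the broadcasting behaviour that kernel now has to cover on its own:

```python
# torch.where broadcasts condition, input and other against each other.
import torch

cond = torch.tensor([[True], [False]])        # (2, 1)
a = torch.arange(3.0)                         # (3,)
b = torch.full((2, 3), -1.0)                  # (2, 3)

out = torch.where(cond, a, b)                 # broadcast to (2, 3)
assert out.shape == (2, 3)
assert torch.equal(out[0], a) and torch.equal(out[1], b[1])
```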
+ device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: where_self_out + MPS: where_self_out_mps - func: where.ScalarSelf(Tensor condition, Scalar self, Tensor other) -> Tensor variants: function @@ -4791,11 +5056,6 @@ device_check: NoCheck # TensorIterator variants: function -- func: _s_where(Tensor condition, Tensor self, Tensor other) -> Tensor - variants: function - dispatch: - CPU, CUDA: _s_where - - func: norm_except_dim(Tensor v, int pow=2, int dim=0) -> Tensor variants: function @@ -4804,15 +5064,17 @@ - func: _weight_norm(Tensor v, Tensor g, int dim=0) -> Tensor variants: function -- func: _weight_norm_cuda_interface(Tensor v, Tensor g, int dim=0) -> (Tensor, Tensor) +- func: _weight_norm_interface(Tensor v, Tensor g, int dim=0) -> (Tensor, Tensor) variants: function dispatch: + CPU: weight_norm_cpu CUDA: weight_norm_cuda -- func: _weight_norm_cuda_interface_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor) +- func: _weight_norm_interface_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor) variants: function dispatch: - CUDA: weight_norm_cuda_backward + CPU: weight_norm_backward_cpu + CUDA: weight_norm_backward_cuda - func: _weight_norm_differentiable_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor) variants: function @@ -4894,6 +5156,16 @@ SparseCPU: _sparse_sum_backward_cpu SparseCUDA: _sparse_sum_backward_cuda +- func: _sparse_csr_sum.dim_dtype(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + dispatch: + SparseCsrCPU: _sparse_csr_sum_cpu + SparseCsrCUDA: _sparse_csr_sum_cuda + +- func: _sparse_csr_prod.dim_dtype(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + dispatch: + SparseCsrCPU: _sparse_csr_prod_cpu + SparseCsrCUDA: _sparse_csr_prod_cuda + - func: _sparse_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor python_module: sparse variants: function @@ -4969,6 +5241,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: norm_out + MPS: norm_out_mps # These four redispatch in their implementation, so OK to be CompositeImplicitAutograd - func: norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor @@ -4994,24 +5267,31 @@ dispatch: CPU, CUDA: frexp_out +# Deprecated (v.1.12) - func: frobenius_norm(Tensor self) -> Tensor variants: function +# Deprecated (v.1.12) - func: frobenius_norm.dim(Tensor self, int[1] dim, bool keepdim=False) -> Tensor variants: function +# Deprecated (v.1.12) - func: frobenius_norm.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) variants: function +# Deprecated (v.1.12) - func: nuclear_norm(Tensor self, bool keepdim=False) -> Tensor variants: function +# Deprecated (v.1.12) - func: nuclear_norm.out(Tensor self, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) variants: function +# Deprecated (v.1.12) - func: nuclear_norm.dim(Tensor self, int[2] dim, bool keepdim=False) -> Tensor variants: function +# Deprecated (v.1.12) - func: nuclear_norm.dim_out(Tensor self, int[2] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) 
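_weight_norm_cuda_interface and its backward are renamed to _weight_norm_interface above and pick up CPU kernels. The op behind them is the standard weight-norm reparametrization, w = g * v / ||v||, with the norm taken over every dimension except dim; a quick sketch of that identity against the public torch._weight_norm entry point (shapes chosen purely for illustration):

```python
# Weight-norm reparametrization: w = g * v / ||v||, norm over all dims but `dim`.
import torch

v = torch.randn(4, 3)
g = torch.rand(4, 1) + 0.5
dim = 0

norm = v.norm(p=2, dim=1, keepdim=True)       # norm over every dim except 0
w_ref = g * v / norm
assert torch.allclose(w_ref, torch._weight_norm(v, g, dim))
```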
variants: function @@ -5020,7 +5300,7 @@ dispatch: CompositeExplicitAutograd: clone SparseCPU, SparseCUDA: clone_sparse - SparseCsrCPU, SparseCsrCUDA: clone_sparse_csr + SparseCsrCPU, SparseCsrCUDA: clone_sparse_compressed MkldnnCPU: mkldnn_clone QuantizedCPU, QuantizedCUDA: quantized_clone @@ -5035,7 +5315,7 @@ - func: resize_as_sparse_(Tensor(a!) self, Tensor the_template) -> Tensor(a!) use_const_ref_for_mutable_tensors: True - variants: function + variants: function, method dispatch: SparseCPU, SparseCUDA: resize_as_sparse_ SparseCsrCPU, SparseCsrCUDA: resize_as_sparse_csr_ @@ -5045,8 +5325,10 @@ variants: method, function dispatch: CPU, CUDA: zero_ + MPS: zero_mps_ Meta: zero_meta_ SparseCPU, SparseCUDA: zero_sparse_ + SparseCsrCPU, SparseCsrCUDA: zero_sparse_csr_ MkldnnCPU: mkldnn_zero_ - func: sub.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) @@ -5055,6 +5337,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: sub_out + MPS: sub_out_mps SparseCPU, SparseCUDA: sub_out_sparse - func: sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor @@ -5063,6 +5346,7 @@ structured_delegate: sub.out dispatch: SparseCPU, SparseCUDA: sub_sparse + ZeroTensor: sub_zerotensor - func: sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) device_check: NoCheck # TensorIterator @@ -5132,7 +5416,7 @@ # Functionally the same as addmm, but we give it a different derivative formula # that doesn't propagate gradients to non-present entries on sparse. -- func: _sparse_addmm(Tensor self, Tensor sparse, Tensor dense, *, Scalar beta=1, Scalar alpha=1) -> Tensor +- func: _sparse_addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor python_module: sparse dispatch: CompositeExplicitAutograd: _sparse_addmm @@ -5141,17 +5425,20 @@ python_module: sparse dispatch: SparseCsrCUDA: sparse_sampled_addmm_out_sparse_csr_cuda + SparseCsrCPU: sparse_sampled_addmm_out_sparse_csr_cpu - func: sparse_sampled_addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor python_module: sparse dispatch: SparseCsrCUDA: sparse_sampled_addmm_sparse_csr_cuda + SparseCsrCPU: sparse_sampled_addmm_sparse_csr_cpu - func: addmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) structured: True dispatch: CPU: addmm_out_cpu CUDA: addmm_out_cuda + MPS: addmm_out_mps SparseCPU: addmm_out_sparse_dense_cpu SparseCUDA: addmm_out_sparse_dense_cuda SparseCsrCPU: addmm_out_sparse_csr_cpu @@ -5174,6 +5461,16 @@ SparseCPU: s_addmm_sparse_dense_cpu_ SparseCUDA: s_addmm_sparse_dense_cuda_ +- func: _addmm_activation.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, bool use_gelu=False, Tensor(a!) out) -> Tensor(a!) + structured: True + dispatch: + CPU: addmm_activation_out_cpu + CUDA: addmm_activation_out_cuda + +- func: _addmm_activation(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, bool use_gelu=False) -> Tensor + structured_delegate: _addmm_activation.out + variants: function, method + # NOTE [ Sparse: autograd and API ] # # @@ -5285,11 +5582,23 @@ # FIXME: would be nicer if TensorOptions was optional based; not adding default arguments for options given # the default would never make sense. +- func: sparse_compressed_tensor.comp_plain_value_size(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
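The sparse_sampled_addmm entries above gain a CPU kernel alongside the existing CUDA one. The public wrapper, torch.sparse.sampled_addmm, evaluates mat1 @ mat2 only at the sparsity pattern of a CSR input; a rough sketch (not taken from this diff):

    import torch
    pattern = torch.eye(3).to_sparse_csr()        # CSR tensor supplying the sparsity pattern
    mat1 = torch.randn(3, 4)
    mat2 = torch.randn(4, 3)
    out = torch.sparse.sampled_addmm(pattern, mat1, mat2)   # result is sparse CSR
    out.to_dense()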
pin_memory=False) -> Tensor - func: sparse_csr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_csc_tensor.ccol_row_value_size(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_bsr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_bsc_tensor.ccol_row_value_size(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_compressed_tensor.comp_plain_value(Tensor compressed_indices, Tensor plain_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor - func: sparse_csr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_csc_tensor.ccol_row_value(Tensor ccol_indices, Tensor row_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_bsr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_bsc_tensor.ccol_row_value(Tensor ccol_indices, Tensor row_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: _sparse_compressed_tensor_unsafe(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - func: _sparse_csr_tensor_unsafe(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +- func: _sparse_csc_tensor_unsafe(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +- func: _sparse_bsr_tensor_unsafe(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +- func: _sparse_bsc_tensor_unsafe(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - func: sparse_coo_tensor.size(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=False) -> Tensor @@ -5301,7 +5610,11 @@ - func: _validate_sparse_coo_tensor_args(Tensor indices, Tensor values, int[] size) -> () +- func: _validate_sparse_compressed_tensor_args(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, Layout layout) -> () - func: _validate_sparse_csr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size) -> () +- func: _validate_sparse_csc_tensor_args(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size) -> () +- func: _validate_sparse_bsr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size) -> () +- func: _validate_sparse_bsc_tensor_args(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size) -> () - func: _sparse_coo_tensor_with_dims(int sparse_dim, int dense_dim, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor dispatch: @@ -5328,14 +5641,20 @@ dispatch: SparseCPU: sparse_mask_cpu SparseCUDA: sparse_mask_cuda + SparseCsrCPU, SparseCsrCUDA: sparse_mask_sparse_csr - func: _to_cpu(Tensor[] tensors) -> Tensor[] variants: function - func: to_dense(Tensor self, ScalarType? dtype=None) -> Tensor variants: method + +# Special case of to_dense with custom derivative +- func: _to_dense(Tensor self, ScalarType? dtype=None) -> Tensor + variants: method dispatch: - SparseCPU, SparseCUDA, SparseCsrCPU, SparseCsrCUDA: sparse_to_dense + SparseCPU, SparseCUDA: sparse_to_dense + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_dense MkldnnCPU: mkldnn_to_dense - func: to_dense_backward(Tensor grad, Tensor input) -> Tensor @@ -5451,6 +5770,20 @@ device_check: NoCheck device_guard: False +- func: ccol_indices(Tensor(a) self) -> Tensor(a) + variants: method + dispatch: + SparseCsrCPU, SparseCsrCUDA: ccol_indices_sparse_csr + device_check: NoCheck + device_guard: False + +- func: row_indices(Tensor(a) self) -> Tensor(a) + variants: method + dispatch: + SparseCsrCPU, SparseCsrCUDA: row_indices_sparse_csr + device_check: NoCheck + device_guard: False + - func: hspmm.out(Tensor mat1, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) 
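Together with the CSC/BSR/BSC constructors and the ccol_indices/row_indices accessors registered above, the compressed-sparse layouts can be built and inspected from Python roughly as follows (a sketch using the public sparse_csc_tensor / to_dense API):

    import torch
    ccol = torch.tensor([0, 2, 3])        # column pointers
    row = torch.tensor([0, 1, 1])         # row index of each stored value
    vals = torch.tensor([1., 2., 3.])
    a = torch.sparse_csc_tensor(ccol, row, vals, (2, 2))
    a.ccol_indices(), a.row_indices(), a.values()
    a.to_dense()                          # tensor([[1., 0.], [2., 3.]])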
dispatch: SparseCPU: hspmm_out_sparse_cpu @@ -5471,6 +5804,7 @@ variants: function, method dispatch: CompositeExplicitAutograd: unbind + NestedTensorCPU, NestedTensorCUDA: NestedTensor_unbind - func: unbind.Dimname(Tensor(a -> *) self, Dimname dim) -> Tensor(a)[] variants: function, method @@ -5479,11 +5813,41 @@ variants: method dispatch: CPU, CUDA: dense_to_sparse + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse - func: to_sparse(Tensor self) -> Tensor variants: method dispatch: CPU, CUDA: dense_to_sparse + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse + +- func: to_sparse_csr(Tensor self) -> Tensor + variants: method + dispatch: + CPU, CUDA: dense_to_sparse_csr + SparseCPU, SparseCUDA: coo_to_sparse_csr + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_csr + +- func: to_sparse_csc(Tensor self) -> Tensor + variants: method + dispatch: + CPU, CUDA: dense_to_sparse_csc + SparseCPU, SparseCUDA: coo_to_sparse_csc + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_csc + +- func: to_sparse_bsr(Tensor self, int[2] blocksize) -> Tensor + variants: method + dispatch: + CPU, CUDA: dense_to_sparse_bsr + SparseCPU, SparseCUDA: coo_to_sparse_bsr + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_bsr + +- func: to_sparse_bsc(Tensor self, int[2] blocksize) -> Tensor + variants: method + dispatch: + CPU, CUDA: dense_to_sparse_bsc + SparseCPU, SparseCUDA: coo_to_sparse_bsc + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_bsc - func: to_mkldnn(Tensor self, ScalarType? dtype=None) -> Tensor variants: method @@ -5729,16 +6093,33 @@ dispatch: CPU: _local_scalar_dense_cpu CUDA: _local_scalar_dense_cuda + MPS: _local_scalar_dense_mps variants: function +# MPS LSTM implementation + +- func: _lstm_mps(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + dispatch: + MPS: _lstm_mps + +- func: lstm_mps_backward(Tensor grad_y, Tensor? grad_hy, Tensor? grad_cy, Tensor z_state, Tensor cell_state_fwd, Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor[], Tensor[]) + dispatch: + MPS: lstm_mps_backward + + # Fused RNN kernels - func: _thnn_fused_lstm_cell(Tensor input_gates, Tensor hidden_gates, Tensor cx, Tensor? input_bias=None, Tensor? hidden_bias=None) -> (Tensor, Tensor, Tensor) dispatch: CUDA: _thnn_fused_lstm_cell_cuda -- func: _thnn_fused_lstm_cell_backward(Tensor? grad_hy, Tensor? grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor) +# NB: The composite version of this function below is a simple wrapper that duplicates some of the outputs +# It is necessary to avoid triggering TensorImpl use count checks in debug mode +# NB: this is function is NOT differentiable +- func: _thnn_fused_lstm_cell_backward_impl(Tensor? grad_hy, Tensor? grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor) dispatch: - CUDA: _thnn_fused_lstm_cell_backward_cuda + CUDA: _thnn_fused_lstm_cell_backward_impl_cuda + +- func: _thnn_fused_lstm_cell_backward(Tensor? grad_hy, Tensor? grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor) - func: _thnn_differentiable_lstm_cell_backward(Tensor? grad_hy, Tensor? grad_cy, Tensor input_gates, Tensor hidden_gates, Tensor? input_bias, Tensor? 
hidden_bias, Tensor cx, Tensor cy) -> (Tensor, Tensor, Tensor, Tensor, Tensor) @@ -5819,36 +6200,51 @@ device_check: NoCheck device_guard: False dispatch: - CPU, CUDA: set_ + CPU, CUDA, Meta, MPS: set_ - func: set_.source_Storage_storage_offset(Tensor(a!) self, Storage source, int storage_offset, int[] size, int[] stride=[]) -> Tensor(a!) variants: method device_check: NoCheck device_guard: False dispatch: - CPU: set_storage_cpu_ + CPU, Meta: set_storage_cpu_ CUDA: set_storage_cuda_ + MPS: set_storage_mps_ QuantizedCPU, QuantizedCUDA: set_storage_quantized_ +- func: set_.source_Tensor_storage_offset(Tensor(a!) self, Tensor source, int storage_offset, int[] size, int[] stride=[]) -> Tensor(a!) + variants: method + device_check: NoCheck + device_guard: False + - func: set_.source_Tensor(Tensor(a!) self, Tensor source) -> Tensor(a!) variants: method device_check: NoCheck device_guard: False dispatch: - CPU, CUDA: set_tensor_ + CPU, CUDA, Meta, MPS: set_tensor_ - func: set_(Tensor(a!) self) -> Tensor(a!) variants: method dispatch: CPU: set_cpu_ CUDA: set_cuda_ + Meta: set_meta_ + MPS: set_mps_ + +- func: lift(Tensor self) -> Tensor + variants: method + dispatch: + # Not making it CompositeImplicitAutograd because lift + # should be a primitive w.r.t. functorch + CompositeExplicitAutograd: lift - func: is_set_to(Tensor self, Tensor tensor) -> bool variants: method device_check: NoCheck device_guard: False dispatch: - CPU, CUDA: is_set_to + CPU, CUDA, MPS: is_set_to - func: masked_fill_.Scalar(Tensor(a!) self, Tensor mask, Scalar value) -> Tensor(a!) device_check: NoCheck # TensorIterator @@ -5856,6 +6252,7 @@ dispatch: CPU: masked_fill__cpu CUDA: masked_fill__cuda + MPS: masked_fill__mps - func: masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor device_check: NoCheck # TensorIterator @@ -5869,6 +6266,7 @@ dispatch: CPU: masked_fill__cpu CUDA: masked_fill__cuda + MPS: masked_fill__mps - func: masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor device_check: NoCheck # TensorIterator @@ -5887,17 +6285,22 @@ dispatch: CompositeExplicitAutograd: masked_scatter -- func: _masked_softmax(Tensor self, Tensor mask) -> Tensor +- func: _masked_softmax(Tensor self, Tensor mask, int? dim=None) -> Tensor dispatch: CUDA: masked_softmax_cuda CPU: masked_softmax_cpu +- func: _masked_softmax_backward(Tensor grad_output, Tensor output, Tensor mask, int? dim=None) -> Tensor + dispatch: + CUDA: masked_softmax_backward_cuda + CPU: masked_softmax_backward_cpu + - func: view(Tensor(a) self, int[] size) -> Tensor(a) variants: method device_check: NoCheck device_guard: False dispatch: - ZeroTensor, CPU, CUDA, Meta, QuantizedCPU, QuantizedCUDA: view + ZeroTensor, CPU, CUDA, Meta, QuantizedCPU, QuantizedCUDA, MPS: view MkldnnCPU: mkldnn_view # Warning: If you want to change the name or overload name of this @@ -5916,7 +6319,7 @@ - func: put_(Tensor(a!) self, Tensor index, Tensor source, bool accumulate=False) -> Tensor(a!) variants: method dispatch: - CPU, CUDA: put_ + CPU, CUDA, MPS: put_ - func: put(Tensor self, Tensor index, Tensor source, bool accumulate=False) -> Tensor variants: function, method @@ -5941,6 +6344,23 @@ - func: index_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor source, *, Scalar alpha=1) -> Tensor variants: function, method +- func: index_reduce.out(Tensor self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True, Tensor(a!) out) -> Tensor(a!) 
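The _masked_softmax entries above grow an optional dim argument and get an explicit backward. The private op backs a fused masked softmax; the equivalent pattern with public ops is roughly:

    import torch
    scores = torch.randn(2, 4, 4)
    mask = torch.zeros(2, 4, 4, dtype=torch.bool)
    mask[..., 2:] = True                                  # positions to ignore
    attn = scores.masked_fill(mask, float("-inf")).softmax(dim=-1)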
+ structured: True + variants: function + precomputed: + - dim -> int dim + dispatch: + CPU: index_reduce_cpu_out + CUDA: index_reduce_cuda_out + +- func: index_reduce_(Tensor(a!) self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True) -> Tensor(a!) + structured_delegate: index_reduce.out + variants: method + +- func: index_reduce(Tensor self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True) -> Tensor + structured_delegate: index_reduce.out + variants: function, method + - func: index_fill_.int_Scalar(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method @@ -5995,6 +6415,7 @@ variants: function dispatch: CPU, CUDA: scatter_src_out + MPS: scatter_src_out_mps - func: scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> Tensor structured_delegate: scatter.value_out @@ -6009,6 +6430,7 @@ variants: function dispatch: CPU, CUDA: scatter_value_out + MPS: scatter_value_out_mps - func: scatter.reduce(Tensor self, int dim, Tensor index, Tensor src, *, str reduce) -> Tensor structured_delegate: scatter.reduce_out @@ -6023,6 +6445,7 @@ variants: function dispatch: CPU, CUDA: scatter_reduce_out + MPS: scatter_reduce_out_mps - func: scatter.value_reduce(Tensor self, int dim, Tensor index, Scalar value, *, str reduce) -> Tensor structured_delegate: scatter.value_reduce_out @@ -6037,6 +6460,7 @@ variants: function dispatch: CPU, CUDA: scatter_value_reduce_out + MPS: scatter_value_reduce_out_mps - func: scatter.dimname_src(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor variants: function, method @@ -6057,14 +6481,24 @@ variants: function dispatch: CPU, CUDA: scatter_add + MPS: scatter_add_mps_out - func: scatter_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor variants: function, method -- func: scatter_reduce.two(Tensor self, int dim, Tensor index, str reduce, *, int? output_size=None) -> Tensor +- func: scatter_reduce.two(Tensor self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True) -> Tensor + structured_delegate: scatter_reduce.two_out variants: function, method + +- func: scatter_reduce_.two(Tensor(a!) self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True) -> Tensor(a!) + structured_delegate: scatter_reduce.two_out + variants: method + +- func: scatter_reduce.two_out(Tensor self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True, Tensor(a!) out) -> Tensor(a!) + structured: True + variants: function dispatch: - CPU: scatter_reduce_two_cpu + CPU, CUDA: scatter_reduce_two - func: eq_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) 
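The scatter_reduce.two overload above now takes an explicit src tensor plus an include_self flag (and becomes a structured kernel), and index_reduce is added with the same flag. A usage sketch of the corresponding public methods:

    import torch
    x = torch.zeros(3)
    idx = torch.tensor([0, 1, 1, 2])
    src = torch.tensor([1., 2., 3., 4.])
    x.scatter_reduce_(0, idx, src, reduce="sum", include_self=True)   # tensor([1., 5., 4.])

    y = torch.ones(3, 2)
    y.index_reduce_(0, torch.tensor([0, 0, 1]),
                    torch.arange(6.).reshape(3, 2), "amax", include_self=False)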
structured_delegate: eq.Scalar_out @@ -6100,6 +6534,12 @@ dispatch: CompositeExplicitAutograd: bitwise_and +- func: bitwise_and.Scalar_Tensor(Scalar self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: bitwise_and + - func: bitwise_and.Tensor(Tensor self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function @@ -6148,6 +6588,12 @@ device_check: NoCheck # TensorIterator variants: method, function +- func: bitwise_or.Scalar_Tensor(Scalar self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: bitwise_or + - func: bitwise_or.Tensor(Tensor self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function @@ -6196,6 +6642,12 @@ device_check: NoCheck # TensorIterator variants: method, function +- func: bitwise_xor.Scalar_Tensor(Scalar self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: bitwise_xor + - func: bitwise_xor.Tensor(Tensor self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function @@ -6271,25 +6723,25 @@ device_check: NoCheck # TensorIterator variants: method, function dispatch: - CPU, CUDA: bitwise_left_shift + CompositeExplicitAutograd: bitwise_left_shift - func: bitwise_left_shift_.Tensor_Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: - CPU, CUDA: bitwise_left_shift_ + CompositeExplicitAutograd: bitwise_left_shift_ - func: bitwise_left_shift.Tensor_Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function dispatch: - CPU, CUDA: bitwise_left_shift_out + CompositeExplicitAutograd: bitwise_left_shift_out - func: bitwise_left_shift.Scalar_Tensor(Scalar self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: function dispatch: - CPU, CUDA: bitwise_left_shift + CompositeExplicitAutograd: bitwise_left_shift - func: __rshift__.Scalar(Tensor self, Scalar other) -> Tensor device_check: NoCheck # TensorIterator @@ -6336,25 +6788,25 @@ device_check: NoCheck # TensorIterator variants: method, function dispatch: - CPU, CUDA: bitwise_right_shift + CompositeExplicitAutograd: bitwise_right_shift - func: bitwise_right_shift_.Tensor_Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: - CPU, CUDA: bitwise_right_shift_ + CompositeExplicitAutograd: bitwise_right_shift_ - func: bitwise_right_shift.Tensor_Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function dispatch: - CPU, CUDA: bitwise_right_shift_out + CompositeExplicitAutograd: bitwise_right_shift_out - func: bitwise_right_shift.Scalar_Tensor(Scalar self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: function dispatch: - CPU, CUDA: bitwise_right_shift + CompositeExplicitAutograd: bitwise_right_shift - func: tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!) structured_delegate: tril.out @@ -6383,15 +6835,18 @@ variants: method dispatch: CPU, CUDA: addbmm_ + MPS: addbmm_mps_ - func: addbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) 
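The bitwise shift entries above move from per-backend CPU/CUDA kernels to CompositeExplicitAutograd, and Scalar_Tensor overloads are added for bitwise_and/or/xor. The Python-visible behaviour is unchanged; for reference:

    import torch
    a = torch.tensor([1, 2, 4], dtype=torch.int32)
    torch.bitwise_left_shift(a, 1)    # tensor([2, 4, 8], dtype=torch.int32)
    a << 1                            # operator form of the same call
    torch.bitwise_right_shift(a, 2)   # tensor([0, 0, 1], dtype=torch.int32)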
dispatch: CPU, CUDA: addbmm_out + MPS: addbmm_out_mps - func: addbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor variants: method, function dispatch: CPU, CUDA: addbmm + MPS: addbmm_mps - func: random_.from(Tensor(a!) self, int from, int? to, *, Generator? generator=None) -> Tensor(a!) device_check: NoCheck # TensorIterator @@ -6399,6 +6854,7 @@ dispatch: CPU, CUDA: random_ Meta: random_meta_ + MPS: random_mps_ - func: random_.to(Tensor(a!) self, int to, *, Generator? generator=None) -> Tensor(a!) device_check: NoCheck # TensorIterator @@ -6406,6 +6862,7 @@ dispatch: CPU, CUDA: random_ Meta: random_meta_ + MPS: random_mps_ - func: random_(Tensor(a!) self, *, Generator? generator=None) -> Tensor(a!) device_check: NoCheck # TensorIterator @@ -6419,6 +6876,7 @@ variants: method dispatch: CPU, CUDA: uniform_ + MPS: uniform_mps_ Meta: uniform_meta_ - func: cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!) @@ -6451,6 +6909,7 @@ dispatch: CPU: diag_cpu_out CUDA: diag_cuda_out + MPS: diag_mps_out - func: diag(Tensor self, int diagonal=0) -> Tensor variants: method, function @@ -6472,6 +6931,7 @@ dispatch: CPU: triu_cpu CUDA: triu_cuda + MPS: triu_mps_out - func: triu(Tensor self, int diagonal=0) -> Tensor structured_delegate: triu.out @@ -6482,6 +6942,7 @@ dispatch: CPU: tril_cpu CUDA: tril_cuda + MPS: tril_mps_out - func: tril(Tensor self, int diagonal=0) -> Tensor structured_delegate: tril.out @@ -6514,6 +6975,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: ne_Scalar_out + MPS: ne_scalar_out_mps QuantizedCPU: ne_out_quantized_cpu - func: ne.Scalar(Tensor self, Scalar other) -> Tensor @@ -6529,6 +6991,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: ne_Tensor_out + MPS: ne_tensor_out_mps QuantizedCPU: ne_out_quantized_cpu - func: ne.Tensor(Tensor self, Tensor other) -> Tensor @@ -6575,6 +7038,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: eq_Scalar_out + MPS: eq_scalar_out_mps QuantizedCPU: eq_out_quantized_cpu - func: eq.Scalar(Tensor self, Scalar other) -> Tensor @@ -6590,6 +7054,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: eq_Tensor_out + MPS: eq_tensor_out_mps QuantizedCPU: eq_out_quantized_cpu - func: eq.Tensor(Tensor self, Tensor other) -> Tensor @@ -6605,6 +7070,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: ge_Scalar_out + MPS: ge_scalar_out_mps QuantizedCPU: ge_out_quantized_cpu - func: ge.Scalar(Tensor self, Scalar other) -> Tensor @@ -6620,6 +7086,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: ge_Tensor_out + MPS: ge_tensor_out_mps QuantizedCPU: ge_out_quantized_cpu - func: ge.Tensor(Tensor self, Tensor other) -> Tensor @@ -6666,6 +7133,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: le_Scalar_out + MPS: le_scalar_out_mps QuantizedCPU: le_out_quantized_cpu - func: le.Scalar(Tensor self, Scalar other) -> Tensor @@ -6681,6 +7149,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: le_Tensor_out + MPS: le_tensor_out_mps QuantizedCPU: le_out_quantized_cpu - func: le.Tensor(Tensor self, Tensor other) -> Tensor @@ -6727,6 +7196,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: gt_Scalar_out + MPS: gt_scalar_out_mps QuantizedCPU: gt_out_quantized_cpu - func: gt.Scalar(Tensor self, Scalar other) -> Tensor @@ -6742,6 +7212,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: gt_Tensor_out + MPS: gt_tensor_out_mps QuantizedCPU: 
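Most of the additions in this stretch register MPS kernels for the elementwise comparison ops (ne/eq/ge/le/gt/lt). On an MPS-capable macOS build they are exercised simply by placing tensors on the "mps" device (sketch):

    import torch
    if torch.backends.mps.is_available():
        x = torch.rand(4, device="mps")
        y = torch.rand(4, device="mps")
        x > y, torch.eq(x, y), torch.le(x, y)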
gt_out_quantized_cpu - func: gt.Tensor(Tensor self, Tensor other) -> Tensor @@ -6788,6 +7259,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: lt_Scalar_out + MPS: lt_scalar_out_mps QuantizedCPU: lt_out_quantized_cpu - func: lt.Scalar(Tensor self, Scalar other) -> Tensor @@ -6803,6 +7275,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: lt_Tensor_out + MPS: lt_tensor_out_mps QuantizedCPU: lt_out_quantized_cpu - func: lt.Tensor(Tensor self, Tensor other) -> Tensor @@ -6861,15 +7334,18 @@ dispatch: CPU, QuantizedCPU: index_select_out_cpu_ CUDA, QuantizedCUDA: index_select_out_cuda + MPS: index_select_out_mps - func: index_select(Tensor self, int dim, Tensor index) -> Tensor variants: method, function dispatch: CPU: index_select_cpu_ QuantizedCPU: index_select_quantized_cpu_ - CUDA, QuantizedCUDA: index_select_cuda - SparseCPU: index_select_sparse - SparseCUDA: index_select_sparse + CUDA: index_select_cuda + QuantizedCUDA: index_select_quantized_cuda + SparseCPU: index_select_sparse_cpu + SparseCUDA: index_select_sparse_cuda + MPS: index_select_mps - func: index_select.dimname_out(Tensor self, Dimname dim, Tensor index, *, Tensor(a!) out) -> Tensor(a!) @@ -6918,6 +7394,7 @@ structured: True dispatch: CPU, CUDA: gather_out + MPS: gather_out_mps - func: gather(Tensor self, int dim, Tensor index, *, bool sparse_grad=False) -> Tensor variants: method, function @@ -6941,6 +7418,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: addcmul_out + MPS: addcmul_out_mps - func: addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor structured_delegate: addcmul.out @@ -6958,6 +7436,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: addcdiv_out + MPS: addcdiv_out_mps - func: addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor structured_delegate: addcdiv.out @@ -7005,10 +7484,13 @@ - func: linalg_solve_triangular(Tensor self, Tensor B, *, bool upper, bool left=True, bool unitriangular=False) -> Tensor python_module: linalg - variants: method, function + variants: function dispatch: CPU, CUDA: linalg_solve_triangular +- func: linalg_vander(Tensor x, *, int? N=None) -> Tensor + python_module: linalg + - func: symeig.e(Tensor self, bool eigenvectors=False, bool upper=True, *, Tensor(a!) e, Tensor(b!) V) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors) dispatch: CompositeExplicitAutograd: symeig_out @@ -7086,21 +7568,6 @@ CPU: _cholesky_solve_helper_cpu CUDA: _cholesky_solve_helper_cuda -- func: solve(Tensor self, Tensor A) -> (Tensor solution, Tensor LU) - variants: function, method - dispatch: - CompositeExplicitAutograd: solve - -- func: solve.solution(Tensor self, Tensor A, *, Tensor(a!) solution, Tensor(b!) lu) -> (Tensor(a!) solution, Tensor(b!) LU) - dispatch: - CompositeExplicitAutograd: solve_out - -- func: _solve_helper(Tensor self, Tensor A) -> (Tensor, Tensor) - variants: function - dispatch: - CPU: _solve_helper_cpu - CUDA: _solve_helper_cuda - - func: cholesky_inverse(Tensor self, bool upper=False) -> Tensor variants: method, function dispatch: @@ -7151,13 +7618,14 @@ dispatch: CPU, CUDA: lu_solve +# lu_unpack - func: lu_unpack(Tensor LU_data, Tensor LU_pivots, bool unpack_data=True, bool unpack_pivots=True) -> (Tensor P, Tensor L, Tensor U) + structured_delegate: lu_unpack.out variants: function - dispatch: - CPU, CUDA: lu_unpack - func: lu_unpack.out(Tensor LU_data, Tensor LU_pivots, bool unpack_data=True, bool unpack_pivots=True, *, Tensor(a!) P, Tensor(b!) L, Tensor(c!) 
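This hunk removes the long-deprecated solve/_solve_helper entries and adds linalg_vander. The replacements live under torch.linalg; note that torch.linalg.solve takes A first, unlike the removed torch.solve (sketch):

    import torch
    A = torch.randn(3, 3)
    b = torch.randn(3, 1)
    x = torch.linalg.solve(A, b)                                # replaces torch.solve(b, A)[0]
    V = torch.linalg.vander(torch.tensor([1., 2., 3.]), N=4)    # Vandermonde matrix, shape (3, 4)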
U) -> (Tensor(a!) P, Tensor(b!) L, Tensor(c!) U) variants: function + structured: True dispatch: CPU, CUDA: lu_unpack_out @@ -7281,6 +7749,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: sign_out + MPS: sign_out_mps SparseCPU, SparseCUDA: sign_sparse_out SparseCsrCPU, SparseCsrCUDA: sign_sparse_csr_out @@ -7312,6 +7781,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: atan2_out + MPS: atan2_mps_out - func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!) device_check: NoCheck # TensorIterator @@ -7398,6 +7868,12 @@ dispatch: CPU: histogramdd_cpu +- func: histogramdd(Tensor self, int[] bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges) + +- func: histogramdd.int_bins(Tensor self, int bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges) + +- func: histogramdd.TensorList_bins(Tensor self, Tensor[] bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges) + - func: fmod.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator dispatch: @@ -7535,6 +8011,7 @@ variants: method, function dispatch: CPU, CUDA: min + MPS: min_mps QuantizedCPU: min_quantized_cpu - func: fmin(Tensor self, Tensor other) -> Tensor @@ -7554,6 +8031,7 @@ variants: method, function dispatch: CPU, CUDA: max + MPS: max_mps QuantizedCPU: max_quantized_cpu - func: fmax(Tensor self, Tensor other) -> Tensor @@ -7579,6 +8057,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: maximum_out + MPS: maximum_out_mps # binary max, alias of maximum # NOTE: max is not an alias for maximum, since there is also unary max @@ -7600,6 +8079,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: minimum_out + MPS: minimum_out_mps # binary min, alias for minimum # NOTE: min is not an alias for minimum, since there is also unary min @@ -7633,27 +8113,23 @@ - func: sort.values(Tensor self, int dim=-1, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) device_check: NoCheck # TensorIterator dispatch: - CPU: sort_out_cpu - CUDA: sort_out_cuda + CompositeExplicitAutograd: sort_out - func: sort.values_stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) + structured: True dispatch: - CPU: sort_out_cpu_stable - CUDA: sort_out_stable_cuda + CPU, CUDA: sort_stable_out - func: sort(Tensor self, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices) device_check: NoCheck # TensorIterator variants: method, function dispatch: - CPU: sort_cpu - CUDA: sort_cuda - QuantizedCPU: sort_quantized_cpu + CompositeExplicitAutograd: sort - func: sort.stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices) + structured_delegate: sort.values_stable variants: method, function dispatch: - CPU: sort_cpu_stable - CUDA: sort_stable_cuda QuantizedCPU: sort_quantized_cpu_stable - func: sort.dimname_values(Tensor self, Dimname dim, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) 
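The histogramdd overloads above (TensorList, int, and int[] bins) expose the multi-dimensional histogram; per the dispatch shown, the underlying kernel is CPU-only at this point. Typical use:

    import torch
    pts = torch.rand(100, 3)                        # 100 points in 3-D
    hist, edges = torch.histogramdd(pts, bins=[4, 5, 6])
    hist.shape                                      # torch.Size([4, 5, 6])
    len(edges)                                      # 3 bin-edge tensors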
indices) @@ -7683,6 +8159,7 @@ dispatch: CPU: topk_out_cpu CUDA: topk_out_cuda + MPS: topk_out_mps - func: topk(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices) variants: method, function @@ -7700,6 +8177,7 @@ structured: True dispatch: CPU, CUDA: all_all_out + MPS: all_all_out_mps - func: any(Tensor self) -> Tensor device_check: NoCheck # TensorIterator @@ -7713,6 +8191,7 @@ structured: True dispatch: CPU, CUDA: any_all_out + MPS: any_all_out_mps - func: renorm.out(Tensor self, Scalar p, int dim, Scalar maxnorm, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator @@ -7735,7 +8214,7 @@ device_check: NoCheck device_guard: False dispatch: - CPU, CUDA: unfold + CPU, CUDA, Meta: unfold QuantizedCPU, QuantizedCUDA: unfold - func: unfold_backward(Tensor grad_in, int[] input_sizes, int dim, int size, int step) -> Tensor @@ -7756,6 +8235,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: pow_Tensor_Tensor_out + MPS: pow_tensor_tensor_out_mps - func: pow.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor device_check: NoCheck # TensorIterator @@ -7779,6 +8259,7 @@ dispatch: CPU, CUDA: pow_Tensor_Scalar_out SparseCPU, SparseCUDA: pow_out_sparse_scalar + MPS: pow_tensor_scalar_out_mps - func: pow.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor device_check: NoCheck # TensorIterator @@ -7822,32 +8303,45 @@ variants: method dispatch: CPU, CUDA: normal_ + MPS: normal_mps_ Meta: normal_meta_ SparseCsrCPU, SparseCsrCUDA: normal_sparse_csr_ - func: normal.Tensor_float_out(Tensor mean, float std=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) dispatch: CPU, CUDA: normal_out + MPS: normal_mps_out + Meta: normal_out_meta - func: normal.Tensor_float(Tensor mean, float std=1, *, Generator? generator=None) -> Tensor dispatch: CPU, CUDA: normal + #MPS: normal_mps + Meta: normal_meta - func: normal.float_Tensor_out(float mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) dispatch: CPU, CUDA: normal_out + Meta: normal_out_meta + MPS: normal_mps_out - func: normal.float_Tensor(float mean, Tensor std, *, Generator? generator=None) -> Tensor dispatch: CPU, CUDA: normal + Meta: normal_meta + #MPS: normal_mps - func: normal.Tensor_Tensor_out(Tensor mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) dispatch: CPU, CUDA: normal_out + Meta: normal_out_meta + MPS: normal_mps_out - func: normal.Tensor_Tensor(Tensor mean, Tensor std, *, Generator? generator=None) -> Tensor dispatch: CPU, CUDA: normal + Meta: normal_meta + #MPS: normal_mps - func: normal.float_float(float mean, float std, int[] size, *, Generator? generator=None, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -7868,17 +8362,18 @@ dispatch: CUDA: _amp_update_scale_cuda_ -- func: _cat(Tensor[] tensors, int dim=0) -> Tensor - dispatch: - CPU: _cat_cpu - CUDA: cat_cuda - QuantizedCPU: cat_quantized_cpu +#- func: _cat(Tensor[] tensors, int dim=0) -> Tensor + #dispatch: + #CPU: _cat_cpu + #CUDA: cat_cuda + #MPS: cat_mps + #QuantizedCPU: cat_quantized_cpu -- func: _cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU: _cat_out_cpu - CUDA: cat_out_cuda - QuantizedCPU: cat_out_quantized_cpu +#- func: _cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!) 
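The normal.* entries above gain Meta kernels and (partially commented-out) MPS kernels for each mean/std overload. For reference, the three Tensor/float combinations they correspond to:

    import torch
    mean = torch.arange(1., 4.)
    std = torch.tensor([0.1, 0.2, 0.3])
    torch.normal(mean, 1.0)     # Tensor_float overload
    torch.normal(0.0, std)      # float_Tensor overload
    torch.normal(mean, std)     # Tensor_Tensor overload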
+ #dispatch: + #CPU: _cat_out_cpu + #CUDA: cat_out_cuda + #QuantizedCPU: cat_out_quantized_cpu - func: _foreach_add.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -8586,25 +9081,29 @@ - func: mse_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase python_module: nn dispatch: CPU, CUDA: mse_loss_out + MPS: mse_loss_out_mps - func: mse_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor device_check: NoCheck # TensorIterator + structured_delegate: mse_loss.out python_module: nn - dispatch: - CPU, CUDA: mse_loss - func: mse_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: CPU, CUDA: mse_loss_backward_out + MPS: mse_loss_backward_out_mps - func: mse_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor python_module: nn dispatch: CPU, CUDA: mse_loss_backward + MPS: mse_loss_backward_mps - func: l1_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -8695,6 +9194,7 @@ dispatch: CPU: nll_loss_forward_out_cpu CUDA: nll_loss_forward_out_cuda + MPS: nll_loss_forward_out_mps - func: nll_loss_forward(Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index) -> (Tensor output, Tensor total_weight) python_module: nn @@ -8706,6 +9206,7 @@ dispatch: CPU: nll_loss_backward_out_cpu CUDA: nll_loss_backward_out_cuda + MPS: nll_loss_backward_out_mps - func: nll_loss_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, Tensor total_weight) -> Tensor python_module: nn @@ -8722,24 +9223,28 @@ dispatch: CPU: nll_loss2d_forward_out_cpu CUDA: nll_loss2d_forward_out_cuda + MPS: nll_loss2d_forward_out_mps - func: nll_loss2d_forward(Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index) -> (Tensor output, Tensor total_weight) python_module: nn dispatch: CPU: nll_loss2d_forward_cpu CUDA: nll_loss2d_forward_cuda + MPS: nll_loss2d_forward_mps - func: nll_loss2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, Tensor total_weight, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: CPU: nll_loss2d_backward_out_cpu CUDA: nll_loss2d_backward_out_cuda + MPS: nll_loss2d_backward_out_mps - func: nll_loss2d_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, Tensor total_weight) -> Tensor python_module: nn dispatch: CPU: nll_loss2d_backward_cpu CUDA: nll_loss2d_backward_cuda + MPS: nll_loss2d_backward_mps - func: smooth_l1_loss.out(Tensor self, Tensor target, int reduction=Mean, float beta=1.0, *, Tensor(a!) out) -> Tensor(a!) 
device_check: NoCheck # TensorIterator @@ -8748,6 +9253,7 @@ python_module: nn dispatch: CPU, CUDA: smooth_l1_loss_out + MPS: smooth_l1_loss_out_mps - func: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean, float beta=1.0) -> Tensor device_check: NoCheck # TensorIterator @@ -8759,6 +9265,7 @@ dispatch: CPU: smooth_l1_loss_backward_out CUDA: smooth_l1_loss_backward_out + MPS: smooth_l1_loss_backward_out_mps - func: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta) -> Tensor python_module: nn @@ -8812,6 +9319,7 @@ python_module: nn dispatch: CPU, CUDA: elu_out + MPS: elu_out_mps - func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor structured_delegate: elu.out @@ -8824,6 +9332,7 @@ python_module: nn dispatch: CPU, CUDA: elu_backward_out + MPS: elu_backward_out_mps - func: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, bool is_result, Tensor self_or_result) -> Tensor structured_delegate: elu_backward.grad_input @@ -8860,6 +9369,16 @@ CPU: glu_backward_cpu CUDA: glu_backward_cuda +- func: glu_jvp(Tensor glu, Tensor x, Tensor dx, int dim) -> Tensor + python_module: nn + dispatch: + CPU, CUDA: glu_jvp + +- func: glu_backward_jvp(Tensor grad_x, Tensor grad_glu, Tensor x, Tensor dgrad_glu, Tensor dx, int dim) -> Tensor + python_module: nn + dispatch: + CPU, CUDA: glu_backward_jvp + - func: hardsigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase @@ -8896,31 +9415,33 @@ device_check: NoCheck # TensorIterator python_module: nn dispatch: - CPU, CUDA: hardtanh_out + CPU, CUDA, MPS: hardtanh_out QuantizedCPU: hardtanh_out_quantized_cpu - func: hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) -> Tensor device_check: NoCheck # TensorIterator python_module: nn dispatch: - CPU, CUDA: hardtanh + CPU, CUDA, MPS: hardtanh QuantizedCPU: hardtanh_quantized_cpu - func: hardtanh_backward.grad_input(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: CPU, CUDA: hardtanh_backward_out + MPS: hardtanh_backward_out_mps - func: hardtanh_backward(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val) -> Tensor python_module: nn dispatch: CPU, CUDA: hardtanh_backward + MPS: hardtanh_backward_mps - func: hardtanh_(Tensor(a!) self, Scalar min_val=-1, Scalar max_val=1) -> Tensor(a!) device_check: NoCheck # TensorIterator python_module: nn dispatch: - CPU, CUDA: hardtanh_ + CPU, CUDA, MPS: hardtanh_ QuantizedCPU: hardtanh_quantized_cpu_ - func: hardswish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
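The new glu_jvp / glu_backward_jvp kernels above back forward-mode differentiation of glu. A hedged sketch of driving that through the public forward-AD API:

    import torch
    import torch.nn.functional as F
    import torch.autograd.forward_ad as fwAD

    x = torch.randn(4, 6)
    t = torch.randn(4, 6)                        # tangent direction for the JVP
    with fwAD.dual_level():
        dual = fwAD.make_dual(x, t)
        out = F.glu(dual, dim=-1)
        jvp = fwAD.unpack_dual(out).tangent      # forward-mode derivative of glu at x along t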
@@ -8953,6 +9474,7 @@ python_module: nn dispatch: CPU, CUDA: leaky_relu_out + MPS: leaky_relu_out_mps QuantizedCPU: leaky_relu_out_quantized_cpu - func: leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor @@ -8968,6 +9490,7 @@ python_module: nn dispatch: CPU, CUDA: leaky_relu_backward_out + MPS: leaky_relu_backward_out_mps - func: leaky_relu_backward(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result) -> Tensor structured_delegate: leaky_relu_backward.grad_input @@ -9090,6 +9613,7 @@ dispatch: CPU: adaptive_avg_pool2d_out_cpu CUDA: adaptive_avg_pool2d_out_cuda + MPS: adaptive_avg_pool2d_out_mps MkldnnCPU: mkldnn_adaptive_avg_pool2d_out - func: adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor @@ -9107,13 +9631,16 @@ dispatch: CPU: adaptive_avg_pool2d_cpu CUDA: adaptive_avg_pool2d_cuda + MPS: adaptive_avg_pool2d_mps QuantizedCPU: adaptive_avg_pool2d_quantized_cpu + QuantizedCUDA: adaptive_avg_pool2d_quantized_cuda - func: _adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor python_module: nn dispatch: CPU: adaptive_avg_pool2d_backward_cpu CUDA: adaptive_avg_pool2d_backward_cuda + MPS: adaptive_avg_pool2d_backward_mps - func: adaptive_avg_pool3d.out(Tensor self, int[3] output_size, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -9201,6 +9728,7 @@ dispatch: CPU: avg_pool2d_out_cpu CUDA: avg_pool2d_out_cuda + MPS: avg_pool2d_out_mps MkldnnCPU: mkldnn_avg_pool2d_out - func: avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor @@ -9216,6 +9744,7 @@ dispatch: CPU: avg_pool2d_backward_out_cpu CUDA: avg_pool2d_backward_out_cuda + MPS: avg_pool2d_backward_out_mps MkldnnCPU: mkldnn_avg_pool2d_backward_out - func: avg_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? divisor_override) -> Tensor @@ -9313,6 +9842,7 @@ dispatch: CPU: max_pool2d_with_indices_out_cpu CUDA: max_pool2d_with_indices_out_cuda + MPS: max_pool2d_with_indices_out_mps # Return: (Tensor output, Tensor indices) - func: max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) @@ -9325,6 +9855,7 @@ dispatch: CPU: max_pool2d_with_indices_backward_out_cpu CUDA: max_pool2d_with_indices_backward_out_cuda + MPS: max_pool2d_with_indices_backward_out_mps - func: max_pool2d_with_indices_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices) -> Tensor python_module: nn @@ -9368,18 +9899,6 @@ CPU: max_unpooling2d_forward_cpu CUDA: max_unpooling2d_forward_cuda -- func: max_unpool2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, int[2] output_size, *, Tensor(a!) grad_input) -> Tensor(a!) - python_module: nn - dispatch: - CPU: max_unpooling2d_backward_out_cpu - CUDA: max_unpooling2d_backward_out_cuda - -- func: max_unpool2d_backward(Tensor grad_output, Tensor self, Tensor indices, int[2] output_size) -> Tensor - python_module: nn - dispatch: - CPU: max_unpooling2d_backward_cpu - CUDA: max_unpooling2d_backward_cuda - - func: max_unpool3d.out(Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding, *, Tensor(a!) out) -> Tensor(a!) 
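The explicit max_unpool2d_backward kernels are deleted in this hunk (and the 3-D counterparts just below); the forward unpooling ops remain, with their gradients presumably expressed through regular derivative formulas instead. Public usage is unchanged:

    import torch
    import torch.nn.functional as F
    x = torch.randn(1, 1, 4, 4, requires_grad=True)
    y, idx = F.max_pool2d(x, kernel_size=2, return_indices=True)
    z = F.max_unpool2d(y, idx, kernel_size=2)
    z.sum().backward()                     # still differentiable after the kernel removal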
python_module: nn dispatch: @@ -9392,30 +9911,18 @@ CPU: max_unpooling3d_forward_cpu CUDA: max_unpooling3d_forward_cuda -- func: max_unpool3d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding, *, Tensor(a!) grad_input) -> Tensor(a!) - python_module: nn - dispatch: - CPU: max_unpooling3d_backward_out_cpu - CUDA: max_unpooling3d_backward_out_cuda - -- func: max_unpool3d_backward(Tensor grad_output, Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding) -> Tensor - python_module: nn - dispatch: - CPU: max_unpooling3d_backward_cpu - CUDA: max_unpooling3d_backward_cuda - - func: reflection_pad1d.out(Tensor self, int[2] padding, *, Tensor(a!) out) -> Tensor(a!) python_module: nn structured: True dispatch: - CPU, QuantizedCPU: reflection_pad1d_out_cpu + CPU: reflection_pad1d_out_cpu + QuantizedCPU: reflection_pad1d_out_quantized_cpu CUDA: reflection_pad1d_out_cuda + MPS: reflection_pad1d_out_mps - func: reflection_pad1d(Tensor self, int[2] padding) -> Tensor python_module: nn structured_delegate: reflection_pad1d.out - dispatch: - QuantizedCPU: reflection_pad1d_cpu - func: reflection_pad1d_backward.grad_input(Tensor grad_output, Tensor self, int[2] padding, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -9423,6 +9930,7 @@ dispatch: CPU: reflection_pad1d_backward_out_cpu CUDA: reflection_pad1d_backward_out_cuda + MPS: reflection_pad1d_backward_out_mps - func: reflection_pad1d_backward(Tensor grad_output, Tensor self, int[2] padding) -> Tensor python_module: nn @@ -9433,24 +9941,29 @@ dispatch: CPU, QuantizedCPU: reflection_pad2d_out_cpu CUDA: reflection_pad2d_out_cuda + MPS: reflection_pad2d_out_mps - func: reflection_pad2d(Tensor self, int[4] padding) -> Tensor python_module: nn dispatch: - CPU, QuantizedCPU: reflection_pad2d_cpu + CPU: reflection_pad2d_cpu + QuantizedCPU: reflection_pad2d_quantized_cpu CUDA: reflection_pad2d_cuda + MPS: reflection_pad2d_mps - func: reflection_pad2d_backward.grad_input(Tensor grad_output, Tensor self, int[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: CPU: reflection_pad2d_backward_out_cpu CUDA: reflection_pad2d_backward_out_cuda + MPS: reflection_pad2d_backward_out_mps - func: reflection_pad2d_backward(Tensor grad_output, Tensor self, int[4] padding) -> Tensor python_module: nn dispatch: CPU: reflection_pad2d_backward_cpu CUDA: reflection_pad2d_backward_cuda + MPS: reflection_pad2d_backward_mps - func: reflection_pad3d.out(Tensor self, int[6] padding, *, Tensor(a!) out) -> Tensor(a!) 
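The reflection_pad* / replication_pad* kernels above (now with MPS variants and a separate quantized CPU path) sit behind the non-constant modes of torch.nn.functional.pad, which the entries just below also promote to a native function. A usage sketch:

    import torch
    import torch.nn.functional as F
    x = torch.arange(9.).reshape(1, 1, 3, 3)
    F.pad(x, (1, 1, 1, 1), mode="reflect")      # reflection_pad2d
    F.pad(x, (1, 1, 1, 1), mode="replicate")    # replication_pad2d
    F.pad(x, (1, 1, 1, 1), mode="circular")     # _pad_circular
    F.pad(x, (1, 1, 1, 1), value=0.0)           # default constant mode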
python_module: nn @@ -9458,6 +9971,7 @@ dispatch: CPU: reflection_pad3d_out_cpu CUDA: reflection_pad3d_out_cuda + MPS: reflection_pad3d_out_mps - func: reflection_pad3d(Tensor self, int[6] padding) -> Tensor python_module: nn @@ -9469,6 +9983,7 @@ dispatch: CPU: reflection_pad3d_backward_out_cpu CUDA: reflection_pad3d_backward_out_cuda + MPS: reflection_pad3d_backward_out_mps - func: reflection_pad3d_backward(Tensor grad_output, Tensor self, int[6] padding) -> Tensor python_module: nn @@ -9480,6 +9995,7 @@ dispatch: CPU: replication_pad1d_out_cpu CUDA: replication_pad1d_out_cuda + MPS: replication_pad1d_out_mps - func: replication_pad1d(Tensor self, int[2] padding) -> Tensor python_module: nn @@ -9491,6 +10007,7 @@ dispatch: CPU: replication_pad1d_backward_out_cpu CUDA: replication_pad1d_backward_out_cuda + MPS: replication_pad1d_backward_out_mps - func: replication_pad1d_backward(Tensor grad_output, Tensor self, int[2] padding) -> Tensor python_module: nn @@ -9502,6 +10019,7 @@ dispatch: CPU: replication_pad2d_out_cpu CUDA: replication_pad2d_out_cuda + MPS: replication_pad2d_out_mps - func: replication_pad2d(Tensor self, int[4] padding) -> Tensor python_module: nn @@ -9512,12 +10030,14 @@ dispatch: CPU: replication_pad2d_backward_out_cpu CUDA: replication_pad2d_backward_out_cuda + MPS: replication_pad2d_backward_out_mps - func: replication_pad2d_backward(Tensor grad_output, Tensor self, int[4] padding) -> Tensor python_module: nn dispatch: CPU: replication_pad2d_backward_cpu CUDA: replication_pad2d_backward_cuda + MPS: replication_pad2d_backward_mps - func: replication_pad3d.out(Tensor self, int[6] padding, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -9525,6 +10045,7 @@ dispatch: CPU: replication_pad3d_out_cpu CUDA: replication_pad3d_out_cuda + MPS: replication_pad3d_out_mps - func: replication_pad3d(Tensor self, int[6] padding) -> Tensor python_module: nn @@ -9535,19 +10056,30 @@ dispatch: CPU: replication_pad3d_backward_out_cpu CUDA: replication_pad3d_backward_out_cuda + MPS: replication_pad3d_backward_out_mps - func: replication_pad3d_backward(Tensor grad_output, Tensor self, int[6] padding) -> Tensor python_module: nn dispatch: CPU: replication_pad3d_backward_cpu CUDA: replication_pad3d_backward_cuda + MPS: replication_pad3d_backward_mps -- func: upsample_linear1d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor +- func: _pad_circular(Tensor self, int[] pad) -> Tensor python_module: nn - dispatch: - CompositeExplicitAutograd: upsample_linear1d -- func: upsample_linear1d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, bool align_corners, float[]? scale_factors) -> Tensor +- func: _pad_enum(Tensor self, int[] pad, int mode, float? value=None) -> Tensor + python_module: nn + +- func: pad(Tensor self, int[] pad, str mode="constant", float? value=None) -> Tensor + python_module: nn + +- func: upsample_linear1d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor + python_module: nn + dispatch: + CompositeExplicitAutograd: upsample_linear1d + +- func: upsample_linear1d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, bool align_corners, float[]? 
scale_factors) -> Tensor python_module: nn dispatch: CompositeExplicitAutograd: upsample_linear1d_backward @@ -9697,6 +10229,7 @@ dispatch: CPU: upsample_bilinear2d_out_cpu CUDA: upsample_bilinear2d_out_cuda + MPS: upsample_bilinear2d_out_mps - func: upsample_bilinear2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor python_module: nn @@ -9710,6 +10243,7 @@ dispatch: CPU: upsample_bilinear2d_backward_out_cpu CUDA: upsample_bilinear2d_backward_out_cuda + MPS: upsample_bilinear2d_backward_out_mps - func: upsample_bilinear2d_backward(Tensor grad_output, int[2] output_size, int[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor python_module: nn @@ -9853,6 +10387,7 @@ dispatch: CPU: upsample_nearest2d_out_cpu CUDA: upsample_nearest2d_out_cuda + MPS: upsample_nearest2d_out_mps - func: _upsample_nearest_exact2d.out(Tensor self, int[2] output_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -9860,6 +10395,7 @@ dispatch: CPU: _upsample_nearest_exact2d_out_cpu CUDA: _upsample_nearest_exact2d_out_cuda + MPS: _upsample_nearest_exact2d_out_mps - func: upsample_nearest2d(Tensor self, int[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor python_module: nn @@ -9879,6 +10415,7 @@ dispatch: CPU: upsample_nearest2d_backward_out_cpu CUDA: upsample_nearest2d_backward_out_cuda + MPS: upsample_nearest2d_backward_out_mps - func: _upsample_nearest_exact2d_backward.grad_input(Tensor grad_output, int[2] output_size, int[4] input_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -9886,6 +10423,7 @@ dispatch: CPU: _upsample_nearest_exact2d_backward_out_cpu CUDA: _upsample_nearest_exact2d_backward_out_cuda + MPS: _upsample_nearest_exact2d_backward_out_mps - func: upsample_nearest2d_backward(Tensor grad_output, int[2] output_size, int[4] input_size, float? scales_h=None, float? scales_w=None) -> Tensor python_module: nn @@ -9949,6 +10487,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: sigmoid_backward_out + MPS: sigmoid_backward_out_mps - func: sigmoid_backward(Tensor grad_output, Tensor output) -> Tensor python_module: nn @@ -9971,6 +10510,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: tanh_backward_out + MPS: tanh_backward_out_mps - func: tanh_backward(Tensor grad_output, Tensor output) -> Tensor python_module: nn @@ -10236,6 +10776,19 @@ dispatch: CPU, CUDA: special_ndtri_out +- func: special_log_ndtr(Tensor self) -> Tensor + structured_delegate: special_log_ndtr.out + python_module: special + variants: function + +- func: special_log_ndtr.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + python_module: special + variants: function + dispatch: + CPU, CUDA: special_log_ndtr_out + - func: special_expm1(Tensor self) -> Tensor python_module: special variants: function @@ -10489,7 +11042,7 @@ - func: special_polygamma(int n, Tensor self) -> Tensor python_module: special - variants: function, method + variants: function - func: special_polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!) python_module: special @@ -10786,6 +11339,8 @@ python_module: linalg variants: function structured_delegate: linalg_cross.out + dispatch: + ZeroTensor: linalg_cross_zerotensor - func: linalg_cross.out(Tensor self, Tensor other, *, int dim=-1, Tensor(a!) out) -> Tensor(a!) 
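special_log_ndtr above adds a numerically stable log of the standard normal CDF to torch.special. Compared with taking the log of ndtr directly (sketch):

    import torch
    x = torch.tensor([-40., 0., 5.], dtype=torch.float64)
    torch.special.log_ndtr(x)           # finite everywhere
    torch.special.ndtr(x).log()         # underflows to -inf at x = -40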
python_module: linalg @@ -10816,6 +11371,20 @@ dispatch: CPU, CUDA: linalg_lu_factor_ex_out +# linalg.lu +- func: linalg_lu(Tensor A, *, bool pivot=True) -> (Tensor P, Tensor L, Tensor U) + python_module: linalg + structured_delegate: linalg_lu.out + variants: function + +- func: linalg_lu.out(Tensor A, *, bool pivot=True, Tensor(a!) P, Tensor(b!) L, Tensor(c!) U) -> (Tensor(a!) P, Tensor(b!) L, Tensor(c!) U) + python_module: linalg + variants: function + structured: True + dispatch: + CPU, CUDA: linalg_lu_out + +# linalg.det - func: linalg_det(Tensor self) -> Tensor python_module: linalg variants: function @@ -10837,6 +11406,38 @@ dispatch: CPU, CUDA: _det_lu_based_helper_backward_helper +- func: linalg_ldl_factor_ex(Tensor self, *, bool hermitian=False, bool check_errors=False) -> (Tensor LD, Tensor pivots, Tensor info) + structured_delegate: linalg_ldl_factor_ex.out + python_module: linalg + variants: function + +- func: linalg_ldl_factor_ex.out(Tensor self, *, bool hermitian=False, bool check_errors=False, Tensor(a!) LD, Tensor(b!) pivots, Tensor(c!) info) -> (Tensor(a!) LD, Tensor(b!) pivots, Tensor(c!) info) + structured: True + python_module: linalg + variants: function + dispatch: + CPU, CUDA: linalg_ldl_factor_ex_out + +- func: linalg_ldl_factor(Tensor self, *, bool hermitian=False) -> (Tensor LD, Tensor pivots) + python_module: linalg + variants: function + +- func: linalg_ldl_factor.out(Tensor self, *, bool hermitian=False, Tensor(a!) LD, Tensor(b!) pivots) -> (Tensor(a!) LD, Tensor(b!) pivots) + python_module: linalg + variants: function + +- func: linalg_ldl_solve(Tensor LD, Tensor pivots, Tensor B, *, bool hermitian=False) -> Tensor + structured_delegate: linalg_ldl_solve.out + python_module: linalg + variants: function + +- func: linalg_ldl_solve.out(Tensor LD, Tensor pivots, Tensor B, *, bool hermitian=False, Tensor(a!) out) -> Tensor(a!) + structured: True + python_module: linalg + variants: function + dispatch: + CPU, CUDA: linalg_ldl_solve_out + - func: linalg_lstsq(Tensor self, Tensor b, float? rcond=None, *, str? driver=None) -> (Tensor solution, Tensor residuals, Tensor rank, Tensor singular_values) python_module: linalg variants: function @@ -10983,11 +11584,11 @@ - func: linalg_vector_norm(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor python_module: linalg variants: function - dispatch: - CPU, CUDA: linalg_vector_norm + structured_delegate: linalg_vector_norm.out - func: linalg_vector_norm.out(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) python_module: linalg + structured: True dispatch: CPU, CUDA: linalg_vector_norm_out @@ -11111,13 +11712,13 @@ python_module: linalg variants: function -- func: linalg_qr(Tensor self, str mode='reduced') -> (Tensor Q, Tensor R) +- func: linalg_qr(Tensor A, str mode='reduced') -> (Tensor Q, Tensor R) python_module: linalg variants: function dispatch: CompositeExplicitAutograd: linalg_qr -- func: linalg_qr.out(Tensor self, str mode='reduced', *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R) +- func: linalg_qr.out(Tensor A, str mode='reduced', *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R) python_module: linalg variants: function dispatch: @@ -11237,3 +11838,447 @@ - func: unflatten_dense_tensors(Tensor flat, Tensor[] tensors) -> Tensor[] variants: function python_module: nn + +- func: nested_tensor(Tensor[] list, ScalarType? dtype=None, Layout? layout=None, Device? 
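This hunk adds a structured linalg_lu and the linalg_ldl_factor / linalg_ldl_solve family, makes linalg_vector_norm structured, and renames linalg_qr's argument to A. A hedged sketch of the new factorizations via the torch.linalg namespace, assuming they are exposed there in this release:

    import torch
    A = torch.randn(3, 3, dtype=torch.float64)
    P, L, U = torch.linalg.lu(A)
    torch.dist(P @ L @ U, A)                                 # ~0

    S = A @ A.mT + 3 * torch.eye(3, dtype=torch.float64)     # symmetric positive definite
    LD, pivots = torch.linalg.ldl_factor(S)
    B = torch.randn(3, 2, dtype=torch.float64)
    X = torch.linalg.ldl_solve(LD, pivots, B)
    torch.dist(S @ X, B)                                     # ~0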
device=None, bool? pin_memory=None) -> Tensor + variants: function + +- func: _fw_primal_copy(Tensor self, int level) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _fw_primal_copy + tags: view_copy + +- func: _make_dual_copy(Tensor primal, Tensor tangent, int level) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _make_dual_copy + tags: view_copy + +- func: view_as_real_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: view_as_real_copy + tags: view_copy + +- func: view_as_complex_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: view_as_complex_copy + tags: view_copy + +- func: _conj_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _conj_copy + tags: view_copy + +- func: _neg_view_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _neg_view_copy + tags: view_copy + +- func: as_strided_copy(Tensor self, int[] size, int[] stride, int? storage_offset=None) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: as_strided_copy + tags: view_copy + +- func: _sparse_broadcast_to_copy(Tensor self, int[] size) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _sparse_broadcast_to_copy + tags: view_copy + +- func: diagonal_copy(Tensor self, int offset=0, int dim1=0, int dim2=1) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: diagonal_copy + tags: view_copy + +- func: expand_copy(Tensor self, int[] size, *, bool implicit=False) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: expand_copy + tags: view_copy + +- func: expand_copy.SymInt(Tensor self, SymInt[] size, *, bool implicit=False) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: expand_copy_SymInt + tags: view_copy + +- func: permute_copy(Tensor self, int[] dims) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: permute_copy + tags: view_copy + +- func: _reshape_alias_copy(Tensor self, int[] size, int[] stride) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _reshape_alias_copy + tags: view_copy + +- func: select_copy.int(Tensor self, int dim, int index) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: select_copy_int + tags: view_copy + +- func: detach_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: detach_copy + tags: view_copy + +- func: slice_copy.Tensor(Tensor self, int dim=0, int? start=None, int? 
end=None, int step=1) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: slice_copy_Tensor + tags: view_copy + +- func: split_copy.Tensor(Tensor self, int split_size, int dim=0) -> Tensor[] + variants: function + dispatch: + CompositeExplicitAutograd: split_copy_Tensor + tags: view_copy + +- func: split_with_sizes_copy(Tensor self, int[] split_sizes, int dim=0) -> Tensor[] + variants: function + dispatch: + CompositeExplicitAutograd: split_with_sizes_copy + tags: view_copy + +- func: squeeze_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: squeeze_copy + tags: view_copy + +- func: squeeze_copy.dim(Tensor self, int dim) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: squeeze_copy_dim + tags: view_copy + +- func: t_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: t_copy + tags: view_copy + +- func: transpose_copy.int(Tensor self, int dim0, int dim1) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: transpose_copy_int + tags: view_copy + +- func: unsqueeze_copy(Tensor self, int dim) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: unsqueeze_copy + tags: view_copy + +- func: _indices_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _indices_copy + tags: view_copy + +- func: _values_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _values_copy + tags: view_copy + +- func: indices_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: indices_copy + tags: view_copy + +- func: values_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: values_copy + tags: view_copy + +- func: crow_indices_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: crow_indices_copy + tags: view_copy + +- func: col_indices_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: col_indices_copy + tags: view_copy + +- func: ccol_indices_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: ccol_indices_copy + tags: view_copy + +- func: row_indices_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: row_indices_copy + tags: view_copy + +- func: unbind_copy.int(Tensor self, int dim=0) -> Tensor[] + variants: function + dispatch: + CompositeExplicitAutograd: unbind_copy_int + tags: view_copy + +- func: view_copy(Tensor self, int[] size) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: view_copy + tags: view_copy + +- func: view_copy.dtype(Tensor self, ScalarType dtype) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: view_copy_dtype + tags: view_copy + +- func: unfold_copy(Tensor self, int dimension, int size, int step) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: unfold_copy + tags: view_copy + +- func: alias_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: alias_copy + tags: view_copy + +- func: _fw_primal_copy.out(Tensor self, int level, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: _fw_primal_copy_out + + +- func: _make_dual_copy.out(Tensor primal, Tensor tangent, int level, *, Tensor(a!) out) -> Tensor(a!) 
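
The linalg_lu and linalg_ldl_* schemas registered earlier in this hunk correspond to torch.linalg.lu, torch.linalg.ldl_factor and torch.linalg.ldl_solve on the Python side. A rough usage sketch, assuming a build that contains these entries:

import torch

A = torch.randn(4, 4, dtype=torch.float64)
P, L, U = torch.linalg.lu(A)                  # pivoted LU: A = P @ L @ U
print(torch.dist(P @ L @ U, A))               # ~0

S = A @ A.mT + 4 * torch.eye(4, dtype=torch.float64)   # symmetric positive definite
LD, pivots = torch.linalg.ldl_factor(S)
B = torch.randn(4, 2, dtype=torch.float64)
X = torch.linalg.ldl_solve(LD, pivots, B)
print(torch.dist(S @ X, B))                   # ~0
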
+ variants: function + dispatch: + CompositeExplicitAutograd: _make_dual_copy_out + + +- func: view_as_real_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: view_as_real_copy_out + + +- func: view_as_complex_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: view_as_complex_copy_out + + +- func: _conj_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: _conj_copy_out + + +- func: _neg_view_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: _neg_view_copy_out + + +- func: as_strided_copy.out(Tensor self, int[] size, int[] stride, int? storage_offset=None, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: as_strided_copy_out + + +- func: _sparse_broadcast_to_copy.out(Tensor self, int[] size, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: _sparse_broadcast_to_copy_out + + +- func: diagonal_copy.out(Tensor self, int offset=0, int dim1=0, int dim2=1, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: diagonal_copy_out + + +- func: expand_copy.SymInt_out(Tensor self, SymInt[] size, *, bool implicit=False, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: expand_copy_SymInt_out + + +- func: expand_copy.out(Tensor self, int[] size, *, bool implicit=False, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: expand_copy_out + + +- func: permute_copy.out(Tensor self, int[] dims, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: permute_copy_out + + +- func: _reshape_alias_copy.out(Tensor self, int[] size, int[] stride, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: _reshape_alias_copy_out + + +- func: select_copy.int_out(Tensor self, int dim, int index, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: select_copy_int_out + + +- func: detach_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: detach_copy_out + + +- func: slice_copy.Tensor_out(Tensor self, int dim=0, int? start=None, int? end=None, int step=1, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: slice_copy_Tensor_out + + +- func: split_copy.Tensor_out(Tensor self, int split_size, int dim=0, *, Tensor(a!)[] out) -> () + variants: function + dispatch: + CompositeExplicitAutograd: split_copy_Tensor_out + + +- func: split_with_sizes_copy.out(Tensor self, int[] split_sizes, int dim=0, *, Tensor(a!)[] out) -> () + variants: function + dispatch: + CompositeExplicitAutograd: split_with_sizes_copy_out + + +- func: squeeze_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: squeeze_copy_out + + +- func: squeeze_copy.dim_out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: squeeze_copy_dim_out + + +- func: t_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
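
The *_copy functions in this block are non-aliasing counterparts of the corresponding view ops: same values, independent storage. Assuming the function variants are exposed under the torch namespace (for example torch.transpose_copy), the difference is observable like this:

import torch

x = torch.zeros(2, 3)
view = x.transpose(0, 1)               # aliases x's storage
copy = torch.transpose_copy(x, 0, 1)   # assumed binding name; owns its own storage
x[0, 0] = 1.0
print(view[0, 0].item())               # 1.0, the view sees the write
print(copy[0, 0].item())               # 0.0, the copy does not
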
+ variants: function + dispatch: + CompositeExplicitAutograd: t_copy_out + + +- func: transpose_copy.int_out(Tensor self, int dim0, int dim1, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: transpose_copy_int_out + + +- func: unsqueeze_copy.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: unsqueeze_copy_out + + +- func: _indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: _indices_copy_out + + +- func: _values_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: _values_copy_out + + +- func: indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: indices_copy_out + + +- func: values_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: values_copy_out + + +- func: crow_indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: crow_indices_copy_out + + +- func: col_indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: col_indices_copy_out + + +- func: unbind_copy.int_out(Tensor self, int dim=0, *, Tensor(a!)[] out) -> () + variants: function + dispatch: + CompositeExplicitAutograd: unbind_copy_int_out + + +- func: view_copy.out(Tensor self, int[] size, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: view_copy_out + + +- func: view_copy.dtype_out(Tensor self, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: view_copy_dtype_out + + +- func: unfold_copy.out(Tensor self, int dimension, int size, int step, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: unfold_copy_out + + +- func: alias_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: alias_copy_out + +- func: to_padded_tensor(Tensor self, float padding, int[]? output_size=None) -> Tensor + variants: method + dispatch: + NestedTensorCPU: NestedTensor_to_padded_tensor_generic + NestedTensorCUDA: NestedTensor_to_padded_tensor_cuda + +- func: _nested_tensor_layer_norm(Tensor self, Tensor? weight, Tensor? bias, float eps) -> Tensor + variants: method + dispatch: + NestedTensorCPU, NestedTensorCUDA: NestedTensor_layer_norm + +# Apparently, putting "forward" in the name will cause Python bindings to be skipped, so "fwd" it is. +- func: _transformer_encoder_layer_fwd(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None) -> Tensor + variants: function + dispatch: + CPU, CUDA, NestedTensorCPU, NestedTensorCUDA: transformer_encoder_layer_forward + +- func: _native_multi_head_attention(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? 
mask=None, bool need_weights=True, bool average_attn_weights=True) -> (Tensor, Tensor) + variants: function + dispatch: + CPU, CUDA, NestedTensorCPU, NestedTensorCUDA: native_multi_head_attention diff --git a/aten/src/ATen/native/nested/NestedTensorMath.cpp b/aten/src/ATen/native/nested/NestedTensorMath.cpp new file mode 100644 index 000000000000..d4f3338fb4cc --- /dev/null +++ b/aten/src/ATen/native/nested/NestedTensorMath.cpp @@ -0,0 +1,551 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { + +namespace { +template +Tensor map_nt(const Tensor& nt, Func f) { + auto* nt_impl = get_nested_tensor_impl(nt); + const auto& sizes = nt_impl->get_nested_size_tensor(); + return at::detail::make_tensor(f(nt_impl->get_buffer()), sizes); +} + +c10::optional maybe_get_consistent_last_dim_of_nested_tensor( + const NestedTensorImpl& nt) { + const auto& sizes = nt.get_nested_size_tensor(); + // The last entry in every row of sizes must be the same. + const auto& last_dims = sizes.select(1, -1); + const auto last_dims_accessor = last_dims.packed_accessor64(); + // REVIEW: this can't be the most efficient and concise way to + // write this check, can it? + const auto last_dim_value = last_dims_accessor[0]; + for (const auto i : c10::irange(1, last_dims.numel())) { + if (last_dims_accessor[i] != last_dim_value) { + return c10::nullopt; + } + } + return last_dim_value; +} + +int64_t num_bytes(IntArrayRef sizes) { + // 0-dim Tensors have torch.Size of .size() 0, but carry 1 memory. + // Empty 1-dim Tensors (torch.tensor([])) have torch.Size of .size() 1, + // but carry 0 memory. + int64_t result = 1; + int64_t stride = 1; + for (int ii = sizes.size() - 1; ii >= 0; --ii) { + result += (sizes[ii] - 1) * stride; + // TODO: accept strides as input when we support them instead of + // assuming contiguous. 
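
All helpers in this file operate on the packed NestedTensor layout: one flat contiguous buffer plus an (ntensors x ndim) size matrix, with each component occupying a contiguous slice whose length is the product of its row of sizes. A rough Python illustration of that layout and of what NestedTensor_unbind recovers (variable names here are illustrative, not the real internals):

import torch

t0 = torch.arange(6.0).reshape(2, 3)
t1 = torch.arange(3.0).reshape(1, 3)
buffer = torch.cat([t0.reshape(-1), t1.reshape(-1)])   # flat storage, shape (9,)
nested_sizes = torch.tensor([[2, 3], [1, 3]])          # one row of sizes per component
splits = nested_sizes.prod(dim=1).tolist()             # [6, 3] elements per component
components = [chunk.view(*size.tolist())
              for chunk, size in zip(buffer.split(splits), nested_sizes)]
print([c.shape for c in components])                   # [torch.Size([2, 3]), torch.Size([1, 3])]
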
+ stride *= sizes[ii]; + } + return result; +} + +std::vector NestedTensor_get_max_size_from_size_tensor(const Tensor& sizes) { + if (sizes.dim() == 0) { + return {}; + } + const auto sizes_ptr = sizes.data_ptr(); + const auto sizes_size_0 = sizes.sizes()[0]; + const auto sizes_size_1 = sizes.sizes()[1]; + TORCH_INTERNAL_ASSERT(sizes_size_1 > 0); + std::vector results(sizes_size_1, 0); + for (const auto ii : c10::irange(sizes_size_0)) { + for (const auto jj : c10::irange(sizes_size_1)) { + auto val = sizes_ptr[ii * sizes_size_1 + jj]; + if (results[jj] < val) { + results[jj] = val; + } + } + } + return results; +} + +Tensor pad_tensor_to_shape( + const Tensor& t, + IntArrayRef goal_shape, + double value = 0) { + std::vector padd; + auto tup = t.sizes(); + TORCH_CHECK( + t.dim() == (int64_t)(goal_shape.size()), + "dimension ", + t.dim(), + " doesn't match length ", + goal_shape.size(), + " of goal shape."); + for (int64_t i = tup.size() - 1; i >= 0; i--) { + padd.push_back(0); + padd.push_back(goal_shape[i] - tup[i]); + } + Tensor new_tensor = at::constant_pad_nd(t, IntArrayRef(padd), value); + new_tensor = new_tensor.reshape(goal_shape); + return new_tensor; +} +} // namespace + +at::Tensor wrap_buffer(at::Tensor buffer, at::Tensor nested_size_tensor) { + TORCH_CHECK(buffer.is_contiguous(), "Given buffer must be contiguous."); + return at::detail::make_tensor( + std::move(buffer), std::move(nested_size_tensor)); +} + +inline const at::Tensor& get_buffer(const at::Tensor& tensor) { + return get_nested_tensor_impl(tensor)->get_buffer(); +} + +inline const at::Tensor& get_nested_size_tensor(const at::Tensor& tensor) { + return get_nested_tensor_impl(tensor)->get_nested_size_tensor(); +} + +// CPU only! +// TODO: The algorithm here can be optimized, right now it involves a lot of +// small tensor manipulations +std::vector NestedTensor_unbind( + const at::Tensor& self, + int64_t dim) { + TORCH_CHECK( + dim == 0, + "NestedTensor can only be unbound along dimension 0 ", + "got dimension ", + dim, + " instead."); + auto esizes = get_nested_size_tensor(self); + std::vector result_tensors; + if (esizes.dim() == 0) { + return result_tensors; + } + auto esizes_chunks = esizes.unbind(0); + std::vector splits; + for (const auto i : c10::irange(esizes_chunks.size())) { + splits.push_back(esizes_chunks[i].prod().item()); + } + auto buffer_chunks = at::split_with_sizes(get_buffer(self), splits); + for (const auto i : c10::irange(buffer_chunks.size())) { + const auto& esize_chunk = esizes_chunks[i]; + result_tensors.push_back(buffer_chunks[i].view(IntArrayRef( + esize_chunk.data_ptr(), + esize_chunk.data_ptr() + esize_chunk.numel()))); + } + return result_tensors; +} + +Tensor& NestedTensor_relu_(Tensor& self) { + at::relu_(const_cast(get_nested_tensor_impl(self)->get_buffer())); + return self; +} + +Tensor NestedTensor_relu(const Tensor& self) { + return map_nt(self, at::relu); +} + +Tensor& NestedTensor_gelu_(Tensor& self, c10::string_view approximate) { + at::gelu_(const_cast(get_nested_tensor_impl(self)->get_buffer()), approximate); + return self; +} + +Tensor NestedTensor_gelu(const Tensor& self, c10::string_view approximate) { + return map_nt( + self, + [approximate](const Tensor& buffer) { + return at::gelu(buffer, approximate); + }); +} + +Tensor NestedTensor_nested_tensor_from_mask(const Tensor& t, const Tensor& mask) { + TORCH_CHECK(mask.scalar_type() == at::ScalarType::Bool, "Expected mask to be of ScalarType Bool, but got ", mask.scalar_type(), " instead."); + TORCH_CHECK(mask.dim() == 2, 
"Padding mask should be 2D"); + TORCH_CHECK(t.dim() == 3, "Input should be a 3D tensor, N * L * D"); + auto N = t.size(0), L = t.size(1), D = t.size(2); + auto NN = mask.size(0), LL = mask.size(1); + TORCH_CHECK(N == NN && L == LL, "Mask size should match input size"); + + // N * L + Tensor sizes = mask; + Tensor tmp_pad = at::zeros({N, 1}, mask.options()); + // Make sure padding is only added at the end of mask + Tensor nums = at::cat({sizes, tmp_pad}, 1).to(kInt).argmin(1); + + // N, ([size1, size2, ... sizeN]) + sizes = sizes.cumsum(1).select(1, L - 1); + nums = nums.to(sizes.options()); + + TORCH_CHECK(sizes.equal(nums), "Mask must be left-aligned without gaps"); + + sizes = sizes.reshape({N, 1}); + // N, ([d1=D, d2=D, ... dN=D]) + Tensor d = at::full_like(sizes, D); + + // N * 2, ([[size1, D], [size2, D], ..., [sizeN, D]]) + sizes = at::cat({sizes, d}, 1); + + return at::_nested_from_padded(t, sizes, false); +} + +Tensor nested_tensor( + TensorList list, + c10::optional dtype, + c10::optional layout, + c10::optional device, + c10::optional pin_memory) { + TensorOptions options_ = + TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory( + pin_memory); + + if (list.size() == 0) { + return wrap_buffer(ones({0}, dtype, layout, device), ones({})); + } + std::vector sizes; + std::vector flat_tensors; + for (const auto i : c10::irange(list.size())) { + if (i > 0) { + int64_t dim_i = list[i].dim(); + int64_t dim_prev = list[i - 1].dim(); + TORCH_CHECK( + dim_i == dim_prev, + "All Tensors given to nested_tensor must have the same dimension. ", + "Found dimension ", + dim_i, + " for Tensor at index ", + i, + " and dimension ", + dim_prev, + " for Tensor at index ", + i - 1, + "."); + } + // TODO: Remove call to contiguous once we support strides. 
+ flat_tensors.push_back(list[i].reshape(-1).contiguous()); + sizes.push_back(tensor(c10::IntArrayRef(list[i].sizes()))); + } + + TensorOptions options = flat_tensors[0].options().merge_in(options_); + + return wrap_buffer( + at::cat(flat_tensors).to(options), at::native::stack(sizes)); +} + +int64_t get_consistent_last_dim_of_nested_tensor(const NestedTensorImpl& nt) { + auto result = maybe_get_consistent_last_dim_of_nested_tensor(nt); + TORCH_CHECK( + result.has_value(), + "all tensors in NestedTensor must have the same trailing dim for Matmul but got ", + nt.get_nested_size_tensor().select(1, -1)); + return *result; +} + +std::vector NestedTensor_get_max_size(const NestedTensorImpl& nt) { + return NestedTensor_get_max_size_from_size_tensor(nt.get_nested_size_tensor()); +} + +Tensor NestedTensor_layer_norm( + const Tensor& input, + const c10::optional& weight_opt, + const c10::optional& bias_opt, + double eps) { + TORCH_CHECK(weight_opt && bias_opt, "NestedTensor layer_norm requires weight and bias"); + const auto& weight = *weight_opt; + const auto& bias = *bias_opt; + TORCH_CHECK(!weight.is_nested(), "NestedTensor weight not supported for layer_norm"); + TORCH_CHECK(!bias.is_nested(), "NestedTensor bias not supported for layer_norm"); + auto* nt_input = get_nested_tensor_impl(input); + TORCH_CHECK(nested_tensor_impl_is_contiguous(nt_input)); + const auto& input_buffer = nt_input->get_buffer(); + const auto last_dim = get_consistent_last_dim_of_nested_tensor(*nt_input); + const auto valid_word_num = input_buffer.numel() / last_dim; + const auto weight_contig = weight.expect_contiguous(); + const auto bias_contig = bias.expect_contiguous(); + auto output_buffer = at::native::empty_like( + input_buffer, + c10::nullopt /* dtype */, + c10::nullopt /* layout */, + c10::nullopt /* device */, + c10::nullopt /* pin_memory */, + at::MemoryFormat::Contiguous); + auto options = input_buffer.options(); + if (input_buffer.is_cuda()) { + auto acc_type = at::toAccumulateType(input_buffer.scalar_type(), true); + options = options.dtype(acc_type); + } + Tensor mean = at::empty({valid_word_num}, options); + Tensor rstd = at::empty({valid_word_num}, options); + LayerNormKernel( + input_buffer.is_cuda() ? 
kCUDA : kCPU, + input_buffer, + *weight_contig, + *bias_contig, + valid_word_num, + last_dim, + eps, + &output_buffer, + &mean, + &rstd); + return at::detail::make_tensor( + std::move(output_buffer), nt_input->get_nested_size_tensor()); +} + +Tensor NestedTensor_from_padded_and_nested_example( + const Tensor& padded, + const Tensor& nt_example) { + return _nested_from_padded(padded, get_nested_tensor_impl(nt_example)->get_nested_size_tensor()); +} + +Tensor nested_from_padded_generic( + const Tensor& padded, + const Tensor& sizes, + const bool do_transform_0213) { + // Check and do transform 0213 + auto padded_transformed = padded; + if (do_transform_0213) { + padded_transformed = padded.permute({0, 2, 1, 3}) + .contiguous() + .view( + {padded.size(0), + padded.size(2), + padded.size(1) * padded.size(3)}); + } + const auto target_size = NestedTensor_get_max_size_from_size_tensor(sizes); + IntArrayRef target_size_arr(target_size); + std::vector masks; + std::vector all_sizes = sizes.unbind(); + for (const auto& size : all_sizes) { + IntArrayRef sizes_i( + size.data_ptr(), size.data_ptr() + size.numel()); + at::Tensor mask_i = padded_transformed.new_full( + sizes_i, true, kBool, c10::nullopt, c10::nullopt, c10::nullopt); + masks.push_back(pad_tensor_to_shape(mask_i, target_size_arr)); + } + at::Tensor final_mask = at::stack(masks); + at::Tensor new_buffer = padded_transformed.masked_select(final_mask); + return at::detail::make_tensor( + std::move(new_buffer), sizes); +} + +Tensor NestedTensor_to_padded_tensor_generic( + const Tensor& t, + double padding, + OptionalIntArrayRef output_size) { + // TODO: skipped optimization for case of all 1x1 tensors + auto& nt = *get_nested_tensor_impl(t); + auto max_size = NestedTensor_get_max_size(nt); + auto sizes = nt.get_nested_size_tensor(); + + if (sizes.numel() == 0 || sizes.dim() == 0) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(nt.get_buffer().numel() == 0); + return nt.get_buffer(); + } + + // TODO: doesn't handle empty/scalar entries because we don't need + // it for transformers; see to_padded_tensor in + // pytorch/nestedtensor's masking.cpp. 
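
to_padded_tensor materializes a NestedTensor as a regular dense tensor, padding every component up to the per-dimension maximum (or up to an explicit output_size). With the bindings added in this diff (torch.nested_tensor and the Tensor.to_padded_tensor method; later releases expose the same functionality under torch.nested), the behaviour looks roughly like:

import torch

nt = torch.nested_tensor([torch.ones(2, 3), torch.ones(4, 3)])   # binding name per this diff
padded = nt.to_padded_tensor(0.0)
print(padded.shape)            # torch.Size([2, 4, 3]); the short component is zero-padded
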
+ + const auto sizes_num_rows = sizes.sizes()[0]; + const auto sizes_num_columns = sizes.sizes()[1]; + const auto sizes_data_start = sizes.data_ptr(); + const auto sizes_data_end = sizes_data_start + sizes.numel(); + std::vector split_sizes; + split_sizes.reserve(sizes_num_rows); + for (auto sizes_data = sizes_data_start; sizes_data != sizes_data_end; + sizes_data += sizes_num_columns) { + split_sizes.push_back( + num_bytes(IntArrayRef(sizes_data, sizes_num_columns))); + } + std::vector nonzero_split_sizes; + for (const auto split_size : split_sizes) { + if (split_size > 0) { + nonzero_split_sizes.push_back(split_size); + } + } + const auto buffer = nt.get_buffer(); + std::vector buffers_; + if (!nonzero_split_sizes.empty()) { + buffers_ = at::split_with_sizes(buffer, nonzero_split_sizes, 0); + } + + std::vector buffers; + buffers.reserve(split_sizes.size()); + int64_t next_buffer = 0; + auto sizes_ptr = sizes_data_start; + for (const auto split_size : split_sizes) { + Tensor to_pad; + IntArrayRef tensor_sizes(sizes_ptr, sizes_num_columns); + if (split_size > 0) { + to_pad = buffers_[next_buffer++].reshape(tensor_sizes); + } else { + to_pad = at::empty(tensor_sizes, buffer.options()); + } + buffers.push_back(pad_tensor_to_shape(to_pad, max_size, padding)); + sizes_ptr += sizes_num_columns; + } + auto ret_val = at::stack(buffers); + + // Pad output tensor to output_size if provided + if (output_size.has_value()) { + auto output_size_ = output_size.value(); + TORCH_CHECK( + (int64_t)output_size_.size() == ret_val.dim(), + "Length of output_size does not match NestedTensor dims. Broadcasting is not supported."); + for (int64_t i = 0; i < (int64_t)ret_val.dim(); i++) { + TORCH_CHECK( + output_size_[i] >= ret_val.size(i), + "Value in output_size is less than NestedTensor padded size. 
Truncation is not supported."); + } + return pad_tensor_to_shape(ret_val, output_size_, padding); + } + return ret_val; +} + +Tensor NestedTensor_embedding( + const Tensor& weight, + const Tensor& indices, + int64_t padding_idx, + bool scale_grad_by_freq, + bool sparse) { + const auto* nt_indices = get_nested_tensor_impl(indices); + TORCH_CHECK( + !weight.is_nested(), "NestedTensor weight not supported for embedding"); + TORCH_CHECK(indices.dim() < 3); + TORCH_CHECK(indices.dim() > 0, "NestedTensor embedding doesn't support empty indices.") + TORCH_CHECK(weight.dim() == 2); + TORCH_CHECK(nested_tensor_impl_is_contiguous(nt_indices)); + TORCH_CHECK(weight.is_contiguous()); + + const auto& indices_buffer = nt_indices->get_buffer(); + auto result_buffer = at::embedding( + weight, indices_buffer, padding_idx, scale_grad_by_freq, sparse); + const auto& sizes = nt_indices->get_nested_size_tensor(); + auto new_sizes = at::empty({sizes.size(0)}, sizes.options()); + new_sizes.fill_(weight.sizes()[1]); + new_sizes = new_sizes.reshape({new_sizes.size(0), 1}); + new_sizes = at::cat({sizes, new_sizes}, 1); + return at::detail::make_tensor( + result_buffer.reshape({-1}), std::move(new_sizes)); +} + +std::pair +get_elementwise_nested_tensor_impl( + const Tensor& self, + const Tensor& other, + const std::string& op_name) { + if (self.is_nested() && !(other.is_nested())) { + TORCH_CHECK( + false, + "Expected both self and other to be nested, but got a nested self and non-nested other"); + } else if (!(self.is_nested()) && other.is_nested()) { + TORCH_CHECK( + false, + "Expected both self and other to be nested, but got a non-nested self and nested other"); + } else if (!(self.is_nested()) || !(other.is_nested())) { + TORCH_CHECK( + false, + "Expected both self and other to be nested, but got a non-nested self and non-nested other"); + } + + auto self_ptr = get_nested_tensor_impl(self); + auto other_ptr = get_nested_tensor_impl(other); + + TORCH_CHECK( + self.dim() == other.dim(), + op_name, + " does not support broadcasting when given a NestedTensor"); + TORCH_CHECK( + at::equal( + self_ptr->get_nested_size_tensor(), + other_ptr->get_nested_size_tensor()), + op_name, + " does not support broadcasting when given a NestedTensor"); + TORCH_CHECK( + nested_tensor_impl_is_contiguous(self_ptr) && + nested_tensor_impl_is_contiguous(other_ptr), + op_name, + " does not support non-contiguous NestedTensor inputs"); + return std::make_pair(self_ptr, other_ptr); +} + +template +Tensor NestedTensor_elementwise_Tensor( + const Tensor& self, + const Tensor& other, + const std::string& op_name, + Func f) { + NestedTensorImpl* self_impl = nullptr; + NestedTensorImpl* other_impl = nullptr; + std::tie(self_impl, other_impl) = + get_elementwise_nested_tensor_impl(self, other, op_name); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self_impl); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(other_impl); + const auto& nt_self = *self_impl; + const auto& nt_other = *other_impl; + const auto& self_sizes = nt_self.get_nested_size_tensor(); + return wrap_buffer( + f(nt_self.get_buffer().reshape({-1}), + nt_other.get_buffer().reshape({-1})), + self_sizes); +} + +Tensor NestedTensor_add_Tensor( + const Tensor& self, + const Tensor& other, + const Scalar& alpha) { + return NestedTensor_elementwise_Tensor( + self, other, "add", [alpha](const Tensor& b1, const Tensor& b2) { + return at::add(b1, b2, alpha); + }); +} + +Tensor NestedTensor_mul_Tensor(const Tensor& self, const Tensor& other) { + return NestedTensor_elementwise_Tensor( + self, other, "mul", 
[](const Tensor& b1, const Tensor& b2) { + return at::mul(b1, b2); + }); +} + +template +Tensor& NestedTensor_elementwise__Tensor( + Tensor& self, + const Tensor& other, + const std::string& op_name, + Func f) { + NestedTensorImpl* self_impl = nullptr; + NestedTensorImpl* other_impl = nullptr; + std::tie(self_impl, other_impl) = + get_elementwise_nested_tensor_impl(self, other, op_name); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self_impl); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(other_impl); + const auto& nt_self = *self_impl; + const auto& nt_other = *other_impl; + f(nt_self.get_buffer().view({-1}), nt_other.get_buffer().view({-1})); + return self; +} + +Tensor& NestedTensor_add__Tensor( + Tensor& self, + const Tensor& other, + const Scalar& alpha) { + return NestedTensor_elementwise__Tensor( + self, other, "add_", [alpha](const Tensor& b1, const Tensor& b2) { + return b1.add_(b2, alpha); + }); +} + +Tensor& NestedTensor_mul__Tensor(Tensor& self, const Tensor& other) { + return NestedTensor_elementwise__Tensor( + self, other, "mul_", [](const Tensor& b1, const Tensor& b2) { + return b1.mul_(b2); + }); +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/nested/NestedTensorMath.h b/aten/src/ATen/native/nested/NestedTensorMath.h new file mode 100644 index 000000000000..8f2919fc35b8 --- /dev/null +++ b/aten/src/ATen/native/nested/NestedTensorMath.h @@ -0,0 +1,20 @@ +#pragma once + +#include +#include + +#include + +namespace at { +namespace native { +struct NestedTensorImpl; + +// TODO: cache this and only do it once per NestedTensor +int64_t get_consistent_last_dim_of_nested_tensor(const NestedTensorImpl& nt); + +TORCH_API std::vector NestedTensor_get_max_size(const NestedTensorImpl& nt); + +TORCH_API Tensor NestedTensor_to_padded_tensor_generic(const Tensor& t, double padding, OptionalIntArrayRef output_size); + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.cpp b/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.cpp new file mode 100644 index 000000000000..6ca4ff7e22a7 --- /dev/null +++ b/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.cpp @@ -0,0 +1,142 @@ +#include + +#include +#include +#include +#include + +namespace at { +namespace native { + +Tensor NestedTensor_matmul(const Tensor& self, const Tensor& other) { + auto* nt_self = get_nested_tensor_impl_or_null(self); + TORCH_CHECK(nt_self != nullptr); + TORCH_CHECK(nested_tensor_impl_is_contiguous(nt_self)); + TORCH_CHECK(self.dim() == 3 && other.dim() == 2); + const auto last_dim = get_consistent_last_dim_of_nested_tensor(*nt_self); + TORCH_CHECK( + last_dim == other.sizes()[0], + "shape mismatch for NestedTensor matmul. NestedTensor last_dim: ", + last_dim, + " vs. first dim of rhs: ", + other.sizes()[0]); + const Tensor& self_buffer = nt_self->get_buffer(); + Tensor result_buffer = + at::mm(self_buffer.reshape({-1, other.sizes()[0]}), other); + result_buffer = result_buffer.reshape({-1}); + int64_t other_size_1 = other.sizes()[1]; + Tensor new_sizes = nt_self->get_nested_size_tensor().clone(); + // Now the last entry in every row of new_sizes should be other_size_1. 
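
NestedTensor_matmul handles a nested (B x var x D) lhs times a dense (D x E) rhs with a single mm over the packed buffer, then rewrites the last column of the size table to E. Assuming matmul with a nested lhs and dense rhs is dispatched to this kernel, usage looks roughly like:

import torch

nt = torch.nested_tensor([torch.randn(2, 4), torch.randn(5, 4)])   # consistent last dim
w = torch.randn(4, 8)
out = torch.matmul(nt, w)                    # still nested; components are (2, 8) and (5, 8)
print([c.shape for c in out.unbind()])
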
+ new_sizes.index_put_({at::indexing::Slice(), -1}, other_size_1); + return at::detail::make_tensor( + std::move(result_buffer), std::move(new_sizes)); +} + +Tensor NestedTensor_times_Tensor_plus_Tensor_addmm( + const Tensor& self, + const Tensor& mat1, + const Tensor& mat2, + const c10::Scalar& beta, + const c10::Scalar& alpha, + c10::optional use_gelu) { + // Interesting case: alpha * NT * T + beta * T + const auto* nt_mat1 = get_nested_tensor_impl_or_null(mat1); + TORCH_INTERNAL_ASSERT(nt_mat1 != nullptr); + TORCH_INTERNAL_ASSERT(!mat2.is_nested()); + TORCH_INTERNAL_ASSERT(!self.is_nested()); + TORCH_INTERNAL_ASSERT(nested_tensor_impl_is_contiguous(nt_mat1)); + TORCH_INTERNAL_ASSERT(mat1.dim() == 3 && mat2.dim() == 2); + TORCH_INTERNAL_ASSERT( + get_consistent_last_dim_of_nested_tensor(*nt_mat1) == mat2.sizes()[0]); + const Tensor& mat1_buffer = nt_mat1->get_buffer(); + Tensor result_buffer = !use_gelu.has_value() + ? at::addmm( + self, mat1_buffer.reshape({-1, mat2.sizes()[0]}), mat2, beta, alpha) + : at::_addmm_activation( + self, + mat1_buffer.reshape({-1, mat2.sizes()[0]}), + mat2, + beta, + alpha, + *use_gelu); + result_buffer = result_buffer.reshape({-1}); + int64_t other_size_1 = mat2.sizes()[1]; + Tensor new_sizes = nt_mat1->get_nested_size_tensor().clone(); + new_sizes.index_put_({at::indexing::Slice(), -1}, other_size_1); + return at::detail::make_tensor( + std::move(result_buffer), std::move(new_sizes)); +} + +Tensor NestedTensor_add_NestedTensor_in_place( + const Tensor& self, + const Tensor& other) { + TORCH_INTERNAL_ASSERT(self.is_nested() && other.is_nested()); + const auto& nt_self = *get_nested_tensor_impl(self); + const auto& nt_other = *get_nested_tensor_impl(other); + + const auto& self_sizes = nt_self.get_nested_size_tensor(); + const auto& other_sizes = nt_other.get_nested_size_tensor(); + + TORCH_CHECK(at::equal(self_sizes, other_sizes)); + TORCH_INTERNAL_ASSERT( + nested_tensor_impl_is_contiguous(&nt_self) && + nested_tensor_impl_is_contiguous(&nt_other)); + nt_self.get_buffer().view({-1}).add_(nt_other.get_buffer().view({-1})); + return self; +} + +Tensor NestedTensor_batch_offsets_from_size_tensor( + const Tensor& sizes, + int64_t extra_elements) { + int64_t* const sizes_ptr = sizes.data_ptr(); + Tensor offsets = at::empty({1 + sizes.size(0) + extra_elements}, at::kInt); + int32_t* const offsets_ptr = offsets.data_ptr(); + offsets_ptr[0] = 0; + const auto sizes_size_1 = sizes.size(1); + const auto sizes_size_0 = sizes.size(0); + for (const auto i : c10::irange(sizes_size_0)) { + int64_t prod = 1; + for (const auto j : c10::irange(sizes_size_1)) { + prod *= sizes_ptr[i * sizes_size_1 + j]; + } + offsets_ptr[i + 1] = offsets_ptr[i] + prod; + } + return offsets; +} + +Tensor NestedTensor_to_mask(const Tensor& nt, c10::optional mask_dim) { + auto* nt_impl = get_nested_tensor_impl(nt); + TORCH_CHECK( + !mask_dim || *mask_dim < nt.dim(), + "Requested mask dimension ", + *mask_dim, + " is bigger than dimension ", + nt.dim(), + " of given NestedTensor."); + + // TODO: port optimization for 1x1 tensors from + // pytorch/nestedtensor's version. + + TORCH_CHECK( + mask_dim && *mask_dim == 2 && nt.dim() == 3, + "Only the special case of mask_dim == 2 on a 3-D NestedTensor is supported right now.") + const auto& sizes = nt_impl->get_nested_size_tensor(); + // Shape: # of tensors in our NestedTensor by max size along first dim + // TODO: calculate this without allocating a std::vector. 
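
The key-padding mask built here starts out all True (everything treated as padding) and then clears the first length positions of each row. A small CPU mirror of that logic, for illustration only:

import torch

def to_mask(lengths, max_len):
    mask = torch.ones(len(lengths), max_len, dtype=torch.bool)
    for row, length in enumerate(lengths):
        mask[row, :length] = False       # real (non-padding) positions become False
    return mask

print(to_mask([2, 3], 3))
# tensor([[False, False,  True],
#         [False, False, False]])
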
+ const auto result_size_1 = NestedTensor_get_max_size(*nt_impl)[0]; + auto result = at::ones({sizes.sizes()[0], result_size_1}, at::kBool); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(sizes.dim() == 2); + auto* result_data = result.data_ptr(); + auto* sizes_ptr = sizes.data_ptr(); + const auto sizes_size_1 = sizes.sizes()[1]; + for (const auto ii : c10::irange(sizes.sizes()[0])) { + auto length = sizes_ptr[ii * sizes_size_1]; + for (const auto jj : c10::irange(length)) { + result_data[ii * result_size_1 + jj] = false; + } + } + return result; +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.h b/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.h new file mode 100644 index 000000000000..a4b70d954c3f --- /dev/null +++ b/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.h @@ -0,0 +1,87 @@ +/** + * Transformer-specific NestedTensor utility functions. + * + * Not co-located with NestedTensor core code yet because they only + * support specific cases needed in transformers. + */ +#pragma once + +#include + +#include +#include + +namespace c10 { +class Scalar; +} // namespace c10 + +namespace at { +class Tensor; +namespace native { +struct NestedTensorImpl; + +// Requires that self is a contiguous NestedTensor, other is not a +// NestedTensor, self.dim() == 3, and other.dim() == 2. Also, self +// must have a consistent last dimension across its included Tensors +// and that dimension must match other.size(0). +Tensor NestedTensor_matmul(const Tensor& self, const Tensor& other); + +// Requires that mat1 is a contiguous NestedTensor, self & mat2 are +// not NestedTensors, mat1.dim() == 3, mat2.dim() == 2, and that mat1 +// has a consistent last dimension across its included Tensors that +// matches mat2.size(0). 
+Tensor NestedTensor_times_Tensor_plus_Tensor_addmm( + const Tensor& self, + const Tensor& mat1, + const Tensor& mat2, + const c10::Scalar& beta, + const c10::Scalar& alpha, + c10::optional use_gelu = c10::nullopt); + +Tensor NestedTensor_add_NestedTensor_in_place( + const Tensor& self, + const Tensor& other); + +TORCH_API Tensor NestedTensor_batch_offsets_from_size_tensor( + const Tensor& sizes, + int64_t extra_elements); + +Tensor NestedTensor_from_padded_tensor_cpu( + const Tensor& padded, + const NestedTensorImpl& nt); + +Tensor NestedTensor_to_mask(const Tensor& nt, c10::optional mask_dim); + +template +void remove_padding_kernelLauncher( + const T* input, + T* output, + const int* offsets, + const int* input_sizes, + const int* output_sizes, + int output_dim, + const int batch_size); + +template +void remove_padding_transform0213_kernelLauncher( + const T* input, + T* output, + const int* offsets, + const int* input_sizes, + const int* output_sizes, + int output_dim, + const int batch_size); + +template +void add_padding_kernelLauncher( + T* input, + T* output, + T padding_value, + const int* offsets, + const int* input_sizes, + int input_dim, + const std::vector& output_sizes, + const int batch_size, + const int output_batch_size); +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp new file mode 100644 index 000000000000..f1cf67676ced --- /dev/null +++ b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp @@ -0,0 +1,206 @@ +#include + +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + +#include +#include + +namespace at { +namespace native { +namespace { +int64_t padded_tensor_numel(const Tensor& sizes) { + const auto sizes_num_rows = sizes.sizes()[0]; + const auto sizes_row_length = sizes.sizes()[1]; + const auto* sizes_data = sizes.data_ptr(); + int64_t numel = 0; + for (const auto row_num : c10::irange(sizes_num_rows)) { + const auto* row_ptr = sizes_data + row_num * sizes_row_length; + int64_t prod = 1; + for (const auto idx : c10::irange(sizes_row_length)) { + prod *= row_ptr[idx]; + } + numel += prod; + } + return numel; +} +} // namespace +Tensor nested_from_padded_cuda( + const Tensor& padded, + const Tensor& sizes, + bool do_transform_0213) { + if (padded.dim() > 1 && padded.dim() < 5) { + if (padded.dtype() != kFloat && padded.dtype() != kHalf) { + TORCH_WARN_ONCE( + "nested_from_padded CUDA kernels only support fp32/fp16; falling " + "back to slower generic kernel"); + return at::native::nested_from_padded_generic(padded, sizes, do_transform_0213); + } + TORCH_CHECK( + (padded.dim() == 4 && do_transform_0213) || + (padded.dim() == 3 && !do_transform_0213), + "padded tensor size error"); + Tensor target_offsets = + NestedTensor_batch_offsets_from_size_tensor(sizes, 0); + Tensor padded_sizes_tensor = at::tensor(padded.sizes()); + Tensor output = at::empty({padded_tensor_numel(sizes)}, padded.options()); + Tensor target_size_sizes = sizes.reshape(-1); + + Tensor metadata = + at::cat({target_size_sizes, padded_sizes_tensor, target_offsets}); + metadata = metadata.to(at::Device(kCUDA), kInt, true, true); + + auto output_size_ptr = metadata.data_ptr(); + auto input_size_ptr = output_size_ptr + target_size_sizes.numel(); + auto offsets_ptr = input_size_ptr + padded_sizes_tensor.numel(); + + if (padded.dtype() == kFloat) { + if (do_transform_0213) { + 
remove_padding_transform0213_kernelLauncher( + padded.data_ptr(), + output.data_ptr(), + offsets_ptr, + input_size_ptr, + output_size_ptr, + padded.dim() - 2, + padded.sizes()[0]); + } else { + remove_padding_kernelLauncher( + padded.data_ptr(), + output.data_ptr(), + offsets_ptr, + input_size_ptr, + output_size_ptr, + padded.dim() - 1, + padded.sizes()[0]); + } + } else if (padded.dtype() == kHalf) { + if (do_transform_0213) { + remove_padding_transform0213_kernelLauncher( + padded.data_ptr(), + output.data_ptr(), + offsets_ptr, + input_size_ptr, + output_size_ptr, + padded.dim() - 2, + padded.sizes()[0]); + } else { + remove_padding_kernelLauncher( + padded.data_ptr(), + output.data_ptr(), + offsets_ptr, + input_size_ptr, + output_size_ptr, + padded.dim() - 1, + padded.sizes()[0]); + } + } else { + AT_ERROR("Only support fp32/fp16 for padded input"); + } + return at::detail::make_tensor(std::move(output), sizes); + } else { + return at::native::nested_from_padded_generic(padded, sizes); + } +} + +Tensor batch_offsets_from_efficient_size(const Tensor& ef_sizes) { + int64_t* nt_sizes_ptr = ef_sizes.data_ptr(); + int64_t ef_sizes_size_0 = ef_sizes.sizes()[0]; + Tensor offsets = at::empty({1 + ef_sizes_size_0}, at::kLong); + int64_t* offsets_ptr = offsets.data_ptr(); + offsets_ptr[0] = 0; + int64_t ef_sizes_size_1 = ef_sizes.sizes()[1]; + for (const auto i : c10::irange(ef_sizes_size_0)) { + int64_t prod = 1; + for (const auto j : c10::irange(ef_sizes_size_1)) { + prod = prod * nt_sizes_ptr[i * ef_sizes_size_1 + j]; + } + offsets_ptr[i + 1] = offsets_ptr[i] + prod; + } + return offsets; +} + +Tensor NestedTensor_to_padded_tensor_cuda( + const Tensor& t, + double padding, + OptionalIntArrayRef output_size) { + int64_t t_dim = t.dim(); + if (t_dim >= 2 && t_dim <= 4 && + (t.dtype() == at::kFloat || t.dtype() == at::kDouble || + t.dtype() == at::kHalf)) { + auto* nt_input = get_nested_tensor_impl(t); + TORCH_CHECK(nested_tensor_impl_is_contiguous(nt_input)); + const auto& nt_buffer = nt_input->get_buffer(); + + if (t_dim == 3 && nt_input->opt_size(2) && (*nt_input->opt_size(2) > 0) && + !(output_size.has_value())) { + Tensor nt_sizes = nt_input->get_nested_size_tensor(); + Tensor sizes_dim1 = at::native::narrow(nt_sizes, 1, 0, 1); + Tensor sizes_dim2 = at::native::narrow(nt_sizes, 1, 1, 1); + Tensor result = at::detail::make_tensor( + nt_input->get_buffer(), sizes_dim1 * sizes_dim2[0]); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(result.dim() == 2); + result = + NestedTensor_to_padded_tensor_cuda(result, padding, output_size); + return result.reshape({result.sizes()[0], -1, *nt_input->opt_size(2)}); + } + + Tensor nt_sizes = nt_input->get_nested_size_tensor(); + Tensor offsets = batch_offsets_from_efficient_size(nt_sizes); + auto new_size = NestedTensor_get_max_size(*nt_input); + new_size.insert(new_size.begin(), nt_sizes.sizes()[0]); + + // Pad output tensor to output_size if provided + if (output_size.has_value()) { + auto output_size_ = output_size.value(); + TORCH_CHECK( + output_size_.size() == new_size.size(), + "Length of output_size does not match NestedTensor dims. Broadcasting is not supported."); + for (uint64_t i = 0; i < new_size.size(); i++) { + TORCH_CHECK( + output_size_[i] >= new_size[i], + "Value in output_size is less than NestedTensor padded size. 
Truncation is not supported."); + new_size[i] = output_size_[i]; + } + } + + Tensor output = at::empty(IntArrayRef(new_size), nt_buffer.options()); + + int64_t input_dim = nt_sizes.sizes()[1]; + int64_t batch_size = nt_sizes.sizes()[0]; + int64_t output_batch_size = new_size[0]; + // TODO: Remove need for cat here + at::Tensor metadata = at::cat({offsets, nt_sizes.reshape(-1)}); + metadata = metadata.to(at::Device(kCUDA), at::kInt); + + std::vector split = + at::split_with_sizes(metadata, {offsets.numel(), nt_sizes.numel()}, 0); + + offsets = split[0]; + nt_sizes = split[1]; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + nt_buffer.scalar_type(), "NestedTensor_to_padded_tensor_cuda", [&]() { + add_padding_kernelLauncher( + nt_buffer.data_ptr(), + output.data_ptr(), + (scalar_t)(padding), + offsets.data_ptr(), + nt_sizes.data_ptr(), + input_dim, + new_size, + batch_size, + output_batch_size); + }); + return output; + } + return NestedTensor_to_padded_tensor_generic(t, padding, output_size); +} +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cu b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cu new file mode 100644 index 000000000000..7e9f95aad747 --- /dev/null +++ b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cu @@ -0,0 +1,449 @@ +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +namespace at { +namespace native { + +template +__global__ void remove_padding_transform0213_2( + const T* input, + T* output, + const int* offsets, + const int* input_sizes, + const int* output_sizes, + int output_dim, + const int batch_size) { + const int batch_id = blockIdx.x; + const int grid_id = blockIdx.y; + const int tid = threadIdx.x + grid_id * 256; + const int grainsize = 16 * 256; + const int offset = offsets[batch_id]; + const int* sizes_i = output_sizes + batch_id * output_dim; + const int numel_i = sizes_i[0] * sizes_i[1]; + int input_offset = + batch_id * input_sizes[1] * input_sizes[2] * input_sizes[3]; + for (int ii = 0; ii < (numel_i / grainsize); ii++) { + const int i = ii * grainsize + tid; + const int i2 = i / sizes_i[1]; + const int i13 = i % sizes_i[1]; + const int i1 = i13 / (sizes_i[1] / input_sizes[1]); + const int i3 = i13 % (sizes_i[1] / input_sizes[1]); + + output[offset + i] = input + [input_offset + i1 * input_sizes[2] * input_sizes[3] + + i2 * input_sizes[3] + i3]; + } + const int i = (numel_i / grainsize) * grainsize + tid; + if (i < numel_i) { + const int i2 = i / sizes_i[1]; + const int i13 = i % sizes_i[1]; + const int i1 = i13 / (sizes_i[1] / input_sizes[1]); + const int i3 = i13 % (sizes_i[1] / input_sizes[1]); + output[offset + i] = input + [input_offset + i1 * input_sizes[2] * input_sizes[3] + + i2 * input_sizes[3] + i3]; + } +} + +template +__global__ void remove_padding_2( + const T* input, + T* output, + const int* offsets, + const int* input_sizes, + const int* output_sizes, + int output_dim, + const int batch_size) { + const int batch_id = blockIdx.x; + const int grid_id = blockIdx.y; + const int tid = threadIdx.x + grid_id * 256; + const int grainsize = 16 * 256; + const int offset = offsets[batch_id]; + const int* sizes_i = output_sizes + batch_id * output_dim; + const int numel_i = sizes_i[0] * sizes_i[1]; + int input_offset = batch_id * input_sizes[1] * input_sizes[2]; + for (int ii = 0; ii < (numel_i / grainsize); ii++) { + const int i = ii * grainsize + tid; + const int 
i0 = i / sizes_i[1]; + const int i1 = i % sizes_i[1]; + const int i0_offset = i0 * input_sizes[2]; + output[offset + i] = input[input_offset + i0_offset + i1]; + } + const int i = (numel_i / grainsize) * grainsize + tid; + if (i < numel_i) { + const int i0 = i / sizes_i[1]; + const int i1 = i % sizes_i[1]; + const int i0_offset = i0 * input_sizes[2]; + output[offset + i] = input[input_offset + i0_offset + i1]; + } +} + +template +__global__ void remove_padding( + const T* input, + T* output, + const int* offsets, + const int* input_sizes, + const int* output_sizes, + int output_dim, + const int batch_size) { + const int batch_id = blockIdx.x; + const int grid_id = blockIdx.y; + const int tid = threadIdx.x + grid_id * 256; + const int grainsize = 16 * 256; + const int offset = offsets[batch_id]; + const int* sizes_i = output_sizes + batch_id * output_dim; + const int numel_i = sizes_i[0] * sizes_i[1] * sizes_i[2]; + int input_offset = + batch_id * input_sizes[1] * input_sizes[2] * input_sizes[3]; + for (int ii = 0; ii < (numel_i / grainsize); ii++) { + const int i = ii * grainsize + tid; + const int i0 = i / (sizes_i[1] * sizes_i[2]); + const int i1 = (i % (sizes_i[1] * sizes_i[2])) / sizes_i[2]; + const int i2 = i % sizes_i[2]; + const int i0_offset = i0 * input_sizes[2] * input_sizes[3]; + const int i1_offset = i1 * input_sizes[3]; + output[offset + i] = input[input_offset + i0_offset + i1_offset + i2]; + } + const int i = (numel_i / grainsize) * grainsize + tid; + if (i < numel_i) { + const int i0 = i / (sizes_i[1] * sizes_i[2]); + const int i1 = (i % (sizes_i[1] * sizes_i[2])) / sizes_i[2]; + const int i2 = i % sizes_i[2]; + const int i0_offset = i0 * input_sizes[2] * input_sizes[3]; + const int i1_offset = i1 * input_sizes[3]; + output[offset + i] = input[input_offset + i0_offset + i1_offset + i2]; + } +} + +template +void remove_padding_kernelLauncher( + const T* input, + T* output, + const int* offsets, + const int* input_sizes, + const int* output_sizes, + int output_dim, + const int batch_size) { + dim3 grid; + grid.x = batch_size; + grid.y = 16; + at::cuda::CUDAStream stream = at::cuda::getDefaultCUDAStream(); + if (output_dim == 2) { + remove_padding_2<<>>( + input, + output, + offsets, + input_sizes, + output_sizes, + output_dim, + batch_size); + } else { + remove_padding<<>>( + input, + output, + offsets, + input_sizes, + output_sizes, + output_dim, + batch_size); + } +} + +template +void remove_padding_transform0213_kernelLauncher( + const T* input, + T* output, + const int* offsets, + const int* input_sizes, + const int* output_sizes, + int output_dim, + const int batch_size) { + dim3 grid; + grid.x = batch_size; + grid.y = 16; + at::cuda::CUDAStream stream = at::cuda::getDefaultCUDAStream(); + TORCH_CHECK( + output_dim == 2, + "remove padding transform0213 only support output dim == 2"); + + remove_padding_transform0213_2<<>>( + input, + output, + offsets, + input_sizes, + output_sizes, + output_dim, + batch_size); +} + +template void remove_padding_kernelLauncher( + const float* input, + float* output, + const int* offsets, + const int* input_sizes, + const int* output_sizes, + int output_dim, + const int batch_size); + +template void remove_padding_kernelLauncher( + const c10::Half* input, + c10::Half* output, + const int* offsets, + const int* input_sizes, + const int* output_sizes, + int output_dim, + const int batch_size); + +template void remove_padding_transform0213_kernelLauncher( + const float* input, + float* output, + const int* offsets, + const int* input_sizes, 
+ const int* output_sizes, + int output_dim, + const int batch_size); + +template void remove_padding_transform0213_kernelLauncher( + const c10::Half* input, + c10::Half* output, + const int* offsets, + const int* input_sizes, + const int* output_sizes, + int output_dim, + const int batch_size); + +template +__global__ void add_padding_1( + const T* input, + T* output, + T padding_value, + const int* offsets, + const int* input_sizes, + int input_dim, + int output_sizes_1, + const int batch_size) { + const int batch_id = blockIdx.x; + const int grid_id = blockIdx.y; + const int tid = threadIdx.x + grid_id * 256; + const int grainsize = 16 * 256; + const int* sizes_i = input_sizes + batch_id * input_dim; + const int batch_output_offset = batch_id * output_sizes_1; + for (int ii = 0; ii < (output_sizes_1 / grainsize); ii++) { + const int i = ii * grainsize + tid; + const int output_offset = batch_output_offset + i; + if (batch_id < batch_size && i < sizes_i[0]) { + const int batch_input_offset = offsets[batch_id]; + output[output_offset] = input[batch_input_offset + i]; + } else { + output[output_offset] = padding_value; + } + } + const int i = (output_sizes_1 / grainsize) * grainsize + tid; + if (i < output_sizes_1) { + const int output_offset = batch_output_offset + i; + if (batch_id < batch_size && (i < sizes_i[0])) { + const int batch_input_offset = offsets[batch_id]; + output[output_offset] = input[batch_input_offset + i]; + } else { + output[output_offset] = padding_value; + } + } +} + +template +__global__ void add_padding_2( + const T* input, + T* output, + T padding_value, + const int* offsets, + const int* input_sizes, + int input_dim, + int output_sizes_1, + int output_sizes_2, + const int batch_size) { + const int batch_id = blockIdx.x; + const int grid_id = blockIdx.y; + const int tid = threadIdx.x + grid_id * 256; + const int grainsize = 16 * 256; + const int* sizes_i = input_sizes + batch_id * input_dim; + const int output_offset = batch_id * output_sizes_1 * output_sizes_2; + const int output_numel = output_sizes_1 * output_sizes_2; + for (int ii = 0; ii < (output_numel / grainsize); ii++) { + const int i = ii * grainsize + tid; + const int i0 = i / (output_sizes_2); + const int i1 = i - i0 * output_sizes_2; + if (batch_id < batch_size && i0 < sizes_i[0] && i1 < sizes_i[1]) { + const int offset = offsets[batch_id]; + const int input_offset = offset + i0 * sizes_i[1] + i1; + output[output_offset + i] = input[input_offset]; + } else { + output[output_offset + i] = padding_value; + } + } + const int i = (output_numel / grainsize) * grainsize + tid; + if (i < output_numel) { + const int i0 = i / (output_sizes_2); + const int i1 = i - i0 * output_sizes_2; + if (batch_id < batch_size && i0 < sizes_i[0] && i1 < sizes_i[1]) { + const int offset = offsets[batch_id]; + const int input_offset = offset + i0 * sizes_i[1] + i1; + output[output_offset + i] = input[input_offset]; + } else { + output[output_offset + i] = padding_value; + } + } +} + +template +__global__ void add_padding_3( + const T* input, + T* output, + T padding_value, + const int* offsets, + const int* input_sizes, + int input_dim, + int output_sizes_1, + int output_sizes_2, + int output_sizes_3, + const int batch_size) { + const int batch_id = blockIdx.x; + const int grid_id = blockIdx.y; + const int tid = threadIdx.x + grid_id * 256; + const int grainsize = 16 * 256; + const int* sizes_i = input_sizes + batch_id * input_dim; + const int output_offset = + batch_id * output_sizes_1 * output_sizes_2 * output_sizes_3; + 
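
The add_padding_* kernels share one pattern: each block handles a single batch entry, copies that entry's values out of the packed buffer (located via offsets) into its slot of the dense output, and writes padding_value everywhere else. A CPU reference for the 2-D case, for illustration only:

import torch

def add_padding_2d(buffer, sizes, out_rows, out_cols, padding_value=0.0):
    # buffer: packed values of all components; sizes: (batch, 2) true shapes per component.
    out = buffer.new_full((sizes.size(0), out_rows, out_cols), padding_value)
    offset = 0
    for i in range(sizes.size(0)):
        r, c = int(sizes[i, 0]), int(sizes[i, 1])
        out[i, :r, :c] = buffer[offset:offset + r * c].view(r, c)
        offset += r * c
    return out
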
const int output_numel = output_sizes_1 * output_sizes_2 * output_sizes_3; + for (int ii = 0; ii < (output_numel / grainsize); ii++) { + const int i = ii * grainsize + tid; + const int i0 = i / (output_sizes_2 * output_sizes_3); + const int i1 = (i % (output_sizes_2 * output_sizes_3)) / output_sizes_3; + const int i2 = i % output_sizes_3; + if (batch_id < batch_size && i0 < sizes_i[0] && i1 < sizes_i[1] && i2 < sizes_i[2]) { + const int offset = offsets[batch_id]; + const int input_offset = + offset + i0 * (sizes_i[1] * sizes_i[2]) + i1 * sizes_i[2] + i2; + output[output_offset + i] = input[input_offset]; + } else { + output[output_offset + i] = padding_value; + } + } + const int i = (output_numel / grainsize) * grainsize + tid; + if (i < output_numel) { + const int i0 = i / (output_sizes_2 * output_sizes_3); + const int i1 = (i % (output_sizes_2 * output_sizes_3)) / output_sizes_3; + const int i2 = i % output_sizes_3; + if (batch_id < batch_size && i0 < sizes_i[0] && i1 < sizes_i[1] && i2 < sizes_i[2]) { + const int offset = offsets[batch_id]; + const int input_offset = + offset + i0 * (sizes_i[1] * sizes_i[2]) + i1 * sizes_i[2] + i2; + output[output_offset + i] = input[input_offset]; + } else { + output[output_offset + i] = padding_value; + } + } +} + +template +void add_padding_kernelLauncher( + T* input, // [batch_size x None] + T* output, // [batch_size x max(input.nested_size(1)) x inner_size] + T padding_value, + const int* offsets, + const int* input_sizes, + int input_dim, + const std::vector& output_sizes, + const int batch_size, + const int output_batch_size) { + at::cuda::CUDAStream stream = at::cuda::getDefaultCUDAStream(); + dim3 grid; + grid.x = output_batch_size; + grid.y = 16; + if (input_dim == 1) { + add_padding_1<<>>( + input, + output, + padding_value, + offsets, + input_sizes, + input_dim, + output_sizes[1], + batch_size); + } + if (input_dim == 2) { + add_padding_2<<>>( + input, + output, + padding_value, + offsets, + input_sizes, + input_dim, + output_sizes[1], + output_sizes[2], + batch_size); + } + if (input_dim == 3) { + add_padding_3<<>>( + input, + output, + padding_value, + offsets, + input_sizes, + input_dim, + output_sizes[1], + output_sizes[2], + output_sizes[3], + batch_size); + } +} + +template void add_padding_kernelLauncher( + double* input, + double* output, + double padding_value, + const int* offsets, + const int* input_sizes, + int input_dim, + const std::vector& output_sizes, + const int batch_size, + const int output_batch_size); + +template void add_padding_kernelLauncher( + float* input, + float* output, + float padding_value, + const int* offsets, + const int* input_sizes, + int input_dim, + const std::vector& output_sizes, + const int batch_size, + const int output_batch_size); + +template void add_padding_kernelLauncher( + c10::Half* input, + c10::Half* output, + c10::Half padding_value, + const int* offsets, + const int* input_sizes, + int input_dim, + const std::vector& output_sizes, + const int batch_size, + const int output_batch_size); + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/quantized/Copy.cpp b/aten/src/ATen/native/quantized/Copy.cpp index ac3f5e9783d0..e3b6bd8cd669 100644 --- a/aten/src/ATen/native/quantized/Copy.cpp +++ b/aten/src/ATen/native/quantized/Copy.cpp @@ -13,7 +13,7 @@ namespace native { // This means that assignment of a non-contiguous quantized subtensor is currently not supported in pytorch // e.g., Consider a 2x2 quantized tensor qt1 and a non-quantized tensor t2. 
The operation // `qt1[:, 0] = t2[:, 0]` would trigger the exception b/c neither the LHS nor RHS is contiguous -Tensor& quantized_copy_from_float_cpu_(Tensor& self, const Tensor& src) { +Tensor& quantized_copy_from_float_(Tensor& self, const Tensor& src) { TORCH_CHECK( src.scalar_type() == at::kFloat, "Quantized copy only works with kFloat as source Tensor"); @@ -23,21 +23,14 @@ Tensor& quantized_copy_from_float_cpu_(Tensor& self, const Tensor& src) { TORCH_CHECK( self.sizes().equals(src.sizes()), "Quantized copy only works with Tensors with the same shape"); - TORCH_CHECK( - self.device().type() == kCPU, - "Quantized copy only works with QuantizedCPU Tensors"); AT_DISPATCH_QINT_TYPES(self.scalar_type(), "Copy", [&]() { - if (self.qscheme() == kPerChannelAffine) { + if (self.qscheme() == kPerChannelAffine || self.qscheme() == kPerChannelAffineFloatQParams + || self.qscheme() == kPerChannelSymmetric) { quantize_tensor_per_channel_affine(src, self, self.q_per_channel_scales(), self.q_per_channel_zero_points(), self.q_per_channel_axis()); } else { - float* src_data = src.data_ptr(); - scalar_t* self_data = self.data_ptr(); - for (const auto i : c10::irange(self.numel())) { - self_data[i] = quantize_val( - self.q_scale(), self.q_zero_point(), src_data[i]); - } + quantize_tensor_per_tensor_affine(src, self, self.q_scale(), self.q_zero_point()); } }); return self; diff --git a/aten/src/ATen/native/quantized/Copy.h b/aten/src/ATen/native/quantized/Copy.h index 65dabd24f1f3..d52c8ff0fb2c 100644 --- a/aten/src/ATen/native/quantized/Copy.h +++ b/aten/src/ATen/native/quantized/Copy.h @@ -5,6 +5,6 @@ namespace at { namespace native { -Tensor& quantized_copy_from_float_cpu_(Tensor& self, const Tensor& src); +Tensor& quantized_copy_from_float_(Tensor& self, const Tensor& src); } } // namespace at diff --git a/aten/src/ATen/native/quantized/QTensor.cpp b/aten/src/ATen/native/quantized/QTensor.cpp index 5fefa3557f4b..6e858a3b5c25 100644 --- a/aten/src/ATen/native/quantized/QTensor.cpp +++ b/aten/src/ATen/native/quantized/QTensor.cpp @@ -15,8 +15,11 @@ Tensor quantize_per_tensor_dynamic( const Tensor& self, ScalarType dtype, bool reduce_range) { - TORCH_CHECK( (dtype == ScalarType::QInt8 || dtype == ScalarType::QUInt8), "dtype ", dtype, "not supported"); + TORCH_CHECK( (dtype == ScalarType::QInt8 || dtype == ScalarType::QUInt8 || dtype == ScalarType::Half), "dtype ", dtype, "not supported"); auto input_contig = self.contiguous(); + if (dtype == ScalarType::Half) { + return input_contig.to(ScalarType::Half); + } float x_min = input_contig.min().item(); float x_max = input_contig.max().item(); diff --git a/aten/src/ATen/native/quantized/TensorCompare.cpp b/aten/src/ATen/native/quantized/TensorCompare.cpp index 839068b28ec2..08a104257f4e 100644 --- a/aten/src/ATen/native/quantized/TensorCompare.cpp +++ b/aten/src/ATen/native/quantized/TensorCompare.cpp @@ -35,12 +35,5 @@ std::tuple sort_quantized_cpu_stable( sort_indicies); } -std::tuple sort_quantized_cpu( - const Tensor& self, - int64_t dim, - bool descending) { - return sort_quantized_cpu_stable(self, /*stable=*/false, dim, descending); -} - } // namespace native } // namespace at diff --git a/aten/src/ATen/native/quantized/TensorFactories.cpp b/aten/src/ATen/native/quantized/TensorFactories.cpp index 08a972eacc38..aa0fef5df9dc 100644 --- a/aten/src/ATen/native/quantized/TensorFactories.cpp +++ b/aten/src/ATen/native/quantized/TensorFactories.cpp @@ -66,6 +66,40 @@ Tensor empty_per_channel_affine_quantized( quantizer); } +Tensor empty_unknown_quantized( + 
IntArrayRef size, + c10::optional dtype, + c10::optional layout, + c10::optional device, + c10::optional pin_memory, + c10::optional optional_memory_format) { + // See [Note: hacky wrapper removal for TensorOptions] + TensorOptions options_ = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); + + TORCH_CHECK( + !(options_.has_memory_format() && optional_memory_format.has_value()), + "Cannot set memory_format both in TensorOptions and explicit argument; please delete " + "the redundant setter."); + auto options = options_.merge_memory_format(optional_memory_format); + TORCH_CHECK( + options.has_dtype(), + "Must provide data type for Tensor creation functions."); + QuantizerPtr quantizer = make_unknown_quantizer(typeMetaToScalarType(options.dtype())); + return new_qtensor(size, options, quantizer); +} + +Tensor empty_strided_unknown_quantized( + IntArrayRef size, + IntArrayRef strided, + c10::optional dtype, + c10::optional layout, + c10::optional device, + c10::optional pin_memory) { + + TORCH_CHECK(false, "empty_strided not supported on quantized tensors yet see https://github.com/pytorch/pytorch/issues/74540") + +} + // Provide better error message if dtype is wrong Tensor empty_affine_quantized_other_backends_stub( IntArrayRef, diff --git a/aten/src/ATen/native/quantized/affine_quantizer_base.cpp b/aten/src/ATen/native/quantized/affine_quantizer_base.cpp index dc58f609f7a7..c99c81226ff5 100644 --- a/aten/src/ATen/native/quantized/affine_quantizer_base.cpp +++ b/aten/src/ATen/native/quantized/affine_quantizer_base.cpp @@ -115,12 +115,13 @@ T quantize_val(double scale, int64_t zero_point, float value) { return static_cast(qvalue); } -uint8_t quantize_val_arm( +template +T quantize_val_arm( const float scale, const int32_t zero_point, const float value) { - constexpr int32_t qmin = std::numeric_limits::min(); - constexpr int32_t qmax = std::numeric_limits::max(); + constexpr int32_t qmin = std::numeric_limits::min(); + constexpr int32_t qmax = std::numeric_limits::max(); float inv_scale = 1.0f / scale; #ifndef _MSC_VER auto r = static_cast(Round(value * inv_scale)); @@ -135,7 +136,7 @@ uint8_t quantize_val_arm( #endif r = std::max(r, qmin); r = std::min(r, qmax); - return static_cast(r); + return static_cast(r); } template @@ -151,6 +152,14 @@ void quantize_vec( } } +template uint8_t quantize_val_arm( + const float scale, + const int32_t zero_point, + const float value); +template int8_t quantize_val_arm( + const float scale, + const int32_t zero_point, + const float value); template TORCH_API float dequantize_val(double scale, int64_t zero_point, T value) { return static_cast(scale) * (value.val_ - static_cast(zero_point)); diff --git a/aten/src/ATen/native/quantized/affine_quantizer_base.h b/aten/src/ATen/native/quantized/affine_quantizer_base.h index 9e6a9ff58d24..31526c3ec3c5 100644 --- a/aten/src/ATen/native/quantized/affine_quantizer_base.h +++ b/aten/src/ATen/native/quantized/affine_quantizer_base.h @@ -10,7 +10,8 @@ template TORCH_API T quantize_val(double scale, int64_t zero_point, float value); // TODO combine this with quantize_val once the numerics for ARM are aligned // with it -uint8_t quantize_val_arm( +template +T quantize_val_arm( const float scale, const int32_t zero_point, const float value); diff --git a/aten/src/ATen/native/quantized/cpu/conv_packed_params.h b/aten/src/ATen/native/quantized/cpu/conv_packed_params.h deleted file mode 100644 index 130be6a0724d..000000000000 --- a/aten/src/ATen/native/quantized/cpu/conv_packed_params.h 
+++ /dev/null @@ -1,28 +0,0 @@ -#pragma once - -#include -#include - -template -struct ConvPackedParamsBase : public torch::jit::CustomClassHolder { - virtual at::Tensor apply( - const at::Tensor& input, - double output_scale, - int64_t output_zero_point) = 0; - virtual at::Tensor apply_relu( - const at::Tensor& input, - double output_scale, - int64_t output_zero_point) = 0; - virtual at::Tensor apply_dynamic( - const at::Tensor& input, - bool reduce_range) = 0; - - virtual std::tuple> unpack() = 0; - - virtual torch::List stride() const = 0; - virtual torch::List padding() const = 0; - virtual torch::List output_padding() const = 0; - virtual torch::List dilation() const = 0; - virtual int64_t groups() const = 0; - virtual bool transpose() const = 0; -}; diff --git a/aten/src/ATen/native/quantized/cpu/conv_serialization.h b/aten/src/ATen/native/quantized/cpu/conv_serialization.h index cf5c04977b6a..369f54b43961 100644 --- a/aten/src/ATen/native/quantized/cpu/conv_serialization.h +++ b/aten/src/ATen/native/quantized/cpu/conv_serialization.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -358,6 +359,20 @@ c10::intrusive_ptr> deserialize_conv( ); } #endif // USE_PYTORCH_QNNPACK +#if AT_MKLDNN_ENABLED() + if (ctx.qEngine() == at::QEngine::ONEDNN) { + return PackedConvWeightsOnednn::prepack( + weight.value(), + bias, + stride, + padding, + output_padding, + dilation, + groups, + transpose + ); + } +#endif // AT_MKLDNN_ENABLED() TORCH_CHECK( false, "Didn't find engine for when deserializing ConvPackedParams: ", diff --git a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp index ab6df06f7b73..da6064f9ddbc 100644 --- a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp +++ b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp @@ -1,10 +1,10 @@ #include -#include +#include #include #include #include -#include #include +#include #include #include #include @@ -160,9 +160,10 @@ Tensor MakeStridedQTensorCPU( allocator->allocate(size_bytes), allocator, /* resizable = */ true); + constexpr auto quantized_cpu_ks = at::DispatchKeySet(at::DispatchKey::QuantizedCPU); auto tensor = detail::make_tensor( storage, - at::DispatchKeySet(at::DispatchKey::QuantizedCPU), + quantized_cpu_ks, dtype, quantizer); get_qtensorimpl(tensor)->set_sizes_and_strides(sizes, strides); @@ -471,6 +472,16 @@ int register_linear_params() { std::move(weight), std::move(bias)); } #endif // USE_PYTORCH_QNNPACK +#if AT_MKLDNN_ENABLED() + if (at::globalContext().qEngine() == at::QEngine::ONEDNN) { + TORCH_CHECK( + weight.scalar_type() == at::kQInt8, + "ONEDNN only supports INT8 bit width currently. 
Got ", + c10::toString(weight.scalar_type())); + return PackedLinearWeightsOnednn::prepack( + std::move(weight), std::move(bias)); + } +#endif // #if AT_MKLDNN_ENABLED() TORCH_CHECK(false, "Unknown qengine"); }) .def("bias", [](const c10::intrusive_ptr& self) { @@ -543,9 +554,9 @@ int register_embedding_params() { namespace { -static auto conv2d_params = register_conv_params<2>(); -static auto conv3d_params = register_conv_params<3>(); -static auto linear_params = register_linear_params(); -static auto embedding_params = register_embedding_params(); +static C10_UNUSED auto conv2d_params = register_conv_params<2>(); +static C10_UNUSED auto conv3d_params = register_conv_params<3>(); +static C10_UNUSED auto linear_params = register_linear_params(); +static C10_UNUSED auto embedding_params = register_embedding_params(); } // namespace diff --git a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h index 4282bb34dd43..c98ef18ec85c 100644 --- a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h +++ b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h @@ -1,9 +1,8 @@ #pragma once #include -#include +#include #include -#include #include #include @@ -100,15 +99,15 @@ struct TORCH_API PackedLinearWeightFp16 : public LinearPackedParamsBase { c10::optional bias_; at::Tensor apply( - at::Tensor input, - double output_scale, - int64_t output_zero_point) override { + at::Tensor /*input*/, + double /*output_scale*/, + int64_t /*output_zero_point*/) override { TORCH_INTERNAL_ASSERT(false); } at::Tensor apply_relu( - at::Tensor input, - double output_scale, - int64_t output_zero_point) override { + at::Tensor /*input*/, + double /*output_scale*/, + int64_t /*output_zero_point*/) override { TORCH_INTERNAL_ASSERT(false); } diff --git a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp index 23afea3e52ce..a42eeeac2234 100644 --- a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp +++ b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp @@ -1,13 +1,15 @@ #include #include #include -#include +#include +#include #include #include #include #include #include #include +#include #include #include @@ -104,69 +106,93 @@ Tensor qcat_nhwc_kernel( // which causes an internal compiler error if they're not AT_DISPATCH_QINT_TYPES(output.scalar_type(), "qcat_nhwc", [&, N, H, W]() { using Vec = Vectorized; - for (const auto batch : c10::irange(N)) { - for (const auto row : c10::irange(H)) { - for (const auto col : c10::irange(W)) { - // loop over input tensors - for (const auto tidx : c10::irange(Cs_in.size())) { - scalar_t::underlying* optr = - reinterpret_cast(output.data_ptr()) + - batch * H * W * C_out + row * W * C_out + col * C_out + - Cs_sum[tidx]; - - auto curr_C = Cs_in[tidx]; - float curr_scale = scales[tidx]; - int64_t curr_zero_pt = zero_pts[tidx]; - - scalar_t::underlying* iptr = - reinterpret_cast(data_ptrs[tidx]) + - batch * H * W * curr_C + row * W * curr_C + col * curr_C; - - constexpr int64_t VLEN = Vec::size(); - int64_t c = 0; + at::parallel_for(0, N * H * W, 0, [&](int64_t begin, int64_t end) { + for (const auto i : c10::irange(begin, end)) { + // loop over input tensors + for (const auto tidx : c10::irange(Cs_in.size())) { + scalar_t::underlying* optr = + reinterpret_cast(output.data_ptr()) + + i * C_out + Cs_sum[tidx]; + + auto curr_C = Cs_in[tidx]; + float curr_scale = scales[tidx]; + int64_t curr_zero_pt = zero_pts[tidx]; + + 
scalar_t::underlying* iptr = + reinterpret_cast(data_ptrs[tidx]) + + i * curr_C; + + constexpr int64_t VLEN = Vec::size(); + int64_t c = 0; - // Vectorized loop - if (c + VLEN <= curr_C) { - auto curr_scale_vec = Vectorized(curr_scale); - auto curr_zero_pt_vec = Vectorized((float)curr_zero_pt); - auto scale_neg_zp_premul = curr_scale_vec * curr_zero_pt_vec.neg(); - for (; c + VLEN <= curr_C; c += VLEN) { - auto inp_vec = Vec::loadu(iptr + c); - auto float_values = inp_vec.dequantize( - curr_scale_vec, curr_zero_pt_vec, scale_neg_zp_premul); - Vec::float_vec_return_type retvals; - for (int i = 0; i < Vec::float_num_vecs(); ++i) { - if (ReLUFused) { - retvals[i] = - vec::maximum(float_values[i], Vectorized(0.0f)); - } else { - retvals[i] = float_values[i]; - } + // Vectorized loop + if (c + VLEN <= curr_C) { + auto curr_scale_vec = Vectorized(curr_scale); + auto curr_zero_pt_vec = Vectorized((float)curr_zero_pt); + auto scale_neg_zp_premul = curr_scale_vec * curr_zero_pt_vec.neg(); + for (; c + VLEN <= curr_C; c += VLEN) { + auto inp_vec = Vec::loadu(iptr + c); + auto float_values = inp_vec.dequantize( + curr_scale_vec, curr_zero_pt_vec, scale_neg_zp_premul); + Vec::float_vec_return_type retvals; + for (int i = 0; i < Vec::float_num_vecs(); ++i) { + if (ReLUFused) { + retvals[i] = + vec::maximum(float_values[i], Vectorized(0.0f)); + } else { + retvals[i] = float_values[i]; } - auto quantized = - Vec::quantize(retvals, scale, zero_point, inv_scale); - quantized.store(optr + c); } + auto quantized = + Vec::quantize(retvals, scale, zero_point, inv_scale); + quantized.store(optr + c); } + } - // Scalar loop - for (; c < curr_C; ++c) { - auto float_val = at::native::dequantize_val( - curr_scale, - curr_zero_pt, - reinterpret_cast(iptr)[c]); + // Vectorized loop for channel between 8 and 32 (avx2) + constexpr int kVLEN = Vectorized::size(); + int64_t elem_size = curr_C - c; + if ((VLEN == 4 * kVLEN) && elem_size >= kVLEN) { + auto curr_scale_vec = Vectorized(curr_scale); + auto curr_zero_pt_vec = Vectorized((float)curr_zero_pt); + auto scale_neg_zp_premul = curr_scale_vec * curr_zero_pt_vec.neg(); + int64_t vec_num = elem_size / kVLEN; + std::array buf_in; + memcpy(buf_in.data(), iptr + c, vec_num * kVLEN); + auto inp_vec = Vec::loadu(buf_in.data()); + auto float_values = inp_vec.dequantize( + curr_scale_vec, curr_zero_pt_vec, scale_neg_zp_premul); + Vec::float_vec_return_type retvals; + for (int i = 0; i < vec_num; ++i) { if (ReLUFused) { - float_val = std::max(0.0f, float_val); + retvals[i] = + vec::maximum(float_values[i], Vectorized(0.0f)); + } else { + retvals[i] = float_values[i]; } - optr[c] = at::native::quantize_val( - scale, zero_point, float_val) - .val_; - } // for c - - } // for tidx - } // for col - } // for row - } // for b + } + auto quantized = + Vec::quantize(retvals, scale, zero_point, inv_scale); + quantized.store(optr + c, vec_num * kVLEN); + c += vec_num * kVLEN; + } + + // Scalar loop + for (; c < curr_C; ++c) { + auto float_val = at::native::dequantize_val( + curr_scale, + curr_zero_pt, + reinterpret_cast(iptr)[c]); + if (ReLUFused) { + float_val = std::max(0.0f, float_val); + } + optr[c] = at::native::quantize_val( + scale, zero_point, float_val) + .val_; + } // for c + } // for tidx + } // for i + }); }); return output; @@ -615,7 +641,7 @@ static void leaky_qrelu_out_kernel(Tensor& out, const Tensor& qx, }); } -void qgelu_kernel(const Tensor& qx, Tensor& qy) { +void qgelu_kernel(const Tensor& qx, Tensor& qy, GeluType approximate) { int64_t zero_point = qx.q_zero_point(); 
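The new GeluType::Tanh branch below applies the tanh approximation of GELU between the dequantize and requantize steps. As a minimal sketch of just the per-element math (the quantization steps are omitted; the constant 0.7978845608... equals M_SQRT2 * M_2_SQRTPI * 0.5 = sqrt(2/pi), matching the kernel's kBeta):

    #include <cmath>

    // Tanh-approximated GELU, as used by the quantized kernel's Tanh branch.
    inline float gelu_tanh_approx(float x) {
      const float kBeta = 0.7978845608028654f;  // sqrt(2/pi)
      const float kKappa = 0.044715f;
      const float inner = kBeta * (x + kKappa * x * x * x);
      return 0.5f * x * (1.0f + std::tanh(inner));
    }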
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) float scale = qx.q_scale(); @@ -626,40 +652,83 @@ void qgelu_kernel(const Tensor& qx, Tensor& qy) { float output_scale = scale; float inv_output_scale = 1.0 / output_scale; const auto kAlphaVec = Vectorized(M_SQRT1_2); + const auto kBetaVec = Vectorized(M_SQRT2 * M_2_SQRTPI * 0.5); + const auto kKappaVec = Vectorized(0.044715); const auto kOneVec = Vectorized(1); const auto kPointFiveVec = Vectorized(0.5); - AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qgelu", [&]() { - qy = at::_empty_affine_quantized( - qx.sizes(), - // NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage) - at::device(kCPU).dtype(SCALAR_TYPE).memory_format(qx.suggest_memory_format()), - output_scale, - output_zero_point, - c10::nullopt); - auto iter = TensorIterator::unary_op(qy, qx); - - using Vec = Vectorized; - cpu_kernel_vec( - iter, - [&](scalar_t value_qx) -> scalar_t { - const auto value_dx = - at::native::dequantize_val(scale, zero_point, value_qx); - const auto value_dy = - value_dx * 0.5 * (1 + std::erf(value_dx * M_SQRT1_2)); - return at::native::quantize_val( - output_scale, output_zero_point, value_dy); - }, - [&](Vec value_qx) -> Vec { - auto value_dx = value_qx.dequantize( - scale_vec, zero_point_vec, scale_neg_zp_premul_vec); - for (auto & value : value_dx) { - value = value * kPointFiveVec * (kOneVec + (value * kAlphaVec).erf()); - } - return Vec::quantize( - value_dx, output_scale, output_zero_point, inv_output_scale); - }); - }); + if (approximate == GeluType::Tanh) { + AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qgelu", [&]() { + qy = at::_empty_affine_quantized( + qx.sizes(), + // NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage) + at::device(kCPU).dtype(SCALAR_TYPE).memory_format(qx.suggest_memory_format()), + output_scale, + output_zero_point, + c10::nullopt); + auto iter = TensorIterator::unary_op(qy, qx); + + using Vec = Vectorized; + cpu_kernel_vec( + iter, + [&](scalar_t value_qx) -> scalar_t { + const auto value_dx = + at::native::dequantize_val(scale, zero_point, value_qx); + + const auto kBeta = M_SQRT2 * M_2_SQRTPI * 0.5; + const auto kKappa = 0.044715; + const auto x_cube = value_dx * value_dx * value_dx; + const auto inner = kBeta * (value_dx + kKappa * x_cube); + const auto value_dy = 0.5 * value_dx * (1.0 + std::tanh(inner)); + + return at::native::quantize_val( + output_scale, output_zero_point, value_dy); + }, + [&](Vec value_qx) -> Vec { + auto value_dx = value_qx.dequantize( + scale_vec, zero_point_vec, scale_neg_zp_premul_vec); + for (auto & value : value_dx) { + auto value_cube = value * value * value; + auto inner = kBetaVec * (value + kKappaVec * value_cube); + value = kPointFiveVec * value * (kOneVec + inner.tanh()); + } + return Vec::quantize( + value_dx, output_scale, output_zero_point, inv_output_scale); + }); + }); + } else { + AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qgelu", [&]() { + qy = at::_empty_affine_quantized( + qx.sizes(), + // NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage) + at::device(kCPU).dtype(SCALAR_TYPE).memory_format(qx.suggest_memory_format()), + output_scale, + output_zero_point, + c10::nullopt); + auto iter = TensorIterator::unary_op(qy, qx); + + using Vec = Vectorized; + cpu_kernel_vec( + iter, + [&](scalar_t value_qx) -> scalar_t { + const auto value_dx = + at::native::dequantize_val(scale, zero_point, value_qx); + const auto value_dy = + value_dx * 0.5 * (1 + std::erf(value_dx * M_SQRT1_2)); + return at::native::quantize_val( + output_scale, output_zero_point, 
value_dy); + }, + [&](Vec value_qx) -> Vec { + auto value_dx = value_qx.dequantize( + scale_vec, zero_point_vec, scale_neg_zp_premul_vec); + for (auto & value : value_dx) { + value = value * kPointFiveVec * (kOneVec + (value * kAlphaVec).erf()); + } + return Vec::quantize( + value_dx, output_scale, output_zero_point, inv_output_scale); + }); + }); + } } @@ -1314,87 +1383,85 @@ void qmaxpool_2d_nhwc_kernel( scalar_t* idata = static_cast(qx.data_ptr()); scalar_t* odata = static_cast(qy.data_ptr()); - // Loop over N - for (const auto b : c10::irange(qx.size(0))) { - // Loop over H - auto* i_p = - reinterpret_cast(idata + b * iW * iH * iC); - for (const auto row : c10::irange(oH)) { - // Loop over W - for (const auto col : c10::irange(oW)) { - // Pointer to output data for this specific N,H,W position - auto* o_p = reinterpret_cast( - odata + b * oH * oW * iC + row * oW * iC + col * iC); - - // Loop over reduction block - int64_t h_start = row * sH - pH; - int64_t w_start = col * sW - pW; - int64_t h_end = std::min(h_start + (kH - 1) * dH + 1, iH); - int64_t w_end = std::min(w_start + (kW - 1) * dW + 1, iW); - while (h_start < 0) - h_start += dH; - while (w_start < 0) - w_start += dW; - - int64_t c = 0; - - // Interleaved vector loop 4x - constexpr auto vec_width = Vectorized::size(); - for (; c + 4 * vec_width <= iC; c += 4 * vec_width) { - Vectorized acc{ - scalar_t(std::numeric_limits::lowest())}; - // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays) - Vectorized accs[4] = {acc, acc, acc, acc}; - int64_t tcntr = 0; - int64_t x, y; - for (y = h_start; y < h_end; y += dH) { - for (x = w_start; x < w_end; x += dW) { - for (const auto i : c10::irange(4)) { - tcntr = y * iW + x; - auto vals = Vectorized::loadu( - i_p + tcntr * iC + c + Vectorized::size() * i); - accs[i] = vec::maximum(accs[i], vals); - } - } // for x - } // for y - for (const auto i : c10::irange(4)) { - accs[i].store(o_p + c + Vectorized::size() * i); - } - } // for c - - // Vector loop - for (; c + vec_width <= iC; c += vec_width) { - Vectorized acc{ - scalar_t(std::numeric_limits::lowest())}; - int64_t tcntr = 0; - int64_t x, y; - for (y = h_start; y < h_end; y += dH) { - for (x = w_start; x < w_end; x += dW) { + int64_t nBatch = qx.size(0); + at::parallel_for(0, nBatch * oH * oW, 0, [&](int64_t begin, int64_t end) { + int64_t b{0}, row{0}, col{0}; + data_index_init(begin, b, nBatch, row, oH, col, oW); + + for (const auto i : c10::irange(begin, end)) { + auto* i_p = reinterpret_cast(idata + b * iW * iH * iC); + auto* o_p = reinterpret_cast(odata + i * iC); + + // Loop over reduction block + int64_t h_start = row * sH - pH; + int64_t w_start = col * sW - pW; + int64_t h_end = std::min(h_start + (kH - 1) * dH + 1, iH); + int64_t w_end = std::min(w_start + (kW - 1) * dW + 1, iW); + while (h_start < 0) + h_start += dH; + while (w_start < 0) + w_start += dW; + + int64_t c = 0; + + // Interleaved vector loop 4x + constexpr auto vec_width = Vectorized::size(); + for (; c + 4 * vec_width <= iC; c += 4 * vec_width) { + Vectorized acc{ + scalar_t(std::numeric_limits::lowest())}; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays) + Vectorized accs[4] = {acc, acc, acc, acc}; + int64_t tcntr = 0; + int64_t x, y; + for (y = h_start; y < h_end; y += dH) { + for (x = w_start; x < w_end; x += dW) { + for (const auto i : c10::irange(4)) { tcntr = y * iW + x; - auto vals = Vectorized::loadu(i_p + tcntr * iC + c); - acc = vec::maximum(acc, vals); - } // for x - } // for y - acc.store(o_p + c); 
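The maxpool refactor above flattens the (batch, output row, output column) loops into a single range so at::parallel_for can split it into chunks; data_index_init recovers the starting coordinates for a chunk and data_index_step advances them. This standalone stand-in (assumed behavior, not the ATen helpers themselves) shows the bookkeeping involved:

    #include <cstdint>

    // Decompose a flat index over [0, B*H*W) into (b, h, w) once per chunk...
    inline void index_init(int64_t flat, int64_t& b, int64_t B,
                           int64_t& h, int64_t H, int64_t& w, int64_t W) {
      w = flat % W; flat /= W;
      h = flat % H; flat /= H;
      b = flat % B;
    }

    // ...then advance the coordinates like an odometer on each iteration.
    inline void index_step(int64_t& b, int64_t B,
                           int64_t& h, int64_t H, int64_t& w, int64_t W) {
      if (++w == W) {
        w = 0;
        if (++h == H) {
          h = 0;
          if (++b == B) b = 0;
        }
      }
    }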
- } // for c - - for (; c < iC; ++c) { - auto max_val = std::numeric_limits::lowest(); - int64_t tcntr = 0; - int64_t x, y; - for (y = h_start; y < h_end; y += dH) { - for (x = w_start; x < w_end; x += dW) { - tcntr = y * iW + x; - auto val = *(i_p + tcntr * iC + c); - max_val = std::max(max_val, val); - } // for x - } // for y - - o_p[c] = max_val; - } // for c - } // for col - } // for row - } // for b + auto vals = Vectorized::loadu( + i_p + tcntr * iC + c + Vectorized::size() * i); + accs[i] = vec::maximum(accs[i], vals); + } + } // for x + } // for y + for (const auto i : c10::irange(4)) { + accs[i].store(o_p + c + Vectorized::size() * i); + } + } // for c + + // Vector loop + for (; c + vec_width <= iC; c += vec_width) { + Vectorized acc{ + scalar_t(std::numeric_limits::lowest())}; + int64_t tcntr = 0; + int64_t x, y; + for (y = h_start; y < h_end; y += dH) { + for (x = w_start; x < w_end; x += dW) { + tcntr = y * iW + x; + auto vals = Vectorized::loadu(i_p + tcntr * iC + c); + acc = vec::maximum(acc, vals); + } // for x + } // for y + acc.store(o_p + c); + } // for c + + for (; c < iC; ++c) { + auto max_val = std::numeric_limits::lowest(); + int64_t tcntr = 0; + int64_t x, y; + for (y = h_start; y < h_end; y += dH) { + for (x = w_start; x < w_end; x += dW) { + tcntr = y * iW + x; + auto val = *(i_p + tcntr * iC + c); + max_val = std::max(max_val, val); + } // for x + } // for y + + o_p[c] = max_val; + } // for c + + data_index_step(b, nBatch, row, oH, col, oW); + } + }); }); } @@ -1751,9 +1818,6 @@ void _qavg_pool_nhwc_kernel( int istrideH = strideW * inputWidth; int istrideD = istrideH * inputHeight; int istrideB = istrideD * inputDepth; - int ostrideH = strideW * outputWidth; - int ostrideD = ostrideH * outputHeight; - int ostrideB = ostrideD * outputDepth; // lift these operations outside the loop to reduce access overheads float input_scale = qx.q_scale(); @@ -1763,85 +1827,81 @@ void _qavg_pool_nhwc_kernel( int64_t divisor_override_factor = divisor_override.has_value() ? divisor_override.value() : 0; - at::parallel_for(0, nBatch, 0, [&](int64_t batch_start, int64_t batch_end) { - for (int64_t b = batch_start; b < batch_end; ++b) { - auto* i_p = - reinterpret_cast(idata + b * istrideB); - for (int od = 0; od < outputDepth; od++) { - for (int oh = 0; oh < outputHeight; oh++) { - for (int ow = 0; ow < outputWidth; ow++) { - auto* o_p = reinterpret_cast( - odata + b * ostrideB + od * ostrideD + oh * ostrideH + - ow * strideW); - int dstart = od * dD - padD; - int hstart = oh * dH - padH; - int wstart = ow * dW - padW; - - int dend = std::min(dstart + kD, (int)inputDepth + padD); - int hend = std::min(hstart + kH, (int)inputHeight + padH); - int wend = std::min(wstart + kW, (int)inputWidth + padW); - int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); - - dstart = std::max(dstart, 0); - hstart = std::max(hstart, 0); - wstart = std::max(wstart, 0); - dend = std::min(dend, (int)inputDepth); - hend = std::min(hend, (int)inputHeight); - wend = std::min(wend, (int)inputWidth); - - int size = (dend - dstart) * (hend - hstart) * (wend - wstart); - int divide_size = count_include_pad ? pool_size : size; - int divide_factor = - divisor_override_factor ? 
divisor_override_factor : divide_size; - float multiplier = input_scale / output_scale / divide_factor; - int input_zero_point_m_size = -input_zero_point * size; - - int c_start = 0; - - // For int8 quantization, we implicitly use int32 as accumulation - // Or else, it will go to the slow path - // TODO: support 16bit, 32bit, and etc. - do_avg_pool_nhwc_on_AVX_n( - i_p, - o_p, - c_start, - input_zero_point_m_size, - output_zero_point, - multiplier, - dstart, - dend, - hstart, - hend, - wstart, - wend, - inputDepth, - inputHeight, - inputWidth, - nInputPlane); - - // 1) The following loop handles the remaining channels - // 2) It also handles the Non-AVX2 path - for (int c = c_start; c < nInputPlane; ++c) { - int32_t acc_int32 = input_zero_point_m_size; - for (int64_t id = dstart; id < dend; id++) { - for (int64_t ih = hstart; ih < hend; ih++) { - for (int64_t iw = wstart; iw < wend; iw++) { - auto val = - *(i_p + id * istrideD + ih * istrideH + iw * strideW + - c * strideC); - acc_int32 += val; - } - } - } - double acc_fp = acc_int32 * 1.0; - // clamp - o_p[c] = at::native::quantize_val( - 1.0f / multiplier, output_zero_point, acc_fp) - .val_; - } // c - } // ow - } // oh - } // od + at::parallel_for(0, nBatch * outputDepth * outputHeight * outputWidth, 0, [&](int64_t begin, int64_t end) { + int64_t b{0}, od{0}, oh{0}, ow{0}; + data_index_init(begin, b, nBatch, od, outputDepth, oh, outputHeight, ow, outputWidth); + + for (const auto i : c10::irange(begin, end)) { + auto* i_p = reinterpret_cast(idata + b * istrideB); + auto* o_p = reinterpret_cast(odata + i * strideW); + int dstart = od * dD - padD; + int hstart = oh * dH - padH; + int wstart = ow * dW - padW; + + int dend = std::min(dstart + kD, (int)inputDepth + padD); + int hend = std::min(hstart + kH, (int)inputHeight + padH); + int wend = std::min(wstart + kW, (int)inputWidth + padW); + int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + dend = std::min(dend, (int)inputDepth); + hend = std::min(hend, (int)inputHeight); + wend = std::min(wend, (int)inputWidth); + + int size = (dend - dstart) * (hend - hstart) * (wend - wstart); + int divide_size = count_include_pad ? pool_size : size; + int divide_factor = + divisor_override_factor ? divisor_override_factor : divide_size; + float multiplier = input_scale / output_scale / divide_factor; + int input_zero_point_m_size = -input_zero_point * size; + + int c_start = 0; + + // For int8 quantization, we implicitly use int32 as accumulation + // Or else, it will go to the slow path + // TODO: support 16bit, 32bit, and etc. 
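The average-pool accumulator is seeded with -input_zero_point * size, so after adding `size` raw values it holds sum(x_i - in_zp), and the division by the pool size is folded into the requantization multiplier. An illustrative restatement (rounding approximates quantize_val; names are local to this sketch):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // acc must already equal sum(x_i - in_zp) for the pooled window.
    inline uint8_t requantize_avg_pool(int32_t acc,
                                       int size,
                                       float input_scale,
                                       float output_scale,
                                       int32_t output_zero_point) {
      const float multiplier = input_scale / output_scale / size;
      const long r = output_zero_point + std::lround(acc * multiplier);
      return static_cast<uint8_t>(std::min<long>(255, std::max<long>(0, r)));
    }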
+ do_avg_pool_nhwc_on_AVX_n( + i_p, + o_p, + c_start, + input_zero_point_m_size, + output_zero_point, + multiplier, + dstart, + dend, + hstart, + hend, + wstart, + wend, + inputDepth, + inputHeight, + inputWidth, + nInputPlane); + + // 1) The following loop handles the remaining channels + // 2) It also handles the Non-AVX2 path + for (const auto c: c10::irange(c_start, nInputPlane)) { + int32_t acc_int32 = input_zero_point_m_size; + for (const auto id : c10::irange(dstart, dend)) { + for (const auto ih : c10::irange(hstart, hend)) { + for (const auto iw : c10::irange(wstart, wend)) { + auto val = + *(i_p + id * istrideD + ih * istrideH + iw * strideW + + c * strideC); + acc_int32 += val; + } + } + } + double acc_fp = acc_int32 * 1.0; + // clamp + o_p[c] = at::native::quantize_val( + 1.0f / multiplier, output_zero_point, acc_fp) + .val_; + } // c + + data_index_step(b, nBatch, od, outputDepth, oh, outputHeight, ow, outputWidth); } }); } @@ -2019,88 +2079,90 @@ void qupsample_bilinear2d_nhwc_kernel( bool align_corners, c10::optional scales_h, c10::optional scales_w) { - AT_DISPATCH_QINT_TYPES( - input.scalar_type(), "upsample_bilinear2d_nhwc", [&]() { - auto* idata = static_cast(input.data_ptr()); - auto* odata = static_cast(output.data_ptr()); - float inverse_scale = output.q_scale() / input.q_scale(); - const auto rheight = area_pixel_compute_scale( - input_height, output_height, align_corners, scales_h); - const auto rwidth = area_pixel_compute_scale( - input_width, output_width, align_corners, scales_w); - - const int64_t input_q_zero_point = input.q_zero_point(); - const int64_t output_q_zero_point = output.q_zero_point(); - - for (const auto b : c10::irange(nbatch)) { - auto* i_p = reinterpret_cast( - idata + b * input_height * input_width * channels); - auto* o_p = reinterpret_cast( - odata + b * output_height * output_width * channels); - - for (const auto h2 : c10::irange(output_height)) { - const auto h1r = area_pixel_compute_source_index( - rheight, h2, align_corners, /*cubic=*/false); - - const int64_t h1 = h1r; - const int64_t h1p = (h1 < input_height - 1) ? 1 : 0; - const float h1lambda = h1r - h1; - const float h0lambda = static_cast(1.) - h1lambda; - - for (const auto w2 : c10::irange(output_width)) { - const auto w1r = area_pixel_compute_source_index( - rwidth, w2, align_corners, /*cubic=*/false); - const int64_t w1 = w1r; - const int64_t w1p = (w1 < input_width - 1) ? 1 : 0; - - const float w1lambda = w1r - w1; - const float w0lambda = static_cast(1.) - w1lambda; - - int64_t c = 0; - // We use float32 to do the computation - const typename scalar_t::underlying* pos1 = - i_p + (h1 * input_width + w1) * channels; - typename scalar_t::underlying* pos2 = - o_p + (h2 * output_width + w2) * channels; - // We have to isolate this function out because the VS does not - // expand the macro correctly. 
- c = do_quantized_bilinear_on_AVX_n( - pos1, - pos2, - input_height, - input_width, - output_height, - output_width, - channels, - output_q_zero_point, - input_q_zero_point, - inverse_scale, - h0lambda, - h1lambda, - w0lambda, - w1lambda, - h1p, - w1p); - // 1) The following loop handles the remaining channels - // 2) It also handles the Non-AVX2 path - for (; c < channels; ++c) { - float result = h0lambda * - (w0lambda * pos1[0] + w1lambda * pos1[w1p * channels]) + - h1lambda * - (w0lambda * pos1[h1p * input_width * channels] + - w1lambda * pos1[(h1p * input_width + w1p) * channels]); - pos2[0] = at::native::quantize_val( - inverse_scale, - output_q_zero_point, - result - input_q_zero_point) - .val_; - pos1 += 1; - pos2 += 1; - } // c - } // w2 - } // h2 - } // b - }); + AT_DISPATCH_QINT_TYPES(input.scalar_type(), "upsample_bilinear2d_nhwc", [&]() { + auto* idata = static_cast(input.data_ptr()); + auto* odata = static_cast(output.data_ptr()); + float inverse_scale = output.q_scale() / input.q_scale(); + const auto rheight = area_pixel_compute_scale( + input_height, output_height, align_corners, scales_h); + const auto rwidth = area_pixel_compute_scale( + input_width, output_width, align_corners, scales_w); + + auto input_q_zero_point = input.q_zero_point(); + auto output_q_zero_point = output.q_zero_point(); + at::parallel_for(0, nbatch * output_height * output_width, 0, [&](int64_t begin, int64_t end) { + int64_t b{0}, h2{0}, w2{0}; + data_index_init(begin, b, nbatch, h2, output_height, w2, output_width); + + for (const auto i : c10::irange(begin, end)) { + (void)i; //Suppress unused variable warning + auto* i_p = reinterpret_cast( + idata + b * input_height * input_width * channels); + auto* o_p = reinterpret_cast( + odata + b * output_height * output_width * channels); + + const auto h1r = area_pixel_compute_source_index( + rheight, h2, align_corners, /*cubic=*/false); + + const int64_t h1 = h1r; + const int64_t h1p = (h1 < input_height - 1) ? 1 : 0; + const float h1lambda = h1r - h1; + const float h0lambda = static_cast(1.) - h1lambda; + + const auto w1r = area_pixel_compute_source_index( + rwidth, w2, align_corners, /*cubic=*/false); + const int64_t w1 = w1r; + const int64_t w1p = (w1 < input_width - 1) ? 1 : 0; + + const float w1lambda = w1r - w1; + const float w0lambda = static_cast(1.) - w1lambda; + + int64_t c = 0; + // We use float32 to do the computation + const typename scalar_t::underlying* pos1 = + i_p + (h1 * input_width + w1) * channels; + typename scalar_t::underlying* pos2 = + o_p + (h2 * output_width + w2) * channels; + // We have to isolate this function out because the VS does not + // expand the macro correctly. 
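The bilinear weights used above come from a fractional source coordinate: its integer part picks the top-left tap, its fractional part becomes the blend weight of the opposite tap, and the tap offset is zeroed at the last row or column so the second tap never reads past the input. A hedged sketch of the row case (the column case is identical):

    #include <cstdint>

    // Derive the row taps and lambda weights from the fractional source row h1r.
    inline void bilinear_row_weights(float h1r, int64_t input_height,
                                     int64_t& h1, int64_t& h1p,
                                     float& h1lambda, float& h0lambda) {
      h1 = static_cast<int64_t>(h1r);            // top tap row
      h1p = (h1 < input_height - 1) ? 1 : 0;     // offset to the bottom tap
      h1lambda = h1r - h1;                       // weight of the bottom tap
      h0lambda = 1.0f - h1lambda;                // weight of the top tap
    }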
+ c = do_quantized_bilinear_on_AVX_n( + pos1, + pos2, + input_height, + input_width, + output_height, + output_width, + channels, + output_q_zero_point, + input_q_zero_point, + inverse_scale, + h0lambda, + h1lambda, + w0lambda, + w1lambda, + h1p, + w1p); + // 1) The following loop handles the remaining channels + // 2) It also handles the Non-AVX2 path + for (; c < channels; ++c) { + float result = h0lambda * + (w0lambda * pos1[0] + w1lambda * pos1[w1p * channels]) + + h1lambda * + (w0lambda * pos1[h1p * input_width * channels] + + w1lambda * pos1[(h1p * input_width + w1p) * channels]); + pos2[0] = at::native::quantize_val( + inverse_scale, + output_q_zero_point, + result - input_q_zero_point) + .val_; + pos1 += 1; + pos2 += 1; + } // c + + data_index_step(b, nbatch, h2, output_height, w2, output_width); + } + }); + }); } void qtopk_kernel(Tensor& values, @@ -2201,65 +2263,66 @@ void q_batch_norm_kernel( auto scale_neg_zp_premul = fake_scale * in_zp_vec.neg(); auto out_zero_point_v = Vec(scalar_t(out_zero_point)); const auto lanes = static_cast(Vec::float_num_vecs() * kVLen); - for (const auto i : c10::irange(outer_size)) { - auto* X_ptr = reinterpret_cast(X + i * C); - auto* Y_ptr = reinterpret_cast(Y + i * C); - int64_t ch = 0; - - for(; ch + lanes <= C; ch += lanes ) { - do_bn_compute( - X_ptr + ch, - Y_ptr + ch, - fake_scale, - in_zp_vec, - scale_neg_zp_premul, - out_zero_point, - out_zero_point_v, - alpha + ch, - beta + ch, - Vec::float_num_vecs(), - ReluFused, - kVLen - ); - } + at::parallel_for(0, outer_size, 0, [&](int64_t begin, int64_t end) { + for (const auto i : c10::irange(begin, end)) { + auto* X_ptr = reinterpret_cast(X + i * C); + auto* Y_ptr = reinterpret_cast(Y + i * C); + int64_t ch = 0; + + for(; ch + lanes <= C; ch += lanes) { + do_bn_compute( + X_ptr + ch, + Y_ptr + ch, + fake_scale, + in_zp_vec, + scale_neg_zp_premul, + out_zero_point, + out_zero_point_v, + alpha + ch, + beta + ch, + Vec::float_num_vecs(), + ReluFused, + kVLen + ); + } - // for channel between 8 and 32, still use 32 width for performance - // Benchmark shows it is faster than doing 8 channels each time - int64_t elem_size = C - ch; - if ((lanes == 32) && elem_size >= kVLen) { - int64_t vec_num = elem_size / kVLen; - std::vector buf_in(lanes); - memcpy(buf_in.data(), X_ptr + ch, vec_num * kVLen); // 3 cycles - do_bn_compute( - buf_in.data(), - Y_ptr + ch, - fake_scale, - in_zp_vec, - scale_neg_zp_premul, - out_zero_point, - out_zero_point_v, - alpha + ch, - beta + ch, - vec_num, - ReluFused, - kVLen - ); - ch += vec_num * kVLen; - } - // for channels less than 8 - for (; ch < C; ++ch) { - long quantized_down = out_zero_point + - lrintf(alpha[ch] * (X_ptr[ch] - in_zero_point) + - beta[ch]); - if (ReluFused) { // static if - quantized_down = std::max(quantized_down, out_zero_point); + // for channel between 8 and 32, still use 32 width for performance + // Benchmark shows it is faster than doing 8 channels each time + int64_t elem_size = C - ch; + if ((lanes == 32) && elem_size >= kVLen) { + int64_t vec_num = elem_size / kVLen; + std::vector buf_in(lanes); + memcpy(buf_in.data(), X_ptr + ch, vec_num * kVLen); // 3 cycles + do_bn_compute( + buf_in.data(), + Y_ptr + ch, + fake_scale, + in_zp_vec, + scale_neg_zp_premul, + out_zero_point, + out_zero_point_v, + alpha + ch, + beta + ch, + vec_num, + ReluFused, + kVLen + ); + ch += vec_num * kVLen; + } + // for channels less than 8 + for (; ch < C; ++ch) { + long quantized_down = out_zero_point + + lrintf(alpha[ch] * (X_ptr[ch] - in_zero_point) + + beta[ch]); 
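In the batch-norm scalar tail above and below, alpha and beta already fold the input and output scales, so each remaining channel needs one multiply-add, an optional ReLU against the output zero point, and a clamp to the quantized range. An illustrative scalar restatement (names local to this sketch):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    inline uint8_t qbatchnorm_scalar(uint8_t x, float alpha_c, float beta_c,
                                     int64_t in_zp, int64_t out_zp,
                                     int64_t qmin, int64_t qmax, bool relu_fused) {
      int64_t y = out_zp + std::lrintf(alpha_c * (x - in_zp) + beta_c);
      if (relu_fused) {
        y = std::max(y, out_zp);  // real 0 maps to the output zero point
      }
      return static_cast<uint8_t>(std::min(qmax, std::max(qmin, y)));
    }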
+ if (ReluFused) { // static if + quantized_down = std::max(quantized_down, out_zero_point); + } + Y_ptr[ch] = std::min( + std::max(quantized_down, minimum), maximum); } - Y_ptr[ch] = std::min( - std::max(quantized_down, minimum), maximum); } - } -}); - + }); + }); } void _fake_quantize_tensor_helper( @@ -2740,22 +2803,50 @@ void quantize_tensor_arm( } } +namespace quantize_tensor_arm_intrinsics { +template +C10_ALWAYS_INLINE Tx8 vqmov(int16x8_t vraw); + +template <> +C10_ALWAYS_INLINE uint8x8_t vqmov(int16x8_t vraw) { + return vqmovun_s16(vraw); +} + +template <> +C10_ALWAYS_INLINE int8x8_t vqmov(int16x8_t vraw) { + return vqmovn_s16(vraw); +} + +template +C10_ALWAYS_INLINE void vst1(T* out, Tx8 vout); + +template <> +C10_ALWAYS_INLINE void vst1(uint8_t* out, uint8x8_t vout) { + vst1_u8(out, vout); +} + +template <> +C10_ALWAYS_INLINE void vst1(int8_t* out, int8x8_t vout) { + vst1_s8(out, vout); +} +} // namespace quantize_tensor_arm_intrinsics + // Specialized implementation from caffe2::Int8Quantize. // There may be slight accuracy difference between this and implementation of // quantize_val // TODO Update quantize_tensor_arm implementation to follow quantize_val, // i.e. f = Round(value/scale + zero_point) -// TODO Make quantize_tensor_arm work for other datatypes too (int8, int32). -template <> -void quantize_tensor_arm( +// TODO Make quantize_tensor_arm work for int32 datatype too. +template +void quantize_tensor_arm_q8( const float* __restrict__ in, - c10::quint8* __restrict__ out, + scalar_t* __restrict__ out, const int64_t N, const float scale, const int32_t zero_point) { const float inv_scale = 1.0f / scale; uint32_t i = 0; - uint8_t* out_underlying = reinterpret_cast(out); + underlying_t* out_underlying = reinterpret_cast(out); const float32x4_t vinv_scale = vdupq_n_f32(inv_scale); #if defined(__ARM_NEON__) // magic float and magic int to take care of rounding @@ -2786,12 +2877,15 @@ void quantize_tensor_arm( vaddq_f32(vmagic_float, vmulq_f32(vin4567, vinv_scale)))); const int16x8_t vraw01234567 = vcombine_s16(vqmovn_s32(vraw0123), vqmovn_s32(vraw4567)); - const uint8x8_t vout01234567 = vqmovun_s16(vraw01234567); - vst1_u8(out_underlying, vout01234567); + const underlying_x8_t vout01234567 = + quantize_tensor_arm_intrinsics::vqmov(vraw01234567); + quantize_tensor_arm_intrinsics::vst1( + out_underlying, vout01234567); out_underlying += 8; } for (; i < N; ++i) { - (*out_underlying++) = at::native::quantize_val_arm(scale, zero_point, (*in++)); + (*out_underlying++) = + at::native::quantize_val_arm(scale, zero_point, (*in++)); } #else const int16x8_t vzero_point = vdupq_n_s16((int16_t)(uint16_t)zero_point); @@ -2804,16 +2898,42 @@ void quantize_tensor_arm( const int32x4_t v4567_rounded = vcvtnq_s32_f32(vmulq_f32(vin4567, vinv_scale)); const int16x8_t v01234567_packed = vqaddq_s16( vqmovn_high_s32(vqmovn_s32(v0123_rounded), v4567_rounded), vzero_point); - const uint8x8_t vout01234567 = vqmovun_s16(v01234567_packed); - vst1_u8(out_underlying, vout01234567); + const underlying_x8_t vout01234567 = + quantize_tensor_arm_intrinsics::vqmov( + v01234567_packed); + quantize_tensor_arm_intrinsics::vst1( + out_underlying, vout01234567); out_underlying += 8; } for (; i < N; ++i) { - (*out_underlying++) = at::native::quantize_val_arm(scale, zero_point, (*in++)); + (*out_underlying++) = + at::native::quantize_val_arm(scale, zero_point, (*in++)); } #endif } +template <> +void quantize_tensor_arm( + const float* __restrict__ in, + c10::quint8* __restrict__ out, + const int64_t N, + const float 
scale, + const int32_t zero_point) { + quantize_tensor_arm_q8( + in, out, N, scale, zero_point); +} + +template <> +void quantize_tensor_arm( + const float* __restrict__ in, + c10::qint8* __restrict__ out, + const int64_t N, + const float scale, + const int32_t zero_point) { + quantize_tensor_arm_q8( + in, out, N, scale, zero_point); +} + #if defined(__aarch64__) #define VMOVL_HIGH_U8(x) vmovl_high_u8(x) #define VMOVL_HIGH_S8(x) vmovl_high_s8(x) @@ -3132,8 +3252,8 @@ void quantize_tensor_per_channel_impl( out += 8; } for (; c < channels; ++c) { - (*out++) = - at::native::quantize_val_arm(scales_data[c], zero_points_data[c], (*in++)); + (*out++) = at::native::quantize_val_arm( + scales_data[c], zero_points_data[c], (*in++)); } } } @@ -3163,8 +3283,8 @@ void quantize_tensor_per_channel_impl( out += 8; } for (; e < elements_per_channel; ++e) { - (*out++) = - at::native::quantize_val_arm(scales_data[c], zero_points_data[c], (*in++)); + (*out++) = at::native::quantize_val_arm( + scales_data[c], zero_points_data[c], (*in++)); } } } @@ -3210,8 +3330,8 @@ void quantize_tensor_per_channel_impl( out += 8; } for (; c < channels; ++c) { - (*out++) = - at::native::quantize_val_arm(scales_data[c], zero_points_data[c], (*in++)); + (*out++) = at::native::quantize_val_arm( + scales_data[c], zero_points_data[c], (*in++)); } } } @@ -3238,8 +3358,8 @@ void quantize_tensor_per_channel_impl( out += 8; } for (; e < elements_per_channel; ++e) { - (*out++) = - at::native::quantize_val_arm(scales_data[c], zero_points_data[c], (*in++)); + (*out++) = at::native::quantize_val_arm( + scales_data[c], zero_points_data[c], (*in++)); } } } diff --git a/aten/src/ATen/native/quantized/cpu/onednn_utils.h b/aten/src/ATen/native/quantized/cpu/onednn_utils.h new file mode 100644 index 000000000000..4ee8e8737fb2 --- /dev/null +++ b/aten/src/ATen/native/quantized/cpu/onednn_utils.h @@ -0,0 +1,151 @@ +#pragma once + +#include +#if AT_MKLDNN_ENABLED() +#include +#include +#include +#include + +struct PackedLinearWeightsOnednn : public LinearPackedParamsBase { + PackedLinearWeightsOnednn( + std::unique_ptr weight, + c10::optional bias, + at::Tensor orig_weight, + c10::optional orig_bias) + : weight_(std::move(weight)), + bias_(std::move(bias)), + orig_weight_(std::move(orig_weight)), + orig_bias_(std::move(orig_bias)) {} + std::unique_ptr weight_; + c10::optional bias_; + at::Tensor orig_weight_; + c10::optional orig_bias_; + + at::Tensor apply( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + at::Tensor apply_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic(at::Tensor input, bool reduce_range=false) override; + at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range=false) override; + + std::tuple> unpack() override; + + c10::optional bias() override { + return orig_bias_; + } + + static c10::intrusive_ptr prepack( + at::Tensor weight, + c10::optional bias); + + private: + template + at::Tensor apply_impl( + at::Tensor input, + double output_scale, + int64_t output_zero_point); + + template + at::Tensor apply_dynamic_impl(at::Tensor input, bool reduce_range=false); +}; + +template +struct PackedConvWeightsOnednn : public ConvPackedParamsBase { + PackedConvWeightsOnednn( + std::unique_ptr weight, + c10::optional bias, + at::Tensor orig_weight, + c10::optional orig_bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + uint8_t transpose) + : 
weight_(std::move(weight)), + bias_(std::move(bias)), + orig_weight_(std::move(orig_weight)), + orig_bias_(std::move(orig_bias)), + stride_(std::move(stride)), + padding_(std::move(padding)), + output_padding_(std::move(output_padding)), + dilation_(std::move(dilation)), + groups_(groups), + transpose_(transpose) {} + + std::unique_ptr weight_; + c10::optional bias_; + at::Tensor orig_weight_; + c10::optional orig_bias_; + torch::List stride_; + torch::List padding_; + torch::List output_padding_; + torch::List dilation_; + int64_t groups_; + uint8_t transpose_; + + at::Tensor apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic( + const at::Tensor& input, + bool reduce_range) override; + + std::tuple> unpack() override; + + static c10::intrusive_ptr> prepack( + at::Tensor weight, + c10::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose); + + torch::List stride() const override { + return stride_; + } + + torch::List padding() const override { + return padding_; + } + + torch::List output_padding() const override { + return output_padding_; + } + + torch::List dilation() const override { + return dilation_; + } + + int64_t groups() const override { + return groups_; + } + + bool transpose() const override { + return (bool)transpose_; + } + + private: + template + at::Tensor apply_impl( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point); +}; + +#endif // #if AT_MKLDNN_ENABLED() diff --git a/aten/src/ATen/native/quantized/cpu/packed_params.h b/aten/src/ATen/native/quantized/cpu/packed_params.h deleted file mode 100644 index 49bd26de5f55..000000000000 --- a/aten/src/ATen/native/quantized/cpu/packed_params.h +++ /dev/null @@ -1,73 +0,0 @@ -#pragma once - -#include - -struct LinearPackedParamsBase : public torch::jit::CustomClassHolder { - virtual at::Tensor apply( - at::Tensor input, - double output_scale, - int64_t output_zero_point) = 0; - virtual at::Tensor apply_relu( - at::Tensor input, - double output_scale, - int64_t output_zero_point) = 0; - - // out variant of LinearPackedParamsBase::apply - virtual at::Tensor& apply_out( - const at::Tensor& input, - double output_scale, - int64_t output_zero_point, - at::Tensor& output) { - throw std::runtime_error( - "apply_out is not implemented for this packed " - "parameter type"); - return output; - } - - virtual at::Tensor& apply_relu_out( - const at::Tensor& input, - double output_scale, - int64_t output_zero_point, - at::Tensor& output) { - throw std::runtime_error( - "apply_relu_out is not implemented for this packed " - "parameter type"); - return output; - } - - virtual at::Tensor apply_dynamic( - at::Tensor input, - bool reduce_range = false) = 0; - virtual at::Tensor apply_dynamic_relu( - at::Tensor input, - bool reduce_range = false) = 0; - - virtual at::Tensor& apply_dynamic_out( - const at::Tensor& /* input */, - at::Tensor& output, - bool /* reduce_range */) { - throw std::runtime_error( - "apply_dynamic_out is not implemented for this packed " - "parameter type"); - return output; - } - virtual at::Tensor& apply_dynamic_relu_out( - const at::Tensor& /* input */, - at::Tensor& output, - bool /* reduce_range */) { - throw std::runtime_error( - "apply_dynamic_relu_out is not implemented for this packed " - "parameter type"); - 
return output; - } - - virtual std::tuple> unpack() = 0; - - virtual c10::optional bias() = 0; - - virtual void set_bias(c10::optional bias) { - throw std::runtime_error( - "set_bias is not implemented for this packed " - "parameter type"); - } -}; diff --git a/aten/src/ATen/native/quantized/cpu/qadd.cpp b/aten/src/ATen/native/quantized/cpu/qadd.cpp index 6aaffff79a22..cbca3ba58ef7 100644 --- a/aten/src/ATen/native/quantized/cpu/qadd.cpp +++ b/aten/src/ATen/native/quantized/cpu/qadd.cpp @@ -7,10 +7,9 @@ #include #include #include +#include #include -#include - namespace at { namespace native { @@ -217,18 +216,170 @@ Tensor qnnpack_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) { return qy; } -#endif +#endif // USE_PYTORCH_QNNPACK + +#ifdef USE_XNNPACK +C10_ALWAYS_INLINE +enum xnn_status xnnp_create_add_nd( + int8_t azp, + float ascale, + int8_t bzp, + float bscale, + int8_t czp, + float cscale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_operator_t* op) { + return xnn_create_add_nd_qs8( + azp, /* int8_t input1_zero_point */ + ascale, /* float input1_scale */ + bzp, /* int8_t input2_zero_point */ + bscale, /* float input2_scale */ + czp, /* int8_t output_zero_point */ + cscale, /* float output_scale */ + output_min, /* int8_t output_min */ + output_max, /* int8_t output_max */ + flags, /* uint32_t flags */ + op); /* xnn_operator_t* add_op_out */ +} + +C10_ALWAYS_INLINE +enum xnn_status xnnp_setup_add_nd( + xnn_operator_t op, + const std::vector& a_shape, + const std::vector& b_shape, + const int8_t* da, + const int8_t* db, + int8_t* dc, + pthreadpool_t pt_pool) { + return xnn_setup_add_nd_qs8( + op, /* xnn_operator_t add_op */ + a_shape.size(), /* size_t num_input1_dims */ + a_shape.data(), /* const size_t* input1_shape */ + b_shape.size(), /* size_t num_input2_dims */ + b_shape.data(), /* const size_t* input2_shape */ + da, /* const int8_t* input1 */ + db, /* const int8_t* input2 */ + dc, /* int8_t* output */ + pt_pool); /* pthreadpool_t threadpool */ +} + +template +Tensor xnnp_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) { + using underlying_t = typename scalar_t::underlying; + const string func_name = "xnnp_add()"; + TORCH_CHECK(qa.ndimension() > 0, func_name, ": Got empty input tensor."); + TORCH_CHECK(at::native::xnnpack::available(), func_name, ": XNNPACK is not available") + + // using qa memory format for qb to allow xnnpack kernel to flatten all the + // dims + auto qa_mem_format = qa.suggest_memory_format(); + Tensor qa_contig = qa.contiguous(qa_mem_format); + Tensor qb_contig = qb.contiguous(qa_mem_format); + + const auto a_zero_point = qa_contig.q_zero_point(); + const auto b_zero_point = qb_contig.q_zero_point(); + const auto a_scale = qa_contig.q_scale(); + const auto b_scale = qb_contig.q_scale(); + + Tensor qy = at::native::empty_affine_quantized( + at::infer_size_dimvector(qa_contig.sizes(), qb_contig.sizes()), + qa.scalar_type(), + c10::nullopt /* layout */, + kCPU, + c10::nullopt /* pin_memory */, + scale, + zero_point, + qa_mem_format); + + if (qa_contig.size(0) == 0) { + return qy; + } + + xnn_operator_t xnnp_op = nullptr; + xnnpack_operator xnnp_add_operator; + + auto output_max = std::numeric_limits::max(); + auto output_min = std::numeric_limits::min(); + if (ReLUFused) { + /* + * FIXME: use acticationLimits() + * With , MSVC runs into "error C3862: indetifier activationLimits not found". 
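The fused-ReLU clamp set up just below relies on the fact that a real value of 0 maps to `zero_point` in the output's quantized domain, so fusing ReLU into the XNNPACK add only requires raising output_min to the range-clamped output zero point. A small sketch of that derivation for the qs8 case (not part of the patch):

    #include <algorithm>
    #include <cstdint>
    #include <limits>
    #include <utility>

    // Returns {output_min, output_max} for an int8 add with ReLU fused.
    inline std::pair<int8_t, int8_t> relu_fused_limits_qs8(int64_t zero_point) {
      constexpr int64_t qmin = std::numeric_limits<int8_t>::min();  // -128
      constexpr int64_t qmax = std::numeric_limits<int8_t>::max();  //  127
      const int64_t lo = std::min(qmax, std::max(qmin, zero_point));
      return {static_cast<int8_t>(lo), static_cast<int8_t>(qmax)};
    }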
+ */ + constexpr int64_t qmin = std::numeric_limits::min(); + constexpr int64_t qmax = std::numeric_limits::max(); + int64_t qvalue = static_cast(zero_point); + qvalue = std::max(qvalue, qmin); + output_min = static_cast(std::min(qvalue, qmax)); + } + + // Create an operator + auto status = xnnp_create_add_nd( + a_zero_point, + a_scale, + b_zero_point, + b_scale, + static_cast(zero_point), + static_cast(scale), + output_min, + output_max, + 0, + &xnnp_op); + xnnp_add_operator = xnnpack_operator(xnnp_op); + TORCH_CHECK( + status == xnn_status_success, + func_name, ": xnn create operator failed(", status,")!"); + + const auto qa_shape = xnnp_utils::get_mem_format_aware_shape(qa_contig); + const auto qb_shape = xnnp_utils::get_mem_format_aware_shape(qb_contig); + + // Setup the operator + status = xnnp_setup_add_nd( + xnnp_add_operator.get(), + qa_shape, + qb_shape, + reinterpret_cast(qa_contig.data_ptr()), + reinterpret_cast(qb_contig.data_ptr()), + reinterpret_cast(qy.data_ptr()), + caffe2::pthreadpool_()); + TORCH_CHECK( + status == xnn_status_success, + func_name, ": xnn setup operator failed(", status,")!"); + + // Run the operator + status = xnn_run_operator( + xnnp_add_operator.get(), /* xnn_operator_t op */ + caffe2::pthreadpool_()); /* pthreadpool_t threadpool */ + TORCH_CHECK( + status == xnn_status_success, + func_name, ": xnn run operator failed(", status,")"); + return qy; +} +#endif // USE_XNNPACK template Tensor qadd(Tensor qa, Tensor qb, double scale, int64_t zero_point) { check_inputs(qa, qb); + + if (at::globalContext().qEngine() == at::QEngine::QNNPACK) { + TORCH_CHECK( + qa.scalar_type() == qb.scalar_type(), + "Both inputs to qadd must have same type"); + +#ifdef USE_XNNPACK + if (qa.scalar_type() == kQInt8) { + return xnnp_add(qa, qb, scale, zero_point); + } +#endif // USE_XNNPACK + #ifdef USE_PYTORCH_QNNPACK - if (at::globalContext().qEngine() == at::QEngine::QNNPACK && - qa.sizes() == qb.sizes() && /* qnnpack does not support boradcasting */ - qa.scalar_type() == kQUInt8 && qb.scalar_type() == kQUInt8) { + if(qa.sizes() == qb.sizes() && /* qnnpack does not support boradcasting */ + qa.scalar_type() == kQUInt8) { return qnnpack_add(qa, qb, scale, zero_point); + } +#endif // USE_PYTORCH_QNNPACK } -#endif auto qc = at::_empty_affine_quantized( qa.sizes(), at::device(kCPU) diff --git a/aten/src/ATen/native/quantized/cpu/qconcat.cpp b/aten/src/ATen/native/quantized/cpu/qconcat.cpp index 8e09e32c4203..4322b3558f5c 100644 --- a/aten/src/ATen/native/quantized/cpu/qconcat.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconcat.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include diff --git a/aten/src/ATen/native/quantized/cpu/qconv.cpp b/aten/src/ATen/native/quantized/cpu/qconv.cpp index c32daf362516..aa77489f7419 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv.cpp @@ -5,9 +5,12 @@ #include #include #include -#include +#include #include #include +#include +#include +#include #include #include #include @@ -160,7 +163,7 @@ std::array MakeInputShape( int64_t W); template <> -std::array MakeInputShape(int64_t _, int64_t H, int64_t W) { +std::array MakeInputShape(int64_t /*D*/, int64_t H, int64_t W) { return {H, W}; } template <> @@ -442,6 +445,21 @@ at::Tensor PackedConvWeight::apply_impl( padding(), output_padding(), dilation()); + + // if use direct convolution implementation, compute the col_offsets + // of the weight matrix at model initialization stage. 
+ // We need to know the shape of output matrix + // to compute col_offsets for direct convolution. + // Hence it cannot be called from inside weight packing function + // like other quantized conv implementation + if (pack_w->getPackedWForDirectconv().get() && + pack_w->getPackedWForDirectconv().get()->is_first_call()) { + pack_w->getPackedWForDirectconv().get()->col_offsets_with_zero_pt_s8acc32_DirectConvT( + conv_p, + w_zp.data(), + col_offsets, + M); + } } else { output_shape = MakeConvOutputShape(N, M, conv_p.OUT_DIM); } @@ -573,22 +591,262 @@ template at::Tensor PackedConvWeight<3>::apply_impl( #ifdef USE_PYTORCH_QNNPACK +#ifdef USE_XNNPACK template -at::Tensor PackedConvWeightsQnnp::apply( - const at::Tensor& input, - double output_scale, - int64_t output_zero_point) { - return apply_impl(input, output_scale, output_zero_point); -} +template +at::Tensor PackedConvWeightsQnnp::apply_impl_xnnp( + const at::Tensor& act, double output_scale, int64_t output_zero_point) { + using underlying_t = typename scalar_t::underlying; -template -at::Tensor PackedConvWeightsQnnp::apply_relu( - const at::Tensor& input, - double output_scale, - int64_t output_zero_point) { - return apply_impl(input, output_scale, output_zero_point); + std::lock_guard lock(qnnp_mutex_); + + const std::string func_name = transpose() + ? "quantized::conv_transpose (xnnpack)" + : "quantized::conv (xnnpack)"; + TORCH_CHECK( + kSpatialDim == 2, + func_name, ": xnnpack does not currently support 3d convolution."); + + /* + * NB: + * [de]conv_prepack prepares weights (values, scale, and zero_points) ahead of + * time during prepack() call assuming the activation will be uint8_t. But it + * may not always be the case. A solution may involve making prepack routine + * aware of the input qdtype. But currently all the pieces are not ready to + * pass that model level info to the prepack function. So, for now, here in + * this function we have to massage weights if we learn the input qdtype is + * not uint8_t. This involves copying and converting uint8_t to int8_t + * whenever necessary. To add to that, since XNNPACK, as of writing this, + * doesn't support per_channel weights for quint8_t, we add following assert + * makes sure we don't run into that case. Also take shortcuts when processing + * weights, which means we have to revisit and fix some weight massging logic + * when we enable the missing feature in XNNPACK. + * + * Table below summarizes how the weights are handled, + * + * .-------------------------------------------------------------------------. + * | input_qdtype | uint8_t | int8_t | + * | per_channel | yes | no | yes | no | + * |-------------------------------------------------------------------------| + * | zero_points | at::zeros()* | orig_zp + 128 | at:zeros()** | orig_zp | + * | scale | dtype = float, no changes needed | + * | values | always processed before passing to XNNPACK | + * .-------------------------------------------------------------------------. + * + * Notes: * - zero_points for uint8_t + per_channel: no support in xnnpack, need + * to fix when support is added. ** - zero_points for int8_t: symmetric + * quantization means XNNPACK will ignore kernel zero point(s). + */ + + if ((std::is_same::value )) { + TORCH_CHECK(!per_channel(), + func_name, ": xnnpack does not currently have per_channel support with activation dtype of c10::quint8." 
+ ); + } + + // More checks + ConvDimChecks( + act.ndimension(), + stride().size(), + padding().size(), + output_padding().size(), + dilation().size(), + func_name, + transpose()); + + const int64_t N = act.size(0); + const int64_t H = act.size(2); + const int64_t W = act.size(3); + const int64_t D = 1; + const int64_t M = bias.size(0); + + const auto act_nhwc = act.contiguous(c10::MemoryFormat::ChannelsLast); + const auto act_input_scale = act_nhwc.q_scale(); + + auto status = xnn_status_invalid_state; + + // Create an operator iff necessary + if (!xnnp_convolution_op || + (!input_scale.has_value() || input_scale.value() != act_input_scale)) { + xnn_operator_t xnnp_op = nullptr; + + // Update the input scale so we may cache the op + input_scale = act_input_scale; + + // create an empty tensor for packing the weights + const at::Tensor weight_contig = + orig_weight.contiguous(c10::MemoryFormat::ChannelsLast); + const float* w_scales_data = w_scales.data_ptr(); + underlying_t w_zp = 0; + at::Tensor weight_tensor; + + if (!per_channel()) { + w_zp = static_cast( + weight_contig.q_zero_point() + + (std::is_same::value ? 128 : 0)); + + weight_tensor = at::native::empty_affine_quantized( + weight_contig.sizes(), + c10::CppTypeToScalarType::value, + c10::nullopt /* layout */, + c10::kCPU, + c10::nullopt /* pin_memory */, + w_scales_data[0], + w_zp, + c10::MemoryFormat::ChannelsLast); + } else { /* per_channel */ + weight_tensor = at::native::empty_per_channel_affine_quantized( + weight_contig.sizes(), + w_scales, + at::zeros(w_scales.sizes(), at::kInt), /* see comment above about w_zp */ + weight_contig.q_per_channel_axis(), + c10::CppTypeToScalarType::value, + c10::nullopt /* layout */, + c10::kCPU, + c10::nullopt /* pin_memory */, + c10::MemoryFormat::ChannelsLast); + } + + // copy from the original weight and take care of dtype change if necessary + at::native::xnnp_utils::q8_copy_int8_weight_and_add_offset( + weight_contig, weight_tensor); + const at::Tensor xnnp_weight = + at::native::xnnp_utils::convert_conv_weights_to_channel_last_tensor< + kSpatialDim>(weight_tensor, groups(), transpose()); + + auto output_min = kReluFused + // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) + ? activationLimits(output_scale, output_zero_point, Activation::RELU).first + : std::numeric_limits::min(); + auto output_max = kReluFused + // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) + ? activationLimits(output_scale, output_zero_point, Activation::RELU).second + : std::numeric_limits::max(); + + + // Original bias was float, so we requantize it here. + at::Tensor qbias; + if (per_channel()) { + auto bias_quant_scales = + weight_contig.q_per_channel_scales() * act_input_scale; + auto bias_zp = at::zeros(bias_quant_scales.sizes(), c10::kInt); + qbias = at::native::quantize_per_channel( + bias, bias_quant_scales, bias_zp, 0, c10::kQInt32); + } else { + qbias = at::native::quantize_per_tensor( + bias, weight_contig.q_scale() * act_input_scale, 0, c10::kQInt32); + } + + status = at::native::xnnp_utils::xnnp_create_convolution2d_nhwc( + padding()[0], + padding()[1], + padding()[0], + padding()[1], + kernel_[0], + kernel_[1], + stride()[0], + stride()[1], + dilation()[0], + dilation()[1], + groups(), + !transpose() ? orig_weight.size(1) : orig_weight.size(0) / groups(), + !transpose() ? orig_weight.size(0) / groups() : orig_weight.size(1), + !transpose() ? orig_weight.size(1) * groups() : orig_weight.size(0), + !transpose() ? 
orig_weight.size(0) : orig_weight.size(1) * groups(), + act_nhwc.q_zero_point(), + act_input_scale, + w_zp, /* will be ignored for Q[SC]8, see comment + above about w_zp*/ + w_scales_data, + reinterpret_cast( + xnnp_weight.template data_ptr()), + reinterpret_cast(qbias.template data_ptr()), + output_zero_point, + output_scale, + output_min, + output_max, + 0, + &xnnp_op, + per_channel(), + transpose()); + + xnnp_convolution_op = xnnpack_operator(xnnp_op); + TORCH_CHECK( + status == xnn_status_success, + func_name, + ": xnn create operator failed(", + status, + ")"); + } + + at::SmallVector output_shape; + const auto input_shape = MakeInputShape(D, H, W); + if (transpose()) { + output_shape = MakeDeConvOutputShape( + N, M, {H, W}, kernel_, stride(), padding(), output_padding(), dilation()); + } else { + output_shape = MakeConvOutputShape( + N, M, input_shape, kernel_, stride(), padding(), dilation()); + } + + if (act_nhwc.numel() > 0) { + TORCH_CHECK( + std::all_of( + output_shape.begin(), + output_shape.end(), + [](int64_t i) { return i > 0; }), + func_name, ": ", kSpatialDim, "d (xnnpack): each dimension of output tensor should be greater than 0.") + } + + // Allocate output Tensor and a buffer for XNNPACK to use + at::Tensor output = at::native::empty_affine_quantized( + output_shape, + c10::CppTypeToScalarType::value, + c10::nullopt /* layout */, + c10::kCPU, + c10::nullopt /* pin_memory */, + output_scale, + output_zero_point, + c10::MemoryFormat::ChannelsLast); + + // Setup the operator + status = at::native::xnnp_utils::xnnp_setup_convolution2d_nhwc( + xnnp_convolution_op.get(), + N, + H, + W, + reinterpret_cast(act_nhwc.template data_ptr()), + reinterpret_cast(output.template data_ptr()), + caffe2::pthreadpool_(), + per_channel(), + transpose(), + output_padding()[0], + output_padding()[1]); + + TORCH_CHECK( + status == xnn_status_success, + func_name, + ": xnn setup operator failed(", + status, + ")"); + + // Run the operator + status = xnn_run_operator( + xnnp_convolution_op.get(), /* xnn_operator_t op */ + caffe2::pthreadpool_()); /* pthreadpool_t threadpool */ + + TORCH_CHECK( + status == xnn_status_success, + func_name, + ": xnn run operator failed(", + status, + ")"); + + return output; } +#endif // USE_XNNPACK + template template at::Tensor PackedConvWeightsQnnp::apply_impl( @@ -607,7 +865,7 @@ at::Tensor PackedConvWeightsQnnp::apply_impl( func_name, "(qnnpack): Expected activation data type ", toString(c10::kQUInt8), - "but got ", + " but got ", toString(act.scalar_type())); ConvDimChecks( act.ndimension(), stride().size(), padding().size(), @@ -805,6 +1063,61 @@ at::Tensor PackedConvWeightsQnnp::apply_impl( return output; } +#ifdef USE_XNNPACK +bool can_use_xnnp( + c10::ScalarType dtype, + int kSpatialDim, + bool per_channel, + bool transpose) { + if (!at::native::xnnpack::available()) { + return false; + } + bool supported_dtypes = dtype == c10::kQInt8; + bool invalid_config = + (kSpatialDim != 2 /* No support for 3d convolution */ + || (dtype == c10::kQInt8 && transpose && + per_channel)); /* int8_t deconv does not support per-channel */ + if (supported_dtypes && invalid_config) { + /* don't want this to fall through to QNNPACK */ + const std::string func_name = + transpose ? 
"quantized::conv_transpose" : "quantized::conv"; + TORCH_CHECK( + false, + func_name, + " (xnnpack): Unsupported conv config for dtype KQInt8"); + } + return supported_dtypes && !invalid_config; +} +#endif // USE_XNNPACK + +template +at::Tensor PackedConvWeightsQnnp::apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) { +#ifdef USE_XNNPACK + if (can_use_xnnp(input.scalar_type(), kSpatialDim, per_channel(), transpose())) { + return apply_impl_xnnp( + input, output_scale, output_zero_point); + } /* fall through for unsupported types, configs, or shapes */ +#endif // USE_XNNPACK + return apply_impl(input, output_scale, output_zero_point); +} + +template +at::Tensor PackedConvWeightsQnnp::apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) { +#ifdef USE_XNNPACK + if (can_use_xnnp(input.scalar_type(), kSpatialDim, per_channel(), transpose())) { + return apply_impl_xnnp( + input, output_scale, output_zero_point); + } /* fall through for unsupported types, configs, or shapes */ +#endif // USE_XNNPACK + return apply_impl(input, output_scale, output_zero_point); +} + template at::Tensor PackedConvWeightsQnnp<2>::apply( const at::Tensor& act, double output_scale, @@ -837,6 +1150,177 @@ template at::Tensor PackedConvWeightsQnnp<3>::apply_impl( #endif // USE_PYTORCH_QNNPACK +#if AT_MKLDNN_ENABLED() +template +at::Tensor PackedConvWeightsOnednn::apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) { + return apply_impl(input, output_scale, output_zero_point); +} + +template +at::Tensor PackedConvWeightsOnednn::apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) { + return apply_impl(input, output_scale, output_zero_point); +} + +template +template +at::Tensor PackedConvWeightsOnednn::apply_impl( + const at::Tensor& act, + double output_scale, + int64_t output_zero_point) { + std::string func_name = "quantized::conv"; + if (transpose()) { + func_name += "_transpose"; + } + func_name += std::to_string(kSpatialDim) + "d"; + if (kReluFused) { + func_name += "_relu"; + } + ConvDimChecks( + act.ndimension(), stride().size(), padding().size(), + output_padding().size(), dilation().size(), func_name, transpose()); + TORCH_CHECK(act.scalar_type() == c10::ScalarType::QUInt8, + func_name, " (ONEDNN): data type of input should be QUint8."); + + // src + auto act_contig = act.contiguous(kSpatialDim == 2 ? c10::MemoryFormat::ChannelsLast : c10::MemoryFormat::ChannelsLast3d); + auto src_dims = act_contig.sizes().vec(); + auto src_data_type = dnnl::memory::data_type::u8; + auto src_desc = ideep::tensor::desc(src_dims, src_data_type, + kSpatialDim == 2 ? ideep::format_tag::nhwc : ideep::format_tag::ndhwc); + ideep::tensor src; + src.init(src_desc, act_contig.data_ptr()); + // weights & bias + ideep::tensor& weights = *(weight_.get()); + bool with_bias = bias_.has_value(); + const auto& kernel_size = weights.get_dims(); + // dst + const std::vector& input_size = src.get_dims(); + std::vector output_sizes; + if (transpose()) { + // Prepacked weight format: [o, i, ...] + const int N = act.size(0); // batch size + const int C = act.size(1); // input channels + const int M = weights.get_dim(0); // output channels + const int D = kSpatialDim == 2 ? 
1 : act.size(2); // input depth + const int H = act.size(kSpatialDim); // input height + const int W = act.size(kSpatialDim + 1); // input width + const int KH = weights.get_dim(kSpatialDim); // kernel height + const int KW = weights.get_dim(kSpatialDim + 1); // kernel width + const int KD = kSpatialDim == 2 ? 1 : weights.get_dim(2); // kernel depth + TORCH_CHECK(C == groups() * weights.get_dim(1), // weight: [o, i, ...] + func_name, " (ONEDNN): input channel number should be ", + groups() * weights.get_dim(1), ", but got ", C); + auto output_shape = MakeDeConvOutputShape( + N, + M, + kSpatialDim == 2 ? std::vector{H, W} : std::vector{D, H, W}, + kSpatialDim == 2 ? std::vector{KH, KW} : std::vector{KD, KH, KW}, + stride(), + padding(), + output_padding(), + dilation()); + output_sizes = c10::IntArrayRef(output_shape).vec(); + } else { + output_sizes = at::native::conv_output_size(input_size, kernel_size, padding().vec(), stride().vec(), dilation().vec()); + } + ideep::dims dst_dims = ideep::dims({output_sizes.cbegin(), output_sizes.cend()}); + at::Tensor output = at::_empty_affine_quantized( + dst_dims, + device(c10::kCPU) + .dtype(c10::kQUInt8) + .memory_format(kSpatialDim == 2 ? + c10::MemoryFormat::ChannelsLast : + c10::MemoryFormat::ChannelsLast3d), + output_scale, + output_zero_point, + c10::nullopt); + if (output.numel() == 0) { + return output; + } + ideep::tensor dst({dst_dims, ideep::tensor::data_type::u8, {output.strides().cbegin(), output.strides().cend()}}, + output.data_ptr()); + // Parameters + const ideep::dims& strides = stride().vec(); + const ideep::dims& dilates = dilation().vec(); + const ideep::dims& padding_l = padding().vec(); + const ideep::dims& padding_r = padding().vec(); + const ideep::scale_t& src_scales = ideep::scale_t(1, 1.0/act.q_scale()); // Scales of ONEDNN and PyTorch are reciprocal + const ideep::scale_t& weights_scales = weights.get_scale(); + const ideep::scale_t& dst_scales = ideep::scale_t(weights_scales.size(), 1.0/output_scale); // Scales of ONEDNN and PyTorch are reciprocal + const ideep::zero_point_t src_zero_points = ideep::zero_point_t(1, act.q_zero_point()); + const ideep::zero_point_t dst_zero_points = ideep::zero_point_t(1, output_zero_point); + ideep::attr_t op_attr = kReluFused ? ideep::attr_t::fuse_relu() : ideep::attr_t(); + op_attr.set_zero_points(DNNL_ARG_SRC, ideep::utils::tensor_zp_mask(1), {DNNL_RUNTIME_S32_VAL}); // runtime src zero point + if (with_bias) { + // Bias might be modified outside (e.g. by quantization bias correction). + // If so, update the prepacked bias as well. 
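
The ONEDNN path above sizes the destination via at::native::conv_output_size for regular convolution and MakeDeConvOutputShape for transposed convolution; both are expected to follow the standard PyTorch size formulas. A per-dimension sketch under that assumption (hypothetical helper names, not the ATen internals):

    #include <cstdint>

    // Standard conv output size per spatial dimension (floor division).
    int64_t conv_out_dim(int64_t in, int64_t kernel, int64_t stride,
                         int64_t padding, int64_t dilation) {
      return (in + 2 * padding - dilation * (kernel - 1) - 1) / stride + 1;
    }

    // Standard transposed-conv (deconv) output size per spatial dimension.
    int64_t deconv_out_dim(int64_t in, int64_t kernel, int64_t stride,
                           int64_t padding, int64_t output_padding,
                           int64_t dilation) {
      return (in - 1) * stride - 2 * padding + dilation * (kernel - 1) +
             output_padding + 1;
    }
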
+ if (bias_.value().get_data_handle() != orig_bias_.value().data_ptr()) { + bias_.value().init(bias_.value().get_desc(), orig_bias_.value().data_ptr()); + } + const auto& b = bias_.value(); + if (transpose()) { + ideep::convolution_transpose_forward::compute_v2( + src, weights, b, dst_dims, dst, + strides, padding_l, padding_r, dilates, + groups(), src_scales, weights_scales, dst_scales, src_zero_points, dst_zero_points, + op_attr, dnnl::algorithm::deconvolution_direct, dnnl::prop_kind::forward_inference, + ideep::u8s8, ideep::engine::cpu_engine()); + } else { + ideep::convolution_forward::compute_v2( + src, weights, b, dst_dims, dst, + strides, dilates, padding_l, padding_r, groups(), + src_scales, weights_scales, dst_scales, src_zero_points, dst_zero_points, + op_attr, dnnl::algorithm::convolution_direct, dnnl::prop_kind::forward_inference, + ideep::u8s8, ideep::engine::cpu_engine()); + } + } else { + if (transpose()) { + ideep::convolution_transpose_forward::compute_v2( + src, weights, dst_dims, dst, + strides, padding_l, padding_r, dilates, + groups(), src_scales, weights_scales, dst_scales, src_zero_points, dst_zero_points, + op_attr, dnnl::algorithm::deconvolution_direct, dnnl::prop_kind::forward_inference, + ideep::u8s8, ideep::engine::cpu_engine()); + } else { + ideep::convolution_forward::compute_v2( + src, weights, dst_dims, dst, + strides, dilates, padding_l, padding_r, groups(), + src_scales, weights_scales, dst_scales, src_zero_points, dst_zero_points, + op_attr, dnnl::algorithm::convolution_direct, dnnl::prop_kind::forward_inference, + ideep::u8s8, ideep::engine::cpu_engine()); + } + } + return output; +} + +template at::Tensor PackedConvWeightsOnednn<2>::apply( + const at::Tensor& act, + double output_scale, + int64_t output_zero_point); + +template at::Tensor PackedConvWeightsOnednn<2>::apply_relu( + const at::Tensor& act, + double output_scale, + int64_t output_zero_point); + +template at::Tensor PackedConvWeightsOnednn<3>::apply( + const at::Tensor& act, + double output_scale, + int64_t output_zero_point); + +template at::Tensor PackedConvWeightsOnednn<3>::apply_relu( + const at::Tensor& act, + double output_scale, + int64_t output_zero_point); + +#endif // #if AT_MKLDNN_ENABLED() + namespace at { namespace native { namespace { @@ -914,10 +1398,10 @@ class QConvInt8ForBC final { static Tensor run( Tensor act, const c10::intrusive_ptr>& packed_weight, - torch::List stride, - torch::List padding, - torch::List dilation, - int64_t groups, + torch::List /*stride*/, + torch::List /*padding*/, + torch::List /*dilation*/, + int64_t /*groups*/, double output_scale, int64_t output_zero_point) { if (kReluFused) { diff --git a/aten/src/ATen/native/quantized/cpu/qconv_dynamic.cpp b/aten/src/ATen/native/quantized/cpu/qconv_dynamic.cpp index ec95748cd42b..2f3a6ed8f3cd 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv_dynamic.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv_dynamic.cpp @@ -5,9 +5,10 @@ #include #include #include -#include +#include #include #include +#include #include #include #include @@ -118,6 +119,57 @@ template at::Tensor PackedConvWeightsQnnp<3>::apply_dynamic( #endif // USE_PYTORCH_QNNPACK +#if AT_MKLDNN_ENABLED() + +template +at::Tensor PackedConvWeightsOnednn::apply_dynamic( + const at::Tensor& input, + bool reduce_range) { + + // Find min/max of input + float x_max = 0, x_min = 0; + if (input.numel() > 0) { + x_min = input.min().item(); + x_max = input.max().item(); + } + + // Input tensor is quantized as 8-bit unsigned values + static constexpr int 
precision = 8; + static constexpr bool is_signed = false; + + // Calculate scale and zero point for quantization of input tensor + auto q_params = quant_utils::ChooseQuantizationParams( + /*min=*/x_min, + /*max=*/x_max, + /*qmin=*/is_signed ? -(1 << (precision - 1)) : 0, + /*qmax=*/ + is_signed ? ((1 << (precision - 1)) - 1) : (1 << precision) - 1, + /*preserve_sparsity=*/false, + /*force_scale_power_of_two=*/false, + /*reduce_range=*/reduce_range); + + // Quantize input + at::Tensor q_input = at::quantize_per_tensor( + input, q_params.scale, q_params.zero_point, c10::kQUInt8); + + at::Tensor out = + apply_impl(q_input, q_params.scale, q_params.zero_point); + + // TODO: Modify ideep to allow fp32 input & output + // to avoid explicit `quantize - dequantize` + return at::dequantize(out); +} + +template at::Tensor PackedConvWeightsOnednn<2>::apply_dynamic( + const at::Tensor& input, + bool reduce_range); + +template at::Tensor PackedConvWeightsOnednn<3>::apply_dynamic( + const at::Tensor& input, + bool reduce_range); + +#endif // AT_MKLDNN_ENABLED() + namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp index 3cb5d9ef1a18..85edffef25b9 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp @@ -2,10 +2,11 @@ #include #include -#include +#include #include #include #include +#include #include #include #include @@ -314,6 +315,165 @@ c10::intrusive_ptr> PackedConvWeightsQnnp< bool transpose); #endif // USE_PYTORCH_QNNPACK +#if AT_MKLDNN_ENABLED() +template +c10::intrusive_ptr> PackedConvWeightsOnednn< + kSpatialDim>:: + prepack( + at::Tensor weight, + c10::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose) { + TORCH_CHECK( + weight.ndimension() == kSpatialDim + 2, + "Weights are expected to have ", kSpatialDim + 2, " dimensions"); + TORCH_CHECK( + stride.size() == kSpatialDim, + "stride should contain ", kSpatialDim, " elements for ", + kSpatialDim, "D convolution."); + TORCH_CHECK( + padding.size() == kSpatialDim, + "Specify front/top/left padding only. " + "end/bottom/right padding assumed to be equal to front/top/left"); + TORCH_CHECK( + !transpose || output_padding.size() == kSpatialDim, + "quantized::conv_prepack: Specify top/left output padding " + "only. bottom/right padding assumed to be equal to top/left"); + TORCH_CHECK( + dilation.size() == kSpatialDim, + "dilation should contain ", kSpatialDim, " elements for ", + kSpatialDim, "D convolution."); + TORCH_CHECK( + !transpose || std::all_of(output_padding.begin(), output_padding.end(), [](int i) { return i==0; }), + "quantized::conv_prepack: ONEDNN only supports zero output_padding."); + + // Weight + // Format: [OC IC//group KH KW] for conv; [IC OC//group KH KW] for deconv + auto dims = weight.sizes().vec(); + auto strides = stride.vec(); + auto padding_l = padding.vec(); + auto padding_r = padding.vec(); + auto dilates = dilation.vec(); + auto op_attr = ideep::attr_t(); + std::vector wgt_zero_points; + ideep::scale_t wgt_scales; + const int output_channels = transpose ? 
weight.size(1) * groups + : weight.size(0); + const auto qtype = weight.qscheme(); + if (qtype == c10::kPerTensorAffine) { + TORCH_CHECK( + weight.q_zero_point()==0, + "quantized::qconv_prepack: ONEDNN only supports symmetric quantization of weight," + " whose zero point must be 0."); + wgt_zero_points = std::vector(1, weight.q_zero_point()); + wgt_scales = ideep::scale_t(1, 1.0/weight.q_scale()); // Scales of ONEDNN and PyTorch are reciprocal + } else if (qtype == c10::kPerChannelAffine) { + TORCH_CHECK( + !transpose, + "Per Channel Quantization is currently disabled for transposed conv"); + wgt_zero_points.resize(output_channels); + wgt_scales.resize(output_channels); + for (int i = 0; i < output_channels; ++i) { + wgt_zero_points[i] = weight.q_per_channel_zero_points()[i].item(); + TORCH_CHECK( + wgt_zero_points[i]==0, + "quantized::qconv_prepack: ONEDNN only supports symmetric quantization of weight," + " whose zero point must be 0."); + wgt_scales[i] = 1.0f / weight.q_per_channel_scales()[i].item(); // Scales of ONEDNN and PyTorch are reciprocal + } + } else { + TORCH_CHECK(false, "Unsupported qscheme: ", toString(qtype)); + } + + // Set runtime src zero point + auto src_zero_point = {DNNL_RUNTIME_S32_VAL}; + op_attr.set_zero_points(DNNL_ARG_SRC, + ideep::utils::tensor_zp_mask(src_zero_point.size()), + src_zero_point); + at::Tensor weight_copy; + ideep::tensor::desc w_desc; + ideep::dims dims_iohw, dims_giohw; + ideep::tag w_tag = ideep::tag::any; + const bool with_groups = groups > 1; + if (transpose) { + w_desc = ideep::convolution_transpose_forward::expected_weights_desc( + dims, dnnl::memory::data_type::s8, + strides, padding_l, padding_r, dilates, groups, + dnnl::algorithm::deconvolution_direct, dnnl::prop_kind::forward_inference, + ideep::dims(), op_attr); + // convolution_transpose_forward::expected_weights_desc() gives format [i, o, ...], + // but ONEDNN requires [o, i, ...] for computation + dims_iohw = w_desc.get_dims(); + dims_giohw = with_groups ? ideep::utils::group_dims(dims_iohw, groups) : dims_iohw; + std::vector perms(dims_giohw.size(), 0); // for permutation of weight + std::iota(perms.begin(), perms.end(), 0); + w_desc = w_desc.transpose(with_groups, with_groups + 1); + std::swap(perms[with_groups], perms[with_groups + 1]); + weight_copy = weight.reshape(dims_giohw).permute(c10::IntArrayRef(perms)).clone(); + } else { + w_desc = ideep::convolution_forward::expected_weights_desc( + dims, dnnl::memory::data_type::s8, + strides, padding_l, padding_r, dilates, groups, + dnnl::algorithm::convolution_direct, dnnl::prop_kind::forward_inference, + dnnl::memory::data_type::u8, ideep::dims(), op_attr); + weight_copy = weight.clone(); + } + if (with_groups) { + w_tag = kSpatialDim == 2 ? ideep::tag::goihw : ideep::tag::goidhw; + } else { + w_tag = kSpatialDim == 2 ? ideep::tag::oihw : ideep::tag::oidhw; + } + ideep::dims w_dims = with_groups ? ideep::utils::group_dims(w_desc.get_dims(), groups) + : w_desc.get_dims(); + ideep::tensor wgt = ideep::tensor( + ideep::tensor::desc({w_dims, dnnl::memory::data_type::s8, w_tag}, groups), + weight_copy.data_ptr()); + wgt.set_scale(wgt_scales); // Scales are needed for feed_from(). 
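
As the prepack code above enforces, ONEDNN consumes symmetric int8 weights (every zero point must be 0) and stores scales as the reciprocal of PyTorch's, since ONEDNN multiplies by its scale where PyTorch divides. A small sketch of that per-channel conversion (hypothetical helper, outside the ideep API):

    #include <cstddef>
    #include <cstdint>
    #include <stdexcept>
    #include <vector>

    // PyTorch: real = (q - zero_point) * scale; ONEDNN expects 1/scale and zero_point == 0.
    std::vector<float> to_onednn_scales(const std::vector<double>& torch_scales,
                                        const std::vector<int64_t>& torch_zero_points) {
      std::vector<float> onednn_scales(torch_scales.size());
      for (std::size_t i = 0; i < torch_scales.size(); ++i) {
        if (torch_zero_points[i] != 0) {
          throw std::runtime_error("ONEDNN supports only symmetric weight quantization");
        }
        onednn_scales[i] = 1.0f / static_cast<float>(torch_scales[i]);
      }
      return onednn_scales;
    }
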
+ ideep::tensor exp_wgt; + exp_wgt.init(w_desc); + exp_wgt.set_scale(wgt_scales); // Also for feed_from() + exp_wgt.feed_from(wgt, transpose); // expect wgt to be in [OC IC KH KW] format + ideep::tensor * packed_weight_p = new ideep::tensor(exp_wgt); + packed_weight_p->set_scale(wgt_scales); + packed_weight_p->set_zero_point(wgt_zero_points); + std::unique_ptr weight_ptr(packed_weight_p); + // Bias + c10::optional onednn_bias{c10::nullopt}; + if (bias.has_value()) { + at::Tensor bias_vec = bias.value(); + TORCH_CHECK(bias_vec.dim() == 1, "bias should be a vector (1D Tensor)"); + TORCH_CHECK( + bias_vec.size(0) == output_channels, + "bias should have K elements: " + std::to_string(output_channels)); + auto bias_desc = ideep::tensor::desc(bias.value().sizes().vec(), dnnl::memory::data_type::f32); + ideep::tensor packed_bias; + packed_bias.init(bias_desc, bias.value().data_ptr()); + onednn_bias = c10::optional(packed_bias); + } + auto ret_ptr = c10::make_intrusive>( + PackedConvWeightsOnednn{ + std::move(weight_ptr), + onednn_bias, + weight, + bias, + stride, + padding, + output_padding, + dilation, + groups, + transpose + }); + return ret_ptr; +} + +template struct PackedConvWeightsOnednn<2>; +template struct PackedConvWeightsOnednn<3>; +#endif // #if AT_MKLDNN_ENABLED() + namespace at { namespace native { namespace { @@ -377,6 +537,14 @@ class QConvPackWeightInt8 final { } #endif +#if AT_MKLDNN_ENABLED() + if (ctx.qEngine() == at::QEngine::ONEDNN) { + return PackedConvWeightsOnednn::prepack( + weight, bias, stride, padding, output_padding, dilation, groups, + transpose); + } +#endif + TORCH_CHECK( false, "Didn't find engine for operation quantized::conv2d_prepack ", @@ -438,8 +606,6 @@ class QConv1dPackWeightInt8 final { } #endif - - #ifdef USE_PYTORCH_QNNPACK if (ctx.qEngine() == at::QEngine::QNNPACK) { return PackedConvWeightsQnnp<2>::prepack( @@ -447,6 +613,15 @@ class QConv1dPackWeightInt8 final { transpose); } #endif + +#if AT_MKLDNN_ENABLED() + if (ctx.qEngine() == at::QEngine::ONEDNN) { + return PackedConvWeightsOnednn<2>::prepack( + weight, bias, stride, padding, output_padding, dilation, groups, + transpose); + } +#endif + TORCH_CHECK( false, "Didn't find engine for operation quantized::conv1d_prepack ", diff --git a/aten/src/ATen/native/quantized/cpu/qconv_unpack.cpp b/aten/src/ATen/native/quantized/cpu/qconv_unpack.cpp deleted file mode 100644 index e4855062e360..000000000000 --- a/aten/src/ATen/native/quantized/cpu/qconv_unpack.cpp +++ /dev/null @@ -1,312 +0,0 @@ -#include -#include - -#include -#include -#include -#include -#include -#include - -#ifdef USE_FBGEMM -template -std::tuple> PackedConvWeight< - kSpatialDim>::unpack() { - auto* packed_weights_p = w.get(); - // output channels - const int output_channels = packed_weights_p->outputChannels(); - const int input_channels = packed_weights_p->inputChannels(); - const int groups = packed_weights_p->groups(); - - const int kernel_d = kSpatialDim == 2 ? 1 : kernel[0]; - // R (kernel height) - const int kernel_h = kernel[kSpatialDim - 2]; - // S (kernel width) - const int kernel_w = kernel[kSpatialDim - 1]; - - const int C_per_G = input_channels / groups; - - // Tensor for unpacked weights - // Unpacked format would be physical KRS(C/G) but logical KCRS (channels - // first) because that's how - // ChannelsLast3d is not available now.FBGEMM stores the weights - // TODO: Unify 2d and 3d when ChannelsLast3d is ready. 
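
The conv and linear prepack entry points above now dispatch on the active quantized engine, so the ONEDNN branch is reachable only when the build has MKLDNN enabled and that engine is selected at runtime. A minimal usage sketch; at::globalContext() and qEngine() appear in this file, while the setter below is assumed to be the matching Context API (on the Python side this corresponds to torch.backends.quantized.engine):

    #include <ATen/Context.h>

    // Select the ONEDNN quantized engine before packing weights so the
    // PackedConvWeightsOnednn / PackedLinearWeightsOnednn branches are taken.
    // Only meaningful when PyTorch was built with AT_MKLDNN_ENABLED().
    void use_onednn_qengine() {
      at::globalContext().setQEngine(at::QEngine::ONEDNN);
    }
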
- at::Tensor unpacked_weights; - if (q_scheme == c10::kPerTensorAffine) { - unpacked_weights = kSpatialDim == 2 - ? at::_empty_affine_quantized( - {output_channels, C_per_G, kernel_h, kernel_w}, - device(c10::kCPU) - .dtype(c10::kQInt8) - .memory_format(c10::MemoryFormat::ChannelsLast), - w_scale[0], - w_zp[0], - c10::nullopt) - : at::native::fbgemm_utils:: - MakeEmptyAffineQuantizedChannelsLast3dTensor( - output_channels, - C_per_G, - kernel_d, - kernel_h, - kernel_w, - device(c10::kCPU).dtype(c10::kQInt8), - w_scale[0], - w_zp[0]); - } else if (q_scheme == c10::kPerChannelAffine) { - TORCH_CHECK( - !transpose(), - "Per Channel Quantization is currently disabled for transposed conv"); - auto scales = at::from_blob( - w_scale.data(), w_scale.size(), device(c10::kCPU).dtype(c10::kFloat)); - auto zero_points = at::from_blob( - w_zp.data(), w_zp.size(), device(c10::kCPU).dtype(c10::kInt)); - unpacked_weights = kSpatialDim == 2 - ? at::_empty_per_channel_affine_quantized( - {output_channels, C_per_G, kernel_h, kernel_w}, - scales.toType(c10::kDouble), - zero_points.toType(c10::kLong), - 0, /* The output channel axis is 0 */ - device(c10::kCPU).dtype(c10::kQInt8), - c10::MemoryFormat::ChannelsLast) - : at::native::fbgemm_utils:: - MakeEmptyPerChannelAffineQuantizedChannelsLast3dTensor( - output_channels, - C_per_G, - kernel_d, - kernel_h, - kernel_w, - device(c10::kCPU).dtype(c10::kQInt8), - scales.toType(c10::kDouble), - zero_points.toType(c10::kLong)); - } else { - TORCH_CHECK(false, "Unsupported qscheme: ", toString(q_scheme)); - } - int8_t* unpacked_weights_p = - reinterpret_cast(unpacked_weights.data_ptr()); - packed_weights_p->unpack(unpacked_weights_p); - if(transpose()){ - unpacked_weights = - at::native::fbgemm_utils::TransposeConvTensorUnpackConversion< - kSpatialDim>(unpacked_weights, groups); - } - return std::tuple>( - unpacked_weights, bias); -} - -template std::tuple> PackedConvWeight< - 2>::unpack(); -template std::tuple> PackedConvWeight< - 3>::unpack(); -#endif // USE_FBGEMM - -#ifdef USE_PYTORCH_QNNPACK -template -std::tuple> PackedConvWeightsQnnp< - kSpatialDim>::unpack() { - TORCH_CHECK( - kSpatialDim == 2, - "QNNPACK only supports conv2d_unpack right " - "now."); - TORCH_CHECK( - orig_weight.defined(), - "Cannot unpack weights. " - "Call at::globalContext()::setReleaseOriginalWeights(false) before packing or loading to enable unpacking."); - return std::tuple>(orig_weight, bias); -} - -template std::tuple> PackedConvWeightsQnnp< - 2>::unpack(); -template std::tuple> PackedConvWeightsQnnp< - 3>::unpack(); -#endif // USE_PYTORCH_QNNPACK - -namespace at { -namespace native { -namespace { - -/* - * QConvPackWeightInt8 expects its input tensor to be in shape - * [output_channels, kernel_height, kernel_width, input_channels/Groups] - * Therefore, the unpacking of packed weight tensor using QConvUnpackWeightsInt8 - * results in a tensor of the same shape. 
- */ - -template -class QConvUnpackWeightsInt8 final { - public: - static std::tuple> run( - const c10::intrusive_ptr>& packed_weight) { - auto& ctx = at::globalContext(); - -#ifdef USE_FBGEMM - if (ctx.qEngine() == at::QEngine::FBGEMM) { - return packed_weight->unpack(); - } -#endif - -#ifdef USE_PYTORCH_QNNPACK - if (ctx.qEngine() == at::QEngine::QNNPACK) { - TORCH_CHECK( - kSpatialDim == 2, - "quantized::conv2d_unpack (qnnpack): QNNPACK only supports Conv2d " - "now."); - return packed_weight->unpack(); - } -#endif - - TORCH_CHECK( - false, - "Didn't find engine for operation quantized::conv2d_unpack ", - toString(ctx.qEngine())); - } -}; - -class QConv1dUnpackWeightsInt8 final { - public: - static std::tuple> run( - const c10::intrusive_ptr>& packed_weight) { - auto& ctx = at::globalContext(); - at::Tensor weight; - c10::optional bias; -#ifdef USE_FBGEMM - if (ctx.qEngine() == at::QEngine::FBGEMM) { - std::tie(weight, bias) = packed_weight->unpack(); - weight = weight.squeeze_(quant_utils::kConv1dSqueezeDim + 2); - return std::tuple>(weight, bias); - } -#endif - -#ifdef USE_PYTORCH_QNNPACK - if (ctx.qEngine() == at::QEngine::QNNPACK) { - std::tie(weight, bias) = packed_weight->unpack(); - at::Tensor new_weight = weight.clone(); - new_weight = new_weight.squeeze_(quant_utils::kConv1dSqueezeDim + 2); - return std::tuple>(new_weight, bias); - } -#endif - - TORCH_CHECK( - false, - "Didn't find engine for operation quantized::conv1d_unpack ", - toString(ctx.qEngine())); - } -}; - -template -class QConvStride final { - public: - static torch::List run( - const c10::intrusive_ptr>& packed_weight) { - return packed_weight->stride(); - } -}; - -template -class QConvPadding final { - public: - static torch::List run( - const c10::intrusive_ptr>& packed_weight) { - return packed_weight->padding(); - } -}; - -template -class QConvOutputPadding final { - public: - static torch::List run( - const c10::intrusive_ptr>& packed_weight) { - return packed_weight->output_padding(); - } -}; - -template -class QConvDilation final { - public: - static torch::List run( - const c10::intrusive_ptr>& packed_weight) { - return packed_weight->dilation(); - } -}; - -template -class QConvGroups final { - public: - static int64_t run( - const c10::intrusive_ptr>& packed_weight) { - return packed_weight->groups(); - } -}; - -template -class QConvTranspose final { - public: - static int64_t run( - const c10::intrusive_ptr>& packed_weight) { - return packed_weight->transpose(); - } -}; - -IValue -unpack_quantized_prepacked_sizes_conv2d(const IValue& ivalue) { - auto params = ivalue.toCustomClass>(); - at::Tensor weight; - c10::optional bias; - std::tie(weight, bias) = params->unpack(); - c10::optional bias_sizes = c10::nullopt; - if (bias && bias->defined()) { - bias_sizes = bias->sizes(); - } - return IValue(std::make_tuple( - weight.sizes(), - bias_sizes, - params->stride(), - params->padding(), - params->dilation(), - params->groups())); -} - -TORCH_LIBRARY_IMPL(quantized, CatchAll, m) { - // conv_unpack is deprecated, please use conv2d_unpack for 2D conv. 
- m.impl(TORCH_SELECTIVE_NAME("quantized::conv_unpack"), TORCH_FN(QConvUnpackWeightsInt8<2>::run)); - // We use conv2d_unpack to be consistent with conv3d_unpack - m.impl(TORCH_SELECTIVE_NAME("quantized::conv1d_unpack"), TORCH_FN(QConv1dUnpackWeightsInt8::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_unpack"), TORCH_FN(QConvUnpackWeightsInt8<2>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_unpack_sizes"), TORCH_FN(unpack_quantized_prepacked_sizes_conv2d)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_unpack"), TORCH_FN(QConvUnpackWeightsInt8<3>::run)); - - m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_stride"), TORCH_FN(QConvStride<2>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_padding"), TORCH_FN(QConvPadding<2>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_output_padding"), TORCH_FN(QConvOutputPadding<2>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_dilation"), TORCH_FN(QConvDilation<2>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_groups"), TORCH_FN(QConvGroups<2>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_transpose"), TORCH_FN(QConvTranspose<2>::run)); - - m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_stride"), TORCH_FN(QConvStride<3>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_padding"), TORCH_FN(QConvPadding<3>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_output_padding"), TORCH_FN(QConvOutputPadding<3>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_dilation"), TORCH_FN(QConvDilation<3>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_groups"), TORCH_FN(QConvGroups<3>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_transpose"), TORCH_FN(QConvTranspose<3>::run)); - - // ConvTranspose is the same, however, we want to have different name. 
- m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose1d_unpack"), TORCH_FN(QConv1dUnpackWeightsInt8::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_unpack"), TORCH_FN(QConvUnpackWeightsInt8<2>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_unpack"), TORCH_FN(QConvUnpackWeightsInt8<3>::run)); - - m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_stride"), TORCH_FN(QConvStride<2>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_padding"), TORCH_FN(QConvPadding<2>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_output_padding"), TORCH_FN(QConvOutputPadding<2>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_dilation"), TORCH_FN(QConvDilation<2>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_groups"), TORCH_FN(QConvGroups<2>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_transpose"), TORCH_FN(QConvTranspose<2>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_stride"), TORCH_FN(QConvStride<3>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_padding"), TORCH_FN(QConvPadding<3>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_output_padding"), TORCH_FN(QConvOutputPadding<3>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_dilation"), TORCH_FN(QConvDilation<3>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_groups"), TORCH_FN(QConvGroups<3>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_transpose"), TORCH_FN(QConvTranspose<3>::run)); -} - -} // namespace -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/quantized/cpu/qconv_unpack_impl.cpp b/aten/src/ATen/native/quantized/cpu/qconv_unpack_impl.cpp new file mode 100644 index 000000000000..693e093b1209 --- /dev/null +++ b/aten/src/ATen/native/quantized/cpu/qconv_unpack_impl.cpp @@ -0,0 +1,136 @@ +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#ifdef USE_FBGEMM +template +std::tuple> PackedConvWeight< + kSpatialDim>::unpack() { + auto* packed_weights_p = w.get(); + // output channels + const int output_channels = packed_weights_p->outputChannels(); + const int input_channels = packed_weights_p->inputChannels(); + const int groups = packed_weights_p->groups(); + + const int kernel_d = kSpatialDim == 2 ? 1 : kernel[0]; + // R (kernel height) + const int kernel_h = kernel[kSpatialDim - 2]; + // S (kernel width) + const int kernel_w = kernel[kSpatialDim - 1]; + + const int C_per_G = input_channels / groups; + + // Tensor for unpacked weights + // Unpacked format would be physical KRS(C/G) but logical KCRS (channels + // first) because that's how + // ChannelsLast3d is not available now.FBGEMM stores the weights + // TODO: Unify 2d and 3d when ChannelsLast3d is ready. + at::Tensor unpacked_weights; + if (q_scheme == c10::kPerTensorAffine) { + unpacked_weights = kSpatialDim == 2 + ? 
at::_empty_affine_quantized( + {output_channels, C_per_G, kernel_h, kernel_w}, + device(c10::kCPU) + .dtype(c10::kQInt8) + .memory_format(c10::MemoryFormat::ChannelsLast), + w_scale[0], + w_zp[0], + c10::nullopt) + : at::native::fbgemm_utils:: + MakeEmptyAffineQuantizedChannelsLast3dTensor( + output_channels, + C_per_G, + kernel_d, + kernel_h, + kernel_w, + device(c10::kCPU).dtype(c10::kQInt8), + w_scale[0], + w_zp[0]); + } else if (q_scheme == c10::kPerChannelAffine) { + TORCH_CHECK( + !transpose(), + "Per Channel Quantization is currently disabled for transposed conv"); + auto scales = at::from_blob( + w_scale.data(), w_scale.size(), device(c10::kCPU).dtype(c10::kFloat)); + auto zero_points = at::from_blob( + w_zp.data(), w_zp.size(), device(c10::kCPU).dtype(c10::kInt)); + unpacked_weights = kSpatialDim == 2 + ? at::_empty_per_channel_affine_quantized( + {output_channels, C_per_G, kernel_h, kernel_w}, + scales.toType(c10::kDouble), + zero_points.toType(c10::kLong), + 0, /* The output channel axis is 0 */ + device(c10::kCPU).dtype(c10::kQInt8), + c10::MemoryFormat::ChannelsLast) + : at::native::fbgemm_utils:: + MakeEmptyPerChannelAffineQuantizedChannelsLast3dTensor( + output_channels, + C_per_G, + kernel_d, + kernel_h, + kernel_w, + device(c10::kCPU).dtype(c10::kQInt8), + scales.toType(c10::kDouble), + zero_points.toType(c10::kLong)); + } else { + TORCH_CHECK(false, "Unsupported qscheme: ", toString(q_scheme)); + } + int8_t* unpacked_weights_p = + reinterpret_cast(unpacked_weights.data_ptr()); + packed_weights_p->unpack(unpacked_weights_p); + if(transpose()){ + unpacked_weights = + at::native::fbgemm_utils::TransposeConvTensorUnpackConversion< + kSpatialDim>(unpacked_weights, groups); + } + return std::tuple>( + unpacked_weights, bias); +} + +template std::tuple> PackedConvWeight< + 2>::unpack(); +template std::tuple> PackedConvWeight< + 3>::unpack(); +#endif // USE_FBGEMM + +#ifdef USE_PYTORCH_QNNPACK +template +std::tuple> PackedConvWeightsQnnp< + kSpatialDim>::unpack() { + TORCH_CHECK( + kSpatialDim == 2, + "QNNPACK only supports conv2d_unpack right " + "now."); + TORCH_CHECK( + orig_weight.defined(), + "Cannot unpack weights. 
" + "Call at::globalContext()::setReleaseOriginalWeights(false) before packing or loading to enable unpacking."); + return std::tuple>(orig_weight, bias); +} + +template std::tuple> PackedConvWeightsQnnp< + 2>::unpack(); +template std::tuple> PackedConvWeightsQnnp< + 3>::unpack(); +#endif // USE_PYTORCH_QNNPACK + +#if AT_MKLDNN_ENABLED() +template +std::tuple> PackedConvWeightsOnednn< + kSpatialDim>::unpack() { + return std::tuple>( + orig_weight_, orig_bias_); +} + +template std::tuple> PackedConvWeightsOnednn< + 2>::unpack(); +template std::tuple> PackedConvWeightsOnednn< + 3>::unpack(); +#endif // #if AT_MKLDNN_ENABLED() diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp index cf18da771e5c..7579e3185174 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp @@ -405,7 +405,7 @@ at::Tensor& embedding_bag_byte_impl( if (!pruned_weights || fallback_to_no_sparse) { auto kernel_i8 = - fbgemm::GenerateEmbeddingSpMDM( + fbgemm::GenerateEmbeddingSpMDM( /*block_size=*/D, /*has_weight=*/per_sample_weights_.has_value(), /*normalize_by_lengths=*/false, diff --git a/aten/src/ATen/native/quantized/cpu/qgelu.cpp b/aten/src/ATen/native/quantized/cpu/qgelu.cpp index 7c0ee3cd784f..c07796f608d4 100644 --- a/aten/src/ATen/native/quantized/cpu/qgelu.cpp +++ b/aten/src/ATen/native/quantized/cpu/qgelu.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -15,9 +16,9 @@ namespace native { DEFINE_DISPATCH(qgelu_stub); -Tensor gelu_quantized_cpu(const Tensor& qx) { +Tensor gelu_quantized_cpu(const Tensor& qx, c10::string_view approximate) { Tensor qy; - qgelu_stub(qx.device().type(), qx, qy); + qgelu_stub(qx.device().type(), qx, qy, get_gelutype_enum(approximate)); return qy; } }} // namespace at::native diff --git a/aten/src/ATen/native/quantized/cpu/qlinear.cpp b/aten/src/ATen/native/quantized/cpu/qlinear.cpp index ac055bf74a6e..d358f23c6af3 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear.cpp @@ -2,8 +2,10 @@ #include #include #include -#include +#include #include +#include +#include #include #include #include @@ -270,6 +272,161 @@ at::Tensor& PackedLinearWeight::apply_relu_out( #endif // USE_FBGEMM #ifdef USE_PYTORCH_QNNPACK + +#ifdef USE_XNNPACK +// TODO: add per_channel support in the future when xnnp supports it +template +at::Tensor PackedLinearWeightsQnnp::apply_impl_xnnp( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) { + using underlying_t = typename scalar_t::underlying; + + std::lock_guard lock(qnnp_mutex_); + + const std::string func_name = kReluFused ? 
"quantized::linear_relu (xnnpack)" + : "quantized::linear (xnnpack)"; + TORCH_CHECK( + input.dim() >= 2, func_name, ": Input tensor rank should be >= 2."); + TORCH_CHECK( + !per_channel(), + func_name, + ": xnnpack does not currently have per_channel support."); + + const auto input_contig = input.contiguous(); + const auto input_scale = input_contig.q_scale(); + + const size_t rows_w = bias_.size(0); + const size_t cols_w = input_contig.size(input_contig.dim() - 1); + + auto status = xnn_status_invalid_state; + + // Create an operator iff not already created + if (!xnnp_linear_op || + (!this->input_scale.has_value() || + this->input_scale.value() != input_scale)) { + // Update the input scale so we may cache the op + this->input_scale = input_scale; + + xnn_operator_t xnnp_op = nullptr; + + const float* weight_scales_data = w_scales.data_ptr(); + + // prepare weights + underlying_t w_zp = static_cast( + orig_weight.q_zero_point() + + (std::is_same::value ? 128 : 0)); + + at::Tensor xnnp_weight = at::_empty_affine_quantized( + orig_weight.sizes(), + c10::CppTypeToScalarType::value, + weight_scales_data[0], + w_zp); + + // copy from the original weight and take care of dtype change if necessary + at::native::xnnp_utils::q8_copy_int8_weight_and_add_offset( + orig_weight, xnnp_weight); + + // Original bias was float, so we requantize it here. + at::Tensor qbias = at::native::quantize_per_tensor( + bias_, orig_weight.q_scale() * input_scale, 0, c10::kQInt32); + + // output limits + auto output_min = kReluFused + // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) + ? activationLimits(output_scale, output_zero_point, Activation::RELU).first + : std::numeric_limits::min(); + auto output_max = kReluFused + // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) + ? activationLimits(output_scale, output_zero_point, Activation::RELU).second + : std::numeric_limits::max(); + + // Create an operator + status = at::native::xnnp_utils::xnnp_create_fully_connected_nc( + cols_w, /* input_channels */ + rows_w, /* output_channels */ + cols_w, /* input_stride */ + rows_w, /* output_stride */ + input_contig.q_zero_point(), + input_contig.q_scale(), + w_zp, + weight_scales_data[0], + reinterpret_cast( + xnnp_weight.template data_ptr()), + reinterpret_cast(qbias.data_ptr()), + output_zero_point, + output_scale, + output_min, + output_max, + 0, /* flags */ + &xnnp_op); + xnnp_linear_op = xnnpack_operator(xnnp_op); + + TORCH_CHECK( + status == xnn_status_success, + func_name, + ": xnn create operator failed(", + status, + ")"); + } + + /* + * Allocate output Tensor and a buffer for XNNPACK to use + * The resulting matrix here is 2-D, let's view it with the original + * left hand dimensions of the input. Here are two examples: + * 1. If the input tensor is {M, K}, the output tensor is {M, N}. + * 2. If the input tensor is {b, M, K}, the output tensor is {b, M, N}. 
+ */ + std::vector out_sizes = input.sizes().vec(); + out_sizes.back() = static_cast(rows_w); + at::Tensor output = at::native::empty_affine_quantized( + out_sizes, + c10::CppTypeToScalarType::value, + c10::nullopt /* layout */, + c10::kCPU, + c10::nullopt /* pin_memory */, + output_scale, + output_zero_point, + input.suggest_memory_format()); + + // calculate batch_size + size_t rows_input = 1; + for (const auto i : c10::irange(input_contig.dim() - 1)) { + rows_input *= input_contig.size(i); + } + + // Setup the operator + status = at::native::xnnp_utils::xnnp_setup_fully_connected_nc( + xnnp_linear_op.get(), + rows_input, /* batch_size */ + reinterpret_cast( + input_contig.template data_ptr()), + reinterpret_cast(output.template data_ptr()), + caffe2::pthreadpool_()); + + TORCH_CHECK( + status == xnn_status_success, + func_name, + ": xnn setup operator failed(", + status, + ")"); + + // Run the opeator + status = xnn_run_operator( + xnnp_linear_op.get(), // Linear op + caffe2::pthreadpool_() // threadpool + ); + TORCH_CHECK( + status == xnn_status_success, + func_name, + ": xnn run operator failed(", + status, + ")"); + + return output; +} +#endif // USE_XNNPACK + template at::Tensor PackedLinearWeightsQnnp::apply_impl( at::Tensor input, @@ -414,10 +571,35 @@ at::Tensor PackedLinearWeightsQnnp::apply_impl( return output; } +#ifdef USE_XNNPACK +bool can_use_xnnp(c10::ScalarType dtype, bool per_channel) { + if(!at::native::xnnpack::available()) { + return false; + } + + bool supported_dtypes = dtype == c10::kQInt8; + bool invalid_config = per_channel; /* xnnp does not currently support + per-channel fully connected op */ + if (supported_dtypes && invalid_config) { + /* don't want this to fall through to QNNPACK */ + TORCH_CHECK( + false, + "quantized::linear (xnnpack): Unsupported config for dtype KQInt8"); + } + return supported_dtypes && !invalid_config; +} +#endif // USE_XNNPACK + at::Tensor PackedLinearWeightsQnnp::apply( at::Tensor input, double output_scale, int64_t output_zero_point) { +#ifdef USE_XNNPACK + if (can_use_xnnp(input.scalar_type(), per_channel())) { + return apply_impl_xnnp( + input, output_scale, output_zero_point); + } /* fall through for unsupported types, configs, or shapes */ +#endif // USE_XNNPACK return apply_impl(std::move(input), output_scale, output_zero_point); } @@ -425,11 +607,92 @@ at::Tensor PackedLinearWeightsQnnp::apply_relu( at::Tensor input, double output_scale, int64_t output_zero_point) { +#ifdef USE_XNNPACK + if (can_use_xnnp(input.scalar_type(), per_channel())) { + return apply_impl_xnnp( + input, output_scale, output_zero_point); + } /* fall through for unsupported types, configs, or shapes */ +#endif // USE_XNNPACK return apply_impl(std::move(input), output_scale, output_zero_point); } #endif // USE_PYTORCH_QNNPACK +#if AT_MKLDNN_ENABLED() +template +at::Tensor PackedLinearWeightsOnednn::apply_impl( + at::Tensor input, + double output_scale, + int64_t output_zero_point) { + const int64_t dim = input.dim(); + TORCH_CHECK( + dim != 0, + "qlinear (ONEDNN): input dim should be at least 1, but got 0"); + TORCH_CHECK(input.scalar_type() == c10::ScalarType::QUInt8, + "qlinear (ONEDNN): data type of input should be QUint8."); + + auto input_contig = input.expect_contiguous(); + auto& w = *(weight_.get()); + auto K = input.size(dim - 1), M = input.numel() / K, N = w.get_dim(1); + auto input_dims = {M, K}; + auto input_data_type = dnnl::memory::data_type::u8; + auto input_desc = ideep::tensor::desc(input_dims, input_data_type); + ideep::attr_t op_attr = 
ReluFused ? ideep::attr_t::fuse_relu() : ideep::attr_t(); + ideep::tensor x(input_desc, input_contig->data_ptr()); + auto dst_dims = {M, N}; + const ideep::scale_t& src_scales = ideep::scale_t(1, 1.0/input.q_scale()); + const ideep::scale_t& weights_scales = w.get_scale(); + const ideep::scale_t& dst_scales = ideep::scale_t(1, 1.0/output_scale); // Scales of ONEDNN and PyTorch are reciprocal + const ideep::zero_point_t& src_zero_point = ideep::zero_point_t(1, input.q_zero_point()); + const ideep::zero_point_t& dst_zero_point = ideep::zero_point_t(1, output_zero_point); + // Compute: Use ideep::matmul_forward to support asymmetric quantization + // Allocate output Tensor + at::Tensor output = at::_empty_affine_quantized( + dst_dims, + at::device(c10::kCPU).dtype(c10::kQUInt8), + output_scale, + output_zero_point); + if (output.numel() == 0) { + return output; + } + ideep::tensor y({dst_dims, ideep::tensor::data_type::u8, {output.strides().cbegin(), output.strides().cend()}}, + output.data_ptr()); + if (bias_.has_value()) { + // Bias might be modified outside (e.g. by quantization bias correction). + // If so, update the prepacked bias as well. + if (bias_.value().get_data_handle() != orig_bias_.value().data_ptr()) { + bias_.value().init(bias_.value().get_desc(), orig_bias_.value().data_ptr()); + } + const auto& b = bias_.value(); + ideep::matmul_forward::compute_v2(x, w, b, y, 1.0f, 1.0f, src_scales, weights_scales, dst_scales, + src_zero_point, dst_zero_point, op_attr); + } else { + ideep::matmul_forward::compute_v2(x, w, y, 1.0f, 1.0f, src_scales, weights_scales, dst_scales, + src_zero_point, dst_zero_point, op_attr); + } + auto out_sizes = input.sizes().vec(); + out_sizes.back() = N; + if (output.sizes().vec() == out_sizes) + return output; + return output.reshape(out_sizes); +} + +at::Tensor PackedLinearWeightsOnednn::apply( + at::Tensor input, + double output_scale, + int64_t output_zero_point) { + return apply_impl(std::move(input), output_scale, output_zero_point); +} + +at::Tensor PackedLinearWeightsOnednn::apply_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point) { + return apply_impl(std::move(input), output_scale, output_zero_point); +} + +#endif // #if AT_MKLDNN_ENABLED() + namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp index 676b2f1ce649..111255726dcf 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp @@ -2,8 +2,9 @@ #include #include #include -#include +#include #include +#include #include #include #include @@ -463,6 +464,99 @@ void PackedLinearWeightFp16::set_bias(c10::optional bias) { #endif // USE_FBGEMM +#if AT_MKLDNN_ENABLED() +template +at::Tensor PackedLinearWeightsOnednn::apply_dynamic_impl( + at::Tensor input, + bool reduce_range) { + // Dynamic: fp32 * int8 -> fp32 + using at::Tensor; + + TORCH_CHECK( + input.dim() >= 2, + "The dimension of input tensor should be larger than or equal to 2"); + TORCH_CHECK(input.scalar_type() == c10::ScalarType::Float, + "qlinear_dynamic (ONEDNN): data type of input should be float."); + + // Input -> uint8 + auto input_contig = input.contiguous(); + const int64_t dim = input.dim(); + auto input_reshaped = + dim == 2 ? 
input : input.reshape({-1, input.size(input.dim() - 1)}); + auto input_dims = input_reshaped.sizes().vec(); + auto input_data_type = dnnl::memory::data_type::f32; + auto input_desc = ideep::tensor::desc(input_dims, input_data_type); + ideep::attr_t op_attr = ReluFused ? ideep::attr_t::fuse_relu() : ideep::attr_t(); + ideep::tensor x; + x.init(input_desc, input_contig.data_ptr()); + // Find quantization parameters + float x_max = 0, x_min = 0; + if (input.numel() > 0) { + x_min = input_contig.min().item(); + x_max = input_contig.max().item(); + } + const int precision = 8; + auto q_params = quant_utils::ChooseQuantizationParams( + /*min=*/x_min, + /*max=*/x_max, + /*qmin=*/0, + /*qmax=*/(1 << precision) - 1, + /*preserve_sparsity=*/false, + /*force_scale_power_of_two=*/false, + /*reduce_range=*/reduce_range); + const std::vector& src_zero_point = std::vector(1, q_params.zero_point); + // weights, dst + auto w = *(weight_.get()); + auto dst_dims = {x.get_dim(0), w.get_dim(1)}; + const ideep::scale_t& src_scales = ideep::scale_t(1, 1.0/q_params.scale); + const ideep::scale_t& weights_scales = w.get_scale(); + // Compute -> f32 + // Use ideep::matmul_forward instead of ideep::inner_product_forward, + // since the latter does not support asymmetric quantization + // Allocate output Tensor + at::Tensor output = at::empty(dst_dims, input.options().dtype(at::kFloat)); + if (output.numel() == 0) return output; + ideep::tensor y({dst_dims, ideep::tensor::data_type::f32, + {output.strides().cbegin(), output.strides().cend()}}, + output.data_ptr()); + if (bias_.has_value()) { + // Bias might be modified outside (e.g. by quantization bias correction). + // If so, update the prepacked bias as well. + if (bias_.value().get_data_handle() != orig_bias_.value().data_ptr()) { + bias_.value().init(bias_.value().get_desc(), orig_bias_.value().data_ptr()); + } + const ideep::tensor b = bias_.value(); + ideep::matmul_forward::compute_v2(x, w, b, y, 1.0f, 1.0f, + src_scales, weights_scales, ideep::scale_t(), + src_zero_point, ideep::zero_point_t(), op_attr); + } else { + ideep::matmul_forward::compute_v2(x, w, y, 1.0f, 1.0f, + src_scales, weights_scales, ideep::scale_t(), + src_zero_point, ideep::zero_point_t(), op_attr); + } + auto out_sizes = input.sizes().vec(); + out_sizes.back() = w.get_dim(1); + if (output.sizes().vec() == out_sizes) + return output; + return output.reshape(out_sizes); +} + +at::Tensor PackedLinearWeightsOnednn::apply_dynamic( + at::Tensor input, + bool reduce_range) { + return apply_dynamic_impl( + std::move(input), reduce_range); +} + +at::Tensor PackedLinearWeightsOnednn::apply_dynamic_relu( + at::Tensor input, + bool reduce_range) { + return apply_dynamic_impl( + std::move(input), reduce_range); +} + +#endif // #if AT_MKLDNN_ENABLED() + namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp index 93c54dc10889..6ca6905119f4 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp @@ -1,9 +1,9 @@ #include -#include #include #include -#include +#include #include +#include #include #include #include @@ -194,6 +194,80 @@ c10::intrusive_ptr PackedLinearWeightFp16::prepack( } #endif // USE_FBGEMM +#if AT_MKLDNN_ENABLED() +c10::intrusive_ptr PackedLinearWeightsOnednn::prepack( + at::Tensor weight, + c10::optional bias) { + TORCH_CHECK( + weight.dim() == 2, + "The weight tensor for quantized::linear_prepack 
(onednn) should" + " be 2-dimensional."); + // Weight + std::vector dims = weight.sizes().vec(); + auto N = weight.size(0); + std::vector wgt_zero_points; + ideep::scale_t wgt_scales; + const auto qtype = weight.qscheme(); + if (qtype == c10::kPerTensorAffine) { + TORCH_CHECK( + weight.q_zero_point() == 0, + "quantized::linear_prepack: ONEDNN only supports symmetric quantization of weight," + " whose zero point must be 0, but got ", weight.q_zero_point()); + wgt_zero_points = std::vector(1, weight.q_zero_point()); + wgt_scales = ideep::scale_t(1, 1.0/weight.q_scale()); // Scales of ONEDNN and PyTorch are reciprocal + } else if (qtype == c10::kPerChannelAffine) { + wgt_zero_points.resize(N); + wgt_scales.resize(N); + for (int i = 0; i < N; ++i) { + wgt_zero_points[i] = weight.q_per_channel_zero_points()[i].item(); + TORCH_CHECK( + wgt_zero_points[i] == 0, + "quantized::linear_prepack: ONEDNN only supports symmetric quantization of weight," + " whose zero point must be 0, but got ", wgt_zero_points[i], ", at index ", i); + wgt_scales[i] = 1.0f / weight.q_per_channel_scales()[i].item(); // Scales of ONEDNN and PyTorch are reciprocal + } + } else { + TORCH_CHECK(false, "Unsupported qscheme: ", toString(qtype)); + } + + // Prepack weight + auto weight_copy = weight.clone(); + ideep::tensor wgt = ideep::tensor({dims, dnnl::memory::data_type::s8}, weight_copy.data_ptr()); + wgt.transpose_(0, 1); // ONEDNN requires transposed weight + auto w_desc = ideep::matmul_forward::expected_weights_desc(wgt.get_dims(), dnnl::memory::data_type::s8, + dnnl::memory::data_type::u8); + ideep::tensor exp_wgt(w_desc); + exp_wgt.feed_from(wgt); + ideep::tensor * packed_weight_p = new ideep::tensor(exp_wgt); + packed_weight_p->set_scale(wgt_scales); + packed_weight_p->set_zero_point(wgt_zero_points); + std::unique_ptr weight_ptr(packed_weight_p); + // Bias + c10::optional onednn_bias{c10::nullopt}; + if (bias.has_value()) { + auto& b = bias.value(); + auto bias_size = b.sizes().vec(); + bias_size.insert(bias_size.begin(), 1); + TORCH_CHECK( + bias_size[1] == weight_ptr->get_dim(1), + "bias should have N elements: ", + std::to_string(weight_ptr->get_dim(1)), + ", but got ", bias_size[1]); + auto bias_desc = ideep::tensor::desc(bias_size, dnnl::memory::data_type::f32); + ideep::tensor packed_bias; + packed_bias.init(bias_desc, b.data_ptr()); + onednn_bias = c10::optional(packed_bias); + } + auto ret_ptr = c10::make_intrusive( + PackedLinearWeightsOnednn{ + std::move(weight_ptr), + onednn_bias, + weight, + bias}); + return ret_ptr; +} +#endif // #if AT_MKLDNN_ENABLED() + namespace at { namespace native { @@ -224,6 +298,11 @@ class QLinearPackWeightInt8 final { std::move(weight), std::move(bias)); } #endif +#if AT_MKLDNN_ENABLED() + if (ctx.qEngine() == at::QEngine::ONEDNN) { + return PackedLinearWeightsOnednn::prepack(std::move(weight), std::move(bias)); + } +#endif // #if AT_MKLDNN_ENABLED() TORCH_CHECK( false, "Didn't find engine for operation quantized::linear_prepack ", @@ -238,6 +317,9 @@ class QLinearPackWeightFp16 final { c10::optional bias) { auto& ctx = at::globalContext(); #ifdef USE_FBGEMM + // temporarily convert weight back to fp32, needs to be fixed + // after fbgemm fixes the interface for their prepacking op (take fp16 input0 + weight = weight.to(ScalarType::Float); if (ctx.qEngine() == at::QEngine::FBGEMM) { return PackedLinearWeightFp16::prepack( std::move(weight), std::move(bias)); @@ -251,6 +333,14 @@ class QLinearPackWeightFp16 final { "not supported by QNNPACK"); } #endif // USE_PYTORCH_QNNPACK 
+#if AT_MKLDNN_ENABLED() + if (ctx.qEngine() == at::QEngine::ONEDNN) { + TORCH_CHECK( + false, + "quantized::linear_prepack_fp16 is currently " + "not supported by ONEDNN"); + } +#endif // #if AT_MKLDNN_ENABLED() TORCH_CHECK( false, "Didn't find engine for operation quantized::linear_prepack_fp16 ", @@ -261,63 +351,18 @@ class QLinearPackWeightFp16 final { class QLinearPackWeightInt8Legacy final { public: static Tensor run(at::Tensor weight, c10::optional bias) { - auto& ctx = at::globalContext(); - auto options = weight.options(); - -#ifdef USE_FBGEMM - if (ctx.qEngine() == at::QEngine::FBGEMM) { - auto prepacked = - PackedLinearWeight::prepack(std::move(weight), std::move(bias)); - auto wrapped = - std::make_unique>( - std::move(prepacked)); - return cpp_custom_type_hack::create(std::move(wrapped), options); - } -#endif // USE_FBGEMM -#ifdef USE_PYTORCH_QNNPACK - if (ctx.qEngine() == at::QEngine::QNNPACK) { - auto prepacked = - PackedLinearWeightsQnnp::prepack(std::move(weight), std::move(bias)); - auto wrapped = - std::make_unique>( - std::move(prepacked)); - return cpp_custom_type_hack::create(std::move(wrapped), options); - } -#endif // USE_PYTORCH_QNNPACK - TORCH_CHECK( - false, - "Didn't find engine for operation quantized::linear_prepack ", - toString(ctx.qEngine())); + TORCH_CHECK(false, + "This model uses an outdated version of quantized.linear_prepack. " + "Please re-export your model using the newer definitions in torch.jit.quantized"); } }; class QLinearPackWeightFp16Legacy final { public: static Tensor run(at::Tensor weight, c10::optional bias) { - auto& ctx = at::globalContext(); -#ifdef USE_FBGEMM - auto options = weight.options(); - if (ctx.qEngine() == at::QEngine::FBGEMM) { - auto prepacked = - PackedLinearWeightFp16::prepack(std::move(weight), std::move(bias)); - auto wrapped = - std::make_unique>( - std::move(prepacked)); - return cpp_custom_type_hack::create(std::move(wrapped), options); - } -#endif // USE_FBGEMM -#ifdef USE_PYTORCH_QNNPACK - if (ctx.qEngine() == at::QEngine::QNNPACK) { - TORCH_CHECK( - false, - "quantized::linear_prepack_fp16 is currently " - "not supported by QNNPACK"); - } -#endif // USE_PYTORCH_QNNPACK - TORCH_CHECK( - false, - "Didn't find engine for operation quantized::linear_prepack_fp16 ", - toString(ctx.qEngine())); + TORCH_CHECK(false, + "This model uses an outdated version of quantized.linear_prepack_fp16. 
" + "Please re-export your model using the newer definitions in torch.jit.quantized"); } }; diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_unpack.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_unpack.cpp deleted file mode 100644 index 2a34e6748eb4..000000000000 --- a/aten/src/ATen/native/quantized/cpu/qlinear_unpack.cpp +++ /dev/null @@ -1,151 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -int register_linear_params(); - -#ifdef USE_FBGEMM -std::tuple> PackedLinearWeight::unpack() { - auto packB = w.get(); - - int64_t N = static_cast(packB->numCols()); - int64_t K = static_cast(packB->numRows()); - - at::Tensor weight_origin; - if (q_scheme == c10::kPerTensorAffine) { - weight_origin = at::_empty_affine_quantized( - {N, K}, at::device(c10::kCPU).dtype(c10::kQInt8), w_scale[0], w_zp[0]); - } else if (q_scheme == c10::kPerChannelAffine) { - auto scales = at::from_blob( - w_scale.data(), w_scale.size(), device(c10::kCPU).dtype(c10::kFloat)); - auto zero_points = at::from_blob( - w_zp.data(), w_zp.size(), device(c10::kCPU).dtype(c10::kInt)); - - weight_origin = at::_empty_per_channel_affine_quantized( - {N, K}, - scales.toType(c10::kDouble), - zero_points.toType(c10::kLong), - 0, // The output channel axis is 0 - device(c10::kCPU).dtype(c10::kQInt8)); - } - - int8_t* weight_ptr_int8 = - reinterpret_cast(weight_origin.data_ptr()); - - // packB->printPackedMatrix("packedB inside fbgemm_unpack - // (QLinearUnpackWeightInt8): "); - packB->unpack(weight_ptr_int8); - - return std::tuple>( - weight_origin, bias_); -} -#endif // USE_FBGEMM - -#ifdef USE_PYTORCH_QNNPACK -std::tuple> PackedLinearWeightsQnnp:: - unpack() { - TORCH_CHECK( - orig_weight.defined(), - "Cannot unpack weights. " - "Call at::globalContext()::setReleaseOriginalWeights(false) before packing or loading to enable unpacking."); - return std::tuple>(orig_weight, bias_); -} -#endif // USE_PYTORCH_QNNPACK - -#ifdef USE_FBGEMM -std::tuple> PackedLinearWeightFp16:: - unpack() { - auto& packed_weight_ptr = w; - - auto nrows = packed_weight_ptr->numRows(); - auto ncols = packed_weight_ptr->numCols(); - - at::Tensor unpacked_weight = - at::empty({ncols, nrows}, at::kHalf, c10::MemoryFormat::Contiguous); - packed_weight_ptr->unpack( - static_cast(unpacked_weight.data_ptr()), - fbgemm::matrix_op_t::Transpose); - - return std::make_tuple(unpacked_weight.to(at::kFloat), bias_); -} -#endif // USE_FBGEMM - -namespace at { -namespace native { -namespace { - -class QLinearUnpackWeightInt8 final { - public: - static std::tuple> run( - const c10::intrusive_ptr& packed_weight) { - return packed_weight->unpack(); - } -}; - -class QLinearUnpackWeightFp16 final { - public: - static std::tuple> run( - const c10::intrusive_ptr& packed_weight) { - auto& ctx = at::globalContext(); - - TORCH_CHECK( - ctx.qEngine() != at::QEngine::QNNPACK, - "quantized::linear_unpack_fp16 is currently " - "not supported by QNNPACK"); - - return packed_weight->unpack(); - } -}; - -class QLinearUnpackWeightInt8Legacy final { - public: - static std::tuple> run( - const at::Tensor& packed_weight) { - TORCH_WARN_ONCE( - "quantized.linear_unpack(Tensor) is deprecated! 
Please " - "upgrade your model to use the newer quantized.linear_" - "unpack(LinearPackedParamsBase) overload"); - return cpp_custom_type_hack::cast< - c10::intrusive_ptr>(packed_weight) - ->unpack(); - } -}; - -class QLinearUnpackWeightFp16Legacy final { - public: - static std::tuple> run( - const at::Tensor& packed_weight) { - TORCH_WARN_ONCE( - "quantized.linear_unpack(Tensor) is deprecated! Please " - "upgrade your model to use the newer quantized.linear_" - "unpack(LinearPackedParamsBase) overload"); - auto& ctx = at::globalContext(); - - TORCH_CHECK( - ctx.qEngine() != at::QEngine::QNNPACK, - "quantized::linear_unpack_fp16 is currently " - "not supported by QNNPACK"); - - return cpp_custom_type_hack::cast< - c10::intrusive_ptr>(packed_weight) - ->unpack(); - } -}; - -TORCH_LIBRARY_IMPL(quantized, CPU, m) { - m.impl(TORCH_SELECTIVE_NAME("quantized::linear_unpack.legacy"), TORCH_FN(QLinearUnpackWeightInt8Legacy::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::linear_unpack_fp16.legacy"), TORCH_FN(QLinearUnpackWeightFp16Legacy::run)); -} - -TORCH_LIBRARY_IMPL(quantized, CatchAll, m) { - m.impl(TORCH_SELECTIVE_NAME("quantized::linear_unpack"), TORCH_FN(QLinearUnpackWeightInt8::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::linear_unpack_fp16"), TORCH_FN(QLinearUnpackWeightFp16::run)); -} - -} // namespace -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_unpack_impl.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_unpack_impl.cpp new file mode 100644 index 000000000000..b7182bf0fa47 --- /dev/null +++ b/aten/src/ATen/native/quantized/cpu/qlinear_unpack_impl.cpp @@ -0,0 +1,83 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +int register_linear_params(); + +#ifdef USE_FBGEMM +std::tuple> PackedLinearWeight::unpack() { + auto packB = w.get(); + + int64_t N = static_cast(packB->numCols()); + int64_t K = static_cast(packB->numRows()); + + at::Tensor weight_origin; + if (q_scheme == c10::kPerTensorAffine) { + weight_origin = at::_empty_affine_quantized( + {N, K}, at::device(c10::kCPU).dtype(c10::kQInt8), w_scale[0], w_zp[0]); + } else if (q_scheme == c10::kPerChannelAffine) { + auto scales = at::from_blob( + w_scale.data(), w_scale.size(), device(c10::kCPU).dtype(c10::kFloat)); + auto zero_points = at::from_blob( + w_zp.data(), w_zp.size(), device(c10::kCPU).dtype(c10::kInt)); + + weight_origin = at::_empty_per_channel_affine_quantized( + {N, K}, + scales.toType(c10::kDouble), + zero_points.toType(c10::kLong), + 0, // The output channel axis is 0 + device(c10::kCPU).dtype(c10::kQInt8)); + } + + int8_t* weight_ptr_int8 = + reinterpret_cast(weight_origin.data_ptr()); + + // packB->printPackedMatrix("packedB inside fbgemm_unpack + // (QLinearUnpackWeightInt8): "); + packB->unpack(weight_ptr_int8); + + return std::tuple>( + weight_origin, bias_); +} +#endif // USE_FBGEMM + +#ifdef USE_PYTORCH_QNNPACK +std::tuple> PackedLinearWeightsQnnp:: + unpack() { + TORCH_CHECK( + orig_weight.defined(), + "Cannot unpack weights. 
" + "Call at::globalContext()::setReleaseOriginalWeights(false) before packing or loading to enable unpacking."); + return std::tuple>(orig_weight, bias_); +} +#endif // USE_PYTORCH_QNNPACK + +#ifdef USE_FBGEMM +std::tuple> PackedLinearWeightFp16:: + unpack() { + auto& packed_weight_ptr = w; + + auto nrows = packed_weight_ptr->numRows(); + auto ncols = packed_weight_ptr->numCols(); + + at::Tensor unpacked_weight = + at::empty({ncols, nrows}, at::kHalf, c10::MemoryFormat::Contiguous); + packed_weight_ptr->unpack( + static_cast(unpacked_weight.data_ptr()), + fbgemm::matrix_op_t::Transpose); + + return std::make_tuple(unpacked_weight.to(at::kFloat), bias_); +} +#endif // USE_FBGEMM + +#if AT_MKLDNN_ENABLED() +std::tuple> PackedLinearWeightsOnednn::unpack() { + return std::tuple>( + orig_weight_, orig_bias_); +} +#endif // #if AT_MKLDNN_ENABLED() diff --git a/aten/src/ATen/native/quantized/cpu/qmatmul.cpp b/aten/src/ATen/native/quantized/cpu/qmatmul.cpp index 013966a52510..e42941fd0a35 100644 --- a/aten/src/ATen/native/quantized/cpu/qmatmul.cpp +++ b/aten/src/ATen/native/quantized/cpu/qmatmul.cpp @@ -1,6 +1,12 @@ #include #include +#ifdef USE_RUY_QMATMUL +#include +#include +#include +#endif + namespace at { namespace native { @@ -21,6 +27,142 @@ inline void check_inputs(const Tensor& qa, const Tensor& qb) { "Both inputs to Matmul must have the same quantization scheme."); } +#ifdef USE_RUY_QMATMUL + +Tensor qmatmul( + const Tensor& qa, + const Tensor& qb, + const double output_scale, + const int64_t output_zero_point) { + check_inputs(qa, qb); + + const int64_t num_dims = qa.dim(); + const int64_t b_num_dims = qb.dim(); + + TORCH_CHECK( + num_dims == b_num_dims, + "MatMul operands should have the same dimensionality. (", num_dims, + " and ", b_num_dims, " provided)"); + TORCH_CHECK( + num_dims >= 2, + "Quantized Matmul currently only suports operands which are at least 2-dimensional. 
(", + num_dims, " provided)"); + + const int64_t m = qa.size(num_dims - 2); + const int64_t k = qa.size(num_dims - 1); + const int64_t b_k = qb.size(num_dims - 2); + const int64_t n = qb.size(num_dims - 1); + + TORCH_CHECK( + b_k == k, + "For Quantized Matmul, the size of tensor a (", k, + ") at dimension ", num_dims - 1, " must match the size of tensor b (", + b_k, ") at dimension ", num_dims - 2, "."); + + std::vector out_size_vec(num_dims); + size_t num_matmuls = 1; + for (int64_t i = 0; i < num_dims - 2; i++) { + const int64_t dim = qa.size(i); + const int64_t qb_dim = qb.size(i); + + TORCH_CHECK( + dim == qb_dim, + "For Quantized Matmul, the size of tensor a (", dim, + ") must match the size of tensor b (", qb_dim, + ") at dimension ", i); + + out_size_vec[i] = dim; + num_matmuls *= dim; + } + out_size_vec[num_dims - 2] = m; + out_size_vec[num_dims - 1] = n; + + Tensor out = at::_empty_affine_quantized( + IntArrayRef(out_size_vec), + at::device(kCPU) + .dtype(qa.scalar_type()) + .memory_format(qa.suggest_memory_format()), + output_scale, + output_zero_point, + c10::nullopt); + + const Tensor& qa_contig = qa.contiguous(); + const Tensor& qb_contig = qb.contiguous(); + + AT_DISPATCH_QINT_BYTE_TYPES(qa.scalar_type(), "qmatmul", [&] { + using underlying_t = typename scalar_t::underlying; + + const underlying_t* qa_data = reinterpret_cast( + qa_contig.data_ptr()); + const underlying_t* qb_data = reinterpret_cast( + qb_contig.data_ptr()); + underlying_t* out_data = + reinterpret_cast(out.data_ptr()); + + const size_t qa_stride = m * k; + const size_t qb_stride = k * n; + const size_t out_stride = m * n; + + auto matmuls = [&](int64_t begin, int64_t end) { + + ruy::Matrix qa_matrix; + ruy::MakeSimpleLayout( + m, k, ruy::Order::kRowMajor, qa_matrix.mutable_layout()); + qa_matrix.set_zero_point(qa.q_zero_point()); + + ruy::Matrix qb_matrix; + ruy::MakeSimpleLayout( + k, n, ruy::Order::kRowMajor, qb_matrix.mutable_layout()); + qb_matrix.set_zero_point(qb.q_zero_point()); + + ruy::Matrix out_matrix; + ruy::MakeSimpleLayout( + m, n, ruy::Order::kRowMajor, out_matrix.mutable_layout()); + out_matrix.set_zero_point(output_zero_point); + + // Requantization explanation: + // https://github.com/google/gemmlowp/blob/e844ffd17118c1e17d94e1ba4354c075a4577b88/doc/quantization.md + const double requantization_scale_inv = + (qa.q_scale() * qb.q_scale()) / output_scale; + + ruy::MulParams mul_params; + + int multiplier_fixedpoint; + int multiplier_exponent; + ruy_utils::quantize_multiplier(requantization_scale_inv, + &multiplier_fixedpoint, + &multiplier_exponent); + mul_params.set_multiplier_fixedpoint(multiplier_fixedpoint); + mul_params.set_multiplier_exponent(multiplier_exponent); + + const underlying_t* qa_subtensor = qa_data + begin * qa_stride; + const underlying_t* qb_subtensor = qb_data + begin * qb_stride; + underlying_t* out_subtensor = out_data + begin * out_stride; + + for (int64_t i = begin; i < end; i++) { + qa_matrix.set_data(qa_subtensor); + qb_matrix.set_data(qb_subtensor); + out_matrix.set_data(out_subtensor); + ruy::Mul(qa_matrix, + qb_matrix, + mul_params, + ruy_utils::get_ruy_context(), + &out_matrix); + + qa_subtensor += qa_stride; + qb_subtensor += qb_stride; + out_subtensor += out_stride; + } + }; + + at::parallel_for(0, num_matmuls, 1, matmuls); + }); + + return out; +} + +#else // ifdef USE_RUY_QMATMUL + Tensor qmatmul( const Tensor& qa, const Tensor& qb, @@ -34,6 +176,8 @@ Tensor qmatmul( rc, output_scale, output_zero_point, qa.scalar_type()); } +#endif // ifdef USE_RUY_QMATMUL + 
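A note on the requantization step in the ruy path above: the int32 accumulator is rescaled by requantization_scale_inv = (qa.q_scale() * qb.q_scale()) / output_scale, and ruy consumes that factor as a Q31 fixed-point multiplier plus a base-2 exponent (produced in-tree by ruy_utils::quantize_multiplier, which appears later in this diff). The standalone sketch below restates that decomposition and adds an illustrative, non-saturating application to a single accumulator; the real kernels use ruy's rounding and saturating arithmetic.

#include <cmath>
#include <cstdint>

// Split a positive real multiplier into a Q31 fixed-point value and a
// base-2 exponent, i.e. multiplier ~= fixedpoint * 2^(exponent - 31).
void quantize_multiplier_sketch(double multiplier,
                                std::int32_t* fixedpoint,
                                int* exponent) {
  const double q = std::frexp(multiplier, exponent);  // q in [0.5, 1)
  auto q_fixed = static_cast<std::int64_t>(std::round(q * (1ll << 31)));
  if (q_fixed == (1ll << 31)) {  // rounding overflow: 1.0*2^e == 0.5*2^(e+1)
    q_fixed /= 2;
    ++*exponent;
  }
  *fixedpoint = static_cast<std::int32_t>(q_fixed);
}

// Illustrative (non-saturating) application to an int32 accumulator:
// result ~= acc * multiplier, shifted into the output zero-point range.
// Valid while exponent < 31, which holds for typical requantization
// multipliers well below 2^31.
std::int32_t requantize_sketch(std::int32_t acc, std::int32_t fixedpoint,
                               int exponent, std::int32_t output_zero_point) {
  const std::int64_t prod = static_cast<std::int64_t>(acc) * fixedpoint;
  const int total_shift = 31 - exponent;
  const std::int64_t rounded =
      (prod + (1ll << (total_shift - 1))) >> total_shift;
  return static_cast<std::int32_t>(rounded) + output_zero_point;
}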
TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { m.impl(TORCH_SELECTIVE_NAME("quantized::matmul"), TORCH_FN(qmatmul)); } diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/BUILD.buck b/aten/src/ATen/native/quantized/cpu/qnnpack/BUILD.buck new file mode 100644 index 000000000000..85abc6a60916 --- /dev/null +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/BUILD.buck @@ -0,0 +1,143 @@ +load("//tools/build_defs:glob_defs.bzl", "subdir_glob") + +cxx_library( + name = "pytorch_qnnpack", + srcs = ['src/add.c', 'src/average-pooling.c', 'src/channel-shuffle.c', 'src/clamp.c', 'src/conv-prepack.cc', 'src/conv-run.cc', 'src/convolution.c', 'src/deconv-run.cc', 'src/deconvolution.c', 'src/fc-dynamic-run.cc', 'src/fc-prepack.cc', 'src/fc-run.cc', 'src/fully-connected.c', 'src/fully-connected-sparse.c', 'src/global-average-pooling.c', 'src/hardsigmoid.c', 'src/hardswish.c', 'src/indirection.c', 'src/init.c', 'src/leaky-relu.c', 'src/max-pooling.c', 'src/operator-delete.c', 'src/operator-run.c', 'src/pack_block_sparse.cc', 'src/sigmoid.c', 'src/softargmax.c', 'src/tanh.c'], + deps = [':qnnp_interface', ':ukernels_asm', ':ukernels_neon', ':ukernels_psimd', ':ukernels_scalar', ':ukernels_sse2', ':ukernels_sse41', ':ukernels_ssse3', '//third_party:cpuinfo', '//third_party:FP16', '//third_party:FXdiv'], + exported_deps = ['//third_party:cpuinfo'], + compiler_flags = ['-O2', '-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION', '-Wno-deprecated-declarations'], + preferred_linkage = "static", + exported_headers = subdir_glob([("src", "qnnpack/*.h"),("include", "*.h"),]), + header_namespace = "", + headers = subdir_glob([("src", "**/*.c"), ("src", "q8gemm_sparse/*.h"), ("src", "qnnpack/*.h"), ("src", "requantization/*.h")]), + link_whole = False, + platform_compiler_flags = [['armv7', ['-mfpu=neon']], ['^android-armv7$', ['-marm', '-mfloat-abi=softfp']]], + platform_preprocessor_flags = [['windows', ['-D_WINDOWS', '-D_WIN32', '-DWIN32', '-DNOMINMAX', '-D_CRT_SECURE_NO_WARNINGS', '-D_USE_MATH_DEFINES']], ['windows.*64$', ['-D_WIN64']]], + visibility = ['PUBLIC'], +) + + +cxx_library( + name = "ukernels_ssse3", + srcs = ['wrappers/requantization/gemmlowp-ssse3.c', 'wrappers/requantization/precise-ssse3.c', 'wrappers/requantization/q31-ssse3.c'], + deps = [':qnnp_interface', '//third_party:cpuinfo', '//third_party:FP16', '//third_party:FXdiv'], + exported_deps = [], + compiler_flags = ['-O3', '-ffast-math', '-Wno-error=unused-variable', '-Wno-shadow', '-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION'], + preferred_linkage = "static", + header_namespace = "", + headers = subdir_glob([("src", "**/*.c"), ("src", "q8gemm_sparse/*.h"), ("src", "qnnpack/*.h"), ("src", "requantization/*.h")]), + link_whole = False, + platform_compiler_flags = [['86', ['-mssse3', '-mno-sse4']], ['osmeta', ['-mosmeta-no-restrict-sse']]], + platform_preprocessor_flags = [['windows', ['-D_WINDOWS', '-D_WIN32', '-DWIN32', '-DNOMINMAX', '-D_CRT_SECURE_NO_WARNINGS', '-D_USE_MATH_DEFINES']], ['windows.*64$', ['-D_WIN64']]], + visibility = ['PUBLIC'], +) + + +cxx_library( + name = "ukernels_psimd", + srcs = ['src/requantization/fp32-psimd.c', 'src/requantization/precise-psimd.c', 'src/sgemm/6x8-psimd.c'], + deps = [':qnnp_interface', '//third_party:cpuinfo', '//third_party:FP16', '//third_party:FXdiv', '//third_party:psimd'], + exported_deps = [], + compiler_flags = ['-O3', '-ffast-math', '-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION'], + preferred_linkage = "static", + header_namespace = "", + headers = subdir_glob([("src", "**/*.c"), ("src", "q8gemm_sparse/*.h"), 
("src", "qnnpack/*.h"), ("src", "requantization/*.h")]), + link_whole = False, + platform_compiler_flags = [['armv7', ['-mfpu=neon']], ['^android-armv7$', ['-marm', '-mfloat-abi=softfp']]], + platform_preprocessor_flags = [['windows', ['-D_WINDOWS', '-D_WIN32', '-DWIN32', '-DNOMINMAX', '-D_CRT_SECURE_NO_WARNINGS', '-D_USE_MATH_DEFINES']], ['windows.*64$', ['-D_WIN64']]], + visibility = ['PUBLIC'], +) + + +cxx_library( + name = "ukernels_scalar", + srcs = ['src/requantization/fp32-scalar.c', 'src/requantization/gemmlowp-scalar.c', 'src/requantization/precise-scalar.c', 'src/requantization/q31-scalar.c', 'src/u8lut32norm/scalar.c', 'src/x8lut/scalar.c'], + deps = [':qnnp_interface', '//third_party:cpuinfo', '//third_party:FP16', '//third_party:FXdiv'], + exported_deps = [], + compiler_flags = ['-O2', '-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION'], + preferred_linkage = "static", + header_namespace = "", + headers = subdir_glob([("src", "**/*.c"), ("src", "q8gemm_sparse/*.h"), ("src", "qnnpack/*.h"), ("src", "requantization/*.h")]), + link_whole = False, + platform_preprocessor_flags = [['windows', ['-D_WINDOWS', '-D_WIN32', '-DWIN32', '-DNOMINMAX', '-D_CRT_SECURE_NO_WARNINGS', '-D_USE_MATH_DEFINES']], ['windows.*64$', ['-D_WIN64']]], + visibility = ['PUBLIC'], +) + + +cxx_library( + name = "ukernels_asm", + srcs = ['wrappers/dummy.c', 'wrappers/hgemm/8x8-aarch32-neonfp16arith.S', 'wrappers/q8conv/4x8-aarch32-neon.S', 'wrappers/q8dwconv/up8x9-aarch32-neon.S', 'wrappers/q8dwconv/up8x9-aarch32-neon-per-channel.S', 'wrappers/q8gemm/4x8-aarch32-neon.S', 'wrappers/q8gemm/4x8-dq-aarch32-neon.S', 'wrappers/q8gemm/4x8c2-xzp-aarch32-neon.S', 'wrappers/q8gemm_sparse/4x4-packA-aarch32-neon.S', 'wrappers/q8gemm_sparse/4x8c1x4-dq-packedA-aarch32-neon.S', 'wrappers/q8gemm_sparse/4x8c8x1-dq-packedA-aarch32-neon.S', 'wrappers/q8gemm_sparse/8x4-packA-aarch64-neon.S', 'wrappers/q8gemm_sparse/8x8c1x4-dq-packedA-aarch64-neon.S', 'wrappers/q8gemm_sparse/8x8c8x1-dq-packedA-aarch64-neon.S', 'wrappers/q8conv/8x8-aarch64-neon.S', 'wrappers/q8gemm/8x8-aarch64-neon.S', 'wrappers/q8gemm/8x8-dq-aarch64-neon.S'], + deps = [], + exported_deps = [], + compiler_flags = ['-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION'], + preferred_linkage = "static", + header_namespace = "", + headers = subdir_glob([("src", "**/*.c"), ("src", "q8gemm_sparse/*.h"), ("src", "qnnpack/*.h"), ("src", "requantization/*.h")]), + link_whole = False, + platform_compiler_flags = [['^iphoneos-armv7$', ['-mfpu=neon-vfpv4']], ['osmeta', ['-mfpu=neon-vfpv4']]], + platform_preprocessor_flags = [['android', ['-D__ELF__=1']], ['tizen', ['-D__ELF__=1']], ['windows', ['-D_WINDOWS', '-D_WIN32', '-DWIN32', '-DNOMINMAX', '-D_CRT_SECURE_NO_WARNINGS', '-D_USE_MATH_DEFINES']], ['windows.*64$', ['-D_WIN64']]], + visibility = ['PUBLIC'], +) + + +cxx_library( + name = "ukernels_sse41", + srcs = ['wrappers/requantization/gemmlowp-sse4.c', 'wrappers/requantization/precise-sse4.c', 'wrappers/requantization/q31-sse4.c'], + deps = [':qnnp_interface', '//third_party:cpuinfo', '//third_party:FP16', '//third_party:FXdiv'], + exported_deps = [], + compiler_flags = ['-O3', '-ffast-math', '-Wno-error=unused-variable', '-Wno-shadow', '-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION'], + preferred_linkage = "static", + header_namespace = "", + headers = subdir_glob([("src", "**/*.c"), ("src", "q8gemm_sparse/*.h"), ("src", "qnnpack/*.h"), ("src", "requantization/*.h")]), + link_whole = False, + platform_compiler_flags = [['86', ['-msse4.1', '-mno-sse4.2']], ['osmeta', ['-mosmeta-no-restrict-sse']]], + 
platform_preprocessor_flags = [['windows', ['-D_WINDOWS', '-D_WIN32', '-DWIN32', '-DNOMINMAX', '-D_CRT_SECURE_NO_WARNINGS', '-D_USE_MATH_DEFINES']], ['windows.*64$', ['-D_WIN64']]], + visibility = ['PUBLIC'], +) + + +cxx_library( + name = "ukernels_neon", + srcs = ['wrappers/q8avgpool/mp8x9p8q-neon.c', 'wrappers/q8avgpool/up8x9-neon.c', 'wrappers/q8avgpool/up8xm-neon.c', 'wrappers/q8conv/4x8-neon.c', 'wrappers/q8conv/8x8-neon.c', 'wrappers/q8dwconv/mp8x25-neon.c', 'wrappers/q8dwconv/mp8x25-neon-per-channel.c', 'wrappers/q8dwconv/mp8x27-neon.c', 'wrappers/q8dwconv/up8x9-neon.c', 'wrappers/q8dwconv/up8x9-neon-per-channel.c', 'wrappers/q8gavgpool/mp8x7p7q-neon.c', 'wrappers/q8gavgpool/up8x7-neon.c', 'wrappers/q8gavgpool/up8xm-neon.c', 'wrappers/q8gemm/4x-sumrows-neon.c', 'wrappers/q8gemm/4x8-dq-neon.c', 'wrappers/q8gemm/4x8-neon.c', 'wrappers/q8gemm/4x8c2-xzp-neon.c', 'wrappers/q8gemm/6x4-neon.c', 'wrappers/q8gemm/8x8-neon.c', 'wrappers/q8vadd/neon.c', 'wrappers/requantization/fp32-neon.c', 'wrappers/requantization/gemmlowp-neon.c', 'wrappers/requantization/precise-neon.c', 'wrappers/requantization/q31-neon.c', 'wrappers/sgemm/5x8-neon.c', 'wrappers/sgemm/6x8-neon.c', 'wrappers/u8clamp/neon.c', 'wrappers/u8maxpool/16x9p8q-neon.c', 'wrappers/u8maxpool/sub16-neon.c', 'wrappers/u8rmax/neon.c', 'wrappers/x8zip/x2-neon.c', 'wrappers/x8zip/x3-neon.c', 'wrappers/x8zip/x4-neon.c', 'wrappers/x8zip/xm-neon.c'], + deps = [':qnnp_interface', '//third_party:cpuinfo', '//third_party:FP16', '//third_party:FXdiv'], + exported_deps = [], + compiler_flags = ['-O3', '-ffast-math', '-Wno-error=unused-variable', '-Wno-shadow', '-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION'], + preferred_linkage = "static", + header_namespace = "", + headers = subdir_glob([("src", "**/*.c"), ("src", "q8gemm_sparse/*.h"), ("src", "qnnpack/*.h"), ("src", "requantization/*.h")]), + link_whole = False, + platform_compiler_flags = [['armv7', ['-mfpu=neon']], ['^android-armv7$', ['-marm', '-mfloat-abi=softfp']]], + platform_preprocessor_flags = [['windows', ['-D_WINDOWS', '-D_WIN32', '-DWIN32', '-DNOMINMAX', '-D_CRT_SECURE_NO_WARNINGS', '-D_USE_MATH_DEFINES']], ['windows.*64$', ['-D_WIN64']]], + visibility = ['PUBLIC'], +) + + +cxx_library( + name = "ukernels_sse2", + srcs = ['wrappers/q8avgpool/mp8x9p8q-sse2.c', 'wrappers/q8avgpool/up8x9-sse2.c', 'wrappers/q8avgpool/up8xm-sse2.c', 'wrappers/q8conv/4x4c2-sse2.c', 'wrappers/q8dwconv/mp8x25-sse2.c', 'wrappers/q8dwconv/mp8x25-sse2-per-channel.c', 'wrappers/q8dwconv/mp8x27-sse2.c', 'wrappers/q8dwconv/up8x9-sse2.c', 'wrappers/q8dwconv/up8x9-sse2-per-channel.c', 'wrappers/q8gavgpool/mp8x7p7q-sse2.c', 'wrappers/q8gavgpool/up8x7-sse2.c', 'wrappers/q8gavgpool/up8xm-sse2.c', 'wrappers/q8gemm/2x4c8-sse2.c', 'wrappers/q8gemm/4x4c2-dq-sse2.c', 'wrappers/q8gemm/4x4c2-sse2.c', 'wrappers/q8gemm_sparse/8x4c1x4-packed-sse2.c', 'wrappers/q8vadd/sse2.c', 'wrappers/requantization/fp32-sse2.c', 'wrappers/requantization/gemmlowp-sse2.c', 'wrappers/requantization/precise-sse2.c', 'wrappers/requantization/q31-sse2.c', 'wrappers/u8clamp/sse2.c', 'wrappers/u8maxpool/16x9p8q-sse2.c', 'wrappers/u8maxpool/sub16-sse2.c', 'wrappers/u8rmax/sse2.c', 'wrappers/x8zip/x2-sse2.c', 'wrappers/x8zip/x3-sse2.c', 'wrappers/x8zip/x4-sse2.c', 'wrappers/x8zip/xm-sse2.c'], + deps = [':qnnp_interface', '//third_party:cpuinfo', '//third_party:FP16', '//third_party:FXdiv'], + exported_deps = [], + compiler_flags = ['-O3', '-ffast-math', '-Wno-error=unused-variable', '-Wno-shadow', '-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION'], + 
preferred_linkage = "static", + header_namespace = "", + headers = subdir_glob([("src", "**/*.c"), ("src", "q8gemm_sparse/*.h"), ("src", "qnnpack/*.h"), ("src", "requantization/*.h")]), + link_whole = False, + platform_compiler_flags = [['86', ['-msse2', '-mno-sse3']]], + platform_preprocessor_flags = [['windows', ['-D_WINDOWS', '-D_WIN32', '-DWIN32', '-DNOMINMAX', '-D_CRT_SECURE_NO_WARNINGS', '-D_USE_MATH_DEFINES']], ['windows.*64$', ['-D_WIN64']]], + visibility = ['PUBLIC'], +) + + +cxx_library( + name = "qnnp_interface", + srcs = [], + deps = ['//third_party:pthreadpool_header'], + exported_deps = [], + compiler_flags = ['-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION'], + preferred_linkage = "static", + header_namespace = "", + headers = subdir_glob([("src", "**/*.c"), ("src", "q8gemm_sparse/*.h"), ("src", "qnnpack/*.h"), ("src", "requantization/*.h")]), + link_whole = False, + platform_preprocessor_flags = [['windows', ['-D_WINDOWS', '-D_WIN32', '-DWIN32', '-DNOMINMAX', '-D_CRT_SECURE_NO_WARNINGS', '-D_USE_MATH_DEFINES']], ['windows.*64$', ['-D_WIN64']]], + visibility = ['PUBLIC'], +) diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/cmake/DownloadGoogleTest.cmake b/aten/src/ATen/native/quantized/cpu/qnnpack/cmake/DownloadGoogleTest.cmake index 30cc61dc17fb..4a86d641e412 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/cmake/DownloadGoogleTest.cmake +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/cmake/DownloadGoogleTest.cmake @@ -10,7 +10,7 @@ project(googletest-download NONE) include(ExternalProject) ExternalProject_Add(googletest - URL https://github.com/google/googletest/archive/release-1.8.0.zip + URL https://github.com/google/googletest/archive/release-1.10.0.zip URL_HASH SHA256=f3ed3b58511efd272eb074a3a6d6fb79d7c2e6a0e374323d1e6bcbcc1ef141bf SOURCE_DIR "${CONFU_DEPENDENCIES_SOURCE_DIR}/googletest" BINARY_DIR "${CONFU_DEPENDENCIES_BINARY_DIR}/googletest" diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/deps/clog/cmake/DownloadGoogleTest.cmake b/aten/src/ATen/native/quantized/cpu/qnnpack/deps/clog/cmake/DownloadGoogleTest.cmake index 30cc61dc17fb..4a86d641e412 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/deps/clog/cmake/DownloadGoogleTest.cmake +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/deps/clog/cmake/DownloadGoogleTest.cmake @@ -10,7 +10,7 @@ project(googletest-download NONE) include(ExternalProject) ExternalProject_Add(googletest - URL https://github.com/google/googletest/archive/release-1.8.0.zip + URL https://github.com/google/googletest/archive/release-1.10.0.zip URL_HASH SHA256=f3ed3b58511efd272eb074a3a6d6fb79d7c2e6a0e374323d1e6bcbcc1ef141bf SOURCE_DIR "${CONFU_DEPENDENCIES_SOURCE_DIR}/googletest" BINARY_DIR "${CONFU_DEPENDENCIES_BINARY_DIR}/googletest" diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack_utils.h b/aten/src/ATen/native/quantized/cpu/qnnpack_utils.h index 1f6d6f1d9105..60ea7822a760 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack_utils.h +++ b/aten/src/ATen/native/quantized/cpu/qnnpack_utils.h @@ -6,8 +6,8 @@ #include #include -#include -#include +#include +#include #include #include @@ -40,6 +40,7 @@ struct PackedLinearWeightsQnnp : public LinearPackedParamsBase { orig_weight(std::move(orig_weight)), bias_(at::native::mobile::allocate_padded_contiguous_if_needed( bias, bias.suggest_memory_format())), + per_channel_(this->orig_weight.qscheme() == at::kPerChannelAffine), input_scale(std::move(input_scale)), w_scales(w_scales), w_zero_points(std::move(w_zps)) {} @@ -47,6 +48,7 @@ struct PackedLinearWeightsQnnp : public 
LinearPackedParamsBase { std::unique_ptr w; at::Tensor orig_weight; at::Tensor bias_; + bool per_channel_; c10::optional input_scale; at::Tensor w_scales; std::vector w_zero_points; @@ -74,8 +76,23 @@ struct PackedLinearWeightsQnnp : public LinearPackedParamsBase { at::Tensor weight, c10::optional bias); + bool per_channel() const { + return per_channel_; + } + private: std::mutex qnnp_mutex_; + +#ifdef USE_XNNPACK + xnnpack_operator xnnp_linear_op; + + template + at::Tensor apply_impl_xnnp( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point); +#endif // USE_XNNPACK + template at::Tensor apply_impl( at::Tensor input, @@ -112,6 +129,7 @@ struct PackedConvWeightsQnnp : public ConvPackedParamsBase { dilation_(std::move(dilation)), groups_(groups), transpose_(transpose), + is_per_channel_(is_per_channel), input_scale(input_scale), kernel_(std::move(kernel)), w_scales(w_scale), @@ -200,7 +218,7 @@ struct PackedConvWeightsQnnp : public ConvPackedParamsBase { convolution->input_padding_height = padding_[kSpatialDim - 2]; convolution->input_padding_width = padding_[kSpatialDim - 1]; convolution->input_padding_depth = kSpatialDim == 3 ? padding_[0] : 0; - convolution->per_channel = is_per_channel; + convolution->per_channel = is_per_channel_; convolution->transpose = transpose_; const uint32_t kr = pytorch_qnnp_params.q8conv.kr; @@ -260,6 +278,9 @@ struct PackedConvWeightsQnnp : public ConvPackedParamsBase { } std::unique_ptr convolution_op; + #ifdef USE_XNNPACK + xnnpack_operator xnnp_convolution_op; + #endif // USE_XNNPACK std::unique_ptr w; at::Tensor orig_weight; at::Tensor bias; @@ -269,6 +290,7 @@ struct PackedConvWeightsQnnp : public ConvPackedParamsBase { torch::List dilation_; int64_t groups_; bool transpose_; + bool is_per_channel_; c10::optional input_scale; std::vector kernel_; at::Tensor w_scales; @@ -326,6 +348,10 @@ struct PackedConvWeightsQnnp : public ConvPackedParamsBase { return transpose_; } + bool per_channel() const { + return is_per_channel_; + } + private: std::mutex qnnp_mutex_; template @@ -333,6 +359,14 @@ struct PackedConvWeightsQnnp : public ConvPackedParamsBase { const at::Tensor& input, double output_scale, int64_t output_zero_point); + +#ifdef USE_XNNPACK + template + at::Tensor apply_impl_xnnp( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point); +#endif // USE_XNNPACK }; enum class Activation : uint8_t { NONE = 0, RELU = 1 }; diff --git a/aten/src/ATen/native/quantized/cpu/qsoftmax.cpp b/aten/src/ATen/native/quantized/cpu/qsoftmax.cpp new file mode 100644 index 000000000000..cfe326aed421 --- /dev/null +++ b/aten/src/ATen/native/quantized/cpu/qsoftmax.cpp @@ -0,0 +1,148 @@ +#include +#include + +#ifdef USE_PYTORCH_QNNPACK +#include +#include +#include +#include +#endif // USE_PYTORCH_QNNPACK + +namespace at { +namespace native { + +namespace { + +#ifdef USE_PYTORCH_QNNPACK + +const static float qnnpack_softmax_output_scale = 0x1.0p-8f; +const static int qnnpack_softmax_output_zero_point = 0; + +bool is_qnnpack_compatible( + const Tensor& qx, + const double output_scale, + const int64_t output_zero_point) { + return ( + (qx.qscheme() == kPerTensorAffine || + qx.qscheme() == kPerTensorSymmetric) && + qx.scalar_type() == c10::kQUInt8 && qx.ndimension() > 0 && + output_scale == qnnpack_softmax_output_scale && + output_zero_point == qnnpack_softmax_output_zero_point); +} + +Tensor qsoftmax_qnnpack(const Tensor& qx, const int64_t dim) { + /* + Cases for contiguity/dimensionality + 1) stride along target dim is 1 + 
requires no change to qx + 2) dim is the last dimension (but qx is not contiguous) + requires using qx.contiguous() + 3) other + requires permuting qx.contiguous() + */ + + const int64_t last_dim = qx.dim() - 1; + c10::optional> permuted_dims = c10::nullopt; + c10::optional qx_contig = c10::nullopt; + const at::Tensor* qx_contig_ptr = nullptr; + + if (qx.stride(dim) == 1) { + qx_contig_ptr = &qx; + } else if (dim == last_dim) { + qx_contig = qx.contiguous(); + qx_contig_ptr = &qx_contig.value(); + } else { + permuted_dims = std::vector(qx.dim()); + std::iota(permuted_dims->begin(), permuted_dims->end(), 0); + permuted_dims->at(last_dim) = dim; + permuted_dims->at(dim) = last_dim; + qx_contig = qx.permute(permuted_dims.value()).contiguous(); + qx_contig_ptr = &qx_contig.value(); + } + + at::Tensor qy = at::_empty_affine_quantized( + qx_contig_ptr->sizes(), + at::device(kCPU) + .dtype(qx.scalar_type()) + .memory_format(qx_contig_ptr->suggest_memory_format()), + qnnpack_softmax_output_scale, + qnnpack_softmax_output_zero_point, + c10::nullopt); + + const size_t channels = qx.size(dim); + const float input_scale = static_cast(qx.q_scale()); + const uint32_t flags = 0; + const size_t batch_size = qx.numel() / channels; + const uint8_t* input = + reinterpret_cast(qx_contig_ptr->data_ptr()); + const size_t input_stride = channels; + uint8_t* output = reinterpret_cast(qy.data_ptr()); + const size_t output_stride = channels; + + initQNNPACK(); + pytorch_qnnp_operator_t softargmax = nullptr; + std::unique_ptr softmax_op( + softargmax); + + pytorch_qnnp_status status = pytorch_qnnp_create_softargmax_nc_q8( + channels, + input_scale, + qnnpack_softmax_output_zero_point, + qnnpack_softmax_output_scale, + flags, + &softargmax); + TORCH_CHECK( + status == pytorch_qnnp_status_success, + "failed to create QNNPACK Softmax operator"); + CHECK_NOTNULL(softargmax); + + status = pytorch_qnnp_setup_softargmax_nc_q8( + softargmax, batch_size, input, input_stride, output, output_stride); + TORCH_CHECK( + status == pytorch_qnnp_status_success, + "failed to setup QNNPACK Softmax operator"); + + pthreadpool_t threadpool = caffe2::pthreadpool_(); + status = pytorch_qnnp_run_operator(softargmax, threadpool); + TORCH_CHECK( + status == pytorch_qnnp_status_success, + "failed to run QNNPACK Softmax operator"); + + return permuted_dims.has_value() ? 
qy.permute(permuted_dims.value()) : qy; +} + +#endif // USE_PYTORCH_QNNPACK + +Tensor qsoftmax_naive( + const Tensor& qx, + const int64_t dim, + const double output_scale, + const int64_t output_zero_point) { + Tensor rx = at::dequantize(qx); + Tensor ry = at::softmax(rx, dim); + return at::quantize_per_tensor( + ry, output_scale, output_zero_point, qx.scalar_type()); +} + +Tensor qsoftmax( + const Tensor& qx, + const int64_t dim, + const double output_scale, + const int64_t output_zero_point) { +#ifdef USE_PYTORCH_QNNPACK + if (at::globalContext().qEngine() == at::QEngine::QNNPACK && + is_qnnpack_compatible(qx, output_scale, output_zero_point)) { + return qsoftmax_qnnpack(qx, dim); + } +#endif // USE_PYTORCH_QNNPACK + return qsoftmax_naive(qx, dim, output_scale, output_zero_point); +} + +TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { + m.impl(TORCH_SELECTIVE_NAME("quantized::softmax"), TORCH_FN(qsoftmax)); +} + +} // namespace + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/quantized/cpu/quantized_ops.h b/aten/src/ATen/native/quantized/cpu/quantized_ops.h index a1766380fe53..bfa1f1f77562 100644 --- a/aten/src/ATen/native/quantized/cpu/quantized_ops.h +++ b/aten/src/ATen/native/quantized/cpu/quantized_ops.h @@ -1,4 +1,5 @@ #include +#include #include #include @@ -8,7 +9,7 @@ namespace native { using qrelu_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/); using qrelu_leaky_fn = void (*)(Tensor& /*out*/, const Tensor& /*qx*/, const Scalar& /*negval_*/); -using qgelu_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/); +using qgelu_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/, GeluType /* approximate */); using qsigmoid_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/, double output_scale, int64_t output_zero_point); using qhardsigmoid_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/); using qclamp_fn = void (*)( diff --git a/aten/src/ATen/native/quantized/cpu/qupsample_bilinear2d.cpp b/aten/src/ATen/native/quantized/cpu/qupsample_bilinear2d.cpp index ab30cd7d3810..2fcd308cfd82 100644 --- a/aten/src/ATen/native/quantized/cpu/qupsample_bilinear2d.cpp +++ b/aten/src/ATen/native/quantized/cpu/qupsample_bilinear2d.cpp @@ -1,7 +1,9 @@ #include +#include #include #include #include +#include #include #include @@ -13,6 +15,18 @@ namespace at { namespace native { namespace { +// pre calcuate interpolation params on width +struct UpsampleBilinearParamW { + int64_t w1, w1p; + float w0lambda, w1lambda; + + UpsampleBilinearParamW(int64_t w1, int64_t w1p, float w0lambda, float w1lambda) + : w1(w1) + , w1p(w1p) + , w0lambda(w0lambda) + , w1lambda(w1lambda) {} +}; + // at::native functions for the native_functions.yaml template static void upsample_bilinear2d_out_frame( @@ -50,51 +64,73 @@ static void upsample_bilinear2d_out_frame( const auto rheight = area_pixel_compute_scale( input_height, output_height, align_corners, scales_h); - const auto rwidth = - area_pixel_compute_scale(input_width, output_width, align_corners, scales_w); + const auto rwidth = area_pixel_compute_scale( + input_width, output_width, align_corners, scales_w); + // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) float output_scale = output.q_scale() / input.q_scale(); const int64_t input_q_zero_point = input.q_zero_point(); const int64_t output_q_zero_point = output.q_zero_point(); - for (const auto h2 : c10::irange(output_height)) { - const auto h1r = area_pixel_compute_source_index( - rheight, h2, align_corners, 
/*cubic=*/false); + std::vector params_w; + params_w.reserve(output_width); + for (const auto w2 : c10::irange(output_width)) { + const auto w1r = area_pixel_compute_source_index( + rwidth, w2, align_corners, /*cubic=*/false); + + const int64_t w1 = w1r; + const int64_t w1p = (w1 < input_width - 1) ? 1 : 0; + + const float w1lambda = w1r - w1; + const float w0lambda = static_cast(1.) - w1lambda; + + params_w.emplace_back(w1, w1p, w0lambda, w1lambda); + } + + // compared to 'nearest', each requires 4 points and takes additional * and + + // set the scale to be 16. + int64_t grain_size = internal::GRAIN_SIZE / std::max(int64_t{1}, output_width) / 16; + at::parallel_for(0, channels * output_height, grain_size, [&](int64_t begin, int64_t end) { + int64_t nc{0}, h2{0}; + data_index_init(begin, nc, channels, h2, output_height); + + for (const auto i : c10::irange(begin, end)) { + const auto h1r = area_pixel_compute_source_index( + rheight, h2, align_corners, /*cubic=*/false); - const int64_t h1 = h1r; - const int64_t h1p = (h1 < input_height - 1) ? 1 : 0; + const int64_t h1 = h1r; + const int64_t h1p = (h1 < input_height - 1) ? 1 : 0; - const float h1lambda = h1r - h1; - const float h0lambda = static_cast(1.) - h1lambda; + const float h1lambda = h1r - h1; + const float h0lambda = static_cast(1.) - h1lambda; - for (const auto w2 : c10::irange(output_width)) { - const auto w1r = area_pixel_compute_source_index( - rwidth, w2, align_corners, /*cubic=*/false); + const auto* i_ptr = &i_p[nc * input_height * input_width]; + auto* pos2 = &o_p[i * output_width]; - const int64_t w1 = w1r; - const int64_t w1p = (w1 < input_width - 1) ? 1 : 0; + for (const auto w2 : c10::irange(output_width)) { + const auto& param_w = params_w[w2]; + const int64_t w1 = param_w.w1; + const int64_t w1p = param_w.w1p; + const float w0lambda = param_w.w0lambda; + const float w1lambda = param_w.w1lambda; - const float w1lambda = w1r - w1; - const float w0lambda = static_cast(1.) 
- w1lambda; - const typename scalar_t::underlying* pos1 = i_p + h1 * input_width + w1; - typename scalar_t::underlying* pos2 = o_p + h2 * output_width + w2; + const auto* pos1 = i_ptr + h1 * input_width + w1; - for (const auto c : c10::irange(channels)) { - (void)c; //Suppress unused variable warning float result = h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[w1p]) + h1lambda * (w0lambda * pos1[h1p * input_width] + w1lambda * pos1[h1p * input_width + w1p]) - input_q_zero_point; // requantization - pos2[0] = at::native::quantize_val( + pos2[w2] = at::native::quantize_val( output_scale, output_q_zero_point, result) .val_; - pos1 += input_width * input_height; - pos2 += output_width * output_height; } + + data_index_step(nc, channels, h2, output_height); } - } + }); + } } // namespace @@ -178,7 +214,7 @@ using at::native::upsample::get_scale_value; Tensor upsample_bilinear2d_quantized_cpu( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, bool align_corners, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); diff --git a/aten/src/ATen/native/quantized/cpu/qupsample_nearest2d.cpp b/aten/src/ATen/native/quantized/cpu/qupsample_nearest2d.cpp index 377ef15790b1..c4f8e452c95c 100644 --- a/aten/src/ATen/native/quantized/cpu/qupsample_nearest2d.cpp +++ b/aten/src/ATen/native/quantized/cpu/qupsample_nearest2d.cpp @@ -1,8 +1,10 @@ #include +#include #include #include #include #include +#include #include @@ -44,25 +46,32 @@ static void upsample_nearest2d_out_frame( return; } - for (const auto h2 : c10::irange(output_height)) { - const int64_t h1 = - nn_compute_source_index_fn(height_scale, h2, input_height); + std::unique_ptr input_offset_arr(new int64_t[output_width]); + int64_t* input_offset = input_offset_arr.get(); - for (const auto w2 : c10::irange(output_width)) { - const int64_t w1 = - nn_compute_source_index_fn(width_scale, w2, input_width); + for (const auto w2 : c10::irange(output_width)) { + const int64_t w1 = nn_compute_source_index_fn(width_scale, w2, input_width); + input_offset[w2] = w1; + } + + int64_t grain_size = internal::GRAIN_SIZE / std::max(int64_t{1}, output_width); + at::parallel_for(0, channels * output_height, grain_size, [&](int64_t begin, int64_t end) { + int64_t nc{0}, h2{0}; + data_index_init(begin, nc, channels, h2, output_height); - const auto* pos1 = &i_p[h1 * input_width + w1]; - auto* pos2 = &o_p[h2 * output_width + w2]; + for (const auto i : c10::irange(begin, end)) { + const int64_t h1 = nn_compute_source_index_fn(height_scale, h2, input_height); + const auto* pos1 = &i_p[nc * input_height * input_width + h1 * input_width]; + auto* pos2 = &o_p[i * output_width]; - for (const auto c : c10::irange(channels)) { - (void)c; //Suppress unused variable warning - pos2[0] = pos1[0]; - pos1 += input_height * input_width; - pos2 += output_height * output_width; + for (const auto w2 : c10::irange(output_width)) { + const int64_t w1 = input_offset[w2]; + pos2[w2] = pos1[w1]; } + + data_index_step(nc, channels, h2, output_height); } - } + }); } template @@ -80,29 +89,24 @@ static void upsample_nearest2d_out_frame_nhwc( float height_scale = compute_scales_value(scales_h, input_height, output_height); float width_scale = compute_scales_value(scales_w, input_width, output_width); - for (const auto b : c10::irange(nbatch)) { - auto* i_p = reinterpret_cast(idata + b * input_height * input_width * channels); - auto* o_p = reinterpret_cast(odata + b * output_height * output_width * 
channels); - // special case: just copy - if (input_height == output_height && input_width == output_width) { - std::memcpy(o_p, i_p, channels * input_height * input_width * sizeof(typename scalar_t::underlying)); - return; - } + at::parallel_for(0, nbatch * output_height * output_width, 0, [&](int64_t begin, int64_t end) { + int64_t b{0}, h2{0}, w2{0}; + data_index_init(begin, b, nbatch, h2, output_height, w2, output_width); - for (const auto h2 : c10::irange(output_height)) { - const int64_t h1 = - nn_compute_source_index_fn(height_scale, h2, input_height); + for (const auto i : c10::irange(begin, end)) { + auto* i_p = reinterpret_cast(idata + b * input_height * input_width * channels); + auto* o_p = reinterpret_cast(odata + i * channels); - for (const auto w2 : c10::irange(output_width)) { - const int64_t w1 = - nn_compute_source_index_fn(width_scale, w2, input_width); + const int64_t h1 = nn_compute_source_index_fn(height_scale, h2, input_height); + const int64_t w1 = nn_compute_source_index_fn(width_scale, w2, input_width); - const auto* pos1 = &i_p[(h1 * input_width + w1)*channels]; - auto* pos2 = &o_p[(h2 * output_width + w2)*channels]; - std::memcpy(pos2, pos1, channels * sizeof(typename scalar_t::underlying)); - } + const auto* pos1 = &i_p[(h1 * input_width + w1)*channels]; + auto* pos2 = &o_p[0]; + std::memcpy(pos2, pos1, channels * sizeof(typename scalar_t::underlying)); + + data_index_step(b, nbatch, h2, output_height, w2, output_width); } - } + }); } template @@ -137,6 +141,12 @@ Tensor _upsample_nearest2d_quantized_cpu( input.q_zero_point(), c10::nullopt); + // special case: just copy + if (input_height == output_height && input_width == output_width) { + output.copy_(input); + return output; + } + AT_DISPATCH_QINT_TYPES(input.scalar_type(), "upsample_nearest2d", [&] { auto* idata = static_cast(input.data_ptr()); auto* odata = static_cast(output.data_ptr()); @@ -202,7 +212,7 @@ Tensor _upsample_nearest_exact2d_quantized_cpu( Tensor upsample_nearest2d_quantized_cpu( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_h = get_scale_value(scale_factors, 0); @@ -212,7 +222,7 @@ Tensor upsample_nearest2d_quantized_cpu( Tensor _upsample_nearest_exact2d_quantized_cpu( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_h = get_scale_value(scale_factors, 0); diff --git a/aten/src/ATen/native/quantized/cpu/qupsample_nearest3d.cpp b/aten/src/ATen/native/quantized/cpu/qupsample_nearest3d.cpp index db4077ef4328..d2e835421336 100644 --- a/aten/src/ATen/native/quantized/cpu/qupsample_nearest3d.cpp +++ b/aten/src/ATen/native/quantized/cpu/qupsample_nearest3d.cpp @@ -232,7 +232,7 @@ Tensor _upsample_nearest_exact3d_quantized_cpu( Tensor upsample_nearest3d_quantized_cpu( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_d = get_scale_value(scale_factors, 0); @@ -243,7 +243,7 @@ Tensor upsample_nearest3d_quantized_cpu( Tensor _upsample_nearest_exact3d_quantized_cpu( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, c10::optional> scale_factors) { auto osize = 
compute_output_size(input.sizes(), output_size, scale_factors); auto scale_d = get_scale_value(scale_factors, 0); diff --git a/aten/src/ATen/native/quantized/cpu/ruy_utils.cpp b/aten/src/ATen/native/quantized/cpu/ruy_utils.cpp new file mode 100644 index 000000000000..d0164f736352 --- /dev/null +++ b/aten/src/ATen/native/quantized/cpu/ruy_utils.cpp @@ -0,0 +1,37 @@ +#ifdef USE_RUY_QMATMUL + +#include +#include + +namespace at { +namespace native { +namespace ruy_utils { + +static thread_local ruy::Context context; + +ruy::Context* get_ruy_context() { + return &context; +} + +// Adopted from Ruy: +// https://github.com/google/ruy/blob/2d950b3bfa7ebfbe7a97ecb44b1cc4da5ac1d6f0/ruy/test.h#L1602 +void quantize_multiplier(double scale, + int* multiplier_fixedpoint, + int* multiplier_exponent) { + TORCH_CHECK(scale > 0, "Quantization scale (", scale, ") must be positive."); + const double q = std::frexp(scale, multiplier_exponent); + auto q_fixed = static_cast(std::round(q * (1ll << 31))); + TORCH_CHECK(q_fixed <= (1ll << 31)); + if (q_fixed == (1ll << 31)) { + q_fixed /= 2; + ++*multiplier_exponent; + } + TORCH_CHECK(q_fixed <= std::numeric_limits::max()); + *multiplier_fixedpoint = static_cast(q_fixed); +} + +} // namespace ruy_utils +} // namespace native +} // namespace at + +#endif // USE_RUY_QMATMUL diff --git a/aten/src/ATen/native/quantized/cpu/ruy_utils.h b/aten/src/ATen/native/quantized/cpu/ruy_utils.h new file mode 100644 index 000000000000..aeb332af4eca --- /dev/null +++ b/aten/src/ATen/native/quantized/cpu/ruy_utils.h @@ -0,0 +1,21 @@ +#pragma once + +#ifdef USE_RUY_QMATMUL + +#include + +namespace at { +namespace native { +namespace ruy_utils { + +ruy::Context* get_ruy_context(); + +void quantize_multiplier(double scale, + int* multiplier_fixedpoint, + int* multiplier_exponent); + +} // namespace ruy_utils +} // namespace native +} // namespace at + +#endif // USE_RUY_QMATMUL diff --git a/aten/src/ATen/native/quantized/cpu/xnnpack_utils.cpp b/aten/src/ATen/native/quantized/cpu/xnnpack_utils.cpp new file mode 100644 index 000000000000..8f81c8ea8d5e --- /dev/null +++ b/aten/src/ATen/native/quantized/cpu/xnnpack_utils.cpp @@ -0,0 +1,89 @@ +#ifdef USE_XNNPACK + +#include +#include +#include +#include + +namespace at { +namespace native { +namespace xnnp_utils { + +std::vector get_mem_format_aware_shape(const at::Tensor& in) { + const auto mem_format = in.suggest_memory_format(); + const auto& sizes = in.sizes(); + std::vector ret(sizes.begin(), sizes.end()); + if (mem_format == c10::MemoryFormat::ChannelsLast) { + // NCHW -> NHWC + // 0123 -> 0231 + ret[1] = sizes[2]; /* H */ + ret[2] = sizes[3]; /* W */ + ret[3] = sizes[1]; /* C */ + } else if (mem_format == c10::MemoryFormat::ChannelsLast3d) { + // NCDHW -> NDHWC + // 01234 -> 02341 + ret[1] = sizes[2]; /* D */ + ret[2] = sizes[3]; /* H */ + ret[3] = sizes[4]; /* W */ + ret[4] = sizes[1]; /* C */ + } + return ret; +} + +template +void q8_copy_int8_weight_and_add_offset(const at::Tensor& in, at::Tensor& out) { + using T = typename PT::underlying; + static constexpr auto offset = std::is_same::value ? 
128 : 0; + TORCH_CHECK( + in.scalar_type() == c10::kQInt8, + "q8_copy_int8_weight_and_add_offset: Expected input weight data type ", + toString(c10::kQInt8), + " but got ", + toString(in.scalar_type())) + const int8_t* in_ptr = + reinterpret_cast(in.data_ptr()); + T* out_ptr = reinterpret_cast(out.data_ptr()); + + for (const auto i : c10::irange(in.numel())) { + out_ptr[i] = static_cast(static_cast(in_ptr[i]) + offset); + } +} + +template void q8_copy_int8_weight_and_add_offset( + const at::Tensor& in, + at::Tensor& out); +template void q8_copy_int8_weight_and_add_offset( + const at::Tensor& in, + at::Tensor& out); + +/* + * Stolen from fbgemm_utils::ConvertConvWeightsToChannelLastTensor to avoid + * dependence on USE_FBGEMM. Reorder weights to the format xnnpack expects. + * TODO: add a 3d variant. + */ +template <> +Tensor convert_conv_weights_to_channel_last_tensor<2>( + const at::Tensor& src, + int groups, + bool transpose) { + return transpose ? + // 2D conv transpose weight transform + // IC OC/G KH KW -> G OC/G KH KW IC/G + [&]() { + auto ic_g_oc_g_hw_tensors = src.chunk(groups); + for (auto& tensor : ic_g_oc_g_hw_tensors) { + tensor = tensor.unsqueeze(0); + } + auto fused_tensor = at::cat(ic_g_oc_g_hw_tensors); + set_quantizer_(fused_tensor, src.quantizer()); + return fused_tensor.permute({0, 2, 3, 4, 1}) + .contiguous(c10::MemoryFormat::Contiguous); + }() + // 2d conv weight transform + : src.contiguous(c10::MemoryFormat::ChannelsLast); +} +} // namespace xnnp_utils +} // namespace native +} // namespace at + +#endif // USE_XNNPACK diff --git a/aten/src/ATen/native/quantized/cpu/xnnpack_utils.h b/aten/src/ATen/native/quantized/cpu/xnnpack_utils.h new file mode 100644 index 000000000000..78f325263f4f --- /dev/null +++ b/aten/src/ATen/native/quantized/cpu/xnnpack_utils.h @@ -0,0 +1,279 @@ +#pragma once + +#ifdef USE_XNNPACK +#include + +#include +#include + +using xnnpack_operator = at::native::xnnpack::Operator; + +namespace at { +namespace native { +namespace xnnp_utils { + +/* + * Return shape in the same order as the memory format + * e.g. channels_last will return NHWC instead of NCHW + */ +std::vector get_mem_format_aware_shape(const at::Tensor& in); + +/* + * Input is always int8_t, output can be [int8_t, uint8_t]. + * input + offset = output + * int8_t + 128 = uint8_t + * int8_t + 0 = int8_t + */ +template +void q8_copy_int8_weight_and_add_offset(const at::Tensor& in, at::Tensor& out); + +template +Tensor convert_conv_weights_to_channel_last_tensor( + const at::Tensor& src, + int groups, + bool transpose); + +/* + * Series of create wrapper functions to call xnn_create_[de]conv* functions. + */ +C10_ALWAYS_INLINE +enum xnn_status xnnp_create_convolution2d_nhwc( + uint32_t pad_top, + uint32_t pad_right, + uint32_t pad_bottom, + uint32_t pad_left, + uint32_t kernel_h, + uint32_t kernel_w, + uint32_t stride_h, + uint32_t stride_w, + uint32_t dilation_h, + uint32_t dilation_w, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + size_t ip_chan_stride, + size_t op_chan_stride, + int8_t izp, + float ip_scale, + int8_t kzp, + const float* k_scales, + const int8_t* kernel, + const int32_t* bias, + int8_t ozp, + float op_scale, + int8_t op_min, + int8_t op_max, + uint32_t flags, + xnn_operator_t* op, + bool per_channel, + bool transpose) { + /* Symmetric quantization forces kzp = 0 */ + TORCH_CHECK(!kzp, "XNNPACK Q[SC]8 conv kernels expects kernel zero point to be zero." 
+ "But got: ", kzp); + + if (transpose) { + TORCH_CHECK(!per_channel, "XNNPACK Q[SC]8 does not have a per channel deconvolution!"); + return xnn_create_deconvolution2d_nhwc_qs8( + pad_top, /* uint32_t output_padding_top */ + pad_right, /* uint32_t output_padding_right */ + pad_bottom, /* uint32_t output_padding_bottom */ + pad_left, /* uint32_t output_padding_left */ + kernel_h, /* uint32_t kernel_height */ + kernel_w, /* uint32_t kernel_width */ + stride_h, /* uint32_t stride_height */ + stride_w, /* uint32_t stride_width */ + dilation_h, /* uint32_t dilation_height */ + dilation_w, /* uint32_t dilation_width */ + groups, /* uint32_t groups */ + group_input_channels, /* size_t group_input_channels */ + group_output_channels, /* size_t group_output_channels */ + ip_chan_stride, /* size_t input_pixel_stride */ + op_chan_stride, /* size_t output_pixel_stride */ + izp, /* int8_t input_zero_point */ + ip_scale, /* float input_scale */ + k_scales[0], /* float kernel_scale */ + kernel, /* const int8_t* kernel */ + bias, /* const int32_t* bias */ + ozp, /* int8_t output_zero_point */ + op_scale, /* float output_scale */ + op_min, /* int8_t output_min */ + op_max, /* int8_t output_max */ + flags, /* uint32_t flags */ + op); /* xnn_operator_t* deconvolution_op_out */ + + } + + if (!per_channel) { + return xnn_create_convolution2d_nhwc_qs8( + pad_top, /* uint32_t input_padding_top */ + pad_right, /* uint32_t input_padding_right */ + pad_bottom, /* uint32_t input_padding_bottom */ + pad_left, /* uint32_t input_padding_left */ + kernel_h, /* uint32_t kernel_height */ + kernel_w, /* uint32_t kernel_width */ + stride_h, /* uint32_t subsampling_height */ + stride_w, /* uint32_t subsampling_width */ + dilation_h, /* uint32_t dilation_height */ + dilation_w, /* uint32_t dilation_width */ + groups, /* uint32_t groups */ + group_input_channels, /* size_t group_input_channels */ + group_output_channels, /* size_t group_output_channels*/ + ip_chan_stride, /* size_t input_channel_stride */ + op_chan_stride, /* size_t output_channel_stride */ + izp, /* int8_t input_zero_point */ + ip_scale, /* float input_scale */ + k_scales[0], /* float kernel_scale */ + kernel, /* const int8_t* kernel */ + bias, /* const int32_t* bias */ + ozp, /* int8_t output_zero_point */ + op_scale, /* float output_scale */ + op_min, /* int8_t output_min */ + op_max, /* int8_t output_max */ + flags, /* uint32_t flags */ + op); /* xnn_operator_t* convolution_op_out */ + } else { /* per_channel */ + return xnn_create_convolution2d_nhwc_qc8( + pad_top, /* uint32_t input_padding_top */ + pad_right, /* uint32_t input_padding_right */ + pad_bottom, /* uint32_t input_padding_bottom */ + pad_left, /* uint32_t input_padding_left */ + kernel_h, /* uint32_t kernel_height */ + kernel_w, /* uint32_t kernel_width */ + stride_h, /* uint32_t subsampling_height */ + stride_w, /* uint32_t subsampling_width */ + dilation_h, /* uint32_t dilation_height */ + dilation_w, /* uint32_t dilation_width */ + groups, /* uint32_t groups */ + group_input_channels, /* size_t group_input_channels */ + group_output_channels, /* size_t group_output_channels*/ + ip_chan_stride, /* size_t input_channel_stride */ + op_chan_stride, /* size_t output_channel_stride */ + izp, /* int8_t input_zero_point */ + ip_scale, /* float input_scale */ + k_scales, /* const float* kernel_scale */ + kernel, /* const int8_t* kernel */ + bias, /* const int32_t* bias */ + ozp, /* int8_t output_zero_point */ + op_scale, /* float output_scale */ + op_min, /* int8_t output_min */ + op_max, /* int8_t 
output_max */ + flags, /* uint32_t flags */ + op); /* xnn_operator_t* convolution_op_out */ + } +} + +/* + * Series of setup wrapper functions to call xnn_setup_[de]conv* functions. + */ +C10_ALWAYS_INLINE +enum xnn_status xnnp_setup_convolution2d_nhwc( + xnn_operator_t op, + size_t batch, + size_t in_h, + size_t in_w, + const int8_t* inp, + int8_t* outp, + pthreadpool_t pt_pool, + bool per_channel = false, + bool transpose = false, + uint32_t adj_h = 0, + uint32_t adj_w = 0) { + if(transpose) { + TORCH_CHECK(!per_channel, "XNNPACK Q[SC]8 does not have a per channel deconvolution!"); + return xnn_setup_deconvolution2d_nhwc_qs8( + op, /* xnn_operator_t deconvolution_op */ + batch, /* size_t batch_size */ + in_h, /* size_t input_height */ + in_w, /* size_t input_width */ + adj_h, /* uint32_t adjustment_height */ + adj_w, /* uint32_t adjustment_width */ + inp, /* const int8_t* input */ + outp, /* int8_t* output */ + pt_pool); /* pthreadpool_t threadpool */ + } + + if (!per_channel) { + return xnn_setup_convolution2d_nhwc_qs8( + op, /* xnn_operator_t convolution_op */ + batch, /* size_t batch_size */ + in_h, /* size_t input_height */ + in_w, /* size_t input_width */ + inp, /* const int8_t* input */ + outp, /* int8_t* output */ + pt_pool); /* pthreadpool_t threadpool */ + } else { /* per_channel */ + return xnn_setup_convolution2d_nhwc_qc8( + op, /* xnn_operator_t convolution_op */ + batch, /* size_t batch_size */ + in_h, /* size_t input_height */ + in_w, /* size_t input_width */ + inp, /* const int8_t* input */ + outp, /* int8_t* output */ + pt_pool); /* pthreadpool_t threadpool */ + } +} + + +/* + * Series of wrapper functions to call xnn_create* and xnn_setup* + * functions for linear + */ +C10_ALWAYS_INLINE +enum xnn_status xnnp_create_fully_connected_nc( + size_t input_channels, + size_t output_channels, + size_t input_stride, + size_t output_stride, + int8_t input_zero_point, + float input_scale, + int8_t kernel_zero_point, + float kernel_scale, + const int8_t* kernel, + const int32_t* bias, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_operator_t* fully_connected_op_out) { + /* Symmetric quantization forces kzp = 0 */ + TORCH_CHECK(!kernel_zero_point, "XNNPACK QS8 linear kernel expects kernel zero point to be zero." 
+ "But got: ", kernel_zero_point); + return xnn_create_fully_connected_nc_qs8( + input_channels, /* size_t input_channels */ + output_channels, /* size_t output_channels */ + input_stride, /* size_t input_stride */ + output_stride, /* size_t output_stride */ + input_zero_point, /* int8_t input_zero_point */ + input_scale, /* float input_scale */ + kernel_scale, /* float kernel_scale */ + kernel, /* const int8_t* kernel */ + bias, /* const int32_t* bias */ + output_zero_point, /* int8_t output_zero_point */ + output_scale, /* float output_scale */ + output_min, /* int8_t output_min */ + output_max, /* int8_t output_max */ + flags, /* uint32_t flags */ + fully_connected_op_out); /* xnn_operator_t* fully_connected_op_out */ +} + +C10_ALWAYS_INLINE +enum xnn_status xnnp_setup_fully_connected_nc( + xnn_operator_t fully_connected_op, + size_t batch_size, + const int8_t* input, + int8_t* output, + pthreadpool_t threadpool) { + return xnn_setup_fully_connected_nc_qs8( + fully_connected_op, /* xnn_operator_t fully_connected_op */ + batch_size, /* size_t batch_size */ + input, /* const int8_t* input */ + output, /* int8_t* output */ + threadpool); /* pthreadpool_t threadpool */ +} + +} // namespace xnnp_utils +} // namespace native +} // namespace at + +#endif // USE_XNNPACK diff --git a/aten/src/ATen/native/quantized/cudnn/BinaryOps.cpp b/aten/src/ATen/native/quantized/cudnn/BinaryOps.cpp new file mode 100644 index 000000000000..fed5600c8369 --- /dev/null +++ b/aten/src/ATen/native/quantized/cudnn/BinaryOps.cpp @@ -0,0 +1,258 @@ +#ifdef USE_CUDA +#include // for the definition of AT_CUDNN_ENABLED + +#if AT_CUDNN_ENABLED() +#include +#if HAS_CUDNN_V8() + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace at { +namespace native { +namespace { +constexpr uint8_t max_num_input_dim = 5; +struct AddParams { + c10::DeviceIndex device_id; + int input_a_size[max_num_input_dim]; + int input_b_size[max_num_input_dim]; + uint8_t input_dim; // we currently assume both inputs are given as the same size (i.e., no broadcasting) + at::MemoryFormat memory_format; + bool deterministic; + bool allow_tf32; +}; +struct CacheKey { + AddParams params; + uint8_t input_a_alignment; + uint8_t input_b_alignment; + uint8_t output_alignment; + bool kReluFused; +}; +void setAddParams( + AddParams* params, const at::Tensor& input_a, const at::Tensor& input_b, + bool deterministic, bool allow_tf32) { + memset(params, 0, sizeof(AddParams)); + params->device_id = at::cuda::current_device(); + params->input_dim = input_a.dim(); + params->memory_format = input_a.suggest_memory_format(); + for (int i = 0; i < params->input_dim; ++i) { + params->input_a_size[i] = input_a.sizes()[i]; + params->input_b_size[i] = input_b.sizes()[i]; + } + params->deterministic = deterministic; + params->allow_tf32 = allow_tf32; +} +// FIXME: make this thread-safe by reusing the benchmark cache in Conv_v7.cpp +// we currently set the maximum number of input dimensions to 5 +// this can be increased, if necessary +std::unordered_map, at::native::ParamsEqual> execution_plan_cache; + +// TODO: this is also in qadd.cpp and some other cpp files in quantized/cpu/. I think we should +// move everything into a utilities file in quantized/ directory later. 
+inline void check_inputs(const Tensor& qa, const Tensor& qb) { + TORCH_CHECK( + qa.qscheme() == kPerTensorAffine, + "Only per tensor quantization is supported in Add."); + TORCH_CHECK( + qa.qscheme() == qb.qscheme(), + "Both inputs to Add must have the same quantization scheme."); + TORCH_CHECK( + qa.scalar_type() == qb.scalar_type(), + "Add operands should have same data type."); +} + +// currently we only support int8 symmetric (zero_point = 0 for inputs and output) quantized add +// We implement relu ( (a_int8 + b_int8 * ( b_scale/a_scale) ) ) * ( a_scale / out_scale ) +// which requires 4 cudnn ops (2 multiplication, 1 addition, and 1 relu ops) +// Multiplication ops: rhs_mult_op, requant_op +// Addition op: add_op +// Relu op: relu_op +template <bool kReluFused> +Tensor add(Tensor qa, Tensor qb, double output_scale, int64_t output_zero_point) { + if (qa.numel() == 0) { + return Tensor{}; + } + // TODO: add shape checking when broadcasted add is supported. For now we assume the input tensors are the same shape + TORCH_CHECK(qa.sizes() == qb.sizes(), "Quantized cudnn add currently expects both input tensors to be the same shape"); + + check_inputs(qa, qb); + + // cudnn expects tensors to be at least 3D. So we will prepend dummy dimensions if the input tensors are not at least 3D + auto orig_sizes = qa.sizes().vec(); + if (qa.dim() < 3) { + std::vector<int64_t> new_sizes(3, 1); + // cudnn expects leading dimensions to be the dummy dimensions + new_sizes.back() = qa.sizes().back(); + if (qa.dim() == 2) { + new_sizes[1] = qa.size(0); + } + qa = qa.view(new_sizes); + qb = qb.view(new_sizes); + } else if (qa.dim() == 4) { + qa = qa.contiguous(c10::MemoryFormat::ChannelsLast); + qb = qb.contiguous(c10::MemoryFormat::ChannelsLast); + } + + auto memory_format = qa.dim() == 4 ? at::MemoryFormat::ChannelsLast : at::MemoryFormat::Contiguous; + at::Tensor add_output = at::empty(qa.sizes(), at::device(at::kCUDA).dtype(at::kFloat), memory_format); + at::Tensor quantized_output = at::_empty_affine_quantized(qa.sizes(), at::device(at::kCUDA).dtype(at::ScalarType::QInt8), + output_scale, output_zero_point, memory_format); + // TODO: When cudnn enables support for broadcasting, we can remove this tensor + at::Tensor requantize_multiplier_tensor = at::empty(quantized_output.sizes(), at::device(at::kCUDA).dtype(at::kFloat), memory_format); + requantize_multiplier_tensor.fill_(qa.q_scale() / output_scale); + at::Tensor rhs_multiplier_tensor = at::empty(quantized_output.sizes(), at::device(at::kCUDA).dtype(at::kFloat), memory_format); + rhs_multiplier_tensor.fill_(qb.q_scale() / qa.q_scale()); + + cudnnHandle_t handle = at::native::getCudnnHandle(); + CacheKey key; + // memset is needed here because there is implicit packing added for CacheKey, and this can result in uninitialized padded values that are + // used for hashing (see how at::native::ParamsHash is defined). Without memset, we can potentially come across a situation where two + // CacheKey objects have the same user defined parameters, but + // different padded values, resulting in different hash outputs.
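Before the cache key below is populated, it is worth making the numerics comment at the top of this function concrete: with symmetric int8 quantization (all zero points 0), out_int8 = relu(a_int8 + b_int8 * (b_scale/a_scale)) * (a_scale/out_scale). A scalar reference of that formula can be handy for sanity-checking the four-op cudnn graph; the function below is only an illustration, and the rounding/saturation at the end is an assumption about typical int8 requantization rather than a statement of what cudnn does internally:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Scalar model of the fused add(+relu) requantization described above.
    // a_q and b_q are symmetric int8 values; the scales are per-tensor q_scale()s.
    int8_t quantized_add_relu_ref(int8_t a_q, int8_t b_q,
                                  float a_scale, float b_scale,
                                  float out_scale, bool relu_fused) {
      // rhs_mult_op: bring b into a's scale
      float acc = static_cast<float>(a_q) + static_cast<float>(b_q) * (b_scale / a_scale);
      // relu_op (only when fused)
      if (relu_fused) {
        acc = std::max(acc, 0.0f);
      }
      // requant_op: rescale from a's scale to the output scale
      float out = acc * (a_scale / out_scale);
      // saturate to the int8 range
      long r = std::lround(out);
      r = std::min<long>(127, std::max<long>(-128, r));
      return static_cast<int8_t>(r);
    }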
+ memset(&key, 0, sizeof(key)); + bool deterministic{true}; + bool allow_tf32{false}; + setAddParams(&key.params, qa, qb, deterministic, allow_tf32); + key.kReluFused = kReluFused; + key.input_a_alignment = cudnn_utils::getAlignment(qa); + key.input_b_alignment = cudnn_utils::getAlignment(qb); + key.output_alignment = cudnn_utils::getAlignment(add_output); + + auto run = [&](cudnn_frontend::ManagedOpaqueDescriptor plan_desc) { + auto workspace_size = 0; + auto workspace = at::empty({workspace_size}, qa.options().dtype(at::kByte)); + std::vector data_ptrs; + std::vector uids; + data_ptrs.reserve(8); + uids.reserve(8); + data_ptrs = {qb.data_ptr(), rhs_multiplier_tensor.data_ptr(), add_output.data_ptr(), + qa.data_ptr(), add_output.data_ptr(), requantize_multiplier_tensor.data_ptr(), + quantized_output.data_ptr()}; + uids = {'b', 'm', 'c', 'a', 'p', 'r', 'q'}; + if (kReluFused) { + data_ptrs.emplace_back(add_output.data_ptr()), + uids.emplace_back('f'); + } + + auto variantPack = cudnn_frontend::VariantPackBuilder() + .setWorkspacePointer(workspace.data_ptr()) + .setDataPointers(uids.size(), data_ptrs.data()) + .setUids(uids.size(), uids.data()) + .build(); + auto variant_pack_desc = variantPack.get_raw_desc(); + AT_CUDNN_CHECK(cudnnBackendExecute(handle, plan_desc->get_backend_descriptor(), variant_pack_desc)); + }; + + auto search = execution_plan_cache.find(key); + if (search != execution_plan_cache.end()) { + cudnn_frontend::ManagedOpaqueDescriptor plan_desc = search->second; + run(plan_desc); + return quantized_output.view(orig_sizes); + } + + // computes qb_int8 * ( qb_scale/qa_scale ) + auto rhs_mult_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(cudnn_utils::getTensorDescriptor(qb.sizes(), qb.strides(), CUDNN_DATA_INT8, 'b', key.input_b_alignment)) + .setbDesc(cudnn_utils::getTensorDescriptor(rhs_multiplier_tensor, 'm', cudnn_utils::getAlignment(rhs_multiplier_tensor))) + .setyDesc(cudnn_utils::getTensorDescriptor(add_output, 'c', key.output_alignment)) + .setpwDesc(cudnn_utils::getPointWiseMulDescriptor(at::native::getCudnnDataType(add_output))) + .build(); + + // add_op computes (qa_int8 + qb_int8 * ( qb_scale/qa_scale ) ) + // add_output is a fp32 tensor for accumulation purposes + auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(rhs_mult_op.getOutputTensor()) + .setbDesc(cudnn_utils::getTensorDescriptor(qa.sizes(), qa.strides(), CUDNN_DATA_INT8, 'a', key.input_a_alignment)) + .setyDesc(cudnn_utils::getTensorDescriptor(add_output, 'p', key.output_alignment)) + .setpwDesc(cudnn_utils::getPointWiseAddDescriptor(at::native::getCudnnDataType(add_output))) + .build(); + + // relu_op computes + // relu( (qa_int8 + qb_int8 * ( qb_scale/qa_scale ) ) ) + // output is a fp32 tensor + c10::optional relu_op; + if (kReluFused) { + // we use inplace operation here where the output is assigned to the input + relu_op.emplace(cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(add_op.getOutputTensor()) + .setyDesc(cudnn_utils::getTensorDescriptor(add_output, 'f', key.output_alignment)) + .setpwDesc(cudnn_utils::getPointWiseReluDescriptor(at::native::getCudnnDataType(add_output))) + .build()); + } + + // requant_op computes + // (a_int8 + b_int8 * ( b_scale/a_scale) ) * a_scale / out_scale + auto requant_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(kReluFused ? 
relu_op.value().getOutputTensor() : add_op.getOutputTensor()) + .setbDesc(cudnn_utils::getTensorDescriptor(requantize_multiplier_tensor, 'r', cudnn_utils::getAlignment(requantize_multiplier_tensor))) + .setyDesc(cudnn_utils::getTensorDescriptor(quantized_output.sizes(), quantized_output.strides(), CUDNN_DATA_INT8, 'q', cudnn_utils::getAlignment(quantized_output))) + .setpwDesc(cudnn_utils::getPointWiseMulDescriptor(at::native::getCudnnDataType(requantize_multiplier_tensor))) + .build(); + + std::vector ops{&rhs_mult_op, &add_op}; + if (kReluFused) { + ops.emplace_back(&(relu_op.value())); + } + ops.emplace_back(&requant_op); + + auto opGraph = cudnn_frontend::OperationGraphBuilder() + .setHandle(handle) + .setOperationGraph(ops.size(), ops.data()) + .build(); + // std::cout << "opGraph: " << opGraph.describe() << std::endl; + + auto heuristics = cudnn_frontend::EngineHeuristicsBuilder() + .setOperationGraph(opGraph) + .setHeurMode(CUDNN_HEUR_MODE_INSTANT) + .build(); + auto fallback = cudnn_frontend::EngineFallbackListBuilder() + .setOperationGraph(opGraph) + .setOperation(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .build(); + + auto& engine_configs = heuristics.getEngineConfig(heuristics.getEngineConfigCount()); + auto& fallback_list = fallback.getFallbackList(); + + cudnn_frontend::EngineConfigList filtered_configs; + cudnn_utils::filterEngineConfigs(engine_configs, filtered_configs, deterministic, allow_tf32, at::kChar); + cudnn_utils::filterEngineConfigs(fallback_list, filtered_configs, deterministic, allow_tf32, at::kChar); + for (auto &cfg : engine_configs) { + try { + auto plan = cudnn_frontend::ExecutionPlanBuilder() + .setHandle(handle) + .setEngineConfig(cfg) + .build(); + auto plan_desc = plan.get_desc(); + run(plan_desc); + execution_plan_cache[key] = plan_desc; + return quantized_output.view(orig_sizes); + } catch (cudnn_frontend::cudnnException &e) {std::cout << "cudnn error:" << e.what() << std::endl;} catch(c10::CuDNNError &e) { std::cout << "other error" << e.what() << std::endl;} + } + + TORCH_CHECK(false, "Unable to find an engine to execute this computation in Quantized Add Cudnn"); +} + +TORCH_LIBRARY_IMPL(quantized, QuantizedCUDA, m) { + m.impl(TORCH_SELECTIVE_NAME("quantized::add"), TORCH_FN(add)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add_relu"), TORCH_FN(add)); +} + +} // namespace +} // namespace native +} // namespace at + +#endif // HAS_CUDNN_V8 +#endif // AT_CUDNN_ENABLED +#endif // USE_CUDA diff --git a/aten/src/ATen/native/quantized/cudnn/Conv.cpp b/aten/src/ATen/native/quantized/cudnn/Conv.cpp index bae4b9e2cb9d..6fd5be129c84 100644 --- a/aten/src/ATen/native/quantized/cudnn/Conv.cpp +++ b/aten/src/ATen/native/quantized/cudnn/Conv.cpp @@ -4,50 +4,29 @@ #if AT_CUDNN_ENABLED() #include +#include #if HAS_CUDNN_V8() -#include #include -#include #include +#include #include #include +#include +#include #include -#include #include +#include #include -#include #include +#include +#include -namespace at { namespace native{ - -namespace { - -uint8_t getAlignment(const Tensor &t) { - // alignment are in bytes - uint8_t alignment = 1; - uintptr_t address = reinterpret_cast(t.data_ptr()); - while (address % alignment == 0 && alignment < 16) alignment *= 2; - return alignment; -} - -cudnn_frontend::Tensor getTensorDescriptor(const Tensor &t, int64_t id, uint8_t alignment) { - auto shape = t.sizes(); - auto strides = t.strides(); - return cudnn_frontend::TensorBuilder() - .setDim(shape.size(), shape.data()) - .setStrides(strides.size(), strides.data()) - 
.setId(id) - .setAlignment(alignment) - .setDataType(getCudnnDataType(t)) - .build(); -} - -// TODO: there is a table from input dtype and weight dtype to operator dtype, +// TODO: there is a table from input dtype and weight dtype to operator qdtype, // we can derive the operator dtype based on input dtype -cudnn_frontend::ConvDesc_v8 getConvDescriptor(cudnnDataType_t dataType, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation) { +cudnn_frontend::ConvDesc_v8 getConvDescriptor(cudnnDataType_t dataType, c10::IntArrayRef padding, c10::IntArrayRef stride, c10::IntArrayRef dilation) { uint64_t convDim = stride.size(); return cudnn_frontend::ConvDescBuilder() .setDataType(dataType) @@ -60,79 +39,19 @@ cudnn_frontend::ConvDesc_v8 getConvDescriptor(cudnnDataType_t dataType, IntArray .build(); } -// TODO: there is a table from input dtype to operator dtype, we can derive -// the operator dtype based on input dtype -cudnn_frontend::PointWiseDesc_v8 getPointWiseMulDescriptor(cudnnDataType_t dataType) { - return cudnn_frontend::PointWiseDescBuilder() - .setMode(CUDNN_POINTWISE_MUL) - .setMathPrecision(dataType) - .build(); -} - -void filterEngineConfigs( - cudnn_frontend::EngineConfigList &from, - cudnn_frontend::EngineConfigList &to, - bool deterministic, bool allow_tf32, c10::ScalarType scalar_type) -{ - auto filter = [=](cudnnBackendDescriptor_t c) { - if (deterministic) { - if (cudnn_frontend::hasNumericalNote(c)) return true; - } - if (scalar_type == kFloat || scalar_type == kChar || !allow_tf32) { - if (cudnn_frontend::hasNumericalNote(c)) return true; - if (cudnn_frontend::hasNumericalNote(c)) return true; - } - return false; - }; - cudnn_frontend::filter(from, to, filter); -} - -cudnn_frontend::ExecutionPlan -get_execplan_from_heuristics_else_fall_back(cudnn_frontend::OperationGraph&& opGraph, cudnnHandle_t handle_) { - auto heuristics = cudnn_frontend::EngineHeuristicsBuilder() - .setOperationGraph(opGraph) - .setHeurMode(CUDNN_HEUR_MODE_INSTANT) - .build(); - - // std::cout << "Heuristic has " << heuristics.getEngineConfigCount() << " configurations " << std::endl; - auto& engine_config = heuristics.getEngineConfig(heuristics.getEngineConfigCount()); - - // Try engine configs returned by the heuristics and pick up the first one that works. - for (auto& ecfg : engine_config) { - try { - auto plan = cudnn_frontend::ExecutionPlanBuilder() - .setHandle(handle_) - .setEngineConfig(ecfg, opGraph.getTag()) - .build(); - return plan; - } catch (cudnn_frontend::cudnnException& e) { - continue; - } - } - - { - auto total_engines = opGraph.getEngineCount(); - // std::cout << opGraph.describe() << " has " << total_engines << " engines." 
<< std::endl; - auto engine = cudnn_frontend::EngineBuilder().setGlobalEngineIdx(0).setOperationGraph(opGraph).build(); - // std::cout << engine.describe() << std::endl; - - auto engine_config = cudnn_frontend::EngineConfigBuilder().setEngine(engine).build(); - // std::cout << engine_config.describe() << std::endl; - - return cudnn_frontend::ExecutionPlanBuilder().setHandle(handle_).setEngineConfig(engine_config).build(); - } -} - +// FIXME: make this thread-safe by reusing the benchmark cache in Conv_v7.cpp +namespace { struct CacheKey { - ConvolutionParams params; + at::native::ConvolutionParams params; uint8_t input_alignment; uint8_t weight_alignment; uint8_t output_alignment; + // default to -1 when no bias + int8_t bias_alignment; + bool kReluFused; }; - -// FIXME: make this thread-safe by reusing the benchmark cache in Conv_v7.cpp -std::unordered_map, ParamsEqual> execution_plan_cache; - +std::unordered_map, at::native::ParamsEqual> execution_plan_cache; +} // TODO: we can use cudnn_frontend::ExecutionPlanCache when it supports caching // multiple operators // reference: https://github.com/NVIDIA/cudnn-frontend/blob/main/samples/conv_sample.cpp#L293 @@ -144,9 +63,9 @@ at::SmallVector MakeConvOutputShape( int M, // output channels const std::array& input_image_shape, const std::vector& kernel, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation); + const torch::List& stride, + const torch::List& padding, + const torch::List& dilation); template <> at::SmallVector MakeConvOutputShape<2>( @@ -154,9 +73,9 @@ at::SmallVector MakeConvOutputShape<2>( int M, // output channels const std::array& input_image_shape, const std::vector& kernel, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation) { + const torch::List& stride, + const torch::List& padding, + const torch::List& dilation) { const int H = input_image_shape[0]; const int W = input_image_shape[1]; const int64_t Y_H = @@ -166,48 +85,93 @@ at::SmallVector MakeConvOutputShape<2>( return {N, M, Y_H, Y_W}; } -void raw_cudnn_convolution_forward_out( - const Tensor& output, - const Tensor& input, - const Tensor& weight, - IntArrayRef padding, - IntArrayRef stride, - IntArrayRef dilation, - int64_t groups, - bool benchmark, - bool deterministic, - bool allow_tf32, - float requantize_multiplier -) { - TORCH_CHECK(!benchmark, "not supported yet"); - if (output.numel() == 0) { + +// the parameter quantized_output is a quantized tensor +template +template +void PackedConvWeightCudnn::apply_impl_helper(const at::Tensor& quantized_output, const at::Tensor& input, double output_scale) { + if (quantized_output.numel() == 0) { return; } - - Tensor conv_output = at::empty_like(output, output.options().dtype(at::kFloat)); - Tensor requantize_multiplier_tensor = at::empty_like(output, output.options().dtype(at::kFloat)); + at::Tensor conv_output = at::empty(quantized_output.sizes(), at::device(at::kCUDA).dtype(at::kFloat), at::MemoryFormat::ChannelsLast); + // TODO: combine empty & fill_ using full_like or full + at::Tensor requantize_multiplier_tensor = at::empty(quantized_output.sizes(), at::device(at::kCUDA).dtype(at::kFloat), at::MemoryFormat::ChannelsLast); + auto act_scale = input.q_scale(); + auto weight_scale = maybe_padded_weight_.q_scale(); + auto requantize_multiplier = act_scale * weight_scale / output_scale; requantize_multiplier_tensor.fill_(requantize_multiplier); - cudnnHandle_t handle = getCudnnHandle(); + c10::optional bias_multiplier_tensor; + c10::optional broadcasted_bias; + if (bias_.has_value()) { + 
// the input bias is a 1-D tensor whose size is the same as the size of the second dimension of quantized_output. + // we need to add trailing dimensions in order to properly broadcast bias, otherwise broadcast_to will fail. + // the number of trailling dimensions is quantized_output.dim() - 2, so the new size of the broadcast_bias + // becomes quantized_output.dim() - 2 + 1. nothing needs to be done for the leading dimensions + std::vector new_size(quantized_output.dim() - 1, 1); + new_size[0] = bias_.value().size(0); + broadcasted_bias = bias_.value().reshape(new_size); + broadcasted_bias.value() = broadcasted_bias.value().broadcast_to(quantized_output.sizes()); + broadcasted_bias.value() = broadcasted_bias.value().to(c10::MemoryFormat::ChannelsLast); + bias_multiplier_tensor = at::empty(quantized_output.sizes(), at::device(at::kCUDA).dtype(at::kFloat), at::MemoryFormat::ChannelsLast); + auto bias_multiplier = 1.0 / (act_scale * weight_scale); + bias_multiplier_tensor.value().fill_(bias_multiplier); + } + cudnnHandle_t handle = at::native::getCudnnHandle(); CacheKey key; - setConvolutionParams(&key.params, input, weight, padding, stride, dilation, groups, deterministic, allow_tf32); + // memset is needed here because there is implicit packing added for CacheKey, and this can result in uninitialized padded values that are + // used for hashing (see how at::native::ParamsHash is defined). without memset, we can potentially come across a situation where two + // CacheKey objects have the same user defined parameters, but + // different padded values, resulting in different hash outputs. + memset(&key, 0, sizeof(key)); + bool deterministic{true}; + bool allow_tf32{false}; + auto padding_vec = padding_.vec(); + auto stride_vec = stride_.vec(); + auto dilation_vec = dilation_.vec(); + setConvolutionParams(&key.params, input, maybe_padded_weight_, padding_vec, stride_vec, dilation_vec, groups_, deterministic, allow_tf32); + // operator datatype needs to be int32 for int8 convolution, but we can // set the datatype for output tensor to int32 or fp32 key.params.dataType = CUDNN_DATA_INT32; - key.input_alignment = getAlignment(input); - key.output_alignment = getAlignment(conv_output); - key.weight_alignment = getAlignment(weight); + key.input_alignment = cudnn_utils::getAlignment(input); + key.output_alignment = cudnn_utils::getAlignment(conv_output); + key.weight_alignment = cudnn_utils::getAlignment(maybe_padded_weight_); + if (bias_.has_value()) { + key.bias_alignment = cudnn_utils::getAlignment(broadcasted_bias.value()); + } else { + key.bias_alignment = -1; + } + key.kReluFused = kReluFused; auto run = [&](cudnn_frontend::ManagedOpaqueDescriptor plan_desc) { auto workspace_size = 0; - auto workspace = at::empty({workspace_size}, input.options().dtype(kByte)); - void *data_ptrs[] = {reinterpret_cast(input.data_ptr()), conv_output.data_ptr(), reinterpret_cast(weight.data_ptr()), requantize_multiplier_tensor.data_ptr(), output.data_ptr()}; - // std::cout << plan.describe() << " requires workspace " << workspace_size << std::endl; - int64_t uids[] = {'x', 'y', 'w', 's', 'r'}; + auto workspace = at::empty({workspace_size}, input.options().dtype(at::kByte)); + std::vector data_ptrs; + std::vector uids; + data_ptrs.reserve(10); + uids.reserve(10); + data_ptrs = {input.data_ptr(), conv_output.data_ptr(), maybe_padded_weight_.data_ptr(), + requantize_multiplier_tensor.data_ptr(), quantized_output.data_ptr()}; + uids = {'x', 'y', 'w', 's', 'r'}; + if (bias_.has_value()) { + 
data_ptrs.insert(data_ptrs.end(), {broadcasted_bias.value().data_ptr(), bias_multiplier_tensor.value().data_ptr(), + broadcasted_bias.value().data_ptr(), conv_output.data_ptr()}); + uids.insert(uids.end(), {'b', 'c', 'd', 'e'}); + if (kReluFused) { + data_ptrs.emplace_back(conv_output.data_ptr()), + uids.emplace_back('f'); + } + } else { + if (kReluFused) { + data_ptrs.emplace_back(conv_output.data_ptr()); + uids.emplace_back('f'); + } + } auto variantPack = cudnn_frontend::VariantPackBuilder() .setWorkspacePointer(workspace.data_ptr()) - .setDataPointers(5, data_ptrs) - .setUids(5, uids) + .setDataPointers(uids.size(), data_ptrs.data()) + .setUids(uids.size(), uids.data()) .build(); auto variant_pack_desc = variantPack.get_raw_desc(); AT_CUDNN_CHECK(cudnnBackendExecute(handle, plan_desc->get_backend_descriptor(), variant_pack_desc)); @@ -219,25 +183,81 @@ void raw_cudnn_convolution_forward_out( run(plan_desc); return; } - + // conv_op computes act_fp32 * w_fp32 (matrix multiplication) + // where act_fp32 and w_fp32 are the input and weight variables, resp. + // output is a fp32 tensor auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR) - .setxDesc(getTensorDescriptor(input, 'x', key.input_alignment)) - .setyDesc(getTensorDescriptor(conv_output, 'y', key.output_alignment)) - .setwDesc(getTensorDescriptor(weight, 'w', key.weight_alignment)) - .setcDesc(getConvDescriptor(key.params.dataType, padding, stride, dilation)) + .setxDesc(cudnn_utils::getTensorDescriptor(input.sizes(), input.strides(), CUDNN_DATA_INT8, 'x', key.input_alignment)) + .setyDesc(cudnn_utils::getTensorDescriptor(conv_output, 'y', key.output_alignment)) + .setwDesc(cudnn_utils::getTensorDescriptor(maybe_padded_weight_.sizes(), maybe_padded_weight_.strides(), CUDNN_DATA_INT8, 'w', key.weight_alignment)) + .setcDesc(getConvDescriptor(key.params.dataType, padding_vec, stride_vec, dilation_vec)) .build(); // std::cout << "operator:" << conv_op.describe() << std::endl; - // TODO: add support for bias + c10::optional bias_mult_op; + c10::optional sum_conv_bias_op; + if (bias_.has_value()) { + // we can't directly assign bias_mult_op becauase operator= is deleted for cudnn_frontend::Operation; + // alternatively, I think we can use std::unique_ptr and dynamically allocate these builder ops + // but here, we chose to do it statically. c10::optional::emplace() enables this approach + + // bias_mult_op computes bias_fp32 / (act_scale * w_scale) or bias_fp32 * (1 / (act_scale * w_scale)) + // where bias_multiplier = (1 / (act_scale * w_scale)) + // output is a fp32 tensor + // we use inplace operation here where the output is assigned to the input + bias_mult_op.emplace(cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(cudnn_utils::getTensorDescriptor(broadcasted_bias.value(), 'b', cudnn_utils::getAlignment(broadcasted_bias.value()))) + .setbDesc(cudnn_utils::getTensorDescriptor(bias_multiplier_tensor.value(), 'c', cudnn_utils::getAlignment(bias_multiplier_tensor.value()))) + .setyDesc(cudnn_utils::getTensorDescriptor(broadcasted_bias.value(), 'd', cudnn_utils::getAlignment(broadcasted_bias.value()))) + .setpwDesc(cudnn_utils::getPointWiseMulDescriptor(at::native::getCudnnDataType(bias_multiplier_tensor.value()))) + .build()); + + // computes (act_int8 * w_int8 + [bias_fp32/(act_scale * w_scale)]) + // where the 1st and 2nd summands is conv_output and broadcasted_bias, resp. 
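The division by (act_scale * w_scale) described just above is what lets a fp32 bias ride along with the int8 accumulator: once the requantize step later multiplies by act_scale * w_scale / out_scale, the bias comes out in output units. A small stand-alone check of that identity, with made-up example values and illustrative names:

    #include <cassert>
    #include <cmath>

    // Checks: (acc + bias_fp32 / (act_scale * w_scale)) * (act_scale * w_scale / out_scale)
    //      == (acc * act_scale * w_scale + bias_fp32) / out_scale
    int main() {
      const double acc = 1234.0;       // int32 accumulator of act_int8 * w_int8 products
      const double bias_fp32 = 0.75;   // original fp32 bias entry
      const double act_scale = 0.02, w_scale = 0.005, out_scale = 0.1;

      const double bias_multiplier = 1.0 / (act_scale * w_scale);        // applied by bias_mult_op
      const double requant_multiplier = act_scale * w_scale / out_scale; // applied by requant_op

      const double fused = (acc + bias_fp32 * bias_multiplier) * requant_multiplier;
      const double naive = (acc * act_scale * w_scale + bias_fp32) / out_scale;

      assert(std::abs(fused - naive) < 1e-9);
      return 0;
    }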
+ // output is a fp32 tensor + // we use inplace operation here where the output is assigned to the input + sum_conv_bias_op.emplace(cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(conv_op.getOutputTensor()) + .setbDesc(cudnn_utils::getTensorDescriptor(broadcasted_bias.value(), 'd', cudnn_utils::getAlignment(broadcasted_bias.value()))) + .setyDesc(cudnn_utils::getTensorDescriptor(conv_output, 'e', key.output_alignment)) + .setpwDesc(cudnn_utils::getPointWiseAddDescriptor(at::native::getCudnnDataType(broadcasted_bias.value()))) + .build()); + } + + // relu_op computes relu(act_int8 * w_int8 + [bias_fp32/(act_scale * w_scale)] + // or relu(act_int8 * w_int8) if bias is not present. + // output is a fp32 tensor + c10::optional relu_op; + std::shared_ptr tensor2requant_ptr = bias_.has_value() ? sum_conv_bias_op.value().getOutputTensor() : conv_op.getOutputTensor(); + if (kReluFused) { + // we use inplace operation here where the output is assigned to the input + relu_op.emplace(cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(tensor2requant_ptr) + .setyDesc(cudnn_utils::getTensorDescriptor(conv_output, 'f', key.output_alignment)) + .setpwDesc(cudnn_utils::getPointWiseReluDescriptor(at::native::getCudnnDataType(conv_output))) + .build()); + } + + // relu_op computes relu(act_int8 * w_int8 + [bias_fp32/(act_scale * w_scale)]) / (out_scale / (act_scale * w_scale)) + // or relu(act_int8 * w_int8) / (out_scale / (act_scale * w_scale))) if bias is not present. + // output is a fp32 tensor auto requant_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) - .setxDesc(conv_op.getOutputTensor()) - .setbDesc(getTensorDescriptor(requantize_multiplier_tensor, 's', getAlignment(requantize_multiplier_tensor))) - .setyDesc(getTensorDescriptor(output, 'r', getAlignment(output))) - .setpwDesc(getPointWiseMulDescriptor(getCudnnDataType(requantize_multiplier_tensor))) + .setxDesc(kReluFused ? 
relu_op.value().getOutputTensor() : tensor2requant_ptr) + .setbDesc(cudnn_utils::getTensorDescriptor(requantize_multiplier_tensor, 's', cudnn_utils::getAlignment(requantize_multiplier_tensor))) + .setyDesc(cudnn_utils::getTensorDescriptor(quantized_output.sizes(), quantized_output.strides(), CUDNN_DATA_INT8, 'r', cudnn_utils::getAlignment(quantized_output))) + .setpwDesc(cudnn_utils::getPointWiseMulDescriptor(at::native::getCudnnDataType(requantize_multiplier_tensor))) .build(); // std::cout << "operator:" << requant_op.describe() << std::endl; - std::array ops = {&conv_op, &requant_op}; + std::vector ops{&conv_op}; + if (bias_.has_value()) { + ops.emplace_back(&(bias_mult_op.value())); + ops.emplace_back(&(sum_conv_bias_op.value())); + } + if (kReluFused) { + ops.emplace_back(&(relu_op.value())); + } + ops.emplace_back(&requant_op); auto opGraph = cudnn_frontend::OperationGraphBuilder() .setHandle(handle) @@ -258,8 +278,8 @@ void raw_cudnn_convolution_forward_out( auto& fallback_list = fallback.getFallbackList(); cudnn_frontend::EngineConfigList filtered_configs; - filterEngineConfigs(engine_configs, filtered_configs, deterministic, allow_tf32, input.scalar_type()); - filterEngineConfigs(fallback_list, filtered_configs, deterministic, allow_tf32, input.scalar_type()); + cudnn_utils::filterEngineConfigs(engine_configs, filtered_configs, deterministic, allow_tf32, at::kChar); + cudnn_utils::filterEngineConfigs(fallback_list, filtered_configs, deterministic, allow_tf32, at::kChar); for (auto &cfg : engine_configs) { try { @@ -271,101 +291,159 @@ void raw_cudnn_convolution_forward_out( run(plan_desc); execution_plan_cache[key] = plan_desc; return; - } catch (cudnn_frontend::cudnnException &e) {std::cout << "cudnn error:" << e.what() << std::endl;} catch(CuDNNError &e) { std::cout << "other error" << e.what() << std::endl;} + } catch (cudnn_frontend::cudnnException &e) {std::cout << "cudnn error:" << e.what() << std::endl;} catch(c10::CuDNNError &e) { std::cout << "other error" << e.what() << std::endl;} } - TORCH_CHECK(false, "Unable to find an engine to execute this computation"); + + TORCH_CHECK(false, "Unable to find an engine to execute this computation in Quantized Conv2D Cudnn"); } // // output Tensor will be a clampped int8 Tensor // both act and weight will be int8 Tensor -// +/* +Numerics: +out_fp32 = conv_fp32(act_fp32, w_fp32, …) + = act_fp32 * w_fp32 + bias_fp32 +act_int8 = act_fp32 / act_scale + act_zero_point +w_int8 = w_fp32 / w_scale + w_zero_point +out_int8 = out_fp32 / out_scale + out_zero_point +out_int8 = (act_fp32 * w_fp32 + [bias_fp32]) / out_scale + out_zero_point + = (act_int8 - act_zero_point) * act_scale * (w_int8 - w_zero_point) * w_scale / out_scale + out_zero_point + [bias_fp32 / out_scale] + = (act_int8 * w_int8 - act_int8 * w_zero_point - act_zero_point * w_int8 + act_zero_point * w_zero_point) * act_scale * w_scale / out_scale + out_zero_point + [bias_fp32 / out_scale] + = (if both act and weight are symmetrically quantized, int8, then act_zero_point = w_zero_point = 0) + = (act_int8 * w_int8 + [bias_fp32/(act_scale * w_scale)]) * act_scale * w_scale / out_scale + = (act_int8 * w_int8 + [bias_fp32/(act_scale * w_scale)]) / (out_scale / (act_scale * w_scale)) + = requantize((act_int8 * w_int8 + [bias_fp32/(act_scale * w_scale)]), out_scale / (act_scale * w_scale)) +*/ template -Tensor raw_cudnn_convolution_forward( - const Tensor& act, - const Tensor& weight, - IntArrayRef padding, - IntArrayRef stride, - IntArrayRef dilation, - int64_t groups, - bool benchmark, 
- bool deterministic, - bool allow_tf32, - float requantize_multiplier) { - // TODO: add dimension validations for input/weight/bias - const int N = act.size(0); - const int C = act.size(1); - const int D = kSpatialDim == 3 ? act.size(2) : 1; - const int H = act.size(kSpatialDim); - const int W = act.size(kSpatialDim + 1); - const int M = weight.size(0); // output channels - std::vector kernel_size = {weight.size(2), weight.size(3)}; - at::SmallVector output_shape; - output_shape = MakeConvOutputShape(N, M, {H, W}, kernel_size, stride, padding, dilation); - Tensor output_int8 = at::empty( +template +at::Tensor PackedConvWeightCudnn::apply_impl( + const at::Tensor& act, + double output_scale, + int64_t output_zero_point) { + const auto batch_size = kSpatialDim == 2 ? act.size(0) : 1; + const auto num_input_channels = act.size(kSpatialDim - 1); + const auto H = act.size(kSpatialDim); + const auto W = act.size(kSpatialDim + 1); + const auto num_output_channels = maybe_padded_weight_.size(0); // output channels + std::vector kernel_size = {maybe_padded_weight_.size(2), maybe_padded_weight_.size(3)}; + at::SmallVector output_shape = MakeConvOutputShape(batch_size, num_output_channels, {H, W}, + kernel_size, stride_, padding_, dilation_); + at::Tensor quantized_output = at::_empty_affine_quantized( output_shape, - at::device(at::kCUDA).dtype(at::kChar), - at::MemoryFormat::ChannelsLast - ); - - raw_cudnn_convolution_forward_out( - output_int8, act, weight, - padding, stride, dilation, groups, - benchmark, - deterministic, - allow_tf32, - requantize_multiplier); - return output_int8; + at::device(at::kCUDA).dtype(at::ScalarType::QInt8), + output_scale, + output_zero_point, + at::MemoryFormat::ChannelsLast); + + // cudnn v8.4.0 expects conv2d's int8 activation tensor's input channels to be a multiple of 4. if it is not + // we need to explicitly pad it to a multiple of 4 ourselves as cudnn does not currently support padding. 
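For the multiple-of-4 channel requirement just described, the pad amount and the argument layout handed to at::pad come out as in the sketch below; pad pairs are specified starting from the last dimension, so for an NCHW activation the sixth entry grows the channel dimension. This shows only the arithmetic in isolation (channel_pad_spec_nchw is an illustrative helper, not a function in the kernel):

    #include <cstdint>
    #include <vector>

    // Pad spec for an NCHW int8 activation whose channel count is not a
    // multiple of 4: one (low, high) pair per dimension, last dimension first,
    // i.e. {W_lo, W_hi, H_lo, H_hi, C_lo, C_hi, N_lo, N_hi}.
    std::vector<int64_t> channel_pad_spec_nchw(int64_t num_input_channels) {
      const int64_t remainder = num_input_channels % 4;
      const int64_t num_slices = remainder == 0 ? 0 : 4 - remainder;
      return {0, 0, 0, 0, 0, num_slices, 0, 0};  // pad only the high side of C
    }

With num_input_channels = 3 this yields {0, 0, 0, 0, 0, 1, 0, 0}, matching the {0, 0, 0, 0, 0, num_slices, 0, 0} literal used in the kernel above.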
+ // TODO: when and if cudnn enables padding in their operators, we can remove padding on our end; + // currently, limit padding support to groups=1 (ungrouped conv) + // TODO: implement this for groups > 1; should be straightforward since we're only padding a single dimension + auto act_maybe_padded = act; + if (num_input_channels % 4 != 0) { + int8_t num_slices = 4 - num_input_channels % 4; // number of slices we need to pad + act_maybe_padded = at::pad(act, {0, 0, 0, 0, 0, num_slices, 0, 0}, "constant", 0); + } + apply_impl_helper( + quantized_output, act_maybe_padded.to(c10::MemoryFormat::ChannelsLast), output_scale); + + // need to return sliced tensor if output_channels was padded + if (num_unpadded_output_channels_ != maybe_padded_weight_.size(0)) { + return quantized_output.slice(1, 0, num_unpadded_output_channels_); + } + return quantized_output; } +template +at::Tensor PackedConvWeightCudnn::apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) { + return apply_impl(input, output_scale, output_zero_point); +} -template -class QConvInt8 final { +template +at::Tensor PackedConvWeightCudnn::apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) { + return apply_impl(input, output_scale, output_zero_point); +} + +template at::Tensor PackedConvWeightCudnn<2>::apply( + const at::Tensor& act, + double output_scale, + int64_t output_zero_point); + +template at::Tensor PackedConvWeightCudnn<2>::apply_relu( + const at::Tensor& act, + double output_scale, + int64_t output_zero_point); + +namespace at { +namespace native { +namespace { + +template +class QConv1dInt8 final { public: static Tensor run( Tensor act, - Tensor weight, - c10::optional bias, - torch::List stride, - torch::List padding, - torch::List dilation, - int64_t groups, + const c10::intrusive_ptr>& packed_weight, double output_scale, int64_t output_zero_point) { - TORCH_CHECK(!kReluFused, "conv relu not supported yet"); - TORCH_CHECK(!bias.has_value(), "bias is not supported yet"); - act = act.contiguous(c10::MemoryFormat::ChannelsLast); - weight = weight.contiguous(c10::MemoryFormat::ChannelsLast); - // requantization - // out_int8 = act_int8 * weight_int8 * act_scale * w_scale / output_scale - auto act_scale = act.q_scale(); - auto weight_scale = weight.q_scale(); - auto requantize_multiplier = act_scale * weight_scale / output_scale; + at::Tensor output; + // we currently use conv2d kernel for conv1d by making the input and weight tensors + // 4D rather than 3D. 
we add a dummy width dimension of size 1 + // N, C, L -> N, C, 1, L + act = act.unsqueeze(-2); + if (kReluFused) { + output = packed_weight->apply_relu(act, output_scale, output_zero_point); + } else { + output = packed_weight->apply(act, output_scale, output_zero_point); + } + // N, C, 1, L -> N, C, L + return output.squeeze_(-2); + } +}; +template +class QConvInt8 final { + public: + static at::Tensor run( + at::Tensor act, + const c10::intrusive_ptr>& packed_weight, + double output_scale, + int64_t output_zero_point) { + TORCH_CHECK(kSpatialDim == 1 || kSpatialDim == 2, "Error in quantized cudnn conv2d operator: " + "Expected kSpatialDim == 1 || kSpatialDim == 2; received kSpatialDim=", kSpatialDim); // TODO: check all zero_points are zero/all tensors are symmetrically quantized - Tensor output_int8_requant = raw_cudnn_convolution_forward( - act.int_repr(), weight.int_repr(), - IntArrayRef(padding.vec()), IntArrayRef(stride.vec()), IntArrayRef(dilation.vec()), groups, - false /* benchmark */, - true /* deterministic */, - false /* allow_tf32 */, - requantize_multiplier - ); - - // clamping is done in cudnn kernels, which probably defaults to -128, 127 - // for int8 dtype, we may need to add new operators to the graph if - // we want to change the clamping - Tensor quantized_output = at::_make_per_tensor_quantized_tensor(output_int8_requant, output_scale, output_zero_point); - return quantized_output; + if (kReluFused) { + return packed_weight->apply_relu(act, output_scale, output_zero_point); + } else { + return packed_weight->apply(act, output_scale, output_zero_point); + } } }; TORCH_LIBRARY_IMPL(quantized, QuantizedCUDA, m) { - m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_cudnn"), QConvInt8<2, false>::run); + // the cpu conv1d doesn't use the quantized::conv1d*.new variant for packed weights. instead it just uses + // quantized::conv1d for packed weights (see quantized/library.cpp). + // this is inconsistent with what has been done for conv2d where new variants use packed weights, and + // old variant does not. 
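QConv1dInt8 above reuses the 2D kernel by giving the activation a dummy spatial dimension of size 1. The round trip is just two reshapes; a minimal shape-only sketch with illustrative helper names, detached from the packed-weight plumbing:

    #include <ATen/ATen.h>

    // Shape-only illustration of the conv1d-as-conv2d trick:
    // (N, C, L) -> (N, C, 1, L) before the 2D kernel, then drop the dummy dim.
    at::Tensor to_conv2d_layout(const at::Tensor& act_ncl) {
      return act_ncl.unsqueeze(-2);   // N, C, L    -> N, C, 1, L
    }

    at::Tensor to_conv1d_layout(at::Tensor out_nc1l) {
      return out_nc1l.squeeze_(-2);   // N, C, 1, L -> N, C, L
    }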
we adopt this inconsistency for now to be consistent with QuantizedCPU's conv1d + // and will eventually deprecate the old variants + m.impl(TORCH_SELECTIVE_NAME("quantized::conv1d"), QConv1dInt8::run); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv1d_relu"), QConv1dInt8::run); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d.new"), QConvInt8<2, false>::run); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_relu.new"), QConvInt8<2, true>::run); } } // namespace -}} // at::native +} // namespace native +} // namespace at + #endif // HAS_CUDNN_V8 #endif // AT_CUDNN_ENABLED diff --git a/aten/src/ATen/native/quantized/cudnn/Linear.cpp b/aten/src/ATen/native/quantized/cudnn/Linear.cpp new file mode 100644 index 000000000000..9314d9ee9293 --- /dev/null +++ b/aten/src/ATen/native/quantized/cudnn/Linear.cpp @@ -0,0 +1,374 @@ +#ifdef USE_CUDA +#include // for the definition of AT_CUDNN_ENABLED + +#if AT_CUDNN_ENABLED() + +#include +#include + +#if HAS_CUDNN_V8() + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +// TODO: there is a table from input dtype and weight dtype to operator dtype, +// we can derive the operator dtype based on input dtype +cudnn_frontend::MatMulDesc_v8 getLinearDescriptor(cudnnDataType_t dataType) { + return cudnn_frontend::MatMulDescBuilder() + .setMathPrecision(dataType) + .build(); +} + +// FIXME: make this thread-safe by reusing the benchmark cache in Conv_v7.cpp +namespace { +// we currently set the maximum number of input dimensions to 5 +// this can be increased, if necessary +constexpr uint8_t max_num_input_dim = 5; +struct LinearParams { + c10::DeviceIndex device_id; + cudnnDataType_t dataType; + int input_size[max_num_input_dim]; + uint8_t input_dim; + at::MemoryFormat memory_format; + int64_t weight_size[2]; + bool deterministic; + bool allow_tf32; +}; +struct CacheKey { + LinearParams params; + uint8_t input_alignment; + uint8_t weight_alignment; + uint8_t output_alignment; + // default to -1 when no bias + int8_t bias_alignment; + bool kReluFused; +}; +void setLinearParams( + LinearParams* params, const at::Tensor& input, const at::Tensor& weight, + bool deterministic, bool allow_tf32) { + // operator datatype needs to be int32 for int8 matmul, but we can + // set the datatype for output tensor to int32 or fp32 + memset(params, 0, sizeof(LinearParams)); + params->device_id = at::cuda::current_device(); + params->dataType = CUDNN_DATA_INT32; + params->input_dim = input.dim(); + params->memory_format = input.suggest_memory_format(); + for (int i = 0; i < params->input_dim; ++i) { + params->input_size[i] = input.sizes()[i]; + } + for (int i = 0; i < 2; ++i) { + params->weight_size[i] = weight.sizes()[i]; + } + params->deterministic = deterministic; + params->allow_tf32 = allow_tf32; +} +std::unordered_map, at::native::ParamsEqual> execution_plan_cache; +} +// TODO: we can use cudnn_frontend::ExecutionPlanCache when it supports caching +// multiple operators +// reference: https://github.com/NVIDIA/cudnn-frontend/blob/main/samples/conv_sample.cpp#L293 +//static cudnn_frontend::ExecutionPlanCache plan_cache("sample_cache"); + +// currently we only support int8 symmetric (zero_point = 0 for inputs and output) quantized linear op +// We implement relu(act_int8 * transpose(w_int8) + [bias_fp32/(act_scale * w_scale] ) * ( act_scale * w_scale / out_scale ) +// which requires 5 cudnn ops (1 matmul, 2 multiplication, 1 add, and 1 relu ops) +// matmul op: linear_op +// 
Multiplication ops: rhs_mult_op, requant_op +// Addition op: add_op +// Relu op: relu_op +template +void PackedLinearWeightCudnn::apply_impl_helper(const at::Tensor& quantized_output, const at::Tensor& input, double output_scale) { + if (quantized_output.numel() == 0) { + return; + } + at::Tensor linear_output = at::empty(quantized_output.sizes(), at::device(at::kCUDA).dtype(at::kFloat)); + auto act_scale = input.q_scale(); + auto weight_scale = orig_weight.q_scale(); + auto requantize_multiplier = act_scale * weight_scale / output_scale; + at::Tensor requantize_multiplier_tensor = at::full(quantized_output.sizes(), requantize_multiplier, at::device(at::kCUDA).dtype(at::kFloat)); + requantize_multiplier_tensor.fill_(requantize_multiplier); + c10::optional bias_multiplier_tensor; + c10::optional broadcasted_bias; + if (bias_.has_value()) { + // the input bias is a 1-D tensor whose size is the same as the size of the last dimension of quantized_output + // we need to add trailing dimensions in order to properly broadcast bias, otherwise broadcast_to will fail. + // the number of trailling dimensions is quantized_output.dim() - 2. We also prepend a leading dimension for clarity + std::vector new_size(quantized_output.dim(), 1); + new_size.back() = bias_.value().size(0); + broadcasted_bias = bias_.value().clone().reshape(new_size); + broadcasted_bias.value() = broadcasted_bias.value().broadcast_to(quantized_output.sizes()).contiguous(); + bias_multiplier_tensor = at::empty(quantized_output.sizes(), at::device(at::kCUDA).dtype(at::kFloat)); + auto bias_multiplier = 1.0 / (act_scale * weight_scale); + bias_multiplier_tensor.value().fill_(bias_multiplier); + } + + cudnnHandle_t handle = at::native::getCudnnHandle(); + CacheKey key; + // memset is needed here because there is implicit packing added for CacheKey, and this can result in uninitialized padded values that are + // used for hashing (see how at::native::ParamsHash is defined). without memset, we can potentially come across a situation where two + // CacheKey objects have the same user defined parameters, but + // different padded values, resulting in different hash outputs. + memset(&key, 0, sizeof(key)); + bool deterministic{true}; + bool allow_tf32{false}; + setLinearParams(&key.params, input, orig_weight, deterministic, allow_tf32); + + key.input_alignment = cudnn_utils::getAlignment(input); + key.output_alignment = cudnn_utils::getAlignment(linear_output); + key.weight_alignment = cudnn_utils::getAlignment(orig_weight); + if (bias_.has_value()) { + key.bias_alignment = cudnn_utils::getAlignment(broadcasted_bias.value()); + } else { + key.bias_alignment = -1; + } + key.kReluFused = kReluFused; + // the matmul operation is input * transpose(weight), so we will work with the transposed weight + auto weight_transposed = transpose(orig_weight, 0, 1); + // cudnn expects tensors to be at least 3D. weight_transposed is currently 2D. 
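The bias path above turns a 1-D [out_features] bias into something broadcastable against the full output by first reshaping it to a shape of all ones except the last dimension. In isolation, that shape manipulation looks roughly like the sketch below (broadcast_bias_like_output is an illustrative name; the real code also scales the result by the bias multiplier and keeps everything on CUDA):

    #include <ATen/ATen.h>
    #include <vector>

    // Expand a 1-D bias of size [out_features] to the full output shape,
    // e.g. [1, batch, out_features], by inserting leading 1s before broadcasting.
    at::Tensor broadcast_bias_like_output(const at::Tensor& bias_1d,
                                          const at::Tensor& output) {
      std::vector<int64_t> new_size(output.dim(), 1);
      new_size.back() = bias_1d.size(0);  // bias stays on the last dimension
      return bias_1d.reshape(new_size)
                    .broadcast_to(output.sizes())
                    .contiguous();
    }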
we will create a 3D view + // by prepending a leading dummy dimension (cudnn expects leading dimensions to be the dummy dimensions) + std::vector new_sizes(3, 1); + new_sizes.back() = weight_transposed.size(1); + new_sizes[1] = weight_transposed.size(0); + weight_transposed = weight_transposed.view(new_sizes); + + auto run = [&](cudnn_frontend::ManagedOpaqueDescriptor plan_desc) { + auto workspace_size = 0; + auto workspace = at::empty({workspace_size}, input.options().dtype(at::kByte)); + std::vector data_ptrs; + std::vector uids; + data_ptrs.reserve(9); + uids.reserve(9); + data_ptrs = {input.data_ptr(), weight_transposed.data_ptr(), + requantize_multiplier_tensor.data_ptr(), quantized_output.data_ptr()}; + uids = {'x', 'w', 's', 'r'}; + if (bias_.has_value()) { + data_ptrs.insert(data_ptrs.end(), {broadcasted_bias.value().data_ptr(), bias_multiplier_tensor.value().data_ptr(), + broadcasted_bias.value().data_ptr(), broadcasted_bias.value().data_ptr(), linear_output.data_ptr()}); + uids.insert(uids.end(), {'b', 'c', 'd', 'n', 'e'}); + } + auto variantPack = cudnn_frontend::VariantPackBuilder() + .setWorkspacePointer(workspace.data_ptr()) + .setDataPointers(uids.size(), data_ptrs.data()) + .setUids(uids.size(), uids.data()) + .build(); + auto variant_pack_desc = variantPack.get_raw_desc(); + AT_CUDNN_CHECK(cudnnBackendExecute(handle, plan_desc->get_backend_descriptor(), variant_pack_desc)); + }; + + auto search = execution_plan_cache.find(key); + if (search != execution_plan_cache.end()) { + cudnn_frontend::ManagedOpaqueDescriptor plan_desc = search->second; + run(plan_desc); + return; + } + + // linear_op computes act_int8 * tranpose(w_int8) (matrix multiplication) + // where act_int8 and w_int8 are the input and weight variables, resp. + // output is a fp32 tensor + auto linear_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR) + .setaMatDesc(cudnn_utils::getTensorDescriptor(input.sizes(), input.strides(), CUDNN_DATA_INT8, 'x', key.input_alignment)) + .setbMatDesc(cudnn_utils::getTensorDescriptor(weight_transposed.sizes(), weight_transposed.strides(), CUDNN_DATA_INT8, 'w', key.weight_alignment)) + .setcMatDesc(cudnn_utils::getTensorDescriptor(linear_output, 'y', key.output_alignment, true)) + .setmatmulDesc(getLinearDescriptor(key.params.dataType)) + .build(); + // std::cout << "operator:" << linear_op.describe() << std::endl; + + c10::optional bias_mult_op; + c10::optional sum_linear_bias_op; + if (bias_.has_value()) { + // we can't directly assign bias_mult_op becauase operator= is deleted for cudnn_frontend::Operation; + // alternatively, I think we can use std::unique_ptr and dynamically allocate these builder ops + // but here, we chose to do it statically. 
c10::optional::emplace() enables this approach + + // bias_mult_op computes bias_fp32 / (act_scale * w_scale) or bias_fp32 * (1 / (act_scale * w_scale)) + // where bias_multiplier = (1 / (act_scale * w_scale)) + // output is a fp32 tensor + // we use inplace operation here where the output is assigned to the input + bias_mult_op.emplace(cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(cudnn_utils::getTensorDescriptor(broadcasted_bias.value(), 'b', cudnn_utils::getAlignment(broadcasted_bias.value()))) + .setbDesc(cudnn_utils::getTensorDescriptor(bias_multiplier_tensor.value(), 'c', cudnn_utils::getAlignment(bias_multiplier_tensor.value()))) + // TODO: I think we should be able to make this a virtual tensor, but we would need cudnn to support + // setbdesc(ManagedOpaqueDescriptor const &raw_tensor) first + .setyDesc(cudnn_utils::getTensorDescriptor(broadcasted_bias.value(), 'd', cudnn_utils::getAlignment(broadcasted_bias.value()))) + .setpwDesc(cudnn_utils::getPointWiseMulDescriptor(at::native::getCudnnDataType(bias_multiplier_tensor.value()))) + .build()); + + // computes (act_int8 * w_int8 + [bias_fp32/(act_scale * w_scale)]) + // where the 1st and 2nd summands is linear_output and broadcasted_bias, resp. + // output is a fp32 tensor + // we use inplace operation here where the output is assigned to the input + sum_linear_bias_op.emplace(cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(linear_op.getOutputTensor()) + // TODO: An additional entry for broadcasted_bias in the uid-data_ptr pairing + // appears to be needed in the current version of cudnn (8.4.0). Without it, some + // test cases are failing. NVIDIA is currently investigating this issue. + // When this issue is fixed, we can change 'n' back to 'd' and remove the additional entry in uid and data_ptrs in variant pack above + .setbDesc(cudnn_utils::getTensorDescriptor(broadcasted_bias.value(), 'n', cudnn_utils::getAlignment(broadcasted_bias.value()))) + .setyDesc(cudnn_utils::getTensorDescriptor(linear_output, 'e', key.output_alignment)) + .setpwDesc(cudnn_utils::getPointWiseAddDescriptor(at::native::getCudnnDataType(broadcasted_bias.value()))) + .build()); + } + + // relu_op computes relu(act_int8 * w_int8 + [bias_fp32/(act_scale * w_scale)] + // or relu(act_int8 * w_int8) if bias is not present. + // output is a fp32 tensor + c10::optional relu_op; + std::shared_ptr tensor2requant_ptr = bias_.has_value() ? sum_linear_bias_op.value().getOutputTensor() : linear_op.getOutputTensor(); + if (kReluFused) { + // we use inplace operation here where the output is assigned to the input + relu_op.emplace(cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(tensor2requant_ptr) + .setyDesc(cudnn_utils::getTensorDescriptor(linear_output, 'f', key.output_alignment, true)) + .setpwDesc(cudnn_utils::getPointWiseReluDescriptor(at::native::getCudnnDataType(linear_output))) + .build()); + } + + // requant_op computes relu(act_int8 * w_int8 + [bias_fp32/(act_scale * w_scale)]) / (out_scale / (act_scale * w_scale)) + // or relu(act_int8 * w_int8) / (out_scale / (act_scale * w_scale))) if bias is not present. + // output is a fp32 tensor + auto requant_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(kReluFused ? 
relu_op.value().getOutputTensor() : tensor2requant_ptr) + .setbDesc(cudnn_utils::getTensorDescriptor(requantize_multiplier_tensor, 's', cudnn_utils::getAlignment(requantize_multiplier_tensor))) + .setyDesc(cudnn_utils::getTensorDescriptor(quantized_output.sizes(), quantized_output.strides(), CUDNN_DATA_INT8, 'r', cudnn_utils::getAlignment(quantized_output))) + .setpwDesc(cudnn_utils::getPointWiseMulDescriptor(at::native::getCudnnDataType(requantize_multiplier_tensor))) + .build(); + // // std::cout << "operator:" << requant_op.describe() << std::endl; + + std::vector ops{&linear_op}; + if (bias_.has_value()) { + ops.emplace_back(&(bias_mult_op.value())); + ops.emplace_back(&(sum_linear_bias_op.value())); + } + if (kReluFused) { + ops.emplace_back(&(relu_op.value())); + } + ops.emplace_back(&requant_op); + + auto opGraph = cudnn_frontend::OperationGraphBuilder() + .setHandle(handle) + .setOperationGraph(ops.size(), ops.data()) + .build(); + // std::cout << "opGraph: " << opGraph.describe() << std::endl; + + auto heuristics = cudnn_frontend::EngineHeuristicsBuilder() + .setOperationGraph(opGraph) + .setHeurMode(CUDNN_HEUR_MODE_INSTANT) + .build(); + auto fallback = cudnn_frontend::EngineFallbackListBuilder() + .setOperationGraph(opGraph) + .setOperation(CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR) + .build(); + + auto& engine_configs = heuristics.getEngineConfig(heuristics.getEngineConfigCount()); + auto& fallback_list = fallback.getFallbackList(); + + cudnn_frontend::EngineConfigList filtered_configs; + cudnn_utils::filterEngineConfigs(engine_configs, filtered_configs, deterministic, allow_tf32, at::kChar); + cudnn_utils::filterEngineConfigs(fallback_list, filtered_configs, deterministic, allow_tf32, at::kChar); + + for (auto &cfg : engine_configs) { + try { + auto plan = cudnn_frontend::ExecutionPlanBuilder() + .setHandle(handle) + .setEngineConfig(cfg) + .build(); + auto plan_desc = plan.get_desc(); + run(plan_desc); + execution_plan_cache[key] = plan_desc; + return; + } catch (cudnn_frontend::cudnnException &e) {std::cout << "cudnn error:" << e.what() << std::endl;} catch(c10::CuDNNError &e) { std::cout << "other error" << e.what() << std::endl;} + } + + TORCH_CHECK(false, "Unable to find an engine to execute this computation Quantized Linear Cudnn"); +} + +// output Tensor will be a clampped int8 Tensor +// both act and weight will be int8 Tensor +// Numerics are the same as conv (see aten/src/ATen/native/quantized/Conv.cpp): +template +at::Tensor PackedLinearWeightCudnn::apply_impl( + const at::Tensor& act, + double output_scale, + int64_t output_zero_point) { + std::vector original_output_shape{act.sizes().vec()}; // 2D + original_output_shape.back() = orig_weight.size(0); // output channels + // cudnn expects tensors to be at least 3D. we will prepend a dummy dimension for quantized_output + std::vector output_shape(3, 1); + output_shape[1] = original_output_shape[0]; + output_shape[2] = original_output_shape[1]; + at::Tensor quantized_output = at::_empty_affine_quantized( + output_shape, + at::device(at::kCUDA).dtype(at::ScalarType::QInt8), + output_scale, + output_zero_point); + // cudnn expects tensors to be at least 3D. act is currently 2D. 
we will create a 3D view + std::vector new_sizes(3, 1); + // cudnn expects leading dimensions to be the dummy dimensions + new_sizes.back() = act.sizes().back(); + new_sizes[1] = act.size(0); + apply_impl_helper( + quantized_output, act.view(new_sizes), output_scale); + return quantized_output.view(original_output_shape); +} + +at::Tensor PackedLinearWeightCudnn::apply( + at::Tensor input, + double output_scale, + int64_t output_zero_point) { + return apply_impl(input, output_scale, output_zero_point); +} + +at::Tensor PackedLinearWeightCudnn::apply_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point) { + return apply_impl(input, output_scale, output_zero_point); +} + +namespace at { +namespace native { +namespace { + +template +class QLinearInt8 final { + public: + static at::Tensor run( + at::Tensor act, + const c10::intrusive_ptr& packed_weight, + double output_scale, + int64_t output_zero_point) { + // TODO: check all zero_points are zero/all tensors are symmetrically quantized + if (kReluFused) { + return packed_weight->apply_relu(act, output_scale, output_zero_point); + } else { + return packed_weight->apply(act, output_scale, output_zero_point); + } + } +}; + +TORCH_LIBRARY_IMPL(quantized, QuantizedCUDA, m) { + m.impl(TORCH_SELECTIVE_NAME("quantized::linear"), QLinearInt8::run); + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_relu"), QLinearInt8::run); +} + +} // namespace +} // namespace native +} // namespace at + + +#endif // HAS_CUDNN_V8 +#endif // AT_CUDNN_ENABLED +#endif // USE_CUDA diff --git a/aten/src/ATen/native/quantized/cudnn/Pooling.cpp b/aten/src/ATen/native/quantized/cudnn/Pooling.cpp new file mode 100644 index 000000000000..8335eeeca2ff --- /dev/null +++ b/aten/src/ATen/native/quantized/cudnn/Pooling.cpp @@ -0,0 +1,248 @@ +#include +#ifdef USE_CUDA +#include // for the definition of AT_CUDNN_ENABLED + +#if AT_CUDNN_ENABLED() +#include +#include +#include +#include +#include +#endif // AT_CUDNN_ENABLED +#endif // USE_CUDA + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace at { +namespace native { +namespace { +// TODO: This function is the same as that of qpool.cpp. We should refactor this into quantized directory +// so that we don't need to duplicate the function +void check_maxpool2d_params( + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation) { + TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2, + "Expected 1d or 2d kernel size, got ", kernel_size.size()); + TORCH_CHECK(stride.empty() || stride.size() == 2, + "Expected no strides or 2d strides, got", stride.size()); + TORCH_CHECK(padding.size() == 1 || padding.size() == 2, + "Expected 1d or 2d padding, got ", padding.size()); + TORCH_CHECK(dilation.size() == 1 || dilation.size() == 2, + "Expected 1d or 2d dilation, got ", dilation.size()); +} +} + +// The current implementation of quantized cuda adaptive average pooling uses the following: +// dequant -> fp32 adaptive average pooling -> quant. This is the same numerically as +// quantized adaptive average pooling. This is not the ideal implementation, as we desire to +// operate on the quantized values directly. +// However, we are currently blocked on this as we are waiting for cudnn's 8.5.0 release, which is anticipated +// to support adaptive average pooling. When that support is made available, we will use it directly. 
TODO +Tensor adaptive_avg_pool2d_quantized_cuda( + const at::Tensor& input, + IntArrayRef output_size) { +// TODO: re-enable these cudnn preprocessors like quantized_max_pool2d_cudnn below when we implement this function with cudnn +#ifdef USE_CUDA +// #if AT_CUDNN_ENABLED() +// #if HAS_CUDNN_V8() + // TODO: limit this to per tensor quantized tensors for now, though should be easy to adapt + // to per channel quantized tensors + TORCH_CHECK(input.qscheme() == at::kPerTensorAffine, "adaptive_avg_pool2d_quantized_cuda only supports per tensor quantized tensors"); + auto input_fp32 = at::dequantize(input); + auto result_fp32 = at::adaptive_avg_pool2d(input_fp32, output_size); + return at::quantize_per_tensor(result_fp32, input.q_scale(), input.q_zero_point(), input.scalar_type()); +#else // USE_CUDA + AT_ERROR("at::native::adaptive_avg_pool2d_quantized_cuda: ATen not compiled with USE_CUDA support"); + return Tensor{}; // never reached, placates the compiler +#endif +} + +// Currently we support 4D and 3D input (qx) tensors, the latter of which is supported for +// legacy reasons. The first dimension of a 4D input tensor is the batch size. +// For a 3D tensor, there is no batch size dimension -- it can be viewed as a single batch. +// cudnn's 2D pooling operation requires the input and output to be 4D tensors, so we must cast +// any 3D tensors to 4D prior to using cudnn +// This implementation currently uses the v7 cudnn APIs as v8 cudnn APIs are not yet available for +// pooling operations. +// Consult https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnPoolingForward for +// documentation on the APIs +// Currently, it appears there is no cudnn support for dilated pooling -- we will +// submit a feature request for this with cudnn +// TODO: ideally, we would like to use structured kernel support here so we do not have to repeat +// the input checks, however, that would require us to implement max_pool2d_with_indices_out_quantized_cuda +// based on how the dispatch table is currently constructed in native_functions.yaml. Currently, +// there is no support for producing indices with cudnn max pooling, so until that becomes available, this cannot be done.
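Before the kernel itself, it may help to spell out the output-size arithmetic that the shape checks below rely on. The following is a minimal sketch of what at::native::pooling_output_shape computes for a single spatial dimension; the helper name pooled_size_1d and the sample numbers are illustrative only and not part of this change.

    // Simplified per-dimension pooling output size (symmetric padding assumed).
    // With ceil_mode the division rounds up, but the last window must still
    // start inside the input or its left padding.
    int64_t pooled_size_1d(int64_t in, int64_t kernel, int64_t pad,
                           int64_t stride, int64_t dilation, bool ceil_mode) {
      int64_t numerator = in + 2 * pad - dilation * (kernel - 1) - 1 +
                          (ceil_mode ? stride - 1 : 0);
      int64_t out = numerator / stride + 1;
      if (ceil_mode && (out - 1) * stride >= in + pad) {
        --out;  // last window would start past the padded input; drop it
      }
      return out;
    }
    // e.g. in = 7, kernel = 2, pad = 0, stride = 2, dilation = 1 gives
    // out = 3 with ceil_mode = false and out = 4 with ceil_mode = true.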
+Tensor quantized_max_pool2d_cudnn( + const Tensor& qx, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode) { +#ifdef USE_CUDA +#if AT_CUDNN_ENABLED() +#if HAS_CUDNN_V8() + check_maxpool2d_params( + kernel_size, + stride, + padding, + dilation); + if (stride.empty()) { + stride = kernel_size; + } + auto ndim = qx.dim(); + TORCH_CHECK( + ndim == 3 || ndim == 4, "Expecting the input tensor of rank 3 or 4."); + TORCH_CHECK( + kernel_size.size() == 2, + "quantized_max_pool2d_cudnn(): Expected kernel_size to be 2-dimensional: got ", + kernel_size.size()); + TORCH_CHECK( + stride.size() == 2, + "quantized_max_pool2d_cudnn(): Expected stride to be 2-dimensional: got ", + stride.size()); + TORCH_CHECK( + dilation.size() == 2, + "quantized_max_pool2d_cudnn(): Expected dilation to be 2-dimensional: got ", + dilation.size()); + TORCH_CHECK( + dilation[0] == 1 && dilation[1] == 1, + "quantized_max_pool2d_cudnn(): Expected dilation=[1, 1] (cudnn does not currently support dilation[i] != 1), got", + dilation); + TORCH_CHECK( + padding.size() == 2, + "quantized_max_pool2d_cudnn(): Expected padding to be 2-dimensional: got ", + padding.size()); + + auto input = qx; + if (ndim == 4) { + input = qx.to(MemoryFormat::ChannelsLast); + } else { // 3D + std::vector new_sizes{1, qx.size(0), qx.size(1), qx.size(2)}; + input = qx.view(new_sizes); + } + int batch_size = input.size(0); + int64_t inC = input.size(1); + int64_t inH = input.size(2); + int64_t inW = input.size(3); + // Check output dimensions. + int64_t padH = padding[0]; + int64_t padW = padding[1]; + int64_t kH = kernel_size[0]; + int64_t kW = kernel_size[1]; + int64_t strideH = stride[0]; + int64_t strideW = stride[1]; + TORCH_CHECK( + kH > 0 && kW > 0, + "qnnpack_maxpool2d(): kernel_size should be greater than zero."); + TORCH_CHECK( + strideH > 0 && strideW > 0, + "qnnpack_maxpool2d(): strides should be greater than zero."); + int64_t dilationH = dilation[0]; + int64_t dilationW = dilation[1]; + int64_t outC = inC; + int64_t outH = pooling_output_shape(inH, kH, padH, strideH, dilationH, ceil_mode); + int64_t outW = pooling_output_shape(inW, kW, padW, strideW, dilationW, ceil_mode); + TORCH_CHECK(outH > 0 && outW > 0, + "Given input size: (", + inC, "x", inH, "x", inW, + "). Calculated output size: (", + outC, "x", outH, "x", outW, + "). Output size is too small."); + + std::vector output_shape; + if (ndim == 3) { + // cudnn requires 4D input and output for 2D pooling, so we prepend a dummy dimension + // whose size represents the batch size (1) + output_shape = {1, outC, outH, outW}; + } else { + output_shape = {batch_size, outC, outH, outW}; + } + auto qy = at::_empty_affine_quantized( + output_shape, + at::device(at::kCUDA).dtype(at::ScalarType::QInt8), + input.q_scale(), + input.q_zero_point(), + (ndim == 4 ? MemoryFormat::ChannelsLast : MemoryFormat::Contiguous)); + + cudnnHandle_t handle = getCudnnHandle(); + cudnnPoolingDescriptor_t poolingDesc; + AT_CUDNN_CHECK_WITH_SHAPES(cudnnCreatePoolingDescriptor(&poolingDesc)); + AT_CUDNN_CHECK_WITH_SHAPES(cudnnSetPooling2dDescriptor( + poolingDesc, + CUDNN_POOLING_MAX_DETERMINISTIC, + CUDNN_NOT_PROPAGATE_NAN, + kernel_size[0], // kernel height + kernel_size[1], // kernel width + padding[0], // vertical padding + padding[1], // horizontal padding + stride[0], // vertical stride + stride[1])); // horizontal stride + + float one{1}; + float zero{0.0}; + TensorDescriptor xDesc; + at::MemoryFormat memory_format = (ndim == 4 ? 
at::MemoryFormat::ChannelsLast : at::MemoryFormat::Contiguous); + xDesc.set(input, memory_format); + TensorDescriptor yDesc; + yDesc.set(qy, memory_format); + cudnnPoolingForward(handle, + poolingDesc, + &one, + xDesc.desc(), + input.data_ptr(), + &zero, + yDesc.desc(), + qy.data_ptr()); + + // recall we casted our input and output to 4D if qx was 3D, so we recast it back to 3D prior to returning + return (ndim == 3 ? qy.view(std::vector(output_shape.begin() + 1, output_shape.end())) : qy); +#else // HAS_CUDNN_V8() + AT_ERROR("at::native::quantized_max_pool2d_cudnn: ATen not compiled with cuDNN v8 support"); + return Tensor{}; // never reached, placates the compiler +#endif // HAS_CUDNN_V8() +#else // AT_CUDNN_ENABLED() + AT_ERROR("at::native::quantized_max_pool2d_cudnn: ATen not compiled with cuDNN support"); + return Tensor{}; // never reached, placates the compiler +#endif // AT_CUDNN_ENABLED() +#else // USE_CUDA + AT_ERROR("at::native::quantized_max_pool2d_cudnn: ATen not compiled with USE_CUDA support"); + return Tensor{}; // never reached, placates the compiler +#endif +} + +// Keep the registry in the anonymous namespace. +namespace { +template +class QMaxPool_arr_args final { + public: + static Tensor run( + Tensor qx, + std::vector kernel_size, + std::vector stride, + std::vector padding, + std::vector dilation, + bool ceil_mode) { + TORCH_CHECK(kSpatialDim == 2, "quantized max pool is only valid for 2D") + return quantized_max_pool2d_cudnn(qx, kernel_size, stride, padding, + dilation, ceil_mode); + } +}; + +TORCH_LIBRARY_IMPL(quantized, QuantizedCUDA, m) { + m.impl(TORCH_SELECTIVE_NAME("quantized::max_pool2d"), TORCH_FN(QMaxPool_arr_args<2>::run)); +} + +} // namespace +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/quantized/cudnn/conv_prepack.cpp b/aten/src/ATen/native/quantized/cudnn/conv_prepack.cpp new file mode 100644 index 000000000000..7db1f7092f51 --- /dev/null +++ b/aten/src/ATen/native/quantized/cudnn/conv_prepack.cpp @@ -0,0 +1,217 @@ +#ifdef USE_CUDA +#include // for the definition of AT_CUDNN_ENABLED + +#if AT_CUDNN_ENABLED() + +#include + +#if HAS_CUDNN_V8() + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +template +c10::intrusive_ptr> PackedConvWeightCudnn< + kSpatialDim>:: + prepack( + at::Tensor weight, + c10::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose) { + // TODO: need to check out to implement groups for conv operator in Conv.cpp + TORCH_CHECK(groups == 1, "Quantized cudnn conv2d is currenty limited to groups = 1; received groups =", groups); + TORCH_CHECK(weight.qscheme() == c10::kPerTensorAffine, "Unsupported qscheme: ", toString(weight.qscheme())); + TORCH_CHECK( + kSpatialDim == 2, // 1D is packed as 2d, hence we don't need other checks + "cuDNN packing only supports 2D convolution."); + TORCH_CHECK( + weight.ndimension() == kSpatialDim + 2, + "Weights are expected to have ", + kSpatialDim + 2, + " dimensions"); + TORCH_CHECK( + stride.size() == kSpatialDim, + "stride should contain ", + kSpatialDim, + " elements for ", + kSpatialDim, + "D convolution."); + TORCH_CHECK( + padding.size() == kSpatialDim, + "quantized::conv_prepack (cudnn): Specify front/top/left padding only. 
" + "end/bottom/right padding assumed to be equal to front/top/left"); + TORCH_CHECK( + !transpose || output_padding.size() == kSpatialDim, + "quantized::conv_prepack: Specify top/left output padding " + "only. bottom/right padding assumed to be equal to top/left"); + TORCH_CHECK( + dilation.size() == kSpatialDim, + "quantized::conv_prepack (cudnn): dilation should contain ", + kSpatialDim, + " elements for ", + kSpatialDim, + "D convolution."); + TORCH_CHECK(!transpose, "cudNN quantized conv prepack expects transpose = false") + const int num_unpadded_output_channels = weight.size(0); + const auto qtype = weight.qscheme(); + if (bias.has_value()) { + TORCH_CHECK(bias.value().dim() == 1, "bias should be a vector (1D Tensor)"); + TORCH_CHECK( + bias.value().size(0) == num_unpadded_output_channels, + "bias should have K elements: " + std::to_string(num_unpadded_output_channels)); + // TODO: we create a broadcasted_bias tensor later so I think we don't need to make this contiguous here. + // we will revisit this when nvidia adds proper support for broadcasting + // bias_contig = bias->contiguous(); + } + + // cudnn v8.4.0 expects conv2d's int8 weight tensor's input and output channels to be a multiple of 4. if it is not + // we need to explicitly pad it to a multiple of 4 ourselves as cudnn does not currently support padding. + // TODO: when and if cudnn enables padding in their operators, we can remove padding on our end; + // currently, limit padding support to groups=1 (ungrouped conv) + // TODO: implement this for groups > 1 + auto num_input_channels = weight.size(1); + int8_t num_output_slices2pad = (4 - num_unpadded_output_channels % 4) % 4; + int8_t num_input_slices2pad = (4 - num_input_channels % 4) % 4; + if (num_output_slices2pad != 0 || num_input_slices2pad != 0) { + // the second argument is an initializer list of padded values. there are 2 values for each dimension. + // refer to https://pytorch.org/docs/stable/generated/torch.nn.functional.pad.html for more details + weight = at::pad(weight, {0, 0, 0, 0, 0, num_input_slices2pad, 0, num_output_slices2pad}, "constant", 0); + if (bias.has_value()) { + bias.value() = at::pad(bias.value(), {0, num_output_slices2pad}, "constant", 0); + } + } + + auto ret_ptr = c10::make_intrusive>( + weight.to(c10::MemoryFormat::ChannelsLast), // TODO: this assumes 2D I think. make it more general? 
+ bias, + stride, + padding, + output_padding, + dilation, + groups, + transpose, + qtype, + num_unpadded_output_channels); + return ret_ptr; +} + +template +c10::intrusive_ptr> PackedConvWeightCudnn< + 2>:: + prepack( + at::Tensor weight, + c10::optional bias_in, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose); + +namespace at { +namespace native { +namespace { + +template +class QConvPackWeightInt8Cudnn final { + public: + static c10::intrusive_ptr> run_conv( + Tensor weight, + c10::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups) { + torch::List output_padding; + output_padding.reserve(kSpatialDim); + for (const auto idx : c10::irange(kSpatialDim)) { + (void)idx; //Suppress unused variable warning + output_padding.push_back((int64_t)0); + } + return _run(weight, bias, stride, padding, output_padding, dilation, groups, + /*transpose=*/false); + } + + private: + static c10::intrusive_ptr> _run( + Tensor weight, + c10::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose) { + return PackedConvWeightCudnn::prepack( + weight, bias, stride, padding, output_padding, dilation, groups, + transpose); + } +}; + +class QConv1dPackWeightInt8Cudnn final { + public: + static c10::intrusive_ptr> run_conv( + Tensor weight, + c10::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups) { + const torch::List output_padding({0}); + return _run(weight, bias, stride, padding, output_padding, dilation, groups, + /*transpose=*/false); + } + + private: + static c10::intrusive_ptr> _run( + Tensor weight, + c10::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose) { + if (weight.dim() == 3) { + // we currently use conv2d kernel for conv1d by making the input and weight tensors + // 4D rather than 3D. 
we add a dummy width dimension of size 1 + // out channels, in channels / groups, L -> out channels, in channels / groups, 1, L + weight = weight.unsqueeze(-2); + } + stride = quant_utils::MakeArgForConv1d(stride, 1); + padding = quant_utils::MakeArgForConv1d(padding, 0); + output_padding = quant_utils::MakeArgForConv1d(output_padding, 0); + dilation = quant_utils::MakeArgForConv1d(dilation, 1); + + return PackedConvWeightCudnn<2>::prepack( + weight, bias, stride, padding, output_padding, dilation, groups, + transpose); + } +}; + +TORCH_LIBRARY_IMPL(quantized, QuantizedCUDA, m) { + m.impl(TORCH_SELECTIVE_NAME("quantized::conv1d_prepack"), TORCH_FN(QConv1dPackWeightInt8Cudnn::run_conv)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_prepack"), TORCH_FN(QConvPackWeightInt8Cudnn<2>::run_conv)); +} + +} // namespace +} // namespace native +} // namespace at + +#endif // HAS_CUDNN_V8 +#endif // AT_CUDNN_ENABLED +#endif // USE_CUDA diff --git a/aten/src/ATen/native/quantized/cudnn/conv_unpack_impl.cpp b/aten/src/ATen/native/quantized/cudnn/conv_unpack_impl.cpp new file mode 100644 index 000000000000..e18c6ce4d888 --- /dev/null +++ b/aten/src/ATen/native/quantized/cudnn/conv_unpack_impl.cpp @@ -0,0 +1,28 @@ +#ifdef USE_CUDA +#include // for the definition of AT_CUDNN_ENABLED + +#if AT_CUDNN_ENABLED() + +#include + +#if HAS_CUDNN_V8() + +#include +#include +#include +#include + +#include + +template +std::tuple> PackedConvWeightCudnn< + kSpatialDim>::unpack() { + return std::tuple>{maybe_padded_weight_, bias_}; +} + +template std::tuple> PackedConvWeightCudnn< + 2>::unpack(); + +#endif // HAS_CUDNN_V8 +#endif // AT_CUDNN_ENABLED +#endif // USE_CUDA diff --git a/aten/src/ATen/native/quantized/cudnn/linear_prepack.cpp b/aten/src/ATen/native/quantized/cudnn/linear_prepack.cpp new file mode 100644 index 000000000000..3541ce9b7d80 --- /dev/null +++ b/aten/src/ATen/native/quantized/cudnn/linear_prepack.cpp @@ -0,0 +1,63 @@ +#ifdef USE_CUDA +#include // for the definition of AT_CUDNN_ENABLED + +#if AT_CUDNN_ENABLED() + +#include + +#if HAS_CUDNN_V8() + +#include +#include +#include +#include +#include +#include +#include +#include + +c10::intrusive_ptr PackedLinearWeightCudnn::prepack( + at::Tensor weight, + c10::optional bias) { + TORCH_CHECK(weight.qscheme() == c10::kPerTensorAffine, "Unsupported qscheme: ", toString(weight.qscheme())); + const int output_channels = weight.size(0); + const auto qtype = weight.qscheme(); + if (bias.has_value()) { + TORCH_CHECK(bias.value().dim() == 1, "bias should be a vector (1D Tensor)"); + TORCH_CHECK( + bias.value().size(0) == output_channels, + "bias should have K elements: " + std::to_string(output_channels)); + } + + auto ret_ptr = c10::make_intrusive( + weight, + bias, + qtype); + return ret_ptr; +} + +namespace at { +namespace native { +namespace { + +class QLinearPackWeightInt8Cudnn final { + public: + static c10::intrusive_ptr run( + at::Tensor weight, + c10::optional bias) { + return PackedLinearWeightCudnn::prepack(std::move(weight), std::move(bias)); + } +}; + +TORCH_LIBRARY_IMPL(quantized, QuantizedCUDA, m) { + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_prepack"), TORCH_FN(QLinearPackWeightInt8Cudnn::run)); +} + + +} // namespace +} // namespace native +} // namespace at + +#endif // HAS_CUDNN_V8 +#endif // AT_CUDNN_ENABLED +#endif // USE_CUDA diff --git a/aten/src/ATen/native/quantized/cudnn/linear_unpack_impl.cpp b/aten/src/ATen/native/quantized/cudnn/linear_unpack_impl.cpp new file mode 100644 index 000000000000..ebf77b0294d8 --- /dev/null +++ 
b/aten/src/ATen/native/quantized/cudnn/linear_unpack_impl.cpp @@ -0,0 +1,23 @@ +#ifdef USE_CUDA +#include // for the definition of AT_CUDNN_ENABLED + +#if AT_CUDNN_ENABLED() + +#include + +#if HAS_CUDNN_V8() + +#include +#include +#include +#include + +#include + +std::tuple> PackedLinearWeightCudnn::unpack() { + return std::tuple>{orig_weight, bias_}; +} + +#endif // HAS_CUDNN_V8 +#endif // AT_CUDNN_ENABLED +#endif // USE_CUDA diff --git a/aten/src/ATen/native/quantized/cudnn/utils.h b/aten/src/ATen/native/quantized/cudnn/utils.h new file mode 100644 index 000000000000..3eba354bd20c --- /dev/null +++ b/aten/src/ATen/native/quantized/cudnn/utils.h @@ -0,0 +1,335 @@ +#pragma once +/* +This file contains some of the auxiliary functions used by both Conv.cpp & Linear.cpp (introduced in a later PR) +*/ + +#ifdef USE_CUDA +#include // for the definition of AT_CUDNN_ENABLED + +#if AT_CUDNN_ENABLED() + +#include + +#if HAS_CUDNN_V8() + +#include +#include +#include +#include +#include +#include + +struct PackedLinearWeightCudnn : public LinearPackedParamsBase { + PackedLinearWeightCudnn( + at::Tensor orig_weight, + c10::optional bias, + c10::QScheme q_scheme) + : orig_weight(std::move(orig_weight)), + bias_(std::move(bias)), + q_scheme(std::move(q_scheme)) {} + + at::Tensor apply( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + at::Tensor apply_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic(at::Tensor input, bool reduce_range = false) override { + throw std::runtime_error( + "apply_relu_out is not implemented for this packed " + "parameter type"); + } + at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range = false) override { + throw std::runtime_error( + "apply_relu_out is not implemented for this packed " + "parameter type"); + } + + std::tuple> unpack() override; + + c10::optional bias() override { + return bias_; + } + + static c10::intrusive_ptr prepack( + at::Tensor weight, + c10::optional bias); + + private: + at::Tensor orig_weight; + c10::optional bias_; + c10::QScheme q_scheme; + + template + at::Tensor apply_impl( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point); + + template + void apply_impl_helper( + const at::Tensor& quantized_output, + const at::Tensor& input, + double output_scale); +}; + +template +struct PackedConvWeightCudnn : public ConvPackedParamsBase { + PackedConvWeightCudnn( + at::Tensor orig_weight, + c10::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose, + c10::QScheme q_scheme, + int64_t output_channels) + : maybe_padded_weight_(std::move(orig_weight)), + bias_(std::move(bias)), + stride_(std::move(stride)), + padding_(std::move(padding)), + output_padding_(std::move(output_padding)), + dilation_(std::move(dilation)), + groups_(groups), + transpose_(transpose), + q_scheme_(q_scheme), + num_unpadded_output_channels_(output_channels) {} // output channels needs to be stored when we have to pad this dimension + + at::Tensor apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic( + const at::Tensor& input, + bool reduce_range) { + TORCH_CHECK(false, "apply_dynamic is currently not reported"); + } + + at::Tensor apply_dynamic_relu( + const 
at::Tensor& input, + bool reduce_range) { + TORCH_CHECK(false, "apply_dynamic_relu is currently not reported"); + } + + std::tuple> unpack() override; + + static c10::intrusive_ptr> prepack( + at::Tensor weight, + c10::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose); + + const float* GetBiasData(at::Tensor* bias); + + torch::List stride() const override { + return stride_; + } + + torch::List padding() const override { + return padding_; + } + + torch::List output_padding() const override { + return output_padding_; + } + + torch::List dilation() const override { + return dilation_; + } + + int64_t groups() const override { + return groups_; + } + + bool transpose() const override { + return transpose_; + } + + private: + // cudnn v8.4.0 expects conv2d's int8 weight tensor's input and output channels to be a multiple of 4. if it is not + // we need to explicitly pad it to a multiple of 4 ourselves as cudnn does not currently support padding, hence the naming + // convention "maybe"_padded_weight. + // TODO: when and if cudnn enables padding in their operators, we can remove padding on our end and rename this to orig_weight_ + at::Tensor maybe_padded_weight_; + c10::optional bias_; + torch::List stride_; + torch::List padding_; + torch::List output_padding_; + torch::List dilation_; + int64_t groups_; + bool transpose_; + c10::QScheme q_scheme_; + int64_t num_unpadded_output_channels_; + + template + at::Tensor apply_impl( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point); + + template + void apply_impl_helper( + const at::Tensor& quantized_output, + const at::Tensor& input, + double output_scale); +}; + +namespace cudnn_utils { +namespace { + +uint8_t getAlignment(const at::Tensor &t) { + // alignment are in bytes + uint8_t alignment = 1; + uintptr_t address = reinterpret_cast(t.data_ptr()); + while (address % alignment == 0 && alignment < 16) alignment *= 2; + return alignment; +} + +// For the two getTensorDescriptor functions, there is a is_virtual parameter. This parameter is used to set the cudnn +// tensor as virtual or not. Setting the tensor as virtual is expected to have some performance benefits as the cudnn +// backend cudnn will no longer directly save to the tensor, allowing us to omit this tensor from the variant pack. 
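// For instance, the quantized linear kernel (Linear.cpp above) marks the fused
// intermediate as virtual while the real quantized output is a regular tensor;
// a simplified sketch of those two calls:
//
//   // intermediate of the fused graph: never materialized, omitted from the variant pack
//   cudnn_utils::getTensorDescriptor(linear_output, 'f', key.output_alignment, /*is_virtual=*/true);
//   // real output: must be bound to a data pointer in the variant pack
//   cudnn_utils::getTensorDescriptor(quantized_output.sizes(), quantized_output.strides(),
//                                    CUDNN_DATA_INT8, 'r', cudnn_utils::getAlignment(quantized_output));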
+// See third_party/cudnn_frontend/samples/fusion_sample.cpp for other examples + +cudnn_frontend::Tensor getTensorDescriptor(const at::Tensor &t, int64_t id, uint8_t alignment, bool is_virtual = false) { + auto shape = t.sizes(); + auto strides = t.strides(); + if (is_virtual) { + return cudnn_frontend::TensorBuilder() + .setDim(shape.size(), shape.data()) + .setStrides(strides.size(), strides.data()) + .setId(id) + .setAlignment(alignment) + .setVirtual() + .setDataType(at::native::getCudnnDataType(t)) + .build(); + } + return cudnn_frontend::TensorBuilder() + .setDim(shape.size(), shape.data()) + .setStrides(strides.size(), strides.data()) + .setId(id) + .setAlignment(alignment) + .setDataType(at::native::getCudnnDataType(t)) + .build(); +} + +cudnn_frontend::Tensor getTensorDescriptor(const c10::IntArrayRef& shape, const c10::IntArrayRef& strides, cudnnDataType_t cudnn_dtype, int64_t id, uint8_t alignment, bool is_virtual = false) { + if (is_virtual) { + return cudnn_frontend::TensorBuilder() + .setDim(shape.size(), shape.data()) + .setStrides(strides.size(), strides.data()) + .setId(id) + .setAlignment(alignment) + .setVirtual() + .setDataType(cudnn_dtype) + .build(); + } + return cudnn_frontend::TensorBuilder() + .setDim(shape.size(), shape.data()) + .setStrides(strides.size(), strides.data()) + .setId(id) + .setAlignment(alignment) + .setDataType(cudnn_dtype) + .build(); +} + +// TODO: there is a table from input dtype to operator dtype, we can derive +// the operator dtype based on input dtype +cudnn_frontend::PointWiseDesc_v8 getPointWiseMulDescriptor(cudnnDataType_t dataType) { + return cudnn_frontend::PointWiseDescBuilder() + .setMode(cudnnPointwiseMode_t::CUDNN_POINTWISE_MUL) + .setMathPrecision(dataType) + .build(); +} + +// TODO: there is a table from input dtype to operator dtype, we can derive +// the operator dtype based on input dtype +cudnn_frontend::PointWiseDesc_v8 getPointWiseAddDescriptor(cudnnDataType_t dataType) { + return cudnn_frontend::PointWiseDescBuilder() + .setMode(cudnnPointwiseMode_t::CUDNN_POINTWISE_ADD) + .setMathPrecision(dataType) + .build(); +} + +// TODO: there is a table from input dtype to operator dtype, we can derive +// the operator dtype based on input dtype +cudnn_frontend::PointWiseDesc_v8 getPointWiseReluDescriptor(cudnnDataType_t dataType) { + return cudnn_frontend::PointWiseDescBuilder() + .setMode(cudnnPointwiseMode_t::CUDNN_POINTWISE_RELU_FWD) + .setMathPrecision(dataType) + .build(); +} + + +void filterEngineConfigs( + cudnn_frontend::EngineConfigList &from, + cudnn_frontend::EngineConfigList &to, + bool deterministic, bool allow_tf32, c10::ScalarType scalar_type) +{ + auto filter = [=](cudnnBackendDescriptor_t c) { + if (deterministic) { + if (cudnn_frontend::hasNumericalNote(c)) return true; + } + if (scalar_type == at::kFloat || scalar_type == at::kChar || !allow_tf32) { + if (cudnn_frontend::hasNumericalNote(c)) return true; + if (cudnn_frontend::hasNumericalNote(c)) return true; + } + return false; + }; + cudnn_frontend::filter(from, to, filter); +} + + +cudnn_frontend::ExecutionPlan get_execplan_from_heuristics_else_fall_back(cudnn_frontend::OperationGraph&& opGraph, cudnnHandle_t handle_) { + auto heuristics = cudnn_frontend::EngineHeuristicsBuilder() + .setOperationGraph(opGraph) + .setHeurMode(CUDNN_HEUR_MODE_INSTANT) + .build(); + + // std::cout << "Heuristic has " << heuristics.getEngineConfigCount() << " configurations " << std::endl; + auto& engine_config = heuristics.getEngineConfig(heuristics.getEngineConfigCount()); + + 
// Try engine configs returned by the heuristics and pick up the first one that works. + for (auto& ecfg : engine_config) { + try { + auto plan = cudnn_frontend::ExecutionPlanBuilder() + .setHandle(handle_) + .setEngineConfig(ecfg, opGraph.getTag()) + .build(); + return plan; + } catch (cudnn_frontend::cudnnException& e) { + continue; + } + } + + { + // std::cout << opGraph.describe() << " has " << total_engines << " engines." << std::endl; + auto engine = cudnn_frontend::EngineBuilder().setGlobalEngineIdx(0).setOperationGraph(opGraph).build(); + // std::cout << engine.describe() << std::endl; + + auto engine_config = cudnn_frontend::EngineConfigBuilder().setEngine(engine).build(); + // std::cout << engine_config.describe() << std::endl; + + return cudnn_frontend::ExecutionPlanBuilder().setHandle(handle_).setEngineConfig(engine_config).build(); + } +} +} // anonymous +} // cudnn_utils + +#endif // HAS_CUDNN_V8 +#endif // AT_CUDNN_ENABLED +#endif // USE_CUDA diff --git a/aten/src/ATen/native/quantized/library.cpp b/aten/src/ATen/native/quantized/library.cpp index 783c847dff0a..047a126e79a1 100644 --- a/aten/src/ATen/native/quantized/library.cpp +++ b/aten/src/ATen/native/quantized/library.cpp @@ -1,7 +1,6 @@ #include -#include -#include +#include #include #include @@ -189,10 +188,7 @@ TORCH_LIBRARY(quantized, m) { m.def(TORCH_SELECTIVE_SCHEMA("quantized::relu6(Tensor qx, bool inplace=False) -> Tensor")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::leaky_relu(Tensor qx, Scalar negative_slope, bool inplace, float output_scale, int output_zero_point) -> Tensor")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::sigmoid(Tensor qx, float output_scale, int output_zero_point) -> Tensor")); - - // quantized ops implemented in cudnn, with QuantizedCUDA dispatch - // TODO: use the same signature as quantized::conv2d - m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv2d_cudnn(Tensor act, Tensor weight, Tensor? 
bias, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::softmax(Tensor qx, int dim, float output_scale, int output_zero_point) -> Tensor")); } // According to #33294: The "_" prefix registration will be diff --git a/aten/src/ATen/native/quantized/packed_params.h b/aten/src/ATen/native/quantized/packed_params.h new file mode 100644 index 000000000000..64d8ec840c46 --- /dev/null +++ b/aten/src/ATen/native/quantized/packed_params.h @@ -0,0 +1,98 @@ +#pragma once + +#include +#include + +struct LinearPackedParamsBase : public torch::jit::CustomClassHolder { + virtual at::Tensor apply( + at::Tensor input, + double output_scale, + int64_t output_zero_point) = 0; + virtual at::Tensor apply_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point) = 0; + + // out variant of LinearPackedParamsBase::apply + virtual at::Tensor& apply_out( + const at::Tensor& /*input*/, + double /*output_scale*/, + int64_t /*output_zero_point*/, + at::Tensor& output) { + throw std::runtime_error( + "apply_out is not implemented for this packed " + "parameter type"); + return output; + } + + virtual at::Tensor& apply_relu_out( + const at::Tensor& /*input*/, + double /*output_scale*/, + int64_t /*output_zero_point*/, + at::Tensor& output) { + throw std::runtime_error( + "apply_relu_out is not implemented for this packed " + "parameter type"); + return output; + } + + virtual at::Tensor apply_dynamic( + at::Tensor input, + bool reduce_range = false) = 0; + virtual at::Tensor apply_dynamic_relu( + at::Tensor input, + bool reduce_range = false) = 0; + + virtual at::Tensor& apply_dynamic_out( + const at::Tensor& /* input */, + at::Tensor& output, + bool /* reduce_range */) { + throw std::runtime_error( + "apply_dynamic_out is not implemented for this packed " + "parameter type"); + return output; + } + virtual at::Tensor& apply_dynamic_relu_out( + const at::Tensor& /* input */, + at::Tensor& output, + bool /* reduce_range */) { + throw std::runtime_error( + "apply_dynamic_relu_out is not implemented for this packed " + "parameter type"); + return output; + } + + virtual std::tuple> unpack() = 0; + + virtual c10::optional bias() = 0; + + virtual void set_bias(c10::optional /*bias*/) { + throw std::runtime_error( + "set_bias is not implemented for this packed " + "parameter type"); + } +}; + +template +struct ConvPackedParamsBase : public torch::jit::CustomClassHolder { + virtual at::Tensor apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) = 0; + virtual at::Tensor apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) = 0; + virtual at::Tensor apply_dynamic( + const at::Tensor& input, + bool reduce_range) = 0; + + virtual std::tuple> unpack() = 0; + + virtual torch::List stride() const = 0; + virtual torch::List padding() const = 0; + virtual torch::List output_padding() const = 0; + virtual torch::List dilation() const = 0; + virtual int64_t groups() const = 0; + virtual bool transpose() const = 0; +}; diff --git a/aten/src/ATen/native/quantized/qconv_unpack.cpp b/aten/src/ATen/native/quantized/qconv_unpack.cpp new file mode 100644 index 000000000000..062fc8a0522a --- /dev/null +++ b/aten/src/ATen/native/quantized/qconv_unpack.cpp @@ -0,0 +1,224 @@ +/* +The dispatch registrations at the end of this file applies to fbgemm, qnnpack, and cudnn backends. 
+The correct unpack backend function is determined using runtime polymorphism through the packed_weight pointer, +which is of type intrusive_ptr> and points to either a PackedConvWeightsQnnp, +PackedConvWeights (Fbgemm), or PackedConvWeightsCudnn at runtime, which all inherit from ConvPackedParamsBase. +The implementations for the unpack functions can be found in /cpu/qconv_unpack_impl.cpp, for fbgemm&qnnpack +and /cudnn/conv_unpack_impl.cpp, for cudnn. +*/ + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +namespace { + +/* + * QConvPackWeightInt8 expects its input tensor to be in shape + * [output_channels, kernel_height, kernel_width, input_channels/Groups] + * Therefore, the unpacking of packed weight tensor using QConvUnpackWeightsInt8 + * results in a tensor of the same shape. + */ + +template +class QConvUnpackWeightsInt8 final { + public: + static std::tuple> run( + const c10::intrusive_ptr>& packed_weight) { + auto& ctx = at::globalContext(); + +#ifdef USE_FBGEMM + if (ctx.qEngine() == at::QEngine::FBGEMM) { + return packed_weight->unpack(); + } +#endif + +#ifdef USE_PYTORCH_QNNPACK + if (ctx.qEngine() == at::QEngine::QNNPACK) { + TORCH_CHECK( + kSpatialDim == 2, + "quantized::conv2d_unpack (qnnpack): QNNPACK only supports Conv2d " + "now."); + return packed_weight->unpack(); + } +#endif + +#if AT_MKLDNN_ENABLED() + if (ctx.qEngine() == at::QEngine::ONEDNN) { + return packed_weight->unpack(); + } +#endif + + TORCH_CHECK( + false, + "Didn't find engine for operation quantized::conv2d_unpack ", + toString(ctx.qEngine())); + } +}; + +class QConv1dUnpackWeightsInt8 final { + public: + static std::tuple> run( + const c10::intrusive_ptr>& packed_weight) { + auto& ctx = at::globalContext(); + at::Tensor weight; + c10::optional bias; +#ifdef USE_FBGEMM + if (ctx.qEngine() == at::QEngine::FBGEMM) { + std::tie(weight, bias) = packed_weight->unpack(); + weight = weight.squeeze_(quant_utils::kConv1dSqueezeDim + 2); + return std::tuple>(weight, bias); + } +#endif + +#ifdef USE_PYTORCH_QNNPACK + if (ctx.qEngine() == at::QEngine::QNNPACK) { + std::tie(weight, bias) = packed_weight->unpack(); + at::Tensor new_weight = weight.clone(); + new_weight = new_weight.squeeze_(quant_utils::kConv1dSqueezeDim + 2); + return std::tuple>(new_weight, bias); + } +#endif + +#if AT_MKLDNN_ENABLED() + if (ctx.qEngine() == at::QEngine::ONEDNN) { + std::tie(weight, bias) = packed_weight->unpack(); + at::Tensor new_weight = weight.clone(); + new_weight.squeeze_(quant_utils::kConv1dSqueezeDim + 2); + return std::tuple>(new_weight, bias); + } +#endif + + TORCH_CHECK( + false, + "Didn't find engine for operation quantized::conv1d_unpack ", + toString(ctx.qEngine())); + } +}; + +template +class QConvStride final { + public: + static torch::List run( + const c10::intrusive_ptr>& packed_weight) { + return packed_weight->stride(); + } +}; + +template +class QConvPadding final { + public: + static torch::List run( + const c10::intrusive_ptr>& packed_weight) { + return packed_weight->padding(); + } +}; + +template +class QConvOutputPadding final { + public: + static torch::List run( + const c10::intrusive_ptr>& packed_weight) { + return packed_weight->output_padding(); + } +}; + +template +class QConvDilation final { + public: + static torch::List run( + const c10::intrusive_ptr>& packed_weight) { + return packed_weight->dilation(); + } +}; + +template +class QConvGroups final { + public: + static int64_t run( + const c10::intrusive_ptr>& packed_weight) { 
+ return packed_weight->groups(); + } +}; + +template +class QConvTranspose final { + public: + static int64_t run( + const c10::intrusive_ptr>& packed_weight) { + return packed_weight->transpose(); + } +}; + +IValue +unpack_quantized_prepacked_sizes_conv2d(const IValue& ivalue) { + auto params = ivalue.toCustomClass>(); + at::Tensor weight; + c10::optional bias; + std::tie(weight, bias) = params->unpack(); + at::OptionalIntArrayRef bias_sizes = c10::nullopt; + if (bias && bias->defined()) { + bias_sizes = bias->sizes(); + } + return IValue(std::make_tuple( + weight.sizes(), + bias_sizes, + params->stride(), + params->padding(), + params->dilation(), + params->groups())); +} + +TORCH_LIBRARY_IMPL(quantized, CatchAll, m) { + // conv_unpack is deprecated, please use conv2d_unpack for 2D conv. + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_unpack"), TORCH_FN(QConvUnpackWeightsInt8<2>::run)); + // We use conv2d_unpack to be consistent with conv3d_unpack + m.impl(TORCH_SELECTIVE_NAME("quantized::conv1d_unpack"), TORCH_FN(QConv1dUnpackWeightsInt8::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_unpack"), TORCH_FN(QConvUnpackWeightsInt8<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_unpack_sizes"), TORCH_FN(unpack_quantized_prepacked_sizes_conv2d)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_unpack"), TORCH_FN(QConvUnpackWeightsInt8<3>::run)); + + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_stride"), TORCH_FN(QConvStride<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_padding"), TORCH_FN(QConvPadding<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_output_padding"), TORCH_FN(QConvOutputPadding<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_dilation"), TORCH_FN(QConvDilation<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_groups"), TORCH_FN(QConvGroups<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_transpose"), TORCH_FN(QConvTranspose<2>::run)); + + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_stride"), TORCH_FN(QConvStride<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_padding"), TORCH_FN(QConvPadding<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_output_padding"), TORCH_FN(QConvOutputPadding<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_dilation"), TORCH_FN(QConvDilation<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_groups"), TORCH_FN(QConvGroups<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_transpose"), TORCH_FN(QConvTranspose<3>::run)); + + // ConvTranspose is the same, however, we want to have different name. 
+ m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose1d_unpack"), TORCH_FN(QConv1dUnpackWeightsInt8::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_unpack"), TORCH_FN(QConvUnpackWeightsInt8<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_unpack"), TORCH_FN(QConvUnpackWeightsInt8<3>::run)); + + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_stride"), TORCH_FN(QConvStride<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_padding"), TORCH_FN(QConvPadding<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_output_padding"), TORCH_FN(QConvOutputPadding<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_dilation"), TORCH_FN(QConvDilation<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_groups"), TORCH_FN(QConvGroups<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_transpose"), TORCH_FN(QConvTranspose<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_stride"), TORCH_FN(QConvStride<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_padding"), TORCH_FN(QConvPadding<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_output_padding"), TORCH_FN(QConvOutputPadding<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_dilation"), TORCH_FN(QConvDilation<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_groups"), TORCH_FN(QConvGroups<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_transpose"), TORCH_FN(QConvTranspose<3>::run)); +} + +} // namespace +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/quantized/qlinear_unpack.cpp b/aten/src/ATen/native/quantized/qlinear_unpack.cpp new file mode 100644 index 000000000000..cfcd0589f03c --- /dev/null +++ b/aten/src/ATen/native/quantized/qlinear_unpack.cpp @@ -0,0 +1,77 @@ +/* +The dispatch registrations at the end of this file applies to fbgemm, qnnpack, and cudnn backends. +The correct unpack backend function is determined using runtime polymorphism through the packed_weight pointer, +which is of type intrusive_ptr and points to either a PackedLinearWeightsQnnp, +PackedLinearWeights (Fbgemm), or PackedLinearWeightsCudnn at runtime, which all inherit from LinearPackedParamsBase. +The implementations for the unpack functions can be found in /cpu/qlinear_unpack_impl.cpp, for fbgemm&qnnpack +and /cudnn/linear_unpack_impl.cpp, for cudnn. +*/ +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +namespace { + +class QLinearUnpackWeightInt8 final { + public: + static std::tuple> run( + const c10::intrusive_ptr& packed_weight) { + return packed_weight->unpack(); + } +}; + +class QLinearUnpackWeightFp16 final { + public: + static std::tuple> run( + const c10::intrusive_ptr& packed_weight) { + auto& ctx = at::globalContext(); + + TORCH_CHECK( + ctx.qEngine() != at::QEngine::QNNPACK, + "quantized::linear_unpack_fp16 is currently " + "not supported by QNNPACK"); + + return packed_weight->unpack(); + } +}; + +class QLinearUnpackWeightInt8Legacy final { + public: + static std::tuple> run( + const at::Tensor& packed_weight) { + TORCH_CHECK(false, + "quantized.linear_unpack(Tensor) is unsupported! 
Please " + "upgrade your model to use the newer quantized.linear_" + "unpack(LinearPackedParamsBase) overload"); + } +}; + +class QLinearUnpackWeightFp16Legacy final { + public: + static std::tuple> run( + const at::Tensor& packed_weight) { + TORCH_CHECK(false, + "quantized.linear_unpack(Tensor) is unsupported! Please " + "upgrade your model to use the newer quantized.linear_" + "unpack(LinearPackedParamsBase) overload"); + } +}; + +TORCH_LIBRARY_IMPL(quantized, CPU, m) { + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_unpack.legacy"), TORCH_FN(QLinearUnpackWeightInt8Legacy::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_unpack_fp16.legacy"), TORCH_FN(QLinearUnpackWeightFp16Legacy::run)); +} + +TORCH_LIBRARY_IMPL(quantized, CatchAll, m) { + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_unpack"), TORCH_FN(QLinearUnpackWeightInt8::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_unpack_fp16"), TORCH_FN(QLinearUnpackWeightFp16::run)); +} + +} // namespace +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/sparse/SparseBlas.cpp b/aten/src/ATen/native/sparse/SparseBlas.cpp index 50bd6a8d863c..9d5e6e163794 100644 --- a/aten/src/ATen/native/sparse/SparseBlas.cpp +++ b/aten/src/ATen/native/sparse/SparseBlas.cpp @@ -1,7 +1,9 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include +#include #include +#include #include #ifndef AT_PER_OPERATOR_HEADERS @@ -12,6 +14,10 @@ #include #include #include +#include +#include +#include +#include #endif #include @@ -26,7 +32,7 @@ Tensor& addmv_out_sparse_csr( const Scalar& beta, const Scalar& alpha, Tensor& result) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(mat.is_sparse_csr()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(mat.layout() == kSparseCsr || mat.layout() == kSparseBsr); TORCH_CHECK(mat.dim() == 2, "addmv: Expected mat to be 2-D"); TORCH_CHECK(vec.dim() == 1, "addmv: Expected vec to be 1-D"); @@ -89,5 +95,148 @@ std::tuple triangular_solve_out_sparse_csr_cpu( return std::tuple(X, clone_A); } +/* + Computes `result` <- α*(A @ B) * spy(C) + β*C, where spy(C) is the sparsity pattern matrix of C. + + Args: + * `mat1` - [in] dense Tensor A of size m × k. + * `mat2` - [in] dense Tensor B of size k × n. + * `self` - [in] sparse Tensor C of size m × n. + * `result` - [out] sparse Tensor of size m × n. 
+*/ +Tensor& sparse_sampled_addmm_out_sparse_csr_cpu( + const Tensor& self, + const Tensor& mat1, + const Tensor& mat2, + const Scalar& beta, + const Scalar& alpha, + Tensor& result) { + at::native::sparse::sparse_sampled_addmm_check_inputs(self, mat1, mat2, beta, alpha, result); + // Allow only same types as for the CUDA path + auto t = self.scalar_type(); + TORCH_CHECK(t == ScalarType::Double || t == ScalarType::Float || + t == ScalarType::ComplexFloat || t == ScalarType::ComplexDouble, + "sparse_sampled_addmm: Expected self to be a floating-point or complex tensor, but got ", t); + if (&result != &self) { + // We allow self to be a single matrix when mat1 and mat2 are batched + auto result_sizes = DimVector(mat1.sizes().slice(0, mat1.dim() - 2)); + result_sizes.push_back(self.size(-2)); + result_sizes.push_back(self.size(-1)); + at::sparse_csr::get_sparse_csr_impl(result)->resize_(self._nnz(), result_sizes); + } + result.copy_((self.to_dense().mul(beta).add(mat1.matmul(mat2), alpha)).sparse_mask(self)); + return result; +} + +Tensor sparse_sampled_addmm_sparse_csr_cpu( + const Tensor& self, + const Tensor& mat1, + const Tensor& mat2, + const Scalar& beta, + const Scalar& alpha) { + auto result = at::empty({0, 0}, self.options()); + at::native::sparse_sampled_addmm_out_sparse_csr_cpu(self, mat1, mat2, beta, alpha, result); + return result; +} + +namespace sparse { + +void sparse_sampled_addmm_check_inputs( + const Tensor& self, + const Tensor& mat1, + const Tensor& mat2, + const Scalar& beta, + const Scalar& alpha, + const Tensor& result) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self.is_sparse_csr()); + + TORCH_CHECK( + mat1.layout() == kStrided, + "sampled_addmm: Expected mat1 to have strided layout, but got ", + mat1.layout()); + TORCH_CHECK( + mat2.layout() == kStrided, + "sampled_addmm: Expected mat2 to have strided layout, but got ", + mat2.layout()); + + TORCH_CHECK( + result.layout() == kSparseCsr, + "sampled_addmm: Expected result to have sparse csr layout, but got ", + result.layout()); + + TORCH_CHECK( + mat1.scalar_type() == mat2.scalar_type(), + "sampled_addmm: Expected mat1 and mat2 to have the same dtype, but got ", + mat1.scalar_type(), + " and ", + mat2.scalar_type()); + TORCH_CHECK( + mat1.scalar_type() == self.scalar_type(), + "sampled_addmm: Expected mat1 and self to have the same dtype, but got ", + mat1.scalar_type(), + " and ", + self.scalar_type()); + TORCH_CHECK( + result.scalar_type() == self.scalar_type(), + "sampled_addmm: Expected result and self to have the same dtype, but got ", + result.scalar_type(), + " and ", + self.scalar_type()); + + TORCH_CHECK( + mat1.dim() >= 2, + "sampled_addmm: Expected mat1 to be a matrix, got ", + mat1.dim(), + "-D tensor"); + TORCH_CHECK( + mat2.dim() >= 2, + "sampled_addmm: Expected mat2 to be a matrix, got ", + mat2.dim(), + "-D tensor"); + TORCH_CHECK( + result.dim() >= 2, + "sampled_addmm: Expected result to be a matrix, got ", + result.dim(), + "-D tensor"); + + TORCH_CHECK( + mat1.sizes().slice(0, mat1.dim() - 2) == mat2.sizes().slice(0, mat2.dim() - 2), + "sampled_addmm: Expected mat1 and mat2 to have the same batch size, but got ", + mat1.sizes().slice(0, mat1.dim() - 2), + " and ", + mat2.sizes().slice(0, mat2.dim() - 2)); + + TORCH_CHECK( + !(self.dim() > 2 && self.sizes().slice(0, self.dim() - 2) != mat1.sizes().slice(0, mat1.dim() - 2)), + "sampled_addmm: Expected self and mat1 to have the same batch size, but got ", + self.sizes().slice(0, self.dim() - 2), + " and ", + mat1.sizes().slice(0, mat1.dim() - 2)); + + 
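  // As a concrete illustration of the shapes accepted here (sizes chosen arbitrarily):
  // mat1 of shape (b, m, k) and mat2 of shape (b, k, n) may be combined with a sparse CSR
  // self of shape (m, n) or (b, m, n); the result is then a sparse CSR tensor of shape
  // (b, m, n) whose values are computed only at the positions stored in self, i.e.
  // result <- alpha * (mat1 @ mat2) * spy(self) + beta * self. A call through the
  // generated wrapper would look roughly like the sketch below (assuming the schema's
  // default beta/alpha ordering):
  //
  //   auto out = at::sparse_sampled_addmm(self_csr, mat1, mat2, /*beta=*/1.0, /*alpha=*/1.0);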
IntArrayRef mat1_sizes = mat1.sizes(); + IntArrayRef mat2_sizes = mat2.sizes(); + TORCH_CHECK( + mat1_sizes[mat1.dim() - 1] == mat2_sizes[mat2.dim() - 2], + "sampled_addmm: mat1 and mat2 shapes cannot be multiplied (", + mat1_sizes[mat1.dim() - 2], + "x", + mat1_sizes[mat1.dim() - 1], + " and ", + mat2_sizes[mat2.dim() - 2], + "x", + mat2_sizes[mat2.dim() - 1], + ")"); + + IntArrayRef self_sizes = self.sizes(); + TORCH_CHECK( + self_sizes[self.dim() - 2] == mat1_sizes[mat1.dim() - 2], + "sampled_addmm: self.shape[-2] must match mat1.shape[-2]"); + TORCH_CHECK( + self_sizes[self.dim() - 1] == mat2_sizes[mat2.dim() - 1], + "sampled_addmm: self.shape[-1] must match mat2.shape[-1]"); +} + +} // namespace sparse + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/sparse/SparseBlas.h b/aten/src/ATen/native/sparse/SparseBlas.h new file mode 100644 index 000000000000..337308a2dddf --- /dev/null +++ b/aten/src/ATen/native/sparse/SparseBlas.h @@ -0,0 +1,22 @@ +#pragma once + +#include + +#include +#include + +namespace at { +namespace native { +namespace sparse { + +TORCH_API void sparse_sampled_addmm_check_inputs( + const Tensor& self, + const Tensor& mat1, + const Tensor& mat2, + const Scalar& beta, + const Scalar& alpha, + const Tensor& result); + +} // namespace sparse +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/sparse/SparseBlasImpl.cpp b/aten/src/ATen/native/sparse/SparseBlasImpl.cpp index 6b133d3b6325..4ad0d55c6891 100644 --- a/aten/src/ATen/native/sparse/SparseBlasImpl.cpp +++ b/aten/src/ATen/native/sparse/SparseBlasImpl.cpp @@ -73,7 +73,7 @@ void triangular_solve_out_sparse_csr( "Calling triangular_solve on a sparse CPU tensor requires compiling PyTorch with MKL. ", "Please use PyTorch built MKL support."); #else - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(A.is_sparse_csr()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(A.layout() == kSparseCsr || A.layout() == kSparseBsr); sparse::impl::mkl::triangular_solve_out_sparse_csr(A, B, X, upper, transpose, unitriangular); #endif } diff --git a/aten/src/ATen/native/sparse/SparseCsrTensor.cpp b/aten/src/ATen/native/sparse/SparseCsrTensor.cpp index f91d9648e7db..62d600dc0926 100644 --- a/aten/src/ATen/native/sparse/SparseCsrTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseCsrTensor.cpp @@ -9,14 +9,26 @@ #include #include #include +#include #ifndef AT_PER_OPERATOR_HEADERS #include #include #else +#include #include +#include #include +#include +#include +#include +#include +#include #include +#include +#include +#include +#include #include #include #include @@ -26,7 +38,13 @@ #include #include #include +#include +#include +#include #include +#include +#include +#include #include #endif @@ -40,15 +58,23 @@ namespace { } // end anonymous namespace -void _validate_sparse_csr_tensor_args(const Tensor& crow_indices, const Tensor& col_indices, const Tensor& values, IntArrayRef size) { +void _validate_sparse_compressed_tensor_args_worker(const Tensor& compressed_indices, const Tensor& plain_indices, const Tensor& values, const IntArrayRef size, const Layout& layout) { + + // Layout must be Sparse Compressed + AT_DISPATCH_ALL_SPARSE_COMPRESSED_LAYOUTS(layout, "validate_sparse_compressed_tensor_args", [&]{}); + + const std::string layout_name = layoutToString(layout, /*upper=*/ true); + const std::string compressed_indices_name = compressedIndicesName(layout); + const std::string plain_indices_name = plainIndicesName(layout); + // Layout Invariants TORCH_CHECK( - col_indices.layout() == kStrided && 
col_indices.is_contiguous(), - "expected col_indices to be a strided and contiguous tensor"); + plain_indices.layout() == kStrided && plain_indices.is_contiguous(), + "expected ", plain_indices_name, " to be a strided and contiguous tensor"); TORCH_CHECK( - crow_indices.layout() == kStrided && crow_indices.is_contiguous(), - "expected crow_indices to be a strided and contiguous tensor"); + compressed_indices.layout() == kStrided && compressed_indices.is_contiguous(), + "expected ", compressed_indices_name ," to be a strided and contiguous tensor"); TORCH_CHECK( values.layout() == kStrided && values.is_contiguous(), @@ -56,78 +82,141 @@ void _validate_sparse_csr_tensor_args(const Tensor& crow_indices, const Tensor& // Shape and Strides invariants TORCH_CHECK( - size.size() == 2, - "size of a CSR tensor must be of length 2, but got: ", - size.size()); + size.size() >= 2, + "size of a batched ", layout_name, " tensor must have length >= 2, but got: ", + size.size()); TORCH_CHECK( - crow_indices.dim() == 1, - "crow_indices must have dim=1 but got crow_indices.dim()=", - crow_indices.dim()); + compressed_indices.dim() >= 1, + compressed_indices_name, " must have dim >= 1 but got ", compressed_indices_name, ".dim() = ", + compressed_indices.dim()); TORCH_CHECK( - col_indices.dim() == 1, - "col_indices must have dim=1 but got col_indices.dim()=", - col_indices.dim()); + plain_indices.dim() >= 1, + plain_indices_name, " must have dim >= 1 but got ", plain_indices_name, ".dim() = ", + plain_indices.dim()); TORCH_CHECK( - values.dim() == 1, - "values must have dim=1 but got values.dim()=", - values.dim()); - // Note, this check also enforces `crow_indices.numel() >= 1` + values.dim() >= 1, + "values must have dim >= 1 but got values.dim() = ", + values.dim()); + TORCH_CHECK( - crow_indices.numel() == (size[0] + 1), - "crow_indices.numel() must be size(0) + 1, but got: ", - crow_indices.numel()); + compressed_indices.dim() == plain_indices.dim(), + "number of dimensions of ", compressed_indices_name, " and ", plain_indices_name, " must be the same but got ", + compressed_indices.dim(), " and ", plain_indices.dim(), ", respectively"); + + AT_DISPATCH_PLAIN_SPARSE_COMPRESSED_LAYOUTS( + layout, "validate_sparse_compressed_tensor_args", + [&] { + TORCH_CHECK( + compressed_indices.dim() == values.dim(), + "number of dimensions of indices and values must be the same but got ", + compressed_indices.dim(), " and ", values.dim(), ", respectively"); + }, + [&] { + TORCH_CHECK( + compressed_indices.dim() + 2 == values.dim(), + "number of dimensions of indices must be two less than the number of dimensions of the values but got ", + compressed_indices.dim(), " + 2 not equal to ", values.dim()); + }); + + TORCH_CHECK( + static_cast(compressed_indices.dim()) == size.size() - 1, + "number of dimensions of indices must be one less than the number of dimensions of the provided size but got ", + compressed_indices.dim(), " not equal to ", size.size(), " - 1"); + + int block_ndim = AT_DISPATCH_PLAIN_SPARSE_COMPRESSED_LAYOUTS(layout, "validate_sparse_compressed_tensor_args", [&]{ return 0; }, [&]{ return 2; }); + IntArrayRef block_size = values.sizes().slice(values.dim() - block_ndim, block_ndim); + int64_t numel_per_block = AT_DISPATCH_PLAIN_SPARSE_COMPRESSED_LAYOUTS(layout, "validate_sparse_compressed_tensor_args", + [&]() -> int64_t { return 1; }, [&]() -> int64_t { return block_size[0] * block_size[1]; }); + int compressed_dim = compressedDimension(layout, size); + int plain_dim = plainDimension(layout, size); + + 
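  // For intuition, a worked example of the invariants enforced below (sizes chosen
  // arbitrarily): a plain CSR tensor of size (m, n) with nnz stored elements has
  // block_ndim = 0 and numel_per_block = 1, crow_indices with m + 1 entries, and
  // col_indices and values with nnz entries each. For the blocked layouts the block
  // shape (p, q) is read off the last two dimensions of values, so numel_per_block = p * q
  // and values.numel() must equal plain_indices.numel() * p * q. Any batch dimensions
  // (checked next) prepend the same leading sizes to all three member tensors.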
// All batch sizes must be the same + auto batch_size = size.slice(0, size.size() - 2); + auto compressed_indices_batch_size = compressed_indices.sizes().slice(0, compressed_indices.dim() - 1); + auto plain_indices_batch_size = plain_indices.sizes().slice(0, plain_indices.dim() - 1); + auto values_batch_size = values.sizes().slice(0, values.dim() - 1 - block_ndim); TORCH_CHECK( - col_indices.numel() == values.numel(), - "col_indices and values must have equal sizes, but got col_indices.numel(): ", - col_indices.numel(), - ", values.numel(): ", - values.numel()); + batch_size == compressed_indices_batch_size && + batch_size == plain_indices_batch_size && + batch_size == values_batch_size, + "all batch dimensions of the provided size (", batch_size, "), indices (", + compressed_indices_batch_size,", ", plain_indices_batch_size, "), and values (", + values_batch_size,") must be the same."); + + // Note, this check also enforces `compressed_indices.size(-1) >= 1` + TORCH_CHECK( + compressed_indices.size(-1) == (size[compressed_dim] + 1), + compressed_indices_name, ".size(-1) must be equal to size[-", (size.size() - compressed_dim), "] + 1 (that is ", + size[compressed_dim] + 1, "), but got: ", compressed_indices.size(-1)); + + AT_DISPATCH_PLAIN_SPARSE_COMPRESSED_LAYOUTS(layout, "validate_sparse_compressed_tensor_args", + [&] { + TORCH_CHECK( + plain_indices.numel() == values.numel(), + plain_indices_name, " and values must have the same number of elements, but got ", plain_indices_name, ".numel(): ", + plain_indices.numel(), ", values.numel(): ", values.numel()); + }, + [&] { + TORCH_CHECK( + plain_indices.numel() * numel_per_block == values.numel(), + "number of ", plain_indices_name, " elements must be the same as the number of blocks in values, but got ", + plain_indices_name, ".numel() * numel_per_block: ", plain_indices.numel() * numel_per_block, + ", values.numel(): ", values.numel(),", numel_per_block: ", numel_per_block); + }); // Indices invariants - AT_DISPATCH_INDEX_TYPES(crow_indices.scalar_type(), "csr_construct_check", [&] { - Tensor crow_indices_cpu = crow_indices.to(kCPU); - auto crow_indices_accessor = crow_indices_cpu.accessor(); - TORCH_CHECK( - crow_indices_accessor[0] == 0, "0th value of crow_indices must be 0."); - - TORCH_CHECK( - crow_indices_accessor[crow_indices.numel() - 1] == col_indices.numel(), - "last value of crow_indices should be equal to the length of col_indices."); - - for (int i = 1; i <= size[0]; i++) { - TORCH_CHECK( - crow_indices_accessor[i - 1] <= crow_indices_accessor[i], - "at position i = ", i, ", this condition crow_indices[i - 1] <= crow_indices[i] fails"); - } - if (col_indices.numel() > 0) { - TORCH_CHECK(0 <= col_indices.min().item(), "col_indices.min() should be greater or equal to zero"); - TORCH_CHECK(size[1] > col_indices.max().item(), "size(1) should be greater than col_indices.max()"); - } - }); - - // CSR Type Invariants - auto crow_indices_type = crow_indices.scalar_type(); - auto col_indices_type = col_indices.scalar_type(); + AT_DISPATCH_INDEX_TYPES(compressed_indices.scalar_type(), "validate_sparse_compressed_tensor_args", + [&] { + Tensor compressed_indices_cpu = compressed_indices.to(kCPU); + auto compressed_indices_data_ptr = compressed_indices_cpu.data_ptr(); + auto batch_stride = compressed_indices_cpu.dim() >= 2 ? 
compressed_indices_cpu.stride(-2) : 0; + auto compressed_dims = size[compressedDimension(layout, size)]; + for (const auto batch_id : c10::irange(batchCount(compressed_indices_cpu))) { + TORCH_CHECK( + compressed_indices_data_ptr[batch_id*batch_stride] == 0, + "(Batch element ", batch_id, ") ", + ": 0th value of ", compressed_indices_name, " must be 0, but it is ", compressed_indices_data_ptr[batch_id*batch_stride]); + TORCH_CHECK( + compressed_indices_data_ptr[batch_id*batch_stride + compressed_indices.size(-1) - 1] == plain_indices.size(-1), + "(Batch element ", batch_id, ") ", + "last value of ", compressed_indices_name, " should be equal to the length of ", plain_indices_name, "."); + for (int i = 1; i <= compressed_dims; i++) { + TORCH_CHECK( + compressed_indices_data_ptr[batch_id*batch_stride + i - 1] <= compressed_indices_data_ptr[batch_id*batch_stride + i], + "(Batch element ", batch_id, ") ", + "at position i = ", i, ", the condition ", compressed_indices_name, "[i - 1] <= ", compressed_indices_name, "[i] fails"); + } + } + if (plain_indices.numel() > 0) { + TORCH_CHECK(0 <= plain_indices.min().item(), plain_indices_name, ".min() should be greater or equal to zero"); + TORCH_CHECK(size[plain_dim] > plain_indices.max().item(), "size[-", (size.size() - plain_dim),"] should be greater than ", plain_indices_name, ".max()"); + } + }); + + // Type Invariants + auto compressed_indices_type = compressed_indices.scalar_type(); + auto plain_indices_type = plain_indices.scalar_type(); TORCH_CHECK( - crow_indices_type == col_indices_type, - "both crow_indices and col_indices should have the same type."); + compressed_indices_type == plain_indices_type, + "both ", compressed_indices_name, " and ", plain_indices_name, " should have the same type, bot got ", + compressed_indices_type, " and ", plain_indices_type, ", respectively"); TORCH_CHECK( - crow_indices_type == kInt || crow_indices_type == kLong, - "crow_indices and col_indices must be an int32 or int64 type, but got: ", - crow_indices_type); + compressed_indices_type == kInt || compressed_indices_type == kLong, + compressed_indices_name, " and ", plain_indices_name, " must be an int32 or int64 type, but got: ", + compressed_indices_type); - // CSR Device Invariants + // Device Invariants TORCH_CHECK( - col_indices.get_device() == crow_indices.get_device(), - "crow_indices and col_indices devices (", - crow_indices.get_device(), + plain_indices.get_device() == compressed_indices.get_device(), + compressed_indices_name, " and ", plain_indices_name, " devices (", + compressed_indices.get_device(), ", ", - col_indices.get_device(), + plain_indices.get_device(), ") must match"); TORCH_CHECK( - crow_indices.get_device() == values.get_device(), - "device of crow_indices (", - crow_indices.get_device(), + compressed_indices.get_device() == values.get_device(), + "device of ", compressed_indices_name, " (", + compressed_indices.get_device(), ") must match device of values (", values.get_device(), ")"); @@ -136,19 +225,46 @@ void _validate_sparse_csr_tensor_args(const Tensor& crow_indices, const Tensor& "device type of values (", values.device().type(), ") must be CPU or CUDA"); + +} + +void _validate_sparse_compressed_tensor_args(const Tensor& compressed_indices, const Tensor& plain_indices, const Tensor& values, IntArrayRef size, Layout layout) { + _validate_sparse_compressed_tensor_args_worker(compressed_indices, plain_indices, values, size, layout); +} + +void _validate_sparse_csr_tensor_args(const Tensor& crow_indices, const Tensor& 
col_indices, const Tensor& values, IntArrayRef size) { + _validate_sparse_compressed_tensor_args_worker(crow_indices, col_indices, values, size, kSparseCsr); +} + +void _validate_sparse_csc_tensor_args(const Tensor& ccol_indices, const Tensor& row_indices, const Tensor& values, IntArrayRef size) { + _validate_sparse_compressed_tensor_args_worker(ccol_indices, row_indices, values, size, kSparseCsc); +} + +void _validate_sparse_bsr_tensor_args(const Tensor& crow_indices, const Tensor& col_indices, const Tensor& values, IntArrayRef size) { + _validate_sparse_compressed_tensor_args_worker(crow_indices, col_indices, values, size, kSparseBsr); +} + +void _validate_sparse_bsc_tensor_args(const Tensor& ccol_indices, const Tensor& row_indices, const Tensor& values, IntArrayRef size) { + _validate_sparse_compressed_tensor_args_worker(ccol_indices, row_indices, values, size, kSparseBsc); } -// Construction of CSR tensors. -SparseCsrTensor new_csr_tensor(const TensorOptions& options) { +// Construction of CSR, CSC, BSR, and BSC tensors. + +// Note: The usage of "Csr" in names like SparseCsrTensor, +// SparseCsrCPU, SparseCsrCUDA, and SparseCsrTensorImpl exists because +// of historical reasons (that ought to be removed in future) and does +// not mean that the corresponding functionality would be CSR layout +// only specific. +SparseCsrTensor new_compressed_tensor(const TensorOptions& options) { // TODO: remove this comment after enabling autograd support for CSR tensor // constructor. // TORCH_INTERNAL_ASSERT(impl::variable_excluded_from_dispatch()); - TORCH_INTERNAL_ASSERT(options.layout() == kSparseCsr); + Layout layout = AT_DISPATCH_ALL_SPARSE_COMPRESSED_LAYOUTS(options.layout(), "new_compressed_tensor", [&] { return the_layout; }); DispatchKey dispatch_key; TORCH_CHECK_NOT_IMPLEMENTED( options.device().type() == kCPU || options.device().type() == kCUDA, - "Could not run '", "sparse_csr_tensor", "' from the '", options.device(), "' device.)"); + "Could not run 'new_compressed_tensor' from the '", options.device(), "' device.)"); if (options.device().is_cuda()) { dispatch_key = DispatchKey::SparseCsrCUDA; @@ -157,44 +273,117 @@ SparseCsrTensor new_csr_tensor(const TensorOptions& options) { } return detail::make_tensor( - DispatchKeySet(dispatch_key), options.dtype()); + DispatchKeySet(dispatch_key), layout, options.dtype()); } -Tensor _sparse_csr_tensor_unsafe(const Tensor& crow_indices, const Tensor& col_indices, - const Tensor& values, - IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { - TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); +Tensor _sparse_compressed_tensor_unsafe(const Tensor& compressed_indices, + const Tensor& plain_indices, + const Tensor& values, + IntArrayRef size, + c10::optional dtype, + c10::optional layout, + c10::optional device, + c10::optional pin_memory) { + if (!layout) { + AT_ERROR("sparse_compressed_tensor_unsafe expected sparse compressed tensor layout but got none"); + } + Layout layout_ = layout.value(); + AT_DISPATCH_ALL_SPARSE_COMPRESSED_LAYOUTS(layout_, "sparse_compressed_tensor_unsafe", [&]{}); + TensorOptions options = TensorOptions().dtype(dtype).layout(layout_).device(device).pinned_memory(pin_memory); + SparseCsrTensor self = new_compressed_tensor(options); + get_sparse_csr_impl(self)->set_member_tensors(compressed_indices, plain_indices, values, size); + return self; +} - SparseCsrTensor self = new_csr_tensor(options); - 
get_sparse_csr_impl(self)->set_member_tensors(crow_indices, col_indices, values, size); +template +Tensor _sparse_compressed_tensor_unsafe_template(const Tensor& compressed_indices, + const Tensor& plain_indices, + const Tensor& values, + IntArrayRef size, + c10::optional dtype, + c10::optional layout, + c10::optional device, + c10::optional pin_memory) { + Layout layout_ = layout.value_or(required_layout); + TORCH_CHECK(layout_ == required_layout, "sparse compressed layout must be ",required_layout, " but got ", layout_); + TensorOptions options = TensorOptions().dtype(dtype).layout(layout_).device(device).pinned_memory(pin_memory); + SparseCsrTensor self = new_compressed_tensor(options); + get_sparse_csr_impl(self)->set_member_tensors(compressed_indices, plain_indices, values, size); return self; } +#define SPARSE_COMPRESSED_TENSOR_UNSAFE(KIND, REQUIRED_LAYOUT) \ + Tensor _sparse_##KIND##_tensor_unsafe(const Tensor& compressed_indices, \ + const Tensor& plain_indices, \ + const Tensor& values, \ + IntArrayRef size, \ + c10::optional dtype, \ + c10::optional layout, \ + c10::optional device, \ + c10::optional pin_memory) { \ + return _sparse_compressed_tensor_unsafe_template(compressed_indices, plain_indices, values, size, dtype, layout, device, pin_memory); \ + } + +SPARSE_COMPRESSED_TENSOR_UNSAFE(csr, kSparseCsr); +SPARSE_COMPRESSED_TENSOR_UNSAFE(csc, kSparseCsc); +SPARSE_COMPRESSED_TENSOR_UNSAFE(bsr, kSparseBsr); +SPARSE_COMPRESSED_TENSOR_UNSAFE(bsc, kSparseBsc); + +DimVector _estimate_sparse_compressed_tensor_size( + const Tensor& compressed_indices, + const Tensor& plain_indices, + const Tensor& values, + Layout layout) { + DimVector size = DimVector(IntArrayRef(plain_indices.sizes().data(), plain_indices.dim() - 1)); + int64_t compressed_dim = (plain_indices.size(-1) > 0 ? compressed_indices.size(-1) - 1 : 0); + int64_t plain_dim = AT_DISPATCH_INTEGRAL_TYPES(plain_indices.scalar_type(), "estimate_sparse_compressed_tensor_size", + [&]() -> int64_t { + if (plain_indices.numel() > 0) { + return plain_indices.max().item() + 1; + } else { + return 0; + } + }); + AT_DISPATCH_ROW_SPARSE_COMPRESSED_LAYOUTS(layout, "estimate_sparse_compressed_tensor_size", + [&]{ + size.push_back(compressed_dim); + size.push_back(plain_dim); + }, + [&]{ + size.push_back(plain_dim); + size.push_back(compressed_dim); + }); + return size; +} + // TODO: This constructor should probably use an ATen abstract method in order // to make autograd dispatch available for the CSR constructor. See the relevant // note in native_functions.yaml. 
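/*
  Editor's note, an illustrative sketch that is not part of this patch:
  _estimate_sparse_compressed_tensor_size above infers the 2D shape when the
  caller omits `size`; for CSR the row count comes from the length of the
  compressed indices and the column count from the largest plain index seen.
  The helper name `estimate_csr_size` and its std::pair return type are
  assumptions made only for this sketch.

    #include <algorithm>
    #include <cstdint>
    #include <utility>
    #include <vector>

    std::pair<int64_t, int64_t> estimate_csr_size(
        const std::vector<int64_t>& crow_indices,
        const std::vector<int64_t>& col_indices) {
      // With no specified elements both dimensions collapse to 0, mirroring
      // the (plain_indices.size(-1) > 0 ? ... : 0) guard above.
      int64_t nrows = col_indices.empty()
          ? 0
          : static_cast<int64_t>(crow_indices.size()) - 1;
      int64_t ncols = col_indices.empty()
          ? 0
          : *std::max_element(col_indices.begin(), col_indices.end()) + 1;
      return {nrows, ncols};
    }

  For the column-compressed layouts (CSC/BSC) the two results swap places,
  which is what the AT_DISPATCH_ROW_SPARSE_COMPRESSED_LAYOUTS branch above
  encodes.
*/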
-Tensor sparse_csr_tensor( - const Tensor& crow_indices, - const Tensor& col_indices, +Tensor sparse_compressed_tensor( + const Tensor& compressed_indices, + const Tensor& plain_indices, const Tensor& values, IntArrayRef size, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory) { + + if (!layout) { + AT_ERROR("sparse_compressed_tensor expected sparse compressed tensor layout but got none"); + } + Layout layout_ = layout.value(); + AT_DISPATCH_ALL_SPARSE_COMPRESSED_LAYOUTS(layout_, "sparse_compressed_tensor", [&]{}); + // See [Note: hacky wrapper removal for TensorOptions] - TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); + TensorOptions options = TensorOptions().dtype(dtype).layout(layout_).device(device).pinned_memory(pin_memory); - at::native::_validate_sparse_csr_tensor_args(crow_indices, col_indices, values, size); + _validate_sparse_compressed_tensor_args_worker(compressed_indices, plain_indices, values, size, layout_); - return at::native::_sparse_csr_tensor_unsafe( - crow_indices, - col_indices, + return at::native::_sparse_compressed_tensor_unsafe( + compressed_indices, + plain_indices, values, size, optTypeMetaToScalarType(options.dtype_opt()), @@ -203,29 +392,31 @@ Tensor sparse_csr_tensor( options.pinned_memory_opt()); } -Tensor sparse_csr_tensor( - const Tensor& crow_indices, - const Tensor& col_indices, +Tensor sparse_compressed_tensor( + const Tensor& compressed_indices, + const Tensor& plain_indices, const Tensor& values, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory) { - // See [Note: hacky wrapper removal for TensorOptions] - TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); - std::array size = {0, 0}; - if (col_indices.numel() > 0) { - AT_DISPATCH_INDEX_TYPES(col_indices.scalar_type(), "csr_construct_check", [&] { - size[0] = crow_indices.numel() - 1; - size[1] = col_indices.max().item() + 1; - }); + + if (!layout) { + AT_ERROR("sparse_compressed_tensor expected sparse compressed tensor layout but got none"); } + Layout layout_ = layout.value(); + AT_DISPATCH_ALL_SPARSE_COMPRESSED_LAYOUTS(layout_, "sparse_compressed_tensor", [&]{}); - at::native::_validate_sparse_csr_tensor_args(crow_indices, col_indices, values, size); + DimVector size = _estimate_sparse_compressed_tensor_size(compressed_indices, plain_indices, values, layout_); + + // See [Note: hacky wrapper removal for TensorOptions] + TensorOptions options = TensorOptions().dtype(dtype).layout(layout_).device(device).pinned_memory(pin_memory); - return at::native::_sparse_csr_tensor_unsafe( - crow_indices, - col_indices, + _validate_sparse_compressed_tensor_args_worker(compressed_indices, plain_indices, values, size, layout_); + + return at::native::_sparse_compressed_tensor_unsafe( + compressed_indices, + plain_indices, values, size, optTypeMetaToScalarType(options.dtype_opt()), @@ -234,7 +425,41 @@ Tensor sparse_csr_tensor( options.pinned_memory_opt()); } -Tensor empty_sparse_csr( +#define SPARSE_COMPRESSED_TENSOR(KIND, REQUIRED_LAYOUT) \ + Tensor sparse_##KIND##_tensor(const Tensor& compressed_indices, \ + const Tensor& plain_indices, \ + const Tensor& values, \ + c10::optional dtype, \ + c10::optional layout, \ + c10::optional device, \ + c10::optional pin_memory) { \ + if (layout) { \ + TORCH_CHECK(layout.value() == REQUIRED_LAYOUT, "sparse " # KIND " layout must be ", REQUIRED_LAYOUT, " but got ", 
layout.value()); \ + } \ + c10::optional layout_(REQUIRED_LAYOUT); \ + return at::native::sparse_compressed_tensor(compressed_indices, plain_indices, values, dtype, layout_, device, pin_memory); \ + } \ + Tensor sparse_##KIND##_tensor(const Tensor& compressed_indices, \ + const Tensor& plain_indices, \ + const Tensor& values, \ + IntArrayRef size, \ + c10::optional dtype, \ + c10::optional layout, \ + c10::optional device, \ + c10::optional pin_memory) { \ + if (layout) { \ + TORCH_CHECK(layout.value() == REQUIRED_LAYOUT, "sparse " # KIND " layout must be ", REQUIRED_LAYOUT, " but got ", layout.value()); \ + } \ + c10::optional layout_(REQUIRED_LAYOUT); \ + return at::native::sparse_compressed_tensor(compressed_indices, plain_indices, values, size, dtype, layout_, device, pin_memory); \ + } + +SPARSE_COMPRESSED_TENSOR(csr, kSparseCsr) +SPARSE_COMPRESSED_TENSOR(csc, kSparseCsc) +SPARSE_COMPRESSED_TENSOR(bsr, kSparseBsr) +SPARSE_COMPRESSED_TENSOR(bsc, kSparseBsc) + +Tensor empty_sparse_compressed( IntArrayRef size, c10::optional dtype, c10::optional layout, @@ -242,27 +467,34 @@ Tensor empty_sparse_csr( c10::optional pin_memory, c10::optional optional_memory_format) { check_size_nonnegative(size); + TORCH_CHECK(size.size() >= 2, "torch.empty: Only batched sparse compressed (non-block) tensors are supported, but got size ", size); - TORCH_CHECK(size.size() == 2, "torch.empty: Only 2D sparse CSR tensors are supported."); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout == Layout::SparseCsr); + // Strided is the default layout for torch.empty. + Layout layout_ = layout.value_or(Layout::Strided); + + // torch.empty cannot be used to create blocked tensors because its + // API lacks a method to specify the block size. + AT_DISPATCH_SPARSE_COMPRESSED_NONBLOCK_LAYOUTS(layout_, "empty_sparse_compressed", [&]{}); - auto rows = size[0]; int64_t nnz = 0; + auto compressed_indices_size = DimVector(size.slice(0, size.size() - 2)); + auto plain_indices_and_values_size = DimVector(size.slice(0, size.size() - 2)); + compressed_indices_size.push_back(size[compressedDimension(layout_, size)] + 1); + plain_indices_and_values_size.push_back(nnz); TensorOptions options = TensorOptions().dtype(ScalarType::Long).layout(Layout::Strided).device(device).pinned_memory(pin_memory); - auto crow_indices = at::empty({rows + 1}, options); - auto col_indices = at::empty({nnz}, options); - auto values = at::empty({nnz}, options.dtype(dtype)); - - return at::native::_sparse_csr_tensor_unsafe( - crow_indices, - col_indices, - values, - size, - dtype, - layout, - device, - pin_memory); + auto compressed_indices = at::empty(compressed_indices_size, options); + auto plain_indices = at::empty(plain_indices_and_values_size, options); + auto values = at::empty(plain_indices_and_values_size, options.dtype(dtype)); + + return at::native::_sparse_compressed_tensor_unsafe(compressed_indices, + plain_indices, + values, + size, + dtype, + layout, + device, + pin_memory); } const Tensor& resize_sparse_csr_( @@ -270,32 +502,63 @@ const Tensor& resize_sparse_csr_( IntArrayRef size, c10::optional optional_memory_format) { check_size_nonnegative(size); - TORCH_CHECK(size.size() == 2, "torch.resize_: Only 2D sparse CSR tensors are supported."); + TORCH_CHECK(size.size() >= 2, "torch.resize_: Only batched sparse CSR matrices are supported, but got size ", size); TORCH_CHECK( - self.size(1) <= size[1], + self.size(-1) <= size[size.size() - 1], "torch.resize_: Resizing columns of sparse CSR tensors to a smaller value is not supported. 
", "The original number of columns is ", - self.size(1), - " while the requested new number of columns is ", size[1], "."); + self.size(-1), + " while the requested new number of columns is ", size[size.size() - 1], "."); get_sparse_csr_impl(self)->resize_(self._nnz(), size); return self; } -Tensor& copy_sparse_csr_(Tensor& self, const Tensor& src, bool non_blocking) { +Tensor& copy_sparse_compressed_(Tensor& self, const Tensor& src, bool non_blocking) { + AT_DISPATCH_ALL_SPARSE_COMPRESSED_LAYOUTS(self.layout(), "copy_sparse_compressed_", [&]{}); TORCH_CHECK( - self.sizes() == src.sizes(), - "copy_sparse_csr_: only same size tensors are supported."); + self.layout() == src.layout(), + "torch.copy_: copy of sparse compressed tensors having different layouts is not supported.", + " self layout is ", self.layout(), " and src layout is ", src.layout()); TORCH_CHECK( - self.is_sparse_csr() && src.is_sparse_csr(), - "copy_sparse_csr_: copy between different layouts is not supported. Found self type = ", - self.toString(), - " and src type = ", - src.toString()); - TORCH_CHECK( - self._nnz() == src._nnz(), - "copy_sparse_csr_: only tensors with the same number of specified elements are supported."); - self.crow_indices().copy_(src.crow_indices(), non_blocking); - self.col_indices().copy_(src.col_indices(), non_blocking); + self._nnz() == src._nnz(), // actually, values copy allows different shapes as long as operands are broadcastable + "torch.copy_: only sparse compressed tensors with the same number of specified elements are supported."); + auto self_compressed_dim = compressedDimension(self.layout(), self.sizes()); + auto src_compressed_dim = compressedDimension(src.layout(), src.sizes()); + auto self_compressed_dims = self.size(self_compressed_dim); + auto src_compressed_dims = src.size(compressedDimension(src.layout(), src.sizes())); + if (self_compressed_dim == src_compressed_dim) { + TORCH_CHECK(self_compressed_dims == src_compressed_dims, + "torch.copy_: expected shapes of self and src to match along dimension ", + self_compressed_dim, " for ", + self.layout(), " layout but the corresponding dimensions of self and src are ", + self_compressed_dims, " and ", src_compressed_dims, ", respecitvely."); + } else { + TORCH_CHECK(self_compressed_dims == src_compressed_dims, + "torch.copy_: expected shapes of self and src to match along dimensions ", + self_compressed_dim, " and ", src_compressed_dim, ", respectively, for ", + self.layout(), " layout but the corresponding dimensions of self and src are ", + self_compressed_dims, " and ", src_compressed_dims, ", respecitvely."); + } + AT_DISPATCH_PLAIN_SPARSE_COMPRESSED_LAYOUTS(self.layout(), "copy_sparse_compressed_", + [&]{}, + [&]{ + auto self_values = self.values(); + auto src_values = src.values(); + auto self_block_size = DimVector(self_values.sizes().slice(self_values.dim()-2, 2)); + auto src_block_size = DimVector(src_values.sizes().slice(src_values.dim()-2, 2)); + TORCH_CHECK(self_block_size == src_block_size, + "torch.copy_: copy of sparse compressed tensors having different block sizes is not supported.", + " self and src block sizes are ", self_block_size, " and ", src_block_size, ", respectivly."); + }); + AT_DISPATCH_ROW_SPARSE_COMPRESSED_LAYOUTS(self.layout(), "copy_sparse_compressed_", + [&]{ + self.crow_indices().copy_(src.crow_indices(), non_blocking); + self.col_indices().copy_(src.col_indices(), non_blocking); + }, + [&]{ + self.ccol_indices().copy_(src.ccol_indices(), non_blocking); + self.row_indices().copy_(src.row_indices(), 
non_blocking); + }); self.values().copy_(src.values(), non_blocking); return self; } @@ -310,11 +573,27 @@ Tensor values_sparse_csr(const Tensor& self) { } Tensor crow_indices_sparse_csr(const Tensor& self) { - return get_sparse_csr_impl(self)->crow_indices().alias(); + return AT_DISPATCH_SPARSE_ROW_COMPRESSED_LAYOUTS(self.layout(), + "crow_indices", + [&]{ return get_sparse_csr_impl(self)->compressed_indices().alias(); }); } Tensor col_indices_sparse_csr(const Tensor& self) { - return get_sparse_csr_impl(self)->col_indices().alias(); + return AT_DISPATCH_SPARSE_ROW_COMPRESSED_LAYOUTS(self.layout(), + "col_indices", + [&]{ return get_sparse_csr_impl(self)->plain_indices().alias(); }); +} + +Tensor ccol_indices_sparse_csr(const Tensor& self) { + return AT_DISPATCH_SPARSE_COL_COMPRESSED_LAYOUTS(self.layout(), + "ccol_indices", + [&]{ return get_sparse_csr_impl(self)->compressed_indices().alias(); }); +} + +Tensor row_indices_sparse_csr(const Tensor& self) { + return AT_DISPATCH_SPARSE_COL_COMPRESSED_LAYOUTS(self.layout(), + "row_indices", + [&]{ return get_sparse_csr_impl(self)->plain_indices().alias(); }); } bool _is_same_size_as_sparse_csr( @@ -339,23 +618,31 @@ const SparseCsrTensor& resize_as_sparse_csr_( return self; } -SparseCsrTensor clone_sparse_csr( - const SparseCsrTensor& self, - c10::optional optional_memory_format) { +SparseCsrTensor clone_sparse_compressed( + const SparseCsrTensor& self, + c10::optional optional_memory_format) { TORCH_CHECK( !optional_memory_format.has_value(), "unsupported memory format option ", optional_memory_format.value()); TensorOptions options = self.options(); - return at::native::_sparse_csr_tensor_unsafe( - self.crow_indices().clone(), - self.col_indices().clone(), - self.values().clone(), - self.sizes(), - optTypeMetaToScalarType(options.dtype_opt()), - options.layout_opt(), - options.device_opt(), - options.pinned_memory_opt()); + auto compressed_indices = AT_DISPATCH_ROW_SPARSE_COMPRESSED_LAYOUTS(self.layout(), + "clone_sparse_compressed", + [&]{ return self.crow_indices(); }, + [&]{ return self.ccol_indices(); }); + auto plain_indices = AT_DISPATCH_ROW_SPARSE_COMPRESSED_LAYOUTS(self.layout(), + "clone_sparse_compressed", + [&]{ return self.col_indices(); }, + [&]{ return self.row_indices(); }); + return at::native::_sparse_compressed_tensor_unsafe( + compressed_indices.clone(), + plain_indices.clone(), + self.values().clone(), + self.sizes(), + optTypeMetaToScalarType(options.dtype_opt()), + options.layout_opt(), + options.device_opt(), + options.pinned_memory_opt()); } Tensor empty_like_sparse_csr( @@ -377,9 +664,9 @@ Tensor empty_like_sparse_csr( self.col_indices().clone(), at::empty(self.values().sizes(), options.layout(kStrided)), self.sizes(), - dtype, + optTypeMetaToScalarType(options.dtype()), self.layout(), - device); + options.device()); return result; } else if (options.layout() == kStrided) { return at::native::empty_like(self, dtype, layout, device, pin_memory, optional_memory_format); @@ -388,5 +675,43 @@ Tensor empty_like_sparse_csr( } } +Tensor select_sparse_csr(const Tensor& self, int64_t dim, int64_t index) { + TORCH_INTERNAL_ASSERT(self.is_sparse_csr()); + TORCH_CHECK_INDEX(self.dim() != 0, "select() cannot be applied to a 0-dim tensor."); + dim = maybe_wrap_dim(dim, self.dim()); + auto size = self.size(dim); + if (index < -size || index >= size) { + TORCH_CHECK_INDEX(false, "select(): index ", index, " out of range for tensor of size ", + self.sizes(), " at dimension ", dim); + } + if (index < 0) { + index += size; + } + + 
TORCH_INTERNAL_ASSERT(dim >= 0 && dim < self.dim()); + + auto new_sizes = DimVector(self.sizes()); + new_sizes.erase(new_sizes.begin() + dim); + auto options = self.options(); + + // Selecting batch dimension + if (dim < self.dim() - 2) { + return at::native::_sparse_csr_tensor_unsafe( + self.crow_indices().select(dim, index), + self.col_indices().select(dim, index), + self.values().select(dim, index), + new_sizes, + optTypeMetaToScalarType(options.dtype_opt()), + options.layout_opt(), + options.device_opt(), + options.pinned_memory_opt()); + } else { + TORCH_CHECK(self.dim() == 2, "select(): selecting rows or columns is not implemented for batched sparse CSR tensors.") + // Converting to COO and calling select is slighly slower than operating on the CSR indices directly + // for constructing a COO vector, however current version is more readable and easier to understand. + return self.to_sparse().select(dim, index); + } +} + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp b/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp index 8d17356ea5a1..5d520142cf0b 100644 --- a/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp @@ -1,16 +1,17 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS -#include #include #include #include #include #include +#include #include #include #include #include #include #include +#include #include #ifndef AT_PER_OPERATOR_HEADERS @@ -21,7 +22,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -50,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -63,9 +67,11 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -85,12 +91,16 @@ #include #include #include +#include +#include #include #include #include #include +#include #include #include +#include #include #endif @@ -99,19 +109,22 @@ namespace at { namespace meta { -TORCH_META_FUNC(_convert_indices_from_coo_to_csr) ( - const Tensor& self, const int64_t size, const bool out_int32 -) { +TORCH_META_FUNC(_convert_indices_from_coo_to_csr) +(const Tensor& self, const int64_t size, const bool out_int32) { TORCH_CHECK(self.dim() <= 1, "Input is supposed to be a vector"); ScalarType scalar_type = out_int32 ? ScalarType::Int : ScalarType::Long; - c10::TensorOptions options = TensorOptions().device(self.options().device()).dtype(scalar_type); + c10::TensorOptions options = + TensorOptions().device(self.options().device()).dtype(scalar_type); set_output(size + 1, options); } -TORCH_META_FUNC(_convert_indices_from_csr_to_coo) ( - const Tensor& crow_indices, const Tensor& col_indices, const bool out_int32, const bool transpose -) { - TORCH_CHECK(crow_indices.dim() == 1, "crow_indices is supposed to be a vector"); +TORCH_META_FUNC(_convert_indices_from_csr_to_coo) +(const Tensor& crow_indices, + const Tensor& col_indices, + const bool out_int32, + const bool transpose) { + TORCH_CHECK( + crow_indices.dim() == 1, "crow_indices is supposed to be a vector"); TORCH_CHECK(col_indices.dim() == 1, "col_indices is supposed to be a vector"); ScalarType scalar_type = out_int32 ? 
ScalarType::Int : ScalarType::Long; c10::TensorOptions options = crow_indices.options().dtype(scalar_type); @@ -124,33 +137,6 @@ namespace { constexpr int64_t GRAIN_SIZE = at::internal::GRAIN_SIZE; -template -void convert_indices_from_coo_to_csr_cpu(const Tensor& result, const Tensor& input, const int64_t size) { - int64_t numel = input.numel(); - const input_t* data_in = input.data_ptr(); - output_t* data_out = result.data_ptr(); - - if (numel == 0) { - result.zero_(); - return; - } - - for (int64_t i = 0; i <= data_in[0]; i++) - data_out[i] = static_cast(0); - - at::parallel_for(0, numel - 1, GRAIN_SIZE, [&](int64_t start, int64_t end) { - input_t curr_value = data_in[start], next_value; - for (const auto i : c10::irange(start, end)) { - next_value = data_in[i + 1]; - for (; curr_value < next_value; curr_value++) - data_out[curr_value + 1] = static_cast(i + 1); - } - }); - - for (int64_t i = data_in[numel - 1] + 1; i < size + 1; i++) - data_out[i] = static_cast(numel); -} - template Tensor& unary_op_out(F op_out, const Tensor& self, Tensor& result) { TORCH_INTERNAL_ASSERT(self.is_sparse_csr()); @@ -162,9 +148,9 @@ Tensor& unary_op_out(F op_out, const Tensor& self, Tensor& result) { if (result.numel() == 0) { at::native::resize_as_sparse_csr_(result, self); } - // copy_sparse_csr_ internally checks the sizes of result and self tensors + // copy_sparse_compressed_ internally checks the sizes of result and self tensors // Hence no external size check required - at::native::copy_sparse_csr_(result, self); + at::native::copy_sparse_compressed_(result, self); } auto self_values = self.values(); @@ -174,7 +160,7 @@ Tensor& unary_op_out(F op_out, const Tensor& self, Tensor& result) { return result; } -template +template Tensor& unary_op_inplace(Tensor& self, const F& op_inplace, Args&&... args) { TORCH_INTERNAL_ASSERT(self.is_sparse_csr()); @@ -184,7 +170,11 @@ Tensor& unary_op_inplace(Tensor& self, const F& op_inplace, Args&&... args) { } template -void convert_indices_from_csr_to_coo_cpu(const Tensor& indices, const Tensor& crow_indices, const Tensor& col_indices, const bool transpose=false) { +void convert_indices_from_csr_to_coo_cpu( + const Tensor& indices, + const Tensor& crow_indices, + const Tensor& col_indices, + const bool transpose = false) { int64_t nrows = crow_indices.numel() - 1; if (nrows == 0) { indices.zero_(); @@ -193,16 +183,18 @@ void convert_indices_from_csr_to_coo_cpu(const Tensor& indices, const Tensor& cr auto crow_indices_ = crow_indices.expect_contiguous(); const input_t* crow_indices_data_in = crow_indices_->data_ptr(); TORCH_INTERNAL_ASSERT(indices.is_contiguous()); - auto row0 = indices.select(0, transpose?1:0); - auto row1 = indices.select(0, transpose?0:1); + auto row0 = indices.select(0, transpose ? 1 : 0); + auto row1 = indices.select(0, transpose ? 
0 : 1); output_t* data_out = row0.data_ptr(); row1.copy_(*col_indices.expect_contiguous()); at::parallel_for(0, nrows, GRAIN_SIZE, [&](int64_t start, int64_t end) { for (const auto i : c10::irange(start, end)) { - std::fill(&data_out[crow_indices_data_in[i]], &data_out[crow_indices_data_in[i + 1]], static_cast(i)); + std::fill( + &data_out[crow_indices_data_in[i]], + &data_out[crow_indices_data_in[i + 1]], + static_cast(i)); } }); - } } // end anonymous namespace @@ -221,26 +213,27 @@ inline Tensor get_result_tensor_for_unary_op(F op, const Tensor& input) { // To handle type promotion for inputs to unary ops, // we first get the result from the underlined op, and use the result - // to create a sparse CSR tensor, which is used as the input to the out= variant + // to create a sparse CSR tensor, which is used as the input to the out= + // variant auto result_values = op(values); auto result = at::native::_sparse_csr_tensor_unsafe( - input.crow_indices().clone(), - input.col_indices().clone(), - result_values, - input.sizes(), - result_values.scalar_type(), - input.layout(), - result_values.device()); + input.crow_indices().clone(), + input.col_indices().clone(), + result_values, + input.sizes(), + result_values.scalar_type(), + input.layout(), + result_values.device()); return result; } -} +} // namespace static constexpr bool is_mkl_supported() { #ifdef _MSC_VER return false; -#elif __APPLE__ || __MACH__ +#elif __APPLE__ || __MACH__ return false; #else return true; @@ -248,41 +241,79 @@ static constexpr bool is_mkl_supported() { } // Only accept squares sparse matrices or dense input as a vector -// TODO: Check what happens with MKL, the output error reported with non square matrices tends to be high -// See: https://github.com/pytorch/pytorch/issues/58770 +// TODO: Check what happens with MKL, the output error reported with non square +// matrices tends to be high See: +// https://github.com/pytorch/pytorch/issues/58770 bool is_square_or_vec(int64_t dim_i, int64_t dim_j, int64_t dim_k) { - return (dim_i == dim_k && dim_k == dim_j) || (dim_i == dim_j && dim_k == 1); + return (dim_i == dim_k && dim_k == dim_j) || (dim_i == dim_j && dim_k == 1); } -Tensor& normal_sparse_csr_(Tensor& self, double mean, double std, c10::optional gen) { +Tensor& normal_sparse_csr_( + Tensor& self, + double mean, + double std, + c10::optional gen) { return unary_op_inplace(self, &Tensor::normal_, mean, std, gen); } +Tensor& fill_sparse_csr_(Tensor& self, const Scalar& value) { + return unary_op_inplace(self, &TensorBase::fill_, value); +} + +Tensor sparse_mask_sparse_csr( + const Tensor& self, + const Tensor& sparse_mask) { + TORCH_CHECK(sparse_mask.is_sparse_csr(), "sparse_mask_sparse_csr expects mask to be sparse csr"); + TORCH_CHECK(self.dim() == 2, "sparse_mask_sparse_csr expects self to be 2D"); + TORCH_CHECK(sparse_mask.dim() == 2, "sparse_mask_sparse_csr expects mask to be 2D"); + + // We are computing self.mul(at::ones_like(sparse_mask)) + // But mul(dense, sparse_csr) is not implemented yet + if (self.layout() == sparse_mask.layout()) { + // Both inputs are CSR + return self.mul(at::ones_like(sparse_mask)); + } else { + return self.sparse_mask(sparse_mask.to_sparse()).to_sparse_csr(); + } +} + +Tensor mul_scalar_sparse_csr(const Tensor& self, const Scalar& other) { + auto result_values = self.values().mul(other); + return at::native::_sparse_csr_tensor_unsafe( + self.crow_indices().clone(), + self.col_indices().clone(), + result_values, + self.sizes(), + result_values.scalar_type(), + self.layout(), + 
result_values.device()); +} + /* Implementation of Unary Ufuncs, those supported for Sparse CSR Layout * Only simple funcs, with 0->0 correspondence are currently supported. */ -#define CREATE_UNARY_UFUNC_OUT(op_name) \ - Tensor& op_name##_sparse_csr_out(const Tensor& self, Tensor& result) { \ - return unary_op_out(&at::op_name##_outf, self, result); \ +#define CREATE_UNARY_UFUNC_OUT(op_name) \ + Tensor& op_name##_sparse_csr_out(const Tensor& self, Tensor& result) { \ + return unary_op_out(&at::op_name##_outf, self, result); \ } -#define CREATE_UNARY_UFUNC_FUNCTIONAL(op_name) \ - Tensor op_name##_sparse_csr(const Tensor& self) { \ - return get_result_tensor_for_unary_op(&at::op_name, self); \ +#define CREATE_UNARY_UFUNC_FUNCTIONAL(op_name) \ + Tensor op_name##_sparse_csr(const Tensor& self) { \ + return get_result_tensor_for_unary_op(&at::op_name, self); \ } -#define CREATE_UNARY_UFUNC_INPLACE(op_name) \ - Tensor& op_name##_sparse_csr_(Tensor& self) { \ - return unary_op_inplace(self, &Tensor::op_name##_); \ +#define CREATE_UNARY_UFUNC_INPLACE(op_name) \ + Tensor& op_name##_sparse_csr_(Tensor& self) { \ + return unary_op_inplace(self, &Tensor::op_name##_); \ } -#define CREATE_UNARY_UFUNC(op_name) \ - CREATE_UNARY_UFUNC_OUT(op_name); \ - CREATE_UNARY_UFUNC_FUNCTIONAL(op_name); \ +#define CREATE_UNARY_UFUNC(op_name) \ + CREATE_UNARY_UFUNC_OUT(op_name); \ + CREATE_UNARY_UFUNC_FUNCTIONAL(op_name); \ CREATE_UNARY_UFUNC_INPLACE(op_name); -#define CREATE_UNARY_UFUNC_NO_INPLACE(op_name) \ - CREATE_UNARY_UFUNC_OUT(op_name); \ +#define CREATE_UNARY_UFUNC_NO_INPLACE(op_name) \ + CREATE_UNARY_UFUNC_OUT(op_name); \ CREATE_UNARY_UFUNC_FUNCTIONAL(op_name); // Exhaustive list of the unary ufuncs supported by sparse CSR @@ -309,6 +340,8 @@ CREATE_UNARY_UFUNC(tanh); CREATE_UNARY_UFUNC(trunc); CREATE_UNARY_UFUNC(conj_physical); +CREATE_UNARY_UFUNC_INPLACE(zero); + // With addition of `round.decimals` overload, using CREATE_UNARY_UFUNC leads // to unresolved overload. 
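/*
  Editor's note, an illustrative sketch that is not part of this patch: the
  CREATE_UNARY_UFUNC* macros above all rely on the same observation, namely
  that an elementwise op with a 0 -> 0 correspondence cannot turn an
  unspecified element into a specified one, so it only has to map `values`
  and can reuse the indices untouched. The struct name `CsrMatrix` and the
  helper `apply_unary` are assumptions made only for this sketch.

    #include <cmath>
    #include <cstdint>
    #include <vector>

    struct CsrMatrix {
      std::vector<int64_t> crow_indices;
      std::vector<int64_t> col_indices;
      std::vector<double> values;
    };

    template <class UnaryOp>
    CsrMatrix apply_unary(const CsrMatrix& a, UnaryOp op) {
      CsrMatrix out = a;    // sparsity pattern (indices) is reused as-is
      for (double& v : out.values) {
        v = op(v);          // only the specified elements are transformed
      }
      return out;
    }

    // e.g. apply_unary(a, [](double v) { return std::sqrt(v); });

  The in-place and out= variants generated above follow the same pattern,
  writing into self.values() or result.values() respectively.
*/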
Tensor& round_sparse_csr_out(const Tensor& self, Tensor& result) { @@ -336,8 +369,12 @@ CREATE_UNARY_UFUNC_FUNCTIONAL(isnan); CREATE_UNARY_UFUNC_FUNCTIONAL(isinf); template -void addmm_out_sparse_csr_native_cpu(const Tensor& sparse, const Tensor& dense, const Tensor& r, Scalar alpha, Scalar beta) { - +void addmm_out_sparse_csr_native_cpu( + const Tensor& sparse, + const Tensor& dense, + const Tensor& r, + Scalar alpha, + Scalar beta) { auto dim_i = sparse.size(0); auto dim_k = dense.size(1); @@ -347,41 +384,46 @@ void addmm_out_sparse_csr_native_cpu(const Tensor& sparse, const Tensor& dense, scalar_t cast_alpha = alpha.to(); r.mul_(beta); - AT_DISPATCH_INDEX_TYPES(col_indices.scalar_type(), "csr_mm_crow_indices", [&]() { - auto csr_accessor = csr.accessor(); - auto col_indices_accessor = col_indices.accessor(); - - auto values_accessor = values.accessor(); - scalar_t* dense_ptr = dense.data_ptr(); - scalar_t* r_ptr = r.data_ptr(); - - int64_t dense_stride0 = dense.stride(0); - int64_t dense_stride1 = dense.stride(1); - int64_t r_stride0 = r.stride(0); - int64_t r_stride1 = r.stride(1); - - at::parallel_for( - 0, - dim_i, - internal::GRAIN_SIZE, - [&](int64_t irow_start, int64_t irow_end) { - for (index_t h = irow_start; h < irow_end; ++h) { - index_t i_start = csr_accessor[h]; - index_t i_end = csr_accessor[h+1]; - for (index_t i = i_start; i < i_end; i++) { - scalar_t val = values_accessor[i]; - index_t col = col_indices_accessor[i]; - at::native::cpublas::axpy(dim_k, - cast_alpha * val, - dense_ptr + col * dense_stride0, dense_stride1, - r_ptr + h * r_stride0, r_stride1); + AT_DISPATCH_INDEX_TYPES( + col_indices.scalar_type(), "csr_mm_crow_indices", [&]() { + auto csr_accessor = csr.accessor(); + auto col_indices_accessor = col_indices.accessor(); + + auto values_accessor = values.accessor(); + scalar_t* dense_ptr = dense.data_ptr(); + scalar_t* r_ptr = r.data_ptr(); + + int64_t dense_stride0 = dense.stride(0); + int64_t dense_stride1 = dense.stride(1); + int64_t r_stride0 = r.stride(0); + int64_t r_stride1 = r.stride(1); + + at::parallel_for( + 0, + dim_i, + internal::GRAIN_SIZE, + [&](int64_t irow_start, int64_t irow_end) { + for (index_t h = irow_start; h < irow_end; ++h) { + index_t i_start = csr_accessor[h]; + index_t i_end = csr_accessor[h + 1]; + for (index_t i = i_start; i < i_end; i++) { + scalar_t val = values_accessor[i]; + index_t col = col_indices_accessor[i]; + at::native::cpublas::axpy( + dim_k, + cast_alpha * val, + dense_ptr + col * dense_stride0, + dense_stride1, + r_ptr + h * r_stride0, + r_stride1); + } } - } - }); - }); + }); + }); } // Functions for matrix multiplication. +// result = beta * self + alpha (mat1 @ mat2) Tensor& addmm_out_sparse_csr_cpu( const Tensor& self, const Tensor& mat1, @@ -389,62 +431,61 @@ Tensor& addmm_out_sparse_csr_cpu( const Scalar& beta, const Scalar& alpha, Tensor& result) { - TORCH_INTERNAL_ASSERT(mat1.is_sparse_csr()); - // TODO: remove this, there are no codegenerated checks for devices yet - TORCH_CHECK( - !self.is_cuda(), - "Expected all tensors to be on the same device. addmm expected 't' to be CPU tensor, but got CUDA tensor"); - TORCH_CHECK( - !result.is_cuda(), - "Expected all tensors to be on the same device. addmm: expected 'out' to be CPU tensor, but got CUDA tensor"); - TORCH_CHECK( - !mat1.is_cuda(), - "Expected all tensors to be on the same device. addmm: expected 'mat1' to be a CPU tensor, but got a CUDA tensor"); - TORCH_CHECK( - !mat2.is_cuda(), - "Expected all tensors to be on the same device. 
addmm: expected 'mat2' to be a CPU tensor, but got a CUDA tensor"); + sparse::impl::_check_is_cpu(self, "self"); + sparse::impl::_check_is_cpu(mat1, "mat1"); + sparse::impl::_check_is_cpu(mat2, "mat2"); + sparse::impl::_check_is_cpu(result, "result"); - // All the checks are from addmm_out_cuda_impl (ATen/native/cuda/Blas.cpp) and TORCH_META_FUNC(addmm) (ATen/native/LinearAlgebra.cpp) + // All the checks are from addmm_out_cuda_impl (ATen/native/cuda/Blas.cpp) and + // TORCH_META_FUNC(addmm) (ATen/native/LinearAlgebra.cpp) // TODO: remove code duplication and unify code - TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor"); - TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor"); + sparse::impl::_check_dim(mat1, 2, "mat1"); + sparse::impl::_check_dim(mat2, 2, "mat2"); + TORCH_CHECK( - mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (", - mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")"); - - IntArrayRef mat1_sizes = mat1.sizes(); - IntArrayRef mat2_sizes = mat2.sizes(); - IntArrayRef self__sizes; - c10::MaybeOwned self_; - if (&result != &self && self.layout() == kStrided) { - self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm"); - self__sizes = self_->sizes(); + mat1.size(1) == mat2.size(0), "mat1 and mat2 shapes cannot be multiplied (", + mat1.size(0), "x", mat1.size(1), " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")"); + + c10::MaybeOwned self_; + // Don't expand self if this is an in-place operation + if (&result == &self) { + self_ = c10::MaybeOwned::borrowed(self); } else { - self_ = c10::MaybeOwned::borrowed(self); - self__sizes = self_->sizes(); + self_ = expand_size(self, {mat1.size(0), mat2.size(1)}, "addmm"); } - TORCH_CHECK(((self_->dim() == 2) && (self_->sizes()[0] == mat1.sizes()[0]) && (self_->sizes()[1] == mat2.sizes()[1])), - "The input tensor must be a matrix with size ", mat1.sizes()[0], "x", mat2.sizes()[1], ", but got a ", self_->dim(), - "-D tensor with size ", self__sizes[0], "x", self__sizes[1]); + + TORCH_CHECK(((self_->dim() == 2) && + (self_->size(0) == mat1.size(0)) && + (self_->size(1) == mat2.size(1))), + "The input tensor must be a matrix with size ", + mat1.size(0), + "x", + mat2.size(1), + ", but got a ", + self_->dim(), + "-D tensor with size ", + self_->size(0), + "x", + self_->size(1)); if (&result != &self) { if (result.layout() == kStrided) { - at::native::resize_output(result, self__sizes); + at::native::resize_output(result, self_->sizes()); } else { - at::native::resize_as_sparse_csr_(result, *self_); + result.resize_as_sparse_(*self_); } result.copy_(*self_); } - IntArrayRef result_sizes = result.sizes(); - if ((result_sizes[0] == 0) || (result_sizes[1] == 0)) { + if (result.numel() == 0) { return result; } - if (mat1._nnz() == 0 && mat2.layout() == kStrided) { - // According to docs, when beta==0 values in self should be ignored. nans and infs should not propagate + if (sparse::impl::_is_sparse_and_zero(mat1) || sparse::impl::_is_sparse_and_zero(mat2)) { + // According to docs, when beta==0 values in self should be ignored. + // nans and infs should not propagate if (beta.toComplexDouble() == 0.) { result.zero_(); } else { @@ -453,26 +494,19 @@ Tensor& addmm_out_sparse_csr_cpu( return result; } - if (mat2.is_sparse_csr() && (mat1._nnz() == 0 || mat2._nnz() == 0)) { - if (beta.toComplexDouble() == 0.) 
{ - result.values().zero_(); - } else { - result.values().mul_(beta); - } - return result; - } - #if !AT_USE_MKL_SPARSE() - if (mat2.is_sparse_csr() && result.is_sparse_csr()) { - TORCH_CHECK( - false, - "Calling addmm on sparse CPU tensors requires Linux platform. ", - "Please use PyTorch built with MKL on Linux."); - } - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(result.layout() == kStrided); - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(result.scalar_type(), "addmm_sparse_dense", [&] { - addmm_out_sparse_csr_native_cpu(mat1, mat2, result, alpha, beta); - }); + TORCH_CHECK( + (mat1.is_sparse_csr() || + (mat2.is_sparse_csr() && result.is_sparse_csr())), + false, + "Calling addmm on sparse CPU tensors requires Linux platform. ", + "Please use PyTorch built with MKL on Linux."); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(result.layout() == kStrided); + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES( + result.scalar_type(), "addmm_sparse_dense", [&] { + addmm_out_sparse_csr_native_cpu( + mat1, mat2, result, alpha, beta); + }); #else sparse::impl::mkl::addmm_out_sparse_csr(mat1, mat2, beta, alpha, result); #endif @@ -504,17 +538,36 @@ Tensor& _sparse_csr_mm_out( return at::addmm_out(result, zero, mat1, mat2, 0.0, 1.0); } -Tensor _sparse_csr_mm( - const Tensor& mat1, - const Tensor& mat2) { - Tensor zero; +Tensor _sparse_csr_mm(const Tensor& mat1, const Tensor& mat2) { if (mat1.is_sparse_csr() && mat2.is_sparse_csr()) { + // Return sparse // TODO: replace with at::zeros when it's implemented for sparse csr - zero = at::empty({mat1.size(0), mat2.size(1)}, mat2.options()); - } else { - zero = at::zeros({mat1.size(0), mat2.size(1)}, mat2.options()); + return at::addmm( + at::empty({mat1.size(0), mat2.size(1)}, mat2.options()), + mat1, + mat2, + 0.0, + 1.0); + } + if (mat1.is_sparse_csr() && mat2.layout() == c10::kStrided) { + // Return dense + return at::addmm( + at::zeros({mat1.size(0), mat2.size(1)}, mat2.options()), + mat1, + mat2, + 0.0, + 1.0); + } + if (mat1.layout() == c10::kStrided && mat2.is_sparse_csr()) { + // Return dense + return at::addmm( + at::zeros({mat1.size(0), mat2.size(1)}, mat1.options()), + mat1, + mat2, + 0.0, + 1.0); } - return at::addmm(zero, mat1, mat2, 0.0, 1.0); + TORCH_INTERNAL_ASSERT(false, "Shouldn't get here. Please open an issue."); } Tensor _sparse_csr_addmm( @@ -530,14 +583,20 @@ Tensor _sparse_csr_addmm( } // Functions for element-wise addition. -Tensor add_sparse_csr(const Tensor& self, const Tensor& other, const Scalar& alpha) { +Tensor add_sparse_csr( + const Tensor& self, + const Tensor& other, + const Scalar& alpha) { auto commonDtype = at::result_type(self, other); alpha_check(commonDtype, alpha); Tensor result = at::empty({0, 0}, self.options().dtype(commonDtype)); return at::add_out(result, self, other, alpha); // redispatch! } -Tensor& add_sparse_csr_(Tensor& self, const Tensor& other, const Scalar& alpha) { +Tensor& add_sparse_csr_( + Tensor& self, + const Tensor& other, + const Scalar& alpha) { return at::add_out(self, self, other, alpha); // redispatch! 
} @@ -581,13 +640,10 @@ void add_out_dense_sparse_csr_cpu( " in add operation"); auto src_values = src.values(); - auto src_crow_indices = src.crow_indices(); - auto src_col_indices = src.col_indices(); resize_output(out, dense.sizes()); Tensor resultBuffer = out; - Tensor valuesBuffer = src_values.to(commonDtype); if (out.scalar_type() != commonDtype) { resultBuffer = dense.to(commonDtype); @@ -595,36 +651,54 @@ void add_out_dense_sparse_csr_cpu( resultBuffer.copy_(dense); } + if (src._nnz() == 0) { + return; + } + + auto valuesBuffer = src_values.to(commonDtype).view({-1, src_values.size(-1)}); + resultBuffer = resultBuffer.view({-1, out.size(-2), out.size(-1)}); + auto src_crow_indices = src.crow_indices().view({-1, src.crow_indices().size(-1)}); + auto src_col_indices = src.col_indices().view({-1, src.col_indices().size(-1)}); + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( - kHalf, kBool, kBFloat16, + kHalf, + kBool, + kBFloat16, commonDtype, "add_out_op2_sparse_csr", - [&valuesBuffer, &resultBuffer, &alpha, &src_crow_indices, &src_col_indices]() { + [&valuesBuffer, + &resultBuffer, + &alpha, + &src_crow_indices, + &src_col_indices]() { AT_DISPATCH_INDEX_TYPES( src_crow_indices.scalar_type(), "csr_add_out_crow_indices", - [&valuesBuffer, &resultBuffer, &alpha, &src_crow_indices, &src_col_indices]() { - auto values_accessor = valuesBuffer.accessor(); + [&valuesBuffer, + &resultBuffer, + &alpha, + &src_crow_indices, + &src_col_indices]() { + auto batch_count = resultBuffer.dim() > 2 ? resultBuffer.size(-3) : 1; + auto values_accessor = valuesBuffer.accessor(); scalar_t* out_ptr = resultBuffer.data_ptr(); scalar_t cast_value = alpha.to(); auto crow_indices_accessor = - src_crow_indices.accessor(); + src_crow_indices.accessor(); auto col_indices_accessor = - src_col_indices.accessor(); - auto out_strides0 = resultBuffer.strides()[0]; - auto out_strides1 = resultBuffer.strides()[1]; - - for (index_t irow = 0; irow < src_crow_indices.size(0) - 1; - ++irow) { - index_t start_index = crow_indices_accessor[irow]; - index_t end_index = crow_indices_accessor[irow + 1]; - - for (index_t i = start_index; i < end_index; ++i) { - auto icol = col_indices_accessor[i]; - auto index = resultBuffer.storage_offset() + irow * out_strides0 + - icol * out_strides1; - out_ptr[index] += cast_value * values_accessor[i]; + src_col_indices.accessor(); + auto out_strides = resultBuffer.strides(); + + for (const auto batch_idx : c10::irange(batch_count)) { + for (const auto irow : c10::irange(src_crow_indices.size(-1) - 1)) { + index_t start_index = crow_indices_accessor[batch_idx][irow]; + index_t end_index = crow_indices_accessor[batch_idx][irow + 1]; + for (const auto i : c10::irange(start_index, end_index)) { + auto icol = col_indices_accessor[batch_idx][i]; + auto index = batch_idx * out_strides[0] + irow * out_strides[1] + icol * out_strides[2]; + out_ptr[index] += cast_value * values_accessor[batch_idx][i]; + } } } }); @@ -654,32 +728,358 @@ Tensor& add_out_sparse_csr_cpu( return out; } -TORCH_IMPL_FUNC(_convert_indices_from_coo_to_csr_structured_cpu) ( - const Tensor& input, const int64_t size, const bool out_int32, const Tensor& result -) { - if (out_int32) { - AT_DISPATCH_INTEGRAL_TYPES(input.scalar_type(), "convert_indices_from_coo_to_csr_cpu", [&] { - convert_indices_from_coo_to_csr_cpu(result, input, size); - }); +/* + Reductions on sparse CSR tensors using masked semantics. + + - A CSR tensor is a 2D tensor that is specified by a 3-tuple + (crow_indices, col_indices, values). 
+ + - To support a reduction operator on a CSR tensor, define: + +template +struct Reduction...Op { + inline scalar_t operator()(const scalar_t& a, const scalar_t& b) const { + return a ... b; + } + inline scalar_t identity() const { return ...; } +}; + +Tensor _sparse_csr_..._cpu(const Tensor& input, IntArrayRef dims_to_sum, bool keepdim, c10::optional dtype) { + ... + result = reduce_sparse_csr_cpu_template(input_, dims_to_sum, keepdim, Reduction...Op()); + ... + return result; +} + + and add the following + + - func: _sparse_csr_op.dim_dtype(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + dispatch: + SparseCsrCUDA: _sparse_csr_..._cpu + + to native_functions.yaml + + Use ReductionAddOp and _sparse_csr_sum implementation as an example. + + - Since a CSR tensor dimensionality is always 2, only reductions + with keepdim=True can be supported. + +*/ + +namespace { + +template +Tensor reduce_sparse_csr_dim0_cpu_template(const Tensor& sparse, ReductionOp rop) { + /* + Consider the following sparse tensor: + + 1 * * * * + * * * 2 * + * * 3 * * + * * * * * + 4 * 5 * * + + that has CSR representation + + crow_indices = [0, 1, 2, 3, 3, 5] + col_indices = [0, 3, 2, 0, 2] + values = [1, 2, 3, 4, 5] + + Reduction with dim=0 results: + + rop(1,4) * rop(3,5) 2 * + + that has CSR representation + + new_crow_indices = [0, 3] + new_col_indices = [0, 2, 3] + new_values = [rop(1, 4], rop(3, 5), 2] + + In general, the CSR representation data can be computed as follows: + + new_col_indices, col_map = col_indices.unique(sorted=True, return_inverse=True) + nnz = new_col_indices.numel() + new_crow_indices = [0, nnz] + new_values.resize(nnz); new_values.fill_(identity) + for i in range(col_indices.numel()): + new_values[col_map[i]] = rop(new_values[col_map[i], values[i]) + */ + + Tensor col_indices = sparse.col_indices(); + Tensor values = sparse.values(); + auto numel = values.numel(); + Tensor new_col_indices; + Tensor columns_map; + + /* + Calling at::_unique constitutes the main bottleneck of this + function. However, it is still about 5x faster than using the + invariant: + csr.sum(dim=0) == csr.transpose(0, 1).sum(dim=1) + */ + std::tie(new_col_indices, columns_map) = at::_unique(col_indices, true, true); + auto nnz = new_col_indices.numel(); + + Tensor new_crow_indices = at::empty({2}, col_indices.options()); + new_crow_indices[0] = 0; + new_crow_indices[1] = nnz; + + Tensor new_values = at::empty({nnz}, values.options()); + new_values.fill_(rop.identity()); + + AT_DISPATCH_INDEX_TYPES(col_indices.scalar_type(), "reduce_sparse_csr_dim0_cpu_indices", + [&]() { + index_t* columns_map_ptr = columns_map.data_ptr(); + scalar_t* values_ptr = values.data_ptr(); + scalar_t* new_values_ptr = new_values.data_ptr(); + + // There is no point in parallelizing the following for-loop + // because about 99.3% of the computation time is spent in the + // at::_unique call above. 
+ for (int64_t i=0; i +Tensor reduce_sparse_csr_dim1_cpu_template(const Tensor& sparse, ReductionOp rop) { + /* + Consider the following sparse tensor: + + 1 * * * * + * * * 2 * + * * 3 * * + * * * * * + 4 * 5 * * + + that has CSR representation + + crow_indices = [0, 1, 2, 3, 3, 5] + col_indices = [0, 3, 2, 0, 2] + values = [1, 2, 3, 4, 5] + + Reduction with dim=1 results: + + 1 + 2 + 3 + * + rop(4, 5) + + that has CSR representation + + new_crow_indices = [0, 1, 2, 3, 3, 4] + new_col_indices = [0, 0, 0, 0] + new_values = [1, 2, 3, rop(4, 5)] + + In general, the result CSR data can be computed as follows: + + new_crow_indices = [0] + for i in range(1, nrows+1): + new_crow_indices[i] = new_crow_indices[i-1] + (crow_indices[i] == crow_indices[i-1]) + nnz = new_crow_indices[-1] + new_col_indices = zeros(nnz) + new_values.resize(nnz) + j = -1 + for i in range(1, nrows+1): + if crow_indices[i] == crow_indices[i-1]: + continue + j += 1 + new_values[j] = rop(values[crow_indices[i] : crow_indices[i-1]]) + */ + + Tensor crow_indices = sparse.crow_indices(); + auto ioptions = crow_indices.options(); + Tensor values = sparse.values(); + auto nrows = sparse.size(0); + + Tensor new_crow_indices = at::empty({crow_indices.numel()}, ioptions); + Tensor new_col_indices = at::empty({}, ioptions); + Tensor new_values = at::empty({}, values.options()); + Tensor row_map = at::empty({nrows}, ioptions); + + AT_DISPATCH_INDEX_TYPES(crow_indices.scalar_type(), "reduce_sparse_csr_dim1_cpu_indices", + [&]() { + index_t* crow_indices_ptr = crow_indices.data_ptr(); + index_t* new_crow_indices_ptr = new_crow_indices.data_ptr(); + index_t* row_map_ptr = row_map.data_ptr(); + int64_t nnz = 0; + new_crow_indices_ptr[0] = 0; + for(int64_t i=0; i(); + scalar_t* new_values_ptr = new_values.data_ptr(); + + at::parallel_for( + 0, + nrows, + internal::GRAIN_SIZE, + [&](int64_t irow_start, int64_t irow_end) { + index_t i_end = crow_indices_ptr[irow_start]; + for (index_t h = irow_start; h < irow_end; ++h) { + index_t i_start = i_end; + i_end = crow_indices_ptr[h+1]; + if (i_start != i_end) { + scalar_t res = values_ptr[i_start]; + for (index_t i = i_start + 1; i < i_end; i++) { + res = rop(res, values_ptr[i]); + } + new_values_ptr[row_map_ptr[h]] = res; + } + } + }); + }); + + return at::native::_sparse_csr_tensor_unsafe(new_crow_indices, new_col_indices, new_values, + {sparse.size(0), 1}, + new_values.scalar_type(), + sparse.layout(), + new_values.device()); +} + +template +Tensor reduce_sparse_csr_dim01_cpu_template(const Tensor& sparse, ReductionOp rop) { + + auto ioptions = sparse.col_indices().options(); + Tensor values = sparse.values(); + auto numel = values.numel(); + auto nnz = std::min(1, numel); + + /* TODO: we can likely do about 3x better than parallel_reduce: + +In [2]: t=torch.randn(5000, 5000).to_sparse_csr() + +In [3]: %timeit torch._sparse_csr_sum(t, dim=(0, 1), keepdim=True) +3.39 ms ± 898 ns per loop (mean ± std. dev. of 7 runs, 100 loops each) + +In [4]: %timeit torch.sum(t.values()) +1.07 ms ± 291 ns per loop (mean ± std. dev. 
of 7 runs, 1000 loops each) + */ + scalar_t* values_ptr = values.data_ptr(); + scalar_t value = at::parallel_reduce( + 0, + numel, + internal::GRAIN_SIZE, + rop.identity(), + [&](int64_t i_start, int64_t i_end, scalar_t identity) { + scalar_t res = identity; + for (int64_t i=i_start; i{0, nnz}, ioptions); + Tensor new_values; + if (numel > 0) { + new_values = at::empty({1}, values.options()); + new_values.fill_(value); } else { - AT_DISPATCH_INTEGRAL_TYPES(input.scalar_type(), "convert_indices_from_coo_to_csr_cpu", [&] { - convert_indices_from_coo_to_csr_cpu(result, input, size); - }); + new_values = at::empty({}, values.options()); + } + return at::native::_sparse_csr_tensor_unsafe(new_crow_indices, new_col_indices, new_values, + {1, std::min(1, sparse.size(1))}, + new_values.scalar_type(), + sparse.layout(), + new_values.device()); +} + +template +Tensor reduce_sparse_csr_cpu_template(const Tensor& sparse, std::vector dims, ReductionOp rop) { + if (dims.size() == 1) { + if (dims[0] == 0) { + return reduce_sparse_csr_dim0_cpu_template(sparse, rop); + } else { + TORCH_INTERNAL_ASSERT(dims[0] == 1); + return reduce_sparse_csr_dim1_cpu_template(sparse, rop); + } + } else if (dims.size() == 2) { + TORCH_INTERNAL_ASSERT(((dims[0] == 0 && dims[1] == 1) || (dims[0] == 1 && dims[1] == 0))); + return reduce_sparse_csr_dim01_cpu_template(sparse, rop); + } + TORCH_INTERNAL_ASSERT(dims.size() == 0); + // effective after gh-29137 has been resolved + return sparse.clone(); +} + +template +Tensor reduce_sparse_csr_cpu_template(const Tensor& sparse, IntArrayRef dims_to_sum, bool keepdim, ReductionOp rop) { + TORCH_INTERNAL_ASSERT(sparse.is_sparse_csr()); + TORCH_CHECK(keepdim, "reduction operations on CSR tensors with keepdim=False is unsupported"); + TORCH_INTERNAL_ASSERT(sparse.device() == kCPU); + + const int64_t input_dim = sparse.dim(); + TORCH_INTERNAL_ASSERT(input_dim == 2); + auto dims = dims_to_sum.vec(); + maybe_wrap_dims(dims, input_dim); + if (dims.size() == 0) { + // after gh-29137 is resolved, delete this if-block + dims.emplace_back(0); + dims.emplace_back(1); } + return reduce_sparse_csr_cpu_template(sparse, dims, rop); } -TORCH_IMPL_FUNC(_convert_indices_from_csr_to_coo_structured_cpu) ( - const Tensor& crow_indices, const Tensor& col_indices, const bool out_int32, const bool transpose, const Tensor& result -) { - if (out_int32) { - AT_DISPATCH_INTEGRAL_TYPES(crow_indices.scalar_type(), "convert_indices_from_csr_to_coo_cpu", [&] { - convert_indices_from_csr_to_coo_cpu(result, crow_indices, col_indices, transpose); +template +struct ReductionAddOp { + inline scalar_t operator()(const scalar_t& a, const scalar_t& b) const { + return a + b; + } + inline scalar_t identity() const { return 0; } +}; + +template +struct ReductionMulOp { + inline scalar_t operator()(const scalar_t& a, const scalar_t& b) const { + return a * b; + } + inline scalar_t identity() const { return 1; } +}; + +} // namespace + +Tensor _sparse_csr_sum_cpu(const Tensor& input, IntArrayRef dims_to_sum, bool keepdim, c10::optional dtype) { + ScalarType dtype_ = dtype.value_or(input.scalar_type()); + Tensor input_ = input.to(dtype_); + Tensor result; + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( + kHalf, kBFloat16, input_.scalar_type(), "_sparse_csr_sum_cpu", + [&] { + result = reduce_sparse_csr_cpu_template(input_, dims_to_sum, keepdim, ReductionAddOp()); }); - } else { - AT_DISPATCH_INTEGRAL_TYPES(crow_indices.scalar_type(), "convert_indices_from_csr_to_coo_cpu", [&] { - convert_indices_from_csr_to_coo_cpu(result, 
crow_indices, col_indices, transpose); + return result; +} + +Tensor _sparse_csr_prod_cpu(const Tensor& input, IntArrayRef dims_to_reduce, bool keepdim, c10::optional dtype) { + ScalarType dtype_ = dtype.value_or(input.scalar_type()); + Tensor input_ = input.to(dtype_); + Tensor result; + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( + kHalf, kBFloat16, input_.scalar_type(), "_sparse_csr_prod_cpu", + [&] { + result = reduce_sparse_csr_cpu_template(input_, dims_to_reduce, keepdim, ReductionMulOp()); }); - } + return result; } } // namespace native diff --git a/aten/src/ATen/native/sparse/SparseCsrTensorMath.h b/aten/src/ATen/native/sparse/SparseCsrTensorMath.h new file mode 100644 index 000000000000..a92added5f01 --- /dev/null +++ b/aten/src/ATen/native/sparse/SparseCsrTensorMath.h @@ -0,0 +1,65 @@ +#pragma once + +#include +#include + +namespace at { +namespace native { +namespace sparse { +namespace impl { + +// Returns true if all entries of self are zero +// TODO: This has potential to be a generic helper +inline bool _is_sparse_and_zero(const Tensor& self) { + if (self.layout() == kSparse || self.layout() == kSparseCsr || + self.layout() == kSparseCsc || self.layout() == kSparseBsr || + self.layout() == kSparseBsc) { + if (self._nnz() == 0) { + return true; + } + } + return false; +} + +inline void _check_is_cpu(const Tensor& self, c10::string_view name) { + TORCH_CHECK( + self.is_cpu(), + "Expected all tensors to be on the same device. addmm expected '", + name, + "' to be CPU tensor, but got ", + self.device(), + " tensor"); +} + +inline void _check_is_cuda(const Tensor& self, c10::string_view name) { + TORCH_CHECK( + self.is_cuda(), + "Expected all tensors to be on the same device. addmm expected '", + name, + "' to be CUDA tensor, but got ", + self.device(), + " tensor"); +} + +inline void _check_dim(const Tensor& self, int64_t target_dim, c10::string_view name) { + if (target_dim == 2) { + TORCH_CHECK( + self.dim() == target_dim, + name, " must be a matrix, ", + "got ", self.dim(), "-D tensor"); + } + TORCH_CHECK( + self.dim() == target_dim, + "Expected ", + name, + " to be of dimension ", + target_dim, + " but got ", + self.dim(), + " instead."); +} + +} +} +} +} diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index 814acad4c7f6..784aa4f4a64d 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -344,7 +345,7 @@ void _validate_sparse_coo_tensor_args( Tensor max_indices = std::get(indices.max(/* dim */ 1, /* keepdim */ false)); Tensor cpu_min_indices, cpu_max_indices; - if (indices.is_cuda()) { + if (!indices.is_cpu()) { cpu_min_indices = min_indices.to(at::DeviceType::CPU); cpu_max_indices = max_indices.to(at::DeviceType::CPU); } else { @@ -545,15 +546,6 @@ SparseTensor dense_to_sparse(const Tensor& self, int64_t sparse_dim) { // NB: Dropped the resizeNd variants -Tensor sparse_to_dense( - const SparseTensor& self, - c10::optional dtype) { - TORCH_CHECK( - !dtype.has_value(), "dtype argument is not supported by sparse_to_dense"); - Tensor dst = at::zeros(self.sizes(), self.options().layout(kStrided)); - return dst.add_(self); -} - SparseTensor& copy_sparse_wrapper_( Tensor& self, const Tensor& src, @@ -640,7 +632,8 @@ SparseTensor _coalesce_sparse_cpu(const SparseTensor& self) { auto indicesBufferAccessor = indicesBuffer.accessor(); int64_t i = -1; - 
AT_DISPATCH_ALL_TYPES_AND_COMPLEX(values.scalar_type(), "coalesce", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::BFloat16, at::ScalarType::Half, at::ScalarType::Bool, values.scalar_type(), + "coalesce", [&] { int64_t prev = -1; int64_t blockSize = values.stride(0); scalar_t* values_ptr = values.data_ptr(); @@ -769,7 +762,7 @@ SparseTensor& sparse_mask_out_cpu( // TODO: Re-audit this; it used to be an indexSelect directly into r_values at::index_select_out(r_values, t_view, 0, indices); } else { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX(r_values.scalar_type(), "sparse_mask", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(at::ScalarType::Half, r_values.scalar_type(), "sparse_mask", [&] { sparse_mask_out_cpu_kernel( r_values, t, r_nnz, sparse_dim, mask_indices); }); diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index 611154fdee20..6963f60eaa22 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -676,7 +677,7 @@ Tensor& add_out_dense_sparse_cpu(Tensor& r, const Tensor& dense, const SparseTen dstBuffer.add_(srcBuffer, value); } } else { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(at::ScalarType::Bool, + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::Bool, at::ScalarType::BFloat16, at::ScalarType::Half, commonDtype, "add_dense_sparse", [&] { add_dense_sparse_worker_cpu(resultBuffer, value, sparse, indices, valuesBuffer); }); @@ -706,6 +707,42 @@ Tensor& mul_sparse_(Tensor& self, const Tensor& other) { return at::mul_out(self, self, other); // redispatch! } +Tensor& mul_out_sparse_csr(const Tensor& t_, const Tensor& src_, Tensor& r) { + // // TODO: Use a specialized CSR kernel for performance if needed + if (t_.is_sparse_csr() && src_.layout() == kStrided) { + return mul_out_sparse_csr(t_, src_.sparse_mask(t_), r); + } + if (t_.layout() == kStrided && src_.is_sparse_csr()) { + return mul_out_sparse_csr(t_.sparse_mask(src_), src_, r); + } + TORCH_CHECK(r.is_sparse_csr(), "Expected result Tensor to be of format CSR"); + Tensor t = t_.to_sparse(); + Tensor src = src_.to_sparse(); + Tensor tmp_result = t.mul(src); + auto r_sparse_csr = tmp_result.to_sparse_csr(); + r.resize_as_sparse_(r_sparse_csr); + r.copy_(r_sparse_csr); + return r; +} + +Tensor mul_sparse_csr(const Tensor& self, const Tensor& other) { + auto commonDtype = at::result_type(self, other); + if (self.is_sparse_csr() && other.layout() == kStrided) { + return mul_sparse_csr(self, other.sparse_mask(self)); + } + if (self.layout() == kStrided && other.is_sparse_csr()) { + return mul_sparse_csr(self.sparse_mask(other), other); + } + auto result_options = self.options().dtype(commonDtype); + // CSR is 2d! + Tensor result = at::empty({0, 0}, result_options); + return at::mul_out(result, self, other); // redispatch! +} + +Tensor& mul_sparse_csr_(Tensor& self, const Tensor& other) { + return at::mul_out(self, self, other); // redispatch! 
+} + SparseTensor& mul_out_sparse_cpu(const Tensor& t_, const Tensor& src_, SparseTensor& r) { if (src_.dim() == 0) { return mul_out_sparse_zerodim(r, t_, src_); @@ -781,7 +818,7 @@ SparseTensor& mul_out_sparse_cpu(const Tensor& t_, const Tensor& src_, SparseTen s_i++; } } else { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX( + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(at::ScalarType::BFloat16, at::ScalarType::Half, commonDtype, "mul_out_sparse", [&] { auto r_accessor = r_buffer.accessor(); auto t_accessor = t_values.accessor(); @@ -866,10 +903,22 @@ Tensor& s_addmm_out_sparse_dense_cpu( const Scalar& alpha ) { // TODO: This error message seems awfully opaque - TORCH_CHECK(!t.is_cuda(), "Expected all tensors to be on the same device. addmm expected 't' to be CPU tensor, but got CUDA tensor"); - TORCH_CHECK(!r.is_cuda(), "Expected all tensors to be on the same device. addmm: expected 'out' to be CPU tensor, but got CUDA tensor"); - TORCH_CHECK(!sparse_.is_cuda(), "Expected all tensors to be on the same device. addmm: expected 'mat1' to be a CPU tensor, but got a CUDA tensor"); - TORCH_CHECK(!dense.is_cuda(), "Expected all tensors to be on the same device. addmm: expected 'mat2' to be a CPU tensor, but got a CUDA tensor"); + TORCH_CHECK( + t.is_cpu(), + "Expected all tensors to be on the same device. addmm expected 't' to be CPU tensor, but got tensor on ", + t.device()); + TORCH_CHECK( + r.is_cpu(), + "Expected all tensors to be on the same device. addmm: expected 'out' to be CPU tensor, but got tensor on ", + t.device()); + TORCH_CHECK( + sparse_.is_cpu(), + "Expected all tensors to be on the same device. addmm: expected 'mat1' to be a CPU tensor, but got tensor on ", + t.device()); + TORCH_CHECK( + dense.is_cpu(), + "Expected all tensors to be on the same device. addmm: expected 'mat2' to be a CPU tensor, but got tensor on ", + t.device()); TORCH_CHECK(sparse_.sparse_dim() == 2, "addmm: matrices expected, got ", sparse_.sparse_dim(), "D tensor"); TORCH_CHECK(sparse_.dense_dim() == 0, "addmm: scalar values expected, got ", sparse_.dense_dim(), "D values"); @@ -969,11 +1018,14 @@ Tensor _sparse_addmm( } Tensor _sparse_mm( - const SparseTensor& sparse, - const Tensor& dense + const Tensor& mat1, + const Tensor& mat2 ) { - Tensor t = at::zeros({}, dense.options()); - return at::_sparse_addmm(t, sparse, dense, 0, 1); // redispatch! 
+ if (mat1.is_sparse() && mat2.is_sparse()) { + return at::_sparse_sparse_matmul(mat1, mat2); + } + Tensor t = at::zeros({mat1.size(-2), mat2.size(-1)}, mat2.options()); + return at::_sparse_addmm(t, mat1, mat2, 0, 1); } // NB: Despite its suggestive name, this actually only exists so that @@ -1492,11 +1544,14 @@ scalar_t binary_search_strided_rightmost(scalar_t search_val, TensorAccessor::max(); bool done_searching = false; while (!done_searching) { - mid_ind = (left_ind+right_ind) >> 1; + mid_ind = left_ind + (right_ind - left_ind) / 2; scalar_t mid_val = sorted_arr_accessor[sorted_arr_begin_idx + mid_ind]; if (mid_val > search_val) { diff --git a/aten/src/ATen/native/sparse/cuda/SparseBlas.cpp b/aten/src/ATen/native/sparse/cuda/SparseBlas.cpp index 6a8b7253fbfc..0bfde528cb0e 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseBlas.cpp +++ b/aten/src/ATen/native/sparse/cuda/SparseBlas.cpp @@ -1,8 +1,11 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include +#include #include #include +#include +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -40,46 +43,15 @@ Tensor& sparse_sampled_addmm_out_sparse_csr_cuda( const Scalar& beta, const Scalar& alpha, Tensor& result) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self.is_sparse_csr()); - - TORCH_CHECK(mat1.layout() == kStrided, "sampled_addmm: Expected mat1 to have strided layout, but got ", mat1.layout()); - TORCH_CHECK(mat2.layout() == kStrided, "sampled_addmm: Expected mat2 to have strided layout, but got ", mat2.layout()); - - TORCH_CHECK(result.layout() == kSparseCsr, "sampled_addmm: Expected result to have sparse csr layout, but got ", result.layout()); - - TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "sampled_addmm: Expected mat1 and mat2 to have the same dtype, but got ", mat1.scalar_type(), " and ", mat2.scalar_type()); - TORCH_CHECK(mat1.scalar_type() == self.scalar_type(), "sampled_addmm: Expected mat1 and self to have the same dtype, but got ", mat1.scalar_type(), " and ", self.scalar_type()); - TORCH_CHECK(result.scalar_type() == self.scalar_type(), "sampled_addmm: Expected result and self to have the same dtype, but got ", result.scalar_type(), " and ", self.scalar_type()); - - TORCH_CHECK( - mat1.dim() == 2, "sampled_addmm: Expected mat1 to be a matrix, got ", mat1.dim(), "-D tensor"); - TORCH_CHECK( - mat2.dim() == 2, "sampled_addmm: Expected mat2 to be a matrix, got ", mat2.dim(), "-D tensor"); - TORCH_CHECK( - result.dim() == 2, "sampled_addmm: Expected result to be a matrix, got ", result.dim(), "-D tensor"); - - IntArrayRef mat1_sizes = mat1.sizes(); - IntArrayRef mat2_sizes = mat2.sizes(); - TORCH_CHECK( - mat1_sizes[1] == mat2_sizes[0], - "sampled_addmm: mat1 and mat2 shapes cannot be multiplied (", - mat1_sizes[0], - "x", - mat1_sizes[1], - " and ", - mat2_sizes[0], - "x", - mat2_sizes[1], - ")"); - - IntArrayRef self_sizes = self.sizes(); - TORCH_CHECK( - self_sizes[0] == mat1_sizes[0], "sampled_addmm: self dim 0 must match mat1 dim 0"); - TORCH_CHECK( - self_sizes[1] == mat2_sizes[1], "sampled_addmm: self dim 1 must match mat2 dim 1"); + at::native::sparse::sparse_sampled_addmm_check_inputs( + self, mat1, mat2, beta, alpha, result); if (&result != &self) { - at::native::resize_as_sparse_csr_(result, self); + // We allow self to be a single matrix when mat1 and mat2 are batched + auto result_sizes = DimVector(mat1.sizes().slice(0, mat1.dim() - 2)); + result_sizes.push_back(self.size(-2)); + result_sizes.push_back(self.size(-1)); + at::sparse_csr::get_sparse_csr_impl(result)->resize_(self._nnz(), 
result_sizes); result.copy_(self); } @@ -103,6 +75,7 @@ Tensor sparse_sampled_addmm_sparse_csr_cuda( return result; } +// result = beta * self + alpha * (mat1 @ mat2) Tensor& addmm_out_sparse_csr_cuda( const Tensor& self, const Tensor& mat1, @@ -110,65 +83,63 @@ Tensor& addmm_out_sparse_csr_cuda( const Scalar& beta, const Scalar& alpha, Tensor& result) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(mat1.is_sparse_csr()); + sparse::impl::_check_is_cuda(self, "self"); + sparse::impl::_check_is_cuda(mat1, "mat1"); + sparse::impl::_check_is_cuda(mat2, "mat2"); + sparse::impl::_check_is_cuda(result, "result"); // Same checks as in TORCH_META_FUNC(addmm) at // aten/src/ATen/native/LinearAlgebra.cpp - TORCH_CHECK( - mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor"); - TORCH_CHECK( - mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor"); + sparse::impl::_check_dim(mat1, 2, "mat1"); + sparse::impl::_check_dim(mat2, 2, "mat2"); - IntArrayRef mat1_sizes = mat1.sizes(); - IntArrayRef mat2_sizes = mat2.sizes(); TORCH_CHECK( - mat1_sizes[1] == mat2_sizes[0], - "mat1 and mat2 shapes cannot be multiplied (", - mat1_sizes[0], - "x", - mat1_sizes[1], - " and ", - mat2_sizes[0], - "x", - mat2_sizes[1], - ")"); + mat1.size(1) == mat2.size(0), "mat1 and mat2 shapes cannot be multiplied (", + mat1.size(0), "x", mat1.size(1), " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")"); // From addmm_out_cuda_impl at ATen/native/cuda/Blas.cpp // TODO: remove code duplication and unify code // There were undefined symbol problems, // when using the same function for CUDA and SparseCsrCUDA dispatch keys // Also structured kernels do not support sparse output - IntArrayRef self__sizes; - c10::MaybeOwned self_; - if (&result != &self && self.layout() == kStrided) { - self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm"); - self__sizes = self_->sizes(); + c10::MaybeOwned self_; + // Don't expand self if this is an in-place operation + if (&result == &self) { + self_ = c10::MaybeOwned::borrowed(self); } else { - self_ = c10::MaybeOwned::borrowed(self); - self__sizes = self_->sizes(); - TORCH_CHECK(result.dim() == 2, "tensors must be 2-D"); - TORCH_CHECK( - self__sizes[0] == mat1_sizes[0], "self_ dim 0 must match mat1 dim 0"); - TORCH_CHECK( - self__sizes[1] == mat2_sizes[1], "self_ dim 1 must match mat2 dim 1"); + self_ = expand_size(self, {mat1.size(0), mat2.size(1)}, "addmm"); } + sparse::impl::_check_dim(*self_, 2, "self"); + TORCH_CHECK(((self_->dim() == 2) && + (self_->size(0) == mat1.size(0)) && + (self_->size(1) == mat2.size(1))), + "The input tensor must be a matrix with size ", + mat1.size(0), + "x", + mat2.size(1), + ", but got a ", + self_->dim(), + "-D tensor with size ", + self_->size(0), + "x", + self_->size(1)); + if (&result != &self) { if (result.layout() == kStrided) { - at::native::resize_output(result, self__sizes); + at::native::resize_output(result, self_->sizes()); } else { - at::native::resize_as_sparse_csr_(result, *self_); + result.resize_as_sparse_(*self_); } result.copy_(*self_); } - IntArrayRef result_sizes = result.sizes(); - if ((result_sizes[0] == 0) || (result_sizes[1] == 0)) { + if (result.numel() == 0) { return result; } - if (mat1._nnz() == 0 && mat2.layout() == kStrided) { - // According to docs, when beta==0 values in self should be ignored + if (sparse::impl::_is_sparse_and_zero(mat1) || sparse::impl::_is_sparse_and_zero(mat2)) { + // According to docs, when beta==0 values in self should be ignored. 
// nans and infs should not propagate if (beta.toComplexDouble() == 0.) { result.zero_(); @@ -178,15 +149,6 @@ Tensor& addmm_out_sparse_csr_cuda( return result; } - if (mat2.is_sparse_csr() && (mat1._nnz() == 0 || mat2._nnz() == 0)) { - if (beta.toComplexDouble() == 0.) { - result.values().zero_(); - } else { - result.values().mul_(beta); - } - return result; - } - sparse::impl::cuda::addmm_out_sparse_csr(mat1, mat2, beta, alpha, result); return result; } @@ -240,7 +202,7 @@ Tensor& addmv_out_sparse_csr_cuda( const Scalar& beta, const Scalar& alpha, Tensor& result) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(mat.is_sparse_csr()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(mat.layout() == kSparseCsr || mat.layout() == kSparseBsr); TORCH_CHECK(mat.dim() == 2, "addmv: Expected mat to be 2-D"); TORCH_CHECK(vec.dim() == 1, "addmv: Expected vec to be 1-D"); diff --git a/aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp b/aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp index 7eab11060e83..2dace2717403 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp +++ b/aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp @@ -120,6 +120,15 @@ void inline col_indices_and_values_resize_(const Tensor& input, int64_t nnz) { input.sizes()); } +void inline bsrsv2_bsrsm2_may_need_to_sync() { +#if defined(CUSPARSE_VERSION) && CUSPARSE_VERSION < 11703 + // cusparse bsrsv2 and bsrsm2 have a synchronization issue that may cause illegal memory access in cuda <= 11.6.x + // See https://github.com/pytorch/pytorch/issues/71297 + ::c10::cuda::device_synchronize(); +#endif + // else: do nothing! +} + void block_sparse_triangular_solve_vec( const at::sparse_csr::SparseCsrTensor& A, const Tensor& B, @@ -134,7 +143,7 @@ void block_sparse_triangular_solve_vec( "PyTorch with ROCm 4.5.0+. ", "Please use PyTorch built with newer ROCm version."); #else - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(A.is_sparse_csr()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(A.layout() == kSparseBsr); // values is expected to be a blocks of sparse matrix TORCH_INTERNAL_ASSERT_DEBUG_ONLY(A.values().dim() == 3); // blocks are expected to be square @@ -213,6 +222,15 @@ void block_sparse_triangular_solve_vec( CUSPARSE_SOLVE_POLICY_NO_LEVEL, work_data.get()); + if (!unitriangular) { + int first_zero_diag_idx = -1; + cusparseStatus_t status = cusparseXbsrsv2_zeroPivot(handle, info.descriptor(), &first_zero_diag_idx); + if (status == CUSPARSE_STATUS_ZERO_PIVOT) { + X_->fill_(NAN); + return; + } + } + at::cuda::sparse::bsrsv2_solve( handle, block_layout, @@ -230,6 +248,8 @@ void block_sparse_triangular_solve_vec( X_->data_ptr(), CUSPARSE_SOLVE_POLICY_NO_LEVEL, work_data.get()); + + bsrsv2_bsrsm2_may_need_to_sync(); }); if (!X.is_same(*X_)) { X.copy_(*X_); @@ -251,7 +271,7 @@ void block_sparse_triangular_solve_mat( "PyTorch with ROCm 4.5.0+. 
", "Please use PyTorch built with newer ROCm version."); #else - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(A.is_sparse_csr()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(A.layout() == kSparseBsr); // values is expected to be a blocks of sparse matrix TORCH_INTERNAL_ASSERT_DEBUG_ONLY(A.values().dim() == 3); // blocks are expected to be square @@ -339,6 +359,15 @@ void block_sparse_triangular_solve_mat( CUSPARSE_SOLVE_POLICY_NO_LEVEL, work_data.get()); + if (!unitriangular) { + int first_zero_diag_idx = -1; + cusparseStatus_t status = cusparseXbsrsm2_zeroPivot(handle, info.descriptor(), &first_zero_diag_idx); + if (status == CUSPARSE_STATUS_ZERO_PIVOT) { + X_->fill_(NAN); + return; + } + } + at::cuda::sparse::bsrsm2_solve( handle, block_layout, @@ -360,6 +389,8 @@ void block_sparse_triangular_solve_mat( ldx, CUSPARSE_SOLVE_POLICY_NO_LEVEL, work_data.get()); + + bsrsv2_bsrsm2_may_need_to_sync(); }); if (!X.is_same(*X_)) { X.copy_(*X_); @@ -373,7 +404,7 @@ void block_sparse_mv( const Scalar& beta, const Scalar& alpha, const Tensor& result) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(mat.is_sparse_csr()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(mat.layout() == kSparseBsr); // values is expected to be a blocks of sparse matrix TORCH_INTERNAL_ASSERT_DEBUG_ONLY(mat.values().dim() == 3); // blocks are expected to be square @@ -437,7 +468,7 @@ void block_sparse_mm( const Scalar& beta, const Scalar& alpha, const Tensor& result) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(mat1.is_sparse_csr()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(mat1.layout() == kSparseBsr); // values is expected to be a blocks of sparse matrix TORCH_INTERNAL_ASSERT(mat1.values().dim() == 3); // blocks are expected to be square @@ -531,9 +562,6 @@ void spmm( const Scalar& beta, const Scalar& alpha, const Tensor& result) { - if (mat1.values().dim() >= 3 && mat1.values().size(-1) > 1) { - return block_sparse_mm(mat1, mat2, beta, alpha, result); - } #if !AT_USE_CUSPARSE_GENERIC_API() addmm_out_legacy(mat1, mat2, beta, alpha, result); #else @@ -793,18 +821,26 @@ void spgemm( } // anonymous namespace void addmm_out_sparse_csr( - const at::sparse_csr::SparseCsrTensor& mat1, + const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha, const Tensor& result) { - if (mat2.layout() == kStrided && result.layout() == kStrided) { + if (mat1.layout() == kSparseBsr && mat2.layout() == kStrided && result.layout() == kStrided) { + return block_sparse_mm(mat1, mat2, beta, alpha, result); + } + if (mat1.is_sparse_csr() && mat2.layout() == kStrided && result.layout() == kStrided) { return spmm(mat1, mat2, beta, alpha, result); - } else if (mat2.is_sparse_csr() && result.is_sparse_csr()) { + } + if (mat1.layout() == kStrided && mat2.is_sparse_csr() && result.layout() == kStrided) { + // TODO: We can use cuSPARSE's transposition flags once we have CSC support. 
+ return spmm(mat2.transpose(0, 1), mat1.transpose(0, 1), beta, alpha, result.transpose(0, 1)); + } + if (mat1.is_sparse_csr() && mat2.is_sparse_csr() && result.is_sparse_csr()) { return spgemm(mat1, mat2, beta, alpha, result); - } else { - TORCH_INTERNAL_ASSERT(false, "Received unexpected tensor layouts as input."); } + TORCH_CHECK(false, "addmm: computation on CUDA is not implemented for ", + result.layout(), " + ", mat1.layout(), " @ ", mat2.layout()); } /* @@ -823,7 +859,7 @@ void addmv_out_sparse_csr( const Scalar& beta, const Scalar& alpha, const Tensor& result) { - if (mat.values().dim() == 3 && mat.values().size(-1) > 1) { + if (mat.layout() == kSparseBsr) { return block_sparse_mv(mat, vec, beta, alpha, result); } #if !AT_USE_CUSPARSE_GENERIC_API() @@ -964,6 +1000,24 @@ void add_out_sparse_csr( auto B_col_indices_ptr = B_col_indices.data_ptr(); auto C_col_indices_ptr = C_col_indices.data_ptr(); + // Windows compilers don't support nested macros + // so we need this lambda outside of the + // AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES + auto fix_nnz = [ +#if AT_ROCM_ENABLED() + &C_crow_indices, + &m +#endif + ](int nnz) -> int { +// For some reason POINTER_MODE_HOST is not working here +// Let's extract manually the nnz from the C_crow_indices +#if AT_ROCM_ENABLED() + return std::max({nnz, C_crow_indices.narrow(-1, m, 1).item()}); +#else + return nnz; +#endif + }; + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES( C.scalar_type(), "add_out_sparse_csr_cuda_impl", [&] { auto beta_ = beta.to(); @@ -1024,6 +1078,8 @@ void add_out_sparse_csr( &nnzC, work_data.get()); + nnzC = fix_nnz(nnzC); + // Resize result using nnz information from cusparse col_indices_and_values_resize_(C, nnzC); C_col_indices = C.col_indices(); @@ -1080,7 +1136,7 @@ void triangular_solve_out_sparse_csr( X.fill_(NAN); return; } - if (A.values().dim() == 3 && A.values().size(-1) > 1) { + if (A.layout() == kSparseBsr) { if (B.size(-1) == 1) { return block_sparse_triangular_solve_vec(A, B, X, upper, transpose, unitriangular); } else { @@ -1244,64 +1300,75 @@ void sampled_addmm_out_sparse_csr( TORCH_INTERNAL_ASSERT_DEBUG_ONLY(B.layout() == Layout::Strided); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(C.is_sparse_csr()); - auto descA = at::cuda::sparse::CuSparseDnMatDescriptor(A); - auto descB = at::cuda::sparse::CuSparseDnMatDescriptor(B); - auto descC = at::cuda::sparse::CuSparseSpMatCsrDescriptor(C); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(batchCount(A) == batchCount(B)); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(batchCount(A) == batchCount(C)); cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE; + c10::MaybeOwned A_ = prepare_dense_matrix_for_cusparse(A); + c10::MaybeOwned B_ = prepare_dense_matrix_for_cusparse(B); + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES( C.scalar_type(), "sampled_addmm_out_sparse_csr", [&] { - auto beta_ = beta.to(); - auto alpha_ = alpha.to(); - auto compute_type = at::cuda::getCudaDataType(); - auto handle = at::cuda::getCurrentCUDASparseHandle(); - size_t buffer_size = 0; - TORCH_CUDASPARSE_CHECK(cusparseSDDMM_bufferSize( - handle, - opA, - opB, - &alpha_, - descA.descriptor(), - descB.descriptor(), - &beta_, - descC.descriptor(), - compute_type, - CUSPARSE_SDDMM_ALG_DEFAULT, - &buffer_size // output - )); + // CUDA 11.6 doesn't support batched inputs, it raises an error: + // ** On entry to cusparseSDDMM_bufferSize(): batched SDDMM is not supported + // So we need to resort to the for loop + for (const auto i : c10::irange(batchCount(A))) { + auto descA = 
at::cuda::sparse::CuSparseDnMatDescriptor(*A_, /*batch_offset=*/i); + auto descB = at::cuda::sparse::CuSparseDnMatDescriptor(*B_, /*batch_offset=*/i); + auto descC = at::cuda::sparse::CuSparseSpMatCsrDescriptor(C, /*batch_offset=*/i); + + auto beta_ = beta.to(); + auto alpha_ = alpha.to(); + auto compute_type = at::cuda::getCudaDataType(); + auto handle = at::cuda::getCurrentCUDASparseHandle(); + size_t buffer_size = 0; + TORCH_CUDASPARSE_CHECK(cusparseSDDMM_bufferSize( + handle, + opA, + opB, + &alpha_, + descA.descriptor(), + descB.descriptor(), + &beta_, + descC.descriptor(), + compute_type, + CUSPARSE_SDDMM_ALG_DEFAULT, + &buffer_size // output + )); - auto& allocator = *c10::cuda::CUDACachingAllocator::get(); - auto buffer = allocator.allocate(buffer_size); + auto& allocator = *c10::cuda::CUDACachingAllocator::get(); + auto buffer = allocator.allocate(buffer_size); - TORCH_CUDASPARSE_CHECK(cusparseSDDMM_preprocess( - handle, - opA, - opB, - &alpha_, - descA.descriptor(), - descB.descriptor(), - &beta_, - descC.descriptor(), - compute_type, - CUSPARSE_SDDMM_ALG_DEFAULT, - buffer.get())); + TORCH_CUDASPARSE_CHECK(cusparseSDDMM_preprocess( + handle, + opA, + opB, + &alpha_, + descA.descriptor(), + descB.descriptor(), + &beta_, + descC.descriptor(), + compute_type, + CUSPARSE_SDDMM_ALG_DEFAULT, + buffer.get())); - TORCH_CUDASPARSE_CHECK(cusparseSDDMM( - handle, - opA, - opB, - &alpha_, - descA.descriptor(), - descB.descriptor(), - &beta_, - descC.descriptor(), - compute_type, - CUSPARSE_SDDMM_ALG_DEFAULT, - buffer.get())); + TORCH_CUDASPARSE_CHECK(cusparseSDDMM( + handle, + opA, + opB, + &alpha_, + descA.descriptor(), + descB.descriptor(), + &beta_, + descC.descriptor(), + compute_type, + CUSPARSE_SDDMM_ALG_DEFAULT, + buffer.get())); + } }); #endif } diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh b/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh index c83592335511..2a266319212a 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh @@ -2,6 +2,7 @@ #include #include +#include #include namespace at { namespace native { @@ -209,6 +210,13 @@ __global__ void valueSparseIntersectionKernel( int64_t match, d; int64_t nDimI = r_indices.sizes[0]; IndexType valueSize = r_values.strides[0]; + // reset valueSize if a dense dimension is zero: + for (d=0; d -C10_LAUNCH_BOUNDS_1(C10_WARP_SIZE*4) +C10_LAUNCH_BOUNDS_1(num_threads()) __global__ void coalesceValuesKernel( int64_t *segment_offsets, int64_t *value_indices, Dtype *values, Dtype *newValues, @@ -321,7 +329,6 @@ __global__ void coalesceValuesKernel( for (int row = begin; row < end; row++) { const int valueRow = ((int) value_indices[row]) * stride; - #pragma unroll for (int ii = 0; ii < SZ; ii++) { @@ -344,6 +351,56 @@ __global__ void coalesceValuesKernel( } } +// coalesceValuesKernel when Dtype/Acctype is bool. Can be eliminated using +// `if constexpr` when CUDA codes will be compiled under C++-17, see +// gh-56055 for blockers. +template +C10_LAUNCH_BOUNDS_1(C10_WARP_SIZE*4) +__global__ void coalesceValuesKernel( + int64_t *segment_offsets, int64_t *value_indices, + bool *values, bool *newValues, + int64_t nnz, int64_t newNnz, int64_t stride) { + + int seg = blockIdx.x * 4 + threadIdx.y; + + // Number of values processed by each thread (grain size) + const int SZ = 4; + + if (seg < newNnz) { + const int newValueRow = seg * stride; + const int begin = segment_offsets[seg]; + const int end = (seg < newNnz - 1) ? 
segment_offsets[seg + 1] : nnz; + const int startFeature = threadIdx.x + blockIdx.y * blockDim.x * SZ; + bool tmp[SZ]; + #pragma unroll + for (int ii = 0; ii < SZ; ii++) { + tmp[ii] = 0; + } + for (int row = begin; row < end; row++) { + const int valueRow = ((int) value_indices[row]) * stride; + + #pragma unroll + for (int ii = 0; ii < SZ; ii++) + { + int featureDim = startFeature + ii * C10_WARP_SIZE; + if (featureDim < stride) + { + tmp[ii] |= values[valueRow + featureDim]; + } + } + } + #pragma unroll + for (int ii = 0; ii < SZ; ii++) + { + int featureDim = startFeature + ii * C10_WARP_SIZE; + if (featureDim < stride) + { + newValues[newValueRow + featureDim] = tmp[ii]; + } + } + } +} + } // namespace apply }} // namespace at::native diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu index 30e7d873b39c..dc5a2acf2da1 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu @@ -142,10 +142,11 @@ SparseTensor _coalesce_sparse_cuda(const SparseTensor& self) { const int SZ = 4; values = values.contiguous(); int64_t stride = c10::multiply_integers(values.sizes().slice(1)); - dim3 grid(ceil_div(newNnz, (int64_t) SZ), ceil_div(stride, (int64_t) C10_WARP_SIZE*SZ)); - dim3 block(C10_WARP_SIZE, SZ); - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( - at::ScalarType::Half, at::ScalarType::BFloat16, values.scalar_type(), "coalesce_sparse_cuda", [&] { + int warp_size = at::cuda::warp_size(); + dim3 grid(ceil_div(newNnz, (int64_t) SZ), ceil_div(stride, (int64_t) warp_size*SZ)); + dim3 block(warp_size, SZ); + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( + at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, values.scalar_type(), "coalesce_sparse_cuda", [&] { using cuda_accscalar_t = acc_type; apply::coalesceValuesKernel<<>>( uniqueOffsets.data_ptr(), diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu index 0d99e298ec9d..9dbf562300f3 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu @@ -503,8 +503,8 @@ SparseTensor& mul_out_sparse_cuda(const SparseTensor& t_, const SparseTensor& sr TORCH_CHECK(cuda::getApplyGrid(valueSize, grid, curDevice), "mul: Argument #0: tensor too large or too many dimensions"); Tensor resultNnz = at::empty({1}, CUDA(kLong)); - AT_DISPATCH_ALL_TYPES_AND( - at::ScalarType::Half, commonDtype, "mul_out_sparse_cuda", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( + at::ScalarType::Half, at::ScalarType::BFloat16, commonDtype, "mul_out_sparse_cuda", [&] { apply::valueSparseIntersectionKernel<<>>( TensorMulOp(), I_INFO(r_indices_), I_INFO(t_indices_), I_INFO(s_indices_), diff --git a/aten/src/ATen/native/sparse/cuda/SparseCsrTensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCsrTensorMath.cu index c13984f2d92f..6bdd4b40f8f4 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCsrTensorMath.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCsrTensorMath.cu @@ -16,8 +16,12 @@ #else #include #include +#include +#include #include #include +#include +#include #endif #include @@ -29,6 +33,7 @@ #include #include +#include #include #include #include @@ -159,18 +164,26 @@ Tensor& add_out_dense_sparse_csr_cuda( " in add operation"); Tensor src_values = src.values(); - Tensor src_crow_indices = src.crow_indices(); - Tensor src_col_indices = src.col_indices(); resize_output(output, dense.sizes()); 
Tensor resultBuffer = output; - Tensor valuesBuffer = src_values.to(commonDtype); + if (output.scalar_type() != commonDtype) { resultBuffer = dense.to(commonDtype); } else if (!is_same_tensor(output, dense)) { resultBuffer.copy_(dense); } + + if (src._nnz() == 0) { + return output; + } + + auto valuesBuffer = src_values.to(commonDtype).view({-1, src_values.size(-1)}); + resultBuffer = resultBuffer.view({-1, output.size(-2), output.size(-1)}); + auto src_crow_indices = src.crow_indices().view({-1, src.crow_indices().size(-1)}); + auto src_col_indices = src.col_indices().view({-1, src.col_indices().size(-1)}); + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( kHalf, kBool, kBFloat16, commonDtype, @@ -180,6 +193,7 @@ Tensor& add_out_dense_sparse_csr_cuda( src_crow_indices.scalar_type(), "csr_add_out_crow_indices", [&valuesBuffer, &resultBuffer, &alpha, &src_crow_indices, &src_col_indices]() { + auto batch_count = resultBuffer.dim() > 2 ? resultBuffer.size(-3) : 1; scalar_t* values_accessor = valuesBuffer.data_ptr(); scalar_t* out_ptr = resultBuffer.data_ptr(); scalar_t cast_value = alpha.to(); @@ -189,8 +203,11 @@ Tensor& add_out_dense_sparse_csr_cuda( int64_t out_storage_offset = resultBuffer.storage_offset(); auto out_strides = resultBuffer.strides(); - int64_t out_strides0 = out_strides[0]; - int64_t out_strides1 = out_strides[1]; + auto out_strides0 = out_strides[0]; + auto out_strides1 = out_strides[1]; + auto crow_stride0 = src_crow_indices.stride(0); + auto col_stride0 = src_col_indices.stride(0); + auto val_stride0 = valuesBuffer.stride(0); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); at::cuda::ThrustAllocator allocator; @@ -200,24 +217,29 @@ Tensor& add_out_dense_sparse_csr_cuda( thrust::for_each( policy, thrust::make_counting_iterator(int64_t(0)), - thrust::make_counting_iterator(int64_t(src_crow_indices.size(0) - 1)), + thrust::make_counting_iterator(int64_t(src_crow_indices.size(-1) - 1)), [values_accessor, crow_indices_accessor, col_indices_accessor, out_ptr, - out_storage_offset, - out_strides0, cast_value, - out_strides1 + out_strides0, + out_strides1, + crow_stride0, + col_stride0, + val_stride0, + batch_count ]__device__(int64_t irow) { - index_t start_index = crow_indices_accessor[irow]; - index_t end_index = crow_indices_accessor[irow + 1]; + for (index_t batch_idx = 0; batch_idx < batch_count; batch_idx++) { + index_t start_index = crow_indices_accessor[batch_idx*crow_stride0 + irow]; + index_t end_index = crow_indices_accessor[batch_idx*crow_stride0 + irow + 1]; for (index_t i = start_index; i < end_index; ++i) { - auto icol = col_indices_accessor[i]; - auto index = out_storage_offset + irow * out_strides0 + icol * out_strides1; - out_ptr[index] += cast_value * values_accessor[i]; + auto icol = col_indices_accessor[batch_idx*col_stride0 + i]; + auto index = batch_idx * out_strides0 + irow * out_strides1 + icol; + out_ptr[index] += cast_value * values_accessor[batch_idx*val_stride0 + i]; } + } }); }); }); @@ -275,5 +297,342 @@ TORCH_IMPL_FUNC(_convert_indices_from_csr_to_coo_structured_cuda) ( } } + /* + Reductions on sparse CSR tensors using masked semantics. + + - To support a reduction operator on a CSR tensor with CUDA storage, define + +template +struct Reduction...Op { + __device__ __forceinline__ scalar_t operator()(const scalar_t a, const scalar_t b) const { + return a ... 
b; + } + __device__ __forceinline__ scalar_t identity() const { return ...; } + __forceinline__ scalar_t identity_cpu() const { return ...; } +}; + + +Tensor _sparse_csr_..._cuda(const Tensor& input, IntArrayRef dims_to_sum, bool keepdim, c10::optional dtype) { + ... + result = reduce_sparse_csr_cuda_template(input_, dims_to_sum, keepdim, Reduction...Op()); + ... + return result; +} + + and add the following + + - func: _sparse_csr_op.dim_dtype(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + dispatch: + SparseCsrCUDA: _sparse_csr_..._cuda + + to native_functions.yaml + */ + +namespace { + +template +__global__ void reduce_sparse_csr_dim0_cuda_kernel(scalar_t* new_values, + const index_t* new_col_indices, + const int64_t new_nnz, + const scalar_t* values, + const index_t* col_indices, + const int64_t nnz, + ReductionOp rop + ) { + int64_t tid = blockDim.x * blockIdx.x + threadIdx.x; + if (tid < new_nnz) { + index_t col = new_col_indices[tid]; + scalar_t v = rop.identity(); + for (int64_t j=0; j < nnz; j++) { + if (col == col_indices[j]) { + v = rop(v, values[j]); + } + } + new_values[tid] = v; + } +} + +template +Tensor reduce_sparse_csr_dim0_cuda_template(const Tensor& sparse, ReductionOp rop) { + /* + Consider the following sparse tensor: + + 1 * * * * + * * * 2 * + * * 3 * * + * * * * * + 4 * 5 * * + + that has CSR representation + + crow_indices = [0, 1, 2, 3, 3, 5] + col_indices = [0, 3, 2, 0, 2] + values = [1, 2, 3, 4, 5] + + Reduction with dim=0 results: + + rop(1,4) * rop(3,5) 2 * + + that has CSR representation + + new_crow_indices = [0, 3] + new_col_indices = [0, 2, 3] + new_values = [rop(1, 4], rop(3, 5), 2] + + In general, the CSR representation data can be computed as follows: + + nnz = col_indices.numel() + new_col_indices = col_indices.unique(sorted=True, return_inverse=False) + new_nnz = new_col_indices.numel() + new_crow_indices = [0, new_nnz] + new_values.resize(new_nnz) + + for i in range(new_nnz): + v = identity + col = new_col_indices[i] + for j in range(nnz): + if col == col_indices[j]: + v = rop(v, values[j]) + new_values[i] = v + + Notice this algorithm is different from the one used on CPU data. 
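+    Unlike the CPU path, each CUDA thread owns one output column: it scans all
+    nnz entries for matching col_indices, so no inverse map from _unique is needed.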
+ */ + + Tensor col_indices = sparse.col_indices(); + Tensor values = sparse.values(); + auto ncols = sparse.size(1); + auto nnz = col_indices.numel(); + Tensor new_col_indices; + + std::tie(new_col_indices, std::ignore) = at::_unique(col_indices, true, false); + auto new_nnz = new_col_indices.numel(); + Tensor new_crow_indices = at::tensor(ArrayRef{0, new_nnz}, col_indices.options()); + Tensor new_values = at::empty({new_nnz}, values.options()); + + scalar_t* values_ptr = values.data_ptr(); + scalar_t* new_values_ptr = new_values.data_ptr(); + int64_t THREADS = at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; + int64_t BLOCKS = (new_nnz + THREADS) / THREADS; + at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream(); + AT_DISPATCH_INDEX_TYPES(col_indices.scalar_type(), "reduce_sparse_csr_dim0_cuda_indices", + [&]() { + index_t* col_indices_ptr = col_indices.data_ptr(); + index_t* new_col_indices_ptr = new_col_indices.data_ptr(); + reduce_sparse_csr_dim0_cuda_kernel<<>>(new_values_ptr, + new_col_indices_ptr, + new_nnz, + values_ptr, + col_indices_ptr, + nnz, + rop + ); + }); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + return at::native::_sparse_csr_tensor_unsafe(new_crow_indices, new_col_indices, new_values, + {1, ncols}, + new_values.scalar_type(), + sparse.layout(), + new_values.device()); +} + +template +__global__ void reduce_crow_indices_dim1_cuda_kernel(index_t* new_crow_indices, + index_t* row_map, + const index_t* crow_indices, + const int64_t nrows + ) { + int64_t nnz = 0; + new_crow_indices[0] = 0; + for(int64_t i=0; i +__global__ void reduce_sparse_csr_dim1_cuda_kernel(scalar_t* new_values, + const scalar_t* values, + const index_t* crow_indices, + const index_t* row_map, + const int64_t nrows, + ReductionOp rop + ) { + int64_t tid = blockDim.x * blockIdx.x + threadIdx.x; + if (tid < nrows) { + index_t i_start = crow_indices[tid]; + index_t i_end = crow_indices[tid+1]; + if (i_start != i_end) { + scalar_t acc = rop.identity(); + for (index_t i = i_start; i < i_end; i++) { + acc = rop(acc, values[i]); + } + new_values[row_map[tid]] = acc; + } + } +} + +template +Tensor reduce_sparse_csr_dim1_cuda_template(const Tensor& sparse, ReductionOp rop) { + /* + The algorithm of computing reduce of a CSR tensor along the last + dimension is explained in the comment of the + reduce_sparse_csr_dim1_cpu_template function. 
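+    On CUDA the row-compression pass (reduce_crow_indices_dim1_cuda_kernel) runs
+    on a single thread to build new_crow_indices and row_map; a second kernel then
+    reduces each non-empty row in parallel.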
+ */ + Tensor crow_indices = sparse.crow_indices(); + auto ioptions = crow_indices.options(); + Tensor values = sparse.values(); + auto nrows = sparse.size(0); + auto numel = values.numel(); + + Tensor new_crow_indices = at::empty({crow_indices.numel()}, ioptions); + Tensor new_col_indices = at::empty({}, ioptions); + Tensor new_values = at::empty({}, values.options()); + Tensor row_map = at::empty({nrows}, ioptions); + + at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream(); + int64_t THREADS = at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; + int64_t BLOCKS = (nrows + THREADS) / THREADS; + + AT_DISPATCH_INDEX_TYPES(crow_indices.scalar_type(), "reduce_sparse_csr_dim1_cuda_indices", + [&]() { + index_t* crow_indices_ptr = crow_indices.data_ptr(); + index_t* new_crow_indices_ptr = new_crow_indices.data_ptr(); + index_t* row_map_ptr = row_map.data_ptr(); + reduce_crow_indices_dim1_cuda_kernel<<<1, 1, 0, stream>>>(new_crow_indices_ptr, + row_map_ptr, + crow_indices_ptr, + nrows); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + index_t new_nnz = new_crow_indices[-1].item(); + new_col_indices.resize_(new_nnz); + new_col_indices.fill_(index_t(0)); + new_values.resize_(new_nnz); + + scalar_t* values_ptr = values.data_ptr(); + scalar_t* new_values_ptr = new_values.data_ptr(); + reduce_sparse_csr_dim1_cuda_kernel<<>>(new_values_ptr, + values_ptr, + crow_indices_ptr, + row_map_ptr, + nrows, + rop); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + + return at::native::_sparse_csr_tensor_unsafe(new_crow_indices, new_col_indices, new_values, + {sparse.size(0), 1}, + new_values.scalar_type(), + sparse.layout(), + new_values.device()); +} + +template +Tensor reduce_sparse_csr_dim01_cuda_template(const Tensor& sparse, ReductionOp rop) { + + auto ioptions = sparse.col_indices().options(); + Tensor values = sparse.values(); + auto numel = values.numel(); + auto nnz = std::min(1, numel); + + Tensor new_values; + if (numel > 0) { + new_values = at::empty({1}, values.options()); + auto iter = TensorIterator::reduce_op(new_values, values); + gpu_reduce_kernel(iter, func_wrapper(rop), rop.identity_cpu()); + } else { + new_values = at::empty({}, values.options()); + } + Tensor new_col_indices = at::zeros({nnz}, ioptions); + Tensor new_crow_indices = at::tensor(ArrayRef{0, nnz}, ioptions); + return at::native::_sparse_csr_tensor_unsafe(new_crow_indices, new_col_indices, new_values, + {1, std::min(1, sparse.size(1))}, + new_values.scalar_type(), + sparse.layout(), + new_values.device()); +} + +template +Tensor reduce_sparse_csr_cuda_template(const Tensor& sparse, std::vector dims, ReductionOp rop) { + if (dims.size() == 1) { + if (dims[0] == 0) { + return reduce_sparse_csr_dim0_cuda_template(sparse, rop); + } else { + TORCH_INTERNAL_ASSERT(dims[0] == 1); + return reduce_sparse_csr_dim1_cuda_template(sparse, rop); + } + } else if (dims.size() == 2) { + TORCH_INTERNAL_ASSERT(((dims[0] == 0 && dims[1] == 1) || (dims[0] == 1 && dims[1] == 0))); + return reduce_sparse_csr_dim01_cuda_template(sparse, rop); + } + TORCH_INTERNAL_ASSERT(dims.size() == 0); + // effective after gh-29137 has been resolved + return sparse.clone(); +} + +template +Tensor reduce_sparse_csr_cuda_template(const Tensor& sparse, IntArrayRef dims_to_sum, bool keepdim, ReductionOp rop) { + TORCH_INTERNAL_ASSERT(sparse.is_sparse_csr()); + TORCH_CHECK(keepdim, "reduction operations on CSR tensors with keepdim=False is unsupported"); + TORCH_INTERNAL_ASSERT(sparse.is_cuda()); + + const int64_t input_dim = sparse.dim(); + TORCH_INTERNAL_ASSERT(input_dim 
== 2); + auto dims = dims_to_sum.vec(); + maybe_wrap_dims(dims, input_dim); + if (dims.size() == 0) { + // after gh-29137 is resolved, delete this if-block + dims.emplace_back(0); + dims.emplace_back(1); + } + return reduce_sparse_csr_cuda_template(sparse, dims, rop); +} + +template +struct ReductionAddOp { + __device__ __forceinline__ scalar_t operator()(const scalar_t a, const scalar_t b) const { + return a + b; + } + __device__ __forceinline__ scalar_t identity() const { return 0; } + __forceinline__ scalar_t identity_cpu() const { return 0; } +}; + +template +struct ReductionMulOp { + __device__ __forceinline__ scalar_t operator()(const scalar_t a, const scalar_t b) const { + return a * b; + } + __device__ __forceinline__ scalar_t identity() const { return 1; } + __forceinline__ scalar_t identity_cpu() const { return 1; } +}; + +} // namespace + +Tensor _sparse_csr_sum_cuda(const Tensor& input, IntArrayRef dims_to_sum, bool keepdim, c10::optional dtype) { + ScalarType dtype_ = dtype.value_or(input.scalar_type()); + Tensor input_ = input.to(dtype_); + Tensor result; + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( + kHalf, kBFloat16, input_.scalar_type(), "_sparse_csr_sum_cuda", + [&] { + result = reduce_sparse_csr_cuda_template(input_, dims_to_sum, keepdim, ReductionAddOp()); + }); + return result; +} + +Tensor _sparse_csr_prod_cuda(const Tensor& input, IntArrayRef dims_to_reduce, bool keepdim, c10::optional dtype) { + ScalarType dtype_ = dtype.value_or(input.scalar_type()); + Tensor input_ = input.to(dtype_); + Tensor result; + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( + kHalf, kBFloat16, input_.scalar_type(), "_sparse_csr_prod_cuda", + [&] { + result = reduce_sparse_csr_cuda_template(input_, dims_to_reduce, keepdim, ReductionMulOp()); + }); + return result; +} + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/tags.yaml b/aten/src/ATen/native/tags.yaml new file mode 100644 index 000000000000..d79b13adae84 --- /dev/null +++ b/aten/src/ATen/native/tags.yaml @@ -0,0 +1,10 @@ +# This yaml file contains all the possible tags that can be defined in `tags` in `native_functions.yaml` + +- tag: inplace_view + desc: | + This tag indicates if an operator *only* modifies the tensor metadata +- tag: view_copy + desc: | + This tag indicates operators that are *_copy* variants + of view/aliasing operators. If an operator has a view_copy tag, + then it should have the name {op}_copy, where {op} is a view operator. 
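A minimal usage sketch of the CSR reductions added above (Python; the tensor values are illustrative and assume the CPU/CUDA dispatch entries are registered as described). Both entry points require keepdim=True because a CSR tensor is always 2-D, and they reduce with masked semantics, i.e. only over the stored elements:

import torch

a = torch.tensor([[1., 0., 2.],
                  [0., 0., 3.]]).to_sparse_csr()

# dim=0: per-column reduction; columns without stored elements stay unstored
col_sums = torch._sparse_csr_sum(a, dim=0, keepdim=True)
print(col_sums.to_dense())   # tensor([[1., 0., 5.]])

# dim=1: per-row reduction over stored elements only (masked semantics)
row_prods = torch._sparse_csr_prod(a, dim=1, keepdim=True)
print(row_prods.to_dense())  # tensor([[2.], [3.]])

Passing keepdim=False raises an error, matching the TORCH_CHECK in the templates above.
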
diff --git a/aten/src/ATen/native/transformers/attention.cpp b/aten/src/ATen/native/transformers/attention.cpp new file mode 100644 index 000000000000..697aabb46009 --- /dev/null +++ b/aten/src/ATen/native/transformers/attention.cpp @@ -0,0 +1,482 @@ +#include + +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + +#include + +namespace at { + +namespace native { + +namespace { + +Tensor gemm_nt(const Tensor& self, const Tensor& other) { + if (self.is_nested()) { + return NestedTensor_matmul(self, other.t()); + } else { + return at::native::matmul(self, other.t()); + } +} + +template +void transform_bias_rescale_qkv_inner_loop( + int64_t B, + int64_t T, + int64_t _3D, + int64_t D, + int64_t num_head, + int64_t dim_per_head, + scalar_t* qkv_data, + scalar_t* qkv_bias_data, + scalar_t* q_k_v_data, + scalar_t inv_sqrt_dim_per_head, + int64_t begin, + int64_t end) { + for (auto i : c10::irange(begin, end)) { + auto t = i % T; + i /= T; + auto nh = i % num_head; + i /= num_head; + auto b = i; + using Vec = vec::Vectorized; + auto V = vec::Vectorized::size(); + auto dh = 0; + auto d = nh * dim_per_head; + for (; dh + V <= dim_per_head; dh += V, d += V) { + // load + auto q_bias_data = Vec::loadu(&qkv_bias_data[d + 0 * D]); + auto k_bias_data = Vec::loadu(&qkv_bias_data[d + 1 * D]); + auto v_bias_data = Vec::loadu(&qkv_bias_data[d + 2 * D]); + + auto q_data = Vec::loadu(&qkv_data[b * _3D * T + t * _3D + d + 0 * D]) + + q_bias_data; + auto k_data = Vec::loadu(&qkv_data[b * _3D * T + t * _3D + d + 1 * D]) + + k_bias_data; + auto v_data = Vec::loadu(&qkv_data[b * _3D * T + t * _3D + d + 2 * D]) + + v_bias_data; + + q_data = q_data * Vec(inv_sqrt_dim_per_head); + + q_data.store(&q_k_v_data + [0 * B * num_head * T * dim_per_head + + b * num_head * T * dim_per_head + + nh * T * dim_per_head + t * dim_per_head + dh]); + k_data.store(&q_k_v_data + [1 * B * num_head * T * dim_per_head + + b * num_head * T * dim_per_head + + nh * T * dim_per_head + t * dim_per_head + dh]); + v_data.store(&q_k_v_data + [2 * B * num_head * T * dim_per_head + + b * num_head * T * dim_per_head + + nh * T * dim_per_head + t * dim_per_head + dh]); + } + for (; dh < dim_per_head; dh++) { + auto d = nh * dim_per_head + dh; + auto q_bias = qkv_bias_data[d + 0 * D]; + auto k_bias = qkv_bias_data[d + 1 * D]; + auto v_bias = qkv_bias_data[d + 2 * D]; + auto q_data = qkv_data[b * _3D * T + t * _3D + d + 0 * D] + q_bias; + auto k_data = qkv_data[b * _3D * T + t * _3D + d + 1 * D] + k_bias; + auto v_data = qkv_data[b * _3D * T + t * _3D + d + 2 * D] + v_bias; + q_data = q_data * inv_sqrt_dim_per_head; + q_k_v_data + [0 * B * num_head * T * dim_per_head + + b * num_head * T * dim_per_head + nh * T * dim_per_head + + t * dim_per_head + dh] = q_data; + q_k_v_data + [1 * B * num_head * T * dim_per_head + + b * num_head * T * dim_per_head + nh * T * dim_per_head + + t * dim_per_head + dh] = k_data; + q_k_v_data + [2 * B * num_head * T * dim_per_head + + b * num_head * T * dim_per_head + nh * T * dim_per_head + + t * dim_per_head + dh] = v_data; + } + } +} + +Tensor bmm_nt(const Tensor& a, const Tensor& b) { + auto a_ = a.view({a.size(0) * a.size(1), a.size(2), a.size(3)}); + auto b_ = b.view({b.size(0) * b.size(1), b.size(2), b.size(3)}); + auto bt_ = b_.transpose(2, 1); + auto c_ = at::bmm(a_, bt_); + return c_.view({a.size(0), a.size(1), a.size(2), b.size(2)}); +} + +Tensor masked_softmax( + Tensor& attn_scores, + c10::optional attn_mask, + const 
Tensor& query) { + if (query.is_nested() && !attn_mask) { + // TODO: maybe we could do better than generating a mask every time? + + attn_mask = NestedTensor_to_mask(query, 2); + // TODO: CPU path does not support transformer mask yet. + if (attn_scores.is_cpu()) { + attn_mask = attn_mask->view({-1, 1, 1, attn_scores.sizes()[3]}); + // 1 means skip, 0 means keep. + // want: + // 0,0 -> 0 + // 0,1 -> 1 + // 1,1 -> 1 + // so that's logical OR. + *attn_mask = *attn_mask | attn_mask->transpose(2, 3); + attn_mask = at::expand_inplace(attn_scores, *attn_mask)->contiguous(); + } + attn_mask = attn_mask->to(query.device(), /*non-blocking=*/true); + } + if (attn_mask && attn_mask->dtype() != at::kBool) { + TORCH_WARN( + "Converting mask without torch.bool dtype to bool; this will " + "negatively affect performance. Prefer to use a boolean mask directly."); + attn_mask = attn_mask->to(at::kBool); + } + if (attn_scores.is_cpu() && attn_mask && attn_mask->dim() == 2) { + // TODO: CPU path does not support transformer mask yet. + const auto batch_size = attn_scores.sizes()[0]; + const auto seq_len = attn_scores.sizes()[3]; + TORCH_CHECK(attn_mask->sizes()[0] == batch_size); + TORCH_CHECK(attn_mask->sizes()[1] == seq_len); + attn_mask = attn_mask->view({batch_size, 1, 1, seq_len}); + attn_mask = at::expand_inplace(attn_scores, *attn_mask)->contiguous(); + } + if (attn_mask) { + return _masked_softmax(attn_scores, *attn_mask); + } else { + return _softmax_out(attn_scores, attn_scores, attn_scores.dim() - 1, false); + } +} + +Tensor bmm_nn(Tensor& out, const Tensor& a, const Tensor& b) { + const std::array newAShape = { + a.sizes()[0] * a.sizes()[1], a.sizes()[2], a.sizes()[3]}; + auto a_ = a.view(newAShape); + const std::array newBShape = { + b.sizes()[0] * b.sizes()[1], b.sizes()[2], b.sizes()[3]}; + auto b_ = b.view(newBShape); + auto out_ = out.reshape({newAShape[0], newAShape[1], newBShape[2]}); + auto c_ = at::bmm_out(out_, a_, b_); + return c_.view({a.size(0), a.size(1), a.size(2), b.size(3)}); +} + +Tensor transform_0213(const Tensor& a) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.size(1)); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.size(3)); + return a.permute({0, 2, 1, 3}) + .contiguous() + .view({a.size(0), a.size(2), a.size(1) * a.size(3)}); +} + +Tensor transform0213_gemm_nt_bias( + const Tensor& a, + const Tensor& b, + const Tensor& c, + const Tensor& query) { + if (query.is_nested()) { + at::Tensor nested_a = _nested_from_padded( + a, get_nested_tensor_impl(query)->get_nested_size_tensor(), true); + return NestedTensor_times_Tensor_plus_Tensor_addmm( + c, nested_a, b.t(), 1, 1); + } else { + const Tensor a_0213 = transform_0213(a); + auto a_ = a_0213.view({a_0213.size(0) * a_0213.size(1), a_0213.size(2)}); + auto r_ = at::native::linear(a_, b, c); + return r_.view({a_0213.size(0), a_0213.size(1), r_.size(1)}); + } +} + +void debug_assert_shape(int line, const Tensor& t, c10::IntArrayRef shape) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + (size_t)t.dim() == shape.size(), + "(called from line ", + line, + ") ", + "expected ", + shape.size(), + "-D tensor but got ", + t.dim()); + if (t.is_nested()) { + return; + } + for (auto idx : c10::irange(shape.size())) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + shape[idx] == 0 || t.sizes()[idx] == shape[idx], + "(called from line ", + line, + ") ", + "expected dim ", + idx, + " to be ", + shape[idx], + " but got ", + t.sizes()[idx]); + } +} +} // namespace + +// compute q = (q + q_bias) / sqrt(dim_per_head), k = k + k_bias, v = v + v_bias +std::tuple 
transform_bias_rescale_qkv_cpu( + const Tensor& qkv, + const Tensor& qkv_bias, + const int64_t num_head) { + auto qkv_ = qkv.is_nested() + ? c10::MaybeOwned::owned(qkv.to_padded_tensor(0)) + : c10::MaybeOwned::borrowed(qkv); + auto B = qkv_->size(0); + auto T = qkv_->size(1); + auto _3D = qkv_->size(2); + auto D = _3D / 3; + TORCH_CHECK(D % num_head == 0); + TORCH_CHECK(_3D % 3 == 0); + const auto dim_per_head = D / num_head; + auto q_k_v = at::empty({3, B, num_head, T, dim_per_head}, qkv_->options()); + + const auto qkv_contig = qkv_->expect_contiguous(); + const auto qkv_bias_contig = qkv_bias.expect_contiguous(); + AT_DISPATCH_FLOATING_TYPES_AND2( + ScalarType::Half, + ScalarType::BFloat16, + qkv_->scalar_type(), + "transform_bias_rescale_qkv", + [&] { + scalar_t* qkv_data = qkv_contig->data_ptr(); + scalar_t* qkv_bias_data = qkv_bias_contig->data_ptr(); + scalar_t* q_k_v_data = q_k_v.data_ptr(); + const scalar_t inv_sqrt_dim_per_head = + 1.0 / std::sqrt(static_cast(dim_per_head)); + + int64_t grain_size = + std::max(internal::GRAIN_SIZE / (3 * dim_per_head), (int64_t)1); + parallel_for( + 0, B * num_head * T, grain_size, [&](int64_t begin, int64_t end) { + transform_bias_rescale_qkv_inner_loop( + B, + T, + _3D, + D, + num_head, + dim_per_head, + qkv_data, + qkv_bias_data, + q_k_v_data, + inv_sqrt_dim_per_head, + begin, + end); + }); + }); + auto q_k_v_s = + at::native::split(q_k_v.view({3 * B, num_head, T, dim_per_head}), B, 0); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(q_k_v_s.size() == 3); + return std::make_tuple(q_k_v_s[0], q_k_v_s[1], q_k_v_s[2]); +} + +std::tuple native_multi_head_attention( + const Tensor& query, + const Tensor& key, + const Tensor& value, + const int64_t embed_dim, + const int64_t num_head, + const Tensor& qkv_weight, + const Tensor& qkv_bias, + const Tensor& proj_weight, + const Tensor& proj_bias, + const c10::optional& mask, + bool need_weights, + bool average_attn_weights) { + // query shape: [B, T, D] + // qkv_weight shape: [3 * D, D] + + TORCH_CHECK( + !mask || !query.is_nested(), + "NestedTensor with mask is not supported yet"); + const auto D = embed_dim; + TORCH_CHECK( + query.dim() == 3, + "expected 3-D `query`, got ", + query.dim(), + "-D tensor"); + TORCH_CHECK( + query.is_nested() || query.sizes()[2] == embed_dim, + "passed-in embed_dim ", + embed_dim, + " didn't match last dim of query ", + query.sizes()[2]); + TORCH_CHECK( + key.dim() == 3, + "expected 3-D `key`, got ", + key.dim(), + "-D tensor"); + TORCH_CHECK( + value.dim() == 3, + "expected 3-D `value`, got ", + value.dim(), + "-D tensor"); + TORCH_CHECK( + query.is_nested() || key.is_nested() || value.is_nested() || + (query.sizes() == key.sizes() && key.sizes() == value.sizes()), + "expected `query`/`key`/`value` shapes to match"); + TORCH_CHECK( + qkv_weight.dim() == 2, + "expected 2-D `qkv_weight`, got ", + qkv_weight.dim(), + "-D tensor"); + TORCH_CHECK( + D * 3 == qkv_weight.sizes()[0], + "expected `qkv_weight` first dim to be 3x embed_dim"); + TORCH_CHECK( + D == qkv_weight.sizes()[1], + "expected `qkv_weight` second dim to be embed_Dim"); + TORCH_CHECK( + qkv_bias.dim() == 1, + "expected 2-D `qkv_bias`, got ", + qkv_bias.dim(), + "-D tensor"); + TORCH_CHECK( + qkv_bias.sizes()[0] == 3 * D, + "expected `qkv_bias` first dim and first dim of query to be equal"); + TORCH_CHECK(D % num_head == 0, "`embed_dim` must divide evenly by `num_heads`"); + +#ifndef NDEBUG + const auto B = query.is_nested() + ? 
get_nested_tensor_impl(query)->get_nested_size_tensor().size(0) + : query.sizes()[0]; + auto T = query.is_nested() ? 0 : query.sizes()[1]; + const auto dim_per_head = D / num_head; +#endif + + // shape: [B, T, 3 x D] + Tensor qkv; + + if (key.is_same(value)) { + if (query.is_same(key)) { + // self-attention + qkv = gemm_nt(query, qkv_weight); + } else { + // encoder-decoder attention + // TODO: is there a more efficient way to set this up? + // TODO: can we stay nested insted of using cat? Probably just make a + // NestedTensor out of the matmul results or something? + auto q_kv_weight_s = + at::native::split_with_sizes(qkv_weight, {D, D * 2}, 0); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + q_kv_weight_s.size() == 2, + "expected split to produce 2 tensors but it produced ", + q_kv_weight_s.size()); + auto q = gemm_nt(query, q_kv_weight_s[0]); + auto kv = gemm_nt(key, q_kv_weight_s[1]); + qkv = at::cat({q, kv}, 2); + } + } else { + auto q_k_v_weight_s = at::native::chunk(qkv_weight, 3, 0); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + q_k_v_weight_s.size() == 3, + "expected chunk to produce 3 tensors but it produced ", + q_k_v_weight_s.size()); + // TODO: can we stay nested instead of using cat? + auto q = gemm_nt(query, q_k_v_weight_s[0]); + auto k = gemm_nt(key, q_k_v_weight_s[1]); + auto v = gemm_nt(value, q_k_v_weight_s[2]); + qkv = at::cat({q, k, v}, 2); + } + + if (!qkv.is_nested() && qkv.numel() == 0) { + if (query.is_nested()) { + return std::make_tuple(Tensor(), Tensor()); + } + return std::make_tuple(at::empty_like(query), Tensor()); + } + +#ifndef NDEBUG + if (!query.is_nested() || !qkv.is_nested()) { + if (query.is_nested()) { + T = qkv.size(1); + } + debug_assert_shape(__LINE__, qkv, {B, T, 3 * D}); + } +#endif + +#ifdef DEBUG_PRINT_EACH_STEP + if (!qkv.is_nested()) { + std::cerr << "qkv: " << qkv << std::endl; + } +#endif + // shape: 3 x [B, num_head, T, dim_per_head] + auto q_k_v = _transform_bias_rescale_qkv(qkv, qkv_bias, num_head); + qkv = Tensor(); // Not used any more, allow free + auto& q = std::get<0>(q_k_v); + const auto& k = std::get<1>(q_k_v); + const auto& v = std::get<2>(q_k_v); +#ifndef NDEBUG + debug_assert_shape(__LINE__, q, {B, num_head, T, dim_per_head}); + debug_assert_shape(__LINE__, k, {B, num_head, T, dim_per_head}); + debug_assert_shape(__LINE__, v, {B, num_head, T, dim_per_head}); +#endif +#ifdef DEBUG_PRINT_EACH_STEP + std::cerr << "q: " << q << std::endl; + std::cerr << "k: " << k << std::endl; + std::cerr << "v: " << v << std::endl; +#endif + + // shape: [B, num_head, T, T] + auto qkt = bmm_nt(q, k); + // q & k are dead but cannot be freed because they were packed with v +#ifndef NDEBUG + debug_assert_shape(__LINE__, qkt, {B, num_head, T, T}); +#endif +#ifdef DEBUG_PRINT_EACH_STEP + std::cerr << "qkt: " << qkt << std::endl; +#endif + + // shape: [B, num_head, T, T] + // TODO: long-term, have a kernel that works with + // NestedTensor directly if there is no mask passed + qkt = masked_softmax(qkt, mask, query); +#ifdef DEBUG_PRINT_EACH_STEP + std::cerr << "qkt after softmax: " << qkt << std::endl; +#endif + + // shape: [B, num_head, T, dim_per_head] + // reuse storage for q; we're done with it + auto attn_ctx = bmm_nn(q, qkt, v); + // qkv is not dead; we just reused storage for q! 
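// bmm_nt (the scores above) and bmm_nn (the context here) fold the batch and
// head dimensions into one leading dimension so a single batched GEMM covers
// every (batch, head) pair. A minimal standalone sketch of that folding over
// plain row-major float buffers -- an illustration of the idea, not code from
// this patch:
//
//   #include <cstddef>
//   #include <vector>
//
//   // Naive [BH, T, D] x [BH, S, D]^T -> [BH, T, S] ("nt": the second operand
//   // is read transposed), which is what at::bmm_out sees after the
//   // 4-D -> 3-D views inside bmm_nt / bmm_nn.
//   std::vector<float> bmm_nt_ref(const std::vector<float>& a,
//                                 const std::vector<float>& b,
//                                 std::size_t BH, std::size_t T,
//                                 std::size_t S, std::size_t D) {
//     std::vector<float> out(BH * T * S, 0.f);
//     for (std::size_t n = 0; n < BH; ++n)
//       for (std::size_t t = 0; t < T; ++t)
//         for (std::size_t s = 0; s < S; ++s) {
//           float acc = 0.f;
//           for (std::size_t d = 0; d < D; ++d)
//             acc += a[(n * T + t) * D + d] * b[(n * S + s) * D + d];
//           out[(n * T + t) * S + s] = acc;
//         }
//     return out;
//   }
//
// A contiguous [B, num_head, T, dim_per_head] tensor and its
// [B * num_head, T, dim_per_head] view share the same element order, so
// calling bmm_nt_ref with BH = B * num_head reproduces the per-head q @ k^T
// scores; bmm_nn is the same folding with the second operand read untransposed.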
+ if (!need_weights) { + qkt = Tensor(); + } +#ifndef NDEBUG + debug_assert_shape(__LINE__, attn_ctx, {B, num_head, T, dim_per_head}); +#endif +#ifdef DEBUG_PRINT_EACH_STEP + std::cerr << "attn_ctx: " << attn_ctx << std::endl; +#endif + + // shape: [B, T, D] + // Fuse transform_0213 inside + auto proj = transform0213_gemm_nt_bias( + attn_ctx, proj_weight, proj_bias, query); +#ifndef NDEBUG + debug_assert_shape(__LINE__, proj, {B, T, D}); +#endif + if (need_weights && average_attn_weights) { + // weights are not needed for full transformer, so don't worry too + // much about performance -- we implement this just to make use + // cases that don't disable need_weights still get some speedup. + qkt = qkt.sum(1); + qkt /= num_head; + } + return std::make_tuple(std::move(proj), std::move(qkt)); +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu new file mode 100644 index 000000000000..fc9a83266a20 --- /dev/null +++ b/aten/src/ATen/native/transformers/cuda/attention.cu @@ -0,0 +1,400 @@ +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +namespace at { + +namespace native { + +namespace { + +static constexpr int TRANSFORM_BIAS_RESCALE_VEC = 4; + +template +__global__ void transform_bias_rescale_qkv_kernel( + // [B, T, 3 * D] + const PackedTensorAccessor64 qkv, + // [3 * D] + const PackedTensorAccessor64 qkv_bias, + // [3, B, NH, T, DH] + PackedTensorAccessor64 q_k_v, + const scalar_t inv_sqrt_dim_per_head) { + // warp per DH. + // so launch B * NH * T warps. + auto NH = q_k_v.size(2); + auto T = q_k_v.size(3); + auto DH = q_k_v.size(4); + + auto t = blockIdx.x % T; + auto b = blockIdx.x / T; + + auto D = NH * DH; + + if (assume_aligned) { + constexpr int VEC = TRANSFORM_BIAS_RESCALE_VEC; + using LoadT = memory::aligned_vector; + for (int32_t d_v = threadIdx.x; d_v < D / VEC; d_v += blockDim.x) { + auto d = d_v * VEC; + auto nh = d / DH; + auto dh = d % DH; + scalar_t qkv_bias_q[VEC]; + scalar_t qkv_bias_k[VEC]; + scalar_t qkv_bias_v[VEC]; + scalar_t qkv_q[VEC]; + scalar_t qkv_k[VEC]; + scalar_t qkv_v[VEC]; + + // Here we require D % VEC == 0 for these vectorized loads. + *reinterpret_cast(&qkv_bias_q) = + *reinterpret_cast(&qkv_bias[d + 0 * D]); + *reinterpret_cast(&qkv_bias_k) = + *reinterpret_cast(&qkv_bias[d + 1 * D]); + *reinterpret_cast(&qkv_bias_v) = + *reinterpret_cast(&qkv_bias[d + 2 * D]); + + *reinterpret_cast(&qkv_q) = + *reinterpret_cast(&qkv[b][t][d + 0 * D]); + *reinterpret_cast(&qkv_k) = + *reinterpret_cast(&qkv[b][t][d + 1 * D]); + *reinterpret_cast(&qkv_v) = + *reinterpret_cast(&qkv[b][t][d + 2 * D]); + +#pragma unroll + // TODO: specialize for float2half2/half2float2? + for (auto ii = 0; ii < VEC; ++ii) { + qkv_q[ii] = static_cast( + (static_cast(qkv_q[ii]) + + static_cast(qkv_bias_q[ii])) * + static_cast(inv_sqrt_dim_per_head)); + qkv_k[ii] = static_cast( + (static_cast(qkv_k[ii]) + + static_cast(qkv_bias_k[ii]))); + qkv_v[ii] = static_cast( + (static_cast(qkv_v[ii]) + + static_cast(qkv_bias_v[ii]))); + } + + // Here we require DH % VEC == 0 for these vectorized stores. 
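// In scalar terms, the unrolled loop above applies, per element of the packed
// qkv row,
//   q = (q + q_bias) * (1 / sqrt(dim_per_head)),  k = k + k_bias,  v = v + v_bias,
// and the stores below write the VEC results of one head's row back in a
// single aligned transaction -- which is why dh, dh+1, ..., dh+VEC-1 must not
// straddle heads (the D % VEC == 0 / DH % VEC == 0 requirements noted above).
// A minimal host-side sketch of the per-element math over plain floats, kept
// here only as an illustration rather than as part of this kernel:
//
//   inline void transform_one_element_ref(
//       float& q, float& k, float& v,
//       float q_bias, float k_bias, float v_bias,
//       float inv_sqrt_dim_per_head) {
//     q = (q + q_bias) * inv_sqrt_dim_per_head;  // query is rescaled up front
//     k = k + k_bias;                            // key and value only add bias
//     v = v + v_bias;
//   }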
+ *reinterpret_cast(&q_k_v[0][b][nh][t][dh]) = + *reinterpret_cast(&qkv_q); + *reinterpret_cast(&q_k_v[1][b][nh][t][dh]) = + *reinterpret_cast(&qkv_k); + *reinterpret_cast(&q_k_v[2][b][nh][t][dh]) = + *reinterpret_cast(&qkv_v); + } + } else { + // Same as above, but we can't vectorize memory access. + for (int32_t d = threadIdx.x; d < D; d += blockDim.x) { + auto nh = d / DH; + auto dh = d % DH; + scalar_t qkv_bias_q = qkv_bias[d + 0 * D]; + scalar_t qkv_bias_k = qkv_bias[d + 1 * D]; + scalar_t qkv_bias_v = qkv_bias[d + 2 * D]; + scalar_t qkv_q = qkv[b][t][d + 0 * D]; + scalar_t qkv_k = qkv[b][t][d + 1 * D]; + scalar_t qkv_v = qkv[b][t][d + 2 * D]; + qkv_q = static_cast( + (static_cast(qkv_q) + + static_cast(qkv_bias_q)) * + static_cast(inv_sqrt_dim_per_head)); + qkv_k = static_cast( + (static_cast(qkv_k) + + static_cast(qkv_bias_k))); + qkv_v = static_cast( + (static_cast(qkv_v) + + static_cast(qkv_bias_v))); + + q_k_v[0][b][nh][t][dh] = qkv_q; + q_k_v[1][b][nh][t][dh] = qkv_k; + q_k_v[2][b][nh][t][dh] = qkv_v; + } + } +} + +template +__global__ void transform_bias_rescale_qkv_add_padding_kernel( + // [B, T, 3 * D], but it's a NestedTensor buffer + const PackedTensorAccessor64 qkv, + // [3 * D] + const PackedTensorAccessor64 qkv_bias, + const int* offsets, + const int* input_sizes, + // [3, B, NH, T, DH] + PackedTensorAccessor64 q_k_v, + const scalar_t inv_sqrt_dim_per_head) { + // warp per DH. + // so launch B * NH * T warps. + const auto NH = q_k_v.size(2); + const auto T = q_k_v.size(3); + const auto DH = q_k_v.size(4); + + const auto t = blockIdx.x % T; + const auto b = blockIdx.x / T; + + const auto D = NH * DH; + const auto _3D = 3 * D; + + const auto offset_for_batch = offsets[b]; + const auto input_dim = 1; + const auto* sizes_i = input_sizes + b * input_dim; + if (assume_aligned) { + constexpr int VEC = TRANSFORM_BIAS_RESCALE_VEC; + using LoadT = memory::aligned_vector; + for (int32_t d_v = threadIdx.x; d_v < D / VEC; d_v += blockDim.x) { + auto d = d_v * VEC; + auto nh = d / DH; + auto dh = d % DH; + scalar_t qkv_bias_q[VEC]; + scalar_t qkv_bias_k[VEC]; + scalar_t qkv_bias_v[VEC]; + scalar_t qkv_q[VEC]; + scalar_t qkv_k[VEC]; + scalar_t qkv_v[VEC]; + + const auto first_item_offset = t * _3D + d; + const auto last_item_offset = first_item_offset + VEC - 1; + const bool first_item_in_bounds = first_item_offset < sizes_i[0]; + const bool entire_vec_in_bounds = last_item_offset < sizes_i[0]; + + // Here we require D % VEC == 0 for these vectorized loads. + *reinterpret_cast(&qkv_bias_q) = + *reinterpret_cast(&qkv_bias[d + 0 * D]); + *reinterpret_cast(&qkv_bias_k) = + *reinterpret_cast(&qkv_bias[d + 1 * D]); + *reinterpret_cast(&qkv_bias_v) = + *reinterpret_cast(&qkv_bias[d + 2 * D]); + + if (entire_vec_in_bounds) { + const auto offset = offset_for_batch + first_item_offset; + *reinterpret_cast(&qkv_q) = + *reinterpret_cast(&qkv[offset + 0 * D]); + *reinterpret_cast(&qkv_k) = + *reinterpret_cast(&qkv[offset + 1 * D]); + *reinterpret_cast(&qkv_v) = + *reinterpret_cast(&qkv[offset + 2 * D]); +#pragma unroll + // TODO: specialize for float2half2/half2float2? 
+ for (auto ii = 0; ii < VEC; ++ii) { + qkv_q[ii] = static_cast( + (static_cast(qkv_q[ii]) + + static_cast(qkv_bias_q[ii])) * + static_cast(inv_sqrt_dim_per_head)); + qkv_k[ii] = static_cast( + (static_cast(qkv_k[ii]) + + static_cast(qkv_bias_k[ii]))); + qkv_v[ii] = static_cast( + (static_cast(qkv_v[ii]) + + static_cast(qkv_bias_v[ii]))); + } + } else if (first_item_in_bounds) { + const auto offset = offset_for_batch + first_item_offset; + qkv_q[0] = qkv[offset + 0 * D]; + qkv_k[0] = qkv[offset + 1 * D]; + qkv_v[0] = qkv[offset + 2 * D]; + qkv_q[0] = static_cast( + (static_cast(qkv_q[0]) + + static_cast(qkv_bias_q[0])) * + static_cast(inv_sqrt_dim_per_head)); + qkv_k[0] = static_cast( + (static_cast(qkv_k[0]) + + static_cast(qkv_bias_k[0]))); + qkv_v[0] = static_cast( + (static_cast(qkv_v[0]) + + static_cast(qkv_bias_v[0]))); +#pragma unroll + for (auto ii = 1; ii < VEC; ++ii) { + const auto loop_offset = offset + ii; + if (loop_offset < sizes_i[0]) { + qkv_q[ii] = qkv[loop_offset + 0 * D]; + qkv_k[ii] = qkv[loop_offset + 1 * D]; + qkv_v[ii] = qkv[loop_offset + 2 * D]; + qkv_q[ii] = static_cast( + (static_cast(qkv_q[ii]) + + static_cast(qkv_bias_q[ii])) * + static_cast(inv_sqrt_dim_per_head)); + qkv_k[ii] = static_cast( + (static_cast(qkv_k[ii]) + + static_cast(qkv_bias_k[ii]))); + qkv_v[ii] = static_cast( + (static_cast(qkv_v[ii]) + + static_cast(qkv_bias_v[ii]))); + } else { + qkv_q[ii] = 0; + qkv_k[ii] = 0; + qkv_v[ii] = 0; + } + } + } else { +#pragma unroll + for (auto ii = 0; ii < VEC; ++ii) { + qkv_q[ii] = 0; + qkv_k[ii] = 0; + qkv_v[ii] = 0; + } + } + + // Here we require DH % VEC == 0 for these vectorized stores. + *reinterpret_cast(&q_k_v[0][b][nh][t][dh]) = + *reinterpret_cast(&qkv_q); + *reinterpret_cast(&q_k_v[1][b][nh][t][dh]) = + *reinterpret_cast(&qkv_k); + *reinterpret_cast(&q_k_v[2][b][nh][t][dh]) = + *reinterpret_cast(&qkv_v); + } + } else { + for (int32_t d = threadIdx.x; d < D; d += blockDim.x) { + auto nh = d / DH; + auto dh = d % DH; + scalar_t qkv_bias_q = qkv_bias[d + 0 * D]; + scalar_t qkv_bias_k = qkv_bias[d + 1 * D]; + scalar_t qkv_bias_v = qkv_bias[d + 2 * D]; + + const auto item_offset = t * _3D + d; + const bool in_bounds = item_offset < sizes_i[0]; + scalar_t qkv_q, qkv_k, qkv_v; + if (in_bounds) { + const auto qkv_offset = offset_for_batch + item_offset; + qkv_q = qkv[qkv_offset + 0 * D]; + qkv_k = qkv[qkv_offset + 1 * D]; + qkv_v = qkv[qkv_offset + 2 * D]; + qkv_q = static_cast( + (static_cast(qkv_q) + + static_cast(qkv_bias_q)) * + static_cast(inv_sqrt_dim_per_head)); + qkv_k = static_cast( + (static_cast(qkv_k) + + static_cast(qkv_bias_k))); + qkv_v = static_cast( + (static_cast(qkv_v) + + static_cast(qkv_bias_v))); + } else { + qkv_q = 0; + qkv_k = 0; + qkv_v = 0; + } + + q_k_v[0][b][nh][t][dh] = qkv_q; + q_k_v[1][b][nh][t][dh] = qkv_k; + q_k_v[2][b][nh][t][dh] = qkv_v; + } + } +} + +Tensor collapse_dims_1_and_2(const Tensor& sizes) { + auto sizes_dim1 = at::native::narrow(sizes, 1, 0, 1); + auto sizes_dim2 = at::native::narrow(sizes, 1, 1, 1); + + return (sizes_dim1 * sizes_dim2).contiguous(); +} + +} // namespace +// compute q = (q + q_bias) / sqrt(dim_per_head), k = k + k_bias, v = v + v_bias +__host__ std::tuple transform_bias_rescale_qkv_cuda( + const Tensor& qkv, + const Tensor& qkv_bias, + const int64_t num_head) { + auto B = qkv.is_nested() + ? get_nested_tensor_impl(qkv)->get_nested_size_tensor().size(0) + : qkv.size(0); + // TODO: calculate this without the std::vector -- NestedTensor_to_mask wants + // this too + auto T = qkv.is_nested() + ? 
NestedTensor_get_max_size(*get_nested_tensor_impl(qkv))[0] + : qkv.size(1); + auto _3D = qkv_bias.size(0); + auto D = _3D / 3; + TORCH_CHECK(D % num_head == 0); + const auto dim_per_head = D / num_head; + auto q_k_v = at::empty({3, B, num_head, T, dim_per_head}, qkv_bias.options()); +#define CALL_KERNEL(assume_aligned) \ + transform_bias_rescale_qkv_kernel \ + <<>>( \ + qkv.packed_accessor64(), \ + qkv_bias.packed_accessor64(), \ + q_k_v.packed_accessor64(), \ + 1.0 / std::sqrt(static_cast(dim_per_head))) +#define CALL_ADD_PADDING_KERNEL(assume_aligned) \ + transform_bias_rescale_qkv_add_padding_kernel< \ + scalar_t, \ + accscalar_t, \ + assume_aligned> \ + <<>>( \ + nt_qkv->get_buffer() \ + .packed_accessor64(), \ + qkv_bias.packed_accessor64(), \ + offsets_ptr, \ + sizes_ptr, \ + q_k_v.packed_accessor64(), \ + 1.0 / std::sqrt(static_cast(dim_per_head))) + + AT_DISPATCH_FLOATING_TYPES_AND2( + ScalarType::Half, + ScalarType::BFloat16, + qkv.scalar_type(), + "transform_bias_rescale_qkv", + [&] { + using accscalar_t = acc_type; + auto threads = std::max( + std::min(1024, D / TRANSFORM_BIAS_RESCALE_VEC), 1); + auto blocks = B * T; + const bool aligned = + ((dim_per_head % TRANSFORM_BIAS_RESCALE_VEC) == 0) && + ((reinterpret_cast(qkv_bias.data_ptr()) % + TRANSFORM_BIAS_RESCALE_VEC) == 0); + if (aligned) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + D % TRANSFORM_BIAS_RESCALE_VEC == 0, + "D = num_heads * dim_per_head, so we should have dim_per_head % " + "TRANSFORM_BIAS_RESCALE_VEC == 0 => " + "D % TRANSFORM_BIAS_RESCALE_VEC == 0"); + } + if (qkv.is_nested()) { + auto* nt_qkv = get_nested_tensor_impl(qkv); + auto sizes = collapse_dims_1_and_2(nt_qkv->get_nested_size_tensor()); + auto offsets = + NestedTensor_batch_offsets_from_size_tensor(sizes, sizes.numel()); + at::native::narrow(offsets, 0, sizes.numel() + 1, sizes.numel()) + .copy_(sizes.reshape({-1})); + auto metadata = offsets.to(at::Device(kCUDA), at::kInt, true, true); + const auto offsets_ptr = metadata.data_ptr(); + const auto sizes_ptr = offsets_ptr + sizes.numel() + 1; + const auto input_dim = sizes.sizes()[1]; + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input_dim == 1); + if (aligned && + ((reinterpret_cast(nt_qkv->get_buffer().data_ptr()) % + TRANSFORM_BIAS_RESCALE_VEC) == 0)) { + CALL_ADD_PADDING_KERNEL(true); + } else { + CALL_ADD_PADDING_KERNEL(false); + } + } else if (aligned) { + CALL_KERNEL(true); + } else { + CALL_KERNEL(false); + } + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); +#undef CALL_ADD_PADDING_KERNEL +#undef CALL_KERNEL + auto q_k_v_s = + at::native::split(q_k_v.view({3 * B, num_head, T, dim_per_head}), B, 0); + return std::make_tuple(q_k_v_s[0], q_k_v_s[1], q_k_v_s[2]); +} +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/transformers/transformer.cpp b/aten/src/ATen/native/transformers/transformer.cpp new file mode 100644 index 000000000000..a789aab18d6c --- /dev/null +++ b/aten/src/ATen/native/transformers/transformer.cpp @@ -0,0 +1,137 @@ +#include +#include +#include +#include + +#include + +#include + +namespace at { + +namespace native { + +namespace { +Tensor linear_for_ffn( + const Tensor& bias, + const Tensor& mat1, + const Tensor& mat2, + c10::optional use_gelu) { + if (mat1.is_nested()) { + return NestedTensor_times_Tensor_plus_Tensor_addmm( + bias, mat1, mat2.t(), 1, 1, use_gelu); + } + + auto mat1_ = mat1.view({mat1.sizes()[0] * mat1.sizes()[1], mat1.sizes()[2]}); + Tensor result; + if (use_gelu.has_value()) { + result = at::_addmm_activation(bias, mat1_, mat2.t(), 1, 1, *use_gelu); + } else { + 
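// Plain GEMM path: addmm(bias, x, W^T) returns bias + x @ W^T over the
// flattened [B * T, in_features] input, i.e. an ordinary linear layer. The
// branch above computes the same product but asks _addmm_activation to fuse
// the GELU (otherwise ReLU) epilogue into the GEMM, saving a separate
// elementwise pass over the feed-forward hidden activation.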
result = at::addmm(bias, mat1_, mat2.t()); + } + return result.view({mat1.sizes()[0], mat1.sizes()[1], -1}); +} + +Tensor ffn( + const Tensor& input, + const Tensor& w1, + const Tensor& b1, + const Tensor& w2, + const Tensor& b2, + bool use_gelu, + bool add_norm) { + TORCH_CHECK(add_norm == false, "TODO add_norm to be supported in FFN"); + TORCH_CHECK(input.dim() == 3, "batched input size should be 3"); + TORCH_CHECK(w1.dim() == 2, "2d weights expected"); + TORCH_CHECK(w2.dim() == 2, "2d weights expected"); + Tensor res = linear_for_ffn(b1, input, w1, use_gelu); + res = linear_for_ffn(b2, res, w2, c10::nullopt); + return res; +} +} // namespace + +Tensor transformer_encoder_layer_forward( + const Tensor& src, + const int64_t embed_dim, + const int64_t num_heads, + const Tensor& qkv_weight, + const Tensor& qkv_bias, + const Tensor& proj_weight, + const Tensor& proj_bias, + const bool use_gelu, + const bool norm_first, + const double layer_norm_eps, + const Tensor& layer_norm_weight_1, + const Tensor& layer_norm_bias_1, + const Tensor& layer_norm_weight_2, + const Tensor& layer_norm_bias_2, + const Tensor& ffn_weight_1, + const Tensor& ffn_bias_1, + const Tensor& ffn_weight_2, + const Tensor& ffn_bias_2, + const c10::optional& mask) { + { + const Tensor& check_for_empty = src.is_nested() ? get_nested_tensor_impl(src)->get_buffer() : src; + if (check_for_empty.numel() == 0) { + return src.is_nested() + ? at::detail::make_tensor(check_for_empty, get_nested_tensor_impl(src)->get_nested_size_tensor()) + : src.clone(); + } + } + TORCH_CHECK(!norm_first, "norm_first is not supported yet"); + const bool use_nested_tensor = src.is_nested(); + auto x = std::get<0>(native_multi_head_attention( + src, + src, + src, + embed_dim, + num_heads, + qkv_weight, + qkv_bias, + proj_weight, + proj_bias, + mask, + false /* need_weights */)); + if (use_nested_tensor) { + NestedTensor_add_NestedTensor_in_place(x, src); + x = NestedTensor_layer_norm( + x, layer_norm_weight_1, layer_norm_bias_1, layer_norm_eps); + } else { + x.add_(src); + x = at::layer_norm( + x, + {embed_dim}, + layer_norm_weight_1, + layer_norm_bias_1, + layer_norm_eps, + true); + } + + auto pre_ffn_res = x; + x = ffn( + x, + ffn_weight_1, + ffn_bias_1, + ffn_weight_2, + ffn_bias_2, + use_gelu, + /* add_norm* */ false); + if (use_nested_tensor) { + NestedTensor_add_NestedTensor_in_place(x, pre_ffn_res); + x = NestedTensor_layer_norm( + x, layer_norm_weight_2, layer_norm_bias_2, layer_norm_eps); + } else { + x.add_(pre_ffn_res); + x = at::layer_norm( + x, + {embed_dim}, + layer_norm_weight_2, + layer_norm_bias_2, + layer_norm_eps, + true); + } + return x; +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/ts_native_functions.yaml b/aten/src/ATen/native/ts_native_functions.yaml new file mode 100644 index 000000000000..80febbd039fc --- /dev/null +++ b/aten/src/ATen/native/ts_native_functions.yaml @@ -0,0 +1,181 @@ +backend: Lazy +cpp_namespace: torch::lazy +full_codegen: + - _adaptive_avg_pool2d + - _adaptive_avg_pool2d_backward + - _log_softmax + - _log_softmax_backward_data + - _softmax + - _softmax_backward_data + - abs + - add.Tensor + - addcdiv + - addcmul + - addmm + - arange.start_out + - all + - any + - avg_pool2d + - avg_pool2d_backward + - baddbmm + - bernoulli + - bernoulli_.float + - binary_cross_entropy + - binary_cross_entropy_backward + - bitwise_and.Tensor + - bitwise_or.Tensor + - bmm + - cat + - clamp + - clamp_min + - constant_pad_nd + - convolution + - convolution_backward + - cos + - cumsum + - 
div.Tensor + - div.Tensor_mode + - elu + - elu_backward + - embedding + - embedding_dense_backward + - eq.Scalar + - eq.Tensor + - exp + - flip + - floor + - frac + - gather + - ge.Scalar + - ge.Tensor + - gelu + - gelu_backward + - glu + - glu_backward + - glu_jvp + - grid_sampler_2d + - grid_sampler_2d_backward + - gt.Scalar + - gt.Tensor + - hardsigmoid + - index_select + - kl_div_backward + - l1_loss_backward + - le.Scalar + - le.Tensor + - leaky_relu + - leaky_relu_backward + - log + - log2 + - logdet + - log_sigmoid_backward + - log_sigmoid_forward + - lt.Scalar + - lt.Tensor + - masked_fill_.Scalar + - masked_fill_.Tensor + - max + - max.dim + - max_pool2d_with_indices + - max_pool2d_with_indices_backward + - maximum + - mean + - mean.dim + - min + - minimum + - mm + - mul.Tensor + - mv + - native_dropout + - native_dropout_backward + - native_layer_norm + - native_layer_norm_backward + - ne.Scalar + - ne.Tensor + - neg + - nll_loss_backward + - nll_loss_forward + - nll_loss2d_backward + - nll_loss2d_forward + - nonzero + - norm.ScalarOpt_dim + - pow.Tensor_Scalar + - pow.Tensor_Tensor + - random_ + - random_.from + - random_.to + - reciprocal + - relu + - relu_ + - remainder.Tensor + - repeat + - rsqrt + - scatter_add + - sgn + - sigmoid + - sigmoid_backward + - silu + - smooth_l1_loss + - smooth_l1_loss_backward + - softplus + - softplus_backward + - sort + - sqrt + - stack + - std + - std.dim + - std.correction + - sub.Tensor + - sum + - sum.dim_IntList + - tanh + - tanh_backward + - threshold + - threshold_backward + - topk + - trace + - tril + - triu + - trunc + - upsample_bilinear2d + - upsample_bilinear2d_backward + - upsample_nearest2d + - upsample_nearest2d_backward + - zero_ + - narrow_copy.SymInt +supported: + - as_strided + - as_strided_ + - clone + - _copy_from + - _copy_from_and_resize + - diagonal + - empty.memory_format + - empty_strided + - expand + - fill_.Scalar + - narrow + - native_batch_norm + - native_batch_norm_backward + - normal_ + - max_pool3d_with_indices + - max_pool3d_with_indices_backward + - permute + - select.int + - slice.Tensor + - squeeze + - squeeze.dim + - squeeze_ + - squeeze_.dim + - t + - t_ + - _to_copy + - transpose.int + - transpose_ + - unsqueeze + - unsqueeze_ + - view + - alias + - _unsafe_view +autograd: + - max_pool3d diff --git a/aten/src/ATen/native/ufunc/add.h b/aten/src/ATen/native/ufunc/add.h new file mode 100644 index 000000000000..94a776728ead --- /dev/null +++ b/aten/src/ATen/native/ufunc/add.h @@ -0,0 +1,27 @@ +#pragma once + +#include + +#if !defined(__CUDACC__) && !defined(__HIPCC__) +#include +#include +#endif + +namespace at { +namespace native { +namespace ufunc { + +template +C10_HOST_DEVICE C10_ALWAYS_INLINE T add(T self, T other, T alpha) __ubsan_ignore_undefined__ { + return self + alpha * other; +} + +#if !defined(__CUDACC__) && !defined(__HIPCC__) +using vec::Vectorized; +template +C10_ALWAYS_INLINE Vectorized add(Vectorized self, Vectorized other, Vectorized alpha) __ubsan_ignore_undefined__ { + return vec::fmadd(other, alpha, self); +} +#endif + +}}} // namespace at::native::ufunc diff --git a/aten/src/ATen/native/vulkan/Vulkan.cpp b/aten/src/ATen/native/vulkan/Vulkan.cpp deleted file mode 100644 index 6d253206bafd..000000000000 --- a/aten/src/ATen/native/vulkan/Vulkan.cpp +++ /dev/null @@ -1,1420 +0,0 @@ -#include -#include -#include -#include -#include - -#ifdef USE_VULKAN_WRAPPER -#include -#else -#include -#endif - -#include -#include - -#ifdef USE_VULKAN_SHADERC_RUNTIME -#include -#include -#else -#include 
-#endif - -#include -#include -#include -#include -#include -#include - - -#define VK_CHECK(f) \ - { \ - VkResult res = (f); \ - TORCH_CHECK(res == VK_SUCCESS, "Vulkan error VkResult:", res); \ - } - -namespace at { -namespace native { -namespace vulkan { -namespace detail { - -VContext::VContext(const bool enableValidationLayers) - : enableValidationLayers_(enableValidationLayers) { - createInstance(); - findPhysicalDevice(); - createDevice(); - - computeUnitFactory_ = std::make_unique(device_); -} - -VContext::~VContext() { - if (enableValidationLayers_) { - const auto func = (PFN_vkDestroyDebugReportCallbackEXT)vkGetInstanceProcAddr( - instance_, "vkDestroyDebugReportCallbackEXT"); - if (func) { - func(instance_, debugReportCallback_, nullptr); - } - } - - // ComputeUnitFactory_ owns ComputeUnits and VkPipelineCache, need valid - // VkDevice for destructing, destructing before vkDestroyDevice - computeUnitFactory_.reset(); - - vkDestroyCommandPool(device_, commandPool_, nullptr); - vkDestroyDevice(device_, nullptr); - vkDestroyInstance(instance_, nullptr); -} - -static VKAPI_ATTR VkBool32 VKAPI_CALL debugReportCallbackFn( - const VkDebugReportFlagsEXT msgFlags, - const VkDebugReportObjectTypeEXT objectType, - const uint64_t object, - const size_t location, - const int32_t msgCode, - const char* const pLayerPrefix, - const char* const pMsg, - void* const pUserData) { - std::stringstream s; - s << pLayerPrefix << " " << msgCode << " " << pMsg << std::endl; - if (msgFlags & VK_DEBUG_REPORT_ERROR_BIT_EXT) { - LOG(ERROR) << s.str(); - } else if (msgFlags & VK_DEBUG_REPORT_WARNING_BIT_EXT) { - LOG(WARNING) << "WARN:" << s.str(); - } else if (msgFlags & VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT) { - LOG(WARNING) << "PERF_WARN:" << s.str(); - } else if (msgFlags & VK_DEBUG_REPORT_INFORMATION_BIT_EXT) { - LOG(INFO) << s.str(); - } - return VK_FALSE; -} - -void VContext::createInstance() { - std::vector enabledExtensions; - if (enableValidationLayers_) { - uint32_t layerPresentCount = 0; - VK_CHECK(vkEnumerateInstanceLayerProperties(&layerPresentCount, nullptr)); - std::vector layerProps(layerPresentCount); - VK_CHECK(vkEnumerateInstanceLayerProperties(&layerPresentCount, layerProps.data())); - std::array instanceLayers{ - "VK_LAYER_GOOGLE_unique_objects", - "VK_LAYER_GOOGLE_threading", - "VK_LAYER_LUNARG_object_tracker", - "VK_LAYER_LUNARG_core_validation", - "VK_LAYER_LUNARG_parameter_validation", - "VK_LAYER_KHRONOS_validation", - }; - - for (const auto& wantedLayer : instanceLayers) { - for (const auto& presentLayer : layerProps) { - if (strcmp(wantedLayer, presentLayer.layerName) == 0) { - enabledValidationLayers_.push_back(wantedLayer); - break; - } - } - } - - uint32_t extCount = 0; - VK_CHECK(vkEnumerateInstanceExtensionProperties(nullptr, &extCount, nullptr)); - std::vector extProps(extCount); - VK_CHECK(vkEnumerateInstanceExtensionProperties(nullptr, &extCount, extProps.data())); - bool foundExt = false; - for (VkExtensionProperties p : extProps) { - if (strcmp(VK_EXT_DEBUG_REPORT_EXTENSION_NAME, p.extensionName) == 0) { - foundExt = true; - break; - } - } - if (foundExt) { - enabledExtensions.push_back(VK_EXT_DEBUG_REPORT_EXTENSION_NAME); - } - } - - VkApplicationInfo applicationInfo{}; - applicationInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO; - applicationInfo.pApplicationName = "PyTorch"; - applicationInfo.applicationVersion = 0; - applicationInfo.pEngineName = "PyTorch"; - applicationInfo.engineVersion = 0; - applicationInfo.apiVersion = VK_API_VERSION_1_0; - - 
VkInstanceCreateInfo createInfo{}; - createInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO; - createInfo.flags = 0; - createInfo.pApplicationInfo = &applicationInfo; - createInfo.enabledLayerCount = enabledValidationLayers_.size(); - createInfo.ppEnabledLayerNames = enabledValidationLayers_.data(); - createInfo.enabledExtensionCount = enabledExtensions.size(); - createInfo.ppEnabledExtensionNames = enabledExtensions.data(); - - VK_CHECK(vkCreateInstance(&createInfo, nullptr, &instance_)); - - if (enableValidationLayers_) { - VkDebugReportCallbackCreateInfoEXT debugReportCallbackCreateInfo{}; - debugReportCallbackCreateInfo.sType = - VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT; - debugReportCallbackCreateInfo.flags = VK_DEBUG_REPORT_ERROR_BIT_EXT | - VK_DEBUG_REPORT_WARNING_BIT_EXT | - VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT; - debugReportCallbackCreateInfo.pfnCallback = &debugReportCallbackFn; - - const auto vkCreateDebugReportCallbackEXT = - (PFN_vkCreateDebugReportCallbackEXT)vkGetInstanceProcAddr( - instance_, "vkCreateDebugReportCallbackEXT"); - TORCH_CHECK( - vkCreateDebugReportCallbackEXT, - "Could not load vkCreateDebugReportCallbackEXT"); - VK_CHECK(vkCreateDebugReportCallbackEXT( - instance_, - &debugReportCallbackCreateInfo, - nullptr, - &debugReportCallback_)); - } -} - -void VContext::findPhysicalDevice() { - uint32_t deviceCount = 0; - VK_CHECK(vkEnumeratePhysicalDevices(instance_, &deviceCount, nullptr)); - TORCH_CHECK( - deviceCount > 0, "Vulkan: Could not find a device with vulkan support"); - std::vector devices(deviceCount); - VK_CHECK(vkEnumeratePhysicalDevices(instance_, &deviceCount, devices.data())); - physicalDevice_ = devices[0]; -} - -uint32_t VContext::getComputeQueueFamilyIndex() { - uint32_t queueFamilyCount = 0; - - vkGetPhysicalDeviceQueueFamilyProperties( - physicalDevice_, &queueFamilyCount, nullptr); - TORCH_CHECK( - queueFamilyCount > 0, "Vulkan: Invalid number of queue families"); - std::vector queueFamilies(queueFamilyCount); - vkGetPhysicalDeviceQueueFamilyProperties( - physicalDevice_, &queueFamilyCount, queueFamilies.data()); - - for (const auto i : c10::irange(queueFamilies.size())) { - VkQueueFamilyProperties props = queueFamilies[i]; - if (props.queueCount > 0 && (props.queueFlags & VK_QUEUE_COMPUTE_BIT)) { - return i; - } - } - - TORCH_CHECK( - false, "Vulkan: Could not find a queue family that supports operations"); -} - -void VContext::createDevice() { - VkDeviceQueueCreateInfo queueCreateInfo{}; - queueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; - queueFamilyIndex_ = getComputeQueueFamilyIndex(); - queueCreateInfo.queueFamilyIndex = queueFamilyIndex_; - queueCreateInfo.queueCount = 1; - const float queuePriorities = 1.0f; - queueCreateInfo.pQueuePriorities = &queuePriorities; - VkDeviceCreateInfo deviceCreateInfo{}; - VkPhysicalDeviceFeatures deviceFeatures{}; - - deviceCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; - deviceCreateInfo.enabledLayerCount = enabledValidationLayers_.size(); - deviceCreateInfo.ppEnabledLayerNames = enabledValidationLayers_.data(); - deviceCreateInfo.pQueueCreateInfos = &queueCreateInfo; - - deviceCreateInfo.queueCreateInfoCount = 1; - deviceCreateInfo.pEnabledFeatures = &deviceFeatures; - - VK_CHECK( - vkCreateDevice(physicalDevice_, &deviceCreateInfo, nullptr, &device_)); - queue_ = {}; - vkGetDeviceQueue(device_, queueFamilyIndex_, 0, &queue_); - - VkPhysicalDeviceProperties physicalDeviceProperties{}; - vkGetPhysicalDeviceProperties(physicalDevice_, 
&physicalDeviceProperties); - - VkCommandPoolCreateInfo commandPoolCreateInfo{}; - commandPoolCreateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; - commandPoolCreateInfo.flags = 0; - commandPoolCreateInfo.queueFamilyIndex = queueFamilyIndex_; - VK_CHECK(vkCreateCommandPool( - device_, &commandPoolCreateInfo, nullptr, &commandPool_)); - physicalDeviceLimits_ = physicalDeviceProperties.limits; -} - -static std::unique_ptr gContext; -const VContext& context() { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(gContext); - return *gContext; -} - -bool initVulkanContextOnce() { - static const int once = []() { -#ifdef USE_VULKAN_WRAPPER - if (!InitVulkan()) { - TORCH_WARN("Vulkan Wrapper Failed to InitVulkan"); - return 1; - } -#endif - gContext = std::make_unique(kEnableValidationLayers); - if (!gContext) { - TORCH_WARN("Vulkan Failed to create Vulkan Context"); - return 2; - } - return 0; - }(); - ((void)once); - return static_cast(gContext); -} - -bool is_available() { - return initVulkanContextOnce(); -} - -uint32_t findMemoryType( - const VkPhysicalDevice physicalDevice, - const uint32_t memoryTypeBits, - const VkMemoryPropertyFlags properties) { - VkPhysicalDeviceMemoryProperties memoryProperties{}; - vkGetPhysicalDeviceMemoryProperties(physicalDevice, &memoryProperties); - for (const auto i : c10::irange(memoryProperties.memoryTypeCount)) { - if ((memoryTypeBits & (1 << i)) && - ((memoryProperties.memoryTypes[i].propertyFlags & properties) == - properties)) { - return i; - } - } - return -1; -} - -void VBuffer::MapMemory::flushWriteToDevice() { - VkMappedMemoryRange range{}; - range.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; - range.memory = deviceMemory_; - range.offset = offset_; - range.size = size_; - range.pNext = nullptr; - - VK_CHECK(vkFlushMappedMemoryRanges(context().device(), 1, &range)); -} - -void VBuffer::MapMemory::flushWriteToHost() { - VkMappedMemoryRange range{}; - range.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; - range.memory = deviceMemory_; - range.offset = offset_; - range.size = size_; - range.pNext = nullptr; - - VK_CHECK(vkInvalidateMappedMemoryRanges(context().device(), 1, &range)); -} - -VBuffer::VBuffer( - const VkDeviceSize bufferSizeBytes, - const VkBufferUsageFlags bufferUsageFlags, - const VkDescriptorType descriptorType) - : bufferSizeBytes_(bufferSizeBytes), descriptorType_(descriptorType) { - const auto device = context().device(); - const auto physicalDevice = context().physicalDevice(); - VkBufferCreateInfo bufferCreateInfo{}; - bufferCreateInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; - bufferCreateInfo.size = bufferSizeBytes_; - bufferCreateInfo.usage = bufferUsageFlags; - bufferCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - VK_CHECK(vkCreateBuffer(device, &bufferCreateInfo, nullptr, &buffer_)); - VkMemoryRequirements memoryRequirements; - vkGetBufferMemoryRequirements(device, buffer_, &memoryRequirements); - VkMemoryAllocateInfo allocateInfo{}; - allocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; - allocateInfo.allocationSize = memoryRequirements.size; - allocateInfo.memoryTypeIndex = findMemoryType( - physicalDevice, - memoryRequirements.memoryTypeBits, - VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | - VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); - VK_CHECK(vkAllocateMemory(device, &allocateInfo, nullptr, &bufferMemory_)); - VK_CHECK(vkBindBufferMemory(device, buffer_, bufferMemory_, 0)); -} - -VBuffer::~VBuffer() { - vkFreeMemory(context().device(), bufferMemory_, nullptr); - vkDestroyBuffer(context().device(), buffer_, nullptr); -} 
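// findMemoryType above scans the adapter's memory types for the first index
// that is both allowed by the resource's memoryTypeBits mask and carries all
// of the requested property flags (host-visible/coherent for VBuffer above,
// device-local for VImage below). A small standalone sketch of that selection
// rule over plain integers, kept here only as an illustration of the bit tests:
//
//   #include <cstdint>
//
//   int find_memory_type_ref(uint32_t memory_type_bits,
//                            const uint32_t* type_property_flags,
//                            uint32_t type_count,
//                            uint32_t wanted_properties) {
//     for (uint32_t i = 0; i < type_count; ++i) {
//       const bool allowed = (memory_type_bits & (1u << i)) != 0;
//       const bool has_all = (type_property_flags[i] & wanted_properties) ==
//           wanted_properties;
//       if (allowed && has_all) {
//         return static_cast<int>(i);
//       }
//     }
//     return -1;  // same "not found" sentinel as findMemoryType above
//   }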
- -void VBuffer::copy_from_device_to_host( - void* const outputData, const int64_t size) const { - auto mm = map(); - TORCH_INTERNAL_ASSERT(mm.ptr(), "Vulkan: Failed to map Vulkan Buffer memory"); - ::memcpy(outputData, mm.ptr(), size); - mm.flushWriteToHost(); -} - -void VBuffer::copy_from_host_to_device( - const void* const data, const int64_t size) { - auto mm = map(); - TORCH_INTERNAL_ASSERT(mm.ptr(), "Vulkan: Failed to map Vulkan Buffer memory"); - ::memcpy(mm.ptr(), data, size); - mm.flushWriteToDevice(); -} - -void VBuffer::set_zeros() { - auto mm = map(); - TORCH_INTERNAL_ASSERT(mm.ptr(), "Vulkan: Failed to map Vulkan Buffer memory"); - ::memset(mm.ptr(), 0, bufferSizeBytes_); -} - -VkDescriptorBufferInfo VBuffer::makeDescriptorBufferInfo() const { - VkDescriptorBufferInfo info{}; - info.buffer = buffer_; - info.offset = 0; - info.range = bufferSizeBytes_; - return info; -} - -VkWriteDescriptorSet VBuffer::makeWriteDescriptorSet( - const VkDescriptorSet descriptorSet, - const uint32_t binding, - const VkDescriptorBufferInfo* const bufferInfo) const { - VkWriteDescriptorSet writeSet{}; - writeSet.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - writeSet.pNext = nullptr; - writeSet.dstSet = descriptorSet; - writeSet.dstBinding = binding; - writeSet.dstArrayElement = 0; - writeSet.descriptorCount = 1; - writeSet.descriptorType = descriptorType_; - writeSet.pImageInfo = nullptr; - writeSet.pBufferInfo = bufferInfo; - writeSet.pTexelBufferView = nullptr; - return writeSet; -} - -void VBuffer::bind(const VkDescriptorSet descriptorSet, const uint32_t binding) const { - const auto descrBufferInfo = makeDescriptorBufferInfo(); - const auto writeDescrSet = - makeWriteDescriptorSet(descriptorSet, binding, &descrBufferInfo); - vkUpdateDescriptorSets(context().device(), 1, &writeDescrSet, 0, nullptr); -} - -void VBuffer::addBufferMemoryBarrier( - const VkCommandBuffer commandBuffer, - const VkDeviceSize offset, - const VkDeviceSize size) const { - VkBufferMemoryBarrier barrier{}; - barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - barrier.buffer = buffer_; - barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.offset = offset; - barrier.pNext = nullptr; - barrier.size = size; - barrier.srcAccessMask = - VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_TRANSFER_WRITE_BIT; - barrier.dstAccessMask = - VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT; - - vkCmdPipelineBarrier( - commandBuffer, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT, - 0, - 0, - nullptr, - 1, - &barrier, - 0, - nullptr); -} - -VImage::VImage(const ImageSize imageSize, const ImageSize dataSize) - : imageSize_(imageSize), dataSize_(dataSize) { - const auto device = context().device(); - const auto physicalDevice = context().physicalDevice(); - - VkImageCreateInfo imageInfo{}; - imageInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; - imageInfo.imageType = kImageType; - imageInfo.extent.width = imageSize_[0]; - imageInfo.extent.height = imageSize_[1]; - imageInfo.extent.depth = imageSize_[2]; - - imageInfo.mipLevels = 1; - imageInfo.arrayLayers = 1; - imageInfo.format = kFormat; - imageInfo.tiling = VK_IMAGE_TILING_OPTIMAL; - imageInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; - imageInfo.usage = VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_SAMPLED_BIT; - imageInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - imageInfo.samples = VK_SAMPLE_COUNT_1_BIT; - 
imageInfo.pNext = nullptr; - imageInfo.flags = 0; - imageLayout_ = VK_IMAGE_LAYOUT_UNDEFINED; - - VK_CHECK(vkCreateImage(device, &imageInfo, nullptr, &image_)); - - VkMemoryRequirements memReqs{}; - vkGetImageMemoryRequirements(device, image_, &memReqs); - VkMemoryAllocateInfo allocInfo{}; - allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; - allocInfo.allocationSize = memReqs.size; - allocInfo.memoryTypeIndex = findMemoryType( - physicalDevice, - memReqs.memoryTypeBits, - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); - - VK_CHECK(vkAllocateMemory(device, &allocInfo, nullptr, &imageMemory_)); - VK_CHECK(vkBindImageMemory(device, image_, imageMemory_, 0)); - - const VkImageViewCreateInfo imageViewCreateInfo = makeImageViewCreateInfo(); - VK_CHECK( - vkCreateImageView(device, &imageViewCreateInfo, nullptr, &imageView_)); - - const VkSamplerCreateInfo samplerCreateInfo = makeSamplerCreateInfo(); - VK_CHECK(vkCreateSampler(device, &samplerCreateInfo, nullptr, &sampler_)); -} - -VImage::~VImage() { - vkFreeMemory(context().device(), imageMemory_, nullptr); - vkDestroySampler(context().device(), sampler_, nullptr); - vkDestroyImageView(context().device(), imageView_, nullptr); - vkDestroyImage(context().device(), image_, nullptr); -} - -VkImageViewCreateInfo VImage::makeImageViewCreateInfo() const { - VkImageViewCreateInfo info{}; - info.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; - info.image = image_; - info.viewType = kImageViewType; - info.format = kFormat; - info.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; - info.subresourceRange.baseMipLevel = 0; - info.subresourceRange.levelCount = 1; - info.subresourceRange.baseArrayLayer = 0; - info.subresourceRange.layerCount = 1; - return info; -} - -VkSamplerCreateInfo VImage::makeSamplerCreateInfo() const { - VkSamplerCreateInfo info{}; - info.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO; - info.magFilter = kFilter; - info.minFilter = kFilter; - info.addressModeU = kSamplerAddressMode; - info.addressModeV = kSamplerAddressMode; - info.addressModeW = kSamplerAddressMode; - info.anisotropyEnable = VK_FALSE; - info.maxAnisotropy = 1.0f; - info.borderColor = VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK; - info.compareEnable = VK_FALSE; - info.mipmapMode = VK_SAMPLER_MIPMAP_MODE_LINEAR; - info.mipLodBias = 0.0f; - info.minLod = 0.0f; - info.maxLod = 0.0f; - return info; -} - -VkDescriptorImageInfo VImage::makeDescriptorImageInfo( - const VkImageLayout imageLayout) const { - VkDescriptorImageInfo info{}; - info.sampler = sampler_; - info.imageView = imageView_; - info.imageLayout = imageLayout; - return info; -} - -VkWriteDescriptorSet VImage::makeWriteDescriptorSet( - const VkDescriptorSet descriptorSet, - const uint32_t binding, - const VkDescriptorType descriptorType, - const VkDescriptorImageInfo* const imageInfo) const { - VkWriteDescriptorSet writeSet{}; - writeSet.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - writeSet.pNext = nullptr; - writeSet.dstSet = descriptorSet; - writeSet.dstBinding = binding; - writeSet.dstArrayElement = 0; - writeSet.descriptorCount = 1; - writeSet.descriptorType = descriptorType, writeSet.pImageInfo = imageInfo; - writeSet.pBufferInfo = nullptr; - writeSet.pTexelBufferView = nullptr; - return writeSet; -} - -void VImage::bind( - const VkDescriptorSet descriptorSet, - const uint32_t binding, - const VkDescriptorType descriptorType, - const VkImageLayout imageLayout) const { - const auto descrImageInfo = makeDescriptorImageInfo(imageLayout); - const auto writeDescrSet = makeWriteDescriptorSet( - 
descriptorSet, binding, descriptorType, &descrImageInfo); - vkUpdateDescriptorSets(context().device(), 1, &writeDescrSet, 0, nullptr); -} - -void VImage::bindShaderRead( - const VkDescriptorSet descriptorSet, const uint32_t binding) const { - bind( - descriptorSet, - binding, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); -} - -void VImage::bindStorageImage( - const VkDescriptorSet descriptorSet, const uint32_t binding) const { - bind( - descriptorSet, - binding, - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_IMAGE_LAYOUT_GENERAL); -} - -void VImage::addImageMemoryBarrier( - const VkCommandBuffer commandBuffer, - const VkImageLayout newLayout) const { - const VkImageLayout oldLayout = imageLayout_; - if (oldLayout == newLayout) { - return; - } - - VkImageMemoryBarrier barrier{}; - barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; - barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.image = image_; - barrier.newLayout = newLayout; - barrier.oldLayout = oldLayout; - barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; - barrier.subresourceRange.levelCount = 1; - barrier.subresourceRange.layerCount = 1; - - barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; - barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; - - VkPipelineStageFlags srcStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; - VkPipelineStageFlags dstStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; - if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED && - newLayout == VK_IMAGE_LAYOUT_GENERAL) { - barrier.srcAccessMask = 0; - barrier.dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT; - srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - } else if ( - oldLayout == VK_IMAGE_LAYOUT_UNDEFINED && - newLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) { - barrier.srcAccessMask = 0; - barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; - } else if ( - oldLayout == VK_IMAGE_LAYOUT_GENERAL && - newLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) { - barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; - barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; - } else if ( - oldLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL && - newLayout == VK_IMAGE_LAYOUT_GENERAL) { - barrier.srcAccessMask = VK_ACCESS_SHADER_READ_BIT; - barrier.dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT; - } else { - TORCH_INTERNAL_ASSERT( - false, "Vulkan: Unsupported Vulkan Image Layout transition"); - } - vkCmdPipelineBarrier( - commandBuffer, - srcStageMask, - dstStageMask, - 0, - 0, - nullptr, - 0, - nullptr, - 1, - &barrier); - imageLayout_ = newLayout; -} - -void VImage::addImageMemoryBarrierToGeneral( - const VkCommandBuffer commandBuffer) const { - addImageMemoryBarrier(commandBuffer, VK_IMAGE_LAYOUT_GENERAL); -} - -void VImage::addImageMemoryBarrierToShaderRead( - const VkCommandBuffer commandBuffer) const { - addImageMemoryBarrier( - commandBuffer, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); -} - -VkDescriptorSetLayoutBinding descriptorSetLayoutBinding( - const uint32_t binding, - const VkDescriptorType descriptorType) { - return {binding, descriptorType, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}; -} - -void createDescriptorSetLayout( - const VkDevice device, - const VkDescriptorSetLayoutBinding* const bindings, - const uint32_t bindingCount, - VkDescriptorSetLayout* const setLayout) { - VkDescriptorSetLayoutCreateInfo createInfo{}; - createInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; - createInfo.pNext = nullptr; - 
createInfo.flags = 0; - createInfo.bindingCount = bindingCount; - createInfo.pBindings = bindings; - VK_CHECK( - vkCreateDescriptorSetLayout(device, &createInfo, nullptr, setLayout)); -} - -void createDescriptorPool( - const VkDevice device, - const VkDescriptorPoolSize* poolSizes, - const uint32_t poolSizeCount, - const uint32_t maxSets, - VkDescriptorPool* const descriptorPool) { - VkDescriptorPoolCreateInfo createInfo{}; - createInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; - createInfo.pNext = nullptr; - createInfo.flags = 0; - createInfo.maxSets = maxSets; - createInfo.poolSizeCount = poolSizeCount; - createInfo.pPoolSizes = poolSizes; - VK_CHECK( - vkCreateDescriptorPool(device, &createInfo, nullptr, descriptorPool)); -} - -void allocateDescriptorSet( - const VkDevice device, - const VkDescriptorPool descriptorPool, - const VkDescriptorSetLayout* const descriptorSetLayout, - VkDescriptorSet* const descriptorSet) { - VkDescriptorSetAllocateInfo allocateInfo{}; - allocateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; - allocateInfo.pNext = nullptr; - allocateInfo.descriptorPool = descriptorPool; - allocateInfo.descriptorSetCount = 1; - allocateInfo.pSetLayouts = descriptorSetLayout; - VK_CHECK(vkAllocateDescriptorSets(device, &allocateInfo, descriptorSet)); -} - -void createDescriptorSetLayoutSinglePool( - const VkDevice device, - const std::vector& descrTypes, - VkDescriptorSetLayout* const descrSetLayout, - VkDescriptorPool* const descrPool, - VkDescriptorSet* const descrSet) { - const auto size = descrTypes.size(); - std::vector bindings; - std::vector poolSizes; - uint32_t i = 0; - for (const auto& descrType : descrTypes) { - bindings.push_back(descriptorSetLayoutBinding(i, descrType)); - poolSizes.push_back(VkDescriptorPoolSize{descrType, 1}); - i++; - } - createDescriptorSetLayout(device, bindings.data(), size, descrSetLayout); - createDescriptorPool( - device, poolSizes.data(), size, 1 /* maxSets */, descrPool); - allocateDescriptorSet(device, *descrPool, descrSetLayout, descrSet); -} - -void allocateCommandBuffer(VkDevice device, VkCommandBuffer* commandBuffer) { - VkCommandBufferAllocateInfo commandBufferAllocateInfo{}; - commandBufferAllocateInfo.sType = - VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; - commandBufferAllocateInfo.commandPool = context().commandPool(); - commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; - commandBufferAllocateInfo.commandBufferCount = 1; - - VK_CHECK(vkAllocateCommandBuffers( - device, &commandBufferAllocateInfo, commandBuffer)); -} - -void beginCommandBuffer(VkCommandBuffer commandBuffer) { - VkCommandBufferBeginInfo beginInfo{}; - beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; - beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; - VK_CHECK(vkBeginCommandBuffer(commandBuffer, &beginInfo)); -} - -void endCommandBuffer(VkCommandBuffer commandBuffer) { - VK_CHECK(vkEndCommandBuffer(commandBuffer)); -} - -void submitAndWaitCommandBuffer( - VkDevice device, - VkQueue queue, - VkCommandBuffer commandBuffer) { - VkSubmitInfo submitInfo{}; - submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; - submitInfo.commandBufferCount = 1; - submitInfo.pCommandBuffers = &commandBuffer; - - VkFence fence; - VkFenceCreateInfo fenceCreateInfo{}; - fenceCreateInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; - fenceCreateInfo.flags = 0; - VK_CHECK(vkCreateFence(device, &fenceCreateInfo, NULL, &fence)) - - VK_CHECK(vkQueueSubmit(queue, 1, &submitInfo, fence)); - vkWaitForFences(device, 1, 
&fence, VK_TRUE, ComputeUnit::kFenceTimeoutNanos); - - vkDestroyFence(device, fence, NULL); -} - -ComputeUnit::~ComputeUnit() { - vkDestroyShaderModule(context().device(), computeShaderModule_, nullptr); - vkDestroyPipelineLayout(context().device(), pipelineLayout_, nullptr); - vkDestroyPipeline(context().device(), pipeline_, nullptr); -} - -void ComputeUnit::createComputePipeline( - const uint32_t* const code, - const uint32_t codeSize, - const VkPipelineCache pipelineCache, - const VkDescriptorSetLayout descrSetLayout, - const WorkGroupSize workGroupSize) { - const auto device = context().device(); - VkShaderModuleCreateInfo createInfo{}; - createInfo.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; - createInfo.pCode = code; - createInfo.codeSize = codeSize; - - VK_CHECK(vkCreateShaderModule( - device, &createInfo, nullptr, &computeShaderModule_)); - - VkSpecializationMapEntry spMapEntries[3]; - { - uint32_t offset = 0; - size_t size = sizeof(WorkGroupSize::x); - spMapEntries[0].constantID = 0; - spMapEntries[0].offset = offset; - spMapEntries[0].size = size; - offset += size; - size = sizeof(WorkGroupSize::y); - spMapEntries[1].constantID = 1; - spMapEntries[1].offset = offset; - spMapEntries[1].size = size; - offset += size; - size = sizeof(WorkGroupSize::z); - spMapEntries[2].constantID = 2; - spMapEntries[2].offset = offset; - spMapEntries[2].size = size; - } - VkSpecializationInfo spInfo; - spInfo.mapEntryCount = 3; - spInfo.pMapEntries = spMapEntries; - spInfo.dataSize = sizeof(workGroupSize); - spInfo.pData = &workGroupSize; - - VkPipelineShaderStageCreateInfo shaderStageCreateInfo{}; - shaderStageCreateInfo.sType = - VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; - shaderStageCreateInfo.stage = VK_SHADER_STAGE_COMPUTE_BIT; - shaderStageCreateInfo.module = computeShaderModule_; - shaderStageCreateInfo.pName = "main"; - shaderStageCreateInfo.pSpecializationInfo = &spInfo; - - VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo{}; - pipelineLayoutCreateInfo.sType = - VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; - pipelineLayoutCreateInfo.setLayoutCount = 1; - pipelineLayoutCreateInfo.pSetLayouts = &descrSetLayout; - - VK_CHECK(vkCreatePipelineLayout( - device, &pipelineLayoutCreateInfo, nullptr, &pipelineLayout_)); - - VkComputePipelineCreateInfo pipelineCreateInfo{}; - pipelineCreateInfo.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO; - pipelineCreateInfo.stage = shaderStageCreateInfo; - pipelineCreateInfo.layout = pipelineLayout_; - - VK_CHECK(vkCreateComputePipelines( - device, pipelineCache, 1, &pipelineCreateInfo, nullptr, &pipeline_)); -} - -#ifdef USE_VULKAN_SHADERC_RUNTIME -void ComputeUnit::createComputePipelineCompile( - const std::string& glslSrc, - const VkPipelineCache pipelineCache, - const VkDescriptorSetLayout descrSetLayout, - const WorkGroupSize workGroupSize) { - shaderc::Compiler compiler{}; - shaderc::CompileOptions options{}; -#ifdef DEBUG - options.SetGenerateDebugInfo(); -#endif - options.SetTargetEnvironment( - shaderc_target_env_vulkan, shaderc_env_version_vulkan_1_0); - options.SetForcedVersionProfile(450, shaderc_profile_core); - const shaderc::SpvCompilationResult compilationResult = compiler.CompileGlslToSpv( - glslSrc.c_str(), - glslSrc.size(), - shaderc_compute_shader, - "vulkan_shader.comp", - "main", - options); - const auto compilationStatus = compilationResult.GetCompilationStatus(); - TORCH_INTERNAL_ASSERT( - compilationStatus == shaderc_compilation_status_success, - "Shader compilation error: status:", - compilationStatus, - 
compilationResult.GetErrorMessage()); - const std::vector shaderSpvCode( - compilationResult.cbegin(), compilationResult.cend()); - const auto codeSizeBytes = 4 * shaderSpvCode.size(); - createComputePipeline( - shaderSpvCode.data(), - codeSizeBytes, - pipelineCache, - descrSetLayout, - workGroupSize); -} -#endif - -void ComputeUnit::createCommandBuffer(VkDescriptorSet& descriptorSet) { - const auto device = context().device(); - VkCommandBufferAllocateInfo commandBufferAllocateInfo{}; - commandBufferAllocateInfo.sType = - VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; - commandBufferAllocateInfo.commandPool = context().commandPool(); - commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; - commandBufferAllocateInfo.commandBufferCount = 1; - - VK_CHECK(vkAllocateCommandBuffers( - device, &commandBufferAllocateInfo, &commandBuffer_)); - - VkCommandBufferBeginInfo beginInfo{}; - beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; - beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; - VK_CHECK(vkBeginCommandBuffer(commandBuffer_, &beginInfo)); - - vkCmdBindPipeline(commandBuffer_, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline_); - vkCmdBindDescriptorSets( - commandBuffer_, - VK_PIPELINE_BIND_POINT_COMPUTE, - pipelineLayout_, - 0, - 1, - &descriptorSet, - 0, - nullptr); -} - -void ComputeUnit::addMemoryBarrier( - const VkPipelineStageFlags srcStageMask, - const VkAccessFlags srcAccessMask, - const VkPipelineStageFlags dstStageMask, - const VkAccessFlags dstAccessMask) { - VkMemoryBarrier barrier{}; - barrier.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER; - barrier.pNext = nullptr; - barrier.srcAccessMask = srcAccessMask; - barrier.dstAccessMask = dstAccessMask; - vkCmdPipelineBarrier( - commandBuffer_, - srcStageMask, - dstStageMask, - 0, - 1, - &barrier, - 0, - nullptr, - 0, - nullptr); -} - -void ComputeUnit::dispatchCommandBuffer( - const uint32_t groupCountX, - const uint32_t groupCountY, - const uint32_t groupCountZ) { - vkCmdDispatch(commandBuffer_, groupCountX, groupCountY, groupCountZ); -} - -void ComputeUnit::endCommandBuffer() { - at::native::vulkan::detail::endCommandBuffer(commandBuffer_); -} - -void ComputeUnit::dispatchCommandBuffer( - const uint32_t gridX, - const uint32_t gridY, - const uint32_t gridZ, - const WorkGroupSize workGroupSize) { - dispatchCommandBuffer( - UP_DIV(gridX, workGroupSize.x), - UP_DIV(gridY, workGroupSize.y), - UP_DIV(gridZ, workGroupSize.z)); -} - -void ComputeUnit::submitAndWaitCommandBuffer() { - VkSubmitInfo submitInfo{}; - submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; - submitInfo.commandBufferCount = 1; - submitInfo.pCommandBuffers = &commandBuffer_; - - VkFence fence{}; - VkFenceCreateInfo fenceCreateInfo{}; - fenceCreateInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; - fenceCreateInfo.flags = 0; - VK_CHECK(vkCreateFence(context().device(), &fenceCreateInfo, NULL, &fence)) - - VK_CHECK(vkQueueSubmit(context().queue(), 1, &submitInfo, fence)); - vkWaitForFences(context().device(), 1, &fence, VK_TRUE, kFenceTimeoutNanos); - - vkDestroyFence(context().device(), fence, NULL); -} - -VBuffer makeUniformConstBuffer(const void* const ptr, const VkDeviceSize size) { - VBuffer constBuffer = VBuffer::makeUniformBuffer(size); - constBuffer.copy_from_host_to_device(ptr, size); - return constBuffer; -} - -ComputeUnitFactory::ComputeUnitFactory(const VkDevice device) - : device_(device) { - VkPipelineCacheCreateInfo createInfo{}; - createInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO; - createInfo.pNext = nullptr; - 
createInfo.flags = 0; - createInfo.initialDataSize = 0; - createInfo.pInitialData = nullptr; - VK_CHECK(vkCreatePipelineCache( - device_, &createInfo, nullptr /* allocator */, &pipelineCache_)); -} - -ComputeUnitFactory::~ComputeUnitFactory() { - vkDestroyPipelineCache(device_, pipelineCache_, nullptr /* allocator */); -} - -std::string ComputeUnitFactory::getCacheKey( - const char* const key, - const WorkGroupSize workGroupSize) { - std::stringstream ss; - ss << key << ':' << workGroupSize.x << ':' << workGroupSize.y << ':' - << workGroupSize.z; - return ss.str(); -} - -ComputeUnit& ComputeUnitFactory::get( - const std::string& cacheKey, - const std::function()> factoryFn) { - const auto it = computeUnits_.find(cacheKey); - if (it != computeUnits_.end()) { - return *(it->second.get()); - } - auto computeUnit = factoryFn(); - computeUnits_.insert(std::make_pair(cacheKey, computeUnit)); - return *(computeUnit.get()); -} - -#ifdef USE_VULKAN_SHADERC_RUNTIME -ComputeUnit& ComputeUnitFactory::get( - const char* const key, - const char* const glslSrc, - const VkDescriptorSetLayout descrSetLayout, - const WorkGroupSize workGroupSize) { - return get( - getCacheKey(key, workGroupSize), - [glslSrc, - pipelineCache = pipelineCache_, - descrSetLayout, - workGroupSize]() { - return std::make_shared( - glslSrc, pipelineCache, descrSetLayout, workGroupSize); - }); -} -#else -ComputeUnit& ComputeUnitFactory::get( - const char* const key, - const uint32_t* const code, - const uint32_t codeSize, - const VkDescriptorSetLayout descrSetLayout, - const WorkGroupSize workGroupSize) { - return get( - getCacheKey(key, workGroupSize), - [code, - codeSize, - pipelineCache = pipelineCache_, - descrSetLayout, - workGroupSize]() { - return std::make_shared( - code, codeSize, pipelineCache, descrSetLayout, workGroupSize); - }); -} -#endif - -// VBuffer <-> VImage -void copy_buffer_to_image(const VBuffer& buffer, VImage& image) { - const auto device = context().device(); - - VkDescriptorSetLayout descrSetLayout{}; - VkDescriptorSetLayoutBinding bindings[] = { - descriptorSetLayoutBinding(0, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE), - descriptorSetLayoutBinding(1, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER), - descriptorSetLayoutBinding(2, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)}; - createDescriptorSetLayout( - device, bindings, 3 /* bindingsCount */, &descrSetLayout); - - VkDescriptorPool descrPool{}; - VkDescriptorPoolSize poolSizes[] = {{VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1}, - {VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1}, - {VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 1}}; - createDescriptorPool( - device, poolSizes, 3 /* poolSizeCount */, 1 /* maxSets */, &descrPool); - - VkDescriptorSet descrSet{}; - allocateDescriptorSet(device, descrPool, &descrSetLayout, &descrSet); - - image.bindStorageImage(descrSet, 0); - buffer.bind(descrSet, 1); - WorkGroupSize workGroupSize{8, 8, 1}; - - auto& computeUnit = context().computeUnitFactory().get( - GLSL_SPV(nchw_to_image), descrSetLayout, workGroupSize); - computeUnit.createCommandBuffer(descrSet); - - image.addImageMemoryBarrierToGeneral(computeUnit.commandBuffer()); - buffer.addBufferMemoryBarrier( - computeUnit.commandBuffer(), 0, buffer.sizeBytes()); - computeUnit.addMemoryBarrier( - VK_PIPELINE_STAGE_HOST_BIT, - VK_ACCESS_HOST_WRITE_BIT, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_ACCESS_SHADER_READ_BIT); - computeUnit.dispatchCommandBuffer( - image.w(), image.h(), image.d(), workGroupSize); - computeUnit.endCommandBuffer(); - computeUnit.submitAndWaitCommandBuffer(); - - vkDestroyDescriptorPool(device, 
descrPool, nullptr); - vkDestroyDescriptorSetLayout(device, descrSetLayout, nullptr); -} - -void copy_image_to_buffer( - const VImage& image, - VBuffer& buffer, - bool addBufferMemoryBarrierForHost) { - const auto device = context().device(); - TORCH_INTERNAL_ASSERT( - buffer.sizeBytes() >= image.capacityBytes(), - "VulkanBuffer's capacity is less than VulkanImage capacity to copy from"); - - VkDescriptorSetLayout descrSetLayout{}; - const VkDescriptorSetLayoutBinding bindings[] = { - descriptorSetLayoutBinding(0, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER), - descriptorSetLayoutBinding(1, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER), - descriptorSetLayoutBinding(2, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)}; - createDescriptorSetLayout( - device, bindings, 3 /* bindingsCount */, &descrSetLayout); - - VkDescriptorPool descrPool{}; - const VkDescriptorPoolSize poolSizes[] = { - {VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 1}, - {VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1}, - {VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 1}}; - createDescriptorPool( - device, poolSizes, 3 /* poolSizeCount */, 1 /* maxSets */, &descrPool); - - VkDescriptorSet descrSet{}; - allocateDescriptorSet(device, descrPool, &descrSetLayout, &descrSet); - - image.bindShaderRead(descrSet, 0); - buffer.bind(descrSet, 1); - - const WorkGroupSize workGroupSize{8, 8, 1}; - auto& computeUnit = context().computeUnitFactory().get( - GLSL_SPV(image_to_nchw), descrSetLayout, workGroupSize); - - computeUnit.createCommandBuffer(descrSet); - image.addImageMemoryBarrierToShaderRead(computeUnit.commandBuffer()); - computeUnit.dispatchCommandBuffer( - image.w(), image.h(), image.d(), workGroupSize); - - if (addBufferMemoryBarrierForHost) { - computeUnit.addMemoryBarrier( - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_ACCESS_SHADER_WRITE_BIT, - VK_PIPELINE_STAGE_HOST_BIT, - VK_ACCESS_HOST_READ_BIT); - } - computeUnit.endCommandBuffer(); - computeUnit.submitAndWaitCommandBuffer(); - - vkDestroyDescriptorPool(device, descrPool, nullptr); - vkDestroyDescriptorSetLayout(device, descrSetLayout, nullptr); -} // VBuffer <-> VImage - -void copy_buffer_to_buffer( - const VBuffer& srcBuffer, - VBuffer& dstBuffer, - VkDeviceSize size, - VkDeviceSize srcOffset, - VkDeviceSize dstOffset) { - auto device = context().device(); - VkCommandBuffer commandBuffer{}; - allocateCommandBuffer(device, &commandBuffer); - beginCommandBuffer(commandBuffer); - - VkBufferCopy copyRegion{}; - copyRegion.srcOffset = srcOffset; - copyRegion.dstOffset = dstOffset; - copyRegion.size = size; - vkCmdCopyBuffer( - commandBuffer, - srcBuffer.vkbuffer(), - dstBuffer.vkbuffer(), - 1, - ©Region); - - endCommandBuffer(commandBuffer); - submitAndWaitCommandBuffer(device, context().queue(), commandBuffer); -} - -// VulkanTensor - -class VulkanTensor::Impl final { - public: - explicit Impl(std::vector sizes) - : sizes_(std::move(sizes)), - strides_(std::vector(sizes_.size())), - numel_(c10::multiply_integers(sizes_)) { - TORCH_CHECK( - initVulkanContextOnce(), "Vulkan Failed to create Vulkan Context"); - } - - std::vector sizes() const { - return sizes_; - } - - std::vector strides() const { - return strides_; - } - - inline int64_t dim() const { - return sizes_.size(); - } - - inline int64_t numel() const { - return numel_; - } - - inline bool has_buffer() const { - return static_cast(buffer_); - } - - inline VBuffer* buffer() { - if (!has_buffer()) { - buffer_ = std::make_unique(buffer_size_for_sizes(sizes_)); - } - return buffer_.get(); - } - - const VBuffer* buffer() const { - if (!has_buffer()) { - buffer_ = 
std::make_unique(buffer_size_for_sizes(sizes_)); - } - return buffer_.get(); - } - - inline bool can_be_image() const { - return dim() <= 4; - } - - inline bool has_image() const { - return static_cast(image_); - } - - inline bool has_storage() { - return has_buffer(); - } - - ImageSizes imageSizes_W_H_NC4() { - TORCH_INTERNAL_ASSERT( - can_be_image(), - "Vulkan: Only Tensors with dim <= 4 can be represented as Vulkam Image"); - auto d = dim(); - int64_t _wd = 1; - int64_t _hd = 1; - int64_t _dd = 1; - if (d == 4) { - _wd = sizes_[3]; - _hd = sizes_[2]; - _dd = sizes_[1] * sizes_[0]; - } else if (d == 3) { - _wd = sizes_[2]; - _hd = sizes_[1]; - _dd = sizes_[0]; - } else if (d == 2) { - _wd = sizes_[1]; - _hd = sizes_[0]; - } else if (d == 1) { - _wd = sizes_[0]; - } - int32_t wd = safe_downcast(_wd); - int32_t hd = safe_downcast(_hd); - int32_t dd = safe_downcast(_dd); - return {{wd, hd, UP_DIV(dd, 4)}, {wd, hd, dd}}; - } - - VImage* image(const c10::optional imageSizes = c10::nullopt) { - if (image_) { - return image_.get(); - } - - if (imageSizes.has_value()) { - image_ = std::make_unique(*imageSizes); - return image_.get(); - } - - image_ = std::make_unique(imageSizes_W_H_NC4()); - if (buffer_) { - copy_buffer_to_image(*buffer_, *image_); - } - return image_.get(); - } - - const VImage* image( - c10::optional imageSizes = c10::nullopt) const { - return const_cast(this)->image(imageSizes); - } - - VkDeviceSize buffer_size_for_sizes(std::vector sizes) const { - const auto d = sizes.size(); - const auto numel = c10::multiply_integers(sizes); - VkDeviceSize bufferSize{sizeof(float) * numel}; - // alignment to be able to copy between image and buffer - if (d == 4) { - bufferSize = - sizeof(float) * ALIGN_UP4(sizes[0] * sizes[1]) * sizes[2] * sizes[3]; - } else if (d == 3) { - bufferSize = sizeof(float) * ALIGN_UP4(sizes[0]) * sizes[1] * sizes[2]; - } else if (d == 2) { - bufferSize = sizeof(float) * 4 * sizes[0] * sizes[1]; - } else if (d == 1) { - bufferSize = sizeof(float) * 4 * sizes[0]; - } - return bufferSize; - } - - void allocate_storage() { - buffer_ = std::make_unique(buffer_size_for_sizes(sizes_)); - } - - void set_data_from_host(const float* const inputData) { - buffer()->copy_from_host_to_device( - (const void*)inputData, sizeof(float) * numel_); - } - - void copy_data_to_host(float* const outputData) const { - sync_image_to_buffer(); - buffer()->copy_from_device_to_host(outputData, sizeof(float) * numel_); - } - - void sync_image_to_buffer() const { - if (has_image()) { - copy_image_to_buffer( - *image(), - *(const_cast(buffer())), - true /* memory barrier for host memory map */); - } - } - - private: - std::vector sizes_; - std::vector strides_; - int64_t numel_; - mutable std::unique_ptr buffer_; - std::unique_ptr image_; -}; - -std::shared_ptr VulkanTensor::impl() { - return impl_; -} - -std::shared_ptr VulkanTensor::impl() const { - return impl_; -} - -std::vector VulkanTensor::sizes() const { - return impl()->sizes(); -} - -void VulkanTensor::sync_image_to_buffer() const { - return impl()->sync_image_to_buffer(); -} - -std::vector VulkanTensor::strides() const { - return impl()->strides(); -} - -int64_t VulkanTensor::dim() const { - return impl()->dim(); -} - -int64_t VulkanTensor::numel() const { - return impl()->numel(); -} - -bool VulkanTensor::has_storage() const { - return impl()->has_buffer(); -} - -void VulkanTensor::allocate_storage() { - impl()->allocate_storage(); -} - -void VulkanTensor::set_data_from_host(const float* const inputData) { - 
impl()->set_data_from_host(inputData); -} - -void VulkanTensor::copy_data_to_host(float* const outputData) const { - impl()->copy_data_to_host(outputData); -} - -bool VulkanTensor::has_buffer() const { - return impl()->has_buffer(); -} - -VBuffer* VulkanTensor::buffer() { - return impl()->buffer(); -} - -const VBuffer* VulkanTensor::buffer() const { - return impl()->buffer(); -} - -bool VulkanTensor::can_be_image() const { - return impl()->can_be_image(); -} - -bool VulkanTensor::has_image() const { - return impl()->has_image(); -} - -VImage* VulkanTensor::image(const c10::optional imageSizes) { - return impl()->image(imageSizes); -} - -const VImage* VulkanTensor::image(const c10::optional imageSizes) const { - return impl()->image(imageSizes); -} - -VulkanTensor::VulkanTensor(std::vector sizes) - : impl_(std::make_shared(std::move(sizes))) {} - -std::ostream& operator<<(std::ostream& s, const ImageSize& imageSize) { - s << "ImageSize{" << imageSize[0] << ", " << imageSize[1] << ", " - << imageSize[2] << "}"; - return s; -} -std::ostream& operator<<(std::ostream& s, const ImageSizes& imageSizes) { - s << "ImageSizes{imageSize:" << imageSizes.imageSize - << ", dataSize:" << imageSizes.dataSize << "}"; - return s; -} - -std::ostream& operator<<(std::ostream& s, const WorkGroupSize& workGroupSize) { - s << "WorkGroupSize{" << workGroupSize.x << " " << workGroupSize.y << " " - << workGroupSize.z << "}"; - return s; -} - -} // namespace detail -} // namespace vulkan -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/vulkan/Vulkan.h b/aten/src/ATen/native/vulkan/Vulkan.h deleted file mode 100644 index c2b1775e8f0a..000000000000 --- a/aten/src/ATen/native/vulkan/Vulkan.h +++ /dev/null @@ -1,532 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include - -#ifdef USE_VULKAN_WRAPPER -#include -#else -#include -#endif - -#ifdef USE_VULKAN_SHADERC_RUNTIME -#include -#define GLSL_SPV(name) #name, name##_glsl -#else -#include -#define GLSL_SPV(name) #name, name##_spv, name##_spv_len -#endif -#include - -namespace at { -namespace native { -namespace vulkan { -namespace detail { - -#ifdef DEBUG -static constexpr bool kEnableValidationLayers = true; -#else -static constexpr bool kEnableValidationLayers = false; -#endif - -bool is_available(); - -class VContext; -const VContext& context(); - -// VulkanTensor is a handle that holds shared pointer to VulkanTensor:Impl, -// that owns Tensor representation on GPU. -// VulkanTensor is copyable and moveable (copying and moving pointer to Impl). -// -// VulkanTensor::Impl is moveable only, owns Vulkan device memory for Tensor -// data. Tensor can be represented in several formats. -// -// 0. VBuffer - (wrapper on vulkan VkBuffer), supports all tensor dimensions, -// data is in Contiguous format (NCHW), in plan to preserve at::Tensor memory -// format (3d or 4d tensors can be in NHWC ChannelsLast format). It is located -// in host visible memory that can be memory mapped to CPU memory. -// -// 1. VImage(TexC4) - (wrapper on vulkan VkImage), optional representation of -// tensors with dimension <= 4 as VkImage, used in shaders as texture or storage -// image. It is 3-dimensional image (x, y, z) with 4 component * 16 bit for each -// triple (x, y, z). 
-// For NCHW, NHWC: -// -// For dim==4: image.x - W sizes[3]; image.y - H sizes[2]; image.z - (N -// sizes[0] * C sizes[1]) / 4; -// -// For dim==3: image.x - W sizes[2]; image.y - H sizes[1]; image.z - (C -// sizes[0]) / 4 -// -// For dim==2: image.x - W sizes[1]; image.y - H sizes[0]; image.z : 1 -// -// For dim==1: image.x - W sizes[0]; image.y : 1; image.z : 1 -// -// -// 2. VImage (other format) - Currently not added, but for some operations -// another texture packing format can be beneficial for performance. -// -// Contract about synchronization between representations: -// 1.VImage(TexC4) representation is allocated lazily with calling image(), -// fails for dimensions > 4. -// -// Tensor data can be in 0.VBuffer and/or 1.VImage(TexC4), -// If Tensor can be represented as image - VulkanTensor::Impl::can_be_image() -// returns true. Image representation created lazily by call -// VulkanTensor::Impl::image(), if it is called on Tensor with !can_be_image() - -// it fails. -// -// If image allocated - image data has priority. -// VulkanTensor::copy_data_to_host checks if image allocated - -// copy_image_to_buffer first. -class VBuffer; -class VImage; - -using ImageSize = std::array; -struct ImageSizes { - ImageSize imageSize; - ImageSize dataSize; -}; - -class VulkanTensor final { - class Impl; - - public: - VulkanTensor() = default; - explicit VulkanTensor(std::vector sizes); - ~VulkanTensor() = default; - - VulkanTensor(VulkanTensor&&) = default; - VulkanTensor& operator=(VulkanTensor&&) = default; - - VulkanTensor(const VulkanTensor&) = default; - VulkanTensor& operator=(const VulkanTensor&) = default; - - bool defined() const { - return static_cast(impl_); - } - - std::vector sizes() const; - std::vector strides() const; - int64_t dim() const; - int64_t numel() const; - - bool has_storage() const; - void allocate_storage(); - void set_data_from_host(const float* inputData); - void copy_data_to_host(float* outputData) const; - - bool has_buffer() const; - VBuffer* buffer(); - const VBuffer* buffer() const; - - bool can_be_image() const; - bool has_image() const; - - void sync_image_to_buffer() const; - - // if imageSizes argument is not specified: - // Allocates VImage of sizes{W,H,NC4} and fills it from tensor VBuffer if it - // exists, see comment for VulkanTensor. - // - // if imageSizes argument is specified: - // Only allocates VImage of specified sizes, that will be returned on - // subsequent image() calls. Can be used when user wants to store tensor image - // not in default{W, H, NC4} format (For performance or other reasons). 
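  // [Editorial sketch; not part of the original file] Minimal illustration of
  // the default {W, H, NC4} mapping described above for a 4-d NCHW tensor,
  // assuming UP_DIV from VulkanCommon.h is in scope. The helper name is
  // invented for this example; the original computes the same values in
  // Impl::imageSizes_W_H_NC4() (the safe_downcast range checks are omitted here).
  // For sizes {N=2, C=3, H=8, W=8}: imageSize = {8, 8, UP_DIV(6, 4)} = {8, 8, 2},
  // dataSize = {8, 8, 6}.
  static ImageSizes example_image_sizes_nchw(const std::vector<int64_t>& sizes) {
    const auto w = static_cast<int32_t>(sizes[3]);
    const auto h = static_cast<int32_t>(sizes[2]);
    const auto d = static_cast<int32_t>(sizes[0] * sizes[1]);
    return {{w, h, UP_DIV(d, 4)}, {w, h, d}};
  }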
- VImage* image(c10::optional imageSizes = c10::nullopt); - const VImage* image( - c10::optional imageSizes = c10::nullopt) const; - - private: - std::shared_ptr impl(); - std::shared_ptr impl() const; - std::shared_ptr impl_; -}; - -class ComputeUnitFactory; -class VContext final { - public: - explicit VContext(bool enableValidationLayers); - ~VContext(); - VContext(const VContext&) = delete; - VContext& operator=(const VContext&) = delete; - VContext(VContext&&) = default; - VContext& operator=(VContext&&) = default; - - inline VkDevice device() const { - return device_; - } - inline VkPhysicalDevice physicalDevice() const { - return physicalDevice_; - } - inline VkPhysicalDeviceLimits limits() const { - return physicalDeviceLimits_; - } - inline VkCommandPool commandPool() const { - return commandPool_; - } - inline VkQueue queue() const { - return queue_; - } - ComputeUnitFactory& computeUnitFactory() const { - return *(computeUnitFactory_.get()); - } - - private: - void createInstance(); - void findPhysicalDevice(); - void createDevice(); - uint32_t getComputeQueueFamilyIndex(); - - VkInstance instance_; - VkDebugReportCallbackEXT debugReportCallback_; - VkDevice device_; - VkPhysicalDevice physicalDevice_; - VkPhysicalDeviceLimits physicalDeviceLimits_; - std::vector enabledValidationLayers_; - VkQueue queue_; - uint32_t queueFamilyIndex_; - bool enableValidationLayers_; - VkCommandPool commandPool_; - std::unique_ptr computeUnitFactory_; -}; - -class VBuffer final { - public: - class MapMemory final { - public: - MapMemory( - const VkDevice device, - const VkDeviceMemory deviceMemory, - const VkDeviceSize offset, - const VkDeviceSize size) - : device_(device), - deviceMemory_(deviceMemory), - offset_(offset), - size_(size) { - vkMapMemory(device_, deviceMemory_, 0, size, 0, &mappedMemory_); - } - ~MapMemory() { - vkUnmapMemory(device_, deviceMemory_); - } - MapMemory(const MapMemory&) = delete; - MapMemory& operator=(const MapMemory&) = delete; - MapMemory(MapMemory&&) = default; - MapMemory& operator=(MapMemory&&) = default; - inline const void* ptr() const { - return mappedMemory_; - } - inline void* ptr() { - return mappedMemory_; - } - void flushWriteToHost(); - void flushWriteToDevice(); - - private: - VkDevice device_; - VkDeviceMemory deviceMemory_; - VkDeviceSize offset_; - VkDeviceSize size_; - void* mappedMemory_; - }; - - explicit VBuffer( - VkDeviceSize bufferSizeBytes, - VkBufferUsageFlags bufferUsageFlags = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | - VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, - VkDescriptorType descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); - - ~VBuffer(); - - VBuffer(const VBuffer&) = delete; - VBuffer& operator=(const VBuffer&) = delete; - VBuffer(VBuffer&&) = default; - VBuffer& operator=(VBuffer&&) = default; - - static inline VBuffer makeUniformBuffer(const VkDeviceSize bufferSize) { - return VBuffer{bufferSize, - VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER}; - } - - MapMemory map() const { - return MapMemory{context().device(), bufferMemory_, 0, bufferSizeBytes_}; - } - - void copy_from_device_to_host(void* outputData, int64_t size) const; - void copy_from_host_to_device(const void* data, int64_t size); - void set_zeros(); - - VkDescriptorBufferInfo makeDescriptorBufferInfo() const; - VkWriteDescriptorSet makeWriteDescriptorSet( - VkDescriptorSet descriptorSet, - uint32_t binding, - const VkDescriptorBufferInfo* bufferInfo) const; - - void bind(VkDescriptorSet descriptorSet, uint32_t binding) 
const; - - inline VkDeviceSize sizeBytes() const { - return bufferSizeBytes_; - } - - void addBufferMemoryBarrier( - VkCommandBuffer commandBuffer, - VkDeviceSize offset, - VkDeviceSize size) const; - - inline VkBuffer vkbuffer() const { - return buffer_; - } - - private: - VkDeviceSize bufferSizeBytes_; - VkDescriptorType descriptorType_; - VkBuffer buffer_; - VkDeviceMemory bufferMemory_; -}; - -VBuffer makeUniformConstBuffer(const void* ptr, VkDeviceSize size); - -class VImage final { - public: - static constexpr VkImageType kImageType = VK_IMAGE_TYPE_3D; - static constexpr VkFilter kFilter = VK_FILTER_NEAREST; - static constexpr VkFormat kFormat = VK_FORMAT_R16G16B16A16_SFLOAT; - static constexpr VkSamplerAddressMode kSamplerAddressMode = - VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER; - static constexpr VkImageViewType kImageViewType = VK_IMAGE_VIEW_TYPE_3D; - - explicit VImage(ImageSize imageSize, ImageSize dataSize); - explicit VImage(ImageSizes imageSizes) - : VImage(imageSizes.imageSize, imageSizes.dataSize) {} - ~VImage(); - VImage(const VImage&) = delete; - VImage& operator=(const VImage&) = delete; - VImage(VImage&&) = default; - VImage& operator=(VImage&&) = default; - - inline auto w() const { - return imageSize_[0]; - } - inline auto h() const { - return imageSize_[1]; - } - inline auto d() const { - return imageSize_[2]; - } - - VkImageViewCreateInfo makeImageViewCreateInfo() const; - VkSamplerCreateInfo makeSamplerCreateInfo() const; - VkDescriptorImageInfo makeDescriptorImageInfo( - VkImageLayout imageLayout) const; - VkWriteDescriptorSet makeWriteDescriptorSet( - VkDescriptorSet descriptorSet, - uint32_t binding, - VkDescriptorType descriptorType, - const VkDescriptorImageInfo* imageInfo) const; - void bind( - VkDescriptorSet descriptorSet, - uint32_t binding, - VkDescriptorType descriptorType, - VkImageLayout imageLayout) const; - void bindShaderRead(VkDescriptorSet descriptorSet, uint32_t binding) const; - void bindStorageImage(VkDescriptorSet descriptorSet, uint32_t binding) const; - inline VkDeviceSize sizeBytes() const { - return sizeof(float) * dataSize_[0] * dataSize_[1] * dataSize_[2]; - } - - inline VkDeviceSize capacityBytes() const { - // Every VImage pixel(texel) contains 4 float elements - return sizeof(float) * 4 * imageSize_[0] * imageSize_[1] * imageSize_[2]; - } - - ImageSize sizes() const { - return imageSize_; - } - - void addImageMemoryBarrier( - VkCommandBuffer commandBuffer, - VkImageLayout newLayout) const; - void addImageMemoryBarrierToGeneral(VkCommandBuffer commandBuffer) const; - void addImageMemoryBarrierToShaderRead(VkCommandBuffer commandBuffer) const; - - private: - ImageSize imageSize_; - ImageSize dataSize_; - VkImage image_; - VkDeviceMemory imageMemory_; - VkImageView imageView_; - VkSampler sampler_; - // Holds current image layout that will be used in - // addImageMemoryBarrier as the previous layout. Need to be mutable to - // use addImageMemoryBarrier() for const VImage. 
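  // [Editorial note; not part of the original file] Worked example of the two
  // size queries above, assuming imageSize_ = {8, 8, 2} and dataSize_ = {8, 8, 6}
  // (the mapping of a {2, 3, 8, 8} NCHW tensor):
  //   sizeBytes()     = sizeof(float) * 8 * 8 * 6     = 1536 bytes (logical data)
  //   capacityBytes() = sizeof(float) * 4 * 8 * 8 * 2 = 2048 bytes (4 floats/texel)
  // capacityBytes() is never smaller than sizeBytes(), which is why
  // copy_image_to_buffer() asserts that the destination VBuffer holds at least
  // capacityBytes() bytes.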
- mutable VkImageLayout imageLayout_; -}; - -void copy_buffer_to_image(const VBuffer& buffer, VImage& image); - -void copy_image_to_buffer( - const VImage& image, - VBuffer& buffer, - bool addBufferMemoryBarrierForHost = false); - -void copy_buffer_to_buffer( - const VBuffer& srcBuffer, - VBuffer& dstBuffer, - VkDeviceSize size, - VkDeviceSize srcOffset = 0, - VkDeviceSize dstOffset = 0); - -VkDescriptorSetLayoutBinding descriptorSetLayoutBinding( - uint32_t binding, - VkDescriptorType descriptorType); - -void createDescriptorSetLayout( - VkDevice device, - const VkDescriptorSetLayoutBinding* bindings, - uint32_t bindingCount, - VkDescriptorSetLayout* setLayout); - -void allocateDescriptorSet( - VkDevice device, - VkDescriptorPool descriptorPool, - const VkDescriptorSetLayout* descriptorSetLayout, - VkDescriptorSet* descriptorSet); - -void createDescriptorSetLayoutSinglePool( - VkDevice device, - const std::vector& descrTypes, - VkDescriptorSetLayout* descrSetLayout, - VkDescriptorPool* descrPool, - VkDescriptorSet* descrSet); - -void allocateCommandBuffer(VkDevice device, VkCommandBuffer* commandBuffer); -void beginCommandBuffer(VkCommandBuffer commandBuffer); -void endCommandBuffer(VkCommandBuffer commandBuffer); -void submitAndWaitCommandBuffer(VkDevice device, VkCommandBuffer commandBuffer); - -struct WorkGroupSize { - uint32_t x; - uint32_t y; - uint32_t z; -}; - -class ComputeUnit final { - public: - static constexpr uint64_t kFenceTimeoutNanos = 100000000000; -#ifdef USE_VULKAN_SHADERC_RUNTIME - ComputeUnit( - const char* const glslSrc, - const VkPipelineCache pipelineCache, - const VkDescriptorSetLayout descrSetLayout, - const WorkGroupSize workGroupSize) { - createComputePipelineCompile( - glslSrc, pipelineCache, descrSetLayout, workGroupSize); - } -#else - ComputeUnit( - const uint32_t* const spvCode, - const unsigned int spvCodeSize, - const VkPipelineCache pipelineCache, - const VkDescriptorSetLayout& descrSetLayout, - const WorkGroupSize workGroupSize) { - const auto codeSize = spvCodeSize; - createComputePipeline( - spvCode, codeSize, pipelineCache, descrSetLayout, workGroupSize); - } -#endif - - ~ComputeUnit(); - ComputeUnit(const ComputeUnit&) = delete; - ComputeUnit& operator=(const ComputeUnit&) = delete; - ComputeUnit(ComputeUnit&&) = default; - ComputeUnit& operator=(ComputeUnit&&) = default; - - void createComputePipeline( - const uint32_t* code, - const uint32_t codeSize, - VkPipelineCache pipelineCache, - VkDescriptorSetLayout descrSetLayout, - WorkGroupSize workGroupSize); - -#ifdef USE_VULKAN_SHADERC_RUNTIME - void createComputePipelineCompile( - const std::string& glslSrc, - const VkPipelineCache pipelineCache, - const VkDescriptorSetLayout descrSetLayout, - const WorkGroupSize workGroupSize); -#endif - - void createCommandBuffer(VkDescriptorSet& descriptorSet); - void addMemoryBarrier( - VkPipelineStageFlags srcStageMask, - VkAccessFlags srcAccessMask, - VkPipelineStageFlags dstStageMask, - VkAccessFlags dstAccessMask); - void dispatchCommandBuffer( - uint32_t groupCountX, - uint32_t groupCountY, - uint32_t groupCountZ); - void dispatchCommandBuffer( - uint32_t gridX, - uint32_t gridY, - uint32_t gridZ, - WorkGroupSize workGroupSize); - void submitAndWaitCommandBuffer(); - void endCommandBuffer(); - inline VkCommandBuffer commandBuffer() { - return commandBuffer_; - } - - private: - VkCommandBuffer commandBuffer_; - VkPipeline pipeline_; - VkPipelineLayout pipelineLayout_; - VkShaderModule computeShaderModule_; -}; - -class ComputeUnitFactory { - public: - 
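  // [Editorial note; not part of the original file] The factory caches one
  // ComputeUnit per (shader name, work-group size) pair; getCacheKey() below
  // joins them with ':'. For example, a request for GLSL_SPV(nchw_to_image)
  // with a WorkGroupSize of {8, 8, 1} is stored under the key
  // "nchw_to_image:8:8:1", so subsequent ops reuse the already-created
  // pipeline instead of building it again.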
explicit ComputeUnitFactory(const VkDevice device); - ~ComputeUnitFactory(); - ComputeUnitFactory(const ComputeUnitFactory&) = default; - ComputeUnitFactory& operator=(const ComputeUnitFactory&) = default; - ComputeUnitFactory(ComputeUnitFactory&&) = default; - ComputeUnitFactory& operator=(ComputeUnitFactory&&) = default; - -#ifdef USE_VULKAN_SHADERC_RUNTIME - ComputeUnit& get( - const char* key, - const char* glslSrc, - VkDescriptorSetLayout descrSetLayout, - WorkGroupSize workGroupSize); -#else - ComputeUnit& get( - const char* key, - const uint32_t* code, - const uint32_t codeSize, - VkDescriptorSetLayout descrSetLayout, - WorkGroupSize workGroupSize); -#endif - private: - std::string getCacheKey(const char* key, WorkGroupSize workGroupSize); - ComputeUnit& get( - const std::string& cacheKey, - std::function()> factoryFn); - - VkDevice device_; - VkPipelineCache pipelineCache_; - std::unordered_map> computeUnits_; -}; - -std::ostream& operator<<(std::ostream& s, const WorkGroupSize& workGroupSize); -std::ostream& operator<<(std::ostream& s, const ImageSize& imageSize); -std::ostream& operator<<(std::ostream& s, const ImageSizes& imageSizes); - -} // namespace detail -} // namespace vulkan -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/vulkan/VulkanAten.cpp b/aten/src/ATen/native/vulkan/VulkanAten.cpp deleted file mode 100644 index 768ce081b353..000000000000 --- a/aten/src/ATen/native/vulkan/VulkanAten.cpp +++ /dev/null @@ -1,711 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace at { -namespace native { -namespace vulkan { -namespace aten { -using at::native::vulkan::detail::VulkanTensor; -using VulkanTensorImpl = VulkanOpaqueTensorImpl; - -namespace { -int64_t normalize_dim(int64_t d, int64_t n) { - return (d % n + n) % n; -} -} // namespace - -Tensor new_with_vtensor_vulkan( - VulkanTensor&& vt, - const TensorOptions& options) { - auto sizes = vt.sizes(); - auto strides = vt.strides(); - return at::detail::make_tensor( - DispatchKeySet(DispatchKey::Vulkan), - options.dtype(), - at::Device(at::kVulkan), - std::move(vt), - std::vector(sizes.begin(), sizes.end()), - std::vector(strides.begin(), strides.end())); -} - -VulkanTensor& vtensor_from_vulkan(const Tensor& tensor) { - TORCH_INTERNAL_ASSERT( - tensor.is_vulkan(), "vtensor_from_vulkan expects Vulkan tensor input"); - VulkanTensorImpl* const impl = - static_cast(tensor.unsafeGetTensorImpl()); - return impl->unsafe_opaque_handle(); -} - -Tensor empty( - IntArrayRef size, - optional dtype, - optional layout, - optional device, - optional pin_memory, - const optional memory_format) { - TORCH_CHECK( - !pin_memory.has_value(), - "'pin_memory' argument is incompatible with Vulkan tensor"); - TORCH_CHECK( - !memory_format.has_value(), - "'memory_format' argument is incompatible with Vulkan tensor"); - VulkanTensor vt{size.vec()}; - return new_with_vtensor_vulkan( - std::move(vt), at::device(at::kVulkan).dtype(dtype)); -} - -Tensor empty_strided( - IntArrayRef size, - IntArrayRef stride, - optional dtype, - optional layout, - optional device, - optional pin_memory) { - return vulkan::aten::empty( - size, dtype, layout, device, pin_memory, c10::nullopt); -} - -Tensor upsample_nearest2d( - const Tensor& input, - const IntArrayRef outputSizes, - const c10::optional scales_h, - const c10::optional scales_w) { - const auto& x = vtensor_from_vulkan(input); - const auto inputSizes = input.sizes(); - const auto in = inputSizes[0]; 
- const auto ic = inputSizes[1]; - const auto ih = inputSizes[2]; - const auto iw = inputSizes[3]; - - const auto oh = outputSizes[0]; - const auto ow = outputSizes[1]; - const float height_scale = compute_scales_value(scales_h, ih, oh); - const float width_scale = compute_scales_value(scales_w, iw, ow); - VulkanTensor output{{in, ic, oh, ow}}; - vulkan::detail::upsample_nearest2d( - output, x, ih, iw, oh, ow, in, ic, height_scale, width_scale); - return new_with_vtensor_vulkan(std::move(output), input.options()); -} - -Tensor adaptive_avg_pool2d(const at::Tensor& input, IntArrayRef outputSize) { - TORCH_INTERNAL_ASSERT( - input.dim() == 4, - "vulkan_adaptive_avg_pool2d expects 4-dimensional input"); - const auto& x = vtensor_from_vulkan(input); - const auto inputSize = input.sizes(); - const auto in = inputSize[0]; - const auto ic = inputSize[1]; - const auto ih = inputSize[2]; - const auto iw = inputSize[3]; - - const auto oh = outputSize[0]; - const auto ow = outputSize[1]; - VulkanTensor output{{in, ic, oh, ow}}; - vulkan::detail::adaptive_avg_pool2d(output, x, ih, iw, oh, ow, in, ic); - return new_with_vtensor_vulkan(std::move(output), input.options()); -} - -Tensor avg_pool2d( - const Tensor& self, - IntArrayRef kernel_size, - IntArrayRef stride, - IntArrayRef padding, - bool ceil_mode, - bool count_include_pad, - c10::optional divisor_override) { - TORCH_CHECK( - kernel_size.size() == 1 || kernel_size.size() == 2, - "avg_pool2d: kernel_size must either be a single int, or a tuple of two ints"); - const int kH = safe_downcast(kernel_size[0]); - const int kW = - kernel_size.size() == 1 ? kH : safe_downcast(kernel_size[1]); - - TORCH_CHECK( - stride.empty() || stride.size() == 1 || stride.size() == 2, - "avg_pool2d: stride must either be omitted, a single int, or a tuple of two ints"); - const int dH = stride.empty() ? kH : safe_downcast(stride[0]); - const int dW = stride.empty() - ? kW - : stride.size() == 1 ? dH : safe_downcast(stride[1]); - - TORCH_CHECK( - padding.size() == 1 || padding.size() == 2, - "avg_pool2d: padding must either be a single int, or a tuple of two ints"); - const int padH = safe_downcast(padding[0]); - const int padW = padding.size() == 1 ? padH : safe_downcast(padding[1]); - - const auto& x = vtensor_from_vulkan(self); - auto inputSize = self.sizes(); - const int64_t iN = inputSize[0]; - const int64_t iC = inputSize[1]; - const int64_t iH = inputSize[2]; - const int64_t iW = inputSize[3]; - - const int64_t oH = - pooling_output_shape(iH, kH, padH, dH, 1, ceil_mode); - const int64_t oW = - pooling_output_shape(iW, kW, padW, dW, 1, ceil_mode); - - pool2d_shape_check( - self, kH, kW, dH, dW, padH, padW, 1, 1, iC, iH, iW, oH, oW, self.suggest_memory_format()); - - VulkanTensor y{{iN, iC, oH, oW}}; - vulkan::detail::avg_pool2d( - y, x, iH, iW, oH, oW, iN, iC, kH, kW, dH, dW, padH, padW); - return new_with_vtensor_vulkan(std::move(y), self.options()); -} - -Tensor max_pool2d( - const at::Tensor& self, - const IntArrayRef kernel_size, - const IntArrayRef stride, - const IntArrayRef padding, - const IntArrayRef dilation, - bool ceil_mode) { - TORCH_CHECK( - kernel_size.size() == 1 || kernel_size.size() == 2, - "Vulkan max_pool2d: kernel_size must either be a single int, or a tuple of two ints") - const int kH = safe_downcast(kernel_size[0]); - const int kW = - kernel_size.size() == 1 ? 
kH : safe_downcast(kernel_size[1]); - TORCH_CHECK( - stride.size() == 0 || stride.size() == 1 || stride.size() == 2, - "Vulkan max_pool2d: stride must either be omitted, a single int, or a tuple of two ints") - const int dH = stride.empty() ? kH : safe_downcast(stride[0]); - const int dW = stride.empty() - ? kW - : stride.size() == 1 ? dH : safe_downcast(stride[1]); - - TORCH_CHECK( - padding.size() == 1 || padding.size() == 2, - "Vulkan max_pool2d: padding must be either be a single int, or a tuple of two ints"); - const int padH = safe_downcast(padding[0]); - const int padW = padding.size() == 1 ? padH : safe_downcast(padding[1]); - - TORCH_CHECK( - dilation.size() == 1 || dilation.size() == 2, - "Vulkan max_pool2d: dilation must be either a single int, or a tuple of two ints"); - const int dilationH = safe_downcast(dilation[0]); - const int dilationW = - dilation.size() == 1 ? dilationH : safe_downcast(dilation[1]); - TORCH_CHECK( - self.dim() == 4, "Vulkan max_pool2d is implemented for 4-dim input"); - - const auto& x = vtensor_from_vulkan(self); - const auto inputSize = self.sizes(); - const int64_t iN = inputSize[0]; - const int64_t iC = inputSize[1]; - const int64_t iH = inputSize[2]; - const int64_t iW = inputSize[3]; - - const int64_t oH = - pooling_output_shape(iH, kH, padH, dH, dilationH, ceil_mode); - const int64_t oW = - pooling_output_shape(iW, kW, padW, dW, dilationW, ceil_mode); - - pool2d_shape_check( - self, - kH, - kW, - dH, - dW, - padH, - padW, - dilationH, - dilationW, - iC, - iH, - iW, - oH, - oW, - self.suggest_memory_format()); - - VulkanTensor y{{iN, iC, oH, oW}}; - vulkan::detail::max_pool2d( - y, - x, - iH, - iW, - oH, - oW, - iN, - iC, - kH, - kW, - dH, - dW, - padH, - padW, - dilationH, - dilationW); - return new_with_vtensor_vulkan(std::move(y), self.options()); -} - -Tensor reshape(at::Tensor const& input, IntArrayRef shape) { - return new_with_vtensor_vulkan( - vulkan::detail::reshape_copy(vtensor_from_vulkan(input), shape.vec()), - input.options()); -} - -Tensor cat(const TensorList tensors, int64_t dim) { - const auto norm_dim = normalize_dim(dim, 4); - TORCH_INTERNAL_ASSERT( - norm_dim == 0 || norm_dim == 1, - "Vulkan cat is implemented only for batch and channels dimensions"); - at::Tensor tensor = tensors[0]; - int64_t cat_dim_size = 0; - - std::vector vTensors{}; - for (const auto i : c10::irange(tensors.size())) { - const auto& t = tensors[i]; - TORCH_INTERNAL_ASSERT( - t.dim() == 4, "Vulkan cat expects 4 dimensional inputs"); - TORCH_INTERNAL_ASSERT(t.is_vulkan(), "Vulkan cat expects Vulkan inputs"); - - for (const auto d : c10::irange(4)) { - if (d == dim) { - continue; - } - TORCH_INTERNAL_ASSERT( - t.size(d) == tensor.size(d), - "Vulkan cat inputs must have matching sizes except concatenated dimension"); - } - vTensors.push_back(vtensor_from_vulkan(t)); - cat_dim_size += t.size(dim); - } - - auto result_size = tensor.sizes().vec(); - result_size[dim] = cat_dim_size; - - VulkanTensor output{result_size}; - - vulkan::detail::cat(output, vTensors, dim); - return new_with_vtensor_vulkan(std::move(output), tensor.options()); -} - -Tensor transpose(const Tensor& self, int64_t dim0, int64_t dim1) { - return new_with_vtensor_vulkan( - vulkan::detail::transpose(vtensor_from_vulkan(self), dim0, dim1), - self.options()); -} - -Tensor& transpose_(Tensor& self, int64_t dim0, int64_t dim1) { - auto& x = vtensor_from_vulkan(self); - x = vulkan::detail::transpose(x, dim0, dim1); - return self; -} - -Tensor view(const Tensor& self, IntArrayRef size) { - return 
new_with_vtensor_vulkan( - vulkan::detail::reshape_copy( - vtensor_from_vulkan(self), at::infer_size(size, self.numel())), - self.options()); -} - -Tensor contiguous(const Tensor& self, MemoryFormat memory_format) { - return self; -} - -Tensor slice( - const Tensor& self, - int64_t dim, - int64_t start, - int64_t end, - int64_t step) { - return new_with_vtensor_vulkan( - vulkan::detail::slice(vtensor_from_vulkan(self), dim, start, end, step), - self.options()); -} - -Tensor add(const Tensor& self, const Tensor& other, const Scalar& alpha) { - auto xt = self.is_vulkan() ? self : self.vulkan(); - const auto& x = vtensor_from_vulkan(xt); - auto yt = other.is_vulkan() ? other : other.vulkan(); - const auto& y = vtensor_from_vulkan(yt); - const float a = alpha.to(); - - VulkanTensor output{self.sizes().vec()}; - vulkan::detail::add(output, x, y, a); - return new_with_vtensor_vulkan(std::move(output), self.options()); -} - -VulkanTensor& vtensor(Tensor& t) { - if (t.is_vulkan()) { - return vtensor_from_vulkan(t); - } - auto tv = t.vulkan(); - return vtensor_from_vulkan(tv); -} - -const VulkanTensor& vtensor(const Tensor& t) { - if (t.is_vulkan()) { - return vtensor_from_vulkan(t); - } - const auto tv = t.vulkan(); - return vtensor_from_vulkan(tv); -} - -Tensor& add_(Tensor& self, const Tensor& other, const Scalar& alpha) { - auto& x = vtensor(self); - const auto& y = vtensor(other); - float a = alpha.to(); - - VulkanTensor output{self.sizes().vec()}; - vulkan::detail::add(output, x, y, a); - x = std::move(output); - return self; -} - -Tensor add_scalar(const Tensor& self, const Scalar& other, const Scalar& alpha) { - const auto& x = vtensor_from_vulkan(self); - const float s = other.to(); - const float a = alpha.to(); - VulkanTensor output{self.sizes().vec()}; - vulkan::detail::add(output, x, s * a); - return new_with_vtensor_vulkan(std::move(output), self.options()); -} - -Tensor mul_scalar(const Tensor& self, const Scalar& other) { - const auto& x = vtensor_from_vulkan(self); - const float s = other.to(); - VulkanTensor output{self.sizes().vec()}; - vulkan::detail::mul(output, x, s); - return new_with_vtensor_vulkan(std::move(output), self.options()); -} - -Tensor select(const Tensor& self, int64_t dim, int64_t index) { - auto sliced = vulkan::aten::slice(self, dim, index, index + 1, 1); - auto sizes = self.sizes().vec(); - sizes.erase(sizes.begin() + dim); - return vulkan::aten::reshape(sliced, sizes); -} - -Tensor unsqueeze(const Tensor& self, int64_t dim) { - auto sizes = self.sizes().vec(); - sizes.insert(sizes.begin() + dim, 1); - return vulkan::aten::reshape(self, sizes); -} - -Tensor convolution( - const Tensor& input, // Vulkan - const Tensor& weight, // CPU - const c10::optional& bias, // CPU - const IntArrayRef stride, - const IntArrayRef padding, - const IntArrayRef dilation, - const bool transposed, - const IntArrayRef output_padding, - const int64_t groups) { - const vulkan::Conv2DParams params{ - input.sizes(), weight.sizes(), padding, stride, dilation, groups}; - TORCH_INTERNAL_ASSERT( - input.dim() == 4, "convolution: Expected 4-dimensional input"); - TORCH_INTERNAL_ASSERT( - weight.dim() == 4, "convolution: Expected 4-dimensional weight"); - TORCH_INTERNAL_ASSERT( - groups == 1 || groups == params.C, - "convolution: only nogroup or depthwise convolutions supported"); - TORCH_INTERNAL_ASSERT(!transposed, "convolution: transposed not supported"); - - const VulkanTensor& vinput = vtensor_from_vulkan(input); - VulkanTensor voutput = VulkanTensor{params.output_sizes()}; - - 
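  // [Editorial note; not part of the original file] Worked example of
  // params.output_sizes() using the Conv2DParams formulas from
  // VulkanConvolution.h: for input {1, 3, 224, 224}, weight {16, 3, 3, 3},
  // stride {2, 2}, padding {1, 1}, dilation {1, 1}, groups 1:
  //   KHE = (3 - 1) * 1 + 1 = 3
  //   OH  = ((224 - 3 + 2 * 1) / 2) + 1 = 112   (and OW likewise)
  // so output_sizes() returns {1, 16, 112, 112} and voutput is allocated with
  // that shape before conv2d() is dispatched.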
vulkan::detail::conv2d( - voutput, - vinput, - weight.data_ptr(), - (bias.has_value() && bias->defined()) - ? c10::make_optional(bias->data_ptr()) - : c10::nullopt, - params); - return new_with_vtensor_vulkan(std::move(voutput), input.options()); -} - -Tensor addmm( - const Tensor& self, - const Tensor& mat1, - const Tensor& mat2, - const Scalar& beta, - const Scalar& alpha) { - const VulkanTensor t = - vtensor_from_vulkan(self.is_vulkan() ? self : self.vulkan()); - const VulkanTensor m1 = - vtensor_from_vulkan(mat1.is_vulkan() ? mat1 : mat1.vulkan()); - const VulkanTensor m2 = - vtensor_from_vulkan(mat2.is_vulkan() ? mat2 : mat2.vulkan()); - const float b = beta.to(); - const float a = alpha.to(); - - VulkanTensor output = VulkanTensor{self.sizes().vec()}; - vulkan::detail::addmm(output, t, m1, m2, b, a); - return new_with_vtensor_vulkan(std::move(output), self.options()); -} - -Tensor mm(const Tensor& self, const Tensor& mat2) { - TORCH_INTERNAL_ASSERT( - self.dim() == 2 && mat2.dim() == 2, - "vulkan_mm expects 2-dimensional tensors"); - const auto m1Sizes = self.sizes(); - const auto m2Sizes = mat2.sizes(); - TORCH_INTERNAL_ASSERT( - m1Sizes[1] == m2Sizes[0], - "vulkan_mm expects self.sizes[1] equal mat2.sizes[0]"); - - const auto& m1 = vtensor_from_vulkan(self.is_vulkan() ? self : self.vulkan()); - const auto& m2 = vtensor_from_vulkan(mat2.is_vulkan() ? mat2 : mat2.vulkan()); - - VulkanTensor output{{m1Sizes[0], m2Sizes[1]}}; - vulkan::detail::addmm(output, c10::nullopt, m1, m2, 0.f, 1.f); - return new_with_vtensor_vulkan(std::move(output), self.options()); -} - -Tensor clamp( - const Tensor& self, - const c10::optional& min, - const c10::optional& max) { - const auto& x = vtensor_from_vulkan(self); - VulkanTensor output{self.sizes().vec()}; - vulkan::detail::clamp( - output, - x, - min ? min.value().to() : -std::numeric_limits::infinity(), - max ? max.value().to() : std::numeric_limits::infinity()); - return vulkan::aten::new_with_vtensor_vulkan( - std::move(output), self.options()); -} - -Tensor& clamp_( - Tensor& self, - const c10::optional& min, - const c10::optional& max) { - auto& x = vtensor_from_vulkan(self); - VulkanTensor output{self.sizes().vec()}; - vulkan::detail::clamp( - output, - x, - min ? min.value().to() : -std::numeric_limits::infinity(), - max ? 
max.value().to() : std::numeric_limits::infinity()); - x = std::move(output); - return self; -} - -Tensor hardtanh(const Tensor& self, const Scalar& min, const Scalar& max) { - return vulkan::aten::clamp(self, min, max); -} - -Tensor& hardtanh_(Tensor& self, const Scalar& min, const Scalar& max) { - return vulkan::aten::clamp_(self, min, max); -} - -Tensor& relu_(Tensor& self) { - return vulkan::aten::clamp_(self, 0, nullopt); -} - -Tensor mean( - const Tensor& self, - const IntArrayRef dim, - const bool keepdim, - const optional dtype) { - TORCH_INTERNAL_ASSERT(!keepdim, "keepdim not implemented for Vulkan mean"); - TORCH_INTERNAL_ASSERT(self.is_vulkan(), "mean expects Vulkan tensor input"); - - // Mean is implemented only for HW dimensions of 4-d tensor - TORCH_INTERNAL_ASSERT(self.dim() == 4); - static const std::unordered_set expected_dims_set({2, 3}); - std::unordered_set dims_set; - for (const auto& d : dim) { - dims_set.insert(normalize_dim(d, 4)); - } - TORCH_INTERNAL_ASSERT(expected_dims_set == dims_set); - - const auto& x = vtensor_from_vulkan(self); - const auto sizes = self.sizes(); - VulkanTensor output{std::vector{sizes[0], sizes[1]}}; - vulkan::detail::mean(output, x); - return new_with_vtensor_vulkan(std::move(output), self.options()); -} - -#ifndef USE_VULKAN_API - -TORCH_LIBRARY_IMPL(aten, Vulkan, m) { - m.impl("slice.Tensor", TORCH_FN(at::native::vulkan::aten::slice)); - m.impl("view", TORCH_FN(at::native::vulkan::aten::reshape)); - m.impl("select.int", TORCH_FN(at::native::vulkan::aten::select)); - m.impl("transpose.int", TORCH_FN(at::native::vulkan::aten::transpose)); - m.impl("transpose_", at::native::vulkan::aten::transpose_); - m.impl("view", TORCH_FN(at::native::vulkan::aten::view)); - m.impl("unsqueeze", TORCH_FN(at::native::vulkan::aten::unsqueeze)); - m.impl("empty.memory_format", at::native::vulkan::aten::empty); - m.impl("empty_strided", TORCH_FN(at::native::vulkan::aten::empty_strided)); - m.impl("add.Tensor", TORCH_FN(at::native::vulkan::aten::add)); - m.impl("clamp", TORCH_FN(at::native::vulkan::aten::clamp)); - m.impl("mean.dim", TORCH_FN(at::native::vulkan::aten::mean)); - m.impl("mm", TORCH_FN(at::native::vulkan::aten::mm)); - m.impl("addmm", TORCH_FN(at::native::vulkan::aten::addmm)); - m.impl( - "upsample_nearest2d", - TORCH_FN(at::native::vulkan::aten::upsample_nearest2d)); - m.impl( - "_adaptive_avg_pool2d", - TORCH_FN(at::native::vulkan::aten::adaptive_avg_pool2d)); - m.impl("avg_pool2d", TORCH_FN(at::native::vulkan::aten::avg_pool2d)); - m.impl("max_pool2d", TORCH_FN(at::native::vulkan::aten::max_pool2d)); - m.impl("_cat", TORCH_FN(at::native::vulkan::aten::cat)); - m.impl("mul.Scalar", TORCH_FN(at::native::vulkan::aten::mul_scalar)); - m.impl("add.Scalar", TORCH_FN(at::native::vulkan::aten::add_scalar)); - m.impl( - "convolution_overrideable", at::native::vulkan::aten::convolution); - m.impl("hardtanh_", at::native::vulkan::aten::hardtanh_); - m.impl("relu_", at::native::vulkan::aten::relu_); - m.impl("add_.Tensor", at::native::vulkan::aten::add_); -} - -#endif /* USE_VULKAN_API */ - -Tensor& copy_from_vulkan_(Tensor& self, const Tensor& src) { - TORCH_INTERNAL_ASSERT( - src.device().type() == DeviceType::Vulkan, - "copy_from_vulkan input tensor's device is not Vulkan"); - TORCH_INTERNAL_ASSERT( - self.device().is_cpu(), - "copy_from_vulkan is implemented only for CPU device output"); - TORCH_INTERNAL_ASSERT( - self.layout() == Layout::Strided, - "copy_from_vulkan is implemented only for Strided layout output"); - TORCH_INTERNAL_ASSERT( - 
self.scalar_type() == ScalarType::Float, - "copy_from_vulkan is implemented only for float dtype output, got:", - self.scalar_type()); - TORCH_INTERNAL_ASSERT( - self.is_contiguous(), - "copy_from_vulkan is implemented only for contiguous output tensor"); - - const auto& vtensor = vtensor_from_vulkan(src); - vtensor.copy_data_to_host(self.data_ptr()); - return self; -} - -Tensor& copy_to_vulkan_(Tensor& self, const Tensor& src) { - TORCH_INTERNAL_ASSERT( - self.device().type() == DeviceType::Vulkan, - "copy_to_vulkan output tensor's device is not Vulkan"); - TORCH_INTERNAL_ASSERT( - src.device().is_cpu(), - "copy_to_vulkan is implemented only for CPU device input"); - TORCH_INTERNAL_ASSERT( - src.layout() == Layout::Strided, - "copy_to_vulkan is implemented only for Strided layout input"); - TORCH_INTERNAL_ASSERT( - src.scalar_type() == ScalarType::Float, - "copy_to_vulkan is implemented only for float dtype"); - - auto cpu_tensor_contiguous = src.contiguous(); - VulkanTensor& vtensor = vtensor_from_vulkan(self); - vtensor.set_data_from_host(cpu_tensor_contiguous.data_ptr()); - return self; -} - -Tensor& vulkan_copy_impl_(Tensor& self, const Tensor& src) { - if (src.device().type() == at::kVulkan && self.device().type() == at::kCPU) { - return copy_from_vulkan_(self, src); - } - if (src.device().type() == at::kCPU && self.device().type() == at::kVulkan) { - return copy_to_vulkan_(self, src); - } - TORCH_INTERNAL_ASSERT( - src.device().type() == DeviceType::Vulkan, - "vulkan_copy_ is implemented only for CPU,Strided,float->Vulkan; Vulkan->CPU,Strided,float"); - return self; -} - -struct VulkanImpl final : public at::vulkan::VulkanImplInterface { - bool is_vulkan_available() const override { - return at::native::vulkan::detail::is_available(); - } - - Tensor& vulkan_copy_(Tensor& self, const Tensor& src) const override { - return vulkan_copy_impl_(self, src); - } -}; -static at::vulkan::VulkanImplRegistrar g_vulkan_impl(new VulkanImpl()); - -} // namespace aten - -using detail::VulkanTensor; -Tensor convolution_prepack_weights(const Tensor& weight) { - const auto wsizes = weight.sizes(); - TORCH_INTERNAL_ASSERT( - wsizes.size() == 4, - "convolution_prepack_weights: Expected 4-dimensional weight"); - - const int64_t OC = wsizes[0]; - const int64_t C = wsizes[1]; - const int64_t KH = wsizes[2]; - const int64_t KW = wsizes[3]; - VulkanTensor voutput = - VulkanTensor{{UP_DIV(OC, 4), UP_DIV(C, 4), KH * KW, 16}}; - - vulkan::detail::conv2d_prepack_weights( - voutput, weight.data_ptr(), OC, C, KH, KW); - return aten::new_with_vtensor_vulkan( - std::move(voutput), at::device(at::kVulkan).dtype(at::kFloat)); -} - -Tensor convolution_prepacked( - const Tensor& input, // Vulkan - const IntArrayRef weightSizes, - const Tensor& weight_prepacked_vulkan, // Vulkan - const c10::optional& bias, // Vulkan|CPU - const IntArrayRef padding, - const IntArrayRef stride, - const IntArrayRef dilation, - int64_t groups, - const float output_min, - const float output_max) { - TORCH_INTERNAL_ASSERT( - input.dim() == 4, "Vulkan convolution: Expected 4-dimensional input"); - TORCH_INTERNAL_ASSERT( - weight_prepacked_vulkan.dim() == 4, - "Vulkan convolution: Expected 4-dimensional weight"); - vulkan::Conv2DParams params{ - input.sizes(), weightSizes, padding, stride, dilation, groups}; - TORCH_INTERNAL_ASSERT( - groups == 1 || groups == params.C, - "Vulkan convolution: only nogroup or depthwise convolutions supported"); - const VulkanTensor& vinput = aten::vtensor_from_vulkan(input); - const VulkanTensor& vweight = - 
aten::vtensor_from_vulkan(weight_prepacked_vulkan); - VulkanTensor voutput = - VulkanTensor{{params.N, params.OC, params.OH, params.OW}}; - const bool hasBias = bias.has_value() && bias->defined(); - if (hasBias && bias->is_vulkan()) { - const VulkanTensor& vbias = aten::vtensor_from_vulkan(*bias); - vulkan::detail::conv2d( - voutput, vinput, vweight, vbias, params, output_min, output_max); - } else { - vulkan::detail::conv2d( - voutput, - vinput, - vweight, - hasBias ? c10::make_optional((*bias).data_ptr()) - : c10::nullopt, - params, - output_min, - output_max); - } - return aten::new_with_vtensor_vulkan(std::move(voutput), input.options()); -} - -} // namespace vulkan -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/vulkan/VulkanAten.h b/aten/src/ATen/native/vulkan/VulkanAten.h deleted file mode 100644 index 8345ff6ac065..000000000000 --- a/aten/src/ATen/native/vulkan/VulkanAten.h +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once - -#include - -namespace at { -namespace native { -namespace vulkan { - -Tensor convolution_prepack_weights(const at::Tensor& weight); - -Tensor convolution_prepacked( - const at::Tensor& input, // Vulkan - IntArrayRef weightSizes, - const at::Tensor& weight_prepacked_vulkan, // Vulkan - const c10::optional& bias, // Vulkan|CPU - IntArrayRef padding, - IntArrayRef stride, - IntArrayRef dilation, - int64_t groups, - const float output_min, - const float output_max); - -} // namespace vulkan -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/vulkan/VulkanCommon.h b/aten/src/ATen/native/vulkan/VulkanCommon.h deleted file mode 100644 index 39d9c3bc129c..000000000000 --- a/aten/src/ATen/native/vulkan/VulkanCommon.h +++ /dev/null @@ -1,92 +0,0 @@ -#pragma once - -#include - -#include - -#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) -#define ROUND_UP(x, y) (((x) + (y) - (1)) / (y) * (y)) -#define ALIGN_UP4(x) ROUND_UP((x), 4) - -namespace at { -namespace native { -namespace vulkan { - -struct ContextConv2D final { - at::Tensor weight_prepacked_vulkan_; - c10::optional bias_vulkan_; - std::array weight_size_; - std::array padding_; - std::array stride_; - std::array dilation_; - int64_t groups_; - float output_min_; - float output_max_; - - ContextConv2D() = delete; - - ContextConv2D( - at::Tensor&& weight_prepacked_vulkan, - c10::optional&& bias_vulkan, - std::array weight_size, - std::array padding, - std::array stride, - std::array dilation, - int64_t groups, - float output_min, - float output_max) - : weight_prepacked_vulkan_(std::move(weight_prepacked_vulkan)), - bias_vulkan_(std::move(bias_vulkan)), - weight_size_(weight_size), - padding_(padding), - stride_(stride), - dilation_(dilation), - groups_(groups), - output_min_(output_min), - output_max_(output_max) {} - - ContextConv2D(ContextConv2D&&) = default; - ContextConv2D& operator=(ContextConv2D&&) = default; - - ~ContextConv2D() {} - - static constexpr float kMin = -std::numeric_limits::infinity(); - static constexpr float kMax = std::numeric_limits::infinity(); -}; - -namespace detail { -template -inline constexpr To safe_downcast_internal(const From v) { - typedef std::common_type_t Type; - constexpr Type min{static_cast(std::numeric_limits::lowest())}; - constexpr Type max{static_cast(std::numeric_limits::max())}; - TORCH_CHECK(min <= v && v <= max, "Cast failed: out of range"); - return static_cast(v); -} - -template -inline constexpr bool is_signed_to_unsigned() { - return std::is_signed::value && std::is_unsigned::value; -} - -template < - typename To, - 
typename From, - std::enable_if_t(), bool> = true> -inline constexpr To safe_downcast(const From v) { - TORCH_CHECK(v >= From{}, "Cast failed: negative signed to unsigned"); - return safe_downcast_internal(v); -} - -template < - typename To, - typename From, - std::enable_if_t(), bool> = true> -inline constexpr To safe_downcast(const From v) { - return safe_downcast_internal(v); -} - -} // namespace detail -} // namespace vulkan -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/vulkan/VulkanConvolution.cpp b/aten/src/ATen/native/vulkan/VulkanConvolution.cpp deleted file mode 100644 index d9ef04c14036..000000000000 --- a/aten/src/ATen/native/vulkan/VulkanConvolution.cpp +++ /dev/null @@ -1,106 +0,0 @@ -#include - -#include -#include -#include - -namespace at { -namespace native { -namespace vulkan { -namespace detail { -namespace convolution2d { - -namespace { -// TODO: This function is not used. -bool available( - const Tensor& weight, - const c10::optional& bias, - const IntArrayRef padding, - const IntArrayRef stride, - const IntArrayRef dilation, - const int64_t groups, - const float output_min, - const float output_max) { - return at::native::is_vulkan_available() && (4 == weight.ndimension()) && - (at::Backend::CPU == weight.options().backend()) && - (kFloat == weight.scalar_type()); -} - -} // namespace - -c10::intrusive_ptr createConv2dClampPrePackOpContext( - Tensor&& weight, - c10::optional&& bias, - std::vector&& stride, - std::vector&& padding, - std::vector&& dilation, - const int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max) { - return vulkan::VulkanConv2dOpContext::create_context( - std::move(weight), - std::move(bias), - std::move(padding), - std::move(stride), - std::move(dilation), - groups, - output_min, - output_max); -} - -Tensor conv2d_clamp_run( - const Tensor& input, - const c10::intrusive_ptr& op_context) { - return op_context->run(input); -} - -ContextConv2D create( - const Tensor& weight, - const c10::optional& bias, - const IntArrayRef padding, - const IntArrayRef stride, - const IntArrayRef dilation, - const int64_t groups, - const float output_min, - const float output_max) { - const auto padding_expanded = expand_param_if_needed(padding, "padding", 2); - const auto stride_expanded = expand_param_if_needed(stride, "stride", 2); - const auto dilation_expanded = - expand_param_if_needed(dilation, "dilation", 2); - const Tensor weight_nchw = weight.contiguous(); - const auto ws = weight_nchw.sizes(); - return ContextConv2D{ - groups == 1 ? at::native::vulkan::convolution_prepack_weights(weight_nchw) - : weight_nchw.vulkan(), - bias.has_value() ? c10::make_optional((*bias).vulkan()) : c10::nullopt, - // TODO: Are we sure these tensors will always come into this fucntion with the - // the dimensions expected below? What if they don't? This may trigger a segfault. - // TODO: If we need TORCH_CHECK(available()) calls here as a sanity check, add it. 
- {{ws[0], ws[1], ws[2], ws[3]}}, - {padding_expanded[0], padding_expanded[1]}, - {stride_expanded[0], stride_expanded[1]}, - {dilation_expanded[0], dilation_expanded[1]}, - groups, - output_min, - output_max}; -} - -Tensor run(const ContextConv2D& context, const Tensor& input) { - return at::native::vulkan::convolution_prepacked( - input, - context.weight_size_, - context.weight_prepacked_vulkan_, - context.bias_vulkan_, - context.padding_, - context.stride_, - context.dilation_, - context.groups_, - context.output_min_, - context.output_max_); -} - -} // namespace convolution2d -} // namespace detail -} // namespace vulkan -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/vulkan/VulkanConvolution.h b/aten/src/ATen/native/vulkan/VulkanConvolution.h deleted file mode 100644 index e956d133a155..000000000000 --- a/aten/src/ATen/native/vulkan/VulkanConvolution.h +++ /dev/null @@ -1,132 +0,0 @@ -#pragma once - -#include - -#include -#include -#include -#include - -namespace at { -namespace native { -namespace vulkan { - -struct Conv2DParams final { - int64_t N; // batch size - int64_t C; // channels - int64_t H; // input height - int64_t W; // input width - int64_t OC; // output channels - int64_t KH; // kernel height - int64_t KW; // kernel width - int64_t SY; // stride y (height) - int64_t SX; // stride x (width) - int64_t PY; // padding y (height) - int64_t PX; // padding x (width) - int64_t DY; // dilation y (height) - int64_t DX; // dilation x (width) - int64_t G; // groups - int64_t OW; // output width - int64_t OH; // output height - int64_t OC_4; - int64_t C_4; - - Conv2DParams() = delete; - Conv2DParams( - c10::IntArrayRef inputSizes, - int64_t OC, - int64_t KH, - int64_t KW, - int64_t SY, - int64_t SX, - int64_t PY, - int64_t PX, - int64_t DY, - int64_t DX, - int64_t G) - // TODO: What if inputSizes is not of the expected dimensionality? - // Should check prior to indexing. - : N(inputSizes[0]), - C(inputSizes[1]), - H(inputSizes[2]), - W(inputSizes[3]), - OC(OC), - KH(KH), - KW(KW), - SY(SY), - SX(SX), - PY(PY), - PX(PX), - DY(DY), - DX(DX), - G(G) { - OC_4 = UP_DIV(OC, 4); - C_4 = UP_DIV(C, 4); - const int64_t KWE = (KW - 1) * DX + 1; - const int64_t KHE = (KH - 1) * DY + 1; - OW = ((W - KWE + 2 * PX) / SX) + 1; - OH = ((H - KHE + 2 * PY) / SY) + 1; - } - - Conv2DParams( - c10::IntArrayRef inputSizes, - c10::IntArrayRef weightSizes, - c10::IntArrayRef padding, - c10::IntArrayRef stride, - c10::IntArrayRef dilation, - int64_t groups) - // TODO: What if these parameters are not of the correct dimensionality? - // Should check prior to indexing. 
- : Conv2DParams( - inputSizes, - weightSizes[0], - weightSizes[2], - weightSizes[3], - stride[0], - stride[1], - padding[0], - padding[1], - dilation[0], - dilation[1], - groups) {} - - std::vector output_sizes() const { - return {N, OC, OH, OW}; - } -}; - -namespace detail { -namespace convolution2d { - -c10::intrusive_ptr -createConv2dClampPrePackOpContext( - Tensor&& weight, - c10::optional&& bias, - std::vector&& stride, - std::vector&& padding, - std::vector&& dilation, - int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max); - -Tensor conv2d_clamp_run( - const Tensor& input, - const c10::intrusive_ptr& op_context); - -ContextConv2D create( - const Tensor& weight, - const c10::optional& bias, - const IntArrayRef padding, - const IntArrayRef stride, - const IntArrayRef dilation, - const int64_t groups, - const float output_min, - const float output_max); - -Tensor run(const ContextConv2D& context, const Tensor& input); - -} // namespace convolution2d -} // namespace detail -} // namespace vulkan -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/vulkan/VulkanOpContext.cpp b/aten/src/ATen/native/vulkan/VulkanOpContext.cpp deleted file mode 100644 index c5e613f4827d..000000000000 --- a/aten/src/ATen/native/vulkan/VulkanOpContext.cpp +++ /dev/null @@ -1,44 +0,0 @@ -#include -#include - -namespace at { -namespace native { -namespace vulkan { - -c10::intrusive_ptr VulkanConv2dOpContext::create_context( - at::Tensor&& weight, - c10::optional&& bias, - std::vector&& padding, - std::vector&& stride, - std::vector&& dilation, - const int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max) { - auto op_context = vulkan::detail::convolution2d::create( - weight, - bias, - padding, - stride, - dilation, - groups, - output_min ? output_min->to() : vulkan::ContextConv2D::kMin, - output_max ? 
output_max->to() : vulkan::ContextConv2D::kMax); - return c10::make_intrusive( - std::move(weight), - std::move(bias), - std::move(padding), - std::move(stride), - std::move(dilation), - groups, - output_min, - output_max, - std::move(op_context)); -} - -Tensor VulkanConv2dOpContext::run(const Tensor& input) { - return vulkan::detail::convolution2d::run(op_context_, input); -} - -} // namespace vulkan -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/vulkan/VulkanOpContext.h b/aten/src/ATen/native/vulkan/VulkanOpContext.h deleted file mode 100644 index 970b4edca39c..000000000000 --- a/aten/src/ATen/native/vulkan/VulkanOpContext.h +++ /dev/null @@ -1,89 +0,0 @@ -#pragma once - -#include -#include -#include - -namespace at { -namespace native { -namespace vulkan { - -using SerializationTypeConv2dPrePack = std::tuple< - Tensor, - c10::optional, - std::vector, - std::vector, - std::vector, - int64_t, - c10::optional, - c10::optional>; - -class Conv2dOpContext : public torch::jit::CustomClassHolder { - protected: - Tensor orig_weight_; - c10::optional orig_bias_; - std::vector stride_; - std::vector padding_; - std::vector dilation_; - int64_t groups_; - c10::optional output_min_; - c10::optional output_max_; - - public: - SerializationTypeConv2dPrePack unpack() { - return std::make_tuple( - orig_weight_, - orig_bias_, - stride_, - padding_, - dilation_, - groups_, - output_min_, - output_max_); - } - - virtual Tensor run(const Tensor& input) = 0; -}; - -class VulkanConv2dOpContext final : public Conv2dOpContext { - private: - ContextConv2D op_context_; - - public: - VulkanConv2dOpContext( - Tensor&& weight, - c10::optional&& bias, - std::vector&& padding, - std::vector&& stride, - std::vector&& dilation, - uint64_t groups, - const c10::optional& min, - const c10::optional& max, - ContextConv2D&& op_context) - : op_context_(std::move(op_context)) { - orig_weight_ = std::move(weight); - orig_bias_ = std::move(bias); - padding_ = std::move(padding); - stride_ = std::move(stride); - dilation_ = std::move(dilation); - groups_ = groups; - output_min_ = min; - output_max_ = max; - } - - Tensor run(const Tensor& input) override; - - static c10::intrusive_ptr create_context( - Tensor&& weight, - c10::optional&& bias, - std::vector&& padding, - std::vector&& stride, - std::vector&& dilation, - int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max); -}; - -} // namespace vulkan -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/vulkan/VulkanOpaqueTensorImpl.h b/aten/src/ATen/native/vulkan/VulkanOpaqueTensorImpl.h index 9e48de07094a..05c5ce977cd1 100644 --- a/aten/src/ATen/native/vulkan/VulkanOpaqueTensorImpl.h +++ b/aten/src/ATen/native/vulkan/VulkanOpaqueTensorImpl.h @@ -24,10 +24,9 @@ struct VulkanOpaqueTensorImpl : public OpaqueTensorImpl { sizes, false), strides_(strides.vec()) { - TensorImpl::set_has_contiguity_policy(TensorImpl::HasContiguityPolicy::CustomBehavior); } - IntArrayRef strides() const override { + IntArrayRef strides_custom() const override { return strides_; } @@ -35,16 +34,13 @@ struct VulkanOpaqueTensorImpl : public OpaqueTensorImpl { return true; } - int64_t stride(int64_t d) const override { - d = at::maybe_wrap_dim(d, this->dim(), false); - return strides_[d]; - } - private: const char* tensorimpl_type_name() const override { return "VulkanOpaqueTensorImpl"; } + // TODO: storing strides separately is unnecessary, the base TensorImpl + // has space for them SmallVector strides_; }; diff --git 
a/aten/src/ATen/native/vulkan/VulkanOps.cpp b/aten/src/ATen/native/vulkan/VulkanOps.cpp deleted file mode 100644 index 7cbd7479e256..000000000000 --- a/aten/src/ATen/native/vulkan/VulkanOps.cpp +++ /dev/null @@ -1,1307 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include - -namespace at { -namespace native { -namespace vulkan { -namespace detail { - -void upsample_nearest2d( - VulkanTensor& output, - const VulkanTensor& input, - int64_t IH, - int64_t IW, - int64_t OH, - int64_t OW, - int64_t IN, - int64_t IC, - float scaleH, - float scaleW) { - auto device = context().device(); - int64_t C = IN * IC; - struct ConstBlock { - float scaleX; - float scaleY; - }; - ConstBlock cb{scaleW, - scaleH}; - VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb)); - - VkDescriptorSetLayout descriptorSetLayout{}; - VkDescriptorPool descriptorPool{}; - VkDescriptorSet descriptorSet{}; - std::vector descriptorTypes{ - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER}; - createDescriptorSetLayoutSinglePool( - device, - descriptorTypes, - &descriptorSetLayout, - &descriptorPool, - &descriptorSet); - - output.image()->bindStorageImage(descriptorSet, 0); - input.image()->bindShaderRead(descriptorSet, 1); - constBuffer.bind(descriptorSet, 2); - - WorkGroupSize workGroupSize{8, 8, 1}; - auto& computeUnit = context().computeUnitFactory().get( - GLSL_SPV(upsample_nearest2d), descriptorSetLayout, workGroupSize); - computeUnit.createCommandBuffer(descriptorSet); - input.image()->addImageMemoryBarrierToShaderRead(computeUnit.commandBuffer()); - computeUnit.dispatchCommandBuffer(OW, OH, C, workGroupSize); - computeUnit.endCommandBuffer(); - computeUnit.submitAndWaitCommandBuffer(); - vkDestroyDescriptorPool(device, descriptorPool, nullptr); - vkDestroyDescriptorSetLayout(device, descriptorSetLayout, nullptr); -} - -VulkanTensor reshape_copy( - const VulkanTensor& input, - std::vector shape) { - input.sync_image_to_buffer(); - VulkanTensor output{infer_size(shape, input.numel())}; - copy_buffer_to_buffer( - *(input.buffer()), *(output.buffer()), input.buffer()->sizeBytes()); - return output; -} - -VulkanTensor cat( - VulkanTensor& output, - ArrayRef inputs, - int64_t dim) { - VkDeviceSize outputOffset = 0; - for (const auto& input : inputs) { - input.sync_image_to_buffer(); - const auto sizeBytes = sizeof(float) * input.numel(); - copy_buffer_to_buffer( - *(input.buffer()), *(output.buffer()), sizeBytes, 0, outputOffset); - outputOffset += sizeBytes; - } - return output; -} - -void adaptive_avg_pool2d( - VulkanTensor& output, - const VulkanTensor& input, - const int64_t IH, - const int64_t IW, - const int64_t OH, - const int64_t OW, - const int64_t IN, - const int64_t IC) { - auto device = context().device(); - int64_t C = IN * IC; - - VkDescriptorSetLayout descriptorSetLayout{}; - VkDescriptorPool descriptorPool{}; - VkDescriptorSet descriptorSet{}; - std::vector descriptorTypes{ - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER}; - createDescriptorSetLayoutSinglePool( - device, - descriptorTypes, - &descriptorSetLayout, - &descriptorPool, - &descriptorSet); - - output.image()->bindStorageImage(descriptorSet, 0); - input.image()->bindShaderRead(descriptorSet, 1); - - WorkGroupSize workGroupSize{8, 8, 1}; - auto& computeUnit = context().computeUnitFactory().get( - 
GLSL_SPV(adaptive_avg_pool2d), descriptorSetLayout, workGroupSize); - computeUnit.createCommandBuffer(descriptorSet); - input.image()->addImageMemoryBarrierToShaderRead(computeUnit.commandBuffer()); - computeUnit.dispatchCommandBuffer(OW, OH, C, workGroupSize); - computeUnit.endCommandBuffer(); - computeUnit.submitAndWaitCommandBuffer(); - vkDestroyDescriptorPool(device, descriptorPool, nullptr); - vkDestroyDescriptorSetLayout(device, descriptorSetLayout, nullptr); -} - -void max_pool2d( - VulkanTensor& output, - const VulkanTensor& input, - const int iH, - const int iW, - const int oH, - const int oW, - const int _n, - const int _c, - const int kH, - const int kW, - const int dH, - const int dW, - const int padH, - const int padW, - const int dilationH, - const int dilationW) { - auto device = context().device(); - const auto c = _n * _c; - struct ConstBlock { - int32_t inputSize[4]; - int32_t outputSize[4]; - int32_t kernelSize[2]; - int32_t stride[2]; - int32_t padding[2]; - int32_t dilate[2]; - }; - ConstBlock cb{ - {iW, iH, c, 0}, - {oW, oH, c, 0}, - {kW, kH}, - {dW, dH}, - {padW, padH}, - {dilationW, dilationH}, - }; - VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb)); - - VkDescriptorSetLayout descriptorSetLayout{}; - VkDescriptorPool descriptorPool{}; - VkDescriptorSet descriptorSet{}; - std::vector descriptorTypes{ - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER}; - createDescriptorSetLayoutSinglePool( - device, - descriptorTypes, - &descriptorSetLayout, - &descriptorPool, - &descriptorSet); - - output.image()->bindStorageImage(descriptorSet, 0); - input.image()->bindShaderRead(descriptorSet, 1); - constBuffer.bind(descriptorSet, 2); - - WorkGroupSize workGroupSize{8, 8, 1}; - auto& computeUnit = context().computeUnitFactory().get( - GLSL_SPV(max_pool2d), descriptorSetLayout, workGroupSize); - computeUnit.createCommandBuffer(descriptorSet); - input.image()->addImageMemoryBarrierToShaderRead(computeUnit.commandBuffer()); - computeUnit.dispatchCommandBuffer(oW, oH, c, workGroupSize); - computeUnit.endCommandBuffer(); - computeUnit.submitAndWaitCommandBuffer(); - vkDestroyDescriptorPool(device, descriptorPool, nullptr); - vkDestroyDescriptorSetLayout(device, descriptorSetLayout, nullptr); -} - -void avg_pool2d( - VulkanTensor& output, - const VulkanTensor& input, - const int iH, - const int iW, - const int oH, - const int oW, - const int _n, - const int _c, - const int kH, - const int kW, - const int dH, - const int dW, - const int padH, - const int padW) { - auto device = context().device(); - const auto c = _n * _c; - struct ConstBlock { - int32_t kernelSize[2]; - int32_t stride[2]; - int32_t padding[2]; - }; - ConstBlock cb{ - {kW, kH}, - {dW, dH}, - {padW, padH}, - }; - VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb)); - - VkDescriptorSetLayout descriptorSetLayout{}; - VkDescriptorPool descriptorPool{}; - VkDescriptorSet descriptorSet{}; - std::vector descriptorTypes{ - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER}; - createDescriptorSetLayoutSinglePool( - device, - descriptorTypes, - &descriptorSetLayout, - &descriptorPool, - &descriptorSet); - - output.image()->bindStorageImage(descriptorSet, 0); - input.image()->bindShaderRead(descriptorSet, 1); - constBuffer.bind(descriptorSet, 2); - - WorkGroupSize workGroupSize{8, 8, 1}; - auto& computeUnit = context().computeUnitFactory().get( - 
GLSL_SPV(avg_pool2d), descriptorSetLayout, workGroupSize); - computeUnit.createCommandBuffer(descriptorSet); - input.image()->addImageMemoryBarrierToShaderRead(computeUnit.commandBuffer()); - computeUnit.dispatchCommandBuffer(oW, oH, c, workGroupSize); - computeUnit.endCommandBuffer(); - computeUnit.submitAndWaitCommandBuffer(); - vkDestroyDescriptorPool(device, descriptorPool, nullptr); - vkDestroyDescriptorSetLayout(device, descriptorSetLayout, nullptr); -} - -VulkanTensor transpose( - const VulkanTensor& input, - const int64_t dim0, - const int64_t dim1) { - const auto idim = input.dim(); - TORCH_INTERNAL_ASSERT( - idim <= 6, "Vulkan transpose is implemented only for dim <= 6"); - auto device = context().device(); - struct ConstBlock { - int32_t istrides[8]; - int32_t ostrides[8]; - int32_t odims[8]; - int32_t storageOffset; - }; - - auto isizes = input.sizes(); - auto osizes = isizes; - std::swap(osizes[dim0], osizes[dim1]); - VulkanTensor output{osizes}; - output.allocate_storage(); - - std::array idims8; - idims8.fill(1); - std::array odims8; - odims8.fill(1); - std::copy(isizes.cbegin(), isizes.cend(), idims8.end() - idim); - std::copy(osizes.cbegin(), osizes.cend(), odims8.end() - idim); - std::array istrides8; - istrides8.fill(1); - std::array ostrides8; - ostrides8.fill(1); - for (int i = 6; i >= 0; --i) { - istrides8[i] = idims8[i + 1] * istrides8[i + 1]; - ostrides8[i] = odims8[i + 1] * ostrides8[i + 1]; - } - std::swap(istrides8[8 - idim + dim0], istrides8[8 - idim + dim1]); - - ConstBlock cb{}; - std::copy(istrides8.cbegin(), istrides8.cend(), std::begin(cb.istrides)); - std::copy(ostrides8.cbegin(), ostrides8.cend(), std::begin(cb.ostrides)); - std::copy(odims8.cbegin(), odims8.cend(), std::begin(cb.odims)); - cb.storageOffset = 0; - - VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb)); - - VkDescriptorSetLayout descriptorSetLayout{}; - VkDescriptorPool descriptorPool{}; - VkDescriptorSet descriptorSet{}; - std::vector descriptorTypes{ - VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER}; - createDescriptorSetLayoutSinglePool( - device, - descriptorTypes, - &descriptorSetLayout, - &descriptorPool, - &descriptorSet); - - output.buffer()->bind(descriptorSet, 0); - input.buffer()->bind(descriptorSet, 1); - constBuffer.bind(descriptorSet, 2); - - WorkGroupSize workGroupSize{8, 8, 1}; - auto& computeUnit = context().computeUnitFactory().get( - GLSL_SPV(permute), descriptorSetLayout, workGroupSize); - computeUnit.createCommandBuffer(descriptorSet); - input.buffer()->addBufferMemoryBarrier( - computeUnit.commandBuffer(), 0, input.buffer()->sizeBytes()); - computeUnit.dispatchCommandBuffer( - odims8[6] * odims8[7], - odims8[4] * odims8[5], - odims8[2] * odims8[3], - workGroupSize); - computeUnit.endCommandBuffer(); - computeUnit.submitAndWaitCommandBuffer(); - vkDestroyDescriptorPool(device, descriptorPool, nullptr); - vkDestroyDescriptorSetLayout(device, descriptorSetLayout, nullptr); - return output; -} - -VulkanTensor slice( - const VulkanTensor& input, - const int64_t dim, - const int64_t _start, - const int64_t _end, - const int64_t step) { - const auto isizes = input.sizes(); - auto osizes = isizes; - auto start = _start; - auto end = _end; - if (start < 0) { - start += isizes[dim]; - } - if (end < 0) { - end += isizes[dim]; - } - if (start < 0) { - start = 0; - } else if (start >= isizes[dim]) { - start = isizes[dim]; - } - if (end < start) { - end = start; - } else if (end >= isizes[dim]) { - end = 
isizes[dim]; - } - const auto len = end - start; - osizes[dim] = (len + step - 1) / step; - - VulkanTensor output{osizes}; - output.allocate_storage(); - - auto idim = input.dim(); - std::array idims8; - idims8.fill(1); - std::copy(isizes.cbegin(), isizes.cend(), idims8.end() - idim); - std::array istrides8; - istrides8.fill(1); - for (int i = 6; i >= 0; --i) { - istrides8[i] = idims8[i + 1] * istrides8[i + 1]; - } - - std::array odims8 = idims8; - std::array ostrides8 = istrides8; - - ostrides8[8 - idim + dim] *= step; - auto storage_offset = start * istrides8[8 - idim + dim]; - - auto device = context().device(); - struct ConstBlock { - int32_t istrides[8]; - int32_t ostrides[8]; - int32_t odims[8]; - int32_t storageOffset; - }; - - ConstBlock cb{}; - std::copy(istrides8.cbegin(), istrides8.cend(), std::begin(cb.istrides)); - std::copy(ostrides8.cbegin(), ostrides8.cend(), std::begin(cb.ostrides)); - std::copy(odims8.cbegin(), odims8.cend(), std::begin(cb.odims)); - cb.storageOffset = storage_offset; - - VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb)); - - VkDescriptorSetLayout descriptorSetLayout{}; - VkDescriptorPool descriptorPool{}; - VkDescriptorSet descriptorSet{}; - std::vector descriptorTypes{ - VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER}; - createDescriptorSetLayoutSinglePool( - device, - descriptorTypes, - &descriptorSetLayout, - &descriptorPool, - &descriptorSet); - - output.buffer()->bind(descriptorSet, 0); - input.buffer()->bind(descriptorSet, 1); - constBuffer.bind(descriptorSet, 2); - - WorkGroupSize workGroupSize{8, 8, 1}; - auto& computeUnit = context().computeUnitFactory().get( - GLSL_SPV(permute), descriptorSetLayout, workGroupSize); - computeUnit.createCommandBuffer(descriptorSet); - input.buffer()->addBufferMemoryBarrier( - computeUnit.commandBuffer(), 0, input.buffer()->sizeBytes()); - computeUnit.dispatchCommandBuffer( - odims8[6] * odims8[7], - odims8[4] * odims8[5], - odims8[2] * odims8[3], - workGroupSize); - computeUnit.endCommandBuffer(); - computeUnit.submitAndWaitCommandBuffer(); - vkDestroyDescriptorPool(device, descriptorPool, nullptr); - vkDestroyDescriptorSetLayout(device, descriptorSetLayout, nullptr); - return output; -} - -void add( - VulkanTensor& output, - const VulkanTensor& input0, - const VulkanTensor& input1, - float alpha) { - auto odim = output.dim(); - TORCH_INTERNAL_ASSERT( - odim <= 4, "Vulkan add is implemented for dim <= 4, output dim > 4"); - auto i0dim = input0.dim(); - TORCH_INTERNAL_ASSERT( - i0dim <= 4, "Vulkan add is implemented for dim <= 4, input0 dim > 4"); - auto i1dim = input1.dim(); - TORCH_INTERNAL_ASSERT( - i1dim <= 4, "Vulkan add is implemented for dim <= 4, input1 dim > 4"); - - auto os = output.sizes(); - auto i0s = input0.sizes(); - auto i1s = input1.sizes(); - - std::array os4 = {1, 1, 1, 1}; - std::copy(os.begin(), os.end(), os4.end() - odim); - std::array i0s4 = {1, 1, 1, 1}; - std::copy(i0s.cbegin(), i0s.cend(), i0s4.end() - i0dim); - std::array i1s4 = {1, 1, 1, 1}; - std::copy(i1s.cbegin(), i1s.cend(), i1s4.end() - i1dim); - - TORCH_INTERNAL_ASSERT( - (os4 == i0s4) && (i0s4 == i1s4), - "Vulkan add expects the same dimensions for all operands"); - - auto C = os4[0] * os4[1]; - auto H = os4[2]; - auto W = os4[3]; - - auto device = context().device(); - struct ConstBlock { - float alpha; - }; - ConstBlock cb{alpha}; - VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb)); - - VkDescriptorSetLayout descriptorSetLayout{}; 
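// Note on the size handling above: each operand's sizes are right-aligned into
// a 4-element array, so (taking illustrative shapes that are not from this
// diff) a 3-dim tensor of sizes {2, 8, 16} becomes {1, 2, 8, 16}, giving
// C = 1 * 2 = 2, H = 8, W = 16. All three operands must resolve to the same
// padded shape, since this kernel performs no broadcasting.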
- VkDescriptorPool descriptorPool{}; - VkDescriptorSet descriptorSet{}; - std::vector descriptorTypes{ - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER}; - createDescriptorSetLayoutSinglePool( - device, - descriptorTypes, - &descriptorSetLayout, - &descriptorPool, - &descriptorSet); - - output.image()->bindStorageImage(descriptorSet, 0); - input0.image()->bindShaderRead(descriptorSet, 1); - input1.image()->bindShaderRead(descriptorSet, 2); - constBuffer.bind(descriptorSet, 3); - - WorkGroupSize workGroupSize{8, 8, 1}; - auto& computeUnit = context().computeUnitFactory().get( - GLSL_SPV(add), descriptorSetLayout, workGroupSize); - computeUnit.createCommandBuffer(descriptorSet); - auto commandBuffer = computeUnit.commandBuffer(); - output.image()->addImageMemoryBarrierToGeneral(commandBuffer); - input0.image()->addImageMemoryBarrierToShaderRead(commandBuffer); - input1.image()->addImageMemoryBarrierToShaderRead(commandBuffer); - computeUnit.dispatchCommandBuffer(W, H, C, workGroupSize); - computeUnit.endCommandBuffer(); - computeUnit.submitAndWaitCommandBuffer(); - vkDestroyDescriptorPool(device, descriptorPool, nullptr); - vkDestroyDescriptorSetLayout(device, descriptorSetLayout, nullptr); -} - -void add(VulkanTensor& output, const VulkanTensor& input, const float s) { - const auto sizes = input.sizes(); - - const auto C = c10::multiply_integers(sizes.cbegin(), sizes.cend() - 2); - const auto C_4 = UP_DIV(C, 4); - const auto H = sizes[2]; - const auto W = sizes[3]; - - auto device = context().device(); - struct ConstBlock { - float s; - }; - ConstBlock cb{s}; - VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb)); - - VkDescriptorSetLayout descriptorSetLayout{}; - VkDescriptorPool descriptorPool{}; - VkDescriptorSet descriptorSet{}; - std::vector descriptorTypes{ - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER}; - createDescriptorSetLayoutSinglePool( - device, - descriptorTypes, - &descriptorSetLayout, - &descriptorPool, - &descriptorSet); - - output.image()->bindStorageImage(descriptorSet, 0); - input.image()->bindShaderRead(descriptorSet, 1); - constBuffer.bind(descriptorSet, 2); - - WorkGroupSize workGroupSize{8, 8, 1}; - auto& computeUnit = context().computeUnitFactory().get( - GLSL_SPV(add_scalar), descriptorSetLayout, workGroupSize); - computeUnit.createCommandBuffer(descriptorSet); - auto commandBuffer = computeUnit.commandBuffer(); - output.image()->addImageMemoryBarrierToGeneral(commandBuffer); - input.image()->addImageMemoryBarrierToShaderRead(commandBuffer); - computeUnit.dispatchCommandBuffer(W, H, C_4, workGroupSize); - computeUnit.endCommandBuffer(); - computeUnit.submitAndWaitCommandBuffer(); - vkDestroyDescriptorPool(device, descriptorPool, nullptr); - vkDestroyDescriptorSetLayout(device, descriptorSetLayout, nullptr); -} - -void mul(VulkanTensor& output, const VulkanTensor& input, const float s) { - const auto sizes = input.sizes(); - - const auto C = c10::multiply_integers(sizes.cbegin(), sizes.cend() - 2); - const auto C_4 = UP_DIV(C, 4); - const auto H = sizes[2]; - const auto W = sizes[3]; - - auto device = context().device(); - struct ConstBlock { - float s; - }; - ConstBlock cb{s}; - VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb)); - - VkDescriptorSetLayout descriptorSetLayout{}; - VkDescriptorPool descriptorPool{}; - VkDescriptorSet descriptorSet{}; - 
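// The C_4 computed above reflects the 4-channel texel packing used by the
// image-backed tensors: for an illustrative input of sizes {1, 10, 8, 8}
// (not from this diff), C = 1 * 10 = 10 and C_4 = UP_DIV(10, 4) = 3, so the
// dispatch below covers an 8 x 8 x 3 grid of texels.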
std::vector descriptorTypes{ - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER}; - createDescriptorSetLayoutSinglePool( - device, - descriptorTypes, - &descriptorSetLayout, - &descriptorPool, - &descriptorSet); - - output.image()->bindStorageImage(descriptorSet, 0); - input.image()->bindShaderRead(descriptorSet, 1); - constBuffer.bind(descriptorSet, 2); - - WorkGroupSize workGroupSize{8, 8, 1}; - auto& computeUnit = context().computeUnitFactory().get( - GLSL_SPV(mul_scalar), descriptorSetLayout, workGroupSize); - computeUnit.createCommandBuffer(descriptorSet); - auto commandBuffer = computeUnit.commandBuffer(); - output.image()->addImageMemoryBarrierToGeneral(commandBuffer); - input.image()->addImageMemoryBarrierToShaderRead(commandBuffer); - computeUnit.dispatchCommandBuffer(W, H, C_4, workGroupSize); - computeUnit.endCommandBuffer(); - computeUnit.submitAndWaitCommandBuffer(); - vkDestroyDescriptorPool(device, descriptorPool, nullptr); - vkDestroyDescriptorSetLayout(device, descriptorSetLayout, nullptr); -} - -VBuffer kernelNCHW_OCHW_repack_O4C4HWi4o4( - const float* weights, - const int OC, - const int C, - const int KH, - const int KW) { - const auto C_4 = UP_DIV(C, 4); - const auto kBufSizeNumel = ALIGN_UP4(OC) * ALIGN_UP4(C) * KH * KW; - auto size = sizeof(float) * kBufSizeNumel; - VBuffer kernelBuffer{size}; - const int oc_4SizeNumel = KW * KH * C_4 * 16; - auto mappedMemory = kernelBuffer.map(); - if (mappedMemory.ptr()) { - float* basePtr = (float*)mappedMemory.ptr(); - memset(basePtr, 0, size); - const float* src = weights; - int ridx = 0; - for (const auto oc : c10::irange(OC)) { - int oc_4 = oc / 4; - int oc_4_i = oc % 4; - float* dst_oc = basePtr + oc_4 * oc_4SizeNumel; - for (const auto ic : c10::irange(C)) { - int ic_4 = ic / 4; - int ic_4_i = ic % 4; - float* dst_ic = dst_oc + ic_4 * KW * KH * 16; - for (const auto ky : c10::irange(KH)) { - float* dst_ky = dst_ic + ky * KW * 16; - for (const auto kx : c10::irange(KW)) { - float* dst_kx = dst_ky + kx * 16; - dst_kx[4 * ic_4_i + oc_4_i] = src[ridx++]; - } - } - } - } - } - mappedMemory.flushWriteToDevice(); - return kernelBuffer; -} - -VBuffer bufferFromOptionalHostData( - c10::optional data, - const uint32_t dataSize, - const uint32_t bufferSize) { - TORCH_INTERNAL_ASSERT( - dataSize <= bufferSize, - "buffer size(", - bufferSize, - ") is not enough for data(", - dataSize, - ")"); - const auto sizeAligned = - ROUND_UP(bufferSize, context().limits().minStorageBufferOffsetAlignment); - VBuffer buffer{sizeAligned}; - if (data.has_value()) { - buffer.copy_from_host_to_device(*data, dataSize); - } else { - buffer.set_zeros(); - } - return buffer; -} - -VBuffer bufferZeros(const uint32_t size) { - VBuffer buffer{size}; - buffer.set_zeros(); - return buffer; -} - -void conv2d_depthwise( - VulkanTensor& output, - const VulkanTensor& input, - const VulkanTensor& weight, - const VBuffer& biasBuffer, - const Conv2DParams& params, - c10::optional output_min, - c10::optional output_max) { - TORCH_INTERNAL_ASSERT(params.G == params.C); - auto osizes = output.sizes(); - TORCH_INTERNAL_ASSERT(osizes[2] == params.OH); - TORCH_INTERNAL_ASSERT(osizes[3] == params.OW); - struct ConstBlock { - int32_t padding[2]; - int32_t kernelSize[2]; - int32_t stride[2]; - int32_t dilate[2]; - int32_t inputSize[4]; - int32_t outputSize[4]; - float outputMin; - float outputMax; - }; - ConstBlock cb{ - {safe_downcast(params.PX), safe_downcast(params.PY)}, - {safe_downcast(params.KW), 
safe_downcast(params.KH)}, - {safe_downcast(params.SX), safe_downcast(params.SY)}, - {safe_downcast(params.DX), safe_downcast(params.DY)}, - {safe_downcast(params.OW), - safe_downcast(params.OH), - safe_downcast(params.OC_4), - 0}, - {safe_downcast(params.W), - safe_downcast(params.H), - safe_downcast(params.C_4), - 0}, - output_min ? *output_min : -std::numeric_limits::infinity(), - output_max ? *output_max : std::numeric_limits::infinity()}; - VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb)); - - VkDescriptorSetLayout descriptorSetLayout{}; - VkDescriptorPool descriptorPool{}; - VkDescriptorSet descriptorSet{}; - std::vector descriptorTypes{ - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER}; - auto device = context().device(); - createDescriptorSetLayoutSinglePool( - device, - descriptorTypes, - &descriptorSetLayout, - &descriptorPool, - &descriptorSet); - - output.image()->bindStorageImage(descriptorSet, 0); - input.image()->bindShaderRead(descriptorSet, 1); - weight.image()->bindShaderRead(descriptorSet, 2); - biasBuffer.bind(descriptorSet, 3); - constBuffer.bind(descriptorSet, 4); - - WorkGroupSize workGroupSize{8, 8, 1}; - auto& computeUnit = context().computeUnitFactory().get( - GLSL_SPV(conv2d_dw_clamp), descriptorSetLayout, workGroupSize); - computeUnit.createCommandBuffer(descriptorSet); - auto commandBuffer = computeUnit.commandBuffer(); - output.image()->addImageMemoryBarrierToGeneral(commandBuffer); - input.image()->addImageMemoryBarrierToShaderRead(commandBuffer); - weight.image()->addImageMemoryBarrierToShaderRead(commandBuffer); - computeUnit.dispatchCommandBuffer( - params.OW, params.OH, params.OC_4, workGroupSize); - computeUnit.endCommandBuffer(); - computeUnit.submitAndWaitCommandBuffer(); - - vkDestroyDescriptorPool(device, descriptorPool, nullptr); - vkDestroyDescriptorSetLayout(device, descriptorSetLayout, nullptr); -} - -void conv2d_depthwise( - VulkanTensor& output, - const VulkanTensor& input, - const VulkanTensor& weight, - const c10::optional bias, - const Conv2DParams params, - c10::optional output_min, - c10::optional output_max) { - conv2d_depthwise( - output, - input, - weight, - bufferFromOptionalHostData( - bias, - sizeof(float) * params.OC, - sizeof(float) * ALIGN_UP4(params.OC)), - params, - output_min, - output_max); -} - -void conv2d_depthwise( - VulkanTensor& output, - const VulkanTensor& input, - const float* weight, - const c10::optional bias, - const Conv2DParams params, - c10::optional output_min, - c10::optional output_max) { - VulkanTensor weightTensor{{params.OC, params.KH, params.KW}}; - weightTensor.set_data_from_host(weight); - conv2d_depthwise( - output, - input, - weightTensor, - bufferFromOptionalHostData( - bias, - sizeof(float) * params.OC, - sizeof(float) * ALIGN_UP4(params.OC)), - params, - output_min, - output_max); -} - -ImageSizes conv2d_prepack_weights_image_sizes( - int64_t argOC, - int64_t argC, - int64_t KH, - int64_t KW) { - const int32_t C = safe_downcast(argC); - const int32_t OC = safe_downcast(argOC); - const int32_t Cup4 = ALIGN_UP4(C); - const int32_t OC_4 = UP_DIV(OC, 4); - const int32_t Z = safe_downcast(KH) * safe_downcast(KW); - return {{Cup4, OC_4, Z}, {Cup4, OC_4, Z}}; -} - -void conv2d_prepack_weights_to_image( - VImage& image, - const float* weight, - int64_t OC, - int64_t C, - int64_t KH, - int64_t KW) { - auto kernelBuffer = 
kernelNCHW_OCHW_repack_O4C4HWi4o4(weight, OC, C, KH, KW); - auto OC_4 = UP_DIV(OC, 4); - auto C_4 = UP_DIV(C, 4); - - auto expectedSizes = conv2d_prepack_weights_image_sizes(OC, C, KH, KW); - TORCH_INTERNAL_ASSERT( - image.sizes() == expectedSizes.imageSize, - "Out VImage sizes do not match expected"); - - struct ConstBlock { - int32_t KWxKH; - int32_t C_4; - }; - ConstBlock cb{safe_downcast(KW * KH), safe_downcast(C_4)}; - VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb)); - - VkDescriptorSetLayout descriptorSetLayout{}; - VkDescriptorPool descriptorPool{}; - VkDescriptorSet descriptorSet{}; - std::vector descriptorTypes{ - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER}; - createDescriptorSetLayoutSinglePool( - context().device(), - descriptorTypes, - &descriptorSetLayout, - &descriptorPool, - &descriptorSet); - - image.bindStorageImage(descriptorSet, 0); - kernelBuffer.bind(descriptorSet, 1); - constBuffer.bind(descriptorSet, 2); - - WorkGroupSize workGroupSize{1, 1, 1}; - auto& computeUnit = context().computeUnitFactory().get( - GLSL_SPV(KO4C4HW_to_image), descriptorSetLayout, workGroupSize); - computeUnit.createCommandBuffer(descriptorSet); - auto commandBuffer = computeUnit.commandBuffer(); - image.addImageMemoryBarrierToGeneral(commandBuffer); - kernelBuffer.addBufferMemoryBarrier( - commandBuffer, 0, kernelBuffer.sizeBytes()); - computeUnit.addMemoryBarrier( - VK_PIPELINE_STAGE_HOST_BIT, - VK_ACCESS_HOST_WRITE_BIT, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_ACCESS_SHADER_READ_BIT); - computeUnit.dispatchCommandBuffer(C_4, OC_4, KH * KW, workGroupSize); - computeUnit.endCommandBuffer(); - computeUnit.submitAndWaitCommandBuffer(); - vkDestroyDescriptorPool(context().device(), descriptorPool, nullptr); - vkDestroyDescriptorSetLayout( - context().device(), descriptorSetLayout, nullptr); -} - -VImage conv2d_prepack_weights_image( - const float* weight, - int64_t OC, - int64_t C, - int64_t KH, - int64_t KW) { - VImage image{conv2d_prepack_weights_image_sizes(OC, C, KH, KW)}; - conv2d_prepack_weights_to_image(image, weight, OC, C, KH, KW); - return image; -} - -void conv2d_prepack_weights( - VulkanTensor& output, - const float* weight, - int64_t OC, - int64_t C, - int64_t KH, - int64_t KW) { - auto imageSizes = conv2d_prepack_weights_image_sizes(OC, C, KH, KW); - conv2d_prepack_weights_to_image( - *(output.image(imageSizes)), weight, OC, C, KH, KW); -} - -void conv2d( - VulkanTensor& output, - const VulkanTensor& input, - const VImage& kernelImage, - const VBuffer& biasBuffer, - const Conv2DParams& params, - c10::optional output_min, - c10::optional output_max) { - TORCH_INTERNAL_ASSERT( - params.G == 1, "Prepacked kernel VImage for non-group conv2d only"); - auto osizes = output.sizes(); - TORCH_INTERNAL_ASSERT( - osizes[2] == params.OH, - "Output tensor dims do not match specified conv2d params"); - TORCH_INTERNAL_ASSERT( - osizes[3] == params.OW, - "Output tensor dims do not match specified conv2d params"); - - struct ConstBlock { - int32_t padding[2]; - int32_t kernelSize[2]; - int32_t stride[2]; - int32_t dilate[2]; - int32_t inputSize[4]; - int32_t outputSize[4]; - float outputMin; - float outputMax; - }; - float outputMin = - output_min ? *output_min : -std::numeric_limits::infinity(); - float outputMax = - output_max ? 
*output_max : std::numeric_limits::infinity(); - ConstBlock cb{ - {safe_downcast(params.PX), safe_downcast(params.PY)}, - {safe_downcast(params.KW), safe_downcast(params.KH)}, - {safe_downcast(params.SX), safe_downcast(params.SY)}, - {safe_downcast(params.DX), safe_downcast(params.DY)}, - {safe_downcast(params.OW), - safe_downcast(params.OH), - safe_downcast(params.OC_4), - safe_downcast(params.OC)}, - {safe_downcast(params.W), - safe_downcast(params.H), - safe_downcast(params.C_4), - safe_downcast(params.C)}, - outputMin, - outputMax}; - VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb)); - - auto device = context().device(); - VkDescriptorSetLayout descriptorSetLayout{}; - VkDescriptorPool descriptorPool{}; - VkDescriptorSet descriptorSet{}; - std::vector descriptorTypes{ - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER}; - createDescriptorSetLayoutSinglePool( - device, - descriptorTypes, - &descriptorSetLayout, - &descriptorPool, - &descriptorSet); - - output.image()->bindStorageImage(descriptorSet, 0); - input.image()->bindShaderRead(descriptorSet, 1); - kernelImage.bindShaderRead(descriptorSet, 2); - biasBuffer.bind(descriptorSet, 3); - constBuffer.bind(descriptorSet, 4); - - WorkGroupSize workGroupSize{1, 1, safe_downcast(params.OC_4)}; - auto& computeUnit = context().computeUnitFactory().get( - GLSL_SPV(conv2d_nogroup_clamp), descriptorSetLayout, workGroupSize); - computeUnit.createCommandBuffer(descriptorSet); - auto commandBuffer = computeUnit.commandBuffer(); - output.image()->addImageMemoryBarrierToGeneral(commandBuffer); - input.image()->addImageMemoryBarrierToShaderRead(commandBuffer); - kernelImage.addImageMemoryBarrierToShaderRead(commandBuffer); - computeUnit.dispatchCommandBuffer( - UP_DIV(params.OW, 4 * workGroupSize.x), - UP_DIV(params.OH, workGroupSize.y), - UP_DIV(params.OC_4, workGroupSize.z)); - computeUnit.endCommandBuffer(); - computeUnit.submitAndWaitCommandBuffer(); - - vkDestroyDescriptorPool(device, descriptorPool, nullptr); - vkDestroyDescriptorSetLayout(device, descriptorSetLayout, nullptr); -} - -void conv2d( - VulkanTensor& output, - const VulkanTensor& input, - const VImage& kernelImage, - const c10::optional bias, - const Conv2DParams& params, - c10::optional output_min, - c10::optional output_max) { - TORCH_INTERNAL_ASSERT( - params.G == 1, "Prepacked kernel VImage for non-group conv2d only"); - conv2d( - output, - input, - kernelImage, - bufferFromOptionalHostData( - bias, - sizeof(float) * params.OC, - sizeof(float) * ALIGN_UP4(params.OC)), - params, - output_min, - output_max); -} - -void conv2d( - VulkanTensor& output, - const VulkanTensor& input, - const VulkanTensor& weight_prepacked, - c10::optional bias, - const Conv2DParams params, - c10::optional output_min, - c10::optional output_max) { - if (params.G > 1) { - conv2d_depthwise( - output, - input, - weight_prepacked, - bufferFromOptionalHostData( - bias, - sizeof(float) * params.OC, - sizeof(float) * ALIGN_UP4(params.OC)), - params, - output_min, - output_max); - return; - } - - conv2d( - output, - input, - *(weight_prepacked.image()), - bias, - params, - output_min, - output_max); -} - -void conv2d( - VulkanTensor& output, - const VulkanTensor& input, - const VulkanTensor& weight_prepacked, - const VulkanTensor& bias, - const Conv2DParams params, - c10::optional output_min, - c10::optional output_max) { - if (params.G > 1) { - 
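// Grouped convolutions are dispatched to the depthwise kernel, which itself
// asserts G == C; non-grouped calls fall through to the prepacked-image path
// below.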
conv2d_depthwise( - output, - input, - weight_prepacked, - *(bias.buffer()), - params, - output_min, - output_max); - return; - } - - conv2d( - output, - input, - *(weight_prepacked.image()), - *(bias.buffer()), - params, - output_min, - output_max); -} - -void conv2d( - VulkanTensor& output, - const VulkanTensor& input, - const float* weight, - const c10::optional bias, - const Conv2DParams params, - c10::optional output_min, - c10::optional output_max) { - if (params.G > 1) { - TORCH_INTERNAL_ASSERT( - params.G == params.C, - "Vulkan conv2d supports only no-group and depthwise"); - conv2d_depthwise( - output, input, weight, bias, params, output_min, output_max); - return; - } - - conv2d( - output, - input, - conv2d_prepack_weights_image( - weight, params.OC, params.C, params.KH, params.KW), - bias, - params, - output_min, - output_max); -} - -void clamp( - VulkanTensor& output, - const VulkanTensor& input, - float min, - float max) { - auto sizes = output.sizes(); - auto C = sizes[0] * sizes[1]; - auto H = sizes[2]; - auto W = sizes[3]; - - auto device = context().device(); - struct ConstBlock { - float min; - float max; - }; - ConstBlock cb{min, max}; - VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb)); - - VkDescriptorSetLayout descriptorSetLayout{}; - VkDescriptorPool descriptorPool{}; - VkDescriptorSet descriptorSet{}; - std::vector descriptorTypes{ - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER}; - createDescriptorSetLayoutSinglePool( - device, - descriptorTypes, - &descriptorSetLayout, - &descriptorPool, - &descriptorSet); - - output.image()->bindStorageImage(descriptorSet, 0); - input.image()->bindShaderRead(descriptorSet, 1); - constBuffer.bind(descriptorSet, 2); - - WorkGroupSize workGroupSize{8, 8, 1}; - auto& computeUnit = context().computeUnitFactory().get( - GLSL_SPV(clamp), descriptorSetLayout, workGroupSize); - computeUnit.createCommandBuffer(descriptorSet); - auto commandBuffer = computeUnit.commandBuffer(); - output.image()->addImageMemoryBarrierToGeneral(commandBuffer); - input.image()->addImageMemoryBarrierToShaderRead(commandBuffer); - computeUnit.dispatchCommandBuffer(W, H, C, workGroupSize); - computeUnit.endCommandBuffer(); - computeUnit.submitAndWaitCommandBuffer(); - vkDestroyDescriptorPool(device, descriptorPool, nullptr); - vkDestroyDescriptorSetLayout(device, descriptorSetLayout, nullptr); -} - -void addmm( - VulkanTensor& output, - c10::optional t, - const VulkanTensor& m1, - const VulkanTensor& m2, - float beta, - float alpha) { - bool hasT = t.has_value(); - const auto m1Sizes = m1.sizes(); - const auto m2Sizes = m2.sizes(); - TORCH_INTERNAL_ASSERT(m1Sizes.size() == 2); - TORCH_INTERNAL_ASSERT(m2Sizes.size() == 2); - const auto m1W = m1Sizes[1]; - const auto m1C = 1; - const auto m2H = m2Sizes[0]; - const auto m2C = 1; - const auto OH = m1Sizes[0]; - const auto OW = m2Sizes[1]; - - TORCH_INTERNAL_ASSERT(m1W == m2H); - TORCH_INTERNAL_ASSERT(m1C == m2C); - - const auto C = m1C; - const auto C_4 = UP_DIV(C, 4); - - auto device = context().device(); - - struct ConstBlock { - float alpha; - float beta; - }; - ConstBlock cb{alpha, beta}; - VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb)); - - VkDescriptorSetLayout descriptorSetLayout{}; - VkDescriptorPool descriptorPool{}; - VkDescriptorSet descriptorSet{}; - std::vector descriptorTypes{}; - if (hasT) { - descriptorTypes = { - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - 
VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, - }; - } else { - descriptorTypes = { - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - }; - } - - createDescriptorSetLayoutSinglePool( - device, - descriptorTypes, - &descriptorSetLayout, - &descriptorPool, - &descriptorSet); - - output.image()->bindStorageImage(descriptorSet, 0); - m1.image()->bindShaderRead(descriptorSet, 1); - m2.image()->bindShaderRead(descriptorSet, 2); - if (hasT) { - (*t).image()->bindShaderRead(descriptorSet, 3); - constBuffer.bind(descriptorSet, 4); - } - - WorkGroupSize workGroupSize{8, 8, 1}; - if (hasT) { - auto& computeUnit = context().computeUnitFactory().get( - GLSL_SPV(addmm), descriptorSetLayout, workGroupSize); - computeUnit.createCommandBuffer(descriptorSet); - auto commandBuffer = computeUnit.commandBuffer(); - output.image()->addImageMemoryBarrierToGeneral(commandBuffer); - m1.image()->addImageMemoryBarrierToShaderRead(commandBuffer); - m2.image()->addImageMemoryBarrierToShaderRead(commandBuffer); - (*t).image()->addImageMemoryBarrierToShaderRead(commandBuffer); - computeUnit.dispatchCommandBuffer(OW, OH, C_4, workGroupSize); - computeUnit.endCommandBuffer(); - computeUnit.submitAndWaitCommandBuffer(); - } else { - auto& computeUnit = context().computeUnitFactory().get( - GLSL_SPV(mm), descriptorSetLayout, workGroupSize); - computeUnit.createCommandBuffer(descriptorSet); - auto commandBuffer = computeUnit.commandBuffer(); - output.image()->addImageMemoryBarrierToGeneral(commandBuffer); - m1.image()->addImageMemoryBarrierToShaderRead(commandBuffer); - m2.image()->addImageMemoryBarrierToShaderRead(commandBuffer); - computeUnit.dispatchCommandBuffer(OW, OH, C_4, workGroupSize); - computeUnit.endCommandBuffer(); - computeUnit.submitAndWaitCommandBuffer(); - } - vkDestroyDescriptorPool(device, descriptorPool, nullptr); - vkDestroyDescriptorSetLayout(device, descriptorSetLayout, nullptr); -} - -void mean(VulkanTensor& output, const VulkanTensor& input) { - auto isizes = input.sizes(); - int32_t N = safe_downcast(isizes[0]); - int32_t C = safe_downcast(isizes[1]); - int32_t H = safe_downcast(isizes[2]); - int32_t W = safe_downcast(isizes[3]); - - auto device = context().device(); - struct ConstBlock { - int32_t W; - int32_t H; - }; - ConstBlock cb{W, H}; - VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb)); - - VkDescriptorSetLayout descriptorSetLayout{}; - VkDescriptorPool descriptorPool{}; - VkDescriptorSet descriptorSet{}; - std::vector descriptorTypes{ - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER}; - createDescriptorSetLayoutSinglePool( - device, - descriptorTypes, - &descriptorSetLayout, - &descriptorPool, - &descriptorSet); - - output.image()->bindStorageImage(descriptorSet, 0); - input.image()->bindShaderRead(descriptorSet, 1); - constBuffer.bind(descriptorSet, 2); - - WorkGroupSize workGroupSize{1, 1, 1}; - auto& computeUnit = context().computeUnitFactory().get( - GLSL_SPV(mean2d), descriptorSetLayout, workGroupSize); - computeUnit.createCommandBuffer(descriptorSet); - auto commandBuffer = computeUnit.commandBuffer(); - output.image()->addImageMemoryBarrierToGeneral(commandBuffer); - input.image()->addImageMemoryBarrierToShaderRead(commandBuffer); - computeUnit.dispatchCommandBuffer(C, N, 1, workGroupSize); - computeUnit.endCommandBuffer(); - 
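// The (C, N, 1) dispatch recorded above launches one invocation per
// (channel, batch) pair; the mean2d shader reduces over the W x H plane whose
// extents are passed in through the const block.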
computeUnit.submitAndWaitCommandBuffer(); - vkDestroyDescriptorPool(device, descriptorPool, nullptr); - vkDestroyDescriptorSetLayout(device, descriptorSetLayout, nullptr); -} - -} // namespace detail -} // namespace vulkan -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/vulkan/VulkanOps.h b/aten/src/ATen/native/vulkan/VulkanOps.h deleted file mode 100644 index b1064df9e2c1..000000000000 --- a/aten/src/ATen/native/vulkan/VulkanOps.h +++ /dev/null @@ -1,153 +0,0 @@ -#pragma once - -#include -#include -#include - -namespace at { -namespace native { -namespace vulkan { -namespace detail { - -void upsample_nearest2d( - VulkanTensor& output, - const VulkanTensor& input, - int64_t IH, - int64_t IW, - int64_t OH, - int64_t OW, - int64_t N, - int64_t C, - float scaleH, - float scaleW); - -void adaptive_avg_pool2d( - VulkanTensor& output, - const VulkanTensor& input, - const int64_t IH, - const int64_t IW, - const int64_t OH, - const int64_t OW, - const int64_t IN, - const int64_t IC); - -void max_pool2d( - VulkanTensor& output, - const VulkanTensor& input, - const int iH, - const int iW, - const int oH, - const int oW, - const int _n, - const int _c, - const int kH, - const int kW, - const int dH, - const int dW, - const int padH, - const int padW, - const int dilationH, - const int dilationW); - -void avg_pool2d( - VulkanTensor& output, - const VulkanTensor& input, - const int iH, - const int iW, - const int oH, - const int oW, - const int _n, - const int _c, - const int kH, - const int kW, - const int dH, - const int dW, - const int padH, - const int padW); - -VulkanTensor transpose( - const VulkanTensor& input, - const int64_t dim0, - const int64_t dim1); - -VulkanTensor slice( - const VulkanTensor& input, - const int64_t dim, - const int64_t start, - const int64_t end, - const int64_t step); - -VulkanTensor reshape_copy( - const VulkanTensor& input, - std::vector shape); - -VulkanTensor cat( - VulkanTensor& output, - ArrayRef inputs, - int64_t dim); - -void add( - VulkanTensor& output, - const VulkanTensor& input0, - const VulkanTensor& input1, - float alpha); - -void mul(VulkanTensor& output, const VulkanTensor& input, const float s); - -void add(VulkanTensor& output, const VulkanTensor& input, const float s); - -void conv2d_prepack_weights( - VulkanTensor& output, - const float* weight, - int64_t OC, - int64_t C, - int64_t KH, - int64_t KW); - -void conv2d( - VulkanTensor& output, - const VulkanTensor& input, - const float* weight, - const c10::optional bias, - const Conv2DParams params, - c10::optional output_min = c10::nullopt, - c10::optional output_max = c10::nullopt); - -void conv2d( - VulkanTensor& output, - const VulkanTensor& input, - const VulkanTensor& weight_prepacked, - const c10::optional bias, - const Conv2DParams params, - c10::optional output_min = c10::nullopt, - c10::optional output_max = c10::nullopt); - -void conv2d( - VulkanTensor& output, - const VulkanTensor& input, - const VulkanTensor& weight_prepacked, - const VulkanTensor& bias, - const Conv2DParams params, - c10::optional output_min = c10::nullopt, - c10::optional output_max = c10::nullopt); - -void clamp( - VulkanTensor& output, - const VulkanTensor& input, - float min, - float max); - -void addmm( - VulkanTensor& output, - c10::optional t, - const VulkanTensor& m1, - const VulkanTensor& m2, - float beta, - float alpha); - -void mean(VulkanTensor& output, const VulkanTensor& input); - -} // namespace detail -} // namespace vulkan -} // namespace native -} // namespace at diff --git 
a/aten/src/ATen/native/vulkan/VulkanRegisterOpContextClass.cpp b/aten/src/ATen/native/vulkan/VulkanRegisterOpContextClass.cpp deleted file mode 100644 index 0a1c5fcea72d..000000000000 --- a/aten/src/ATen/native/vulkan/VulkanRegisterOpContextClass.cpp +++ /dev/null @@ -1,59 +0,0 @@ -#include -#include - -#include -#include - -namespace at { -namespace native { -namespace vulkan { - -#ifndef USE_VULKAN_API - -using detail::convolution2d::createConv2dClampPrePackOpContext; - -TORCH_LIBRARY(vulkan, m) { - m.class_("Conv2dOpContext") - .def_pickle( - [](const c10::intrusive_ptr& op_context) - -> SerializationTypeConv2dPrePack { // __getstate__ - return op_context->unpack(); - }, - [](SerializationTypeConv2dPrePack state) - -> c10::intrusive_ptr { // __setstate__ - return createConv2dClampPrePackOpContext( - std::move(std::get<0>(state)), - std::move(std::get<1>(state)), - std::move(std::get<2>(state)), - std::move(std::get<3>(state)), - std::move(std::get<4>(state)), - std::move(std::get<5>(state)), - std::move(std::get<6>(state)), - std::move(std::get<7>(state))); - }); -} - -TORCH_LIBRARY(vulkan_prepack, m) { - m.def( - "conv2d_clamp_prepack(Tensor W, Tensor? B, int[2] stride, " - "int[2] padding, int[2] dilation, int groups, " - "Scalar? output_min=None, Scalar? output_max=None) " - "-> __torch__.torch.classes.vulkan.Conv2dOpContext"); - m.def( - "conv2d_clamp_run(Tensor X, " - "__torch__.torch.classes.vulkan.Conv2dOpContext W_prepack) -> Tensor Y"); -} - -TORCH_LIBRARY_IMPL(vulkan_prepack, CPU, m) { - m.impl("conv2d_clamp_prepack", TORCH_FN(createConv2dClampPrePackOpContext)); -} - -TORCH_LIBRARY_IMPL(vulkan_prepack, Vulkan, m) { - m.impl("conv2d_clamp_run", detail::convolution2d::conv2d_clamp_run); -} - -#endif /* USE_VULKAN_API */ - -} // namespace vulkan -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/vulkan/api/Adapter.cpp b/aten/src/ATen/native/vulkan/api/Adapter.cpp new file mode 100644 index 000000000000..461e1ec92dcc --- /dev/null +++ b/aten/src/ATen/native/vulkan/api/Adapter.cpp @@ -0,0 +1,398 @@ +#include +#include +#include + +namespace at { +namespace native { +namespace vulkan { +namespace api { + +namespace { + +void find_requested_device_extensions( + VkPhysicalDevice physical_device, + std::vector& enabled_extensions, + const std::vector& requested_extensions) { + uint32_t device_extension_properties_count = 0; + VK_CHECK(vkEnumerateDeviceExtensionProperties( + physical_device, nullptr, &device_extension_properties_count, nullptr)); + std::vector device_extension_properties( + device_extension_properties_count); + VK_CHECK(vkEnumerateDeviceExtensionProperties( + physical_device, + nullptr, + &device_extension_properties_count, + device_extension_properties.data())); + + std::vector enabled_device_extensions; + + for (const auto& requested_extension : requested_extensions) { + for (const auto& extension : device_extension_properties) { + if (strcmp(requested_extension, extension.extensionName) == 0) { + enabled_extensions.push_back(requested_extension); + break; + } + } + } +} + +// +// Print utils +// + +std::string get_device_type_str(const VkPhysicalDeviceType type) { + switch(type) { + case VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU: + return "INTEGRATED_GPU"; + case VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU: + return "DISCRETE_GPU"; + case VK_PHYSICAL_DEVICE_TYPE_VIRTUAL_GPU: + return "VIRTUAL_GPU"; + case VK_PHYSICAL_DEVICE_TYPE_CPU: + return "CPU"; + default: + return "UNKOWN"; + } +} + +std::string get_memory_properties_str(const 
VkMemoryPropertyFlags flags) { + std::bitset<10> values(flags); + std::stringstream ss("|"); + if (values[0]) { + ss << " DEVICE_LOCAL |"; + } + if (values[1]) { + ss << " HOST_VISIBLE |"; + } + if (values[2]) { + ss << " HOST_COHERENT |"; + } + if (values[3]) { + ss << " HOST_CACHED |"; + } + if (values[4]) { + ss << " LAZILY_ALLOCATED |"; + } + + return ss.str(); +} + +std::string get_queue_family_properties_str(const VkQueueFlags flags) { + std::bitset<10> values(flags); + std::stringstream ss("|"); + if (values[0]) { + ss << " GRAPHICS |"; + } + if (values[1]) { + ss << " COMPUTE |"; + } + if (values[2]) { + ss << " TRANSFER |"; + } + + return ss.str(); +} + +} // namespace + +Adapter::Adapter(const VkPhysicalDevice handle, const uint32_t num_queues) + : physical_handle_(handle), + properties_{}, + memory_properties_{}, + queue_families_{}, + num_requested_queues_{num_queues}, + queue_usage_{}, + handle_(VK_NULL_HANDLE), + queues_{}, + num_compute_queues_{}, + has_unified_memory_{false}, + timestamp_compute_and_graphics_{false}, + timestamp_period_{0.f} { + // This should never happen, but double check to be safe + TORCH_CHECK( + VK_NULL_HANDLE != physical_handle_, + "Pytorch Vulkan Adapter: VK_NULL_HANDLE passed to Adapter constructor!") + + vkGetPhysicalDeviceProperties(physical_handle_, &properties_); + vkGetPhysicalDeviceMemoryProperties(physical_handle_, &memory_properties_); + + timestamp_compute_and_graphics_ = properties_.limits.timestampComputeAndGraphics; + timestamp_period_ = properties_.limits.timestampPeriod; + + // Check if there are any memory types have both the HOST_VISIBLE and the + // DEVICE_LOCAL property flags + const VkMemoryPropertyFlags unified_memory_flags = + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; + for (const uint32_t i : c10::irange(memory_properties_.memoryTypeCount)) { + if (memory_properties_.memoryTypes[i].propertyFlags | unified_memory_flags) { + has_unified_memory_ = true; + break; + } + } + + uint32_t queue_family_count = 0; + vkGetPhysicalDeviceQueueFamilyProperties( + physical_handle_, &queue_family_count, nullptr); + + queue_families_.resize(queue_family_count); + vkGetPhysicalDeviceQueueFamilyProperties( + physical_handle_, &queue_family_count, queue_families_.data()); + + // Find the total number of compute queues + for (const uint32_t family_i : c10::irange(queue_families_.size())) { + const VkQueueFamilyProperties& properties = queue_families_[family_i]; + // Check if this family has compute capability + if (properties.queueFlags & VK_QUEUE_COMPUTE_BIT) { + num_compute_queues_ += properties.queueCount; + } + } + + queue_usage_.reserve(num_requested_queues_); + queues_.reserve(num_requested_queues_); +} + +Adapter::Adapter(Adapter&& other) noexcept + : physical_handle_(other.physical_handle_), + properties_(other.properties_), + memory_properties_(other.memory_properties_), + queue_families_(std::move(other.queue_families_)), + num_requested_queues_(other.num_requested_queues_), + queue_usage_(std::move(other.queue_usage_)), + handle_(other.handle_), + queues_(std::move(other.queues_)), + num_compute_queues_(other.num_compute_queues_), + has_unified_memory_(other.has_unified_memory_), + timestamp_compute_and_graphics_(other.timestamp_compute_and_graphics_), + timestamp_period_(other.timestamp_period_) { + other.physical_handle_ = VK_NULL_HANDLE; + other.handle_ = VK_NULL_HANDLE; +} + +Adapter::~Adapter() { + if C10_LIKELY(VK_NULL_HANDLE == handle_) { + return; + } + vkDestroyDevice(handle_, nullptr); + handle_ 
= VK_NULL_HANDLE; +} + +void Adapter::init_device() { + // It is possible that multiple threads will attempt to initialize the device + // simultaneously, so lock the mutex before initializing + std::lock_guard lock(mutex_); + + // Do not initialize the device if there are no compute queues available + TORCH_CHECK( + num_compute_queues_ > 0, + "Pytorch Vulkan Adapter: Cannot initialize Adapter as this device does not " + "have any queues that support compute!") + + // This device has already been initialized, no-op + if C10_LIKELY(VK_NULL_HANDLE != handle_) { + return; + } + + // + // Find compute queues up to the requested number of queues + // + + std::vector queue_create_infos; + queue_create_infos.reserve(num_requested_queues_); + + std::vector> queues_to_get; + queues_to_get.reserve(num_requested_queues_); + + uint32_t remaining_queues = num_requested_queues_; + for (const uint32_t family_i : c10::irange(queue_families_.size())) { + const VkQueueFamilyProperties& properties = queue_families_[family_i]; + // Check if this family has compute capability + if (properties.queueFlags & VK_QUEUE_COMPUTE_BIT) { + const uint32_t queues_to_init = std::min( + remaining_queues, properties.queueCount); + + const std::vector queue_priorities(queues_to_init, 1.0f); + queue_create_infos.push_back({ + VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO, // sType + nullptr, // pNext + 0u, // flags + family_i, // queueFamilyIndex + queues_to_init, // queueCount + queue_priorities.data(), // pQueuePriorities + }); + + for (const uint32_t queue_i : c10::irange(queues_to_init)) { + // Use this to get the queue handle once device is created + queues_to_get.emplace_back(family_i, queue_i); + } + remaining_queues -= queues_to_init; + } + if (remaining_queues == 0) { + break; + } + } + + // + // Create the VkDevice + // + + std::vector requested_device_extensions { + #ifdef VK_KHR_portability_subset + VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME, + #endif + }; + + std::vector enabled_device_extensions; + find_requested_device_extensions( + physical_handle_, enabled_device_extensions, requested_device_extensions); + + const VkDeviceCreateInfo device_create_info{ + VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, // sType + nullptr, // pNext + 0u, // flags + static_cast(queue_create_infos.size()), // queueCreateInfoCount + queue_create_infos.data(), // pQueueCreateInfos + 0u, // enabledLayerCount + nullptr, // ppEnabledLayerNames + static_cast(enabled_device_extensions.size()), // enabledExtensionCount + enabled_device_extensions.data(), // ppEnabledExtensionNames + nullptr, // pEnabledFeatures + }; + + const VkResult device_create_res = vkCreateDevice( + physical_handle_, &device_create_info, nullptr, &handle_); + // If device was not created successfully, ensure handle_ is invalid and throw + if (VK_SUCCESS != device_create_res) { + handle_ = VK_NULL_HANDLE; + VK_CHECK(device_create_res); + } + +#ifdef USE_VULKAN_VOLK + volkLoadDevice(handle_); +#endif + + // + // Obtain handles for the created queues and initialize queue usage heuristic + // + + for (const std::pair& queue_idx : queues_to_get) { + VkQueue queue_handle = VK_NULL_HANDLE; + VkQueueFlags flags = queue_families_[queue_idx.first].queueFlags; + vkGetDeviceQueue( + handle_, queue_idx.first, queue_idx.second, &queue_handle); + queues_.push_back({queue_idx.first, queue_idx.second, flags, queue_handle}); + // Initial usage value + queue_usage_.push_back(0); + } +} + +Adapter::Queue Adapter::request_queue() { + // Lock the mutex as multiple threads can request a queue at the 
same time + std::lock_guard lock(mutex_); + + uint32_t min_usage = UINT32_MAX; + uint32_t min_used_i = 0; + for (const uint32_t i : c10::irange(queues_.size())) { + if (queue_usage_[i] < min_usage) { + min_used_i = i; + min_usage = queue_usage_[i]; + } + } + queue_usage_[min_used_i] += 1; + + return queues_[min_used_i]; +} + +void Adapter::return_queue(Adapter::Queue& compute_queue) { + for (const uint32_t i : c10::irange(queues_.size())) { + if ((queues_[i].family_index == compute_queue.family_index) && + (queues_[i].queue_index == compute_queue.queue_index)) { + std::lock_guard lock(mutex_); + queue_usage_[i] -= 1; + break; + } + } +} + +std::string Adapter::stringize() const { + std::stringstream ss; + + uint32_t v_major = VK_VERSION_MAJOR(properties_.apiVersion); + uint32_t v_minor = VK_VERSION_MINOR(properties_.apiVersion); + std::string device_type = get_device_type_str(properties_.deviceType); + VkPhysicalDeviceLimits limits = properties_.limits; + + ss << "{" << std::endl; + ss << " Physical Device Info {" << std::endl; + ss << " apiVersion: " << v_major << "." << v_minor << std::endl; + ss << " driverversion: " << properties_.driverVersion << std::endl; + ss << " deviceType: " << device_type << std::endl; + ss << " deviceName: " << properties_.deviceName << std::endl; + +#define PRINT_LIMIT_PROP(name) \ + ss << " " << std::left << std::setw(36) << #name << limits.name << std::endl; + +#define PRINT_LIMIT_PROP_VEC3(name) \ + ss << " " << std::left << std::setw(36) << #name \ + << limits.name[0] << "," \ + << limits.name[1] << "," \ + << limits.name[2] << std::endl; + + ss << " Physical Device Limits {" << std::endl; + PRINT_LIMIT_PROP(maxImageDimension1D); + PRINT_LIMIT_PROP(maxImageDimension2D); + PRINT_LIMIT_PROP(maxImageDimension3D); + PRINT_LIMIT_PROP(maxTexelBufferElements); + PRINT_LIMIT_PROP(maxPushConstantsSize); + PRINT_LIMIT_PROP(maxMemoryAllocationCount); + PRINT_LIMIT_PROP(maxSamplerAllocationCount); + PRINT_LIMIT_PROP(maxComputeSharedMemorySize); + PRINT_LIMIT_PROP_VEC3(maxComputeWorkGroupCount); + PRINT_LIMIT_PROP(maxComputeWorkGroupInvocations); + PRINT_LIMIT_PROP_VEC3(maxComputeWorkGroupSize); + ss << " }" << std::endl; + ss << " }" << std::endl;; + + const VkPhysicalDeviceMemoryProperties& mem_props = memory_properties_; + ss << " Memory Info {" << std::endl; + ss << " Memory Types [" << std::endl; + for (int i = 0; i < mem_props.memoryTypeCount; ++i) { + ss << " " << " [Heap " << mem_props.memoryTypes[i].heapIndex << "] " + << get_memory_properties_str(mem_props.memoryTypes[i].propertyFlags) + << std::endl; + } + ss << " ]" << std::endl; + ss << " Memory Heaps [" << std::endl; + for (int i = 0; i < mem_props.memoryHeapCount; ++i) { + ss << " " << mem_props.memoryHeaps[i].size << std::endl; + } + ss << " ]" << std::endl; + ss << " }" << std::endl; + + ss << " Queue Families {" << std::endl; + for (const VkQueueFamilyProperties& queue_family_props : queue_families_) { + ss << " (" << queue_family_props.queueCount << " Queues) " + << get_queue_family_properties_str(queue_family_props.queueFlags) << std::endl; + } + ss << " }" << std::endl; + ss << " VkDevice: " << handle_ << std::endl; + ss << " Compute Queues [" << std::endl; + for (const Adapter::Queue& compute_queue : queues_) { + ss << " Family " << compute_queue.family_index + << ", Queue " << compute_queue.queue_index + << ": " << compute_queue.handle << std::endl;; + } + ss << " ]" << std::endl; + ss << "}"; + + return ss.str(); +} + +std::ostream& operator<<(std::ostream& os, const Adapter& adapter) { + os << 
adapter.stringize() << std::endl; + return os; +} + +} // namespace api +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/vulkan/api/Adapter.h b/aten/src/ATen/native/vulkan/api/Adapter.h index b4203530f635..a7aa29cc5baa 100644 --- a/aten/src/ATen/native/vulkan/api/Adapter.h +++ b/aten/src/ATen/native/vulkan/api/Adapter.h @@ -5,6 +5,8 @@ #include #include #include +#include +#include namespace at { namespace native { @@ -12,29 +14,103 @@ namespace vulkan { namespace api { // -// A Vulkan Adapter represents a physical device and its properties. Adapters -// are enumerated through the Runtime and are used in creation of Contexts. -// Each tensor in PyTorch is associated with a Context to make the -// device <-> tensor affinity explicit. +// A Vulkan Adapter represents a logical device and all its properties. It +// manages the relevant properties of the underlying physical device, a +// handle to the logical device, and a number of compute queues available to +// the device. It is primarily responsible for managing the VkDevice handle, +// which points to the logical device object on the GPU. // +// This class is used primarily by the Runtime class, which holds one Adapter +// instance for each physical device visible to the VkInstance. Upon construction, +// this class will populate the physical device properties, but will not create +// the logical device until specifically requested via the init_device() function. +// +// init_device() will create the logical device and obtain the VkDevice handle +// for it. It will also create a number of compute queues, up to the amount +// requested when the Adapter instance was constructed. +// +// Contexts (each of which represents one thread of execution) will request a compute +// queue from an Adapter. The Adapter will then select a compute queue to +// assign to the Context, attempting to balance load between all available +// queues. This allows different Contexts (which typically execute on +// separate threads) to run concurrently; a brief usage sketch follows below. 
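To make the lifecycle described in the comment above concrete, here is a minimal sketch of how a caller might drive an Adapter. It is illustrative only and not part of this patch; it assumes a valid VkPhysicalDevice named physical_device enumerated from the VkInstance, and it uses only the member functions declared in the class below.

#include <ATen/native/vulkan/api/Adapter.h>

void adapter_lifecycle_sketch(VkPhysicalDevice physical_device) {
  using namespace at::native::vulkan::api;

  // Construction only records physical device properties; no VkDevice yet.
  Adapter adapter(physical_device, /*num_queues=*/2);

  // Creates the logical device and the requested number of compute queues.
  adapter.init_device();

  // A Context would borrow the least-used queue for its own submissions...
  Adapter::Queue queue = adapter.request_queue();
  // ... record and submit work against queue.handle here ...

  // ...and hand it back so the per-queue usage counters stay balanced.
  adapter.return_queue(queue);
}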
+// + +class Adapter final { + public: + explicit Adapter(const VkPhysicalDevice handle, const uint32_t num_queues); + + Adapter(const Adapter&) = delete; + Adapter& operator=(const Adapter&) = delete; + + Adapter(Adapter&&) noexcept; + Adapter& operator=(Adapter&&) = delete; + + ~Adapter(); + + struct Queue { + uint32_t family_index; + uint32_t queue_index; + VkQueueFlags capabilities; + VkQueue handle; + }; + + private: + // Use a mutex to manage resources held by this class since + // it can be accessed from multiple threads + std::mutex mutex_; + // Physical Device Properties + VkPhysicalDevice physical_handle_; + VkPhysicalDeviceProperties properties_; + VkPhysicalDeviceMemoryProperties memory_properties_; + std::vector queue_families_; + // Queue Management + uint32_t num_requested_queues_; + std::vector queue_usage_; + // Handles + VkDevice handle_; + std::vector queues_; + // Metadata + uint32_t num_compute_queues_; + bool has_unified_memory_; + bool timestamp_compute_and_graphics_; + float timestamp_period_; + + public: + inline VkPhysicalDevice physical_handle() const { + return physical_handle_; + } -struct Adapter final { - Runtime* runtime; - VkPhysicalDevice handle; - VkPhysicalDeviceProperties properties; - VkPhysicalDeviceMemoryProperties memory_properties; - uint32_t compute_queue_family_index; + inline VkDevice device_handle() const { + return handle_; + } inline bool has_unified_memory() const { - // Ideally iterate over all memory types to see if there is a pool that - // is both host-visible, and device-local. This should be a good proxy - // for now. - return VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU == properties.deviceType; + return has_unified_memory_; + } + + inline uint32_t num_compute_queues() const { + return num_compute_queues_; + } + + inline bool timestamp_compute_and_graphics() const { + return timestamp_compute_and_graphics_; + } + + inline float timestamp_period() const { + return timestamp_period_; } + void init_device(); + Queue request_queue(); + void return_queue(Queue& compute_queue); + inline Shader::WorkGroup local_work_group_size() const { return { 4u, 4u, 4u, }; } + + std::string stringize() const; + friend std::ostream& operator<<(std::ostream& os, const Adapter& adapter); }; } // namespace api diff --git a/aten/src/ATen/native/vulkan/api/Command.cpp b/aten/src/ATen/native/vulkan/api/Command.cpp index 692796a736ab..7f32681a6f5b 100644 --- a/aten/src/ATen/native/vulkan/api/Command.cpp +++ b/aten/src/ATen/native/vulkan/api/Command.cpp @@ -333,7 +333,7 @@ inline void Command::Buffer::Barrier::reset() { Command::Pool::Pool(const GPU& gpu) : device_(gpu.device), command_pool_( - create_command_pool(gpu.device, gpu.adapter->compute_queue_family_index), + create_command_pool(gpu.device, gpu.queue_family_index), VK_DELETER(CommandPool)(device_)), buffer_{} { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( diff --git a/aten/src/ATen/native/vulkan/api/Common.h b/aten/src/ATen/native/vulkan/api/Common.h index 827ee85cb788..84bbeaa6f0e1 100644 --- a/aten/src/ATen/native/vulkan/api/Common.h +++ b/aten/src/ATen/native/vulkan/api/Common.h @@ -32,7 +32,11 @@ #define VK_CHECK(function) \ do { \ const VkResult result = (function); \ - TORCH_CHECK(VK_SUCCESS == result, "VkResult:", result); \ + TORCH_CHECK( \ + VK_SUCCESS == result, \ + C10_STRINGIZE(__FILE__), " [", \ + C10_STRINGIZE(__LINE__), "] " \ + "VkResult:", result); \ } while (false) #define VK_CHECK_RELAXED(function) \ @@ -61,7 +65,7 @@ namespace native { namespace vulkan { namespace api { -struct Adapter; +class 
Adapter; struct Command; class Context; struct Descriptor; @@ -71,8 +75,10 @@ class Runtime; struct Shader; struct GPU final { + VkInstance instance; const Adapter* adapter; VkDevice device; + uint32_t queue_family_index; VkQueue queue; }; diff --git a/aten/src/ATen/native/vulkan/api/Context.cpp b/aten/src/ATen/native/vulkan/api/Context.cpp index d65a89895714..260d10dbe686 100644 --- a/aten/src/ATen/native/vulkan/api/Context.cpp +++ b/aten/src/ATen/native/vulkan/api/Context.cpp @@ -1,6 +1,7 @@ #include -#include +#include #include +#include #include @@ -103,23 +104,19 @@ VkQueue acquire_queue( } // namespace -Context::Context(const Adapter& adapter) - : adapter_(adapter), - device_( - create_device( - adapter.handle, - adapter.compute_queue_family_index), - &VK_DELETER(Device)), - queue_(acquire_queue(device(), adapter.compute_queue_family_index)), +Context::Context(const VkInstance instance, size_t adapter_i) + : instance_(instance), + adapter_i_(adapter_i), + device_(runtime()->get_adapter(adapter_i).device_handle()), + queue_(runtime()->get_adapter(adapter_i).request_queue()), shader_(gpu()), pipeline_(gpu()), threadcontext_(gpu()) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - device_, - "Invalid Vulkan device!"); } Context::~Context() { + // Let the device know the context is done with the queue + runtime()->get_adapter(adapter_i_).return_queue(queue_); // Do not call flush() since all per-thread objects will be destroyed as each thread exits } @@ -155,12 +152,7 @@ bool available() { Context* context() { static const std::unique_ptr context([]() -> Context* { try { - const Adapter adapter = runtime()->select([](const Adapter& adapter) { - // Select the first adapter. - return true; - }); - - return new Context(adapter); + return new Context(runtime()->instance(), runtime()->default_adapter_i()); } catch (const std::exception& e) { TORCH_CHECK(false, "Vulkan: Failed to initialize context! Error: ", e.what()); @@ -196,7 +188,6 @@ Descriptor::Set dispatch_prologue( const Shader::Descriptor& shader_descriptor, const Shader::WorkGroup& local_work_group_size) { Context* const context = api::context(); - const GPU gpu = context->gpu(); Descriptor& descriptor = context->descriptor(); Pipeline& pipeline = context->pipeline(); Shader& shader = context->shader(); diff --git a/aten/src/ATen/native/vulkan/api/Context.h b/aten/src/ATen/native/vulkan/api/Context.h index e38c4e59227a..7b1bb85f9230 100644 --- a/aten/src/ATen/native/vulkan/api/Context.h +++ b/aten/src/ATen/native/vulkan/api/Context.h @@ -27,16 +27,20 @@ namespace api { class Context final { public: - explicit Context(const Adapter& adapter); + explicit Context(const VkInstance instance, size_t adapter_i); + Context(const Context&) = delete; - Context(Context&&) = default; Context& operator=(const Context&) = delete; + + Context(Context&&) = default; Context& operator=(Context&&) = default; + ~Context(); GPU gpu(); Command& command(); Shader& shader(); + QueryPool& querypool(); Pipeline& pipeline(); Descriptor& descriptor(); Resource& resource(); @@ -67,15 +71,19 @@ class Context final { private: // Construction and destruction order matters. Do not move members around. - Adapter adapter_; - Handle device_; - VkQueue queue_; + VkInstance instance_; + size_t adapter_i_; + VkDevice device_; + Adapter::Queue queue_; Shader shader_; Pipeline pipeline_; ThreadContext threadcontext_; }; bool available(); + +// The global runtime is retrieved using this function, where it is declared as +// a static local variable. 
Context* context(); // @@ -84,10 +92,13 @@ Context* context(); inline GPU Context::gpu() { // A GPU is simply a (physical device, logical device, device queue) trio. + const Adapter* p_adapter = runtime()->get_adapter_p(adapter_i_); return { - &adapter_, - device(), - queue(), + instance_, + p_adapter, + device_, + queue_.family_index, + queue_.handle, }; } @@ -111,14 +122,16 @@ inline Resource& Context::resource() { return threadcontext_.resource(); } +inline QueryPool& Context::querypool() { + return threadcontext_.querypool(); +} + inline VkDevice Context::device() { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device_); - return device_.get(); + return device_; } inline VkQueue Context::queue() { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(queue_); - return queue_; + return queue_.handle; } namespace detail { diff --git a/aten/src/ATen/native/vulkan/api/OpProfiler.h b/aten/src/ATen/native/vulkan/api/OpProfiler.h new file mode 100644 index 000000000000..b38b5dc95729 --- /dev/null +++ b/aten/src/ATen/native/vulkan/api/OpProfiler.h @@ -0,0 +1,38 @@ +#pragma once + +#ifdef USE_VULKAN_API + +#include + +namespace at { +namespace native { +namespace vulkan { +namespace api { + +class OpProfiler final { + public: + explicit OpProfiler(Command::Buffer& buffer, QueryPool& querypool, const std::string& query_name) + : buffer_(buffer), + querypool_(querypool) { + query_index_ = querypool.begin(buffer_.handle(), query_name); + } + OpProfiler(const OpProfiler&) = delete; + OpProfiler(OpProfiler&&) = delete; + OpProfiler& operator=(const OpProfiler&) = delete; + OpProfiler& operator=(OpProfiler&&) = delete; + ~OpProfiler() { + querypool_.end(buffer_.handle(), query_index_); + } + +private: + Command::Buffer& buffer_; + QueryPool& querypool_; + int query_index_; +}; + +} // namespace api +} // namespace vulkan +} // namespace native +} // namespace at + +#endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/native/vulkan/api/QueryPool.cpp b/aten/src/ATen/native/vulkan/api/QueryPool.cpp new file mode 100644 index 000000000000..9e12e3be3e3f --- /dev/null +++ b/aten/src/ATen/native/vulkan/api/QueryPool.cpp @@ -0,0 +1,120 @@ +#include +#include + +namespace at { +namespace native { +namespace vulkan { +namespace api { +namespace { + +VkQueryPool create_query_pool(const VkDevice& device, const uint32_t queryCount) { + VkQueryPool queryPool{}; + VkQueryPoolCreateInfo info{}; + info.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO; + info.queryType = VK_QUERY_TYPE_TIMESTAMP; + info.queryCount = queryCount; + VK_CHECK(vkCreateQueryPool(device, &info, nullptr, &queryPool)); + return queryPool; +}; + +void destroy_query_pool(const VkDevice& device, const VkQueryPool& querypool) { + if (VK_NULL_HANDLE != device && VK_NULL_HANDLE != querypool) { + vkDestroyQueryPool(device, querypool, nullptr); + } +} + +} // namespace + +QueryPool::QueryPool(const VkDevice& device, const bool is_timestamps_supported, const float timestamp_period_us) + : device_(device), + is_timestamps_supported_(is_timestamps_supported), + timestamp_period_us_(timestamp_period_us), + querypool_(VK_NULL_HANDLE) { +} + +QueryPool::~QueryPool() { + destroy_query_pool(device_, querypool_); + querypool_ = VK_NULL_HANDLE; + query_names_.clear(); +} + +bool QueryPool::is_enabled() const { + return VK_NULL_HANDLE != querypool_; +} + +bool QueryPool::enable() { + TORCH_CHECK(VK_NULL_HANDLE == querypool_, "The query pool already exists."); + TORCH_CHECK(is_timestamps_supported_, "The device doesn't support for timestamps on all graphics and compute queues."); + 
querypool_ = create_query_pool(device_, Configuration::kMaxQueryCount); + return is_enabled(); +} + +std::vector QueryPool::disable(const bool waitfor_allqueries/* = true*/) { + auto out = result(waitfor_allqueries); + destroy_query_pool(device_, querypool_); + querypool_ = VK_NULL_HANDLE; + query_names_.clear(); + return out; +} + +int QueryPool::begin(const VkCommandBuffer& commandBuffer, const std::string& query_name) { + if (VK_NULL_HANDLE == querypool_ || VK_NULL_HANDLE == commandBuffer) { + return -1; + } + auto newQueryIndex = static_cast(query_names_.size()); + TORCH_CHECK(newQueryIndex < Configuration::kMaxQueryCount, "The query index cannot exceed Configuration::kMaxQueryCount."); + query_names_.push_back(query_name); + + vkCmdWriteTimestamp( + commandBuffer, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, querypool_, newQueryIndex * Configuration::kTimestampsPerQuery); + return static_cast(newQueryIndex); +} + +void QueryPool::end(const VkCommandBuffer& commandBuffer, const int queryIndex) { + if (VK_NULL_HANDLE == querypool_ || VK_NULL_HANDLE == commandBuffer) { + return; + } + vkCmdWriteTimestamp( + commandBuffer, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, querypool_, static_cast(queryIndex) * Configuration::kTimestampsPerQuery + 1u); +} + +std::vector QueryPool::result(const bool waitfor_allqueries) const { + if (VK_NULL_HANDLE == querypool_) { + return std::vector {}; + } + + std::vector perfInfo; + const VkQueryResultFlags flags = waitfor_allqueries ? (VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT) : VK_QUERY_RESULT_64_BIT; + std::array counter_data{}; + for (uint32_t queryIndex = 0u; queryIndex < query_names_.size(); ++queryIndex) { + const auto& query_name = query_names_[queryIndex]; + + // Grab the gpu timings (nanoseconds) + auto ret = vkGetQueryPoolResults(device_, querypool_, queryIndex * Configuration::kTimestampsPerQuery, Configuration::kTimestampsPerQuery, + sizeof(uint64_t) * counter_data.size(), counter_data.data(), sizeof(uint64_t), + flags); + if (ret != VK_SUCCESS) { + std::stringstream msg; + msg << "vkGetQueryPoolResults() for \"" << query_name << "\"" << " returned an error code " << ret << "."; + TORCH_WARN(msg.str()); + continue; + } + + // Tally up GPU time + int64_t gpu_time_us = static_cast( + (static_cast(counter_data[1] - counter_data[0]) * + timestamp_period_us_) / 1'000.f); // convert ns to us + + perfInfo.emplace_back(QueryPool::PerfInfo { + query_name, + static_cast(static_cast(counter_data[0]) * timestamp_period_us_ / 1'000.f), + static_cast(static_cast(counter_data[1]) * timestamp_period_us_ / 1'000.f), + gpu_time_us }); + } + return perfInfo; +} + +} // namespace api +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/vulkan/api/QueryPool.h b/aten/src/ATen/native/vulkan/api/QueryPool.h new file mode 100644 index 000000000000..edabba7fa705 --- /dev/null +++ b/aten/src/ATen/native/vulkan/api/QueryPool.h @@ -0,0 +1,57 @@ +#pragma once + +#ifdef USE_VULKAN_API + +#include +#include +#include + +namespace at { +namespace native { +namespace vulkan { +namespace api { + +class QueryPool final { + public: + explicit QueryPool(const VkDevice& device, const bool is_timestamps_supported, const float timestamp_period_us); + QueryPool(const QueryPool&) = delete; + QueryPool(QueryPool&&) = default; + QueryPool& operator=(const QueryPool&) = delete; + QueryPool& operator=(QueryPool&&) = default; + ~QueryPool(); + +public: + struct PerfInfo final { + std::string query_name; + int64_t start_time_us; + int64_t end_time_us; + 
int64_t execution_time_us; + }; + + struct Configuration final { + static constexpr uint32_t kTimestampsPerQuery = 2u; + static constexpr uint32_t kMaxQueryCount = 65536u; + }; + +public: + bool is_enabled() const; + bool enable(); + std::vector disable(const bool waitfor_allqueries = true); + int begin(const VkCommandBuffer& commandBuffer, const std::string& query_name); + void end(const VkCommandBuffer& commandBuffer, const int queryIndex); + std::vector result(const bool waitfor_allqueries) const; + +private: + VkDevice device_; + bool is_timestamps_supported_; + float timestamp_period_us_; + VkQueryPool querypool_; + std::vector query_names_; +}; + +} // namespace api +} // namespace vulkan +} // namespace native +} // namespace at + +#endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/native/vulkan/api/Resource.cpp b/aten/src/ATen/native/vulkan/api/Resource.cpp index a6cd3c62da9b..520ccc87d533 100644 --- a/aten/src/ATen/native/vulkan/api/Resource.cpp +++ b/aten/src/ATen/native/vulkan/api/Resource.cpp @@ -364,8 +364,8 @@ Resource::Pool::Pool( : device_(gpu.device), allocator_( create_allocator( - gpu.adapter->runtime->instance(), - gpu.adapter->handle, + gpu.instance, + gpu.adapter->physical_handle(), device_), vmaDestroyAllocator), memory_{ diff --git a/aten/src/ATen/native/vulkan/api/Runtime.cpp b/aten/src/ATen/native/vulkan/api/Runtime.cpp index b90c81eb6435..c925a0226f6a 100644 --- a/aten/src/ATen/native/vulkan/api/Runtime.cpp +++ b/aten/src/ATen/native/vulkan/api/Runtime.cpp @@ -1,22 +1,132 @@ #include #include -#include - -#include namespace at { namespace native { namespace vulkan { namespace api { + namespace { -struct Configuration final { -#ifdef DEBUG - static constexpr Runtime::Type kRuntime = Runtime::Type::Debug; -#else - static constexpr Runtime::Type kRuntime = Runtime::Type::Release; + +void find_requested_layers_and_extensions( + std::vector& enabled_layers, + std::vector& enabled_extensions, + const std::vector& requested_layers, + const std::vector& requested_extensions) { + + // Get supported instance layers + uint32_t layer_count = 0; + VK_CHECK(vkEnumerateInstanceLayerProperties(&layer_count, nullptr)); + + std::vector layer_properties(layer_count); + VK_CHECK(vkEnumerateInstanceLayerProperties( + &layer_count, layer_properties.data())); + + // Search for requested layers + for (const auto& requested_layer : requested_layers) { + for (const auto& layer : layer_properties) { + if (strcmp(requested_layer, layer.layerName) == 0) { + enabled_layers.push_back(requested_layer); + break; + } + } + } + + // Get supported instance extensions + uint32_t extension_count = 0; + VK_CHECK(vkEnumerateInstanceExtensionProperties( + nullptr, &extension_count, nullptr)); + + std::vector extension_properties(extension_count); + VK_CHECK(vkEnumerateInstanceExtensionProperties( + nullptr, &extension_count, extension_properties.data())); + + // Search for requested extensions + for (const auto& requested_extension : requested_extensions) { + for (const auto& extension : extension_properties) { + if (strcmp(requested_extension, extension.extensionName) == 0) { + enabled_extensions.push_back(requested_extension); + break; + } + } + } +} + +VkInstance create_instance(const RuntimeConfiguration& config) { + const VkApplicationInfo application_info{ + VK_STRUCTURE_TYPE_APPLICATION_INFO, // sType + nullptr, // pNext + "PyTorch Vulkan Backend", // pApplicationName + 0, // applicationVersion + nullptr, // pEngineName + 0, // engineVersion + VK_API_VERSION_1_0, // apiVersion + }; + + 
std::vector enabled_layers; + std::vector enabled_extensions; + + if (config.enableValidationMessages) { + std::vector requested_layers { + // "VK_LAYER_LUNARG_api_dump", + "VK_LAYER_KHRONOS_validation", + }; + std::vector requested_extensions { + #ifdef VK_EXT_debug_report + VK_EXT_DEBUG_REPORT_EXTENSION_NAME, + #endif + }; + + find_requested_layers_and_extensions( + enabled_layers, + enabled_extensions, + requested_layers, + requested_extensions); + } + + const VkInstanceCreateInfo instance_create_info{ + VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, // sType + nullptr, // pNext + 0u, // flags + &application_info, // pApplicationInfo + static_cast(enabled_layers.size()), // enabledLayerCount + enabled_layers.data(), // ppEnabledLayerNames + static_cast(enabled_extensions.size()), // enabledExtensionCount + enabled_extensions.data(), // ppEnabledExtensionNames + }; + + VkInstance instance{}; + VK_CHECK(vkCreateInstance(&instance_create_info, nullptr, &instance)); + TORCH_CHECK(instance, "Invalid Vulkan instance!"); + +#ifdef USE_VULKAN_VOLK + volkLoadInstance(instance); #endif -}; + + return instance; +} + +std::vector create_adapters(const VkInstance instance, + const uint32_t num_queues) { + if (VK_NULL_HANDLE == instance) { + return std::vector(); + } + + uint32_t device_count = 0; + VK_CHECK(vkEnumeratePhysicalDevices(instance, &device_count, nullptr)); + + std::vector devices(device_count); + VK_CHECK(vkEnumeratePhysicalDevices(instance, &device_count, devices.data())); + + std::vector adapters; + adapters.reserve(device_count); + for (const VkPhysicalDevice physical_device : devices) { + adapters.emplace_back(physical_device, num_queues); + } + + return adapters; +} VKAPI_ATTR VkBool32 VKAPI_CALL debug_report_callback_fn( const VkDebugReportFlagsEXT flags, @@ -46,113 +156,22 @@ VKAPI_ATTR VkBool32 VKAPI_CALL debug_report_callback_fn( return VK_FALSE; } -VkInstance create_instance(const Runtime::Type type) { - std::vector enabled_instance_layers; - std::vector enabled_instance_extensions; - - if (Runtime::Type::Debug == type) { - uint32_t instance_layers_count = 0; - VK_CHECK(vkEnumerateInstanceLayerProperties( - &instance_layers_count, nullptr)); - - std::vector instance_layer_properties( - instance_layers_count); - - VK_CHECK(vkEnumerateInstanceLayerProperties( - &instance_layers_count, - instance_layer_properties.data())); - - constexpr const char* const requested_instance_layers[]{ - // "VK_LAYER_LUNARG_api_dump", - "VK_LAYER_KHRONOS_validation", - }; - - for (const auto& requested_instance_layer : requested_instance_layers) { - for (const auto& layer : instance_layer_properties) { - if (strcmp(requested_instance_layer, layer.layerName) == 0) { - enabled_instance_layers.push_back(requested_instance_layer); - break; - } - } - } - - uint32_t instance_extension_count = 0; - VK_CHECK(vkEnumerateInstanceExtensionProperties( - nullptr, &instance_extension_count, nullptr)); - - std::vector instance_extension_properties( - instance_extension_count); - - VK_CHECK(vkEnumerateInstanceExtensionProperties( - nullptr, &instance_extension_count, instance_extension_properties.data())); - - constexpr const char* const requested_instance_extensions[]{ - #ifdef VK_EXT_debug_report - VK_EXT_DEBUG_REPORT_EXTENSION_NAME, - #endif - }; - - for (const auto& requested_instance_extension : requested_instance_extensions) { - for (const auto& extension : instance_extension_properties) { - if (strcmp(requested_instance_extension, extension.extensionName) == 0) { - 
enabled_instance_extensions.push_back(requested_instance_extension); - break; - } - } - } - } - - constexpr VkApplicationInfo application_info{ - VK_STRUCTURE_TYPE_APPLICATION_INFO, - nullptr, - "PyTorch", - 0, - "PyTorch", - 0, - VK_API_VERSION_1_0, - }; - -const VkInstanceCreateInfo instance_create_info{ - VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, - nullptr, - 0u, - &application_info, - static_cast(enabled_instance_layers.size()), - enabled_instance_layers.data(), - static_cast(enabled_instance_extensions.size()), - enabled_instance_extensions.data(), - }; - - VkInstance instance{}; - VK_CHECK(vkCreateInstance(&instance_create_info, nullptr, &instance)); - TORCH_CHECK(instance, "Invalid Vulkan instance!"); - -#ifdef USE_VULKAN_WRAPPER -#ifdef USE_VULKAN_VOLK - volkLoadInstance(instance); -#endif -#endif - - return instance; -} - VkDebugReportCallbackEXT create_debug_report_callback( - const VkInstance instance, - const Runtime::Type type) { - if (Runtime::Type::Debug != type) { + const VkInstance instance, const RuntimeConfiguration config) { + if (VK_NULL_HANDLE == instance || !config.enableValidationMessages) { return VkDebugReportCallbackEXT{}; } const VkDebugReportCallbackCreateInfoEXT debugReportCallbackCreateInfo{ - VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT, - nullptr, + VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT, // sType + nullptr, // pNext VK_DEBUG_REPORT_INFORMATION_BIT_EXT | VK_DEBUG_REPORT_WARNING_BIT_EXT | VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT | VK_DEBUG_REPORT_ERROR_BIT_EXT | - VK_DEBUG_REPORT_DEBUG_BIT_EXT, - debug_report_callback_fn, - nullptr, + VK_DEBUG_REPORT_DEBUG_BIT_EXT, // flags + debug_report_callback_fn, // pfnCallback + nullptr, // pUserData }; const auto vkCreateDebugReportCallbackEXT = @@ -177,179 +196,177 @@ VkDebugReportCallbackEXT create_debug_report_callback( return debug_report_callback; } -std::vector acquire_physical_devices( - const VkInstance instance) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - instance, - "Invalid Vulkan instance!"); +// +// Adapter selection methods +// - uint32_t device_count = 0; - VK_CHECK(vkEnumeratePhysicalDevices(instance, &device_count, nullptr)); - - TORCH_CHECK( - device_count > 0, - "Vulkan: Could not find a device with Vulkan support!"); +uint32_t select_first(const std::vector& adapters) { + if (adapters.size() == 0) { + TORCH_WARN("Pytorch Vulkan Runtime: no device adapters are available for selection!"); + return adapters.size() + 1; // return out of range to signal invalidity + } - std::vector devices(device_count); - VK_CHECK(vkEnumeratePhysicalDevices(instance, &device_count, devices.data())); + // Select the first adapter that has compute capability + for (const uint32_t i : c10::irange(adapters.size())) { + if (adapters[i].num_compute_queues() > 0) { + return i; + } + } - return devices; + TORCH_WARN("Pytorch Vulkan Runtime: no device adapters support compute!"); + return adapters.size() + 1; } -VkPhysicalDeviceProperties query_physical_device_properties( - const VkPhysicalDevice physical_device) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - physical_device, - "Invalid Vulkan physical device!"); - - VkPhysicalDeviceProperties physical_device_properties{}; - vkGetPhysicalDeviceProperties( - physical_device, - &physical_device_properties); - - return physical_device_properties; -} +// +// Global runtime initialization +// + +std::unique_ptr init_global_vulkan_runtime() { + // Load Vulkan drivers +#if defined(USE_VULKAN_VOLK) + if (VK_SUCCESS != volkInitialize()) { + TORCH_WARN( + "Pytorch 
Vulkan Runtime: Failed to load Vulkan driver using volkInitialize()! " + "The global vulkan runtime is invalid."); + return std::unique_ptr(nullptr); + } +#elif defined(USE_VULKAN_WRAPPER) + if (!InitVulkan()) { + TORCH_WARN( + "Pytorch Vulkan Runtime: Failed to load Vulkan driver using initVulkan()! " + "The global vulkan runtime is invalid."); + return std::unique_ptr(nullptr); + } +#endif /* USE_VULKAN_VOLK, USE_VULKAN_WRAPPER */ -VkPhysicalDeviceMemoryProperties query_physical_device_memory_properties( - const VkPhysicalDevice physical_device) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - physical_device, - "Invalid Vulkan physical device!"); + const bool enableValidationMessages = +#if defined(DEBUG) + true; +#else + false; +#endif /* DEBUG */ + const bool initDefaultDevice = true; + const uint32_t numRequestedQueues = 1; // TODO: raise this value + + const RuntimeConfiguration default_config { + enableValidationMessages, + initDefaultDevice, + AdapterSelector::First, + numRequestedQueues, + }; - VkPhysicalDeviceMemoryProperties physical_device_memory_properties{}; - vkGetPhysicalDeviceMemoryProperties( - physical_device, - &physical_device_memory_properties); + try { + return std::make_unique(Runtime(default_config)); + } + catch (const std::exception& e) { + TORCH_WARN( + "Pytorch Vulkan Runtime: Failed to initialize the global vulkan runtime! " + "The global vulkan runtime is invalid. Error: ", + e.what()); + } + catch (...) { + TORCH_WARN( + "Pytorch Vulkan Runtime: Failed to initialize the global vulkan runtime! " + "The global vulkan runtime is invalid. " + "Error: Unknown"); + } - return physical_device_memory_properties; + return std::unique_ptr(nullptr); } -uint32_t query_compute_queue_family_index(const VkPhysicalDevice physical_device) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - physical_device, - "Invalid Vulkan physical device!"); - - uint32_t queue_family_count = 0; - vkGetPhysicalDeviceQueueFamilyProperties( - physical_device, &queue_family_count, nullptr); - - TORCH_CHECK( - queue_family_count > 0, - "Vulkan: Invalid number of queue families!"); - - std::vector - queue_families_properties(queue_family_count); - - vkGetPhysicalDeviceQueueFamilyProperties( - physical_device, - &queue_family_count, - queue_families_properties.data()); +} // namespace - for (const auto i : c10::irange(queue_families_properties.size())) { - const VkQueueFamilyProperties& properties = queue_families_properties[i]; - if (properties.queueCount > 0 && (properties.queueFlags & VK_QUEUE_COMPUTE_BIT)) { - return i; +Runtime::Runtime(const RuntimeConfiguration config) + : instance_(create_instance(config)), + adapters_(create_adapters(instance_, config.numRequestedQueues)), + default_adapter_i_{}, + debug_report_callback_(create_debug_report_callback(instance_, config)) { + if (config.initDefaultDevice) { + try { + switch(config.defaultSelector) { + case AdapterSelector::First: + default_adapter_i_ = init_adapter(select_first); + } + } + catch (const std::exception& e) { + TORCH_WARN( + "Pytorch Vulkan Runtime: Could not initialize default device! Error: ", + e.what()); + } + catch (...) { + TORCH_WARN( + "Pytorch Vulkan Runtime: Could not initialize default device! 
Error: " + "Unknown."); } } - - TORCH_CHECK( - false, - "Vulkan: Could not find a queue family that supports compute operations!"); } -} // namespace +Runtime::~Runtime() { + if C10_LIKELY(VK_NULL_HANDLE == instance_) { + return; + } -Runtime::Debug::Debug(const VkInstance instance) - : instance_(instance) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - instance, - "Invalid Vulkan instance!"); -} + // Clear adapters list to trigger device destruction before destroying VkInstance + adapters_.clear(); -void Runtime::Debug::operator()( - const VkDebugReportCallbackEXT debug_report_callback) const { - if (debug_report_callback) { + // Instance must be destroyed last as its used to destroy the debug report callback. + if (debug_report_callback_) { const auto vkDestroyDebugReportCallbackEXT = (PFN_vkDestroyDebugReportCallbackEXT)vkGetInstanceProcAddr( instance_, "vkDestroyDebugReportCallbackEXT"); TORCH_CHECK( vkDestroyDebugReportCallbackEXT, - "Could not load vkDestroyDebugReportCallbackEXT"); + "Pytorch Vulkan Runtime: Could not load vkDestroyDebugReportCallbackEXT " + "when destroying debug_report_callback_"); vkDestroyDebugReportCallbackEXT( - instance_, debug_report_callback, nullptr); + instance_, debug_report_callback_, nullptr); + + debug_report_callback_ = {}; } -} -Runtime::Runtime(const Type type) - : instance_(create_instance(type), &VK_DELETER(Instance)), - debug_report_callback_( - create_debug_report_callback(instance(), type), - Debug(instance())) { + vkDestroyInstance(instance_, nullptr); + instance_ = VK_NULL_HANDLE; } -Adapter Runtime::select(const Selector& selector) { - const std::vector physical_devices = - acquire_physical_devices(instance()); - - for (const VkPhysicalDevice physical_device : physical_devices) { - const Adapter adapter{ - this, - physical_device, - query_physical_device_properties(physical_device), - query_physical_device_memory_properties(physical_device), - query_compute_queue_family_index(physical_device), - }; - - if (selector(adapter)) { - return adapter; - } - } - - TORCH_CHECK( - false, - "Vulkan: no adapter was selected as part of device enumeration!"); +Runtime::Runtime(Runtime&& other) noexcept + : instance_(other.instance_), + adapters_(std::move(other.adapters_)), + default_adapter_i_(other.default_adapter_i_), + debug_report_callback_(other.debug_report_callback_) { + other.instance_ = VK_NULL_HANDLE; + other.debug_report_callback_ = {}; } -Runtime* runtime() { - static const std::unique_ptr runtime([]() -> Runtime* { -#ifdef USE_VULKAN_WRAPPER -#ifdef USE_VULKAN_VOLK - if (VK_SUCCESS != volkInitialize()) { - TORCH_WARN("Vulkan: Failed to initialize Volk!"); - return nullptr; - } -#else - if (!InitVulkan()) { - TORCH_WARN("Vulkan: Failed to initialize Vulkan Wrapper!"); - return nullptr; - } -#endif /* USE_VULKAN_VOLK */ -#endif /* USE_VULKAN_WRAPPER */ +uint32_t Runtime::init_adapter(const Selector& selector) { + TORCH_CHECK( + adapters_.size() > 0, + "Pytorch Vulkan Runtime: Could not initialize adapter because no " + "devices were found by the Vulkan instance."); - try { - return new Runtime(Configuration::kRuntime); - } - catch (const std::exception& e) { - TORCH_WARN( - "Vulkan: Failed to initialize runtime! Error: ", - e.what()); - } - catch (...) { - TORCH_WARN( - "Vulkan: Failed to initialize runtime! " - "Error: Unknown"); - } + uint32_t i = selector(adapters_); + TORCH_CHECK( + i < adapters_.size(), + "Pytorch Vulkan Runtime: no suitable device adapter was selected! 
" + "Device could not be initialized"); - return nullptr; - }()); + adapters_[i].init_device(); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - runtime, - "Invalid Vulkan runtime!"); + return i; +} - return runtime.get(); +Runtime* runtime() { + // The global vulkan runtime is declared as a static local variable within a + // non-static function to ensure it has external linkage. If it were a global + // static variable there would be one copy per translation unit that includes + // Runtime.h as it would have internal linkage. + static const std::unique_ptr p_runtime = init_global_vulkan_runtime(); + TORCH_CHECK( + p_runtime, + "Pytorch Vulkan Runtime: The global runtime could not be retrieved " + "because it failed to initialize."); + return p_runtime.get(); } } // namespace api diff --git a/aten/src/ATen/native/vulkan/api/Runtime.h b/aten/src/ATen/native/vulkan/api/Runtime.h index 55eae70f8723..140c0869d627 100644 --- a/aten/src/ATen/native/vulkan/api/Runtime.h +++ b/aten/src/ATen/native/vulkan/api/Runtime.h @@ -19,52 +19,76 @@ namespace api { // are associated with a Context to make tensor <-> device affinity explicit. // +enum AdapterSelector { + First, +}; + +struct RuntimeConfiguration final { + bool enableValidationMessages; + bool initDefaultDevice; + AdapterSelector defaultSelector; + uint32_t numRequestedQueues; +}; + class Runtime final { public: - enum class Type { - Debug, - Release, - }; + explicit Runtime(const RuntimeConfiguration config); - explicit Runtime(Type type); + // Do not allow copying. There should be only one global instance of this class. Runtime(const Runtime&) = delete; Runtime& operator=(const Runtime&) = delete; - Runtime(Runtime&&) = default; - Runtime& operator=(Runtime&&) = default; - ~Runtime() = default; - VkInstance instance() const; + Runtime(Runtime&&) noexcept; + Runtime& operator=(Runtime&&) = delete; - typedef std::function Selector; - Adapter select(const Selector& selector); + ~Runtime(); private: - class Debug final { - public: - explicit Debug(VkInstance); - void operator()(VkDebugReportCallbackEXT) const; + VkInstance instance_; + std::vector adapters_; + uint32_t default_adapter_i_; - private: - VkInstance instance_; - }; + VkDebugReportCallbackEXT debug_report_callback_; - private: - // Construction and destruction order matters. Do not move members around. - Handle instance_; - Handle debug_report_callback_; + public: + inline VkInstance instance() const { + return instance_; + } + + inline Adapter* get_adapter_p() { + TORCH_CHECK( + default_adapter_i_ >= 0 && default_adapter_i_ < adapters_.size(), + "Pytorch Vulkan Runtime: Default device adapter is not set correctly!"); + return &adapters_[default_adapter_i_]; + } + + inline Adapter& get_adapter() { + TORCH_CHECK( + default_adapter_i_ >= 0 && default_adapter_i_ < adapters_.size(), + "Pytorch Vulkan Runtime: Default device adapter is not set correctly!"); + return adapters_[default_adapter_i_]; + } + + inline Adapter* get_adapter_p(uint32_t i) { + return &adapters_[i]; + } + + inline Adapter& get_adapter(uint32_t i) { + return adapters_[i]; + } + + inline uint32_t default_adapter_i() const { + return default_adapter_i_; + } + + using Selector = std::function&)>; + uint32_t init_adapter(const Selector& selector); }; +// The global runtime is retrieved using this function, where it is declared as +// a static local variable. 
Runtime* runtime(); -// -// Impl -// - -inline VkInstance Runtime::instance() const { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(instance_); - return instance_.get(); -} - } // namespace api } // namespace vulkan } // namespace native diff --git a/aten/src/ATen/native/vulkan/api/ThreadContext.cpp b/aten/src/ATen/native/vulkan/api/ThreadContext.cpp index 039218fe2d2a..d230d97ecda7 100644 --- a/aten/src/ATen/native/vulkan/api/ThreadContext.cpp +++ b/aten/src/ATen/native/vulkan/api/ThreadContext.cpp @@ -35,6 +35,13 @@ ThreadContext::SingletonThreadLocalObject::SingletonThreadLocalObject( : object_(gpu) { } +template<> +ThreadContext::SingletonThreadLocalObject::SingletonThreadLocalObject(const GPU& gpu) + : object_(gpu.device, + gpu.adapter->timestamp_compute_and_graphics(), + gpu.adapter->timestamp_period()) { +} + } // namespace api } // namespace vulkan } // namespace native diff --git a/aten/src/ATen/native/vulkan/api/ThreadContext.h b/aten/src/ATen/native/vulkan/api/ThreadContext.h index 6f0360359e5e..0145e345f8d7 100644 --- a/aten/src/ATen/native/vulkan/api/ThreadContext.h +++ b/aten/src/ATen/native/vulkan/api/ThreadContext.h @@ -5,6 +5,7 @@ #include #include #include +#include #include namespace at { @@ -29,6 +30,7 @@ class ThreadContext final { Command& command(); Descriptor& descriptor(); Resource& resource(); + QueryPool& querypool(); private: GPU gpu_; @@ -67,6 +69,10 @@ inline Resource& ThreadContext::resource() { return SingletonThreadLocalObject::get(gpu_); } +inline QueryPool& ThreadContext::querypool() { + return SingletonThreadLocalObject::get(gpu_); +} + } // namespace api } // namespace vulkan } // namespace native diff --git a/aten/src/ATen/native/vulkan/glsl/KO4C4HW_to_image.glsl b/aten/src/ATen/native/vulkan/glsl/KO4C4HW_to_image.glsl deleted file mode 100644 index 2c02e034603e..000000000000 --- a/aten/src/ATen/native/vulkan/glsl/KO4C4HW_to_image.glsl +++ /dev/null @@ -1,31 +0,0 @@ -#version 450 core -#define PRECISION $precision -layout(std430) buffer; -layout(set = 0, rgba16f, binding = 0) writeonly PRECISION uniform image3D uOutput; -layout(set = 0, binding = 1) readonly buffer kernel { - vec4 data[]; -} -uKernel; -layout(set = 0, binding = 2) uniform constBlock { - int KWxKH; - int C_4; -} -uConstBlock; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - ivec3 pos = ivec3(gl_GlobalInvocationID) * ivec3(4, 1, 1); - int KWxKH = uConstBlock.KWxKH; - int C_4 = uConstBlock.C_4; - int bufferIdx = pos.x * KWxKH + 4 * pos.y * C_4 * KWxKH + 4 * pos.z; - vec4 v0 = uKernel.data[bufferIdx + 0]; - vec4 v1 = uKernel.data[bufferIdx + 1]; - vec4 v2 = uKernel.data[bufferIdx + 2]; - vec4 v3 = uKernel.data[bufferIdx + 3]; - - imageStore(uOutput, ivec3(pos.x + 0, pos.y, pos.z), v0); - imageStore(uOutput, ivec3(pos.x + 1, pos.y, pos.z), v1); - imageStore(uOutput, ivec3(pos.x + 2, pos.y, pos.z), v2); - imageStore(uOutput, ivec3(pos.x + 3, pos.y, pos.z), v3); -} diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_dw_clamp.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_dw_clamp.glsl deleted file mode 100644 index 06af09e0b655..000000000000 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_dw_clamp.glsl +++ /dev/null @@ -1,59 +0,0 @@ -#version 450 core -#define PRECISION $precision -#define FORMAT $format - -layout(std430) buffer; -layout(set = 0, binding = 0, FORMAT) writeonly PRECISION uniform image3D uOutput; -layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; -layout(set = 0, binding = 2) uniform PRECISION sampler3D uKernel; -layout(set = 0, 
binding = 3) readonly buffer bias { - vec4 data[]; -} -uBias; -layout(set = 0, binding = 4) uniform constBlock { - ivec2 padding; - ivec2 kernelSize; - ivec2 stride; - ivec2 dilate; - ivec4 outputSize; - ivec4 inputSize; - float outputMin; - float outputMax; -} -uConstBlock; - -#define UP_DIV(x, y) (((x) + (y)-1) / (y)) - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - ivec3 pos = ivec3(gl_GlobalInvocationID); - ivec4 outputSize = uConstBlock.outputSize; - if (all(lessThan(ivec3(gl_GlobalInvocationID), outputSize.xyz))) { - int KW = uConstBlock.kernelSize.x; - int KH = uConstBlock.kernelSize.y; - ivec4 inputSize = uConstBlock.inputSize; - ivec2 dilate = uConstBlock.dilate; - ivec2 padding = uConstBlock.padding; - ivec2 stride = uConstBlock.stride; - - ivec2 s0 = pos.xy * stride - padding; - ivec2 sfxy = max(ivec2(0), (UP_DIV(-s0, dilate))); - ivec2 efxy = min(uConstBlock.kernelSize, UP_DIV(inputSize.xy - s0, dilate)); - - vec4 acc = uBias.data[pos.z]; - int sx, kxi, kyi; - for (kyi = sfxy.y; kyi < efxy.y; ++kyi) { - int sy = kyi * dilate.y + s0.y; - for (kxi = 0; kxi < KW; ++kxi) { - sx = kxi * dilate.x + s0.x; - vec4 iv = texelFetch(uInput, ivec3(sx, sy, pos.z), 0); - vec4 kv = texelFetch(uKernel, ivec3(kxi, kyi, pos.z), 0); - acc += kv * iv; - } - } - vec4 outputMin = vec4(uConstBlock.outputMin); - vec4 outputMax = vec4(uConstBlock.outputMax); - imageStore(uOutput, pos, clamp(acc, outputMin, outputMax)); - } -} diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp.glsl deleted file mode 100644 index 89411284fed4..000000000000 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp.glsl +++ /dev/null @@ -1,82 +0,0 @@ -#version 450 core -#define PRECISION $precision -layout(std430) buffer; -layout(set = 0, rgba16f, binding = 0) writeonly PRECISION uniform image3D uOutput; -layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; -layout(set = 0, binding = 2) uniform PRECISION sampler3D uKernel; -layout(set = 0, binding = 3) readonly buffer bias { - vec4 data[]; -} -uBias; -layout(set = 0, binding = 4) uniform constBlock { - ivec2 padding; - ivec2 kernelSize; - ivec2 stride; - ivec2 dilate; - ivec4 outputSize; - ivec4 inputSize; - float outputMin; - float outputMax; -} -uConstBlock; - -#define UP_DIV(x, y) (((x) + (y)-1) / (y)) - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - ivec3 gpos = ivec3(gl_GlobalInvocationID); - if (all(lessThan(gpos, uConstBlock.outputSize.xyz))) { - ivec3 pos = gpos * ivec3(4, 1, 1); - int kernelX = uConstBlock.kernelSize.x; - int kernelY = uConstBlock.kernelSize.y; - ivec3 inputSize = uConstBlock.inputSize.xyz; - ivec2 s0 = pos.xy * uConstBlock.stride - uConstBlock.padding; - int fx, fy, fz; - ivec2 sfxy = max(ivec2(0), (UP_DIV(-s0, uConstBlock.dilate))); - ivec2 efxy = - min(uConstBlock.kernelSize, - UP_DIV(uConstBlock.inputSize.xy - s0, uConstBlock.dilate)); - vec4 color = uBias.data[pos.z]; - vec4 color2 = color; - vec4 color3 = color; - vec4 color4 = color; - int kY = pos.z; - int strideX = uConstBlock.stride.x; - for (fy = sfxy.y; fy < efxy.y; ++fy) { - int sy = fy * uConstBlock.dilate.y + s0.y; - for (fx = 0; fx < kernelX; ++fx) { - int kZ = fx + fy * kernelX; - int sx1 = fx * uConstBlock.dilate.x + s0.x; - int sx2 = sx1 + strideX; - int sx3 = sx1 + strideX * 2; - int sx4 = sx1 + strideX * 3; - float m1 = sx1 >= 0 && sx1 < inputSize.x ? 
1.0 : 0.0; - float m2 = sx2 >= 0 && sx2 < inputSize.x ? 1.0 : 0.0; - float m3 = sx3 >= 0 && sx3 < inputSize.x ? 1.0 : 0.0; - float m4 = sx4 >= 0 && sx4 < inputSize.x ? 1.0 : 0.0; - fz = 0; - for (; fz < inputSize.z; ++fz) { - int kX = 4 * fz; - vec4 k0 = texelFetch(uKernel, ivec3(kX + 0, kY, kZ), 0); - vec4 k1 = texelFetch(uKernel, ivec3(kX + 1, kY, kZ), 0); - vec4 k2 = texelFetch(uKernel, ivec3(kX + 2, kY, kZ), 0); - vec4 k3 = texelFetch(uKernel, ivec3(kX + 3, kY, kZ), 0); - - mat4 k = mat4(k0, k1, k2, k3); - - color += k * texelFetch(uInput, ivec3(sx1, sy, fz), 0) * m1; - color2 += k * texelFetch(uInput, ivec3(sx2, sy, fz), 0) * m2; - color3 += k * texelFetch(uInput, ivec3(sx3, sy, fz), 0) * m3; - color4 += k * texelFetch(uInput, ivec3(sx4, sy, fz), 0) * m4; - } - } - } - vec4 outputMin = vec4(uConstBlock.outputMin); - vec4 outputMax = vec4(uConstBlock.outputMax); - imageStore(uOutput, ivec3(pos.x + 0, pos.y, pos.z), clamp(color, outputMin, outputMax)); - imageStore(uOutput, ivec3(pos.x + 1, pos.y, pos.z), clamp(color2, outputMin, outputMax)); - imageStore(uOutput, ivec3(pos.x + 2, pos.y, pos.z), clamp(color3, outputMin, outputMax)); - imageStore(uOutput, ivec3(pos.x + 3, pos.y, pos.z), clamp(color4, outputMin, outputMax)); - } -} diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp_1x.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp_1x.glsl deleted file mode 100644 index 8baae9b5fcd5..000000000000 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp_1x.glsl +++ /dev/null @@ -1,65 +0,0 @@ -#version 450 core -#define PRECISION $precision -layout(std430) buffer; -layout(set = 0, rgba32f, binding = 0) writeonly PRECISION uniform image3D uOutput; -layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; -layout(set = 0, binding = 2) uniform PRECISION sampler3D uKernel; -layout(set = 0, binding = 3) readonly buffer bias { - vec4 data[]; -} -uBias; -layout(set = 0, binding = 4) uniform constBlock { - ivec2 padding; - ivec2 kernelSize; - ivec2 stride; - ivec2 dilate; - ivec4 outputSize; - ivec4 inputSize; - float outputMin; - float outputMax; -} -uConstBlock; - -#define UP_DIV(x, y) (((x) + (y)-1) / (y)) - -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; - -void main() { - ivec3 pos = ivec3(gl_GlobalInvocationID); - if (all(lessThan(pos, uConstBlock.outputSize.xyz))) { - int kernelX = uConstBlock.kernelSize.x; - int kernelY = uConstBlock.kernelSize.y; - ivec3 inputSize = uConstBlock.inputSize.xyz; - ivec2 s0 = pos.xy * uConstBlock.stride - uConstBlock.padding; - int fx, fy, fz; - ivec2 sfxy = max(ivec2(0), (UP_DIV(-s0, uConstBlock.dilate))); - ivec2 efxy = - min(uConstBlock.kernelSize, - UP_DIV(uConstBlock.inputSize.xy - s0, uConstBlock.dilate)); - vec4 color = uBias.data[pos.z]; - int kY = pos.z; - int strideX = uConstBlock.stride.x; - for (fy = sfxy.y; fy < efxy.y; ++fy) { - int sy = fy * uConstBlock.dilate.y + s0.y; - for (fx = 0; fx < kernelX; ++fx) { - int kZ = fx + fy * kernelX; - int sx = fx * uConstBlock.dilate.x + s0.x; - fz = 0; - for (; fz < inputSize.z; ++fz) { - int kX = 4 * fz; - vec4 k0 = texelFetch(uKernel, ivec3(kX + 0, kY, kZ), 0); - vec4 k1 = texelFetch(uKernel, ivec3(kX + 1, kY, kZ), 0); - vec4 k2 = texelFetch(uKernel, ivec3(kX + 2, kY, kZ), 0); - vec4 k3 = texelFetch(uKernel, ivec3(kX + 3, kY, kZ), 0); - - mat4 k = mat4(k0, k1, k2, k3); - - color += k * texelFetch(uInput, ivec3(sx, sy, fz), 0); - } - } - } - vec4 outputMin = vec4(uConstBlock.outputMin); - vec4 outputMax = vec4(uConstBlock.outputMax); - 
imageStore(uOutput, ivec3(pos.x, pos.y, pos.z), clamp(color, outputMin, outputMax)); - } -} diff --git a/aten/src/ATen/native/vulkan/glsl/lerp.glsl b/aten/src/ATen/native/vulkan/glsl/lerp.glsl new file mode 100644 index 000000000000..433877a8efe8 --- /dev/null +++ b/aten/src/ATen/native/vulkan/glsl/lerp.glsl @@ -0,0 +1,36 @@ +#version 450 core +#define PRECISION $precision +#define FORMAT $format + +layout(std430) buffer; + +/* Qualifiers: layout - storage - precision - memory */ + +layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput0; +layout(set = 0, binding = 2) uniform PRECISION sampler3D uInput1; +layout(set = 0, binding = 3) uniform PRECISION sampler3D uInput2; +layout(set = 0, binding = 4) uniform PRECISION restrict Block { + ivec4 size; + ivec4 isize0; + ivec4 isize1; + ivec4 isize2; +} uBlock; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (all(lessThan(pos, uBlock.size.xyz))) { + const ivec3 input0_pos = pos % uBlock.isize0.xyz; + const ivec3 input1_pos = pos % uBlock.isize1.xyz; + const ivec3 input2_pos = pos % uBlock.isize2.xyz; + imageStore( + uOutput, + pos, + texelFetch(uInput0, input0_pos, 0) + + texelFetch(uInput2, input2_pos, 0) + * (texelFetch(uInput1, input1_pos, 0) - texelFetch(uInput0, input0_pos, 0))); + } +} diff --git a/aten/src/ATen/native/vulkan/glsl/lerp_.glsl b/aten/src/ATen/native/vulkan/glsl/lerp_.glsl new file mode 100644 index 000000000000..b727f7bf51a7 --- /dev/null +++ b/aten/src/ATen/native/vulkan/glsl/lerp_.glsl @@ -0,0 +1,33 @@ +#version 450 core +#define PRECISION $precision +#define FORMAT $format + +layout(std430) buffer; + +/* Qualifiers: layout - storage - precision - memory */ + +layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict image3D uOutput; +layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput0; +layout(set = 0, binding = 2) uniform PRECISION sampler3D uInput1; +layout(set = 0, binding = 3) uniform PRECISION restrict Block { + ivec4 size; + ivec4 isize0; + ivec4 isize1; +} uBlock; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (all(lessThan(pos, uBlock.size.xyz))) { + const ivec3 input0_pos = pos % uBlock.isize0.xyz; + const ivec3 input1_pos = pos % uBlock.isize1.xyz; + imageStore( + uOutput, + pos, + imageLoad(uOutput, pos) + + texelFetch(uInput1, input1_pos, 0) + * (texelFetch(uInput0, input0_pos, 0) - imageLoad(uOutput, pos))); + } +} diff --git a/aten/src/ATen/native/vulkan/glsl/lerp_scalar.glsl b/aten/src/ATen/native/vulkan/glsl/lerp_scalar.glsl new file mode 100644 index 000000000000..2978f0922f3d --- /dev/null +++ b/aten/src/ATen/native/vulkan/glsl/lerp_scalar.glsl @@ -0,0 +1,34 @@ +#version 450 core +#define PRECISION $precision +#define FORMAT $format + +layout(std430) buffer; + +/* Qualifiers: layout - storage - precision - memory */ + +layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput0; +layout(set = 0, binding = 2) uniform PRECISION sampler3D uInput1; +layout(set = 0, binding = 3) uniform PRECISION restrict Block { + ivec4 size; + ivec4 isize0; + ivec3 isize1; + float weight; +} uBlock; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 
pos = ivec3(gl_GlobalInvocationID); + + if (all(lessThan(pos, uBlock.size.xyz))) { + const ivec3 input0_pos = pos % uBlock.isize0.xyz; + const ivec3 input1_pos = pos % uBlock.isize1.xyz; + imageStore( + uOutput, + pos, + texelFetch(uInput0, input0_pos, 0) + + uBlock.weight + * (texelFetch(uInput1, input1_pos, 0) - texelFetch(uInput0, input0_pos, 0))); + } +} diff --git a/aten/src/ATen/native/vulkan/glsl/lerp_scalar_.glsl b/aten/src/ATen/native/vulkan/glsl/lerp_scalar_.glsl new file mode 100644 index 000000000000..fa32b8b13667 --- /dev/null +++ b/aten/src/ATen/native/vulkan/glsl/lerp_scalar_.glsl @@ -0,0 +1,31 @@ +#version 450 core +#define PRECISION $precision +#define FORMAT $format + +layout(std430) buffer; + +/* Qualifiers: layout - storage - precision - memory */ + +layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict image3D uOutput; +layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput0; +layout(set = 0, binding = 2) uniform PRECISION restrict Block { + ivec4 size; + ivec3 isize0; + float weight; +} uBlock; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (all(lessThan(pos, uBlock.size.xyz))) { + const ivec3 input0_pos = pos % uBlock.isize0.xyz; + imageStore( + uOutput, + pos, + imageLoad(uOutput, pos) + + uBlock.weight + * (texelFetch(uInput0, input0_pos, 0) - imageLoad(uOutput, pos))); + } +} diff --git a/aten/src/ATen/native/vulkan/glsl/permute.glsl b/aten/src/ATen/native/vulkan/glsl/permute.glsl deleted file mode 100644 index 3d1191ff6eea..000000000000 --- a/aten/src/ATen/native/vulkan/glsl/permute.glsl +++ /dev/null @@ -1,57 +0,0 @@ -#version 450 core -layout(std430) buffer; -layout(set = 0, binding = 0) writeonly buffer outputBuffer { - float data[]; -} -uOutput; -layout(set = 0, binding = 1) readonly buffer inputBuffer { - float data[]; -} -uInput; -layout(set = 0, binding = 2) uniform constBlock { - ivec4 inStrides[2]; - ivec4 outStrides[2]; - ivec4 outDims[2]; - int inOffset; -} -uConst; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - ivec3 pos = ivec3(gl_GlobalInvocationID); - ivec4 outIdx[2]; - - int d1 = uConst.outDims[0][3]; - int d3 = uConst.outDims[1][1]; - int d5 = uConst.outDims[1][3]; - - int oi0 = pos.z / d1; - int oi1 = pos.z - d1 * oi0; - - int oi2 = pos.y / d3; - int oi3 = pos.y - d3 * oi2; - - int oi4 = pos.x / d5; - int oi5 = pos.x - d5 * oi4; - - ivec4 oIdx0 = ivec4(0, 0, oi0, oi1); - ivec4 oIdx1 = ivec4(oi2, oi3, oi4, oi5); - if (all(lessThan(oIdx0, uConst.outDims[0])) && - all(lessThan(oIdx1, uConst.outDims[1]))) { - ivec4 ins0 = uConst.inStrides[0]; - ivec4 ins1 = uConst.inStrides[1]; - int inIdxInt = oIdx0.x * ins0.x + oIdx0.y * ins0.y + oIdx0.z * ins0.z + - oIdx0.w * ins0.w; - inIdxInt += oIdx1.x * ins1.x + oIdx1.y * ins1.y + oIdx1.z * ins1.z + - oIdx1.w * ins1.w; - ivec4 outs0 = uConst.outStrides[0]; - ivec4 outs1 = uConst.outStrides[1]; - int outIdxInt = oIdx0.x * outs0.x + oIdx0.y * outs0.y + oIdx0.z * outs0.z + - oIdx0.w * outs0.w; - outIdxInt += oIdx1.x * outs1.x + oIdx1.y * outs1.y + oIdx1.z * outs1.z + - oIdx1.w * outs1.w; - - uOutput.data[outIdxInt] = uInput.data[uConst.inOffset + inIdxInt]; - } -} diff --git a/aten/src/ATen/native/vulkan/glsl/tanh.glsl b/aten/src/ATen/native/vulkan/glsl/tanh.glsl index 8d611630cf74..70315def6342 100644 --- a/aten/src/ATen/native/vulkan/glsl/tanh.glsl +++ b/aten/src/ATen/native/vulkan/glsl/tanh.glsl @@ -18,6 +18,10 @@ void main() { const ivec3 
pos = ivec3(gl_GlobalInvocationID); if (all(lessThan(pos, uBlock.size.xyz))) { - imageStore(uOutput, pos, tanh(texelFetch(uInput, pos, 0))); + const vec4 intex = texelFetch(uInput, pos, 0); + imageStore( + uOutput, + pos, + tanh(clamp(intex, -15.0, 15.0))); } } diff --git a/aten/src/ATen/native/vulkan/glsl/tanh_.glsl b/aten/src/ATen/native/vulkan/glsl/tanh_.glsl index 59649da65180..ef8fd35fc588 100644 --- a/aten/src/ATen/native/vulkan/glsl/tanh_.glsl +++ b/aten/src/ATen/native/vulkan/glsl/tanh_.glsl @@ -17,6 +17,10 @@ void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); if (all(lessThan(pos, uBlock.size.xyz))) { - imageStore(uOutput, pos, tanh(imageLoad(uOutput, pos))); + const vec4 intex = imageLoad(uOutput, pos); + imageStore( + uOutput, + pos, + tanh(clamp(intex, -15.0, 15.0))); } } diff --git a/aten/src/ATen/native/vulkan/ops/Arithmetic.cpp b/aten/src/ATen/native/vulkan/ops/Arithmetic.cpp index 42e941f00a77..268487e10c1c 100644 --- a/aten/src/ATen/native/vulkan/ops/Arithmetic.cpp +++ b/aten/src/ATen/native/vulkan/ops/Arithmetic.cpp @@ -1,3 +1,4 @@ +#include #include #include @@ -54,7 +55,8 @@ Tensor arithmetic_scalar( const Tensor& self_arg, const Scalar& other, const c10::optional& alpha_arg, - const api::Shader::Descriptor& shader_descriptor) { + const api::Shader::Descriptor& shader_descriptor, + const std::string& op_name) { api::Context* const context = api::context(); const Tensor self = self_arg.is_vulkan() ? self_arg : self_arg.vulkan(); @@ -69,6 +71,8 @@ Tensor arithmetic_scalar( api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + if C10_LIKELY (v_output.has_image() && v_self.has_image()) { const float other_val = alpha_arg ? other.to() * alpha_arg->to() @@ -114,7 +118,8 @@ Tensor& arithmetic_scalar_( Tensor& self, const Scalar& other, const c10::optional& alpha_arg, - const api::Shader::Descriptor& shader_descriptor) { + const api::Shader::Descriptor& shader_descriptor, + const std::string& op_name) { api::Context* const context = api::context(); TORCH_CHECK( @@ -126,6 +131,8 @@ Tensor& arithmetic_scalar_( api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + if C10_LIKELY (v_self.has_image()) { const float other_val = alpha_arg ? other.to() * alpha_arg->to() @@ -169,7 +176,8 @@ Tensor arithmetic_tensor( const Tensor& self_arg, const Tensor& other_arg, const c10::optional& alpha_arg, - const api::Shader::Descriptor& shader_descriptor) { + const api::Shader::Descriptor& shader_descriptor, + const std::string& op_name) { check_inputs(self_arg, other_arg); api::Context* const context = api::context(); @@ -188,6 +196,8 @@ Tensor arithmetic_tensor( api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + if C10_LIKELY (v_self.has_image() && v_other.has_image()) { const float alpha = alpha_arg ? 
alpha_arg->to() : 1.0; const struct Block final { @@ -243,7 +253,8 @@ Tensor& arithmetic_tensor_( Tensor& self, const Tensor& other_arg, const c10::optional& alpha_arg, - const api::Shader::Descriptor& shader_descriptor) { + const api::Shader::Descriptor& shader_descriptor, + const std::string& op_name) { check_inputs(self, other_arg); api::Context* const context = api::context(); @@ -259,6 +270,8 @@ Tensor& arithmetic_tensor_( api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + if C10_LIKELY ( v_self.has_image() && v_other.has_image() && !self.is_same(other)) { const float alpha = alpha_arg ? alpha_arg->to() : 1.0; @@ -310,25 +323,33 @@ Tensor add_scalar( const Scalar& other, const Scalar& alpha) { return arithmetic_scalar( - self_arg, other, c10::optional(alpha), VK_KERNEL(add_scalar)); + self_arg, other, c10::optional(alpha), VK_KERNEL(add_scalar), "aten::add.Scalar"); } Tensor& add_scalar_(Tensor& self, const Scalar& other, const Scalar& alpha) { return arithmetic_scalar_( - self, other, c10::optional(alpha), VK_KERNEL(add_scalar_)); + self, other, c10::optional(alpha), VK_KERNEL(add_scalar_), "aten::add_.Scalar"); } Tensor add_tensor( const Tensor& self_arg, const Tensor& other_arg, const Scalar& alpha) { + if (other_arg.sizes().size() == 0) { + return arithmetic_scalar( + self_arg, + other_arg.item(), + c10::optional(alpha.to()), + VK_KERNEL(add_scalar), + "aten::add.Tensor"); + } return arithmetic_tensor( - self_arg, other_arg, c10::optional(alpha), VK_KERNEL(add)); + self_arg, other_arg, c10::optional(alpha), VK_KERNEL(add), "aten::add.Tensor"); } Tensor& add_tensor_(Tensor& self, const Tensor& other_arg, const Scalar& alpha) { return arithmetic_tensor_( - self, other_arg, c10::optional(alpha), VK_KERNEL(add_)); + self, other_arg, c10::optional(alpha), VK_KERNEL(add_), "aten::add_.Tensor"); } Tensor sub_scalar( @@ -339,7 +360,8 @@ Tensor sub_scalar( self_arg, other, c10::optional(-1 * alpha.to()), - VK_KERNEL(add_scalar)); + VK_KERNEL(add_scalar), + "aten::sub.Scalar"); } Tensor& sub_scalar_(Tensor& self, const Scalar& other, const Scalar& alpha) { @@ -347,40 +369,57 @@ Tensor& sub_scalar_(Tensor& self, const Scalar& other, const Scalar& alpha) { self, other, c10::optional(-1 * alpha.to()), - VK_KERNEL(add_scalar_)); + VK_KERNEL(add_scalar_), + "aten::sub_.Scalar"); } Tensor sub_tensor( const Tensor& self_arg, const Tensor& other_arg, const Scalar& alpha) { + if (other_arg.sizes().size() == 0) { + return arithmetic_scalar( + self_arg, + other_arg.item(), + c10::optional(-1 * alpha.to()), + VK_KERNEL(add_scalar), + "aten::sub.Tensor"); + } return arithmetic_tensor( - self_arg, other_arg, c10::optional(alpha), VK_KERNEL(sub)); + self_arg, other_arg, c10::optional(alpha), VK_KERNEL(sub), "aten::sub.Tensor"); } Tensor& sub_tensor_(Tensor& self, const Tensor& other_arg, const Scalar& alpha) { return arithmetic_tensor_( - self, other_arg, c10::optional(alpha), VK_KERNEL(sub_)); + self, other_arg, c10::optional(alpha), VK_KERNEL(sub_), "aten::sub_.Tensor"); } Tensor mul_scalar(const Tensor& self_arg, const Scalar& other) { return arithmetic_scalar( - self_arg, other, c10::optional(), VK_KERNEL(mul_scalar)); + self_arg, other, c10::optional(), VK_KERNEL(mul_scalar), "aten::mul.Scalar"); } Tensor& mul_scalar_(Tensor& self, const Scalar& other) { return arithmetic_scalar_( - self, other, c10::optional(), VK_KERNEL(mul_scalar_)); + self, other, 
c10::optional(), VK_KERNEL(mul_scalar_), "aten::mul_.Scalar"); } Tensor mul_tensor(const Tensor& self_arg, const Tensor& other_arg) { + if (other_arg.sizes().size() == 0) { + return arithmetic_scalar( + self_arg, + other_arg.item(), + c10::optional(), + VK_KERNEL(mul_scalar), + "aten::mul.Tensor"); + } return arithmetic_tensor( - self_arg, other_arg, c10::optional(), VK_KERNEL(mul)); + self_arg, other_arg, c10::optional(), VK_KERNEL(mul), "aten::mul.Tensor"); } Tensor& mul_tensor_(Tensor& self, const Tensor& other_arg) { return arithmetic_tensor_( - self, other_arg, c10::optional(), VK_KERNEL(mul_)); + self, other_arg, c10::optional(), VK_KERNEL(mul_), "aten::mul_.Tensor"); } Tensor div_scalar(const Tensor& self_arg, const Scalar& other) { @@ -388,7 +427,8 @@ Tensor div_scalar(const Tensor& self_arg, const Scalar& other) { self_arg, 1.0 / other.to(), c10::optional(), - VK_KERNEL(mul_scalar)); + VK_KERNEL(mul_scalar), + "aten::div.Scalar"); } Tensor& div_scalar_(Tensor& self, const Scalar& other) { @@ -396,17 +436,26 @@ Tensor& div_scalar_(Tensor& self, const Scalar& other) { self, 1.0 / other.to(), c10::optional(), - VK_KERNEL(mul_scalar_)); + VK_KERNEL(mul_scalar_), + "aten::div_.Scalar"); } Tensor div_tensor(const Tensor& self_arg, const Tensor& other_arg) { + if (other_arg.sizes().size() == 0) { + return arithmetic_scalar( + self_arg, + 1.0 / other_arg.item(), + c10::optional(), + VK_KERNEL(mul_scalar), + "aten::div.Tensor"); + } return arithmetic_tensor( - self_arg, other_arg, c10::optional(), VK_KERNEL(div)); + self_arg, other_arg, c10::optional(), VK_KERNEL(div), "aten::div.Tensor"); } Tensor& div_tensor_(Tensor& self, const Tensor& other_arg) { return arithmetic_tensor_( - self, other_arg, c10::optional(), VK_KERNEL(div_)); + self, other_arg, c10::optional(), VK_KERNEL(div_), "aten::div_.Tensor"); } #ifdef USE_VULKAN_API diff --git a/aten/src/ATen/native/vulkan/ops/Clamp.cpp b/aten/src/ATen/native/vulkan/ops/Clamp.cpp index a6e65607fb07..3f5cb3d2afb9 100644 --- a/aten/src/ATen/native/vulkan/ops/Clamp.cpp +++ b/aten/src/ATen/native/vulkan/ops/Clamp.cpp @@ -1,3 +1,4 @@ +#include #include #include @@ -9,10 +10,11 @@ namespace { using namespace api::utils; -Tensor clamp( +Tensor _clamp( const Tensor& self_arg, const c10::optional& min, - const c10::optional& max) { + const c10::optional& max, + const std::string& op_name) { TORCH_CHECK( min || max, "At least one of 'min' or 'max' must not be None"); @@ -31,6 +33,8 @@ Tensor clamp( api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + if C10_LIKELY(v_output.has_image() && v_self.has_image()) { const struct Block final { uvec3 extents; @@ -79,10 +83,18 @@ Tensor clamp( return convert(v_output); } -Tensor& clamp_( - Tensor& self, +Tensor clamp( + const Tensor& self_arg, const c10::optional& min, const c10::optional& max) { + return _clamp(self_arg, min, max, "aten::clamp"); +} + +Tensor& _clamp_( + Tensor& self, + const c10::optional& min, + const c10::optional& max, + const std::string& op_name) { api::Context* const context = api::context(); TORCH_CHECK( @@ -98,6 +110,8 @@ Tensor& clamp_( api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + if C10_LIKELY(v_self.has_image()) { const struct Block final { uvec3 extents; @@ -140,9 +154,17 @@ Tensor& 
clamp_( return self; } +Tensor& clamp_( + Tensor& self, + const c10::optional& min, + const c10::optional& max) { + return _clamp_(self, min, max, "aten::clamp_"); +} + Tensor activation( const Tensor& self_arg, - const api::Shader::Descriptor& shader_descriptor) { + const api::Shader::Descriptor& shader_descriptor, + const std::string& op_name) { api::Context* const context = api::context(); const Tensor self = self_arg.is_vulkan() ? self_arg : self_arg.vulkan(); @@ -157,6 +179,8 @@ Tensor activation( api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + if C10_LIKELY(v_output.has_image() && v_self.has_image()) { const struct Block final { uvec3 extents; @@ -202,7 +226,8 @@ Tensor activation( Tensor& activation_( Tensor& self, - const api::Shader::Descriptor& shader_descriptor) { + const api::Shader::Descriptor& shader_descriptor, + const std::string& op_name) { api::Context* const context = api::context(); TORCH_CHECK( @@ -214,6 +239,8 @@ Tensor& activation_( api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + if C10_LIKELY(v_self.has_image()) { const struct Block final { uvec3 extents; @@ -255,44 +282,45 @@ Tensor hardtanh( const Tensor& self, const Scalar& min, const Scalar& max) { - return ops::clamp(self, min, max); + return ops::_clamp(self, min, max, "aten::hardtanh"); } Tensor& hardtanh_( Tensor& self, const Scalar& min, const Scalar& max) { - return ops::clamp_(self, min, max); + return ops::_clamp_(self, min, max, "aten::hardtanh_"); } Tensor relu(const Tensor& self) { - return ops::clamp(self, 0, c10::nullopt); + return ops::_clamp(self, 0, c10::nullopt, "aten::relu"); } Tensor& relu_(Tensor& self) { - return ops::clamp_(self, 0, c10::nullopt); + return ops::_clamp_(self, 0, c10::nullopt, "aten::relu_"); } Tensor hardswish(const Tensor& self) { - return ops::activation(self, VK_KERNEL(hardswish)); + return ops::activation(self, VK_KERNEL(hardswish), "aten::hardswish"); } Tensor& hardswish_(Tensor& self) { - return ops::activation_(self, VK_KERNEL(hardswish_)); + return ops::activation_(self, VK_KERNEL(hardswish_), "aten::hardswish_"); } Tensor hardsigmoid(const Tensor& self) { - return ops::activation(self, VK_KERNEL(hardsigmoid)); + return ops::activation(self, VK_KERNEL(hardsigmoid), "aten::hardsigmoid"); } Tensor& hardsigmoid_(Tensor& self) { - return ops::activation_(self, VK_KERNEL(hardsigmoid_)); + return ops::activation_(self, VK_KERNEL(hardsigmoid_), "aten::hardsigmoid_"); } Tensor activation_scalar( const Tensor& self_arg, const Scalar& scalar_arg, - const api::Shader::Descriptor& shader_descriptor) { + const api::Shader::Descriptor& shader_descriptor, + const std::string& op_name) { api::Context* const context = api::context(); const Tensor self = self_arg.is_vulkan() ? 
self_arg : self_arg.vulkan(); @@ -307,6 +335,8 @@ Tensor activation_scalar( api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + if C10_LIKELY(v_output.has_image() && v_self.has_image()) { const struct Block final { uvec3 extents; @@ -355,7 +385,8 @@ Tensor activation_scalar( Tensor& activation_scalar_( Tensor& self, const Scalar& scalar_arg, - const api::Shader::Descriptor& shader_descriptor) { + const api::Shader::Descriptor& shader_descriptor, + const std::string& op_name) { api::Context* const context = api::context(); TORCH_CHECK( @@ -367,6 +398,8 @@ Tensor& activation_scalar_( api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + if C10_LIKELY(v_self.has_image()) { const struct Block final { uvec3 extents; @@ -409,41 +442,41 @@ Tensor& activation_scalar_( Tensor hardshrink( const Tensor& self_arg, const Scalar& lambd) { - return ops::activation_scalar(self_arg, lambd, VK_KERNEL(hardshrink)); + return ops::activation_scalar(self_arg, lambd, VK_KERNEL(hardshrink), "aten::hardshrink"); } Tensor& hardshrink_( Tensor& self, const Scalar& lambd) { - return ops::activation_scalar_(self, lambd, VK_KERNEL(hardshrink_)); + return ops::activation_scalar_(self, lambd, VK_KERNEL(hardshrink_), "aten::hardshrink_"); } Tensor leaky_relu( const Tensor& self_arg, const Scalar& negative_slope) { - return ops::activation_scalar(self_arg, negative_slope, VK_KERNEL(leaky_relu)); + return ops::activation_scalar(self_arg, negative_slope, VK_KERNEL(leaky_relu), "aten::leaky_relu"); } Tensor& leaky_relu_( Tensor& self, const Scalar& negative_slope) { - return ops::activation_scalar_(self, negative_slope, VK_KERNEL(leaky_relu_)); + return ops::activation_scalar_(self, negative_slope, VK_KERNEL(leaky_relu_), "aten::leaky_relu_"); } Tensor sigmoid(const Tensor& self) { - return ops::activation(self, VK_KERNEL(sigmoid)); + return ops::activation(self, VK_KERNEL(sigmoid), "aten::sigmoid"); } Tensor& sigmoid_(Tensor& self) { - return ops::activation_(self, VK_KERNEL(sigmoid_)); + return ops::activation_(self, VK_KERNEL(sigmoid_), "aten::sigmoid_"); } Tensor tanh(const Tensor& self) { - return ops::activation(self, VK_KERNEL(tanh)); + return ops::activation(self, VK_KERNEL(tanh), "aten::tanh"); } Tensor& tanh_(Tensor& self) { - return ops::activation_(self, VK_KERNEL(tanh_)); + return ops::activation_(self, VK_KERNEL(tanh_), "aten::tanh_"); } #ifdef USE_VULKAN_API diff --git a/aten/src/ATen/native/vulkan/ops/Concat.cpp b/aten/src/ATen/native/vulkan/ops/Concat.cpp index 3d587864ad2a..eefa365bc478 100644 --- a/aten/src/ATen/native/vulkan/ops/Concat.cpp +++ b/aten/src/ATen/native/vulkan/ops/Concat.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -25,73 +26,75 @@ Tensor cat_feature(const TensorList tensors, vTensor& v_output) { api::Context* const context = api::context(); api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); + { + api::OpProfiler profiler(command_buffer, context->querypool(), "aten::_cat (cat_batch)"); - int64_t ch_size_allprior = 0; - int64_t ch_interval = 0; - for (const auto& tensor : tensors) { - ch_interval += tensor.sizes()[1]; - } + int64_t ch_size_allprior = 0; + int64_t ch_interval = 0; + for (const auto& tensor : 
tensors) { + ch_interval += tensor.sizes()[1]; + } - auto dst_image = v_output.image( - command_buffer, - vTensor::Stage::Compute, - vTensor::Access::Read | vTensor::Access::Write); - - for (const auto& tensor : tensors) { - const Tensor self = tensor.is_vulkan() ? tensor : tensor.vulkan(); - const vTensor& v_self = convert(self); - if C10_LIKELY(v_output.has_image() && v_self.has_image()) { - auto src_image = v_self.image( - command_buffer, - vTensor::Stage::Compute); - - const struct Block final { - uvec3 size; // output texture size - uint32_t fill_0; // dummy - uvec3 isize; // input texture size - uint32_t fill_1; // dummy - uint32_t batch_size; // input tensor's batch size - uint32_t ch_size; // input tensor's channel size - uint32_t ch_interval; // channel interval (total # of channels for all tensors) - uint32_t ch_size_allprior; // # of channels for tensor 0 to i-1 at ith tensor - } block { - v_output.extents(), - 0u, - v_self.extents(), - 0u, - safe_downcast(v_self.sizes()[0]), - safe_downcast(v_self.sizes()[1]), - safe_downcast(ch_interval), - safe_downcast(ch_size_allprior), - }; - - ch_size_allprior += v_self.sizes()[1]; - - context->dispatch( - command_buffer, - { - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, - }, - VK_KERNEL(cat_feature), + auto dst_image = v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Read | vTensor::Access::Write); + + for (const auto& tensor : tensors) { + const Tensor self = tensor.is_vulkan() ? tensor : tensor.vulkan(); + const vTensor& v_self = convert(self); + if C10_LIKELY(v_output.has_image() && v_self.has_image()) { + auto src_image = v_self.image( + command_buffer, + vTensor::Stage::Compute); + + const struct Block final { + uvec3 size; // output texture size + uint32_t fill_0; // dummy + uvec3 isize; // input texture size + uint32_t fill_1; // dummy + uint32_t batch_size; // input tensor's batch size + uint32_t ch_size; // input tensor's channel size + uint32_t ch_interval; // channel interval (total # of channels for all tensors) + uint32_t ch_size_allprior; // # of channels for tensor 0 to i-1 at ith tensor + } block { + v_output.extents(), + 0u, v_self.extents(), - context->gpu().adapter->local_work_group_size(), - // Read/Write access bypasses synchronization but inserts appropriate - // barriers if necessary. - dst_image, - // Read-only access is implied on const tensors and triggers an async - // synchronization if necessary. - src_image, - // Object lifetime is managed by the resource pool. - // It is OK not to keep track of the handle. - context->resource().pool.uniform(block).object); - } - else { - TORCH_CHECK(false, "Not implemented!"); + 0u, + safe_downcast(v_self.sizes()[0]), + safe_downcast(v_self.sizes()[1]), + safe_downcast(ch_interval), + safe_downcast(ch_size_allprior), + }; + + ch_size_allprior += v_self.sizes()[1]; + + context->dispatch( + command_buffer, + { + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + }, + VK_KERNEL(cat_feature), + v_self.extents(), + context->gpu().adapter->local_work_group_size(), + // Read/Write access bypasses synchronization but inserts appropriate + // barriers if necessary. + dst_image, + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + src_image, + // Object lifetime is managed by the resource pool. + // It is OK not to keep track of the handle. 
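
Across the Arithmetic.cpp, Clamp.cpp and Concat.cpp hunks above, the change follows one instrumentation pattern: the block that records into the command buffer is wrapped in an extra scope whose first statement constructs `api::OpProfiler profiler(command_buffer, context->querypool(), op_name)`, so the profiler's lifetime brackets exactly the recorded work. The sketch below illustrates that RAII idea with a hypothetical `ScopeTimer` measuring host time; it is only an analogy, since the real `OpProfiler` is constructed with the Vulkan query pool and presumably records GPU timestamps rather than CPU durations.

    #include <chrono>
    #include <iostream>
    #include <string>
    #include <utility>

    // Hypothetical RAII scope timer, illustrating the shape of the OpProfiler
    // usage above: construction marks the start of the profiled region and the
    // destructor fires when the enclosing scope (the command-recording block)
    // closes. Unlike the real OpProfiler, this sketch measures host time only.
    class ScopeTimer final {
     public:
      explicit ScopeTimer(std::string op_name)
          : op_name_(std::move(op_name)),
            start_(std::chrono::steady_clock::now()) {}

      ~ScopeTimer() {
        const auto end = std::chrono::steady_clock::now();
        const auto us = std::chrono::duration_cast<std::chrono::microseconds>(
            end - start_).count();
        std::cout << op_name_ << ": " << us << " us\n";
      }

     private:
      std::string op_name_;
      std::chrono::steady_clock::time_point start_;
    };

    int main() {
      {
        ScopeTimer profiler("aten::add.Tensor");  // analogous to constructing OpProfiler
        // ... record and dispatch the op's work here ...
      }  // profiler destroyed here; the whole recording block is attributed to the label
      return 0;
    }
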
+ context->resource().pool.uniform(block).object); + } + else { + TORCH_CHECK(false, "Not implemented!"); + } } } - command_pool.submit(context->gpu().queue, command_buffer); return convert(v_output); @@ -101,52 +104,54 @@ Tensor cat_feature_mult4ch(const TensorList tensors, vTensor& v_output) { api::Context* const context = api::context(); api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); + { + api::OpProfiler profiler(command_buffer, context->querypool(), "aten::_cat (cat_feature_mult4ch)"); - int64_t depth_size_allprior = 0; - int64_t ch_interval = 0; - for (const auto& tensor : tensors) { - ch_interval += tensor.sizes()[1]; - } - const int64_t depth_interval = ch_interval / 4; - - auto dst_image = v_output.image( - command_buffer, - vTensor::Stage::Transfer, - vTensor::Access::Write); - uvec3 src_offset{}; - uvec3 dst_offset{}; - - for (const auto& tensor : tensors) { - const Tensor self = tensor.is_vulkan() ? tensor : tensor.vulkan(); - const vTensor& v_self = convert(self); - if C10_LIKELY(v_output.has_image() && v_self.has_image()) { - auto src_image = v_self.image( - command_buffer, - vTensor::Stage::Transfer); - - const uint32_t depth_slice = safe_downcast(tensor.sizes()[1] / 4); - uvec3 copy_extents {v_self.extents().data[0u], - v_self.extents().data[1u], - depth_slice}; - - for (const auto b : c10::irange(tensor.sizes()[0])) { - src_offset.data[2u] = safe_downcast(depth_slice * b); - dst_offset.data[2u] = depth_size_allprior + safe_downcast(depth_interval * b); - api::helper::copy_texture_to_texture(command_buffer, - src_image, - dst_image, - copy_extents, - src_offset, - dst_offset); - } - - depth_size_allprior += depth_slice; + int64_t depth_size_allprior = 0; + int64_t ch_interval = 0; + for (const auto& tensor : tensors) { + ch_interval += tensor.sizes()[1]; } - else { - TORCH_CHECK(false, "Not implemented!"); + const int64_t depth_interval = ch_interval / 4; + + auto dst_image = v_output.image( + command_buffer, + vTensor::Stage::Transfer, + vTensor::Access::Write); + uvec3 src_offset{}; + uvec3 dst_offset{}; + + for (const auto& tensor : tensors) { + const Tensor self = tensor.is_vulkan() ? 
tensor : tensor.vulkan(); + const vTensor& v_self = convert(self); + if C10_LIKELY(v_output.has_image() && v_self.has_image()) { + auto src_image = v_self.image( + command_buffer, + vTensor::Stage::Transfer); + + const uint32_t depth_slice = safe_downcast(tensor.sizes()[1] / 4); + uvec3 copy_extents {v_self.extents().data[0u], + v_self.extents().data[1u], + depth_slice}; + + for (const auto b : c10::irange(tensor.sizes()[0])) { + src_offset.data[2u] = safe_downcast(depth_slice * b); + dst_offset.data[2u] = depth_size_allprior + safe_downcast(depth_interval * b); + api::helper::copy_texture_to_texture(command_buffer, + src_image, + dst_image, + copy_extents, + src_offset, + dst_offset); + } + + depth_size_allprior += depth_slice; + } + else { + TORCH_CHECK(false, "Not implemented!"); + } } } - command_pool.submit(context->gpu().queue, command_buffer); return convert(v_output); @@ -160,37 +165,39 @@ Tensor cat_height(const TensorList tensors, vTensor& v_output) { api::Context* const context = api::context(); api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); + { + api::OpProfiler profiler(command_buffer, context->querypool(), "aten::_cat (cat_width)"); + + auto dst_image = v_output.image( + command_buffer, + vTensor::Stage::Transfer, + vTensor::Access::Write); + + uvec3 src_offset{}; + uvec3 dst_offset{}; + for (const auto& tensor : tensors) { + const Tensor self = tensor.is_vulkan() ? tensor : tensor.vulkan(); + const vTensor& v_self = convert(self); + if C10_LIKELY(v_output.has_image() && v_self.has_image()) { + auto src_image = v_self.image( + command_buffer, + vTensor::Stage::Transfer); - auto dst_image = v_output.image( - command_buffer, - vTensor::Stage::Transfer, - vTensor::Access::Write); - - uvec3 src_offset{}; - uvec3 dst_offset{}; - for (const auto& tensor : tensors) { - const Tensor self = tensor.is_vulkan() ? 
tensor : tensor.vulkan(); - const vTensor& v_self = convert(self); - if C10_LIKELY(v_output.has_image() && v_self.has_image()) { - auto src_image = v_self.image( - command_buffer, - vTensor::Stage::Transfer); - - api::helper::copy_texture_to_texture(command_buffer, - src_image, - dst_image, - v_self.extents(), - src_offset, - dst_offset); - - // Increment by height - dst_offset.data[1u] += v_self.extents().data[1u]; - } - else { - TORCH_CHECK(false, "Not implemented!"); + api::helper::copy_texture_to_texture(command_buffer, + src_image, + dst_image, + v_self.extents(), + src_offset, + dst_offset); + + // Increment by height + dst_offset.data[1u] += v_self.extents().data[1u]; + } + else { + TORCH_CHECK(false, "Not implemented!"); + } } } - command_pool.submit(context->gpu().queue, command_buffer); return convert(v_output); @@ -199,7 +206,6 @@ Tensor cat_height(const TensorList tensors, vTensor& v_output) { Tensor cat( const at::TensorList tensors, const int64_t dim) { - const auto norm_dim = normalize_dim(dim, 4); TORCH_CHECK( tensors.size() > 0, "Vulkan cat expects at least one tensor"); diff --git a/aten/src/ATen/native/vulkan/ops/Convolution.cpp b/aten/src/ATen/native/vulkan/ops/Convolution.cpp index c7d629cae96f..94799208c7c8 100644 --- a/aten/src/ATen/native/vulkan/ops/Convolution.cpp +++ b/aten/src/ATen/native/vulkan/ops/Convolution.cpp @@ -1,8 +1,9 @@ -#include +#include +#include #include #include #include -#include +#include #include namespace at { @@ -290,7 +291,7 @@ vTensor pack_weights( } api::Context* const context = api::context(); - api::Command::Buffer& command_buffer = context->command().pool.stream(); + api::Command::Buffer& command_buffer = context->command().pool.stream(); // Don't collect the timestamp since the command buffer doesn't record anything const Tensor weight = weight_arg.contiguous(); @@ -322,7 +323,7 @@ vTensor pack_biases( } api::Context* const context = api::context(); - api::Command::Buffer& command_buffer = context->command().pool.stream(); + api::Command::Buffer& command_buffer = context->command().pool.stream(); // Don't collect the timestamp since the command buffer doesn't record anything const int64_t src_w = weight.size(Layout::Filter::output); const int64_t packed_w = div_up(src_w, INT64_C(4)); @@ -549,14 +550,15 @@ Conv2dOpContext Conv2dOpContext::create( groups, method, output_min, - output_max, + output_max }; } void Conv2dOpContext::conv2d_sliding_window( const api::Shader::Descriptor& shader, vTensor& v_output, - const vTensor& v_input) const { + const vTensor& v_input, + const std::string& op_name) const { bool valid = C10_LIKELY(v_output.has_image() && v_input.has_image() && packed_.v_weight.has_image()); TORCH_CHECK(valid, "Not Implemented!") @@ -564,6 +566,8 @@ void Conv2dOpContext::conv2d_sliding_window( api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + const struct Block final { uvec3 extents; int32_t ic4; @@ -667,103 +671,106 @@ void Conv2dOpContext::conv2d_winograd_2_3( api::Context* const context = api::context(); api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); - - vTensor v_input_winograd{ - context, - { - v_input.sizes()[Layout::Activation4D::batch], - v_input.sizes()[Layout::Activation4D::channels], - out_h_units*4, - out_w_units*4, - }, - v_output.options(), - }; - { - const struct TransformBlock final { - 
uvec3 extents; - uint32_t fill; - ivec2 limits; - ivec2 padding; - } transform_block { - v_input_winograd.extents(), - 0u, + api::OpProfiler profiler(command_buffer, context->querypool(), "prepacked::conv2d_clamp_run (conv2d_winograd_2_3)"); + + vTensor v_input_winograd{ + context, { - safe_downcast(v_input.sizes()[Layout::Activation4D::width]), - safe_downcast(v_input.sizes()[Layout::Activation4D::height]), - }, - { - safe_downcast(packed_.padding[Layout::Parameter::width]), - safe_downcast(packed_.padding[Layout::Parameter::height]), + v_input.sizes()[Layout::Activation4D::batch], + v_input.sizes()[Layout::Activation4D::channels], + out_h_units*4, + out_w_units*4, }, + v_output.options(), }; - context->dispatch( - command_buffer, + { + const struct TransformBlock final { + uvec3 extents; + uint32_t fill; + ivec2 limits; + ivec2 padding; + } transform_block { + v_input_winograd.extents(), + 0u, { - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + safe_downcast(v_input.sizes()[Layout::Activation4D::width]), + safe_downcast(v_input.sizes()[Layout::Activation4D::height]), }, - VK_KERNEL(transform_winograd_2_3_sh), - v_input_winograd.extents(), - adaptive_work_group_size(v_input_winograd.extents()), - v_input_winograd.image( - command_buffer, - vTensor::Stage::Compute, - vTensor::Access::Write), - v_input.image( - command_buffer, - vTensor::Stage::Compute), - context->resource().pool.uniform(transform_block).object); - - } - { - const struct Block final { - uvec3 extents; - int32_t ic4; - vec2 clamp; - } block { - v_output.extents(), - safe_downcast(packed_.filter[Layout::Filter::input] / 4), - { - packed_.output_min, - packed_.output_max, - }, - }; + { + safe_downcast(packed_.padding[Layout::Parameter::width]), + safe_downcast(packed_.padding[Layout::Parameter::height]), + }, + }; - uvec3 global_size = { - safe_downcast(out_w_units), - safe_downcast(out_h_units), - v_output.extents().data[2u], - }; + context->dispatch( + command_buffer, + { + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + }, + VK_KERNEL(transform_winograd_2_3_sh), + v_input_winograd.extents(), + adaptive_work_group_size(v_input_winograd.extents()), + v_input_winograd.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Write), + v_input.image( + command_buffer, + vTensor::Stage::Compute), + context->resource().pool.uniform(transform_block).object); - context->dispatch( - command_buffer, + } + { + const struct Block final { + uvec3 extents; + int32_t ic4; + vec2 clamp; + } block { + v_output.extents(), + safe_downcast(packed_.filter[Layout::Filter::input] / 4), { - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + packed_.output_min, + packed_.output_max, }, - VK_KERNEL(conv2d_winograd_2_3), - global_size, - adaptive_work_group_size(global_size), - v_output.image( - command_buffer, - vTensor::Stage::Compute, - vTensor::Access::Write), - v_input_winograd.image( - command_buffer, - vTensor::Stage::Compute), - packed_.v_weight.image( - command_buffer, - vTensor::Stage::Compute), - packed_.v_bias.buffer( - command_buffer, - vTensor::Stage::Compute), - context->resource().pool.uniform(block).object); + }; + + uvec3 global_size = { + safe_downcast(out_w_units), + safe_downcast(out_h_units), + v_output.extents().data[2u], 
+ }; + + context->dispatch( + command_buffer, + { + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + }, + VK_KERNEL(conv2d_winograd_2_3), + global_size, + adaptive_work_group_size(global_size), + v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Write), + v_input_winograd.image( + command_buffer, + vTensor::Stage::Compute), + packed_.v_weight.image( + command_buffer, + vTensor::Stage::Compute), + packed_.v_bias.buffer( + command_buffer, + vTensor::Stage::Compute), + context->resource().pool.uniform(block).object); + } } command_pool.submit(context->gpu().queue, command_buffer); } @@ -797,19 +804,22 @@ Tensor Conv2dOpContext::run(const Tensor& input_arg) const { conv2d_sliding_window( VK_KERNEL(conv2d_dw), v_output, - v_input); + v_input, + "prepacked::conv2d_clamp_run (conv2d_sliding_window::conv2d_dw)"); break; case Conv2dPointwise: conv2d_sliding_window( VK_KERNEL(conv2d_pw_2x2), v_output, - v_input); + v_input, + "prepacked::conv2d_clamp_run (conv2d_sliding_window::conv2d_pw_2x2)"); break; default: conv2d_sliding_window( VK_KERNEL(conv2d), v_output, - v_input); + v_input, + "prepacked::conv2d_clamp_run (conv2d_sliding_window::conv2d)"); break; } diff --git a/aten/src/ATen/native/vulkan/ops/Convolution.h b/aten/src/ATen/native/vulkan/ops/Convolution.h index 78eef9111d3b..c87d86c585b0 100644 --- a/aten/src/ATen/native/vulkan/ops/Convolution.h +++ b/aten/src/ATen/native/vulkan/ops/Convolution.h @@ -61,7 +61,8 @@ class Conv2dOpContext final : public torch::jit::CustomClassHolder { void conv2d_sliding_window( const api::Shader::Descriptor& shader, vTensor& v_output, - const vTensor& v_input) const; + const vTensor& v_input, + const std::string& op_name) const; void conv2d_winograd_2_3( vTensor& v_output, diff --git a/aten/src/ATen/native/vulkan/ops/Copy.cpp b/aten/src/ATen/native/vulkan/ops/Copy.cpp index 1cf6b1ad6aa9..b7fbea07d9e6 100644 --- a/aten/src/ATen/native/vulkan/ops/Copy.cpp +++ b/aten/src/ATen/native/vulkan/ops/Copy.cpp @@ -1,3 +1,4 @@ +#include #include namespace at { @@ -9,7 +10,6 @@ Tensor& copy_(Tensor& self, const Tensor& src) { api::Context* const context = api::context(); api::Command::Pool& command_pool = context->command().pool; - api::Command::Buffer& command_buffer = command_pool.stream(); { // X -> Vulkan if (at::kVulkan == self.device().type()) { @@ -17,28 +17,33 @@ Tensor& copy_(Tensor& self, const Tensor& src) { // Vulkan -> Vulkan if (at::kVulkan == src.device().type()) { - command_buffer.copy( - // - Read-only access is implied on const tensors. Memory barriers - // are automatically inserted if a RAW hazard is detected. - // - Recording any potential pending sync operations into the same - // command buffer prevents an expensive queue submission. - convert(src).buffer( - command_buffer, - vTensor::Stage::Transfer), - // - Write-only access never triggers a sync as the contents will be - // overwritten regardless. Having said that, appropriate barriers - // are inserted automatically if WAR or WAW hazards are detected. - // - Recording pending sync operations into the same command buffer - // prevents an expensive queue submission. 
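
One note on the conv2d_winograd_2_3 refactor above: the body was re-indented so that both the input-transform dispatch and the main Winograd dispatch sit inside a single profiler scope labeled "prepacked::conv2d_clamp_run (conv2d_winograd_2_3)". The intermediate v_input_winograd texture keeps its (out_h_units * 4) x (out_w_units * 4) extent, which matches the standard F(2x2, 3x3) Winograd layout where every 2x2 tile of output pixels is produced from a 4x4 tile in the transformed domain. The arithmetic below only illustrates that bookkeeping; the ceil-by-2 unit counts are an assumption, since out_h_units and out_w_units are computed outside the hunks shown here.

    #include <cstdint>
    #include <iostream>

    // Tile bookkeeping sketch for Winograd F(2x2, 3x3): each 2x2 output tile is
    // produced from a 4x4 tile of the transformed input, so the transformed
    // texture is allocated as (out_h_units * 4) x (out_w_units * 4). The ceil
    // division by 2 below is an assumption for illustration; the real unit
    // counts come from code outside this diff.
    int64_t div_up(int64_t n, int64_t d) {
      return (n + d - 1) / d;
    }

    int main() {
      const int64_t out_h = 57;   // hypothetical conv output height
      const int64_t out_w = 120;  // hypothetical conv output width

      const int64_t out_h_units = div_up(out_h, 2);  // 2x2 output tiles along height
      const int64_t out_w_units = div_up(out_w, 2);  // 2x2 output tiles along width

      std::cout << "transformed input extent: " << out_h_units * 4 << " x "
                << out_w_units * 4 << std::endl;  // 116 x 240
      return 0;
    }
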
- v_self.buffer( - command_buffer, - vTensor::Stage::Transfer, - vTensor::Access::Write)); - + api::Command::Buffer& command_buffer = command_pool.stream(); + { + api::OpProfiler profiler(command_buffer, context->querypool(), "copy_"); + + command_buffer.copy( + // - Read-only access is implied on const tensors. Memory barriers + // are automatically inserted if a RAW hazard is detected. + // - Recording any potential pending sync operations into the same + // command buffer prevents an expensive queue submission. + convert(src).buffer( + command_buffer, + vTensor::Stage::Transfer), + // - Write-only access never triggers a sync as the contents will be + // overwritten regardless. Having said that, appropriate barriers + // are inserted automatically if WAR or WAW hazards are detected. + // - Recording pending sync operations into the same command buffer + // prevents an expensive queue submission. + v_self.buffer( + command_buffer, + vTensor::Stage::Transfer, + vTensor::Access::Write)); + } command_pool.submit(context->gpu().queue, command_buffer); } // CPU -> Vulkan else { + api::Command::Buffer& command_buffer = command_pool.stream(); // Don't collect the timestamp since the command buffer doesn't record anything const Tensor cpu_src = src.device().is_cpu() ? src : src.cpu(); // Requesting write-only host access to the tensor never triggers a sync @@ -75,6 +80,7 @@ Tensor& copy_(Tensor& self, const Tensor& src) { } // Vulkan -> X else if (at::kVulkan == src.device().type()) { + api::Command::Buffer& command_buffer = command_pool.stream(); // Don't collect the timestamp since the command buffer doesn't record anything const vTensor& v_src = convert(src); // Vulkan -> CPU diff --git a/aten/src/ATen/native/vulkan/ops/Gru.cpp b/aten/src/ATen/native/vulkan/ops/Gru.cpp new file mode 100644 index 000000000000..8395dc8bebde --- /dev/null +++ b/aten/src/ATen/native/vulkan/ops/Gru.cpp @@ -0,0 +1,246 @@ +#include +#include + +namespace at { +namespace native { +namespace vulkan { +namespace ops { +namespace { +// +// input_vk: input tensor of shape (L, N, H_in) when batch_first=False +// (N, L, H_in) when batch_first=True containing the features of the input sequence +// hx_vk: initial hidden state for each element in the batch. 
tensor of shape (D * num_layers, N, H_out) +// output: tensor of shape (N, L, D * H_out)) when batch_first=True +// h_n: tensor of shape (D * num_layers, N, H_out) +// +// where +// L = sequence length +// N = batch size +// D = 2 if bidirectional=True otherwise 1 +// H_in = input_size (# of expected features in the input x) +// H_out = hidden_size (# of features in the hidden state h) +// +std::tuple gru_input( + const Tensor & input_vk, // input sequence (vulkan) + const Tensor & hx_vk, // initial hidden state (vulkan) + TensorList params_cpu, // weights/biases (cpu) + bool has_biases, + int64_t num_layers, + double dropout, + bool train, + bool bidirectional, + bool batch_first) { + TORCH_CHECK(params_cpu.size() == 4 * num_layers, "Vulkan gru expects 'params_cpu' size to be 4 * 'num_layers'."); + TORCH_INTERNAL_ASSERT(input_vk.sizes().size() == 3, "Vulkan gru expects 'input_vk' dims to be 3."); + TORCH_INTERNAL_ASSERT(hx_vk.sizes().size() == 3, "Vulkan gru expects 'hx_vk' dims to be 3."); + TORCH_INTERNAL_ASSERT(has_biases, "Vulkan gru expects 'has_biases' to be true."); + TORCH_INTERNAL_ASSERT(!train, "Vulkan gru expects 'train' to be false."); + TORCH_INTERNAL_ASSERT(!bidirectional, "Vulkan gru expects 'bidirectional' to be false."); + TORCH_INTERNAL_ASSERT(batch_first, "Vulkan gru expects 'batch_first' to be true."); + TORCH_INTERNAL_ASSERT(dropout < std::numeric_limits::epsilon()*1000, "Vulkan gru expects 'dropout' to be 0.0."); + + const auto h_in = input_vk.size(2); + std::vector h_n_list; // hidden output + + // reshape to 2D due to Vulkan at::mm op accepts only 2D + auto x = input_vk.reshape({input_vk.size(0) * input_vk.size(1), input_vk.size(2)}); + + for (int64_t i = 0; i < num_layers; ++i) { + // extract each hidden state and squeeze into 2D dim + auto h = at::slice(hx_vk, 0, i, i + 1, 1); + h = h.reshape({h.size(0) * h.size(1), h.size(2)}); + + const auto& w_ih = params_cpu[i * 4]; + const auto& w_hh = params_cpu[i * 4 + 1]; + const auto& b_ih = params_cpu[i * 4 + 2]; + const auto& b_hh = params_cpu[i * 4 + 3]; + + const auto& w_i_rzn = w_ih.split(h_in); + const auto& w_h_rzn = w_hh.split(h_in); + const auto& b_i_rzn = b_ih.split(h_in); + const auto& b_h_rzn = b_hh.split(h_in); + + const auto& w_ir = w_i_rzn[0]; + const auto& w_iz = w_i_rzn[1]; + const auto& w_in = w_i_rzn[2]; + const auto& w_hr = w_h_rzn[0]; + const auto& w_hz = w_h_rzn[1]; + const auto& w_hn = w_h_rzn[2]; + const auto& b_ir = b_i_rzn[0]; + const auto& b_iz = b_i_rzn[1]; + const auto& b_in = b_i_rzn[2]; + const auto& b_hr = b_h_rzn[0]; + const auto& b_hz = b_h_rzn[1]; + const auto& b_hn = b_h_rzn[2]; + + const auto& r = at::sigmoid(at::addmm(b_ir, x, w_ir.t()) + at::addmm(b_hr, h, w_hr.t())); + const auto& z = at::sigmoid(at::addmm(b_iz, x, w_iz.t()) + at::addmm(b_hz, h, w_hz.t())); + const auto& n = at::tanh(at::addmm(b_in, x, w_in.t()) + r * (at::addmm(b_hn, h, w_hn.t()))); + h = (z * (-1) + 1) * n + z * h; + x = h; // next input + h_n_list.emplace_back(h.reshape({1, 1, h.size(0), h.size(1)})); // 2D to 4D for cat op + } + + auto h_n = at::cat(h_n_list, 1); + h_n = h_n.reshape({h_n.size(0) * h_n.size(1), h_n.size(2), h_n.size(3)}); + return std::tuple(x, h_n); +} + +#ifdef USE_VULKAN_API + +TORCH_LIBRARY_IMPL(aten, Vulkan, m) { + m.impl(TORCH_SELECTIVE_NAME("aten::gru.input"), TORCH_FN(gru_input)); +} + +#endif /* USE_VULKAN_API */ + +} // namespace + +std::vector pack_linear_op_contexts( + const std::vector& params_cpu, + int64_t num_layers) { + TORCH_CHECK(params_cpu.size() == 4 * num_layers, "Vulkan 
gru expects 'params_cpu' size to be 4 * 'num_layers'."); + std::vector linear_op_contexts; + for (int64_t i = 0; i < num_layers; ++i) { + const auto& w_ih = params_cpu.at(i * 4); + const auto& w_hh = params_cpu.at(i * 4 + 1); + const auto& b_ih = params_cpu.at(i * 4 + 2); + const auto& b_hh = params_cpu.at(i * 4 + 3); + const auto& h_in = w_ih.size(0) / 3; + + const auto& w_i_rzn = w_ih.split(h_in); + const auto& w_h_rzn = w_hh.split(h_in); + const auto& b_i_rzn = b_ih.split(h_in); + const auto& b_h_rzn = b_hh.split(h_in); + + const auto& w_ir = w_i_rzn[0]; + const auto& w_iz = w_i_rzn[1]; + const auto& w_in = w_i_rzn[2]; + const auto& w_hr = w_h_rzn[0]; + const auto& w_hz = w_h_rzn[1]; + const auto& w_hn = w_h_rzn[2]; + const auto& b_ir = b_i_rzn[0]; + const auto& b_iz = b_i_rzn[1]; + const auto& b_in = b_i_rzn[2]; + const auto& b_hr = b_h_rzn[0]; + const auto& b_hz = b_h_rzn[1]; + const auto& b_hn = b_h_rzn[2]; + + linear_op_contexts.emplace_back(LinearOpContext::create(w_ir.t(), b_ir)); + linear_op_contexts.emplace_back(LinearOpContext::create(w_hr.t(), b_hr)); + linear_op_contexts.emplace_back(LinearOpContext::create(w_iz.t(), b_iz)); + linear_op_contexts.emplace_back(LinearOpContext::create(w_hz.t(), b_hz)); + linear_op_contexts.emplace_back(LinearOpContext::create(w_in.t(), b_in)); + linear_op_contexts.emplace_back(LinearOpContext::create(w_hn.t(), b_hn)); + } + return linear_op_contexts; +} + +GruOpContext::GruOpContext( + const std::vector& params_cpu, + bool has_biases, + int64_t num_layers, + double dropout, + bool train, + bool bidirectional, + bool batch_first) + : packed_{pack_linear_op_contexts(params_cpu, num_layers), has_biases, num_layers, dropout, train, bidirectional, batch_first}, + unpacked_{params_cpu, has_biases, num_layers, dropout, train, bidirectional, batch_first} { + TORCH_INTERNAL_ASSERT(packed_.has_biases, "Vulkan gru expects 'has_biases' to be true."); + TORCH_INTERNAL_ASSERT(!packed_.train, "Vulkan gru expects 'train' to be false."); + TORCH_INTERNAL_ASSERT(!packed_.bidirectional, "Vulkan gru expects 'bidirectional' to be false."); + TORCH_INTERNAL_ASSERT(packed_.batch_first, "Vulkan gru expects 'batch_first' to be true."); + TORCH_INTERNAL_ASSERT(packed_.dropout < std::numeric_limits::epsilon()*1000, "Vulkan gru expects 'dropout' to be 0.0."); +} + +GruOpContext GruOpContext::create( + const std::vector& params_cpu, // weights/biases (cpu) + bool has_biases, + int64_t num_layers, + double dropout, + bool train, + bool bidirectional, + bool batch_first) { + return GruOpContext{ + params_cpu, + has_biases, + num_layers, + dropout, + train, + bidirectional, + batch_first + }; +} + +std::tuple GruOpContext::run( + const Tensor & input_vk, // input sequence (vulkan) + const Tensor & hx_vk) const { // initial hidden state (vulkan) + TORCH_INTERNAL_ASSERT(input_vk.sizes().size() == 3, "Vulkan gru expects 'input_vk' dims to be 3."); + TORCH_INTERNAL_ASSERT(hx_vk.sizes().size() == 3, "Vulkan gru expects 'hx_vk' dims to be 3."); + + const int64_t linear_op_contexts_per_layer = 6; // (b_ir, w_ir), (b_hr, w_hr), (b_iz, w_iz), (b_hz, w_hz), (b_in, w_in), (b_hn, w_hn) + std::vector h_n_list; // hidden output + + // reshape to 2D due to Vulkan at::mm op accepts only 2D + auto x = input_vk.reshape({input_vk.size(0) * input_vk.size(1), input_vk.size(2)}); + + for (int64_t i = 0; i < packed_.num_layers; ++i) { + // extract each hidden state and squeeze into 2D dim + auto h = at::slice(hx_vk, 0, i, i + 1, 1); + h = h.reshape({h.size(0) * h.size(1), h.size(2)}); + + const 
auto& cxt_ir = packed_.linear_op_contexts[i * linear_op_contexts_per_layer + 0]; + const auto& cxt_hr = packed_.linear_op_contexts[i * linear_op_contexts_per_layer + 1]; + const auto& cxt_iz = packed_.linear_op_contexts[i * linear_op_contexts_per_layer + 2]; + const auto& cxt_hz = packed_.linear_op_contexts[i * linear_op_contexts_per_layer + 3]; + const auto& cxt_in = packed_.linear_op_contexts[i * linear_op_contexts_per_layer + 4]; + const auto& cxt_hn = packed_.linear_op_contexts[i * linear_op_contexts_per_layer + 5]; + + const auto& r = at::sigmoid(cxt_ir.run(x, 1.0f, 1.0f, "aten::addmm") + cxt_hr.run(h, 1.0f, 1.0f, "aten::addmm")); + const auto& z = at::sigmoid(cxt_iz.run(x, 1.0f, 1.0f, "aten::addmm") + cxt_hz.run(h, 1.0f, 1.0f, "aten::addmm")); + const auto& n = at::tanh(cxt_in.run(x, 1.0f, 1.0f, "aten::addmm") + r * (cxt_hn.run(h, 1.0f, 1.0f, "aten::addmm"))); + h = (z * (-1) + 1) * n + z * h; + x = h; // next input + h_n_list.emplace_back(h.reshape({1, 1, h.size(0), h.size(1)})); // 2D to 4D for cat op + } + + auto h_n = at::cat(h_n_list, 1); + h_n = h_n.reshape({h_n.size(0) * h_n.size(1), h_n.size(2), h_n.size(3)}); + return std::tuple(x, h_n); +} + +GruOpContext::State GruOpContext::unpack() const { + return GruOpContext::State{ + unpacked_.params_cpu, + unpacked_.has_biases, + unpacked_.num_layers, + unpacked_.dropout, + unpacked_.train, + unpacked_.bidirectional, + unpacked_.batch_first, + }; +} + +c10::intrusive_ptr gru_prepack( + std::vector&& params_cpu, + bool has_biases, + int64_t num_layers, + double dropout, + bool train, + bool bidirectional, + bool batch_first) { + return c10::make_intrusive(GruOpContext::create( + params_cpu, has_biases, num_layers, dropout, train, bidirectional, batch_first)); +} + +std::tuple gru_run( + const Tensor& input_vk, + const Tensor& hx_vk, + const c10::intrusive_ptr& context) { + return context->run(input_vk, hx_vk); +} + +} // namespace ops +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/vulkan/ops/Gru.h b/aten/src/ATen/native/vulkan/ops/Gru.h new file mode 100644 index 000000000000..8000aa449ca4 --- /dev/null +++ b/aten/src/ATen/native/vulkan/ops/Gru.h @@ -0,0 +1,85 @@ +#pragma once + +#ifdef USE_VULKAN_API + +#include +#include +#include + +namespace at { +namespace native { +namespace vulkan { +namespace ops { + +class GruOpContext final : public torch::jit::CustomClassHolder { + public: + static GruOpContext create( + const std::vector& params_cpu, // weights/biases (cpu) + bool has_biases, + int64_t num_layers, + double dropout, + bool train, + bool bidirectional, + bool batch_first); + + using State = std::tuple, bool, int64_t, double, bool, bool, bool>; + + std::tuple run( + const Tensor& input_vk, + const Tensor & hx_vk) const; + State unpack() const; + + private: + GruOpContext( + const std::vector& params_cpu, // weights/biases (cpu) + bool has_biases, + int64_t num_layers, + double dropout, + bool train, + bool bidirectional, + bool batch_first); + + private: + struct { + std::vector linear_op_contexts; // {{ op context for b_ir, w_ir, op context for b_hr, w_hr, + // op context for b_iz, w_iz, op context for b_hz, w_hz, + // op context for b_in, w_in, op context for b_hn, w_hn,}, ...} + bool has_biases{}; + int64_t num_layers{}; + double dropout{}; + bool train{}; + bool bidirectional{}; + bool batch_first{}; + } packed_; + + struct { + std::vector params_cpu; // weights/biases (cpu) + bool has_biases{}; + int64_t num_layers{}; + double dropout{}; + bool train{}; + bool 
bidirectional{}; + bool batch_first{}; + } unpacked_; +}; + +c10::intrusive_ptr gru_prepack( + std::vector&& params_cpu, // weights/biases (cpu) + bool has_biases, + int64_t num_layers, + double dropout, + bool train, + bool bidirectional, + bool batch_first); + +std::tuple gru_run( + const Tensor& input_vk, + const Tensor & hx_vk, + const c10::intrusive_ptr& context); + +} // namespace ops +} // namespace vulkan +} // namespace native +} // namespace at + +#endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/native/vulkan/ops/Lerp.cpp b/aten/src/ATen/native/vulkan/ops/Lerp.cpp new file mode 100644 index 000000000000..4a1a351919c5 --- /dev/null +++ b/aten/src/ATen/native/vulkan/ops/Lerp.cpp @@ -0,0 +1,400 @@ +#include +#include +#include + +namespace at { +namespace native { +namespace vulkan { +namespace ops { +namespace { + +using namespace api::utils; + +void check_inputs_elementwise_op(const Tensor& input1, const Tensor& input2) { + TORCH_CHECK( + channels_size(input1) == channels_size(input2), + "Vulkan elementwise ops require channel dimension to be equal!"); + if (batch_size(input1) != batch_size(input2)) { + TORCH_CHECK( + channels_size(input1) % 4 == 0, + "Vulkan elementwise ops require channel to be a multiple of 4 to broadcast along batch dimension!") + } + + const uint32_t input1_h = height_size(input1); + const uint32_t input1_w = width_size(input1); + const uint32_t input2_h = height_size(input2); + const uint32_t input2_w = width_size(input2); + + const std::string broadcast_error_msg = + "Incompatible input dimensions for broadcasting for Vulkan elementwise op!"; + if (input1_h != input2_h) { + if (input1_h > input2_h) { + TORCH_CHECK(input2_h == 1, broadcast_error_msg); + TORCH_CHECK(input2_w == input1_w || input2_w == 1, broadcast_error_msg); + } else if (input2_h > input1_h) { + TORCH_CHECK(input1_h == 1, broadcast_error_msg); + TORCH_CHECK(input1_w == input2_w || input1_w == 1, broadcast_error_msg); + } + } else if (input1_w != input2_w) { + if (input1_w > input2_w) { + TORCH_CHECK(input2_w == 1, broadcast_error_msg); + } else if (input2_w > input1_w) { + TORCH_CHECK(input1_h == 1, broadcast_error_msg); + } + } +} + +Tensor _lerp_scalar( + const Tensor& start_arg, + const Tensor& end_arg, + const Scalar& weight_arg, + const std::string& op_name) { + check_inputs_elementwise_op(start_arg, end_arg); + api::Context* const context = api::context(); + + const Tensor start = start_arg.is_vulkan() ? start_arg : start_arg.vulkan(); + const vTensor& v_start = convert(start); + + const Tensor end = end_arg.is_vulkan() ? 
end_arg : end_arg.vulkan(); + const vTensor& v_end = convert(end); + + vTensor v_output{ + context, + v_start.sizes(), + v_start.options(), + }; + + api::Command::Pool& command_pool = context->command().pool; + api::Command::Buffer& command_buffer = command_pool.stream(); + { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + + if C10_LIKELY (v_start.has_image() && v_end.has_image()) { + const float weight = weight_arg.to(); + const struct Block final { + uvec3 extents; + uint32_t fill_0; + uvec3 input1_extents; + uint32_t fill_1; + uvec3 input2_extents; + float weight; + } block{ + v_output.extents(), + 0u, + v_start.extents(), + 0u, + v_end.extents(), + weight, + }; + + context->dispatch( + command_buffer, + { + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + }, + VK_KERNEL(lerp_scalar), + v_output.extents(), + adaptive_work_group_size(v_output.extents()), + // Write-only access bypasses synchronization but inserts appropriate + // barriers if necessary. + v_output.image( + command_buffer, vTensor::Stage::Compute, vTensor::Access::Write), + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + v_start.image(command_buffer, vTensor::Stage::Compute), + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + v_end.image(command_buffer, vTensor::Stage::Compute), + // Object lifetime is managed by the resource pool. + // It is OK not to keep track of the handle. + context->resource().pool.uniform(block).object); + } else { + TORCH_CHECK(false, "Not implemented!"); + } + } + command_pool.submit(context->gpu().queue, command_buffer); + + return convert(v_output); +} + +Tensor& _lerp_scalar_( + Tensor& self, + const Tensor& end_arg, + const Scalar& weight_arg, + const std::string& op_name) { + check_inputs_elementwise_op(self, end_arg); + api::Context* const context = api::context(); + + TORCH_CHECK( + self.is_vulkan(), + "Vulkan: In-place lerp is only supported on Vulkan tensors."); + + vTensor& v_self = convert(self); + + const Tensor end = end_arg.is_vulkan() ? end_arg : end_arg.vulkan(); + const vTensor& v_end = convert(end); + + api::Command::Pool& command_pool = context->command().pool; + api::Command::Buffer& command_buffer = command_pool.stream(); + { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + + if C10_LIKELY ( + v_self.has_image() && v_end.has_image() && !self.is_same(end)) { + const float weight = weight_arg.to(); + const struct Block final { + uvec3 extents; + uint32_t fill_0; + uvec3 input_extents; + float alpha; + } block{ + v_self.extents(), + 0u, + v_end.extents(), + weight, + }; + + context->dispatch( + command_buffer, + { + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + }, + VK_KERNEL(lerp_scalar_), + v_self.extents(), + adaptive_work_group_size(v_self.extents()), + // Read-Write access triggers an async synchronization if necessory + // and inserts appropriate barriers if hazards are detected. + v_self.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Read | vTensor::Access::Write), + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + v_end.image(command_buffer, vTensor::Stage::Compute), + // Object lifetime is managed by the resource pool. 
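
For reference, the lerp kernels at the top of this diff and the `_lerp_scalar` / `_lerp_scalar_` ops here all compute the standard linear interpolation out = start + weight * (end - start); the in-place variant simply reads the output image itself as the start operand. A minimal CPU reference for that formula follows, as a sanity check only and not tied to the Vulkan path.

    #include <cstdio>

    // CPU reference for the interpolation computed by the lerp shaders and ops
    // above: out = start + weight * (end - start). Sanity check only.
    float lerp_ref(float start, float end, float weight) {
      return start + weight * (end - start);
    }

    int main() {
      std::printf("%f\n", lerp_ref(1.0f, 5.0f, 0.25f));  // 2.000000
      std::printf("%f\n", lerp_ref(1.0f, 5.0f, 1.0f));   // 5.000000 (weight = 1 returns end)
      return 0;
    }
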
+ // It is OK not to keep track of the handle. + context->resource().pool.uniform(block).object); + } else { + TORCH_CHECK(false, "Not implemented!"); + } + } + command_pool.submit(context->gpu().queue, command_buffer); + + return self; +} + +Tensor _lerp_tensor( + const Tensor& start_arg, + const Tensor& end_arg, + const Tensor& weight_arg, + const std::string& op_name) { + check_inputs_elementwise_op(start_arg, end_arg); + check_inputs_elementwise_op(start_arg, weight_arg); + api::Context* const context = api::context(); + + const Tensor start = start_arg.is_vulkan() ? start_arg : start_arg.vulkan(); + const vTensor& v_start = convert(start); + + const Tensor end = end_arg.is_vulkan() ? end_arg : end_arg.vulkan(); + const vTensor& v_end = convert(end); + + const Tensor weight = weight_arg.is_vulkan() ? weight_arg : weight_arg.vulkan(); + const vTensor& v_weight = convert(weight); + + vTensor v_output{ + context, + v_start.sizes(), + v_start.options(), + }; + + api::Command::Pool& command_pool = context->command().pool; + api::Command::Buffer& command_buffer = command_pool.stream(); + { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + + if C10_LIKELY (v_start.has_image() && v_end.has_image() && v_weight.has_image()) { + const struct Block final { + uvec3 extents; + uint32_t fill_0; + uvec3 input1_extents; + uint32_t fill_1; + uvec3 input2_extents; + uint32_t fill_2; + uvec3 input3_extents; + uint32_t fill_3; + } block{ + v_output.extents(), + 0u, + v_start.extents(), + 0u, + v_end.extents(), + 0u, + v_weight.extents(), + 0u, + }; + + context->dispatch( + command_buffer, + { + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + }, + VK_KERNEL(lerp), + v_output.extents(), + adaptive_work_group_size(v_output.extents()), + // Write-only access bypasses synchronization but inserts appropriate + // barriers if necessary. + v_output.image( + command_buffer, vTensor::Stage::Compute, vTensor::Access::Write), + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + v_start.image(command_buffer, vTensor::Stage::Compute), + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + v_end.image(command_buffer, vTensor::Stage::Compute), + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + v_weight.image(command_buffer, vTensor::Stage::Compute), + // Object lifetime is managed by the resource pool. + // It is OK not to keep track of the handle. + context->resource().pool.uniform(block).object); + } else { + TORCH_CHECK(false, "Not implemented!"); + } + } + command_pool.submit(context->gpu().queue, command_buffer); + + return convert(v_output); +} + +Tensor& _lerp_tensor_( + Tensor& self, + const Tensor& end_arg, + const Tensor& weight_arg, + const std::string& op_name) { + check_inputs_elementwise_op(self, end_arg); + check_inputs_elementwise_op(self, weight_arg); + api::Context* const context = api::context(); + + TORCH_CHECK( + self.is_vulkan(), + "Vulkan: In-place lerp is only supported on Vulkan tensors."); + + vTensor& v_self = convert(self); + + const Tensor end = end_arg.is_vulkan() ? end_arg : end_arg.vulkan(); + const vTensor& v_end = convert(end); + + const Tensor weight = weight_arg.is_vulkan() ? 
weight_arg : weight_arg.vulkan(); + const vTensor& v_weight = convert(weight); + + api::Command::Pool& command_pool = context->command().pool; + api::Command::Buffer& command_buffer = command_pool.stream(); + { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + + if C10_LIKELY ( + v_self.has_image() && v_end.has_image() && v_weight.has_image() && !self.is_same(end)) { + const struct Block final { + uvec3 extents; + uint32_t fill_0; + uvec3 input1_extents; + uint32_t fill_1; + uvec3 input2_extents; + uint32_t fill_2; + } block{ + v_self.extents(), + 0u, + v_end.extents(), + 0u, + v_weight.extents(), + 0u, + }; + + context->dispatch( + command_buffer, + { + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + }, + VK_KERNEL(lerp_), + v_self.extents(), + adaptive_work_group_size(v_self.extents()), + // Read-Write access triggers an async synchronization if necessory + // and inserts appropriate barriers if hazards are detected. + v_self.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Read | vTensor::Access::Write), + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + v_end.image(command_buffer, vTensor::Stage::Compute), + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + v_weight.image(command_buffer, vTensor::Stage::Compute), + // Object lifetime is managed by the resource pool. + // It is OK not to keep track of the handle. + context->resource().pool.uniform(block).object); + } else { + TORCH_CHECK(false, "Not implemented!"); + } + } + command_pool.submit(context->gpu().queue, command_buffer); + + return self; +} + +Tensor lerp_scalar(const Tensor& start, const Tensor& end, const Scalar& weight) { + return _lerp_scalar( + start, end, weight, "aten::lerp.Scalar"); +} + +Tensor& lerp_scalar_(Tensor& self, const Tensor& end, const Scalar& weight) { + return _lerp_scalar_( + self, end, weight, "aten::lerp_.Scalar"); +} + +Tensor lerp_tensor(const Tensor& start, const Tensor& end, const Tensor& weight) { + if (weight.sizes().size() == 0) { + return _lerp_scalar( + start, end, weight.item(), "aten::lerp.Tensor"); + } + return _lerp_tensor( + start, end, weight, "aten::lerp.Tensor"); +} + +Tensor& lerp_tensor_(Tensor& self, const Tensor& end, const Tensor& weight) { + if (weight.sizes().size() == 0) { + return _lerp_scalar_( + self, end, weight.item(), "aten::lerp_.Tensor"); + } + return _lerp_tensor_( + self, end, weight, "aten::lerp_.Tensor"); +} + +#ifdef USE_VULKAN_API + +TORCH_LIBRARY_IMPL(aten, Vulkan, m) { + m.impl(TORCH_SELECTIVE_NAME("aten::lerp.Scalar"), TORCH_FN(lerp_scalar)); + m.impl(TORCH_SELECTIVE_NAME("aten::lerp_.Scalar"), TORCH_FN(lerp_scalar_)); + m.impl(TORCH_SELECTIVE_NAME("aten::lerp.Tensor"), TORCH_FN(lerp_tensor)); + m.impl(TORCH_SELECTIVE_NAME("aten::lerp_.Tensor"), TORCH_FN(lerp_tensor_)); +} + +#endif /* USE_VULKAN_API */ + +} // namespace +} // namespace ops +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/vulkan/ops/Mean.cpp b/aten/src/ATen/native/vulkan/ops/Mean.cpp index 947cb2c5e39d..3e678056fc3b 100644 --- a/aten/src/ATen/native/vulkan/ops/Mean.cpp +++ b/aten/src/ATen/native/vulkan/ops/Mean.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -55,6 +56,8 @@ Tensor mean( api::Command::Pool& command_pool = context->command().pool; 
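
The `lerp_tensor` / `lerp_tensor_` wrappers registered just above route a zero-dimensional `weight` to the scalar kernels via `weight.item()`, the same fallback added to the `add`/`sub`/`mul`/`div` tensor wrappers earlier in this diff. Below is a small stand-alone sketch of that dispatch-on-rank idea; the `FakeTensor` struct is a stand-in for illustration, not the ATen `Tensor` API.

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Stand-in type for illustration only; not the ATen Tensor class.
    struct FakeTensor {
      std::vector<int64_t> sizes;
      float scalar_value = 0.0f;  // meaningful only when sizes is empty (0-dim)
    };

    void run_scalar_kernel(float w) { std::cout << "scalar kernel, w = " << w << "\n"; }
    void run_tensor_kernel(const FakeTensor&) { std::cout << "tensor kernel\n"; }

    // Mirrors the dispatch in lerp_tensor above: a zero-dimensional weight is
    // semantically a scalar, so it is routed to the cheaper scalar shader
    // instead of broadcasting a one-element texture.
    void lerp_dispatch(const FakeTensor& weight) {
      if (weight.sizes.size() == 0) {
        run_scalar_kernel(weight.scalar_value);
      } else {
        run_tensor_kernel(weight);
      }
    }

    int main() {
      lerp_dispatch(FakeTensor{{}, 0.25f});     // -> scalar kernel
      lerp_dispatch(FakeTensor{{1, 4, 8, 8}});  // -> tensor kernel
      return 0;
    }
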
api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), "aten::mean.dim"); + if C10_LIKELY(v_input.has_image()) { const struct Block final { uvec3 extents; diff --git a/aten/src/ATen/native/vulkan/ops/Mm.cpp b/aten/src/ATen/native/vulkan/ops/Mm.cpp index b19f02af0b7e..04c65677c962 100644 --- a/aten/src/ATen/native/vulkan/ops/Mm.cpp +++ b/aten/src/ATen/native/vulkan/ops/Mm.cpp @@ -1,3 +1,4 @@ +#include #include #include @@ -16,7 +17,7 @@ vTensor pack_weights( } api::Context* const context = api::context(); - api::Command::Buffer& command_buffer = context->command().pool.stream(); + api::Command::Buffer& command_buffer = context->command().pool.stream(); // Don't collect the timestamp since the command buffer doesn't record anything const Tensor weight = weight_arg.contiguous(); const IntArrayRef w_sizes = weight.sizes(); @@ -70,7 +71,7 @@ vTensor pack_biases( } api::Context* const context = api::context(); - api::Command::Buffer& command_buffer = context->command().pool.stream(); + api::Command::Buffer& command_buffer = context->command().pool.stream(); // Don't collect the timestamp since the command buffer doesn't record anything using Future = vTensor::Future; if (bias_arg) { @@ -193,7 +194,8 @@ Tensor addmm( bias).run( input, alpha.to(), - beta.to()); + beta.to(), + "aten::addmm"); } Tensor mm( @@ -204,7 +206,8 @@ Tensor mm( c10::optional()).run( mat1_arg, 1.0f, - 1.0f); + 1.0f, + "aten::mm"); } #ifdef USE_VULKAN_API @@ -250,7 +253,8 @@ LinearOpContext LinearOpContext::create( Tensor LinearOpContext::run( const Tensor& input_arg, const float alpha, - const float beta) const { + const float beta, + const std::string& op_name) const { api::Context* const context = api::context(); const Tensor input = input_arg.is_vulkan() ? 
input_arg : input_arg.vulkan(); @@ -278,9 +282,10 @@ Tensor LinearOpContext::run( }; api::Command::Pool& command_pool = context->command().pool; - api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + if (v_input.has_image() && packed_.v_weight.has_image() && packed_.v_bias.has_image()) { @@ -412,7 +417,7 @@ c10::intrusive_ptr linear_prepack( Tensor linear_run( const Tensor& input, const c10::intrusive_ptr& context) { - return context->run(input, 1.0, 1.0); + return context->run(input, 1.0, 1.0, "prepacked::linear_clamp_run"); } } // namespace ops diff --git a/aten/src/ATen/native/vulkan/ops/Mm.h b/aten/src/ATen/native/vulkan/ops/Mm.h index 1dfef32ba9a7..5603f5e51821 100644 --- a/aten/src/ATen/native/vulkan/ops/Mm.h +++ b/aten/src/ATen/native/vulkan/ops/Mm.h @@ -18,7 +18,7 @@ class LinearOpContext final : public torch::jit::CustomClassHolder { using State = std::tuple>; - Tensor run(const Tensor& input, float beta, float alpha) const; + Tensor run(const Tensor& input, float beta, float alpha, const std::string& op_name) const; State unpack() const; private: diff --git a/aten/src/ATen/native/vulkan/ops/Padding.cpp b/aten/src/ATen/native/vulkan/ops/Padding.cpp index 8d16093bd384..dcbd3a326fea 100644 --- a/aten/src/ATen/native/vulkan/ops/Padding.cpp +++ b/aten/src/ATen/native/vulkan/ops/Padding.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -55,6 +56,8 @@ Tensor reflection_pad2d(const Tensor& self_arg, IntArrayRef padding) { api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), "aten::reflection_pad2d"); + if C10_LIKELY (v_output.has_image() && v_self.has_image()) { const struct Block final { uvec3 extents; diff --git a/aten/src/ATen/native/vulkan/ops/Permute.cpp b/aten/src/ATen/native/vulkan/ops/Permute.cpp index 29fed363d115..557c99592af0 100644 --- a/aten/src/ATen/native/vulkan/ops/Permute.cpp +++ b/aten/src/ATen/native/vulkan/ops/Permute.cpp @@ -1,3 +1,4 @@ +#include #include #include @@ -13,63 +14,65 @@ Tensor permute_4d(const Tensor& input, const uvec4& in_size, const uvec4& out_si api::Context* const context = api::context(); api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); - - auto dst_image = v_output.image( - command_buffer, - vTensor::Stage::Compute, - vTensor::Access::Read | vTensor::Access::Write); - - const Tensor self = input.is_vulkan() ? 
input : input.vulkan(); - const vTensor& v_self = convert(self); - if C10_LIKELY(v_output.has_image() && v_self.has_image()) { - auto src_image = v_self.image( - command_buffer, - vTensor::Stage::Compute); - - const struct Block final { - uvec3 size; // output texture size - uint32_t fill_0; // dummy - uvec3 isize; // input texture size - uint32_t fill_1; // dummy - uvec4 tensor_size; // output tensor size - uvec4 itensor_size; // input tensor size - uvec4 dims; // output dims - } block { - v_output.extents(), - 0u, - v_self.extents(), - 0u, - out_size, - in_size, - out_dims, - }; - - context->dispatch( - command_buffer, - { - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, - }, - VK_KERNEL(permute_4d), - // build up shader operations from the output texture point of view - // to avoid the nondeterministic order of GPU shader operations between texels + { + api::OpProfiler profiler(command_buffer, context->querypool(), "aten::permute (permute_4d)"); + + auto dst_image = v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Read | vTensor::Access::Write); + + const Tensor self = input.is_vulkan() ? input : input.vulkan(); + const vTensor& v_self = convert(self); + if C10_LIKELY(v_output.has_image() && v_self.has_image()) { + auto src_image = v_self.image( + command_buffer, + vTensor::Stage::Compute); + + const struct Block final { + uvec3 size; // output texture size + uint32_t fill_0; // dummy + uvec3 isize; // input texture size + uint32_t fill_1; // dummy + uvec4 tensor_size; // output tensor size + uvec4 itensor_size; // input tensor size + uvec4 dims; // output dims + } block { v_output.extents(), - context->gpu().adapter->local_work_group_size(), - // Read/Write access bypasses synchronization but inserts appropriate - // barriers if necessary. - dst_image, - // Read-only access is implied on const tensors and triggers an async - // synchronization if necessary. - src_image, - // Object lifetime is managed by the resource pool. - // It is OK not to keep track of the handle. - context->resource().pool.uniform(block).object); - } - else { - TORCH_CHECK(false, "Not implemented!"); + 0u, + v_self.extents(), + 0u, + out_size, + in_size, + out_dims, + }; + + context->dispatch( + command_buffer, + { + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + }, + VK_KERNEL(permute_4d), + // build up shader operations from the output texture point of view + // to avoid the nondeterministic order of GPU shader operations between texels + v_output.extents(), + context->gpu().adapter->local_work_group_size(), + // Read/Write access bypasses synchronization but inserts appropriate + // barriers if necessary. + dst_image, + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + src_image, + // Object lifetime is managed by the resource pool. + // It is OK not to keep track of the handle. 
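Most hunks in this diff wrap the command-buffer recording in a new braced scope whose first statement constructs an api::OpProfiler, so that the profiler's begin/end query-pool timestamps bracket exactly the commands recorded for that op. The snippet below is a standalone sketch of that RAII scoping idea only; ScopedTimer is hypothetical and measures wall-clock time, not the GPU timestamps the real OpProfiler collects.

#include <chrono>
#include <iostream>
#include <string>

// Hypothetical stand-in for api::OpProfiler: construction marks the start of
// the measured region, destruction (at the closing brace) marks the end.
class ScopedTimer {
 public:
  explicit ScopedTimer(std::string name)
      : name_(std::move(name)), start_(std::chrono::steady_clock::now()) {}
  ~ScopedTimer() {
    const auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                        std::chrono::steady_clock::now() - start_).count();
    std::cout << name_ << " took " << us << " us\n";
  }

 private:
  std::string name_;
  std::chrono::steady_clock::time_point start_;
};

int main() {
  {
    ScopedTimer profiler("aten::lerp.Tensor");  // mirrors the op_name argument
    // ... record and dispatch work inside the scope ...
  }  // measurement ends here, when the scope closes
  return 0;
}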
+ context->resource().pool.uniform(block).object); + } + else { + TORCH_CHECK(false, "Not implemented!"); + } } - command_pool.submit(context->gpu().queue, command_buffer); return convert(v_output); diff --git a/aten/src/ATen/native/vulkan/ops/Pool.cpp b/aten/src/ATen/native/vulkan/ops/Pool.cpp index 6c67ada1d747..7a2fe98ba7d4 100644 --- a/aten/src/ATen/native/vulkan/ops/Pool.cpp +++ b/aten/src/ATen/native/vulkan/ops/Pool.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -36,6 +37,8 @@ Tensor adaptive_avg_pool2d( api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), "aten::_adaptive_avg_pool2d"); + if C10_LIKELY(v_self.has_image()) { const uvec3 v_output_size = v_output.extents(); const uvec3 v_self_size = v_self.extents(); @@ -101,7 +104,8 @@ Tensor pool2d( const IntArrayRef padding_arg, const IntArrayRef dilation_arg, const bool ceil_mode, - const api::Shader::Descriptor& shader_descriptor) { + const api::Shader::Descriptor& shader_descriptor, + const std::string& op_name) { if (stride_arg.empty()) { stride_arg = kernel_arg; } @@ -175,6 +179,8 @@ Tensor pool2d( api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + if C10_LIKELY(v_self.has_image()) { const struct Block final { uvec3 extents; @@ -257,7 +263,8 @@ Tensor avg_pool2d( padding_arg, {1,1}, ceil_mode, - VK_KERNEL(avg_pool2d) + VK_KERNEL(avg_pool2d), + "aten::avg_pool2d" ); } @@ -275,7 +282,8 @@ Tensor max_pool2d( padding_arg, dilation_arg, ceil_mode, - VK_KERNEL(max_pool2d) + VK_KERNEL(max_pool2d), + "aten::max_pool2d" ); } diff --git a/aten/src/ATen/native/vulkan/ops/Register.cpp b/aten/src/ATen/native/vulkan/ops/Register.cpp index 4b90fc8696e1..942836cf6838 100644 --- a/aten/src/ATen/native/vulkan/ops/Register.cpp +++ b/aten/src/ATen/native/vulkan/ops/Register.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -28,9 +29,9 @@ TORCH_LIBRARY(vulkan, m) { std::move(std::get<2>(state)), std::move(std::get<3>(state)), std::move(std::get<4>(state)), - std::move(std::get<5>(state)), - std::move(std::get<6>(state)), - std::move(std::get<7>(state))); + std::get<5>(state), + std::get<6>(state), + std::get<7>(state)); }); m.class_("TransposeConv2dOpContext") .def_pickle( @@ -47,9 +48,9 @@ TORCH_LIBRARY(vulkan, m) { std::move(std::get<3>(state)), std::move(std::get<4>(state)), std::move(std::get<5>(state)), - std::move(std::get<6>(state)), - std::move(std::get<7>(state)), - std::move(std::get<8>(state))); + std::get<6>(state), + std::get<7>(state), + std::get<8>(state)); }); m.class_("LinearOpContext") .def_pickle( @@ -62,6 +63,23 @@ TORCH_LIBRARY(vulkan, m) { return linear_prepack( std::move(std::get<0>(state)), std::move(std::get<1>(state))); }); + m.class_("GruOpContext") + .def_pickle( + // __getstate__ + [](const c10::intrusive_ptr& context) { + return context->unpack(); + }, + // __setstate__ + [](GruOpContext::State state) { + return gru_prepack( + std::move(std::get<0>(state)), + std::get<1>(state), + std::get<2>(state), + std::get<3>(state), + std::get<4>(state), + std::get<5>(state), + std::get<6>(state)); + }); } TORCH_LIBRARY(vulkan_prepack, m) { @@ -87,18 +105,33 @@ TORCH_LIBRARY(vulkan_prepack, m) { m.def(TORCH_SELECTIVE_SCHEMA( "vulkan_prepack::linear_run(Tensor X, " 
"__torch__.torch.classes.vulkan.LinearOpContext BW_prepack) -> Tensor Y")); + m.def(TORCH_SELECTIVE_SCHEMA( + "vulkan_prepack::gru_prepack(Tensor[] params_cpu, " + "bool has_biases, " + "int num_layers, " + "float dropout, " + "bool train, " + "bool bidirectional, " + "bool batch_first) " + "-> __torch__.torch.classes.vulkan.GruOpContext")); + m.def(TORCH_SELECTIVE_SCHEMA( + "vulkan_prepack::gru_run(Tensor input_vk, " + "Tensor hx_vk, " + "__torch__.torch.classes.vulkan.GruOpContext G_prepack) -> (Tensor next_input, Tensor hidden_layer)")); } TORCH_LIBRARY_IMPL(vulkan_prepack, CPU, m) { m.impl(TORCH_SELECTIVE_NAME("vulkan_prepack::conv2d_clamp_prepack"), TORCH_FN(conv2d_clamp_prepack)); m.impl(TORCH_SELECTIVE_NAME("vulkan_prepack::conv2d_transpose_clamp_prepack"), TORCH_FN(conv2d_transpose_clamp_prepack)); m.impl(TORCH_SELECTIVE_NAME("vulkan_prepack::linear_prepack"), TORCH_FN(linear_prepack)); + m.impl(TORCH_SELECTIVE_NAME("vulkan_prepack::gru_prepack"), TORCH_FN(gru_prepack)); } TORCH_LIBRARY_IMPL(vulkan_prepack, Vulkan, m) { m.impl(TORCH_SELECTIVE_NAME("vulkan_prepack::conv2d_clamp_run"), TORCH_FN(conv2d_clamp_run)); m.impl(TORCH_SELECTIVE_NAME("vulkan_prepack::conv2d_transpose_clamp_run"), TORCH_FN(conv2d_transpose_clamp_run)); m.impl(TORCH_SELECTIVE_NAME("vulkan_prepack::linear_run"), TORCH_FN(linear_run)); + m.impl(TORCH_SELECTIVE_NAME("vulkan_prepack::gru_run"), TORCH_FN(gru_run)); } Tensor convolution( diff --git a/aten/src/ATen/native/vulkan/ops/Shape.cpp b/aten/src/ATen/native/vulkan/ops/Shape.cpp index 160099f3754d..86a466942052 100644 --- a/aten/src/ATen/native/vulkan/ops/Shape.cpp +++ b/aten/src/ATen/native/vulkan/ops/Shape.cpp @@ -1,3 +1,4 @@ +#include #include #include @@ -7,9 +8,10 @@ namespace vulkan { namespace ops { namespace { -Tensor view( +Tensor view_internal( const Tensor& self_arg, - const IntArrayRef shape) { + const IntArrayRef shape, + const std::string& op_name) { api::Context* const context = api::context(); const Tensor self = self_arg.is_vulkan() ? self_arg : self_arg.vulkan(); @@ -24,6 +26,8 @@ Tensor view( api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + command_buffer.copy( // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. @@ -42,11 +46,17 @@ Tensor view( return convert(v_output); } +inline Tensor view( + const Tensor& self_arg, + const IntArrayRef shape) { + return view_internal(self_arg, shape, "aten::view"); +} + Tensor _reshape_alias( const Tensor& self_arg, const IntArrayRef shape, const IntArrayRef strides) { - return view(self_arg, shape); + return view_internal(self_arg, shape, "aten::_reshape_alias"); } #ifdef USE_VULKAN_API diff --git a/aten/src/ATen/native/vulkan/ops/Slice.cpp b/aten/src/ATen/native/vulkan/ops/Slice.cpp index 36f3a713b468..1d454c7ff709 100644 --- a/aten/src/ATen/native/vulkan/ops/Slice.cpp +++ b/aten/src/ATen/native/vulkan/ops/Slice.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -16,65 +17,67 @@ Tensor slice_4d(const Tensor& input, const int64_t dim, const int64_t start, con api::Context* const context = api::context(); api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); + { + api::OpProfiler profiler(command_buffer, context->querypool(), "aten::slice.Tensor (slice_4d)"); - const Tensor self = input.is_vulkan() ? 
input : input.vulkan(); - const vTensor& v_self = convert(self); - if C10_LIKELY(v_output.has_image() && v_self.has_image()) { - auto src_image = v_self.image( - command_buffer, - vTensor::Stage::Compute); - auto dst_image = v_output.image( - command_buffer, - vTensor::Stage::Compute, - vTensor::Access::Write); - - const struct Block final { - uvec3 size; // output texture size - uint32_t fill_0; // dummy - uvec3 isize; // input texture size - uint32_t fill_1; // dummy - uvec4 tensor_size; // output tensor size - uvec4 itensor_size; // input tensor size - uvec4 args; // input arguments (dim, start, end, step) - } block { - v_output.extents(), - 0u, - v_self.extents(), - 0u, - out_tsize, - in_tsize, - { safe_downcast(dim), - safe_downcast(start), - safe_downcast(end), - safe_downcast(step) }, - }; - - context->dispatch( + const Tensor self = input.is_vulkan() ? input : input.vulkan(); + const vTensor& v_self = convert(self); + if C10_LIKELY(v_output.has_image() && v_self.has_image()) { + auto src_image = v_self.image( + command_buffer, + vTensor::Stage::Compute); + auto dst_image = v_output.image( command_buffer, - { - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, - }, - VK_KERNEL(slice_4d), - // build up shader operations from the output texture point of view - // to avoid the nondeterministic order of GPU shader operations between texels + vTensor::Stage::Compute, + vTensor::Access::Write); + + const struct Block final { + uvec3 size; // output texture size + uint32_t fill_0; // dummy + uvec3 isize; // input texture size + uint32_t fill_1; // dummy + uvec4 tensor_size; // output tensor size + uvec4 itensor_size; // input tensor size + uvec4 args; // input arguments (dim, start, end, step) + } block { v_output.extents(), - context->gpu().adapter->local_work_group_size(), - // Write-only access bypasses synchronization but inserts appropriate - // barriers if necessary. - dst_image, - // Read-only access is implied on const tensors and triggers an async - // synchronization if necessary. - src_image, - // Object lifetime is managed by the resource pool. - // It is OK not to keep track of the handle. - context->resource().pool.uniform(block).object); - } - else { - TORCH_CHECK(false, "Not implemented!"); - } + 0u, + v_self.extents(), + 0u, + out_tsize, + in_tsize, + { safe_downcast(dim), + safe_downcast(start), + safe_downcast(end), + safe_downcast(step) }, + }; + context->dispatch( + command_buffer, + { + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + }, + VK_KERNEL(slice_4d), + // build up shader operations from the output texture point of view + // to avoid the nondeterministic order of GPU shader operations between texels + v_output.extents(), + context->gpu().adapter->local_work_group_size(), + // Write-only access bypasses synchronization but inserts appropriate + // barriers if necessary. + dst_image, + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + src_image, + // Object lifetime is managed by the resource pool. + // It is OK not to keep track of the handle. 
+ context->resource().pool.uniform(block).object); + } + else { + TORCH_CHECK(false, "Not implemented!"); + } + } command_pool.submit(context->gpu().queue, command_buffer); return convert(v_output); } @@ -83,56 +86,58 @@ Tensor slice_width(const Tensor& input, const int64_t start, const int64_t end, api::Context* const context = api::context(); api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); + { + api::OpProfiler profiler(command_buffer, context->querypool(), "aten::slice.Tensor (slice_width)"); - const Tensor self = input.is_vulkan() ? input : input.vulkan(); - const vTensor& v_self = convert(self); - if C10_LIKELY(v_output.has_image() && v_self.has_image()) { - auto src_image = v_self.image( - command_buffer, - vTensor::Stage::Transfer); - auto dst_image = v_output.image( - command_buffer, - vTensor::Stage::Transfer, - vTensor::Access::Write); + const Tensor self = input.is_vulkan() ? input : input.vulkan(); + const vTensor& v_self = convert(self); + if C10_LIKELY(v_output.has_image() && v_self.has_image()) { + auto src_image = v_self.image( + command_buffer, + vTensor::Stage::Transfer); + auto dst_image = v_output.image( + command_buffer, + vTensor::Stage::Transfer, + vTensor::Access::Write); - uvec3 src_offset{}; - uvec3 dst_offset{}; + uvec3 src_offset{}; + uvec3 dst_offset{}; - if (step == 1) { - src_offset.data[0u] = start; - uvec3 copy_extents {safe_downcast(end - start), - v_self.extents().data[1u], - v_self.extents().data[2u]}; - api::helper::copy_texture_to_texture(command_buffer, - src_image, - dst_image, - copy_extents, - src_offset, - dst_offset); - } else { - uvec3 copy_extents {1u, - v_self.extents().data[1u], - v_self.extents().data[2u]}; - const auto x_max = v_self.extents().data[0u]; - for (int64_t x = start, x_new = 0; x < end; x += step, ++x_new) { - if (x >= x_max) { // out of range - continue; - } - src_offset.data[0u] = x; - dst_offset.data[0u] = x_new; + if (step == 1) { + src_offset.data[0u] = start; + uvec3 copy_extents {safe_downcast(end - start), + v_self.extents().data[1u], + v_self.extents().data[2u]}; api::helper::copy_texture_to_texture(command_buffer, src_image, dst_image, copy_extents, src_offset, dst_offset); + } else { + uvec3 copy_extents {1u, + v_self.extents().data[1u], + v_self.extents().data[2u]}; + const auto x_max = v_self.extents().data[0u]; + for (int64_t x = start, x_new = 0; x < end; x += step, ++x_new) { + if (x >= x_max) { // out of range + continue; + } + src_offset.data[0u] = x; + dst_offset.data[0u] = x_new; + api::helper::copy_texture_to_texture(command_buffer, + src_image, + dst_image, + copy_extents, + src_offset, + dst_offset); + } } } + else { + TORCH_CHECK(false, "Not implemented!"); + } } - else { - TORCH_CHECK(false, "Not implemented!"); - } - command_pool.submit(context->gpu().queue, command_buffer); return convert(v_output); } @@ -141,56 +146,58 @@ Tensor slice_height(const Tensor& input, const int64_t start, const int64_t end, api::Context* const context = api::context(); api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); + { + api::OpProfiler profiler(command_buffer, context->querypool(), "aten::slice.Tensor (slice_height)"); - const Tensor self = input.is_vulkan() ? 
input : input.vulkan(); - const vTensor& v_self = convert(self); - if C10_LIKELY(v_output.has_image() && v_self.has_image()) { - auto src_image = v_self.image( - command_buffer, - vTensor::Stage::Transfer); - auto dst_image = v_output.image( - command_buffer, - vTensor::Stage::Transfer, - vTensor::Access::Write); + const Tensor self = input.is_vulkan() ? input : input.vulkan(); + const vTensor& v_self = convert(self); + if C10_LIKELY(v_output.has_image() && v_self.has_image()) { + auto src_image = v_self.image( + command_buffer, + vTensor::Stage::Transfer); + auto dst_image = v_output.image( + command_buffer, + vTensor::Stage::Transfer, + vTensor::Access::Write); - uvec3 src_offset{}; - uvec3 dst_offset{}; + uvec3 src_offset{}; + uvec3 dst_offset{}; - if (step == 1) { - src_offset.data[1u] = start; - uvec3 copy_extents {v_self.extents().data[0u], - safe_downcast(end - start), - v_self.extents().data[2u]}; - api::helper::copy_texture_to_texture(command_buffer, - src_image, - dst_image, - copy_extents, - src_offset, - dst_offset); - } else { - uvec3 copy_extents {v_self.extents().data[0u], - 1u, - v_self.extents().data[2u]}; - const auto y_max = v_self.extents().data[1u]; - for (int64_t y = start, y_new = 0; y < end; y += step, ++y_new) { - if (y >= y_max) { // out of range - continue; - } - src_offset.data[1u] = y; - dst_offset.data[1u] = y_new; + if (step == 1) { + src_offset.data[1u] = start; + uvec3 copy_extents {v_self.extents().data[0u], + safe_downcast(end - start), + v_self.extents().data[2u]}; api::helper::copy_texture_to_texture(command_buffer, src_image, dst_image, copy_extents, src_offset, dst_offset); + } else { + uvec3 copy_extents {v_self.extents().data[0u], + 1u, + v_self.extents().data[2u]}; + const auto y_max = v_self.extents().data[1u]; + for (int64_t y = start, y_new = 0; y < end; y += step, ++y_new) { + if (y >= y_max) { // out of range + continue; + } + src_offset.data[1u] = y; + dst_offset.data[1u] = y_new; + api::helper::copy_texture_to_texture(command_buffer, + src_image, + dst_image, + copy_extents, + src_offset, + dst_offset); + } } } + else { + TORCH_CHECK(false, "Not implemented!"); + } } - else { - TORCH_CHECK(false, "Not implemented!"); - } - command_pool.submit(context->gpu().queue, command_buffer); return convert(v_output); } diff --git a/aten/src/ATen/native/vulkan/ops/Softmax.cpp b/aten/src/ATen/native/vulkan/ops/Softmax.cpp index 9a3cce1fa224..f36a5fc54540 100644 --- a/aten/src/ATen/native/vulkan/ops/Softmax.cpp +++ b/aten/src/ATen/native/vulkan/ops/Softmax.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -14,7 +15,8 @@ Tensor softmax_internal( const at::Tensor& input_arg, const int64_t dim, const bool half_to_float, - const api::Shader::Descriptor& shader_descriptor) { + const api::Shader::Descriptor& shader_descriptor, + const std::string& op_name) { TORCH_CHECK( input_arg.dim() == 4, "Vulkan softmax expects 4-dimensional input!"); @@ -56,6 +58,8 @@ Tensor softmax_internal( api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + if C10_LIKELY(v_input.has_image()) { const struct Block final { uvec3 iextents; @@ -105,14 +109,14 @@ Tensor softmax( const at::Tensor& input_arg, const int64_t dim, const bool half_to_float) { - return softmax_internal(input_arg, dim, half_to_float, VK_KERNEL(softmax)); + return softmax_internal(input_arg, dim, half_to_float, VK_KERNEL(softmax), "_softmax"); } Tensor 
log_softmax( const at::Tensor& input_arg, const int64_t dim, const bool half_to_float) { - return softmax_internal(input_arg, dim, half_to_float, VK_KERNEL(log_softmax)); + return softmax_internal(input_arg, dim, half_to_float, VK_KERNEL(log_softmax), "_log_softmax"); } #ifdef USE_VULKAN_API diff --git a/aten/src/ATen/native/vulkan/ops/Tensor.cpp b/aten/src/ATen/native/vulkan/ops/Tensor.cpp index 0a99a916f4f5..0de253447d2d 100644 --- a/aten/src/ATen/native/vulkan/ops/Tensor.cpp +++ b/aten/src/ATen/native/vulkan/ops/Tensor.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -761,6 +762,8 @@ void vTensor::View::CMD::copy_buffer_to_image( return; } + api::OpProfiler profiler(command_buffer_, view_.context_->querypool(), "copy_buffer_to_image"); + barrier( state.transition({ // Staging @@ -819,6 +822,8 @@ void vTensor::View::CMD::copy_image_to_buffer( return; } + api::OpProfiler profiler(command_buffer_, view_.context_->querypool(), "copy_image_to_buffer"); + barrier( state.transition({ // Staging diff --git a/aten/src/ATen/native/vulkan/ops/TransposeConvolution2d.cpp b/aten/src/ATen/native/vulkan/ops/TransposeConvolution2d.cpp index d459e5d9d74a..0c12e930f05c 100644 --- a/aten/src/ATen/native/vulkan/ops/TransposeConvolution2d.cpp +++ b/aten/src/ATen/native/vulkan/ops/TransposeConvolution2d.cpp @@ -1,8 +1,9 @@ -#include #include #include -#include +#include #include +#include +#include #include namespace at { @@ -86,7 +87,7 @@ vTensor pack_weights(const Tensor& weight_arg) { } api::Context* const context = api::context(); - api::Command::Buffer& command_buffer = context->command().pool.stream(); + api::Command::Buffer& command_buffer = context->command().pool.stream(); // Don't collect the timestamp since the command buffer doesn't record anything const Tensor weight = at::permute(weight_arg, {1, 0, 2, 3}).contiguous(); @@ -105,7 +106,7 @@ vTensor pack_biases( } api::Context* const context = api::context(); - api::Command::Buffer& command_buffer = context->command().pool.stream(); + api::Command::Buffer& command_buffer = context->command().pool.stream(); // Don't collect the timestamp since the command buffer doesn't record anything const int64_t src_w = weight.size(Layout::TransposedFilter::output); const int64_t packed_w = div_up(src_w, INT64_C(4)); @@ -353,6 +354,8 @@ void TransposeConv2dOpContext::conv2d_transpose_sliding_window( api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), "prepacked::conv2d_transpose_clamp_run (conv2d_transpose_sliding_window)"); + const struct Block final { uvec3 extents; int32_t ic4; diff --git a/aten/src/ATen/native/vulkan/ops/Upsample.cpp b/aten/src/ATen/native/vulkan/ops/Upsample.cpp index e6aa594ec6eb..20516bb387a0 100644 --- a/aten/src/ATen/native/vulkan/ops/Upsample.cpp +++ b/aten/src/ATen/native/vulkan/ops/Upsample.cpp @@ -1,5 +1,6 @@ -#include #include +#include +#include #include namespace at { @@ -39,6 +40,8 @@ Tensor upsample_nearest2d( api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), "aten::upsample_nearest2d"); + if C10_LIKELY(v_input.has_image()) { const struct Block final { uvec3 extents; diff --git a/aten/src/ATen/native/xnnpack/Activation.cpp b/aten/src/ATen/native/xnnpack/Activation.cpp index 33215771fe55..5ccf4aad40e7 100644 --- 
a/aten/src/ATen/native/xnnpack/Activation.cpp +++ b/aten/src/ATen/native/xnnpack/Activation.cpp @@ -10,7 +10,7 @@ namespace xnnpack { bool use_hardswish( const Tensor& input) { - return xnnpack::internal::available() && + return xnnpack::available() && (1 <= input.ndimension()) && (input.device().is_cpu()) && (kFloat == input.scalar_type()) && diff --git a/aten/src/ATen/native/xnnpack/AveragePooling.cpp b/aten/src/ATen/native/xnnpack/AveragePooling.cpp index 4379741e6a05..7359836bb953 100644 --- a/aten/src/ATen/native/xnnpack/AveragePooling.cpp +++ b/aten/src/ATen/native/xnnpack/AveragePooling.cpp @@ -10,7 +10,7 @@ namespace xnnpack { bool use_global_average_pool( const Tensor& input) { - return xnnpack::internal::available() && + return xnnpack::available() && (1 <= input.ndimension()) && (input.device().is_cpu()) && (kFloat == input.scalar_type()) && diff --git a/aten/src/ATen/native/xnnpack/ChannelShuffle.cpp b/aten/src/ATen/native/xnnpack/ChannelShuffle.cpp index a17d7bb2daac..34cab01d0507 100644 --- a/aten/src/ATen/native/xnnpack/ChannelShuffle.cpp +++ b/aten/src/ATen/native/xnnpack/ChannelShuffle.cpp @@ -17,7 +17,7 @@ bool use_channel_shuffle( // and all dimensions must be positive. // * The number of groups must be larger than 1 and // the number of channels must be divisible by the number of groups. - return xnnpack::internal::available() && + return xnnpack::available() && // Input (4 == input.dim()) && (input.device().is_cpu()) && diff --git a/aten/src/ATen/native/xnnpack/Common.h b/aten/src/ATen/native/xnnpack/Common.h index 5a0b68baffe7..b000ffada157 100644 --- a/aten/src/ATen/native/xnnpack/Common.h +++ b/aten/src/ATen/native/xnnpack/Common.h @@ -67,6 +67,9 @@ struct ContextConv2D final { static constexpr float kMax = std::numeric_limits::infinity(); }; + +bool available(); + namespace internal { struct Layout final { @@ -121,9 +124,6 @@ struct Layout final { static constexpr size_t width = 1u; }; }; - -bool available(); - } // namespace internal } // namespace xnnpack } // namespace native diff --git a/aten/src/ATen/native/xnnpack/Convolution.cpp b/aten/src/ATen/native/xnnpack/Convolution.cpp index 8278bfa19d9d..278e35280c40 100644 --- a/aten/src/ATen/native/xnnpack/Convolution.cpp +++ b/aten/src/ATen/native/xnnpack/Convolution.cpp @@ -27,7 +27,7 @@ namespace { // TODO: Decouple and improve error handling and messages. bool available( const Tensor& weight, - const c10::optional bias_sizes_opt, + const at::OptionalIntArrayRef bias_sizes_opt, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, @@ -36,7 +36,7 @@ bool available( const float output_min, const float output_max) { // XNNPACK - return xnnpack::internal::available() && + return xnnpack::available() && // Weight (4 == weight.ndimension()) && (weight.size(Layout::Filter::height) > 0) && @@ -189,7 +189,7 @@ ContextConv2D create( TORCH_CHECK( available( weight_nhwc, - (bias.has_value() && bias->defined()) ? c10::optional(bias->sizes()) : c10::nullopt, + (bias.has_value() && bias->defined()) ? at::OptionalIntArrayRef(bias->sizes()) : c10::nullopt, padding_expanded, stride_expanded, dilation_expanded, @@ -433,7 +433,7 @@ unpack_prepacked_sizes_conv2d(const IValue& ivalue) { const auto& bias = std::get<1>(tuple); return IValue(std::make_tuple( std::get<0>(tuple).sizes(), - (bias && bias->defined()) ? c10::optional(bias->sizes()) : c10::nullopt, + (bias && bias->defined()) ? 
at::OptionalIntArrayRef(bias->sizes()) : c10::nullopt, std::get<2>(tuple), std::get<3>(tuple), std::get<4>(tuple), @@ -452,7 +452,7 @@ Tensor conv2d_transpose_clamp_run( bool use_convolution2d( const Tensor& input, const Tensor& weight, - const c10::optional bias_sizes_opt, + const at::OptionalIntArrayRef bias_sizes_opt, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, diff --git a/aten/src/ATen/native/xnnpack/Engine.h b/aten/src/ATen/native/xnnpack/Engine.h index 71ed262297b3..9d5c0e4594ac 100644 --- a/aten/src/ATen/native/xnnpack/Engine.h +++ b/aten/src/ATen/native/xnnpack/Engine.h @@ -13,7 +13,7 @@ namespace xnnpack { bool use_convolution2d( const Tensor& input, const Tensor& weight, - const c10::optional bias_sizes_opt, + const at::OptionalIntArrayRef bias_sizes_opt, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, diff --git a/aten/src/ATen/native/xnnpack/Init.cpp b/aten/src/ATen/native/xnnpack/Init.cpp index 8f69ec02c2ca..e7365bea2a61 100644 --- a/aten/src/ATen/native/xnnpack/Init.cpp +++ b/aten/src/ATen/native/xnnpack/Init.cpp @@ -49,13 +49,13 @@ bool C10_UNUSED deinitialize() { } } // namespace +} // namespace internal bool available() { // Add extra conditions here that should disable mobile CPU impl at runtime in its totality. return internal::initialize(); } -} // namespace internal } // namespace xnnpack } // namespace native } // namespace at diff --git a/aten/src/ATen/native/xnnpack/Linear.cpp b/aten/src/ATen/native/xnnpack/Linear.cpp index 7911256b1f21..3f7ae681f955 100644 --- a/aten/src/ATen/native/xnnpack/Linear.cpp +++ b/aten/src/ATen/native/xnnpack/Linear.cpp @@ -21,7 +21,7 @@ bool available( const float output_min, const float output_max) { // XNNPACK - return xnnpack::internal::available() && + return xnnpack::available() && // Weight (2 == weight.ndimension()) && (weight.device().is_cpu()) && @@ -187,7 +187,7 @@ unpack_prepacked_sizes_linear(const IValue& ivalue) { const auto& bias = std::get<1>(tuple); return IValue(std::make_tuple( std::get<0>(tuple).sizes(), - (bias && bias->defined()) ? c10::optional(bias->sizes()) : c10::nullopt)); + (bias && bias->defined()) ? 
at::OptionalIntArrayRef(bias->sizes()) : c10::nullopt)); } } // namespace linear diff --git a/aten/src/ATen/native/xnnpack/MaxPooling.cpp b/aten/src/ATen/native/xnnpack/MaxPooling.cpp index 7c101f9117ac..871959080821 100644 --- a/aten/src/ATen/native/xnnpack/MaxPooling.cpp +++ b/aten/src/ATen/native/xnnpack/MaxPooling.cpp @@ -88,7 +88,7 @@ bool use_max_pool2d( const bool output_size_eq = (pt_outputHeight == xnnpack_outputHeight) && (pt_outputWidth == xnnpack_outputWidth); - return xnnpack::internal::available() && + return xnnpack::available() && // Input (4 == input.dim()) && (input.device().is_cpu()) && diff --git a/aten/src/ATen/native/xnnpack/Shim.cpp b/aten/src/ATen/native/xnnpack/Shim.cpp index 89fffa024aef..32ddfb4b8525 100644 --- a/aten/src/ATen/native/xnnpack/Shim.cpp +++ b/aten/src/ATen/native/xnnpack/Shim.cpp @@ -31,7 +31,7 @@ bool available() { bool use_convolution2d( const Tensor&, const Tensor&, - const c10::optional, + const at::OptionalIntArrayRef, const IntArrayRef, const IntArrayRef, const IntArrayRef, diff --git a/aten/src/ATen/nnapi/nnapi_model_loader.cpp b/aten/src/ATen/nnapi/nnapi_model_loader.cpp index 8553d974a8de..7966c0d17b19 100644 --- a/aten/src/ATen/nnapi/nnapi_model_loader.cpp +++ b/aten/src/ATen/nnapi/nnapi_model_loader.cpp @@ -97,9 +97,9 @@ int load_nnapi_model( size_t num_buffers, const void** buffer_ptrs, int32_t* buffer_sizes, - size_t num_memories, - ANeuralNetworksMemory** memories, - int32_t* memory_sizes, + size_t /*num_memories*/, + ANeuralNetworksMemory** /*memories*/, + int32_t* /*memory_sizes*/, int32_t* out_input_count, int32_t* out_output_count, size_t* out_bytes_consumed) { diff --git a/aten/src/ATen/nnapi/nnapi_wrapper.cpp b/aten/src/ATen/nnapi/nnapi_wrapper.cpp index aa81bf942488..90122df15ef8 100644 --- a/aten/src/ATen/nnapi/nnapi_wrapper.cpp +++ b/aten/src/ATen/nnapi/nnapi_wrapper.cpp @@ -336,8 +336,6 @@ int check_Execution_getOutputOperandDimensions(ANeuralNetworksExecution* executi void nnapi_wrapper_load(struct nnapi_wrapper** nnapi, struct nnapi_wrapper** check_nnapi) { #ifdef _WIN32 TORCH_CHECK(false, "Running NNAPI models is not supported on Windows."); -#elif __XROS__ - TORCH_CHECK(false, "Running NNAPI models is not supported on XROS."); #else if (!loaded) { // Clear error flag. 
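The xnnpack changes above switch the bias-size parameters from c10::optional<IntArrayRef> to at::OptionalIntArrayRef. The standalone sketch below shows how such a parameter is declared and how call sites construct it, mirroring the pattern used in Convolution.cpp; the helper bias_is_1d and the wrapper check_bias are hypothetical and not part of this patch.

#include <ATen/ATen.h>

// Hypothetical helper mirroring the updated xnnpack signatures: optional bias
// sizes are taken as at::OptionalIntArrayRef.
bool bias_is_1d(at::OptionalIntArrayRef bias_sizes_opt) {
  return bias_sizes_opt.has_value() && bias_sizes_opt->size() == 1;
}

// Call sites keep the same shape as before: wrap Tensor::sizes() when a
// defined bias is present, otherwise pass c10::nullopt.
void check_bias(const c10::optional<at::Tensor>& bias) {
  const bool ok = bias_is_1d(
      (bias.has_value() && bias->defined())
          ? at::OptionalIntArrayRef(bias->sizes())
          : c10::nullopt);
  TORCH_CHECK(!bias.has_value() || !bias->defined() || ok,
              "expected a 1-D bias");
}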
diff --git a/aten/src/ATen/ops/from_blob.h b/aten/src/ATen/ops/from_blob.h index 558ab57e900f..f7599e70ea05 100644 --- a/aten/src/ATen/ops/from_blob.h +++ b/aten/src/ATen/ops/from_blob.h @@ -26,7 +26,7 @@ class TORCH_API TensorMaker { public: using ContextDeleter = DeleterFnPtr; - TensorMaker& strides(optional value) noexcept { + TensorMaker& strides(OptionalIntArrayRef value) noexcept { strides_ = value; return *this; @@ -79,7 +79,7 @@ class TORCH_API TensorMaker { void* data_; IntArrayRef sizes_; - optional strides_{}; + OptionalIntArrayRef strides_{}; optional storage_offset_{}; std::function deleter_{}; std::unique_ptr ctx_{nullptr, detail::noopDelete}; diff --git a/aten/src/ATen/ops/tensor.h b/aten/src/ATen/ops/tensor.h index 3369eaf2502c..2f72b7ef0263 100644 --- a/aten/src/ATen/ops/tensor.h +++ b/aten/src/ATen/ops/tensor.h @@ -1,6 +1,6 @@ #pragma once #include -#include +#include namespace at { diff --git a/aten/src/ATen/quantized/Quantizer.cpp b/aten/src/ATen/quantized/Quantizer.cpp index aa5898194356..4a1bac8bc4c1 100644 --- a/aten/src/ATen/quantized/Quantizer.cpp +++ b/aten/src/ATen/quantized/Quantizer.cpp @@ -417,4 +417,23 @@ Tensor from_blob_quantized_per_channel_affine( return qtensor; } +Tensor UnknownQuantizer::quantize(const Tensor& tensor) { + TORCH_INTERNAL_ASSERT(false, "cannot call quantize on UnknownQuantizer"); +} +Tensor UnknownQuantizer::dequantize(const Tensor& qtensor) { + TORCH_INTERNAL_ASSERT(false, "cannot call dequantize on UnknownQuantizer"); +} +Tensor& UnknownQuantizer::dequantize_out(Tensor& rtensor, const Tensor& qtensor) { + TORCH_INTERNAL_ASSERT(false, "cannot call dequantize_out on UnknownQuantizer"); +} +QScheme UnknownQuantizer::qscheme() const { + TORCH_INTERNAL_ASSERT(false, "cannot call qscheme on UnknownQuantizer"); +} +bool UnknownQuantizer::equalTo(QuantizerPtr other) const{ + TORCH_INTERNAL_ASSERT(false, "cannot call equalTo on UnknownQuantizer"); +} +QuantizerPtr make_unknown_quantizer(ScalarType scalar_type) { + return c10::make_intrusive(scalar_type); +} + } // namespace at diff --git a/aten/src/ATen/quantized/Quantizer.h b/aten/src/ATen/quantized/Quantizer.h index 5d9c7111f19e..05bd39b71223 100644 --- a/aten/src/ATen/quantized/Quantizer.h +++ b/aten/src/ATen/quantized/Quantizer.h @@ -18,6 +18,23 @@ namespace at { +/** + * UnknownQuantizer is a placeholder quantizer for functions that implement + * quantization in a two step process. First a tensor is allocated but with + * unknown quantizer, and then the quantization kernel decides what the final + * quantizer will be. + */ +struct TORCH_API UnknownQuantizer : public Quantizer { + explicit UnknownQuantizer(ScalarType scalar_type) + : Quantizer(scalar_type) {} + + Tensor quantize(const Tensor& tensor) override; + Tensor dequantize(const Tensor& qtensor) override; + Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override; + QScheme qscheme() const override; + bool equalTo(QuantizerPtr other) const override; +}; + /** * UniformQuantizer is the parent class for all uniform quantizers. 
* These quantization scheme will map float value uniformly to @@ -80,7 +97,7 @@ struct TORCH_API PerTensorAffineQuantizer : public AffineQuantizer { return zero_point_; } - bool equalTo(QuantizerPtr other) override { + bool equalTo(QuantizerPtr other) const override { if (!other.get() || other->qscheme() != kPerTensorAffine) { return false; } @@ -139,7 +156,7 @@ struct TORCH_API PerChannelAffineQuantizer : public AffineQuantizer { Tensor dequantize(const Tensor& qtensor) override; Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override; - bool equalTo(QuantizerPtr other) override { + bool equalTo(QuantizerPtr other) const override { if (!other.get() || other->qscheme() != kPerChannelAffine) { return false; } @@ -190,7 +207,7 @@ struct TORCH_API PerChannelAffineFloatQParamsQuantizer : public PerChannelAffine Tensor dequantize(const Tensor& qtensor) override; Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override; - bool equalTo(QuantizerPtr other) override { + bool equalTo(QuantizerPtr other) const override { if (!other.get() || other->qscheme() != kPerChannelAffineFloatQParams) { return false; } @@ -222,6 +239,8 @@ TORCH_API QuantizerPtr make_per_channel_affine_quantizer( int64_t axis, ScalarType scalar_type); +TORCH_API QuantizerPtr make_unknown_quantizer(ScalarType scalar_type); + // Create a Quantized Tensor given arguments for normal Tensor and a quantizer TORCH_API Tensor new_qtensor( IntArrayRef sizes, diff --git a/aten/src/ATen/record_function.cpp b/aten/src/ATen/record_function.cpp index efe773f9f03b..8d9160135cc1 100644 --- a/aten/src/ATen/record_function.cpp +++ b/aten/src/ATen/record_function.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include @@ -22,16 +23,6 @@ RecordFunctionHandle next_unique_record_function_handle() { return RecordFunctionHandle(unique_rf_id++); } -RecordFunctionTLS& rf_tls() { -#if defined(C10_PREFER_CUSTOM_THREAD_LOCAL_STORAGE) - static c10::ThreadLocal rf_tls_; - return rf_tls_.get(); -#else // defined(C10_PREFER_CUSTOM_THREAD_LOCAL_STORAGE) - static thread_local RecordFunctionTLS rf_tls_; - return rf_tls_; -#endif // defined(C10_PREFER_CUSTOM_THREAD_LOCAL_STORAGE) -} - std::atomic defaultNodeId(-1); // Enumerates thread ids logically; @@ -40,534 +31,632 @@ std::atomic defaultNodeId(-1); std::atomic next_thread_id_ {0}; thread_local uint64_t current_thread_id_ = 0; -// Low probability constant -static constexpr double kLowProb = 0.001; -struct CoinflipTLS { - int tries_left_; - std::mt19937 genGeo_; - std::mt19937 genZeroOne_; - std::geometric_distribution distGeo_; - std::uniform_real_distribution distZeroOne_; - CoinflipTLS(); +static constexpr size_t NumRecordScopes = + static_cast(RecordScope::NUM_SCOPES); + +RecordFunctionCallbacks::iterator findCallback( + RecordFunctionCallbacks& entries, + CallbackHandle handle) { + auto match_handle = [handle](const auto& el) { return el.handle_ == handle; }; + return std::find_if(entries.begin(), entries.end(), match_handle); +} + +c10::optional extractCallback( + RecordFunctionCallbacks& entries, + CallbackHandle handle) { + auto it = findCallback(entries, handle); + if (it == entries.end()) { + return c10::nullopt; + } + auto out = it->callback_; + entries.erase(it); + return out; +} + +// ============================================================================ +// == Callback manager ======================================================== +// ============================================================================ +// The high level idea of the 
RecordFunction callback machinery is based on the +// observation that the set of callbacks to be run changes infrequently. +// However, in order to reuse the active set we have to be able to invalidate +// when the active set changes. There are three events that can change which +// callbacks should be run: +// 1) The set of global callbacks changes +// 2) The set of local callbacks changes +// 3) A sampling callback is present, and should run on this iteration +// +// Global callbacks rely on thread local replication and an atomic version +// counter to maintain consistency. Whenever we change the set of active global +// callbacks (add / remove / enable / disable) the `GlobalCallbackManager` +// increments the version number and updates the global state while holding +// a mutex. The local callback manager snapshots the global callbacks and +// lazily rebuilds by comparing`GlobalCallbackManager::version()` (which is +// a simple atomic read) to the version of the last rebuild. In the +// overwhelmingly common case that they match it can reuse the existing +// snapshot. Otherwise it must call the much more expensive (and locked) +// `GlobalCallbackManager::getSnapshot()`. +// +// Handling changes to the thread local callbacks is trivial; functions that +// change them can simply force a cache rebuild for that thread after the +// changes are made. +// +// Sampling is by far the most challenging to handle efficiently. In general +// sampling callbacks are expected to have very low frequency. (e.g. 1 per +// million) Random number generation is rather expensive, so flipping a coin on +// every call for every sampling callback is wasteful. We can significantly +// reduce this cost by noting that the number of failures of a Bernoulli random +// variable is a geometric distribution, and thus we can sample the geometric +// distribution to determine the next time a callback should run. This reduces +// the cost from a random sample to a simple integer decrement. +// +// We can further note that Bernoulli samples are independent. (In contrast to, +// say, sampling without replacement.) This means that we can generate a +// counter for each scope that a given callback supports and then decrement the +// counter corresponding to the RecordScope being called. Conceptually, this is +// analogous to flipping different coins with the same probability. By sharding +// on RecordScope, we can consolidate the decrement to a single shared counter +// and update individual counters during rebuild. + +class GlobalCallbackManager { + public: + static GlobalCallbackManager& get(); // Singleton + + private: + GlobalCallbackManager() = default; + + public: + static constexpr size_t NoVersion = 0; + using snapshot_t = std::pair; + + // Locking? + size_t version() const; // No + snapshot_t getSnapshot() const; // Yes + CallbackHandle addCallback(RecordFunctionCallback cb); // Yes + void setCallbackEnabled(CallbackHandle handle, bool enabled); // Yes + void removeCallback(CallbackHandle handle); // Yes + void clearCallbacks(); // Yes + + private: + std::atomic version_{NoVersion + 1}; + RecordFunctionCallbacks global_callbacks_; // Source of truth. 
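The design note above replaces a per-call Bernoulli draw for sampled callbacks with a geometric countdown, since the number of failures before the first success of a Bernoulli variable is geometrically distributed. The snippet below is a standalone illustration of that equivalence only; it is not the RecordFunction code, and the probability and call count are made up.

#include <iostream>
#include <random>

int main() {
  constexpr double p = 1e-6;  // hypothetical sampling probability (~1 per million)
  std::mt19937 gen(std::random_device{}());

  // std::geometric_distribution yields the number of failures before the first
  // success, so add one to get "calls until the callback fires" (the same +1
  // appears in CacheEntry::sampleTries below).
  auto next_countdown = [&] {
    return std::geometric_distribution<int>(p)(gen) + 1;
  };

  int countdown = next_countdown();
  long fired = 0;
  constexpr long calls = 10'000'000;
  for (long call = 0; call < calls; ++call) {
    if (--countdown == 0) {          // cheap integer decrement on the hot path
      ++fired;                       // the sampled callback would run here
      countdown = next_countdown();  // re-arm for the next sampling event
    }
  }
  // Expected count is roughly p * calls (about 10 for these numbers).
  std::cout << "sampled callback fired " << fired << " times\n";
  return 0;
}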
+ mutable std::mutex update_mutex_; }; -CoinflipTLS::CoinflipTLS() - : tries_left_(0), genGeo_(std::random_device()()), genZeroOne_(std::random_device()()), distGeo_(kLowProb), distZeroOne_(0.0, 1.0) {} +class CacheEntry { + public: + CacheEntry() = default; + CacheEntry(std::mt19937* generator, RecordScope scope); -CoinflipTLS& coinflip_tls() { -#if defined(C10_PREFER_CUSTOM_THREAD_LOCAL_STORAGE) - static c10::ThreadLocal coinflip_tls_; - return coinflip_tls_.get(); -#else // defined(C10_PREFER_CUSTOM_THREAD_LOCAL_STORAGE) - static thread_local CoinflipTLS coinflip_tls_; - return coinflip_tls_; -#endif // defined(C10_PREFER_CUSTOM_THREAD_LOCAL_STORAGE) -} + // The caller is expected to check `GlobalCallbackManager::get().version()' + // and call CacheEntry::update() if necessary. + StepCallbacks getActiveCallbacks(); -int sample_geometric() { - return coinflip_tls().distGeo_(coinflip_tls().genGeo_); -} + // Full rebuild. (E.g. during registration) + void update(const std::vector& callbacks); -double sample_zero_one() { - return coinflip_tls().distZeroOne_(coinflip_tls().genZeroOne_); -} + private: + struct CallbackAndCounter { + RecordFunctionCallback callback_; + + // `-1` indicates that a callback is not sampled. + int tries_left_{-1}; + }; + + void rebuildActiveCallbacks(); + int sampleTries(double p) const; + + // std::mt19937 is quite large, so all scopes share the same generator. + std::mt19937* generator_{nullptr}; + + // Includes sampling callbacks which are waiting to run. + c10::SmallVector callbacks_; + RecordScope scope_; + + StepCallbacks active_callbacks_; + + // For managing sampling callbacks + int sampling_countdown_{0}; + int steps_for_this_update_{0}; +}; + +class LocalCallbackManager { + public: + static LocalCallbackManager& get(); // Singleton -struct GlobalRecordFunctionCallbacksEntry { - RecordFunctionCallback callback; private: - std::atomic enabled; + LocalCallbackManager(); + public: - CallbackHandle handle; - - GlobalRecordFunctionCallbacksEntry(RecordFunctionCallback&& cb, CallbackHandle h) - : callback(std::move(cb)), enabled(true), handle(h) {} - - // Copying is fine despite std::atomic not being supposed to - // have a copy/move constructor: adding & removing callbacks is - // already not thread-safe. 
- GlobalRecordFunctionCallbacksEntry( - const GlobalRecordFunctionCallbacksEntry& rhs) - : callback(rhs.callback), enabled(rhs.enabled.load()), handle(rhs.handle) {} - - GlobalRecordFunctionCallbacksEntry& operator=(const GlobalRecordFunctionCallbacksEntry& rhs) { - callback = rhs.callback; - enabled = rhs.enabled.load(); - handle = rhs.handle; - return *this; - } + const RecordFunctionTLS& getTLS() const; + StepCallbacks getActiveCallbacks(const RecordScope scope); - GlobalRecordFunctionCallbacksEntry( - GlobalRecordFunctionCallbacksEntry&& rhs) noexcept - : callback(std::move(rhs.callback)), enabled(rhs.enabled.load()), handle(rhs.handle) {} + void setTLS(const RecordFunctionTLS& tls); + void seed(uint32_t seed); + CallbackHandle addCallback(RecordFunctionCallback callback); + bool setCallbackEnabled(CallbackHandle handle, bool enabled); + bool removeCallback(CallbackHandle handle); + void clearCallbacks(); - GlobalRecordFunctionCallbacksEntry& operator=(GlobalRecordFunctionCallbacksEntry&& rhs) noexcept { - callback = std::move(rhs.callback); - enabled = rhs.enabled.load(); - handle = rhs.handle; - return *this; - } + private: + void rebuild_all(const GlobalCallbackManager::snapshot_t& global_snapshot); - // Returns true if the status changed, false otherwise. - bool disable() { - bool expected = true; - // NOTE: we use sequentially consistent access here and in - // enable() because updating further atomic flags depends on this - // operation. - return enabled.compare_exchange_strong(expected, false); - } + void rebuild_callback_scopes( + const GlobalCallbackManager::snapshot_t& global_snapshot, + const RecordFunctionCallback& callback); - // Returns true if the status changed, false otherwise. - bool enable() { - bool expected = false; - return enabled.compare_exchange_strong(expected, true); - } + void rebuild_scope( + const GlobalCallbackManager::snapshot_t& global_snapshot, + const RecordScope scope); - // Read the flag. Note that it is neither necessary nor correct to - // check this before calling enable() or disable(). - bool isEnabled() const { - return enabled.load(std::memory_order_relaxed); - } -}; + // Source of truth. + RecordFunctionTLS registered_callbacks_; -using GlobalRecordFunctionCallbacks = - c10::SmallVector; + // Runtime cache. 
+ size_t global_version_{GlobalCallbackManager::NoVersion}; + std::array active_callbacks_; + std::mt19937 generator_{}; +}; -} // namespace +// ============================================================================ +// == GlobalCallbackManager: Implementation =================================== +// ============================================================================ +GlobalCallbackManager& GlobalCallbackManager::get() { + static GlobalCallbackManager manager; + return manager; +} -const RecordFunctionTLS& get_record_function_tls_() { - return rf_tls(); +size_t GlobalCallbackManager::version() const { + return version_.load(std::memory_order_relaxed); } -void set_record_function_tls_(const RecordFunctionTLS& tls) { - rf_tls() = tls; +std::pair GlobalCallbackManager::getSnapshot() const { + std::lock_guard guard(update_mutex_); + return {version_.load(std::memory_order_seq_cst), global_callbacks_}; } -enum class ToggledCallbackResult { - NotFound, - FoundButNotToggled, - FoundAndToggled, -}; +CallbackHandle GlobalCallbackManager::addCallback(RecordFunctionCallback cb) { + std::lock_guard guard(update_mutex_); + ++version_; + auto handle = next_unique_callback_handle(); + global_callbacks_.emplace_back(std::move(cb), handle); + return handle; +} -template -static ToggledCallbackResult findAndToggleCallback( - RecordFunctionCallbacks& cbs, CallbackHandle handle, bool enabled) { - auto it = std::find_if( - cbs.begin(), cbs.end(), - [handle]( - const auto& el) { - return el.handle == handle; - }); - if (it != cbs.end()) { - bool changed = enabled ? it->enable() : it->disable(); - if (!changed) { - return ToggledCallbackResult::FoundButNotToggled; - } - if (it->callback.samplingProb() > kLowProb) { - // try to disable/restore pre-sampling of RecordFunction - if (enabled) { - at::bumpRecordAllFunctions(); - } else { - at::releaseRecordAllFunctions(); - } +void GlobalCallbackManager::setCallbackEnabled( + CallbackHandle handle, + bool enabled) { + std::lock_guard guard(update_mutex_); + auto it = findCallback(global_callbacks_, handle); + if (it != global_callbacks_.end()) { + if (it->enabled_ != enabled) { + ++version_; + it->enabled_ = enabled; } - return ToggledCallbackResult::FoundAndToggled; + } else { + LOG(WARNING) << "Requested callback is not found"; } - return ToggledCallbackResult::NotFound; -} - -template -static bool findAndRemoveCallback( - RecordFunctionCallbacks& cbs, CallbackHandle handle) { - auto it = std::find_if( - cbs.begin(), cbs.end(), - [handle]( - const auto& el) { - return el.handle == handle; - }); - if (it != cbs.end()) { - // We do not need to try to call releaseRecordAllFunctions here - // because findAndRemoveCallback is used only as a helper in - // removeCallback. removeCallback calls disableCallback, which - // calls findAndToggleCallback, which already will do a - // releaseRecordAllFunctions for us. 
- cbs.erase(it); - return true; +} + +void GlobalCallbackManager::removeCallback(CallbackHandle handle) { + std::lock_guard guard(update_mutex_); + if (extractCallback(global_callbacks_, handle).has_value()) { + ++version_; + } else { + LOG(WARNING) << "Requested callback is not found"; } - return false; } -class CallbackManager { - public: - CallbackManager() : num_enabled_global_callbacks_(0) {} +void GlobalCallbackManager::clearCallbacks() { + std::lock_guard guard(update_mutex_); + ++version_; + global_callbacks_.clear(); +} - CallbackHandle addThreadLocalCallback(RecordFunctionCallback cb) { - if (cb.samplingProb() > kLowProb) { - // pre-sampling of RecordFunction with prob. kLowProb cannot be used - at::bumpRecordAllFunctions(); - } - // note: monotonically increasing callbacks_unique_id keeps - // sorted_tls_callbacks_ sorted - auto handle = next_unique_callback_handle(); - rf_tls().sorted_tls_callbacks_.emplace_back(std::move(cb), handle); - return handle; - } +// ============================================================================ +// == CacheEntry: Implementation ============================================== +// ============================================================================ +CacheEntry::CacheEntry(std::mt19937* generator, RecordScope scope) + : generator_{generator}, scope_{scope} { + rebuildActiveCallbacks(); +} - CallbackHandle addGlobalCallback(RecordFunctionCallback cb) { - if (cb.samplingProb() > kLowProb) { - // pre-sampling of RecordFunction with prob. kLowProb cannot be used - at::bumpRecordAllFunctions(); - } - auto handle = next_unique_callback_handle(); - // NOLINTNEXTLINE(performance-move-const-arg) - sorted_global_callbacks_.emplace_back(std::move(cb), handle); - num_enabled_global_callbacks_.fetch_add(1, std::memory_order_relaxed); - return handle; +void CacheEntry::update(const std::vector& callbacks) { + callbacks_.clear(); + callbacks_.reserve(callbacks.size()); + for (const auto& callback : callbacks) { + const auto p = callback.samplingProb(); + callbacks_.push_back({callback, p < 1.0 ? sampleTries(p) : -1}); } - void removeCallback(CallbackHandle handle) { - // This could be implemented more efficiently, but callback - // addition/removal is not intended to run in performance-critical - // paths (it's not thread-safe and should be done during - // initialization). - disableCallback(handle); - auto found = findAndRemoveCallback(rf_tls().sorted_tls_callbacks_, handle); - if (!found) { - found = findAndRemoveCallback(sorted_global_callbacks_, handle); - } - if (!found) { - LOG(WARNING) << "Requested callback is not found"; - } - } + rebuildActiveCallbacks(); +} + +StepCallbacks CacheEntry::getActiveCallbacks() { + // We rebuild the active set when `sampling_countdown_` reaches zero, so if it + // reaches zero at the start of this function something has gone wrong. + TORCH_INTERNAL_ASSERT(sampling_countdown_ > 0, sampling_countdown_); - void disableCallback(CallbackHandle handle) { - auto found = findAndToggleCallback( - rf_tls().sorted_tls_callbacks_, handle, false); - if (found == ToggledCallbackResult::NotFound) { - found = findAndToggleCallback( - sorted_global_callbacks_, handle, false); - if (found == ToggledCallbackResult::FoundAndToggled) { - const auto previousCount = num_enabled_global_callbacks_.fetch_sub(1, std::memory_order_relaxed); - TORCH_CHECK(previousCount > 0, previousCount); + if (C10_UNLIKELY(!(--sampling_countdown_))) { + // Use inferred steps to update sampled callbacks. 
+ for (auto& i : callbacks_) { + if (i.tries_left_ > 0) { + TORCH_INTERNAL_ASSERT(i.tries_left_ >= steps_for_this_update_); + i.tries_left_ -= steps_for_this_update_; } } - if (found == ToggledCallbackResult::NotFound) { - LOG(WARNING) << "Requested callback is not found"; - } - } - void reenableCallback(CallbackHandle handle) { - auto found = findAndToggleCallback( - rf_tls().sorted_tls_callbacks_, handle, true); - if (found == ToggledCallbackResult::NotFound) { - found = findAndToggleCallback( - sorted_global_callbacks_, handle, true); - if (found == ToggledCallbackResult::FoundAndToggled) { - num_enabled_global_callbacks_.fetch_add(1, std::memory_order_relaxed); + // Determine which callbacks to run and for how long. + rebuildActiveCallbacks(); + + // Resample any sampled callbacks that ran this call. + for (auto& i : callbacks_) { + if (!i.tries_left_) { + i.tries_left_ = sampleTries(i.callback_.samplingProb()); } } - if (found == ToggledCallbackResult::NotFound) { - LOG(WARNING) << "Requested callback is not found"; + } + + return active_callbacks_; +} + +void CacheEntry::rebuildActiveCallbacks() { + // We could store thread ID in CacheEntry, but rebuilds are infrequent and + // this saves us from having to plumb it through. + const auto thread_id = RecordFunction::currentThreadId(); + active_callbacks_ = StepCallbacks(thread_id, scope_); + + sampling_countdown_ = std::numeric_limits::max(); + for (const auto& i : callbacks_) { + if (i.tries_left_ < 0) { + // Callback is not sampled. Unconditionally push. + active_callbacks_.callbacks_.push_back( + {i.callback_.start(), i.callback_.end()}); + + } else if (i.tries_left_ == 0) { + // Callback is sampled and we have reached a sampling event. Push and + // set `sampling_countdown_` to one so we trigger a rebuild after one call. + active_callbacks_.callbacks_.push_back( + {i.callback_.start(), i.callback_.end()}); + sampling_countdown_ = 1; + + } else { + // Callback is sampled and we have not reached sampling event. Set + // `sampling_countdown_` to rebuild when it is time for this callback to + // execute. + sampling_countdown_ = std::min(sampling_countdown_, i.tries_left_); } + active_callbacks_.needs_inputs_ |= i.callback_.needsInputs(); + active_callbacks_.needs_outputs_ |= i.callback_.needsOutputs(); + active_callbacks_.needs_ids_ |= i.callback_.needsIds(); } + steps_for_this_update_ = sampling_countdown_; +} + +int CacheEntry::sampleTries(double p) const { + TORCH_INTERNAL_ASSERT(generator_ != nullptr); + TORCH_INTERNAL_ASSERT(p > 0.0 && p <= 1.0); - void clearGlobalCallbacks() { - sorted_global_callbacks_.clear(); - num_enabled_global_callbacks_ = 0; + // The geometric distribution returns the number of failures. We add one to + // also account for the call where we succeed. 
+ return std::geometric_distribution(p)(*generator_) + 1; +} + +// ============================================================================ +// == LocalCallbackManager: Implementation ==================================== +// ============================================================================ +LocalCallbackManager& LocalCallbackManager::get() { +#if defined(C10_PREFER_CUSTOM_THREAD_LOCAL_STORAGE) + static c10::ThreadLocal manager; + return manager.get(); +#else // defined(C10_PREFER_CUSTOM_THREAD_LOCAL_STORAGE) + static thread_local LocalCallbackManager manager; + return manager; +#endif // defined(C10_PREFER_CUSTOM_THREAD_LOCAL_STORAGE) +} + +LocalCallbackManager::LocalCallbackManager() { + for (auto i : c10::irange(NumRecordScopes)) { + active_callbacks_[i] = CacheEntry(&generator_, static_cast(i)); } + rebuild_all(GlobalCallbackManager::get().getSnapshot()); +} + +const RecordFunctionTLS& LocalCallbackManager::getTLS() const { + return registered_callbacks_; +} - void clearThreadLocalCallbacks() { - rf_tls().sorted_tls_callbacks_.clear(); +StepCallbacks LocalCallbackManager::getActiveCallbacks( + const RecordScope scope) { + const auto global_version = GlobalCallbackManager::get().version(); + if (C10_UNLIKELY(global_version != global_version_)) { + rebuild_all(GlobalCallbackManager::get().getSnapshot()); } + return active_callbacks_[static_cast(scope)].getActiveCallbacks(); +} + +void LocalCallbackManager::setTLS(const RecordFunctionTLS& tls) { + registered_callbacks_ = tls; + rebuild_all(GlobalCallbackManager::get().getSnapshot()); +} + +void LocalCallbackManager::seed(uint32_t seed) { + generator_.seed(seed); +} + +CallbackHandle LocalCallbackManager::addCallback( + RecordFunctionCallback callback) { + auto handle = next_unique_callback_handle(); + auto& callbacks = registered_callbacks_.sorted_tls_callbacks_; + callbacks.emplace_back(std::move(callback), handle); + rebuild_callback_scopes( + GlobalCallbackManager::get().getSnapshot(), callbacks.back().callback_); + return handle; +} - inline bool hasGlobalCallbacks() const { - return num_enabled_global_callbacks_.load(std::memory_order_relaxed) > 0; +bool LocalCallbackManager::setCallbackEnabled( + CallbackHandle handle, + bool enabled) { + auto it = findCallback(registered_callbacks_.sorted_tls_callbacks_, handle); + auto found = (it != registered_callbacks_.sorted_tls_callbacks_.end()); + if (found && it->enabled_ != enabled) { + it->enabled_ = enabled; + rebuild_callback_scopes( + GlobalCallbackManager::get().getSnapshot(), it->callback_); } + return found; +} - inline bool hasThreadLocalCallbacks() const { - return !rf_tls().sorted_tls_callbacks_.empty(); +bool LocalCallbackManager::removeCallback(CallbackHandle handle) { + auto& callbacks = registered_callbacks_.sorted_tls_callbacks_; + auto callback = extractCallback(callbacks, handle); + if (callback.has_value()) { + rebuild_callback_scopes( + GlobalCallbackManager::get().getSnapshot(), *callback); } + return callback.has_value(); +} - // We need this function to be inlined: init() is a hot path and - // callbackShouldRun is even hotter because it's called multiple - // times per init(). Profiling shows that the function prologue is - // taking up a significant fraction of the time. 
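As background for the `sampleTries` / `tries_left_` machinery introduced above: rather than flipping a coin on every call, the cache draws the number of calls until the next sampled hit once and then only decrements a counter on the hot path. Below is a minimal standalone sketch of that idea; it is not part of the patch and the function and variable names are illustrative.

#include <iostream>
#include <random>

// Number of calls until (and including) the next sampled call for probability p.
// std::geometric_distribution counts failures before the first success, so add
// one for the successful call itself (matching the comment in sampleTries above).
int sample_tries(std::mt19937& gen, double p) {
  return std::geometric_distribution<int>(p)(gen) + 1;
}

int main() {
  std::mt19937 gen(42);
  const double p = 0.01;  // e.g. a callback registered with samplingProb(0.01)
  int tries_left = sample_tries(gen, p);
  int fired = 0;
  for (int call = 0; call < 100000; ++call) {
    if (--tries_left == 0) {              // cheap decrement on the hot path
      ++fired;                            // the sampled callback would run here
      tries_left = sample_tries(gen, p);  // re-draw for the next hit
    }
  }
  std::cout << "fired " << fired << " of 100000 calls\n";  // roughly p * 100000
}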
- static bool C10_ALWAYS_INLINE callbackShouldRun( - const RecordFunctionCallback& cb, RecordScope scope, bool pre_sampled) { - TORCH_INTERNAL_ASSERT( - !pre_sampled || (cb.sampling_prob_ <= kLowProb), - "Incorrect usage of a pre-sampled RecordFunction with a high-frequency " - " or non-sampled callback"); - - // first check whether this callback is interested in - // the given scope type - if (!cb.checkScope(scope)) { - return false; - } +void LocalCallbackManager::clearCallbacks() { + registered_callbacks_.sorted_tls_callbacks_.clear(); + rebuild_all(GlobalCallbackManager::get().getSnapshot()); +} - // otherwise potentially do the sampling - double sampling_prob = cb.sampling_prob_; - constexpr double kLowProbInv = 1 / kLowProb; - if (pre_sampled) { - // adjust the sampling rate to account for kLowProb pre-sampling of - // the RecordFunction - sampling_prob *= kLowProbInv; - } +void LocalCallbackManager::rebuild_all(const GlobalCallbackManager::snapshot_t& global_snapshot) { + global_version_ = global_snapshot.first; + for (auto i : c10::irange(NumRecordScopes)) { + rebuild_scope(global_snapshot, static_cast(i)); + } +} - if (sampling_prob < 1.0) { - // model the low probability events as events happening - // with probability kLowProb followed by another sampling with - // probability (sampling_prob / kLowProb), then replace the coin - // flip for kLowProb with a thread local number of tries tries_left_ - // sampled from the geometric distribution. - if (sampling_prob < kLowProb) { - if (coinflip_tls().tries_left_ == 0) { - coinflip_tls().tries_left_ = sample_geometric(); - return (sample_zero_one() < sampling_prob * kLowProbInv); - } else { - --coinflip_tls().tries_left_; - return false; - } - } else { - return (sample_zero_one() < sampling_prob); +void LocalCallbackManager::rebuild_callback_scopes( + const GlobalCallbackManager::snapshot_t& global_snapshot, + const RecordFunctionCallback& callback) { + if (global_snapshot.first == global_version_) { + // Only rebuild scopes associated with `callback` + for (auto i : c10::irange(NumRecordScopes)) { + if (callback.checkScope(static_cast(i))) { + rebuild_scope(global_snapshot, static_cast(i)); } } + } else { + rebuild_all(global_snapshot); + } +} + +void LocalCallbackManager::rebuild_scope( + const GlobalCallbackManager::snapshot_t& global_snapshot, + const RecordScope scope) { + std::vector callbacks; + if (registered_callbacks_.tls_record_function_enabled_) { + auto populate_callbacks = + [&](const RecordFunctionCallbacks& raw_callbacks) { + for (const auto& i : raw_callbacks) { + if (i.enabled_ && i.callback_.checkScope(scope) && + i.callback_.samplingProb() > 0) { + callbacks.push_back(i.callback_); + } + } + }; + populate_callbacks(global_snapshot.second); + populate_callbacks(registered_callbacks_.sorted_tls_callbacks_); + } + active_callbacks_[static_cast(scope)].update(callbacks); +} + +// ============================================================================ +// == Callback execution ====================================================== +// ============================================================================ +void logTryRunCallbackError(const char* what, const char* name) { + LOG(WARNING) << "Exception in RecordFunction callback: " << what + << " , for the range " << name; +} + +template +C10_ALWAYS_INLINE bool tryRunCallback( + const StepCallbacks::StartEndPair callback_ptrs, + const RecordFunction& rf, + std::unique_ptr& ctx) { + try { + if (is_start && callback_ptrs.start_) { + ctx = callback_ptrs.start_(rf); + 
} + + if (!is_start && callback_ptrs.end_) { + callback_ptrs.end_(rf, ctx.get()); + } return true; + } catch (const std::exception& e) { + logTryRunCallbackError(e.what(), rf.name()); + return false; + } catch (...) { + logTryRunCallbackError("unknown", rf.name()); + return false; } +} - // init is called by RecordFunction in constructor to - // determine which thread local and global callbacks are going - // to be executed and whether any of them need inputs - inline void init(RecordFunction& rec_fn, RecordScope scope, bool pre_sampled) { - bool found_needs_inputs = false; - bool found_needs_outputs = false; - bool found_needs_ids = false; - - for (const auto& cb: rf_tls().sorted_tls_callbacks_) { - if (cb.isEnabled() && callbackShouldRun(cb.callback, scope, pre_sampled)) { - if (cb.callback.needsInputs()) { - found_needs_inputs = true; - } - if (cb.callback.needsOutputs()) { - found_needs_outputs = true; - } - if (cb.callback.needsIds()) { - found_needs_ids = true; - } - if (!rec_fn.state_) { - rec_fn.state_.emplace(scope); - } - rec_fn.state_->sorted_active_tls_handles_.push_back(cb.handle); - } - } +} // namespace - for (const auto& cb: sorted_global_callbacks_) { - if (cb.isEnabled() && callbackShouldRun(cb.callback, scope, pre_sampled)) { - if (cb.callback.needsInputs()) { - found_needs_inputs = true; - } - if (cb.callback.needsOutputs()) { - found_needs_outputs = true; - } - if (cb.callback.needsIds()) { - found_needs_ids = true; - } - if (!rec_fn.state_) { - rec_fn.state_.emplace(scope); - } - rec_fn.state_->sorted_active_global_handles_.push_back(cb.handle); - } - } +RecordFunction::RecordFunction(RecordScope scope) + : RecordFunction(getStepCallbacks(scope)) {} - if (!rec_fn.state_) { - return; - } +RecordFunction::RecordFunction(StepCallbacks&& step_callbacks) + : step_callbacks_{std::move(step_callbacks)} { + ctx_.resize(step_callbacks_.callbacks_.size()); + if (step_callbacks_.needs_ids_) { + setHandle(next_unique_record_function_handle()); + } +} - // Pre-allocate observer context list with nullptr. 
- rec_fn.state_->tls_ctx_.resize(rec_fn.state_->sorted_active_tls_handles_.size()); - rec_fn.state_->global_ctx_.resize(rec_fn.state_->sorted_active_global_handles_.size()); +void RecordFunction::runStartCallbacks() { + for (const auto i : c10::irange(step_callbacks_.callbacks_.size())) { + tryRunCallback( + step_callbacks_.callbacks_[i], *this, ctx_[i]); + } + called_start_callbacks_ = true; +} - rec_fn.state_->needs_inputs = found_needs_inputs; - rec_fn.state_->needs_outputs = found_needs_outputs; - if (found_needs_ids) { - rec_fn.setHandle(next_unique_record_function_handle()); +void RecordFunction::end() { + if (called_start_callbacks_) { + for (const auto i : c10::irange(step_callbacks_.callbacks_.size())) { + tryRunCallback( + step_callbacks_.callbacks_[i], *this, ctx_[i]); } + step_callbacks_.callbacks_.clear(); } +} - void runStartCallbacks(RecordFunction& rf) { - mergeRunCallbacks( - sorted_global_callbacks_, - rf.state_->sorted_active_global_handles_, - rf.state_->global_ctx_, - /* is_start */ true, - rf); - mergeRunCallbacks( - rf_tls().sorted_tls_callbacks_, - rf.state_->sorted_active_tls_handles_, - rf.state_->tls_ctx_, - /* is_start */ true, - rf); - rf.state_->called_start_callbacks_ = true; - } +const char* RecordFunction::name() const { + return c10::visit( + c10::overloaded( + [](const std::string& name) { return name.c_str(); }, + [](const schema_ref_t schema) { + return schema.get().name().c_str(); + }), + fn_); +} - void runEndCallbacks(RecordFunction& rf) { - mergeRunCallbacks( - sorted_global_callbacks_, - rf.state_->sorted_active_global_handles_, - rf.state_->global_ctx_, - /* is_start */ false, - rf); - mergeRunCallbacks( - rf_tls().sorted_tls_callbacks_, - rf.state_->sorted_active_tls_handles_, - rf.state_->tls_ctx_, - /* is_start */ false, - rf); - } +size_t RecordFunction::num_inputs() const { + return c10::visit( + c10::overloaded( + [&](const std::string&) { return inputs_.size(); }, + [](const schema_ref_t schema) { + return schema.get().arguments().size(); + }), + fn_); +} - // Global callbacks; must be sorted in increasing handle order - GlobalRecordFunctionCallbacks sorted_global_callbacks_; - std::atomic num_enabled_global_callbacks_; +size_t RecordFunction::num_outputs() const { + return c10::visit( + c10::overloaded( + [&](const std::string&) { return outputs_.size(); }, + [](const schema_ref_t schema) { + return schema.get().returns().size(); + }), + fn_); +} - private: - static void logTryRunCallbackError(const char* what, const RecordFunction& rf) { - LOG(WARNING) << "Exception in RecordFunction callback: " << what << " , for the range " << rf.name(); - } +c10::optional RecordFunction::operator_name() const { + return c10::visit( + c10::overloaded( + [&](const std::string&) -> c10::optional { + return c10::nullopt; + }, + [](const schema_ref_t schema) -> c10::optional { + return schema.get().operator_name(); + }), + fn_); +} - C10_ALWAYS_INLINE static bool tryRunCallback( - const RecordFunctionCallback& rfcb, - RecordFunction& rf, - std::unique_ptr& ctx, - bool is_start) { - try { - if (is_start) { - ctx = rfcb.start() ? rfcb.start()(rf) : nullptr; - } - else { - if (rfcb.end()) { - rfcb.end()(rf, ctx.get()); - } - } - return true; - } catch (const std::exception &e) { - logTryRunCallbackError(e.what(), rf); - return false; - } catch (...) 
{ - logTryRunCallbackError("unknown", rf); - return false; - } - } +StepCallbacks getStepCallbacks(RecordScope scope) { + return LocalCallbackManager::get().getActiveCallbacks(scope); +} - template - static void mergeRunCallbacks( - const RecordFunctionCallbacks& sorted_callbacks, - const CallbackHandles& sorted_handles, - ObserverContextList& ctx_list, - bool is_start, - RecordFunction& rf) { - size_t num_executed = 0; - size_t idx_c = 0; - const auto sorted_handles_size = sorted_handles.size(); - const auto ctx_list_size = ctx_list.size(); - const auto sorted_callbacks_size = sorted_callbacks.size(); - for (size_t idx_h = 0; idx_h < sorted_handles_size && idx_h < ctx_list_size; ++idx_h) { - while (idx_c < sorted_callbacks_size && - sorted_callbacks[idx_c].handle < sorted_handles[idx_h]) { - ++idx_c; - } - if (idx_c >= sorted_callbacks_size) { - break; - } - if (sorted_callbacks[idx_c].handle == sorted_handles[idx_h]) { - tryRunCallback(sorted_callbacks[idx_c].callback, rf, ctx_list[idx_h], is_start); - ++num_executed; - } - } +const RecordFunctionTLS& get_record_function_tls_() { + return LocalCallbackManager::get().getTLS(); +} - if (num_executed != sorted_handles.size()) { - C10_LOG_EVERY_MS(WARNING, 1000) - << "Could not match some of the start callbacks with the corresponding end callbacks, " - << "callbacks changed during RecordFunction lifetime; you might be trying to profile " - << "the code after profiler is finished"; - } - } -}; +void set_record_function_tls_(const RecordFunctionTLS& tls) { + LocalCallbackManager::get().setTLS(tls); +} namespace { - // Keeping this static manager local. - CallbackManager& manager() { - static CallbackManager _manager; - return _manager; - } +bool anyEnabled(const RecordFunctionCallbacks& callbacks) { + return std::any_of(callbacks.begin(), callbacks.end(), [](const auto& cb) { + return cb.enabled_; + }); +} } // namespace bool hasCallbacks() { - auto& m = manager(); - return m.hasGlobalCallbacks() || m.hasThreadLocalCallbacks(); + return hasThreadLocalCallbacks() || hasGlobalCallbacks(); } bool hasGlobalCallbacks() { - return manager().hasGlobalCallbacks(); + return anyEnabled(GlobalCallbackManager::get().getSnapshot().second); } bool hasThreadLocalCallbacks() { - return manager().hasThreadLocalCallbacks(); + return anyEnabled(get_record_function_tls_().sorted_tls_callbacks_); } CallbackHandle addThreadLocalCallback( RecordFunctionCallback cb) { // NOLINTNEXTLINE(performance-move-const-arg) - return manager().addThreadLocalCallback(std::move(cb)); + return LocalCallbackManager::get().addCallback(std::move(cb)); } CallbackHandle addGlobalCallback( RecordFunctionCallback cb) { // NOLINTNEXTLINE(performance-move-const-arg) - return manager().addGlobalCallback(std::move(cb)); + return GlobalCallbackManager::get().addCallback(std::move(cb)); } void removeCallback(CallbackHandle handle) { - manager().removeCallback(handle); + if (!LocalCallbackManager::get().removeCallback(handle)) { + GlobalCallbackManager::get().removeCallback(handle); + } } void disableCallback(CallbackHandle handle) { - manager().disableCallback(handle); + if (!LocalCallbackManager::get().setCallbackEnabled(handle, false)) { + GlobalCallbackManager::get().setCallbackEnabled(handle, false); + } } void reenableCallback(CallbackHandle handle) { - manager().reenableCallback(handle); + if (!LocalCallbackManager::get().setCallbackEnabled(handle, true)) { + GlobalCallbackManager::get().setCallbackEnabled(handle, true); + } } void clearGlobalCallbacks() { - 
manager().clearGlobalCallbacks(); + GlobalCallbackManager::get().clearCallbacks(); } void clearThreadLocalCallbacks() { - manager().clearThreadLocalCallbacks(); + LocalCallbackManager::get().clearCallbacks(); } void clearCallbacks() { - auto& m = manager(); - m.clearGlobalCallbacks(); - m.clearThreadLocalCallbacks(); + clearGlobalCallbacks(); + clearThreadLocalCallbacks(); } bool isRecordFunctionEnabled() { - return rf_tls().tls_record_function_enabled_; + return LocalCallbackManager::get().getTLS().tls_record_function_enabled_; } void enableRecordFunction(bool enable) { - rf_tls().tls_record_function_enabled_ = enable; + auto tls = LocalCallbackManager::get().getTLS(); + if (tls.tls_record_function_enabled_ != enable) { + tls.tls_record_function_enabled_ = enable; + LocalCallbackManager::get().setTLS(tls); + } } -RecordFunction::RecordFunction(RecordScope scope, bool pre_sampled) { - auto* rf_tls_ptr = &rf_tls(); - if (rf_tls_ptr->tls_record_function_enabled_) { - auto& m = manager(); - if (!m.sorted_global_callbacks_.empty() || !rf_tls_ptr->sorted_tls_callbacks_.empty()) { - m.init(*this, scope, pre_sampled); - } - } +void set_record_function_seed_for_testing(uint32_t seed) { + LocalCallbackManager::get().seed(seed); } /* static */ @@ -580,45 +669,29 @@ uint64_t RecordFunction::currentThreadId() { } void RecordFunction::before(const char* name, int64_t sequence_nr) { - if (!isActive()) { - return; - } - state_->op_input_size = state_->inputs_.size(); - state_->name_ = name; - state_->sequence_nr_ = sequence_nr; - state_->thread_id_ = currentThreadId(); - state_->operator_name_.reset(); + fn_ = name; + sequence_nr_ = sequence_nr; - manager().runStartCallbacks(*this); + runStartCallbacks(); + invalidateInputs(); } void RecordFunction::before(std::string name, int64_t sequence_nr) { - if (!isActive()) { - return; - } - state_->op_input_size = state_->inputs_.size(); - state_->name_ = std::move(name); - state_->sequence_nr_ = sequence_nr; - state_->thread_id_ = currentThreadId(); - state_->operator_name_.reset(); + fn_ = std::move(name); + sequence_nr_ = sequence_nr; - manager().runStartCallbacks(*this); + runStartCallbacks(); + invalidateInputs(); } void RecordFunction::before( - c10::OperatorHandle const& op, + RecordFunction::schema_ref_t schema, int64_t sequence_nr) { - if (!isActive()) { - return; - } - state_->sequence_nr_ = sequence_nr; - state_->thread_id_ = currentThreadId(); - state_->operator_name_ = op.operator_name(); - state_->op_input_size = op.schema().arguments().size(); - state_->op_output_size = op.schema().returns().size(); - state_->name_ = op.schema().name(); + sequence_nr_ = sequence_nr; + fn_ = schema; - manager().runStartCallbacks(*this); + runStartCallbacks(); + invalidateInputs(); } /* static */ void RecordFunction::setDefaultNodeId(int64_t newDefaultNodeId) { @@ -634,69 +707,24 @@ RecordFunction::~RecordFunction() { end(); } -void RecordFunction::end() { - if (isActive() && state_->called_start_callbacks_) { - manager().runEndCallbacks(*this); - state_.reset(); - } -} - void RecordFunction::_setAsync() { - if (isActive()) { - state_->is_async_ = true; - } + is_async_ = true; } bool RecordFunction::isAsync() const { - if (isActive()) { - return state_->is_async_; - } - return false; -} - -// RecordFunction pre-sampling -namespace { -// Whether to try to create RecordFunction on each call (>0) or -// use pre-sampling (=0) -std::atomic global_record_all_functions_ {0}; -} - -void bumpRecordAllFunctions() { - global_record_all_functions_.fetch_add(1, 
std::memory_order_relaxed); + return is_async_; } -void releaseRecordAllFunctions() { - TORCH_CHECK(global_record_all_functions_.fetch_sub(1, std::memory_order_relaxed) > 0); -} - -bool checkRecordAllFunctions() { - return (global_record_all_functions_.load(std::memory_order_relaxed) > 0); -} - -bool shouldRunRecordFunction(bool* pre_sampled) { - auto* rf_tls_ptr = &rf_tls(); - if (rf_tls_ptr->sorted_tls_callbacks_.empty() && !manager().hasGlobalCallbacks()) { - *pre_sampled = false; - return false; - } - if (global_record_all_functions_.load(std::memory_order_relaxed) > 0) { - *pre_sampled = false; - return true; - } - if (!rf_tls_ptr->tls_record_function_enabled_) { - *pre_sampled = false; - return false; +void RecordFunction::_setStaticRuntimeOutVariant() { + if (isActive()) { + is_static_runtime_out_variant_ = true; } +} - *pre_sampled = true; - auto* coinflip_tls_ptr = &coinflip_tls(); - if (coinflip_tls_ptr->tries_left_ == 0) { - coinflip_tls_ptr->tries_left_ = sample_geometric(); - return true; - } else { - --coinflip_tls_ptr->tries_left_; - return false; +bool RecordFunction::isStaticRuntimeOutVariant() const { + if (isActive()) { + return is_static_runtime_out_variant_; } + return false; } - } // namespace at diff --git a/aten/src/ATen/record_function.h b/aten/src/ATen/record_function.h index f6688726fcaf..af594f47e789 100644 --- a/aten/src/ATen/record_function.h +++ b/aten/src/ATen/record_function.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -99,80 +100,250 @@ struct ObserverContext { typedef c10::SmallVector CallbackHandles; typedef c10::SmallVector, kSoftLimitCallbacks> ObserverContextList; typedef uint64_t RecordFunctionHandle; +struct RecordFunction; + +// +// PyTorch callbacks/observers API: +// + +/** + * RecordFunctionCallback represents a pair of callbacks to be used with + * RecordFunction, members: + * start, end - the callbacks to run when entering and exiting the scope; + * optionally, the start callback may return an ObserverContext which will + * be passed to the end callback, use appropriate constructor accordingly. + * needs_inputs - whether the callbacks need the inputs passed from the observed + * function/range; NOTE: passing the inputs incurs an additional overhead; + * sampling_probability - if not 1.0, then the callback is probabilistically sampled + * to run; NOTE: start and end callbacks always run as a pair and are sampled + * together; + * scopes - types of scopes to execute the callbacks on (see RecordScope); + * passing empty set means the callbacks will be executed for all possible + * scope types + * should_run - optional function that returns whether this callback should run; + * overwrites the effect of setting sampling_probability + */ +class TORCH_API RecordFunctionCallback { + public: + using StartCallback = std::unique_ptr(*)(const RecordFunction&); + using EndCallback = void (*)(const RecordFunction&, ObserverContext*); + + // This interface supports observers that require passing an ObserverContext + // between start and end callbacks. 
+ explicit RecordFunctionCallback( + StartCallback start, + EndCallback end = nullptr) : + start_(start), + end_(end) { + scopes_.fill(true); + } + + RecordFunctionCallback& needsInputs(bool needs_inputs) { + needs_inputs_ = needs_inputs; + return *this; + } + + RecordFunctionCallback& needsOutputs(bool needs_outputs) { + needs_outputs_ = needs_outputs; + return *this; + } + + RecordFunctionCallback& needsIds(bool needs_ids) { + needs_ids_ = needs_ids; + return *this; + } + + RecordFunctionCallback& samplingProb(double sampling_prob) { + TORCH_CHECK(sampling_prob >= 0.0 && sampling_prob <= 1.0, + "Invalid sampling probability"); + sampling_prob_ = sampling_prob; + return *this; + } + + RecordFunctionCallback& scopes( + const std::unordered_set>& scopes) { + if (!scopes.empty()) { + scopes_.fill(false); + for (auto sc : scopes) { + scopes_[static_cast(sc)] = true; + } + } else { + scopes_.fill(true); + } + return *this; + } + + bool needsInputs() const { + return needs_inputs_; + } + + bool needsOutputs() const { + return needs_outputs_; + } + + bool needsIds() const { + return needs_ids_; + } + + double samplingProb() const { + return sampling_prob_; + } + + bool checkScope(RecordScope sc) const { + return scopes_[(size_t)sc]; + } + + StartCallback start() const { + return start_; + } + + EndCallback end() const { + return end_; + } + + private: + StartCallback start_; + EndCallback end_; + double sampling_prob_ = 1.0; + std::array(RecordScope::NUM_SCOPES)> scopes_ = {}; + bool needs_inputs_ = false; + bool needs_outputs_ = false; + bool needs_ids_ = false; +}; + +// Notes: +// - two types of callbacks are provided: thread local and global +// - thread local callbacks are added/removed only for the given thread +// and are stored locally for each thread and separately from the list +// of the global callbacks +// - global callbacks are stored in a single per process list and are +// invoked by every RecordFunction, in addition to the thread local +// callbacks specific to the given thread +// - we allow the added callbacks to be sampled, by specifying a sampling +// probability for each callback pair, if the start callback is +// not picked to run, the corresponding end callback won't be called +// - a typical use case for the global callbacks is passive monitoring +// in the background (e.g. fleet-wide monitoring), without focusing on +// the specific piece of code +// - in contrast, thread local callbacks are enabled locally, on demand, +// for the specific piece of code (range) and are not sampled +// - a typical use case for thread local callbacks is profiler and code +// execution tracer +// - note, thread local callbacks are automatically propagated with +// ThreadLocalState across JIT continuations and async tasks (at::launch) + +typedef uint64_t CallbackHandle; + +// It is unnecessary to use atomic operations for enabling +// thread-local function callbacks. Moreover, it prevents saving to +// ThreadLocalState because std::atomic is non-copyable. +struct RecordFunctionCallbacksEntry { + RecordFunctionCallbacksEntry(RecordFunctionCallback&& cb, CallbackHandle h) + : callback_(cb), handle_(h) {} + + RecordFunctionCallback callback_; + bool enabled_{true}; + CallbackHandle handle_; +}; + +// Holds pairs (callbacks, unique_id) +using RecordFunctionCallbacks = std::vector; + +// Generated by the callback managers to determine which functions to run. 
+struct StepCallbacks { + StepCallbacks() = default; + StepCallbacks(uint64_t thread_id, RecordScope scope) + : thread_id_{thread_id}, scope_{scope} {} + + bool empty() const { + return callbacks_.empty(); + } + + struct StartEndPair { + RecordFunctionCallback::StartCallback start_; + RecordFunctionCallback::EndCallback end_; + }; + + using StartEndPairs = c10::SmallVector; + + StartEndPairs callbacks_; + uint64_t thread_id_{0}; + RecordScope scope_{RecordScope::FUNCTION}; + bool needs_inputs_{false}; + bool needs_outputs_{false}; + bool needs_ids_{false}; +}; struct TORCH_API RecordFunction { // Default constructor is used with before function called afterwards: // scope - record scope that this function tracks // pre_sampled - whether this RecordFunction was already pre-sampled with // kLowProb probability - RecordFunction( - RecordScope scope = RecordScope::FUNCTION, - bool pre_sampled = false); + explicit RecordFunction(RecordScope scope = RecordScope::FUNCTION); + explicit RecordFunction(StepCallbacks&& step_callbacks); template void before( F fn, - const std::vector* args, + c10::ArrayRef args, int64_t current_sequence_nr = -1) { if (!isActive()) { return; } - state_->inputs_ = *args; + inputs_ = args; +#ifndef NDEBUG + inputs_valid_ = true; +#endif before(fn, current_sequence_nr); } + template + void before( + F fn, + const std::vector* args, + int64_t current_sequence_nr = -1) { + before(std::move(fn), c10::ArrayRef(args->data(), args->size()), current_sequence_nr); + } + // Destructor calls end callbacks virtual ~RecordFunction(); RecordFunction(const RecordFunction&) = delete; RecordFunction& operator=(const RecordFunction&) = delete; - const char* name() const { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called name() on inactive RecordFunction"); - return state_->name_.c_str(); - } + const char* name() const; int64_t seqNr() const { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called seqNr() on inactive RecordFunction"); - return state_->sequence_nr_; + return sequence_nr_; } - const std::vector& inputs() const { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called inputs() on inactive RecordFunction"); - return state_->inputs_; + c10::ArrayRef inputs() const { +#ifndef NDEBUG + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(inputs_valid_, "Called inputs() outside RecordFunction start callback"); +#endif + return inputs_; } const std::vector& outputs() const { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called outputs() on inactive RecordFunction"); - return state_->outputs_; + return outputs_; } void setOutputs(std::vector&& outputs) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called setOutputs() on inactive RecordFunction"); - state_->outputs_ = std::move(outputs); + outputs_ = std::move(outputs); } void setOutputs(c10::ArrayRef outputs) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called setOutputs() on inactive RecordFunction"); - state_->outputs_ = outputs.vec(); - } - - size_t num_inputs() const { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called num_inputs() on inactive RecordFunction"); - return state_->op_input_size; + outputs_ = outputs.vec(); } - size_t num_outputs() const { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called num_outputs() on inactive RecordFunction"); - return state_->op_output_size; - } + size_t num_inputs() const; + size_t num_outputs() const; // Retrieves the thread_id that this RecordFunction ran start callbacks with. 
// Useful for writing thread safe end callbacks that may be potentially // executed in a different thread (async ops) uint64_t threadId() const { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called threadId() on inactive RecordFunction"); - return state_->thread_id_; + return step_callbacks_.thread_id_; } // For backward functions - thread id of the corresponding forward function, @@ -180,18 +351,15 @@ struct TORCH_API RecordFunction { // used alongside with sequence number to correlate backward functions with // the forward ones uint64_t forwardThreadId() const { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called forwardThreadId() on inactive RecordFunction"); - return state_->fwd_thread_id_; + return fwd_thread_id_; } void setForwardThreadId(uint64_t thread_id) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called setForwardThreadId() on inactive RecordFunction"); - state_->fwd_thread_id_ = thread_id; + fwd_thread_id_ = thread_id; } RecordScope scope() const { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called scope() on inactive RecordFunction"); - return state_->scope_; + return step_callbacks_.scope_; } // Returns logical thread_id for the current thread @@ -202,39 +370,16 @@ struct TORCH_API RecordFunction { // before functions initialize RecordFunction members and call // start callbacks + using schema_ref_t = std::reference_wrapper; void before(const char* name, int64_t sequence_nr = -1); void before(std::string name, int64_t sequence_nr = -1); - void before(c10::OperatorHandle const& op, int64_t sequence_nr = -1); + void before(schema_ref_t schema, int64_t sequence_nr = -1); // Sets node ID for distributed profiling static void setDefaultNodeId(int64_t defaultNodeId); // Gets node ID for distributed profiling static int64_t getDefaultNodeId(); - template - void before( - F fn, - c10::ArrayRef args, - int64_t current_sequence_nr = -1) { - if (!isActive()) { - return; - } - state_->inputs_ = args.vec(); - before(fn, current_sequence_nr); - } - - template - void before( - F fn, - std::vector&& args, - int64_t current_sequence_nr = -1) { - if (!isActive()) { - return; - } - state_->inputs_ = std::move(args); - before(fn, current_sequence_nr); - } - // Calls end callbacks. After end(), accessors will no longer provide useful results. void end(); @@ -244,238 +389,132 @@ struct TORCH_API RecordFunction { // Returns whether this RecordFunction corresponds to an async event or not. bool isAsync() const; + // Internal-only, used to denote out variant used for Static Runtime execution + void _setStaticRuntimeOutVariant(); + bool isStaticRuntimeOutVariant() const; + RecordFunctionHandle handle() const { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called handle() on inactive RecordFunction"); - return state_->handle_; + return handle_; } - c10::optional operator_name() const { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called operator_name() on inactive RecordFunction"); - return state_->operator_name_; - } + c10::optional operator_name() const; void setHandle(RecordFunctionHandle handle) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called setHandle() on inactive RecordFunction"); - state_->handle_ = handle; + handle_ = handle; } // Whether this RecordFunction runs any callbacks.
bool isActive() const { - return state_.has_value(); + return !step_callbacks_.empty(); } bool needsInputs() const { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called needsInputs() on inactive RecordFunction"); - return state_->needs_inputs; + return step_callbacks_.needs_inputs_; } bool needsOutputs() const { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called needsOutputs() on inactive RecordFunction"); - return state_->needs_outputs; + return step_callbacks_.needs_outputs_; } int64_t debugHandle() const { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called debugHandle() on inactive RecordFunction"); - return state_->debug_handle_; + return debug_handle_; } void setDebugHandle(int64_t debug_handle) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called setDebugHandle() on inactive RecordFunction"); - state_->debug_handle_ = debug_handle; + debug_handle_ = debug_handle; } - private: - - // Allows the modification of some internal states for callbacks. - friend class CallbackManager; - - struct State { - explicit State(RecordScope scope) : scope_(scope) {} - - // Whether any of the picked callbacks require inputs - bool needs_inputs = false; - - // Whether any of the picked callbacks require outputs - bool needs_outputs = false; - - // In cases when RecordFunction might be active but we chose not to - // use the observers (e.g. operator is not observed), this boolean - // flag is used to check whether the start callbacks were called - bool called_start_callbacks_ = false; - - // Whether the RecordFunction is pre-sampled - bool pre_sampled_ = false; + void invalidateInputs() { +#ifndef NDEBUG + inputs_valid_ = false; +#endif + } - // Used internally to keep track of thread local and global callbacks - // that were picked to run; must be sorted; - CallbackHandles sorted_active_tls_handles_; - CallbackHandles sorted_active_global_handles_; + private: + void runStartCallbacks(); - // Stores various ObserverContext objects with event metadata for thread local - // callbacks. - ObserverContextList tls_ctx_; + StepCallbacks step_callbacks_; - // Stores various ObserverContext objects with event metadata for global - // callbacks. - ObserverContextList global_ctx_; + // In cases when RecordFunction might be active but we chose not to + // use the observers (e.g. operator is not observed), this boolean + // flag is used to check whether the start callbacks were called + bool called_start_callbacks_ = false; - std::string name_; - int64_t sequence_nr_ = -1; - std::vector inputs_; - std::vector outputs_; +#ifndef NDEBUG + bool inputs_valid_ = false; +#endif - c10::optional operator_name_; - size_t op_input_size{0}; - size_t op_output_size{0}; + // Stores various ObserverContext objects with event metadata for callbacks. 
+ ObserverContextList ctx_; - // Kind of scope this RecordFunction is observing - const RecordScope scope_; + c10::variant fn_; - // The logical thread_id that this RecordFunction was created with - uint64_t thread_id_ = 0; + int64_t sequence_nr_ = -1; + c10::ArrayRef inputs_; + std::vector outputs_; - // For backward functions - thread id of the the forward function - uint64_t fwd_thread_id_ = 0; + // For backward functions - thread id of the forward function + uint64_t fwd_thread_id_ = 0; - // Unique id for this RecordFunction, used in callbacks to track start - // and end of ranges - RecordFunctionHandle handle_ {0}; + // Unique id for this RecordFunction, used in callbacks to track start + // and end of ranges + RecordFunctionHandle handle_ {0}; - // Whether this record_function corresponds to an async event or not. Async - // events can complete in different threads or follow a future-like pattern - // of use. - bool is_async_{false}; + // Whether this record_function corresponds to an async event or not. Async + // events can complete in different threads or follow a future-like pattern + // of use. + bool is_async_{false}; - // Debug handles are used for lazy annotation of module hierarchy - // and callstack. - // This is specifically is useful for mobile runtime, where generated - // debug handles can be lazily symbolicated using debug information - int64_t debug_handle_{-1}; - }; + // Debug handles are used for lazy annotation of module hierarchy + // and callstack. + // This is specifically useful for mobile runtime, where generated + // debug handles can be lazily symbolicated using debug information + int64_t debug_handle_{-1}; - c10::optional state_; + // Whether this RecordFunction is used for an out variant run with + // Static Runtime + bool is_static_runtime_out_variant_{false}; }; -// -// PyTorch callbacks/observers API: -// +TORCH_API StepCallbacks getStepCallbacks(RecordScope scope); -/** - * RecordFunctionCallback represents a pair of callbacks to be used with - * RecordFunction, members: - * start, end - the callbacks to run when entering and exiting the scope; - * optionally, the start callback may return an ObserverContext which will - * be passed to the end callback, use appropriate constructor accordingly. - * needs_inputs - whether the callbacks need the inputs passed from the observed - * function/range; NOTE: passing the inputs incurs an additional overhead; - * sampling_probability - if not 1.0, then the callback is probabilistically sampled - * to run; NOTE: start and end callbacks always run as a pair and are sampled - * together; - * scopes - types of scopes to execute the callbacks on (see RecordScope); - * passing empty set means the callbacks will be executed for all possible - * scope types - * should_run - optional function that returns whether this callback should run; - * overwrites the effect of setting sampling_probability - */ -class TORCH_API RecordFunctionCallback { - public: - using StartCallback = std::unique_ptr(*)(const RecordFunction&); - using EndCallback = void (*)(const RecordFunction&, ObserverContext*); - - // This interface supports observers that require passing an ObserverContext - // between start and end callbacks.
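A short sketch of the construction path declared just above: fetch the per-scope StepCallbacks snapshot and hand it to RecordFunction. This is illustrative only and not part of the patch; the operator name is made up.

#include <ATen/record_function.h>

void run_my_op_observed() {
  // Ask the thread-local cache which callbacks are active for this scope.
  at::StepCallbacks step_callbacks =
      at::getStepCallbacks(at::RecordScope::FUNCTION);
  at::RecordFunction guard(std::move(step_callbacks));
  if (guard.isActive()) {
    guard.before("my_op");  // runs the start callbacks
  }
  // ... do the actual work here ...
  // End callbacks run from ~RecordFunction when `guard` goes out of scope.
}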
- explicit RecordFunctionCallback( - StartCallback start, - EndCallback end = nullptr) : - start_(start), - end_(end) { - scopes_.fill(true); - } - - RecordFunctionCallback& needsInputs(bool needs_inputs) { - needs_inputs_ = needs_inputs; - return *this; - } - - RecordFunctionCallback& needsOutputs(bool needs_outputs) { - needs_outputs_ = needs_outputs; - return *this; - } - - RecordFunctionCallback& needsIds(bool needs_ids) { - needs_ids_ = needs_ids; - return *this; - } - - RecordFunctionCallback& samplingProb(double sampling_prob) { - TORCH_CHECK(sampling_prob >= 0.0 && sampling_prob <= 1.0, - "Invalid sampling probability"); - sampling_prob_ = sampling_prob; - return *this; - } - - RecordFunctionCallback& scopes( - const std::unordered_set>& scopes) { - if (!scopes.empty()) { - scopes_.fill(false); - for (auto sc : scopes) { - scopes_[static_cast(sc)] = true; - } - } else { - scopes_.fill(true); - } - return *this; - } - - bool needsInputs() const { - return needs_inputs_; - } - - bool needsOutputs() const { - return needs_outputs_; - } - - bool needsIds() const { - return needs_ids_; - } - - double samplingProb() const { - return sampling_prob_; +namespace detail { +template +void record_function_with_scope(RecordFunction& guard, F fn, const Inputs& inputs, Args&&... args) { + if (guard.needsInputs()) { + guard.before(fn, c10::ArrayRef(inputs.data(), inputs.size()), std::forward(args)...); + } else { + guard.before(fn, std::forward(args)...); } +} - bool checkScope(RecordScope sc) const { - return scopes_[(size_t)sc]; +template +void record_function_with_scope_and_debug_handle(RecordFunction& guard, F fn, int64_t debug_handle, const Inputs& inputs, Args&&... args) { + guard.setDebugHandle(debug_handle); + if (guard.needsInputs()) { + guard.before(fn, c10::ArrayRef(inputs.data(), inputs.size()), std::forward(args)...); + } else { + guard.before(fn, std::forward(args)...); } +} - StartCallback start() const { - return start_; - } +template +void record_function_with_scope(RecordFunction& guard, F fn, c10::ArrayRef inputs, Args&&... args) { + return record_function_with_scope, F, Args...>(guard, std::move(fn), inputs, std::forward(args)...); +} - EndCallback end() const { - return end_; - } +template +void record_function_with_scope_and_debug_handle(RecordFunction& guard, F fn, int64_t debug_handle, c10::ArrayRef inputs, Args&&... args) { + return record_function_with_scope_and_debug_handle, F, Args...>(guard, std::move(fn), debug_handle, inputs, std::forward(args)...); +} - private: - friend class CallbackManager; - StartCallback start_; - EndCallback end_; - double sampling_prob_ = 1.0; - std::array(RecordScope::NUM_SCOPES)> scopes_ = {}; - bool needs_inputs_ = false; - bool needs_outputs_ = false; - bool needs_ids_ = false; -}; +} // namespace detail -// Using macro to minimize inputs copies, // optional argument - function's seq_no #define RECORD_FUNCTION_WITH_SCOPE(scope, fn, inputs, ...) \ at::RecordFunction guard(scope); \ - if (guard.isActive()) { \ - if (guard.needsInputs()) { \ - guard.before(fn, inputs, ##__VA_ARGS__); \ - } else { \ - guard.before(fn, ##__VA_ARGS__); \ - } \ + if (guard.isActive()) { \ + ::at::detail::record_function_with_scope(guard, fn, inputs, ##__VA_ARGS__); \ } #define RECORD_FUNCTION(fn, inputs, ...) 
\ @@ -490,7 +529,7 @@ class TORCH_API RecordFunctionCallback { // Custom user scopes in C++; similar to Python's 'with record_function("..."):' #define RECORD_USER_SCOPE(fn) \ RECORD_FUNCTION_WITH_SCOPE( \ - at::RecordScope::USER_SCOPE, fn, {}) + at::RecordScope::USER_SCOPE, fn, c10::ArrayRef{}) // RECORD_USER_SCOPE with inputs #define RECORD_USER_SCOPE_WITH_INPUTS(fn, inputs) \ @@ -501,15 +540,10 @@ class TORCH_API RecordFunctionCallback { // post process events #define RECORD_WITH_SCOPE_DEBUG_HANDLE_AND_INPUTS( \ scope, fn, debug_handle, inputs, ...) \ - at::RecordFunction guard(scope); \ - if (guard.isActive()) { \ - guard.setDebugHandle(debug_handle); \ - if (guard.needsInputs()) { \ - guard.before(fn, inputs, ##__VA_ARGS__); \ - } else { \ - guard.before(fn, ##__VA_ARGS__); \ - } \ - } + at::RecordFunction guard(scope); \ + if (guard.isActive()) { \ + ::at::detail::record_function_with_scope_and_debug_handle(guard, fn, debug_handle, inputs, ##__VA_ARGS__); \ + } // Helper macros to record LITE INTERPETER scope events with debug handles #define RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS( \ @@ -517,63 +551,6 @@ class TORCH_API RecordFunctionCallback { RECORD_WITH_SCOPE_DEBUG_HANDLE_AND_INPUTS( \ at::RecordScope::LITE_INTERPRETER, fn, debug_handle, inputs) -// Notes: -// - two types of callbacks are provided: thread local and global -// - thread local callbacks are added/removed only for the given thread -// and are stored locally for each thread and separately from the list -// of the global callbacks -// - global callbacks are stored in a single per process list and are -// invoked by every RecordFunction, in addition to the thread local -// callbacks specific to the given thread -// - we allow the added callbacks to be sampled, by specifying a sampling -// probability for each callback pair, if the start callback is -// not picked to run, the corresponding end callback won't be called -// - a typical use case for the global callbacks is passive monitoring -// in the background (e.g. fleet-wide monitoring), without focusing on -// the specific peice of code -// - in contrast, thread local callbacks are enabled locally, on demand, -// for the specific piece of code (range) and are not sampled -// - a typical use case for thread local callbacks is profiler and code -// execution tracer -// - note, thread local callbacks are automatically propagated with -// ThreadLocalState across JIT continuations and async tasks (at::launch) -// - adding/removing global callbacks is not thread safe and should be done -// only when no other code is running, e.g. during the initialization - -typedef uint64_t CallbackHandle; - -// It is unnecessary to use atomic operations for enabling -// thread-local function callbacks. Moreover, it prevents saving to -// ThreadLocalState because std::atomic is non-copyable. 
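For reference, a hedged usage sketch of the macros above; it is not part of the patch, and the function, range and input names are illustrative.

#include <ATen/ATen.h>
#include <ATen/record_function.h>
#include <vector>

at::Tensor my_helper(const at::Tensor& x) {
  // Opens a USER_SCOPE range named "my_helper"; the end callbacks run when the
  // guard declared by the macro goes out of scope, mirroring Python's
  // `with record_function("my_helper"):`.
  RECORD_USER_SCOPE("my_helper");
  return x * 2;
}

at::Tensor my_helper_with_inputs(const at::Tensor& x) {
  // Also forwards the inputs, which are only passed to the RecordFunction if
  // some active callback was registered with needsInputs(true)
  // (see at::detail::record_function_with_scope above).
  RECORD_USER_SCOPE_WITH_INPUTS("my_helper_with_inputs",
                                std::vector<c10::IValue>({x}));
  return x + 1;
}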
-struct ThreadLocalRecordFunctionCallbacksEntry { - RecordFunctionCallback callback; - bool enabled = true; - CallbackHandle handle; - - ThreadLocalRecordFunctionCallbacksEntry(RecordFunctionCallback&& cb, CallbackHandle h) - : callback(std::move(cb)), handle(h) {} - - bool disable() { - auto old = enabled; - enabled = false; - return old != enabled; - } - - bool enable() { - auto old = enabled; - enabled = true; - return old != enabled; - } - - bool isEnabled() const { - return enabled; - } -}; - -// Holds pairs (callbacks, unique_id) -using ThreadLocalRecordFunctionCallbacks = - std::vector; - /** * addThreadLocalCallback adds a thread local callback to run with RecordFunction, * returns handle to use with removeThreadLocalCallback @@ -595,7 +572,6 @@ TORCH_API void clearThreadLocalCallbacks(); /** * addGlobalCallback adds a global callback to run with RecordFunction: * - * WARNING: not thread safe, typically addGlobalCallback can be called * only during the program initialization */ TORCH_API CallbackHandle addGlobalCallback( @@ -605,7 +581,6 @@ TORCH_API CallbackHandle addGlobalCallback( * removeCallback removes a callback given the handle returned by * addThreadLocalCallback or addGlobalCallback; * - * WARNING: removing a global callback is not thread safe, * no other code can run simultaneously */ TORCH_API void removeCallback(CallbackHandle handle); @@ -630,13 +605,12 @@ TORCH_API bool hasGlobalCallbacks(); /** * clearGlobalCallbacks removes all global callbacks - * WARNING: not thread safe */ TORCH_API void clearGlobalCallbacks(); // for both thread local and global callbacks TORCH_API bool hasCallbacks(); -TORCH_API void clearCallbacks(); // not thread safe +TORCH_API void clearCallbacks(); /** * enableRecordFunction enables RecordFunction thread locally @@ -673,30 +647,15 @@ class TORCH_API DisableRecordFunctionGuard : public RecordFunctionGuard { struct TORCH_API RecordFunctionTLS { // Thread local vector of callbacks, holds pairs (callbacks, unique_id); // must be sorted in increasing handles order - ThreadLocalRecordFunctionCallbacks sorted_tls_callbacks_; + RecordFunctionCallbacks sorted_tls_callbacks_; bool tls_record_function_enabled_ = true; - - // Stores the number of coin flips before the next successful coin flip - int tries_left_ = 0; }; TORCH_API const RecordFunctionTLS& get_record_function_tls_(); TORCH_API void set_record_function_tls_(const RecordFunctionTLS& tls); -// Checks whether RecordFunction should be called, -// sets boolean pointed by the argument to whether pre-sampling was used -TORCH_API bool shouldRunRecordFunction(bool*); - -// The following functions are used to disable/enable pre-sampling of RecordFunction -// when high-frequency/non-sampled callbacks are added/removed. -// Note: every call to bumpRecordAllFunctions() is supposed to be matched with -// the corresponding releaseRecordAllFunctions() call. -// Note: disabling pre-sampling of RecordFunction incurs an extra overhead, since -// RecordFunction will be created for each operator call. 
-TORCH_API void bumpRecordAllFunctions(); -TORCH_API void releaseRecordAllFunctions(); -TORCH_API bool checkRecordAllFunctions(); +TORCH_API void set_record_function_seed_for_testing(uint32_t seed); } // namespace at diff --git a/aten/src/ATen/templates/CompositeViewCopyKernels.cpp b/aten/src/ATen/templates/CompositeViewCopyKernels.cpp new file mode 100644 index 000000000000..558802a7b7e8 --- /dev/null +++ b/aten/src/ATen/templates/CompositeViewCopyKernels.cpp @@ -0,0 +1,20 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +// ${generated_comment} + +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +$ops_headers +#endif + +namespace at { +namespace native { + + +${CompositeViewCopyKernel_Definitions} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/templates/DispatchKeyFunction.h b/aten/src/ATen/templates/DispatchKeyFunction.h index 7cad9b73c6e1..c92d5eb3898e 100644 --- a/aten/src/ATen/templates/DispatchKeyFunction.h +++ b/aten/src/ATen/templates/DispatchKeyFunction.h @@ -11,32 +11,10 @@ // Forward declarations of any types needed in the operator signatures. // We can't directly include these classes because it will cause circular include dependencies. // This file is included by TensorBody.h, which defines the Tensor class. -namespace c10 { - -template -class optional; -template -class List; -class Stream; -class Scalar; -struct Storage; -struct TensorOptions; - -} +#include namespace at { -class Tensor; -struct Dimname; -struct Generator; -using TensorList = c10::ArrayRef; -using DimnameList = c10::ArrayRef; -using c10::Stream; -using c10::Storage; -using c10::QScheme; -using c10::Scalar; -using c10::TensorOptions; - namespace ${dispatch_namespace} { ${dispatch_namespaced_declarations} diff --git a/aten/src/ATen/templates/DispatchKeyFunctions.h b/aten/src/ATen/templates/DispatchKeyFunctions.h index 1718b4be8274..ffae71319137 100644 --- a/aten/src/ATen/templates/DispatchKeyFunctions.h +++ b/aten/src/ATen/templates/DispatchKeyFunctions.h @@ -1,4 +1,10 @@ #include + +// TODO Undo all logic introduced for Note [Avoiding Include Cycles In Static Dispatch] +// Code introduced to avoid cyclic dependency in static dispatch is no longer +// needed as static dispatch logic is moved from TensorBody.h, which caused cycles in the first place, +// to Operators.cpp for supporting multiple backends with multiple kernels. +// // Note [Avoiding Include Cycles In Static Dispatch] // In order to avoid #include cycles in the static dispatch build, we've carefully split out // the static function definition files into {DispatchKey}Functions.h and {DispatchKey}Functions_inl.h. @@ -20,4 +26,4 @@ // - All other files that want the cpu fastpath functions can include CPUFunctions.h directly. // - This also means that static dispatch build, CPUFunctions.h only needs to // #include TensorBody.h, and it will automatically bring in CPUFunctions_inl.h. 
-${inline_headers_for_nonstatic_build} +${inline_headers} diff --git a/aten/src/ATen/templates/DispatchKeyNativeFunctions.cpp b/aten/src/ATen/templates/DispatchKeyNativeFunctions.cpp new file mode 100644 index 000000000000..1a5b4a452592 --- /dev/null +++ b/aten/src/ATen/templates/DispatchKeyNativeFunctions.cpp @@ -0,0 +1,9 @@ +// ${generated_comment} +${includes} +${native_functions_include} + +${namespace_prologue} + +${native_function_definitions} + +${namespace_epilogue} diff --git a/aten/src/ATen/templates/DispatchKeyNativeFunctions.h b/aten/src/ATen/templates/DispatchKeyNativeFunctions.h index abc3df27b93a..b45a17b5922f 100644 --- a/aten/src/ATen/templates/DispatchKeyNativeFunctions.h +++ b/aten/src/ATen/templates/DispatchKeyNativeFunctions.h @@ -1,13 +1,19 @@ #pragma once + +// an external backend might generate file within its code tree +// and check all the source files within the tree with clang-format. +// so, disable it since the backend might have a different config. +// clang-format off + // ${generated_comment} #include -namespace ${cpp_namespace} { +${namespace_prologue} struct ${class_name} { ${dispatch_declarations} }; -} // namespace ${cpp_namespace} +${namespace_epilogue} diff --git a/aten/src/ATen/templates/Functions.h b/aten/src/ATen/templates/Functions.h index 3313b90d51b0..fb531363f53e 100644 --- a/aten/src/ATen/templates/Functions.h +++ b/aten/src/ATen/templates/Functions.h @@ -62,14 +62,14 @@ #include #include #include +#include #include #include #include #include #include #include - -${static_dispatch_extra_headers} +#include #include #include diff --git a/aten/src/ATen/templates/LazyIr.h b/aten/src/ATen/templates/LazyIr.h new file mode 100644 index 000000000000..1ee90e66cc6c --- /dev/null +++ b/aten/src/ATen/templates/LazyIr.h @@ -0,0 +1,19 @@ +#pragma once + +// This file contains autogenerated LazyTensor IR nodes +${lazy_ir_sysinc} +${lazy_ir_inc} + +${namespace_prologue} +using at::operator<<; + +// kNullValue is used to contribute a static hash value any time +// a node has an Optional input that is nullopt. It is important +// to differentiate between HASH(nullopt, something) and HASH(something, nullopt), +// and using kNullValue in the hash function in the order of arguments +// serves this purpose. +static const torch::lazy::Value kNullValue = torch::lazy::Value(); + +${ir_declarations} + +${namespace_epilogue} diff --git a/aten/src/ATen/templates/MethodOperators.h b/aten/src/ATen/templates/MethodOperators.h index 4671efe519be..0e192cd05ef3 100644 --- a/aten/src/ATen/templates/MethodOperators.h +++ b/aten/src/ATen/templates/MethodOperators.h @@ -13,33 +13,7 @@ // Forward declarations of any types needed in the operator signatures. // We can't directly include these classes because it will cause circular include dependencies. // This file is included by TensorBody.h, which defines the Tensor class. 
-namespace c10 { - -template -class optional; -template -class List; -class Stream; -class Scalar; -struct Storage; -struct TensorOptions; - -} - -namespace at { - -class Tensor; -struct Dimname; -struct Generator; -using TensorList = c10::ArrayRef; -using DimnameList = c10::ArrayRef; -using c10::Stream; -using c10::Storage; -using c10::QScheme; -using c10::Scalar; -using c10::TensorOptions; - -} +#include ${MethodOperators_includes} diff --git a/aten/src/ATen/templates/NativeMetaFunctions.h b/aten/src/ATen/templates/NativeMetaFunctions.h index c83830f1eb10..89989e2121c9 100644 --- a/aten/src/ATen/templates/NativeMetaFunctions.h +++ b/aten/src/ATen/templates/NativeMetaFunctions.h @@ -3,6 +3,7 @@ // ${generated_comment} #include +#include #include #include diff --git a/aten/src/ATen/templates/Operator.h b/aten/src/ATen/templates/Operator.h index 15434af15bae..8b3989b66deb 100644 --- a/aten/src/ATen/templates/Operator.h +++ b/aten/src/ATen/templates/Operator.h @@ -2,40 +2,15 @@ // ${generated_comment} -#include #include #include - // Forward declarations of any types needed in the operator signatures. // We can't directly include these classes because it will cause circular include dependencies. // This file is included by TensorBody.h, which defines the Tensor class. -namespace c10 { - -template -class optional; -template -class List; -class Stream; -class Scalar; -struct Storage; -struct TensorOptions; - -} +#include namespace at { - -class Tensor; -struct Dimname; -struct Generator; -using TensorList = c10::ArrayRef; -using DimnameList = c10::ArrayRef; -using c10::Stream; -using c10::Storage; -using c10::QScheme; -using c10::Scalar; -using c10::TensorOptions; - namespace _ops { ${declarations} diff --git a/aten/src/ATen/templates/Operators.cpp b/aten/src/ATen/templates/Operators.cpp index e390de90d27a..082bb67c3e20 100644 --- a/aten/src/ATen/templates/Operators.cpp +++ b/aten/src/ATen/templates/Operators.cpp @@ -10,6 +10,8 @@ ${operator_headers} #endif +${static_dispatch_extra_headers} + namespace at { namespace _ops { ${definitions} diff --git a/aten/src/ATen/templates/Operators.h b/aten/src/ATen/templates/Operators.h index 3dc55a677106..e74b96ef3d5c 100644 --- a/aten/src/ATen/templates/Operators.h +++ b/aten/src/ATen/templates/Operators.h @@ -17,9 +17,12 @@ and see NOTE [TORCH_ASSERT_ONLY_METHOD_OPERATORS]. #endif +#include +#include #include #include #include +#include #include #include diff --git a/aten/src/ATen/templates/RedispatchFunctions.cpp b/aten/src/ATen/templates/RedispatchFunctions.cpp index e8d502dbdde5..58102bd97fca 100644 --- a/aten/src/ATen/templates/RedispatchFunctions.cpp +++ b/aten/src/ATen/templates/RedispatchFunctions.cpp @@ -6,8 +6,6 @@ #include #include -${static_dispatch_extra_headers} - namespace at { namespace redispatch { diff --git a/aten/src/ATen/templates/RegisterCodegenUnboxedKernels.cpp b/aten/src/ATen/templates/RegisterCodegenUnboxedKernels.cpp new file mode 100644 index 000000000000..279f987c66a2 --- /dev/null +++ b/aten/src/ATen/templates/RegisterCodegenUnboxedKernels.cpp @@ -0,0 +1,41 @@ +#include +#include +#include + +#include + +// ${generated_comment} + +// NOTE [Sharded File]: This file is generated in a sharded fashion to speed up +// incremental rebuilds. See the comment at the top of +// templates/VariableType.cpp for an analogous, in-depth discussion. +// +// Generated by tools/jit/gen_unboxing.py. This file registers all ATen ops into JIT op registry instead of c10 +// dispatcher. 
JIT op registry only takes boxed kernels, so we are calling unboxing functions in UnboxingFunctions.h +// to cast arguments into C++ types (instead of IValue) and delegate to unboxed kernels. + +namespace torch { namespace jit { + +using autograd::Variable; +using autograd::variable_list; +using at::Scalar; +using at::ScalarType; +using at::Tensor; +using at::TensorOptions; +using at::DeviceGuard; + +using ::c10::fmap; +using ::c10::filter; + +namespace { + +RegisterOperators reg({ + + // Generated operators + ${unboxed_ops} +}); + +} // anon namespace + + +}} // namespace torch::jit diff --git a/aten/src/ATen/templates/RegisterDispatchKey.cpp b/aten/src/ATen/templates/RegisterDispatchKey.cpp index 63a7a1a1a6c6..df00c0d0e4a3 100644 --- a/aten/src/ATen/templates/RegisterDispatchKey.cpp +++ b/aten/src/ATen/templates/RegisterDispatchKey.cpp @@ -5,6 +5,11 @@ #define __STDC_FORMAT_MACROS #endif +// an external backend might generate file within its code tree +// and check all the source files within the tree with clang-format. +// so, disable it since the backend might have a different config. +// clang-format off + // NOTE: This condition is true for all PyTorch internal libraries, it // just excludes external projects such as torch_xla which // re-use some of the PyTorch codegen machinery. @@ -57,12 +62,12 @@ namespace { ${dispatch_anonymous_definitions} -TORCH_LIBRARY_IMPL(aten, ${DispatchKey}, m) { - ${dispatch_registrations} -} +${static_init_dispatch_registrations} } // anonymous namespace +${deferred_dispatch_registrations} + namespace ${dispatch_namespace} { ${dispatch_namespaced_definitions} diff --git a/aten/src/ATen/templates/RegisterFunctionalization.cpp b/aten/src/ATen/templates/RegisterFunctionalization.cpp index 412d6f582e4a..3f08b1da436e 100644 --- a/aten/src/ATen/templates/RegisterFunctionalization.cpp +++ b/aten/src/ATen/templates/RegisterFunctionalization.cpp @@ -10,7 +10,13 @@ #include #include #else +// needed for the meta tensor calls to get stride info in functionalization #include +// needed for special handling of copy_(). +// See Note [functionalizating copy_() and not preserving strides] +#include +#include + $ops_headers #endif @@ -19,7 +25,8 @@ namespace functionalization { ${func_definitions} -} // namespace func + +} // namespace functionalization namespace { diff --git a/aten/src/ATen/templates/TensorBody.h b/aten/src/ATen/templates/TensorBody.h index 7cbffad063d6..6d09d68deb1f 100644 --- a/aten/src/ATen/templates/TensorBody.h +++ b/aten/src/ATen/templates/TensorBody.h @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -25,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -32,8 +32,11 @@ #include #include #include +#include +#include #include + #include namespace c10{ @@ -340,6 +343,10 @@ class TORCH_API Tensor: public TensorBase { return to(options().device(DeviceType::Metal), /*non_blocking*/ false, /*copy*/ false); } + Tensor meta() const { + return to(options().device(DeviceType::Meta), /*non_blocking*/ false, /*copy*/ false); + } + // ~~~~~ Autograd API ~~~~~ /// \fn bool is_leaf() const; @@ -628,8 +635,7 @@ Tensor make_tensor(Args&&... 
args) { } // namespace at -// See Note [Avoiding Include Cycles In Static Dispatch] -${static_dispatch_ops_headers} + namespace at { ${tensor_method_definitions} } // namespace at @@ -674,7 +680,7 @@ struct MaybeOwnedTraits { return &borrow; } - static bool debugBorrowIsValid(const borrow_type& borrow) { + static bool debugBorrowIsValid(const borrow_type& /*borrow*/) { return true; } }; diff --git a/aten/src/ATen/templates/TensorMethods.cpp b/aten/src/ATen/templates/TensorMethods.cpp index 29a43a657bb3..dd8f3c384176 100644 --- a/aten/src/ATen/templates/TensorMethods.cpp +++ b/aten/src/ATen/templates/TensorMethods.cpp @@ -7,7 +7,9 @@ namespace at { template <> \ TORCH_API T* TensorBase::data_ptr() const { \ TORCH_CHECK( \ - scalar_type() == ScalarType::name, \ + scalar_type() == ScalarType::name \ + || (isQIntType(scalar_type()) \ + && toUnderlying(scalar_type()) == ScalarType::name), \ "expected scalar type " \ #name \ " but found ", \ @@ -15,7 +17,7 @@ namespace at { return this->unsafeGetTensorImpl()->data_ptr_impl(); \ } - AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(DEFINE_CAST) + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_CAST) AT_FORALL_QINT_TYPES(DEFINE_CAST) #undef DEFINE_CAST @@ -25,7 +27,7 @@ namespace at { return item().to##name(); \ } - AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(DEFINE_ITEM) + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_ITEM) #undef DEFINE_ITEM } //namespace at diff --git a/aten/src/ATen/templates/UfuncCPU.cpp b/aten/src/ATen/templates/UfuncCPU.cpp new file mode 100644 index 000000000000..6b363a508907 --- /dev/null +++ b/aten/src/ATen/templates/UfuncCPU.cpp @@ -0,0 +1,19 @@ +#define TORCH_ASSERT_NO_OPERATORS + +#include +#include +#include + +namespace at { + +// NB: this is explicitly copied here (via codegen) rather than +// included via NativeFunctions.h to avoid recompiling this file when +// NativeFunctions.h changes +namespace meta { +${meta_declaration} +} + +namespace native { +${native_declaration} +${native_definitions} +}} // namespace at::native diff --git a/aten/src/ATen/templates/UfuncCPUKernel.cpp b/aten/src/ATen/templates/UfuncCPUKernel.cpp new file mode 100644 index 000000000000..0cac55664d61 --- /dev/null +++ b/aten/src/ATen/templates/UfuncCPUKernel.cpp @@ -0,0 +1,14 @@ +#define TORCH_ASSERT_NO_OPERATORS + +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +${native_definitions} +}} // namespace at::native diff --git a/aten/src/ATen/templates/UfuncCUDA.cu b/aten/src/ATen/templates/UfuncCUDA.cu new file mode 100644 index 000000000000..e75d82d9cc84 --- /dev/null +++ b/aten/src/ATen/templates/UfuncCUDA.cu @@ -0,0 +1,21 @@ +#define TORCH_ASSERT_NO_OPERATORS + +#include +#include +#include +#include +${cuda_headers} + +namespace at { + +// NB: this is explicitly copied here (via codegen) rather than +// included via NativeFunctions.h to avoid recompiling this file when +// NativeFunctions.h changes +namespace meta { +${meta_declaration} +} + +namespace native { +${native_declaration} +${native_definitions} +}} // namespace at::native diff --git a/aten/src/ATen/templates/UnboxingFunctions.cpp b/aten/src/ATen/templates/UnboxingFunctions.cpp new file mode 100644 index 000000000000..86c13235d862 --- /dev/null +++ b/aten/src/ATen/templates/UnboxingFunctions.cpp @@ -0,0 +1,35 @@ +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +namespace at { 
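UnboxingFunctions.cpp and RegisterCodegenUnboxedKernels.cpp above exist only to bridge the JIT's boxed calling convention (a stack of IValues) to ordinary typed C++ kernels. A self-contained mock of that bridge; BoxedValue, Stack, add_unboxed and add_boxed below are stand-ins for illustration, not the real c10::IValue or torch::jit::Stack API:

#include <cassert>
#include <variant>
#include <vector>

using BoxedValue = std::variant<long, double>;  // stand-in for c10::IValue
using Stack = std::vector<BoxedValue>;          // stand-in for torch::jit::Stack

long add_unboxed(long a, long b) { return a + b; }  // stand-in for a typed kernel

// Generated-style wrapper: pop type-erased arguments, convert to concrete
// C++ types, call the unboxed kernel, push the result back onto the stack.
void add_boxed(Stack& stack) {
  long b = std::get<long>(stack.back()); stack.pop_back();  // args were pushed
  long a = std::get<long>(stack.back()); stack.pop_back();  // left-to-right
  stack.push_back(BoxedValue{add_unboxed(a, b)});
}

int main() {
  Stack s{BoxedValue{2L}, BoxedValue{3L}};
  add_boxed(s);
  assert(std::get<long>(s.back()) == 5);
}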
+namespace unboxing { + +using ::c10::fmap; +using ::c10::filter; +using torch::jit::peek; +using torch::jit::drop; +using torch::jit::pack; +using torch::jit::pop; + +// Generated function declaration +${definitions} + +} // namespace unboxing +} // namespace at diff --git a/aten/src/ATen/templates/UnboxingFunctions.h b/aten/src/ATen/templates/UnboxingFunctions.h new file mode 100644 index 000000000000..a65469a9b012 --- /dev/null +++ b/aten/src/ATen/templates/UnboxingFunctions.h @@ -0,0 +1,32 @@ +// ${generated_comment} + +// Generated by tools/jit/gen_unboxing.py. This file declares code generated boxed C++ functions for operators, +// base off of native_functions.yaml (or similar yaml file with the same syntax). The definition of such a boxed +// function will pop out IValues from the stack then convert them into the correct C++ types based on given schema. This +// unboxing logic is an alternative to template-based metaprogramming unboxing. + +#pragma once + +#include +namespace at { +namespace unboxing { +namespace { + +template +std::array as_array(const c10::List& list) { + std::array res; + AT_ASSERT(list.size() == N); + std::vector vec; + for (c10::IValue elem : list) { + vec.push_back(elem.to()); + } + std::copy(vec.begin(), vec.end(), res.begin()); + return res; +} +} // namespace +using Stack = std::vector; +// Generated function declaration +${declarations} + +} // namespace unboxing +} // namespace at diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt index cda1262f6eb1..ce83898a50eb 100644 --- a/aten/src/ATen/test/CMakeLists.txt +++ b/aten/src/ATen/test/CMakeLists.txt @@ -84,7 +84,6 @@ list(APPEND ATen_HIP_TEST_SRCS # ${CMAKE_CURRENT_SOURCE_DIR}/hip/hip_stream_test.cpp list(APPEND ATen_VULKAN_TEST_SRCS - ${CMAKE_CURRENT_SOURCE_DIR}/vulkan_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/vulkan_api_test.cpp) list(APPEND ATen_MOBILE_TEST_SRCS diff --git a/aten/src/ATen/test/basic.cpp b/aten/src/ATen/test/basic.cpp index 6c2c977abd7e..d14e7cd36ab9 100644 --- a/aten/src/ATen/test/basic.cpp +++ b/aten/src/ATen/test/basic.cpp @@ -41,7 +41,9 @@ void TestOnesAndDot(DeprecatedTypeProperties& type) { Tensor b = ones({3, 4}, type); ASSERT_EQ_RESOLVED((b + b).sum().item(), 24); ASSERT_EQ_RESOLVED(b.numel(), 12); - ASSERT_EQ_RESOLVED(b.view(-1).dot(b.view(-1)).item(), 12); + if (type.backend() != Backend::CPU || type.scalarType() != kHalf) { + ASSERT_EQ_RESOLVED(b.view(-1).dot(b.view(-1)).item(), 12); + } } void TestSort(DeprecatedTypeProperties& type) { diff --git a/aten/src/ATen/test/cuda_atomic_ops_test.cu b/aten/src/ATen/test/cuda_atomic_ops_test.cu index 54d43ffec019..d5d261440064 100644 --- a/aten/src/ATen/test/cuda_atomic_ops_test.cu +++ b/aten/src/ATen/test/cuda_atomic_ops_test.cu @@ -1,6 +1,7 @@ #include #include #include +#include #include #include @@ -25,6 +26,24 @@ __global__ void mul_test_kernel(T * a, T * sum) { gpuAtomicMul(&sum[idx], a[idx]); } +template +__global__ void max_test_kernel(T * a, T * max) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int a_idx = (tid) % (arraysize * factor); + int idx = a_idx / factor; + + gpuAtomicMax(&max[idx], a[a_idx]); +} + +template +__global__ void min_test_kernel(T * a, T * min) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int a_idx = (tid) % (arraysize * factor); + int idx = a_idx / factor; + + gpuAtomicMin(&min[idx], a[a_idx]); +} + template void test_atomic_add() { dim3 dimBlock(blocksize, 1); @@ -75,7 +94,7 @@ void test_atomic_mul() { for (int i = 0; i < arraysize; ++i) { a[i] = 2; sum[i] = 
2; - answer[i] = pow(sum[i], static_cast(factor)); + answer[i] = pow(sum[i], static_cast(factor + 1)); } cudaMalloc((void**)&ad, arraysize * sizeof(T)); @@ -97,7 +116,88 @@ void test_atomic_mul() { cudaFree(sumd); } +template +void test_atomic_max() { + dim3 dimBlock(blocksize, 1); + dim3 dimGrid(1, 1); + + T *ad, *sumd; + + std::vector a(arraysize * factor); + std::vector sum(arraysize); + std::vector answer(arraysize); + + int j; + for (int i = 0; i < arraysize * factor; ++i) { + a[i] = i; + if (i % factor == 0) { + j = i / factor; + sum[j] = std::numeric_limits::lowest(); + answer[j] = (j + 1) * factor - 1; + } + } + + cudaMalloc((void**)&ad, arraysize * factor * sizeof(T)); + cudaMalloc((void**)&sumd, arraysize * sizeof(T)); + + cudaMemcpy(ad, a.data(), arraysize * factor * sizeof(T), cudaMemcpyHostToDevice); + cudaMemcpy(sumd, sum.data(), arraysize * sizeof(T), cudaMemcpyHostToDevice); + + max_test_kernel<<>>(ad, sumd); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + cudaMemcpy(sum.data(), sumd, arraysize * sizeof(T), cudaMemcpyDeviceToHost); + + for (int i = 0; i < arraysize; ++i) { + ASSERT_EQ(sum[i], answer[i]) << typeid(T).name(); + } + + cudaFree(ad); + cudaFree(sumd); +} + +template +void test_atomic_min() { + dim3 dimBlock(blocksize, 1); + dim3 dimGrid(1, 1); + + T *ad, *sumd; + + std::vector a(arraysize * factor); + std::vector sum(arraysize); + std::vector answer(arraysize); + + int j; + for (int i = 0; i < arraysize * factor; ++i) { + a[i] = i; + if (i % factor == 0) { + j = i / factor; + sum[j] = std::numeric_limits::max(); + answer[j] = j * factor; + } + } + + cudaMalloc((void**)&ad, arraysize * factor * sizeof(T)); + cudaMalloc((void**)&sumd, arraysize * sizeof(T)); + + cudaMemcpy(ad, a.data(), arraysize * factor * sizeof(T), cudaMemcpyHostToDevice); + cudaMemcpy(sumd, sum.data(), arraysize * sizeof(T), cudaMemcpyHostToDevice); + + min_test_kernel<<>>(ad, sumd); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + cudaMemcpy(sum.data(), sumd, arraysize * sizeof(T), cudaMemcpyDeviceToHost); + + for (int i = 0; i < arraysize; ++i) { + ASSERT_EQ(sum[i], answer[i]) << typeid(T).name(); + } + + cudaFree(ad); + cudaFree(sumd); +} + TEST(TestAtomicOps, TestAtomicAdd) { + if (!at::cuda::is_available()) return; test_atomic_add(); test_atomic_add(); test_atomic_add(); @@ -113,8 +213,25 @@ TEST(TestAtomicOps, TestAtomicAdd) { } TEST(TestAtomicOps, DISABLED_ON_WINDOWS(TestAtomicMul)) { + if (!at::cuda::is_available()) return; test_atomic_mul(); test_atomic_mul(); test_atomic_mul(); test_atomic_mul(); } + +TEST(TestAtomicOps, DISABLED_ON_WINDOWS(TestAtomicMax)) { + if (!at::cuda::is_available()) return; + test_atomic_max(); + test_atomic_max(); + test_atomic_max(); + test_atomic_max(); +} + +TEST(TestAtomicOps, DISABLED_ON_WINDOWS(TestAtomicMin)) { + if (!at::cuda::is_available()) return; + test_atomic_min(); + test_atomic_min(); + test_atomic_min(); + test_atomic_min(); +} diff --git a/aten/src/ATen/test/cuda_half_test.cu b/aten/src/ATen/test/cuda_half_test.cu index a55d9458e851..aa1644c94b76 100644 --- a/aten/src/ATen/test/cuda_half_test.cu +++ b/aten/src/ATen/test/cuda_half_test.cu @@ -76,6 +76,13 @@ __device__ void test(){ assert(::abs(::isnan(Half(0.0)) - ::isnan(0.0f)) <= threshold); assert(::abs(::isinf(Half(0.0)) - ::isinf(0.0f)) <= threshold); #endif + + // test complex<32> + Half real = 3.0f; + Half imag = -10.0f; + auto complex = c10::complex(real, imag); + assert(complex.real() == real); + assert(complex.imag() == imag); } __global__ void kernel(){ diff --git a/aten/src/ATen/test/half_test.cpp 
b/aten/src/ATen/test/half_test.cpp index 652823e8e9b1..02ccb8b6ce5d 100644 --- a/aten/src/ATen/test/half_test.cpp +++ b/aten/src/ATen/test/half_test.cpp @@ -164,3 +164,11 @@ TEST(TestHalf, CommonMath) { assert(std::abs(std::isinf(Half(0.0)) - std::isinf(0.0f)) <= threshold); #endif } + +TEST(TestHalf, ComplexHalf) { + Half real = 3.0f; + Half imag = -10.0f; + auto complex = c10::complex(real, imag); + assert(complex.real() == real); + assert(complex.imag() == imag); +} diff --git a/aten/src/ATen/test/ivalue_test.cpp b/aten/src/ATen/test/ivalue_test.cpp index f86bcec92d03..08312305975c 100644 --- a/aten/src/ATen/test/ivalue_test.cpp +++ b/aten/src/ATen/test/ivalue_test.cpp @@ -401,7 +401,6 @@ TEST(IValueTest, FutureSetError) { } } - TEST(IValueTest, ValueEquality) { EXPECT_EQ(IValue("asdf"), IValue("asdf")); EXPECT_NE(IValue("asdf"), IValue("ASDF")); @@ -804,6 +803,23 @@ TEST(IValueTest, ToWeakAndBack) { } } +// Storage and Generator did not set is_intrusive_ptr if they were +// undefined, which led use_count to return 1 instead of 0 for these +// cases. +TEST(IValueTest, UseCountCornerCases) { + at::Storage undefinedStorage; + at::Generator undefinedGenerator; + at::Tensor undefinedTensor; + + IValue ivEmptyStorage(undefinedStorage); + IValue ivEmptyGenerator(undefinedGenerator); + IValue ivEmptyTensor(undefinedTensor); + + ASSERT_EQ(1, ivEmptyStorage.use_count()); + ASSERT_EQ(1, ivEmptyGenerator.use_count()); + ASSERT_EQ(0, ivEmptyTensor.use_count()); +} + // TODO(gmagogsfm): Add type conversion test? using ivalue::TupleElements; diff --git a/aten/src/ATen/test/scalar_tensor_test.cpp b/aten/src/ATen/test/scalar_tensor_test.cpp index 6a3253c50548..ebe35b9a6a6b 100644 --- a/aten/src/ATen/test/scalar_tensor_test.cpp +++ b/aten/src/ATen/test/scalar_tensor_test.cpp @@ -295,3 +295,11 @@ TEST(TestScalarTensor, TestScalarTensorCUDA) { test(CUDA(kFloat)); } } + +TEST(TestScalarTensor, TestScalarTensorMPS) { + manual_seed(123); + + if (at::hasMPS()) { + test(MPS(kFloat)); + } +} diff --git a/aten/src/ATen/test/stride_properties_test.cpp b/aten/src/ATen/test/stride_properties_test.cpp index 09c13139fc4c..e37e7c13da42 100644 --- a/aten/src/ATen/test/stride_properties_test.cpp +++ b/aten/src/ATen/test/stride_properties_test.cpp @@ -69,10 +69,25 @@ TEST(StridePropertiesTest, ZeroStrideIndicesEagerConsistencyTest) { } TEST(StridePropertiesTest, ExpandedStrideIndicesTest) { - // NOLINTNEXTLINE(performance-for-range-copy) Tensor t = at::rand({1}); // note: expand with dimension of size 1 is tricky as stride is different // depending on the order of the unsqueezed dimension. 
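Backing up to the max/min kernels added to cuda_atomic_ops_test.cu above: each output slot j reduces the `factor` consecutive inputs j*factor .. (j+1)*factor - 1, so the expected atomic max is (j+1)*factor - 1 and the expected min is j*factor. A CPU-only sketch of that arithmetic (plain std::max_element / std::min_element, no CUDA assumed):

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  const int arraysize = 4, factor = 3;
  std::vector<int> a(arraysize * factor);
  for (int i = 0; i < arraysize * factor; ++i) a[i] = i;  // same fill as the test

  for (int j = 0; j < arraysize; ++j) {
    const auto first = a.begin() + j * factor;            // group j
    const auto last  = first + factor;
    assert(*std::max_element(first, last) == (j + 1) * factor - 1);
    assert(*std::min_element(first, last) == j * factor);
  }
}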
t = t.expand({4, 4, 4}); EXPECT_TRUE(CheckStrideIndices(t, at::MemoryFormat::Contiguous)); } + +TEST(StridePropertiesTest, SlicedStrideIndicesTest) { + // Sliced tensor shouldn't have changed stride order + Tensor t = at::rand({16, 4}).slice(1, 0, 4, 4); + + auto temp = TensorType::create(c10::nullopt, c10::nullopt, t.sizes(), t.strides(), c10::nullopt); + TORCH_INTERNAL_ASSERT(temp->stride_properties().isComplete() && + temp->stride_properties().isComplete(), "complete stride properties is needed for the test"); + std::vector stride_indices(2); + std::iota(stride_indices.rbegin(), stride_indices.rend(), 0); + + auto index_iter = stride_indices.begin(); + for (const auto& opt_stride : *temp->stride_properties().sizes()) { + EXPECT_TRUE(*index_iter++ == opt_stride->stride_index_.value()); + } +} diff --git a/aten/src/ATen/test/vmap_test.cpp b/aten/src/ATen/test/vmap_test.cpp index 0d325906325a..1feafaa59f3a 100644 --- a/aten/src/ATen/test/vmap_test.cpp +++ b/aten/src/ATen/test/vmap_test.cpp @@ -728,7 +728,7 @@ TEST(VmapTest, TestBatchedTensorExpand) { // logical dim is 0, expand size has same dimensionality as logical dim auto tensor = at::randn({2, 3}); auto batched = makeBatched(tensor, {{0, 0}, {1, 1}}); - auto batched_out = batched.expand({}); + auto batched_out = batched.expand(c10::IntArrayRef({})); const auto& out = maybeGetBatchedImpl(batched_out)->value(); ASSERT_EQ(out.data_ptr(), tensor.data_ptr()); ASSERT_TRUE(at::allclose(out, tensor)); diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp index ec2b1bd12526..f13f673cb2b9 100644 --- a/aten/src/ATen/test/vulkan_api_test.cpp +++ b/aten/src/ATen/test/vulkan_api_test.cpp @@ -2,6 +2,9 @@ #include #include +#include +#include +#include #include // TODO: These functions should move to a common place. @@ -64,7 +67,7 @@ void showRtol(const at::Tensor& a, const at::Tensor& b) { } -static void gen_allpermutations(std::vector>& out, std::vector in, int i) { +static void gen_allpermutations(std::vector>& out, std::vector in, unsigned i) { // generate all permutations of a given dims if (i == in.size()) { out.push_back(in); @@ -137,11 +140,49 @@ static void clone_test(const std::vector& size, c10::optional +inline std::vector makeStack(Inputs&&... inputs) { + return {std::forward(inputs)...}; +} + +template +inline std::vector callOpByHandle( + const c10::OperatorHandle& op, + Args... args) { + auto stack = makeStack(std::forward(args)...); + c10::Dispatcher::singleton().callBoxed(op, &stack); + return stack; +} + +template +inline std::vector callOpByName( + const char* func_name, + const char* overload_name, + Args... 
args) { + const c10::optional op_handle = + c10::Dispatcher::singleton().findSchema({func_name, overload_name}); + assert(op_handle.has_value()); + return callOpByHandle(op_handle.value(), std::forward(args)...); +} + } // namespace namespace { -TEST(VulkanAPITest, adaptive_avg_pool2d) { +class VulkanAPITest : public ::testing::Test { +public: +#if defined (__ANDROID__) // to avoid `Undefined symbols for architecture arm64` error + static void SetUpTestSuite() { + at::native::vulkan::api::context()->querypool().enable(); + } + + static void TearDownTestSuite() { + at::native::vulkan::api::context()->querypool().disable(false); + } +#endif +}; + +TEST_F(VulkanAPITest, adaptive_avg_pool2d) { if (!at::is_vulkan_available()) { return; } @@ -159,7 +200,7 @@ TEST(VulkanAPITest, adaptive_avg_pool2d) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, add) { +TEST_F(VulkanAPITest, add) { if (!at::is_vulkan_available()) { return; } @@ -181,7 +222,7 @@ TEST(VulkanAPITest, add) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, add_broadcast0) { +TEST_F(VulkanAPITest, add_broadcast0) { if (!at::is_vulkan_available()) { return; } @@ -203,7 +244,7 @@ TEST(VulkanAPITest, add_broadcast0) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, add_broadcast1) { +TEST_F(VulkanAPITest, add_broadcast1) { if (!at::is_vulkan_available()) { return; } @@ -225,7 +266,7 @@ TEST(VulkanAPITest, add_broadcast1) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, add_broadcast2) { +TEST_F(VulkanAPITest, add_broadcast2) { if (!at::is_vulkan_available()) { return; } @@ -247,7 +288,7 @@ TEST(VulkanAPITest, add_broadcast2) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, add_) { +TEST_F(VulkanAPITest, add_) { if (!at::is_vulkan_available()) { return; } @@ -269,7 +310,7 @@ TEST(VulkanAPITest, add_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, add_broadcast0_) { +TEST_F(VulkanAPITest, add_broadcast0_) { if (!at::is_vulkan_available()) { return; } @@ -291,7 +332,7 @@ TEST(VulkanAPITest, add_broadcast0_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, add_broadcast1_) { +TEST_F(VulkanAPITest, add_broadcast1_) { if (!at::is_vulkan_available()) { return; } @@ -313,7 +354,7 @@ TEST(VulkanAPITest, add_broadcast1_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, add_scalar) { +TEST_F(VulkanAPITest, add_scalar) { if (!at::is_vulkan_available()) { return; } @@ -334,7 +375,7 @@ TEST(VulkanAPITest, add_scalar) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, add_scalar_) { +TEST_F(VulkanAPITest, add_scalar_) { if (!at::is_vulkan_available()) { return; } @@ -355,7 +396,7 @@ TEST(VulkanAPITest, add_scalar_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, addmm) { +TEST_F(VulkanAPITest, addmm) { if (!at::is_vulkan_available()) { return; } @@ -379,7 +420,7 @@ TEST(VulkanAPITest, addmm) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, addmm_expand) { +TEST_F(VulkanAPITest, addmm_expand) { if (!at::is_vulkan_available()) { return; } @@ -403,7 +444,7 @@ TEST(VulkanAPITest, addmm_expand) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, avg_pool2d) { +TEST_F(VulkanAPITest, avg_pool2d) { if (!at::is_vulkan_available()) { return; } @@ -420,7 +461,7 @@ TEST(VulkanAPITest, avg_pool2d) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, clamp) { +TEST_F(VulkanAPITest, clamp) { if (!at::is_vulkan_available()) { return; } @@ -442,7 +483,7 @@ TEST(VulkanAPITest, clamp) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, clamp_) { +TEST_F(VulkanAPITest, clamp_) { if (!at::is_vulkan_available()) { return; } @@ -464,7 +505,7 @@ TEST(VulkanAPITest, clamp_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, conv2d) { 
+TEST_F(VulkanAPITest, conv2d) { if (!at::is_vulkan_available()) { return; } @@ -537,7 +578,7 @@ TEST(VulkanAPITest, conv2d) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, conv2d_dw) { +TEST_F(VulkanAPITest, conv2d_dw) { if (!at::is_vulkan_available()) { return; } @@ -609,7 +650,7 @@ TEST(VulkanAPITest, conv2d_dw) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, conv2d_pw) { +TEST_F(VulkanAPITest, conv2d_pw) { if (!at::is_vulkan_available()) { return; } @@ -681,7 +722,7 @@ TEST(VulkanAPITest, conv2d_pw) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, conv2d_winograd) { +TEST_F(VulkanAPITest, conv2d_winograd) { if (!at::is_vulkan_available()) { return; } @@ -753,7 +794,7 @@ TEST(VulkanAPITest, conv2d_winograd) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, copy) { +TEST_F(VulkanAPITest, copy) { if (!at::is_vulkan_available()) { return; } @@ -769,7 +810,7 @@ TEST(VulkanAPITest, copy) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, div) { +TEST_F(VulkanAPITest, div) { if (!at::is_vulkan_available()) { return; } @@ -791,7 +832,7 @@ TEST(VulkanAPITest, div) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, div_broadcast0) { +TEST_F(VulkanAPITest, div_broadcast0) { if (!at::is_vulkan_available()) { return; } @@ -813,7 +854,7 @@ TEST(VulkanAPITest, div_broadcast0) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, div_broadcast1) { +TEST_F(VulkanAPITest, div_broadcast1) { if (!at::is_vulkan_available()) { return; } @@ -835,7 +876,7 @@ TEST(VulkanAPITest, div_broadcast1) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, div_broadcast2) { +TEST_F(VulkanAPITest, div_broadcast2) { if (!at::is_vulkan_available()) { return; } @@ -857,7 +898,7 @@ TEST(VulkanAPITest, div_broadcast2) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, div_) { +TEST_F(VulkanAPITest, div_) { if (!at::is_vulkan_available()) { return; } @@ -879,7 +920,7 @@ TEST(VulkanAPITest, div_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, div_broadcast0_) { +TEST_F(VulkanAPITest, div_broadcast0_) { if (!at::is_vulkan_available()) { return; } @@ -901,7 +942,7 @@ TEST(VulkanAPITest, div_broadcast0_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, div_broadcast1_) { +TEST_F(VulkanAPITest, div_broadcast1_) { if (!at::is_vulkan_available()) { return; } @@ -923,7 +964,7 @@ TEST(VulkanAPITest, div_broadcast1_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, div_scalar) { +TEST_F(VulkanAPITest, div_scalar) { if (!at::is_vulkan_available()) { return; } @@ -944,7 +985,7 @@ TEST(VulkanAPITest, div_scalar) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, div_scalar_) { +TEST_F(VulkanAPITest, div_scalar_) { if (!at::is_vulkan_available()) { return; } @@ -965,7 +1006,7 @@ TEST(VulkanAPITest, div_scalar_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, empty) { +TEST_F(VulkanAPITest, empty) { if (!at::is_vulkan_available()) { return; } @@ -973,7 +1014,7 @@ TEST(VulkanAPITest, empty) { ASSERT_NO_THROW(at::empty({1, 17, 41, 53}, at::device(at::kVulkan).dtype(at::kFloat))); } -TEST(VulkanAPITest, hardsigmoid) { +TEST_F(VulkanAPITest, hardsigmoid) { if (!at::is_vulkan_available()) { return; } @@ -992,7 +1033,7 @@ TEST(VulkanAPITest, hardsigmoid) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, hardsigmoid_) { +TEST_F(VulkanAPITest, hardsigmoid_) { if (!at::is_vulkan_available()) { return; } @@ -1011,7 +1052,7 @@ TEST(VulkanAPITest, hardsigmoid_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, hardshrink) { +TEST_F(VulkanAPITest, hardshrink) { if (!at::is_vulkan_available()) { return; } @@ -1033,7 +1074,7 @@ TEST(VulkanAPITest, hardshrink) { } } -TEST(VulkanAPITest, hardshrink_) { +TEST_F(VulkanAPITest, 
hardshrink_) { if (!at::is_vulkan_available()) { return; } @@ -1054,7 +1095,7 @@ TEST(VulkanAPITest, hardshrink_) { } } -TEST(VulkanAPITest, leaky_relu) { +TEST_F(VulkanAPITest, leaky_relu) { if (!at::is_vulkan_available()) { return; } @@ -1076,7 +1117,7 @@ TEST(VulkanAPITest, leaky_relu) { } } -TEST(VulkanAPITest, leaky_relu_) { +TEST_F(VulkanAPITest, leaky_relu_) { if (!at::is_vulkan_available()) { return; } @@ -1097,7 +1138,205 @@ TEST(VulkanAPITest, leaky_relu_) { } } -TEST(VulkanAPITest, hardswish) { +TEST_F(VulkanAPITest, lerp) { + if (!at::is_vulkan_available()) { + return; + } + + const auto a_cpu = at::rand({11, 7, 139, 109}, at::device(at::kCPU).dtype(at::kFloat)); + const auto a_vulkan = a_cpu.vulkan(); + + const auto b_cpu = at::rand({11, 7, 139, 109}, at::device(at::kCPU).dtype(at::kFloat)); + const auto b_vulkan = b_cpu.vulkan(); + + const auto w_cpu = at::rand({11, 7, 139, 109}, at::device(at::kCPU).dtype(at::kFloat)); + const auto w_vulkan = w_cpu.vulkan(); + + const auto c_cpu = at::lerp(a_cpu, b_cpu, w_cpu); + const auto c_vulkan = at::lerp(a_vulkan, b_vulkan, w_vulkan); + + const auto check = almostEqual(c_cpu, c_vulkan.cpu()); + if (!check) { + showRtol(c_cpu, c_vulkan.cpu()); + } + + ASSERT_TRUE(check); +} + +TEST_F(VulkanAPITest, lerp_broadcast0) { + if (!at::is_vulkan_available()) { + return; + } + + const auto a_cpu = at::rand({3, 5, 179, 221}, at::device(at::kCPU).dtype(at::kFloat)); + const auto a_vulkan = a_cpu.vulkan(); + + const auto b_cpu = at::rand({3, 5, 1, 1}, at::device(at::kCPU).dtype(at::kFloat)); + const auto b_vulkan = b_cpu.vulkan(); + + const auto w_cpu = at::rand({3, 5, 1, 221}, at::device(at::kCPU).dtype(at::kFloat)); + const auto w_vulkan = w_cpu.vulkan(); + + const auto c_cpu = at::lerp(a_cpu, b_cpu, w_cpu); + const auto c_vulkan = at::lerp(a_vulkan, b_vulkan, w_vulkan); + + const auto check = almostEqual(c_cpu, c_vulkan.cpu()); + if (!check) { + showRtol(c_cpu, c_vulkan.cpu()); + } + + ASSERT_TRUE(check); +} + +TEST_F(VulkanAPITest, lerp_broadcast1) { + if (!at::is_vulkan_available()) { + return; + } + + const auto a_cpu = at::rand({3, 4, 179, 221}, at::device(at::kCPU).dtype(at::kFloat)); + const auto a_vulkan = a_cpu.vulkan(); + + const auto b_cpu = at::rand({4, 179, 221}, at::device(at::kCPU).dtype(at::kFloat)); + const auto b_vulkan = b_cpu.vulkan(); + + const auto w_cpu = at::rand({4, 179, 221}, at::device(at::kCPU).dtype(at::kFloat)); + const auto w_vulkan = w_cpu.vulkan(); + + const auto c_cpu = at::lerp(a_cpu, b_cpu, w_cpu); + const auto c_vulkan = at::lerp(a_vulkan, b_vulkan, w_vulkan); + + const auto check = almostEqual(c_cpu, c_vulkan.cpu()); + if (!check) { + showRtol(c_cpu, c_vulkan.cpu()); + } + + ASSERT_TRUE(check); +} + +TEST_F(VulkanAPITest, lerp_) { + if (!at::is_vulkan_available()) { + return; + } + + auto a_cpu = at::rand({61, 17, 29, 83}, at::device(at::kCPU).dtype(at::kFloat)); + auto a_vulkan = a_cpu.vulkan(); + + const auto b_cpu = at::rand({61, 17, 29, 83}, at::device(at::kCPU).dtype(at::kFloat)); + const auto b_vulkan = b_cpu.vulkan(); + + const auto w_cpu = at::rand({61, 17, 29, 83}, at::device(at::kCPU).dtype(at::kFloat)); + const auto w_vulkan = w_cpu.vulkan(); + + a_cpu.lerp_(b_cpu, w_cpu); + a_vulkan.lerp_(b_vulkan, w_vulkan); + + const auto check = almostEqual(a_cpu, a_vulkan.cpu()); + if (!check) { + showRtol(a_cpu, a_vulkan.cpu()); + } + + ASSERT_TRUE(check); +} + +TEST_F(VulkanAPITest, lerp_broadcast0_) { + if (!at::is_vulkan_available()) { + return; + } + + auto a_cpu = at::rand({3, 5, 179, 221}, 
at::device(at::kCPU).dtype(at::kFloat)); + auto a_vulkan = a_cpu.vulkan(); + + const auto b_cpu = at::rand({3, 5, 1, 1}, at::device(at::kCPU).dtype(at::kFloat)); + const auto b_vulkan = b_cpu.vulkan(); + + const auto w_cpu = at::rand({3, 5, 1, 221}, at::device(at::kCPU).dtype(at::kFloat)); + const auto w_vulkan = w_cpu.vulkan(); + + a_cpu.lerp_(b_cpu, w_cpu); + a_vulkan.lerp_(b_vulkan, w_vulkan); + + const auto check = almostEqual(a_cpu, a_vulkan.cpu()); + if (!check) { + showRtol(a_cpu, a_vulkan.cpu()); + } + + ASSERT_TRUE(check); +} + +TEST_F(VulkanAPITest, lerp_broadcast1_) { + if (!at::is_vulkan_available()) { + return; + } + + auto a_cpu = at::rand({3, 4, 179, 221}, at::device(at::kCPU).dtype(at::kFloat)); + auto a_vulkan = a_cpu.vulkan(); + + const auto b_cpu = at::rand({4, 179, 221}, at::device(at::kCPU).dtype(at::kFloat)); + const auto b_vulkan = b_cpu.vulkan(); + + const auto w_cpu = at::rand({4, 179, 221}, at::device(at::kCPU).dtype(at::kFloat)); + const auto w_vulkan = w_cpu.vulkan(); + + a_cpu.lerp_(b_cpu, w_cpu); + a_vulkan.lerp_(b_vulkan, w_vulkan); + + const auto check = almostEqual(a_cpu, a_vulkan.cpu()); + if (!check) { + showRtol(a_cpu, a_vulkan.cpu()); + } + + ASSERT_TRUE(check); +} + +TEST_F(VulkanAPITest, lerp_scalar) { + if (!at::is_vulkan_available()) { + return; + } + + const auto a_cpu = at::rand({13, 23, 59, 73}, at::device(at::kCPU).dtype(at::kFloat)); + const auto a_vulkan = a_cpu.vulkan(); + + const auto b_cpu = at::rand({13, 23, 59, 73}, at::device(at::kCPU).dtype(at::kFloat)); + const auto b_vulkan = b_cpu.vulkan(); + + const float w_scalar = 3.1415f; + + const auto c_cpu = at::lerp(a_cpu, b_cpu, w_scalar); + const auto c_vulkan = at::lerp(a_vulkan, b_vulkan, w_scalar); + + const auto check = almostEqual(c_cpu, c_vulkan.cpu()); + if (!check) { + showRtol(c_cpu, c_vulkan.cpu()); + } + + ASSERT_TRUE(check); +} + +TEST_F(VulkanAPITest, lerp_scalar_) { + if (!at::is_vulkan_available()) { + return; + } + + auto a_cpu = at::rand({47, 2, 23, 97}, at::device(at::kCPU).dtype(at::kFloat)); + auto a_vulkan = a_cpu.vulkan(); + + const auto b_cpu = at::rand({47, 2, 23, 97}, at::device(at::kCPU).dtype(at::kFloat)); + const auto b_vulkan = b_cpu.vulkan(); + + const float w_scalar = 3.1415f; + + a_cpu.lerp_(b_cpu, w_scalar); + a_vulkan.lerp_(b_vulkan, w_scalar); + + const auto check = almostEqual(a_cpu, a_vulkan.cpu()); + if (!check) { + showRtol(a_cpu, a_vulkan.cpu()); + } + + ASSERT_TRUE(check); +} + +TEST_F(VulkanAPITest, hardswish) { if (!at::is_vulkan_available()) { return; } @@ -1116,7 +1355,7 @@ TEST(VulkanAPITest, hardswish) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, hardswish_) { +TEST_F(VulkanAPITest, hardswish_) { if (!at::is_vulkan_available()) { return; } @@ -1135,7 +1374,7 @@ TEST(VulkanAPITest, hardswish_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, max_pool2d) { +TEST_F(VulkanAPITest, max_pool2d) { if (!at::is_vulkan_available()) { return; } @@ -1153,7 +1392,7 @@ TEST(VulkanAPITest, max_pool2d) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, mean) { +TEST_F(VulkanAPITest, mean) { const auto in_cpu = at::rand({17, 3, 79, 53}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); const auto out_cpu = at::mean(in_cpu, {-1, -2}, true); @@ -1168,7 +1407,7 @@ TEST(VulkanAPITest, mean) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, mean2d) { +TEST_F(VulkanAPITest, mean2d) { const auto in_cpu = at::rand({11, 7, 173, 37}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); const auto out_cpu = at::mean(in_cpu, {-1, -2}, false); @@ -1183,7 +1422,7 @@ TEST(VulkanAPITest, mean2d) { 
ASSERT_TRUE(check); } -TEST(VulkanAPITest, mm) { +TEST_F(VulkanAPITest, mm) { if (!at::is_vulkan_available()) { return; } @@ -1203,7 +1442,7 @@ TEST(VulkanAPITest, mm) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, mul) { +TEST_F(VulkanAPITest, mul) { if (!at::is_vulkan_available()) { return; } @@ -1225,7 +1464,7 @@ TEST(VulkanAPITest, mul) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, mul_broadcast0) { +TEST_F(VulkanAPITest, mul_broadcast0) { if (!at::is_vulkan_available()) { return; } @@ -1247,7 +1486,7 @@ TEST(VulkanAPITest, mul_broadcast0) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, mul_broadcast1) { +TEST_F(VulkanAPITest, mul_broadcast1) { if (!at::is_vulkan_available()) { return; } @@ -1269,7 +1508,7 @@ TEST(VulkanAPITest, mul_broadcast1) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, mul_broadcast2) { +TEST_F(VulkanAPITest, mul_broadcast2) { if (!at::is_vulkan_available()) { return; } @@ -1291,7 +1530,7 @@ TEST(VulkanAPITest, mul_broadcast2) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, mul_) { +TEST_F(VulkanAPITest, mul_) { if (!at::is_vulkan_available()) { return; } @@ -1313,7 +1552,7 @@ TEST(VulkanAPITest, mul_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, mul_broadcast0_) { +TEST_F(VulkanAPITest, mul_broadcast0_) { if (!at::is_vulkan_available()) { return; } @@ -1335,7 +1574,7 @@ TEST(VulkanAPITest, mul_broadcast0_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, mul_broadcast1_) { +TEST_F(VulkanAPITest, mul_broadcast1_) { if (!at::is_vulkan_available()) { return; } @@ -1357,7 +1596,7 @@ TEST(VulkanAPITest, mul_broadcast1_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, mul_scalar) { +TEST_F(VulkanAPITest, mul_scalar) { if (!at::is_vulkan_available()) { return; } @@ -1378,7 +1617,7 @@ TEST(VulkanAPITest, mul_scalar) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, mul_scalar_) { +TEST_F(VulkanAPITest, mul_scalar_) { if (!at::is_vulkan_available()) { return; } @@ -1399,7 +1638,7 @@ TEST(VulkanAPITest, mul_scalar_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, reflection_pad2d) { +TEST_F(VulkanAPITest, reflection_pad2d) { if (!at::is_vulkan_available()) { return; } @@ -1418,7 +1657,7 @@ TEST(VulkanAPITest, reflection_pad2d) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, reshape) { +TEST_F(VulkanAPITest, reshape) { if (!at::is_vulkan_available()) { return; } @@ -1440,7 +1679,7 @@ TEST(VulkanAPITest, reshape) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, reshape_) { +TEST_F(VulkanAPITest, reshape_) { if (!at::is_vulkan_available()) { return; } @@ -1462,7 +1701,7 @@ TEST(VulkanAPITest, reshape_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, sigmoid) { +TEST_F(VulkanAPITest, sigmoid) { if (!at::is_vulkan_available()) { return; } @@ -1481,7 +1720,7 @@ TEST(VulkanAPITest, sigmoid) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, sigmoid_) { +TEST_F(VulkanAPITest, sigmoid_) { if (!at::is_vulkan_available()) { return; } @@ -1500,7 +1739,7 @@ TEST(VulkanAPITest, sigmoid_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, softmax) { +TEST_F(VulkanAPITest, softmax) { at::Tensor test_in[] = { at::rand({1, 196, 302, 5}, at::TensorOptions(at::kCPU).dtype(at::kFloat)), at::rand({1, 197, 302, 5}, at::TensorOptions(at::kCPU).dtype(at::kFloat)), @@ -1523,7 +1762,7 @@ TEST(VulkanAPITest, softmax) { } } -TEST(VulkanAPITest, log_softmax) { +TEST_F(VulkanAPITest, log_softmax) { at::Tensor test_in[] = { at::rand({1, 196, 302, 5}, at::TensorOptions(at::kCPU).dtype(at::kFloat)), at::rand({1, 197, 302, 5}, at::TensorOptions(at::kCPU).dtype(at::kFloat)), @@ -1546,12 +1785,12 @@ TEST(VulkanAPITest, log_softmax) { } } 
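The blanket TEST -> TEST_F rename running through this file binds every case to the VulkanAPITest fixture introduced above, so gtest runs the fixture's static SetUpTestSuite()/TearDownTestSuite() hooks (used on Android to enable and later disable the query pool) once around the whole suite. A toy fixture showing the same mechanism, unrelated to Vulkan:

#include <gtest/gtest.h>

class ToyFixture : public ::testing::Test {
 public:
  static void SetUpTestSuite()    { ++suite_setups; }  // runs once per suite
  static void TearDownTestSuite() { /* release shared state here */ }
  static int suite_setups;
};
int ToyFixture::suite_setups = 0;

TEST_F(ToyFixture, UsesSharedSetup) {
  EXPECT_EQ(ToyFixture::suite_setups, 1);  // hook ran before the first TEST_F
}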
-TEST(VulkanAPITest, tanh) { +TEST_F(VulkanAPITest, tanh) { if (!at::is_vulkan_available()) { return; } - const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)); + const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)) * 30; const auto in_vulkan = in_cpu.vulkan(); const auto out_cpu = at::tanh(in_cpu); @@ -1565,12 +1804,12 @@ TEST(VulkanAPITest, tanh) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, tanh_) { +TEST_F(VulkanAPITest, tanh_) { if (!at::is_vulkan_available()) { return; } - auto cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)); + auto cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)) * 30; auto vulkan = cpu.vulkan(); at::tanh_(cpu); @@ -1584,7 +1823,7 @@ TEST(VulkanAPITest, tanh_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, sub) { +TEST_F(VulkanAPITest, sub) { if (!at::is_vulkan_available()) { return; } @@ -1606,7 +1845,7 @@ TEST(VulkanAPITest, sub) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, sub_broadcast0) { +TEST_F(VulkanAPITest, sub_broadcast0) { if (!at::is_vulkan_available()) { return; } @@ -1628,7 +1867,7 @@ TEST(VulkanAPITest, sub_broadcast0) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, sub_broadcast1) { +TEST_F(VulkanAPITest, sub_broadcast1) { if (!at::is_vulkan_available()) { return; } @@ -1650,7 +1889,7 @@ TEST(VulkanAPITest, sub_broadcast1) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, sub_broadcast2) { +TEST_F(VulkanAPITest, sub_broadcast2) { if (!at::is_vulkan_available()) { return; } @@ -1672,7 +1911,7 @@ TEST(VulkanAPITest, sub_broadcast2) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, sub_) { +TEST_F(VulkanAPITest, sub_) { if (!at::is_vulkan_available()) { return; } @@ -1694,7 +1933,7 @@ TEST(VulkanAPITest, sub_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, sub_broadcast0_) { +TEST_F(VulkanAPITest, sub_broadcast0_) { if (!at::is_vulkan_available()) { return; } @@ -1716,7 +1955,7 @@ TEST(VulkanAPITest, sub_broadcast0_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, sub_broadcast1_) { +TEST_F(VulkanAPITest, sub_broadcast1_) { if (!at::is_vulkan_available()) { return; } @@ -1738,7 +1977,7 @@ TEST(VulkanAPITest, sub_broadcast1_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, transposed_conv2d) { +TEST_F(VulkanAPITest, transposed_conv2d) { // Guard if (!at::is_vulkan_available()) { return; @@ -1818,7 +2057,7 @@ TEST(VulkanAPITest, transposed_conv2d) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, upsample_nearest2d) { +TEST_F(VulkanAPITest, upsample_nearest2d) { if (!at::is_vulkan_available()) { return; } @@ -1838,7 +2077,7 @@ TEST(VulkanAPITest, upsample_nearest2d) { } #if !defined(__APPLE__) -TEST(VulkanAPITest, cat_dim1_samefeature_success) { +TEST_F(VulkanAPITest, cat_dim1_samefeature_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -1862,7 +2101,7 @@ TEST(VulkanAPITest, cat_dim1_samefeature_success) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, cat_dim1_difffeature_success) { +TEST_F(VulkanAPITest, cat_dim1_difffeature_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -1886,7 +2125,7 @@ TEST(VulkanAPITest, cat_dim1_difffeature_success) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, cat_dim1_texture2d_success) { +TEST_F(VulkanAPITest, cat_dim1_texture2d_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -1911,7 +2150,7 @@ TEST(VulkanAPITest, cat_dim1_texture2d_success) { } #endif /* !defined(__APPLE__) */ -TEST(VulkanAPITest, cat_dim1_singledepth_success) { +TEST_F(VulkanAPITest, cat_dim1_singledepth_success) { // Guard 
if (!at::is_vulkan_available()) { return; @@ -1935,7 +2174,7 @@ TEST(VulkanAPITest, cat_dim1_singledepth_success) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, cat_dim1_singletensor_success) { +TEST_F(VulkanAPITest, cat_dim1_singletensor_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -1957,7 +2196,7 @@ TEST(VulkanAPITest, cat_dim1_singletensor_success) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, cat_dim1_twotensors_success) { +TEST_F(VulkanAPITest, cat_dim1_twotensors_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -1980,7 +2219,7 @@ TEST(VulkanAPITest, cat_dim1_twotensors_success) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, cat_dim1_bat1_mult4ch_success) { +TEST_F(VulkanAPITest, cat_dim1_bat1_mult4ch_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -2004,7 +2243,7 @@ TEST(VulkanAPITest, cat_dim1_bat1_mult4ch_success) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, cat_dim1_bat2_mult4ch_success) { +TEST_F(VulkanAPITest, cat_dim1_bat2_mult4ch_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -2028,7 +2267,7 @@ TEST(VulkanAPITest, cat_dim1_bat2_mult4ch_success) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, cat_dim1_mult4ch_mixed_success) { +TEST_F(VulkanAPITest, cat_dim1_mult4ch_mixed_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -2052,7 +2291,7 @@ TEST(VulkanAPITest, cat_dim1_mult4ch_mixed_success) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, cat_dim1_mult4ch_nonmult4ch_success) { +TEST_F(VulkanAPITest, cat_dim1_mult4ch_nonmult4ch_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -2077,7 +2316,7 @@ TEST(VulkanAPITest, cat_dim1_mult4ch_nonmult4ch_success) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, cat_dim2_sameheight_success) { +TEST_F(VulkanAPITest, cat_dim2_sameheight_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -2101,7 +2340,7 @@ TEST(VulkanAPITest, cat_dim2_sameheight_success) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, cat_dim2_diffheight_success) { +TEST_F(VulkanAPITest, cat_dim2_diffheight_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -2125,7 +2364,7 @@ TEST(VulkanAPITest, cat_dim2_diffheight_success) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, cat_dim2_singledepth_success) { +TEST_F(VulkanAPITest, cat_dim2_singledepth_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -2149,7 +2388,7 @@ TEST(VulkanAPITest, cat_dim2_singledepth_success) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, cat_dim2_invalidinputs_exceptions) { +TEST_F(VulkanAPITest, cat_dim2_invalidinputs_exceptions) { // Guard if (!at::is_vulkan_available()) { return; @@ -2198,7 +2437,7 @@ TEST(VulkanAPITest, cat_dim2_invalidinputs_exceptions) { } } -TEST(VulkanAPITest, permute_2d_success) { +TEST_F(VulkanAPITest, permute_2d_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -2220,7 +2459,7 @@ TEST(VulkanAPITest, permute_2d_success) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, permute_3d_success) { +TEST_F(VulkanAPITest, permute_3d_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -2247,7 +2486,7 @@ TEST(VulkanAPITest, permute_3d_success) { } } -TEST(VulkanAPITest, permute_4d_success) { +TEST_F(VulkanAPITest, permute_4d_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -2274,7 +2513,7 @@ TEST(VulkanAPITest, permute_4d_success) { } } -TEST(VulkanAPITest, permute_4dmclaren_success) { +TEST_F(VulkanAPITest, permute_4dmclaren_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -2296,7 +2535,7 @@ TEST(VulkanAPITest, 
permute_4dmclaren_success) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, permute_4dbig_success) { +TEST_F(VulkanAPITest, permute_4dbig_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -2323,7 +2562,7 @@ TEST(VulkanAPITest, permute_4dbig_success) { } } -TEST(VulkanAPITest, permute_negativedims_success) { +TEST_F(VulkanAPITest, permute_negativedims_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -2345,7 +2584,7 @@ TEST(VulkanAPITest, permute_negativedims_success) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, permute_1d_nochange) { +TEST_F(VulkanAPITest, permute_1d_nochange) { // Guard if (!at::is_vulkan_available()) { return; @@ -2367,7 +2606,7 @@ TEST(VulkanAPITest, permute_1d_nochange) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, permute_sameDims_nochange) { +TEST_F(VulkanAPITest, permute_sameDims_nochange) { // Guard if (!at::is_vulkan_available()) { return; @@ -2389,7 +2628,7 @@ TEST(VulkanAPITest, permute_sameDims_nochange) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, permute_invalidinputs_exceptions) { +TEST_F(VulkanAPITest, permute_invalidinputs_exceptions) { // Guard if (!at::is_vulkan_available()) { return; @@ -2449,7 +2688,7 @@ TEST(VulkanAPITest, permute_invalidinputs_exceptions) { }, ::c10::Error); } -TEST(VulkanAPITest, slice_width_success) { +TEST_F(VulkanAPITest, slice_width_success) { // Arrange std::unordered_map> dim2sizes { {3, {2, 3, 40, 50}}, // 4D tensors with dim=width @@ -2462,7 +2701,7 @@ TEST(VulkanAPITest, slice_width_success) { slice_tests(dim2sizes); } -TEST(VulkanAPITest, slice_height_success) { +TEST_F(VulkanAPITest, slice_height_success) { // Arrange std::unordered_map> dim2sizes { {2, {2, 3, 40, 50}}, // 4D tensors with dim=height @@ -2475,7 +2714,7 @@ TEST(VulkanAPITest, slice_height_success) { slice_tests(dim2sizes); } -TEST(VulkanAPITest, slice_feature_success) { +TEST_F(VulkanAPITest, slice_feature_success) { // Arrange std::unordered_map> dim2sizes { {1, {2, 40, 13, 14}}, // 4D tensors with dim=feature(channel) @@ -2487,7 +2726,7 @@ TEST(VulkanAPITest, slice_feature_success) { slice_tests(dim2sizes); } -TEST(VulkanAPITest, slice_batch_success) { +TEST_F(VulkanAPITest, slice_batch_success) { // Arrange std::unordered_map> dim2sizes { {0, {40, 3, 13, 14}}, // 4D tensors with dim=batch @@ -2498,7 +2737,7 @@ TEST(VulkanAPITest, slice_batch_success) { slice_tests(dim2sizes); } -TEST(VulkanAPITest, slice_invalidinputs_exceptions) { +TEST_F(VulkanAPITest, slice_invalidinputs_exceptions) { // Act: slice step must be positive EXPECT_THROW({ slice_test({2, 3, 4, 5}, 3, 0, 3, 0); @@ -2515,7 +2754,7 @@ TEST(VulkanAPITest, slice_invalidinputs_exceptions) { }, ::c10::Error); } -TEST(VulkanAPITest, clone_success) { +TEST_F(VulkanAPITest, clone_success) { // Arrange std::multimap, std::vector> mem2sizes { {c10::MemoryFormat::Preserve, {2, 3, 5, 161}}, // 4D tensors with MemoryFormat::Preserve @@ -2538,7 +2777,7 @@ TEST(VulkanAPITest, clone_success) { } } -TEST(VulkanAPITest, clone_invalidinputs_exceptions) { +TEST_F(VulkanAPITest, clone_invalidinputs_exceptions) { // Act: Vulkan supports Preserve and Contiguous memory foramts EXPECT_THROW({ clone_test({2, 3, 5, 161}, c10::MemoryFormat::ChannelsLast); @@ -2786,7 +3025,7 @@ class MobileNetV2 final : public OpsList { } }; -TEST(VulkanAPITest, mobilenetv2) { +TEST_F(VulkanAPITest, mobilenetv2) { if (!at::is_vulkan_available()) { return; } @@ -2805,6 +3044,453 @@ TEST(VulkanAPITest, mobilenetv2) { ASSERT_TRUE(check); } +TEST_F(VulkanAPITest, gru_mclareninputs_success) { + // Guard + if 
(!at::is_vulkan_available()) { + return; + } + + // Arrange + const int H_in = 384; // input_size + const int H_out = 384; // hidden_size + const int num_layers = 2; + const double gru_dropout = .0; + const bool has_biases = true; + const bool train = false; + const bool bidirectional = false; + const bool batch_first = true; + const auto in_cpu = at::rand({1, 1, H_in}, at::device(at::kCPU).dtype(at::kFloat)); + const auto h0_cpu = at::rand({num_layers, 1, H_out}, at::device(at::kCPU).dtype(at::kFloat)); + + c10::List weight_ih_l; // shape (3 * hidden_size, input_size) + c10::List weight_hh_l; // shape (3 * hidden_size, hidden_size) + c10::List bias_ih_l; // shape (3 * hidden_size) + c10::List bias_hh_l; // shape (3 * hidden_size) + for (int i = 0; i < num_layers; ++i) { + weight_ih_l.emplace_back(at::rand({3 * H_out, H_in}, at::device(at::kCPU).dtype(at::kFloat))); + weight_hh_l.emplace_back(at::rand({3 * H_out, H_out}, at::device(at::kCPU).dtype(at::kFloat))); + bias_ih_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat))); + bias_hh_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat))); + } + + // put this guard here to run inference inststead of training + // to avoid the following error: + // C++ exception with description "0INTERNAL ASSERT FAILED at "xplat/caffe2/aten/src/ATen/core/boxing/KernelFunction.cpp":31, please report a bug to PyTorch. aten::gru.input has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther. This makes the backend kernel unreachable; the dispatcher will always prefer the CompositeImplicitAutograd lowering (see Note [Ambiguity in AutogradOther kernel]). If you want to override CompositeImplicitAutograd, please open an issue to request a dedicated Autograd dispatch key for the backend. + // If you only want to run inference instead of training, add `c10::InferenceMode mode;` before model.forward(). Note this guard is only available in C++ but not Python at present. + c10::InferenceMode mode; + + // Act + const auto out_cpu = at::gru(in_cpu, h0_cpu, + { weight_ih_l[0], weight_hh_l[0], bias_ih_l[0], bias_hh_l[0], weight_ih_l[1], weight_hh_l[1], bias_ih_l[1], bias_hh_l[1] }, + has_biases, num_layers, gru_dropout, train, bidirectional, batch_first); + + // weights/biases should be always on CPU. 
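For reference, the flat parameter list handed to at::gru in these tests follows the usual per-layer grouping: with has_biases == true, layer l contributes {weight_ih_l[l], weight_hh_l[l], bias_ih_l[l], bias_hh_l[l]}, in that order. A small helper sketching that flattening (illustrative only, not part of the patch):

#include <ATen/ATen.h>
#include <vector>

std::vector<at::Tensor> flatten_gru_params(
    const std::vector<at::Tensor>& w_ih,   // per layer: (3 * H_out, input width of that layer)
    const std::vector<at::Tensor>& w_hh,   // per layer: (3 * H_out, H_out)
    const std::vector<at::Tensor>& b_ih,   // per layer: (3 * H_out)
    const std::vector<at::Tensor>& b_hh) { // per layer: (3 * H_out)
  std::vector<at::Tensor> flat;
  for (size_t l = 0; l < w_ih.size(); ++l) {
    flat.push_back(w_ih[l]);  // order matters: ih weight, hh weight,
    flat.push_back(w_hh[l]);  // then ih bias, hh bias, layer by layer
    flat.push_back(b_ih[l]);
    flat.push_back(b_hh[l]);
  }
  return flat;
}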
+ const auto out_vulkan = at::gru(in_cpu.vulkan(), h0_cpu.vulkan(), { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }, + has_biases, num_layers, gru_dropout, train, bidirectional, batch_first); + + auto cpu_output = std::get<0>(out_cpu); + auto cpu_hidden = std::get<1>(out_cpu); + auto vulkan_output = std::get<0>(out_vulkan); + auto vulkan_hidden = std::get<1>(out_vulkan); + + // Assert + const auto check_output = almostEqual(cpu_output, vulkan_output.cpu()); + if (!check_output) { + showRtol(cpu_output, vulkan_output.cpu()); + } + ASSERT_TRUE(check_output); + + const auto check_hidden = almostEqual(cpu_hidden, vulkan_hidden.cpu()); + if (!check_hidden) { + showRtol(cpu_hidden, vulkan_hidden.cpu()); + } + ASSERT_TRUE(check_hidden); +} + +TEST_F(VulkanAPITest, gru_invalidinputs_exceptions) { + // Guard + if (!at::is_vulkan_available()) { + return; + } + + // Arrange + const int H_in = 384; // input_size + const int H_out = 384; // hidden_size + const int num_layers = 2; + const double gru_dropout = .0; + const bool has_biases = true; + const bool train = false; + const bool bidirectional = false; + const bool batch_first = true; + const auto in_cpu = at::rand({1, 1, H_in}, at::device(at::kCPU).dtype(at::kFloat)); + const auto h0_cpu = at::rand({num_layers, 1, H_out}, at::device(at::kCPU).dtype(at::kFloat)); + + c10::List weight_ih_l; // shape (3 * hidden_size, input_size) + c10::List weight_hh_l; // shape (3 * hidden_size, hidden_size) + c10::List bias_ih_l; // shape (3 * hidden_size) + c10::List bias_hh_l; // shape (3 * hidden_size) + for (int i = 0; i < num_layers; ++i) { + weight_ih_l.emplace_back(at::rand({3 * H_out, H_in}, at::device(at::kCPU).dtype(at::kFloat))); + weight_hh_l.emplace_back(at::rand({3 * H_out, H_out}, at::device(at::kCPU).dtype(at::kFloat))); + bias_ih_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat))); + bias_hh_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat))); + } + + // put this guard here to run inference inststead of training + // to avoid the following error: + // C++ exception with description "0INTERNAL ASSERT FAILED at "xplat/caffe2/aten/src/ATen/core/boxing/KernelFunction.cpp":31, please report a bug to PyTorch. aten::gru.input has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther. This makes the backend kernel unreachable; the dispatcher will always prefer the CompositeImplicitAutograd lowering (see Note [Ambiguity in AutogradOther kernel]). If you want to override CompositeImplicitAutograd, please open an issue to request a dedicated Autograd dispatch key for the backend. + // If you only want to run inference instead of training, add `c10::InferenceMode mode;` before model.forward(). Note this guard is only available in C++ but not Python at present. 
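The recurring comment above boils down to one line: wrap the inference calls in a c10::InferenceMode guard so the dispatcher takes the backend kernel path instead of tripping the CompositeImplicitAutograd/AutogradOther ambiguity error quoted there. A minimal standalone use of the guard (the tensor math here is arbitrary):

#include <torch/torch.h>

int main() {
  torch::Tensor x = torch::rand({1, 1, 4});
  {
    c10::InferenceMode guard;   // inference-only dispatch inside this scope
    torch::Tensor y = x * 2;    // y is created as an inference tensor
    TORCH_CHECK(y.is_inference());
  }                             // normal autograd behavior resumes here
}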
+ c10::InferenceMode mode; + + // Act: incorrect # of weights/biases + EXPECT_THROW({ + at::gru(in_cpu.vulkan(), h0_cpu.vulkan(), { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1) }, + has_biases, num_layers, gru_dropout, train, bidirectional, batch_first); + }, ::c10::Error); + + // Act: non-3D input tensor + EXPECT_THROW({ + const auto in_cpu_2d = at::rand({1, H_in}, at::device(at::kCPU).dtype(at::kFloat)); + at::gru(in_cpu_2d.vulkan(), h0_cpu.vulkan(), { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }, + has_biases, num_layers, gru_dropout, train, bidirectional, batch_first); + }, ::c10::Error); + + // Act: non-3D hidden tensor + EXPECT_THROW({ + const auto h0_cpu_2d = at::rand({num_layers, H_out}, at::device(at::kCPU).dtype(at::kFloat)); + at::gru(in_cpu.vulkan(), h0_cpu_2d.vulkan(), { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }, + has_biases, num_layers, gru_dropout, train, bidirectional, batch_first); + }, ::c10::Error); + + // Act: has_biases should be true + EXPECT_THROW({ + at::gru(in_cpu.vulkan(), h0_cpu.vulkan(), { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }, + false, num_layers, gru_dropout, train, bidirectional, batch_first); + }, ::c10::Error); + + // Act: train should be false + EXPECT_THROW({ + at::gru(in_cpu.vulkan(), h0_cpu.vulkan(), { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }, + has_biases, num_layers, gru_dropout, true, bidirectional, batch_first); + }, ::c10::Error); + + // Act: bidirectional should be false + EXPECT_THROW({ + at::gru(in_cpu.vulkan(), h0_cpu.vulkan(), { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }, + has_biases, num_layers, gru_dropout, train, true, batch_first); + }, ::c10::Error); + + // Act: batch_first should be true + EXPECT_THROW({ + at::gru(in_cpu.vulkan(), h0_cpu.vulkan(), { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }, + has_biases, num_layers, gru_dropout, train, bidirectional, false); + }, ::c10::Error); + + // Act: dropout should be 0.0 + EXPECT_THROW({ + at::gru(in_cpu.vulkan(), h0_cpu.vulkan(), { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }, + has_biases, num_layers, 1.0, train, bidirectional, batch_first); + }, ::c10::Error); +} + +TEST_F(VulkanAPITest, gru_prepack_success) { + // Guard + if (!at::is_vulkan_available()) { + return; + } + + // Arrange + const int H_in = 384; // input_size + const int H_out = 384; // hidden_size + const int num_layers = 2; + const double gru_dropout = .0; + const bool has_biases = true; + const bool train = false; + const bool bidirectional = false; + const bool batch_first = true; + const auto in_cpu = at::rand({1, 1, H_in}, at::device(at::kCPU).dtype(at::kFloat)); + const auto h0_cpu = at::rand({num_layers, 1, H_out}, at::device(at::kCPU).dtype(at::kFloat)); + + 
c10::List weight_ih_l; // shape (3 * hidden_size, input_size) + c10::List weight_hh_l; // shape (3 * hidden_size, hidden_size) + c10::List bias_ih_l; // shape (3 * hidden_size) + c10::List bias_hh_l; // shape (3 * hidden_size) + for (int i = 0; i < num_layers; ++i) { + weight_ih_l.emplace_back(at::rand({3 * H_out, H_in}, at::device(at::kCPU).dtype(at::kFloat))); + weight_hh_l.emplace_back(at::rand({3 * H_out, H_out}, at::device(at::kCPU).dtype(at::kFloat))); + bias_ih_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat))); + bias_hh_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat))); + } + + // put this guard here to run inference inststead of training + // to avoid the following error: + // C++ exception with description "0INTERNAL ASSERT FAILED at "xplat/caffe2/aten/src/ATen/core/boxing/KernelFunction.cpp":31, please report a bug to PyTorch. aten::gru.input has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther. This makes the backend kernel unreachable; the dispatcher will always prefer the CompositeImplicitAutograd lowering (see Note [Ambiguity in AutogradOther kernel]). If you want to override CompositeImplicitAutograd, please open an issue to request a dedicated Autograd dispatch key for the backend. + // If you only want to run inference instead of training, add `c10::InferenceMode mode;` before model.forward(). Note this guard is only available in C++ but not Python at present. + c10::InferenceMode mode; + + // Act + const auto out_cpu = at::gru(in_cpu, h0_cpu, + { weight_ih_l[0], weight_hh_l[0], bias_ih_l[0], bias_hh_l[0], weight_ih_l[1], weight_hh_l[1], bias_ih_l[1], bias_hh_l[1] }, + has_biases, num_layers, gru_dropout, train, bidirectional, batch_first); + + auto prepack = callOpByName( + "vulkan_prepack::gru_prepack", + "", + std::vector({ weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }), + has_biases, num_layers, gru_dropout, train, bidirectional, batch_first); + auto out_vulkan = callOpByName( + "vulkan_prepack::gru_run", + "", + in_cpu.vulkan(), h0_cpu.vulkan(), prepack[0]); + + auto cpu_output = std::get<0>(out_cpu); + auto cpu_hidden = std::get<1>(out_cpu); + auto vulkan_output = out_vulkan[0].toTensor(); + auto vulkan_hidden = out_vulkan[1].toTensor(); + + // Assert + const auto check_output = almostEqual(cpu_output, vulkan_output.cpu()); + if (!check_output) { + showRtol(cpu_output, vulkan_output.cpu()); + } + ASSERT_TRUE(check_output); + + const auto check_hidden = almostEqual(cpu_hidden, vulkan_hidden.cpu()); + if (!check_hidden) { + showRtol(cpu_hidden, vulkan_hidden.cpu()); + } + ASSERT_TRUE(check_hidden); +} + +TEST_F(VulkanAPITest, gru_prepack_invalidinputs_exceptions) { + // Guard + if (!at::is_vulkan_available()) { + return; + } + + // Arrange + const int H_in = 384; // input_size + const int H_out = 384; // hidden_size + const int num_layers = 2; + const double gru_dropout = .0; + const bool has_biases = true; + const bool train = false; + const bool bidirectional = false; + const bool batch_first = true; + const auto in_cpu = at::rand({1, 1, H_in}, at::device(at::kCPU).dtype(at::kFloat)); + const auto h0_cpu = at::rand({num_layers, 1, H_out}, at::device(at::kCPU).dtype(at::kFloat)); + + c10::List weight_ih_l; // shape (3 * hidden_size, input_size) + c10::List weight_hh_l; // shape (3 * hidden_size, hidden_size) + c10::List bias_ih_l; // shape (3 * hidden_size) + 
c10::List bias_hh_l; // shape (3 * hidden_size) + for (int i = 0; i < num_layers; ++i) { + weight_ih_l.emplace_back(at::rand({3 * H_out, H_in}, at::device(at::kCPU).dtype(at::kFloat))); + weight_hh_l.emplace_back(at::rand({3 * H_out, H_out}, at::device(at::kCPU).dtype(at::kFloat))); + bias_ih_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat))); + bias_hh_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat))); + } + + // put this guard here to run inference inststead of training + // to avoid the following error: + // C++ exception with description "0INTERNAL ASSERT FAILED at "xplat/caffe2/aten/src/ATen/core/boxing/KernelFunction.cpp":31, please report a bug to PyTorch. aten::gru.input has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther. This makes the backend kernel unreachable; the dispatcher will always prefer the CompositeImplicitAutograd lowering (see Note [Ambiguity in AutogradOther kernel]). If you want to override CompositeImplicitAutograd, please open an issue to request a dedicated Autograd dispatch key for the backend. + // If you only want to run inference instead of training, add `c10::InferenceMode mode;` before model.forward(). Note this guard is only available in C++ but not Python at present. + c10::InferenceMode mode; + + // Act: incorrect # of weights/biases + EXPECT_THROW({ + auto prepack = callOpByName( + "vulkan_prepack::gru_prepack", + "", + std::vector({ weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1) }), + has_biases, num_layers, gru_dropout, train, bidirectional, batch_first); + }, ::c10::Error); + + // Act: non-3D input tensor + EXPECT_THROW({ + const auto in_cpu_2d = at::rand({1, H_in}, at::device(at::kCPU).dtype(at::kFloat)); + auto prepack = callOpByName( + "vulkan_prepack::gru_prepack", + "", + std::vector({ weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }), + has_biases, num_layers, gru_dropout, train, bidirectional, batch_first); + auto out_vulkan = callOpByName( + "vulkan_prepack::gru_run", + "", + in_cpu_2d.vulkan(), h0_cpu.vulkan(), prepack[0]); + }, ::c10::Error); + + // Act: non-3D hidden tensor + EXPECT_THROW({ + const auto h0_cpu_2d = at::rand({num_layers, H_out}, at::device(at::kCPU).dtype(at::kFloat)); + auto prepack = callOpByName( + "vulkan_prepack::gru_prepack", + "", + std::vector({ weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }), + has_biases, num_layers, gru_dropout, train, bidirectional, batch_first); + auto out_vulkan = callOpByName( + "vulkan_prepack::gru_run", + "", + in_cpu.vulkan(), h0_cpu_2d.vulkan(), prepack[0]); + }, ::c10::Error); + + // Act: has_biases should be true + EXPECT_THROW({ + auto prepack = callOpByName( + "vulkan_prepack::gru_prepack", + "", + std::vector({ weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }), + false, num_layers, gru_dropout, train, bidirectional, batch_first); + }, ::c10::Error); + + // Act: train should be false + EXPECT_THROW({ + auto prepack = callOpByName( + "vulkan_prepack::gru_prepack", + "", + std::vector({ weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), 
weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }), + has_biases, num_layers, gru_dropout, true, bidirectional, batch_first); + }, ::c10::Error); + + // Act: bidirectional should be false + EXPECT_THROW({ + auto prepack = callOpByName( + "vulkan_prepack::gru_prepack", + "", + std::vector<at::Tensor>({ weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }), + has_biases, num_layers, gru_dropout, train, true, batch_first); + }, ::c10::Error); + + // Act: batch_first should be true + EXPECT_THROW({ + auto prepack = callOpByName( + "vulkan_prepack::gru_prepack", + "", + std::vector<at::Tensor>({ weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }), + has_biases, num_layers, gru_dropout, train, bidirectional, false); + }, ::c10::Error); + + // Act: dropout should be 0.0 + EXPECT_THROW({ + auto prepack = callOpByName( + "vulkan_prepack::gru_prepack", + "", + std::vector<at::Tensor>({ weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }), + has_biases, num_layers, 1.0, train, bidirectional, batch_first); + }, ::c10::Error); +} + +#if defined (__ANDROID__) // to avoid `Undefined symbols for architecture arm64` error +TEST_F(VulkanAPITest, profiling_invalidinputs_exceptions) { + // Guard + if (!at::is_vulkan_available()) { + return; + } + + // Act: The device doesn't support timestamps on all graphics and compute queues. + EXPECT_THROW({ + const bool is_timestamps_supported_ = false; + const float timestamp_period = 1.f; + at::native::vulkan::api::QueryPool querypool(at::native::vulkan::api::context()->gpu().device, is_timestamps_supported_, timestamp_period); + querypool.enable(); + }, ::c10::Error); + + // Act: The query pool already exists. + EXPECT_THROW({ + auto context = at::native::vulkan::api::context(); + at::native::vulkan::api::QueryPool querypool( + context->gpu().device, + context->gpu().adapter->timestamp_compute_and_graphics(), + context->gpu().adapter->timestamp_period()); + querypool.enable(); + querypool.enable(); // already enabled + }, ::c10::Error); + + // Act: The query index cannot exceed Configuration::kMaxQueryCount.
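+  // (Editorial note: the loop below profiles kMaxQueryCount + 1 ops, so the
+  // pool runs out of timestamp query slots and QueryPool is expected to
+  // raise a c10::Error.)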
+ EXPECT_THROW({ + auto context = at::native::vulkan::api::context(); + at::native::vulkan::api::QueryPool querypool( + context->gpu().device, + context->gpu().adapter->timestamp_compute_and_graphics(), + context->gpu().adapter->timestamp_period()); + querypool.enable(); + for (uint32_t i = 0u; i < at::native::vulkan::api::QueryPool::Configuration::kMaxQueryCount + 1u; ++i) { + at::native::vulkan::api::Command::Buffer& command_buffer = context->command().pool.stream(); + { + at::native::vulkan::api::OpProfiler profiler(command_buffer, querypool, "test"); + } + context->command().pool.submit(context->gpu().queue, command_buffer); + } + }, ::c10::Error); +} + +// NOTE: Keep the following test at the end of file +// so that it can print out the op execution time for all prior tests +TEST_F(VulkanAPITest, profiling_result_success) { + // Guard + if (!at::is_vulkan_available()) { + return; + } + + // Arrange + auto is_enabled = at::native::vulkan::api::context()->querypool().is_enabled(); + if (is_enabled) { + auto perf_info = at::native::vulkan::api::context()->querypool().disable(false); + std::cout + << "-----------------------------------------------------------------------------------------" << std::endl + << "Query Name Execution Start End" << std::endl + << "-----------------------------------------------------------------------------------------" << std::endl; + for (size_t i = 0; i < perf_info.size(); i++) { + std::cout << std::left << std::setw(35) << perf_info[i].query_name.c_str() + << std::right << std::setw(15) << perf_info[i].execution_time_us << " us" + << std::setw(15) << perf_info[i].start_time_us << " us" + << std::setw(15) << perf_info[i].end_time_us << " us" << std::left << std::endl; + } + } + at::native::vulkan::api::context()->querypool().enable(); + const auto in_cpu1 = at::rand({2, 4, 221, 193}, at::device(at::kCPU).dtype(at::kFloat)); + const auto in_cpu2 = at::rand({2, 4, 221, 193}, at::device(at::kCPU).dtype(at::kFloat)); + const auto in_cpu3 = at::rand({2, 4, 221, 193}, at::device(at::kCPU).dtype(at::kFloat)); + const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 1); + out_vulkan.cpu(); // to make sure all GPU operations are done + + // Act + auto perf_info = at::native::vulkan::api::context()->querypool().disable(true); + for (size_t i = 0; i < perf_info.size(); i++) { + std::cout << std::left << std::setw(35) << perf_info[i].query_name.c_str() + << std::right << std::setw(15) << perf_info[i].execution_time_us << " us" + << std::setw(15) << perf_info[i].start_time_us << " us" + << std::setw(15) << perf_info[i].end_time_us << " us" << std::left << std::endl; + } + + // Assert + ASSERT_TRUE(perf_info.size() == 5u); + ASSERT_TRUE(perf_info[0].query_name == "aten::_cat (cat_feature_mult4ch)"); + + if (is_enabled) { + at::native::vulkan::api::context()->querypool().enable(); + } +} +#endif + } // namespace #endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/test/vulkan_perf_test.cpp b/aten/src/ATen/test/vulkan_perf_test.cpp index fa6b303eead5..230484a0f915 100644 --- a/aten/src/ATen/test/vulkan_perf_test.cpp +++ b/aten/src/ATen/test/vulkan_perf_test.cpp @@ -7,7 +7,8 @@ namespace { -static void cat_op_channel_perf(benchmark::State& state) { +// using Vulkan Timestamp Queries for the pure GPU execution time only +static void cat_op_channel_perf_gpu_only(benchmark::State& state) { // Guard if (!at::is_vulkan_available()) { return; @@ -25,12 +26,64 @@ static void cat_op_channel_perf(benchmark::State& state) { const auto in_vulkan2 = 
in_cpu2.vulkan(); const auto in_vulkan3 = in_cpu3.vulkan(); + // Act + for (auto _ : state) { + at::native::vulkan::api::context()->querypool().enable(); + const auto vulkan_out = at::cat({in_vulkan1, in_vulkan2, in_vulkan3}, 1); + vulkan_out.cpu(); + auto perf_info = at::native::vulkan::api::context()->querypool().disable(true); + state.SetIterationTime(perf_info[0].execution_time_us / 1'000'000.); // us to sec + } +} + +static void gru_op_perf(benchmark::State& state) { + // Guard + if (!at::is_vulkan_available()) { + return; + } + + // Arrange + const int H_in = static_cast<int>(state.range(0)); // input_size + const int H_out = static_cast<int>(state.range(1)); // hidden_size + const int num_layers = static_cast<int>(state.range(2)); + const double gru_dropout = .0; + const bool has_biases = true; + const bool train = false; + const bool bidirectional = false; + const bool batch_first = true; + const auto in_cpu = at::rand({1, 1, H_in}, at::device(at::kCPU).dtype(at::kFloat)); + const auto h0_cpu = at::rand({num_layers, 1, H_out}, at::device(at::kCPU).dtype(at::kFloat)); + + c10::List<at::Tensor> weight_ih_l; // shape (3 * hidden_size, input_size) + c10::List<at::Tensor> weight_hh_l; // shape (3 * hidden_size, hidden_size) + c10::List<at::Tensor> bias_ih_l; // shape (3 * hidden_size) + c10::List<at::Tensor> bias_hh_l; // shape (3 * hidden_size) + for (int i = 0; i < num_layers; ++i) { + weight_ih_l.emplace_back(at::rand({3 * H_out, H_in}, at::device(at::kCPU).dtype(at::kFloat))); + weight_hh_l.emplace_back(at::rand({3 * H_out, H_out}, at::device(at::kCPU).dtype(at::kFloat))); + bias_ih_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat))); + bias_hh_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat))); + } + + // put this guard here to run inference instead of training + // to avoid the following error: + // C++ exception with description "0INTERNAL ASSERT FAILED at "xplat/caffe2/aten/src/ATen/core/boxing/KernelFunction.cpp":31, please report a bug to PyTorch. aten::gru.input has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther. This makes the backend kernel unreachable; the dispatcher will always prefer the CompositeImplicitAutograd lowering (see Note [Ambiguity in AutogradOther kernel]). If you want to override CompositeImplicitAutograd, please open an issue to request a dedicated Autograd dispatch key for the backend. + // If you only want to run inference instead of training, add `c10::InferenceMode mode;` before model.forward(). Note this guard is only available in C++ but not Python at present. + c10::InferenceMode mode; + // Act while (state.KeepRunning()) { - const auto out_vulkan = at::cat({in_vulkan1, in_vulkan2, in_vulkan3}, 1); + // weights/biases should be always on CPU.
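+    // (Editorial note: only the input and hidden state are moved to Vulkan
+    // below; the per-layer weights/biases are passed as CPU tensors, matching
+    // the VulkanAPITest GRU tests above.)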
+ const auto out_vulkan = at::gru(in_cpu.vulkan(), h0_cpu.vulkan(), { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }, + has_biases, num_layers, gru_dropout, train, bidirectional, batch_first); + + auto vulkan_output = std::get<0>(out_vulkan); + auto vulkan_hidden = std::get<1>(out_vulkan); // to avoid out-of-memory issues, release resources by waiting and flushing all GPU operations - at::native::vulkan::api::context()->wait(out_vulkan); + at::native::vulkan::api::context()->wait(vulkan_output); + at::native::vulkan::api::context()->wait(vulkan_hidden); at::native::vulkan::api::context()->flush(); } } @@ -42,12 +95,14 @@ static void CommonBenchmarkSettings(benchmark::internal::Benchmark* b) { } // namespace -BENCHMARK(cat_op_channel_perf)->Apply(CommonBenchmarkSettings)->Threads(1)->Iterations(1000)->Args({3, 40, 221, 193}); // big multiple of 4 channels -BENCHMARK(cat_op_channel_perf)->Apply(CommonBenchmarkSettings)->Threads(1)->Iterations(1000)->Args({3, 20, 221, 193}); // big multiple of 4 channels -BENCHMARK(cat_op_channel_perf)->Apply(CommonBenchmarkSettings)->Threads(1)->Iterations(1000)->Args({3, 39, 221, 193}); // big non-multiple of 4 channels -BENCHMARK(cat_op_channel_perf)->Apply(CommonBenchmarkSettings)->Threads(1)->Iterations(5000)->Args({3, 4, 221, 193}); // small multiple of 4 channels -BENCHMARK(cat_op_channel_perf)->Apply(CommonBenchmarkSettings)->Threads(1)->Iterations(5000)->Args({3, 3, 221, 193}); // small non-multiple of 4 channels -BENCHMARK(cat_op_channel_perf)->Apply(CommonBenchmarkSettings)->Threads(3)->Iterations(1000)->Args({3, 40, 221, 193}); // big multiple of 4 channels (multi-thread) +BENCHMARK(cat_op_channel_perf_gpu_only)->Apply(CommonBenchmarkSettings)->UseManualTime()->Threads(1)->Iterations(100)->Args({3, 40, 221, 193}); // big multiple of 4 channels +BENCHMARK(cat_op_channel_perf_gpu_only)->Apply(CommonBenchmarkSettings)->UseManualTime()->Threads(1)->Iterations(100)->Args({3, 20, 221, 193}); // big multiple of 4 channels +BENCHMARK(cat_op_channel_perf_gpu_only)->Apply(CommonBenchmarkSettings)->UseManualTime()->Threads(1)->Iterations(100)->Args({3, 39, 221, 193}); // big non-multiple of 4 channels +BENCHMARK(cat_op_channel_perf_gpu_only)->Apply(CommonBenchmarkSettings)->UseManualTime()->Threads(1)->Iterations(100)->Args({3, 4, 221, 193}); // small multiple of 4 channels +BENCHMARK(cat_op_channel_perf_gpu_only)->Apply(CommonBenchmarkSettings)->UseManualTime()->Threads(1)->Iterations(100)->Args({3, 3, 221, 193}); // small non-multiple of 4 channels +BENCHMARK(cat_op_channel_perf_gpu_only)->Apply(CommonBenchmarkSettings)->UseManualTime()->Threads(3)->Iterations(100)->Args({3, 40, 221, 193}); // big multiple of 4 channels (multi-thread) +BENCHMARK(gru_op_perf)->Apply(CommonBenchmarkSettings)->Threads(1)->Iterations(100)->Args({384, 384, 2}); // McLaren Model inputs + BENCHMARK_MAIN(); #endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/test/vulkan_test.cpp b/aten/src/ATen/test/vulkan_test.cpp deleted file mode 100644 index 09c98fa214c1..000000000000 --- a/aten/src/ATen/test/vulkan_test.cpp +++ /dev/null @@ -1,950 +0,0 @@ -#ifndef USE_VULKAN_API - -#include - -#include -#include -#include -#include - -bool checkRtol(const at::Tensor& diff, const std::vector inputs) { - double maxValue = 0.0; - for (auto& tensor : inputs) { - maxValue = fmax(tensor.abs().max().item(), maxValue); - } - return diff.abs().max().item() < (0.01 + 2e-2 * maxValue); -} -bool 
almostEqual(const at::Tensor& a, const at::Tensor& b) { - return checkRtol(a - b, {a, b}); -} - -bool exactlyEqual(const at::Tensor& a, const at::Tensor& b) { - return (a - b).abs().max().item() == 0.f; -} - -TEST(VulkanTest, ToVulkanToCpu) { - if (!at::is_vulkan_available()) - return; - auto t = - at::rand({1, 2, 2, 3}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - auto tv = t.vulkan(); - ASSERT_TRUE(tv.options().device().type() == at::kVulkan); - auto t2 = tv.cpu(); - ASSERT_TRUE(t2.options().device().type() == at::kCPU); - ASSERT_TRUE(almostEqual(t2, t)); -} - -TEST(VulkanTest, upsampleNearest2D) { - if (!at::is_vulkan_available()) - return; - - auto t_in = - at::rand({1, 2, 2, 3}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - auto t_out_expected = at::upsample_nearest2d(t_in, {4, 6}); - auto tv_in = - t_in.to(at::TensorOptions{at::Device{at::kVulkan}}.dtype(at::kFloat)); - - auto tv_out = at::upsample_nearest2d(tv_in, {4, 6}); - auto t_out = - tv_out.to(at::TensorOptions{at::Device{at::kCPU}}.dtype(at::kFloat)); - - bool check = almostEqual(t_out_expected, t_out); - if (!check) { - std::cout << "expected:\n" << t_out_expected << std::endl; - std::cout << "got:\n" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, add) { - if (!at::is_vulkan_available()) - return; - auto t_in0 = at::rand({1, 2, 2, 3}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_in1 = at::rand({1, 2, 2, 3}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_out_expected = at::add(t_in0, t_in1, 2); - auto tv_in0 = t_in0.vulkan(); - auto tv_in1 = t_in1.vulkan(); - auto tv_out = at::add(tv_in0, tv_in1, 2); - auto t_out = tv_out.cpu(); - - ASSERT_TRUE(almostEqual(t_out, t_out_expected)); -} - -TEST(VulkanTest, add_not4dim) { - if (!at::is_vulkan_available()) - return; - auto t_in0 = at::rand({1, 1000}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_in1 = at::rand({1000}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_out_expected = at::add(t_in0, t_in1, 2); - auto tv_in0 = t_in0.vulkan(); - auto tv_in1 = t_in1.vulkan(); - auto tv_out = at::add(tv_in0, tv_in1, 2); - auto t_out = tv_out.cpu(); - - ASSERT_TRUE(almostEqual(t_out, t_out_expected)); -} - -TEST(VulkanTest, add_cpu_vulkan) { - if (!at::is_vulkan_available()) - return; - auto t_in0 = at::rand({2, 96, 1000}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_in1 = - at::rand({1, 2, 96, 1000}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_out_expected = at::add(t_in0, t_in1, 2); - auto tv_in0 = t_in0.vulkan(); - auto tv_in1 = t_in1.vulkan(); - - auto tv_out1 = at::add(tv_in0, t_in1, 2); - auto t_out1 = tv_out1.cpu(); - ASSERT_TRUE(almostEqual(t_out1, t_out_expected)); - - auto tv_out2 = at::add(t_in0, tv_in1, 2); - auto t_out2 = tv_out2.cpu(); - ASSERT_TRUE(almostEqual(t_out2, t_out_expected)); -} - -TEST(VulkanTest, add_) { - if (!at::is_vulkan_available()) - return; - auto t_in0 = at::rand({1, 2, 2, 2}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_in1 = at::rand({1, 2, 2, 2}, at::device(at::kCPU).dtype(at::kFloat)); - auto tv_in0 = t_in0.vulkan(); - auto tv_in1 = t_in1.vulkan(); - - t_in0.add_(t_in1, 2); - tv_in0.add_(tv_in1, 2); - auto t_out = tv_in0.cpu(); - bool check = almostEqual(t_out, t_in0); - if (!check) { - std::cout << "expected:\n" << t_in0 << std::endl; - std::cout << "got:\n" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, mulScalar) { - if (!at::is_vulkan_available()) - return; - auto t_in = at::rand({3, 2, 2, 3}, at::device(at::kCPU).dtype(at::kFloat)); - const float other = 3.14; 
- auto t_out_expected = t_in.mul(other); - auto tv_in = t_in.vulkan(); - auto tv_out = tv_in.mul(other); - auto t_out = tv_out.cpu(); - - bool check = almostEqual(t_out, t_out_expected); - if (!check) { - std::cout << "expected:\n" << t_out_expected << std::endl; - std::cout << "got:\n" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, addScalar) { - if (!at::is_vulkan_available()) - return; - auto t_in = at::rand({3, 2, 2, 3}, at::device(at::kCPU).dtype(at::kFloat)); - float* data = t_in.data_ptr(); - auto numel = t_in.numel(); - for (const auto i : c10::irange(numel)) { - // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) - data[i] = i; - } - - const float other = 3.14; - const float alpha = 2; - auto t_out_expected = t_in.add(other, alpha); - auto tv_in = t_in.vulkan(); - auto tv_out = tv_in.add(other, alpha); - auto t_out = tv_out.cpu(); - - bool check = almostEqual(t_out, t_out_expected); - if (!check) { - std::cout << "expected:\n" << t_out_expected << std::endl; - std::cout << "got:\n" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, conv2d) { - if (!at::is_vulkan_available()) - return; - auto OC = 2; - auto C = 3; - int64_t H = 3; - int64_t W = 3; - int64_t KH = 2; - int64_t KW = 2; - auto t_in = at::rand({1, C, H, W}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_w = at::rand({OC, C, KH, KW}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_b = at::zeros({OC}, at::device(at::kCPU).dtype(at::kFloat)); - int64_t groups = 1; - std::vector stride{1, 1}; - std::vector padding{0, 0}; - std::vector dilation{1, 1}; - - auto t_out_expected = - at::conv2d(t_in, t_w, t_b, stride, padding, dilation, groups); - auto tv_in = t_in.vulkan(); - auto tv_out = at::conv2d(tv_in, t_w, t_b, stride, padding, dilation, groups); - auto t_out = tv_out.cpu(); - bool check = almostEqual(t_out, t_out_expected); - if (!check) { - std::cout << "expected:\n" << t_out_expected << std::endl; - std::cout << "got:\n" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, conv2dDWWeightsOnCPU) { - if (!at::is_vulkan_available()) - return; - auto C = 3; - int64_t groups = C; - int64_t H = 3; - int64_t W = 3; - int64_t KH = 2; - int64_t KW = 2; - auto t_in = at::rand({1, C, H, W}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_w = - at::rand({groups, 1, KH, KW}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_b = at::zeros({groups}, at::device(at::kCPU).dtype(at::kFloat)); - std::vector stride{1, 1}; - std::vector padding{0, 0}; - std::vector dilation{1, 1}; - auto t_out_expected = - at::conv2d(t_in, t_w, t_b, stride, padding, dilation, groups); - auto tv_in = t_in.vulkan(); - auto tv_out = at::conv2d(tv_in, t_w, t_b, stride, padding, dilation, groups); - auto t_out = tv_out.cpu(); - bool check = almostEqual(t_out_expected, t_out); - if (!check) { - std::cout << "expected:\n" << t_out_expected << std::endl; - std::cout << "got:\n" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, addmm) { - if (!at::is_vulkan_available()) - return; - auto t_m1 = at::rand({2, 2}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_m2 = at::rand({2, 3}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_b = at::rand({2, 3}, at::device(at::kCPU).dtype(at::kFloat)); - - float beta = 100; - float alpha = 2; - auto t_out_expected = at::addmm(t_b, t_m1, t_m2, beta, alpha); - - auto tv_m1 = t_m1.vulkan(); - auto tv_m2 = t_m2.vulkan(); - auto tv_b = t_b.vulkan(); - auto tv_out = at::addmm(tv_b, tv_m1, tv_m2, beta, 
alpha); - auto t_out = tv_out.cpu(); - bool check = almostEqual(t_out_expected, t_out); - if (!check) { - std::cout << "expected:\n" << t_out_expected << std::endl; - std::cout << "got:\n" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, mm) { - if (!at::is_vulkan_available()) - return; - auto t_m1 = at::rand({10, 20}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_m2 = at::rand({20, 30}, at::device(at::kCPU).dtype(at::kFloat)); - - auto t_out_expected = t_m1.mm(t_m2); - - auto tv_m1 = t_m1.vulkan(); - auto tv_m2 = t_m2.vulkan(); - auto tv_out = tv_m1.mm(tv_m2); - auto t_out = tv_out.cpu(); - bool check = almostEqual(t_out_expected, t_out); - if (!check) { - std::cout << "expected:\n" << t_out_expected << std::endl; - std::cout << "got:\n" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, clamp) { - if (!at::is_vulkan_available()) - return; - float min = -0.5; - float max = 0.5; - auto t_in = at::rand({1, 3, 16, 16}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_out_expected = at::clamp(t_in, min, max); - - auto tv_in = t_in.vulkan(); - auto tv_out = at::clamp(tv_in, min, max); - auto t_out = tv_out.cpu(); - - ASSERT_TRUE(almostEqual(t_out, t_out_expected)); -} - -TEST(VulkanTest, hardtanh_) { - if (!at::is_vulkan_available()) - return; - float min = -0.5; - float max = 0.5; - auto t_in = at::rand({1, 3, 16, 16}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_out_expected = at::hardtanh_(t_in, min, max); - - auto tv_in = t_in.vulkan(); - auto tv_out = at::hardtanh_(tv_in, min, max); - auto t_out = tv_out.cpu(); - - ASSERT_TRUE(almostEqual(t_out, t_out_expected)); -} - -TEST(VulkanTest, relu_) { - if (!at::is_vulkan_available()) - return; - auto t = at::empty({1, 2, 2, 2}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_in = t.uniform_(-1, 1); - auto tv_in = t_in.vulkan(); - - t_in.relu_(); - tv_in.relu_(); - auto tv_out = tv_in.cpu(); - bool check = almostEqual(t_in, tv_out); - if (!check) { - std::cout << "expected:\n" << t_in << std::endl; - std::cout << "got:\n" << tv_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, mean) { - if (!at::is_vulkan_available()) - return; - auto t_in = at::rand({2, 3, 3, 3}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_out_expected = at::mean(t_in, {2, 3}, false); - auto tv_in = t_in.vulkan(); - auto tv_out = at::mean(tv_in, {2, 3}, false); - auto t_out = tv_out.cpu(); - bool check = almostEqual(t_out_expected, t_out); - if (!check) { - std::cout << "expected:\n" << t_out_expected << std::endl; - std::cout << "got:\n" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -enum class OpType { conv2d, hardtanh_, mean, addmm }; - -class BaseOp { - public: - BaseOp(OpType t) : type(t) {} - virtual ~BaseOp() = default; - virtual at::Tensor run(at::Tensor&) = 0; - virtual std::string toString() = 0; - OpType type; -}; - -class Hardtanh_ : public BaseOp { - public: - Hardtanh_() : BaseOp(OpType::hardtanh_) {} - at::Tensor run(at::Tensor& t) override { - return at::hardtanh_(t, 0, 6); - } - std::string toString() override { - return "hardtanh_"; - } -}; - -class Mean : public BaseOp { - public: - Mean() : BaseOp(OpType::mean) {} - at::Tensor run(at::Tensor& t) override { - return at::mean(t, {2, 3}, false); - } - std::string toString() override { - return "mean"; - } -}; - -class Addmm : public BaseOp { - public: - Addmm(int64_t m1H, int64_t m1W, int64_t m2W, float _beta, float _alpha) - : BaseOp(OpType::addmm), beta(_beta), alpha(_alpha) { - m2 = at::rand( - c10::IntArrayRef({m1W, 
m2W}), at::device(at::kCPU).dtype(at::kFloat)); - m2v = m2.vulkan(); - b = at::rand( - c10::IntArrayRef({m1H, m2W}), at::device(at::kCPU).dtype(at::kFloat)); - bv = b.vulkan(); - } - - at::Tensor run(at::Tensor& t) override { - if (t.is_vulkan()) { - return at::addmm(bv, t, m2v, beta, alpha); - } - return at::addmm(b, t, m2, beta, alpha); - } - - std::string toString() override { - return "addmm"; - } - - at::Tensor m2; - at::Tensor m2v; - at::Tensor b; - at::Tensor bv; - float beta; - float alpha; -}; - -class Conv2d : public BaseOp { - public: - Conv2d(c10::IntArrayRef wsizes, int64_t g, int64_t s, int64_t p) - : BaseOp(OpType::conv2d), stride(s), padding(p), groups(g) { - w = at::rand(wsizes, at::device(at::kCPU).dtype(at::kFloat)); - b = at::zeros(wsizes[0], at::device(at::kCPU).dtype(at::kFloat)); - }; - - at::Tensor run(at::Tensor& t) override { - return at::conv2d(t, w, b, {stride}, {padding}, {1}, groups); - } - std::string toString() override { - return "conv2d"; - } - - int64_t stride; - int64_t padding; - int64_t groups; - at::Tensor w; - at::Tensor b; -}; - -class OpsList { - public: - OpsList() = default; - OpsList(std::vector>& _ops) : ops(std::move(_ops)) {} - - auto runDual(at::Tensor& in, at::Tensor& vin) { - at::Tensor t = in; - at::Tensor tv = vin; - int i = 0; - for (const auto& op : ops) { - t = op->run(t); - tv = op->run(tv); - auto tv_cpu = t.cpu(); - TORCH_INTERNAL_ASSERT( - almostEqual(t, tv_cpu), - "Not almost equal cpu vs vulkan op i:", - i, - " ", - op->toString()); - i++; - } - return std::make_pair(t, tv); - } - - auto run(at::Tensor& in) { - at::Tensor t = in; - int i = 0; - for (const auto& op : ops) { - t = op->run(t); - i++; - } - return t; - } - - std::vector> ops; -}; - -class MobileNetV2 : public OpsList { - public: - MobileNetV2() { - ops.emplace_back(new Conv2d({32, 3, 3, 3}, 1, 2, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({32, 1, 3, 3}, 32, 1, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({16, 32, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Conv2d({96, 16, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({96, 1, 3, 3}, 96, 2, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({24, 96, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Conv2d({144, 24, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({144, 1, 3, 3}, 144, 1, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({24, 144, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Conv2d({144, 24, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({144, 1, 3, 3}, 144, 2, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({32, 144, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Conv2d({192, 32, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({192, 1, 3, 3}, 192, 1, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({32, 192, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Conv2d({192, 32, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({192, 1, 3, 3}, 192, 1, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({32, 192, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Conv2d({192, 32, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({192, 1, 3, 3}, 192, 2, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({64, 192, 1, 1}, 1, 1, 0)); - 
ops.emplace_back(new Conv2d({384, 64, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({384, 1, 3, 3}, 384, 1, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({64, 384, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Conv2d({384, 64, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({384, 1, 3, 3}, 384, 1, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({64, 384, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Conv2d({384, 64, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({384, 1, 3, 3}, 384, 1, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({64, 384, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Conv2d({384, 64, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({384, 1, 3, 3}, 384, 1, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({96, 384, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Conv2d({576, 96, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({576, 1, 3, 3}, 576, 1, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({96, 576, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Conv2d({576, 96, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({576, 1, 3, 3}, 576, 1, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({96, 576, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Conv2d({576, 96, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({576, 1, 3, 3}, 576, 2, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({160, 576, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Conv2d({960, 160, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({960, 1, 3, 3}, 960, 1, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({160, 960, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Conv2d({960, 160, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({960, 1, 3, 3}, 960, 1, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({160, 960, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Conv2d({960, 160, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({960, 1, 3, 3}, 960, 1, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({320, 960, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Conv2d({1280, 320, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Mean()); - ops.emplace_back(new Addmm(1, 1280, 1000, 0, 1)); - } -}; - -TEST(VulkanTest, DISABLED_mobilenetv2) { - if (!at::is_vulkan_available()) - return; - - MobileNetV2 mn2{}; - auto t_in = - at::rand({1, 3, 224, 224}, at::device(at::kCPU).dtype(at::kFloat)); - auto tv_in = t_in.vulkan(); - mn2.runDual(t_in, tv_in); -} - -TEST(VulkanTest, OpsList) { - if (!at::is_vulkan_available()) - return; - - std::vector> ops; - ops.emplace_back(new Conv2d({32, 3, 3, 3}, 1, 2, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({32, 1, 3, 3}, 32, 1, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({16, 32, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Conv2d({96, 16, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({96, 1, 3, 3}, 96, 2, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({24, 96, 1, 1}, 1, 
1, 0)); - ops.emplace_back(new Conv2d({144, 24, 1, 1}, 1, 1, 0)); // 1, 144, 56, 56 - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Mean()); - ops.emplace_back(new Addmm(1, 144, 1000, 0, 1)); - OpsList opsList(ops); - auto t_in = - at::rand({1, 3, 224, 224}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_out_expected = opsList.run(t_in); - - auto tv_in = t_in.vulkan(); - - auto tv_out = opsList.run(t_in); - auto t_out = tv_out.cpu(); - - ASSERT_TRUE(almostEqual(t_out, t_out_expected)); -} - -template -inline std::vector makeStack(Inputs&&... inputs) { - return {std::forward(inputs)...}; -} - -template -inline std::vector callOpByHandle( - const c10::OperatorHandle& op, - Args... args) { - auto stack = makeStack(std::forward(args)...); - c10::Dispatcher::singleton().callBoxed(op, &stack); - return stack; -} - -template -inline std::vector callOpByName( - const char* func_name, - const char* overload_name, - Args... args) { - const c10::optional op_handle = - c10::Dispatcher::singleton().findSchema({func_name, overload_name}); - assert(op_handle.has_value()); - return callOpByHandle(op_handle.value(), std::forward(args)...); -} - -TEST(VulkanTest, conv2dPrepack) { - if (!at::is_vulkan_available()) - return; - auto OC = 2; - auto C = 3; - int64_t groups = 1; - auto t_in = at::rand({1, C, 3, 3}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_w = at::rand({OC, C, 2, 2}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_b = at::zeros({OC}, at::device(at::kCPU).dtype(at::kFloat)); - - std::vector stride{1, 1}; - std::vector padding{0, 0}; - std::vector dilation{1, 1}; - float output_min = 0.25; - float output_max = 1.0; - - auto t_out_conv2d = - at::conv2d(t_in, t_w, t_b, stride, padding, dilation, groups); - auto t_out_expected = at::clamp(t_out_conv2d, output_min, output_max); - - auto tv_in = t_in.vulkan(); - auto tv_out_conv2d = - at::conv2d(tv_in, t_w, t_b, stride, padding, dilation, groups); - auto tv_out = at::clamp(tv_out_conv2d, output_min, output_max); - - auto t_out = tv_out.cpu(); - bool no_prepack_check = almostEqual(t_out, t_out_expected); - if (!no_prepack_check) { - std::cout << "t_out_expected:\n" << t_out_expected << std::endl; - std::cout << "t_out:\n" << t_out << std::endl; - } - ASSERT_TRUE(no_prepack_check); - - auto prepack = callOpByName( - "vulkan_prepack::conv2d_clamp_prepack", - "", - t_w, - t_b, - stride, - padding, - dilation, - groups, - output_min, - output_max); - auto tv_out_prepack_ivalues = - callOpByName("vulkan_prepack::conv2d_clamp_run", "", tv_in, prepack[0]); - auto tv_out_prepack = tv_out_prepack_ivalues[0].toTensor(); - auto t_out_prepack = tv_out_prepack.cpu(); - const auto prepack_check = almostEqual(t_out_prepack, t_out_expected); - if (!prepack_check) { - std::cout << "expected:\n" << t_out_expected << std::endl; - std::cout << "got:\n" << t_out_prepack << std::endl; - } - ASSERT_TRUE(prepack_check); -} - -TEST(VulkanTest, adaptive_avg_pool2d) { - if (!at::is_vulkan_available()) - return; - - auto t_in = - at::rand({1, 2, 7, 7}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - auto t_out_expected = at::adaptive_avg_pool2d(t_in, {3, 3}); - auto tv_in = t_in.vulkan(); - - auto tv_out = at::adaptive_avg_pool2d(tv_in, {3, 3}); - auto t_out = tv_out.cpu(); - - const auto check = almostEqual(t_out, t_out_expected); - if (!check) { - std::cout << "expected:" << t_out_expected << std::endl; - std::cout << "got:" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -// TODO: Enable when view operator for Vulkan landed -TEST(VulkanTest, 
DISABLED_adaptive_avg_pool2d_2) { - if (!at::is_vulkan_available()) - return; - - auto t_in = - at::rand({1, 1280, 7, 7}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - auto t_out_expected = at::adaptive_avg_pool2d(t_in, {1, 1}); - auto tv_in = t_in.vulkan(); - - auto tv_out = at::adaptive_avg_pool2d(tv_in, {1, 1}); - auto t_out = tv_out.cpu(); - - const auto check = almostEqual(t_out, t_out_expected); - if (!check) { - std::cout << "expected:" << t_out_expected << std::endl; - std::cout << "got:" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, reshape) { - if (!at::is_vulkan_available()) - return; - - auto t_in = - at::rand({1, 8, 1, 1}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - auto t_out_expected = at::reshape(t_in, {1, 8}); - auto tv_in = t_in.vulkan(); - auto tv_out = at::reshape(tv_in, {1, 8}); - auto t_out = tv_out.cpu(); - - const auto check = almostEqual(t_out, t_out_expected); - if (!check) { - std::cout << "expected:" << t_out_expected << std::endl; - std::cout << "got:" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, reshape2) { - if (!at::is_vulkan_available()) - return; - - auto t_in = - at::rand({1, 3, 2, 2}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - auto t_out_expected = at::reshape(t_in, {2, 3, 1, 2}); - - auto tv_in = t_in.vulkan(); - auto tv_out = at::reshape(tv_in, {2, 3, 1, 2}); - auto t_out = tv_out.cpu(); - - const auto check = almostEqual(t_out, t_out_expected); - if (!check) { - std::cout << "expected:" << t_out_expected << std::endl; - std::cout << "got:" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, tensor5d) { - if (!at::is_vulkan_available()) - return; - - auto t_in = - at::rand({2, 2, 2, 3, 3}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - auto tv_in = t_in.vulkan(); -} - -TEST(VulkanTest, tensor5d_transpose) { - if (!at::is_vulkan_available()) - return; - - auto t_in = - at::empty({1, 2, 3, 2, 1}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - float* data = t_in.data_ptr(); - auto numel = t_in.numel(); - for (const auto i : c10::irange(numel)) { - // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) - data[i] = i; - } - - auto tv_in = t_in.vulkan(); - - auto t_out_expected = t_in.transpose(1, 2); - auto t_out = tv_in.transpose(1, 2).cpu(); - const auto check = almostEqual(t_out, t_out_expected); - if (!check) { - std::cout << "expected:" << t_out_expected << std::endl; - std::cout << "got:" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, view) { - if (!at::is_vulkan_available()) - return; - - auto t_in = - at::rand({2, 4, 3, 3}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - auto t_out_expected = t_in.view({2, 2, 2, 3, 3}); - auto tv_in = t_in.vulkan(); - auto tv_out = tv_in.view({2, 2, 2, 3, 3}); - auto t_out = tv_out.cpu(); - - const auto check = almostEqual(t_out, t_out_expected); - if (!check) { - std::cout << "expected:" << t_out_expected << std::endl; - std::cout << "got:" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, slice) { - if (!at::is_vulkan_available()) - return; - - auto t_in = - at::empty({1, 4, 2, 2}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - float* data = t_in.data_ptr(); - auto numel = t_in.numel(); - for (const auto i : c10::irange(numel)) { - // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) - data[i] = i; - } - - auto tv_in = t_in.vulkan(); - - auto t_out_expected = t_in.slice(1, 2, 4, 1); - auto t_out = 
tv_in.slice(1, 2, 4, 1).cpu(); - const auto check = almostEqual(t_out, t_out_expected); - if (!check) { - std::cout << "expected:" << t_out_expected << std::endl; - std::cout << "got:" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, select) { - if (!at::is_vulkan_available()) - return; - - auto t_in = - at::empty({1, 4, 2, 2}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - float* data = t_in.data_ptr(); - auto numel = t_in.numel(); - for (const auto i : c10::irange(numel)) { - // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) - data[i] = i; - } - - auto tv_in = t_in.vulkan(); - - auto t_out_expected = t_in.slice(1, 1); - auto t_out = tv_in.slice(1, 1).cpu(); - const auto check = almostEqual(t_out, t_out_expected); - if (!check) { - std::cout << "expected:" << t_out_expected << std::endl; - std::cout << "got:" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, unsqueeze) { - if (!at::is_vulkan_available()) - return; - - auto t_in = - at::empty({1, 2, 2}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - float* data = t_in.data_ptr(); - auto numel = t_in.numel(); - for (const auto i : c10::irange(numel)) { - // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) - data[i] = i; - } - - auto tv_in = t_in.vulkan(); - - auto t_out_expected = t_in.unsqueeze(1); - auto t_out = tv_in.unsqueeze(1).cpu(); - const auto check = almostEqual(t_out, t_out_expected); - if (!check) { - std::cout << "expected:" << t_out_expected << std::endl; - std::cout << "got:" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, cat) { - if (!at::is_vulkan_available()) - return; - - auto t_in0 = - at::rand({1, 1, 3, 3}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - auto t_in1 = - at::rand({1, 2, 3, 3}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - auto t_in2 = - at::rand({1, 5, 3, 3}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - - auto t_out_expected = at::cat({t_in0, t_in1, t_in2}, 1); - auto tv_out = at::cat({t_in0.vulkan(), t_in1.vulkan(), t_in2.vulkan()}, 1); - auto t_out = tv_out.cpu(); - - const auto check = almostEqual(t_out, t_out_expected); - if (!check) { - std::cout << "expected:" << t_out_expected << std::endl; - std::cout << "got:" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, DISABLED_max_pool2d) { - if (!at::is_vulkan_available()) - return; - - auto t_in = - at::rand({1, 3, 7, 7}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - auto t_out_expected = at::max_pool2d(t_in, {2, 2}, {1}, {0}, {1}); - auto tv_in = t_in.vulkan(); - - auto tv_out = at::max_pool2d(tv_in, {2, 2}, {1}, {0}, {1}); - auto t_out = tv_out.cpu(); - - const auto check = almostEqual(t_out, t_out_expected); - if (!check) { - std::cout << "expected:" << t_out_expected << std::endl; - std::cout << "got:" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, avg_pool2d) { - if (!at::is_vulkan_available()) - return; - - auto t_in = - at::rand({1, 3, 7, 7}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - auto t_out_expected = at::avg_pool2d(t_in, {2, 2}, {1}, {0}, true); - auto tv_in = t_in.vulkan(); - - auto tv_out = at::avg_pool2d(tv_in, {2, 2}, {1}, {0}, true); - auto t_out = tv_out.cpu(); - - const auto check = almostEqual(t_out, t_out_expected); - if (!check) { - std::cout << "expected:" << t_out_expected << std::endl; - std::cout << "got:" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -#endif /* USE_VULKAN_API */ diff --git 
a/aten/tools/run_tests.sh b/aten/tools/run_tests.sh index 4a724fa94008..5b0c02c2846a 100755 --- a/aten/tools/run_tests.sh +++ b/aten/tools/run_tests.sh @@ -64,6 +64,9 @@ fi if [[ -x ./cuda_cub_test ]]; then ./cuda_cub_test fi +if [[ -x ./cuda_atomic_ops_test ]]; then + ./cuda_atomic_ops_test +fi if [ "$VALGRIND" == "ON" ]; then valgrind --suppressions="$VALGRIND_SUP" --error-exitcode=1 ./basic --gtest_filter='-*CUDA' if [[ -x ./tensor_interop_test ]]; then diff --git a/benchmarks/cpp/nvfuser/CMakeLists.txt b/benchmarks/cpp/nvfuser/CMakeLists.txt index b566e6a359e9..24809c9ed18a 100644 --- a/benchmarks/cpp/nvfuser/CMakeLists.txt +++ b/benchmarks/cpp/nvfuser/CMakeLists.txt @@ -1,7 +1,9 @@ if(USE_CUDA) add_executable(nvfuser_bench - batch_norm.cpp - batch_norm_backward.cpp + batch_norm_channels_first.cpp + batch_norm_channels_first_backward.cpp + batch_norm_channels_last.cpp + batch_norm_channels_last_backward.cpp bert.cpp broadcast.cpp gelu_backward.cpp @@ -10,11 +12,15 @@ if(USE_CUDA) instance_norm.cpp layer_norm.cpp layer_norm_backward.cpp + rms_norm.cpp + rms_norm_backward.cpp lstm_cell.cpp reduction.cpp softmax.cpp softmax_backward.cpp scale_bias_relu.cpp + transpose.cpp + timm.cpp utils.cpp main.cpp) diff --git a/benchmarks/cpp/nvfuser/batch_norm.cpp b/benchmarks/cpp/nvfuser/batch_norm.cpp deleted file mode 100644 index ef6bdd667d66..000000000000 --- a/benchmarks/cpp/nvfuser/batch_norm.cpp +++ /dev/null @@ -1,252 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -#include - -#include - -#include "utils.h" - -using namespace torch::jit::fuser::cuda; - -//------------------------------------------------------------------------------ - -static void setupBatchNorm(Fusion* fusion, DataType dtype) { - TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); - - FusionGuard fg(fusion); - - const bool kTraining = true; - const float kMomentum = 0.1; - const float kEps = 1e-5; - - // setup fusion - auto input = makeContigTensor(4, dtype); - auto weight = makeContigTensor(1, dtype); - auto bias = makeContigTensor(1, dtype); - auto running_mean = makeContigTensor(1, DataType::Float); - auto running_var = makeContigTensor(1, DataType::Float); - - fusion->addInput(input); - fusion->addInput(weight); - fusion->addInput(bias); - fusion->addInput(running_mean); - fusion->addInput(running_var); - - if (dtype == DataType::Half) { - input = castOp(DataType::Float, input); - weight = castOp(DataType::Float, weight); - bias = castOp(DataType::Float, bias); - } - - auto momentum_ptr = new Double(kMomentum); - auto eps_ptr = new Double(kEps); - - auto result = batch_norm( - input, - weight, - bias, - running_mean, - running_var, - kTraining, - momentum_ptr, - eps_ptr); - - auto output = result.output; - - if (dtype == DataType::Half) { - output = castOp(DataType::Half, output); - } - - fusion->addOutput(output); -} - -static void NvFuserScheduler_BatchNorm( - benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, - DataType dtype) { - TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); - - const bool kTraining = true; - const float kMomentum = 0.1; - const float kEps = 1e-5; - - std::vector input_shape{ - benchmark_state.range(0), - benchmark_state.range(1), - benchmark_state.range(2), - benchmark_state.range(2)}; - - // inputs - at::manual_seed(0); - auto options = - at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); - auto fp32_options = - 
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor at_x = at::randn(input_shape, options); - at::Tensor at_weight = at::ones({input_shape[1]}, options); - at::Tensor at_bias = at::zeros({input_shape[1]}, options); - at::Tensor at_run_mean = at::zeros({input_shape[1]}, fp32_options); - at::Tensor at_run_var = at::ones({input_shape[1]}, fp32_options); - std::vector aten_inputs( - {at_x, at_weight, at_bias, at_run_mean, at_run_var}); - - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); - - benchmark_state.SetBytesProcessed( - int64_t(benchmark_state.iterations()) * - ((2 * (at_x.numel() + at_weight.numel() + at_bias.numel())) * - int64_t(dataTypeSize(dtype)) + - (2 * (at_run_mean.numel() + at_run_var.numel()) * - int64_t(dataTypeSize(DataType::Float))))); -} - -//------------------------------------------------------------------------------ - -static void Baseline_BatchNorm( - benchmark::State& benchmark_state, - DataType dtype) { - TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); - - const float kMomentum = 0.1; - const float kEps = 1e-5; - std::vector input_shape{ - benchmark_state.range(0), - benchmark_state.range(1), - benchmark_state.range(2), - benchmark_state.range(2)}; - - // inputs - at::manual_seed(0); - auto options = - at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); - auto fp32_options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor at_x = at::randn(input_shape, options); - at::Tensor at_weight = at::ones({input_shape[1]}, options); - at::Tensor at_bias = at::zeros({input_shape[1]}, options); - at::Tensor at_run_mean = at::zeros({input_shape[1]}, fp32_options); - at::Tensor at_run_var = at::ones({input_shape[1]}, fp32_options); - - auto ato_weight = c10::optional(at_weight); - auto ato_bias = c10::optional(at_bias); - auto ato_run_mean = c10::optional(at_run_mean); - auto ato_run_var = c10::optional(at_run_var); - - auto output = at::batch_norm( - at_x, - ato_weight, - ato_bias, - ato_run_mean, - ato_run_var, - true, - kMomentum, - kEps, - true); - - clearL2Cache(); - cudaDeviceSynchronize(); - for (auto _ : benchmark_state) { - CudaKernelTimer timer; - auto output = at::batch_norm( - at_x, - ato_weight, - ato_bias, - ato_run_mean, - ato_run_var, - true, - kMomentum, - kEps, - true); - benchmark_state.SetIterationTime(timer.elapsed() / 1000.0); - cudaDeviceSynchronize(); - clearL2Cache(); - cudaDeviceSynchronize(); - } - benchmark_state.SetBytesProcessed( - int64_t(benchmark_state.iterations()) * - ((2 * (at_x.numel() + at_weight.numel() + at_bias.numel())) * - int64_t(dataTypeSize(dtype)) + - (2 * (at_run_mean.numel() + at_run_var.numel()) * - int64_t(dataTypeSize(DataType::Float))))); -} - -//------------------------------------------------------------------------------ - -static void Baseline_BatchNorm_cuDNN_fp32(benchmark::State& benchmark_state) { - Baseline_BatchNorm(benchmark_state, DataType::Float); -} - -static void Baseline_BatchNorm_cuDNN_fp16(benchmark::State& benchmark_state) { - Baseline_BatchNorm(benchmark_state, DataType::Half); -} - -//------------------------------------------------------------------------------ - -NVFUSER_BENCHMARK_DEFINE( - NvFuserScheduler_BatchNorm_fp32, - setupBatchNorm, - NvFuserScheduler_BatchNorm, - DataType::Float); - -NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_fp32) - // ->RangeMultiplier(2) - ->Ranges({{64, 512}, {32, 128}, {2, 64}}) - ->Unit(benchmark::kMicrosecond) - ->UseManualTime(); - 
-NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_fp32) - // ->RangeMultiplier(2) - ->Ranges({{2, 64}, {2, 32}, {2, 256}}) - ->Unit(benchmark::kMicrosecond) - ->UseManualTime(); - -NVFUSER_BENCHMARK_DEFINE( - NvFuserScheduler_BatchNorm_fp16, - setupBatchNorm, - NvFuserScheduler_BatchNorm, - DataType::Half); - -NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_fp16) - // ->RangeMultiplier(2) - ->Ranges({{64, 512}, {32, 128}, {2, 128}}) - ->Unit(benchmark::kMicrosecond) - ->UseManualTime(); - -NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_fp16) - // ->RangeMultiplier(2) - ->Ranges({{2, 64}, {2, 32}, {2, 256}}) - ->Unit(benchmark::kMicrosecond) - ->UseManualTime(); - -//------------------------------------------------------------------------------ - -BENCHMARK(Baseline_BatchNorm_cuDNN_fp32) - // ->RangeMultiplier(2) - // cuDNN didn't make it to 1024 - ->Ranges({{64, 512}, {32, 128}, {2, 64}}) - ->Unit(benchmark::kMicrosecond) - ->UseManualTime(); - -BENCHMARK(Baseline_BatchNorm_cuDNN_fp32) - // ->RangeMultiplier(2) - ->Ranges({{2, 64}, {2, 32}, {2, 256}}) - ->Unit(benchmark::kMicrosecond) - ->UseManualTime(); - -BENCHMARK(Baseline_BatchNorm_cuDNN_fp16) - // ->RangeMultiplier(2) - ->Ranges({{64, 512}, {32, 128}, {2, 128}}) - ->Unit(benchmark::kMicrosecond) - ->UseManualTime(); - -BENCHMARK(Baseline_BatchNorm_cuDNN_fp16) - // ->RangeMultiplier(2) - ->Ranges({{2, 64}, {2, 32}, {2, 256}}) - ->Unit(benchmark::kMicrosecond) - ->UseManualTime(); diff --git a/benchmarks/cpp/nvfuser/batch_norm_backward.cpp b/benchmarks/cpp/nvfuser/batch_norm_backward.cpp deleted file mode 100644 index e4a9fdcb0340..000000000000 --- a/benchmarks/cpp/nvfuser/batch_norm_backward.cpp +++ /dev/null @@ -1,276 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -#include - -#include - -#include - -#include "utils.h" - -using namespace torch::jit::fuser::cuda; - -//------------------------------------------------------------------------------ - -static void setupBatchNorm_BWD(Fusion* fusion, DataType dtype) { - TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); - - FusionGuard fg(fusion); - - const bool kTraining = true; - const float kMomentum = 0.1; - const float kEps = 1e-5; - - // setup fusion - auto input = makeContigTensor(4, dtype); - auto grad_output = makeContigTensor(4, dtype); - auto weight = makeContigTensor(1, DataType::Float); - auto running_mean = makeContigTensor(1, DataType::Float); - auto running_var = makeContigTensor(1, DataType::Float); - auto save_mean = makeContigTensor(1, DataType::Float); - auto save_var = makeContigTensor(1, DataType::Float); - - fusion->addInput(input); - fusion->addInput(grad_output); - fusion->addInput(weight); - fusion->addInput(running_mean); - fusion->addInput(running_var); - fusion->addInput(save_mean); - fusion->addInput(save_var); - - if (dtype == DataType::Half) { - input = castOp(DataType::Float, input); - grad_output = castOp(DataType::Float, grad_output); - } - - auto eps_ptr = new Double(kEps); - - auto result = batch_norm_backward( - input, - grad_output, - weight, - running_mean, - running_var, - save_mean, - save_var, - kTraining, - eps_ptr, - std::vector(3, true)); - - auto grad_input = result.grad_input; - auto grad_weight = result.grad_weight; - auto grad_bias = result.grad_bias; - - if (dtype == DataType::Half) { - grad_input = castOp(DataType::Half, grad_input); - grad_weight = castOp(DataType::Half, grad_weight); - grad_bias = castOp(DataType::Half, grad_bias); - } - - fusion->addOutput(grad_input); - 
fusion->addOutput(grad_weight); - fusion->addOutput(grad_bias); -} - -static void NvFuserScheduler_BatchNorm_BWD( - benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, - DataType dtype) { - TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); - - const bool kTraining = true; - const float kEps = 1e-5; - - std::vector input_shape{ - benchmark_state.range(0), - benchmark_state.range(1), - benchmark_state.range(2), - benchmark_state.range(2)}; - - at::manual_seed(0); - auto options = - at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); - auto fp32_options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn(input_shape, options); - at::Tensor grad_out = at::randn(input_shape, options); - at::Tensor weight = at::ones({input_shape[1]}, fp32_options); - at::Tensor run_mean = at::zeros({input_shape[1]}, fp32_options); - at::Tensor run_var = at::ones({input_shape[1]}, fp32_options); - at::Tensor save_mean = at::zeros({input_shape[1]}, fp32_options); - at::Tensor save_var = at::ones({input_shape[1]}, fp32_options); - - std::vector aten_inputs( - {input, grad_out, weight, run_mean, run_var, save_mean, save_var}); - - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); - - benchmark_state.SetBytesProcessed( - int64_t(benchmark_state.iterations()) * - (((3 * input.numel()) * int64_t(dataTypeSize(dtype))) + - (run_mean.numel() + run_var.numel() + save_mean.numel() + - save_var.numel() + weight.numel()) * - int64_t(dataTypeSize(DataType::Float)))); -} - -//------------------------------------------------------------------------------ - -static void Baseline_BatchNorm_BWD( - benchmark::State& benchmark_state, - DataType dtype) { - TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); - - const float kMomentum = 0.1; - const float kEps = 1e-5; - std::vector input_shape{ - benchmark_state.range(0), - benchmark_state.range(1), - benchmark_state.range(2), - benchmark_state.range(2)}; - - at::manual_seed(0); - auto options = - at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); - auto fp32_options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn(input_shape, options); - at::Tensor grad_out = at::randn(input_shape, options); - at::Tensor weight = at::ones({input_shape[1]}, fp32_options); - at::Tensor bias = at::zeros({input_shape[1]}, fp32_options); - at::Tensor run_mean = at::zeros({input_shape[1]}, fp32_options); - at::Tensor run_var = at::ones({input_shape[1]}, fp32_options); - at::Tensor save_mean = at::zeros({input_shape[1]}, fp32_options); - at::Tensor save_var = at::ones({input_shape[1]}, fp32_options); - - auto ato_weight = c10::optional(weight); - auto ato_bias = c10::optional(bias); - auto ato_run_mean = c10::optional(run_mean); - auto ato_run_var = c10::optional(run_var); - auto ato_save_mean = c10::optional(save_mean); - auto ato_save_var = c10::optional(save_var); - - auto fwd_result = at::_ops::_batch_norm_impl_index::call( - input, - ato_weight, - ato_bias, - ato_run_mean, - ato_run_var, - true, - kMomentum, - kEps, - true); - cudaDeviceSynchronize(); - - // Sync everything up before we start - clearL2Cache(); - cudaDeviceSynchronize(); - for (auto _ : benchmark_state) { - CudaKernelTimer timer; - - at::_ops::cudnn_batch_norm_backward::call( - input, - grad_out, - weight, - ato_run_mean, - ato_run_var, - save_mean, - save_var, - kEps, - std::get<3>(fwd_result)); - - 
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0); - cudaDeviceSynchronize(); - clearL2Cache(); - cudaDeviceSynchronize(); - } - - benchmark_state.SetBytesProcessed( - int64_t(benchmark_state.iterations()) * - (((3 * input.numel()) * int64_t(dataTypeSize(dtype))) + - (run_mean.numel() + run_var.numel() + save_mean.numel() + - save_var.numel() + weight.numel()) * - int64_t(dataTypeSize(DataType::Float)))); -} - -//------------------------------------------------------------------------------ - -static void Baseline_BatchNorm_BWD_cuDNN_fp32( - benchmark::State& benchmark_state) { - Baseline_BatchNorm_BWD(benchmark_state, DataType::Float); -} - -static void Baseline_BatchNorm_BWD_cuDNN_fp16( - benchmark::State& benchmark_state) { - Baseline_BatchNorm_BWD(benchmark_state, DataType::Half); -} - -//------------------------------------------------------------------------------ - -NVFUSER_BENCHMARK_DEFINE( - NvFuserScheduler_BatchNorm_BWD_fp32, - setupBatchNorm_BWD, - NvFuserScheduler_BatchNorm_BWD, - DataType::Float); - -NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_BWD_fp32) - // ->RangeMultiplier(2) - ->Ranges({{64, 512}, {32, 128}, {2, 64}}) - ->Unit(benchmark::kMicrosecond) - ->UseManualTime(); - -NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_BWD_fp32) - // ->RangeMultiplier(2) - ->Ranges({{2, 64}, {2, 32}, {2, 256}}) - ->Unit(benchmark::kMicrosecond) - ->UseManualTime(); - -NVFUSER_BENCHMARK_DEFINE( - NvFuserScheduler_BatchNorm_BWD_fp16, - setupBatchNorm_BWD, - NvFuserScheduler_BatchNorm_BWD, - DataType::Half); - -NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_BWD_fp16) - // ->RangeMultiplier(2) - ->Ranges({{64, 512}, {32, 128}, {2, 128}}) - ->Unit(benchmark::kMicrosecond) - ->UseManualTime(); - -NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_BWD_fp16) - // ->RangeMultiplier(2) - ->Ranges({{2, 64}, {2, 32}, {2, 256}}) - ->Unit(benchmark::kMicrosecond) - ->UseManualTime(); - -//------------------------------------------------------------------------------ - -BENCHMARK(Baseline_BatchNorm_BWD_cuDNN_fp32) - // ->RangeMultiplier(2) - // cuDNN didn't make it to 1024 - ->Ranges({{64, 512}, {32, 128}, {2, 64}}) - ->Unit(benchmark::kMicrosecond) - ->UseManualTime(); - -BENCHMARK(Baseline_BatchNorm_BWD_cuDNN_fp32) - // ->RangeMultiplier(2) - ->Ranges({{2, 64}, {2, 32}, {2, 256}}) - ->Unit(benchmark::kMicrosecond) - ->UseManualTime(); - -BENCHMARK(Baseline_BatchNorm_BWD_cuDNN_fp16) - // ->RangeMultiplier(2) - ->Ranges({{64, 512}, {32, 128}, {2, 128}}) - ->Unit(benchmark::kMicrosecond) - ->UseManualTime(); - -BENCHMARK(Baseline_BatchNorm_BWD_cuDNN_fp16) - // ->RangeMultiplier(2) - ->Ranges({{2, 64}, {2, 32}, {2, 256}}) - ->Unit(benchmark::kMicrosecond) - ->UseManualTime(); diff --git a/benchmarks/cpp/nvfuser/batch_norm_channels_first.cpp b/benchmarks/cpp/nvfuser/batch_norm_channels_first.cpp new file mode 100644 index 000000000000..723d222516df --- /dev/null +++ b/benchmarks/cpp/nvfuser/batch_norm_channels_first.cpp @@ -0,0 +1,339 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +using namespace torch::jit::fuser::cuda; + +//------------------------------------------------------------------------------ + +static void setupBatchNorm(Fusion* fusion, DataType dtype) { + TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); + + FusionGuard fg(fusion); + + const bool kTraining = true; + const float kMomentum = 0.1; + const float kEps = 1e-5; + + // setup fusion + auto input = makeContigTensor(4, 
dtype); + auto weight = makeContigTensor(1, dtype); + auto bias = makeContigTensor(1, dtype); + auto running_mean = makeContigTensor(1, DataType::Float); + auto running_var = makeContigTensor(1, DataType::Float); + + fusion->addInput(input); + fusion->addInput(weight); + fusion->addInput(bias); + fusion->addInput(running_mean); + fusion->addInput(running_var); + + if (dtype == DataType::Half) { + input = castOp(DataType::Float, input); + weight = castOp(DataType::Float, weight); + bias = castOp(DataType::Float, bias); + } + + auto momentum_ptr = IrBuilder::create(kMomentum); + auto eps_ptr = IrBuilder::create(kEps); + + auto result = batch_norm( + input, + weight, + bias, + running_mean, + running_var, + kTraining, + momentum_ptr, + eps_ptr); + + auto output = result.output; + + if (dtype == DataType::Half) { + output = castOp(DataType::Half, output); + } + + fusion->addOutput(output); +} + +static void NvFuserScheduler_BatchNorm( + benchmark::State& benchmark_state, + FusionExecutorCache* fusion_executor_cache, + DataType dtype) { + TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); + + const bool kTraining = true; + const float kMomentum = 0.1; + const float kEps = 1e-5; + + std::vector input_shape{ + benchmark_state.range(0), + benchmark_state.range(1), + benchmark_state.range(2), + benchmark_state.range(2)}; + + // inputs + at::manual_seed(0); + auto options = + at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); + auto fp32_options = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor at_x = at::randn(input_shape, options); + at::Tensor at_weight = at::ones({input_shape[1]}, options); + at::Tensor at_bias = at::zeros({input_shape[1]}, options); + at::Tensor at_run_mean = at::zeros({input_shape[1]}, fp32_options); + at::Tensor at_run_var = at::ones({input_shape[1]}, fp32_options); + std::vector aten_inputs( + {at_x, at_weight, at_bias, at_run_mean, at_run_var}); + + runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * + ((2 * (at_x.numel() + at_weight.numel() + at_bias.numel())) * + int64_t(dataTypeSize(dtype)) + + (2 * (at_run_mean.numel() + at_run_var.numel()) * + int64_t(dataTypeSize(DataType::Float))))); +} + +//------------------------------------------------------------------------------ + +static void Baseline_BatchNorm( + benchmark::State& benchmark_state, + DataType dtype) { + TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); + + const float kMomentum = 0.1; + const float kEps = 1e-5; + std::vector input_shape{ + benchmark_state.range(0), + benchmark_state.range(1), + benchmark_state.range(2), + benchmark_state.range(2)}; + + // inputs + at::manual_seed(0); + auto options = + at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); + auto fp32_options = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor at_x = at::randn(input_shape, options); + at::Tensor at_weight = at::ones({input_shape[1]}, options); + at::Tensor at_bias = at::zeros({input_shape[1]}, options); + at::Tensor at_run_mean = at::zeros({input_shape[1]}, fp32_options); + at::Tensor at_run_var = at::ones({input_shape[1]}, fp32_options); + + auto ato_weight = c10::optional(at_weight); + auto ato_bias = c10::optional(at_bias); + auto ato_run_mean = c10::optional(at_run_mean); + auto ato_run_var = c10::optional(at_run_var); + + auto output = at::batch_norm( + at_x, + ato_weight, + 
ato_bias, + ato_run_mean, + ato_run_var, + true, + kMomentum, + kEps, + true); + + clearL2Cache(); + cudaDeviceSynchronize(); + for (auto _ : benchmark_state) { + CudaKernelTimer timer; + auto output = at::batch_norm( + at_x, + ato_weight, + ato_bias, + ato_run_mean, + ato_run_var, + true, + kMomentum, + kEps, + true); + benchmark_state.SetIterationTime(timer.elapsed() / 1000.0); + cudaDeviceSynchronize(); + clearL2Cache(); + cudaDeviceSynchronize(); + } + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * + ((2 * (at_x.numel() + at_weight.numel() + at_bias.numel())) * + int64_t(dataTypeSize(dtype)) + + (2 * (at_run_mean.numel() + at_run_var.numel()) * + int64_t(dataTypeSize(DataType::Float))))); +} + +//------------------------------------------------------------------------------ + +static void Baseline_BatchNorm_cuDNN_fp32(benchmark::State& benchmark_state) { + Baseline_BatchNorm(benchmark_state, DataType::Float); +} + +static void Baseline_BatchNorm_cuDNN_fp16(benchmark::State& benchmark_state) { + Baseline_BatchNorm(benchmark_state, DataType::Half); +} + +// Simple aliases just for names in the printed output +static void Baseline_ResNet_BatchNorm_cuDNN_fp16(benchmark::State& benchmark_state) { + Baseline_BatchNorm(benchmark_state, DataType::Half); +} + +static void Baseline_ResNext_BatchNorm_cuDNN_fp16(benchmark::State& benchmark_state) { + Baseline_BatchNorm(benchmark_state, DataType::Half); +} + +//------------------------------------------------------------------------------ + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_BatchNorm_fp32, + setupBatchNorm, + NvFuserScheduler_BatchNorm, + DataType::Float); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_fp32) + // ->RangeMultiplier(2) + ->Ranges({{64, 512}, {32, 128}, {2, 64}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_fp32) + // ->RangeMultiplier(2) + ->Ranges({{2, 64}, {2, 32}, {2, 256}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_BatchNorm_fp16, + setupBatchNorm, + NvFuserScheduler_BatchNorm, + DataType::Half); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_fp16) + // ->RangeMultiplier(2) + ->Ranges({{64, 512}, {32, 128}, {2, 128}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_fp16) + // ->RangeMultiplier(2) + ->Ranges({{2, 64}, {2, 32}, {2, 256}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +//------------------------------------------------------------------------------ + +BENCHMARK(Baseline_BatchNorm_cuDNN_fp32) + // ->RangeMultiplier(2) + // cuDNN didn't make it to 1024 + ->Ranges({{64, 512}, {32, 128}, {2, 64}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +BENCHMARK(Baseline_BatchNorm_cuDNN_fp32) + // ->RangeMultiplier(2) + ->Ranges({{2, 64}, {2, 32}, {2, 256}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +BENCHMARK(Baseline_BatchNorm_cuDNN_fp16) + // ->RangeMultiplier(2) + ->Ranges({{64, 512}, {32, 128}, {2, 128}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +BENCHMARK(Baseline_BatchNorm_cuDNN_fp16) + // ->RangeMultiplier(2) + ->Ranges({{2, 64}, {2, 32}, {2, 256}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +//------------------------------------------------------------------------------ +// RESNET and REXNEXT benchmarks + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_ResNet_BatchNorm_fp16, + setupBatchNorm, + NvFuserScheduler_BatchNorm, + 
DataType::Half); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_ResNet_BatchNorm_fp16) + ->Args({256, 64, 112}) + ->Args({256, 64, 56}) + ->Args({256, 256, 56}) + ->Args({256, 128, 56}) + ->Args({256, 128, 28}) + ->Args({256, 512, 28}) + ->Args({256, 256, 28}) + ->Args({256, 256, 14}) + ->Args({256, 1024, 14}) + ->Args({256, 512, 14}) + ->Args({256, 512, 7}) + ->Args({256, 2048, 7}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_ResNext_BatchNorm_fp16, + setupBatchNorm, + NvFuserScheduler_BatchNorm, + DataType::Half); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_ResNext_BatchNorm_fp16) + ->Args({128, 64, 112}) + ->Args({128, 128, 56}) + ->Args({128, 256, 56}) + ->Args({128, 128, 56}) + ->Args({128, 256, 28}) + ->Args({128, 512, 28}) + ->Args({128, 512, 14}) + ->Args({128, 1024, 14}) + ->Args({128, 1024, 7}) + ->Args({128, 2048, 7}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +//------------------------------------------------------------------------------ + +BENCHMARK(Baseline_ResNet_BatchNorm_cuDNN_fp16) + ->Args({256, 64, 112}) + ->Args({256, 64, 56}) + ->Args({256, 256, 56}) + ->Args({256, 128, 56}) + ->Args({256, 128, 28}) + ->Args({256, 512, 28}) + ->Args({256, 256, 28}) + ->Args({256, 256, 14}) + ->Args({256, 1024, 14}) + ->Args({256, 512, 14}) + ->Args({256, 512, 7}) + ->Args({256, 2048, 7}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +BENCHMARK(Baseline_ResNext_BatchNorm_cuDNN_fp16) + ->Args({128, 64, 112}) + ->Args({128, 128, 56}) + ->Args({128, 256, 56}) + ->Args({128, 128, 56}) + ->Args({128, 256, 28}) + ->Args({128, 512, 28}) + ->Args({128, 512, 14}) + ->Args({128, 1024, 14}) + ->Args({128, 1024, 7}) + ->Args({128, 2048, 7}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); diff --git a/benchmarks/cpp/nvfuser/batch_norm_channels_first_backward.cpp b/benchmarks/cpp/nvfuser/batch_norm_channels_first_backward.cpp new file mode 100644 index 000000000000..af2b4d145fc8 --- /dev/null +++ b/benchmarks/cpp/nvfuser/batch_norm_channels_first_backward.cpp @@ -0,0 +1,362 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +#include + +using namespace torch::jit::fuser::cuda; + +//------------------------------------------------------------------------------ + +static void setupBatchNorm_BWD(Fusion* fusion, DataType dtype) { + TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); + + FusionGuard fg(fusion); + + const bool kTraining = true; + const float kMomentum = 0.1; + const float kEps = 1e-5; + + // setup fusion + auto input = makeContigTensor(4, dtype); + auto grad_output = makeContigTensor(4, dtype); + auto weight = makeContigTensor(1, DataType::Float); + auto running_mean = makeContigTensor(1, DataType::Float); + auto running_var = makeContigTensor(1, DataType::Float); + auto save_mean = makeContigTensor(1, DataType::Float); + auto save_var = makeContigTensor(1, DataType::Float); + + fusion->addInput(input); + fusion->addInput(grad_output); + fusion->addInput(weight); + fusion->addInput(running_mean); + fusion->addInput(running_var); + fusion->addInput(save_mean); + fusion->addInput(save_var); + + if (dtype == DataType::Half) { + input = castOp(DataType::Float, input); + grad_output = castOp(DataType::Float, grad_output); + } + + auto eps_ptr = IrBuilder::create(kEps); + + auto result = batch_norm_backward( + input, + grad_output, + weight, + running_mean, + running_var, + save_mean, + save_var, + kTraining, + 
eps_ptr, + std::vector(3, true)); + + auto grad_input = result.grad_input; + auto grad_weight = result.grad_weight; + auto grad_bias = result.grad_bias; + + if (dtype == DataType::Half) { + grad_input = castOp(DataType::Half, grad_input); + grad_weight = castOp(DataType::Half, grad_weight); + grad_bias = castOp(DataType::Half, grad_bias); + } + + fusion->addOutput(grad_input); + fusion->addOutput(grad_weight); + fusion->addOutput(grad_bias); +} + +static void NvFuserScheduler_BatchNorm_BWD( + benchmark::State& benchmark_state, + FusionExecutorCache* fusion_executor_cache, + DataType dtype) { + TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); + + const bool kTraining = true; + const float kEps = 1e-5; + + std::vector input_shape{ + benchmark_state.range(0), + benchmark_state.range(1), + benchmark_state.range(2), + benchmark_state.range(2)}; + + at::manual_seed(0); + auto options = + at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); + auto fp32_options = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor input = at::randn(input_shape, options); + at::Tensor grad_out = at::randn(input_shape, options); + at::Tensor weight = at::ones({input_shape[1]}, fp32_options); + at::Tensor run_mean = at::zeros({input_shape[1]}, fp32_options); + at::Tensor run_var = at::ones({input_shape[1]}, fp32_options); + at::Tensor save_mean = at::zeros({input_shape[1]}, fp32_options); + at::Tensor save_var = at::ones({input_shape[1]}, fp32_options); + + std::vector aten_inputs( + {input, grad_out, weight, run_mean, run_var, save_mean, save_var}); + + runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * + (((3 * input.numel()) * int64_t(dataTypeSize(dtype))) + + (run_mean.numel() + run_var.numel() + save_mean.numel() + + save_var.numel() + weight.numel()) * + int64_t(dataTypeSize(DataType::Float)))); +} + +//------------------------------------------------------------------------------ + +static void Baseline_BatchNorm_BWD( + benchmark::State& benchmark_state, + DataType dtype) { + TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); + + const float kMomentum = 0.1; + const float kEps = 1e-5; + std::vector input_shape{ + benchmark_state.range(0), + benchmark_state.range(1), + benchmark_state.range(2), + benchmark_state.range(2)}; + + at::manual_seed(0); + auto options = + at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); + auto fp32_options = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor input = at::randn(input_shape, options); + at::Tensor grad_out = at::randn(input_shape, options); + at::Tensor weight = at::ones({input_shape[1]}, fp32_options); + at::Tensor bias = at::zeros({input_shape[1]}, fp32_options); + at::Tensor run_mean = at::zeros({input_shape[1]}, fp32_options); + at::Tensor run_var = at::ones({input_shape[1]}, fp32_options); + at::Tensor save_mean = at::zeros({input_shape[1]}, fp32_options); + at::Tensor save_var = at::ones({input_shape[1]}, fp32_options); + + auto ato_weight = c10::optional(weight); + auto ato_bias = c10::optional(bias); + auto ato_run_mean = c10::optional(run_mean); + auto ato_run_var = c10::optional(run_var); + auto ato_save_mean = c10::optional(save_mean); + auto ato_save_var = c10::optional(save_var); + + auto fwd_result = at::_ops::_batch_norm_impl_index::call( + input, + ato_weight, + ato_bias, + ato_run_mean, + ato_run_var, + 
true,
+      kMomentum,
+      kEps,
+      true);
+  cudaDeviceSynchronize();
+
+  // Sync everything up before we start
+  clearL2Cache();
+  cudaDeviceSynchronize();
+  for (auto _ : benchmark_state) {
+    CudaKernelTimer timer;
+
+    at::_ops::cudnn_batch_norm_backward::call(
+        input,
+        grad_out,
+        weight,
+        ato_run_mean,
+        ato_run_var,
+        save_mean,
+        save_var,
+        kEps,
+        std::get<3>(fwd_result));
+
+    benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
+    cudaDeviceSynchronize();
+    clearL2Cache();
+    cudaDeviceSynchronize();
+  }
+
+  benchmark_state.SetBytesProcessed(
+      int64_t(benchmark_state.iterations()) *
+      (((3 * input.numel()) * int64_t(dataTypeSize(dtype))) +
+       (run_mean.numel() + run_var.numel() + save_mean.numel() +
+        save_var.numel() + weight.numel()) *
+           int64_t(dataTypeSize(DataType::Float))));
+}
+
+//------------------------------------------------------------------------------
+
+static void Baseline_BatchNorm_BWD_cuDNN_fp32(
+    benchmark::State& benchmark_state) {
+  Baseline_BatchNorm_BWD(benchmark_state, DataType::Float);
+}
+
+static void Baseline_BatchNorm_BWD_cuDNN_fp16(
+    benchmark::State& benchmark_state) {
+  Baseline_BatchNorm_BWD(benchmark_state, DataType::Half);
+}
+
+// Simple aliases just for names in the printed output
+static void Baseline_ResNet_BatchNorm_BWD_cuDNN_fp16(benchmark::State& benchmark_state) {
+  Baseline_BatchNorm_BWD(benchmark_state, DataType::Half);
+}
+
+static void Baseline_ResNext_BatchNorm_BWD_cuDNN_fp16(benchmark::State& benchmark_state) {
+  Baseline_BatchNorm_BWD(benchmark_state, DataType::Half);
+}
+//------------------------------------------------------------------------------
+
+NVFUSER_BENCHMARK_DEFINE(
+    NvFuserScheduler_BatchNorm_BWD_fp32,
+    setupBatchNorm_BWD,
+    NvFuserScheduler_BatchNorm_BWD,
+    DataType::Float);
+
+NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_BWD_fp32)
+    // ->RangeMultiplier(2)
+    ->Ranges({{64, 512}, {32, 128}, {2, 64}})
+    ->Unit(benchmark::kMicrosecond)
+    ->UseManualTime();
+
+NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_BWD_fp32)
+    // ->RangeMultiplier(2)
+    ->Ranges({{2, 64}, {2, 32}, {2, 256}})
+    ->Unit(benchmark::kMicrosecond)
+    ->UseManualTime();
+
+NVFUSER_BENCHMARK_DEFINE(
+    NvFuserScheduler_BatchNorm_BWD_fp16,
+    setupBatchNorm_BWD,
+    NvFuserScheduler_BatchNorm_BWD,
+    DataType::Half);
+
+NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_BWD_fp16)
+    // ->RangeMultiplier(2)
+    ->Ranges({{64, 512}, {32, 128}, {2, 128}})
+    ->Unit(benchmark::kMicrosecond)
+    ->UseManualTime();
+
+NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_BWD_fp16)
+    // ->RangeMultiplier(2)
+    ->Ranges({{2, 64}, {2, 32}, {2, 256}})
+    ->Unit(benchmark::kMicrosecond)
+    ->UseManualTime();
+
+//------------------------------------------------------------------------------
+
+BENCHMARK(Baseline_BatchNorm_BWD_cuDNN_fp32)
+    // ->RangeMultiplier(2)
+    // cuDNN didn't make it to 1024
+    ->Ranges({{64, 512}, {32, 128}, {2, 64}})
+    ->Unit(benchmark::kMicrosecond)
+    ->UseManualTime();
+
+BENCHMARK(Baseline_BatchNorm_BWD_cuDNN_fp32)
+    // ->RangeMultiplier(2)
+    ->Ranges({{2, 64}, {2, 32}, {2, 256}})
+    ->Unit(benchmark::kMicrosecond)
+    ->UseManualTime();
+
+BENCHMARK(Baseline_BatchNorm_BWD_cuDNN_fp16)
+    // ->RangeMultiplier(2)
+    ->Ranges({{64, 512}, {32, 128}, {2, 128}})
+    ->Unit(benchmark::kMicrosecond)
+    ->UseManualTime();
+
+BENCHMARK(Baseline_BatchNorm_BWD_cuDNN_fp16)
+    // ->RangeMultiplier(2)
+    ->Ranges({{2, 64}, {2, 32}, {2, 256}})
+    ->Unit(benchmark::kMicrosecond)
+    ->UseManualTime();
+
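The SetBytesProcessed() expressions above encode a simple traffic model for the backward benchmarks: three activation-sized tensors (input, grad_output, grad_input) in the benchmark dtype plus five per-channel fp32 vectors (weight, running mean/var, saved mean/var). The standalone sketch below is not part of the patch; the file, the {N, C, HW} reading of the benchmark arguments, and the chosen argument triple are assumptions taken from the Ranges()/Args() calls in these files. It reproduces the per-iteration byte count for one fp16 backward case so the bytes/s column printed by Google Benchmark can be read as effective bandwidth.

#include <cstdint>
#include <cstdio>

int main() {
  // Example triple from the ResNet Args() lists below: N=256, C=64, H=W=112.
  const int64_t N = 256, C = 64, HW = 112;
  const int64_t elems = N * C * HW * HW;

  const int64_t half_bytes = 2;  // DataType::Half
  const int64_t float_bytes = 4; // DataType::Float

  // input + grad_output read, grad_input written: three tensors in the
  // benchmark dtype, matching the (3 * input.numel()) term above.
  const int64_t tensor_traffic = 3 * elems * half_bytes;
  // weight, running_mean/var, save_mean/var: five per-channel fp32 vectors.
  const int64_t stats_traffic = 5 * C * float_bytes;

  const int64_t bytes_per_iteration = tensor_traffic + stats_traffic;
  std::printf("bytes per iteration: %lld\n",
              static_cast<long long>(bytes_per_iteration));
  return 0;
}

Dividing that figure by the manually reported iteration time gives the effective bandwidth these *_BWD benchmarks are designed to expose.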
+//------------------------------------------------------------------------------
+// RESNET and RESNEXT benchmarks
+
+NVFUSER_BENCHMARK_DEFINE(
+    NvFuserScheduler_ResNet_BatchNorm_BWD_fp16,
+    setupBatchNorm_BWD,
+    NvFuserScheduler_BatchNorm_BWD,
+    DataType::Half);
+
+NVFUSER_BENCHMARK_RUN(NvFuserScheduler_ResNet_BatchNorm_BWD_fp16)
+    ->Args({256, 64, 112})
+    ->Args({256, 64, 56})
+    ->Args({256, 256, 56})
+    ->Args({256, 128, 56})
+    ->Args({256, 128, 28})
+    ->Args({256, 512, 28})
+    ->Args({256, 256, 28})
+    ->Args({256, 256, 14})
+    ->Args({256, 1024, 14})
+    ->Args({256, 512, 14})
+    ->Args({256, 512, 7})
+    ->Args({256, 2048, 7})
+    ->Unit(benchmark::kMicrosecond)
+    ->UseManualTime();
+
+NVFUSER_BENCHMARK_DEFINE(
+    NvFuserScheduler_ResNext_BatchNorm_BWD_fp16,
+    setupBatchNorm_BWD,
+    NvFuserScheduler_BatchNorm_BWD,
+    DataType::Half);
+
+NVFUSER_BENCHMARK_RUN(NvFuserScheduler_ResNext_BatchNorm_BWD_fp16)
+    ->Args({128, 64, 112})
+    ->Args({128, 128, 56})
+    ->Args({128, 256, 56})
+    ->Args({128, 128, 56})
+    ->Args({128, 256, 28})
+    ->Args({128, 512, 28})
+    ->Args({128, 512, 14})
+    ->Args({128, 1024, 14})
+    ->Args({128, 1024, 7})
+    ->Args({128, 2048, 7})
+    ->Unit(benchmark::kMicrosecond)
+    ->UseManualTime();
+
+//------------------------------------------------------------------------------
+
+BENCHMARK(Baseline_ResNet_BatchNorm_BWD_cuDNN_fp16)
+    ->Args({256, 64, 112})
+    ->Args({256, 64, 56})
+    ->Args({256, 256, 56})
+    ->Args({256, 128, 56})
+    ->Args({256, 128, 28})
+    ->Args({256, 512, 28})
+    ->Args({256, 256, 28})
+    ->Args({256, 256, 14})
+    ->Args({256, 1024, 14})
+    ->Args({256, 512, 14})
+    ->Args({256, 512, 7})
+    ->Args({256, 2048, 7})
+    ->Unit(benchmark::kMicrosecond)
+    ->UseManualTime();
+
+BENCHMARK(Baseline_ResNext_BatchNorm_BWD_cuDNN_fp16)
+    ->Args({128, 64, 112})
+    ->Args({128, 128, 56})
+    ->Args({128, 256, 56})
+    ->Args({128, 128, 56})
+    ->Args({128, 256, 28})
+    ->Args({128, 512, 28})
+    ->Args({128, 512, 14})
+    ->Args({128, 1024, 14})
+    ->Args({128, 1024, 7})
+    ->Args({128, 2048, 7})
+    ->Unit(benchmark::kMicrosecond)
+    ->UseManualTime();
diff --git a/benchmarks/cpp/nvfuser/batch_norm_channels_last.cpp b/benchmarks/cpp/nvfuser/batch_norm_channels_last.cpp
new file mode 100644
index 000000000000..14fde631aec0
--- /dev/null
+++ b/benchmarks/cpp/nvfuser/batch_norm_channels_last.cpp
@@ -0,0 +1,367 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+
+#include
+
+using namespace torch::jit::fuser::cuda;
+
+//------------------------------------------------------------------------------
+
+static void setupBatchNorm_nhwc(Fusion* fusion, DataType dtype) {
+  TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
+
+  FusionGuard fg(fusion);
+
+  const bool kTraining = true;
+  const float kMomentum = 0.1;
+  const float kEps = 1e-5;
+
+  // setup fusion
+  auto input = makeContigTensor(4, dtype);
+  auto weight = makeContigTensor(1, dtype);
+  auto bias = makeContigTensor(1, dtype);
+  auto running_mean = makeContigTensor(1, DataType::Float);
+  auto running_var = makeContigTensor(1, DataType::Float);
+
+  fusion->addInput(input);
+  fusion->addInput(weight);
+  fusion->addInput(bias);
+  fusion->addInput(running_mean);
+  fusion->addInput(running_var);
+
+  if (dtype == DataType::Half) {
+    input = castOp(DataType::Float, input);
+    weight = castOp(DataType::Float, weight);
+    bias = castOp(DataType::Float, bias);
+  }
+
+  auto momentum_ptr = IrBuilder::create<Double>(kMomentum);
+  auto eps_ptr = IrBuilder::create<Double>(kEps);
+
+  auto result = batch_norm(
input, + weight, + bias, + running_mean, + running_var, + kTraining, + momentum_ptr, + eps_ptr, + true); + + auto output = result.output; + + if (dtype == DataType::Half) { + output = castOp(DataType::Half, output); + } + + fusion->addOutput(output); +} + +static void NvFuserScheduler_BatchNorm_nhwc( + benchmark::State& benchmark_state, + FusionExecutorCache* fusion_executor_cache, + DataType dtype) { + TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); + + const bool kTraining = true; + const float kMomentum = 0.1; + const float kEps = 1e-5; + + std::vector input_shape{ + benchmark_state.range(0), + benchmark_state.range(2), + benchmark_state.range(2), + benchmark_state.range(1)}; + + // inputs + at::manual_seed(0); + auto options = + at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); + auto fp32_options = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor at_x = at::randn(input_shape, options); + at::Tensor at_weight = at::ones({input_shape[3]}, options); + at::Tensor at_bias = at::zeros({input_shape[3]}, options); + at::Tensor at_run_mean = at::zeros({input_shape[3]}, fp32_options); + at::Tensor at_run_var = at::ones({input_shape[3]}, fp32_options); + std::vector aten_inputs( + {at_x, at_weight, at_bias, at_run_mean, at_run_var}); + + runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * + ((2 * (at_x.numel() + at_weight.numel() + at_bias.numel())) * + int64_t(dataTypeSize(dtype)) + + (2 * (at_run_mean.numel() + at_run_var.numel()) * + int64_t(dataTypeSize(DataType::Float))))); +} + +//------------------------------------------------------------------------------ + +static void Baseline_BatchNorm_nhwc( + benchmark::State& benchmark_state, + DataType dtype) { + TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); + + const float kMomentum = 0.1; + const float kEps = 1e-5; + std::vector input_shape{ + benchmark_state.range(0), + benchmark_state.range(1), + benchmark_state.range(2), + benchmark_state.range(2)}; + + // inputs + at::manual_seed(0); + auto options = + at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); + auto fp32_options = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor at_x = at::randn(input_shape, options) + .contiguous(c10::MemoryFormat::ChannelsLast); + at::Tensor at_weight = at::ones({input_shape[1]}, options); + at::Tensor at_bias = at::zeros({input_shape[1]}, options); + at::Tensor at_run_mean = at::zeros({input_shape[1]}, fp32_options); + at::Tensor at_run_var = at::ones({input_shape[1]}, fp32_options); + + auto ato_weight = c10::optional(at_weight); + auto ato_bias = c10::optional(at_bias); + auto ato_run_mean = c10::optional(at_run_mean); + auto ato_run_var = c10::optional(at_run_var); + + auto output = at::batch_norm( + at_x, + ato_weight, + ato_bias, + ato_run_mean, + ato_run_var, + true, + kMomentum, + kEps, + true); + + clearL2Cache(); + cudaDeviceSynchronize(); + for (auto _ : benchmark_state) { + CudaKernelTimer timer; + at::_ops::_batch_norm_impl_index::call( + at_x, + ato_weight, + ato_bias, + ato_run_mean, + ato_run_var, + true, + kMomentum, + kEps, + true); + + benchmark_state.SetIterationTime(timer.elapsed() / 1000.0); + cudaDeviceSynchronize(); + clearL2Cache(); + cudaDeviceSynchronize(); + } + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * + ((2 * (at_x.numel() + 
at_weight.numel() + at_bias.numel())) * + int64_t(dataTypeSize(dtype)) + + (2 * (at_run_mean.numel() + at_run_var.numel()) * + int64_t(dataTypeSize(DataType::Float))))); +} + +//------------------------------------------------------------------------------ + +static void Baseline_BatchNorm_nhwc_cuDNN_fp32( + benchmark::State& benchmark_state) { + Baseline_BatchNorm_nhwc(benchmark_state, DataType::Float); +} + +static void Baseline_BatchNorm_nhwc_cuDNN_fp16( + benchmark::State& benchmark_state) { + Baseline_BatchNorm_nhwc(benchmark_state, DataType::Half); +} + +// Simple aliases just for names in the printed output +static void Baseline_ResNet_BatchNorm_nhwc_cuDNN_fp16(benchmark::State& benchmark_state) { + Baseline_BatchNorm_nhwc(benchmark_state, DataType::Half); +} + +static void Baseline_ResNext_BatchNorm_nhwc_cuDNN_fp16(benchmark::State& benchmark_state) { + Baseline_BatchNorm_nhwc(benchmark_state, DataType::Half); +} + +//------------------------------------------------------------------------------ + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_BatchNorm_nhwc_fp32, + setupBatchNorm_nhwc, + NvFuserScheduler_BatchNorm_nhwc, + DataType::Float); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_nhwc_fp32) + // ->RangeMultiplier(2) + ->Ranges({{64, 512}, {32, 128}, {2, 64}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_nhwc_fp32) + // ->RangeMultiplier(2) + ->Ranges({{2, 64}, {2, 32}, {2, 256}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_BatchNorm_nhwc_fp16, + setupBatchNorm_nhwc, + NvFuserScheduler_BatchNorm_nhwc, + DataType::Half); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_nhwc_fp16) + // ->RangeMultiplier(2) + ->Ranges({{64, 512}, {32, 128}, {2, 128}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_nhwc_fp16) + // ->RangeMultiplier(2) + ->Ranges({{2, 64}, {2, 32}, {2, 256}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +//------------------------------------------------------------------------------ + +BENCHMARK(Baseline_BatchNorm_nhwc_cuDNN_fp32) + // ->RangeMultiplier(2) + // cuDNN didn't make it to 1024 + ->Ranges({{64, 512}, {32, 128}, {2, 64}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +BENCHMARK(Baseline_BatchNorm_nhwc_cuDNN_fp32) + // ->RangeMultiplier(2) + ->Ranges({{2, 64}, {2, 32}, {2, 256}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +BENCHMARK(Baseline_BatchNorm_nhwc_cuDNN_fp16) + // ->RangeMultiplier(2) + ->Ranges({{64, 512}, {32, 128}, {2, 128}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +BENCHMARK(Baseline_BatchNorm_nhwc_cuDNN_fp16) + // ->RangeMultiplier(2) + ->Ranges({{2, 64}, {2, 32}, {2, 256}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +//------------------------------------------------------------------------------ +// RESNET and REXNEXT benchmarks + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_ResNet_BatchNorm_nhwc_fp16, + setupBatchNorm_nhwc, + NvFuserScheduler_BatchNorm_nhwc, + DataType::Half); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_ResNet_BatchNorm_nhwc_fp16) + ->Args({256, 64, 112}) + ->Args({256, 64, 56}) + ->Args({256, 256, 56}) + ->Args({256, 128, 56}) + ->Args({256, 128, 28}) + ->Args({256, 512, 28}) + ->Args({256, 256, 28}) + ->Args({256, 256, 14}) + ->Args({256, 1024, 14}) + ->Args({256, 512, 14}) + ->Args({256, 512, 7}) + ->Args({256, 2048, 7}) + ->Unit(benchmark::kMicrosecond) + 
->UseManualTime(); + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_ResNext_BatchNorm_nhwc_fp16, + setupBatchNorm_nhwc, + NvFuserScheduler_BatchNorm_nhwc, + DataType::Half); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_ResNext_BatchNorm_nhwc_fp16) + ->Args({128, 64, 112}) + ->Args({128, 128, 56}) + ->Args({128, 256, 56}) + ->Args({128, 128, 56}) + ->Args({128, 256, 28}) + ->Args({128, 512, 28}) + ->Args({128, 512, 14}) + ->Args({128, 1024, 14}) + ->Args({128, 1024, 7}) + ->Args({128, 2048, 7}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +// Permutation of TIMM sizes +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_TIMM_BatchNorm_nhwc_fp16, + setupBatchNorm_nhwc, + NvFuserScheduler_BatchNorm_nhwc, + DataType::Half); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_TIMM_BatchNorm_nhwc_fp16) + ->ArgsProduct( + {{8, 16, 32, 64, 128, 256}, + {24, 40, 48, 56, 72, 152, 184, 200, 368}, + {7, 14, 28, 56, 112}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_TIMM_BatchNorm_nhwc_fp16) + ->ArgsProduct( + {{128, 256, 512, 1024, 2048}, + {24, 40, 48, 56, 72, 152}, + {7, 14, 28, 56}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +//------------------------------------------------------------------------------ + +BENCHMARK(Baseline_ResNet_BatchNorm_nhwc_cuDNN_fp16) + ->Args({256, 64, 112}) + ->Args({256, 64, 56}) + ->Args({256, 256, 56}) + ->Args({256, 128, 56}) + ->Args({256, 128, 28}) + ->Args({256, 512, 28}) + ->Args({256, 256, 28}) + ->Args({256, 256, 14}) + ->Args({256, 1024, 14}) + ->Args({256, 512, 14}) + ->Args({256, 512, 7}) + ->Args({256, 2048, 7}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +BENCHMARK(Baseline_ResNext_BatchNorm_nhwc_cuDNN_fp16) + ->Args({128, 64, 112}) + ->Args({128, 128, 56}) + ->Args({128, 256, 56}) + ->Args({128, 128, 56}) + ->Args({128, 256, 28}) + ->Args({128, 512, 28}) + ->Args({128, 512, 14}) + ->Args({128, 1024, 14}) + ->Args({128, 1024, 7}) + ->Args({128, 2048, 7}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); diff --git a/benchmarks/cpp/nvfuser/batch_norm_channels_last_backward.cpp b/benchmarks/cpp/nvfuser/batch_norm_channels_last_backward.cpp new file mode 100644 index 000000000000..0660b75e3942 --- /dev/null +++ b/benchmarks/cpp/nvfuser/batch_norm_channels_last_backward.cpp @@ -0,0 +1,387 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +#include + +using namespace torch::jit::fuser::cuda; + +//------------------------------------------------------------------------------ + +static void setupBatchNorm_nhwc_BWD(Fusion* fusion, DataType dtype) { + TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); + + FusionGuard fg(fusion); + + const bool kTraining = true; + const float kMomentum = 0.1; + const float kEps = 1e-5; + + // setup fusion + auto input = makeContigTensor(4, dtype); + auto grad_output = makeContigTensor(4, dtype); + auto weight = makeContigTensor(1, DataType::Float); + auto running_mean = makeContigTensor(1, DataType::Float); + auto running_var = makeContigTensor(1, DataType::Float); + auto save_mean = makeContigTensor(1, DataType::Float); + auto save_var = makeContigTensor(1, DataType::Float); + + fusion->addInput(input); + fusion->addInput(grad_output); + fusion->addInput(weight); + fusion->addInput(running_mean); + fusion->addInput(running_var); + fusion->addInput(save_mean); + fusion->addInput(save_var); + + if (dtype == DataType::Half) { + input = castOp(DataType::Float, 
input); + grad_output = castOp(DataType::Float, grad_output); + } + + auto eps_ptr = IrBuilder::create(kEps); + + auto result = batch_norm_backward( + input, + grad_output, + weight, + running_mean, + running_var, + save_mean, + save_var, + kTraining, + eps_ptr, + std::vector(3, true), + true); + + auto grad_input = result.grad_input; + auto grad_weight = result.grad_weight; + auto grad_bias = result.grad_bias; + + if (dtype == DataType::Half) { + grad_input = castOp(DataType::Half, grad_input); + grad_weight = castOp(DataType::Half, grad_weight); + grad_bias = castOp(DataType::Half, grad_bias); + } + + fusion->addOutput(grad_input); + fusion->addOutput(grad_weight); + fusion->addOutput(grad_bias); +} + +static void NvFuserScheduler_BatchNorm_nhwc_BWD( + benchmark::State& benchmark_state, + FusionExecutorCache* fusion_executor_cache, + DataType dtype) { + TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); + + const bool kTraining = true; + const float kEps = 1e-5; + + std::vector input_shape{ + benchmark_state.range(0), + benchmark_state.range(2), + benchmark_state.range(2), + benchmark_state.range(1)}; + + at::manual_seed(0); + auto options = + at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); + auto fp32_options = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor input = at::randn(input_shape, options); + at::Tensor grad_out = at::randn(input_shape, options); + at::Tensor weight = at::ones({input_shape[3]}, fp32_options); + at::Tensor run_mean = at::zeros({input_shape[3]}, fp32_options); + at::Tensor run_var = at::ones({input_shape[3]}, fp32_options); + at::Tensor save_mean = at::zeros({input_shape[3]}, fp32_options); + at::Tensor save_var = at::ones({input_shape[3]}, fp32_options); + + std::vector aten_inputs( + {input, grad_out, weight, run_mean, run_var, save_mean, save_var}); + + runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * + (((3 * input.numel()) * int64_t(dataTypeSize(dtype))) + + (run_mean.numel() + run_var.numel() + save_mean.numel() + + save_var.numel() + weight.numel()) * + int64_t(dataTypeSize(DataType::Float)))); +} + +//------------------------------------------------------------------------------ + +static void Baseline_BatchNorm_nhwc_BWD( + benchmark::State& benchmark_state, + DataType dtype) { + TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); + + const float kMomentum = 0.1; + const float kEps = 1e-5; + std::vector input_shape{ + benchmark_state.range(0), + benchmark_state.range(1), + benchmark_state.range(2), + benchmark_state.range(2)}; + + at::manual_seed(0); + auto options = + at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); + auto fp32_options = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor input = at::randn(input_shape, options) + .contiguous(c10::MemoryFormat::ChannelsLast); + at::Tensor grad_out = at::randn(input_shape, options) + .contiguous(c10::MemoryFormat::ChannelsLast); + at::Tensor weight = at::ones({input_shape[1]}, fp32_options); + at::Tensor bias = at::zeros({input_shape[1]}, fp32_options); + at::Tensor run_mean = at::zeros({input_shape[1]}, fp32_options); + at::Tensor run_var = at::ones({input_shape[1]}, fp32_options); + at::Tensor save_mean = at::zeros({input_shape[1]}, fp32_options); + at::Tensor save_var = at::ones({input_shape[1]}, fp32_options); + + auto ato_weight = 
c10::optional(weight); + auto ato_bias = c10::optional(bias); + auto ato_run_mean = c10::optional(run_mean); + auto ato_run_var = c10::optional(run_var); + auto ato_save_mean = c10::optional(save_mean); + auto ato_save_var = c10::optional(save_var); + + auto fwd_result = at::_ops::_batch_norm_impl_index::call( + input, + ato_weight, + ato_bias, + ato_run_mean, + ato_run_var, + true, + kMomentum, + kEps, + true); + cudaDeviceSynchronize(); + + // Sync everything up before we start + clearL2Cache(); + cudaDeviceSynchronize(); + for (auto _ : benchmark_state) { + CudaKernelTimer timer; + + at::_ops::cudnn_batch_norm_backward::call( + input, + grad_out, + weight, + ato_run_mean, + ato_run_var, + save_mean, + save_var, + kEps, + std::get<3>(fwd_result)); + + benchmark_state.SetIterationTime(timer.elapsed() / 1000.0); + cudaDeviceSynchronize(); + clearL2Cache(); + cudaDeviceSynchronize(); + } + + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * + (((3 * input.numel()) * int64_t(dataTypeSize(dtype))) + + (run_mean.numel() + run_var.numel() + save_mean.numel() + + save_var.numel() + weight.numel()) * + int64_t(dataTypeSize(DataType::Float)))); +} + +//------------------------------------------------------------------------------ + +static void Baseline_BatchNorm_nhwc_BWD_cuDNN_fp32( + benchmark::State& benchmark_state) { + Baseline_BatchNorm_nhwc_BWD(benchmark_state, DataType::Float); +} + +static void Baseline_BatchNorm_nhwc_BWD_cuDNN_fp16( + benchmark::State& benchmark_state) { + Baseline_BatchNorm_nhwc_BWD(benchmark_state, DataType::Half); +} + +// Simple aliases just for names in the printed output +static void Baseline_ResNet_BatchNorm_nhwc_BWD_cuDNN_fp16(benchmark::State& benchmark_state) { + Baseline_BatchNorm_nhwc_BWD(benchmark_state, DataType::Half); +} + +static void Baseline_ResNext_BatchNorm_nhwc_BWD_cuDNN_fp16(benchmark::State& benchmark_state) { + Baseline_BatchNorm_nhwc_BWD(benchmark_state, DataType::Half); +} + +//------------------------------------------------------------------------------ + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_BatchNorm_nhwc_BWD_fp32, + setupBatchNorm_nhwc_BWD, + NvFuserScheduler_BatchNorm_nhwc_BWD, + DataType::Float); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_nhwc_BWD_fp32) + // ->RangeMultiplier(2) + ->Ranges({{64, 512}, {32, 128}, {2, 64}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_nhwc_BWD_fp32) + // ->RangeMultiplier(2) + ->Ranges({{2, 64}, {2, 32}, {2, 256}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_BatchNorm_nhwc_BWD_fp16, + setupBatchNorm_nhwc_BWD, + NvFuserScheduler_BatchNorm_nhwc_BWD, + DataType::Half); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_nhwc_BWD_fp16) + // ->RangeMultiplier(2) + ->Ranges({{64, 512}, {32, 128}, {2, 128}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_nhwc_BWD_fp16) + // ->RangeMultiplier(2) + ->Ranges({{2, 64}, {2, 32}, {2, 256}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); +//------------------------------------------------------------------------------ + +BENCHMARK(Baseline_BatchNorm_nhwc_BWD_cuDNN_fp32) + // ->RangeMultiplier(2) + ->Ranges({{64, 512}, {32, 128}, {2, 64}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +BENCHMARK(Baseline_BatchNorm_nhwc_BWD_cuDNN_fp32) + // ->RangeMultiplier(2) + ->Ranges({{2, 64}, {2, 32}, {2, 256}}) + 
->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +BENCHMARK(Baseline_BatchNorm_nhwc_BWD_cuDNN_fp16) + // ->RangeMultiplier(2) + ->Ranges({{64, 512}, {32, 128}, {2, 128}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +BENCHMARK(Baseline_BatchNorm_nhwc_BWD_cuDNN_fp16) + // ->RangeMultiplier(2) + ->Ranges({{2, 64}, {2, 32}, {2, 256}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +//------------------------------------------------------------------------------ +// RESNET and REXNEXT benchmarks + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_ResNet_BatchNorm_nhwc_BWD_fp16, + setupBatchNorm_nhwc_BWD, + NvFuserScheduler_BatchNorm_nhwc_BWD, + DataType::Half); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_ResNet_BatchNorm_nhwc_BWD_fp16) + ->Args({256, 64, 112}) + ->Args({256, 64, 56}) + ->Args({256, 256, 56}) + ->Args({256, 128, 56}) + ->Args({256, 128, 28}) + ->Args({256, 512, 28}) + ->Args({256, 256, 28}) + ->Args({256, 256, 14}) + ->Args({256, 1024, 14}) + ->Args({256, 512, 14}) + ->Args({256, 512, 7}) + ->Args({256, 2048, 7}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_ResNext_BatchNorm_nhwc_BWD_fp16, + setupBatchNorm_nhwc_BWD, + NvFuserScheduler_BatchNorm_nhwc_BWD, + DataType::Half); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_ResNext_BatchNorm_nhwc_BWD_fp16) + ->Args({128, 64, 112}) + ->Args({128, 128, 56}) + ->Args({128, 256, 56}) + ->Args({128, 128, 56}) + ->Args({128, 256, 28}) + ->Args({128, 512, 28}) + ->Args({128, 512, 14}) + ->Args({128, 1024, 14}) + ->Args({128, 1024, 7}) + ->Args({128, 2048, 7}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +// Permutation of TIMM sizes +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_TIMM_BatchNorm_nhwc_BWD_fp16, + setupBatchNorm_nhwc_BWD, + NvFuserScheduler_BatchNorm_nhwc_BWD, + DataType::Half); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_TIMM_BatchNorm_nhwc_BWD_fp16) + ->ArgsProduct( + {{8, 16, 32, 64, 128, 256}, + {24, 40, 48, 56, 72, 152, 184, 200, 368}, + {7, 14, 28, 56, 112}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_TIMM_BatchNorm_nhwc_BWD_fp16) + ->ArgsProduct( + {{128, 256, 512, 1024, 2048}, + {24, 40, 48, 56, 72, 152}, + {7, 14, 28, 56}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +//------------------------------------------------------------------------------ + +BENCHMARK(Baseline_ResNet_BatchNorm_nhwc_BWD_cuDNN_fp16) + ->Args({256, 64, 112}) + ->Args({256, 64, 56}) + ->Args({256, 256, 56}) + ->Args({256, 128, 56}) + ->Args({256, 128, 28}) + ->Args({256, 512, 28}) + ->Args({256, 256, 28}) + ->Args({256, 256, 14}) + ->Args({256, 1024, 14}) + ->Args({256, 512, 14}) + ->Args({256, 512, 7}) + ->Args({256, 2048, 7}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +BENCHMARK(Baseline_ResNext_BatchNorm_nhwc_BWD_cuDNN_fp16) + ->Args({128, 64, 112}) + ->Args({128, 128, 56}) + ->Args({128, 256, 56}) + ->Args({128, 128, 56}) + ->Args({128, 256, 28}) + ->Args({128, 512, 28}) + ->Args({128, 512, 14}) + ->Args({128, 1024, 14}) + ->Args({128, 1024, 7}) + ->Args({128, 2048, 7}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); diff --git a/benchmarks/cpp/nvfuser/bert.cpp b/benchmarks/cpp/nvfuser/bert.cpp index f8a389331ee3..f105cfe4a4e3 100644 --- a/benchmarks/cpp/nvfuser/bert.cpp +++ b/benchmarks/cpp/nvfuser/bert.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -14,7 +15,7 @@ #include -#include "utils.h" +#include using namespace 
torch::jit::fuser::cuda; @@ -36,7 +37,7 @@ static void setupDivMaxSoftmaxDropoutForward(Fusion* fusion, DataType dtype) { fusion->addInput(tv1); // TODO: should be input - auto d16 = new Double(1.0); + auto d16 = IrBuilder::create(1.0); if (is_fp16) { tv0 = castOp(DataType::Float, tv0); @@ -47,7 +48,7 @@ static void setupDivMaxSoftmaxDropoutForward(Fusion* fusion, DataType dtype) { auto tv3 = add(tv2, tv0); auto tv10 = softmax(tv3, 3); - auto dropout_tvs = dropout(tv10, new Double(0.9)); + auto dropout_tvs = dropout(tv10, IrBuilder::create(0.9)); auto tv12 = dropout_tvs.mask; auto tv14 = dropout_tvs.output; @@ -83,9 +84,9 @@ static void setupDivMaxSoftmaxDropoutBackward(Fusion* fusion, DataType dtype) { } // TODO: should be inputs - auto d32 = new Double(1.0); + auto d32 = IrBuilder::create(1.0); // fusion->addInput(d32); - auto d33 = new Double(2.0); + auto d33 = IrBuilder::create(2.0); // fusion->addInput(d33); auto tv4 = mul(tv2, tv3); @@ -252,14 +253,15 @@ static void setupBiasDropoutAddLayernormFwd(Fusion* fusion, DataType dtype) { auto tv5 = broadcast(tv4, {true, true, false}); auto tv6 = add(tv3, tv5); - auto dropout_outs = dropout(tv6, new Double(0.9)); + auto dropout_outs = dropout(tv6, IrBuilder::create(0.9)); auto tv8 = dropout_outs.output; auto tv10 = dropout_outs.mask; auto tv11 = add(tv10, tv2); - auto layer_norm_outs = layer_norm(tv11, 1, tv0, tv1, new Double(1e-5)); + auto layer_norm_outs = + layer_norm(tv11, 1, tv0, tv1, IrBuilder::create(1e-5)); auto tv14 = layer_norm_outs.output; auto tv21 = layer_norm_outs.mean; auto tv26 = layer_norm_outs.invstd; @@ -481,7 +483,7 @@ static void setupBiasDropoutAddLayernormBwd2(Fusion* fusion, DataType dtype) { tv1 = castOp(DataType::Float, tv1); tv8 = castOp(DataType::Float, tv8); } - auto d36 = mul(new Double(1.0), tv1->axis(2)->extent()); + auto d36 = mul(IrBuilder::create(1.0), tv1->axis(2)->extent()); auto d47 = unaryOp(UnaryOpType::Reciprocal, d36); auto tv9 = broadcast(tv5, {true, true, false}); @@ -583,7 +585,7 @@ static void setupBiasDropoutAddLayernormBwd3(Fusion* fusion, DataType dtype) { } // Uncertain this is the right value, but going for it anyways - auto d34 = div(new Double(1.0), tv0->axis(2)->extent()); + auto d34 = div(IrBuilder::create(1.0), tv0->axis(2)->extent()); auto tv25 = mul(tv21, tv0); auto tv26 = mul(tv25, d34); diff --git a/benchmarks/cpp/nvfuser/broadcast.cpp b/benchmarks/cpp/nvfuser/broadcast.cpp index d693ff68bf85..8411444ca96a 100644 --- a/benchmarks/cpp/nvfuser/broadcast.cpp +++ b/benchmarks/cpp/nvfuser/broadcast.cpp @@ -12,7 +12,7 @@ #include -#include "utils.h" +#include using namespace torch::jit::fuser::cuda; @@ -70,9 +70,8 @@ static void NvFuserScheduler_Broadcast( auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo(); auto executor_instance = compile_log.fusion_executor; TORCH_INTERNAL_ASSERT(compile_log.pointwise_params.has_value()); - TORCH_INTERNAL_ASSERT(compile_log.launch_constraints.has_value()); auto params = toString(compile_log.pointwise_params.value()); - auto lparams = toString(compile_log.launch_constraints.value()); + auto lparams = toString(compile_log.fusion_executor->lastLaunchParams()); benchmark_state.SetLabel(params + lparams); diff --git a/benchmarks/cpp/nvfuser/gelu_backward.cpp b/benchmarks/cpp/nvfuser/gelu_backward.cpp index 9d53d9c27593..6632ba58a236 100644 --- a/benchmarks/cpp/nvfuser/gelu_backward.cpp +++ b/benchmarks/cpp/nvfuser/gelu_backward.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -11,7 +12,7 @@ #include -#include 
"utils.h" +#include using namespace torch::jit::fuser::cuda; @@ -41,23 +42,23 @@ static void setupFusion(Fusion* fusion) { auto t5 = castOp(DataType::Float, t4); auto t6 = broadcast(t3, {true, true, false}); auto t7 = add(t6, t5); - auto t8 = mul(t7, new Double(k_079)); - auto t9 = mul(t7, new Double(k_004)); + auto t8 = mul(t7, IrBuilder::create(k_079)); + auto t9 = mul(t7, IrBuilder::create(k_004)); auto t10 = mul(t9, t7); - auto t11 = add(t10, new Int(1)); + auto t11 = add(t10, IrBuilder::create(1)); auto t12 = mul(t8, t11); auto t13 = unaryOp(UnaryOpType::Tanh, t12); - auto t14 = mul(t7, new Double(0.5)); + auto t14 = mul(t7, IrBuilder::create(0.5)); auto t15 = mul(t13, t13); auto t16 = unaryOp(UnaryOpType::Neg, t15); - auto t17 = add(t16, new Int(1)); - auto t18 = mul(t7, new Double(k_010)); + auto t17 = add(t16, IrBuilder::create(1)); + auto t18 = mul(t7, IrBuilder::create(k_010)); auto t19 = mul(t18, t7); - auto t20 = add(t19, new Double(k_079)); + auto t20 = add(t19, IrBuilder::create(k_079)); auto t21 = mul(t17, t20); auto t22 = mul(t14, t21); - auto t23 = add(t13, new Int(1)); - auto t24 = mul(t23, new Double(0.5)); + auto t23 = add(t13, IrBuilder::create(1)); + auto t24 = mul(t23, IrBuilder::create(0.5)); auto t25 = add(t22, t24); auto t26 = mul(t25, t1); diff --git a/benchmarks/cpp/nvfuser/heuristic_cache.cpp b/benchmarks/cpp/nvfuser/heuristic_cache.cpp index 22b8ec4ce972..64b1ecfb756d 100644 --- a/benchmarks/cpp/nvfuser/heuristic_cache.cpp +++ b/benchmarks/cpp/nvfuser/heuristic_cache.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -10,23 +11,10 @@ #include -#include "utils.h" +#include using namespace torch::jit::fuser::cuda; -// Make a tensor that is known to be non-contiguous of dimensionality=ndims, -// but unknown sizes -TensorView* makeSymbolicTensor(size_t ndims, DataType dtype = DataType::Float) { - return TensorViewBuilder().ndims(ndims).dtype(dtype).build(); -} - -// Make a non-contiguous tensor of compile-time known sizes -TensorView* makeConcreteTensor( - std::vector shape, - DataType dtype = DataType::Float) { - return TensorViewBuilder().shape(shape).dtype(dtype).build(); -} - static auto getLayerBackwardNormRuntime( std::unique_ptr fusion_ptr, std::unique_ptr& fec, @@ -129,7 +117,7 @@ static auto getLayerForwardNormRuntime( Fusion& fusion = *fusion_ptr.get(); const float kEps = 1e-5; - Double* eps_ptr = new Double(kEps); + Double* eps_ptr = IrBuilder::create(kEps); auto input = makeSymbolicTensor(shape.size()); fusion.addInput(input); diff --git a/benchmarks/cpp/nvfuser/heuristic_lookup.cpp b/benchmarks/cpp/nvfuser/heuristic_lookup.cpp index 22b8ec4ce972..64b1ecfb756d 100644 --- a/benchmarks/cpp/nvfuser/heuristic_lookup.cpp +++ b/benchmarks/cpp/nvfuser/heuristic_lookup.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -10,23 +11,10 @@ #include -#include "utils.h" +#include using namespace torch::jit::fuser::cuda; -// Make a tensor that is known to be non-contiguous of dimensionality=ndims, -// but unknown sizes -TensorView* makeSymbolicTensor(size_t ndims, DataType dtype = DataType::Float) { - return TensorViewBuilder().ndims(ndims).dtype(dtype).build(); -} - -// Make a non-contiguous tensor of compile-time known sizes -TensorView* makeConcreteTensor( - std::vector shape, - DataType dtype = DataType::Float) { - return TensorViewBuilder().shape(shape).dtype(dtype).build(); -} - static auto getLayerBackwardNormRuntime( std::unique_ptr fusion_ptr, std::unique_ptr& fec, @@ -129,7 +117,7 @@ 
static auto getLayerForwardNormRuntime( Fusion& fusion = *fusion_ptr.get(); const float kEps = 1e-5; - Double* eps_ptr = new Double(kEps); + Double* eps_ptr = IrBuilder::create(kEps); auto input = makeSymbolicTensor(shape.size()); fusion.addInput(input); diff --git a/benchmarks/cpp/nvfuser/instance_norm.cpp b/benchmarks/cpp/nvfuser/instance_norm.cpp index 395ac6c8c9cd..a7139c113a43 100644 --- a/benchmarks/cpp/nvfuser/instance_norm.cpp +++ b/benchmarks/cpp/nvfuser/instance_norm.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -9,16 +10,22 @@ #include -#include "utils.h" +#include using namespace torch::jit::fuser::cuda; -static void setupInstanceNorm(Fusion* fusion, DataType dtype) { +static void setupInstanceNorm( + Fusion* fusion, + DataType dtype, + bool channels_last_3d = false) { TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); FusionGuard fg(fusion); auto input = makeContigTensor(4, dtype); + if (channels_last_3d) { + input = makeContigTensor(5, dtype); + } auto weight = makeContigTensor(1, dtype); auto bias = makeContigTensor(1, dtype); auto running_mean = makeContigTensor(1, DataType::Float); @@ -39,8 +46,8 @@ static void setupInstanceNorm(Fusion* fusion, DataType dtype) { const bool kTraining = true; const float kMomentum = 0.1; const float kEps = 1e-5; - auto momentum_ptr = new Double(kMomentum); - auto eps_ptr = new Double(kEps); + auto momentum_ptr = IrBuilder::create(kMomentum); + auto eps_ptr = IrBuilder::create(kEps); auto norm = instance_norm( input, @@ -50,7 +57,8 @@ static void setupInstanceNorm(Fusion* fusion, DataType dtype) { running_var, kTraining, momentum_ptr, - eps_ptr); + eps_ptr, + channels_last_3d); auto output = unaryOp(UnaryOpType::Relu, norm.output); @@ -66,7 +74,8 @@ static void setupInstanceNorm(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_InstanceNorm( benchmark::State& benchmark_state, FusionExecutorCache* fusion_executor_cache, - DataType dtype) { + DataType dtype, + bool channels_last_3d = false) { TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); std::vector input_shape{ @@ -75,17 +84,25 @@ static void NvFuserScheduler_InstanceNorm( benchmark_state.range(1), benchmark_state.range(1)}; + std::vector input_shape_3d{ + benchmark_state.range(0), + benchmark_state.range(1), + benchmark_state.range(1), + benchmark_state.range(1), + benchmark_state.range(2)}; + // inputs at::manual_seed(0); auto options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto fp32_options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor at_x = at::randn(input_shape, options); - at::Tensor at_weight = at::ones({input_shape[1]}, options); - at::Tensor at_bias = at::zeros({input_shape[1]}, options); - at::Tensor at_mean = at::zeros({input_shape[1]}, fp32_options); - at::Tensor at_var = at::ones({input_shape[1]}, fp32_options); + at::Tensor at_x = + at::randn(channels_last_3d ? 
input_shape_3d : input_shape, options); + at::Tensor at_weight = at::ones({benchmark_state.range(2)}, options); + at::Tensor at_bias = at::zeros({benchmark_state.range(2)}, options); + at::Tensor at_mean = at::zeros({benchmark_state.range(2)}, fp32_options); + at::Tensor at_var = at::ones({benchmark_state.range(2)}, fp32_options); std::vector aten_inputs = { at_x, at_weight, at_bias, at_mean, at_var}; @@ -93,21 +110,20 @@ static void NvFuserScheduler_InstanceNorm( runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); - const size_t kSize = - input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3]; - const size_t kChannels = input_shape[1]; + const size_t kChannels = benchmark_state.range(2); // Read: x, weight, bias, running_mean, running_var // Write: y, running_mean, running_var benchmark_state.SetBytesProcessed( benchmark_state.iterations() * - ((kChannels * 2 + kSize * 2) * dataTypeSize(dtype) + + ((kChannels * 2 + at_x.numel() * 2) * dataTypeSize(dtype) + (kChannels * 2 * 2) * dataTypeSize(DataType::Float))); } static void Baseline_InstanceNorm( benchmark::State& benchmark_state, - DataType dtype) { + DataType dtype, + bool channels_last_3d = false) { TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); std::vector input_shape{ @@ -115,6 +131,14 @@ static void Baseline_InstanceNorm( benchmark_state.range(2), benchmark_state.range(1), benchmark_state.range(1)}; + std::vector input_shape_3d{ + benchmark_state.range(0), + benchmark_state.range(2), + benchmark_state.range(1), + benchmark_state.range(1), + benchmark_state.range(1), + }; + const float kMomentum = 0.1; const float kEps = 1e-5; const auto aten_dtype = data_type_to_aten(dtype); @@ -125,10 +149,15 @@ static void Baseline_InstanceNorm( at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor at_x = at::randn(input_shape, options); - at::Tensor at_weight = at::ones({input_shape[1]}, options); - at::Tensor at_bias = at::zeros({input_shape[1]}, options); - at::Tensor at_mean = at::zeros({input_shape[1]}, fp32_options); - at::Tensor at_var = at::ones({input_shape[1]}, fp32_options); + if (channels_last_3d) { + at_x = at::randn( + input_shape_3d, + options.memory_format(c10::MemoryFormat::ChannelsLast3d)); + } + at::Tensor at_weight = at::ones({benchmark_state.range(2)}, options); + at::Tensor at_bias = at::zeros({benchmark_state.range(2)}, options); + at::Tensor at_mean = at::zeros({benchmark_state.range(2)}, fp32_options); + at::Tensor at_var = at::ones({benchmark_state.range(2)}, fp32_options); auto ato_weight = c10::optional(at_weight); auto ato_bias = c10::optional(at_bias); @@ -158,15 +187,13 @@ static void Baseline_InstanceNorm( cudaDeviceSynchronize(); } - const size_t kSize = - input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3]; - const size_t kChannels = input_shape[1]; + const size_t kChannels = benchmark_state.range(2); // Read: x, weight, bias, running_mean, running_var // Write: y, running_mean, running_var benchmark_state.SetBytesProcessed( benchmark_state.iterations() * - ((kChannels * 2 + kSize * 2) * dataTypeSize(dtype) + + ((kChannels * 2 + at_x.numel() * 2) * dataTypeSize(dtype) + (kChannels * 2 * 2) * dataTypeSize(DataType::Float))); } @@ -180,6 +207,11 @@ static void Baseline_InstanceNorm_fp16(benchmark::State& benchmark_state) { Baseline_InstanceNorm(benchmark_state, DataType::Half); } +static void Baseline_InstanceNorm_fp32_channels_last_3d( + benchmark::State& benchmark_state) { + Baseline_InstanceNorm(benchmark_state, 
DataType::Float, true); +} + //------------------------------------------------------------------------------ NVFUSER_BENCHMARK_DEFINE( @@ -205,6 +237,44 @@ NVFUSER_BENCHMARK_RUN(NvFuserScheduler_InstanceNorm_fp16) ->Ranges({{8, 8}, {640, 640}, {64, 256}}) ->Unit(benchmark::kMicrosecond) ->UseManualTime(); + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_InstanceNorm3d_channels_last_fp32, + setupInstanceNorm, + NvFuserScheduler_InstanceNorm, + DataType::Float, + true); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_InstanceNorm3d_channels_last_fp32) + ->RangeMultiplier(2) + ->Ranges({{1, 8}, {128, 128}, {32, 32}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_InstanceNorm3d_channels_last_fp32) + ->RangeMultiplier(2) + ->Ranges({{1, 8}, {64, 64}, {64, 64}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_InstanceNorm3d_channels_last_fp32) + ->RangeMultiplier(2) + ->Ranges({{1, 8}, {32, 32}, {128, 128}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_InstanceNorm3d_channels_last_fp32) + ->RangeMultiplier(2) + ->Ranges({{1, 8}, {16, 16}, {256, 256}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_InstanceNorm3d_channels_last_fp32) + ->RangeMultiplier(2) + ->Ranges({{1, 8}, {4, 8}, {320, 320}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + //------------------------------------------------------------------------------ BENCHMARK(Baseline_InstanceNorm_fp32) @@ -219,4 +289,28 @@ BENCHMARK(Baseline_InstanceNorm_fp16) ->Unit(benchmark::kMicrosecond) ->UseManualTime(); +BENCHMARK(Baseline_InstanceNorm_fp32_channels_last_3d) + ->RangeMultiplier(2) + ->Ranges({{2, 8}, {128, 128}, {32, 32}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +BENCHMARK(Baseline_InstanceNorm_fp32_channels_last_3d) + ->RangeMultiplier(2) + ->Ranges({{2, 8}, {64, 64}, {64, 64}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +BENCHMARK(Baseline_InstanceNorm_fp32_channels_last_3d) + ->RangeMultiplier(2) + ->Ranges({{2, 8}, {16, 16}, {256, 256}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +BENCHMARK(Baseline_InstanceNorm_fp32_channels_last_3d) + ->RangeMultiplier(2) + ->Ranges({{2, 8}, {4, 8}, {320, 320}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + //------------------------------------------------------------------------------ diff --git a/benchmarks/cpp/nvfuser/layer_norm.cpp b/benchmarks/cpp/nvfuser/layer_norm.cpp index c4f79b2b668b..d793a45caa3c 100644 --- a/benchmarks/cpp/nvfuser/layer_norm.cpp +++ b/benchmarks/cpp/nvfuser/layer_norm.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -10,7 +11,7 @@ #include -#include "utils.h" +#include using namespace torch::jit::fuser::cuda; @@ -24,7 +25,7 @@ static void setupLayerNorm(Fusion* fusion, DataType dtype) { const int kReductionAxis = 1; const float kEps = 1e-5; - Double* eps_ptr = new Double(kEps); + Double* eps_ptr = IrBuilder::create(kEps); // setup fusion auto input = makeContigTensor(2, dtype); @@ -45,8 +46,8 @@ static void setupLayerNorm(Fusion* fusion, DataType dtype) { auto output = layer_norm_results.output; - if (dtype == DataType::Half) { - output = castOp(DataType::Half, output); + if (dtype != DataType::Float) { + output = castOp(dtype, output); } fusion->addOutput(output); @@ -89,9 +90,9 @@ static void Baseline_LayerNorm( std::vector input_shape{ benchmark_state.range(0), 
benchmark_state.range(1)}; - const int kReductionAxis = 1; + const size_t kReductionAxis = 1; std::vector norm_shape; - for (int idx = kReductionAxis; idx < input_shape.size(); ++idx) { + for (auto idx = kReductionAxis; idx < input_shape.size(); ++idx) { norm_shape.push_back(input_shape[idx]); } diff --git a/benchmarks/cpp/nvfuser/layer_norm_backward.cpp b/benchmarks/cpp/nvfuser/layer_norm_backward.cpp index 43eafcc42fb1..9e6ac1c207d1 100644 --- a/benchmarks/cpp/nvfuser/layer_norm_backward.cpp +++ b/benchmarks/cpp/nvfuser/layer_norm_backward.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -10,7 +11,7 @@ #include -#include "utils.h" +#include using namespace torch::jit::fuser::cuda; @@ -22,7 +23,7 @@ static void setupLayerNorm_BWD(Fusion* fusion, DataType dtype) { TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); const int kReductionAxis = 1; - Double* eps_ptr = new Double(1e-5); + Double* eps_ptr = IrBuilder::create(1e-5); // setup fusion auto grad_out = makeContigTensor(2, dtype); @@ -33,12 +34,12 @@ static void setupLayerNorm_BWD(Fusion* fusion, DataType dtype) { auto mean = TensorViewBuilder() .contiguity({false, false}) .shape({-1, 1}) - .dtype(dtype) + .dtype(DataType::Float) .build(); auto rstd = TensorViewBuilder() .contiguity({false, false}) .shape({-1, 1}) - .dtype(dtype) + .dtype(DataType::Float) .build(); fusion->addInput(grad_out); @@ -53,20 +54,17 @@ static void setupLayerNorm_BWD(Fusion* fusion, DataType dtype) { input = castOp(DataType::Float, input); weight = castOp(DataType::Float, weight); bias = castOp(DataType::Float, bias); - mean = castOp(DataType::Float, mean); - rstd = castOp(DataType::Float, rstd); } auto layer_norm_results = layer_norm_backward( grad_out, input, {1}, mean, rstd, weight, bias, {true, true, true}); - if (dtype == DataType::Half) { + if (dtype != DataType::Float) { layer_norm_results.grad_input = - castOp(DataType::Half, layer_norm_results.grad_input); - layer_norm_results.grad_bias = - castOp(DataType::Half, layer_norm_results.grad_bias); + castOp(dtype, layer_norm_results.grad_input); + layer_norm_results.grad_bias = castOp(dtype, layer_norm_results.grad_bias); layer_norm_results.grad_weight = - castOp(DataType::Half, layer_norm_results.grad_weight); + castOp(dtype, layer_norm_results.grad_weight); } fusion->addOutput(layer_norm_results.grad_input); @@ -85,14 +83,16 @@ static void NvFuserScheduler_LayerNorm_BWD( // inputs at::manual_seed(0); - auto options = + auto maybe_fp16_options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); - at::Tensor grad_out = at::randn(input_shape, options); - at::Tensor input = at::randn(input_shape, options); - at::Tensor weight = at::randn({input_shape[1]}, options); - at::Tensor bias = at::randn({input_shape[1]}, options); - at::Tensor mean = at::randn({input_shape[0], 1}, options); - at::Tensor rstd = at::randn({input_shape[0], 1}, options); + auto fp32_options = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor grad_out = at::randn(input_shape, maybe_fp16_options); + at::Tensor input = at::randn(input_shape, maybe_fp16_options); + at::Tensor weight = at::randn({input_shape[1]}, maybe_fp16_options); + at::Tensor bias = at::randn({input_shape[1]}, maybe_fp16_options); + at::Tensor mean = at::randn({input_shape[0], 1}, fp32_options); + at::Tensor rstd = at::randn({input_shape[0], 1}, fp32_options); std::vector aten_inputs( {grad_out, input, weight, bias, mean, rstd}); @@ -115,22 +115,24 @@ static void 
Baseline_LayerNorm_BWD( std::vector input_shape{ benchmark_state.range(0), benchmark_state.range(1)}; - const int kReductionAxis = 1; + const size_t kReductionAxis = 1; std::vector norm_shape; - for (int idx = kReductionAxis; idx < input_shape.size(); ++idx) { + for (auto idx = kReductionAxis; idx < input_shape.size(); ++idx) { norm_shape.push_back(input_shape[idx]); } // inputs at::manual_seed(0); - auto options = + auto maybe_fp16_options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); - at::Tensor grad_out = at::randn(input_shape, options); - at::Tensor input = at::randn(input_shape, options); - at::Tensor weight = at::randn({input_shape[1]}, options); - at::Tensor bias = at::randn({input_shape[1]}, options); - at::Tensor mean = at::randn({input_shape[0], 1}, options); - at::Tensor rstd = at::randn({input_shape[0], 1}, options); + auto fp32_options = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor grad_out = at::randn(input_shape, maybe_fp16_options); + at::Tensor input = at::randn(input_shape, maybe_fp16_options); + at::Tensor weight = at::randn({input_shape[1]}, maybe_fp16_options); + at::Tensor bias = at::randn({input_shape[1]}, maybe_fp16_options); + at::Tensor mean = at::randn({input_shape[0], 1}, fp32_options); + at::Tensor rstd = at::randn({input_shape[0], 1}, fp32_options); std::array output_mask = {true, true, true}; clearL2Cache(); diff --git a/benchmarks/cpp/nvfuser/lstm_cell.cpp b/benchmarks/cpp/nvfuser/lstm_cell.cpp index 65f869fac4ad..20ec7c8f4700 100644 --- a/benchmarks/cpp/nvfuser/lstm_cell.cpp +++ b/benchmarks/cpp/nvfuser/lstm_cell.cpp @@ -9,7 +9,7 @@ #include -#include "utils.h" +#include using namespace torch::jit::fuser::cuda; diff --git a/benchmarks/cpp/nvfuser/reduction.cpp b/benchmarks/cpp/nvfuser/reduction.cpp index c25097963dbc..3fd1bcb59dfc 100644 --- a/benchmarks/cpp/nvfuser/reduction.cpp +++ b/benchmarks/cpp/nvfuser/reduction.cpp @@ -12,7 +12,7 @@ #include -#include "utils.h" +#include using namespace torch::jit::fuser::cuda; @@ -66,9 +66,8 @@ static void NvFuserScheduler_Reduction( auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo(); auto executor_instance = compile_log.fusion_executor; TORCH_INTERNAL_ASSERT(compile_log.reduction_params.has_value()); - TORCH_INTERNAL_ASSERT(compile_log.launch_constraints.has_value()); auto rparams = toString(compile_log.reduction_params.value()); - auto lparams = toString(compile_log.launch_constraints.value()); + auto lparams = toString(compile_log.fusion_executor->lastLaunchParams()); benchmark_state.SetLabel(rparams + lparams); @@ -191,6 +190,18 @@ NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp32) ->Unit(benchmark::kMicrosecond) ->UseManualTime(); +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp32) + // ->RangeMultiplier(2) + ->Ranges({{1024, 1024 * 512}, {2, 4 * 1024}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp32) + // ->RangeMultiplier(2) + ->Ranges({{2, 4 * 1024}, {1024, 1024 * 512}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp16) // ->RangeMultiplier(2) ->Ranges({{1, 1024 * 1024}, {160, 320}}) @@ -215,6 +226,18 @@ NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp16) ->Unit(benchmark::kMicrosecond) ->UseManualTime(); +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp16) + // ->RangeMultiplier(2) + ->Ranges({{1024, 1024 * 1024}, {2, 4 * 1024}}) + 
->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp16) + // ->RangeMultiplier(2) + ->Ranges({{2, 4 * 1024}, {1024, 1024 * 1024}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Inner_fp32) // ->RangeMultiplier(2) ->Ranges({{1, 1024 * 1024}, {160, 320}}) diff --git a/benchmarks/cpp/nvfuser/rms_norm.cpp b/benchmarks/cpp/nvfuser/rms_norm.cpp new file mode 100644 index 000000000000..81fdf46cf818 --- /dev/null +++ b/benchmarks/cpp/nvfuser/rms_norm.cpp @@ -0,0 +1,172 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +using namespace torch::jit::fuser::cuda; + +//------------------------------------------------------------------------------ + +static void setupRMSNorm(Fusion* fusion, DataType dtype) { + TORCH_INTERNAL_ASSERT( + dtype == DataType::Float || dtype == DataType::Half || + dtype == DataType::BFloat16); + + FusionGuard fg(fusion); + + const int kReductionAxis = 2; + const float kEps = 1e-6; + + Double* eps_ptr = IrBuilder::create(kEps); + + // setup fusion + auto input = makeContigTensor(3, dtype); + auto weight = makeContigTensor(1, dtype); + + fusion->addInput(input); + fusion->addInput(weight); + + if (dtype == DataType::Half) { + input = castOp(DataType::Float, input); + weight = castOp(DataType::Float, weight); + } + + auto rms_norm_results = rms_norm(input, 1, weight, eps_ptr); + + auto output = rms_norm_results.output; + + if (dtype != DataType::Float) { + output = castOp(dtype, output); + } + + fusion->addOutput(output); +} + +static void NvFuserScheduler_RMSNorm( + benchmark::State& benchmark_state, + FusionExecutorCache* fusion_executor_cache, + DataType dtype) { + TORCH_INTERNAL_ASSERT( + dtype == DataType::Float || dtype == DataType::Half || + dtype == DataType::BFloat16); + + std::vector input_shape{8, benchmark_state.range(0), 1024}; + const float kEps = 1e-6; + + // inputs + at::manual_seed(0); + auto options = + at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); + at::Tensor input = at::randn(input_shape, options); + at::Tensor weight = at::randn({input_shape[2]}, options); + + std::vector aten_inputs({input, weight}); + + runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * + (2 * input.numel() + weight.numel()) * int64_t(dataTypeSize(dtype))); +} + +//------------------------------------------------------------------------------ + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_RMSNorm_fp32, + setupRMSNorm, + NvFuserScheduler_RMSNorm, + DataType::Float); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_fp32) + ->RangeMultiplier(2) + ->Ranges({{16, 64}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_fp32) + ->RangeMultiplier(2) + ->Ranges({{18, 56}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_fp32) + ->RangeMultiplier(2) + ->Ranges({{22, 44}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_fp32) + ->RangeMultiplier(2) + ->Ranges({{24, 48}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_RMSNorm_fp16, + setupRMSNorm, + NvFuserScheduler_RMSNorm, + DataType::Half); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_fp16) + 
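For reference, the op exercised by these RMSNorm benchmarks normalizes by the root mean square of the innermost dimension instead of by mean and variance; assuming the conventional formulation:

  // rms = sqrt(mean(x * x) + eps)        (reduction over the normalized axis)
  // y   = (x / rms) * weight             i.e. layer norm without mean subtraction
  //                                      and without a bias term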
->RangeMultiplier(2) + ->Ranges({{16, 64}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_fp16) + ->RangeMultiplier(2) + ->Ranges({{18, 56}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_fp16) + ->RangeMultiplier(2) + ->Ranges({{22, 44}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_fp16) + ->RangeMultiplier(2) + ->Ranges({{24, 48}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +// TODO: Automatically disable/enable if bf16 is supported +// NVFUSER_BENCHMARK_DEFINE( +// NvFuserScheduler_RMSNorm_bf16, +// setupRMSNorm, +// NvFuserScheduler_RMSNorm, +// DataType::BFloat16); + +// NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_bf16) +// ->RangeMultiplier(2) +// ->Ranges({{16, 64}}) +// ->Unit(benchmark::kMicrosecond) +// ->UseManualTime(); + +// NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_bf16) +// ->RangeMultiplier(2) +// ->Ranges({{18, 56}}) +// ->Unit(benchmark::kMicrosecond) +// ->UseManualTime(); + +// NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_bf16) +// ->RangeMultiplier(2) +// ->Ranges({{22, 44}}) +// ->Unit(benchmark::kMicrosecond) +// ->UseManualTime(); + +// NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_bf16) +// ->RangeMultiplier(2) +// ->Ranges({{24, 48}}) +// ->Unit(benchmark::kMicrosecond) +// ->UseManualTime(); diff --git a/benchmarks/cpp/nvfuser/rms_norm_backward.cpp b/benchmarks/cpp/nvfuser/rms_norm_backward.cpp new file mode 100644 index 000000000000..b4c6ac413c75 --- /dev/null +++ b/benchmarks/cpp/nvfuser/rms_norm_backward.cpp @@ -0,0 +1,166 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +using namespace torch::jit::fuser::cuda; + +//------------------------------------------------------------------------------ + +static void setupRMSNorm_BWD(Fusion* fusion, DataType dtype) { + FusionGuard fg(fusion); + + TORCH_INTERNAL_ASSERT( + dtype == DataType::Float || dtype == DataType::Half || + dtype == DataType::BFloat16); + + const int kReductionAxis = 2; + Double* eps_ptr = IrBuilder::create(1e-6); + + // setup fusion + auto grad_out = makeContigTensor(3, dtype); + auto input = makeContigTensor(3, dtype); + auto weight = makeContigTensor(1, dtype); + auto rstd = TensorViewBuilder() + .contiguity({false, false, false}) + .shape({-1, -1, 1}) + .dtype(dtype) + .build(); + + fusion->addInput(grad_out); + fusion->addInput(input); + fusion->addInput(weight); + fusion->addInput(rstd); + + if (dtype == DataType::Half) { + grad_out = castOp(DataType::Float, grad_out); + input = castOp(DataType::Float, input); + weight = castOp(DataType::Float, weight); + rstd = castOp(DataType::Float, rstd); + } + + auto rms_norm_results = + rms_norm_backward(grad_out, input, {1}, rstd, weight, {true, true, true}); + + if (dtype != DataType::Float) { + rms_norm_results.grad_input = castOp(dtype, rms_norm_results.grad_input); + rms_norm_results.grad_weight = castOp(dtype, rms_norm_results.grad_weight); + } + + fusion->addOutput(rms_norm_results.grad_input); + fusion->addOutput(rms_norm_results.grad_weight); +} + +static void NvFuserScheduler_RMSNorm_BWD( + benchmark::State& benchmark_state, + FusionExecutorCache* fusion_executor_cache, + DataType dtype) { + TORCH_INTERNAL_ASSERT( + dtype == DataType::Float || dtype == DataType::Half || + dtype == DataType::BFloat16); + + std::vector input_shape{8, benchmark_state.range(0), 1024}; + + // 
inputs + at::manual_seed(0); + auto options = + at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); + at::Tensor grad_out = at::randn(input_shape, options); + at::Tensor input = at::randn(input_shape, options); + at::Tensor weight = at::randn({input_shape[2]}, options); + at::Tensor rstd = at::randn({input_shape[0], input_shape[1], 1}, options); + + std::vector aten_inputs({grad_out, input, weight, rstd}); + + runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * + (3 * input.numel() + weight.numel() + rstd.numel()) * + int64_t(dataTypeSize(dtype))); +} + +//------------------------------------------------------------------------------ + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_RMSNorm_BWD_fp32, + setupRMSNorm_BWD, + NvFuserScheduler_RMSNorm_BWD, + DataType::Float); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_fp32) + ->RangeMultiplier(2) + ->Ranges({{16, 64}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_fp32) + ->RangeMultiplier(2) + ->Ranges({{28, 56}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_fp32) + ->RangeMultiplier(2) + ->Ranges({{24, 48}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_RMSNorm_BWD_fp16, + setupRMSNorm_BWD, + NvFuserScheduler_RMSNorm_BWD, + DataType::Half); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_fp16) + ->RangeMultiplier(2) + ->Ranges({{16, 64}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_fp16) + ->RangeMultiplier(2) + ->Ranges({{28, 56}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_fp16) + ->RangeMultiplier(2) + ->Ranges({{24, 48}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +// TODO: Automatically disable/enable if bf16 is supported +// NVFUSER_BENCHMARK_DEFINE( +// NvFuserScheduler_RMSNorm_BWD_bf16, +// setupRMSNorm_BWD, +// NvFuserScheduler_RMSNorm_BWD, +// DataType::BFloat16); + +// NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_bf16) +// ->RangeMultiplier(2) +// ->Ranges({{16, 64}}) +// ->Unit(benchmark::kMicrosecond) +// ->UseManualTime(); + +// NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_bf16) +// ->RangeMultiplier(2) +// ->Ranges({{28, 56}}) +// ->Unit(benchmark::kMicrosecond) +// ->UseManualTime(); + +// NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_bf16) +// ->RangeMultiplier(2) +// ->Ranges({{24, 48}}) +// ->Unit(benchmark::kMicrosecond) +// ->UseManualTime(); diff --git a/benchmarks/cpp/nvfuser/scale_bias_relu.cpp b/benchmarks/cpp/nvfuser/scale_bias_relu.cpp index 47ed9047f159..6bb7fc18aa0b 100644 --- a/benchmarks/cpp/nvfuser/scale_bias_relu.cpp +++ b/benchmarks/cpp/nvfuser/scale_bias_relu.cpp @@ -8,7 +8,7 @@ #include -#include "utils.h" +#include using namespace torch::jit::fuser::cuda; @@ -136,9 +136,8 @@ static void NvFuserScheduler_SBR( auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo(); auto executor_instance = compile_log.fusion_executor; TORCH_INTERNAL_ASSERT(compile_log.pointwise_params.has_value()); - TORCH_INTERNAL_ASSERT(compile_log.launch_constraints.has_value()); auto params = toString(compile_log.pointwise_params.value()); - auto lparams = toString(compile_log.launch_constraints.value()); + auto lparams = 
toString(compile_log.fusion_executor->lastLaunchParams()); benchmark_state.SetLabel(params + lparams); benchmark_state.SetLabel(lparams); @@ -240,9 +239,8 @@ static void NvFuserScheduler_SBR_Norm( auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo(); auto executor_instance = compile_log.fusion_executor; TORCH_INTERNAL_ASSERT(compile_log.pointwise_params.has_value()); - TORCH_INTERNAL_ASSERT(compile_log.launch_constraints.has_value()); auto params = toString(compile_log.pointwise_params.value()); - auto lparams = toString(compile_log.launch_constraints.value()); + auto lparams = toString(compile_log.fusion_executor->lastLaunchParams()); benchmark_state.SetLabel(params + lparams); diff --git a/benchmarks/cpp/nvfuser/shape_inference.cpp b/benchmarks/cpp/nvfuser/shape_inference.cpp index 33a9404b0739..2e5e23ed7442 100644 --- a/benchmarks/cpp/nvfuser/shape_inference.cpp +++ b/benchmarks/cpp/nvfuser/shape_inference.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -10,27 +11,10 @@ #include -#include "utils.h" +#include using namespace torch::jit::fuser::cuda; -namespace { - -// Make a tensor that is known to be non-contiguous of dimensionality=ndims, -// but unknown sizes -TensorView* makeSymbolicTensor(size_t ndims, DataType dtype = DataType::Float) { - return TensorViewBuilder().ndims(ndims).dtype(dtype).build(); -} - -// Make a non-contiguous tensor of compile-time known sizes -TensorView* makeConcreteTensor( - std::vector shape, - DataType dtype = DataType::Float) { - return TensorViewBuilder().shape(shape).dtype(dtype).build(); -} - -} // namespace - static auto getLayerBackwardNormRuntime( std::unique_ptr fusion_ptr, std::unique_ptr& fec, @@ -151,7 +135,7 @@ static auto getLayerForwardNormRuntime( Fusion& fusion = *fusion_ptr.get(); const float kEps = 1e-5; - Double* eps_ptr = new Double(kEps); + Double* eps_ptr = IrBuilder::create(kEps); auto input = makeSymbolicTensor(shape.size()); fusion.addInput(input); diff --git a/benchmarks/cpp/nvfuser/softmax.cpp b/benchmarks/cpp/nvfuser/softmax.cpp index 3964e03671fa..439e426220f8 100644 --- a/benchmarks/cpp/nvfuser/softmax.cpp +++ b/benchmarks/cpp/nvfuser/softmax.cpp @@ -11,7 +11,7 @@ #include -#include "utils.h" +#include using namespace torch::jit::fuser::cuda; @@ -87,7 +87,7 @@ static void Softmax_WarpReduceReference(benchmark::State& benchmark_state) { std::vector aten_inputs({aten_input}); // Schedule through magic scheduler: - auto runtime_info = SchedulerRuntimeInfo(fusion, aten_inputs, true); + SchedulerRuntimeInfo runtime_info(fusion, aten_inputs, true); TORCH_INTERNAL_ASSERT(SchedulerEntry::canSchedule( ScheduleHeuristic::Persistent, fusion, runtime_info)); auto scheduler = SchedulerEntry::makeEntry( @@ -132,7 +132,7 @@ static void Softmax_WarpReduce(benchmark::State& benchmark_state) { std::vector aten_inputs({aten_input}); // Schedule through magic scheduler: - auto runtime_info = SchedulerRuntimeInfo(fusion, aten_inputs, true); + SchedulerRuntimeInfo runtime_info(fusion, aten_inputs, true); TORCH_INTERNAL_ASSERT(SchedulerEntry::canSchedule( ScheduleHeuristic::Persistent, fusion, runtime_info)); auto scheduler = SchedulerEntry::makeEntry( diff --git a/benchmarks/cpp/nvfuser/softmax_backward.cpp b/benchmarks/cpp/nvfuser/softmax_backward.cpp index 1bf2e623291a..8fb35083c6dc 100644 --- a/benchmarks/cpp/nvfuser/softmax_backward.cpp +++ b/benchmarks/cpp/nvfuser/softmax_backward.cpp @@ -11,7 +11,7 @@ #include -#include "utils.h" +#include using namespace torch::jit::fuser::cuda; diff 
--git a/benchmarks/cpp/nvfuser/softmax_dropout.cpp b/benchmarks/cpp/nvfuser/softmax_dropout.cpp index b4890eaf8d8a..48950373731c 100644 --- a/benchmarks/cpp/nvfuser/softmax_dropout.cpp +++ b/benchmarks/cpp/nvfuser/softmax_dropout.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -11,7 +12,7 @@ #include -#include "utils.h" +#include using namespace torch::jit::fuser::cuda; @@ -35,7 +36,7 @@ static void setupSoftmaxDropout( auto attention_scores = makeContigTensor(4, dtype); auto attention_mask = makeContigTensor(4, dtype); - Double* divisor = new Double(); + Double* divisor = IrBuilder::create(); fusion->addInput(attention_scores); fusion->addInput(attention_mask); @@ -49,8 +50,8 @@ static void setupSoftmaxDropout( attention_scores = div(attention_scores, divisor); attention_scores = add(attention_scores, attention_mask); auto attention_probs = softmax(attention_scores, kReductionAxis); - auto prob = new Double(kDropoutProbability); - auto scale = new Double(kScale); + auto prob = IrBuilder::create(kDropoutProbability); + auto scale = IrBuilder::create(kScale); auto dropout_results = dropout(attention_probs, prob, scale); auto output = dropout_results.output; diff --git a/benchmarks/cpp/nvfuser/timm.cpp b/benchmarks/cpp/nvfuser/timm.cpp new file mode 100644 index 000000000000..e7e9d22e8c95 --- /dev/null +++ b/benchmarks/cpp/nvfuser/timm.cpp @@ -0,0 +1,741 @@ +#include +#include +#include +#include + +#include + +#include + +#include + +using namespace torch::jit::fuser::cuda; + +static void setup_vit_base_patch16_224_bcast7(Fusion* fusion, void* null) { + FusionGuard fg(fusion); + + auto t2 = makeContigTensor(3, DataType::Float); + auto t3 = TensorViewBuilder() + .shape({-1, -1, 1}) + .dtype(DataType::Float) + .contiguity({true, true, false}) + .build(); + auto t4 = TensorViewBuilder() + .shape({-1, -1, 1}) + .dtype(DataType::Float) + .contiguity({true, true, false}) + .build(); + auto t7 = makeContigTensor(3, DataType::Half); + + fusion->addInput(t2); + fusion->addInput(t3); + fusion->addInput(t4); + fusion->addInput(t7); + + auto t8 = castOp(DataType::Float, t7); + auto t9 = set(t8); + auto t10 = sub(t2, t3); + auto t11 = mul(t10, t4); + auto t25 = mul(t9, t11); + auto t26 = sum(t25, {0, 1}); + auto t36 = set(t26); + auto t27 = sum(t9, {0, 1}); + auto t37 = set(t27); + auto t39 = castOp(DataType::Half, t11); + + fusion->addOutput(t36); + fusion->addOutput(t37); + fusion->addOutput(t39); +} + +static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast7( + benchmark::State& benchmark_state, + FusionExecutorCache* fusion_executor_cache, + void* null) { + std::vector input_shape{ + benchmark_state.range(0), + benchmark_state.range(1), + benchmark_state.range(2)}; + + at::manual_seed(0); + auto fp16_options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); + auto fp32_options = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + auto t2 = at::randn(input_shape, fp32_options); + auto t3 = at::randn({input_shape[0], input_shape[1], 1}, fp32_options); + auto t4 = at::randn({input_shape[0], input_shape[1], 1}, fp32_options); + auto t7 = at::randn(input_shape, fp16_options); + + std::vector aten_inputs({t2, t3, t4, t7}); + runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + + // full tensor - float + halfx2 - t2, t7, t39 + // Inner most dimension only - floatx2 - t36, t37 + // Outer two dimensions only - floatx2 - t3, t4 + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * + // t2 + 
t7 t3 + t4 t36 + t37 + t2.numel() * (4 + 2) + t3.numel() * 4 * 2 + input_shape[2] * (4 * 2) + + // T39 + t2.numel() * 2); +} + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_bcast7, + setup_vit_base_patch16_224_bcast7, + NvFuserScheduler_TIMM_vit_base_patch16_224_bcast7, + nullptr); + +// pwise case, broadcasting both sides +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_bcast7) + ->Args({64, 197, 768}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +static void setup_vit_base_patch16_224_bcast5(Fusion* fusion, void* null) { + FusionGuard fg(fusion); + + auto t2 = makeContigTensor(3, DataType::Float); + auto t5 = makeContigTensor(1, DataType::Float); + auto t3 = makeContigTensor(3, DataType::Half); + auto t0 = makeContigTensor(1, DataType::Float); + auto t1 = makeContigTensor(1, DataType::Float); + + fusion->addInput(t2); + fusion->addInput(t5); + fusion->addInput(t3); + fusion->addInput(t0); + fusion->addInput(t1); + + std::vector bcast_pattern0({true, true, false}); + std::vector bcast_pattern1({false, false, true}); + + auto t4 = castOp(DataType::Float, t3); + auto t6 = set(t5); + auto t7 = broadcast(t6, bcast_pattern0); + auto t8 = add(t4, t7); + auto t9 = randlike(t8); + auto d34 = + sub(IrBuilder::create(1.0), IrBuilder::create(0.0)); + auto t10 = lt(t9, d34); + auto t11 = castOp(DataType::Float, t10); + auto t12 = mul(t8, t11); + auto b36 = eq(d34, IrBuilder::create(0.0)); + auto d37 = castOp(DataType::Double, b36); + auto d38 = add(d37, d34); + auto d40 = div(IrBuilder::create(1.0), d38); + auto t13 = mul(t12, d40); + auto t14 = set(t13); + auto t15 = add(t2, t14); + auto t16 = set(t15); + auto t36 = sum(t16, {2}); + auto d151 = castOp(DataType::Double, t2->axis(2)->extent()); + auto d152 = mul(IrBuilder::create(1.0), d151); + auto t19 = div(t36, d152); + auto t22 = broadcast(t19, bcast_pattern1); + auto t23 = sub(t16, t22); + auto t37 = mul(t23, t23); + auto t20 = sum(t37, {2}); + auto t24 = broadcast(t20, bcast_pattern1); + auto d95 = castOp(DataType::Double, t2->axis(2)->extent()); + auto d96 = mul(IrBuilder::create(1.0), d95); + auto d105 = reciprocal(d95); + auto t25 = mul(t24, d105); + auto t26 = add(t25, IrBuilder::create(1e-6)); + auto t27 = rsqrt(t26); + auto t28 = mul(t23, t27); + auto t17 = set(t1); + auto t29 = broadcast(t17, bcast_pattern0); + auto t30 = mul(t28, t29); + auto t18 = set(t0); + auto t31 = broadcast(t18, bcast_pattern0); + auto t32 = add(t30, t31); + auto t33 = set(t32); + auto t34 = castOp(DataType::Half, t33); + + fusion->addOutput(t16); // full 3d float + fusion->addOutput(t10); // full 3d bool + fusion->addOutput(t22); // bcast last dim float + fusion->addOutput(t27); // bcast last dim float + fusion->addOutput(t18); // passthrough t0 float + fusion->addOutput(t17); // passthrough t1 float + fusion->addOutput(t34); // full 3d half +} + +static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast5( + benchmark::State& benchmark_state, + FusionExecutorCache* fusion_executor_cache, + void* null) { + std::vector input_shape{ + benchmark_state.range(0), + benchmark_state.range(1), + benchmark_state.range(2)}; + + at::manual_seed(0); + auto fp16_options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); + auto fp32_options = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + auto t2 = at::randn(input_shape, fp32_options); + auto t5 = at::randn({input_shape[2]}, fp32_options); + auto t3 = at::randn(input_shape, fp16_options); + auto t0 = at::randn({input_shape[2]}, 
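The bcast5 fusion above is easier to follow as the transformer-style epilogue it encodes; the role assigned to each input below is an inference from the shapes and broadcast patterns, not something stated in the diff:

  // t3 (half, 3D)  incoming activation      t5 (float, 1D)  bias, broadcast over rows
  // t2 (float, 3D) residual branch          t1, t0 (1D)     layer-norm weight and bias
  // t8  = t3 + t5                       bias add
  // t13 = dropout(t8) / keep_prob       inverted dropout (keep prob here is 1 - 0.0)
  // t16 = t2 + t13                      residual add
  // t34 = ((t16 - mean) * rsqrt(var + 1e-6)) * t1 + t0, cast back to half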
fp32_options); + auto t1 = at::randn({input_shape[2]}, fp32_options); + + std::vector aten_inputs({t2, t5, t3, t0, t1}); + runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + + // Full tensor - floatx2, halfx2, bool - t2, t16, t3, t34, t16 + // Inner most dim only - floatx5 - t5, t0, t1, t7, t17 + // Outer two dims only - floatx2 - t22, t27 + + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * + t2.numel() * (2 * 4 + 2 * 2 + 1) + t5.numel() * 5 * 4 + + input_shape[0] * input_shape[1] * 2 * 4); +} + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_TIMM_vit_base_patch16_224_bcast5_NCHW, + setup_vit_base_patch16_224_bcast5, + NvFuserScheduler_TIMM_vit_base_patch16_224_bcast5, + nullptr); + +// Broadcast on both sides +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_TIMM_vit_base_patch16_224_bcast5_NCHW) + ->Args({64, 197, 768}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +static void setup_vit_base_patch16_224_bcast_outer2( + Fusion* fusion, + void* null) { + FusionGuard fg(fusion); + + auto t0 = makeContigTensor(3, DataType::Half); + auto t2 = makeContigTensor(1, DataType::Float); + + fusion->addInput(t0); + fusion->addInput(t2); + + auto t1 = castOp(DataType::Float, t0); + auto t3 = set(t2); + auto t4 = broadcast(t3, {true, true, false}); + auto t5 = add(t1, t4); + auto t6 = castOp(DataType::Half, t5); + auto t7 = castOp(DataType::Half, t3); + + fusion->addOutput(t6); + fusion->addOutput(t7); +} + +static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_outer2( + benchmark::State& benchmark_state, + FusionExecutorCache* fusion_executor_cache, + void* null) { + std::vector input_shape{ + benchmark_state.range(0), + benchmark_state.range(1), + benchmark_state.range(2)}; + + at::manual_seed(0); + auto fp16_options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); + auto fp32_options = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + auto t0 = at::randn(input_shape, fp16_options); + auto t2 = at::randn({input_shape[2]}, fp32_options); + + std::vector aten_inputs({t0, t2}); + runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + + // full tensor - halfx2 - t0, t6 + // inner dimension only - halfx2 - t2, t7 + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * t0.numel() * (2 + 2) + + input_shape[2] * (2 + 4)); +} + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_bcast_outer2, + setup_vit_base_patch16_224_bcast_outer2, + NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_outer2, + nullptr); + +NVFUSER_BENCHMARK_RUN( + NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_bcast_outer2) + ->Args({64, 197, 2304}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +static void setup_vit_base_patch16_224_norm_inner3(Fusion* fusion, void* null) { + FusionGuard fg(fusion); + + auto t0 = makeContigTensor(4, DataType::Half); + fusion->addInput(t0); + auto d13 = IrBuilder::create(); + fusion->addInput(d13); + + auto t1 = castOp(DataType::Float, t0); + auto t2 = set(t1); + auto t3 = mul(t2, d13); + auto t4 = set(t3); + auto t5 = max(t4, {3}); + auto t6 = broadcast(t5, {false, false, false, true}); + auto t7 = sub(t4, t6); + auto t8 = exp(t7); + auto t9 = sum(t8, {3}); + auto t10 = broadcast(t9, {false, false, false, true}); + auto t11 = reciprocal(t10); + auto t12 = mul(t8, t11); + auto t13 = randlike(t12); + auto d79 = sub(IrBuilder::create(1), IrBuilder::create(0)); + auto t14 = lt(t13, d79); + auto t15 = castOp(DataType::Float, t14); + auto 
b81 = eq(d79, IrBuilder::create(0)); + auto d82 = castOp(DataType::Double, b81); + auto d83 = add(d82, d79); + auto d85 = div(IrBuilder::create(1), d83); + auto t16 = mul(t12, t15); + auto t17 = mul(t16, d85); + auto t18 = set(t17); + auto t19 = castOp(DataType::Half, t18); + + fusion->addOutput(t19); + fusion->addOutput(t14); + fusion->addOutput(t12); + fusion->addOutput(t4); +} + +static void NvFuserScheduler_TIMM_vit_base_patch16_224_norm_inner3( + benchmark::State& benchmark_state, + FusionExecutorCache* fusion_executor_cache, + void* null) { + std::vector input_shape{ + benchmark_state.range(0), + benchmark_state.range(1), + benchmark_state.range(2), + benchmark_state.range(2)}; + + at::manual_seed(0); + auto fp16_options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); + auto fp32_options = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + auto t0 = at::randn(input_shape, fp16_options); + + std::vector aten_inputs({t0, 0.125}); + runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + + // Full tensors - floatx2, half x2, bool - t12, t4, t0, t19, t14 + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * t0.numel() * 13); +} + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_norm_inner3, + setup_vit_base_patch16_224_norm_inner3, + NvFuserScheduler_TIMM_vit_base_patch16_224_norm_inner3, + nullptr); + +// Norm inner dim +NVFUSER_BENCHMARK_RUN( + NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_norm_inner3) + ->Args({64, 12, 197}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +static void setup_vit_base_patch16_224_bcast_outer6( + Fusion* fusion, + void* null) { + FusionGuard fg(fusion); + + auto t0 = makeContigTensor(3, DataType::Half); + auto t2 = makeContigTensor(1, DataType::Float); + + fusion->addInput(t0); + fusion->addInput(t2); + + auto t1 = castOp(DataType::Float, t0); + auto t3 = set(t2); + auto t4 = broadcast(t3, {true, true, false}); + auto t5 = add(t1, t4); + auto t6 = set(t5); + auto t7 = mul(t6, IrBuilder::create(0.707106)); + auto t8 = erf(t7); + auto t9 = add(IrBuilder::create(1), t8); + auto t10 = mul(IrBuilder::create(0.5), t9); + auto t11 = mul(t6, t10); + auto t12 = randlike(t11); + auto d66 = sub(IrBuilder::create(1), IrBuilder::create(0)); + auto t13 = lt(t12, d66); + auto t14 = castOp(DataType::Float, t13); + auto t15 = mul(t11, t14); + auto b68 = eq(d66, IrBuilder::create(0)); + auto d69 = castOp(DataType::Double, b68); + auto d70 = add(d69, d66); + auto d72 = div(IrBuilder::create(1), d70); + auto t16 = mul(t15, d72); + auto t17 = set(t16); + auto t18 = castOp(DataType::Half, t17); + auto t19 = castOp(DataType::Half, t3); + + fusion->addOutput(t18); + fusion->addOutput(t13); + fusion->addOutput(t6); + fusion->addOutput(t19); +} + +static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_outer6( + benchmark::State& benchmark_state, + FusionExecutorCache* fusion_executor_cache, + void* null) { + std::vector input_shape{ + benchmark_state.range(0), + benchmark_state.range(1), + benchmark_state.range(2)}; + + at::manual_seed(0); + auto fp16_options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); + auto fp32_options = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + auto t0 = at::randn(input_shape, fp16_options); + auto t2 = at::randn({input_shape[2]}, fp32_options); + + std::vector aten_inputs({t0, t2}); + runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + // full tensors - float, halfx2, bool 
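The bcast_outer6 fusion above (and its bcast_inner6 mirror further down) bolts an erf-based GELU and an inverted dropout onto the broadcast add; read as a formula, with the interpretation inferred from the arithmetic rather than stated in the diff:

  // t6  = x + broadcast(bias)
  // t11 = 0.5 * t6 * (1 + erf(t6 * 0.707106))      erf form of GELU
  // t18 = (t11 * mask) / keep_prob, cast to half   inverted dropout, keep prob 1 - 0.0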
- t6, t0, t18, t13 + // inner dimension only - float, half - t2, t19 + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * t0.numel() * (2 + 2 + 1 + 4) + + input_shape[2] * (4 + 2)); +} + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_bcast_outer6, + setup_vit_base_patch16_224_bcast_outer6, + NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_outer6, + nullptr); + +NVFUSER_BENCHMARK_RUN( + NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_bcast_outer6) + // First size is original, the rest are variations to check perf + // reliability. + ->Args({64, 197, 3 * 1024}) + ->Args({64, 197, 2 * 1024}) + ->Args({64, 197, 1024}) + ->Args({64, 197, 512}) + ->Args({3, 1024, 64 * 197}) + ->Args({2, 1024, 64 * 197}) + ->Args({1, 1024, 64 * 197}) + ->Args({2, 256, 64 * 197}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +// Reverse the broadcast dimensions to check for consistency in scheduling. +static void setup_vit_base_patch16_224_bcast_inner6( + Fusion* fusion, + void* null) { + FusionGuard fg(fusion); + + auto t0 = makeContigTensor(3, DataType::Half); + auto t2 = makeContigTensor(2, DataType::Float); + + fusion->addInput(t0); + fusion->addInput(t2); + + auto t1 = castOp(DataType::Float, t0); + auto t3 = set(t2); + auto t4 = broadcast(t3, {false, false, true}); + auto t5 = add(t1, t4); + auto t6 = set(t5); + auto t7 = mul(t6, IrBuilder::create(0.707106)); + auto t8 = erf(t7); + auto t9 = add(IrBuilder::create(1), t8); + auto t10 = mul(IrBuilder::create(0.5), t9); + auto t11 = mul(t6, t10); + auto t12 = randlike(t11); + auto d66 = sub(IrBuilder::create(1), IrBuilder::create(0)); + auto t13 = lt(t12, d66); + auto t14 = castOp(DataType::Float, t13); + auto t15 = mul(t11, t14); + auto b68 = eq(d66, IrBuilder::create(0)); + auto d69 = castOp(DataType::Double, b68); + auto d70 = add(d69, d66); + auto d72 = div(IrBuilder::create(1), d70); + auto t16 = mul(t15, d72); + auto t17 = set(t16); + auto t18 = castOp(DataType::Half, t17); + auto t19 = castOp(DataType::Half, t3); + + fusion->addOutput(t18); + fusion->addOutput(t13); + fusion->addOutput(t6); + fusion->addOutput(t19); +} + +static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_inner6( + benchmark::State& benchmark_state, + FusionExecutorCache* fusion_executor_cache, + void* null) { + std::vector input_shape{ + benchmark_state.range(0), + benchmark_state.range(1), + benchmark_state.range(2)}; + + at::manual_seed(0); + auto fp16_options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); + auto fp32_options = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + auto t0 = at::randn(input_shape, fp16_options); + auto t2 = at::randn({input_shape[0], input_shape[1]}, fp32_options); + + std::vector aten_inputs({t0, t2}); + runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + + // full tensors - float, halfx2, bool - t6, t0, t18, t13 + // outer two dimensions only - float, half - t2, t19 + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * t0.numel() * (2 + 2 + 1 + 4) + + input_shape[0] * input_shape[1] * (4 + 2)); +} + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_bcast_inner6, + setup_vit_base_patch16_224_bcast_inner6, + NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_inner6, + nullptr); + +NVFUSER_BENCHMARK_RUN( + NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_bcast_inner6) + ->Args({64, 197, 3 * 1024}) + ->Args({64, 197, 2 * 1024}) + ->Args({64, 197, 1024}) + ->Args({64, 197, 512}) 
+ ->Args({3, 1024, 64 * 197}) + ->Args({2, 1024, 64 * 197}) + ->Args({1, 1024, 64 * 197}) + ->Args({2, 256, 64 * 197}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +static void setup_vit_base_patch16_224_LN_BWD(Fusion* fusion, void* null) { + FusionGuard fg(fusion); + + auto t0 = makeContigTensor(3, DataType::Bool); + fusion->addInput(t0); + + auto t1 = makeContigTensor(3, DataType::Half); + fusion->addInput(t1); + + auto t2 = castOp(DataType::Float, t1); + + auto t3 = makeContigTensor(3, DataType::Half); + fusion->addInput(t3); + + auto t4 = castOp(DataType::Float, t3); + + auto d35 = t3->axis(2)->extent(); + + auto t5 = TensorViewBuilder() + .shape({-1, -1, 1}) + .dtype(DataType::Float) + .contiguity({true, true, false}) + .build(); + fusion->addInput(t5); + + auto t6 = TensorViewBuilder() + .shape({-1, -1, 1}) + .dtype(DataType::Float) + .contiguity({true, true, false}) + .build(); + fusion->addInput(t6); + + auto t7 = makeContigTensor(1, DataType::Half); + fusion->addInput(t7); + + auto t8 = castOp(DataType::Float, t7); + + auto t9 = makeContigTensor(1, DataType::Half); + fusion->addInput(t9); + + auto t11 = sub(t4, t5); + auto t12 = mul(t11, t6); + + auto t13 = broadcast(t8, {true, true, false}); + auto t14 = mul(t2, t13); + auto t15 = mul(d35, t14); + auto t16 = sum(t14, {2}); + auto t17 = broadcast(t16, {false, false, true}); + auto t18 = mul(t14, t12); + auto t19 = sum(t18, {2}); + auto t20 = broadcast(t19, {false, false, true}); + + auto t40 = castOp(DataType::Half, t12); + auto t41 = castOp(DataType::Float, t40); + auto t42 = castOp(DataType::Half, t20); + auto t43 = castOp(DataType::Float, t42); + auto t21 = mul(t42, t43); + + auto t38 = castOp(DataType::Half, t15); + auto t39 = castOp(DataType::Float, t38); + auto t44 = castOp(DataType::Half, t17); + auto t45 = castOp(DataType::Float, t44); + auto t22 = sub(t39, t45); + + auto t23 = sub(t22, t21); + + auto d87 = reciprocal(d35); + auto t24 = mul(d87, t6); + + auto t25 = mul(t24, t23); + auto t26 = mul(t2, t41); + auto t27 = sum(t26, {0, 1}); + auto t28 = sum(t2, {0, 1}); + + auto t29 = castOp(DataType::Float, t0); + auto t30 = mul(t25, t29); + + auto d33 = IrBuilder::create(); + fusion->addInput(d33); + auto t31 = mul(t30, d33); + auto t32 = sum(t31, {0, 1}); + auto t33 = castOp(DataType::Half, t32); + auto t34 = castOp(DataType::Half, t31); + auto t35 = castOp(DataType::Half, t25); + auto t36 = castOp(DataType::Half, t27); + auto t37 = castOp(DataType::Half, t28); + + fusion->addOutput(t33); + fusion->addOutput(t34); + fusion->addOutput(t35); + fusion->addOutput(t36); + fusion->addOutput(t37); +} + +static void NvFuserScheduler_TIMM_vit_base_patch16_224_LN_BWD( + benchmark::State& benchmark_state, + FusionExecutorCache* fusion_executor_cache, + void* null) { + std::vector input_shape{ + benchmark_state.range(0), + benchmark_state.range(1), + benchmark_state.range(2)}; + + at::manual_seed(0); + // auto bool_options = at::TensorOptions().dtype(at::kBool).device(at::kCUDA, + // 0); + auto fp16_options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); + auto fp32_options = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + auto t0 = at::randn(input_shape, fp16_options).to(at::kBool); + auto t1 = at::randn(input_shape, fp16_options); + auto t3 = at::randn(input_shape, fp16_options); + auto t5 = at::randn({input_shape[0], input_shape[1], 1}, fp32_options); + auto t6 = at::randn({input_shape[0], input_shape[1], 1}, fp32_options); + auto t7 = at::randn({input_shape[2]}, fp16_options); + auto t9 = 
at::randn({input_shape[2]}, fp16_options); + + std::vector aten_inputs({t0, t1, t3, t5, t6, t7, t9, 1.0}); + runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + + // Full tensors - bool, halfx4 - t0, t1, t3, t34, t35 + // Outer two dimensions - floatx2 - t5, t6 + // Inner dimension - halfx5 - t7, t9, t33, t36, t37 + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * ((t0.numel() * (4 * 2 + 1))) + + (t5.numel() * 4 * 2) + (t7.numel() * 5 * 2)); +} + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_LN_BWD, + setup_vit_base_patch16_224_LN_BWD, + NvFuserScheduler_TIMM_vit_base_patch16_224_LN_BWD, + nullptr); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_LN_BWD) + ->Args({128, 197, 768}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +static void nhwc_seresnet152d_transpose65(Fusion* fusion, void* null) { + FusionGuard fg(fusion); + + auto t2 = makeContigTensor(4, DataType::Half); + auto t5 = makeContigTensor(4, DataType::Half); + auto t7 = makeContigTensor(4, DataType::Half); + auto t9 = makeContigTensor(4, DataType::Half); + auto t4 = makeConcreteTensor({}, DataType::Half); + + fusion->addInput(t2); + fusion->addInput(t5); + fusion->addInput(t7); + fusion->addInput(t9); + fusion->addInput(t4); + + auto d86 = IrBuilder::create(0); + + auto t3 = castOp(DataType::Float, t2); + auto t6 = castOp(DataType::Float, t5); + auto t8 = castOp(DataType::Float, t7); + auto t10 = castOp(DataType::Float, t9); + auto t11 = add(t8, t10); + auto t12 = set(t11); + auto t13 = set(t6); + auto t14 = lt(t13, d86); + auto t15 = broadcast(t4, {true, true, true, true}); + auto t16 = where(t14, t15, t12); + auto t17 = set(t16); + auto t29 = castOp(DataType::Half, t17); + auto t18 = mul(t17, t3); + auto t19 = transpose(t18, {{0, 0}, {1, 3}, {2, 1}, {3, 2}}); + auto t30 = castOp(DataType::Half, t19); + + fusion->addOutput(t29); + fusion->addOutput(t30); +} + +static void NvFuserScheduler_nhwc_seresnet152d_transpose65( + benchmark::State& benchmark_state, + FusionExecutorCache* fusion_executor_cache, + void* null) { + std::vector input_shape{ + benchmark_state.range(0), + benchmark_state.range(2), + benchmark_state.range(2), + benchmark_state.range(1)}; + + at::manual_seed(0); + auto fp16_options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); + + auto t2 = at::randn(input_shape, fp16_options); + auto t5 = at::randn(input_shape, fp16_options); + auto t7 = at::randn(input_shape, fp16_options); + auto t9 = at::randn(input_shape, fp16_options); + // Need zero dim tensor don't know how to do that, so just going to reduce a + // 1D tensor + auto t4 = at::randn({2}, fp16_options).sum(); + + std::vector aten_inputs({t2, t5, t7, t9, t4}); + runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + + // Full tensors - halfx6 - t2, t5, t7, t9, t29, t30 + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * t2.numel() * 6 * 2); +} + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_TIMM_nhwc_seresnet152d_transpose65, + nhwc_seresnet152d_transpose65, + NvFuserScheduler_nhwc_seresnet152d_transpose65, + nullptr); + +// Norm inner dim Half version of vit_base_patch16_224_norm_inner3 +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_TIMM_nhwc_seresnet152d_transpose65) + ->Args({128, 12, 197}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); diff --git a/benchmarks/cpp/nvfuser/transpose.cpp b/benchmarks/cpp/nvfuser/transpose.cpp new file mode 100644 index 
000000000000..39ee0452c160 --- /dev/null +++ b/benchmarks/cpp/nvfuser/transpose.cpp @@ -0,0 +1,483 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +#define TRANSPOSE_CONFIG {true, false, false, false} + +using namespace torch::jit::fuser::cuda; + +struct TransposeConfig { + bool input1_transpose_axes = false; + bool input2_transpose_axes = false; + bool intermediate_transpose_axes = false; + bool output_transpose_axes = false; +}; + +std::vector generateInputs( + DataType dtype, + int num_dims, + std::pair axes, + int perm_size, + int innerdim_size, + bool input1_transpose_axes, + bool input2_transpose_axes, + bool non_vectorize_offset = false, + int iter_size = 32) { + at::manual_seed(0); + auto options = + at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); + + std::vector transpose_shape(num_dims, iter_size); + transpose_shape[axes.second] = innerdim_size; + transpose_shape[axes.first] = perm_size; + + std::vector non_transpose_shape(num_dims, iter_size); + non_transpose_shape[axes.first] = innerdim_size; + non_transpose_shape[axes.second] = perm_size; + + // TensorType: Concrete, Contig, Symbolic + // Vectorization | Unroll - Add 1 to sizes + // Shift axis by 1 to disable vectorize loads + if (non_vectorize_offset) { + for (auto idx : c10::irange(transpose_shape.size())) { + transpose_shape[idx] += 1; + } + for (auto idx : c10::irange(non_transpose_shape.size())) { + non_transpose_shape[idx] += 1; + } + } + + auto optionalTransposeSize = + [&transpose_shape, &non_transpose_shape](bool transpose_tensor) { + return (transpose_tensor) ? transpose_shape : non_transpose_shape; + }; + + at::Tensor aten_input1 = + at::randn(optionalTransposeSize(input1_transpose_axes), options); + at::Tensor aten_input2 = + at::randn(optionalTransposeSize(input2_transpose_axes), options); + return {aten_input1, aten_input2}; +} + +//------------------------------------------------------------------------------ + +static void setupTranspose( + Fusion* fusion, + DataType dtype, + int num_dims, + std::pair axes, + TransposeConfig tc) { + FusionGuard fg(fusion); + typedef std::pair transpose_axes; + + auto getTransposeMap = + [](const transpose_axes& axes) -> std::unordered_map { + return {{axes.first, axes.second}, {axes.second, axes.first}}; + }; + + auto optionalTranspose = [&getTransposeMap, axes]( + TensorView* tv, bool is_transpose) { + return (is_transpose) ? 
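The four flags in TransposeConfig select which tensors in the fused pattern receive the axis swap, so TRANSPOSE_CONFIG as defined above transposes only the first input. A sketch of the computation each benchmark instance runs, as built by setupTranspose just below (maybeT meaning "transpose if the corresponding flag is set"):

  // out = maybeT_output( relu( maybeT_intermediate( maybeT_in1(in1) + maybeT_in2(in2) ) ) )
  // with TRANSPOSE_CONFIG = {true, false, false, false}:
  // out = relu( transpose(in1, axis1, axis2) + in2 )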
transpose(tv, getTransposeMap(axes)) : tv; + }; + + auto input1 = makeContigTensor(num_dims); + auto input2 = makeContigTensor(num_dims); + fusion->addInput(input1); + fusion->addInput(input2); + + auto ot_input1 = optionalTranspose(input1, tc.input1_transpose_axes); + auto ot_input2 = optionalTranspose(input2, tc.input2_transpose_axes); + auto intermediate = add(ot_input1, ot_input2); + auto ot_intermediate = + optionalTranspose(intermediate, tc.intermediate_transpose_axes); + auto output = relu(ot_intermediate); + auto ot_output = optionalTranspose(output, tc.output_transpose_axes); + fusion->addOutput(ot_output); +} + +static void NvFuserScheduler_Transpose( + benchmark::State& benchmark_state, + FusionExecutorCache* fusion_executor_cache, + DataType dtype, + int num_dims, + std::pair axes, + TransposeConfig tc) { + auto aten_inputs = generateInputs( + dtype, + num_dims, + axes, + benchmark_state.range(0), + benchmark_state.range(1), + tc.input1_transpose_axes, + tc.input2_transpose_axes); + auto at_input1 = aten_inputs[0]; + auto at_input2 = aten_inputs[1]; + + std::vector fuser_inputs = {at_input1, at_input2}; + runBenchmarkIterations(benchmark_state, fusion_executor_cache, fuser_inputs); + + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * + ((at_input1.numel() * 3) * int64_t(dataTypeSize(dtype)))); +} + +//------------------------------------------------------------------------------ + +#define NVFUSER_TRANSPOSE_SQUARE_RUN( \ + TITLE, DTYPE, NUM_DIMS, AXIS1, AXIS2, CONFIG) \ + NVFUSER_BENCHMARK_DEFINE( \ + TITLE, \ + setupTranspose, \ + NvFuserScheduler_Transpose, \ + DTYPE, \ + NUM_DIMS, \ + {AXIS1, AXIS2}, \ + CONFIG); \ + \ + NVFUSER_BENCHMARK_RUN(TITLE) \ + ->RangeMultiplier(8) \ + ->Args({9, 2408}) \ + ->Args({16, 512}) \ + ->Args({18, 96}) \ + ->Args({24, 96}) \ + ->Args({24, 256}) \ + ->Args({24, 512}) \ + ->Args({32, 27}) \ + ->Args({32, 96}) \ + ->Args({32, 288}) \ + ->Args({32, 864}) \ + ->Args({40, 120}) \ + ->Args({48, 128}) \ + ->Args({48, 256}) \ + ->Args({49, 512}) \ + ->Args({49, 1024}) \ + ->Args({49, 2048}) \ + ->Args({49, 4608}) \ + ->Args({64, 64}) \ + ->Args({64, 96}) \ + ->Args({64, 128}) \ + ->Args({64, 147}) \ + ->Args({64, 192}) \ + ->Args({64, 256}) \ + ->Args({64, 288}) \ + ->Args({64, 512}) \ + ->Args({80, 64}) \ + ->Args({81, 1728}) \ + ->Args({83, 1728}) \ + ->Args({96, 864}) \ + ->Args({100, 1280}) \ + ->Args({100, 4032}) \ + ->Args({120, 40}) \ + ->Args({128, 128}) \ + ->Args({128, 512}) \ + ->Args({128, 1152}) \ + ->Args({192, 128}) \ + ->Args({192, 256}) \ + ->Args({192, 720}) \ + ->Args({192, 768}) \ + ->Args({192, 1120}) \ + ->Args({192, 1728}) \ + ->Args({196, 256}) \ + ->Args({196, 512}) \ + ->Args({196, 1024}) \ + ->Args({196, 2304}) \ + ->Args({256, 256}) \ + ->Args({256, 1024}) \ + ->Args({256, 2304}) \ + ->Args({284, 512}) \ + ->Args({320, 1280}) \ + ->Args({320, 1728}) \ + ->Args({324, 2592}) \ + ->Args({361, 768}) \ + ->Args({361, 1120}) \ + ->Args({384, 2}) \ + ->Args({384, 32}) \ + ->Args({384, 128}) \ + ->Args({384, 256}) \ + ->Args({384, 512}) \ + ->Args({384, 1280}) \ + ->Args({384, 2592}) \ + ->Args({384, 4032}) \ + ->Args({448, 1280}) \ + ->Args({480, 16}) \ + ->Args({480, 256}) \ + ->Args({512, 2}) \ + ->Args({512, 16}) \ + ->Args({512, 128}) \ + ->Args({512, 256}) \ + ->Args({512, 1024}) \ + ->Args({512, 2048}) \ + ->Args({512, 3072}) \ + ->Args({512, 4608}) \ + ->Args({784, 40}) \ + ->Args({784, 120}) \ + ->Args({784, 128}) \ + ->Args({784, 1152}) \ + ->Args({1001, 2408}) \ + ->Args({1024, 16}) \ + 
->Args({1024, 256}) \ + ->Args({1024, 512}) \ + ->Args({1024, 1024}) \ + ->Args({1024, 3072}) \ + ->Args({1369, 192}) \ + ->Args({1369, 256}) \ + ->Args({1369, 288}) \ + ->Args({2048, 512}) \ + ->Args({2048, 1024}) \ + ->Args({2250, 27}) \ + ->Args({3072, 512}) \ + ->Args({3072, 1024}) \ + ->Args({3136, 64}) \ + ->Args({5329, 720}) \ + ->Args({5625, 64}) \ + ->Args({12544, 147}) \ + ->Args({22201, 288}) \ + ->Unit(benchmark::kMicrosecond) + +NVFUSER_TRANSPOSE_SQUARE_RUN( + NF_Transpose_Random_fp32_Inner_2D_01_Axis, + DataType::Float, + 2 /* num_dims */, + 0 /* axis1 */, + 1 /* axis2 */, + TransposeConfig(TRANSPOSE_CONFIG)); + +NVFUSER_TRANSPOSE_SQUARE_RUN( + NF_Transpose_Random_fp32_Inner_3D_02_Axis, + DataType::Float, + 3 /* num_dims */, + 0 /* axis1 */, + 2 /* axis2 */, + TransposeConfig(TRANSPOSE_CONFIG)); + +NVFUSER_TRANSPOSE_SQUARE_RUN( + NF_Transpose_Random_fp32_Inner_3D_12_Axis, + DataType::Float, + 3 /* num_dims */, + 1 /* axis1 */, + 2 /* axis2 */, + TransposeConfig(TRANSPOSE_CONFIG)); + +NVFUSER_TRANSPOSE_SQUARE_RUN( + NF_Transpose_Random_fp32_Outer_3D_01_Axis, + DataType::Float, + 3 /* num_dims */, + 0 /* axis1 */, + 1 /* axis2 */, + TransposeConfig(TRANSPOSE_CONFIG)); + +//------------------------------------------------------------------------------ + +NVFUSER_TRANSPOSE_SQUARE_RUN( + NF_Transpose_Random_fp16_Inner_2D_01_Axis, + DataType::Half, + 2 /* num_dims */, + 0 /* axis1 */, + 1 /* axis2 */, + TransposeConfig(TRANSPOSE_CONFIG)); + +NVFUSER_TRANSPOSE_SQUARE_RUN( + NF_Transpose_Random_fp16_Inner_3D_02_Axis, + DataType::Half, + 3 /* num_dims */, + 0 /* axis1 */, + 2 /* axis2 */, + TransposeConfig(TRANSPOSE_CONFIG)); + +NVFUSER_TRANSPOSE_SQUARE_RUN( + NF_Transpose_Random_fp16_Inner_3D_12_Axis, + DataType::Half, + 3 /* num_dims */, + 1 /* axis1 */, + 2 /* axis2 */, + TransposeConfig(TRANSPOSE_CONFIG)); + +NVFUSER_TRANSPOSE_SQUARE_RUN( + NF_Transpose_Random_fp16_Outer_3D_01_Axis, + DataType::Half, + 3 /* num_dims */, + 0 /* axis1 */, + 1 /* axis2 */, + TransposeConfig(TRANSPOSE_CONFIG)); + +//------------------------------------------------------------------------------ + + +#define NVFUSER_TRANSPOSE_RUN(TITLE, DTYPE, NUM_DIMS, AXIS1, AXIS2, CONFIG) \ + NVFUSER_BENCHMARK_DEFINE( \ + TITLE, \ + setupTranspose, \ + NvFuserScheduler_Transpose, \ + DTYPE, \ + NUM_DIMS, \ + {AXIS1, AXIS2}, \ + CONFIG); \ + \ + NVFUSER_BENCHMARK_RUN(TITLE) \ + ->RangeMultiplier(8) \ + ->Ranges({{2, 256 * 256}, {160, 320}}) \ + ->Unit(benchmark::kMicrosecond) \ + +NVFUSER_TRANSPOSE_RUN( + NF_Transpose_fp32_Inner_2D_01_Axis, + DataType::Float, + 2 /* num_dims */, + 0 /* axis1 */, + 1 /* axis2 */, + TransposeConfig(TRANSPOSE_CONFIG)); + +NVFUSER_TRANSPOSE_RUN( + NF_Transpose_fp32_Inner_3D_02_Axis, + DataType::Float, + 3 /* num_dims */, + 0 /* axis1 */, + 2 /* axis2 */, + TransposeConfig(TRANSPOSE_CONFIG)); + +NVFUSER_TRANSPOSE_RUN( + NF_Transpose_fp32_Inner_3D_12_Axis, + DataType::Float, + 3 /* num_dims */, + 1 /* axis1 */, + 2 /* axis2 */, + TransposeConfig(TRANSPOSE_CONFIG)); + +NVFUSER_TRANSPOSE_RUN( + NF_Transpose_fp32_Outer_3D_01_Axis, + DataType::Float, + 3 /* num_dims */, + 0 /* axis1 */, + 1 /* axis2 */, + TransposeConfig(TRANSPOSE_CONFIG)); + +//------------------------------------------------------------------------------ + +NVFUSER_TRANSPOSE_RUN( + NF_Transpose_fp16_Inner_2D_01_Axis, + DataType::Half, + 2 /* num_dims */, + 0 /* axis1 */, + 1 /* axis2 */, + TransposeConfig(TRANSPOSE_CONFIG)); + +NVFUSER_TRANSPOSE_RUN( + NF_Transpose_fp16_Inner_3D_02_Axis, + DataType::Half, + 3 /* num_dims */, 
+ 0 /* axis1 */, + 2 /* axis2 */, + TransposeConfig(TRANSPOSE_CONFIG)); + +NVFUSER_TRANSPOSE_RUN( + NF_Transpose_fp16_Inner_3D_12_Axis, + DataType::Half, + 3 /* num_dims */, + 1 /* axis1 */, + 2 /* axis2 */, + TransposeConfig(TRANSPOSE_CONFIG)); + +NVFUSER_TRANSPOSE_RUN( + NF_Transpose_fp16_Outer_3D_01_Axis, + DataType::Half, + 3 /* num_dims */, + 0 /* axis1 */, + 1 /* axis2 */, + TransposeConfig(TRANSPOSE_CONFIG)); + +//------------------------------------------------------------------------------ + +static void Baseline_Transpose( + benchmark::State& benchmark_state, + DataType dtype, + int num_dims, + std::pair axes, + TransposeConfig tc) { + auto aten_inputs = generateInputs( + dtype, + num_dims, + axes, + benchmark_state.range(0), + benchmark_state.range(1), + tc.input1_transpose_axes, + tc.input2_transpose_axes); + auto at_input1 = aten_inputs[0]; + auto at_input2 = aten_inputs[1]; + + auto optionalTransposeAten = [&axes](at::Tensor at, bool is_transpose) { + return (is_transpose) ? at::transpose(at, axes.first, axes.second) : at; + }; + + for (auto _ : benchmark_state) { + clearL2Cache(); + CudaKernelTimer timer; + + auto at_ot_input1 = + optionalTransposeAten(at_input1, tc.input1_transpose_axes); + auto at_ot_input2 = + optionalTransposeAten(at_input2, tc.input2_transpose_axes); + auto at_intermediate = add(at_ot_input1, at_ot_input2); + auto at_ot_intermediate = + optionalTransposeAten(at_intermediate, tc.intermediate_transpose_axes); + auto at_output = relu(at_ot_intermediate); + auto at_ot_output = + optionalTransposeAten(at_output, tc.output_transpose_axes); + + benchmark_state.SetIterationTime(timer.elapsed() / 1000.0); + } + // Sync everything up before we're finished, don't want to run ahead on the + // cpu while benchmarking. + cudaDeviceSynchronize(); + + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * + (at_input1.numel() * 3 * int64_t(dataTypeSize(dtype)))); +} + +//------------------------------------------------------------------------------ + +static void Baseline_Transpose_fp32_Inner_2D_01_Axis( + benchmark::State& benchmark_state) { + Baseline_Transpose( + benchmark_state, + DataType::Float, + 2 /* num_dims */, + {0, 1} /* axes */, + TRANSPOSE_CONFIG); +} + +static void Baseline_Transpose_fp16_Inner_2D_01_Axis( + benchmark::State& benchmark_state) { + Baseline_Transpose( + benchmark_state, + DataType::Half, + 2 /* num_dims */, + {0, 1} /* axes */, + TRANSPOSE_CONFIG); +} + +//------------------------------------------------------------------------------ + +BENCHMARK(Baseline_Transpose_fp32_Inner_2D_01_Axis) + // ->RangeMultiplier(2) + ->Ranges({{2, 1024 * 1024}, {160, 320}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +BENCHMARK(Baseline_Transpose_fp16_Inner_2D_01_Axis) + // ->RangeMultiplier(2) + ->Ranges({{2, 1024 * 1024}, {160, 320}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +//------------------------------------------------------------------------------ diff --git a/benchmarks/cpp/nvfuser/utils.cpp b/benchmarks/cpp/nvfuser/utils.cpp index 053fc6939082..c15248bce71d 100644 --- a/benchmarks/cpp/nvfuser/utils.cpp +++ b/benchmarks/cpp/nvfuser/utils.cpp @@ -1,4 +1,4 @@ -#include "utils.h" +#include #include @@ -16,8 +16,8 @@ std::string toString(ReductionParams rparams) { if (rparams.schedule_3D) { ss << "3D Schedule // " << "Outer Reduction: " - << (rparams.cross_block_outer_reduce ? "cross block / " : "") - << (rparams.cross_grid_outer_reduce ? 
"cross grid / " : "") + << (rparams.cross_block_outer_reduction ? "cross block / " : "") + << (rparams.cross_grid_outer_reduction ? "cross grid / " : "") << (rparams.split_grid_dim_outer_reduction ? "split grid dim / " : ""); if (rparams.batches_per_block_outer_reduction > 1 || rparams.persistent_kernel) { @@ -31,16 +31,17 @@ std::string toString(ReductionParams rparams) { : "") << (rparams.split_grid_dim_iter_dom ? "split grid dimension / " : "") << (rparams.vectorize_iter_dom ? "vectorize / " : "") - << (rparams.unroll_iter_dom && !rparams.vectorize_iter_dom ? "unroll / " - : ""); - if (rparams.unroll_iter_dom || rparams.vectorize_iter_dom) { + << (rparams.unroll_factor_iter_dom > 1 && !rparams.vectorize_iter_dom + ? "unroll / " + : ""); + if (rparams.unroll_factor_iter_dom > 1 || rparams.vectorize_iter_dom) { ss << "factor " << rparams.unroll_factor_iter_dom; } ss << " // Inner Reduction Domain: " - << (rparams.cross_block_inner_reduce ? "cross block reduction / " : "") + << (rparams.cross_block_inner_reduction ? "cross block reduction / " : "") << (rparams.pad_inner_reduction_to_warp ? "pad to warp / " : "") - << (rparams.cross_grid_inner_reduce ? "cross grid reduction / " : ""); + << (rparams.cross_grid_inner_reduction ? "cross grid reduction / " : ""); if (rparams.batches_per_block_inner_reduction > 1 || rparams.persistent_kernel) { @@ -48,15 +49,17 @@ std::string toString(ReductionParams rparams) { << " / "; } - ss << (rparams.cross_grid_inner_reduce && + ss << (rparams.cross_grid_inner_reduction && rparams.split_grid_dim_inner_reduction ? "split grid dimension / " : "") << (rparams.vectorize_inner_reduction ? "vectorize / " : "") - << (rparams.unroll_inner_reduction && !rparams.vectorize_inner_reduction + << (rparams.unroll_factor_inner_reduction > 1 && + !rparams.vectorize_inner_reduction ? 
"unroll / " : ""); - if (rparams.unroll_inner_reduction || rparams.vectorize_inner_reduction) { + if (rparams.unroll_factor_inner_reduction > 1 || + rparams.vectorize_inner_reduction) { ss << "factor " << rparams.unroll_factor_inner_reduction; } return ss.str(); @@ -76,11 +79,11 @@ std::string toString(PointwiseParams params) { ss << "1D" << "/"; } - if (params.inner_factor > 1) { + if (params.unroll_factor > 1) { if (params.vectorize) { - ss << "Vectorize, Factor: " << params.inner_factor; + ss << "Vectorize, Factor: " << params.unroll_factor; } else { - ss << "Unroll, Factor: " << params.inner_factor; + ss << "Unroll, Factor: " << params.unroll_factor; } } return ss.str(); @@ -108,6 +111,10 @@ void clearL2Cache() { torch::Tensor t1 = torch::clone(t0); }; +TensorView* makeSymbolicTensor(size_t ndims, DataType dtype) { + return TensorViewBuilder().ndims(ndims).dtype(dtype).build(); +} + TensorView* makeContigTensor(size_t ndims, DataType dtype) { return TensorViewBuilder() .ndims(ndims) @@ -116,24 +123,50 @@ TensorView* makeContigTensor(size_t ndims, DataType dtype) { .build(); } +TensorView* makeConcreteTensor( + std::vector shape, + DataType dtype) { + return TensorViewBuilder().shape(shape).dtype(dtype).build(); +} + +TensorView* makeContigConcreteTensor( + std::vector shape, + DataType dtype) { + return TensorViewBuilder() + .shape(shape) + .dtype(dtype) + .contiguity(std::vector(shape.size(), true)) + .build(); +} + void runBenchmarkIterations( benchmark::State& benchmark_state, FusionExecutorCache* fusion_executor_cache, std::vector& aten_inputs) { fusion_executor_cache->runFusionWithInputs(aten_inputs); bool segmented = - fusion_executor_cache->getMostRecentKernelRuntime()->isSegmented(); + fusion_executor_cache->getMostRecentKernelRuntime()->isSegmented() && + fusion_executor_cache->getMostRecentKernelRuntime() + ->fusionSegments() + ->groups() + .size() > 1; if (!segmented) { fusion_executor_cache->profile(true); fusion_executor_cache->runFusionWithInputs(aten_inputs); auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo(); auto executor_instance = compile_log.fusion_executor; - TORCH_INTERNAL_ASSERT(compile_log.reduction_params.has_value()); - TORCH_INTERNAL_ASSERT(compile_log.launch_constraints.has_value()); - auto rparams = toString(compile_log.reduction_params.value()); - auto lparams = toString(compile_log.launch_constraints.value()); - benchmark_state.SetLabel(rparams + lparams); + + if (compile_log.reduction_params.has_value()) { + auto rparams = toString(compile_log.reduction_params.value()); + auto lparams = toString(compile_log.fusion_executor->lastLaunchParams()); + benchmark_state.SetLabel(rparams + lparams); + } else if (compile_log.pointwise_params.has_value()){ + auto pparams = toString(compile_log.pointwise_params.value()); + auto lparams = toString(compile_log.fusion_executor->lastLaunchParams()); + benchmark_state.SetLabel(pparams + lparams); + } + executor_instance->setMeasureKernelTimeFlag(true); // Sync everything up before we start diff --git a/benchmarks/cpp/nvfuser/utils.h b/benchmarks/cpp/nvfuser/utils.h index b4a2f3a7a916..176290fd76f3 100644 --- a/benchmarks/cpp/nvfuser/utils.h +++ b/benchmarks/cpp/nvfuser/utils.h @@ -18,6 +18,24 @@ using namespace torch::jit::fuser::cuda; +// Make a tensor that is known to be non-contiguous of dimensionality=ndims, +// but unknown sizes +TensorView* makeSymbolicTensor(size_t ndims, DataType dtype = DataType::Float); + +// Make a tensor that is known to be fully contiguous of dimensionality=ndims, +// but 
unknown sizes. Taken from test_gpu.cpp +TensorView* makeContigTensor(size_t ndims, DataType dtype = DataType::Float); + +// Make a non-contiguous tensor of compile-time known sizes +TensorView* makeConcreteTensor( + std::vector shape, + DataType dtype = DataType::Float); + +// Make a contiguous tensor of compile-time known sizes +TensorView* makeContigConcreteTensor( + std::vector shape, + DataType dtype = DataType::Float); + std::string toString(ReductionParams rparams); std::string toString(PointwiseParams params); std::string toString(LaunchParams lparams); @@ -32,10 +50,6 @@ void runBenchmarkIterations( void clearL2Cache(); -// Make a tensor that is known to be fully contiguous of dimensionality=ndims, -// but unknown sizes. Taken from test_gpu.cpp -TensorView* makeContigTensor(size_t ndims, DataType dtype = DataType::Float); - class CudaKernelTimer { public: CudaKernelTimer() { diff --git a/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp b/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp index eddac0a46394..77e86020f28a 100644 --- a/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp +++ b/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp @@ -82,10 +82,8 @@ BENCHMARK_DEFINE_F(BatchNorm, NNC)(benchmark::State& state) { VarHandle eps("eps", kFloat); using axis = const VarHandle&; - Tensor output = Compute( - "output", - {{N_, "N"}, {C_, "C"}, {H_, "H"}, {W_, "W"}}, - [&](axis n, axis c, axis h, axis w) { + Tensor output = + Compute("output", {N_, C_, H_, W_}, [&](axis n, axis c, axis h, axis w) { // Compute affine terms. auto inv_var = FloatImm::make(1.0f) / sqrt(var.load(c) + eps); auto weight_v = weight.load(c); @@ -143,10 +141,8 @@ BENCHMARK_DEFINE_F(BatchNorm, NNCRelu)(benchmark::State& state) { VarHandle eps("eps", kFloat); using axis = const VarHandle&; - Tensor output = Compute( - "output", - {{N_, "N"}, {C_, "C"}, {H_, "H"}, {W_, "W"}}, - [&](axis n, axis c, axis h, axis w) { + Tensor output = + Compute("output", {N_, C_, H_, W_}, [&](axis n, axis c, axis h, axis w) { // Compute affine terms. 
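For readers outside NNC, the "affine terms" being set up in the batch-norm kernels here are just the per-channel inference factors. Assuming the standard batch-norm inference formula (only the inv_var and weight loads are visible in this hunk), the kernel evaluates, per element,

    output[n, c, h, w] = weight[c] * (input[n, c, h, w] - mean[c]) / sqrt(var[c] + eps) + bias[c]

with the NNCRelu variant additionally clamping the result at zero.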
auto inv_var = FloatImm::make(1.0f) / sqrt(var.load(c) + eps); auto weight_v = weight.load(c); diff --git a/benchmarks/cpp/tensorexpr/bench_compile.cpp b/benchmarks/cpp/tensorexpr/bench_compile.cpp index 13a02ee7723d..be60f9cd599b 100644 --- a/benchmarks/cpp/tensorexpr/bench_compile.cpp +++ b/benchmarks/cpp/tensorexpr/bench_compile.cpp @@ -12,26 +12,21 @@ static void BM_CompileSwish(benchmark::State& state) { constexpr int N = 512; te::VarHandle n("n", te::kInt); te::BufHandle A("A", {N}, te::kFloat); - te::Tensor relu = - te::Compute("relu", {{n, "n"}}, [&](const te::VarHandle& i) { - return te::Max::make(A.load(i), 0.f, false); - }); - te::Tensor min6 = - te::Compute("min6", {{n, "n"}}, [&](const te::VarHandle& i) { - return te::Min::make(relu.load(i), 6.f, false); - }); - te::Tensor plus3 = - te::Compute("plus3", {{n, "n"}}, [&](const te::VarHandle& i) { - return min6.load(i) + 3.f; - }); - te::Tensor times = - te::Compute("times", {{n, "n"}}, [&](const te::VarHandle& i) { - return A.load(i) * plus3.load(i); - }); - te::Tensor sixth = - te::Compute("sixth", {{n, "n"}}, [&](const te::VarHandle& i) { - return times.load(i) * 1.f / 6.f; - }); + te::Tensor relu = te::Compute("relu", {n}, [&](const te::VarHandle& i) { + return te::Max::make(A.load(i), 0.f, false); + }); + te::Tensor min6 = te::Compute("min6", {n}, [&](const te::VarHandle& i) { + return te::Min::make(relu.load(i), 6.f, false); + }); + te::Tensor plus3 = te::Compute("plus3", {n}, [&](const te::VarHandle& i) { + return min6.load(i) + 3.f; + }); + te::Tensor times = te::Compute("times", {n}, [&](const te::VarHandle& i) { + return A.load(i) * plus3.load(i); + }); + te::Tensor sixth = te::Compute("sixth", {n}, [&](const te::VarHandle& i) { + return times.load(i) * 1.f / 6.f; + }); te::LoopNest nest({sixth}, {relu, min6, plus3, times, sixth}); for (auto tensor : {relu, min6, plus3, times}) { nest.computeInline(tensor.buf()); @@ -46,26 +41,20 @@ static void BM_CompileSwishLLVMOnly(benchmark::State& state) { constexpr int N = 512; te::VarHandle n("n", te::kInt); te::BufHandle A("A", {N}, te::kFloat); - te::Tensor relu = - te::Compute("relu", {{n, "n"}}, [&](const te::VarHandle& i) { - return te::Max::make(A.load(i), 0.f, false); - }); - te::Tensor min6 = - te::Compute("min6", {{n, "n"}}, [&](const te::VarHandle& i) { - return te::Min::make(relu.load(i), 6.f, false); - }); - te::Tensor plus3 = - te::Compute("plus3", {{n, "n"}}, [&](const te::VarHandle& i) { - return min6.load(i) + 3.f; - }); - te::Tensor times = - te::Compute("times", {{n, "n"}}, [&](const te::VarHandle& i) { - return A.load(i) * plus3.load(i); - }); - te::Tensor sixth = - te::Compute("sixth", {{n, "n"}}, [&](const te::VarHandle& i) { - return times.load(i) * 1.f / 6.f; - }); + te::Tensor relu = te::Compute("relu", {n}, [&](const te::VarHandle& i) { + return te::Max::make(A.load(i), 0.f, false); + }); + te::Tensor min6 = te::Compute("min6", {n}, [&](const te::VarHandle& i) { + return te::Min::make(relu.load(i), 6.f, false); + }); + te::Tensor plus3 = te::Compute( + "plus3", {n}, [&](const te::VarHandle& i) { return min6.load(i) + 3.f; }); + te::Tensor times = te::Compute("times", {n}, [&](const te::VarHandle& i) { + return A.load(i) * plus3.load(i); + }); + te::Tensor sixth = te::Compute("sixth", {n}, [&](const te::VarHandle& i) { + return times.load(i) * 1.f / 6.f; + }); te::LoopNest nest({sixth}, {relu, min6, plus3, times, sixth}); for (auto tensor : {relu, min6, plus3, times}) { nest.computeInline(tensor.buf()); diff --git 
a/benchmarks/cpp/tensorexpr/bench_concat.cpp b/benchmarks/cpp/tensorexpr/bench_concat.cpp index 854092139aba..b7b97d02e3a8 100644 --- a/benchmarks/cpp/tensorexpr/bench_concat.cpp +++ b/benchmarks/cpp/tensorexpr/bench_concat.cpp @@ -61,7 +61,7 @@ class ConcatBench : public benchmark::Fixture { Tensor output = Compute( "aten_cat", - {{output_size_[0], "M"}, {output_size_[1], "N"}}, + {output_size_[0], output_size_[1]}, [&](const VarHandle& m, const VarHandle& n) { int d = 0; std::vector cumulative_concat_dim_sizes(num_inputs); diff --git a/benchmarks/cpp/tensorexpr/bench_gemm.cpp b/benchmarks/cpp/tensorexpr/bench_gemm.cpp index 6d452368fc7a..403746578dff 100644 --- a/benchmarks/cpp/tensorexpr/bench_gemm.cpp +++ b/benchmarks/cpp/tensorexpr/bench_gemm.cpp @@ -44,12 +44,12 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprNoopt)(benchmark::State& state) { te::BufHandle BP("B", {K, N}, te::kFloat); te::Tensor CT = te::Reduce( "gemm", - {{M, "M"}, {N, "N"}}, + {M, N}, te::Sum(), [&](const te::ExprHandle& m, const te::ExprHandle& n, const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); }, - {{K, "K"}}); + {K}); te::LoopNest loop({CT}); loop.prepareForCodegen(); te::StmtPtr s = loop.root_stmt(); @@ -66,12 +66,12 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile32x32)(benchmark::State& state) { te::BufHandle BP("B", {K, N}, te::kFloat); te::Tensor CT = te::Reduce( "gemm", - {{M, "M"}, {N, "N"}}, + {M, N}, te::Sum(), [&](const te::ExprHandle& m, const te::ExprHandle& n, const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); }, - {{K, "K"}}); + {K}); te::LoopNest loop({CT}); { @@ -124,12 +124,12 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16)(benchmark::State& state) { te::BufHandle BP("B", {K, N}, te::kFloat); te::Tensor CT = te::Reduce( "gemm", - {{M, "M"}, {N, "N"}}, + {M, N}, te::Sum(), [&](const te::ExprHandle& m, const te::ExprHandle& n, const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); }, - {{K, "K"}}); + {K}); te::LoopNest loop({CT}); { @@ -182,12 +182,12 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16VecUnroll)(benchmark::State& state) { te::BufHandle BP("B", {K, N}, te::kFloat); te::Tensor CT = te::Reduce( "gemm", - {{M, "M"}, {N, "N"}}, + {M, N}, te::Sum(), [&](const te::ExprHandle& m, const te::ExprHandle& n, const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); }, - {{K, "K"}}); + {K}); te::LoopNest loop({CT}); { @@ -248,12 +248,12 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16Cache)(benchmark::State& state) { te::BufHandle BP("B", {K, N}, te::kFloat); te::Tensor CT = te::Reduce( "gemm", - {{M, "M"}, {N, "N"}}, + {M, N}, te::Sum(), [&](const te::ExprHandle& m, const te::ExprHandle& n, const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); }, - {{K, "K"}}); + {K}); te::LoopNest loop({CT}); { diff --git a/benchmarks/cpp/tensorexpr/bench_parallel.cpp b/benchmarks/cpp/tensorexpr/bench_parallel.cpp index abc8c3de3f33..8d77a459c603 100644 --- a/benchmarks/cpp/tensorexpr/bench_parallel.cpp +++ b/benchmarks/cpp/tensorexpr/bench_parallel.cpp @@ -38,7 +38,7 @@ class ParallelAdd : public benchmark::Fixture { BENCHMARK_DEFINE_F(ParallelAdd, Simple)(benchmark::State& state) { BufHandle a_buf("a", {M}, kFloat); BufHandle b_buf("b", {M}, kFloat); - Tensor c_tensor = Compute("c", {{M, "m"}}, [&](const VarHandle& m) { + Tensor c_tensor = Compute("c", {M}, [&](const VarHandle& m) { return a_buf.load(m) + b_buf.load(m); }); LoopNest loop_nest({c_tensor}); diff --git a/benchmarks/cpp/tensorexpr/bench_reduce.cpp b/benchmarks/cpp/tensorexpr/bench_reduce.cpp index 
085505b52fe5..bf0fe21ca0b1 100644 --- a/benchmarks/cpp/tensorexpr/bench_reduce.cpp +++ b/benchmarks/cpp/tensorexpr/bench_reduce.cpp @@ -235,12 +235,12 @@ BENCHMARK_DEFINE_F(Reduce1D, TeNaive)(benchmark::State& state) { te::BufHandle AP("A", {M}, te::kFloat); te::Tensor BT = te::Reduce( "reduce_full", - {{1, "N"}}, + {1}, te::Sum(), [&](const te::ExprHandle& n, const te::ExprHandle& m) { return AP.load(m); }, - {{M, "M"}}); + {M}); te::LoopNest loop({BT}); loop.prepareForCodegen(); @@ -266,12 +266,12 @@ BENCHMARK_DEFINE_F(Reduce1D, TeSplitTail)(benchmark::State& state) { te::BufHandle AP("A", {M}, te::kFloat); te::Tensor BT = te::Reduce( "reduce_full", - {{1, "N"}}, + {1}, te::Sum(), [&](const te::ExprHandle& n, const te::ExprHandle& m) { return AP.load(m); }, - {{M, "M"}}); + {M}); te::LoopNest loop({BT}); const int kChunkSize = 8; @@ -305,12 +305,12 @@ BENCHMARK_DEFINE_F(Reduce1D, TeSplitMask)(benchmark::State& state) { te::BufHandle AP("A", {M}, te::kFloat); te::Tensor BT = te::Reduce( "reduce_full", - {{1, "N"}}, + {1}, te::Sum(), [&](const te::ExprHandle& n, const te::ExprHandle& m) { return AP.load(m); }, - {{M, "M"}}); + {M}); te::LoopNest loop({BT}); const int kChunkSize = 8; @@ -349,7 +349,7 @@ BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV1)(benchmark::State& state) { {}, te::Sum(), [&](const te::ExprHandle& m) { return AP.load(m); }, - {{M, "M"}}); + {M}); te::LoopNest loop({BT}); te::BufPtr rfac_buf; @@ -392,8 +392,8 @@ BENCHMARK_DEFINE_F(Reduce1D, Op)(benchmark::State& state) { const int kChunkSize = 8; te::BufHandle a("A", {M}, te::kFloat); - te::Tensor b = - te::computeSum({a, te::IntList({0}), false}, {}, at::kFloat, at::kCPU); + te::Tensor b = te::computeSum( + {a, te::IntList({0}), false}, {}, {}, at::kFloat, at::kCPU); te::LoopNest nest({b}); auto loops = nest.getLoopStmtsFor(b); @@ -456,8 +456,8 @@ BENCHMARK_REGISTER_F(Reduce2DCol, Torch) BENCHMARK_DEFINE_F(Reduce2DCol, OpSchedule)(benchmark::State& state) { constexpr int kCacheSize = 1 << 12; te::BufHandle a("A", {M, N}, te::kFloat); - te::Tensor b = - te::computeSum({a, te::IntList({0}), false}, {N}, at::kFloat, at::kCPU); + te::Tensor b = te::computeSum( + {a, te::IntList({0}), false}, {N}, {1}, at::kFloat, at::kCPU); te::LoopNest nest({b}); auto sch = state.range(2); @@ -565,8 +565,8 @@ BENCHMARK_REGISTER_F(Reduce2DRow, Hand)->Args({1 << 18, 1 << 6}); BENCHMARK_DEFINE_F(Reduce2DRow, OpSchedule)(benchmark::State& state) { constexpr int kChunkSize = 8; te::BufHandle a("A", {M, N}, te::kFloat); - te::Tensor b = - te::computeSum({a, te::IntList({1}), false}, {M}, at::kFloat, at::kCPU); + te::Tensor b = te::computeSum( + {a, te::IntList({1}), false}, {M}, {1}, at::kFloat, at::kCPU); te::LoopNest nest({b}); auto sch = state.range(2); diff --git a/benchmarks/cpp/tensorexpr/bench_signed_log1p.cpp b/benchmarks/cpp/tensorexpr/bench_signed_log1p.cpp index 0454530f880f..568905acd7c4 100644 --- a/benchmarks/cpp/tensorexpr/bench_signed_log1p.cpp +++ b/benchmarks/cpp/tensorexpr/bench_signed_log1p.cpp @@ -46,13 +46,13 @@ class SignedLog1pBench : public benchmark::Fixture { "input", {input_size_int_[0], input_size_int_[1]}, kFloat); Tensor abs_result = Compute( "aten_abs", - {{input_size_int_[0], "M"}, {input_size_int_[1], "N"}}, + {input_size_int_[0], input_size_int_[1]}, [&](const VarHandle& m, const VarHandle& n) { return abs(input_ph.load(m, n)); }); Tensor log1p_result = Compute( "aten_log1p", - {{input_size_int_[0], "M"}, {input_size_int_[1], "N"}}, + {input_size_int_[0], input_size_int_[1]}, [&](const VarHandle& m, const VarHandle& 
n) { return log1p(abs_result.load(m, n)); }); @@ -60,7 +60,7 @@ class SignedLog1pBench : public benchmark::Fixture { computeSign({input_ph}, {input_size_int_[0], input_size_int_[1]}); Tensor output = Compute( "aten_mul", - {{input_size_int_[0], "M"}, {input_size_int_[1], "N"}}, + {input_size_int_[0], input_size_int_[1]}, [&](const VarHandle& m, const VarHandle& n) { return sign_result.load(m, n) * log1p_result.load(m, n); }); @@ -94,13 +94,13 @@ class SignedLog1pBench : public benchmark::Fixture { "input", {input_size_int_[0], input_size_int_[1]}, kFloat); Tensor abs_result = Compute( "aten_abs", - {{input_size_int_[0], "M"}, {input_size_int_[1], "N"}}, + {input_size_int_[0], input_size_int_[1]}, [&](const VarHandle& m, const VarHandle& n) { return abs(input_ph.load(m, n)); }); Tensor log_vml_result = Compute( "aten_log1p", - {{input_size_int_[0], "M"}, {input_size_int_[1], "N"}}, + {input_size_int_[0], input_size_int_[1]}, [&](const VarHandle& m, const VarHandle& n) { return log_vml(abs_result.load(m, n) + ExprHandle(1)); }); @@ -108,7 +108,7 @@ class SignedLog1pBench : public benchmark::Fixture { computeSign({input_ph}, {input_size_int_[0], input_size_int_[1]}); Tensor output = Compute( "aten_mul", - {{input_size_int_[0], "M"}, {input_size_int_[1], "N"}}, + {input_size_int_[0], input_size_int_[1]}, [&](const VarHandle& m, const VarHandle& n) { return sign_result.load(m, n) * log_vml_result.load(m, n); }); diff --git a/benchmarks/distributed/rpc/parameter_server/trainer/ddp_models.py b/benchmarks/distributed/rpc/parameter_server/trainer/ddp_models.py index 6d4f0c689401..fd582ddd7781 100644 --- a/benchmarks/distributed/rpc/parameter_server/trainer/ddp_models.py +++ b/benchmarks/distributed/rpc/parameter_server/trainer/ddp_models.py @@ -4,7 +4,7 @@ def basic_ddp_model(self, rank, model, process_group, hook_state, hook): r""" A function that creates a ddp_model and hook_state objects. - The ddp model is is initialized with a single device id and + The ddp model is initialized with a single device id and the process group. The ddp_model also registers the communication hook. 
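As a rough illustration of the wrapper the docstring above describes (not the benchmark's exact code), here is a minimal sketch in which `rank`, `model`, `process_group`, `hook_state`, and `hook` are assumed to be supplied by the trainer and a process group has already been initialized:

```python
from torch.nn.parallel import DistributedDataParallel as DDP

def build_basic_ddp_model(rank, model, process_group, hook_state, hook):
    # Wrap the module with a single device id and the given process group.
    ddp_model = DDP(model, device_ids=[rank], process_group=process_group)
    # Register the gradient-communication hook; DDP invokes it per bucket.
    ddp_model.register_comm_hook(hook_state, hook)
    return ddp_model, hook_state
```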
Args: diff --git a/benchmarks/fastrnns/bench.py b/benchmarks/fastrnns/bench.py index b7c315b27fef..8b4569a9d56b 100644 --- a/benchmarks/fastrnns/bench.py +++ b/benchmarks/fastrnns/bench.py @@ -6,6 +6,7 @@ import json import copy import time +from torch.autograd.profiler import record_function from .fuser import set_fuser from .runner import get_nn_runners @@ -73,7 +74,8 @@ def train_batch(modeldef): gc.collect() fwd_start_event.record() - forward_output = modeldef.forward(*modeldef.inputs) + with record_function("## forward ##"): + forward_output = modeldef.forward(*modeldef.inputs) fwd_end_event.record() # XXX: Use if need to print something diff --git a/benchmarks/fastrnns/fuser.py b/benchmarks/fastrnns/fuser.py index e1daab594c50..29d395055296 100644 --- a/benchmarks/fastrnns/fuser.py +++ b/benchmarks/fastrnns/fuser.py @@ -4,18 +4,18 @@ def set_fuser(fuser_name, executor_name): assert fuser_name in ['te', 'old', 'none', 'default'] if fuser_name == 'te': torch._C._jit_set_profiling_executor(True) - torch._C._jit_set_profiling_mode(True) + torch._C._get_graph_executor_optimize(True) torch._C._jit_override_can_fuse_on_cpu(False) torch._C._jit_override_can_fuse_on_gpu(True) torch._C._jit_set_texpr_fuser_enabled(True) elif fuser_name == 'old': torch._C._jit_set_profiling_executor(False) - torch._C._jit_set_profiling_mode(False) + torch._C._get_graph_executor_optimize(False) torch._C._jit_override_can_fuse_on_gpu(True) torch._C._jit_set_texpr_fuser_enabled(False) elif fuser_name == 'none': torch._C._jit_set_profiling_executor(False) - torch._C._jit_set_profiling_mode(False) + torch._C._get_graph_executor_optimize(False) torch._C._jit_override_can_fuse_on_gpu(False) torch._C._jit_override_can_fuse_on_cpu(False) torch._C._jit_set_texpr_fuser_enabled(False) @@ -25,12 +25,11 @@ def set_fuser(fuser_name, executor_name): # --executor overrides settings of --fuser if executor_name == 'profiling': torch._C._jit_set_profiling_executor(True) - torch._C._jit_set_profiling_mode(True) + torch._C._get_graph_executor_optimize(True) elif executor_name == 'simple': - torch._C._jit_set_profiling_executor(True) - torch._C._jit_set_profiling_mode(False) + torch._C._get_graph_executor_optimize(False) elif executor_name == 'legacy': torch._C._jit_set_profiling_executor(False) - torch._C._jit_set_profiling_mode(False) + torch._C._get_graph_executor_optimize(True) elif executor_name == 'default': pass diff --git a/benchmarks/functional_autograd_benchmark/README.md b/benchmarks/functional_autograd_benchmark/README.md index a5f106fec67d..32d194b5da52 100644 --- a/benchmarks/functional_autograd_benchmark/README.md +++ b/benchmarks/functional_autograd_benchmark/README.md @@ -20,6 +20,10 @@ export OMP_NUM_THREADS=10 git checkout master python setup.py develop +# Install dependencies: +# Scipy is required by detr +pip install scipy + # Run the benchmark for the base # This will use the GPU if available. pushd benchmarks/functional_autograd_benchmark @@ -46,3 +50,18 @@ popd - `compare.py` is the entry point to run the comparison script that generates a markdown table. - `torchaudio_models.py` and `torchvision_models.py` contains code extracted from torchaudio and torchvision to be able to run the models without having a specific version of these libraries installed. - `ppl_models.py`, `vision_models.py` and `audio_text_models.py` contain all the getter functions used for the benchmark. 
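The set_fuser changes in benchmarks/fastrnns/fuser.py above swap torch._C._jit_set_profiling_mode for torch._C._get_graph_executor_optimize. A minimal sketch of the 'te' configuration, using only the private torch._C hooks that appear in the diff (these are internal and may change between releases):

```python
import torch

def enable_te_fuser():
    # Use the profiling executor so shapes/dtypes are recorded for fusion.
    torch._C._jit_set_profiling_executor(True)
    # Despite the name, passing a value here sets the graph-executor optimize
    # flag; this is what replaces the old _jit_set_profiling_mode(True) call.
    torch._C._get_graph_executor_optimize(True)
    # Route fusion to the TensorExpr fuser on GPU; disable legacy CPU fusion.
    torch._C._jit_override_can_fuse_on_cpu(False)
    torch._C._jit_override_can_fuse_on_gpu(True)
    torch._C._jit_set_texpr_fuser_enabled(True)
```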
+ + +### Benchmarking against `functorch` + +```bash +# Install stable functorch: +pip install functorch +# or install from source: +pip install git+https://github.com/pytorch/functorch + +# Run the benchmark for the base +# This will use the GPU if available. +pushd benchmarks/functional_autograd_benchmark +python functional_autograd_benchmark.py --output bench-with-functorch.txt +``` diff --git a/benchmarks/functional_autograd_benchmark/audio_text_models.py b/benchmarks/functional_autograd_benchmark/audio_text_models.py index 938e677ac38a..e731568afe7b 100644 --- a/benchmarks/functional_autograd_benchmark/audio_text_models.py +++ b/benchmarks/functional_autograd_benchmark/audio_text_models.py @@ -3,7 +3,11 @@ import torchaudio_models as models -from utils import extract_weights, load_weights, GetterReturnType +from utils import check_for_functorch, extract_weights, load_weights, GetterReturnType + + +has_functorch = check_for_functorch() + def get_wav2letter(device: torch.device) -> GetterReturnType: N = 10 @@ -50,6 +54,12 @@ def get_deepspeech(device: torch.device) -> GetterReturnType: model = models.DeepSpeech(rnn_type=nn.LSTM, labels=labels, rnn_hidden_size=1024, nb_layers=5, audio_conf=audio_conf, bidirectional=True) + + if has_functorch: + from functorch.experimental import replace_all_batch_norm_modules_ + + replace_all_batch_norm_modules_(model) + model = model.to(device) criterion = nn.CTCLoss() params, names = extract_weights(model) @@ -71,6 +81,11 @@ def get_transformer(device: torch.device) -> GetterReturnType: ntoken = 50 model = models.TransformerModel(ntoken=ntoken, ninp=720, nhead=12, nhid=2048, nlayers=2) model.to(device) + + if has_functorch: + # disable dropout for consistency checking + model.eval() + criterion = nn.NLLLoss() params, names = extract_weights(model) diff --git a/benchmarks/functional_autograd_benchmark/functional_autograd_benchmark.py b/benchmarks/functional_autograd_benchmark/functional_autograd_benchmark.py index aceb59e25b0d..1b0ef20902da 100644 --- a/benchmarks/functional_autograd_benchmark/functional_autograd_benchmark.py +++ b/benchmarks/functional_autograd_benchmark/functional_autograd_benchmark.py @@ -6,6 +6,13 @@ from collections import defaultdict from typing import NamedTuple, Callable, List, Any +try: + import functorch as ft + has_functorch = True + print(f"Found functorch: {ft.__version__}") +except ImportError: + has_functorch = False + import ppl_models import vision_models import audio_text_models @@ -36,6 +43,65 @@ def jacrev(model, inp, strict=None): else: return getattr(functional, task) +def get_task_functorch(task: str) -> Callable: + + @torch.no_grad() + def vjp(model, inp, v=None, strict=None): + assert v is not None + out, vjpfunc = ft.vjp(model, *inp) + return out, vjpfunc(v) + + @torch.no_grad() + def jvp(model, inp, v=None, strict=None): + assert v is not None + return ft.jvp(model, inp, v) + + @torch.no_grad() + def vhp(model, inp, v=None, strict=None): + assert v is not None + argnums = tuple(range(len(inp))) + _, vjpfunc, aux = ft.vjp(ft.grad_and_value(model, argnums), *inp, has_aux=True) + return aux, vjpfunc(v) + + @torch.no_grad() + def hvp(model, inp, v=None, strict=None): + assert v is not None + argnums = tuple(range(len(inp))) + _, hvp_out, aux = ft.jvp(ft.grad_and_value(model, argnums), inp, v, has_aux=True) + return aux, hvp_out + + @torch.no_grad() + def jacfwd(model, inp, v=None, strict=None): + argnums = tuple(range(len(inp))) + return ft.jacfwd(model, argnums)(*inp) + + @torch.no_grad() + def jacrev(model, inp, 
v=None, strict=None): + argnums = tuple(range(len(inp))) + return ft.jacrev(model, argnums)(*inp) + + @torch.no_grad() + def hessian(model, inp, v=None, strict=None): + argnums = tuple(range(len(inp))) + return ft.hessian(model, argnums=argnums)(*inp) + + @torch.no_grad() + def hessian_fwdrev(model, inp, v=None, strict=None): + argnums = tuple(range(len(inp))) + return ft.jacfwd(ft.jacrev(model, argnums=argnums), argnums=argnums)(*inp) + + @torch.no_grad() + def hessian_revrev(model, inp, v=None, strict=None): + argnums = tuple(range(len(inp))) + return ft.jacrev(ft.jacrev(model, argnums=argnums), argnums=argnums)(*inp) + + if task in locals(): + return locals()[task] + elif task == "jacobian": + raise RuntimeError("functorch has no equivalent of autograd.functional.jacobian with vectorize=False yet") + else: + raise RuntimeError(f"Unsupported task: {task}") + # Listing of the different tasks FAST_TASKS_NO_DOUBLE_BACK = [ "vjp", @@ -99,7 +165,7 @@ def get_v_for(model: Callable, inp: InputsType, task: str) -> VType: return v -def run_once(model: Callable, inp: InputsType, task: str, v: VType) -> None: +def run_once(model: Callable, inp: InputsType, task: str, v: VType, **kwargs) -> None: func = get_task_func(task) if v is not None: @@ -107,7 +173,24 @@ def run_once(model: Callable, inp: InputsType, task: str, v: VType) -> None: else: res = func(model, inp, strict=True) -def run_model(model_getter: GetterType, args: Any, task: str) -> List[float]: +def run_once_functorch(model: Callable, inp: InputsType, task: str, v: VType, maybe_check_consistency=False) -> None: + func = get_task_functorch(task) + + if v is not None: + res = func(model, inp, v=v, strict=True) + else: + res = func(model, inp, strict=True) + + if maybe_check_consistency: + af_func = get_task_func(task) + if v is not None: + expected = af_func(model, inp, v=v, strict=True) + else: + expected = af_func(model, inp, strict=True) + atol = 1e-2 if task == "vhp" else 5e-3 + torch.testing.assert_close(res, expected, rtol=1e-5, atol=atol, msg=f"Consistency fail for task '{task}'") + +def run_model(model_getter: GetterType, args: Any, task: str, run_once_fn: Callable = run_once) -> List[float]: if args.gpu == -1: device = torch.device("cpu") @@ -121,14 +204,17 @@ def noop(): model, inp = model_getter(device) v = get_v_for(model, inp, task) + # Warmup - run_once(model, inp, task, v) + # maybe_check_consistency=True checks for consistency between + # functorch vs autograd.functional and is done in run_once_functorch only + run_once_fn(model, inp, task, v, maybe_check_consistency=True) elapsed = [] for it in range(args.num_iters): do_sync() start = time.time() - run_once(model, inp, task, v) + run_once_fn(model, inp, task, v) do_sync() elapsed.append(time.time() - start) @@ -173,6 +259,18 @@ def main(): results[name][task] = (mean.item(), var.item()) print("Results for model {} on task {}: {}s (var: {})".format(name, task, mean, var)) + if has_functorch: + try: + runtimes = run_model(model_getter, args, task, run_once_fn=run_once_functorch) + except RuntimeError as e: + print(f"Failed model using Functorch: {name}, task: {task}, Error message: \n\t", e) + continue + + runtimes = torch.tensor(runtimes) + mean, var = runtimes.mean(), runtimes.var() + results[name][f"functorch {task}"] = (mean.item(), var.item()) + print("Results for model {} on task {} using Functorch: {}s (var: {})".format(name, task, mean, var)) + if args.output: with open(args.output, "w") as f: f.write(to_markdown_table(results)) diff --git 
a/benchmarks/functional_autograd_benchmark/utils.py b/benchmarks/functional_autograd_benchmark/utils.py index c7aeb29d157b..dcf03e7a28d0 100644 --- a/benchmarks/functional_autograd_benchmark/utils.py +++ b/benchmarks/functional_autograd_benchmark/utils.py @@ -101,3 +101,10 @@ def from_markdown_table(data: str) -> TimingResultType: res[model][task] = (float(mean), float(var)) return res + +def check_for_functorch(): + try: + import functorch # noqa: F401 + return True + except ImportError: + return False diff --git a/benchmarks/functional_autograd_benchmark/vision_models.py b/benchmarks/functional_autograd_benchmark/vision_models.py index cd2f84e638a1..4c7c9d5bdd53 100644 --- a/benchmarks/functional_autograd_benchmark/vision_models.py +++ b/benchmarks/functional_autograd_benchmark/vision_models.py @@ -2,13 +2,22 @@ from torch import Tensor import torchvision_models as models -from utils import extract_weights, load_weights, GetterReturnType +from utils import check_for_functorch, extract_weights, load_weights, GetterReturnType from typing import cast +has_functorch = check_for_functorch() + + def get_resnet18(device: torch.device) -> GetterReturnType: N = 32 model = models.resnet18(pretrained=False) + + if has_functorch: + from functorch.experimental import replace_all_batch_norm_modules_ + + replace_all_batch_norm_modules_(model) + criterion = torch.nn.CrossEntropyLoss() model.to(device) params, names = extract_weights(model) @@ -29,6 +38,14 @@ def get_fcn_resnet(device: torch.device) -> GetterReturnType: N = 8 criterion = torch.nn.MSELoss() model = models.fcn_resnet50(pretrained=False, pretrained_backbone=False) + + if has_functorch: + from functorch.experimental import replace_all_batch_norm_modules_ + + replace_all_batch_norm_modules_(model) + # disable dropout for consistency checking + model.eval() + model.to(device) params, names = extract_weights(model) @@ -56,6 +73,12 @@ def get_detr(device: torch.device) -> GetterReturnType: model = models.DETR(num_classes=num_classes, hidden_dim=hidden_dim, nheads=nheads, num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers) + + if has_functorch: + from functorch.experimental import replace_all_batch_norm_modules_ + + replace_all_batch_norm_modules_(model) + losses = ['labels', 'boxes', 'cardinality'] eos_coef = 0.1 bbox_loss_coef = 5 @@ -74,9 +97,9 @@ def get_detr(device: torch.device) -> GetterReturnType: for idx in range(N): targets = {} n_targets: int = int(torch.randint(5, 10, size=tuple()).item()) - label = torch.randint(5, 10, size=(n_targets,)) + label = torch.randint(5, 10, size=(n_targets,), device=device) targets["labels"] = label - boxes = torch.randint(100, 800, size=(n_targets, 4)) + boxes = torch.randint(100, 800, size=(n_targets, 4), device=device) for t in range(n_targets): if boxes[t, 0] > boxes[t, 2]: boxes[t, 0], boxes[t, 2] = boxes[t, 2], boxes[t, 0] diff --git a/benchmarks/instruction_counts/core/expand.py b/benchmarks/instruction_counts/core/expand.py index 6e882f3a52cb..f6713ee65cb9 100644 --- a/benchmarks/instruction_counts/core/expand.py +++ b/benchmarks/instruction_counts/core/expand.py @@ -8,7 +8,7 @@ import os import re import textwrap -from typing import cast, List, Optional, Tuple, TYPE_CHECKING +from typing import List, Optional, Tuple, TYPE_CHECKING import uuid import torch @@ -63,15 +63,12 @@ def _generate_torchscript_file(model_src: str, name: str) -> Optional[str]: # Import magic to actually load our function. 
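For orientation, the functorch task wrappers added in functional_autograd_benchmark.py above are thin adapters over ft.vjp, ft.jvp, and related transforms. A small self-contained sketch of that calling pattern (it assumes functorch is installed; the toy model is illustrative, not one of the benchmark models):

```python
import torch
import functorch as ft

def model(x):
    return (x.sin() * 3.0).sum()

x = torch.randn(5)

# vjp: forward pass plus a function mapping an output cotangent to input grads.
out, vjp_fn = ft.vjp(model, x)
(grad_x,) = vjp_fn(torch.ones_like(out))

# jvp: forward pass plus the directional derivative along a tangent.
tangent = torch.randn(5)
out2, jvp_out = ft.jvp(model, (x,), (tangent,))
```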
module_spec = importlib.util.spec_from_file_location(f"torchscript__{name}", module_path) + assert module_spec is not None module = importlib.util.module_from_spec(module_spec) loader = module_spec.loader assert loader is not None - # Module.loader has type Optional[_Loader]. Even when we assert loader is - # not None and MyPy narrows it to type _Loader, it will not pass type - # checks. So we have to use a cast to tell MyPy that _Loader implements - # importlib.abc.Loader. - cast(importlib.abc.Loader, loader).exec_module(module) + loader.exec_module(module) # And again, the type checker has no way of knowing that this line is valid. jit_model = module.jit_model # type: ignore[attr-defined] diff --git a/benchmarks/operator_benchmark/README.md b/benchmarks/operator_benchmark/README.md index 9efa4a8c22bc..59918f6fab3c 100644 --- a/benchmarks/operator_benchmark/README.md +++ b/benchmarks/operator_benchmark/README.md @@ -136,7 +136,7 @@ $ python -m benchmark_all_test --list_tests Filter and run an operator (use add as an example): ``` -$ python -m benchmark_all_test --operator add --omp_num_threads 1 --mkl_num_threads 1 +$ python -m benchmark_all_test --operators add --omp_num_threads 1 --mkl_num_threads 1 ``` Note: this filter is based on the operator name rather than the file name. diff --git a/benchmarks/operator_benchmark/benchmark_core.py b/benchmarks/operator_benchmark/benchmark_core.py index 4248e4776f22..16a66d5cf92b 100644 --- a/benchmarks/operator_benchmark/benchmark_core.py +++ b/benchmarks/operator_benchmark/benchmark_core.py @@ -200,8 +200,8 @@ def _print_header(self): print("# {}".format(self.args.operators)) def _print_perf_result(self, reported_run_time_us, test_case): - if self.args.ai_pep_format: - # Output for AI-PEP + if self.args.report_aibench: + # Output for AIBench # Print out per iteration execution time instead of avg time return test_name = '_'.join([test_case.framework, test_case.test_config.test_name]) @@ -288,7 +288,7 @@ def _measure_time(self, launch_test, test_case, iters, print_per_iter): report_run_time = 1e6 * run_time_sec / iters time_trace.append(report_run_time) # Print out the time spent in each epoch in ms - if self.args.ai_pep_format: + if self.args.report_aibench: mode = "JIT" if self.use_jit else "Eager" test_name = '_'.join([test_case.framework, test_case.test_config.test_name, mode]) print("PyTorchObserver " + json.dumps( diff --git a/benchmarks/operator_benchmark/benchmark_runner.py b/benchmarks/operator_benchmark/benchmark_runner.py index b9347364428e..3e998e6ceb4e 100644 --- a/benchmarks/operator_benchmark/benchmark_runner.py +++ b/benchmarks/operator_benchmark/benchmark_runner.py @@ -89,12 +89,12 @@ def parse_args(): ) parser.add_argument( - "--ai_pep_format", + "--report_aibench", type=benchmark_utils.str2bool, nargs='?', const=True, default=False, - help="Print result when running on AI-PEP" + help="Print result when running on AIBench" ) parser.add_argument( diff --git a/benchmarks/operator_benchmark/pt/qinterpolate_test.py b/benchmarks/operator_benchmark/pt/qinterpolate_test.py index ec58e6e6a7dd..764274f92581 100644 --- a/benchmarks/operator_benchmark/pt/qinterpolate_test.py +++ b/benchmarks/operator_benchmark/pt/qinterpolate_test.py @@ -44,7 +44,7 @@ def init(self, M, N, K, dtype, mode, scale, contig): zero_point=zero_point, dtype=dtype) if not contig: - permute_dims = list(range(q_input.ndim))[::-1] + permute_dims = list(range(self.q_input.ndim))[::-1] self.q_input = self.q_input.permute(permute_dims) self.inputs = { diff --git 
a/benchmarks/static_runtime/CMakeLists.txt b/benchmarks/static_runtime/CMakeLists.txt index d248fe2a5573..1fba02566771 100644 --- a/benchmarks/static_runtime/CMakeLists.txt +++ b/benchmarks/static_runtime/CMakeLists.txt @@ -6,4 +6,5 @@ list(APPEND STATIC_RUNTIME_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/deep_wide_pt.cc list(APPEND STATIC_RUNTIME_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/test_utils.cc) list(APPEND STATIC_RUNTIME_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/test_static_runtime.cc) list(APPEND STATIC_RUNTIME_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/test_static_module.cc) +list(APPEND STATIC_RUNTIME_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/test_generated_ops.cc) set(STATIC_RUNTIME_TEST_SRCS ${STATIC_RUNTIME_TEST_SRCS} PARENT_SCOPE) diff --git a/benchmarks/static_runtime/deep_wide_pt.h b/benchmarks/static_runtime/deep_wide_pt.h index 73a943146f24..5b18c96364ba 100644 --- a/benchmarks/static_runtime/deep_wide_pt.h +++ b/benchmarks/static_runtime/deep_wide_pt.h @@ -60,7 +60,7 @@ struct DeepAndWideFast : torch::nn::Module { auto dp_unflatten = at::cpu::bmm(ad_emb_packed, user_emb_t); // auto dp = at::native::flatten(dp_unflatten, 1); auto dp = dp_unflatten.view({dp_unflatten.size(0), 1}); - auto input = at::native::_cat_cpu({dp, wide_preproc}, 1); + auto input = at::cpu::cat({dp, wide_preproc}, 1); // fc1 = torch::nn::functional::linear(input, fc_w_, fc_b_); fc_w_t_ = torch::t(fc_w_); @@ -114,7 +114,7 @@ struct DeepAndWideFast : torch::nn::Module { // Potential optimization: we can replace cat with carefully constructed // tensor views on the output that are passed to the _out ops above. - at::native::_cat_out_cpu( + at::cpu::cat_outf( {prealloc_tensors[5], prealloc_tensors[2]}, 1, prealloc_tensors[6]); at::cpu::addmm_out( prealloc_tensors[7], fc_b_, prealloc_tensors[6], fc_w_t_, 1, 1); diff --git a/benchmarks/static_runtime/test_cpu_fusion.cc b/benchmarks/static_runtime/test_cpu_fusion.cc new file mode 100644 index 000000000000..82f11a9ec5db --- /dev/null +++ b/benchmarks/static_runtime/test_cpu_fusion.cc @@ -0,0 +1,138 @@ +#include +#include +#include +#include + +#include "test_utils.h" + +using namespace torch; +using namespace torch::jit; +using namespace torch::jit::test; + +TEST(CpuFusion, Simple) { + const auto simple_script = R"JIT( + def forward(self, a, b): + return (a + b).relu().tanh() + )JIT"; + + Module m("module"); + m.define(simple_script); + + StaticModuleOptions opts; // start with the defaults. + opts.enable_tensorexpr_fusion = true; + + auto input1 = at::randn({2, 3}); + auto input2 = at::ones({2, 3}); + + auto smodule = StaticModule(m, /* is_frozen */ false, opts, {input1, input2}); + StaticRuntime runtime(smodule); + + // Test with sample inputs + { + auto actual = runtime({input1, input2}, {}); + auto expect = at::tanh(at::relu(input1 + input2)); + EXPECT_TRUE(at::allclose(expect, actual.toTensor())); + } + + // Test with different inputs + { + auto new_input1 = at::randn({5, 14}); + auto new_input2 = at::randn({5, 14}); + auto actual = runtime({new_input1, new_input2}, {}); + auto expect = at::tanh(at::relu(new_input1 + new_input2)); + EXPECT_TRUE(at::allclose(expect, actual.toTensor())); + } +} + +TEST(CpuFusion, FallbackGraph) { + const auto simple_script = R"JIT( + def forward(self, a, b): + return (a + b).relu().tanh() + )JIT"; + + Module m("module"); + m.define(simple_script); + + StaticModuleOptions opts; // start with the defaults. 
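The CpuFusion tests above all exercise the same tiny TorchScript graph, (a + b).relu().tanh(), through StaticRuntime with TensorExpr fusion enabled and compare against ATen. A Python-side sketch of just that reference computation (the static-runtime and fusion plumbing itself is C++-only):

```python
import torch

class M(torch.nn.Module):
    def forward(self, a, b):
        return (a + b).relu().tanh()

scripted = torch.jit.script(M())
a, b = torch.randn(2, 3), torch.ones(2, 3)
expect = torch.tanh(torch.relu(a + b))
assert torch.allclose(scripted(a, b), expect)
```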
+ opts.enable_tensorexpr_fusion = true; + + auto sample_input1 = at::randn({2, 3}); + auto sample_input2 = at::ones({2, 3}); + auto smodule = StaticModule( + m, /* is_frozen */ false, opts, {sample_input1, sample_input2}); + + StaticRuntime runtime(smodule); + + // The sample inputs above were contiguous. Now, use a strided input + // to trigger running the fallback graph. + { + auto input1 = at::narrow(at::randn({2, 6}), 1, 0, 3); + auto input2 = at::ones({2, 3}); + auto expect = at::tanh(at::relu(input1 + input2)); + auto actual = runtime({input1, input2}, {}); + EXPECT_TRUE(at::allclose(expect, actual.toTensor())); + } + + // Test with strided inputs of different size. + { + auto input1 = at::narrow(at::randn({10, 30}), 1, 0, 25); + auto input2 = at::randn({10, 25}); + auto expect = at::tanh(at::relu(input1 + input2)); + auto actual = runtime({input1, input2}, {}); + EXPECT_TRUE(at::allclose(expect, actual.toTensor())); + } +} + +TEST(CpuFusion, ParallelRuntimes) { + const auto simple_script = R"JIT( + def forward(self, a, b): + return (a + b).relu().tanh() + )JIT"; + + Module m("module"); + m.define(simple_script); + + StaticModuleOptions opts; // start with the defaults. + opts.enable_tensorexpr_fusion = true; + + auto sample_input1 = at::randn({2, 3}); + auto sample_input2 = at::ones({2, 3}); + auto smodule = StaticModule( + m, /* is_frozen */ false, opts, {sample_input1, sample_input2}); + + constexpr size_t kNumThreads = 2; + std::vector>> all_inputs; + for (size_t id = 0; id < kNumThreads; ++id) { + std::vector> thread_input = { + {id, id + 1}, + {id + 10, id + 11}, + {id + 20, id + 21}, + {id + 30, id + 31}, + {id + 40, id + 41}, + {id + 50, id + 51}, + {id + 60, id + 61}, + {id + 70, id + 71}}; + all_inputs.emplace_back(std::move(thread_input)); + } + + auto exec_runtime = [&](size_t tid) { + const auto& inputs = all_inputs[tid]; + StaticRuntime runtime(smodule); + for (const auto& inp : inputs) { + auto a = at::randn({inp.first, inp.second}); + auto b = at::randn({inp.first, inp.second}); + auto expect = at::tanh(at::relu(a + b)); + auto actual = runtime({a, b}, {}); + EXPECT_TRUE(at::allclose(expect, actual.toTensor())); + } + }; + + std::vector threads; + for (size_t id = 0; id < kNumThreads; ++id) { + threads.emplace_back(exec_runtime, id); + } + + for (auto& t : threads) { + t.join(); + } +} diff --git a/benchmarks/static_runtime/test_generated_ops.cc b/benchmarks/static_runtime/test_generated_ops.cc new file mode 100644 index 000000000000..3011a3abbe05 --- /dev/null +++ b/benchmarks/static_runtime/test_generated_ops.cc @@ -0,0 +1,7864 @@ +// @lint-ignore-every CLANGTIDY HOWTOEVEN +#include +#include +#include + +#include "test_utils.h" + +using namespace caffe2; +using namespace torch; +using namespace torch::jit; +using namespace torch::jit::test; +using c10::IValue; + +TEST(StaticRuntime, autogen_absolute) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::absolute(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_angle) { + const std::string script = R"IR( + graph(%self: Tensor): + 
%bias: None = prim::Constant() + %ret = aten::angle(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_sgn) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::sgn(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_acos) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::acos(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_arccos) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::arccos(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen__add_relu_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor, %alpha: int): + %bias: None = prim::Constant() + %ret = aten::_add_relu(%self, %other, %alpha) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + auto alpha0 = 2; + std::vector args{self0, other0, alpha0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + auto alpha1 = 2; + std::vector args2{self1, other1, alpha1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_addmv) { + const std::string script = R"IR( + graph(%self: Tensor, %mat: Tensor, %vec: Tensor, %beta: int, %alpha: int): + %bias: None = prim::Constant() + %ret = aten::addmv(%self, %mat, %vec, %beta, %alpha) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({2}); + auto mat0 = at::rand({2, 2}); + auto vec0 = at::rand({2}); + 
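Each of these generated tests follows the same pattern: build a one-op IR graph that clones its result, then run it through testStaticRuntime twice, once with small inputs and once with larger ones so the output-resizing path is exercised. As an eager-mode sketch of what the aten::addmv graph in the surrounding test computes (illustrative only, not the test harness):

```python
import torch

def addmv_graph(self_t, mat, vec, beta, alpha):
    # Mirrors the IR: ret = aten::addmv(...); cloned = aten::clone(ret)
    ret = torch.addmv(self_t, mat, vec, beta=beta, alpha=alpha)
    return ret.clone()

out_small = addmv_graph(torch.rand(2), torch.rand(2, 2), torch.rand(2), 2, 2)
out_large = addmv_graph(torch.rand(35), torch.rand(35, 35), torch.rand(35), 2, 2)
```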
auto beta0 = 2; + auto alpha0 = 2; + std::vector args{self0, mat0, vec0, beta0, alpha0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({35}); + auto mat1 = at::rand({35, 35}); + auto vec1 = at::rand({35}); + auto beta1 = 2; + auto alpha1 = 2; + std::vector args2{self1, mat1, vec1, beta1, alpha1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_addr) { + const std::string script = R"IR( + graph(%self: Tensor, %vec1: Tensor, %vec2: Tensor, %beta: int, %alpha: int): + %bias: None = prim::Constant() + %ret = aten::addr(%self, %vec1, %vec2, %beta, %alpha) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6}); + auto vec10 = at::rand({6}); + auto vec20 = at::rand({6}); + auto beta0 = 2; + auto alpha0 = 2; + std::vector args{self0, vec10, vec20, beta0, alpha0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22}); + auto vec11 = at::rand({22}); + auto vec21 = at::rand({22}); + auto beta1 = 2; + auto alpha1 = 2; + std::vector args2{self1, vec11, vec21, beta1, alpha1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_argmax) { + const std::string script = R"IR( + graph(%self: Tensor, %dim: int?, %keepdim: bool): + %bias: None = prim::Constant() + %ret = aten::argmax(%self, %dim, %keepdim) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto dim0 = 1; + auto keepdim0 = false; + std::vector args{self0, dim0, keepdim0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto dim1 = 1; + auto keepdim1 = false; + std::vector args2{self1, dim1, keepdim1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_acosh) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::acosh(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({2, 2, 2}) + at::ones({2, 2, 2}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({5, 5, 5}) + at::ones({5, 5, 5}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_asinh) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::asinh(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_arcsinh) { + const 
std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::arcsinh(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_atanh) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::atanh(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_arctanh) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::arctanh(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_asin) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::asin(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_arcsin) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::arcsin(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_atan) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::atan(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + 
/*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_arctan) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::arctan(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_baddbmm) { + const std::string script = R"IR( + graph(%self: Tensor, %batch1: Tensor, %batch2: Tensor, %beta: int, %alpha: int): + %bias: None = prim::Constant() + %ret = aten::baddbmm(%self, %batch1, %batch2, %beta, %alpha) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto batch10 = at::rand({6, 6, 6}); + auto batch20 = at::rand({6, 6, 6}); + auto beta0 = 2; + auto alpha0 = 2; + std::vector args{self0, batch10, batch20, beta0, alpha0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto batch11 = at::rand({22, 22, 22}); + auto batch21 = at::rand({22, 22, 22}); + auto beta1 = 2; + auto alpha1 = 2; + std::vector args2{self1, batch11, batch21, beta1, alpha1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_bitwise_not) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::bitwise_not(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::randint(1, 100, {6, 6, 6}, at::kInt); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::randint(1, 100, {22, 22, 22}, at::kInt); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_copysign_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::copysign(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_logical_not) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::logical_not(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 
22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_logical_xor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::logical_xor(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_logical_and) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::logical_and(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_logical_or) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::logical_or(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_ceil) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::ceil(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_clamp_max) { + const std::string script = R"IR( + graph(%self: Tensor, %max: int): + %bias: None = prim::Constant() + %ret = aten::clamp_max(%self, %max) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto max0 = 2; + std::vector args{self0, max0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto max1 = 2; + std::vector args2{self1, max1}; + 
testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_clip) { + const std::string script = R"IR( + graph(%self: Tensor, %min: int?, %max: int?): + %bias: None = prim::Constant() + %ret = aten::clip(%self, %min, %max) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto min0 = 2; + auto max0 = 2; + std::vector args{self0, min0, max0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto min1 = 2; + auto max1 = 2; + std::vector args2{self1, min1, max1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_complex) { + const std::string script = R"IR( + graph(%real: Tensor, %imag: Tensor): + %bias: None = prim::Constant() + %ret = aten::complex(%real, %imag) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto real0 = at::rand({6, 6, 6}); + auto imag0 = at::rand({6, 6, 6}); + std::vector args{real0, imag0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto real1 = at::rand({22, 22, 22}); + auto imag1 = at::rand({22, 22, 22}); + std::vector args2{real1, imag1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_polar) { + const std::string script = R"IR( + graph(%abs: Tensor, %angle: Tensor): + %bias: None = prim::Constant() + %ret = aten::polar(%abs, %angle) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto abs0 = at::rand({6, 6, 6}); + auto angle0 = at::rand({6, 6, 6}); + std::vector args{abs0, angle0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto abs1 = at::rand({22, 22, 22}); + auto angle1 = at::rand({22, 22, 22}); + std::vector args2{abs1, angle1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_cos) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::cos(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_cosh) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::cosh(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, 
autogen_cumprod) { + const std::string script = R"IR( + graph(%self: Tensor, %dim: int, %dtype: int?): + %bias: None = prim::Constant() + %ret = aten::cumprod(%self, %dim, %dtype) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto dim0 = 1; + auto dtype0 = at::ScalarType::Float; + std::vector args{self0, dim0, dtype0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto dim1 = 1; + auto dtype1 = at::ScalarType::Float; + std::vector args2{self1, dim1, dtype1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_diff) { + const std::string script = R"IR( + graph(%self: Tensor, %n: int, %dim: int, %prepend: Tensor?, %append: Tensor?): + %bias: None = prim::Constant() + %ret = aten::diff(%self, %n, %dim, %prepend, %append) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto n0 = 1; + auto dim0 = 1; + auto prepend0 = at::rand({6, 6, 6}); + auto append0 = at::rand({6, 6, 6}); + std::vector args{self0, n0, dim0, prepend0, append0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto n1 = 1; + auto dim1 = 1; + auto prepend1 = at::rand({22, 22, 22}); + auto append1 = at::rand({22, 22, 22}); + std::vector args2{self1, n1, dim1, prepend1, append1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_divide_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::divide(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_true_divide_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::true_divide(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_dot) { + const std::string script = R"IR( + graph(%self: Tensor, %tensor: Tensor): + %bias: None = prim::Constant() + %ret = aten::dot(%self, %tensor) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({16}); + auto tensor0 = at::rand({16}); + std::vector 
args{self0, tensor0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); + + auto self1 = at::rand({64}); + auto tensor1 = at::rand({64}); + std::vector args2{self1, tensor1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); +} + +TEST(StaticRuntime, autogen_vdot) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::vdot(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({16}); + auto other0 = at::rand({16}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); + + auto self1 = at::rand({64}); + auto other1 = at::rand({64}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); +} + +TEST(StaticRuntime, autogen_erf) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::erf(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_erfc) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::erfc(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_exp) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::exp(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_exp2) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::exp2(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_expm1) { + 
const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::expm1(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_floor) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::floor(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_frac) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::frac(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_gcd) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::gcd(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::randint(1, 100, {6, 6, 6}, at::kInt); + auto other0 = at::randint(1, 100, {6, 6, 6}, at::kInt); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::randint(1, 100, {22, 22, 22}, at::kInt); + auto other1 = at::randint(1, 100, {22, 22, 22}, at::kInt); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_lcm) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::lcm(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::randint(1, 100, {6, 6, 6}, at::kInt); + auto other0 = at::randint(1, 100, {6, 6, 6}, at::kInt); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::randint(1, 100, {22, 22, 22}, at::kInt); + auto other1 = at::randint(1, 100, {22, 22, 22}, at::kInt); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_index_copy) { + const std::string script = R"IR( + graph(%self: Tensor, %dim: int, %index: Tensor, %source: Tensor): + 
%bias: None = prim::Constant() + %ret = aten::index_copy(%self, %dim, %index, %source) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({2}); + auto dim0 = 0; + auto index0 = at::randint(0, 1, {2}, at::kLong); + auto source0 = at::rand({2}); + std::vector args{self0, dim0, index0, source0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({32}); + auto dim1 = 0; + auto index1 = at::randint(0, 10, {32}, at::kLong); + auto source1 = at::rand({32}); + std::vector args2{self1, dim1, index1, source1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_inverse) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::inverse(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_isin_Tensor_Tensor) { + const std::string script = R"IR( + graph(%elements: Tensor, %test_elements: Tensor, %assume_unique: bool, %invert: bool): + %bias: None = prim::Constant() + %ret = aten::isin(%elements, %test_elements, %assume_unique, %invert) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto elements0 = at::rand({6, 6, 6}); + auto test_elements0 = at::rand({6, 6, 6}); + auto assume_unique0 = false; + auto invert0 = false; + std::vector args{elements0, test_elements0, assume_unique0, invert0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto elements1 = at::rand({22, 22, 22}); + auto test_elements1 = at::rand({22, 22, 22}); + auto assume_unique1 = false; + auto invert1 = false; + std::vector args2{elements1, test_elements1, assume_unique1, invert1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_isin_Tensor_Scalar) { + const std::string script = R"IR( + graph(%elements: Tensor, %test_element: int, %assume_unique: bool, %invert: bool): + %bias: None = prim::Constant() + %ret = aten::isin(%elements, %test_element, %assume_unique, %invert) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto elements0 = at::rand({6, 6, 6}); + auto test_element0 = 2; + auto assume_unique0 = false; + auto invert0 = false; + std::vector args{elements0, test_element0, assume_unique0, invert0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto elements1 = at::rand({22, 22, 22}); + auto test_element1 = 2; + auto assume_unique1 = false; + auto invert1 = false; + std::vector args2{elements1, test_element1, assume_unique1, invert1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_isin_Scalar_Tensor) { + const std::string script = R"IR( + graph(%element: int, %test_elements: Tensor, %assume_unique: 
bool, %invert: bool): + %bias: None = prim::Constant() + %ret = aten::isin(%element, %test_elements, %assume_unique, %invert) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto element0 = 2; + auto test_elements0 = at::rand({6, 6, 6}); + auto assume_unique0 = false; + auto invert0 = false; + std::vector args{element0, test_elements0, assume_unique0, invert0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); + + auto element1 = 2; + auto test_elements1 = at::rand({22, 22, 22}); + auto assume_unique1 = false; + auto invert1 = false; + std::vector args2{element1, test_elements1, assume_unique1, invert1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); +} + +TEST(StaticRuntime, autogen_kron) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::kron(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_ldexp_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::ldexp(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_log10) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::log10(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_log1p) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::log1p(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_log2) { 
+ const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::log2(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_logaddexp) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::logaddexp(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_logaddexp2) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::logaddexp2(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_xlogy_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::xlogy(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen__log_softmax) { + const std::string script = R"IR( + graph(%self: Tensor, %dim: int, %half_to_float: bool): + %bias: None = prim::Constant() + %ret = aten::_log_softmax(%self, %dim, %half_to_float) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto dim0 = 1; + auto half_to_float0 = false; + std::vector args{self0, dim0, half_to_float0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto dim1 = 1; + auto half_to_float1 = false; + std::vector args2{self1, dim1, half_to_float1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + 
/*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen__log_softmax_backward_data) { + const std::string script = R"IR( + graph(%grad_output: Tensor, %output: Tensor, %dim: int, %input_dtype: int): + %bias: None = prim::Constant() + %ret = aten::_log_softmax_backward_data(%grad_output, %output, %dim, %input_dtype) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto grad_output0 = at::rand({6, 6, 6}); + auto output0 = at::rand({6, 6, 6}); + auto dim0 = 1; + auto input_dtype0 = at::ScalarType::Float; + std::vector args{grad_output0, output0, dim0, input_dtype0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto grad_output1 = at::rand({22, 22, 22}); + auto output1 = at::rand({22, 22, 22}); + auto dim1 = 1; + auto input_dtype1 = at::ScalarType::Float; + std::vector args2{grad_output1, output1, dim1, input_dtype1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen__logcumsumexp) { + const std::string script = R"IR( + graph(%self: Tensor, %dim: int): + %bias: None = prim::Constant() + %ret = aten::_logcumsumexp(%self, %dim) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto dim0 = 1; + std::vector args{self0, dim0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto dim1 = 1; + std::vector args2{self1, dim1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_logcumsumexp) { + const std::string script = R"IR( + graph(%self: Tensor, %dim: int): + %bias: None = prim::Constant() + %ret = aten::logcumsumexp(%self, %dim) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto dim0 = 1; + std::vector args{self0, dim0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto dim1 = 1; + std::vector args2{self1, dim1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_matrix_power) { + const std::string script = R"IR( + graph(%self: Tensor, %n: int): + %bias: None = prim::Constant() + %ret = aten::matrix_power(%self, %n) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto n0 = 1; + std::vector args{self0, n0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto n1 = 1; + std::vector args2{self1, n1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_mm) { + const std::string script = R"IR( + graph(%self: Tensor, %mat2: Tensor): + %bias: None = prim::Constant() + %ret = aten::mm(%self, %mat2) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({8, 8}); + auto mat20 = at::rand({8, 8}); + std::vector args{self0, mat20}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + 
/*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({32, 32}); + auto mat21 = at::rand({32, 32}); + std::vector args2{self1, mat21}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_multiply_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::multiply(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_mv) { + const std::string script = R"IR( + graph(%self: Tensor, %vec: Tensor): + %bias: None = prim::Constant() + %ret = aten::mv(%self, %vec) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6}); + auto vec0 = at::rand({6}); + std::vector args{self0, vec0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22}); + auto vec1 = at::rand({22}); + std::vector args2{self1, vec1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_mvlgamma) { + const std::string script = R"IR( + graph(%self: Tensor, %p: int): + %bias: None = prim::Constant() + %ret = aten::mvlgamma(%self, %p) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto p0 = 1; + std::vector args{self0, p0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto p1 = 1; + std::vector args2{self1, p1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_rad2deg) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::rad2deg(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_deg2rad) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::deg2rad(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, 
+ /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_reciprocal) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::reciprocal(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_neg) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::neg(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_negative) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::negative(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_round) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::round(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_round_decimals) { + const std::string script = R"IR( + graph(%self: Tensor, %decimals: int): + %bias: None = prim::Constant() + %ret = aten::round(%self, %decimals) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto decimals0 = 1; + std::vector args{self0, decimals0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto decimals1 = 1; + std::vector args2{self1, decimals1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_gelu) { + const std::string script = R"IR( + graph(%self: Tensor, %approximate: str): + %bias: None = prim::Constant() + %ret = aten::gelu(%self, %approximate) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto approximate0 = "tanh"; + 
std::vector args{self0, approximate0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto approximate1 = "tanh"; + std::vector args2{self1, approximate1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_gelu_backward) { + const std::string script = R"IR( + graph(%grad_output: Tensor, %self: Tensor, %approximate: str): + %bias: None = prim::Constant() + %ret = aten::gelu_backward(%grad_output, %self, %approximate) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto grad_output0 = at::rand({6, 6, 6}); + auto self0 = at::rand({6, 6, 6}); + auto approximate0 = "tanh"; + std::vector args{grad_output0, self0, approximate0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto grad_output1 = at::rand({22, 22, 22}); + auto self1 = at::rand({22, 22, 22}); + auto approximate1 = "tanh"; + std::vector args2{grad_output1, self1, approximate1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_hardshrink) { + const std::string script = R"IR( + graph(%self: Tensor, %lambd: int): + %bias: None = prim::Constant() + %ret = aten::hardshrink(%self, %lambd) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto lambd0 = 2; + std::vector args{self0, lambd0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto lambd1 = 2; + std::vector args2{self1, lambd1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_hardshrink_backward) { + const std::string script = R"IR( + graph(%grad_out: Tensor, %self: Tensor, %lambd: int): + %bias: None = prim::Constant() + %ret = aten::hardshrink_backward(%grad_out, %self, %lambd) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto grad_out0 = at::rand({6, 6, 6}); + auto self0 = at::rand({6, 6, 6}); + auto lambd0 = 2; + std::vector args{grad_out0, self0, lambd0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto grad_out1 = at::rand({22, 22, 22}); + auto self1 = at::rand({22, 22, 22}); + auto lambd1 = 2; + std::vector args2{grad_out1, self1, lambd1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_rsqrt) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::rsqrt(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_silu) { + const std::string script = R"IR( + graph(%self: Tensor): 
+ %bias: None = prim::Constant() + %ret = aten::silu(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_silu_backward) { + const std::string script = R"IR( + graph(%grad_output: Tensor, %self: Tensor): + %bias: None = prim::Constant() + %ret = aten::silu_backward(%grad_output, %self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto grad_output0 = at::rand({6, 6, 6}); + auto self0 = at::rand({6, 6, 6}); + std::vector args{grad_output0, self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto grad_output1 = at::rand({22, 22, 22}); + auto self1 = at::rand({22, 22, 22}); + std::vector args2{grad_output1, self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_mish) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::mish(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_sin) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::sin(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_sinc) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::sinc(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_sinh) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::sinh(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 
22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen__softmax) { + const std::string script = R"IR( + graph(%self: Tensor, %dim: int, %half_to_float: bool): + %bias: None = prim::Constant() + %ret = aten::_softmax(%self, %dim, %half_to_float) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto dim0 = 1; + auto half_to_float0 = false; + std::vector args{self0, dim0, half_to_float0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto dim1 = 1; + auto half_to_float1 = false; + std::vector args2{self1, dim1, half_to_float1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen__softmax_backward_data) { + const std::string script = R"IR( + graph(%grad_output: Tensor, %output: Tensor, %dim: int, %input_dtype: int): + %bias: None = prim::Constant() + %ret = aten::_softmax_backward_data(%grad_output, %output, %dim, %input_dtype) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto grad_output0 = at::rand({6, 6, 6}); + auto output0 = at::rand({6, 6, 6}); + auto dim0 = 1; + auto input_dtype0 = at::ScalarType::Float; + std::vector args{grad_output0, output0, dim0, input_dtype0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto grad_output1 = at::rand({22, 22, 22}); + auto output1 = at::rand({22, 22, 22}); + auto dim1 = 1; + auto input_dtype1 = at::ScalarType::Float; + std::vector args2{grad_output1, output1, dim1, input_dtype1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_sqrt) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::sqrt(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_square) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::square(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_prod_dim_int) { + const std::string script = R"IR( + graph(%self: Tensor, %dim: int, %keepdim: bool, %dtype: int?): + %bias: None = prim::Constant() + %ret = aten::prod(%self, %dim, %keepdim, %dtype) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); 
+ auto dim0 = 1; + auto keepdim0 = false; + auto dtype0 = at::ScalarType::Float; + std::vector args{self0, dim0, keepdim0, dtype0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto dim1 = 1; + auto keepdim1 = false; + auto dtype1 = at::ScalarType::Float; + std::vector args2{self1, dim1, keepdim1, dtype1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_tan) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::tan(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_threshold) { + const std::string script = R"IR( + graph(%self: Tensor, %threshold: int, %value: int): + %bias: None = prim::Constant() + %ret = aten::threshold(%self, %threshold, %value) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto threshold0 = 2; + auto value0 = 2; + std::vector args{self0, threshold0, value0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto threshold1 = 2; + auto value1 = 2; + std::vector args2{self1, threshold1, value1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_threshold_backward) { + const std::string script = R"IR( + graph(%grad_output: Tensor, %self: Tensor, %threshold: int): + %bias: None = prim::Constant() + %ret = aten::threshold_backward(%grad_output, %self, %threshold) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto grad_output0 = at::rand({6, 6, 6}); + auto self0 = at::rand({6, 6, 6}); + auto threshold0 = 2; + std::vector args{grad_output0, self0, threshold0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto grad_output1 = at::rand({22, 22, 22}); + auto self1 = at::rand({22, 22, 22}); + auto threshold1 = 2; + std::vector args2{grad_output1, self1, threshold1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_trunc) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::trunc(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_fix) { + const std::string script = R"IR( + graph(%self: Tensor): + 
%bias: None = prim::Constant() + %ret = aten::fix(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_nuclear_norm) { + const std::string script = R"IR( + graph(%self: Tensor, %keepdim: bool): + %bias: None = prim::Constant() + %ret = aten::nuclear_norm(%self, %keepdim) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({8, 8}); + auto keepdim0 = false; + std::vector args{self0, keepdim0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); + + auto self1 = at::rand({32, 32}); + auto keepdim1 = false; + std::vector args2{self1, keepdim1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); +} + +TEST(StaticRuntime, autogen_subtract_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor, %alpha: int): + %bias: None = prim::Constant() + %ret = aten::subtract(%self, %other, %alpha) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + auto alpha0 = 2; + std::vector args{self0, other0, alpha0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + auto alpha1 = 2; + std::vector args2{self1, other1, alpha1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_heaviside) { + const std::string script = R"IR( + graph(%self: Tensor, %values: Tensor): + %bias: None = prim::Constant() + %ret = aten::heaviside(%self, %values) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto values0 = at::rand({6, 6, 6}); + std::vector args{self0, values0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto values1 = at::rand({22, 22, 22}); + std::vector args2{self1, values1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen__addmm_activation) { + const std::string script = R"IR( + graph(%self: Tensor, %mat1: Tensor, %mat2: Tensor, %beta: int, %alpha: int, %use_gelu: bool): + %bias: None = prim::Constant() + %ret = aten::_addmm_activation(%self, %mat1, %mat2, %beta, %alpha, %use_gelu) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({8, 8}); + auto mat10 = at::rand({8, 8}); + auto mat20 = at::rand({8, 8}); + auto beta0 = 2; + auto alpha0 = 2; + auto use_gelu0 = false; + std::vector args{self0, mat10, mat20, beta0, alpha0, use_gelu0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({32, 32}); + auto mat11 = 
at::rand({32, 32}); + auto mat21 = at::rand({32, 32}); + auto beta1 = 2; + auto alpha1 = 2; + auto use_gelu1 = false; + std::vector args2{self1, mat11, mat21, beta1, alpha1, use_gelu1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_index_add) { + const std::string script = R"IR( + graph(%self: Tensor, %dim: int, %index: Tensor, %source: Tensor, %alpha: int): + %bias: None = prim::Constant() + %ret = aten::index_add(%self, %dim, %index, %source, %alpha) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({2}); + auto dim0 = 0; + auto index0 = at::randint(0, 1, {2}, at::kInt); + auto source0 = at::rand({2}); + auto alpha0 = 2; + std::vector args{self0, dim0, index0, source0, alpha0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); + + auto self1 = at::rand({16}); + auto dim1 = 0; + auto index1 = at::randint(0, 10, {16}, at::kInt); + auto source1 = at::rand({16}); + auto alpha1 = 2; + std::vector args2{self1, dim1, index1, source1, alpha1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); +} + +TEST(StaticRuntime, autogen_scatter_src) { + const std::string script = R"IR( + graph(%self: Tensor, %dim: int, %index: Tensor, %src: Tensor): + %bias: None = prim::Constant() + %ret = aten::scatter(%self, %dim, %index, %src) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::randint(1, 100, {2, 2, 2}, torch::kInt64); + auto dim0 = 1; + auto index0 = at::randint(0, 1, {2, 2, 2}, torch::kInt64); + auto src0 = at::randint(1, 100, {2, 2, 2}, torch::kInt64); + std::vector args{self0, dim0, index0, src0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::randint(1, 100, {5, 5, 5}, torch::kInt64); + auto dim1 = 1; + auto index1 = at::randint(0, 1, {5, 5, 5}, torch::kInt64); + auto src1 = at::randint(1, 100, {5, 5, 5}, torch::kInt64); + std::vector args2{self1, dim1, index1, src1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_scatter_value) { + const std::string script = R"IR( + graph(%self: Tensor, %dim: int, %index: Tensor, %value: int): + %bias: None = prim::Constant() + %ret = aten::scatter(%self, %dim, %index, %value) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::randint(1, 100, {2, 2, 2}, torch::kInt64); + auto dim0 = 1; + auto index0 = at::randint(0, 1, {2, 2, 2}, torch::kInt64); + auto value0 = 2; + auto src0 = at::randint(1, 100, {2, 2, 2}, torch::kInt64); + std::vector args{self0, dim0, index0, value0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::randint(1, 100, {5, 5, 5}, torch::kInt64); + auto dim1 = 1; + auto index1 = at::randint(0, 1, {5, 5, 5}, torch::kInt64); + auto value1 = 2; + auto src1 = at::randint(1, 100, {5, 5, 5}, torch::kInt64); + std::vector args2{self1, dim1, index1, value1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_scatter_reduce) { + const std::string script = R"IR( + graph(%self: Tensor, 
%dim: int, %index: Tensor, %src: Tensor, %reduce: str): + %bias: None = prim::Constant() + %ret = aten::scatter(%self, %dim, %index, %src, %reduce) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::randint(1, 100, {2, 2, 2}, torch::kInt64); + auto dim0 = 1; + auto index0 = at::randint(0, 1, {2, 2, 2}, torch::kInt64); + auto src0 = at::randint(1, 100, {2, 2, 2}, torch::kInt64); + auto reduce0 = "add"; + std::vector args{self0, dim0, index0, src0, reduce0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::randint(1, 100, {5, 5, 5}, torch::kInt64); + auto dim1 = 1; + auto index1 = at::randint(0, 1, {5, 5, 5}, torch::kInt64); + auto src1 = at::randint(1, 100, {5, 5, 5}, torch::kInt64); + auto reduce1 = "add"; + std::vector args2{self1, dim1, index1, src1, reduce1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_scatter_value_reduce) { + const std::string script = R"IR( + graph(%self: Tensor, %dim: int, %index: Tensor, %value: int, %reduce: str): + %bias: None = prim::Constant() + %ret = aten::scatter(%self, %dim, %index, %value, %reduce) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::randint(1, 100, {2, 2, 2}, torch::kInt64); + auto dim0 = 1; + auto index0 = at::randint(0, 1, {2, 2, 2}, torch::kInt64); + auto value0 = 2; + auto reduce0 = "add"; + auto src0 = at::randint(1, 100, {2, 2, 2}, torch::kInt64); + std::vector args{self0, dim0, index0, value0, reduce0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::randint(1, 100, {5, 5, 5}, torch::kInt64); + auto dim1 = 1; + auto index1 = at::randint(0, 1, {5, 5, 5}, torch::kInt64); + auto value1 = 2; + auto reduce1 = "add"; + auto src1 = at::randint(1, 100, {5, 5, 5}, torch::kInt64); + std::vector args2{self1, dim1, index1, value1, reduce1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_scatter_add) { + const std::string script = R"IR( + graph(%self: Tensor, %dim: int, %index: Tensor, %src: Tensor): + %bias: None = prim::Constant() + %ret = aten::scatter_add(%self, %dim, %index, %src) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::randint(1, 100, {2, 2, 2}, torch::kInt64); + auto dim0 = 1; + auto index0 = at::randint(0, 1, {2, 2, 2}, torch::kInt64); + auto src0 = at::randint(1, 100, {2, 2, 2}, torch::kInt64); + std::vector args{self0, dim0, index0, src0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::randint(1, 100, {5, 5, 5}, torch::kInt64); + auto dim1 = 1; + auto index1 = at::randint(0, 1, {5, 5, 5}, torch::kInt64); + auto src1 = at::randint(1, 100, {5, 5, 5}, torch::kInt64); + std::vector args2{self1, dim1, index1, src1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_scatter_reduce_two) { + const std::string script = R"IR( + graph(%self: Tensor, %dim: int, %index: Tensor, %src: Tensor, %reduce: str, %include_self: bool): + %bias: None = prim::Constant() + %ret = aten::scatter_reduce(%self, %dim, %index, %src, %reduce, 
%include_self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto dim0 = 1; + auto index0 = at::randint(6, {6, 6, 6}, torch::kInt64); + auto src0 = at::rand({6, 6, 6}); + auto reduce0 = "mean"; + auto include_self0 = false; + std::vector args{self0, dim0, index0, src0, reduce0, include_self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto dim1 = 1; + auto index1 = at::randint(22, {22, 22, 22}, torch::kInt64); + auto src1 = at::rand({22, 22, 22}); + auto reduce1 = "mean"; + auto include_self1 = false; + std::vector args2{self1, dim1, index1, src1, reduce1, include_self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_eq_Scalar) { + const std::string script = R"IR( + graph(%self: Tensor, %other: int): + %bias: None = prim::Constant() + %ret = aten::eq(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = 2; + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = 2; + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_eq_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::eq(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_bitwise_and_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::bitwise_and(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::randint(1, 100, {6, 6, 6}, at::kInt); + auto other0 = at::randint(1, 100, {6, 6, 6}, at::kInt); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::randint(1, 100, {22, 22, 22}, at::kInt); + auto other1 = at::randint(1, 100, {22, 22, 22}, at::kInt); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_bitwise_or_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::bitwise_or(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::randint(1, 100, {6, 6, 6}, at::kInt); + auto other0 = at::randint(1, 100, {6, 6, 6}, at::kInt); + std::vector args{self0, other0}; + 
testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::randint(1, 100, {22, 22, 22}, at::kInt); + auto other1 = at::randint(1, 100, {22, 22, 22}, at::kInt); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_bitwise_xor_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::bitwise_xor(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::randint(1, 100, {6, 6, 6}, at::kInt); + auto other0 = at::randint(1, 100, {6, 6, 6}, at::kInt); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::randint(1, 100, {22, 22, 22}, at::kInt); + auto other1 = at::randint(1, 100, {22, 22, 22}, at::kInt); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_bitwise_left_shift_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::bitwise_left_shift(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_bitwise_right_shift_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::bitwise_right_shift(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_tril) { + const std::string script = R"IR( + graph(%self: Tensor, %diagonal: int): + %bias: None = prim::Constant() + %ret = aten::tril(%self, %diagonal) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto diagonal0 = 1; + std::vector args{self0, diagonal0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto diagonal1 = 1; + std::vector args2{self1, diagonal1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_triu) { + const std::string script = R"IR( + graph(%self: 
Tensor, %diagonal: int): + %bias: None = prim::Constant() + %ret = aten::triu(%self, %diagonal) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto diagonal0 = 1; + std::vector args{self0, diagonal0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto diagonal1 = 1; + std::vector args2{self1, diagonal1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_digamma) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::digamma(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_lerp_Scalar) { + const std::string script = R"IR( + graph(%self: Tensor, %end: Tensor, %weight: int): + %bias: None = prim::Constant() + %ret = aten::lerp(%self, %end, %weight) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto end0 = at::rand({6, 6, 6}); + auto weight0 = 2; + std::vector args{self0, end0, weight0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto end1 = at::rand({22, 22, 22}); + auto weight1 = 2; + std::vector args2{self1, end1, weight1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_lerp_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %end: Tensor, %weight: Tensor): + %bias: None = prim::Constant() + %ret = aten::lerp(%self, %end, %weight) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto end0 = at::rand({6, 6, 6}); + auto weight0 = at::rand({6, 6, 6}); + std::vector args{self0, end0, weight0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto end1 = at::rand({22, 22, 22}); + auto weight1 = at::rand({22, 22, 22}); + std::vector args2{self1, end1, weight1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_addbmm) { + const std::string script = R"IR( + graph(%self: Tensor, %batch1: Tensor, %batch2: Tensor, %beta: int, %alpha: int): + %bias: None = prim::Constant() + %ret = aten::addbmm(%self, %batch1, %batch2, %beta, %alpha) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6}); + auto batch10 = at::rand({6, 6, 6}); + auto batch20 = at::rand({6, 6, 6}); + auto beta0 = 2; + auto alpha0 = 2; + std::vector args{self0, batch10, batch20, beta0, alpha0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 
22}); + auto batch11 = at::rand({22, 22, 22}); + auto batch21 = at::rand({22, 22, 22}); + auto beta1 = 2; + auto alpha1 = 2; + std::vector args2{self1, batch11, batch21, beta1, alpha1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_diag) { + const std::string script = R"IR( + graph(%self: Tensor, %diagonal: int): + %bias: None = prim::Constant() + %ret = aten::diag(%self, %diagonal) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({8, 8}); + auto diagonal0 = 1; + std::vector args{self0, diagonal0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({32, 32}); + auto diagonal1 = 1; + std::vector args2{self1, diagonal1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_cross) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor, %dim: int?): + %bias: None = prim::Constant() + %ret = aten::cross(%self, %other, %dim) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({3, 3, 3}); + auto other0 = at::rand({3, 3, 3}); + auto dim0 = 1; + std::vector args{self0, other0, dim0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 3, 22}); + auto other1 = at::rand({22, 3, 22}); + auto dim1 = 1; + std::vector args2{self1, other1, dim1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_ne_Scalar) { + const std::string script = R"IR( + graph(%self: Tensor, %other: int): + %bias: None = prim::Constant() + %ret = aten::ne(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = 2; + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = 2; + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_ne_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::ne(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_ge_Scalar) { + const std::string script = R"IR( + graph(%self: Tensor, %other: int): + %bias: None = prim::Constant() + %ret = aten::ge(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = 2; + std::vector args{self0, other0}; + 
testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = 2; + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_ge_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::ge(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_le_Scalar) { + const std::string script = R"IR( + graph(%self: Tensor, %other: int): + %bias: None = prim::Constant() + %ret = aten::le(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = 2; + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = 2; + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_le_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::le(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_gt_Scalar) { + const std::string script = R"IR( + graph(%self: Tensor, %other: int): + %bias: None = prim::Constant() + %ret = aten::gt(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = 2; + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = 2; + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_gt_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::gt(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( 
+ script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_lt_Scalar) { + const std::string script = R"IR( + graph(%self: Tensor, %other: int): + %bias: None = prim::Constant() + %ret = aten::lt(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = 2; + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = 2; + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_lt_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::lt(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_take) { + const std::string script = R"IR( + graph(%self: Tensor, %index: Tensor): + %bias: None = prim::Constant() + %ret = aten::take(%self, %index) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto index0 = at::randint(0, 216, {20}, torch::kInt64); + std::vector args{self0, index0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto index1 = at::randint(0, 1000, {100}, torch::kInt64); + std::vector args2{self1, index1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_take_along_dim) { + const std::string script = R"IR( + graph(%self: Tensor, %indices: Tensor, %dim: int?): + %bias: None = prim::Constant() + %ret = aten::take_along_dim(%self, %indices, %dim) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto indices0 = at::argsort(self0, 1); + auto dim0 = 1; + std::vector args{self0, indices0, dim0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto indices1 = at::argsort(self1, 1); + auto dim1 = 1; + std::vector args2{self1, indices1, dim1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_masked_select) { + const std::string script = R"IR( + graph(%self: Tensor, %mask: Tensor): + %bias: None = prim::Constant() + %ret = aten::masked_select(%self, %mask) + 
%cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto mask0 = at::randn({6, 6, 6}) > 0.5; + std::vector args{self0, mask0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto mask1 = at::rand({22, 22, 22}) > 0.5; + std::vector args2{self1, mask1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_gather) { + const std::string script = R"IR( + graph(%self: Tensor, %dim: int, %index: Tensor, %sparse_grad: bool): + %bias: None = prim::Constant() + %ret = aten::gather(%self, %dim, %index, %sparse_grad) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::randint(1, 100, {2, 2, 2}, at::kInt); + auto dim0 = 1; + auto index0 = at::randint(0, 1, {2, 2, 2}, torch::kInt64); + auto sparse_grad0 = false; + std::vector args{self0, dim0, index0, sparse_grad0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::randint(1, 100, {5, 5, 5}, at::kInt); + auto dim1 = 1; + auto index1 = at::randint(0, 4, {5, 5, 5}, torch::kInt64); + auto sparse_grad1 = false; + std::vector args2{self1, dim1, index1, sparse_grad1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_addcmul) { + const std::string script = R"IR( + graph(%self: Tensor, %tensor1: Tensor, %tensor2: Tensor, %value: int): + %bias: None = prim::Constant() + %ret = aten::addcmul(%self, %tensor1, %tensor2, %value) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto tensor10 = at::rand({6, 6, 6}); + auto tensor20 = at::rand({6, 6, 6}); + auto value0 = 2; + std::vector args{self0, tensor10, tensor20, value0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto tensor11 = at::rand({22, 22, 22}); + auto tensor21 = at::rand({22, 22, 22}); + auto value1 = 2; + std::vector args2{self1, tensor11, tensor21, value1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_addcdiv) { + const std::string script = R"IR( + graph(%self: Tensor, %tensor1: Tensor, %tensor2: Tensor, %value: int): + %bias: None = prim::Constant() + %ret = aten::addcdiv(%self, %tensor1, %tensor2, %value) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto tensor10 = at::rand({6, 6, 6}); + auto tensor20 = at::rand({6, 6, 6}); + auto value0 = 2; + std::vector args{self0, tensor10, tensor20, value0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto tensor11 = at::rand({22, 22, 22}); + auto tensor21 = at::rand({22, 22, 22}); + auto value1 = 2; + std::vector args2{self1, tensor11, tensor21, value1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_linalg_solve_triangular) { + const std::string script = R"IR( + graph(%self: 
Tensor, %B: Tensor, %upper: bool, %left: bool, %unitriangular: bool): + %bias: None = prim::Constant() + %ret = aten::linalg_solve_triangular(%self, %B, %upper, %left, %unitriangular) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto B0 = at::rand({6, 6, 6}); + auto upper0 = false; + auto left0 = false; + auto unitriangular0 = false; + std::vector args{self0, B0, upper0, left0, unitriangular0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto B1 = at::rand({22, 22, 22}); + auto upper1 = false; + auto left1 = false; + auto unitriangular1 = false; + std::vector args2{self1, B1, upper1, left1, unitriangular1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_cholesky_solve) { + const std::string script = R"IR( + graph(%self: Tensor, %input2: Tensor, %upper: bool): + %bias: None = prim::Constant() + %ret = aten::cholesky_solve(%self, %input2, %upper) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto input20 = at::rand({6, 6, 6}); + auto upper0 = false; + std::vector args{self0, input20, upper0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto input21 = at::rand({22, 22, 22}); + auto upper1 = false; + std::vector args2{self1, input21, upper1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_cholesky_inverse) { + const std::string script = R"IR( + graph(%self: Tensor, %upper: bool): + %bias: None = prim::Constant() + %ret = aten::cholesky_inverse(%self, %upper) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto upper0 = false; + std::vector args{self0, upper0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto upper1 = false; + std::vector args2{self1, upper1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_orgqr) { + const std::string script = R"IR( + graph(%self: Tensor, %input2: Tensor): + %bias: None = prim::Constant() + %ret = aten::orgqr(%self, %input2) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto input20 = at::rand({6, 6}); + std::vector args{self0, input20}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto input21 = at::rand({22, 22}); + std::vector args2{self1, input21}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_ormqr) { + const std::string script = R"IR( + graph(%self: Tensor, %input2: Tensor, %input3: Tensor, %left: bool, %transpose: bool): + %bias: None = prim::Constant() + %ret = aten::ormqr(%self, %input2, %input3, %left, %transpose) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = 
at::rand({6, 6, 6}); + auto input20 = at::rand({6, 6}); + auto input30 = at::rand({6, 6, 6}); + auto left0 = false; + auto transpose0 = false; + std::vector args{self0, input20, input30, left0, transpose0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto input21 = at::rand({22, 22}); + auto input31 = at::rand({22, 22, 22}); + auto left1 = false; + auto transpose1 = false; + std::vector args2{self1, input21, input31, left1, transpose1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_lgamma) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::lgamma(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_polygamma) { + const std::string script = R"IR( + graph(%n: int, %self: Tensor): + %bias: None = prim::Constant() + %ret = aten::polygamma(%n, %self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto n0 = 1; + auto self0 = at::rand({6, 6, 6}); + std::vector args{n0, self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto n1 = 1; + auto self1 = at::rand({22, 22, 22}); + std::vector args2{n1, self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_erfinv) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::erfinv(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_i0) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::i0(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_signbit) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::signbit(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + 
/*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_atan2) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::atan2(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_arctan2) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::arctan2(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_histc) { + const std::string script = R"IR( + graph(%self: Tensor, %bins: int, %min: int, %max: int): + %bias: None = prim::Constant() + %ret = aten::histc(%self, %bins, %min, %max) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto bins0 = 1; + auto min0 = 2; + auto max0 = 2; + std::vector args{self0, bins0, min0, max0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); + + auto self1 = at::rand({22, 22, 22}); + auto bins1 = 1; + auto min1 = 2; + auto max1 = 2; + std::vector args2{self1, bins1, min1, max1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); +} + +TEST(StaticRuntime, autogen_hypot) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::hypot(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_igamma) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::igamma(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = 
at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_igammac) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::igammac(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_nextafter) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::nextafter(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_fmin) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::fmin(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_fmax) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::fmax(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_maximum) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::maximum(%self, %other) + %cloned = 
aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_max_other) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::max(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_minimum) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::minimum(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_min_other) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::min(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_quantile) { + const std::string script = R"IR( + graph(%self: Tensor, %q: Tensor, %dim: int?, %keepdim: bool, %interpolation: str): + %bias: None = prim::Constant() + %ret = aten::quantile(%self, %q, %dim, %keepdim, %interpolation) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto q0 = at::rand({6}); + auto dim0 = 1; + auto keepdim0 = false; + auto interpolation0 = "linear"; + std::vector args{self0, q0, dim0, keepdim0, interpolation0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto q1 = at::rand({22}); + auto dim1 = 1; + auto keepdim1 = false; + auto interpolation1 = "linear"; + std::vector args2{self1, q1, 
dim1, keepdim1, interpolation1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_nanquantile) { + const std::string script = R"IR( + graph(%self: Tensor, %q: Tensor, %dim: int?, %keepdim: bool, %interpolation: str): + %bias: None = prim::Constant() + %ret = aten::nanquantile(%self, %q, %dim, %keepdim, %interpolation) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto q0 = at::rand({6}); + auto dim0 = 1; + auto keepdim0 = false; + auto interpolation0 = "linear"; + std::vector args{self0, q0, dim0, keepdim0, interpolation0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto q1 = at::rand({22}); + auto dim1 = 1; + auto keepdim1 = false; + auto interpolation1 = "linear"; + std::vector args2{self1, q1, dim1, keepdim1, interpolation1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_msort) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::msort(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_renorm) { + const std::string script = R"IR( + graph(%self: Tensor, %p: int, %dim: int, %maxnorm: int): + %bias: None = prim::Constant() + %ret = aten::renorm(%self, %p, %dim, %maxnorm) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto p0 = 2; + auto dim0 = 1; + auto maxnorm0 = 2; + std::vector args{self0, p0, dim0, maxnorm0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto p1 = 2; + auto dim1 = 1; + auto maxnorm1 = 2; + std::vector args2{self1, p1, dim1, maxnorm1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen__convert_indices_from_coo_to_csr) { + const std::string script = R"IR( + graph(%self: Tensor, %size: int, %out_int32: bool): + %bias: None = prim::Constant() + %ret = aten::_convert_indices_from_coo_to_csr(%self, %size, %out_int32) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::randint(0, 3, {2}, at::kInt); + auto size0 = 10; + auto out_int320 = false; + std::vector args{self0, size0, out_int320}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::randint(0, 3, {12}, at::kInt); + auto size1 = 24; + auto out_int321 = false; + std::vector args2{self1, size1, out_int321}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen__convert_indices_from_csr_to_coo) { + const std::string script = R"IR( + 
graph(%crow_indices: Tensor, %col_indices: Tensor, %out_int32: bool, %transpose: bool): + %bias: None = prim::Constant() + %ret = aten::_convert_indices_from_csr_to_coo(%crow_indices, %col_indices, %out_int32, %transpose) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto crow_indices0 = torch::tensor({1}, torch::kInt32); + auto col_indices0 = torch::tensor({0, 1, 0}, torch::kInt32); + auto out_int320 = false; + auto transpose0 = false; + std::vector args{crow_indices0, col_indices0, out_int320, transpose0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto crow_indices1 = torch::tensor({0}, torch::kInt32); + auto col_indices1 = + torch::tensor({0, 1, 0, 2, 1, 2, 0, 1, 0, 2, 1, 2}, torch::kInt32); + auto out_int321 = false; + auto transpose1 = false; + std::vector args2{ + crow_indices1, col_indices1, out_int321, transpose1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_mse_loss) { + const std::string script = R"IR( + graph(%self: Tensor, %target: Tensor, %reduction: int): + %bias: None = prim::Constant() + %ret = aten::mse_loss(%self, %target, %reduction) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto target0 = at::rand({6, 6, 6}); + auto reduction0 = 1; + std::vector args{self0, target0, reduction0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto target1 = at::rand({22, 22, 22}); + auto reduction1 = 1; + std::vector args2{self1, target1, reduction1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_l1_loss) { + const std::string script = R"IR( + graph(%self: Tensor, %target: Tensor, %reduction: int): + %bias: None = prim::Constant() + %ret = aten::l1_loss(%self, %target, %reduction) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto target0 = at::rand({6, 6, 6}); + auto reduction0 = 1; + std::vector args{self0, target0, reduction0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); + + auto self1 = at::rand({22, 22, 22}); + auto target1 = at::rand({22, 22, 22}); + auto reduction1 = 1; + std::vector args2{self1, target1, reduction1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); +} + +TEST(StaticRuntime, autogen_multi_margin_loss) { + const std::string script = R"IR( + graph(%self: Tensor, %target: Tensor, %p: int, %margin: int, %weight: Tensor?, %reduction: int): + %bias: None = prim::Constant() + %ret = aten::multi_margin_loss(%self, %target, %p, %margin, %weight, %reduction) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6}); + auto target0 = at::randint(6, {6}, torch::kInt64); + auto p0 = 2; + auto margin0 = 2; + auto weight0 = at::rand({6}); + auto reduction0 = 1; + std::vector args{self0, target0, p0, margin0, weight0, reduction0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); + + auto self1 = at::rand({22, 22}); + auto target1 = at::randint(22, 
{22}, torch::kInt64); + auto p1 = 2; + auto margin1 = 2; + auto weight1 = at::rand({22}); + auto reduction1 = 1; + std::vector args2{self1, target1, p1, margin1, weight1, reduction1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); +} + +TEST(StaticRuntime, autogen_multilabel_margin_loss) { + const std::string script = R"IR( + graph(%self: Tensor, %target: Tensor, %reduction: int): + %bias: None = prim::Constant() + %ret = aten::multilabel_margin_loss(%self, %target, %reduction) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6}); + auto target0 = at::randint(6, {6, 6}, torch::kInt64); + auto reduction0 = 1; + std::vector args{self0, target0, reduction0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); + + auto self1 = at::rand({22, 22}); + auto target1 = at::randint(22, {22, 22}, torch::kInt64); + auto reduction1 = 1; + std::vector args2{self1, target1, reduction1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); +} + +TEST(StaticRuntime, autogen_nll_loss) { + const std::string script = R"IR( + graph(%self: Tensor, %target: Tensor, %weight: Tensor?, %reduction: int, %ignore_index: int): + %bias: None = prim::Constant() + %ret = aten::nll_loss(%self, %target, %weight, %reduction, %ignore_index) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6}); + auto target0 = at::randint(6, {6}, torch::kInt64); + auto weight0 = at::rand({6}); + auto reduction0 = 1; + auto ignore_index0 = 1; + std::vector args{self0, target0, weight0, reduction0, ignore_index0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); + + auto self1 = at::rand({22, 22}); + auto target1 = at::randint(22, {22}, torch::kInt64); + auto weight1 = at::rand({22}); + auto reduction1 = 1; + auto ignore_index1 = 1; + std::vector args2{self1, target1, weight1, reduction1, ignore_index1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); +} + +TEST(StaticRuntime, autogen_nll_loss_backward) { + const std::string script = R"IR( + graph(%grad_output: Tensor, %self: Tensor, %target: Tensor, %weight: Tensor?, %reduction: int, %ignore_index: int, %total_weight: Tensor): + %bias: None = prim::Constant() + %ret = aten::nll_loss_backward(%grad_output, %self, %target, %weight, %reduction, %ignore_index, %total_weight) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto grad_output0 = at::rand({}); + auto self0 = at::rand({6}); + auto target0 = at::randint(0, 5, {6}, torch::kInt64); + auto weight0 = at::rand({6}); + auto reduction0 = 1; + auto ignore_index0 = 1; + auto total_weight0 = at::rand({}); + std::vector args{ + grad_output0, + self0, + target0, + weight0, + reduction0, + ignore_index0, + total_weight0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto grad_output1 = at::rand({}); + auto self1 = at::rand({36}); + auto target1 = at::randint(0, 11, {36}, torch::kInt64); + auto weight1 = at::rand({36}); + auto reduction1 = 1; + auto ignore_index1 = 1; + auto total_weight1 = at::rand({}); + std::vector args2{ + grad_output1, + self1, + target1, + weight1, + reduction1, + 
ignore_index1, + total_weight1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_nll_loss2d) { + const std::string script = R"IR( + graph(%self: Tensor, %target: Tensor, %weight: Tensor?, %reduction: int, %ignore_index: int): + %bias: None = prim::Constant() + %ret = aten::nll_loss2d(%self, %target, %weight, %reduction, %ignore_index) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6, 6}); + auto target0 = at::randint(6, {6, 6, 6}, torch::kInt64); + auto weight0 = at::rand({6}); + auto reduction0 = 1; + auto ignore_index0 = 1; + std::vector args{self0, target0, weight0, reduction0, ignore_index0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); + + auto self1 = at::rand({22, 22, 22, 22}); + auto target1 = at::randint(22, {22, 22, 22}, torch::kInt64); + auto weight1 = at::rand({22}); + auto reduction1 = 1; + auto ignore_index1 = 1; + std::vector args2{self1, target1, weight1, reduction1, ignore_index1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); +} + +TEST(StaticRuntime, autogen_soft_margin_loss) { + const std::string script = R"IR( + graph(%self: Tensor, %target: Tensor, %reduction: int): + %bias: None = prim::Constant() + %ret = aten::soft_margin_loss(%self, %target, %reduction) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto target0 = at::rand({6, 6, 6}); + auto reduction0 = 1; + std::vector args{self0, target0, reduction0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto target1 = at::rand({22, 22, 22}); + auto reduction1 = 1; + std::vector args2{self1, target1, reduction1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_elu) { + const std::string script = R"IR( + graph(%self: Tensor, %alpha: int, %scale: int, %input_scale: int): + %bias: None = prim::Constant() + %ret = aten::elu(%self, %alpha, %scale, %input_scale) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto alpha0 = 2; + auto scale0 = 2; + auto input_scale0 = 2; + std::vector args{self0, alpha0, scale0, input_scale0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto alpha1 = 2; + auto scale1 = 2; + auto input_scale1 = 2; + std::vector args2{self1, alpha1, scale1, input_scale1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_elu_backward) { + const std::string script = R"IR( + graph(%grad_output: Tensor, %alpha: int, %scale: int, %input_scale: int, %is_result: bool, %self_or_result: Tensor): + %bias: None = prim::Constant() + %ret = aten::elu_backward(%grad_output, %alpha, %scale, %input_scale, %is_result, %self_or_result) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto grad_output0 = at::rand({6, 6, 6}); + auto alpha0 = 2; + auto scale0 = 2; + auto input_scale0 = 2; + auto is_result0 = false; + auto 
self_or_result0 = at::rand({6, 6, 6}); + std::vector args{ + grad_output0, alpha0, scale0, input_scale0, is_result0, self_or_result0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto grad_output1 = at::rand({22, 22, 22}); + auto alpha1 = 2; + auto scale1 = 2; + auto input_scale1 = 2; + auto is_result1 = false; + auto self_or_result1 = at::rand({22, 22, 22}); + std::vector args2{ + grad_output1, alpha1, scale1, input_scale1, is_result1, self_or_result1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_glu) { + const std::string script = R"IR( + graph(%self: Tensor, %dim: int): + %bias: None = prim::Constant() + %ret = aten::glu(%self, %dim) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto dim0 = 1; + std::vector args{self0, dim0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto dim1 = 1; + std::vector args2{self1, dim1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_hardsigmoid) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::hardsigmoid(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_hardsigmoid_backward) { + const std::string script = R"IR( + graph(%grad_output: Tensor, %self: Tensor): + %bias: None = prim::Constant() + %ret = aten::hardsigmoid_backward(%grad_output, %self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto grad_output0 = at::rand({6, 6, 6}); + auto self0 = at::rand({6, 6, 6}); + std::vector args{grad_output0, self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto grad_output1 = at::rand({22, 22, 22}); + auto self1 = at::rand({22, 22, 22}); + std::vector args2{grad_output1, self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_hardtanh) { + const std::string script = R"IR( + graph(%self: Tensor, %min_val: int, %max_val: int): + %bias: None = prim::Constant() + %ret = aten::hardtanh(%self, %min_val, %max_val) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto min_val0 = 2; + auto max_val0 = 2; + std::vector args{self0, min_val0, max_val0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto min_val1 = 2; + auto max_val1 = 2; + std::vector args2{self1, min_val1, max_val1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + 
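+// Illustrative sketch (not generated code): each autogen test above and below
+// follows the same two-phase pattern. It builds a one-op graph whose result is
+// cloned, presumably so the value checked is a fresh tensor rather than a
+// runtime-managed buffer, runs it once with small inputs, and then re-runs it
+// with larger inputs so the memory planner has to resize its managed storage.
+// The sketch assumes the testStaticRuntime helper declared in
+// benchmarks/static_runtime/test_utils.h and argument vectors of c10::IValue;
+// aten::relu is used here only as a stand-in op.
+//
+//   TEST(StaticRuntime, autogen_example_relu) {
+//     const std::string script = R"IR(
+//       graph(%self: Tensor):
+//           %bias: None = prim::Constant()
+//           %ret = aten::relu(%self)
+//           %cloned = aten::clone(%ret, %bias)
+//           return (%cloned)
+//     )IR";
+//
+//     // First pass: small inputs exercise the out variant of the op.
+//     std::vector<c10::IValue> args{at::rand({6, 6, 6})};
+//     testStaticRuntime(
+//         script,
+//         args,
+//         /*args2=*/{},
+//         /*use_allclose=*/false,
+//         /*use_equalnan=*/false,
+//         /*check_resize=*/true);
+//
+//     // Second pass: larger inputs force the preallocated output to resize.
+//     std::vector<c10::IValue> args2{at::rand({22, 22, 22})};
+//     testStaticRuntime(
+//         script,
+//         args,
+//         args2,
+//         /*use_allclose=*/false,
+//         /*use_equalnan=*/false,
+//         /*check_resize=*/true);
+//   }
+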
+TEST(StaticRuntime, autogen_hardswish) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::hardswish(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_leaky_relu_backward) { + const std::string script = R"IR( + graph(%grad_output: Tensor, %self: Tensor, %negative_slope: int, %self_is_result: bool): + %bias: None = prim::Constant() + %ret = aten::leaky_relu_backward(%grad_output, %self, %negative_slope, %self_is_result) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto grad_output0 = at::rand({6, 6, 6}); + auto self0 = at::rand({6, 6, 6}); + auto negative_slope0 = 2; + auto self_is_result0 = false; + std::vector args{ + grad_output0, self0, negative_slope0, self_is_result0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto grad_output1 = at::rand({22, 22, 22}); + auto self1 = at::rand({22, 22, 22}); + auto negative_slope1 = 2; + auto self_is_result1 = false; + std::vector args2{ + grad_output1, self1, negative_slope1, self_is_result1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_log_sigmoid) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::log_sigmoid(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_softplus) { + const std::string script = R"IR( + graph(%self: Tensor, %beta: int, %threshold: int): + %bias: None = prim::Constant() + %ret = aten::softplus(%self, %beta, %threshold) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto beta0 = 2; + auto threshold0 = 2; + std::vector args{self0, beta0, threshold0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto beta1 = 2; + auto threshold1 = 2; + std::vector args2{self1, beta1, threshold1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_softplus_backward) { + const std::string script = R"IR( + graph(%grad_output: Tensor, %self: Tensor, %beta: int, %threshold: int): + %bias: None = prim::Constant() + %ret = aten::softplus_backward(%grad_output, %self, %beta, %threshold) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto grad_output0 = at::rand({6, 6, 6}); + auto self0 = at::rand({6, 6, 6}); + auto beta0 = 2; + auto threshold0 = 2; + std::vector 
args{grad_output0, self0, beta0, threshold0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto grad_output1 = at::rand({22, 22, 22}); + auto self1 = at::rand({22, 22, 22}); + auto beta1 = 2; + auto threshold1 = 2; + std::vector args2{grad_output1, self1, beta1, threshold1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_softshrink) { + const std::string script = R"IR( + graph(%self: Tensor, %lambd: int): + %bias: None = prim::Constant() + %ret = aten::softshrink(%self, %lambd) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto lambd0 = 2; + std::vector args{self0, lambd0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto lambd1 = 2; + std::vector args2{self1, lambd1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_softshrink_backward) { + const std::string script = R"IR( + graph(%grad_output: Tensor, %self: Tensor, %lambd: int): + %bias: None = prim::Constant() + %ret = aten::softshrink_backward(%grad_output, %self, %lambd) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto grad_output0 = at::rand({6, 6, 6}); + auto self0 = at::rand({6, 6, 6}); + auto lambd0 = 2; + std::vector args{grad_output0, self0, lambd0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto grad_output1 = at::rand({22, 22, 22}); + auto self1 = at::rand({22, 22, 22}); + auto lambd1 = 2; + std::vector args2{grad_output1, self1, lambd1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_adaptive_max_pool2d_backward) { + const std::string script = R"IR( + graph(%grad_output: Tensor, %self: Tensor, %indices: Tensor): + %bias: None = prim::Constant() + %ret = aten::adaptive_max_pool2d_backward(%grad_output, %self, %indices) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto grad_output0 = at::randint(-3, 2, {2, 2, 2}); + auto self0 = at::randint(-3, 2, {2, 2, 2}); + auto indices0 = at::randint(0, 1, {2, 2, 2}, at::kLong); + std::vector args{grad_output0, self0, indices0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto grad_output1 = at::randint(-3, 3, {3, 3, 3}); + auto self1 = at::randint(-3, 2, {3, 3, 3}); + auto indices1 = at::randint(0, 1, {3, 3, 3}, at::kLong); + std::vector args2{grad_output1, self1, indices1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_adaptive_max_pool3d_backward) { + const std::string script = R"IR( + graph(%grad_output: Tensor, %self: Tensor, %indices: Tensor): + %bias: None = prim::Constant() + %ret = aten::adaptive_max_pool3d_backward(%grad_output, %self, %indices) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto grad_output0 = at::randint(-3, 2, {2, 2, 2, 2}); + auto self0 = at::randint(-3, 2, {2, 2, 2, 2}); + auto indices0 = at::randint(0, 1, {2, 2, 2, 2}, 
at::kLong); + std::vector args{grad_output0, self0, indices0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto grad_output1 = at::randint(-3, 3, {3, 3, 3, 3}); + auto self1 = at::randint(-3, 2, {3, 3, 3, 3}); + auto indices1 = at::randint(0, 1, {3, 3, 3, 3}, at::kLong); + std::vector args2{grad_output1, self1, indices1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_sigmoid_backward) { + const std::string script = R"IR( + graph(%grad_output: Tensor, %output: Tensor): + %bias: None = prim::Constant() + %ret = aten::sigmoid_backward(%grad_output, %output) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto grad_output0 = at::rand({6, 6, 6}); + auto output0 = at::rand({6, 6, 6}); + std::vector args{grad_output0, output0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto grad_output1 = at::rand({22, 22, 22}); + auto output1 = at::rand({22, 22, 22}); + std::vector args2{grad_output1, output1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_tanh_backward) { + const std::string script = R"IR( + graph(%grad_output: Tensor, %output: Tensor): + %bias: None = prim::Constant() + %ret = aten::tanh_backward(%grad_output, %output) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto grad_output0 = at::rand({6, 6, 6}); + auto output0 = at::rand({6, 6, 6}); + std::vector args{grad_output0, output0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto grad_output1 = at::rand({22, 22, 22}); + auto output1 = at::rand({22, 22, 22}); + std::vector args2{grad_output1, output1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_isposinf) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::isposinf(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_isneginf) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::isneginf(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_entr) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_entr(%self) + %cloned = 
aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_ndtri) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_ndtri(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_log_ndtr) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_log_ndtr(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_expm1) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_expm1(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_exp2) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_exp2(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_psi) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_psi(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + 
/*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_digamma) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_digamma(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_gammaln) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_gammaln(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_erf) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_erf(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_erfc) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_erfc(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_erfcx) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_erfcx(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_erfinv) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_erfinv(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + 
/*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_ndtr) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_ndtr(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_xlog1py) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_xlog1py(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_xlogy) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_xlogy(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_zeta) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_zeta(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({2, 2, 2}, at::kDouble) + at::ones({2, 2, 2}); + auto other0 = at::rand({2, 2, 2}, at::kDouble) + at::ones({2, 2, 2}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({5, 5, 5}, at::kDouble) + at::ones({5, 5, 5}); + auto other1 = at::rand({5, 5, 5}, at::kDouble) + at::ones({5, 5, 5}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_i0) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_i0(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + 
testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_i0e) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_i0e(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_i1) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_i1(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_i1e) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_i1e(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_polygamma) { + const std::string script = R"IR( + graph(%n: int, %self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_polygamma(%n, %self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto n0 = 1; + auto self0 = at::rand({6, 6, 6}); + std::vector args{n0, self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto n1 = 1; + auto self1 = at::rand({22, 22, 22}); + std::vector args2{n1, self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_expit) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_expit(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_sinc) { + const 
std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_sinc(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_round) { + const std::string script = R"IR( + graph(%self: Tensor, %decimals: int): + %bias: None = prim::Constant() + %ret = aten::special_round(%self, %decimals) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto decimals0 = 1; + std::vector args{self0, decimals0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto decimals1 = 1; + std::vector args2{self1, decimals1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_log1p) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_log1p(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_gammainc) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_gammainc(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_gammaincc) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_gammaincc(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_multigammaln) { + const std::string script = R"IR( + graph(%self: Tensor, %p: int): + %bias: None = 
prim::Constant() + %ret = aten::special_multigammaln(%self, %p) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto p0 = 1; + std::vector args{self0, p0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto p1 = 1; + std::vector args2{self1, p1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_fft_fft) { + const std::string script = R"IR( + graph(%self: Tensor, %n: int?, %dim: int, %norm: str?): + %bias: None = prim::Constant() + %ret = aten::fft_fft(%self, %n, %dim, %norm) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto n0 = 1; + auto dim0 = 1; + auto norm0 = "forward"; + std::vector args{self0, n0, dim0, norm0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto n1 = 1; + auto dim1 = 1; + auto norm1 = "forward"; + std::vector args2{self1, n1, dim1, norm1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_fft_ifft) { + const std::string script = R"IR( + graph(%self: Tensor, %n: int?, %dim: int, %norm: str?): + %bias: None = prim::Constant() + %ret = aten::fft_ifft(%self, %n, %dim, %norm) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto n0 = 1; + auto dim0 = 1; + auto norm0 = "forward"; + std::vector args{self0, n0, dim0, norm0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto n1 = 1; + auto dim1 = 1; + auto norm1 = "forward"; + std::vector args2{self1, n1, dim1, norm1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_fft_rfft) { + const std::string script = R"IR( + graph(%self: Tensor, %n: int?, %dim: int, %norm: str?): + %bias: None = prim::Constant() + %ret = aten::fft_rfft(%self, %n, %dim, %norm) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto n0 = 1; + auto dim0 = 1; + auto norm0 = "forward"; + std::vector args{self0, n0, dim0, norm0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto n1 = 1; + auto dim1 = 1; + auto norm1 = "forward"; + std::vector args2{self1, n1, dim1, norm1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_fft_irfft) { + const std::string script = R"IR( + graph(%self: Tensor, %n: int?, %dim: int, %norm: str?): + %bias: None = prim::Constant() + %ret = aten::fft_irfft(%self, %n, %dim, %norm) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto n0 = 1; + auto dim0 = 1; + auto norm0 = "forward"; + std::vector args{self0, n0, dim0, norm0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + 
auto self1 = at::rand({22, 22, 22}); + auto n1 = 1; + auto dim1 = 1; + auto norm1 = "forward"; + std::vector args2{self1, n1, dim1, norm1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_fft_hfft) { + const std::string script = R"IR( + graph(%self: Tensor, %n: int?, %dim: int, %norm: str?): + %bias: None = prim::Constant() + %ret = aten::fft_hfft(%self, %n, %dim, %norm) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto n0 = 1; + auto dim0 = 1; + auto norm0 = "forward"; + std::vector args{self0, n0, dim0, norm0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto n1 = 1; + auto dim1 = 1; + auto norm1 = "forward"; + std::vector args2{self1, n1, dim1, norm1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_fft_ihfft) { + const std::string script = R"IR( + graph(%self: Tensor, %n: int?, %dim: int, %norm: str?): + %bias: None = prim::Constant() + %ret = aten::fft_ihfft(%self, %n, %dim, %norm) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto n0 = 1; + auto dim0 = 1; + auto norm0 = "forward"; + std::vector args{self0, n0, dim0, norm0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto n1 = 1; + auto dim1 = 1; + auto norm1 = "forward"; + std::vector args2{self1, n1, dim1, norm1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_linalg_cross) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor, %dim: int): + %bias: None = prim::Constant() + %ret = aten::linalg_cross(%self, %other, %dim) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 3, 6}); + auto other0 = at::rand({6, 3, 6}); + auto dim0 = 1; + std::vector args{self0, other0, dim0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 3, 22}); + auto other1 = at::rand({22, 3, 22}); + auto dim1 = 1; + std::vector args2{self1, other1, dim1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_linalg_det) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::linalg_det(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_linalg_matmul) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::linalg_matmul(%self, %other) + %cloned = 
aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_linalg_eigvals) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::linalg_eigvals(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_linalg_inv) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::linalg_inv(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_inner) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::inner(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_outer) { + const std::string script = R"IR( + graph(%self: Tensor, %vec2: Tensor): + %bias: None = prim::Constant() + %ret = aten::outer(%self, %vec2) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({16}); + auto vec20 = at::rand({16}); + std::vector args{self0, vec20}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({64}); + auto vec21 = at::rand({64}); + std::vector args2{self1, vec21}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_linalg_svdvals) { + const std::string script = R"IR( + graph(%A: Tensor): + %bias: None = prim::Constant() + %ret = aten::linalg_svdvals(%A) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto A0 = at::rand({6, 6, 6}); + std::vector args{A0}; + testStaticRuntime( + script, + args, + {}, + 
/*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto A1 = at::rand({22, 22, 22}); + std::vector args2{A1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_linalg_cond) { + const std::string script = R"IR( + graph(%self: Tensor, %p: int?): + %bias: None = prim::Constant() + %ret = aten::linalg_cond(%self, %p) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto p0 = 2; + std::vector args{self0, p0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto p1 = 2; + std::vector args2{self1, p1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_linalg_solve) { + const std::string script = R"IR( + graph(%input: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::linalg_solve(%input, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto input0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{input0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto input1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{input1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_linalg_tensorinv) { + const std::string script = R"IR( + graph(%self: Tensor, %ind: int): + %bias: None = prim::Constant() + %ret = aten::linalg_tensorinv(%self, %ind) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6, 6}); + auto ind0 = 2; + std::vector args{self0, ind0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22, 22}); + auto ind1 = 2; + std::vector args2{self1, ind1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_linalg_matrix_power) { + const std::string script = R"IR( + graph(%self: Tensor, %n: int): + %bias: None = prim::Constant() + %ret = aten::linalg_matrix_power(%self, %n) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto n0 = 1; + std::vector args{self0, n0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto n1 = 1; + std::vector args2{self1, n1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} diff --git a/benchmarks/static_runtime/test_static_module.cc b/benchmarks/static_runtime/test_static_module.cc index 353ce93bb651..41758ec9f2f3 100644 --- a/benchmarks/static_runtime/test_static_module.cc +++ b/benchmarks/static_runtime/test_static_module.cc @@ -243,6 +243,14 @@ TEST(StaticRuntime, ReplaceWithCopy_replaces_reshape) { c = inp.reshape(shape) return (a, b, c) )JIT"); + ExpectToReplaceWithCopy(R"JIT( + def forward(self, cond: bool, x): + if cond: + y = 
x.reshape(x.shape) + else: + y = x.clone() + return y.clone() + )JIT"); } TEST( @@ -289,7 +297,6 @@ TEST( return (d) )JIT"); ExpectNotToReplaceWithCopy(reshape_inplace_script); - ExpectNotToReplaceWithCopy(reshape_inplace_script_1); } TEST(StaticRuntime, CanEnableStaticRuntime) { @@ -432,7 +439,8 @@ TEST(StaticRuntime, LongModel) { torch::jit::StaticModule smod(mod); at::Tensor output_2 = smod(input_tensors, {}).toTensor(); smod.runtime().check_for_memory_leak(); - EXPECT_TRUE(torch::allclose(output_1, output_2, 1e-6)); + EXPECT_TRUE( + torch::allclose(output_1, output_2, /*rtol=*/1e-5, /*atol=*/1e-7)); } TEST(StaticRuntime, TrivialModel) { @@ -450,7 +458,8 @@ TEST(StaticRuntime, TrivialModel) { torch::jit::StaticModule smod(mod); at::Tensor output_2 = smod(input_tensors, {}).toTensor(); smod.runtime().check_for_memory_leak(); - EXPECT_TRUE(torch::allclose(output_1, output_2, 1e-6)); + EXPECT_TRUE( + torch::allclose(output_1, output_2, /*rtol=*/1e-5, /*atol=*/1e-7)); } TEST(StaticRuntime, DeepWide) { @@ -475,7 +484,8 @@ TEST(StaticRuntime, DeepWide) { ASSERT_TRUE(outputs.size() > 0); at::Tensor output_2 = outputs[0].toTensor(); smod.runtime().check_for_memory_leak(); - EXPECT_TRUE(torch::allclose(output_1, output_2, 1e-6)); + EXPECT_TRUE( + torch::allclose(output_1, output_2, /*rtol=*/1e-5, /*atol=*/1e-7)); } } } @@ -502,7 +512,8 @@ TEST(StaticRuntime, KWargsAPI_1) { smod.runtime().check_for_memory_leak(); at::Tensor output_2 = getTensor(output_ivalue); - EXPECT_TRUE(torch::allclose(output_1, output_2, 1e-6)); + EXPECT_TRUE( + torch::allclose(output_1, output_2, /*rtol=*/1e-5, /*atol=*/1e-7)); // check for output aliasing EXPECT_EQ(output_ivalue.use_count(), 1); @@ -546,7 +557,8 @@ TEST(StaticRuntime, KWargsAPI_2) { smod.runtime().check_for_memory_leak(); at::Tensor output_2 = getTensor(output_ivalue); - EXPECT_TRUE(torch::allclose(output_1, output_2, 1e-6)); + EXPECT_TRUE( + torch::allclose(output_1, output_2, /*rtol=*/1e-5, /*atol=*/1e-7)); // check for output aliasing EXPECT_EQ(output_ivalue.use_count(), 1); @@ -562,6 +574,24 @@ TEST(StaticRuntime, KWargsAPI_2) { } } +TEST(StaticRuntime, KWargsAPI_Optional) { + const auto src = R"JIT( + def forward(self, x, y, z: Optional[Tensor] = None): + return x + y + )JIT"; + + torch::jit::Module mod("mod"); + mod.define(src); + torch::jit::StaticModule smod(mod); + const auto kwargs = std::unordered_map{ + {"x", at::randn({1})}, {"y", at::randn({1})}}; + + auto expected = mod.forward({}, kwargs).toTensor(); + auto actual = smod({}, kwargs).toTensor(); + + EXPECT_TRUE(expected.equal(actual)); +} + TEST(StaticRuntime, CleanUpMemory) { const int embedding_size = 32; const int num_features = 50; @@ -605,7 +635,8 @@ TEST(StaticRuntime, CleanUpMemory) { ASSERT_TRUE(outputs.size() > 0); auto output_2 = outputs[0].toTensor(); runtime.check_for_memory_leak(); - EXPECT_TRUE(torch::allclose(output_1, output_2, 1e-6)); + EXPECT_TRUE(torch::allclose( + output_1, output_2, /*rtol=*/1e-5, /*atol=*/1e-7)); if (manage_output_tensors) { runtime.deallocateOutputTensors(); runtime.checkOutputTensorMemoryLeaks(); @@ -850,7 +881,8 @@ TEST(StaticRuntime, FusionPass) { } EXPECT_TRUE(hit); auto output_2 = getTensor(module.forward(inputs)); - EXPECT_TRUE(torch::allclose(output_1, output_2, 1e-6)); + EXPECT_TRUE( + torch::allclose(output_1, output_2, /*rtol=*/1e-5, /*atol=*/1e-7)); } } } @@ -882,8 +914,9 @@ TEST( sigmoid_node, /*enable_out_variant=*/true, /*check_memory_overlap=*/false); - ProcessedNode pnode(sigmoid_node, &fn, createProcessedNodeInputs({0}), 1); - 
pnode.set_values(values.data()); + StaticNodeInfo static_node_info( + sigmoid_node, &fn, createProcessedNodeInputs({0}), 1); + ProcessedNode pnode(static_node_info, values.data()); EXPECT_TRUE(pnode.verify_no_memory_overlap(/* force_check*/ true)); pnode.Output(0) = values[0]; @@ -901,8 +934,9 @@ TEST(ProcessedNode, VerifyNoMemoryOverlapWithImmutableInputsWithInplaceOps) { sigmoid_node, /*enable_out_variant=*/true, /*check_memory_overlap=*/false); - ProcessedNode pnode(sigmoid_node, &fn, createProcessedNodeInputs({0}), 1); - pnode.set_values(values.data()); + StaticNodeInfo static_node_info( + sigmoid_node, &fn, createProcessedNodeInputs({0}), 1); + ProcessedNode pnode(static_node_info, values.data()); ASSERT_EQ(&pnode.Output(0), &values[1]); EXPECT_TRUE(pnode.verify_no_memory_overlap()); @@ -928,9 +962,10 @@ TEST(ProcessedNode, VerifyNoMemoryOverlapWithOverlappingOutputs) { list_unpack_node, /*enable_out_variant=*/true, /*check_memory_overlap */ false); - ProcessedNode list_unpack_pnode( + StaticNodeInfo list_unpack_static_node_info( list_unpack_node, &fn, createProcessedNodeInputs({0}), 1); - list_unpack_pnode.set_values(values.data()); + ProcessedNode list_unpack_pnode( + list_unpack_static_node_info, values.data()); ASSERT_EQ(list_unpack_pnode.outputs().size(), 2); EXPECT_TRUE( list_unpack_pnode.verify_no_memory_overlap(/* force_check*/ true)); @@ -942,9 +977,10 @@ TEST(ProcessedNode, VerifyNoMemoryOverlapWithOverlappingOutputs) { list_unpack_node, /*enable_out_variant=*/true, /*check_memory_overlap */ false); - ProcessedNode list_unpack_pnode( + StaticNodeInfo list_unpack_static_node_info( list_unpack_node, &fn, createProcessedNodeInputs({0}), 1); - list_unpack_pnode.set_values(values.data()); + ProcessedNode list_unpack_pnode( + list_unpack_static_node_info, values.data()); auto b = at::randn({2, 3}); list_unpack_pnode.Output(0) = b; list_unpack_pnode.Output(1) = b; @@ -1500,3 +1536,231 @@ TEST(ForceNonEmptyOutputs, TwoSubBlocks) { } } } + +TEST(EliminateExtraPermuteOps, FusesSumCorrectly) { + const auto src = R"JIT( + def forward(self, x): + y = torch.permute(x, (0, 2, 1)) + z = torch.sum(y, dim=-1) + return z + )JIT"; + torch::jit::Module mod("m"); + mod.define(src); + + auto graph = mod.get_method("forward").graph(); + // turn the ListConstruct(%constant) into proper constant lists + ConstantPropagation(graph); + EliminateExtraPermuteOps(graph); + + EXPECT_FALSE(hasNodeWithKind(graph, "aten::permute")); + auto* sum = getNodeWithKind(graph, "aten::sum"); + ASSERT_NE(sum, nullptr); + auto dim = toIValue(sum->input(1)); + ASSERT_TRUE(dim.has_value() && dim->isIntList()); + EXPECT_EQ(dim->toIntList(), c10::List{1}); +} + +TEST(EliminateExtraPermuteOps, DoesNotFuseSumWrongDim) { + const auto src = R"JIT( + def forward(self, x): + y = torch.permute(x, (0, 2, 1)) + z = torch.sum(y, dim=1) + return z + )JIT"; + torch::jit::Module mod("m"); + mod.define(src); + + auto graph = mod.get_method("forward").graph(); + // turn the ListConstruct(%constant) into proper constant lists + ConstantPropagation(graph); + EliminateExtraPermuteOps(graph); + + EXPECT_TRUE(hasNodeWithKind(graph, "aten::permute")); +} + +TEST(EliminateExtraPermuteOps, DoesNotFuseSumNonConstantDim) { + const auto src = R"JIT( + def forward(self, x, dim: int): + y = torch.permute(x, (0, 2, 1)) + z = torch.sum(y, dim=dim) + return z + )JIT"; + torch::jit::Module mod("m"); + mod.define(src); + + auto graph = mod.get_method("forward").graph(); + // turn the ListConstruct(%constant) into proper constant lists + 
ConstantPropagation(graph); + EliminateExtraPermuteOps(graph); + + EXPECT_TRUE(hasNodeWithKind(graph, "aten::permute")); +} + +TEST(EliminateExtraPermuteOps, FusesSoftmaxCorrectly) { + const auto src = R"JIT( + def forward(self, x): + a = torch.permute(x, [0, 2, 1]) + b = torch.softmax(a, 2) + c = torch.permute(b, [0, 2, 1]) + return c.clone() + )JIT"; + torch::jit::Module mod("m"); + mod.define(src); + auto graph = mod.get_method("forward").graph(); + ConstantPropagation(graph); + EliminateExtraPermuteOps(graph); + graph->dump(); + + EXPECT_FALSE(hasNodeWithKind(graph, "aten::permute")); + auto* softmax = getNodeWithKind(graph, "aten::softmax"); + ASSERT_NE(softmax, nullptr); + auto dim = toIValue(softmax->input(1)); + ASSERT_TRUE(dim.has_value() && dim->isInt()); + EXPECT_EQ(dim->toInt(), 1); + + std::vector args{at::randn({3, 4, 5})}; + testStaticRuntime(src, args, /*args2=*/{}, /*use_allclose=*/true); +} + +TEST(EliminateExtraPermuteOps, DoesNotFuseSoftmaxWrongPermuteDim) { + const auto src = R"JIT( + def forward(self, x): + a = torch.permute(x, [0, 1, 2]) + b = torch.softmax(a, 2) + c = torch.permute(b, [0, 1, 2]) + return c.clone() + )JIT"; + torch::jit::Module mod("m"); + mod.define(src); + auto graph = mod.get_method("forward").graph(); + ConstantPropagation(graph); + EliminateExtraPermuteOps(graph); + EXPECT_TRUE(hasNodeWithKind(graph, "aten::permute")); +} + +TEST(EliminateExtraPermuteOps, DoesNotFuseSoftmaxWrongSoftmaxDim) { + const auto src = R"JIT( + def forward(self, x): + a = torch.permute(x, [0, 2, 1]) + b = torch.softmax(a, 0) + c = torch.permute(b, [0, 2, 1]) + return c.clone() + )JIT"; + torch::jit::Module mod("m"); + mod.define(src); + auto graph = mod.get_method("forward").graph(); + ConstantPropagation(graph); + EliminateExtraPermuteOps(graph); + EXPECT_TRUE(hasNodeWithKind(graph, "aten::permute")); +} + +TEST(UseSplitAndSqueeze, Fusion) { + const auto src = R"IR( + graph(%x: Tensor): + %dim: int = prim::Constant[value=1]() + %split_size: int = prim::Constant[value=1]() + %split: Tensor[] = aten::split(%x, %split_size, %dim) + %a: Tensor, %b: Tensor = prim::ListUnpack(%split) + %c: Tensor = aten::squeeze(%a, %dim) + %d: Tensor = aten::squeeze(%b, %dim) + return (%c, %d) + )IR"; + auto graph = getGraphFromIR(src); + UseSplitAndSqueeze(graph); + EXPECT_TRUE( + hasNodeWithKind(graph, "static_runtime::fused_split_and_squeeze_copy")); + EXPECT_FALSE(hasNodeWithKind(graph, "aten::split")); + EXPECT_FALSE(hasNodeWithKind(graph, "aten::squeeze")); + EXPECT_FALSE(hasNodeWithKind(graph, "prim::ListUnpack")); +} + +TEST(EliminateNoOpSlice, IntegerStart) { + const auto src = R"JIT( + def forward(self, x: List[int]) -> List[int]: + return x[0:] + )JIT"; + torch::jit::Module mod("m"); + mod.define(src); + auto graph = mod.get_method("forward").graph(); + EXPECT_TRUE(hasNodeWithKind(graph, "aten::slice")); + EliminateNoOpSlice(graph); + EXPECT_FALSE(hasNodeWithKind(graph, "aten::slice")); +} + +TEST(EliminateNoOpSlice, NoneStart) { + const auto src = R"JIT( + def forward(self, x: List[int]) -> List[int]: + return x[:] + )JIT"; + torch::jit::Module mod("m"); + mod.define(src); + auto graph = mod.get_method("forward").graph(); + EliminateNoOpSlice(graph); + EXPECT_FALSE(hasNodeWithKind(graph, "aten::slice")); +} + +#ifdef FBCODE_CAFFE2 +// FuseClampNaNToNum pass is disabled externally to avoid MSVC errors in CI +TEST(FuseClampNaNToNum, FusionHappens) { + const auto src = R"JIT( + def forward(self, x): + y = torch.clamp(x, min=0.0, max=1.0) + z = y.nan_to_num() + return z.clone() + 
)JIT"; + torch::jit::Module mod("m"); + mod.define(src); + auto graph = mod.get_method("forward").graph(); + FuseClampNaNToNum(graph); + EXPECT_FALSE(hasNodeWithKind(graph, "aten::clamp")); + EXPECT_FALSE(hasNodeWithKind(graph, "aten::nan_to_num")); + EXPECT_TRUE(hasNodeWithKind(graph, "static_runtime::clamp_nan_to_num")); + // Correctness of the op is exercised in StaticRuntime.clamp_nan_to_num +} + +TEST(FuseClampNaNToNum, NoFusion) { + const auto src1 = R"JIT( + def forward(self, x, a: float, b: float): + y = torch.clamp(x, a, b) + z = y.nan_to_num() + return z.clone() + )JIT"; + + const auto src2 = R"JIT( + def forward(self, x): + y = torch.clamp(x, min=0.0) + z = y.nan_to_num() + return z.clone() + )JIT"; + + const auto src3 = R"JIT( + def forward(self, x): + y = torch.clamp(x, max=0.0) + z = y.nan_to_num() + return z.clone() + )JIT"; + + const auto src4 = R"JIT( + def forward(self, x): + y = torch.clamp(x) + z = y.nan_to_num() + return z.clone() + )JIT"; + + + auto checkScript = [](const char* src) { + torch::jit::Module mod("m"); + mod.define(src); + auto graph = mod.get_method("forward").graph(); + FuseClampNaNToNum(graph); + EXPECT_TRUE(hasNodeWithKind(graph, "aten::clamp")); + EXPECT_TRUE(hasNodeWithKind(graph, "aten::nan_to_num")); + EXPECT_FALSE(hasNodeWithKind(graph, "static_runtime::clamp_nan_to_num")); + }; + + checkScript(src1); + checkScript(src2); + checkScript(src3); + checkScript(src4); +} +#endif diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc index bc923e707e1d..f6d4b0efd58e 100644 --- a/benchmarks/static_runtime/test_static_runtime.cc +++ b/benchmarks/static_runtime/test_static_runtime.cc @@ -5,6 +5,8 @@ #include #include #include +#include +#include #include #include "deep_wide_pt.h" @@ -172,6 +174,146 @@ TEST(StaticRuntime, Clamp) { testStaticRuntime(clamp_script_2, {a, min_t, max_t}, {b, max_t1, min_t1}); } +TEST(StaticRuntime, ClampMinOnly) { + const auto src = R"JIT( + def forward(self, inp: Tensor, min: float): + a = torch.clamp(inp, min, None).clone() + return (a) + )JIT"; + auto a = at::randn({2, 3}); + auto b = at::randn({4, 3, 2}); + testStaticRuntime(src, {a, 0.5}); + testStaticRuntime(src, {a, 0.5}, {b, 0.25}); +} + +TEST(StaticRuntime, ClampMaxOnly) { + const auto src = R"JIT( + def forward(self, inp: Tensor, max: float): + a = torch.clamp(inp, None, max).clone() + return (a) + )JIT"; + auto a = at::randn({2, 3}); + auto b = at::randn({4, 3, 2}); + testStaticRuntime(src, {a, 0.5}); + testStaticRuntime(src, {a, 0.5}, {b, 0.25}); +} + +TEST(StaticRuntime, ClampIntTensor) { + const auto src = R"JIT( + def forward(self, inp: Tensor, min: float, max: float): + a = torch.clamp(inp, min, max).clone() + return (a) + )JIT"; + auto a = at::randint(0, 20, {2, 3}); + auto b = at::randint(0, 20, {4, 3, 2}); + auto min = 5.0f; + auto max = 5.0f; + testStaticRuntime(src, {a, min, max}); + testStaticRuntime(src, {a, min, max}, {b, min, max}); +} + +TEST(StaticRuntime, LenWithTuple) { + const auto src = R"IR( + graph(%input : int[]): + %res : int = aten::len(%input) + return (%res) + )IR"; + + testStaticRuntime(src, {c10::List(4)}); +} + +TEST(StaticRuntime, LenWithTensor) { + const auto src = R"IR( + graph(%input : Tensor): + %res : int = aten::len(%input) + return (%res) + )IR"; + + testStaticRuntime(src, {at::randn({2, 2, 2})}); +} + +TEST(StaticRuntime, LenWithStr) { + const auto src = R"IR( + graph(%input : str): + %res : int = aten::len(%input) + return (%res) + )IR"; + + testStaticRuntime(src, 
{"static_runtime"}); +} + +TEST(StaticRuntime, LenWithDict_str) { + const auto script = R"JIT( + def forward(self, input: Dict[str, str]): + return len(input) + )JIT"; + + c10::Dict dict; + dict.insert("abc", "123"); + dict.insert("def", "456"); + testStaticRuntime(script, {dict}); +} + +TEST(StaticRuntime, LenWithDict_int) { + const auto script = R"JIT( + def forward(self, input: Dict[int, int]): + return len(input) + )JIT"; + + c10::Dict dict; + dict.insert(0, 1); + dict.insert(2, 3); + testStaticRuntime(script, {dict}); +} + +TEST(StaticRuntime, LenWithDict_bool) { + const auto script = R"JIT( + def forward(self, input: Dict[bool, bool]): + return len(input) + )JIT"; + + c10::Dict dict; + dict.insert(true, false); + dict.insert(false, true); + testStaticRuntime(script, {dict}); +} + +TEST(StaticRuntime, LenWithDict_float) { + const auto script = R"JIT( + def forward(self, input: Dict[float, float]): + return len(input) + )JIT"; + + c10::Dict dict; + dict.insert(0.1, 0.9); + dict.insert(0.8, 0.18); + testStaticRuntime(script, {dict}); +} + +TEST(StaticRuntime, LenWithDict_complex) { + const auto script = R"JIT( + def forward(self, input: Dict[complex, complex]): + return len(input) + )JIT"; + + c10::Dict, c10::complex> dict; + dict.insert(0.1, 0.4); + dict.insert(0.9, 0.45); + testStaticRuntime(script, {dict}); +} + +TEST(StaticRuntime, LenWithDict_Tensor) { + const auto script = R"JIT( + def forward(self, input: Dict[Tensor, Tensor]): + return len(input) + )JIT"; + + c10::Dict dict; + dict.insert(at::randn({1, 2}), at::randn({1, 2})); + dict.insert(at::randn({1, 2}), at::randn({1, 2})); + testStaticRuntime(script, {dict}); +} + TEST(StaticRuntime, Logit) { // no nnc const auto logit_script_1 = R"JIT( @@ -293,6 +435,99 @@ TEST(StaticRuntime, EmbeddingBagWithManagedOutput) { testStaticRuntime(embedding_bag_managed_output, args, args2); } +TEST(StaticRuntime, EmbeddingBagWithExtraneousOutput) { + const std::string embedding_bag_default_ir = R"IR( + graph(%weight, %indices, %offsets): + %scale_grad_by_freq : bool = prim::Constant[value=0]() + %mode : int = prim::Constant[value=0]() + %sparse : bool = prim::Constant[value=0]() + %per_sample_weights : NoneType = prim::Constant() + %include_last_offset : bool = prim::Constant[value=0]() + %y0 : Tensor, %y1 : Tensor, %y2 : Tensor, %y3 : Tensor = aten::embedding_bag(%weight, %indices, %offsets, %scale_grad_by_freq, %mode, %sparse, %per_sample_weights, %include_last_offset) + %none : NoneType = prim::Constant() + %res : Tensor = aten::clone(%y0, %none) + return (%res) + )IR"; + auto graph = getGraphFromIR(embedding_bag_default_ir); + RemoveUnnecessaryOutputs(graph); + torch::jit::testing::FileCheck() + .check("static_runtime::embedding_bag") + ->run(*graph); + + const std::string embedding_bag_mean_ir = R"IR( + graph(%weight, %indices, %offsets): + %scale_grad_by_freq : bool = prim::Constant[value=0]() + %mode : int = prim::Constant[value=1]() + %sparse : bool = prim::Constant[value=0]() + %per_sample_weights : NoneType = prim::Constant() + %include_last_offset : bool = prim::Constant[value=0]() + %y0 : Tensor, %y1 : Tensor, %y2 : Tensor, %y3 : Tensor = aten::embedding_bag(%weight, %indices, %offsets, %scale_grad_by_freq, %mode, %sparse, %per_sample_weights, %include_last_offset) + %none : NoneType = prim::Constant() + %res : Tensor = aten::clone(%y0, %none) + return (%res) + )IR"; + graph = getGraphFromIR(embedding_bag_mean_ir); + RemoveUnnecessaryOutputs(graph); + torch::jit::testing::FileCheck() + .check("static_runtime::embedding_bag") + 
->run(*graph); + + const std::string embedding_bag_max_last_offset_ir = R"IR( + graph(%weight, %indices, %offsets): + %scale_grad_by_freq : bool = prim::Constant[value=0]() + %mode : int = prim::Constant[value=2]() + %sparse : bool = prim::Constant[value=0]() + %per_sample_weights : NoneType = prim::Constant() + %include_last_offset : bool = prim::Constant[value=1]() + %y0 : Tensor, %y1 : Tensor, %y2 : Tensor, %y3 : Tensor = aten::embedding_bag(%weight, %indices, %offsets, %scale_grad_by_freq, %mode, %sparse, %per_sample_weights, %include_last_offset) + %none : NoneType = prim::Constant() + %res : Tensor = aten::clone(%y0, %none) + return (%res) + )IR"; + graph = getGraphFromIR(embedding_bag_max_last_offset_ir); + RemoveUnnecessaryOutputs(graph); + torch::jit::testing::FileCheck() + .check("static_runtime::embedding_bag") + ->run(*graph); + + const std::string embedding_bag_normal_ir = R"IR( + graph(%weight, %indices, %offsets): + %scale_grad_by_freq : bool = prim::Constant[value=0]() + %mode : int = prim::Constant[value=0]() + %sparse : bool = prim::Constant[value=0]() + %per_sample_weights : NoneType = prim::Constant() + %include_last_offset : bool = prim::Constant[value=0]() + %y0 : Tensor, %y1 : Tensor, %y2 : Tensor, %y3 : Tensor = aten::embedding_bag(%weight, %indices, %offsets, %scale_grad_by_freq, %mode, %sparse, %per_sample_weights, %include_last_offset) + %none : NoneType = prim::Constant() + %res0 : Tensor = aten::clone(%y0, %none) + %res1 : Tensor = aten::clone(%y1, %none) + %res2 : Tensor = aten::clone(%y2, %none) + %res3 : Tensor = aten::clone(%y3, %none) + return (%res0, %res1, %res2, %res3) + )IR"; + graph = getGraphFromIR(embedding_bag_normal_ir); + RemoveUnnecessaryOutputs(graph); + torch::jit::testing::FileCheck() + .check_not("static_runtime::embedding_bag") + ->run(*graph); + + at::Tensor weight = torch::randn({3, 11}, at::ScalarType::Float); + at::Tensor input = torch::tensor({0, 1, 0, 2}); + at::Tensor offset = torch::tensor({0, 2, 4}); + std::vector args{weight, input, offset}; + testStaticRuntime(embedding_bag_default_ir, args); + testStaticRuntime(embedding_bag_mean_ir, args); + testStaticRuntime(embedding_bag_max_last_offset_ir, args); + + at::Tensor weight2 = torch::randn({10, 11}, at::ScalarType::Float); + at::Tensor input2 = torch::tensor({0, 1, 0, 2, 1}); + at::Tensor offset2 = torch::tensor({0, 1, 2, 3, 4, 5}); + std::vector args2{weight2, input2, offset2}; + testStaticRuntime(embedding_bag_default_ir, args, args2); + testStaticRuntime(embedding_bag_mean_ir, args, args2); + testStaticRuntime(embedding_bag_max_last_offset_ir, args, args2); +} + TEST(StaticRuntime, LayerNorm) { const std::string layer_norm_with_weights = R"JIT( def forward(self, input: Tensor, normalized_shape: List[int], weight: Tensor, bias: Tensor): @@ -304,13 +539,6 @@ TEST(StaticRuntime, LayerNorm) { return torch.layer_norm(input, normalized_shape, None, None, 1e-05, False).clone() )JIT"; -#ifdef FBCODE_CAFFE2 - script::Module module("module"); - module.define(layer_norm_with_weights); - torch::jit::StaticModule smodule(module); - ASSERT_EQ(getNodeWithKind(smodule, "aten::layer_norm"), nullptr); - ASSERT_NE(getNodeWithKind(smodule, "static_runtime::layer_norm"), nullptr); -#endif const auto a = torch::rand({1, 2, 2, 2}); const auto b = torch::rand({3, 2, 2, 2}); for (int normalized_size : {2, 3}) { @@ -1170,13 +1398,23 @@ TEST(StaticRuntime, Full) { return (a.clone()) )JIT"; - auto dtype = at::ScalarType::Int; auto cpu = at::Device(DeviceType::CPU); c10::List size0{2, 5}; - std::vector 
args{size0, 4, dtype, at::kStrided, cpu, false}; + std::vector args{ + size0, 4, at::ScalarType::Int, at::kStrided, cpu, false}; + std::vector args1{ + size0, 4, at::ScalarType::Float, at::kStrided, cpu, false}; c10::List size1{5, 6}; - std::vector args2{size1, 5, dtype, at::kStrided, cpu, false}; + std::vector args2{ + size1, 5, at::ScalarType::Float, at::kStrided, cpu, false}; testStaticRuntime(full_script, args); + testStaticRuntime( + full_script, + args, + args1, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); testStaticRuntime(full_script, args, args2); } @@ -1202,16 +1440,157 @@ TEST(StaticRuntime, FullLike) { auto a = at::randn({2, 3}); auto b = at::randn({3, 4, 2}); - auto dtype = at::ScalarType::Int; auto cpu = at::Device(DeviceType::CPU); std::vector args{ - a, 4, dtype, at::kStrided, cpu, false, c10::MemoryFormat::Contiguous}; + a, + 4, + at::ScalarType::Int, + at::kStrided, + cpu, + false, + c10::MemoryFormat::Contiguous}; + std::vector args1{ + a, + 4, + at::ScalarType::Float, + at::kStrided, + cpu, + false, + c10::MemoryFormat::Contiguous}; std::vector args2{ - b, 4, dtype, at::kStrided, cpu, false, c10::MemoryFormat::Contiguous}; + b, + 4, + at::ScalarType::Float, + at::kStrided, + cpu, + false, + c10::MemoryFormat::Contiguous}; testStaticRuntime(full_like_script, args); + testStaticRuntime( + full_like_script, + args, + args1, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); testStaticRuntime(full_like_script, args, args2); } +TEST(StaticRuntime, Ones) { + const auto script = R"JIT( + def forward(self, + size: List[int], + dtype: Optional[int], + layout: Optional[int], + device: Optional[Device], + pin_memory: Optional[bool]): + a = torch.ones(size, + dtype=dtype, + layout=layout, + device=device, + pin_memory=pin_memory) + return (a.clone()) + )JIT"; + + auto dtype = at::ScalarType::Int; + auto cpu = at::Device(DeviceType::CPU); + c10::List size0{2, 5}; + std::vector args{size0, dtype, at::kStrided, cpu, false}; + c10::List size1{5, 6}; + std::vector args2{size1, dtype, at::kStrided, cpu, false}; + testStaticRuntime(script, args); + testStaticRuntime(script, args, args2); +} + +TEST(StaticRuntime, OnesLike) { + const auto script = R"JIT( + def forward(self, + input: Tensor, + dtype: Optional[int], + layout: Optional[int], + device: Optional[Device], + pin_memory: Optional[bool], + memory_format: Optional[int]): + a = torch.ones_like(input, + dtype=dtype, + layout=layout, + device=device, + pin_memory=pin_memory, + memory_format=memory_format) + return (a.clone()) + )JIT"; + + auto cpu = at::Device(DeviceType::CPU); + auto input0 = at::randn({2, 5}); + std::vector args{ + input0, + at::ScalarType::Int, + at::kStrided, + cpu, + false, + c10::MemoryFormat::Contiguous}; + std::vector args1{ + input0, + at::ScalarType::Float, + at::kStrided, + cpu, + false, + c10::MemoryFormat::Contiguous}; + auto input1 = at::randn({5, 6}); + std::vector args2{ + input1, + at::ScalarType::Float, + at::kStrided, + cpu, + false, + c10::MemoryFormat::Contiguous}; + testStaticRuntime(script, args); + testStaticRuntime( + script, + args, + args1, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); + testStaticRuntime(script, args, args2); +} + +TEST(StaticRuntime, Zeros) { + const auto script = R"JIT( + def forward(self, + size: List[int], + dtype: Optional[int], + layout: Optional[int], + device: Optional[Device], + pin_memory: Optional[bool]): + a = torch.zeros(size, + dtype=dtype, + layout=layout, + 
device=device, + pin_memory=pin_memory) + return (a.clone()) + )JIT"; + + auto cpu = at::Device(DeviceType::CPU); + c10::List size0{2, 5}; + std::vector args{ + size0, at::ScalarType::Int, at::kStrided, cpu, false}; + std::vector args1{ + size0, at::ScalarType::Float, at::kStrided, cpu, false}; + c10::List size1{5, 6}; + std::vector args2{ + size1, at::ScalarType::Float, at::kStrided, cpu, false}; + testStaticRuntime(script, args); + testStaticRuntime( + script, + args, + args1, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); + testStaticRuntime(script, args, args2); +} + TEST(StaticRuntime, Linear) { const auto linear_script = R"JIT( def forward(self, inp: Tensor, weights: Tensor, bias: Optional[Tensor]) -> Tensor: @@ -1442,6 +1821,28 @@ TEST(StaticRuntime, Index) { testStaticRuntime(index_with_two_tensors_script, args_c, args_d); } +TEST(StaticRuntime, IndexSelect) { + const std::string script = R"IR( + graph(%self: Tensor, %dim: int, %index: Tensor): + %bias: None = prim::Constant() + %ret = aten::index_select(%self, %dim, %index) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6}); + auto dim0 = 0; + auto index0 = at::randint(0, 5, {6}, torch::kInt32); + std::vector args{self0, dim0, index0}; + testStaticRuntime(script, args); + + auto self1 = at::rand({128}); + auto dim1 = 0; + auto index1 = at::randint(0, 127, {127}, torch::kInt32); + std::vector args2{self1, dim1, index1}; + testStaticRuntime(script, args, args2); +} + TEST(StaticRuntime, ClampMin) { const auto clamp_min_int_script = R"JIT( def forward(self, a: Tensor, b: int): @@ -1770,7 +2171,7 @@ TEST(StaticRuntime, QuantizedLinearDynamicFp16) { %packed_params = quantized::linear_prepack_fp16(%weights, %bias) %output = quantized::linear_dynamic_fp16(%input, %packed_params) %ret = aten::clone(%output, %bias) - return (%output) + return (%ret) )IR"; at::Tensor weight = torch::randn({3, 2}, torch::kFloat); at::Tensor input = torch::randn({3, 2}, torch::kFloat); @@ -1784,6 +2185,27 @@ TEST(StaticRuntime, QuantizedLinearDynamicFp16) { {input_2, weight_2}); } +TEST(StaticRuntime, QuantizedLinearReluDynamicFp16) { + const std::string quantized_linear_relu_dynamic_fp16_script = R"IR( + graph(%input: Tensor, %weights: Tensor): + %bias: None = prim::Constant() + %packed_params = quantized::linear_prepack_fp16(%weights, %bias) + %output = quantized::linear_relu_dynamic_fp16(%input, %packed_params) + %ret = aten::clone(%output, %bias) + return (%ret) + )IR"; + at::Tensor weight = torch::randn({3, 2}, torch::kFloat); + at::Tensor input = torch::randn({3, 2}, torch::kFloat); + + at::Tensor weight_2 = torch::randn({4, 3}, torch::kFloat); + at::Tensor input_2 = torch::randn({5, 3}, torch::kFloat); + + testStaticRuntime( + quantized_linear_relu_dynamic_fp16_script, + {input, weight}, + {input_2, weight_2}); +} + TEST(StaticRuntime, VarStack) { const auto var_stack_script = R"JIT( def forward(self, inp1: Tensor, inp2: Tensor, dim: int): @@ -2159,21 +2581,30 @@ TEST(StaticRuntime, Where) { return torch.where(x > 0, x, y).clone() )JIT"; - std::vector args1_fallback = {at::randn({2, 2}), at::randn({2, 2})}; - std::vector args2_fallback = {at::randn({3, 6}), at::randn({3, 6})}; + std::vector args1 = {at::randn({2, 2}), at::randn({2, 2})}; + std::vector args2 = {at::randn({8, 10}), at::randn({8, 10})}; - std::vector args1_nnc = { - at::randint(-10, 10, {2, 2}, at::kLong), - at::randint(-10, 10, {2, 2}, at::kLong)}; - std::vector args2_nnc = { - at::randint(-10, 10, {3, 6}, 
at::kLong), - at::randint(-10, 10, {3, 6}, at::kLong)}; + testStaticRuntime(where_script, args1); + testStaticRuntime(where_script, args1, args2); +} - testStaticRuntime(where_script, args1_fallback); - testStaticRuntime(where_script, args1_fallback, args2_fallback); +TEST(StaticRuntime, WhereBroadcast) { + const auto where_script = R"JIT( + def forward(self, cond_1d, x, y): + shape = [-1] + [1] * (x.dim() - 1) + cond = cond_1d.view(shape) + return torch.where(cond, x, y).clone() + )JIT"; - testStaticRuntime(where_script, args1_nnc); - testStaticRuntime(where_script, args1_nnc, args2_nnc); + std::vector args1 = { + at::tensor({0, 1}).to(at::kBool), at::randn({2, 2}), at::randn({2, 2})}; + std::vector args2 = { + at::tensor({1, 0, 0}).to(at::kBool), + at::randn({3, 6}), + at::randn({3, 6})}; + + testStaticRuntime(where_script, args1); + testStaticRuntime(where_script, args1, args2); } TEST(StaticRuntime, View) { @@ -2720,3 +3151,229 @@ TEST(StaticRuntime, ToList) { )JIT"; testStaticRuntime(src, {at::randn({2, 2})}); } + +TEST(StaticRuntime, IfThenElse) { + const auto src = R"IR( + graph(%cond: bool, %a: Tensor, %b: Tensor): + %none: NoneType = prim::Constant() + %c: Tensor = prim::IfThenElse(%cond, %a, %b) + %d: Tensor = aten::clone(%c, %none) + return (%d) + )IR"; + + std::vector args1{true, at::randn({1}), at::randn({1})}; + std::vector args2{false, at::randn({1}), at::randn({1})}; + + testStaticRuntime(src, args1); + testStaticRuntime(src, args2); +} + +TEST(StaticRuntime, EmptyIfBlock) { + const auto src = + R"JIT( + def forward(self, cond: bool, a: Tensor, b: Tensor): + l = [] + if cond: + l.append((a + b).clone()) + return l + )JIT"; + + testStaticRuntime(src, {true, at::rand(1), at::rand({1, 2})}); + testStaticRuntime(src, {false, at::rand(1), at::rand({1, 2})}); +} + +TEST(StaticRuntime, EmptyNestedIfBlock) { + const auto src = + R"JIT( + def forward(self, cond: bool, a: Tensor, b: Tensor): + l = [] + if cond: + if cond: + l.append((a + b).clone()) + return l + )JIT"; + + testStaticRuntime(src, {true, at::rand(1), at::rand({1, 2})}); + testStaticRuntime(src, {false, at::rand(1), at::rand({1, 2})}); +} + +TEST(StaticRuntime, StackEmpty) { + const auto src = R"JIT( + def forward(self): + x = torch.stack([]) + return x + )JIT"; + + torch::jit::Module mod("mod"); + mod.define(src); + + torch::jit::StaticModule smod(mod); + EXPECT_THROW(smod({}), c10::Error); +} + +TEST(StaticRuntime, ConcatEmpty) { + const auto src = R"JIT( + def forward(self): + x = torch.concat([]) + return x + )JIT"; + + torch::jit::Module mod("mod"); + mod.define(src); + + torch::jit::StaticModule smod(mod); + EXPECT_THROW(smod({}), c10::Error); +} + +TEST(StaticRuntime, IntImplicit) { + const auto src = R"IR( + graph(%a: Tensor): + %y: int = aten::IntImplicit(%a) + return (%y) + )IR"; + testStaticRuntime(src, {at::tensor({1}, at::kInt).squeeze()}); +} + +TEST(StaticRuntime, IntImplicit_ThrowOnBadInputs) { + const auto src = R"IR( + graph(%a: Tensor): + %y: int = aten::IntImplicit(%a) + return (%y) + )IR"; + auto graph = getGraphFromIR(src); + torch::jit::StaticModule smod(graph); + // Not 0D tensor + EXPECT_THROW(smod({at::tensor({1, 2}, at::kInt)}), std::runtime_error); + // Wrong dtype + EXPECT_THROW( + smod({at::tensor({1}, at::kFloat).squeeze()}), std::runtime_error); +} + +TEST(StaticRuntime, Select) { + const auto src = R"IR( + graph(%a: Tensor, %dim: int, %index: int): + %none: NoneType = prim::Constant() + %b: Tensor = aten::select(%a, %dim, %index) + %c: Tensor = aten::clone(%b, %none) + return (%c) + 
)IR"; + testStaticRuntime(src, {at::randn({2, 2}), 0, 1}); +} + +TEST(StaticRuntime, ReshapeAs) { + const auto src = R"JIT( + def forward(self, a, b): + return a.reshape_as(b).clone() + )JIT"; + testStaticRuntime(src, {at::randn({2, 2}), at::randn({4})}); +} + +TEST(StaticRuntime, MoveCtor) { + auto mod = getDeepAndWideSciptModel(); + std::vector args{ + at::randn({1, 1, 32}), at::randn({1, 1, 32}), at::randn({1, 50})}; + + torch::jit::StaticModule smod(mod); + + torch::jit::StaticRuntime runtime(smod); + auto expected = runtime(args); + + torch::jit::StaticRuntime new_runtime(std::move(runtime)); + auto actual = new_runtime(args); + compareResults(expected, actual); +} + +TEST(StaticRuntime, SingleBlockIfReturnList) { + const auto src = R"JIT( + def forward(self, a, b, cond: bool): + lst = [] + if cond: + lst.append(a + b) + return lst + )JIT"; + std::vector args1{at::randn({1}), at::randn({1}), true}; + std::vector args2{at::randn({42, 42}), at::randn({42, 42}), false}; + testStaticRuntime(src, args1, args2); +} + +TEST(StaticRuntime, NestedBlockIfReturnList) { + const auto src = R"JIT( + def forward(self, a, b, cond1: bool, cond2: bool): + if cond1: + lst = [] + if cond2: + lst.append(a + b) + lst.append(a * b) + return lst + return [] + )JIT"; + std::vector args1{at::randn({1}), at::randn({1}), true, true}; + std::vector args2{ + at::randn({42, 42}), at::randn({42, 42}), true, false}; + testStaticRuntime(src, args1, args2); +} + +TEST(StaticRuntime, QuantizedLinearDynamicFp16ReluFusion) { + const auto src = R"IR( + graph(%input: Tensor, %weights: Tensor): + %bias: None = prim::Constant() + %packed_params = quantized::linear_prepack_fp16(%weights, %bias) + %x = quantized::linear_dynamic_fp16(%input, %packed_params) + %y = aten::relu(%x) + %ret = aten::clone(%y, %bias) + return (%ret) + )IR"; + at::Tensor weight = torch::randn({3, 2}, torch::kFloat); + at::Tensor input = torch::randn({3, 2}, torch::kFloat); + + at::Tensor weight_2 = torch::randn({4, 3}, torch::kFloat); + at::Tensor input_2 = torch::randn({5, 3}, torch::kFloat); + + testStaticRuntime(src, {input, weight}, {input_2, weight_2}); + + auto graph = getGraphFromIR(src); + QuantizedLinearReluFusion(graph); + EXPECT_FALSE(hasNodeWithKind(graph, "quantized::linear_dynamic_fp16")); + EXPECT_TRUE(hasNodeWithKind(graph, "quantized::linear_relu_dynamic_fp16")); +} + +TEST(StaticRuntime, ClampNaNToNum) { + const auto src1 = R"JIT( + def forward(self, a): + return torch.clamp(a, min=1.0, max=2.0).nan_to_num().clone() + )JIT"; + + const auto src2 = R"JIT( + def forward(self, a, nan: float): + return torch.clamp(a, min=-1.0, max=2.0).nan_to_num(nan=nan).clone() + )JIT"; + + const auto src3 = R"JIT( + def forward(self, a): + return torch.clamp(a, min=1.0, max=-1.0).nan_to_num().clone() + )JIT"; + + auto a = at::tensor({ + std::numeric_limits::quiet_NaN(), + std::numeric_limits::infinity(), + -std::numeric_limits::infinity(), + 0.0f, + 3.0f + }); + auto b = a.repeat({10, 5}); + + // Have to use_allclose even though all NaNs will be replaced - testStaticRuntime + // also checks inputs at the end to make sure they're not changed + testStaticRuntime(src1, {a}, {}, /*use_allclose=*/true, /*use_equalnan=*/true); + testStaticRuntime(src1, {a}, {b}, /*use_allclose=*/true, /*use_equalnan=*/true); + + testStaticRuntime(src2, {a, 42.0}, {}, /*use_allclose=*/true, /*use_equalnan=*/true); + testStaticRuntime(src2, {a, 2.0}, {b, 1.0}, /*use_allclose=*/true, /*use_equalnan=*/true); + + testStaticRuntime(src3, {a}, {}, /*use_allclose=*/true, 
/*use_equalnan=*/true); + testStaticRuntime(src3, {a}, {b}, /*use_allclose=*/true, /*use_equalnan=*/true); + + // Non-NNC path + testStaticRuntime(src1, {a.to(at::kDouble)}, {}, /*use_allclose=*/true, /*use_equalnan=*/true); + testStaticRuntime(src1, {a.to(at::kDouble)}, {b.to(at::kDouble)}, /*use_allclose=*/true, /*use_equalnan=*/true); +} diff --git a/benchmarks/static_runtime/test_utils.cc b/benchmarks/static_runtime/test_utils.cc index 276d0a023ff0..7e0733fbc8af 100644 --- a/benchmarks/static_runtime/test_utils.cc +++ b/benchmarks/static_runtime/test_utils.cc @@ -146,11 +146,13 @@ void compareTensorLists( } } +} // namespace + void compareResults( const IValue& expect, const IValue& actual, - const bool use_allclose = false, - const bool use_equalnan = false) { + const bool use_allclose, + const bool use_equalnan) { if (expect.isTensor()) { VLOG(2) << "expect " << expect.toTensor() << std::endl; VLOG(2) << "output " << actual.toTensor() << std::endl; @@ -198,8 +200,6 @@ void compareResults( } } -} // namespace - at::Tensor getTensor(const at::IValue& ival) { if (ival.isTensor()) { return ival.toTensor(); @@ -290,100 +290,104 @@ void testStaticRuntime( for (bool enable_out_variant : {true, false}) { for (bool manage_output_tensors : {true, false}) { - if (!enable_out_variant && manage_output_tensors) { - continue; - } - // run static runtime three times - // 1st run: collect allocation profiles (args) - // 2nd run: exercise memory planner and resizing with args2 - // 3rd run: run with args again - StaticModuleOptions opts{ - .enable_out_variant = enable_out_variant, - .optimize_memory = enable_out_variant, - .manage_output_tensors = manage_output_tensors}; - auto smodule = test_context->makeStaticModule(opts); - StaticRuntime runtime(smodule); - auto actual = runtime(args, {}); - if (actual.isTensor()) { - EXPECT_GE(smodule.num_nodes(), 2) - << "If we only have one node, the output of the op we are testing is " - << "not being managed by the memory planner! A failure here " - << "can typically be fixed by clone()ing the output of the test script."; - } - runtime.check_for_memory_leak(); - // first run - VLOG(2) << "enable_out_variant: " << enable_out_variant; - VLOG(2) << "manage_output_tensors: " << manage_output_tensors; - VLOG(2) << "args: " << args; - VLOG(2) << "args2: " << args2; - VLOG(2) << "expect: " << expect; - VLOG(2) << "actual: " << actual; - compareResults(expect, actual, use_allclose, use_equalnan); - VLOG(2) << "first run comparison done"; - if (manage_output_tensors) { - actual = IValue(); - runtime.deallocateOutputTensors(); - runtime.checkOutputTensorMemoryLeaks(); - } - - if (!args2.empty()) { - auto* memory_planner = runtime.get_memory_planner(); - size_t managed_bytes = - memory_planner ? memory_planner->total_managed() : 0; - - // Run static runtime again with inputs of a different shape. - expect = test_context->getExpected(args2); - actual = runtime(args2, {}); - runtime.check_for_memory_leak(); - VLOG(2) << "comparing with args2"; - compareResults(expect, actual, use_allclose, use_equalnan); - VLOG(2) << "second run comparison done"; - if (manage_output_tensors) { - actual = IValue(); - runtime.deallocateOutputTensors(); - runtime.checkOutputTensorMemoryLeaks(); + for (bool enable_tensorexpr_fusion : {true, false}) { + if (!enable_out_variant && manage_output_tensors) { + continue; } - - size_t new_managed_bytes = - memory_planner ? 
memory_planner->total_managed() : 0; - if (check_resize && new_managed_bytes > 0) { - EXPECT_GT(new_managed_bytes, managed_bytes); - } - - // Run static runtime again with an input of the shape observed during - // the profile run. - expect = test_context->getExpected(args); - actual = runtime(args, {}); - runtime.check_for_memory_leak(); - // third run - VLOG(2) << "comparing third run"; - compareResults(expect, actual, use_allclose, use_equalnan); - VLOG(2) << "third run comparison done"; - if (manage_output_tensors) { - actual = IValue(); - runtime.deallocateOutputTensors(); - runtime.checkOutputTensorMemoryLeaks(); + // run static runtime three times + // 1st run: collect allocation profiles (args) + // 2nd run: exercise memory planner and resizing with args2 + // 3rd run: run with args again + StaticModuleOptions opts{ + .enable_out_variant = enable_out_variant, + .optimize_memory = enable_out_variant, + .manage_output_tensors = manage_output_tensors, + .enable_tensorexpr_fusion = enable_tensorexpr_fusion}; + auto smodule = test_context->makeStaticModule(opts); + StaticRuntime runtime(smodule); + auto actual = runtime(args, {}); + if (actual.isTensor()) { + EXPECT_GE(smodule.num_nodes(), 2) + << "If we only have one node, the output of the op we are testing is " + << "not being managed by the memory planner! A failure here " + << "can typically be fixed by clone()ing the output of the test script."; } - } else { - // run static runtime again to exercise the memory planner - // and allocate managed tensors. - actual = runtime(args, {}); runtime.check_for_memory_leak(); - VLOG(2) << "comparing second run with same args"; + // first run + VLOG(2) << "enable_out_variant: " << enable_out_variant; + VLOG(2) << "manage_output_tensors: " << manage_output_tensors; + VLOG(2) << "enable_tensorexpr_fusion: " << enable_tensorexpr_fusion; + VLOG(2) << "args: " << args; + VLOG(2) << "args2: " << args2; + VLOG(2) << "expect: " << expect; + VLOG(2) << "actual: " << actual; compareResults(expect, actual, use_allclose, use_equalnan); - VLOG(2) << "second run comparison done"; + VLOG(2) << "first run comparison done"; if (manage_output_tensors) { actual = IValue(); runtime.deallocateOutputTensors(); runtime.checkOutputTensorMemoryLeaks(); } - // third run to use the allocated managed tensors. - actual = runtime(args, {}); - runtime.check_for_memory_leak(); - if (manage_output_tensors) { - actual = IValue(); - runtime.deallocateOutputTensors(); - runtime.checkOutputTensorMemoryLeaks(); + + if (!args2.empty()) { + auto* memory_planner = runtime.get_memory_planner(); + size_t managed_bytes = + memory_planner ? memory_planner->total_managed() : 0; + + // Run static runtime again with inputs of a different shape. + expect = test_context->getExpected(args2); + actual = runtime(args2, {}); + runtime.check_for_memory_leak(); + VLOG(2) << "comparing with args2"; + compareResults(expect, actual, use_allclose, use_equalnan); + VLOG(2) << "second run comparison done"; + if (manage_output_tensors) { + actual = IValue(); + runtime.deallocateOutputTensors(); + runtime.checkOutputTensorMemoryLeaks(); + } + + size_t new_managed_bytes = + memory_planner ? memory_planner->total_managed() : 0; + if (check_resize && new_managed_bytes > 0) { + EXPECT_GT(new_managed_bytes, managed_bytes); + } + + // Run static runtime again with an input of the shape observed during + // the profile run. 
+ expect = test_context->getExpected(args); + actual = runtime(args, {}); + runtime.check_for_memory_leak(); + // third run + VLOG(2) << "comparing third run"; + compareResults(expect, actual, use_allclose, use_equalnan); + VLOG(2) << "third run comparison done"; + if (manage_output_tensors) { + actual = IValue(); + runtime.deallocateOutputTensors(); + runtime.checkOutputTensorMemoryLeaks(); + } + } else { + // run static runtime again to exercise the memory planner + // and allocate managed tensors. + actual = runtime(args, {}); + runtime.check_for_memory_leak(); + VLOG(2) << "comparing second run with same args"; + compareResults(expect, actual, use_allclose, use_equalnan); + VLOG(2) << "second run comparison done"; + if (manage_output_tensors) { + actual = IValue(); + runtime.deallocateOutputTensors(); + runtime.checkOutputTensorMemoryLeaks(); + } + // third run to use the allocated managed tensors. + actual = runtime(args, {}); + runtime.check_for_memory_leak(); + if (manage_output_tensors) { + actual = IValue(); + runtime.deallocateOutputTensors(); + runtime.checkOutputTensorMemoryLeaks(); + } } } } diff --git a/benchmarks/static_runtime/test_utils.h b/benchmarks/static_runtime/test_utils.h index cb0a5a4a8c2e..27efd4d7d42e 100644 --- a/benchmarks/static_runtime/test_utils.h +++ b/benchmarks/static_runtime/test_utils.h @@ -53,6 +53,12 @@ void compareResultsWithJIT( const bool use_allclose = false, const bool use_equalnan = false); +void compareResults( + const IValue& expect, + const IValue& actual, + const bool use_allclose = false, + const bool use_equalnan = false); + } // namespace test } // namespace jit } // namespace torch diff --git a/benchmarks/tensorexpr/__main__.py b/benchmarks/tensorexpr/__main__.py index f243ff5b6105..f984dbccd02d 100644 --- a/benchmarks/tensorexpr/__main__.py +++ b/benchmarks/tensorexpr/__main__.py @@ -56,7 +56,7 @@ def main(): "--input-iter", type=str, default=None, - help="a comma separated list of of Tensor dimensions that includes a start, \ + help="a comma separated list of Tensor dimensions that includes a start, \ stop, and increment that can be constant or a power of 2 \ {start:stop:inc,start:stop:pow2}", ) @@ -137,7 +137,7 @@ def main(): torch._C._jit_set_profiling_executor(True) torch._C._jit_set_texpr_fuser_enabled(True) torch._C._jit_override_can_fuse_on_gpu(True) - torch._C._jit_set_profiling_mode(True) + torch._C._get_graph_executor_optimize(True) elif args.cuda_fuser == "old": import torch torch._C._jit_set_profiling_executor(False) @@ -148,7 +148,7 @@ def main(): torch._C._jit_set_profiling_executor(True) torch._C._jit_set_texpr_fuser_enabled(False) torch._C._jit_set_nvfuser_enabled(True) - torch._C._jit_set_profiling_mode(True) + torch._C._get_graph_executor_optimize(True) else : raise ValueError("Undefined fuser: {}".format(args.cuda_fuser)) diff --git a/binaries/CMakeLists.txt b/binaries/CMakeLists.txt index a98754eea2c3..b683ee002280 100644 --- a/binaries/CMakeLists.txt +++ b/binaries/CMakeLists.txt @@ -4,6 +4,7 @@ if(INTERN_BUILD_MOBILE) caffe2_binary_target("speed_benchmark.cc") else() caffe2_binary_target("speed_benchmark_torch.cc") + caffe2_binary_target("load_benchmark_torch.cc") if(NOT BUILD_LITE_INTERPRETER) caffe2_binary_target("compare_models_torch.cc") endif() diff --git a/binaries/aot_model_compiler.cc b/binaries/aot_model_compiler.cc index b9d1d24c08ea..7d2d68a61f17 100644 --- a/binaries/aot_model_compiler.cc +++ b/binaries/aot_model_compiler.cc @@ -30,6 +30,16 @@ C10_DEFINE_string( "If multiple inputs needed, use semicolon 
to separate " "the dtype of different tensors." "Supported dtypes: float, int64, uint8"); +C10_DEFINE_string( + input_memory_formats, + "", + "Input memory format." + "If multiple inputs needed, use semicolon to separate." + "Supported values: contiguous, channels_last"); +C10_DEFINE_string( + dynamic_dims, + "", + "Comma separated dimensions of input tensors that can be dynamic"); C10_DEFINE_string(method_name, "forward", "The name of the method."); C10_DEFINE_string( output_llvm, @@ -61,6 +71,8 @@ c10::Dict createCompileSpec() { c10::StringType::get(), c10::AnyType::get()); method_spec.insert("sizes", FLAGS_input_dims); method_spec.insert("types", FLAGS_input_types); + method_spec.insert("memory_formats", FLAGS_input_memory_formats); + method_spec.insert("dynamic_sizes", FLAGS_dynamic_dims); method_spec.insert("asmfile", FLAGS_output_llvm); method_spec.insert("model_name", FLAGS_model_name); method_spec.insert("model_version", FLAGS_model_version); @@ -79,6 +91,7 @@ int main(int argc, char** argv) { " --model_version=" " --input_dims=" " --input_types=" + " --input_memory_formats=" " [--method_name=]" " [--output_llvm=]" " [--output_model=]"); @@ -93,10 +106,18 @@ int main(int argc, char** argv) { CAFFE_ENFORCE(!FLAGS_model_name.empty(), c10::UsageMessage()); CAFFE_ENFORCE(!FLAGS_model_version.empty(), c10::UsageMessage()); CAFFE_ENFORCE(!FLAGS_input_dims.empty(), c10::UsageMessage()); + const auto dims_size = split(';', FLAGS_input_dims).size(); CAFFE_ENFORCE( - split(';', FLAGS_input_dims).size() == - split(';', FLAGS_input_types).size(), + dims_size == split(';', FLAGS_input_types).size(), "Number of input_dims and input_types should be the same"); + const auto mem_formats_size = split(';', FLAGS_input_memory_formats).size(); + CAFFE_ENFORCE( + mem_formats_size == 0 || mem_formats_size == dims_size, + "Number of input_memory_formats should be 0 (default contiguous) or the same as number of input_dims"); + if (FLAGS_output_llvm.empty()) { + FLAGS_output_llvm = + FLAGS_model.substr(0, FLAGS_model.find('.')) + ".compiled.ll"; + } std::string output_model_name = FLAGS_output_model; if (output_model_name.empty()) { diff --git a/binaries/bench_gen/bench_gen.py b/binaries/bench_gen/bench_gen.py index 2b344c1f5947..8684e07ee4fd 100755 --- a/binaries/bench_gen/bench_gen.py +++ b/binaries/bench_gen/bench_gen.py @@ -59,7 +59,7 @@ def main(args): if __name__ == "__main__": parser = argparse.ArgumentParser( - description="Utilitity to generate Caffe2 benchmark models.") + description="Utility to generate Caffe2 benchmark models.") parser.add_argument("operator", help="Caffe2 operator to benchmark.") parser.add_argument("-b", "--blob", help="Instantiate a blob --blob name=dim1,dim2,dim3", diff --git a/binaries/load_benchmark_torch.cc b/binaries/load_benchmark_torch.cc new file mode 100644 index 000000000000..330955657ece --- /dev/null +++ b/binaries/load_benchmark_torch.cc @@ -0,0 +1,93 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include "caffe2/core/timer.h" +#include "caffe2/utils/string_utils.h" +#include +#include +#include +#include +#include + +#include + +#include +using namespace std::chrono; + +C10_DEFINE_string(model, "", "The given torch script model to benchmark."); +C10_DEFINE_int(iter, 10, "The number of iterations to run."); +C10_DEFINE_bool( + report_pep, + true, + "Whether to print performance stats for AI-PEP."); + +int main(int argc, char** argv) { + c10::SetUsageMessage( + "Run model load time benchmark for pytorch model.\n" + "Example usage:\n" + "./load_benchmark_torch" + " --model=" + " --iter=20"); + if (!c10::ParseCommandLineFlags(&argc, &argv)) { + std::cerr << "Failed to parse command line flags!" << std::endl; + return 1; + } + + std::cout << "Starting benchmark." << std::endl; + CAFFE_ENFORCE( + FLAGS_iter >= 0, + "Number of main runs should be non negative, provided ", + FLAGS_iter, + "."); + + caffe2::Timer timer; + std::vector times; + + for (int i = 0; i < FLAGS_iter; ++i) { + auto start = high_resolution_clock::now(); + +#if BUILD_LITE_INTERPRETER + auto module = torch::jit::_load_for_mobile(FLAGS_model); +#else + auto module = torch::jit::load(FLAGS_model); +#endif + + auto stop = high_resolution_clock::now(); + auto duration = duration_cast(stop - start); + times.push_back(duration.count()); + } + + const double micros = static_cast(timer.MicroSeconds()); + if (FLAGS_report_pep) { + for (auto t : times) { + std::cout << R"(PyTorchObserver {"type": "NET", "unit": "us", )" + << R"("metric": "latency", "value": ")" + << t << R"("})" << std::endl; + } + } + + const double iters = static_cast(FLAGS_iter); + std::cout << "Main run finished. Microseconds per iter: " + << micros / iters + << ". Iters per second: " << 1000.0 * 1000 * iters / micros + << std::endl; + + return 0; +} diff --git a/binaries/record_function_benchmark.cc b/binaries/record_function_benchmark.cc index c80f46d75652..8d53007bc8ef 100644 --- a/binaries/record_function_benchmark.cc +++ b/binaries/record_function_benchmark.cc @@ -49,12 +49,10 @@ float runPureRecordFunctionBench(int iter) { typedef std::chrono::microseconds us; std::chrono::time_point start_time = clock::now(); for (auto idx = 0; idx < iter; ++idx) { - bool pre_sampled = false; - if (at::shouldRunRecordFunction(&pre_sampled)) { - at::RecordFunction guard(at::RecordScope::USER_SCOPE, pre_sampled); - if (C10_UNLIKELY(guard.isActive())) { - guard.before("Test", -1); - } + auto step_callbacks = at::getStepCallbacks(at::RecordScope::USER_SCOPE); + if (!step_callbacks.empty()) { + at::RecordFunction guard(std::move(step_callbacks)); + guard.before("Test", -1); } } auto duration = static_cast( diff --git a/binaries/speed_benchmark_torch.cc b/binaries/speed_benchmark_torch.cc index e4eee10636e3..ea523898b51e 100644 --- a/binaries/speed_benchmark_torch.cc +++ b/binaries/speed_benchmark_torch.cc @@ -180,35 +180,48 @@ class vkRunner final : public Runner { virtual c10::IValue run( T& module, const std::vector& inputs) override { - // Upload the input tensor(s) to GPU memory. 
- inputs_.clear(); - inputs_.reserve(inputs.size()); - for (const auto& input : inputs) { - if (input.isTensor()) { - inputs_.emplace_back(input.toTensor().vulkan()); - } - else if (input.isList()) { - const c10::List input_as_list = input.toList(); - c10::List input_vk_list; - input_vk_list.reserve(input_as_list.size()); - for (int i=0; i < input_as_list.size(); ++i) { - const c10::IValue element = input_as_list.get(i); - if (element.isTensor()) { - input_vk_list.emplace_back(element.toTensor().vulkan()); - } - else { - CAFFE_THROW("Input of type c10::List must only contain Tensors!"); + + if (inputs_.size() == 0) { + // Upload the input tensor(s) to GPU memory. + inputs_.clear(); + inputs_.reserve(inputs.size()); + for (const auto& input : inputs) { + if (input.isTensor()) { + inputs_.emplace_back(at::rand(input.toTensor().sizes()).vulkan()); + } + else if (input.isTensorList()) { + const c10::List input_as_list = input.toTensorList(); + c10::List input_vk_list; + input_vk_list.reserve(input_as_list.size()); + for (int i=0; i < input_as_list.size(); ++i) { + const at::Tensor element = input_as_list.get(i); + input_vk_list.emplace_back(at::rand(element.sizes()).vulkan()); } + inputs_.emplace_back(c10::IValue(input_vk_list)); + } + else { + CAFFE_THROW("Inputs must only contain IValues of type c10::Tensor or c10::TensorList!"); } - inputs_.emplace_back(c10::IValue(input_vk_list)); - } - else { - CAFFE_THROW("Inputs must only contain IValues of type c10::Tensor or c10::List!"); } } // Run, and download the output tensor to system memory. - return module.forward(inputs_).toTensor().cpu(); + c10::IValue output = module.forward(inputs_); + if (output.isTensor()) { + return output.toTensor().cpu(); + } + else if (output.isTensorList()) { + return output.toTensorList().get(0).cpu(); + } + else if (output.isList()) { + return output.toList().get(0).toTensor().cpu(); + } + else if (output.isTuple()) { + return output.toTuple()->elements()[0].toTensor().cpu(); + } + else { + CAFFE_THROW("Outputs must only be either c10::Tensor or c10::TensorList!"); + }; } private: diff --git a/build.bzl b/build.bzl new file mode 100644 index 000000000000..a1566377e844 --- /dev/null +++ b/build.bzl @@ -0,0 +1,127 @@ +def define_targets(rules): + rules.cc_library( + name = "caffe2_serialize", + srcs = [ + "caffe2/serialize/file_adapter.cc", + "caffe2/serialize/inline_container.cc", + "caffe2/serialize/istream_adapter.cc", + "caffe2/serialize/read_adapter_interface.cc", + ], + tags = [ + "supermodule:android/default/pytorch", + "supermodule:ios/default/public.pytorch", + "-fbcode", + "xplat", + ], + visibility = ["//visibility:public"], + deps = [ + ":caffe2_headers", + "@com_github_glog//:glog", + "//c10", + "//third_party/miniz-2.0.8:miniz", + ], + ) + + rules.genrule( + name = "generate-code", + srcs = [ + ":DispatchKeyNativeFunctions.cpp", + ":DispatchKeyNativeFunctions.h", + ":LazyIr.h", + ":RegisterDispatchKey.cpp", + ":native_functions.yaml", + ":shape_inference.h", + ":tags.yaml", + ":ts_native_functions.cpp", + ":ts_native_functions.yaml", + ], + tools = ["//tools/setup_helpers:generate_code"], + outs = GENERATED_AUTOGRAD_CPP + GENERATED_AUTOGRAD_PYTHON + GENERATED_TESTING_PY, + cmd = "$(location //tools/setup_helpers:generate_code) " + + "--gen-dir=$(RULEDIR) " + + "--native-functions-path $(location :native_functions.yaml) " + + "--tags-path=$(location :tags.yaml) " + + "--gen_lazy_ts_backend", + ) + + rules.genrule( + name = "version_h", + srcs = [ + ":torch/csrc/api/include/torch/version.h.in", + 
":version.txt", + ], + outs = ["torch/csrc/api/include/torch/version.h"], + cmd = "$(location //tools/setup_helpers:gen_version_header) " + + "--template-path $(location :torch/csrc/api/include/torch/version.h.in) " + + "--version-path $(location :version.txt) --output-path $@ ", + tools = ["//tools/setup_helpers:gen_version_header"], + ) + +# These lists are temporarily living in and exported from the shared +# structure so that an internal build that lives under a different +# root can access them. These could technically live in a separate +# file in the same directory but that would require extra work to +# ensure that file is synced to both Meta internal repositories and +# GitHub. This problem will go away when the targets downstream of +# generate-code that use these lists are moved into the shared +# structure as well. + +_GENERATED_AUTOGRAD_PYTHON_HEADERS = [ + "torch/csrc/autograd/generated/python_functions.h", +] + +_GENERATED_AUTOGRAD_CPP_HEADERS = [ + "torch/csrc/autograd/generated/Functions.h", + "torch/csrc/autograd/generated/VariableType.h", + "torch/csrc/autograd/generated/variable_factories.h", +] + +GENERATED_AUTOGRAD_H = _GENERATED_AUTOGRAD_CPP_HEADERS + _GENERATED_AUTOGRAD_PYTHON_HEADERS + +GENERATED_TESTING_PY = [ + "torch/testing/_internal/generated/annotated_fn_args.py", +] + +GENERATED_LAZY_H = [ + "torch/csrc/lazy/generated/LazyIr.h", + "torch/csrc/lazy/generated/LazyNativeFunctions.h", +] + +_GENERATED_AUTOGRAD_PYTHON_CPP = [ + "torch/csrc/autograd/generated/python_functions_0.cpp", + "torch/csrc/autograd/generated/python_functions_1.cpp", + "torch/csrc/autograd/generated/python_functions_2.cpp", + "torch/csrc/autograd/generated/python_functions_3.cpp", + "torch/csrc/autograd/generated/python_functions_4.cpp", + "torch/csrc/autograd/generated/python_nn_functions.cpp", + "torch/csrc/autograd/generated/python_fft_functions.cpp", + "torch/csrc/autograd/generated/python_linalg_functions.cpp", + "torch/csrc/autograd/generated/python_return_types.cpp", + "torch/csrc/autograd/generated/python_sparse_functions.cpp", + "torch/csrc/autograd/generated/python_special_functions.cpp", + "torch/csrc/autograd/generated/python_torch_functions_0.cpp", + "torch/csrc/autograd/generated/python_torch_functions_1.cpp", + "torch/csrc/autograd/generated/python_torch_functions_2.cpp", + "torch/csrc/autograd/generated/python_variable_methods.cpp", +] + +GENERATED_AUTOGRAD_PYTHON = _GENERATED_AUTOGRAD_PYTHON_HEADERS + _GENERATED_AUTOGRAD_PYTHON_CPP + +GENERATED_AUTOGRAD_CPP = [ + "torch/csrc/autograd/generated/Functions.cpp", + "torch/csrc/autograd/generated/VariableType_0.cpp", + "torch/csrc/autograd/generated/VariableType_1.cpp", + "torch/csrc/autograd/generated/VariableType_2.cpp", + "torch/csrc/autograd/generated/VariableType_3.cpp", + "torch/csrc/autograd/generated/VariableType_4.cpp", + "torch/csrc/autograd/generated/TraceType_0.cpp", + "torch/csrc/autograd/generated/TraceType_1.cpp", + "torch/csrc/autograd/generated/TraceType_2.cpp", + "torch/csrc/autograd/generated/TraceType_3.cpp", + "torch/csrc/autograd/generated/TraceType_4.cpp", + "torch/csrc/autograd/generated/ADInplaceOrViewType_0.cpp", + "torch/csrc/autograd/generated/ADInplaceOrViewType_1.cpp", + "torch/csrc/lazy/generated/LazyNativeFunctions.cpp", + "torch/csrc/lazy/generated/RegisterAutogradLazy.cpp", + "torch/csrc/lazy/generated/RegisterLazy.cpp", +] + _GENERATED_AUTOGRAD_CPP_HEADERS + GENERATED_LAZY_H diff --git a/c10/BUILD.bazel b/c10/BUILD.bazel index f4a43cf93013..5e6ed8297e5e 100644 --- a/c10/BUILD.bazel +++ 
b/c10/BUILD.bazel @@ -1,6 +1,9 @@ load("@bazel_skylib//rules:common_settings.bzl", "bool_flag") load("@rules_cc//cc:defs.bzl", "cc_library") -load("//tools/config:defs.bzl", "if_cuda") +load("//:tools/bazel.bzl", "rules") +load(":build.bzl", "define_targets") + +define_targets(rules = rules) # The bool_flag targets allow configuring the build from the # command-line, e.g. --//c10:use_gflags or --no//c10:use_gflags to @@ -47,27 +50,3 @@ cc_library( }), visibility = ["//:__pkg__"], ) - -cc_library( - name = "c10", - deps = [ - "//c10/core:CPUAllocator", - "//c10/core:ScalarType", - "//c10/core:alignment", - "//c10/core:alloc_cpu", - "//c10/core:base", - "//c10/macros", - "//c10/mobile:CPUCachingAllocator", - "//c10/mobile:CPUProfilingAllocator", - "//c10/util:TypeCast", - "//c10/util:base", - "//c10/util:typeid", - ] + if_cuda( - [ - "//c10/cuda", - "//c10/cuda:Macros", - ], - [], - ), - visibility = ["//:__pkg__"], -) diff --git a/c10/BUILD.buck b/c10/BUILD.buck new file mode 100644 index 000000000000..b70b780302a8 --- /dev/null +++ b/c10/BUILD.buck @@ -0,0 +1,50 @@ +load("//tools/build_defs:glob_defs.bzl", "subdir_glob") + +cxx_library( + name = "c10", + srcs = glob( + ["**/*.cpp"], + exclude = [ + "test/**/*.cpp", + "benchmark/**/*.cpp", + "cuda/**/*.cpp", + ], + ), + deps = [ + "//third_party:fmt", + "//third_party:glog", + ], + exported_deps = [], + compiler_flags = [ + "-Werror", + "-Wno-global-constructors", + "-DDISABLE_NAMEDTENSOR", + "-DSUPPORTS_BACKTRACE=0" + ], + exported_headers = subdir_glob( + [ + ("", "**/*.h"), + ], + exclude = [ + "test/**/*.h", + "benchmark/**/*.h", + "cuda/**/*.h", + ], + ), + exported_linker_flags = [], + exported_preprocessor_flags = [ + '-DC10_USING_CUSTOM_GENERATED_MACROS', + '-DC10_USE_GLOG', + '-DC10_USE_MINIMAL_GLOG', + '-DC10_DISABLE_NUMA', + '-DC10_MOBILE', + '-fexceptions', + '-Wno-global-constructors' + ], + header_namespace = "c10", + link_whole = True, + platform_preprocessor_flags = [['windows', ['-D_WINDOWS', '-D_WIN32', '-DWIN32', '-DNOMINMAX', '-D_CRT_SECURE_NO_WARNINGS', '-D_USE_MATH_DEFINES']], ['windows.*64$', ['-D_WIN64']]], + preprocessor_flags = ['-DC10_BUILD_MAIN_LIB'], + reexport_all_header_dependencies = True, + visibility = ['PUBLIC'], +) diff --git a/c10/CMakeLists.txt b/c10/CMakeLists.txt index 23a0e024d35e..41b1a1a0bc9b 100644 --- a/c10/CMakeLists.txt +++ b/c10/CMakeLists.txt @@ -50,6 +50,9 @@ target_compile_options(c10 PRIVATE "-DC10_BUILD_MAIN_LIB") if(${COMPILER_SUPPORTS_HIDDEN_VISIBILITY}) target_compile_options(c10 PRIVATE "-fvisibility=hidden") endif() +if(HAS_WERROR_SIGN_COMPARE AND WERROR) + target_compile_options(c10 PRIVATE "-Werror=sign-compare") +endif() # ---[ Dependency of c10 if(${USE_GFLAGS}) diff --git a/c10/benchmark/BUILD.bazel b/c10/benchmark/BUILD.bazel new file mode 100644 index 000000000000..d1a0db360d23 --- /dev/null +++ b/c10/benchmark/BUILD.bazel @@ -0,0 +1,4 @@ +load("//:tools/bazel.bzl", "rules") +load(":build.bzl", "define_targets") + +define_targets(rules = rules) diff --git a/c10/benchmark/build.bzl b/c10/benchmark/build.bzl new file mode 100644 index 000000000000..f9838e00cb4e --- /dev/null +++ b/c10/benchmark/build.bzl @@ -0,0 +1,10 @@ +def define_targets(rules): + rules.cc_binary( + name = "intrusive_ptr", + srcs = ["intrusive_ptr_benchmark.cpp"], + tags = ["benchmark"], + deps = [ + "//c10/util:base", + "@google_benchmark//:benchmark", + ], + ) diff --git a/c10/build.bzl b/c10/build.bzl new file mode 100644 index 000000000000..21107eb8b992 --- /dev/null +++ b/c10/build.bzl @@ -0,0 +1,24 @@ 
+def define_targets(rules): + rules.cc_library( + name = "c10", + deps = [ + "//c10/core:CPUAllocator", + "//c10/core:ScalarType", + "//c10/core:alignment", + "//c10/core:alloc_cpu", + "//c10/core:base", + "//c10/macros", + "//c10/mobile:CPUCachingAllocator", + "//c10/mobile:CPUProfilingAllocator", + "//c10/util:TypeCast", + "//c10/util:base", + "//c10/util:typeid", + ] + rules.if_cuda( + [ + "//c10/cuda", + "//c10/cuda:Macros", + ], + [], + ), + visibility = ["//visibility:public"], + ) diff --git a/c10/core/Allocator.h b/c10/core/Allocator.h index c05139a93f00..4f571fd91511 100644 --- a/c10/core/Allocator.h +++ b/c10/core/Allocator.h @@ -217,9 +217,9 @@ struct AllocatorRegisterer { } }; -#define REGISTER_ALLOCATOR(t, f) \ - namespace { \ - static AllocatorRegisterer g_allocator_d(f); \ +#define REGISTER_ALLOCATOR(t, f) \ + namespace { \ + static c10::AllocatorRegisterer g_allocator_d(f); \ } // An interface for reporting thread local memory usage diff --git a/c10/core/Backend.h b/c10/core/Backend.h index e17a1bc4226c..a8ad60f8c913 100644 --- a/c10/core/Backend.h +++ b/c10/core/Backend.h @@ -32,6 +32,7 @@ enum class Backend { HIP, VE, FPGA, + IPU, XPU, SparseCPU, SparseCUDA, @@ -49,9 +50,10 @@ enum class Backend { QuantizedXPU, Undefined, MkldnnCPU, - MLC, + MPS, HPU, Lazy, + PrivateUse1, NumOptions }; @@ -72,8 +74,8 @@ static inline Backend dispatchKeyToBackend(DispatchKey t) { return Backend::XLA; } else if (t == DispatchKey::Lazy || t == DispatchKey::AutogradLazy) { return Backend::Lazy; - } else if (t == DispatchKey::MLC || t == DispatchKey::AutogradMLC) { - return Backend::MLC; + } else if (t == DispatchKey::MPS || t == DispatchKey::AutogradMPS) { + return Backend::MPS; } else if (t == DispatchKey::Vulkan) { return Backend::Vulkan; } else if (t == DispatchKey::Metal) { @@ -96,6 +98,8 @@ static inline Backend dispatchKeyToBackend(DispatchKey t) { return Backend::QuantizedCPU; } else if (t == DispatchKey::QuantizedCUDA) { return Backend::QuantizedCUDA; + } else if (t == DispatchKey::IPU || t == DispatchKey::AutogradIPU) { + return Backend::IPU; } else if (t == DispatchKey::XPU || t == DispatchKey::AutogradXPU) { return Backend::XPU; } else if (t == DispatchKey::SparseXPU) { @@ -104,6 +108,8 @@ static inline Backend dispatchKeyToBackend(DispatchKey t) { return Backend::QuantizedXPU; } else if (t == DispatchKey::HPU || t == DispatchKey::AutogradHPU) { return Backend::HPU; + } else if (t == DispatchKey::PrivateUse1) { + return Backend::PrivateUse1; } else if (t == DispatchKey::Undefined) { return Backend::Undefined; } else { @@ -129,6 +135,8 @@ static inline DispatchKey backendToDispatchKey(Backend b) { return DispatchKey::XLA; case Backend::Lazy: return DispatchKey::Lazy; + case Backend::IPU: + return DispatchKey::IPU; case Backend::XPU: return DispatchKey::XPU; case Backend::SparseXPU: @@ -157,10 +165,12 @@ static inline DispatchKey backendToDispatchKey(Backend b) { return DispatchKey::QuantizedCUDA; case Backend::Undefined: return DispatchKey::Undefined; - case Backend::MLC: - return DispatchKey::MLC; + case Backend::MPS: + return DispatchKey::MPS; case Backend::HPU: return DispatchKey::HPU; + case Backend::PrivateUse1: + return DispatchKey::PrivateUse1; default: throw std::runtime_error("Unknown backend"); } @@ -196,6 +206,8 @@ static inline DeviceType backendToDeviceType(Backend b) { return DeviceType::CPU; case Backend::SparseCsrCUDA: return DeviceType::CUDA; + case Backend::IPU: + return DeviceType::IPU; case Backend::XPU: case Backend::SparseXPU: case Backend::QuantizedXPU: @@ 
-209,10 +221,12 @@ static inline DeviceType backendToDeviceType(Backend b) { return DeviceType::Vulkan; case Backend::Metal: return DeviceType::Metal; - case Backend::MLC: - return DeviceType::MLC; + case Backend::MPS: + return DeviceType::MPS; case Backend::HPU: return DeviceType::HPU; + case Backend::PrivateUse1: + return DeviceType::PrivateUse1; case Backend::Undefined: TORCH_CHECK(false, "Undefined backend is not a valid device type"); default: @@ -235,14 +249,16 @@ static inline const char* toString(Backend b) { return "FPGA"; case Backend::XPU: return "XPU"; + case Backend::IPU: + return "IPU"; case Backend::ORT: return "ORT"; case Backend::XLA: return "XLA"; case Backend::Lazy: return "Lazy"; - case Backend::MLC: - return "MLC"; + case Backend::MPS: + return "MPS"; case Backend::SparseCPU: return "SparseCPU"; case Backend::SparseCUDA: @@ -271,6 +287,8 @@ static inline const char* toString(Backend b) { return "QuantizedXPU"; case Backend::HPU: return "HPU"; + case Backend::PrivateUse1: + return "PrivateUseOne"; default: return "UNKNOWN_BACKEND"; } diff --git a/c10/core/Device.cpp b/c10/core/Device.cpp index 2531e3942271..5cd474774c9e 100644 --- a/c10/core/Device.cpp +++ b/c10/core/Device.cpp @@ -20,6 +20,7 @@ DeviceType parse_type(const std::string& device_string) { types = {{ {"cpu", DeviceType::CPU}, {"cuda", DeviceType::CUDA}, + {"ipu", DeviceType::IPU}, {"xpu", DeviceType::XPU}, {"mkldnn", DeviceType::MKLDNN}, {"opengl", DeviceType::OPENGL}, @@ -32,9 +33,10 @@ DeviceType parse_type(const std::string& device_string) { {"xla", DeviceType::XLA}, {"lazy", DeviceType::Lazy}, {"vulkan", DeviceType::Vulkan}, - {"mlc", DeviceType::MLC}, + {"mps", DeviceType::MPS}, {"meta", DeviceType::Meta}, {"hpu", DeviceType::HPU}, + {"privateuseone", DeviceType::PrivateUse1}, }}; auto device = std::find_if( types.begin(), @@ -47,7 +49,7 @@ DeviceType parse_type(const std::string& device_string) { } TORCH_CHECK( false, - "Expected one of cpu, cuda, xpu, mkldnn, opengl, opencl, ideep, hip, ve, ort, mlc, xla, lazy, vulkan, meta, hpu device type at start of device string: ", + "Expected one of cpu, cuda, ipu, xpu, mkldnn, opengl, opencl, ideep, hip, ve, ort, mps, xla, lazy, vulkan, meta, hpu, privateuseone device type at start of device string: ", device_string); } enum DeviceStringParsingState { START, INDEX_START, INDEX_REST, ERROR }; diff --git a/c10/core/Device.h b/c10/core/Device.h index b935eed6a656..774cf404da29 100644 --- a/c10/core/Device.h +++ b/c10/core/Device.h @@ -81,6 +81,11 @@ struct C10_API Device final { return type_ == DeviceType::CUDA; } + /// Return true if the device is of MPS type. + bool is_mps() const noexcept { + return type_ == DeviceType::MPS; + } + /// Return true if the device is of HIP type. bool is_hip() const noexcept { return type_ == DeviceType::HIP; @@ -96,11 +101,21 @@ struct C10_API Device final { return type_ == DeviceType::XPU; } + /// Return true if the device is of IPU type. + bool is_ipu() const noexcept { + return type_ == DeviceType::IPU; + } + /// Return true if the device is of HPU type. bool is_hpu() const noexcept { return type_ == DeviceType::HPU; } + /// Return true if the device is of META type. + bool is_meta() const noexcept { + return type_ == DeviceType::Meta; + } + /// Return true if the device is of CPU type. 
bool is_cpu() const noexcept { return type_ == DeviceType::CPU; diff --git a/c10/core/DeviceType.cpp b/c10/core/DeviceType.cpp index 4635acdb148c..ac4c1f653efb 100644 --- a/c10/core/DeviceType.cpp +++ b/c10/core/DeviceType.cpp @@ -31,8 +31,8 @@ std::string DeviceTypeName(DeviceType d, bool lower_case) { return lower_case ? "xla" : "XLA"; case DeviceType::Lazy: return lower_case ? "lazy" : "LAZY"; - case DeviceType::MLC: - return lower_case ? "mlc" : "MLC"; + case DeviceType::MPS: + return lower_case ? "mps" : "MPS"; case DeviceType::Vulkan: return lower_case ? "vulkan" : "VULKAN"; case DeviceType::Metal: @@ -43,6 +43,10 @@ std::string DeviceTypeName(DeviceType d, bool lower_case) { return lower_case ? "meta" : "META"; case DeviceType::HPU: return lower_case ? "hpu" : "HPU"; + case DeviceType::IPU: + return lower_case ? "ipu" : "IPU"; + case DeviceType::PrivateUse1: + return lower_case ? "privateuseone" : "PRIVATEUSEONE"; default: TORCH_CHECK( false, @@ -78,12 +82,14 @@ bool isValidDeviceType(DeviceType d) { case DeviceType::ORT: case DeviceType::XLA: case DeviceType::Lazy: - case DeviceType::MLC: + case DeviceType::MPS: case DeviceType::Vulkan: case DeviceType::Metal: case DeviceType::XPU: case DeviceType::Meta: case DeviceType::HPU: + case DeviceType::IPU: + case DeviceType::PrivateUse1: return true; default: return false; diff --git a/c10/core/DeviceType.h b/c10/core/DeviceType.h index c6bd56914d6d..ca995bc9d9ab 100644 --- a/c10/core/DeviceType.h +++ b/c10/core/DeviceType.h @@ -26,16 +26,18 @@ enum class DeviceType : int8_t { Vulkan = 10, // Vulkan Metal = 11, // Metal XPU = 12, // XPU - MLC = 13, // ML Compute / Apple + MPS = 13, // MPS Meta = 14, // Meta (tensors with no data) HPU = 15, // HPU / HABANA VE = 16, // SX-Aurora / NEC Lazy = 17, // Lazy Tensors + IPU = 18, // Graphcore IPU + PrivateUse1 = 19, // PrivateUse1 device // NB: If you add more devices: // - Change the implementations of DeviceTypeName and isValidDeviceType // in DeviceType.cpp // - Change the number below - COMPILE_TIME_MAX_DEVICE_TYPES = 18, + COMPILE_TIME_MAX_DEVICE_TYPES = 20, }; constexpr DeviceType kCPU = DeviceType::CPU; @@ -44,7 +46,7 @@ constexpr DeviceType kHIP = DeviceType::HIP; constexpr DeviceType kFPGA = DeviceType::FPGA; constexpr DeviceType kORT = DeviceType::ORT; constexpr DeviceType kXLA = DeviceType::XLA; -constexpr DeviceType kMLC = DeviceType::MLC; +constexpr DeviceType kMPS = DeviceType::MPS; constexpr DeviceType kMeta = DeviceType::Meta; constexpr DeviceType kVulkan = DeviceType::Vulkan; constexpr DeviceType kMetal = DeviceType::Metal; @@ -52,18 +54,20 @@ constexpr DeviceType kXPU = DeviceType::XPU; constexpr DeviceType kHPU = DeviceType::HPU; constexpr DeviceType kVE = DeviceType::VE; constexpr DeviceType kLazy = DeviceType::Lazy; +constexpr DeviceType kIPU = DeviceType::IPU; +constexpr DeviceType kPrivateUse1 = DeviceType::PrivateUse1; // define explicit int constant constexpr int COMPILE_TIME_MAX_DEVICE_TYPES = static_cast(DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES); static_assert( - COMPILE_TIME_MAX_DEVICE_TYPES <= 18, + COMPILE_TIME_MAX_DEVICE_TYPES <= 20, "Hey! You seem to be adding a lot of new DeviceTypes. The intent was " "for this constant to reflect the actual number of DeviceTypes we support " "in PyTorch; it's important that this number is not too large as we " "use this to allocate stack arrays in some places in our code. 
If you " - "are indeed just adding the 18th device type, feel free to change " + "are indeed just adding the 20th device type, feel free to change " "the check to 32; but if you are adding some sort of extensible device " "types registration, please be aware that you are affecting code that " "this number is small. Try auditing uses of this constant."); diff --git a/c10/core/DispatchKey.cpp b/c10/core/DispatchKey.cpp index 7d2f9e7fcb6c..f06603dc4bc1 100644 --- a/c10/core/DispatchKey.cpp +++ b/c10/core/DispatchKey.cpp @@ -1,14 +1,49 @@ #include +#include #include namespace c10 { +const char* toString(BackendComponent t) { + switch (t) { + case BackendComponent::CPUBit: + return "CPUBit"; + case BackendComponent::CUDABit: + return "CUDABit"; + case BackendComponent::HIPBit: + return "HIPBit"; + case BackendComponent::XLABit: + return "XLABit"; + case BackendComponent::LazyBit: + return "LazyBit"; + case BackendComponent::XPUBit: + return "XPUBit"; + case BackendComponent::IPUBit: + return "IPUBit"; + case BackendComponent::MPSBit: + return "MPSBit"; + case BackendComponent::HPUBit: + return "HPUBit"; + case BackendComponent::VEBit: + return "VEBit"; + case BackendComponent::PrivateUse1Bit: + return "PrivateUse1Bit"; + case BackendComponent::PrivateUse2Bit: + return "PrivateUse2Bit"; + case BackendComponent::PrivateUse3Bit: + return "PrivateUse3Bit"; + case BackendComponent::InvalidBit: + return "InvalidBit"; + default: + return "UNKNOWN_BACKEND_BIT"; + } +} + const char* toString(DispatchKey t) { switch (t) { case DispatchKey::Undefined: return "Undefined"; - case DispatchKey::CPU: return "CPU"; case DispatchKey::CUDA: @@ -21,14 +56,16 @@ const char* toString(DispatchKey t) { return "FPGA"; case DispatchKey::XPU: return "XPU"; + case DispatchKey::IPU: + return "IPU"; case DispatchKey::ORT: return "ORT"; case DispatchKey::XLA: return "XLA"; case DispatchKey::Lazy: return "Lazy"; - case DispatchKey::MLC: - return "MLC"; + case DispatchKey::MPS: + return "MPS"; case DispatchKey::HPU: return "HPU"; case DispatchKey::Vulkan: @@ -64,9 +101,15 @@ const char* toString(DispatchKey t) { case DispatchKey::NestedTensor: return "NestedTensor"; + case DispatchKey::NestedTensorCPU: + return "NestedTensorCPU"; + case DispatchKey::NestedTensorCUDA: + return "NestedTensorCUDA"; case DispatchKey::Python: return "Python"; + case DispatchKey::PythonTLSSnapshot: + return "PythonTLSSnapshot"; case DispatchKey::PrivateUse1: return "PrivateUse1"; @@ -89,6 +132,8 @@ const char* toString(DispatchKey t) { return "Autograd"; case DispatchKey::AutogradCPU: return "AutogradCPU"; + case DispatchKey::AutogradIPU: + return "AutogradIPU"; case DispatchKey::AutogradXPU: return "AutogradXPU"; case DispatchKey::AutogradCUDA: @@ -97,12 +142,10 @@ const char* toString(DispatchKey t) { return "AutogradXLA"; case DispatchKey::AutogradLazy: return "AutogradLazy"; - case DispatchKey::AutogradMLC: - return "AutogradMLC"; + case DispatchKey::AutogradMPS: + return "AutogradMPS"; case DispatchKey::AutogradHPU: return "AutogradHPU"; - case DispatchKey::AutogradNestedTensor: - return "AutogradNestedTensor"; case DispatchKey::AutogradPrivateUse1: return "AutogradPrivateUse1"; case DispatchKey::AutogradPrivateUse2: @@ -111,6 +154,8 @@ const char* toString(DispatchKey t) { return "AutogradPrivateUse3"; case DispatchKey::AutogradOther: return "AutogradOther"; + case DispatchKey::AutogradNestedTensor: + return "AutogradNestedTensor"; case DispatchKey::ZeroTensor: return "ZeroTensor"; @@ -133,6 +178,9 @@ const char* toString(DispatchKey t) { case 
DispatchKey::AutocastCPU: return "AutocastCPU"; + case DispatchKey::AutocastXPU: + return "AutocastXPU"; + case DispatchKey::Batched: return "Batched"; @@ -168,6 +216,21 @@ const char* toString(DispatchKey t) { case DispatchKey::FuncTorchBatched: return "FuncTorchBatched"; + // Out-of-core torchdistX dispatch keys + case DispatchKey::Fake: + return "Fake"; + case DispatchKey::DeferredInit: + return "DeferredInit"; + + case DispatchKey::Dense: + return "Dense"; + case DispatchKey::Quantized: + return "Quantized"; + case DispatchKey::Sparse: + return "Sparse"; + case DispatchKey::AutogradFunctionality: + return "AutogradFunctionality"; + default: return "UNKNOWN_TENSOR_TYPE_ID"; } @@ -176,78 +239,42 @@ const char* toString(DispatchKey t) { std::ostream& operator<<(std::ostream& str, DispatchKey rhs) { return str << toString(rhs); } +std::ostream& operator<<(std::ostream& str, BackendComponent rhs) { + return str << toString(rhs); +} -// for a given backend key, return the associated autograd key. -// for non-backend keys, return AutogradOther as a default. -// Note: it's convenient and fast to return a default here rather than (say) -// returning an optional, or throwing. But it makes callers -// responsible for either a) enforcing the invariant that only backend keys -// be passed as arguments, or b) interpreting our return value carefully. -// -DispatchKey getAutogradKeyFromBackend(DispatchKey t) { - switch (t) { - case DispatchKey::CPU: - return DispatchKey::AutogradCPU; - case DispatchKey::XPU: - return DispatchKey::AutogradXPU; - case DispatchKey::CUDA: - return DispatchKey::AutogradCUDA; - case DispatchKey::XLA: - return DispatchKey::AutogradXLA; - case DispatchKey::Lazy: - return DispatchKey::AutogradLazy; - case DispatchKey::MLC: - return DispatchKey::AutogradMLC; - case DispatchKey::HPU: - return DispatchKey::AutogradHPU; - case DispatchKey::NestedTensor: - return DispatchKey::AutogradNestedTensor; - case DispatchKey::PrivateUse1: - return DispatchKey::AutogradPrivateUse1; - case DispatchKey::PrivateUse2: - return DispatchKey::AutogradPrivateUse2; - case DispatchKey::PrivateUse3: - return DispatchKey::AutogradPrivateUse3; - default: - return DispatchKey::AutogradOther; - } +DispatchKey getAutogradKeyFromBackend(BackendComponent k) { + // We want this to return an autograd key. We're relying on the fact that + // getAutogradRelatedKeySetFromBackend returns an autograd key + + // ADInplaceOrView, and autograd has higher precedence. The core mapping from + // backend -> autograd key lives in `getAutogradRelatedKeySetFromBackend` + // instead of here for performance. `getAutogradRelatedKeySetFromBackend` is a + // hotpath function, and we want to make sure that it doesn't have to + // construct any DispatchKeySets at runtime. 
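// Illustrative sketch (not part of the diff), assuming the constexpr
// getAutogradRelatedKeySetFromBackend() declared in DispatchKeySet.h behaves as
// the comment above describes (per-backend autograd key + ADInplaceOrView, with
// the autograd key at higher priority). The net mapping is then per-backend:
//
//   getAutogradKeyFromBackend(BackendComponent::CPUBit)   // -> DispatchKey::AutogradCPU
//   getAutogradKeyFromBackend(BackendComponent::CUDABit)  // -> DispatchKey::AutogradCUDA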
+ return getAutogradRelatedKeySetFromBackend(k).highestPriorityTypeId(); } c10::DispatchKey parseDispatchKey(const std::string& k) { static std::unordered_map key_map = { {"Undefined", c10::DispatchKey::Undefined}, - {"CPU", c10::DispatchKey::CPU}, - {"CUDA", c10::DispatchKey::CUDA}, - {"HIP", c10::DispatchKey::HIP}, + {"Dense", c10::DispatchKey::Dense}, {"FPGA", c10::DispatchKey::FPGA}, {"ORT", c10::DispatchKey::ORT}, - {"XLA", c10::DispatchKey::XLA}, - {"MLC", c10::DispatchKey::MLC}, + {"MPS", c10::DispatchKey::MPS}, {"Vulkan", c10::DispatchKey::Vulkan}, {"Metal", c10::DispatchKey::Metal}, - {"XPU", c10::DispatchKey::XPU}, - {"HPU", c10::DispatchKey::HPU}, {"VE", c10::DispatchKey::VE}, - {"Lazy", c10::DispatchKey::Lazy}, {"Meta", c10::DispatchKey::Meta}, - {"QuantizedCPU", c10::DispatchKey::QuantizedCPU}, - {"QuantizedCUDA", c10::DispatchKey::QuantizedCUDA}, - {"QuantizedXPU", c10::DispatchKey::QuantizedXPU}, + {"Quantized", c10::DispatchKey::Quantized}, {"CustomRNGKeyId", c10::DispatchKey::CustomRNGKeyId}, {"MkldnnCPU", c10::DispatchKey::MkldnnCPU}, - {"SparseCPU", c10::DispatchKey::SparseCPU}, - {"SparseCUDA", c10::DispatchKey::SparseCUDA}, - {"SparseHIP", c10::DispatchKey::SparseHIP}, - {"SparseXPU", c10::DispatchKey::SparseXPU}, - {"SparseVE", c10::DispatchKey::SparseVE}, + {"Sparse", c10::DispatchKey::Sparse}, {"SparseCsrCPU", c10::DispatchKey::SparseCsrCPU}, {"SparseCsrCUDA", c10::DispatchKey::SparseCsrCUDA}, - {"NestedTensor", c10::DispatchKey::NestedTensor}, - {"PrivateUse1", c10::DispatchKey::PrivateUse1}, - {"PrivateUse2", c10::DispatchKey::PrivateUse2}, - {"PrivateUse3", c10::DispatchKey::PrivateUse3}, {"BackendSelect", c10::DispatchKey::BackendSelect}, {"Python", c10::DispatchKey::Python}, + {"PythonTLSSnapshot", c10::DispatchKey::PythonTLSSnapshot}, + {"Fake", c10::DispatchKey::Fake}, {"Named", c10::DispatchKey::Named}, {"Conjugate", c10::DispatchKey::Conjugate}, {"Negative", c10::DispatchKey::Negative}, @@ -256,30 +283,62 @@ c10::DispatchKey parseDispatchKey(const std::string& k) { c10::DispatchKey::FuncTorchDynamicLayerBackMode}, {"ADInplaceOrView", c10::DispatchKey::ADInplaceOrView}, {"AutogradOther", c10::DispatchKey::AutogradOther}, - {"AutogradCPU", c10::DispatchKey::AutogradCPU}, - {"AutogradCUDA", c10::DispatchKey::AutogradCUDA}, - {"AutogradXLA", c10::DispatchKey::AutogradXLA}, - {"AutogradLazy", c10::DispatchKey::AutogradLazy}, - {"AutogradXPU", c10::DispatchKey::AutogradXPU}, - {"AutogradMLC", c10::DispatchKey::AutogradMLC}, - {"AutogradHPU", c10::DispatchKey::AutogradHPU}, + {"AutogradFunctionality", c10::DispatchKey::AutogradFunctionality}, {"AutogradNestedTensor", c10::DispatchKey::AutogradNestedTensor}, - {"AutogradPrivateUse1", c10::DispatchKey::AutogradPrivateUse1}, - {"AutogradPrivateUse2", c10::DispatchKey::AutogradPrivateUse2}, - {"AutogradPrivateUse3", c10::DispatchKey::AutogradPrivateUse3}, {"Tracer", c10::DispatchKey::Tracer}, {"AutocastCPU", c10::DispatchKey::AutocastCPU}, + {"AutocastXPU", c10::DispatchKey::AutocastXPU}, {"AutocastCUDA", c10::DispatchKey::AutocastCUDA}, {"FuncTorchBatched", c10::DispatchKey::FuncTorchBatched}, {"FuncTorchVmapMode", c10::DispatchKey::FuncTorchVmapMode}, {"Batched", c10::DispatchKey::Batched}, {"VmapMode", c10::DispatchKey::VmapMode}, + {"DeferredInit", c10::DispatchKey::DeferredInit}, {"FuncTorchGradWrapper", c10::DispatchKey::FuncTorchGradWrapper}, {"FuncTorchDynamicLayerFrontMode", c10::DispatchKey::FuncTorchDynamicLayerFrontMode}, {"TESTING_ONLY_GenericWrapper", c10::DispatchKey::TESTING_ONLY_GenericWrapper}, 
{"TESTING_ONLY_GenericMode", c10::DispatchKey::TESTING_ONLY_GenericMode}, + + {"CPU", c10::DispatchKey::CPU}, + {"CUDA", c10::DispatchKey::CUDA}, + {"HIP", c10::DispatchKey::HIP}, + {"XLA", c10::DispatchKey::XLA}, + {"MPS", c10::DispatchKey::MPS}, + {"XPU", c10::DispatchKey::XPU}, + {"IPU", c10::DispatchKey::IPU}, + {"HPU", c10::DispatchKey::HPU}, + {"Lazy", c10::DispatchKey::Lazy}, + {"NestedTensor", c10::DispatchKey::NestedTensor}, + {"NestedTensorCPU", c10::DispatchKey::NestedTensorCPU}, + {"NestedTensorCUDA", c10::DispatchKey::NestedTensorCUDA}, + {"PrivateUse1", c10::DispatchKey::PrivateUse1}, + {"PrivateUse2", c10::DispatchKey::PrivateUse2}, + {"PrivateUse3", c10::DispatchKey::PrivateUse3}, + + {"QuantizedCPU", c10::DispatchKey::QuantizedCPU}, + {"QuantizedCUDA", c10::DispatchKey::QuantizedCUDA}, + {"QuantizedXPU", c10::DispatchKey::QuantizedXPU}, + + {"SparseCPU", c10::DispatchKey::SparseCPU}, + {"SparseCUDA", c10::DispatchKey::SparseCUDA}, + {"SparseHIP", c10::DispatchKey::SparseHIP}, + {"SparseXPU", c10::DispatchKey::SparseXPU}, + {"SparseVE", c10::DispatchKey::SparseVE}, + + {"AutogradCPU", c10::DispatchKey::AutogradCPU}, + {"AutogradCUDA", c10::DispatchKey::AutogradCUDA}, + {"AutogradXLA", c10::DispatchKey::AutogradXLA}, + {"AutogradLazy", c10::DispatchKey::AutogradLazy}, + {"AutogradIPU", c10::DispatchKey::AutogradIPU}, + {"AutogradXPU", c10::DispatchKey::AutogradXPU}, + {"AutogradMPS", c10::DispatchKey::AutogradMPS}, + {"AutogradHPU", c10::DispatchKey::AutogradHPU}, + {"AutogradPrivateUse1", c10::DispatchKey::AutogradPrivateUse1}, + {"AutogradPrivateUse2", c10::DispatchKey::AutogradPrivateUse2}, + {"AutogradPrivateUse3", c10::DispatchKey::AutogradPrivateUse3}, + {"Autograd", c10::DispatchKey::Autograd}, {"CompositeImplicitAutograd", c10::DispatchKey::CompositeImplicitAutograd}, diff --git a/c10/core/DispatchKey.h b/c10/core/DispatchKey.h index 1bb8268e2bd0..5380bfb319b7 100644 --- a/c10/core/DispatchKey.h +++ b/c10/core/DispatchKey.h @@ -9,20 +9,99 @@ namespace c10 { +// Semantically, each value of BackendComponent identifies a "backend" for our +// dispatch. Some functionalities that we may dispatch to are allowed to +// register different handlers for each backend. The BackendComponent is then +// used to figure out which backend implementation to dispatch to. + +// In implementation terms, the backend component identifies a specific "bit" in +// a DispatchKeySet. The bits in the DispatchKeySet are split between the bottom +// ~12 "BackendComponent" bits, while the remaining upper bits are assigned to +// functionalities. When we encounter a functionality bit that is known to be +// customizeable per-backend, then we also look at the lower BackendComponent +// bits and take the highest bit to determine which backend's implementation to +// use. + +enum class BackendComponent : uint8_t { + + // A "backend" is colloquially used to refer to handlers for dispatch + // which actually implement the numerics of an operation in question. + // + // Due to the nature of the enum, these backends are specified in + // an ordered way, but for most backends this order is not semantically + // meaningful (e.g., it's valid to reorder these backends without changing + // semantics). The only situation when backend ordering is meaningful + // is when the backend participates in multiple dispatch with another + // backend; e.g., CPU and CUDA (cuda must have higher priority). + + // These keys don't correspond to individual kernels. 
+ // Instead, they represent the backends that are allowed to override specific + // pieces of functionality: + // - dense kernels (e.g. DispatchKey::CPU) + // - sparse kernels (e.g. DispatchKey::SparseCPU) + // - quantized kernels (e.g. DispatchKey::QuantizedCPU) + // - autograd kernels (e.g. DispatchKey::AutogradCPU) + // We reserve space in the runtime operator table for this full cross product + // of + // [backends in this enum] x [keys below that are explicitly marked as having + // per-backend functionality] + + InvalidBit = 0, + CPUBit, + CUDABit, + HIPBit, + XLABit, + MPSBit, + IPUBit, + XPUBit, + HPUBit, + VEBit, + LazyBit, + PrivateUse1Bit, + PrivateUse2Bit, + PrivateUse3Bit, + // Define an alias to represent end of backend dispatch keys. + // If you add new backend keys after PrivateUse3, please also update it here. + // (But you shouldn't: private use keys should have higher precedence than + // all built-in keys) + EndOfBackendKeys = PrivateUse3Bit, +}; + // Semantically, a dispatch key identifies a possible "level" in our -// dispatch, for which a handler may be registered. Traditional -// backends like CPU and CUDA get dispatch keys; however, so do -// "wrapping" layers like Variable (for autograd handling). +// dispatch, for which a handler may be registered. Each handler corresponds +// to a type of functionality. // // In implementation terms, the dispatch key identifies a specific "bit" in a // DispatchKeySet. Higher bit indexes get handled by dispatching first (because // we "count leading zeros" when we extract the highest priority dispatch // key.) // -// NOTE: Keep the list in sync with `DispatchKey` in tools/codegen/model.py -enum class DispatchKey : uint8_t { +// Note [DispatchKey Classification] +// This enum actually contains several types of keys, which are explained +// in more detail further down: +// (1) non-customizable backends (e.g. FPGA) +// (2) non-customizable functionalities (e.g. Functionalize) +// (3) functionalized that are customizable per backend (e.g. Dense, Sparse, +// AutogradFunctionality) (4) per-backend instances of customizable +// functionalities (e.g. CPU, SparseCPU, AutogradCPU) (5) alias keys (e.g. +// CompositeImplicitAutograd) +// +// Of the categories above, it's important to note: +// (a) which keys are assigned individual bits in a DispatchKeySet +// (b) which keys are assigned individual slots in the runtime operator table +// ("Runtime keys") +// +// (1), (2) and (3) all get their own dedicated bits in the DispatchKeySet. +// (1), (2) and (4) all get their own dedicated slots in the runtime operator +// table. + +// See Note [DispatchKeySet Internal Representation] for more details. +// +// NOTE: Keep the list in sync with `DispatchKey` in torchgen/model.py +enum class DispatchKey : uint16_t { + // ~~~~~~~~~~~~~~~~~~~~~~~~~~ UNDEFINED ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // - // This is not a "real" tensor id, but it exists to give us a "nullopt" + // This is not a "real" functionality, but it exists to give us a "nullopt" // element we can return for cases when a DispatchKeySet contains no elements. // You can think a more semantically accurate definition of DispatchKey is: // @@ -38,24 +117,31 @@ enum class DispatchKey : uint8_t { // this will get eliminated, but for now it's convenient) CatchAll = Undefined, - // ~~~~~~~~~~~~~~~~~~~~~~~~~~ BACKENDS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // - // A "backend" is colloquially used to refer to handlers for dispatch - // which actually implement the numerics of an operation in question. 
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~ Functionality Keys ~~~~~~~~~~~~~~~~~~~~~~ // + // Every value in the enum (up to EndOfFunctionalityKeys) + // corresponds to an individual "functionality" that can be dispatched to. + // This is represented in the DispatchKeySet by assigning each of these enum + // values + // to each of the remaining (64 - len(BackendComponent)) bits. // - // Due to the nature of the enum, these backends are specified in - // an ordered way, but for most backends this order is not semantically - // meaningful (e.g., it's valid to reorder these backends without changing - // semantics). The only situation when backend ordering is meaningful - // is when the backend participates in multiple dispatch with another - // backend; e.g., CPU and SparseCPU (sparse must have - // higher priority). + // Most of these functionalities have a single handler assigned to them, + // making them "runtime keys". + // That map to a single slot in the runtime operator table. + // + // A few functionalities are allowed to be customizable per backend. + // See [Note: Per-Backend Functionality Dispatch Keys] for details. + + // See [Note: Per-Backend Functionality Dispatch Keys] + Dense, + + // Below are non-extensible backends. + // These are backends that currently don't have their own overrides for + // Autograd/Sparse/Quantized kernels, + // and we therefore don't waste space in the runtime operator table allocating + // space for them. + // If any of these backends ever need to customize, e.g., Autograd, then we'll + // need to add a DispatchKey::*Bit for them. - // Here are backends which you think of as traditionally specifying - // how to implement operations on some device. - CPU, // registered at build/aten/src/ATen/RegisterCPU.cpp - CUDA, // registered at build/aten/src/ATen/RegisterCUDA.cpp - HIP, // NB: I think this is not actually used, due to Note [Masquerading as - // CUDA] FPGA, // Xilinx support lives out of tree at // https://gitlab.com/pytorch-complex/vitis_kernels @@ -67,14 +153,8 @@ enum class DispatchKey : uint8_t { // - aten/src/ATen/test/extension_backend_test.cpp ORT, - XLA, // lives out of tree at https://github.com/pytorch/xla - MLC, // lives out of tree at https://github.com/pytorch/MLCompute Vulkan, Metal, - XPU, // For out of tree Intel's heterogeneous computing plug-in - HPU, // For out of tree & closed source integration of HPU / Habana - VE, // For out of tree & closed source integration of SX-Aurora / NEC - Lazy, // For lazy tensor backends // A meta tensor is a tensor without any data associated with it. (They // have also colloquially been referred to as tensors on the "null" device). @@ -83,11 +163,8 @@ enum class DispatchKey : uint8_t { // tensor with the output shape and dtype, but wouldn't actually add anything. Meta, - // Here are backends which specify more specialized operators - // based on the dtype of the tensor. - QuantizedCPU, // registered at build/aten/src/ATen/RegisterQuantizedCPU.cpp - QuantizedCUDA, // registered at build/aten/src/ATen/RegisterQuantizedCUDA.cpp - QuantizedXPU, // For out of tree Intel's heterogeneous computing plug-in + // See [Note: Per-Backend Functionality Dispatch Keys] + Quantized, // This backend is to support custom RNGs; it lets you go // to a different kernel if you pass in a generator that is not a @@ -106,30 +183,28 @@ enum class DispatchKey : uint8_t { // the corresponding dense tensors, and must be handled before them. 
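// Illustrative contrast (not part of the diff), assuming the DispatchKeySet
// constructor and has() added later in this patch: a non-customizable backend
// such as FPGA maps to a single functionality-style bit and a single runtime
// slot, whereas a per-backend runtime key (declared further down) decomposes
// into a functionality bit plus a backend bit:
//
//   auto fpga_ks = DispatchKeySet(DispatchKey::FPGA);         // one bit, one slot
//   auto qcpu_ks = DispatchKeySet(DispatchKey::QuantizedCPU); // Quantized bit + CPUBit
//   qcpu_ks.has(DispatchKey::Quantized);                      // true
//   qcpu_ks.has(DispatchKey::QuantizedCPU);                   // true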
MkldnnCPU, // registered at build/aten/src/ATen/RegisterMkldnnCPU.cpp // NB: not to be confused with MKLDNN, which is Caffe2 only - SparseCPU, // registered at build/aten/src/ATen/RegisterSparseCPU.cpp - SparseCUDA, // registered at build/aten/src/ATen/RegisterSparseCUDA.cpp - SparseHIP, // TODO: I think this is not actually used, due to Note - // [Masquerading as CUDA] - SparseXPU, // For out of tree Intel's heterogeneous computing plug-in - SparseVE, // For out of tree & closed source integration of SX-Aurora / NEC + + // See [Note: Per-Backend Functionality Dispatch Keys] + Sparse, SparseCsrCPU, SparseCsrCUDA, - NestedTensor, // lives out of tree at https://github.com/pytorch/nestedtensor - - // Here are reserved backends for user-defined backends, see Note [Private use - // DispatchKey] - // To see some example about how to use this, check out ORT - PrivateUse1, - PrivateUse2, - PrivateUse3, + // Note [Non-Customizable Backend Keys] + // Every key above here is considered a "non-customizable backend". + // These are backends that will work correctly with autograd, but + // but currently don't require separate implementations + // for autograd sparse or quantized kernels. + // Any new backends that don't need to be customized should go above here. + // If an existing backend needs to e.g. override autograd, then we can + // consider promoting it into the "BackendComponent" enum + // + // For all intents and purposes from the perspective of DispatchKeySet, + // "non-customizable backend" keys are treated the same way + // as other functionality keys + EndOfNonCustomizableBackends = SparseCsrCUDA, - // Define an alias key to represent end of backend dispatch keys. - // If you add new backend keys after PrivateUse3, please also update it here. - // (But you shouldn't: private use keys should have higher precedence than - // all built-in keys) - EndOfBackendKeys = PrivateUse3, + NestedTensor, // In some situations, it is not immediately obvious what the correct // backend for function is, because the function in question doesn't @@ -140,6 +215,10 @@ enum class DispatchKey : uint8_t { Python, + // Out-of-core key for Fake Tensor in torchdistx. + // See https://pytorch.org/torchdistx/latest/fake_tensor.html + Fake, + // The named dispatch key is set for any tensors with named dimensions. // Although we have a dispatch key for named tensors, for historical reasons, // this dispatch key doesn't do any of the substantive functionality for named @@ -233,26 +312,25 @@ enum class DispatchKey : uint8_t { // AutogradOther key. We can add specific autograd key for those backends // upon request. AutogradOther, - AutogradCPU, - AutogradCUDA, - AutogradXLA, - AutogradLazy, - AutogradXPU, - AutogradMLC, - AutogradHPU, - AutogradNestedTensor, // lives out of tree at + + // See [Note: Per-Backend Functionality Dispatch Keys] + AutogradFunctionality, + + // NestedTensor is an example of something that isn't a "real backend" + // (because it mostly consists of redispatching kernels) + // but it would like to override autograd functionality in C++. + // We can handle cases like this by adding an extra functionality key + // exclusively for handling autograd for NestedTensor. 
+ // lives out of tree at // https://github.com/pytorch/nestedtensor - // Here are some reserved pre-autograd keys for user-defined backends, see - // Note [Private use DispatchKey] - AutogradPrivateUse1, - AutogradPrivateUse2, - AutogradPrivateUse3, + AutogradNestedTensor, Tracer, // Autocasting precedes VariableTypeId, to ensure casts are autograd-exposed // and inputs are saved for backward in the post-autocast type. AutocastCPU, + AutocastXPU, // Naughtily, AutocastCUDA is also being used for XLA. In the terminal state, // it probably should get its own Autocast key AutocastCUDA, @@ -274,12 +352,25 @@ enum class DispatchKey : uint8_t { VmapMode, FuncTorchGradWrapper, // See Note [Out-of-tree vmap+grad prototype] + // Alias and mutation removal. // If some backends want to opt into only alias removal or only mutation // removal, // we can consider adding separate keys dedicated to those individual passes. // See Note [Functionalization Pass In Core] for details. Functionalize, + + // Out-of-core key for Deferred Module Initialization in torchdistx. + // See https://pytorch.org/torchdistx/latest/deferred_init.html + DeferredInit, + + // Used by Python key logic to know the set of tls on entry to the dispatcher + // This kernel assumes it is the top-most non-functorch-related DispatchKey. + // If you add a key above, make sure to update the fallback implementation for + // this. + PythonTLSSnapshot, + + // This key should be at the very top of the dispatcher FuncTorchDynamicLayerFrontMode, // See Note [Out-of-tree vmap+grad prototype] // TESTING: This is intended to be a generic testing tensor type id. @@ -299,9 +390,128 @@ enum class DispatchKey : uint8_t { TESTING_ONLY_GenericMode, // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FIN ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // - NumDispatchKeys, // Sentinel, end of runtime keys. + EndOfFunctionalityKeys, // End of functionality keys. + + // ~~~~~~~~~~~~~~ "Dense" Per-Backend Dispatch keys ~~~~~~~~~~~~~~~~~~~~ // + // Here are backends which you think of as traditionally specifying + // how to implement operations on some device. + + // See Note [The Ordering of Per-Backend Dispatch Keys Matters!] + StartOfDenseBackends, + CPU, // registered at build/aten/src/ATen/RegisterCPU.cpp + CUDA, // registered at build/aten/src/ATen/RegisterCUDA.cpp + HIP, // NB: I think this is not actually used, due to Note [Masquerading as + // CUDA] + XLA, // lives out of tree at https://github.com/pytorch/xla + MPS, // registered at build/aten/src/ATen/RegisterMPS.cpp + IPU, // lives out of tree at https://github.com/graphcore/poptorch + XPU, // For out of tree Intel's heterogeneous computing plug-in + HPU, // For out of tree & closed source integration of HPU / Habana + VE, // For out of tree & closed source integration of SX-Aurora / NEC + Lazy, // For lazy tensor backends + // Here are reserved backends for user-defined backends, see Note [Private use + // DispatchKey] + // To see some example about how to use this, check out ORT + PrivateUse1, + PrivateUse2, + PrivateUse3, + EndOfDenseBackends = PrivateUse3, + + // ~~~~~~~~~~~~~~ "Quantized" Per-Backend Dispatch keys ~~~~~~~~~~~~~~~~ // + // keys starting with an _ are not currently used, + // but are needed to ensure that every backend is indexed correctly. + + // See Note [The Ordering of Per-Backend Dispatch Keys Matters!] 
+ StartOfQuantizedBackends, + QuantizedCPU, // registered at build/aten/src/ATen/RegisterQuantizedCPU.cpp + QuantizedCUDA, // registered at build/aten/src/ATen/RegisterQuantizedCUDA.cpp + _QuantizedHIP, + _QuantizedXLA, + _QuantizedMPS, + _QuantizedIPU, + QuantizedXPU, // For out of tree Intel's heterogeneous computing plug-in + _QuantizedHPU, + _QuantizedVE, + _QuantizedLazy, + _QuantizedPrivateUse1, + _QuantizedPrivateUse2, + _QuantizedPrivateUse3, + EndOfQuantizedBackends = _QuantizedPrivateUse3, + + // ~~~~~~~~~~~~~~ "Sparse" Per-Backend Dispatch keys ~~~~~~~~~~~~~~~~~~~ // + // keys starting with an _ are not currently used, + // but are needed to ensure that every backend is indexed correctly. + + // See Note [The Ordering of Per-Backend Dispatch Keys Matters!] + StartOfSparseBackends, + SparseCPU, // registered at build/aten/src/ATen/RegisterSparseCPU.cpp + SparseCUDA, // registered at build/aten/src/ATen/RegisterSparseCUDA.cpp + SparseHIP, // TODO: I think this is not actually used, due to Note + // [Masquerading as CUDA] + _SparseXLA, + _SparseMPS, + _SparseIPU, + SparseXPU, // For out of tree Intel's heterogeneous computing plug-in + _SparseHPU, + SparseVE, // For out of tree & closed source integration of SX-Aurora / NEC + _SparseLazy, + _SparsePrivateUse1, + _SparsePrivateUse2, + _SparsePrivateUse3, + EndOfSparseBackends = _SparsePrivateUse3, + + // ~~~~~~~~~~~~~~ "NestedTensor" Per-Backend Dispatch keys ~~~~~~~~~~~~~~~~~~~ + // // + // keys starting with an _ are not currently used, + // but are needed to ensure that every backend is indexed correctly. + + // See Note [The Ordering of Per-Backend Dispatch Keys Matters!] + StartOfNestedTensorBackends, + // registered at build/aten/src/ATen/RegisterNestedTensorCPU.cpp + NestedTensorCPU, + // registered at build/aten/src/ATen/RegisterNestedTensorCUDA.cpp + NestedTensorCUDA, + _NestedTensorHIP, + _NestedTensorXLA, + _NestedTensorMPS, + _NestedTensorIPU, + _NestedTensorXPU, + _NestedTensorHPU, + _NestedTensorVE, + _NestedTensorLazy, + _NestedTensorPrivateUse1, + _NestedTensorPrivateUse2, + _NestedTensorPrivateUse3, + EndOfNestedTensorBackends = _NestedTensorPrivateUse3, + + // ~~~~~~~~~~~~~~ "Autograd" Per-Backend Dispatch keys ~~~~~~~~~~~~~~~~~ // + // keys starting with an _ are not currently used, + // but are needed to ensure that every backend is indexed correctly. + + // See Note [The Ordering of Per-Backend Dispatch Keys Matters!] + StartOfAutogradBackends, + AutogradCPU, + AutogradCUDA, + _AutogradHIP, + AutogradXLA, + AutogradMPS, + AutogradIPU, + AutogradXPU, + AutogradHPU, + _AutogradVE, + AutogradLazy, + // Here are some reserved pre-autograd keys for user-defined backends, see + // Note [Private use DispatchKey] + AutogradPrivateUse1, + AutogradPrivateUse2, + AutogradPrivateUse3, + EndOfAutogradBackends = AutogradPrivateUse3, + // If we add a new per-backend functionality key that has higher priority + // than Autograd, then this key should be updated. + EndOfRuntimeBackendKeys = EndOfAutogradBackends, // ~~~~~~~~~~~~~~~~~~~~~~ Alias Dispatch Keys ~~~~~~~~~~~~~~~~~~~~~~~~~~ // + // Note [Alias Dispatch Keys] // Alias dispatch keys are synthetic dispatch keys which map to multiple // runtime dispatch keys. Alisa keys have precedence, but they are always // lower precedence than runtime keys. You can register a kernel to an @@ -321,6 +531,7 @@ enum class DispatchKey : uint8_t { // Define an alias key to represent end of alias dispatch keys. // If you add new alias keys after Autograd, please also update it here. 
+ StartOfAliasKeys = Autograd, EndOfAliasKeys = CompositeExplicitAutograd, // // ~~~~~~~~~~~~~~~~~~~~~~~~~ BC ALIASES ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // @@ -360,54 +571,84 @@ enum class DispatchKey : uint8_t { // built-in autograd formulas for operators are not appropriate. static_assert( - static_cast(DispatchKey::NumDispatchKeys) < 64, - "DispatchKey is used as index into 64-bit bitmask; you must have less than 64 entries"); + (static_cast(BackendComponent::EndOfBackendKeys) + + static_cast(DispatchKey::EndOfFunctionalityKeys)) <= 64, + "The BackendComponent and DispatchKey enums (below EndOfFunctionalityKeys)" + " both map to backend and functionality bits" + " into a 64-bit bitmask; you must have less than 64 total entries between them"); -#if defined(C10_MOBILE_TRIM_DISPATCH_KEYS) -/** - * The method below maps the dispatch key in the enum DispatchKey to an - * integer index in the dispatchTable_ array in OperatorEntry. The array - * is trimmed for mobile to reduce peak memory usage since it's - * unnecessary to reserve additional space for dispatch keys that will - * never be used on mobile. - */ -C10_API constexpr int getDispatchTableIndexForDispatchKey(DispatchKey dk) { - switch (dk) { - case DispatchKey::Undefined: - return 0; - case DispatchKey::CPU: - return 1; - case DispatchKey::QuantizedCPU: - return 2; - case DispatchKey::SparseCPU: - return 3; - case DispatchKey::BackendSelect: - return 4; - case DispatchKey::ADInplaceOrView: - return 5; - case DispatchKey::AutogradOther: - return 6; - case DispatchKey::AutogradCPU: - return 7; - case DispatchKey::NumDispatchKeys: // Sentinel, end of runtime keys. - return 8; - default: - return -1; +// Check if a DispatchKey is an alias mapping to other runtime keys. +constexpr bool isAliasDispatchKey(DispatchKey k) { + return k >= DispatchKey::StartOfAliasKeys && k <= DispatchKey::EndOfAliasKeys; +} + +// [Note: Per-Backend Functionality Dispatch Keys] +// Check if a DispatchKey is a per-backend functionality key +// Any functionalities that can be customized per-backend should be added here. +// These keys correspond to functionalities that can be customized indivually +// per backend. While they only take up one bit in the `DispatchKeySet` bitset, +// they map to (# backends) slots in the operator table. +// Each of these keys also has a separate set of "runtime keys" in the dispatch +// key enum, per backend, which *do* map to the individual operator table slots. +// For example, the "Sparse" key maps to an individual bit in the +// DispatchKeySet, while `SparseCPU`, `SparseCUDA`, etc all map to individual +// slots in the runtime operator table. + +constexpr bool isPerBackendFunctionalityKey(DispatchKey k) { + if (k == DispatchKey::Dense || k == DispatchKey::Quantized || + k == DispatchKey::Sparse || k == DispatchKey::AutogradFunctionality || + k == DispatchKey::NestedTensor) { + return true; + } else { + return false; } } -#else -/** - * For the server use-case, make this a simple pass-through. - */ -C10_API constexpr int getDispatchTableIndexForDispatchKey(DispatchKey dk) { - return static_cast(dk); + +// Note that this includes Undefined in the total count. +// BUT EndOfFunctionalityKeys is its own (placeholder) key. +// e.g. Undefined=0, Dense=1, Sparse=2, EndOfFunctionalityKeys=3. +// In the above example, there are 3 total functionality keys. 
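// Worked numbers for the constants below (derived from the enums above, not
// stated in the diff itself): isPerBackendFunctionalityKey() names five keys
// (Dense, Quantized, Sparse, NestedTensor, AutogradFunctionality) and
// BackendComponent defines 13 backend bits, so on non-mobile builds
//
//   num_runtime_entries = num_functionality_keys + 5 * (13 - 1)
//
// i.e. each per-backend functionality trades its single slot for one runtime
// operator-table slot per backend.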
+constexpr uint8_t num_functionality_keys = + static_cast(DispatchKey::EndOfFunctionalityKeys); + +constexpr uint8_t num_backends = + static_cast(BackendComponent::EndOfBackendKeys); + +// Note [No More Than 16 Backends] +// Search for this note to find places in the code where the "no more than 16 +// backends" invariant is baked in. +static_assert( + static_cast(BackendComponent::EndOfBackendKeys) <= 16, + "BackendComponent currently only supports <= 16 backends. If we really need to extend this, \ +there are a few places where this invariant is baked in"); + +constexpr uint8_t numPerBackendFunctionalityKeys() { + uint8_t count = 0; + for (uint8_t k = 0; k <= num_functionality_keys; ++k) { + if (isPerBackendFunctionalityKey(static_cast(k))) + ++count; + } + return count; } + +#if defined(C10_MOBILE_TRIM_DISPATCH_KEYS) +// See [Note: Trimmed Mobile Dispatch Keys] +constexpr uint16_t num_runtime_entries = 8; +#else +constexpr uint16_t num_runtime_entries = num_functionality_keys + + (numPerBackendFunctionalityKeys() * (num_backends - 1)); #endif +// See Note [No More Than 16 Backends] +constexpr uint16_t full_backend_mask = + (static_cast(1) << num_backends) - 1; + C10_API const char* toString(DispatchKey); +C10_API const char* toString(BackendComponent); C10_API std::ostream& operator<<(std::ostream&, DispatchKey); +C10_API std::ostream& operator<<(std::ostream&, BackendComponent); -C10_API DispatchKey getAutogradKeyFromBackend(DispatchKey t); +C10_API DispatchKey getAutogradKeyFromBackend(BackendComponent k); // Parses a string into a dispatch key. // If the string cannot be correctly parsed, throws an exception. @@ -420,10 +661,100 @@ C10_API c10::DispatchKey parseDispatchKey(const std::string& k); // torch::dispatch(torch::kCPU, ...) is also valid. constexpr DispatchKey kAutograd = DispatchKey::Autograd; -// Check if a DispatchKey is an alias mapping to other runtime keys. -inline bool isAliasDispatchKey(DispatchKey k) { - return k > DispatchKey::NumDispatchKeys && k <= DispatchKey::EndOfAliasKeys; +// See Note [The Ordering of Per-Backend Dispatch Keys Matters!] +// This function relies on the invariant that the dispatch keys between +// StartOfDenseBackends and EndOfRuntimeBackendKeys are ordered by backend +// in the same order as `BackendComponent`. 
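// Concrete examples of the round trip that invariant guarantees (illustrative,
// using the enum values laid out above; toFunctionalityKey and
// toRuntimePerBackendFunctionalityKey are defined just below):
//
//   toBackendComponent(DispatchKey::SparseCUDA)  == BackendComponent::CUDABit
//   toFunctionalityKey(DispatchKey::SparseCUDA)  == DispatchKey::Sparse
//   toRuntimePerBackendFunctionalityKey(
//       DispatchKey::Sparse, BackendComponent::CUDABit) == DispatchKey::SparseCUDA
//
// because SparseCUDA sits at StartOfSparseBackends + 2 and CUDABit == 2.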
+constexpr BackendComponent toBackendComponent(DispatchKey k) { + if (k >= DispatchKey::StartOfDenseBackends && + k <= DispatchKey::EndOfDenseBackends) { + return static_cast( + static_cast(k) - + static_cast(DispatchKey::StartOfDenseBackends)); + } else if ( + k >= DispatchKey::StartOfQuantizedBackends && + k <= DispatchKey::EndOfQuantizedBackends) { + return static_cast( + static_cast(k) - + static_cast(DispatchKey::StartOfQuantizedBackends)); + } else if ( + k >= DispatchKey::StartOfSparseBackends && + k <= DispatchKey::EndOfSparseBackends) { + return static_cast( + static_cast(k) - + static_cast(DispatchKey::StartOfSparseBackends)); + } else if ( + k >= DispatchKey::StartOfNestedTensorBackends && + k <= DispatchKey::EndOfNestedTensorBackends) { + return static_cast( + static_cast(k) - + static_cast(DispatchKey::StartOfNestedTensorBackends)); + } else if ( + k >= DispatchKey::StartOfAutogradBackends && + k <= DispatchKey::EndOfAutogradBackends) { + return static_cast( + static_cast(k) - + static_cast(DispatchKey::StartOfAutogradBackends)); + } else { + return BackendComponent::InvalidBit; + } +} + +constexpr DispatchKey toFunctionalityKey(DispatchKey k) { + if (k <= DispatchKey::EndOfFunctionalityKeys) { + return k; + } else if (k <= DispatchKey::EndOfDenseBackends) { + return DispatchKey::Dense; + } else if (k <= DispatchKey::EndOfQuantizedBackends) { + return DispatchKey::Quantized; + } else if (k <= DispatchKey::EndOfSparseBackends) { + return DispatchKey::Sparse; + } else if (k <= DispatchKey::EndOfNestedTensorBackends) { + return DispatchKey::NestedTensor; + } else if (k <= DispatchKey::EndOfAutogradBackends) { + return DispatchKey::AutogradFunctionality; + } else { + return DispatchKey::Undefined; + } } + +// Given (DispatchKey::Dense, BackendComponent::CUDABit), returns +// DispatchKey::CUDA. +// See Note [The Ordering of Per-Backend Dispatch Keys Matters!] +// This function relies on the invariant that the dispatch keys between +// StartOfDenseBackends and EndOfRuntimeBackendKeys are ordered by backend +// in the same order as `BackendComponent`. +constexpr DispatchKey toRuntimePerBackendFunctionalityKey( + DispatchKey functionality_k, + BackendComponent backend_k) { + if (functionality_k == DispatchKey::Dense) { + return static_cast( + static_cast(DispatchKey::StartOfDenseBackends) + + static_cast(backend_k)); + } + if (functionality_k == DispatchKey::Sparse) { + return static_cast( + static_cast(DispatchKey::StartOfSparseBackends) + + static_cast(backend_k)); + } + if (functionality_k == DispatchKey::Quantized) { + return static_cast( + static_cast(DispatchKey::StartOfQuantizedBackends) + + static_cast(backend_k)); + } + if (functionality_k == DispatchKey::NestedTensor) { + return static_cast( + static_cast(DispatchKey::StartOfNestedTensorBackends) + + static_cast(backend_k)); + } + if (functionality_k == DispatchKey::AutogradFunctionality) { + return static_cast( + static_cast(DispatchKey::StartOfAutogradBackends) + + static_cast(backend_k)); + } + return DispatchKey::Undefined; +} + } // namespace c10 namespace torch { diff --git a/c10/core/DispatchKeySet.cpp b/c10/core/DispatchKeySet.cpp index 7f85567f886f..3127e7bb43e3 100644 --- a/c10/core/DispatchKeySet.cpp +++ b/c10/core/DispatchKeySet.cpp @@ -1,37 +1,30 @@ #include +#include +#include namespace c10 { -// backend_dispatch_keyset should include all runtime backend keys. +// backend_dispatch_keyset includes all dispatch keys that map to backends. 
// Alias key DispatchKey::CompositeExplicitAutograd maps to -// backend_dispatch_keyset NestedTensor has been explicitly removed due to -// incompatibility with some kernels, such as structured kernels, that use the -// DefaultBackend key. -constexpr DispatchKeySet backend_dispatch_keyset = autogradother_backends | - DispatchKeySet({ - DispatchKey::CPU, - DispatchKey::CUDA, - DispatchKey::XLA, - DispatchKey::Lazy, - DispatchKey::XPU, - DispatchKey::PrivateUse1, - DispatchKey::PrivateUse2, - DispatchKey::PrivateUse3, - DispatchKey::MLC, - DispatchKey::HPU, - DispatchKey::ORT, - DispatchKey::Meta, - }); +// backend_dispatch_keyset +constexpr DispatchKeySet backend_dispatch_keyset = + autogradother_backends | DispatchKeySet(DispatchKey::Dense); bool isBackendDispatchKey(DispatchKey t) { return t != DispatchKey::Undefined // See Note [No Alias Keys in DispatchKeySet] - && !isAliasDispatchKey(t) && backend_dispatch_keyset.has(t); + && !isAliasDispatchKey(t) + // Note [NestedTensor Not Included in Backend Keys] + // NestedTensor has been explicitly removed from the "backend keyset" due + // to incompatibility with some kernels, so we don't want it to be + // included in CompositeImplicitAutograd or CompositeExplicitAutograd + // kernels. + && t != DispatchKey::NestedTensor && backend_dispatch_keyset.has(t); } // math_dispatch_keyset contains all keys in backend_dispatch_keyset and // autograd_dispatch_keyset Alias key DispatchKey::CompositeImplicitAutograd -// maps to math_dispatch_keyset. +// maps to [math_dispatch_keyset x full_backend_mask] constexpr DispatchKeySet math_dispatch_keyset = backend_dispatch_keyset | autograd_dispatch_keyset; @@ -39,7 +32,12 @@ DispatchKeySet getRuntimeDispatchKeySet(DispatchKey t) { TORCH_INTERNAL_ASSERT(t != DispatchKey::Undefined); switch (t) { case DispatchKey::Autograd: - return autograd_dispatch_keyset; + // See Note [autograd_dispatch_keyset Does Not Include Backend Bits] + // That's why we OR it with a mask of the backend bits here. + // getRuntimeDispatchKeySet() expects to return a keyset of runtime + // dispatch keys, like AutogradCPU, but that requires having backend bits. 
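// For example (illustrative, assuming autograd_dispatch_keyset carries the
// AutogradFunctionality bit as the Note above implies): the returned set then
// satisfies has(DispatchKey::AutogradCPU), because the AutogradFunctionality
// bit combines with the CPU backend bit contributed by full_backend_mask.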
+ return autograd_dispatch_keyset | + DispatchKeySet(DispatchKeySet::RAW, full_backend_mask); case DispatchKey::CompositeImplicitAutograd: return math_dispatch_keyset; case DispatchKey::CompositeExplicitAutograd: @@ -53,11 +51,13 @@ bool runtimeDispatchKeySetHas(DispatchKey t, DispatchKey k) { TORCH_INTERNAL_ASSERT(t != DispatchKey::Undefined); switch (t) { case DispatchKey::Autograd: - return autograd_dispatch_keyset.has(k); + return autograd_dispatch_keyset.has(toFunctionalityKey(k)); case DispatchKey::CompositeImplicitAutograd: - return math_dispatch_keyset.has(k); + // See Note [NestedTensor Not Included in Backend Keys] + return k != DispatchKey::NestedTensor && math_dispatch_keyset.has(k); case DispatchKey::CompositeExplicitAutograd: - return backend_dispatch_keyset.has(k); + // See Note [NestedTensor Not Included in Backend Keys] + return k != DispatchKey::NestedTensor && backend_dispatch_keyset.has(k); default: return t == k; } @@ -75,12 +75,12 @@ DispatchKeySet getBackendKeySetFromAutograd(DispatchKey t) { return DispatchKeySet(DispatchKey::XLA); case DispatchKey::AutogradLazy: return DispatchKeySet(DispatchKey::Lazy); - case DispatchKey::AutogradMLC: - return DispatchKeySet(DispatchKey::MLC); + case DispatchKey::AutogradMPS: + return DispatchKeySet(DispatchKey::MPS); case DispatchKey::AutogradHPU: return DispatchKeySet(DispatchKey::HPU); - case DispatchKey::AutogradNestedTensor: - return DispatchKeySet(DispatchKey::NestedTensor); + case DispatchKey::AutogradIPU: + return DispatchKeySet(DispatchKey::IPU); case DispatchKey::AutogradXPU: return DispatchKeySet(DispatchKey::XPU); case DispatchKey::AutogradPrivateUse1: @@ -96,23 +96,6 @@ DispatchKeySet getBackendKeySetFromAutograd(DispatchKey t) { } } -DispatchKeySet getAutocastRelatedKeySetFromBackend(DispatchKey t) { - switch (t) { - case DispatchKey::CPU: - return DispatchKeySet(DispatchKey::AutocastCPU); - case DispatchKey::CUDA: - case DispatchKey::XLA: - return DispatchKeySet(DispatchKey::AutocastCUDA); - default: - return DispatchKeySet(); - } -} - -DispatchKeySet getAutogradRelatedKeySetFromBackend(DispatchKey t) { - return DispatchKeySet( - {DispatchKey::ADInplaceOrView, getAutogradKeyFromBackend(t)}); -} - bool isIncludedInAlias(DispatchKey k, DispatchKey alias) { return k != DispatchKey::Undefined && runtimeDispatchKeySetHas(alias, k); } @@ -129,18 +112,135 @@ std::ostream& operator<<(std::ostream& os, DispatchKeySet ts) { return os; } os << "DispatchKeySet("; - DispatchKey tid; bool first = true; - while ((tid = ts.highestPriorityTypeId()) != DispatchKey::Undefined) { + for (auto k : ts) { if (!first) { os << ", "; } - os << tid; - ts = ts.remove(tid); + os << k; first = false; } os << ")"; return os; } +DispatchKeySet::iterator& DispatchKeySet::iterator::operator++() { + TORCH_INTERNAL_ASSERT(next_functionality_ <= iterator::end_iter_mask_val); + TORCH_INTERNAL_ASSERT(next_backend_ <= num_backends, next_backend_); + + // Create a masked version of the set representation to ignore previous + // keys that we've iterated through. 
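// Illustrative note on the two llvm:: helpers used here (vendored under
// c10/util; behavior assumed from LLVM's MathExtras): maskTrailingZeros<uint64_t>(n)
// produces a mask whose n lowest bits are cleared, e.g.
// maskTrailingZeros<uint64_t>(3) == 0xFFFF'FFFF'FFFF'FFF8, so AND-ing it with
// *data_ptr_ drops the keys already visited; findFirstSet() returns the index
// of the lowest set bit, or numeric_limits<uint64_t>::max() when no bit is set,
// which is what the "end of iteration" checks below rely on.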
+ uint64_t masked_functionality_bits = + llvm::maskTrailingZeros(next_functionality_) & *data_ptr_; + uint64_t masked_backend_bits = + llvm::maskTrailingZeros(next_backend_) & full_backend_mask & + *data_ptr_; + + uint64_t first_functionality_idx = + llvm::findFirstSet(masked_functionality_bits); + uint64_t first_backendcomponent_idx = llvm::findFirstSet(masked_backend_bits); + + // If there are no keys, set to end iterator value + if (first_functionality_idx == std::numeric_limits::max() || + next_functionality_ == iterator::end_iter_mask_val) { + // Set up state to be the same as end() + next_functionality_ = iterator::end_iter_mask_val; + current_dispatchkey_idx_ = iterator::end_iter_key_val; + next_backend_ = 0; + current_backendcomponent_idx_ = iterator::end_iter_key_val; + return *this; + } + + // The +1 is because of DispatchKey::Undefined and + // BackendComponent::InvalidBit + auto new_next_functionality = first_functionality_idx + 1; + auto new_backendcomponent_idx = first_backendcomponent_idx + 1; + // and the -num_backends is because the first bits in the + // keyset are not Dispatch Keys. + auto next_dispatchkey_idx = new_next_functionality - num_backends; + + // If the current functionality bit is a per-backend bit, we need special + // handling + if (isPerBackendFunctionalityKey( + static_cast(next_dispatchkey_idx))) { + // case 1: if the current backend is undefined, then there is no valid + // backend instance of this functionality key so we can skip it. + if (first_backendcomponent_idx == std::numeric_limits::max()) { + // increment the functionality mask so we skip the current functionality + // bit on the next increment. + next_functionality_ = new_next_functionality; + ++(*this); + return *this; + } + + // Otherwise, at this point we know what the current backend and + // functionality bits are. + current_dispatchkey_idx_ = next_dispatchkey_idx; + current_backendcomponent_idx_ = new_backendcomponent_idx; + + // Next, we need to set up the masks for the next increment. + uint64_t next_backendcomponent_bits = + llvm::maskTrailingZeros(first_backendcomponent_idx + 1) & + full_backend_mask & *data_ptr_; + uint64_t next_backendcomponent_idx = + llvm::findFirstSet(next_backendcomponent_bits); + if (next_backendcomponent_idx == std::numeric_limits::max()) { + // case 2: the current backend is valid, but there is not another backend + // in the keyset. In this case, we need to bump the functionality mask and + // reset the backend mask for the next increment + next_functionality_ = new_next_functionality; + next_backend_ = 0; + } else { + // case 3: we have another backend to iterate over. We want to iterate + // over the same functionality bit next time, but a different backend bit. + next_backend_ = first_backendcomponent_idx + 1; + } + } else { + // Functionality bits that aren't per backend are simpler to handle. We can + // ignore the backend bits. + TORCH_INTERNAL_ASSERT(next_backend_ == 0); + current_dispatchkey_idx_ = next_dispatchkey_idx; + next_functionality_ = new_next_functionality; + } + return *this; +} + +std::array +initializeFunctionalityOffsetsAndMasks() { + std::array + offsets_and_masks; + // manualy set the first entry, which corresponds to Undefined. + offsets_and_masks[0] = FunctionalityOffsetAndMask(0, 0); + // loop through every functionality key (aside from Undefined). + for (const auto functionality_idx : c10::irange(1, num_functionality_keys)) { + // functionality_idx should be Dense -> 1, ... 
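// Worked example of the table this loop builds (illustrative, using the enums
// above, where num_backends == 13):
//   Undefined -> offset 0, mask 0
//   Dense     -> offset 1, mask full_backend_mask  (per-backend: reserves
//                offsets 1..13, one runtime slot per backend)
//   FPGA      -> offset 1 + num_backends == 14, mask 0
// and so on: each per-backend functionality advances the running offset by
// num_backends, every other functionality advances it by 1.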
+ auto prev_offset_and_mask = offsets_and_masks[functionality_idx - 1]; + auto k = static_cast(functionality_idx); + + // If the previous functionality was not per-backend, then we can just + // increment the previous offset. Otherwise, the next offset = + // previous_offset + num_backends. + auto next_offset = prev_offset_and_mask.offset + + (prev_offset_and_mask.mask == 0 ? 1 : num_backends); + // the mask is used in the runtime index calculation to find the offset of + // the backend. For non-per-backend functionalities, this offset should + // always be 0. Otherwise, we need to get the index of the backend (which we + // can do using a backend mask). + auto next_mask = isPerBackendFunctionalityKey(k) ? full_backend_mask : 0; + offsets_and_masks[functionality_idx] = + FunctionalityOffsetAndMask(next_offset, next_mask); + } + // Sanity check that the computed offset index of the last functionality key + // is correct. This assumes that the highest priority functionality key is not + // per backend. + TORCH_INTERNAL_ASSERT( + offsets_and_masks[num_functionality_keys - 1].offset == + (num_runtime_entries - 1), + "num_runtime_entries: ", + num_runtime_entries, + "last_offset: ", + offsets_and_masks[num_functionality_keys - 1].offset); + return offsets_and_masks; +} + } // namespace c10 diff --git a/c10/core/DispatchKeySet.h b/c10/core/DispatchKeySet.h index 79d39652219b..d6241be9701e 100644 --- a/c10/core/DispatchKeySet.h +++ b/c10/core/DispatchKeySet.h @@ -1,5 +1,4 @@ #pragma once - #include #include #include @@ -8,29 +7,147 @@ namespace c10 { +struct FunctionalityOffsetAndMask { + // empty constructor shouldn't be used; only needed to initialize + // the array before populating it. + FunctionalityOffsetAndMask() {} + FunctionalityOffsetAndMask(uint16_t offset, uint16_t mask) + : offset(offset), mask(mask) {} + // This needs to big enough to cover the size of the operator table. + uint16_t offset; + // See Note [No More Than 16 Backends] + // This mask needs to be big enough to mask all of the backend bits. + // We probably don't ever want to have more than 16 backend bits, so uint16_t + // should be enough. + uint16_t mask; +}; +static_assert( + c10::num_runtime_entries < 65536, + "The dispatcher currently only supports up to 2^16 runtime entries"); + +C10_API std::array +initializeFunctionalityOffsetsAndMasks(); + +C10_ALWAYS_INLINE static const std:: + array& + offsetsAndMasks() { + static auto offsets_and_masks_ = initializeFunctionalityOffsetsAndMasks(); + return offsets_and_masks_; +} + +// A representation of a set of DispatchKeys. A DispatchKeySet contains both +// "functionality" bits and "backend bits", and every tensor holds its own +// DispatchKeySet. The Dispatcher implements multiple dispatch by grabbing the +// keyset on every input tensor, or’ing them together, and dispatching to a +// specific piece of functionality. The functionality bits are *ordered*. When +// multiple functionality bits are set, we use the highest priority +// functionality. Similarly, multiple backend bits can theoretically be set if +// you call an operator with multiple tensors from difference devices (e.g. CPU +// and CUDA), although support for mixed device dispatch is limited (the only +// kernels that gracefully handle mixed device inputs for now are cuda kernels +// that take in a scalar cpu tensor). + // A representation of a set of DispatchKeys. A tensor may have multiple // tensor type ids, e.g., a Variable tensor can also be a CPU tensor; the // DispatchKeySet specifies what type ids apply. 
The internal representation is // as a 64-bit bit set (this means only 64 tensor type ids are supported). // -// Note that DispatchKeys are ordered; thus, we can ask questions like "what is -// the highest priority DispatchKey in the set"? (The set itself is not -// ordered; two sets with the same ids will always have the ids ordered in the -// same way.) +// As mentioned above, DispatchKeys are ordered; thus, we can ask questions like +// "what is the highest priority DispatchKey in the set"? (The set itself is +// not ordered; two sets with the same ids will always have the ids ordered in +// the same way.) +// +// Note [DispatchKeySet Internal Representation] +// Internally, dispatch keys are packed into 64-bit DispatchKeySet objects +// that get passed around at runtime. +// However, there isn't necessarily a 1-to-1 mapping between bits in the keyset +// and individual dispatch keys. +// +// First: why do we have this distinction, and why not map every dispatch key +// directly to a bit? This is mostly because we have several types of +// functionalities that different backends would like to customize. For example, +// we have: +// - "Dense": CPU, CUDA, XLA, ... (~12 keys) +// - "Sparse": SparseCPU, SparseCUDA, ... +// - "Quantized": QuantizedCPU, QuantizedCUDA, QuantizedXLA, ... +// - "Autograd": AutogradCPU, AutogradCUDA, Autograd XLA, ... +// The problem is that total number of keys grows quadratically with [# +// backends] x [# functionalities], making it very difficult to map each key +// directly to a bit in a bitset without dramatically increasing the size of the +// bitset over time. +// +// The two enums (BackendComponent and DispatchKey) can be divided roughly into +// 5 categories. +// +// (1) "Building block" keys +// (a) backends: jEverything in the BackendComponent enum (e.g. CPUBit, +// CUDABIt) (b) functionalities: (per-backend) functionality-bit DispatchKeys +// (e.g. AutogradFunctionality, Sparse, Dense) +// (2) "Runtime" keys +// (a) "non-customizable backends" (e.g. FPGA) +// (b) "non-customizable functionalities" (e.g. Functionalize) +// (c) "per-backend instances of customizable functionalities" (e.g. CPU, +// SparseCPU, AutogradCPU) +// (3) "Alias" DispatchKeys (see Note [Alias Dispatch Keys]) +// +// (1) Building block keys always correspond to individual bits in a +// DispatchKeySet. They can also be combined in a DispatchKeySet to form actual +// runtime keys. e.g. +// auto dense_cpu_ks = DispatchKeySet({DispatchKey::CPUBit, +// DispatchKey::Dense}); +// // The keyset has the runtime dense-cpu key. +// dense_cpu_ks.has(DispatchKey::CPU); +// // And it contains the building block keys too. +// dense_cpu_ks.has(DispatchKey::CPUBit); +// dense_cpu_ks.has(DispatchKey::Dense); +// +// Not every backend and not every functionality counts as a "building block +// key". This is mostly to give us more levers to pull in the design space. +// Backend keys and functionality keys that count as "building blocks" will +// contribute to a full cross product of functionality that can be overriden. // -// At the moment, there are no nontrivial uses of this set; tensors are always -// singletons. In the near future, this set will represent variable? + tensor -// type id. In the far future, it will be requires grad? + profiling? + -// tracing? + lazy? + tensor type id. +// For example, right now we have at least 12 "backend" building blocks (CPU, +// CUDA, XLA, ...) and at least 4 "functionality" building blocks (Dense, +// Sparse, Quantized, AutogradFunctionality, ...). 
These keys together allow +// every dispatcher operator to be customized in up to 12*4 different ways. Each +// of those requires a slot in the operator table of every dispatcher operator. +// Not every piece of functionality necessarily needs to be customizeable +// per-backend, and not every backend necessarily needs to be able to customize +// every type of functionality. // -// (The difference between variable and requires grad, is that -// there are currently three states a tensor can be: -// 1. Not a variable -// 2. Variable with requires_grad=False -// 3. Variable with requires_grad=True -// Eventually, we want to kill state (1), and only dispatch to autograd -// handling code if one of the inputs requires grad.) // +// (2) Every runtime key corresponds directly to a slot in an operator's runtime +// dispatch table, and you can directly register kernels to a runtime dispatch +// key. +// +// For per-backend functionalities like "Dense" or "AutogradFunctionality", +// you can think of the corresponding runtime dispatch keys as "instances" of +// that functionality, per backend. E.g. "CPU", "CUDA", "XLA", etc. are all +// runtime instances of the "Dense" building block key. + +// (2a) and (2b) are represented identically in the DispatchKeySet logic: +// - backend-agnostic functionalities (e.g. FuncTorchBatched) are NOT +// customizeable per backend. +// In order to do so, we'd need to promote it to a per-backend functionality +// "building block" key. +// - non-customizeable backends (e.g. FPGA) can NOT customize existing +// functionality like Sparse, Autograd, etc. +// In order to do so, we'd need to promote it to a backend "building block" +// key. +// +// In both cases, these keys directly correspond to runtime slots in the +// operator table. +// +// +// (3) "Alias" keys +// See Note [Alias Dispatch Keys] +// +// Final note: for anyone making future changes to the Dispatcher + +// DispatchKeySet internals, there's a closed PR with a basic +// python-implementation of the Dispatcher that might be useful in quickly +// testing out and validating changes. See it at +// https://github.com/pytorch/pytorch/pull/68743 + // An undefined tensor is one with an empty tensor type set. class DispatchKeySet final { public: @@ -41,29 +158,146 @@ class DispatchKeySet final { // NB: default constructor representation as zero is MANDATORY as // use of DispatchKeySet in TLS requires this. constexpr DispatchKeySet() : repr_(0) {} + constexpr DispatchKeySet(Full) - : repr_(std::numeric_limits::max()) {} + : repr_((1ULL << (num_backends + num_functionality_keys - 1)) - 1) {} + constexpr DispatchKeySet(FullAfter, DispatchKey t) // LSB after t are OK, but not t itself. - : repr_((1ULL << (static_cast(t) - 1)) - 1) {} + // "functionalities" have a notion of ordering (e.g. Autograd > Sparse > + // Quantized > Dense). But backends don't really have an ordering. + // Therefore, we're enforcing that FullAfter can only be used on + // "functionality" keys. + : repr_( + (1ULL + << (num_backends + static_cast(toFunctionalityKey(t)) - + 1)) - + 1) {} + // Public version of DispatchKeySet(uint64_t) API; external users // must be explicit when they do this! constexpr DispatchKeySet(Raw, uint64_t x) : repr_(x) {} - explicit constexpr DispatchKeySet(DispatchKey t) - : repr_( - t == DispatchKey::Undefined - ? 
0 - : 1ULL << (static_cast(t) - 1)) {} - explicit constexpr DispatchKeySet(std::initializer_list ks) - : repr_(0) { + + constexpr explicit DispatchKeySet(BackendComponent k) { + if (k == BackendComponent::InvalidBit) { + repr_ = 0; + } else { + repr_ = 1ULL << (static_cast(k) - 1); + } + } + + constexpr explicit DispatchKeySet(DispatchKey k) { + if (k == DispatchKey::Undefined) { + // Case 1: handle Undefined specifically + repr_ = 0; + } else if (k <= DispatchKey::EndOfFunctionalityKeys) { + // Case 2: handle "functionality-only" keys + // These keys have a functionality bit set, but no backend bits + // These can technically be either: + // - valid runtime keys (e.g. DispatchKey::AutogradOther, + // DispatchKey::FuncTorchBatched, etc) + // - "building block" keys that aren't actual runtime keys (e.g. + // DispatchKey::Dense or Sparse) + uint64_t functionality_val = 1ULL + << (num_backends + static_cast(k) - 1); + repr_ = functionality_val; + } else if (k <= DispatchKey::EndOfRuntimeBackendKeys) { + // Case 3: "runtime" keys that have a functionality bit AND a backend bit. + // First compute which bit to flip for the functionality. + auto functionality_k = toFunctionalityKey(k); + // The - 1 is because Undefined is technically a "functionality" that + // doesn't show up in the bitset. So e.g. Dense is technically the second + // functionality, but the lowest functionality bit. + uint64_t functionality_val = 1ULL + << (num_backends + static_cast(functionality_k) - 1); + + // then compute which bit to flip for the backend + // Case 4a: handle the runtime instances of "per-backend functionality" + // keys For example, given DispatchKey::CPU, we should set: + // - the Dense functionality bit + // - the CPUBit backend bit + // first compute which bit to flip for the backend + auto backend_k = toBackendComponent(k); + uint64_t backend_val = backend_k == BackendComponent::InvalidBit + ? 0 + : 1ULL << (static_cast(backend_k) - 1); + repr_ = functionality_val + backend_val; + } else { + // At this point, we should have covered every case except for alias keys. + // Technically it would be possible to add alias dispatch keys to a + // DispatchKeySet, but the semantics are a little confusing and this + // currently isn't needed anywhere. + repr_ = 0; + } + } + + constexpr uint64_t keys_to_repr(std::initializer_list ks) { + uint64_t repr = 0; for (auto k : ks) { - repr_ |= DispatchKeySet(k).repr_; + repr |= DispatchKeySet(k).repr_; } + return repr; } + + constexpr uint64_t backend_bits_to_repr( + std::initializer_list ks) { + uint64_t repr = 0; + for (auto k : ks) { + repr |= DispatchKeySet(k).repr_; + } + return repr; + } + + explicit constexpr DispatchKeySet(std::initializer_list ks) + : repr_(keys_to_repr(ks)) {} + + explicit constexpr DispatchKeySet(std::initializer_list ks) + // Note: for some reason, putting this logic directly in the constructor + // appears to fail to compile on CUDA 10.1. 
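// A small sketch of what Case 3 of the constructor above produces for a
// per-backend runtime key; only members declared in this header are used and
// the free function name is illustrative.
#include <c10/core/DispatchKeySet.h>

void runtime_key_decomposition_example() {
  using namespace c10;
  DispatchKeySet ks(DispatchKey::AutogradCPU);
  // The runtime key decomposes into one functionality bit and one backend bit.
  bool has_functionality_bit = ks.has(DispatchKey::AutogradFunctionality);
  bool has_backend_bit = ks.has_backend(BackendComponent::CPUBit);
  (void)has_functionality_bit;
  (void)has_backend_bit;
}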
+ // See an example internal failure at + // https://www.internalfb.com/intern/skycastle/run/76561193669136035/artifact/actionlog.76561193742069401.stderr + : repr_(backend_bits_to_repr(ks)) {} + // Test if a DispatchKey is in the set - bool inline has(DispatchKey t) const { + inline bool has(DispatchKey t) const { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(t != DispatchKey::Undefined); - return static_cast(repr_ & DispatchKeySet(t).repr_); + return has_all(DispatchKeySet(t)); + } + constexpr bool has_backend(BackendComponent t) const { + return has_all(DispatchKeySet(t)); + } + + // Test if a DispatchKey is in the set + // Given a DispatchKeySet of functionality keys and (potentially) backend + // keys, tests if all of them are in the current set. + constexpr bool has_all(DispatchKeySet ks) const { + return static_cast((repr_ & ks.repr_) == ks.repr_); + } + + // Given a DispatchKeySet of functionality keys and (potentially) backend + // keys, tests if any of them are in the current set. This could technically + // be pretty easily implemented using has(). It is strictly a perf + // optimization though. There are many places in the code base where we want + // to test for multiple functionality keys together. HOWEVER, runtime + // per-backend functionality keys aren't allowed to be used with this + // function, because you can end up with weird results. e.g. + // DispatchKeySet(DispatchKey::AutogradCPU).has_any(DispatchKeySet(DispatchKey::CPU)) + // would return true. + inline bool has_any(DispatchKeySet ks) const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + // Either there are no backend bits in the input keyset + ((ks.repr_ & full_backend_mask) == 0) || + // or there are no per-backend-functionality bits + // See [Note: Per-Backend Functionality Dispatch Keys] + ((ks & + DispatchKeySet({ + DispatchKey::Dense, + DispatchKey::Quantized, + DispatchKey::Sparse, + DispatchKey::AutogradFunctionality, + }) + .repr_) == 0)); + return static_cast((repr_ & ks.repr_) != 0); } // Test if DispatchKeySet is a superset of ks. bool isSupersetOf(DispatchKeySet ks) const { @@ -74,31 +308,64 @@ class DispatchKeySet final { return DispatchKeySet(repr_ | other.repr_); } // Perform set intersection - DispatchKeySet operator&(DispatchKeySet other) const { + constexpr DispatchKeySet operator&(DispatchKeySet other) const { return DispatchKeySet(repr_ & other.repr_); } - // Compute the set difference self - other + // Compute the set difference self - other, + // but ONLY for the functionality keys. + // Any backend bits set on self will remain unchanged. + // See Note [Removing keys from DispatchKeySet Only Affects Functionality + // Keys] DispatchKeySet operator-(DispatchKeySet other) const { - return DispatchKeySet(repr_ & ~other.repr_); + return DispatchKeySet(repr_ & (full_backend_mask | ~other.repr_)); } + // Compute self ^ other constexpr DispatchKeySet operator^(DispatchKeySet other) const { return DispatchKeySet(repr_ ^ other.repr_); } - // Perform set equality bool operator==(DispatchKeySet other) const { return repr_ == other.repr_; } + bool operator!=(DispatchKeySet other) const { + return repr_ != other.repr_; + } // Add a DispatchKey to the DispatchKey set. Does NOT mutate, // returns the extended DispatchKeySet! C10_NODISCARD DispatchKeySet add(DispatchKey t) const { return *this | DispatchKeySet(t); } - // Remove a DispatchKey from the DispatchKey set. 
This is - // generally not an operation you should be doing (it's - // used to implement operator<<) - C10_NODISCARD constexpr DispatchKeySet remove(DispatchKey t) const { - return DispatchKeySet(repr_ & ~DispatchKeySet(t).repr_); + C10_NODISCARD DispatchKeySet add(DispatchKeySet ks) const { + return *this | ks; + } + + // Remove a DispatchKey from the DispatchKey set. + // This is generally not an operation you should be doing + // (it's used to implement the printing overload, operator<<) + // + // Note [Removing keys from DispatchKeySet Only Affects Functionality Keys] + // Only functionality bits are allowed to be removed from a keyset. + // For now, we're only allowing removal of "functionality bits" from the + // keyset, which is specifically needed by the fallthrough key calculation + // logic. Why is removing backend bits problematic? Consider this example: + // + // DispatchKeySet([DispatchKey.CPU, DispatchKey.AutogradCUDA, + // DispatchKey.CUDA]).remove(DispatchKey.AutogradCUDA) + // DispatchKeySet([DispatchKey.CPU, + // DispatchKey.AutogradCUDA]).remove(DispatchKey.AutogradCUDA) + // + // What do we want to happen? + // Technically, we'd like it to be true that after removal, + // the first keyset still has the CUDA dispatch key while the second doesn't. + // Unfortunately there's no way to represent that, because the two keysets are + // represented the same way internally: functionality bits: Autograd, Dense + // backend bits: CPU, CUDA + // + // Instead, remove(DispatchKey.AutogradCPU) will only remove the "Autograd" + // bit from the bitset. + constexpr DispatchKeySet remove(DispatchKey t) const { + return DispatchKeySet( + repr_ & ~(DispatchKeySet(t).repr_ & ~full_backend_mask)); } // Is the set empty? (AKA undefined tensor) bool empty() const { @@ -107,22 +374,112 @@ class DispatchKeySet final { uint64_t raw_repr() { return repr_; } - // Return the type id in this set with the highest priority (i.e., - // is the largest in the DispatchKey enum). Intuitively, this - // type id is the one that should handle dispatch (assuming there - // aren't any further exclusions or inclusions). + + DispatchKey highestFunctionalityKey() const { + auto functionality_idx = indexOfHighestBit(); + // This means that none of the functionality bits were set. + if (functionality_idx < num_backends) + return DispatchKey::Undefined; + // The first num_backend bits in the keyset don't correspond to real + // dispatch keys. + return static_cast(functionality_idx - num_backends); + } + + // This is similar like toBackendComponent(DispatchKey), but less restrictive. + // toBackendComponent() errors out if the key that it was passed has no + // backend bits, which is useful for error checking. We need a version of that + // here that can also handle "fake" backends like FPGA, because they need to + // map to the AutogradOther key. For those backends, we return + // BackendComponent::InvalidBit. + BackendComponent highestBackendKey() const { + // mask to mask out functionality bits + auto backend_idx = + DispatchKeySet(repr_ & full_backend_mask).indexOfHighestBit(); + // all zeros across the backend bits means that no backend bits are set. + if (backend_idx == 0) + return BackendComponent::InvalidBit; + return static_cast(backend_idx); + } + + // returns the DispatchKey of highest priority in the set. DispatchKey highestPriorityTypeId() const { - // TODO: If I put Undefined as entry 64 and then adjust the - // singleton constructor to shift from the right, we can get rid of the - // subtraction here. 
It's modestly more complicated to get right so I - // didn't do it for now. - return static_cast(64 - llvm::countLeadingZeros(repr_)); + auto functionality_k = highestFunctionalityKey(); + if (isPerBackendFunctionalityKey(functionality_k)) { + return toRuntimePerBackendFunctionalityKey( + functionality_k, highestBackendKey()); + } + return functionality_k; + } + + // Returns the index of the most-significant bit in the keyset. + // This is used to as part of the calculation into the operator table to get: + // - the highest "functionality" bit in the keyset. + // - the highest "backend" bit in the keyset. + uint8_t indexOfHighestBit() const { + return 64 - llvm::countLeadingZeros(repr_); } - DispatchKey highestPriorityBackendTypeId() const { - return (*this & - ((1ULL << static_cast(DispatchKey::EndOfBackendKeys)) - 1)) - .highestPriorityTypeId(); +#if defined(C10_MOBILE_TRIM_DISPATCH_KEYS) + // [Note: Trimmed Mobile Dispatch Keys] + /** + * The method below maps the dispatch key in the enum DispatchKey to an + * integer index in the dispatchTable_ array in OperatorEntry. The array + * is trimmed for mobile to reduce peak memory usage since it's + * unnecessary to reserve additional space for dispatch keys that will + * never be used on mobile. + */ + int getDispatchTableIndexForDispatchKeySet() const { + auto dk = highestPriorityTypeId(); + switch (dk) { + case DispatchKey::Undefined: + return 0; + case DispatchKey::CPU: + return 1; + case DispatchKey::QuantizedCPU: + return 2; + case DispatchKey::SparseCPU: + return 3; + case DispatchKey::BackendSelect: + return 4; + case DispatchKey::ADInplaceOrView: + return 5; + case DispatchKey::AutogradOther: + return 6; + case DispatchKey::AutogradCPU: + return 7; + default: + return -1; + } + } +#else + // returns the index in the operator table of highest priority key in the the + // keyset Note that we could in theory implement this using + // highestPriorityTypeId(), but this code is very hotpath and we can do it + // faster without it. + int getDispatchTableIndexForDispatchKeySet() const { + auto functionality_idx = + DispatchKeySet(repr_ >> num_backends).indexOfHighestBit(); + auto offset_and_mask = offsetsAndMasks()[functionality_idx]; + // Mask the functionality bits out first, then right-shift by 1. + // right-shifting by 1 because everything is zero-indexed. + // E.g. 000001 (CPU) should give us an offset of 0, 000010 (CUDA) should + // give us an offset of 1, etc. + auto backend_idx = + DispatchKeySet((repr_ & offset_and_mask.mask) >> 1).indexOfHighestBit(); + return offset_and_mask.offset + backend_idx; + } +#endif + + // returns the "index" of the highest priority backend in the keyset. + // This is pretty similar to getBackendKey(), but: + // - It's hotpath code (part of the runtime bitset calculation) + // - I's returns an integer index, not an enum value + // - Everything is shifted to the right by 1. + // BackendComponent::InvalidBit is technically the lowest enum value, + // but it isn't included in the runtime table. So CPUBit = 1, CUDABit = 2, + // etc. + uint64_t getBackendIndex() const { + return DispatchKeySet((repr_ & full_backend_mask) >> 1).indexOfHighestBit(); } private: @@ -130,42 +487,53 @@ class DispatchKeySet final { uint64_t repr_ = 0; public: - // STL iterator for DispatchKeySet. Iterates through all DispatchKeys in the - // set. The iterator is only invalidated by the destruction of the underlying - // DispatchKeySet as the iterator stores a pointer to the raw representation - // of the DispatchKeySet. 
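// A self-contained toy model of the offset/mask bookkeeping used by
// initializeFunctionalityOffsetsAndMasks() and the non-mobile
// getDispatchTableIndexForDispatchKeySet() above. The constants here
// (3 backends, 3 functionalities) are made up for illustration and are NOT
// the real c10 layout.
#include <array>
#include <cstdint>

// Toy layout: 3 backend bits (CPU = bit 0, CUDA = bit 1, XLA = bit 2) with the
// functionality bits stacked above them (Dense = bit 3, Autograd = bit 4,
// Batched = bit 5). Dense and Autograd are per-backend; Batched is not.
constexpr int kNumBackends = 3;
constexpr uint16_t kBackendMask = (1u << kNumBackends) - 1;

struct ToyOffsetAndMask {
  uint16_t offset;
  uint16_t mask;
};

// Index 0 is Undefined. Per-backend functionalities reserve kNumBackends
// consecutive slots; everything else reserves one slot, exactly like the
// initialization loop above:
//   Undefined -> slot 0, Dense -> slots 1..3, Autograd -> slots 4..6,
//   Batched -> slot 7.
constexpr std::array<ToyOffsetAndMask, 4> kToyOffsets = {{
    {0, 0}, // Undefined
    {1, kBackendMask}, // Dense (per-backend)
    {4, kBackendMask}, // Autograd (per-backend)
    {7, 0}, // Batched (single slot)
}};

// 1-based index of the highest set bit, 0 if no bit is set (mirrors
// indexOfHighestBit()).
int toy_highest_bit(uint64_t x) {
  int i = 0;
  while (x) {
    ++i;
    x >>= 1;
  }
  return i;
}

// Same arithmetic as the non-mobile getDispatchTableIndexForDispatchKeySet():
// functionality offset + zero-based backend index.
int toy_table_index(uint64_t repr) {
  int functionality_idx = toy_highest_bit(repr >> kNumBackends);
  ToyOffsetAndMask entry = kToyOffsets[functionality_idx];
  int backend_idx = toy_highest_bit((repr & entry.mask) >> 1);
  return entry.offset + backend_idx;
}

// e.g. "AutogradCUDA" = Autograd bit | CUDA bit = (1 << 4) | (1 << 1), so
// toy_table_index((1 << 4) | (1 << 1)) == 5: the Autograd block occupies
// slots 4..6 and CUDA is the second backend within that block.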
+ // STL iterator for DispatchKeySet. Iterates through all runtime DispatchKeys + // in the set. The iterator is only invalidated by the destruction of the + // underlying DispatchKeySet as the iterator stores a pointer to the raw + // representation of the DispatchKeySet. Note: When we encounter a per-backend + // functionality (e.g. Dense or Sparse), we will iterate through EVERY backend + // in the keyset, for that functionality. For example, if the next + // functionality key to iterate over is Autograd, and the backend bits in the + // keyset correspond to [BackendComponent::CPUBit, BackendComponent::CUDABit], + // then the next two keys we return will be DispatchKey::AutogradCPU, + // DispatchKey::AutogradCUDA (CPU first because it has lower precedence than + // CUDA in DispatchKey.h). class iterator { public: using self_type = iterator; using iterator_category = std::input_iterator_tag; using value_type = DispatchKey; using difference_type = ptrdiff_t; - - explicit iterator(const uint64_t* data_ptr, uint8_t i = 0) - : data_ptr_(data_ptr), i_(i) { + // final mask value should mask out the entire keyset + static const uint8_t end_iter_mask_val = + num_backends + num_functionality_keys; + // final key value should be the last DispatchKey + static const uint8_t end_iter_key_val = num_functionality_keys; + + // current_dispatchkey_idx_ will iterate through all functionality bits. + // current_backendcomponent_idx_ will iterate through all backend bits. + explicit iterator( + const uint64_t* data_ptr, + uint8_t next_functionality = num_backends, + uint8_t next_backend = 0) + : data_ptr_(data_ptr), + next_functionality_(next_functionality), + next_backend_(next_backend), + // These are in an invalid state at construction time, and set by the + // first increment call + current_dispatchkey_idx_(end_iter_key_val), + current_backendcomponent_idx_(end_iter_key_val) { // Go to the first key in the set + TORCH_INTERNAL_ASSERT( + next_functionality_ >= num_backends, + "num_backends=", + static_cast(num_backends), + "next_functionality_=", + static_cast(next_functionality_)); ++(*this); } - self_type& operator++() { - TORCH_INTERNAL_ASSERT( - i_ <= static_cast(DispatchKey::NumDispatchKeys)); - - // Create a masked version of the set representation to ignore previous - // keys that we've iterated through. 
- uint64_t masked_data = llvm::maskTrailingZeros(i_) & *data_ptr_; - uint64_t firstKeyIndex = llvm::findFirstSet(masked_data); - - // If there are no keys, set to end iterator value - if (firstKeyIndex == std::numeric_limits::max() || - i_ == static_cast(DispatchKey::NumDispatchKeys)) { - i_ = static_cast(DispatchKey::NumDispatchKeys); - return *this; - } - - i_ = static_cast(firstKeyIndex) + 1; - return *this; - } + C10_API self_type& operator++(); self_type operator++(int) { self_type previous_iterator = *this; @@ -174,18 +542,50 @@ class DispatchKeySet final { } bool operator==(const self_type& rhs) const { - return i_ == rhs.i_; + return next_functionality_ == rhs.next_functionality_ && + current_dispatchkey_idx_ == rhs.current_dispatchkey_idx_ && + next_backend_ == rhs.next_backend_ && + current_backendcomponent_idx_ == rhs.current_backendcomponent_idx_; } bool operator!=(const self_type& rhs) const { - return i_ != rhs.i_; + return next_functionality_ != rhs.next_functionality_ || + current_dispatchkey_idx_ != rhs.current_dispatchkey_idx_ || + next_backend_ != rhs.next_backend_ || + current_backendcomponent_idx_ != rhs.current_backendcomponent_idx_; } DispatchKey operator*() const { - return static_cast(i_); + auto functionality_key = + static_cast(current_dispatchkey_idx_); + if (isPerBackendFunctionalityKey(functionality_key)) { + auto next_key = toRuntimePerBackendFunctionalityKey( + functionality_key, + static_cast(current_backendcomponent_idx_)); + // We expect all of the Dense, Sparse, Quantized, and Autograd keys to + // be ordered the same way with respect to their backends + TORCH_INTERNAL_ASSERT( + toBackendComponent(next_key) == + static_cast(current_backendcomponent_idx_), + "Tried to map functionality key ", + toString(functionality_key), + " and backend bit ", + toString( + static_cast(current_backendcomponent_idx_)), + " to a runtime key, but ended up with ", + toString(next_key), + ". This can happen if the order of the backend dispatch keys in DispatchKey.h isn't consistent.", + " Please double check that enum for inconsistencies."); + return next_key; + } else { + return functionality_key; + } } private: const uint64_t* data_ptr_; - uint8_t i_; + uint8_t next_functionality_; + uint8_t next_backend_; + uint8_t current_dispatchkey_idx_; + uint8_t current_backendcomponent_idx_; }; public: @@ -195,37 +595,42 @@ class DispatchKeySet final { return iterator(&repr_); } - // We do not need to iterate beyond NumDispatchKeys so we will treat this as - // the end iterator. NumDispatchKeys will always be strictly less than 64. + // We do not need to iterate beyond EndOfFunctionalityKeys so we will treat + // this as the end iterator. iterator end() const { - return iterator(&repr_, static_cast(DispatchKey::NumDispatchKeys)); + return iterator(&repr_, iterator::end_iter_mask_val); } }; C10_API std::string toString(DispatchKeySet); C10_API std::ostream& operator<<(std::ostream&, DispatchKeySet); -// autograd_dispatch_keyset should include all runtime autograd keys. -// Alias key DispatchKey::Autograd maps to autograd_dispatch_keyset. 
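// A brief sketch of the per-backend expansion performed by the iterator
// described above; the free function name is illustrative.
#include <c10/core/DispatchKeySet.h>
#include <vector>

void iterator_expansion_example() {
  using namespace c10;
  // One per-backend functionality bit plus two backend bits...
  DispatchKeySet ks = DispatchKeySet(DispatchKey::AutogradFunctionality) |
      DispatchKeySet({BackendComponent::CPUBit, BackendComponent::CUDABit});
  // ...iterates as two runtime keys, expanded per backend:
  // {DispatchKey::AutogradCPU, DispatchKey::AutogradCUDA}.
  std::vector<DispatchKey> runtime_keys(ks.begin(), ks.end());
  (void)runtime_keys;
}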
+C10_API inline int getDispatchTableIndexForDispatchKey(DispatchKey k) { + return DispatchKeySet(k).getDispatchTableIndexForDispatchKeySet(); +} + +// Alias key DispatchKey::Autograd maps to +// (autograd_dispatch_keyset x full_backend_mask) // NB: keys in this set also get associated with CompositeImplicitAutograd +// +// Note [autograd_dispatch_keyset Does Not Include Backend Bits] +// We don't want to include any backend bits (BackendComponent::CPUBit, etc) +// directly in autograd_dispatch_keyset. +// Why? keysets like autograd_dispatch_keyset are commonly used to remove +// autograd keys from a DispatchKeySet throughout the code base. However, you +// are only allowed to remove functionality bits from a keyset, not backend +// bits. See Note [Removing keys from DispatchKeySet Only Affects Functionality +// Keys] for details. To be consistent and avoid confusion, we're explicitly +// setting up autograd_dispatch_keyset to not have any backend bits. constexpr DispatchKeySet autograd_dispatch_keyset = DispatchKeySet({ - DispatchKey::AutogradCPU, - DispatchKey::AutogradCUDA, - DispatchKey::AutogradXLA, - DispatchKey::AutogradLazy, - DispatchKey::AutogradNestedTensor, - DispatchKey::AutogradMLC, - DispatchKey::AutogradHPU, - DispatchKey::AutogradXPU, - DispatchKey::AutogradPrivateUse1, - DispatchKey::AutogradPrivateUse2, - DispatchKey::AutogradPrivateUse3, + DispatchKey::AutogradFunctionality, DispatchKey::AutogradOther, }); constexpr DispatchKeySet autocast_dispatch_keyset = DispatchKeySet({ DispatchKey::AutocastCPU, DispatchKey::AutocastCUDA, + DispatchKey::AutocastXPU, }); // See Note [TLS Initialization] @@ -237,32 +642,48 @@ constexpr DispatchKeySet default_included_set = DispatchKeySet({ constexpr DispatchKeySet default_excluded_set = DispatchKeySet({ DispatchKey::AutocastCPU, DispatchKey::AutocastCUDA, + DispatchKey::AutocastXPU, }); constexpr DispatchKeySet autograd_dispatch_keyset_with_ADInplaceOrView = autograd_dispatch_keyset | DispatchKeySet(DispatchKey::ADInplaceOrView); +constexpr DispatchKeySet python_ks = DispatchKeySet({ + DispatchKey::Python, + DispatchKey::PythonTLSSnapshot, +}); + +constexpr DispatchKeySet sparse_ks = DispatchKeySet(DispatchKey::Sparse); + +constexpr DispatchKeySet sparse_csr_ks = + DispatchKeySet({DispatchKey::SparseCsrCPU, DispatchKey::SparseCsrCUDA}); + +constexpr DispatchKeySet mkldnn_ks = DispatchKeySet(DispatchKey::MkldnnCPU); + // backend dispatch keys that map to DispatchKey::AutogradOther // NB: keys in this set also get associated with CompositeImplicitAutograd -constexpr DispatchKeySet autogradother_backends = DispatchKeySet( - {DispatchKey::HIP, - DispatchKey::VE, - DispatchKey::FPGA, - DispatchKey::ORT, - DispatchKey::Vulkan, - DispatchKey::Metal, - DispatchKey::QuantizedCPU, - DispatchKey::QuantizedCUDA, - DispatchKey::CustomRNGKeyId, - DispatchKey::MkldnnCPU, - DispatchKey::SparseCPU, - DispatchKey::SparseCUDA, - DispatchKey::SparseHIP, - DispatchKey::SparseVE, - DispatchKey::SparseXPU, - DispatchKey::SparseCsrCPU, - DispatchKey::SparseCsrCUDA, - DispatchKey::Meta}); +constexpr DispatchKeySet autogradother_backends = + DispatchKeySet( + // HIP and VE aren't in this list: they now have their own backend bits + // which means that they can now have their own Autograd keys. + // Technically, HIP will now redispatch to its own custom AutogradHIP + // slot in the runtime table. 
+ {DispatchKey::FPGA, + DispatchKey::ORT, + DispatchKey::Vulkan, + DispatchKey::Metal, + DispatchKey::SparseCsrCPU, + DispatchKey::SparseCsrCUDA, + DispatchKey::CustomRNGKeyId, + DispatchKey::MkldnnCPU, + DispatchKey::Meta, + // Sparse and Quantized backends also live here. + DispatchKey::Sparse, + DispatchKey::Quantized}) + // Including the backend bits because this keyset is used during op + // registration, which requires looping over all runtime autogradother + // backend keys. + | DispatchKeySet(DispatchKeySet::RAW, full_backend_mask); // The set of dispatch keys that come after autograd // n.b. this relies on the fact that AutogradOther is currently the lowest @@ -292,6 +713,57 @@ constexpr DispatchKeySet after_func_keyset = // away with it by explicitly removing the key here. c10::DispatchKey::ADInplaceOrView); +constexpr DispatchKeySet backend_bitset_mask = + DispatchKeySet(DispatchKeySet::RAW, (1ULL << num_backends) - 1); + +constexpr auto inplace_or_view_ks = + DispatchKeySet(DispatchKey::ADInplaceOrView); +constexpr auto autograd_cpu_ks = DispatchKeySet(DispatchKey::AutogradCPU); +constexpr auto autograd_ipu_ks = DispatchKeySet(DispatchKey::AutogradIPU); +constexpr auto autograd_xpu_ks = DispatchKeySet(DispatchKey::AutogradXPU); +constexpr auto autograd_cuda_ks = DispatchKeySet(DispatchKey::AutogradCUDA); +constexpr auto autograd_xla_ks = DispatchKeySet(DispatchKey::AutogradXLA); +constexpr auto autograd_lazy_ks = DispatchKeySet(DispatchKey::AutogradLazy); +constexpr auto autograd_mps_ks = DispatchKeySet(DispatchKey::AutogradMPS); +constexpr auto autograd_hpu_ks = DispatchKeySet(DispatchKey::AutogradHPU); +constexpr auto autograd_privateuse1_ks = + DispatchKeySet(DispatchKey::AutogradPrivateUse1); +constexpr auto autograd_privateuse2_ks = + DispatchKeySet(DispatchKey::AutogradPrivateUse2); +constexpr auto autograd_privateuse3_ks = + DispatchKeySet(DispatchKey::AutogradPrivateUse3); +constexpr auto autograd_other_ks = DispatchKeySet(DispatchKey::AutogradOther); + +// keyset correpsonding to functorch keys that have their own dedicated +// TensorImpl subclass. +constexpr auto functorch_transforms_ks = DispatchKeySet( + {DispatchKey::FuncTorchBatched, + DispatchKey::FuncTorchVmapMode, + DispatchKey::Batched, + DispatchKey::VmapMode, + DispatchKey::FuncTorchGradWrapper}); + +// This keyset has: +// (1) the functionality bits corresponding to backends (dense, sparse, +// quantized) (2) all of the backend bits set +constexpr DispatchKeySet backend_functionality_keys = + DispatchKeySet({ + DispatchKey::Dense, + DispatchKey::Quantized, + DispatchKey::Sparse, + }) | + DispatchKeySet(DispatchKeySet::RAW, full_backend_mask); + +struct OpTableOffsetAndMask { + uint16_t offset; + uint16_t backend_mask; +}; + +static_assert( + num_backends <= 16, + "Right now we expect the number of backends not to exceed 16. In the (unlikely) event" + " that this changes, the size of OpTableOffsetAndMask::backend_mask needs to be increased too."); + // true if t is a backend dispatch key C10_API bool isBackendDispatchKey(DispatchKey t); @@ -307,10 +779,65 @@ C10_API bool runtimeDispatchKeySetHas(DispatchKey t, DispatchKey k); C10_API DispatchKeySet getBackendKeySetFromAutograd(DispatchKey t); // Returns a DispatchKeySet of autograd related keys mapped to backend. -C10_API DispatchKeySet getAutogradRelatedKeySetFromBackend(DispatchKey t); +// for a given backend key, use the associated autograd key. +// for non-backend keys, use AutogradOther as a default. 
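// A usage sketch for the mapping described above; the free function name is
// illustrative, and BackendComponent::InvalidBit stands in for "no
// customizable backend bit".
#include <c10/core/DispatchKeySet.h>

void autograd_keys_from_backend_example() {
  using namespace c10;
  // A customizable backend bit maps to its dedicated autograd key (plus
  // ADInplaceOrView)...
  DispatchKeySet cpu_ks =
      getAutogradRelatedKeySetFromBackend(BackendComponent::CPUBit);
  bool a = cpu_ks.has(DispatchKey::AutogradCPU); // true
  bool b = cpu_ks.has(DispatchKey::ADInplaceOrView); // true
  // ...while everything else falls through to AutogradOther (the default
  // branch below).
  DispatchKeySet other_ks =
      getAutogradRelatedKeySetFromBackend(BackendComponent::InvalidBit);
  bool c = other_ks.has(DispatchKey::AutogradOther); // true
  (void)a;
  (void)b;
  (void)c;
}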
+// Note: it's convenient and fast to return a default here rather than (say) +// returning an optional, or throwing. But it makes callers +// responsible for either a) enforcing the invariant that only backend keys +// be passed as arguments, or b) interpreting our return value carefully. +inline DispatchKeySet getAutogradRelatedKeySetFromBackend(BackendComponent t) { + switch (t) { + case BackendComponent::CPUBit: + return inplace_or_view_ks | autograd_cpu_ks; + case BackendComponent::IPUBit: + return inplace_or_view_ks | autograd_ipu_ks; + case BackendComponent::XPUBit: + return inplace_or_view_ks | autograd_xpu_ks; + case BackendComponent::CUDABit: + return inplace_or_view_ks | autograd_cuda_ks; + case BackendComponent::XLABit: + return inplace_or_view_ks | autograd_xla_ks; + case BackendComponent::LazyBit: + return inplace_or_view_ks | autograd_lazy_ks; + case BackendComponent::MPSBit: + return inplace_or_view_ks | autograd_mps_ks; + case BackendComponent::HPUBit: + return inplace_or_view_ks | autograd_hpu_ks; + case BackendComponent::PrivateUse1Bit: + return inplace_or_view_ks | autograd_privateuse1_ks; + case BackendComponent::PrivateUse2Bit: + return inplace_or_view_ks | autograd_privateuse2_ks; + case BackendComponent::PrivateUse3Bit: + return inplace_or_view_ks | autograd_privateuse3_ks; + default: + return inplace_or_view_ks | autograd_other_ks; + } +} // Returns a DispatchKeySet of autocast related keys mapped to backend. -C10_API DispatchKeySet getAutocastRelatedKeySetFromBackend(DispatchKey t); +inline DispatchKeySet getAutocastRelatedKeySetFromBackend(BackendComponent t) { + constexpr auto autocast_cpu_ks = DispatchKeySet(DispatchKey::AutocastCPU); + constexpr auto autocast_xpu_ks = DispatchKeySet(DispatchKey::AutocastXPU); + constexpr auto autocast_cuda_ks = DispatchKeySet(DispatchKey::AutocastCUDA); + switch (t) { + case BackendComponent::CPUBit: + return autocast_cpu_ks; + case BackendComponent::XPUBit: + return autocast_xpu_ks; + case BackendComponent::CUDABit: + case BackendComponent::XLABit: + return autocast_cuda_ks; + default: + return DispatchKeySet(); + } +} + +// returns the "backend" DispatchKey of highest priority in the set. +// This is basically like highestBackendKey(), except that we have some +// "functionality" bits that correspond to backends (Sparse, Quantized) +inline DispatchKey highestPriorityBackendTypeId(DispatchKeySet ks) { + return (ks & backend_functionality_keys).highestPriorityTypeId(); +} // This API exists because we have a use case for checking // getRuntimeDispatchKeySet(alias).has(DispatchKey::Undefined) @@ -329,7 +856,8 @@ static inline DispatchKey legacyExtractDispatchKey(DispatchKeySet s) { // here. 
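// For example (illustrative): a tensor keyset such as
//   {Dense, AutogradFunctionality, ADInplaceOrView, CPUBit}
// yields DispatchKey::CPU here, because the autograd, ADInplaceOrView,
// autocast, and Python-related functionality bits are stripped before taking
// the highest-priority key.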
At the moment, autograd keys and ADInplaceOrView key need this // treatment; return (s - autograd_dispatch_keyset_with_ADInplaceOrView - - autocast_dispatch_keyset) + autocast_dispatch_keyset - + DispatchKeySet({DispatchKey::PythonTLSSnapshot, DispatchKey::Python})) .highestPriorityTypeId(); } diff --git a/c10/core/GeneratorImpl.cpp b/c10/core/GeneratorImpl.cpp index 4022b150d084..e2876bf9a1cf 100644 --- a/c10/core/GeneratorImpl.cpp +++ b/c10/core/GeneratorImpl.cpp @@ -43,7 +43,7 @@ namespace detail { * Note this is a legacy method (from THRandom.cpp) * FIXME: use std::random_device with entropy information */ -#if !defined(_WIN32) && !defined(__XROS__) +#if !defined(_WIN32) static uint64_t readURandomLong() { int randDev = open("/dev/urandom", O_RDONLY); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -56,7 +56,7 @@ static uint64_t readURandomLong() { close(randDev); return randValue; } -#endif // _WIN32 && __XROS__ +#endif // _WIN32 /** * Gets a non deterministic random number number from either the @@ -82,9 +82,6 @@ uint64_t getNonDeterministicRandom(bool is_cuda) { s = (uint64_t)std::chrono::high_resolution_clock::now() .time_since_epoch() .count(); -#elif defined(__XROS__) - std::random_device rd; - s = ((((uint64_t)rd()) << 32) + rd()) & 0x1FFFFFFFFFFFFF; #elif defined(__SGX_ENABLED__) TORCH_CHECK( sgx_read_rand(reinterpret_cast(&s), sizeof(s)) == SGX_SUCCESS, diff --git a/c10/core/Layout.h b/c10/core/Layout.h index f37ceb18a835..0ac72439b7f0 100644 --- a/c10/core/Layout.h +++ b/c10/core/Layout.h @@ -6,12 +6,24 @@ #include namespace c10 { -enum class Layout : int8_t { Strided, Sparse, SparseCsr, Mkldnn, NumOptions }; +enum class Layout : int8_t { + Strided, + Sparse, + SparseCsr, + Mkldnn, + SparseCsc, + SparseBsr, + SparseBsc, + NumOptions +}; constexpr auto kStrided = Layout::Strided; constexpr auto kSparse = Layout::Sparse; constexpr auto kSparseCsr = Layout::SparseCsr; constexpr auto kMkldnn = Layout::Mkldnn; +constexpr auto kSparseCsc = Layout::SparseCsc; +constexpr auto kSparseBsr = Layout::SparseBsr; +constexpr auto kSparseBsc = Layout::SparseBsc; inline Layout layout_from_backend(Backend backend) { switch (backend) { @@ -25,7 +37,9 @@ inline Layout layout_from_backend(Backend backend) { return Layout::Mkldnn; case Backend::SparseCsrCPU: case Backend::SparseCsrCUDA: - return Layout::SparseCsr; + TORCH_CHECK( + false, + "Cannot map Backend SparseCsrCPU|SparseCsrCUDA to a unique layout."); default: return Layout::Strided; } @@ -39,6 +53,12 @@ inline std::ostream& operator<<(std::ostream& stream, at::Layout layout) { return stream << "Sparse"; case at::kSparseCsr: return stream << "SparseCsr"; + case at::kSparseCsc: + return stream << "SparseCsc"; + case at::kSparseBsr: + return stream << "SparseBsr"; + case at::kSparseBsc: + return stream << "SparseBsc"; case at::kMkldnn: return stream << "Mkldnn"; default: diff --git a/c10/core/MemoryFormat.h b/c10/core/MemoryFormat.h index 8cafde1b5c5e..a4dfd1e87ebe 100644 --- a/c10/core/MemoryFormat.h +++ b/c10/core/MemoryFormat.h @@ -29,7 +29,8 @@ enum class MemoryFormat : int8_t { Contiguous, Preserve, ChannelsLast, - ChannelsLast3d + ChannelsLast3d, + NumOptions }; // If you are seeing this, it means that this call site was not checked if @@ -54,7 +55,7 @@ inline std::ostream& operator<<( case MemoryFormat::ChannelsLast3d: return stream << "ChannelsLast3d"; default: - TORCH_CHECK(false, "Unknown memory format"); + TORCH_CHECK(false, "Unknown memory format ", memory_format); } } diff --git a/c10/core/QEngine.h b/c10/core/QEngine.h 
index ac092193d921..60c21361f15f 100644 --- a/c10/core/QEngine.h +++ b/c10/core/QEngine.h @@ -15,11 +15,13 @@ enum class QEngine : uint8_t { NoQEngine = 0, FBGEMM = 1, QNNPACK = 2, + ONEDNN = 3, }; constexpr auto kNoQEngine = QEngine::NoQEngine; constexpr auto kFBGEMM = QEngine::FBGEMM; constexpr auto kQNNPACK = QEngine::QNNPACK; +constexpr auto kONEDNN = QEngine::ONEDNN; inline std::string toString(QEngine qengine) { switch (qengine) { @@ -29,6 +31,8 @@ inline std::string toString(QEngine qengine) { return "FBGEMM"; case kQNNPACK: return "QNNPACK"; + case kONEDNN: + return "ONEDNN"; default: TORCH_CHECK( false, "Unrecognized Quantized Engine: ", static_cast(qengine)); diff --git a/c10/core/SafePyObject.cpp b/c10/core/SafePyObject.cpp new file mode 100644 index 000000000000..d8c3da49ffb1 --- /dev/null +++ b/c10/core/SafePyObject.cpp @@ -0,0 +1,11 @@ +#include +#include + +namespace c10 { + +PyObject* SafePyObject::ptr(const c10::impl::PyInterpreter* interpreter) const { + TORCH_INTERNAL_ASSERT(interpreter == pyinterpreter_); + return data_; +} + +} // namespace c10 diff --git a/c10/core/SafePyObject.h b/c10/core/SafePyObject.h new file mode 100644 index 000000000000..13e32da3dc1d --- /dev/null +++ b/c10/core/SafePyObject.h @@ -0,0 +1,45 @@ +#pragma once + +#include +#include +#include + +namespace c10 { + +// This is an safe owning holder for a PyObject, akin to pybind11's +// py::object, with two major differences: +// +// - It is in c10/core; i.e., you can use this type in contexts where +// you do not have a libpython dependency +// +// - It is multi-interpreter safe (ala torchdeploy); when you fetch +// the underlying PyObject* you are required to specify what the current +// interpreter context is and we will check that you match it. +// +// It is INVALID to store a reference to a Tensor object in this way; +// you should just use TensorImpl directly in that case! +struct C10_API SafePyObject { + // Steals a reference to data + SafePyObject(PyObject* data, c10::impl::PyInterpreter* pyinterpreter) + : data_(data), pyinterpreter_(pyinterpreter) {} + + // In principle this could be copyable if we add an incref to PyInterpreter + // but for now it's easier to just disallow it. 
+ SafePyObject(SafePyObject const&) = delete; + SafePyObject& operator=(SafePyObject const&) = delete; + + ~SafePyObject() { + pyinterpreter_->decref(data_, /*is_tensor*/ false); + } + + c10::impl::PyInterpreter* pyinterpreter() const { + return pyinterpreter_; + } + PyObject* ptr(const c10::impl::PyInterpreter*) const; + + private: + PyObject* data_; + c10::impl::PyInterpreter* pyinterpreter_; +}; + +} // namespace c10 diff --git a/c10/core/Scalar.h b/c10/core/Scalar.h index 08bf95e1875d..295d1006ff29 100644 --- a/c10/core/Scalar.h +++ b/c10/core/Scalar.h @@ -67,13 +67,17 @@ class C10_API Scalar { } // TODO: Support ComplexHalf accessor - AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(DEFINE_ACCESSOR) + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_ACCESSOR) // also support scalar.to(); // Deleted for unsupported types, but specialized below for supported types template T to() const = delete; + const void* data_ptr() const { + return static_cast(&v); + } + #undef DEFINE_ACCESSOR bool isFloatingPoint() const { return Tag::HAS_d == tag; @@ -201,7 +205,7 @@ using OptionalScalarRef = c10::OptionalRef; inline T Scalar::to() const { \ return to##name(); \ } -AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(DEFINE_TO) +AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_TO) #undef DEFINE_TO } // namespace c10 diff --git a/c10/core/ScalarType.h b/c10/core/ScalarType.h index a32d4aa42151..0728e67ef2f0 100644 --- a/c10/core/ScalarType.h +++ b/c10/core/ScalarType.h @@ -1,9 +1,8 @@ #pragma once -#include #include +#include #include -#include #include #include #include @@ -63,6 +62,21 @@ namespace c10 { _(bool, Bool) \ _(at::BFloat16, BFloat16) +#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(_) \ + _(uint8_t, Byte) \ + _(int8_t, Char) \ + _(int16_t, Short) \ + _(int, Int) \ + _(int64_t, Long) \ + _(at::Half, Half) \ + _(float, Float) \ + _(double, Double) \ + _(c10::complex, ComplexHalf) \ + _(c10::complex, ComplexFloat) \ + _(c10::complex, ComplexDouble) \ + _(bool, Bool) \ + _(at::BFloat16, BFloat16) + enum class ScalarType : int8_t { #define DEFINE_ENUM(_1, n) n, AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_ENUM) @@ -307,7 +321,7 @@ static inline bool isUnderlying(ScalarType type, ScalarType qtype) { return type == toUnderlying(qtype); } -static inline ScalarType toValueType(ScalarType t) { +static inline ScalarType toRealValueType(ScalarType t) { switch (t) { case ScalarType::ComplexHalf: return ScalarType::Half; @@ -402,28 +416,28 @@ static inline ScalarType promoteTypes(ScalarType a, ScalarType b) { toString(b)); } - // this matrix has to be consistent with AT_FORALL_SCALAR_TYPES_WITH_COMPLEX - // so that's why we have to add undefined as we are not sure what is the - // corrent values for the type promotions in complex type cases. + // this matrix has to be consistent with + // AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS undefined is used where we + // are not sure about the correct value for type promotion. 
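// A quick sketch of what the relaxed promotion matrix below yields once
// ComplexHalf participates in promotion; the free function name is
// illustrative and the expected results are read directly off the new table.
#include <c10/core/ScalarType.h>

void complex_promotion_example() {
  using c10::promoteTypes;
  using c10::ScalarType;
  auto a = promoteTypes(ScalarType::Float, ScalarType::ComplexHalf); // ComplexFloat
  auto b = promoteTypes(ScalarType::BFloat16, ScalarType::ComplexHalf); // ComplexFloat
  auto c = promoteTypes(ScalarType::Bool, ScalarType::ComplexHalf); // ComplexHalf
  (void)a;
  (void)b;
  (void)c;
}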
static constexpr ScalarType _promoteTypesLookup[static_cast( ScalarType::NumOptions)][static_cast(ScalarType::NumOptions)] = { /* u1 i1 i2 i4 i8 f2 f4 f8 c2 c4 c8 b1 q1 q2 q3 bf*/ - /* u1 */ {u1, i2, i2, i4, i8, f2, f4, f8, ud, c4, c8, u1, ud, ud, ud, bf}, - /* i1 */ {i2, i1, i2, i4, i8, f2, f4, f8, ud, c4, c8, i1, ud, ud, ud, bf}, - /* i2 */ {i2, i2, i2, i4, i8, f2, f4, f8, ud, c4, c8, i2, ud, ud, ud, bf}, - /* i4 */ {i4, i4, i4, i4, i8, f2, f4, f8, ud, c4, c8, i4, ud, ud, ud, bf}, - /* i8 */ {i8, i8, i8, i8, i8, f2, f4, f8, ud, c4, c8, i8, ud, ud, ud, bf}, - /* f2 */ {f2, f2, f2, f2, f2, f2, f4, f8, ud, c4, c8, f2, ud, ud, ud, f4}, - /* f4 */ {f4, f4, f4, f4, f4, f4, f4, f8, ud, c4, c8, f4, ud, ud, ud, f4}, - /* f8 */ {f8, f8, f8, f8, f8, f8, f8, f8, ud, c8, c8, f8, ud, ud, ud, f8}, - /* c2 */ {ud, ud, ud, ud, ud, ud, ud, ud, c2, c4, c8, ud, ud, ud, ud, ud}, + /* u1 */ {u1, i2, i2, i4, i8, f2, f4, f8, c2, c4, c8, u1, ud, ud, ud, bf}, + /* i1 */ {i2, i1, i2, i4, i8, f2, f4, f8, c2, c4, c8, i1, ud, ud, ud, bf}, + /* i2 */ {i2, i2, i2, i4, i8, f2, f4, f8, c2, c4, c8, i2, ud, ud, ud, bf}, + /* i4 */ {i4, i4, i4, i4, i8, f2, f4, f8, c2, c4, c8, i4, ud, ud, ud, bf}, + /* i8 */ {i8, i8, i8, i8, i8, f2, f4, f8, c2, c4, c8, i8, ud, ud, ud, bf}, + /* f2 */ {f2, f2, f2, f2, f2, f2, f4, f8, c2, c4, c8, f2, ud, ud, ud, f4}, + /* f4 */ {f4, f4, f4, f4, f4, f4, f4, f8, c4, c4, c8, f4, ud, ud, ud, f4}, + /* f8 */ {f8, f8, f8, f8, f8, f8, f8, f8, c8, c8, c8, f8, ud, ud, ud, f8}, + /* c2 */ {c2, c2, c2, c2, c2, c2, c4, c8, c2, c4, c8, c2, ud, ud, ud, c4}, /* c4 */ {c4, c4, c4, c4, c4, c4, c4, c8, c4, c4, c8, c4, ud, ud, ud, c4}, /* c8 */ {c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, ud, ud, ud, c8}, - /* b1 */ {u1, i1, i2, i4, i8, f2, f4, f8, ud, c4, c8, b1, ud, ud, ud, bf}, + /* b1 */ {u1, i1, i2, i4, i8, f2, f4, f8, c2, c4, c8, b1, ud, ud, ud, bf}, /* q1 */ {ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud}, /* q2 */ {ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud}, /* q3 */ {ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud}, - /* bf */ {bf, bf, bf, bf, bf, f4, f4, f8, ud, c4, c8, bf, ud, ud, ud, bf}, + /* bf */ {bf, bf, bf, bf, bf, f4, f4, f8, c4, c4, c8, bf, ud, ud, ud, bf}, }; return _promoteTypesLookup[static_cast(a)][static_cast(b)]; } diff --git a/c10/core/ScalarTypeToTypeMeta.h b/c10/core/ScalarTypeToTypeMeta.h index 6d4946b29bc3..910e0d24b0a3 100644 --- a/c10/core/ScalarTypeToTypeMeta.h +++ b/c10/core/ScalarTypeToTypeMeta.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include // these just expose TypeMeta/ScalarType bridge functions in c10 diff --git a/c10/core/Storage.h b/c10/core/Storage.h index 11c7d396fa22..d00b644b10d5 100644 --- a/c10/core/Storage.h +++ b/c10/core/Storage.h @@ -14,7 +14,7 @@ struct C10_API Storage { // Allocates memory buffer using given allocator and creates a storage with it Storage( - use_byte_size_t use_byte_size, + use_byte_size_t /*use_byte_size*/, size_t size_bytes, Allocator* allocator = nullptr, bool resizable = false) @@ -28,7 +28,7 @@ struct C10_API Storage { // potential future reallocations, however it can be nullptr if the storage // is non-resizable Storage( - use_byte_size_t use_byte_size, + use_byte_size_t /*use_byte_size*/, size_t size_bytes, at::DataPtr data_ptr, at::Allocator* allocator = nullptr, diff --git a/c10/core/StorageImpl.h b/c10/core/StorageImpl.h index f90eafee5418..cc167927229a 100644 --- a/c10/core/StorageImpl.h +++ b/c10/core/StorageImpl.h @@ -35,7 +35,7 @@ struct C10_API StorageImpl : 
public c10::intrusive_ptr_target { struct use_byte_size_t {}; StorageImpl( - use_byte_size_t use_byte_size, + use_byte_size_t /*use_byte_size*/, size_t size_bytes, at::DataPtr data_ptr, at::Allocator* allocator, @@ -52,7 +52,7 @@ struct C10_API StorageImpl : public c10::intrusive_ptr_target { } StorageImpl( - use_byte_size_t use_byte_size, + use_byte_size_t /*use_byte_size*/, size_t size_bytes, at::Allocator* allocator, bool resizable) diff --git a/c10/core/SymInt.cpp b/c10/core/SymInt.cpp new file mode 100644 index 000000000000..d09135a3389b --- /dev/null +++ b/c10/core/SymInt.cpp @@ -0,0 +1,18 @@ + +#include +#include + +namespace c10 { + +std::shared_ptr SymInt::toSymbolicIntNode() { + auto& st = getSymIntTable(); + TORCH_CHECK(is_symbolic()); + return st.getNode(SymInt::SYM_TAG_MASK ^ static_cast(data_)); +} + +c10::SymInt SymInt::toSymInt(std::shared_ptr sin_sp) { + auto& sit = getSymIntTable(); + auto data = sit.addNode(sin_sp) | SYM_TAG_MASK; + return c10::SymInt(data); +} +} // namespace c10 diff --git a/c10/core/SymInt.h b/c10/core/SymInt.h new file mode 100644 index 000000000000..d189a98b042d --- /dev/null +++ b/c10/core/SymInt.h @@ -0,0 +1,71 @@ +#pragma once + +#include +#include + +namespace c10 { + +class SymbolicIntNode; + +// `SymInt` is a C++ wrapper class around int64_t data_ which and is used to +// represent concrete dimension values. +// +// `SymInt` is also a data type in Pytorch that can be used in function schemas +// to enable tracing. +// +// `SymInt` is introduced to enable tracing arithmetic +// operations on symbolic integers (e.g. sizes). Tracing symbolic sizes will +// allow LTC and AOTAutograd representing dynamic shapes in expression graphs +// faithfully without baking in concrete dimension values. +// +// To trace the operations, SymInt will overload arithmetic operators (e.g. +, +// -, *) and will provide overloads taking SymInt for commonly used math +// functions. +// +// SymInt will be extenteded to represent a union structure Union[int64_t, +// SymbolicIntNode*] which will be implemented as a single packed int64_t field +// named data_. +// +// data_ can be either a plain int64_t or (1 << 63 | `index`). `index` points to +// SymbolicIntNode* that will be responsible for constructing an IR node for +// a traced operation to represent it in LTC or Fx graphs. 
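// A minimal sketch of the concrete (non-symbolic) path through the class
// defined below; the free function name is illustrative.
#include <c10/core/SymInt.h>

void symint_example() {
  c10::SymInt a(2), b(3);
  c10::SymInt c = a + b; // plain integer arithmetic, no tracing involved
  bool sym = c.is_symbolic(); // false: the tag bit is not set
  int64_t v = c.expect_int(); // 5
  (void)sym;
  (void)v;
}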
+class C10_API SymInt { + public: + explicit SymInt(int64_t d) : data_(d){}; + + int64_t expect_int() const { + TORCH_CHECK(!is_symbolic()); + return data_; + } + + bool is_symbolic() const { + return static_cast(SYM_TAG_MASK) & + static_cast(this->data_); + } + + bool operator==(const SymInt& p2) const { + return data_ == p2.data_; + } + + SymInt operator+(SymInt sci) const { + TORCH_CHECK( + !this->is_symbolic() && !sci.is_symbolic(), + "Symbolic Add isn't supported yet"); + return SymInt(data_ + sci.data_); + } + + std::shared_ptr toSymbolicIntNode(); + static c10::SymInt toSymInt(std::shared_ptr sin); + + // This is needed for interoperability with IValue + int64_t data() const { + return data_; + } + + private: + const static int64_t SYM_TAG_MASK = 1LL << 63; + int64_t data_; +}; + +C10_API std::ostream& operator<<(std::ostream& os, SymInt s); +} // namespace c10 diff --git a/c10/core/SymIntArrayRef.cpp b/c10/core/SymIntArrayRef.cpp new file mode 100644 index 000000000000..1ac65c455be0 --- /dev/null +++ b/c10/core/SymIntArrayRef.cpp @@ -0,0 +1,23 @@ +#include +#include + +namespace c10 { + +at::IntArrayRef expectIntArrayRef(c10::SymIntArrayRef ar) { + for (c10::SymInt sci : ar) { + TORCH_CHECK(!sci.is_symbolic()); + } + + return IntArrayRef(reinterpret_cast(ar.data()), ar.size()); +} + +std::ostream& operator<<(std::ostream& os, SymInt s) { + os << "SymInt(" << s.data() << ")"; + return os; +} + +std::ostream& operator<<(std::ostream& out, const c10::SymIntArrayRef& list) { + return out << list.wrapped_symint_array_ref; +} + +} // namespace c10 diff --git a/c10/core/SymIntArrayRef.h b/c10/core/SymIntArrayRef.h new file mode 100644 index 000000000000..f7d3367dbd95 --- /dev/null +++ b/c10/core/SymIntArrayRef.h @@ -0,0 +1,183 @@ +// This file defines `SymIntArrayRef` which serves as the view onto +// std::vector. This class is conceptually and mostly functionally +// equivalent to ArrayRef. +// +// However, ArrayRef can't be used directly as it introduces ambiguity +// in the following cases: +// - a.expand({1, 2, 3}) matches two overloads: +// 1. `at::Tensor Tensor::expand(c10::SymIntArrayRef size, bool implicit)` +// 2. `at::Tensor Tensor::expand(at::IntArrayRef size, bool implicit)` +// Introducing `SymIntArrayRef` allows to have a finer-grained control over +// which overload will be used. + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include + +namespace c10 { +/// SymIntArrayRef - Represent a constant reference to an array (0 or more +/// elements consecutively in memory), i.e. a start pointer and a length. It +/// allows various APIs to take consecutive elements easily and conveniently. +/// +/// This class does not own the underlying data, it is expected to be used in +/// situations where the data resides in some other buffer, whose lifetime +/// extends past that of the SymIntArrayRef. For this reason, it is not in +/// general safe to store an SymIntArrayRef. +/// +/// This is intended to be trivially copyable, so it should be passed by +/// value. + +class SymIntArrayRef final { + public: + using iterator = const c10::SymInt*; + using const_iterator = const c10::SymInt*; + using size_type = size_t; + using value_type = c10::SymInt; + + using reverse_iterator = std::reverse_iterator; + + private: + ArrayRef wrapped_symint_array_ref; + + public: + /// @name Constructors + /// @{ + + /// Construct an empty SymIntArrayRef. 
+ /* implicit */ constexpr SymIntArrayRef() {} + + /* implicit */ SymIntArrayRef(const std::vector& Vec) + : wrapped_symint_array_ref(Vec) {} + + /// Construct an SymIntArrayRef from a pointer and length. + C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA SymIntArrayRef( + const c10::SymInt* data, + size_t length) + : wrapped_symint_array_ref(data, length) {} + + /// Construct an SymIntArrayRef from a range. + C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA SymIntArrayRef( + const c10::SymInt* begin, + const c10::SymInt* end) + : wrapped_symint_array_ref(begin, end) {} + + /// Construct an SymIntArrayRef from a C array. + template + /* implicit */ constexpr SymIntArrayRef(const c10::SymInt (&Arr)[N]) + : wrapped_symint_array_ref(Arr) {} + + /// @} + /// @name Simple Operations + /// @{ + + constexpr iterator begin() const { + return wrapped_symint_array_ref.begin(); + } + constexpr iterator end() const { + return wrapped_symint_array_ref.end(); + } + + // These are actually the same as iterator, since SymIntArrayRef only + // gives you const iterators. + constexpr const_iterator cbegin() const { + return wrapped_symint_array_ref.cbegin(); + } + constexpr const_iterator cend() const { + return wrapped_symint_array_ref.cend(); + } + + /// empty - Check if the array is empty. + constexpr bool empty() const { + return size() == 0; + } + + constexpr const c10::SymInt* data() const { + return wrapped_symint_array_ref.data(); + } + + /// size - Get the array size. + constexpr size_t size() const { + return wrapped_symint_array_ref.size(); + } + + /// front - Get the first element. + C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA const c10::SymInt& front() const { + return wrapped_symint_array_ref.front(); + } + + /// back - Get the last element. + C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA const c10::SymInt& back() const { + return wrapped_symint_array_ref.back(); + } + + /// equals - Check for element-wise equality. + constexpr bool equals(SymIntArrayRef RHS) const { + return this->wrapped_symint_array_ref.equals(RHS.wrapped_symint_array_ref); + } + + /// slice(n, m) - Take M elements of the array starting at element N + C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA SymIntArrayRef + slice(size_t N, size_t M) const { + return SymIntArrayRef(wrapped_symint_array_ref.data() + N, M); + } + + /// slice(n) - Chop off the first N elements of the array. + C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA SymIntArrayRef slice(size_t N) const { + return slice(N, size() - N); + } + + /// @} + /// @name Operator Overloads + /// @{ + constexpr const c10::SymInt& operator[](size_t Index) const { + return wrapped_symint_array_ref[Index]; + } + + /// Vector compatibility + C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA const c10::SymInt& at(size_t Index) const { + return wrapped_symint_array_ref.at(Index); + } + + /// Disallow accidental assignment from a temporary. + /// + /// The declaration here is extra complicated so that "arrayRef = {}" + /// continues to select the move assignment operator. + template + typename std::enable_if::value, SymIntArrayRef>:: + type& + operator=(U&& Temporary) = delete; + + /// Disallow accidental assignment from a temporary. + /// + /// The declaration here is extra complicated so that "arrayRef = {}" + /// continues to select the move assignment operator. 
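// A small usage sketch for this view class; the free function name is
// illustrative, and the vector must outlive the view, as documented above.
#include <c10/core/SymIntArrayRef.h>
#include <vector>

void symint_array_ref_example() {
  // A non-owning view over concrete (non-symbolic) sizes...
  std::vector<c10::SymInt> sizes = {c10::SymInt(2), c10::SymInt(3)};
  c10::SymIntArrayRef ref(sizes);
  // ...converts back to a plain IntArrayRef; expectIntArrayRef checks that no
  // element is symbolic before reinterpreting the underlying storage.
  auto concrete = c10::expectIntArrayRef(ref);
  (void)concrete;
}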
+ template + typename std::enable_if::value, SymIntArrayRef>:: + type& + operator=(std::initializer_list) = delete; + + /// @} + /// @name Expensive Operations + /// @{ + std::vector vec() const { + return wrapped_symint_array_ref.vec(); + } + + friend std::ostream& operator<<( + std::ostream& out, + const SymIntArrayRef& list); + /// @} +}; + +TORCH_API at::IntArrayRef expectIntArrayRef(c10::SymIntArrayRef ar); + +std::ostream& operator<<(std::ostream& out, const c10::SymIntArrayRef& list); + +} // namespace c10 diff --git a/c10/core/SymIntTable.cpp b/c10/core/SymIntTable.cpp new file mode 100644 index 000000000000..c124ed737c25 --- /dev/null +++ b/c10/core/SymIntTable.cpp @@ -0,0 +1,28 @@ +#include + +namespace c10 { + +int64_t SymIntTable::addNode(std::shared_ptr sin) { + std::lock_guard lock(mutex_); + auto index = nodes_.size(); + nodes_.push_back(sin); + return index; +} +std::shared_ptr SymIntTable::getNode(size_t index) { + std::lock_guard lock(mutex_); + TORCH_CHECK(index < nodes_.size()); + return nodes_[index]; +} + +c10::SymInt SymbolicIntNode::toSymInt() { + // We will need to figure out a way + // to dedup nodes + auto sit_sp = this->shared_from_this(); + return SymInt::toSymInt(sit_sp); +} + +SymIntTable& getSymIntTable() { + static SymIntTable sit; + return sit; +} +} // namespace c10 diff --git a/c10/core/SymbolicIntNode.h b/c10/core/SymbolicIntNode.h new file mode 100644 index 000000000000..cf8fb4de6abf --- /dev/null +++ b/c10/core/SymbolicIntNode.h @@ -0,0 +1,33 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace c10 { + +class C10_API SymbolicIntNode + : public std::enable_shared_from_this { + public: + c10::SymInt toSymInt(); + virtual ~SymbolicIntNode(){}; + virtual std::ostream& operator<<(std::ostream& os) { + return os; + }; +}; + +class C10_API SymIntTable { + public: + int64_t addNode(std::shared_ptr sin); + std::shared_ptr getNode(size_t index); + + private: + std::vector> nodes_; + std::mutex mutex_; +}; + +C10_API SymIntTable& getSymIntTable(); + +} // namespace c10 diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index b83ee395045e..272425d8855e 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -20,43 +20,6 @@ C10_DEFINE_int64( namespace c10 { -namespace impl { - -static std::string noop_name_fn(const PyInterpreter*) { - return ""; -} - -static void noop_decref_fn(const PyInterpreter*, PyObject*, bool) { - // no-op -} - -static c10::intrusive_ptr noop_detach_fn( - const PyInterpreter*, - const TensorImpl*) { - TORCH_INTERNAL_ASSERT( - 0, - "attempted to detach (shallow_copy_and_detach) Tensor with nontrivial PyObject after corresponding interpreter died"); -} - -static void noop_dispatch_fn( - const PyInterpreter*, - const c10::OperatorHandle& op, - torch::jit::Stack* stack, - const std::shared_ptr& type) { - TORCH_INTERNAL_ASSERT( - 0, - "attempted to dispatch (__torch_dispatch__) an operator on Tensor with nontrivial PyObject after corresponding interpreter died"); -} - -void PyInterpreter::disarm() noexcept { - name_fn_ = &noop_name_fn; - decref_fn_ = &noop_decref_fn; - detach_fn_ = &noop_detach_fn; - dispatch_fn_ = &noop_dispatch_fn; -} - -} // namespace impl - const char* const TensorImpl::err_msg_tensor_metadata_change_not_allowed = "is not allowed on a Tensor created from .data or .detach().\n" "If your intent is to change the metadata of a Tensor (such as sizes / strides / storage / storage_offset)\n" @@ -120,11 +83,11 @@ TensorImpl::TensorImpl( // [Note: Python key removal] // 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -// In most constructors for TensorImpl, you will see Python key is removed from -// the passed in DispatchKeySet. Why? +// In most constructors for TensorImpl, you will see Python and +// PythonTLSSnapshot keys are removed from the passed in DispatchKeySet. Why? // -// INVARIANT: Python dispatch key is set iff PyObject for the Tensor has a -// nontrivial __torch_dispatch__ implementation. +// INVARIANT: Python and PythonTLSSnapshot dispatch keys are set iff PyObject +// for the Tensor has a nontrivial __torch_dispatch__ implementation. // // When a fresh TensorImpl is created, there is *no* PyObject (this only gets // initialized lazily at the first point in time the Tensor passes into Python). @@ -132,8 +95,8 @@ TensorImpl::TensorImpl( // // In practice, what will happen shortly afterwards is that the TensorImpl // will get its PyObject initialized by Tensor._make_subclass; at this point -// the Python dispatch key will be set and all is well. The point is to delay -// the dispatch key setting until that point. +// the Python and PythonTLSSnapshot dispatch keys will be set and all is well. +// The point is to delay the dispatch key setting until that point. // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) TensorImpl::TensorImpl( @@ -148,8 +111,7 @@ TensorImpl::TensorImpl( numel_(0), data_type_(data_type), device_opt_(storage_.device()), - key_set_(key_set.remove( - DispatchKey::Python)) { // See [Note: Python key removal] + key_set_(key_set - c10::python_ks) { // See [Note: Python key removal] init_bitfields(); // Inference tensor doesn't have version counter. if (!is_inference()) { @@ -190,12 +152,12 @@ TensorImpl::TensorImpl( // TODO: be more explicit about the full key set at call sites so we // don't have to keep recomputing it here - DispatchKey k = key_set.highestPriorityBackendTypeId(); + auto k = key_set.highestBackendKey(); key_set = key_set | getAutocastRelatedKeySetFromBackend(k); - key_set = - key_set.remove(DispatchKey::Python); // See [Note: Python key removal] + // See [Note: Python key removal] + key_set = key_set - c10::python_ks; // Inference tensor doesn't have autograd related keys. if (inference_mode) { @@ -219,16 +181,6 @@ TensorImpl::TensorImpl( // Caffe2 operators create Storages with default devices. } -#ifndef C10_DISABLE_TENSORIMPL_EXTENSIBILITY -IntArrayRef TensorImpl::sizes() const { - return sizes_and_strides_.sizes_arrayref(); -} -#endif - -IntArrayRef TensorImpl::strides() const { - return sizes_and_strides_.strides_arrayref(); -} - void TensorImpl::HandleResize() { // If needed, we will free the data. the next mutable_data() call // will create the data storage. 
@@ -371,11 +323,11 @@ void TensorImpl::release_resources() { if (storage_) { storage_ = {}; } - if (owns_pyobj_) { + if (owns_pyobj()) { TORCH_INTERNAL_ASSERT(pyobj_interpreter_ != nullptr); TORCH_INTERNAL_ASSERT(pyobj_ != nullptr); pyobj_interpreter_.load(std::memory_order_acquire) - ->decref(pyobj_, /*is_tensor*/ true); + ->decref(_unchecked_untagged_pyobj(), /*is_tensor*/ true); // NB: this destructor can only be entered when there are no // references to this C++ object (obviously), NOR any references // to the PyObject (if there are references to the PyObject, @@ -386,22 +338,6 @@ void TensorImpl::release_resources() { } } -#ifndef C10_DISABLE_TENSORIMPL_EXTENSIBILITY -int64_t TensorImpl::dim() const { - return sizes_and_strides_.size(); -} -#endif - -int64_t TensorImpl::size(int64_t d) const { - d = at::maybe_wrap_dim(d, dim(), false); - return sizes_and_strides_.size_at_unchecked(d); -} - -int64_t TensorImpl::stride(int64_t d) const { - d = at::maybe_wrap_dim(d, dim(), false); - return sizes_and_strides_.stride_at_unchecked(d); -} - #ifndef C10_DISABLE_TENSORIMPL_EXTENSIBILITY bool TensorImpl::has_storage() const { return storage_; @@ -413,28 +349,42 @@ void TensorImpl::throw_storage_access_error() const { false, "Cannot access storage of ", tensorimpl_type_name()); } -bool TensorImpl::is_contiguous_nondefault_policy_impl( - at::MemoryFormat memory_format) const { - if (has_contiguity_ == - static_cast(HasContiguityPolicy::ContiguityNotSupported)) { - TORCH_CHECK_NOT_IMPLEMENTED( +bool TensorImpl::is_contiguous_custom(at::MemoryFormat memory_format) const { + if (is_python_dispatch()) { + auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire); + if (interpreter) { + return interpreter->is_contiguous(this); + } + TORCH_CHECK( false, - "Tensors of type ", - tensorimpl_type_name(), - " do not have is_contiguous"); - } else { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - has_contiguity_ == - static_cast(HasContiguityPolicy::CustomBehavior)); - return is_contiguous_custom(memory_format); + "cannot access PyObject for Tensor on interpreter ", + pyobj_interpreter_.load()->name()); } + TORCH_CHECK( + false, + "Tensors of type ", + tensorimpl_type_name(), + " do not have is_contiguous"); } -bool TensorImpl::is_contiguous_custom(at::MemoryFormat memory_format) const { - TORCH_INTERNAL_ASSERT( +IntArrayRef TensorImpl::sizes_custom() const { + TORCH_CHECK( + false, "Tensors of type ", tensorimpl_type_name(), " do not have sizes"); +} +IntArrayRef TensorImpl::strides_custom() const { + TORCH_CHECK( false, - "TensorImpl::is_contiguous_custom should never be called; did you " - "set_has_contiguity_policy and forget to override is_contiguous_custom?"); + "Tensors of type ", + tensorimpl_type_name(), + " do not have strides"); +} +int64_t TensorImpl::dim_custom() const { + TORCH_CHECK( + false, "Tensors of type ", tensorimpl_type_name(), " do not have dim"); +} +int64_t TensorImpl::numel_custom() const { + TORCH_CHECK( + false, "Tensors of type ", tensorimpl_type_name(), " do not have numel"); } static void deletePlacementDeleteContext(void* ptr) { @@ -544,18 +494,25 @@ c10::intrusive_ptr TensorImpl::shallow_copy_and_detach( std::move(version_counter), allow_tensor_metadata_change); } -void TensorImpl::copy_tensor_metadata_except_version_counter( +// This function copies all of the metadata from the src tensor except for: +// - key_set_ +// - storage_ +// - storage_access_should_throw_ +// - sizes_strides_policy_ +// - version_counter_ +// - allow_tensor_metadata_change_ +// The idea is that 
if we have a "wrapper tensor" (like in functionalization), +// all of the above are properties that the wrapper will want to customize, +// while everything else should be mirrored between the wrapper and the inner +// tensor. +void TensorImpl::copy_generic_tensor_metadata( const TensorImpl* src_impl, - TensorImpl* dest_impl, - bool allow_tensor_metadata_change) { - dest_impl->storage_ = src_impl->storage_; + TensorImpl* dest_impl) { dest_impl->sizes_and_strides_ = src_impl->sizes_and_strides_; dest_impl->storage_offset_ = src_impl->storage_offset_; dest_impl->data_type_ = src_impl->data_type_; dest_impl->device_opt_ = src_impl->device_opt_; - dest_impl->key_set_ = src_impl->key_set_.remove(DispatchKey::Python); dest_impl->is_contiguous_ = src_impl->is_contiguous_; - dest_impl->has_contiguity_ = src_impl->has_contiguity_; dest_impl->is_channels_last_contiguous_ = src_impl->is_channels_last_contiguous_; dest_impl->is_channels_last_3d_contiguous_ = @@ -566,14 +523,32 @@ void TensorImpl::copy_tensor_metadata_except_version_counter( src_impl->is_non_overlapping_and_dense_; dest_impl->is_wrapped_number_ = src_impl->is_wrapped_number_; dest_impl->reserved_ = src_impl->reserved_; - dest_impl->set_allow_tensor_metadata_change(allow_tensor_metadata_change); - dest_impl->storage_access_should_throw_ = - src_impl->storage_access_should_throw_; if (src_impl->named_tensor_meta_ != nullptr) { dest_impl->named_tensor_meta_ = src_impl->named_tensor_meta_->clone(); } } +void TensorImpl::copy_tensor_metadata_except_version_counter( + const TensorImpl* src_impl, + TensorImpl* dest_impl, + bool allow_tensor_metadata_change) { + // First call the generic copy function + copy_generic_tensor_metadata(src_impl, dest_impl); + // Then copy everything else (see the comment at copy_generic_tensor_metadata + // for the list of metadata that it does not directly copy). 
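As a hedged sketch of how this split might be consumed, a wrapper-style subclass (the class name and method below are hypothetical, not part of this patch) can reuse copy_generic_tensor_metadata() to mirror the inner tensor while keeping its own storage_, key_set_ and policy bits:

#include <c10/core/TensorImpl.h>

// Hypothetical wrapper subclass; only meant to illustrate which metadata the
// new helper copies and which it deliberately leaves alone.
struct WrapperTensorImpl : public c10::TensorImpl {
  using TensorImpl::TensorImpl;

  void refresh_from_inner(const c10::TensorImpl* inner) {
    // Copies sizes/strides, dtype, device, contiguity flags, names, ... but
    // not key_set_, storage_, the version counter, or the sizes/strides
    // policy -- those stay owned by the wrapper.
    copy_generic_tensor_metadata(/*src_impl=*/inner, /*dest_impl=*/this);
  }
};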
+ dest_impl->storage_ = src_impl->storage_; + // Copying tensor metadata doesn't change the PyObject (maybe + // it should), which means that we have to preserve whatever the + // original Python keyset was (as it's associated with the PyObject + // being a tensor subclass or not) + dest_impl->key_set_ = (src_impl->key_set_ - c10::python_ks) | + (dest_impl->key_set_ & c10::python_ks); + dest_impl->set_allow_tensor_metadata_change(allow_tensor_metadata_change); + dest_impl->sizes_strides_policy_ = src_impl->sizes_strides_policy_; + dest_impl->storage_access_should_throw_ = + src_impl->storage_access_should_throw_; +} + void TensorImpl::copy_tensor_metadata( const TensorImpl* src_impl, TensorImpl* dest_impl, @@ -601,21 +576,178 @@ void TensorImpl::copy_tensor_metadata( } } -TorchDispatchTypeObject::TorchDispatchTypeObject( - PyObject* type_object, - c10::impl::PyInterpreter* pyinterpreter) - : data_(type_object), pyinterpreter_(pyinterpreter) {} +// Legacy Caffe2 operations -TorchDispatchTypeObject::~TorchDispatchTypeObject() { - pyinterpreter_->decref(data_, /*is_tensor*/ false); -} - -c10::impl::PyInterpreter* TorchDispatchTypeObject::pyinterpreter() const { - return pyinterpreter_; +void TensorImpl::Extend(int64_t num, float growthPct) { + TORCH_CHECK(sizes_and_strides_.size() >= 1u); + TORCH_CHECK(num >= 0, "`num` must be non-negative for Extend"); + TORCH_CHECK( + is_contiguous_, + "Right now Extend is only supported for contiguous Tensor."); + using SizesVector = SmallVector; + SizesVector newDims( + sizes_and_strides_.sizes_begin(), sizes_and_strides_.sizes_end()); + newDims[0] += num; + if (!storage_.data()) { + Resize(newDims); + return; + } + const auto newNumel = c10::multiply_integers(newDims.begin(), newDims.end()); + if (newNumel * data_type_.itemsize() <= storage_.nbytes()) { + sizes_and_strides_.set_sizes(newDims); + numel_ = newNumel; + return; + } + SizesVector newCapacity( + sizes_and_strides_.sizes_begin(), sizes_and_strides_.sizes_end()); + newCapacity[0] = std::max( + newDims[0], + static_cast(std::ceil( + sizes_and_strides_.size_at_unchecked(0) * (1 + growthPct / 100)))); + auto oldData = std::move(storage_.data_ptr()); + auto oldSize = numel_; + Resize(newCapacity); + auto* newData = raw_mutable_data(data_type_); + if (data_type_.copy()) { + TORCH_CHECK( + device_type() == DeviceType::CPU, "non-POD types work only on CPU"); + data_type_.copy()(oldData.get(), newData, oldSize); + } else { + // The following copy uses the current (thread local) stream for copying + // and also takes the GPU id from the device() field passed in. + // + // TODO: Potentially more enforcements are necessary to avoid accidental + // switch to sync copy if the currently set device is wrong. + // + // Specifically, we might need to switch to a different context device + // here explicitly to avoid relying on user synchronizing things + // properly. + CopyBytes( + oldSize * itemsize(), + oldData.get(), + device(), + newData, + device(), + true); // non-blocking + } + reserved_ = true; + sizes_and_strides_.set_sizes(newDims); + numel_ = newNumel; } -PyObject* TorchDispatchTypeObject::ptr() const { - return data_; +void TensorImpl::ReserveSpace(int64_t outer_dim) { + TORCH_CHECK( + is_contiguous_, + "Right now ReserveSpace is only supported for contiguous Tensor."); + TORCH_CHECK(storage_.unique(), "Can't call ReserveSpace on shared storage."); + // TODO: eliminate newCapacity. 
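A small worked example of the Extend() growth rule above (the numbers are illustrative):

// Suppose the outer dimension is currently 100, growthPct = 40, and the
// caller invokes Extend(/*num=*/5):
//   newDims[0]     = 100 + 5                            = 105
//   newCapacity[0] = max(105, ceil(100 * (1 + 40/100))) = 140
// Storage is resized to 140 rows, the old rows are copied over, and the
// visible sizes are then set back to 105 with reserved_ = true, leaving
// headroom so the next few Extend() calls need no reallocation.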
+ SmallVector newCapacity( + sizes_and_strides_.sizes_begin(), sizes_and_strides_.sizes_end()); + newCapacity[0] = outer_dim; + auto newNumel = c10::multiply_integers(newCapacity); + if (newNumel * data_type_.itemsize() <= storage_.nbytes()) { + return; + } + // Old data is discarded + storage_.data_ptr().clear(); + auto oldSize = numel_; + SmallVector oldDims( + sizes_and_strides_.sizes_begin(), sizes_and_strides_.sizes_end()); + Resize(newCapacity); + // Allocate new memory but don't copy over the data + raw_mutable_data(data_type_); + sizes_and_strides_.set_sizes(oldDims); + numel_ = oldSize; + reserved_ = true; +} + +void TensorImpl::Reshape(const std::vector& dims) { + TORCH_CHECK( + is_contiguous_, + "Right now Reshape is only supported for contiguous Tensor."); + int64_t new_size = 1; + for (auto d : dims) { + TORCH_CHECK(d >= 0); + new_size *= d; + } + TORCH_CHECK( + new_size == numel_, + "New size and old size are not equal. You cannot use Reshape, " + "but should use Resize." + // TODO(jiayq): remove the following warning after pending diffs + // stabilize. + " The old caffe2 mixes Reshape and Resize but this behavior has " + "been changed. If you find this error, most likely you will need " + "to change corresponding code from Reshape to Resize."); + sizes_and_strides_.set_sizes(dims); + empty_tensor_restride(MemoryFormat::Contiguous); +} + +void TensorImpl::FreeMemory() { + // We'll detach from the old Storage and create a new one + storage_ = Storage::create_legacy(storage_.device()); + storage_offset_ = 0; +} + +void TensorImpl::ShareData(const TensorImpl& src) { + // Right now, we are assuming the device_type are the same, since it is + // inherently the same in the non-templatized code. We should probably add + // an assert here which might affect perf a little bit. + TORCH_CHECK( + src.numel_ == numel_, + "Size mismatch - did you call reshape before sharing the data?"); + // It is possible that the source tensor hasn't called mutable_data() yet, + // in which case ShareData() doesn't make much sense since we don't really + // know what to share yet. + // TODO: Add the assert after all uninitialized states are eliminated + // TORCH_CHECK(src.dtype_initialized(), + // "Source tensor don't have a data type (did you call + // mutable_data on the tensor?)"); + if (!src.dtype_initialized()) { + C10_LOG_EVERY_MS(WARNING, 1000) + << "Source tensor don't have a data type (did you call mutable_data on the tensor?)"; + } + TORCH_CHECK( + src.storage_initialized(), + "Source tensor has no content and has size > 0"); + // Finally, do sharing. 
+ /* Since we create new Storage whenever we need to change data_type/nbytes + * this still keeps the original semantics + */ + storage_ = src.storage(); + data_type_ = src.dtype(); + device_opt_ = src.device_opt(); + storage_offset_ = src.storage_offset(); +} + +void TensorImpl::ShareExternalPointer( + DataPtr&& data_ptr, + const caffe2::TypeMeta data_type, + size_t size_bytes) { + TORCH_CHECK( + data_type != ScalarType::Undefined, + "To share with a raw external pointer you need to pass in an " + "initialized data_type(TypeMeta)."); + if (!size_bytes) { + size_bytes = numel_ * data_type.itemsize(); + } + if (storage_.unique()) { + storage_.UniqueStorageShareExternalPointer(std::move(data_ptr), size_bytes); + data_type_ = data_type; + device_opt_ = storage_.device(); + storage_offset_ = 0; + } else { + // Create a new Storage + storage_ = Storage( + Storage::use_byte_size_t(), + size_bytes, + std::move(data_ptr), + /*allocator=*/nullptr, + /*resizable=*/false); + data_type_ = data_type; + device_opt_ = storage_.device(); + storage_offset_ = 0; + } } namespace impl { diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index 86aca278c9d3..717f066e4127 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -6,8 +6,11 @@ #include #include #include +#include #include +#include #include +#include #include #include #include @@ -16,9 +19,11 @@ #include #include #include +#include #include #include +#include #include #include @@ -49,17 +54,9 @@ class TensorBase; namespace c10 { class Scalar; -struct IValue; struct Storage; -class OperatorHandle; } // namespace c10 -namespace torch { -namespace jit { -using Stack = std::vector; -} -} // namespace torch - namespace c10 { /** @@ -92,7 +89,7 @@ inline int64_t size_to_dim_(int k, IntArrayRef dims) { // Product of all dims between k and l (not including dims[k] and dims[l]) inline int64_t size_between_dim_(int k, int l, IntArrayRef dims) { - TORCH_CHECK((unsigned)l < dims.size()); + TORCH_CHECK((unsigned)l < dims.size() && (unsigned)k < dims.size()); int64_t r = 1; if (k < l) { for (int i = k + 1; i < l; ++i) { @@ -168,9 +165,6 @@ struct C10_API AutogradMetaInterface { virtual ~AutogradMetaInterface(); }; -// forward declared -struct TorchDispatchTypeObject; - namespace impl { // Unfortunately, the definition of AutogradMeta lives in a separate @@ -196,137 +190,6 @@ struct C10_API AutogradMetaFactoryRegisterer { } }; -// Note [Python interpreter tag] -// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -// We store a PyObject on TensorImpl so that we can efficiently translate -// tensors into the Python representations. However, in some situations -// (torchdeploy) there may be multiple Python interpreters in a single process -// and we must take care not to accidentally mix up PyObjects with the wrong -// interpreters. Thus, we also tag every TensorImpl with the Python interpreter -// it corresponds to. -// -// With torchdeploy, we have these invariants: -// - Any given TensorImpl can be associated with AT MOST one Python -// interpreter. -// We represent the interpreter tag as a memory address to an instance of -// a virtual class that is allocated once per interpreter (this is so that -// we can request the interpreter to perform operations for us, if -// necessary). 
-// - A given TensorImpl's interpreter tag can only go from uninitialized to -// tagged; once tagged, this is a quiescent state (once tagged to an -// interpreter, ALWAYS tagged to that interpreter) -// - A thread may mutate the PyObject field of a TensorImpl if and only if it -// holds the GIL for the interpreter tagged on the TensorImpl. (If the -// TensorImpl is not tagged, it must first atomically claim its tag before it -// can validly write) - -// The PyInterpreter object itself is a class that contains some function -// pointers for interacting with the interpreter. For now this is just for -// debugging, but if a Tensor can own a PyObject, the interpreter can be used to -// free it. -// -// WARNING: This class has to be written very carefully, because it may be -// possible for a Tensor to have a reference an interpreter corresponding to -// a shared library that has ALREADY BEEN UNLOADED. This makes blindly calling -// virtual methods very dangerous, because the vtable may be garbage at that -// point (on a good day, you might get "pure virtual method called"). -// -// The idea to solve this problem is we always leak PyInterpreters (so they -// always stay live even after dlclose), and disarm the "virtual methods" by -// replacing them with function pointers that just no-op. This can't be done -// with a traditional C++ vtable, so we have to roll our own. -// -// NB: The downside with representing PyInterpreter tags as full objects is that -// it takes an extra word on TensorImpl. If tags were instead just integer -// indices, on 64-bit architectures we could pack the tag and PyObject together -// into a single atomic word. On 32-bit architectures we could simply say that -// only one Python interpreter is supported (erroring if a nontrivial -// interpreter tag is attempted to be set). -// -// The difficulty with this scheme is we need to maintain an out-of-line table -// to get at the PyInterpreters so that we can do virtual method calls on them, -// and registration/deregistration to this table must be done in a thread safe -// manner. This can be easily done if the number of possible PyInterpreters is -// small enough (e.g., 8-bit integer) by simply preallocating an array of -// sufficient size to hold all possible interpreters. Surely 128 threads is -// more than enough for anyone! -// -// I didn't decide to do this technique at the moment, because the extra word -// added by the PyInterpreter tag takes us to 24 words, which means that we -// still fit inside three eight word cache lines. If you need to penny pinch -// another word consider doing this! 
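A minimal, generic sketch of the "leak the object, disarm the function pointers" pattern the removed note describes (the PyInterpreter struct itself presumably moves to a dedicated header; the names below are illustrative only):

// Hand-rolled "vtable" of plain function pointers: after the providing
// library is dlclose'd, disarm() swaps in no-ops so stale callers never
// jump through a dangling C++ vtable.
struct LeakyVtable {
  using name_sig = const char*(const LeakyVtable*);

  explicit LeakyVtable(name_sig* name_fn) : name_fn_(name_fn) {}

  const char* name() const {
    return (*name_fn_)(this);
  }

  void disarm() noexcept {
    name_fn_ = &noop_name;
  }

 private:
  static const char* noop_name(const LeakyVtable*) {
    return "<disarmed>";
  }
  name_sig* name_fn_;
};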
- -struct PyInterpreter; -struct C10_API PyInterpreter { - using name_sig = std::string(const PyInterpreter*); - using decref_sig = void(const PyInterpreter*, PyObject*, bool); - using detach_sig = - c10::intrusive_ptr(const PyInterpreter*, const TensorImpl*); - using dispatch_sig = void( - const PyInterpreter*, - const c10::OperatorHandle&, - torch::jit::Stack* stack, - const std::shared_ptr& type); - - PyInterpreter( - name_sig* name_fn, - decref_sig* decref_fn, - detach_sig* detach, - dispatch_sig* dispatch) - : name_fn_(name_fn), - decref_fn_(decref_fn), - detach_fn_(detach), - dispatch_fn_(dispatch) {} - - name_sig* name_fn_; - decref_sig* decref_fn_; - detach_sig* detach_fn_; - dispatch_sig* dispatch_fn_; - - // UBSAN suppression fixes: "call to function - // (anonymous namespace)::concrete_decref_fn(c10::impl::PyInterpreter const*, - // _object*) through pointer to incorrect function type 'void (*)(const - // c10::impl::PyInterpreter *, _object *)'" See - // https://github.com/google/sanitizers/issues/911 - - // Report the name of this interpreter - __ubsan_ignore_function__ std::string name() const { - return (*name_fn_)(this); - } - - // Run Py_DECREF on a PyObject. We DO NOT assume the GIL is held on call - // See NOTE [PyInterpreter::decref takes an `is_tensor` arg] - __ubsan_ignore_function__ void decref(PyObject* pyobj, bool is_tensor) const { - return (*decref_fn_)(this, pyobj, is_tensor); - } - - // Perform a detach by deferring to the __torch_dispatch__ implementation of - // detach, which will also arrange for the PyObject to get copied in this - // situation - __ubsan_ignore_function__ c10::intrusive_ptr detach( - const TensorImpl* self) const { - return (*detach_fn_)(this, self); - } - - // Invoke the Python boxed fallback dispatch to go back into Python - __ubsan_ignore_function__ void dispatch( - const c10::OperatorHandle& op, - torch::jit::Stack* stack, - const std::shared_ptr& type) const { - return (*dispatch_fn_)(this, op, stack, type); - } - - // Disarm this PyInterpreter, making all of its methods noops. - // Because the function pointers are raw pointers (not atomics), - // a disarm() invocation that is concurrent with active destructors - // is not thread safe and will trigger TSAN. My hope is that this - // situations doesn't ever actually happen; tensor destruction should - // quiesce when a dlclose happens, and any long lived tensors whose - // destructors would be disarmed here only begin the destruction process - // on process shutdown (long after the dlclose has occurred). - void disarm() noexcept; -}; - // PyInterpreterStatus describes what the state of its interpreter tag // is, relative to the thread currently holding the GIL. enum class PyInterpreterStatus { @@ -361,30 +224,6 @@ struct C10_API NamedTensorMetaInterface { }; }; -// NOTE [What is TorchDispatchTypeObject?] -// A TorchDispatchTypeObject represents the type of a Tensor subclass that has -// a __torch_dispatch__ classmethod. Concretely, it holds the class as a -// PyObject* and a PyInterpreter* that says which python interpreter the class -// came from. 
-// -// See NOTE [dispatch_fn's type argument] for more details -struct C10_API TorchDispatchTypeObject { - // Steals a reference to type_object - TorchDispatchTypeObject( - PyObject* type_object, - c10::impl::PyInterpreter* pyinterpreter); - - // Releases the stolen reference to type_object - ~TorchDispatchTypeObject(); - - c10::impl::PyInterpreter* pyinterpreter() const; - PyObject* ptr() const; - - private: - PyObject* data_; - c10::impl::PyInterpreter* pyinterpreter_; -}; - // NOTE [ Version Counter Sharing ] // // Every Tensor has a version counter. Version counters are incremented whenever @@ -700,34 +539,153 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * Return a reference to the sizes of this tensor. This reference remains * valid as long as the tensor is live and not resized. */ - TENSORIMPL_MAYBE_VIRTUAL IntArrayRef sizes() const -#ifdef C10_DISABLE_TENSORIMPL_EXTENSIBILITY - { - return sizes_and_strides_.sizes_arrayref(); + IntArrayRef sizes() const { + if (C10_UNLIKELY( + sizes_strides_policy_ >= + static_cast(SizesStridesPolicy::CustomSizes))) { + return sizes_custom(); + } + return sizes_default(); } -#else - ; -#endif /** * Return a reference to the strides of this tensor. This reference remains * valid as long as the tensor is live and not restrided. */ - virtual IntArrayRef strides() const; + IntArrayRef strides() const { + if (C10_UNLIKELY( + sizes_strides_policy_ >= + static_cast(SizesStridesPolicy::CustomStrides))) { + return strides_custom(); + } + return strides_default(); + } + + /** + * Return the size of a tensor at some dimension, wrapping the dimension if + * necessary. + * + * NOTE: if you know wrapping is unnecessary, do sizes()[d] instead; it will + * be faster + */ + int64_t size(int64_t d) const { + d = maybe_wrap_dim(d, dim(), false); + if (C10_UNLIKELY( + sizes_strides_policy_ >= + static_cast(SizesStridesPolicy::CustomSizes))) { + return sizes_custom()[d]; // unchecked (maybe_wrap_dim enforces bounds) + } + return sizes_and_strides_.size_at_unchecked(d); + } + + /** + * Return the stride of a tensor at some dimension, wrapping the dimension + * if necessary. + * + * NOTE: if you know wrapping is unnecessary, do sizes()[d] instead; it will + * be faster + */ + int64_t stride(int64_t d) const { + d = maybe_wrap_dim(d, dim(), false); + if (C10_UNLIKELY( + sizes_strides_policy_ >= + static_cast(SizesStridesPolicy::CustomStrides))) { + return strides_custom()[d]; // unchecked (maybe_wrap_dim enforces bounds) + } + return sizes_and_strides_.stride_at_unchecked(d); + } /** * Return the number of dimensions of this tensor. Note that 0-dimension * represents a Tensor that is a Scalar, e.g., one that has a single element. */ - TENSORIMPL_MAYBE_VIRTUAL int64_t dim() const -#ifdef C10_DISABLE_TENSORIMPL_EXTENSIBILITY - { + int64_t dim() const { + if (C10_UNLIKELY( + sizes_strides_policy_ >= + static_cast(SizesStridesPolicy::CustomSizes))) { + return dim_custom(); + } + return dim_default(); + } + + /** + * The number of elements in a tensor. + * + * WARNING: Previously, if you were using the Caffe2 API, you could + * test numel() == -1 to see if a tensor was uninitialized. This + * is no longer true; numel always accurately reports the product + * of sizes of a tensor. + */ + int64_t numel() const { + if (C10_UNLIKELY( + sizes_strides_policy_ >= + static_cast(SizesStridesPolicy::CustomSizes))) { + return numel_custom(); + } + return numel_default(); + } + + /** + * Whether or not a tensor is laid out in contiguous memory. 
+ * + * Tensors with non-trivial strides are not contiguous. See + * compute_contiguous() for the exact definition of whether or not + * a tensor is contiguous or not. + */ + bool is_contiguous( + at::MemoryFormat memory_format = at::MemoryFormat::Contiguous) const { + if (C10_UNLIKELY( + sizes_strides_policy_ >= + static_cast(SizesStridesPolicy::CustomStrides))) { + return is_contiguous_custom(memory_format); + } + return is_contiguous_default(memory_format); + } + + protected: + /** + * Customization points for the functions above. sizes_strides_policy_ + * must be set to enable these. + * + * NB: dim is overrideable separately from sizes because it is possible + * for a tensor to have rank, but not well defined sizes. + */ + // sizes_strides_policy_ >= CustomStrides + virtual IntArrayRef strides_custom() const; + virtual bool is_contiguous_custom(at::MemoryFormat memory_format) const; + // sizes_strides_policy_ >= CustomSizes + virtual IntArrayRef sizes_custom() const; + virtual int64_t dim_custom() const; + virtual int64_t numel_custom() const; + + // These are factored into separate functions in case subclasses + // want to use them + inline IntArrayRef strides_default() const { + return sizes_and_strides_.strides_arrayref(); + } + inline bool is_contiguous_default(at::MemoryFormat memory_format) const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(compute_contiguous() == is_contiguous_); + if (memory_format == at::MemoryFormat::ChannelsLast) { + return is_channels_last_contiguous_; + } else if (memory_format == at::MemoryFormat::ChannelsLast3d) { + return is_channels_last_3d_contiguous_; + } + return is_contiguous_; + } + inline IntArrayRef sizes_default() const { + return sizes_and_strides_.sizes_arrayref(); + } + inline int64_t dim_default() const { return sizes_and_strides_.size(); } -#else - ; + inline int64_t numel_default() const { +#ifdef DEBUG + TORCH_INTERNAL_ASSERT(compute_numel() == numel_); #endif + return numel_; + } + public: /** * True if this tensor has storage. See storage() for details. */ @@ -777,164 +735,125 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { return storage_; } - /** - * The number of elements in a tensor. - * - * WARNING: Previously, if you were using the Caffe2 API, you could - * test numel() == -1 to see if a tensor was uninitialized. This - * is no longer true; numel always accurately reports the product - * of sizes of a tensor. - */ - TENSORIMPL_MAYBE_VIRTUAL int64_t numel() const { -#ifdef DEBUG - TORCH_INTERNAL_ASSERT(compute_numel() == numel_); -#endif - return numel_; - } - bool unique_version() const { return version_counter_.unique(); } - /** - * Whether or not a tensor is laid out in contiguous memory. - * - * Tensors with non-trivial strides are not contiguous. See - * compute_contiguous() for the exact definition of whether or not - * a tensor is contiguous or not. - * - * NOTE: is_contiguous is only `TENSORIMPL_MAYBE_VIRTUAL` for - * backward compatibility. See `set_has_contiguity_policy` and - * `is_contiguous_custom` for the encouraged customization point. 
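A hedged sketch of a subclass opting into the new customization points (the class name and the hard-coded values are hypothetical; a real implementation would compute them from its own representation):

#include <c10/core/TensorImpl.h>

// Illustrative subclass that routes sizes()/dim()/numel() through the
// *_custom() virtuals by selecting the CustomSizes policy.
struct RaggedTensorImpl : public c10::TensorImpl {
  RaggedTensorImpl(
      c10::DispatchKeySet ks,
      const caffe2::TypeMeta data_type,
      c10::optional<c10::Device> device)
      : TensorImpl(ks, data_type, device) {
    set_sizes_strides_policy(SizesStridesPolicy::CustomSizes);
  }

 protected:
  int64_t dim_custom() const override {
    return 2; // placeholder: derive from the actual ragged representation
  }
  int64_t numel_custom() const override {
    return 0; // placeholder
  }
  // sizes_custom()/strides_custom()/is_contiguous_custom() would be
  // overridden similarly; the base versions above just raise an error.
};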
- */ - TENSORIMPL_MAYBE_VIRTUAL bool is_contiguous( - at::MemoryFormat memory_format = at::MemoryFormat::Contiguous) const { - if (C10_UNLIKELY( - has_contiguity_ != - static_cast(HasContiguityPolicy::Default))) { - return is_contiguous_nondefault_policy_impl(memory_format); - } - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(compute_contiguous() == is_contiguous_); - if (memory_format == at::MemoryFormat::ChannelsLast) { - return is_channels_last_contiguous_; - } else if (memory_format == at::MemoryFormat::ChannelsLast3d) { - return is_channels_last_3d_contiguous_; - } - return is_contiguous_; - } - - private: - bool is_contiguous_nondefault_policy_impl(at::MemoryFormat) const; - protected: - /** - * Customization point for is_contiguous; must also - * set_has_contiguity_policy(HasContiguityPolicy::Custom) for this - * to be called. - */ - virtual bool is_contiguous_custom(at::MemoryFormat memory_format) const; + virtual Layout layout_impl() const { + TORCH_CHECK( + false, "layout_impl is only implemented for TensorImpl subclasses."); + } public: + // Whether a tensor is sparse COO or not. bool is_sparse() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - return key_set_.has(DispatchKey::SparseCPU) || - key_set_.has(DispatchKey::SparseCUDA) || - key_set_.has(DispatchKey::SparseHIP) || - key_set_.has(DispatchKey::SparseXPU); + return key_set_.has_all(c10::sparse_ks); } - // Whether a tensor is sparse COO or not. Use is_sparse_csr for checking CSR - // format. + // Whether a tensor is sparse CSR or not. bool is_sparse_csr() const { - return key_set_.has(DispatchKey::SparseCsrCPU) || - key_set_.has(DispatchKey::SparseCsrCUDA); + return layout() == kSparseCsr; } bool is_quantized() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - return key_set_.has(DispatchKey::QuantizedCPU) || - key_set_.has(DispatchKey::QuantizedCUDA) || - key_set_.has(DispatchKey::QuantizedXPU); + constexpr auto quantized_ks = DispatchKeySet(DispatchKey::Quantized); + return key_set_.has_all(quantized_ks); } bool is_meta() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - return key_set_.has(DispatchKey::Meta); + constexpr auto meta_ks = DispatchKeySet(DispatchKey::Meta); + return key_set_.has_all(meta_ks); } bool is_cpu() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - return key_set_.has(DispatchKey::CPU) || - key_set_.has(DispatchKey::SparseCPU) || - key_set_.has(DispatchKey::SparseCsrCPU) || - key_set_.has(DispatchKey::QuantizedCPU) || - key_set_.has(DispatchKey::MkldnnCPU); + constexpr auto cpu_bits_ks = DispatchKeySet(BackendComponent::CPUBit) | + DispatchKeySet({DispatchKey::SparseCsrCPU, DispatchKey::MkldnnCPU}); + return key_set_.has_any(cpu_bits_ks); } bool is_cuda() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - return key_set_.has(DispatchKey::CUDA) || - key_set_.has(DispatchKey::SparseCUDA) || - key_set_.has(DispatchKey::SparseCsrCUDA) || - key_set_.has(DispatchKey::QuantizedCUDA); + constexpr auto cuda_bits_ks = DispatchKeySet(BackendComponent::CUDABit) | + DispatchKeySet(DispatchKey::SparseCsrCUDA); + return key_set_.has_any(cuda_bits_ks); } bool is_xpu() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. 
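For reference, a small sketch of the has_all / has_any distinction these predicates now rely on (helper names are illustrative; the keysets mirror the ones used above):

// has_all(): every key in the query set must be present -- used when a
// functionality bit and a backend bit must both be set.
inline bool is_dense_cpu_like(c10::DispatchKeySet ks) {
  constexpr auto dense_cpu = c10::DispatchKeySet(c10::DispatchKey::Dense) |
      c10::DispatchKeySet(c10::BackendComponent::CPUBit);
  return ks.has_all(dense_cpu);
}

// has_any(): one matching key is enough -- used for "any CPU flavor" style
// checks such as is_cpu() above.
inline bool touches_cpu(c10::DispatchKeySet ks) {
  constexpr auto cpu_ish = c10::DispatchKeySet(c10::BackendComponent::CPUBit) |
      c10::DispatchKeySet(
          {c10::DispatchKey::SparseCsrCPU, c10::DispatchKey::MkldnnCPU});
  return ks.has_any(cpu_ish);
}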
- return key_set_.has(DispatchKey::XPU) || - key_set_.has(DispatchKey::SparseXPU) || - key_set_.has(DispatchKey::QuantizedXPU); + constexpr auto xpu_ks = DispatchKeySet(BackendComponent::XPUBit); + return key_set_.has_all(xpu_ks); + } + + bool is_ipu() const { + constexpr auto ipu_ks = DispatchKeySet(BackendComponent::IPUBit); + return key_set_.has_all(ipu_ks); } bool is_xla() const { - return key_set_.has(DispatchKey::XLA); + constexpr auto xla_ks = DispatchKeySet(BackendComponent::XLABit); + return key_set_.has_all(xla_ks); } bool is_hpu() const { - return key_set_.has(DispatchKey::HPU); + constexpr auto hpu_ks = DispatchKeySet(BackendComponent::HPUBit); + return key_set_.has_all(hpu_ks); } bool is_lazy() const { - return key_set_.has(DispatchKey::Lazy); + constexpr auto lazy_ks = DispatchKeySet(BackendComponent::LazyBit); + return key_set_.has_all(lazy_ks); } bool is_hip() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - return key_set_.has(DispatchKey::HIP) || - key_set_.has(DispatchKey::SparseHIP); + constexpr auto hip_ks = DispatchKeySet(BackendComponent::HIPBit); + return key_set_.has_all(hip_ks); } bool is_ve() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - return key_set_.has(DispatchKey::VE) || key_set_.has(DispatchKey::SparseVE); + constexpr auto ve_ks = DispatchKeySet(BackendComponent::VEBit); + return key_set_.has_all(ve_ks); } bool is_mkldnn() const { - return key_set_.has(DispatchKey::MkldnnCPU); + return key_set_.has_all(c10::mkldnn_ks); } bool is_vulkan() const { - return key_set_.has(DispatchKey::Vulkan); + constexpr auto vulkan_ks = DispatchKeySet(DispatchKey::Vulkan); + return key_set_.has_all(vulkan_ks); } bool is_metal() const { - return key_set_.has(DispatchKey::Metal); + constexpr auto metal_ks = DispatchKeySet(DispatchKey::Metal); + return key_set_.has_all(metal_ks); } - bool is_mlc() const { - return key_set_.has(DispatchKey::MLC); + bool is_mps() const { + return key_set_.has(DispatchKey::MPS); } bool is_ort() const { - return key_set_.has(DispatchKey::ORT); + constexpr auto ort_ks = DispatchKeySet(DispatchKey::ORT); + return key_set_.has_all(ort_ks); + } + + bool is_nested() const { + return key_set_.has(DispatchKey::NestedTensor); } // TODO: remove this once we don't automatically enabled Autograd dispatch @@ -950,8 +869,8 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // Invariant: // Inference tensor has version_counter_.enabled() == false bool is_inference() { - bool no_ADInplaceOrView = !key_set_.has(c10::DispatchKey::ADInplaceOrView); - bool no_Autograd = (key_set_ & c10::autograd_dispatch_keyset).empty(); + bool no_ADInplaceOrView = !key_set_.has_any(c10::inplace_or_view_ks); + bool no_Autograd = !key_set_.has_any(c10::autograd_dispatch_keyset); TORCH_INTERNAL_ASSERT_DEBUG_ONLY( no_ADInplaceOrView == no_Autograd, "ADInplaceOrView and Autograd keys must be on/off at the same time."); @@ -972,14 +891,32 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { Layout layout() const { // NB: This method is not virtual and avoid dispatches for perf. - if (is_sparse()) { + // strided is also the most common layout type, so we check for + // strided case first. 
+ // This keyset must also be kept in sync with the logic in + // is_sparse() / is_sparse_csr() / is_mkldnn() + constexpr auto sparse_and_sparsecsr_and_mkldnn_ks = + c10::sparse_ks | c10::sparse_csr_ks | c10::mkldnn_ks; + if (!key_set_.has_any(sparse_and_sparsecsr_and_mkldnn_ks)) { + return kStrided; + } else if (is_sparse()) { return kSparse; - } else if (is_sparse_csr()) { - return kSparseCsr; - } else if (is_mkldnn()) { - return kMkldnn; + } else if (key_set_.has_any(c10::sparse_csr_ks)) { + // Typically, the tensor dispatch keys define the tensor layout + // uniquely. This allows using non-virtual layout method for + // better performance. However, when tensor's layout depends, + // say, on tensor attributes, one must use this execution path + // where the corresponding tensor impl class overwrites virtual + // layout_impl() method. + // + // TODO: implement layout() as native function/method so that + // __torch_dispatch__ users will be able to redefine the + // layout() method. + return layout_impl(); } else { - return kStrided; + TORCH_INTERNAL_ASSERT( + is_mkldnn(), "There is an error in the layout calculation logic."); + return kMkldnn; } } @@ -1065,7 +1002,8 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * Whether or not the imaginary part of the tensor should be negated */ inline bool is_conj() const { - return key_set_.has(DispatchKey::Conjugate); + constexpr auto conjugate_ks = DispatchKeySet(DispatchKey::Conjugate); + return key_set_.has_all(conjugate_ks); } /** @@ -1085,7 +1023,8 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * Whether or not the tensor is a zerotensor */ inline bool _is_zerotensor() const { - return key_set_.has(DispatchKey::ZeroTensor); + constexpr auto zerotensor_ks = DispatchKeySet(DispatchKey::ZeroTensor); + return key_set_.has_all(zerotensor_ks); } /** @@ -1105,7 +1044,8 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * Whether or not the tensor should be negated */ inline bool is_neg() const { - return key_set_.has(DispatchKey::Negative); + constexpr auto negative_ks = DispatchKeySet(DispatchKey::Negative); + return key_set_.has_all(negative_ks); } /** @@ -1412,16 +1352,6 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { refresh_contiguous(); } - /** - * Return the size of a tensor at some dimension. - */ - virtual int64_t size(int64_t d) const; - - /** - * Return the stride of a tensor at some dimension. - */ - virtual int64_t stride(int64_t d) const; - /** * Set whether a tensor allows changes to its metadata (e.g. sizes / strides / * storage / storage_offset). 
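To make the decision order of the reworked layout() above concrete, a few illustrative key-set examples:

// layout() decision sketch (the keysets are illustrative):
//   {Dense, CPUBit}    -> no sparse/csr/mkldnn keys -> kStrided (fast path)
//   {Sparse, CUDABit}  -> is_sparse()               -> kSparse
//   {SparseCsrCPU}     -> sparse_csr_ks             -> layout_impl(), a virtual
//                         hook so the impl can report CSR/CSC/BSR/BSC
//   {MkldnnCPU}        -> is_mkldnn()               -> kMkldnn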
See NOTE [ Metadata Change for a Detached Tensor @@ -1476,14 +1406,14 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { void set_python_dispatch(bool k) { if (k) { - key_set_ = key_set_.add(DispatchKey::Python); + key_set_ = key_set_.add(c10::python_ks); } else { - key_set_ = key_set_.remove(DispatchKey::Python); + key_set_ = key_set_ - c10::python_ks; } } bool is_python_dispatch() const { - return key_set_.has(DispatchKey::Python); + return key_set_.has_all(c10::python_ks); } /** @@ -1548,13 +1478,23 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { */ inline bool has_compatible_shallow_copy_type(DispatchKeySet from) { auto is_dense = [](DispatchKeySet ts) { - return ts.has(DispatchKey::CPU) || ts.has(DispatchKey::CUDA) || - ts.has(DispatchKey::HIP) || ts.has(DispatchKey::XPU); + constexpr auto dense_backends = DispatchKeySet( + {BackendComponent::CPUBit, + BackendComponent::CUDABit, + BackendComponent::MPSBit, + BackendComponent::HIPBit, + BackendComponent::XPUBit}); + constexpr auto dense_k = DispatchKeySet(DispatchKey::Dense); + return ts.has_any(dense_k) && ts.has_any(dense_backends); }; auto is_sparse = [](DispatchKeySet ts) { - return ts.has(DispatchKey::SparseCPU) || - ts.has(DispatchKey::SparseCUDA) || ts.has(DispatchKey::SparseHIP) || - ts.has(DispatchKey::SparseXPU); + constexpr auto sparse_backends = DispatchKeySet( + {BackendComponent::CPUBit, + BackendComponent::CUDABit, + BackendComponent::HIPBit, + BackendComponent::XPUBit}); + constexpr auto sparse_k = DispatchKeySet(DispatchKey::Sparse); + return ts.has_any(sparse_k) && ts.has_any(sparse_backends); }; return (key_set_ == from) || (is_dense(key_set_) && is_dense(from)) || (is_sparse(key_set_) && is_sparse(from)); @@ -1679,6 +1619,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // we are the ONLY thread that can have gotten to this point. It is not // possible to conflict with another zero interpreter as access is protected // by GIL + // NB: owns_pyobj tag is initially false pyobj_ = pyobj; } @@ -1688,6 +1629,11 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { return pyobj_interpreter_.load(std::memory_order_acquire); } + PyObject* _unchecked_untagged_pyobj() const { + return reinterpret_cast( + reinterpret_cast(pyobj_) & ~0x1ULL); + } + // Test the interpreter tag. If tagged for the current interpreter, return // a non-nullopt (but possibly null) PyObject. If (possibly) untagged, // returns a nullopt. If it is definitely invalid, raises an error. @@ -1707,7 +1653,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { return c10::nullopt; } else if (interpreter == self_interpreter) { // NB: pyobj_ could still be null! - return c10::make_optional(pyobj_); + return c10::make_optional(_unchecked_untagged_pyobj()); } else { TORCH_CHECK( false, @@ -1758,63 +1704,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * * This op is auto-asynchronous if the underlying device (CUDA) supports it. 
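The check_pyobj()/_unchecked_untagged_pyobj() machinery above stores the ownership flag in the low bit of pyobj_. A generic, self-contained sketch of that pointer-tagging trick (not the actual TensorImpl code):

#include <cstdint>

// One-bit pointer tag: bit 0 records ownership, the remaining bits hold the
// pointer (valid because the pointee is at least 2-byte aligned).
struct TaggedPtr {
  void set(void* p, bool owns) {
    bits_ = reinterpret_cast<std::uintptr_t>(p) |
        static_cast<std::uintptr_t>(owns);
  }
  void* untagged() const {
    return reinterpret_cast<void*>(bits_ & ~std::uintptr_t(1));
  }
  bool owns() const {
    return (bits_ & 1) != 0;
  }

 private:
  std::uintptr_t bits_ = 0;
};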
*/ - void Extend(int64_t num, float growthPct) { - TORCH_CHECK(sizes_and_strides_.size() >= 1u); - TORCH_CHECK(num >= 0, "`num` must be non-negative for Extend"); - TORCH_CHECK( - is_contiguous_, - "Right now Extend is only supported for contiguous Tensor."); - using SizesVector = SmallVector; - SizesVector newDims( - sizes_and_strides_.sizes_begin(), sizes_and_strides_.sizes_end()); - newDims[0] += num; - if (!storage_.data()) { - Resize(newDims); - return; - } - const auto newNumel = - c10::multiply_integers(newDims.begin(), newDims.end()); - if (newNumel * data_type_.itemsize() <= storage_.nbytes()) { - sizes_and_strides_.set_sizes(newDims); - numel_ = newNumel; - return; - } - SizesVector newCapacity( - sizes_and_strides_.sizes_begin(), sizes_and_strides_.sizes_end()); - newCapacity[0] = std::max( - newDims[0], - static_cast(std::ceil( - sizes_and_strides_.size_at_unchecked(0) * (1 + growthPct / 100)))); - auto oldData = std::move(storage_.data_ptr()); - auto oldSize = numel_; - Resize(newCapacity); - auto* newData = raw_mutable_data(data_type_); - if (data_type_.copy()) { - TORCH_CHECK( - device_type() == DeviceType::CPU, "non-POD types work only on CPU"); - data_type_.copy()(oldData.get(), newData, oldSize); - } else { - // The following copy uses the current (thread local) stream for copying - // and also takes the GPU id from the device() field passed in. - // - // TODO: Potentially more enforcements are necessary to avoid accidental - // switch to sync copy if the currently set device is wrong. - // - // Specifically, we might need to switch to a different context device - // here explicitly to avoid relying on user synchronizing things - // properly. - CopyBytes( - oldSize * itemsize(), - oldData.get(), - device(), - newData, - device(), - true); // non-blocking - } - reserved_ = true; - sizes_and_strides_.set_sizes(newDims); - numel_ = newNumel; - } + void Extend(int64_t num, float growthPct); /** * @brief Reserve space for the underlying tensor. @@ -1822,33 +1712,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * This must be called after Resize(), since we only specify the first * dimension This does not copy over the old data to the newly allocated space */ - template - void ReserveSpace(const T& outer_dim) { - TORCH_CHECK( - is_contiguous_, - "Right now ReserveSpace is only supported for contiguous Tensor."); - TORCH_CHECK( - storage_.unique(), "Can't call ReserveSpace on shared storage."); - // TODO: eliminate newCapacity. - SmallVector newCapacity( - sizes_and_strides_.sizes_begin(), sizes_and_strides_.sizes_end()); - newCapacity[0] = outer_dim; - auto newNumel = c10::multiply_integers(newCapacity); - if (newNumel * data_type_.itemsize() <= storage_.nbytes()) { - return; - } - // Old data is discarded - storage_.data_ptr().clear(); - auto oldSize = numel_; - SmallVector oldDims( - sizes_and_strides_.sizes_begin(), sizes_and_strides_.sizes_end()); - Resize(newCapacity); - // Allocate new memory but don't copy over the data - raw_mutable_data(data_type_); - sizes_and_strides_.set_sizes(oldDims); - numel_ = oldSize; - reserved_ = true; - } + void ReserveSpace(int64_t outer_dim); /** * @brief Resizes a tensor. @@ -1883,38 +1747,14 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * Resizes the tensor without touching underlying storage. * This requires the total size of the tensor to remains constant. 
*/ - inline void Reshape(const std::vector& dims) { - TORCH_CHECK( - is_contiguous_, - "Right now Reshape is only supported for contiguous Tensor."); - int64_t new_size = 1; - for (auto d : dims) { - TORCH_CHECK(d >= 0); - new_size *= d; - } - TORCH_CHECK( - new_size == numel_, - "New size and old size are not equal. You cannot use Reshape, " - "but should use Resize." - // TODO(jiayq): remove the following warning after pending diffs - // stabilize. - " The old caffe2 mixes Reshape and Resize but this behavior has " - "been changed. If you find this error, most likely you will need " - "to change corresponding code from Reshape to Resize."); - sizes_and_strides_.set_sizes(dims); - empty_tensor_restride(MemoryFormat::Contiguous); - } + void Reshape(const std::vector& dims); /** * Release whatever memory the tensor was holding but keep size and type * information. Subsequent call to mutable_data will trigger new memory * allocation. */ - inline void FreeMemory() { - // We'll detach from the old Storage and create a new one - storage_ = Storage::create_legacy(storage_.device()); - storage_offset_ = 0; - } + void FreeMemory(); /** * @brief Shares the data with another tensor. @@ -1929,67 +1769,12 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * The source tensor should already have its data allocated. */ // To be deprecated - void ShareData(const TensorImpl& src) { - // Right now, we are assuming the device_type are the same, since it is - // inherently the same in the non-templatized code. We should probably add - // an assert here which might affect perf a little bit. - TORCH_CHECK( - src.numel_ == numel_, - "Size mismatch - did you call reshape before sharing the data?"); - // It is possible that the source tensor hasn't called mutable_data() yet, - // in which case ShareData() doesn't make much sense since we don't really - // know what to share yet. - // TODO: Add the assert after all uninitialized states are eliminated - // TORCH_CHECK(src.dtype_initialized(), - // "Source tensor don't have a data type (did you call - // mutable_data on the tensor?)"); - if (!src.dtype_initialized()) { - C10_LOG_EVERY_MS(WARNING, 1000) - << "Source tensor don't have a data type (did you call mutable_data on the tensor?)"; - } - TORCH_CHECK( - src.storage_initialized(), - "Source tensor has no content and has size > 0"); - // Finally, do sharing. 
- /* Since we create new Storage whenever we need to change data_type/nbytes - * this still keeps the original semantics - */ - storage_ = src.storage(); - data_type_ = src.dtype(); - device_opt_ = src.device_opt(); - storage_offset_ = src.storage_offset(); - } + void ShareData(const TensorImpl& src); void ShareExternalPointer( DataPtr&& data_ptr, const caffe2::TypeMeta data_type, - size_t size_bytes) { - TORCH_CHECK( - data_type != ScalarType::Undefined, - "To share with a raw external pointer you need to pass in an " - "initialized data_type(TypeMeta)."); - if (!size_bytes) { - size_bytes = numel_ * data_type.itemsize(); - } - if (storage_.unique()) { - storage_.UniqueStorageShareExternalPointer( - std::move(data_ptr), size_bytes); - data_type_ = data_type; - device_opt_ = storage_.device(); - storage_offset_ = 0; - } else { - // Create a new Storage - storage_ = Storage( - Storage::use_byte_size_t(), - size_bytes, - std::move(data_ptr), - /*allocator=*/nullptr, - /*resizable=*/false); - data_type_ = data_type; - device_opt_ = storage_.device(); - storage_offset_ = 0; - } - } + size_t size_bytes); /** * Returns a mutable raw pointer of the underlying storage. Since we will need @@ -2158,6 +1943,8 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // Cleaning warning messages, no need to break as TORCH_CHECK(false) // terminates flow. // break; + case MemoryFormat::NumOptions: + TORCH_INTERNAL_ASSERT(false, "invalid memory format ", memory_format); } // recompute contiguous flag, as currently NHWC/NCHW flags are not mutually // exclusive see #24090 @@ -2244,11 +2031,12 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * Compute the number of elements based on the sizes of a tensor. */ int64_t compute_numel() const { - int64_t n = 1; - for (auto s : sizes()) { - n *= s; - } - return n; +#if C10_HAS_BUILTIN_OVERFLOW() && !defined(C10_MOBILE) + // Use overflow checks if supported by the compiler + return safe_compute_numel(); +#else + return c10::multiply_integers(sizes()); +#endif } /** @@ -2257,14 +2045,15 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * using a sparse layout has multiple dimensions with large sizes. */ int64_t safe_compute_numel() const { - int64_t n = 1; - for (auto s : sizes()) { - TORCH_CHECK( - s == 0 || n <= std::numeric_limits::max() / s, - "numel: integer multiplication overflow"); - n *= s; - } - return n; + uint64_t n = 1; + bool overflows = c10::safe_multiplies_u64(sizes(), &n); + constexpr auto numel_max = std::min( + static_cast(std::numeric_limits::max()), + static_cast(std::numeric_limits::max())); + + overflows |= (n > numel_max); + TORCH_CHECK(!overflows, "numel: integer multiplication overflow"); + return static_cast(n); } /** @@ -2392,36 +2181,43 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // See NOTE [ Metadata Change for a Detached Tensor ] for details. static const char* const err_msg_tensor_metadata_change_not_allowed; + static void copy_generic_tensor_metadata( + const TensorImpl* src_impl, + TensorImpl* dest_impl); + public: void set_storage_access_should_throw() { storage_access_should_throw_ = true; } bool owns_pyobj() { - return owns_pyobj_; + return reinterpret_cast(pyobj_) & 1; } void set_owns_pyobj(bool b) { - owns_pyobj_ = b; + pyobj_ = reinterpret_cast( + reinterpret_cast(_unchecked_untagged_pyobj()) | b); } - protected: - // Policy for adjusting the behavior of is_contiguous(). 
Allows - // subclass customization while still being able to inline - // is_contiguous() in the common case. - enum class HasContiguityPolicy : uint8_t { - // Default behavior: check is_contiguous_ and similar bitflags. - Default, - // Throw a generic error message that this tensor type does not - // support is_contiguous. - ContiguityNotSupported, - // Call virtual is_contiguous_custom method to implement custom - // is_contiguous behavior. - CustomBehavior, + public: + enum class SizesStridesPolicy : uint8_t { + // Default behavior, e.g., dense tensor. + // + // Can override: nothing + Default = 0, + // Customizable strides behavior, e.g., sparse tensor, + // mkldnn tensor. + // + // Can override: strides(), is_contiguous() + CustomStrides = 1, + // Customizable sizes behavior, e.g., nested tensor + // + // Can override: strides(), is_contiguous(), sizes(), dim(), numel() + CustomSizes = 2, }; - void set_has_contiguity_policy(HasContiguityPolicy p) { - has_contiguity_ = static_cast(p); + void set_sizes_strides_policy(SizesStridesPolicy policy) { + sizes_strides_policy_ = static_cast(policy); } Storage storage_; @@ -2481,17 +2277,24 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // care) std::atomic pyobj_interpreter_; - // This field contains a weak reference to a PyObject representing - // this Tensor. It MUST NOT be a strong reference, as that would - // create a reference cycle between Tensor and the PyObject. If - // pyobj is nullptr, when we transfer Tensor to Python, we allocate - // a new PyObject for it and set this field. This field does not - // have to be protected by an atomic as it is only allowed to be - // accessed when you hold the GIL. + // This field contains a reference to a PyObject representing this Tensor. + // If pyobj is nullptr, when we transfer Tensor to Python, we allocate a new + // PyObject for it and set this field. This field does not have to be + // protected by an atomic as it is only allowed to be accessed when you hold + // the GIL, or during destruction of the tensor. // // When a PyObject dies, you are obligated to clear this field // (otherwise, you will try to use-after-free the pyobj); this currently // occurs in THPVariable_clear in torch/csrc/autograd/python_variable.cpp + // + // NB: Ordinarily, this should not be a strong reference, as if the + // PyObject owns the Tensor, this would create a reference cycle. + // However, sometimes this ownership flips. To track who owns + // who, this has a single pointer tag indicating whether or not the + // C++ object owns the PyObject (the common case, zero, means PyObject + // owns the C++ object); see _unchecked_untagged_pyobj for raw access + // or check_pyobj for checked access. See references to PyObject + // resurrection in torch/csrc/autograd/python_variable.cpp PyObject* pyobj_; c10::impl::SizesAndStrides sizes_and_strides_; @@ -2523,9 +2326,6 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // Tensor is contiguous bool is_contiguous_ : 1; - // gcc doesn't like enum class bitfields; see - // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61414 - /* HasContiguityPolicy */ uint8_t has_contiguity_ : 2; // Tensor is a subclass that does not permit storage access. 
bool storage_access_should_throw_ : 1; @@ -2534,7 +2334,6 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // or -std=gnu++2a inline void init_bitfields() { is_contiguous_ = true; - has_contiguity_ = static_cast(HasContiguityPolicy::Default); is_channels_last_ = false; is_channels_last_contiguous_ = false; @@ -2544,7 +2343,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { is_wrapped_number_ = false; allow_tensor_metadata_change_ = true; reserved_ = false; - owns_pyobj_ = false; + sizes_strides_policy_ = static_cast(SizesStridesPolicy::Default); storage_access_should_throw_ = false; } @@ -2598,12 +2397,9 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // then subsequent Resize()s will not free up Storage. bool reserved_ : 1; - // If pyobj_ is nullptr, this is always false. - // Otherwise, this indicates whether or not TensorImpl owns the pyobj_ - // or vice versa. Ordinarily, pyobj_ owns TensorImpl, but if the - // Python object's refcount goes to zero, we flip the ownership - // direction (to make sure the pyobj stays live). - bool owns_pyobj_ : 1; + // Call _custom() virtual methods for + // strides()/is_contiguous()/sizes()/dim()/numel() + uint8_t sizes_strides_policy_ : 2; // The set of DispatchKeys which describe this tensor. NB: this // does NOT include Autograd (historically, it did, but diff --git a/c10/core/TensorOptions.h b/c10/core/TensorOptions.h index f7619db0d60f..e906720ba61f 100644 --- a/c10/core/TensorOptions.h +++ b/c10/core/TensorOptions.h @@ -643,6 +643,9 @@ inline DispatchKey computeDispatchKey( } return DispatchKey::CUDA; } + case DeviceType::IPU: { + return DispatchKey::IPU; + } case DeviceType::XPU: { if (isQIntType(dtype_)) { return DispatchKey::QuantizedXPU; @@ -670,8 +673,8 @@ inline DispatchKey computeDispatchKey( return DispatchKey::XLA; case DeviceType::Lazy: return DispatchKey::Lazy; - case DeviceType::MLC: - return DispatchKey::MLC; + case DeviceType::MPS: + return DispatchKey::MPS; case DeviceType::Vulkan: return DispatchKey::Vulkan; case DeviceType::Metal: @@ -680,6 +683,9 @@ inline DispatchKey computeDispatchKey( return DispatchKey::Meta; case DeviceType::HPU: return DispatchKey::HPU; + case DeviceType::PrivateUse1: { + return DispatchKey::PrivateUse1; + } default: TORCH_CHECK_NOT_IMPLEMENTED( false, @@ -716,6 +722,9 @@ inline DispatchKey computeDispatchKey( device_.type()); } case Layout::SparseCsr: + case Layout::SparseCsc: + case Layout::SparseBsr: + case Layout::SparseBsc: switch (device_.type()) { case DeviceType::CPU: return DispatchKey::SparseCsrCPU; @@ -723,7 +732,9 @@ inline DispatchKey computeDispatchKey( return DispatchKey::SparseCsrCUDA; default: AT_ERROR( - "Unsupported device type for sparse CSR layout: ", + "Unsupported device type for ", + layout_, + " layout: ", device_.type()); } default: @@ -738,9 +749,14 @@ inline Layout dispatchKeyToLayout(DispatchKey dispatch_key) { case DispatchKey::SparseHIP: case DispatchKey::SparseVE: case DispatchKey::SparseXPU: + return Layout::Sparse; case DispatchKey::SparseCsrCPU: case DispatchKey::SparseCsrCUDA: - return Layout::Sparse; + TORCH_CHECK( + false, + "Cannot map DispatchKey ", + dispatch_key, + " to a unique layout."); case DispatchKey::MkldnnCPU: return Layout::Mkldnn; default: @@ -780,19 +796,24 @@ inline DeviceType dispatchKeyToDeviceType(DispatchKey dispatch_key) { return DeviceType::Meta; // stuff that people are actively developing + case DispatchKey::IPU: + case DispatchKey::AutogradIPU: + return DeviceType::IPU; case DispatchKey::XPU: 
case DispatchKey::SparseXPU: case DispatchKey::QuantizedXPU: case DispatchKey::AutogradXPU: return DeviceType::XPU; - case DispatchKey::MLC: - case DispatchKey::AutogradMLC: - return DeviceType::MLC; + case DispatchKey::MPS: + case DispatchKey::AutogradMPS: + return DeviceType::MPS; case DispatchKey::HPU: case DispatchKey::AutogradHPU: return DeviceType::HPU; case DispatchKey::ORT: return DeviceType::ORT; + case DispatchKey::PrivateUse1: + return DeviceType::PrivateUse1; default: TORCH_CHECK( false, diff --git a/c10/core/UndefinedTensorImpl.cpp b/c10/core/UndefinedTensorImpl.cpp index 0b8c9c1348b6..1c24c17b53d3 100644 --- a/c10/core/UndefinedTensorImpl.cpp +++ b/c10/core/UndefinedTensorImpl.cpp @@ -7,14 +7,13 @@ namespace c10 { UndefinedTensorImpl::UndefinedTensorImpl() : TensorImpl(DispatchKey::Undefined, caffe2::TypeMeta(), c10::nullopt) { set_storage_access_should_throw(); + // TODO: accessing the sizes on an undefined tensor is not meaningful + // and should error too, but empirically it does not! + set_sizes_strides_policy(SizesStridesPolicy::CustomStrides); } -int64_t UndefinedTensorImpl::size(int64_t d) const { - TORCH_CHECK(false, "size(dim) called on an undefined Tensor"); -} - -int64_t UndefinedTensorImpl::stride(int64_t d) const { - TORCH_CHECK(false, "stride(dim) called on an undefined Tensor"); +bool UndefinedTensorImpl::is_contiguous_custom(MemoryFormat format) const { + return is_contiguous_default(format); } #ifdef DEBUG @@ -29,10 +28,6 @@ void UndefinedTensorImpl::set_storage_offset(int64_t) { TORCH_CHECK(false, "set_storage_offset() called on an undefined Tensor"); } -IntArrayRef UndefinedTensorImpl::strides() const { - TORCH_CHECK(false, "strides() called on undefined Tensor"); -} - const char* UndefinedTensorImpl::tensorimpl_type_name() const { return "UndefinedTensorImpl"; } diff --git a/c10/core/UndefinedTensorImpl.h b/c10/core/UndefinedTensorImpl.h index fc6501850499..ddf688a569c6 100644 --- a/c10/core/UndefinedTensorImpl.h +++ b/c10/core/UndefinedTensorImpl.h @@ -18,14 +18,14 @@ struct C10_API UndefinedTensorImpl final : public TensorImpl { #endif return &_singleton; } - IntArrayRef strides() const override; - int64_t size(int64_t d) const override; - int64_t stride(int64_t d) const override; #ifdef DEBUG bool has_storage() const override; #endif void set_storage_offset(int64_t offset) override; + protected: + bool is_contiguous_custom(MemoryFormat format) const override; + private: UndefinedTensorImpl(); static UndefinedTensorImpl _singleton; diff --git a/c10/core/WrapDimMinimal.cpp b/c10/core/WrapDimMinimal.cpp new file mode 100644 index 000000000000..2dc359fc5d4f --- /dev/null +++ b/c10/core/WrapDimMinimal.cpp @@ -0,0 +1,36 @@ +#include + +namespace c10 { +namespace detail { + +int64_t maybe_wrap_dim_slow( + int64_t dim, + int64_t dim_post_expr, + bool wrap_scalar) { + if (dim_post_expr <= 0) { + TORCH_CHECK_INDEX( + wrap_scalar, + "dimension specified as ", + dim, + " but tensor has no dimensions"); + return c10::maybe_wrap_dim(dim, /*dim_post_expr=*/1, /*wrap_scalar=*/false); + } + + int64_t min = -dim_post_expr; + int64_t max = dim_post_expr - 1; + TORCH_CHECK_INDEX( + min <= dim && dim <= max, + "Dimension out of range (expected to be in range of [", + min, + ", ", + max, + "], but got ", + dim, + ")"); + + TORCH_INTERNAL_ASSERT( + false, "should never reach here as dim should be out-of-bounds"); +} + +} // namespace detail +} // namespace c10 diff --git a/c10/core/WrapDimMinimal.h b/c10/core/WrapDimMinimal.h index 01cb1c641a14..4a6f37514749 100644 --- 
a/c10/core/WrapDimMinimal.h +++ b/c10/core/WrapDimMinimal.h @@ -4,37 +4,22 @@ namespace c10 { +namespace detail { +C10_API int64_t +maybe_wrap_dim_slow(int64_t dim, int64_t dim_post_expr, bool wrap_scalar); +} + static inline int64_t maybe_wrap_dim( int64_t dim, int64_t dim_post_expr, bool wrap_scalar = true) { - if (dim_post_expr <= 0) { - if (!wrap_scalar) { - TORCH_CHECK_INDEX( - false, - "dimension specified as ", - dim, - " but tensor has no dimensions"); - } - dim_post_expr = 1; // this will make range [-1, 0] - } - - int64_t min = -dim_post_expr; - int64_t max = dim_post_expr - 1; - if (dim < min || dim > max) { - TORCH_CHECK_INDEX( - false, - "Dimension out of range (expected to be in range of [", - min, - ", ", - max, - "], but got ", - dim, - ")"); + // Inline the fast paths + if (C10_LIKELY(-dim_post_expr <= dim && dim < dim_post_expr)) { + // Branch-less version of dim + (dim < 0 ? dim_post_expr : 0) + return dim + dim_post_expr * (dim < 0); } - if (dim < 0) - dim += dim_post_expr; - return dim; + // Check edge-cases out-of-line (wrapping scalars and out-of-bounds errors) + return c10::detail::maybe_wrap_dim_slow(dim, dim_post_expr, wrap_scalar); } } // namespace c10 diff --git a/c10/core/impl/DeviceGuardImplInterface.h b/c10/core/impl/DeviceGuardImplInterface.h index a87f25b60eed..5a409715a622 100644 --- a/c10/core/impl/DeviceGuardImplInterface.h +++ b/c10/core/impl/DeviceGuardImplInterface.h @@ -117,6 +117,7 @@ struct C10_API DeviceGuardImplInterface { */ virtual Stream getStreamFromGlobalPool(Device, bool isHighPriority = false) const { + (void)isHighPriority; // Suppress unused varaible warning TORCH_CHECK(false, "Backend doesn't support acquiring a stream from pool.") } @@ -130,7 +131,7 @@ struct C10_API DeviceGuardImplInterface { /** * Destroys the given event. */ - virtual void destroyEvent(void* event, const DeviceIndex device_index) + virtual void destroyEvent(void* /*event*/, const DeviceIndex /*device_index*/) const noexcept {} /** @@ -140,10 +141,10 @@ struct C10_API DeviceGuardImplInterface { * event to continue and marks that version as recorded. * */ virtual void record( - void** event, - const Stream& stream, - const DeviceIndex device_index, - const c10::EventFlag flag) const { + void** /*event*/, + const Stream& /*stream*/, + const DeviceIndex /*device_index*/, + const c10::EventFlag /*flag*/) const { TORCH_CHECK(false, "Backend doesn't support events."); } @@ -155,7 +156,7 @@ struct C10_API DeviceGuardImplInterface { * When the stream reaches this command it will stop processing * additional commands until that version of the event is marked as recorded. */ - virtual void block(void* event, const Stream& stream) const { + virtual void block(void* /*event*/, const Stream& /*stream*/) const { TORCH_CHECK(false, "Backend doesn't support events."); } @@ -165,7 +166,7 @@ struct C10_API DeviceGuardImplInterface { * (2) the current version is marked as recorded. * Returns false otherwise. */ - virtual bool queryEvent(void* event) const { + virtual bool queryEvent(void* /*event*/) const { TORCH_CHECK(false, "Backend doesn't support events."); } @@ -180,7 +181,7 @@ struct C10_API DeviceGuardImplInterface { * Return true if all the work previously enqueued on the stream for * asynchronous execution has completed running on the device. 
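For reference, the inlined fast path in WrapDimMinimal.h above only normalizes indices that are already in range; scalar wrapping and out-of-range errors are deferred to maybe_wrap_dim_slow. A minimal standalone restatement of that arithmetic (illustrative names, not the c10 header itself):

#include <cassert>
#include <cstdint>

// Standalone sketch of the fast path: normalize a possibly-negative
// dimension index into [0, ndim). Anything else belongs to the slow path.
inline std::int64_t wrap_dim_fast(std::int64_t dim, std::int64_t ndim) {
  assert(-ndim <= dim && dim < ndim);
  // Branch-less form of: dim < 0 ? dim + ndim : dim
  return dim + ndim * (dim < 0);
}

int main() {
  // For a 4-d tensor, -1 names the last dimension and -4 the first.
  assert(wrap_dim_fast(-1, 4) == 3);
  assert(wrap_dim_fast(-4, 4) == 0);
  assert(wrap_dim_fast(2, 4) == 2);  // in-range non-negative dims pass through unchanged
}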
*/ - virtual bool queryStream(const Stream& stream) const { + virtual bool queryStream(const Stream& /*stream*/) const { TORCH_CHECK(false, "Backend doesn't support querying streams."); } @@ -188,7 +189,7 @@ struct C10_API DeviceGuardImplInterface { * Wait (by blocking the calling thread) until all the work previously * enqueued on the stream has completed running on the device. */ - virtual void synchronizeStream(const Stream& stream) const { + virtual void synchronizeStream(const Stream& /*stream*/) const { TORCH_CHECK(false, "Backend doesn't support synchronizing streams."); } @@ -225,15 +226,15 @@ struct NoOpDeviceGuardImpl final : public DeviceGuardImplInterface { void setDevice(Device) const override { // no-op } - void uncheckedSetDevice(Device d) const noexcept override { + void uncheckedSetDevice(Device) const noexcept override { // no-op } - Stream getStream(Device d) const noexcept override { + Stream getStream(Device) const noexcept override { // no-op return Stream(Stream::DEFAULT, Device(D, -1)); } // NB: These do NOT set the current device - Stream exchangeStream(Stream s) const noexcept override { + Stream exchangeStream(Stream) const noexcept override { // no-op return Stream(Stream::DEFAULT, Device(D, -1)); } @@ -243,26 +244,26 @@ struct NoOpDeviceGuardImpl final : public DeviceGuardImplInterface { // Event-related functions void record( - void** event, - const Stream& stream, - const DeviceIndex device_index, - const EventFlag flag) const override { + void** /*event*/, + const Stream& /*stream*/, + const DeviceIndex /*device_index*/, + const EventFlag /*flag*/) const override { TORCH_CHECK(false, D, " backend doesn't support events."); } - void block(void* event, const Stream& stream) const override { + void block(void* /*event*/, const Stream& /*stream*/) const override { TORCH_CHECK(false, D, " backend doesn't support events.") } - bool queryEvent(void* event) const override { + bool queryEvent(void* /*event*/) const override { TORCH_CHECK(false, D, " backend doesn't support events.") } - void destroyEvent(void* event, const DeviceIndex device_index) + void destroyEvent(void* /*event*/, const DeviceIndex /*device_index*/) const noexcept override {} // Stream-related functions - bool queryStream(const Stream& stream) const override { + bool queryStream(const Stream& /*stream*/) const override { return true; } - void synchronizeStream(const Stream& stream) const override { + void synchronizeStream(const Stream& /*stream*/) const override { // Don't wait for anything. } }; diff --git a/c10/core/impl/FakeGuardImpl.h b/c10/core/impl/FakeGuardImpl.h index 2d47db0fdb18..c86255220c1c 100644 --- a/c10/core/impl/FakeGuardImpl.h +++ b/c10/core/impl/FakeGuardImpl.h @@ -9,7 +9,7 @@ namespace impl { // FakeGuardImpl is hardcoded to have eight devices. Not for // any good reason, just to simplify code. -constexpr size_t kFakeGuardImplMaxDevices = 8; +constexpr DeviceIndex kFakeGuardImplMaxDevices = 8; /** * A fake implementation of DeviceGuardImplInterface suitable for testing. 
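The hunks above mostly swap named-but-unused parameters for commented-out names. A tiny illustrative sketch (not PyTorch code) of the two idioms the diff uses to keep parameters documented while avoiding -Wunused-parameter warnings:

// Idiom 1: comment out the name; the signature stays self-documenting and
// there is no unused variable to warn about.
void record_event(void* /*event*/, int /*device_index*/) {}

// Idiom 2: keep the name but explicitly mark it as intentionally unused.
void get_stream(bool is_high_priority) {
  (void)is_high_priority;
}

int main() {
  record_event(nullptr, 0);
  get_stream(true);
}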
@@ -21,7 +21,7 @@ struct FakeGuardImpl final : public DeviceGuardImplInterface { static constexpr DeviceType static_type = T; // Runtime device type is not used FakeGuardImpl(DeviceType) {} - FakeGuardImpl() {} + FakeGuardImpl() = default; DeviceType type() const override { return T; } diff --git a/c10/core/impl/LocalDispatchKeySet.h b/c10/core/impl/LocalDispatchKeySet.h index 050363fc7c11..70af58b95716 100644 --- a/c10/core/impl/LocalDispatchKeySet.h +++ b/c10/core/impl/LocalDispatchKeySet.h @@ -117,6 +117,20 @@ class C10_API ExcludeDispatchKeyGuard { DispatchKeySet exclude_; }; +struct C10_API ForceDispatchKeyGuard { + public: + ForceDispatchKeyGuard(c10::impl::LocalDispatchKeySet key_set) + : saved_keyset_(c10::impl::tls_local_dispatch_key_set()) { + c10::impl::_force_tls_local_dispatch_key_set(key_set); + } + ~ForceDispatchKeyGuard() { + c10::impl::_force_tls_local_dispatch_key_set(saved_keyset_); + } + + private: + c10::impl::LocalDispatchKeySet saved_keyset_; +}; + // Non-RAII API for manipulating the thread-local dispatch state. // Please prefer the RAII API. The non-RAII API may be useful when // the included/excluded state of a given DispatchKey must span diff --git a/c10/core/impl/PyInterpreter.cpp b/c10/core/impl/PyInterpreter.cpp new file mode 100644 index 000000000000..145fc56bc0d5 --- /dev/null +++ b/c10/core/impl/PyInterpreter.cpp @@ -0,0 +1,48 @@ +#include +#include + +namespace c10 { +namespace impl { + +static std::string noop_name_fn(const PyInterpreter*) { + return ""; +} + +static void noop_decref_fn(const PyInterpreter*, PyObject*, bool) { + // no-op +} + +static c10::intrusive_ptr noop_detach_fn( + const PyInterpreter*, + const TensorImpl*) { + TORCH_INTERNAL_ASSERT( + 0, + "attempted to detach (shallow_copy_and_detach) Tensor with nontrivial PyObject after corresponding interpreter died"); +} + +static void noop_dispatch_fn( + const PyInterpreter*, + const c10::OperatorHandle& op, + torch::jit::Stack* stack, + const std::shared_ptr& type) { + TORCH_INTERNAL_ASSERT( + 0, + "attempted to dispatch (__torch_dispatch__) an operator on Tensor with nontrivial PyObject after corresponding interpreter died"); +} + +static bool noop_is_contiguous_fn(const PyInterpreter*, const TensorImpl*) { + TORCH_INTERNAL_ASSERT( + 0, + "attempted to is_contiguous Tensor with nontrivial PyObject after corresponding interpreter died"); +} + +void PyInterpreter::disarm() noexcept { + name_fn_ = &noop_name_fn; + decref_fn_ = &noop_decref_fn; + detach_fn_ = &noop_detach_fn; + dispatch_fn_ = &noop_dispatch_fn; + is_contiguous_fn_ = &noop_is_contiguous_fn; +} + +} // namespace impl +} // namespace c10 diff --git a/c10/core/impl/PyInterpreter.h b/c10/core/impl/PyInterpreter.h new file mode 100644 index 000000000000..fb432e78c19e --- /dev/null +++ b/c10/core/impl/PyInterpreter.h @@ -0,0 +1,198 @@ +#pragma once + +#include +#include +#include +#include +#include + +// Forward declarations + +namespace c10 { +struct IValue; +class OperatorHandle; +struct TensorImpl; +struct SafePyObject; +} // namespace c10 + +namespace torch { +namespace jit { +using Stack = std::vector; +} +} // namespace torch + +// Actual implementation + +namespace c10 { +namespace impl { + +// Note [Python interpreter tag] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Traditionally, PyTorch is layered such that our Python library +// (libtorch_python) references our pure C++ library (libtorch) as the +// natural order of things. 
However, sometimes this natural order is +// subverted: C++ objects refer to Python objects (for example, we +// store a PyObject* pointer on TensorImpl so that converting from a +// C++ Tensor to a Python Tensor is just a memory dereference). +// +// These unusual orderings must be treated with care. To start, you need to +// virtualize the destructor so that the PyObject can be decref'ed on +// destruction (because the C++ object itself doesn't know anything about +// Python--remember, layering!). This process itself is fraught, since +// acquiring the GIL could lead to deadlocks if someone is blocking on you +// while holding the GIL. Furthermore, if the C++ objects outlive the +// interpreter (which can happen if you stash them in a static global +// variable defined in libtorch), you may attempt to decref the object when +// the Python interpreter has already been shutdown. +// +// BUT WAIT, IT GETS WORSE. With torchdeploy, there may be multiple Python +// interpreters in a single process. If a C++ object is accessible from +// multiple interpreters, we must take care not to accidentally pass a +// PyObject from one interpreter with another interpreter. +// +// To prevent these mixups, we introduce a PyInterpreter "tag" (object with +// a vtable), which specifies a specific Python interpreter. +// +// - Any given object can be associated with AT MOST one Python interpreter. +// We represent the interpreter tag as a memory address to an instance of +// a virtual class that is allocated once per interpreter (this is so that +// we can request the interpreter to perform operations for us, if +// necessary). +// +// - It can be recorded with a PyObject (PyInterpreterObject) so that +// we know what interpreter the object is associated with, and we can +// raise an error if you try to use the PyObject from the wrong +// interpreter context. +// +// - It contains a vtable that can be used to perform various Python +// operations from ordinary C++ code that ordinarily wouldn't be accessible +// from libtorch. +// +// A simple use case is when a C++ object must be associated with a PyObject. +// However, for TensorImpl, we lazily allocate a PyObject the first time the +// object passes into Python. The invariants for this situation are more +// subtle: +// +// - A given TensorImpl's interpreter tag can only go from uninitialized to +// tagged; once tagged, this is a quiescent state (once tagged to an +// interpreter, ALWAYS tagged to that interpreter) +// +// - A thread may mutate the PyObject field of a TensorImpl if and only if it +// holds the GIL for the interpreter tagged on the TensorImpl. (If the +// TensorImpl is not tagged, it must first atomically claim its tag before it +// can validly write) +// +// WARNING: This class has to be written very carefully, because it may be +// possible for a Tensor to have a reference an interpreter corresponding to +// a shared library that has ALREADY BEEN UNLOADED. This makes blindly calling +// virtual methods very dangerous, because the vtable may be garbage at that +// point (on a good day, you might get "pure virtual method called"). +// +// The idea to solve this problem is we always leak PyInterpreters (so they +// always stay live even after dlclose), and disarm the "virtual methods" by +// replacing them with function pointers that just no-op. This can't be done +// with a traditional C++ vtable, so we have to roll our own. 
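As a side note, the "hand-rolled vtable" idea described above (virtual-like dispatch through plain function pointers that can later be swapped for no-ops) can be sketched in isolation. The names below are hypothetical and much simpler than the real PyInterpreter, which carries several hooks and takes the interpreter pointer as its first argument:

#include <cstdio>
#include <string>

// Minimal sketch: a function-pointer "vtable" whose entries can be disarmed
// after the library that provided them is unloaded.
struct Hooks {
  using name_sig = std::string();
  name_sig* name_fn_;

  std::string name() const { return (*name_fn_)(); }
  void disarm() noexcept { name_fn_ = &noop_name; }

 private:
  static std::string noop_name() { return "<dead interpreter>"; }
};

static std::string real_name() { return "interpreter 0"; }

int main() {
  Hooks h{&real_name};
  std::printf("%s\n", h.name().c_str());  // "interpreter 0"
  h.disarm();                             // e.g. before dlclose of the provider
  std::printf("%s\n", h.name().c_str());  // "<dead interpreter>", never a dangling vtable call
}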
+// +// NB: The downside with representing PyInterpreter tags as full objects is that +// it takes an extra word on TensorImpl. If tags were instead just integer +// indices, on 64-bit architectures we could pack the tag and PyObject together +// into a single atomic word. On 32-bit architectures we could simply say that +// only one Python interpreter is supported (erroring if a nontrivial +// interpreter tag is attempted to be set). +// +// The difficulty with this scheme is we need to maintain an out-of-line table +// to get at the PyInterpreters so that we can do virtual method calls on them, +// and registration/deregistration to this table must be done in a thread safe +// manner. This can be easily done if the number of possible PyInterpreters is +// small enough (e.g., 8-bit integer) by simply preallocating an array of +// sufficient size to hold all possible interpreters. Surely 128 threads is +// more than enough for anyone! +// +// I didn't decide to do this technique at the moment, because the extra word +// added by the PyInterpreter tag takes us to 24 words, which means that we +// still fit inside three eight word cache lines. If you need to penny pinch +// another word consider doing this! + +struct C10_API PyInterpreter { + // Feel free to add as much random crap here as you need; each of these + // can be thought of as a "C++ to Python" hook. + using name_sig = std::string(const PyInterpreter*); + using decref_sig = void(const PyInterpreter*, PyObject*, bool); + using detach_sig = + c10::intrusive_ptr(const PyInterpreter*, const TensorImpl*); + using dispatch_sig = void( + const PyInterpreter*, + const c10::OperatorHandle&, + torch::jit::Stack* stack, + // This is a Tensor subclass type object + const std::shared_ptr& type); + using is_contiguous_sig = bool(const PyInterpreter*, const TensorImpl*); + + PyInterpreter( + name_sig* name_fn, + decref_sig* decref_fn, + detach_sig* detach, + dispatch_sig* dispatch, + is_contiguous_sig* is_contiguous) + : name_fn_(name_fn), + decref_fn_(decref_fn), + detach_fn_(detach), + dispatch_fn_(dispatch), + is_contiguous_fn_(is_contiguous) {} + + name_sig* name_fn_; + decref_sig* decref_fn_; + detach_sig* detach_fn_; + dispatch_sig* dispatch_fn_; + is_contiguous_sig* is_contiguous_fn_; + + // UBSAN suppression fixes: "call to function + // (anonymous namespace)::concrete_decref_fn(c10::impl::PyInterpreter const*, + // _object*) through pointer to incorrect function type 'void (*)(const + // c10::impl::PyInterpreter *, _object *)'" See + // https://github.com/google/sanitizers/issues/911 + + // Report the name of this interpreter + __ubsan_ignore_function__ std::string name() const { + return (*name_fn_)(this); + } + + // Run Py_DECREF on a PyObject. 
We DO NOT assume the GIL is held on call + // See NOTE [PyInterpreter::decref takes an `is_tensor` arg] + __ubsan_ignore_function__ void decref(PyObject* pyobj, bool is_tensor) const { + return (*decref_fn_)(this, pyobj, is_tensor); + } + + // Perform a detach by deferring to the __torch_dispatch__ implementation of + // detach, which will also arrange for the PyObject to get copied in this + // situation + __ubsan_ignore_function__ c10::intrusive_ptr detach( + const TensorImpl* self) const { + return (*detach_fn_)(this, self); + } + + // Invoke the Python boxed fallback dispatch to go back into Python + __ubsan_ignore_function__ void dispatch( + const c10::OperatorHandle& op, + torch::jit::Stack* stack, + const std::shared_ptr& type) const { + return (*dispatch_fn_)(this, op, stack, type); + } + + __ubsan_ignore_function__ bool is_contiguous(const TensorImpl* self) const { + return (*is_contiguous_fn_)(this, self); + } + + // Disarm this PyInterpreter, making all of its methods noops. + // Because the function pointers are raw pointers (not atomics), + // a disarm() invocation that is concurrent with active destructors + // is not thread safe and will trigger TSAN. My hope is that this + // situations doesn't ever actually happen; tensor destruction should + // quiesce when a dlclose happens, and any long lived tensors whose + // destructors would be disarmed here only begin the destruction process + // on process shutdown (long after the dlclose has occurred). + void disarm() noexcept; +}; + +} // namespace impl +} // namespace c10 diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index c1ac4bd0ed0c..a098003f9501 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -177,6 +178,8 @@ struct Block { Block* prev; // prev block if split from a larger allocation Block* next; // next block if split from a larger allocation int event_count; // number of outstanding CUDA events + int gc_count; // counter for prioritizing older / less useful blocks for + // garbage collection Block( int device, @@ -193,7 +196,8 @@ struct Block { allocated(0), prev(nullptr), next(nullptr), - event_count(0) {} + event_count(0), + gc_count(0) {} // constructor for search key Block(int device, cudaStream_t stream, size_t size) @@ -206,7 +210,8 @@ struct Block { allocated(0), prev(nullptr), next(nullptr), - event_count(0) {} + event_count(0), + gc_count(0) {} bool is_split() const { return (prev != nullptr) || (next != nullptr); @@ -310,7 +315,7 @@ cudaError_t cudaMallocMaybeCapturing(void** p, size_t size) { if (at::cuda::currentStreamCaptureStatusMayInitCtx() == at::cuda::CaptureStatus::None) { #endif - return cudaMalloc(p, size); + return C10_CUDA_ERROR_HANDLED(cudaMalloc(p, size)); #if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 } else { // It's ok to capture cudaMallocs, as long as we never cudaFree those @@ -318,7 +323,7 @@ cudaError_t cudaMallocMaybeCapturing(void** p, size_t size) { // Capturing cudaMalloc behaves nicely: it gives the graph new VA, // but is ignored (won't leakily allocate new memory) in replays. 
at::cuda::CUDAStreamCaptureModeGuard g{cudaStreamCaptureModeRelaxed}; - return cudaMalloc(p, size); + return C10_CUDA_ERROR_HANDLED(cudaMalloc(p, size)); } #endif } @@ -330,6 +335,17 @@ class CachingAllocatorConfig { static size_t max_split_size() { return instance().m_max_split_size; } + static double garbage_collection_threshold() { + return instance().m_garbage_collection_threshold; + } + + // This is used to round-up allocation size to nearest power of 2 divisions. + // More description below in function roundup_power2_next_division + // As ane example, if we want 4 divisions between 2's power, this can be done + // using env variable: PYTORCH_CUDA_ALLOC_CONF=roundup_power2_divisions:4 + static size_t roundup_power2_divisions() { + return instance().m_roundup_power2_divisions; + } private: static CachingAllocatorConfig& instance() { @@ -342,8 +358,12 @@ class CachingAllocatorConfig { } CachingAllocatorConfig() - : m_max_split_size(std::numeric_limits::max()) {} + : m_max_split_size(std::numeric_limits::max()), + m_roundup_power2_divisions(0), + m_garbage_collection_threshold(0) {} size_t m_max_split_size; + size_t m_roundup_power2_divisions; + double m_garbage_collection_threshold; void parseArgs() { const char* val = getenv("PYTORCH_CUDA_ALLOC_CONF"); @@ -373,6 +393,32 @@ class CachingAllocatorConfig { val2 = std::min( val2, (std::numeric_limits::max() / (1024 * 1024))); m_max_split_size = val2 * 1024 * 1024; + } else if (kv[0].compare("roundup_power2_divisions") == 0) { + size_t val2 = stoi(kv[1]); + TORCH_CHECK( + llvm::isPowerOf2_64(val2), + "For roundups, the divisons has to be power of 2 ", + ""); + m_roundup_power2_divisions = val2; + } else if (kv[0].compare("garbage_collection_threshold") == 0) { + /* + * Perform garbage collection of GPU memory blocks to avoid + * triggering expensive sync-and-reclaim-all operation. Upon setting + * the threshold (e.g., 0.8), the allocator will start reclaiming + * blocks if GPU memory capacity usage exceeds the threshold (i.e., + * 80% of total memory). + * Values 0.0 and 1.0 are not allowed as they are less meaningful. + */ + double val2 = stod(kv[1]); + TORCH_CHECK( + val2 > 0, + "garbage_collect_threshold too small, set it 0.0~1.0", + ""); + TORCH_CHECK( + val2 < 1.0, + "garbage_collect_threshold too big, set it 0.0~1.0", + ""); + m_garbage_collection_threshold = val2; } else { TORCH_CHECK(false, "Unrecognized CachingAllocator option: ", kv[0]); } @@ -469,18 +515,31 @@ class DeviceCachingAllocator { params.stat_types[static_cast(StatType::AGGREGATE)] = true; params.stat_types[static_cast(get_stat_type_for_pool(pool))] = true; + // First, try to get a block from the existing pool. bool block_found = // Search pool get_free_block(params) // Trigger callbacks and retry search - || (trigger_free_memory_callbacks(params) && get_free_block(params)) - // Attempt allocate - || alloc_block(params, false) - // Free enough available cached blocks to satisfy alloc and retry alloc. - || - (release_available_cached_blocks(params) && alloc_block(params, false)) - // Free all non-split cached blocks and retry alloc. - || (release_cached_blocks() && alloc_block(params, true)); + || (trigger_free_memory_callbacks(params) && get_free_block(params)); + + // Can't reuse an existing block; try to get a new one. + if (!block_found) { + // Do garbage collection if the flag is set. 
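The two new options parsed above are driven entirely by the PYTORCH_CUDA_ALLOC_CONF environment variable (key:value pairs, comma-separated), and the garbage-collection path is additionally gated on set_fraction, i.e. a memory fraction having been configured. A sketch of opting in; the values are illustrative, setenv() is POSIX, and the variable has to be set before the allocator first reads its config:

#include <cstdlib>

int main() {
  setenv("PYTORCH_CUDA_ALLOC_CONF",
         "roundup_power2_divisions:4,garbage_collection_threshold:0.8",
         /*overwrite=*/1);
  // ... allocate CUDA tensors afterwards: requests are rounded to power-of-2
  // divisions, and cached blocks start being reclaimed once usage exceeds
  // 80% of the allowed maximum (when a memory fraction is set).
  return 0;
}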
+ if (C10_UNLIKELY( + set_fraction && + CachingAllocatorConfig::garbage_collection_threshold() > 0.0)) { + garbage_collect_cached_blocks(); + } + // Attempt allocate + block_found = alloc_block(params, false) + // Free enough available cached blocks to satisfy alloc and retry + // alloc. + || (release_available_cached_blocks(params) && + alloc_block(params, false)) + // Free all non-split cached blocks and retry alloc. + || (C10_LIKELY(captures_underway == 0) && release_cached_blocks() && + alloc_block(params, true)); + } if (!block_found) { // For any error code other than cudaErrorMemoryAllocation, @@ -699,9 +758,9 @@ class DeviceCachingAllocator { if (*largest == 0) { // make an initial guess if a zero *largest is passed in size_t tmp_bytes; - cudaMemGetInfo( + C10_CUDA_CHECK(cudaMemGetInfo( largest, // Use free memory as an optimistic initial guess of *largest - &tmp_bytes); + &tmp_bytes)); } cache_info_aux(large_blocks, total, largest); cache_info_aux(small_blocks, total, largest); @@ -808,11 +867,43 @@ class DeviceCachingAllocator { return result; } + // This function takes the size and number of divisions argument and rounds + // up the size argument for the nearest power-of-2 division. + // For example, if we need to round-up 1200 and number of divisions is 4, + // the size 1200 lies between 1024 and 2048 and if we do 4 divisions between + // them, the values are 1024, 1280, 1536, and 1792. So the function will + // return 1280 as the nearest ceiling of power-2 divison. + static size_t roundup_power2_next_division(size_t size, size_t divisions) { + if (C10_UNLIKELY(size <= 4 || divisions <= 1)) { + return size; + } + if (llvm::isPowerOf2_64(size)) { + return size; + } + + // divide the space between these 2's power into equal divisions + // If division is zero, return the power-of-2 ceiling. + size_t power2_floor = llvm::PowerOf2Floor(size); + size_t power2_divison = + power2_floor >> (63 - llvm::countLeadingZeros(divisions)); + if (C10_UNLIKELY(power2_divison == 0)) { + return (power2_floor << 1); + } + size_t round_size_floor = size & (~(power2_divison - 1)); + return (round_size_floor == size) ? size + : round_size_floor + power2_divison; + } + static size_t round_size(size_t size) { if (size < kMinBlockSize) { return kMinBlockSize; } else { - return kMinBlockSize * ((size + kMinBlockSize - 1) / kMinBlockSize); + auto divisions = CachingAllocatorConfig::roundup_power2_divisions(); + if (divisions > 0 && size > (kMinBlockSize * divisions)) { + return roundup_power2_next_division(size, divisions); + } else { + return kMinBlockSize * ((size + kMinBlockSize - 1) / kMinBlockSize); + } } } @@ -1037,6 +1128,15 @@ class DeviceCachingAllocator { bool get_free_block(AllocParams& p) { BlockPool& pool = *p.pool; + + if (C10_UNLIKELY( + set_fraction && + CachingAllocatorConfig::garbage_collection_threshold() > 0.0)) { + // Track block reuse interval only when garbage collection is enabled. + for (auto& b : pool.blocks) { + ++b->gc_count; + } + } auto it = pool.blocks.lower_bound(&p.search_key); if (it == pool.blocks.end() || (*it)->stream != p.stream()) return false; @@ -1049,6 +1149,7 @@ class DeviceCachingAllocator { ((*it)->size >= p.size() + kLargeBuffer)) return false; p.block = *it; + (*it)->gc_count = 0; // Denote this block has been used pool.blocks.erase(it); return true; } @@ -1062,6 +1163,62 @@ class DeviceCachingAllocator { return freed_memory; } + void garbage_collect_cached_blocks() { + // Free unused cached blocks to reclaim GPU memory. 
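The worked example in the comment above (1200 with 4 divisions rounds up to 1280) can be checked with a standalone restatement of the rounding scheme. This is a sketch, not the allocator's helper, and it assumes `divisions` is a power of two, which the config parser enforces:

#include <cassert>
#include <cstddef>

static std::size_t roundup_pow2_division(std::size_t size, std::size_t divisions) {
  if (size <= 4 || divisions <= 1) {
    return size;
  }
  std::size_t floor_pow2 = 1;
  while (floor_pow2 * 2 <= size) {  // largest power of two <= size
    floor_pow2 *= 2;
  }
  if (floor_pow2 == size) {
    return size;                    // exact powers of two are left alone
  }
  std::size_t step = floor_pow2 / divisions;  // width of one division
  if (step == 0) {
    return floor_pow2 * 2;          // too many divisions: fall back to the next power of two
  }
  std::size_t down = size & ~(step - 1);      // round down to a division boundary
  return down == size ? size : down + step;
}

int main() {
  // 1200 lies between 1024 and 2048; with 4 divisions the boundaries are
  // 1024, 1280, 1536 and 1792, so 1200 rounds up to 1280.
  assert(roundup_pow2_division(1200, 4) == 1280);
  assert(roundup_pow2_division(1024, 4) == 1024);
}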
+ // Unlike release_cached_blocks(), this does not enforce synchronization and + // therefore should be of less overheads. + + size_t gc_threshold = static_cast( + CachingAllocatorConfig::garbage_collection_threshold() * + allowed_memory_maximum); + // No need to trigger GC yet + if (total_allocated_memory <= gc_threshold) { + return; + } + const auto target_size = total_allocated_memory - gc_threshold; + size_t gc_reclaimed = 0; + + // Calculate the total age of the free-able blocks. We'll use it later to + // get "avg age" threshold. + double total_age = 0.0; + int freeable_block_count = 0; + for (auto& b : large_blocks.blocks) { + if (!b->is_split()) { + total_age += b->gc_count; + ++freeable_block_count; + } + } + // No free-able blocks? + if (freeable_block_count == 0) { + return; + } + + // Repeat GC until we reach reclaim > target size. + bool block_freed = true; + while (gc_reclaimed < target_size && block_freed == true && + freeable_block_count > 0) { + // Free blocks exceeding this age threshold first. + double age_threshold = total_age / freeable_block_count; + // Stop iteration if we can no longer free a block. + block_freed = false; + + // Free blocks of > avg age. Don't stop upon reaching the target_size, + // we don't want this GC to be triggered frequently. + auto it = large_blocks.blocks.begin(); + while (it != large_blocks.blocks.end()) { + Block* block = *it; + ++it; + if (!block->is_split() && block->gc_count >= age_threshold) { + block_freed = true; + gc_reclaimed += block->size; + total_age -= block->gc_count; // Decrement the age + freeable_block_count--; // One less block that can be freed + release_block(block); + } + } + } + } + bool alloc_block(AllocParams& p, bool isRetry) { // Defensively checks for preexisting CUDA error state. C10_CUDA_CHECK(cudaGetLastError()); @@ -1304,7 +1461,7 @@ class DeviceCachingAllocator { cudaEvent_t event = e.first; Block* block = e.second; - cudaError_t err = cudaEventQuery(event); + cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaEventQuery(event)); if (err == cudaErrorNotReady) { // ignore and clear the error if not ready cudaGetLastError(); @@ -1422,9 +1579,9 @@ class THCCachingAllocator { fraction, ". Please set within (0, 1)."); int activated_device; - cudaGetDevice(&activated_device); + C10_CUDA_CHECK(cudaGetDevice(&activated_device)); if (activated_device != device) { - cudaSetDevice(device); + C10_CUDA_CHECK(cudaSetDevice(device)); } device_allocator[device]->setMemoryFraction(fraction); } diff --git a/c10/cuda/CUDACachingAllocator.h b/c10/cuda/CUDACachingAllocator.h index d3a73943f7bb..9b1a6ecf1590 100644 --- a/c10/cuda/CUDACachingAllocator.h +++ b/c10/cuda/CUDACachingAllocator.h @@ -102,6 +102,7 @@ struct DeviceStats { // cudaMalloc).. 
struct BlockInfo { int64_t size = 0; + int32_t gc_counter = 0; bool allocated = false; bool active = false; }; diff --git a/c10/cuda/CUDAException.h b/c10/cuda/CUDAException.h index 77d0d07ac95e..ca441711cbd6 100644 --- a/c10/cuda/CUDAException.h +++ b/c10/cuda/CUDAException.h @@ -63,6 +63,26 @@ class C10_CUDA_API CUDAError : public c10::Error { } \ } while (0) +// Indicates that a CUDA error is handled in a non-standard way +#define C10_CUDA_ERROR_HANDLED(EXPR) EXPR + +// Intentionally ignore a CUDA error +#define C10_CUDA_IGNORE_ERROR(EXPR) \ + do { \ + cudaError_t __err = EXPR; \ + if (__err != cudaSuccess) { \ + cudaError_t error_unused C10_UNUSED = cudaGetLastError(); \ + (void)error_unused; \ + } \ + } while (0) + +// Clear the last CUDA error +#define C10_CUDA_CLEAR_ERROR() \ + do { \ + cudaError_t error_unused C10_UNUSED = cudaGetLastError(); \ + (void)error_unused; \ + } while (0) + // This should be used directly after every kernel launch to ensure // the launch happened correctly and provide an early, close-to-source // diagnostic if it didn't. diff --git a/c10/cuda/CUDAFunctions.cpp b/c10/cuda/CUDAFunctions.cpp index 255d798d13fb..9ab61aa1f381 100644 --- a/c10/cuda/CUDAFunctions.cpp +++ b/c10/cuda/CUDAFunctions.cpp @@ -10,16 +10,13 @@ namespace { // returns -1 on failure int32_t driver_version() { int driver_version = -1; - cudaError_t err = cudaDriverGetVersion(&driver_version); - if (err != cudaSuccess) { - cudaError_t last_err C10_UNUSED = cudaGetLastError(); - } + C10_CUDA_IGNORE_ERROR(cudaDriverGetVersion(&driver_version)); return driver_version; } int device_count_impl(bool fail_if_no_driver) { int count; - auto err = cudaGetDeviceCount(&count); + auto err = C10_CUDA_ERROR_HANDLED(cudaGetDeviceCount(&count)); if (err == cudaSuccess) { return count; } diff --git a/c10/cuda/CUDAStream.h b/c10/cuda/CUDAStream.h index 7bb97e88b991..6d17136341c6 100644 --- a/c10/cuda/CUDAStream.h +++ b/c10/cuda/CUDAStream.h @@ -111,7 +111,7 @@ class C10_CUDA_API CUDAStream { bool query() const { DeviceGuard guard{stream_.device()}; - cudaError_t err = cudaStreamQuery(stream()); + cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaStreamQuery(stream())); if (err == cudaSuccess) { return true; diff --git a/c10/cuda/impl/CUDAGuardImpl.h b/c10/cuda/impl/CUDAGuardImpl.h index 8f5cfdc259d3..583feeec2600 100644 --- a/c10/cuda/impl/CUDAGuardImpl.h +++ b/c10/cuda/impl/CUDAGuardImpl.h @@ -41,7 +41,7 @@ struct CUDAGuardImpl final : public c10::impl::DeviceGuardImplInterface { } c10::optional uncheckedGetDevice() const noexcept { int device; - auto err = cudaGetDevice(&device); + const auto err = C10_CUDA_ERROR_HANDLED(cudaGetDevice(&device)); C10_CUDA_CHECK_WARN(err); if (err != cudaSuccess) { return c10::nullopt; @@ -164,7 +164,7 @@ struct CUDAGuardImpl final : public c10::impl::DeviceGuardImplInterface { if (!event) return true; cudaEvent_t cuda_event = static_cast(event); - const cudaError_t err = cudaEventQuery(cuda_event); + const cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaEventQuery(cuda_event)); if (err != cudaErrorNotReady) { C10_CUDA_CHECK(err); } else { diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h index 28dc1df9430e..e839d2841f7c 100644 --- a/c10/macros/Macros.h +++ b/c10/macros/Macros.h @@ -1,5 +1,6 @@ #ifndef C10_MACROS_MACROS_H_ #define C10_MACROS_MACROS_H_ +#include /* Main entry for c10/macros. * @@ -331,15 +332,14 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256; // CUDA_KERNEL_ASSERT checks the assertion // even when NDEBUG is defined. 
This is useful for important assertions in CUDA // code that would otherwise be suppressed when building Release. -#if defined(__ANDROID__) || defined(__APPLE__) || defined(__XROS__) || \ - (defined(USE_ROCM) && ROCM_VERSION < 40100) +#if defined(__ANDROID__) || defined(__APPLE__) || defined(USE_ROCM) // Those platforms do not support assert() #define CUDA_KERNEL_ASSERT(cond) #elif defined(_MSC_VER) #if defined(NDEBUG) extern "C" { C10_IMPORT -#if defined(__CUDA_ARCH__) || defined(__HIP_ARCH__) || defined(__HIP__) +#if defined(__CUDA_ARCH__) __host__ __device__ #endif // __CUDA_ARCH__ void @@ -360,8 +360,7 @@ extern SYCL_EXTERNAL void __assert_fail( unsigned int line, const char* func); #else // __SYCL_DEVICE_ONLY__ -#if (defined(__CUDA_ARCH__) && !(defined(__clang__) && defined(__CUDA__))) || \ - defined(__HIP_ARCH__) || defined(__HIP__) +#if (defined(__CUDA_ARCH__) && !(defined(__clang__) && defined(__CUDA__))) __host__ __device__ #endif // __CUDA_ARCH__ void @@ -482,8 +481,7 @@ __host__ __device__ #endif #ifndef HAS_DEMANGLE -#if defined(__ANDROID__) || defined(_WIN32) || defined(__EMSCRIPTEN__) || \ - defined(__XROS__) +#if defined(__ANDROID__) || defined(_WIN32) || defined(__EMSCRIPTEN__) #define HAS_DEMANGLE 0 #elif defined(__APPLE__) && \ (TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE) diff --git a/c10/macros/cmake_configure_file.bzl b/c10/macros/cmake_configure_file.bzl index 53ed0656739b..16d09cc9ee30 100644 --- a/c10/macros/cmake_configure_file.bzl +++ b/c10/macros/cmake_configure_file.bzl @@ -13,7 +13,7 @@ def _cmake_configure_file_impl(ctx): ) # Replace any that remain with /* #undef FOO */. - command.append("| sed --regexp-extended 's@#cmakedefine (\\w+)@/* #undef \\1 */@'") + command.append("| sed -r 's@#cmakedefine ([A-Z0-9_]+)@/* #undef \\1 */@'") command.append("> $2") ctx.actions.run_shell( diff --git a/c10/test/build.bzl b/c10/test/build.bzl index 0c6b2a5486f7..0b3a5a5f3d84 100644 --- a/c10/test/build.bzl +++ b/c10/test/build.bzl @@ -1,24 +1,55 @@ def define_targets(rules): - rules.cc_test( + rules.test_suite( name = "tests", + tests = [ + ":core_tests", + ":typeid_test", + ":util_base_tests", + ], + visibility = ["//:__pkg__"], + ) + + rules.cc_test( + name = "core_tests", size = "small", srcs = rules.glob([ - "util/*.cpp", "core/*.cpp", "core/impl/*.cpp", ]), copts = ["-Wno-deprecated-declarations"], + deps = [ + "@com_google_googletest//:gtest_main", + "//c10/core:base", + "//c10/util:base", + ], + ) + + rules.cc_test( + name = "typeid_test", + size = "small", + srcs = ["util/typeid_test.cpp"], + copts = ["-Wno-deprecated-declarations"], + deps = [ + "@com_google_googletest//:gtest_main", + "//c10/util:typeid", + ], + ) + + rules.cc_test( + name = "util_base_tests", + srcs = rules.glob( + ["util/*.cpp"], + exclude = ["util/typeid_test.cpp"], + ), + copts = ["-Wno-deprecated-declarations"], deps = [ ":Macros", ":complex_math_test_common", ":complex_test_common", "@com_google_googletest//:gtest_main", - "//c10/core:base", "//c10/macros", "//c10/util:base", - "//c10/util:typeid", ], - visibility = ["//:__pkg__"], ) rules.cc_library( diff --git a/c10/test/core/DispatchKeySet_test.cpp b/c10/test/core/DispatchKeySet_test.cpp index 43b06c110e5b..266f45882393 100644 --- a/c10/test/core/DispatchKeySet_test.cpp +++ b/c10/test/core/DispatchKeySet_test.cpp @@ -3,25 +3,163 @@ #include #include +#include using namespace c10; +// This test exists not to be comprehensive, but to more clearly show +// what the semantics of DispatchKeySet are. 
+TEST(DispatchKeySet, ShowSemantics) { + // the "CPU" dispatch key is an instance of a per-backend-functionality key. + // It corresponds to "dense" functionality, "CPU" backend. + // This means that it gets a dense functionality bit, and a cpu backend bit + // set. + auto undefined_set = DispatchKeySet(); + auto dense_cpu_set = DispatchKeySet(DispatchKey::CPU); + ASSERT_TRUE(dense_cpu_set.has(DispatchKey::Dense)); + ASSERT_TRUE(dense_cpu_set.has_backend(BackendComponent::CPUBit)); + ASSERT_TRUE(dense_cpu_set.has(DispatchKey::CPU)); + + auto dense_lazy_set = DispatchKeySet(DispatchKey::Lazy); + ASSERT_TRUE(dense_lazy_set.has(DispatchKey::Dense)); + ASSERT_TRUE(dense_lazy_set.has_backend(BackendComponent::LazyBit)); + ASSERT_TRUE(dense_lazy_set.has(DispatchKey::Lazy)); + + // You can think of "Dense/Sparse", and "CPUBit/CUDABit", as "building block" + // dispatch keys. You are allowed to directly create keysets out of them! + auto dense_cpu_set_from_building_blocks = DispatchKeySet(DispatchKey::Dense) | + DispatchKeySet(BackendComponent::CPUBit); + ASSERT_TRUE(dense_cpu_set.has(DispatchKey::Dense)); + ASSERT_TRUE(dense_cpu_set.has_backend(BackendComponent::CPUBit)); + ASSERT_TRUE(dense_cpu_set.has(DispatchKey::CPU)); + ASSERT_EQ(dense_cpu_set, dense_cpu_set_from_building_blocks); + + // Similarly, the AutogradCUDA key gets 2 bits in the keyset: + // The "Autograd" functionality bit, and the "CUDA" backend bit + auto autograd_cuda = DispatchKeySet(DispatchKey::AutogradCUDA); + ASSERT_TRUE(autograd_cuda.has(DispatchKey::AutogradFunctionality)); + ASSERT_TRUE(autograd_cuda.has_backend(BackendComponent::CUDABit)); + + // Because DispatchKeySet uses a condensed internal representation, you cannot + // use it to represent the FULL cross product of backends and functionalities + // for example: + auto autograd_dense_cpu_cuda = DispatchKeySet( + {DispatchKey::AutogradFunctionality, + DispatchKey::Dense, + DispatchKey::CUDA, + DispatchKey::CPU}); + auto fpga = DispatchKeySet(DispatchKey::FPGA); + auto fpga_and_cpu = DispatchKeySet({DispatchKey::FPGA, DispatchKey::CPU}); + // this keyset has all of the building block keys: + ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::AutogradFunctionality)); + ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::Dense)); + ASSERT_TRUE(autograd_dense_cpu_cuda.has_backend(BackendComponent::CUDABit)); + ASSERT_TRUE(autograd_dense_cpu_cuda.has_backend(BackendComponent::CPUBit)); + + // and it also has the "runtime" keys that correspond to the full + // cross-product of functionality + ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::AutogradCPU)); + ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::AutogradCPU)); + ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::CPU)); + ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::CUDA)); + + // This means that there's no way to represent a keyset with, say, only + // Autograd CUDA + Dense CPU. Instead, you should think of a keyset as + // inheriting the full set of functionalities + backends of its keys. This + // means that the below keysets are all indistinguishable from each other. 
+ ASSERT_EQ( + autograd_dense_cpu_cuda, + DispatchKeySet( + {DispatchKey::AutogradCUDA, + DispatchKey::AutogradCPU, + DispatchKey::CUDA, + DispatchKey::CPU})); + ASSERT_EQ( + autograd_dense_cpu_cuda, + DispatchKeySet({DispatchKey::AutogradCUDA, DispatchKey::CPU})); + ASSERT_EQ( + autograd_dense_cpu_cuda, + DispatchKeySet({DispatchKey::CUDA, DispatchKey::AutogradCPU})); + + // ~~~~~~~~~~ DispatchKeySet iterators ~~~~~~~~~~~ + + // Iterators allow you to iterate individually through the DispatchKey's in a + // DispatchKeySet + auto empty_set = DispatchKeySet(); + auto t1 = empty_set.begin(); + auto t2 = empty_set.end(); + ASSERT_EQ(*empty_set.begin(), *empty_set.end()); + + // However, only keys that correspond to actual runtime indices of kernels in + // the operator table show up when you iterate through a keyset. i.e. + // DispatchKey::Dense, and BackendComponent::CPUBit won't show up in an + // iterator. + auto dense_cpu_iter = dense_cpu_set.begin(); + ASSERT_EQ(*dense_cpu_iter++, DispatchKey::CPU); + ASSERT_EQ(*dense_cpu_iter, *dense_cpu_set.end()); + + auto autograd_dense_cpu_cuda_iter = autograd_dense_cpu_cuda.begin(); + ASSERT_EQ(*autograd_dense_cpu_cuda_iter++, DispatchKey::CPU); + ASSERT_EQ(*autograd_dense_cpu_cuda_iter++, DispatchKey::CUDA); + ASSERT_EQ(*autograd_dense_cpu_cuda_iter++, DispatchKey::AutogradCPU); + ASSERT_EQ(*autograd_dense_cpu_cuda_iter++, DispatchKey::AutogradCUDA); + ASSERT_EQ(*autograd_dense_cpu_cuda_iter, *autograd_dense_cpu_cuda.end()); + + // But other "functionality bits" that are not defined per-backend DO get + // their own slots in the operator table. + auto mixed_keyset = DispatchKeySet(BackendComponent::CPUBit) | + DispatchKeySet( + {DispatchKey::FPGA, // runtime key + DispatchKey::Functionalize, // runtime key + DispatchKey::Dense}); // NOT a runtime key + auto mixed_iter = mixed_keyset.begin(); + ASSERT_EQ(*mixed_iter++, DispatchKey::CPU); + ASSERT_EQ(*mixed_iter++, DispatchKey::FPGA); + ASSERT_EQ(*mixed_iter++, DispatchKey::Functionalize); + ASSERT_EQ(*mixed_iter, *mixed_keyset.end()); +} + TEST(DispatchKeySet, Empty) { DispatchKeySet empty_set; - for (uint8_t i = 1; i < static_cast(DispatchKey::NumDispatchKeys); + for (uint8_t i = 0; + i <= static_cast(DispatchKey::EndOfRuntimeBackendKeys); i++) { auto tid = static_cast(i); + if (tid == DispatchKey::Undefined) + continue; ASSERT_FALSE(empty_set.has(tid)); } ASSERT_TRUE(empty_set.empty()); DispatchKeySet empty_set2; ASSERT_TRUE(empty_set == empty_set2); - ASSERT_EQ(empty_set.highestPriorityTypeId(), DispatchKey::Undefined); } -TEST(DispatchKeySet, Singleton) { - for (uint8_t i = 1; i < static_cast(DispatchKey::NumDispatchKeys); - i++) { +// This covers all keys that correspond to a single backend bit, e.g. +// BackendComponent::CPUBit. Even though these are NOT runtime keys, we still +// allow adding them directly to a keyset +TEST(DispatchKeySet, SingletonBackendComponent) { + for (const auto i : c10::irange(1, num_backends)) { + auto tid = static_cast(i); + DispatchKeySet sing(tid); + ASSERT_EQ(sing, sing); + ASSERT_EQ(sing, DispatchKeySet().add(tid)); + ASSERT_EQ(sing, sing.add(tid)); + ASSERT_EQ(sing, sing | sing); + ASSERT_FALSE(sing.empty()); + ASSERT_TRUE(sing.has(tid)); + } +} + +// This covers all keys that correspond to a single functionality bit: +// - runtime, not-per-backend functionality keys, e.g. +// DispatchKey::FuncTorchBatched +// - runtime, "fake backend" keys, e.g. DispatchKey::FPGA +// - NOT-runtime, per-backend functionality keys, e.g. 
DispatchKey::Dense +// Even though it's not a runtime key, we still allow adding it directly to a +// keyset. +// DispatchKey:: +TEST(DispatchKeySet, SingletonFunctionalityKeys) { + for (const auto i : c10::irange(1, num_functionality_keys)) { auto tid = static_cast(i); DispatchKeySet sing(tid); ASSERT_EQ(sing, sing); @@ -30,47 +168,147 @@ TEST(DispatchKeySet, Singleton) { ASSERT_EQ(sing, sing | sing); ASSERT_FALSE(sing.empty()); ASSERT_TRUE(sing.has(tid)); - ASSERT_EQ(sing.highestPriorityTypeId(), tid); ASSERT_EQ(sing.remove(tid), DispatchKeySet()); } } -TEST(DispatchKeySet, Doubleton) { - for (uint8_t i = 1; i < static_cast(DispatchKey::NumDispatchKeys); +// This covers runtime keys that are per-backend, +// and take up more than one bit in a DispatchKeySet. They take up one +// functionality bit + one backend bit. e.g. CPU, CUDA, SparseCPU, SparseCUDA, +// AutogradCPU, AutogradCUDA +TEST(DispatchKeySet, SingletonPerBackendFunctionalityKeys) { + for (uint8_t i = static_cast(DispatchKey::StartOfDenseBackends); + i <= static_cast(DispatchKey::EndOfRuntimeBackendKeys); + i++) { + auto tid = static_cast(i); + // Skip these because they aren't real keys. + if (tid == DispatchKey::StartOfDenseBackends || + tid == DispatchKey::StartOfSparseBackends || + tid == DispatchKey::StartOfQuantizedBackends || + tid == DispatchKey::StartOfAutogradBackends) { + continue; + } + DispatchKeySet sing(tid); + ASSERT_EQ(sing, sing); + ASSERT_EQ(sing, DispatchKeySet().add(tid)); + ASSERT_EQ(sing, sing.add(tid)); + ASSERT_EQ(sing, sing | sing); + ASSERT_FALSE(sing.empty()); + ASSERT_TRUE(sing.has(tid)); + + auto functionality_key = toFunctionalityKey(tid); + auto backend_key = toBackendComponent(tid); + // These two sets should be equivalent: + // DispatchKeySet(DispatchKey::CPU) + // DispatchKeySet({DispatchKey::Dense, BackendComponent::CPUBit}) + auto expected_ks = + DispatchKeySet(functionality_key) | DispatchKeySet(backend_key); + ASSERT_EQ(sing, expected_ks); + // These two sets should be equivalent: + // DispatchKeySet(DispatchKey::CPU).remove(DispatchKey::Dense) + // DispatchKeySet(BackendComponent::CPUBit) + expected_ks = DispatchKeySet(toBackendComponent(tid)); + ASSERT_EQ(sing.remove(tid), expected_ks); + } +} + +TEST(DispatchKeySet, DoubletonPerBackend) { + for (uint8_t i = static_cast(DispatchKey::StartOfDenseBackends); + i <= static_cast(DispatchKey::EndOfRuntimeBackendKeys); i++) { for (uint8_t j = i + 1; - j < static_cast(DispatchKey::NumDispatchKeys); + j <= static_cast(DispatchKey::EndOfRuntimeBackendKeys); j++) { ASSERT_LT(i, j); auto tid1 = static_cast(i); auto tid2 = static_cast(j); - auto doub = DispatchKeySet(tid1).add(tid2); - ASSERT_EQ(doub, DispatchKeySet(tid1) | DispatchKeySet(tid2)); - ASSERT_TRUE(doub.has(tid1)); - ASSERT_TRUE(doub.has(tid2)); - ASSERT_EQ(doub.highestPriorityTypeId(), tid2); // relies on i < j + + // Skip these because they aren't real keys. 
+ if (tid1 == DispatchKey::StartOfDenseBackends || + tid1 == DispatchKey::StartOfSparseBackends || + tid1 == DispatchKey::StartOfQuantizedBackends || + tid1 == DispatchKey::StartOfNestedTensorBackends || + tid1 == DispatchKey::StartOfAutogradBackends) + continue; + if (tid2 == DispatchKey::StartOfDenseBackends || + tid2 == DispatchKey::StartOfSparseBackends || + tid2 == DispatchKey::StartOfQuantizedBackends || + tid2 == DispatchKey::StartOfNestedTensorBackends || + tid2 == DispatchKey::StartOfAutogradBackends) + continue; + + auto backend1 = toBackendComponent(tid1); + auto backend2 = toBackendComponent(tid2); + auto functionality1 = toFunctionalityKey(tid1); + auto functionality2 = toFunctionalityKey(tid2); + + auto combined = DispatchKeySet({tid1, tid2}); + // The combined set has the backend bits + ASSERT_TRUE(combined.has_backend(backend1)); + ASSERT_TRUE(combined.has_backend(backend2)); + // and it has the backend bits + ASSERT_TRUE(combined.has(functionality1)); + ASSERT_TRUE(combined.has(functionality2)); + // and it has the original two runtime keys + ASSERT_TRUE(combined.has(tid1)); + ASSERT_TRUE(combined.has(tid2)); + + // Add all of the keys in the keyset to a real set + std::unordered_set visited_keys; + auto iter = combined.begin(); + while (*iter != *combined.end()) { + visited_keys.insert(*iter); + ++iter; + } + std::unordered_set expected_keys; + expected_keys.insert( + toRuntimePerBackendFunctionalityKey(functionality1, backend1)); + expected_keys.insert( + toRuntimePerBackendFunctionalityKey(functionality1, backend2)); + expected_keys.insert( + toRuntimePerBackendFunctionalityKey(functionality2, backend1)); + expected_keys.insert( + toRuntimePerBackendFunctionalityKey(functionality2, backend2)); + ASSERT_EQ(expected_keys, visited_keys); + + if (backend1 == backend2 || functionality1 == functionality2) { + // We have two runtime keys, with either the same backend or the same + // per-backend functionalities. E.g. {AutogradCUDA, CUDA} or + // {AutogradCPU, AutogradCUDA} There should be 2 total runtime keys in + // this set. + ASSERT_EQ(2, visited_keys.size()); + } else { + // since i and j are different keys, they should not have the same + // functionality and backend + ASSERT_TRUE(backend1 != backend2 && functionality1 != functionality2); + // We have two runtime keys, that have different backends + per-backend + // functionalities. So we should expect the full cross product of + // runtime keys to be in the set. e.g. 
if i = AutogradCUDA, and j = CPU, + // then combined = {AutogradCUDA, AutogradCPU, CUDA, CPU} + ASSERT_EQ(4, visited_keys.size()); + } } } } TEST(DispatchKeySet, Full) { DispatchKeySet full(DispatchKeySet::FULL); - for (uint8_t i = 1; i < static_cast(DispatchKey::NumDispatchKeys); - i++) { + for (const auto i : c10::irange(1, num_functionality_keys)) { auto tid = static_cast(i); ASSERT_TRUE(full.has(tid)); } + ASSERT_FALSE(full.has(DispatchKey::EndOfFunctionalityKeys)); } TEST(DispatchKeySet, IteratorBasicOps) { DispatchKeySet empty_set; DispatchKeySet full_set(DispatchKeySet::FULL); - DispatchKeySet mutated_set = empty_set.add(static_cast(1)); + DispatchKeySet mutated_set = empty_set.add(DispatchKey::CPU); // Constructor + Comparison - ASSERT_EQ(*empty_set.begin(), DispatchKey::NumDispatchKeys); - ASSERT_EQ(*empty_set.end(), DispatchKey::NumDispatchKeys); - ASSERT_EQ(*mutated_set.begin(), static_cast(1)); + ASSERT_EQ(*empty_set.begin(), DispatchKey::EndOfFunctionalityKeys); + ASSERT_EQ(*empty_set.end(), DispatchKey::EndOfFunctionalityKeys); + ASSERT_EQ(*mutated_set.begin(), DispatchKey::CPU); ASSERT_TRUE(empty_set.begin() == empty_set.end()); ASSERT_TRUE(full_set.begin() != full_set.end()); @@ -80,6 +318,25 @@ TEST(DispatchKeySet, IteratorBasicOps) { ASSERT_TRUE(full_set.begin() != ++full_set.begin()); } +TEST(DispatchKeySet, getHighestPriorityBackendTypeId) { + // AutogradCPU isn't a backend key so it is ignored + DispatchKeySet dense_cpu({DispatchKey::AutogradCPU, DispatchKey::CPU}); + ASSERT_EQ(DispatchKey::CPU, c10::highestPriorityBackendTypeId(dense_cpu)); + + // Functionalize isn't a backend key so it is ignored + DispatchKeySet sparse_cuda( + {DispatchKey::Functionalize, DispatchKey::SparseCUDA}); + ASSERT_EQ( + DispatchKey::SparseCUDA, c10::highestPriorityBackendTypeId(sparse_cuda)); + + // quantizedCUDA has higher priority than CUDA + DispatchKeySet quantized_cuda( + {DispatchKey::CUDA, DispatchKey::QuantizedCUDA}); + ASSERT_EQ( + DispatchKey::QuantizedCUDA, + c10::highestPriorityBackendTypeId(quantized_cuda)); +} + TEST(DispatchKeySet, IteratorEmpty) { DispatchKeySet empty_set; uint8_t i = 0; @@ -90,16 +347,37 @@ TEST(DispatchKeySet, IteratorEmpty) { ASSERT_EQ(i, 0); } +TEST(DispatchKeySet, IteratorCrossProduct) { + // The iterator should return all runtime keys in the set, + // including the cross product of {backends} x {functionalities} + auto ks = + DispatchKeySet({BackendComponent::CPUBit, BackendComponent::CUDABit}) | + DispatchKeySet( + {DispatchKey::Dense, + DispatchKey::FPGA, + DispatchKey::AutogradFunctionality}); + + auto iter = ks.begin(); + // iterate through dense backends first. + ASSERT_EQ(DispatchKey::CPU, *(iter++)); + ASSERT_EQ(DispatchKey::CUDA, *(iter++)); + // FPGA doesn't have a backend bit, so it isn't included in the cross product. + ASSERT_EQ(DispatchKey::FPGA, *(iter++)); + // iterate through the autograd keys laster. + ASSERT_EQ(DispatchKey::AutogradCPU, *(iter++)); + ASSERT_EQ(DispatchKey::AutogradCUDA, *(iter++)); +} + TEST(DispatchKeySet, IteratorFull) { DispatchKeySet full_set(DispatchKeySet::FULL); uint8_t i = 0; for (const auto& it : full_set) { i++; - ASSERT_TRUE(it == static_cast(i)); - ASSERT_TRUE(it != DispatchKey::NumDispatchKeys); } - ASSERT_EQ(i, static_cast(DispatchKey::NumDispatchKeys) - 1); + // Total # of runtime entries includes an entry for DispatchKey::Undefined, + // which is not included when iterating through the DispatchKeySet. 
+ ASSERT_EQ(i, num_runtime_entries - 1); } TEST(DispatchKeySet, IteratorRangeFull) { @@ -108,41 +386,61 @@ TEST(DispatchKeySet, IteratorRangeFull) { for (DispatchKey dispatch_key : full_set) { i++; - ASSERT_TRUE(dispatch_key == static_cast(i)); } - ASSERT_EQ(i, static_cast(DispatchKey::NumDispatchKeys) - 1); -} - -TEST(DispatchKeySet, SpecificKeys) { - DispatchKeySet keyset({ - static_cast(0), // Undefined should be ignored - static_cast(4), - static_cast(10), - static_cast(15), - }); - std::unordered_set visited_keys; - - for (DispatchKey key : keyset) { - visited_keys.insert(key); - } - - ASSERT_EQ(visited_keys.size(), 3); - ASSERT_TRUE( - visited_keys.find(static_cast(4)) != visited_keys.end()); - ASSERT_TRUE( - visited_keys.find(static_cast(10)) != visited_keys.end()); - ASSERT_TRUE( - visited_keys.find(static_cast(15)) != visited_keys.end()); + // Total # of runtime entries includes an entry for DispatchKey::Undefined, + // which is not included when iterating through the DispatchKeySet. + ASSERT_EQ(i, num_runtime_entries - 1); } TEST(DispatchKeySet, FailAtEndIterator) { DispatchKeySet full_set(DispatchKeySet::FULL); uint64_t raw_repr = full_set.raw_repr(); + // doesn't throw + DispatchKeySet::iterator(&raw_repr, num_backends + num_functionality_keys); // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) EXPECT_THROW( DispatchKeySet::iterator( - &raw_repr, static_cast(DispatchKey::NumDispatchKeys) + 1), + &raw_repr, num_backends + num_functionality_keys + 1), c10::Error); } + +TEST(DispatchKeySet, TestKeyOrderingInvariants) { + for (uint8_t i = static_cast(DispatchKey::StartOfDenseBackends); + i <= static_cast(DispatchKey::EndOfRuntimeBackendKeys); + i++) { + auto k = static_cast(i); + // Note [The Ordering of Per-Backend Dispatch Keys Matters!] + // The DispatchKey enum includes all of the runtime keys for + // Dense/Sparse/Quantized/Autograd, (e.g. CPU, CUDA, SparseCPU, SparseCUDA, + // AutogradCPU, AutogradCUDA, etc). And we expect the ordering of those keys + // to be the same as the ordering of the backends in the `BackendComponent` + // enum. This makes several utilities in `DispatchKey.h` and + // `DispatchKeySet.h` significantly easier to implement. The purpose of the + // test is to assert (through CI) that this invariant is maintained. + // + // The only way that we can really check this invariant is by + // comparing the string names of each enum. + // We only really care about the ordering for "real" keys that are actually + // used, which we expect to be able to print properly. This saves us from + // having to enumerate the full set of possible runtime keys in + // DispatchKey::toString(). It also relies on toString() being implemented + // correctly. + auto functionality_str = std::string(toString(k)); + if (functionality_str == "UNKNOWN_TENSOR_TYPE_ID") + continue; + + auto computed_backend_k = toBackendComponent(k); + auto computed_backend_str = std::string(toString(computed_backend_k)); + // Skip, e.g., the "Bit" from "CPUBit" + computed_backend_str = + computed_backend_str.substr(0, computed_backend_str.size() - 3); + + ASSERT_TRUE( + functionality_str.find(computed_backend_str) != std::string::npos) + << "DispatchKey invariant broken! Found a key that is not ordered correctly" + << " with its backend bit. 
key = " << toString(k) << ", " << k + << ", computed backend = " << toString(computed_backend_k); + } +} diff --git a/c10/test/util/DeadlockDetection_test.cpp b/c10/test/util/DeadlockDetection_test.cpp new file mode 100644 index 000000000000..35c4953f6d33 --- /dev/null +++ b/c10/test/util/DeadlockDetection_test.cpp @@ -0,0 +1,31 @@ +#include + +#include + +#include + +using namespace ::testing; +using namespace c10::impl; + +struct DummyPythonGILHooks : public PythonGILHooks { + bool check_python_gil() const override { + return true; + } +}; + +TEST(DeadlockDetection, basic) { + ASSERT_FALSE(check_python_gil()); + DummyPythonGILHooks hooks; + SetPythonGILHooks(&hooks); + ASSERT_TRUE(check_python_gil()); + SetPythonGILHooks(nullptr); +} + +#ifndef _WIN32 +TEST(DeadlockDetection, disable) { + setenv("TORCH_DISABLE_DEADLOCK_DETECTION", "1", 1); + DummyPythonGILHooks hooks; + SetPythonGILHooks(&hooks); + SetPythonGILHooks(&hooks); +} +#endif diff --git a/c10/test/util/Synchronized_test.cpp b/c10/test/util/Synchronized_test.cpp new file mode 100644 index 000000000000..ce781a10cadb --- /dev/null +++ b/c10/test/util/Synchronized_test.cpp @@ -0,0 +1,43 @@ +#include +#include + +#include +#include + +namespace { + +TEST(Synchronized, TestSingleThreadExecution) { + c10::Synchronized iv(0); + const int kMaxValue = 100; + for (int i = 0; i < kMaxValue; ++i) { + auto ret = iv.withLock([](int& iv) { return ++iv; }); + EXPECT_EQ(ret, i + 1); + } + + iv.withLock([kMaxValue](int& iv) { EXPECT_EQ(iv, kMaxValue); }); +} + +TEST(Synchronized, TestMultiThreadedExecution) { + c10::Synchronized iv(0); +#define NUM_LOOP_INCREMENTS 10000 + + auto thread_cb = [&iv]() { + for (int i = 0; i < NUM_LOOP_INCREMENTS; ++i) { + iv.withLock([](int& iv) { ++iv; }); + } + }; + + std::array threads; + for (auto& t : threads) { + t = std::thread(thread_cb); + } + + for (auto& t : threads) { + t.join(); + } + + iv.withLock([](int& iv) { EXPECT_EQ(iv, NUM_LOOP_INCREMENTS * 10); }); +#undef NUM_LOOP_INCREMENTS +} + +} // namespace diff --git a/c10/test/util/ordered_preserving_dict_test.cpp b/c10/test/util/ordered_preserving_dict_test.cpp index 773b2e7a2a35..aa1d7f0f986e 100644 --- a/c10/test/util/ordered_preserving_dict_test.cpp +++ b/c10/test/util/ordered_preserving_dict_test.cpp @@ -48,7 +48,7 @@ dict_int_int test_dict(dict_int_int& dict) { } dict.erase(begin, end); - std::vector order; + std::vector order; for (const auto i : c10::irange(100)) { if (!erase_set.count(i)) { order.push_back(i); @@ -211,12 +211,12 @@ TEST(OrderedPreservingDictTest, test_range_erase) { using HMap = ska_ordered::order_preserving_flat_hash_map; - const std::size_t nb_values = 1000; + const int64_t nb_values = 1000; HMap map; for (const auto i : c10::irange(nb_values)) { map[c10::guts::to_string(i)] = i; auto begin = map.begin(); - for (size_t j = 0; j <= i; ++j, begin++) { + for (int64_t j = 0; j <= i; ++j, begin++) { TORCH_INTERNAL_ASSERT(begin->second == j); } } diff --git a/c10/util/ArrayRef.h b/c10/util/ArrayRef.h index 0602404b5f05..4d45c5e6c413 100644 --- a/c10/util/ArrayRef.h +++ b/c10/util/ArrayRef.h @@ -25,7 +25,6 @@ #include namespace c10 { - /// ArrayRef - Represent a constant reference to an array (0 or more elements /// consecutively in memory), i.e. a start pointer and a length. It allows /// various APIs to take consecutive elements easily and conveniently. @@ -92,7 +91,6 @@ class ArrayRef final { debugCheckNullptrInvariant(); } - /// Construct an ArrayRef from a generic Container. 
template < typename Container, typename = std::enable_if_t typename std::enable_if< - !std::is_array::value && !std::is_array::value && + !std::is_array::value && !std::is_array::value && std::is_base_of::value, std::unique_ptr>::type make_unique_base(Args&&... args) { diff --git a/c10/util/DeadlockDetection.cpp b/c10/util/DeadlockDetection.cpp index d95a72f95553..bb95939fc53f 100644 --- a/c10/util/DeadlockDetection.cpp +++ b/c10/util/DeadlockDetection.cpp @@ -1,11 +1,17 @@ #include +#include + namespace c10 { namespace impl { namespace { PythonGILHooks* python_gil_hooks = nullptr; + +bool disable_detection() { + return std::getenv("TORCH_DISABLE_DEADLOCK_DETECTION") != nullptr; } +} // namespace bool check_python_gil() { if (!python_gil_hooks) { @@ -15,6 +21,9 @@ bool check_python_gil() { } void SetPythonGILHooks(PythonGILHooks* hooks) { + if (disable_detection()) { + return; + } TORCH_INTERNAL_ASSERT(!hooks || !python_gil_hooks); python_gil_hooks = hooks; } diff --git a/c10/util/DimVector.h b/c10/util/DimVector.h new file mode 100644 index 000000000000..fea1651a46c0 --- /dev/null +++ b/c10/util/DimVector.h @@ -0,0 +1,13 @@ +#pragma once + +#include +#include + +namespace c10 { + +constexpr size_t kDimVectorStaticSize = 5; + +/// A container for sizes or strides +using DimVector = SmallVector; + +} // namespace c10 diff --git a/c10/util/Exception.h b/c10/util/Exception.h index 0eb0c6a80bf1..327e4cbfabd1 100644 --- a/c10/util/Exception.h +++ b/c10/util/Exception.h @@ -314,13 +314,13 @@ C10_API std::string GetExceptionString(const std::exception& e); // (unlike assert()). // #ifdef STRIP_ERROR_MESSAGES -#define TORCH_INTERNAL_ASSERT(cond, ...) \ - if (C10_UNLIKELY_OR_CONST(!(cond))) { \ - ::c10::detail::torchCheckFail( \ - __func__, \ - __FILE__, \ - static_cast(__LINE__), \ - #cond "INTERNAL ASSERT FAILED at" C10_STRINGIZE(__FILE__)); \ +#define TORCH_INTERNAL_ASSERT(cond, ...) \ + if (C10_UNLIKELY_OR_CONST(!(cond))) { \ + ::c10::detail::torchCheckFail( \ + __func__, \ + __FILE__, \ + static_cast(__LINE__), \ + #cond " INTERNAL ASSERT FAILED at " C10_STRINGIZE(__FILE__)); \ } #else // It would be nice if we could build a combined string literal out of @@ -328,16 +328,16 @@ C10_API std::string GetExceptionString(const std::exception& e); // as the first argument, but there doesn't seem to be any good way to // do that while still supporting having a first argument that isn't a // string literal. -#define TORCH_INTERNAL_ASSERT(cond, ...) \ - if (C10_UNLIKELY_OR_CONST(!(cond))) { \ - ::c10::detail::torchInternalAssertFail( \ - __func__, \ - __FILE__, \ - static_cast(__LINE__), \ - #cond \ - "INTERNAL ASSERT FAILED at " C10_STRINGIZE(__FILE__) ":" C10_STRINGIZE( \ - __LINE__) ", please report a bug to PyTorch. ", \ - c10::str(__VA_ARGS__)); \ +#define TORCH_INTERNAL_ASSERT(cond, ...) \ + if (C10_UNLIKELY_OR_CONST(!(cond))) { \ + ::c10::detail::torchInternalAssertFail( \ + __func__, \ + __FILE__, \ + static_cast(__LINE__), \ + #cond \ + " INTERNAL ASSERT FAILED at " C10_STRINGIZE(__FILE__) ":" C10_STRINGIZE( \ + __LINE__) ", please report a bug to PyTorch. ", \ + c10::str(__VA_ARGS__)); \ } #endif @@ -375,7 +375,7 @@ C10_API std::string GetExceptionString(const std::exception& e); namespace c10 { namespace detail { template -decltype(auto) torchCheckMsgImpl(const char* msg, const Args&... args) { +decltype(auto) torchCheckMsgImpl(const char* /*msg*/, const Args&... 
args) { return ::c10::str(args...); } inline C10_API const char* torchCheckMsgImpl(const char* msg) { @@ -383,7 +383,7 @@ inline C10_API const char* torchCheckMsgImpl(const char* msg) { } // If there is just 1 user-provided C-string argument, use it. inline C10_API const char* torchCheckMsgImpl( - const char* msg, + const char* /*msg*/, const char* args) { return args; } @@ -433,7 +433,7 @@ namespace detail { const char* file, uint32_t line, const char* condMsg, - ::c10::detail::CompileTimeEmptyString userMsg) { + ::c10::detail::CompileTimeEmptyString /*userMsg*/) { torchCheckFail(func, file, line, condMsg); } [[noreturn]] C10_API void torchInternalAssertFail( diff --git a/c10/util/Half-inl.h b/c10/util/Half-inl.h index 3e2b5071a549..b438f4a01452 100644 --- a/c10/util/Half-inl.h +++ b/c10/util/Half-inl.h @@ -12,7 +12,7 @@ #include #endif -#ifdef __SYCL_DEVICE_ONLY__ +#ifdef SYCL_LANGUAGE_VERSION #include #endif @@ -56,6 +56,15 @@ inline C10_HOST_DEVICE Half::operator __half() const { } #endif +#ifdef SYCL_LANGUAGE_VERSION +inline C10_HOST_DEVICE Half::Half(const sycl::half& value) { + x = *reinterpret_cast(&value); +} +inline C10_HOST_DEVICE Half::operator sycl::half() const { + return *reinterpret_cast(&x); +} +#endif + // CUDA intrinsics #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350)) || \ @@ -88,6 +97,8 @@ inline C10_HOST_DEVICE Half operator-(const Half& a) { #if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ defined(__HIP_DEVICE_COMPILE__) return __hneg(a); +#elif defined(__SYCL_DEVICE_ONLY__) + return -static_cast(a); #else return -static_cast(a); #endif diff --git a/c10/util/Half.h b/c10/util/Half.h index 517f5807d557..dc51f032fdc2 100644 --- a/c10/util/Half.h +++ b/c10/util/Half.h @@ -45,6 +45,10 @@ #include #endif +#ifdef SYCL_LANGUAGE_VERSION +#include +#endif + // Standard check for compiling CUDA with clang #if defined(__clang__) && defined(__CUDA__) && defined(__CUDA_ARCH__) #define C10_DEVICE_HOST_FUNCTION __device__ __host__ @@ -390,29 +394,59 @@ struct alignas(2) Half { inline C10_HOST_DEVICE Half(const __half& value); inline C10_HOST_DEVICE operator __half() const; #endif +#ifdef SYCL_LANGUAGE_VERSION + inline C10_HOST_DEVICE Half(const sycl::half& value); + inline C10_HOST_DEVICE operator sycl::half() const; +#endif }; -// This is just a placeholder for whatever complex representation we -// end up deciding to use for half-precision complex numbers. 
+// TODO : move to complex.h template <> struct alignas(4) complex { - using value_type = Half; Half real_; Half imag_; + + // Constructors complex() = default; - Half real() const { + // Half constructor is not constexpr so the following constructor can't + // be constexpr + C10_HOST_DEVICE explicit inline complex(const Half& real, const Half& imag) + : real_(real), imag_(imag) {} + C10_HOST_DEVICE inline complex(const c10::complex& value) + : real_(value.real()), imag_(value.imag()) {} + + // Conversion operator + inline C10_HOST_DEVICE operator c10::complex() const { + return {real_, imag_}; + } + + constexpr C10_HOST_DEVICE Half real() const { return real_; } - Half imag() const { + constexpr C10_HOST_DEVICE Half imag() const { return imag_; } - explicit inline complex(c10::complex value) - : real_(value.real()), imag_(value.imag()) {} - explicit inline complex(c10::complex value) - : real_(static_cast(value.real())), - imag_(static_cast(value.imag())) {} - inline operator c10::complex() const { - return {real_, imag_}; + + complex& operator+=(const complex& other) { + real_ = static_cast(real_) + static_cast(other.real_); + imag_ = static_cast(imag_) + static_cast(other.imag_); + return *this; + } + + complex& operator-=(const complex& other) { + real_ = static_cast(real_) - static_cast(other.real_); + imag_ = static_cast(imag_) - static_cast(other.imag_); + return *this; + } + + complex& operator*=(const complex& other) { + auto a = static_cast(real_); + auto b = static_cast(imag_); + auto c = static_cast(other.real()); + auto d = static_cast(other.imag()); + real_ = a * c - b * d; + imag_ = a * d + b * c; + return *this; } }; @@ -442,7 +476,7 @@ struct alignas(4) complex { // for `f > limit::max()` below template typename std::enable_if::value, bool>::type overflows( - From f) { + From /*f*/) { return false; } diff --git a/c10/util/LeftRight.h b/c10/util/LeftRight.h index 13529f2ea0c7..e45267cb8f7e 100644 --- a/c10/util/LeftRight.h +++ b/c10/util/LeftRight.h @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -192,13 +193,9 @@ class LeftRight final { // read-write lock to protect T (data). template class RWSafeLeftRightWrapper final { - using mutexType = std::mutex; - using rLockType = std::unique_lock; - using wLockType = std::unique_lock; - public: template - explicit RWSafeLeftRightWrapper(const Args&... args) : _data{args...} {} + explicit RWSafeLeftRightWrapper(const Args&... args) : data_{args...} {} // RWSafeLeftRightWrapper is not copyable or moveable since LeftRight // is not copyable or moveable. 
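// The complex<Half> operator*= added above follows a promote-compute-narrow
// pattern: widen both operands to float, apply (a+bi)(c+di) = (ac-bd) + (ad+bc)i,
// then narrow the result back to Half. A standalone sketch of the same pattern,
// using float as the narrow type and double as the wide type in place of
// Half/float; ComplexLo is an illustrative name, not a c10 type.
#include <iostream>

struct ComplexLo {
  float real_;
  float imag_;

  ComplexLo& operator*=(const ComplexLo& other) {
    // Promote to the wider type before doing the arithmetic.
    const double a = real_, b = imag_;
    const double c = other.real_, d = other.imag_;
    real_ = static_cast<float>(a * c - b * d);
    imag_ = static_cast<float>(a * d + b * c);
    return *this;
  }
};

int main() {
  ComplexLo x{1.5f, -2.0f};
  const ComplexLo y{0.5f, 3.0f};
  x *= y;  // (1.5 - 2i)(0.5 + 3i) = 6.75 + 3.5i
  std::cout << x.real_ << " + " << x.imag_ << "i\n";
}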
@@ -209,19 +206,17 @@ class RWSafeLeftRightWrapper final { template auto read(F&& readFunc) const -> typename std::result_of::type { - rLockType lock(mutex_); - return readFunc(_data); + return data_.withLock( + [&readFunc](T const& data) { return readFunc(data); }); } template auto write(F&& writeFunc) -> typename std::result_of::type { - wLockType lock(mutex_); - return writeFunc(_data); + return data_.withLock([&writeFunc](T& data) { return writeFunc(data); }); } private: - T _data; - mutable mutexType mutex_; + c10::Synchronized data_; }; } // namespace c10 diff --git a/c10/util/Logging.cpp b/c10/util/Logging.cpp index c1ede582ff2a..fe74e4954864 100644 --- a/c10/util/Logging.cpp +++ b/c10/util/Logging.cpp @@ -145,8 +145,13 @@ bool LogAPIUsageFakeReturn(const std::string& event) try { // static destructor race return true; } -} // namespace detail +namespace { + +void setLogLevelFlagFromEnv(); + +} // namespace +} // namespace detail } // namespace c10 #if defined(C10_USE_GFLAGS) && defined(C10_USE_GLOG) @@ -198,23 +203,39 @@ bool IsGoogleLoggingInitialized(); } // namespace google namespace c10 { -bool InitCaffeLogging(int* argc, char** argv) { - if (*argc == 0) - return true; +namespace { + +void initGoogleLogging(char const* name) { #if !defined(_MSC_VER) // This trick can only be used on UNIX platforms if (!::google::glog_internal_namespace_::IsGoogleLoggingInitialized()) #endif { - ::google::InitGoogleLogging(argv[0]); + ::google::InitGoogleLogging(name); #if !defined(_MSC_VER) // This is never defined on Windows -#if !defined(__XROS__) ::google::InstallFailureSignalHandler(); -#endif #endif } +} + +} // namespace + +void initLogging() { + detail::setLogLevelFlagFromEnv(); + + UpdateLoggingLevelsFromFlags(); +} + +bool InitCaffeLogging(int* argc, char** argv) { + if (*argc == 0) { + return true; + } + + initGoogleLogging(argv[0]); + UpdateLoggingLevelsFromFlags(); + return true; } @@ -254,6 +275,11 @@ C10_DEFINE_int( "The minimum log level that caffe2 will output."); namespace c10 { + +void initLogging() { + detail::setLogLevelFlagFromEnv(); +} + bool InitCaffeLogging(int* argc, char** argv) { // When doing InitCaffeLogging, we will assume that caffe's flag parser has // already finished. @@ -356,3 +382,53 @@ MessageLogger::~MessageLogger() { } // namespace c10 #endif // !C10_USE_GLOG + +namespace c10 { +namespace detail { +namespace { + +void setLogLevelFlagFromEnv() { + const char* level_str = std::getenv("TORCH_CPP_LOG_LEVEL"); + + // Not set, fallback to the default level (i.e. WARNING). + std::string level{level_str != nullptr ? level_str : ""}; + if (level.empty()) { + return; + } + + std::transform( + level.begin(), level.end(), level.begin(), [](unsigned char c) { + return toupper(c); + }); + + if (level == "0" || level == "INFO") { + FLAGS_caffe2_log_level = 0; + + return; + } + if (level == "1" || level == "WARNING") { + FLAGS_caffe2_log_level = 1; + + return; + } + if (level == "2" || level == "ERROR") { + FLAGS_caffe2_log_level = 2; + + return; + } + if (level == "3" || level == "FATAL") { + FLAGS_caffe2_log_level = 3; + + return; + } + + std::cerr + << "`TORCH_CPP_LOG_LEVEL` environment variable cannot be parsed. Valid values are " + "`INFO`, `WARNING`, `ERROR`, and `FATAL` or their numerical equivalents `0`, `1`, " + "`2`, and `3`." 
+ << std::endl; +} + +} // namespace +} // namespace detail +} // namespace c10 diff --git a/c10/util/Logging.h b/c10/util/Logging.h index fd78a21fc594..e2ed61de606f 100644 --- a/c10/util/Logging.h +++ b/c10/util/Logging.h @@ -80,7 +80,7 @@ C10_API void UpdateLoggingLevelsFromFlags(); const char* file, const int line, const char* condition, - detail::CompileTimeEmptyString msg, + detail::CompileTimeEmptyString /*msg*/, const void* caller = nullptr) { ThrowEnforceNotMet(file, line, condition, "", caller); } @@ -103,7 +103,7 @@ C10_API void UpdateLoggingLevelsFromFlags(); const char* file, const int line, const char* condition, - detail::CompileTimeEmptyString msg, + detail::CompileTimeEmptyString /*msg*/, const void* caller = nullptr) { ThrowEnforceFiniteNotMet(file, line, condition, "", caller); } @@ -305,6 +305,9 @@ namespace detail { C10_API bool LogAPIUsageFakeReturn(const std::string& context); } // namespace detail +// Initializes the c10 logger. +C10_API void initLogging(); + } // namespace c10 #endif // C10_UTIL_LOGGING_H_ diff --git a/c10/util/MaybeOwned.h b/c10/util/MaybeOwned.h index a3028f22ea18..a698e275c119 100644 --- a/c10/util/MaybeOwned.h +++ b/c10/util/MaybeOwned.h @@ -24,7 +24,7 @@ struct MaybeOwnedTraitsGenericImpl { lhs = rhs; } - static void destroyBorrow(borrow_type& toDestroy) {} + static void destroyBorrow(borrow_type& /*toDestroy*/) {} static const owned_type& referenceFromBorrow(const borrow_type& borrow) { return *borrow; diff --git a/c10/util/Metaprogramming.h b/c10/util/Metaprogramming.h index 30f6d7c590a5..1f7fcf363f39 100644 --- a/c10/util/Metaprogramming.h +++ b/c10/util/Metaprogramming.h @@ -398,7 +398,7 @@ template < index::value, int> = 0> decltype(auto) extract_tuple_element_by_index( HeadTuple&& head_tuple, - TailTuples&&... tail_tuples) { + TailTuples&&... /*tail_tuples*/) { // TODO if constexpr instead of enable_if return std::get(std::forward(head_tuple)); } @@ -409,7 +409,7 @@ template < class... TailTuples, std::enable_if_t= std::tuple_size::value, int> = 0> decltype(auto) extract_tuple_element_by_index( - HeadTuple&& head_tuple, + HeadTuple&& /*head_tuple*/, TailTuples&&... tail_tuples) { // TODO if constexpr instead of enable_if return extract_tuple_element_by_index< diff --git a/c10/util/Optional.h b/c10/util/Optional.h index e81911296bc9..17f4d5a8007f 100644 --- a/c10/util/Optional.h +++ b/c10/util/Optional.h @@ -12,7 +12,7 @@ // C10 // - Move file to `c10` namespace. // - Remove macro use in line 478 because the nvcc device compiler cannot handle -// it it. +// it. // - Revise constructor logic so that it is 1) consistent with c++ 17 standard // documented here in (8): // https://en.cppreference.com/w/cpp/utility/optional/optional, and 2) able to diff --git a/c10/util/OptionalArrayRef.h b/c10/util/OptionalArrayRef.h new file mode 100644 index 000000000000..7ca375d7cb78 --- /dev/null +++ b/c10/util/OptionalArrayRef.h @@ -0,0 +1,228 @@ +// This file defines OptionalArrayRef, a class that has almost the same +// exact functionality as c10::optional>, except that its +// converting constructor fixes a dangling pointer issue. +// +// The implicit converting constructor of both c10::optional> and +// std::optional> can cause the underlying ArrayRef to store +// a dangling pointer. OptionalArrayRef prevents this by wrapping +// a c10::optional> and fixing the constructor implementation. +// +// See https://github.com/pytorch/pytorch/issues/63645 for more on this. 
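// A standalone illustration of the dangling-pointer hazard described in the
// OptionalArrayRef header comment above. View is a tiny stand-in for a
// non-owning ArrayRef-like type, not the real c10::ArrayRef: wrapping such a
// view in optional<> and constructing it from a temporary leaves the view
// pointing at freed storage, which is the class of bug OptionalArrayRef's
// constructors are written to avoid.
#include <cstddef>
#include <optional>
#include <vector>

struct View {
  const int* data = nullptr;
  std::size_t size = 0;
  // Implicit converting constructor, analogous to ArrayRef(const std::vector<T>&).
  View(const std::vector<int>& v) : data(v.data()), size(v.size()) {}
};

std::optional<View> make_view() {
  // The temporary vector dies at the end of this full expression, but the View
  // stored inside the optional still points at its (now freed) buffer.
  return std::optional<View>(std::vector<int>{1, 2, 3});
}

int main() {
  auto v = make_view();
  // Reading through v->data here would be undefined behavior (dangling pointer).
  return v.has_value() ? 0 : 1;
}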
+ +#pragma once + +#include +#include + +namespace c10 { + +template +class OptionalArrayRef final { + public: + // Constructors + + constexpr OptionalArrayRef() noexcept {} + + constexpr OptionalArrayRef(nullopt_t) noexcept {} + + OptionalArrayRef(const OptionalArrayRef& other) = default; + + OptionalArrayRef(OptionalArrayRef&& other) = default; + + constexpr OptionalArrayRef(const optional>& other) noexcept + : wrapped_opt_array_ref(other) {} + + constexpr OptionalArrayRef(optional>&& other) noexcept + : wrapped_opt_array_ref(other) {} + + constexpr OptionalArrayRef(const T& value) noexcept + : wrapped_opt_array_ref(value) {} + + template < + typename U = ArrayRef, + std::enable_if_t< + !std::is_same, OptionalArrayRef>::value && + !std::is_same, in_place_t>::value && + std::is_constructible, U&&>::value && + std::is_convertible>::value && + !std::is_convertible::value, + bool> = false> + constexpr OptionalArrayRef(U&& value) noexcept( + std::is_nothrow_constructible, U&&>::value) + : wrapped_opt_array_ref(value) {} + + template < + typename U = ArrayRef, + std::enable_if_t< + !std::is_same, OptionalArrayRef>::value && + !std::is_same, in_place_t>::value && + std::is_constructible, U&&>::value && + !std::is_convertible>::value, + bool> = false> + constexpr explicit OptionalArrayRef(U&& value) noexcept( + std::is_nothrow_constructible, U&&>::value) + : wrapped_opt_array_ref(value) {} + + template + constexpr explicit OptionalArrayRef(in_place_t ip, Args&&... args) noexcept + : wrapped_opt_array_ref(ip, args...) {} + + template + constexpr explicit OptionalArrayRef( + in_place_t ip, + std::initializer_list il, + Args&&... args) + : wrapped_opt_array_ref(ip, il, args...) {} + + // Destructor + + ~OptionalArrayRef() = default; + + // Assignment + + constexpr OptionalArrayRef& operator=(nullopt_t) noexcept { + wrapped_opt_array_ref = c10::nullopt; + return *this; + } + + OptionalArrayRef& operator=(const OptionalArrayRef& other) = default; + + OptionalArrayRef& operator=(OptionalArrayRef&& other) = default; + + constexpr OptionalArrayRef& operator=( + const optional>& other) noexcept { + wrapped_opt_array_ref = other; + return *this; + } + + constexpr OptionalArrayRef& operator=( + optional>&& other) noexcept { + wrapped_opt_array_ref = other; + return *this; + } + + template > + constexpr std::enable_if_t< + !std::is_same, OptionalArrayRef>::value && + std::is_constructible, U&&>::value && + std::is_assignable&, U&&>::value, + OptionalArrayRef&> + operator=(U&& value) noexcept( + std::is_nothrow_constructible, U&&>::value&& + std::is_nothrow_assignable&, U&&>::value) { + wrapped_opt_array_ref = value; + return *this; + } + + // Observers + + constexpr ArrayRef* operator->() noexcept { + return &wrapped_opt_array_ref.value(); + } + + constexpr const ArrayRef* operator->() const noexcept { + return &wrapped_opt_array_ref.value(); + } + + constexpr ArrayRef& operator*() & noexcept { + return wrapped_opt_array_ref.value(); + } + + constexpr const ArrayRef& operator*() const& noexcept { + return wrapped_opt_array_ref.value(); + } + + constexpr ArrayRef&& operator*() && noexcept { + return std::move(wrapped_opt_array_ref.value()); + } + + constexpr const ArrayRef&& operator*() const&& noexcept { + return std::move(wrapped_opt_array_ref.value()); + } + + constexpr explicit operator bool() const noexcept { + return wrapped_opt_array_ref.has_value(); + } + + constexpr bool has_value() const noexcept { + return wrapped_opt_array_ref.has_value(); + } + + constexpr ArrayRef& value() & { + return 
wrapped_opt_array_ref.value(); + } + + constexpr const ArrayRef& value() const& { + return wrapped_opt_array_ref.value(); + } + + constexpr ArrayRef&& value() && { + return std::move(wrapped_opt_array_ref.value()); + } + + constexpr const ArrayRef&& value() const&& { + return std::move(wrapped_opt_array_ref.value()); + } + + template + constexpr std:: + enable_if_t>::value, ArrayRef> + value_or(U&& default_value) const& { + return wrapped_opt_array_ref.value_or(default_value); + } + + template + constexpr std:: + enable_if_t>::value, ArrayRef> + value_or(U&& default_value) && { + return wrapped_opt_array_ref.value_or(default_value); + } + + // Modifiers + + constexpr void swap(OptionalArrayRef& other) noexcept { + std::swap(wrapped_opt_array_ref, other.wrapped_opt_array_ref); + } + + constexpr void reset() noexcept { + wrapped_opt_array_ref.reset(); + } + + template + constexpr std::enable_if_t< + std::is_constructible, Args&&...>::value, + ArrayRef&> + emplace(Args&&... args) noexcept( + std::is_nothrow_constructible, Args&&...>::value) { + return wrapped_opt_array_ref.emplace(args...); + } + + template + constexpr ArrayRef& emplace( + std::initializer_list il, + Args&&... args) noexcept { + return wrapped_opt_array_ref.emplace(il, args...); + } + + private: + optional> wrapped_opt_array_ref; +}; + +using OptionalIntArrayRef = OptionalArrayRef; + +inline bool operator==( + const OptionalIntArrayRef& a1, + const IntArrayRef& other) { + if (!a1.has_value()) { + return false; + } + return a1.value() == other; +} + +inline bool operator==( + const c10::IntArrayRef& a1, + const c10::OptionalIntArrayRef& a2) { + return a2 == a1; +} + +} // namespace c10 diff --git a/c10/util/SmallVector.h b/c10/util/SmallVector.h index 7567ce4add05..1fcc4a1a8f43 100644 --- a/c10/util/SmallVector.h +++ b/c10/util/SmallVector.h @@ -193,6 +193,8 @@ class SmallVectorTemplateCommon /// Check whether Elt will be invalidated by resizing the vector to NewSize. void assertSafeToReferenceAfterResize(const void* Elt, size_t NewSize) { + (void)Elt; // Suppress unused variable warning + (void)NewSize; // Suppress unused variable warning assert( isSafeToReferenceAfterResize(Elt, NewSize) && "Attempting to reference an element of the vector in an operation " diff --git a/c10/util/Synchronized.h b/c10/util/Synchronized.h new file mode 100644 index 000000000000..1679d7060fe0 --- /dev/null +++ b/c10/util/Synchronized.h @@ -0,0 +1,61 @@ +#pragma once + +#include + +namespace c10 { + +/** + * A very simple Synchronization class for error-free use of data + * in a multi-threaded context. See folly/docs/Synchronized.md for + * the inspiration of this class. + * + * Full URL: + * https://github.com/facebook/folly/blob/main/folly/docs/Synchronized.md + * + * This class implements a small subset of the generic functionality + * implemented by folly:Synchronized. Specifically, only withLock + * is implemeted here since it's the smallest possible API that is + * able to cover a large surface area of functionality offered by + * folly::Synchronized. + */ +template +class Synchronized final { + mutable std::mutex mutex_; + T data_; + + public: + Synchronized() = default; + Synchronized(T const& data) : data_(data) {} + Synchronized(T&& data) : data_(data) {} + + // Don't permit copy construction, move, assignment, or + // move assignment, since the underlying std::mutex + // isn't necessarily copyable/moveable. 
+ Synchronized(Synchronized const&) = delete; + Synchronized(Synchronized&&) = delete; + Synchronized operator=(Synchronized const&) = delete; + Synchronized operator=(Synchronized&&) = delete; + + /** + * To use, call withLock with a callback that accepts T either + * by copy or by reference. Use the protected variable in the + * provided callback safely. + */ + template + typename std::result_of::type withLock(CB cb) { + std::lock_guard guard(this->mutex_); + return cb(this->data_); + } + + /** + * To use, call withLock with a callback that accepts T either + * by copy or by const reference. Use the protected variable in + * the provided callback safely. + */ + template + typename std::result_of::type withLock(CB cb) const { + std::lock_guard guard(this->mutex_); + return cb(this->data_); + } +}; +} // end namespace c10 diff --git a/c10/util/TypeCast.h b/c10/util/TypeCast.h index 86c5c9f62231..1c6a72bab492 100644 --- a/c10/util/TypeCast.h +++ b/c10/util/TypeCast.h @@ -45,7 +45,8 @@ struct static_cast_with_inter_type { C10_HOST_DEVICE __ubsan_ignore_undefined__ static inline dest_t apply( src_t src) { constexpr bool real = needs_real::value; - return static_cast(maybe_real::apply(src)); + auto r = maybe_real::apply(src); + return static_cast(r); } }; @@ -68,6 +69,36 @@ struct static_cast_with_inter_type { } }; +template <> +struct static_cast_with_inter_type, c10::BFloat16> { + C10_HOST_DEVICE __ubsan_ignore_undefined__ static inline c10::complex< + c10::Half> + apply(c10::BFloat16 src) { + return static_cast>(c10::complex{src}); + } +}; + +template <> +struct static_cast_with_inter_type, c10::Half> { + C10_HOST_DEVICE __ubsan_ignore_undefined__ static inline c10::complex< + c10::Half> + apply(c10::Half src) { + return static_cast>(c10::complex{src}); + } +}; + +template <> +struct static_cast_with_inter_type< + c10::complex, + c10::complex> { + C10_HOST_DEVICE __ubsan_ignore_undefined__ static inline c10::complex< + c10::Half> + apply(c10::complex src) { + return static_cast>( + static_cast>(src)); + } +}; + // Dynamic type casting utils: // - fetch_and_cast // - cast_and_store @@ -130,7 +161,7 @@ C10_HOST_DEVICE inline dest_t fetch_and_cast( const ScalarType src_type, const void* ptr) { switch (src_type) { - AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(FETCH_AND_CAST_CASE) + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(FETCH_AND_CAST_CASE) default: ERROR_UNSUPPORTED_CAST } @@ -149,7 +180,7 @@ C10_HOST_DEVICE inline void cast_and_store( void* ptr, src_t value) { switch (dest_type) { - AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(CAST_AND_STORE_CASE) + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(CAST_AND_STORE_CASE) default:; } ERROR_UNSUPPORTED_CAST diff --git a/c10/util/TypeSafeSignMath.h b/c10/util/TypeSafeSignMath.h index 155f01f292ba..7eb6d61c122e 100644 --- a/c10/util/TypeSafeSignMath.h +++ b/c10/util/TypeSafeSignMath.h @@ -17,8 +17,8 @@ namespace c10 { /// Returns false since we cannot have x < 0 if x is unsigned. 
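// A minimal usage sketch for the withLock pattern provided by the c10::Synchronized
// class added above. To keep the example self-contained it uses a stripped-down
// local re-implementation (MiniSynchronized, an illustrative name) instead of the
// c10 header; the real class also offers a const overload and deletes copy/move.
#include <iostream>
#include <mutex>
#include <thread>
#include <vector>

template <typename T>
class MiniSynchronized {
  mutable std::mutex mutex_;
  T data_;

 public:
  explicit MiniSynchronized(T data) : data_(std::move(data)) {}

  // Run the callback while holding the lock, passing the protected data by reference.
  template <typename CB>
  auto withLock(CB cb) {
    std::lock_guard<std::mutex> guard(mutex_);
    return cb(data_);
  }
};

int main() {
  MiniSynchronized<int> counter(0);
  std::vector<std::thread> threads;
  for (int t = 0; t < 4; ++t) {
    threads.emplace_back([&counter] {
      for (int i = 0; i < 10000; ++i) {
        counter.withLock([](int& v) { ++v; });
      }
    });
  }
  for (auto& t : threads) {
    t.join();
  }
  counter.withLock([](int& v) { std::cout << v << "\n"; });  // prints 40000
}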
template static inline constexpr bool is_negative( - const T& x, - std::true_type is_unsigned) { + const T& /*x*/, + std::true_type /*is_unsigned*/) { return false; } @@ -26,7 +26,7 @@ static inline constexpr bool is_negative( template static inline constexpr bool is_negative( const T& x, - std::false_type is_unsigned) { + std::false_type /*is_unsigned*/) { return x < T(0); } @@ -42,13 +42,15 @@ inline constexpr bool is_negative(const T& x) { /// Returns the sign of an unsigned variable x as 0, 1 template -static inline constexpr int signum(const T& x, std::true_type is_unsigned) { +static inline constexpr int signum(const T& x, std::true_type /*is_unsigned*/) { return T(0) < x; } /// Returns the sign of a signed variable x as -1, 0, 1 template -static inline constexpr int signum(const T& x, std::false_type is_unsigned) { +static inline constexpr int signum( + const T& x, + std::false_type /*is_unsigned*/) { return (T(0) < x) - (x < T(0)); } @@ -68,6 +70,14 @@ inline constexpr bool signs_differ(const T& a, const U& b) { return is_negative(a) != is_negative(b); } +// Suppress sign compare warning when compiling with GCC +// as later does not account for short-circuit rule before +// raising the warning, see https://godbolt.org/z/Tr3Msnz99 +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wsign-compare" +#endif + /// Returns true if x is greater than the greatest value of the type Limit template inline constexpr bool greater_than_max(const T& x) { @@ -76,12 +86,16 @@ inline constexpr bool greater_than_max(const T& x) { return can_overflow && x > std::numeric_limits::max(); } +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + /// Returns true if x < lowest(Limit). Standard comparison template static inline constexpr bool less_than_lowest( const T& x, - std::false_type limit_is_unsigned, - std::false_type x_is_unsigned) { + std::false_type /*limit_is_unsigned*/, + std::false_type /*x_is_unsigned*/) { return x < std::numeric_limits::lowest(); } @@ -89,9 +103,9 @@ static inline constexpr bool less_than_lowest( /// negative values but x cannot be negative because it is unsigned template static inline constexpr bool less_than_lowest( - const T& x, - std::false_type limit_is_unsigned, - std::true_type x_is_unsigned) { + const T& /*x*/, + std::false_type /*limit_is_unsigned*/, + std::true_type /*x_is_unsigned*/) { return false; } @@ -100,17 +114,17 @@ static inline constexpr bool less_than_lowest( template static inline constexpr bool less_than_lowest( const T& x, - std::true_type limit_is_unsigned, - std::false_type x_is_unsigned) { + std::true_type /*limit_is_unsigned*/, + std::false_type /*x_is_unsigned*/) { return x < T(0); } /// Returns false sign both types are unsigned template static inline constexpr bool less_than_lowest( - const T& x, - std::true_type limit_is_unsigned, - std::true_type x_is_unsigned) { + const T& /*x*/, + std::true_type /*limit_is_unsigned*/, + std::true_type /*x_is_unsigned*/) { return false; } diff --git a/c10/util/accumulate.h b/c10/util/accumulate.h index 086a7977401c..8d0cc49c8ecb 100644 --- a/c10/util/accumulate.h +++ b/c10/util/accumulate.h @@ -82,7 +82,7 @@ template < inline int64_t numelements_from_dim(const int k, const C& dims) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(k >= 0); - if (k > dims.size()) { + if (k > static_cast(dims.size())) { return 1; } else { auto cbegin = dims.cbegin(); diff --git a/c10/util/int128.cpp b/c10/util/int128.cpp index a080e73430b3..f83dba499833 100644 --- a/c10/util/int128.cpp +++ b/c10/util/int128.cpp @@ 
-171,7 +171,7 @@ std::ostream& operator<<(std::ostream& o, const uint128& b) { // Add the requisite padding. std::streamsize width = o.width(0); - if (width > rep.size()) { + if (width > static_cast(rep.size())) { if ((flags & std::ios::adjustfield) == std::ios::left) { rep.append(width - rep.size(), o.fill()); } else { diff --git a/c10/util/intrusive_ptr.h b/c10/util/intrusive_ptr.h index d089ff86eeab..ef256b40ca3b 100644 --- a/c10/util/intrusive_ptr.h +++ b/c10/util/intrusive_ptr.h @@ -146,14 +146,18 @@ class C10_API intrusive_ptr_target { // intrusive_ptr_target supports copy and move: but refcount and weakcount // don't participate (since they are intrinsic properties of the memory // location) - intrusive_ptr_target(intrusive_ptr_target&& other) noexcept + intrusive_ptr_target(intrusive_ptr_target&& /*other*/) noexcept : intrusive_ptr_target() {} - intrusive_ptr_target& operator=(intrusive_ptr_target&& other) noexcept { + + intrusive_ptr_target& operator=(intrusive_ptr_target&& /*other*/) noexcept { return *this; } - intrusive_ptr_target(const intrusive_ptr_target& other) noexcept + + intrusive_ptr_target(const intrusive_ptr_target& /*other*/) noexcept : intrusive_ptr_target() {} - intrusive_ptr_target& operator=(const intrusive_ptr_target& other) noexcept { + + intrusive_ptr_target& operator=( + const intrusive_ptr_target& /*other*/) noexcept { return *this; } @@ -289,7 +293,6 @@ class intrusive_ptr final { delete target_; } } - target_ = NullType::singleton(); } // raw pointer constructors are not public because we shouldn't make @@ -413,6 +416,7 @@ class intrusive_ptr final { void reset() noexcept { reset_(); + target_ = NullType::singleton(); } void swap(intrusive_ptr& rhs) noexcept { @@ -591,6 +595,20 @@ inline bool operator==( return lhs.get() == rhs.get(); } +template +inline bool operator==( + const intrusive_ptr& lhs, + std::nullptr_t) noexcept { + return lhs.get() == nullptr; +} + +template +inline bool operator==( + std::nullptr_t, + const intrusive_ptr& rhs) noexcept { + return nullptr == rhs.get(); +} + template inline bool operator!=( const intrusive_ptr& lhs, @@ -598,6 +616,19 @@ inline bool operator!=( return !operator==(lhs, rhs); } +template +inline bool operator!=( + const intrusive_ptr& lhs, + std::nullptr_t) noexcept { + return !operator==(lhs, nullptr); +} + +template +inline bool operator!=( + std::nullptr_t, + const intrusive_ptr& rhs) noexcept { + return !operator==(nullptr, rhs); +} template struct MaybeOwnedTraits> { using owned_type = c10::intrusive_ptr; @@ -624,7 +655,7 @@ struct MaybeOwnedTraits> { return &borrow; } - static bool debugBorrowIsValid(const borrow_type& borrow) { + static bool debugBorrowIsValid(const borrow_type& /*borrow*/) { return true; } }; diff --git a/c10/util/llvmMathExtras.h b/c10/util/llvmMathExtras.h index 46b3e1e3613d..37b0ab8b6872 100644 --- a/c10/util/llvmMathExtras.h +++ b/c10/util/llvmMathExtras.h @@ -371,7 +371,7 @@ constexpr inline typename std::enable_if<(N < 64), bool>::type isUInt( } template constexpr inline typename std::enable_if= 64, bool>::type isUInt( - uint64_t X) { + uint64_t /*X*/) { return true; } diff --git a/c10/util/safe_numerics.h b/c10/util/safe_numerics.h new file mode 100644 index 000000000000..7eb9ed39395d --- /dev/null +++ b/c10/util/safe_numerics.h @@ -0,0 +1,74 @@ +#pragma once +#include +#include + +#include +#include +#include + +// GCC has __builtin_mul_overflow from before it supported __has_builtin +#ifdef _MSC_VER +#define C10_HAS_BUILTIN_OVERFLOW() (0) +#include +#include +#else +#define 
C10_HAS_BUILTIN_OVERFLOW() (1) +#endif + +namespace c10 { + +C10_ALWAYS_INLINE bool add_overflows(uint64_t a, uint64_t b, uint64_t* out) { +#if C10_HAS_BUILTIN_OVERFLOW() + return __builtin_add_overflow(a, b, out); +#else + unsigned long long tmp; + auto carry = _addcarry_u64(0, a, b, &tmp); + *out = tmp; + return carry; +#endif +} + +C10_ALWAYS_INLINE bool mul_overflows(uint64_t a, uint64_t b, uint64_t* out) { +#if C10_HAS_BUILTIN_OVERFLOW() + return __builtin_mul_overflow(a, b, out); +#else + *out = a * b; + // This test isnt exact, but avoids doing integer division + return ( + (c10::llvm::countLeadingZeros(a) + c10::llvm::countLeadingZeros(b)) < 64); +#endif +} + +template +bool safe_multiplies_u64(It first, It last, uint64_t* out) { +#if C10_HAS_BUILTIN_OVERFLOW() + uint64_t prod = 1; + bool overflow = false; + for (; first != last; ++first) { + overflow |= c10::mul_overflows(prod, *first, &prod); + } + *out = prod; + return overflow; +#else + uint64_t prod = 1; + uint64_t prod_log2 = 0; + bool is_zero = false; + for (; first != last; ++first) { + auto x = static_cast(*first); + prod *= x; + // log2(0) isn't valid, so need to track it specially + is_zero |= (x == 0); + prod_log2 += c10::llvm::Log2_64_Ceil(x); + } + *out = prod; + // This test isnt exact, but avoids doing integer division + return !is_zero && (prod_log2 >= 64); +#endif +} + +template +bool safe_multiplies_u64(const Container& c, uint64_t* out) { + return safe_multiplies_u64(c.begin(), c.end(), out); +} + +} // namespace c10 diff --git a/c10/util/strides.h b/c10/util/strides.h new file mode 100644 index 000000000000..40315a625c61 --- /dev/null +++ b/c10/util/strides.h @@ -0,0 +1,27 @@ +#pragma once +#include +#include + +namespace c10 { + +// Computes the contiguous strides of a tensor, given its sizes. +static inline DimVector contiguous_strides(const IntArrayRef sizes) { + using Int = IntArrayRef::value_type; + const Int dims = static_cast(sizes.size()); + + DimVector strides; + + if (dims > 0) { + strides.assign(dims, 0); + // Start by populating the last dimension: its strides is always 1. + strides[dims - 1] = 1; + for (auto i = dims - 2; i >= 0; --i) { + // Strides can't be 0 even if sizes are 0. 
+ strides[i] = strides[i + 1] * std::max(sizes[i + 1], Int{1}); + } + } + + return strides; +} + +} // namespace c10 diff --git a/c10/util/variant.h b/c10/util/variant.h index 421efdf6f870..6b22116e685f 100644 --- a/c10/util/variant.h +++ b/c10/util/variant.h @@ -280,7 +280,21 @@ namespace std { #define C10_MPARK_BUILTIN_UNREACHABLE #endif -#if __has_builtin(__type_pack_element) +// NOTE [nvcc bug workaround] +// +// The original line `typename Front = lib::type_pack_element_t<0, Ts...>,` +// throws the following compiler error on nvcc: +// ``` +// c10/util/variant.h(2367): error: parameter pack "Ts" was referenced but not +// expanded +// ``` +// As a workaround, we skip defining C10_MPARK_TYPE_PACK_ELEMENT for nvcc +// compiler +// +// See the following issues for more context: +// https://github.com/pytorch/extension-cpp/issues/58 +// https://github.com/mpark/variant/issues/77 +#if __has_builtin(__type_pack_element) && !defined(__CUDACC__) #define C10_MPARK_TYPE_PACK_ELEMENT #endif diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 02949d50960a..4e9a90ef944d 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -63,7 +63,7 @@ if(INTERN_BUILD_ATEN_OPS) set(CMAKE_POSITION_INDEPENDENT_CODE ${__caffe2_CMAKE_POSITION_INDEPENDENT_CODE}) # Generate the headers wrapped by our operator - file(GLOB_RECURSE all_python "${PROJECT_SOURCE_DIR}/tools/codegen/*.py") + file(GLOB_RECURSE all_python "${PROJECT_SOURCE_DIR}/torchgen/*.py") add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/contrib/aten/aten_op.h COMMAND "${PYTHON_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/contrib/aten/gen_op.py @@ -89,8 +89,10 @@ if(INTERN_BUILD_ATEN_OPS) list(APPEND Caffe2_GPU_CU_SRCS ${ATen_CUDA_CU_SRCS}) list(APPEND Caffe2_GPU_CU_SRCS_W_SORT_BY_KEY ${ATen_CUDA_CU_SRCS_W_SORT_BY_KEY}) list(APPEND Caffe2_HIP_SRCS ${ATen_HIP_SRCS}) + list(APPEND Caffe2_MPS_SRCS ${ATen_MPS_SRCS}) list(APPEND Caffe2_HIP_SRCS ${ATen_HIP_SRCS_W_SORT_BY_KEY}) list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS}) + list(APPEND Caffe2_MPS_TEST_SRCS ${ATen_MPS_TEST_SRCS}) list(APPEND Caffe2_GPU_TEST_SRCS ${ATen_CUDA_TEST_SRCS}) list(APPEND Caffe2_HIP_TEST_SRCS ${ATen_HIP_TEST_SRCS}) list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CORE_TEST_SRCS}) @@ -230,6 +232,11 @@ if(PRINT_CMAKE_DEBUG_INFO) message(STATUS " " ${tmp}) endforeach() + message(STATUS "MPS sources: ") + foreach(tmp ${Caffe2_MPS_SRCS}) + message(STATUS " " ${tmp}) + endforeach() + message(STATUS "HIP test sources: ") foreach(tmp ${Caffe2_HIP_TEST_SRCS}) message(STATUS " " ${tmp}) @@ -240,6 +247,11 @@ if(PRINT_CMAKE_DEBUG_INFO) message(STATUS " " ${tmp}) endforeach() + message(STATUS "ATen MPS test sources: ") + foreach(tmp ${ATen_MPS_TEST_SRCS}) + message(STATUS " " ${tmp}) + endforeach() + message(STATUS "ATen CUDA test sources: ") foreach(tmp ${ATen_CUDA_TEST_SRCS}) message(STATUS " " ${tmp}) @@ -350,6 +362,13 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) "${TORCH_SRC_DIR}/csrc/autograd/generated/ADInplaceOrViewType_0.cpp" "${TORCH_SRC_DIR}/csrc/autograd/generated/ADInplaceOrViewType_1.cpp" ) + if(BUILD_LAZY_TS_BACKEND) + list(APPEND GENERATED_CXX_TORCH + "${TORCH_SRC_DIR}/csrc/lazy/generated/LazyNativeFunctions.cpp" + "${TORCH_SRC_DIR}/csrc/lazy/generated/RegisterAutogradLazy.cpp" + "${TORCH_SRC_DIR}/csrc/lazy/generated/RegisterLazy.cpp" + ) + endif() endif() set(GENERATED_H_TORCH @@ -360,6 +379,8 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) if(NOT INTERN_DISABLE_AUTOGRAD) list(APPEND GENERATED_H_TORCH 
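// A standalone sketch of the overflow-checked product that safe_multiplies_u64
// above computes for numel-style size calculations. It relies on the GCC/Clang
// __builtin_mul_overflow intrinsic (the MSVC branch in the diff uses a
// leading-zero estimate instead); checked_product is an illustrative name.
#include <cstdint>
#include <iostream>
#include <vector>

bool checked_product(const std::vector<std::uint64_t>& xs, std::uint64_t* out) {
  std::uint64_t prod = 1;
  bool overflow = false;
  for (std::uint64_t x : xs) {
    // Returns true if prod * x does not fit in 64 bits; prod still receives the
    // wrapped result, so we accumulate the flag rather than exiting early.
    overflow |= __builtin_mul_overflow(prod, x, &prod);
  }
  *out = prod;
  return overflow;
}

int main() {
  std::uint64_t result = 0;
  const std::vector<std::uint64_t> sizes = {1u << 20, 1u << 20, 1u << 30};  // 2^70
  std::cout << "overflowed: " << checked_product(sizes, &result) << "\n";  // prints 1
}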
"${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType.h" + "${TORCH_SRC_DIR}/csrc/lazy/generated/LazyIr.h" + "${TORCH_SRC_DIR}/csrc/lazy/generated/LazyNativeFunctions.h" ) endif() @@ -397,18 +418,33 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) ${GENERATED_TESTING_PYTHON} ) + set(GEN_PER_OPERATOR_FLAG) + if(USE_PER_OPERATOR_HEADERS) + list(APPEND GEN_PER_OPERATOR_FLAG "--per_operator_headers") + endif() + add_custom_command( OUTPUT ${TORCH_GENERATED_CODE} COMMAND "${PYTHON_EXECUTABLE}" tools/setup_helpers/generate_code.py --native-functions-path "aten/src/ATen/native/native_functions.yaml" - --nn-path "aten/src" + --tags-path "aten/src/ATen/native/tags.yaml" $<$:--disable-autograd> $<$:--selected-op-list-path="${SELECTED_OP_LIST}"> --force_schema_registration + --gen_lazy_ts_backend + ${GEN_PER_OPERATOR_FLAG} DEPENDS "${TORCH_ROOT}/aten/src/ATen/native/native_functions.yaml" + "${TORCH_ROOT}/aten/src/ATen/native/tags.yaml" + "${TORCH_ROOT}/aten/src/ATen/native/ts_native_functions.yaml" + "${TORCH_ROOT}/torch/csrc/lazy/core/shape_inference.h" + "${TORCH_ROOT}/torch/csrc/lazy/ts_backend/ts_native_functions.cpp" + "${TORCH_ROOT}/aten/src/ATen/templates/DispatchKeyNativeFunctions.h" + "${TORCH_ROOT}/aten/src/ATen/templates/DispatchKeyNativeFunctions.cpp" + "${TORCH_ROOT}/aten/src/ATen/templates/LazyIr.h" + "${TORCH_ROOT}/aten/src/ATen/templates/RegisterDispatchKey.cpp" "${TOOLS_PATH}/autograd/templates/VariableType.h" "${TOOLS_PATH}/autograd/templates/VariableType.cpp" "${TOOLS_PATH}/autograd/templates/ADInplaceOrViewType.cpp" @@ -436,6 +472,10 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) "${TOOLS_PATH}/autograd/gen_variable_type.py" "${TOOLS_PATH}/autograd/gen_inplace_or_view_type.py" "${TOOLS_PATH}/autograd/load_derivatives.py" + "${TORCH_ROOT}/torchgen/gen_backend_stubs.py" + "${TORCH_ROOT}/torchgen/gen_lazy_tensor.py" + "${TORCH_ROOT}/torchgen/api/lazy.py" + "${TORCH_ROOT}/torchgen/dest/lazy_ir.py" WORKING_DIRECTORY "${TORCH_ROOT}") @@ -475,7 +515,9 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) set(CMAKE_POSITION_INDEPENDENT_CODE TRUE) else() append_filelist("libtorch_cmake_sources" LIBTORCH_CMAKE_SRCS) - + if(BUILD_LAZY_TS_BACKEND) + append_filelist("lazy_tensor_ts_sources" LIBTORCH_CMAKE_SRCS) + endif() if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU") # TODO: Delete this line once https://github.com/pytorch/pytorch/pull/55889 lands set_source_files_properties(../torch/csrc/jit/serialization/export.cpp PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations) @@ -505,6 +547,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) ${TORCH_SRC_DIR}/csrc/jit/backends/coreml/cpp/context.cpp ${TORCH_SRC_DIR}/csrc/jit/backends/coreml/objc/PTMCoreMLBackend.mm ${TORCH_SRC_DIR}/csrc/jit/backends/coreml/objc/PTMCoreMLExecutor.mm + ${TORCH_SRC_DIR}/csrc/jit/backends/coreml/observer/PTMCoreMLObserver.mm ) list(APPEND TORCH_SRCS ${COREML_DELEGATE_SRCS}) endif() @@ -568,6 +611,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) ${TORCH_SRC_DIR}/csrc/jit/serialization/export_bytecode.cpp ${TORCH_SRC_DIR}/csrc/jit/serialization/export_module.cpp ${TORCH_SRC_DIR}/csrc/jit/serialization/flatbuffer_serializer.cpp + ${TORCH_SRC_DIR}/csrc/jit/serialization/flatbuffer_serializer_jit.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp ${TORCH_SRC_DIR}/csrc/jit/api/module_save.cpp ${TORCH_SRC_DIR}/csrc/utils/byte_order.cpp @@ -625,8 +669,20 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) 
set_source_files_properties(${TORCH_SRC_DIR}/csrc/jit/passes/frozen_conv_add_relu_fusion.cpp PROPERTIES COMPILE_FLAGS "-DUSE_CUDA=1") endif() - if(USE_MLCOMPUTE) - include(../mlc/mlc_build.cmake) + if(BUILD_ONEDNN_GRAPH) + list(APPEND Caffe2_CPU_SRCS + ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/LlgaTensorImpl.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/graph_fuser.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/graph_rewriter.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/graph_helper.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/register_interface.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/interface.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/kernel.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/defer_size_check.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/layout_propagation.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/prepare_binary.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/guard_shape.cpp + ) endif() if(USE_ROCM) @@ -709,14 +765,17 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) ${TORCH_SRC_DIR}/csrc/api/src/optim/schedulers/step_lr.cpp ${TORCH_SRC_DIR}/csrc/api/src/serialize/input-archive.cpp ${TORCH_SRC_DIR}/csrc/api/src/serialize/output-archive.cpp - ${TORCH_SRC_DIR}/csrc/utils/crash_handler.cpp ) endif() list(APPEND Caffe2_CPU_SRCS ${TORCH_SRCS}) endif() -# NOTE [ Linking AVX-n and non-AVX-n files ] +if(USE_MPS) + list(APPEND Caffe2_CPU_SRCS ${Caffe2_MPS_SRCS}) +endif() + +# NOTE [ Linking AVX and non-AVX files ] # # Regardless of the CPU capabilities, we build some files with AVX2, and AVX512 # instruction set. If the host CPU doesn't support those, we simply ignore their @@ -764,6 +823,10 @@ if(HAVE_SOVERSION) VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION}) endif() torch_compile_options(torch_cpu) # see cmake/public/utils.cmake +if(HAS_WERROR_SIGN_COMPARE AND WERROR) + # target_compile_options(torch_cpu PRIVATE "-Werror=sign-compare") + set_property(SOURCE ${ATen_CORE_SRCS} ${ATen_CPU_SRCS} APPEND PROPERTY COMPILE_OPTIONS "-Werror=sign-compare") +endif() set_property(SOURCE ${ATen_CORE_SRCS} APPEND PROPERTY COMPILE_DEFINITIONS "TORCH_ASSERT_ONLY_METHOD_OPERATORS") @@ -901,6 +964,33 @@ elseif(USE_CUDA) target_link_libraries(torch_cuda PRIVATE __caffe2_nccl) target_compile_definitions(torch_cuda PRIVATE USE_NCCL) endif() + if(BUILD_LAZY_CUDA_LINALG) + add_library(torch_cuda_linalg ${ATen_CUDA_LINALG_SRCS}) + target_compile_definitions(torch_cuda_linalg PRIVATE USE_CUDA BUILD_LAZY_CUDA_LINALG) + # Library order is important during static linking + # `torch::magma` should be mentioned before other CUDA + # to transitively include all symbols present in torch_cuda/torch_cpu + if(USE_MAGMA) + target_link_libraries(torch_cuda_linalg PRIVATE torch::magma) + # CUDAHooks reports version of MAGMA PyTorch was compiled against, i.e. needs to be able to include magma headers + get_target_property(HOOKS_INCLUDE_DIRECTORIES torch_cuda INCLUDE_DIRECTORIES) + if(NOT "${MAGMA_INCLUDE_DIR}" IN_LIST HOOKS_INCLUDE_DIRECTORIES) + set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/cuda/detail/CUDAHooks.cpp PROPERTIES INCLUDE_DIRECTORIES "${MAGMA_INCLUDE_DIR}") + endif() + endif() + target_link_libraries(torch_cuda_linalg PRIVATE + torch_cpu + torch_cuda + ${CUDA_cusolver_LIBRARY} + ) + # NS: TODO, is this really necessary? 
+ if(USE_MAGMA AND CAFFE2_STATIC_LINK_CUDA) + target_link_libraries(torch_cuda_linalg PRIVATE + "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libculibos.a" dl) + endif() + set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp PROPERTIES COMPILE_FLAGS "-DBUILD_LAZY_CUDA_LINALG") + install(TARGETS torch_cuda_linalg DESTINATION "${TORCH_INSTALL_LIB_DIR}") + endif() if(USE_PRECOMPILED_HEADERS) if(BUILD_SPLIT_CUDA) @@ -914,59 +1004,7 @@ elseif(USE_CUDA) endif() if(USE_CUDA OR USE_ROCM) - if(BUILD_SPLIT_CUDA) - set(TORCHLIB_FLAVOR torch_cuda_cu) # chose torch_cuda_cu here since JIT is in torch_cuda_cpp - elseif(USE_CUDA) - set(TORCHLIB_FLAVOR torch_cuda) - elseif(USE_ROCM) - set(TORCHLIB_FLAVOR torch_hip) - endif() - - # The list of NVFUSER runtime files - list(APPEND NVFUSER_RUNTIME_FILES - ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/block_reduction.cu - ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/block_sync_atomic.cu - ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/block_sync_default.cu - ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/broadcast.cu - ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/fp16_support.cu - ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/bf16_support.cu - ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/grid_broadcast.cu - ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/grid_reduction.cu - ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/grid_sync.cu - ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/helpers.cu - ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/index_utils.cu - ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/random_numbers.cu - ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/tensor.cu - ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/welford.cu - ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/warp.cu - ${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh - ${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/cuda/detail/UnpackRaw.cuh - ) - - file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/include/nvfuser_resources") - - # "stringify" NVFUSER runtime sources - # (generate C++ header files embedding the original input as a string literal) - set(NVFUSER_STRINGIFY_TOOL "${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/tools/stringify_file.py") - foreach(src ${NVFUSER_RUNTIME_FILES}) - get_filename_component(filename ${src} NAME_WE) - set(dst "${CMAKE_BINARY_DIR}/include/nvfuser_resources/${filename}.h") - add_custom_command( - COMMENT "Stringify NVFUSER runtime source file" - OUTPUT ${dst} - DEPENDS ${src} - COMMAND ${PYTHON_EXECUTABLE} ${NVFUSER_STRINGIFY_TOOL} -i ${src} -o ${dst} - ) - add_custom_target(nvfuser_rt_${filename} DEPENDS ${dst}) - add_dependencies(${TORCHLIB_FLAVOR} nvfuser_rt_${filename}) - - # also generate the resource headers during the configuration step - # (so tools like clang-tidy can run w/o requiring a real build) - execute_process(COMMAND - ${PYTHON_EXECUTABLE} ${NVFUSER_STRINGIFY_TOOL} -i ${src} -o ${dst}) - endforeach() - - target_include_directories(${TORCHLIB_FLAVOR} PRIVATE "${CMAKE_BINARY_DIR}/include") + include(${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/nvfuser.cmake) endif() if(NOT MSVC AND USE_XNNPACK) @@ -1061,7 +1099,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/native/QuantizedLinear.cpp PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations) set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/native/RNN.cpp PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations) 
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/native/quantized/cpu/qlinear_unpack.cpp PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations) + set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/native/quantized/qlinear_unpack.cpp PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations) endif() if(USE_TBB) @@ -1069,10 +1107,9 @@ if(USE_TBB) target_link_libraries(torch_cpu PUBLIC TBB::tbb) endif() -if(USE_BREAKPAD) - target_compile_definitions(torch_cpu PRIVATE ADD_BREAKPAD_SIGNAL_HANDLER) - target_include_directories(torch_cpu PRIVATE ${CMAKE_CURRENT_LIST_DIR}/../third_party ${CMAKE_CURRENT_LIST_DIR}/../third_party/breakpad/src) - target_link_libraries(torch_cpu PRIVATE breakpad) +if(BUILD_CAFFE2 AND BUILD_CAFFE2_OPS AND USE_FBGEMM) + # FIXME: quantization/server/conv_dnnlowp_op.cc depends on fbgemm/src/RefImplementations.h + target_include_directories(torch_cpu PRIVATE ${CMAKE_CURRENT_LIST_DIR}/../third_party) endif() target_include_directories(torch_cpu PRIVATE ${ATen_CPU_INCLUDE}) @@ -1091,10 +1128,10 @@ endif() install(DIRECTORY "${TORCH_SRC_DIR}/csrc" DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch - FILES_MATCHING PATTERN "*.h") + FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp") install(DIRECTORY "${TORCH_SRC_DIR}/csrc/distributed/c10d" DESTINATION ${TORCH_INSTALL_INCLUDE_DIR} - FILES_MATCHING PATTERN "*.hpp") + FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp") install(FILES "${TORCH_SRC_DIR}/script.h" "${TORCH_SRC_DIR}/extension.h" @@ -1108,13 +1145,16 @@ endif() DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch) endif() - if(BUILD_TEST) if(BUILD_LITE_INTERPRETER) add_subdirectory( ${TORCH_ROOT}/test/cpp/lite_interpreter_runtime ${CMAKE_BINARY_DIR}/test_lite_interpreter_runtime ) + add_subdirectory( + ${TORCH_ROOT}/test/mobile/lightweight_dispatch + ${CMAKE_BINARY_DIR}/test_codegen_unboxing + ) else() add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit) add_subdirectory( @@ -1296,8 +1336,14 @@ if(USE_DISTRIBUTED) else() if(BUILD_SPLIT_CUDA) target_compile_definitions(torch_cuda_cpp PUBLIC USE_C10D_NCCL) + if(USE_NCCL_WITH_UCC) + target_compile_definitions(torch_cuda_cpp PUBLIC USE_NCCL_WITH_UCC) + endif() else() target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL) + if(USE_NCCL_WITH_UCC) + target_compile_definitions(torch_cuda PUBLIC USE_NCCL_WITH_UCC) + endif() endif() endif() endif() @@ -1526,9 +1572,6 @@ if(USE_CUDA) elseif(USE_ROCM) target_link_libraries(torch PUBLIC torch_hip_library) endif() -if(USE_MLCOMPUTE) - target_link_libraries(torch PUBLIC torch_mlc_library) -endif() if(PRINT_CMAKE_DEBUG_INFO) print_target_properties(torch) @@ -1784,6 +1827,25 @@ if(BUILD_TEST) endif() endforeach() + if(USE_MPS) + foreach(test_src ${Caffe2_MPS_TEST_SRCS}) + get_filename_component(test_name ${test_src} NAME_WE) + add_executable(${test_name} "${test_src}") + target_link_libraries(${test_name} torch_library gtest_main) + target_include_directories(${test_name} PRIVATE $) + target_include_directories(${test_name} PRIVATE $) + target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE}) + add_test(NAME ${test_name} COMMAND $) + if(INSTALL_TEST) + install(TARGETS ${test_name} DESTINATION test) + # Install PDB files for MSVC builds + if(MSVC AND BUILD_SHARED_LIBS) + install(FILES $ DESTINATION test OPTIONAL) + endif() + endif() + endforeach() + endif() + 
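// A minimal sketch of the kind of standalone gtest binary that the
// Caffe2_MPS_TEST_SRCS loop above registers: each test source becomes its own
// executable linked against torch_library and gtest_main, which supplies main().
// The test suite name and assertion below are purely illustrative, not actual
// MPS tests from this changeset.
#include <gtest/gtest.h>

TEST(ExampleMPSSmokeTest, BasicArithmetic) {
  EXPECT_EQ(2 + 2, 4);
}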
if(USE_CUDA) foreach(test_src ${Caffe2_GPU_TEST_SRCS}) get_filename_component(test_name ${test_src} NAME_WE) @@ -1926,6 +1988,8 @@ if(BUILD_PYTHON) # ---[ Python. if(BUILD_CAFFE2) add_library(caffe2_pybind11_state MODULE ${Caffe2_CPU_PYTHON_SRCS}) + target_compile_definitions(torch PRIVATE BUILD_CAFFE2) + target_compile_definitions(torch_python PRIVATE BUILD_CAFFE2) if(USE_NUMPY) target_compile_options(caffe2_pybind11_state PRIVATE "-DUSE_NUMPY") target_link_libraries(caffe2_pybind11_state PRIVATE numpy::numpy) diff --git a/caffe2/__init__.py b/caffe2/__init__.py index e69de29bb2d1..4096a9828385 100644 --- a/caffe2/__init__.py +++ b/caffe2/__init__.py @@ -0,0 +1,6 @@ +import warnings +from torch.onnx import _CAFFE2_ATEN_FALLBACK + +if not _CAFFE2_ATEN_FALLBACK: + warnings.warn("Caffe2 support is not fully enabled in this PyTorch build. " + "Please enable Caffe2 by building PyTorch from source with `BUILD_CAFFE2=1` flag.") diff --git a/caffe2/c2_aten_srcs.bzl b/caffe2/c2_aten_srcs.bzl deleted file mode 100644 index 7755de9ccc13..000000000000 --- a/caffe2/c2_aten_srcs.bzl +++ /dev/null @@ -1,12 +0,0 @@ -ATEN_CORE_HEADER_FILES = [ - # "aten/src/" prefix is added later - "ATen/core/ATenGeneral.h", - "ATen/core/blob.h", - "ATen/core/DimVector.h", - "ATen/core/grad_mode.h", - "ATen/core/UndefinedTensorImpl.h", -] - -ATEN_CORE_SRC_FILES = [ - "aten/src/ATen/core/VariableFallbackKernel.cpp", -] diff --git a/caffe2/contrib/aten/README.md b/caffe2/contrib/aten/README.md index 593079ef1393..79a4276a65f8 100644 --- a/caffe2/contrib/aten/README.md +++ b/caffe2/contrib/aten/README.md @@ -72,7 +72,7 @@ class Add(torch.autograd.Function): @staticmethod def symbolic(g, a, b): - return g.op("ATen", a, b, operator_s = "add") + return g.at("add", a, b) @staticmethod def forward(ctx, a, b): diff --git a/caffe2/contrib/aten/aten_op_template.h b/caffe2/contrib/aten/aten_op_template.h index a5d1ea40e27a..b22b840c25ad 100644 --- a/caffe2/contrib/aten/aten_op_template.h +++ b/caffe2/contrib/aten/aten_op_template.h @@ -179,8 +179,9 @@ class ATenOp : public Operator { std::vector attrs; for (const auto i : c10::irange(operator_def.arg_size())) { auto & attr = operator_def.arg(i); - if(attr.name() == "operator" || attr.name() == "type" ) + if (attr.name() == "operator" || attr.name() == "type" || attr.name() == "overload_name") { continue; + } attrs.push_back(attr.name()); } std::sort(attrs.begin(), attrs.end()); diff --git a/caffe2/contrib/aten/aten_test.py b/caffe2/contrib/aten/aten_test.py index 4a025c3b1802..6574884245f8 100644 --- a/caffe2/contrib/aten/aten_test.py +++ b/caffe2/contrib/aten/aten_test.py @@ -1,9 +1,4 @@ - - - - - -from caffe2.python import core, dyndep +from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/contrib/aten/docs/sample.py b/caffe2/contrib/aten/docs/sample.py index 53ce19b86e89..6896f2379d8c 100644 --- a/caffe2/contrib/aten/docs/sample.py +++ b/caffe2/contrib/aten/docs/sample.py @@ -38,8 +38,8 @@ def forward(self, x, y): # graph(%input : Float(3, 4, strides=[4, 1], requires_grad=0, device=cpu), # %y : Float(3, 4, strides=[4, 1], requires_grad=0, device=cpu)): # %2 : Float(3, 4, strides=[4, 1], requires_grad=0, device=cpu) = onnx::Relu(%input) -# %3 : Tensor = onnx::ATen[operator="mul"](%2, %2) -# %4 : Float(3, 4, strides=[4, 1], requires_grad=0, device=cpu) = onnx::ATen[operator="add"](%3, %y) +# %3 : Tensor = aten::ATen[operator="mul"](%2, %2) +# %4 : Float(3, 4, strides=[4, 1], requires_grad=0, device=cpu) = 
aten::ATen[operator="add"](%3, %y) # return (%4) graph = onnx.load(f.name) diff --git a/caffe2/contrib/aten/gen_op.py b/caffe2/contrib/aten/gen_op.py index 93d4bad29f92..55f1faba2750 100755 --- a/caffe2/contrib/aten/gen_op.py +++ b/caffe2/contrib/aten/gen_op.py @@ -37,9 +37,9 @@ raise ValueError('aten_root ({}) does not exist'.format( args.aten_root)) sys.path.insert(0, os.path.join(args.aten_root, '..')) - from tools.codegen.code_template import CodeTemplate as CT + from torchgen.code_template import CodeTemplate as CT else: - from tools.codegen.code_template import CodeTemplate as CT + from torchgen.code_template import CodeTemplate as CT OP_TEMPLATE = CT.from_file( os.path.join(args.template_dir, 'aten_op_template.h')) diff --git a/caffe2/contrib/shm_mutex/shm_mutex.h b/caffe2/contrib/shm_mutex/shm_mutex.h index f2dc4ff97e8b..8f0293caf5f3 100644 --- a/caffe2/contrib/shm_mutex/shm_mutex.h +++ b/caffe2/contrib/shm_mutex/shm_mutex.h @@ -58,7 +58,7 @@ class ShmProcessMutexCheck { template struct shm_traits; -using ShmBaseHeader = struct { +struct ShmBaseHeader { std::atomic isInitialized; std::atomic countMapped; std::atomic owner; diff --git a/caffe2/core/blob_test.cc b/caffe2/core/blob_test.cc index 2249c3bcbf2a..a7e3a8d27e23 100644 --- a/caffe2/core/blob_test.cc +++ b/caffe2/core/blob_test.cc @@ -1264,7 +1264,7 @@ void TestDataType( std::string dataTypeName) { LOG(INFO) << dataTypeName; FLAGS_caffe2_serialize_using_bytes_as_holder = true; - size_t numEl = 1000; + int numEl = 1000; // Proto with int32 auto protoInt32 = CreateProtoWithInt32Data(dataType, numEl, false); caffe2::Blob blobInt32; diff --git a/caffe2/core/export_caffe2_op_to_c10.h b/caffe2/core/export_caffe2_op_to_c10.h index 66ffdf21a108..82da29a44f4b 100644 --- a/caffe2/core/export_caffe2_op_to_c10.h +++ b/caffe2/core/export_caffe2_op_to_c10.h @@ -4,12 +4,13 @@ #if defined(EXPOSE_C2_OPS) || \ !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) +#include #include #include #include -#include #include #include +#include #include #include @@ -113,7 +114,9 @@ void call_caffe2_op_from_c10( _call_caffe2_op_from_c10(stack, Schema(), &_call_caffe2_op); } -inline FunctionSchema make_function_schema_for_c10(const char* schema_str) { +inline FunctionSchema make_function_schema_for_c10( + const char* schema_str, + c10::optional optional_alias_analysis_kind) { #if !defined(EXPOSE_C2_OPS) && \ (defined(CAFFE2_IS_XPLAT_BUILD) || defined(C10_MOBILE)) throw std::logic_error( @@ -127,13 +130,17 @@ inline FunctionSchema make_function_schema_for_c10(const char* schema_str) { nullopt, IValue()); - return FunctionSchema( + auto schema = FunctionSchema( parsed_schema.name(), parsed_schema.overload_name(), std::move(arguments), parsed_schema.returns(), parsed_schema.is_vararg(), parsed_schema.is_varret()); + if (optional_alias_analysis_kind) { + schema.setAliasAnalysis(*optional_alias_analysis_kind); + } + return schema; #endif } @@ -169,7 +176,7 @@ inline FunctionSchema make_function_schema_for_c10(const char* schema_str) { * caffe2. * - all operators must call C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10 and * C10_EXPORT_CAFFE2_OP_TO_C10_CPU . - * - calling C10_EXPORT_CAFFE2_OP_TO_C10_CUDA is optional and can be omitted i f + * - calling C10_EXPORT_CAFFE2_OP_TO_C10_CUDA is optional and can be omitted if * you don't want to expose the operator for CUDA operations. * - caffe2 arguments must come after caffe2 inputs, in other words, any tensor * inputs must precede any non-tensor inputs. 
@@ -178,73 +185,85 @@ inline FunctionSchema make_function_schema_for_c10(const char* schema_str) { * - If your operator has a variable number of input tensors, make the first (!) * input an input of type TensorList. There must be no other tensor inputs. */ -#define C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(OperatorName) \ - namespace caffe2 { \ - namespace _c10_ops { \ +#define C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(OperatorName) \ + namespace caffe2 { \ + namespace _c10_ops { \ TORCH_API const FunctionSchema& schema_##OperatorName(); \ - } \ + } \ } -#define C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY(OperatorName, OperatorSchema) \ - /* Register the op schema with the c10 dispatcher */ \ - namespace caffe2 { \ - namespace _c10_ops { \ - C10_EXPORT const FunctionSchema& schema_##OperatorName() { \ - static const FunctionSchema schema = \ - ::caffe2::detail::make_function_schema_for_c10(OperatorSchema); \ - return schema; \ - } \ - TORCH_LIBRARY_FRAGMENT(_caffe2, m) { \ - m.def(::caffe2::detail::make_function_schema_for_c10(OperatorSchema)); \ - } \ - } \ +#define C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY( \ + OperatorName, OperatorSchema, OptionalAliasAnalysisKind) \ + /* Register the op schema with the c10 dispatcher */ \ + namespace caffe2 { \ + namespace _c10_ops { \ + C10_EXPORT const FunctionSchema& schema_##OperatorName() { \ + static const FunctionSchema schema = \ + ::caffe2::detail::make_function_schema_for_c10( \ + OperatorSchema, OptionalAliasAnalysisKind); \ + return schema; \ + } \ + TORCH_LIBRARY_FRAGMENT(_caffe2, m) { \ + m.def(::caffe2::detail::make_function_schema_for_c10( \ + OperatorSchema, OptionalAliasAnalysisKind)); \ + } \ + } \ } #define C10_EXPORT_CAFFE2_OP_TO_C10_CPU_KERNEL_ONLY( \ OperatorName, OperatorClass) \ /* Register call_caffe2_op_from_c10 as a kernel with the c10 dispatcher */ \ - TORCH_LIBRARY_IMPL(_caffe2, CPU, m) { \ - m.impl("_caffe2::" #OperatorName, \ - torch::CppFunction::makeFromBoxedFunction< \ - ::caffe2::detail::call_caffe2_op_from_c10< \ - ::caffe2::_c10_ops::schema_##OperatorName, \ - OperatorClass>>()); \ - } + TORCH_LIBRARY_IMPL(_caffe2, CPU, m) { \ + m.impl( \ + "_caffe2::" #OperatorName, \ + torch::CppFunction::makeFromBoxedFunction< \ + ::caffe2::detail::call_caffe2_op_from_c10< \ + ::caffe2::_c10_ops::schema_##OperatorName, \ + OperatorClass>>()); \ + } + +#define C10_EXPORT_CAFFE2_OP_TO_C10_CPU( \ + OperatorName, OperatorSchema, OperatorClass) \ + C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY( \ + OperatorName, OperatorSchema, c10::nullopt) \ + C10_EXPORT_CAFFE2_OP_TO_C10_CPU_KERNEL_ONLY(OperatorName, OperatorClass) -#define C10_EXPORT_CAFFE2_OP_TO_C10_CPU( \ - OperatorName, OperatorSchema, OperatorClass) \ - C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY(OperatorName, OperatorSchema) \ +#define C10_EXPORT_CAFFE2_OP_TO_C10_CPU_WITH_ALIAS_ANALYSIS( \ + OperatorName, OperatorSchema, OperatorClass, OptionalAliasAnalysisKind) \ + C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY( \ + OperatorName, OperatorSchema, OptionalAliasAnalysisKind) \ C10_EXPORT_CAFFE2_OP_TO_C10_CPU_KERNEL_ONLY(OperatorName, OperatorClass) #define C10_EXPORT_CAFFE2_OP_TO_C10_CUDA(OperatorName, OperatorClass) \ /* Register call_caffe2_op_from_c10 as a kernel with the c10 dispatcher */ \ - TORCH_LIBRARY_IMPL(_caffe2, CUDA, m) { \ - m.impl("_caffe2::" #OperatorName, \ - torch::CppFunction::makeFromBoxedFunction< \ - ::caffe2::detail::call_caffe2_op_from_c10< \ - ::caffe2::_c10_ops::schema_##OperatorName, \ - OperatorClass>>()); \ - } - + TORCH_LIBRARY_IMPL(_caffe2, CUDA, m) { \ + m.impl( \ + "_caffe2::" 
#OperatorName, \ + torch::CppFunction::makeFromBoxedFunction< \ + ::caffe2::detail::call_caffe2_op_from_c10< \ + ::caffe2::_c10_ops::schema_##OperatorName, \ + OperatorClass>>()); \ + } // You should never manually call the C10_EXPORT_CAFFE2_OP_TO_C10_HIP macro . // The C10_EXPORT_CAFFE2_OP_TO_C10_CUDA macro from above will be automatically // rewritten to C10_EXPORT_CAFFE2_OP_TO_C10_HIP by hipify . #define C10_EXPORT_CAFFE2_OP_TO_C10_HIP(OperatorName, OperatorClass) \ /* Register call_caffe2_op_from_c10 as a kernel with the c10 dispatcher */ \ - TORCH_LIBRARY_IMPL(_caffe2, HIP, m) { \ - m.impl("_caffe2::" #OperatorName, \ - torch::CppFunction::makeFromBoxedFunction< \ - ::caffe2::detail::call_caffe2_op_from_c10< \ - ::caffe2::_c10_ops::schema_##OperatorName, \ - OperatorClass>>()); \ - } - + TORCH_LIBRARY_IMPL(_caffe2, HIP, m) { \ + m.impl( \ + "_caffe2::" #OperatorName, \ + torch::CppFunction::makeFromBoxedFunction< \ + ::caffe2::detail::call_caffe2_op_from_c10< \ + ::caffe2::_c10_ops::schema_##OperatorName, \ + OperatorClass>>()); \ + } #else // Don't use c10 dispatcher on mobile because of binary size #define C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(OperatorName) -#define C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY(OperatorName, OperatorSchema) +#define C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY( \ + OperatorName, OperatorSchema, OptionalAliasAnalysisKind) #define C10_EXPORT_CAFFE2_OP_TO_C10_CPU_KERNEL_ONLY(OperatorName, OperatorClass) #define C10_EXPORT_CAFFE2_OP_TO_C10_CPU( \ OperatorName, OperatorSchema, OperatorClass) diff --git a/caffe2/core/qtensor.h b/caffe2/core/qtensor.h index a34da6918bcd..7dc9c59f82f6 100644 --- a/caffe2/core/qtensor.h +++ b/caffe2/core/qtensor.h @@ -60,8 +60,7 @@ class C10_EXPORT QTensor { void Resize(at::ArrayRef dim_source) { if (dims_ != dim_source) { const auto source_size = c10::multiply_integers(dim_source); - // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - if ((source_size * (precision_ + signed_)) > capacity_) { + if (static_cast(source_size * (precision_ + signed_)) > capacity_) { data_ptr_.clear(); capacity_ = 0; } @@ -188,7 +187,7 @@ class C10_EXPORT QTensor { * Returns the i-th dimension of the qtensor in int. 
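The `C10_EXPORT_CAFFE2_OP_TO_C10_*` macro block above registers each exported Caffe2 operator with the c10 dispatcher under the `_caffe2` namespace (now with an optional alias-analysis kind on the schema-only variant). A short sketch of how such a registration surfaces in Python, assuming a build with `BUILD_CAFFE2=1` so the `TORCH_LIBRARY_FRAGMENT(_caffe2, ...)` registrations actually run:

```python
import torch

# Exported ops live under the `_caffe2` namespace; attribute access resolves the
# registered schema (here "_caffe2::CopyGPUToCPU(Tensor input) -> Tensor") without
# running the kernel.
copy_gpu_to_cpu = torch.ops._caffe2.CopyGPUToCPU
print(copy_gpu_to_cpu)  # resolved op object for the schema registered above
```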
*/ inline int dim32(const int i) const { - DCHECK_LT(i, dims_.size()) << "Exceeding ndim limit " << dims_.size(); + DCHECK_LT(i, static_cast(dims_.size())) << "Exceeding ndim limit " << dims_.size(); DCHECK_GE(i, 0) << "Cannot have negative index"; CAFFE_ENFORCE_LT(dims_[i], std::numeric_limits::max()); return static_cast(dims_[i]); diff --git a/caffe2/core/serialization_test.cc b/caffe2/core/serialization_test.cc index 1912802d2ac8..902a3e01e677 100644 --- a/caffe2/core/serialization_test.cc +++ b/caffe2/core/serialization_test.cc @@ -69,7 +69,7 @@ TEST(TensorSerialization, TestUnknownDType) { auto* blobTensor = BlobGetMutableTensor(&blob, CPU); blobTensor->Resize(kTestTensorSize, 1); auto *tensorData = blobTensor->mutable_data(); - for (int n = 0; n < kTestTensorSize; ++n) { + for (unsigned n = 0; n < kTestTensorSize; ++n) { tensorData[n] = n; } auto data = SerializeBlob(blob, "test_blob"); @@ -85,7 +85,7 @@ TEST(TensorSerialization, TestUnknownDType) { EXPECT_EQ(kTestTensorSize, tensor.numel()); EXPECT_EQ(TypeMeta::Make(), tensor.dtype()); const auto* tensor_data = tensor.template data(); - for (int i = 0; i < kTestTensorSize; ++i) { + for (unsigned i = 0; i < kTestTensorSize; ++i) { EXPECT_EQ(static_cast(i), tensor_data[i]); } diff --git a/caffe2/core/stats.h b/caffe2/core/stats.h index 26fbdbe4a753..f04e37acfbe9 100644 --- a/caffe2/core/stats.h +++ b/caffe2/core/stats.h @@ -348,6 +348,7 @@ _ScopeGuard ScopeGuard(T f) { stats.field.groupName.c_str(), \ __caffe_event_value_, \ ##__VA_ARGS__); \ + (void)__caffe_event_value_; \ } #define CAFFE_DURATION(stats, field, ...) \ diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 7f2f93de53fc..de7d31fd7614 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -4,6 +4,7 @@ #include #include "caffe2/core/storage.h" +#include #include #include #include @@ -428,6 +429,11 @@ class TORCH_API Tensor final { return impl_.get()->sizes(); } + inline c10::SymIntArrayRef sym_sizes() const { + auto sizes = impl_.get()->sizes(); + return c10::SymIntArrayRef(reinterpret_cast(sizes.data()), sizes.size()); + } + inline int64_t size_from_dim(int k) const { return size_from_dim_(k, impl_->sizes()); } diff --git a/caffe2/core/transform_test.cc b/caffe2/core/transform_test.cc index adb7ecae050b..0dc6ba92c7f9 100644 --- a/caffe2/core/transform_test.cc +++ b/caffe2/core/transform_test.cc @@ -55,7 +55,7 @@ class DummyTransform : public Transform { return false; } // which index are we trying to append the new node to? - int pattern_idx = subgraph.size(); + auto pattern_idx = subgraph.size(); // type doesn't match if (g.node(idx).op.type() != pattern_chain[pattern_idx]) { return false; diff --git a/caffe2/ideep/operators/order_switch_ops.cc b/caffe2/ideep/operators/order_switch_ops.cc index 0b682c9af83f..7b8319b6c1bf 100644 --- a/caffe2/ideep/operators/order_switch_ops.cc +++ b/caffe2/ideep/operators/order_switch_ops.cc @@ -22,6 +22,10 @@ class IDEEPNHWC2NCHWOp final : public IDEEPOperator { // Thus, for iDEEP tensor, the shapes of NCHW and NHWC are identical. Y->init({X.get_dims(), X.get_data_type(), iformat::nchw}); Y->feed_from(X); + // NOTE: This ops is only used to quantization path, setting scale + // to distinguish with fp32 path activation(always return NCHW format + // even ideep tensor has NHWC format) when convert to numpy memory. + Y->set_scale({1.0}); return true; } @@ -48,6 +52,10 @@ class IDEEPNCHW2NHWCOp final : public IDEEPOperator { // Thus, for iDEEP tensor, the shapes of NCHW and NHWC are identical. 
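The iDEEP order-switch ops above now call `set_scale({1.0})` to mark quantization-path activations, while plain FP32 activations are handed back to numpy in the default NCHW layout (see the `CopyIDEEPToCPU` and `IDeepFetcher` hunks later in this patch). The reorder itself is just a layout permutation; a tiny numpy illustration with made-up shapes:

```python
import numpy as np

# A 2x8x8x3 activation stored channels-last (NHWC)...
x_nhwc = np.random.rand(2, 8, 8, 3).astype(np.float32)
# ...permuted to the default NCHW layout, which is what the FP32 fetch path now
# produces before exposing the data to numpy.
x_nchw = np.ascontiguousarray(x_nhwc.transpose(0, 3, 1, 2))
assert x_nchw.shape == (2, 3, 8, 8)
```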
Y->init({X.get_dims(), X.get_data_type(), iformat::nhwc}); Y->feed_from(X); + // NOTE: This ops is only used to quantization path, setting scale + // to distinguish with fp32 path activation(always return NCHW format + // even ideep tensor has NHWC format) when convert to numpy memory. + Y->set_scale({1.0}); return true; } diff --git a/caffe2/ideep/operators/utility_ops.cc b/caffe2/ideep/operators/utility_ops.cc index d82fe0aed866..9477666bcc5c 100644 --- a/caffe2/ideep/operators/utility_ops.cc +++ b/caffe2/ideep/operators/utility_ops.cc @@ -1,4 +1,4 @@ -#include "caffe2/operators/utility_ops.h" + #include "caffe2/operators/utility_ops.h" #include "caffe2/core/operator.h" #include "caffe2/ideep/ideep_utils.h" @@ -64,7 +64,10 @@ class CopyIDEEPToCPUOp final : public IDEEPOperator { } auto* Y = OperatorBase::OutputTensor(0, dims, at::dtype().device(CPU)); - X.to_public(Y->template mutable_data()); + itensor temp_ten( + X.get_desc().to_default_format(), + Y->template mutable_data()); + X.reorder_to(temp_ten); } else { CAFFE_THROW("Unsupported ideep type: ", static_cast(X.get_data_type())); diff --git a/caffe2/image/image_input_op.h b/caffe2/image/image_input_op.h index 5d72898bfc69..d1c27e0845a8 100644 --- a/caffe2/image/image_input_op.h +++ b/caffe2/image/image_input_op.h @@ -62,7 +62,7 @@ class ImageInputOp final : public PrefetchOperator { bool CopyPrefetched() override; private: - using BoundingBox = struct { + struct BoundingBox { bool valid; int ymin; int xmin; @@ -73,7 +73,7 @@ class ImageInputOp final : public PrefetchOperator { // Structure to store per-image information // This can be modified by the DecodeAnd* so needs // to be privatized per launch. - using PerImageArg = struct { BoundingBox bounding_params; }; + struct PerImageArg { BoundingBox bounding_params; }; bool GetImageAndLabelAndInfoFromDBValue( const string& value, diff --git a/caffe2/operators/bisect_percentile_op.h b/caffe2/operators/bisect_percentile_op.h index 8dc71795df89..8c26b111de1c 100644 --- a/caffe2/operators/bisect_percentile_op.h +++ b/caffe2/operators/bisect_percentile_op.h @@ -44,7 +44,7 @@ class BisectPercentileOp final : public Operator { pct_upper_.size(), "Feature (raw) data and upper bound dimension should match."); n_features = pct_lens_.size(); - index.reserve(n_features + 1); + index.resize(n_features + 1); index[0] = 0; for (int i = 1; i <= n_features; ++i) { index[i] = index[i - 1] + pct_lens_[i - 1]; @@ -63,12 +63,12 @@ class BisectPercentileOp final : public Operator { const auto batch_size = raw.size(0); const auto num_features = raw.size(1); CAFFE_ENFORCE_EQ(num_features, pct_lens_.size()); - const float* raw_data = raw.template data(); + const float *const raw_data = raw.template data(); // Output - auto* pct = Output(PCT, raw.sizes(), at::dtype()); - float* pct_output = pct->template mutable_data(); + auto *const pct = Output(PCT, raw.sizes(), at::dtype()); + float *const pct_output = pct->template mutable_data(); // Compute percentile for each raw feature value int feature_start_index = 0; @@ -108,20 +108,17 @@ class BisectPercentileOp final : public Operator { vector index; vector> fast_pct; - const float kEPSILON = 1e-10; + static constexpr float kEPSILON = 1e-10; - int binary_search( + int64_t binary_search( const std::vector::iterator& data, - int lo, - int hi, - float val) { - int mid; - bool low_cond, high_cond; - + int64_t lo, + int64_t hi, + const float val) { while (lo < hi) { - mid = (lo + hi) >> 1; - low_cond = (data[mid] <= val); - high_cond = (val < data[mid + 1]); + const 
auto mid = lo + (hi - lo) / 2; + const bool low_cond = (data[mid] <= val); + const bool high_cond = (val < data[mid + 1]); if (low_cond && high_cond) { return mid; } else if (!low_cond) { @@ -148,20 +145,18 @@ class BisectPercentileOp final : public Operator { return 1.; } - float result; // Interpolation by binary search const auto k = binary_search(pct_raw_it, 0, size - 1, val); if (pct_raw_it[k] == val) { // Exact match - result = pct_mapping_it[k]; + return pct_mapping_it[k]; } else { // interpolation - float w = (val - pct_raw_it[k]) / + const float w = (val - pct_raw_it[k]) / (pct_raw_it[k + 1] - pct_raw_it[k] + kEPSILON); - result = (1 - w) * pct_upper_it[k] + w * pct_lower_it[k + 1]; + return (1 - w) * pct_upper_it[k] + w * pct_lower_it[k + 1]; } - return result; } }; diff --git a/caffe2/operators/boolean_mask_ops.cc b/caffe2/operators/boolean_mask_ops.cc index 511aaee47831..ad7b28331de2 100644 --- a/caffe2/operators/boolean_mask_ops.cc +++ b/caffe2/operators/boolean_mask_ops.cc @@ -286,9 +286,6 @@ NO_GRADIENT(BooleanMaskLengths); } // namespace -// NOLINTNEXTLINE(clang-diagnostic-unused-const-variable) -const float minf = -1.0f * std::numeric_limits::infinity(); - // Template this on a functor object so we can generate different // implementations at compile time and have a better chance of inlining template diff --git a/caffe2/operators/copy_op.cc b/caffe2/operators/copy_op.cc index f2323bbaf06f..c0efef07eeb6 100644 --- a/caffe2/operators/copy_op.cc +++ b/caffe2/operators/copy_op.cc @@ -200,8 +200,10 @@ REGISTER_GRADIENT(CopyCPUToGPU, GetCPUToGPUGradient); C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY( CopyGPUToCPU, - "_caffe2::CopyGPUToCPU(Tensor input) -> Tensor"); + "_caffe2::CopyGPUToCPU(Tensor input) -> Tensor", + /*optional_alias_analysis_kind=*/c10::nullopt); C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY( CopyCPUToGPU, - "_caffe2::CopyCPUToGPU(Tensor input) -> Tensor"); + "_caffe2::CopyCPUToGPU(Tensor input) -> Tensor", + /*optional_alias_analysis_kind=*/c10::nullopt); diff --git a/caffe2/operators/deform_conv_op_impl.h b/caffe2/operators/deform_conv_op_impl.h index 011b1bf9204b..4acd92267b26 100644 --- a/caffe2/operators/deform_conv_op_impl.h +++ b/caffe2/operators/deform_conv_op_impl.h @@ -156,6 +156,7 @@ bool DeformConvOp::RunOnDeviceWithOrderNCHW() { T* col_buffer_data = col_buffer->template mutable_data(); // Im2col, followed by gemm. for (const auto image_id : c10::irange(N)) { + (void)image_id; // CUDA-10.2 on Windows crashes when C10_UNUSED macro is used for (const auto group_id : c10::irange(group_)) { DeformableIm2col( Xdata + group_id * input_offset, @@ -343,6 +344,7 @@ bool DeformConvGradientOp::RunOnDeviceWithOrderNCHW() { } for (const auto image_id : c10::irange(N)) { + (void)image_id; // CUDA-10.2 on Windows crashes when C10_UNUSED macro is used for (const auto group_id : c10::irange(group_)) { math::Gemm( CblasTrans, diff --git a/caffe2/operators/dropout_op.cc b/caffe2/operators/dropout_op.cc index 6f37407bd40e..bbd1eb1c72c9 100644 --- a/caffe2/operators/dropout_op.cc +++ b/caffe2/operators/dropout_op.cc @@ -15,13 +15,12 @@ bool DropoutOp::RunOnDevice() { return true; } else { // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) - float scale = 1. / (1. - ratio_); + float scale = ratio_ >= 1.0 ? 0.0:1. / (1. - ratio_); // mask=true means keep, and mask=false means not keep, so we will // generate probability depending on 1-ratio. at::bernoulli_distribution dist(1. 
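The `BisectPercentileOp` rewrite above replaces `(lo + hi) >> 1` with `lo + (hi - lo) / 2`, which cannot overflow for large `int64_t` bounds, and returns directly from each interpolation branch. A compact Python re-statement of that inner loop; it assumes `raw`, `mapping`, `lower`, and `upper` are the per-feature slices the operator indexes via `index[]`, and that `val` lies strictly inside the raw range (the C++ handles the boundary cases before calling `binary_search`):

```python
def bisect_percentile(val, raw, mapping, lower, upper, eps=1e-10):
    """Find k with raw[k] <= val < raw[k+1], then interpolate (sketch of the C++ loop)."""
    lo, hi = 0, len(raw) - 1
    while lo < hi:
        mid = lo + (hi - lo) // 2          # overflow-safe midpoint
        if raw[mid] <= val < raw[mid + 1]:
            lo = mid
            break
        elif raw[mid] > val:
            hi = mid - 1
        else:
            lo = mid + 1
    k = lo
    if raw[k] == val:                      # exact match
        return mapping[k]
    w = (val - raw[k]) / (raw[k + 1] - raw[k] + eps)
    return (1 - w) * upper[k] + w * lower[k + 1]
```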
- ratio_); const float* Xdata = X.data(); float* Ydata = Y->template mutable_data(); - auto mask = Output(1, X.sizes(), at::dtype()); bool* mask_data = mask->template mutable_data(); auto* gen = context_.RandGenerator(); @@ -52,7 +51,7 @@ bool DropoutGradientOp::RunOnDevice() { const bool* mask_data = mask.data(); float* dXdata = dX->template mutable_data(); // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) - float scale = 1. / (1. - ratio_); + float scale = ratio_ >= 1.0 ? 0.0:1. / (1. - ratio_); for (int i = 0; i < dY.numel(); ++i) { // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) dXdata[i] = dYdata[i] * mask_data[i] * scale; diff --git a/caffe2/operators/dropout_op.h b/caffe2/operators/dropout_op.h index aff0528c7ffa..ae8f0ff1bba6 100644 --- a/caffe2/operators/dropout_op.h +++ b/caffe2/operators/dropout_op.h @@ -19,7 +19,6 @@ class DropoutOp final : public Operator { is_test_( this->template GetSingleArgument(OpSchema::Arg_IsTest, 0)) { CAFFE_ENFORCE_GE(ratio_, 0); - CAFFE_ENFORCE_LT(ratio_, 1); } bool RunOnDevice() override; @@ -41,7 +40,6 @@ class DropoutGradientOp final : public Operator { is_test_( this->template GetSingleArgument(OpSchema::Arg_IsTest, 0)) { CAFFE_ENFORCE_GE(ratio_, 0); - CAFFE_ENFORCE_LT(ratio_, 1); } bool RunOnDevice() override; diff --git a/caffe2/operators/elementwise_ops.cu b/caffe2/operators/elementwise_ops.cu index 932bd5dafda0..fcbe26f927ae 100644 --- a/caffe2/operators/elementwise_ops.cu +++ b/caffe2/operators/elementwise_ops.cu @@ -119,6 +119,9 @@ void device_reduce( int N, Tensor* buffer, CUDAContext* context) { + (void)N; // Suppress unused variable warning + (void)buffer; // Suppress unused variable warning + (void)context; // Suppress unused variable warning #if TORCH_HIP_VERSION >= 210 auto buffer_size = 1; diff --git a/caffe2/operators/gather_ranges_to_dense_op.h b/caffe2/operators/gather_ranges_to_dense_op.h index e2cdab373c97..ee11de3f972d 100644 --- a/caffe2/operators/gather_ranges_to_dense_op.h +++ b/caffe2/operators/gather_ranges_to_dense_op.h @@ -146,6 +146,7 @@ class GatherRangesToDenseOp final : public Operator { auto& key = Input(KEY); auto* key_data = key.template data(); vector> buffer; + buffer.reserve(rangeLength); for (const auto b_i : c10::irange(rangeLength)) { int64_t one_key_item = key_data[rangeStart + b_i]; auto* one_data_item = rawData + (rangeStart + b_i) * itemsize; diff --git a/caffe2/operators/generate_proposals_op_test.cc b/caffe2/operators/generate_proposals_op_test.cc index 598b8d185695..9692c2846e97 100644 --- a/caffe2/operators/generate_proposals_op_test.cc +++ b/caffe2/operators/generate_proposals_op_test.cc @@ -493,7 +493,7 @@ TEST(GenerateProposalsTest, TestRealDownSampledRotatedAngle0) { 1.53593004e-01f, -8.75087008e-02f, -4.92327996e-02f, -3.32239009e-02f}; // Add angle in bbox deltas - int num_boxes = scores.size(); + auto num_boxes = scores.size(); CHECK_EQ(bbx.size() / 4, num_boxes); vector bbx_with_angle(num_boxes * box_dim); // bbx (deltas) is in shape (A * 4, H, W). Insert angle delta @@ -666,7 +666,7 @@ TEST(GenerateProposalsTest, TestRealDownSampledRotated) { 1.53593004e-01f, -8.75087008e-02f, -4.92327996e-02f, -3.32239009e-02f}; // Add angle in bbox deltas - int num_boxes = scores.size(); + auto num_boxes = scores.size(); CHECK_EQ(bbx.size() / 4, num_boxes); vector bbx_with_angle(num_boxes * box_dim); // bbx (deltas) is in shape (A * 4, H, W). 
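With the `CAFFE_ENFORCE_LT(ratio_, 1)` checks dropped above, `ratio == 1` is now legal and the scale is clamped to zero instead of dividing by zero. A small numpy reference of the forward pass, mirroring the new C++ expression (and the `reference_dropout_ratio1` test added later in this patch); the helper name is illustrative:

```python
import numpy as np


def dropout_forward(x, ratio, rng=None):
    """Inverted-dropout reference mirroring the updated dropout_op.cc (sketch)."""
    rng = rng or np.random.default_rng()
    # mask=True means "keep"; the scale falls back to 0 when ratio >= 1, matching
    # `float scale = ratio_ >= 1.0 ? 0.0 : 1. / (1. - ratio_);`
    scale = 0.0 if ratio >= 1.0 else 1.0 / (1.0 - ratio)
    mask = rng.random(x.shape) < (1.0 - ratio)
    return x * mask * scale, mask
```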
Insert angle delta diff --git a/caffe2/operators/generate_proposals_op_util_boxes.h b/caffe2/operators/generate_proposals_op_util_boxes.h index 0a402cdb6a3c..1c38562dbae0 100644 --- a/caffe2/operators/generate_proposals_op_util_boxes.h +++ b/caffe2/operators/generate_proposals_op_util_boxes.h @@ -2,7 +2,6 @@ #define CAFFE2_OPERATORS_UTILS_BOXES_H_ #include "caffe2/utils/eigen_utils.h" -#include "caffe2/utils/math.h" #include diff --git a/caffe2/operators/generate_proposals_op_util_nms.h b/caffe2/operators/generate_proposals_op_util_nms.h index 09b10c8e192a..92d2c90a06c0 100644 --- a/caffe2/operators/generate_proposals_op_util_nms.h +++ b/caffe2/operators/generate_proposals_op_util_nms.h @@ -6,7 +6,6 @@ #include "caffe2/core/logging.h" #include "caffe2/core/macros.h" #include "caffe2/utils/eigen_utils.h" -#include "caffe2/utils/math.h" #include @@ -50,8 +49,7 @@ std::vector nms_cpu_upright( std::vector keep; while (order.size() > 0) { // exit if already enough proposals - // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - if (topN >= 0 && keep.size() >= topN) { + if (topN >= 0 && keep.size() >= static_cast(topN)) { break; } @@ -127,7 +125,7 @@ std::vector soft_nms_cpu_upright( EArrXi pending = AsEArrXt(indices); while (pending.size() > 0) { // Exit if already enough proposals - if (topN >= 0 && keep.size() >= topN) { + if (topN >= 0 && keep.size() >= static_cast(topN)) { break; } @@ -560,8 +558,7 @@ std::vector nms_cpu_rotated( std::vector keep; while (order.size() > 0) { // exit if already enough proposals - // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - if (topN >= 0 && keep.size() >= topN) { + if (topN >= 0 && keep.size() >= static_cast(topN)) { break; } @@ -626,7 +623,7 @@ std::vector soft_nms_cpu_rotated( EArrXi pending = AsEArrXt(indices); while (pending.size() > 0) { // Exit if already enough proposals - if (topN >= 0 && keep.size() >= topN) { + if (topN >= 0 && keep.size() >= static_cast(topN)) { break; } diff --git a/caffe2/operators/lpnorm_op.cc b/caffe2/operators/lpnorm_op.cc index cc43135d5c22..3ca2a1db09f4 100644 --- a/caffe2/operators/lpnorm_op.cc +++ b/caffe2/operators/lpnorm_op.cc @@ -13,7 +13,6 @@ bool LpNormOp::RunOnDevice() { auto* norm = Output(0, {1}, at::dtype()); const float* X_data = X.data(); const float size = average_ ? 
(float)X.numel() : 1.0f; - CAFFE_ENFORCE_GT(size, 0); if (p_ == 1) { *(norm->template mutable_data()) = (ConstEigenVectorMap(X_data, X.numel()).array()).abs().sum() / diff --git a/caffe2/operators/piecewise_linear_transform_op.h b/caffe2/operators/piecewise_linear_transform_op.h index 9dcf0021f1c2..8a88f8b834be 100644 --- a/caffe2/operators/piecewise_linear_transform_op.h +++ b/caffe2/operators/piecewise_linear_transform_op.h @@ -63,6 +63,7 @@ class PiecewiseLinearTransformOp final : public Operator { const int64_t num_group) { const T* start = bounds; for (const auto i : c10::irange(num_group)) { + (void)i; // CUDA-10.2 on Windows crashes when C10_UNUSED macro is used if (!std::is_sorted(start, start + num_bounds_per_group)) { return false; } diff --git a/caffe2/operators/quant_decode_op.h b/caffe2/operators/quant_decode_op.h index 1eeb4f2db8ad..5253d9975c39 100644 --- a/caffe2/operators/quant_decode_op.h +++ b/caffe2/operators/quant_decode_op.h @@ -36,7 +36,7 @@ void Decode( } int sz = output->numel(); - for (const auto i : c10::irange(sz)) { + for (C10_UNUSED const auto i : c10::irange(sz)) { DCHECK_LE(*code_ptr, cb_size); *out_ptr++ = cb_ptr[*code_ptr++]; } diff --git a/caffe2/operators/quantized/int8_fc_op.cc b/caffe2/operators/quantized/int8_fc_op.cc index 6f0b3aa8da74..21cd23bb399c 100644 --- a/caffe2/operators/quantized/int8_fc_op.cc +++ b/caffe2/operators/quantized/int8_fc_op.cc @@ -10,7 +10,7 @@ REGISTER_CPU_OPERATOR(Int8FC, int8::Int8FCOp); using namespace std::placeholders; OPERATOR_SCHEMA(Int8FC) - .NumInputs(3, 4) + .NumInputs(3, 5) .NumOutputs(1, 4) // NOLINTNEXTLINE(modernize-avoid-bind) .TensorInferenceFunction(std::bind(FCShapeInference, _1, _2, false)) @@ -50,6 +50,11 @@ will throw errors. "Qparam", "Optional Qparam blob that contains quant param computed on activation histogram data" "Will overwrite Y_scale and Y_zero_point argument if specified") + .Input( + 4, + "in_Qparam", + "Optional Qparam blob that contains quant param computed on activation histogram data" + "Will overwrite X_scale and X_zero_point argument if specified") .Output(0, "Y", "2D output tensor"); } // namespace caffe2 diff --git a/caffe2/operators/quantized/int8_roi_align_op.h b/caffe2/operators/quantized/int8_roi_align_op.h index 2a722d2dd8fa..360f4a62c089 100644 --- a/caffe2/operators/quantized/int8_roi_align_op.h +++ b/caffe2/operators/quantized/int8_roi_align_op.h @@ -229,8 +229,8 @@ void ROIAlignForward( for (const auto pw : c10::irange(pooled_width)) { vector acc_buffer(channels, 0); - for (const auto iy : c10::irange(roi_bin_grid_h)) { - for (const auto ix : c10::irange(roi_bin_grid_w)) { + for (C10_UNUSED const auto iy : c10::irange(roi_bin_grid_h)) { + for (C10_UNUSED const auto ix : c10::irange(roi_bin_grid_w)) { PreCalc pc = pre_calc[pre_calc_index]; const uint8_t* data_1 = offset_bottom_data + channels * pc.pos1; diff --git a/caffe2/operators/quantized/int8_test.cc b/caffe2/operators/quantized/int8_test.cc index b6d9719d5223..9b14d3eaec1d 100644 --- a/caffe2/operators/quantized/int8_test.cc +++ b/caffe2/operators/quantized/int8_test.cc @@ -341,8 +341,8 @@ TEST(Int8, SumRelu) { } void setq(int8::Int8TensorCPU* dst, const std::vector& vs) { - CHECK_EQ(vs.size(), dst->t.numel()); - for (auto i = 0; i < vs.size(); ++i) { + CHECK_EQ(vs.size(), static_cast(dst->t.numel())); + for (auto i = 0U; i < vs.size(); ++i) { uint8_t vq = std::max( std::numeric_limits::min(), std::min( @@ -354,8 +354,8 @@ void setq(int8::Int8TensorCPU* dst, const std::vector& vs) { } void biassetq(int8::Int8TensorCPU* 
dst, const std::vector& vs) { - CHECK_EQ(vs.size(), dst->t.numel()); - for (auto i = 0; i < vs.size(); ++i) { + CHECK_EQ(vs.size(), static_cast(dst->t.numel())); + for (auto i = 0U; i < vs.size(); ++i) { int32_t vq = std::max( std::numeric_limits::min(), std::min( diff --git a/caffe2/operators/text_file_reader_utils.h b/caffe2/operators/text_file_reader_utils.h index 01b4743a91c1..a4f2d6189860 100644 --- a/caffe2/operators/text_file_reader_utils.h +++ b/caffe2/operators/text_file_reader_utils.h @@ -56,7 +56,7 @@ struct TORCH_API CharRange { struct TORCH_API StringProvider { virtual void operator()(CharRange&) = 0; virtual void reset() = 0; - virtual ~StringProvider() {} + virtual ~StringProvider() = default; }; class TORCH_API BufferedTokenizer { @@ -99,7 +99,7 @@ class TORCH_API BufferedTokenizer { StringProvider* provider_; Tokenizer tokenizer_; TokenizedString tokenized_; - int tokenIndex_; + unsigned tokenIndex_; int numPasses_; int pass_{0}; }; diff --git a/caffe2/operators/variable_length_sequence_padding.cc b/caffe2/operators/variable_length_sequence_padding.cc index dbdb4ac87678..d6904523b7fc 100644 --- a/caffe2/operators/variable_length_sequence_padding.cc +++ b/caffe2/operators/variable_length_sequence_padding.cc @@ -19,7 +19,7 @@ N = maximum sequence length B = batch size M = hidden size -set each element of INPUT to zero if it is is past the end of the +set each element of INPUT to zero if it is past the end of the corresponding sequence (i.e. if LENS[j] > i for an index (i,j,k)). )DOC"); diff --git a/caffe2/opt/bound_shape_inference_test.cc b/caffe2/opt/bound_shape_inference_test.cc index 867142746d82..8224281124e1 100644 --- a/caffe2/opt/bound_shape_inference_test.cc +++ b/caffe2/opt/bound_shape_inference_test.cc @@ -45,7 +45,7 @@ void verifyShapeInfo( EXPECT_EQ(shape_info.getDimType(), t); const auto& shape = shape_info.shape; ASSERT_EQ(shape.dims_size(), dims.size()); - for (int i = 0; i < dims.size(); ++i) { + for (unsigned i = 0; i < dims.size(); ++i) { EXPECT_EQ(dims[i], shape.dims(i)); } EXPECT_EQ(shape.data_type(), dtype); diff --git a/caffe2/opt/glow_net_transform.cc b/caffe2/opt/glow_net_transform.cc index 5803b4efd492..a240750717cd 100644 --- a/caffe2/opt/glow_net_transform.cc +++ b/caffe2/opt/glow_net_transform.cc @@ -71,7 +71,7 @@ C10_DEFINE_bool( namespace caffe2 { namespace glow { -// The list in in the form of "0-3,5,6-7" which means, we will black list ops +// The list in the form of "0-3,5,6-7" which means, we will black list ops // with net positions in [0,1,2,3,5,6,7] std::unordered_set ParseNetPositionList(const std::string& str) { std::unordered_set net_position_list; diff --git a/caffe2/perfkernels/adagrad_avx2.cc b/caffe2/perfkernels/adagrad_avx2.cc index 0039afa942f1..08c9fd00d9a0 100644 --- a/caffe2/perfkernels/adagrad_avx2.cc +++ b/caffe2/perfkernels/adagrad_avx2.cc @@ -18,7 +18,7 @@ void adagrad_update__avx2_fma( float decay, float lr, float weight_decay = 0.f) { - constexpr size_t kSize = 8; + constexpr int kSize = 8; auto i = 0; for (; i + kSize <= N; i += kSize) { __m256 gi = _mm256_loadu_ps(g + i); diff --git a/caffe2/proto/__init__.py b/caffe2/proto/__init__.py index a753f26c5380..ce54a1aee574 100644 --- a/caffe2/proto/__init__.py +++ b/caffe2/proto/__init__.py @@ -1,3 +1,6 @@ +import warnings + + # NOTE: we have to import python protobuf here **before** we load cpp extension. # Otherwise it breaks under certain build conditions if cpp implementation of # protobuf is used. 
Presumably there's some registry in protobuf library and @@ -8,7 +11,13 @@ # expected caffe2.NetDef got caffe2.NetDef." # # This has to be done for all python targets, so listing them here -from caffe2.proto import caffe2_pb2, metanet_pb2, torch_pb2 +try: + from caffe2.proto import caffe2_pb2, metanet_pb2, torch_pb2 +except ImportError: + warnings.warn('Caffe2 support is not enabled in this PyTorch build. ' + 'Please enable Caffe2 by building PyTorch from source with `BUILD_CAFFE2=1` flag.') + raise + try: from caffe2.caffe2.fb.session.proto import session_pb2 except ImportError: diff --git a/caffe2/proto/caffe2.proto b/caffe2/proto/caffe2.proto index 33b4cbd4b9a9..861a6c5d4374 100644 --- a/caffe2/proto/caffe2.proto +++ b/caffe2/proto/caffe2.proto @@ -220,7 +220,7 @@ enum DeviceTypeProto { PROTO_FPGA = 7; // FPGA PROTO_ORT = 8; // ONNX Runtime PROTO_XLA = 9; // XLA / TPU - PROTO_MLC = 10; // ML Compute + PROTO_MPS = 10; // MPS // Change the following number if you add more devices in the code. PROTO_COMPILE_TIME_MAX_DEVICE_TYPES = 11; } diff --git a/caffe2/proto/caffe2_pb2.pyi b/caffe2/proto/caffe2_pb2.pyi index f7f4430d7b76..ed1f4249a43e 100644 --- a/caffe2/proto/caffe2_pb2.pyi +++ b/caffe2/proto/caffe2_pb2.pyi @@ -25,7 +25,7 @@ class _DeviceTypeProto(google.protobuf.internal.enum_type_wrapper._EnumTypeWrapp PROTO_FPGA = DeviceTypeProto.V(7) PROTO_ORT = DeviceTypeProto.V(8) PROTO_XLA = DeviceTypeProto.V(9) - PROTO_MLC = DeviceTypeProto.V(10) + PROTO_MPS = DeviceTypeProto.V(10) PROTO_COMPILE_TIME_MAX_DEVICE_TYPES = DeviceTypeProto.V(11) class DeviceTypeProto(metaclass=_DeviceTypeProto): V = typing.NewType('V', int) @@ -39,7 +39,7 @@ PROTO_HIP = DeviceTypeProto.V(6) PROTO_FPGA = DeviceTypeProto.V(7) PROTO_ORT = DeviceTypeProto.V(8) PROTO_XLA = DeviceTypeProto.V(9) -PROTO_MLC = DeviceTypeProto.V(10) +PROTO_MPS = DeviceTypeProto.V(10) PROTO_COMPILE_TIME_MAX_DEVICE_TYPES = DeviceTypeProto.V(11) class TensorProto(google.protobuf.message.Message): diff --git a/caffe2/python/__init__.py b/caffe2/python/__init__.py index 6617a62c5a51..83e393e67731 100644 --- a/caffe2/python/__init__.py +++ b/caffe2/python/__init__.py @@ -1,7 +1,15 @@ - -from caffe2.proto import caffe2_pb2 import os import sys +import warnings + + +try: + from caffe2.proto import caffe2_pb2 +except ImportError: + warnings.warn('Caffe2 support is not enabled in this PyTorch build. ' + 'Please enable Caffe2 by building PyTorch from source with `BUILD_CAFFE2=1` flag.') + raise + # TODO: refactor & remove the following alias caffe2_pb2.CPU = caffe2_pb2.PROTO_CPU caffe2_pb2.CUDA = caffe2_pb2.PROTO_CUDA diff --git a/caffe2/python/benchmark_generator.py b/caffe2/python/benchmark_generator.py index 5342cb314a5b..c557ebfc9536 100644 --- a/caffe2/python/benchmark_generator.py +++ b/caffe2/python/benchmark_generator.py @@ -106,7 +106,7 @@ def make_blob_on_context(blob_name, blob_data, context): if __name__ == "__main__": parser = argparse.ArgumentParser( - description="Utilitity to generate Caffe2 benchmark models.") + description="Utility to generate Caffe2 benchmark models.") parser.add_argument("operator", help="Caffe2 operator to benchmark.") parser.add_argument("-b", "--blob", help="Instantiate a blob --blob name=dim1,dim2,dim3", diff --git a/caffe2/python/checkpoint.py b/caffe2/python/checkpoint.py index 872a66c7bd1f..c379211a509d 100644 --- a/caffe2/python/checkpoint.py +++ b/caffe2/python/checkpoint.py @@ -133,7 +133,7 @@ def db_name(epoch, node_name, db_prefix, path_prefix=None): node_name: A string. The name of the node. 
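Both `caffe2/proto/__init__.py` and `caffe2/python/__init__.py` above now warn and then re-raise the `ImportError` when the compiled protobuf modules are missing. Downstream scripts that want to degrade gracefully can catch that; a minimal sketch (the fallback behaviour shown is illustrative, not part of this patch):

```python
try:
    from caffe2.python import core, workspace
except ImportError:
    # The warning from caffe2's __init__ has already been emitted at this point;
    # fall back to a torch-only code path instead of crashing.
    core = None
    workspace = None

CAFFE2_AVAILABLE = core is not None
```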
db_prefix: A string. The prefix used to construct full db name. path_prefix: A string. Optional param used to construct db name or path - where checkpoint files are are stored. + where checkpoint files are stored. Returns: db_name: A string. The absolute path of full_db_name where checkpoint files are saved diff --git a/caffe2/python/core_gradients_test.py b/caffe2/python/core_gradients_test.py index 293eccca0dd4..b8433c644155 100644 --- a/caffe2/python/core_gradients_test.py +++ b/caffe2/python/core_gradients_test.py @@ -269,7 +269,7 @@ def testUseInputButInputHasBeenChanged(self): in -> out, with UseInput in -> in - Since we overwrite in in op#1, but in will be needed by the gradient + Since we overwrite in op#1, but in will be needed by the gradient calculation of op#0, the gradient registry should raise an error. """ operators = [ diff --git a/caffe2/python/core_test.py b/caffe2/python/core_test.py index 0543233b7c4f..2f143fbae07a 100644 --- a/caffe2/python/core_test.py +++ b/caffe2/python/core_test.py @@ -308,7 +308,7 @@ def testCreate(self): self.assertTrue(op.HasField('device_option')) self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType) self.assertEqual(op.device_option.device_id, 1) - self.assertTrue(len(op.arg), 3) + self.assertEqual(len(op.arg), 3) # can't guarantee ordering of kwargs, so generate a set of args # to test with diff --git a/caffe2/python/gru_cell.py b/caffe2/python/gru_cell.py index 049a9152878a..d0474ed70022 100644 --- a/caffe2/python/gru_cell.py +++ b/caffe2/python/gru_cell.py @@ -31,7 +31,7 @@ def __init__( # (reset gate -> output_gate) # So, much of the logic to calculate the reset gate output and modified # output gate input is set here, in the graph definition. - # The remaining logic lives in in gru_unit_op.{h,cc}. + # The remaining logic lives in gru_unit_op.{h,cc}. 
def _apply( self, model, diff --git a/caffe2/python/memonger.py b/caffe2/python/memonger.py index 6225781bc429..178ebd8cd302 100644 --- a/caffe2/python/memonger.py +++ b/caffe2/python/memonger.py @@ -798,15 +798,29 @@ def canonical_name(blob): op.output[i] = canonical_name(output) - def apply_recurrent_blob_assignments(op, blob_assignments, canonical_name): log.debug("Applying assignments to recurrent op: {}".format(op.type)) + + # Apply on alias_dst + alias_dst_args = [a for a in op.arg if a.name.endswith("alias_dst")] + for alias_dst in alias_dst_args: + for i, blob in enumerate(alias_dst.strings): + alias_dst.strings[i] = canonical_name(blob.decode()).encode() + + # Apply on link_external + link_external_args = [a for a in op.arg if a.name.endswith("link_external")] + for link_external in link_external_args: + for i, blob in enumerate(link_external.strings): + link_external.strings[i] = canonical_name(blob.decode()).encode() + + # Recurse into step nets step_args = [a for a in op.arg if a.name.endswith("step_net")] for step_arg in step_args: apply_assignments(step_arg.n, blob_assignments) for i, einp in enumerate(step_arg.n.external_input): if einp in blob_assignments: step_arg.n.external_input[i] = canonical_name(einp) + # Store renamings for blob, renamed in viewitems(blob_assignments): if blob in list(op.input) + list(op.output): diff --git a/caffe2/python/onnx/backend.py b/caffe2/python/onnx/backend.py index d92239f5c3c1..d523eb8204ab 100644 --- a/caffe2/python/onnx/backend.py +++ b/caffe2/python/onnx/backend.py @@ -651,7 +651,13 @@ def optimize_onnx(input, init=False, predict=False): passes.append('split_init') if predict: passes.append('split_predict') - out = onnx.optimizer.optimize(input, passes) + try: + out = onnx.optimizer.optimize(input, passes) + except AttributeError: + warnings.warn("OptimizerWarning: optimizer module not found in ONNX version {}".format(onnx.__version__)) + # ONNX does no ship onnx.optimizer since version 1.9+ + import onnxoptimizer + out = onnxoptimizer.optimize(input, passes) return out @classmethod @@ -881,8 +887,9 @@ def _onnx_model_to_caffe2_net(cls, onnx_model, device, opset_version, include_in try: init_model = cls.optimize_onnx(onnx_model, init=True) pred_model = cls.optimize_onnx(onnx_model, predict=True) - except AttributeError: - warnings.warn("OptimizerWarning: optimizer module not found in ONNX version {}".format(onnx.__version__)) + except ModuleNotFoundError: + warnings.warn("OptimizerWarning: onnxoptimizer module not installed. 
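ONNX no longer ships `onnx.optimizer` as of 1.9; it lives in the standalone `onnxoptimizer` package, which is exactly what the `optimize_onnx` change above falls back to. The same pattern in isolation, assuming the standalone package is installed:

```python
import warnings

import onnx


def optimize(model, passes):
    """Run graph passes via onnx.optimizer when present, else onnxoptimizer (sketch)."""
    try:
        return onnx.optimizer.optimize(model, passes)
    except AttributeError:
        warnings.warn(
            "optimizer module not found in ONNX %s; using the standalone "
            "onnxoptimizer package" % onnx.__version__
        )
        import onnxoptimizer
        return onnxoptimizer.optimize(model, passes)
```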
" + "init_model and pred_model models will not be splitted, which can cause a runtime error") init_model = onnx_model pred_model = onnx_model diff --git a/caffe2/python/onnx/tests/onnx_backend_test.py b/caffe2/python/onnx/tests/onnx_backend_test.py index ebb6018ca76e..42262d269695 100644 --- a/caffe2/python/onnx/tests/onnx_backend_test.py +++ b/caffe2/python/onnx/tests/onnx_backend_test.py @@ -160,11 +160,19 @@ ')') # Unsupported ops in opset 15 -backend_test.exclude('(test_bernoulli_*' - '|test_castlike_*' - '|test_optional_*' - '|test_shape_end_*' - '|test_shape_start_*' +backend_test.exclude('(test_bernoulli_.*' + '|test_castlike_.*' + '|test_optional_.*' + '|test_shape_end_.*' + '|test_shape_start_.*' + '|test_identity_opt_*' + '|test_loop16_seq_none_*' + '|test_if_opt_*' + ')') + +# Unsupported ops in opset 16 +backend_test.exclude('(test_gridsample_.*' + '|test_spacetodepth_.*' ')') # Skip vgg to speed up CI diff --git a/caffe2/python/operator_test/bisect_percentile_op_test.py b/caffe2/python/operator_test/bisect_percentile_op_test.py index 147a41282505..2d22064d5712 100644 --- a/caffe2/python/operator_test/bisect_percentile_op_test.py +++ b/caffe2/python/operator_test/bisect_percentile_op_test.py @@ -1,7 +1,4 @@ - - - - +from typing import List import hypothesis.strategies as st @@ -115,7 +112,7 @@ def test_bisect_percentil_op_simple(self): @given( N=st.integers(min_value=20, max_value=100), - lengths=st.lists( + lengths_in=st.lists( elements=st.integers(min_value=2, max_value=10), min_size=2, max_size=5, @@ -126,9 +123,9 @@ def test_bisect_percentil_op_simple(self): **hu.gcs_cpu_only ) def test_bisect_percentil_op_large( - self, N, lengths, max_value, discrete, p, gc, dc + self, N: int, lengths_in: List[int], max_value: int, discrete: bool, p: float, gc, dc ): - lengths = np.array(lengths, dtype=np.int32) + lengths = np.array(lengths_in, dtype=np.int32) D = len(lengths) if discrete: diff --git a/caffe2/python/operator_test/dropout_op_test.py b/caffe2/python/operator_test/dropout_op_test.py index d3a5c831d875..de96554bc5cb 100644 --- a/caffe2/python/operator_test/dropout_op_test.py +++ b/caffe2/python/operator_test/dropout_op_test.py @@ -19,7 +19,7 @@ class TestDropout(serial.SerializedTestCase): in_place=st.booleans(), ratio=st.floats(0, 0.999), engine=st.sampled_from(["", "CUDNN"]), - **hu.gcs) + **hu.gcs_cpu_only) def test_dropout_is_test(self, X, in_place, ratio, engine, gc, dc): """Test with is_test=True for a deterministic reference impl.""" # TODO(lukeyeager): enable this path when the GPU path is fixed @@ -47,7 +47,7 @@ def reference_dropout_test(x): in_place=st.booleans(), output_mask=st.booleans(), engine=st.sampled_from(["", "CUDNN"]), - **hu.gcs) + **hu.gcs_cpu_only) @settings(deadline=10000) def test_dropout_ratio0(self, X, in_place, output_mask, engine, gc, dc): """Test with ratio=0 for a deterministic reference impl.""" @@ -74,3 +74,35 @@ def reference_dropout_ratio0(x): gc, op, [X], reference_dropout_ratio0, # Don't check the mask with cuDNN because it's packed data outputs_to_check=None if engine != 'CUDNN' else [0]) + + + @given(X=hu.tensor(), + in_place=st.booleans(), + output_mask=st.booleans(), + engine=st.sampled_from(["", "CUDNN"]), + **hu.gcs_cpu_only) + @settings(deadline=10000) + def test_dropout_ratio1(self, X, in_place, output_mask, engine, gc, dc): + """Test with ratio=0 for a deterministic reference impl.""" + if in_place: + # Skip if trying in-place on GPU + assume(gc.device_type not in {caffe2_pb2.CUDA, caffe2_pb2.HIP}) + # If in-place on CPU, don't 
compare with GPU + dc = dc[:1] + is_test = not output_mask + op = core.CreateOperator("Dropout", ["X"], + ["X" if in_place else "Y"] + + (["mask"] if output_mask else []), + ratio=1.0, engine=engine, + is_test=is_test) + + self.assertDeviceChecks(dc, op, [X], [0]) + if not is_test: + self.assertGradientChecks(gc, op, [X], 0, [0]) + + def reference_dropout_ratio1(x): + return (x,) if is_test else (np.zeros(x.shape, dtype=np.float), np.zeros(x.shape, dtype=np.bool)) + self.assertReferenceChecks( + gc, op, [X], reference_dropout_ratio1, + # Don't check the mask with cuDNN because it's packed data + outputs_to_check=None if engine != 'CUDNN' else [0]) diff --git a/caffe2/python/operator_test/elementwise_op_broadcast_test.py b/caffe2/python/operator_test/elementwise_op_broadcast_test.py index 6e5d4e7efee8..2d8222b59c9f 100644 --- a/caffe2/python/operator_test/elementwise_op_broadcast_test.py +++ b/caffe2/python/operator_test/elementwise_op_broadcast_test.py @@ -64,7 +64,7 @@ def __test_binary_op( caffe2_op: A string. Name of the caffe operator to test. op_function: an actual python operator (e.g. operator.add) path_prefix: A string. Optional param used to construct db name or path - where checkpoint files are are stored. + where checkpoint files are stored. """ for X, Y, op_args, X_out, Y_out in self.__generate_test_cases(allow_broadcast_fastpath): diff --git a/caffe2/python/operator_test/lpnorm_op_test.py b/caffe2/python/operator_test/lpnorm_op_test.py index e7ab634d0e7c..2899ba929470 100644 --- a/caffe2/python/operator_test/lpnorm_op_test.py +++ b/caffe2/python/operator_test/lpnorm_op_test.py @@ -11,13 +11,7 @@ class LpnormTest(hu.HypothesisTestCase): - @given(inputs=hu.tensors(n=1, - min_dim=1, - max_dim=3, - dtype=np.float32), - **hu.gcs) - @settings(deadline=10000) - def test_Lp_Norm(self, inputs, gc, dc): + def _test_Lp_Norm(self, inputs, gc, dc): X = inputs[0] # avoid kinks by moving away from 0 X += 0.02 * np.sign(X) @@ -74,6 +68,21 @@ def test_Lp_Norm(self, inputs, gc, dc): atol=1e-4 ) + @given(inputs=hu.tensors(n=1, + min_dim=1, + max_dim=3, + dtype=np.float32), + **hu.gcs) + @settings(deadline=10000) + def test_Lp_Norm(self, inputs, gc, dc): + self._test_Lp_Norm(inputs, gc, dc) + + def test_Lp_Norm_empty(self): + self._test_Lp_Norm([np.array([], dtype=np.float32)], hu.cpu_do, [hu.cpu_do]) + self.assertEqual(self.ws.blobs["l1_norm"].fetch()[0], 0.0) + self.assertEqual(self.ws.blobs["l2_norm"].fetch()[0], 0.0) + self.assertTrue(np.isnan(self.ws.blobs["l2_averaged_norm"].fetch()[0])) + @given(x=hu.tensor( min_dim=1, max_dim=10, dtype=np.float32, elements=st.integers(min_value=-100, max_value=100)), diff --git a/caffe2/python/operator_test/piecewise_linear_transform_test.py b/caffe2/python/operator_test/piecewise_linear_transform_test.py index d7c4e0df4416..0c260d944d81 100644 --- a/caffe2/python/operator_test/piecewise_linear_transform_test.py +++ b/caffe2/python/operator_test/piecewise_linear_transform_test.py @@ -32,7 +32,7 @@ def transform(self, x, bounds, slopes, intercepts): y = slopes[index] * x_ + intercepts[index] return y - @given(n=st.integers(1, 100), **hu.gcs) + @given(n=st.integers(1, 100), **hu.gcs_cpu_only) @settings(deadline=10000) def test_multi_predictions_params_from_arg(self, n, gc, dc): slopes = np.random.uniform(-1, 1, (2, n)).astype(np.float32) @@ -60,7 +60,7 @@ def piecewise(x, *args, **kw): self.assertReferenceChecks(gc, op, [X], piecewise) self.assertDeviceChecks(dc, op, [X], [0]) - @given(n=st.integers(1, 100), **hu.gcs) + @given(n=st.integers(1, 100), 
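The new `test_Lp_Norm_empty` above pins down what `LpNorm` does once the `CAFFE_ENFORCE_GT(size, 0)` check is gone: the L1 and L2 norms of an empty tensor are 0, while the averaged norm divides by `numel() == 0` and comes out NaN. The same arithmetic in numpy (the op's own L2 scaling is left aside; the point is the 0-versus-NaN behaviour):

```python
import numpy as np

x = np.array([], dtype=np.float32)
print(np.abs(x).sum())        # 0.0  -> l1_norm
print((x ** 2).sum())         # 0.0  -> l2_norm (up to the op's own scaling)
with np.errstate(invalid="ignore", divide="ignore"):
    print((x ** 2).sum() / x.size)   # nan -> l2_averaged_norm divides by numel() == 0
```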
**hu.gcs_cpu_only) @settings(deadline=10000) def test_binary_predictions_params_from_arg(self, n, gc, dc): slopes = np.random.uniform(-1, 1, size=n).astype(np.float32) @@ -87,7 +87,7 @@ def piecewise(x): self.assertReferenceChecks(gc, op, [X], piecewise) self.assertDeviceChecks(dc, op, [X], [0]) - @given(n=st.integers(1, 100), **hu.gcs) + @given(n=st.integers(1, 100), **hu.gcs_cpu_only) @settings(deadline=10000) def test_multi_predictions_params_from_input(self, n, gc, dc): slopes = np.random.uniform(-1, 1, (2, n)).astype(np.float32) @@ -115,7 +115,7 @@ def piecewise(x, bounds, slopes, intercepts): gc, op, [X, bounds, slopes, intercepts], piecewise) self.assertDeviceChecks(dc, op, [X, bounds, slopes, intercepts], [0]) - @given(n=st.integers(1, 100), **hu.gcs) + @given(n=st.integers(1, 100), **hu.gcs_cpu_only) @settings(deadline=10000) def test_binary_predictions_params_from_input(self, n, gc, dc): slopes = np.random.uniform(-1, 1, size=n).astype(np.float32) @@ -141,7 +141,7 @@ def piecewise(x, bounds, slopes, intercepts): gc, op, [X, bounds, slopes, intercepts], piecewise) self.assertDeviceChecks(dc, op, [X, bounds, slopes, intercepts], [0]) - @given(n=st.integers(1, 100), **hu.gcs) + @given(n=st.integers(1, 100), **hu.gcs_cpu_only) @settings(deadline=10000) def test_1D_predictions_params_from_input(self, n, gc, dc): slopes = np.random.uniform(-1, 1, size=n).astype(np.float32) diff --git a/caffe2/python/operator_test/sequence_ops_test.py b/caffe2/python/operator_test/sequence_ops_test.py index 6fbc445a7769..cb07a96fa0f7 100644 --- a/caffe2/python/operator_test/sequence_ops_test.py +++ b/caffe2/python/operator_test/sequence_ops_test.py @@ -385,7 +385,7 @@ def test_remove_data_blocks(self, data, indices, gc, dc): ["shrunk_data"]) def op_ref(data, indices): - unique_indices = np.unique(indices) + unique_indices = np.unique(indices) if len(indices)>0 else np.array([],dtype=np.int64) sorted_indices = np.sort(unique_indices) shrunk_data = np.delete(data, sorted_indices, axis=0) return (shrunk_data,) diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc index ad04cab82d5a..ccaa0afb6ac9 100644 --- a/caffe2/python/pybind_state.cc +++ b/caffe2/python/pybind_state.cc @@ -300,7 +300,7 @@ class GetPythonGradient : public GradientMakerBase { } if (gradOutputIndices.size() > 0) { // NOLINTNEXTLINE(modernize-loop-convert) - for (int i = 0; i < gradOutputIndices.size(); ++i) { + for (unsigned i = 0; i < gradOutputIndices.size(); ++i) { int GO_i = gradOutputIndices[i]; gradientInputs.push_back(GO(GO_i)); } @@ -312,7 +312,7 @@ class GetPythonGradient : public GradientMakerBase { std::vector gradientOutputs; if (gradInputIndices.size() > 0) { // NOLINTNEXTLINE(modernize-loop-convert) - for (int i = 0; i < gradInputIndices.size(); ++i) { + for (unsigned i = 0; i < gradInputIndices.size(); ++i) { int GI_i = gradInputIndices[i]; gradientOutputs.push_back(GI(GI_i)); } @@ -877,7 +877,7 @@ void addObjectMethods(py::module& m) { std::vector tensors_data; #ifdef USE_NUMPY // NOLINTNEXTLINE(modernize-loop-convert) - for (auto i = 0; i < inputs.size(); ++i) { + for (auto i = 0U; i < inputs.size(); ++i) { auto input = inputs[i]; CAFFE_ENFORCE( PyArray_Check(input.ptr()), @@ -988,7 +988,7 @@ void addObjectMethods(py::module& m) { std::vector tensors_data; #ifdef USE_NUMPY // NOLINTNEXTLINE(modernize-loop-convert) - for (auto i = 0; i < inputs.size(); ++i) { + for (auto i = 0U; i < inputs.size(); ++i) { auto input = inputs[i]; CAFFE_ENFORCE( PyArray_Check(input.ptr()), @@ -1201,7 +1201,7 @@ void 
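The `remove_data_blocks` reference fix above special-cases empty `indices` because `np.unique` of an empty Python list yields a float64 array, which can trip up `np.delete` when used as indices; hence the explicit empty int64 array. Quick check:

```python
import numpy as np

print(np.unique([]).dtype)                       # float64 -- not an integer index dtype
print(np.unique(np.array([], dtype=np.int64)))   # empty int64 array, like the guarded branch returns
```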
addGlobalMethods(py::module& m) { }); m.def("nearby_opnames", [](const std::string& name) { std::vector alternatives; - int editTolerance = 3; + unsigned editTolerance = 3; // NOLINTNEXTLINE(performance-for-range-copy) for (auto it : caffe2::CPUOperatorRegistry()->Keys()) { if (editDistance(it, name, editTolerance) < editTolerance + 1) { diff --git a/caffe2/python/pybind_state_ideep.cc b/caffe2/python/pybind_state_ideep.cc index 7fecf195f937..f93524b2f9d3 100644 --- a/caffe2/python/pybind_state_ideep.cc +++ b/caffe2/python/pybind_state_ideep.cc @@ -65,10 +65,19 @@ class IDeepFetcher : public BlobFetcherBase { numpy_type != -1, "Unsupported ideep memory data type? This usually should not happen " "since ideep memory usually only do float and double."); - itensor::dims dims = atensor.get_public_format_dims(); + itensor::dims dims; + bool need_reorder = atensor.need_reorder(); + if (atensor.get_data_type() == idtype::f32 && !atensor.has_scale()) { + // For FP32 path, only support NCHW format input, so if atensor + // has NHWC format, we need reorder it to NCHW format. + dims = atensor.get_dims(); + need_reorder = need_reorder || atensor.get_desc().is_nhwc(); + } else { + dims = atensor.get_public_format_dims(); + } std::vector npy_dims(dims.begin(), dims.end()); - result.copied = force_copy || atensor.need_reorder(); + result.copied = force_copy || need_reorder; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) void* outPtr; if (result.copied) { @@ -87,7 +96,12 @@ class IDeepFetcher : public BlobFetcherBase { } if (result.copied) { - atensor.to_public(outPtr); + if (atensor.get_data_type() == idtype::f32 && !atensor.has_scale()) { + itensor temp_ten(atensor.get_desc().to_default_format(), outPtr); + atensor.reorder_to(temp_ten); + } else { + atensor.to_public(outPtr); + } } return result; diff --git a/caffe2/python/workspace_test.py b/caffe2/python/workspace_test.py index 1bf7b607e1b7..2e2d284f92e4 100644 --- a/caffe2/python/workspace_test.py +++ b/caffe2/python/workspace_test.py @@ -1,13 +1,16 @@ +import errno import os import shutil import tempfile import unittest from collections import namedtuple +from typing import List import caffe2.python.hypothesis_test_util as htu import hypothesis.strategies as st import numpy as np import torch +from torch import Tensor from caffe2.proto import caffe2_pb2 from caffe2.python import core, test_util, workspace, model_helper, brew from hypothesis import given, settings @@ -783,8 +786,7 @@ def multi_input(self, x: torch.Tensor, y: torch.Tensor, z: int = 2) -> torch.Ten return x + y + z @torch.jit.script_method - def multi_input_tensor_list(self, tensor_list): # pyre-ignore: PT type annotations - # type: (List[Tensor]) -> Tensor + def multi_input_tensor_list(self, tensor_list: List[Tensor]) -> Tensor: return tensor_list[0] + tensor_list[1] + tensor_list[2] @torch.jit.script_method diff --git a/caffe2/quantization/server/concat_dnnlowp_op.h b/caffe2/quantization/server/concat_dnnlowp_op.h index 9f5f5a09de4a..5a4fcead155e 100644 --- a/caffe2/quantization/server/concat_dnnlowp_op.h +++ b/caffe2/quantization/server/concat_dnnlowp_op.h @@ -1,7 +1,7 @@ #pragma once #include "caffe2/operators/concat_split_op.h" -#include "dnnlowp_op.h" +#include "caffe2/quantization/server/dnnlowp_op.h" namespace caffe2 { diff --git a/caffe2/quantization/server/conv_dnnlowp_op.cc b/caffe2/quantization/server/conv_dnnlowp_op.cc index 15b71ceb1f22..aa5a39ccdac3 100644 --- a/caffe2/quantization/server/conv_dnnlowp_op.cc +++ b/caffe2/quantization/server/conv_dnnlowp_op.cc @@ 
-354,8 +354,6 @@ void ConvDNNLowPOp::QuantizeBias_() { this->template Input(FILTER); column_offset_ptr = packed_filter.column_offsets.get(); } else { - vector temp_qparams; - temp_qparams.push_back(in_qparams_[1]); column_offset_temp.resize(M); ComputeColumnOffsets( KernelDim_(), @@ -367,7 +365,7 @@ void ConvDNNLowPOp::QuantizeBias_() { } for (int i = 0; i < M; ++i) { (*b_quantized_)[i] -= - in_qparams_[0].zero_point * (*column_offset_ptr)[i]; + in_qparams_[INPUT].zero_point * (*column_offset_ptr)[i]; } } } @@ -387,8 +385,6 @@ void ConvDNNLowPOp::QuantizeBias_() { this->template Input(FILTER); column_offset_ptr = packed_filter.column_offsets.get(); } else { - vector temp_qparams; - temp_qparams.push_back(in_qparams_[1]); column_offset_temp.resize(M); ComputeColumnOffsets( KernelDim_(), @@ -399,7 +395,7 @@ void ConvDNNLowPOp::QuantizeBias_() { column_offset_ptr = &column_offset_temp; } for (int i = 0; i < M; ++i) { - (*b_quantized_)[i] -= in_qparams_[0].zero_point * (*column_offset_ptr)[i]; + (*b_quantized_)[i] -= in_qparams_[INPUT].zero_point * (*column_offset_ptr)[i]; } } } diff --git a/caffe2/quantization/server/dnnlowp_test_utils.py b/caffe2/quantization/server/dnnlowp_test_utils.py index 0d56ea6ac127..eb006ffe59b0 100644 --- a/caffe2/quantization/server/dnnlowp_test_utils.py +++ b/caffe2/quantization/server/dnnlowp_test_utils.py @@ -385,6 +385,8 @@ def run_conv_or_fc( outputs, scale=None, zero_point=None, + x_scale=None, + x_zero_point=None, ): if order: # Conv @@ -407,6 +409,11 @@ def run_conv_or_fc( dnnlowp_pybind11.CreateInt8QuantParamsBlob( "quant_param", float(scale), int(zero_point) ) + if x_scale is not None and x_zero_point is not None: + with workspace.WorkspaceGuard(test_case.ws): + dnnlowp_pybind11.CreateInt8QuantParamsBlob( + "X_quant_param", float(x_scale), int(x_zero_point) + ) if init_net: test_case.ws.run(init_net) @@ -427,6 +434,10 @@ def run_conv_or_fc( dnnlowp_pybind11.CreateInt8QuantParamsBlob( "quant_param", float(scale), int(zero_point) ) + if x_scale is not None and x_zero_point is not None: + dnnlowp_pybind11.CreateInt8QuantParamsBlob( + "X_quant_param", float(x_scale), int(x_zero_point) + ) if init_net: workspace.RunNetOnce(init_net) diff --git a/caffe2/quantization/server/fully_connected_dnnlowp_acc16_op.h b/caffe2/quantization/server/fully_connected_dnnlowp_acc16_op.h index 53a140b30bbf..b8f7538d95f0 100644 --- a/caffe2/quantization/server/fully_connected_dnnlowp_acc16_op.h +++ b/caffe2/quantization/server/fully_connected_dnnlowp_acc16_op.h @@ -1,6 +1,6 @@ #pragma once -#include "fully_connected_dnnlowp_op.h" +#include "caffe2/quantization/server/fully_connected_dnnlowp_op.h" namespace caffe2 { diff --git a/caffe2/quantization/server/fully_connected_dnnlowp_op.cc b/caffe2/quantization/server/fully_connected_dnnlowp_op.cc index 4f83940f6d74..439c738c007b 100644 --- a/caffe2/quantization/server/fully_connected_dnnlowp_op.cc +++ b/caffe2/quantization/server/fully_connected_dnnlowp_op.cc @@ -34,6 +34,8 @@ FullyConnectedDNNLowPOp::FullyConnectedDNNLowPOp( : BaseType(operator_def, ws), axis_(this->template GetSingleArgument("axis", 1)), axis_w_(this->template GetSingleArgument("axis_w", 1)), + X_scale_(this->template GetSingleArgument("X_scale", -1.0)), // for fused static int8 not valid if less than 0 + X_zero_point_(this->template GetSingleArgument("X_zero_point", 0)), quantize_channelwise_(this->template GetSingleArgument( "quantize_channelwise", false)), @@ -109,9 +111,22 @@ bool FullyConnectedDNNLowPOp::RunOnDevice() { t_very_begin = t_begin; } #endif + float 
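The `QuantizeBias_` cleanup above also switches the zero-point correction to `in_qparams_[INPUT]`. The correction itself comes from a simple identity: with an affine-quantized activation and (as assumed in this sketch) symmetric weights, the activation zero point times the per-row column offset of the quantized weights can be folded into the bias ahead of time. A numpy check of that identity; all names, shapes, and scales below are made up for illustration:

```python
import numpy as np

rng = np.random.default_rng(0)
K, M = 16, 4
w_q = rng.integers(-127, 128, size=(M, K)).astype(np.int32)   # symmetric weights, z_w = 0
x_q = rng.integers(0, 256, size=K).astype(np.int32)           # uint8-style activation
s_w, s_x, z_x = 0.02, 0.1, 128

# Real-valued product computed from dequantized operands...
real = (s_w * w_q) @ (s_x * (x_q - z_x))
# ...equals the integer accumulation with the zero-point term folded out, which is
# exactly the `z_x * column_offset[i]` amount QuantizeBias_ pre-subtracts from the bias.
column_offset = w_q.sum(axis=1)
acc = w_q @ x_q - z_x * column_offset
np.testing.assert_allclose(real, s_w * s_x * acc, rtol=1e-6)
```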
X_scale = X_scale_; + int32_t X_zero_point = X_zero_point_; + if (InputSize() == 5) { + // float in float out, two possibilities + // if there are only 3 input (no qparams): dyanmic + // if there are 5 input (+ input qparams): fused int8 static + // output qparams need to be added anyway even it's dummy when dequantize_output=1 + const auto* input_qparam_blob = + this->template Input>(4).get(); + // input_params overwrite input arguments + X_scale = input_qparam_blob->qparam.scale; + X_zero_point = input_qparam_blob->qparam.zero_point; + } // Get quantization parameters - if (!GetQuantizationParameters_()) { + if (!GetQuantizationParameters_(X_scale, X_zero_point)) { return false; } @@ -168,7 +183,6 @@ bool FullyConnectedDNNLowPOp::RunOnDevice() { /* if (VLOG_IS_ON(1)) */ { t_begin = chrono::system_clock::now(); } #endif - Xdata = QuantizeInputIfNeeded(this, 0, in_qparams_[0], X_temp); #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN @@ -181,7 +195,7 @@ bool FullyConnectedDNNLowPOp::RunOnDevice() { t_begin = chrono::system_clock::now(); } #endif - } + } #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN /* if (VLOG_IS_ON(1)) */ @@ -295,6 +309,7 @@ bool FullyConnectedDNNLowPOp::RunOnDevice() { if (!X.template IsType()) { // Both input and output are float + // the path for dyanmic and fused staic row_offsets_.resize( PackAWithQuantRowOffset::rowOffsetBufferSize()); X_pack_buf_.resize( @@ -628,7 +643,7 @@ bool FullyConnectedDNNLowPOp::RunOnDevice() { } template -bool FullyConnectedDNNLowPOp::GetQuantizationParameters_() { +bool FullyConnectedDNNLowPOp::GetQuantizationParameters_(float X_scale, int X_zero_point) { using namespace dnnlowp; #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN @@ -638,7 +653,13 @@ bool FullyConnectedDNNLowPOp::GetQuantizationParameters_() { #endif // Choose quantization for X - in_qparams_[0] = GetInputTensorQuantizationParamsOf(this, 0, qfactory_.get()); + if (X_scale <= 0) { // non-fused static or Dynamic + in_qparams_[0] = GetInputTensorQuantizationParamsOf(this, 0, qfactory_.get()); + } + else { // fused int8 static + in_qparams_[0].scale = X_scale; + in_qparams_[0].zero_point = X_zero_point; + } #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN /* if (VLOG_IS_ON(1)) */ @@ -887,8 +908,8 @@ bool FullyConnectedDNNLowPOp::GetQuantizationParameters_() { #endif if (!dequantize_output_ && !requantization_param_selected_) { - CAFFE_ENFORCE(InputSize() <= 4); - if (InputSize() == 4) { + CAFFE_ENFORCE(InputSize() <= 5); + if (InputSize() >= 4) { const auto* input_qparam_blob = this->template Input>( 3) diff --git a/caffe2/quantization/server/fully_connected_dnnlowp_op.h b/caffe2/quantization/server/fully_connected_dnnlowp_op.h index 5dd90e1c0935..73d93f5a1362 100644 --- a/caffe2/quantization/server/fully_connected_dnnlowp_op.h +++ b/caffe2/quantization/server/fully_connected_dnnlowp_op.h @@ -17,10 +17,12 @@ class FullyConnectedDNNLowPOp USE_DNNLOWP_OPERATOR_BASE_FUNCTIONS(T, FullyConnectedOp); protected: - bool GetQuantizationParameters_(); + bool GetQuantizationParameters_(float X_scale_=-1.0, int X_zero_point_=0); std::size_t axis_{1}; std::size_t axis_w_{1}; + float X_scale_{-1.0}; + int X_zero_point_{0}; vector Y_shape_cache_; std::vector requantization_params_; diff --git a/caffe2/quantization/server/fully_connected_dnnlowp_op_test.py b/caffe2/quantization/server/fully_connected_dnnlowp_op_test.py index 3a8b0c14931e..52209025f294 100644 --- a/caffe2/quantization/server/fully_connected_dnnlowp_op_test.py +++ b/caffe2/quantization/server/fully_connected_dnnlowp_op_test.py @@ -34,6 +34,7 @@ class 
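The fused static-int8 FC path above takes the activation quantization parameters either from the `X_scale`/`X_zero_point` arguments or, when a fifth input is present, from the `in_Qparam` blob added to the `Int8FC` schema earlier in this patch. A construction-only sketch of the blob-based variant, mirroring what the updated tests do; the `dnnlowp_pybind11` import path is an assumption, the scales are made up, and `X`, `W`, `b` are assumed to be fed before the op runs:

```python
from caffe2.python import core
from caffe2.quantization.server import dnnlowp_pybind11  # import path assumed

# Quantization parameters for the output and (new in this patch) the input activation.
dnnlowp_pybind11.CreateInt8QuantParamsBlob("quant_param", 0.05, 0)
dnnlowp_pybind11.CreateInt8QuantParamsBlob("X_quant_param", 0.1, 128)

fc = core.CreateOperator(
    "Int8FC",
    ["X", "W", "b", "quant_param", "X_quant_param"],  # inputs 4 and 5: output/input qparam blobs
    ["Y"],
    engine="DNNLOWP",
)
```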
DNNLowPFullyConnectedOpTest(hu.HypothesisTestCase): fuse_relu=st.booleans(), output_packed_bias=st.booleans(), use_input_qparam=st.booleans(), + use_output_qparam=st.booleans(), **hu.gcs_cpu_only ) def test_dnnlowp_fully_connected_int( @@ -50,6 +51,7 @@ def test_dnnlowp_fully_connected_int( fuse_relu, output_packed_bias, use_input_qparam, + use_output_qparam, gc, dc, ): @@ -98,22 +100,26 @@ def test_dnnlowp_fully_connected_int( Output = collections.namedtuple("Output", ["Y", "op_type", "engine"]) outputs = [] - op_engine_list = [("FC", "")] + op_engine_list = [("FC", "", False, False)] if fuse_relu: - op_engine_list += [("Int8FCRelu", "DNNLOWP")] + op_engine_list += [("Int8FCRelu", "DNNLOWP", False, False)] else: op_engine_list += [ - ("FC", "DNNLOWP"), - ("FC", "DNNLOWP_16"), - ("Int8FC", "DNNLOWP"), + # type, engine, do_fuse, skip_requantization + ("FC", "DNNLOWP", False, False), + ("FC", "DNNLOWP_16", False, False), + ("Int8FC", "DNNLOWP", False, False), + ("Int8FC", "DNNLOWP", True, False), + ("Int8FC", "DNNLOWP", False, True), + ("Int8FC", "DNNLOWP", True, True), ] - for op_type, engine in op_engine_list: + for op_type, engine, do_fuse, skip_requantization in op_engine_list: init_net = core.Net("test_init_net") net = core.Net("test_net") - do_quantize = "DNNLOWP" in engine and in_quantized - do_dequantize = "DNNLOWP" in engine and out_quantized + do_quantize = "DNNLOWP" in engine and in_quantized and not do_fuse + do_dequantize = "DNNLOWP" in engine and out_quantized and not skip_requantization do_quantize_weight = ( engine == "DNNLOWP" and weight_quantized and len(outputs) > 0 ) @@ -167,41 +173,29 @@ def test_dnnlowp_fully_connected_int( ) init_net.Proto().op.extend([pack]) - if use_input_qparam and do_dequantize and op_type != "FC": - fc = core.CreateOperator( - op_type, - [ - "X_q" if do_quantize else "X", - "W_packed" - if do_prepack_weight - else ("W_q" if do_quantize_weight else "W"), - "b_q" if do_quantize_weight else "b", - "quant_param", - ], - ["Y_q" if do_dequantize else "Y"], - dequantize_output=not do_dequantize, - preserve_activation_sparsity=preserve_activation_sparsity, - preserve_weight_sparsity=preserve_weight_sparsity, - engine=engine, - device_option=gc, - ) - else: - fc = core.CreateOperator( - op_type, - [ - "X_q" if do_quantize else "X", - "W_packed" - if do_prepack_weight - else ("W_q" if do_quantize_weight else "W"), - "b_q" if do_quantize_weight else "b", - ], - ["Y_q" if do_dequantize else "Y"], - dequantize_output=not do_dequantize, - preserve_activation_sparsity=preserve_activation_sparsity, - preserve_weight_sparsity=preserve_weight_sparsity, - engine=engine, - device_option=gc, - ) + fc = core.CreateOperator( + op_type, + [ + "X_q" if do_quantize else "X", + "W_packed" + if do_prepack_weight + else ("W_q" if do_quantize_weight else "W"), + "b_q" if do_quantize_weight else "b", + # "quant_param", + ], + ["Y_q" if do_dequantize else "Y"], + dequantize_output=not do_dequantize, + preserve_activation_sparsity=preserve_activation_sparsity, + preserve_weight_sparsity=preserve_weight_sparsity, + engine=engine, + device_option=gc, + ) + if op_type != "FC": + if (do_dequantize and use_output_qparam) or (use_input_qparam and op_type == "Int8FC"): + fc.input.extend(["quant_param"]) + if (use_input_qparam and op_type == "Int8FC"): + fc.input.extend(["X_quant_param"]) + if do_quantize_weight or do_prepack_weight: # When quantized weight is provided, we can't rescale the # output dynamically by looking at the range of output of each @@ -221,7 +215,9 @@ def 
test_dnnlowp_fully_connected_int( ) net.Proto().op.extend([dequantize]) - if use_input_qparam and do_dequantize and op_type != "FC": + + + if use_output_qparam and do_dequantize and op_type != "FC": ref_output = outputs[0][0] ref_output_min = 0 if ref_output.size == 0 else ref_output.min() ref_output_max = 0 if ref_output.size == 0 else ref_output.max() @@ -229,25 +225,37 @@ def test_dnnlowp_fully_connected_int( q_param = dnnlowp_utils.choose_quantization_params( ref_output_min, ref_output_max, preserve_activation_sparsity ) - run_conv_or_fc( - self, - init_net, - net, - X, - W, - b, - op_type, - engine, - None, - gc, - outputs, - q_param.scale, - q_param.zero_point, - ) + q_param_scale = q_param.scale + q_param_zero_point = q_param.zero_point else: - run_conv_or_fc( - self, init_net, net, X, W, b, op_type, engine, None, gc, outputs - ) + q_param_scale = None + q_param_zero_point = None + + if not (use_input_qparam and op_type == "Int8FC"): + x_q_param_scale = None + x_q_param_zero_point = None + else: + x_q_param_scale = x_q_param.scale + x_q_param_zero_point = x_q_param.zero_point + + run_conv_or_fc( + self, + init_net, + net, + X, + W, + b, + op_type, + engine, + None, + gc, + outputs, + q_param_scale, + q_param_zero_point, + x_q_param_scale, + x_q_param_zero_point, + ) + if output_packed_bias and do_prepack_weight and do_dequantize: bias_int32 = self.ws.blobs["B_q32"].fetch() @@ -264,12 +272,14 @@ def test_dnnlowp_fully_connected_int( "W": [output_channels, input_channels], "b": [output_channels], "quant_param": [1], + "X_quant_param": [1], }, blob_types={ "X": core.DataType.FLOAT, "W": core.DataType.FLOAT, "b": core.DataType.FLOAT, "quant_param": core.DataType.FLOAT, + "X_quant_param": core.DataType.FLOAT, }, ) assert ( diff --git a/caffe2/quantization/server/im2col_dnnlowp.h b/caffe2/quantization/server/im2col_dnnlowp.h index dc347142b640..4aca91811da9 100644 --- a/caffe2/quantization/server/im2col_dnnlowp.h +++ b/caffe2/quantization/server/im2col_dnnlowp.h @@ -216,7 +216,7 @@ static void Im2ColNHWC( T* data_col_temp = data_col + h * width_col * kernel_h * kernel_w * channels; int w_pad = -pad_l; - for (const auto w : c10::irange(width_col)) { + for (C10_UNUSED const auto w : c10::irange(width_col)) { int r = 0; for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h, ++r) { int s = 0; diff --git a/caffe2/quantization/server/kl_minimization.h b/caffe2/quantization/server/kl_minimization.h index edf95f5b9a1a..9b43fce4e56f 100644 --- a/caffe2/quantization/server/kl_minimization.h +++ b/caffe2/quantization/server/kl_minimization.h @@ -1,6 +1,6 @@ #pragma once -#include "quantization_error_minimization.h" +#include "caffe2/quantization/server/quantization_error_minimization.h" namespace dnnlowp { diff --git a/caffe2/quantization/server/l2_minimization.h b/caffe2/quantization/server/l2_minimization.h index 5c2173f48267..2ef983b986d6 100644 --- a/caffe2/quantization/server/l2_minimization.h +++ b/caffe2/quantization/server/l2_minimization.h @@ -1,6 +1,6 @@ #pragma once -#include "quantization_error_minimization.h" +#include "caffe2/quantization/server/quantization_error_minimization.h" #include #include diff --git a/caffe2/quantization/server/quantization_error_minimization.h b/caffe2/quantization/server/quantization_error_minimization.h index a315cf1a0977..83725d8b19cf 100644 --- a/caffe2/quantization/server/quantization_error_minimization.h +++ b/caffe2/quantization/server/quantization_error_minimization.h @@ -1,6 +1,6 @@ #pragma once -#include "dnnlowp.h" +#include 
"caffe2/quantization/server/dnnlowp.h" namespace dnnlowp { diff --git a/caffe2/quantization/server/relu_dnnlowp_op.h b/caffe2/quantization/server/relu_dnnlowp_op.h index f308e90e2881..2885f0fda26b 100644 --- a/caffe2/quantization/server/relu_dnnlowp_op.h +++ b/caffe2/quantization/server/relu_dnnlowp_op.h @@ -3,7 +3,7 @@ #include "caffe2/operators/relu_op.h" #include "caffe2/core/tensor_int8.h" -#include "caffe2_dnnlowp_utils.h" +#include "caffe2/quantization/server/caffe2_dnnlowp_utils.h" namespace caffe2 { diff --git a/caffe2/quantization/server/sigmoid.h b/caffe2/quantization/server/sigmoid.h index 17722405e6f0..c21303420e6a 100644 --- a/caffe2/quantization/server/sigmoid.h +++ b/caffe2/quantization/server/sigmoid.h @@ -1,6 +1,6 @@ #pragma once -#include "tanh.h" +#include "caffe2/quantization/server/tanh.h" namespace dnnlowp { diff --git a/caffe2/quantization/server/tanh.h b/caffe2/quantization/server/tanh.h index 2950352131d1..823ded42982b 100644 --- a/caffe2/quantization/server/tanh.h +++ b/caffe2/quantization/server/tanh.h @@ -1,6 +1,6 @@ #pragma once -#include "dnnlowp.h" +#include "caffe2/quantization/server/dnnlowp.h" #include #include diff --git a/caffe2/queue/blobs_queue.cc b/caffe2/queue/blobs_queue.cc index 4398cf816481..4c890088fa2d 100644 --- a/caffe2/queue/blobs_queue.cc +++ b/caffe2/queue/blobs_queue.cc @@ -18,16 +18,11 @@ namespace caffe2 { // Constants for user tracepoints -// NOLINTNEXTLINE(clang-diagnostic-unused-const-variable) -static constexpr int SDT_NONBLOCKING_OP = 0; -// NOLINTNEXTLINE(clang-diagnostic-unused-const-variable) -static constexpr int SDT_BLOCKING_OP = 1; -// NOLINTNEXTLINE(clang-diagnostic-unused-const-variable) -static constexpr uint64_t SDT_TIMEOUT = (uint64_t)-1; -// NOLINTNEXTLINE(clang-diagnostic-unused-const-variable) -static constexpr uint64_t SDT_ABORT = (uint64_t)-2; -// NOLINTNEXTLINE(clang-diagnostic-unused-const-variable) -static constexpr uint64_t SDT_CANCEL = (uint64_t)-3; +C10_UNUSED static constexpr int SDT_NONBLOCKING_OP = 0; +C10_UNUSED static constexpr int SDT_BLOCKING_OP = 1; +C10_UNUSED static constexpr uint64_t SDT_TIMEOUT = (uint64_t)-1; +C10_UNUSED static constexpr uint64_t SDT_ABORT = (uint64_t)-2; +C10_UNUSED static constexpr uint64_t SDT_CANCEL = (uint64_t)-3; BlobsQueue::BlobsQueue( Workspace* ws, @@ -66,8 +61,7 @@ bool BlobsQueue::blockingRead( float timeout_secs) { Timer readTimer; auto keeper = this->shared_from_this(); - // NOLINTNEXTLINE(clang-diagnostic-unused-variable) - const auto& name = name_.c_str(); + C10_UNUSED const auto& name = name_.c_str(); CAFFE_SDT(queue_read_start, name, (void*)this, SDT_BLOCKING_OP); std::unique_lock g(mutex_); auto canRead = [this]() { @@ -76,7 +70,6 @@ bool BlobsQueue::blockingRead( }; // Decrease queue balance before reading to indicate queue read pressure // is being increased (-ve queue balance indicates more reads than writes) - // NOLINTNEXTLINE(clang-diagnostic-unused-variable) CAFFE_EVENT(stats_, queue_balance, -1); if (timeout_secs > 0) { std::chrono::milliseconds timeout_ms(int(timeout_secs * 1000)); @@ -99,17 +92,14 @@ bool BlobsQueue::blockingRead( CAFFE_ENFORCE(inputs.size() >= result.size()); for (const auto i : c10::irange(result.size())) { auto bytes = BlobStat::sizeBytes(*result[i]); - // NOLINTNEXTLINE(clang-diagnostic-unused-variable) CAFFE_EVENT(stats_, queue_dequeued_bytes, bytes, i); using std::swap; swap(*(inputs[i]), *(result[i])); } CAFFE_SDT(queue_read_end, name, (void*)this, writer_ - reader_); - // NOLINTNEXTLINE(clang-diagnostic-unused-variable) 
CAFFE_EVENT(stats_, queue_dequeued_records); ++reader_; cv_.notify_all(); - // NOLINTNEXTLINE(clang-diagnostic-unused-variable) CAFFE_EVENT(stats_, read_time_ns, readTimer.NanoSeconds()); return true; } @@ -117,8 +107,7 @@ bool BlobsQueue::blockingRead( bool BlobsQueue::tryWrite(const std::vector& inputs) { Timer writeTimer; auto keeper = this->shared_from_this(); - // NOLINTNEXTLINE(clang-diagnostic-unused-variable) - const auto& name = name_.c_str(); + C10_UNUSED const auto& name = name_.c_str(); CAFFE_SDT(queue_write_start, name, (void*)this, SDT_NONBLOCKING_OP); std::unique_lock g(mutex_); if (!canWrite()) { @@ -127,11 +116,9 @@ bool BlobsQueue::tryWrite(const std::vector& inputs) { } // Increase queue balance before writing to indicate queue write pressure is // being increased (+ve queue balance indicates more writes than reads) - // NOLINTNEXTLINE(clang-diagnostic-unused-variable) CAFFE_EVENT(stats_, queue_balance, 1); DCHECK(canWrite()); doWrite(inputs); - // NOLINTNEXTLINE(clang-diagnostic-unused-variable) CAFFE_EVENT(stats_, write_time_ns, writeTimer.NanoSeconds()); return true; } @@ -139,13 +126,11 @@ bool BlobsQueue::tryWrite(const std::vector& inputs) { bool BlobsQueue::blockingWrite(const std::vector& inputs) { Timer writeTimer; auto keeper = this->shared_from_this(); - // NOLINTNEXTLINE(clang-diagnostic-unused-variable) - const auto& name = name_.c_str(); + C10_UNUSED const auto& name = name_.c_str(); CAFFE_SDT(queue_write_start, name, (void*)this, SDT_BLOCKING_OP); std::unique_lock g(mutex_); // Increase queue balance before writing to indicate queue write pressure is // being increased (+ve queue balance indicates more writes than reads) - // NOLINTNEXTLINE(clang-diagnostic-unused-variable) CAFFE_EVENT(stats_, queue_balance, 1); cv_.wait(g, [this]() { return closing_ || canWrite(); }); if (!canWrite()) { @@ -154,7 +139,6 @@ bool BlobsQueue::blockingWrite(const std::vector& inputs) { } DCHECK(canWrite()); doWrite(inputs); - // NOLINTNEXTLINE(clang-diagnostic-unused-variable) CAFFE_EVENT(stats_, write_time_ns, writeTimer.NanoSeconds()); return true; } @@ -170,7 +154,7 @@ bool BlobsQueue::canWrite() { // writer is always within [reader, reader + size) // we can write if reader is within [reader, reader + size) CAFFE_ENFORCE_LE(reader_, writer_); - CAFFE_ENFORCE_LE(writer_, reader_ + queue_.size()); + CAFFE_ENFORCE_LE(writer_, static_cast(reader_ + queue_.size())); // NOLINTNEXTLINE(clang-diagnostic-sign-compare) return writer_ != reader_ + queue_.size(); } @@ -178,8 +162,7 @@ bool BlobsQueue::canWrite() { void BlobsQueue::doWrite(const std::vector& inputs) { auto& result = queue_[writer_ % queue_.size()]; CAFFE_ENFORCE(inputs.size() >= result.size()); - // NOLINTNEXTLINE(clang-diagnostic-unused-variable) - const auto& name = name_.c_str(); + C10_UNUSED const auto& name = name_.c_str(); for (const auto i : c10::irange(result.size())) { using std::swap; swap(*(inputs[i]), *(result[i])); diff --git a/caffe2/serialize/crc_alt.h b/caffe2/serialize/crc_alt.h index 108d998b602f..3299327be430 100644 --- a/caffe2/serialize/crc_alt.h +++ b/caffe2/serialize/crc_alt.h @@ -101,8 +101,11 @@ uint32_t crc32_16bytes_prefetch(const void* data, size_t length, uint32_t previo // Windows always little endian #define __BYTE_ORDER __LITTLE_ENDIAN + #if !defined(_M_ARM64) // intrinsics / prefetching #include + #endif + #ifdef __MINGW32__ #define PREFETCH(location) __builtin_prefetch(location) #else diff --git a/caffe2/serialize/inline_container.cc b/caffe2/serialize/inline_container.cc index 
9f0e9ce6194e..9847bc132264 100644 --- a/caffe2/serialize/inline_container.cc +++ b/caffe2/serialize/inline_container.cc @@ -5,6 +5,9 @@ #include #include #include +#include +#include + #include #include @@ -49,6 +52,17 @@ static std::string basename(const std::string& name) { return name.substr(start, end - start); } +static std::string parentdir(const std::string& name) { + size_t end = name.find_last_of('/'); + if(end == std::string::npos) + end = name.find_last_of('\\'); + + if(end == std::string::npos) + return ""; + + return name.substr(0, end); +} + size_t PyTorchStreamReader::read(uint64_t pos, char* buf, size_t n) { return in_->read(pos, buf, n, "reading file"); } @@ -129,22 +143,27 @@ void PyTorchStreamReader::init() { } std::string version(static_cast(version_ptr.get()), version_size); version_ = caffe2::stoull(version); - AT_ASSERTM( - // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - version_ >= kMinSupportedFileFormatVersion, - "Attempted to read a PyTorch file with version ", - c10::to_string(version_), - ", but the minimum supported version for reading is ", - c10::to_string(kMinSupportedFileFormatVersion), - ". Your PyTorch script module file is too old. Please re-export it again."); - AT_ASSERTM( - // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - version_ <= kMaxSupportedFileFormatVersion, - "Attempted to read a PyTorch file with version ", - version_, - ", but the maximum supported version for reading is ", - kMaxSupportedFileFormatVersion, - ". Your PyTorch installation may be too old."); + // NOLINTNEXTLINE(clang-diagnostic-sign-compare) + if (version_ < kMinSupportedFileFormatVersion) { + CAFFE_THROW( + "Attempted to read a PyTorch file with version ", + c10::to_string(version_), + ", but the minimum supported version for reading is ", + c10::to_string(kMinSupportedFileFormatVersion), + ". Your PyTorch script module file is too old. Please regenerate it", + " with latest version of PyTorch to mitigate this issue."); + } + + // NOLINTNEXTLINE(clang-diagnostic-sign-compare) + if (version_ > kMaxSupportedFileFormatVersion) { + CAFFE_THROW( + "Attempted to read a PyTorch file with version ", + version_, + ", but the maximum supported version for reading is ", + kMaxSupportedFileFormatVersion, + ". 
The version of your PyTorch installation may be too old, ", + "please upgrade PyTorch to latest version to mitigate this issue."); + } } void PyTorchStreamReader::valid(const char* what, const char* info) { @@ -333,6 +352,13 @@ void PyTorchStreamWriter::setup(const string& file_name) { file_name, std::ofstream::out | std::ofstream::trunc | std::ofstream::binary); valid("opening archive ", file_name.c_str()); + + const std::string dir_name = parentdir(file_name); + if(!dir_name.empty()) { + struct stat st; + bool dir_exists = (stat(dir_name.c_str(), &st) == 0 && (st.st_mode & S_IFDIR)); + TORCH_CHECK(dir_exists, "Parent directory ", dir_name, " does not exist."); + } TORCH_CHECK(file_stream_, "File ", file_name, " cannot be opened."); writer_func_ = [this](const void* buf, size_t nbytes) -> size_t { file_stream_.write(static_cast(buf), nbytes); diff --git a/caffe2/serialize/inline_container_test.cc b/caffe2/serialize/inline_container_test.cc index 5ceb7274b771..18f75dddfaa5 100644 --- a/caffe2/serialize/inline_container_test.cc +++ b/caffe2/serialize/inline_container_test.cc @@ -5,6 +5,7 @@ #include #include "caffe2/serialize/inline_container.h" +#include "c10/util/irange.h" namespace caffe2 { namespace serialize { @@ -22,14 +23,14 @@ TEST(PyTorchStreamWriterAndReader, SaveAndLoad) { // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init,cppcoreguidelines-avoid-magic-numbers) std::array data1; - for (int i = 0; i < data1.size(); ++i) { + for (auto i: c10::irange( data1.size())) { data1[i] = data1.size() - i; } writer.writeRecord("key1", data1.data(), data1.size()); // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init,cppcoreguidelines-avoid-magic-numbers) std::array data2; - for (int i = 0; i < data2.size(); ++i) { + for (auto i: c10::irange(data2.size())) { data2[i] = data2.size() - i; } writer.writeRecord("key2", data2.data(), data2.size()); @@ -83,14 +84,14 @@ TEST(PytorchStreamWriterAndReader, GetNonexistentRecordThrows) { // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init,cppcoreguidelines-avoid-magic-numbers) std::array data1; - for (int i = 0; i < data1.size(); ++i) { + for (auto i: c10::irange(data1.size())) { data1[i] = data1.size() - i; } writer.writeRecord("key1", data1.data(), data1.size()); // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init,cppcoreguidelines-avoid-magic-numbers) std::array data2; - for (int i = 0; i < data2.size(); ++i) { + for (auto i: c10::irange(data2.size())) { data2[i] = data2.size() - i; } writer.writeRecord("key2", data2.data(), data2.size()); diff --git a/caffe2/serialize/versions.h b/caffe2/serialize/versions.h index 9e89fe9acd64..78a91c64fe84 100644 --- a/caffe2/serialize/versions.h +++ b/caffe2/serialize/versions.h @@ -12,7 +12,7 @@ namespace serialize { constexpr uint64_t kMinSupportedFileFormatVersion = 0x1L; #if ENABLE_UPGRADERS -constexpr uint64_t kMaxSupportedFileFormatVersion = 0x9L; +constexpr uint64_t kMaxSupportedFileFormatVersion = 0xAL; #else constexpr uint64_t kMaxSupportedFileFormatVersion = 0x6L; #endif @@ -79,7 +79,11 @@ constexpr uint64_t kMaxSupportedFileFormatVersion = 0x6L; // Bump the version number to 9 to update aten::logspace and // and aten::logspace.out to error out when steps is not // provided. (see: https://github.com/pytorch/pytorch/issues/55951) -constexpr uint64_t kProducedFileFormatVersion = 0x9L; +// 3) [02/11/2022] +// Bump the version number to 10 to update aten::gelu and +// and aten::gelu.out to support the new approximate kwarg. 
+// (see: https://github.com/pytorch/pytorch/pull/61439) +constexpr uint64_t kProducedFileFormatVersion = 0xAL; #else constexpr uint64_t kProducedFileFormatVersion = 0x3L; #endif @@ -106,24 +110,37 @@ constexpr uint64_t kMinProducedFileFormatVersion = 0x3L; // 0x2L: (Comment missing) // 0x3L: (Comment missing) // 0x4L: (update) Added schema to function tuple. Forward-compatible change. -// 0x5L: (update) Update bytecode is sharing constant tensor files from torchscript, and only serialize -// extra tensors that are not in the torchscript constant table. Also update tensor storage schema adapting -// to the unify format, the root key of tensor storage is updated from {index} to -// {the_pointer_value_the_tensor.storage}, for example: `140245072983168.storage` -// Forward-compatibility change. +// 0x5L: (update) Update bytecode is sharing constant tensor files from +// torchscript, and only serialize extra tensors that are not in the +// torchscript constant table. Also update tensor storage schema adapting to +// the unify format, the root key of tensor storage is updated from {index} to +// {the_pointer_value_the_tensor.storage}, for example: +// `140245072983168.storage` Forward-compatibility change. // 0x6L: Implicit opereator versioning using number of specified argument. -// Refer to the summary of https://github.com/pytorch/pytorch/pull/56845 -// for details. -// 0x7L: Enable support for operators with default arguments plus out arguments. -constexpr uint64_t kProducedBytecodeVersion = 0x7L; +// Refer to the summary of https://github.com/pytorch/pytorch/pull/56845 for +// details. +// 0x7L: Enable support for operators with default arguments plus out +// arguments. Refer. See https://github.com/pytorch/pytorch/pull/63651 for +// details. +// 0x8L: Emit promoted operators as instructions. See +// https://github.com/pytorch/pytorch/pull/71662 for details. +// 0x9L: Change serialization format from pickle to format This version is to +// serve migration. v8 pickle and v9 flatbuffer are the same. Refer to the +// summary of https://github.com/pytorch/pytorch/pull/75201 for more details. +constexpr uint64_t kProducedBytecodeVersion = 0x8L; + +// static_assert( +// kProducedBytecodeVersion >= kProducedFileFormatVersion, +// "kProducedBytecodeVersion must be higher or equal to +// kProducedFileFormatVersion."); // Introduce kMinSupportedBytecodeVersion and kMaxSupportedBytecodeVersion // for limited backward/forward compatibility support of bytecode. If -// kMinSupportedBytecodeVersion <= model_version <= kMaxSupportedBytecodeVersion (in loader), -// we should support this model_version. For example, we provide a wrapper to -// handle an updated operator. -constexpr uint64_t kMinSupportedBytecodeVersion = 0x3L; -constexpr uint64_t kMaxSupportedBytecodeVersion = 0x8L; +// kMinSupportedBytecodeVersion <= model_version <= kMaxSupportedBytecodeVersion +// (in loader), we should support this model_version. For example, we provide a +// wrapper to handle an updated operator. +constexpr uint64_t kMinSupportedBytecodeVersion = 0x4L; +constexpr uint64_t kMaxSupportedBytecodeVersion = 0x9L; } // namespace serialize } // namespace caffe2 diff --git a/caffe2/sgd/learning_rate_functors.h b/caffe2/sgd/learning_rate_functors.h index c2b9dd976a1f..d733ccc14611 100644 --- a/caffe2/sgd/learning_rate_functors.h +++ b/caffe2/sgd/learning_rate_functors.h @@ -36,7 +36,7 @@ class FixedLearningRate : public LearningRateFunctor { }; // Alter: alternatate learning rate with active_period and inactive_period. 
-// update for for a duration of active_period and then stop for a duration of +// update for a duration of active_period and then stop for a duration of // inactive_period if active_first, and vice versa template class AlternateLearningRate : public LearningRateFunctor { diff --git a/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc b/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc index 879f0d25068b..0f7e90e55b53 100644 --- a/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc +++ b/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc @@ -199,7 +199,7 @@ void runConv( } // unnamed namespace -constexpr size_t kIters = 20; +constexpr int kIters = 20; TEST(DEPTHWISE3x3, Conv) { for (int i = 0; i < kIters; ++i) { diff --git a/caffe2/share/contrib/nnpack/nnpack_test.cc b/caffe2/share/contrib/nnpack/nnpack_test.cc index 398be235f7f1..fe653c4d91ab 100644 --- a/caffe2/share/contrib/nnpack/nnpack_test.cc +++ b/caffe2/share/contrib/nnpack/nnpack_test.cc @@ -236,7 +236,7 @@ void runConv( } // unnamed namespace -constexpr size_t kIters = 20; +constexpr int kIters = 20; TEST(NNPACK, Conv_3x3s1) { for (int i = 0; i < kIters; ++i) { diff --git a/caffe2/utils/threadpool/pthreadpool-cpp.cc b/caffe2/utils/threadpool/pthreadpool-cpp.cc index 38846d5b143d..2c2209f225ca 100644 --- a/caffe2/utils/threadpool/pthreadpool-cpp.cc +++ b/caffe2/utils/threadpool/pthreadpool-cpp.cc @@ -83,7 +83,7 @@ size_t getDefaultNumThreads(); PThreadPool* pthreadpool() { static auto threadpool = std::make_unique(getDefaultNumThreads()); -#if !(defined(WIN32)) && !(defined(__XROS__)) +#if !(defined(WIN32)) static std::once_flag flag; std::call_once(flag, []() { pthread_atfork(nullptr, nullptr, child_atfork); diff --git a/caffe2/utils/threadpool/pthreadpool.h b/caffe2/utils/threadpool/pthreadpool.h index 54b3cb63303c..914ebf40a699 100644 --- a/caffe2/utils/threadpool/pthreadpool.h +++ b/caffe2/utils/threadpool/pthreadpool.h @@ -8,7 +8,7 @@ #include // for size_t #include // for uint32_t -#if defined(USE_PTHREADPOOL) && !(defined(__XROS__)) +#if defined(USE_PTHREADPOOL) // This is a hack. // Mainly introduced here because // 1. NNPACK can be compiled to use internal legacy threadpool implementation because much of C2 depends on that. 
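A note on the DNNLowP bias hunks earlier in this section: the change to `(*b_quantized_)[i] -= in_qparams_[INPUT].zero_point * (*column_offset_ptr)[i]` in the Conv operator, and the fused-static FC path that feeds X_scale/X_zero_point into GetQuantizationParameters_, both rely on folding the activation zero point into the quantized bias so the int8 GEMM never has to subtract the zero point per element. The identity is (X_q - zp_x) * W_q = X_q * W_q - zp_x * colsum(W_q). Below is a minimal NumPy sketch of that identity; the shapes and values are purely illustrative and this is not the caffe2 kernel code.

    import numpy as np

    rng = np.random.default_rng(0)
    N, K, M = 4, 16, 8        # batch, input dim, output channels (illustrative)
    zp_x = 3                  # activation zero point

    X_q = rng.integers(0, 256, size=(N, K), dtype=np.int64)     # quantized activations
    W_q = rng.integers(-128, 128, size=(K, M), dtype=np.int64)  # quantized weights
    b_q = rng.integers(-1000, 1000, size=M, dtype=np.int64)     # quantized bias

    # Reference: subtract the activation zero point element-wise, then matmul.
    ref = (X_q - zp_x) @ W_q + b_q

    # Folded form: plain integer matmul plus a bias that has absorbed
    # zp_x * column_offset, where column_offset[m] = sum_k W_q[k, m].
    column_offset = W_q.sum(axis=0)
    fused = X_q @ W_q + (b_q - zp_x * column_offset)

    assert np.array_equal(ref, fused)

Because the folding happens at bias-quantization time, the same integer GEMM path works whether the activation zero point comes from the operator arguments, from the X_quant_param input added in the tests above, or from dynamic measurement.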
diff --git a/caffe2/utils/threadpool/pthreadpool_impl.cc b/caffe2/utils/threadpool/pthreadpool_impl.cc index 72bee75678ec..ae031ca2ae7e 100644 --- a/caffe2/utils/threadpool/pthreadpool_impl.cc +++ b/caffe2/utils/threadpool/pthreadpool_impl.cc @@ -2,7 +2,7 @@ #include "caffe2/utils/threadpool/pthreadpool-cpp.h" #include "caffe2/utils/threadpool/ThreadPool.h" -#if defined(USE_PTHREADPOOL) && !(defined(__XROS__)) +#if defined(USE_PTHREADPOOL) namespace caffe2 { namespace { static thread_local bool using_new_threadpool{false}; @@ -34,7 +34,7 @@ void legacy_pthreadpool_compute_1d( } return; } -#if defined(USE_PTHREADPOOL) && !(defined(__XROS__)) +#if defined(USE_PTHREADPOOL) if (caffe2::using_new_threadpool) { pthreadpool_parallelize_1d(threadpool, function, argument, range, 0u); } else { diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake index ccf4b23cc3c0..8fa42e89411f 100644 --- a/cmake/Codegen.cmake +++ b/cmake/Codegen.cmake @@ -67,108 +67,18 @@ if(INTERN_BUILD_ATEN_OPS) set_source_files_properties(${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/MapAllocator.cpp PROPERTIES COMPILE_FLAGS "-fno-openmp") endif() - file(GLOB cpu_kernel_cpp_in "${PROJECT_SOURCE_DIR}/aten/src/ATen/native/cpu/*.cpp" "${PROJECT_SOURCE_DIR}/aten/src/ATen/native/quantized/cpu/kernels/*.cpp") - - list(APPEND CPU_CAPABILITY_NAMES "DEFAULT") - list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}") - - - if(CXX_AVX512_FOUND) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_AVX512_CPU_DEFINITION") - list(APPEND CPU_CAPABILITY_NAMES "AVX512") - if(MSVC) - list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}/arch:AVX512") - else(MSVC) - list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} -mavx512f -mavx512bw -mavx512vl -mavx512dq -mfma") - endif(MSVC) - endif(CXX_AVX512_FOUND) - - if(CXX_AVX2_FOUND) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_AVX2_CPU_DEFINITION") - - # Some versions of GCC pessimistically split unaligned load and store - # instructions when using the default tuning. This is a bad choice on - # new Intel and AMD processors so we disable it when compiling with AVX2. 
- # See https://stackoverflow.com/questions/52626726/why-doesnt-gcc-resolve-mm256-loadu-pd-as-single-vmovupd#tab-top - check_cxx_compiler_flag("-mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store" COMPILER_SUPPORTS_NO_AVX256_SPLIT) - if(COMPILER_SUPPORTS_NO_AVX256_SPLIT) - set(CPU_NO_AVX256_SPLIT_FLAGS "-mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store") - endif(COMPILER_SUPPORTS_NO_AVX256_SPLIT) - - list(APPEND CPU_CAPABILITY_NAMES "AVX2") - if(DEFINED ENV{ATEN_AVX512_256}) - if($ENV{ATEN_AVX512_256} MATCHES "TRUE") - if(CXX_AVX512_FOUND) - message("-- ATen AVX2 kernels will use 32 ymm registers") - if(MSVC) - list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}/arch:AVX512") - else(MSVC) - list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} -march=native ${CPU_NO_AVX256_SPLIT_FLAGS}") - endif(MSVC) - endif(CXX_AVX512_FOUND) - endif() - else() - if(MSVC) - list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}/arch:AVX2") - else(MSVC) - list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} -mavx2 -mfma ${CPU_NO_AVX256_SPLIT_FLAGS}") - endif(MSVC) - endif() - endif(CXX_AVX2_FOUND) - - if(CXX_VSX_FOUND) - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_VSX_CPU_DEFINITION") - LIST(APPEND CPU_CAPABILITY_NAMES "VSX") - LIST(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} ${CXX_VSX_FLAGS}") - endif(CXX_VSX_FOUND) - - if(CXX_ZVECTOR_FOUND) - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_ZVECTOR_CPU_DEFINITION") - LIST(APPEND CPU_CAPABILITY_NAMES "ZVECTOR") - LIST(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} ${CXX_ZVECTOR_FLAGS}") - endif(CXX_ZVECTOR_FOUND) - - list(LENGTH CPU_CAPABILITY_NAMES NUM_CPU_CAPABILITY_NAMES) - math(EXPR NUM_CPU_CAPABILITY_NAMES "${NUM_CPU_CAPABILITY_NAMES}-1") - - # The sources list might get reordered later based on the capabilites. - # See NOTE [ Linking AVX and non-AVX files ] - foreach(i RANGE ${NUM_CPU_CAPABILITY_NAMES}) - foreach(IMPL ${cpu_kernel_cpp_in}) - file(RELATIVE_PATH NAME "${PROJECT_SOURCE_DIR}/aten/src/ATen/" "${IMPL}") - list(GET CPU_CAPABILITY_NAMES ${i} CPU_CAPABILITY) - set(NEW_IMPL ${CMAKE_BINARY_DIR}/aten/src/ATen/${NAME}.${CPU_CAPABILITY}.cpp) - configure_file("${PROJECT_SOURCE_DIR}/cmake/IncludeSource.cpp.in" ${NEW_IMPL}) - set(cpu_kernel_cpp ${NEW_IMPL} ${cpu_kernel_cpp}) # Create list of copies - list(GET CPU_CAPABILITY_FLAGS ${i} FLAGS) - if(MSVC) - set(EXTRA_FLAGS "/DCPU_CAPABILITY=${CPU_CAPABILITY} /DCPU_CAPABILITY_${CPU_CAPABILITY}") - else(MSVC) - set(EXTRA_FLAGS "-DCPU_CAPABILITY=${CPU_CAPABILITY} -DCPU_CAPABILITY_${CPU_CAPABILITY}") - endif(MSVC) - # Disable certain warnings for GCC-9.X - if(CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0.0)) - if(("${NAME}" STREQUAL "native/cpu/GridSamplerKernel.cpp") AND ("${CPU_CAPABILITY}" STREQUAL "DEFAULT")) - # See https://github.com/pytorch/pytorch/issues/38855 - set(EXTRA_FLAGS "${EXTRA_FLAGS} -Wno-uninitialized") - endif() - if("${NAME}" STREQUAL "native/quantized/cpu/kernels/QuantizedOpKernels.cpp") - # See https://github.com/pytorch/pytorch/issues/38854 - set(EXTRA_FLAGS "${EXTRA_FLAGS} -Wno-deprecated-copy") - endif() - endif() - set_source_files_properties(${NEW_IMPL} PROPERTIES COMPILE_FLAGS "${FLAGS} ${EXTRA_FLAGS}") - endforeach() - endforeach() - list(APPEND ATen_CPU_SRCS ${cpu_kernel_cpp}) - - file(GLOB_RECURSE all_python "${CMAKE_CURRENT_LIST_DIR}/../tools/codegen/*.py") + file(GLOB_RECURSE all_python "${CMAKE_CURRENT_LIST_DIR}/../torchgen/*.py") set(GEN_ROCM_FLAG) if(USE_ROCM) set(GEN_ROCM_FLAG --rocm) endif() + set(GEN_MPS_FLAG) + if(USE_MPS) + set(GEN_MPS_FLAG 
--mps) + endif() + set(CUSTOM_BUILD_FLAGS) if(INTERN_BUILD_MOBILE) if(USE_VULKAN) @@ -193,24 +103,64 @@ if(INTERN_BUILD_ATEN_OPS) endif() if(STATIC_DISPATCH_BACKEND) - message(STATUS "Custom build with static dispatch backend: ${STATIC_DISPATCH_BACKEND}") + message(STATUS "Custom build with static dispatch backends: ${STATIC_DISPATCH_BACKEND}") + list(LENGTH STATIC_DISPATCH_BACKEND len) list(APPEND CUSTOM_BUILD_FLAGS --static_dispatch_backend ${STATIC_DISPATCH_BACKEND}) endif() + # Codegen unboxing + if(USE_LIGHTWEIGHT_DISPATCH) + file(GLOB_RECURSE all_unboxing_script "${CMAKE_CURRENT_LIST_DIR}/../tools/jit/*.py") + list(APPEND CUSTOM_BUILD_FLAGS --skip_dispatcher_op_registration) + set(GEN_UNBOXING_COMMAND + "${PYTHON_EXECUTABLE}" -m tools.jit.gen_unboxing + --source-path ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen + --install_dir ${CMAKE_BINARY_DIR}/aten/src/ATen + ) + set("GEN_UNBOXING_COMMAND_sources" + ${GEN_UNBOXING_COMMAND} + --output-dependencies ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_unboxing_sources.cmake + ) + message(STATUS "Generating sources for lightweight dispatch") + execute_process( + COMMAND ${GEN_UNBOXING_COMMAND_sources} --dry-run + RESULT_VARIABLE RETURN_VALUE + WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/.. + ) + if(NOT RETURN_VALUE EQUAL 0) + message(FATAL_ERROR "Failed to get generated_unboxing_sources list") + endif() + + include("${CMAKE_BINARY_DIR}/aten/src/ATen/generated_unboxing_sources.cmake") + add_custom_command( + COMMENT "Generating ATen unboxing sources" + OUTPUT + ${generated_unboxing_sources} + ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_unboxing_sources.cmake + COMMAND ${GEN_UNBOXING_COMMAND_sources} + DEPENDS ${all_unboxing_script} ${sources_templates} + ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/native_functions.yaml + ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/tags.yaml + WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/.. + ) + else() # Otherwise do not generate or include sources into build. 
+ set(generated_unboxing_sources "") + endif() + set(GEN_PER_OPERATOR_FLAG) if(USE_PER_OPERATOR_HEADERS) list(APPEND GEN_PER_OPERATOR_FLAG "--per-operator-headers") endif() set(GEN_COMMAND - "${PYTHON_EXECUTABLE}" -m tools.codegen.gen + "${PYTHON_EXECUTABLE}" -m torchgen.gen --source-path ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen --install_dir ${CMAKE_BINARY_DIR}/aten/src/ATen ${GEN_PER_OPERATOR_FLAG} ${GEN_ROCM_FLAG} + ${GEN_MPS_FLAG} ${CUSTOM_BUILD_FLAGS} - ${GEN_VULKAN_FLAGS} ) file(GLOB_RECURSE headers_templates "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/templates/*\.h") @@ -245,6 +195,7 @@ if(INTERN_BUILD_ATEN_OPS) include("${CMAKE_BINARY_DIR}/aten/src/ATen/generated_${gen_type}.cmake") include("${CMAKE_BINARY_DIR}/aten/src/ATen/core_generated_${gen_type}.cmake") + include("${CMAKE_BINARY_DIR}/aten/src/ATen/cpu_vec_generated_${gen_type}.cmake") include("${CMAKE_BINARY_DIR}/aten/src/ATen/cuda_generated_${gen_type}.cmake") include("${CMAKE_BINARY_DIR}/aten/src/ATen/ops_generated_${gen_type}.cmake") @@ -256,14 +207,17 @@ if(INTERN_BUILD_ATEN_OPS) ${generated_${gen_type}} ${cuda_generated_${gen_type}} ${core_generated_${gen_type}} + ${cpu_vec_generated_${gen_type}} ${ops_generated_${gen_type}} ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_${gen_type}.cmake ${CMAKE_BINARY_DIR}/aten/src/ATen/ops_generated_${gen_type}.cmake ${CMAKE_BINARY_DIR}/aten/src/ATen/core_generated_${gen_type}.cmake + ${CMAKE_BINARY_DIR}/aten/src/ATen/cpu_vec_generated_${gen_type}.cmake ${CMAKE_BINARY_DIR}/aten/src/ATen/cuda_generated_${gen_type}.cmake COMMAND ${GEN_COMMAND_${gen_type}} DEPENDS ${all_python} ${${gen_type}_templates} ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/native_functions.yaml + ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/tags.yaml WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/.. ) endforeach() @@ -272,9 +226,9 @@ if(INTERN_BUILD_ATEN_OPS) # not tracked correctly in CMake. We make the libATen.so depend explicitly # on building the generated ATen files to workaround. 
add_custom_target(ATEN_CPU_FILES_GEN_TARGET DEPENDS - ${generated_headers} ${core_generated_headers} ${ops_generated_headers} - ${generated_sources} ${core_generated_sources} ${ops_generated_sources} - ${generated_declarations_yaml}) + ${generated_headers} ${core_generated_headers} ${cpu_vec_generated_headers} ${ops_generated_headers} + ${generated_sources} ${core_generated_sources} ${cpu_vec_generated_sources} ${ops_generated_sources} + ${generated_declarations_yaml} ${generated_unboxing_sources}) add_custom_target(ATEN_CUDA_FILES_GEN_TARGET DEPENDS ${cuda_generated_headers} ${cuda_generated_sources}) add_library(ATEN_CPU_FILES_GEN_LIB INTERFACE) @@ -286,6 +240,109 @@ if(INTERN_BUILD_ATEN_OPS) target_compile_definitions(ATEN_CPU_FILES_GEN_LIB INTERFACE AT_PER_OPERATOR_HEADERS) target_compile_definitions(ATEN_CUDA_FILES_GEN_LIB INTERFACE AT_PER_OPERATOR_HEADERS) endif() + + # Handle source files that need to be compiled multiple times for + # different vectorization options + file(GLOB cpu_kernel_cpp_in "${PROJECT_SOURCE_DIR}/aten/src/ATen/native/cpu/*.cpp" "${PROJECT_SOURCE_DIR}/aten/src/ATen/native/quantized/cpu/kernels/*.cpp") + + list(APPEND CPU_CAPABILITY_NAMES "DEFAULT") + list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}") + + if(CXX_AVX512_FOUND) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_AVX512_CPU_DEFINITION") + list(APPEND CPU_CAPABILITY_NAMES "AVX512") + if(MSVC) + list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}/arch:AVX512") + else(MSVC) + list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} -mavx512f -mavx512bw -mavx512vl -mavx512dq -mfma") + endif(MSVC) + endif(CXX_AVX512_FOUND) + + if(CXX_AVX2_FOUND) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_AVX2_CPU_DEFINITION") + + # Some versions of GCC pessimistically split unaligned load and store + # instructions when using the default tuning. This is a bad choice on + # new Intel and AMD processors so we disable it when compiling with AVX2. 
+ # See https://stackoverflow.com/questions/52626726/why-doesnt-gcc-resolve-mm256-loadu-pd-as-single-vmovupd#tab-top + check_cxx_compiler_flag("-mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store" COMPILER_SUPPORTS_NO_AVX256_SPLIT) + if(COMPILER_SUPPORTS_NO_AVX256_SPLIT) + set(CPU_NO_AVX256_SPLIT_FLAGS "-mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store") + endif(COMPILER_SUPPORTS_NO_AVX256_SPLIT) + + list(APPEND CPU_CAPABILITY_NAMES "AVX2") + if(DEFINED ENV{ATEN_AVX512_256}) + if($ENV{ATEN_AVX512_256} MATCHES "TRUE") + if(CXX_AVX512_FOUND) + message("-- ATen AVX2 kernels will use 32 ymm registers") + if(MSVC) + list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}/arch:AVX512") + else(MSVC) + list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} -march=native ${CPU_NO_AVX256_SPLIT_FLAGS}") + endif(MSVC) + endif(CXX_AVX512_FOUND) + endif() + else() + if(MSVC) + list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}/arch:AVX2") + else(MSVC) + list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} -mavx2 -mfma ${CPU_NO_AVX256_SPLIT_FLAGS}") + endif(MSVC) + endif() + endif(CXX_AVX2_FOUND) + + if(CXX_VSX_FOUND) + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_VSX_CPU_DEFINITION") + LIST(APPEND CPU_CAPABILITY_NAMES "VSX") + LIST(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} ${CXX_VSX_FLAGS}") + endif(CXX_VSX_FOUND) + + if(CXX_ZVECTOR_FOUND) + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_ZVECTOR_CPU_DEFINITION") + LIST(APPEND CPU_CAPABILITY_NAMES "ZVECTOR") + LIST(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} ${CXX_ZVECTOR_FLAGS}") + endif(CXX_ZVECTOR_FOUND) + + list(LENGTH CPU_CAPABILITY_NAMES NUM_CPU_CAPABILITY_NAMES) + math(EXPR NUM_CPU_CAPABILITY_NAMES "${NUM_CPU_CAPABILITY_NAMES}-1") + + # The sources list might get reordered later based on the capabilites. + # See NOTE [ Linking AVX and non-AVX files ] + foreach(i RANGE ${NUM_CPU_CAPABILITY_NAMES}) + function(process_vec NAME) + list(GET CPU_CAPABILITY_NAMES ${i} CPU_CAPABILITY) + set(NEW_IMPL ${CMAKE_BINARY_DIR}/aten/src/ATen/${NAME}.${CPU_CAPABILITY}.cpp) + configure_file("${PROJECT_SOURCE_DIR}/cmake/IncludeSource.cpp.in" ${NEW_IMPL}) + set(cpu_kernel_cpp ${NEW_IMPL} ${cpu_kernel_cpp} PARENT_SCOPE) # Create list of copies + list(GET CPU_CAPABILITY_FLAGS ${i} FLAGS) + if(MSVC) + set(EXTRA_FLAGS "/DCPU_CAPABILITY=${CPU_CAPABILITY} /DCPU_CAPABILITY_${CPU_CAPABILITY}") + else(MSVC) + set(EXTRA_FLAGS "-DCPU_CAPABILITY=${CPU_CAPABILITY} -DCPU_CAPABILITY_${CPU_CAPABILITY}") + endif(MSVC) + # Disable certain warnings for GCC-9.X + if(CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0.0)) + if(("${NAME}" STREQUAL "native/cpu/GridSamplerKernel.cpp") AND ("${CPU_CAPABILITY}" STREQUAL "DEFAULT")) + # See https://github.com/pytorch/pytorch/issues/38855 + set(EXTRA_FLAGS "${EXTRA_FLAGS} -Wno-uninitialized") + endif() + if("${NAME}" STREQUAL "native/quantized/cpu/kernels/QuantizedOpKernels.cpp") + # See https://github.com/pytorch/pytorch/issues/38854 + set(EXTRA_FLAGS "${EXTRA_FLAGS} -Wno-deprecated-copy") + endif() + endif() + set_source_files_properties(${NEW_IMPL} PROPERTIES COMPILE_FLAGS "${FLAGS} ${EXTRA_FLAGS}") + endfunction() + foreach(IMPL ${cpu_kernel_cpp_in}) + file(RELATIVE_PATH NAME "${PROJECT_SOURCE_DIR}/aten/src/ATen/" "${IMPL}") + process_vec("${NAME}") + endforeach() + foreach(IMPL ${cpu_vec_generated_sources}) + file(RELATIVE_PATH NAME "${CMAKE_BINARY_DIR}/aten/src/ATen/" "${IMPL}") + process_vec("${NAME}") + endforeach() + endforeach() + list(APPEND ATen_CPU_SRCS ${cpu_kernel_cpp}) endif() function(append_filelist name 
outputvar) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 0969055415b9..64fa6304207b 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -216,7 +216,7 @@ elseif(BLAS STREQUAL "MKL") set(CAFFE2_USE_MKL ON) set(BLAS_INFO "mkl") set(BLAS_FOUND 1) - set(BLAS_LIBRARIES caffe2::mkl) + set(BLAS_LIBRARIES ${MKL_LIBRARIES}) else() message(WARNING "MKL could not be found. Defaulting to Eigen") set(CAFFE2_USE_EIGEN_FOR_BLAS ON) @@ -816,6 +816,10 @@ if(USE_FBGEMM) set_property(TARGET fbgemm_avx2 PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET fbgemm_avx512 PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET fbgemm PROPERTY POSITION_INDEPENDENT_CODE ON) + if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 13.0.0) + # See https://github.com/pytorch/pytorch/issues/74352 + target_compile_options(asmjit PRIVATE -Wno-deprecated-copy -Wno-unused-but-set-variable) + endif() endif() if(USE_FBGEMM) @@ -1305,7 +1309,7 @@ if(USE_ROCM) hip_include_directories(${Caffe2_HIP_INCLUDE}) set(Caffe2_PUBLIC_HIP_DEPENDENCY_LIBS - ${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES} ${PYTORCH_RCCL_LIBRARIES} ${hipcub_LIBRARIES} ${ROCM_HIPRTC_LIB} ${ROCM_ROCTX_LIB}) + ${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES} ${hipcub_LIBRARIES} ${ROCM_HIPRTC_LIB} ${ROCM_ROCTX_LIB}) # Note [rocblas & rocfft cmake bug] # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1369,18 +1373,39 @@ if(USE_CUDA) endif() endif() +if(USE_DISTRIBUTED AND USE_TENSORPIPE) + if(MSVC) + message(WARNING "Tensorpipe cannot be used on Windows.") + else() + if(USE_CUDA) + set(TP_USE_CUDA ON CACHE BOOL "" FORCE) + set(TP_ENABLE_CUDA_IPC ON CACHE BOOL "" FORCE) + endif() + set(TP_BUILD_LIBUV ON CACHE BOOL "" FORCE) + set(TP_STATIC_OR_SHARED STATIC CACHE STRING "" FORCE) + + # Tensorpipe uses cuda_add_library + torch_update_find_cuda_flags() + add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/tensorpipe) + + list(APPEND Caffe2_DEPENDENCY_LIBS tensorpipe) + if(USE_CUDA) + list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS tensorpipe_cuda) + elseif(USE_ROCM) + message(WARNING "TensorPipe doesn't yet support ROCm") + # Not yet... 
+ # list(APPEND Caffe2_HIP_DEPENDENCY_LIBS tensorpipe_hip) + endif() + endif() +endif() + if(USE_GLOO) if(NOT CMAKE_SIZEOF_VOID_P EQUAL 8) message(WARNING "Gloo can only be used on 64-bit systems.") caffe2_update_option(USE_GLOO OFF) else() - if(MSVC) - # Don't install gloo on Windows - # It is already handled in builder scripts - set(GLOO_INSTALL OFF CACHE BOOL "" FORCE) - else() - set(GLOO_INSTALL ON CACHE BOOL "" FORCE) - endif() + # Don't install gloo + set(GLOO_INSTALL OFF CACHE BOOL "" FORCE) set(GLOO_STATIC_OR_SHARED STATIC CACHE STRING "" FORCE) # Temporarily override variables to avoid building Gloo tests/benchmarks @@ -1392,6 +1417,10 @@ if(USE_GLOO) set(ENV{GLOO_ROCM_ARCH} "${PYTORCH_ROCM_ARCH}") endif() if(NOT USE_SYSTEM_GLOO) + if(USE_DISTRIBUTED AND USE_TENSORPIPE) + get_target_property(_include_dirs uv_a INCLUDE_DIRECTORIES) + set_target_properties(uv_a PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${_include_dirs}") + endif() # gloo uses cuda_add_library torch_update_find_cuda_flags() add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/gloo) @@ -1429,32 +1458,6 @@ if(USE_GLOO) endif() endif() -if(USE_DISTRIBUTED AND USE_TENSORPIPE) - if(MSVC) - message(WARNING "Tensorpipe cannot be used on Windows.") - else() - if(USE_CUDA) - set(TP_USE_CUDA ON CACHE BOOL "" FORCE) - set(TP_ENABLE_CUDA_IPC ON CACHE BOOL "" FORCE) - endif() - set(TP_BUILD_LIBUV ON CACHE BOOL "" FORCE) - set(TP_STATIC_OR_SHARED STATIC CACHE STRING "" FORCE) - - # Tensorpipe uses cuda_add_library - torch_update_find_cuda_flags() - add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/tensorpipe) - - list(APPEND Caffe2_DEPENDENCY_LIBS tensorpipe) - if(USE_CUDA) - list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS tensorpipe_cuda) - elseif(USE_ROCM) - message(WARNING "TensorPipe doesn't yet support ROCm") - # Not yet... - # list(APPEND Caffe2_HIP_DEPENDENCY_LIBS tensorpipe_hip) - endif() - endif() -endif() - # ---[ profiling if(USE_PROF) find_package(htrace) @@ -1843,10 +1846,6 @@ set_target_properties(fmt-header-only PROPERTIES INTERFACE_COMPILE_FEATURES "") list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only) set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS} CACHE BOOL "Build shared libs" FORCE) -if(USE_BREAKPAD) - add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/breakpad) -endif() - # ---[ Kineto # edge profiler depends on KinetoProfiler but it only does cpu # profiling. 
Thus we dont need USE_CUDA/USE_ROCM @@ -1906,13 +1905,15 @@ if(USE_KINETO) find_library(CUPTI_LIBRARY_PATH ${CUPTI_LIB_NAME} PATHS ${CUDA_SOURCE_DIR} ${CUDA_SOURCE_DIR}/extras/CUPTI/lib64 - ${CUDA_SOURCE_DIR}/lib64) + ${CUDA_SOURCE_DIR}/lib64 + NO_DEFAULT_PATH) find_path(CUPTI_INCLUDE_DIR cupti.h PATHS + ${CUDA_SOURCE_DIR}/extras/CUPTI/include ${CUDA_INCLUDE_DIRS} ${CUDA_SOURCE_DIR} - ${CUDA_SOURCE_DIR}/extras/CUPTI/include - ${CUDA_SOURCE_DIR}/include) + ${CUDA_SOURCE_DIR}/include + NO_DEFAULT_PATH) if(CUPTI_LIBRARY_PATH AND CUPTI_INCLUDE_DIR) message(STATUS " CUPTI_INCLUDE_DIR = ${CUPTI_INCLUDE_DIR}") @@ -1920,6 +1921,32 @@ if(USE_KINETO) message(STATUS " CUDA_cupti_LIBRARY = ${CUDA_cupti_LIBRARY}") message(STATUS "Found CUPTI") set(LIBKINETO_NOCUPTI OFF CACHE STRING "" FORCE) + + # I've only tested this sanity check on Linux; if someone + # runs into this bug on another platform feel free to + # generalize it accordingly + if(NOT USE_CUPTI_SO AND UNIX) + include(CheckCXXSourceRuns) + # rt is handled by the CMAKE_REQUIRED_LIBRARIES set above + if(NOT APPLE) + set(CMAKE_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES} "dl") + endif() + set(CMAKE_REQUIRED_LINK_OPTIONS "-Wl,--whole-archive,${CUPTI_LIBRARY_PATH},--no-whole-archive") + check_cxx_source_runs("#include + int main() { + try { + throw std::runtime_error(\"error\"); + } catch (...) { + return 0; + } + return 1; + }" EXCEPTIONS_WORK) + set(CMAKE_REQUIRED_LINK_OPTIONS "") + if(NOT EXCEPTIONS_WORK) + message(FATAL_ERROR "Detected that statically linking against CUPTI causes exceptions to stop working. See https://github.com/pytorch/pytorch/issues/57744 for more details. Perhaps try: USE_CUPTI_SO=1 python setup.py develop --cmake") + endif() + endif() + else() message(STATUS "Could not find CUPTI library, using CPU-only Kineto build") set(LIBKINETO_NOCUPTI ON CACHE STRING "" FORCE) diff --git a/cmake/Modules/FindMKL.cmake b/cmake/Modules/FindMKL.cmake index b79a87466252..01594a5b66e0 100644 --- a/cmake/Modules/FindMKL.cmake +++ b/cmake/Modules/FindMKL.cmake @@ -168,6 +168,26 @@ IF (EXISTS ${INTEL_OMP_DIR}) ENDIF() ENDIF() +MACRO(GET_MKL_LIB_NAMES LIBRARIES INTERFACE MKL64) + cmake_parse_arguments("" "" "THREAD" "" ${ARGN}) + SET(${LIBRARIES} mkl_${INTERFACE}${MKL64} mkl_core) + IF(_THREAD) + LIST(INSERT ${LIBRARIES} 1 ${_THREAD}) + IF(UNIX AND ${USE_STATIC_MKL}) + # The thread library defines symbols required by the other MKL libraries so also add it last + LIST(APPEND ${LIBRARIES} ${_THREAD}) + ENDIF() + ENDIF() + IF(${USE_STATIC_MKL}) + IF(UNIX) + list(TRANSFORM ${LIBRARIES} PREPEND "lib") + list(TRANSFORM ${LIBRARIES} APPEND ".a") + ELSE() + message(WARNING "Ignoring USE_STATIC_MKL") + ENDIF() + ENDIF() +ENDMACRO() + # Try linking multiple libs MACRO(CHECK_ALL_LIBRARIES LIBRARIES OPENMP_TYPE OPENMP_LIBRARY _name _list _flags) # This macro checks for the existence of the combination of libraries given by _list. 
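The GET_MKL_LIB_NAMES macro added just above expands an MKL interface name, an ILP64/LP64 suffix, and an optional threading layer into the list handed to CHECK_ALL_LIBRARIES; for static Unix builds it rewrites each entry to lib<name>.a and repeats the threading library after mkl_core so its symbols still resolve in a single linker pass. A rough Python rendering of that name expansion follows; the real logic is the CMake macro, and the library names in the usage line are only illustrative.

    def mkl_lib_names(interface, mkl64="_lp64", thread=None, static=False, unix=True):
        """Sketch of FindMKL.cmake's GET_MKL_LIB_NAMES name expansion."""
        libs = ["mkl_{}{}".format(interface, mkl64), "mkl_core"]
        if thread:
            libs.insert(1, thread)
            if unix and static:
                # The threading layer defines symbols mkl_core needs, so it is
                # appended again after mkl_core for one-pass static linking.
                libs.append(thread)
        if static and unix:
            libs = ["lib{}.a".format(name) for name in libs]
        return libs

    # e.g. a static, GNU-threaded, LP64 layout:
    # ['libmkl_gf_lp64.a', 'libmkl_gnu_thread.a', 'libmkl_core.a', 'libmkl_gnu_thread.a']
    print(mkl_lib_names("gf", "_lp64", "mkl_gnu_thread", static=True))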
@@ -304,8 +324,9 @@ IF (NOT "${MKL_THREADING}" STREQUAL "SEQ") FOREACH(mkl64 ${mkl64s} "") FOREACH(mklthread ${mklthreads}) IF (NOT MKL_LIBRARIES) + GET_MKL_LIB_NAMES(mkl_lib_names "${mkliface}" "${mkl64}" THREAD "${mklthread}") CHECK_ALL_LIBRARIES(MKL_LIBRARIES MKL_OPENMP_TYPE MKL_OPENMP_LIBRARY cblas_sgemm - "mkl_${mkliface}${mkl64};${mklthread};mkl_core;${mklrtl};${mkl_pthread};${mkl_m};${mkl_dl}" "") + "${mkl_lib_names};${mklrtl};${mkl_pthread};${mkl_m};${mkl_dl}" "") ENDIF (NOT MKL_LIBRARIES) ENDFOREACH(mklthread) ENDFOREACH(mkl64) @@ -317,8 +338,9 @@ ENDIF (NOT "${MKL_THREADING}" STREQUAL "SEQ") FOREACH(mkliface ${mklifaces}) FOREACH(mkl64 ${mkl64s} "") IF (NOT MKL_LIBRARIES) + GET_MKL_LIB_NAMES(mkl_lib_names "${mkliface}" "${mkl64}" THREAD "mkl_sequential") CHECK_ALL_LIBRARIES(MKL_LIBRARIES MKL_OPENMP_TYPE MKL_OPENMP_LIBRARY cblas_sgemm - "mkl_${mkliface}${mkl64};mkl_sequential;mkl_core;${mkl_m};${mkl_dl}" "") + "${mkl_lib_names};${mkl_m};${mkl_dl}" "") IF (MKL_LIBRARIES) SET(mklseq "_sequential") ENDIF (MKL_LIBRARIES) @@ -331,8 +353,9 @@ FOREACH(mklrtl ${mklrtls} "") FOREACH(mkliface ${mklifaces}) FOREACH(mkl64 ${mkl64s} "") IF (NOT MKL_LIBRARIES) + GET_MKL_LIB_NAMES(mkl_lib_names "${mkliface}" "${mkl64}" THREAD "${mklthread}") CHECK_ALL_LIBRARIES(MKL_LIBRARIES MKL_OPENMP_TYPE MKL_OPENMP_LIBRARY cblas_sgemm - "mkl_${mkliface}${mkl64};${mklthread};mkl_core;${mklrtl};pthread;${mkl_m};${mkl_dl}" "") + "${mkl_lib_names};${mklrtl};pthread;${mkl_m};${mkl_dl}" "") ENDIF (NOT MKL_LIBRARIES) ENDFOREACH(mkl64) ENDFOREACH(mkliface) @@ -341,6 +364,9 @@ ENDFOREACH(mklrtl) # Check for older versions IF (NOT MKL_LIBRARIES) SET(MKL_VERSION 900) + if (USE_STATIC_MKL) + message(WARNING "Ignoring USE_STATIC_MKL") + endif() CHECK_ALL_LIBRARIES(MKL_LIBRARIES MKL_OPENMP_TYPE MKL_OPENMP_LIBRARY cblas_sgemm "mkl;guide;pthread;m" "") ENDIF (NOT MKL_LIBRARIES) diff --git a/cmake/Modules/FindMKLDNN.cmake b/cmake/Modules/FindMKLDNN.cmake index 4d3febbdfc49..e2f427be67c8 100644 --- a/cmake/Modules/FindMKLDNN.cmake +++ b/cmake/Modules/FindMKLDNN.cmake @@ -12,86 +12,118 @@ # MKLDNN_USE_NATIVE_ARCH : Whether native CPU instructions should be used in MKLDNN. This should be turned off for # general packaging to avoid incompatible CPU instructions. Default: OFF. 
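Two hunks in this section lean on the same configure-time trick of compiling and running a tiny probe program: the CUPTI change in Dependencies.cmake above (check_cxx_source_runs with the static libcupti pulled in via --whole-archive, aborting configuration if exceptions stop working) and the rocm_version.h detection that LoadHIP.cmake gains further down (try_run printing ROCM_VERSION_MAJOR/MINOR/PATCH). The sketch below reproduces the idea outside CMake; the compiler lookup and the example link flags are assumptions for illustration, not part of the build system.

    import os
    import shutil
    import subprocess
    import tempfile

    PROBE = """
    #include <stdexcept>
    int main() {
      try { throw std::runtime_error("error"); }
      catch (...) { return 0; }
      return 1;
    }
    """

    def probe_runs(extra_args=()):
        """Compile the throw/catch probe with extra link args and run it."""
        cxx = shutil.which("c++") or shutil.which("g++") or shutil.which("clang++")
        if cxx is None:
            raise RuntimeError("no C++ compiler on PATH")
        with tempfile.TemporaryDirectory() as tmp:
            src = os.path.join(tmp, "probe.cpp")
            exe = os.path.join(tmp, "probe")
            with open(src, "w") as f:
                f.write(PROBE)
            subprocess.run([cxx, src, "-o", exe, *list(extra_args)], check=True)
            return subprocess.run([exe]).returncode == 0

    if __name__ == "__main__":
        # With no extra flags this should print True; to mimic the CUPTI check,
        # pass something like
        #   ["-Wl,--whole-archive,/path/to/libcupti_static.a,--no-whole-archive", "-ldl"]
        # (placeholder path) and see whether the probe still exits cleanly.
        print(probe_runs())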
-IF (NOT MKLDNN_FOUND) +IF(NOT MKLDNN_FOUND) + SET(MKLDNN_LIBRARIES) + SET(MKLDNN_INCLUDE_DIR) -SET(MKLDNN_LIBRARIES) -SET(MKLDNN_INCLUDE_DIR) + SET(IDEEP_ROOT "${PROJECT_SOURCE_DIR}/third_party/ideep") + SET(MKLDNN_ROOT "${PROJECT_SOURCE_DIR}/third_party/ideep/mkl-dnn/third_party/oneDNN") + IF(NOT APPLE AND NOT WIN32 AND NOT BUILD_LITE_INTERPRETER) + MESSAGE("-- Will build oneDNN Graph") + SET(LLGA_ROOT "${PROJECT_SOURCE_DIR}/third_party/ideep/mkl-dnn") + SET(BUILD_ONEDNN_GRAPH ON) + ENDIF(NOT APPLE AND NOT WIN32 AND NOT BUILD_LITE_INTERPRETER) -SET(IDEEP_ROOT "${PROJECT_SOURCE_DIR}/third_party/ideep") -SET(MKLDNN_ROOT "${IDEEP_ROOT}/mkl-dnn/third_party/oneDNN") + FIND_PACKAGE(BLAS) + FIND_PATH(IDEEP_INCLUDE_DIR ideep.hpp PATHS ${IDEEP_ROOT} PATH_SUFFIXES include) + FIND_PATH(MKLDNN_INCLUDE_DIR dnnl.hpp dnnl.h PATHS ${MKLDNN_ROOT} PATH_SUFFIXES include) + IF(NOT MKLDNN_INCLUDE_DIR) + EXECUTE_PROCESS(COMMAND git${CMAKE_EXECUTABLE_SUFFIX} submodule update --init --jobs 0 mkl-dnn WORKING_DIRECTORY ${IDEEP_ROOT}) + FIND_PATH(MKLDNN_INCLUDE_DIR dnnl.hpp dnnl.h PATHS ${MKLDNN_ROOT} PATH_SUFFIXES include) + ENDIF(NOT MKLDNN_INCLUDE_DIR) + IF(BUILD_ONEDNN_GRAPH) + FIND_PATH(LLGA_INCLUDE_DIR oneapi/dnnl/dnnl_graph.hpp PATHS ${LLGA_ROOT} PATH_SUFFIXES include) + ENDIF(BUILD_ONEDNN_GRAPH) -FIND_PACKAGE(BLAS) -FIND_PATH(IDEEP_INCLUDE_DIR ideep.hpp PATHS ${IDEEP_ROOT} PATH_SUFFIXES include) -FIND_PATH(MKLDNN_INCLUDE_DIR dnnl.hpp dnnl.h PATHS ${MKLDNN_ROOT} PATH_SUFFIXES include) -IF (NOT MKLDNN_INCLUDE_DIR) - EXECUTE_PROCESS(COMMAND git${CMAKE_EXECUTABLE_SUFFIX} submodule update --init --jobs 0 mkl-dnn WORKING_DIRECTORY ${IDEEP_ROOT}) - FIND_PATH(MKLDNN_INCLUDE_DIR mkldnn.hpp mkldnn.h PATHS ${MKLDNN_ROOT} PATH_SUFFIXES include) -ENDIF(NOT MKLDNN_INCLUDE_DIR) + IF(NOT IDEEP_INCLUDE_DIR OR NOT MKLDNN_INCLUDE_DIR) + MESSAGE(STATUS "MKLDNN source files not found!") + RETURN() + ENDIF(NOT IDEEP_INCLUDE_DIR OR NOT MKLDNN_INCLUDE_DIR) + LIST(APPEND MKLDNN_INCLUDE_DIR ${IDEEP_INCLUDE_DIR}) + IF(BUILD_ONEDNN_GRAPH) + LIST(APPEND MKLDNN_INCLUDE_DIR ${LLGA_INCLUDE_DIR}) + ENDIF(BUILD_ONEDNN_GRAPH) + IF(MKL_FOUND) + ADD_DEFINITIONS(-DIDEEP_USE_MKL) + # Append to mkldnn dependencies + LIST(APPEND MKLDNN_LIBRARIES ${MKL_LIBRARIES}) + LIST(APPEND MKLDNN_INCLUDE_DIR ${MKL_INCLUDE_DIR}) + ELSE(MKL_FOUND) + SET(MKLDNN_USE_MKL "NONE" CACHE STRING "" FORCE) + ENDIF(MKL_FOUND) -IF (NOT IDEEP_INCLUDE_DIR OR NOT MKLDNN_INCLUDE_DIR) - MESSAGE(STATUS "MKLDNN source files not found!") - RETURN() -ENDIF(NOT IDEEP_INCLUDE_DIR OR NOT MKLDNN_INCLUDE_DIR) -LIST(APPEND MKLDNN_INCLUDE_DIR ${IDEEP_INCLUDE_DIR}) -IF(MKL_FOUND) - ADD_DEFINITIONS(-DIDEEP_USE_MKL) - # Append to mkldnn dependencies - LIST(APPEND MKLDNN_LIBRARIES ${MKL_LIBRARIES}) - LIST(APPEND MKLDNN_INCLUDE_DIR ${MKL_INCLUDE_DIR}) -ELSE(MKL_FOUND) - SET(MKLDNN_USE_MKL "NONE" CACHE STRING "" FORCE) -ENDIF(MKL_FOUND) + SET(MKL_cmake_included TRUE) + IF(NOT MKLDNN_CPU_RUNTIME) + SET(MKLDNN_CPU_RUNTIME "OMP" CACHE STRING "") + ELSEIF(MKLDNN_CPU_RUNTIME STREQUAL "TBB") + IF(USE_TBB) + MESSAGE(STATUS "MKL-DNN is using TBB") -SET(MKL_cmake_included TRUE) -IF (NOT MKLDNN_CPU_RUNTIME) - SET(MKLDNN_CPU_RUNTIME "OMP" CACHE STRING "") -ELSEIF (MKLDNN_CPU_RUNTIME STREQUAL "TBB") - IF (USE_TBB) - MESSAGE(STATUS "MKL-DNN is using TBB") + SET(TBB_cmake_included TRUE) + SET(Threading_cmake_included TRUE) - SET(TBB_cmake_included TRUE) - SET(Threading_cmake_included TRUE) - - SET(DNNL_CPU_THREADING_RUNTIME ${MKLDNN_CPU_RUNTIME}) - INCLUDE_DIRECTORIES(${TBB_INCLUDE_DIR}) - LIST(APPEND 
EXTRA_SHARED_LIBS TBB::tbb) - ELSE() - MESSAGE(FATAL_ERROR "MKLDNN_CPU_RUNTIME is set to TBB but TBB is not used") + SET(DNNL_CPU_THREADING_RUNTIME ${MKLDNN_CPU_RUNTIME}) + INCLUDE_DIRECTORIES(${TBB_INCLUDE_DIR}) + LIST(APPEND EXTRA_SHARED_LIBS TBB::tbb) + ELSE() + MESSAGE(FATAL_ERROR "MKLDNN_CPU_RUNTIME is set to TBB but TBB is not used") + ENDIF() ENDIF() -ENDIF() -MESSAGE(STATUS "MKLDNN_CPU_RUNTIME = ${MKLDNN_CPU_RUNTIME}") + MESSAGE(STATUS "MKLDNN_CPU_RUNTIME = ${MKLDNN_CPU_RUNTIME}") -SET(MKLDNN_CPU_RUNTIME ${MKLDNN_CPU_RUNTIME} CACHE STRING "" FORCE) -SET(DNNL_BUILD_TESTS FALSE CACHE BOOL "" FORCE) -SET(DNNL_BUILD_EXAMPLES FALSE CACHE BOOL "" FORCE) -SET(DNNL_LIBRARY_TYPE STATIC CACHE STRING "" FORCE) -SET(DNNL_ENABLE_PRIMITIVE_CACHE TRUE CACHE BOOL "" FORCE) -IF(MKLDNN_USE_NATIVE_ARCH) # Disable HostOpts in MKLDNN unless MKLDNN_USE_NATIVE_ARCH is set. - SET(DNNL_ARCH_OPT_FLAGS "HostOpts" CACHE STRING "" FORCE) -ELSE() - IF(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - IF(CPU_INTEL) - SET(DNNL_ARCH_OPT_FLAGS "-msse4" CACHE STRING "" FORCE) - ENDIF() + SET(MKLDNN_CPU_RUNTIME ${MKLDNN_CPU_RUNTIME} CACHE STRING "" FORCE) + SET(DNNL_BUILD_TESTS FALSE CACHE BOOL "" FORCE) + SET(DNNL_BUILD_EXAMPLES FALSE CACHE BOOL "" FORCE) + SET(DNNL_LIBRARY_TYPE STATIC CACHE STRING "" FORCE) + SET(DNNL_ENABLE_PRIMITIVE_CACHE TRUE CACHE BOOL "" FORCE) + IF(BUILD_ONEDNN_GRAPH) + SET(DNNL_GRAPH_LIBRARY_TYPE STATIC CACHE STRING "" FORCE) + ENDIF(BUILD_ONEDNN_GRAPH) + IF(MKLDNN_USE_NATIVE_ARCH) # Disable HostOpts in MKLDNN unless MKLDNN_USE_NATIVE_ARCH is set. + SET(DNNL_ARCH_OPT_FLAGS "HostOpts" CACHE STRING "" FORCE) ELSE() - SET(DNNL_ARCH_OPT_FLAGS "" CACHE STRING "" FORCE) + IF(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + IF(CPU_INTEL) + SET(DNNL_ARCH_OPT_FLAGS "-msse4" CACHE STRING "" FORCE) + ENDIF() + ELSE() + SET(DNNL_ARCH_OPT_FLAGS "" CACHE STRING "" FORCE) + ENDIF() ENDIF() -ENDIF() -ADD_SUBDIRECTORY(${MKLDNN_ROOT}) -IF(NOT TARGET dnnl) - MESSAGE("Failed to include MKL-DNN target") - RETURN() -ENDIF(NOT TARGET dnnl) + IF(BUILD_ONEDNN_GRAPH) + ADD_SUBDIRECTORY(${LLGA_ROOT}) + IF(NOT TARGET dnnl_graph) + MESSAGE("Failed to include LLGA target") + RETURN() + ENDIF(NOT TARGET dnnl_graph) + + IF(CMAKE_COMPILER_IS_GNUCC) + TARGET_COMPILE_OPTIONS(dnnl_graph PRIVATE -Wno-maybe-uninitialized) + TARGET_COMPILE_OPTIONS(dnnl_graph PRIVATE -Wno-strict-overflow) + TARGET_COMPILE_OPTIONS(dnnl_graph PRIVATE -Wno-error=strict-overflow) + ENDIF(CMAKE_COMPILER_IS_GNUCC) + ELSE(BUILD_ONEDNN_GRAPH) + ADD_SUBDIRECTORY(${MKLDNN_ROOT}) + ENDIF(BUILD_ONEDNN_GRAPH) + + IF(NOT TARGET dnnl) + MESSAGE("Failed to include MKL-DNN target") + RETURN() + ENDIF(NOT TARGET dnnl) -IF(NOT APPLE AND CMAKE_COMPILER_IS_GNUCC) - TARGET_COMPILE_OPTIONS(dnnl PRIVATE -Wno-maybe-uninitialized) - TARGET_COMPILE_OPTIONS(dnnl PRIVATE -Wno-strict-overflow) - TARGET_COMPILE_OPTIONS(dnnl PRIVATE -Wno-error=strict-overflow) -ENDIF(NOT APPLE AND CMAKE_COMPILER_IS_GNUCC) -LIST(APPEND MKLDNN_LIBRARIES dnnl) + IF(NOT APPLE AND CMAKE_COMPILER_IS_GNUCC) + TARGET_COMPILE_OPTIONS(dnnl PRIVATE -Wno-maybe-uninitialized) + TARGET_COMPILE_OPTIONS(dnnl PRIVATE -Wno-strict-overflow) + TARGET_COMPILE_OPTIONS(dnnl PRIVATE -Wno-error=strict-overflow) + ENDIF(NOT APPLE AND CMAKE_COMPILER_IS_GNUCC) + LIST(APPEND MKLDNN_LIBRARIES ${MKL_OPENMP_LIBRARY}) + IF(BUILD_ONEDNN_GRAPH) + LIST(APPEND MKLDNN_LIBRARIES "$") + ENDIF(BUILD_ONEDNN_GRAPH) + LIST(APPEND MKLDNN_LIBRARIES dnnl) -SET(MKLDNN_FOUND 
TRUE) -MESSAGE(STATUS "Found MKL-DNN: TRUE") + SET(MKLDNN_FOUND TRUE) + MESSAGE(STATUS "Found MKL-DNN: TRUE") ENDIF(NOT MKLDNN_FOUND) diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 2040120701f1..1a99d1e567a1 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -138,6 +138,7 @@ function(caffe2_print_configuration_summary) message(STATUS " USE_METAL : ${USE_METAL}") message(STATUS " USE_PYTORCH_METAL : ${USE_PYTORCH_METAL}") message(STATUS " USE_PYTORCH_METAL_EXPORT : ${USE_PYTORCH_METAL_EXPORT}") + message(STATUS " USE_MPS : ${USE_MPS}") message(STATUS " USE_FFTW : ${USE_FFTW}") message(STATUS " USE_MKL : ${CAFFE2_USE_MKL}") message(STATUS " USE_MKLDNN : ${USE_MKLDNN}") @@ -148,6 +149,7 @@ function(caffe2_print_configuration_summary) message(STATUS " USE_NCCL : ${USE_NCCL}") if(${USE_NCCL}) message(STATUS " USE_SYSTEM_NCCL : ${USE_SYSTEM_NCCL}") + message(STATUS " USE_NCCL_WITH_UCC : ${USE_NCCL_WITH_UCC}") endif() message(STATUS " USE_NNPACK : ${USE_NNPACK}") message(STATUS " USE_NUMPY : ${USE_NUMPY}") @@ -171,6 +173,7 @@ function(caffe2_print_configuration_summary) message(STATUS " USE_PROF : ${USE_PROF}") message(STATUS " USE_QNNPACK : ${USE_QNNPACK}") message(STATUS " USE_PYTORCH_QNNPACK : ${USE_PYTORCH_QNNPACK}") + message(STATUS " USE_XNNPACK : ${USE_XNNPACK}") message(STATUS " USE_REDIS : ${USE_REDIS}") message(STATUS " USE_ROCKSDB : ${USE_ROCKSDB}") message(STATUS " USE_ZMQ : ${USE_ZMQ}") @@ -185,9 +188,9 @@ function(caffe2_print_configuration_summary) message(STATUS " SELECTED_OP_LIST : ${SELECTED_OP_LIST}") endif() message(STATUS " USE_DEPLOY : ${USE_DEPLOY}") - message(STATUS " USE_BREAKPAD : ${USE_BREAKPAD}") message(STATUS " Public Dependencies : ${Caffe2_PUBLIC_DEPENDENCY_LIBS}") message(STATUS " Private Dependencies : ${Caffe2_DEPENDENCY_LIBS}") # coreml message(STATUS " USE_COREML_DELEGATE : ${USE_COREML_DELEGATE}") + message(STATUS " BUILD_LAZY_TS_BACKEND : ${BUILD_LAZY_TS_BACKEND}") endfunction() diff --git a/cmake/TorchConfig.cmake.in b/cmake/TorchConfig.cmake.in index 0d8b15bd14a8..a57345f51cd4 100644 --- a/cmake/TorchConfig.cmake.in +++ b/cmake/TorchConfig.cmake.in @@ -127,10 +127,6 @@ if(@USE_KINETO@) append_torchlib_if_found(kineto) endif() -if(@USE_DEPLOY@) - append_torchlib_if_found(torch_deploy) -endif() - if(@USE_CUDA@) if(MSVC) if(NOT NVTOOLEXT_HOME) diff --git a/cmake/VulkanCodegen.cmake b/cmake/VulkanCodegen.cmake index 8f6f4b538dd0..c39b54df3af3 100644 --- a/cmake/VulkanCodegen.cmake +++ b/cmake/VulkanCodegen.cmake @@ -7,10 +7,10 @@ set(VULKAN_GEN_OUTPUT_PATH "${CMAKE_BINARY_DIR}/vulkan/ATen/native/vulkan") set(VULKAN_GEN_ARG_ENV "") if(USE_VULKAN_RELAXED_PRECISION) - string(APPEND VULKAN_GEN_ARG_ENV "precision=mediump") + list(APPEND VULKAN_GEN_ARG_ENV "precision=mediump") endif() if(USE_VULKAN_FP16_INFERENCE) - string(APPEND VULKAN_GEN_ARG_ENV "format=rgba16f") + list(APPEND VULKAN_GEN_ARG_ENV "format=rgba16f") endif() if(USE_VULKAN_SHADERC_RUNTIME) diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake index fa481dda1c53..0202f15270b2 100644 --- a/cmake/public/LoadHIP.cmake +++ b/cmake/public/LoadHIP.cmake @@ -5,7 +5,11 @@ if(NOT DEFINED ENV{ROCM_PATH}) else() set(ROCM_PATH $ENV{ROCM_PATH}) endif() - +if(NOT DEFINED ENV{ROCM_INCLUDE_DIRS}) + set(ROCM_INCLUDE_DIRS ${ROCM_PATH}/include) +else() + set(ROCM_INCLUDE_DIRS $ENV{ROCM_INCLUDE_DIRS}) +endif() # HIP_PATH if(NOT DEFINED ENV{HIP_PATH}) set(HIP_PATH ${ROCM_PATH}/hip) @@ -151,8 +155,47 @@ if(HIP_FOUND) set(PYTORCH_FOUND_HIP TRUE) # Find ROCM version for checks - 
file(READ "${ROCM_PATH}/.info/version-dev" ROCM_VERSION_DEV_RAW) - string(REGEX MATCH "^([0-9]+)\.([0-9]+)\.([0-9]+)-.*$" ROCM_VERSION_DEV_MATCH ${ROCM_VERSION_DEV_RAW}) + # ROCM 5.0 and later will have header api for version management + if(EXISTS ${ROCM_INCLUDE_DIRS}/rocm_version.h) + + set(PROJECT_RANDOM_BINARY_DIR "${PROJECT_BINARY_DIR}") + set(file "${PROJECT_BINARY_DIR}/detect_rocm_version.cc") + file(WRITE ${file} "" + "#include \n" + "#include \n" + + "#ifndef ROCM_VERSION_PATCH\n" + "#define ROCM_VERSION_PATCH 0\n" + "#endif\n" + "#define STRINGIFYHELPER(x) #x\n" + "#define STRINGIFY(x) STRINGIFYHELPER(x)\n" + "int main() {\n" + " printf(\"%d.%d.%s\", ROCM_VERSION_MAJOR, ROCM_VERSION_MINOR, STRINGIFY(ROCM_VERSION_PATCH));\n" + " return 0;\n" + "}\n" + ) + + try_run(run_result compile_result ${PROJECT_RANDOM_BINARY_DIR} ${file} + CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${ROCM_INCLUDE_DIRS}" + RUN_OUTPUT_VARIABLE rocm_version_from_header + COMPILE_OUTPUT_VARIABLE output_var + ) + # We expect the compile to be successful if the include directory exists. + if(NOT compile_result) + message(FATAL_ERROR "Caffe2: Couldn't determine version from header: " ${output_var}) + endif() + message(STATUS "Caffe2: Header version is: " ${rocm_version_from_header}) + set(ROCM_VERSION_DEV_RAW ${rocm_version_from_header}) + message("\n***** ROCm version from rocm_version.h ****\n") + + # ROCM < 4.5, we don't have the header api file, use flat file + else() + file(READ "${ROCM_PATH}/.info/version-dev" ROCM_VERSION_DEV_RAW) + message("\n***** ROCm version from ${ROCM_PATH}/.info/version-dev ****\n") + endif() + + string(REGEX MATCH "^([0-9]+)\.([0-9]+)\.([0-9]+).*$" ROCM_VERSION_DEV_MATCH ${ROCM_VERSION_DEV_RAW}) + if(ROCM_VERSION_DEV_MATCH) set(ROCM_VERSION_DEV_MAJOR ${CMAKE_MATCH_1}) set(ROCM_VERSION_DEV_MINOR ${CMAKE_MATCH_2}) @@ -160,7 +203,7 @@ if(HIP_FOUND) set(ROCM_VERSION_DEV "${ROCM_VERSION_DEV_MAJOR}.${ROCM_VERSION_DEV_MINOR}.${ROCM_VERSION_DEV_PATCH}") math(EXPR ROCM_VERSION_DEV_INT "(${ROCM_VERSION_DEV_MAJOR}*10000) + (${ROCM_VERSION_DEV_MINOR}*100) + ${ROCM_VERSION_DEV_PATCH}") endif() - message("\n***** ROCm version from ${ROCM_PATH}/.info/version-dev ****\n") + message("ROCM_VERSION_DEV: ${ROCM_VERSION_DEV}") message("ROCM_VERSION_DEV_MAJOR: ${ROCM_VERSION_DEV_MAJOR}") message("ROCM_VERSION_DEV_MINOR: ${ROCM_VERSION_DEV_MINOR}") @@ -187,21 +230,40 @@ if(HIP_FOUND) set(CMAKE_HCC_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) ### Remove setting of Flags when FindHIP.CMake PR #558 is accepted.### - set(hip_DIR ${HIP_PATH}/lib/cmake/hip) - set(hsa-runtime64_DIR ${ROCM_PATH}/lib/cmake/hsa-runtime64) - set(AMDDeviceLibs_DIR ${ROCM_PATH}/lib/cmake/AMDDeviceLibs) - set(amd_comgr_DIR ${ROCM_PATH}/lib/cmake/amd_comgr) - set(rocrand_DIR ${ROCRAND_PATH}/lib/cmake/rocrand) - set(hiprand_DIR ${HIPRAND_PATH}/lib/cmake/hiprand) - set(rocblas_DIR ${ROCBLAS_PATH}/lib/cmake/rocblas) - set(miopen_DIR ${MIOPEN_PATH}/lib/cmake/miopen) - set(rocfft_DIR ${ROCFFT_PATH}/lib/cmake/rocfft) - set(hipfft_DIR ${HIPFFT_PATH}/lib/cmake/hipfft) - set(hipsparse_DIR ${HIPSPARSE_PATH}/lib/cmake/hipsparse) - set(rccl_DIR ${RCCL_PATH}/lib/cmake/rccl) - set(rocprim_DIR ${ROCPRIM_PATH}/lib/cmake/rocprim) - set(hipcub_DIR ${HIPCUB_PATH}/lib/cmake/hipcub) - set(rocthrust_DIR ${ROCTHRUST_PATH}/lib/cmake/rocthrust) + # As of ROCm 5.1.x, all *.cmake files are under /opt/rocm/lib/cmake/ + if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "5.1.0") + set(hip_DIR ${ROCM_PATH}/lib/cmake/hip) + set(hsa-runtime64_DIR ${ROCM_PATH}/lib/cmake/hsa-runtime64) + 
set(AMDDeviceLibs_DIR ${ROCM_PATH}/lib/cmake/AMDDeviceLibs) + set(amd_comgr_DIR ${ROCM_PATH}/lib/cmake/amd_comgr) + set(rocrand_DIR ${ROCM_PATH}/lib/cmake/rocrand) + set(hiprand_DIR ${ROCM_PATH}/lib/cmake/hiprand) + set(rocblas_DIR ${ROCM_PATH}/lib/cmake/rocblas) + set(miopen_DIR ${ROCM_PATH}/lib/cmake/miopen) + set(rocfft_DIR ${ROCM_PATH}/lib/cmake/rocfft) + set(hipfft_DIR ${ROCM_PATH}/lib/cmake/hipfft) + set(hipsparse_DIR ${ROCM_PATH}/lib/cmake/hipsparse) + set(rccl_DIR ${ROCM_PATH}/lib/cmake/rccl) + set(rocprim_DIR ${ROCM_PATH}/lib/cmake/rocprim) + set(hipcub_DIR ${ROCM_PATH}/lib/cmake/hipcub) + set(rocthrust_DIR ${ROCM_PATH}/lib/cmake/rocthrust) + else() + set(hip_DIR ${HIP_PATH}/lib/cmake/hip) + set(hsa-runtime64_DIR ${ROCM_PATH}/lib/cmake/hsa-runtime64) + set(AMDDeviceLibs_DIR ${ROCM_PATH}/lib/cmake/AMDDeviceLibs) + set(amd_comgr_DIR ${ROCM_PATH}/lib/cmake/amd_comgr) + set(rocrand_DIR ${ROCRAND_PATH}/lib/cmake/rocrand) + set(hiprand_DIR ${HIPRAND_PATH}/lib/cmake/hiprand) + set(rocblas_DIR ${ROCBLAS_PATH}/lib/cmake/rocblas) + set(miopen_DIR ${MIOPEN_PATH}/lib/cmake/miopen) + set(rocfft_DIR ${ROCFFT_PATH}/lib/cmake/rocfft) + set(hipfft_DIR ${HIPFFT_PATH}/lib/cmake/hipfft) + set(hipsparse_DIR ${HIPSPARSE_PATH}/lib/cmake/hipsparse) + set(rccl_DIR ${RCCL_PATH}/lib/cmake/rccl) + set(rocprim_DIR ${ROCPRIM_PATH}/lib/cmake/rocprim) + set(hipcub_DIR ${HIPCUB_PATH}/lib/cmake/hipcub) + set(rocthrust_DIR ${ROCTHRUST_PATH}/lib/cmake/rocthrust) + endif() find_package_and_print_version(hip REQUIRED) find_package_and_print_version(hsa-runtime64 REQUIRED) @@ -221,13 +283,8 @@ if(HIP_FOUND) find_package_and_print_version(hipcub REQUIRED) find_package_and_print_version(rocthrust REQUIRED) - if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "4.1.0") - message("ROCm version >= 4.1; enabling asserts") - else() - # Disable Asserts In Code (Can't use asserts on HIP stack.) - add_definitions(-DNDEBUG) - message("ROCm version < 4.1; disablng asserts") - endif() + # Disable Asserts In Code (Can't use asserts on HIP stack.) 
+ add_definitions(-DNDEBUG) if(HIP_COMPILER STREQUAL clang) set(hip_library_name amdhip64) diff --git a/cmake/public/cuda.cmake b/cmake/public/cuda.cmake index 30d3b52d4883..7f6272e95a6a 100644 --- a/cmake/public/cuda.cmake +++ b/cmake/public/cuda.cmake @@ -38,6 +38,12 @@ endif() # Enable CUDA language support set(CUDAToolkit_ROOT "${CUDA_TOOLKIT_ROOT_DIR}") +# Pass clang as host compiler, which according to the docs +# Must be done before CUDA language is enabled, see mast be done before +# see https://cmake.org/cmake/help/v3.15/variable/CMAKE_CUDA_HOST_COMPILER.html +if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") + set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_C_COMPILER}") +endif() enable_language(CUDA) set(CMAKE_CUDA_STANDARD ${CMAKE_CXX_STANDARD}) set(CMAKE_CUDA_STANDARD_REQUIRED ON) @@ -318,15 +324,9 @@ if(CAFFE2_USE_CUDNN) TARGET caffe2::cudnn-private PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${CUDNN_INCLUDE_PATH}) if(CUDNN_STATIC AND NOT WIN32) - if(USE_WHOLE_CUDNN) - set_property( - TARGET caffe2::cudnn-private PROPERTY INTERFACE_LINK_LIBRARIES - "-Wl,--whole-archive,\"${CUDNN_LIBRARY_PATH}\" -Wl,--no-whole-archive") - else() - set_property( - TARGET caffe2::cudnn-private PROPERTY INTERFACE_LINK_LIBRARIES - ${CUDNN_LIBRARY_PATH}) - endif() + set_property( + TARGET caffe2::cudnn-private PROPERTY INTERFACE_LINK_LIBRARIES + ${CUDNN_LIBRARY_PATH}) set_property( TARGET caffe2::cudnn-private APPEND PROPERTY INTERFACE_LINK_LIBRARIES "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libculibos.a" dl) diff --git a/cmake/public/mkldnn.cmake b/cmake/public/mkldnn.cmake index 87935625f9bf..50404d3b30d0 100644 --- a/cmake/public/mkldnn.cmake +++ b/cmake/public/mkldnn.cmake @@ -16,3 +16,15 @@ set_property( set_property( TARGET caffe2::mkldnn PROPERTY INTERFACE_LINK_LIBRARIES ${MKLDNN_LIBRARIES}) +if(BUILD_ONEDNN_GRAPH) + if(NOT TARGET caffe2::dnnl_graph) + add_library(caffe2::dnnl_graph INTERFACE IMPORTED) + endif() + + set_property( + TARGET caffe2::dnnl_graph PROPERTY INTERFACE_INCLUDE_DIRECTORIES + ${MKLDNN_INCLUDE_DIR}) + set_property( + TARGET caffe2::dnnl_graph PROPERTY INTERFACE_LINK_LIBRARIES + ${MKLDNN_LIBRARIES}) +endif() diff --git a/cmake/public/utils.cmake b/cmake/public/utils.cmake index 3535a5cf7ba7..0daa6b7f6a3e 100644 --- a/cmake/public/utils.cmake +++ b/cmake/public/utils.cmake @@ -449,7 +449,6 @@ function(torch_compile_options libname) -Wall -Wextra -Wno-unused-parameter - -Wno-unused-variable -Wno-unused-function -Wno-unused-result -Wno-unused-local-typedefs @@ -470,8 +469,7 @@ function(torch_compile_options libname) if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") list(APPEND private_compile_options -Wno-range-loop-analysis) - endif() - if(NOT APPLE) + else() list(APPEND private_compile_options # Considered to be flaky. 
See the discussion at # https://github.com/pytorch/pytorch/pull/9608 diff --git a/docker.Makefile b/docker.Makefile index dc7942518f9b..11c438d0fd22 100644 --- a/docker.Makefile +++ b/docker.Makefile @@ -8,7 +8,7 @@ $(warning WARNING: No docker user found using results from whoami) DOCKER_ORG = $(shell whoami) endif -CUDA_VERSION = 11.1 +CUDA_VERSION = 11.3 CUDNN_VERSION = 8 BASE_RUNTIME = ubuntu:18.04 BASE_DEVEL = nvidia/cuda:$(CUDA_VERSION)-cudnn$(CUDNN_VERSION)-devel-ubuntu18.04 diff --git a/docker/cpu-blis/Dockerfile b/docker/cpu-blis/Dockerfile deleted file mode 100644 index adfce7e3ad7a..000000000000 --- a/docker/cpu-blis/Dockerfile +++ /dev/null @@ -1,68 +0,0 @@ -# syntax = docker/dockerfile:experimental -# -# NOTE: To build this you will need a docker version > 18.06 with -# experimental enabled and DOCKER_BUILDKIT=1 -# -# For reference: -# https://docs.docker.com/develop/develop-images/build_enhancements/ -# -# This Dockerfile will build Docker Image with PyTorch + DNNL + AMD BLIS and Torchvision installed for CPU only -# -# Example commandline to build PyTorch with AMD BLIS: -# sudo DOCKER_BUILDKIT=1 docker build . -t docker-image-repo-name -# Example commandline to run the built docker container: -# sudo docker run --name container-name -it docker-image-repo-name - -ARG BASE_IMAGE=ubuntu:18.04 -ARG PYTHON_VERSION=3.8 - -FROM ${BASE_IMAGE} as dev-base -CMD echo "Welcome to the PyTorch Docker Container!" && \ - echo "Version of PyTorch Installed: " && python -c 'import torch; print(torch.__version__)' && \ - echo "Version of Torchvision Installed: " && python -c 'import torchvision; print(torchvision.__version__)' && \ - echo "LDD output showing successful linking with BLIS: " && ldd /opt/conda/lib/python3.8/site-packages/torch/_C.cpython-38-x86_64-linux-gnu.so && \ - /bin/bash -RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \ - apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - ca-certificates \ - ccache \ - cmake \ - curl \ - git \ - libjpeg-dev \ - libpng-dev \ - vim \ - wget && \ - rm -rf /var/lib/apt/lists/* -RUN /usr/sbin/update-ccache-symlinks -RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache -ENV PATH /opt/conda/bin:$PATH - -FROM dev-base as conda -RUN wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \ - chmod +x ~/miniconda.sh && \ - ~/miniconda.sh -b -p /opt/conda && \ - rm ~/miniconda.sh && \ - /opt/conda/bin/conda install -y python=${PYTHON_VERSION} conda-build && \ - /opt/conda/bin/conda install -y nomkl pyyaml numpy ipython ninja setuptools cmake cffi typing future && \ - /opt/conda/bin/conda clean -ya - -RUN conda install typing_extensions - -WORKDIR /root -ARG BLIS_URL=https://github.com/amd/blis.git -# Download, Build BLIS with multithreading support and place necessary library and include files at BLIS_HOME/lib and BLIS_HOME/include respectively -RUN git clone ${BLIS_URL} && cd blis && \ - ./configure --prefix=/root/BLISBuild --enable-cblas --enable-threading=openmp auto && make -j && make install && \ - if [ ! 
-e /root/BLISBuild/lib/libblis.so ] ; then cp /root/BLISBuild/lib/libblis*.so /root/BLISBuild/lib/libblis.so ; fi - -# Build PyTorch with DNNL+BLIS -RUN git clone https://github.com/pytorch/pytorch.git && cd pytorch && \ - git submodule update --init --recursive --jobs 0 && \ - export PATH=/root/BLISBuild/include/blis:$PATH LD_LIBRARY_PATH=/root/BLISBuild/lib:$LD_LIBRARY_PATH && \ - export BLIS_HOME=/root/BLISBuild BLAS=BLIS USE_MKLDNN_CBLAS=ON WITH_BLAS=blis && python setup.py install - -# Build Torchvision -RUN git clone https://github.com/pytorch/vision.git && cd vision && \ - python setup.py install diff --git a/docker/pytorch/ubuntu_cpu_gpu/Dockerfile b/docker/pytorch/ubuntu_cpu_gpu/Dockerfile deleted file mode 100644 index f7a1af093027..000000000000 --- a/docker/pytorch/ubuntu_cpu_gpu/Dockerfile +++ /dev/null @@ -1,105 +0,0 @@ -# This is the Dockerfile for an image that is ready to build PyTorch from source. -# PyTorch is not yet downloaded nor installed. -# -# Available BASE_IMAGE options: -# nvidia/cuda:11.2.1-cudnn8-devel-ubuntu18.04 -# nvidia/cuda:10.2-cudnn8-devel-ubuntu18.04 -# nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04 -# -# Available MAGMA_CUDA_VERSION options (for GPU/CUDA builds): -# magma-cuda112 -# magma-cuda111 -# magma-cuda102 -# magma-cuda101 -# -# Available TORCH_CUDA_ARCH_LIST_VAR options (for GPU/CUDA builds): -# "3.7+PTX;5.0;6.0;6.1;7.0;7.5;8.0;8.6" for CUDA 11.2/11.1 -# "3.7+PTX;5.0;6.0;6.1;7.0;7.5;8.0" for CUDA 11.0 -# "3.7+PTX;5.0;6.0;6.1;7.0;7.5" for CUDA 10.2/10.1 -# -# Build image with CPU or GPU support with the following command: -# nvidia-docker build -t ${CONTAINER_TAG} -# --build-arg BASE_IMAGE=${BASE_IMAGE_VER} \ -# --build-arg PYTHON_VERSION=${PYTHON_VER} \ -# --build-arg MAGMA_CUDA_VERSION=${MAGMA_CUDA_VER} \ #(for GPU/CUDA builds) -# --build-arg TORCH_CUDA_ARCH_LIST_VAR=${TORCH_CUDA_ARCH_LIST} \ #(for GPU/CUDA builds): -# . -# -# For example, for a CPU Ubuntu 18.04 and Python 3.7.6 build: -# docker build -t ubuntu_1804_py_37_cpu_dev \ -# --build-arg BASE_IMAGE=ubuntu:18.04 \ -# --build-arg PYTHON_VERSION=3.7.6 . -# -# For example, for a CUDA 10.2 Ubuntu 18.04 and Python 3.9.1 build: -# nvidia-docker build -t ubuntu_1804_py_39_cuda_102_cudnn_8_dev \ -# --build-arg BASE_IMAGE=nvidia/cuda:10.2-cudnn8-devel-ubuntu18.04 \ -# --build-arg PYTHON_VERSION=3.9.1 \ -# --build-arg MAGMA_CUDA_VERSION=magma-cuda102 \ -# --build-arg TORCH_CUDA_ARCH_LIST_VAR="3.7+PTX;5.0;6.0;6.1;7.0;7.5" . 
- -ARG BASE_IMAGE -FROM ${BASE_IMAGE} as dev-base -RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - ca-certificates \ - ccache \ - cmake \ - curl \ - git \ - git-lfs \ - libjpeg-dev \ - libpng-dev \ - openmpi-bin \ - wget && \ - rm -rf /var/lib/apt/lists/* -RUN /usr/sbin/update-ccache-symlinks -RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache -ENV PATH /opt/conda/bin:$PATH - -FROM dev-base as conda -ARG PYTHON_VERSION -ENV PYTHON_VER=$PYTHON_VERSION -RUN curl -fsSL -v -o ~/miniconda.sh -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ - chmod +x ~/miniconda.sh && \ - ~/miniconda.sh -b -p /opt/conda && \ - rm ~/miniconda.sh && \ - /opt/conda/bin/conda install -y python=${PYTHON_VER} conda-build pyyaml numpy ipython cython typing typing_extensions mkl mkl-include ninja && \ - /opt/conda/bin/conda clean -ya - -ARG MAGMA_CUDA_VERSION -RUN if [ -z "$MAGMA_CUDA_VERSION" ] ; then \ - echo "Building with CPU support ..."; \ - else \ - echo "Building with GPU/CUDA support ..."; \ - conda install -y -c pytorch ${MAGMA_CUDA_VERSION} && conda clean -ya; \ - fi - -# Necessary step for Azure Pipelines Docker Build -# Docker image is build by root, but the build process -# is running from a non-priveledged user -RUN chmod -R ugo+rw /opt/conda/ - -WORKDIR /opt/pytorch -# Environment variables for PyTorch -ARG TORCH_CUDA_ARCH_LIST_VAR -RUN if [ -z "$TORCH_CUDA_ARCH_LIST_VAR" ] ; then \ - echo "Continuing CPU build ..."; \ - else \ - echo "Setting CUDA env vars and installing openmpi ..."; \ - # Set MPI links to avoid libmpi_cxx.so.1 not found error - ln -s /usr/lib/x86_64-linux-gnu/libmpi_cxx.so.20 /usr/lib/x86_64-linux-gnu/libmpi_cxx.so.1; \ - ln -s /usr/lib/x86_64-linux-gnu/libmpi.so.20.10.1 /usr/lib/x86_64-linux-gnu/libmpi.so.12; \ - fi -# If the build argument TORCH_CUDA_ARCH_LIST_VAR is given, container will be -# set for GPU/CUDA build, else for CPU build. -ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST_VAR:+${TORCH_CUDA_ARCH_LIST_VAR}} -ENV TORCH_NVCC_FLAGS=${TORCH_CUDA_ARCH_LIST_VAR:+"-Xfatbin -compress-all"} -ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" - -# Install Azure CLI and update its site packages -RUN curl -sL https://aka.ms/InstallAzureCLIDeb | bash -RUN pip install --upgrade pip --target /opt/az/lib/python3.6/site-packages/ - -# Install MKL -RUN wget https://raw.githubusercontent.com/pytorch/builder/f121b0919d799b5ea2030c92ca266cf4cddf6656/common/install_mkl.sh -RUN bash ./install_mkl.sh && rm install_mkl.sh diff --git a/docs/Makefile b/docs/Makefile index 28d910a89b49..b9719df7ade5 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -15,6 +15,10 @@ help: figures: @$(PYCMD) source/scripts/build_activation_images.py + @$(PYCMD) source/scripts/build_quantization_configs.py + +onnx_supported_aten_ops: + @$(PYCMD) source/scripts/build_onnx_supported_aten_op_csv_table.py docset: html doc2dash --name $(SPHINXPROJ) --icon $(SOURCEDIR)/_static/img/pytorch-logo-flame.png --enable-js --online-redirect-url https://pytorch.org/docs/ --force $(BUILDDIR)/html/ @@ -30,13 +34,13 @@ html-stable: # See conf.py for more details. RELEASE=1 make html -.PHONY: help Makefile docset +.PHONY: help Makefile docset onnx_supported_aten_ops # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
-%: Makefile figures +%: Makefile figures onnx_supported_aten_ops @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) clean: @echo "Removing everything under 'build' and 'source/generated'.." - @rm -rf $(BUILDDIR)/html/ $(BUILDDIR)/doctrees $(SOURCEDIR)/generated + @rm -rf $(BUILDDIR)/html/ $(BUILDDIR)/doctrees $(SOURCEDIR)/generated $(BUILDDIR)/auto_gen_aten_op_list.csv diff --git a/docs/cpp/requirements.txt b/docs/cpp/requirements.txt index f5d49d2ebe91..ca3eb7da6846 100644 --- a/docs/cpp/requirements.txt +++ b/docs/cpp/requirements.txt @@ -1,4 +1,5 @@ sphinx==3.1.2 +Jinja2==3.0.* breathe==4.25.0 exhale==0.2.3 docutils==0.16 diff --git a/docs/cpp/source/Doxyfile b/docs/cpp/source/Doxyfile index 7785239d1539..a17d742a461e 100644 --- a/docs/cpp/source/Doxyfile +++ b/docs/cpp/source/Doxyfile @@ -44,12 +44,14 @@ INPUT = ../../../aten/src/ATen/ATen.h \ ../../../aten/src/ATen/Scalar.h \ ../../../aten/src/ATen/TensorOptions.h \ ../../../aten/src/ATen/core/Tensor.h \ + ../../../aten/src/ATen/native/TensorShape.h \ ../../../build/aten/src/ATen/Functions.h \ ../../../build/aten/src/ATen/core/TensorBody.h \ ../../../c10/core/Device.h \ ../../../c10/core/DeviceType.h \ ../../../c10/util/Half.h \ ../../../c10/util/ArrayRef.h \ + ../../../c10/util/OptionalArrayRef.h \ ../../../c10/util/Exception.h \ ../../../c10/util/Optional.h \ ../../../c10/cuda/CUDAGuard.h \ diff --git a/docs/cpp/source/check-doxygen.sh b/docs/cpp/source/check-doxygen.sh index 6ff6832cd056..a094af941278 100755 --- a/docs/cpp/source/check-doxygen.sh +++ b/docs/cpp/source/check-doxygen.sh @@ -16,12 +16,11 @@ pushd "$(dirname "$0")/../../.." cp torch/_utils_internal.py tools/shared -python -m tools.codegen.gen +python -m torchgen.gen python tools/setup_helpers/generate_code.py \ --native-functions-path aten/src/ATen/native/native_functions.yaml \ - --nn-path aten/src - + --tags-path aten/src/ATen/native/tags.yaml popd # Run doxygen and log all output. diff --git a/docs/cpp/source/conf.py b/docs/cpp/source/conf.py index 3bc56ed060aa..54cd6acdb8fd 100644 --- a/docs/cpp/source/conf.py +++ b/docs/cpp/source/conf.py @@ -119,8 +119,8 @@ # General information about the project. project = 'PyTorch' -copyright = '2019, Torch Contributors' -author = 'Torch Contributors' +copyright = '2022, PyTorch Contributors' +author = 'PyTorch Contributors' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the diff --git a/docs/requirements.txt b/docs/requirements.txt index 34ec6078225b..57bee508f61b 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,5 @@ sphinx==3.5.4 +Jinja2==3.0.* docutils==0.16 -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme sphinxcontrib.katex @@ -7,3 +8,4 @@ tensorboard # required to build torch.distributed.elastic.rendezvous.etcd* docs python-etcd>=0.4.5 sphinx_copybutton +sphinx-panels diff --git a/docs/source/amp.rst b/docs/source/amp.rst index 1f70f2c6982e..0785849c579e 100644 --- a/docs/source/amp.rst +++ b/docs/source/amp.rst @@ -1,22 +1,35 @@ .. role:: hidden :class: hidden-section -Automatic Mixed Precision package - torch.cuda.amp -================================================== +Automatic Mixed Precision package - torch.amp +============================================= -.. automodule:: torch.cuda.amp -.. currentmodule:: torch.cuda.amp +.. Both modules below are missing doc entry. Adding them here for now. +.. 
This does not add anything to the rendered page +.. py:module:: torch.cpu +.. py:module:: torch.cpu.amp +.. py:module:: torch.cuda.amp + +.. automodule:: torch.amp +.. currentmodule:: torch.amp -:class:`torch.cuda.amp` and :class:`torch` provide convenience methods for mixed precision, +:class:`torch.amp` provides convenience methods for mixed precision, where some operations use the ``torch.float32`` (``float``) datatype and other operations -use ``torch.float16`` (``half``). Some ops, like linear layers and convolutions, -are much faster in ``float16``. Other ops, like reductions, often require the dynamic +use lower precision floating point datatype (``lower_precision_fp``): ``torch.float16`` (``half``) or ``torch.bfloat16``. Some ops, like linear layers and convolutions, +are much faster in ``lower_precision_fp``. Other ops, like reductions, often require the dynamic range of ``float32``. Mixed precision tries to match each op to its appropriate datatype. -Ordinarily, "automatic mixed precision training" uses :class:`torch.autocast` and -:class:`torch.cuda.amp.GradScaler` together, as shown in the :ref:`Automatic Mixed Precision examples` -and `Automatic Mixed Precision recipe `_. -However, :class:`torch.autocast` and :class:`GradScaler` are modular, and may be used separately if desired. +Ordinarily, "automatic mixed precision training" with datatype of ``torch.float16`` uses :class:`torch.autocast` and +:class:`torch.cuda.amp.GradScaler` together, as shown in the :ref:`CUDA Automatic Mixed Precision examples` +and `CUDA Automatic Mixed Precision recipe `_. +However, :class:`torch.autocast` and :class:`torch.cuda.amp.GradScaler` are modular, and may be used separately if desired. +As shown in the CPU example section of :class:`torch.autocast`, "automatic mixed precision training/inference" on CPU with +datatype of ``torch.bfloat16`` only uses :class:`torch.autocast`. + +For CUDA and CPU, APIs are also provided seperately: + +* ``torch.autocast("cuda", args...)`` is equivalent to ``torch.cuda.amp.autocast(args...)``. +* ``torch.autocast("cpu", args...)`` is equivalent to ``torch.cpu.amp.autocast(args...)``. For CPU, only lower precision floating point datatype of ``torch.bfloat16`` is supported for now. .. contents:: :local: @@ -38,6 +51,11 @@ Autocasting .. autofunction:: custom_bwd +.. currentmodule:: torch.cpu.amp + +.. autoclass:: autocast + :members: + .. _gradient-scaling: Gradient Scaling @@ -56,6 +74,8 @@ so they don't flush to zero. Each parameter's gradient (``.grad`` attribute) should be unscaled before the optimizer updates the parameters, so the scale factor does not interfere with the learning rate. +.. currentmodule:: torch.cuda.amp + .. autoclass:: GradScaler :members: @@ -68,8 +88,6 @@ Autocast Op Reference Op Eligibility -------------- -Only CUDA ops are eligible for autocasting. - Ops that run in ``float64`` or non-floating-point dtypes are not eligible, and will run in these types whether or not autocast is enabled. @@ -84,8 +102,10 @@ regions. Ops called with an explicit ``dtype=...`` argument are not eligible, and will produce output that respects the ``dtype`` argument. -Op-Specific Behavior --------------------- +.. _autocast-cuda-op-reference: + +CUDA Op-Specific Behavior +------------------------- The following lists describe the behavior of eligible ops in autocast-enabled regions. These ops always go through autocasting whether they are invoked as part of a :class:`torch.nn.Module`, as a function, or as a :class:`torch.Tensor` method. 
If functions are exposed in multiple namespaces, @@ -99,8 +119,8 @@ If an op is unlisted, we assume it's numerically stable in ``float16``. If you believe an unlisted op is numerically unstable in ``float16``, please file an issue. -Ops that can autocast to ``float16`` -"""""""""""""""""""""""""""""""""""" +CUDA Ops that can autocast to ``float16`` +""""""""""""""""""""""""""""""""""""""""" ``__matmul__``, ``addbmm``, @@ -126,8 +146,8 @@ Ops that can autocast to ``float16`` ``prelu``, ``RNNCell`` -Ops that can autocast to ``float32`` -"""""""""""""""""""""""""""""""""""" +CUDA Ops that can autocast to ``float32`` +""""""""""""""""""""""""""""""""""""""""" ``__pow__``, ``__rdiv__``, @@ -181,8 +201,8 @@ Ops that can autocast to ``float32`` ``tan``, ``triplet_margin_loss`` -Ops that promote to the widest input type -""""""""""""""""""""""""""""""""""""""""" +CUDA Ops that promote to the widest input type +"""""""""""""""""""""""""""""""""""""""""""""" These ops don't require a particular dtype for stability, but take multiple inputs and require that the inputs' dtypes match. If all of the inputs are ``float16``, the op runs in ``float16``. If any of the inputs is ``float32``, @@ -216,3 +236,142 @@ Many models use a sigmoid layer right before the binary cross entropy layer. In this case, combine the two layers using :func:`torch.nn.functional.binary_cross_entropy_with_logits` or :mod:`torch.nn.BCEWithLogitsLoss`. ``binary_cross_entropy_with_logits`` and ``BCEWithLogits`` are safe to autocast. + +.. _autocast-cpu-op-reference: + +CPU Op-Specific Behavior +------------------------ +The following lists describe the behavior of eligible ops in autocast-enabled regions. +These ops always go through autocasting whether they are invoked as part of a :class:`torch.nn.Module`, +as a function, or as a :class:`torch.Tensor` method. If functions are exposed in multiple namespaces, +they go through autocasting regardless of the namespace. + +Ops not listed below do not go through autocasting. They run in the type +defined by their inputs. However, autocasting may still change the type +in which unlisted ops run if they're downstream from autocasted ops. + +If an op is unlisted, we assume it's numerically stable in ``bfloat16``. +If you believe an unlisted op is numerically unstable in ``bfloat16``, +please file an issue. 
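As an illustrative aside (not part of the patch): a minimal sketch of the CPU autocast behavior described above, assuming a toy ``nn.Linear`` model and made-up shapes. Ops on the ``bfloat16`` list that follows, such as ``linear``, run in ``bfloat16``; anything needing full precision can be cast back explicitly.

.. code-block:: python

    import torch

    # Toy model; shapes are illustrative only.
    model = torch.nn.Linear(8, 8)
    x = torch.randn(2, 8)

    # torch.autocast("cpu", ...) is equivalent to torch.cpu.amp.autocast(...).
    with torch.autocast("cpu", dtype=torch.bfloat16):
        y = model(x)            # linear is on the bfloat16 autocast list
        loss = y.float().sum()  # cast back to float32 where full precision is needed

    print(y.dtype)  # torch.bfloat16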
+ +CPU Ops that can autocast to ``bfloat16`` +""""""""""""""""""""""""""""""""""""""""" + +``conv1d``, +``conv2d``, +``conv3d``, +``bmm``, +``mm``, +``baddbmm``, +``addmm``, +``addbmm``, +``linear``, +``matmul``, +``_convolution`` + +CPU Ops that can autocast to ``float32`` +"""""""""""""""""""""""""""""""""""""""" + +``conv_transpose1d``, +``conv_transpose2d``, +``conv_transpose3d``, +``avg_pool3d``, +``binary_cross_entropy``, +``grid_sampler``, +``grid_sampler_2d``, +``_grid_sampler_2d_cpu_fallback``, +``grid_sampler_3d``, +``polar``, +``prod``, +``quantile``, +``nanquantile``, +``stft``, +``cdist``, +``trace``, +``view_as_complex``, +``cholesky``, +``cholesky_inverse``, +``cholesky_solve``, +``inverse``, +``lu_solve``, +``matrix_rank``, +``orgqr``, +``inverse``, +``ormqr``, +``pinverse``, +``max_pool3d``, +``max_unpool2d``, +``max_unpool3d``, +``adaptive_avg_pool3d``, +``reflection_pad1d``, +``reflection_pad2d``, +``replication_pad1d``, +``replication_pad2d``, +``replication_pad3d``, +``mse_loss``, +``ctc_loss``, +``kl_div``, +``multilabel_margin_loss``, +``fft_fft``, +``fft_ifft``, +``fft_fft2``, +``fft_ifft2``, +``fft_fftn``, +``fft_ifftn``, +``fft_rfft``, +``fft_irfft``, +``fft_rfft2``, +``fft_irfft2``, +``fft_rfftn``, +``fft_irfftn``, +``fft_hfft``, +``fft_ihfft``, +``linalg_matrix_norm``, +``linalg_cond``, +``linalg_matrix_rank``, +``linalg_solve``, +``linalg_cholesky``, +``linalg_svdvals``, +``linalg_eigvals``, +``linalg_eigvalsh``, +``linalg_inv``, +``linalg_householder_product``, +``linalg_tensorinv``, +``linalg_tensorsolve``, +``fake_quantize_per_tensor_affine``, +``eig``, +``geqrf``, +``lstsq``, +``_lu_with_info``, +``qr``, +``solve``, +``svd``, +``symeig``, +``triangular_solve``, +``fractional_max_pool2d``, +``fractional_max_pool3d``, +``adaptive_max_pool3d``, +``multilabel_margin_loss_forward``, +``linalg_qr``, +``linalg_cholesky_ex``, +``linalg_svd``, +``linalg_eig``, +``linalg_eigh``, +``linalg_lstsq``, +``linalg_inv_ex`` + +CPU Ops that promote to the widest input type +""""""""""""""""""""""""""""""""""""""""""""" +These ops don't require a particular dtype for stability, but take multiple inputs +and require that the inputs' dtypes match. If all of the inputs are +``bfloat16``, the op runs in ``bfloat16``. If any of the inputs is ``float32``, +autocast casts all inputs to ``float32`` and runs the op in ``float32``. + +``cat``, +``stack``, +``index_copy`` + +Some ops not listed here (e.g., binary ops like ``add``) natively promote +inputs without autocasting's intervention. If inputs are a mixture of ``bfloat16`` +and ``float32``, these ops run in ``float32`` and produce ``float32`` output, +regardless of whether autocast is enabled. diff --git a/docs/source/backends.rst b/docs/source/backends.rst index 45d6fdf2add2..c54cf33fbe15 100644 --- a/docs/source/backends.rst +++ b/docs/source/backends.rst @@ -3,6 +3,7 @@ torch.backends ============== +.. automodule:: torch.backends `torch.backends` controls the behavior of various backends that PyTorch supports. @@ -17,6 +18,7 @@ These backends include: torch.backends.cuda ^^^^^^^^^^^^^^^^^^^ +.. automodule:: torch.backends.cuda .. autofunction:: torch.backends.cuda.is_built @@ -50,6 +52,7 @@ torch.backends.cuda torch.backends.cudnn ^^^^^^^^^^^^^^^^^^^^ +.. automodule:: torch.backends.cudnn .. autofunction:: torch.backends.cudnn.version @@ -75,20 +78,37 @@ torch.backends.cudnn A :class:`bool` that, if True, causes cuDNN to benchmark multiple convolution algorithms and select the fastest. 
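Illustrative aside (not part of the patch): the backend flags and availability checks documented in this file can be exercised from Python roughly as below; the ``mps`` calls assume a build that includes the newly documented MPS backend.

.. code-block:: python

    import torch

    # Build/runtime availability checks; each returns a plain bool.
    print("MKL:    ", torch.backends.mkl.is_available())
    print("MKL-DNN:", torch.backends.mkldnn.is_available())
    print("OpenMP: ", torch.backends.openmp.is_available())

    # Assumes a PyTorch build with the MPS backend compiled in.
    print("MPS:    ", torch.backends.mps.is_built(), torch.backends.mps.is_available())

    # cuDNN knob described above: benchmark several conv algorithms and keep the fastest.
    if torch.cuda.is_available():
        torch.backends.cudnn.benchmark = True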
+torch.backends.mps +^^^^^^^^^^^^^^^^^^ +.. automodule:: torch.backends.mps + +.. autofunction:: torch.backends.mps.is_available + +.. autofunction:: torch.backends.mps.is_built + torch.backends.mkl ^^^^^^^^^^^^^^^^^^ +.. automodule:: torch.backends.mkl .. autofunction:: torch.backends.mkl.is_available torch.backends.mkldnn ^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: torch.backends.mkldnn .. autofunction:: torch.backends.mkldnn.is_available torch.backends.openmp ^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: torch.backends.openmp .. autofunction:: torch.backends.openmp.is_available + +.. Docs for other backends need to be added here. +.. Automodules are just here to ensure checks run but they don't actually +.. add anything to the rendered page for now. +.. py:module:: torch.backends.quantized +.. py:module:: torch.backends.xnnpack diff --git a/docs/source/benchmark_utils.rst b/docs/source/benchmark_utils.rst index c211dcb7b580..c93fbfd66c3d 100644 --- a/docs/source/benchmark_utils.rst +++ b/docs/source/benchmark_utils.rst @@ -18,3 +18,10 @@ Benchmark Utils - torch.utils.benchmark .. autoclass:: FunctionCounts :members: + +.. These are missing documentation. Adding them here until a better place +.. is made in this file. +.. py:module:: torch.utils.benchmark.examples +.. py:module:: torch.utils.benchmark.op_fuzzers +.. py:module:: torch.utils.benchmark.utils +.. py:module:: torch.utils.benchmark.utils.valgrind_wrapper diff --git a/docs/source/bottleneck.rst b/docs/source/bottleneck.rst index d6ce122234fb..3fa1c99b5061 100644 --- a/docs/source/bottleneck.rst +++ b/docs/source/bottleneck.rst @@ -1,6 +1,7 @@ torch.utils.bottleneck ====================== +.. automodule:: torch.utils.bottleneck .. currentmodule:: torch.utils.bottleneck `torch.utils.bottleneck` is a tool that can be used as an initial step for diff --git a/docs/source/community/persons_of_interest.rst b/docs/source/community/persons_of_interest.rst index b1d4954a6576..906d5685984d 100644 --- a/docs/source/community/persons_of_interest.rst +++ b/docs/source/community/persons_of_interest.rst @@ -149,13 +149,13 @@ C10 utils and operator dispatch - Dmytro Dzhulgakov (`dzhulgakov `__) - (emeritus) Sebastian Messmer (`smessmer `__) -ONNX <-> PyTorch -~~~~~~~~~~~~~~~~ -- Negin Raoof (`neginraoof `__) -- Gary Miguel (`garymm `__) +PyTorch -> ONNX +~~~~~~~~~~~~~~~ - Bowen Bao (`BowenBao `__) -- (emeritus) Lu Fang (`houseroad `__) +- Gary Miguel (`garymm `__) - (emeritus) Lara Haidar (`lara-hdr `__) +- (emeritus) Lu Fang (`houseroad `__) +- (emeritus) Negin Raoof (`neginraoof `__) - (emeritus) Spandan Tiwari (`spandantiwari `__) Mobile / Edge diff --git a/docs/source/conf.py b/docs/source/conf.py index 0b1343145bc1..2d5b60e6af82 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -57,12 +57,16 @@ 'sphinxcontrib.katex', 'sphinx.ext.autosectionlabel', 'sphinx_copybutton', + 'sphinx_panels' ] # build the templated autosummary files autosummary_generate = True numpydoc_show_class_members = False +# Theme has bootstrap already +panels_add_bootstrap_css = False + # autosectionlabel throws warnings if section names are duplicated. # The following tells autosectionlabel to not throw a warning for # duplicated section names that are in different documents. @@ -82,6 +86,8 @@ # TODO: document these and remove them from here. 
coverage_ignore_functions = [ + # torch + "typename", # torch.autograd "register_py_tensor_class_for_device", "variable", @@ -125,9 +131,113 @@ "execWrapper", # torch.onnx "unregister_custom_op_symbolic", + # torch.ao.quantization + "default_eval_fn", + # torch.ao.quantization.backend_config + "validate_backend_config_dict", + # torch.backends + "disable_global_flags", + "flags_frozen", + # torch.distributed.algorithms.ddp_comm_hooks + "register_ddp_comm_hook", + # torch.nn + "factory_kwargs", + # torch.nn.parallel + "DistributedDataParallelCPU", + # torch.utils + "set_module", + # torch.utils.model_dump + "burn_in_info", + "get_info_and_burn_skeleton", + "get_inline_skeleton", + "get_model_info", + "get_storage_info", + "hierarchical_pickle", ] coverage_ignore_classes = [ + # torch + "FatalError", + "QUInt2x4Storage", + "Size", + "Storage", + "Stream", + "Tensor", + "finfo", + "iinfo", + "qscheme", + "AggregationType", + "AliasDb", + "AnyType", + "Argument", + "ArgumentSpec", + "BenchmarkConfig", + "BenchmarkExecutionStats", + "Block", + "BoolType", + "BufferDict", + "CallStack", + "Capsule", + "ClassType", + "Code", + "CompleteArgumentSpec", + "ComplexType", + "ConcreteModuleType", + "ConcreteModuleTypeBuilder", + "DeepCopyMemoTable", + "DeserializationStorageContext", + "DeviceObjType", + "DictType", + "EnumType", + "ExecutionPlan", + "FileCheck", + "FloatType", + "FunctionSchema", + "Gradient", + "Graph", + "GraphExecutorState", + "IODescriptor", + "InferredType", + "IntType", + "InterfaceType", + "ListType", + "LockingLogger", + "MobileOptimizerType", + "ModuleDict", + "Node", + "NoneType", + "NoopLogger", + "NumberType", + "OperatorInfo", + "OptionalType", + "ParameterDict", + "PyObjectType", + "PyTorchFileReader", + "PyTorchFileWriter", + "RRefType", + "ScriptClass", + "ScriptClassFunction", + "ScriptDict", + "ScriptDictIterator", + "ScriptDictKeyIterator", + "ScriptList", + "ScriptListIterator", + "ScriptMethod", + "ScriptModule", + "ScriptModuleSerializer", + "ScriptObject", + "ScriptObjectProperty", + "SerializationStorageContext", + "StaticModule", + "StringType", + "SymIntType", + "ThroughputBenchmark", + "TracingState", + "TupleType", + "Type", + "UnionType", + "Use", + "Value", # torch.cuda "BFloat16Storage", "BFloat16Tensor", @@ -153,7 +263,6 @@ "LongTensor", "ShortStorage", "ShortTensor", - "UntypedStorage", "cudaStatus", # torch.distributed.elastic.multiprocessing.errors "ChildFailedError", @@ -181,6 +290,7 @@ "ReshapeTransform", "SigmoidTransform", "SoftmaxTransform", + "SoftplusTransform", "StackTransform", "StickBreakingTransform", "TanhTransform", @@ -193,110 +303,25 @@ # torch.onnx "CheckerError", "ExportTypes", + # torch.backends + "ContextProp", + "PropModule", + # torch.backends.cuda + "cuBLASModule", + "cuFFTPlanCache", + "cuFFTPlanCacheAttrContextProp", + "cuFFTPlanCacheManager", + # torch.distributed.algorithms.ddp_comm_hooks + "DDPCommHookType", + # torch.jit.mobile + "LiteScriptModule", + # torch.nn.quantized.modules + "DeQuantize", + "Quantize", + # torch.utils.backcompat + "Warning", ] -# List of modules that do not have automodule/py:module in the doc yet -# We should NOT add anything to this list, see the CI failure message -# on how to solve missing automodule issues -coverage_missing_automodule = [ - "torch", - "torch.ao", - "torch.ao.nn", - "torch.ao.nn.sparse", - "torch.ao.nn.sparse.quantized", - "torch.ao.nn.sparse.quantized.dynamic", - "torch.ao.ns", - "torch.ao.ns.fx", - "torch.ao.quantization", - "torch.ao.quantization.fx", - 
"torch.ao.quantization.fx.backend_config", - "torch.ao.sparsity", - "torch.ao.sparsity.experimental", - "torch.ao.sparsity.experimental.pruner", - "torch.ao.sparsity.scheduler", - "torch.ao.sparsity.sparsifier", - "torch.backends", - "torch.backends.cuda", - "torch.backends.cudnn", - "torch.backends.mkl", - "torch.backends.mkldnn", - "torch.backends.openmp", - "torch.backends.quantized", - "torch.backends.xnnpack", - "torch.contrib", - "torch.cpu", - "torch.cpu.amp", - "torch.distributed.algorithms", - "torch.distributed.algorithms.ddp_comm_hooks", - "torch.distributed.algorithms.model_averaging", - "torch.distributed.elastic", - "torch.distributed.elastic.utils", - "torch.distributed.elastic.utils.data", - "torch.distributed.launcher", - "torch.distributed.nn", - "torch.distributed.nn.api", - "torch.distributed.nn.jit", - "torch.distributed.nn.jit.templates", - "torch.distributed.pipeline", - "torch.distributed.pipeline.sync", - "torch.distributed.pipeline.sync.skip", - "torch.fft", - "torch.for_onnx", - "torch.fx.experimental", - "torch.fx.experimental.fx_acc", - "torch.fx.experimental.unification", - "torch.fx.experimental.unification.multipledispatch", - "torch.fx.passes", - "torch.jit.mobile", - "torch.nn", - "torch.nn.backends", - "torch.nn.intrinsic", - "torch.nn.intrinsic.modules", - "torch.nn.intrinsic.qat", - "torch.nn.intrinsic.qat.modules", - "torch.nn.intrinsic.quantized", - "torch.nn.intrinsic.quantized.dynamic", - "torch.nn.intrinsic.quantized.dynamic.modules", - "torch.nn.intrinsic.quantized.modules", - "torch.nn.modules", - "torch.nn.parallel", - "torch.nn.qat", - "torch.nn.qat.modules", - "torch.nn.qat.dynamic", - "torch.nn.qat.dynamic.modules", - "torch.nn.quantizable", - "torch.nn.quantizable.modules", - "torch.nn.quantized", - "torch.nn.quantized.dynamic", - "torch.nn.quantized.dynamic.modules", - "torch.nn.quantized.modules", - "torch.nn.utils", - "torch.package", - "torch.package.analyze", - "torch.quantization", - "torch.quantization.fx", - "torch.sparse", - "torch.special", - "torch.utils", - "torch.utils.backcompat", - "torch.utils.benchmark.examples", - "torch.utils.benchmark.op_fuzzers", - "torch.utils.benchmark.utils", - "torch.utils.benchmark.utils.valgrind_wrapper", - "torch.utils.bottleneck", - "torch.utils.data.communication", - "torch.utils.data.datapipes", - "torch.utils.data.datapipes.dataframe", - "torch.utils.data.datapipes.iter", - "torch.utils.data.datapipes.map", - "torch.utils.data.datapipes.utils", - "torch.utils.ffi", - "torch.utils.hipify", - "torch.utils.model_dump", - "torch.utils.tensorboard", -] - - # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # @@ -308,8 +333,8 @@ # General information about the project. project = 'PyTorch' -copyright = '2019, Torch Contributors' -author = 'Torch Contributors' +copyright = '2022, PyTorch Contributors' +author = 'PyTorch Contributors' torch_version = str(torch.__version__) # The version info for the project you're documenting, acts as replacement for @@ -326,14 +351,11 @@ # Customized html_title here. 
# Default is " ".join(project, release, "documentation") if not set if RELEASE: - # remove hash (start with 'a') from version number if any - version_end = torch_version.find('a') - if version_end == -1: - html_title = " ".join((project, torch_version, "documentation")) - version = torch_version - else: - html_title = " ".join((project, torch_version[:version_end], "documentation")) - version = torch_version[:version_end] + # Turn 1.11.0aHASH into 1.11 + # Note: the release candidates should no longer have the aHASH suffix, but in any + # case we wish to leave only major.minor, even for rc builds. + version = '.'.join(torch_version.split('.')[:2]) + html_title = " ".join((project, version, "documentation")) release = version # The language for content autogenerated by Sphinx. Refer to documentation @@ -417,6 +439,11 @@ def coverage_post_process(app, exception): if not isinstance(app.builder, CoverageBuilder): return + if not torch.distributed.is_available(): + raise RuntimeError("The coverage tool cannot run with a version " + "of PyTorch that was built with USE_DISTRIBUTED=0 " + "as this module's API changes.") + # These are all the modules that have "automodule" in an rst file # These modules are the ones for which coverage is checked # Here, we make sure that no module is missing from that list @@ -443,26 +470,16 @@ def is_not_internal(modname): if modname not in modules: missing.add(modname) - expected = set(coverage_missing_automodule) - output = [] - unexpected_missing = missing - expected - if unexpected_missing: - mods = ", ".join(unexpected_missing) + if missing: + mods = ", ".join(missing) output.append(f"\nYou added the following module(s) to the PyTorch namespace '{mods}' " "but they have no corresponding entry in a doc .rst file. You should " "either make sure that the .rst file that contains the module's documentation " "properly contains either '.. automodule:: mod_name' (if you do not want " - "the paragraph added by the automodule, you can simply use py:module) or " - "make the module private (by appending an '_' at the beginning of its name.") - - unexpected_not_missing = expected - missing - if unexpected_not_missing: - mods = ", ".join(unexpected_not_missing) - output.append(f"\nThank you for adding the missing .rst entries for '{mods}', please update " - "the 'coverage_missing_automodule' in 'torch/docs/source/conf.py' to remove " - "the module(s) you fixed and make sure we do not regress on this in the future.") + "the paragraph added by the automodule, you can simply use '.. py:module:: mod_name') " + " or make the module private (by appending an '_' at the beginning of its name).") # The output file is hard-coded by the coverage tool # Our CI is setup to fail if any line is added to this file diff --git a/docs/source/__config__.rst b/docs/source/config_mod.rst similarity index 100% rename from docs/source/__config__.rst rename to docs/source/config_mod.rst diff --git a/docs/source/cpp_extension.rst b/docs/source/cpp_extension.rst index db718bdacc63..471f55228f3e 100644 --- a/docs/source/cpp_extension.rst +++ b/docs/source/cpp_extension.rst @@ -8,6 +8,6 @@ torch.utils.cpp_extension .. autofunction:: load .. autofunction:: load_inline .. autofunction:: include_paths -.. autofunction:: check_compiler_abi_compatibility +.. autofunction:: get_compiler_abi_compatibility_and_version .. autofunction:: verify_ninja_availability .. 
autofunction:: is_ninja_available diff --git a/docs/source/cuda.rst b/docs/source/cuda.rst index 955feaae8309..7d3998f7fa53 100644 --- a/docs/source/cuda.rst +++ b/docs/source/cuda.rst @@ -80,6 +80,7 @@ Graphs (beta) :toctree: generated :nosignatures: + is_current_stream_capturing graph_pool_handle CUDAGraph graph @@ -123,3 +124,11 @@ NVIDIA Tools Extension (NVTX) nvtx.mark nvtx.range_push nvtx.range_pop + +Jiterator (beta) +----------------------------- +.. autosummary:: + :toctree: generated + :nosignatures: + + jiterator._create_jit_fn diff --git a/docs/source/data.rst b/docs/source/data.rst index 322de88e27d9..646f41436caf 100644 --- a/docs/source/data.rst +++ b/docs/source/data.rst @@ -432,3 +432,15 @@ Example:: .. autoclass:: torch.utils.data.WeightedRandomSampler .. autoclass:: torch.utils.data.BatchSampler .. autoclass:: torch.utils.data.distributed.DistributedSampler + + +.. This module is experimental and should be private, adding it here for now +.. py:module:: torch.utils.data.communication + +.. These modules are documented as part of torch/data listing them here for +.. now until we have a clearer fix +.. py:module:: torch.utils.data.datapipes +.. py:module:: torch.utils.data.datapipes.dataframe +.. py:module:: torch.utils.data.datapipes.iter +.. py:module:: torch.utils.data.datapipes.map +.. py:module:: torch.utils.data.datapipes.utils diff --git a/docs/source/deploy.rst b/docs/source/deploy.rst index 931aed7ab7a9..9311ba8c4ee6 100644 --- a/docs/source/deploy.rst +++ b/docs/source/deploy.rst @@ -29,8 +29,7 @@ When running ``setup.py``, you will need to specify ``USE_DEPLOY=1``, like: export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} export USE_DEPLOY=1 - python setup.py bdist_wheel - python -mpip install dist/*.whl + python setup.py develop Creating a model package in Python @@ -53,28 +52,39 @@ For now, let's create a simple model that we can load and run in ``torch::deploy # Package and export it. with PackageExporter("my_package.pt") as e: e.intern("torchvision.**") + e.extern("numpy.**") e.extern("sys") + e.extern("PIL.*") e.save_pickle("model", "model.pkl", model) +Note that since "numpy", "sys" and "PIL" were marked as "extern", `torch.package` will +look for these dependencies on the system that loads this package. They will not be packaged +with the model. + Now, there should be a file named ``my_package.pt`` in your working directory. -.. note:: - Currently, ``torch::deploy`` supports only the Python standard library and - ``torch`` as ``extern`` modules in ``torch.package``. In the future we plan - to transparently support any Conda environment you point us to. +Loading and running the model in C++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Set an environment variable (e.g. $PATH_TO_EXTERN_PYTHON_PACKAGES) to indicate to the interpreters +where the external Python dependencies can be found. In the example below, the path to the +site-packages of a conda environment is provided. +.. code-block:: bash + export PATH_TO_EXTERN_PYTHON_PACKAGES= \ + "~/anaconda/envs/deploy-example-env/lib/python3.8/site-packages" -Loading and running the model in C++ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Let's create a minimal C++ program to that loads the model. .. code-block:: cpp - #include + #include + #include #include + #include #include #include @@ -86,7 +96,11 @@ Let's create a minimal C++ program to that loads the model. } // Start an interpreter manager governing 4 embedded interpreters. 
- torch::deploy::InterpreterManager manager(4); + std::shared_ptr env = + std::make_shared( + std::getenv("PATH_TO_EXTERN_PYTHON_PACKAGES") + ); + torch::deploy::InterpreterManager manager(4, env); try { // Load the model from the torch.package. @@ -94,6 +108,7 @@ Let's create a minimal C++ program to that loads the model. torch::deploy::ReplicatedObj model = package.loadPickle("model", "model.pkl"); } catch (const c10::Error& e) { std::cerr << "error loading the model\n"; + std::cerr << e.msg(); return -1; } @@ -105,6 +120,9 @@ This small program introduces many of the core concepts of ``torch::deploy``. An ``InterpreterManager`` abstracts over a collection of independent Python interpreters, allowing you to load balance across them when running your code. +``PathEnvironment`` enables you to specify the location of Python +packages on your system which are external, but necessary, for your model. + Using the ``InterpreterManager::loadPackage`` method, you can load a ``torch.package`` from disk and make it available to all interpreters. @@ -120,20 +138,55 @@ an free interpreter to execute that interaction. Building and running the application ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Locate `libtorch_deployinterpreter.o` on your system. This should have been +built when PyTorch was built from source. In the same PyTorch directory, locate +the deploy source files. Set these locations to an environment variable for the build. +An example of where these can be found on a system is shown below. + +.. code-block:: bash + + export DEPLOY_INTERPRETER_PATH="/pytorch/build/torch/csrc/deploy/" + export DEPLOY_SRC_PATH="/pytorch/torch/csrc/deploy/" + +As ``torch::deploy`` is in active development, these manual steps will be removed +soon. + Assuming the above C++ program was stored in a file called, `example-app.cpp`, a minimal CMakeLists.txt file would look like: .. code-block:: cmake - cmake_minimum_required(VERSION 3.0 FATAL_ERROR) + cmake_minimum_required(VERSION 3.19 FATAL_ERROR) project(deploy_tutorial) + find_package(fmt REQUIRED) find_package(Torch REQUIRED) - add_executable(example-app example-app.cpp) - target_link_libraries(example-app "${TORCH_LIBRARIES}") - set_property(TARGET example-app PROPERTY CXX_STANDARD 14) + add_library(torch_deploy_internal STATIC + ${DEPLOY_INTERPRETER_PATH}/libtorch_deployinterpreter.o + ${DEPLOY_DIR}/deploy.cpp + ${DEPLOY_DIR}/loader.cpp + ${DEPLOY_DIR}/path_environment.cpp + ${DEPLOY_DIR}/elf_file.cpp) + + # for python builtins + target_link_libraries(torch_deploy_internal PRIVATE + crypt pthread dl util m z ffi lzma readline nsl ncursesw panelw) + target_link_libraries(torch_deploy_internal PUBLIC + shm torch fmt::fmt-header-only) + caffe2_interface_library(torch_deploy_internal torch_deploy) + + add_executable(example-app example.cpp) + target_link_libraries(example-app PUBLIC + "-Wl,--no-as-needed -rdynamic" dl torch_deploy "${TORCH_LIBRARIES}") + +Currently, it is necessary to build ``torch::deploy`` as a static library. +In order to correctly link to a static library, the utility ``caffe2_interface_library`` +is used to appropriately set and unset ``--whole-archive`` flag. +Furthermore, the ``-rdynamic`` flag is needed when linking to the executable +to ensure that symbols are exported to the dynamic table, making them accessible +to the deploy interpreters (which are dynamically loaded). The last step is configuring and building the project. 
Assuming that our code directory is laid out like this: @@ -152,8 +205,9 @@ We can now run the following commands to build the application from within the mkdir build cd build # Point CMake at the built version of PyTorch we just installed. - SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')" - cmake -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" .. + cmake -DCMAKE_PREFIX_PATH="$(python -c 'import torch.utils; print(torch.utils.cmake_prefix_path)')" .. \ + -DDEPLOY_INTERPRETER_PATH="$DEPLOY_INTERPRETER_PATH" \ + -DDEPLOY_DIR="$DEPLOY_DIR" cmake --build . --config Release Now we can run our app: diff --git a/docs/source/distributed.rst b/docs/source/distributed.rst index 4ddc3d5f3171..58d6d1606431 100644 --- a/docs/source/distributed.rst +++ b/docs/source/distributed.rst @@ -123,14 +123,24 @@ It is imperative that all processes specify the same number of interfaces in thi Other NCCL environment variables """""""""""""""""""""""""""""""" -NCCL has also provided a number of environment variables for fine-tuning purposes. - -Commonly used ones include the following for debugging purposes: - -- ``export NCCL_DEBUG=INFO`` -- ``export NCCL_DEBUG_SUBSYS=ALL`` - -For the full list of NCCL environment variables, please refer to +**Debugging** - in case of NCCL failure, you can set ``NCCL_DEBUG=INFO`` to print an explicit +warning message as well as basic NCCL initialization information. + +You may also use ``NCCL_DEBUG_SUBSYS`` to get more details about a specific +aspect of NCCL. For example, ``NCCL_DEBUG_SUBSYS=COLL`` would print logs of +collective calls, which may be helpful when debugging hangs, especially those +caused by collective type or message size mismatch. In case of topology +detection failure, it would be helpful to set ``NCCL_DEBUG_SUBSYS=GRAPH`` +to inspect the detailed detection result and save as reference if further help +from NCCL team is needed. + +**Performance tuning** - NCCL performs automatic tuning based on its topology detection to save users' +tuning effort. On some socket-based systems, users may still try tuning +``NCCL_SOCKET_NTHREADS`` and ``NCCL_NSOCKS_PERTHREAD`` to increase socket +network bandwidth. These two environment variables have been pre-tuned by NCCL +for some cloud providers, such as AWS or GCP. + +For a full list of NCCL environment variables, please refer to `NVIDIA NCCL's official documentation `_ @@ -575,6 +585,9 @@ Debugging ``torch.distributed`` applications Debugging distributed applications can be challenging due to hard to understand hangs, crashes, or inconsistent behavior across ranks. ``torch.distributed`` provides a suite of tools to help debug training applications in a self-serve fashion: +Monitored Barrier +^^^^^^^^^^^^^^^^^ + As of v1.10, :func:`torch.distributed.monitored_barrier` exists as an alternative to :func:`torch.distributed.barrier` which fails with helpful information about which rank may be faulty when crashing, i.e. not all ranks calling into :func:`torch.distributed.monitored_barrier` within the provided timeout. 
:func:`torch.distributed.monitored_barrier` implements a host-side barrier using ``send``/``recv`` communication primitives in a process similar to acknowledgements, allowing rank 0 to report which rank(s) failed to acknowledge @@ -613,7 +626,10 @@ The following error message is produced on rank 0, allowing the user to determin [gloo/transport/tcp/pair.cc:598] Connection closed by peer [2401:db00:eef0:1100:3560:0:1c05:25d]:8594 -Next, the environment variable ``TORCH_DISTRIBUTED_DEBUG`` can be used to trigger additional useful logging and collective synchronization checks to ensure all ranks +``TORCH_DISTRIBUTED_DEBUG`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +With ``TORCH_CPP_LOG_LEVEL=INFO``, the environment variable ``TORCH_DISTRIBUTED_DEBUG`` can be used to trigger additional useful logging and collective synchronization checks to ensure all ranks are synchronized appropriately. ``TORCH_DISTRIBUTED_DEBUG`` can be set to either ``OFF`` (default), ``INFO``, or ``DETAIL`` depending on the debugging level required. Please note that the most verbose option, ``DETAIL`` may impact the application performance and thus should only be used when debugging issues. @@ -662,6 +678,7 @@ include data such as forward time, backward time, gradient communication time, e if __name__ == "__main__": os.environ["MASTER_ADDR"] = "localhost" os.environ["MASTER_PORT"] = "29501" + os.environ["TORCH_CPP_LOG_LEVEL"]="INFO" os.environ[ "TORCH_DISTRIBUTED_DEBUG" ] = "DETAIL" # set to DETAIL for runtime logging. @@ -762,6 +779,7 @@ application crashes, rather than a hang or uninformative error message. As an ex if __name__ == "__main__": os.environ["MASTER_ADDR"] = "localhost" os.environ["MASTER_PORT"] = "29501" + os.environ["TORCH_CPP_LOG_LEVEL"]="INFO" os.environ["TORCH_DISTRIBUTED_DEBUG"] = "DETAIL" mp.spawn(worker, nprocs=2, args=()) @@ -774,6 +792,49 @@ With the ``NCCL`` backend, such an application would likely result in a hang whi RuntimeError: Error when verifying shape tensors for collective ALLREDUCE on rank 0. This likely indicates that input shapes into the collective are mismatched across ranks. Got shapes: 10 [ torch.LongTensor{1} ] +.. note:: + For fine-grained control of the debug level during runtime the functions :func:`torch.distributed.set_debug_level`, :func:`torch.distributed.set_debug_level_from_env`, and + :func:`torch.distributed.get_debug_level` can also be used. + In addition, `TORCH_DISTRIBUTED_DEBUG=DETAIL` can be used in conjunction with `TORCH_SHOW_CPP_STACKTRACES=1` to log the entire callstack when a collective desynchronization is detected. These collective desynchronization checks will work for all applications that use ``c10d`` collective calls backed by process groups created with the :func:`torch.distributed.init_process_group` and :func:`torch.distributed.new_group` APIs. + +Logging +------- + +In addition to explicit debugging support via :func:`torch.distributed.monitored_barrier` and ``TORCH_DISTRIBUTED_DEBUG``, the underlying C++ library of ``torch.distributed`` also outputs log +messages at various levels. These messages can be helpful to understand the execution state of a distributed training job and to troubleshoot problems such as network connection failures. The +following matrix shows how the log level can be adjusted via the combination of ``TORCH_CPP_LOG_LEVEL`` and ``TORCH_DISTRIBUTED_DEBUG`` environment variables. 
+ ++-------------------------+-----------------------------+------------------------+ +| ``TORCH_CPP_LOG_LEVEL`` | ``TORCH_DISTRIBUTED_DEBUG`` | Effective Log Level | ++=========================+=============================+========================+ +| ``ERROR`` | ignored | Error | ++-------------------------+-----------------------------+------------------------+ +| ``WARNING`` | ignored | Warning | ++-------------------------+-----------------------------+------------------------+ +| ``INFO`` | ignored | Info | ++-------------------------+-----------------------------+------------------------+ +| ``INFO`` | ``INFO`` | Debug | ++-------------------------+-----------------------------+------------------------+ +| ``INFO`` | ``DETAIL`` | Trace (a.k.a. All) | ++-------------------------+-----------------------------+------------------------+ + + +.. Distributed modules that are missing specific entries. +.. Adding them here for tracking purposes until they are more permanently fixed. +.. py:module:: torch.distributed.algorithms +.. py:module:: torch.distributed.algorithms.ddp_comm_hooks +.. py:module:: torch.distributed.algorithms.model_averaging +.. py:module:: torch.distributed.elastic +.. py:module:: torch.distributed.elastic.utils +.. py:module:: torch.distributed.elastic.utils.data +.. py:module:: torch.distributed.launcher +.. py:module:: torch.distributed.nn +.. py:module:: torch.distributed.nn.api +.. py:module:: torch.distributed.nn.jit +.. py:module:: torch.distributed.nn.jit.templates +.. py:module:: torch.distributed.pipeline +.. py:module:: torch.distributed.pipeline.sync +.. py:module:: torch.distributed.pipeline.sync.skip diff --git a/docs/source/fft.rst b/docs/source/fft.rst index 05f6215af513..5406b6610a60 100644 --- a/docs/source/fft.rst +++ b/docs/source/fft.rst @@ -7,8 +7,6 @@ torch.fft Discrete Fourier transforms and related functions. .. automodule:: torch.fft - :noindex: - .. currentmodule:: torch.fft Fast Fourier Transforms diff --git a/docs/source/fx.rst b/docs/source/fx.rst index 65689930743d..206b39c656f8 100644 --- a/docs/source/fx.rst +++ b/docs/source/fx.rst @@ -1109,3 +1109,13 @@ API Reference :members: .. autofunction:: torch.fx.replace_pattern + + +.. The experimental and passes submodules are missing docs. +.. Adding it here for coverage but this doesn't add anything to the +.. rendered doc. +.. py:module:: torch.fx.passes +.. py:module:: torch.fx.passes.tests +.. py:module:: torch.fx.experimental +.. py:module:: torch.fx.experimental.unification +.. py:module:: torch.fx.experimental.unification.multipledispatch diff --git a/docs/source/index.rst b/docs/source/index.rst index d307fee48647..f4642d49fd3c 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -54,9 +54,9 @@ Features described in this documentation are classified by release status: tensors tensor_attributes tensor_view + torch.amp torch.autograd cuda - torch.cuda.amp torch.backends torch.distributed torch.distributed.algorithms.join @@ -84,6 +84,7 @@ Features described in this documentation are classified by release status: quantization rpc torch.random + nested sparse storage torch.testing @@ -99,16 +100,18 @@ Features described in this documentation are classified by release status: type_info named_tensor name_inference - torch.__config__ <__config__> + torch.__config__ .. toctree:: :maxdepth: 1 :caption: Libraries torchaudio + TorchData + TorchRec + TorchServe torchtext torchvision - TorchServe PyTorch on XLA Devices .. 
toctree:: diff --git a/docs/source/jit.rst b/docs/source/jit.rst index 8a80b6471e1a..70c5f26c2842 100644 --- a/docs/source/jit.rst +++ b/docs/source/jit.rst @@ -61,6 +61,10 @@ Creating TorchScript Code ScriptFunction freeze optimize_for_inference + enable_onednn_fusion + onednn_fusion_enabled + set_fusion_strategy + strict_fusion save load ignore @@ -877,3 +881,7 @@ References jit_python_reference jit_unsupported + +.. This package is missing doc. Adding it here for coverage +.. This does not add anything to the rendered page. +.. py:module:: torch.jit.mobile diff --git a/docs/source/linalg.rst b/docs/source/linalg.rst index f7b232448fbf..3ac9d211f7dd 100644 --- a/docs/source/linalg.rst +++ b/docs/source/linalg.rst @@ -34,6 +34,7 @@ Decompositions cholesky qr + lu lu_factor eig eigvals @@ -95,6 +96,15 @@ Tensor Operations tensorinv tensorsolve +Misc +---- + +.. autosummary:: + :toctree: generated + :nosignatures: + + vander + Experimental Functions ---------------------- .. autosummary:: @@ -104,3 +114,6 @@ Experimental Functions cholesky_ex inv_ex lu_factor_ex + ldl_factor + ldl_factor_ex + ldl_solve diff --git a/docs/source/nested.rst b/docs/source/nested.rst new file mode 100644 index 000000000000..53a5446b4b52 --- /dev/null +++ b/docs/source/nested.rst @@ -0,0 +1,62 @@ +torch.nested +============ + +.. automodule:: torch.nested + +Introduction +++++++++++++ + +.. warning:: + + The PyTorch API of nested tensors is in prototype stage and will change in the near future. + +.. warning:: + + torch.NestedTensor currently does not support autograd. It needs to be used in the context + of torch.inference_mode(). + +NestedTensor allows the user to pack a list of Tensors into a single, efficient datastructure. + +The only constraint on the input Tensors is that their dimension must match. + +This enables more efficient metadata representations and operator coverage. + +Construction is straightforward and involves passing a list of Tensors to the constructor. + +>>> a, b = torch.arange(3), torch.arange(5) + 3 +>>> a +tensor([0, 1, 2]) +>>> b +tensor([3, 4, 5, 6, 7]) +>>> nt = torch.nested_tensor([a, b]) +>>> nt +nested_tensor([ + tensor([0, 1, 2]), + tensor([3, 4, 5, 6, 7]) + ]) + +Data type and device can be chosen via the usual keyword arguments + +>>> nt = torch.nested_tensor([a, b], dtype=torch.float32, device="cuda") +>>> nt +nested_tensor([ + tensor([0., 1., 2.], device='cuda:0'), + tensor([3., 4., 5., 6., 7.], device='cuda:0') +]) + + +Operator coverage ++++++++++++++++++ + +We are currently on our path to wholesale extend operator coverage guided by specific ML use cases. + +Operator coverage thus is currently very limited and only unbind is supported. + +>>> nt = torch.nested_tensor([a, b], dtype=torch.float32, device="cuda") +>>> nt +nested_tensor([ + tensor([0., 1., 2.], device='cuda:0'), + tensor([3., 4., 5., 6., 7.], device='cuda:0') +]) +>>> nt.unbind() +[tensor([0., 1., 2.], device='cuda:0'), tensor([3., 4., 5., 6., 7.], device='cuda:0')] diff --git a/docs/source/nn.init.rst b/docs/source/nn.init.rst index 56179d30bebf..a980f16f5f6d 100644 --- a/docs/source/nn.init.rst +++ b/docs/source/nn.init.rst @@ -19,5 +19,6 @@ torch.nn.init .. autofunction:: xavier_normal_ .. autofunction:: kaiming_uniform_ .. autofunction:: kaiming_normal_ +.. autofunction:: trunc_normal_ .. autofunction:: orthogonal_ .. 
autofunction:: sparse_ diff --git a/docs/source/nn.rst b/docs/source/nn.rst index 6eca9d4b16b6..571af54818e2 100644 --- a/docs/source/nn.rst +++ b/docs/source/nn.rst @@ -3,6 +3,8 @@ torch.nn =================================== +.. automodule:: torch.nn +.. automodule:: torch.nn.modules These are the basic building blocks for graphs: @@ -331,6 +333,8 @@ Shuffle Layers DataParallel Layers (multi-GPU, distributed) -------------------------------------------- +.. automodule:: torch.nn.parallel +.. currentmodule:: torch .. autosummary:: :toctree: generated @@ -342,6 +346,7 @@ DataParallel Layers (multi-GPU, distributed) Utilities --------- +.. automodule:: torch.nn.utils From the ``torch.nn.utils`` module @@ -416,6 +421,14 @@ for more information on how to implement your own parametrizations. parametrize.ParametrizationList +Utility functions to calls a given Module in a stateless manner. + +.. autosummary:: + :toctree: generated + :nosignatures: + + stateless.functional_call + Utility functions in other modules .. currentmodule:: torch @@ -453,3 +466,8 @@ Lazy Modules Initialization :template: classtemplate.rst nn.modules.lazy.LazyModuleMixin + + +.. This module is kept only for backward compatibility +.. py:module:: torch.nn.backends +.. py:module:: torch.nn.utils.stateless diff --git a/docs/source/notes/amp_examples.rst b/docs/source/notes/amp_examples.rst index 90cda473cb29..b6bcc38bc0f3 100644 --- a/docs/source/notes/amp_examples.rst +++ b/docs/source/notes/amp_examples.rst @@ -1,7 +1,7 @@ .. _amp-examples: -Automatic Mixed Precision examples -================================== +CUDA Automatic Mixed Precision examples +======================================= .. currentmodule:: torch.cuda.amp diff --git a/docs/source/notes/autograd.rst b/docs/source/notes/autograd.rst index 936c0f9eddd7..9fe4551806a6 100644 --- a/docs/source/notes/autograd.rst +++ b/docs/source/notes/autograd.rst @@ -87,6 +87,22 @@ subject to change and that users should not rely on. You can control how PyTorch does packing / unpacking with :ref:`saved-tensors-hooks-doc`. +.. _non-differentiable-func-grad: + +Gradients for non-differentiable functions +------------------------------------------ + +The gradient computation using Automatic Differentiation is only valid when each elementary function being used is differentiable. +Unfortunately many of the function we use in practice do not have this property (relu or sqrt at 0 for example). +And even though we cannot always guarantee that the returned gradient will be correct. For example :math:`f(x) = x = \text{relu}(x) - \text{relu}(-x)` will give a 0 gradient at 0 instead of 1 for any value we choose for the gradient of relu at 0. +To try and reduce the impact of this limitation, we define the gradients of the elementary operations by applying the following rules in order: + +#. If the function is differentiable and thus a gradient exists at the current point, use it. +#. If the function is convex (at least locally), use the sub-gradient with minimum norm (as it the steepest descent direction, see Exercise 2.7 from "Convex Optimization Algorithms" by Bertsekas, D. P and "Steepest Descent for Optimization Problems with Nondifferentiable Cost Functionals" by Bertsekas, D. P, and Mitter, S. K., 1971. for details and proofs). +#. If the function is concave (at least locally), use the super-gradient with minimum norm (using a similar argument as above). +#. 
If the function is defined, define the gradient at the current point by continuity (note that :math:`inf` is possible here, for example, :math:`sqrt(0)`). If multiple values are possible, pick one arbitrarily. +#. If the function is not defined (:math:`\sqrt(-1)`, :math:`\log(-1)` or most functions when the input is :math:`nan` for example) then the value used as the gradient is arbitrary (we might also raise an error but that is not guaranteed). Most functions will use :math:`nan` as the gradient, but for performance reasons, some functions will use non-:math:`nan` values (:math:`\log(-1)` for example). + .. _locally-disable-grad-doc: Locally disabling gradient computation @@ -222,7 +238,7 @@ Evaluation Mode (``nn.Module.eval()``) Evaluation mode is not actually a mechanism to locally disable gradient computation. It is included here anyway because it is sometimes confused to be such a mechanism. -Functionally, ``module.eval()`` (or equivalently ``module.train()``) are completely +Functionally, ``module.eval()`` (or equivalently ``module.train(False)``) are completely orthogonal to no-grad mode and inference mode. How ``model.eval()`` affects your model depends entirely on the specific modules used in your model and whether they define any training-mode specific behavior. @@ -278,8 +294,8 @@ Multithreaded Autograd The autograd engine is responsible for running all the backward operations necessary to compute the backward pass. This section will describe all the details -that can help you make the best use of it in a multithreaded environment.(this is -relevant only for PyTorch 1.6+ as the behavior in previous version was different). +that can help you make the best use of it in a multithreaded environment. (This is +relevant only for PyTorch 1.6+ as the behavior in previous version was different.) User could train their model with multithreading code (e.g. Hogwild training), and does not block on the concurrent backward computations, example code could be: @@ -352,9 +368,9 @@ Since Autograd allows the caller thread to drive its backward execution for potential parallelism, it's important that we ensure thread safety on CPU with parallel backwards that share part/whole of the GraphTask. -Custom Python ``autograd.function`` is automatically thread safe because of GIL. -for built-in C++ Autograd Nodes(e.g. AccumulateGrad, CopySlices) and custom -``autograd::Function``, the Autograd Engine uses thread mutex locking to protect +Custom Python ``autograd.Function`` is automatically thread safe because of GIL. +For built-in C++ Autograd Nodes (e.g. AccumulateGrad, CopySlices) and custom +``autograd::Function``\s, the Autograd Engine uses thread mutex locking to ensure thread safety on autograd Nodes that might have state write/read. No thread safety on C++ hooks diff --git a/docs/source/notes/cuda.rst b/docs/source/notes/cuda.rst index b2901a6fe336..59eb7d4c72b6 100644 --- a/docs/source/notes/cuda.rst +++ b/docs/source/notes/cuda.rst @@ -364,6 +364,26 @@ Available options: :meth:`~torch.cuda.memory_summary` methods are useful for tuning. This option should be used as a last resort for a workload that is aborting due to 'out of memory' and showing a large amount of inactive split blocks. +* ``roundup_power2_divisions`` helps with rounding the requested allocation + size to nearest power-2 division and making better use of the blocks. In + the current CUDACachingAllocator, the sizes are rounded up in multiple + of blocks size of 512, so this works fine for smaller sizes. 
However, this + can be inefficient for large near-by allocations as each will go to different + size of blocks and re-use of those blocks are minimized. This might create + lots of unused blocks and will waste GPU memory capacity. This option enables + the rounding of allocation size to nearest power-2 division. For example, if + we need to round-up size of 1200 and if number of divisions is 4, + the size 1200 lies between 1024 and 2048 and if we do 4 divisions between + them, the values are 1024, 1280, 1536, and 1792. So, allocation size of 1200 + will be rounded to 1280 as the nearest ceiling of power-2 division. +* ``garbage_collection_threshold`` helps actively reclaiming unused GPU memory to + avoid triggering expensive sync-and-reclaim-all operation (release_cached_blocks), + which can be unfavorable to latency-critical GPU applications (e.g., servers). + Upon setting this threshold (e.g., 0.8), the allocator will start reclaiming + GPU memory blocks if the GPU memory capacity usage exceeds the threshold (i.e., + 80% of the total memory allocated to the GPU application). The algorithm prefers + to free old & unused blocks first to avoid freeing blocks that are actively being + reused. The threshold value should be between greater than 0.0 and less than 1.0. .. _cufft-plan-cache: diff --git a/docs/source/notes/extending.rst b/docs/source/notes/extending.rst index ccc76a8a0d55..dbeb135d6e2a 100644 --- a/docs/source/notes/extending.rst +++ b/docs/source/notes/extending.rst @@ -54,7 +54,8 @@ Take the following steps: 1. Subclass :class:`~Function` and implement the :meth:`~Function.forward` and :meth:`~Function.backward` methods. 2. Call the proper methods on the `ctx` argument. -3. Declare whether your function supports double backward. +3. Declare whether your function supports +`double backward `_. 4. Validate whether your gradients are correct using gradcheck. **Step 1:** After subclassing :class:`Function`, you'll need to define 2 methods: @@ -354,7 +355,7 @@ Extending :mod:`torch` with a :class:`Tensor`-like type .. note:: This functionality is inspired by the NumPy ``__array_function__`` protocol. See `the NumPy documentation - `_ + `_ and `NEP-0018 `_ for more details. diff --git a/docs/source/notes/mps.rst b/docs/source/notes/mps.rst new file mode 100644 index 000000000000..6ad44ba97714 --- /dev/null +++ b/docs/source/notes/mps.rst @@ -0,0 +1,40 @@ +.. _MPS-Backend: + +MPS backend +=========== + +:mod:`mps` device enables high-performance +training on GPU for MacOS devices with Metal programming framework. It +introduces a new device to map Machine Learning computational graphs and +primitives on highly efficient Metal Performance Shaders Graph framework and +tuned kernels provided by Metal Performance Shaders framework respectively. + +The new MPS backend extends the PyTorch ecosystem and provides existing scripts +capabilities to setup and run operations on GPU. + +To get started, simply move your Tensor and Module to the ``mps`` device: + +.. 
code:: + + # Make sure the current PyTorch binary was built with MPS enabled + print(torch.backends.mps.is_built()) + # And that the current hardware and MacOS version are sufficient to + # be able to use MPS + print(torch.backends.mps.is_available()) + + mps_device = torch.device("mps") + + # Create a Tensor directly on the mps device + x = torch.ones(5, device=mps_device) + # Or + x = torch.ones(5, device="mps") + + # Any operation happens on the GPU + y = x * 2 + + # Move your model to mps just like any other device + model = YourFavoriteNet() + model.to(mps_device) + + # Now every call runs on the GPU + pred = model(x) diff --git a/docs/source/notes/numerical_accuracy.rst b/docs/source/notes/numerical_accuracy.rst index 49d21c516b96..c952fb1f7c59 100644 --- a/docs/source/notes/numerical_accuracy.rst +++ b/docs/source/notes/numerical_accuracy.rst @@ -10,7 +10,7 @@ In particular, note that floating point provides limited accuracy (about 7 decim for single precision floating point numbers, about 16 decimal digits for double precision floating point numbers) and that floating point addition and multiplication are not associative, so the order of the operations affects the results. -Because of this, pytorch is not guaranteed +Because of this, PyTorch is not guaranteed to produce bitwise identical results for floating point computations that are mathematically identical. Similarly, bitwise identical results are not guaranteed across PyTorch releases, individual commits, or different platforms. In particular, CPU and GPU @@ -20,12 +20,12 @@ the sources of randomness. Batched computations or slice computations ------------------------------------------ -Many operations in pytorch support batched computation, where the same operation is performed +Many operations in PyTorch support batched computation, where the same operation is performed for the elements of the batches of inputs. An example of this is :meth:`torch.mm` and :meth:`torch.bmm`. It is possible to implement batched computation as a loop over batch elements, and apply the necessary math operations to the individual batch elements, for efficiency reasons we are not doing that, and typically perform computation for the whole batch. The mathematical -libraries that we are calling, and pytorch internal implementations of operations can produces +libraries that we are calling, and PyTorch internal implementations of operations can produces slightly different results in this case, compared to non-batched computations. In particular, let ``A`` and ``B`` be 3D tensors with the dimensions suitable for batched matrix multiplication. Then ``(A@B)[0]`` (the first element of the batched result) is not guaranteed to be bitwise @@ -54,7 +54,7 @@ datatype. E.g.: TensorFloat-32(TF32) on Nvidia Ampere devices --------------------------------------------- -On Ampere Nvidia GPUs, pytorch by default uses TensorFloat32 (TF32) to speed up mathematically +On Ampere Nvidia GPUs, PyTorch by default uses TensorFloat32 (TF32) to speed up mathematically intensive operations, in particular matrix multiplications and convolutions. When operation is performed using TF32 tensor cores, only the first 10 bits of the input mantissa are read. 
This leads to less accurate results, and surprising results such as multiplying a matrix by identity matrix produces @@ -72,3 +72,50 @@ If reduced-precision reductions are problematic, they can be turned off with ``torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False`` For more information see :ref:`allow_fp16_reduced_precision_reduction` + +.. _fp16_on_mi200: + +Reduced Precision FP16 and BF16 GEMMs and Convolutions on AMD Instinct MI200 devices +------------------------------------------------------------------------------------ +On AMD Instinct MI200 GPUs, the FP16 and BF16 V_DOT2 and MFMA matrix instructions flush input and output denormal values to zero. FP32 and FP64 MFMA matrix instructions do not flush input and output denormal values to zero. The affected instructions are only used by rocBLAS (GEMM) and MIOpen (convolution) kernels; all other PyTorch operations will not encounter this behavior. All other supported AMD GPUs will not encounter this behavior. + +rocBLAS and MIOpen provide alternate implementations for affected FP16 operations. Alternate implementations for BF16 operations are not provided; BF16 numbers have a larger dynamic range than FP16 numbers and are less likely to encounter denormal values. For the FP16 alternate implementations, FP16 input values are cast to an intermediate BF16 value and then cast back to FP16 output after the accumulate FP32 operations. In this way, the input and output types are unchanged. + +When training using FP16 precision, some models may fail to converge with FP16 denorms flushed to zero. Denormal values more frequently occur in the backward pass of training during gradient calculation. PyTorch by default will use the rocBLAS and MIOpen alternate implementations during the backward pass. The default behavior can be overridden using environment variables, ROCBLAS_INTERNAL_FP16_ALT_IMPL and MIOPEN_DEBUG_CONVOLUTION_ATTRIB_FP16_ALT_IMPL. The behavior of these environment variables is as follows: + ++---------------+-----------+-----------+ +| | forward | backward | ++===============+===========+===========+ +| Env unset | original | alternate | ++---------------+-----------+-----------+ +| Env set to 1 | alternate | alternate | ++---------------+-----------+-----------+ +| Env set to 0 | original | original | ++---------------+-----------+-----------+ + +The following is the list of operations where rocBLAS may be used: + +* torch.addbmm +* torch.addmm +* torch.baddbmm +* torch.bmm +* torch.mm +* torch.nn.GRUCell +* torch.nn.LSTMCell +* torch.nn.Linear +* torch.sparse.addmm +* the following torch._C._ConvBackend implementations: + + * slowNd + * slowNd_transposed + * slowNd_dilated + * slowNd_dilated_transposed + +The following is the list of operations where MIOpen may be used: + +* torch.nn.Conv[Transpose]Nd +* the following torch._C._ConvBackend implementations: + + * ConvBackend::Miopen + * ConvBackend::MiopenDepthwise + * ConvBackend::MiopenTranspose diff --git a/docs/source/onnx.rst b/docs/source/onnx.rst index be78d7d3caa5..5ed8d2aebd0b 100644 --- a/docs/source/onnx.rst +++ b/docs/source/onnx.rst @@ -130,9 +130,9 @@ a :class:`torch.nn.Module`. If the passed-in model is not already a ``ScriptModu of different sizes. To use scripting: * Use :func:`torch.jit.script` to produce a ``ScriptModule``. - * Call ``torch.onnx.export()`` with the ``ScriptModule`` as the model, and set the - ``example_outputs`` arg. This is required so that the types and shapes of the outputs can be - captured without executing the model. 
+ * Call ``torch.onnx.export()`` with the ``ScriptModule`` as the model. The ``args`` are still required, + but they will be used internally only to produce example outputs, so that the types and shapes of the + outputs can be captured. No tracing will be performed. See `Introduction to TorchScript `_ and `TorchScript `_ for more details, including how to compose tracing and scripting to suit the @@ -332,19 +332,32 @@ The process for adding a symbolic function depends on the type of operator. ATen operators ^^^^^^^^^^^^^^ - `ATen `_ is PyTorch’s built-in tensor library. If the operator is an ATen operator (shows up in the TorchScript graph with the prefix -``aten::``): +``aten::``), make sure it is not supported already. + +List of supported operators +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Visit the auto generated :doc:`list of supported ATen operators <../onnx_supported_aten_ops>` +for details on which operator are supported in each ``opset_version``. + +Adding support for an operator +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If the operator is not in the list above: * Define the symbolic function in ``torch/onnx/symbolic_opset.py``, for example `torch/onnx/symbolic_opset9.py `_. Make sure the function has the same name as the ATen function, which may be declared in ``torch/_C/_VariableFunctions.pyi`` or ``torch/nn/functional.pyi`` (these files are generated at build time, so will not appear in your checkout until you build PyTorch). -* The first arg is always the ONNX graph that is being built for export. +* By default, the first arg is the ONNX graph. Other arg names must EXACTLY match the names in the ``.pyi`` file, because dispatch is done with keyword arguments. +* A symbolic function that has a first arg (before the Graph object) with the + type annotation of torch.onnx.SymbolicContext will be called with that additional context. + See examples below. * In the symbolic function, if the operator is in the `ONNX standard operator set `_, we only need to create a node to represent the ONNX operator in the graph. @@ -421,8 +434,8 @@ PythonOp Symbolic Alternatively, you can register a custom symbolic function. This gives the symbolic function access to more info through the -TorchScript ``Node`` object for the original operation, which gets passed in as the second -argument (after the ``Graph`` object). +``torch.onnx.SymbolicContext`` object, which gets passed in as the first +argument (before the ``Graph`` object). All autograd ``Function``\ s appear in the TorchScript graph as ``prim::PythonOp`` nodes. In order to differentiate between different ``Function`` subclasses, the @@ -449,7 +462,8 @@ The example below shows how you can access ``requires_grad`` via the ``Node`` ob ctx.save_for_backward(input) return input.clamp(min=0) - def symbolic_python_op(g: torch._C.Graph, n: torch._C.Node, *args, **kwargs): + def symbolic_python_op(ctx: torch.onnx.SymbolicContext, g: torch._C.Graph, *args, **kwargs): + n = ctx.cur_node print("original node: ", n) for i, out in enumerate(n.outputs()): print("original output {}: {}, requires grad: {}".format(i, out, out.requiresGrad())) @@ -583,10 +597,29 @@ Q: Are lists of Tensors exportable to ONNX? Yes, for ``opset_version`` >= 11, since ONNX introduced the Sequence type in opset 11. +Contributing / developing +------------------------- +`Developer docs `_. + Functions --------------------------- +--------- .. autofunction:: export .. autofunction:: export_to_pretty_string .. autofunction:: register_custom_op_symbolic .. autofunction:: select_model_mode_for_export .. 
autofunction:: is_in_onnx_export +.. autofunction:: is_onnx_log_enabled +.. autofunction:: enable_log +.. autofunction:: disable_log +.. autofunction:: set_log_stream +.. autofunction:: log + +Classes +------- + +.. autosummary:: + :toctree: generated + :nosignatures: + :template: classtemplate.rst + + SymbolicContext diff --git a/docs/source/onnx_supported_aten_ops.rst b/docs/source/onnx_supported_aten_ops.rst new file mode 100644 index 000000000000..d6bf535e2e7e --- /dev/null +++ b/docs/source/onnx_supported_aten_ops.rst @@ -0,0 +1,14 @@ +:orphan: + +ONNX supported ATen operators +============================= + +This file is automatically generated during the documentation build +by cross referencing ONNX operator symbolics with Torch JIT operators via +``docs/source/scripts/build_onnx_supported_aten_op_csv_table.py``. +Do not modify directly and instead `rebuild the docs `_. + +.. csv-table:: Supported ATen operators + :file: ../build/auto_gen_aten_op_list.csv + :widths: 30, 70 + :header-rows: 1 diff --git a/docs/source/optim.rst b/docs/source/optim.rst index 62a293dec5ec..73c4d742900d 100644 --- a/docs/source/optim.rst +++ b/docs/source/optim.rst @@ -16,15 +16,6 @@ To construct an :class:`Optimizer` you have to give it an iterable containing th parameters (all should be :class:`~torch.autograd.Variable` s) to optimize. Then, you can specify optimizer-specific options such as the learning rate, weight decay, etc. -.. note:: - - If you need to move a model to GPU via ``.cuda()``, please do so before - constructing optimizers for it. Parameters of a model after ``.cuda()`` will - be different objects with those before the call. - - In general, you should make sure that optimized parameters live in - consistent locations when optimizers are constructed and used. - Example:: optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9) diff --git a/docs/source/package.rst b/docs/source/package.rst index c7881f196140..9664460ac96a 100644 --- a/docs/source/package.rst +++ b/docs/source/package.rst @@ -1,3 +1,6 @@ +.. automodule:: torch.package +.. py:module:: torch.package.analyze + .. currentmodule:: torch.package torch.package @@ -13,7 +16,7 @@ will help you learn more about ``torch.package`` and how to use it. .. warning:: - This module depends on the ``pickle`` module which is is not secure. Only unpackage data you trust. + This module depends on the ``pickle`` module which is not secure. Only unpackage data you trust. It is possible to construct malicious pickle data which will **execute arbitrary code during unpickling**. Never unpackage data that could have come from an untrusted source, or that could have been tampered with. @@ -376,7 +379,7 @@ API for accessing resources from inside a package. :: with PackageExporter(f) as exporter: - # saves text to one/a.txt in the archive + # saves text to my_resource/a.txt in the archive exporter.save_text("my_resource", "a.txt", "hello world!") # saves the tensor to my_pickle/obj.pkl exporter.save_pickle("my_pickle", "obj.pkl", torch.ones(2, 2)) diff --git a/docs/source/quantization-accuracy-debugging.rst b/docs/source/quantization-accuracy-debugging.rst new file mode 100644 index 000000000000..69bda8706cc6 --- /dev/null +++ b/docs/source/quantization-accuracy-debugging.rst @@ -0,0 +1,98 @@ +Quantization Accuracy Debugging +------------------------------- + +This document provides high level strategies for improving quantization +accuracy. If a quantized model has error compared to the original model, +we can categorize the error into: + +1. 
**data insensitive error** - caused by intrinsic model quantization error, + large portion of input data has large errror +2. **data sensitive error** - caused by outlier input data, small + portion of input data has large error +3. **implementation error** - quantized kernel is not matching reference implementation + +Data insensitive error +~~~~~~~~~~~~~~~~~~~~~~ + +General tips +^^^^^^^^^^^^ + +1. For PTQ, ensure that the data you are calibrating with is representative + of your dataset. For example, for a classification problem a general + guideline is to have multiple samples in every category, and the overall + number of samples should be at least 100. There is no penalty for + calibrating with more data other than calibration time. +2. If your model has Conv-BN or Linear-BN patterns, consider fusing them. + If you are using FX graph mode quantization, this is done automatically + by the workflow. If you are using Eager mode quantization, you can do + this manually with the ``torch.ao.quantization.fuse_modules`` API. +3. Increase the precision of dtype of the problematic ops. Usually, fp32 + will have the highest accuracy, followed by fp16, followed by dynamically + quantized int8, followed by statically quantized int8. + + 1. Note: this is trading off performance for accuracy. + 2. Note: availability of kernels per dtype per op can vary by backend. + 3. Note: dtype conversions add an additional performance cost. For example, + ``fp32_op -> quant -> int8_op -> dequant -> fp32_op -> quant -> int8_op -> dequant`` + will have a performance penalty compared to + ``fp32_op -> fp32_op -> quant -> int8_op -> int8_op -> dequant`` + because of a higher number of required dtype conversions. + +4. If you are using PTQ, consider using QAT to recover some of the accuracy loss + from quantization. + +Int8 quantization tips +^^^^^^^^^^^^^^^^^^^^^^ + +1. If you are using per-tensor weight quantization, consider using per-channel + weight quantization. +2. If you are doing inference on `fbgemm`, ensure that you set the `reduce_range` + argument to `False` if your CPU is Cooperlake or newer, and to `True` otherwise. +3. Audit the input activation distribution variation across different samples. + If this variation is high, the layer may be suitable for dynamic quantization + but not static quantization. + +Data sensitive error +~~~~~~~~~~~~~~~~~~~~ + +If you are using static quantization and a small portion of your input data is +resulting in high quantization error, you can try: + +1. Adjust your calibration dataset to make it more representative of your + inference dataset. +2. Manually inspect (using Numeric Suite) which layers have high quantization + error. For these layers, consider leaving them in floating point or adjusting + the observer settings to choose a better scale and zero_point. + + +Implementation error +~~~~~~~~~~~~~~~~~~~~ + +If you are using PyTorch quantization with your own backend +you may see differences between the reference implementation of an +operation (such as ``dequant -> op_fp32 -> quant``) and the quantized implementation +(such as `op_int8`) of the op on the target hardware. This could mean one of two things: + +1. the differences (usually small) are expected due to specific behavior of + the target kernel on the target hardware compared to fp32/cpu. An example of this + is accumulating in an integer dtype. Unless the kernel guarantees bitwise + equivalency with the reference implementation, this is expected. +2. 
the kernel on the target hardware has an accuracy issue. In this case, reach + out to the kernel developer. + +Numerical Debugging Tooling (prototype) +--------------------------------------- + +.. toctree:: + :hidden: + + torch.ao.ns._numeric_suite + torch.ao.ns._numeric_suite_fx + +.. warning :: + Numerical debugging tooling is early prototype and subject to change. + +* :ref:`torch_ao_ns_numeric_suite` + Eager mode numeric suite +* :ref:`torch_ao_ns_numeric_suite_fx` + FX numeric suite diff --git a/docs/source/quantization-backend-configuration.rst b/docs/source/quantization-backend-configuration.rst new file mode 100644 index 000000000000..07fd875fa9b3 --- /dev/null +++ b/docs/source/quantization-backend-configuration.rst @@ -0,0 +1,20 @@ +Quantization Backend Configuration +---------------------------------- + +FX Graph Mode Quantization allows the user to configure various +quantization behaviors of an op in order to match the expectation +of their backend. + +In the future, this document will contain a detailed spec of +these configurations. + + +Default values for native configurations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Below is the output of the configuration for quantization of ops +in fbgemm and qnnpack (PyTorch's default quantized backends). + +Results: + +.. literalinclude:: scripts/quantization_backend_configs/default_backend_config.txt diff --git a/docs/source/quantization-support.rst b/docs/source/quantization-support.rst index 78c5ea247c48..da6649a2fee3 100644 --- a/docs/source/quantization-support.rst +++ b/docs/source/quantization-support.rst @@ -217,6 +217,8 @@ to configure quantization settings for individual ops. torch.nn.intrinsic ~~~~~~~~~~~~~~~~~~ +.. automodule:: torch.nn.intrinsic +.. automodule:: torch.nn.intrinsic.modules This module implements the combined (fused) modules conv + relu which can then be quantized. @@ -243,6 +245,9 @@ then be quantized. torch.nn.intrinsic.qat ~~~~~~~~~~~~~~~~~~~~~~ +.. automodule:: torch.nn.intrinsic.qat +.. automodule:: torch.nn.intrinsic.qat.modules + This module implements the versions of those fused operations needed for quantization aware training. @@ -268,6 +273,9 @@ quantization aware training. torch.nn.intrinsic.quantized ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. automodule:: torch.nn.intrinsic.quantized +.. automodule:: torch.nn.intrinsic.quantized.modules + This module implements the quantized implementations of fused operations like conv + relu. No BatchNorm variants as it's usually folded into convolution @@ -289,6 +297,8 @@ for inference. torch.nn.intrinsic.quantized.dynamic ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. automodule:: torch.nn.intrinsic.quantized.dynamic +.. automodule:: torch.nn.intrinsic.quantized.dynamic.modules This module implements the quantized dynamic implementations of fused operations like linear + relu. @@ -304,6 +314,8 @@ like linear + relu. torch.nn.qat ~~~~~~~~~~~~~~~~~~~~~~ +.. automodule:: torch.nn.qat +.. automodule:: torch.nn.qat.modules This module implements versions of the key nn modules **Conv2d()** and **Linear()** which run in FP32 but with rounding applied to simulate the @@ -322,6 +334,8 @@ effect of INT8 quantization. torch.nn.qat.dynamic ~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. automodule:: torch.nn.qat.dynamic +.. automodule:: torch.nn.qat.dynamic.modules This module implements versions of the key nn modules such as **Linear()** which run in FP32 but with rounding applied to simulate the effect of INT8 @@ -338,6 +352,8 @@ quantization and will be dynamically quantized during inference. 
torch.nn.quantized ~~~~~~~~~~~~~~~~~~~~~~ +.. automodule:: torch.nn.quantized +.. automodule:: torch.nn.quantized.modules This module implements the quantized versions of the nn layers such as ~`torch.nn.Conv2d` and `torch.nn.ReLU`. @@ -376,6 +392,7 @@ This module implements the quantized versions of the nn layers such as torch.nn.quantized.functional ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. automodule:: torch.nn.quantized.functional This module implements the quantized versions of the functional layers such as ~`torch.nn.functional.conv2d` and `torch.nn.functional.relu`. Note: @@ -413,6 +430,8 @@ This module implements the quantized versions of the functional layers such as torch.nn.quantized.dynamic ~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. automodule:: torch.nn.quantized.dynamic +.. automodule:: torch.nn.quantized.dynamic.modules Dynamically quantized :class:`~torch.nn.Linear`, :class:`~torch.nn.LSTM`, :class:`~torch.nn.LSTMCell`, :class:`~torch.nn.GRUCell`, and @@ -492,3 +511,8 @@ the `custom operator mechanism `_ or fall back to eager mode quantization. - -The following table compares the differences between Eager Mode Quantization and FX Graph Mode Quantization: - -+-----------------+-------------------+-------------------+ -| |Eager Mode |FX Graph | -| |Quantization |Mode | -| | |Quantization | -+-----------------+-------------------+-------------------+ -|Release |beta |prototype | -|Status | | | -+-----------------+-------------------+-------------------+ -|Operator |Manual |Automatic | -|Fusion | | | -+-----------------+-------------------+-------------------+ -|Quant/DeQuant |Manual |Automatic | -|Placement | | | -+-----------------+-------------------+-------------------+ -|Quantizing |Supported |Supported | -|Modules | | | -+-----------------+-------------------+-------------------+ -|Quantizing |Manual |Automatic | -|Functionals/Torch| | | -|Ops | | | -+-----------------+-------------------+-------------------+ -|Support for |Limited Support |Fully | -|Customization | |Supported | -+-----------------+-------------------+-------------------+ -|Quantization Mode|Post Training |Post Training | -|Support |Quantization: |Quantization: | -| |Static, Dynamic, |Static, Dynamic, | -| |Weight Only |Weight Only | -| | | | -| |Quantiztion Aware |Quantiztion Aware | -| |Training: |Training: | -| |Static |Static | -+-----------------+-------------------+-------------------+ -|Input/Output |``torch.nn.Module``|``torch.nn.Module``| -|Model Type | |(May need some | -| | |refactors to make | -| | |the model | -| | |compatible with FX | -| | |Graph Mode | -| | |Quantization) | -+-----------------+-------------------+-------------------+ - - -There are three types of quantization supported: - -1. dynamic quantization (weights quantized with activations read/stored in - floating point and quantized for compute.) -2. static quantization (weights quantized, activations quantized, calibration - required post training) -3. static quantization aware training (weights quantized, activations quantized, - quantization numerics modeled during training) - -Please see our `Introduction to Quantization on Pytorch -`_ blog post -for a more comprehensive overview of the tradeoffs between these quantization -types. - -Operator coverage varies between dynamic and static quantization and is captured in the table below. -Note that for FX quantization, the corresponding functionals are also supported. 
- -+---------------------------+-------------------+--------------------+ -| |Static | Dynamic | -| |Quantization | Quantization | -+---------------------------+-------------------+--------------------+ -| | nn.Linear | | Y | | Y | -| | nn.Conv1d/2d/3d | | Y | | N | -+---------------------------+-------------------+--------------------+ -| | nn.LSTM | | N | | Y | -| | nn.GRU | | N | | Y | -+---------------------------+-------------------+--------------------+ -| | nn.RNNCell | | N | | Y | -| | nn.GRUCell | | N | | Y | -| | nn.LSTMCell | | N | | Y | -+---------------------------+-------------------+--------------------+ -|nn.EmbeddingBag | Y (activations | | -| | are in fp32) | Y | -+---------------------------+-------------------+--------------------+ -|nn.Embedding | Y | N | -+---------------------------+-------------------+--------------------+ -|nn.MultiheadAttention |Not Supported | Not supported | -+---------------------------+-------------------+--------------------+ -|Activations |Broadly supported | Un-changed, | -| | | computations | -| | | stay in fp32 | -+---------------------------+-------------------+--------------------+ - +----------------------------- Eager Mode Quantization ^^^^^^^^^^^^^^^^^^^^^^^ +For a general introduction to the quantization flow, including different types of quantization, please take a look at `General Quantization Flow`_. - -Dynamic Quantization -~~~~~~~~~~~~~~~~~~~~ +Post Training Dynamic Quantization +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This is the simplest to apply form of quantization where the weights are quantized ahead of time but the activations are dynamically quantized during inference. This is used for situations where the model execution time is dominated by loading weights from memory rather than computing the matrix -multiplications. This is true for for LSTM and Transformer type models with +multiplications. This is true for LSTM and Transformer type models with small batch size. Diagram:: @@ -198,16 +98,17 @@ API example:: To learn more about dynamic quantization please see our `dynamic quantization tutorial `_. -Static Quantization -~~~~~~~~~~~~~~~~~~~ +Post Training Static Quantization +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Static quantization quantizes the weights and activations of the model. It +Post Training Static Quantization (PTQ static) quantizes the weights and activations of the model. It fuses activations into preceding layers where possible. It requires calibration with a representative dataset to determine optimal quantization -parameters for activations. Post Training Quantization is typically used when +parameters for activations. Post Training Static Quantization is typically used when both memory bandwidth and compute savings are important with CNNs being a -typical use case. Static quantization is also known as Post Training -Quantization or PTQ. +typical use case. + +We may need to modify the model before applying post training static quantization. Please see `Model Preparation for Eager Mode Static Quantization`_. Diagram:: @@ -288,18 +189,19 @@ API Example:: To learn more about static quantization, please see the `static quantization tutorial `_. -Quantization Aware Training -~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Quantization Aware Training for Static Quantization +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Quantization Aware Training models the effects of quantization during training -allowing for higher accuracy compared to other quantization methods. 
During +Quantization Aware Training (QAT) models the effects of quantization during training +allowing for higher accuracy compared to other quantization methods. We can do QAT for static, dynamic or weight only quantization. During training, all calculations are done in floating point, with fake_quant modules modeling the effects of quantization by clamping and rounding to simulate the effects of INT8. After model conversion, weights and activations are quantized, and activations are fused into the preceding layer where possible. It is commonly used with CNNs and yields a higher accuracy -compared to static quantization. Quantization Aware Training is also known as -QAT. +compared to static quantization. + +We may need to modify the model before applying post training static quantization. Please see `Model Preparation for Eager Mode Static Quantization`_. Diagram:: @@ -383,33 +285,42 @@ To learn more about quantization aware training, please see the `QAT tutorial `_. -(Prototype) FX Graph Mode Quantization -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Quantization types supported by FX Graph Mode can be classified in two ways: - -1. Post Training Quantization (apply quantization after training, quantization parameters are calculated based on sample calibration data) -2. Quantization Aware Training (simulate quantization during training so that the quantization parameters can be learned together with the model using training data) - -And then each of these two may include any or all of the following types: - -- Weight Only Quantization (only weight is statically quantized) -- Dynamic Quantization (weight is statically quantized, activation is dynamically quantized) -- Static Quantization (both weight and activations are statically quantized) +Model Preparation for Eager Mode Static Quantization +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -These two ways of classification are independent, so theoretically we can have 6 different types of quantization. - -The supported quantization types in FX Graph Mode Quantization are: - -- Post Training Quantization +It is necessary to currently make some modifications to the model definition +prior to Eager mode quantization. This is because currently quantization works on a module +by module basis. Specifically, for all quantization techniques, the user needs to: - - Weight Only Quantization - - Dynamic Quantization - - Static Quantization +1. Convert any operations that require output requantization (and thus have + additional parameters) from functionals to module form (for example, + using ``torch.nn.ReLU`` instead of ``torch.nn.functional.relu``). +2. Specify which parts of the model need to be quantized either by assigning + ``.qconfig`` attributes on submodules or by specifying ``qconfig_dict``. + For example, setting ``model.conv1.qconfig = None`` means that the + ``model.conv`` layer will not be quantized, and setting + ``model.linear1.qconfig = custom_qconfig`` means that the quantization + settings for ``model.linear1`` will be using ``custom_qconfig`` instead + of the global qconfig. -- Quantization Aware Training +For static quantization techniques which quantize activations, the user needs +to do the following in addition: - - Static Quantization +1. Specify where activations are quantized and de-quantized. This is done using + :class:`~torch.quantization.QuantStub` and + :class:`~torch.quantization.DeQuantStub` modules. +2. 
Use :class:`torch.nn.quantized.FloatFunctional` to wrap tensor operations + that require special handling for quantization into modules. Examples + are operations like ``add`` and ``cat`` which require special handling to + determine output quantization parameters. +3. Fuse modules: combine operations/modules into a single module to obtain + higher accuracy and performance. This is done using the + :func:`torch.quantization.fuse_modules` API, which takes in lists of modules + to be fused. We currently support the following fusions: + [Conv, Relu], [Conv, BatchNorm], [Conv, BatchNorm, Relu], [Linear, Relu] +(Prototype) FX Graph Mode Quantization +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ There are multiple quantization types in post training quantization (weight only, dynamic and static) and the configuration is done through `qconfig_dict` (an argument of the `prepare_fx` function). @@ -472,30 +383,22 @@ Please see the following tutorials for more information about FX Graph Mode Quan - `FX Graph Mode Post Training Static Quantization `_ - `FX Graph Mode Post Training Dynamic Quantization `_ -Quantization API Reference ---------------------------- - -The :doc:`Quantization API Reference ` contains documentation -of quantization APIs, such as quantization passes, quantized tensor operations, -and supported quantized modules and functions. - -.. toctree:: - :hidden: - - quantization-support - torch.ao.ns._numeric_suite - torch.ao.ns._numeric_suite_fx +Quantization Stack +------------------------ +Quantization is the process to convert a floating point model to a quantized model. So at high level the quantization stack can be split into two parts: 1). The building blocks or abstractions for a quantized model 2). The building blocks or abstractions for the quantization flow that converts a floating point model to a quantized model -Quantized Tensors ---------------------------------------- +Quantized Model +^^^^^^^^^^^^^^^^^^^^^^^ +Quantized Tensor +~~~~~~~~~~~~~~~~~ +In order to do quantization in PyTorch, we need to be able to represent +quantized data in Tensors. A Quantized Tensor allows for storing +quantized data (represented as int8/uint8/int32) along with quantization +parameters like scale and zero\_point. Quantized Tensors allow for many +useful operations making quantized arithmetic easy, in addition to +allowing for serialization of data in a quantized format. -PyTorch supports both per tensor and per channel asymmetric linear -quantization. Per tensor means that all the values within the tensor are -scaled the same way. Per channel means that for each dimension, typically -the channel dimension of a tensor, the values -in the tensor are scaled and offset by a different value (effectively -the scale and offset become vectors). This allows for lesser error in converting tensors -to quantized values. +PyTorch supports both per tensor and per channel symmetric and asymmetric quantization. Per tensor means that all the values within the tensor are quantized the same way with the same quantization parameters. Per channel means that for each dimension, typically the channel dimension of a tensor, the values in the tensor are quantized with different quantization parameters. This allows for less error in converting tensors to quantized values since outlier values would only impact the channel it was in, instead of the entire Tensor. 
The mapping is performed by converting the floating point tensors using @@ -506,35 +409,243 @@ Note that, we ensure that zero in floating point is represented with no error after quantization, thereby ensuring that operations like padding do not cause additional quantization error. -In order to do quantization in PyTorch, we need to be able to represent -quantized data in Tensors. A Quantized Tensor allows for storing -quantized data (represented as int8/uint8/int32) along with quantization -parameters like scale and zero\_point. Quantized Tensors allow for many -useful operations making quantized arithmetic easy, in addition to -allowing for serialization of data in a quantized format. +Here are a few key attributes for quantized Tensor: -Natively supported backends ---------------------------- +* QScheme (torch.qscheme): a enum that specifies the way we quantize the Tensor -Today, PyTorch supports the following backends for running quantized operators efficiently: + * torch.per_tensor_affine + * torch.per_tensor_symmetric + * torch.per_channel_affine + * torch.per_channel_symmetric + +* dtype (torch.dtype): data type of the quantized Tensor + + * torch.quint8 + * torch.qint8 + * torch.qint32 + * torch.float16 -* x86 CPUs with AVX2 support or higher (without AVX2 some operations have - inefficient implementations), via `fbgemm` (``_). -* ARM CPUs (typically found in mobile/embedded devices), via - `qnnpack` (``_). +* quantization parameters (varies based on QScheme): parameters for the chosen way of quantization -The corresponding implementation is chosen automatically based on the PyTorch build mode, though users -have the option to override this by setting `torch.backends.quantization.engine` to `fbgemm` or `qnnpack`. + * torch.per_tensor_affine would have quantization parameters of -.. note:: + * scale (float) + * zero_point (int) + * torch.per_tensor_affine would have quantization parameters of - At the moment PyTorch doesn't provide quantized operator implementations on CUDA - - this is the direction for future work. Move the model to CPU in order to test the - quantized functionality. + * per_channel_scales (list of float) + * per_channel_zero_points (list of int) + * axis (int) - Quantization-aware training (through :class:`~torch.quantization.FakeQuantize`, - which emulates quantized numerics in fp32) supports both CPU and CUDA. +Quantize and Dequantize +~~~~~~~~~~~~~~~~~~~~~~~ +The input and output of a model are floating point Tensors, but activations in the quantized model are quantized, so we need operators to convert between floating point and quantized Tensors. + +* Quantize (float -> quantized) + + * torch.quantize_per_tensor(x, scale, zero_point, dtype) + * torch.quantize_per_channel(x, scales, zero_points, axis, dtype) + * torch.quantize_per_tensor_dynamic(x, dtype, reduce_range) + * to(torch.float16) + +* Dequantize (quantized -> float) + + * quantized_tensor.dequantize() - calling dequantize on a torch.float16 Tensor will convert the Tensor back to torch.float + * torch.dequantize(x) + +Quantized Operators/Modules +~~~~~~~~~~~~~~~~~~~~~~~~~~~ +* Quantized Operator are the operators that takes quantized Tensor as inputs, and outputs a quantized Tensor. +* Quantized Modules are PyTorch Modules that performs quantized operations. They are typically defined for weighted operations like linear and conv. 
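As a small illustration of the building blocks described above, the following sketch creates a quantized Tensor, dequantizes it back to float, and swaps a float ``nn.Linear`` for its dynamically quantized counterpart. The shapes and quantization parameters are arbitrary example values, and running the quantized module requires a build with a quantized engine (fbgemm or qnnpack) available.

.. code-block:: python

    import torch

    # Quantize a float Tensor per tensor: int8 storage plus scale/zero_point.
    x = torch.randn(2, 3)
    qx = torch.quantize_per_tensor(x, scale=0.1, zero_point=0, dtype=torch.qint8)
    print(qx.dtype, qx.q_scale(), qx.q_zero_point())

    # Dequantize back to a float Tensor.
    print(qx.dequantize())

    # A quantized module: replace nn.Linear with a dynamically quantized version
    # (int8 weights, activations quantized on the fly during inference).
    float_model = torch.nn.Sequential(torch.nn.Linear(3, 3), torch.nn.ReLU())
    qmodel = torch.quantization.quantize_dynamic(
        float_model, {torch.nn.Linear}, dtype=torch.qint8
    )
    print(qmodel(torch.randn(1, 3)))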
+
+Quantized Engine
+~~~~~~~~~~~~~~~~~~~~
+When a quantized model is executed, the qengine (torch.backends.quantized.engine) specifies which backend is to be used for execution. It is important to ensure that the qengine is compatible with the quantized model in terms of the value range of quantized activations and weights.
+
+Quantization Flow
+^^^^^^^^^^^^^^^^^^^^^^^
+Observer and FakeQuantize
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+* Observers are PyTorch Modules used to:
+
+  * collect tensor statistics like the min value and max value of the Tensor passing through the observer
+  * and calculate quantization parameters based on the collected tensor statistics
+* FakeQuantize modules are PyTorch Modules used to:
+
+  * simulate quantization (performing quantize/dequantize) for a Tensor in the network
+  * they can calculate quantization parameters based on the statistics collected by observers, or learn the quantization parameters as well
+
+QConfig
+~~~~~~~~~~~
+* QConfig is a namedtuple of Observer or FakeQuantize Module classes that are configurable with qscheme, dtype etc. It is used to configure how an operator should be observed
+
+  * Quantization configuration for an operator/module
+
+    * different types of Observer/FakeQuantize
+    * dtype
+    * qscheme
+    * quant_min/quant_max: can be used to simulate lower precision Tensors
+  * Currently supports configuration for activation and weight
+  * We insert input/weight/output observers based on the qconfig that is configured for a given operator or module
+
+General Quantization Flow
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+In general, the flow is the following:
+
+* prepare
+
+  * insert Observer/FakeQuantize modules based on the user-specified qconfig
+
+* calibrate/train (depending on post training quantization or quantization aware training)
+
+  * allow Observers to collect statistics or FakeQuantize modules to learn the quantization parameters
+
+* convert
+
+  * convert a calibrated/trained model to a quantized model
+
+There are different modes of quantization, and they can be classified in two ways.
+
+In terms of where we apply the quantization flow, we have:
+
+1. Post Training Quantization (apply quantization after training; quantization parameters are calculated based on sample calibration data)
+2. Quantization Aware Training (simulate quantization during training so that the quantization parameters can be learned together with the model using training data)
+
+And in terms of how we quantize the operators, we can have:
+
+- Weight Only Quantization (only the weight is statically quantized)
+- Dynamic Quantization (weight is statically quantized, activation is dynamically quantized)
+- Static Quantization (both weight and activations are statically quantized)
+
+We can mix different ways of quantizing operators in the same quantization flow. For example, we can have post training quantization that has both statically and dynamically quantized operators.
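As a minimal sketch of this prepare/calibrate/convert flow, using eager mode post training static quantization and assuming the default ``fbgemm`` qconfig is available on the machine::

    import torch

    class M(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.quant = torch.ao.quantization.QuantStub()
            self.conv = torch.nn.Conv2d(1, 1, 1)
            self.relu = torch.nn.ReLU()
            self.dequant = torch.ao.quantization.DeQuantStub()

        def forward(self, x):
            x = self.quant(x)          # float -> quantized
            x = self.relu(self.conv(x))
            return self.dequant(x)     # quantized -> float

    model_fp32 = M().eval()
    model_fp32.qconfig = torch.ao.quantization.get_default_qconfig('fbgemm')

    # prepare: insert Observers based on the qconfig
    prepared = torch.ao.quantization.prepare(model_fp32)

    # calibrate: run representative data so the Observers can collect statistics
    prepared(torch.randn(4, 1, 8, 8))

    # convert: swap observed modules for quantized modules
    model_int8 = torch.ao.quantization.convert(prepared)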
+
+Quantization Support Matrix
+--------------------------------------
+Quantization Mode Support
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
++-----------------------------+------------------------------------------------------+----------------+----------------+------------+-----------------+
+|                             |Quantization                                          |Dataset         | Works Best For | Accuracy   | Notes           |
+|                             |Mode                                                  |Requirement     |                |            |                 |
++-----------------------------+---------------------------------+--------------------+----------------+----------------+------------+-----------------+
+|Post Training Quantization   |Dynamic/Weight Only Quantization |activation          |None            |LSTM, MLP,      |good        |Easy to use,     |
+|                             |                                 |dynamically         |                |Embedding,      |            |close to static  |
+|                             |                                 |quantized (fp16,    |                |Transformer     |            |quantization when|
+|                             |                                 |int8) or not        |                |                |            |performance is   |
+|                             |                                 |quantized, weight   |                |                |            |compute or memory|
+|                             |                                 |statically quantized|                |                |            |bound due to     |
+|                             |                                 |(fp16, int8, int4)  |                |                |            |weights          |
+|                             +---------------------------------+--------------------+----------------+----------------+------------+-----------------+
+|                             |Static Quantization              |activation and      |calibration     |CNN             |good        |Provides best    |
+|                             |                                 |weights statically  |dataset         |                |            |perf, may have   |
+|                             |                                 |quantized (int8)    |                |                |            |big impact on    |
+|                             |                                 |                    |                |                |            |accuracy, good   |
+|                             |                                 |                    |                |                |            |for hardware     |
+|                             |                                 |                    |                |                |            |that only support|
+|                             |                                 |                    |                |                |            |int8 computation |
++-----------------------------+---------------------------------+--------------------+----------------+----------------+------------+-----------------+
+|                             |Dynamic Quantization             |activation and      |fine-tuning     |MLP, Embedding  |best        |Limited support  |
+|                             |                                 |weight are fake     |dataset         |                |            |for now          |
+|                             |                                 |quantized           |                |                |            |                 |
+|                             +---------------------------------+--------------------+----------------+----------------+------------+-----------------+
+|                             |Static Quantization              |activation and      |fine-tuning     |CNN, MLP,       |best        |Typically used   |
+|                             |                                 |weight are fake     |dataset         |Embedding       |            |when static      |
+|                             |                                 |quantized           |                |                |            |quantization     |
+|                             |                                 |                    |                |                |            |leads to bad     |
+|                             |                                 |                    |                |                |            |accuracy, and    |
+|                             |                                 |                    |                |                |            |used to close the|
+|                             |                                 |                    |                |                |            |accuracy gap     |
+|Quantization Aware Training  |                                 |                    |                |                |            |                 |
++-----------------------------+---------------------------------+--------------------+----------------+----------------+------------+-----------------+
+
+Please see our `Introduction to Quantization on Pytorch
+`_ blog post
+for a more comprehensive overview of the tradeoffs between these quantization
+types.
+
+Quantization Flow Support
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+PyTorch provides two modes of quantization: Eager Mode Quantization and FX Graph Mode Quantization.
+
+Eager Mode Quantization is a beta feature. Users need to do fusion and specify where quantization and dequantization happen manually; it also only supports modules, not functionals.
+
+FX Graph Mode Quantization is an automated quantization framework in PyTorch, and currently it's a prototype feature. It improves upon Eager Mode Quantization by adding support for functionals and automating the quantization process, although people might need to refactor the model to make it compatible with FX Graph Mode Quantization (symbolically traceable with ``torch.fx``). Note that FX Graph Mode Quantization is not expected to work on arbitrary models since the model might not be symbolically traceable. We will integrate it into domain libraries like torchvision, and users will be able to quantize models similar to the ones in supported domain libraries with FX Graph Mode Quantization.
For arbitrary models we'll provide general guidelines, but to actually make it work, users might need to be familiar with ``torch.fx``, especially on how to make a model symbolically traceable. + +New users of quantization are encouraged to try out FX Graph Mode Quantization first, if it does not work, user may try to follow the guideline of `using FX Graph Mode Quantization `_ or fall back to eager mode quantization. + +The following table compares the differences between Eager Mode Quantization and FX Graph Mode Quantization: + ++-----------------+-------------------+-------------------+ +| |Eager Mode |FX Graph | +| |Quantization |Mode | +| | |Quantization | ++-----------------+-------------------+-------------------+ +|Release |beta |prototype | +|Status | | | ++-----------------+-------------------+-------------------+ +|Operator |Manual |Automatic | +|Fusion | | | ++-----------------+-------------------+-------------------+ +|Quant/DeQuant |Manual |Automatic | +|Placement | | | ++-----------------+-------------------+-------------------+ +|Quantizing |Supported |Supported | +|Modules | | | ++-----------------+-------------------+-------------------+ +|Quantizing |Manual |Automatic | +|Functionals/Torch| | | +|Ops | | | ++-----------------+-------------------+-------------------+ +|Support for |Limited Support |Fully | +|Customization | |Supported | ++-----------------+-------------------+-------------------+ +|Quantization Mode|Post Training |Post Training | +|Support |Quantization: |Quantization: | +| |Static, Dynamic, |Static, Dynamic, | +| |Weight Only |Weight Only | +| | | | +| |Quantization Aware |Quantization Aware | +| |Training: |Training: | +| |Static |Static | ++-----------------+-------------------+-------------------+ +|Input/Output |``torch.nn.Module``|``torch.nn.Module``| +|Model Type | |(May need some | +| | |refactors to make | +| | |the model | +| | |compatible with FX | +| | |Graph Mode | +| | |Quantization) | ++-----------------+-------------------+-------------------+ + +Backend/Hardware Support +^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++-----------------+---------------+------------+------------+------------+ +|Hardware |Kernel Library |Eager Mode |FX Graph |Quantization| +| | |Quantization|Mode |Mode Support| +| | | |Quantization| | ++-----------------+---------------+------------+------------+------------+ +|server CPU |fbgemm |Supported |All | +| | | |Supported | ++-----------------+---------------+ | + +|mobile CPU |qnnpack/xnnpack| | | +| | | | | ++-----------------+---------------+------------+------------+------------+ +|server GPU |TensorRT (early|Not support |Supported |Static | +| |prototype) |this it | |Quantization| +| | |requries a | | | +| | |graph | | | ++-----------------+---------------+------------+------------+------------+ + +Today, PyTorch supports the following backends for running quantized operators efficiently: + +* x86 CPUs with AVX2 support or higher (without AVX2 some operations have inefficient implementations), via `fbgemm `_ +* ARM CPUs (typically found in mobile/embedded devices), via `qnnpack `_ +* (early prototype) support for NVidia GPU via `TensorRT `_ through `fx2trt` (to be open sourced) + + +Note for native CPU backends +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +We expose both `fbgemm` and `qnnpack` with the same native pytorch quantized operators, so we need additional flag to distinguish between them. 
The corresponding implementation of `fbgemm` and `qnnpack` is chosen automatically based on the PyTorch build mode, though users have the option to override this by setting `torch.backends.quantization.engine` to `fbgemm` or `qnnpack`. When preparing a quantized model, it is necessary to ensure that qconfig and the engine used for quantized computations match the backend on which @@ -561,6 +672,74 @@ Default settings for qnnpack:: # set the qengine to control weight packing torch.backends.quantized.engine = 'qnnpack' +Operator Support +^^^^^^^^^^^^^^^^^^^^ + +Operator coverage varies between dynamic and static quantization and is captured in the table below. +Note that for FX Graph Mode Quantization, the corresponding functionals are also supported. + ++---------------------------+-------------------+--------------------+ +| |Static | Dynamic | +| |Quantization | Quantization | ++---------------------------+-------------------+--------------------+ +| | nn.Linear | | Y | | Y | +| | nn.Conv1d/2d/3d | | Y | | N | ++---------------------------+-------------------+--------------------+ +| | nn.LSTM | | N | | Y | +| | nn.GRU | | N | | Y | ++---------------------------+-------------------+--------------------+ +| | nn.RNNCell | | N | | Y | +| | nn.GRUCell | | N | | Y | +| | nn.LSTMCell | | N | | Y | ++---------------------------+-------------------+--------------------+ +|nn.EmbeddingBag | Y (activations | | +| | are in fp32) | Y | ++---------------------------+-------------------+--------------------+ +|nn.Embedding | Y | N | ++---------------------------+-------------------+--------------------+ +|nn.MultiheadAttention |Not Supported | Not supported | ++---------------------------+-------------------+--------------------+ +|Activations |Broadly supported | Un-changed, | +| | | computations | +| | | stay in fp32 | ++---------------------------+-------------------+--------------------+ + +Note: this will be updated with some information generated from native backend_config_dict soon. + +Quantization API Reference +--------------------------- + +The :doc:`Quantization API Reference ` contains documentation +of quantization APIs, such as quantization passes, quantized tensor operations, +and supported quantized modules and functions. + +.. toctree:: + :hidden: + + quantization-support + +Quantization Backend Configuration +---------------------------------- + +The :doc:`Quantization Backend Configuration ` contains documentation +on how to configure the quantization workflows for various backends. + +.. toctree:: + :hidden: + + quantization-backend-configuration + +Quantization Accuracy Debugging +------------------------------- + +The :doc:`Quantization Accuracy Debugging ` contains documentation +on how to debug quantization accuracy. + +.. toctree:: + :hidden: + + quantization-accuracy-debugging + Quantization Customizations --------------------------- @@ -710,46 +889,14 @@ Example:: mq = torch.quantization.quantize_fx.convert_fx( mp, convert_custom_config_dict=convert_custom_config_dict) -Model Preparation for Quantization (Eager Mode) ------------------------------------------------ - -It is necessary to currently make some modifications to the model definition -prior to Eager mode quantization. This is because currently quantization works on a module -by module basis. Specifically, for all quantization techniques, the user needs to: - -1. 
Convert any operations that require output requantization (and thus have
-   additional parameters) from functionals to module form (for example,
-   using ``torch.nn.ReLU`` instead of ``torch.nn.functional.relu``).
-2. Specify which parts of the model need to be quantized either by assigning
-   ``.qconfig`` attributes on submodules or by specifying ``qconfig_dict``.
-   For example, setting ``model.conv1.qconfig = None`` means that the
-   ``model.conv`` layer will not be quantized, and setting
-   ``model.linear1.qconfig = custom_qconfig`` means that the quantization
-   settings for ``model.linear1`` will be using ``custom_qconfig`` instead
-   of the global qconfig.
-
-For static quantization techniques which quantize activations, the user needs
-to do the following in addition:
-
-1. Specify where activations are quantized and de-quantized. This is done using
-   :class:`~torch.quantization.QuantStub` and
-   :class:`~torch.quantization.DeQuantStub` modules.
-2. Use :class:`torch.nn.quantized.FloatFunctional` to wrap tensor operations
-   that require special handling for quantization into modules. Examples
-   are operations like ``add`` and ``cat`` which require special handling to
-   determine output quantization parameters.
-3. Fuse modules: combine operations/modules into a single module to obtain
-   higher accuracy and performance. This is done using the
-   :func:`torch.quantization.fuse_modules` API, which takes in lists of modules
-   to be fused. We currently support the following fusions:
-   [Conv, Relu], [Conv, BatchNorm], [Conv, BatchNorm, Relu], [Linear, Relu]
-
 Best Practices
 --------------
-1. Set the ``reduce_range`` argument on observers to `True` if you are using the
-   ``fbgemm`` backend. This argument prevents overflow on some int8 instructions
-   by reducing the range of quantized data type by 1 bit.
+1. If you are using the ``fbgemm`` backend, only 7 of the 8 bits of the quantized data type should be used.
+   Make sure you reduce the range for ``quant_min`` and ``quant_max``, e.g.:
+
+   * if ``dtype`` is ``torch.quint8``, set a custom ``quant_min`` of ``0`` and ``quant_max`` of ``127`` (``255`` / ``2``)
+   * if ``dtype`` is ``torch.qint8``, set a custom ``quant_min`` of ``-64`` (``-128`` / ``2``) and ``quant_max`` of ``63`` (``127`` / ``2``)
+
+   This is already set correctly if you call the `torch.ao.quantization.get_default_qconfig(backend)` or
+   `torch.ao.quantization.get_default_qat_qconfig(backend)` function to get the default ``qconfig`` for the
+   ``fbgemm`` or ``qnnpack`` backend.
 Common Errors
 ---------------------------------------
@@ -873,13 +1020,29 @@ An example:: b.seek(0) scripted_quantized = torch.jit.load(b)
-Numerical Debugging (prototype)
--------------------------------
-
-.. warning ::
-   Numerical debugging tooling is early prototype and subject to change.
- -* :ref:`torch_ao_ns_numeric_suite` - Eager mode numeric suite -* :ref:`torch_ao_ns_numeric_suite_fx` - FX numeric suite +Symbolic Trace Error when using FX Graph Mode Quantization +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Symbolic traceability is a requirement for `(Prototype) FX Graph Mode Quantization`_, so if you pass a PyTorch Model that is not symbolically traceable to `torch.ao.quantization.prepare_fx` or `torch.ao.quantization.prepare_qat_fx`, we might see an error like the following:: + + torch.fx.proxy.TraceError: symbolically traced variables cannot be used as inputs to control flow + +Please take a look at `Limitations of Symbolic Tracing `_ and use - `User Guide on Using FX Graph Mode Quantization `_ to workaround the problem. + + +.. torch.ao is missing documentation. Since part of it is mentioned here, adding them here for now. +.. They are here for tracking purposes until they are more permanently fixed. +.. py:module:: torch.ao +.. py:module:: torch.ao.nn +.. py:module:: torch.ao.nn.sparse +.. py:module:: torch.ao.nn.sparse.quantized +.. py:module:: torch.ao.nn.sparse.quantized.dynamic +.. py:module:: torch.ao.ns +.. py:module:: torch.ao.ns.fx +.. py:module:: torch.ao.quantization +.. py:module:: torch.ao.quantization.fx +.. py:module:: torch.ao.quantization.backend_config +.. py:module:: torch.ao.sparsity +.. py:module:: torch.ao.sparsity.experimental +.. py:module:: torch.ao.sparsity.experimental.pruner +.. py:module:: torch.ao.sparsity.scheduler +.. py:module:: torch.ao.sparsity.sparsifier diff --git a/docs/source/rpc.rst b/docs/source/rpc.rst index 2e801f3b69ce..89f146bfd68e 100644 --- a/docs/source/rpc.rst +++ b/docs/source/rpc.rst @@ -190,6 +190,18 @@ Example:: :members: :inherited-members: +.. note :: + The RPC framework does not automatically retry any + :meth:`~torch.distributed.rpc.rpc_sync`, + :meth:`~torch.distributed.rpc.rpc_async` and + :meth:`~torch.distributed.rpc.remote` calls. The reason being that there is + no way the RPC framework can determine whether an operation is idempotent or + not and whether it is safe to retry. As a result, it is the application's + responsibility to deal with failures and retry if necessary. RPC communication + is based on TCP and as a result failures could happen due to network failures + or intermittent network connectivity issues. In such scenarios, the application + needs to retry appropriately with reasonable backoffs to ensure the network + isn't overwhelmed by aggressive retries. .. _rref: diff --git a/docs/source/scripts/build_onnx_supported_aten_op_csv_table.py b/docs/source/scripts/build_onnx_supported_aten_op_csv_table.py new file mode 100644 index 000000000000..7d12a441c440 --- /dev/null +++ b/docs/source/scripts/build_onnx_supported_aten_op_csv_table.py @@ -0,0 +1,21 @@ +""" +This script generates a CSV table with all ATen operators +supported by `torch.onnx.export`. The generated table is included by +docs/source/onnx_supported_aten_list.rst. 
+""" + +import os +from torch.onnx import onnx_supported_ops + +# Constants +BUILD_DIR = 'build' +AUTO_GEN_ATEN_OPS_CSV_FILE = 'auto_gen_aten_op_list.csv' + +os.makedirs(BUILD_DIR, exist_ok=True) + +aten_list = onnx_supported_ops.onnx_supported_ops() + +with open(os.path.join(BUILD_DIR, AUTO_GEN_ATEN_OPS_CSV_FILE), 'w') as f: + f.write('Operator,opset_version(s)\n') + for name, opset_version in aten_list: + f.write(f'"``{name}``","{opset_version}"\n') diff --git a/docs/source/scripts/build_quantization_configs.py b/docs/source/scripts/build_quantization_configs.py new file mode 100644 index 000000000000..6ab4fd433eff --- /dev/null +++ b/docs/source/scripts/build_quantization_configs.py @@ -0,0 +1,62 @@ +""" +This script will generate default values of quantization configs. +These are for use in the documentation. +""" + +import torch +from torch.ao.quantization.backend_config import get_native_backend_config_dict +from torch.ao.quantization.backend_config.utils import ( + entry_to_pretty_str, + remove_boolean_dispatch_from_name, +) +import os.path + + +# Create a directory for the images, if it doesn't exist +QUANTIZATION_BACKEND_CONFIG_IMAGE_PATH = os.path.join( + os.path.realpath(os.path.join(__file__, "..")), + "quantization_backend_configs" +) + +if not os.path.exists(QUANTIZATION_BACKEND_CONFIG_IMAGE_PATH): + os.mkdir(QUANTIZATION_BACKEND_CONFIG_IMAGE_PATH) + +output_path = os.path.join(QUANTIZATION_BACKEND_CONFIG_IMAGE_PATH, "default_backend_config.txt") + +with open(output_path, "w") as f: + native_backend_config_dict = get_native_backend_config_dict() + + configs = native_backend_config_dict['configs'] + + def _sort_key_func(entry): + pattern = entry['pattern'] + while isinstance(pattern, tuple): + pattern = pattern[-1] + + pattern = remove_boolean_dispatch_from_name(pattern) + if not isinstance(pattern, str): + # methods are already strings + pattern = torch.typename(pattern) + + # we want + # + # torch.nn.modules.pooling.AdaptiveAvgPool1d + # + # and + # + # torch._VariableFunctionsClass.adaptive_avg_pool1d + # + # to be next to each other, so convert to all lower case + # and remove the underscores, and compare the last part + # of the string + pattern_str_normalized = pattern.lower().replace('_', '') + key = pattern_str_normalized.split('.')[-1] + return key + + configs.sort(key=_sort_key_func) + + entries = [] + for entry in configs: + entries.append(entry_to_pretty_str(entry)) + entries = ",\n".join(entries) + f.write(entries) diff --git a/docs/source/sparse.rst b/docs/source/sparse.rst index 178e4cb18603..564df4ef4323 100644 --- a/docs/source/sparse.rst +++ b/docs/source/sparse.rst @@ -1,3 +1,5 @@ +.. automodule:: torch.sparse + .. currentmodule:: torch .. _sparse-docs: diff --git a/docs/source/special.rst b/docs/source/special.rst index 1aa24242fad9..42acd2148a6a 100644 --- a/docs/source/special.rst +++ b/docs/source/special.rst @@ -7,8 +7,6 @@ torch.special The torch.special module, modeled after SciPy's `special `_ module. .. automodule:: torch.special - :noindex: - .. currentmodule:: torch.special Functions @@ -39,6 +37,7 @@ Functions .. autofunction:: multigammaln .. autofunction:: ndtr .. autofunction:: ndtri +.. autofunction:: log_ndtr .. autofunction:: round .. autofunction:: sinc .. 
autofunction:: softmax diff --git a/docs/source/storage.rst b/docs/source/storage.rst index 3aeec082b607..747acf11ed36 100644 --- a/docs/source/storage.rst +++ b/docs/source/storage.rst @@ -1,87 +1,96 @@ torch.Storage =================================== -A :class:`torch.Storage` is a contiguous, one-dimensional array of a single -data type. +A :class:`torch._TypedStorage` is a contiguous, one-dimensional array of +elements of a particular :class:`torch.dtype`. It can be given any +:class:`torch.dtype`, and the internal data will be interpretted appropriately. -Every :class:`torch.Tensor` has a corresponding storage of the same data type. +Every strided :class:`torch.Tensor` contains a :class:`torch._TypedStorage`, +which stores all of the data that the :class:`torch.Tensor` views. -.. autoclass:: torch.DoubleStorage +For backward compatibility, there are also :class:`torch.Storage` classes +(like :class:`torch.FloatStorage`, :class:`torch.IntStorage`, etc). These +classes are not actually instantiated, and calling their constructors creates +a :class:`torch._TypedStorage` with the appropriate :class:`torch.dtype`. +:class:`torch.Storage` classes have all of the same class methods that +:class:`torch._TypedStorage` has. + +Also for backward compatibility, :class:`torch.Storage` is an alias for the +storage class that corresponds with the default data type +(:func:`torch.get_default_dtype()`). For instance, if the default data type is +:attr:`torch.float`, :class:`torch.Storage` resolves to +:class:`torch.FloatStorage`. + + +.. autoclass:: torch._TypedStorage :members: :undoc-members: :inherited-members: +.. autoclass:: torch.DoubleStorage + :members: + :undoc-members: + .. autoclass:: torch.FloatStorage :members: :undoc-members: - :inherited-members: .. autoclass:: torch.HalfStorage :members: :undoc-members: - :inherited-members: .. autoclass:: torch.LongStorage :members: :undoc-members: - :inherited-members: .. autoclass:: torch.IntStorage :members: :undoc-members: - :inherited-members: .. autoclass:: torch.ShortStorage :members: :undoc-members: - :inherited-members: .. autoclass:: torch.CharStorage :members: :undoc-members: - :inherited-members: .. autoclass:: torch.ByteStorage :members: :undoc-members: - :inherited-members: .. autoclass:: torch.BoolStorage :members: :undoc-members: - :inherited-members: .. autoclass:: torch.BFloat16Storage :members: :undoc-members: - :inherited-members: .. autoclass:: torch.ComplexDoubleStorage :members: :undoc-members: - :inherited-members: .. autoclass:: torch.ComplexFloatStorage :members: :undoc-members: - :inherited-members: .. autoclass:: torch.QUInt8Storage :members: :undoc-members: - :inherited-members: .. autoclass:: torch.QInt8Storage :members: :undoc-members: - :inherited-members: .. autoclass:: torch.QInt32Storage :members: :undoc-members: - :inherited-members: .. autoclass:: torch.QUInt4x2Storage :members: :undoc-members: - :inherited-members: + +.. autoclass:: torch.QUInt2x4Storage + :members: + :undoc-members: diff --git a/docs/source/tensor_attributes.rst b/docs/source/tensor_attributes.rst index e62eb29f6d4d..aa68e8f805fe 100644 --- a/docs/source/tensor_attributes.rst +++ b/docs/source/tensor_attributes.rst @@ -12,7 +12,7 @@ Each ``torch.Tensor`` has a :class:`torch.dtype`, :class:`torch.device`, and :cl torch.dtype ----------- -.. class:: torch.dtype +.. class:: dtype A :class:`torch.dtype` is an object that represents the data type of a :class:`torch.Tensor`. 
PyTorch has twelve different data types: @@ -134,7 +134,7 @@ Casting Examples:: torch.device ------------ -.. class:: torch.device +.. class:: device A :class:`torch.device` is an object representing the device on which a :class:`torch.Tensor` is or will be allocated. @@ -204,7 +204,7 @@ Via a string and device ordinal: torch.layout ------------ -.. class:: torch.layout +.. class:: layout .. warning:: The ``torch.layout`` class is in beta and subject to change. @@ -236,7 +236,7 @@ For more information on ``torch.sparse_coo`` tensors, see :ref:`sparse-docs`. torch.memory_format ------------------- -.. class:: torch.memory_format +.. class:: memory_format A :class:`torch.memory_format` is an object representing the memory format on which a :class:`torch.Tensor` is or will be allocated. diff --git a/docs/source/tensorboard.rst b/docs/source/tensorboard.rst index d3205e3ba589..8cd138369288 100644 --- a/docs/source/tensorboard.rst +++ b/docs/source/tensorboard.rst @@ -1,5 +1,6 @@ torch.utils.tensorboard =================================== +.. automodule:: torch.utils.tensorboard Before going further, more details on TensorBoard can be found at https://www.tensorflow.org/tensorboard/ diff --git a/docs/source/tensors.rst b/docs/source/tensors.rst index 090824e0ee3c..e88c382df17e 100644 --- a/docs/source/tensors.rst +++ b/docs/source/tensors.rst @@ -21,8 +21,8 @@ Data type dtype 64-bit floating point ``torch.float64`` or ``torch.double`` :class:`torch.DoubleTensor` :class:`torch.cuda.DoubleTensor` 16-bit floating point [1]_ ``torch.float16`` or ``torch.half`` :class:`torch.HalfTensor` :class:`torch.cuda.HalfTensor` 16-bit floating point [2]_ ``torch.bfloat16`` :class:`torch.BFloat16Tensor` :class:`torch.cuda.BFloat16Tensor` -32-bit complex ``torch.complex32`` -64-bit complex ``torch.complex64`` +32-bit complex ``torch.complex32`` or ``torch.chalf`` +64-bit complex ``torch.complex64`` or ``torch.cfloat`` 128-bit complex ``torch.complex128`` or ``torch.cdouble`` 8-bit integer (unsigned) ``torch.uint8`` :class:`torch.ByteTensor` :class:`torch.cuda.ByteTensor` 8-bit integer (signed) ``torch.int8`` :class:`torch.CharTensor` :class:`torch.cuda.CharTensor` @@ -32,7 +32,7 @@ Data type dtype Boolean ``torch.bool`` :class:`torch.BoolTensor` :class:`torch.cuda.BoolTensor` quantized 8-bit integer (unsigned) ``torch.quint8`` :class:`torch.ByteTensor` / quantized 8-bit integer (signed) ``torch.qint8`` :class:`torch.CharTensor` / -quantized 32-bit integer (signed) ``torch.qfint32`` :class:`torch.IntTensor` / +quantized 32-bit integer (signed) ``torch.qint32`` :class:`torch.IntTensor` / quantized 4-bit integer (unsigned) [3]_ ``torch.quint4x2`` :class:`torch.ByteTensor` / ======================================= =========================================== ============================= ================================ @@ -315,6 +315,9 @@ Tensor class reference Tensor.cumprod_ Tensor.cumsum Tensor.cumsum_ + Tensor.chalf + Tensor.cfloat + Tensor.cdouble Tensor.data_ptr Tensor.deg2rad Tensor.dequantize @@ -416,6 +419,8 @@ Tensor class reference Tensor.index_fill Tensor.index_put_ Tensor.index_put + Tensor.index_reduce_ + Tensor.index_reduce Tensor.index_select Tensor.indices Tensor.inner @@ -593,6 +598,8 @@ Tensor class reference Tensor.scatter_ Tensor.scatter_add_ Tensor.scatter_add + Tensor.scatter_reduce_ + Tensor.scatter_reduce Tensor.select Tensor.select_scatter Tensor.set_ @@ -618,7 +625,6 @@ Tensor class reference Tensor.size Tensor.slogdet Tensor.slice_scatter - Tensor.solve Tensor.sort Tensor.split 
Tensor.sparse_mask diff --git a/docs/source/testing.rst b/docs/source/testing.rst index 213e82b9c4ca..d1a63f645dfc 100644 --- a/docs/source/testing.rst +++ b/docs/source/testing.rst @@ -1,11 +1,6 @@ torch.testing ============= -.. warning:: - - This module is a beta release, and its interfaces and functionality may change without warning in future - PyTorch releases. - .. automodule:: torch.testing .. autofunction:: assert_close diff --git a/docs/source/torch.overrides.rst b/docs/source/torch.overrides.rst index 0630b60c4b17..ce3583afa71e 100644 --- a/docs/source/torch.overrides.rst +++ b/docs/source/torch.overrides.rst @@ -14,6 +14,8 @@ Functions .. autofunction:: get_overridable_functions +.. autofunction:: resolve_name + .. autofunction:: get_testing_overrides .. autofunction:: handle_torch_function diff --git a/docs/source/torch.rst b/docs/source/torch.rst index 1d3f3ce85b2c..6c71331440ea 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -1,13 +1,6 @@ torch ===== -The torch package contains data structures for multi-dimensional -tensors and defines mathematical operations over these tensors. -Additionally, it provides many utilities for efficient serializing of -Tensors and arbitrary types, and other useful utilities. - -It has a CUDA counterpart, that enables you to run your tensor computations -on an NVIDIA GPU with compute capability >= 3.0 - +.. automodule:: torch .. currentmodule:: torch Tensors @@ -58,6 +51,7 @@ Creation Ops as_tensor as_strided from_numpy + from_dlpack frombuffer zeros zeros_like @@ -102,6 +96,7 @@ Indexing, Slicing, Joining, Mutating Ops hstack index_add index_copy + index_reduce index_select masked_select movedim @@ -117,6 +112,7 @@ Indexing, Slicing, Joining, Mutating Ops select_scatter slice_scatter scatter_add + scatter_reduce split squeeze stack @@ -582,7 +578,6 @@ BLAS and LAPACK Operations outer pinverse qr - solve svd svd_lowrank pca_lowrank @@ -609,7 +604,24 @@ Utilities is_deterministic_algorithms_warn_only_enabled set_deterministic_debug_mode get_deterministic_debug_mode + set_float32_matmul_precision + get_float32_matmul_precision set_warn_always is_warn_always_enabled vmap _assert + + +.. Empty submodules added only for tracking. +.. py:module:: torch.contrib +.. py:module:: torch.utils.backcompat + +.. This submodule is split manually without a top level page. +.. py:module:: torch.utils + +.. This module is only used internally for ROCm builds. +.. py:module:: torch.utils.hipify + +.. This module needs to be documented. Adding here in the meantime +.. for tracking purposes +.. 
py:module:: torch.utils.model_dump diff --git a/ios/LibTorch-Lite.podspec b/ios/LibTorch-Lite.podspec index f3ccaa43e932..d2d9264e0a62 100644 --- a/ios/LibTorch-Lite.podspec +++ b/ios/LibTorch-Lite.podspec @@ -1,6 +1,6 @@ Pod::Spec.new do |s| s.name = 'LibTorch-Lite' - s.version = '1.10.0' + s.version = '1.11.0' s.authors = 'PyTorch Team' s.license = { :type => 'BSD' } s.homepage = 'https://github.com/pytorch/pytorch' diff --git a/ios/LibTorch.podspec b/ios/LibTorch.podspec index 22aaafac9d12..77bc0537e89e 100644 --- a/ios/LibTorch.podspec +++ b/ios/LibTorch.podspec @@ -1,6 +1,6 @@ Pod::Spec.new do |s| s.name = 'LibTorch' - s.version = '1.10.0' + s.version = '1.11.0' s.authors = 'PyTorch Team' s.license = { :type => 'BSD' } s.homepage = 'https://github.com/pytorch/pytorch' diff --git a/ios/TestApp/TestApp/Base.lproj/Main.storyboard b/ios/TestApp/TestApp/Base.lproj/Main.storyboard index ad8e8f7c874c..86c53ddccf22 100644 --- a/ios/TestApp/TestApp/Base.lproj/Main.storyboard +++ b/ios/TestApp/TestApp/Base.lproj/Main.storyboard @@ -1,38 +1,22 @@ - + - + - - + - - - - - - - - - - - - - - - @@ -59,12 +43,4 @@ - - - - - - - - diff --git a/ios/TestApp/TestApp/ViewController.mm b/ios/TestApp/TestApp/ViewController.mm index 38404ddac3b9..d8ecacda3c83 100644 --- a/ios/TestApp/TestApp/ViewController.mm +++ b/ios/TestApp/TestApp/ViewController.mm @@ -4,4 +4,9 @@ @interface ViewController () @end @implementation ViewController + +- (void)viewDidLoad { + [super viewDidLoad]; +} + @end diff --git a/ios/TestApp/TestAppTests/TestLiteInterpreter.mm b/ios/TestApp/TestAppTests/TestLiteInterpreter.mm index f35642a148e3..37c8692b9980 100644 --- a/ios/TestApp/TestAppTests/TestLiteInterpreter.mm +++ b/ios/TestApp/TestAppTests/TestLiteInterpreter.mm @@ -11,8 +11,8 @@ @interface TestAppTests : XCTestCase @implementation TestAppTests { } -- (void)testLiteInterpreter { - NSString* modelPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"model_lite" +- (void)testCoreML { + NSString* modelPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"model_coreml" ofType:@"ptl"]; auto module = torch::jit::_load_for_mobile(modelPath.UTF8String); c10::InferenceMode mode; @@ -21,14 +21,173 @@ - (void)testLiteInterpreter { XCTAssertTrue(outputTensor.numel() == 1000); } -- (void)testCoreML { - NSString* modelPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"model_coreml" +- (void)testModel:(NSString*)filename { + // model generated using the current pytorch revision + [self runModel:[NSString stringWithFormat:@"%@_temp", filename]]; + // model generated using older pyotrch revision + [self runModel:filename]; +} + +- (void)runModel:(NSString*)filename { + NSString* modelPath = [[NSBundle bundleForClass:[self class]] pathForResource:filename ofType:@"ptl"]; - auto module = torch::jit::_load_for_mobile(modelPath.UTF8String); + XCTAssertNotNil(modelPath); c10::InferenceMode mode; - auto input = torch::ones({1, 3, 224, 224}, at::kFloat); - auto outputTensor = module.forward({input}).toTensor(); - XCTAssertTrue(outputTensor.numel() == 1000); + auto module = torch::jit::_load_for_mobile(modelPath.UTF8String); + auto has_bundled_input = module.find_method("get_all_bundled_inputs"); + if (has_bundled_input) { + c10::IValue bundled_inputs = module.run_method("get_all_bundled_inputs"); + c10::List all_inputs = bundled_inputs.toList(); + std::vector> inputs; + for (at::IValue input : all_inputs) { + inputs.push_back(input.toTupleRef().elements()); + } + // run with the first bundled input + 
XCTAssertNoThrow(module.forward(inputs[0])); + } else { + XCTAssertNoThrow(module.forward({})); + } +} + +// TODO remove this once updated test script +- (void)testLiteInterpreter { + XCTAssertTrue(true); +} + +- (void)testMobileNetV2 { + [self testModel:@"mobilenet_v2"]; +} + +- (void)testPointwiseOps { + [self testModel:@"pointwise_ops"]; +} + +- (void)testReductionOps { + [self testModel:@"reduction_ops"]; +} + +- (void)testComparisonOps { + [self testModel:@"comparison_ops"]; +} + +- (void)testOtherMathOps { + [self testModel:@"other_math_ops"]; +} + +- (void)testSpectralOps { + [self testModel:@"spectral_ops"]; +} + +- (void)testBlasLapackOps { + [self testModel:@"blas_lapack_ops"]; +} + +- (void)testSamplingOps { + [self testModel:@"sampling_ops"]; +} + +- (void)testTensorOps { + [self testModel:@"tensor_general_ops"]; +} + +- (void)testTensorCreationOps { + [self testModel:@"tensor_creation_ops"]; +} + +- (void)testTensorIndexingOps { + [self testModel:@"tensor_indexing_ops"]; +} + +- (void)testTensorTypingOps { + [self testModel:@"tensor_typing_ops"]; +} + +- (void)testTensorViewOps { + [self testModel:@"tensor_view_ops"]; +} + +- (void)testConvolutionOps { + [self testModel:@"convolution_ops"]; +} + +- (void)testPoolingOps { + [self testModel:@"pooling_ops"]; +} + +- (void)testPaddingOps { + [self testModel:@"padding_ops"]; +} + +- (void)testActivationOps { + [self testModel:@"activation_ops"]; +} + +- (void)testNormalizationOps { + [self testModel:@"normalization_ops"]; +} + +- (void)testRecurrentOps { + [self testModel:@"recurrent_ops"]; +} + +- (void)testTransformerOps { + [self testModel:@"transformer_ops"]; +} + +- (void)testLinearOps { + [self testModel:@"linear_ops"]; +} + +- (void)testDropoutOps { + [self testModel:@"dropout_ops"]; +} + +- (void)testSparseOps { + [self testModel:@"sparse_ops"]; +} + +- (void)testDistanceFunctionOps { + [self testModel:@"distance_function_ops"]; +} + +- (void)testLossFunctionOps { + [self testModel:@"loss_function_ops"]; +} + +- (void)testVisionFunctionOps { + [self testModel:@"vision_function_ops"]; +} + +- (void)testShuffleOps { + [self testModel:@"shuffle_ops"]; +} + +- (void)testNNUtilsOps { + [self testModel:@"nn_utils_ops"]; +} + +- (void)testQuantOps { + [self testModel:@"general_quant_ops"]; +} + +- (void)testDynamicQuantOps { + [self testModel:@"dynamic_quant_ops"]; +} + +- (void)testStaticQuantOps { + [self testModel:@"static_quant_ops"]; +} + +- (void)testFusedQuantOps { + [self testModel:@"fused_quant_ops"]; +} + +- (void)testTorchScriptBuiltinQuantOps { + [self testModel:@"torchscript_builtin_ops"]; +} + +- (void)testTorchScriptCollectionQuantOps { + [self testModel:@"torchscript_collection_ops"]; } @end diff --git a/ios/TestApp/models/activation_ops.ptl b/ios/TestApp/models/activation_ops.ptl new file mode 100644 index 000000000000..44673efd446e Binary files /dev/null and b/ios/TestApp/models/activation_ops.ptl differ diff --git a/ios/TestApp/models/android_api_module.ptl b/ios/TestApp/models/android_api_module.ptl new file mode 100644 index 000000000000..df62dd862088 Binary files /dev/null and b/ios/TestApp/models/android_api_module.ptl differ diff --git a/ios/TestApp/models/blas_lapack_ops.ptl b/ios/TestApp/models/blas_lapack_ops.ptl new file mode 100644 index 000000000000..fea933ee644f Binary files /dev/null and b/ios/TestApp/models/blas_lapack_ops.ptl differ diff --git a/ios/TestApp/models/comparison_ops.ptl b/ios/TestApp/models/comparison_ops.ptl new file mode 100644 index 000000000000..01b1c153e751 Binary files /dev/null 
and b/ios/TestApp/models/comparison_ops.ptl differ diff --git a/ios/TestApp/models/convolution_ops.ptl b/ios/TestApp/models/convolution_ops.ptl new file mode 100644 index 000000000000..de776834eb77 Binary files /dev/null and b/ios/TestApp/models/convolution_ops.ptl differ diff --git a/ios/TestApp/models/distance_function_ops.ptl b/ios/TestApp/models/distance_function_ops.ptl new file mode 100644 index 000000000000..cc4d994f440a Binary files /dev/null and b/ios/TestApp/models/distance_function_ops.ptl differ diff --git a/ios/TestApp/models/dropout_ops.ptl b/ios/TestApp/models/dropout_ops.ptl new file mode 100644 index 000000000000..422c2f60e6be Binary files /dev/null and b/ios/TestApp/models/dropout_ops.ptl differ diff --git a/ios/TestApp/models/dynamic_quant_ops.ptl b/ios/TestApp/models/dynamic_quant_ops.ptl new file mode 100644 index 000000000000..573dee91f07b Binary files /dev/null and b/ios/TestApp/models/dynamic_quant_ops.ptl differ diff --git a/ios/TestApp/models/fused_quant_ops.ptl b/ios/TestApp/models/fused_quant_ops.ptl new file mode 100644 index 000000000000..d24e3d8d4caa Binary files /dev/null and b/ios/TestApp/models/fused_quant_ops.ptl differ diff --git a/ios/TestApp/models/general_quant_ops.ptl b/ios/TestApp/models/general_quant_ops.ptl new file mode 100644 index 000000000000..5254d33b4794 Binary files /dev/null and b/ios/TestApp/models/general_quant_ops.ptl differ diff --git a/ios/TestApp/models/linear_ops.ptl b/ios/TestApp/models/linear_ops.ptl new file mode 100644 index 000000000000..36915823843c Binary files /dev/null and b/ios/TestApp/models/linear_ops.ptl differ diff --git a/ios/TestApp/models/loss_function_ops.ptl b/ios/TestApp/models/loss_function_ops.ptl new file mode 100644 index 000000000000..4c0592e5485a Binary files /dev/null and b/ios/TestApp/models/loss_function_ops.ptl differ diff --git a/ios/TestApp/models/mobilenet_v2.ptl b/ios/TestApp/models/mobilenet_v2.ptl new file mode 100644 index 000000000000..b034aaf8c802 Binary files /dev/null and b/ios/TestApp/models/mobilenet_v2.ptl differ diff --git a/ios/TestApp/models/model_coreml.ptl b/ios/TestApp/models/model_coreml.ptl new file mode 100644 index 000000000000..1f2271b365f3 Binary files /dev/null and b/ios/TestApp/models/model_coreml.ptl differ diff --git a/ios/TestApp/models/model_lite.ptl b/ios/TestApp/models/model_lite.ptl new file mode 100644 index 000000000000..9aef3bd6b546 Binary files /dev/null and b/ios/TestApp/models/model_lite.ptl differ diff --git a/ios/TestApp/models/nn_utils_ops.ptl b/ios/TestApp/models/nn_utils_ops.ptl new file mode 100644 index 000000000000..726b200a67d1 Binary files /dev/null and b/ios/TestApp/models/nn_utils_ops.ptl differ diff --git a/ios/TestApp/models/normalization_ops.ptl b/ios/TestApp/models/normalization_ops.ptl new file mode 100644 index 000000000000..1846009a3b72 Binary files /dev/null and b/ios/TestApp/models/normalization_ops.ptl differ diff --git a/ios/TestApp/models/other_math_ops.ptl b/ios/TestApp/models/other_math_ops.ptl new file mode 100644 index 000000000000..7209c3b3bd1f Binary files /dev/null and b/ios/TestApp/models/other_math_ops.ptl differ diff --git a/ios/TestApp/models/padding_ops.ptl b/ios/TestApp/models/padding_ops.ptl new file mode 100644 index 000000000000..4af0418f11a6 Binary files /dev/null and b/ios/TestApp/models/padding_ops.ptl differ diff --git a/ios/TestApp/models/pointwise_ops.ptl b/ios/TestApp/models/pointwise_ops.ptl new file mode 100644 index 000000000000..948ed4832660 Binary files /dev/null and b/ios/TestApp/models/pointwise_ops.ptl differ 
diff --git a/ios/TestApp/models/pooling_ops.ptl b/ios/TestApp/models/pooling_ops.ptl new file mode 100644 index 000000000000..4b98f1971ee5 Binary files /dev/null and b/ios/TestApp/models/pooling_ops.ptl differ diff --git a/ios/TestApp/models/recurrent_ops.ptl b/ios/TestApp/models/recurrent_ops.ptl new file mode 100644 index 000000000000..10804040be84 Binary files /dev/null and b/ios/TestApp/models/recurrent_ops.ptl differ diff --git a/ios/TestApp/models/reduction_ops.ptl b/ios/TestApp/models/reduction_ops.ptl new file mode 100644 index 000000000000..0f1fccea7134 Binary files /dev/null and b/ios/TestApp/models/reduction_ops.ptl differ diff --git a/ios/TestApp/models/sampling_ops.ptl b/ios/TestApp/models/sampling_ops.ptl new file mode 100644 index 000000000000..416be7cb1279 Binary files /dev/null and b/ios/TestApp/models/sampling_ops.ptl differ diff --git a/ios/TestApp/models/shuffle_ops.ptl b/ios/TestApp/models/shuffle_ops.ptl new file mode 100644 index 000000000000..5e5520118764 Binary files /dev/null and b/ios/TestApp/models/shuffle_ops.ptl differ diff --git a/ios/TestApp/models/sparse_ops.ptl b/ios/TestApp/models/sparse_ops.ptl new file mode 100644 index 000000000000..a16f68f8f95f Binary files /dev/null and b/ios/TestApp/models/sparse_ops.ptl differ diff --git a/ios/TestApp/models/spectral_ops.ptl b/ios/TestApp/models/spectral_ops.ptl new file mode 100644 index 000000000000..9828dd2ba901 Binary files /dev/null and b/ios/TestApp/models/spectral_ops.ptl differ diff --git a/ios/TestApp/models/static_quant_ops.ptl b/ios/TestApp/models/static_quant_ops.ptl new file mode 100644 index 000000000000..f0f0a09b832d Binary files /dev/null and b/ios/TestApp/models/static_quant_ops.ptl differ diff --git a/ios/TestApp/models/tensor_creation_ops.ptl b/ios/TestApp/models/tensor_creation_ops.ptl new file mode 100644 index 000000000000..d897b43cd36c Binary files /dev/null and b/ios/TestApp/models/tensor_creation_ops.ptl differ diff --git a/ios/TestApp/models/tensor_general_ops.ptl b/ios/TestApp/models/tensor_general_ops.ptl new file mode 100644 index 000000000000..6f2855ea83ea Binary files /dev/null and b/ios/TestApp/models/tensor_general_ops.ptl differ diff --git a/ios/TestApp/models/tensor_indexing_ops.ptl b/ios/TestApp/models/tensor_indexing_ops.ptl new file mode 100644 index 000000000000..ac9cb8c4b94a Binary files /dev/null and b/ios/TestApp/models/tensor_indexing_ops.ptl differ diff --git a/ios/TestApp/models/tensor_typing_ops.ptl b/ios/TestApp/models/tensor_typing_ops.ptl new file mode 100644 index 000000000000..3e2f4d8cc689 Binary files /dev/null and b/ios/TestApp/models/tensor_typing_ops.ptl differ diff --git a/ios/TestApp/models/tensor_view_ops.ptl b/ios/TestApp/models/tensor_view_ops.ptl new file mode 100644 index 000000000000..5e2dc8294842 Binary files /dev/null and b/ios/TestApp/models/tensor_view_ops.ptl differ diff --git a/ios/TestApp/models/torchscript_builtin_ops.ptl b/ios/TestApp/models/torchscript_builtin_ops.ptl new file mode 100644 index 000000000000..2d2532df2fd2 Binary files /dev/null and b/ios/TestApp/models/torchscript_builtin_ops.ptl differ diff --git a/ios/TestApp/models/torchscript_collection_ops.ptl b/ios/TestApp/models/torchscript_collection_ops.ptl new file mode 100644 index 000000000000..ce434b3b4210 Binary files /dev/null and b/ios/TestApp/models/torchscript_collection_ops.ptl differ diff --git a/ios/TestApp/models/transformer_ops.ptl b/ios/TestApp/models/transformer_ops.ptl new file mode 100644 index 000000000000..4546569cd7fd Binary files /dev/null and 
b/ios/TestApp/models/transformer_ops.ptl differ diff --git a/ios/TestApp/models/vision_function_ops.ptl b/ios/TestApp/models/vision_function_ops.ptl new file mode 100644 index 000000000000..e1f8c39c78ab Binary files /dev/null and b/ios/TestApp/models/vision_function_ops.ptl differ diff --git a/modules/observers/perf_observer.cc b/modules/observers/perf_observer.cc index bdee55daf179..cfd6130f7255 100644 --- a/modules/observers/perf_observer.cc +++ b/modules/observers/perf_observer.cc @@ -195,7 +195,7 @@ void PerfNetObserver::Start() { int skipIters = ObserverConfig::getSkipIters(); int sampleRate = visitCount > 0 ? netFollowupSampleRate : netInitSampleRate; // NOLINTNEXTLINE(clang-analyzer-security.insecureAPI.rand) - if (skipIters <= numRuns_ && sampleRate > 0 && rand() % sampleRate == 0) { + if (skipIters <= static_cast(numRuns_) && sampleRate > 0 && rand() % sampleRate == 0) { visitCount++; if (visitCount == netFollowupSampleCount) { visitCount = 0; @@ -238,9 +238,9 @@ void PerfNetObserver::Stop() { if (logType_ == PerfNetObserver::OPERATOR_DELAY) { const auto& operators = subject_->GetOperators(); - for (int idx = 0; idx < operators.size(); ++idx) { + for (unsigned idx = 0; idx < operators.size(); ++idx) { const auto* op = operators[idx]; - auto name = getObserverName(op, idx); + auto name = getObserverName(op, static_cast(idx)); PerformanceInformation p; const PerfOperatorObserver* opObserver = static_cast(observerMap_[op]); diff --git a/mypy.ini b/mypy.ini index a3ec144806e4..61442c1a7d69 100644 --- a/mypy.ini +++ b/mypy.ini @@ -41,7 +41,7 @@ files = # # `exclude` is a regex, not a list of paths like `files` (sigh) # -exclude = torch/include/|torch/csrc/|torch/distributed/elastic/agent/server/api.py|torch/testing/_internal +exclude = torch/include/|torch/csrc/|torch/distributed/elastic/agent/server/api.py|torch/testing/_internal|torch/distributed/fsdp/fully_sharded_data_parallel.py # Minimum version supported - variable annotations were introduced # in Python 3.7 diff --git a/mypy_plugins/check_mypy_version.py b/mypy_plugins/check_mypy_version.py index 02a02a60b950..a34b8683c989 100644 --- a/mypy_plugins/check_mypy_version.py +++ b/mypy_plugins/check_mypy_version.py @@ -9,7 +9,7 @@ def get_correct_mypy_version(): # there's probably a more elegant way to do this match, = re.finditer( r'mypy==(\d+(?:\.\d+)*)', - Path('.circleci/docker/common/install_conda.sh').read_text(), + Path('.circleci/docker/requirements-ci.txt').read_text(), ) version, = match.groups() return version diff --git a/pt_defs.oss.bzl b/pt_defs.oss.bzl new file mode 100644 index 000000000000..2219138a9002 --- /dev/null +++ b/pt_defs.oss.bzl @@ -0,0 +1,809 @@ +load("@bazel_skylib//lib:paths.bzl", "paths") +load( + "//tools:build_variables.bzl", + "aten_native_source_list", +) +load( + "//tools:ufunc_defs.bzl", + "aten_ufunc_generated_cpu_kernel_sources", + "aten_ufunc_generated_cpu_sources", + "aten_ufunc_generated_cuda_sources", +) +load("//tools/build_defs:fb_xplat_genrule.bzl", "fb_xplat_genrule") +load("//tools/build_defs:type_defs.bzl", "is_list", "is_string") + +USED_PT_BACKENDS = [ + "CPU", + "QuantizedCPU", + "SparseCPU", # brings ~20 kb size regression +] + +# This needs to be kept in sync with https://github.com/pytorch/pytorch/blob/release/1.9/torchgen/gen.py#L892 +PT_BACKEND_HEADERS = [ + "CPU", + "CUDA", + "CompositeExplicitAutograd", + "CompositeImplicitAutograd", + "Meta", +] + +PT_BASE_OPS = [ + "aten::_coalesced_", + "aten::_copy_from", + "aten::_empty_affine_quantized", + 
"aten::_empty_per_channel_affine_quantized", + "aten::_indices", + "aten::_nnz", + "aten::_values", + "aten::add", + "aten::add_", + "aten::arange", + "aten::as_strided", + "aten::as_strided_", + "aten::cat", + "aten::clone", + "aten::coalesce", + "aten::contiguous", + "aten::copy_", + "aten::copy_sparse_to_sparse_", + "aten::dense_dim", + "aten::dequantize", + "aten::div", + "aten::div_", + "aten::empty", + "aten::empty_like", + "aten::empty_strided", + "aten::empty.memory_format", + "aten::eq", + "aten::equal", + "aten::expand", + "aten::fill_", + "aten::is_coalesced", + "aten::is_complex", + "aten::is_floating_point", + "aten::is_leaf", + "aten::is_nonzero", + "aten::item", + "aten::max", + "aten::min", + "aten::mul", + "aten::mul_", + "aten::narrow", + "aten::ne", + "aten::permute", + "aten::q_per_channel_axis", + "aten::q_per_channel_scales", + "aten::q_per_channel_zero_points", + "aten::q_scale", + "aten::q_zero_point", + "aten::qscheme", + "aten::quantize_per_tensor", + "aten::reshape", + "aten::_reshape_alias", + "aten::resize_", + "aten::resize_as_", + "aten::scalar_tensor", + "aten::select", + "aten::set_", + "aten::size", + "aten::slice", + "aten::sparse_dim", + "aten::sparse_resize_and_clear_", + "aten::squeeze", + "aten::squeeze_", + "aten::stride", + "aten::sub", + "aten::sub_", + "aten::sum", + "aten::t", + "aten::to", + "aten::_to_copy", + "aten::unsqueeze", + "aten::view", + "aten::zero_", + "aten::zeros", + "aten::zeros_like", +] + +def get_aten_compiler_flags(): + return ATEN_COMPILER_FLAGS + +def get_generate_code_bin_outs(): + return { + "autograd/generated/ADInplaceOrViewTypeEverything.cpp": ["autograd/generated/ADInplaceOrViewTypeEverything.cpp"], + "autograd/generated/ADInplaceOrViewType_0.cpp": ["autograd/generated/ADInplaceOrViewType_0.cpp"], + "autograd/generated/ADInplaceOrViewType_1.cpp": ["autograd/generated/ADInplaceOrViewType_1.cpp"], + "autograd/generated/Functions.cpp": ["autograd/generated/Functions.cpp"], + "autograd/generated/Functions.h": ["autograd/generated/Functions.h"], + "autograd/generated/TraceTypeEverything.cpp": ["autograd/generated/TraceTypeEverything.cpp"], + "autograd/generated/TraceType_0.cpp": ["autograd/generated/TraceType_0.cpp"], + "autograd/generated/TraceType_1.cpp": ["autograd/generated/TraceType_1.cpp"], + "autograd/generated/TraceType_2.cpp": ["autograd/generated/TraceType_2.cpp"], + "autograd/generated/TraceType_3.cpp": ["autograd/generated/TraceType_3.cpp"], + "autograd/generated/TraceType_4.cpp": ["autograd/generated/TraceType_4.cpp"], + "autograd/generated/VariableType.h": ["autograd/generated/VariableType.h"], + "autograd/generated/VariableTypeEverything.cpp": ["autograd/generated/VariableTypeEverything.cpp"], + "autograd/generated/VariableType_0.cpp": ["autograd/generated/VariableType_0.cpp"], + "autograd/generated/VariableType_1.cpp": ["autograd/generated/VariableType_1.cpp"], + "autograd/generated/VariableType_2.cpp": ["autograd/generated/VariableType_2.cpp"], + "autograd/generated/VariableType_3.cpp": ["autograd/generated/VariableType_3.cpp"], + "autograd/generated/VariableType_4.cpp": ["autograd/generated/VariableType_4.cpp"], + "autograd/generated/variable_factories.h": ["autograd/generated/variable_factories.h"], + } + +ATEN_COMPILER_FLAGS = [ + "-fexceptions", + "-frtti", + "-fPIC", + "-Os", + "-Wno-absolute-value", + "-Wno-deprecated-declarations", + "-Wno-macro-redefined", + "-Wno-tautological-constant-out-of-range-compare", + "-Wno-unknown-pragmas", + "-Wno-unknown-warning-option", + "-Wno-unused-function", + 
"-Wno-unused-variable", + "-Wno-pass-failed", + "-Wno-shadow", +] + +PT_COMPILER_FLAGS = [ + "-frtti", + "-Os", + "-Wno-unknown-pragmas", + "-Wno-write-strings", + "-Wno-unused-variable", + "-Wno-unused-function", + "-Wno-deprecated-declarations", + "-Wno-shadow", + "-Wno-global-constructors", + "-Wno-missing-prototypes", + "-std=gnu++17", # to accommodate Eigen +] + +def get_template_source_dict(): + ret = {} + for file_path in TEMPLATE_SOURCE_LIST: + path_prefix = paths.dirname(file_path) + if path_prefix not in ret: + ret[path_prefix] = [] + ret[path_prefix].append(file_path) + return ret + +def get_gen_oplist_outs(): + return { + #"SupportedMobileModelsRegistration.cpp": [ + # "SupportedMobileModelsRegistration.cpp", + #], + "selected_mobile_ops.h": [ + "selected_mobile_ops.h", + ], + "selected_operators.yaml": [ + "selected_operators.yaml", + ], + } + +def get_pt_compiler_flags(): + return PT_COMPILER_FLAGS + +def get_aten_preprocessor_flags(): + # read_config is not allowed outside of function in Starlark + ATEN_PREPROCESSOR_FLAGS = [ + "-DC10_MOBILE", + "-DCPU_CAPABILITY_DEFAULT", + "-DCPU_CAPABILITY=DEFAULT", + "-DCAFFE2_USE_LITE_PROTO", + "-DATEN_CUDNN_ENABLED_FBXPLAT=0", + "-DATEN_MKLDNN_ENABLED_FBXPLAT=0", + "-DATEN_NNPACK_ENABLED_FBXPLAT=0", + "-DATEN_MKL_ENABLED_FBXPLAT=0", + "-DATEN_MKL_SEQUENTIAL_FBXPLAT=0", + "-DUSE_PYTORCH_METAL", + "-DUSE_PYTORCH_QNNPACK", + "-DUSE_XNNPACK", + "-DNO_EXPORT", + "-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION", + "-DAT_PARALLEL_OPENMP_FBXPLAT=0", + "-DAT_PARALLEL_NATIVE_FBXPLAT=1", + "-DAT_PARALLEL_NATIVE_TBB_FBXPLAT=0", + "-DUSE_LAPACK_FBXPLAT=0", + "-DAT_BLAS_F2C_FBXPLAT=0", + "-DAT_BLAS_USE_CBLAS_DOT_FBXPLAT=0", + "-DUSE_RUY_QMATMUL", # need third_party:ruy + ] + + # if get_disable_per_op_profiling(): + ATEN_PREPROCESSOR_FLAGS.append("-DPYTORCH_DISABLE_PER_OP_PROFILING") + return ATEN_PREPROCESSOR_FLAGS + +TEMPLATE_SOURCE_LIST = [ + "torch/csrc/jit/runtime/register_prim_ops.cpp", + "torch/csrc/jit/runtime/register_special_ops.cpp", +] + aten_native_source_list + +# For selective build, we can lump the CPU and CPU kernel sources altogether +# because there is only ever one vectorization variant that is compiled +def aten_ufunc_generated_all_cpu_sources(gencode_pattern = "{}"): + return ( + aten_ufunc_generated_cpu_sources(gencode_pattern) + + aten_ufunc_generated_cpu_kernel_sources(gencode_pattern) + ) + +def get_template_registration_files_outs(): + outs = {} + + for file_path in TEMPLATE_SOURCE_LIST: + outs[file_path] = [file_path] + + for base_name in aten_ufunc_generated_all_cpu_sources(): + file_path = "aten/src/ATen/{}".format(base_name) + outs[file_path] = [file_path] + + return outs + +def get_pt_preprocessor_flags(): + # read_config is not allowed outside of function in Starlark + PT_PREPROCESSOR_FLAGS = [ + "-D_THP_CORE", + "-DC10_MOBILE", + "-DUSE_SCALARS", + "-DNO_CUDNN_DESTROY_HANDLE", + "-DNO_EXPORT", + "-DBUILD_CAFFE2", + ] + return PT_PREPROCESSOR_FLAGS + +def is_arvr_mode(): + return False + +def get_build_from_deps_query(): + build_from_query = native.read_config("pt", "build_from_deps_query", "1") + return bool(int(build_from_query)) + +def get_enable_lightweight_dispatch(): + enable_lightweight_dispatch = native.read_config("pt", "enable_lightweight_dispatch", "0") + return bool(int(enable_lightweight_dispatch)) + +def get_static_dispatch_backend(): + static_dispatch_backend = native.read_config("pt", "static_dispatch_backend", None) + if static_dispatch_backend == None: + return [] + return static_dispatch_backend.split(";") 
+ +def get_aten_codegen_extra_params(backends): + if get_build_from_deps_query(): + extra_params = { + "force_schema_registration": True, + } + static_backends = get_static_dispatch_backend() + if static_backends: + extra_params["static_dispatch_backend"] = static_backends + extra_params["enabled_backends"] = static_backends + else: + extra_params["enabled_backends"] = backends + return extra_params + else: + return {} + +def gen_aten_files( + name, + extra_flags = {}, + visibility = [], + compatible_with = []): + extra_params = [] + force_schema_registration = extra_flags.get("force_schema_registration", False) + op_registration_allowlist = extra_flags.get("op_registration_allowlist", None) + op_selection_yaml_path = extra_flags.get("op_selection_yaml_path", None) + enabled_backends = extra_flags.get("enabled_backends", None) + static_dispatch_backend = extra_flags.get("static_dispatch_backend", None) + + if force_schema_registration: + extra_params.append("--force_schema_registration") + if op_registration_allowlist != None and is_string(op_registration_allowlist): + extra_params.append("--op_registration_whitelist") + extra_params.append(op_registration_allowlist) + if op_selection_yaml_path != None and is_string(op_selection_yaml_path): + extra_params.append("--op_selection_yaml_path") + extra_params.append(op_selection_yaml_path) + if enabled_backends != None and is_list(enabled_backends): + extra_params.append("--backend_whitelist") + extra_params.extend(enabled_backends) + if get_enable_lightweight_dispatch(): + extra_params.append("--skip_dispatcher_op_registration") + if static_dispatch_backend: + extra_params.append("--static_dispatch_backend") + extra_params.extend(static_dispatch_backend) + backends = static_dispatch_backend + else: + backends = enabled_backends + fb_xplat_genrule( + name = name, + default_outs = ["."], + outs = get_aten_generated_files(backends), + cmd = "$(exe //torchgen:gen) " + " ".join([ + "--source-path $(location //:aten_src_path)/aten/src/ATen", + "--install_dir $OUT", + ] + extra_params), + visibility = visibility, + compatible_with = compatible_with, + ) + +def get_aten_generated_files(enabled_backends): + # NB: RegisterMeta counts as an optionally enabled backend, + # and is intentionally omitted from here + src_files = [ + "RegisterBackendSelect.cpp", + "RegisterCompositeImplicitAutograd.cpp", + "RegisterCompositeExplicitAutograd.cpp", + "CompositeViewCopyKernels.cpp", + "RegisterSchema.cpp", + "Declarations.yaml", + "Functions.cpp", + "Functions.h", + "RedispatchFunctions.h", + "NativeFunctions.h", + "NativeMetaFunctions.h", + "MethodOperators.h", + "FunctionalInverses.h", + "Operators.h", + "Operators_0.cpp", + "Operators_1.cpp", + "Operators_2.cpp", + "Operators_3.cpp", + "Operators_4.cpp", + "CompositeImplicitAutogradFunctions.h", + "CompositeImplicitAutogradFunctions_inl.h", + "CompositeExplicitAutogradFunctions.h", + "CompositeExplicitAutogradFunctions_inl.h", + "core/ATenOpList.cpp", + "core/TensorBody.h", + "core/TensorMethods.cpp", + "core/aten_interned_strings.h", + ] + get_aten_derived_type_srcs(enabled_backends) + + # This is tiresome. A better strategy would be to unconditionally + # generate these files, and then only actually COMPILE them depended + # on the generated set. C'est la vie... 
+ if "CPU" in enabled_backends: + src_files.extend(aten_ufunc_generated_cpu_sources()) + src_files.extend(aten_ufunc_generated_cpu_kernel_sources()) + if "CUDA" in enabled_backends: + # Cannot unconditionally include this, because in the Edge selective + # build CUDA is not enabled and thus the ufunc codegen for CUDA gets + # skipped + src_files.extend(aten_ufunc_generated_cuda_sources()) + + res = {} + for file_name in src_files: + res[file_name] = [file_name] + return res + +def get_template_registration_file_rules(rule_name): + rules = [] + for file_path in TEMPLATE_SOURCE_LIST: + rules.append(":{}[{}]".format(rule_name, file_path)) + for file_path in aten_ufunc_generated_all_cpu_sources(): + rules.append(":{}[aten/src/ATen/{}]".format(rule_name, file_path)) + + return rules + +# Originally, there were two sets of sources in caffe2:aten_cpu, native and non-native. +# Now we have only non-native sources in aten_cpu. However, there are some ATen-related +# tests that may require both native and non-native sources. This rule is used to generate +# both aten_cpu and aten_native_cpu, which use the same compilation setup. +def build_aten_cpu(name, srcs, deps = []): + cxx_library( + name = name, + srcs = srcs, + header_namespace = "", + compiler_flags = get_pt_compiler_flags(), + exported_preprocessor_flags = get_aten_preprocessor_flags(), + link_whole = True, + linker_flags = ["-Wl,--no-as-needed", "-ldl"], + visibility = ["PUBLIC"], + deps = [ + "//third_party:cpuinfo", + "//third_party:glog", + "//third_party:XNNPACK", + #"//third_party/linker_lib:omp", + ], + exported_deps = [ + "//third_party:fmt", + "//aten/src/ATen/native/quantized/cpu/qnnpack:pytorch_qnnpack", + "//c10:c10", + ":aten_header", + ":caffe2_headers", + ":common_core", + ":generated_aten_config_header", + ":generated_aten_headers_cpu", + ":jit_core_headers", + ":pthreadpool", + ":th_header", + "//third_party:ruy_lib", + ], + ) + +######### selective build ######### + +def get_pt_ops_deps(name, deps, train = False, enforce_traced_op_list = False, enable_flatbuffer = False, **kwargs): + if not get_build_from_deps_query(): + return deps + pt_operator_registry( + name, + deps, + train = train, + enforce_traced_op_list = enforce_traced_op_list, + enable_flatbuffer = enable_flatbuffer, + **kwargs + ) + return deps + [":" + name] + +# pt_operator_registry is the method that defines the fb_xplat_cxx_library that contains +# code for all selected PyTorch Operators and kernel functions. This also includes +# operator registration into the dispatcher. +# +# template_select: bool: Indicates if template based selective build is enabled. +# +# enforce_traced_op_list: bool: Enforces that only new-style operator +# lists based on the all_mobile_model_configs.yaml file and tracing based selective +# build are used in this library. +# +# train: bool: Build this library for training (True) or inference only (False). +# If built for training, codegen for VariableType is also included. +# +# pt_allow_forced_schema_registration: Manually disables forced schema registration when set to false. Default is true. +# It only has an effect when train=True and the app requires full JIT, in which case force_schema_registration needs to occur.
+# As Federated Learning migrates to lite interpreter +# we can slowly turn off forced schema registration as it is useless space and floods the compatibility api +# +def pt_operator_registry( + name, + deps = [], + train = False, + labels = [], + env = [], + template_select = True, + enforce_traced_op_list = False, + pt_allow_forced_schema_registration = True, + enable_flatbuffer = False, + **kwargs): + compatible_with = kwargs.get("compatible_with", []) + code_gen_files = pt_operator_query_codegen(name, deps = deps, train = train, enforce_traced_op_list = enforce_traced_op_list, pt_allow_forced_schema_registration = pt_allow_forced_schema_registration, compatible_with = compatible_with) + code_gen_srcs = code_gen_files["srcs"] + + lib_deps = [ + ":aten_cpu", + ":torch_mobile_core", + "//c10:c10", + "//third_party:glog", + ] + + #if train: + # lib_deps = lib_deps + ["fbsource//xplat/caffe2:torch_mobile_train"] + + exported_preprocessor_flags = get_aten_preprocessor_flags() + exported_preprocessor_flags += kwargs.pop("exported_preprocessor_flags", []) + if template_select: + # In addition to the + # original code-gen select, this option further filter more operators based on + # compile-time calculation. Examples include prim ops and any other ops that were + # not filtered out before. The purpose of this option is to reduce the production + # size further. However, it may have less flexibility, especially for tests from + # python, where the used operator list is not explicitly generated. If the tests + # are for functionality but not for size, and it's difficult to maintain an explicit + # operator list, it's suggested to turn this option off. + exported_preprocessor_flags.append("-DTEMPLATE_SELECTIVE_BUILD") + kwargs.pop("exported_headers", []) + cxx_library( + name = name, + srcs = code_gen_srcs, + linker_flags = [ + "-Wl,--no-as-needed", + "-ldl", + ], + link_whole = True, + soname = "libtorch-code-gen.$(ext)", + compiler_flags = get_aten_compiler_flags(), + platform_compiler_flags = get_cpukernel_avx2_flags(), + platform_deps = get_cpukernel_avx2_deps(), + header_namespace = "ATen", + exported_headers = code_gen_files["headers"], + exported_preprocessor_flags = exported_preprocessor_flags, + headers = kwargs.pop("headers", []), + deps = lib_deps + [ + "//third_party:XNNPACK", + ], + **kwargs + ) + +def get_aten_derived_type_src_rules(aten_rule_name, enabled_backends): + return [ + ":{}[{}]".format(aten_rule_name, "Register" + backend + ".cpp") + for backend in enabled_backends + ] + +def get_aten_selective_cpp_rules(aten_rule_name, enabled_backends): + return [ + ":{}[{}]".format(aten_rule_name, f) + for f in ["RegisterCompositeImplicitAutograd.cpp", "RegisterCompositeExplicitAutograd.cpp", "RegisterSchema.cpp", "RegisterBackendSelect.cpp", "CompositeViewCopyKernels.cpp"] + ] + get_aten_derived_type_src_rules(aten_rule_name, enabled_backends) + +def get_aten_derived_type_srcs(enabled_backends): + return [ + "Register" + derived_type + ".cpp" + for derived_type in enabled_backends + ] + [ + derived_type + "Functions.h" + for derived_type in enabled_backends + if derived_type in PT_BACKEND_HEADERS or derived_type in get_static_dispatch_backend() + ] + [ + derived_type + "Functions_inl.h" + for derived_type in enabled_backends + if derived_type in PT_BACKEND_HEADERS or derived_type in get_static_dispatch_backend() + ] + +def pt_operator_query_codegen(name, deps = [], train = False, enforce_traced_op_list = False, pt_allow_forced_schema_registration = True, compatible_with = []): + 
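+ # Selective-build codegen pipeline for this library:
+ #   1) gen_oplist produces selected_operators.yaml and selected_mobile_ops.h from the
+ #      pt_operator_library deps,
+ #   2) gen_aten_files and gen_aten_libtorch_files run the ATen / autograd codegen against
+ #      that yaml,
+ #   3) copy_template_registration_files stages the template registration sources (prim ops, etc.).
+ # Returns a dict with the "srcs" and "headers" that the operator library compiles.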
oplist_dir_name = name + "_pt_oplist" + + # @lint-ignore BUCKLINT + fb_xplat_genrule( + name = oplist_dir_name, + cmd = ("$(exe //:gen_oplist) " + + "--model_file_list_path $(@query_outputs 'attrfilter(labels, pt_operator_library, deps(set({deps})))') " + + ("" if enforce_traced_op_list else "--allow_include_all_overloads ") + + "--output_dir $OUT ").format(deps = " ".join(["\"{}\"".format(d) for d in deps])), + outs = get_gen_oplist_outs(), + default_outs = ["."], + compatible_with = compatible_with, + ) + + # Aten files + aten_genrule = name + "_aten" + extra_flags = { + "enabled_backends": USED_PT_BACKENDS, + "op_selection_yaml_path": "$(location :{}[selected_operators.yaml])".format(oplist_dir_name), + } + + if train and pt_allow_forced_schema_registration: + extra_flags["force_schema_registration"] = True + + # if get_enable_lightweight_dispatch(): + # unboxing_genrule = name + "_unboxing" + # gen_aten_unboxing_files( + # unboxing_genrule, + # extra_flags = extra_flags, + # ) + + static_dispatch_backend = get_static_dispatch_backend() + if static_dispatch_backend: + extra_flags["static_dispatch_backend"] = static_dispatch_backend + + gen_aten_files( + aten_genrule, + extra_flags = extra_flags, + compatible_with = compatible_with, + ) + + # unboxing_wrappers files + extra_params = [ + "--operators_yaml_path", + "$(location :" + oplist_dir_name + "[selected_operators.yaml])", + ] + unboxing_and_autograd_genrule = name + "_unboxing_and_autograd" + gen_aten_libtorch_files(unboxing_and_autograd_genrule, extra_params, compatible_with) + + # Template runtime files (prim ops, etc) + template_registration_genrule = name + "_template_registration" + copy_template_registration_files(template_registration_genrule) + + srcs = get_aten_selective_cpp_rules( + aten_genrule, + static_dispatch_backend if static_dispatch_backend else USED_PT_BACKENDS, + ) + get_template_registration_file_rules( + template_registration_genrule, + ) + ([ + ":{}[autograd/generated/VariableType_0.cpp]".format(unboxing_and_autograd_genrule), + ":{}[autograd/generated/VariableType_1.cpp]".format(unboxing_and_autograd_genrule), + ":{}[autograd/generated/VariableType_2.cpp]".format(unboxing_and_autograd_genrule), + ":{}[autograd/generated/VariableType_3.cpp]".format(unboxing_and_autograd_genrule), + ":{}[autograd/generated/VariableType_4.cpp]".format(unboxing_and_autograd_genrule), + ":{}[autograd/generated/ADInplaceOrViewType_0.cpp]".format(unboxing_and_autograd_genrule), + ":{}[autograd/generated/ADInplaceOrViewType_1.cpp]".format(unboxing_and_autograd_genrule), + ] if train else []) + ([ + #":{}[SupportedMobileModelsRegistration.cpp]".format(oplist_dir_name), + ]) + + headers = { + "selected_mobile_ops.h": ":{}[selected_mobile_ops.h]".format(oplist_dir_name), + } + + # if get_enable_lightweight_dispatch(): + # srcs.extend([ + # ":{}[UnboxingFunctions_0.cpp]".format(unboxing_genrule), + # ":{}[UnboxingFunctions_1.cpp]".format(unboxing_genrule), + # ":{}[UnboxingFunctions_2.cpp]".format(unboxing_genrule), + # ":{}[UnboxingFunctions_3.cpp]".format(unboxing_genrule), + # ":{}[UnboxingFunctions_4.cpp]".format(unboxing_genrule), + # ":{}[RegisterCodegenUnboxedKernels_0.cpp]".format(unboxing_genrule), + # ":{}[RegisterCodegenUnboxedKernels_1.cpp]".format(unboxing_genrule), + # ":{}[RegisterCodegenUnboxedKernels_2.cpp]".format(unboxing_genrule), + # ":{}[RegisterCodegenUnboxedKernels_3.cpp]".format(unboxing_genrule), + # ":{}[RegisterCodegenUnboxedKernels_4.cpp]".format(unboxing_genrule), + # 
":{}[RegisterCodegenUnboxedKernels_5.cpp]".format(unboxing_genrule), + # ":{}[RegisterCodegenUnboxedKernels_6.cpp]".format(unboxing_genrule), + # ":{}[RegisterCodegenUnboxedKernels_7.cpp]".format(unboxing_genrule), + # ":{}[RegisterCodegenUnboxedKernels_8.cpp]".format(unboxing_genrule), + # ":{}[RegisterCodegenUnboxedKernels_9.cpp]".format(unboxing_genrule), + # ]) + # headers["UnboxingFunctions.h"] = ":{}[UnboxingFunctions.h]".format(unboxing_genrule) + return {"headers": headers, "srcs": srcs} + +def gen_aten_libtorch_files(name, extra_params = [], compatible_with = []): + fb_xplat_genrule( + name = name, + outs = get_generate_code_bin_outs(), + default_outs = ["."], + cmd = "mkdir -p tools && " + + "$(exe //tools/setup_helpers:generate_code_bin) " + " ".join( + # Mobile build only needs libtorch - skip python bindings for now, except + # for ovrsource, which needs Python bindings. + (["--subset libtorch"] if not is_arvr_mode() else []) + [ + "--native-functions-path $(location :aten_src_path)/aten/src/ATen/native/native_functions.yaml", + "--tags-path $(location :aten_src_path)/aten/src/ATen/native/tags.yaml", # todo D35992309 + "--install_dir $OUT", + ] + extra_params, + ), + cmd_exe = "@powershell -Command New-Item -Path tools -ItemType Directory -Force; " + + "$(exe //tools/setup_helpers:generate_code_bin) " + " ".join( + # Mobile build only needs libtorch - skip python bindings for now, except + # for ovrsource, which needs Python bindings. + (["--subset libtorch"] if not is_arvr_mode() else []) + [ + "--native-functions-path $(location :aten_src_path)/aten/src/ATen/native/native_functions.yaml", + "--tags-path $(location :aten_src_path)/aten/src/ATen/native/tags.yaml", + "--install_dir $OUT", + ] + extra_params, + ), + compatible_with = compatible_with, + ) + +def copy_template_registration_files(name): + cmd = [] + cmd_exe = [] + + template_source_dict = get_template_source_dict() + + # Ideally, we would run one copy command for a single source directory along + # with all its child directories, but it's somewhat hard to know if a directory + # is a child of another just bu looking at the metadata (directory relative + # path) that we currently have since 1 directory could look like a parent of + # another and yet come from a different filegroup() rule. + # + for (path_prefix, file_paths) in template_source_dict.items(): + cmd.append("mkdir -p $OUT/{}".format(path_prefix)) + cmd_exe.append("md $OUT/{}".format(path_prefix)) + + # Adding *.cpp is a workaround to prevent cp from thrown an error when it + # encounters a directory (since -r was not specified). If files with an + # extension other than .cpp need to be copied, then the command below + # will not work and will need to be updated. 
+ # + cmd.append("cp -f {0}/{1}/*.cpp $OUT/{1}/".format("$(location :templated_selective_build_srcs)", path_prefix)) + cmd_exe.append("robocopy /E {0}/{1} $OUT/{1}".format("$(location :templated_selective_build_srcs)", path_prefix)) + + cmd.append("mkdir -p $OUT/aten/src/ATen") + cmd_exe.append("md $OUT/aten/src/ATen") + + # NB: CUDA is skipped here because this is selective build and CUDA is not + # supported for selective build + for ufunc_file in aten_ufunc_generated_all_cpu_sources("$(location :gen_aten[{}])"): + cmd.append("cp -f " + ufunc_file + " $OUT/aten/src/ATen") + cmd_exe.append("copy " + ufunc_file + " $OUT/aten/src/ATen") + + fb_xplat_genrule( + name = name, + cmd = " && ".join(cmd), + cmd_exe = "@powershell -Command " + ("; ".join(cmd_exe)), + outs = get_template_registration_files_outs(), + default_outs = ["."], + ) + +def pt_operator_library( + name, + ops = [], + exported_deps = [], + check_decl = True, + train = False, + model = None, + include_all_operators = False, + **kwargs): + model_name = name + + if get_build_from_deps_query(): + ops = [op.strip() for op in ops] + + # If ops are specified, then we are in static selective build mode, so we append + # base ops to this list to avoid additional special case logic in subsequent code. + if len(ops) > 0: + ops.extend(PT_BASE_OPS) + + visibility = kwargs.pop("visibility", ["PUBLIC"]) + + fb_xplat_genrule( + name = name, + out = "model_operators.yaml", + cmd = ( + "$(exe :gen_operators_yaml) " + + "{optionally_root_ops} " + + "{optionally_training_root_ops} " + + "--rule_name {rule_name} " + + "--output_path \"${{OUT}}\" " + + "--model_name {model_name} " + + "--dep_graph_yaml_path pytorch_op_deps.yaml " + + "--models_yaml_path all_mobile_model_configs.yaml " + + #"{optionally_model_versions} " + + #"{optionally_model_assets} " + + #"{optionally_model_traced_backends} " + + "{optionally_include_all_operators}" + ).format( + rule_name = name, + model_name = model_name, + optionally_root_ops = "--root_ops " + (",".join(ops)) if len(ops) > 0 else "", + optionally_training_root_ops = "--training_root_ops " + (",".join(ops)) if len(ops) > 0 and train else "", + #optionally_model_versions = "--model_versions " + (",".join(model_versions)) if model_versions != None else "", + #optionally_model_assets = "--model_assets " + (",".join(model_assets)) if model_assets != None else "", + #optionally_model_traced_backends = "--model_traced_backends " + (",".join(model_traced_backends)) if model_traced_backends != None else "", + optionally_include_all_operators = "--include_all_operators " if include_all_operators else "", + ), + labels = ["pt_operator_library"], # for pt_operator_query_codegen query + visibility = visibility, + **kwargs + ) + else: + if check_decl: + pass + # ensure_ops_are_declared(ops) + + cxx_library( + name = name, + compiler_flags = get_pt_compiler_flags(), + cxx_platform_compiler_flags = get_cpukernel_avx2_flags(), + exported_deps = exported_deps, + **kwargs + ) + +def compose_platform_setting_list(settings): + """Settings object: + os/cpu pair: should be valid key, or at most one part can be wildcard. 
+ flags: the values added to the compiler flags + """ + result = [] + for setting in settings: + result.append([ + "^{}-{}$".format(setting["os"], setting["cpu"]), + setting["flags"], + ]) + return result + +def get_cpukernel_avx2_flags(): + # flags = compose_platform_setting_list([ + # { + # "cpu": "x86_64", + # "flags": ["-DHAVE_AVX2_CPU_DEFINITION"], + # "os": "macosx", + # }, + # ]) if build_cpukernel_avx2() else [] + return [] + +def build_cpukernel_avx2(): + return not is_arvr_mode() + +def get_cpukernel_avx2_deps(): + # flags = compose_platform_setting_list([ + # { + # "cpu": "x86_64", + # "flags": ["fbsource//xplat/caffe2:cpukernel_avx2"], + # "os": "macosx", + # }, + # ]) if build_cpukernel_avx2() else [] + return [] diff --git a/scripts/buck_setup.sh b/scripts/buck_setup.sh new file mode 100644 index 000000000000..0d094fd98e95 --- /dev/null +++ b/scripts/buck_setup.sh @@ -0,0 +1,29 @@ +#!/bin/bash +printf "\n[Creating .buckconfig]\n" +cp .buckconfig.oss .buckconfig + +cd third_party || return + +printf "\n[Generating wrappers for cpuinfo]\n" +python3 generate-cpuinfo-wrappers.py + +printf "\n[Generating wrappers for xnnpack]\n" +python3 generate-xnnpack-wrappers.py + +# bazel-skylib +printf "\n[Downloading bazel-skylib-1.0.2]\n" +curl -L -o /tmp/bazel-skylib-1.0.2.tar.gz https://github.com/bazelbuild/bazel-skylib/releases/download/1.0.2/bazel-skylib-1.0.2.tar.gz +mkdir bazel-skylib +tar -xf /tmp/bazel-skylib-1.0.2.tar.gz -C bazel-skylib/ + +# glog +printf "\n[Downloading glog-0.4.0]\n" +curl -L -o /tmp/glog-0.4.0.tar.gz https://github.com/google/glog/archive/v0.4.0.tar.gz +tar -xf /tmp/glog-0.4.0.tar.gz -C /tmp/ +mv /tmp/glog-0.4.0/ glog/ + +# ruy +printf "\n[Downloading ruy]\n" +curl -L -o /tmp/ruy.zip https://github.com/google/ruy/archive/a09683b8da7164b9c5704f88aef2dc65aa583e5d.zip +unzip -q /tmp/ruy.zip -d /tmp/ +mv /tmp/ruy-a09683b8da7164b9c5704f88aef2dc65aa583e5d ruy/ diff --git a/scripts/build_android.sh b/scripts/build_android.sh index a2dd690012f2..5913f5e8b768 100755 --- a/scripts/build_android.sh +++ b/scripts/build_android.sh @@ -117,6 +117,13 @@ if [ "${TRACING_BASED}" == 1 ]; then else CMAKE_ARGS+=("-DTRACING_BASED=OFF") fi +if [ "${USE_LIGHTWEIGHT_DISPATCH}" == 1 ]; then + CMAKE_ARGS+=("-DUSE_LIGHTWEIGHT_DISPATCH=ON") + CMAKE_ARGS+=("-DSTATIC_DISPATCH_BACKEND=CPU") +else + CMAKE_ARGS+=("-DUSE_LIGHTWEIGHT_DISPATCH=OFF") +fi + CMAKE_ARGS+=("-DBUILD_MOBILE_BENCHMARK=$BUILD_MOBILE_BENCHMARK") CMAKE_ARGS+=("-DBUILD_MOBILE_TEST=$BUILD_MOBILE_TEST") CMAKE_ARGS+=("-DBUILD_PYTHON=OFF") diff --git a/scripts/build_ios.sh b/scripts/build_ios.sh index b96b8094a606..2bb8763ef17d 100755 --- a/scripts/build_ios.sh +++ b/scripts/build_ios.sh @@ -88,6 +88,12 @@ if [ "${TRACING_BASED}" == 1 ]; then else CMAKE_ARGS+=("-DTRACING_BASED=OFF") fi +if [ "${USE_LIGHTWEIGHT_DISPATCH}" == 1 ]; then + CMAKE_ARGS+=("-DUSE_LIGHTWEIGHT_DISPATCH=ON") + CMAKE_ARGS+=("-DSTATIC_DISPATCH_BACKEND=CPU") +else + CMAKE_ARGS+=("-DUSE_LIGHTWEIGHT_DISPATCH=OFF") +fi CMAKE_ARGS+=("-DUSE_LITE_INTERPRETER_PROFILER=OFF") diff --git a/scripts/jit/log_extract.py b/scripts/jit/log_extract.py new file mode 100644 index 000000000000..61e3172fe0b3 --- /dev/null +++ b/scripts/jit/log_extract.py @@ -0,0 +1,104 @@ +import argparse +import functools +import traceback +from torch.utils.jit.log_extract import extract_ir, load_graph_and_inputs, run_baseline_no_fusion, run_nnc, run_nvfuser +from typing import List, Tuple, Callable, Optional + +''' +Usage: +1.
Run your script and pipe into a log file + PYTORCH_JIT_LOG_LEVEL=">>graph_fuser" python3 my_test.py &> log.txt +2. Run log_extract: + log_extract.py log.txt --nvfuser --nnc-dynamic --nnc-static + +You can also extract the list of extracted IR: + log_extract.py log.txt --output + +Passing in --graphs 0 2 will only run graphs 0 and 2 +''' + + +def test_runners(graphs: List[str], runners: List[Tuple[str, Callable]], graph_set: Optional[List[int]]): + for i, ir in enumerate(graphs): + _, inputs = load_graph_and_inputs(ir) + if graph_set and i not in graph_set: + continue + + print(f"Running Graph {i}") + prev_result = None + prev_runner_name = None + for runner in runners: + runner_name, runner_fn = runner + try: + result = runner_fn(ir, inputs) + if prev_result: + improvement = (prev_result / result - 1) * 100 + print(f"{runner_name} : {result:.6f} ms improvement over {prev_runner_name}: improvement: {improvement:.2f}%") + else: + print(f"{runner_name} : {result:.6f} ms") + prev_result = result + prev_runner_name = runner_name + except RuntimeError: + print(f" Graph {i} failed for {runner_name} :", traceback.format_exc()) + + +def run(): + parser = argparse.ArgumentParser( + description="Extracts torchscript IR from log files and, optionally, benchmarks it or outputs the IR" + ) + parser.add_argument("filename", help="Filename of log file") + parser.add_argument("--nvfuser", dest="nvfuser", action="store_true", help="benchmark nvfuser") + parser.add_argument("--no-nvfuser", dest="nvfuser", action="store_false", help="DON'T benchmark nvfuser") + parser.set_defaults(nvfuser=False) + parser.add_argument("--nnc-static", dest="nnc_static", action="store_true", help="benchmark nnc static") + parser.add_argument("--no-nnc-static", dest="nnc_static", action="store_false", help="DON'T benchmark nnc static") + parser.set_defaults(nnc_static=False) + + parser.add_argument("--nnc-dynamic", dest="nnc_dynamic", action="store_true", help="nnc with dynamic shapes") + parser.add_argument( + "--no-nnc-dynamic", + dest="nnc_dynamic", + action="store_false", + help="DONT't benchmark nnc with dynamic shapes") + parser.set_defaults(nnc_dynamic=False) + + + parser.add_argument("--baseline", dest="baseline", action="store_true", help="benchmark baseline") + parser.add_argument("--no-baseline", dest="baseline", action="store_false", help="DON'T benchmark baseline") + parser.set_defaults(baseline=False) + + parser.add_argument("--output", dest="output", action="store_true", help="Output graph IR") + parser.add_argument("--no-output", dest="output", action="store_false", help="DON'T output graph IR") + parser.set_defaults(output=False) + + parser.add_argument('--graphs', nargs="+", type=int, help="Run only specified graph indices") + + + args = parser.parse_args() + graphs = extract_ir(args.filename) + + graph_set = args.graphs + graph_set = graph_set if graph_set else None + + options = [] + if args.baseline: + options.append(("Baseline no fusion", run_baseline_no_fusion)) + if args.nnc_dynamic: + options.append(("NNC Dynamic", functools.partial(run_nnc, dynamic=True))) + if args.nnc_static: + options.append(("NNC Static", functools.partial(run_nnc, dynamic=False))) + if args.nvfuser: + options.append(("NVFuser", run_nvfuser)) + + test_runners(graphs, options, graph_set) + + if args.output: + quoted = [] + for i, ir in enumerate(graphs): + if graph_set and i not in graph_set: + continue + quoted.append("\"\"\"" + ir + "\"\"\"") + print("[" + ", ".join(quoted) + "]") + +if __name__ == "__main__": + run() diff --git 
a/scripts/onnx/test.sh b/scripts/onnx/test.sh index 3b39f6005876..b8259eea874e 100755 --- a/scripts/onnx/test.sh +++ b/scripts/onnx/test.sh @@ -51,7 +51,7 @@ fi # onnxruntime only support py3 # "Python.h" not found in py2, needed by TorchScript custom op compilation. -if [[ "$BUILD_ENVIRONMENT" == *ort_test1* || "${SHARD_NUMBER}" == "1" ]]; then +if [[ "${SHARD_NUMBER}" == "1" ]]; then # These exclusions are for tests that take a long time / a lot of GPU # memory to run; they should be passing (and you will test them if you # run them locally @@ -69,18 +69,19 @@ if [[ "$BUILD_ENVIRONMENT" == *ort_test1* || "${SHARD_NUMBER}" == "1" ]]; then pytest "${args[@]}" \ "$top_dir/test/onnx/test_pytorch_onnx_onnxruntime.py::TestONNXRuntime_opset7" \ "$top_dir/test/onnx/test_pytorch_onnx_onnxruntime.py::TestONNXRuntime_opset8" \ - "$top_dir/test/onnx/test_pytorch_onnx_onnxruntime.py::TestONNXRuntime" \ + "$top_dir/test/onnx/test_pytorch_onnx_onnxruntime.py::TestONNXRuntime_opset9" \ "$top_dir/test/onnx/test_custom_ops.py" \ "$top_dir/test/onnx/test_models_onnxruntime.py" \ "$top_dir/test/onnx/test_utility_funs.py" \ "$top_dir/test/onnx/test_pytorch_onnx_caffe2.py" \ "$top_dir/test/onnx/test_pytorch_onnx_caffe2_quantized.py" \ - "$top_dir/test/onnx/test_pytorch_onnx_shape_inference.py" + "$top_dir/test/onnx/test_pytorch_onnx_shape_inference.py" \ + "$top_dir/test/onnx/test_onnx_export.py" fi -if [[ "$BUILD_ENVIRONMENT" == *ort_test2* || "${SHARD_NUMBER}" == "2" ]]; then +if [[ "${SHARD_NUMBER}" == "2" ]]; then # Update the loop for new opsets - for i in $(seq 10 15); do + for i in $(seq 10 16); do pytest "${args[@]}" \ "$top_dir/test/onnx/test_pytorch_onnx_onnxruntime.py::TestONNXRuntime_opset$i" done diff --git a/scripts/release/cut-release-branch.sh b/scripts/release/cut-release-branch.sh new file mode 100644 index 000000000000..468dbfb184d9 --- /dev/null +++ b/scripts/release/cut-release-branch.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash + +: ' +So you are looking to cut a release branch? Well you came +to the right script. + +This script can be used to cut any branch on any repository + +For `pytorch/pytorch` usage would be like: +> DRY_RUN=disabled cut-release-branch.sh + +For `pytorch/builder` or domains usage would be like: +> DRY_RUN=disabled GIT_BRANCH_TO_CUT_FROM=main RELEASE_VERSION=1.11 cut-release-branch.sh +' + +set -eou pipefail + +GIT_TOP_DIR=$(git rev-parse --show-toplevel) +GIT_REMOTE=${GIT_REMOTE:-origin} +GIT_BRANCH_TO_CUT_FROM=${GIT_BRANCH_TO_CUT_FROM:-viable/strict} + +# should output something like 1.11 +RELEASE_VERSION=${RELEASE_VERSION:-$(cut -d'.' -f1-2 "${GIT_TOP_DIR}/version.txt")} + +DRY_RUN_FLAG="--dry-run" +if [[ ${DRY_RUN:-enabled} == "disabled" ]]; then + DRY_RUN_FLAG="" +fi + + +( + set -x + git fetch --all + git checkout "${GIT_REMOTE}/${GIT_BRANCH_TO_CUT_FROM}" +) + +for branch in "release/${RELEASE_VERSION}" "orig/release/${RELEASE_VERSION}"; do + if git rev-parse --verify "${branch}" >/dev/null 2>/dev/null; then + echo "+ Branch ${branch} already exists, skipping..." 
+ continue + else + ( + set -x + git checkout "${GIT_REMOTE}/${GIT_BRANCH_TO_CUT_FROM}" + git checkout -b "${branch}" + git push "${GIT_REMOTE}" "${branch}" + ) + fi +done diff --git a/scripts/release_notes/commitlist.py b/scripts/release_notes/commitlist.py index 0dd7d0a1692a..4abaffa6fb88 100644 --- a/scripts/release_notes/commitlist.py +++ b/scripts/release_notes/commitlist.py @@ -84,21 +84,35 @@ def keywordInFile(file, keywords): def categorize(commit_hash, title): features = get_features(commit_hash, return_dict=True) title = features['title'] + labels = features['labels'] category = 'Uncategorized' topic = 'Untopiced' + # We ask contributors to label their PR's appropriately + # when they're first landed. + # Check if the labels are there first. + already_categorized = already_topiced = False + for label in labels: + if label.startswith('release notes: '): + category = label.split('release notes: ', 1)[1] + already_categorized = True + if label.startswith('topic: '): + topic = label.split('topic: ', 1)[1] + already_topiced = True + if already_categorized and already_topiced: + return Commit(commit_hash, category, topic, title) + # update this to check if each file starts with caffe2 if 'caffe2' in title: return Commit(commit_hash, 'caffe2', topic, title) if '[codemod]' in title.lower(): return Commit(commit_hash, 'skip', topic, title) - labels = features['labels'] if 'Reverted' in labels: return Commit(commit_hash, 'skip', topic, title) if 'bc_breaking' in labels: topic = 'bc-breaking' if 'module: deprecation' in labels: - topic = 'module: deprecation' + topic = 'deprecation' files_changed = features['files_changed'] for file in files_changed: @@ -128,6 +142,9 @@ def categorize(commit_hash, title): if CommitList.keywordInFile(file, ['torch/fx', 'test_fx']): category = 'fx' break + if CommitList.keywordInFile(file, ['torch/ao', 'test/ao']): + category = 'ao' + break # torch/quantization, test/quantization, aten/src/ATen/native/quantized, torch/nn/{quantized, quantizable} if CommitList.keywordInFile(file, ['torch/quantization', 'test/quantization', 'aten/src/ATen/native/quantized', 'torch/nn/quantiz']): category = 'quantization' @@ -141,15 +158,32 @@ def categorize(commit_hash, title): if CommitList.keywordInFile(file, ['aten/src/ATen/native/LinearAlgebra.cpp', 'test/test_linalg.py', 'torch/linalg']): category = 'linalg_frontend' break - if CommitList.keywordInFile(file, ['torch/sparse']): + if CommitList.keywordInFile(file, ['torch/sparse', 'aten/src/ATen/native/sparse', 'torch/_masked/__init__.py']): category = 'sparse_frontend' break - if CommitList.keywordInFile(file, ['test/test_nn.py', 'test/test_module.py', 'torch/nn/modules']): + if CommitList.keywordInFile(file, ['tools/autograd']): + category = 'autograd_frontend' + break + if CommitList.keywordInFile(file, ['test/test_nn.py', 'test/test_module.py', 'torch/nn/modules', 'torch/nn/functional.py']): category = 'nn_frontend' break - if CommitList.keywordInFile(file, ['torch/csrc/jit']): + if CommitList.keywordInFile(file, ['torch/csrc/jit', 'torch/jit']): category = 'jit' break + else: + # Below are some extra quick checks that aren't necessarily file-path related, + # but I found that to catch a decent number of extra commits. 
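+ # Commits that touch only .cu/.cuh files are almost always CUDA kernel changes.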
+ if len(files_changed) > 0 and all([f_name.endswith('.cu') or f_name.endswith('.cuh') for f_name in files_changed]): + category = 'cuda' + elif '[PyTorch Edge]' in title: + category = 'mobile' + elif len(files_changed) == 1 and 'torch/testing/_internal/common_methods_invocations.py' in files_changed[0]: + # when this is the only file changed, it's almost always an OpInfo change. + category = 'python_frontend' + elif len(files_changed) == 1 and 'torch/_torch_docs.py' in files_changed[0]: + # individual torch_docs changes are usually for python ops + category = 'python_frontend' + return Commit(commit_hash, category, topic, title) @@ -198,6 +232,14 @@ def update_existing(path, new_version): commits.update_to(new_version) commits.write_to_disk() +def rerun_with_new_filters(path): + current_commits = CommitList.from_existing(path) + for i in range(len(current_commits.commits)): + c = current_commits.commits[i] + if 'Uncategorized' in str(c): + current_commits.commits[i] = CommitList.categorize(c.commit_hash, c.title) + current_commits.write_to_disk() + def to_markdown(commit_list, category): def cleanup_title(commit): match = re.match(r'(.*) \(#\d+\)', commit.title) @@ -252,6 +294,11 @@ def main(): group = parser.add_mutually_exclusive_group(required=True) group.add_argument('--create_new', nargs=2) group.add_argument('--update_to') + # I found this flag useful when experimenting with adding new auto-categorizing filters. + # After running commitlist.py the first time, if you add any new filters in this file, + # re-running with "rerun_with_new_filters" will update the existing commitlist.csv file, + # but only affect the rows that were previously marked as "Uncategorized" + group.add_argument('--rerun_with_new_filters', action='store_true') group.add_argument('--stat', action='store_true') group.add_argument('--export_markdown', action='store_true') @@ -264,6 +311,9 @@ def main(): if args.update_to: update_existing(args.path, args.update_to) return + if args.rerun_with_new_filters: + rerun_with_new_filters(args.path) + return if args.stat: commits = CommitList.from_existing(args.path) stats = commits.stat() diff --git a/scripts/release_notes/common.py b/scripts/release_notes/common.py index d09c4ad8ed81..355dee12adaf 100644 --- a/scripts/release_notes/common.py +++ b/scripts/release_notes/common.py @@ -10,6 +10,8 @@ categories = [ 'Uncategorized', 'distributed', + 'lazy', + 'hub', 'mobile', 'jit', 'visualization', @@ -17,7 +19,9 @@ 'caffe2', 'quantization', 'amd', + 'rocm', 'cuda', + 'cudnn', 'benchmark', 'profiler', 'performance_as_product', @@ -28,6 +32,8 @@ 'code_coverage', 'vulkan', 'skip', + 'composability', + 'meta_frontend', 'nn_frontend', 'linalg_frontend', 'cpp_frontend', diff --git a/setup.py b/setup.py index aa1669a10d30..d23603bc90cb 100644 --- a/setup.py +++ b/setup.py @@ -50,6 +50,9 @@ # MKLDNN_CPU_RUNTIME # MKL-DNN threading mode: TBB or OMP (default) # +# USE_STATIC_MKL +# Prefer to link with MKL statically - Unix only +# # USE_NNPACK=0 # disables NNPACK build # @@ -506,6 +509,10 @@ def run(self): report(' -- USE_MPI={}'.format(cmake_cache_vars['USE_OPENMPI'])) else: report('-- Building without distributed package') + if cmake_cache_vars['STATIC_DISPATCH_BACKEND']: + report('-- Using static dispatch with backend {}'.format(cmake_cache_vars['STATIC_DISPATCH_BACKEND'])) + if cmake_cache_vars['USE_LIGHTWEIGHT_DISPATCH']: + report('-- Using lightweight dispatch') # Do not use clang to compile extensions if `-fstack-clash-protection` is defined # in system CFLAGS @@ -817,7 +824,16 
@@ def make_relative_rpath_args(path): include_dirs=[], library_dirs=library_dirs, extra_link_args=extra_link_args + main_link_args + make_relative_rpath_args('lib')) + C_flatbuffer = Extension("torch._C_flatbuffer", + libraries=main_libraries, + sources=["torch/csrc/stub_with_flatbuffer.c"], + language='c', + extra_compile_args=main_compile_args + extra_compile_args, + include_dirs=[], + library_dirs=library_dirs, + extra_link_args=extra_link_args + main_link_args + make_relative_rpath_args('lib')) extensions.append(C) + extensions.append(C_flatbuffer) if not IS_WINDOWS: DL = Extension("torch._dl", @@ -925,6 +941,7 @@ def print_box(msg): 'bin/*', 'test/*', '_C/*.pyi', + '_C_flatbuffer/*.pyi', 'cuda/*.pyi', 'optim/*.pyi', 'autograd/*.pyi', @@ -932,6 +949,7 @@ def print_box(msg): 'nn/*.pyi', 'nn/modules/*.pyi', 'nn/parallel/*.pyi', + 'utils/data/*.pyi', 'lib/*.so*', 'lib/*.dylib*', 'lib/*.dll', @@ -981,6 +999,7 @@ def print_box(msg): 'include/c10/cuda/impl/*.h', 'include/c10/hip/*.h', 'include/c10/hip/impl/*.h', + 'include/c10d/*.h', 'include/c10d/*.hpp', 'include/caffe2/**/*.h', 'include/torch/*.h', @@ -1010,7 +1029,8 @@ def print_box(msg): 'include/torch/csrc/autograd/utils/*.h', 'include/torch/csrc/cuda/*.h', 'include/torch/csrc/deploy/*.h', - 'include/torch/csrc/deploy/interpreter/interpreter_impl.h', + 'include/torch/csrc/deploy/interpreter/*.h', + 'include/torch/csrc/deploy/interpreter/*.hpp', 'include/torch/csrc/distributed/c10d/exception.h', 'include/torch/csrc/jit/*.h', 'include/torch/csrc/jit/backends/*.h', @@ -1031,7 +1051,9 @@ def print_box(msg): 'include/torch/csrc/profiler/*.h', 'include/torch/csrc/utils/*.h', 'include/torch/csrc/tensor/*.h', + 'include/torch/csrc/lazy/backend/*.h', 'include/torch/csrc/lazy/core/*.h', + 'include/torch/csrc/lazy/core/ops/*.h', 'include/pybind11/*.h', 'include/pybind11/detail/*.h', 'include/TH/*.h*', @@ -1058,6 +1080,7 @@ def print_box(msg): 'utils/model_dump/code.js', 'utils/model_dump/*.mjs', ], + 'torchgen': [], 'caffe2': [ 'python/serialized_test/data/operator_test/*.zip', ], diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json new file mode 100644 index 000000000000..2d6a839339d9 --- /dev/null +++ b/test/allowlist_for_publicAPI.json @@ -0,0 +1,3564 @@ +{ + "torch.amp.autocast_mode": [ + "Any", + "Optional" + ], + "torch.ao.nn.sparse.quantized.dynamic.linear": [ + "LinearBlockSparsePattern", + "Optional", + "hide_packed_params_repr" + ], + "torch.ao.nn.sparse.quantized.linear": [ + "Optional", + "hide_packed_params_repr" + ], + "torch.ao.quantization": [ + "ABC", + "ABCMeta", + "Any", + "Callable", + "Dict", + "List", + "Module", + "Optional", + "OrderedDict", + "Pattern", + "QConfigAny", + "Set", + "Tuple", + "Type", + "Union", + "abstractmethod", + "namedtuple", + "partial", + "type_before_parametrizations", + "wrap_cpp_module" + ], + "torch.ao.quantization.fake_quantize": [ + "ABC", + "Any", + "FixedQParamsObserver", + "HistogramObserver", + "Module", + "MovingAverageMinMaxObserver", + "MovingAveragePerChannelMinMaxObserver", + "Tuple", + "abstractmethod", + "default_fixed_qparams_range_0to1_fake_quant", + "default_fixed_qparams_range_0to1_observer", + "default_affine_fixed_qparams_fake_quant", + "default_affine_fixed_qparams_observer", + "default_dynamic_fake_quant", + "default_embedding_fake_quant", + "default_embedding_fake_quant_4bit", + "default_fake_quant", + "default_fused_act_fake_quant", + "default_fused_per_channel_wt_fake_quant", + "default_fused_wt_fake_quant", + "default_histogram_fake_quant", + 
"default_per_channel_weight_fake_quant", + "default_fixed_qparams_range_neg1to1_fake_quant", + "default_fixed_qparams_range_neg1to1_observer", + "default_symmetric_fixed_qparams_fake_quant", + "default_symmetric_fixed_qparams_observer", + "default_weight_fake_quant", + "fused_per_channel_wt_fake_quant_range_neg_127_to_127", + "fused_wt_fake_quant_range_neg_127_to_127" + ], + "torch.ao.quantization.fuse_modules": [ + "List", + "Optional", + "fuse_conv_bn", + "fuse_conv_bn_relu", + "get_fuser_method", + "type_before_parametrizations" + ], + "torch.ao.quantization.fuser_method_mappings": [ + "Callable", + "Dict", + "MatchAllNode", + "Optional", + "Pattern", + "Tuple", + "Type", + "Union", + "get_combined_dict" + ], + "torch.ao.quantization.backend_config.native": [ + "Any", + "Dict", + "FixedQParamsFakeQuantize", + "List", + "ObservationType", + "default_fixed_qparams_range_0to1_observer", + "default_fixed_qparams_range_neg1to1_observer", + "default_affine_fixed_qparams_observer", + "default_symmetric_fixed_qparams_observer", + "fuse_conv_bn", + "fuse_conv_bn_relu", + "fuse_convtranspose_bn", + "fuse_linear_bn", + "namedtuple", + "reverse2", + "reverse3", + "reverse_sequential_wrapper2" + ], + "torch.ao.quantization.backend_config.observation_type": [ + "Enum" + ], + "torch.ao.quantization.backend_config.tensorrt": [ + "ObservationType", + "reverse_sequential_wrapper2" + ], + "torch.ao.quantization.quantization_types": [ + "Any", + "Node", + "NodePattern", + "Pattern", + "QuantizerCls", + "Tuple", + "Union" + ], + "torch.ao.quantization.fx.convert": [ + "Any", + "Argument", + "Callable", + "Dict", + "Graph", + "GraphModule", + "List", + "Node", + "Optional", + "QConfigAny", + "QuantizedGraphModule", + "Set", + "Tuple", + "activation_is_statically_quantized", + "collect_producer_nodes", + "compare_prepare_convert_qconfig_dict", + "convert_dict_to_ordered_dict", + "convert_eq_obs", + "create_getattr_from_value", + "generate_qconfig_map", + "get_custom_module_class_keys", + "get_fused_module_classes", + "get_native_backend_config_dict", + "get_pattern_to_dtype_configs", + "get_qat_module_classes", + "get_qparam_dict", + "get_quantize_node_info", + "get_root_module_to_quantized_reference_module", + "get_swapped_custom_module_class", + "graph_module_from_producer_nodes", + "is_activation_post_process", + "is_observed_module", + "is_observed_standalone_module", + "is_qconfig_supported_by_dtype_configs", + "lower_to_fbgemm", + "qconfig_equals", + "update_obs_for_equalization", + "update_qconfig_for_fusion", + "update_qconfig_for_qat", + "weight_is_quantized" + ], + "torch.ao.quantization.fx.fuse": [ + "ABC", + "Any", + "Callable", + "DefaultFuseHandler", + "Dict", + "FuseHandler", + "FusedGraphModule", + "Graph", + "GraphModule", + "List", + "MatchAllNode", + "Node", + "NodePattern", + "Optional", + "Pattern", + "Tuple", + "Union", + "abstractmethod", + "get_fuser_method_mapping", + "get_fuser_method_new", + "get_fusion_pattern_to_extra_inputs_getter", + "get_fusion_pattern_to_fuse_handler_cls", + "get_fusion_pattern_to_root_node_getter", + "get_native_backend_config_dict", + "is_match", + "map_arg", + "sorted_patterns_dict" + ], + "torch.ao.quantization.fx.fusion_patterns": [ + "ABC", + "Any", + "Callable", + "Dict", + "Graph", + "List", + "MatchAllNode", + "Node", + "NodePattern", + "Optional", + "Pattern", + "Union", + "abstractmethod", + "get_fuser_method_new" + ], + "torch.ao.quantization.fx.graph_module": [ + "Any", + "Dict", + "Graph", + "GraphModule", + "Set", + "Union" + ], + 
"torch.ao.quantization.fx.lower_to_fbgemm": [ + "Dict", + "QConfigAny", + "QuantizedGraphModule", + "Tuple" + ], + "torch.ao.quantization.fx.match_utils": [ + "Any", + "Callable", + "Dict", + "Graph", + "List", + "MatchAllNode", + "MatchResult", + "Node", + "Optional", + "Pattern", + "QConfigAny", + "QuantizeHandler", + "Set", + "Tuple", + "is_observed_standalone_module" + ], + "torch.ao.quantization.fx.pattern_utils": [ + "Any", + "Dict", + "FixedQParamsFakeQuantize", + "List", + "MatchResult", + "Node", + "ObserverBase", + "Optional", + "OrderedDict", + "Pattern", + "QConfigAny", + "QuantizeHandler", + "Tuple" + ], + "torch.ao.quantization.fx.prepare": [ + "Any", + "Argument", + "Callable", + "Dict", + "Graph", + "GraphModule", + "List", + "MatchResult", + "Node", + "NodePattern", + "ObservedGraphModule", + "ObservedStandaloneGraphModule", + "ObserverBase", + "Optional", + "Pattern", + "QConfigAny", + "QuantizeHandler", + "Set", + "Tuple", + "Union", + "activation_is_int8_quantized", + "activation_is_statically_quantized", + "all_node_args_have_no_tensors", + "assert_and_get_unique_device", + "convert", + "convert_dict_to_ordered_dict", + "defaultdict", + "find_matches", + "generate_qconfig_map", + "get_custom_module_class_keys", + "get_flattened_qconfig_dict", + "get_fusion_pattern_to_root_node_getter", + "get_module_to_qat_module", + "get_native_backend_config_dict", + "get_new_attr_name_with_prefix", + "get_non_observable_arg_indexes_and_types", + "get_pattern_to_dtype_configs", + "get_pattern_to_input_type_to_index", + "get_pattern_to_quantize_handlers", + "get_qconfig_dtypes", + "get_standalone_module_configs", + "get_swapped_custom_module_class", + "is_activation_post_process", + "is_equalization_observer", + "is_reuse_input_qconfig", + "node_supports_equalization", + "propagate_qconfig_", + "sorted_patterns_dict", + "update_qconfig_for_fusion", + "update_qconfig_for_qat" + ], + "torch.ao.quantization.fx.qconfig_utils": [ + "Any", + "Callable", + "Dict", + "Graph", + "GraphModule", + "List", + "Optional", + "QConfig", + "QConfigAny", + "Set", + "Tuple", + "add_module_to_qconfig_obs_ctr", + "defaultdict", + "get_object_type_qconfig", + "get_qconfig_dtypes", + "is_activation_post_process", + "maybe_adjust_qconfig_for_module_type_or_name", + "qconfig_equals" + ], + "torch.ao.quantization.fx.quantization_patterns": [ + "ABC", + "Any", + "Callable", + "Dict", + "Node", + "NodePattern", + "Optional", + "Pattern", + "all_node_args_have_no_tensors" + ], + "torch.ao.quantization.fx.quantization_types": [ + "Any", + "Node", + "NodePattern", + "Pattern", + "QuantizerCls", + "Tuple", + "Union" + ], + "torch.ao.quantization.fx.utils": [ + "Any", + "Callable", + "Dict", + "Graph", + "GraphModule", + "List", + "Node", + "Optional", + "Set", + "Tuple", + "Type", + "Union", + "is_activation_post_process", + "is_per_channel", + "is_per_tensor", + "map_arg", + "namedtuple" + ], + "torch.ao.quantization.fx.backend_config_utils": [ + "Any", + "Callable", + "DefaultFuseHandler", + "Dict", + "NodePattern", + "ObservationType", + "Optional", + "Pattern", + "QuantizeHandler", + "QuantizerCls", + "activation_dtype", + "get_combined_dict", + "get_default_quant_patterns", + "get_native_backend_config_dict", + "sorted_patterns_dict", + "get_quantize_handler_cls" + ], + "torch.ao.quantization.observer": [ + "ABC", + "ABCMeta", + "Any", + "Dict", + "List", + "Optional", + "OrderedDict", + "Tuple", + "Union", + "abstractmethod", + "calculate_qmin_qmax", + "check_min_max_valid", + "partial" + ], + 
"torch.ao.quantization.qconfig": [ + "Any", + "FakeQuantize", + "FakeQuantizeBase", + "FusedMovingAvgObsFakeQuantize", + "HistogramObserver", + "MovingAverageMinMaxObserver", + "NoopObserver", + "Optional", + "PlaceholderObserver", + "QConfigAny", + "ReuseInputObserver", + "default_debug_observer", + "default_dynamic_fake_quant", + "default_dynamic_quant_observer", + "default_embedding_fake_quant", + "default_embedding_fake_quant_4bit", + "default_fake_quant", + "default_float_qparams_observer", + "default_float_qparams_observer_4bit", + "default_fused_act_fake_quant", + "default_fused_per_channel_wt_fake_quant", + "default_fused_wt_fake_quant", + "default_observer", + "default_per_channel_weight_fake_quant", + "default_per_channel_weight_observer", + "default_placeholder_observer", + "default_reuse_input_observer", + "default_weight_fake_quant", + "default_weight_observer", + "fused_per_channel_wt_fake_quant_range_neg_127_to_127", + "fused_wt_fake_quant_range_neg_127_to_127", + "namedtuple", + "per_channel_weight_observer_range_neg_127_to_127", + "weight_observer_range_neg_127_to_127" + ], + "torch.ao.quantization.qconfig_dict_utils": [ + "Any", + "Callable", + "Dict", + "OrderedDict", + "QConfigAny", + "Union", + "get_combined_dict", + "get_default_qat_module_mappings" + ], + "torch.ao.quantization.quantization_mappings": [ + "Any", + "Callable", + "DeQuantStub", + "Dict", + "Optional", + "QuantStub", + "Set", + "Union", + "default_fixed_qparams_range_0to1_fake_quant", + "default_fixed_qparams_range_neg1to1_fake_quant", + "default_affine_fixed_qparams_fake_quant", + "default_symmetric_fixed_qparams_fake_quant", + "get_combined_dict", + "type_before_parametrizations" + ], + "torch.ao.quantization.quantize": [ + "DeQuantStub", + "QuantWrapper", + "activation_is_memoryless", + "add_module_to_qconfig_obs_ctr", + "get_default_dynamic_quant_module_mappings", + "get_default_qat_module_mappings", + "get_default_qconfig_propagation_list", + "get_default_static_quant_module_mappings", + "get_default_static_quant_reference_module_mappings", + "get_qparam_dict", + "has_no_children_ignoring_parametrizations", + "no_observer_set", + "type_before_parametrizations" + ], + "torch.ao.quantization.quantize_jit": [ + "QConfig", + "QuantType", + "wrap_cpp_module" + ], + "torch.ao.quantization.utils": [ + "Any", + "Callable", + "Pattern", + "QuantType", + "Tuple", + "Union", + "is_parametrized", + "quant_type_to_str" + ], + "torch.ao.sparsity.experimental.pruner.base_pruner": [ + "ActivationReconstruction", + "BaseSparsifier", + "BiasHook", + "ModuleDict", + "ModuleList", + "PruningParametrization", + "ZeroesParametrization", + "fqn_to_module", + "module_to_fqn" + ], + "torch.ao.sparsity.experimental.pruner.parametrization": [ + "Any", + "List" + ], + "torch.ao.sparsity.scheduler.base_scheduler": [ + "BaseSparsifier", + "wraps" + ], + "torch.ao.sparsity.scheduler.lambda_scheduler": [ + "BaseScheduler" + ], + "torch.ao.sparsity.sparsifier.base_sparsifier": [ + "Dict", + "FakeSparsity", + "Optional", + "Tuple", + "defaultdict", + "fqn_to_module", + "module_to_fqn" + ], + "torch.ao.sparsity.sparsifier.weight_norm_sparsifier": [ + "BaseSparsifier", + "Tuple", + "reduce" + ], + "torch.autograd": [ + "NestedIOFunction", + "detect_anomaly", + "enable_grad", + "grad", + "gradcheck", + "gradgradcheck", + "inference_mode", + "no_grad", + "set_detect_anomaly", + "set_grad_enabled", + "variable" + ], + "torch.autograd.function": [ + "Any", + "List", + "Optional", + "OrderedDict", + "with_metaclass" + ], + 
"torch.autograd.functional": [ + "List", + "Tuple" + ], + "torch.autograd.graph": [ + "Any", + "Callable" + ], + "torch.autograd.profiler": [ + "Any", + "ContextDecorator", + "DeviceType", + "Dict", + "Future", + "List", + "Optional", + "ProfilerActivity", + "ProfilerConfig", + "ProfilerState", + "kineto_available", + "warn" + ], + "torch.autograd.profiler_legacy": [ + "DeviceType", + "EventList", + "FunctionEvent", + "ProfilerConfig", + "ProfilerState", + "warn" + ], + "torch.autograd.profiler_util": [ + "DeviceType", + "Dict", + "List", + "Optional", + "Tuple", + "attrgetter", + "defaultdict", + "namedtuple" + ], + "torch.autograd.variable": [ + "ImperativeEngine", + "with_metaclass" + ], + "torch.backends": [ + "contextmanager" + ], + "torch.backends.cuda": [ + "Union" + ], + "torch.cpu.amp.autocast_mode": [ + "Any" + ], + "torch.cuda": [ + "Any", + "Device", + "Dict", + "List", + "Optional", + "Tuple", + "Union", + "classproperty" + ], + "torch.cuda.amp.autocast_mode": [ + "Any" + ], + "torch.cuda.amp.common": [ + "find_spec" + ], + "torch.cuda.amp.grad_scaler": [ + "Any", + "Dict", + "Enum", + "List", + "Optional", + "Tuple", + "amp_definitely_not_available", + "defaultdict" + ], + "torch.cuda.nccl": [ + "init_rank", + "is_available", + "unique_id", + "version" + ], + "torch.cuda.profiler": [ + "check_error", + "cudart" + ], + "torch.distributed": [ + "AllToAllOptions", + "AllreduceCoalescedOptions", + "AllreduceOptions", + "BarrierOptions", + "BroadcastOptions", + "BuiltinCommHookType", + "Callable", + "DebugLevel", + "Dict", + "Enum", + "FileStore", + "GatherOptions", + "GradBucket", + "HashStore", + "Logger", + "Optional", + "PrefixStore", + "ProcessGroup", + "ProcessGroupGloo", + "ReduceOp", + "ReduceOptions", + "ReduceScatterOptions", + "Reducer", + "ScatterOptions", + "Store", + "TCPStore", + "Tuple", + "Union", + "get_debug_level", + "set_debug_level", + "set_debug_level_from_env", + "timedelta", + "ProcessGroupMPI", + "ProcessGroupNCCL" + ], + "torch.distributed.algorithms.ddp_comm_hooks": [ + "DistributedDataParallel", + "Enum", + "partial" + ], + "torch.distributed.algorithms.ddp_comm_hooks.debugging_hooks": [ + "Any", + "GradBucket" + ], + "torch.distributed.algorithms.ddp_comm_hooks.default_hooks": [ + "Any", + "Callable" + ], + "torch.distributed.algorithms.ddp_comm_hooks.optimizer_overlap_hooks": [ + "Any", + "Callable" + ], + "torch.distributed.algorithms.join": [ + "ABC", + "Any", + "List", + "NamedTuple", + "Optional", + "TracebackType", + "Type", + "abstractmethod" + ], + "torch.distributed.algorithms.model_averaging.averagers": [ + "ABC", + "Dict", + "Iterable", + "Union", + "abstractmethod" + ], + "torch.distributed.algorithms.model_averaging.utils": [ + "Dict", + "Iterable", + "Iterator", + "ProcessGroup", + "Union", + "group" + ], + "torch.distributed.autograd": [ + "DistAutogradContext", + "backward", + "get_gradients" + ], + "torch.distributed.distributed_c10d": [ + "AllToAllOptions", + "AllreduceCoalescedOptions", + "AllreduceOptions", + "BarrierOptions", + "BroadcastOptions", + "Callable", + "DebugLevel", + "Dict", + "GatherOptions", + "Optional", + "PrefixStore", + "ProcessGroup", + "ProcessGroupGloo", + "ReduceOp", + "ReduceOptions", + "ReduceScatterOptions", + "ScatterOptions", + "Store", + "Tuple", + "Union", + "get_debug_level", + "register_rendezvous_handler", + "rendezvous", + "timedelta", + "ProcessGroupMPI", + "ProcessGroupNCCL" + ], + "torch.distributed.elastic.agent.server.api": [ + "Any", + "Callable", + "Dict", + "Enum", + "Event", + 
"EventSource", + "List", + "Optional", + "ProcessFailure", + "SignalException", + "Std", + "Store", + "Tuple", + "Union", + "closing", + "dataclass", + "field", + "get_logger", + "prof", + "put_metric", + "record" + ], + "torch.distributed.elastic.events": [ + "Dict", + "Enum", + "EventMetadataValue", + "Optional" + ], + "torch.distributed.elastic.events.api": [ + "Dict", + "Enum", + "EventMetadataValue", + "Optional", + "Union", + "asdict", + "dataclass", + "field" + ], + "torch.distributed.elastic.events.handlers": [ + "Dict" + ], + "torch.distributed.elastic.metrics": [ + "Optional" + ], + "torch.distributed.elastic.metrics.api": [ + "Dict", + "Optional", + "namedtuple", + "wraps" + ], + "torch.distributed.elastic.multiprocessing": [ + "Callable", + "Dict", + "Tuple", + "Union", + "get_logger" + ], + "torch.distributed.elastic.multiprocessing.api": [ + "Any", + "Callable", + "Dict", + "FrameType", + "IntFlag", + "Optional", + "ProcessFailure", + "Set", + "TailLog", + "Tuple", + "Union", + "dataclass", + "field", + "nullcontext", + "record", + "redirect_stderr", + "redirect_stdout" + ], + "torch.distributed.elastic.multiprocessing.errors": [ + "Any", + "Callable", + "Dict", + "GlobalRank", + "JSON", + "List", + "Optional", + "Template", + "Tuple", + "TypeVar", + "dataclass", + "datetime", + "field", + "get_logger", + "wraps" + ], + "torch.distributed.elastic.multiprocessing.errors.error_handler": [ + "Optional" + ], + "torch.distributed.elastic.multiprocessing.errors.handlers": [ + "ErrorHandler" + ], + "torch.distributed.elastic.multiprocessing.redirects": [ + "contextmanager", + "partial", + "redirect_stderr", + "redirect_stdout" + ], + "torch.distributed.elastic.multiprocessing.tail_log": [ + "Dict", + "Event", + "Future", + "List", + "TextIO", + "ThreadPoolExecutor" + ], + "torch.distributed.elastic.rendezvous": [ + "RendezvousHandlerCreator" + ], + "torch.distributed.elastic.rendezvous.api": [ + "ABC", + "Any", + "Callable", + "Dict", + "Optional", + "RendezvousHandlerCreator", + "Store", + "Tuple", + "abstractmethod" + ], + "torch.distributed.elastic.rendezvous.dynamic_rendezvous": [ + "ABC", + "Any", + "Callable", + "Dict", + "Enum", + "List", + "NodeState", + "Optional", + "PrefixStore", + "RendezvousClosedError", + "RendezvousError", + "RendezvousHandler", + "RendezvousParameters", + "RendezvousStateError", + "RendezvousTimeoutError", + "Set", + "Store", + "Token", + "Tuple", + "abstractmethod", + "cast", + "construct_and_record_rdzv_event", + "dataclass", + "datetime", + "timedelta" + ], + "torch.distributed.elastic.rendezvous.registry": [ + "RendezvousHandler", + "RendezvousParameters", + "create_handler" + ], + "torch.distributed.elastic.rendezvous.utils": [ + "Any", + "Callable", + "Dict", + "Event", + "Optional", + "Thread", + "Tuple", + "Union", + "timedelta" + ], + "torch.distributed.elastic.timer.api": [ + "Any", + "Dict", + "List", + "Optional", + "Set", + "contextmanager", + "getframeinfo", + "stack" + ], + "torch.distributed.elastic.timer.local_timer": [ + "Any", + "Dict", + "Empty", + "List", + "RequestQueue", + "Set", + "TimerClient", + "TimerRequest", + "TimerServer", + "Tuple" + ], + "torch.distributed.elastic.utils.api": [ + "Any", + "List", + "Template" + ], + "torch.distributed.elastic.utils.data.elastic_distributed_sampler": [ + "DistributedSampler" + ], + "torch.distributed.elastic.utils.logging": [ + "Optional", + "get_log_level" + ], + "torch.distributed.elastic.utils.store": [ + "List", + "timedelta" + ], + "torch.distributed.fsdp.flatten_params_wrapper": 
[ + "Any", + "Dict", + "Generator", + "Iterator", + "List", + "NamedTuple", + "Optional", + "ParamOffset", + "Sequence", + "SharedParamInfo", + "Tensor", + "Tuple", + "Union", + "accumulate" + ], + "torch.distributed.fsdp.fully_sharded_data_parallel": [ + "Any", + "Callable", + "Dict", + "Enum", + "FlatParameter", + "FlattenParamsWrapper", + "Generator", + "Iterable", + "Iterator", + "List", + "Mapping", + "NamedTuple", + "Optional", + "Parameter", + "ProcessGroup", + "Set", + "Shard", + "ShardedTensor", + "Tuple", + "Union", + "Variable", + "auto", + "cast", + "contextmanager", + "dataclass", + "init_from_local_shards" + ], + "torch.distributed.fsdp.utils": [ + "Any", + "Callable", + "Dict", + "List", + "OrderedDict", + "Set", + "Tuple", + "Union" + ], + "torch.distributed.fsdp.wrap": [ + "Any", + "Callable", + "Dict", + "Generator", + "Optional", + "Set", + "Tuple", + "Type", + "cast" + ], + "torch.distributed.launcher.api": [ + "Any", + "Callable", + "ChildFailedError", + "Dict", + "List", + "LocalElasticAgent", + "Optional", + "RendezvousParameters", + "SignalException", + "Std", + "Tuple", + "Union", + "WorkerSpec", + "dataclass", + "field", + "get_logger", + "parse_rendezvous_endpoint" + ], + "torch.distributed.nn": [ + "Function", + "ReduceOp", + "group" + ], + "torch.distributed.nn.api.remote_module": [ + "Any", + "Callable", + "Dict", + "Iterator", + "List", + "Mapping", + "Module", + "Optional", + "Parameter", + "RemovableHandle", + "Set", + "Tensor", + "Tuple", + "Type", + "TypeVar", + "Union", + "device", + "dtype" + ], + "torch.distributed.nn.functional": [ + "Function", + "ReduceOp", + "group" + ], + "torch.distributed.nn.jit.instantiator": [ + "Optional", + "get_remote_module_template" + ], + "torch.distributed.optim.functional_adadelta": [ + "Dict", + "List", + "Optional", + "Tensor" + ], + "torch.distributed.optim.functional_adagrad": [ + "Dict", + "List", + "Optional", + "Tensor" + ], + "torch.distributed.optim.functional_adam": [ + "Dict", + "List", + "Optional", + "Tensor", + "Tuple" + ], + "torch.distributed.optim.functional_adamax": [ + "Dict", + "List", + "Optional", + "Tensor", + "Tuple" + ], + "torch.distributed.optim.functional_adamw": [ + "Dict", + "List", + "Optional", + "Tensor", + "Tuple" + ], + "torch.distributed.optim.functional_rmsprop": [ + "Dict", + "List", + "Optional", + "Tensor" + ], + "torch.distributed.optim.functional_rprop": [ + "Dict", + "List", + "Optional", + "Tensor", + "Tuple" + ], + "torch.distributed.optim.functional_sgd": [ + "Dict", + "List", + "Optional", + "Tensor" + ], + "torch.distributed.optim.optimizer": [ + "List", + "Lock", + "Optional", + "RRef", + "Tensor", + "defaultdict" + ], + "torch.distributed.optim.utils": [ + "Type" + ], + "torch.distributed.pipeline.sync.checkpoint": [ + "Checkpoint", + "Checkpointing", + "Context", + "Function", + "Recompute", + "ThreadLocal", + "checkpoint", + "enable_checkpointing", + "enable_recomputing", + "restore_rng_states", + "save_rng_states" + ], + "torch.distributed.pipeline.sync.copy": [ + "Context", + "Copy", + "Wait" + ], + "torch.distributed.pipeline.sync.dependency": [ + "Fork", + "Join", + "fork", + "join" + ], + "torch.distributed.pipeline.sync.microbatch": [ + "Batch", + "NoChunk", + "check", + "gather", + "scatter" + ], + "torch.distributed.pipeline.sync.phony": [ + "get_phony" + ], + "torch.distributed.pipeline.sync.pipe": [ + "BalanceError", + "PipeSequential", + "Pipeline", + "WithDevice" + ], + "torch.distributed.pipeline.sync.pipeline": [ + "Pipeline" + ], + 
"torch.distributed.pipeline.sync.skip.layout": [ + "SkipLayout", + "inspect_skip_layout" + ], + "torch.distributed.pipeline.sync.skip.portal": [ + "Context", + "Portal", + "PortalBlue", + "PortalCopy", + "PortalOrange" + ], + "torch.distributed.pipeline.sync.skip.skippable": [ + "Skippable" + ], + "torch.distributed.pipeline.sync.skip.tracker": [ + "SkipTracker", + "SkipTrackerThroughPotals", + "ThreadLocal", + "current_skip_tracker", + "use_skip_tracker" + ], + "torch.distributed.pipeline.sync.stream": [ + "CPUStreamType", + "as_cuda", + "current_stream", + "default_stream", + "get_device", + "is_cuda", + "new_stream", + "record_stream", + "use_device", + "use_stream", + "wait_stream" + ], + "torch.distributed.pipeline.sync.worker": [ + "Task", + "create_workers", + "spawn_workers", + "worker" + ], + "torch.distributed.remote_device": [ + "Optional", + "Union" + ], + "torch.distributed.rendezvous": [ + "Dict", + "FileStore", + "Iterable", + "Optional", + "PrefixStore", + "Store", + "TCPStore", + "Tuple", + "Union", + "cast", + "timedelta", + "urlparse", + "urlunparse" + ], + "torch.distributed.rpc": [ + "Any", + "Dict", + "Future", + "Generator", + "Generic", + "GenericWithOneTypeVar", + "PyRRef", + "RemoteProfilerManager", + "RpcAgent", + "RpcBackendOptions", + "Set", + "Store", + "TensorPipeAgent", + "Tuple", + "TypeVar", + "WorkerInfo", + "enable_gil_profiling", + "get_rpc_timeout", + "method", + "timedelta", + "urlparse" + ], + "torch.distributed.rpc.api": [ + "Any", + "Dict", + "Future", + "Generic", + "GenericWithOneTypeVar", + "PyRRef", + "PythonUDF", + "RPCExecMode", + "RemoteProfilerManager", + "Set", + "TypeVar", + "WorkerInfo", + "get_rpc_timeout", + "method" + ], + "torch.distributed.rpc.backend_registry": [ + "Dict", + "List", + "Set", + "Tuple" + ], + "torch.distributed.rpc.constants": [ + "timedelta" + ], + "torch.distributed.rpc.internal": [ + "Enum" + ], + "torch.distributed.rpc.options": [ + "DeviceType", + "Dict", + "List", + "Optional", + "Union" + ], + "torch.distributed.rpc.server_process_global_profiler": [ + "profile" + ], + "torch.distributions.bernoulli": [ + "ExponentialFamily", + "Number", + "binary_cross_entropy_with_logits", + "broadcast_all", + "lazy_property", + "logits_to_probs", + "probs_to_logits" + ], + "torch.distributions.beta": [ + "Dirichlet", + "ExponentialFamily", + "Number", + "Real", + "broadcast_all" + ], + "torch.distributions.binomial": [ + "Distribution", + "broadcast_all", + "lazy_property", + "logits_to_probs", + "probs_to_logits" + ], + "torch.distributions.categorical": [ + "Distribution", + "lazy_property", + "logits_to_probs", + "probs_to_logits" + ], + "torch.distributions.cauchy": [ + "Distribution", + "Number", + "broadcast_all" + ], + "torch.distributions.chi2": [ + "Gamma" + ], + "torch.distributions.continuous_bernoulli": [ + "ExponentialFamily", + "Number", + "binary_cross_entropy_with_logits", + "broadcast_all", + "clamp_probs", + "lazy_property", + "logits_to_probs", + "probs_to_logits" + ], + "torch.distributions.dirichlet": [ + "ExponentialFamily", + "Function", + "once_differentiable" + ], + "torch.distributions.distribution": [ + "Any", + "Dict", + "Optional", + "lazy_property" + ], + "torch.distributions.exp_family": [ + "Distribution" + ], + "torch.distributions.exponential": [ + "ExponentialFamily", + "Number", + "broadcast_all" + ], + "torch.distributions.fishersnedecor": [ + "Distribution", + "Gamma", + "Number", + "broadcast_all" + ], + "torch.distributions.gamma": [ + "ExponentialFamily", + "Number", + 
"broadcast_all" + ], + "torch.distributions.geometric": [ + "Distribution", + "Number", + "binary_cross_entropy_with_logits", + "broadcast_all", + "lazy_property", + "logits_to_probs", + "probs_to_logits" + ], + "torch.distributions.gumbel": [ + "AffineTransform", + "ExpTransform", + "Number", + "TransformedDistribution", + "Uniform", + "broadcast_all" + ], + "torch.distributions.half_cauchy": [ + "AbsTransform", + "Cauchy", + "TransformedDistribution" + ], + "torch.distributions.half_normal": [ + "AbsTransform", + "Normal", + "TransformedDistribution" + ], + "torch.distributions.independent": [ + "Dict", + "Distribution" + ], + "torch.distributions.kl": [ + "Bernoulli", + "Beta", + "Binomial", + "Callable", + "Categorical", + "Cauchy", + "ContinuousBernoulli", + "Dict", + "Dirichlet", + "Distribution", + "Exponential", + "ExponentialFamily", + "Gamma", + "Geometric", + "Gumbel", + "HalfNormal", + "Independent", + "Laplace", + "LowRankMultivariateNormal", + "MultivariateNormal", + "Normal", + "OneHotCategorical", + "Pareto", + "Poisson", + "TransformedDistribution", + "Tuple", + "Type", + "Uniform", + "total_ordering" + ], + "torch.distributions.kumaraswamy": [ + "AffineTransform", + "PowerTransform", + "TransformedDistribution", + "Uniform", + "broadcast_all" + ], + "torch.distributions.laplace": [ + "Distribution", + "Number", + "broadcast_all" + ], + "torch.distributions.lkj_cholesky": [ + "Beta", + "Distribution", + "broadcast_all" + ], + "torch.distributions.log_normal": [ + "ExpTransform", + "Normal", + "TransformedDistribution" + ], + "torch.distributions.logistic_normal": [ + "Normal", + "StickBreakingTransform", + "TransformedDistribution" + ], + "torch.distributions.lowrank_multivariate_normal": [ + "Distribution", + "lazy_property" + ], + "torch.distributions.mixture_same_family": [ + "Categorical", + "Dict", + "Distribution" + ], + "torch.distributions.multinomial": [ + "Binomial", + "Categorical", + "Distribution", + "broadcast_all" + ], + "torch.distributions.multivariate_normal": [ + "Distribution", + "lazy_property" + ], + "torch.distributions.negative_binomial": [ + "Distribution", + "broadcast_all", + "lazy_property", + "logits_to_probs", + "probs_to_logits" + ], + "torch.distributions.normal": [ + "ExponentialFamily", + "Number", + "Real", + "broadcast_all" + ], + "torch.distributions.one_hot_categorical": [ + "Categorical", + "Distribution" + ], + "torch.distributions.pareto": [ + "AffineTransform", + "ExpTransform", + "Exponential", + "TransformedDistribution", + "broadcast_all" + ], + "torch.distributions.poisson": [ + "ExponentialFamily", + "Number", + "broadcast_all" + ], + "torch.distributions.relaxed_bernoulli": [ + "Distribution", + "Number", + "SigmoidTransform", + "TransformedDistribution", + "broadcast_all", + "clamp_probs", + "lazy_property", + "logits_to_probs", + "probs_to_logits" + ], + "torch.distributions.relaxed_categorical": [ + "Categorical", + "Distribution", + "ExpTransform", + "TransformedDistribution", + "broadcast_all", + "clamp_probs" + ], + "torch.distributions.studentT": [ + "Chi2", + "Distribution", + "broadcast_all" + ], + "torch.distributions.transformed_distribution": [ + "ComposeTransform", + "Dict", + "Distribution", + "Independent", + "Transform" + ], + "torch.distributions.uniform": [ + "Distribution", + "Number", + "broadcast_all" + ], + "torch.distributions.utils": [ + "Any", + "Dict", + "Number", + "is_tensor_like", + "update_wrapper" + ], + "torch.distributions.von_mises": [ + "Distribution", + "broadcast_all", + "lazy_property" + 
], + "torch.distributions.weibull": [ + "AffineTransform", + "Exponential", + "PowerTransform", + "TransformedDistribution", + "broadcast_all" + ], + "torch.distributions.wishart": [ + "ExponentialFamily", + "Number", + "Union", + "lazy_property" + ], + "torch.fft": [ + "Tensor", + "fft", + "fft2", + "fftfreq", + "fftn", + "fftshift", + "hfft", + "ifft", + "ifft2", + "ifftn", + "ifftshift", + "ihfft", + "irfft", + "irfft2", + "irfftn", + "rfft", + "rfft2", + "rfftfreq", + "rfftn" + ], + "torch.functional": [ + "istft", + "pca_lowrank", + "svd_lowrank" + ], + "torch.futures": [ + "Callable", + "Future", + "Generic", + "List", + "Optional", + "Type", + "TypeVar", + "Union", + "cast" + ], + "torch.fx": [ + "ProxyableClassMeta", + "Tracer", + "symbolic_trace", + "wrap" + ], + "torch.fx.experimental.unification.core": [ + "Iterator", + "assoc", + "dispatch", + "isvar", + "partial", + "unify", + "walk" + ], + "torch.fx.experimental.unification.dispatch": [ + "dispatch", + "partial" + ], + "torch.fx.experimental.unification.more": [ + "dispatch", + "reify", + "unify" + ], + "torch.fx.experimental.unification.multipledispatch.conflict": [ + "groupby", + "isvariadic" + ], + "torch.fx.experimental.unification.multipledispatch.core": [ + "Dispatcher", + "MethodDispatcher" + ], + "torch.fx.experimental.unification.multipledispatch.dispatcher": [ + "AmbiguityWarning", + "Variadic", + "ambiguities", + "expand_tuples", + "isvariadic", + "ordering", + "super_signature", + "warn" + ], + "torch.fx.experimental.unification.multipledispatch.utils": [ + "OrderedDict" + ], + "torch.fx.experimental.unification.multipledispatch.variadic": [ + "typename" + ], + "torch.fx.experimental.unification.unification_tools": [ + "first", + "getter", + "groupby" + ], + "torch.fx.experimental.unification.variable": [ + "contextmanager", + "dispatch", + "hashable", + "isvar" + ], + "torch.fx.graph": [ + "Any", + "Argument", + "Callable", + "Dict", + "FrozenSet", + "List", + "NamedTuple", + "Node", + "Optional", + "Set", + "Target", + "TransformCodeFunc", + "Tuple", + "Type", + "compatibility", + "contextmanager", + "dataclass", + "map_arg" + ], + "torch.fx.graph_module": [ + "Any", + "Dict", + "Graph", + "Importer", + "List", + "Optional", + "PackageExporter", + "PackageImporter", + "Path", + "PythonCode", + "Set", + "Type", + "Union", + "compatibility" + ], + "torch.fx.immutable_collections": [ + "Any", + "Context", + "Dict", + "List", + "Tuple", + "compatibility" + ], + "torch.fx.interpreter": [ + "Any", + "Argument", + "Dict", + "Graph", + "GraphModule", + "Iterator", + "List", + "Node", + "Optional", + "Proxy", + "Target", + "Tracer", + "Tuple", + "Union", + "compatibility", + "map_aggregate", + "map_arg" + ], + "torch.fx.node": [ + "Any", + "ArgsKwargsPair", + "Argument", + "BaseArgumentTypes", + "Callable", + "Dict", + "List", + "Optional", + "Set", + "Target", + "Tuple", + "Union", + "compatibility", + "immutable_dict", + "immutable_list", + "normalize_function", + "normalize_module" + ], + "torch.fx.operator_schemas": [ + "Any", + "Callable", + "Dict", + "List", + "NamedTuple", + "OpOverload", + "OpOverloadPacket", + "Optional", + "Tuple", + "cast", + "compatibility" + ], + "torch.fx.passes.graph_drawer": [ + "Any", + "Dict", + "TensorMetadata", + "chain", + "compatibility" + ], + "torch.fx.passes.graph_manipulation": [ + "Any", + "Argument", + "Dict", + "Graph", + "GraphModule", + "List", + "NamedTuple", + "Node", + "Optional", + "ShapeProp", + "Target", + "Tuple", + "compatibility", + "lift_lowering_attrs_to_nodes", 
+ "map_aggregate", + "map_arg" + ], + "torch.fx.passes.net_min_base": [ + "Any", + "Callable", + "Dict", + "FxNetAccFusionsFinder", + "Names", + "NodeList", + "NodeSet", + "Optional", + "ShapeProp", + "TensorOrTensors", + "Tensors", + "Tuple", + "compatibility", + "dataclass", + "map_arg", + "split_by_tags" + ], + "torch.fx.passes.operator_support": [ + "IsNodeSupported", + "SupportDict", + "SupportedArgumentDTypes", + "TargetTypeName", + "TensorMetadata", + "compatibility", + "get_node_target" + ], + "torch.fx.passes.param_fetch": [ + "Any", + "Callable", + "Dict", + "GraphModule", + "List", + "Tuple", + "Type", + "compatibility" + ], + "torch.fx.passes.shape_prop": [ + "Any", + "Dict", + "NamedTuple", + "Node", + "Optional", + "Tuple", + "compatibility", + "map_aggregate" + ], + "torch.fx.passes.split_module": [ + "Any", + "Callable", + "Dict", + "GraphModule", + "List", + "Optional", + "compatibility" + ], + "torch.fx.passes.split_utils": [ + "Dict", + "List", + "NodeList", + "NodeSet", + "Optional", + "compatibility", + "dataclass", + "field", + "map_arg" + ], + "torch.fx.passes.splitter_base": [ + "Any", + "Dict", + "FxGraphDrawer", + "FxNetAccFusionsFinder", + "Iterable", + "List", + "NamedTuple", + "NodeList", + "NodeSet", + "OperatorSupportBase", + "Optional", + "Sequence", + "ShapeProp", + "Tensors", + "Tuple", + "compatibility", + "dataclass", + "defaultdict", + "get_node_target", + "get_size_of_node", + "is_node_output_tensor", + "map_arg", + "split_by_tags" + ], + "torch.fx.passes.tools_common": [ + "Any", + "Dict", + "List", + "Mapping", + "Names", + "NodeList", + "NodeSet", + "Set", + "TensorOrTensors", + "Tensors", + "Tuple", + "Union", + "compatibility", + "dataclass" + ], + "torch.fx.proxy": [ + "Any", + "Argument", + "Callable", + "Dict", + "Graph", + "Iterable", + "Iterator", + "Node", + "Optional", + "Target", + "Tuple", + "check_for_mutable_operation", + "compatibility", + "map_aggregate" + ], + "torch.fx.subgraph_rewriter": [ + "Callable", + "Dict", + "Graph", + "GraphModule", + "List", + "NamedTuple", + "Node", + "Optional", + "Set", + "compatibility", + "symbolic_trace" + ], + "torch.hub": [ + "HTTPError", + "Path", + "Request", + "tqdm", + "urlopen", + "urlparse" + ], + "torch.jit": [ + "Attribute", + "Final", + "Iterator", + "ONNXTracedModule", + "RecursiveScriptClass", + "RecursiveScriptModule", + "ScriptModule", + "ScriptWarning", + "TopLevelTracedModule", + "TracedModule", + "TracerWarning", + "TracingCheckError", + "contextmanager", + "export", + "fork", + "freeze", + "fuser", + "ignore", + "interface", + "is_scripting", + "is_tracing", + "jit_module_from_flatbuffer", + "last_executed_optimized_graph", + "load", + "optimize_for_inference", + "optimized_execution", + "run_frozen_optimizations", + "save", + "save_jit_module_to_flatbuffer", + "script", + "script_method", + "set_fusion_strategy", + "set_module", + "trace", + "trace_module", + "unused", + "wait" + ], + "torch.jit.annotations": [ + "Any", + "AnyType", + "ComplexType", + "Dict", + "DictType", + "EvalEnv", + "FloatType", + "IntType", + "List", + "ListType", + "StringType", + "TensorType", + "Tuple", + "TupleType", + "get_enum_value_type", + "is_dict", + "is_function_or_method", + "is_list", + "is_optional", + "is_tensor", + "is_tuple", + "is_union", + "is_vararg" + ], + "torch.jit.frontend": [ + "Apply", + "Assert", + "Assign", + "Attribute", + "AugAssign", + "BinOp", + "Break", + "ClassDef", + "Const", + "Continue", + "Decl", + "Def", + "Delete", + "DictComp", + "DictLiteral", + "Dots", + 
"EmptyTypeAnnotation", + "ExprStmt", + "FalseLiteral", + "For", + "FunctionModifiers", + "Ident", + "If", + "List", + "ListComp", + "ListLiteral", + "NoneLiteral", + "Param", + "Pass", + "Property", + "Raise", + "Return", + "Select", + "SliceExpr", + "Starred", + "Stmt", + "StringLiteral", + "Subscript", + "TernaryIf", + "TrueLiteral", + "Tuple", + "TupleLiteral", + "UnaryOp", + "Var", + "While", + "With", + "WithItem", + "dedent", + "get_qualified_name", + "get_source_lines_and_file", + "is_static_fn", + "make_source_context", + "namedtuple", + "parse_def", + "should_drop", + "monkeytype_trace" + ], + "torch.linalg": [ + "LinAlgError", + "Tensor", + "cholesky", + "cholesky_ex", + "cond", + "cross", + "det", + "diagonal", + "eig", + "eigh", + "eigvals", + "eigvalsh", + "householder_product", + "inv", + "inv_ex", + "ldl_factor", + "ldl_factor_ex", + "ldl_solve", + "lstsq", + "lu", + "lu_factor", + "lu_factor_ex", + "matmul", + "matrix_exp", + "matrix_norm", + "matrix_power", + "matrix_rank", + "multi_dot", + "norm", + "pinv", + "qr", + "slogdet", + "solve", + "solve_triangular", + "svd", + "svdvals", + "tensorinv", + "tensorsolve", + "vander", + "vector_norm" + ], + "torch.multiprocessing": [ + "Array", + "AuthenticationError", + "Barrier", + "BoundedSemaphore", + "BufferTooShort", + "Condition", + "Event", + "JoinableQueue", + "Lock", + "Manager", + "Pipe", + "Pool", + "Process", + "ProcessContext", + "ProcessError", + "ProcessExitedException", + "ProcessRaisedException", + "Queue", + "RLock", + "RawArray", + "RawValue", + "Semaphore", + "SimpleQueue", + "SpawnContext", + "TimeoutError", + "Value", + "active_children", + "allow_connection_pickling", + "cpu_count", + "current_process", + "freeze_support", + "get_all_start_methods", + "get_context", + "get_logger", + "get_start_method", + "init_reductions", + "log_to_stderr", + "set_executable", + "set_forkserver_preload", + "set_start_method", + "spawn", + "start_processes", + "parent_process" + ], + "torch.multiprocessing.reductions": [ + "ForkingPickler", + "Union", + "check_serializing_named_tensor", + "register_after_fork" + ], + "torch.multiprocessing.spawn": [ + "Optional" + ], + "torch.nn.common_types": [ + "Optional", + "Tensor", + "Tuple", + "TypeVar", + "Union" + ], + "torch.nn.functional": [ + "Callable", + "DType", + "List", + "Optional", + "Tensor", + "Tuple", + "Union", + "adaptive_avg_pool1d", + "avg_pool1d", + "avg_pool2d", + "avg_pool3d", + "bilinear", + "boolean_dispatch", + "celu_", + "channel_shuffle", + "conv1d", + "conv2d", + "conv3d", + "conv_tbc", + "conv_transpose1d", + "conv_transpose2d", + "conv_transpose3d", + "cosine_similarity", + "elu_", + "gelu", + "handle_torch_function", + "hardshrink", + "hardtanh_", + "has_torch_function", + "has_torch_function_unary", + "has_torch_function_variadic", + "leaky_relu_", + "linear", + "logsigmoid", + "native_channel_shuffle", + "one_hot", + "pairwise_distance", + "pdist", + "pixel_shuffle", + "pixel_unshuffle", + "prelu", + "relu_", + "rrelu_", + "selu_", + "softplus", + "softshrink", + "threshold_" + ], + "torch.nn.init": [ + "Tensor" + ], + "torch.nn.intrinsic.modules": [ + "_FusedModule" + ], + "torch.nn.intrinsic.modules.fused": [ + "BatchNorm1d", + "BatchNorm2d", + "BatchNorm3d", + "Conv1d", + "Conv2d", + "Conv3d", + "Linear", + "ReLU", + "type_before_parametrizations" + ], + "torch.nn.intrinsic.qat.modules.conv_fused": [ + "Parameter", + "TypeVar", + "fuse_conv_bn_weights" + ], + "torch.nn.intrinsic.qat.modules.linear_fused": [ + "Parameter", + "fuse_linear_bn_weights" 
+ ], + "torch.nn.intrinsic.quantized.modules.conv_relu": [ + "fuse_conv_bn_weights" + ], + "torch.nn.modules.activation": [ + "Module", + "NonDynamicallyQuantizableLinear", + "Optional", + "Parameter", + "Tensor", + "Tuple", + "constant_", + "xavier_normal_", + "xavier_uniform_" + ], + "torch.nn.modules.adaptive": [ + "Linear", + "List", + "Module", + "ModuleList", + "Sequence", + "Sequential", + "Tensor", + "log_softmax", + "namedtuple" + ], + "torch.nn.modules.batchnorm": [ + "Any", + "LazyModuleMixin", + "Module", + "Optional", + "Parameter", + "Tensor", + "UninitializedBuffer", + "UninitializedParameter", + "sync_batch_norm" + ], + "torch.nn.modules.channelshuffle": [ + "Module", + "Tensor" + ], + "torch.nn.modules.container": [ + "Any", + "Dict", + "Iterable", + "Iterator", + "Mapping", + "Module", + "Optional", + "OrderedDict", + "Parameter", + "Tuple", + "TypeVar", + "Union", + "chain", + "islice", + "overload" + ], + "torch.nn.modules.conv": [ + "LazyModuleMixin", + "List", + "Module", + "Optional", + "Parameter", + "Tensor", + "Tuple", + "UninitializedParameter", + "Union" + ], + "torch.nn.modules.distance": [ + "Module", + "Tensor" + ], + "torch.nn.modules.dropout": [ + "Module", + "Tensor" + ], + "torch.nn.modules.flatten": [ + "Module", + "Tensor", + "Tuple", + "Union" + ], + "torch.nn.modules.fold": [ + "Module", + "Tensor" + ], + "torch.nn.modules.instancenorm": [ + "Tensor" + ], + "torch.nn.modules.lazy": [ + "Protocol", + "is_lazy" + ], + "torch.nn.modules.linear": [ + "LazyModuleMixin", + "Module", + "NonDynamicallyQuantizableLinear", + "Parameter", + "Tensor", + "UninitializedParameter" + ], + "torch.nn.modules.loss": [ + "Callable", + "Module", + "Optional", + "PairwiseDistance", + "Tensor" + ], + "torch.nn.modules.module": [ + "Any", + "Callable", + "Dict", + "Iterator", + "List", + "Mapping", + "Optional", + "OrderedDict", + "Parameter", + "RemovableHandle", + "Set", + "Tensor", + "Tuple", + "TypeVar", + "Union", + "device", + "dtype", + "namedtuple", + "overload" + ], + "torch.nn.modules.normalization": [ + "List", + "Module", + "Parameter", + "Size", + "Tensor", + "Tuple", + "Union" + ], + "torch.nn.modules.padding": [ + "Module", + "Sequence", + "Tensor", + "Tuple" + ], + "torch.nn.modules.pixelshuffle": [ + "Module", + "Tensor" + ], + "torch.nn.modules.pooling": [ + "List", + "Module", + "Optional", + "Tensor" + ], + "torch.nn.modules.rnn": [ + "List", + "Module", + "Optional", + "PackedSequence", + "Parameter", + "Tensor", + "Tuple", + "overload" + ], + "torch.nn.modules.sparse": [ + "Module", + "Optional", + "Parameter", + "Tensor" + ], + "torch.nn.modules.transformer": [ + "Any", + "Callable", + "Dropout", + "LayerNorm", + "Linear", + "Module", + "ModuleList", + "MultiheadAttention", + "Optional", + "Tensor", + "Union", + "xavier_uniform_" + ], + "torch.nn.modules.upsampling": [ + "Module", + "Optional", + "Tensor" + ], + "torch.nn.modules.utils": [ + "Any", + "Dict", + "List", + "repeat" + ], + "torch.nn.parallel": [ + "DistributedDataParallelCPU" + ], + "torch.nn.parallel.comm": [ + "List" + ], + "torch.nn.parallel.data_parallel": [ + "Module", + "chain", + "gather", + "parallel_apply", + "replicate", + "scatter_kwargs" + ], + "torch.nn.parallel.distributed": [ + "Any", + "Callable", + "Enum", + "Function", + "Join", + "JoinHook", + "Joinable", + "Module", + "RRef", + "ReduceOp", + "Type", + "Variable", + "auto", + "contextmanager", + "dataclass", + "gather", + "is_namedtuple", + "scatter_kwargs", + "tree_flatten", + "tree_unflatten" + ], + 
"torch.nn.parallel.parallel_apply": [ + "ExceptionWrapper", + "autocast" + ], + "torch.nn.parallel.replicate": [ + "OrderedDict" + ], + "torch.nn.parallel.scatter_gather": [ + "Gather", + "Scatter" + ], + "torch.nn.parameter": [ + "OrderedDict" + ], + "torch.nn.qat.dynamic.modules.linear": [ + "activation_is_memoryless" + ], + "torch.nn.qat.modules.conv": [ + "Tuple", + "TypeVar", + "Union" + ], + "torch.nn.qat.modules.embedding_ops": [ + "Tensor" + ], + "torch.nn.qat.modules.linear": [ + "LinearReLU", + "is_parametrized", + "transfer_parametrizations_and_params", + "type_before_parametrizations" + ], + "torch.nn.quantizable.modules.activation": [ + "Optional", + "Tensor", + "Tuple" + ], + "torch.nn.quantizable.modules.rnn": [ + "Optional", + "Tensor", + "Tuple" + ], + "torch.nn.quantized": [ + "MaxPool2d" + ], + "torch.nn.quantized.dynamic.modules.conv": [ + "Tensor" + ], + "torch.nn.quantized.dynamic.modules.rnn": [ + "Dict", + "List", + "Optional", + "PackedSequence", + "Tensor", + "Tuple", + "Union" + ], + "torch.nn.quantized.functional": [ + "List", + "Optional", + "Tensor" + ], + "torch.nn.quantized.modules": [ + "MaxPool2d", + "_ConvNd" + ], + "torch.nn.quantized.modules.batchnorm": [ + "Tensor" + ], + "torch.nn.quantized.modules.conv": [ + "List", + "Optional", + "TypeVar", + "WeightedQuantizedModule", + "fuse_conv_bn_weights" + ], + "torch.nn.quantized.modules.embedding_ops": [ + "List", + "Optional", + "Tensor", + "hide_packed_params_repr" + ], + "torch.nn.quantized.modules.functional_modules": [ + "List", + "Tensor" + ], + "torch.nn.quantized.modules.linear": [ + "Iterable", + "Optional", + "WeightedQuantizedModule", + "fuse_linear_bn_weights", + "hide_packed_params_repr", + "type_before_parametrizations" + ], + "torch.nn.quantized.modules.utils": [ + "repeat" + ], + "torch.nn.utils.clip_grad": [ + "Iterable", + "Union" + ], + "torch.nn.utils.convert_parameters": [ + "Iterable", + "Optional" + ], + "torch.nn.utils.parametrizations": [ + "Enum", + "Module", + "Optional", + "Tensor", + "auto" + ], + "torch.nn.utils.parametrize": [ + "Dict", + "Module", + "ModuleDict", + "ModuleList", + "Optional", + "Parameter", + "Sequence", + "Tensor", + "Tuple", + "Union", + "contextmanager" + ], + "torch.nn.utils.rnn": [ + "Iterable", + "List", + "Optional", + "Tensor", + "Tuple", + "Union", + "namedtuple" + ], + "torch.nn.utils.spectral_norm": [ + "Any", + "Module", + "Optional", + "TypeVar", + "normalize" + ], + "torch.nn.utils.weight_norm": [ + "Any", + "Module", + "Parameter", + "TypeVar", + "UninitializedParameter", + "norm_except_dim" + ], + "torch.onnx": [ + "Dict", + "OperatorExportTypes", + "Optional", + "TensorProtoDataType", + "TrainingMode" + ], + "torch.optim.adadelta": [ + "List", + "Optimizer", + "Optional", + "Tensor" + ], + "torch.optim.adagrad": [ + "List", + "Optimizer", + "Optional", + "Tensor" + ], + "torch.optim.adam": [ + "List", + "Optimizer", + "Optional", + "Tensor" + ], + "torch.optim.adamax": [ + "List", + "Optimizer", + "Optional", + "Tensor" + ], + "torch.optim.adamw": [ + "List", + "Optimizer", + "Optional", + "Tensor" + ], + "torch.optim.asgd": [ + "List", + "Optimizer", + "Optional", + "Tensor" + ], + "torch.optim.lbfgs": [ + "Optimizer", + "reduce" + ], + "torch.optim.lr_scheduler": [ + "Counter", + "Optimizer", + "bisect_right", + "wraps" + ], + "torch.optim.nadam": [ + "List", + "Optimizer", + "Optional", + "Tensor" + ], + "torch.optim.optimizer": [ + "chain", + "deepcopy", + "defaultdict" + ], + "torch.optim.radam": [ + "List", + "Optimizer", + "Optional", 
+ "Tensor" + ], + "torch.optim.rmsprop": [ + "List", + "Optimizer", + "Optional", + "Tensor" + ], + "torch.optim.rprop": [ + "List", + "Optimizer", + "Optional", + "Tensor" + ], + "torch.optim.sgd": [ + "List", + "Optimizer", + "Optional", + "Tensor" + ], + "torch.optim.sparse_adam": [ + "Optimizer" + ], + "torch.optim.swa_utils": [ + "Module", + "deepcopy" + ], + "torch.overrides": [ + "BaseTorchFunctionMode", + "TorchFunctionMode", + "TorchFunctionModeMeta", + "enable_torch_function_mode", + "get_default_nowrap_functions", + "has_torch_function", + "push_torch_function_mode" + ], + "torch.package.analyze.find_first_use_of_broken_modules": [ + "Dict", + "List", + "PackagingError" + ], + "torch.package.analyze.is_from_package": [ + "Any", + "ModuleType", + "is_mangled" + ], + "torch.package.analyze.trace_dependencies": [ + "Any", + "Callable", + "Iterable", + "List", + "Tuple" + ], + "torch.package.file_structure_representation": [ + "Dict", + "GlobGroup", + "GlobPattern", + "List" + ], + "torch.package.find_file_dependencies": [ + "List", + "Optional", + "Tuple" + ], + "torch.package.glob_group": [ + "GlobPattern", + "Iterable", + "Union" + ], + "torch.package.importer": [ + "ABC", + "Any", + "Dict", + "List", + "ModuleType", + "Optional", + "Tuple", + "abstractmethod", + "demangle", + "get_mangle_prefix", + "is_mangled" + ], + "torch.package.package_exporter": [ + "ActionHook", + "Any", + "BinaryIO", + "Callable", + "DefaultDict", + "DiGraph", + "Dict", + "Enum", + "GlobGroup", + "GlobPattern", + "Importer", + "List", + "Optional", + "OrderedDict", + "OrderedImporter", + "Path", + "RemovableHandle", + "Sequence", + "Set", + "Storage", + "Union", + "cast", + "create_pickler", + "dataclass", + "defaultdict", + "demangle", + "find_files_source_depends_on", + "is_mangled", + "is_stdlib_module", + "location_tag", + "normalize_storage_type" + ], + "torch.package.package_importer": [ + "Any", + "BinaryIO", + "Callable", + "Dict", + "Directory", + "DirectoryReader", + "GlobPattern", + "Importer", + "List", + "Optional", + "PackageMangler", + "PackageUnpickler", + "Path", + "Union", + "WeakValueDictionary", + "cast", + "contextmanager", + "demangle" + ], + "torch.profiler": [ + "DeviceType", + "ProfilerActivity", + "kineto_available", + "record_function" + ], + "torch.profiler.profiler": [ + "Any", + "Callable", + "Dict", + "Enum", + "Iterable", + "List", + "Optional", + "ProfilerActivity", + "Tuple", + "kineto_available", + "partial", + "warn" + ], + "torch.quantization": [ + "ABC", + "DeQuantStub", + "FakeQuantize", + "FakeQuantizeBase", + "FixedQParamsFakeQuantize", + "FusedMovingAvgObsFakeQuantize", + "HistogramObserver", + "MinMaxObserver", + "MovingAverageMinMaxObserver", + "MovingAveragePerChannelMinMaxObserver", + "NoopObserver", + "ObserverBase", + "PerChannelMinMaxObserver", + "PlaceholderObserver", + "QConfig", + "QConfigAny", + "QConfigDynamic", + "QuantStub", + "QuantType", + "QuantWrapper", + "RecordingObserver", + "add_module_to_qconfig_obs_ctr", + "add_observer_", + "add_quant_dequant", + "assert_valid_qconfig", + "convert", + "convert_dynamic_jit", + "convert_jit", + "default_fixed_qparams_range_0to1_fake_quant", + "default_affine_fixed_qparams_fake_quant", + "default_debug_observer", + "default_dynamic_quant_observer", + "default_fake_quant", + "default_float_qparams_observer", + "default_fused_act_fake_quant", + "default_fused_per_channel_wt_fake_quant", + "default_fused_wt_fake_quant", + "default_histogram_fake_quant", + "default_histogram_observer", + "default_observer", + 
"default_per_channel_weight_fake_quant", + "default_per_channel_weight_observer", + "default_placeholder_observer", + "default_fixed_qparams_range_neg1to1_fake_quant", + "default_symmetric_fixed_qparams_fake_quant", + "default_weight_fake_quant", + "default_weight_observer", + "disable_fake_quant", + "disable_observer", + "enable_fake_quant", + "enable_observer", + "fuse_conv_bn", + "fuse_conv_bn_jit", + "fuse_conv_bn_relu", + "fuse_linear_bn", + "fuse_modules", + "get_default_compare_output_module_list", + "get_default_dynamic_quant_module_mappings", + "get_default_float_to_quantized_operator_mappings", + "get_default_qat_module_mappings", + "get_default_qat_qconfig", + "get_default_qconfig", + "get_default_qconfig_propagation_list", + "get_default_static_quant_module_mappings", + "get_dynamic_quant_module_class", + "get_fuser_method", + "get_observer_dict", + "get_observer_state_dict", + "get_quantized_operator", + "get_static_quant_module_class", + "get_unique_devices_", + "is_activation_post_process", + "load_observer_state_dict", + "no_observer_set", + "prepare", + "prepare_dynamic_jit", + "prepare_jit", + "prepare_qat", + "propagate_qconfig_", + "qconfig_equals", + "quant_type_to_str", + "quantize", + "quantize_dynamic", + "quantize_dynamic_jit", + "quantize_jit", + "quantize_qat", + "register_activation_post_process_hook", + "script_qconfig", + "script_qconfig_dict", + "swap_module" + ], + "torch.quantization.fake_quantize": [ + "FakeQuantize", + "FakeQuantizeBase", + "FixedQParamsFakeQuantize", + "FusedMovingAvgObsFakeQuantize", + "default_fixed_qparams_range_0to1_fake_quant", + "default_affine_fixed_qparams_fake_quant", + "default_fake_quant", + "default_fused_act_fake_quant", + "default_fused_per_channel_wt_fake_quant", + "default_fused_wt_fake_quant", + "default_histogram_fake_quant", + "default_per_channel_weight_fake_quant", + "default_fixed_qparams_range_neg1to1_fake_quant", + "default_symmetric_fixed_qparams_fake_quant", + "default_weight_fake_quant", + "disable_fake_quant", + "disable_observer", + "enable_fake_quant", + "enable_observer" + ], + "torch.quantization.fuse_modules": [ + "fuse_conv_bn", + "fuse_conv_bn_relu", + "fuse_known_modules", + "fuse_modules", + "get_fuser_method" + ], + "torch.quantization.fuser_method_mappings": [ + "fuse_conv_bn", + "fuse_conv_bn_relu", + "fuse_linear_bn", + "get_fuser_method" + ], + "torch.quantization.observer": [ + "ABC", + "HistogramObserver", + "MinMaxObserver", + "MovingAverageMinMaxObserver", + "MovingAveragePerChannelMinMaxObserver", + "NoopObserver", + "ObserverBase", + "PerChannelMinMaxObserver", + "PlaceholderObserver", + "RecordingObserver", + "default_debug_observer", + "default_dynamic_quant_observer", + "default_float_qparams_observer", + "default_histogram_observer", + "default_observer", + "default_per_channel_weight_observer", + "default_placeholder_observer", + "default_weight_observer", + "get_observer_state_dict", + "load_observer_state_dict" + ], + "torch.quantization.qconfig": [ + "QConfig", + "QConfigAny", + "QConfigDynamic", + "add_module_to_qconfig_obs_ctr", + "assert_valid_qconfig", + "get_default_qat_qconfig", + "get_default_qconfig", + "qconfig_equals" + ], + "torch.quantization.quant_type": [ + "QuantType", + "quant_type_to_str" + ], + "torch.quantization.quantization_mappings": [ + "get_default_compare_output_module_list", + "get_default_dynamic_quant_module_mappings", + "get_default_float_to_quantized_operator_mappings", + "get_default_qat_module_mappings", + "get_default_qconfig_propagation_list", + 
"get_default_static_quant_module_mappings", + "get_dynamic_quant_module_class", + "get_quantized_operator", + "get_static_quant_module_class", + "no_observer_set" + ], + "torch.quantization.quantize": [ + "add_observer_", + "add_quant_dequant", + "convert", + "get_observer_dict", + "get_unique_devices_", + "is_activation_post_process", + "prepare", + "prepare_qat", + "propagate_qconfig_", + "quantize", + "quantize_dynamic", + "quantize_qat", + "register_activation_post_process_hook", + "swap_module" + ], + "torch.quantization.quantize_jit": [ + "convert_dynamic_jit", + "convert_jit", + "fuse_conv_bn_jit", + "prepare_dynamic_jit", + "prepare_jit", + "quantize_dynamic_jit", + "quantize_jit", + "script_qconfig", + "script_qconfig_dict" + ], + "torch.quantization.stubs": [ + "DeQuantStub", + "QuantStub", + "QuantWrapper" + ], + "torch.quasirandom": [ + "Optional" + ], + "torch.random": [ + "Generator" + ], + "torch.return_types": [ + "_det_lu_based_helper", + "_fake_quantize_per_tensor_affine_cachemask_tensor_qparams", + "_fused_moving_avg_obs_fq_helper", + "_linalg_svd", + "_linalg_svd_out", + "_lu_with_info", + "_unpack_dual", + "attr", + "pytree_register_structseq" + ], + "torch.serialization": [ + "Any", + "BinaryIO", + "Dict", + "IO", + "Optional", + "Storage", + "Tuple", + "Type", + "Union", + "cast", + "closing", + "contextmanager", + "get_source_lines_and_file" + ], + "torch.sparse": [ + "BFloat16Tensor", + "ByteTensor", + "CharTensor", + "DoubleTensor", + "FloatTensor", + "HalfTensor", + "IntTensor", + "LongTensor", + "ShortTensor", + "addmm", + "log_softmax", + "mm", + "softmax" + ], + "torch.special": [ + "digamma", + "entr", + "erf", + "erfc", + "erfcx", + "erfinv", + "exp2", + "expit", + "expm1", + "gammainc", + "gammaincc", + "gammaln", + "i0", + "i0e", + "i1", + "i1e", + "log1p", + "log_ndtr", + "log_softmax", + "logit", + "logsumexp", + "multigammaln", + "ndtr", + "ndtri", + "polygamma", + "psi", + "round", + "sinc", + "softmax", + "xlog1py", + "xlogy", + "zeta" + ], + "torch.storage": [ + "Any", + "Storage", + "Type", + "TypeVar", + "Union", + "cast", + "lru_cache" + ], + "torch.testing": [ + "FileCheck", + "all_types", + "all_types_and", + "all_types_and_complex", + "all_types_and_complex_and", + "all_types_and_half", + "assert_allclose", + "assert_close", + "complex_types", + "double_types", + "empty_types", + "floating_and_complex_types", + "floating_and_complex_types_and", + "floating_types", + "floating_types_and", + "floating_types_and_half", + "get_all_complex_dtypes", + "get_all_device_types", + "get_all_dtypes", + "get_all_fp_dtypes", + "get_all_int_dtypes", + "get_all_math_dtypes", + "integral_types", + "integral_types_and", + "make_non_contiguous", + "make_tensor", + "rand", + "randn" + ], + "torch.torch_version": [ + "Any", + "Iterable" + ], + "torch.types": [ + "Any", + "Device", + "List", + "Number", + "Sequence", + "Tuple", + "Union" + ], + "torch.utils": [ + "disable_minidumps", + "enable_minidumps", + "enable_minidumps_on_exceptions" + ], + "torch.utils.benchmark.utils.common": [ + "_make_temp_dir", + "ordered_unique", + "select_unit", + "set_torch_threads", + "trim_sigfig", + "unit_to_english" + ], + "torch.utils.benchmark.utils.compare": [ + "Colorize", + "Table", + "optional_min" + ], + "torch.utils.benchmark.utils.cpp_jit": [ + "Any", + "CallgrindModuleType", + "List", + "Optional", + "TimeitModuleType" + ], + "torch.utils.benchmark.utils.fuzzer": [ + "dtype_size", + "prod" + ], + "torch.utils.benchmark.utils.sparse_fuzzer": [ + "FuzzedTensor", + "Number", 
+ "Optional", + "Tuple", + "Union" + ], + "torch.utils.benchmark.utils.timer": [ + "CPPTimer", + "timer" + ], + "torch.utils.benchmark.utils.valgrind_wrapper.timer_interface": [ + "GlobalsBridge", + "Serialization", + "wrapper_singleton" + ], + "torch.utils.cpp_extension": [ + "ExtensionVersioner", + "FileBaton", + "GeneratedFileCleaner", + "List", + "Optional", + "TorchVersion", + "Tuple", + "Union", + "build_ext", + "get_hip_file_path" + ], + "torch.utils.data": [ + "_DatasetKind", + "argument_validation", + "default_collate", + "default_convert", + "functional_datapipe", + "get_worker_info", + "guaranteed_datapipes_determinism", + "non_deterministic", + "runtime_validation", + "runtime_validation_disabled" + ], + "torch.utils.data.dataloader": [ + "default_collate", + "default_convert", + "get_worker_info" + ], + "torch.utils.data.datapipes.dataframe": [ + "DFIterDataPipe" + ], + "torch.utils.dlpack": [ + "Any", + "to_dlpack" + ], + "torch.utils.hipify.hipify_python": [ + "Dict", + "HipifyFinalResult", + "HipifyResult", + "Iterable", + "Iterator", + "List", + "Mapping", + "Optional" + ], + "torch.utils.hooks": [ + "Any", + "OrderedDict" + ], + "torch.utils.show_pickle": [ + "Any", + "BinaryIO", + "IO", + "Union" + ], + "torch.utils.tensorboard.summary": [ + "HistogramProto", + "Optional", + "PrCurvePluginData", + "Summary", + "SummaryMetadata", + "TensorProto", + "TensorShapeProto", + "TextPluginData", + "convert_to_HWC", + "make_np", + "range" + ], + "torch.utils.tensorboard.writer": [ + "Event", + "EventFileWriter", + "ProjectorConfig", + "SessionLog", + "audio", + "custom_scalars", + "figure_to_image", + "get_embedding_info", + "graph", + "histogram", + "histogram_raw", + "hparams", + "image", + "image_boxes", + "load_onnx_graph", + "make_mat", + "make_np", + "make_sprite", + "make_tsv", + "mesh", + "pr_curve", + "pr_curve_raw", + "scalar", + "text", + "video", + "write_pbtxt" + ], + "torch": [ + "BFloat16Storage", + "BFloat16Tensor", + "ComplexDoubleStorage", + "ComplexFloatStorage", + "DisableTorchFunction", + "Generator", + "HalfStorage", + "HalfTensor", + "QInt32Storage", + "QInt8Storage", + "QUInt2x4Storage", + "QUInt4x2Storage", + "QUInt8Storage", + "Storage", + "_TypedStorage", + "_adaptive_avg_pool2d", + "_adaptive_avg_pool3d", + "_add_batch_dim", + "_add_relu", + "_add_relu_", + "_addmm_activation", + "_aminmax", + "_amp_foreach_non_finite_check_and_unscale_", + "_amp_update_scale_", + "_assert_async", + "_batch_norm_impl_index", + "_cast_Byte", + "_cast_Char", + "_cast_Double", + "_cast_Float", + "_cast_Half", + "_cast_Int", + "_cast_Long", + "_cast_Short", + "_choose_qparams_per_tensor", + "_coalesce", + "_compute_linear_combination", + "_conj", + "_conj_copy", + "_conj_physical", + "_convert_indices_from_coo_to_csr", + "_convert_indices_from_csr_to_coo", + "_convolution", + "_convolution_mode", + "_copy_from", + "_copy_from_and_resize", + "_ctc_loss", + "_cudnn_ctc_loss", + "_cudnn_init_dropout_state", + "_cudnn_rnn", + "_cudnn_rnn_flatten_weight", + "_cufft_clear_plan_cache", + "_cufft_get_plan_cache_max_size", + "_cufft_get_plan_cache_size", + "_cufft_set_plan_cache_max_size", + "_cummax_helper", + "_cummin_helper", + "_debug_has_internal_overlap", + "_det_lu_based_helper", + "_det_lu_based_helper_backward_helper", + "_dim_arange", + "_dirichlet_grad", + "_disable_functionalization", + "_efficientzerotensor", + "_embedding_bag", + "_embedding_bag_forward_only", + "_empty_affine_quantized", + "_empty_per_channel_affine_quantized", + "_enable_functionalization", + 
"_euclidean_dist", + "_fake_quantize_learnable_per_channel_affine", + "_fake_quantize_learnable_per_tensor_affine", + "_fake_quantize_per_tensor_affine_cachemask_tensor_qparams", + "_fft_c2c", + "_fft_c2r", + "_fft_r2c", + "_foreach_abs", + "_foreach_abs_", + "_foreach_acos", + "_foreach_acos_", + "_foreach_add", + "_foreach_add_", + "_foreach_addcdiv", + "_foreach_addcdiv_", + "_foreach_addcmul", + "_foreach_addcmul_", + "_foreach_asin", + "_foreach_asin_", + "_foreach_atan", + "_foreach_atan_", + "_foreach_ceil", + "_foreach_ceil_", + "_foreach_cos", + "_foreach_cos_", + "_foreach_cosh", + "_foreach_cosh_", + "_foreach_div", + "_foreach_div_", + "_foreach_erf", + "_foreach_erf_", + "_foreach_erfc", + "_foreach_erfc_", + "_foreach_exp", + "_foreach_exp_", + "_foreach_expm1", + "_foreach_expm1_", + "_foreach_floor", + "_foreach_floor_", + "_foreach_frac", + "_foreach_frac_", + "_foreach_lgamma", + "_foreach_lgamma_", + "_foreach_log", + "_foreach_log10", + "_foreach_log10_", + "_foreach_log1p", + "_foreach_log1p_", + "_foreach_log2", + "_foreach_log2_", + "_foreach_log_", + "_foreach_maximum", + "_foreach_minimum", + "_foreach_mul", + "_foreach_mul_", + "_foreach_neg", + "_foreach_neg_", + "_foreach_norm", + "_foreach_reciprocal", + "_foreach_reciprocal_", + "_foreach_round", + "_foreach_round_", + "_foreach_sigmoid", + "_foreach_sigmoid_", + "_foreach_sin", + "_foreach_sin_", + "_foreach_sinh", + "_foreach_sinh_", + "_foreach_sqrt", + "_foreach_sqrt_", + "_foreach_sub", + "_foreach_sub_", + "_foreach_tan", + "_foreach_tan_", + "_foreach_tanh", + "_foreach_tanh_", + "_foreach_trunc", + "_foreach_trunc_", + "_foreach_zero_", + "_from_functional_tensor", + "_fused_dropout", + "_fused_moving_avg_obs_fq_helper", + "_fw_primal_copy", + "_grid_sampler_2d_cpu_fallback", + "_has_compatible_shallow_copy_type", + "_histogramdd_bin_edges", + "_histogramdd_from_bin_cts", + "_histogramdd_from_bin_tensors", + "_index_put_impl_", + "_indices_copy", + "_is_functional_tensor", + "_is_zerotensor", + "_linalg_check_errors", + "_linalg_inv_out_helper_", + "_linalg_qr_helper", + "_linalg_svd", + "_log_softmax", + "_log_softmax_backward_data", + "_logcumsumexp", + "_lu_with_info", + "_make_dual", + "_make_dual_copy", + "_make_per_channel_quantized_tensor", + "_make_per_tensor_quantized_tensor", + "_masked_scale", + "_masked_softmax", + "_mkldnn_reshape", + "_mkldnn_transpose", + "_mkldnn_transpose_", + "_neg_view", + "_neg_view_copy", + "_nested_from_padded", + "_nested_from_padded_and_nested_example", + "_nnpack_available", + "_nnpack_spatial_convolution", + "_pack_padded_sequence", + "_pad_packed_sequence", + "_pin_memory", + "_remove_batch_dim", + "_reshape_alias_copy", + "_reshape_from_tensor", + "_rowwise_prune", + "_sample_dirichlet", + "_saturate_weight_to_fp16", + "_shape_as_tensor", + "_sobol_engine_draw", + "_sobol_engine_ff_", + "_sobol_engine_initialize_state_", + "_sobol_engine_scramble_", + "_softmax", + "_softmax_backward_data", + "_sparse_broadcast_to", + "_sparse_broadcast_to_copy", + "_sparse_coo_tensor_unsafe", + "_sparse_csr_prod", + "_sparse_csr_sum", + "_sparse_csr_tensor_unsafe", + "_sparse_log_softmax_backward_data", + "_sparse_mask_helper", + "_sparse_softmax_backward_data", + "_sparse_sparse_matmul", + "_sparse_sum", + "_stack", + "_standard_gamma", + "_standard_gamma_grad", + "_sync", + "_test_serialization_subcmul", + "_to_cpu", + "_to_functional_tensor", + "_torch_cuda_cu_linker_symbol_op", + "_trilinear", + "_unique", + "_unique2", + "_unpack_dual", + "_use_cudnn_ctc_loss", + 
"_use_cudnn_rnn_flatten_weight", + "_validate_sparse_compressed_tensor_args", + "_validate_sparse_coo_tensor_args", + "_validate_sparse_csr_tensor_args", + "_values_copy", + "_weight_norm", + "_weight_norm_interface", + "autocast", + "broadcast_shapes", + "candidate", + "compiled_with_cxx11_abi", + "from_dlpack", + "lobpcg", + "lu", + "obj", + "set_default_dtype", + "set_grad_enabled", + "set_printoptions", + "unique" + ] +} diff --git a/test/ao/sparsity/test_composability.py b/test/ao/sparsity/test_composability.py new file mode 100644 index 000000000000..b44c88550774 --- /dev/null +++ b/test/ao/sparsity/test_composability.py @@ -0,0 +1,304 @@ +# -*- coding: utf-8 -*- +# Owner(s): ["module: unknown"] + + +import logging + +import torch +import torch.ao.quantization as tq +from torch import nn +from torch.ao import sparsity +from torch.testing._internal.common_utils import TestCase + +logging.basicConfig( + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.INFO +) + +sparse_defaults = { + "sparsity_level": 0.8, + "sparse_block_shape": (1, 4), + "zeros_per_block": 4, +} + +# This series of tests are to check the composability goals for sparsity and quantization. Namely +# that performing quantization and sparsity model manipulations in various orderings +# does not cause problems +class TestComposability(TestCase): + def _get_model_and_sparsifier_and_sparse_config(self, qconfig=None): + model = nn.Sequential( + nn.Linear(4, 4), # 0 + nn.ReLU(), + nn.Linear(4, 4), # 2 + nn.ReLU(), + tq.QuantStub(), + nn.Linear(4, 4), # 5 + nn.ReLU(), + tq.DeQuantStub(), + ) + if qconfig is None: + model[4].qconfig = tq.get_default_qconfig("fbgemm") + model[5].qconfig = tq.get_default_qconfig("fbgemm") + else: + model[4].qconfig = qconfig + model[5].qconfig = qconfig + + sparsifier = sparsity.WeightNormSparsifier(**sparse_defaults) + + sparse_config = [ + { + "module": model[5], + "sparsity_level": 0.7, + "sparse_block_shape": (1, 4), + "zeros_per_block": 4, + }, + model[0], + ] + return model, sparsifier, sparse_config + + def _squash_mask_calibrate_and_convert(self, model, sparsifier, input): + sparsifier.step() + sparsifier.squash_mask() + model(input) + tq.convert(model, inplace=True) + + def _calculate_sparsity(self, tensor): + return ((tensor == 0).sum() / tensor.numel()).item() + + # This test checks whether performing quantization prepare before sparse prepare + # causes any issues and verifies that the correct observers are inserted and that + # the quantized model works as expected + def test_q_prep_before_s_prep(self): + ( + mod, + sparsifier, + sparse_config, + ) = self._get_model_and_sparsifier_and_sparse_config() + + tq.prepare(mod, inplace=True) + sparsifier.prepare(mod, config=sparse_config) + + # check that correct modules had parametrizations added + self.assertTrue(hasattr(mod[0], "parametrizations")) + self.assertTrue(hasattr(mod[5], "parametrizations")) + # check that correct observers were inserted + self.assertTrue(hasattr(mod[5], "activation_post_process")) + + self._squash_mask_calibrate_and_convert( + mod, sparsifier, torch.randn(1, 4, 4, 4) + ) + + # check that final module is the expected quantized module and that the model runs + self.assertTrue(isinstance(mod[5], torch.nn.quantized.Linear)) + self.assertEqual(mod(torch.randn(1, 4, 4, 4)).shape, torch.Size([1, 4, 4, 4])) + + # This test checks whether performing sparsity prepare before quantization prepare + # causes any issues. 
In particular, the previous quantization flow was unable to match + # the post sparse prepare module names (adding parametrizations changes the module class names) + # which would result in those parametrized modules not being quantized. This test verifies that + # the fix for this was successful. + def test_s_prep_before_q_prep(self): + ( + mod, + sparsifier, + sparse_config, + ) = self._get_model_and_sparsifier_and_sparse_config() + + sparsifier.prepare(mod, config=sparse_config) + tq.prepare(mod, inplace=True) + + # check that correct modules had parametrizations added and + # that none were lost during prepare + self.assertTrue(hasattr(mod[0], "parametrizations")) + self.assertTrue(hasattr(mod[5], "parametrizations")) + + # check that correct observers were inserted and that matching + # occurred successfully + self.assertTrue(hasattr(mod[5], "activation_post_process")) + + self._squash_mask_calibrate_and_convert( + mod, sparsifier, torch.randn(1, 4, 4, 4) + ) + + # check that final module is the expected quantized module and that the model runs + self.assertTrue(isinstance(mod[5], torch.nn.quantized.Linear)) + self.assertEqual(mod(torch.randn(1, 4, 4, 4)).shape, torch.Size([1, 4, 4, 4])) + + # if the sparsified modules have not undergone the final squash mask operation, it's possible + # that the problem outlined in test_s_prep_before_q_prep would occur. This test verifies + # both that the fix to the convert flow avoids this issue and that the resulting quantized + # module uses the sparse version of the weight value. + def test_convert_without_squash_mask(self): + ( + mod, + sparsifier, + sparse_config, + ) = self._get_model_and_sparsifier_and_sparse_config() + + sparsifier.prepare(mod, config=sparse_config) + tq.prepare(mod, inplace=True) + + # check that correct modules had parametrizations added and + # that none were lost during prepare + self.assertTrue(hasattr(mod[0], "parametrizations")) + self.assertTrue(hasattr(mod[5], "parametrizations")) + + # check that correct observers were inserted and that matching + # occurred successfully + self.assertTrue(hasattr(mod[5], "activation_post_process")) + sparsifier.step() + sparsity_level = self._calculate_sparsity(mod[5].weight) + mod(torch.randn(1, 4, 4, 4)) + tq.convert(mod, inplace=True) + + # check that final module is the expected quantized module and that the model runs + self.assertTrue(isinstance(mod[5], torch.nn.quantized.Linear)) + self.assertEqual(mod(torch.randn(1, 4, 4, 4)).shape, torch.Size([1, 4, 4, 4])) + + # check that module was actually sparsified + cur_sparsity = self._calculate_sparsity(mod[5]._weight_bias()[0]) + self.assertGreaterAlmostEqual(cur_sparsity, sparsity_level) + self.assertGreaterAlmostEqual( + sparsity_level, sparse_config[0]["sparsity_level"] + ) + self.assertGreaterAlmostEqual(cur_sparsity, sparse_config[0]["sparsity_level"]) + + # This tests whether performing sparse prepare before fusion causes any issues. The + # worry was that the link created between the sparsifier and the modules that need to + # be sparsified would be broken.
+ def test_s_prep_before_fusion(self): + ( + mod, + sparsifier, + sparse_config, + ) = self._get_model_and_sparsifier_and_sparse_config() + sparsifier.prepare(mod, config=sparse_config) + tq.fuse_modules(mod, [["5", "6"]], inplace=True) + mod[5].qconfig = tq.get_default_qconfig("fbgemm") + tq.prepare(mod, inplace=True) + + # check that correct modules had parametrizations added and + # that none were lost during prepare or fusion + self.assertTrue(hasattr(mod[0], "parametrizations")) + self.assertTrue(hasattr(mod[5][0], "parametrizations")) + + # check that correct observers were inserted and that matching + # occurred successfully + self.assertTrue(hasattr(mod[5], "activation_post_process")) + self._squash_mask_calibrate_and_convert( + mod, sparsifier, torch.randn(1, 4, 4, 4) + ) + + # check that final module is the expected quantized module and that the model runs + self.assertTrue(isinstance(mod[5], torch.nn.intrinsic.quantized.LinearReLU)) + self.assertEqual(mod(torch.randn(1, 4, 4, 4)).shape, torch.Size([1, 4, 4, 4])) + + # This tests whether performing fusion before sparse prepare causes any issues. The + # main worry was that the links to the modules in the sparse config would be broken by fusion. + def test_fusion_before_s_prep(self): + ( + mod, + sparsifier, + sparse_config, + ) = self._get_model_and_sparsifier_and_sparse_config() + tq.fuse_modules(mod, [["5", "6"]], inplace=True) + sparsifier.prepare(mod, config=sparse_config) + mod[5].qconfig = tq.get_default_qconfig("fbgemm") + tq.prepare(mod, inplace=True) + + # check that correct modules had parametrizations added and + # that none were lost during prepare + self.assertTrue(hasattr(mod[0], "parametrizations")) + self.assertTrue(hasattr(mod[5][0], "parametrizations")) + + # check that correct observers were inserted and that matching + # occurred successfully + self.assertTrue(hasattr(mod[5], "activation_post_process")) + sparsifier.step() + sparsity_level = self._calculate_sparsity(mod[5][0].weight) + mod(torch.randn(1, 4, 4, 4)) + tq.convert(mod, inplace=True) + + # check that final module is the expected quantized module and that the model runs + self.assertTrue(isinstance(mod[5], torch.nn.intrinsic.quantized.LinearReLU)) + self.assertEqual(mod(torch.randn(1, 4, 4, 4)).shape, torch.Size([1, 4, 4, 4])) + + # check that module was actually sparsified + cur_sparsity = self._calculate_sparsity(mod[5]._weight_bias()[0]) + self.assertGreaterAlmostEqual(cur_sparsity, sparsity_level) + self.assertGreaterAlmostEqual( + sparsity_level, sparse_config[0]["sparsity_level"] + ) + self.assertGreaterAlmostEqual(cur_sparsity, sparse_config[0]["sparsity_level"]) + + # This tests whether performing sparse prepare before qat prepare causes issues. + # The primary worries were that qat_prep wouldn't recognize the parametrized + # modules and that the convert step for qat would remove the parametrizations + # from the modules.
+ def test_s_prep_before_qat_prep(self): + ( + mod, + sparsifier, + sparse_config, + ) = self._get_model_and_sparsifier_and_sparse_config( + tq.get_default_qat_qconfig("fbgemm") + ) + sparsifier.prepare(mod, config=sparse_config) + tq.prepare_qat(mod, inplace=True) + self.assertTrue(hasattr(mod[0], "parametrizations")) + self.assertTrue(hasattr(mod[5], "parametrizations")) + + # check that correct observers were inserted and that matching + # occurred successfully + self.assertTrue(hasattr(mod[5], "activation_post_process")) + self.assertTrue(isinstance(mod[5], torch.nn.qat.Linear)) + self._squash_mask_calibrate_and_convert( + mod, sparsifier, torch.randn(1, 4, 4, 4) + ) + # check that final module is the expected quantized module and that the model runs + self.assertTrue(isinstance(mod[5], torch.nn.quantized.Linear)) + self.assertEqual(mod(torch.randn(1, 4, 4, 4)).shape, torch.Size([1, 4, 4, 4])) + + # check that module was actually sparsified + cur_sparsity = self._calculate_sparsity(mod[5]._weight_bias()[0]) + self.assertGreaterAlmostEqual(cur_sparsity, sparse_config[0]["sparsity_level"]) + + # This tests whether performing qat prepare before sparse prepare causes issues. + def test_qat_prep_before_s_prep(self): + mod, sparsifier, _ = self._get_model_and_sparsifier_and_sparse_config( + tq.get_default_qat_qconfig("fbgemm") + ) + tq.prepare_qat(mod, inplace=True) + + # need to set up sparse_config on new modules + sparse_config = [ + { + "module": mod[5], + "sparsity_level": 0.7, + "sparse_block_shape": (1, 4), + "zeros_per_block": 4, + }, + mod[0], + ] + sparsifier.prepare(mod, config=sparse_config) + + # check that correct modules had parametrizations added and + # that none were lost during qat prepare + self.assertTrue(hasattr(mod[0], "parametrizations")) + self.assertTrue(hasattr(mod[5], "parametrizations")) + + # check that correct observers were inserted and that matching + # occurred successfully + self.assertTrue(hasattr(mod[5], "activation_post_process")) + self.assertTrue(isinstance(mod[5], torch.nn.qat.Linear)) + + self._squash_mask_calibrate_and_convert( + mod, sparsifier, torch.randn(1, 4, 4, 4) + ) + + # check that final module is the expected quantized module and that the model runs + self.assertTrue(isinstance(mod[5], torch.nn.quantized.Linear)) + self.assertEqual(mod(torch.randn(1, 4, 4, 4)).shape, torch.Size([1, 4, 4, 4])) + + # check that module was actually sparsified + cur_sparsity = self._calculate_sparsity(mod[5]._weight_bias()[0]) + self.assertGreaterAlmostEqual(cur_sparsity, sparse_config[0]["sparsity_level"]) diff --git a/test/ao/sparsity/test_kernels.py b/test/ao/sparsity/test_kernels.py index 8deec46b4188..04a934345999 100644 --- a/test/ao/sparsity/test_kernels.py +++ b/test/ao/sparsity/test_kernels.py @@ -22,6 +22,7 @@ override_qengines, qengine_is_qnnpack, qengine_is_fbgemm, + qengine_is_onednn, ) # TODO: Once more test files are created, move the contents to a ao folder. @@ -48,6 +49,9 @@ def test_sparse_qlinear(self): # to other higher priority works.
if qengine_is_qnnpack() and not (row_block_size == 1 and col_block_size == 4): return + # ONEDNN does not support this yet + if qengine_is_onednn(): + return dense_prepack = torch.ops.quantized.linear_prepack dense_qlinear = torch.ops.quantized.linear @@ -215,6 +219,10 @@ def test_sparse_qlinear(self): Y_hat = sqmodel(X_fp32) self.assertEqual(Y_ref, Y_hat) + # ONEDNN does not support this yet + elif qengine_is_onednn(): + return + row_block_size, col_block_size = sqmodel.linear._packed_params._weight_bias()[2:] assert row_block_size == 1 and col_block_size == 4 diff --git a/test/autograd/test_functional.py b/test/autograd/test_functional.py new file mode 100644 index 000000000000..18b8fd07d736 --- /dev/null +++ b/test/autograd/test_functional.py @@ -0,0 +1,1420 @@ +# Owner(s): ["module: autograd"] + +import types +import unittest +import warnings + +import torch +import torch.autograd.functional as autogradF + +from torch.testing._internal.common_cuda import TEST_CUDA +from torch.testing._internal.common_utils import ( + TestCase, run_tests, subtest, gradcheck, gradgradcheck, parametrize, instantiate_parametrized_tests) +from torch.testing._internal.logging_tensor import LoggingTensor + +# Utilities for parametrizing the tensor constructors used in autograd tests +# +# TODO: maybe move somewhere so other tests can also use +# +# NB: Not all factory functions included. A complete(?) list can be found here: +# https://pytorch.org/cppdocs/notes/tensor_creation.html +base_ctors_dict = { + "ones": torch.ones, + "zeros": torch.zeros, + "randn": torch.randn, + "rand": torch.rand, + "tensor": torch.tensor, +} +base_ctors = types.SimpleNamespace(**base_ctors_dict) + +def wrap_with_logging_tensor(ctor): + def wrapper(*args, **kwargs): + requires_grad = kwargs.pop("requires_grad", False) + return LoggingTensor(ctor(*args, **kwargs), requires_grad=requires_grad) + return wrapper + +logging_tensor_ctors_dict = {k: wrap_with_logging_tensor(ctor) for (k, ctor) in base_ctors_dict.items()} +logging_tensor_ctors = types.SimpleNamespace(**logging_tensor_ctors_dict) + +base_and_logging_tensor = parametrize("ctors", [subtest(base_ctors, name="base_tensor"), + subtest(logging_tensor_ctors, name="logging_tensor")]) + +FIXME_base_and_xfail_logging_tensor = parametrize("ctors", [subtest(base_ctors, name="base_tensor"), + subtest(logging_tensor_ctors, name="logging_tensor", + decorators=[unittest.expectedFailure])]) + +# NB: This is equivalent to having both @parmetrize("vectorized", [True, False]) and +# FIXME_base_and_xfail_logging_tensor, except the non-vectorized logging_tensor case is +# actually expected to succeed +FIXME_xfail_vectorized_logging_tensor = ( + parametrize("vectorize,ctors", [subtest((True, base_ctors), name="vectorized_base_tensor"), + subtest((False, base_ctors), name="base_tensor"), + subtest((True, logging_tensor_ctors), name="vectorized_logging_tensor", + decorators=[unittest.expectedFailure]), + subtest((False, logging_tensor_ctors), name="logging_tensor")])) + +vectorized_logging_tensor = ( + parametrize("vectorize,ctors", [subtest((True, base_ctors), name="vectorized_base_tensor"), + subtest((False, base_ctors), name="base_tensor"), + subtest((True, logging_tensor_ctors), name="vectorized_logging_tensor"), + subtest((False, logging_tensor_ctors), name="logging_tensor")])) + + +class TestAutogradFunctional(TestCase): + def _assert_same_struct(self, res, base): + # base and res should be Tensors or tuple of Tensors with the same size + if isinstance(base, torch.Tensor): + 
self.assertTrue(isinstance(res, torch.Tensor)) + self.assertEqual(base.size(), res.size()) + elif isinstance(base, tuple): + self.assertTrue(isinstance(res, tuple)) + self.assertEqual(len(base), len(res)) + for el_base, el_res in zip(base, res): + self.assertTrue(isinstance(el_base, torch.Tensor)) + self.assertTrue(isinstance(el_res, torch.Tensor)) + self.assertEqual(el_base.size(), el_res.size()) + else: + # Wrong base + raise RuntimeError("The base given to `_assert_same_struct` doesn't have" + " the right structure.") + + def _assert_interleaved_struct(self, res, base1, base2): + # base1 and base2 can be Tensors or tuples of Tensors. + # If they are tuples, res should be a tuple as well. + # The indexing works as follows for base1, base2 being + # - tuple, tuple: res[i][j][k][l] = (base1[i][k], base2[j][l]) + # - tuple, Tensor: res[i][k][l] = (base1[i][k], base2[l]) + # - Tensor, tuple: res[i][j][l] = (base1[i], base2[j][l]) + # - Tensor, Tensor: res[k][l] = (base1[k], base2[l]) + if isinstance(base1, torch.Tensor) and isinstance(base2, torch.Tensor): + self.assertTrue(isinstance(res, torch.Tensor)) + self.assertEqual(res.size(), base1.size() + base2.size()) + elif isinstance(base1, tuple) and isinstance(base2, torch.Tensor): + self.assertTrue(isinstance(res, tuple)) + self.assertEqual(len(res), len(base1)) + for el_res, el_base1 in zip(res, base1): + self.assertTrue(isinstance(el_res, torch.Tensor)) + self.assertTrue(isinstance(el_base1, torch.Tensor)) + self.assertEqual(el_res.size(), el_base1.size() + base2.size()) + elif isinstance(base1, torch.Tensor) and isinstance(base2, tuple): + self.assertTrue(isinstance(res, tuple)) + self.assertEqual(len(res), len(base2)) + for el_res, el_base2 in zip(res, base2): + self.assertTrue(isinstance(el_res, torch.Tensor)) + self.assertTrue(isinstance(el_base2, torch.Tensor)) + self.assertEqual(el_res.size(), base1.size() + el_base2.size()) + elif isinstance(base1, tuple) and isinstance(base2, tuple): + self.assertTrue(isinstance(res, tuple)) + self.assertEqual(len(res), len(base1)) + for el_res, el_base1 in zip(res, base1): + self.assertTrue(isinstance(el_res, tuple)) + self.assertEqual(len(res), len(base2)) + for el_el_res, el_base2 in zip(el_res, base2): + self.assertTrue(isinstance(el_el_res, torch.Tensor)) + self.assertTrue(isinstance(el_base2, torch.Tensor)) + self.assertEqual(el_el_res.size(), el_base1.size() + el_base2.size()) + else: + # Wrong bases + raise RuntimeError("The bases given to `_assert_interleaved_struct` don't have" + " the right structure.") + + @base_and_logging_tensor + def test_vjp_err_check(self, ctors): + def foo(a): + return 3 * a.narrow(0, 0, 3) + + def bar(a): + return 3 * a.narrow(0, 0, 3), "bar" + + inp = ctors.rand(4) + v = ctors.ones(3) + with self.assertRaisesRegex(TypeError, "The inputs given to vjp must be either a Tensor"): + res = autogradF.vjp(foo, (inp, 2), v) + + with self.assertRaisesRegex(TypeError, "The outputs of the user-provided function given to vjp must"): + res = autogradF.vjp(bar, inp, v) + + with self.assertRaisesRegex(RuntimeError, "The vector v can only be None if the user-provided function returns"): + res = autogradF.vjp(foo, inp) + + with self.assertRaisesRegex(RuntimeError, "The given v should contain a single Tensor."): + res = autogradF.vjp(foo, inp, (torch.ones_like(inp), torch.ones_like(inp))) + + with self.assertRaisesRegex(RuntimeError, "v has invalid size: should be torch.Size"): + res = autogradF.vjp(foo, inp, v[:2]) + + res = autogradF.vjp(foo, inp, v)[1] + 
self._assert_same_struct(res, inp) + + @base_and_logging_tensor + def test_vjp_err_check_strict(self, ctors): + def foo(a): + return a.detach() + + def bar(a): + # Make a non-leaf Tensor that requires_grad but that is not connected to the input + return a.long().float().requires_grad_().clone() + + inp = ctors.rand(4) + v = ctors.rand(4) + with self.assertRaisesRegex(RuntimeError, "Output 0 of the user-provided function does not require gradients."): + res = autogradF.vjp(foo, inp, v, strict=True) + res = autogradF.vjp(foo, inp, v, strict=False) + self._assert_same_struct(res[1], inp) + self.assertEqual(res[1].abs().sum(), 0.) + + with self.assertRaisesRegex(RuntimeError, "The output of the user-provided function is independent of input 0"): + res = autogradF.vjp(bar, inp, v, strict=True) + res = autogradF.vjp(bar, inp, v, strict=False) + self._assert_same_struct(res[1], inp) + self.assertEqual(res[1].abs().sum(), 0.) + + # The Jacobian does not depend on the input + def foo(a): + return a.clone() + + inp.requires_grad_() + with self.assertRaisesRegex(RuntimeError, "jacobian of the user-provided function is independent of input 0."): + res = autogradF.vjp(foo, inp, v, create_graph=True, strict=True) + res = autogradF.vjp(foo, inp, v, create_graph=True, strict=False) + self._assert_same_struct(res[1], inp) + self.assertEqual(res[1], v) + + @base_and_logging_tensor + def test_vjp_no_grad(self, ctors): + def reducer(x): + return x.sum(dim=1) + inputs = ctors.rand(4, 4) + v = ctors.ones(4) + with torch.no_grad(): + res = autogradF.vjp(reducer, inputs, v) + self.assertIsNone(res[0].grad_fn) + self.assertIsNone(res[1].grad_fn) + self.assertNotEqual(res[1], ctors.zeros(4, 4)) + + inputs.requires_grad_() + v.requires_grad_() + with torch.no_grad(): + res = autogradF.vjp(reducer, inputs, v, create_graph=True) + self.assertIsNotNone(res[0].grad_fn) + self.assertIsNotNone(res[1].grad_fn) + self.assertNotEqual(res[1], ctors.zeros(4, 4)) + + @base_and_logging_tensor + def test_vjp_output(self, ctors): + def reducer(x): + return x.sum(dim=1) + inputs = ctors.rand(4, 4) + v = ctors.ones(4) + res = autogradF.vjp(reducer, inputs, v) + self._assert_same_struct(res[1], inputs) + self.assertIsNone(res[0].grad_fn) + self.assertIsNone(res[1].grad_fn) + + def adder(x, y): + return 2 * x + 3 * y + + inputs = (ctors.rand(2), ctors.rand(2)) + v = ctors.ones(2) + out, vjp_val = autogradF.vjp(adder, inputs, v) + self._assert_same_struct(vjp_val, inputs) + self.assertIsNone(out.grad_fn) + self.assertIsNone(vjp_val[0].grad_fn) + self.assertIsNone(vjp_val[1].grad_fn) + + def adder(x, y): + return 2 * x + 3 * y, x + y + + inputs = (ctors.rand(2), ctors.rand(2)) + v = (ctors.tensor([1., 0.]), ctors.tensor([1., 0.])) + out, vjp_val = autogradF.vjp(adder, inputs, v) + self._assert_same_struct(vjp_val, inputs) + self.assertIsNone(out[0].grad_fn) + self.assertIsNone(out[1].grad_fn) + self.assertIsNone(vjp_val[0].grad_fn) + self.assertIsNone(vjp_val[1].grad_fn) + + @base_and_logging_tensor + def test_vjp_scalar(self, ctors): + def reducer(x): + return x.sum() + inputs = ctors.rand(4, 4) + v = ctors.ones([]) + res = autogradF.vjp(reducer, inputs, v) + self._assert_same_struct(res[0], v) + self._assert_same_struct(res[1], inputs) + + res = autogradF.vjp(reducer, inputs) + self._assert_same_struct(res[0], v) + self._assert_same_struct(res[1], inputs) + + def expander(x): + return x.unsqueeze(0).repeat(4) + inputs = ctors.rand([]) + v = ctors.ones(4) + res = autogradF.vjp(expander, inputs, v) + self._assert_same_struct(res[0], v) 
+ self._assert_same_struct(res[1], inputs) + + @base_and_logging_tensor + def test_vjp_create_graph(self, ctors): + def reducer(x): + return x.sum(dim=1) + inputs = ctors.rand(2, 2, dtype=torch.double) + v = ctors.ones(2, dtype=torch.double) + + inputs.requires_grad_() + v.requires_grad_() + res = autogradF.vjp(reducer, inputs, v, create_graph=True) + self._assert_same_struct(res[1], inputs) + self.assertIsNotNone(res[0].grad_fn) + self.assertIsNotNone(res[1].grad_fn) + + gradcheck(lambda inp, v: autogradF.vjp(reducer, inputs, v, create_graph=True), (inputs, v)) + gradgradcheck(lambda inp, v: autogradF.vjp(reducer, inputs, v, create_graph=True), (inputs, v)) + + def adder(x, y): + return 2 * x + 3 * y, x * y + + inputs = (ctors.rand(2, dtype=torch.double, requires_grad=True), + ctors.rand(2, dtype=torch.double, requires_grad=True)) + v = (ctors.tensor([1., 0.], dtype=torch.double, requires_grad=True), + ctors.tensor([1., 0.], dtype=torch.double, requires_grad=True)) + + gradcheck(lambda *args: autogradF.vjp(adder, args[:2], args[2:], create_graph=True)[1], inputs + v) + gradgradcheck(lambda *args: autogradF.vjp(adder, args[:2], args[2:], create_graph=True)[1], inputs + v) + + def foo(*args): + x, y = args[:2] + v = args[2:] + + x = x.cos() + val, grad = autogradF.vjp(adder, (x, y), v, create_graph=True) + + return val[0].exp() + val[1].exp() + grad[0].exp() + grad[1].exp() + x.exp() + y.exp() + + gradcheck(foo, inputs + v) + gradgradcheck(foo, inputs + v) + + @base_and_logging_tensor + def test_jvp_err_check(self, ctors): + def foo(a): + return 3 * a.narrow(0, 0, 3) + + def bar(a): + return 3 * a.narrow(0, 0, 3), "bar" + + inp = ctors.rand(4) + v = ctors.rand(4) + with self.assertRaisesRegex(TypeError, "The inputs given to jvp must be either a Tensor"): + res = autogradF.jvp(foo, (inp, 2), v) + + with self.assertRaisesRegex(TypeError, "The outputs of the user-provided function given to jvp must"): + res = autogradF.jvp(bar, inp, v) + + with self.assertRaisesRegex(RuntimeError, "The vector v can only be None if the input to the user-provided function"): + res = autogradF.jvp(foo, inp) + + with self.assertRaisesRegex(RuntimeError, "The given v should contain a single Tensor."): + res = autogradF.jvp(foo, inp, (v, v)) + + with self.assertRaisesRegex(RuntimeError, "v has invalid size: should be torch.Size"): + res = autogradF.jvp(foo, inp, v[:2]) + + res = autogradF.jvp(foo, inp, v)[1] + self._assert_same_struct(res, foo(inp)) + + @base_and_logging_tensor + def test_jvp_err_check_strict(self, ctors): + def foo(a): + return a.detach() + + def bar(a): + # Make a non-leaf Tensor that requires_grad but that is not connected to the input + return a.long().float().requires_grad_().clone() + + inp = ctors.rand(4) + v = ctors.rand(4) + with self.assertRaisesRegex(RuntimeError, "Output 0 of the user-provided function does not require gradients."): + res = autogradF.jvp(foo, inp, v, strict=True) + res = autogradF.jvp(foo, inp, v, strict=False) + self._assert_same_struct(res[1], res[0]) + self.assertEqual(res[1].abs().sum(), 0.) + + with self.assertRaisesRegex(RuntimeError, "The output of the user-provided function is independent of input 0"): + res = autogradF.jvp(bar, inp, v, strict=True) + res = autogradF.jvp(bar, inp, v, strict=False) + self._assert_same_struct(res[1], res[0]) + self.assertEqual(res[1].abs().sum(), 0.) 
+ + # The Jacobian does not depend on the input + def foo(a): + return a.clone() + + inp.requires_grad_() + with self.assertRaisesRegex(RuntimeError, "jacobian of the user-provided function is independent of input 0."): + res = autogradF.jvp(foo, inp, v, create_graph=True, strict=True) + res = autogradF.jvp(foo, inp, v, create_graph=True, strict=False) + self._assert_same_struct(res[1], inp) + self.assertEqual(res[1], v) + + @base_and_logging_tensor + def test_jvp_no_grad(self, ctors): + def reducer(x): + return x.sum(dim=1) + inputs = ctors.rand(4, 4) + v = ctors.ones(4, 4) + with torch.no_grad(): + res = autogradF.jvp(reducer, inputs, v) + self.assertIsNone(res[0].grad_fn) + self.assertIsNone(res[1].grad_fn) + self.assertNotEqual(res[1], ctors.zeros(4, 4)) + + inputs.requires_grad_() + v.requires_grad_() + with torch.no_grad(): + res = autogradF.jvp(reducer, inputs, v, create_graph=True) + self.assertIsNotNone(res[0].grad_fn) + self.assertIsNotNone(res[1].grad_fn) + self.assertNotEqual(res[1], ctors.zeros(4, 4)) + + @base_and_logging_tensor + def test_jvp_output(self, ctors): + def reducer(x): + return x.sum(dim=1) + inputs = ctors.rand(4, 4) + v = ctors.ones(4, 4) + res = autogradF.jvp(reducer, inputs, v) + self._assert_same_struct(res[1], res[0]) + self.assertIsNone(res[0].grad_fn) + self.assertIsNone(res[1].grad_fn) + + def adder(x, y): + return 2 * x + 3 * y + + inputs = (ctors.rand(2), ctors.rand(2)) + v = (ctors.ones(2), ctors.ones(2)) + out, jvp_val = autogradF.jvp(adder, inputs, v) + self._assert_same_struct(jvp_val, out) + self.assertIsNone(out.grad_fn) + self.assertIsNone(jvp_val[0].grad_fn) + self.assertIsNone(jvp_val[1].grad_fn) + + def adder(x, y): + return 2 * x + 3 * y, x + y + + inputs = (ctors.rand(2), ctors.rand(2)) + v = (ctors.tensor([1., 0.]), ctors.tensor([1., 0.])) + out, jvp_val = autogradF.jvp(adder, inputs, v) + self._assert_same_struct(jvp_val, out) + self.assertIsNone(out[0].grad_fn) + self.assertIsNone(out[1].grad_fn) + self.assertIsNone(jvp_val[0].grad_fn) + self.assertIsNone(jvp_val[1].grad_fn) + + @base_and_logging_tensor + def test_jvp_scalar(self, ctors): + def reducer(x): + return x.sum() + inputs = ctors.rand(4, 4) + v = ctors.ones(4, 4) + res = autogradF.jvp(reducer, inputs, v) + self._assert_same_struct(res[0], ctors.zeros([])) + self._assert_same_struct(res[1], res[0]) + + def expander(x): + return x.unsqueeze(0).repeat(4) + inputs = ctors.rand([]) + v = ctors.ones([]) + res = autogradF.jvp(expander, inputs, v) + self._assert_same_struct(res[0], ctors.zeros(4)) + self._assert_same_struct(res[1], res[0]) + + res = autogradF.jvp(expander, inputs) + self._assert_same_struct(res[0], ctors.zeros(4)) + self._assert_same_struct(res[1], res[0]) + + @base_and_logging_tensor + def test_jvp_create_graph(self, ctors): + def reducer(x): + return x.sum(dim=1) + inputs = ctors.rand(2, 2, dtype=torch.double) + v = ctors.ones(2, 2, dtype=torch.double) + + inputs.requires_grad_() + v.requires_grad_() + res = autogradF.jvp(reducer, inputs, v, create_graph=True) + self._assert_same_struct(res[1], res[0]) + self.assertIsNotNone(res[0].grad_fn) + self.assertIsNotNone(res[1].grad_fn) + + gradcheck(lambda inp, v: autogradF.jvp(reducer, inp, v, create_graph=True), (inputs, v)) + gradgradcheck(lambda inp, v: autogradF.jvp(reducer, inp, v, create_graph=True), (inputs, v)) + + def adder(x, y): + return 2 * x + 3 * y, x * y + + inputs = (ctors.rand(2, dtype=torch.double, requires_grad=True), + ctors.rand(2, dtype=torch.double, requires_grad=True)) + v = (ctors.tensor([1., 0.], 
dtype=torch.double, requires_grad=True), + ctors.tensor([1., 0.], dtype=torch.double, requires_grad=True)) + + gradcheck(lambda *args: autogradF.jvp(adder, args[:2], args[2:], create_graph=True)[1], inputs + v) + gradgradcheck(lambda *args: autogradF.jvp(adder, args[:2], args[2:], create_graph=True)[1], inputs + v) + + def foo(*args): + x, y = args[:2] + v = args[2:] + + x = x.cos() + val, grad = autogradF.jvp(adder, (x, y), v, create_graph=True) + + return val[0].exp() + val[1].exp() + grad[0].exp() + grad[1].exp() + x.exp() + y.exp() + + gradcheck(foo, inputs + v) + gradgradcheck(foo, inputs + v) + + def _test_construct_standard_basis_for(self, inputs): + numels = tuple(tensor.numel() for tensor in inputs) + results = autogradF._construct_standard_basis_for(inputs, numels) + for result, inp in zip(results, inputs): + self.assertEqual(result.dtype, inp.dtype) + self.assertEqual(result.device, inp.device) + results = torch.cat([result.to(device='cpu', dtype=torch.float) + for result in results], dim=1) + expected = torch.eye(results[0].shape[0], dtype=torch.float) + self.assertEqual(results, expected) + + @base_and_logging_tensor + def test_construct_standard_basis_for(self, ctors): + test_cases = [ + (ctors.randn(2, 3),), + (ctors.randn(1),), + (ctors.randn([]),), + (ctors.randn(1), ctors.randn([]), ctors.randn([])), + (ctors.randn(2), ctors.randn(3), ctors.randn([])), + (ctors.randn(2), ctors.randn([]), ctors.randn(3)), + (ctors.randn(2, 3), ctors.randn(3), ctors.randn(3, 4, 2)), + (ctors.randn(2, dtype=torch.float64), ctors.randn(3, dtype=torch.float32)), + ] + + for inputs in test_cases: + self._test_construct_standard_basis_for(inputs) + + @unittest.skipIf(not TEST_CUDA, "test requires CUDA") + @base_and_logging_tensor + def test_construct_standard_basis_for_cuda(self, ctors): + test_cases = [ + (ctors.randn(2), ctors.randn(3, device='cuda')), + (ctors.randn(3, device='cuda'), ctors.randn(2)), + ] + + for inputs in test_cases: + self._test_construct_standard_basis_for(inputs) + + def _test_vectorize_raises_no_warnings(self, api, ctors): + # vmap is an experimental prototype. When someone calls torch.vmap, + # it raises a python warning. This test checks that + # autogradF.{jacobian, hessian} don't raise that experimental prototype + # warning; it is not nice for a public-facing API to raise a warning + # no matter how it is called. 
+ def foo(a): + return (a ** 2).sum() + + x = ctors.randn(3) + with warnings.catch_warnings(record=True) as wa: + result = api(foo, x, vectorize=True) + self.assertEqual(len(wa), 0) + + @base_and_logging_tensor + def test_jacobian_vectorize_raises_no_warnings(self, ctors): + return self._test_vectorize_raises_no_warnings(autogradF.jacobian, ctors) + + @base_and_logging_tensor + def test_hessian_vectorize_raises_no_warnings(self, ctors): + return self._test_vectorize_raises_no_warnings(autogradF.hessian, ctors) + + @parametrize("vectorize", [True, False]) + @base_and_logging_tensor + def test_jacobian_err_check(self, vectorize, ctors): + def foo(a): + return 3 * a.narrow(0, 0, 3) + + def bar(a): + return 3 * a.narrow(0, 0, 3), "bar" + + inp = ctors.rand(4) + with self.assertRaisesRegex(TypeError, "The inputs given to jacobian must be either a Tensor"): + res = autogradF.jacobian(foo, (inp, 2), vectorize=vectorize) + + with self.assertRaisesRegex(TypeError, "The outputs of the user-provided function given to jacobian must"): + res = autogradF.jacobian(bar, inp, vectorize=vectorize) + + res = autogradF.jacobian(foo, inp, vectorize=vectorize) + self._assert_interleaved_struct(res, foo(inp), inp) + + def foo(a, b): + return b, 3 * a.narrow(0, 0, 3) + + inp = (ctors.rand(4), ctors.rand(5)) + + res = autogradF.jacobian(foo, inp, vectorize=vectorize) + self._assert_interleaved_struct(res, foo(*inp), inp) + + @base_and_logging_tensor + def test_jacobian_err_check_strict(self, ctors): + def foo(a): + return a.detach() + + def bar(a): + # Make a non-leaf Tensor that requires_grad but that is not connected to the input + return a.long().float().requires_grad_().clone() + + inp = ctors.rand(4) + with self.assertRaisesRegex(RuntimeError, "Output 0 of the user-provided function does not require gradients."): + res = autogradF.jacobian(foo, inp, strict=True) + res = autogradF.jacobian(foo, inp, strict=False) + self._assert_interleaved_struct(res, foo(inp), inp) + self.assertEqual(res.abs().sum(), 0.) + + with self.assertRaisesRegex(RuntimeError, "Output 0 of the user-provided function is independent of input 0."): + res = autogradF.jacobian(bar, inp, strict=True) + res = autogradF.jacobian(bar, inp, strict=False) + self._assert_interleaved_struct(res, foo(inp), inp) + self.assertEqual(res.abs().sum(), 0.) 
+ + # The Jacobian does not depend on the input + def foo(a): + return a.clone() + + inp.requires_grad_() + with self.assertRaisesRegex(RuntimeError, "jacobian of the user-provided function is independent of input 0."): + res = autogradF.jacobian(foo, inp, create_graph=True, strict=True) + res = autogradF.jacobian(foo, inp, create_graph=True, strict=False) + self._assert_interleaved_struct(res, inp, inp) + self.assertEqual(res, torch.eye(4)) + + @base_and_logging_tensor + def test_jacobian_err_check_strict_vectorize(self, ctors): + def foo(x): + return x + + inp = ctors.rand(4) + with self.assertRaisesRegex(RuntimeError, "not supported together"): + res = autogradF.jacobian(foo, inp, strict=True, vectorize=True) + + @base_and_logging_tensor + def test_jacobian_no_grad(self, ctors): + def exp_reducer(x): + return x.exp().sum(dim=1) + + inputs = ctors.rand(4, 4) + with torch.no_grad(): + res = autogradF.jacobian(exp_reducer, inputs) + self.assertIsNone(res.grad_fn) + self.assertNotEqual(res, ctors.zeros(4, 4)) + + with torch.no_grad(): + res = autogradF.jacobian(exp_reducer, inputs, create_graph=True) + self.assertIsNotNone(res.grad_fn) + self.assertNotEqual(res, ctors.zeros(4, 4)) + + @vectorized_logging_tensor + def test_jacobian_output(self, vectorize, ctors): + def exp_reducer(x): + return x.exp().sum(dim=1) + + inputs = ctors.rand(4, 4) + res = autogradF.jacobian(exp_reducer, inputs, vectorize=vectorize) + self._assert_interleaved_struct(res, exp_reducer(inputs), inputs) + self.assertIsNone(res.grad_fn) + + def identity(x): + return x.clone() + + inputs = ctors.rand(4) + res = autogradF.jacobian(identity, inputs, vectorize=vectorize) + self._assert_interleaved_struct(res, identity(inputs), inputs) + self.assertIsNone(res.grad_fn) + self.assertEqual(res, torch.eye(4)) + + def add_exp_reducer(x, y): + return (x + y.exp()).sum(dim=1) + + inputs = (ctors.rand(4, 4), ctors.rand(4, 4)) + res = autogradF.jacobian(add_exp_reducer, inputs, vectorize=vectorize) + self._assert_interleaved_struct(res, add_exp_reducer(*inputs), inputs) + self.assertIsNone(res[0].grad_fn) + self.assertIsNone(res[1].grad_fn) + + @vectorized_logging_tensor + def test_jacobian_scalar(self, vectorize, ctors): + def reducer(x): + return x.sum() + inputs = ctors.rand(4, 4) + res = autogradF.jacobian(reducer, inputs, vectorize=vectorize) + self._assert_same_struct(res, inputs) + + def expander(x): + return x.unsqueeze(0).repeat(4) + inputs = ctors.rand([]) + res = autogradF.jacobian(expander, inputs, vectorize=vectorize) + self._assert_same_struct(res, ctors.zeros(4)) + + @parametrize("vectorize", [True, False]) + @base_and_logging_tensor + def test_jacobian_create_graph(self, vectorize, ctors): + def exp_reducer(x): + return x.exp().sum(dim=1) + + inputs = ctors.rand(4, 4, dtype=torch.double, requires_grad=True) + res = autogradF.jacobian(exp_reducer, inputs, create_graph=True, vectorize=vectorize) + self._assert_interleaved_struct(res, exp_reducer(inputs), inputs) + self.assertIsNotNone(res.grad_fn) + + gradcheck(lambda inp: autogradF.jacobian(exp_reducer, inp, create_graph=True, vectorize=vectorize), inputs) + gradgradcheck(lambda inp: autogradF.jacobian(exp_reducer, inp, create_graph=True, vectorize=vectorize), inputs) + + def add_exp_reducer(x, y): + return (x + y).exp().sum(dim=1) + + inputs = (ctors.rand(4, 4, dtype=torch.double, requires_grad=True), + ctors.rand(4, 4, dtype=torch.double, requires_grad=True)) + res = autogradF.jacobian(add_exp_reducer, inputs, create_graph=True, vectorize=vectorize) + 
self._assert_interleaved_struct(res, add_exp_reducer(*inputs), inputs) + self.assertIsNotNone(res[0].grad_fn) + self.assertIsNotNone(res[1].grad_fn) + + gradcheck(lambda *inp: autogradF.jacobian(add_exp_reducer, inp, create_graph=True, vectorize=vectorize), inputs) + gradgradcheck(lambda *inp: autogradF.jacobian(add_exp_reducer, inp, create_graph=True, vectorize=vectorize), inputs) + + def foo(x, y): + x = x.cos() + val, jac = autogradF.jacobian(add_exp_reducer, (x, y), create_graph=True, vectorize=vectorize) + + res = val[0].exp().sum() + val[1].exp().sum() + jac[0].exp().sum() + res = res + jac[1].exp().sum() + x.exp().sum() + y.exp().sum() + return res + + gradcheck(foo, inputs) + gradgradcheck(foo, inputs) + + def _check_jacobian_vectorize_correctness(self, f, inputs, test_forward_ad=True): + expected = autogradF.jacobian(f, inputs, vectorize=False) + result_backward_mode = autogradF.jacobian(f, inputs, vectorize=True) + self.assertEqual(result_backward_mode, expected) + + if test_forward_ad: + result_forward_mode = autogradF.jacobian(f, inputs, strategy="forward-mode", vectorize=True) + self.assertEqual(result_forward_mode, expected) + + @base_and_logging_tensor + def test_jacobian_vectorize_correctness_simple(self, ctors): + def f(x): + return 3 * x ** 2 + + x = ctors.randn(2, 3, 5) + self._check_jacobian_vectorize_correctness(f, x) + + @base_and_logging_tensor + def test_jacobian_vectorize_correctness_multi_input(self, ctors): + def f(x, y): + return (x.cos() * x) @ y.sin() + + x = ctors.randn(2, 3) + y = ctors.randn(3, 5) + self._check_jacobian_vectorize_correctness(f, (x, y)) + + @base_and_logging_tensor + def test_jacobian_vectorize_correctness_multi_input_multi_output(self, ctors): + def f(x, y): + return (x * x) @ y, x @ (x.sum(1) * y), y.sum() + + x = ctors.randn(5, 3) + y = ctors.randn(3, 5) + self._check_jacobian_vectorize_correctness(f, (x, y)) + + @base_and_logging_tensor + def test_jacobian_vectorize_correctness_unrelated_outputs(self, ctors): + def f(x, y): + return x, y, x, y + + x = ctors.randn(2) + y = ctors.randn(3) + self._check_jacobian_vectorize_correctness(f, (x, y)) + + @base_and_logging_tensor + def test_jacobian_vectorize_correctness_zero_dim(self, ctors): + # zero-dim output + def f(x, y): + return x.sum(), y.sum(), x * y + + x = ctors.randn(3) + y = ctors.randn(3) + self._check_jacobian_vectorize_correctness(f, (x, y)) + + # zero-dim input + def g(x): + return torch.stack([x, x, x]) + + x = ctors.randn([]) + self._check_jacobian_vectorize_correctness(g, x) + + # Mixed zero-dim input / zero-dim output + def h(x, y): + return y.sum(), x * y + + x = ctors.randn([]) + y = ctors.randn(1) + self._check_jacobian_vectorize_correctness(h, (x, y)) + + @unittest.skipIf(not TEST_CUDA, "test requires CUDA") + @base_and_logging_tensor + def test_jacobian_vectorize_correctness_different_devices(self, ctors): + def f(x, y): + return x * y, (x * y).cuda() + + x = ctors.randn(3) + y = ctors.randn(3) + self._check_jacobian_vectorize_correctness(f, (x, y)) + + @base_and_logging_tensor + def test_jacobian_vectorize_correctness_different_dtype(self, ctors): + def f(x, y): + return (x * y).float(), (x * y).double() + + x = ctors.randn(3) + y = ctors.randn(3) + # The Jacobian computed using forward AD has the dtype of the output + # but the Jacobian computed with reverse AD has dtype of input + self._check_jacobian_vectorize_correctness(f, (x, y), test_forward_ad=False) + + def _check_hessian_vectorize_correctness(self, f, inputs): + expected = autogradF.hessian(f, inputs, 
vectorize=False) + result = autogradF.hessian(f, inputs, vectorize=True) + self.assertEqual(result, expected) + + result_forward_mode = autogradF.hessian(f, inputs, outer_jacobian_strategy="forward-mode", vectorize=True) + self.assertEqual(result_forward_mode, expected) + + @base_and_logging_tensor + def test_hessian_vectorize_correctness_simple(self, ctors): + def f(x): + return (3 * x ** 2).sum() + + x = ctors.randn(2, 3, 5) + self._check_hessian_vectorize_correctness(f, x) + + @base_and_logging_tensor + def test_hessian_vectorize_correctness_multi_input(self, ctors): + def f(x, y, z): + return ((x.relu() * x) @ y.sin() @ z).sum() + + x = ctors.randn(2, 3) + y = ctors.randn(3, 5) + z = ctors.randn(5, 5) + self._check_hessian_vectorize_correctness(f, (x, y, z)) + + @base_and_logging_tensor + def test_hessian_vectorize_correctness_unrelated_outputs(self, ctors): + # output unrelated to one input + def f(x, y): + return (x ** 2).sum() + + x = ctors.randn(2) + y = ctors.randn(3) + self._check_hessian_vectorize_correctness(f, (x, y)) + + # output unrelated to all inputs + def f(x, y): + return ctors.ones([]) + + x = ctors.randn(2) + y = ctors.randn(3) + self._check_hessian_vectorize_correctness(f, (x, y)) + + @parametrize("vectorize", [True, False]) + @base_and_logging_tensor + def test_hessian_err_check(self, vectorize, ctors): + def foo(a): + return 3 * a.narrow(0, 0, 3).exp().sum() + + def bar(a): + return 3 * a.narrow(0, 0, 3), "bar" + + def bar2(a): + return 3 * a.narrow(0, 0, 3) + + def bar3(a): + return 3 * a.narrow(0, 0, 3), 3 * a.narrow(0, 0, 3) + + inp = ctors.rand(4) + with self.assertRaisesRegex(TypeError, "The inputs given to hessian must be either a Tensor"): + res = autogradF.hessian(foo, (inp, 2), vectorize=vectorize) + + with self.assertRaisesRegex(TypeError, "The outputs of the user-provided function given to hessian must"): + res = autogradF.hessian(bar, inp, vectorize=vectorize) + + err_msg_out = "The Tensor returned by the function given to hessian should contain a single element" + with self.assertRaisesRegex(RuntimeError, err_msg_out): + res = autogradF.hessian(bar2, inp, vectorize=vectorize) + + with self.assertRaisesRegex(RuntimeError, "The function given to hessian should return a single Tensor"): + res = autogradF.hessian(bar3, inp, vectorize=vectorize) + + res = autogradF.hessian(foo, inp, vectorize=vectorize) + self._assert_interleaved_struct(res, inp, inp) + + def foo(a, b): + return (3 * b.narrow(0, 0, 3) * a.narrow(0, 0, 3)).sum() + + inp = (ctors.rand(4), ctors.rand(5)) + + res = autogradF.hessian(foo, inp, vectorize=vectorize) + self._assert_interleaved_struct(res, inp, inp) + + @base_and_logging_tensor + def test_hessian_err_check_strict(self, ctors): + def foo(a): + return a.detach().sum() + + def bar(a): + # Make a non-leaf Tensor that requires_grad but that is not connected to the input + return a.long().float().requires_grad_().clone().sum() + + def bar2(a): + # A Linear function for which the jacobian is independent of the input + return (3 * a).sum() + + inp = ctors.rand(4) + with self.assertRaisesRegex(RuntimeError, "Output 0 of the user-provided function does not require gradients."): + res = autogradF.hessian(foo, inp, strict=True) + res = autogradF.hessian(foo, inp, strict=False) + self._assert_interleaved_struct(res, inp, inp) + self.assertEqual(res.abs().sum(), 0.) 
+ + with self.assertRaisesRegex(RuntimeError, "jacobian of the user-provided function with respect to input 0"): + res = autogradF.hessian(bar, inp, strict=True) + res = autogradF.hessian(bar, inp, strict=False) + self._assert_interleaved_struct(res, inp, inp) + self.assertEqual(res.abs().sum(), 0.) + + with self.assertRaisesRegex(RuntimeError, "jacobian of the user-provided function with respect to input 0 is"): + res = autogradF.hessian(bar2, inp, strict=True) + res = autogradF.hessian(bar2, inp, strict=False) + self._assert_interleaved_struct(res, inp, inp) + self.assertEqual(res.abs().sum(), 0.) + + @base_and_logging_tensor + def test_hessian_err_check_strict_vectorize(self, ctors): + def foo(x): + return (x ** 3).sum() + + inp = ctors.rand(4) + with self.assertRaisesRegex(RuntimeError, "not supported together"): + res = autogradF.hessian(foo, inp, strict=True, vectorize=True) + + @base_and_logging_tensor + def test_hessian_no_grad(self, ctors): + def pow_reducer(x): + return x.pow(3).sum() + + inputs = ctors.rand(2, 2) + with torch.no_grad(): + res = autogradF.hessian(pow_reducer, inputs) + self.assertIsNone(res[0][0].grad_fn) + self.assertIsNone(res[0][1].grad_fn) + self.assertIsNone(res[1][0].grad_fn) + self.assertIsNone(res[1][1].grad_fn) + self.assertNotEqual(res, ctors.zeros(2, 2, 2)) + + with torch.no_grad(): + res = autogradF.hessian(pow_reducer, inputs, create_graph=True) + self.assertIsNotNone(res[0][0].grad_fn) + self.assertIsNotNone(res[0][1].grad_fn) + self.assertIsNotNone(res[1][0].grad_fn) + self.assertIsNotNone(res[1][1].grad_fn) + self.assertNotEqual(res, ctors.zeros(2, 2, 2)) + + @vectorized_logging_tensor + def test_hessian_output(self, vectorize, ctors): + def pow_reducer(x): + return x.pow(3).sum() + + inputs = ctors.rand(2, 2) + res = autogradF.hessian(pow_reducer, inputs, vectorize=vectorize) + self._assert_interleaved_struct(res, inputs, inputs) + self.assertIsNone(res.grad_fn) + + def add_pow_reducer(x, y): + return (x + y).pow(3).sum() + + inputs = (ctors.rand(2, 2), ctors.rand(2, 2)) + res = autogradF.hessian(add_pow_reducer, inputs, vectorize=vectorize) + self._assert_interleaved_struct(res, inputs, inputs) + self.assertIsNone(res[0][0].grad_fn) + self.assertIsNone(res[0][1].grad_fn) + self.assertIsNone(res[1][0].grad_fn) + self.assertIsNone(res[1][1].grad_fn) + + @parametrize("vectorize", [True, False]) + @base_and_logging_tensor + def test_hessian_scalar(self, vectorize, ctors): + def reducer(x): + return x.sum() + inputs = ctors.rand(4, 4) + res = autogradF.hessian(reducer, inputs, vectorize=vectorize) + self._assert_interleaved_struct(res, inputs, inputs) + + inputs = ctors.rand([]) + res = autogradF.hessian(reducer, inputs, vectorize=vectorize) + self._assert_same_struct(res, inputs) + + def bad_reducer(x): + return x.sum().view(1, 1, 1) + inputs = ctors.rand(4, 4) + res = autogradF.hessian(bad_reducer, inputs, vectorize=vectorize) + self._assert_interleaved_struct(res, inputs, inputs) + + @parametrize("vectorize", [True, False]) + @base_and_logging_tensor + def test_hessian_create_graph(self, vectorize, ctors): + def pow_reducer(x): + return x.pow(3).sum() + + inputs = ctors.rand(2, 2, dtype=torch.double, requires_grad=True) + res = autogradF.hessian(pow_reducer, inputs, create_graph=True, vectorize=vectorize) + self._assert_interleaved_struct(res, inputs, inputs) + self.assertIsNotNone(res.grad_fn) + + gradcheck(lambda inp: autogradF.hessian(pow_reducer, inp, create_graph=True, vectorize=vectorize), inputs) + gradgradcheck(lambda inp: 
autogradF.hessian(pow_reducer, inp, create_graph=True, vectorize=vectorize), inputs) + + def add_pow_reducer(x, y): + return (x + y).pow(3).sum() + + inputs = (ctors.rand(2, 2, dtype=torch.double, requires_grad=True), + ctors.rand(2, 2, dtype=torch.double, requires_grad=True)) + res = autogradF.hessian(add_pow_reducer, inputs, create_graph=True, vectorize=vectorize) + self._assert_interleaved_struct(res, inputs, inputs) + self.assertIsNotNone(res[0][0].grad_fn) + self.assertIsNotNone(res[0][1].grad_fn) + self.assertIsNotNone(res[1][0].grad_fn) + self.assertIsNotNone(res[1][1].grad_fn) + + def flatten(inp): + return tuple(el_lvl2 for el_lvl1 in inp for el_lvl2 in el_lvl1) + + gradcheck(lambda *inp: flatten(autogradF.hessian(add_pow_reducer, inp, create_graph=True, vectorize=vectorize)), inputs) + gradgradcheck(lambda *inp: flatten(autogradF.hessian(add_pow_reducer, inp, create_graph=True, vectorize=vectorize)), inputs) + + def foo(x, y): + x = x.cos() + val, hess = autogradF.hessian(add_pow_reducer, (x, y), create_graph=True, vectorize=vectorize) + + res = val[0].cos().sum() + val[1].cos().sum() + hess[0].cos().sum() + res = res + hess[1].cos().sum() + x.cos().sum() + y.cos().sum() + return res + + gradcheck(foo, inputs) + gradgradcheck(foo, inputs) + + @base_and_logging_tensor + def test_vhp_err_check(self, ctors): + def foo(a): + return 3 * a.narrow(0, 0, 3).exp().sum() + + def bar(a): + return 3 * a.narrow(0, 0, 3), "bar" + + def bar2(a): + return 3 * a.narrow(0, 0, 3) + + inp = ctors.rand(4) + v = ctors.rand(4) + with self.assertRaisesRegex(TypeError, "The inputs given to vhp must be either a Tensor"): + res = autogradF.vhp(foo, (inp, 2), v) + + with self.assertRaisesRegex(TypeError, "The outputs of the user-provided function given to vhp must"): + res = autogradF.vhp(bar, inp, v) + + err_msg_out = "The Tensor returned by the function given to vhp should contain a single element" + with self.assertRaisesRegex(RuntimeError, err_msg_out): + res = autogradF.vhp(bar2, inp, v) + + with self.assertRaisesRegex(RuntimeError, "v has invalid size:"): + res = autogradF.vhp(foo, inp, ctors.rand(5)) + + with self.assertRaisesRegex(TypeError, "The v given to vhp must be either a Tensor or a tuple of Tensors"): + res = autogradF.vhp(foo, inp, (v, 2)) + + res = autogradF.vhp(foo, inp, v) + self._assert_same_struct(res[1], inp) + + def foo(a, b): + return (3 * b.narrow(0, 0, 3) * a.narrow(0, 0, 3)).sum() + + inp = (ctors.rand(4), ctors.rand(5)) + v = (ctors.rand(4), ctors.rand(5)) + + res = autogradF.vhp(foo, inp, v) + self._assert_same_struct(res[1], inp) + + @base_and_logging_tensor + def test_vhp_err_check_strict(self, ctors): + def foo(a): + return a.detach().sum() + + def bar(a): + # Make a non-leaf Tensor that requires_grad but that is not connected to the input + return a.long().float().requires_grad_().clone().sum() + + def bar2(a): + # A Linear function for which the jacobian is independent of the input + return (3 * a).sum() + + inp = ctors.rand(4) + v = ctors.rand(4) + with self.assertRaisesRegex(RuntimeError, "Output 0 of the user-provided function does not require gradients."): + res = autogradF.vhp(foo, inp, v, strict=True) + res = autogradF.vhp(foo, inp, v, strict=False) + self._assert_same_struct(res[1], inp) + self.assertEqual(res[1].abs().sum(), 0.) 
+ + with self.assertRaisesRegex(RuntimeError, "The output of the user-provided function is independent of input 0"): + res = autogradF.vhp(bar, inp, v, strict=True) + res = autogradF.vhp(bar, inp, v, strict=False) + self._assert_same_struct(res[1], inp) + self.assertEqual(res[1].abs().sum(), 0.) + + with self.assertRaisesRegex(RuntimeError, "jacobian of the user-provided function with respect to input 0 is"): + res = autogradF.vhp(bar2, inp, v, strict=True) + res = autogradF.vhp(bar2, inp, v, strict=False) + self._assert_same_struct(res[1], inp) + self.assertEqual(res[1].abs().sum(), 0.) + + @base_and_logging_tensor + def test_vhp_no_grad(self, ctors): + def reducer(x): + return x.exp().sum() + inputs = ctors.rand(4, 4) + v = ctors.ones(4, 4) + with torch.no_grad(): + res = autogradF.vhp(reducer, inputs, v) + self.assertIsNone(res[0].grad_fn) + self.assertIsNone(res[1].grad_fn) + self.assertNotEqual(res[1], ctors.zeros(4, 4)) + + with torch.no_grad(): + res = autogradF.vhp(reducer, inputs, v, create_graph=True) + self.assertIsNotNone(res[0].grad_fn) + self.assertIsNotNone(res[1].grad_fn) + self.assertNotEqual(res[1], ctors.zeros(4, 4)) + + @base_and_logging_tensor + def test_vhp_output(self, ctors): + def foo(a): + return 3 * a.narrow(0, 0, 3).exp().sum() + + inputs = ctors.rand(4, 4) + v = ctors.ones(4, 4) + res = autogradF.vhp(foo, inputs, v) + self._assert_same_struct(res[1], inputs) + self.assertIsNone(res[0].grad_fn) + self.assertIsNone(res[1].grad_fn) + + def bar(a, b): + return (a + 3 * b.narrow(0, 0, 3)).exp().sum() + + inputs = (ctors.rand(3), ctors.rand(4)) + v = (ctors.ones(3), ctors.ones(4)) + out, vhp_val = autogradF.vhp(bar, inputs, v) + self._assert_same_struct(vhp_val, inputs) + self.assertIsNone(out.grad_fn) + self.assertIsNone(vhp_val[0].grad_fn) + self.assertIsNone(vhp_val[1].grad_fn) + + @base_and_logging_tensor + def test_vhp_scalar(self, ctors): + def reducer(x): + return x.sum() + inputs = ctors.rand(4, 4) + v = ctors.ones(4, 4) + res = autogradF.vhp(reducer, inputs, v) + self._assert_same_struct(res[1], inputs) + + inputs = ctors.rand([]) + v = ctors.rand([]) + res = autogradF.vhp(reducer, inputs, v) + self._assert_same_struct(res[1], inputs) + + res = autogradF.vhp(reducer, inputs) + self._assert_same_struct(res[1], inputs) + + def bad_reducer(x): + return x.sum().view(1, 1, 1) + inputs = ctors.rand(4, 4) + v = ctors.rand(4, 4) + res = autogradF.vhp(bad_reducer, inputs, v) + self._assert_same_struct(res[1], inputs) + + @base_and_logging_tensor + def test_vhp_create_graph(self, ctors): + def foo(a): + return 3 * a.narrow(0, 0, 3).exp().sum() + + inputs = ctors.rand(4, 4, dtype=torch.double, requires_grad=True) + v = ctors.ones(4, 4, dtype=torch.double, requires_grad=True) + res = autogradF.vhp(foo, inputs, v, create_graph=True) + self._assert_same_struct(res[1], inputs) + self.assertIsNotNone(res[0].grad_fn) + self.assertIsNotNone(res[1].grad_fn) + + gradcheck(lambda inp, v: autogradF.vhp(foo, inp, v, create_graph=True), (inputs, v)) + gradgradcheck(lambda inp, v: autogradF.vhp(foo, inp, v, create_graph=True), (inputs, v)) + + def bar(a, b): + return (a + 3 * b.narrow(0, 0, 3)).exp().sum() + + inputs = (ctors.rand(3, dtype=torch.double, requires_grad=True), + ctors.rand(4, dtype=torch.double, requires_grad=True)) + v = (ctors.ones(3, dtype=torch.double, requires_grad=True), + ctors.ones(4, dtype=torch.double, requires_grad=True)) + out, vhp_val = autogradF.vhp(bar, inputs, v, create_graph=True) + self._assert_same_struct(vhp_val, inputs) + 
self.assertIsNotNone(out.grad_fn) + self.assertIsNotNone(vhp_val[0].grad_fn) + self.assertIsNotNone(vhp_val[1].grad_fn) + + gradcheck(lambda *args: autogradF.vhp(bar, args[:2], args[2:], create_graph=True)[1], inputs + v) + gradgradcheck(lambda *args: autogradF.vhp(bar, args[:2], args[2:], create_graph=True)[1], inputs + v) + + def foo(*args): + x, y = args[:2] + v = args[2:] + + x = x.cos() + val, grad = autogradF.vhp(bar, (x, y), v, create_graph=True) + + return val.cos() + grad[0].cos().sum() + grad[1].cos() + x.cos().sum() + y.cos() + + gradcheck(foo, inputs + v) + gradgradcheck(foo, inputs + v) + + @base_and_logging_tensor + def test_hvp_err_check(self, ctors): + def foo(a): + return 3 * a.narrow(0, 0, 3).exp().sum() + + def bar(a): + return 3 * a.narrow(0, 0, 3), "bar" + + def bar2(a): + return 3 * a.narrow(0, 0, 3) + + inp = ctors.rand(4) + v = ctors.rand(4) + res = autogradF.hvp(foo, inp, v) + with self.assertRaisesRegex(TypeError, "The inputs given to hvp must be either a Tensor"): + res = autogradF.hvp(foo, (inp, 2), v) + + with self.assertRaisesRegex(TypeError, "The outputs of the user-provided function given to hvp must"): + res = autogradF.hvp(bar, inp, v) + + err_msg_out = "The Tensor returned by the function given to hvp should contain a single element" + with self.assertRaisesRegex(RuntimeError, err_msg_out): + res = autogradF.hvp(bar2, inp, v) + + with self.assertRaisesRegex(RuntimeError, "v has invalid size:"): + res = autogradF.hvp(foo, inp, ctors.rand(5)) + + with self.assertRaisesRegex(TypeError, "The v given to hvp must be either a Tensor or a tuple of Tensors"): + res = autogradF.hvp(foo, inp, (v, 2)) + + res = autogradF.hvp(foo, inp, v) + self._assert_same_struct(res[1], inp) + + def foo(a, b): + return (3 * b.narrow(0, 0, 3) * a.narrow(0, 0, 3)).sum() + + inp = (ctors.rand(4), ctors.rand(5)) + v = (ctors.rand(4), ctors.rand(5)) + + res = autogradF.hvp(foo, inp, v) + self._assert_same_struct(res[1], inp) + + @base_and_logging_tensor + def test_hvp_err_check_strict(self, ctors): + def foo(a): + return a.detach().sum() + + def bar(a): + # Make a non-leaf Tensor that requires_grad but that is not connected to the input + return a.long().float().requires_grad_().clone().sum() + + def bar2(a): + # A Linear function for which the jacobian is independent of the input + return (3 * a).sum() + + inp = ctors.rand(4) + v = ctors.rand(4) + with self.assertRaisesRegex(RuntimeError, "Output 0 of the user-provided function does not require gradients."): + res = autogradF.hvp(foo, inp, v, strict=True) + res = autogradF.hvp(foo, inp, v, strict=False) + self._assert_same_struct(res[1], inp) + self.assertEqual(res[1].abs().sum(), 0.) + + with self.assertRaisesRegex(RuntimeError, "The output of the user-provided function is independent of input 0"): + res = autogradF.hvp(bar, inp, v, strict=True) + res = autogradF.hvp(bar, inp, v, strict=False) + self._assert_same_struct(res[1], inp) + self.assertEqual(res[1].abs().sum(), 0.) + + with self.assertRaisesRegex(RuntimeError, "jacobian of the user-provided function with respect to input 0 is"): + res = autogradF.hvp(bar2, inp, v, strict=True) + res = autogradF.hvp(bar2, inp, v, strict=False) + self._assert_same_struct(res[1], inp) + self.assertEqual(res[1].abs().sum(), 0.) 
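The strict-mode checks above all exercise the same contract: if an output of the user-provided function is independent of an input, strict=True raises a RuntimeError while strict=False silently returns a zero gradient shaped like that input. A minimal standalone sketch of that contract for vjp, assuming only that torch.autograd.functional is importable from this tree (the tensor values are arbitrary):

import torch
import torch.autograd.functional as autogradF

def detached(x):
    return x.detach()  # the output no longer requires grad, so it cannot depend on x

inp = torch.rand(4)
v = torch.rand(4)

# strict=False: the vjp entry is simply zeros with the input's shape.
_, grad = autogradF.vjp(detached, inp, v, strict=False)
assert grad.shape == inp.shape
assert grad.abs().sum().item() == 0.0

# strict=True: the same call raises instead of hiding the disconnection.
raised = False
try:
    autogradF.vjp(detached, inp, v, strict=True)
except RuntimeError as err:
    raised = "does not require gradients" in str(err)
assert raised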
+ + @base_and_logging_tensor + def test_hvp_no_grad(self, ctors): + def reducer(x): + return x.exp().sum() + inputs = ctors.rand(4, 4) + v = ctors.ones(4, 4) + with torch.no_grad(): + res = autogradF.hvp(reducer, inputs, v) + self.assertIsNone(res[0].grad_fn) + self.assertIsNone(res[1].grad_fn) + self.assertNotEqual(res[1], ctors.zeros(4, 4)) + + with torch.no_grad(): + res = autogradF.hvp(reducer, inputs, v, create_graph=True) + self.assertIsNotNone(res[0].grad_fn) + self.assertIsNotNone(res[1].grad_fn) + self.assertNotEqual(res[1], ctors.zeros(4, 4)) + + @base_and_logging_tensor + def test_hvp_output(self, ctors): + def foo(a): + return 3 * a.narrow(0, 0, 3).exp().sum() + + inputs = ctors.rand(4, 4) + v = ctors.ones(4, 4) + res = autogradF.hvp(foo, inputs, v) + self._assert_same_struct(res[1], inputs) + self.assertIsNone(res[0].grad_fn) + self.assertIsNone(res[1].grad_fn) + + def bar(a, b): + return (a + 3 * b.narrow(0, 0, 3)).exp().sum() + + inputs = (ctors.rand(3), ctors.rand(4)) + v = (ctors.ones(3), ctors.ones(4)) + out, hvp_val = autogradF.hvp(bar, inputs, v) + self._assert_same_struct(hvp_val, inputs) + self.assertIsNone(out.grad_fn) + self.assertIsNone(hvp_val[0].grad_fn) + self.assertIsNone(hvp_val[1].grad_fn) + + @base_and_logging_tensor + def test_hvp_scalar(self, ctors): + def reducer(x): + return x.exp().sum() + inputs = ctors.rand(4, 4) + v = ctors.ones(4, 4) + res = autogradF.hvp(reducer, inputs, v) + self._assert_same_struct(res[1], inputs) + + inputs = ctors.rand([]) + v = ctors.rand([]) + res = autogradF.hvp(reducer, inputs, v) + self._assert_same_struct(res[1], inputs) + + res = autogradF.hvp(reducer, inputs) + self._assert_same_struct(res[1], inputs) + + def bad_reducer(x): + return x.exp().sum().view(1, 1, 1) + inputs = ctors.rand(4, 4) + v = ctors.rand(4, 4) + res = autogradF.hvp(bad_reducer, inputs, v) + self._assert_same_struct(res[1], inputs) + + @base_and_logging_tensor + def test_hvp_create_graph(self, ctors): + def foo(a): + return 3 * a.narrow(0, 0, 3).exp().sum() + + inputs = ctors.rand(4, 4, dtype=torch.double, requires_grad=True) + v = ctors.ones(4, 4, dtype=torch.double, requires_grad=True) + res = autogradF.hvp(foo, inputs, v, create_graph=True) + self._assert_same_struct(res[1], inputs) + self.assertIsNotNone(res[0].grad_fn) + self.assertIsNotNone(res[1].grad_fn) + + gradcheck(lambda inp, v: autogradF.hvp(foo, inp, v, create_graph=True), (inputs, v)) + gradgradcheck(lambda inp, v: autogradF.hvp(foo, inp, v, create_graph=True), (inputs, v)) + + def bar(a, b): + return (a + 3 * b.narrow(0, 0, 3)).exp().sum() + + inputs = (ctors.rand(3, dtype=torch.double, requires_grad=True), + ctors.rand(4, dtype=torch.double, requires_grad=True)) + v = (ctors.ones(3, dtype=torch.double, requires_grad=True), + ctors.ones(4, dtype=torch.double, requires_grad=True)) + out, hvp_val = autogradF.hvp(bar, inputs, v, create_graph=True) + self._assert_same_struct(hvp_val, inputs) + self.assertIsNotNone(out.grad_fn) + self.assertIsNotNone(hvp_val[0].grad_fn) + self.assertIsNotNone(hvp_val[1].grad_fn) + + gradcheck(lambda *args: autogradF.hvp(bar, args[:2], args[2:], create_graph=True)[1], inputs + v) + gradgradcheck(lambda *args: autogradF.hvp(bar, args[:2], args[2:], create_graph=True)[1], inputs + v) + + def foo(*args): + x, y = args[:2] + v = args[2:] + + x = x.cos() + val, grad = autogradF.hvp(bar, (x, y), v, create_graph=True) + + return val.cos() + grad[0].cos().sum() + grad[1].cos() + x.cos().sum() + y.cos() + + gradcheck(foo, inputs + v) + gradgradcheck(foo, inputs + v) + 
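The two consistency tests that follow, test_jacobian_match_vjp_jvp and test_hessian_match_vhp_hvp, reduce to the identities jvp = J v and vjp = v^T J (plus their Hessian counterparts). A short sketch of the Jacobian case using the same foo as the test; torch.allclose with default tolerances is an assumption here, the test itself compares with assertEqual:

import torch
import torch.autograd.functional as autogradF

def foo(x):
    return x ** 3 + x.sum()

inputs = torch.rand(4)
v = torch.rand(4)

jac = autogradF.jacobian(foo, inputs)   # full 4x4 Jacobian, materialized explicitly
jvp = autogradF.jvp(foo, inputs, v)[1]  # directional derivative J @ v
vjp = autogradF.vjp(foo, inputs, v)[1]  # row-vector product v @ J

assert torch.allclose(jvp, jac @ v)
assert torch.allclose(vjp, v @ jac)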
+ @base_and_logging_tensor + def test_jacobian_match_vjp_jvp(self, ctors): + def foo(x): + return x ** 3 + x.sum() + + inputs = ctors.rand(4) + v = ctors.rand(4) + + jac = autogradF.jacobian(foo, inputs) + jvp = autogradF.jvp(foo, inputs, v)[1] + vjp = autogradF.vjp(foo, inputs, v)[1] + + self.assertEqual(jvp, torch.mm(jac, v.unsqueeze(1)).squeeze(1)) + self.assertEqual(vjp, torch.mm(v.unsqueeze(0), jac).squeeze(0)) + + @base_and_logging_tensor + def test_hessian_match_vhp_hvp(self, ctors): + def foo(a): + return 3 * a.narrow(0, 0, 3).exp().sum() + + inputs = ctors.rand(4) + v = ctors.rand(4) + + hes = autogradF.hessian(foo, inputs) + hvp = autogradF.hvp(foo, inputs, v)[1] + vhp = autogradF.vhp(foo, inputs, v)[1] + + self.assertEqual(hvp, torch.mm(hes, v.unsqueeze(1)).squeeze(1)) + self.assertEqual(vhp, torch.mm(v.unsqueeze(0), hes).squeeze(0)) + +instantiate_parametrized_tests(TestAutogradFunctional) + +if __name__ == '__main__': + run_tests() diff --git a/test/benchmark_utils/test_benchmark_utils.py b/test/benchmark_utils/test_benchmark_utils.py index a98c0ac97b4c..a1e2adaacfa9 100644 --- a/test/benchmark_utils/test_benchmark_utils.py +++ b/test/benchmark_utils/test_benchmark_utils.py @@ -170,6 +170,7 @@ def test_timer(self): @slowTest @unittest.skipIf(IS_SANDCASTLE, "C++ timing is OSS only.") + @unittest.skipIf(True, "Failing on clang, see 74398") def test_timer_tiny_fast_snippet(self): timer = benchmark_utils.Timer( 'auto x = 1;(void)x;', @@ -181,6 +182,7 @@ def test_timer_tiny_fast_snippet(self): @slowTest @unittest.skipIf(IS_SANDCASTLE, "C++ timing is OSS only.") + @unittest.skipIf(True, "Failing on clang, see 74398") def test_cpp_timer(self): timer = benchmark_utils.Timer( """ @@ -547,6 +549,7 @@ def add_one(x): @slowTest @unittest.skipIf(IS_WINDOWS, "Valgrind is not supported on Windows.") @unittest.skipIf(IS_SANDCASTLE, "Valgrind is OSS only.") + @unittest.skipIf(True, "Failing on clang, see 74398") def test_collect_cpp_callgrind(self): timer = benchmark_utils.Timer( "x += 1;", diff --git a/test/cpp/api/dataloader.cpp b/test/cpp/api/dataloader.cpp index c0622ba41cbd..9b71b721b3db 100644 --- a/test/cpp/api/dataloader.cpp +++ b/test/cpp/api/dataloader.cpp @@ -1982,7 +1982,7 @@ TEST(DataLoaderTest, ChunkDatasetSave) { for (const auto epoch_index : c10::irange(epoch_count)) { (void)epoch_index; // Suppress unused variable warning - int iteration_count = 0; + unsigned iteration_count = 0; for (auto iterator = data_loader->begin(); iterator != data_loader->end(); ++iterator, ++iteration_count) { if ((iteration_count + 1) % save_interval == 0) { @@ -2316,7 +2316,7 @@ TEST(DataLoaderTest, CustomPreprocessPolicy) { ++iterator) { auto batch_result = *iterator; if (batch_result.size() > chunk_size * cross_chunk_shuffle_count) { - for (int i = 0; i < batch_result.size(); i += chunk_size) { + for (unsigned i = 0; i < batch_result.size(); i += chunk_size) { ASSERT_TRUE(std::is_sorted( batch_result.begin() + i, batch_result.begin() + i + chunk_size)); diff --git a/test/cpp/api/functional.cpp b/test/cpp/api/functional.cpp index db0f4d25168f..add7e17c910d 100644 --- a/test/cpp/api/functional.cpp +++ b/test/cpp/api/functional.cpp @@ -917,15 +917,21 @@ TEST_F(FunctionalTest, ELU) { for (const auto alpha : {0.0, 0.42, 1.0, 4.2, 42.42}) { auto x = torch::linspace(-10.0, 10.0, size * size * size); x.resize_({size, size, size}); + auto x_bf16 = torch::linspace(-10.0, 10.0, size * size * size).to(torch::kBFloat16); + x_bf16.resize_({size, size, size}); + auto y_exp = torch::max(torch::zeros_like(x), x) + 
torch::min(torch::zeros_like(x), alpha * (torch::exp(x) - 1.0)); auto y = F::elu(x, F::ELUFuncOptions().alpha(alpha).inplace(inplace)); + auto y_bf16 = F::elu(x_bf16, F::ELUFuncOptions().alpha(alpha).inplace(inplace)); ASSERT_EQ(y.ndimension(), 3); ASSERT_EQ(y.sizes(), std::vector({size, size, size})); ASSERT_TRUE(torch::allclose(y, y_exp)); + ASSERT_TRUE(torch::allclose(y_bf16.to(torch::kFloat), y, 1e-2, 1e-2)); if (inplace) { ASSERT_TRUE(torch::allclose(x, y_exp)); + ASSERT_TRUE(torch::allclose(x_bf16.to(torch::kFloat), y, 1e-2, 1e-2)); } } } @@ -938,15 +944,19 @@ TEST_F(FunctionalTest, SELU) { const double alpha = 1.6732632423543772848170429916717; for (const auto inplace : {false, true}) { auto input = torch::randn({5, 5}); + auto input_bf16 = input.clone().to(torch::kBFloat16); auto expected = scale * (torch::max(torch::zeros_like(input), input) + torch::min( torch::zeros_like(input), alpha * (torch::exp(input) - 1))); auto output = F::selu(input, inplace); + auto output_bf16 = F::selu(input_bf16, inplace); ASSERT_TRUE(output.allclose(expected)); + ASSERT_TRUE(output_bf16.to(torch::kFloat).allclose(output, 1e-2, 1e-2)); if (inplace) { ASSERT_TRUE(input.allclose(expected)); + ASSERT_TRUE(input_bf16.to(torch::kFloat).allclose(output, 1e-2, 1e-2)); } } } @@ -973,10 +983,17 @@ TEST_F(FunctionalTest, GLU) { } TEST_F(FunctionalTest, GELU) { - GELU model; const auto x = torch::linspace(-3.0, 3.0, 100); const auto y_exp = x * 0.5 * (1.0 + torch::erf(x / std::sqrt(2.0))); - const auto y = F::gelu(x); + const auto y = F::gelu(x, F::GELUFuncOptions().approximate("none")); + ASSERT_TRUE(torch::allclose(y, y_exp, 1.4e-06, 1e-05)); +} + +TEST_F(FunctionalTest, TanhGELU) { + const auto x = torch::linspace(-3.0, 3.0, 100); + const auto inner = std::sqrt(2 / M_PI) * (x + 0.044715 * x.pow(3.0)); + const auto y_exp = 0.5 * x * (1.0 + inner.tanh()); + const auto y = F::gelu(x, F::GELUFuncOptions().approximate("tanh")); ASSERT_TRUE(torch::allclose(y, y_exp, 1.4e-06, 1e-05)); } @@ -1528,15 +1545,19 @@ TEST_F(FunctionalTest, CELU) { for (const auto alpha : {0.42, 1.0, 4.2, 42.42}) { auto x = torch::linspace(-10.0, 10.0, size * size * size); x.resize_({size, size, size}); + auto x_bf16 = x.clone().to(torch::kBFloat16); auto y_exp = torch::max(torch::zeros_like(x), x) + torch::min(torch::zeros_like(x), alpha * (torch::exp(x / alpha) - 1.0)); auto y = F::celu(x, F::CELUFuncOptions().alpha(alpha).inplace(inplace)); + auto y_bf16 = F::celu(x_bf16, F::CELUFuncOptions().alpha(alpha).inplace(inplace)); ASSERT_EQ(y.ndimension(), 3); ASSERT_EQ(y.sizes(), std::vector({size, size, size})); ASSERT_TRUE(torch::allclose(y, y_exp)); + ASSERT_TRUE(torch::allclose(y_bf16.to(torch::kFloat), y, 1e-2, 1e-2)); if (inplace) { ASSERT_TRUE(torch::allclose(x, y_exp)); + ASSERT_TRUE(torch::allclose(x_bf16.to(torch::kFloat), y, 1e-2, 1e-2)); } } } @@ -1548,13 +1569,16 @@ TEST_F(FunctionalTest, CELUDefaultOptions) { const auto alpha = 1.0; auto x = torch::linspace(-10.0, 10.0, size * size * size); x.resize_({size, size, size}); + auto x_bf16 = x.clone().to(torch::kBFloat16); auto y_exp = torch::max(torch::zeros_like(x), x) + torch::min(torch::zeros_like(x), alpha * (torch::exp(x / alpha) - 1.0)); auto y = F::celu(x); + auto y_bf16 = F::celu(x_bf16); ASSERT_EQ(y.ndimension(), 3); ASSERT_EQ(y.sizes(), std::vector({size, size, size})); ASSERT_TRUE(torch::allclose(y, y_exp)); + ASSERT_TRUE(torch::allclose(y_bf16.to(torch::kFloat), y, 1e-2, 1e-2)); } TEST_F(FunctionalTest, PixelShuffle) { @@ -2167,7 +2191,7 @@ TEST_F(FunctionalTest, 
Interpolate) { } } -TEST_F(FunctionalTest, Pad) { +TEST_F(FunctionalTest, Pad1) { { auto input = torch::arange(6, torch::kDouble).reshape({1, 2, 3}); auto output = F::pad(input, F::PadFuncOptions({1, 2}).mode(torch::kCircular)); @@ -2176,6 +2200,8 @@ TEST_F(FunctionalTest, Pad) { ASSERT_EQ(output.sizes(), std::vector({1, 2, 6})); ASSERT_TRUE(output.allclose(expected, 1e-04)); } +} +TEST_F(FunctionalTest, Pad2) { { auto input = torch::arange(9, torch::kDouble).reshape({1, 1, 3, 3}); auto output = F::pad(input, F::PadFuncOptions({3, 3, 3, 1}).mode(torch::kCircular)); @@ -2190,6 +2216,8 @@ TEST_F(FunctionalTest, Pad) { ASSERT_EQ(output.sizes(), std::vector({1, 1, 7, 9})); ASSERT_TRUE(output.allclose(expected, 1e-04)); } +} +TEST_F(FunctionalTest, Pad3) { { auto input = torch::arange(12, torch::kDouble).reshape({1, 1, 2, 2, 3}); auto output = F::pad(input, F::PadFuncOptions({3, 3, 2, 1, 2, 2}).mode(torch::kCircular)); @@ -2232,6 +2260,8 @@ TEST_F(FunctionalTest, Pad) { ASSERT_EQ(output.sizes(), std::vector({1, 1, 6, 5, 9})); ASSERT_TRUE(output.allclose(expected, 1e-04)); } +} +TEST_F(FunctionalTest, Pad4) { { auto input = torch::arange(16, torch::kDouble).reshape({2, 2, 2, 2}); auto output = F::pad(input, F::PadFuncOptions({1, 1, 1, 1}).mode(torch::kReflect)); @@ -2258,6 +2288,8 @@ TEST_F(FunctionalTest, Pad) { ASSERT_EQ(output.sizes(), std::vector({2, 2, 4, 4})); ASSERT_TRUE(output.allclose(expected, 1e-04)); } +} +TEST_F(FunctionalTest, Pad5) { { auto input = torch::arange(12, torch::kDouble).reshape({1, 1, 2, 2, 3}); auto output = F::pad(input, F::PadFuncOptions({1, 2, 2, 1, 1, 2}).mode(torch::kReplicate)); @@ -2294,6 +2326,8 @@ TEST_F(FunctionalTest, Pad) { ASSERT_EQ(output.sizes(), std::vector({1, 1, 5, 5, 6})); ASSERT_TRUE(output.allclose(expected, 1e-04)); } +} +TEST_F(FunctionalTest, Pad6) { { auto input = torch::arange(18, torch::kDouble).reshape({1, 1, 3, 2, 3}); auto output = F::pad(input, F::PadFuncOptions({0, 2, 1, 0, 1, 2}).mode(torch::kReflect)); @@ -2324,12 +2358,16 @@ TEST_F(FunctionalTest, Pad) { ASSERT_EQ(output.sizes(), std::vector({1, 1, 6, 3, 5})); ASSERT_TRUE(output.allclose(expected, 1e-04)); } +} +TEST_F(FunctionalTest, Pad7) { { auto input = torch::ones({1, 1, 1, 1}, torch::kDouble); auto output = F::pad(input, F::PadFuncOptions({1, 1}).mode(torch::kConstant).value(0)); ASSERT_EQ(output.sizes(), std::vector({1, 1, 1, 3})); auto expected = torch::tensor({{{{0., 1., 0.}}}}, torch::kDouble); } +} +TEST_F(FunctionalTest, Pad8) { { auto input = torch::ones({1, 1, 1, 1}, torch::kDouble); auto output = F::pad(input, F::PadFuncOptions({1, 1})); diff --git a/test/cpp/api/init.cpp b/test/cpp/api/init.cpp index 9e2ed422e28b..222d4f1171c4 100644 --- a/test/cpp/api/init.cpp +++ b/test/cpp/api/init.cpp @@ -19,7 +19,7 @@ void check_exact_values( auto layerParameters = parameters[i]; auto expectedLayerParameters = expected_parameters[i]; - if (layerParameters.size(0) != expectedLayerParameters.size()) { + if (static_cast(layerParameters.size(0)) != expectedLayerParameters.size()) { std::cout << "layer #" << i << " layerParameters size: " << layerParameters.size(0) << " != " diff --git a/test/cpp/api/misc.cpp b/test/cpp/api/misc.cpp index a8d6320e9533..734cea27e5cc 100644 --- a/test/cpp/api/misc.cpp +++ b/test/cpp/api/misc.cpp @@ -90,3 +90,14 @@ TEST(UtilsTest, AmbiguousOperatorDefaults) { at::_test_ambiguous_defaults(tmp, 1, 1); at::_test_ambiguous_defaults(tmp, 2, "2"); } + +int64_t get_first_element(c10::OptionalIntArrayRef arr) { + return arr.value()[0]; +} + 
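For reference, the TanhGELU functional test above checks F::gelu against the closed form 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))). The same check can be sketched in Python; the approximate="tanh" keyword of torch.nn.functional.gelu is assumed to be the Python-side counterpart of GELUFuncOptions().approximate("tanh"), and the tolerances mirror the C++ assertion:

import math
import torch
import torch.nn.functional as F

x = torch.linspace(-3.0, 3.0, 100)
inner = math.sqrt(2.0 / math.pi) * (x + 0.044715 * x.pow(3.0))
y_exp = 0.5 * x * (1.0 + inner.tanh())

y = F.gelu(x, approximate="tanh")  # assumed Python analogue of the C++ option above
assert torch.allclose(y, y_exp, rtol=1.4e-6, atol=1e-5)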
+TEST(OptionalArrayRefTest, DanglingPointerFix) { + // Ensure that the converting constructor of `OptionalArrayRef` does not + // create a dangling pointer when given a single value + ASSERT_TRUE(get_first_element(300) == 300); + ASSERT_TRUE(get_first_element({400}) == 400); +} diff --git a/test/cpp/api/modules.cpp b/test/cpp/api/modules.cpp index 8632f3e195cb..cdf4f0ea0deb 100644 --- a/test/cpp/api/modules.cpp +++ b/test/cpp/api/modules.cpp @@ -2860,13 +2860,23 @@ TEST_F(ModulesTest, GLU) { } TEST_F(ModulesTest, GELU) { - GELU model; + GELU model(GELUOptions().approximate("none")); const auto x = torch::linspace(-3.0, 3.0, 100); const auto y_exp = x * 0.5 * (1.0 + torch::erf(x / std::sqrt(2.0))); const auto y = model(x); ASSERT_TRUE(torch::allclose(y, y_exp, 1.4e-06, 1e-05)); } +TEST_F(ModulesTest, TanhGELU) { + GELU model(GELUOptions().approximate("tanh")); + const auto x = torch::linspace(-3.0, 3.0, 100); + const auto inner = std::sqrt(2 / M_PI) * (x + 0.044715 * x.pow(3.0)); + const auto y_exp = 0.5 * x * (1.0 + inner.tanh()); + const auto y = model(x); + ASSERT_TRUE(torch::allclose(y, y_exp, 1.4e-06, 1e-05)); +} + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) TEST_F(ModulesTest, Mish) { Mish model; auto x = torch::randn(100) * 10; diff --git a/test/cpp/api/nn_utils.cpp b/test/cpp/api/nn_utils.cpp index 451c72e9d776..be371b1ae6d4 100644 --- a/test/cpp/api/nn_utils.cpp +++ b/test/cpp/api/nn_utils.cpp @@ -615,7 +615,7 @@ TEST_F(NNUtilsTest, PackPaddedSequence) { } int64_t offset = 0; std::vector tensors_to_be_cat; - for (int64_t i = 1; i < sorted_lengths.size() + 1; i++) { + for (int64_t i = 1; i < static_cast(sorted_lengths.size() + 1); i++) { int64_t l = sorted_lengths.at(i-1); tensors_to_be_cat.emplace_back(pad(i * 100 + torch::arange(1., 5 * l + 1).view({l, 1, 5}), max_length)); } diff --git a/test/cpp/api/parameterdict.cpp b/test/cpp/api/parameterdict.cpp index 5f2eab5d6b28..21dd1b31d5a8 100644 --- a/test/cpp/api/parameterdict.cpp +++ b/test/cpp/api/parameterdict.cpp @@ -105,7 +105,7 @@ TEST_F(ParameterDictTest, Values) { auto dict = torch::nn::ParameterDict(params); std::vector values = dict->values(); std::vector true_values{ta, tb, tc}; - for (auto i = 0; i < values.size(); i += 1) { + for (auto i = 0U; i < values.size(); i += 1) { ASSERT_TRUE(torch::all(torch::eq(values[i], true_values[i])).item()); } } diff --git a/test/cpp/api/serialize.cpp b/test/cpp/api/serialize.cpp index b422662aa362..ecad2348674b 100644 --- a/test/cpp/api/serialize.cpp +++ b/test/cpp/api/serialize.cpp @@ -129,7 +129,7 @@ void test_serialize_optimizer(DerivedOptimizerOptions options, bool only_has_glo // optim3_2 and optim1 should have param_groups and state of size 1 and state_size respectively ASSERT_TRUE(optim3_2_param_groups.size() == 1); // state_size = 2 for all optimizers except LBFGS as LBFGS only maintains one global state - int state_size = only_has_global_state ? 1 : 2; + unsigned state_size = only_has_global_state ? 1 : 2; ASSERT_TRUE(optim3_2_state.size() == state_size); // optim3_2 and optim1 should have param_groups and state of same size @@ -355,6 +355,7 @@ TEST(SerializeTest, ErrorOnMissingKey) { // We want the errors to contain hierarchy information, too. 
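// Editor's note (not part of the original patch): the first torch::load below
// reads the stringstream to its end, so the added stream.seekg(0, stream.beg)
// rewinds it so that the second torch::load re-reads the archive from the
// beginning instead of hitting an already-exhausted stream.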
ASSERT_THROWS_WITH( torch::load(model2, stream), "No such serialized tensor 'a.b.x'"); + stream.seekg(0, stream.beg); ASSERT_THROWS_WITH( torch::load(model3, stream), "No such serialized submodule: 'a.x'"); } diff --git a/test/cpp/c10d/ProcessGroupNCCLTest.cpp b/test/cpp/c10d/ProcessGroupNCCLTest.cpp index 2a00cb901f4f..6e57e92389f5 100644 --- a/test/cpp/c10d/ProcessGroupNCCLTest.cpp +++ b/test/cpp/c10d/ProcessGroupNCCLTest.cpp @@ -508,7 +508,6 @@ void testReduceScatter(const std::string& path, int rank, int size) { void testProcessGroupNCCLHealthCheckFailHelper(const std::string& path, bool timeout) { // simulate world_size > 1 here via threads. const int worldSize = 4; - std::mutex m; std::unordered_set nums; auto runTest = [&](int i) { NCCLTest test(path, worldSize, std::chrono::milliseconds(3000)); diff --git a/test/cpp/jit/CMakeLists.txt b/test/cpp/jit/CMakeLists.txt index cfdbb28a6765..60b43b81fc8b 100644 --- a/test/cpp/jit/CMakeLists.txt +++ b/test/cpp/jit/CMakeLists.txt @@ -39,6 +39,7 @@ endif() # Build the cpp gtest binary containing the cpp-only tests. set(JIT_TEST_SRCS + ${JIT_TEST_ROOT}/test_add_if_then_else.cpp ${JIT_TEST_ROOT}/test_alias_analysis.cpp ${JIT_TEST_ROOT}/test_argument_spec.cpp ${JIT_TEST_ROOT}/test_autodiff.cpp @@ -89,12 +90,16 @@ set(JIT_TEST_SRCS ${JIT_TEST_ROOT}/test_script_profile.cpp ${JIT_TEST_ROOT}/test_shape_analysis.cpp ${JIT_TEST_ROOT}/test_jit_logging_levels.cpp + ${JIT_TEST_ROOT}/test_file_format.cpp ${JIT_TEST_ROOT}/test_flatbuffer.cpp ) if(USE_CUDA) - list(APPEND JIT_TEST_SRCS ${JIT_TEST_ROOT}/test_gpu.cpp) - list(APPEND JIT_TEST_SRCS ${JIT_TEST_ROOT}/test_gpu_shift.cpp) + list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu.cpp) + list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_fused_reduction.cpp) + list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_shift.cpp) + list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp) + list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_view.cpp) endif() add_executable(test_jit @@ -138,6 +143,10 @@ if(USE_CUDA) ${TORCH_CUDA_LIBRARIES}) target_compile_definitions(test_jit PRIVATE USE_CUDA) + # Suppress sign compare checks for NVFUSER JIT tests + if(NOT MSVC) + target_compile_options(test_jit PRIVATE -Wno-sign-compare) + endif() elseif(USE_ROCM) target_link_libraries(test_jit PRIVATE ${ROCM_HIPRTC_LIB} diff --git a/test/cpp/jit/source_range_test.cpp b/test/cpp/jit/source_range_test.cpp new file mode 100644 index 000000000000..16c7f850bf26 --- /dev/null +++ b/test/cpp/jit/source_range_test.cpp @@ -0,0 +1,51 @@ +#include +#include + +using namespace ::testing; +using namespace ::torch::jit; + +TEST(SourceRangeTest, test_find) { + std::vector> strings; + strings.push_back(std::make_shared("hello world")); + strings.push_back(std::make_shared("nihaoma")); + + std::vector pieces{*strings[0], *strings[1]}; + + StringCordView view(pieces, strings); + + auto x = view.find("rldni", 0); + EXPECT_EQ(x, 8); +} + +TEST(SourceRangeTest, test_substr) { + std::vector> strings; + strings.push_back(std::make_shared("hello world")); + strings.push_back(std::make_shared("nihaoma")); + + std::vector pieces{*strings[0], *strings[1]}; + + StringCordView view(pieces, strings); + + auto x = view.substr(4, 10).str(); + EXPECT_EQ(x, view.str().substr(4, 10)); + EXPECT_EQ(view.substr(0, view.size()).str(), view.str()); +} + +TEST(SourceRangeTest, test_iter) { + std::vector> strings; + 
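// Editor's note (not part of the original patch): StringCordView stitches the
// two pieces into one logical string, "hello worldnihaoma", which the find()
// and substr() tests above already rely on. Position 5 is therefore the space
// inside the first piece and position 13 falls inside the second piece, which
// is what the iterator checks below exercise.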
strings.push_back(std::make_shared("hello world")); + strings.push_back(std::make_shared("nihaoma")); + + std::vector pieces{*strings[0], *strings[1]}; + + StringCordView view(pieces, strings); + + auto iter = view.iter_for_pos(5); + EXPECT_EQ(*iter, ' '); + EXPECT_EQ(iter.rest_line(), " world"); + EXPECT_EQ(*iter.next_iter(), 'w'); + EXPECT_EQ(iter.pos(), 5); + + iter = view.iter_for_pos(13); + EXPECT_EQ(iter.pos(), 13); +} diff --git a/test/cpp/jit/test_add_if_then_else.cpp b/test/cpp/jit/test_add_if_then_else.cpp new file mode 100644 index 000000000000..4850e1ab425b --- /dev/null +++ b/test/cpp/jit/test_add_if_then_else.cpp @@ -0,0 +1,53 @@ +#include + +#include +#include +#include + +namespace torch { +namespace jit { + +TEST(AddIfThenElseOpTest, AddIfThenElseOpSimple) { + const auto src = R"IR( + graph(%cond: bool, %a: Tensor, %b: Tensor): + %result: Tensor = prim::If(%cond) + block0(): + -> (%a) + block1(): + -> (%b) + return (%result) + )IR"; + + auto graph = std::make_shared(); + parseIR(src, graph.get()); + EXPECT_TRUE(AddIfThenElseOp(graph)); + + testing::FileCheck() + .check_count("= prim::IfThenElse", 1, /*exactly*/ true) + ->check_count("= prim::If", 0, /*exactly*/ true) + ->run(*graph); +} + +TEST(AddIfThenElseOpTest, NoIfThenElseOpMultipleOutputs) { + const auto src = R"IR( + graph(%cond: bool, %a: Tensor, %b: Tensor): + %result1: Tensor, %result2: Tensor = prim::If(%cond) + block0(): + -> (%a, %b) + block1(): + -> (%b, %a) + return (%result1, %result2) + )IR"; + + auto graph = std::make_shared(); + parseIR(src, graph.get()); + EXPECT_FALSE(AddIfThenElseOp(graph)); + + testing::FileCheck() + .check_count("= prim::IfThenElse", 0, /*exactly*/ true) + ->check_count("= prim::If", 1, /*exactly*/ true) + ->run(*graph); +} + +} // namespace jit +} // namespace torch diff --git a/test/cpp/jit/test_autodiff.cpp b/test/cpp/jit/test_autodiff.cpp index e8bfefe64263..6a087adb63c8 100644 --- a/test/cpp/jit/test_autodiff.cpp +++ b/test/cpp/jit/test_autodiff.cpp @@ -289,14 +289,11 @@ class AutodiffRemoveUnusedGradientsTest : public ::testing::Test { void SetUp() override { prev_exec = getExecutorMode(); getExecutorMode() = true; - prev_profiling = getProfilingMode(); - getProfilingMode() = true; prev_inline_autodiff = getAutodiffSubgraphInlining(); debugSetAutodiffSubgraphInlining(false); } void TearDown() override { getExecutorMode() = prev_exec; - getProfilingMode() = prev_profiling; debugSetAutodiffSubgraphInlining(prev_inline_autodiff); } diff --git a/test/cpp/jit/test_backend.cpp b/test/cpp/jit/test_backend.cpp index a6961a2e4030..dd4df40d9c13 100644 --- a/test/cpp/jit/test_backend.cpp +++ b/test/cpp/jit/test_backend.cpp @@ -143,6 +143,38 @@ TEST(BackendTest, TestCompiler) { AT_ASSERT(mres.toTensor().equal(ref.toTensor())); } +TEST(BackendTest, TestCompilerWithStringTable) { + setShouldUseFormatWithStringTable(true); + Module m("m"); + m.define(R"( + def forward(self, x, h): + return x + h + )"); + + std::vector inputs; + inputs.emplace_back(2.0 * torch::ones({})); + inputs.emplace_back(1.0 * torch::ones({})); + auto ref = m.forward(inputs); + + c10::Dict compile_spec(StringType::get(), AnyType::get()); + c10::Dict fake_dict(StringType::get(), AnyType::get()); + fake_dict.insert("", ""); + compile_spec.insert("forward", fake_dict); + auto any_dict_ty = DictType::create(StringType::get(), AnyType::get()); + // lowered module + auto lm = torch::jit::detail::codegen_backend_module( + "backend_with_compiler_demo", m, compile_spec, any_dict_ty); + auto res = lm.forward(inputs); + 
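// Editor's note (not part of the original patch): setShouldUseFormatWithStringTable
// appears to toggle process-wide serialization state, which is why this test
// flips it back to false further down before its final assertion.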
AT_ASSERT(res.toTensor().equal(ref.toTensor())); + + std::stringstream ss; + lm._save_for_mobile(ss); + auto mlm = _load_for_mobile(ss); + auto mres = mlm.forward(inputs); + setShouldUseFormatWithStringTable(false); + AT_ASSERT(mres.toTensor().equal(ref.toTensor())); +} + TEST(BackendTest, TestComposite) { c10::Dict compile_spec(StringType::get(), AnyType::get()); c10::Dict fake_dict(StringType::get(), AnyType::get()); @@ -276,6 +308,7 @@ TEST(BackendTest, TestConsistencyOfCompositeWithSetStates) { c._save_for_mobile(ss); auto mc = _load_for_mobile(ss); auto res_mobile = mc.forward(inputs); + ss.seekg(0, ss.beg); // check if the methods names are always the same // by reloading the script module and saving it back as mobile @@ -383,6 +416,56 @@ Traceback of TorchScript (most recent call last): ASSERT_THROWS_WITH_MESSAGE(mlm.forward(inputs), error_pattern); } +TEST(BackendTestDebugInfo, TestCompilerWithStringTable) { + setShouldUseFormatWithStringTable(true); + Module m("m"); + m.define(R"( + def forward(self, x, h): + return x + h + )"); + + std::vector inputs; + inputs.emplace_back(torch::rand({2, 4})); + inputs.emplace_back(torch::rand({13, 9})); + + c10::Dict compile_spec(StringType::get(), AnyType::get()); + c10::Dict fake_dict(StringType::get(), AnyType::get()); + fake_dict.insert("", ""); + compile_spec.insert("forward", fake_dict); + auto any_dict_ty = DictType::create(StringType::get(), AnyType::get()); + // lowered module + auto lm = torch::jit::detail::codegen_backend_module( + "backend_with_compiler_demo", m, compile_spec, any_dict_ty); + + std::stringstream ss; + lm._save_for_mobile(ss, ExtraFilesMap(), true); + auto mlm = _load_for_mobile(ss); + std::string error_pattern = R"( + Module hierarchy:top(m)::.__loweredModule__(m)::forward.aten::add +Traceback of TorchScript (most recent call last): + File "", line 3, in + + def forward(self, x: Tensor, h: Tensor): + return self.__loweredModule__.forward(x, h) + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE + + File "", line 5, in forward + typed_inputs: List[Any] = [x, h, ] + if self.__backend.is_available() : + _0, = self.__backend.execute(self.__handles["forward"], typed_inputs) + ~~~~~~~~~~~~~~~~~~~~~~ <--- HERE + assert isinstance(_0, Tensor) + return _0 + File "", line 3, in + + def forward(self, x, h): + return x + h + ~~~~~ <--- HERE + )"; + setShouldUseFormatWithStringTable(false); + ASSERT_THROWS_WITH_MESSAGE(mlm.forward(inputs), error_pattern); +} + TEST(BackendTestDebugInfo, TestExceptionStackForCompilerWithModuleHierarchy) { Module a("A"); a.define(R"( diff --git a/test/cpp/jit/test_backend_compiler_lib.cpp b/test/cpp/jit/test_backend_compiler_lib.cpp index 0db8bd428e9e..372b08a392d1 100644 --- a/test/cpp/jit/test_backend_compiler_lib.cpp +++ b/test/cpp/jit/test_backend_compiler_lib.cpp @@ -2,7 +2,10 @@ #include #include #include + +#ifndef NO_PROFILING #include +#endif namespace torch { namespace jit { @@ -72,7 +75,12 @@ class BackendWithCompiler : public PyTorchBackendInterface { return true; } - // Since the actual compilation is done AOT, + // Since the actual compilation is done AOT for this backend, compile just + // forwards everything along. In a non toy setup this could grab information + // from that runtime that might be relevant to execute, such as build flags + // the resolution of the devices camera, or basically any runtime specific + // information that wouldnt be available server side where preprocess is + // called. 
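// Editor's note (not part of the original patch): for this demo backend the
// returned handles form a dict keyed by method name whose values are lists of
// (instruction, debug_handle) tuples -- exactly the shape execute() unpacks
// below (see the example annotation on its `handle` parameter).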
c10::impl::GenericDict compile( c10::IValue processed, c10::impl::GenericDict method_compile_spec) override { @@ -86,8 +94,14 @@ class BackendWithCompiler : public PyTorchBackendInterface { return c10::impl::toGenericDict(handles); } + // Function that actually executes the model in the backend. Here there is + // nothing to dispatch to, so the backend is implemented locally within + // execute and it only supports add, subtract, and constant. In a non toy + // backend you can imagine how this function could be used to actually + // dispatch the inputs to the relevant backend/device. c10::impl::GenericList execute( - c10::IValue handle, + c10::IValue + handle, // example: [('prim::Constant#1', 14), ('aten::add', 15)] c10::impl::GenericList inputs) override { TORCH_INTERNAL_ASSERT(inputs.size() == 2); c10::IValue val0 = inputs[0]; @@ -98,15 +112,20 @@ class BackendWithCompiler : public PyTorchBackendInterface { op_runtimes_us.reserve(handle.toList().size()); c10::List output_list; +#ifndef NO_PROFILING auto start_us = torch::profiler::impl::getTime() / 1000; +#endif for (const auto& token : handle.toList()) { IValue val = token; auto instruction = val.toTupleRef().elements()[0].toStringRef(); auto debug_handle = val.toTupleRef().elements()[1].toInt(); double const_val = 1.0; +#ifndef NO_PROFILING auto start_time_us = torch::profiler::impl::getTime() / 1000; +#endif try { if (instruction.rfind("prim::Constant", 0) == 0) { + // 15 is the length of 'prim::Constant#' the constant val comes after TORCH_CHECK( instruction.size() > 15, "Constant value is expected in ", @@ -146,10 +165,13 @@ class BackendWithCompiler : public PyTorchBackendInterface { } catch (c10::Error& e) { TORCH_DELEGATED_BACKEND_THROW(false, e.what(), debug_handle); } +#ifndef NO_PROFILING auto end_time_us = torch::profiler::impl::getTime() / 1000; auto duration = end_time_us - start_time_us; op_runtimes_us.emplace_back(duration, debug_handle, instruction); +#endif } +#ifndef NO_PROFILING for (const auto& tup : op_runtimes_us) { RECORD_BACKEND_EVENT_TO_EDGE_PROFILER( start_us, @@ -159,6 +181,7 @@ class BackendWithCompiler : public PyTorchBackendInterface { "test_backend"); start_us = start_us + std::get<0>(tup); } +#endif return c10::impl::toList(output_list); } }; diff --git a/test/cpp/jit/test_exception.cpp b/test/cpp/jit/test_exception.cpp new file mode 100644 index 000000000000..b6b3cbcd6793 --- /dev/null +++ b/test/cpp/jit/test_exception.cpp @@ -0,0 +1,159 @@ +/* + * We have a python unit test for exceptions in test/jit/test_exception.py . + * Add a CPP version here to verify that excepted exception types thrown from + * C++. This is hard to test in python code since C++ exceptions will be + * translated to python exceptions. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace torch { +namespace jit { + +namespace py = pybind11; + +TEST(TestException, TestAssertion) { + std::string pythonCode = R"PY( + def foo(): + raise AssertionError("An assertion failed") + )PY"; + auto cu_ptr = torch::jit::compile(pythonCode); + torch::jit::GraphFunction* gf = + (torch::jit::GraphFunction*)&cu_ptr->get_function("foo"); + std::cerr << "Graph is\n" << *gf->graph() << std::endl; + + bool is_jit_exception = false; + std::string message; + c10::optional exception_class; + try { + cu_ptr->run_method("foo"); + } catch (JITException& e) { + is_jit_exception = true; + message = e.what(); + exception_class = e.getPythonClassName(); + } + EXPECT_TRUE(is_jit_exception); + EXPECT_FALSE(exception_class); + EXPECT_TRUE( + message.find("RuntimeError: AssertionError: An assertion failed") != + std::string::npos); +} + +struct MyPythonExceptionValue : public torch::jit::SugaredValue { + explicit MyPythonExceptionValue(const py::object& exception_class) { + qualified_name_ = + (py::str(py::getattr(exception_class, "__module__", py::str(""))) + + py::str(".") + + py::str(py::getattr(exception_class, "__name__", py::str("")))) + .cast(); + } + + std::string kind() const override { + return "My Python exception"; + } + + // Simplified from PythonExceptionValue::call + std::shared_ptr call( + const torch::jit::SourceRange& loc, + torch::jit::GraphFunction& caller, + at::ArrayRef args, + at::ArrayRef kwargs, + size_t n_binders) override { + TORCH_CHECK(args.size() == 1); + Value* error_message = args.at(0).value(*caller.graph()); + Value* qualified_class_name = + insertConstant(*caller.graph(), qualified_name_, loc); + return std::make_shared( + error_message, qualified_class_name); + } + + private: + std::string qualified_name_; +}; + +class SimpleResolver : public torch::jit::Resolver { + public: + explicit SimpleResolver() {} + + std::shared_ptr resolveValue( + const std::string& name, + torch::jit::GraphFunction& m, + const torch::jit::SourceRange& loc) override { + // follows toSugaredValue (toSugaredValue is defined in caffe2:_C which is + // a python extension. We can not add that as a cpp_binary's dep) + if (name == "SimpleValueError") { + py::object obj = py::globals()["SimpleValueError"]; + return std::make_shared(obj); + } + TORCH_CHECK(false, "resolveValue: can not resolve '", name, "{}'"); + } + + torch::jit::TypePtr resolveType( + const std::string& name, + const torch::jit::SourceRange& loc) override { + return nullptr; + } +}; + +/* + * - The python source code parsing for TorchScript here is learned from + * torch::jit::compile. + * - The code only parses one Def. If there are multiple in the code, those + * except the first one are skipped. + */ +TEST(TestException, TestCustomException) { + py::scoped_interpreter guard{}; + py::exec(R"PY( + class SimpleValueError(ValueError): + def __init__(self, message): + super(SimpleValueError, self).__init__(message) + )PY"); + + std::string pythonCode = R"PY( + def foo(): + raise SimpleValueError("An assertion failed") + )PY"; + + torch::jit::Parser p( + std::make_shared(pythonCode, "", 1)); + auto def = torch::jit::Def(p.parseFunction(/*is_method=*/false)); + std::cerr << "Def is:\n" << def << std::endl; + auto cu = std::make_shared(); + (void)cu->define( + c10::nullopt, + {}, + {}, + {def}, + // class PythonResolver is defined in + // torch/csrc/jit/python/script_init.cpp. It's not in a header file so I + // can not use it. 
Create a SimpleResolver insteand + {std::make_shared()}, + nullptr); + torch::jit::GraphFunction* gf = + (torch::jit::GraphFunction*)&cu->get_function("foo"); + std::cerr << "Graph is\n" << *gf->graph() << std::endl; + bool is_jit_exception = false; + c10::optional exception_class; + std::string message; + try { + cu->run_method("foo"); + } catch (JITException& e) { + is_jit_exception = true; + exception_class = e.getPythonClassName(); + message = e.what(); + } + EXPECT_TRUE(is_jit_exception); + EXPECT_EQ("__main__.SimpleValueError", *exception_class); + EXPECT_TRUE( + message.find("__main__.SimpleValueError: An assertion failed") != + std::string::npos); +} + +} // namespace jit +} // namespace torch diff --git a/test/cpp/jit/test_file_format.cpp b/test/cpp/jit/test_file_format.cpp new file mode 100644 index 000000000000..a3571cbf30b4 --- /dev/null +++ b/test/cpp/jit/test_file_format.cpp @@ -0,0 +1,124 @@ +#include + +#include + +#include + +// Tests go in torch::jit +namespace torch { +namespace jit { + +TEST(FileFormatTest, IdentifiesFlatbufferStream) { + // Create data whose initial bytes look like a Flatbuffer stream. + std::stringstream data; + data << "abcd" // First four bytes don't matter. + << "PTMF" // Magic string. + << "efgh"; // Trailing bytes don't matter. + + // The data should be identified as Flatbuffer. + EXPECT_EQ(getFileFormat(data), FileFormat::FlatbufferFileFormat); +} + +TEST(FileFormatTest, IdentifiesZipStream) { + // Create data whose initial bytes look like a ZIP stream. + std::stringstream data; + data << "PK\x03\x04" // Magic string. + << "abcd" // Trailing bytes don't matter. + << "efgh"; + + // The data should be identified as ZIP. + EXPECT_EQ(getFileFormat(data), FileFormat::ZipFileFormat); +} + +TEST(FileFormatTest, FlatbufferTakesPrecedence) { + // Since the Flatbuffer and ZIP magic bytes are at different offsets, + // the same data could be identified as both. Demonstrate that Flatbuffer + // takes precedence. (See details in file_format.h) + std::stringstream data; + data << "PK\x03\x04" // ZIP magic string. + << "PTMF" // Flatbuffer magic string. + << "abcd"; // Trailing bytes don't matter. + + // The data should be identified as Flatbuffer. + EXPECT_EQ(getFileFormat(data), FileFormat::FlatbufferFileFormat); +} + +TEST(FileFormatTest, HandlesUnknownStream) { + // Create data that doesn't look like any known format. + std::stringstream data; + data << "abcd" + << "efgh" + << "ijkl"; + + // The data should be classified as unknown. + EXPECT_EQ(getFileFormat(data), FileFormat::UnknownFileFormat); +} + +TEST(FileFormatTest, ShortStreamIsUnknown) { + // Create data with fewer than kFileFormatHeaderSize (8) bytes. + std::stringstream data; + data << "ABCD"; + + // The data should be classified as unknown. + EXPECT_EQ(getFileFormat(data), FileFormat::UnknownFileFormat); +} + +TEST(FileFormatTest, EmptyStreamIsUnknown) { + // Create an empty stream. + std::stringstream data; + + // The data should be classified as unknown. + EXPECT_EQ(getFileFormat(data), FileFormat::UnknownFileFormat); +} + +TEST(FileFormatTest, BadStreamIsUnknown) { + // Create a stream with valid Flatbuffer data. + std::stringstream data; + data << "abcd" + << "PTMF" // Flatbuffer magic string. + << "efgh"; + + // Demonstrate that the data would normally be identified as Flatbuffer. + EXPECT_EQ(getFileFormat(data), FileFormat::FlatbufferFileFormat); + + // Mark the stream as bad, and demonstrate that it is in an error state. 
+ data.setstate(std::stringstream::badbit); + // Demonstrate that the stream is in an error state. + EXPECT_FALSE(data.good()); + + // The data should now be classified as unknown. + EXPECT_EQ(getFileFormat(data), FileFormat::UnknownFileFormat); +} + +TEST(FileFormatTest, StreamOffsetIsObservedAndRestored) { + // Create data with a Flatbuffer header at a non-zero offset into the stream. + std::stringstream data; + // Add initial padding. + data << "PADDING"; + size_t offset = data.str().size(); + // Add a valid Flatbuffer header. + data << "abcd" + << "PTMF" // Flatbuffer magic string. + << "efgh"; + // Seek just after the padding. + data.seekg(static_cast(offset), data.beg); + // Demonstrate that the stream points to the beginning of the Flatbuffer data, + // not to the padding. + EXPECT_EQ(data.peek(), 'a'); + + // The data should be identified as Flatbuffer. + EXPECT_EQ(getFileFormat(data), FileFormat::FlatbufferFileFormat); + + // The stream position should be where it was before identification. + EXPECT_EQ(offset, data.tellg()); +} + +TEST(FileFormatTest, HandlesMissingFile) { + // A missing file should be classified as unknown. + EXPECT_EQ( + getFileFormat("NON_EXISTENT_FILE_4965c363-44a7-443c-983a-8895eead0277"), + FileFormat::UnknownFileFormat); +} + +} // namespace jit +} // namespace torch diff --git a/test/cpp/jit/test_flatbuffer.cpp b/test/cpp/jit/test_flatbuffer.cpp index 25992fa106e7..07bd42c1b141 100644 --- a/test/cpp/jit/test_flatbuffer.cpp +++ b/test/cpp/jit/test_flatbuffer.cpp @@ -19,19 +19,25 @@ #include #include #include +#include #include #include #include +#include #include #include // Tests go in torch::jit namespace torch { namespace jit { -mobile::Module parse_mobile_module(void* data, size_t) { +mobile::Module parse_mobile_module( + void* data, + size_t, + bool should_copy_tensor_memory = false) { auto* flatbuffer_module = mobile::serialization::GetMutableModule(data); - return initialize_mobile_module(flatbuffer_module); + return initialize_mobile_module( + flatbuffer_module, c10::nullopt, should_copy_tensor_memory); } TEST(FlatbufferTest, UpsampleNearest2d) { @@ -62,6 +68,37 @@ TEST(FlatbufferTest, UpsampleNearest2d) { ASSERT_TRUE(resd2.equal(refd)); } +TEST(FlatbufferTest, UpsampleNearest2dWithCopyTensorMemory) { + Module m("m"); + m.define(R"( + def forward(self, input: Tensor, scale:float): + return torch.upsample_nearest2d(input, [1, 1], float(scale), float(scale)) + )"); + + std::vector inputs; + inputs.emplace_back(torch::rand({1, 3, 128, 128})); + inputs.emplace_back(at::Scalar(2.0)); + auto ref = m.forward(inputs); + + CompilationOptions options; + mobile::Module bc = jitModuleToMobile(m, options); + IValue res; + res = bc.forward(inputs); + + auto resd = res.toTensor(); + auto refd = ref.toTensor(); + ASSERT_TRUE(resd.equal(refd)); + + auto buff = save_mobile_module_to_bytes(bc); + mobile::Module bc2 = parse_mobile_module(buff.data(), buff.size(), true); + + buff = flatbuffers::DetachedBuffer(); + + auto res2 = bc2.forward(inputs); + auto resd2 = res2.toTensor(); + ASSERT_TRUE(resd2.equal(refd)); +} + TEST(FlatbufferTest, CheckAttrAccess) { Module m("m"); m.register_attribute("mobile_optimized", BoolType::get(), true); @@ -137,6 +174,67 @@ TEST(FlatbufferTest, MethodInvocation) { // NOLINT (use =delete in gtest) } } +#if defined(ENABLE_FLATBUFFER) && !defined(FB_XPLAT_BUILD) +TEST(FlatbufferTest, FlatbufferBackPortTest) { + Module m("m"); + m.define(R"( + def forward(self, input: Tensor, scale:float): + return torch.upsample_nearest2d(input, [1, 1], 
float(scale), float(scale)) + )"); + std::stringstream ss; + m._save_for_mobile(ss, {}, false, true); + + std::stringstream oss; + bool backPortSuccess = _backport_for_mobile(ss, oss, 5); + ASSERT_TRUE(backPortSuccess); +} +#endif // defined(ENABLE_FLATBUFFER) && !defined(FB_XPLAT_BUILD) + +TEST(FlatbufferTest, ExtraFiles) { + const auto script = R"JIT( + def forward(self): + x = torch.rand(5, 5) + x = x.mm(x) + return x + )JIT"; + + auto module = + std::make_shared("Module", std::make_shared()); + module->define(script); + std::ostringstream oss; + std::unordered_map extra_files; + extra_files["metadata.json"] = "abc"; + extra_files["mobile_info.json"] = "{\"key\": 23}"; + + std::unordered_map loaded_extra_files; +#if defined ENABLE_FLATBUFFER + std::stringstream ss; + module->_save_for_mobile(ss, extra_files, true, /*use_flatbuffer=*/true); + + loaded_extra_files["metadata.json"] = ""; + auto mobile_module = _load_for_mobile(ss, c10::nullopt, loaded_extra_files); + + ASSERT_EQ(loaded_extra_files["metadata.json"], "abc"); + ASSERT_EQ(loaded_extra_files["mobile_info.json"], "{\"key\": 23}"); + + // load it twice using the same stream + auto mobile_module2 = _load_for_mobile(ss, c10::nullopt, loaded_extra_files); +#else + CompilationOptions options; + mobile::Module bc = jitModuleToMobile(*module, options); + auto buff = save_mobile_module_to_bytes(bc, extra_files); + + loaded_extra_files["metadata.json"] = ""; + auto* flatbuffer_module = + mobile::serialization::GetMutableModule(buff.data()); + + parseExtraFiles(flatbuffer_module, loaded_extra_files); +#endif + + ASSERT_EQ(loaded_extra_files["metadata.json"], "abc"); + ASSERT_EQ(loaded_extra_files["mobile_info.json"], "{\"key\": 23}"); +} + TEST(FlatbufferTest, Conv) { auto s = std::getenv("PYTORCH_TEST_WITH_TSAN"); if (s && strcmp(s, "1") == 0) @@ -179,6 +277,50 @@ TEST(FlatbufferTest, Conv) { outputref[0][0][0][0].item() == output[0][0][0][0].item()); } +TEST(FlatbufferTest, ConvWithCopyTensorMemory) { + auto s = std::getenv("PYTORCH_TEST_WITH_TSAN"); + if (s && strcmp(s, "1") == 0) + return; + + std::vector inputs; + + Module m("m"); + m.register_parameter("weight", torch::ones({20, 1, 5, 5}), false); + m.register_parameter("bias", torch::ones({20}), false); + m.define(R"( + def forward(self, input): + return torch._convolution(input, self.weight, self.bias, [1, 1], [0, 0], [1, 1], False, [0, 0], 1, False, False, True, True) + )"); + + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,modernize-use-emplace) + inputs.push_back(torch::ones({1, 1, 28, 28})); + + auto outputref = m.forward(inputs).toTensor(); + + CompilationOptions options; + mobile::Module bc = jitModuleToMobile(m, options); + IValue res; + for (int i = 0; i < 3; ++i) { + res = bc.get_method("forward")(inputs); + } + auto output = res.toTensor(); + AT_ASSERT(outputref.dim() == output.dim()); + AT_ASSERT( + outputref[0][0][0][0].item() == output[0][0][0][0].item()); + + auto buff = save_mobile_module_to_bytes(bc); + mobile::Module bc2 = parse_mobile_module(buff.data(), buff.size(), true); + buff = flatbuffers::DetachedBuffer(); + + for (int i = 0; i < 3; ++i) { + res = bc2.get_method("forward")(inputs); + } + output = res.toTensor(); + AT_ASSERT(outputref.dim() == output.dim()); + AT_ASSERT( + outputref[0][0][0][0].item() == output[0][0][0][0].item()); +} + TEST(FlatbufferTest, Inline) { Module m("m"); m.define(R"JIT( @@ -204,6 +346,32 @@ TEST(FlatbufferTest, Inline) { AT_ASSERT(output.toTensor().item() == 7.0); } +TEST(FlatbufferTest, InlineWithCopyTensorMemory) { + 
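// Editor's note (not part of the original patch): like the other
// *WithCopyTensorMemory tests in this file, this variant reloads the module
// with should_copy_tensor_memory=true and then drops the flatbuffer
// (buff = flatbuffers::DetachedBuffer()) before running it, checking that
// tensor storage was copied out of the buffer rather than aliased into it.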
Module m("m"); + m.define(R"JIT( + def foo1(self, x): + return x + 1 + + def foo2(self, x): + return self.foo1(x) + 2 + + def foo3(self, x): + return self.foo2(x) + 3 + )JIT"); + CompilationOptions options; + mobile::Module bc = jitModuleToMobile(m, options); + std::vector inputs({torch::ones({})}); + auto output = bc.get_method("foo3")(inputs); + AT_ASSERT(output.toTensor().item() == 7.0); + + auto buff = save_mobile_module_to_bytes(bc); + mobile::Module bc2 = parse_mobile_module(buff.data(), buff.size(), true); + buff = flatbuffers::DetachedBuffer(); + std::vector inputs2({torch::ones({})}); + output = bc2.get_method("foo3")(inputs2); + AT_ASSERT(output.toTensor().item() == 7.0); +} + TEST(FlatbufferTest, Tuple) { Module m("m"); m.define(R"JIT( @@ -1104,5 +1272,534 @@ TEST(FlatbufferTest, OperatorTest2) { // NOLINT (use =delete in gtest) } } +Module jitModuleFromBuffer(void* data) { + auto* flatbuffer_module = mobile::serialization::GetMutableModule(data); + FlatbufferLoader loader; + mobile::Module mobilem = loader.parseModule(flatbuffer_module); + ExtraFilesMap files; + std::vector constants; + loader.extractJitSourceAndConstants(&files, &constants); + return jitModuleFromSourceAndConstants( + mobilem._ivalue(), files, constants, 8); +} + +#if defined(ENABLE_FLATBUFFER) +TEST(TestSourceFlatbuffer, UpsampleNearest2d) { + Module m("m"); + m.define(R"( + def forward(self, input: Tensor, scale:float): + return torch.upsample_nearest2d(input, [1, 1], float(scale), float(scale)) + )"); + + std::vector inputs; + inputs.emplace_back(torch::rand({1, 3, 128, 128})); + inputs.emplace_back(at::Scalar(2.0)); + auto ref = m.forward(inputs); + + std::stringstream ss; + m._save_for_mobile(ss, {}, false, /*use_fatbuffer=*/true); + auto mm = _load_for_mobile(ss); + auto m2 = load(ss); + + auto res = m2.forward(inputs); + auto resm = mm.forward(inputs); + + auto resd = res.toTensor(); + auto refd = ref.toTensor(); + auto resmd = resm.toTensor(); + ASSERT_TRUE(resd.equal(refd)); + ASSERT_TRUE(resmd.equal(refd)); +} + +TEST(TestSourceFlatbuffer, CheckAttrAccess) { + Module m("m"); + m.register_attribute("mobile_optimized", BoolType::get(), true); + auto data = save_jit_module_to_bytes(m); + Module m2 = jitModuleFromBuffer(data.data()); + bool mobile_optimized = m2.attr("mobile_optimized", false).toBool(); + AT_ASSERT(mobile_optimized); + mobile::Module m3 = parse_mobile_module(data.data(), data.size()); + mobile_optimized = m3.attr("mobile_optimized", false).toBool(); + AT_ASSERT(mobile_optimized); +} + +TEST(TestSourceFlatbuffer, + MethodInvocation) { // NOLINT (use =delete in gtest) + const std::vector test_programs{ + // test invoking a method with default parameter + R"( + def test_func(self, x, b : int = 4): + return self.foo + x + b + )", + // inner method call with default parameter (gets inlined) + R"( + def add_with_default_arg(self, x, b : int = 4): + return self.foo + x + b + def test_func(self, x): + return self.add_with_default_arg(x) # invoke method w/ default arg + )", + // simple method call + R"( + def test_func(self, x): + b = 4 + return self.foo + x + b + )", + }; + for (const auto& test_program : test_programs) { + Module m("m"); + m.register_parameter("foo", torch::ones({}), false); + m.define(test_program); + + const int fortyTwo = 42; // (keep linter happy) + auto minput = fortyTwo * torch::ones({}); + auto ref = m.run_method("test_func", minput); + + auto data = save_jit_module_to_bytes(m); + Module m2 = jitModuleFromBuffer(data.data()); + const auto& test_func = 
m2.get_method("test_func"); + IValue res; + for (int i = 0; i < 3; ++i) { + res = test_func({minput}); + } + auto resd = res.toTensor().item(); + auto refd = ref.toTensor().item(); + AT_ASSERT(resd == refd); + + mobile::Module m3 = parse_mobile_module(data.data(), data.size()); + const auto& test_func3 = m3.get_method("test_func"); + for (int i = 0; i < 3; ++i) { + res = test_func3({minput}); + } + resd = res.toTensor().item(); + refd = ref.toTensor().item(); + AT_ASSERT(resd == refd); + } +} +#endif + +#if !defined FB_XPLAT_BUILD +// The following test run in fbcode only +TEST(FlatbufferUpgraderTest, DivTensorV2) { + std::string filePath(__FILE__); + auto test_model_file = filePath.substr(0, filePath.find_last_of("/\\") + 1); + test_model_file.append("upgrader_models/test_versioned_div_tensor_v2.ptl.ff"); + /* + (('__torch__.MyModule.forward', + (('instructions', + (('STOREN', 1, 3), + ('DROPR', 1, 0), + ('LOAD', 2, 0), + ('LOAD', 3, 0), + ('OP', 0, 0), + ('LOAD', 2, 0), + ('LOAD', 3, 0), + ('OP', 1, 0), + ('MOVE', 2, 0), + ('MOVE', 3, 0), + ('OP', 2, 0), + ('TUPLE_CONSTRUCT', 3, 0), + ('RET', 0, 0))), + ('operators', + (('aten::div', 'Tensor'), + ('aten::div', 'Tensor'), + ('aten::div', 'Tensor'))), + ('constants', ()), + ('types', ()), + ('register_size', 3))),) + + */ + mobile::Module m_module = load_mobile_module_from_file(test_model_file); + auto intrsuction_list = + m_module.get_method("forward").function().get_code().instructions_; + uint64_t number_of_call_instruction = 0; + for (auto& instruction : intrsuction_list) { + number_of_call_instruction += (instruction.op == OpCode::CALL); + } + // 3 operators will use upgrader + ASSERT_EQ(number_of_call_instruction, 3); + + std::vector inputs = { + IValue(6 * torch::ones({1})), IValue(3 * torch::ones({1}))}; + auto actual_output = m_module.forward(inputs); + auto expect_output = 2.0 * torch::ones({1}); + auto actual_output_list = actual_output.toTuple()->elements(); + ASSERT_TRUE(actual_output_list[0].toTensor().equal(expect_output)); +} + +TEST(FlatbufferUpgraderTest, DivTensorOutV2) { + std::string filePath(__FILE__); + auto test_model_file = filePath.substr(0, filePath.find_last_of("/\\") + 1); + test_model_file.append( + "upgrader_models/test_versioned_div_tensor_out_v2.ptl.ff"); + /* + (('__torch__.MyModule.forward', + (('instructions', + (('STOREN', 1, 4), + ('DROPR', 1, 0), + ('MOVE', 2, 0), + ('MOVE', 3, 0), + ('MOVE', 4, 0), + ('OP', 0, 0), + ('RET', 0, 0))), + ('operators', (('aten::div', 'out'),)), + ('constants', ()), + ('types', ()), + ('register_size', 4))),) + */ + mobile::Module m_module = load_mobile_module_from_file(test_model_file); + + auto intrsuction_list = + m_module.get_method("forward").function().get_code().instructions_; + uint64_t number_of_call_instruction = 0; + for (auto& instruction : intrsuction_list) { + number_of_call_instruction += (instruction.op == OpCode::CALL); + } + // One operator will use upgrader + ASSERT_EQ(number_of_call_instruction, 1); + + std::vector inputs{ + IValue(6 * torch::ones({1})), + IValue(3 * torch::ones({1})), + IValue(torch::empty({1}))}; + m_module.forward(inputs); + auto expect_output = 2.0 * torch::ones({1}); + auto actual_output = inputs[2].toTensor(); + // The out argument will be overwritten with the output + ASSERT_TRUE(actual_output.equal(expect_output)); +} + +TEST(FlatbufferUpgraderTest, DivTensorInplaceV2) { + std::string filePath(__FILE__); + auto test_model_file = filePath.substr(0, filePath.find_last_of("/\\") + 1); + test_model_file.append( + 
"upgrader_models/test_versioned_div_tensor_inplace_v2.ptl.ff"); + /* + (('__torch__.MyModule.forward', + (('instructions', + (('STOREN', 1, 3), + ('DROPR', 1, 0), + ('MOVE', 2, 0), + ('MOVE', 3, 0), + ('OP', 0, 0), + ('RET', 0, 0))), + ('operators', (('aten::div_', 'Tensor'),)), + ('constants', ()), + ('types', ()), + ('register_size', 3))),) + */ + mobile::Module m_module = load_mobile_module_from_file(test_model_file); + + auto intrsuction_list = + m_module.get_method("forward").function().get_code().instructions_; + uint64_t number_of_call_instruction = 0; + for (auto& instruction : intrsuction_list) { + number_of_call_instruction += (instruction.op == OpCode::CALL); + } + // One operator will use upgrader + ASSERT_EQ(number_of_call_instruction, 1); + + std::vector inputs{ + IValue(6 * torch::ones({1})), IValue(3 * torch::ones({1}))}; + m_module.forward(inputs); + auto expect_output = 2.0 * torch::ones({1}); + auto actual_output = inputs[0].toTensor(); + // The out argument will be overwritten with the output + ASSERT_TRUE(actual_output.equal(expect_output)); +} + +TEST(FlatbufferUpgraderTest, DivScalarFloatV2) { + std::string filePath(__FILE__); + auto test_model_file = filePath.substr(0, filePath.find_last_of("/\\") + 1); + test_model_file.append( + "upgrader_models/test_versioned_div_scalar_float_v2.ptl.ff"); + /* + (('__torch__.MyModuleFloat.forward', + (('instructions', + (('STOREN', 1, 3), + ('DROPR', 1, 0), + ('MOVE', 2, 0), + ('MOVE', 3, 0), + ('OP', 0, 0), + ('RET', 0, 0))), + ('operators', (('aten::div', 'Scalar'),)), + ('constants', ()), + ('types', ()), + ('register_size', 3))),) + */ + + mobile::Module m_module = load_mobile_module_from_file(test_model_file); + + auto intrsuction_list = + m_module.get_method("forward").function().get_code().instructions_; + uint64_t number_of_call_instruction = 0; + for (auto& instruction : intrsuction_list) { + number_of_call_instruction += (instruction.op == OpCode::CALL); + } + // One operator will use upgrader + ASSERT_EQ(number_of_call_instruction, 1); + + std::vector inputs{IValue(6 * torch::ones({1})), IValue(3.0)}; + auto output = m_module.forward(inputs); + auto expect_output = 2.0 * torch::ones({1}); + auto actual_output = output.toTensor(); + + // The out argument will be overwritten with the output + ASSERT_TRUE(actual_output.equal(expect_output)); +} + +TEST(FlatbufferUpgraderTest, DivScalarReciprocalFloatV2) { + std::string filePath(__FILE__); + auto test_model_file = filePath.substr(0, filePath.find_last_of("/\\") + 1); + test_model_file.append( + "upgrader_models/test_versioned_div_scalar_reciprocal_float_v2.ptl.ff"); + /* + (('__torch__.MyModuleFloat.forward', + (('instructions', + (('STOREN', 1, 3), + ('DROPR', 1, 0), + ('MOVE', 2, 0), + ('OP', 0, 0), + ('MOVE', 3, 0), + ('OP', 1, 0), + ('RET', 0, 0))), + ('operators', (('aten::reciprocal', ''), ('aten::mul', 'Scalar'))), + ('constants', ()), + ('types', ()), + ('register_size', 3))),) + */ + mobile::Module m_module = load_mobile_module_from_file(test_model_file); + + auto intrsuction_list = + m_module.get_method("forward").function().get_code().instructions_; + uint64_t number_of_call_instruction = 0; + for (auto& instruction : intrsuction_list) { + number_of_call_instruction += (instruction.op == OpCode::CALL); + } + // No operator will use upgrader + ASSERT_EQ(number_of_call_instruction, 0); + + std::vector inputs{IValue(6 * torch::ones({1})), IValue(3.0)}; + auto output = m_module.forward(inputs); + auto expect_output = 0.5 * torch::ones({1}); + auto actual_output = 
output.toTensor(); + // The out argument will be overwritten with the output + ASSERT_TRUE(actual_output.equal(expect_output)); +} + +TEST(FlatbufferUpgraderTest, DivScalarReciprocalIntV2) { + std::string filePath(__FILE__); + auto test_model_file = filePath.substr(0, filePath.find_last_of("/\\") + 1); + test_model_file.append( + "upgrader_models/test_versioned_div_scalar_reciprocal_int_v2.ptl.ff"); + /* + (('__torch__.MyModuleInt.forward', + (('instructions', + (('STOREN', 1, 3), + ('DROPR', 1, 0), + ('MOVE', 2, 0), + ('OP', 0, 0), + ('MOVE', 3, 0), + ('OP', 1, 0), + ('RET', 0, 0))), + ('operators', (('aten::reciprocal', ''), ('aten::mul', 'Scalar'))), + ('constants', ()), + ('types', ()), + ('register_size', 3))),) + */ + mobile::Module m_module = load_mobile_module_from_file(test_model_file); + + auto intrsuction_list = + m_module.get_method("forward").function().get_code().instructions_; + uint64_t number_of_call_instruction = 0; + for (auto& instruction : intrsuction_list) { + number_of_call_instruction += (instruction.op == OpCode::CALL); + } + // No operator will use upgrader + ASSERT_EQ(number_of_call_instruction, 0); + + std::vector inputs{IValue(6 * torch::ones({1})), IValue(3.0)}; + auto output = m_module.forward(inputs); + auto expect_output = 0.5 * torch::ones({1}); + auto actual_output = output.toTensor(); + + // The out argument will be overwritten with the output + ASSERT_TRUE(actual_output.equal(expect_output)); +} + +TEST(FlatbufferUpgraderTest, DivScalarScalarV2) { + std::string filePath(__FILE__); + auto test_model_file = filePath.substr(0, filePath.find_last_of("/\\") + 1); + test_model_file.append( + "upgrader_models/test_versioned_div_scalar_scalar_v2.ptl.ff"); + /* + (('__torch__.MyModule.forward', + (('instructions', + (('STOREN', 1, 5), + ('DROPR', 1, 0), + ('LOAD', 2, 0), + ('LOAD', 3, 0), + ('OP', 0, 0), + ('MOVE', 2, 0), + ('LOAD', 4, 0), + ('OP', 1, 0), + ('LOAD', 3, 0), + ('MOVE', 4, 0), + ('OP', 2, 0), + ('MOVE', 3, 0), + ('MOVE', 5, 0), + ('OP', 3, 0), + ('TUPLE_CONSTRUCT', 4, 0), + ('RET', 0, 0))), + ('operators', + (('aten::div', ''), + ('aten::div', 'float'), + ('aten::div', ''), + ('aten::div', 'int'))), + ('constants', ()), + ('types', ()), + ('register_size', 5))),) + */ + mobile::Module m_module = load_mobile_module_from_file(test_model_file); + auto intrsuction_list = + m_module.get_method("forward").function().get_code().instructions_; + uint64_t number_of_call_instruction = 0; + for (auto& instruction : intrsuction_list) { + number_of_call_instruction += (instruction.op == OpCode::CALL); + } + // No operator will use upgrader + ASSERT_EQ(number_of_call_instruction, 0); + + std::vector inputs{IValue(20.0), IValue(10), IValue(2.0), IValue(5)}; + auto output = m_module.forward(inputs); + auto output_list = output.toTupleRef().elements(); + auto expect_output = std::vector( + {IValue(2.0), IValue(10.0), IValue(5.0), IValue(2.0)}); + // auto actual_output = output.toTensor(); + for (size_t i = 0; i < expect_output.size(); i++) { + ASSERT_EQ(output_list[i], expect_output[i]); + } +} + +TEST(FlatbufferUpgraderTest, DivScalarIntV2) { + std::string filePath(__FILE__); + auto test_model_file = filePath.substr(0, filePath.find_last_of("/\\") + 1); + test_model_file.append( + "upgrader_models/test_versioned_div_scalar_int_v2.ptl.ff"); + /* + (('__torch__.MyModuleInt.forward', + (('instructions', + (('STOREN', 1, 3), + ('DROPR', 1, 0), + ('MOVE', 2, 0), + ('MOVE', 3, 0), + ('OP', 0, 0), + ('RET', 0, 0))), + ('operators', (('aten::div', 'Scalar'),)), + 
('constants', ()), + ('types', ()), + ('register_size', 3))),) + */ + mobile::Module m_module = load_mobile_module_from_file(test_model_file); + + auto intrsuction_list = + m_module.get_method("forward").function().get_code().instructions_; + uint64_t number_of_call_instruction = 0; + for (auto& instruction : intrsuction_list) { + number_of_call_instruction += (instruction.op == OpCode::CALL); + } + // One operator will use upgrader + ASSERT_EQ(number_of_call_instruction, 1); + + std::vector inputs{IValue(6 * torch::ones({1})), IValue(3)}; + auto output = m_module.forward(inputs); + auto expect_output = 2.0 * torch::ones({1}); + auto actual_output = output.toTensor(); + + // The out argument will be overwritten with the output + ASSERT_TRUE(actual_output.equal(expect_output)); +} + +TEST(FlatbufferUpgraderTest, DivScalarInplaceFloatV2) { + std::string filePath(__FILE__); + auto test_model_file = filePath.substr(0, filePath.find_last_of("/\\") + 1); + test_model_file.append( + "upgrader_models/test_versioned_div_scalar_inplace_float_v2.ptl.ff"); + /* + (('__torch__.MyModuleFloat.forward', + (('instructions', + (('STOREN', 1, 3), + ('DROPR', 1, 0), + ('MOVE', 2, 0), + ('MOVE', 3, 0), + ('OP', 0, 0), + ('RET', 0, 0))), + ('operators', (('aten::div_', 'Scalar'),)), + ('constants', ()), + ('types', ()), + ('register_size', 3))),) + */ + + mobile::Module m_module = load_mobile_module_from_file(test_model_file); + + auto intrsuction_list = + m_module.get_method("forward").function().get_code().instructions_; + uint64_t number_of_call_instruction = 0; + for (auto& instruction : intrsuction_list) { + number_of_call_instruction += (instruction.op == OpCode::CALL); + } + // One operator will use upgrader + ASSERT_EQ(number_of_call_instruction, 1); + + std::vector inputs{IValue(6 * torch::ones({1})), IValue(3.0)}; + auto output = m_module.forward(inputs); + auto expect_output = 2.0 * torch::ones({1}); + auto actual_output = output.toTensor(); + + // The out argument will be overwritten with the output + ASSERT_TRUE(actual_output.equal(expect_output)); +} + +TEST(FlatbufferUpgraderTest, DivScalarInplaceIntV2) { + std::string filePath(__FILE__); + auto test_model_file = filePath.substr(0, filePath.find_last_of("/\\") + 1); + test_model_file.append( + "upgrader_models/test_versioned_div_scalar_inplace_int_v2.ptl.ff"); + /* + (('__torch__.MyModuleInt.forward', + (('instructions', + (('STOREN', 1, 3), + ('DROPR', 1, 0), + ('MOVE', 2, 0), + ('MOVE', 3, 0), + ('OP', 0, 0), + ('RET', 0, 0))), + ('operators', (('aten::div_', 'Scalar'),)), + ('constants', ()), + ('types', ()), + ('register_size', 3))),) + */ + + mobile::Module m_module = load_mobile_module_from_file(test_model_file); + + auto intrsuction_list = + m_module.get_method("forward").function().get_code().instructions_; + uint64_t number_of_call_instruction = 0; + for (auto& instruction : intrsuction_list) { + number_of_call_instruction += (instruction.op == OpCode::CALL); + } + // One operator will use upgrader + ASSERT_EQ(number_of_call_instruction, 1); + + std::vector inputs{IValue(6 * torch::ones({1})), IValue(3)}; + auto output = m_module.forward(inputs); + auto expect_output = 2.0 * torch::ones({1}); + auto actual_output = output.toTensor(); + + // The out argument will be overwritten with the output + ASSERT_TRUE(actual_output.equal(expect_output)); +} + +#endif // !defined(FB_XPLAT_BUILD) + } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_fuser.cpp b/test/cpp/jit/test_fuser.cpp index 18d5b3cc33b3..87261a8b1ce7 100644 --- 
a/test/cpp/jit/test_fuser.cpp +++ b/test/cpp/jit/test_fuser.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -54,7 +55,19 @@ namespace torch { namespace jit { -TEST(FuserTest, TestSimple_CUDA) { +class FuserTest : public ::testing::Test { + void SetUp() override { + old_nvfuser_value_ = fuser::cuda::setEnabled(false); + } + void TearDown() override { + fuser::cuda::setEnabled(old_nvfuser_value_); + } + + private: + bool old_nvfuser_value_; +}; + +TEST_F(FuserTest, TestSimple_CUDA) { #if defined(FBCODE_CAFFE2) return; #endif @@ -77,7 +90,7 @@ TEST(FuserTest, TestSimple_CUDA) { ASSERT_EQ(max_diff, 0); } -TEST(FuserTest, TestOne_CUDA) { +TEST_F(FuserTest, TestOne_CUDA) { #if defined(FBCODE_CAFFE2) return; #endif @@ -137,7 +150,7 @@ TEST(FuserTest, TestOne_CUDA) { testOne(0, 2); } -TEST(FuserTest, FusedConcat_CUDA) { +TEST_F(FuserTest, FusedConcat_CUDA) { #if defined(FBCODE_CAFFE2) return; #endif @@ -182,7 +195,7 @@ TEST(FuserTest, FusedConcat_CUDA) { }; } -TEST(FuserTest, FusionAliasing) { +TEST_F(FuserTest, FusionAliasing) { #if defined(FBCODE_CAFFE2) return; #endif @@ -210,7 +223,7 @@ TEST(FuserTest, FusionAliasing) { ->run(*g); } -TEST(FuserTest, KernelCaching) { +TEST_F(FuserTest, KernelCaching) { #if defined(FBCODE_CAFFE2) return; #endif diff --git a/test/cpp/jit/test_gpu.cpp b/test/cpp/jit/test_gpu.cpp deleted file mode 100644 index f229ac2679e5..000000000000 --- a/test/cpp/jit/test_gpu.cpp +++ /dev/null @@ -1,19630 +0,0 @@ -#if defined(USE_CUDA) -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// fuser and IR parser -#include -#include - -#include "test_gpu_validator.h" - -#include -#include -#include - -#include -#include - -// Tests go in torch::jit -namespace torch { -namespace jit { - -using namespace torch::jit::fuser::cuda; -using namespace at::indexing; - -namespace { - -// Make a tensor that is known to be fully contiguous of dimensionality=ndims, -// but unknown sizes -TensorView* makeContigTensor(size_t ndims, DataType dtype = DataType::Float) { - return TensorViewBuilder() - .ndims(ndims) - .dtype(dtype) - .contiguity(std::vector(ndims, true)) - .build(); -} - -// Make a tensor that is known to be non-contiguous of dimensionality=ndims, -// but unknown sizes -TensorView* makeSymbolicTensor(size_t ndims, DataType dtype = DataType::Float) { - return TensorViewBuilder().ndims(ndims).dtype(dtype).build(); -} - -// Make a non-contiguous tensor of compile-time known sizes -TensorView* makeConcreteTensor( - std::vector shape, - DataType dtype = DataType::Float) { - return TensorViewBuilder().shape(shape).dtype(dtype).build(); -} - -void checkIntValue( - ExpressionEvaluator& evaluator, - Val* val, - Int::ScalarType expected_value) { - TORCH_CHECK(val->isAnInt()); - const auto actual_value = evaluator.evaluate(val); - TORCH_CHECK(actual_value.has_value()); - TORCH_CHECK(actual_value.value() == expected_value); -} - -void checkIntValue( - kir::ExpressionEvaluator& evaluator, - const kir::Val* val, - kir::Int::ScalarType expected_value) { - const auto actual_value = evaluator.evaluate(val); - TORCH_CHECK(actual_value.has_value()); - TORCH_CHECK(actual_value.value() == expected_value); -} - -bool isPredicated(TensorView* tv, GpuLower& gpulw) { - auto parent_scope = gpulw.lowerValue(tv)->definition()->parentScope(); - 
if (parent_scope->isA()) { - return !parent_scope->predicate()->value()->isConst(); - } - return true; -}; - -} // namespace - -// 1. Test cases are void() functions. -// 2. They start with the prefix `test` - -// A few smoke tests for IrGraphGenerator -// (These tests exercise IrGraphGenerator through a non-trivial IR, -// to make sure that it runs w/o crashing. The actual output is not -// validated) -TEST(NVFuserTest, IrGraphGenerator_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Make sure we can handle empty IRs - TORCH_CHECK(!IrGraphGenerator::toGraphviz( - &fusion, IrGraphGenerator::DetailLevel::Basic) - .empty()); - - // Construct an interesting IR - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - TensorView* tv2 = add(tv0, new Double(3.141)); - TensorView* tv3 = broadcast(tv0, {false, true, false, true}); - TensorView* tv4 = reductionOp(BinaryOpType::Add, {2}, new Double(0), tv3); - TensorView* tv5 = clamp(tv4, new Double(0.f), new Double(1.f)); - TensorView* tv6 = add(tv2, tv2); - - // Another checkpoint before adding outputs - TORCH_CHECK(!IrGraphGenerator::toGraphviz( - &fusion, IrGraphGenerator::DetailLevel::Explicit) - .empty()); - - fusion.addOutput(tv6); - - tv4->axis(2)->parallelize(ParallelType::BIDy); - tv6->merge(0); - tv6->split(0, 4); - tv6->axis(0)->parallelize(ParallelType::BIDx); - tv5->reorder({{-1, 0}}); - tv2->computeAt(tv6, 1); - - // Another checkpoint with more node types - TORCH_CHECK(!IrGraphGenerator::toGraphviz( - &fusion, IrGraphGenerator::DetailLevel::ComputeOnly) - .empty()); - - for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && - val->getValType().value() == ValType::TensorView) { - TensorView* tv = static_cast(val); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - // Final IR graph - TORCH_CHECK(!IrGraphGenerator::toGraphviz( - &fusion, IrGraphGenerator::DetailLevel::Verbose) - .empty()); -} - -TEST(NVFuserTest, FusionDispatch_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - Double* f = new Double{2.f}; - std::stringstream ss1, ss2, ss3; - ss1 << f; - ss2 << static_cast(f); - ss3 << static_cast(f); - TORCH_CHECK( - ss1.str().compare(ss2.str()) == 0 && ss1.str().compare(ss3.str()) == 0, - "Error with dispatch system where results differ by passing Double* vs Val* vs Statement*."); -} - -// Evaluate basic scalar operations with constant values -TEST(NVFuserTest, FusionExprEvalConstants_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - ExpressionEvaluator evaluator(&fusion); - - auto* a = new Int(7); - auto* b = new Int(3); - - // Avoid div operation because it casts int operands to float - checkIntValue(evaluator, neg(a), -7); - checkIntValue(evaluator, add(a, b), 10); - checkIntValue(evaluator, neg(mul(sub(a, b), add(a, b))), -40); - checkIntValue(evaluator, mod(a, b), 1); - checkIntValue(evaluator, ceilDiv(a, b), 3); -} - -// Evaluate basic scalar operations with bound values -TEST(NVFuserTest, FusionExprEvalBindings_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - ExpressionEvaluator evaluator(&fusion); - - auto* a = new Int(); - auto* b = new Int(); - auto* c = add(a, b); - auto* d = neg(ceilDiv(c, b)); - auto* e = new Int(0); - - // trying to evaluate before binding should give empty results - TORCH_CHECK(!evaluator.evaluate(a).has_value()); - TORCH_CHECK(!evaluator.evaluate(d).has_value()); - - evaluator.bind(a, 7); - evaluator.bind(b, 3); - - // can't bind to the results of expressions - ASSERT_ANY_THROW(evaluator.bind(c, 100)); - - // can't bind to concrete values - 
ASSERT_ANY_THROW(evaluator.bind(e, 100)); - - checkIntValue(evaluator, c, 10); - checkIntValue(evaluator, sub(a, b), 4); - checkIntValue(evaluator, mod(a, b), 1); - checkIntValue(evaluator, ceilDiv(a, b), 3); - checkIntValue(evaluator, d, -4); - - // Reset evaluation context - evaluator = ExpressionEvaluator(&fusion); - - evaluator.bind(a, 2); - evaluator.bind(b, 5); - - checkIntValue(evaluator, c, 7); - checkIntValue(evaluator, sub(a, b), -3); - checkIntValue(evaluator, mod(a, b), 2); - checkIntValue(evaluator, ceilDiv(a, b), 1); - checkIntValue(evaluator, d, -2); -} - -// Evaluate expressions in a simple IR -TEST(NVFuserTest, FusionExprEvalBasic_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Create a non-trivial IR - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = makeSymbolicTensor(2); - - fusion.addInput(tv0); - fusion.addInput(tv1); - - TensorView* tv2 = add(tv1, new Double(2.0)); - TensorView* tv3 = add(tv0, tv2); - - fusion.addOutput(tv3); - - tv3->split(0, 4); - - tv0->computeAt(tv3, 1); - tv1->computeAt(tv3, 1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(1)->parallelize(ParallelType::Unroll); - tv3->axis(1)->parallelize(ParallelType::Unroll); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - // 1. Create an evaluator - ExpressionEvaluator evaluator(&fusion); - - // 2. Bind values - // - // IMPORTANT: - // a. The bindings are only as stable as the Vals are in the fusion graph - // b. You must use the original (rootDomain) extents - // (ex. `tv0->getRootDomain()[0]->extent()` - // instead of `tv0->axis(0)->extent()`) - // - evaluator.bind(tv0->getRootDomain()[0]->extent(), 6); - evaluator.bind(tv0->getRootDomain()[1]->extent(), 128); - evaluator.bind(tv1->getRootDomain()[0]->extent(), 6); - evaluator.bind(tv1->getRootDomain()[1]->extent(), 128); - - // 3. Evaluate and check result values - TORCH_CHECK(tv2->domain()->nDims() == 3); - checkIntValue(evaluator, tv2->axis(0)->extent(), 2); - checkIntValue(evaluator, tv2->axis(1)->extent(), 4); - checkIntValue(evaluator, tv2->axis(2)->extent(), 128); - - TORCH_CHECK(tv3->domain()->nDims() == 3); - checkIntValue(evaluator, tv3->axis(0)->extent(), 2); - checkIntValue(evaluator, tv3->axis(1)->extent(), 4); - checkIntValue(evaluator, tv3->axis(2)->extent(), 128); -} - -// Evaluate expressions in a more complex IR -TEST(NVFuserTest, FusionExprEvalComplex_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - TensorView* tv1 = mul(tv0, new Double(-1.0)); - TensorView* tv2 = add(tv0, new Double(3.0)); - TensorView* tv3 = mul(tv0, new Double(2.0)); - TensorView* tv4 = add(tv2, tv1); - TensorView* tv5 = add(tv4, tv3); - TensorView* tv6 = add(tv0, tv3); - - fusion.addOutput(tv5); - fusion.addOutput(tv6); - - tv5->reorder({{-1, 0}}); - - tv6->split(0, 5); - tv5->merge(0); - - // 1. Create an evaluator - ExpressionEvaluator evaluator(&fusion); - - // 2. 
Bind values - evaluator.bind(tv0->getRootDomain()[0]->extent(), 129); - evaluator.bind(tv0->getRootDomain()[1]->extent(), 127); - - // Evaluate and check extent values - TORCH_CHECK(tv0->domain()->nDims() == 2); - checkIntValue(evaluator, tv0->axis(0)->extent(), 129); - checkIntValue(evaluator, tv0->axis(1)->extent(), 127); - - TORCH_CHECK(tv3->domain()->nDims() == 2); - checkIntValue(evaluator, tv3->axis(0)->extent(), 129); - checkIntValue(evaluator, tv3->axis(1)->extent(), 127); - - TORCH_CHECK(tv4->domain()->nDims() == 2); - checkIntValue(evaluator, tv4->axis(0)->extent(), 129); - checkIntValue(evaluator, tv4->axis(1)->extent(), 127); - - TORCH_CHECK(tv5->domain()->nDims() == 1); - checkIntValue(evaluator, tv5->axis(0)->extent(), 16383); - - TORCH_CHECK(tv6->domain()->nDims() == 3); - checkIntValue(evaluator, tv6->axis(0)->extent(), 26); - checkIntValue(evaluator, tv6->axis(1)->extent(), 5); - checkIntValue(evaluator, tv6->axis(2)->extent(), 127); -} - -// Evaluate expressions post lowering -TEST(NVFuserTest, FusionExprEvalPostLower_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Create a non-trivial IR - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = makeSymbolicTensor(2); - - fusion.addInput(tv0); - fusion.addInput(tv1); - - TensorView* tv2 = add(tv1, new Double(2.0)); - TensorView* tv3 = add(tv0, tv2); - - fusion.addOutput(tv3); - - tv3->split(0, 4); - - tv0->computeAt(tv3, 1); - tv1->computeAt(tv3, 1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(1)->parallelize(ParallelType::Unroll); - tv3->axis(1)->parallelize(ParallelType::Unroll); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - auto* bid_x = add(tv3->axis(0)->extent(), new Int(0)); - auto* tid_x = add(tv3->axis(-1)->extent(), new Int(0)); - - // Lower - GpuLower gpulw(&fusion); - - // 1. Create an evaluation context - ExpressionEvaluator evaluator(&fusion); - - // 2. Bind values - evaluator.bind(tv0->getRootDomain()[0]->extent(), 6); - evaluator.bind(tv0->getRootDomain()[1]->extent(), 128); - evaluator.bind(tv1->getRootDomain()[0]->extent(), 6); - evaluator.bind(tv1->getRootDomain()[1]->extent(), 128); - - // 3. 
Evaluate and check result values - TORCH_CHECK(tv2->domain()->nDims() == 3); - checkIntValue(evaluator, tv2->axis(0)->extent(), 2); - checkIntValue(evaluator, tv2->axis(1)->extent(), 4); - checkIntValue(evaluator, tv2->axis(2)->extent(), 128); - - TORCH_CHECK(tv3->domain()->nDims() == 3); - checkIntValue(evaluator, tv3->axis(0)->extent(), 2); - checkIntValue(evaluator, tv3->axis(1)->extent(), 4); - checkIntValue(evaluator, tv3->axis(2)->extent(), 128); - - checkIntValue(evaluator, bid_x, 2); - checkIntValue(evaluator, tid_x, 128); -} - -// Kernel IR: Evaluate basic scalar operations with constant values -TEST(NVFuserTest, FusionKernelExprEvalConstants_CUDA) { - kir::Kernel kernel; - kir::IrBuilder ir_builder(&kernel); - - auto a = ir_builder.create(7); - auto b = ir_builder.create(3); - auto c = ir_builder.subExpr(a, b); - auto d = ir_builder.divExpr(a, b); - auto e = ir_builder.mulExpr(c, d); - - kir::ExpressionEvaluator evaluator; - - checkIntValue(evaluator, ir_builder.negExpr(a), -7); - checkIntValue(evaluator, ir_builder.addExpr(a, b), 10); - checkIntValue(evaluator, ir_builder.negExpr(e), -8); - checkIntValue(evaluator, ir_builder.modExpr(a, b), 1); - checkIntValue(evaluator, ir_builder.ceilDivExpr(a, b), 3); -} - -// Kernel IR: Evaluate basic scalar operations with bound values -TEST(NVFuserTest, FusionKernelExprEvalBindings_CUDA) { - kir::Kernel kernel; - kir::IrBuilder ir_builder(&kernel); - - kir::ExpressionEvaluator evaluator; - - auto a = ir_builder.create(c10::nullopt); - auto b = ir_builder.create(c10::nullopt); - auto c = ir_builder.addExpr(a, b); - auto d = ir_builder.negExpr(ir_builder.ceilDivExpr(c, b)); - auto e = ir_builder.create(0); - - // trying to evaluate before binding should give empty results - TORCH_CHECK(!evaluator.evaluate(a).has_value()); - TORCH_CHECK(!evaluator.evaluate(d).has_value()); - - evaluator.bind(a, 7); - evaluator.bind(b, 3); - - // can't bind to the results of expressions - ASSERT_ANY_THROW(evaluator.bind(c, 100)); - - // can't bind to concrete values - ASSERT_ANY_THROW(evaluator.bind(e, 100)); - - checkIntValue(evaluator, c, 10); - checkIntValue(evaluator, ir_builder.subExpr(a, b), 4); - checkIntValue(evaluator, ir_builder.modExpr(a, b), 1); - checkIntValue(evaluator, ir_builder.ceilDivExpr(a, b), 3); - checkIntValue(evaluator, d, -4); - - // Reset the evaluation context - evaluator = kir::ExpressionEvaluator(); - - evaluator.bind(a, 2); - evaluator.bind(b, 5); - - checkIntValue(evaluator, c, 7); - checkIntValue(evaluator, ir_builder.subExpr(a, b), -3); - checkIntValue(evaluator, ir_builder.modExpr(a, b), 2); - checkIntValue(evaluator, ir_builder.ceilDivExpr(a, b), 1); - checkIntValue(evaluator, d, -2); -} - -TEST(NVFuserTest, FusionClear_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // 1. Create a dummy IR - - { - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = makeSymbolicTensor(2); - - fusion.addInput(tv0); - fusion.addInput(tv1); - - TensorView* tv2 = add(tv1, new Double(2.0)); - TensorView* tv3 = add(tv0, tv2); - - fusion.addOutput(tv3); - - tv3->split(0, 4); - tv0->computeAt(tv3, 1); - tv1->computeAt(tv3, 1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(1)->parallelize(ParallelType::Unroll); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - } - - // 2. 
Clear the IR - - fusion.clear(); - - TORCH_CHECK(fusion.unordered_exprs().empty()); - TORCH_CHECK(fusion.vals().empty()); - - TORCH_CHECK(fusion.inputs().empty()); - TORCH_CHECK(fusion.outputs().empty()); - - TORCH_CHECK(!fusion.hasReduction()); - - // 3. Rebuild the IR - - { - TensorView* tv0 = makeSymbolicTensor(3); - TensorView* tv1 = makeSymbolicTensor(3); - TensorView* tv2 = add(tv1, new Double(2.0)); - TensorView* tv3 = add(tv0, tv2); - - fusion.addInput(tv0); - fusion.addInput(tv1); - fusion.addOutput(tv3); - - // tv3 [i0, i1, i2] - tv3->reorder({{0, 2}, {2, 0}}); - // tv3 [i2, i1, i0] - tv3->split(-1, 4); - // tv3 [i2, i1, i0outer, i0inner{4}] - tv3->reorder({{2, 0}, {3, 1}, {0, 3}}); - // tv3 [i0outer, i0inner{4}, i1, i2] - tv0->computeAt(tv3, -1); - tv1->computeAt(tv3, -1); - tv3->axis(1)->parallelize(ParallelType::BIDx); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor input1 = at::randn({16, 8, 8}, options); - at::Tensor input2 = at::randn_like(input1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({input1, input2}); - - at::Tensor tv2_ref = input2 + 2.0; - at::Tensor output_ref = input1 + tv2_ref; - - TORCH_CHECK(output_ref.equal(outputs[0])); -} - -TEST(NVFuserTest, FusionCopy_CUDA) { - Fusion original_fusion; - - // Create the test IR - { - FusionGuard fg(&original_fusion); - - auto tv0 = makeSymbolicTensor(3); - auto tv1 = makeSymbolicTensor(3); - auto tv2 = add(tv1, new Double(2.0)); - auto tv3 = sub(add(tv0, mul(tv2, tv2)), tv2); - - original_fusion.addInput(tv0); - original_fusion.addInput(tv1); - original_fusion.addOutput(tv3); - - tv3->reorder({{0, 2}, {2, 0}}); - tv3->split(-1, 4); - tv3->reorder({{2, 0}, {3, 1}, {0, 3}}); - - tv0->computeAt(tv3, -1); - tv1->computeAt(tv3, -1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - } - - // Test copy before lowering - Fusion clone = original_fusion; - - // Compare IR dumps - std::stringstream original_ir; - std::stringstream clone_ir; - original_ir << original_fusion; - clone_ir << clone; - ASSERT_EQ(original_ir.str(), clone_ir.str()); - - // Lower original fusion - std::string original_kernel; - { - // TODO(kir): remove this guard once we implement the cuda codegen visitor - FusionGuard fg(&original_fusion); - original_kernel = - codegen::generateCudaKernel(GpuLower(&original_fusion).kernel()); - } - - // Make sure the "before lowering" clone was not mutated - // while lowering the original fusion IR - std::stringstream before_lowering_ir; - before_lowering_ir << clone; - ASSERT_EQ(original_ir.str(), before_lowering_ir.str()); - - // Test copy after lowering (including assignment operator) - Fusion before_lowering = clone; - clone = original_fusion; - - // Compare IR dumps - std::stringstream original_lowered_ir; - std::stringstream clone_lowered_ir; - original_lowered_ir << original_fusion; - clone_lowered_ir << clone; - ASSERT_EQ(original_lowered_ir.str(), clone_lowered_ir.str()); - - // Lower the "before lowering" and compare kernels - std::string clone_kernel; - { - // TODO(kir): remove this guard once we implement the cuda codegen visitor - FusionGuard fg(&before_lowering); - clone_kernel = - codegen::generateCudaKernel(GpuLower(&before_lowering).kernel()); - } - ASSERT_EQ(original_kernel, clone_kernel); -} - -TEST(NVFuserTest, FusionMove_CUDA) { - Fusion fusion; - - // Create the test IR - { - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(3); - auto tv1 = 
makeSymbolicTensor(3); - auto tv2 = add(tv1, new Double(2.0)); - auto tv3 = sub(add(tv0, mul(tv2, tv2)), tv2); - - fusion.addInput(tv0); - fusion.addInput(tv1); - fusion.addOutput(tv3); - - tv3->reorder({{0, 2}, {2, 0}}); - tv3->split(-1, 4); - tv3->reorder({{2, 0}, {3, 1}, {0, 3}}); - - tv0->computeAt(tv3, -1); - tv1->computeAt(tv3, -1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - } - - std::stringstream original_ir; - original_ir << fusion; - - // Test move before lowering - Fusion another_fusion = std::move(fusion); - - // Check that the original fusion is "empty" - // - // IMPORTANT: these checks assume knowledge of the internal - // implementation of the move operations. General uses - // should only assume that the moved-from object is in - // a valid, but unspecified state. This is similar to the - // standard library containers: - // https://en.cppreference.com/w/cpp/utility/move - // - TORCH_CHECK(fusion.unordered_exprs().empty()); - TORCH_CHECK(fusion.vals().empty()); - TORCH_CHECK(fusion.inputs().empty()); - TORCH_CHECK(fusion.outputs().empty()); - - // clear() has no pre-conditions so it's valid to call on a moved-from object - fusion.clear(); - - // Compare IR dumps - std::stringstream another_ir; - another_ir << another_fusion; - ASSERT_EQ(original_ir.str(), another_ir.str()); - - // Lower the fusion IR - GpuLower lower(&another_fusion); - - std::stringstream lowered_ir; - lowered_ir << another_fusion; - - // Test move assignment after lowering - fusion = std::move(another_fusion); - - // Compare IR dumps - std::stringstream moved_lowered_ir; - moved_lowered_ir << fusion; - ASSERT_EQ(lowered_ir.str(), moved_lowered_ir.str()); -} - -TEST(NVFuserTest, FusionSimpleArith_CUDA) { - std::stringstream ss1, ss2; - - Fusion fusion; - FusionGuard fg(&fusion); - - Double* d1 = new Double(1.f); - Double* d2 = new Double{2.f}; - Double* d3 = new Double(); - - // Disrupt the fusion to make sure guard works well - { - Fusion fusion2; - FusionGuard fg(&fusion2); - - Double* d1 = new Double(1.f); - Double* d2 = new Double(2.f); - add(d1, d2); - ss2 << fusion2; - } - - new BinaryOp(BinaryOpType::Add, d3, d1, d2); - ss1 << fusion; - - TORCH_CHECK( - ss1.str().compare(ss2.str()) == 0, - "Error where explicit add nodes don't match implicit add nodes."); -} - -TEST(NVFuserTest, FusionSimpleTypePromote_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - Double* d4 = new Double{4.f}; - Int* i1 = new Int{3}; - auto d5 = add(d4, i1); - - TORCH_CHECK(d5->getDataType() == DataType::Double); -} - -TEST(NVFuserTest, FusionRegister_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - Double* v1 = new Double{1.f}; - Double* v2 = new Double{2.f}; - Val* v3 = binaryOp(BinaryOpType::Add, v1, v2); - Val* v4 = binaryOp(BinaryOpType::Add, v1, v2); - TORCH_CHECK(v1->name() + 1 == v2->name()); - TORCH_CHECK(v2->name() + 1 == v3->name()); - TORCH_CHECK(v3->name() + 1 == v4->name()); - TORCH_CHECK(v3->definition()->name() + 1 == v4->definition()->name()); -} - -// dummy expr with 2 outputs only for toposort test. -struct DummyExpr : public Expr { - ~DummyExpr() = default; - DummyExpr(Val* _outlhs, Val* _outrhs, Val* _lhs, Val* _rhs) - : Expr(ExprType::UnaryOp) // Not terribly safe... 
- { - addOutput(_outlhs); - addOutput(_outrhs); - addInput(_lhs); - addInput(_rhs); - this->name_ = FusionGuard::getCurFusion()->registerExpr(this); - } - DummyExpr(const DummyExpr& other) = delete; - DummyExpr& operator=(const DummyExpr& other) = delete; - DummyExpr(DummyExpr&& other) = delete; - DummyExpr& operator=(DummyExpr&& other) = delete; -}; - -TEST(NVFuserTest, FusionTopoSort_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // e0: v3, v2 = dummy(v1, v0) - // e1: v4 = add(v3, v2) - // e2: v5 = add(v2, v4) - // e3: v6 = add(v5, v5) - Double* v0 = new Double{1.f}; - Double* v1 = new Double{2.f}; - Double* v2 = new Double(); - Double* v3 = new Double(); - Double* v4 = new Double(); - Double* v5 = new Double(); - Double* v6 = new Double(); - - std::vector inputs = {v0, v1}; - for (auto val : inputs) { - fusion.addInput(val); - } - - Expr* e0 = new DummyExpr(v3, v2, v1, v0); - Expr* e1 = new BinaryOp(BinaryOpType::Add, v4, v3, v2); - Expr* e2 = new BinaryOp(BinaryOpType::Add, v5, v2, v4); - Expr* e3 = new BinaryOp(BinaryOpType::Add, v6, v5, v5); - - fusion.addOutput(v2); - fusion.addOutput(v3); - auto exprs = fusion.exprs(); - TORCH_CHECK(exprs.size() == 1, "Found ", exprs.size(), " but expecting 1"); - TORCH_CHECK(exprs[0] == e0); - - fusion.addOutput(v5); - exprs = fusion.exprs(); - TORCH_CHECK(exprs.size() == 3, "Found ", exprs.size(), " but expecting 3"); - TORCH_CHECK(exprs[0] == e0); - TORCH_CHECK(exprs[1] == e1); - TORCH_CHECK(exprs[2] == e2); - - fusion.addOutput(v4); - exprs = fusion.exprs(); - TORCH_CHECK(exprs.size() == 3, "Found ", exprs.size(), " but expecting 3"); - TORCH_CHECK(exprs[0] == e0); - TORCH_CHECK(exprs[1] == e1); - TORCH_CHECK(exprs[2] == e2); - - fusion.addOutput(v6); - exprs = fusion.exprs(); - TORCH_CHECK(exprs.size() == 4, "Found ", exprs.size(), " but expecting 4"); - TORCH_CHECK(exprs[0] == e0); - TORCH_CHECK(exprs[1] == e1); - TORCH_CHECK(exprs[2] == e2); - TORCH_CHECK(exprs[3] == e3); - - TORCH_CHECK(v2->definition()->name() == 0); - TORCH_CHECK(v3->definition()->name() == 0); - TORCH_CHECK(v4->definition()->name() == 1); - TORCH_CHECK(v5->definition()->name() == 2); - TORCH_CHECK(v6->definition()->name() == 3); -} - -TEST(NVFuserTest, FusionTensor_CUDA) { - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - Fusion fusion; - FusionGuard fg(&fusion); - - { - auto tensor = at::randn({2, 3, 4, 5}, options); - auto tensor_type = TensorType::create(tensor); - auto fuser_tensor = new TensorView(tensor_type); - TORCH_CHECK((int64_t)fuser_tensor->nDims() == tensor.dim()); - TORCH_CHECK(fuser_tensor->getDataType().value() == DataType::Float); - TORCH_CHECK(fuser_tensor->domain() != nullptr); - for (const auto i : c10::irange(fuser_tensor->nDims())) { - // size 1 dimension are makred as broadcast - TORCH_CHECK( - fuser_tensor->axis(i)->isBroadcast() == (tensor.sizes()[i] == 1)); - // check contiguity information; - TORCH_CHECK(fuser_tensor->domain()->contiguity()[i]); - } - } - - // TensorType::create fills stride_properties, which helps us to mark - // IterDomain properly - // Note: implementation could change, depending on how much we want to invest - // in our home-brew contiguity coalescing. For now let's make sure that we - // properly test what we are using. 
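// Aside (hedged; not part of the original test): one reading of the contiguity
// flags asserted in the sliced case below. An IterDomain appears to be marked
// contiguous when its stride equals the next-inner dimension's stride times
// that dimension's size (innermost dimension: stride 1). A minimal ATen-only
// sketch of that arithmetic, under that assumption:
//
//   auto t = at::randn({4, 4, 4}, options);   // sizes {4,4,4}, strides {16,4,1}
//   auto s = t.slice(1, 0, -1, 2);            // sizes {4,2,4}, strides {16,8,1}
//   // dim 2: stride 1                        -> contiguous
//   // dim 1: stride 8 != 1 * 4               -> not contiguous
//   // dim 0: stride 16 == 8 * 2              -> contiguous
//
// which matches the {true, false, true} expectations checked below, and the
// same rule is consistent with the permuted case that follows.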
- { - auto tensor = at::randn({4, 4, 4}, options); - auto sliced_tensor = tensor.slice(1, 0, -1, 2); - - auto tensor_type = TensorType::create(sliced_tensor); - auto fuser_tensor = new TensorView(tensor_type); - TORCH_CHECK((int64_t)fuser_tensor->nDims() == tensor.dim()); - TORCH_CHECK(fuser_tensor->getDataType().value() == DataType::Float); - TORCH_CHECK(fuser_tensor->domain() != nullptr); - for (const auto i : c10::irange(fuser_tensor->nDims())) { - // size 1 dimension are makred as broadcast - TORCH_CHECK(fuser_tensor->axis(i)->isBroadcast() == false); - } - TORCH_CHECK(fuser_tensor->domain()->contiguity()[0]); - TORCH_CHECK(!fuser_tensor->domain()->contiguity()[1]); - TORCH_CHECK(fuser_tensor->domain()->contiguity()[2]); - } - - { - auto tensor = at::randn({2, 3, 4, 5}, options); - auto permuted_tensor = tensor.permute({0, 3, 1, 2}); - auto tensor_type = TensorType::create(permuted_tensor); - auto fuser_tensor = new TensorView(tensor_type); - TORCH_CHECK((int64_t)fuser_tensor->nDims() == tensor.dim()); - TORCH_CHECK(fuser_tensor->getDataType().value() == DataType::Float); - TORCH_CHECK(fuser_tensor->domain() != nullptr); - for (const auto i : c10::irange(fuser_tensor->nDims())) { - // size 1 dimension are makred as broadcast - TORCH_CHECK(fuser_tensor->axis(i)->isBroadcast() == false); - } - TORCH_CHECK(!fuser_tensor->domain()->contiguity()[0]); - TORCH_CHECK(!fuser_tensor->domain()->contiguity()[1]); - TORCH_CHECK(fuser_tensor->domain()->contiguity()[2]); - TORCH_CHECK(!fuser_tensor->domain()->contiguity()[3]); - } -} - -TEST(NVFuserTest, FusionFilterVals_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - auto tv1 = makeSymbolicTensor(1); - auto scalar0 = new Double(0); - auto scalar1 = new Int(0); - auto scalar2 = new Int(1); - - const std::vector vals = {tv0, scalar0, tv1, scalar1, scalar2}; - - std::vector tvs( - ir_utils::filterByType(vals).begin(), - ir_utils::filterByType(vals).end()); - TORCH_CHECK(tvs.size() == 2); - TORCH_CHECK(tvs[0] == tv0); - TORCH_CHECK(tvs[1] == tv1); - - std::vector floats( - ir_utils::filterByType(vals).begin(), - ir_utils::filterByType(vals).end()); - TORCH_CHECK(floats.size() == 1); - TORCH_CHECK(floats[0] == scalar0); - - std::vector ints( - ir_utils::filterByType(vals).begin(), - ir_utils::filterByType(vals).end()); - TORCH_CHECK(ints.size() == 2); - TORCH_CHECK(ints[0] == scalar1); - TORCH_CHECK(ints[1] == scalar2); - - TORCH_CHECK( - ir_utils::filterByType(vals).begin() == - ir_utils::filterByType(vals).end(), - "Not expecting any results"); -} - -TEST(NVFuserTest, FusionTVSplit_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv = makeSymbolicTensor(3); - - tv = tv->split(2, 2); - TORCH_CHECK(tv->nDims() == 4); - Expr* outer = tv->axis(2)->extent()->definition(); - - TORCH_CHECK( - outer->getExprType().value() == ExprType::BinaryOp && - static_cast(outer)->getBinaryOpType() == - BinaryOpType::CeilDiv && - static_cast(outer)->lhs()->sameAs( - tv->getRootDomain()[2]->extent()) && - static_cast(static_cast(outer)->rhs()) - ->sameAs(new Int(2))); - - IterDomain* inner = static_cast(tv->axis(3)); - TORCH_CHECK( - inner->extent()->isScalar() && - static_cast(inner->extent())->isConst() && - static_cast(inner->extent())->value().value() == 2); -} - -TEST(NVFuserTest, FusionTVMerge_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv = makeSymbolicTensor(3); - - tv = tv->merge(1); - Expr* axisOp = tv->axis(1)->extent()->definition(); - - TORCH_CHECK( - tv->nDims() == 2 && 
axisOp->getExprType() == ExprType::BinaryOp && - static_cast(axisOp)->getBinaryOpType() == BinaryOpType::Mul && - static_cast(axisOp)->lhs() == - tv->getRootDomain()[1]->extent() && - static_cast(axisOp)->rhs() == - tv->getRootDomain()[2]->extent()); -} - -TEST(NVFuserTest, FusionTVReorder_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - std::unordered_map shift_right{{-1, 0}}; - - std::unordered_map shift_left{{0, -1}}; - - std::unordered_map shift_left_2{{0, -1}, {1, 0}, {2, 1}}; - - std::unordered_map swap{{0, 2}, {2, 0}}; - - auto tv = makeSymbolicTensor(3); - std::vector ref; - ref = std::vector( - tv->domain()->domain().begin(), tv->domain()->domain().end()); - - tv->reorder(shift_left); - for (const auto i : c10::irange(tv->nDims())) { - TORCH_CHECK(ref[i]->sameAs(tv->axis(i - 1))); - } - - tv = makeSymbolicTensor(3); - ref = std::vector( - tv->domain()->domain().begin(), tv->domain()->domain().end()); - - tv->reorder(shift_left); - for (const auto i : c10::irange(tv->nDims())) { - TORCH_CHECK(ref[i]->sameAs(tv->axis(i - 1))); - } - - tv = makeSymbolicTensor(3); - ref = std::vector( - tv->domain()->domain().begin(), tv->domain()->domain().end()); - - tv->reorder(shift_right); - TORCH_CHECK(ref[ref.size() - 1]->sameAs(tv->axis(0))); - for (const auto i : c10::irange(1, tv->nDims())) { - TORCH_CHECK(ref[i - 1]->sameAs(tv->axis(i))); - } - - tv = makeSymbolicTensor(3); - ref = std::vector( - tv->domain()->domain().begin(), tv->domain()->domain().end()); - tv->reorder(swap); - TORCH_CHECK(ref[0]->sameAs(tv->axis(2))); - TORCH_CHECK(ref[2]->sameAs(tv->axis(0))); - TORCH_CHECK(ref[1]->sameAs(tv->axis(1))); -} - -TEST(NVFuserTest, FusionEquality_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - Double* fval1 = new Double(); - Double* fval1_copy = fval1; - Double* fval2 = new Double(); - Double* fone = new Double(1.0); - - TORCH_CHECK(fval1->sameAs(fval1_copy)); - TORCH_CHECK(!fval1->sameAs(fval2)); - TORCH_CHECK(!fone->sameAs(fval1)); - TORCH_CHECK(fone->sameAs(new Double(1.0))); - - Int* ival1 = new Int(); - Int* ival1_copy = ival1; - Int* ival2 = new Int(); - Int* ione = new Int(1); - - TORCH_CHECK(ival1->sameAs(ival1_copy)); - TORCH_CHECK(!ival1->sameAs(ival2)); - TORCH_CHECK(!ione->sameAs(ival1)); - TORCH_CHECK(ione->sameAs(new Int(1))); - - BinaryOp* add1 = new BinaryOp(BinaryOpType::Add, new Double(), fval1, ival1); - BinaryOp* add1_copy = - new BinaryOp(BinaryOpType::Add, new Double(), fval1, ival1); - BinaryOp* sub1 = new BinaryOp(BinaryOpType::Sub, new Double(), fval1, ival1); - - UnaryOp* neg1 = new UnaryOp(UnaryOpType::Neg, new Double(), fval1); - UnaryOp* neg2 = new UnaryOp(UnaryOpType::Neg, new Double(), fval2); - UnaryOp* neg1_copy = new UnaryOp(UnaryOpType::Neg, new Double(), fval1); - - TORCH_CHECK(add1->sameAs(add1_copy)); - TORCH_CHECK(!add1->sameAs(sub1)); - - TORCH_CHECK(neg1->sameAs(neg1_copy)); - TORCH_CHECK(!static_cast(neg1)->sameAs(add1)); - TORCH_CHECK(!neg1->sameAs(neg2)); -} - -TEST(NVFuserTest, FusionDependency_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - Double* d0 = new Double(0.f); - Double* d1 = new Double(1.f); - auto d2 = add(d0, d1); - - auto d3 = add(d2, d2); - - Double* d4 = new Double(4.f); - Double* d5 = new Double(5.f); - auto d6 = add(d4, d5); - - Double* d7 = new Double(7.f); - Double* d8 = new Double(8.f); - auto d9 = add(d7, d8); - - auto d10 = add(d6, d9); - - auto d11 = add(d3, d10); - - TORCH_CHECK(DependencyCheck::isDependencyOf(d0, d11)); - TORCH_CHECK(DependencyCheck::isDependencyOf(d1, d11)); - 
TORCH_CHECK(DependencyCheck::isDependencyOf(d2, d11)); - TORCH_CHECK(DependencyCheck::isDependencyOf(d3, d11)); - TORCH_CHECK(DependencyCheck::isDependencyOf(d6, d11)); - TORCH_CHECK(DependencyCheck::isDependencyOf(d9, d11)); - TORCH_CHECK(DependencyCheck::isDependencyOf(d0, d2)); - TORCH_CHECK(DependencyCheck::isDependencyOf(d2, d3)); - TORCH_CHECK(DependencyCheck::isDependencyOf(d4, d6)); - TORCH_CHECK(DependencyCheck::isDependencyOf(d8, d10)); - - TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d0)); - TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d1)); - TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d2)); - TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d3)); - TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d4)); - TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d5)); - TORCH_CHECK(!DependencyCheck::isDependencyOf(d2, d0)); - TORCH_CHECK(!DependencyCheck::isDependencyOf(d3, d2)); - TORCH_CHECK(!DependencyCheck::isDependencyOf(d6, d4)); - TORCH_CHECK(!DependencyCheck::isDependencyOf(d10, d8)); - - auto dep_chain = DependencyCheck::getSingleDependencyChain(d0, d11); - TORCH_CHECK(dep_chain.back() == d11); - dep_chain.pop_back(); - TORCH_CHECK(dep_chain.back() == d3); - dep_chain.pop_back(); - TORCH_CHECK(dep_chain.back() == d2); - dep_chain.pop_back(); - - dep_chain = DependencyCheck::getSingleDependencyChain(d6, d11); - TORCH_CHECK(dep_chain.back() == d11); - dep_chain.pop_back(); - TORCH_CHECK(dep_chain.back() == d10); - dep_chain.pop_back(); - - dep_chain = DependencyCheck::getSingleDependencyChain(d4, d11); - TORCH_CHECK(dep_chain.back() == d11); - dep_chain.pop_back(); - TORCH_CHECK(dep_chain.back() == d10); - dep_chain.pop_back(); - TORCH_CHECK(dep_chain.back() == d6); - dep_chain.pop_back(); - - dep_chain = DependencyCheck::getSingleDependencyChain(d11, d2); - TORCH_CHECK(dep_chain.empty()); -} - -TEST(NVFuserTest, FusionParser_CUDA) { - // This test may not pass if using a custom block sync as there may - // be additional calls. Skip the test as it's not specifically - // relevant with block synchronizatin. - if (std::getenv("PYTORCH_NVFUSER_USE_BLOCK_SYNC_ATOMIC")) { - return; - } - auto g = std::make_shared(); - const auto graph0_string = R"IR( - graph(%0 : Float(2, strides=[1]), - %1 : Float(2, strides=[1])): - %c0 : Float(2, strides=[1]) = aten::mul(%0, %1) - %d0 : Float(2, strides=[1]) = aten::mul(%c0, %0) - return (%d0))IR"; - parseIR(graph0_string, g.get()); - - // strides are not yet supported in the irparser. - for (auto val : g->block()->inputs()) { - if (val->isCompleteTensor()) - val->setType(val->type()->castRaw()->contiguous()); - } - for (auto node : g->block()->nodes()) { - for (auto val : node->outputs()) { - if (val->isCompleteTensor()) - val->setType(val->type()->castRaw()->contiguous()); - } - } - - auto fusion = parseJitIR(g); - FusionGuard fg(fusion.get()); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - // Avoid vectorization here as those kernels can't be lowered twice at the - // moment - at::Tensor input1 = at::randn({16}, options); - at::Tensor input2 = at::randn({16}, options); - auto lparams = schedulePointwise(fusion.get(), {input1, input2}); - - // CONSIDER: - // 1. this can be moved to a dedicated "golden" file - // 2. 
use a fuzzy compare (ignore non-significant whitespaces for example) - const std::string expected_kernel = R"( -__global__ void CUDAGeneratedKernel(Tensor T0, Tensor T1, Tensor T3) { - if ((((((((((nvfuser_index_t)blockIdx.x) * 1) + 0) * 1) + 0) * 128) + ((nvfuser_index_t)threadIdx.x)) < T0.size[0])) { - constexpr nvfuser_index_t ki183 = 0; - float T5[1]; - constexpr nvfuser_index_t ki217 = 0; - T5[ki217] = 0; - constexpr nvfuser_index_t ki208 = 0; - T5[ki208] - = T1[(((((((((nvfuser_index_t)blockIdx.x) * 1) + ki183) * 1) + ki208) * 128) + ((nvfuser_index_t)threadIdx.x)) * 1)]; - float T4[1]; - constexpr nvfuser_index_t ki223 = 0; - T4[ki223] = 0; - constexpr nvfuser_index_t ki203 = 0; - T4[ki203] - = T0[(((((((((nvfuser_index_t)blockIdx.x) * 1) + ki183) * 1) + ki203) * 128) + ((nvfuser_index_t)threadIdx.x)) * 1)]; - float T6[1]; - constexpr nvfuser_index_t ki192 = 0; - float T2[1]; - T2[0] - = T4[ki192] - * T5[ki192]; - T6[ki192] - = T2[0] - * T4[ki192]; - constexpr nvfuser_index_t ki185 = 0; - T3[(((((((((nvfuser_index_t)blockIdx.x) * 1) + ki183) * 1) + ki185) * 128) + ((nvfuser_index_t)threadIdx.x)) * 1)] - = T6[ki185]; - } -} -)"; - - const std::string actual_kernel = - "\n" + codegen::generateCudaKernel(GpuLower(fusion.get()).kernel()); - if (expected_kernel.size() != actual_kernel.size() || - expected_kernel.compare(actual_kernel) != 0) { - std::cerr - << " Codegen mismatch, codegen possibly changed, or is incorrect. " - << " \n ========= EXPECTED ========= \n" - << expected_kernel << "\n========= ACTUAL ========== \n" - << actual_kernel << "\n=================" << std::endl; - auto it = std::mismatch( - expected_kernel.begin(), - expected_kernel.end(), - actual_kernel.begin(), - actual_kernel.end()); - std::string actual_mismatched_snippet(it.second, actual_kernel.end()); - actual_mismatched_snippet = actual_mismatched_snippet.substr(0, 10); - std::string expected_mismatched_snippet(it.first, expected_kernel.end()); - expected_mismatched_snippet = expected_mismatched_snippet.substr(0, 10); - std::cerr << "First mismatch found at: " << actual_mismatched_snippet - << ", expected: " << expected_mismatched_snippet << std::endl; - TORCH_CHECK(false); - } - - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto outputs = fe.runFusion({input1, input2}, lparams); - at::Tensor output_ref = input1 * input2 * input1; - TORCH_CHECK(output_ref.equal(outputs[0])); -} - -TEST(NVFuserTest, FusionForLoop_CUDA) { -// TODO(kir): re-enable this test -// due to the current "GpuLower guard" approach, we can only create -// kernel IR during GpuLower::lower() -#if 0 - Fusion fusion; - FusionGuard fg(&fusion); - - const auto TV0 = new TensorView( - new TensorDomain({new IterDomain(new Int(0), new Int(16))}), - DataType::Float); - const auto TV1 = new TensorView( - new TensorDomain({new IterDomain(new Int(0), new Int(16))}), - DataType::Float); - - fusion.addInput(TV0); - fusion.addInput(TV1); - - auto ID0 = new kir::IterDomain(new IterDomain(new Int(0), new Int(8))); - - TensorView* TV2 = add(TV0, TV1); - BinaryOp* op = static_cast(TV2->definition(); - fusion.addOutput(TV2); - - auto fl = new kir::ForLoop(new kir::Int(c10::nullopt), ID0, {op}); - - std::stringstream result; - std::stringstream ref; - result << fl; - ref << "for(size_t i3{0}; i3 < iS{8}; ++i3 ) {\nT2[ iS{16} ] = T0[ iS{16} ] + T1[ iS{16} ]\n}"; - - if (result.str().compare(ref.str()) == 0) { - std::stringstream err_msg; - err_msg << "ForLoop printing has changed or something has gone wrong. 
" - << result.str() << "\n does not match reference: " << ref.str() - << std::endl; - TORCH_CHECK(false, err_msg.str()); - } -#endif -} - -TEST(NVFuserTest, FusionOuterSplit_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(3); - - new BinaryOp(BinaryOpType::Add, tv0, new Double(0.0), new Double(1.0)); - TensorView* tv1 = add(tv0, new Double(2.0)); - TensorView* tv2 = add(tv1, new Double(3.0)); - fusion.addOutput(tv2); - - //[I0, I1, I2] - tv2->split(-1, 4, false); - //[I0, I1, I2o{4}, I2i] - tv2->merge(0); - tv2->merge(0); - //[I0*I1*I2o{4}, I2i] - tv2->split(0, 2); - //[I0*I1*I2o{4}o, I0*I1*I2o{4}i{2}, I2i] - tv2->reorder({{0, 1}, {1, 0}}); - // I0*I1*I2o{4}i{2}, [I0*I1*I2o{4}o, I2i] - - tv0->computeAt(tv2, -1); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor output = at::empty({2, 6, 32}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({}, {output}); - - at::Tensor output_ref = at::zeros_like(output, options); - output_ref = output_ref + 0.0 + 1.0 + 2.0 + 3.0; - - TORCH_CHECK(output_ref.equal(output)); -} - -TEST(NVFuserTest, FusionCodeGen_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(3); - - new BinaryOp(BinaryOpType::Add, tv0, new Double(0.0), new Double(1.0)); - TensorView* tv1 = add(tv0, new Double(2.0)); - TensorView* tv2 = add(tv1, new Double(3.0)); - fusion.addOutput(tv2); - - //[I0, I1, I2] - tv2 = tv2->split(0, 4); - //[I0o, I0i{4}, I1, I2] - tv2 = tv2->merge(1); - //[I0o, I0i{4}*I1, I2] - tv2 = tv2->split(-1, 2); - //[I0o, I0i{4}*I1, I2o, I2i{2}] - tv2 = tv2->reorder({{0, 1}, {1, 0}, {3, 2}}); - //[I0i{4}*I1, I0o, I2i{2}, I2o] - - tv0->computeAt(tv2, -1); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor output = at::empty({16, 8, 8}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({}, {output}); - - at::Tensor output_ref = at::zeros_like(output, options); - output_ref = output_ref + 0.0 + 1.0 + 2.0 + 3.0; - - TORCH_CHECK(output_ref.equal(output)); -} - -TEST(NVFuserTest, FusionCodeGen2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(3); - TensorView* tv1 = makeSymbolicTensor(3); - TensorView* tv2 = add(tv1, new Double(2.0)); - TensorView* tv3 = add(tv0, tv2); - - fusion.addInput(tv0); - fusion.addInput(tv1); - fusion.addOutput(tv3); - - //[I0, I1, I2] - tv3->reorder({{0, 2}, {2, 0}}); - //[I2, I1, I0] - tv3->split(-1, 4); - //[I2, I1, I0o, I0i{4}] - tv3->reorder({{2, 0}, {3, 1}, {0, 3}}); - // I0o, I0i{4}, I1, I2] - - tv0->computeAt(tv3, -1); - tv1->computeAt(tv3, -1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor input1 = at::randn({16, 8, 8}, options); - at::Tensor input2 = at::randn_like(input1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({input1, input2}); - - at::Tensor tv2_ref = input2 + 2.0; - at::Tensor output_ref = input1 + tv2_ref; - - TORCH_CHECK(output_ref.equal(outputs[0])); -} - -TEST(NVFuserTest, FusionSimplePWise_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - // dimensionality of the problem - int nDims = 3; - - // Set up your input tensor views - TensorView* tv0 = makeContigTensor(nDims); - TensorView* tv1 = makeContigTensor(nDims); - - // Register your inputs - fusion.addInput(tv0); - 
fusion.addInput(tv1); - - // Do math with it, it returns a `Val*` but can be static_casted back to - // TensorView - TensorView* tv2 = add(tv1, new Double(2.0)); - TensorView* tv3 = add(tv0, tv2); - - // Register your outputs - fusion.addOutput(tv3); - - // Do transformations, remember, transformations are outputs to inputs - // This doesn't have to be in this order - tv3->merge(1); - tv3->merge(0); - - // Split by n_threads - tv3->split(0, 128); - tv3->split(0, 4); - - // For all inputs, computeAt the output inline, temporaries should be squeezed - // between them - tv0->computeAt(tv3, -1); - tv1->computeAt(tv3, -1); - - // Parallelize TV3 - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv3->axis(-2)->parallelize(ParallelType::Unroll); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor input1 = at::randn({64, 2, 128}, options); - at::Tensor input2 = at::rand_like(input1); - at::Tensor output = at::empty_like(input1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({input1, input2}, {output}); - - at::Tensor tv2_ref = input2 + 2.0; - at::Tensor output_ref = input1 + tv2_ref; - - TORCH_CHECK(output_ref.equal(output)); -} - -TEST(NVFuserTest, FusionExecKernel_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = makeSymbolicTensor(2); - - // Register your inputs - fusion.addInput(tv0); - fusion.addInput(tv1); - - // Do math with it, it returns a `Val*` but can be static_casted back to - // TensorView - TensorView* tv2 = add(tv1, new Double(2.0)); - TensorView* tv3 = add(tv0, tv2); - - // Register your outputs - fusion.addOutput(tv3); - - tv3->merge(0); - tv3->split(0, 128); - tv3->split(0, 4); - - // For all inputs, computeAt the output inline, temporaries should be squeezed - // between them - tv0->computeAt(tv3, 1); - tv1->computeAt(tv3, 1); - - // Parallelize TV3 - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(1)->parallelize(ParallelType::Unroll); - tv3->axis(1)->parallelize(ParallelType::Unroll); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor input1 = at::ones({1, 128}, options); - at::Tensor input2 = at::ones_like(input1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({input1, input2}); - - at::Tensor check = at::full({1, 128}, 4, options); - ; - TORCH_CHECK(outputs[0].equal(check)); -} - -int ceilDiv_(int a, int b) { - return (a + b - 1) / b; -} - -TEST(NVFuserTest, FusionAdvancedComputeAt1_CUDA) { - // Case 1 - // tv1 = tv0 * 0.5 - // tv2 = tv1 * -1 - // tv3 = tv1 + 3 - // tv4 = tv1 * 2 - // tv5 = tv3 + tv2 - // tv6 = tv5 + tv4 - // tv7 = tv1 + tv4 - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - TensorView* tv1 = mul(tv0, new Double(0.5)); - TensorView* tv2 = mul(tv1, new Double(-1.0)); - TensorView* tv3 = add(tv1, new Double(3.0)); - TensorView* tv4 = mul(tv1, new Double(2.0)); - TensorView* tv5 = add(tv3, tv2); - - TensorView* tv6 = add(tv5, tv4); - TensorView* tv7 = add(tv1, tv4); - - fusion.addOutput(tv6); - fusion.addOutput(tv7); - - // Lets setup to actually run - tv7->merge(0); - tv7->split(0, 128); - tv7->split(0, 4); - - tv7->axis(0)->parallelize(ParallelType::BIDx); - - tv0->computeAt(tv7, 1); - - 
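// Aside (hedged; not part of the original test): the schedule applied to tv7
// above is the recurring pointwise recipe used throughout this file, flattened
// here into a hypothetical helper for illustration only:
//
//   void schedulePointwiseLike(TensorView* out) {  // hypothetical name
//     out->merge(0);        // [I0*I1]
//     out->split(0, 128);   // [ceilDiv(I0*I1, 128), 128]
//     out->split(0, 4);     // [ceilDiv(ceilDiv(I0*I1, 128), 4), 4, 128]
//     out->axis(0)->parallelize(ParallelType::BIDx);
//   }
//
// tv0->computeAt(tv7, 1) then places every producer inside the outer BIDx
// loop; the checks below read back the resulting compute-at and max-producer
// positions.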
GpuLower gpulw(&fusion); - - // The this-position of the last tensor should be zero. - TORCH_CHECK( - tv7->nDims() == 3 && tv7->getComputeAtPosition() == 0 && - tv7->getMaxProducerPosition() == 1); - TORCH_CHECK( - tv7->nDims() == 3 && tv6->getComputeAtPosition() == 0 && - tv6->getMaxProducerPosition() == 1); - // The position of every other tensor should be 1. - for (auto tv : {tv1, tv2, tv3, tv4, tv5}) { - TORCH_CHECK(tv->nDims() == 3 && tv->getComputeAtPosition() == 1); - TORCH_CHECK(gpulw.caLoopMap().areMapped(tv7->axis(0), tv->axis(0))); - } - - for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && - val->getValType().value() == ValType::TensorView) { - TensorView* tv = static_cast(val); - tv->axis(1)->parallelize(ParallelType::Unroll); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn({129, 127}, options); - - auto t1 = aten_input.mul({0.5}); - auto t2 = t1.mul({-1.0}); - auto t3 = t1.add({3.0}); - auto t4 = t1.mul({2.0}); - auto t5 = t3.add(t2); - auto t6 = t5.add(t4); - auto t7 = t1.add(t4); - - std::vector aten_outputs = {t6, t7}; - std::vector cg_outputs = { - at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({aten_input}, cg_outputs); - - testValidate( - &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeAt2_CUDA) { - // Case 2 - // tv1 = tv0 * -1 - // tv2 = tv0 + 3 - // tv3 = tv0 * 2 - // tv4 = tv2 + tv1 - // tv5 = tv4 + tv3 - // tv6 = tv5 + tv3 - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - TensorView* tv1 = mul(tv0, new Double(-1.0)); - TensorView* tv2 = add(tv0, new Double(3.0)); - TensorView* tv3 = mul(tv0, new Double(2.0)); - TensorView* tv4 = add(tv2, tv1); - - TensorView* tv5 = add(tv4, tv3); - TensorView* tv6 = add(tv5, tv3); - - fusion.addOutput(tv5); - fusion.addOutput(tv6); - - // Lets setup to actually run - tv6->merge(0); - tv6->split(0, 128); - tv6->split(0, 4); - - tv6->axis(0)->parallelize(ParallelType::BIDx); - - tv0->computeAt(tv6, 1); - - for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && - val->getValType().value() == ValType::TensorView) { - TensorView* tv = static_cast(val); - - tv->axis(1)->parallelize(ParallelType::Unroll); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({129, 127}, options); - - auto t1 = input.mul({-1.0}); - auto t2 = input.add({3.0}); - auto t3 = input.mul({2.0}); - auto t4 = t2.add(t1); - auto t5 = t4.add(t3); - auto t6 = t5.add(t3); - - std::vector aten_outputs = {t5, t6}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({input}); - - testValidate(&fusion, cg_outputs, {input}, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeAt3_CUDA) { - // Case 3 - // T2 = T1 * 0.979361 - // T3 = T2 * T0 - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(4); - fusion.addInput(tv0); - - TensorView* tv1 = makeSymbolicTensor(4); - fusion.addInput(tv1); - - TensorView* tv2 = mul(tv1, new Double(.979361)); - TensorView* tv3 = mul(tv2, tv0); - - fusion.addOutput(tv3); - - // Lets setup to actually run - while (tv3->nDims() > 1) - tv3->merge(0); - tv3->split(0, 128); - 
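// Aside (hedged): together with the remaining split just below, the flattened
// domain becomes
//   [ceilDiv(ceilDiv(I0*I1*I2*I3, 128), 4), 4, 128]
// so for the {129, 127, 63, 65} input used later the outer extent works out to
// ceilDiv(ceilDiv(67088385, 128), 4) = ceilDiv(524129, 4) = 131033.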
tv3->split(0, 4); - - tv0->computeAt(tv3, 1); - tv1->computeAt(tv3, 1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - - for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && - val->getValType().value() == ValType::TensorView) { - TensorView* tv = static_cast(val); - - tv->axis(1)->parallelize(ParallelType::Unroll); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({129, 127, 63, 65}, options); - at::Tensor t1 = at::rand_like(t0, options); - - auto t2 = t1.mul({0.979361}); - auto aten_output = t2.mul(t0); - - std::vector aten_inputs = {t0, t1}; - - at::Tensor cg_output = at::empty_like(t0, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion(aten_inputs, {cg_output}); - - testValidate( - &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeAt4_CUDA) { - // Case 4 - // T4 = T2 - T3 - // T5 = T1 + T4 - // T6 = T5 - T0 - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(4); - fusion.addInput(tv0); - - TensorView* tv1 = makeSymbolicTensor(4); - fusion.addInput(tv1); - - TensorView* tv2 = makeSymbolicTensor(4); - fusion.addInput(tv2); - - TensorView* tv3 = makeSymbolicTensor(4); - fusion.addInput(tv3); - - TensorView* tv4 = sub(tv2, tv3); - TensorView* tv5 = add(tv1, tv4); - TensorView* tv6 = sub(tv5, tv0); - - fusion.addOutput(tv6); - - // Lets setup to actually run - while (tv6->nDims() > 1) - tv6->merge(0); - tv6->split(0, 128); - tv6->split(0, 4); - - tv0->computeAt(tv6, 1); - tv1->computeAt(tv6, 1); - tv2->computeAt(tv6, 1); - tv3->computeAt(tv6, 1); - - tv6->axis(0)->parallelize(ParallelType::BIDx); - - for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && - val->getValType().value() == ValType::TensorView) { - TensorView* tv = static_cast(val); - - tv->axis(1)->parallelize(ParallelType::Unroll); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({129, 127, 63, 65}, options); - at::Tensor t1 = at::rand_like(t0, options); - at::Tensor t2 = at::rand_like(t0, options); - at::Tensor t3 = at::rand_like(t0, options); - - auto t4 = t2.sub(t3); - auto t5 = t1.add(t4); - auto aten_output = t5.sub(t0); - - std::vector aten_inputs = {t0, t1, t2, t3}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeAt5_CUDA) { - // Case 5 - // tv2 = tv0 + 2.0 - // tv3 = tv1 * tv2 - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - TensorView* tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - TensorView* tv2 = add(tv0, new Double(2.0)); - TensorView* tv3 = mul(tv1, tv2); - fusion.addOutput(tv3); - - tv3->merge(0); - tv3->split(-1, 8); - tv3->split(-1, 4); - - tv2->computeAt(tv3, 1); - tv3->axis(0)->parallelize(ParallelType::BIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({63, 65}, options); - at::Tensor t1 = at::rand_like(t0, options); - - auto t2 = t0.add(2.0); - auto aten_output = t1.mul(t2); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs 
= fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeAt6_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - TensorView* tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - TensorView* tv2 = add(tv0, new Double(2.0)); - TensorView* tv3 = mul(tv1, tv2); - fusion.addOutput(tv3); - - tv2->merge(0); - tv2->split(-1, 8); - tv2->split(-1, 4); - tv3->merge(0); - tv3->split(-1, 8); - - tv2->computeAt(tv3, 1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({63, 65}, options); - at::Tensor t1 = at::rand_like(t0, options); - - auto t2 = t0.add(2.0); - auto aten_output = t1.mul(t2); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeAt7_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1.0)); - - auto tv2 = makeSymbolicTensor(1); - fusion.addInput(tv2); - - auto tv3 = add(tv2, new Double(3.0)); - - auto tv4 = add(tv1, tv3); - fusion.addOutput(tv4); - - auto tv5 = broadcast(tv1, {false, true}); - - auto tv6 = makeSymbolicTensor(2); - fusion.addInput(tv6); - - auto tv7 = mul(tv5, tv6); - - fusion.addOutput(tv7); - - tv7->split(1, 2); - tv7->merge(0); - tv7->split(0, 4); - tv7->split(0, 128); - - tv7->axis(0)->parallelize(ParallelType::BIDx); - tv7->axis(1)->parallelize(ParallelType::TIDx); - - tv0->computeAt(tv7, 1); - auto tv5_domain = tv5->domain()->domain(); - - // These computeAt transformations should not affect the TV5 domain - tv0->computeAt(tv4, -1); - tv2->computeAt(tv4, -1); - - auto tv5_domain_current = tv5->domain()->domain(); - TORCH_CHECK(tv5_domain == tv5_domain_current, "Invalid TV5 domain"); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - const int numel_x = 100; - const int numel_y = 200; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto t0 = at::randn({numel_x}, options); - auto t2 = at::randn({numel_x}, options); - auto t6 = at::randn({numel_x, numel_y}, options); - - auto t1 = t0.add(1.0); - auto t3 = t2.add(3.0); - auto t4 = t1.add(t3); - auto t5 = t1.unsqueeze(1); - auto t7 = t5.mul(t6); - - std::vector aten_inputs = {t0, t2, t6}; - std::vector aten_outputs = {t4, t7}; - - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeAt8_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1.0)); - - auto tv2 = makeSymbolicTensor(1); - fusion.addInput(tv2); - - auto tv3 = add(tv2, new Double(3.0)); - - auto tv4 = add(tv1, tv3); - fusion.addOutput(tv4); - - auto tv5 = broadcast(tv1, {false, true}); - - auto tv6 = makeSymbolicTensor(2); - fusion.addInput(tv6); - - auto tv7 = mul(tv5, tv6); - - fusion.addOutput(tv7); - - tv7->split(1, 2); - tv7->merge(0); - tv7->split(0, 128, false); - tv7->split(0, 4, false); - - tv7->axis(0)->parallelize(ParallelType::BIDx); - tv7->axis(1)->parallelize(ParallelType::TIDx); - - // Reverse 
computeAt structure from previous test - tv0->computeAt(tv4, -1); - tv2->computeAt(tv4, -1); - tv0->computeAt(tv7, -1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - const int numel_x = 100; - const int numel_y = 200; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto t0 = at::randn({numel_x}, options); - auto t2 = at::randn({numel_x}, options); - auto t6 = at::randn({numel_x, numel_y}, options); - - auto t1 = t0.add(1.0); - auto t3 = t2.add(3.0); - auto t4 = t1.add(t3); - auto t5 = t1.unsqueeze(1); - auto t7 = t5.mul(t6); - - std::vector aten_inputs = {t0, t2, t6}; - std::vector aten_outputs = {t4, t7}; - - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeWith1_CUDA) { - // Case 1 - // tv1 = tv0 * 0.5 - // tv2 = tv1 * -1 - // tv3 = tv1 + 3 - // tv4 = tv1 * 2 - // tv5 = tv3 + tv2 - // tv6 = tv5 + tv4 - // tv7 = tv1 + tv4 - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - TensorView* tv1 = mul(tv0, new Double(0.5)); - TensorView* tv2 = mul(tv1, new Double(-1.0)); - TensorView* tv3 = add(tv1, new Double(3.0)); - TensorView* tv4 = mul(tv1, new Double(2.0)); - TensorView* tv5 = add(tv3, tv2); - - TensorView* tv6 = add(tv5, tv4); - TensorView* tv7 = add(tv1, tv4); - - fusion.addOutput(tv6); - fusion.addOutput(tv7); - - // Lets setup to actually run - tv0->merge(0); - tv0->split(0, 128); - tv0->split(0, 4); - - tv0->axis(0)->parallelize(ParallelType::BIDx); - - tv0->computeWith(tv7, 1); - - GpuLower gpulw(&fusion); - - // The this-position of the last tensor should be zero. - TORCH_CHECK( - tv7->nDims() == 3 && tv7->getComputeAtPosition() == 0 && - tv7->getMaxProducerPosition() == 1); - TORCH_CHECK( - tv7->nDims() == 3 && tv6->getComputeAtPosition() == 0 && - tv6->getMaxProducerPosition() == 1); - - // The position of every other tensor should be 1. 
- for (auto tv : {tv1, tv2, tv3, tv4, tv5}) { - TORCH_CHECK(tv->nDims() == 3 && tv->getComputeAtPosition() == 1); - TORCH_CHECK(gpulw.caLoopMap().areMapped(tv7->axis(0), tv->axis(0))); - } - - for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && - val->getValType().value() == ValType::TensorView) { - TensorView* tv = static_cast(val); - tv->axis(1)->parallelize(ParallelType::Unroll); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn({129, 127}, options); - - auto t1 = aten_input.mul({0.5}); - auto t2 = t1.mul({-1.0}); - auto t3 = t1.add({3.0}); - auto t4 = t1.mul({2.0}); - auto t5 = t3.add(t2); - auto t6 = t5.add(t4); - auto t7 = t1.add(t4); - - std::vector aten_outputs = {t6, t7}; - std::vector cg_outputs = { - at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({aten_input}, cg_outputs); - - testValidate( - &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeWith2_CUDA) { - // Case 2 - // tv1 = tv0 * -1 - // tv2 = tv0 + 3 - // tv3 = tv0 * 2 - // tv4 = tv2 + tv1 - // tv5 = tv4 + tv3 - // tv6 = tv5 + tv3 - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - TensorView* tv1 = mul(tv0, new Double(-1.0)); - TensorView* tv2 = add(tv0, new Double(3.0)); - TensorView* tv3 = mul(tv0, new Double(2.0)); - TensorView* tv4 = add(tv2, tv1); - - TensorView* tv5 = add(tv4, tv3); - TensorView* tv6 = add(tv5, tv3); - - fusion.addOutput(tv5); - fusion.addOutput(tv6); - - // Lets setup to actually run - tv0->merge(0); - tv0->split(0, 128); - tv0->split(0, 4); - - tv0->axis(0)->parallelize(ParallelType::BIDx); - - tv0->computeWith(tv6, 1); - - for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && - val->getValType().value() == ValType::TensorView) { - TensorView* tv = static_cast(val); - - tv->axis(1)->parallelize(ParallelType::Unroll); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({129, 127}, options); - - auto t1 = input.mul({-1.0}); - auto t2 = input.add({3.0}); - auto t3 = input.mul({2.0}); - auto t4 = t2.add(t1); - auto t5 = t4.add(t3); - auto t6 = t5.add(t3); - - std::vector aten_outputs = {t5, t6}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({input}); - - testValidate(&fusion, cg_outputs, {input}, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeWith3_CUDA) { - // Case 3 - // T2 = T1 * 0.979361 - // T3 = T2 * T0 - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(4); - fusion.addInput(tv0); - - TensorView* tv1 = makeSymbolicTensor(4); - fusion.addInput(tv1); - - TensorView* tv2 = mul(tv1, new Double(.979361)); - TensorView* tv3 = mul(tv2, tv0); - - fusion.addOutput(tv3); - - // Lets setup to actually run - while (tv0->nDims() > 1) - tv0->merge(0); - tv0->split(0, 128); - tv0->split(0, 4); - - while (tv1->nDims() > 1) - tv1->merge(0); - tv1->split(0, 128); - tv1->split(0, 4); - - tv0->computeWith(tv3, 1); - tv1->computeWith(tv3, 1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - - for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && - val->getValType().value() == ValType::TensorView) { - TensorView* tv = 
static_cast(val); - - tv->axis(1)->parallelize(ParallelType::Unroll); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({129, 127, 63, 65}, options); - at::Tensor t1 = at::rand_like(t0, options); - - auto t2 = t1.mul({0.979361}); - auto aten_output = t2.mul(t0); - - std::vector aten_inputs = {t0, t1}; - - at::Tensor cg_output = at::empty_like(t0, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion(aten_inputs, {cg_output}); - - testValidate( - &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeWith4_CUDA) { - // Case 4 - // T4 = T2 - T3 - // T5 = T1 + T4 - // T6 = T5 - T0 - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(4); - fusion.addInput(tv0); - - TensorView* tv1 = makeSymbolicTensor(4); - fusion.addInput(tv1); - - TensorView* tv2 = makeSymbolicTensor(4); - fusion.addInput(tv2); - - TensorView* tv3 = makeSymbolicTensor(4); - fusion.addInput(tv3); - - TensorView* tv4 = sub(tv2, tv3); - TensorView* tv5 = add(tv1, tv4); - TensorView* tv6 = sub(tv5, tv0); - - fusion.addOutput(tv6); - std::vector tvs = {tv0, tv1, tv2}; - for (auto tv : tvs) { - // Lets setup to actually run - while (tv->nDims() > 1) { - tv->merge(0); - } - tv->split(0, 128); - tv->split(0, 4); - tv->computeWith(tv6, 1); - } - - tv6->axis(0)->parallelize(ParallelType::BIDx); - - for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && - val->getValType().value() == ValType::TensorView) { - TensorView* tv = static_cast(val); - - tv->axis(1)->parallelize(ParallelType::Unroll); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({129, 127, 63, 65}, options); - at::Tensor t1 = at::rand_like(t0, options); - at::Tensor t2 = at::rand_like(t0, options); - at::Tensor t3 = at::rand_like(t0, options); - - auto t4 = t2.sub(t3); - auto t5 = t1.add(t4); - auto aten_output = t5.sub(t0); - - std::vector aten_inputs = {t0, t1, t2, t3}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeWith5_CUDA) { - // Case 5 - // tv2 = tv0 + 2.0 - // tv3 = tv1 * tv2 - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - TensorView* tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - TensorView* tv2 = add(tv0, new Double(2.0)); - TensorView* tv3 = mul(tv1, tv2); - fusion.addOutput(tv3); - - tv2->merge(0); - tv2->split(-1, 8); - tv2->split(-1, 4); - - tv2->computeWith(tv3, 1); - tv3->axis(0)->parallelize(ParallelType::BIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({63, 65}, options); - at::Tensor t1 = at::rand_like(t0, options); - - auto t2 = t0.add(2.0); - auto aten_output = t1.mul(t2); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeWith6_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - 
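// Aside (hedged; not part of the original test): the ComputeWith variants in
// this group mirror the ComputeAt cases above; the difference is which tensor
// carries the scheduling. ComputeAt schedules the consumer and pulls producers
// in, e.g.
//
//   tv7->merge(0); tv7->split(0, 128); tv7->split(0, 4);
//   tv0->computeAt(tv7, 1);
//
// while ComputeWith schedules the producer and pushes it toward the consumer:
//
//   tv0->merge(0); tv0->split(0, 128); tv0->split(0, 4);
//   tv0->computeWith(tv7, 1);
//
// The paired tests assert that both variants end up with the same compute-at
// and max-producer positions.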
fusion.addInput(tv0); - TensorView* tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - TensorView* tv2 = add(tv0, new Double(2.0)); - TensorView* tv3 = mul(tv1, tv2); - fusion.addOutput(tv3); - - tv2->merge(0); - tv2->split(-1, 8); - tv2->split(-1, 4); - tv3->merge(0); - tv3->split(-1, 8); - - tv2->computeWith(tv3, 1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({63, 65}, options); - at::Tensor t1 = at::rand_like(t0, options); - - auto t2 = t0.add(2.0); - auto aten_output = t1.mul(t2); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionComputeAtMultiConsumers_CUDA) { - // tv1 = tv0 * 0.5 - // tv2 = tv1 * -1 - // tv3 = tv2 * -2 - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - TensorView* tv1 = mul(tv0, new Double(0.5)); - TensorView* tv2 = mul(tv1, new Double(-1.0)); - TensorView* tv3 = mul(tv1, new Double(-2.0)); - fusion.addOutput(tv2); - fusion.addOutput(tv3); - - // This computeAt will affect tv2 as well, even though tv2 is not in - // the data-flow path between tv1 and tv3. The reason is that tv1 is - // now computed at tv3, so tv2 must also be computed at the same - // location. Overall, what will happen is basically we merge - // expressions of all tensors and compute them in a single loop - // nest. - TensorView* computeAtTarget = tv3; - computeAtTarget->split(0, 128); - tv1->computeAt(computeAtTarget, 1); - - TensorView* affected_tensors[] = {tv1, tv2, tv3}; - for (auto tv : affected_tensors) { - TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); - } - - GpuLower gpulw(&fusion); - - TORCH_CHECK(tv1->getComputeAtPosition() == 1); - TORCH_CHECK( - tv2->getComputeAtPosition() == 0 && tv2->getMaxProducerPosition() == 1); - TORCH_CHECK( - tv3->getComputeAtPosition() == 0 && tv3->getMaxProducerPosition() == 1); - - // Note that tv2 is also computed at tv3. - for (auto tv : {tv1, tv2}) { - TORCH_CHECK( - gpulw.caLoopMap().areMapped(tv->axis(0), computeAtTarget->axis(0))); - } - - TORCH_CHECK(tv3->getComputeAtPosition() == 0); - - computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); - for (auto tv : affected_tensors) { - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn({1000}, options); - - auto t1 = aten_input * 0.5; - auto t2 = t1 * -1.0; - auto t3 = t1 * -2.0; - - std::vector aten_outputs = {t2, t3}; - - std::vector cg_outputs = { - at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({aten_input}, cg_outputs); - - testValidate( - &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); -} - -// Similar to ComputeAtMultiConsumers, but with a common consumer. 
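// Aside (hedged): as in the multi-consumer test above, computeAt on one branch
// also drags sibling consumers along, but the indirect propagation is expected
// to stop at the first tensor that consumes all affected branches. Sketching
// the first case below:
//
//   tv1 = tv0 * 0.5;  tv2 = tv1 * -1;  tv3 = tv1 * -2;
//   tv4 = tv2 + tv3;  tv5 = tv4 * 5;
//   tv1->computeAt(tv3, 1);
//   // expected: tv1, tv2, tv3 -> compute-at position 1
//   //           tv4, tv5      -> compute-at position 0 (propagation stops at
//   //                            the common consumer tv4)
//
// which is exactly what the checks in the test assert.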
-TEST(NVFuserTest, FusionComputeAtCommonConsumer1_CUDA) { - // tv1 = tv0 * 0.5 - // tv2 = tv1 * -1 - // tv3 = tv2 * -2 - // tv4 = tv2 + tv3 - // tv5 = tv4 * 5 - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - TensorView* tv1 = mul(tv0, new Double(0.5)); - TensorView* tv2 = mul(tv1, new Double(-1.0)); - TensorView* tv3 = mul(tv1, new Double(-2.0)); - TensorView* tv4 = add(tv2, tv3); - TensorView* tv5 = mul(tv4, new Double(5.0)); - fusion.addOutput(tv3); - fusion.addOutput(tv4); - fusion.addOutput(tv5); - - // Computing tv1 at tv3. This will affect tv2 as discussed in - // ComplexComputeAt1. Additionally, in this case, notice that tv4 is - // the common consumer of tv2 and tv3, so they are computed at - // tv4. The indirect propagation of the computeAt should stop at the - // common consumer, and no further change should occur. More - // specifically, the computeAT position of tv4 and tv5 should be zero. - TensorView* computeAtTarget = tv3; - computeAtTarget->split(0, 128); - tv1->computeAt(computeAtTarget, 1); - - TensorView* affected_tensors[] = {tv1, tv2, tv3, tv4}; - for (auto tv : affected_tensors) { - TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); - } - - TORCH_CHECK(tv1->getComputeAtPosition() == 1); - TORCH_CHECK(tv2->getComputeAtPosition() == 1); - TORCH_CHECK(tv3->getComputeAtPosition() == 1); - TORCH_CHECK(tv4->getComputeAtPosition() == 0); - TORCH_CHECK(tv5->getComputeAtPosition() == 0); - - computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); - - for (auto tv : affected_tensors) { - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - - // Transform tv5 to make it look like the rest - tv5->split(0, 128); - tv5->axis(1)->parallelize(ParallelType::TIDx); - tv5->axis(0)->parallelize(ParallelType::BIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn({1000}, options); - - auto t1 = aten_input * 0.5; - auto t2 = t1 * -1.0; - auto t3 = t1 * -2.0; - auto t4 = t2 + t3; - auto t5 = t4 * 5.0; - - std::vector aten_outputs = {t3, t4, t5}; - std::vector cg_outputs = { - at::empty_like(aten_input, options), - at::empty_like(aten_input, options), - at::empty_like(aten_input, options)}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({aten_input}, cg_outputs); - - testValidate( - &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionComputeAtCommonConsumer2_CUDA) { - // tv1 = tv0 * 0.5 - // tv2 = tv1 * -1 - // tv3 = tv2 * -1 - // tv4 = tv1 + 4 - // tv5 = tv3 + tv4 - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - TensorView* tv1 = mul(tv0, new Double(0.5)); - TensorView* tv2 = mul(tv1, new Double(-1.0)); - TensorView* tv3 = mul(tv2, new Double(-1.0)); - TensorView* tv4 = add(tv1, new Double(4.0)); - TensorView* tv5 = add(tv3, tv4); - - fusion.addOutput(tv5); - - TensorView* computeAtTarget = tv3; - - computeAtTarget->merge(0); - computeAtTarget->split(0, 128); - computeAtTarget->split(0, 4); - - computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); - - // This computeAt will affect all tensors including tv3, tv4 and - // tv5, even though it appears to impact only tv1 and tv2. The - // reason is that tv1 is now computed at tv3, so tv4 must also be - // computed at the same location. Similarly, the consumer of tv4, - // tv5, must also be computed at the same location. 
Overall, what - // will happen is basically we merge expressions of all tensors and - // compute them in a single loop nest. Internally, this will be - // realized by making all tensors, except for those in the path - // between tv1 and tv3, computed at tv5, which we call the common - // consumer. - tv1->computeAt(computeAtTarget, 1); - - // All tensors should have the same dimenionality as the target - for (Val* val : fusion.vals()) { - if (fusion.hasInput(val) || - val->getValType().value() != ValType::TensorView) { - continue; - } - TensorView* tv = val->as(); - TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); - if (tv == tv5) { - TORCH_CHECK(tv->getComputeAtPosition() == 0); - } else { - TORCH_CHECK(tv->getComputeAtPosition() == 1); - } - } - - for (auto tv : ir_utils::filterByType(fusion.vals())) { - if (!fusion.hasInput(tv)) { - tv->axis(1)->parallelize(ParallelType::Unroll); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn({129, 127}, options); - - auto t1 = aten_input.mul({0.5}); - auto t2 = t1.mul({-1.0}); - auto t3 = t2.mul({-1.0}); - auto t4 = t1.add({4.0}); - auto aten_output = t3 + t4; - - at::Tensor cg_output = at::empty_like(aten_input, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({aten_input}, {cg_output}); - - testValidate( - &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -// Similar to the above common consumer test but adds an additional -// tensor that has no common consumer with the other tensors. -TEST(NVFuserTest, FusionComputeAtCommonConsumer3_CUDA) { - // tv1 = tv0 * 0.5 - // tv2 = tv1 * -1 - // tv3 = tv2 * -1 - // tv4 = tv1 + 4 - // tv5 = tv2 + tv3 - // tv6 = tv1 + 6 - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - TensorView* tv1 = mul(tv0, new Double(0.5)); - TensorView* tv2 = mul(tv1, new Double(-1.0)); - TensorView* tv3 = mul(tv2, new Double(-1.0)); - TensorView* tv4 = add(tv1, new Double(4.0)); - TensorView* tv5 = add(tv3, tv4); - TensorView* tv6 = add(tv1, new Double(6.0)); - - fusion.addOutput(tv5); - fusion.addOutput(tv6); - - TensorView* computeAtTarget = tv3; - - computeAtTarget->merge(0); - computeAtTarget->split(0, 128); - computeAtTarget->split(0, 4); - - computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); - - // This will have the same impact on the tensors except for tv5 and - // tv6. tv6 does not have any common consumer with the computeAt - // target, but since it uses tv1, it must be also computed at the - // same location as the other impacted tensors. We can either make - // tv5 computed at tv6 or tv6 computed at tv5. In this case, tv5 - // should be computed at tv6 just because the current implementation - // orders the computeAt relationship based on the order in which - // tensors are specified as outputs. 
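// (In loop-nest terms, roughly: tv1 through tv4 end up inlined at position 1
// of the target's three-axis structure produced by the merge and splits above
// ([BIDx, 4, 128]), while the terminal outputs tv5 and tv6 keep a computeAt
// position of 0 and pick up a max producer position of 1. The checks after
// the computeAt below verify exactly this.)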
- - tv1->computeAt(computeAtTarget, 1); - - // All tensors should have the same dimenionality as the target - for (auto tv : ir_utils::filterByType(fusion.vals())) { - if (fusion.hasInput(tv)) { - continue; - } - TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); - if (tv == tv5 || tv == tv6) { - TORCH_CHECK(tv->getComputeAtPosition() == 0); - TORCH_CHECK(tv->getMaxProducerPosition() == 1); - } else { - TORCH_CHECK(tv->getComputeAtPosition() == 1); - } - } - - for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && - val->getValType().value() == ValType::TensorView) { - TensorView* tv = val->as(); - tv->axis(1)->parallelize(ParallelType::Unroll); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn({129, 127}, options); - - auto t1 = aten_input.mul({0.5}); - auto t2 = t1.mul({-1.0}); - auto t3 = t2.mul({-1.0}); - auto t4 = t1.add({4.0}); - auto t5 = t3 + t4; - auto t6 = t1.add({6.0}); - - std::vector aten_outputs = {t5, t6}; - std::vector cg_outputs = { - at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({aten_input}, cg_outputs); - - testValidate( - &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); -} - -// Similar to ComputeAtCommonConsumer1 but with an addtiona ltensor -// that does not have data dependency with the consumer. -TEST(NVFuserTest, FusionComputeAtNoCommonConsumer_CUDA) { - // tv1 = tv0 * 0.5 - // tv2 = tv1 * -1 - // tv3 = tv1 * -2 - // tv4 = tv2 + tv3 - // tv5 = tv4 * 5 - // tv6 = tv1 * 6 - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - TensorView* tv1 = mul(tv0, new Double(0.5)); - TensorView* tv2 = mul(tv1, new Double(-1.0)); - TensorView* tv3 = mul(tv1, new Double(-2.0)); - TensorView* tv4 = add(tv2, tv3); - TensorView* tv5 = mul(tv4, new Double(5.0)); - // Notice that tv6 is not a consumer of tv4. 
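// (Like tv5, tv6 therefore keeps a computeAt position of 0 after the
// computeAt below; it is still transformed to the target's dimensionality
// because it consumes tv1, which is computed at the target.)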
- TensorView* tv6 = mul(tv1, new Double(6.0)); - fusion.addOutput(tv3); - fusion.addOutput(tv4); - fusion.addOutput(tv5); - fusion.addOutput(tv6); - - TensorView* computeAtTarget = tv3; - computeAtTarget->split(0, 128); - tv1->computeAt(computeAtTarget, 1); - - TensorView* affected_tensors[] = {tv1, tv2, tv3, tv4, tv5, tv6}; - for (auto tv : affected_tensors) { - TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); - if (tv == tv6 || tv == tv5) { - TORCH_CHECK(tv->getComputeAtPosition() == 0); - } else { - TORCH_CHECK(tv->getComputeAtPosition() == 1); - } - } - - computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); - - for (auto tv : affected_tensors) { - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn({1000}, options); - - auto t1 = aten_input * 0.5; - auto t2 = t1 * -1.0; - auto t3 = t1 * -2.0; - auto t4 = t2 + t3; - auto t5 = t4 * 5.0; - auto t6 = t1 * 6.0; - - std::vector aten_outputs = {t3, t4, t5, t6}; - std::vector cg_outputs = { - at::empty_like(aten_input, options), - at::empty_like(aten_input, options), - at::empty_like(aten_input, options), - at::empty_like(aten_input, options)}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({aten_input}, cg_outputs); - - testValidate( - &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); -} - -namespace { - -void checkIdMapped( - ComputeAtRootDomainMap& root_map, - TensorView* v0, - IterDomain* id0, - TensorView* v1, - IterDomain* id1, - bool should_map) { - if (should_map) { - TORCH_CHECK( - root_map.canMap(v0->domain(), id0, v1->domain(), id1), - "Should be mappable: ", - id0, - " of ", - v0, - " and ", - id1, - " of ", - v1); - } else { - TORCH_CHECK( - !root_map.canMap(v0->domain(), id0, v1->domain(), id1), - "Should not be mappable: ", - id0, - " of ", - v0, - " and ", - id1, - " of ", - v1); - } -} - -void checkIdMapped( - TensorView* v0, - const std::vector& root0, - const std::vector should_map0, - TensorView* v1, - const std::vector& root1, - const std::vector should_map1) { - ComputeAtRootDomainMap map; - map.build(); - TORCH_INTERNAL_ASSERT(root0.size() == should_map0.size()); - TORCH_INTERNAL_ASSERT(root1.size() == should_map1.size()); - size_t idx0 = 0; - for (const auto i : c10::irange(root0.size())) { - size_t idx1 = 0; - for (const auto j : c10::irange(root1.size())) { - if (should_map0[i] && should_map1[j] && idx0 == idx1) { - checkIdMapped(map, v0, root0[i], v1, root1[j], true); - } else { - checkIdMapped(map, v0, root0[i], v1, root1[j], false); - } - if (should_map1[j]) - ++idx1; - } - if (should_map0[i]) - ++idx0; - } -} - -void checkIdMapped( - TensorView* v0, - const std::vector& root0, - TensorView* v1, - const std::vector& root1) { - checkIdMapped( - v0, - root0, - std::vector(root0.size(), true), - v1, - root1, - std::vector(root1.size(), true)); -} - -} // namespace - -TEST(NVFuserTest, FusionRootMappingBasic_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = makeSymbolicTensor(2); - - fusion.addInput(tv0); - fusion.addInput(tv1); - auto tv3 = broadcast(tv0, {true, false, false}); - auto tv4 = broadcast(tv1, {false, true, false}); - auto tv5 = add(tv3, tv4); - fusion.addOutput(tv5); - - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true, true}, - tv4, - tv4->getRootDomain(), - {false, true, true}); - checkIdMapped( - tv1, - tv1->getRootDomain(), - {true, true}, - tv4, - 
tv4->getRootDomain(), - {true, false, true}); - checkIdMapped( - tv0, - tv0->getRootDomain(), - {false, true}, - tv1, - tv1->getRootDomain(), - {false, true}); - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true, true}, - tv5, - tv5->getRootDomain(), - {false, true, true}); - checkIdMapped( - tv1, - tv1->getRootDomain(), - {true, true}, - tv5, - tv5->getRootDomain(), - {true, false, true}); - checkIdMapped(tv3, tv3->getRootDomain(), tv4, tv4->getRootDomain()); - checkIdMapped(tv3, tv3->getRootDomain(), tv5, tv5->getRootDomain()); - checkIdMapped(tv4, tv4->getRootDomain(), tv5, tv5->getRootDomain()); -} - -TEST(NVFuserTest, FusionRootMappingRfactor_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // [I,I] - TensorView* tv0 = makeSymbolicTensor(2); - // [I,I,I] - TensorView* tv1 = makeSymbolicTensor(3); - - //[I,I,R] - auto tv2 = sum(tv1, {2}); - auto tv3 = add(tv2, tv0); - - fusion.addInput(tv0); - fusion.addInput(tv1); - fusion.addOutput(tv3); - - // scheduling: - //[B,I,R0,R1=128], root = [B,I,R] - tv2->split(2, 128); - - // root=[B,I,Irf], rfactor=[B,I,Irf,Rrf] - auto tv4 = tv2->rFactor({3}); - - checkIdMapped(tv1, tv1->getRootDomain(), tv4, tv4->getRootDomain()); - checkIdMapped( - tv4, - tv4->getRFactorDomain(), - {true, true, true, false}, - tv2, - tv2->getRootDomain(), - {true, true, true}); - checkIdMapped( - tv1, - tv1->getRootDomain(), - {true, true, false}, - tv2, - tv2->getRootDomain(), - {true, true, false}); - checkIdMapped( - tv1, - tv1->getRootDomain(), - {true, true, false}, - tv3, - tv3->getRootDomain(), - {true, true}); - checkIdMapped( - tv2, - tv2->getRootDomain(), - {true, true, false}, - tv3, - tv3->getRootDomain(), - {true, true}); - checkIdMapped(tv0, tv0->getRootDomain(), tv3, tv3->getRootDomain()); - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true, true}, - tv1, - tv1->getRootDomain(), - {true, true, false}); - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true, true}, - tv2, - tv2->getRootDomain(), - {true, true, false}); - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true, true}, - tv4, - tv4->getRFactorDomain(), - {true, true, false, false}); - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true, true}, - tv4, - tv4->getRootDomain(), - {true, true, false}); -} - -TEST(NVFuserTest, FusionRootMappingReductionDependency1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - auto tv1 = sum(tv0, {1}); - auto tv2 = broadcast(tv1, {false, true}); - fusion.addOutput(tv2); - - // The second dimension cannot be mapped as it would require recomputation. 
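// (Concretely: tv1's second root domain is the reduction domain, and tv2's
// second domain is the broadcast that re-introduces that axis. Mapping the
// two would mean producing the fully reduced tv1 inside a loop over tv2's
// second axis, i.e. redoing the whole reduction for every broadcast element,
// so the checks below expect that pair to be unmappable.)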
- checkIdMapped(tv0, tv0->getRootDomain(), tv1, tv1->getRootDomain()); - checkIdMapped( - tv1, - tv1->getRootDomain(), - {true, false}, - tv2, - tv2->getRootDomain(), - {true, false}); - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true, false}, - tv2, - tv2->getRootDomain(), - {true, false}); -} - -TEST(NVFuserTest, FusionRootMappingReductionDependency2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - auto tv1 = sum(tv0, {1}); - auto tv2 = broadcast(tv1, {false, true}); - auto tv3 = add(tv0, tv2); - fusion.addOutput(tv3); - - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true, false}, - tv1, - tv1->getRootDomain(), - {true, false}); - checkIdMapped( - tv1, - tv1->getRootDomain(), - {true, false}, - tv2, - tv2->getRootDomain(), - {true, false}); - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true, false}, - tv3, - tv3->getRootDomain(), - {true, false}); - checkIdMapped(tv2, tv2->getRootDomain(), tv3, tv3->getRootDomain()); -} - -TEST(NVFuserTest, FusionRootMappingReductionDependency3_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - auto tv1 = sum(tv0, {1}); - auto tv2 = broadcast(tv1, {false, true}); - fusion.addOutput(tv2); - - tv1->split(-1, 4); - auto tv3 = tv1->rFactor({-2}); - - checkIdMapped(tv0, tv0->getRootDomain(), tv3, tv3->getRootDomain()); - checkIdMapped( - tv3, - tv3->getMaybeRFactorDomain(), - {true, false, true}, - tv1, - tv1->getRootDomain(), - {true, true}); - checkIdMapped( - tv1, - tv1->getRootDomain(), - {true, false}, - tv2, - tv2->getRootDomain(), - {true, false}); -} - -TEST(NVFuserTest, FusionRootMappingReductionDependency4_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - auto tv1 = sum(tv0, {1}); - auto tv2 = broadcast(tv1, {false, true}); - auto tv3 = add(tv0, tv2); - fusion.addOutput(tv3); - - tv1->split(-1, 4); - auto tv4 = tv1->rFactor({-2}); - - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true, false}, - tv4, - tv4->getRootDomain(), - {true, false}); - checkIdMapped( - tv4, - tv4->getMaybeRFactorDomain(), - {true, false, true}, - tv1, - tv1->getRootDomain(), - {true, true}); - checkIdMapped( - tv1, - tv1->getRootDomain(), - {true, false}, - tv2, - tv2->getRootDomain(), - {true, false}); - checkIdMapped(tv2, tv2->getRootDomain(), tv3, tv3->getRootDomain()); - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true, false}, - tv2, - tv2->getRootDomain(), - {true, false}); -} - -// Reproducer of issue #749 -TEST(NVFuserTest, FusionRootMappingReductionDependency5_CUDA_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = sum(tv1, {1}); - auto tv3 = broadcast(tv2, {false, true}); - auto tv4 = add(tv0, tv3); - auto tv5 = add(tv4, tv1); - fusion.addOutput(tv5); - - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true, false}, - tv1, - tv1->getRootDomain(), - {true, false}); - checkIdMapped( - tv1, - tv1->getRootDomain(), - {true, false}, - tv2, - tv2->getRootDomain(), - {true, false}); - checkIdMapped( - tv2, - tv2->getRootDomain(), - {true, false}, - tv3, - tv3->getRootDomain(), - {true, false}); - checkIdMapped( - tv3, - tv3->getRootDomain(), - {true, true}, - tv4, - tv4->getRootDomain(), - {true, true}); - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true, false}, - tv4, - tv4->getRootDomain(), - {true, false}); - checkIdMapped( - tv4, - tv4->getRootDomain(), - {true, true}, - tv5, - 
tv5->getRootDomain(), - {true, true}); -} - -// Similar to RootMappingReductionDependency5 but with rFactor -TEST(NVFuserTest, FusionRootMappingReductionDependency6_CUDA_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = sum(tv1, {1}); - auto tv3 = broadcast(tv2, {false, true}); - auto tv4 = add(tv0, tv3); - auto tv5 = add(tv4, tv1); - fusion.addOutput(tv5); - - tv2->split(1, 4); - auto tv6 = tv2->rFactor({-1}); - - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true, false}, - tv1, - tv1->getRootDomain(), - {true, false}); - checkIdMapped( - tv1, - tv1->getRootDomain(), - {true, false}, - tv6, - tv6->getRootDomain(), - {true, false}); - checkIdMapped( - tv6, - tv6->getMaybeRFactorDomain(), - {true, true, false}, - tv2, - tv2->getRootDomain(), - {true, true}); - checkIdMapped( - tv1, - tv1->getRootDomain(), - {true, false}, - tv2, - tv2->getRootDomain(), - {true, false}); - checkIdMapped( - tv2, - tv2->getRootDomain(), - {true, false}, - tv3, - tv3->getRootDomain(), - {true, false}); - checkIdMapped( - tv3, - tv3->getRootDomain(), - {true, true}, - tv4, - tv4->getRootDomain(), - {true, true}); - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true, false}, - tv4, - tv4->getRootDomain(), - {true, false}); - checkIdMapped( - tv4, - tv4->getRootDomain(), - {true, true}, - tv5, - tv5->getRootDomain(), - {true, true}); -} - -TEST(NVFuserTest, FusionRootMappingMultipleBroadcast_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(1); - auto tv1 = broadcast(tv0, {false, true}); - auto tv2 = broadcast(tv0, {true, false}); - auto tv3 = add(tv1, tv2); - fusion.addOutput(tv3); - - // tv0 cannot be mapped with the consumers as it would mean its only - // domain would be mapped to both the first and second domains of - // the two consumers, thus computing tv0 at both corresponding loops. - checkIdMapped( - tv0, - tv0->getRootDomain(), - {false}, - tv1, - tv1->getRootDomain(), - {false, false}); - checkIdMapped( - tv0, - tv0->getRootDomain(), - {false}, - tv2, - tv2->getRootDomain(), - {false, false}); - checkIdMapped(tv1, tv1->getRootDomain(), tv3, tv3->getRootDomain()); - checkIdMapped(tv2, tv2->getRootDomain(), tv3, tv3->getRootDomain()); - checkIdMapped( - tv0, - tv0->getRootDomain(), - {false}, - tv3, - tv3->getRootDomain(), - {false, false}); -} - -TEST(NVFuserTest, FusionRootMappingMultipleBroadcastWithNoCommonConsumer_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(1); - auto tv1 = broadcast(tv0, {false, true}); - auto tv2 = broadcast(tv0, {true, false}); - fusion.addOutput(tv1); - fusion.addOutput(tv2); - - // If there is no common consumer, there is no recomputation constraint. 
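// (Unlike the previous test, tv1 and tv2 are separate outputs with no common
// consumer, so mapping tv0's only root domain to tv1's first domain and to
// tv2's second domain does not force tv0 to be computed at two different
// loop positions at once; the checks below therefore expect those pairs to
// be mappable.)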
- checkIdMapped( - tv0, - tv0->getRootDomain(), - {true}, - tv1, - tv1->getRootDomain(), - {true, false}); - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true}, - tv2, - tv2->getRootDomain(), - {false, true}); - checkIdMapped( - tv1, - tv1->getRootDomain(), - {true, false}, - tv2, - tv2->getRootDomain(), - {false, true}); -} - -TEST(NVFuserTest, FusionRootMappingBroadcastNonUniqueSize_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - auto tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - auto tv2 = makeSymbolicTensor(2); - fusion.addInput(tv2); - auto tv3 = broadcast(tv0, {false, true}); - auto tv4 = add(tv1, tv3); - fusion.addOutput(tv4); - auto tv5 = add(tv2, tv3); - fusion.addOutput(tv5); - - // Broadcast domains can be used with multiple domains with - // different sizes. In this test, the broadcast domain of tv3 has - // two consumers, tv4 and tv5, which may have different sizes. Each - // of the consumers is used with the broadcast domain of tv3, but - // the two consumers may not have the same size, it is not possible - // to map those domains. - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true}, - tv3, - tv3->getRootDomain(), - {true, false}); - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true}, - tv1, - tv1->getRootDomain(), - {true, false}); - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true}, - tv2, - tv2->getRootDomain(), - {true, false}); - checkIdMapped( - tv1, - tv1->getRootDomain(), - {true, false}, - tv2, - tv2->getRootDomain(), - {true, false}); - checkIdMapped( - tv1, - tv1->getRootDomain(), - {true, false}, - tv3, - tv3->getRootDomain(), - {true, false}); - checkIdMapped( - tv2, - tv2->getRootDomain(), - {true, false}, - tv3, - tv3->getRootDomain(), - {true, false}); - checkIdMapped( - tv3, - tv3->getRootDomain(), - {true, false}, - tv4, - tv4->getRootDomain(), - {true, false}); - checkIdMapped( - tv3, - tv3->getRootDomain(), - {true, false}, - tv5, - tv5->getRootDomain(), - {true, false}); - checkIdMapped( - tv4, - tv4->getRootDomain(), - {true, false}, - tv5, - tv5->getRootDomain(), - {true, false}); -} - -TEST(NVFuserTest, FusionRootMappingBroadcast_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - // tv0[I0] - fusion.addInput(tv0); - auto tv1 = broadcast(tv0, {true, false}); - // tv1[B1, I0] - auto tv2 = broadcast(tv1, {true, false, false}); - // tv2[B2, B1, I0] - fusion.addOutput(tv2); - - // In this case, tv1 and tv2 has one and two broadcast domains, - // respectively. It is the second broadcast domain that is mapped to - // the broadcast of tv1. 
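// (That is, tv1's [B1, I0] lines up with the trailing two domains of tv2's
// [B2, B1, I0]; the new leading broadcast B2 of tv2 corresponds to nothing
// in tv1, which is why the expected mask below is {false, true, true} and
// not {true, false, true}.)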
- checkIdMapped( - tv0, - tv0->getRootDomain(), - {true}, - tv1, - tv1->getRootDomain(), - {false, true}); - checkIdMapped( - tv1, - tv1->getRootDomain(), - {true, true}, - tv2, - tv2->getRootDomain(), - {false, true, true}); // Not {true, false, true} - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true}, - tv2, - tv2->getRootDomain(), - {false, false, true}); -} - -// Reproducer of issue #723 -TEST(NVFuserTest, FusionRootMappingTrivialReduction_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - auto tv1 = makeSymbolicTensor(2); - - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = broadcast(tv0, {true, false}); - auto tv3 = sum(tv2, {0}); - auto tv4 = add(tv2, tv1); - - fusion.addOutput(tv3); - fusion.addOutput(tv4); - - ComputeAtRootDomainMap map; - map.build(); - - checkIdMapped( - map, tv2, tv2->getRootDomain()[0], tv4, tv4->getRootDomain()[0], true); - checkIdMapped( - map, tv2, tv2->getRootDomain()[0], tv3, tv3->getRootDomain()[0], true); - - tv2->computeAt(tv4, -1); - - const int x = 11; - const int y = 12; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({x}, options); - at::Tensor t1 = at::randn({y, x}, options); - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(aten_inputs); - - auto t3 = t0; - auto t4 = t0.unsqueeze(0).expand({y, x}) + t1; - - testValidate(&fusion, outputs, aten_inputs, {t3, t4}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionComputeAtFailDueToRootMapping_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = broadcast(tv1, {true, false}); - auto tv3 = broadcast(tv1, {false, true}); - auto tv4 = add(tv2, tv3); - fusion.addOutput(tv4); - - // computeAt should fail as there is no valid root mapping. 
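// (tv1's only root domain reaches tv4 along two conflicting paths: through
// tv2 = broadcast(tv1, {true, false}) it corresponds to tv4's second domain,
// while through tv3 = broadcast(tv1, {false, true}) it corresponds to tv4's
// first domain. A single domain cannot be mapped to two different domains of
// the same consumer, so the computeAt below is expected to throw.)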
- ASSERT_ANY_THROW(tv1->computeAt(tv4, 1)); -} - -TEST(NVFuserTest, FusionScalarInputs_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - TensorView* tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - - Double* d0 = new Double(); - fusion.addInput(d0); - Double* d1 = new Double(); - fusion.addInput(d1); - Double* d2 = new Double(); - fusion.addInput(d2); - Double* d3 = new Double(); - fusion.addInput(d3); - Val* d4 = mul(d0, d1); - Val* d5 = sub(d2, d3); - - TensorView* tv2 = sub(tv1, d4); - TensorView* tv3 = add(tv0, d5); - TensorView* tv4 = mul(tv3, tv2); - - fusion.addOutput(tv4); - - // Lets setup to actually run - while (tv4->nDims() > 1) - tv4->merge(0); - tv4->split(0, 128); - tv4->split(0, 4); - - tv0->computeAt(tv4, 1); - tv1->computeAt(tv4, 1); - - tv4->axis(0)->parallelize(ParallelType::BIDx); - - for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && - val->getValType().value() == ValType::TensorView) { - TensorView* tv = static_cast(val); - - tv->axis(1)->parallelize(ParallelType::Unroll); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - // d4 = d0 * d1 - // d5 = d2 - d3 - // t2 = t1 - d4 - // t3 = t0 + d5 - // t4 = t3 * t2 - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - float fl0 = 0.1; - float fl1 = -0.2; - float fl2 = 0.3; - float fl3 = -0.4; - float fl4 = fl0 * fl1; - float fl5 = fl2 - fl3; - - at::Tensor t0 = at::randn({129, 127}, options); - at::Tensor t1 = at::rand_like(t0, options); - - auto t2 = t1.sub(fl4); - auto t3 = t0.add(fl5); - auto aten_output = t3.mul(t2); - - at::Tensor cg_output = at::empty_like(t0, options); - - at::Scalar test(fl0); - - std::vector aten_inputs = { - t0, - t1, - at::Scalar(fl0), - at::Scalar(fl1), - at::Scalar(fl2), - at::Scalar(fl3)}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion(aten_inputs, {cg_output}); - - testValidate( - &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionLoopUnroll_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(3); - TensorView* tv1 = makeSymbolicTensor(3); - - // Register your inputs - fusion.addInput(tv0); - fusion.addInput(tv1); - - // Do math with it, it returns a `Val*` but can be static_casted back to - // TensorView - TensorView* tv2 = add(tv1, new Double(2.0)); - TensorView* tv3 = add(tv0, tv2); - - // Register your outputs - fusion.addOutput(tv3); - - int block_size = 16; - - tv3->merge(0, 1); - tv3->merge(0, 1); - - tv3->split(0, block_size); - tv3->split(0, 4); - - // For all inputs, computeAt the output inline, temporaries should be squeezed - // between them - tv0->computeAt(tv3, 1); - tv1->computeAt(tv3, 1); - - // Parallelize - tv2->axis(1)->parallelize(ParallelType::Unroll); - tv3->axis(1)->parallelize(ParallelType::Unroll); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(0)->parallelize(ParallelType::BIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor input0 = at::randn({129, 13, 3}, options); - at::Tensor input1 = at::randn({129, 13, 3}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({input0, input1}); - - TORCH_CHECK(outputs[0].equal(input0.add(input1.add(2.0)))); -} - -/* - * Helper function for single op testing that generates a codegen operand - */ - -Val* 
gen_jit_operand(std::pair desc) { - if (desc.first == ValType::TensorView) { - return makeSymbolicTensor(2, desc.second); - } else if (desc.first == ValType::Scalar) { - if (desc.second == DataType::Float) { - return new Double(); - } else if (desc.second == DataType::Double) { - return new Double(); - } else if (desc.second == DataType::Int) { - return new Int(); - } else { - TORCH_CHECK(false, "Not currently supported type: ", desc.first); - } - } else { - TORCH_CHECK(false, "Not currently supported type: ", desc.first); - } - return nullptr; -} - -/* - * Helper function for single op testing that generates an ATen operand - */ - -IValue gen_aten_operand( - std::pair desc, - int blocks, - int threads, - bool rand) { - if (desc.first == ValType::TensorView) { - if (desc.second == DataType::Double || desc.second == DataType::Float || - desc.second == DataType::Half || desc.second == DataType::BFloat16) { - auto options = at::TensorOptions() - .dtype(data_type_to_aten(desc.second)) - .device(at::kCUDA, 0); - if (rand) { - return IValue(at::rand({blocks, threads}, options)); - } else { - return IValue(at::empty({blocks, threads}, options)); - } - } else if (desc.second == DataType::Int || desc.second == DataType::Int32) { - auto dtype = desc.second == DataType::Int32 ? at::kInt : at::kLong; - if (rand) { - auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - return IValue(at::randn({blocks, threads}, options).mul(5).to(dtype)); - } else { - auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA, 0); - return IValue(at::empty({blocks, threads}, options)); - } - } else if (desc.second == DataType::Bool) { - if (rand) { - auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - return IValue( - at::rand({blocks, threads}, options).round().to(at::kBool)); - } else { - auto options = - at::TensorOptions().dtype(at::kBool).device(at::kCUDA, 0); - return IValue(at::empty({blocks, threads}, options)); - } - } else { - TORCH_CHECK(false, "Not currently supported type: ", desc.second) - } - } else if (desc.first == ValType::Scalar) { - // IValue scalars can only be double int64 or bool - if (desc.second == DataType::Double || desc.second == DataType::Float || - desc.second == DataType::Half || desc.second == DataType::BFloat16) { - return IValue(at::Scalar(1.f)); - } else if (desc.second == DataType::Int) { - return IValue(at::Scalar(1)); - } else { - TORCH_CHECK(false, "Not currently supported type: ", desc.first); - } - } else { - TORCH_CHECK(false, "Not currently supported type: ", desc.first); - } - return nullptr; -} - -/* - * Templatized Helper Function To generate single Op comparison between the - * JIT codegen for Cuda and the ATen Library. - */ - -using OutputPair = std::pair; -template < - typename AtenFunc, - typename JitFunc, - typename InputTuple, - size_t... 
NumInputs> -void test_op( - int blocks, - int threads, - std::string op_str, - AtenFunc af, - JitFunc jf, - OutputPair op, - InputTuple it, - std::index_sequence) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Generate Input JIT function Inputs and add them as Inputs to the Fusion - // Graph - std::array jit_inputs = { - gen_jit_operand(std::get(it))...}; - std::for_each(jit_inputs.begin(), jit_inputs.end(), [&fusion](Val* v) { - fusion.addInput(v); - }); - TensorView* out = - static_cast(jf(std::get(jit_inputs)...)); - fusion.addOutput(out); - - std::for_each(jit_inputs.begin(), jit_inputs.end(), [out](Val* v) { - if (v->getValType() == ValType::TensorView) - static_cast(v)->computeAt(out, -1); - }); - out->axis(0)->parallelize(ParallelType::BIDx); - out->axis(-1)->parallelize(ParallelType::TIDx); - - std::array aten_inputs = {gen_aten_operand( - std::get(it), blocks, threads, /*rand*/ true)...}; - const at::ArrayRef aten_inputs_ivalues(aten_inputs); - - at::Tensor cg_output = - gen_aten_operand(op, blocks, threads, /*rand*/ false).toTensor(); - std::vector output_vect = {cg_output}; - cudaDeviceSynchronize(); - if (fusion.isStochastic()) - at::manual_seed(0); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion(aten_inputs_ivalues, output_vect); - cudaDeviceSynchronize(); - - if (fusion.isStochastic()) - at::manual_seed(0); - at::Tensor aten_output = af(aten_inputs); - cudaDeviceSynchronize(); // This sync shouldn't be necessary; - - std::string op_msg = "Operation " + op_str; - - testValidate( - &fusion, - {cg_output}, - aten_inputs, - {aten_output}, - __LINE__, - __FILE__, - op_msg); -} - -/* - * Templatized Helper Function that uses variadic templates to - * process a variable length Input Tuple of different Operand Type. - */ -template -void test_op( - int blocks, - int threads, - std::string op_str, - AtenFunc af, - JitFunc jf, - OutputPair op, - InputTuple it) { - static constexpr auto size = std::tuple_size::value; - test_op( - blocks, - threads, - op_str, - af, - jf, - op, - it, - std::make_index_sequence{}); -} - -TEST(NVFuserTest, FusionUnaryOps_CUDA) { - using OpTuple = - std::tuple; - - // [Note: explicit tuple type for uniform initialization list] - // Tuple type must be explicitly specified for each uniform initialization - // list within the vector to make this code compatible with some old env - // which we still need to support. eg. gcc 5.4 + cuda 9.2. 
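// (Concretely, each entry below is written as
// OpTuple{at::abs, UnaryOpType::Abs, "abs"} rather than as the bare braced
// list {at::abs, UnaryOpType::Abs, "abs"}, so every initializer carries an
// explicit type that those older toolchains can accept.)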
- std::vector ops{ - OpTuple{at::abs, UnaryOpType::Abs, "abs"}, - OpTuple{at::acos, UnaryOpType::Acos, "acos"}, - OpTuple{at::asin, UnaryOpType::Asin, "asin"}, - OpTuple{at::atan, UnaryOpType::Atan, "atan"}, - // There does not appear to be an appropriate ATen function for atanh - // OpTuple{at::atanh, UnaryOpType::Atanh, "atanh" }, - OpTuple{at::ceil, UnaryOpType::Ceil, "ceil"}, - OpTuple{at::cos, UnaryOpType::Cos, "cos"}, - OpTuple{at::cosh, UnaryOpType::Cosh, "cosh"}, - OpTuple{at::erf, UnaryOpType::Erf, "erf"}, - OpTuple{at::erfc, UnaryOpType::Erfc, "erfc"}, - OpTuple{at::exp, UnaryOpType::Exp, "exp"}, - OpTuple{at::expm1, UnaryOpType::Expm1, "expm1"}, - OpTuple{at::floor, UnaryOpType::Floor, "floor"}, - OpTuple{at::frac, UnaryOpType::Frac, "frac"}, - // OpTuple{at::gelu, UnaryOpType::Gelu, "gelu"}, - OpTuple{at::lgamma, UnaryOpType::Lgamma, "lgamma"}, - OpTuple{at::log, UnaryOpType::Log, "log"}, - OpTuple{at::log10, UnaryOpType::Log10, "log10"}, - OpTuple{at::log1p, UnaryOpType::Log1p, "log1p"}, - OpTuple{at::log2, UnaryOpType::Log2, "log2"}, - OpTuple{at::neg, UnaryOpType::Neg, "neg"}, - OpTuple{at::reciprocal, UnaryOpType::Reciprocal, "reciprocal"}, - OpTuple{at::relu, UnaryOpType::Relu, "relu"}, - OpTuple{at::round, UnaryOpType::Round, "round"}, - OpTuple{at::rsqrt, UnaryOpType::Rsqrt, "rsqrt"}, - OpTuple{at::sigmoid, UnaryOpType::Sigmoid, "sigmoid"}, - OpTuple{at::sin, UnaryOpType::Sin, "sin"}, - OpTuple{at::sinh, UnaryOpType::Sinh, "sinh"}, - OpTuple{at::sqrt, UnaryOpType::Sqrt, "sqrt"}, - OpTuple{at::tan, UnaryOpType::Tan, "tan"}, - OpTuple{at::tanh, UnaryOpType::Tanh, "tanh"}, - OpTuple{at::trunc, UnaryOpType::Trunc, "trunc"}}; - - std::vector dtypes = {DataType::Float, DataType::Double}; - - for (auto dtype : dtypes) { - std::for_each(ops.begin(), ops.end(), [&](OpTuple& op) { - test_op( - /*blocks*/ 640, - /*threads*/ 64, - /*name*/ std::get<2>(op), - /*Aten Func */ - [&op](std::array& vals) { - return std::get<0>(op)(vals[0].toTensor()); - }, - /*JIT Func */ - [&op](Val* in1) -> Val* { return unaryOp(std::get<1>(op), in1); }, - /*Output */ std::make_pair(ValType::TensorView, dtype), - /*Inputs Tuple*/ - std::make_tuple(std::make_pair(ValType::TensorView, dtype))); - }); - - test_op( - /*blocks*/ 128, - /*threads*/ 64, - /*name*/ "rand_like", - /*Aten Func */ - [](std::array& vals) { - return at::rand_like(vals[0].toTensor()); - }, - /*JIT Func */ - [](Val* in1) -> Val* { return unaryOp(UnaryOpType::RandLike, in1); }, - /*Output */ std::make_pair(ValType::TensorView, dtype), - /*Inputs Tuple*/ - std::make_tuple(std::make_pair(ValType::TensorView, dtype))); - } - - dtypes = {DataType::Int, DataType::Int32, DataType::Bool}; - for (auto dtype : dtypes) { - test_op( - /*blocks*/ 128, - /*threads*/ 64, - /*name*/ "bitwise_not", - /*Aten Func */ - [](std::array& vals) { - return at::bitwise_not(vals[0].toTensor()); - }, - /*JIT Func */ - [](Val* in1) -> Val* { return unaryOp(UnaryOpType::Not, in1); }, - /*Output */ std::make_pair(ValType::TensorView, dtype), - /*Inputs Tuple*/ - std::make_tuple(std::make_pair(ValType::TensorView, dtype))); - } -} - -TEST(NVFuserTest, FusionBinaryOps_CUDA) { - using AtenFuncSig = at::Tensor (*)(const at::Tensor&, const at::Tensor&); - using OpTuple = std::tuple; - - // see [Note: explicit tuple type for uniform initialization list] - std::vector logic_ops{ - OpTuple{at::eq, BinaryOpType::Eq, "eq"}, - OpTuple{at::ge, BinaryOpType::GE, "ge"}, - OpTuple{at::gt, BinaryOpType::GT, "gt"}, - OpTuple{at::le, BinaryOpType::LE, "le"}, - OpTuple{at::lt, 
BinaryOpType::LT, "lt"}, - OpTuple{at::ne, BinaryOpType::NE, "ne"}}; - std::vector dtypes = {DataType::Double, DataType::Float}; - - for (auto dtype : dtypes) { - std::for_each(logic_ops.begin(), logic_ops.end(), [&](OpTuple& op) { - test_op( - /*blocks*/ 640, - /*threads*/ 64, - /*name*/ std::get<2>(op), - /*Aten Func */ - [&op](std::array& vals) { - return std::get<0>(op)(vals[0].toTensor(), vals[1].toTensor()); - }, - /*JIT Func */ - [&op](Val* in1, Val* in2) -> Val* { - return binaryOp(std::get<1>(op), in1, in2); - }, - /*Output */ std::make_pair(ValType::TensorView, DataType::Bool), - /*Inputs Tuple*/ - std::make_tuple( - std::make_pair(ValType::TensorView, dtype), - std::make_pair(ValType::TensorView, dtype))); - }); - - // see [Note: explicit tuple type for uniform initialization list] - std::vector math_ops{ - OpTuple{at::atan2, BinaryOpType::Atan2, "atan2"}, - OpTuple{at::div, BinaryOpType::Div, "div"}, - OpTuple{at::fmod, BinaryOpType::Fmod, "fmod"}, - OpTuple{at::max, BinaryOpType::Max, "max"}, - OpTuple{at::min, BinaryOpType::Min, "min"}, - OpTuple{at::mul, BinaryOpType::Mul, "mul"}, - OpTuple{at::pow, BinaryOpType::Pow, "pow"}, - // NOTE: Remainder does not match the Aten impl exactly - // despite using an identical function. - OpTuple{at::remainder, BinaryOpType::Remainder, "remainder"}, - }; - - std::for_each(math_ops.begin(), math_ops.end(), [&](OpTuple& op) { - test_op( - /*blocks*/ 640, - /*threads*/ 64, - /*name*/ std::get<2>(op), - /*Aten Func */ - [&op](std::array& vals) { - return std::get<0>(op)(vals[0].toTensor(), vals[1].toTensor()); - }, - /*JIT Func */ - [&op](Val* in1, Val* in2) -> Val* { - return binaryOp(std::get<1>(op), in1, in2); - }, - /*Output */ std::make_pair(ValType::TensorView, dtype), - /*Inputs Tuple*/ - std::make_tuple( - std::make_pair(ValType::TensorView, dtype), - std::make_pair(ValType::TensorView, dtype))); - }); - - test_op( - /*blocks*/ 640, - /*threads*/ 64, - /*name*/ "add_alpha", - /*Aten Func */ - [](std::array& vals) { - return at::add( - vals[0].toTensor(), vals[1].toTensor(), vals[2].toScalar()); - }, - /*JIT Func */ static_cast(&add_alpha), - /*Output */ std::make_pair(ValType::TensorView, dtype), - /*Inputs Tuple*/ - std::make_tuple( - std::make_pair(ValType::TensorView, dtype), - std::make_pair(ValType::TensorView, dtype), - std::make_pair(ValType::Scalar, dtype))); - - test_op( - /*blocks*/ 640, - /*threads*/ 64, - /*name*/ "sub_alpha", - /*Aten Func */ - [](std::array& vals) { - return at::sub( - vals[0].toTensor(), vals[1].toTensor(), vals[2].toScalar()); - }, - /*JIT Func */ static_cast(&sub_alpha), - /*Output */ std::make_pair(ValType::TensorView, dtype), - /*Inputs Tuple*/ - std::make_tuple( - std::make_pair(ValType::TensorView, dtype), - std::make_pair(ValType::TensorView, dtype), - std::make_pair(ValType::Scalar, dtype))); - } -} - -TEST(NVFuserTest, FusionTernaryOps_CUDA) { - std::vector dtypes = {DataType::Double, DataType::Float}; - - for (auto dtype : dtypes) { - test_op( - /*blocks*/ 640, - /*threads*/ 64, - /*name*/ "clamp", - /*Aten Func */ - [](std::array& vals) { - return at::clamp(vals[0].toTensor(), 0.f, 1.f); - }, - /*JIT Func */ - [&](Val* in1) -> Val* { - if (dtype == DataType::Float) { - return clamp(in1, new Double(0.f), new Double(1.f)); - } else { - return clamp(in1, new Double(0.f), new Double(1.f)); - } - }, - /*Output */ std::make_pair(ValType::TensorView, dtype), - /*Inputs Tuple*/ - std::make_tuple(std::make_pair(ValType::TensorView, dtype))); - test_op( - /*blocks*/ 640, - /*threads*/ 64, - /*name*/ 
"threshold", - /*Aten Func */ - [](std::array& vals) { - return at::threshold(vals[0].toTensor(), 0.f, 1.f); - }, - /*JIT Func */ - [&](Val* in1) -> Val* { - if (dtype == DataType::Float) { - return threshold(in1, new Double(0.f), new Double(1.f)); - } else { - return threshold(in1, new Double(0.f), new Double(1.f)); - } - }, - /*Output */ std::make_pair(ValType::TensorView, dtype), - /*Inputs Tuple*/ - std::make_tuple(std::make_pair(ValType::TensorView, dtype))); - test_op( - /*blocks*/ 640, - /*threads*/ 64, - /*name*/ "where", - /*Aten Func */ - [](std::array& vals) { - return at::where( - vals[0].toTensor(), vals[1].toTensor(), vals[2].toTensor()); - }, - /*JIT Func */ static_cast(&where), - /*Output */ std::make_pair(ValType::TensorView, dtype), - /*Inputs Tuple*/ - std::make_tuple( - std::make_pair(ValType::TensorView, DataType::Bool), - std::make_pair(ValType::TensorView, dtype), - std::make_pair(ValType::TensorView, dtype))); - } -} - -TEST(NVFuserTest, FusionCompoundOps_CUDA) { - std::vector dtypes = {DataType::Double, DataType::Float}; - - for (auto dtype : dtypes) { - test_op( - /*blocks*/ 640, - /*threads*/ 64, - /*name*/ "lerp", - /*Aten Func */ - [](std::array& vals) { - return at::lerp( - vals[0].toTensor(), vals[1].toTensor(), vals[2].toTensor()); - }, - /*JIT Func */ static_cast(&lerp), - /*Output */ std::make_pair(ValType::TensorView, dtype), - /*Inputs Tuple*/ - std::make_tuple( - std::make_pair(ValType::TensorView, dtype), - std::make_pair(ValType::TensorView, dtype), - std::make_pair(ValType::TensorView, dtype))); - test_op( - /*blocks*/ 640, - /*threads*/ 64, - /*name*/ "addcmul", - /*Aten Func */ - [](std::array& vals) { - return at::addcmul( - vals[0].toTensor(), - vals[1].toTensor(), - vals[2].toTensor(), - vals[3].toScalar()); - }, - /*JIT Func */ - static_cast(&addcmul), - /*Output */ std::make_pair(ValType::TensorView, dtype), - /*Inputs Tuple*/ - std::make_tuple( - std::make_pair(ValType::TensorView, dtype), - std::make_pair(ValType::TensorView, dtype), - std::make_pair(ValType::TensorView, dtype), - std::make_pair(ValType::Scalar, dtype))); - } -} - -TEST(NVFuserTest, FusionCastOps_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2, DataType::Half); - - TensorView* intrm1 = castOp(DataType::Float, tv0); - TensorView* out = castOp(DataType::Half, intrm1); - - fusion.addInput(tv0); - fusion.addOutput(out); - tv0->computeAt(out, -1); - - out->axis(0)->parallelize(ParallelType::BIDx); - out->axis(-1)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); - - at::Tensor input1 = at::randn({1, 4}, options); - at::Tensor ref_output = at::empty_like(input1); - - std::array inputs = {input1}; - const at::ArrayRef input_ivalues(inputs); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(input_ivalues); - - ref_output = at::_cast_Half(at::_cast_Double(input1)); - - TORCH_CHECK( - outputs[0].equal(ref_output), - "\nOp Type: -- ", - "cast FP16->FP32->FP16", - " -- had a mismatch.\n", - "\nABS MAX DIFF: ", - outputs[0].sub(ref_output).abs().max(), - "\n"); -} - -// Start off simple, block on the outer dim -// block stride + thread all reduce + unrolling on inner dim -TEST(NVFuserTest, FusionReduction1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - // tv1[I0, R1] = tv0[I0, I1] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, 
new Double(0), tv0); - fusion.addOutput(tv1); - - TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion."); - - tv1->split(1, 128); - // tv1[I0, R1o, R1i{128}] = tv0[I0, I1] - tv1->split(1, 4); - // tv1[I0, R1oo, R1oi{4}, R1i{128}] = tv0[I0, I1] - - TensorView* tv2 = tv1->rFactor({1}); - // tv2[I0, R1oo, Ir1oi{4}, Ir1i{128}] = tv0[I0, I1] - // tv1[I0, R1oi{4}, R1i{128}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{128}] - - TensorView* tv3 = tv1->rFactor({1}); - // tv2[I0, R1oo, Ir1oi{4}, Ir1i{128}] = tv0[I0, I1] - // tv3[I0, R1oi{4}, Ir1i{128}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{128}] - // tv1[I0, R1i{128}] = tv3[I0, R1oi{4}, Ir1i{128}] - - // Incrementally, can print in between for debugging - tv0->computeAt(tv2, 1); - tv2->computeAt(tv3, 1); - tv3->computeAt(tv1, 1); - - // Re do it all at once, because why not. - tv0->computeAt(tv1, 1); - - tv2->axis(2)->parallelize(ParallelType::Unroll); - tv1->axis(0)->parallelize(ParallelType::BIDx); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - int numel_x = 65000; - int numel_y = 1025; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x, numel_y}, options); - at::Tensor cg_output = at::empty({numel_x}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({input}, {cg_output}); - - auto aten_output = input.to(at::kDouble).sum({1}); - - testValidate( - &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionReduction2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - // tv1[I0, R1] = tv0[I0, I1] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); - - fusion.addOutput(tv1); - - // switches to try some different scenarios. maybe we should iterate on all - // permutations. 
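// (Each flag below toggles one binding on the scheduled fusion: bind_bidx
// puts BIDx on tv1's outer iteration axis, bind_tidy puts TIDy on its inner
// split, bind_unroll applies Unroll to tv2's unroll-factor split, and
// bind_tidx puts TIDx on the innermost axis of tv1, tv2 and tv3. All are
// enabled here; disabling any of them exercises a different parallelization
// of the same fusion.)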
- bool bind_bidx = true; - bool bind_tidx = true; - bool bind_tidy = true; - bool bind_unroll = true; - - int numel_x = 1025; // Cannot exceed block dim max size / tidy - int numel_y = 129; - int tidx = 16; - int tidy = 8; - int unroll_factor = 4; - - tv1->split(1, tidx); - // tv1[I0, R1o, R1i{tidx}] = tv0[I0, I1] - - tv1->split(1, unroll_factor); - // tv1[I0, R1oo, R1oi{unroll}, R1i{tidx}] = tv0[I0, I1] - - tv1->split(0, tidy); - - TensorView* tv2 = tv1->rFactor({-3}); - // tv2[I0, >R1oo<, Ir1oi{unroll}, Ir1i{tidx}] - // tv1[I0o, I0i{tidy}, R1oi{unroll}, R1i{tidx}] - - TensorView* tv3 = tv1->rFactor({-2}); - // tv2[I0, >R1oo<, Ir1oi{unroll}, Ir1i{tidx}] - // tv3[I0, R1oi{unroll}, Ir1i{tidx}] - // tv1[I0o, I0i{tidy}, R1i{tidx}] - - tv0->computeAt(tv1, -2); - - if (bind_unroll) - tv2->axis(-2)->parallelize(ParallelType::Unroll); - if (bind_bidx) - tv1->axis(0)->parallelize(ParallelType::BIDx); - if (bind_tidy) - tv1->axis(1)->parallelize(ParallelType::TIDy); - - if (bind_tidx) { - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x, numel_y}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({input}); - - auto aten_output = input.to(at::kDouble).sum({1}); - testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionReduction3_CUDA) { - // What if Z participates in the reduction with X? - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - // tv1[I0, R1] = tv0[I0, I1] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); - - fusion.addOutput(tv1); - - int numel_x = 1025; // Cannot exceed block dim max size / tidy - int numel_y = 129; - int tidx = 16; - int tidz = 8; - - tv1->split(1, tidz); - // tv1[I0, R1o, R1i{tidz}] = tv0[I0, I1] - - tv1->split(1, tidx); - // tv1[I0, R1oo, R1oi{tidx}, R1i{tidz}] = tv0[I0, I1] - - TensorView* tv2 = tv1->rFactor({-3}); - // tv2[I0, >R1oo<, Ir1oi{tidx}, Ir1i{tidz}] - // tv1[I0o, R1oi{tidx}, R1i{tidz}] - - tv0->computeAt(tv1, -3); - - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(-2)->parallelize(ParallelType::TIDx); - tv1->axis(-1)->parallelize(ParallelType::TIDz); - - tv2->axis(-2)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDz); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({numel_x, numel_y}, options); - at::Tensor cg_output = at::empty({numel_x}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({aten_input}, {cg_output}); - - auto aten_output = aten_input.to(at::kDouble).sum({1}); - - testValidate( - &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionReduction4_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = makeSymbolicTensor(2); - - TensorView* tv2 = add(tv0, tv1); - // tv2[I0, I1] = tv0[I0, I1] + tv1[I0, I1] - - fusion.addInput(tv0); - fusion.addInput(tv1); - - TensorView* tv3 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv2); - // tv3[I0, R1] = tv2[I0, I1] - - TensorView* tv4 = makeSymbolicTensor(1); - 
fusion.addInput(tv4); - - // tv5[I0] = tv3[I0, R1] * tv4[I0] - TensorView* tv5 = mul(tv3, tv4); - fusion.addOutput(tv5); - - int tidx = 16; - - // RFactor the reduction - tv3->split(1, tidx); - // tv3[I0, R1o, R1i{tidx}] = tv2[I0, I1] - - TensorView* tv6 = tv3->rFactor({-2}); - // tv6[I0, R1o, iR1i{tidx}] = tv2[I0, I1] - // tv3[I0, R1i{tidx}] = tv3[I0, I1] - tv2->computeAt(tv6, 2); - - // Compute at inline with tv5 (only 1D) - tv6->computeAt(tv3, 1); - tv3->computeAt(tv5, 1); - - tv5->axis(0)->parallelize(ParallelType::BIDx); - - // Intermediate tensors only need this, but doesn't hurt to do on inputs - // tv0, 1, 4 - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv6->axis(-1)->parallelize(ParallelType::TIDx); - - int numel_x = 1025; - int numel_y = 129; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - at::Tensor t1 = at::randn({numel_x, numel_y}, options); - at::Tensor t4 = at::randn({numel_x}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({t0, t1, t4}); - - auto t2 = t0.add(t1); - auto t3 = t2.to(at::kDouble).sum({1}); - auto aten_output = t3.mul(t4); - - testValidate( - &fusion, cg_outputs, {t0, t1, t4}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionReduction5_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(3); - - fusion.addInput(tv0); - - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); - - fusion.addOutput(tv1); - - int bidy = 2; - int tidy = 4; - int tidx = 5; - - int dim1 = 11; - - tv1->split(-2, tidy); - - TensorView* tv2 = tv1->rFactor({-3}); - - tv0->computeAt(tv1, 1); - tv1->axis(0)->parallelize(ParallelType::BIDy); - - for (auto* val : fusion.vals()) { - if (!fusion.hasInput(val) && - val->getValType().value() == ValType::TensorView) { - val->as()->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - tv2->axis(-2)->parallelize(ParallelType::TIDy); - tv1->axis(-2)->parallelize(ParallelType::TIDy); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({bidy, dim1, tidx}, options); - - at::Tensor cg_output = at::empty({bidy, tidx}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({input}, {cg_output}); - - auto aten_output = input.to(at::kDouble).sum({1}); - testValidate( - &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionReduction6_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int bdimx = 64; - const int bdimy = 8; - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(3); - fusion.addInput(tv0); - - // tv1[I0, R1, R2] = tv0[I0, I1, I2] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1, 2}, new Double(0), tv0); - fusion.addOutput(tv1); - - TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion."); - - tv1->split(2, bdimx); - // tv1[I0, R1, R2o, R2i{128}] = tv0[I0, I1, I2] - tv1->split(1, bdimy); - // tv1[I0, R1o, R1i{8}, R2o, R2i{128}] = tv0[I0, I1, I2] - - TensorView* tv2 = tv1->rFactor({3}); - // tv2[I0, I1o, I1i{8}, R2o, I2i{128}] = tv0[I0, I1, I2] - // tv1[I0, R1o, R1i{8}, R2i{128}] = tv2[I0, I1o, I1i{8}, R2o, I2i{128}] - - TensorView* tv3 = tv1->rFactor({1}); - // tv2[I0, I1o, I1i{8}, R2o, I2i{128}] = tv0[I0, I1, I2] - // tv3[I0, R1o, I1i{8}, I2i{128}] = tv2[I0, I1o, 
I1i{8}, R2o, I2i{128}] - // tv1[I0, R1i{8}, R2i{128}] = tv3[I0, R1o, I1i{8}, I2i{128}] - - tv3->computeAt(tv1, 1); - tv2->computeAt(tv3, 2); - - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv3->axis(0)->parallelize(ParallelType::BIDx); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - tv1->axis(-2)->parallelize(ParallelType::TIDy); - tv3->axis(-2)->parallelize(ParallelType::TIDy); - tv2->axis(-3)->parallelize(ParallelType::TIDy); - - int numel_x = 650; - int numel_y = 1000; - int numel_z = 4; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x, numel_y, numel_z}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({input}); - - auto aten_output = input.to(at::kDouble).sum({1, 2}); - testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionMultiGridReduction_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - TensorView* tv1 = max(tv0, {0}); - TensorView* tv2 = sum(tv0, {0}); - - fusion.addOutput(tv1); - fusion.addOutput(tv2); - - int numel_x = 4; - int numel_y = 2; - - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(1)->parallelize(ParallelType::TIDx); - - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(1)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x, numel_y}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({input}); - - std::vector aten_outputs = { - std::get<0>(input.to(at::kDouble).max(0)), input.to(at::kDouble).sum(0)}; - testValidate(&fusion, cg_outputs, {input}, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionMultiGridReduction2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = sum(tv0, {0}); - auto tv2 = sum(tv1, {0}); - fusion.addOutput(tv2); - - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(1)->parallelize(ParallelType::BIDy); - tv2->axis(0)->parallelize(ParallelType::BIDy); - - FusionExecutor fe; - ASSERT_ANY_THROW(fe.compileFusion(&fusion)); -} - -TEST(NVFuserTest, FusionReductionTFT_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - // tv1[I0, R1] = tv0[I0, I1] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); - - fusion.addOutput(tv1); - - int numel_x = 1025; - int numel_y = 129; - int tidx = 16; - int tidy = 8; - int tidz = 8; - - tv1->split(1, tidx); - // tv1[I0, R1o, R1i{tidx}] - - tv1->split(1, tidz); - // tv1[I0, R1oo, R1Oi{tidz}, R1R1i{tidx}] - - tv1->split(0, tidy); - // tv1[I0o, I0i, R1oo, R1Oi{tidz}, R1R1i{tidx}] - - TensorView* tv2 = tv1->rFactor({2}); - // tv2[I0o, I0i, R1oo, I1Oi{tidz}, I11i{tidx}] - // tv1[I0o, I0i, R1Oi{tidz}, R1R1i{tidx}] - - tv2->computeAt(tv1, 2); - - tv1->axis(1)->parallelize(ParallelType::TIDy); - - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - - tv1->axis(-2)->parallelize(ParallelType::TIDz); - tv2->axis(-2)->parallelize(ParallelType::TIDz); - - auto options = 
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x, numel_y}, options); - at::Tensor cg_output = at::empty({numel_x}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({input}, {cg_output}); - - auto aten_output = input.to(at::kDouble).sum({1}); - testValidate( - &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionReductionOuterSplit_CUDA) { - // based off FusionReduction4 - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = makeSymbolicTensor(2); - - TensorView* tv2 = add(tv0, tv1); - // tv2[I0, I1] = tv0[I0, I1] + tv1[I0, I1] - - fusion.addInput(tv0); - fusion.addInput(tv1); - - TensorView* tv3 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv2); - // tv3[I0, R1] = tv2[I0, I1] - - TensorView* tv4 = makeSymbolicTensor(1); - fusion.addInput(tv4); - - // tv5[I0] = tv3[I0, R1] * tv4[I0] - TensorView* tv5 = mul(tv3, tv4); - fusion.addOutput(tv5); - - // RFactor the reduction - tv3->split(1, 16, false); - // tv3[I0, R1o{16}, R1i{tidx}] = tv2[I0, I1] - - TensorView* tv6 = tv3->rFactor({-2}); - // tv6[I0, R1o{16}, iR1i{tidx}] = tv2[I0, I1] - // tv3[I0, R1i{tidx}] = tv3[I0, I1] - tv2->computeAt(tv6, 2); - - // Compute at inline with tv5 (only 1D) - tv6->computeAt(tv3, 1); - tv3->computeAt(tv5, 1); - - tv5->axis(0)->parallelize(ParallelType::BIDx); - - // Intermediate tensors only need this, but doesn't hurt to do on inputs - // tv0, 1, 4 - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv6->axis(-1)->parallelize(ParallelType::TIDx); - - int numel_x = 1025; - int numel_y = 129; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - at::Tensor t1 = at::randn({numel_x, numel_y}, options); - at::Tensor t4 = at::randn({numel_x}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({t0, t1, t4}); - - auto t2 = t0.add(t1); - auto t3 = t2.to(at::kDouble).sum({1}); - auto aten_output = t3.mul(t4); - - testValidate( - &fusion, cg_outputs, {t0, t1, t4}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionBranches_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = makeSymbolicTensor(2); - TensorView* tv2 = makeSymbolicTensor(2); - fusion.addInput(tv0); - fusion.addInput(tv1); - fusion.addInput(tv2); - - auto tv3 = add(tv0, new Double(1.0)); - auto tv4 = add(tv3, tv1); - auto tv5 = add(tv3, tv2); - auto tv6 = add(tv4, tv5); - - fusion.addOutput(tv6); - - constexpr int x = 63, y = 33; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({x, y}, options); - at::Tensor t1 = at::randn({x, y}, options); - at::Tensor t2 = at::randn({x, y}, options); - - FusionExecutor fe; - tv6->merge(0); - tv6->split(0, 128); - tv6->split(0, 4); - - tv6->axis(0)->parallelize(ParallelType::BIDx); - - tv0->computeAt(tv6, 1); - tv1->computeAt(tv6, 1); - tv2->computeAt(tv6, 1); - - tv3->axis(-2)->parallelize(ParallelType::Unroll); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(-2)->parallelize(ParallelType::Unroll); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - tv5->axis(-2)->parallelize(ParallelType::Unroll); - 
tv5->axis(-1)->parallelize(ParallelType::TIDx); - tv6->axis(-1)->parallelize(ParallelType::TIDx); - - std::vector aten_inputs = {t0, t1, t2}; - - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - auto t3 = t0.add(1.0); - auto t4 = t3.add(t1); - auto t5 = t3.add(t2); - auto aten_output = t4.add(t5); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSimpleBCast1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - TensorView* tv1 = add(tv0, new Double(1.5)); - - TensorView* tv2 = makeSymbolicTensor(2); - fusion.addInput(tv2); - TensorView* tv3 = makeSymbolicTensor(2); - fusion.addInput(tv3); - TensorView* tv4 = sub(tv2, tv3); - - TensorView* tv5 = broadcast(tv1, {false, false, true}); - TensorView* tv6 = broadcast(tv4, {true, false, false}); - - TensorView* tv7 = add(tv5, tv6); - fusion.addOutput(tv7); - - tv7->split(-1, 4); - tv7->split(0, 8); - - tv0->computeAt(tv7, -1); - tv2->computeAt(tv7, -1); - - tv7->axis(0)->parallelize(ParallelType::BIDx); - tv7->axis(-1)->parallelize(ParallelType::TIDx); - - constexpr int x = 63, y = 33, z = 15; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({x, y}, options); - at::Tensor t1 = t0.add(1.5); - - at::Tensor t2 = at::randn({y, z}, options); - at::Tensor t3 = at::randn({y, z}, options); - - at::Tensor t4 = t2.sub(t3); - at::Tensor t5 = t1.unsqueeze(-1).expand({x, y, z}); - - at::Tensor t6 = t4.expand({x, y, z}); - - at::Tensor aten_output = t5.add(t6); - - std::vector aten_inputs = {t0, t2, t3}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSimpleBCast2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - TensorView* tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - - TensorView* tv2 = add(tv0, tv1); - - TensorView* tv3 = broadcast(tv2, {false, false, true}); - - TensorView* tv4 = makeSymbolicTensor(2); - fusion.addInput(tv4); - - TensorView* tv5 = sub(tv4, new Double(0.1)); - - TensorView* tv6 = broadcast(tv5, {true, false, false}); - - TensorView* tv7 = add(tv3, tv6); - - fusion.addOutput(tv7); - - tv7->merge(0, 1); - - tv0->computeAt(tv7, -1); - tv4->computeAt(tv7, -1); - - tv7->axis(0)->parallelize(ParallelType::BIDx); - tv7->axis(-1)->parallelize(ParallelType::TIDx); - - constexpr int x = 63, y = 33, z = 15; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({x, y}, options); - at::Tensor t1 = at::randn({x, y}, options); - at::Tensor t2 = t0.add(t1); - at::Tensor t3 = t2.unsqueeze(-1).expand({x, y, z}); - - at::Tensor t4 = at::randn({y, z}, options); - at::Tensor t5 = t4.sub(0.1); - at::Tensor t6 = t5.expand({x, y, z}); - at::Tensor aten_output = t3.add(t6); - - at::Tensor cg_output = at::empty({x, y, z}, options); - - std::vector aten_inputs = {t0, t1, t4}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion(aten_inputs, {cg_output}); - - testValidate( - &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSimpleBCast3_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your 
input tensor views - std::vector dom; - dom.push_back(new IterDomain(new Int(0), new Int())); - dom.push_back(new IterDomain( - new Int(0), - new Int(1), - ParallelType::Serial, - IterType::BroadcastWithStride)); - - // tv0[I1, B{1}] - TensorView* tv0 = new TensorView(new TensorDomain(dom), DataType::Float); - fusion.addInput(tv0); - - // tv1[I0, I1, I2] - TensorView* tv2 = makeSymbolicTensor(3); - fusion.addInput(tv2); - - TensorView* tv3 = add(tv0, tv2); - - fusion.addOutput(tv3); - - tv3->merge(0); - tv3->merge(0); - - tv0->computeAt(tv3, -1); - tv2->computeAt(tv3, -1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - - constexpr int x = 2, y = 3, z = 4; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({y, 1}, options); - at::Tensor t2 = at::randn({x, y, z}, options); - auto aten_output = t0.add(t2); - - std::vector aten_inputs = {t0, t2}; - at::Tensor cg_output = at::empty({x, y, z}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion(aten_inputs, {cg_output}); - - testValidate( - &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSimpleBCast4_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - std::vector dom; - dom.push_back(new IterDomain( - new Int(0), - new Int(1), - ParallelType::Serial, - IterType::BroadcastWithStride)); - dom.push_back(new IterDomain(new Int(0), new Int())); - TensorView* tv0 = new TensorView(new TensorDomain(dom), DataType::Float); - - TensorView* tv1 = makeSymbolicTensor(3); - fusion.addInput(tv0); - fusion.addInput(tv1); - - TensorView* tv3 = add(tv0, tv1); - - tv3->merge(0); - tv3->merge(0); - tv3->split(0, 128); - tv3->split(0, 4); - - fusion.addOutput(tv3); - - tv0->computeAt(tv3, -1); - tv1->computeAt(tv3, -1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-2)->parallelize(ParallelType::Unroll); - - constexpr int x = 63, y = 33, z = 15; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({1, z}, options); - at::Tensor t1 = at::randn({x, y, z}, options); - - auto aten_output = t0.add(t1); - - at::Tensor cg_output = at::empty({x, y, z}, options); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion(aten_inputs, {cg_output}); - - testValidate( - &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSimpleBCast5_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - constexpr int m = 2, k = 3, n = 4; - - auto zero = new Int(0); - auto M = new IterDomain(zero, new Int(m)); - auto K = new IterDomain(zero, new Int(k)); - auto N = new IterDomain(zero, new Int(n)); - - // Set up your input tensor views - TensorView* tv0 = - new TensorView(new TensorDomain({M, K}, {true, true}), DataType::Float); - // Note: IterDomain must not be reused, so K needs to be cloned. 
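 - // (That is, the same IterDomain object is not shared between the two TensorDomains here;
 - // the common inner extent K is duplicated via K->clone() for the second tensor below
 - // rather than being passed twice.)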
- TensorView* tv1 = new TensorView( - new TensorDomain({K->clone(), N}, {true, true}), DataType::Float); - - fusion.addInput(tv0); - fusion.addInput(tv1); - - TensorView* tv2 = broadcast(tv0, {false, false, true}); - TensorView* tv3 = broadcast(tv1, {true, false, false}); - - TensorView* tv4 = add(tv2, tv3); - - fusion.addOutput(tv4); - - tv4->merge(0); - tv4->merge(0); - - tv0->computeAt(tv4, -1); - tv1->computeAt(tv4, -1); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({m, k}, options); - at::Tensor t1 = at::randn({k, n}, options); - - auto t2 = t0.unsqueeze(-1).expand({m, k, n}); - auto t3 = t1.expand({m, k, n}); - auto aten_output = t2.add(t3); - - at::Tensor cg_output = at::empty({m, k, n}, options); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion(aten_inputs, {cg_output}); - - testValidate( - &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionComplexBCast1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - int x = 2, y = 3, z = 4; - - auto tv0 = makeConcreteTensor({y}); - auto tv1 = div(tv0, new Double(2.0)); - auto tv2 = broadcast(tv1, {false, true}); - auto tv3 = makeConcreteTensor({y, z}); - auto tv4 = mul(tv2, tv3); - auto tv5 = broadcast(tv4, {true, false, false}); - auto tv6 = makeConcreteTensor({x, y, z}); - auto tv7 = add(tv5, tv6); - - // tv0[ i1 ] = input - // tv1[ i1 ] = tv0/2.0 - // tv2[ i1, b2] = bcast(tv1) - // tv3[ i1, i2] = input - // tv4[ i1, i2] = tv2 * tv3 - // tv5[b0, i1, i2] = bcast(tv4) - // tv6[i0, i1, i2] = input - // tv7[i0, i1, i2] = tv5 + tv6 - - // tv4 = bcast(tv1) * tv3 - // tv7 = bcast(tv4) + tv6 - - fusion.addInput(tv0); - fusion.addInput(tv3); - fusion.addInput(tv6); - - fusion.addOutput(tv7); - - tv7->merge(0); - tv7->merge(0); - tv0->computeAt(tv7, -1); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({y}, options); - at::Tensor t3 = at::randn({y, z}, options); - at::Tensor t6 = at::randn({x, y, z}, options); - - auto t4 = t0.div(2.0).unsqueeze(-1).expand({y, z}) * t3; - auto aten_output = t4.unsqueeze(0).expand({x, y, z}) + t6; - - std::vector aten_inputs = {t0, t3, t6}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionComplexBCast2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - int x = 2, y = 3, z = 4; - - auto tv0 = makeConcreteTensor({y, z}); - auto tv1 = div(tv0, new Double(2.0)); - auto tv2 = sum(tv1, {1}); - auto tv3 = broadcast(tv2, {true, false}); - auto tv4 = makeConcreteTensor({x, y}); - auto tv5 = add(tv3, tv4); - - // tv0[ i1, i2] = input - // tv1[ i1, i2] = tv0/2.0 - // tv2[ i1 ] = sum(tv1, 1) - // tv3[b0, i1 ] = bcast(tv2) - // tv4[i0, i1 ] = input - // tv5[i0, i1 ] = tv3 + tv4 - - // tv2 = sum(tv0/2.0, 1) - // tv5 = bcast(tv2) + tv4 - - fusion.addInput(tv0); - fusion.addInput(tv4); - - fusion.addOutput(tv5); - - tv5->merge(0); - tv0->computeAt(tv5, -1); - tv1->computeAt(tv2, -1); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({y, z}, options); - at::Tensor t4 = at::randn({x, y}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({t0, t4}); - - auto t1 = t0.div(2.0); - auto t2 = t1.to(at::kDouble).sum(1); - auto t3 = 
t2.unsqueeze(0).expand({x, y}); - auto aten_output = t3.add(t4); - - testValidate( - &fusion, {cg_outputs}, {t0, t4}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedIndexing1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - int w = 3, x = 4, y = 7, z = 8; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - auto tv0 = makeSymbolicTensor(3); - auto tv1 = makeSymbolicTensor(4); - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv0, new Double(1.0)); - auto tv3 = broadcast(tv2, {true, false, false, false}); - auto tv4 = add(tv3, tv1); - - fusion.addOutput(tv4); - - tv4->merge(0); - tv4->merge(0); - tv4->merge(0); - - tv4->split(0, 128); - tv4->split(0, 4); - - tv2->computeAt(tv4, 1); - - tv4->axis(0)->parallelize(ParallelType::BIDx); - tv4->axis(1)->parallelize(ParallelType::Unroll); - tv4->axis(2)->parallelize(ParallelType::TIDx); - - tv3->axis(1)->parallelize(ParallelType::Unroll); - tv3->axis(2)->parallelize(ParallelType::TIDx); - - tv2->axis(1)->parallelize(ParallelType::Unroll); - tv2->axis(2)->parallelize(ParallelType::TIDx); - - FusionExecutor fe; - - at::Tensor t0 = at::randn({x, y, z}, options); - at::Tensor t1 = at::randn({w, x, y, z}, options); - - auto t3 = t0.add(1.0); - auto aten_output = t3.add(t1); - - std::vector aten_inputs = {t0, t1}; - - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedIndexing2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - int w = 3, x = 4, y = 7, z = 8; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - auto tv0 = makeSymbolicTensor(3); - auto tv1 = makeSymbolicTensor(4); - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv0, new Double(1.0)); - auto tv3 = broadcast(tv2, {true, false, false, false}); - auto tv4 = add(tv3, tv1); - - fusion.addOutput(tv4); - - tv4->merge(-2); - tv4->merge(-2); - tv4->merge(-2); - - tv4->split(0, 128); - tv4->split(0, 4); - - tv2->computeAt(tv4, 1); - - tv4->axis(0)->parallelize(ParallelType::BIDx); - tv4->axis(1)->parallelize(ParallelType::Unroll); - tv4->axis(2)->parallelize(ParallelType::TIDx); - - tv3->axis(1)->parallelize(ParallelType::Unroll); - tv3->axis(2)->parallelize(ParallelType::TIDx); - - tv2->axis(1)->parallelize(ParallelType::Unroll); - tv2->axis(2)->parallelize(ParallelType::TIDx); - - FusionExecutor fe; - - at::Tensor t0 = at::randn({x, y, z}, options); - at::Tensor t1 = at::randn({w, x, y, z}, options); - - auto t3 = t0.add(1.0); - auto aten_output = t3.add(t1); - - std::vector aten_inputs = {t0, t1}; - - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedIndexing3_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - int w = 3, x = 4, y = 7, z = 8; - - auto tv0 = makeSymbolicTensor(3); - auto tv1 = makeSymbolicTensor(4); - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv0, new Double(1.0)); - auto tv3 = add(tv2, tv1); - fusion.addOutput(tv3); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({x, y, z}, options); - at::Tensor t1 = at::randn({w, x, y, z}, options); - - auto t2 = t0.add(1.0); - auto aten_output = t2.add(t1); - - std::vector aten_inputs = {t0, t1}; - - auto lparams = 
schedulePointwise(&fusion, aten_inputs); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs, lparams); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedIndexing4_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeConcreteTensor({4, 8}); - fusion.addInput(tv0); - TensorView* tv1 = makeConcreteTensor({4, 4, 8}); - fusion.addInput(tv1); - - TensorView* tv2 = add(tv0, new Double(1)); - TensorView* tv3 = broadcast(tv2, {true, false, false}); - TensorView* tv4 = add(tv3, tv1); - fusion.addOutput(tv4); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({4, 8}, options); - at::Tensor t1 = at::randn({4, 4, 8}, options); - - auto t2 = t0.add(1.0); - auto aten_output = t2.add(t1); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedIndexing5_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - TensorView* tv1 = makeSymbolicTensor(3); - fusion.addInput(tv1); - - TensorView* tv2 = add(tv0, new Double(1)); - TensorView* tv3 = broadcast(tv2, {true, false, true}); - TensorView* tv4 = add(tv3, tv1); - fusion.addOutput(tv4); - - tv3->merge(0)->merge(0)->split(0, 2)->split(0, 3); - tv4->merge(0)->merge(0)->split(0, 2)->split(0, 3); - - tv0->computeAt(tv4, 1); - tv1->computeAt(tv4, 1); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({7}, options); - at::Tensor t1 = at::randn({5, 7, 11}, options); - - auto t2 = t0.add(1.0); - auto aten_output = t2.unsqueeze(-1).add(t1); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedIndexing6_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - std::vector tensor0_shape{7, 4, 7}; - std::vector tensor1_shape{4, 7}; - - TensorView* tv0 = makeSymbolicTensor(tensor0_shape.size()); - fusion.addInput(tv0); - TensorView* tv1 = makeSymbolicTensor(tensor1_shape.size()); - fusion.addInput(tv1); - - TensorView* tv2 = add(tv0, tv1); - TensorView* tv3 = sum(tv2, {0, 1}); - fusion.addOutput(tv3); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor input0 = at::randn(tensor0_shape, options); - at::Tensor input1 = at::randn(tensor1_shape, options); - - std::vector reduction_axes{0, 1}; - auto reduction_params = getReductionHeuristics(&fusion, {input0, input1}); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - scheduleReduction(&fusion, reduction_params.value()); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = - fe.runFusion({input0, input1}, reduction_params.value().lparams); - - auto aten_output = input0.add(input1).to(at::kDouble).sum(reduction_axes); - - testValidate( - &fusion, - cg_outputs, - {input0, input1}, - {aten_output}, - __LINE__, - __FILE__, - "", - reduction_params.value().lparams); -} - -TEST(NVFuserTest, FusionAdvancedIndexing7_CUDA) { - // Might 
be able to use this one without 6 as the heuristics in 6 may change - // and this test is to cover the same issue. - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = broadcast(tv0, {false, true}); - - auto tv2 = makeSymbolicTensor(2); - fusion.addInput(tv2); - - auto tv3 = add(tv1, tv2); - auto tv4 = sum(tv3, {0, 1}); - fusion.addOutput(tv4); - - tv4->merge(0, 1); - tv4->split(0, 128); - tv4->split(0, 4); - - auto tv5 = tv4->rFactor({0, 1}); - - tv5->computeAt(tv4, -1); - tv0->computeAt(tv5, -1); - - tv4->axis(0)->parallelize(ParallelType::TIDx); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - const int numel_x = 100; - const int numel_y = 200; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto at_t0 = at::randn({numel_x}, options); - auto at_t1 = at::randn({numel_x, numel_y}, options); - - auto cg_outputs = fe.runFusion({at_t0, at_t1}); - - auto aten_output = (at_t0.unsqueeze(-1).expand({numel_x, numel_y}) + at_t1) - .to(at::kDouble) - .sum(); - - testValidate( - &fusion, cg_outputs, {at_t0, at_t1}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedIndexing8_CUDA) { - // Same as 7 but with outer splits instead of inner - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = broadcast(tv0, {false, true}); - - auto tv2 = makeSymbolicTensor(2); - fusion.addInput(tv2); - - auto tv3 = add(tv1, tv2); - auto tv4 = sum(tv3, {0, 1}); - fusion.addOutput(tv4); - - tv4->merge(0, 1); - tv4->split(0, 128, false); - tv4->split(0, 4, false); - - auto tv5 = tv4->rFactor({0, 1}); - - tv5->computeAt(tv4, -1); - tv0->computeAt(tv5, -1); - - tv4->axis(0)->parallelize(ParallelType::TIDx); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - const int numel_x = 100; - const int numel_y = 200; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto at_t0 = at::randn({numel_x}, options); - auto at_t1 = at::randn({numel_x, numel_y}, options); - - auto cg_outputs = fe.runFusion({at_t0, at_t1}); - - auto aten_output = (at_t0.unsqueeze(-1).expand({numel_x, numel_y}) + at_t1) - .to(at::kDouble) - .sum(); - - testValidate( - &fusion, cg_outputs, {at_t0, at_t1}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedIndexing9_CUDA) { - // Same as 7 but with outer splits instead of inner - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = broadcast(tv0, {false, true}); - - auto tv2 = mul(tv1, new Double(2)); - fusion.addOutput(tv2); - - auto tv3 = makeSymbolicTensor(3); - fusion.addInput(tv3); - - auto tv4 = add(tv3, tv2); - fusion.addOutput(tv4); - - const int numel_x = 200; - const int numel_y = 300; - const int numel_z = 400; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto at_t0 = at::randn({numel_y}, options); - auto at_t3 = at::randn({numel_x, numel_y, numel_z}, options); - std::vector aten_inputs = {at_t0, at_t3}; - - auto lparams = schedulePointwise(&fusion, aten_inputs); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs, lparams); - - auto at_t1 = at_t0.unsqueeze(-1); - auto at_t2 = at_t1.mul(2.0); - - auto at_t4 = at_t3.add(at_t2); - - testValidate( - &fusion, cg_outputs, aten_inputs, {at_t2, at_t4}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedIndexing10_CUDA) { - Fusion fusion; - FusionGuard 
fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeContigTensor(2); - TensorView* tv1 = makeContigTensor(2); - - // Register your inputs - fusion.addInput(tv0); - fusion.addInput(tv1); - - // Do math with it, it returns a `Val*` but can be static_casted back to - // TensorView - TensorView* tv2 = add(tv1, new Double(2.0)); - TensorView* tv3 = add(tv0, tv2); - - // Register your outputs - fusion.addOutput(tv3); - - auto tv0_cache = tv0->cache_after(); - auto tv1_cache = tv1->cache_after(); - - std::vector tvs = {tv0_cache, tv1_cache, tv2, tv3}; - - for (auto tv : tvs) { - tv->split(1, 2, false); - tv->split(1, 1); - tv->split(-1, 4); - // [I0, 2, 1, I1/2/4, 4] - tv->reorder({{1, 2}, {2, 3}, {3, 1}}); - tv->axis(0)->parallelize(ParallelType::BIDx); - tv->axis(1)->parallelize(ParallelType::TIDx); - } - - // For all inputs, computeAt the output inline, temporaries should be squeezed - // between them - tv0->computeAt(tv3, 1); - tv1->computeAt(tv3, 1); - - tv0_cache->axis(-1)->parallelize(ParallelType::Vectorize); - tv1_cache->axis(-1)->parallelize(ParallelType::Vectorize); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor input1 = at::randn({64, 128}, options); - at::Tensor input2 = at::rand_like(input1); - at::Tensor output = at::empty_like(input1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({input1, input2}, {output}); - - at::Tensor tv2_ref = input2 + 2.0; - at::Tensor output_ref = input1 + tv2_ref; - - TORCH_CHECK(output_ref.equal(output)); -} - -TEST(NVFuserTest, FusionAdvancedIndexing11_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - int w = 3, x = 4, y = 7, z = 8; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - auto tv0 = makeSymbolicTensor(4); - auto tv1 = makeSymbolicTensor(1); - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv1, new Double(1.0)); - auto tv3 = broadcast(tv2, {true, false, true, true}); - auto tv4 = add(tv3, tv0); - - fusion.addOutput(tv4); - - tv4->merge(0); - tv4->merge(1); - - tv4->split(1, 32); - tv4->split(0, 1); - - tv4->reorder({{2, 1}}); - - tv2->computeAt(tv4, 3); - - tv2->setMemoryType(MemoryType::Global); - - tv4->axis(0)->parallelize(ParallelType::BIDx); - tv4->axis(1)->parallelize(ParallelType::BIDy); - tv4->axis(2)->parallelize(ParallelType::Unswitch); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - FusionExecutor fe; - - at::Tensor t0 = at::randn({w, x, y, z}, options); - at::Tensor t1 = at::randn({x}, options); - - auto t3 = t1.add(1.0).unsqueeze(-1).unsqueeze(-1); - auto aten_output = t3.add(t0); - - std::vector aten_inputs = {t0, t1}; - - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -// Intended to stress the lowering of our code generator -TEST(NVFuserTest, FusionAdvancedLowering1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeConcreteTensor({9, 5}); - fusion.addInput(tv0); - - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv1, new Double(2)); - TensorView* tv3 = add(tv1, new Double(3)); - TensorView* tv4 = sum(tv3, {1}); - - fusion.addOutput(tv2); - fusion.addOutput(tv4); - - tv4->split(1, 4); - auto tv5 = tv4->rFactor({2}); - - tv1->computeAt(tv5, 2); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(1); - at::Tensor 
aten_input = at::randn({9, 5}, options); - - auto t1 = aten_input.add(1.0); - auto t2 = t1.add(2.0); - auto t3 = t1.add(3.0); - auto t4 = t3.sum(1); - - std::vector aten_outputs = {t2, t4}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion({aten_input}); - - testValidate( - &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedLowering2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Progressively broadcast tensors - TensorView* tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - TensorView* tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - TensorView* tv2 = makeSymbolicTensor(3); - fusion.addInput(tv2); - - TensorView* tv3 = add(tv0, new Double(1)); - TensorView* tv4 = broadcast(tv3, {false, true}); - TensorView* tv5 = add(tv4, tv1); - TensorView* tv6 = add(tv5, tv2); - - fusion.addOutput(tv6); - - // Split inner dimension - tv6->split(1, 4); - // Merge middle dims with outer dimensions - tv6->merge(2); - tv6->merge(0); - - // tv6[I0*I1o, I1i*I2] - - // Compute everything inline - tv0->computeAt(tv6, -1); - - tv6->axis(0)->parallelize(ParallelType::BIDx); - tv6->axis(1)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - int x = 13, y = 9, z = 5; - at::Tensor t0 = at::randn({y}, options); - at::Tensor t1 = at::randn({y, z}, options); - at::Tensor t2 = at::randn({x, y, z}, options); - - auto t3 = t0.add(1.0); - auto t4 = t3.unsqueeze(-1); - auto t5 = t4.add(t1); - auto t6 = t5.add(t2); - - std::vector aten_inputs = {t0, t1, t2}; - std::vector aten_outputs = {t6}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); -} - -// TODO: Complete test -TEST(NVFuserTest, FusionAdvancedLowering3_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeConcreteTensor({1, -1}); - auto tv1 = makeSymbolicTensor(2); - fusion.addInput(tv0); - fusion.addInput(tv1); - - // [b0, i1] - auto tv2 = add(tv0, new Double(2.0)); - - // [i0, i1] - auto tv3 = add(tv1, new Double(3.0)); - - // [b0, i1] - auto tv4 = add(tv2, new Double(4.0)); - - // [io, i1] - auto tv5 = add(tv2, tv3); - - fusion.addOutput(tv4); - fusion.addOutput(tv5); - - tv0->computeAt(tv4, -1); - - tv3->setMemoryType(MemoryType::Global); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - int x = 13, y = 9; - at::Tensor t0 = at::randn({1, y}, options); - at::Tensor t1 = at::randn({x, y}, options); - - auto t4 = t0 + 2 + 4; - auto t5 = t0 + 2 + t1 + 3; - - std::vector aten_inputs = {t0, t1}; - std::vector aten_outputs = {t4, t5}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); -} - -// This excercises indexing with broadcast root axes. Non-broadcast -// axes need to be preferred when propagating index exprs to root -// axes. See, e.g., Index::getConsumerIndex_impl. 
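
As an editorial illustration (not part of the original test file) of the pattern the next test exercises: the 1D input is broadcast across two trailing axes, so those root axes have extent 1 and their indices must be taken from the matching non-broadcast axes of the 3D operand. A minimal ATen sketch using the same bx/by/bz constants as the test (assumes <ATen/ATen.h> is available):

// Hypothetical standalone sketch; the fusion test below validates against the same expression.
static void broadcastRootAxesSketch() {
  at::Tensor t0 = at::randn({10});          // [bx]
  at::Tensor t3 = at::randn({10, 20, 30});  // [bx, by, bz]
  // Two broadcast root axes are introduced on t0, then expanded against t3.
  at::Tensor ref =
      t0.unsqueeze(-1).expand({10, 20}).unsqueeze(-1).expand({10, 20, 30}) + t3;
  (void)ref;
}
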
-TEST(NVFuserTest, FusionAdvancedLowering4_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - auto tv1 = broadcast(tv0, {false, true}); - auto tv2 = broadcast(tv1, {false, false, true}); - auto tv3 = makeSymbolicTensor(3); - fusion.addInput(tv3); - auto tv4 = add(tv2, tv3); - fusion.addOutput(tv4); - - tv4->merge(1)->merge(0); - tv4->split(0, 8); - tv0->computeAt(tv4, 1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - const int bx = 10; - const int by = 20; - const int bz = 30; - at::Tensor t0 = at::randn({bx}, options); - at::Tensor t3 = at::randn({bx, by, bz}, options); - std::vector aten_inputs = {t0, t3}; - - auto cg_outputs = fe.runFusion(aten_inputs); - - auto aten_output = - t0.unsqueeze(-1).expand({bx, by}).unsqueeze(-1).expand({bx, by, bz}) + t3; - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedLowering5_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeConcreteTensor({5, 4, 3}); - fusion.addInput(tv0); - - TensorView* tv1 = makeConcreteTensor({5, 3}); - fusion.addInput(tv1); - - auto tv2 = broadcast(tv1, {false, true, false}); - - auto tv3 = add(tv0, tv2); - - fusion.addOutput(tv3); - - tv2->merge(0); - tv1->computeAt(tv2, 1); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(1); - at::Tensor t0 = at::randn({5, 4, 3}, options); - at::Tensor t1 = at::randn({5, 3}, options); - auto t2 = t1.unsqueeze(1); - auto t3 = t0 + t2; - - std::vector aten_inputs = {t0, t1}; - std::vector aten_outputs = {t3}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedLowering6_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeConcreteTensor({5, 4, 3}); - fusion.addInput(tv0); - auto tv1 = makeConcreteTensor({4}); - fusion.addInput(tv1); - auto tv2 = unaryOp(UnaryOpType::Set, tv0); - auto tv3 = unaryOp(UnaryOpType::Set, tv1); - - auto tv4 = sum(tv2, {0, 2}); - auto tv5 = add(tv4, tv3); - fusion.addOutput(tv5); - - auto tv6 = broadcast(tv3, {true, false, true}); - auto tv7 = add(tv2, tv6); - fusion.addOutput(tv7); - - tv2->computeAt(tv4, -1, ComputeAtMode::BestEffort); - tv3->computeAt(tv7, -1, ComputeAtMode::BestEffort); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(1); - at::Tensor t0 = at::randn({5, 4, 3}, options); - at::Tensor t1 = at::randn({4}, options); - - auto t2 = t0; - auto t3 = t1; - - std::vector reduction_axes{0, 2}; - auto t4 = t2.sum(reduction_axes); - auto t5 = add(t4, t3); - auto t6 = t3.unsqueeze(0).unsqueeze(-1); - auto t7 = t2.add(t6); - - std::vector aten_inputs = {t0, t1}; - std::vector aten_outputs = {t5, t7}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); -} - -// Test a simple Gemm but also play around with fusion executor features -TEST(NVFuserTest, FusionSimpleGemm_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); // M, K - TensorView* tv1 = makeSymbolicTensor(2); // K, N - fusion.addInput(tv0); - 
fusion.addInput(tv1); - - TensorView* tv2 = broadcast(tv0, {false, false, true}); - // tv2[I0, I1, B] = tv0[I0, I1] - - TensorView* tv3 = broadcast(tv1, {true, false, false}); - // tv3[B, I1, I2] = tv1[I1, I2] - - // tv4[I0, I1, I2] = tv2[I0, I1, B] * tv3[B, I1, I2] - TensorView* tv4 = mul(tv2, tv3); - // tv5[I0, R1, I2] = tv4[I0, I1, I2] - TensorView* tv5 = sum(tv4, {1}); - fusion.addOutput(tv5); - - tv5->split(1, 32); - // tv5[I0, R1o, R1i{32}, I2] - - auto tv6 = tv5->rFactor({1}); - // tv6[I0, R1o, I1i{32}, I2] = tv4[I0, I1, I2] - // tv5[I0, , R1i{32}, I2] = tv6[I0, R1o, I1i{32}, I2] - - tv5->split(0, 4); - tv5->split(-1, 4); - // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}] - // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}] - - tv0->computeAt(tv5, -1); - tv1->computeAt(tv5, -1); - - // tv6[I0o, I0i{4}, R1o, I1i{32}, I2o, I2i{4}] - // tv5[I0o, I0i{4}, , R1i{32}, I2o, I2i{4}] - //--> (line symbolizes compute at location) - // tv4[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, I1o] - // tv6[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, R1o] - // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|] - - tv0->computeAt(tv6, -1); - tv1->computeAt(tv6, -1); - // tv4[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, I1o |] - // tv6[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, R1o |] - // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|] - - tv5->axis(0)->parallelize(ParallelType::BIDz); - tv5->axis(1)->parallelize(ParallelType::TIDz); - - tv5->axis(-2)->parallelize(ParallelType::BIDy); - tv5->axis(-1)->parallelize(ParallelType::TIDy); - - tv5->axis(2)->parallelize(ParallelType::TIDx); - tv6->axis(2)->parallelize(ParallelType::TIDx); - - constexpr int M = 65, K = 33, N = 17; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({M, K}, options); - at::Tensor t1 = at::randn({K, N}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - // Lets specify a few bounds in launch params to make sure it works - fe.runFusion({t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4)); - - // Make sure bad launch params throws - // TODO: Re-enable once we have parallelization validation in. - // ASSERT_ANY_THROW(fe.runFusion({t0, t1}, LaunchParams(1, 2, 3, 4, 5, 6))); - - // Don't specify any launch params - auto cg_outputs = fe.runFusion({t0, t1}); - - auto aten_output = t0.to(at::kDouble).matmul(t1.to(at::kDouble)); - - testValidate( - &fusion, cg_outputs, {t0, t1}, {aten_output}, __LINE__, __FILE__); -} - -// Softmax with a 1D tensor. Parallelized only with a single thread block. -TEST(NVFuserTest, FusionSoftmax1D_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int tidx = 128; - const int dimx = 1000; - - // Set up your input tensor views - TensorView* input_tv0 = makeSymbolicTensor(1); - fusion.addInput(input_tv0); - - TensorView* exp_tv1 = unaryOp(UnaryOpType::Exp, input_tv0); - TensorView* sum_exp_tv2 = sum(exp_tv1, {-1}); - TensorView* bcast_sum_tv3 = broadcast(sum_exp_tv2, {true}); - - // Replicate exp_tv4 as exp_tv4_copy because exp_tv4 is going to be - // computed at sum_exp_rf_tv8. 
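 - // (In this test the corresponding tensors are exp_tv1, exp_tv1_copy, and sum_exp_rf_tv5;
 - // the tv4/tv8 names appear to be carried over from the normalized softmax tests below.)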
- TensorView* exp_tv1_copy = unaryOp(UnaryOpType::Exp, input_tv0); - - TensorView* output_tv4 = div(exp_tv1_copy, bcast_sum_tv3); - - fusion.addOutput(output_tv4); - - bcast_sum_tv3->split(0, tidx); - - sum_exp_tv2->split(-1, tidx); - TensorView* sum_exp_rf_tv5 = sum_exp_tv2->rFactor({-2}); - - output_tv4->split(-1, tidx); - - exp_tv1->computeAt(sum_exp_rf_tv5, -1); - exp_tv1_copy->computeAt(output_tv4, -1); - - TensorView* tensors_to_parallelize[] = { - sum_exp_tv2, bcast_sum_tv3, output_tv4, sum_exp_rf_tv5}; - - for (auto tv : tensors_to_parallelize) { - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({dimx}, options); - at::Tensor cg_output = at::empty({dimx}, options); - at::Tensor t3_output = at::empty_like(cg_output, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({t0}, {cg_output}); - - auto aten_output = at::_softmax(t0.to(at::kDouble), -1, false); - - testValidate(&fusion, {cg_output}, {t0}, {aten_output}, __LINE__, __FILE__); -} - -// Softmax with a 1D tensor with input normalization. -TEST(NVFuserTest, FusionSoftmax1DNormalized_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int tidx = 128; - const int dimx = 1000; - - // Set up your input tensor views - TensorView* input_tv0 = makeSymbolicTensor(1); - fusion.addInput(input_tv0); - - // Normalize with the max value before computing exp. - TensorView* max_val_tv1 = - reductionOp(BinaryOpType::Max, {-1}, new Double(0), input_tv0); - TensorView* bcast_max_tv2 = broadcast(max_val_tv1, {true}); - TensorView* sub_tv3 = sub(input_tv0, bcast_max_tv2); - TensorView* exp_tv4 = unaryOp(UnaryOpType::Exp, sub_tv3); - TensorView* sum_exp_tv5 = sum(exp_tv4, {-1}); - TensorView* bcast_sum_tv6 = broadcast(sum_exp_tv5, {true}); - - // Replicate exp_tv4 as exp_tv4_copy because exp_tv4 is going to be - // computed at sum_exp_rf_tv8. - TensorView* sub_tv3_copy = sub(input_tv0, bcast_max_tv2); - TensorView* exp_tv4_copy = unaryOp(UnaryOpType::Exp, sub_tv3_copy); - - TensorView* output_tv7 = div(exp_tv4_copy, bcast_sum_tv6); - - fusion.addOutput(output_tv7); - bcast_max_tv2->split(0, tidx); - bcast_sum_tv6->split(0, tidx); - - max_val_tv1->split(-1, tidx); - TensorView* max_val_rf_tv8 = max_val_tv1->rFactor({-2}); - - sum_exp_tv5->split(-1, tidx); - TensorView* sum_exp_rf_tv9 = sum_exp_tv5->rFactor({-2}); - - output_tv7->split(-1, tidx); - - sub_tv3->computeAt(sum_exp_rf_tv9, -1); - sub_tv3_copy->computeAt(output_tv7, -1); - - TensorView* tensors_to_parallelize[] = { - max_val_tv1, - bcast_max_tv2, - sum_exp_tv5, - bcast_sum_tv6, - output_tv7, - max_val_rf_tv8, - sum_exp_rf_tv9}; - - for (auto tv : tensors_to_parallelize) { - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({dimx}, options); - at::Tensor t3_output = at::empty({dimx}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({input}); - - auto aten_output = at::_softmax(input.to(at::kDouble), -1, false); - - testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); -} - -// Softmax with a 3D tensor, where the inner-most 3rd dimension is -// normalized. Pallelized with multiple thread blocks. 
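
As an editorial aside (not from the original file): the input normalization used by the *Normalized softmax tests in this file is the standard numerically stable softmax. Subtracting the row max before exp() leaves the result unchanged mathematically while keeping exp() from overflowing. A minimal ATen sketch, assuming <ATen/ATen.h> is available:

static void stableSoftmaxSketch() {
  at::Tensor x = at::randn({1000});
  at::Tensor shifted = x - x.max();  // normalize with the max value before computing exp
  at::Tensor e = shifted.exp();
  at::Tensor ref = e / e.sum();      // matches at::_softmax(x, -1, false) up to rounding
  (void)ref;
}
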
-TEST(NVFuserTest, FusionSoftmax3D_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int tidx = 32; - const int dimx = 32; - const int dimy = 16; - const int dimz = 130; - - // Set up your input tensor views - TensorView* input_tv0 = makeSymbolicTensor(3); - fusion.addInput(input_tv0); - - TensorView* exp_tv1 = unaryOp(UnaryOpType::Exp, input_tv0); - TensorView* sum_exp_tv2 = sum(exp_tv1, {-1}); - TensorView* bcast_sum_tv3 = broadcast(sum_exp_tv2, {false, false, true}); - - // Replicate exp_tv4 as exp_tv4_copy because exp_tv4 is going to be - // computed at sum_exp_rf_tv8. - TensorView* exp_tv1_copy = unaryOp(UnaryOpType::Exp, input_tv0); - - TensorView* output_tv4 = div(exp_tv1_copy, bcast_sum_tv3); - - fusion.addOutput(output_tv4); - - bcast_sum_tv3->split(-1, tidx); - - sum_exp_tv2->split(-1, tidx); - TensorView* sum_exp_rf_tv5 = sum_exp_tv2->rFactor({-2}); - - output_tv4->split(-1, tidx); - - exp_tv1->computeAt(sum_exp_rf_tv5, -1); - exp_tv1_copy->computeAt(output_tv4, -1); - - TensorView* tensors_to_parallelize[] = { - sum_exp_tv2, bcast_sum_tv3, output_tv4, sum_exp_rf_tv5}; - - for (auto tv : tensors_to_parallelize) { - tv->axis(0)->parallelize(ParallelType::BIDx); - tv->axis(1)->parallelize(ParallelType::BIDy); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({dimx, dimy, dimz}, options); - - at::Tensor cg_output = at::empty({dimx, dimy, dimz}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({input}, {cg_output}); - - auto aten_output = at::_softmax(input.to(at::kDouble), -1, false); - - testValidate( - &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); -} - -// Softmax with a 3D tensor with input normalization. -TEST(NVFuserTest, FusionSoftmax3DNormalized_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int tidx = 32; - const int dimx = 32; - const int dimy = 16; - const int dimz = 130; - - // Set up your input tensor views - TensorView* input_tv0 = makeSymbolicTensor(3); - fusion.addInput(input_tv0); - - // Normalize with the max value before computing exp. - TensorView* max_val_tv1 = - reductionOp(BinaryOpType::Max, {-1}, new Double(0), input_tv0); - TensorView* bcast_max_tv2 = broadcast(max_val_tv1, {false, false, true}); - TensorView* sub_tv3 = sub(input_tv0, bcast_max_tv2); - TensorView* exp_tv4 = unaryOp(UnaryOpType::Exp, sub_tv3); - TensorView* sum_exp_tv5 = sum(exp_tv4, {-1}); - TensorView* bcast_sum_tv6 = broadcast(sum_exp_tv5, {false, false, true}); - - // Replicate exp_tv4 as exp_tv4_copy because exp_tv4 is going to be - // computed at sum_exp_rf_tv8. 
- TensorView* sub_tv3_copy = sub(input_tv0, bcast_max_tv2); - TensorView* exp_tv4_copy = unaryOp(UnaryOpType::Exp, sub_tv3_copy); - - TensorView* output_tv7 = div(exp_tv4_copy, bcast_sum_tv6); - - fusion.addOutput(output_tv7); - - bcast_max_tv2->split(-1, tidx); - bcast_sum_tv6->split(-1, tidx); - - max_val_tv1->split(-1, tidx); - TensorView* max_val_rf_tv8 = max_val_tv1->rFactor({-2}); - - sum_exp_tv5->split(-1, tidx); - TensorView* sum_exp_rf_tv9 = sum_exp_tv5->rFactor({-2}); - - output_tv7->split(-1, tidx); - - sub_tv3->computeAt(sum_exp_rf_tv9, -1); - sub_tv3_copy->computeAt(output_tv7, -1); - - TensorView* tensors_to_parallelize[] = { - max_val_tv1, - bcast_max_tv2, - sum_exp_tv5, - bcast_sum_tv6, - output_tv7, - max_val_rf_tv8, - sum_exp_rf_tv9}; - - for (auto tv : tensors_to_parallelize) { - tv->axis(0)->parallelize(ParallelType::BIDx); - tv->axis(1)->parallelize(ParallelType::BIDy); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({dimx, dimy, dimz}, options); - at::Tensor t3_output = at::empty({dimx, dimy, dimz}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({input}); - - auto aten_output = at::_softmax(input.to(at::kDouble), -1, false); - - testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSoftmaxComputeAt_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = sum(tv0, {1}); - auto tv2 = broadcast(tv1, {false, true}); - - auto tv3 = add(tv0, new Double(1.0)); - - auto tv4 = mul(tv2, tv3); - - auto tv5 = sum(tv4, {1}); - auto tv6 = broadcast(tv5, {false, true}); - - auto tv7 = sub(tv6, tv4); - fusion.addOutput(tv7); - - tv1->computeAt(tv7, 1); - ASSERT_ANY_THROW(tv1->computeAt(tv7, -1)); -} - -// Similar to FusionReduction but uses grid reduction -TEST(NVFuserTest, FusionGridReduction1_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } - const int gdimx = 32; - const int bdimx = 128; - - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - // tv1[I0, R1] = tv0[I0, I1] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); - fusion.addOutput(tv1); - - TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion."); - - tv1->split(1, bdimx); - // tv1[I0, R1o, R1i{128}] = tv0[I0, I1] - tv1->split(1, gdimx); - // tv1[I0, R1oo, R1oi{32}, R1i{128}] = tv0[I0, I1] - - TensorView* tv2 = tv1->rFactor({1}); - // tv2[I0, R1oo, Ir1oi{32}, Ir1i{128}] = tv0[I0, I1] - // tv1[I0, R1oi{32}, R1i{128}] = tv2[I0, R1oo, Ir1oi{32}, Ir1i{128}] - - // Incrementally, can print in between for debugging - tv0->computeAt(tv2, 1); - tv2->computeAt(tv1, 1); - - // Re do it all at once, because why not. 
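 - // (The single call below redoes the placement in one step; the two incremental computeAt
 - // calls above are kept so intermediate state can be printed for debugging, as noted.)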
- tv0->computeAt(tv1, 1); - - tv1->axis(0)->parallelize(ParallelType::BIDy); - tv1->axis(1)->parallelize(ParallelType::BIDx); - tv2->axis(2)->parallelize(ParallelType::BIDx); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - int numel_x = 10000; - int numel_y = 65000; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x, numel_y}, options); - at::Tensor cg_output = at::empty({numel_x}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({input}, {cg_output}); - - auto aten_output = input.to(at::kDouble).sum({1}); - - testValidate( - &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); -} - -// Same test as the above but uses BIDy and TIDx for reduction -TEST(NVFuserTest, FusionGridReduction2_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } - const int gdimy = 32; - const int bdimx = 128; - - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - // tv1[I0, R1] = tv0[I0, I1] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); - fusion.addOutput(tv1); - - TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion."); - - tv1->split(1, bdimx); - // tv1[I0, R1o, R1i{128}] = tv0[I0, I1] - tv1->split(1, gdimy); - // tv1[I0, R1oo, R1oi{32}, R1i{128}] = tv0[I0, I1] - - TensorView* tv2 = tv1->rFactor({1}); - // tv2[I0, R1oo, Ir1oi{32}, Ir1i{128}] = tv0[I0, I1] - // tv1[I0, R1oi{32}, R1i{128}] = tv2[I0, R1oo, Ir1oi{32}, Ir1i{128}] - - // Incrementally, can print in between for debugging - tv0->computeAt(tv2, 1); - tv2->computeAt(tv1, 1); - - // Re do it all at once, because why not. - tv0->computeAt(tv1, 1); - - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(1)->parallelize(ParallelType::BIDy); - tv2->axis(2)->parallelize(ParallelType::BIDy); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - int numel_x = 10000; - int numel_y = 65000; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x, numel_y}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({input}); - - auto aten_output = input.to(at::kDouble).sum({1}); - - testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); -} - -// Same test but uses BIDy and BIDz for reduction. No TID used. 
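
Editorial sketch (not part of the original tests) of the scheduling recipe the grid-reduction tests above and below share, using the same nvFuser API with arbitrary split factors: split the reduction axis twice, rFactor the outermost serial chunk, then bind the remaining reduction axes to grid and block dimensions.

static void gridReductionRecipeSketch() {
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  // tv1[I0, R1] = sum(tv0[I0, I1], {1})
  TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0);
  fusion.addOutput(tv1);

  tv1->split(1, 128);                   // tv1[I0, R1o, R1i{128}]
  tv1->split(1, 32);                    // tv1[I0, R1oo, R1oi{32}, R1i{128}]
  TensorView* tv2 = tv1->rFactor({1});  // tv2 keeps the serial R1oo partial sums

  tv0->computeAt(tv1, 1);

  tv1->axis(0)->parallelize(ParallelType::BIDy);
  tv1->axis(1)->parallelize(ParallelType::BIDx);  // cross-block (grid) reduction axis
  tv2->axis(2)->parallelize(ParallelType::BIDx);
  tv1->axis(-1)->parallelize(ParallelType::TIDx);
  tv2->axis(-1)->parallelize(ParallelType::TIDx);
}

After the rFactor, tv2 carries the serial partial sums over R1oo, so tv1 only has to reduce the BIDx-bound and TIDx-bound axes; the BIDx-bound reduction axis is what requires the cross-block (grid) reduction.
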
-TEST(NVFuserTest, FusionGridReduction3dim1_CUDA) { - // Grid reductions when there aren't any threads are serial reductions - // keep these numbers low so our error isn't too high compared to normal cuda - // reductions - const int gdimz = 15; - const int gdimy = 9; - - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - // tv1[I0, R1] = tv0[I0, I1] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); - fusion.addOutput(tv1); - - TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion."); - - tv1->split(1, gdimy); - // tv1[I0, R1o, R1i{128}] = tv0[I0, I1] - tv1->split(1, gdimz); - // tv1[I0, R1oo, R1oi{32}, R1i{128}] = tv0[I0, I1] - - TensorView* tv2 = tv1->rFactor({1}); - // tv2[I0, R1oo, Ir1oi{32}, Ir1i{128}] = tv0[I0, I1] - // tv1[I0, R1oi{32}, R1i{128}] = tv2[I0, R1oo, Ir1oi{32}, Ir1i{128}] - - // Incrementally, can print in between for debugging - tv0->computeAt(tv2, 1); - tv2->computeAt(tv1, 1); - - // Re do it all at once, because why not. - tv0->computeAt(tv1, 1); - - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(1)->parallelize(ParallelType::BIDz); - tv2->axis(2)->parallelize(ParallelType::BIDz); - tv1->axis(-1)->parallelize(ParallelType::BIDy); - tv2->axis(-1)->parallelize(ParallelType::BIDy); - - int numel_x = 100; - int numel_y = 6500; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x, numel_y}, options); - at::Tensor cg_output = at::empty({numel_x}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({input}, {cg_output}); - - auto aten_output = input.to(at::kDouble).sum({1}); - testValidate( - &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); -} - -// Same as testGPU_FusionGridReduction3dim1 but reduces dimension 0 -TEST(NVFuserTest, FusionGridReduction3dim0_CUDA) { - // Grid reductions when there aren't any threads are serial reductions - // keep these numbers low so our error isn't too high compared to normal cuda - // reductions - const int gdimz = 15; - const int gdimy = 9; - - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - // tv1[R0, I1] = tv0[I0, I1] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {0}, new Double(0), tv0); - fusion.addOutput(tv1); - - TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion."); - - tv1->split(0, gdimy); - // tv1[R0o, R0i{128}, I1] = tv0[I0, I1] - tv1->split(0, gdimz); - // tv1[R0oo, R0oi{32}, R0i{128}, I1] = tv0[I0, I1] - - TensorView* tv2 = tv1->rFactor({0}); - // tv2[R0oo, I0oi{32}, I0i{128}, I1] = tv0[I0, I1] - // tv1[ R0oi{32}, R0i{128}, I1] = tv2[R0oo, I0oi{32}, I0i{128}, I1] - - // Note that computeAt isn't going to make anything better as there - // is no dynamically sized dimension. 
- - // Map parallelism as [Serial, BIDz, BIDy, BIDx] - tv1->axis(-1)->parallelize(ParallelType::BIDx); - tv2->axis(-1)->parallelize(ParallelType::BIDx); - tv1->axis(-2)->parallelize(ParallelType::BIDy); - tv2->axis(-2)->parallelize(ParallelType::BIDy); - tv1->axis(-3)->parallelize(ParallelType::BIDz); - tv2->axis(-3)->parallelize(ParallelType::BIDz); - - int numel_x = 6500; - int numel_y = 100; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor input = at::randn({numel_x, numel_y}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({input}); - - auto aten_output = input.to(at::kDouble).sum({0}); - - testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); -} - -// This is similar to the FusionReduction, but swaps BIDx and TIDx -TEST(NVFuserTest, FusionGridReduction4_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int bdimx = 128; - const int gdimx = 1024; - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - // tv1[I0, R1] = tv0[I0, I1] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); - fusion.addOutput(tv1); - - TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion."); - - tv1->split(1, gdimx); - // tv1[I0, R1o, R1i{1024}] = tv0[I0, I1] - tv1->split(1, 4); - // tv1[I0, R1oo, R1oi{4}, R1i{128}] = tv0[I0, I1] - - TensorView* tv2 = tv1->rFactor({1}); - // tv2[I0, R1oo, Ir1oi{4}, Ir1i{1024}] = tv0[I0, I1] - // tv1[I0, R1oi{4}, R1i{1024}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{1024}] - - TensorView* tv3 = tv1->rFactor({1}); - // tv2[I0, R1oo, Ir1oi{4}, Ir1i{1024}] = tv0[I0, I1] - // tv3[I0, R1oi{4}, Ir1i{1024}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{1024}] - // tv1[I0, R1i{1024}] = tv3[I0, R1oi{4}, Ir1i{1024}] - - // Incrementally, can print in between for debugging - tv0->computeAt(tv2, 1); - tv2->computeAt(tv3, 1); - tv3->computeAt(tv1, 1); - - // Re do it all at once, because why not. 
- tv0->computeAt(tv1, 1); - - tv2->axis(2)->parallelize(ParallelType::Unroll); - tv1->axis(0)->parallelize(ParallelType::TIDx); - - tv1->axis(-1)->parallelize(ParallelType::BIDx); - tv2->axis(-1)->parallelize(ParallelType::BIDx); - tv3->axis(-1)->parallelize(ParallelType::BIDx); - - int numel_x = bdimx; - int numel_y = 65000; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x, numel_y}, options); - at::Tensor cg_output = at::empty({numel_x}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({input}, {cg_output}); - - auto aten_output = input.to(at::kDouble).sum({1}); - testValidate( - &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); -} - -// Grid reduction with 2D thread blocks but only TIDx and BIDx are -// mapped to a reduction dim -TEST(NVFuserTest, FusionGridReduction5_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int bdimx = 64; - const int bdimy = 16; - const int gdimx = 4; - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - // tv1[I0, R1] = tv0[I0, I1] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); - fusion.addOutput(tv1); - - TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion."); - - tv1->split(1, bdimx); - // tv1[I0, R1o, R1i{64}] = tv0[I0, I1] - tv1->split(1, gdimx); - // tv1[I0, R1oo, R1oi{4}, R1i{64}] = tv0[I0, I1] - - TensorView* tv2 = tv1->rFactor({1}); - // tv2[I0, R1oo, Ir1oi{4}, Ir1i{64}] = tv0[I0, I1] - // tv1[I0, R1oi{4}, R1i{64}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{64}] - - tv0->computeAt(tv1, 1); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - tv1->axis(-2)->parallelize(ParallelType::BIDx); - tv2->axis(-2)->parallelize(ParallelType::BIDx); - - tv1->axis(0)->parallelize(ParallelType::TIDy); - - int numel_x = bdimy; - int numel_y = 6500; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x, numel_y}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({input}); - - auto aten_output = input.to(at::kDouble).sum({1}); - testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); -} - -// Similar to FusionGridReduction1 but with 3D tensors -TEST(NVFuserTest, FusionGridReduction6_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(3); - fusion.addInput(tv0); - - // tv1[I0, R1, R2] = tv0[I0, I1, I2] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1, 2}, new Double(0), tv0); - fusion.addOutput(tv1); - - TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion."); - - // Splitting for TID - tv1->split(2, 128); - // tv1[I0, R1, R2o, R2i{128}] = tv0[I0, I1, I2] - - // Splitting for BID - tv1->split(1, 128); - - // tv1[I0, R1o, R1i{128}, R2o, R2i{128}] = tv0[I0, I1, I2] - - TensorView* tv2 = tv1->rFactor({3}); - // tv2[I0, I1o, I1i{128}, R2o, I2i{128}] - // tv1[I0, R1o, R1i{128}, R2i{128}] - - TensorView* tv3 = tv1->rFactor({1}); - // tv2[I0, I1o, I1i{128}, R2o, I2i{128}] - // tv3[I0, R1o, I1i{128}, I2i{128}] - // tv1[I0, R1i{128}, R2i{128}] - - tv3->computeAt(tv1, 1); - tv2->computeAt(tv3, 3); - - tv1->axis(0)->parallelize(ParallelType::BIDy); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - 
tv3->axis(-1)->parallelize(ParallelType::TIDx); - - tv1->axis(-2)->parallelize(ParallelType::BIDx); - tv2->axis(-3)->parallelize(ParallelType::BIDx); - tv3->axis(-2)->parallelize(ParallelType::BIDx); - - int numel_x = 6500; - int numel_y = 200; - int numel_z = numel_y; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x, numel_y, numel_z}, options); - at::Tensor cg_output = at::empty({numel_x}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({input}, {cg_output}); - - auto aten_output = input.to(at::kDouble).sum({1, 2}); - - testValidate( - &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); -} - -// See issue #1049 -TEST(NVFuserTest, FusionGridReduction7_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = sum(tv0, {0}); - fusion.addOutput(tv1); - - tv1->split(0, 1000); - - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(1)->parallelize(ParallelType::BIDy); - - const int numel_x = 1; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x}, options); - at::Tensor cg_output = at::empty({numel_x}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto out = fe.runFusion({input}); - - auto aten_output = input.sum({0}); - - testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionGridReduction8_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = sum(tv0, {0}); - fusion.addOutput(tv1); - - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(1)->parallelize(ParallelType::TIDx); - - const int numel_x = 2; - const int numel_y = 4; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x, numel_y}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto out = fe.runFusion({input}); - - auto aten_output = input.sum({0}); - - testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionGridReduction9_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = sum(tv0, {1}); - - auto tv2 = makeSymbolicTensor(1); - fusion.addInput(tv2); - - auto tv3 = add(tv2, tv1); - fusion.addOutput(tv3); - - tv1->split(1, 2); - - tv1->axis(1)->parallelize(ParallelType::BIDx); - tv1->axis(2)->parallelize(ParallelType::BIDy); - - tv1->computeAt(tv3, 1); - - const int numel_x = 4; - const int numel_y = 10; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - at::Tensor t2 = at::randn({numel_x}, options); - - at::ArrayRef aten_inputs = {t0, t2}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_output = fe.runFusion(aten_inputs); - - auto aten_output = t0.sum({1}).add(t2); - - testValidate(&fusion, cg_output, {t0, t2}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionGridReduction10_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(4); - fusion.addInput(tv0); - - auto tv1 = sum(tv0, {-1}); - auto tv2 = sum(tv1, {-1}); - auto tv3 = sum(tv2, {-1}); - - fusion.addOutput(tv3); - tv1->axis(0)->parallelize(ParallelType::TIDx); - 
tv1->axis(1)->parallelize(ParallelType::BIDx); - tv1->axis(2)->parallelize(ParallelType::TIDy); - tv1->axis(3)->parallelize(ParallelType::TIDz); - - tv2->axis(0)->parallelize(ParallelType::TIDx); - tv2->axis(1)->parallelize(ParallelType::BIDx); - tv2->axis(2)->parallelize(ParallelType::TIDy); - - tv3->axis(0)->parallelize(ParallelType::TIDx); - tv3->axis(1)->parallelize(ParallelType::BIDx); - - tv0->computeAt(tv3, 1); - - const int numel_w = 2; - const int numel_x = 3; - const int numel_y = 4; - const int numel_z = 5; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_w, numel_x, numel_y, numel_z}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_output = fe.runFusion({t0}); - - auto aten_output = t0.sum({1, 2, 3}); - - testValidate(&fusion, cg_output, {t0}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionNonRedAxisBind_CUDA) { - int bid_x = 3; - int tid_x = 2; - int red_dim = 0; - - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - TensorView* tv1 = - reductionOp(BinaryOpType::Add, {red_dim}, new Double(0), tv0); - fusion.addOutput(tv1); - - tv1->split(-1, tid_x); - tv1->axis(-2)->parallelize(ParallelType::BIDx); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({16, bid_x * tid_x}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({input}); - - auto aten_output = input.to(at::kDouble).sum({red_dim}); - - testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSplitBCast_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* input_tv0 = makeSymbolicTensor(3); - TensorView* input_tv1 = makeSymbolicTensor(3); - fusion.addInput(input_tv0); - fusion.addInput(input_tv1); - - TensorView* sum_tv2 = - reductionOp(BinaryOpType::Add, {2}, new Double(0), input_tv0); - TensorView* bcast_tv3 = broadcast(sum_tv2, {false, false, true}); - TensorView* output_tv4 = div(input_tv1, bcast_tv3); - - sum_tv2->split(-1, 32); - TensorView* sum_rf_tv5 = sum_tv2->rFactor({-2}); - - bcast_tv3->split(-1, 32); - output_tv4->split(-1, 32); - - sum_rf_tv5->axis(0)->parallelize(ParallelType::BIDx); - sum_tv2->axis(0)->parallelize(ParallelType::BIDx); - bcast_tv3->axis(0)->parallelize(ParallelType::BIDx); - output_tv4->axis(0)->parallelize(ParallelType::BIDx); - - sum_rf_tv5->axis(1)->parallelize(ParallelType::BIDy); - sum_tv2->axis(1)->parallelize(ParallelType::BIDy); - bcast_tv3->axis(1)->parallelize(ParallelType::BIDy); - output_tv4->axis(1)->parallelize(ParallelType::BIDy); - - sum_rf_tv5->axis(-1)->parallelize(ParallelType::TIDx); - sum_tv2->axis(-1)->parallelize(ParallelType::TIDx); - bcast_tv3->axis(-1)->parallelize(ParallelType::TIDx); - output_tv4->axis(-1)->parallelize(ParallelType::TIDx); - - fusion.addOutput(output_tv4); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({32, 32, 128}, options); - at::Tensor t1 = at::randn({32, 32, 128}, options); - at::Tensor cg_output = at::empty({32, 32, 128}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({t0, t1}, {cg_output}); -} - -TEST(NVFuserTest, FusionBCastInnerDim_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - 
TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - // reduce then broadcast - auto tv1 = sum(tv0, {0}); - auto tv2 = broadcast(tv1, {false, true}); - - TORCH_CHECK(!tv2->axis(0)->isReduction() && tv2->axis(1)->isBroadcast()); -} - -TEST(NVFuserTest, FusionBCastReduce_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - - auto tv1 = broadcast(tv0, {true, false, false}); - auto tv2 = sum(tv1, {1}); - TORCH_CHECK( - tv2->axis(0)->isBroadcast() && tv2->axis(1)->isReduction() && - !tv2->axis(2)->isBroadcast() && !tv2->axis(2)->isReduction()); -} - -// Multiple consumer reduction with computeAt -// https://github.com/csarofeen/pytorch/issues/110 -TEST(NVFuserTest, FusionReductionMultiConsumer_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = unaryOp(UnaryOpType::Exp, tv0); - auto tv2 = reductionOp(BinaryOpType::Max, {-1}, new Double(0), tv1); - auto tv3 = reductionOp(BinaryOpType::Min, {-1}, new Double(0), tv1); - auto tv4 = add(tv2, tv3); - fusion.addOutput(tv4); - tv1->computeAt(tv2, -1, ComputeAtMode::BestEffort); - - TORCH_CHECK(tv1->getComputeAtPosition() == 2); -} - -TEST(NVFuserTest, FusionComputeAtExprOrder1_CUDA) { - for (const auto i : c10::irange(2)) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv0, new Double(1)); - TensorView* tv3 = add(tv1, tv2); - // Set outputs tv2 or tv1 and then tv3 - if (i == 0) { - fusion.addOutput(tv2); - } else { - fusion.addOutput(tv1); - } - fusion.addOutput(tv3); - - if (i == 0) { - tv1->computeAt(tv3, -1); - } else { - tv2->computeAt(tv3, -1); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({100}, options); - std::vector aten_outputs = { - aten_input + 1, (aten_input + 1) * 2}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}); - - testValidate( - &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); - } -} - -TEST(NVFuserTest, FusionComputeAtExprOrder2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv0, new Double(1)); - TensorView* tv3 = add(tv1, tv2); - fusion.addOutput(tv3); - - tv3->split(-1, 32); - - tv1->computeAt(tv3, -1); - tv2->computeAt(tv3, -2); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({100, 100}, options); - auto aten_output = (aten_input + 1) * 2; - - at::Tensor cg_output = at::empty_like(aten_input, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({aten_input}, {cg_output}); - - testValidate( - &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionComputeAtExprOrder3_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const size_t dimx = 13; - const size_t dimy = 15; - - TensorView* tv0 = makeConcreteTensor({dimx, dimy}); - fusion.addInput(tv0); - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv1, new Double(2)); - TensorView* tv3 = add(tv2, new Double(3)); - TensorView* tv4 = add(tv3, new Double(4)); - TensorView* tv5 = mul(tv2, 
tv4); - fusion.addOutput(tv5); - - tv1->computeAt(tv2, 2); - tv3->computeAt(tv4, 1); - tv4->computeAt(tv5, 2); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({dimx, dimy}, options); - auto t1 = aten_input.add(1.); - auto t2 = t1.add(2.); - auto t3 = t2.add(3.); - auto t4 = t3.add(4.); - auto aten_output = t2.mul(t4); - - torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}); - - testValidate( - &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionZeroDimComputeAt_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = sum(tv0, {0}); - auto tv2 = add(tv1, new Double(1)); - fusion.addOutput(tv2); - TORCH_CHECK(tv2->nDims() == 0); - tv1->computeAt(tv2, 0); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({100}, options); - auto aten_output = aten_input.to(at::kDouble).sum() + 1; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}); - - testValidate( - &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionZeroDimBroadcast_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(0); - fusion.addInput(tv0); - - auto tv1 = broadcast(tv0, {true, true}); - TORCH_CHECK(tv1->nDims() == 2); - - TensorView* tv2 = makeSymbolicTensor(2); - fusion.addInput(tv2); - - auto tv3 = add(tv1, tv2); - auto tv4 = sum(tv3, {0, 1}); - fusion.addOutput(tv4); - - tv3->computeAt(tv4, -1); - tv3->axis(-2)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDy); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({}, options); - at::Tensor t1 = at::randn({10, 10}, options); - - auto aten_output = (t0.unsqueeze(-1).unsqueeze(-1).expand({10, 10}) + t1) - .to(at::kDouble) - .sum(); - - std::vector aten_inputs = {t0, t1}; - at::Tensor cg_output = at::empty({}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion(aten_inputs, {cg_output}); - - testValidate( - &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionZeroDimReduction_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int bdimx = 32; - const int gdimx = 32; - - TensorView* tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = sum(tv0, {0}); - fusion.addOutput(tv1); - - tv1->split(0, bdimx); - tv1->split(0, gdimx); - auto tv2 = tv1->rFactor({0}); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(-2)->parallelize(ParallelType::BIDx); - tv2->axis(-2)->parallelize(ParallelType::BIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({1000}, options); - auto aten_output = aten_input.to(at::kDouble).sum(); - - at::Tensor cg_output = at::empty({}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({aten_input}, {cg_output}); - - testValidate( - &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionBCastAfterReduce_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - const int tidx = 128; - - // Set up your input tensor views - TensorView* tv0 = 
makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = sum(tv0, {1}); - auto tv2 = broadcast(tv1, {false, true}); - - tv1->split(1, tidx); - auto tv3 = tv1->rFactor({-2}); - - TensorView* tv4 = makeSymbolicTensor(2); - fusion.addInput(tv4); - - auto tv5 = add(tv2, tv4); - fusion.addOutput(tv5); - tv5->split(1, tidx); - - tv3->computeAt(tv5, 1); - - tv2->split(1, tidx); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv5->axis(-1)->parallelize(ParallelType::TIDx); - - tv5->axis(0)->parallelize(ParallelType::BIDx); - - int x = 63, y = 200; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({x, y}, options); - at::Tensor t4 = at::randn({x, y}, options); - - auto t3 = t0.to(at::kDouble).sum({1}).unsqueeze(-1).expand({x, y}); - auto aten_output = t3.add(t4); - - std::vector aten_inputs = {t0, t4}; - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({t0, t4}); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionOutputBroadcast_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeConcreteTensor({2, 3}); - fusion.addInput(tv0); - - TensorView* tv1 = broadcast(tv0, {true, false, true, false, true}); - - fusion.addOutput(tv1); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn({2, 3}, options); - auto aten_output = aten_input.unsqueeze(2).unsqueeze(1).unsqueeze(0); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion({aten_input}); - - testValidate( - &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionReductionKeepDimBasic_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeConcreteTensor({2, 3, 4, 5, 6}); - fusion.addInput(tv0); - - TensorView* tv1 = sum(tv0, {0, 2, -1}, /*keep_dim=*/true); - - fusion.addOutput(tv1); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn({2, 3, 4, 5, 6}, options); - auto aten_output = - aten_input.to(at::kDouble).sum({0, 2, -1}, /*keepdim=*/true); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion({aten_input}); - - testValidate( - &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionReductionKeepDimScheduler_CUDA) { - constexpr int bid_x = 80; - constexpr int tid_x = 4096; - constexpr int red_dim = 1; - - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeConcreteTensor({bid_x, tid_x}); - fusion.addInput(tv0); - - TensorView* tv1 = reductionOp( - BinaryOpType::Add, {red_dim}, new Double(0), tv0, /*keep_dim=*/true); - - fusion.addOutput(tv1); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn({bid_x, tid_x}, options); - auto aten_output = - aten_input.to(at::kDouble).sum({red_dim}, /*keepdim=*/true); - - // Apply reduction heuristic - auto reduction_params = getReductionHeuristics(&fusion, {aten_input}); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - scheduleReduction(&fusion, reduction_params.value()); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto lparams = 
reduction_params.value().lparams; - - auto cg_outputs = fe.runFusion({aten_input}, lparams); - - testValidate( - &fusion, - cg_outputs, - {aten_input}, - {aten_output}, - __LINE__, - __FILE__, - "", - lparams); -} - -TEST(NVFuserTest, FusionSumTo_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - std::vector tensor_shape{2, 3, 4, 5, 6}; - std::vector sum_to_shape{1, 5, 6}; - - std::vector tensor_shape_ref{2, 3, 4, 5, 6}; - std::vector sum_to_shape_ref{1, 5, 6}; - - std::vector sum_to_symb; - std::transform( - sum_to_shape.begin(), - sum_to_shape.end(), - std::back_inserter(sum_to_symb), - [](int s) -> Int* { return new Int(s); }); - - TensorView* tv0 = makeConcreteTensor(tensor_shape); - fusion.addInput(tv0); - - TensorView* tv1 = sum_to(tv0, sum_to_symb); - fusion.addOutput(tv1); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn(tensor_shape_ref, options); - auto aten_output = at::sum_to(aten_input.to(at::kDouble), sum_to_shape_ref); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion({aten_input}); - - TORCH_CHECK( - cg_outputs[0].dim() == sum_to_shape.size(), - "sum_to not keeping the final dimension"); - - testValidate( - &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSumToNoop_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - std::vector tensor_shape{4, 5, 6}; - std::vector sum_to_shape{4, 5, 6}; - - std::vector tensor_shape_ref{4, 5, 6}; - std::vector sum_to_shape_ref{4, 5, 6}; - - std::vector sum_to_symb; - std::transform( - sum_to_shape.begin(), - sum_to_shape.end(), - std::back_inserter(sum_to_symb), - [](int s) -> Int* { return new Int(s); }); - - TensorView* tv0 = makeConcreteTensor(tensor_shape); - fusion.addInput(tv0); - - TensorView* tv1 = sum_to(tv0, sum_to_symb); - - // Dummy operator to avoid tv0 both input and output - TensorView* tv2 = add(tv1, new Double(0)); - fusion.addOutput(tv2); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn(tensor_shape_ref, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion({aten_input}); - auto aten_output = at::sum_to(aten_input.to(at::kDouble), sum_to_shape_ref); - - TORCH_CHECK( - cg_outputs[0].dim() == sum_to_shape.size(), - "sum_to not keeping the final dimension"); - - testValidate( - &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionReductionScheduler_CUDA) { - constexpr int bid_x = 80; - constexpr int tid_x = 4096; - constexpr int red_dim = 1; - - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - TensorView* tv1 = - reductionOp(BinaryOpType::Add, {red_dim}, new Double(0), tv0); - fusion.addOutput(tv1); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn({bid_x, tid_x}, options); - auto aten_output = aten_input.to(at::kDouble).sum({red_dim}); - - // Apply reduction heuristic - auto reduction_params = getReductionHeuristics(&fusion, {aten_input}); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - scheduleReduction(&fusion, reduction_params.value()); - - auto lparams = reduction_params.value().lparams; - - FusionExecutor fe; - fe.compileFusion(&fusion); - // no broadcasting needed, omitting the last 
optional argument; - auto cg_outputs = fe.runFusion({aten_input}, lparams); - - testValidate( - &fusion, - cg_outputs, - {aten_input}, - {aten_output}, - __LINE__, - __FILE__, - "", - lparams); -} - -// Simple reduction parallelized on a symbolic size. -TEST(NVFuserTest, FusionSymbolicReduction_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - // tv1[I0, R1] = tv0[I0, I1] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); - fusion.addOutput(tv1); - - // Interface should just be a direct split with a Parallel type. We can - // include the parallelize call if we do this. - tv1->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); - // tv1[I0, R1o, R1i{BIDx}] = tv0[I0, I1] - - TensorView* tv2 = tv1->rFactor({1}); - // tv2[I0, R1oo, Ir1oi{4}, Ir1i{BIDx}] = tv0[I0, I1] - // tv1[I0, R1oi{4}, R1i{BIDx}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{BIDx}] - - // Incrementally, can print in between for debugging - tv0->computeAt(tv2, 1); - tv2->computeAt(tv1, 1); - - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - - int numel_x = 65000; - int numel_y = 1025; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({numel_x, numel_y}, options); - auto aten_output = aten_input.to(at::kDouble).sum({1}); - - // How many threads to use for the block reduction - int runtime_threadIdx_dim = 128; - - LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}, lparams); - - testValidate( - &fusion, - cg_outputs, - {aten_input}, - {aten_output}, - __LINE__, - __FILE__, - "", - lparams); -} - -TEST(NVFuserTest, FusionReductionSchedulerMultiDimNonFastest_CUDA) { - const std::vector red_dims = {0, 2}; - // Copy is because CodeGen requires int and Pytorch requires int64_t - // for a vector of reduction dimensions - const std::vector red_dims64 = {0, 2}; - const std::vector tensor_dims_in = {5, 10, 15, 20}; - const std::vector tensor_dims_out = {10, 20}; - - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(tensor_dims_in.size()); - fusion.addInput(tv0); - - TensorView* tv1 = - reductionOp(BinaryOpType::Add, red_dims, new Double(0), tv0); - fusion.addOutput(tv1); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn(tensor_dims_in, options); - auto aten_output = aten_input.to(at::kDouble).sum(red_dims64); - at::Tensor cg_output = at::empty(tensor_dims_out, options); - - // Apply reduction heuristic - auto reduction_params = getReductionHeuristics(&fusion, {aten_input}); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - scheduleReduction(&fusion, reduction_params.value()); - auto lparams = reduction_params.value().lparams; - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({aten_input}, {cg_output}, lparams); - - testValidate( - &fusion, - {cg_output}, - {aten_input}, - {aten_output}, - __LINE__, - __FILE__, - "", - lparams); -} - -TEST(NVFuserTest, FusionReductionSchedulerMultiDimFastest_CUDA) { - const std::vector red_dims = {1, 3}; - // Copy is because CodeGen requires int and Pytorch requires int64_t - // for a vector of reduction dimensions - 
const std::vector red_dims64 = {1, 3}; - const std::vector tensor_dims_in = {5, 10, 15, 20}; - - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(tensor_dims_in.size()); - fusion.addInput(tv0); - - TensorView* tv1 = - reductionOp(BinaryOpType::Add, red_dims, new Double(0), tv0); - fusion.addOutput(tv1); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn(tensor_dims_in, options); - auto aten_output = aten_input.to(at::kDouble).sum(red_dims64); - - auto reduction_params = getReductionHeuristics(&fusion, {aten_input}); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - scheduleReduction(&fusion, reduction_params.value()); - auto lparams = reduction_params.value().lparams; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}, lparams); - - testValidate( - &fusion, - cg_outputs, - {aten_input}, - {aten_output}, - __LINE__, - __FILE__, - "", - lparams); -} - -TEST(NVFuserTest, FusionReductionSchedulerNoODimShmoo_CUDA) { - std::vector dtypes = { - DataType::Double, DataType::Float, DataType::Half}; -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - if (at::cuda::getDeviceProperties(0)->major >= 8) { - dtypes.insert(dtypes.end(), DataType::BFloat16); - } -#endif - - std::vector red_dims; - - // Tried to cut down the number iterations with just - // doing every other power of 2. - for (int i = 1; i <= 1024 * 1024; i <<= 2) { - red_dims.push_back(i); - } - - for (auto dtype : dtypes) { - at::ScalarType aten_dtype = data_type_to_aten(dtype); - for (auto& rdim : red_dims) { - Fusion fusion; - FusionGuard fg(&fusion); - - bool is_fp16 = dtype == DataType::Half; - bool is_bf16 = dtype == DataType::BFloat16; - - TensorView* tv0 = makeSymbolicTensor(1, dtype); - fusion.addInput(tv0); - - TensorView* tv0_cast = tv0; - if (is_fp16 || is_bf16) { - tv0_cast = castOp(DataType::Float, tv0); - } - - TensorView* tv1 = sum(tv0_cast, {0}); - - TensorView* tv1_cast = tv1; - if (is_fp16) { - tv1_cast = castOp(DataType::Half, tv1); - } - if (is_bf16) { - tv1_cast = castOp(DataType::BFloat16, tv1); - } - - fusion.addOutput(tv1_cast); - - auto options = at::TensorOptions().dtype(aten_dtype).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn({rdim}, options); - auto aten_output = aten_input.to(at::kDouble).sum({0}); - - auto reduction_params = getReductionHeuristics(&fusion, {aten_input}); - TORCH_CHECK(reduction_params.has_value(), "Reduction is not found!"); - scheduleReduction(&fusion, reduction_params.value()); - auto lparams = reduction_params.value().lparams; - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion({aten_input}, lparams); - - testValidate( - &fusion, - cg_outputs, - {aten_input}, - {aten_output}, - __LINE__, - __FILE__, - "", - lparams); - } - } -} - -TEST(NVFuserTest, FusionReductionSchedulerDimShmoo_CUDA) { - std::vector dtypes = { - DataType::Double, DataType::Float, DataType::Half}; -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - if (at::cuda::getDeviceProperties(0)->major >= 8) { - dtypes.insert(dtypes.end(), DataType::BFloat16); - } -#endif - - std::vector red_axis = {1, 0}; - std::vector output_dims = {160, 320}; - std::vector red_dims; - - // Tried to cut down the number iterations with just - // doing every other power of 2. 
- for (int i = 1; i <= 1024 * 1024; i <<= 2) { - red_dims.push_back(i); - } - - for (auto dtype : dtypes) { - at::ScalarType aten_dtype = data_type_to_aten(dtype); - for (auto& axis : red_axis) { - for (auto& odim : output_dims) { - for (auto& rdim : red_dims) { - Fusion fusion; - FusionGuard fg(&fusion); - - bool is_fp16 = dtype == DataType::Half; - bool is_bf16 = dtype == DataType::BFloat16; - - TensorView* tv0 = makeSymbolicTensor(2, dtype); - fusion.addInput(tv0); - - TensorView* tv0_cast = tv0; - if (is_fp16 || is_bf16) { - tv0_cast = castOp(DataType::Float, tv0); - } - - TensorView* tv1 = sum(tv0_cast, {axis}); - - TensorView* tv1_cast = tv1; - if (is_fp16) { - tv1_cast = castOp(DataType::Half, tv1); - } - if (is_bf16) { - tv1_cast = castOp(DataType::BFloat16, tv1); - } - fusion.addOutput(tv1_cast); - - auto options = - at::TensorOptions().dtype(aten_dtype).device(at::kCUDA, 0); - - at::Tensor aten_input = - (axis ? at::randn({odim, rdim}, options) - : at::randn({rdim, odim}, options)); - - auto reduction_params = getReductionHeuristics(&fusion, {aten_input}); - TORCH_CHECK(reduction_params.has_value(), "Reduction is not found!"); - scheduleReduction(&fusion, reduction_params.value()); - auto lparams = reduction_params.value().lparams; - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion({aten_input}, lparams); - auto aten_output = aten_input.to(at::kDouble).sum({axis}); - testValidate( - &fusion, - cg_outputs, - {aten_input}, - {aten_output}, - __LINE__, - __FILE__, - "", - lparams); - } - } - } - } -} - -TEST(NVFuserTest, FusionCacheBefore_CUDA) { - // TVM Cache Write - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = add(tv0, new Double(1.0)); - TensorView* tv2 = mul(tv1, new Double(3.0)); - fusion.addInput(tv0); - fusion.addOutput(tv2); - - // Before: TV2 = TV1 * 3 - // After: TV3 = TV1 * 3; - // TV2 = TV3; - TensorView* tv3 = tv2->cache_before(); - - constexpr int BSX = 32; - tv2->split(-1, BSX); - tv0->computeAt(tv2, -1); - - // Thread and Block binding - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - constexpr int M = 32, N = 750; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({M, N}, options); - at::Tensor aten_output = (aten_input + 1.0) * 3.0; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}); - - testValidate( - &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionCacheAfter_CUDA) { - // TVM Cache Read - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = add(tv0, new Double(1.0)); - TensorView* tv2 = mul(tv1, new Double(3.0)); - fusion.addInput(tv0); - fusion.addOutput(tv2); - - // Before: TV1 = TV0 + 1 - // After: TV3 = TV0; - // TV1 = TV3 + 1 - TensorView* tv3 = tv0->cache_after(); - - constexpr int BSX = 32; - tv2->split(-1, BSX); - tv0->computeAt(tv2, -1); - - // Thread and Block binding - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - constexpr int M = 32, N = 457; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({M, N}, options); - at::Tensor aten_output = (aten_input + 1.0) * 3.0; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}); - - 
testValidate( - &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionCacheFork_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = add(tv0, new Double(1.0)); - TensorView* tv2 = mul(tv1, new Double(3.0)); - fusion.addInput(tv0); - fusion.addOutput(tv1); - fusion.addOutput(tv2); - // Before: TV1 = TV0 + 1 - // TV2 = TV1 * 3 - // Output: TV1, TV2 - - // After: TV1 = TV0 + 1 - // TV3 = TV1 - // TV2 = TV1 * 3 - // Output: TV3, TV2 - - // cache_fork !!does not!! automatically apply ComputeAt to the cache - auto tv3 = tv1->cache_fork(); - - constexpr int BSX = 32; - tv2->split(-1, BSX); - tv0->computeAt(tv2, -1); - - // Thread and Block binding - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - constexpr int M = 32, N = 457; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({M, N}, options); - at::Tensor aten_output1 = aten_input + 1.0; - at::Tensor aten_output2 = aten_output1 * 3.0; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}); - - testValidate( - &fusion, - cg_outputs, - {aten_input}, - {aten_output1, aten_output2}, - __LINE__, - __FILE__); -} - -TEST(NVFuserTest, FusionCacheIndirect_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = makeSymbolicTensor(2); - TensorView* tv2 = makeSymbolicTensor(2); - TensorView* tv3 = makeSymbolicTensor(2); - TensorView* tv4 = sub(tv2, tv3); - TensorView* tv5 = add(tv1, tv4); - TensorView* tv6 = sub(tv5, tv0); - fusion.addInput(tv0); - fusion.addInput(tv1); - fusion.addInput(tv2); - fusion.addInput(tv3); - fusion.addOutput(tv6); - // t6 = ((t1 + (t2 - t3)) - t0) - - tv5->cache_after(); - tv5->cache_before(); - - // cache_after on inputs placed before schedule - constexpr int BSX = 32; - tv6->split(-1, BSX); - tv2->computeAt(tv6, -1); - - // Thread and Block binding - tv6->axis(0)->parallelize(ParallelType::BIDx); - tv6->axis(-1)->parallelize(ParallelType::TIDx); - - constexpr int M = 32, N = 810; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({M, N}, options); - at::Tensor t1 = at::randn({M, N}, options); - at::Tensor t2 = at::randn({M, N}, options); - at::Tensor t3 = at::randn({M, N}, options); - - std::vector<IValue> aten_inputs = {t0, t1, t2, t3}; - at::Tensor aten_output = (t1 + (t2 - t3)) - t0; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionCacheBcast_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Algorithm - TensorView* tv0 = makeSymbolicTensor(1); // (M, 1) - TensorView* tv1 = broadcast(tv0, {false, true}); - TensorView* tv2 = makeSymbolicTensor(1); // (1, N) - TensorView* tv3 = broadcast(tv2, {true, false}); - TensorView* tv4 = mul(tv1, tv3); - fusion.addInput(tv0); - fusion.addInput(tv2); - fusion.addOutput(tv4); - - // Case 1 - tv0->cache_after(); - - // Case 2 - tv1->cache_before(); - - // Case 3 - tv1->cache_after(); - - // Case 4 - TensorView* tv8 = tv4->cache_before(); - - constexpr int BSX = 128; - tv4->split(0, BSX); - tv4->split(-1, BSX); - tv4->reorder({{0, 0}, {1, 2}, {2, 1}, {3, 3}}); - // M/BSX, N/BSX, BSX, BSX - tv0->computeAt(tv4, 2); - tv2->computeAt(tv4, 2); -
// 0, 1 | 2, 3, 4 - - tv4->axis(0)->parallelize(ParallelType::BIDx); - tv4->axis(1)->parallelize(ParallelType::BIDy); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - // Manual Replay on TV3 - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv8->axis(-1)->parallelize(ParallelType::TIDx); - - constexpr int M = 92, N = 500; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({M}, options); - at::Tensor t1 = at::randn({N}, options); - std::vector aten_inputs = {t0, t1}; - at::Tensor aten_output = - t0.to(at::kDouble).unsqueeze(1).matmul(t1.to(at::kDouble).unsqueeze(0)); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionCacheMultiConsumer_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(1); - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv1, new Double(2)); - TensorView* tv3 = add(tv0, new Double(1)); - TensorView* tv4 = add(tv3, new Double(2)); - - fusion.addInput(tv0); - fusion.addOutput(tv2); - fusion.addOutput(tv4); - - auto tv5 = tv1->cache_before(); - auto tv6 = tv3->cache_before(); - tv5->setMemoryType(MemoryType::Shared); - tv6->setMemoryType(MemoryType::Shared); - - tv1->computeAt(tv2, -1); - tv3->computeAt(tv4, -1); - - // Fails because tensor must be recomputed twice - // auto tv7 = tv0->cache_after(); - - constexpr int N = 800; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({N}, options); - auto aten_output = (aten_input + 1) + 2; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}); - - testValidate( - &fusion, - cg_outputs, - {aten_input}, - {aten_output, aten_output}, - __LINE__, - __FILE__); -} - -TEST(NVFuserTest, FusionSmem_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Algorithm - TensorView* tv0 = makeSymbolicTensor(2); // (M, N) - TensorView* tv1 = makeSymbolicTensor(2); // (M, N) - TensorView* tv2 = mul(tv0, tv1); - fusion.addInput(tv0); - fusion.addInput(tv1); - fusion.addOutput(tv2); - - // Schedule - TensorView* tv3 = tv0->cache_after(); - TensorView* tv4 = tv1->cache_after(); - tv3->setMemoryType(MemoryType::Shared); - tv4->setMemoryType(MemoryType::Shared); - - constexpr int BSY = 32; - constexpr int BSX = 128; - tv2->split(0, BSY); - tv2->split(2, BSX); - // M/BSX, BSX, N/BSX, BSX - tv2->reorder({{0, 0}, {1, 2}, {2, 1}, {3, 3}}); - // M/BSX, N/BSX, BSX, BSX - - tv0->computeAt(tv2, 2); - tv1->computeAt(tv2, 2); - - // Thread and Block binding - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(1)->parallelize(ParallelType::BIDy); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - // Manual Binding - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - - constexpr int M = 128, N = 10240; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({M, N}, options); - at::Tensor t1 = at::randn({M, N}, options); - at::Tensor aten_output = mul(t0, t1); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({t0, t1}); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); - - TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); -} - 
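The FusionSmem test above stages both of its inputs in shared memory: each input gets a cache_after() copy whose memory type is set to MemoryType::Shared, the output is tiled, the caches are inlined at the tile level with computeAt, and the innermost tile axis of both the output and the caches is bound to TIDx. The sketch below condenses just that caching-and-binding skeleton, reusing the same NVFuser test helpers and namespaces as the surrounding tests, with the tile sizes inlined and input allocation plus validation omitted; it is an illustrative sketch, not code from the original file.

  Fusion fusion;
  FusionGuard fg(&fusion);

  // Elementwise product of two (M, N) inputs, as in FusionSmem.
  TensorView* tv0 = makeSymbolicTensor(2);
  TensorView* tv1 = makeSymbolicTensor(2);
  TensorView* tv2 = mul(tv0, tv1);
  fusion.addInput(tv0);
  fusion.addInput(tv1);
  fusion.addOutput(tv2);

  // Stage both inputs through shared-memory copies.
  TensorView* tv3 = tv0->cache_after();
  TensorView* tv4 = tv1->cache_after();
  tv3->setMemoryType(MemoryType::Shared);
  tv4->setMemoryType(MemoryType::Shared);

  // Tile the output to [M/32, N/128, 32, 128] and inline the caches at the
  // tile level, so each block stages one 32x128 tile of each input.
  tv2->split(0, 32);
  tv2->split(2, 128);
  tv2->reorder({{0, 0}, {1, 2}, {2, 1}, {3, 3}});
  tv0->computeAt(tv2, 2);
  tv1->computeAt(tv2, 2);

  // One block per tile; threads cover the innermost tile axis, and the
  // shared-memory copies follow the same thread binding.
  tv2->axis(0)->parallelize(ParallelType::BIDx);
  tv2->axis(1)->parallelize(ParallelType::BIDy);
  tv2->axis(-1)->parallelize(ParallelType::TIDx);
  tv3->axis(-1)->parallelize(ParallelType::TIDx);
  tv4->axis(-1)->parallelize(ParallelType::TIDx);

The TORCH_CHECK on war_hazard_syncs_count at the end of FusionSmem then asserts that this schedule lowers without any extra write-after-read hazard syncs on those shared buffers.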
-TEST(NVFuserTest, FusionSmemReduce_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Algorithm - TensorView* tv0 = makeSymbolicTensor(3); // M, K, N - TensorView* tv1 = sum(tv0, {1}); // M, R, N - fusion.addInput(tv0); - fusion.addOutput(tv1); - - TensorView* tv2 = tv0->cache_after(); - tv2->setMemoryType(MemoryType::Shared); - - // Schedule - constexpr int BSX = 32; - tv1->split(2, BSX); - tv1->split(1, 128); - tv1->split(0, BSX); - // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX - tv1->reorder({{0, 0}, {1, 2}, {2, 4}, {3, 5}, {4, 1}, {5, 3}}); - TensorView* tv3 = tv1->rFactor({-2}); - - tv0->computeAt(tv1, -2); - tv0->computeAt(tv3, -2); - - // Thread and Block binding - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(1)->parallelize(ParallelType::BIDy); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - // Manual Binding - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - constexpr int M = 154, K = 45, N = 1524; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({M, K, N}, options); - at::Tensor aten_output = sum(aten_input.to(at::kDouble), {1}); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}); - - testValidate( - &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); - TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 1); -} - -TEST(NVFuserTest, FusionSmemBlockGemm_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Algorithm - TensorView* tv0 = makeSymbolicTensor(2); // (M, K) - TensorView* tv1 = makeSymbolicTensor(2); // (K, N) - TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B) - TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N) - TensorView* tv4 = mul(tv2, tv3); // M, K, N - TensorView* tv5 = sum(tv4, {1}); // M, R, N - fusion.addInput(tv0); - fusion.addInput(tv1); - fusion.addOutput(tv5); - - // Schedule - constexpr int BSX = 16; - tv5->split(2, BSX); - tv5->split(1, BSX); - tv5->split(0, BSX); - // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX - tv5->reorder({{0, 0}, {1, 3}, {2, 2}, {3, 5}, {4, 1}, {5, 4}}); - // M/BSX, N/BSX, K/BSX, MSX, NSX, KSX - TensorView* tv6 = tv5->rFactor({-1}); - - tv2->setMemoryType(MemoryType::Shared); - tv3->setMemoryType(MemoryType::Shared); - tv4->setMemoryType(MemoryType::Shared); - tv6->setMemoryType(MemoryType::Shared); - - tv0->computeAt(tv5, 3); - tv1->computeAt(tv5, 3); - - // Thread and Block binding - tv5->axis(0)->parallelize(ParallelType::BIDx); - tv5->axis(1)->parallelize(ParallelType::BIDy); - tv5->axis(-2)->parallelize(ParallelType::TIDy); - tv5->axis(-1)->parallelize(ParallelType::TIDx); - // Manual Binding - tv2->axis(-3)->parallelize(ParallelType::TIDy); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(-3)->parallelize(ParallelType::TIDy); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - tv6->axis(-3)->parallelize(ParallelType::TIDy); - tv6->axis(-2)->parallelize(ParallelType::TIDx); - - // Make sure BIDx is marked as exact (see issue #1119) - GpuLower gpulw(&fusion); - TORCH_CHECK(gpulw.parallelDimensionMap().isExact(ParallelType::BIDx)); - - constexpr int M = 154, K = 45, N = 1524; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({M, K}, options); - at::Tensor t1 = at::randn({K, N}, options); - - std::vector<IValue> aten_inputs = {t0, t1}; - at::Tensor aten_output =
matmul(t0.to(at::kDouble), t1.to(at::kDouble)); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({t0, t1}); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); - - TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); -} - -TEST(NVFuserTest, FusionSmemBlockGemmCache_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Algorithm - TensorView* tv0 = makeSymbolicTensor(2); // (M, K) - TensorView* tv1 = makeSymbolicTensor(2); // (K, N) - TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B) - TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N) - TensorView* tv4 = mul(tv2, tv3); // M, K, N - TensorView* tv5 = sum(tv4, {1}); // M, R, N - fusion.addInput(tv0); - fusion.addInput(tv1); - fusion.addOutput(tv5); - - // Schedule - // Remove reduction axis from tv5 - // tv6 = (M, R, N) - // tv5 = (M, N) - TensorView* tv6 = tv5->cache_before(); - - constexpr int BSX = 16; - tv5->split(1, BSX); - tv5->split(0, BSX); - // M/BSX, BSX, N/BSX, BSX - tv5->reorder({{0, 0}, {1, 2}, {2, 1}, {3, 3}}); - // tv5 = M/BSX, N/BSX, MSX, NSX - - tv6->computeAt(tv5, 2); - tv6->computeAt(tv5, 2); - - tv6->split(-1, BSX); - // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX - tv6->reorder({{0, 0}, {1, 1}, {2, 3}, {3, 4}, {4, 2}, {5, 5}}); - // M/BSX, N/BSX, K/BSX, MSX, NSX, KSX - TensorView* tv7 = tv6->rFactor({-1}); - // tv7 = M/BSX, N/BSX, K/BSXrf, MSX, NSX, KSXr - // tv6 = M/BSX, N/BSX, K/BSXr, MSX, NSX - - tv0->computeAt(tv6, 3); - tv1->computeAt(tv6, 3); - - tv0->computeAt(tv7, 3); - tv1->computeAt(tv7, 3); - - tv2->setMemoryType(MemoryType::Shared); - tv3->setMemoryType(MemoryType::Shared); - tv4->setMemoryType(MemoryType::Shared); - tv6->setMemoryType(MemoryType::Shared); - tv7->setMemoryType(MemoryType::Shared); - // Memory Type - - // Thread and Block binding - tv5->axis(0)->parallelize(ParallelType::BIDx); - tv5->axis(1)->parallelize(ParallelType::BIDy); - tv5->axis(-2)->parallelize(ParallelType::TIDy); - tv5->axis(-1)->parallelize(ParallelType::TIDx); - // Manual Binding - tv2->axis(-3)->parallelize(ParallelType::TIDy); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(-3)->parallelize(ParallelType::TIDy); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - - tv7->axis(-3)->parallelize(ParallelType::TIDy); - tv7->axis(-2)->parallelize(ParallelType::TIDx); - - tv6->axis(-2)->parallelize(ParallelType::TIDy); - tv6->axis(-1)->parallelize(ParallelType::TIDx); - - constexpr int M = 154, K = 45, N = 1524; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({M, K}, options); - at::Tensor t1 = at::randn({K, N}, options); - at::Tensor aten_output = matmul(t0.to(at::kDouble), t1.to(at::kDouble)); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); - - TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); -} - -TEST(NVFuserTest, FusionSmemDynamicPersistentSoftmax2D_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* x = makeSymbolicTensor(2); - fusion.addInput(x); - TensorView* max_val = reductionOp( - BinaryOpType::Max, - {-1}, - new Double(std::numeric_limits::lowest()), - x); // (M) - TensorView* bcast_max = broadcast(max_val, {false, true}); // (M, B) - TensorView* x_max_sub = sub(x, bcast_max); 
// (M, N) - TensorView* exp = unaryOp(UnaryOpType::Exp, x_max_sub); // (M, N) - TensorView* sum_exp = sum(exp, {-1}); // (M, R) - TensorView* bcast_sum = broadcast(sum_exp, {false, true}); // (M, B) - TensorView* softmax = div(exp, bcast_sum); // (M, N) - fusion.addOutput(softmax); - - // Read Input into Shared Memory - // Load Input + Pwise into shared memory - auto cache_x = x->cache_after(); - cache_x->setMemoryType(MemoryType::Shared); - exp->setMemoryType(MemoryType::Shared); - - std::vector<TensorView*> all_tensors( - {x, - cache_x, - max_val, - bcast_max, - x_max_sub, - exp, - sum_exp, - bcast_sum, - softmax}); - - auto tidx = new Int(); - fusion.addInput(tidx); - - for (auto tensor : all_tensors) { - tensor->split(-1, tidx); - } - - auto sum_exp_rf = sum_exp->rFactor({1}); - all_tensors.push_back(sum_exp_rf); - - // computeAt - x->computeAt(x_max_sub, 1); - exp->computeAt(softmax, 1); - x_max_sub->computeAt(exp, 2); - - softmax->axis(0)->parallelize(ParallelType::BIDx); - for (auto tensor : all_tensors) { - tensor->axis(-1)->parallelize(ParallelType::TIDx); - } - - const size_t dimx = 1024; - const size_t dimy = 4096; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({dimx, dimy}, options); - auto aten_output = at::_softmax(aten_input.to(at::kDouble), -1, false); - - torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input, 128}); - - testValidate( - &fusion, - cg_outputs, - {aten_input, 128}, - {aten_output}, - __LINE__, - __FILE__); -} - -TEST(NVFuserTest, FusionMagicSchedulerSoftmax_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int kReductionAxis = 3; - std::vector<int64_t> input_shape{10, 10, 10, 67}; - TensorView* input = makeSymbolicTensor(input_shape.size()); - fusion.addInput(input); - - auto output = softmax(input, kReductionAxis); - - fusion.addOutput(output); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn(input_shape, options); - auto aten_output = - at::_softmax(aten_input.to(at::kDouble), kReductionAxis, false); - - auto reduction_params = getPersistentHeuristics(&fusion, {aten_input}); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - - schedulePersistentKernel(&fusion, reduction_params.value()); - - auto lparams = reduction_params.value().lparams; - - torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}, lparams); - - testValidate( - &fusion, - cg_outputs, - {aten_input}, - {aten_output}, - __LINE__, - __FILE__, - "", - lparams); -} - -TEST(NVFuserTest, TestMaskSoftmax_CUDA) { - // This test exercises the use of all padding tokens - // with softmax, as BERT might see with a fully padded - // sequence.
- Fusion fusion; - FusionGuard fg(&fusion); - - const int kReductionAxis = 3; - std::vector<int64_t> input_shape{256, 16, 128, 128}; - TensorView* input = makeSymbolicTensor(input_shape.size()); - TensorView* mask = makeSymbolicTensor(input_shape.size()); - fusion.addInput(input); - fusion.addInput(mask); - - auto out1 = add(input, mask); - auto output = softmax(out1, kReductionAxis); - - fusion.addOutput(output); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn(input_shape, options); - at::Tensor aten_mask = at::ones(input_shape, options); - // -10,000 is used here as a magic number because the padding - // tokens need a value that contributes close to zero - // so as not to influence the softmax. BERT, in particular, does - // not use -Infinity because a sequence made up entirely of - // padding tokens would then make the softmax divide by - // zero and produce a NaN result. - aten_mask = aten_mask * -10000.0; - auto aten_out1 = aten_input + aten_mask; - auto aten_output = at::_softmax(aten_out1, kReductionAxis, false); - - auto reduction_params = - getPersistentHeuristics(&fusion, {aten_input, aten_mask}); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - - schedulePersistentKernel(&fusion, reduction_params.value()); - - auto lparams = reduction_params.value().lparams; - - torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input, aten_mask}, lparams); - - testValidate( - &fusion, - cg_outputs, - {aten_input, aten_mask}, - {aten_output}, - __LINE__, - __FILE__, - "", - lparams); -} - -TEST(NVFuserTest, FusionMagicSchedulerLayerNormBackward_CUDA) { - std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>(); - Fusion& fusion = *fusion_ptr.get(); - FusionGuard fg(&fusion); - - std::vector<int64_t> shape{20, 100, 35, 67}; - std::vector<int64_t> norm_shape{67}; - - const size_t kM = shape.size(); - const size_t kN = norm_shape.size(); - const size_t kOuterNumDims = kM - kN; - - std::vector<int64_t> outer_shape; - for (const auto idx : c10::irange(kOuterNumDims)) { - outer_shape.push_back(shape[idx]); - } - for (const auto idx : c10::irange(kOuterNumDims, kM)) { - outer_shape.push_back(1); - } - - auto grad_out = makeSymbolicTensor(shape.size()); - auto input = makeSymbolicTensor(shape.size()); - auto mean = makeConcreteTensor(outer_shape); - auto rstd = makeConcreteTensor(outer_shape); - auto weight = makeSymbolicTensor(norm_shape.size()); - auto bias = makeSymbolicTensor(norm_shape.size()); - fusion.addInput(grad_out); - fusion.addInput(input); - fusion.addInput(mean); - fusion.addInput(rstd); - fusion.addInput(weight); - fusion.addInput(bias); - - auto grads = layer_norm_backward( - grad_out, - input, - norm_shape, - mean, - rstd, - weight, - bias, - {true, true, true}); - - fusion.addOutput(grads.grad_input); - fusion.addOutput(grads.grad_weight); - fusion.addOutput(grads.grad_bias); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_grad_out = at::randn(shape, options); - at::Tensor aten_input = at::randn(shape, options); - at::Tensor aten_weight = at::randn(norm_shape, options); - at::Tensor aten_bias = at::randn(norm_shape, options); - auto at_weight = c10::optional<at::Tensor>(aten_weight); - auto at_bias = c10::optional<at::Tensor>(aten_bias); - - const float kEps = 1e-5; - auto aten_results = - at::native_layer_norm(aten_input, norm_shape, at_weight, at_bias, kEps); - auto aten_output = std::get<0>(aten_results); - auto aten_mean =
std::get<1>(aten_results); - auto aten_rstd = std::get<2>(aten_results); - - FusionExecutorCache fec(std::move(fusion_ptr)); - std::vector aten_inputs = { - aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias}; - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); - - auto aten_gradients = at::native_layer_norm_backward( - aten_grad_out.to(at::kDouble), - aten_input.to(at::kDouble), - norm_shape, - aten_mean.to(at::kDouble), - aten_rstd.to(at::kDouble), - c10::optional(aten_weight.to(at::kDouble)), - c10::optional(aten_bias.to(at::kDouble)), - {true, true, true}); - - testValidate( - &fusion, - cg_outputs, - aten_inputs, - {std::get<0>(aten_gradients), - std::get<1>(aten_gradients), - std::get<2>(aten_gradients)}, - __LINE__, - __FILE__); -} - -TEST(NVFuserTest, FusionMagicSchedulerLayerNormalization_CUDA) { - std::unique_ptr fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); - FusionGuard fg(&fusion); - - const float kEps = 1e-5; - Double* eps_ptr = new Double(kEps); - - std::vector input_shape{20, 100, 35, 67}; - std::vector norm_shape{67}; - - auto input = makeSymbolicTensor(input_shape.size()); - fusion.addInput(input); - - auto result = layer_norm(input, norm_shape, nullptr, nullptr, eps_ptr); - - fusion.addOutput(result.output); - fusion.addOutput(result.mean); - fusion.addOutput(result.invstd); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn(input_shape, options); - c10::optional aten_weight = c10::nullopt; - c10::optional aten_bias = c10::nullopt; - auto aten_outputs = at::native_layer_norm( - aten_input, norm_shape, aten_weight, aten_bias, kEps); - - // Check reduction axis is same for all reductions - // Generate Launch Parameters - auto reduction_params = getPersistentHeuristics(&fusion, {aten_input}); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - - schedulePersistentKernel(&fusion, reduction_params.value()); - auto lparams = reduction_params.value().lparams; - - torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}, lparams); - - testValidate( - &fusion, - cg_outputs, - {aten_input}, - {std::get<0>(aten_outputs), - std::get<1>(aten_outputs), - std::get<2>(aten_outputs)}, - __LINE__, - __FILE__, - "", - lparams); -} - -TEST(NVFuserTest, FusionMagicSchedulerBatchNormalization_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 7) { - return; - } - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - const float kMomentum = 0.1; - const float kEps = 1e-5; - const bool kTraining = true; - std::vector input_shape{20, 100, 35, 45}; - - auto input = makeSymbolicTensor(input_shape.size()); - auto weight = makeSymbolicTensor(1); - auto bias = makeSymbolicTensor(1); - auto running_mean = makeSymbolicTensor(1); - auto running_var = makeSymbolicTensor(1); - fusion->addInput(input); - fusion->addInput(weight); - fusion->addInput(bias); - fusion->addInput(running_mean); - fusion->addInput(running_var); - - Double* momentum = new Double(kMomentum); - Double* eps = new Double(kEps); - - auto result = batch_norm( - input, weight, bias, running_mean, running_var, kTraining, momentum, eps); - - fusion->addOutput(result.output); - fusion->addOutput(result.mean); - fusion->addOutput(result.invstd); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto at_input = at::randn(input_shape, options); - auto at_weight = at::ones({input_shape[1]}, 
options); - auto at_bias = at::zeros({input_shape[1]}, options); - auto at_run_mean = at::zeros({input_shape[1]}, options); - auto at_run_var = at::ones({input_shape[1]}, options); - - std::vector aten_inputs = { - at_input, at_weight, at_bias, at_run_mean, at_run_var}; - - FusionExecutorCache executor_cache(std::move(fusion)); - - auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); - - auto aten_outputs = at::native_batch_norm( - at_input, - c10::optional(at_weight), - c10::optional(at_bias), - c10::optional(at_run_mean), - c10::optional(at_run_var), - kTraining, - kMomentum, - kEps); - - testValidate( - executor_cache.fusion(), - cg_outputs, - aten_inputs, - {at_run_mean, - at_run_var, - std::get<0>(aten_outputs), - std::get<1>(aten_outputs), - std::get<2>(aten_outputs)}, - __LINE__, - __FILE__, - ""); -} - -TEST(NVFuserTest, FusionPersistentSoftmaxLocalSmem_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int pixels_per_thread = 64; - const int TIDX = 128; - const int static_size = pixels_per_thread * TIDX; - - TensorView* sx = makeConcreteTensor({-1, static_size}); - TensorView* dx = makeSymbolicTensor(2); - fusion.addInput(sx); - fusion.addInput(dx); - - TensorView* max_sx = reductionOp( - BinaryOpType::Max, - {-1}, - new Double(std::numeric_limits::lowest()), - sx); // (M) - TensorView* max_dx = reductionOp( - BinaryOpType::Max, - {-1}, - new Double(std::numeric_limits::lowest()), - dx); // (M) - - // Reduction => merge local and shared memory TensorViews - TensorView* max_val = binaryOp(BinaryOpType::Max, max_sx, max_dx); - TensorView* bcast_max = broadcast(max_val, {false, true}); // (M, B) - - TensorView* sx_max_sub = sub(sx, bcast_max); // (M, N) - TensorView* dx_max_sub = sub(dx, bcast_max); // (M, N) - - TensorView* sx_exp = unaryOp(UnaryOpType::Exp, sx_max_sub); // (M, N) - TensorView* dx_exp = unaryOp(UnaryOpType::Exp, dx_max_sub); // (M, N) - - TensorView* sx_sum_exp = sum(sx_exp, {-1}); // (M, R) - TensorView* dx_sum_exp = sum(dx_exp, {-1}); // (M, R) - - // Reduction => merge local and shared memory TensorViews - TensorView* sum_exp = binaryOp(BinaryOpType::Add, sx_sum_exp, dx_sum_exp); - TensorView* bcast_sum = broadcast(sum_exp, {false, true}); // (M, B) - - TensorView* sx_softmax = div(sx_exp, bcast_sum); // (M, N) - TensorView* dx_softmax = div(dx_exp, bcast_sum); // (M, N) - fusion.addOutput(sx_softmax); - fusion.addOutput(dx_softmax); - - auto sx_cache = sx->cache_after(); - auto dx_cache = dx->cache_after(); - dx_cache->setMemoryType(MemoryType::Shared); - dx_exp->setMemoryType(MemoryType::Shared); - - // Reduction and Broadcast Tensors common to both memory TVs - std::vector common_tensors( - {max_val, sum_exp, bcast_max, bcast_sum}); - - // Static Local Memory TVs - std::vector static_tensors( - {sx, sx_cache, max_sx, sx_max_sub, sx_exp, sx_sum_exp, sx_softmax}); - - // Dynamic Local Memory TVs - std::vector dynamic_tensors( - {dx, dx_cache, max_dx, dx_max_sub, dx_exp, dx_sum_exp, dx_softmax}); - - std::vector all_tensors; - all_tensors.insert( - all_tensors.end(), common_tensors.begin(), common_tensors.end()); - all_tensors.insert( - all_tensors.end(), static_tensors.begin(), static_tensors.end()); - all_tensors.insert( - all_tensors.end(), dynamic_tensors.begin(), dynamic_tensors.end()); - - // M => M - // M, N => M, N/128, 128 - for (auto tensor : all_tensors) { - if (tensor->nDims() > 1) { - tensor->split(-1, TIDX); - } - } - - auto sx_sum_exp_rf = sx_sum_exp->rFactor({1}); - auto dx_sum_exp_rf = dx_sum_exp->rFactor({1}); - 
all_tensors.push_back(sx_sum_exp_rf); - all_tensors.push_back(dx_sum_exp_rf); - - // computeAt - sx->computeAt(sx_max_sub, 1); - dx->computeAt(dx_max_sub, 1); - - sx_exp->computeAt(sx_softmax, 1); - dx_exp->computeAt(dx_softmax, 1); - - sx_max_sub->computeAt(sx_exp, 2); - dx_max_sub->computeAt(dx_exp, 2); - - sx_softmax->axis(0)->parallelize(ParallelType::BIDx); - dx_softmax->axis(0)->parallelize(ParallelType::BIDx); - for (auto tensor : all_tensors) { - if (tensor->nDims() > 1) { - tensor->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - const size_t dimx = 1024; - const size_t dimy = 16384; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({dimx, dimy}, options); - at::Tensor aten_static_in = aten_input.narrow(1, 0, static_size); - at::Tensor aten_dynamic_in = - aten_input.narrow(1, static_size, dimy - static_size); - - at::Tensor out = at::zeros({dimx, dimy}, options); - at::Tensor cg_static_out = out.narrow(1, 0, static_size); - at::Tensor cg_dynamic_out = out.narrow(1, static_size, dimy - static_size); - - std::vector aten_outputs; - - auto aten_output = at::_softmax(aten_input.to(at::kDouble), -1, false); - at::Tensor aten_static_out = aten_output.narrow(1, 0, static_size); - at::Tensor aten_dynamic_out = - aten_output.narrow(1, static_size, dimy - static_size); - - torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion( - {aten_static_in, aten_dynamic_in}, {cg_static_out, cg_dynamic_out}); - - testValidate( - &fusion, - {cg_static_out, cg_dynamic_out}, - {aten_static_in, aten_dynamic_in}, - {cg_static_out, cg_dynamic_out}, - __LINE__, - __FILE__); -} - -TEST(NVFuserTest, FusionPersistentNormLocalShared_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int pixels_per_thread = 64; - const int TIDX = 128; - const int static_size = pixels_per_thread * TIDX; - - TensorView* sx = makeConcreteTensor({-1, static_size}); - TensorView* dx = makeSymbolicTensor(2); - fusion.addInput(sx); - fusion.addInput(dx); - - Double* gamma = new Double(); - Double* beta = new Double(); - Double* eps = new Double(); - Int* N = new Int(); - fusion.addInput(gamma); - fusion.addInput(beta); - fusion.addInput(eps); - fusion.addInput(N); - - // Reduction - auto sx_sum = sum(sx, {-1}); // (M, R) - auto dx_sum = sum(dx, {-1}); // (M, R) - // Reduction => merge local and shared memory TensorViews - auto x_sum = binaryOp(BinaryOpType::Add, sx_sum, dx_sum); - - // Broadcast - auto x_sum_bcast = broadcast(x_sum, {false, true}); // (M, B) - // Pwise - auto x_mean = div(x_sum_bcast, N); // (M, B) - - auto sx_mean_sub = sub(sx, x_mean); // (M, N) - auto dx_mean_sub = sub(dx, x_mean); // (M, N) - - auto sx_mean_sub_pow = mul(sx_mean_sub, sx_mean_sub); // (M, N) - auto dx_mean_sub_pow = mul(dx_mean_sub, dx_mean_sub); // (M, N) - - // Reduction - auto sx_var_sum = sum(sx_mean_sub_pow, {-1}); // (M, R) - auto dx_var_sum = sum(dx_mean_sub_pow, {-1}); // (M, R) - // Reduction => merge local and shared memory TensorViews - auto var_sum = binaryOp(BinaryOpType::Add, sx_var_sum, dx_var_sum); - - // Broadcast - auto var_sum_bcast = broadcast(var_sum, {false, true}); // (M, B) - // Pwise - auto var = div(var_sum_bcast, N); // (M, B) - auto var_eps = add(var, eps); // (M, B) - auto rvar = unaryOp(UnaryOpType::Rsqrt, var_eps); // (M, B) - - auto sx_norm = mul(sx_mean_sub, rvar); - auto dx_norm = mul(dx_mean_sub, rvar); - - auto sx_norm_gamma = mul(sx_norm, gamma); - auto dx_norm_gamma = mul(dx_norm, gamma); - - auto 
sx_norm_gamma_beta = add(sx_norm_gamma, beta); - auto dx_norm_gamma_beta = add(dx_norm_gamma, beta); - - fusion.addOutput(sx_norm_gamma_beta); - fusion.addOutput(dx_norm_gamma_beta); - - sx_norm_gamma_beta->setContiguity(false); - dx_norm_gamma_beta->setContiguity(false); - - // Read Input into Shared Memory - // Read Input minus Input_Mean into Shared Memory - auto sx_cache = sx->cache_after(); - auto dx_cache = dx->cache_after(); - dx_cache->setMemoryType(MemoryType::Shared); - dx_mean_sub->setMemoryType(MemoryType::Shared); - - std::vector common_tensors( - {x_sum, x_sum_bcast, x_mean, var_sum, var_sum_bcast, var, var_eps, rvar}); - - std::vector static_tensors( - {sx, - sx_cache, - sx_sum, - sx_mean_sub, - sx_mean_sub_pow, - sx_var_sum, - sx_norm, - sx_norm_gamma, - sx_norm_gamma_beta}); - - std::vector dynamic_tensors( - {dx, - dx_cache, - dx_sum, - dx_mean_sub, - dx_mean_sub_pow, - dx_var_sum, - dx_norm, - dx_norm_gamma, - dx_norm_gamma_beta}); - - std::vector all_tensors; - all_tensors.insert( - all_tensors.end(), common_tensors.begin(), common_tensors.end()); - all_tensors.insert( - all_tensors.end(), static_tensors.begin(), static_tensors.end()); - all_tensors.insert( - all_tensors.end(), dynamic_tensors.begin(), dynamic_tensors.end()); - - // M => M - // M, N => M, N/128, 128 - for (auto tensor : all_tensors) { - if (tensor->nDims() > 1) { - tensor->split(-1, TIDX); - } - } - - // Local Sum => Block Broadcast - TensorView* sx_sum_rf = sx_sum->rFactor({1}); - TensorView* sx_var_sum_rf = sx_var_sum->rFactor({1}); - TensorView* dx_sum_rf = dx_sum->rFactor({1}); - TensorView* dx_var_sum_rf = dx_var_sum->rFactor({1}); - all_tensors.push_back(sx_sum_rf); - all_tensors.push_back(sx_var_sum_rf); - all_tensors.push_back(dx_sum_rf); - all_tensors.push_back(dx_var_sum_rf); - - // ComputeAt - sx->computeAt(sx_mean_sub_pow, 1); - dx->computeAt(dx_mean_sub_pow, 1); - - var_sum->computeAt(rvar, 1); - - sx_mean_sub_pow->computeAt(sx_var_sum_rf, 2); - dx_mean_sub_pow->computeAt(dx_var_sum_rf, 2); - - sx_norm->computeAt(sx_norm_gamma_beta, 2); - dx_norm->computeAt(dx_norm_gamma_beta, 2); - - sx_norm_gamma_beta->axis(0)->parallelize(ParallelType::BIDx); - dx_norm_gamma_beta->axis(0)->parallelize(ParallelType::BIDx); - for (auto tensor : all_tensors) { - if (tensor->nDims() > 1) { - tensor->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - const int dimx = 1024; - const int dimy = 16384; - const float kGamma = 1.0f; - const float kBeta = 0.0f; - const float kEps = 1e-5; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn({dimx, dimy}, options); - at::Tensor aten_static_in = aten_input.narrow(1, 0, static_size); - at::Tensor aten_dynamic_in = - aten_input.narrow(1, static_size, dimy - static_size); - - at::Tensor out = at::zeros({dimx, dimy}, options); - at::Tensor cg_static_out = out.narrow(1, 0, static_size); - at::Tensor cg_dynamic_out = out.narrow(1, static_size, dimy - static_size); - - std::vector aten_inputs = { - aten_static_in, aten_dynamic_in, kGamma, kBeta, kEps, dimy}; - - torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion(aten_inputs, {cg_static_out, cg_dynamic_out}); - - auto at_mu = at::mean(aten_input.to(at::kDouble), -1).unsqueeze(1); - auto at_var = at::var(aten_input.to(at::kDouble), -1, false).unsqueeze(1); - auto at_rvar = at::rsqrt(at::add(at_var, kEps)); - auto at_norm = at::mul(at::sub(aten_input, at_mu), at_rvar); - auto aten_output = at::add(at::mul(at_norm, kGamma), 
kBeta); - at::Tensor aten_static_out = aten_output.narrow(1, 0, static_size); - at::Tensor aten_dynamic_out = - aten_output.narrow(1, static_size, dimy - static_size); - - testValidate( - &fusion, - {cg_static_out, cg_dynamic_out}, - aten_inputs, - {aten_static_out, aten_dynamic_out}, - __LINE__, - __FILE__); -} - -TEST(NVFuserTest, FusionSmemDynamicPersistentNorm_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - auto x = makeSymbolicTensor(2); - Double* gamma = new Double(); - Double* beta = new Double(); - Double* eps = new Double(); - Int* N = new Int(); - fusion.addInput(x); - fusion.addInput(gamma); - fusion.addInput(beta); - fusion.addInput(eps); - fusion.addInput(N); - - // Reduction - auto x_sum = sum(x, {-1}); // (M, R) - // Broadcast - auto x_sum_bcast = broadcast(x_sum, {false, true}); // (M, B) - // Pwise - auto x_mean = div(x_sum_bcast, N); // (M, B) - auto x_mean_sub = sub(x, x_mean); // (M, N) - auto x_mean_sub_pow = mul(x_mean_sub, x_mean_sub); // (M, N) - // Reduction - auto var_sum = sum(x_mean_sub_pow, {-1}); // (M, R) - // Broadcast - auto var_sum_bcast = broadcast(var_sum, {false, true}); // (M, B) - // Pwise - auto var = div(var_sum_bcast, N); // (M, B) - auto var_eps = add(var, eps); // (M, B) - auto rvar = unaryOp(UnaryOpType::Rsqrt, var_eps); // (M, B) - auto norm = mul(x_mean_sub, rvar); - auto norm_gamma = mul(norm, gamma); - auto norm_gamma_beta = add(norm_gamma, beta); - fusion.addOutput(norm_gamma_beta); - - // Read Input into Shared Memory - // Read Input minus Input_Mean into Shared Memory - auto cache_x = x->cache_after(); - cache_x->setMemoryType(MemoryType::Shared); - x_mean_sub->setMemoryType(MemoryType::Shared); - - std::vector all_tensors( - {x_sum, - x_mean, - cache_x, - x_sum_bcast, - x_mean_sub, - x_mean_sub_pow, - var_sum, - var_sum_bcast, - var, - var_eps, - rvar, - norm, - norm_gamma, - norm_gamma_beta}); - - auto tidx = new Int(); - fusion.addInput(tidx); - - for (auto tensor : all_tensors) { - tensor->split(-1, tidx); - } - - // Local Sum => Block Broadcast - TensorView* x_sum_rf = x_sum->rFactor({1}); - TensorView* var_sum_rf = var_sum->rFactor({1}); - all_tensors.push_back(x_sum_rf); - all_tensors.push_back(var_sum_rf); - - // ComputeAt - x->computeAt(x_mean_sub_pow, 1); - var_sum->computeAt(rvar, 1); - x_mean_sub_pow->computeAt(var_sum_rf, 2); - norm->computeAt(norm_gamma_beta, 2); - - for (auto tv : all_tensors) { - tv->axis(0)->parallelize(ParallelType::BIDx); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - - const int dimx = 128; - const int dimy = 2048; - const float kGamma = 1.0f; - const float kBeta = 0.0f; - const float kEps = 1e-5; - const int TIDX = 128; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({dimx, dimy}, options); - auto at_mu = at::mean(aten_input.to(at::kDouble), -1).unsqueeze(1); - auto at_var = at::var(aten_input.to(at::kDouble), -1).unsqueeze(1); - auto at_rvar = at::rsqrt(at::add(at_var, kEps)); - auto at_norm = at::mul(at::sub(aten_input, at_mu), at_rvar); - auto aten_output = at::add(at::mul(at_norm, kGamma), kBeta); - - std::vector aten_inputs = { - aten_input, kGamma, kBeta, kEps, dimy, TIDX}; - - torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSmemDynamicReductionSymbolic_CUDA) { - Fusion fusion; - 
FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); - fusion.addInput(tv0); - fusion.addOutput(tv1); - // tv1[I0, R1] = tv0[I0, I1] - - // Interface should just be a direct split with a Parallel type. We can - // include the parallelize call if we do this. - tv1->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); - // tv1[I0, R1o, R1i{BIDx}] = tv0[I0, I1] - - TensorView* tv2 = tv1->rFactor({2}); - tv2->setMemoryType(MemoryType::Shared); - // tv2[I0, R1oo, Ir1i{BIDx}] = tv0[I0, I1] - // tv1[I0, R1i{BIDx}] = tv2[I0, R1oo, Ir1i{BIDx}] - - tv0->computeAt(tv1, 1); - - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(0)->parallelize(ParallelType::BIDx); - - constexpr int numel_x = 65000, numel_y = 1024; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({numel_x, numel_y}, options); - auto aten_output = aten_input.to(at::kDouble).sum({1}); - - // How many threads to use for the block reduction - constexpr int runtime_threadIdx_dim = 128; - - LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}, lparams); - - testValidate( - &fusion, - cg_outputs, - {aten_input}, - {aten_output}, - __LINE__, - __FILE__, - "", - lparams); - TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); -} - -TEST(NVFuserTest, FusionSmemDynamicReductionSymbolicArg_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Algorithm - Int* sym_bsx = new Int(); - TensorView* tv0 = makeSymbolicTensor(3); // M, K, N - fusion.addInput(tv0); - fusion.addInput(sym_bsx); - - TensorView* tv1 = sum(tv0, {1}); // M, R, N - fusion.addOutput(tv1); - - TensorView* tv2 = tv0->cache_after(); - tv2->setMemoryType(MemoryType::Shared); - - // Schedule - constexpr int BSX = 32; - tv1->split(2, BSX); - tv1->split(1, sym_bsx); - tv1->split(0, BSX); - // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX - tv1->reorder({{0, 0}, {1, 2}, {2, 4}, {3, 5}, {4, 1}, {5, 3}}); - TensorView* tv3 = tv1->rFactor({-2}); - - tv0->computeAt(tv1, -2); - tv0->computeAt(tv3, -2); - - // Thread and Block binding - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(1)->parallelize(ParallelType::BIDy); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - // Manual Binding - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - constexpr int M = 154, K = 45, N = 1524; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({M, K, N}, options); - at::Tensor aten_output = aten_input.to(at::kDouble).sum({1}); - - // How many threads to use for the block reduction - constexpr int runtime_threadIdx_dim = 128; - - auto lparams = LaunchParams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input, runtime_threadIdx_dim}, lparams); - - testValidate( - &fusion, - cg_outputs, - {aten_input, runtime_threadIdx_dim}, - {aten_output}, - __LINE__, - __FILE__, - "", - lparams); - - TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 1); -} - -TEST(NVFuserTest, FusionSmemDynamicPwiseMulSymbolicArgWAR_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - Int* sym_bsx = new Int(); - TensorView* tv0 = makeSymbolicTensor(2); // (M, K) - TensorView* tv1 = 
makeSymbolicTensor(2); // (K, N) - TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B) - TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N) - TensorView* tv4 = mul(tv2, tv3); // M, K, N - fusion.addInput(tv0); - fusion.addInput(tv1); - fusion.addInput(sym_bsx); - fusion.addOutput(tv4); - // Algorithm - - tv2->setMemoryType(MemoryType::Shared); - tv3->setMemoryType(MemoryType::Shared); - - constexpr int BSX = 32; - tv4->split(2, BSX); - tv4->split(1, sym_bsx); - tv4->split(0, BSX); - // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX - tv4->reorder({{0, 0}, {1, 3}, {2, 1}, {3, 4}, {4, 2}, {5, 5}}); - // M/BSX, K/BSX, N/BSX, MSX, KSX, NSX - - tv0->computeAt(tv4, 3); - tv1->computeAt(tv4, 3); - // Schedule - - tv4->axis(0)->parallelize(ParallelType::BIDx); - tv4->axis(2)->parallelize(ParallelType::BIDy); - // Manual Binding - tv2->axis(-2)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - // Thread and Block binding - - constexpr int M = 128, K = 457, N = 1024; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({M, K}, options); - at::Tensor t1 = at::randn({K, N}, options); - at::Tensor aten_output = mul(t0.unsqueeze(2), t1.unsqueeze(0)); - std::vector aten_inputs = {t0, t1, BSX}; - - LaunchParams lparams(-1, -1, -1, BSX, -1, -1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs, lparams); - - testValidate( - &fusion, - cg_outputs, - aten_inputs, - {aten_output}, - __LINE__, - __FILE__, - "", - lparams); - - TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 1); -} - -TEST(NVFuserTest, FusionSmemDynamicTiledGemm_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Symbolic integers we will use for runtime tiling - Int* symbolic_m_tile_dim = new Int(); // bound to threadIdx.z - Int* symbolic_split_k_tile_dim = new Int(); // bound to blockIdx.x - Int* symbolic_block_k_tile_dim = new Int(); // bound to threadIdx.x - // Compile-time integer for tiling - int n_smem_tile = 8; // bound to threadIdx.y - - // Symbolic 2D tensors TV0[M, K], TV1[K, N] - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = makeSymbolicTensor(2); - - // Broadcast tv0 to [M, K, *] - TensorView* tv2 = broadcast(tv0, {false, false, true}); - // Broadcast tv1 to [*, K, N] - TensorView* tv3 = broadcast(tv1, {true, false, false}); - - // Pointwise multiplication resulting in tv3[M, K, N] - TensorView* tv4 = mul(tv2, tv3); - - // Turn the K-dimension of tv4 into a reduction dimension - TensorView* tv5 = sum(tv4, {1}); - - // Register inputs and outputs - fusion.addInput(tv0); - fusion.addInput(tv1); - fusion.addOutput(tv5); - - // Register runtime tile dims as inputs - fusion.addInput(symbolic_m_tile_dim); - fusion.addInput(symbolic_split_k_tile_dim); - fusion.addInput(symbolic_block_k_tile_dim); - - // Make a 3D tile, mix of symbolic and constant, do in reverse order because - // dims are inserted - // [M, K, N] - tv5->split(2, n_smem_tile); - tv5->split(1, symbolic_block_k_tile_dim); - tv5->split(1, symbolic_split_k_tile_dim); - tv5->split(0, symbolic_m_tile_dim); - // [Mo, Mi, Koo, Koi, Ki, No, Ni] - - // Reorder so all outer tiles are in the leftmost 3 positions - tv5->reorder({{1, 5}, {5, 1}}); - // [Mo, No, Koo, Koi, Ki, Mi, Ni] - - // Factor out the outer reduction IterDomain, then run the inter-cta - // reduction, and intra-cta reduction - auto tv6 = tv5->rFactor({2}); - // [Mo, No, rKoo, rKoi, rKi, Mi, Ni] - // [Mo, No, rKoi, rKi, Mi, 
Ni] - - // Scope computations - tv6->computeAt(tv5, 2); - // [Mo, No, rKoo, Koi, Ki, Mi, Ni] - // [Mo, No, rKoi, rKi, Mi, Ni] - - // Setup compute at schedule - tv0->computeAt(tv6, 3); - tv1->computeAt(tv6, 3); - tv4->computeAt(tv6, -1); - // - // T2[Mo, bNo, Koo, Koi, Kii, Mi, bNi] CA(4, 3) - // T3[bMo, No, Koo, Koi, Kii, bMi, Ni] CA(4, 3) - // T4[ Mo, No, Koo, Koi, Kii, Mi, Ni] - // T6[ Mo, No, rKoo, Koi, Kii, Mi, Ni] - // T5[ Mo, No, rKoi, rKii, Mi, Ni] - - // Cache smem tiles - tv2->setMemoryType(MemoryType::Shared); - tv3->setMemoryType(MemoryType::Shared); - tv4->setMemoryType(MemoryType::Local); - tv6->setMemoryType(MemoryType::Local); - - tv5->axis(0)->parallelize(ParallelType::BIDz); - tv5->axis(1)->parallelize(ParallelType::BIDy); - - std::vector tv_list = {tv2, tv3, tv4, tv5, tv6}; - for (auto tv : tv_list) { - tv->axis(-2)->parallelize(ParallelType::TIDz); - tv->axis(-1)->parallelize(ParallelType::TIDy); - } - tv2->axis(3)->parallelize(ParallelType::TIDx); - tv3->axis(3)->parallelize(ParallelType::TIDx); - tv4->axis(3)->parallelize(ParallelType::TIDx); - tv6->axis(3)->parallelize(ParallelType::TIDx); - tv5->axis(2)->parallelize(ParallelType::TIDx); - - tv2->axis(4)->parallelize(ParallelType::BIDx); - tv3->axis(4)->parallelize(ParallelType::BIDx); - tv4->axis(4)->parallelize(ParallelType::BIDx); - tv6->axis(4)->parallelize(ParallelType::BIDx); - tv5->axis(3)->parallelize(ParallelType::BIDx); - - constexpr int M = 31, K = 65, N = 33; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({M, K}, options); - at::Tensor t1 = at::randn({K, N}, options); - - FusionExecutor fe; - // Generate CUDA and compile with nvRTC - fe.compileFusion(&fusion); - - // Runtime tiling - int m_tile = 4; // bound to threadIdx.z - int split_k = 7; // bound to blockIdx.x - int intra_cta = 8; // bound to threadIdx.x - - std::vector aten_inputs = {t0, t1, m_tile, split_k, intra_cta}; - at::Tensor aten_output = - mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1); - - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); - - TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 1); -} - -TEST(NVFuserTest, FusionGlobalIntermediate_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); - fusion.addInput(tv0); - fusion.addOutput(tv1); - // tv1[I0, R1] = tv0[I0, I1] - - // Interface should just be a direct split with a Parallel type. We can - // include the parallelize call if we do this. 
- tv1->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); - // tv1[I0, R1o, R1i{BIDx}] = tv0[I0, I1] - - TensorView* tv2 = tv1->rFactor({2}); - tv2->setMemoryType(MemoryType::Global); - // tv2[I0, R1oo, Ir1i{BIDx}] = tv0[I0, I1] - // tv1[I0, R1i{BIDx}] = tv2[I0, R1oo, Ir1i{BIDx}] - - tv0->computeAt(tv1, 1); - - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(0)->parallelize(ParallelType::BIDx); - - constexpr int numel_x = 65000, numel_y = 1024; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x, numel_y}, options); - - // How many threads to use for the block reduction - constexpr int runtime_threadIdx_dim = 128; - - auto lparams = LaunchParams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({input}, lparams); - - auto aten_output = input.to(at::kDouble).sum({1}); - testValidate( - &fusion, - cg_outputs, - {input}, - {aten_output}, - __LINE__, - __FILE__, - "", - lparams); -} - -TEST(NVFuserTest, FusionGlobalIntermediateDefaultSchedule_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = makeSymbolicTensor(2); - TensorView* tv2 = makeSymbolicTensor(2); - TensorView* tv3 = makeSymbolicTensor(2); - TensorView* tv4 = sub(tv2, tv3); - TensorView* tv5 = add(tv1, tv4); - TensorView* tv6 = sub(tv5, tv0); - fusion.addInput(tv0); - fusion.addInput(tv1); - fusion.addInput(tv2); - fusion.addInput(tv3); - fusion.addOutput(tv6); - // t6 = ((t1 + (t2 - t3)) - t0) - - tv4->setMemoryType(MemoryType::Global); - tv5->setMemoryType(MemoryType::Global); - tv6->setMemoryType(MemoryType::Global); - - constexpr int M = 32, N = 810; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({M, N}, options); - at::Tensor t1 = at::randn({M, N}, options); - at::Tensor t2 = at::randn({M, N}, options); - at::Tensor t3 = at::randn({M, N}, options); - - at::Tensor aten_output = (t1 + (t2 - t3)) - t0; - - std::vector aten_inputs = {t0, t1, t2, t3}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({t0, t1, t2, t3}); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionConstCheck_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto one = new Int(1); - TORCH_CHECK(one->isConstScalar()); - - auto one_x2 = mul(one, one); - TORCH_CHECK(one_x2->isConstScalar()); - - auto one_x3 = mul(one_x2, one); - TORCH_CHECK(one_x3->isConstScalar()); - - auto one_x4 = mul(one_x3, one); - TORCH_CHECK(one_x4->isConstScalar()); -} - -TEST(NVFuserTest, FusionUnrollWithAlloc_CUDA) { - const std::vector tensor_dims_in = {128, 128}; - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(tensor_dims_in.size()); - fusion.addInput(tv0); - - TensorView* tv1 = add(tv0, new Double(0)); - TensorView* tv2 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv1); - fusion.addOutput(tv2); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn(tensor_dims_in, options); - at::Tensor cg_output = at::empty({tensor_dims_in[0]}, options); - - // Schedule - tv2->split(1, 32); - tv2->split(1, 4); // unroll - - auto tv2_rf = tv2->rFactor({-3, -2}); - - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - 
tv2_rf->axis(0)->parallelize(ParallelType::BIDx); - tv2_rf->axis(-1)->parallelize(ParallelType::TIDx); - tv2_rf->axis(-2)->parallelize(ParallelType::Unroll); - - tv1->computeAt(tv2_rf, -1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({input}); - - auto aten_output = (input + 0).to(at::kDouble).sum(1); - - testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); -} - -// Test isZeroInt -TEST(NVFuserTest, FusionIsZeroInt_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - Int* x = new Int(0); - Int* y = new Int(1); - Val* z = mul(x, y); - TORCH_CHECK(x->isZeroInt()); - TORCH_CHECK(!y->isZeroInt()); - TORCH_CHECK(!z->isZeroInt()); -} - -// Test isOneInt -TEST(NVFuserTest, FusionIsOneInt_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - Int* x = new Int(1); - Int* y = new Int(1); - Val* z = mul(x, y); - TORCH_CHECK(x->isOneInt()); - TORCH_CHECK(y->isOneInt()); - TORCH_CHECK(!z->isOneInt()); -} - -// This is to verify no cycle of computeAt is created. A more complex -// variation of this pattern appears in one of the Python tests -// (test_random_topo). -TEST(NVFuserTest, FusionComputeAtNonterminatingOutput_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - // Common intermediate tensor - auto tv1 = add(tv0, new Double(1)); - // tv1 -> tv2 - auto tv2 = add(tv1, new Double(2)); - // tv1 -> tv3 -> tv4 - auto tv3 = add(tv1, new Double(3)); - auto tv4 = add(tv3, new Double(4)); - - // NOTE: This should no longer occur as of PR #201. - // The order of adding outputs matters. If tv3 is added before tv4, - // it should be fine. However, if tv4 is added before tv3, there - // will be a cycle of tv3->tv4 and tv4->tv3. tv3->tv4 is created - // first, and then tv4->tv3 is created at the final phase of - // computeAt (ComputeAt::setupOutputs). 
- fusion.addOutput(tv2); - fusion.addOutput(tv4); - fusion.addOutput(tv3); - - tv0->computeAt(tv2, -1); - - TORCH_CHECK(tv3->hasComputeAt()); - TORCH_CHECK(!tv4->hasComputeAt()); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn(100, options); - - auto t1 = aten_input + 1; - auto t2 = t1 + 2; - auto t3 = t1 + 3; - auto t4 = t3 + 4; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}); - - std::vector aten_outputs = {t2, t4, t3}; - testValidate( - &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionTraversalOrder1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv0, new Double(2)); - TensorView* tv3 = add(tv1, new Double(3)); - TensorView* tv4 = add(tv1, new Double(4)); - - fusion.addOutput(tv2); - fusion.addOutput(tv3); - fusion.addOutput(tv4); - - tv1->computeAt(tv3, -1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({10, 10}, options); - - auto t1 = aten_input + 1; - auto t2 = aten_input + 2; - auto t3 = t1 + 3; - auto t4 = t1 + 4; - - std::vector aten_outputs = {t2, t3, t4}; - - std::vector cg_outputs = { - at::empty_like(aten_input, options), - at::empty_like(aten_input, options), - at::empty_like(aten_input, options)}; - - fe.runFusion({aten_input}, cg_outputs); - testValidate( - &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionTraversalOrder2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv1, new Double(2)); - - TensorView* tv3 = add(tv0, new Double(3)); - TensorView* tv4 = add(tv3, new Double(4)); - - TensorView* tv5 = add(tv1, tv3); - - fusion.addOutput(tv2); - fusion.addOutput(tv4); - fusion.addOutput(tv5); - - tv1->computeAt(tv5, -1); - tv3->computeAt(tv5, -1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({10, 10}, options); - - auto t1 = aten_input + 1; - auto t2 = t1 + 2; - auto t3 = aten_input + 3; - auto t4 = t3 + 4; - auto t5 = t1 + t3; - - std::vector aten_outputs = {t2, t4, t5}; - - std::vector cg_outputs = { - at::empty_like(aten_input, options), - at::empty_like(aten_input, options), - at::empty_like(aten_input, options)}; - - fe.runFusion({aten_input}, cg_outputs); - - testValidate( - &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionTraversalOrder3_CUDA) { - for (const auto i : c10::irange(2)) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv1, new Double(2)); - - TensorView* tv3 = add(tv0, new Double(3)); - TensorView* tv4 = add(tv3, new Double(4)); - - TensorView* tv5 = add(tv1, tv3); - - fusion.addOutput(tv2); - fusion.addOutput(tv4); - fusion.addOutput(tv5); - - const int tile = 32; - - tv1->split(-1, tile); - tv2->split(-1, tile); - tv3->split(-1, tile); - tv4->split(-1, 
tile); - tv5->split(-1, tile); - - auto compute_at_outer = tv1; - auto compute_at_inner = tv3; - if (i == 1) { - std::swap(compute_at_inner, compute_at_outer); - } - - compute_at_outer->computeAt(tv5, -2); - compute_at_inner->computeAt(tv5, -1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({100}, options); - auto t1 = aten_input + 1; - auto t2 = t1 + 2; - auto t3 = aten_input + 3; - auto t4 = t3 + 4; - auto t5 = t1 + t3; - - std::vector aten_outputs = {t2, t4, t5}; - - std::vector cg_outputs = { - at::empty_like(aten_input, options), - at::empty_like(aten_input, options), - at::empty_like(aten_input, options)}; - - fe.runFusion({aten_input}, cg_outputs); - - testValidate( - &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); - } -} - -TEST(NVFuserTest, FusionTraversalOrder4_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // First tree - TensorView* tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv1, new Double(2)); - TensorView* tv3 = add(tv1, new Double(3)); - fusion.addOutput(tv2); - fusion.addOutput(tv3); - - // Second tree - TensorView* tv4 = makeSymbolicTensor(1); - fusion.addInput(tv4); - TensorView* tv5 = add(tv4, new Double(5)); - TensorView* tv6 = add(tv5, new Double(6)); - TensorView* tv7 = add(tv5, new Double(7)); - fusion.addOutput(tv6); - fusion.addOutput(tv7); - - tv1->computeAt(tv2, -1); - tv5->computeAt(tv6, -1); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({100}, options); - at::Tensor t4 = at::rand_like(t0, options); - - auto t1 = t0 + 1; - auto t2 = t1 + 2; - auto t3 = t1 + 3; - auto t5 = t4 + 5; - auto t6 = t5 + 6; - auto t7 = t5 + 7; - - std::vector aten_outputs = {t2, t3, t6, t7}; - std::vector aten_inputs = {t0, t4}; - std::vector cg_outputs = { - at::empty_like(t0, options), - at::empty_like(t0, options), - at::empty_like(t0, options), - at::empty_like(t0, options)}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion(aten_inputs, cg_outputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionTraversalOrder5_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv1, new Double(2)); - TensorView* tv3 = add(tv0, new Double(3)); - TensorView* tv4 = add(tv3, new Double(4)); - TensorView* tv5 = add(tv2, tv4); - - fusion.addOutput(tv1); - fusion.addOutput(tv3); - fusion.addOutput(tv5); - - tv2->computeAt(tv5, -1); - tv4->computeAt(tv5, -1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({100}, options); - std::vector cg_outputs = { - at::empty_like(aten_input, options), - at::empty_like(aten_input, options), - at::empty_like(aten_input, options)}; - - fe.runFusion({aten_input}, cg_outputs); - - auto t1 = aten_input + 1; - auto t2 = t1 + 2; - auto t3 = aten_input + 3; - auto t4 = t3 + 4; - auto t5 = t2 + t4; - - std::vector aten_outputs = {t1, t3, t5}; - - testValidate( - &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionTraversalOrder6_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = 
makeSymbolicTensor(1); - fusion.addInput(tv0); - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv0, new Double(2)); - TensorView* tv3 = add(tv1, tv2); - TensorView* tv4 = add(tv3, new Double(4)); - - fusion.addOutput(tv4); - - tv1->split(0, 32); - tv2->split(0, 32); - tv3->split(0, 32); - tv4->split(0, 32); - - tv3->computeAt(tv4, -2); - tv1->computeAt(tv3, -1); - tv2->computeAt(tv3, -2); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({100}, options); - - auto t1 = aten_input + 1; - auto t2 = aten_input + 2; - auto t3 = t1 + t2; - auto aten_output = t3 + 4; - - at::Tensor cg_output = at::empty_like(aten_input, options); - - fe.runFusion({aten_input}, {cg_output}); - - testValidate( - &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionTraversalOrder7_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv1, new Double(2)); - TensorView* tv3 = add(tv0, new Double(3)); - TensorView* tv4 = add(tv3, new Double(4)); - TensorView* tv5 = add(tv2, tv4); - - fusion.addOutput(tv5); - - TensorView* tvs[] = {tv1, tv2, tv3, tv4, tv5}; - for (auto tv : tvs) { - tv->split(0, 2); - tv->split(0, 4); - tv->split(0, 8); - } - - // computeAt into inner loop nests - tv1->computeAt(tv2, -1); - tv3->computeAt(tv4, -2); - - tv2->computeAt(tv5, -4); - tv4->computeAt(tv5, -3); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({100}, options); - - auto t1 = aten_input + 1; - auto t2 = t1 + 2; - auto t3 = aten_input + 3; - auto t4 = t3 + 4; - auto aten_output = t2 + t4; - - at::Tensor cg_output = at::empty_like(aten_input, options); - fe.runFusion({aten_input}, {cg_output}); - - testValidate( - &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -// Test predication of grid reduction -TEST(NVFuserTest, FusionThreadPredicate_CUDA) { - const int gdimx = 4; - const int bdimx = 128; - - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); - TensorView* tv2 = unaryOp(UnaryOpType::Neg, tv1); - TensorView* tv3 = add(tv0, new Double(2)); - - fusion.addOutput(tv3); - fusion.addOutput(tv2); - - tv1->split(1, bdimx); - tv1->split(1, gdimx); - tv3->split(1, bdimx); - tv3->split(1, gdimx); - - TensorView* tv1_rf = tv1->rFactor({1}); - - tv1->computeAt(tv2, -1); - - tv1->axis(0)->parallelize(ParallelType::BIDy); - tv1_rf->axis(0)->parallelize(ParallelType::BIDy); - tv2->axis(0)->parallelize(ParallelType::BIDy); - tv1->axis(-2)->parallelize(ParallelType::BIDx); - tv1_rf->axis(-2)->parallelize(ParallelType::BIDx); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv1_rf->axis(-1)->parallelize(ParallelType::TIDx); - - tv3->axis(3)->parallelize(ParallelType::TIDx); - tv3->axis(2)->parallelize(ParallelType::BIDx); - tv3->axis(0)->parallelize(ParallelType::BIDy); - - int numel_x = 100; - int numel_y = 1000; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({numel_x, numel_y}, options); - - auto t2 = -aten_input.to(at::kDouble).sum({1}); - auto t3 = aten_input + 2.0; - - 
std::vector aten_outputs = {t3, t2}; - - std::vector cg_outputs = { - at::empty_like(aten_input, options), at::empty({numel_x}, options)}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({aten_input}, cg_outputs); - - testValidate( - &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionLSTMCell_CUDA) { - const int hidden_features = 512; - const int batch_size = 64; - - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tvs[16]; - for (const auto i : c10::irange(16)) { - tvs[i] = makeSymbolicTensor(2); - fusion.addInput(tvs[i]); - } - - auto ingate = unaryOp( - UnaryOpType::Sigmoid, add(add(add(tvs[0], tvs[1]), tvs[2]), tvs[3])); - - auto forgetgate = unaryOp( - UnaryOpType::Sigmoid, add(add(add(tvs[4], tvs[5]), tvs[6]), tvs[7])); - - auto cellgate = unaryOp( - UnaryOpType::Tanh, add(add(add(tvs[8], tvs[9]), tvs[10]), tvs[11])); - - auto outgate = unaryOp( - UnaryOpType::Sigmoid, add(add(add(tvs[12], tvs[13]), tvs[14]), tvs[15])); - - auto cx = makeContigTensor(2); - fusion.addInput(cx); - - auto cy = add(mul(forgetgate, cx), mul(ingate, cellgate)); - - auto hy = mul(outgate, unaryOp(UnaryOpType::Tanh, cy)); - - fusion.addOutput(cy); - fusion.addOutput(hy); - - std::vector aten_inputs; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor large_tensor0 = - at::randn({batch_size, hidden_features * 4}, options); - at::Tensor large_tensor1 = - at::randn({batch_size, hidden_features * 4}, options); - at::Tensor large_tensor2 = - at::randn({batch_size, hidden_features * 4}, options); - at::Tensor large_tensor3 = - at::randn({batch_size, hidden_features * 4}, options); - - auto chunked0 = large_tensor0.chunk(4, 1); - auto chunked1 = large_tensor1.chunk(4, 1); - auto chunked2 = large_tensor2.chunk(4, 1); - auto chunked3 = large_tensor3.chunk(4, 1); - - aten_inputs.insert(aten_inputs.end(), chunked0.begin(), chunked0.end()); - aten_inputs.insert(aten_inputs.end(), chunked1.begin(), chunked1.end()); - aten_inputs.insert(aten_inputs.end(), chunked2.begin(), chunked2.end()); - aten_inputs.insert(aten_inputs.end(), chunked3.begin(), chunked3.end()); - - auto at_ingate = - chunked0[0].add(chunked0[1]).add(chunked0[2]).add(chunked0[3]).sigmoid(); - auto at_forgetgate = - chunked1[0].add(chunked1[1]).add(chunked1[2]).add(chunked1[3]).sigmoid(); - auto at_cellgate = - chunked2[0].add(chunked2[1]).add(chunked2[2]).add(chunked2[3]).tanh(); - auto at_outgate = - chunked3[0].add(chunked3[1]).add(chunked3[2]).add(chunked3[3]).sigmoid(); - - auto at_cx = at::randn({batch_size, hidden_features}, options); - aten_inputs.push_back(at_cx); - auto at_cy = at_forgetgate.mul(at_cx).add(at_ingate.mul(at_cellgate)); - auto at_hy = at_outgate.mul(at_cy.tanh()); - - auto lparams = schedulePointwise(&fusion, aten_inputs); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs, lparams); - - testValidate( - &fusion, cg_outputs, aten_inputs, {at_cy, at_hy}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionComputeAtMultiBCast_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - TensorView* tv1 = mul(tv0, new Double(0.5)); - TensorView* tv2 = broadcast(tv1, {true, false}); - TensorView* tv3 = broadcast(tv1, {false, true}); - TensorView* tv4 = add(tv2, tv3); - fusion.addOutput(tv4); - - // Not possible to do computeAt at position -1 as recomputation - // would be required. 
An exception should be thrown. - ASSERT_ANY_THROW(tv1->computeAt(tv3, -1)); -} - -TEST(NVFuserTest, FusionReductionHalf_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(3, DataType::Half); - fusion.addInput(tv0); - - auto tv1 = castOp(DataType::Float, tv0); - auto tv2 = add(tv1, new Double(1.0)); - auto tv3 = sum(tv2, {2}); - auto tv4 = castOp(DataType::Half, tv3); - - fusion.addOutput(tv4); - - const auto options = - at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({8, 8, 16}, options); - - auto reduction_tv = tv3; - - auto reduction_params = getReductionHeuristics(&fusion, {aten_input}); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - scheduleReduction(&fusion, reduction_params.value()); - - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - - auto lparams = reduction_params.value().lparams; - - FusionExecutor fe; - fe.compileFusion(&fusion); - // no broadcasting needed, omitting the last optional argument; - auto cg_outputs = fe.runFusion({aten_input}, lparams); - - auto aten_output = aten_input.add(1.0).to(at::kDouble).sum({2}); - - testValidate( - &fusion, - cg_outputs, - {aten_input}, - {aten_output}, - __LINE__, - __FILE__, - "", - lparams); -} - -TEST(NVFuserTest, FusionReduceSingle_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeConcreteTensor({100, 1}); - fusion.addInput(tv0); - auto tv1 = sum(tv0, {1}); - fusion.addOutput(tv1); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({100, 1}, options); - - // Grab only tensor views, though there shouldn't be any other type - FusionExecutor fe; - fe.compileFusion(&fusion); - // no broadcasting needed, omitting the last optional argument; - auto cg_outputs = fe.runFusion({aten_input}); - - auto aten_output = aten_input.to(at::kDouble).sum({1}); - testValidate( - &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionReduceImplicitBroadcast_CUDA) { - constexpr int bid_x = 80; - constexpr int tid_x = 4096; - constexpr int red_dim = 1; - - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeConcreteTensor({bid_x, tid_x, 1}); - fusion.addInput(tv0); - - TensorView* tv1 = - reductionOp(BinaryOpType::Add, {red_dim, 2}, new Double(0), tv0); - fusion.addOutput(tv1); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({bid_x, tid_x, 1}, options); - - // Apply reduction heuristic - auto reduction_params = getReductionHeuristics(&fusion, {aten_input}); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - scheduleReduction(&fusion, reduction_params.value()); - auto lparams = reduction_params.value().lparams; - - FusionExecutor fe; - fe.compileFusion(&fusion); - // no broadcasting needed, omitting the last optional argument; - auto cg_outputs = fe.runFusion({aten_input}, lparams); - auto aten_output = aten_input.to(at::kDouble).sum({red_dim, 2}); - - testValidate( - &fusion, - cg_outputs, - {aten_input}, - {aten_output}, - __LINE__, - __FILE__, - "", - lparams); -} - -TEST(NVFuserTest, FusionReduceImplicitBroadcast2_CUDA) { - constexpr int bid_x = 80; - constexpr int tid_x = 4096; - constexpr int red_dim = 1; - - Fusion fusion; - FusionGuard 
fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeConcreteTensor({bid_x, tid_x, 1}); - fusion.addInput(tv0); - - TensorView* tv1 = reductionOp(BinaryOpType::Add, {2}, new Double(0), tv0); - - TensorView* tv2 = - reductionOp(BinaryOpType::Add, {red_dim}, new Double(0), tv1); - fusion.addOutput(tv2); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({bid_x, tid_x, 1}, options); - - // Apply reduction heuristic - auto reduction_params = getReductionHeuristics(&fusion, {aten_input}); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - - scheduleReduction(&fusion, reduction_params.value()); - auto lparams = reduction_params.value().lparams; - - FusionExecutor fe; - fe.compileFusion(&fusion); - // no broadcasting needed, omitting the last optional argument; - auto cg_outputs = fe.runFusion({aten_input}, lparams); - auto aten_output = aten_input.to(at::kDouble).sum({1, 2}); - - testValidate( - &fusion, - cg_outputs, - {aten_input}, - {aten_output}, - __LINE__, - __FILE__, - "", - lparams); -} - -TEST(NVFuserTest, FusionReduceImplicitBroadcast3_CUDA) { - constexpr int bid_x = 80; - constexpr int tid_x = 4096; - constexpr int red_dim = 1; - - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeConcreteTensor({bid_x, tid_x, 1}); - fusion.addInput(tv0); - - TensorView* tv1 = - reductionOp(BinaryOpType::Add, {red_dim}, new Double(0), tv0); - - TensorView* tv2 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv1); - fusion.addOutput(tv2); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({bid_x, tid_x, 1}, options); - - // Apply reduction heuristic - auto reduction_params = getReductionHeuristics(&fusion, {aten_input}); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - scheduleReduction(&fusion, reduction_params.value()); - auto lparams = reduction_params.value().lparams; - - FusionExecutor fe; - fe.compileFusion(&fusion); - // no broadcasting needed, omitting the last optional argument; - auto cg_outputs = fe.runFusion({aten_input}, lparams); - auto aten_output = aten_input.to(at::kDouble).sum({2, 1}); - - testValidate( - &fusion, - cg_outputs, - {aten_input}, - {aten_output}, - __LINE__, - __FILE__, - "", - lparams); -} - -TEST(NVFuserTest, FusionTrivialReduction_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeConcreteTensor({10, 20, 1}); - fusion.addInput(tv0); - TensorView* tv1 = reductionOp(BinaryOpType::Add, {2}, new Double(0), tv0); - fusion.addOutput(tv1); - - TORCH_CHECK(!fusion.hasReduction(), "Trivial reduction picked up by fusion"); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({10, 20, 1}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}); - auto aten_output = aten_input.to(at::kDouble).sum({2}); - - testValidate( - &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionTrivialReduction2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - int w = 1, x = 1, y = 7, z = 8; - - auto tv0 = makeSymbolicTensor(2); - auto tv1 = makeConcreteTensor({w, x, y, z}); - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = sum(tv1, {0}); - auto tv3 = sum(tv2, {0}); - auto tv4 = 
add(tv3, tv0); - - fusion.addOutput(tv4); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({y, z}, options); - at::Tensor t1 = at::randn({w, x, y, z}, options); - auto aten_output = t1.to(at::kDouble).sum({0}).sum({0}).add(t0); - - std::vector aten_inputs = {t0, t1}; - - auto lparams = schedulePointwise(&fusion, aten_inputs); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs, lparams); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionTrivialReduction3_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - int v = 1, w = 1, x = 1, y = 7, z = 8; - - auto tv0 = makeSymbolicTensor(2); - auto tv1 = makeConcreteTensor({v, w, x, y, z}); - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = sum(tv1, {0, 1, 2}); - auto tv3 = add(tv2, tv0); - - fusion.addOutput(tv3); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({y, z}, options); - at::Tensor t1 = at::randn({v, w, x, y, z}, options); - auto aten_output = t1.sum({0, 1, 2}).add(t0); - - std::vector aten_inputs = {t0, t1}; - - auto lparams = schedulePointwise(&fusion, aten_inputs); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs, lparams); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -// Make sure trivial reductions are correctly detected even with -// scheduling applied. -TEST(NVFuserTest, FusionDetectTrivialReduction1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = broadcast(tv0, {false, true}); - auto tv2 = sum(tv1, {1}); - fusion.addOutput(tv2); - - tv2->split(1, 4); - tv2->split(1, 8); - auto tv3 = tv2->rFactor({-1}); - auto tv4 = tv2->rFactor({-1}); - - auto tv5 = broadcast(tv0, {true, false}); - auto tv6 = add(tv5, new Double(1)); - auto tv7 = sub(tv6, new Double(1)); - auto tv8 = sum(tv7, {0}); - fusion.addOutput(tv8); - - auto tv9 = broadcast(tv0, {false, true, true}); - auto tv10 = sum(tv9, {1}); - auto tv11 = sum(tv10, {1}); - fusion.addOutput(tv11); - - tv8->split(0, 3); - tv10->split(1, 4); - tv11->split(1, 5); - - tv0->computeAt(tv2, -1); - tv0->computeAt(tv8, -1); - tv0->computeAt(tv11, 1); - - // Test indexing to gmem-backed tensors - tv3->setMemoryType(MemoryType::Global); - tv8->setMemoryType(MemoryType::Global); - - GpuLower gpulw(&fusion); - - // No kir::ReductionOp should be generated as all the reduction - // exprs should be replaced with a unary set op. 
-  for (const auto& kir_node : gpulw.kernel()->irNodes()) {
-    TORCH_CHECK(!kir_node->isA<kir::ReductionOp>());
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({100}, options);
-  std::vector<c10::IValue> aten_inputs = {t0};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {t0, t0, t0}, __LINE__, __FILE__);
-}
-
-// Test detection of partially trivial reduction
-TEST(NVFuserTest, FusionDetectTrivialReduction2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  auto tv1 = sum(tv0, {1});
-  auto tv2 = add(tv1, new Double(1));
-  fusion.addOutput(tv2);
-
-  tv1->split(1, 1);
-  // tv1->axis(1): non-trivial
-  // tv1->axis(2): trivial
-
-  auto tv3 = tv1->rFactor({-1});
-
-  // Just to suppress register-allocation warning
-  tv0->computeAt(tv2, 1);
-  tv3->computeAt(tv1, -1);
-
-  GpuLower gpulw(&fusion);
-
-  // tv3's reduction axis is a trivial reduction. The only
-  // kir::ReductionOp should be for tv1.
-  for (const auto& kir_node : gpulw.kernel()->irNodes()) {
-    if (kir_node->isA<kir::ReductionOp>()) {
-      auto reduction_out =
-          kir_node->as<kir::ReductionOp>()->outputs()[0]->as<kir::TensorView>();
-      TORCH_CHECK(reduction_out->fuserTv() == tv1);
-    }
-  }
-}
-
-TEST(NVFuserTest, FusionInputsIdLookup_CUDA) {
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({16, 8, 8}, options);
-  at::Tensor t1 = at::randn({8, 8}, options);
-  at::Tensor t2 = at::randn({6, 4}, options);
-
-  // create a cache with max size 2
-  torch::jit::fuser::cuda::InputsIdLookup inputs_id_lookup(2);
-
-  // testing basic function, same encoding for identical inputs
-  auto id_0 = inputs_id_lookup.lookupId({t0, t1, 5.0});
-  auto id_0_lookup = inputs_id_lookup.lookupId({t0, t1, 2.5});
-  TORCH_CHECK(id_0.id == id_0_lookup.id);
-  TORCH_CHECK(inputs_id_lookup.size() == 1);
-  TORCH_CHECK(id_0.eviction == false);
-
-  // new input (even though the shapes are identical, the signature differs
-  // because of the missing scalar input)
-  auto id_1 = inputs_id_lookup.lookupId({t0, t1});
-  auto id_1_lookup = inputs_id_lookup.lookupId({t0, t1});
-  TORCH_CHECK(id_1.id == id_1_lookup.id);
-  TORCH_CHECK(inputs_id_lookup.size() == 2);
-  TORCH_CHECK(id_1.eviction == false);
-
-  // eviction should happen at this point
-  auto id_2 = inputs_id_lookup.lookupId({t2, t1});
-  TORCH_CHECK(id_2.id != id_0.id);
-  TORCH_CHECK(id_2.id != id_1.id);
-  TORCH_CHECK(inputs_id_lookup.size() == 2);
-  TORCH_CHECK(id_2.eviction == true);
-  TORCH_CHECK(id_2.evict_id == id_0.id);
-
-  // look at input 1 again
-  auto id_1_relook = inputs_id_lookup.lookupId({t0, t1});
-  TORCH_CHECK(id_1_relook.id == id_1.id);
-  TORCH_CHECK(id_1_relook.eviction == false);
-}
-
-TEST(NVFuserTest, FusionGroupGuardSimpleTensor_CUDA) {
-  std::vector<int64_t> sizes_vec({16, 8, 8});
-  std::vector<int64_t> strides_vec({64, 8, 1});
-  auto tensor_type = TensorType::create(
-      at::kFloat, c10::nullopt, sizes_vec, strides_vec, c10::nullopt);
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  // pass with identical shape
-  auto t0 = at::randn({16, 8, 8}, options);
-  TORCH_CHECK(complyWith(t0, tensor_type));
-
-  // pass with dynamic shape
-  auto t1 = at::randn({16, 16, 8}, options);
-  TORCH_CHECK(complyWith(t1, tensor_type));
-
-  // broadcasting semantic change failure
-  auto t2 = at::randn({16, 1, 8}, options);
-  TORCH_CHECK(!complyWith(t2, tensor_type));
-
-  // contiguity failure via slicing
-  auto t3 = t0.slice(1, 0, 8, 2);
-  TORCH_CHECK(!complyWith(t3, tensor_type));
-
-  // contiguity failure via slicing
-  auto t4 = t0.slice(2, 0, 8, 2);
-  TORCH_CHECK(!complyWith(t4, tensor_type));
-
-  // rank failure
-  auto t5 = at::randn({16, 8, 8, 8}, options);
-  TORCH_CHECK(!complyWith(t5, tensor_type));
-
-  // contiguity on stride 1 dimension with implicit broadcasting
-  auto t = at::randn({4}, options);
-  auto t6 = t.unsqueeze(1).expand({4, 8});
-  TORCH_CHECK(complyWith(t6, TensorType::create(t6)));
-}
-
-TEST(NVFuserTest, FusionGroupGuardBroadcastTensor_CUDA) {
-  std::vector<int64_t> sizes_vec({16, 1, 8});
-  std::vector<int64_t> strides_vec({8, 8, 1});
-  auto tensor_type = TensorType::create(
-      at::kFloat, c10::nullopt, sizes_vec, strides_vec, c10::nullopt);
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  // broadcasting semantic change
-  auto t0 = at::randn({16, 8, 8}, options);
-  TORCH_CHECK(!complyWith(t0, tensor_type));
-
-  // dtype failure
-  auto t1 = at::randn({16, 1, 8}, options.dtype(at::kHalf));
-  TORCH_CHECK(!complyWith(t1, tensor_type));
-
-  // matching dtype and shape should pass
-  auto t2 = at::randn({16, 1, 8}, options);
-  TORCH_CHECK(complyWith(t2, tensor_type));
-
-  // device inconsistency shouldn't fail
-  auto t3 = at::randn({16, 1, 8}, options.device(at::kCPU, 0));
-  TORCH_CHECK(complyWith(t3, tensor_type));
-}
-
-TEST(NVFuserTest, FusionGroupGuardPermutedTensor_CUDA) {
-  std::vector<int64_t> sizes_vec({16, 8, 8});
-  std::vector<int64_t> strides_vec({64, 1, 8});
-  auto tensor_type = TensorType::create(
-      at::kFloat, c10::nullopt, sizes_vec, strides_vec, c10::nullopt);
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  // failing permutation
-  auto t0 = at::randn({16, 8, 8}, options);
-  TORCH_CHECK(!complyWith(t0, tensor_type));
-
-  // passing with dynamic shape
-  auto t1 = t0.permute({0, 2, 1});
-  TORCH_CHECK(complyWith(t1, tensor_type));
-}
-
-TEST(NVFuserTest, FusionGroupGuardRelaxedCheck_CUDA) {
-  std::vector<int64_t> sizes_vec({16, 8, 8});
-  std::vector<int64_t> strides_vec({128, 16, 1});
-  auto tensor_type = TensorType::create(
-      at::kFloat, c10::nullopt, sizes_vec, strides_vec, c10::nullopt);
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  // contiguity check passes although it differs
-  auto t0 = at::randn({16, 16, 8}, options);
-  TORCH_CHECK(complyWith(t0, tensor_type));
-
-  // passing with dynamic shape
-  auto t1 = t0.slice(1, 0, 16, 2);
-  TORCH_CHECK(complyWith(t1, tensor_type));
-}
-
-TEST(NVFuserTest, FusionDisjointSet_CUDA) {
-  DisjointSet<int> set;
-
-  const std::set<int> group_x({0, 1, 2});
-  const std::set<int> group_y({3, 4, 5});
-  const std::set<int> group_z({6, 7, 8});
-  const std::vector<std::set<int>> groups({group_x, group_y, group_z});
-  std::set<int> group_all;
-  std::for_each(groups.begin(), groups.end(), [&](const auto& g) {
-    group_all.insert(g.begin(), g.end());
-  });
-
-  // Initially, nothing should be considered equivalent
-  for (auto i : group_all) {
-    for (auto j : group_all) {
-      TORCH_CHECK(!set.areEquivalent(i, j));
-    }
-  }
-
-  // Sets values in group_x are equivalent
-  for (auto i : group_x) {
-    for (auto j : group_x) {
-      set.join(i, j);
-      TORCH_CHECK(set.contains(i));
-      TORCH_CHECK(set.contains(j));
-    }
-  }
-
-  // All values in group_x should be equivalent with each other
-  for (auto i : group_x) {
-    for (auto j : group_x) {
-      TORCH_CHECK(set.areEquivalent(i, j));
-    }
-  }
-  // But nothing else should be equivalent
-  for (auto i : group_all) {
-    for (auto j : group_y) {
-      TORCH_CHECK(!set.areEquivalent(i, j));
-    }
-    for (auto j : group_z) {
-      TORCH_CHECK(!set.areEquivalent(i,
j)); - } - } - - // Sets values in group_y are equivalent - for (auto i : group_y) { - for (auto j : group_y) { - set.join(i, j); - TORCH_CHECK(set.contains(i)); - TORCH_CHECK(set.contains(j)); - } - } - - // group_x should be still equivalent - for (auto i : group_x) { - for (auto j : group_x) { - TORCH_CHECK(set.areEquivalent(i, j)); - } - } - // group_y should be now equivalent - for (auto i : group_y) { - for (auto j : group_y) { - TORCH_CHECK(set.areEquivalent(i, j)); - } - } - // But group_z should not be equivalent with anything yet - for (auto i : group_all) { - for (auto j : group_z) { - TORCH_CHECK(!set.areEquivalent(i, j)); - } - } - - // Sets values in group_z are equivalent - for (auto i : group_z) { - for (auto j : group_z) { - set.join(i, j); - TORCH_CHECK(set.contains(i)); - TORCH_CHECK(set.contains(j)); - } - } - - // Now each of the three groups should be equivalent within each - // group - for (const auto gi : c10::irange(groups.size())) { - for (const auto gj : c10::irange(groups.size())) { - for (auto i : groups[gi]) { - for (auto j : groups[gj]) { - TORCH_CHECK( - (gi == gj && set.areEquivalent(i, j)) || - (gi != gj && !set.areEquivalent(i, j))); - } - } - } - } - - auto all_elements = set.getAllElements(); - std::sort(all_elements.begin(), all_elements.end()); - std::vector group_all_vec(group_all.begin(), group_all.end()); - std::sort(group_all_vec.begin(), group_all_vec.end()); - TORCH_CHECK(all_elements == group_all_vec); - - set.clear(); - all_elements = set.getAllElements(); - TORCH_CHECK(all_elements.size() == 0); - - // All cleared. Nothing should be considered equivalent. - for (auto i : group_all) { - for (auto j : group_all) { - TORCH_CHECK(!set.areEquivalent(i, j)); - } - } -} - -TEST(NVFuserTest, FusionNonUniqueBroadcastSize_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - auto tv1 = makeSymbolicTensor(2); - auto tv2 = makeSymbolicTensor(2); - fusion.addInput(tv0); - fusion.addInput(tv1); - fusion.addInput(tv2); - - auto tv3 = broadcast(tv0, {false, true}); - auto tv4 = add(tv3, tv1); - auto tv5 = add(tv3, tv2); - - fusion.addOutput(tv4); - fusion.addOutput(tv5); - - // In order to do this, tv1->axis(1) and tv2->axis(1) must have the - // same size, but we can't prove it, so this should throw an error. 
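  // (To elaborate on the note above: the broadcast axis of tv3 is consumed by
  // both tv4 = tv3 + tv1 and tv5 = tv3 + tv2, so fully inlining tv3 would
  // require tv1->axis(1) and tv2->axis(1) to have a provably identical extent,
  // which these symbolic inputs cannot guarantee.)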
- ASSERT_ANY_THROW(tv3->computeAt(tv4, -1)); -} - -TEST(NVFuserTest, FusionBiasGeluFwd_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const float k_079 = 0.79788456; - const float k_004 = 0.044715; - - // bias vector - auto t0 = makeSymbolicTensor(1, DataType::Half); - fusion.addInput(t0); - auto t1 = castOp(DataType::Float, t0); - // input tensor - auto t2 = makeSymbolicTensor(3, DataType::Half); - fusion.addInput(t2); - auto t3 = castOp(DataType::Float, t2); - auto t4 = broadcast(t1, {true, true, false}); - auto t5 = add(t4, t3); - auto t6 = mul(t5, new Double(0.5)); - auto t7 = mul(t5, new Double(k_079)); - auto t8 = mul(t5, new Double(k_004)); - auto t9 = mul(t8, t5); - auto t10 = add(t9, new Int(1)); - auto t11 = mul(t7, t10); - auto t12 = unaryOp(UnaryOpType::Tanh, t11); - auto t13 = add(t12, new Double(1)); - auto t14 = mul(t6, t13); - auto t15 = castOp(DataType::Half, t14); - fusion.addOutput(t15); - - auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); - at::manual_seed(0); - std::vector input_shape{6, 512, 4096}; - std::vector bias_shape{4096}; - - auto at_input = at::randn(input_shape, options); - auto at_bias = at::randn(bias_shape, options); - - auto at_x = - at_bias.to(c10::ScalarType::Float) + at_input.to(c10::ScalarType::Float); - auto aten_output_float = - at_x * 0.5 * (1.0 + (k_079 * at_x * (1 + k_004 * at_x * at_x)).tanh()); - auto aten_output = aten_output_float.to(c10::ScalarType::Half); - - std::vector aten_inputs = {at_bias, at_input}; - auto lparams = schedulePointwise(&fusion, aten_inputs); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion(aten_inputs, lparams); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionBiasGeluBwd_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } - Fusion fusion; - FusionGuard fg(&fusion); - - const float k_079 = 0.79788456; - const float k_004 = 0.044715; - const float k_010 = 0.1070322243; - - // gradient tensor - auto t0 = makeSymbolicTensor(3, DataType::Half); - fusion.addInput(t0); - auto t1 = castOp(DataType::Float, t0); - // bias tensor - auto t2 = makeSymbolicTensor(1, DataType::Half); - fusion.addInput(t2); - auto t3 = castOp(DataType::Float, t2); - // input tensor - auto t4 = makeSymbolicTensor(3, DataType::Half); - fusion.addInput(t4); - auto t5 = castOp(DataType::Float, t4); - auto t6 = broadcast(t3, {true, true, false}); - auto t7 = add(t6, t5); - auto t8 = mul(t7, new Double(k_079)); - auto t9 = mul(t7, new Double(k_004)); - auto t10 = mul(t9, t7); - auto t11 = add(t10, new Int(1)); - auto t12 = mul(t8, t11); - auto t13 = unaryOp(UnaryOpType::Tanh, t12); - auto t14 = mul(t7, new Double(0.5)); - auto t15 = mul(t13, t13); - auto t16 = unaryOp(UnaryOpType::Neg, t15); - auto t17 = add(t16, new Int(1)); - auto t18 = mul(t7, new Double(k_010)); - auto t19 = mul(t18, t7); - auto t20 = add(t19, new Double(k_079)); - auto t21 = mul(t17, t20); - auto t22 = mul(t14, t21); - auto t23 = add(t13, new Int(1)); - auto t24 = mul(t23, new Double(0.5)); - auto t25 = add(t22, t24); - auto t26 = mul(t25, t1); - // Save float output for validation - fusion.addOutput(t26); - auto t27 = castOp(DataType::Half, t26); - fusion.addOutput(t27); - - auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); - at::manual_seed(1); - std::vector input_shape{6, 512, 4096}; - std::vector bias_shape{4096}; - auto at_input = at::randn(input_shape, options); - auto at_bias = 
at::randn(bias_shape, options); - auto at_grad = at::randn(input_shape, options); - - auto at_x = - at_bias.to(c10::ScalarType::Float) + at_input.to(c10::ScalarType::Float); - auto at_tanh_out = (k_079 * at_x * (1 + k_004 * at_x * at_x)).tanh(); - auto at_ff = 0.5 * at_x * - ((1 - at_tanh_out * at_tanh_out) * (k_079 + k_010 * at_x * at_x)) + - 0.5 * (1 + at_tanh_out); - auto at_out = at_ff * at_grad; - auto at_out_half = at_out.to(c10::ScalarType::Half); - - std::vector aten_inputs = {at_grad, at_bias, at_input}; - std::vector aten_outputs = {at_out, at_out_half}; - - auto lparams = schedulePointwise(&fusion, aten_inputs); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion(aten_inputs, lparams); - - testValidate( - &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); -} - -// Reproducer of issue #459 -TEST(NVFuserTest, FusionIssue459_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - auto tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - - auto tv2 = add(tv0, new Double(1)); - auto tv3 = broadcast(tv2, {true, false}); - auto tv4 = add(tv1, tv3); - - // Create two outputs from the final arithmetic result - auto tv5 = add(tv4, new Double(1)); - fusion.addOutput(tv5); - auto tv6 = add(tv4, new Double(1)); - fusion.addOutput(tv6); - - // Scheduling - for (auto output : ir_utils::filterByType(fusion.outputs())) { - output->merge(-2, -1); - } - for (auto output : ir_utils::filterByType(fusion.outputs())) { - output->split(0, 128); - } - - tv0->computeAt(tv5, -1); - - tv6->axis(0)->parallelize(ParallelType::BIDx); - tv6->axis(1)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - const int numel_x = 10; - const int numel_y = 20; - auto t0 = at::randn({numel_x}, options); - auto t1 = at::randn({numel_y, numel_x}, options); - auto aten_output = (t0 + 1).unsqueeze(0) + t1 + 1; - - std::vector aten_inputs = {t0, t1}; - - torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, - cg_outputs, - aten_inputs, - {aten_output, aten_output}, - __LINE__, - __FILE__); -} - -TEST(NVFuserTest, FusionSmemIndexingSimple_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - auto tv3 = add(tv2, new Double(1)); - fusion.addOutput(tv3); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv3->axis(1)->parallelize(ParallelType::TIDx); - - tv0->computeAt(tv3, -1); - - tv1->setMemoryType(MemoryType::Shared); - tv2->setMemoryType(MemoryType::Global); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - auto aten_input = at::randn({12, 34}, options); - at::Tensor aten_output = aten_input + 1.0 + 1.0 + 1.0; - - auto cg_outputs = fe.runFusion({aten_input}); - - testValidate( - &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSmemIndexing_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Symbolic integers we will use for runtime tiling - Int* symbolic_m_tile_dim = new Int(); - Int* symbolic_split_k_tile_dim = new Int(); - Int* symbolic_block_k_tile_dim = new Int(); - // Compile-time integer for tiling - int n_smem_tile = 32; - - // Symbolic 2D tensors 
TV0[M, K], TV1[K, N] - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = makeSymbolicTensor(2); - - // Broadcast tv0 to [M, K, *] - TensorView* tv2 = broadcast(tv0, {false, false, true}); - // Broadcast tv1 to [*, K, N] - TensorView* tv3 = broadcast(tv1, {true, false, false}); - - // Pointwise multiplication resulting in tv3[M, K, N] - TensorView* tv4 = mul(tv2, tv3); - - // Sum the K-dim - TensorView* tv5 = sum(tv4, {1}); - - // Register inputs and outputs - fusion.addInput(tv0); - fusion.addInput(tv1); - fusion.addOutput(tv5); - - // Register runtime tile dims as inputs - fusion.addInput(symbolic_m_tile_dim); - fusion.addInput(symbolic_split_k_tile_dim); - fusion.addInput(symbolic_block_k_tile_dim); - - // Make a 3D tile, mix of symbolic and constant, do in reverse order because - // dims are inserted - // [M, rK, N] - tv5->split(2, n_smem_tile); - // [M, rK, No, Ni{32}] - tv5->split(1, symbolic_block_k_tile_dim); - // [M, rKo, rKi{i2}, No, Ni{32}] - tv5->split(1, symbolic_split_k_tile_dim); - // [M, rKoo, rKoi{i1}, rKi{i2}, No, Ni{32}] - tv5->split(0, symbolic_m_tile_dim); - // [Mo, Mi{i0}, rKoo, rKoi{i1}, rKi{i2}, No, Ni{32}] - - // Reorder so all outer tiles are in the leftmost 3 positions - // [Mo, Mi{i0}, rKoo, rKoi{i1}, rKi{i2}, No, Ni{32}] - // [Mo, No, rKoo, rKoi{i1}, rKi{i2}, Mi{i0}, Ni{32}] - tv5->reorder({{1, 5}, {5, 1}}); - - // Factor out the outer reduction IterDomain, then run the inter-cta - // reduction, and intra-cta reduction - // [Mo, No, rKoo, Koi{i1}, Ki{i2}, Mi{i0}, Ni{32}] - // [Mo, No, rKoi{i1}, rKi{i2}, Mi{i0}, Ni{32}] - auto tv6 = tv5->rFactor({2}); - - // Scope computations - tv6->computeAt(tv5, 2); - - // [Mo, No, rKoo, Koi{i1}, Ki{i2}, Mi{i0}, Ni{32}] - // [Mo, No, Ki{i2}, Mi{i0}, Ni{32}, rKoo, Koi{i1}] - tv6->reorder({ - {5, -2}, - {6, -1}, - {2, 2}, - {3, 3}, - {4, 4}, - }); - - // Setup compute at schedule - tv0->computeAt(tv6, 3); - tv1->computeAt(tv6, 3); - tv4->computeAt(tv6, -1); - - // Cache smem tiles - tv2->setMemoryType(MemoryType::Shared); - tv3->setMemoryType(MemoryType::Shared); - tv4->setMemoryType(MemoryType::Shared); - tv6->setMemoryType(MemoryType::Shared); - - tv5->axis(0)->parallelize(ParallelType::BIDz); - tv5->axis(1)->parallelize(ParallelType::BIDy); - - std::vector tv_list = {tv2, tv3, tv4, tv5, tv6}; - for (auto tv : tv_list) { - tv->axis(-2)->parallelize(ParallelType::TIDz); - tv->axis(-1)->parallelize(ParallelType::TIDy); - } - - constexpr int M = 31, K = 65, N = 32; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({M, K}, options); - at::Tensor t1 = at::randn({K, N}, options); - - at::Tensor aten_output = - mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1); - - // A, B, m_tile_dim, split_k, intra_cta_tile - std::vector aten_inputs = {t0, t1, 3, 4, 5}; - - torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -// Reproducer of issue 408 -TEST(NVFuserTest, FusionCacheBeforeReduction_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = sum(tv1, {1}); - fusion.addOutput(tv2); - - tv2->split(0, 4); - - auto tv3 = tv2->cache_before(); - - tv0->computeAt(tv3, -1); - tv3->computeAt(tv2, -1); - - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - FusionExecutor fe; - 
fe.compileFusion(&fusion); - - const int numel_x = 100; - const int numel_y = 200; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn({numel_x, numel_y}, options); - at::Tensor cg_output = at::empty({numel_x}, options); - - auto aten_output = (aten_input + 1).to(at::kDouble).sum({1}); - - fe.runFusion({aten_input}, {cg_output}); - - testValidate( - &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionCacheBeforeReduction2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(3); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = sum(tv1, {1}); - auto tv3 = add(tv2, new Double(1)); - fusion.addOutput(tv2); - fusion.addOutput(tv3); - - auto tv4 = tv2->cache_before(); - - tv4->computeAt(tv3, 1); - tv0->computeAt(tv4, -1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - const int numel_x = 10; - const int numel_y = 20; - const int numel_z = 30; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn({numel_x, numel_y, numel_z}, options); - auto t2 = (aten_input + 1).to(at::kDouble).sum({1}); - auto t3 = t2 + 1; - std::vector aten_outputs = {t2, t3}; - - auto cg_outputs = fe.runFusion({aten_input}); - - testValidate( - &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionIssue367_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Symbolic integers we will use for runtime tiling - Int* symbolic_m_tile_dim = new Int(); - Int* symbolic_split_k_tile_dim = new Int(); - Int* symbolic_block_k_tile_dim = new Int(); - // Compile-time integer for tiling - int n_smem_tile = 32; - - // Symbolic 2D tensors TV0[M, K], TV1[K, N] - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = makeSymbolicTensor(2); - - // Broadcast tv0 to [M, K, *] - TensorView* tv2 = broadcast(tv0, {false, false, true}); - // Broadcast tv1 to [*, K, N] - TensorView* tv3 = broadcast(tv1, {true, false, false}); - - // Pointwise multiplication resulting in tv3[M, K, N] - TensorView* tv4 = mul(tv2, tv3); - - // Sum the K-dim - TensorView* tv5 = sum(tv4, {1}); - - // Register inputs and outputs - fusion.addInput(tv0); - fusion.addInput(tv1); - fusion.addOutput(tv5); - - // Register runtime tile dims as inputs - fusion.addInput(symbolic_m_tile_dim); - fusion.addInput(symbolic_split_k_tile_dim); - fusion.addInput(symbolic_block_k_tile_dim); - - // Make a 3D tile, mix of symbolic and constant, do in reverse order because - // dims are inserted - // [M, K, N] - tv5->split(2, n_smem_tile); - tv5->split(1, symbolic_block_k_tile_dim); - tv5->split(1, symbolic_split_k_tile_dim); - tv5->split(0, symbolic_m_tile_dim); - // [Mo, Mi, Koo, Koi, Ki, No, Ni] - tv5->reorder({{1, 5}, {5, 1}}); - // [Mo, No, Koo, Koi, Ki, Mi, Ni] - - auto tv6 = tv5->rFactor({2}); - auto tv7 = tv5->rFactor({2}); - // [Mo, No, rKoo, Koi, Ki, Mi, Ni] - // [Mo, No, rKoi, rKi, Mi, Ni] - - // Scope computations - tv6->computeAt(tv5, 2); - - tv0->computeAt(tv6, 3); - tv1->computeAt(tv6, 3); - tv4->computeAt(tv6, -1); - - // Cache smem tiles - tv2->setMemoryType(MemoryType::Shared); - tv3->setMemoryType(MemoryType::Shared); - 
tv4->setMemoryType(MemoryType::Local); - tv6->setMemoryType(MemoryType::Local); - tv7->setMemoryType(MemoryType::Local); - - tv5->axis(0)->parallelize(ParallelType::BIDz); - tv5->axis(1)->parallelize(ParallelType::BIDy); - - std::vector tv_list = {tv2, tv3, tv4, tv5, tv6, tv7}; - for (auto tv : tv_list) { - tv->axis(-2)->parallelize(ParallelType::TIDz); - tv->axis(-1)->parallelize(ParallelType::TIDy); - } - tv2->axis(3)->parallelize(ParallelType::TIDx); - tv3->axis(3)->parallelize(ParallelType::TIDx); - tv4->axis(3)->parallelize(ParallelType::TIDx); - tv6->axis(3)->parallelize(ParallelType::TIDx); - tv7->axis(2)->parallelize(ParallelType::TIDx); - - tv2->axis(4)->parallelize(ParallelType::BIDx); - tv3->axis(4)->parallelize(ParallelType::BIDx); - tv4->axis(4)->parallelize(ParallelType::BIDx); - tv6->axis(4)->parallelize(ParallelType::BIDx); - tv7->axis(3)->parallelize(ParallelType::BIDx); - tv5->axis(2)->parallelize(ParallelType::BIDx); - - constexpr int M = 3, K = 6, N = 16; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({M, K}, options); - at::Tensor t1 = at::randn({K, N}, options); - - // A, B, m, split_k, block_k - std::vector aten_inputs = {t0, t1, 2, 2, 3}; - at::Tensor aten_output = - mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1); - - torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionIssue468_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = sum(tv0, {1}); - auto tv2 = sum(tv1, {0}); - fusion.addOutput(tv2); - - tv1->axis(0)->parallelize(ParallelType::TIDy); - tv1->axis(1)->parallelize(ParallelType::TIDx); - - tv2->axis(0)->parallelize(ParallelType::TIDy); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({10, 100}, options); - at::Tensor aten_output = aten_input.to(at::kDouble).sum({1}).sum({0}); - - torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}); - - testValidate( - &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionIssue363_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Symbolic 2D tensors TV0[M, K], TV1[K, N] - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = makeSymbolicTensor(2); - - // Broadcast tv0 to [M, K, *] - TensorView* tv2 = broadcast(tv0, {false, false, true}); - // Broadcast tv1 to [*, K, N] - TensorView* tv3 = broadcast(tv1, {true, false, false}); - - // Pointwise multiplication resulting in tv3[M, K, N] - TensorView* tv4 = mul(tv2, tv3); - - // Sum the K-dim - TensorView* tv5 = sum(tv4, {1}); - - // Register inputs and outputs - fusion.addInput(tv0); - fusion.addInput(tv1); - fusion.addOutput(tv5); - - tv2->setMemoryType(MemoryType::Global); - tv3->setMemoryType(MemoryType::Global); - tv4->setMemoryType(MemoryType::Global); - - tv0->computeAt(tv5, -1); - tv1->computeAt(tv5, -1); - - tv5->axis(0)->parallelize(ParallelType::BIDz); - tv5->axis(1)->parallelize(ParallelType::BIDy); - - tv5->axis(2)->parallelize(ParallelType::BIDx); - - constexpr int M = 3, K = 6, N = 16; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({M, K}, options); - at::Tensor t1 = 
at::randn({K, N}, options); - at::Tensor aten_output = - mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1); - - std::vector aten_inputs = {t0, t1}; - - torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionIssue484_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = sum(tv0, {1}); - auto tv2 = add(tv1, new Double(0)); - fusion.addOutput(tv2); - - tv1->setMemoryType(MemoryType::Global); - tv1->axis(1)->parallelize(ParallelType::TIDx); - - constexpr int M = 100; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn({M, M}, options); - at::Tensor aten_output = aten_input.to(at::kDouble).sum({1}); - - torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}); - - testValidate( - &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionIssue329_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = sum(tv1, {1}); - fusion.addOutput(tv2); - auto tv3 = sum(tv1, {1}); - fusion.addOutput(tv3); - - tv1->computeAt(tv2, -1); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - std::vector t0_shape{17, 19}; - auto aten_input = at::randn(t0_shape, options); - auto t2 = (aten_input + 1).to(at::kDouble).sum({1}); - auto t3 = (aten_input + 1).to(at::kDouble).sum({1}); - std::vector aten_outputs = {t2, t3}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion({aten_input}); - - testValidate( - &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionIssue382_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = broadcast(tv1, {false, false, true}); - auto tv3 = makeSymbolicTensor(3); - fusion.addInput(tv3); - auto tv4 = add(tv2, tv3); - fusion.addOutput(tv4); - - tv2->merge(1); - tv4->merge(1); - - tv1->computeAt(tv4, 1); - - tv4->axis(0)->parallelize(ParallelType::BIDx); - - tv1->setMemoryType(MemoryType::Global); - tv2->setMemoryType(MemoryType::Global); - - torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); - - const int numel_x = 12; - const int numel_y = 34; - const int numel_z = 56; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - auto t0 = at::randn({numel_x, numel_y}, options); - auto t3 = at::randn({numel_x, numel_y, numel_z}, options); - - std::vector aten_inputs = {t0, t3}; - auto aten_output = (t0 + 1).unsqueeze(-1) + t3; - - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionIssue507_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - fusion.addOutput(tv2); - - tv1->setMemoryType(MemoryType::Shared); - - tv1->axis(1)->parallelize(ParallelType::TIDx); - tv2->axis(1)->parallelize(ParallelType::TIDx); - 
tv1->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(0)->parallelize(ParallelType::BIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - std::vector t0_shape{17, 19}; - auto aten_input = at::randn(t0_shape, options); - auto t1 = (aten_input + 1); - auto aten_output = (t1 + 1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion({aten_input}); - - testValidate( - &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionIssue532_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Algorithm - TensorView* tv0 = makeSymbolicTensor(1); - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv1, new Double(1)); - fusion.addInput(tv0); - fusion.addOutput(tv2); - - const int M_BLOCK = 64; - const int M_THREAD = 4; - - tv2->split(0, M_BLOCK); - // tv2: [M/M_BLOCK, M_BLOCK] - tv1->computeAt(tv2, 1); - // tv1: [M/M_BLOCK, M_BLOCK] - - tv1->split(-1, M_BLOCK / M_THREAD); - // tv1: [M/M_BLOCK, M_THREAD, M_BLOCK / M_THREAD] - - tv2->split(-1, M_THREAD); - // tv2: [M/M_BLOCK, M_BLOCK / M_THREAD, M_THREAD] - - constexpr int M = 1000; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({M}, options); - std::vector aten_inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(aten_inputs); - - at::Tensor aten_output = t0 + 1 + 1; - - testValidate( - &fusion, outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionLoopUnswitch_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Algorithm - TensorView* tv0 = makeSymbolicTensor(1); - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv1, new Double(1)); - fusion.addInput(tv0); - fusion.addOutput(tv2); - - tv2->split(0, 32); - tv1->computeAt(tv2, -1); - - tv2->axis(1)->parallelize(ParallelType::Unswitch); - - constexpr int M = 1000; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({M}, options); - std::vector aten_inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(aten_inputs); - - at::Tensor aten_output = t0 + 1 + 1; - - testValidate( - &fusion, outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionIssue549_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); // M, K - TensorView* tv1 = makeSymbolicTensor(2); // K, N - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv0, new Double(1)); - - TensorView* tv3 = broadcast(tv2, {false, false, true}); - // tv3[I0, I1, B] = tv0[I0, I1] - - TensorView* tv4 = broadcast(tv1, {true, false, false}); - // tv4[B, I1, I2] = tv1[I1, I2] - - // tv5[I0, I1, I2] = tv3[I0, I1, B] * tv4[B, I1, I2] - TensorView* tv5 = mul(tv3, tv4); - // tv6[I0, R1, I2] = tv5[I0, I1, I2] - TensorView* tv6 = sum(tv5, {1}); - fusion.addOutput(tv6); - - tv6->split(1, 32); - // tv6[I0, R1o, R1i{32}, I2] - - auto tv7 = tv6->rFactor({1}); - // tv7[I0, R1o, I1i{32}, I2] = tv5[I0, I1, I2] - // tv6[I0, , R1i{32}, I2] = tv7[I0, R1o, I1i{32}, I2] - - tv6->split(0, 4); - tv6->split(-1, 4); - // tv6[I0o, I0i{4}, R1i{32}, I2o, I2i{4}] - // tv6[I0o, I0i{4}, R1i{32}, I2o, I2i{4}] - - tv0->computeAt(tv6, -1); - tv1->computeAt(tv6, -1); - - // tv7[I0o, I0i{4}, R1o, I1i{32}, I2o, I2i{4}] - // tv6[I0o, 
I0i{4}, , R1i{32}, I2o, I2i{4}] - //--> (line symbolizes compute at location) - // tv5[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, I1o] - // tv7[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, R1o] - // tv6[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|] - - tv0->computeAt(tv7, -1); - tv1->computeAt(tv7, -1); - // tv5[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, I1o |] - // tv7[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, R1o |] - // tv6[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|] - - tv6->axis(0)->parallelize(ParallelType::BIDz); - tv6->axis(1)->parallelize(ParallelType::TIDz); - - tv6->axis(-2)->parallelize(ParallelType::BIDy); - tv6->axis(-1)->parallelize(ParallelType::TIDy); - - tv6->axis(2)->parallelize(ParallelType::TIDx); - tv7->axis(2)->parallelize(ParallelType::TIDx); - - constexpr int M = 65, K = 33, N = 17; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({M, K}, options); - at::Tensor t1 = at::randn({K, N}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - // Lets specify a few bounds in launch params to make sure it works - fe.runFusion({t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4)); - - // Make sure bad launch params throws - // TODO: Re-enable once we have parallelization validation in. - // ASSERT_ANY_THROW(fe.runFusion({t0, t1}, LaunchParams(1, 2, 3, 4, 5, 6))); - - // Don't specify any launch params - auto cg_outputs = fe.runFusion({t0, t1}); - - auto aten_output = (t0 + 1).to(at::kDouble).matmul(t1.to(at::kDouble)); - - testValidate( - &fusion, cg_outputs, {t0, t1}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, simplecompileRtc_CUDA) { - FusionExecutor fe; - std::string kernel = R"( -__global__ void kernel1(Tensor T0, Tensor T1) { - if(threadIdx.x==0){ - for(size_t ki28 = 0; ki28 < T0.size[0]; ++ki28) { - T1[ki28*T1.stride[0]] = T0[ki28*T0.stride[0]]*2; - } - } -} - )"; - fe.compileRtc(kernel, "CudaCodeGen::kernel1"); - LaunchParams lp( - 256, // gdimx - 1, // gdimy - 1, // gdimz - 1, // bdimx - 1, // bdimy - 1 // bdimz - ); - lp.setSmem(0); - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - const std::vector tensor_dims = {8}; - auto in0 = at::randn(tensor_dims, options); - auto out0 = at::empty_like(in0); - fe.runRtc(lp, {in0, out0}); - - auto out_ref = in0 * 2; - TORCH_CHECK(out_ref.allclose(out0)); -} - -TEST(NVFuserTest, FusionSerialWelford_CUDA) { - FusionExecutor fe; - int x = 128, y = 64, z = 64; - - std::string kernel = R"( -__global__ void kernel1( - Tensor inp, - Tensor out_var, - Tensor out_avg -){ - for(int i0=0;i0 tensor_dims = {x, y, z}; - auto in0 = at::randn(tensor_dims, options); - auto out_var = at::empty({x}, options); - auto out_avg = at::empty({x}, options); - fe.runRtc(lp, {in0, out_var, out_avg}); - - TORCH_CHECK(in0.var({1, 2}, false).allclose(out_var)); - TORCH_CHECK(in0.mean({1, 2}).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6)); -} - -TEST(NVFuserTest, FusionBlockWelford_CUDA) { - FusionExecutor fe; - int x = 7, y = 8, z = 9; - - std::string kernel = R"( -__global__ void kernel1( - Tensor inp, - Tensor out_avg, - Tensor out_var, - Tensor init_avg, - Tensor init_var, - Tensor init_N -){ - //actual generated kernel will use dynamic shared mem, - // here is just for prototype - __shared__ float mem_avg[512]; - __shared__ float mem_M2[512]; - __shared__ long mem_N[512]; - float in=inp[threadIdx.x*inp.stride[0]+ - threadIdx.y*inp.stride[1]]; - float tmp_avg=0; - float tmp_M2=0; - long tmp_N=0; - blockWelford( - tmp_avg, - tmp_M2, - tmp_N, - in, - 0.f, - (long)1, - threadIdx, - 
blockDim, - (float*)mem_avg, - (float*)mem_M2, - (long*)mem_N, - (bool)(threadIdx.x tensor_dims = {x, y}; - const std::vector init_dims = {x, z}; - - // generate initial values - auto init_in = at::randn(init_dims, options); - auto init_var = init_in.var({1}, false); - auto init_avg = init_in.mean({1}); - auto init_N = - at::tensor(z, at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0)); - - auto in0 = at::randn(tensor_dims, options); - - // run kernel - auto out_var = at::zeros({x}, options); - auto out_avg = at::zeros({x}, options); - fe.runRtc(lp, {in0, out_avg, out_var, init_avg, init_var, init_N}); - - // compare with reference output - auto cat_tensor = at::cat({init_in, in0}, 1); - TORCH_CHECK(cat_tensor.var({1}, false).allclose(out_var)); - TORCH_CHECK( - cat_tensor.mean({1}).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6)); -} - -TEST(NVFuserTest, FusionBlockWelfordNoInit_CUDA) { - FusionExecutor fe; - int x = 7, y = 8, z = 9; - - // need support IValue for integer input as initial count - std::string kernel = R"( -__global__ void kernel1( - Tensor inp, - Tensor out_avg, - Tensor out_var -){ - //actual generated kernel will use dynamic shared mem, - // here is just for prototype - __shared__ float mem_avg[512]; - __shared__ float mem_M2[512]; - __shared__ long mem_N[512]; - float in=inp[threadIdx.x*inp.stride[0]+ - threadIdx.y*inp.stride[1]+ - threadIdx.z*inp.stride[2]]; - float tmp_avg=0; - float tmp_M2=0; - long tmp_N=0; - block_sync::init(); - blockWelford( - tmp_avg, - tmp_M2, - tmp_N, - in, - 0.f, - (long) 1, - threadIdx, - blockDim, - (float*)mem_avg, - (float*)mem_M2, - (long*)mem_N, - (bool)(threadIdx.x tensor_dims = {x, y, z}; - auto in0 = at::randn(tensor_dims, options); - auto out_var = at::empty({x}, options); - auto out_avg = at::empty({x}, options); - fe.runRtc(lp, {in0, out_avg, out_var}); - - TORCH_CHECK(in0.var({1, 2}, false).allclose(out_var)); - TORCH_CHECK(in0.mean({1, 2}).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6)); -} - -TEST(NVFuserTest, FusionGridWelfordNoInit_CUDA) { - FusionExecutor fe; - int x = 128, y = 64, z = 128; - - std::string kernel = R"( -__global__ void kernel1( - Tensor inp, - Tensor out_avg, - Tensor out_var, - Tensor work_buf_avg, - Tensor work_buf_M2, - Tensor work_buf_N, - Tensor sync_flag -){ - __shared__ float shared_buf_avg[512]; - __shared__ float shared_buf_M2[512]; - __shared__ long shared_buf_N[512]; - float tmp_avg=0; - float tmp_M2=0; - long tmp_N=0; - float in = inp[ blockIdx.x * inp.stride[0]+ - blockIdx.y * inp.stride[1]+ - threadIdx.x * inp.stride[2]]; - block_sync::init(); - welford::gridWelford< - true,true,false, - true,false,false, - false - >( - tmp_avg, - tmp_M2, - tmp_N, - in, - 0.f, - (long) 1, - &work_buf_avg[0], - &work_buf_M2[0], - &work_buf_N[0], - sync_flag, - (float*)shared_buf_avg, - (float*)shared_buf_M2, - (long*)shared_buf_N, - threadIdx.x tensor_dims = {x, y, z}; - auto in0 = at::randn(tensor_dims, options); - - auto out_avg = at::empty({z}, options); - auto out_var = at::empty({z}, options); - auto work_buf_avg = at::empty({x * y * z}, options); - auto work_buf_var = at::empty({x * y * z}, options); - auto work_buf_N = at::empty({x * y * z}, options_int); - auto sync_flag = at::zeros({1}, options_int); - fe.runRtc( - lp, - {in0, - out_avg, - out_var, - work_buf_avg, - work_buf_var, - work_buf_N, - sync_flag}); - std::vector dims{0, 1}; - - TORCH_CHECK(in0.mean(dims).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6)); - TORCH_CHECK(in0.var(dims, false).allclose(out_var)); -} - -TEST(NVFuserTest, 
FusionWelfordOp_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - int M = 64, N = 128; - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = mul(tv0, new Double(1)); - auto tvs = Welford(tv1, {1}); - auto tv_avg = tvs.avg; - auto tv_M2 = tvs.var_sum; - auto tv_N = tvs.n; - fusion.addOutput(tv_avg); - fusion.addOutput(tv_M2); - fusion.addOutput(tv_N); - - tv_avg->split(1, 32); - tv_avg->split(0, 32); - tv_avg->split(0, 4); - tv_avg->reorder({{-1, -3}, {-3, -1}}); - tv1->computeAt(tv_avg, -1); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({M, N}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t0}); - - // by default Welford outputs sum of square diff so need to divide to get var - outputs[1] /= N; - - testValidate( - &fusion, - outputs, - {t0}, - {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N}, - __LINE__, - __FILE__); -} - -TEST(NVFuserTest, FusionBlockWelfordOp_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - int M = 64, N = 128; - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = mul(tv0, new Double(1)); - auto tvs = Welford(tv1, {1}); - auto tv_avg = tvs.avg; - auto tv_M2 = tvs.var_sum; - auto tv_N = tvs.n; - fusion.addOutput(tv_avg); - fusion.addOutput(tv_M2); - fusion.addOutput(tv_N); - - tv_avg->axis(-1)->parallelize(ParallelType::TIDx); - - tv1->computeAt(tv_avg, -1); - - // - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({M, N}, options); - at::Tensor t_var = at::empty({M}, options); - at::Tensor t_avg = at::empty({M}, options); - at::Tensor t_N = at::empty({M}, options_int); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t0}); - - // by default Welford outputs sum of square diff so need to divide to get var - outputs[1] /= N; - - testValidate( - &fusion, - outputs, - {t0}, - {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N}, - __LINE__, - __FILE__); -} - -TEST(NVFuserTest, FusionGridWelfordOp_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - int M = 64, N = 128; - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = mul(tv0, new Double(1)); - auto tvs = Welford(tv1, {1}); - auto tv_avg = tvs.avg; - auto tv_M2 = tvs.var_sum; - auto tv_N = tvs.n; - fusion.addOutput(tv_avg); - fusion.addOutput(tv_M2); - fusion.addOutput(tv_N); - - tv_avg->axis(0)->parallelize(ParallelType::TIDx); - tv_avg->axis(-1)->parallelize(ParallelType::BIDx); - - tv1->computeAt(tv_avg, -1); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({M, N}, options); - at::Tensor t_avg = at::empty({M}, options); - at::Tensor t_var = at::empty({M}, options); - at::Tensor t_N = at::empty({M}, options_int); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t0}); - - // by default Welford outputs sum of square diff so need to divide to get var - outputs[1] /= N; - - testValidate( - &fusion, - outputs, - {t0}, - {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N}, - __LINE__, - __FILE__); -} - 
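The Welford tests in this file all post-process the second output with outputs[1] /= N (or /= rdim) before validating, because the op reports M2, the running sum of squared differences from the mean, rather than the variance itself. As a point of reference only, below is a minimal standalone sketch of the serial Welford update in plain C++ (independent of the nvfuser kernels and of ATen; the variable names are illustrative), showing why M2 / n matches the population variance computed by var(..., /*unbiased=*/false):

#include <cassert>
#include <cmath>
#include <cstdio>
#include <vector>

// Maintains (avg, M2, n), the same three running quantities the Welford op
// returns as (tv_avg, tv_M2, tv_N) in the tests above.
int main() {
  const std::vector<double> xs = {1.0, 2.0, 4.0, 8.0};

  double avg = 0.0; // running mean
  double M2 = 0.0;  // running sum of squared differences from the current mean
  long n = 0;       // number of elements folded in so far

  for (const double x : xs) {
    ++n;
    const double delta = x - avg;
    avg += delta / n;
    M2 += delta * (x - avg); // uses the already-updated mean
  }

  // Population variance, i.e. the same normalization as var(..., false).
  const double var = M2 / n;

  // Naive two-pass reference.
  double mean_ref = 0.0;
  for (const double x : xs) {
    mean_ref += x;
  }
  mean_ref /= xs.size();
  double var_ref = 0.0;
  for (const double x : xs) {
    var_ref += (x - mean_ref) * (x - mean_ref);
  }
  var_ref /= xs.size();

  assert(std::fabs(avg - mean_ref) < 1e-12);
  assert(std::fabs(var - var_ref) < 1e-12);
  std::printf("avg=%.4f M2=%.4f n=%ld var=%.4f\n", avg, M2, n, var);
  return 0;
}

Keeping (avg, M2, n) rather than the variance itself is also what makes partial results cheap to merge, which is what the blockWelford/gridWelford runtime functions exercised above rely on when combining per-thread and per-block contributions.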
-TEST(NVFuserTest, FusionRfactorWelfordOp_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - int M = 64, N = 128; - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = mul(tv0, new Double(1)); - auto tvs = Welford(tv1, {1}); - auto tv_avg = tvs.avg; - auto tv_M2 = tvs.var_sum; - auto tv_N = tvs.n; - fusion.addOutput(tv_avg); - fusion.addOutput(tv_M2); - fusion.addOutput(tv_N); - - tv_avg->split(1, 4); - auto rtvs = tvs.rFactor({2}); - tv1->computeAt(tv_avg, -1); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({M, N}, options); - at::Tensor t_avg = at::empty({M}, options); - at::Tensor t_var = at::empty({M}, options); - at::Tensor t_N = at::empty({M}, options_int); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t0}); - - // by default Welford outputs sum of square diff so need to divide to get var - outputs[1] /= N; - - testValidate( - &fusion, - outputs, - {t0}, - {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N}, - __LINE__, - __FILE__); -} - -TEST(NVFuserTest, FusionWelfordSchedule_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - int M = 64, N = 128; - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = mul(tv0, new Double(1)); - auto tvs = Welford(tv1, {1}); - auto tv_avg = tvs.avg; - auto tv_M2 = tvs.var_sum; - auto tv_N = tvs.n; - fusion.addOutput(tv_avg); - fusion.addOutput(tv_M2); - fusion.addOutput(tv_N); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({M, N}, options); - // TODO: Why do we use launch params from here, but not scheduling??? 
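  // (Regarding the TODO: getReductionHeuristics() below returns a
  // ReductionParams value that carries both the scheduling decisions and the
  // launch parameters; scheduleReduction() consumes the former, and the test
  // then forwards reduction_params.value().lparams to runFusion() explicitly.)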
- auto reduction_params = getReductionHeuristics(&fusion, {t0}); - scheduleReduction(&fusion, reduction_params.value()); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t0}, reduction_params.value().lparams); - - // by default Welford outputs sum of square diff so need to divide to get var - outputs[1] /= N; - - auto at_avg = t0.mean({1}); - auto at_var = t0.var({1}, false); - auto at_n = at::ones({M}, options_int) * N; - - testValidate( - &fusion, - outputs, - {t0}, - {at_avg, at_var, at_n}, - __LINE__, - __FILE__, - "validate welford", - reduction_params.value().lparams); -} - -namespace { -void testWelford(DataType dtype, int red_axis, int odim, int rdim) { - const int axis = red_axis; - at::ScalarType aten_dtype = data_type_to_aten(dtype); - - Fusion fusion; - FusionGuard fg(&fusion); - TensorView* tv0 = makeSymbolicTensor(2, dtype); - bool is_fp16 = dtype == DataType::Half; - bool is_bf16 = dtype == DataType::BFloat16; - TensorView* tv0_cast = tv0; - if (is_fp16 || is_bf16) { - tv0_cast = castOp(DataType::Float, tv0); - } - fusion.addInput(tv0); - auto tv1 = mul(tv0_cast, new Double(1)); - auto tvs = Welford(tv1, {axis}); - auto tv_avg = tvs.avg; - auto tv_M2 = tvs.var_sum; - auto tv_N = tvs.n; - - TensorView* avg_cast = tv_avg; - TensorView* M2_cast = tv_M2; - - if (is_fp16) { - avg_cast = castOp(DataType::Half, tv_avg); - M2_cast = castOp(DataType::Half, tv_M2); - } - if (is_bf16) { - avg_cast = castOp(DataType::BFloat16, tv_avg); - M2_cast = castOp(DataType::BFloat16, tv_M2); - } - - fusion.addOutput(avg_cast); - fusion.addOutput(M2_cast); - fusion.addOutput(tv_N); - - auto options = at::TensorOptions().dtype(aten_dtype).device(at::kCUDA, 0); - auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0); - at::manual_seed(0); - std::vector outputs_of_red; - at::Tensor aten_input = - (axis ? at::randn({odim, rdim}, options) - : at::randn({rdim, odim}, options)); - - if (is_fp16 || is_bf16) { - outputs_of_red.push_back(avg_cast); - outputs_of_red.push_back(M2_cast); - } - - auto reduction_params = getReductionHeuristics(&fusion, {aten_input}); - scheduleReduction(&fusion, reduction_params.value()); - - auto lparams = reduction_params.value().lparams; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({aten_input}, reduction_params.value().lparams); - - // by default Welford outputs sum of square diff so need to divide to - // get var - - outputs[1] /= rdim; - - auto at_avg = aten_input.mean({axis}); - auto at_var = aten_input.var({axis}, false); - auto at_n = - (axis ? at::ones({odim, rdim}, options) - : at::ones({rdim, odim}, options)); - at_n = at_n.sum({axis}); - - testValidate( - &fusion, - outputs, - {aten_input}, - {at_avg, at_var, at_n}, - __LINE__, - __FILE__, - "validate welford", - reduction_params.value().lparams); -} -} // namespace - -TEST(NVFuserTest, FusionWelfordShmoo_CUDA) { - std::vector dtypes = { - DataType::Double, DataType::Float, DataType::Half}; -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - if (at::cuda::getDeviceProperties(0)->major >= 8) { - dtypes.insert(dtypes.end(), DataType::BFloat16); - } -#endif - - std::vector red_axis = {1, 0}; - std::vector output_dims = {160, 320}; - std::vector red_dims; - - // Tried to cut down the number iterations with just - // doing every other power of 2. 
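  // (With i <<= 2 the loop below visits i = 1, 4, 16, ..., i.e. every other
  // power of two, up to 1024 * 1024 = 2^20.)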
- for (int i = 1; i <= 1024 * 1024; i <<= 2) { - red_dims.push_back(i); - } - - for (auto dtype : dtypes) { - for (auto& axis : red_axis) { - for (auto& odim : output_dims) { - for (auto& rdim : red_dims) { - // TODO: original welford algorithm actually keeps a running sum of - // squares, i.e. M_{2n} in the - // cf: - // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - // algorithm notation, and it can reach inf for large numbers - // with half precision. skipping too large volumes for half for - // nwo might need further numerical experiments to re-design - // this. - if (rdim > 32768 && - (dtype == DataType::Half || dtype == DataType::BFloat16)) { - continue; - } - testWelford(dtype, axis, odim, rdim); - } - } - } - } -} - -TEST(NVFuserTest, FusionTranspose1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - constexpr int M = 10; - constexpr int N = 20; - - auto tv0 = makeSymbolicTensor(2); - auto tv1 = transpose(tv0, {{0, 1}}); - fusion.addInput(tv0); - fusion.addOutput(tv1); - - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(1)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({M, N}, options); - std::vector aten_inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(aten_inputs); - - at::Tensor aten_output = t0.t(); - - testValidate( - &fusion, outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionTranspose2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - constexpr int M = 10; - constexpr int N = 20; - - auto tv0 = makeSymbolicTensor(2); - auto tv1 = transpose(tv0, {{0, 1}}); - fusion.addInput(tv0); - fusion.addOutput(tv1); - - tv1->merge(0); - tv1->split(0, 32); - - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(1)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({M, N}, options); - std::vector aten_inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(aten_inputs); - - at::Tensor aten_output = t0.t(); - - testValidate( - &fusion, outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSimpleGemmTransposed_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - - TensorView* tv0 = makeSymbolicTensor(2); // K, M - TensorView* tv1 = makeSymbolicTensor(2); // N, K - fusion.addInput(tv0); - fusion.addInput(tv1); - - TensorView* tv0_t = transpose(tv0, {{0, 1}}); - TensorView* tv1_t = transpose(tv1, {{0, 1}}); - - TensorView* tv2 = broadcast(tv0_t, {false, false, true}); - // tv2[I0, I1, B] = tv0[I0, I1] - - TensorView* tv3 = broadcast(tv1_t, {true, false, false}); - // tv3[B, I1, I2] = tv1[I1, I2] - - // tv4[I0, I1, I2] = tv2[I0, I1, B] * tv3[B, I1, I2] - TensorView* tv4 = mul(tv2, tv3); - // tv5[I0, R1, I2] = tv4[I0, I1, I2] - TensorView* tv5 = sum(tv4, {1}); - fusion.addOutput(tv5); - - tv5->split(1, 32); - // tv5[I0, R1o, R1i{32}, I2] - - auto tv6 = tv5->rFactor({1}); - // tv6[I0, R1o, I1i{32}, I2] = tv4[I0, I1, I2] - // tv5[I0, , R1i{32}, I2] = tv6[I0, R1o, I1i{32}, I2] - - tv5->split(0, 4); - tv5->split(-1, 4); - // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}] - // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}] - - tv0_t->computeAt(tv5, -1); - tv1_t->computeAt(tv5, -1); - - // tv6[I0o, I0i{4}, R1o, I1i{32}, I2o, I2i{4}] - // tv5[I0o, 
I0i{4}, , R1i{32}, I2o, I2i{4}] - //--> (line symbolizes compute at location) - // tv4[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, I1o] - // tv6[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, R1o] - // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|] - - tv0_t->computeAt(tv6, -1); - tv1_t->computeAt(tv6, -1); - // tv4[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, I1o |] - // tv6[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, R1o |] - // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|] - - tv5->axis(0)->parallelize(ParallelType::BIDz); - tv5->axis(1)->parallelize(ParallelType::TIDz); - - tv5->axis(-2)->parallelize(ParallelType::BIDy); - tv5->axis(-1)->parallelize(ParallelType::TIDy); - - tv5->axis(2)->parallelize(ParallelType::TIDx); - tv6->axis(2)->parallelize(ParallelType::TIDx); - - constexpr int M = 65, K = 33, N = 17; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({K, M}, options); - at::Tensor t1 = at::randn({N, K}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - // Lets specify a few bounds in launch params to make sure it works - fe.runFusion({t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4)); - - // Don't specify any launch params - auto cg_outputs = fe.runFusion({t0, t1}); - - auto aten_output = t0.t().to(at::kDouble).matmul(t1.t().to(at::kDouble)); - - testValidate( - &fusion, cg_outputs, {t0, t1}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSoftmax3DTransposed_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int tidx = 32; - const int dimx = 32; - const int dimy = 16; - const int dimz = 130; - - // Set up your input tensor views - TensorView* input_tv0 = makeSymbolicTensor(3); - fusion.addInput(input_tv0); - - TensorView* input_t = transpose(input_tv0, {{1, 2}}); - - TensorView* exp_tv1 = unaryOp(UnaryOpType::Exp, input_t); - TensorView* sum_exp_tv2 = sum(exp_tv1, {-1}); - TensorView* bcast_sum_tv3 = broadcast(sum_exp_tv2, {false, false, true}); - - // Replicate exp_tv4 as exp_tv4_copy because exp_tv4 is going to be - // computed at sum_exp_rf_tv8. 
- TensorView* input_t_copy = transpose(input_tv0, {{1, 2}}); - TensorView* exp_tv1_copy = unaryOp(UnaryOpType::Exp, input_t_copy); - - TensorView* output_tv4 = div(exp_tv1_copy, bcast_sum_tv3); - - fusion.addOutput(output_tv4); - - bcast_sum_tv3->split(-1, tidx); - - sum_exp_tv2->split(-1, tidx); - TensorView* sum_exp_rf_tv5 = sum_exp_tv2->rFactor({-2}); - - output_tv4->split(-1, tidx); - - input_t->computeAt(sum_exp_rf_tv5, -1); - input_t_copy->computeAt(output_tv4, -1); - - TensorView* tensors_to_parallelize[] = { - sum_exp_tv2, bcast_sum_tv3, output_tv4, sum_exp_rf_tv5}; - - for (auto tv : tensors_to_parallelize) { - tv->axis(0)->parallelize(ParallelType::BIDx); - tv->axis(1)->parallelize(ParallelType::BIDy); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({dimx, dimz, dimy}, options); - - at::Tensor cg_output = at::empty({dimx, dimy, dimz}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({input}, {cg_output}); - - auto aten_input_t = at::transpose(input, 1, 2); - auto aten_output = at::_softmax(aten_input_t.to(at::kDouble), -1, false); - - testValidate( - &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeAtTransposed1_CUDA) { - // Case 1 - // tv1 = tv0 * 0.5 - // tv2 = tv1 * -1 - // tv3 = tv1 + 3 - // tv4 = tv1 * 2 - // tv5 = tv3 + tv2 - // tv6 = tv5 + tv4 - // tv7 = tv1 + tv4 - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - tv0 = transpose(tv0, {{0, 1}}); - - TensorView* tv1 = mul(tv0, new Double(0.5)); - TensorView* tv2 = mul(tv1, new Double(-1.0)); - TensorView* tv3 = add(tv1, new Double(3.0)); - TensorView* tv4 = mul(tv1, new Double(2.0)); - TensorView* tv5 = add(tv3, tv2); - - TensorView* tv6 = add(tv5, tv4); - TensorView* tv7 = add(tv1, tv4); - - fusion.addOutput(tv6); - fusion.addOutput(tv7); - - // Lets setup to actually run - tv7->merge(0); - tv7->split(0, 128); - tv7->split(0, 4); - - tv7->axis(0)->parallelize(ParallelType::BIDx); - - tv0->computeAt(tv7, 1); - - // The this-position of the last tensor should be zero. - TORCH_CHECK( - tv7->nDims() == 3 && tv7->getComputeAtPosition() == 0 && - tv7->getMaxProducerPosition() == 1); - TORCH_CHECK( - tv6->nDims() == 3 && tv6->getComputeAtPosition() == 0 && - tv6->getMaxProducerPosition() == 1); - // The position of every other tensor should be 1. 
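  // (Position 1 refers to the compute-at position: after tv0->computeAt(tv7, 1)
  // above, these intermediates are produced inside the outermost,
  // BIDx-parallelized loop of tv7, while tv6 and tv7 themselves keep a
  // compute-at position of 0.)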
- for (auto tv : {tv1, tv2, tv3, tv4, tv5}) { - TORCH_CHECK(tv->nDims() == 3 && tv->getComputeAtPosition() == 1); - } - - for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && - val->getValType().value() == ValType::TensorView) { - TensorView* tv = static_cast(val); - tv->axis(1)->parallelize(ParallelType::Unroll); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn({129, 127}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}); - - at::Tensor aten_input_t = aten_input.t(); - - auto t1 = aten_input_t.mul({0.5}); - auto t2 = t1.mul({-1.0}); - auto t3 = t1.add({3.0}); - auto t4 = t1.mul({2.0}); - auto t5 = t3.add(t2); - auto t6 = t5.add(t4); - auto t7 = t1.add(t4); - - std::vector aten_outputs = {t6, t7}; - - testValidate( - &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeAtTransposed2_CUDA) { - // Case 2 - // tv1 = tv0 * -1 - // tv2 = tv0 + 3 - // tv3 = tv0 * 2 - // tv4 = tv2 + tv1 - // tv5 = tv4 + tv3 - // tv6 = tv5 + tv3 - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - tv0 = transpose(tv0, {{0, 1}}); - - TensorView* tv1 = mul(tv0, new Double(-1.0)); - TensorView* tv2 = add(tv0, new Double(3.0)); - TensorView* tv3 = mul(tv0, new Double(2.0)); - TensorView* tv4 = add(tv2, tv1); - - TensorView* tv5 = add(tv4, tv3); - TensorView* tv6 = add(tv5, tv3); - - fusion.addOutput(tv5); - fusion.addOutput(tv6); - - // Lets setup to actually run - tv6->merge(0); - tv6->split(0, 128); - tv6->split(0, 4); - - tv6->axis(0)->parallelize(ParallelType::BIDx); - - tv0->computeAt(tv6, 1); - - for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && - val->getValType().value() == ValType::TensorView) { - TensorView* tv = static_cast(val); - - tv->axis(1)->parallelize(ParallelType::Unroll); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({129, 127}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({input}); - - auto input_t = input.t(); - auto t1 = input_t.mul({-1.0}); - auto t2 = input_t.add({3.0}); - auto t3 = input_t.mul({2.0}); - auto t4 = t2.add(t1); - auto t5 = t4.add(t3); - auto t6 = t5.add(t3); - - std::vector aten_outputs = {t5, t6}; - - testValidate(&fusion, cg_outputs, {input}, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeAtTransposed3_CUDA) { - // Case 3 - // T2 = T1 * 0.979361 - // T3 = T2 * T0 - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(4); - fusion.addInput(tv0); - - tv0 = transpose(tv0, {{0, 1}, {1, 2}, {2, 3}, {3, 0}}); - - TensorView* tv1 = makeSymbolicTensor(4); - fusion.addInput(tv1); - - tv1 = transpose(tv1, {{0, 1}, {1, 2}, {2, 3}, {3, 0}}); - - TensorView* tv2 = mul(tv1, new Double(.979361)); - TensorView* tv3 = mul(tv2, tv0); - - fusion.addOutput(tv3); - - // Lets setup to actually run - while (tv3->nDims() > 1) - tv3->merge(0); - tv3->split(0, 128); - tv3->split(0, 4); - - tv0->computeAt(tv3, 1); - tv1->computeAt(tv3, 1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - - for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && - val->getValType().value() == ValType::TensorView) { - TensorView* tv = 
static_cast(val); - - tv->axis(1)->parallelize(ParallelType::Unroll); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({129, 127, 63, 65}, options); - at::Tensor t1 = at::rand_like(t0, options); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - auto t0_t = t0.permute({3, 0, 1, 2}); - auto t1_t = t1.permute({3, 0, 1, 2}); - auto t2 = t1_t.mul({0.979361}); - auto aten_output = t2.mul(t0_t); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeAtTransposed4_CUDA) { - // Case 4 - // T4 = T2 - T3 - // T5 = T1 + T4 - // T6 = T5 - T0 - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(4); - fusion.addInput(tv0); - - tv0 = transpose(tv0, {{0, 1}, {1, 2}, {2, 3}, {3, 0}}); - - TensorView* tv1 = makeSymbolicTensor(4); - fusion.addInput(tv1); - - tv1 = transpose(tv1, {{0, 1}, {1, 2}, {2, 3}, {3, 0}}); - - TensorView* tv2 = makeSymbolicTensor(4); - fusion.addInput(tv2); - - tv2 = transpose(tv2, {{0, 1}, {1, 2}, {2, 3}, {3, 0}}); - - TensorView* tv3 = makeSymbolicTensor(4); - fusion.addInput(tv3); - - tv3 = transpose(tv3, {{0, 1}, {1, 2}, {2, 3}, {3, 0}}); - - TensorView* tv4 = sub(tv2, tv3); - TensorView* tv5 = add(tv1, tv4); - TensorView* tv6 = sub(tv5, tv0); - - fusion.addOutput(tv6); - - // Lets setup to actually run - while (tv6->nDims() > 1) - tv6->merge(0); - tv6->split(0, 128); - tv6->split(0, 4); - - tv0->computeAt(tv6, 1); - tv1->computeAt(tv6, 1); - tv2->computeAt(tv6, 1); - tv3->computeAt(tv6, 1); - - tv6->axis(0)->parallelize(ParallelType::BIDx); - - for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && - val->getValType().value() == ValType::TensorView) { - TensorView* tv = static_cast(val); - - tv->axis(1)->parallelize(ParallelType::Unroll); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({129, 127, 63, 65}, options); - at::Tensor t1 = at::rand_like(t0, options); - at::Tensor t2 = at::rand_like(t0, options); - at::Tensor t3 = at::rand_like(t0, options); - - std::vector aten_inputs = {t0, t1, t2, t3}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - auto t0_t = t0.permute({3, 0, 1, 2}); - auto t1_t = t1.permute({3, 0, 1, 2}); - auto t2_t = t2.permute({3, 0, 1, 2}); - auto t3_t = t3.permute({3, 0, 1, 2}); - auto t4 = t2_t.sub(t3_t); - auto t5 = t1_t.add(t4); - auto aten_output = t5.sub(t0_t); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeAtTransposed5_CUDA) { - // Case 5 - // tv2 = tv0 + 2.0 - // tv3 = tv1 * tv2 - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - tv0 = transpose(tv0, {{0, 1}}); - TensorView* tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - tv1 = transpose(tv1, {{0, 1}}); - TensorView* tv2 = add(tv0, new Double(2.0)); - TensorView* tv3 = mul(tv1, tv2); - fusion.addOutput(tv3); - - tv3->merge(0); - tv3->split(-1, 8); - tv3->split(-1, 4); - - tv0->computeAt(tv3, 1); - tv1->computeAt(tv3, 1); - tv3->axis(0)->parallelize(ParallelType::BIDx); - - auto options = 
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({63, 65}, options); - at::Tensor t1 = at::rand_like(t0, options); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - auto t2 = t0.t().add(2.0); - auto aten_output = t1.t().mul(t2); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeAtTransposed6_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - tv0 = transpose(tv0, {{0, 1}}); - TensorView* tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - tv1 = transpose(tv1, {{0, 1}}); - TensorView* tv2 = add(tv0, new Double(2.0)); - TensorView* tv3 = mul(tv1, tv2); - fusion.addOutput(tv3); - - tv2->merge(0); - tv2->split(-1, 8); - tv2->split(-1, 4); - tv3->merge(0); - tv3->split(-1, 8); - - tv0->computeAt(tv3, 1); - tv1->computeAt(tv3, 1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({63, 65}, options); - at::Tensor t1 = at::rand_like(t0, options); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - auto t2 = t0.t().add(2.0); - auto aten_output = t1.t().mul(t2); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSegmentReducePointwise_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = makeSymbolicTensor(1); - TensorView* tv2 = makeSymbolicTensor(2); - - fusion->addInput(tv0); - fusion->addInput(tv1); - fusion->addInput(tv2); - - TensorView* tv3 = add(tv0, new Double(1)); // Group 0 - TensorView* tv4 = - max(tv3, {0}); // Group 0 (use max instead to avoid numerical issues) - TensorView* tv5 = add(tv4, tv1); // Group 0 (Non Broadcast after reduce, - // keeps normalization scheduler away) - TensorView* tv6 = add(tv5, tv2); // Group 1 (Broadcast after reduce) - - fusion->addOutput(tv6); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({128, 65}, options); - at::Tensor t1 = at::randn({65}, options); - at::Tensor t2 = at::randn({128, 65}, options); - - auto t3 = t0.add(1.0); - auto t4 = std::get<0>(at::max(t3, 0)); - auto t5 = t4.add(t1); - auto t6 = t5.add(t2); - - FusionExecutorCache executor_cache(std::move(fusion)); - - auto outputs = executor_cache.runFusionWithInputs({t0, t1, t2}); - - TORCH_CHECK( - executor_cache.getMostRecentKernelRuntime()->isSegmented(), - "segmentation didn't happen"); - TORCH_CHECK( - executor_cache.getMostRecentKernelRuntime() - ->fusionSegments() - ->groups() - .size() == 2, - "segmentation didn't happen as expected"); - - testValidate( - executor_cache.fusion(), outputs, {t0, t1, t2}, {t6}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionMultipleVectorize_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - TensorView* tv0 = makeContigTensor(1); - TensorView* tv1 = makeContigTensor(1); - - fusion->addInput(tv0); - fusion->addInput(tv1); - - TensorView* tv3 = add(tv0, tv1); - fusion->addOutput(tv3); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({40960}, options); - at::Tensor t1 = 
at::randn({40960}, options); - auto t2 = t0 + t1; - - FusionExecutorCache executor_cache(std::move(fusion)); - executor_cache.profile(true); - - auto outputs = executor_cache.runFusionWithInputs({t0, t1}); - auto runtime1 = executor_cache.getMostRecentKernelRuntime(); - auto log1 = executor_cache.getMostRecentExecutorInfo().pointwise_params; - TORCH_CHECK(log1.has_value()); - TORCH_CHECK(log1->vectorize); - - testValidate( - executor_cache.fusion(), outputs, {t0, t1}, {t2}, __LINE__, __FILE__); - - t0 = at::randn({40964}, options); - t1 = at::randn({40964}, options); - t2 = t0 + t1; - - outputs = executor_cache.runFusionWithInputs({t0, t1}); - auto runtime2 = executor_cache.getMostRecentKernelRuntime(); - auto log2 = executor_cache.getMostRecentExecutorInfo().pointwise_params; - TORCH_CHECK(log2.has_value()); - TORCH_CHECK(log2->vectorize); - - testValidate( - executor_cache.fusion(), outputs, {t0, t1}, {t2}, __LINE__, __FILE__); - - t0 = at::randn({40962}, options); - t1 = at::randn({40962}, options); - t2 = t0 + t1; - - outputs = executor_cache.runFusionWithInputs({t0, t1}); - auto runtime3 = executor_cache.getMostRecentKernelRuntime(); - auto log3 = executor_cache.getMostRecentExecutorInfo().pointwise_params; - TORCH_CHECK(log3.has_value()); - TORCH_CHECK(log3->vectorize); - - testValidate( - executor_cache.fusion(), outputs, {t0, t1}, {t2}, __LINE__, __FILE__); - - TORCH_CHECK(runtime1 == runtime2); - TORCH_CHECK(runtime1 != runtime3); -} - -TEST(NVFuserTest, FusionVectorizeSimple_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeContigTensor(3); - - fusion.addInput(tv0); - - auto tv1 = unaryOp(UnaryOpType::Sin, tv0); - - fusion.addOutput(tv1); - - auto tv0_cache = tv0->cache_after(); - - auto tv1_cache = tv1->cache_before(); - - tv1->merge(0); - tv1->merge(0); - tv1->split(0, 4); - tv1->split(0, 128); - - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(1)->parallelize(ParallelType::TIDx); - - tv0->computeAt(tv1, 2); - - tv0_cache->axis(2)->parallelize(ParallelType::Vectorize); - tv1->axis(2)->parallelize(ParallelType::Vectorize); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::empty({2, 6, 32}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}); - - at::Tensor aten_output = aten_input.sin(); - - testValidate( - &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSimpleVectorizeUnroll_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - // dimensionality of the problem - int nDims = 3; - - // Set up your input tensor views - TensorView* tv0 = makeContigTensor(nDims); - TensorView* tv1 = makeContigTensor(nDims); - - // Register your inputs - fusion.addInput(tv0); - fusion.addInput(tv1); - - // Do math with it, it returns a `Val*` but can be static_casted back to - // TensorView - TensorView* tv2 = add(tv1, new Double(2.0)); - TensorView* tv3 = add(tv0, tv2); - - // Register your outputs - fusion.addOutput(tv3); - - auto tv0_cache = tv0->cache_after(); - auto tv1_cache = tv1->cache_after(); - auto tv3_cache = tv3->cache_before(); - - // Do transformations, remember, transformations are outputs to inputs - // This doesn't have to be in this order - tv3->merge(1); - - // Split by n_threads - tv3->split(1, 2); - tv3->split(0, 3); - tv3->split(0, 1); - - // [bidx, unswitch, unroll{2}, tidx, vectorize{2}] - - // Parallelize TV3 - tv3->axis(0)->parallelize(ParallelType::BIDx); - 
tv3->axis(1)->parallelize(ParallelType::Unswitch); - tv3->axis(2)->parallelize(ParallelType::Unroll); - tv3->axis(3)->parallelize(ParallelType::TIDx); - - tv3->reorder({{4, 2}}); - // [bidx, unswitch, vectorize{2}, unroll{2}, tidx] - - TransformPropagator::from(tv3); - scheduler_utils::parallelizeAllLike(tv3, ir_utils::allTvs(&fusion)); - - tv0_cache->axis(2)->parallelize(ParallelType::Vectorize); - tv1_cache->axis(2)->parallelize(ParallelType::Vectorize); - tv3->axis(2)->parallelize(ParallelType::Vectorize); - - // For all inputs, computeAt the output inline, temporaries should be squeezed - // between them - tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined); - tv1->computeAt(tv3, -1, ComputeAtMode::MostInlined); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor input1 = at::randn({64, 2, 128}, options); - at::Tensor input2 = at::rand_like(input1); - at::Tensor output = at::empty_like(input1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({input1, input2}, {output}); - - at::Tensor tv2_ref = input2 + 2.0; - at::Tensor output_ref = input1 + tv2_ref; - - TORCH_CHECK(output_ref.equal(output)); -} - -TEST(NVFuserTest, FusionSegmentReduceSoftmax_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - std::vector input_shape{32, 64, 8}; - const int kReductionAxis = 1; - - auto tv0 = TensorViewBuilder() - .ndims(input_shape.size()) - .dtype(DataType::Double) - .build(); - - fusion->addInput(tv0); - - auto tv1 = add(tv0, new Double(1.0)); - auto tv2 = sum(tv1, {2}); // Group 0 - - auto output = softmax(tv2, kReductionAxis); // Group 1 - fusion->addOutput(output); - - auto options = at::TensorOptions().dtype(at::kDouble).device(at::kCUDA, 0); - at::Tensor at_x = at::randn(input_shape, options); - - FusionExecutorCache executor_cache(std::move(fusion)); - - auto outputs = executor_cache.runFusionWithInputs({at_x}); - - auto t1 = at_x.add(1.0); - auto t2 = t1.sum({2}); - auto t3 = at::_softmax(t2.to(at::kDouble), -1, false); - - auto optimized_fusion = executor_cache.getMostRecentKernelRuntime(); - TORCH_CHECK(optimized_fusion->isSegmented(), "segmentation didn't happen"); - TORCH_CHECK( - optimized_fusion->fusionSegments()->groups().size() == 2, - "segmentation didn't happen as expected"); - - testValidate( - executor_cache.fusion(), outputs, {at_x}, {t3}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSwizzle1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = mul(tv1, new Double(2)); - fusion.addOutput(tv2); - - tv2->split(0, 7); - tv2->split(0, 9); - - tv0->computeAt(tv2, 1); - - tv2->axis(0)->parallelize(ParallelType::BIDx); - - tv1->setMemoryType(MemoryType::Shared); - tv1->swizzle(SwizzleType::Transpose, {1, 2}); - - tv1->axis(1)->parallelize(ParallelType::TIDx); - tv1->axis(2)->parallelize(ParallelType::TIDy); - - tv2->axis(1)->parallelize(ParallelType::TIDx); - tv2->axis(2)->parallelize(ParallelType::TIDy); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({100}, options); - - std::vector aten_inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - auto aten_output = (t0 + 1) * 2; - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSwizzle2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto 
tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = mul(tv1, new Double(2)); - fusion.addOutput(tv2); - - tv1->split(-1, 4); - tv1->split(-2, 4); - - tv2->split(-1, 4); - tv2->split(-2, 4); - - tv0->computeAt(tv2, 1); - - tv2->reorder({{-1, -2}}); - - tv1->setMemoryType(MemoryType::Shared); - tv1->swizzle(SwizzleType::Transpose, {-2, -1}); - - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-2)->parallelize(ParallelType::TIDy); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(-2)->parallelize(ParallelType::TIDy); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({123}, options); - - std::vector aten_inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - auto aten_output = (t0 + 1) * 2; - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionTransposeWithSwizzle_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = transpose(tv0, {{0, 1}}); - fusion.addOutput(tv1); - - // tv0: [I0, I1] - // tv1: [I1, I0] - - const int BS = 32; - - // CTA tiling by BS*BS - tv1->split(1, BS); - tv1->split(0, BS); - tv1->reorder({{1, 2}}); - // tv1: [I1/BS, I0/BS, BS(I1), BS(I0)] - - // Create a smem buffer to cache each tile - auto tv0_cache = tv0->cache_after(); - tv0_cache->setMemoryType(MemoryType::Shared); - - tv0->computeAt(tv1, 2); - // tv0: [I0, I1] - // tv0_cache: [I1/BS, I0/BS, BS(I1), BS(I0)] - // tv1: [I1/BS, I0/BS, BS(I1), BS(I0)] - - // Assign each thread block to a tile - tv1->axis(0)->parallelize(ParallelType::BIDy); - tv1->axis(1)->parallelize(ParallelType::BIDx); - - // Thread mapping for each tile. For both of the input and output - // tiles, map TIDx to the fastest-changing dimension to facilitate - // coalesced gmem accesses. - tv1->axis(2)->parallelize(ParallelType::TIDy); - tv1->axis(3)->parallelize(ParallelType::TIDx); - // Note that the fastest-changing axis is next to the inner-most - // axis since computeAt reorders the axes as the output tensor. 
- tv0_cache->axis(2)->parallelize(ParallelType::TIDx); - tv0_cache->axis(3)->parallelize(ParallelType::TIDy); - - // Swizzles the smem cache to avoid bank conflicts - tv0_cache->swizzle(SwizzleType::Transpose, {3, 2}); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - const int bx = 100; - const int by = 200; - at::Tensor t0 = at::randn({bx, by}, options); - std::vector<IValue> aten_inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion(aten_inputs); - - auto aten_output = t0.t(); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionTransposeWithSwizzle1DThreadBlock_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = transpose(tv0, {{0, 1}}); - fusion.addOutput(tv1); - - // tv0: [I0, I1] - // tv1: [I1, I0] - - const int BS = 32; - const int BDIM = 256; - - // CTA tiling by BS*BS - tv1->split(1, BS); - tv1->split(0, BS); - tv1->reorder({{1, 2}}); - // tv1: [I1/BS, I0/BS, BS(I1), BS(I0)] - - // Create a smem buffer to cache each tile - auto tv0_cache = tv0->cache_after(); - tv0_cache->setMemoryType(MemoryType::Shared); - - tv0->computeAt(tv1, 2); - // tv0: [I0, I1] - // tv0_cache: [I1/BS, I0/BS, BS*BS/BDIM, BDIM] - // tv1: [I1/BS, I0/BS, BS*BS/BDIM, BDIM] - - // Transform the tile axes for 1D thread mapping - tv1->merge(-2, -1); - tv1->split(-1, BDIM); - // tv1: [I1/BS, I0/BS, BS*BS/BDIM, BDIM] - - // Transform the cache similarly but apply swizzle to the 2D tile axes. - tv0_cache->reorder({{-2, -1}}); - tv0_cache->swizzle(SwizzleType::Transpose, {2, 3}); - tv0_cache->merge(-2, -1); - tv0_cache->split(-1, BDIM); - // tv0: [I1/BS, I0/BS, BS*BS/BDIM, BDIM] - - // Assign each thread block to a tile - tv1->axis(0)->parallelize(ParallelType::BIDy); - tv1->axis(1)->parallelize(ParallelType::BIDx); - - // Thread mapping for each tile.
- tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv0_cache->axis(-1)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - const int bx = 100; - const int by = 200; - at::Tensor t0 = at::randn({bx, by}, options); - std::vector aten_inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion(aten_inputs); - - auto aten_output = t0.t(); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionGridPersistence_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = sum(tv0, {0}); - auto tv2 = broadcast(tv1, {true}); - auto tv3 = add(tv0, tv2); - fusion.addOutput(tv3); - - std::vector tvs = {tv1, tv2, tv3}; - for (auto tv : tvs) { - tv->split(0, 2); - tv->axis(0)->parallelize(ParallelType::BIDx); - tv->axis(1)->parallelize(ParallelType::BIDy); - } - - const int numel_x = 10; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto out = fe.runFusion({input}); - - auto aten_output = input.sum({0}).unsqueeze(-1).add(input); - - testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionGridPersistence2_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = sum(tv0, {0}); - auto tv2 = broadcast(tv1, {true, false}); - auto tv3 = add(tv0, tv2); - fusion.addOutput(tv3); - - std::vector tvs = {tv1, tv2, tv3}; - for (auto tv : tvs) { - tv->split(0, 2); - tv->axis(0)->parallelize(ParallelType::BIDx); - tv->axis(1)->parallelize(ParallelType::TIDy); - tv->axis(2)->parallelize(ParallelType::TIDx); - } - - const int numel_x = 10; - const int numel_y = 3; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x, numel_y}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto out = fe.runFusion({input}); - - auto aten_output = input.sum({0}).unsqueeze(0).add(input); - - testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionWelfordPersistence_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tvs = Welford(tv0, {0}); - auto tv4 = add(tvs.avg, tvs.var_sum); - auto tv5 = broadcast(tv4, {true}); - auto tv6 = add(tv0, tv5); - fusion.addOutput(tv6); - - std::vector schedule_tvs = { - tvs.avg, tvs.var_sum, tvs.n, tv5, tv6}; - - for (auto tv : schedule_tvs) { - tv->split(0, 2); - tv->axis(0)->parallelize(ParallelType::BIDx); - tv->axis(1)->parallelize(ParallelType::BIDy); - } - - const int numel_x = 10; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto out = fe.runFusion({input}); - - auto aten_output = (input.mean({0}) + (input.var({0}, false) * numel_x)) - .unsqueeze(-1) - .add(input); - - testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, 
FusionWelfordPersistence2_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tvs = Welford(tv0, {0}); - auto tv4 = add(tvs.avg, tvs.var_sum); - auto tv5 = broadcast(tv4, {true, false}); - auto tv6 = add(tv0, tv5); - fusion.addOutput(tv6); - - std::vector schedule_tvs = { - tvs.avg, tvs.var_sum, tvs.n, tv5, tv6}; - for (auto tv : schedule_tvs) { - tv->split(0, 2); - tv->axis(0)->parallelize(ParallelType::BIDx); - tv->axis(1)->parallelize(ParallelType::TIDy); - tv->axis(2)->parallelize(ParallelType::TIDx); - } - tv4->axis(0)->parallelize(ParallelType::TIDx); - - const int numel_x = 10; - const int numel_y = 3; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x, numel_y}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto out = fe.runFusion({input}); - - auto aten_output = (input.mean({0}) + (input.var({0}, false) * numel_x)) - .unsqueeze(0) - .add(input); - - testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionIssue633_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int dx = 10; - const int dy = 11; - const int dz = 12; - - auto tv0 = makeConcreteTensor({dx, dy, dz}); - fusion.addInput(tv0); - auto tv1 = makeConcreteTensor({dx, dy, 1}); - fusion.addInput(tv1); - auto tv2 = add(tv0, tv1); - fusion.addOutput(tv2); - - tv2->merge(1); - tv2->merge(0); - tv2->split(-1, 128); - - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(1)->parallelize(ParallelType::TIDx); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({dx, dy, dz}, options); - at::Tensor t1 = at::randn({dx, dy, 1}, options); - std::vector aten_inputs = {t0, t1}; - - auto cg_outputs = fe.runFusion(aten_inputs); - - auto aten_output = t0 + t1; - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionKirScoping_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(2)); - fusion.addOutput(tv2); - - tv2->merge(0); - tv2->split(0, 4); - tv0->computeAt(tv2, -1); - - GpuLower gpulw(&fusion); - - auto kir_tv1 = gpulw.lowerValue(tv1); - auto tv1_scope = kir_tv1->definition()->scope(); - TORCH_CHECK(tv1_scope != nullptr); - TORCH_CHECK(tv1_scope->owner()->as()); - - auto kir_tv2 = gpulw.lowerValue(tv2); - auto tv2_scope = kir_tv2->definition()->scope(); - TORCH_CHECK(tv2_scope != nullptr); - TORCH_CHECK(tv2_scope->owner()->as()); - - TORCH_CHECK(tv1_scope != tv2_scope); - - // tv1 and tv2 should have the same inner-most ForLoop - auto parent_scope = tv1_scope->owner()->scope(); - TORCH_CHECK(parent_scope == tv2_scope->owner()->scope()); - TORCH_CHECK(parent_scope->owner()->as()); - // There should be one more loop - parent_scope = parent_scope->owner()->scope(); - TORCH_CHECK(parent_scope->owner()->as()); - - // scope() should return nullptr for top-level exprs - auto top_level_scope = parent_scope->owner()->scope(); - TORCH_CHECK(top_level_scope == nullptr); -} - -TEST(NVFuserTest, FusionBroadcastAcrossComputeAt_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - std::vector shape{17, 19}; - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); 
- auto tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - auto tv2 = broadcast(tv0, {false, true}); - auto tv3 = add(tv1, tv2); - fusion.addOutput(tv3); - - tv3->split(1, 128); - tv0->computeAt(tv3, 2); - - for (auto tv : {tv2, tv3}) { - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({shape[0]}, options); - at::Tensor t1 = at::randn(shape, options); - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - auto t3 = t0.unsqueeze(-1).expand(shape) + t1; - - testValidate(&fusion, cg_outputs, aten_inputs, {t3}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionVectorizeMisalignedPointwise_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeContigTensor(2); - auto tv1 = makeContigTensor(2); - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv0, tv1); - fusion.addOutput(tv2); - - const int kTDX = 64; - const int kVecSize = 4; - const int kNumElems = kTDX * kVecSize; - - tv2->split(1, kNumElems); - - auto c0 = tv0->cache_after(); - auto c1 = tv1->cache_after(); - auto c2 = tv2->cache_before(); - - tv2->split(-1, kVecSize); - - c0->computeAt(tv2, -2); - c1->computeAt(tv2, -2); - - c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(-2)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - const int bx = 128; - const int by = 457; - at::Tensor t0 = at::randn({bx, by}, options); - at::Tensor t1 = at::randn({bx, by}, options); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - auto aten_output = t0 + t1; - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeContig_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeContigTensor(4); - auto tv1 = makeContigTensor(4); - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv0, tv1); - fusion.addOutput(tv2); - - tv2->reorder({{0, 1}, {1, 0}}); - tv2->merge(-2); - - const int kTDX = 64; - const int kVecSize = 2; - const int kNumElems = kTDX * kVecSize; - - tv2->split(-1, kNumElems); - - auto c0 = tv0->cache_after(); - auto c1 = tv1->cache_after(); - auto c2 = tv2->cache_before(); - - tv2->split(0, 128); - tv2->split(-1, kVecSize); - - c0->computeAt(tv2, -2); - c1->computeAt(tv2, -2); - - c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(1)->parallelize(ParallelType::BIDy); - tv2->axis(-2)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - const int n = 32; - const int c = 127; - const int h = 51; - const int w = 23; - at::Tensor t0 = at::randn({n, c, h, w}, options); - at::Tensor t1 = at::randn({n, c, h, w}, options); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - auto aten_output = t0 + t1; - 
testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeSymbolicPass_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - constexpr int kNumDims = 4; - constexpr int kTDX = 64; - constexpr int kVecSize = 2; - constexpr int kNumElems = kTDX * kVecSize; - - auto tv0 = makeSymbolicTensor(kNumDims); - auto tv1 = makeSymbolicTensor(kNumDims); - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv0, tv1); - fusion.addOutput(tv2); - - // Create caches for vectorization - auto c0 = tv0->cache_after(); - auto c1 = tv1->cache_after(); - auto c2 = tv2->cache_before(); - - // Merge all dimensions together except inner-most dim - for (const auto idx : c10::irange(kNumDims - 2)) { - tv2->merge(0); - } - // Split inner-most dim - tv2->split(-1, kNumElems); - tv2->split(-1, kVecSize); - TransformPropagator::from(tv2); - - c0->computeAt(tv2, -2); - c1->computeAt(tv2, -2); - - // Parallelization Strategy - c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(2)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - const int n = 5; - const int c = 3; - const int h = 51; - const int w = 257; - at::Tensor t0 = at::randn({n, c, h, w}, options); - at::Tensor t1 = at::randn({n, c, h, w}, options); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - auto aten_output = t0 + t1; - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeSymbolicFail_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - constexpr int kNumDims = 4; - constexpr int kTDX = 64; - constexpr int kVecSize = 2; - constexpr int kNumElems = kTDX * kVecSize; - std::vector bcast_shape{1, 1, 1, -1}; - - auto tv0 = makeContigTensor(kNumDims); - auto tv1 = TensorViewBuilder().shape(bcast_shape).build(); - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv0, tv1); - fusion.addOutput(tv2); - - // Create caches for vectorization - auto c0 = tv0->cache_after(); - auto c1 = tv1->cache_after(); - auto c2 = tv2->cache_before(); - - // Merge all dimensions together - // Backward merge order is necessary for vectorize validation - for (int idx = kNumDims - 1; idx > 0; --idx) { - tv2->merge(idx - 1); - } - tv2->split(-1, kNumElems); - tv2->split(-1, kVecSize); - TransformPropagator::from(tv2); - - c0->computeAt(tv2, -2); - c1->computeAt(tv2, -2); - - // Parallelization Strategy - c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - const int n = 32; - const int c = 128; - const int h = 51; - const int w = 23; - at::Tensor t0 = at::randn({n, c, h, w}, options); - at::Tensor t1 = at::randn({1, 1, 1, w}, options); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - // TODO: throw assertion - cannot merge non-contiguous vectorization axes - // Make sure compilation fails - 
ASSERT_ANY_THROW(fe.compileFusion(&fusion)); -} - -TEST(NVFuserTest, FusionVectorizeMisalignedRFactor_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeContigTensor(2); - auto tv1 = makeContigTensor(2); - - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv0, tv1); - - auto tv3 = sum(tv2, {-1}); - - fusion.addOutput(tv3); - - auto c0 = tv0->cache_after(); - auto c1 = tv1->cache_after(); - - tv3->split(-1, 128 * 4); - tv3->split(-1, 4); - // Reduce outer dim first - auto tv4 = tv3->rFactor({-3, -1}); - // Tv3 will reduce threads - - tv0->computeAt(tv3, 1); - tv1->computeAt(tv3, 1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - - tv0->computeAt(tv4, -2); - tv1->computeAt(tv4, -2); - - c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - - tv4->axis(-2)->parallelize(ParallelType::TIDx); - tv3->axis(1)->parallelize(ParallelType::TIDx); - - tv2->computeAt(tv4, -1); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - const int bx = 128; - const int by = 2050; - at::Tensor t0 = at::randn({bx, by}, options); - at::Tensor t1 = at::randn({bx, by}, options); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - auto aten_output = t0.add(t1).sum(1); - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionVectorizeMisalignedWrongDimFail_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeContigTensor(2); - auto tv1 = makeContigTensor(2); - - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv0, tv1); - fusion.addOutput(tv2); - - tv2->split(1, 16); - tv2->split(1, 64); - - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(2)->parallelize(ParallelType::TIDx); - - auto c0 = tv0->cache_after(); - auto c1 = tv1->cache_after(); - auto c2 = tv2->cache_before(); - - c0->computeAt(tv2, -2); - c1->computeAt(tv2, -2); - - std::vector vectorized_tvs = {c0, c1, tv2}; - for (auto tv : vectorized_tvs) { - tv->split(-1, 4); - // Vectorize the wrong dimension - tv->axis(-2)->parallelize(ParallelType::MisalignedVectorize); - } - - FusionExecutor fe; - // Make sure compilation fails - ASSERT_ANY_THROW(fe.compileFusion(&fusion)); -} - -TEST(NVFuserTest, FusionVectorizeMisalignedStride_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - auto tv1 = makeSymbolicTensor(2); - - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv0, tv1); - fusion.addOutput(tv2); - - const int kTDX = 64; - const int kVecSize = 4; - const int kNumElems = kTDX * kVecSize; - - tv2->split(1, kNumElems); - - auto c0 = tv0->cache_after(); - auto c1 = tv1->cache_after(); - - tv2->split(-1, kVecSize); - - c0->computeAt(tv2, -2); - c1->computeAt(tv2, -2); - - c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(-2)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - const int bx = 128; - const int by = 2049; - at::Tensor t0 = at::randn({bx, by}, options).index({"...", Slice(3)}); - at::Tensor t1 = at::randn({bx, by}, options).index({"...", Slice(3)}); - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = 
fe.runFusion(aten_inputs); - - auto aten_output = t0 + t1; - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionVectorizeMisalignedStrideFail_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - auto tv1 = makeSymbolicTensor(2); - - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv0, tv1); - fusion.addOutput(tv2); - - const int kTDX = 64; - const int kVecSize = 4; - const int kNumElems = kTDX * kVecSize; - - tv2->split(1, kNumElems); - - auto c0 = tv0->cache_after(); - auto c1 = tv1->cache_after(); - auto c2 = tv2->cache_before(); - - tv2->split(-1, kVecSize); - - c0->computeAt(tv2, -2); - c1->computeAt(tv2, -2); - - c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(-2)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - const int bx = 128; - const int by = 2049; - at::Tensor t0 = at::randn({bx, by}, options).index({"...", Slice(3)}); - at::Tensor t1 = at::randn({bx, by}, options).index({"...", Slice(3)}); - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - - // Failure because the input + output tensors do not have the same stride - ASSERT_ANY_THROW(fe.runFusion(aten_inputs)); -} - -TEST(NVFuserTest, FusionViewOutput_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - std::vector input_shape{2, 10, 40}; - std::vector output_shape{2, 10, 4, 10}; - - TensorView* x = makeSymbolicTensor(input_shape.size()); - TensorView* bias = makeSymbolicTensor(input_shape.size()); - fusion.addInput(x); - fusion.addInput(bias); - - auto x_add_bias = add(x, bias); - auto x_view = view(x_add_bias, input_shape, output_shape); - fusion.addOutput(x_view); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor at_x = at::randn(input_shape, options); - at::Tensor at_bias = at::randn(input_shape, options); - std::vector aten_inputs = {at_x, at_bias}; - - auto lparams = schedulePointwise(&fusion, aten_inputs); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(aten_inputs, lparams); - - auto at_x_add_bias = at_x + at_bias; - auto at_x_view = at::native::view(at_x_add_bias, output_shape); - - testValidate(&fusion, outputs, aten_inputs, {at_x_view}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionViewFailMismatchSize_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // The number of elements in input and output shapes do not match, - // so this view transformation is invalid. - // 2 * 10 * 40 != 2 * 50 * 4 * 10 - - std::vector input_shape{2, 10, 40}; - std::vector output_shape{2, 50, 4, 10}; - - TensorView* x = makeSymbolicTensor(input_shape.size()); - TensorView* bias = makeSymbolicTensor(input_shape.size()); - fusion.addInput(x); - fusion.addInput(bias); - - auto x_add_bias = add(x, bias); - ASSERT_ANY_THROW(view(x_add_bias, input_shape, output_shape)); -} - -TEST(NVFuserTest, FusionViewFailMulitDimInference_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Only one dimension can be inferred in the output shape. - // Otherwise, the size of the dimensions is ambiguous. 
- std::vector input_shape{2, 10, 40}; - std::vector output_shape{2, -1, 4, -1}; - - TensorView* x = makeSymbolicTensor(input_shape.size()); - TensorView* bias = makeSymbolicTensor(input_shape.size()); - fusion.addInput(x); - fusion.addInput(bias); - - auto x_add_bias = add(x, bias); - ASSERT_ANY_THROW(view(x_add_bias, input_shape, output_shape)); -} - -TEST(NVFuserTest, FusionViewFailReduction_CUDA) { - std::unique_ptr fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); - FusionGuard fg(&fusion); - - // View is only supported by the pointwise scheduler, - // so it should fail with any reduction operations - std::vector input_shape{2, 10, 40}; - std::vector output_shape{2, 10, 2, 20}; - - TensorView* x = makeSymbolicTensor(input_shape.size()); - TensorView* bias = makeSymbolicTensor(input_shape.size()); - fusion.addInput(x); - fusion.addInput(bias); - - auto x_add_bias = add(x, bias); - auto x_view = view(x_add_bias, input_shape, output_shape); - auto x_sum = sum(x_view, {-1}); - - fusion.addOutput(x_sum); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor at_x = at::randn(input_shape, options); - at::Tensor at_bias = at::randn(input_shape, options); - - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); - ASSERT_ANY_THROW(fusion_executor_cache.runFusionWithInputs({at_x, at_bias})); -} - -TEST(NVFuserTest, FusionViewFailPersistent_CUDA) { - std::unique_ptr fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); - FusionGuard fg(&fusion); - - // View is only supported by the pointwise scheduler, - // so it should fail with any persistent normalization operations - std::vector input_shape{2, 10, 40}; - std::vector output_shape{2, 10, 2, 20}; - - TensorView* x = makeSymbolicTensor(input_shape.size()); - TensorView* bias = makeSymbolicTensor(input_shape.size()); - fusion.addInput(x); - fusion.addInput(bias); - - auto x_add_bias = add(x, bias); - auto x_view = view(x_add_bias, input_shape, output_shape); - auto x_softmax = softmax(x_view, -1); - - fusion.addOutput(x_softmax); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor at_x = at::randn(input_shape, options); - at::Tensor at_bias = at::randn(input_shape, options); - - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); - ASSERT_ANY_THROW(fusion_executor_cache.runFusionWithInputs({at_x, at_bias})); -} - -void addViewGeluFusion( - std::vector& input_shape, - std::vector& output_shape) { - for (auto hasImplicitBroadcast : {false, true}) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* x = (hasImplicitBroadcast) - ? makeConcreteTensor(input_shape) - : makeSymbolicTensor(input_shape.size()); - TensorView* bias = (hasImplicitBroadcast) - ? 
makeConcreteTensor(input_shape) - : makeSymbolicTensor(input_shape.size()); - fusion.addInput(x); - fusion.addInput(bias); - - auto x_add_bias = add(x, bias); - auto x_view = view(x_add_bias, input_shape, output_shape); - auto y = gelu(x_view); - fusion.addOutput(y); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor at_x = at::randn(input_shape, options); - at::Tensor at_bias = at::randn(input_shape, options); - std::vector aten_inputs = {at_x, at_bias}; - - auto lparams = schedulePointwise(&fusion, aten_inputs); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(aten_inputs, lparams); - - auto at_x_add_bias = at_x + at_bias; - auto at_x_view = at::native::view(at_x_add_bias, output_shape); - auto at_y = at::gelu(at_x_view); - - testValidate(&fusion, outputs, aten_inputs, {at_y}, __LINE__, __FILE__); - } -} - -TEST(NVFuserTest, FusionViewSplit_CUDA) { - std::vector input_shape{80}; - std::vector output_shape{2, 4, 10}; - addViewGeluFusion(input_shape, output_shape); -} - -TEST(NVFuserTest, FusionViewBroadcast_CUDA) { - std::vector input_shape{80}; - std::vector output_shape{1, 80}; - addViewGeluFusion(input_shape, output_shape); -} - -TEST(NVFuserTest, FusionViewMerge_CUDA) { - std::vector input_shape{2, 40, 7}; - std::vector output_shape{560}; - addViewGeluFusion(input_shape, output_shape); -} - -TEST(NVFuserTest, FusionViewAllShmoo_CUDA) { - typedef std::vector shape; - typedef std::pair view_example; - - std::vector examples = { - {{1, 19, 1, 12, 7, 1, 99}, {1, 19, 1, 3, 2772}}, - {{3, 17, 80, 1}, {51, 1, 2, 4, 10}}, - {{3, 17, 80, 1, 9}, {51, 1, 2, 4, 10, 9}}, - {{2, 3, 4, 5}, {1, 6, 1, 2, 2, 5, 1}}, - {{22, 22, 2}, {22, 11, 1, 1, 4}}, - {{37, 9, 7, 6, 10}, {333, 2, 2, 3, 35}}, - {{1, 1, 333, 1}, {1, 1, 333, 1}}, - {{8, 1, 1, 8, 1, 8}, {8, 2, 4, 1, 8}}, - {{1, 333, 1}, {1, 37, 9, 1}}, - {{1, 333}, {1, 1, 1, 111, 1, 3}}, - {{22, 1, 22, 1}, {484}}, - {{1, 333, 1}, {333}}, - {{1, 27454, 1, 2}, {1, 7844, 1, 7}}, - {{1, 7844, 1, 7}, {1, 27454, 2}}}; - - for (auto e : examples) { - addViewGeluFusion(e.first, e.second); - } -} - -TEST(NVFuserTest, FusionViewInferShmoo_CUDA) { - typedef std::vector shape; - typedef std::pair view_example; - - std::vector examples = { - {{1, 19, 1, 12, 7, 1, 99}, {1, 19, -1, 3, 2772}}, - {{3, 17, 80, 1}, {51, 1, 2, 4, -1}}, - {{3, 17, 80, 1, 9}, {-1, 1, 2, 4, 10, 9}}, - {{2, 3, 4, 5}, {1, 6, 1, -1, 2, 5, 1}}, - {{22, 22, 2}, {22, -1, 1, 1, 4}}, - {{37, 9, 7, 6, 10}, {333, 2, -1, 3, 35}}, - {{1, 1, 333, 1}, {1, 1, -1, 1}}, - {{8, 1, 1, 8, 1, 8}, {8, 2, 4, 1, -1}}, - {{1, 333, 1}, {1, 37, -1, 1}}, - {{1, 333}, {1, 1, 1, -1, 1, 3}}, - {{22, 1, 22, 1}, {-1}}, - {{1, 333, 1}, {-1}}, - {{1, 27454, 1, 2}, {1, 7844, 1, -1}}, - {{1, 7844, 1, 7}, {1, -1, 2}}}; - - for (auto e : examples) { - addViewGeluFusion(e.first, e.second); - } -} - -void geluViewAddFusion( - std::vector input_shape, - std::vector output_shape) { - for (auto hasImplicitBroadcast : {false, true}) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* x = (hasImplicitBroadcast) - ? makeConcreteTensor(input_shape) - : makeSymbolicTensor(input_shape.size()); - TensorView* bias = (hasImplicitBroadcast) - ? 
makeConcreteTensor(output_shape) - : makeSymbolicTensor(output_shape.size()); - fusion.addInput(x); - fusion.addInput(bias); - - auto x_gelu = gelu(x); - auto x_view = view(x_gelu, input_shape, output_shape); - auto y = add(x_view, bias); - fusion.addOutput(y); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor at_x = at::randn(input_shape, options); - at::Tensor at_bias = at::randn(output_shape, options); - std::vector aten_inputs = {at_x, at_bias}; - - auto lparams = schedulePointwise(&fusion, aten_inputs); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(aten_inputs, lparams); - - auto at_x_gelu = at::gelu(at_x); - auto at_x_view = at::native::view(at_x_gelu, output_shape); - auto at_y = at_x_view + at_bias; - - testValidate(&fusion, outputs, aten_inputs, {at_y}, __LINE__, __FILE__); - } -} - -TEST(NVFuserTest, FusionViewStride_CUDA) { - typedef std::vector shape; - typedef std::pair view_example; - - std::vector examples = { - {{1, 27454, 2}, {1, 7844, 7}}, - {{1, 19, 1, 12, 7, 1, 99}, {1, 19, 1, 3, 2772}}, - {{1, 7844, 1, 7}, {1, 27454, 2}}}; - - for (auto e : examples) { - geluViewAddFusion(e.first, e.second); - } -} - -void geluViewBinaryAddFusion( - std::vector input_shape1, - std::vector input_shape2, - std::vector output_shape) { - for (auto hasImplicitBroadcast : {false, true}) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* x = (hasImplicitBroadcast) - ? makeConcreteTensor(input_shape1) - : makeSymbolicTensor(input_shape1.size()); - TensorView* bias = (hasImplicitBroadcast) - ? makeConcreteTensor(input_shape2) - : makeSymbolicTensor(input_shape2.size()); - fusion.addInput(x); - fusion.addInput(bias); - - auto x_gelu = gelu(x); - auto x_view = view(x_gelu, input_shape1, output_shape); - auto bias_view = view(bias, input_shape2, output_shape); - auto y = add(x_view, bias_view); - fusion.addOutput(y); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor at_x = at::randn(input_shape1, options); - at::Tensor at_bias = at::randn(input_shape2, options); - std::vector aten_inputs = {at_x, at_bias}; - - auto lparams = schedulePointwise(&fusion, aten_inputs); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(aten_inputs, lparams); - - auto at_x_gelu = at::gelu(at_x); - auto at_x_view = at::native::view(at_x_gelu, output_shape); - auto at_bias_view = at::native::view(at_bias, output_shape); - auto at_y = at_x_view + at_bias_view; - - testValidate(&fusion, outputs, aten_inputs, {at_y}, __LINE__, __FILE__); - } -} - -TEST(NVFuserTest, FusionViewBinary_CUDA) { - geluViewBinaryAddFusion({27454, 2}, {54908}, {7844, 7}); -} - -TEST(NVFuserTest, FusionVectorization1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - - auto tv1 = makeSymbolicTensor(2); - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv0, tv1); - fusion.addOutput(tv2); - - tv2->split(1, 16); - tv2->split(1, 64); - - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(2)->parallelize(ParallelType::TIDx); - - auto c0 = tv0->cache_after(); - auto c1 = tv1->cache_after(); - auto c2 = tv2->cache_before(); - - c0->computeAt(tv2, -2); - c1->computeAt(tv2, -2); - - std::vector vectorized_tvs = {c0, c1, tv2}; - for (auto tv : vectorized_tvs) { - tv->split(-1, 4); - tv->axis(-1)->parallelize(ParallelType::Vectorize); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - const int bx = 
128; - const int by = 2048; - at::Tensor t0 = at::randn({bx, by}, options); - at::Tensor t1 = at::randn({bx, by}, options); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - auto aten_output = t0 + t1; - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionVectorization2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - - auto tv1 = makeSymbolicTensor(2); - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv0, tv1); - fusion.addOutput(tv2); - - tv2->split(1, 16); - tv2->split(1, 64); - - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(2)->parallelize(ParallelType::TIDx); - - auto c0 = tv0->cache_after(); - auto c1 = tv1->cache_after(); - auto c2 = tv2->cache_before(); - - c0->computeAt(tv2, -2); - c1->computeAt(tv2, -2); - - std::vector vectorized_tvs = {c0, c1, tv2}; - for (auto tv : vectorized_tvs) { - tv->split(-1, 4); - // Vectorize the wrong dimension - tv->axis(-2)->parallelize(ParallelType::Vectorize); - } - - FusionExecutor fe; - // Make sure compilation fails - ASSERT_ANY_THROW(fe.compileFusion(&fusion)); -} - -TEST(NVFuserTest, FusionVectorization3_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - - auto tv1 = makeSymbolicTensor(2); - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv0, tv1); - fusion.addOutput(tv2); - - tv2->split(1, 16); - tv2->split(1, 64); - - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(2)->parallelize(ParallelType::TIDx); - - auto c0 = tv0->cache_after(); - auto c1 = tv1->cache_after(); - auto c2 = tv2->cache_before(); - - c0->computeAt(tv2, -2); - c1->computeAt(tv2, -2); - - std::vector vectorized_tvs = {c0, c1, tv2}; - for (auto tv : vectorized_tvs) { - tv->split(-1, 4); - tv->axis(-1)->parallelize(ParallelType::Vectorize); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - const int bx = 128; - const int by = 2049; - at::Tensor t0 = at::randn({bx, by}, options); - at::Tensor t1 = at::randn({bx, by}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - std::vector aten_inputs = {t0, t1}; - ASSERT_ANY_THROW(fe.runFusion(aten_inputs)); - - aten_inputs[0] = t0.index({"...", Slice(1)}); - aten_inputs[1] = t1.index({"...", Slice(1)}); - ASSERT_ANY_THROW(fe.runFusion(aten_inputs)); - - t0 = at::randn({bx, 2048}, options).index({"...", Slice(4)}); - t1 = at::randn({bx, 2048}, options).index({"...", Slice(4)}); - aten_inputs = {t0, t1}; - auto cg_outputs = fe.runFusion(aten_inputs); - - auto aten_output = t0 + t1; - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionVectorizationRFactor_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - - auto tv1 = makeSymbolicTensor(2); - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv0, tv1); - - auto tv3 = sum(tv2, {-1}); - - fusion.addOutput(tv3); - - tv3->split(-1, 128 * 4); - tv3->split(-1, 4); - // Reduce outer dim first - auto tv4 = tv3->rFactor({-3, -1}); - // Tv3 will reduce threads - - auto tv6 = tv0->cache_after(); - auto tv7 = tv1->cache_after(); - - tv0->computeAt(tv3, 1); - tv1->computeAt(tv3, 1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - - tv0->computeAt(tv4, -2); - tv1->computeAt(tv4, -2); - - 
tv6->axis(-1)->parallelize(ParallelType::Vectorize); - tv7->axis(-1)->parallelize(ParallelType::Vectorize); - - tv4->axis(-2)->parallelize(ParallelType::TIDx); - tv3->axis(1)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - const int bx = 128; - const int by = 2048; - at::Tensor t0 = at::randn({bx, by}, options); - at::Tensor t1 = at::randn({bx, by}, options); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - auto aten_output = t0.add(t1).sum(1); - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); - - auto t3 = t0.add(t1).sum(1); - - testValidate(&fusion, cg_outputs, aten_inputs, {t3}, __LINE__, __FILE__); -} - -// Unswitched loops with extent one may omit else clause. -TEST(NVFuserTest, FusionSizeOneLoop1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Progressively broadcast tensors - TensorView* tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - TensorView* tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - TensorView* tv2 = makeSymbolicTensor(3); - fusion.addInput(tv2); - - TensorView* tv3 = broadcast(tv0, {false, true}); - TensorView* tv4 = add(tv3, tv1); - TensorView* tv5 = add(tv4, tv2); - - fusion.addOutput(tv5); - - // Split inner dimension - tv5->split(1, 8); - // Merge middle dims with outer dimensions - tv5->merge(2); - tv5->merge(0); - - // tv5[I0*I1o, I1i*I2] - // Get a dim of size 1 to unswitch - tv5->split(0, 1, false); - - // Compute everything inline - tv0->computeAt(tv5, -1); - - tv5->axis(0)->parallelize(ParallelType::Unswitch); - tv5->axis(1)->parallelize(ParallelType::BIDx); - tv5->axis(2)->parallelize(ParallelType::TIDx); - - // Make sure the unswitched loop does not have an else clause. - GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto fl = dynamic_cast(kir_node.get())) { - if (fl->iter_domain()->parallelType() != ParallelType::Unswitch) { - continue; - } - if (auto pred = dynamic_cast(fl->parentScope())) { - TORCH_CHECK(!pred->hasElse()); - } - } - } - - const int x = 11; - const int y = 12; - const int z = 13; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({x}, options); - at::Tensor t1 = at::randn({x, y}, options); - at::Tensor t2 = at::randn({z, x, y}, options); - std::vector aten_inputs = {t0, t1, t2}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - auto t6 = (t0.unsqueeze(-1) + t1).unsqueeze(0) + t2; - - testValidate(&fusion, cg_outputs, aten_inputs, {t6}, __LINE__, __FILE__); -} - -// The unswitched loop has extent one but inner loops don't. The else -// part should not be omitted. -TEST(NVFuserTest, FusionSizeOneLoop2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int x = 15; - auto tv0 = makeConcreteTensor({x}); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - fusion.addOutput(tv1); - - tv1->split(-1, 4); - tv1->split(-2, 1); - - tv1->axis(-2)->parallelize(ParallelType::Unswitch); - - // Make sure the size-one unswitched loop does not omit the else clause. 
- GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto fl = dynamic_cast(kir_node.get())) { - if (fl->iter_domain()->parallelType() != ParallelType::Unswitch) { - continue; - } - if (auto pred = dynamic_cast(fl->parentScope())) { - TORCH_CHECK(pred->hasElse()); - } - } - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({x}, options); - std::vector aten_inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - auto t1 = t0 + 1; - - testValidate(&fusion, cg_outputs, aten_inputs, {t1}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionValidateParallelize1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - fusion.addOutput(tv2); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDy); - - // Invalid as tv1 and tv2 do have the same ParallelType - FusionExecutor fe; - ASSERT_ANY_THROW(fe.compileFusion(&fusion)); -} - -TEST(NVFuserTest, FusionValidateParallelize2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - fusion.addOutput(tv2); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDy); - tv1->setMemoryType(MemoryType::Shared); - - // tv1 and tv2 do have the same ParallelType, but tv1 is on shared - // memory, so it is valid - FusionExecutor fe; - fe.compileFusion(&fusion); -} - -TEST(NVFuserTest, FusionValidateParallelize3_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - fusion.addOutput(tv2); - - tv1->split(-1, 4); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->split(-1, 4); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - tv1->setMemoryType(MemoryType::Global); - - // tv1 and tv2 have the same shape and ParallelType - FusionExecutor fe; - fe.compileFusion(&fusion); -} - -TEST(NVFuserTest, FusionValidateParallelize4_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - fusion.addOutput(tv2); - - tv1->split(-1, 4); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->split(-1, 8); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - tv1->setMemoryType(MemoryType::Global); - - // tv1 and tv2 do not have the same shape - FusionExecutor fe; - ASSERT_ANY_THROW(fe.compileFusion(&fusion)); -} - -TEST(NVFuserTest, FusionValidateParallelize5_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - fusion.addOutput(tv2); - - tv1->split(-1, 4); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv1->setMemoryType(MemoryType::Shared); - - tv2->split(-1, 8); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - // tv1 and tv2 do not have the same shape, but tv1 is on shared - // memory, so it is valid - FusionExecutor fe; - fe.compileFusion(&fusion); -} - -// See issue #995 -TEST(NVFuserTest, FusionValidateParallelize6_CUDA) { - Fusion 
fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(3); - auto tv1 = makeSymbolicTensor(4); - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv0, new Double(1)); - auto tv3 = broadcast(tv2, {true, false, false, false}); - auto tv4 = add(tv3, tv1); - fusion.addOutput(tv4); - - tv4->merge(0); - tv4->merge(0); - tv4->merge(0); - tv4->split(0, 128); - tv4->split(0, 1); - tv4->split(0, 1); - - TransformPropagator::from(tv4); - - tv0->computeAt(tv2, 2); - tv3->computeAt(tv4, 2); - - tv4->axis(0)->parallelize(ParallelType::BIDx); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - // Validation should throw an exception saying the first axes of tv2 - // and tv3 have incompatible parallelization. See also issue #995. - ASSERT_ANY_THROW(fusion.printKernel()); -} - -TEST(NVFuserTest, FusionDAGMerging_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(5); - auto tv1 = makeSymbolicTensor(1); - fusion.addInput(tv0); - fusion.addInput(tv1); - - // Branch 0 - auto tv2 = sum(tv0, {0}); // 0 - auto tv3 = sum(tv2, {0}); // 1 - auto tv4 = sum(tv3, {0}); // 2 - auto tv5 = sum(tv4, {0}); // 3 - - // Branch 1 - auto tv6 = add(tv1, new Double(1)); // 4 - - // Merge - auto tv7 = add(tv6, tv5); // 5 - - // Maximum expected output groups (can improve overtime): - // {0}, {1}, {2}, {3,4,5} - // without final merge would have been {0}, {1}, {2}, {3,4}, {5} - - fusion.addOutput(tv7); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({2, 2, 2, 2, 2}, options); - at::Tensor t1 = at::randn({2}, options); - - auto fusion_segments = fusion.segment({t0, t1}); - TORCH_CHECK(fusion_segments->groups().size() <= 4); -} - -TEST(NVFuserTest, FusionDAGScalarMerging_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(3); - auto i0 = new Double(); - - fusion->addInput(tv0); - fusion->addInput(i0); - - auto i1 = add(i0, new Double(1.0)); - auto i2 = mul(i1, i1); - auto i3 = add(i2, i1); - - // Branch 0 - auto tv1 = sum(tv0, {0}); // 0 - auto tv2 = add(tv1, i2); - // Branch 1 - auto tv3 = sum(tv2, {0}); // 1 - auto tv4 = add(tv3, i3); - - auto tv5 = add(tv4, i0); - - fusion->addOutput(tv5); - - FusionExecutorCache executor_cache(std::move(fusion)); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({16, 16, 16}, options); - double s0 = 0.5; - - auto s1 = s0 + 1.0; - auto s2 = s1 * s1; - auto s3 = s2 + s1; - auto t1 = t0.sum({0}); - auto t2 = t1 + s2; - auto t3 = sum(t2, {0}); - auto t4 = t3 + s3; - auto t5 = t4 + s0; - - auto outputs = executor_cache.runFusionWithInputs({t0, s0}); - - TORCH_CHECK( - executor_cache.getMostRecentKernelRuntime()->isSegmented(), - "segmentation didn't happen"); - TORCH_CHECK( - executor_cache.getMostRecentKernelRuntime() - ->fusionSegments() - ->groups() - .size() == 2, - "segmentation didn't happen as expected"); - - testValidate( - executor_cache.fusion(), outputs, {t0, s0}, {t5}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionBlockReduceInSerialLoop_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - constexpr int M = 10; - constexpr int N = 20; - constexpr int K = 20; - - auto tv0 = makeSymbolicTensor(3); - auto tv1 = sum(tv0, {{1, 2}}); - fusion.addInput(tv0); - fusion.addOutput(tv1); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - 
tv1->axis(0)->parallelize(ParallelType::BIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({M, N, K}, options); - std::vector aten_inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(aten_inputs); - at::Tensor aten_output = t0.sum({1, 2}); - testValidate( - &fusion, outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionBlockWelfordInSerialLoop_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - constexpr int M = 10; - constexpr int N = 20; - constexpr int K = 20; - - auto tv0 = makeSymbolicTensor(3); - auto tvs = Welford(tv0, {{1, 2}}); - fusion.addInput(tv0); - auto tv_avg = tvs.avg; - auto tv_M2 = tvs.var_sum; - auto tv_N = tvs.n; - fusion.addOutput(tv_avg); - fusion.addOutput(tv_M2); - - tv_avg->axis(-1)->parallelize(ParallelType::TIDx); - tv_avg->axis(0)->parallelize(ParallelType::BIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({M, N, K}, options); - std::vector aten_inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(aten_inputs); - at::Tensor aten_avg = t0.mean({1, 2}); - at::Tensor aten_M2 = t0.var({1, 2}, false) * N * K; - testValidate( - &fusion, outputs, aten_inputs, {aten_avg, aten_M2}, __LINE__, __FILE__); -} - -// See Issue #716 -TEST(NVFuserTest, FusionIOTensorTrivialReductionRepro_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - constexpr int M = 10; - constexpr int N = 11; - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - std::vector reduction_axes = {1}; - std::vector broadcast_mask = {false, true}; - - auto tv0_bcast = broadcast(tv0, broadcast_mask); - auto path1_bcast = add(tv0_bcast, new Double(1.0)); - auto path1 = sum(path1_bcast, reduction_axes); - fusion.addOutput(path1); - - auto p = path1->split(1, 1); - path1->rFactor({1}); - path1->axis(0)->parallelize(ParallelType::BIDx); - tv0->computeAt(path1, 1); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({M}, options); - at::Tensor t0_ref = t0.clone(); - std::vector aten_inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - - // inplace op, we are adding t0 to itself - auto outputs = fe.runFusion(aten_inputs, {t0}); - - TORCH_CHECK(outputs[0].allclose(t0_ref.add(1))); -} - -TEST(NVFuserTest, FusionReductionPredicate_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = sum(tv0, {0}); - fusion.addOutput(tv1); - - auto tv2 = tv0->cache_after(); - - const int bdimx = 128; - tv1->split(1, bdimx); - tv1->split(1, 4); - tv1->split(1, 1); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(2)->parallelize(ParallelType::Unroll); - tv1->split(0, 10); - tv0->computeAt(tv1, 4); - - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - int numel_x = 650; - int numel_y = 102; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x, numel_y}, options); - at::Tensor cg_output = at::empty({numel_y}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({input}, {cg_output}); - - auto aten_output = input.to(at::kDouble).sum({0}); - - testValidate( - &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, 
FusionIssue728_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addOutput(tv0); - auto tv1 = makeSymbolicTensor(1); - fusion.addOutput(tv1); - auto tv2 = makeSymbolicTensor(1); - fusion.addOutput(tv2); - - auto tv3 = add(tv0, new Double(1)); - auto tv4 = add(tv3, tv1); - auto tv5 = add(tv4, new Double(1)); - auto tv6 = add(tv2, new Double(1)); - fusion.addOutput(tv5); - fusion.addOutput(tv6); - - // tv0 -> tv3 -+ - // tv1 --------+-> tv4 -> tv5 - // - // tv2 -> tv6 - - auto all_vals_under_tv3 = - DependencyCheck::getAllValsBetween({tv3}, fusion.outputs()); - std::unordered_set included_tensors({tv3, tv4, tv5}); - for (auto tv : included_tensors) { - TORCH_CHECK( - std::find(all_vals_under_tv3.begin(), all_vals_under_tv3.end(), tv) != - all_vals_under_tv3.end(), - "TV", - tv->name(), - " not found"); - } - for (auto tv : ir_utils::filterByType(fusion.vals())) { - if (included_tensors.find(tv) == included_tensors.end()) { - TORCH_CHECK( - std::find(all_vals_under_tv3.begin(), all_vals_under_tv3.end(), tv) == - all_vals_under_tv3.end(), - "TV", - tv->name(), - " should not be found"); - } - } - - auto no_dependency = DependencyCheck::getAllValsBetween({}, fusion.outputs()); - TORCH_CHECK(no_dependency.empty(), "No val should be returned"); - - auto no_dep_path = DependencyCheck::getAllValsBetween({tv0, tv1}, {tv6}); - TORCH_CHECK(no_dep_path.empty(), "No val should be returned"); - - auto no_dep_path2 = DependencyCheck::getAllValsBetween({tv2}, {tv5}); - TORCH_CHECK(no_dep_path2.empty(), "No val should be returned"); - - auto just_tv3 = DependencyCheck::getAllValsBetween({tv3}, {tv3}); - TORCH_CHECK( - just_tv3.size() == 1 && *(just_tv3.begin()) == tv3, - "Only tv3 should be included"); -} - -TEST(NVFuserTest, FusionIssue757_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = sum(tv0, {1}); - auto tv2 = broadcast(tv1, {false, true}); - auto tv3 = makeSymbolicTensor(2); - fusion.addInput(tv3); - auto tv4 = add(tv2, tv3); - fusion.addOutput(tv4); - - tv1->computeAt(tv4, -1); - - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - - int numel_x = 650; - int numel_y = 102; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - at::Tensor t3 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0, t3}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t1 = t0.sum({1}); - auto t2 = t1.unsqueeze(-1).expand({numel_x, numel_y}); - auto t4 = t2 + t3; - - testValidate(&fusion, outputs, inputs, {t4}, __LINE__, __FILE__); -} - -// See issue #759 -TEST(NVFuserTest, FusionPredicatedBlockBroadcast_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = sum(tv0, {1}); - auto tv2 = broadcast(tv1, {false, true}); - auto tv3 = makeSymbolicTensor(2); - fusion.addInput(tv3); - auto tv4 = add(tv2, tv3); - fusion.addOutput(tv4); - - tv4->split(0, 4); - tv1->computeAt(tv4, -1); - - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(1)->parallelize(ParallelType::TIDy); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(1)->parallelize(ParallelType::TIDy); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - - int numel_x = 100; - int numel_y = 101; - - auto options = 
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - at::Tensor t3 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0, t3}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t1 = t0.sum({1}); - auto t2 = t1.unsqueeze(-1).expand({numel_x, numel_y}); - auto t4 = t2 + t3; - - testValidate(&fusion, outputs, inputs, {t4}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSegmentVerticalMerge_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(3); - - fusion->addInput(tv0); - // {first kernel} - auto tv1 = sum(tv0, {0}); - auto tv2 = add(tv1, tv0); - auto tv3 = sum(tv2, {0}); - auto tv4 = add(tv3, tv0); - auto tv5 = sum(tv4, {0}); - auto tv6 = sum(tv5, {0}); - // {second kernel} - auto tv7 = add(tv6, tv5); - auto tv8 = add(tv7, tv5); - auto tv9 = sum(tv8, {0}); - - fusion->addOutput(tv9); - - SegmentCandidateFinderOptions segment_options; - segment_options.run_herrmann_merge = false; - segment_options.run_final_merge = false; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({2, 2, 2}, options); - - auto segmented_fusion = - SegmentCandidateFinder::segment(fusion.get(), {t0}, segment_options); - - TORCH_CHECK(segmented_fusion->groups().size() == 2); -} - -TEST(NVFuserTest, FusionSegmentHorizontalMerge_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(3); - auto i0 = new Double(); - - fusion->addInput(tv0); - fusion->addInput(i0); - - // Branch 0 {first kernel} - auto tv1 = sum(tv0, {0}); - auto tv2 = add(tv0, i0); - auto tv3 = unaryOp(UnaryOpType::Rsqrt, tv2); - auto tv4 = sum(tv3, {0}); - - // Branch 1 {first kernel} - auto tv5 = unaryOp(UnaryOpType::Rsqrt, tv3); - auto tv6 = sum(tv5, {0}); - - // Incompatible {second kernel} - auto tv7 = sum(tv6, {0}); - - fusion->addOutput(tv1); - fusion->addOutput(tv4); - fusion->addOutput(tv7); - - SegmentCandidateFinderOptions segment_options; - segment_options.run_herrmann_merge = false; - segment_options.run_final_merge = false; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({2, 2, 2}, options); - - auto segmented_fusion = - SegmentCandidateFinder::segment(fusion.get(), {t0, 1.0}, segment_options); - - TORCH_CHECK(segmented_fusion->groups().size() == 2); -} - -TEST(NVFuserTest, FusionSegmentMixReduction_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(3); - - fusion->addInput(tv0); - - // def of tv1 in kernel 1 through horizontal - auto tv1 = sum(tv0, {0, 1}); - // kernel 2 - auto tv2 = sum(tv0, {2}); - auto tv3 = broadcast(tv2, {false, false, true}); - auto tv4 = add(tv0, tv3); - auto tv5 = sum(tv4, {2}); - // end of kernel 2 - // kernel 1 - auto tv6 = unaryOp(UnaryOpType::Rsqrt, tv0); - auto tv7 = sum(tv6, {0, 1}); - auto tv8 = sum(tv6, {0, 1}); - - fusion->addOutput(tv1); - fusion->addOutput(tv5); - fusion->addOutput(tv7); - fusion->addOutput(tv8); - - SegmentCandidateFinderOptions segment_options; - segment_options.run_herrmann_merge = false; - segment_options.run_final_merge = false; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({2, 2, 2}, options); - - auto segmented_fusion = - SegmentCandidateFinder::segment(fusion.get(), {t0}, segment_options); - - 
TORCH_CHECK(segmented_fusion->groups().size() <= 2); -} - -TEST(NVFuserTest, FusionSBAR_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // N, H, W, C format - std::vector input_shape{656, 7, 7, 64}; - - auto x = makeContigTensor(4); - auto y = makeContigTensor(4); - auto weight = makeContigTensor(1); - auto bias = makeContigTensor(1); - - fusion.addInput(x); - fusion.addInput(y); - fusion.addInput(weight); - fusion.addInput(bias); - - const size_t kNumberOfDims = x->nDims(); - std::vector broadcast_mask(kNumberOfDims, false); - for (const auto axis : c10::irange(kNumberOfDims - 1)) { - broadcast_mask[axis] = true; - } - - auto weight_bcast = broadcast(weight, broadcast_mask); - auto scale = mul(x, weight_bcast); - auto bias_bcast = broadcast(bias, broadcast_mask); - auto scale_bias = add(scale, bias_bcast); - auto scale_bias_add = add(scale_bias, y); - auto scale_bias_add_relu = unaryOp(UnaryOpType::Relu, scale_bias_add); - - fusion.addOutput(scale_bias_add_relu); - - // inputs - at::manual_seed(0); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor at_x = at::randn(input_shape, options); - at::Tensor at_y = at::randn(input_shape, options); - at::Tensor at_weight = at::ones({input_shape[3]}, options); - at::Tensor at_bias = at::zeros({input_shape[3]}, options); - - // inputs - std::vector inputs = {at_x, at_y, at_weight, at_bias}; - - // outputs - std::vector outputs; - - auto lparams = schedulePointwise(&fusion, c10::ArrayRef(inputs)); - - FusionExecutor executor; - executor.compileFusion(&fusion); - - outputs = executor.runFusion(c10::ArrayRef(inputs), lparams); - - auto at_scale = at::mul(at_x, at_weight); - auto at_scale_bias = at::add(at_scale, at_bias); - auto pwise_add = at::add(at_scale_bias, at_y); - auto output = at::relu(pwise_add); - - testValidate(&fusion, outputs, inputs, {output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSingleElement_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(0); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(2.5)); - - auto tv2 = add(tv1, new Double(3.5)); - fusion.addOutput(tv2); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({}, options); - - at::Tensor cg_output = at::empty({}, options); - - auto lparams = schedulePointwise(&fusion, {input}); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({input}, {cg_output}, lparams); - - auto aten_output = input.add(2.5).add(3.5); - - testValidate( - &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionBNBackwardRepro_CUDA) { - std::unique_ptr fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); - FusionGuard fg(&fusion); - - int batch = 4; - int c = 4; - int h = 4; - int w = 4; - int numDims = 4; - - auto input = makeSymbolicTensor(numDims); - fusion.addInput(input); - auto weight = makeSymbolicTensor(1); - fusion.addInput(weight); - auto running_mean = makeSymbolicTensor(1); - fusion.addInput(running_mean); - auto running_var = makeSymbolicTensor(1); - fusion.addInput(running_var); - auto save_mean = makeSymbolicTensor(1); - fusion.addInput(save_mean); - auto save_invstd = makeSymbolicTensor(1); - fusion.addInput(save_invstd); - - auto grad_out_prev = makeSymbolicTensor(numDims); - fusion.addInput(grad_out_prev); - auto gt_0 = - makeSymbolicTensor(numDims); // single tensor broadcasted is dangerous. 
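// [editor note] Neither BNBackwardRepro test validates values, so as a hedged aside, this is
// the training-mode reference math the batch_norm_backward call below corresponds to, written
// with plain ATen ops for an NCHW input (illustrative names only, not the nvfuser API):
//   x_hat       = (x - save_mean) * save_invstd
//   grad_bias   = sum(grad_out)          over {N, H, W}
//   grad_weight = sum(grad_out * x_hat)  over {N, H, W}
//   grad_input  = weight * save_invstd / M * (M * grad_out - grad_bias - x_hat * grad_weight)
// with M = N * H * W and the per-channel terms broadcast back to 4-D.
auto bn_backward_reference = [](const at::Tensor& x,
                                const at::Tensor& grad_y,
                                const at::Tensor& w,
                                const at::Tensor& mean,
                                const at::Tensor& invstd) {
  const double m = static_cast<double>(x.numel() / x.size(1));
  auto mean4 = mean.view({1, -1, 1, 1});
  auto invstd4 = invstd.view({1, -1, 1, 1});
  auto x_hat = (x - mean4) * invstd4;
  auto grad_bias = grad_y.sum({0, 2, 3});
  auto grad_weight = (grad_y * x_hat).sum({0, 2, 3});
  auto grad_input = w.view({1, -1, 1, 1}) * invstd4 / m *
      (grad_y * m - grad_bias.view({1, -1, 1, 1}) -
       x_hat * grad_weight.view({1, -1, 1, 1}));
  return std::make_tuple(grad_input, grad_weight, grad_bias);
};
(void)bn_backward_reference; // reference only; the repro below just checks the fusion runs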
- fusion.addInput(gt_0); - - auto gt_bool = binaryOp(BinaryOpType::GT, gt_0, new Int(1)); - auto gt_float = castOp(DataType::Float, gt_bool); - - auto grad_out = mul(grad_out_prev, gt_float); - - Val* eps_ptr = new Double(1e-5); - - auto grads = batch_norm_backward( - input, - grad_out, - weight, - running_mean, - running_var, - save_mean, - save_invstd, - true, - eps_ptr, - {true, true, true}); - - fusion.addOutput(grads.grad_input); - fusion.addOutput(grads.grad_weight); - fusion.addOutput(grads.grad_bias); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input0 = at::randn({batch, c, h, w}, options); - at::Tensor input1 = at::randn({c}, options); - at::Tensor input2 = at::randn_like(input1); - at::Tensor input3 = at::randn_like(input1); - at::Tensor input4 = at::randn_like(input1); - at::Tensor input5 = at::randn_like(input1); - at::Tensor input6 = at::randn_like(input0); - at::Tensor input7 = at::randn_like(input0); - - FusionExecutorCache fec(std::move(fusion_ptr)); - std::vector inputs = { - input0, input1, input2, input3, input4, input5, input6, input7}; - auto outputs = fec.runFusionWithInputs(inputs); -} - -// TODO: We only changed inputs, merge this with the test above. -TEST(NVFuserTest, FusionBNBackwardRepro2_CUDA) { - std::unique_ptr fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); - FusionGuard fg(&fusion); - - int batch = 2; - int c = 81; - int h = 1; - int w = 1; - int numDims = 4; - - // auto input = makeSymbolicTensor(numDims); - auto input = makeConcreteTensor({-1, -1, 1, 1}); - fusion.addInput(input); - auto weight = makeSymbolicTensor(1); - fusion.addInput(weight); - auto running_mean = makeSymbolicTensor(1); - fusion.addInput(running_mean); - auto running_var = makeSymbolicTensor(1); - fusion.addInput(running_var); - auto save_mean = makeSymbolicTensor(1); - fusion.addInput(save_mean); - auto save_invstd = makeSymbolicTensor(1); - fusion.addInput(save_invstd); - - // auto grad_out_prev = makeSymbolicTensor(numDims); - auto grad_out_prev = makeConcreteTensor({-1, -1, 1, 1}); - fusion.addInput(grad_out_prev); - // auto gt_0 = - // makeSymbolicTensor(numDims); // single tensor broadcasted is dangerous. 
- auto gt_0 = makeConcreteTensor({-1, -1, 1, 1}); - fusion.addInput(gt_0); - - auto gt_bool = binaryOp(BinaryOpType::GT, gt_0, new Int(1)); - auto gt_float = castOp(DataType::Float, gt_bool); - - auto grad_out = mul(grad_out_prev, gt_float); - - Val* eps_ptr = new Double(1e-5); - - auto grads = batch_norm_backward( - input, - grad_out, - weight, - running_mean, - running_var, - save_mean, - save_invstd, - true, - eps_ptr, - {true, true, true}); - - fusion.addOutput(grads.grad_input); - fusion.addOutput(grads.grad_weight); - fusion.addOutput(grads.grad_bias); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input0 = at::randn({batch, c, h, w}, options); - at::Tensor input1 = at::randn({c}, options); - at::Tensor input2 = at::randn_like(input1); - at::Tensor input3 = at::randn_like(input1); - at::Tensor input4 = at::randn_like(input1); - at::Tensor input5 = at::randn_like(input1); - at::Tensor input6 = at::randn_like(input0); - at::Tensor input7 = at::randn_like(input0); - - FusionExecutorCache fec(std::move(fusion_ptr)); - std::vector inputs = { - input0, input1, input2, input3, input4, input5, input6, input7}; - auto outputs = fec.runFusionWithInputs(inputs); -} - -TEST(NVFuserTest, FusionBNRepro_CUDA) { - std::unique_ptr fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); - FusionGuard fg(&fusion); - - const bool kTraining = true; - const float kMomentum = 0.1; - const float kEps = 1e-5; - - int batch = 14; - int c = 65; - int h = 7; - int w = 7; - int numDims = 4; - - auto input = makeSymbolicTensor(numDims); - fusion.addInput(input); - auto weight = makeSymbolicTensor(1); - fusion.addInput(weight); - auto bias = makeSymbolicTensor(1); - fusion.addInput(bias); - auto running_mean = makeSymbolicTensor(1); - fusion.addInput(running_mean); - auto running_var = makeSymbolicTensor(1); - fusion.addInput(running_var); - - auto momentum_ptr = new Double(kMomentum); - auto eps_ptr = new Double(kEps); - - auto result = batch_norm( - input, - weight, - bias, - running_mean, - running_var, - kTraining, - momentum_ptr, - eps_ptr); - - fusion.addOutput(result.output); - fusion.addOutput(result.mean); - fusion.addOutput(result.invstd); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input1 = at::randn({batch, c, h, w}, options); - at::Tensor input2 = at::randn({c}, options); - at::Tensor input3 = at::randn_like(input2); - at::Tensor input4 = at::randn_like(input2); - at::Tensor input5 = at::randn_like(input2); - - auto input1_ref = input1.clone(); - auto input2_ref = input2.clone(); - auto input3_ref = input3.clone(); - auto input4_ref = input4.clone(); - auto input5_ref = input5.clone(); - - FusionExecutorCache fec(std::move(fusion_ptr)); - std::vector aten_inputs = {input1, input2, input3, input4, input5}; - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); - - auto at_results = at::native_batch_norm( - input1_ref, - input2_ref, - input3_ref, - input4_ref, - input5_ref, - kTraining, - kMomentum, - kEps); - - auto at_output = std::get<0>(at_results); - auto at_mean = std::get<1>(at_results); - auto at_invstd = std::get<2>(at_results); - - std::vector aten_outputs = { - input4_ref, input5_ref, at_output, at_mean, at_invstd}; - - testValidate( - &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionBNRepro2_CUDA) { - std::unique_ptr fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); - FusionGuard fg(&fusion); - - const bool 
kTraining = true; - const float kMomentum = 0.1; - const float kEps = 1e-5; - - int batch = 2; - int c = 4; - int h = 17; - int w = 17; - int numDims = 4; - - auto input = makeSymbolicTensor(numDims); - fusion.addInput(input); - - Val* momentum_ptr = new Double(kMomentum); - Val* eps_ptr = new Double(kEps); - - auto result = batch_norm( - input, - nullptr, - nullptr, - nullptr, - nullptr, - kTraining, - momentum_ptr, - eps_ptr); - - fusion.addOutput(result.output); - fusion.addOutput(result.mean); - fusion.addOutput(result.invstd); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input1 = at::randn({batch, c, h, w}, options); - - auto input1_ref = input1.clone(); - at::Tensor r_m; - at::Tensor r_v; - at::Tensor weight; - at::Tensor bias; - - FusionExecutorCache fec(std::move(fusion_ptr)); - std::vector aten_inputs = {input1}; - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); - - auto at_results = at::native_batch_norm( - input1_ref, r_m, r_v, weight, bias, kTraining, kMomentum, kEps); - - auto at_output = std::get<0>(at_results); - auto at_mean = std::get<1>(at_results); - auto at_invstd = std::get<2>(at_results); - - std::vector aten_outputs = {at_output, at_mean, at_invstd}; - - testValidate( - &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionZeroSizeTensorPW_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = makeConcreteTensor({0}); - fusion.addInput(tv1); - - auto tv2 = add(tv0, new Double(2.5)); - fusion.addOutput(tv2); - - auto tv3 = makeConcreteTensor({0}); - fusion.addOutput(tv3); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor input0 = at::randn({2}, options); - at::Tensor input1 = at::randn({0}, options); - at::Tensor cg_output2 = at::empty({2}, options); - at::Tensor cg_output3 = at::empty({0}, options); - - auto lparams = schedulePointwise(&fusion, {input0, input1}); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({input0, input1}, {cg_output2, cg_output3}, lparams); - - auto aten_output2 = input0.add(2.5); - at::Tensor aten_output3 = at::empty({0}, options); - - testValidate( - &fusion, - {cg_output2, cg_output3}, - {input0, input1}, - {aten_output2, aten_output3}, - __LINE__, - __FILE__); -} - -TEST(NVFuserTest, FusionZeroSizeTensorReduction_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = makeConcreteTensor({0}); - fusion.addInput(tv1); - - auto tv2 = sum(tv0, {1}); - fusion.addOutput(tv2); - - auto tv3 = makeConcreteTensor({0}); - fusion.addOutput(tv3); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor input0 = at::randn({2, 4}, options); - at::Tensor input1 = at::randn({0}, options); - at::Tensor cg_output2 = at::empty({2}, options); - at::Tensor cg_output3 = at::empty({0}, options); - - auto reduction_params = getReductionHeuristics(&fusion, {input0, input1}); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - scheduleReduction(&fusion, reduction_params.value()); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - - auto lparams = reduction_params.value().lparams; - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({input0, input1}, lparams); - auto aten_output2 = input0.sum({1}); - at::Tensor aten_output3 = at::empty({0}, options); 
- - testValidate( - &fusion, - cg_outputs, - {input0, input1}, - {aten_output2, aten_output3}, - __LINE__, - __FILE__, - "", - lparams); -} - -TEST(NVFuserTest, FusionZeroSizeTensorNormalization_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = makeConcreteTensor({0}); - fusion.addInput(tv1); - - auto tv2 = sum(tv0, {0}); - auto tv3 = broadcast(tv2, {true, false}); - auto tv4 = add(tv0, tv3); - fusion.addOutput(tv4); - - auto tv5 = makeConcreteTensor({0}); - fusion.addOutput(tv5); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor input0 = at::randn({2, 4}, options); - at::Tensor input1 = at::randn({0}, options); - at::Tensor cg_output2 = at::empty({2, 4}, options); - at::Tensor cg_output3 = at::empty({0}, options); - - auto reduction_params = getPersistentHeuristics(&fusion, {input0, input1}); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - schedulePersistentKernel(&fusion, reduction_params.value()); - - auto lparams = reduction_params.value().lparams; - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({input0, input1}, lparams); - auto aten_output2 = input0.sum({0}).add(input0); - at::Tensor aten_output3 = at::empty({0}, options); - - testValidate( - &fusion, - cg_outputs, - {input0, input1}, - {aten_output2, aten_output3}, - __LINE__, - __FILE__, - "", - lparams); -} - -TEST(NVFuserTest, FusionSegmentIoAlias_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = makeSymbolicTensor(1); - TensorView* tv2 = makeSymbolicTensor(2); - - fusion->addInput(tv0); - fusion->addInput(tv1); - fusion->addInput(tv2); - - TensorView* tv3 = add(tv0, new Double(1)); // Group 0 - TensorView* tv4 = - max(tv3, {0}); // Group 0 (use max instead to avoid numerical issues) - TensorView* tv5 = add(tv4, tv1); // Group 0 (Non Broadcast after reduce, - // keeps normalization scheduler away) - TensorView* tv6 = add(tv5, tv2); // Group 1 (Broadcast after reduce) - - fusion->addOutput(tv6); - // Note: test alias; - fusion->aliasOutputToInput(tv6, tv0); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({128, 65}, options); - at::Tensor t1 = at::randn({65}, options); - at::Tensor t2 = at::randn({128, 65}, options); - - auto t3 = t0.add(1.0); - auto t4 = std::get<0>(at::max(t3, 0)); - auto t5 = t4.add(t1); - auto t6 = t5.add(t2); - - FusionExecutorCache executor_cache(std::move(fusion)); - - auto outputs = executor_cache.runFusionWithInputs({t0, t1, t2}); - - // validating aliasing - TORCH_INTERNAL_ASSERT(outputs[0].data_ptr() == t0.data_ptr()); - - TORCH_CHECK( - executor_cache.getMostRecentKernelRuntime()->isSegmented(), - "segmentation didn't happen"); - TORCH_CHECK( - executor_cache.getMostRecentKernelRuntime() - ->fusionSegments() - ->groups() - .size() == 2, - "segmentation didn't happen as expected"); - - testValidate( - executor_cache.fusion(), outputs, {t0, t1, t2}, {t6}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionWelford1Output_CUDA) { - auto fusion_ptr = std::make_unique(); - auto fusion = fusion_ptr.get(); - FusionGuard fg(fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion->addInput(tv0); - - auto tvs = Welford(tv0, {1}); - fusion->addOutput(tvs.var_sum); - FusionExecutorCache executor_cache(std::move(fusion_ptr)); - - auto options = 
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({128, 65}, options); - auto outputs = executor_cache.runFusionWithInputs({t0}); - - auto t1 = t0.var({1}, false) * 65; - testValidate(fusion, outputs, {t0}, {t1}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionTranslate1Welford_CUDA) { - auto fusion_ptr = std::make_unique(); - auto fusion = fusion_ptr.get(); - FusionGuard fg(fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion->addInput(tv0); - - auto tvs = Welford(tv0, {1}); - fusion->addOutput(tvs.var_sum); - FusionExecutorCache executor_cache(std::move(fusion_ptr)); - - auto run_test = [&executor_cache, - fusion](auto inner_size) -> FusionKernelRuntime* { - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({128, inner_size}, options); - auto outputs = executor_cache.runFusionWithInputs({t0}); - // Square sums does not fit well in the testValidate assumptions, - // so we just compare the divided output here. - outputs[0] /= inner_size; - auto t1 = t0.var({1}, false); - testValidate(fusion, outputs, {t0}, {t1}, __LINE__, __FILE__); - - return executor_cache.getMostRecentKernelRuntime(); - }; - - // Run a translated welford - auto runtime1 = run_test(64); - // Check it was translated - TORCH_CHECK(runtime1->singleKernelFusion()->unordered_exprs().size() > 2); - TORCH_CHECK( - runtime1->schedulerHeuristics()->singleKernelHeuristics()->heuristc() == - ScheduleHeuristic::Persistent); - - // Run an un-translated welford - auto runtime2 = run_test(65536); - // Check it was not translated - TORCH_CHECK(runtime2->singleKernelFusion()->unordered_exprs().size() == 1); - TORCH_CHECK( - runtime2->schedulerHeuristics()->singleKernelHeuristics()->heuristc() == - ScheduleHeuristic::Reduction); -} - -TEST(NVFuserTest, FusionTranslate2Welford_CUDA) { - auto fusion_ptr = std::make_unique(); - auto fusion = fusion_ptr.get(); - FusionGuard fg(fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion->addInput(tv0); - - auto tvs1 = Welford(tv0, {1}); - auto tvs2 = Welford(tv0, {1}); - - fusion->addOutput(tvs1.var_sum); - fusion->addOutput(tvs2.var_sum); - - FusionExecutorCache executor_cache(std::move(fusion_ptr)); - - auto run_test = [&executor_cache, - fusion](auto inner_size) -> FusionKernelRuntime* { - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({128, inner_size}, options); - auto outputs = executor_cache.runFusionWithInputs({t0}); - - // Square sums does not fit well in the testValidate assumptions, - // so we just compare the divided output here. 
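// [editor note] Hedged standalone check (plain ATen, not the nvfuser API) of why dividing by
// inner_size is enough here: Welford's var_sum is the running sum of squared deviations,
//   var_sum = sum_i (x_i - mean)^2 = N * var(x, unbiased=false),
// so var_sum / N is directly comparable to t0.var({1}, false) used as the reference.
{
  auto sketch_opts = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  auto x = at::randn({4, 8}, sketch_opts);
  auto m2 = (x - x.mean({1}, true)).pow(2).sum({1});
  TORCH_CHECK(m2.div(8.0).allclose(x.var({1}, false)));
}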
- outputs[0] /= inner_size; - outputs[1] /= inner_size; - auto t1 = t0.var({1}, false); - testValidate(fusion, outputs, {t0}, {t1, t1}, __LINE__, __FILE__); - - return executor_cache.getMostRecentKernelRuntime(); - }; - - // Run a translated welford - auto runtime1 = run_test(64); - // Check it was translated - TORCH_CHECK(runtime1->singleKernelFusion()->unordered_exprs().size() > 4); - TORCH_CHECK( - runtime1->schedulerHeuristics()->singleKernelHeuristics()->heuristc() == - ScheduleHeuristic::Persistent); - - // Run an un-translated welford - auto runtime2 = run_test(65536); - // // Check it was not translated - TORCH_CHECK(runtime2->singleKernelFusion()->unordered_exprs().size() == 2); -} - -TEST(NVFuserTest, FusionLargeWelfordNormalization_CUDA) { - auto fusion_ptr = std::make_unique(); - auto fusion = fusion_ptr.get(); - FusionGuard fg(fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion->addInput(tv0); - - auto tvs1 = Welford(tv0, {1}); - auto sum_of_tv0 = sum(tv0, {1}); - - fusion->addOutput(tvs1.var_sum); - fusion->addOutput(sum_of_tv0); - - FusionExecutorCache executor_cache(std::move(fusion_ptr)); - - auto run_test = [&executor_cache, - fusion](auto inner_size) -> FusionKernelRuntime* { - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({128, inner_size}, options); - auto outputs = executor_cache.runFusionWithInputs({t0}); - - auto t1 = t0.var({1}, false) * inner_size; - auto t2 = t0.sum({1}); - testValidate(fusion, outputs, {t0}, {t1, t2}, __LINE__, __FILE__); - - return executor_cache.getMostRecentKernelRuntime(); - }; - - auto runtime = run_test(65536); - TORCH_CHECK(!runtime->isSegmented()); -} - -TEST(NVFuserTest, FusionWelfordOtherPersistence_CUDA) { - auto fusion_ptr = std::make_unique(); - auto fusion = fusion_ptr.get(); - FusionGuard fg(fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion->addInput(tv0); - - auto tvs1 = Welford(tv0, {1}); - auto sum_of_tv0 = sum(tv0, {1}); - auto sum_bcasted = broadcast(sum_of_tv0, {false, true}); - auto avg_bcasted = broadcast(tvs1.avg, {false, true}); - auto tv0_plus_sum = add(tv0, sum_bcasted); - auto tv0_plus_avg = add(tv0, avg_bcasted); - - fusion->addOutput(tv0_plus_sum); - fusion->addOutput(tv0_plus_avg); - - FusionExecutorCache executor_cache(std::move(fusion_ptr)); - - auto run_test = [&executor_cache, - fusion](auto inner_size) -> FusionKernelRuntime* { - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({128, inner_size}, options); - auto outputs = executor_cache.runFusionWithInputs({t0}); - - auto t1 = t0.mean({1}).unsqueeze(1) + t0; - auto t2 = t0.sum({1}).unsqueeze(1) + t0; - testValidate(fusion, outputs, {t0}, {t2, t1}, __LINE__, __FILE__); - - return executor_cache.getMostRecentKernelRuntime(); - }; - - for (auto inner_size : {4096, 8192, 32768}) { - auto runtime = run_test(4096); - TORCH_CHECK(!runtime->isSegmented()); - } -} - -TEST(NVFuserTest, FusionSegmentIslands_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(2); - auto tv1 = makeSymbolicTensor(2); - fusion->addInput(tv0); - fusion->addInput(tv1); - - auto tv2 = sum(tv0, {0}); - auto tv3 = sum(tv1, {1}); - fusion->addOutput(tv2); - fusion->addOutput(tv3); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({16, 16}, options); - at::Tensor t1 = at::randn({16, 16}, options); - - FusionExecutorCache fusion_executor_cache(std::move(fusion)); - 
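// [editor note] tv2 and tv3 above are disconnected "islands" (each depends on a different
// input), so the runtime is free to emit them as independent kernels; the test only checks
// that the executor cache copes with that. As a hedged aside, the ATen references one would
// compare against if values were validated here:
auto ref_island0 = t0.sum({0}); // corresponds to tv2
auto ref_island1 = t1.sum({1}); // corresponds to tv3
(void)ref_island0;
(void)ref_island1;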
fusion_executor_cache.runFusionWithInputs({t0, t1}); -} - -TEST(NVFuserTest, FusionBackOffInnerBroadcast_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(1); - auto tv1 = makeSymbolicTensor(2); - auto tv2 = makeSymbolicTensor(4); - fusion->addInput(tv0); - fusion->addInput(tv1); - - auto tv3 = broadcast(tv0, {false, true, true, true}); - auto tv4 = broadcast(tv1, {false, false, true, true}); - auto tv5 = unaryOp(UnaryOpType::Rsqrt, tv2); - - auto tv6 = add(tv3, tv5); - auto tv7 = add(tv4, tv5); - auto tv8 = add(tv3, tv4); - - auto tv9 = add(tv6, tv7); - auto tv10 = add(tv9, tv8); - - fusion->addOutput(tv10); - - tv0->computeAt(tv10, -2); - tv1->computeAt(tv10, -2); - tv2->computeAt(tv10, -2); - - TORCH_CHECK(tv3->getComputeAtPosition() == 1); - TORCH_CHECK(tv4->getComputeAtPosition() == 2); - TORCH_CHECK(tv5->getComputeAtPosition() == 3); - - TORCH_CHECK(tv6->getMaxProducerPosition() == 3); - TORCH_CHECK(tv7->getMaxProducerPosition() == 3); - TORCH_CHECK(tv8->getMaxProducerPosition() == 2); -} - -TEST(NVFuserTest, FusionBackOffInnerBroadcast2_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(2); - auto tv1 = makeSymbolicTensor(3); - fusion->addInput(tv0); - fusion->addInput(tv1); - auto tv2 = broadcast(tv0, {false, false, true}); - auto tv3 = add(tv2, tv1); - - fusion->addOutput(tv3); - tv3->split(-2, 4); - tv3->reorder({{-1, -2}}); - tv0->computeAt(tv3, -2); - tv1->computeAt(tv3, -2); - TORCH_CHECK(tv2->getComputeAtPosition() == 2); - TORCH_CHECK(tv3->getMaxProducerPosition() == 2); -} - -TEST(NVFuserTest, FusionBackOffInnerBroadcast3_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(2); - auto tv1 = makeSymbolicTensor(4); - fusion->addInput(tv0); - fusion->addInput(tv1); - auto tv2 = broadcast(tv0, {false, false, true}); - auto tv3 = broadcast(tv2, {false, true, false, false}); - auto tv4 = add(tv3, tv1); - - fusion->addOutput(tv4); - tv0->computeAt(tv4, -1); - tv1->computeAt(tv4, -1); - TORCH_CHECK(tv2->getComputeAtPosition() == 2); - TORCH_CHECK(tv3->getMaxProducerPosition() == 3); -} - -TEST(NVFuserTest, FusionSimpleWarp_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(2); - fusion->addInput(tv0); - - auto tv1 = sum(tv0, {1}); - auto tv2 = broadcast(tv1, {false, true}); - auto tv3 = add(tv2, tv0); - - fusion->addOutput(tv3); - - tv1->split(1, 32); - auto tv1_rf = tv1->rFactor({1}); - TransformPropagator::from(tv1_rf); - tv1_rf->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input1 = at::randn({16, 128}, options); - - auto at_output = input1.sum({1}, true).add(input1); - - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto outputs = fe.runFusion({input1}); - - testValidate( - fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSimpleWarpPad_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(2); - - 
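// [editor note] A hedged aside on the padToMultipleOfWarp(32) calls below: the input is
// 16x127, and 127 is not a multiple of the warp size, so the TIDx extent is rounded up to a
// whole number of 32-thread warps (presumably so the warp-level reduction path still applies,
// with the padded lanes predicated off). The round-up itself is just:
auto pad_to_multiple_of_warp = [](int64_t extent, int64_t warp_size) {
  return ((extent + warp_size - 1) / warp_size) * warp_size;
};
TORCH_CHECK(pad_to_multiple_of_warp(127, 32) == 128);
TORCH_CHECK(pad_to_multiple_of_warp(128, 32) == 128);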
fusion->addInput(tv0); - - auto tv1 = sum(tv0, {1}); - auto tv2 = broadcast(tv1, {false, true}); - auto tv3 = add(tv2, tv0); - - fusion->addOutput(tv3); - - // Schedule a persistent kernel - auto tv0_cache = tv0->cache_after(); - tv1->split(1, 8, false); - auto tv1_rf = tv1->rFactor({1}); - tv1_rf->axis(0)->parallelize(ParallelType::BIDx); - tv1_rf->axis(-1)->parallelize(ParallelType::TIDx); - tv1_rf->axis(-1)->padToMultipleOfWarp(32); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(-1)->padToMultipleOfWarp(32); - TransformPropagator::from(tv1_rf); - tv0->axis(-1)->parallelize(ParallelType::TIDx); - tv0->axis(-1)->padToMultipleOfWarp(32); - tv0_cache->axis(-1)->parallelize(ParallelType::TIDx); - tv0_cache->axis(-1)->padToMultipleOfWarp(32); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->padToMultipleOfWarp(32); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->padToMultipleOfWarp(32); - - tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input1 = at::randn({16, 127}, options); - - auto at_output = input1.sum({1}, true).add(input1); - - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto outputs = fe.runFusion({input1}); - testValidate( - fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionWarpPadMergeSplit_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(3); - - fusion->addInput(tv0); - - auto tv1 = sum(tv0, {1, 2}); - auto tv2 = broadcast(tv1, {false, true, true}); - auto tv3 = add(tv2, tv0); - - fusion->addOutput(tv3); - - // Schedule a persistent kernel - auto tv0_cache = tv0->cache_after(); - tv1->merge(1); - tv1->split(1, 8, false); - - auto tv1_rf = tv1->rFactor({1}); - tv1_rf->axis(0)->parallelize(ParallelType::BIDx); - tv1_rf->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(-1)->padToMultipleOfWarp(); - TransformPropagator::from(tv1_rf); - tv0->axis(-1)->parallelize(ParallelType::TIDx); - tv0_cache->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input1 = at::randn({16, 17, 128}, options); - - auto at_output = input1.sum({1, 2}, true).add(input1); - - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto outputs = fe.runFusion({input1}); - testValidate( - fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSerialWarpReduction_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(3); - - fusion->addInput(tv0); - - auto tv1 = sum(tv0, {1, 2}); - auto tv2 = broadcast(tv1, {false, true, true}); - auto tv3 = add(tv2, tv0); - - fusion->addOutput(tv3); - - // Schedule a persistent kernel - auto tv0_cache = tv0->cache_after(); - tv1->merge(1); - tv1->split(1, 8, false); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(-1)->padToMultipleOfWarp(); - TransformPropagator::from(tv1); - tv0->axis(-1)->parallelize(ParallelType::TIDx); - tv0_cache->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - tv0->computeAt(tv3, -1, 
ComputeAtMode::MostInlined); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input1 = at::randn({16, 17, 128}, options); - - auto at_output = input1.sum({1, 2}, true).add(input1); - - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto outputs = fe.runFusion({input1}); - testValidate( - fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionTrivialWarpReduction_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeConcreteTensor({17, 18, 128, 1}); - - fusion->addInput(tv0); - - auto tv1 = sum(tv0, {1, 2, 3}); - auto tv2 = broadcast(tv1, {false, true, true, true}); - auto tv3 = add(tv2, tv0); - - fusion->addOutput(tv3); - - // Schedule a persistent kernel - auto tv0_cache = tv0->cache_after(); - tv1->merge(1); - tv1->split(1, 8, false); - - auto tv1_rf = tv1->rFactor({1}); - tv1_rf->axis(0)->parallelize(ParallelType::BIDx); - tv1_rf->axis(-2)->parallelize(ParallelType::TIDx); - tv1->axis(-2)->parallelize(ParallelType::TIDx); - tv1->axis(-2)->padToMultipleOfWarp(); - TransformPropagator::from(tv1_rf); - tv0->axis(-2)->parallelize(ParallelType::TIDx); - tv0_cache->axis(-2)->parallelize(ParallelType::TIDx); - tv2->axis(-2)->parallelize(ParallelType::TIDx); - tv3->axis(-2)->parallelize(ParallelType::TIDx); - - tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input1 = at::randn({17, 18, 128, 1}, options); - - auto at_output = input1.sum({1, 2, 3}, true).add(input1); - - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto outputs = fe.runFusion({input1}); - testValidate( - fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionMultipleDimBinding_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(2); - auto tv_add = makeSymbolicTensor(2); - - fusion->addInput(tv0); - fusion->addInput(tv_add); - - auto tv1 = sum(tv0, {1}); - auto tv2 = broadcast(tv1, {false, true}); - auto tv3 = add(tv2, tv0); - auto tv4 = add(tv0, tv_add); - - fusion->addOutput(tv3); - fusion->addOutput(tv4); - - // Schedule a persistent kernel - auto tv0_cache = tv0->cache_after(); - tv1->split(1, 8, false); - auto tv1_rf = tv1->rFactor({1}); - tv1_rf->axis(0)->parallelize(ParallelType::BIDx); - tv1_rf->axis(-1)->parallelize(ParallelType::TIDx); - tv1_rf->axis(-1)->padToMultipleOfWarp(32); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(-1)->padToMultipleOfWarp(32); - TransformPropagator::from(tv1_rf); - tv0->axis(-1)->parallelize(ParallelType::TIDx); - tv0->axis(-1)->padToMultipleOfWarp(32); - tv0_cache->axis(-1)->parallelize(ParallelType::TIDx); - tv0_cache->axis(-1)->padToMultipleOfWarp(32); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->padToMultipleOfWarp(32); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->padToMultipleOfWarp(32); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(-1)->padToMultipleOfWarp(64); - - tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input1 = at::randn({16, 128}, options); - at::Tensor input2 = at::randn({16, 128}, options); - - auto at_output = input1.sum({1}, true).add(input1); - - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto outputs = fe.runFusion({input1, input2}); - testValidate( - 
fusion.get(), - outputs, - {input1, input2}, - {at_output, input1 + input2}, - __LINE__, - __FILE__); -} - -TEST(NVFuserTest, FusionPadNoWarpReduce_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(2); - - fusion->addInput(tv0); - - auto tv1 = sum(tv0, {1}); - auto tv2 = broadcast(tv1, {false, true}); - auto tv3 = add(tv2, tv0); - - fusion->addOutput(tv3); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(-1)->padToMultipleOfWarp(); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - tv1->axis(0)->parallelize(ParallelType::TIDy); - tv2->axis(0)->parallelize(ParallelType::TIDy); - tv3->axis(0)->parallelize(ParallelType::TIDy); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input1 = at::randn({16, 31}, options); - - auto at_output = input1.sum({1}, true).add(input1); - - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto outputs = fe.runFusion({input1}); - testValidate( - fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionWarpMutipleThreadDim_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(2); - fusion->addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = sum(tv1, {1}); - fusion->addOutput(tv2); - - tv2->split(1, 8); - auto tv2_rf = tv2->rFactor({-1}); - tv2_rf->axis(-1)->parallelize(ParallelType::TIDx); - tv2_rf->axis(-1)->padToMultipleOfWarp(); - - TransformPropagator::from(tv2_rf); - - tv0->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(1)->parallelize(ParallelType::TIDy); - tv0->computeAt(tv2, 2); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input1 = at::randn({16, 31}, options); - - auto at_output = (input1 + 1).sum({1}); - - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto outputs = fe.runFusion({input1}); - testValidate( - fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionWarpReduceUnrollOuterLoop_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(2); - - fusion->addInput(tv0); - - auto tv1 = sum(tv0, {1}); - auto tv2 = broadcast(tv1, {false, true}); - auto tv3 = add(tv2, tv0); - - fusion->addOutput(tv3); - - // Schedule a persistent kernel - auto tv0_cache = tv0->cache_after(); - tv1->split(1, 8, false); - tv1->split(0, 4); - auto tv1_rf = tv1->rFactor({2}); - - tv1_rf->axis(0)->parallelize(ParallelType::BIDx); - tv1_rf->axis(1)->parallelize(ParallelType::Unroll); - tv1_rf->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(-1)->padToMultipleOfWarp(); - tv1->axis(1)->parallelize(ParallelType::Unroll); - TransformPropagator::from(tv1_rf); - tv0->axis(-1)->parallelize(ParallelType::TIDx); - tv0->axis(1)->parallelize(ParallelType::Unroll); - tv0_cache->axis(-1)->parallelize(ParallelType::TIDx); - tv0_cache->axis(1)->parallelize(ParallelType::Unroll); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(1)->parallelize(ParallelType::Unroll); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(1)->parallelize(ParallelType::Unroll); - - tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined); - - auto options = 
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input1 = at::randn({16, 128}, options); - - auto at_output = input1.sum({1}, true).add(input1); - - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto outputs = fe.runFusion({input1}); - testValidate( - fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSegfaultReduction_CUDA) { - std::unique_ptr fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); - FusionGuard fg(&fusion); - - int batch = 2; - int c = 1; - int h = 1; - int w = 1; - int numDims = 4; - - auto input = makeConcreteTensor({-1, 1, 1, 1}); - fusion.addInput(input); - auto bcast_bias = makeConcreteTensor({-1, 1, 1, 1}); - fusion.addInput(bcast_bias); - - std::vector at_sum_axes; - std::vector outer_reduction_axes; - std::vector outer_broadcast_mask(numDims, false); - Val* N = new Double(1); - for (const auto axis : c10::irange(numDims)) { - if (axis != 1) { - outer_reduction_axes.push_back(axis); - at_sum_axes.push_back(axis); - outer_broadcast_mask[axis] = true; - N = mul(N, input->domain()->domain()[axis]->extent()); - } - } - - auto output0 = mul(input, bcast_bias); - fusion.addOutput(output0); - auto output1 = sum(output0, outer_reduction_axes); - fusion.addOutput(output1); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input0 = at::randn({batch, c, h, w}, options); - at::Tensor input1 = at::randn({batch, c, h, w}, options); - - auto at_output0 = input0.mul(input1); - auto at_output1 = at_output0.sum(at_sum_axes); - - FusionExecutorCache fec(std::move(fusion_ptr)); - std::vector inputs = {input0, input1}; - auto outputs = fec.runFusionWithInputs(inputs); - - testValidate( - &fusion, outputs, inputs, {at_output0, at_output1}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionPredicateElimination_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(2)); - auto tv3 = add(tv2, new Double(3)); - - fusion.addOutput(tv3); - - tv3->split(0, 32); - tv0->computeAt(tv3, 1); - - tv2->axis(1)->parallelize(ParallelType::Unswitch); - - { - GpuLower gpulw(&fusion); - TORCH_CHECK(!isPredicated(tv2, gpulw)); - } - - tv2->axis(1)->parallelize(ParallelType::Serial); - tv2->split(1, 5); - - { - GpuLower gpulw(&fusion); - TORCH_CHECK(isPredicated(tv2, gpulw)); - } -} - -TEST(NVFuserTest, FusionForceFp16Simple_CUDA) { - std::unique_ptr fusion_ptr = std::make_unique(); - auto fusion = fusion_ptr.get(); - FusionGuard fg(fusion); - - auto tv0 = makeSymbolicTensor(2); - auto tv1 = makeSymbolicTensor(2); - - fusion->addInput(tv0); - fusion->addInput(tv1); - - // Group 1 - auto tv2 = sum(tv0, {1}); - auto tv3 = broadcast(tv2, {false, true}); - - // Group 2 - auto tv4 = add(tv3, tv1); // Edge: tv3: expect cast - auto tv5 = castOp(DataType::Half, tv4); - - fusion->addOutput(tv5); - - FusionExecutorCache fec(std::move(fusion_ptr)); - - std::vector shape{15, 16}; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto in0 = at::randn(shape, options); - auto in1 = at::randn(shape, options); - fec.runFusionWithInputs({in0, in1}); - - // Check the segmented edge is fp16 - auto segmented_fusion = fec.getMostRecentKernelRuntime()->fusionSegments(); - for (auto edge : segmented_fusion->edges()) { - auto edge_tv = edge->val->as(); - TORCH_CHECK(edge_tv->getDataType() == DataType::Half); - } -} - -TEST(NVFuserTest, 
FusionForceBf16Simple_CUDA) { -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - if (at::cuda::getDeviceProperties(0)->major >= 8) { - std::unique_ptr fusion_ptr = std::make_unique(); - auto fusion = fusion_ptr.get(); - FusionGuard fg(fusion); - - auto tv0 = makeSymbolicTensor(2); - auto tv1 = makeSymbolicTensor(2); - - fusion->addInput(tv0); - fusion->addInput(tv1); - - // Group 1 - auto tv2 = sum(tv0, {1}); - auto tv3 = broadcast(tv2, {false, true}); - - // Group 2 - auto tv4 = add(tv3, tv1); // Edge: tv3: expect cast - auto tv5 = castOp(DataType::BFloat16, tv4); - - fusion->addOutput(tv5); - - FusionExecutorCache fec(std::move(fusion_ptr)); - - std::vector shape{15, 16}; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto in0 = at::randn(shape, options); - auto in1 = at::randn(shape, options); - fec.runFusionWithInputs({in0, in1}); - - // Check the segmented edge is bf16 - auto segmented_fusion = fec.getMostRecentKernelRuntime()->fusionSegments(); - for (auto edge : segmented_fusion->edges()) { - auto edge_tv = edge->val->as(); - TORCH_CHECK(edge_tv->getDataType() == DataType::BFloat16); - } - } else { - GTEST_SKIP(); - } -#else - GTEST_SKIP(); -#endif -} - -TEST(NVFuserTest, FusionForceFp16NotAllCast_CUDA) { - std::unique_ptr fusion_ptr = std::make_unique(); - auto fusion = fusion_ptr.get(); - FusionGuard fg(fusion); - - auto tv0 = makeSymbolicTensor(3); - auto tv1 = makeSymbolicTensor(3); - - fusion->addInput(tv0); - fusion->addInput(tv1); - - // Group 1 - auto tv3 = sum(tv0, {1}); - auto tv4 = broadcast(tv3, {false, true, false}); - auto tv5 = sum(tv0, {1}); - - // Group 2 - auto tv6 = add(tv4, tv1); // edge tv4, expect cast - auto tv7 = castOp(DataType::Half, tv6); - - // Group 3 - auto tv8 = sum(tv5, {1}); // edge tv5, don't expect cast - - fusion->addOutput(tv7); - fusion->addOutput(tv8); - - FusionExecutorCache fec(std::move(fusion_ptr)); - - std::vector shape{16, 16, 16}; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto in0 = at::randn(shape, options); - auto in1 = at::randn(shape, options); - fec.runFusionWithInputs({in0, in1}); - - auto segmented_fusion = fec.getMostRecentKernelRuntime()->fusionSegments(); - auto complete_fusion = segmented_fusion->completeFusion(); - - // Check that the edge that wasn't fp16 is the producer of the - // reduction op, i.e. tv8 = sum(tv5,{1});. 
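// [editor note] The loop below appears to have lost its template arguments in this hunk; as a
// hedged reconstruction (kept in a comment for readability only), the check presumably reads
// roughly:
//
//   for (auto edge : segmented_fusion->edges()) {
//     auto edge_tv = edge->val->as<TensorView>();
//     if (edge_tv->getDataType() == DataType::Float) {
//       auto consumer = *(complete_fusion->unordered_uses(edge_tv).begin());
//       TORCH_CHECK(consumer->isA<ReductionOp>());
//     }
//   }
//
// i.e. any segment edge left in fp32 must feed a reduction, presumably so the reduction
// accumulates from a full-precision intermediate, while pointwise-consumed edges are cast to
// Half to save bandwidth.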
- for (auto edge : segmented_fusion->edges()) { - auto edge_tv = edge->val->as(); - if (edge_tv->getDataType() == DataType::Float) { - auto consumer = *(complete_fusion->unordered_uses(edge_tv).begin()); - TORCH_CHECK(consumer->isA()); - } - } -} - -TEST(NVFuserTest, FusionForceBf16NotAllCast_CUDA) { -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - if (at::cuda::getDeviceProperties(0)->major >= 8) { - std::unique_ptr fusion_ptr = std::make_unique(); - auto fusion = fusion_ptr.get(); - FusionGuard fg(fusion); - - auto tv0 = makeSymbolicTensor(3); - auto tv1 = makeSymbolicTensor(3); - - fusion->addInput(tv0); - fusion->addInput(tv1); - - // Group 1 - auto tv3 = sum(tv0, {1}); - auto tv4 = broadcast(tv3, {false, true, false}); - auto tv5 = sum(tv0, {1}); - - // Group 2 - auto tv6 = add(tv4, tv1); // edge tv4, expect cast - auto tv7 = castOp(DataType::BFloat16, tv6); - - // Group 3 - auto tv8 = sum(tv5, {1}); // edge tv5, don't expect cast - - fusion->addOutput(tv7); - fusion->addOutput(tv8); - - FusionExecutorCache fec(std::move(fusion_ptr)); - - std::vector shape{16, 16, 16}; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto in0 = at::randn(shape, options); - auto in1 = at::randn(shape, options); - fec.runFusionWithInputs({in0, in1}); - - auto segmented_fusion = fec.getMostRecentKernelRuntime()->fusionSegments(); - auto complete_fusion = segmented_fusion->completeFusion(); - - // Check that the edge that wasn't fp16 is the producer of the - // reduction op, i.e. tv8 = sum(tv5,{1});. - for (auto edge : segmented_fusion->edges()) { - auto edge_tv = edge->val->as(); - if (edge_tv->getDataType() == DataType::Float) { - auto consumer = *(complete_fusion->unordered_uses(edge_tv).begin()); - TORCH_CHECK(consumer->isA()); - } - } - } else { - GTEST_SKIP(); - } -#else - GTEST_SKIP(); -#endif -} - -TEST(NVFuserTest, FusionBufferReuseBroadCastMultiVisit_CUDA) { - std::unique_ptr fusion_ptr = std::make_unique(); - auto fusion = fusion_ptr.get(); - FusionGuard fg(fusion); - - auto tv0 = makeConcreteTensor({2, 2}); - auto tv1 = makeConcreteTensor({2, 2, 2}); - - fusion->addInput(tv0); - fusion->addInput(tv1); - - auto tv2 = mul(tv0, new Double(2)); - auto tv3 = broadcast(tv2, {false, false, true}); - auto tv4 = add(tv3, tv1); - auto tv5 = mul(tv4, new Double(3)); - fusion->addOutput(tv5); - - // t4 cannot inner re-use t2, because there's a broadcast - // between them. 
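// [editor note] A hedged aside on the reuse rule these BufferReuse* tests probe: an
// intermediate's allocation can be recycled only when the two live intervals do not overlap
// and no broadcast changes the buffer's footprint between the producer and the reuse
// candidate (hence "t4 cannot inner re-use t2" above). A standalone sketch of the interval
// part (illustrative names only):
struct SketchLiveInterval {
  int first_def;
  int last_use;
};
auto can_share_allocation = [](const SketchLiveInterval& a, const SketchLiveInterval& b) {
  // Disjoint live ranges -> the underlying buffer could be aliased.
  return a.last_use < b.first_def || b.last_use < a.first_def;
};
TORCH_CHECK(can_share_allocation({0, 2}, {3, 5}));
TORCH_CHECK(!can_share_allocation({0, 4}, {3, 5}));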
- tv0->computeAt(tv5, 1, ComputeAtMode::BestEffort); - tv3->computeAt(tv5, 2, ComputeAtMode::BestEffort); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto in0 = at::randn({2, 2}, options); - auto in1 = at::randn({2, 2, 2}, options); - - auto at_output = ((in0 * 2).unsqueeze(2) + in1) * 3; - FusionExecutor fe; - fe.compileFusion(fusion); - auto outputs = fe.runFusion({in0, in1}); - - testValidate(fusion, outputs, {in0, in1}, {at_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionBufferReuseStressTest_CUDA) { - std::unique_ptr fusion_ptr = std::make_unique(); - auto fusion = fusion_ptr.get(); - FusionGuard fg(fusion); - - auto tv0 = makeConcreteTensor({2, 2}); - auto tv1 = makeConcreteTensor({2, 2, 2}); - - fusion->addInput(tv0); - fusion->addInput(tv1); - - auto tv2 = mul(tv0, new Double(2)); - auto tv3 = mul(tv0, new Double(3)); - auto tv4 = mul(tv2, tv3); - // Broadcast buffer can be reused through outer sharing - auto tv5 = broadcast(tv4, {true, false, false}); - auto tv6 = mul(tv5, new Double(5)); - auto tv7 = mul(tv6, tv1); - auto tv8 = mul(tv7, new Double(7)); - // tv9 shouldn't alias to avoid buffer over-subscription - auto tv9 = broadcast(tv4, {true, false, false}); - auto tv10 = mul(tv9, new Double(9)); - auto tv11 = add(tv5, tv9); - fusion->addOutput(tv7); - fusion->addOutput(tv11); - - tv0->computeAt(tv5, 1, ComputeAtMode::BestEffort); - tv0->computeAt(tv9, 1, ComputeAtMode::BestEffort); - - tv5->computeAt(tv7, 1, ComputeAtMode::BestEffort); - tv5->computeAt(tv11, 1, ComputeAtMode::BestEffort); - tv9->computeAt(tv11, 1, ComputeAtMode::BestEffort); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto in0 = at::randn({2, 2}, options); - auto in1 = at::randn({2, 2, 2}, options); - auto t2 = in0 * 2; - auto t3 = in0 * 3; - auto t4 = t2 * t3; - auto t5 = t4.unsqueeze(0); - auto t6 = t5 * 5; - auto t7 = t6 * in1; - auto t8 = t7 * 7; - auto t9 = t4.unsqueeze(0); - auto t10 = t9 * 9; - auto t11 = t5 + t9; - FusionExecutor fe; - fe.compileFusion(fusion); - - auto at_output = ((in0 * 2).unsqueeze(2) + in1) * 3; - auto outputs = fe.runFusion({in0, in1}); - - testValidate(fusion, outputs, {in0, in1}, {t7, t11}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionBufferReuseLargeBuffer_CUDA) { - std::unique_ptr fusion_ptr = std::make_unique(); - auto fusion = fusion_ptr.get(); - FusionGuard fg(fusion); - - auto tv0 = makeConcreteTensor({256, 512}); - - fusion->addInput(tv0); - - auto tv1 = mul(tv0, new Double(2)); - auto tv2 = mul(tv1, new Double(2)); - auto tv3 = mul(tv2, new Double(2)); - auto tv4 = mul(tv3, new Double(2)); - auto tv5 = mul(tv4, new Double(2)); - auto tv6 = mul(tv5, new Double(2)); - - fusion->addOutput(tv6); - - tv0->computeAt(tv6, 1, ComputeAtMode::BestEffort); - tv6->axis(0)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto in0 = at::randn({256, 512}, options); - - FusionExecutor fe; - fe.compileFusion(fusion); - auto outputs = fe.runFusion({in0}); - - auto at_out = in0.mul(2).mul(2).mul(2).mul(2).mul(2).mul(2); - - testValidate(fusion, outputs, {in0}, {at_out}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionBufferReuseNo2hop_CUDA) { - std::unique_ptr fusion_ptr = std::make_unique(); - auto fusion = fusion_ptr.get(); - FusionGuard fg(fusion); - - auto tv0 = makeConcreteTensor({2, 2}); - auto tv1 = makeConcreteTensor({2, 2, 2}); - - fusion->addInput(tv0); - fusion->addInput(tv1); - - auto tv2 = mul(tv0, new Double(2)); - 
auto tv3 = broadcast(tv2, {false, false, true}); - auto tv4 = add(tv3, tv1); // T4 to be inner aliased first, and - // shouldn't outer alias on top - auto tv5 = mul(tv4, new Double(3)); - auto tv6 = mul(tv5, new Double(3)); - fusion->addOutput(tv6); - - tv0->computeAt(tv6, 1, ComputeAtMode::BestEffort); - tv4->computeAt(tv6, 2, ComputeAtMode::BestEffort); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto in0 = at::randn({2, 2}, options); - auto in1 = at::randn({2, 2, 2}, options); - FusionExecutor fe; - fe.compileFusion(fusion); - auto outputs = fe.runFusion({in0, in1}); - - auto at_out = (in0.mul(2.0).unsqueeze(2) + in1).mul(3.0).mul(3.0); - - testValidate(fusion, outputs, {in0, in1}, {at_out}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionBufferReuseAllocationOrder_CUDA) { - std::unique_ptr fusion_ptr = std::make_unique(); - auto fusion = fusion_ptr.get(); - FusionGuard fg(fusion); - - auto tv0 = makeConcreteTensor({3, 3, 3}); - - fusion->addInput(tv0); - - auto tv1 = sum(tv0, {1}); - auto tv2 = mul(tv1, new Double(2)); - auto tv3 = mul(tv2, new Double(2)); - - fusion->addOutput(tv3); - - // In this case tv1 "reuses" allocation of tv2 - // due to the switched allocation order - tv1->computeAt(tv2, 1, ComputeAtMode::BestEffort); - - tv0->axis(0)->parallelize(ParallelType::TIDx); - tv1->axis(0)->parallelize(ParallelType::TIDx); - tv2->axis(0)->parallelize(ParallelType::TIDx); - tv3->axis(0)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto in0 = at::randn({3, 3, 3}, options); - - FusionExecutor fe; - fe.compileFusion(fusion); - auto outputs = fe.runFusion({in0}); - - auto at_out = in0.sum(1).mul(2).mul(2); - - testValidate(fusion, outputs, {in0}, {at_out}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionBufferReuseLiveInterval_CUDA) { - std::unique_ptr fusion_ptr = std::make_unique(); - auto fusion = fusion_ptr.get(); - FusionGuard fg(fusion); - - auto tv0 = makeConcreteTensor({16, 16}); - - fusion->addInput(tv0); - - auto tv1 = mul(tv0, new Double(3)); - auto tv2 = mul(tv1, new Double(2)); - auto tv3 = mul(tv2, new Double(2)); - // tv1 used till here, cannot be reused by tv2 or tv3 - auto tv4 = mul(tv3, tv1); - - fusion->addOutput(tv4); - - tv0->computeAt(tv4, 1); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto in0 = at::randn({16, 16}, options); - - FusionExecutor fe; - fe.compileFusion(fusion); - auto cg_outputs = fe.runFusion({in0}); - - auto at_t0 = in0 * 3.0; - auto at_out = at_t0 * 2.0 * 2.0 * at_t0; - - testValidate(fusion, cg_outputs, {in0}, {at_out}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionBufferReuseNoAcrossBroadcast_CUDA) { - std::unique_ptr fusion_ptr = std::make_unique(); - auto fusion = fusion_ptr.get(); - FusionGuard fg(fusion); - - auto tv0 = makeConcreteTensor({2, 2}); - auto tv1 = makeConcreteTensor({2, 2, 2}); - - fusion->addInput(tv0); - fusion->addInput(tv1); - - auto tv2 = mul(tv0, new Double(2)); - auto tv3 = mul(tv0, new Double(3)); - auto tv4 = mul(tv2, tv3); - auto tv5 = broadcast(tv4, {false, false, true}); - auto tv6 = mul(tv5, tv1); - auto tv7 = mul(tv6, new Double(7)); - fusion->addOutput(tv7); - - // tv6 shouldn't re-use t2 or t3 because of - // the broadcast in between - tv0->computeAt(tv4, 1, ComputeAtMode::BestEffort); - tv4->computeAt(tv7, 2, ComputeAtMode::BestEffort); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto in0 = at::randn({2, 2}, options); - auto 
in1 = at::randn({2, 2, 2}, options); - FusionExecutor fe; - fe.compileFusion(fusion); - auto outputs = fe.runFusion({in0, in1}); - - auto t2 = in0 * 2; - auto t3 = in0 * 3; - auto t4 = t2 * t3; - auto t5 = t4.unsqueeze(2); - auto t6 = t5 * in1; - auto t7 = t6 * 7; - testValidate(fusion, outputs, {in0, in1}, {t7}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionIssue970_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int nelm = 10; - - // tv3 = tv0 + sum(tv0) - auto tv0 = makeConcreteTensor({nelm, nelm}); - fusion.addInput(tv0); - auto tv1 = sum(tv0, {1}); - auto tv2 = broadcast(tv1, {false, true}); - auto tv3 = add(tv2, tv0); - fusion.addOutput(tv3); - - tv1->split(1, 4); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({nelm, nelm}, options); - - auto outputs = fe.runFusion({t0}); - - auto ref = sum(t0, {1}).unsqueeze(-1).expand({nelm, nelm}) + t0; - - testValidate(&fusion, outputs, {t0}, {ref}, __LINE__, __FILE__); -} - -// Reproducer of #1016 -TEST(NVFuserTest, FusionIssue1016_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(2)); - - fusion.addOutput(tv2); - - tv1->setMemoryType(MemoryType::Shared); - - tv2->split(-1, 8); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 10; - int numel_y = 11; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto ref = t0 + 1 + 2; - - testValidate(&fusion, outputs, {t0}, {ref}, __LINE__, __FILE__); -} - -// Reproducer of #1021 -TEST(NVFuserTest, FusionIssue1021_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = broadcast(tv1, {false, true}); - fusion.addOutput(tv2); - - auto tv3 = tv2->cache_before(); - - tv2->split(0, 2); - - tv1->computeAt(tv2, 1); - - tv2->axis(0)->parallelize(ParallelType::TIDx); - tv2->axis(1)->parallelize(ParallelType::Vectorize); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({10}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto ref = (t0 + 1).unsqueeze(-1); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -// Reproducer of issue #1053 -TEST(NVFuserTest, FusionNonUniqueThreadDim_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(1); - fusion->addInput(tv0); - auto tv1 = sum(tv0, {0}); - fusion->addOutput(tv1); - - auto tv2 = add(tv0, new Double(1)); - fusion->addOutput(tv2); - - tv1->split(0, 8); - auto tv1_rf = tv1->rFactor({-1}); - - tv1_rf->computeAt(tv1, 1); - - tv1_rf->axis(-1)->parallelize(ParallelType::TIDx); - - tv2->axis(0)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input1 = at::randn({32}, options); - - auto at_tv1 = (input1).sum({0}); - auto at_tv2 = input1 + 1; - - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto outputs = 
fe.runFusion({input1}); - testValidate( - fusion.get(), outputs, {input1}, {at_tv1, at_tv2}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionParallelDimensionMap1_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(1); - fusion->addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv0, new Double(1)); - fusion->addOutput(tv1); - fusion->addOutput(tv2); - - tv1->split(0, 8, false); - tv1->axis(1)->parallelize(ParallelType::TIDx); - tv2->split(0, 8, false); - tv2->axis(1)->parallelize(ParallelType::TIDx); - - // The extents of tv1 and tv2 axes are equal even though their - // actual values are not statically known - GpuLower gpulw(fusion.get()); - const auto& pdmap = gpulw.parallelDimensionMap(); - auto kir_tv1 = gpulw.lowerValue(tv1)->as(); - auto kir_tv2 = gpulw.lowerValue(tv2)->as(); - for (const auto i : c10::irange(kir_tv1->domain()->domain().size())) { - auto dom1 = kir_tv1->domain()->domain()[i]; - auto dom2 = kir_tv2->domain()->domain()[i]; - TORCH_INTERNAL_ASSERT(pdmap.equalDim(dom1->extent(), dom2->extent())); - } - - TORCH_CHECK(pdmap.isExact(ParallelType::TIDx)); - TORCH_CHECK( - pdmap.get(ParallelType::TIDx)->isA() && - pdmap.get(ParallelType::TIDx)->as()->name() == - "blockDim.x"); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input1 = at::randn({32}, options); - - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto outputs = fe.runFusion({input1}); - - testValidate( - fusion.get(), - outputs, - {input1}, - {input1 + 1, input1 + 1}, - __LINE__, - __FILE__); -} - -TEST(NVFuserTest, FusionParallelDimensionMap2_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(1); - fusion->addInput(tv0); - auto tv1 = makeSymbolicTensor(2); - fusion->addInput(tv1); - auto tv2 = broadcast(tv0, {false, true}); - auto tv3 = add(tv1, tv2); - fusion->addOutput(tv3); - - tv3->split(-1, 8, false); - tv2->computeAt(tv3, -1); - - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - GpuLower gpulw(fusion.get()); - const auto& pdmap = gpulw.parallelDimensionMap(); - TORCH_CHECK(pdmap.isExact(ParallelType::TIDx)); - TORCH_CHECK( - pdmap.get(ParallelType::TIDx)->isA() && - pdmap.get(ParallelType::TIDx)->as()->name() == - "blockDim.x"); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input1 = at::randn({11}, options); - at::Tensor input2 = at::randn({11, 13}, options); - - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto outputs = fe.runFusion({input1, input2}); - - auto ref = input1.unsqueeze(-1) + input2; - - testValidate( - fusion.get(), outputs, {input1, input2}, {ref}, __LINE__, __FILE__); -} - -// Mix symbolic and concrete tensors -TEST(NVFuserTest, FusionParallelDimensionMap3_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(1); - fusion->addInput(tv0); - - auto tv2 = add(tv0, new Double(1)); - fusion->addOutput(tv2); - auto tv3 = add(tv0, new Double(1)); - fusion->addOutput(tv3); - - tv2->split(0, 10); - tv3->split(0, 20); - - auto tv4 = add(tv0, new Double(1)); - fusion->addOutput(tv4); - auto tv5 = add(tv0, new Double(1)); - fusion->addOutput(tv5); - - // Not mapped but equal extent - tv4->split(0, 10); - tv5->split(0, 10); - - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - 
tv4->axis(-1)->parallelize(ParallelType::TIDy); - tv5->axis(-1)->parallelize(ParallelType::TIDy); - - GpuLower gpulw(fusion.get()); - const auto& pdmap = gpulw.parallelDimensionMap(); - TORCH_CHECK(!pdmap.isExact(ParallelType::TIDx)); - TORCH_CHECK( - pdmap.get(ParallelType::TIDx)->isA() && - pdmap.get(ParallelType::TIDx)->as()->name() == - "blockDim.x"); - TORCH_CHECK(pdmap.isExact(ParallelType::TIDy)); - TORCH_CHECK( - pdmap.get(ParallelType::TIDy)->isConst() && - pdmap.get(ParallelType::TIDy)->as()->value().value() == 10); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input1 = at::randn({13}, options); - - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto outputs = fe.runFusion({input1}); - - testValidate( - fusion.get(), - outputs, - {input1}, - {input1 + 1, input1 + 1, input1 + 1, input1 + 1}, - __LINE__, - __FILE__); -} - -// Parallelizing merged broadcast domains -TEST(NVFuserTest, FusionParallelDimensionMap4_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - auto tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - auto tv2 = add(tv0, new Double(1)); - auto tv3 = broadcast(tv2, {true, false}); - auto tv4 = add(tv3, tv1); - fusion.addOutput(tv4); - - tv4->split(1, 4); - tv4->reorder({{1, 2}, {2, 1}}); - tv4->merge(0); - tv0->computeAt(tv4, 1); - tv1->computeAt(tv4, 1); - - // TIDx is mapped to tv4.axis(0) as well as tv2.axis(0), so it's not - // exact. - tv4->axis(0)->parallelize(ParallelType::TIDx); - - tv2->setMemoryType(MemoryType::Shared); - tv3->setMemoryType(MemoryType::Shared); - - GpuLower gpulw(&fusion); - const auto& pdmap = gpulw.parallelDimensionMap(); - TORCH_CHECK(!pdmap.isExact(ParallelType::TIDx)); - TORCH_CHECK( - pdmap.get(ParallelType::TIDx)->isA() && - pdmap.get(ParallelType::TIDx)->as()->name() == - "blockDim.x"); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input1 = at::randn({13}, options); - at::Tensor input2 = at::randn({15, 13}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({input1, input2}); - - auto ref = (input1 + 1).unsqueeze(0) + input2; - - testValidate(&fusion, outputs, {input1, input2}, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionParallelDimensionMap5_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - auto tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - auto tv3 = broadcast(tv0, {false, true}); - auto tv4 = add(tv3, tv1); - fusion.addOutput(tv4); - - tv4->split(1, 4); - tv0->computeAt(tv4, -1); - tv1->computeAt(tv4, -1); - - tv4->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(-2)->parallelize(ParallelType::TIDy); - tv3->axis(-2)->parallelize(ParallelType::TIDy); - - GpuLower gpulw(&fusion); - const auto& pdmap = gpulw.parallelDimensionMap(); - TORCH_CHECK(pdmap.isExact(ParallelType::TIDx)); - TORCH_CHECK(pdmap.isExact(ParallelType::TIDy)); - TORCH_CHECK( - pdmap.get(ParallelType::TIDx)->isConst() && - pdmap.get(ParallelType::TIDx)->as()->value().value() == 4); - TORCH_CHECK( - pdmap.get(ParallelType::TIDy)->isA() && - pdmap.get(ParallelType::TIDy)->as()->name() == - "blockDim.y"); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input1 = at::randn({13}, options); - at::Tensor input2 = at::randn({13, 15}, options); - - FusionExecutor fe; - 
fe.compileFusion(&fusion); - auto outputs = fe.runFusion({input1, input2}); - - auto ref = (input1).unsqueeze(-1) + input2; - - testValidate(&fusion, outputs, {input1, input2}, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSegmenterCombineReductionsCycleRepro_CUDA) { - auto fusion_ptr = std::make_unique(); - auto& fusion = *fusion_ptr.get(); - FusionGuard fg(&fusion); - - auto t0 = makeSymbolicTensor(3, DataType::Float); - auto t1 = makeSymbolicTensor(3, DataType::Half); - auto t3 = makeSymbolicTensor(3, DataType::Half); - auto t5 = makeSymbolicTensor(3, DataType::Half); - auto t7 = makeSymbolicTensor(1, DataType::Half); - auto t11 = makeSymbolicTensor(3, DataType::Half); - auto t13 = makeSymbolicTensor(3, DataType::Half); - auto t15 = makeSymbolicTensor(3, DataType::Half); - auto t17 = makeSymbolicTensor(3, DataType::Half); - auto d56 = new Double(); - - fusion.addInput(t0); - fusion.addInput(t1); - fusion.addInput(t3); - fusion.addInput(t5); - fusion.addInput(t7); - fusion.addInput(t11); - fusion.addInput(t13); - fusion.addInput(t15); - fusion.addInput(t17); - fusion.addInput(d56); - - auto t2 = castOp(DataType::Float, t1); - auto t4 = castOp(DataType::Float, t3); - auto t22 = sub(t2, t4); - auto t6 = castOp(DataType::Float, t5); - auto t23 = mul(t22, t6); - auto t16 = castOp(DataType::Float, t15); - auto t18 = castOp(DataType::Float, t17); - auto t19 = add(t16, t18); - auto t14 = castOp(DataType::Float, t13); - auto t20 = add(t19, t14); - auto t12 = castOp(DataType::Float, t11); - auto t21 = add(t20, t12); - auto t8 = castOp(DataType::Float, t7); - auto t24 = broadcast(t8, {true, true, false}); - auto t25 = mul(t21, t24); - auto t27 = sum(t25, {2}); - auto t28 = broadcast(t27, {false, false, true}); - auto t29 = mul(t25, t23); - auto t30 = sum(t29, {2}); - auto t31 = broadcast(t30, {false, false, true}); - auto d59 = mul(t1->getRootDomain()[2]->extent(), new Double(1)); - auto t26 = mul(d59, t25); - auto txx = mul(t26, new Double(1)); - auto t33 = sub(txx, t28); - auto d70 = unaryOp(UnaryOpType::Reciprocal, d59); - auto t35 = mul(d70, t6); - auto t39 = sum(t21, {0, 1}); - auto t47 = castOp(DataType::Half, t39); - auto t37 = mul(t21, t23); - auto t38 = sum(t37, {0, 1}); - auto t46 = castOp(DataType::Half, t38); - auto t32 = mul(t23, t31); - auto t34 = sub(t33, t32); - auto t36 = mul(t35, t34); - auto t45 = castOp(DataType::Half, t36); - auto t40 = mul(t36, t0); - auto t41 = mul(t40, d56); - auto t44 = castOp(DataType::Half, t41); - auto t42 = sum(t41, {0, 1}); - auto t43 = castOp(DataType::Half, t42); - - fusion.addOutput(t43); - fusion.addOutput(t44); - fusion.addOutput(t45); - fusion.addOutput(t46); - fusion.addOutput(t47); - - auto options_half = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); - auto options_float = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor at_t0 = at::randn({128, 64, 1024}, options_float); - at::Tensor at_t1 = at::randn({128, 64, 1024}, options_half); - at::Tensor at_t3 = at::randn({128, 64, 1024}, options_half); - at::Tensor at_t5 = at::randn({128, 64, 1024}, options_half); - at::Tensor at_t7 = at::randn({1024}, options_half); - at::Tensor at_t11 = at::randn({128, 64, 1024}, options_half); - at::Tensor at_t13 = at::randn({128, 64, 1024}, options_half); - at::Tensor at_t15 = at::randn({128, 64, 1024}, options_half); - at::Tensor at_t17 = at::randn({128, 64, 1024}, options_half); - double at_d56 = 1.1111; - - std::vector aten_inputs = { - at_t0, - at_t1, - at_t3, - at_t5, - at_t7, - at_t11, - at_t13, - at_t15, - 
at_t17, - at_d56}; - for (auto _ : c10::irange(5)) { - auto segmented_fusion = - SegmentCandidateFinder::segment(fusion_ptr.get(), aten_inputs); - } -} - -TEST(NVFuserTest, FusionSerialAndParallelIndexing_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - fusion.addOutput(tv2); - - auto tv3 = add(tv0, new Double(1)); - auto tv4 = add(tv3, new Double(1)); - fusion.addOutput(tv4); - - auto tv5 = add(tv0, new Double(1)); - auto tv6 = add(tv5, new Double(1)); - fusion.addOutput(tv6); - - // Case 1: local memory tensor computed serially and used by - // parallel threads - tv2->split(-1, 4); - tv1->computeAt(tv2, -2); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - // Case 2: shared memory tensor computed serially and used by BID - tv4->split(-1, 4); - tv3->computeAt(tv4, -2); - tv4->axis(-1)->parallelize(ParallelType::BIDx); - tv3->setMemoryType(MemoryType::Shared); - - // Case 3: shared memory tensor computed by TID and used by BID - tv6->split(-1, 4); - tv5->computeAt(tv6, -2); - tv6->axis(-1)->parallelize(ParallelType::BIDx); - tv5->axis(-1)->parallelize(ParallelType::TIDx); - tv5->setMemoryType(MemoryType::Shared); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - const int nx = 11; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({nx}, options); - std::vector aten_inputs = {t0}; - auto outputs = fe.runFusion(aten_inputs); - - auto ref = t0 + 2; - - testValidate( - &fusion, outputs, aten_inputs, {ref, ref, ref}, __LINE__, __FILE__); -} - -// Repro of issue #1105 -TEST(NVFuserTest, FusionWARSyncAliasedSmem_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - auto tv3 = add(tv2, new Double(1)); - - fusion.addOutput(tv3); - - tv1->setMemoryType(MemoryType::Shared); - tv2->setMemoryType(MemoryType::Shared); - - tv3->split(0, 4); - tv0->computeAt(tv3, 1); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - // Make sure a WAR sync is inserted at the end of the outer loop - GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->topLevelExprs()) { - if (auto loop = dynamic_cast(kir_node)) { - const auto& body = loop->body().exprs(); - TORCH_CHECK(!body.empty()); - auto last_expr = dynamic_cast(body.back()); - TORCH_CHECK(last_expr != nullptr, "Invalid expr found"); - TORCH_CHECK(last_expr->isWarHazardSync(), "Not a sync for WAR hazard"); - } - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({17}, options); - std::vector aten_inputs = {t0}; - auto outputs = fe.runFusion(aten_inputs); - - auto ref1 = t0 + 3; - - testValidate(&fusion, outputs, aten_inputs, {ref1}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionIssue1099_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - fusion.addOutput(tv2); - - auto tv3 = makeSymbolicTensor(1); - fusion.addInput(tv3); - - // Just to make TIDx/y/z non-exact - auto tv4 = add(tv3, new Double(1)); - auto tv5 = add(tv4, new Double(1)); - auto tv6 = 
add(tv5, new Double(1)); - fusion.addOutput(tv6); - - tv2->split(0, 4); - tv0->computeAt(tv2, 1); - - tv0->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(-1)->parallelize(ParallelType::TIDy); - tv2->axis(-1)->parallelize(ParallelType::TIDz); - tv2->axis(0)->parallelize(ParallelType::BIDx); - - tv1->setMemoryType(MemoryType::Shared); - - tv4->split(0, 5); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - tv4->setMemoryType(MemoryType::Shared); - tv5->split(0, 6); - tv5->axis(-1)->parallelize(ParallelType::TIDy); - tv5->setMemoryType(MemoryType::Shared); - tv6->split(0, 7); - tv6->axis(-1)->parallelize(ParallelType::TIDz); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({17}, options); - at::Tensor t3 = at::randn({19}, options); - std::vector aten_inputs = {t0, t3}; - auto outputs = fe.runFusion(aten_inputs); - - auto ref_t2 = t0 + 2; - auto ref_t3 = t3 + 3; - - testValidate( - &fusion, outputs, aten_inputs, {ref_t2, ref_t3}, __LINE__, __FILE__); -} - -// Repro of issue #1080 -TEST(NVFuserTest, FusionUnswitchPredicate_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - fusion.addOutput(tv2); - - tv2->split(0, 4); - tv0->computeAt(tv2, 2); - - tv2->split(-1, 8); - tv1->split(-1, 8); - - tv2->axis(1)->parallelize(ParallelType::Unswitch); - - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-2)->parallelize(ParallelType::TIDy); - - // swap TIDx and TIDy - tv1->axis(-1)->parallelize(ParallelType::TIDy); - tv1->axis(-2)->parallelize(ParallelType::TIDx); - - tv1->setMemoryType(MemoryType::Shared); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - const int nx = 4; - const int ny = 10; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({nx, ny}, options); - std::vector aten_inputs = {t0}; - auto outputs = fe.runFusion(aten_inputs); - - auto ref = t0 + 2; - - testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionIssue1189_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeConcreteTensor({16, 16}); - auto tv1 = makeConcreteTensor({16, 16}); - - auto tv0b = broadcast(tv0, {false, false, true}); - auto tv1b = broadcast(tv1, {false, false, true}); - - fusion.addInput(tv0b); - fusion.addInput(tv1b); - - auto tv2 = add(tv0b, tv1b); - auto tv3 = sum(tv2, {1}); - fusion.addOutput(tv3); - - auto parallelize = [](auto tv) { - tv->axis(0)->parallelize(ParallelType::TIDx); - tv->axis(1)->parallelize(ParallelType::BIDx); - tv->axis(2)->parallelize(ParallelType::BIDy); - }; - - parallelize(tv0b); - parallelize(tv1b); - parallelize(tv2); - parallelize(tv3); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({16, 16, 1}, options); - at::Tensor t1 = at::randn({16, 16, 1}, options); - auto outputs = fe.runFusion({t0, t1}); - - auto ref = (t0 + t1).sum({1}); - - testValidate(&fusion, outputs, {t0, t1}, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionIssue1052_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - auto tv1 = makeSymbolicTensor(1); - fusion.addInput(tv1); - - auto tv2 
= add(tv0, new Double(1)); - fusion.addOutput(tv2); - - auto tv3 = add(tv1, new Double(1)); - fusion.addOutput(tv3); - - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - scheduler_utils::parallelizeAllLike(tv2, {tv0}); - scheduler_utils::parallelizeAllLike(tv3, {tv1}); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({10}, options); - at::Tensor t1 = at::randn({100}, options); - std::vector aten_inputs = {t0, t1}; - auto outputs = fe.runFusion(aten_inputs); - - auto ref_t2 = t0 + 1; - auto ref_t3 = t1 + 1; - - testValidate( - &fusion, outputs, aten_inputs, {ref_t2, ref_t3}, __LINE__, __FILE__); -} - -// Repro of issue #1115 -TEST(NVFuserTest, FusionPointwiseBroadcast_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - std::vector input_shape{3, 17, 80}; - std::vector output_shape{3, 17, 1, 80}; - - TensorView* x = makeSymbolicTensor(input_shape.size()); - TensorView* bias = makeSymbolicTensor(input_shape.size()); - fusion.addInput(x); - fusion.addInput(bias); - - auto x_add_bias = add(x, bias); - auto x_bcast = broadcast(x_add_bias, {false, false, true, false}); - auto y = unaryOp(UnaryOpType::Gelu, x_bcast); - fusion.addOutput(y); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor at_x = at::randn(input_shape, options); - at::Tensor at_bias = at::randn(input_shape, options); - std::vector aten_inputs = {at_x, at_bias}; - - schedulePointwise(&fusion, aten_inputs); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(aten_inputs); - - auto at_x_add_bias = at_x + at_bias; - auto at_x_view = at::native::view(at_x_add_bias, output_shape); - auto aten_y = at::gelu(at_x_view); - - testValidate(&fusion, outputs, aten_inputs, {aten_y}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSmemAliasSerial_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - auto tv3 = add(tv2, new Double(1)); - - fusion.addOutput(tv3); - - // Just set the dimension of TIDx - auto tv4 = makeSymbolicTensor(1); - fusion.addInput(tv4); - auto tv5 = add(tv4, new Double(1)); - fusion.addOutput(tv5); - - tv1->setMemoryType(MemoryType::Shared); - tv2->setMemoryType(MemoryType::Shared); - - tv5->axis(0)->parallelize(ParallelType::TIDx); - - // tv1 and tv2 are on shared memory and are not parallelized with - // TIDx. They should be predicated as they are redundant and can - // interfere with smem aliasing (issue #1100). 
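  // Expected behavior exercised here: blockDim.x is sized by tv5, so without
  // a predicate every thread in the block would redundantly perform the
  // serial tv1/tv2 writes to the same shared-memory addresses, which could
  // race with a later reuse (alias) of those smem buffers. The lowering
  // should restrict the redundant writes to a single thread.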
- - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({10}, options); - - at::Tensor t4 = at::randn({1024}, options); - std::vector aten_inputs = {t0, t4}; - auto outputs = fe.runFusion(aten_inputs); - - auto ref1 = t0 + 3; - auto ref2 = t4 + 1; - - testValidate(&fusion, outputs, aten_inputs, {ref1, ref2}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionGridReductionWithNonExactParallelDimensions_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - fusion.addOutput(tv1); - - auto tv2 = makeSymbolicTensor(1); - fusion.addInput(tv2); - auto tv3 = sum(tv2, {0}); - fusion.addOutput(tv3); - - tv1->axis(0)->parallelize(ParallelType::TIDx); - tv3->axis(0)->parallelize(ParallelType::BIDx); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({17}, options); - at::Tensor t2 = at::randn({19}, options); - std::vector aten_inputs = {t0, t2}; - auto outputs = fe.runFusion(aten_inputs); - - auto ref1 = t0 + 1; - auto ref2 = sum(t2); - - testValidate(&fusion, outputs, aten_inputs, {ref1, ref2}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionGridWelfordWithNonExactParallelDimensions_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - fusion.addOutput(tv1); - - auto tv2 = makeSymbolicTensor(1); - fusion.addInput(tv2); - auto tv3 = Welford(tv2, {0}).avg; - fusion.addOutput(tv3); - - tv1->axis(0)->parallelize(ParallelType::TIDx); - tv3->axis(0)->parallelize(ParallelType::BIDx); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({17}, options); - at::Tensor t2 = at::randn({19}, options); - std::vector aten_inputs = {t0, t2}; - auto outputs = fe.runFusion(aten_inputs); - - auto ref1 = t0 + 1; - auto ref2 = mean(t2, {0}); - - testValidate(&fusion, outputs, aten_inputs, {ref1, ref2}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionGridReductionWithNonExactParallelDimensions2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = sum(tv0, {0, 1}); - fusion.addOutput(tv1); - - auto tv2 = makeSymbolicTensor(3); - fusion.addInput(tv2); - auto tv3 = add(tv2, new Double(1)); - fusion.addOutput(tv3); - - auto tv4 = makeSymbolicTensor(3); - fusion.addInput(tv4); - auto tv5 = add(tv4, new Double(1)); - fusion.addOutput(tv5); - - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(1)->parallelize(ParallelType::TIDx); - - tv3->axis(0)->parallelize(ParallelType::TIDx); - tv3->axis(1)->parallelize(ParallelType::TIDy); - tv3->axis(2)->parallelize(ParallelType::TIDz); - - tv5->axis(0)->parallelize(ParallelType::BIDx); - tv5->axis(1)->parallelize(ParallelType::BIDy); - tv5->axis(2)->parallelize(ParallelType::BIDz); - - // TODO: This needs a fix for issue #1102. - // Also, need to allow predicated grid reductions. 
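  // The block below is kept but compiled out: it is the intended end-to-end
  // check (compile, run, validate against ATen) and can be re-enabled once
  // issue #1102 is fixed and predicated grid reductions are supported.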
-#if 0 - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({2, 3}, options); - at::Tensor t2 = at::randn({5, 6, 7}, options); - at::Tensor t4 = at::randn({8, 9, 10}, options); - std::vector aten_inputs = {t0, t2, t4}; - auto outputs = fe.runFusion(aten_inputs); - - auto ref1 = t0.sum(at::IntArrayRef{0, 1}); - auto ref2 = t2 + 1; - auto ref3 = t4 + 1; - - testValidate( - &fusion, outputs, aten_inputs, {ref1, ref2, ref3}, __LINE__, __FILE__); -#endif -} - -TEST(NVFuserTest, FusionGridWelfordWithNonExactParallelDimensions2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tvs = Welford(tv0, {0, 1}); - fusion.addOutput(tvs.avg); - - auto tv2 = makeSymbolicTensor(3); - fusion.addInput(tv2); - auto tv3 = add(tv2, new Double(1)); - fusion.addOutput(tv3); - - auto tv4 = makeSymbolicTensor(3); - fusion.addInput(tv4); - auto tv5 = add(tv4, new Double(1)); - fusion.addOutput(tv5); - - tvs.avg->axis(0)->parallelize(ParallelType::BIDx); - tvs.avg->axis(1)->parallelize(ParallelType::TIDx); - - tv3->axis(0)->parallelize(ParallelType::TIDx); - tv3->axis(1)->parallelize(ParallelType::TIDy); - tv3->axis(2)->parallelize(ParallelType::TIDz); - - tv5->axis(0)->parallelize(ParallelType::BIDx); - tv5->axis(1)->parallelize(ParallelType::BIDy); - tv5->axis(2)->parallelize(ParallelType::BIDz); - - // TODO: needs a fix for issue #1102 - // Also, need to allow predicated grid reductions. -#if 0 - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({2, 3}, options); - at::Tensor t2 = at::randn({5, 6, 7}, options); - at::Tensor t4 = at::randn({8, 9, 10}, options); - std::vector aten_inputs = {t0, t2, t4}; - auto outputs = fe.runFusion(aten_inputs); - - auto ref1 = t0.mean(at::IntArrayRef{0, 1}); - auto ref2 = t2 + 1; - auto ref3 = t4 + 1; - - testValidate( - &fusion, outputs, aten_inputs, {ref1, ref2, ref3}, __LINE__, __FILE__); -#endif -} - -// Repro of issue #1102 -TEST(NVFuserTest, FusionPredicateParallelizedDomains_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - // Just to make TIDx/y/z non-exact - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - auto tv3 = add(tv2, new Double(1)); - fusion.addOutput(tv3); - - auto tv4 = makeSymbolicTensor(1); - fusion.addInput(tv4); - - auto tv5 = add(tv4, new Double(1)); - auto tv6 = add(tv5, new Double(1)); - auto tv7 = add(tv6, new Double(1)); - auto tv8 = add(tv7, new Double(1)); - auto tv9 = sum(tv8, {0}); - fusion.addOutput(tv9); - - tv1->split(0, 5); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv1->setMemoryType(MemoryType::Shared); - tv2->split(0, 6); - tv2->axis(-1)->parallelize(ParallelType::TIDy); - tv2->setMemoryType(MemoryType::Shared); - tv3->split(0, 7); - tv3->axis(-1)->parallelize(ParallelType::TIDz); - - tv9->split(0, 4); - tv4->computeAt(tv9, 1); - - tv4->axis(-1)->parallelize(ParallelType::TIDx); - tv5->axis(-1)->parallelize(ParallelType::TIDy); - tv6->axis(-1)->parallelize(ParallelType::TIDz); - tv7->axis(-1)->parallelize(ParallelType::TIDz); - tv8->axis(-1)->parallelize(ParallelType::TIDz); - tv9->axis(-1)->parallelize(ParallelType::TIDz); - tv9->axis(0)->parallelize(ParallelType::BIDx); - - tv5->setMemoryType(MemoryType::Shared); - - FusionExecutor fe; - 
fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({17}, options); - at::Tensor t4 = at::randn({19}, options); - std::vector aten_inputs = {t0, t4}; - auto outputs = fe.runFusion(aten_inputs); - - auto ref1 = t0 + 3; - auto ref2 = sum(t4 + 4); - - testValidate(&fusion, outputs, aten_inputs, {ref1, ref2}, __LINE__, __FILE__); -} - -// Repro of #1102 and #1129 -TEST(NVFuserTest, FusionSmemPredicateUnswitch_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 7) { - return; - } - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - auto tv1 = makeSymbolicTensor(1); - fusion.addInput(tv1); - - auto tv2 = add(tv0, new Double(1)); - auto tv3 = add(tv2, new Double(1)); - auto tv4 = add(tv3, new Double(1)); - auto tv5 = add(tv4, new Double(1)); - fusion.addOutput(tv5); - - // Just to make TIDx/y/z non-exact - auto tvx = add(tv1, new Double(1)); - auto tvy = add(tvx, new Double(1)); - auto tvz = add(tvy, new Double(1)); - fusion.addOutput(tvz); - - tv5->split(0, 4); - tv0->computeAt(tv5, 1); - - tv0->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDy); - tv3->axis(-1)->parallelize(ParallelType::TIDz); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - tv5->axis(-1)->parallelize(ParallelType::TIDy); - tv5->axis(0)->parallelize(ParallelType::Unswitch); - - tvx->split(0, 5); - tvx->axis(-1)->parallelize(ParallelType::TIDx); - tvy->split(0, 6); - tvy->axis(-1)->parallelize(ParallelType::TIDy); - tvz->split(0, 7); - tvz->axis(-1)->parallelize(ParallelType::TIDz); - - for (auto tv : {tv2, tv3, tv4, tvx, tvy}) { - tv->setMemoryType(MemoryType::Shared); - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({17}, options); - at::Tensor t1 = at::randn({19}, options); - std::vector aten_inputs = {t0, t1}; - auto outputs = fe.runFusion(aten_inputs); - - auto ref1 = t0 + 4; - auto ref2 = t1 + 3; - - testValidate(&fusion, outputs, aten_inputs, {ref1, ref2}, __LINE__, __FILE__); -} - -// Repro of issue #1136 -TEST(NVFuserTest, FusionFloatPow_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = binaryOp(BinaryOpType::Pow, tv0, new Int(4)); - // To check if pow(tv0, 2) is replaced with tv0 * tv0 - auto tv2 = binaryOp(BinaryOpType::Pow, tv0, new Int(2)); - // To check if pow(tv0, 2.0) is replaced with tv0 * tv0 - auto tv3 = binaryOp(BinaryOpType::Pow, tv0, new Double(2)); - auto tv4 = binaryOp(BinaryOpType::Pow, tv0, new Int(3)); - auto tv5 = binaryOp(BinaryOpType::Pow, tv0, new Double(3)); - auto s = binaryOp(BinaryOpType::Pow, new Double(3), new Double(3)); - auto tv6 = add(tv0, s); - - fusion.addOutput(tv1); - fusion.addOutput(tv2); - fusion.addOutput(tv3); - fusion.addOutput(tv4); - fusion.addOutput(tv5); - fusion.addOutput(tv6); - - tv1->split(0, 32); - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(1)->parallelize(ParallelType::TIDx); - - TransformPropagator::from(tv1); - scheduler_utils::parallelizeAllLike(tv1, {tv2, tv3, tv4, tv5, tv6}); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({1000}, options); - // Negative inputs cause nan in Fuesr as use_fast_math is enabled - t0 = abs(t0); - std::vector aten_inputs = {t0}; - auto 
outputs = fe.runFusion(aten_inputs); - - auto p4 = at::pow(t0, 4); - auto p2 = at::pow(t0, 2); - auto p3 = at::pow(t0, 3); - auto t6 = t0 + std::pow(3, 3); - - testValidate( - &fusion, - outputs, - aten_inputs, - {p4, p2, p2, p3, p3, t6}, - __LINE__, - __FILE__); -} - -TEST(NVFuserTest, FusionIssue1127_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int numel = 4; - - auto tv0 = makeConcreteTensor({numel}); - fusion.addInput(tv0); - - auto tv1 = sum(tv0, {0}); - auto tv2 = broadcast(tv1, {true}); - - auto tv3 = makeConcreteTensor({numel, numel}); - fusion.addInput(tv3); - - auto tv4 = sum(tv3, {1}); - - auto tv5 = add(tv2, tv4); - fusion.addOutput(tv5); - - tv1->axis(0)->parallelize(ParallelType::TIDx); - tv2->axis(0)->parallelize(ParallelType::TIDx); - tv4->axis(1)->parallelize(ParallelType::TIDx); - tv5->axis(0)->parallelize(ParallelType::TIDx); - - // Lowering should fail since tv5 is predicated and paralellized with TIDx. - ASSERT_ANY_THROW(fusion.printKernel()); -} - -TEST(NVFuserTest, FusionChannelsLastParser_CUDA) { - // This test may not pass if using a custom block sync as there may - // be additional calls. Skip the test as it's not specifically - // relevant with block synchronizatin. - if (std::getenv("PYTORCH_NVFUSER_USE_BLOCK_SYNC_ATOMIC")) { - return; - } - auto g = std::make_shared(); - const auto graph0_string = R"IR( - graph(%0 : Half(8, 4, 10, 16, strides=[640, 1, 64, 4]), - %1 : Half(8, 4, 10, 16, strides=[640, 160, 16, 1])): - %o.1 : Half(8, 4, 10, 16, strides=[640, 1, 64, 4]) = aten::mul(%0, %1) # sum_dyn.py:5:6 - %3 : Half(8, 4, 10, 16, strides=[640, 1, 64, 4]) = aten::relu(%o.1) # sum_dyn.py:6:9 - return (%3))IR"; - parseIR(graph0_string, g.get()); - - // strides are not yet supported in the irparser. - { - auto val = g->block()->inputs()[0]; - val->setType(val->type()->castRaw()->withSizesStrides( - {8, 4, 10, 16}, {640, 1, 64, 4})); - } - - { - auto val = g->block()->inputs()[1]; - val->setType(val->type()->castRaw()->withSizesStrides( - {8, 4, 10, 16}, {640, 160, 16, 1})); - } - - for (auto node : g->block()->nodes()) { - for (auto val : node->outputs()) { - if (val->isCompleteTensor()) - val->setType(val->type()->castRaw()->withSizesStrides( - {8, 4, 10, 16}, {640, 1, 64, 4})); - } - } - - auto fusion = parseJitIR(g); - FusionGuard fg(fusion.get()); - auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); - at::Tensor input0 = - at::randn({2, 2, 2, 16}, options).clone(c10::MemoryFormat::ChannelsLast); - at::Tensor input1 = at::randn({2, 2, 2, 16}, options); - auto lparams = schedulePointwise(fusion.get(), {input0, input1}); - - // CONSIDER: - // 1. this can be moved to a dedicated "golden" file - // 2. 
use a fuzzy compare (ignore non-significant whitespaces for example) - const std::string expected_kernel = R"( -__global__ void CUDAGeneratedKernel(Tensor<__half, 4> T0, Tensor<__half, 4> T2, Tensor<__half, 4> T7) { - if ((((((((((nvfuser_index_t)blockIdx.x) * 1) + 0) * 1) + 0) * 128) + ((nvfuser_index_t)threadIdx.x)) < (T0.size[0] * (T0.size[1] * (T0.size[2] * T0.size[3]))))) { - constexpr nvfuser_index_t ki674 = 0; - __half T9[1]; - constexpr nvfuser_index_t ki716 = 0; - T9[ki716] = 0; - constexpr nvfuser_index_t ki707 = 0; - T9[ki707] - = T2[((((((((((nvfuser_index_t)blockIdx.x) * 1) + ki674) * 1) + ki707) * 128) + ((nvfuser_index_t)threadIdx.x)) / (T0.size[1] * (T0.size[2] * T0.size[3]))) * (((1 * T0.size[2]) * T0.size[1]) * T0.size[3])) + ((((((((((((nvfuser_index_t)blockIdx.x) * 1) + ki674) * 1) + ki707) * 128) + ((nvfuser_index_t)threadIdx.x)) % (T0.size[1] * (T0.size[2] * T0.size[3]))) % (T0.size[2] * T0.size[3])) % T0.size[3]) * ((1 * T0.size[2]) * T0.size[1])) + (((((((((((nvfuser_index_t)blockIdx.x) * 1) + ki674) * 1) + ki707) * 128) + ((nvfuser_index_t)threadIdx.x)) % (T0.size[1] * (T0.size[2] * T0.size[3]))) / (T0.size[2] * T0.size[3])) * (1 * T0.size[2])) + ((((((((((((nvfuser_index_t)blockIdx.x) * 1) + ki674) * 1) + ki707) * 128) + ((nvfuser_index_t)threadIdx.x)) % (T0.size[1] * (T0.size[2] * T0.size[3]))) % (T0.size[2] * T0.size[3])) / T0.size[3]) * 1)]; - __half T8[1]; - constexpr nvfuser_index_t ki722 = 0; - T8[ki722] = 0; - constexpr nvfuser_index_t ki702 = 0; - T8[ki702] - = T0[(((((((((nvfuser_index_t)blockIdx.x) * 1) + ki674) * 1) + ki702) * 128) + ((nvfuser_index_t)threadIdx.x)) * 1)]; - __half T10[1]; - constexpr nvfuser_index_t ki683 = 0; - float T3[1]; - T3[0] - = __half2float(T9[ki683]); - float T4[1]; - T4[0] - = T3[0]; - float T1[1]; - T1[0] - = __half2float(T8[ki683]); - float T5[1]; - T5[0] - = T1[0] - * T4[0]; - float T6[1]; - T6[0] - = relu(T5[0]); - T10[ki683] - = __float2half(T6[0]); - constexpr nvfuser_index_t ki676 = 0; - T7[(((((((((nvfuser_index_t)blockIdx.x) * 1) + ki674) * 1) + ki676) * 128) + ((nvfuser_index_t)threadIdx.x)) * 1)] - = T10[ki676]; - } -} -)"; - - const std::string actual_kernel = - "\n" + codegen::generateCudaKernel(GpuLower(fusion.get()).kernel()); - - if (expected_kernel.size() != actual_kernel.size() || - expected_kernel.compare(actual_kernel) != 0) { - std::cerr - << " Codegen mismatch, codegen possibly changed, or is incorrect. " - << " \n ========= EXPECTED ========= \n" - << expected_kernel << "\n========= ACTUAL ========== \n" - << actual_kernel << "\n=================" << std::endl; - auto it = std::mismatch( - expected_kernel.begin(), - expected_kernel.end(), - actual_kernel.begin(), - actual_kernel.end()); - std::string actual_mismatched_snippet(it.second, actual_kernel.end()); - actual_mismatched_snippet = actual_mismatched_snippet.substr(0, 10); - std::string expected_mismatched_snippet(it.first, expected_kernel.end()); - expected_mismatched_snippet = expected_mismatched_snippet.substr(0, 10); - std::cerr << "First mismatch found at: " << actual_mismatched_snippet - << ", expected: " << expected_mismatched_snippet << std::endl; - TORCH_CHECK(false); - } - - // TODO: runFusion hits assertion. I'm probably doing something wrong here. 
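  // Note on the check above: it is a byte-for-byte comparison against the
  // expected kernel string, so any change to codegen (index-variable naming
  // such as the kiNNN counters, whitespace, or the channels-last indexing of
  // T2) will trip it. On mismatch, a 10-character snippet around the first
  // differing position is printed to help locate the change.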
- // FusionExecutor fe; - // fe.compileFusion(fusion.get()); - // auto outputs = fe.runFusion({input0, input1}, lparams); - // at::Tensor output_ref = (input0 * input1).relu(); - // TORCH_CHECK(output_ref.equal(outputs[0])); -} - -TEST(NVFuserTest, FusionThreadPredicateUnswitch_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeConcreteTensor({10, 1024}); - fusion.addInput(tv0); - - auto tv1 = sum(tv0, {1}); - auto tv2 = add(tv1, new Double(1)); - auto tv3 = add(tv2, new Double(1)); - - fusion.addOutput(tv3); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->computeAt(tv3, -1); - tv3->axis(0)->parallelize(ParallelType::Unswitch); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({10, 1024}, options); - std::vector aten_inputs = {t0}; - auto outputs = fe.runFusion(aten_inputs); - - auto ref = sum(t0, {1}) + 2; - - testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionNonContigOutputs_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - fusion.addOutput(tv1); - - tv1->setContiguity(false); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor at_input = at::randn({10}, options); - at::Tensor at_output = at::empty_strided({10}, {2}, options); - auto returned_outputs = fe.runFusion({at_input}, {at_output}); - - // Returned outputs should only contain one tensor that is the same - // as the output tensor given to runFusion - TORCH_CHECK(returned_outputs.size() == 1); - TORCH_CHECK(returned_outputs[0].is_same(at_output)); - TORCH_CHECK(!returned_outputs[0].is_contiguous()); - - auto at_ref = at_input + 1; - - testValidate(&fusion, {at_output}, {at_input}, {at_ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionTestWarpSoftMax_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Setup softmax fusion - auto input = makeContigTensor(2); - fusion.addInput(input); - auto output = softmax(input, 1); - fusion.addOutput(output); - - // Setup runtime input - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({8, 16 * 197}, options); - std::vector aten_inputs({aten_input}); - - // Schedule through magic scheduler - auto runtime_info = SchedulerRuntimeInfo(&fusion, aten_inputs, true); - TORCH_CHECK(SchedulerEntry::canSchedule( - ScheduleHeuristic::Persistent, &fusion, runtime_info)); - auto scheduler = SchedulerEntry::makeEntry( - ScheduleHeuristic::Persistent, &fusion, runtime_info); - scheduler->schedule(&fusion); - - // Modify the schedule to use warp reduction - auto used_vals = fusion.usedMathVals(); - for (auto tv : ir_utils::filterByType(used_vals)) { - for (IterDomain* id : tv->domain()->domain()) { - if (id->getParallelType() == ParallelType::TIDx) { - id->padToMultipleOfWarp(); - } - } - } - - // Test result - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(aten_inputs); - auto ref_output = at::_softmax(aten_input, 1, false); - testValidate(&fusion, outputs, aten_inputs, {ref_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionIssue1133_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 7) { - return; - } - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - 
auto tv1 = add(tv0, new Double(1)); - auto tv2 = sum(tv1, {1}); - auto tv3 = add(tv2, new Double(1)); - - fusion.addOutput(tv3); - - tv0->computeAt(tv3, 1); - - const int split_factor = 32; - - tv2->split(-1, split_factor); - tv1->computeAt(tv2, -2); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - tv3->axis(0)->parallelize(ParallelType::Unswitch); - - tv1->setMemoryType(MemoryType::Shared); - tv2->setMemoryType(MemoryType::Shared); - - // Both tv1 and tv2 should be allocated at the top-level scope - GpuLower gpulw(&fusion); - bool tv1_validated = false; - bool tv2_validated = false; - for (const auto& kir_node : gpulw.kernel()->topLevelExprs()) { - if (auto alloc = dynamic_cast(kir_node)) { - auto size = alloc->size(); - if (!(alloc->buffer()->name() == 1 || alloc->buffer()->name() == 2)) { - // There should be no allocation other than those for tv1 and tv2 - TORCH_CHECK(false, "Invalid allocation detected"); - } - TORCH_CHECK(size->isA(), "Invalid allocation size"); - TORCH_CHECK(size->as()->isConst(), "Allocation not constant"); - auto size_int = size->as()->value().value(); - if (alloc->buffer()->name() == 1) { - TORCH_CHECK( - size_int == split_factor, - "Invalid allocation size: ", - size->as()->value().value()); - tv1_validated = true; - } else { - TORCH_CHECK( - size_int == 1, - "Invalid allocation size: ", - size->as()->value().value()); - tv2_validated = true; - } - } - } - - TORCH_CHECK(tv1_validated, "Failed to validate tv1 allocation"); - TORCH_CHECK(tv2_validated, "Failed to validate tv2 allocation"); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({99, 101}, options); - std::vector aten_inputs = {t0}; - auto outputs = fe.runFusion(aten_inputs); - - auto ref = (t0 + 1).sum({1}) + 1; - - testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionRfactorContigIDs_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = sum(tv0, {1}); - fusion.addOutput(tv1); - - tv1->split(1, 32); - - auto tv2 = tv1->rFactor({1}); - - // This merged domain is not contiguous. 
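  // As I read the schedule below: merge(0, 2) combines tv2's outer iteration
  // domain with only the inner (extent 32) piece of the split reduction root
  // domain, skipping the rfactored reduction axis in between. Consecutive
  // values of the merged index therefore do not map to consecutive addresses
  // of tv2's allocation, so indexing must not treat it as one contiguous ID.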
- tv2->merge(0, 2); - - tv2->setMemoryType(MemoryType::Shared); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({99, 101}, options); - std::vector aten_inputs = {t0}; - auto outputs = fe.runFusion(aten_inputs); - - auto ref = t0.sum({1}); - - testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionPersistentBufferCalculation1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = set(tv0); - auto tv2 = sum(tv1, {1}); - auto tv3 = broadcast(tv2, {false, true}); - auto tv4 = set(tv1); - auto tv5 = add(tv3, tv4); - fusion.addOutput(tv5); - - auto persistent_buffer_info = scheduler_utils::persistentBuffers(&fusion); - - auto isTvWithinVec = [](std::vector& vec, TensorView* tv) { - return std::find(vec.begin(), vec.end(), tv) != vec.end(); - }; - - auto tvEntryInVecVec = [](std::vector>& vec_o_vec, - std::vector& buffer_vec, - TensorView* tv) { - auto buffer_it = std::find(buffer_vec.begin(), buffer_vec.end(), tv); - return vec_o_vec.begin() + std::distance(buffer_vec.begin(), buffer_it); - }; - - auto& buffers = persistent_buffer_info.persistent_buffers; - auto& resolution = persistent_buffer_info.persistent_buffer_resolution_points; - auto& projectable = persistent_buffer_info.projectable_persistent_buffers; - auto& projectable_inputs = persistent_buffer_info.projectable_buffer_inputs; - - TORCH_INTERNAL_ASSERT(buffers.size() == 1); - TORCH_INTERNAL_ASSERT(resolution.size() == 1 && resolution[0].size() == 1); - TORCH_INTERNAL_ASSERT(projectable.size() == 1); - TORCH_INTERNAL_ASSERT(projectable_inputs.size() == 1); - - TORCH_INTERNAL_ASSERT(isTvWithinVec(buffers, tv1)); - TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable, tv1)); - TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable_inputs, tv0)); - - auto tv1_resolution_it = tvEntryInVecVec(resolution, buffers, tv1); - TORCH_INTERNAL_ASSERT(tv1_resolution_it != resolution.end()) - - TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv1_resolution_it, tv5)); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_t0 = at::randn({99, 101}, options); - - // Schedule through magic scheduler - auto runtime_info = SchedulerRuntimeInfo(&fusion, {aten_t0}, true); - auto persistent_buffer_size = - persistentBufferSize(&fusion, runtime_info, persistent_buffer_info); - - TORCH_INTERNAL_ASSERT( - persistent_buffer_size.persistent_buffer_size == - aten_t0.size(1) * dataTypeSize(DataType::Float)); - TORCH_INTERNAL_ASSERT( - persistent_buffer_size.projected_persistent_buffer_size == - aten_t0.size(1) * dataTypeSize(DataType::Float)); -} - -TEST(NVFuserTest, FusionPersistentBufferCalculation2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2, DataType::Half); - fusion.addInput(tv0); - - auto tv1 = castOp(DataType::Float, tv0); - auto tv2 = sum(tv1, {1}); - auto tv3 = broadcast(tv2, {false, true}); - auto tv4 = set(tv1); - auto tv5 = add(tv3, tv4); - auto tv6 = castOp(DataType::Half, tv5); - fusion.addOutput(tv6); - - auto persistent_buffer_info = scheduler_utils::persistentBuffers(&fusion); - - auto isTvWithinVec = [](std::vector& vec, TensorView* tv) { - return std::find(vec.begin(), vec.end(), tv) != vec.end(); - }; - - auto tvEntryInVecVec = [](std::vector>& vec_o_vec, - std::vector& buffer_vec, - TensorView* tv) { - auto buffer_it = std::find(buffer_vec.begin(), 
buffer_vec.end(), tv); - return vec_o_vec.begin() + std::distance(buffer_vec.begin(), buffer_it); - }; - - auto& buffers = persistent_buffer_info.persistent_buffers; - auto& resolution = persistent_buffer_info.persistent_buffer_resolution_points; - auto& projectable = persistent_buffer_info.projectable_persistent_buffers; - auto& projectable_inputs = persistent_buffer_info.projectable_buffer_inputs; - - TORCH_INTERNAL_ASSERT(buffers.size() == 1); - TORCH_INTERNAL_ASSERT(resolution.size() == 1 && resolution[0].size() == 1); - TORCH_INTERNAL_ASSERT(projectable.size() == 1); - TORCH_INTERNAL_ASSERT(projectable_inputs.size() == 1); - - TORCH_INTERNAL_ASSERT(isTvWithinVec(buffers, tv1)); - TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable, tv1)); - TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable_inputs, tv0)); - - auto tv1_resolution_it = tvEntryInVecVec(resolution, buffers, tv1); - TORCH_INTERNAL_ASSERT(tv1_resolution_it != resolution.end()) - - TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv1_resolution_it, tv5)); - - auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); - at::Tensor aten_t0 = at::randn({99, 101}, options); - - // Schedule through magic scheduler - auto runtime_info = SchedulerRuntimeInfo(&fusion, {aten_t0}, true); - auto persistent_buffer_size = - persistentBufferSize(&fusion, runtime_info, persistent_buffer_info); - - TORCH_INTERNAL_ASSERT( - persistent_buffer_size.persistent_buffer_size == - aten_t0.size(1) * dataTypeSize(DataType::Float)); - TORCH_INTERNAL_ASSERT( - persistent_buffer_size.projected_persistent_buffer_size == - aten_t0.size(1) * dataTypeSize(DataType::Half)); -} - -TEST(NVFuserTest, FusionPersistentBufferCalculation3_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2, DataType::Half); - fusion.addInput(tv0); - - auto tv1 = castOp(DataType::Float, tv0); - auto tv2 = set(tv1); - auto tv3 = sum(tv2, {1}); - auto tv4 = broadcast(tv3, {false, true}); - - auto tv5 = makeSymbolicTensor(2, DataType::Half); - fusion.addInput(tv5); - - auto tv6 = castOp(DataType::Float, tv5); - - auto tv7 = add(tv6, tv4); - auto tv8 = set(tv1); - auto tv9 = add(tv7, tv8); - auto tv10 = sum(tv9, {1}); - auto tv11 = broadcast(tv10, {false, true}); - auto tv12 = set(tv7); - auto tv13 = add(tv12, tv11); - - fusion.addOutput(tv13); - - auto persistent_buffer_info = scheduler_utils::persistentBuffers(&fusion); - - auto isTvWithinVec = [](std::vector& vec, TensorView* tv) { - return std::find(vec.begin(), vec.end(), tv) != vec.end(); - }; - - auto tvEntryInVecVec = [](std::vector>& vec_o_vec, - std::vector& buffer_vec, - TensorView* tv) { - auto buffer_it = std::find(buffer_vec.begin(), buffer_vec.end(), tv); - return vec_o_vec.begin() + std::distance(buffer_vec.begin(), buffer_it); - }; - - auto& buffers = persistent_buffer_info.persistent_buffers; - auto& resolution = persistent_buffer_info.persistent_buffer_resolution_points; - auto& projectable = persistent_buffer_info.projectable_persistent_buffers; - auto& projectable_inputs = persistent_buffer_info.projectable_buffer_inputs; - - TORCH_INTERNAL_ASSERT(buffers.size() == 2); - TORCH_INTERNAL_ASSERT( - resolution.size() == 2 && resolution[0].size() == 1 && - resolution[1].size() == 1); - TORCH_INTERNAL_ASSERT(projectable.size() == 1); - TORCH_INTERNAL_ASSERT(projectable_inputs.size() == 1); - - TORCH_INTERNAL_ASSERT( - isTvWithinVec(buffers, tv1) && isTvWithinVec(buffers, tv7)); - TORCH_INTERNAL_ASSERT( - isTvWithinVec(projectable, tv1) && !isTvWithinVec(projectable, tv7)); - - 
TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable_inputs, tv0)); - - auto tv1_resolution_it = tvEntryInVecVec(resolution, buffers, tv1); - TORCH_INTERNAL_ASSERT(tv1_resolution_it != resolution.end()) - TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv1_resolution_it, tv9)); - - auto tv7_resolution_it = tvEntryInVecVec(resolution, buffers, tv7); - TORCH_INTERNAL_ASSERT(tv7_resolution_it != resolution.end()) - TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv7_resolution_it, tv13)); - - auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); - at::Tensor aten_t0 = at::randn({99, 101}, options); - at::Tensor aten_t5 = at::randn({99, 101}, options); - - // Schedule through magic scheduler - auto runtime_info = SchedulerRuntimeInfo(&fusion, {aten_t0, aten_t5}, true); - auto persistent_buffer_size = - persistentBufferSize(&fusion, runtime_info, persistent_buffer_info); - - TORCH_INTERNAL_ASSERT( - persistent_buffer_size.persistent_buffer_size == - aten_t0.size(1) * dataTypeSize(DataType::Float) * 2); - TORCH_INTERNAL_ASSERT( - persistent_buffer_size.projected_persistent_buffer_size == - aten_t0.size(1) * - (dataTypeSize(DataType::Half) + dataTypeSize(DataType::Float))); -} - -TEST(NVFuserTest, FusionPersistentBufferCalculation4_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2, DataType::Half); - fusion.addInput(tv0); - - auto tv1 = castOp(DataType::Float, tv0); - auto tv2 = set(tv1); - auto tv3 = sum(tv2, {1}); - auto tv4 = broadcast(tv3, {false, true}); - auto tv5 = set(tv1); - auto tv6 = add(tv4, tv5); - auto tv7 = set(tv2); - auto tv8 = add(tv7, tv6); - auto tv9 = castOp(DataType::Half, tv8); - - fusion.addOutput(tv9); - - auto persistent_buffer_info = scheduler_utils::persistentBuffers(&fusion); - - auto isTvWithinVec = [](std::vector& vec, TensorView* tv) { - return std::find(vec.begin(), vec.end(), tv) != vec.end(); - }; - - auto tvEntryInVecVec = [](std::vector>& vec_o_vec, - std::vector& buffer_vec, - TensorView* tv) { - auto buffer_it = std::find(buffer_vec.begin(), buffer_vec.end(), tv); - return vec_o_vec.begin() + std::distance(buffer_vec.begin(), buffer_it); - }; - - auto& buffers = persistent_buffer_info.persistent_buffers; - auto& resolution = persistent_buffer_info.persistent_buffer_resolution_points; - auto& projectable = persistent_buffer_info.projectable_persistent_buffers; - auto& projectable_inputs = persistent_buffer_info.projectable_buffer_inputs; - - TORCH_INTERNAL_ASSERT(buffers.size() == 2); - TORCH_INTERNAL_ASSERT( - resolution.size() == 2 && resolution[0].size() == 1 && - resolution[1].size() == 1); - - TORCH_INTERNAL_ASSERT(projectable.size() == 2); - TORCH_INTERNAL_ASSERT(projectable_inputs.size() == 1); - - TORCH_INTERNAL_ASSERT( - isTvWithinVec(buffers, tv1) && isTvWithinVec(buffers, tv2)); - TORCH_INTERNAL_ASSERT( - isTvWithinVec(projectable, tv1) && isTvWithinVec(projectable, tv2)); - - TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable_inputs, tv0)); - - auto tv1_resolution_it = tvEntryInVecVec(resolution, buffers, tv1); - TORCH_INTERNAL_ASSERT(tv1_resolution_it != resolution.end()) - TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv1_resolution_it, tv6)); - - auto tv2_resolution_it = tvEntryInVecVec(resolution, buffers, tv2); - TORCH_INTERNAL_ASSERT(tv2_resolution_it != resolution.end()) - TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv2_resolution_it, tv8)); - - auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); - at::Tensor aten_t0 = at::randn({99, 101}, options); - - // Schedule through magic scheduler - auto 
runtime_info = SchedulerRuntimeInfo(&fusion, {aten_t0}, true); - auto persistent_buffer_size = - persistentBufferSize(&fusion, runtime_info, persistent_buffer_info); - - TORCH_INTERNAL_ASSERT( - persistent_buffer_size.persistent_buffer_size == - aten_t0.size(1) * dataTypeSize(DataType::Float) * 2); - - TORCH_INTERNAL_ASSERT( - persistent_buffer_size.projected_persistent_buffer_size == - aten_t0.size(1) * dataTypeSize(DataType::Half)); -} - -TEST(NVFuserTest, PersistentBufferProjection_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } - std::unique_ptr fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2, DataType::Half); - fusion.addInput(tv0); - - auto tv1 = castOp(DataType::Float, tv0); - auto tv2 = set(tv1); - auto tv3 = sum(tv2, {1}); - auto tv4 = broadcast(tv3, {false, true}); - auto tv5 = set(tv1); - auto tv6 = add(tv4, tv5); - auto tv7 = set(tv2); - auto tv8 = add(tv7, tv6); - auto tv9 = castOp(DataType::Half, tv8); - - fusion.addOutput(tv9); - - reduction_scheduler_utils::projectPersistentBuffers(&fusion); - - auto tv5_producers = ir_utils::producerTvsOf(tv5); - auto tv7_producers = ir_utils::producerTvsOf(tv7); - - // Projection should have broken these dependencies - - TORCH_INTERNAL_ASSERT( - std::find(tv5_producers.begin(), tv5_producers.end(), tv1) == - tv5_producers.end()); - TORCH_INTERNAL_ASSERT( - std::find(tv7_producers.begin(), tv7_producers.end(), tv2) == - tv7_producers.end()); - - auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); - at::Tensor aten_t0 = at::randn({99, 101}, options); - - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs({aten_t0}); - - auto aten_t1 = aten_t0.to(c10::kDouble); - auto aten_t3 = aten_t1.sum({1}); - auto aten_t4 = aten_t3.unsqueeze(1); - auto aten_t7 = aten_t4.add(aten_t1).add(aten_t1); - - testValidate(&fusion, cg_outputs, {aten_t0}, {aten_t7}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionIssue1223_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 7) { - return; - } - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeContigTensor(2); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = sum(tv1, {0, 1}); - fusion.addOutput(tv2); - - auto tv3 = add(tv0, new Double(0)); - fusion.addOutput(tv3); - - tv2->split(0, 4); - tv2->split(1, 1, false); - tv2->split(-1, 4); - - tv2->axis(1)->parallelize(ParallelType::Unswitch); - tv2->axis(-3)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDy); - - tv1->computeAt(tv2, -1); - - // Make TIDx and TIDy non-exact - tv3->split(0, 32); - tv3->split(-1, 32); - tv3->axis(1)->parallelize(ParallelType::TIDx); - tv3->axis(3)->parallelize(ParallelType::TIDy); - - // The second axis of both tv1 and tv2 are fully unswitched, so they - // don't need to predicate the parallel type usage of TIDy, whereas - // the first axis is only partially unswitched, i.e., part of its - // split output domains is outside the unswitched axis, so the first - // axis, which uses TIDx, needs to predicate the parallel - // dimension. Previously, as reported in issue #1223, unswitched - // expressions didn't predicate parallel dimensions. It should be - // fixed by PR #1222. 
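  // Concretely for this schedule (a rough sketch, not taken from the test):
  // tv1/tv2 map only 4 threads onto TIDx while tv3 launches 32, so the
  // lowered code is expected to guard those expressions with something like
  // `threadIdx.x < 4`; the TIDy usage is fully covered by the unswitched
  // loop, so no separate thread predicate should be needed there.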
- - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor at_t0 = at::ones({11, 10}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({at_t0}); - - auto at_t1 = (at_t0 + 1).sum(); - - testValidate( - &fusion, cg_outputs, {at_t0}, {at_t1, at_t0}, __LINE__, __FILE__); -} - -// See #1247 and #1250 -TEST(NVFuserTest, FusionRfactorPredication1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeContigTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = min(tv1, {0}); - - fusion.addOutput(tv2); - - // Make TIDx non-exact - auto tv3 = makeContigTensor(1); - fusion.addInput(tv3); - - auto tv4 = add(tv3, new Double(1)); - fusion.addOutput(tv4); - - tv2->split(0, 4); - auto tv5 = tv2->rFactor({1}); - - tv0->computeAt(tv2, 1); - - tv2->axis(0)->parallelize(ParallelType::TIDx); - - tv4->axis(0)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor at_t0 = at::randn({9}, options); - at_t0 = at::abs(at_t0); - at::Tensor at_t3 = at::randn({128}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({at_t0, at_t3}); - - auto at_t2 = (at_t0 + 1).min(); - auto at_t4 = at_t3 + 1; - - testValidate( - &fusion, cg_outputs, {at_t0, at_t3}, {at_t2, at_t4}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionRfactorPredication2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeContigTensor(1); - fusion.addInput(tv0); - - auto tv1 = min(tv0, {0}); - fusion.addOutput(tv1); - - // Make TIDx non-exact - auto tv2 = makeContigTensor(1); - fusion.addInput(tv2); - - auto tv3 = add(tv2, new Double(1)); - fusion.addOutput(tv3); - - tv1->split(0, 4); - auto tv4 = tv1->rFactor({0}); - - tv1->split(0, 3); - - // tv0->computeAt(tv1, 3); - tv4->reorder({{0, 1}}); - tv4->split(0, 3); - tv4->setMemoryType(MemoryType::Shared); - - // tv0: [I] - // tv4: [4/3, 3, I/4] - // tv1: [4/3, 3] - - tv1->axis(0)->parallelize(ParallelType::TIDx); - scheduler_utils::parallelizeAllLike(tv1, {tv4}); - - tv3->axis(0)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor at_t0 = at::randn({9}, options); - at_t0 = at::abs(at_t0); - at::Tensor at_t3 = at::randn({128}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({at_t0, at_t3}); - - auto at_t2 = std::get<0>(at_t0.min(0)); - auto at_t4 = at_t3 + 1; - - testValidate( - &fusion, cg_outputs, {at_t0, at_t3}, {at_t2, at_t4}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionNonDivisibleSplit1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = sum(tv0, {0}); - fusion.addOutput(tv1); - - // [I] - tv1->split(0, 5); - // [ceilDiv(I, 5), 5] - - // This second split is non-divisible. The split domain must be predicated. 
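  // Worked out: the inner domain has extent 5, and splitting it by 3 gives
  // ceilDiv(5, 3) = 2 outer iterations covering 2 * 3 = 6 positions, one more
  // than actually exists, so that extra position must be masked by a
  // predicate.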
- tv1->split(1, 3); - // [ceilDiv(I, 5), 2, 3] - - auto tv2 = sum(tv0, {0}); - fusion.addOutput(tv2); - - // tv2 shouldn't need to have another predicate - tv2->split(0, 4); - tv2->split(1, 2); - - GpuLower gpulw(&fusion); - TORCH_CHECK( - gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(), - "There must be no split to validate"); - TORCH_CHECK( - gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 1, - "Only tv1 should have a non-divisible predicate."); - for (auto tv : {tv1}) { - auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv); - TORCH_CHECK( - it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(), - "No info found for ", - tv); - const auto& splits_to_predicate = it->second; - TORCH_CHECK( - splits_to_predicate.size() == 1, - "There must be one split to predicate"); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({24}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({t0}); - - auto ref = t0.sum(); - - testValidate(&fusion, cg_outputs, {t0}, {ref, ref}, __LINE__, __FILE__); -} - -// Repro of issue #1074 -TEST(NVFuserTest, FusionNonDivisibleSplit2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - fusion.addOutput(tv2); - - tv2->split(0, 2); - tv2->split(-1, 4); - tv2->reorder({{1, 2}, {2, 1}}); - tv0->computeAt(tv2, 2); - - tv2->split(-1, 3); - - // To make the sanitizer catch the invalid accesses. Not necessary - // to expose the bug. - tv1->setMemoryType(MemoryType::Shared); - - GpuLower gpulw(&fusion); - TORCH_CHECK( - gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(), - "There must be no split to validate"); - TORCH_CHECK( - gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 1, - "Only tv2 should have a non-divisible predicate."); - for (auto tv : {tv2}) { - auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv); - TORCH_CHECK( - it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(), - "No info found for ", - tv); - const auto& splits_to_predicate = it->second; - TORCH_CHECK( - splits_to_predicate.size() == 1, - "There must be one split to predicate"); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({13, 17}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({t0}); - - auto ref = t0 + 2; - - testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); -} - -// Similar to FusionNonDivisibleSplit1 but with unswitch -TEST(NVFuserTest, FusionNonDivisibleSplit3_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = sum(tv1, {0}); - fusion.addOutput(tv2); - - tv2->split(0, 5); - tv2->split(1, 3); - - tv0->computeAt(tv2, -1); - - tv2->axis(0)->parallelize(ParallelType::Unswitch); - - GpuLower gpulw(&fusion); - TORCH_CHECK( - gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(), - "There must be no split to validate"); - TORCH_CHECK( - gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 2, - "Both tv1 and tv2 should have a non-divisible predicate."); - for (auto tv : {tv1, tv2}) { - auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv); - TORCH_CHECK( - it != 
gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(), - "No info found for ", - tv); - const auto& splits_to_predicate = it->second; - TORCH_CHECK( - splits_to_predicate.size() == 1, - "There must be one split to predicate"); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({24}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({t0}); - - auto ref = (t0 + 1).sum(); - - testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); -} - -// Non-divisible split through merge -TEST(NVFuserTest, FusionNonDivisibleSplit4_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = sum(tv1, {0, 1}); - fusion.addOutput(tv2); - - tv2->split(0, 5); - tv2->merge(1, 2); - tv2->split(1, 3); - - tv0->computeAt(tv2, -1); - - GpuLower gpulw(&fusion); - TORCH_CHECK( - gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(), - "There must be no split to validate"); - TORCH_CHECK( - gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 2, - "Both tv1 and tv2 should have a non-divisible predicate."); - for (auto tv : {tv1, tv2}) { - auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv); - TORCH_CHECK( - it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(), - "No info found for ", - tv); - const auto& splits_to_predicate = it->second; - TORCH_CHECK( - splits_to_predicate.size() == 1, - "There must be one split to predicate"); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({24, 2}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({t0}); - - auto ref = (t0 + 1).sum(); - - testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); -} - -// Nested splits -TEST(NVFuserTest, FusionNonDivisibleSplit5_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = sum(tv1, {0}); - fusion.addOutput(tv2); - - // [I] - tv2->split(0, 8); - // [I/8, 8] - tv2->split(1, 2); - // [I/8, 4, 2] - tv2->split(1, 3); // non-divisible split of outer output - // [I/8, 2, 3, 2] - - tv0->computeAt(tv2, -1); - - GpuLower gpulw(&fusion); - TORCH_CHECK( - gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(), - "There must be no split to validate"); - TORCH_CHECK( - gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 2, - "Both tv1 and tv2 should have a non-divisible predicate."); - for (auto tv : {tv1, tv2}) { - auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv); - TORCH_CHECK( - it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(), - "No info found for ", - tv); - const auto& splits_to_predicate = it->second; - TORCH_CHECK( - splits_to_predicate.size() == 1, - "There must be one split to predicate"); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({24}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({t0}); - - auto ref = (t0 + 1).sum(); - - testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); -} - -// Vectorized non-divisible split. 
Must be validated at run time -TEST(NVFuserTest, FusionNonDivisibleSplitVectorize1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeContigTensor(1); - fusion.addInput(tv0); - - auto tv1 = set(tv0); - fusion.addOutput(tv1); - - tv1->split(0, 8, false); - tv1->split(1, 4); - - tv1->axis(-1)->parallelize(ParallelType::Vectorize); - - GpuLower gpulw(&fusion); - TORCH_CHECK( - gpulw.nonDivisibleSplitInfo().splitsToValidate().size() == 1, - "There should be one split to validate"); - for (const auto& kv : gpulw.nonDivisibleSplitInfo().splitsToPredicate()) { - const auto& splits_to_predicate = kv.second; - TORCH_CHECK( - splits_to_predicate.empty(), - "There must be no split to predicate, but tensor t", - kv.first->name(), - " has:", - splits_to_predicate); - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - - auto t0 = at::randn({32}, options); - auto cg_outputs = fe.runFusion({t0}); - - auto ref = t0; - - testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); - - auto t0_non_divisible = at::randn({8}, options); - // Since ceilDiv(8, 8) is not divisible by 4, the vectorization is - // illegal. The run-time validation of vectorization should throw an error. - ASSERT_ANY_THROW(fe.runFusion({t0_non_divisible})); -} - -// If a split is validated at run time, it's not necessary to predicate. -TEST(NVFuserTest, FusionNonDivisibleSplitVectorize2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeContigTensor(1); - fusion.addInput(tv0); - - auto tv1 = set(tv0); - auto tv2 = add(tv1, new Double(1)); - auto tv3 = sum(tv2, {0}); - fusion.addOutput(tv3); - - tv3->split(0, 8, false); - tv3->split(1, 4); - TransformPropagator::from(tv3); - - tv3->axis(1)->parallelize(ParallelType::TIDx); - scheduler_utils::parallelizeAllLike(tv3, {tv1, tv2}); - - tv1->axis(2)->parallelize(ParallelType::Vectorize); - - GpuLower gpulw(&fusion); - TORCH_CHECK( - gpulw.nonDivisibleSplitInfo().splitsToValidate().size() == 1, - "There should be one split to validate"); - for (const auto& kv : gpulw.nonDivisibleSplitInfo().splitsToPredicate()) { - const auto& splits_to_predicate = kv.second; - TORCH_CHECK( - splits_to_predicate.empty(), - "There must be no split to predicate, but tensor t", - kv.first->name(), - " has:", - splits_to_predicate); - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - - auto t0 = at::randn({1024}, options); - auto cg_outputs = fe.runFusion({t0}); - - auto ref = (t0 + 1).sum(); - - testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); -} - -} // namespace jit -} // namespace torch -#endif // #if defined(USE_CUDA) diff --git a/test/cpp/jit/test_gpu_shift.cpp b/test/cpp/jit/test_gpu_shift.cpp deleted file mode 100644 index 71fa156c2d24..000000000000 --- a/test/cpp/jit/test_gpu_shift.cpp +++ /dev/null @@ -1,4637 +0,0 @@ -#if defined(USE_CUDA) -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// fuser and IR parser -#include "test_gpu_validator.h" - -#include -#include -#include - -#include -#include - -// Tests go in torch::jit -namespace torch { -namespace jit { - -using namespace torch::jit::fuser::cuda; 
-using namespace at::indexing;
-
-namespace {
-
-// Make a tensor that is known to be fully contiguous of dimensionality=ndims,
-// but unknown sizes
-TensorView* makeContigTensor(size_t ndims, DataType dtype = DataType::Float) {
-  return TensorViewBuilder()
-      .ndims(ndims)
-      .dtype(dtype)
-      .contiguity(std::vector<bool>(ndims, true))
-      .build();
-}
-
-// Make a tensor that is known to be non-contiguous of dimensionality=ndims,
-// but unknown sizes
-TensorView* makeSymbolicTensor(size_t ndims, DataType dtype = DataType::Float) {
-  return TensorViewBuilder().ndims(ndims).dtype(dtype).build();
-}
-
-// Make a non-contiguous tensor of compile-time known sizes
-TensorView* makeConcreteTensor(
-    std::vector<int64_t> shape,
-    DataType dtype = DataType::Float) {
-  return TensorViewBuilder().shape(shape).dtype(dtype).build();
-}
-
-void checkIntValue(
-    ExpressionEvaluator& evaluator,
-    Val* val,
-    Int::ScalarType expected_value) {
-  TORCH_CHECK(val->isAnInt());
-  const auto actual_value = evaluator.evaluate(val);
-  TORCH_CHECK(actual_value.has_value());
-  TORCH_CHECK(actual_value.value() == expected_value);
-}
-
-void checkIntValue(
-    kir::ExpressionEvaluator& evaluator,
-    const kir::Val* val,
-    kir::Int::ScalarType expected_value) {
-  const auto actual_value = evaluator.evaluate(val);
-  TORCH_CHECK(actual_value.has_value());
-  TORCH_CHECK(actual_value.value() == expected_value);
-}
-
-// ATen version of tensor shifting
-auto shift(
-    at::Tensor tensor,
-    const std::vector<int>& offsets,
-    std::vector<int> strides = {}) {
-  TORCH_INTERNAL_ASSERT(tensor.ndimension() == offsets.size());
-  if (strides.empty()) {
-    strides = std::vector<int>(tensor.ndimension(), 1);
-  }
-  at::Tensor t = tensor;
-  std::vector<at::indexing::TensorIndex> stride_indices;
-  for (size_t i = 0; i < offsets.size(); ++i) {
-    auto stride = strides[i];
-    stride_indices.push_back(
-        at::indexing::Slice(0, at::indexing::None, stride));
-    const auto offset = offsets[i];
-    if (offset == 0) {
-      continue;
-    }
-    t = t.roll(offsets[i], i);
-    std::vector<at::indexing::TensorIndex> indices(
-        tensor.ndimension(), at::indexing::Slice(0, at::indexing::None));
-    if (offset > 0) {
-      indices[i] = at::indexing::Slice(0, offset);
-    } else {
-      indices[i] = at::indexing::Slice(offset, at::indexing::None);
-    }
-    t.index(indices) = 0;
-  }
-  t = t.index(stride_indices);
-  return t;
-}
-
-// ATen version of tensor gather
-auto gather(
-    at::Tensor tensor,
-    const std::vector<int>& window_shape,
-    const std::vector<std::vector<int>>& pad_width,
-    std::vector<int> strides = {}) {
-  TORCH_CHECK(
-      tensor.ndimension() == window_shape.size(),
-      "Invalid window shape: ",
-      window_shape,
-      ". Size of the window shape is different from the tensor dimension.");
-  TORCH_CHECK(
-      tensor.ndimension() == pad_width.size(),
-      "Invalid pad width: ",
-      pad_width,
-      ". Size of the pad width is different from the tensor dimension.");
-  if (strides.empty()) {
-    strides = std::vector<int>(tensor.ndimension(), 1);
-  } else {
-    TORCH_CHECK(
-        tensor.ndimension() == strides.size(),
-        "Invalid strides: ",
-        strides,
-        ". 
Size of strides is different from the tensor dimension."); - } - at::Tensor t = tensor; - for (size_t i = 0; i < window_shape.size(); ++i) { - const auto w_size = window_shape[i]; - TORCH_CHECK(w_size != 0); - const auto& pad = pad_width[i]; - TORCH_CHECK(pad.size() == 2); - at::Tensor concat_tensor; - for (int w = 0; w < w_size; ++w) { - std::vector shift_offsets(t.ndimension(), 0); - shift_offsets[i] = pad[0] - w; - std::vector shift_strides(t.ndimension(), 1); - shift_strides[i] = strides[i]; - auto shifted = shift(t, shift_offsets, shift_strides); - shifted = shifted.unsqueeze(-1); - if (w == 0) { - concat_tensor = shifted; - } else { - concat_tensor = at::cat({concat_tensor, shifted}, -1); - } - } - t = concat_tensor; - } - return t; -} - -} // namespace - -// Shift an input tensor -TEST(NVFuserTest, FusionShift1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = shift(tv0, {-1, 0}); - fusion.addOutput(tv1); - - auto tv2 = shift(tv0, {0, 1}); - fusion.addOutput(tv2); - - auto tv3 = shift(tv0, {2, 2}); - fusion.addOutput(tv3); - - auto tv4 = shift(tv0, {-2, -2}); - fusion.addOutput(tv4); - - int numel_x = 9; - int numel_y = 11; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t1 = shift(t0, {-1, 0}); - TORCH_CHECK(t1.equal(outputs[0])); - - auto t2 = shift(t0, {0, 1}); - TORCH_CHECK(t2.equal(outputs[1])); - - auto t3 = shift(t0, {2, 2}); - TORCH_CHECK(t3.equal(outputs[2])); - - auto t4 = shift(t0, {-2, -2}); - TORCH_CHECK(t4.equal(outputs[3])); -} - -// Shifts an intermediate tensor -TEST(NVFuserTest, FusionShift2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = shift(tv1, {-1, 0}); - fusion.addOutput(tv2); - - // make it a little more complex - auto tv3 = add(tv0, new Double(3)); - auto tv4 = add(tv3, new Double(4)); - auto tv5 = shift(tv4, {-1, 0}); - auto tv6 = shift(tv4, {0, -1}); - auto tv7 = shift(tv4, {1, 0}); - auto tv8 = shift(tv4, {0, 0}); - auto tv9 = add(tv5, tv6); - auto tv10 = add(tv9, tv7); - auto tv11 = add(tv10, tv8); - fusion.addOutput(tv11); - - for (auto tv : {tv1, tv2, tv3, tv4, tv5, tv6, tv7, tv8, tv9, tv10, tv11}) { - tv->setMemoryType(MemoryType::Global); - } - - // t1 allocation: (t1.size[0] + 1) * (t1.size[1]) - // t3 allocation: (t3.size[0] + 2) * (t3.size[1] + 1) - // t4 allocation: (t3.size[0] + 2) * (t3.size[1] + 1) - GpuLower gpulw(&fusion); - - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { - auto tensor_name = alloc->buffer()->name(); - if (tensor_name == 1 || tensor_name == 3 || tensor_name == 4) { - TORCH_CHECK(alloc->shape().size() == 2); - for (int i = 0; i < 2; ++i) { - if (tensor_name == 1 && i == 1) { - TORCH_CHECK(alloc->shape().at(i)->isA()); - continue; - } - auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - TORCH_CHECK(def != nullptr && def->operation() == BinaryOpType::Add); - TORCH_CHECK(def->as()->lhs()->isA()); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - if (tensor_name == 1) { - TORCH_CHECK(i == 0); - TORCH_CHECK(rhs_value == 1); - } else { - if (i == 0) { - 
TORCH_CHECK(rhs_value == 2); - } else { - TORCH_CHECK(rhs_value == 1); - } - } - } - } - } - } - - int numel_x = 9; - int numel_y = 11; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = shift(t1, {-1, 0}); - - auto t3 = t0 + 3; - auto t4 = t3 + 4; - auto t5 = shift(t4, {-1, 0}); - auto t6 = shift(t4, {0, -1}); - auto t7 = shift(t4, {1, 0}); - auto t8 = shift(t4, {0, 0}); - auto t9 = t5 + t6; - auto t10 = t9 + t7; - auto t11 = t10 + t8; - - testValidate(&fusion, outputs, inputs, {t2, t11}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftRightOfCA_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = shift(tv1, {0, 1}); - fusion.addOutput(tv2); - - tv0->computeAt(tv2, -2); - - tv1->setMemoryType(MemoryType::Global); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 100; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = shift(t1, {0, 1}); - - TORCH_CHECK(t2.allclose(outputs[0])); -} - -TEST(NVFuserTest, FusionShiftLeftOfCA_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - auto tv3 = shift(tv2, {-1, 0}); - auto tv4 = add(tv3, new Double(1)); - fusion.addOutput(tv4); - - tv0->computeAt(tv4, -1); - - // Lowering should trigger an assertion failure as a shifted axis is - // found inside an allocation position. 
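  // Presumably this is because, with tv0->computeAt(tv4, -1), tv2 is
  // allocated as a single element inside the innermost loop, while the shift
  // producing tv3 needs a neighboring (halo) element of tv2 that was never
  // materialized, so lowering has to reject the schedule.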
- ASSERT_ANY_THROW(fusion.printKernel()); -} - -TEST(NVFuserTest, FusionShiftSplit1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = shift(tv1, {0, 1}); - auto tv3 = shift(tv1, {0, -2}); - fusion.addOutput(tv2); - fusion.addOutput(tv3); - - int split_factor = 4; - tv2->split(-1, split_factor); - tv3->split(-1, split_factor); - - tv0->computeAt(tv2, -2); - tv0->computeAt(tv3, -2); - - // t1 allocation: (4 + 3) - GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { - auto tensor_name = alloc->buffer()->name(); - if (tensor_name == 1) { - TORCH_CHECK(alloc->shape().size() == 1); - auto def = - dynamic_cast(alloc->shape().at(0)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor && rhs_value == 3); - } - } - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 9; - int numel_y = 11; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = shift(t1, {0, 1}); - auto t3 = shift(t1, {0, -2}); - - testValidate(&fusion, outputs, inputs, {t2, t3}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftSplit2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - auto tv3 = shift(tv2, {0, -1}); - auto tv4 = shift(tv2, {0, 1}); - auto tv5 = add(tv3, tv4); - fusion.addOutput(tv5); - - auto tv6 = add(tv0, new Double(1)); - auto tv7 = shift(tv6, {0, 0}); - auto tv8 = add(tv7, new Double(1)); - fusion.addOutput(tv8); - - int split_factor = 4; - - tv5->split(-1, split_factor); - tv8->split(-1, split_factor); - - tv0->computeAt(tv5, -2); - tv0->computeAt(tv8, -2); - - // t1 and t2 allocation: (4 + 2) - // t4 allocation: (4) - GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { - auto tensor_name = alloc->buffer()->name(); - if (tensor_name == 1 || tensor_name == 2) { - TORCH_CHECK(alloc->shape().size() == 1); - auto def = - dynamic_cast(alloc->shape().at(0)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor && rhs_value == 2); - } else if (tensor_name == 4) { - TORCH_CHECK(alloc->shape().size() == 1); - auto size = dynamic_cast(alloc->shape().at(0)); - TORCH_CHECK(size != nullptr && size->isConst()); - int size_value = *size->value(); - TORCH_CHECK(size_value == split_factor); - } - } - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 9; - int numel_y = 11; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto 
t1 = t0 + 2; - auto t3 = shift(t1, {0, -1}); - auto t4 = shift(t1, {0, 1}); - auto t5 = t3 + t4; - - auto t6 = t0 + 1; - auto t7 = t6; - auto t8 = t7 + 1; - - testValidate(&fusion, outputs, inputs, {t5, t8}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftDoubleSplit_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(2)); - auto tv3 = shift(tv2, {0, 1}); - fusion.addOutput(tv3); - - int split_factor1 = 8; - int split_factor2 = 4; - - tv3->split(-1, split_factor1); - - tv0->computeAt(tv3, -2); - - tv1->split(-1, split_factor2); - - // t1: [i1, i2/8, 8/4, 4] - // t2: [i1, i2/8, 8] - // t3: [i1, i2/8, 8] - - // t1 and t2 allocation: (split_factor1 + 1) - GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { - auto tensor_name = alloc->buffer()->name(); - if (tensor_name == 1 || tensor_name == 2) { - TORCH_CHECK(alloc->shape().size() == 1); - auto def = - dynamic_cast(alloc->shape().at(0)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor1 && rhs_value == 1); - } - } - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 3; - auto ref = shift(t1, {0, 1}); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShift3ptStencil_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // 3-pt stencil - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - std::vector> offsets = {{-1}, {1}}; - - std::vector tvs; - for (const auto& offset : offsets) { - tvs.push_back(shift(tv0, offset)); - } - - auto tv_out = tv0; - - for (auto tv : tvs) { - tv_out = add(tv_out, tv); - } - - tv_out = div(tv_out, new Double(tvs.size() + 1)); - - fusion.addOutput(tv_out); - - int split_factor = 4; - - tv_out->split(0, split_factor); - - // This seems fine but not verified yet - // tv_out->axis(-1)->parallelize(ParallelType::Unswitch); - - auto cache = tv0->cache_after(); - - tv0->computeAt(tv_out, 1); - - // Inline completely except for the cache - for (auto tv : tvs) { - tv->computeAt(tv_out, -1); - } - - // cache allocation: (split_factor + 2) - GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { - auto tensor_name = alloc->buffer()->name(); - if (tensor_name == cache->name()) { - TORCH_CHECK(alloc->shape().size() == 1); - auto def = - dynamic_cast(alloc->shape().at(0)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor && rhs_value == 2); - } - } - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - - auto options = 
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto ref = (t0 + shift(t0, {-1}) + shift(t0, {1})) / 3; - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShift5ptStencil_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // 5-pt stencil - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - std::vector> offsets = {{-1, 0}, {1, 0}, {0, -1}, {0, 1}}; - - std::vector tvs; - for (const auto& offset : offsets) { - tvs.push_back(shift(tv0, offset)); - } - - auto tv_out = tv0; - - for (auto tv : tvs) { - tv_out = add(tv_out, tv); - } - - tv_out = div(tv_out, new Double(tvs.size() + 1)); - - fusion.addOutput(tv_out); - - std::vector split_factor({4, 8}); - - tv_out->split(-1, split_factor[1]); - tv_out->split(0, split_factor[0]); - tv_out->reorder({{1, 2}, {2, 1}}); - - auto cache = tv0->cache_after(); - - tv0->computeAt(tv_out, 2); - - // Inline completely except for the cache - for (auto tv : tvs) { - tv->computeAt(tv_out, -1); - } - - // cache allocation: (split_factor + 2) * (split_factor + 2) - GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { - auto tensor_name = alloc->buffer()->name(); - if (tensor_name == cache->name()) { - TORCH_CHECK(alloc->shape().size() == 2); - for (int i = 0; i < 2; ++i) { - auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor[i] && rhs_value == 2); - } - } - } - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto ref = t0; - for (const auto& offset : offsets) { - ref = ref + shift(t0, offset); - } - ref = ref / int(offsets.size() + 1); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShift9ptStencil_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // 9-pt stencil - std::vector> offsets; - for (int i = -1; i < 2; ++i) { - for (int j = -1; j < 2; ++j) { - if (i == 0 && j == 0) { - continue; - } - offsets.push_back({i, j}); - } - } - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - std::vector tvs; - for (const auto& offset : offsets) { - tvs.push_back(shift(tv0, offset)); - } - - auto tv_out = tv0; - - for (auto tv : tvs) { - tv_out = add(tv_out, tv); - } - - tv_out = div(tv_out, new Double(tvs.size() + 1)); - - fusion.addOutput(tv_out); - - std::vector split_factor({4, 8}); - tv_out->split(-1, split_factor[1]); - tv_out->split(0, split_factor[0]); - tv_out->reorder({{1, 2}, {2, 1}}); - - auto cache = tv0->cache_after(); - - tv0->computeAt(tv_out, 2); - - // Inline completely except for the cache - for (auto tv : tvs) { - tv->computeAt(tv_out, -1); - } - - // This seems fine but not yet verified - // tv_out->axis(-1)->parallelize(ParallelType::Unswitch); - - // cache allocation: (split_factor + 2) * (split_factor + 2) - GpuLower gpulw(&fusion); - for (const auto& 
kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { - auto tensor_name = alloc->buffer()->name(); - if (tensor_name == cache->name()) { - TORCH_CHECK(alloc->shape().size() == 2); - for (int i = 0; i < 2; ++i) { - auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor[i] && rhs_value == 2); - } - } - } - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto ref = t0; - for (const auto& offset : offsets) { - ref = ref + shift(t0, offset); - } - ref = ref / int(offsets.size() + 1); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftSmemBlocking_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = shift(tv1, {0, 1}); - fusion.addOutput(tv2); - - int smem_block_factor = 32; - - tv2->split(-1, smem_block_factor); - - tv0->computeAt(tv2, -2); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - tv1->setMemoryType(MemoryType::Shared); - - // tv1 allocation: (split_factor + 1) - GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { - auto tensor_name = alloc->buffer()->name(); - if (tensor_name == tv1->name()) { - TORCH_CHECK(alloc->shape().size() == 1); - for (int i = 0; i < 1; ++i) { - auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == smem_block_factor && rhs_value == 1); - } - } - } - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 100; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = shift(t1, {0, 1}); - auto ref = t2; - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShift3ptStencilParallel_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // 3-pt stencil - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - std::vector tvs; - tvs.push_back(shift(tv0, {-1})); - tvs.push_back(shift(tv0, {1})); - - auto tv_out = tv0; - - for (auto tv : tvs) { - tv_out = add(tv_out, tv); - } - - tv_out = div(tv_out, new Double(tvs.size() + 1)); - - fusion.addOutput(tv_out); - - int smem_block_factor = 32; - - tv_out->split(0, smem_block_factor); - // tv_out->axis(-1)->parallelize(ParallelType::Unswitch); - - auto tv0_cache = tv0->cache_after(); - - tv0->computeAt(tv_out, 1); - - for (auto tv : tvs) { - tv->computeAt(tv_out, -1); - } - - 
tv0_cache->setMemoryType(MemoryType::Shared); - tv_out->axis(-1)->parallelize(ParallelType::TIDx); - tv0_cache->axis(-1)->parallelize(ParallelType::TIDx); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto ref = (t0 + shift(t0, {-1}) + shift(t0, {1})) / 3; - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShift5ptStencilParallel_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // 5-pt stencil - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - std::vector> offsets = {{-1, 0}, {1, 0}, {0, -1}, {0, 1}}; - - std::vector tvs; - for (const auto& offset : offsets) { - tvs.push_back(shift(tv0, offset)); - } - - auto tv_out = tv0; - - for (auto tv : tvs) { - tv_out = add(tv_out, tv); - } - - tv_out = div(tv_out, new Double(tvs.size() + 1)); - - fusion.addOutput(tv_out); - - int smem_block_factor = 32; - - tv_out->split(-1, smem_block_factor); - tv_out->split(0, smem_block_factor); - - tv_out->reorder({{1, 2}, {2, 1}}); - - auto tv0_cache = tv0->cache_after(); - - tv0->computeAt(tv_out, 2); - - for (auto tv : tvs) { - tv->computeAt(tv_out, -1); - } - - tv_out->axis(-1)->parallelize(ParallelType::TIDx); - tv_out->axis(-2)->parallelize(ParallelType::TIDy); - tv_out->axis(-3)->parallelize(ParallelType::BIDx); - tv_out->axis(-4)->parallelize(ParallelType::BIDy); - - tv0_cache->setMemoryType(MemoryType::Shared); - tv0_cache->axis(-1)->parallelize(ParallelType::TIDx); - tv0_cache->axis(-2)->parallelize(ParallelType::TIDy); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto ref = t0; - for (const auto& offset : offsets) { - ref = ref + shift(t0, offset); - } - ref = ref / int(offsets.size() + 1); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftMerge1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = shift(tv1, {-1, 1}); - fusion.addOutput(tv2); - - int split_factor = 4; - - tv2->split(-1, split_factor); - tv2->split(0, split_factor); - tv2->reorder({{1, 2}, {2, 1}}); - tv2->merge(2, 3); - - tv0->computeAt(tv2, 2); - - // t1 allocation: (split_factor + 1) * (split_factor + 1) - GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { - auto tensor_name = alloc->buffer()->name(); - if (tensor_name == 1) { - TORCH_CHECK(alloc->shape().size() == 2); - for (int i = 0; i < 2; ++i) { - auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor && rhs_value == 1); - } - } - } - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = 
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = shift(t1, {-1, 1}); - auto ref = t2; - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftMerge2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = shift(tv1, {1, -1}); - auto tv3 = shift(tv1, {-1, 1}); - auto tv4 = add(tv2, tv3); - fusion.addOutput(tv4); - - int split_factor = 4; - - tv4->split(-1, split_factor); - tv4->split(0, split_factor); - tv4->reorder({{1, 2}, {2, 1}}); - tv4->merge(2, 3); - - tv0->computeAt(tv4, -2); - - // t1 allocation: (split_factor + 2) * (split_factor + 2) - GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { - auto tensor_name = alloc->buffer()->name(); - if (tensor_name == 1) { - TORCH_CHECK(alloc->shape().size() == 2); - for (int i = 0; i < 2; ++i) { - auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor && rhs_value == 2); - } - } - } - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = shift(t1, {1, -1}); - auto t3 = shift(t1, {-1, 1}); - auto t4 = t2 + t3; - - TORCH_CHECK(t4.allclose(outputs[0])); -} - -TEST(NVFuserTest, FusionShiftGlobal_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = shift(tv1, {0, 1}); - auto tv3 = shift(tv1, {-1, 0}); - auto tv4 = add(tv2, tv3); - fusion.addOutput(tv4); - - tv1->split(-1, 4); - tv2->split(-1, 8); - tv3->split(-1, 2); - tv4->split(-1, 3); - - tv1->merge(-2, -1); - - tv1->setMemoryType(MemoryType::Global); - tv2->setMemoryType(MemoryType::Global); - tv3->setMemoryType(MemoryType::Global); - - // t1 allocation: (t1.size[0] + 1) * (t1.size[1] + 1) - GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { - auto tensor_name = alloc->buffer()->name(); - if (tensor_name == 1) { - TORCH_CHECK(alloc->shape().size() == 2); - for (int i = 0; i < 2; ++i) { - auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - TORCH_CHECK(def != nullptr && def->operation() == BinaryOpType::Add); - TORCH_CHECK(def->as()->lhs()->isA()); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(rhs_value == 1); - } - } - } - } - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); 
- - auto t1 = t0 + 1; - auto t2 = shift(t1, {0, 1}); - auto t3 = shift(t1, {-1, 0}); - auto t4 = t2 + t3; - auto ref = t4; - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftDoubleSplitMerge1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(2)); - auto tv3 = shift(tv2, {0, 1}); - fusion.addOutput(tv3); - - int split_factor1 = 8; - int split_factor2 = 4; - - tv3->split(-1, split_factor1); - - tv0->computeAt(tv3, -2); - - tv1->split(-1, split_factor2); - tv1->merge(-2, -1); - - // t1 and t2 allocation: (split_factor1 + 1) - GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { - auto tensor_name = alloc->buffer()->name(); - if (tensor_name == 1 || tensor_name == 2) { - TORCH_CHECK(alloc->shape().size() == 1); - auto def = - dynamic_cast(alloc->shape().at(0)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor1 && rhs_value == 1); - } - } - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 3; - auto ref = shift(t1, {0, 1}); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftDoubleSplitMerge2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(2)); - auto tv3 = shift(tv2, {1, 1}); - fusion.addOutput(tv3); - - auto out = tv3; - - int split_factor1 = 32; - int split_factor2 = 4; - - out->split(-1, split_factor1); - out->split(-1, split_factor2); - out->split(0, split_factor1); - out->split(1, split_factor2); - out->reorder({{3, 1}, {1, 2}, {4, 3}, {2, 4}}); - out->merge(2, 3); - out->merge(2, 3); - out->merge(2, 3); - out->merge(0, 1); - - TransformPropagator::from(out); - - tv0->computeAt(out, 1); - - out->axis(0)->parallelize(ParallelType::BIDx); - out->axis(1)->parallelize(ParallelType::TIDx); - - scheduler_utils::parallelizeAllLike(out, {tv1, tv2}); - - for (auto tv : {tv1, tv2}) { - tv->setMemoryType(MemoryType::Shared); - } - - // t1 and t2 allocation: (split_factor1 + 1) * (split_factor1 + 1) - GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { - auto tensor_name = alloc->buffer()->name(); - if (tensor_name == 1 || tensor_name == 2) { - TORCH_CHECK(alloc->shape().size() == 2); - for (int i = 0; i < 2; ++i) { - auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor1 && rhs_value == 1); - } - } - } - } - - FusionExecutor fe; - 
fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto ref = shift(t0 + 1 + 2, {1, 1}); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShift5ptStencilParallel1DThreadBlock_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // 5-pt stencil - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - std::vector> offsets = {{-1, 0}, {1, 0}, {0, -1}, {0, 1}}; - - std::vector tvs; - for (const auto& offset : offsets) { - tvs.push_back(shift(tv0, offset)); - } - - auto tv_out = tv0; - - for (auto tv : tvs) { - tv_out = add(tv_out, tv); - } - - tv_out = div(tv_out, new Double(tvs.size() + 1)); - - fusion.addOutput(tv_out); - - std::vector split_factor({4, 32}); - - tv_out->split(-1, split_factor[1]); - tv_out->split(0, split_factor[0]); - tv_out->reorder({{1, 2}, {2, 1}}); - - auto tv0_cache = tv0->cache_after(); - - // Merge the inner-most two axes and create - // a 1D thread block of split_factor1*split_factor2 threads - tv_out->merge(-2, -1); - - tv0->computeAt(tv_out, 2); - - // Inline completely except for the cache - for (auto tv : tvs) { - tv->computeAt(tv_out, -1); - } - - tv0_cache->merge(-2, -1); - - tv_out->axis(-1)->parallelize(ParallelType::TIDx); - tv_out->axis(1)->parallelize(ParallelType::BIDx); - tv_out->axis(0)->parallelize(ParallelType::BIDy); - - tv0_cache->setMemoryType(MemoryType::Shared); - tv0_cache->axis(-1)->parallelize(ParallelType::TIDx); - - // cache allocation: (split_factor1 + 2) * (split_factor2 + 2) - GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { - auto tensor_name = alloc->buffer()->name(); - if (tensor_name == tv0_cache->name()) { - TORCH_CHECK(alloc->shape().size() == 2); - for (int i = 0; i < 2; ++i) { - auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor[i] && rhs_value == 2); - } - } - } - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto ref = t0; - for (const auto& offset : offsets) { - ref = ref + shift(t0, offset); - } - ref = ref / int(offsets.size() + 1); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftChain1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = shift(tv0, {0, 1}); - auto tv2 = shift(tv1, {0, 1}); - fusion.addOutput(tv2); - - int split_factor = 4; - tv2->split(-1, split_factor); - - tv0->computeAt(tv2, -2); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = 
fe.runFusion(inputs); - - auto ref = shift(shift(t0, {0, 1}), {0, 1}); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftChain2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = shift(tv0, {0, 1}); - auto tv2 = shift(tv1, {0, -1}); - fusion.addOutput(tv2); - - tv2->split(-1, 4); - - tv0->computeAt(tv2, -2); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto ref = shift(shift(t0, {0, 1}), {0, -1}); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftChain3_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = shift(tv1, {0, 1}); - auto tv3 = shift(tv2, {0, 1}); - fusion.addOutput(tv3); - - int split_factor = 4; - tv3->split(-1, split_factor); - - tv0->computeAt(tv3, -2); - - // Halo size of tv1 is 2 as it needs to account for both of the two - // shift operations , while that of tv2 is still just 1 - - // tv1: (split_factor + 2) - // tv2: (split_factor + 1) - GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { - auto tensor_name = alloc->buffer()->name(); - if (tensor_name == 1 || tensor_name == 2) { - TORCH_CHECK(alloc->shape().size() == 1); - for (int i = 0; i < 1; ++i) { - auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor); - if (tensor_name == 1) { - TORCH_CHECK(rhs_value == 2); - } else if (tensor_name == 2) { - TORCH_CHECK(rhs_value == 1); - } - } - } - } - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = shift(t1, {0, 1}); - auto t3 = shift(t2, {0, 1}); - auto ref = t3; - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftChain4_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = shift(tv0, {1, -1}); - auto tv2 = shift(tv1, {2, -2}); - auto tv3 = shift(tv2, {3, -3}); - auto tv4 = shift(tv3, {4, -4}); - auto tv_out = tv4; - - fusion.addOutput(tv_out); - - int split_factor = 4; - - tv_out->split(-1, split_factor); - tv_out->split(0, split_factor); - tv_out->reorder({{1, 2}, {2, 1}}); - - tv0->computeAt(tv_out, 2); - - tv1->merge(-2, -1); - tv2->merge(-2, -1); - tv3->merge(-2, -1); - - // tv1: (split_factor + 9) * (split_factor + 9) - // tv2: (split_factor + 7) * (split_factor + 7) - // tv3: (split_factor + 4) * (split_factor + 4) - GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = 
dynamic_cast(kir_node.get())) { - auto tensor_name = alloc->buffer()->name(); - if (tensor_name == 1 || tensor_name == 2) { - TORCH_CHECK(alloc->shape().size() == 2); - for (int i = 0; i < 2; ++i) { - auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor); - if (tensor_name == 1) { - TORCH_CHECK(rhs_value == 9); - } else if (tensor_name == 2) { - TORCH_CHECK(rhs_value == 7); - } else if (tensor_name == 3) { - TORCH_CHECK(rhs_value == 4); - } - } - } - } - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto t1 = shift(t0, {1, -1}); - auto t2 = shift(t1, {2, -2}); - auto t3 = shift(t2, {3, -3}); - auto t4 = shift(t3, {4, -4}); - auto ref = t4; - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShift5ptStencilChain_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - std::vector> offsets = {{-1, 0}, {1, 0}, {0, -1}, {0, 1}}; - - // First stencil: 5pt stencil - // stencil1 = (tv0 + tv0[+1][0] + tv0[-1][0] + tv0[0][+1] + tv0[0][-1]) / 5 - std::vector tv_stencil1_shifts; - for (const auto& offset : offsets) { - tv_stencil1_shifts.push_back(shift(tv0, offset)); - } - - auto tv_stencil1 = tv0; - for (auto tv : tv_stencil1_shifts) { - tv_stencil1 = add(tv_stencil1, tv); - } - - tv_stencil1 = div(tv_stencil1, new Double(tv_stencil1_shifts.size() + 1)); - - // Second stencil: Same 5pt stencil - std::vector tv_stencil2_shifts; - for (const auto& offset : offsets) { - tv_stencil2_shifts.push_back(shift(tv_stencil1, offset)); - } - - auto tv_stencil2 = tv_stencil1; - for (auto tv : tv_stencil2_shifts) { - tv_stencil2 = add(tv_stencil2, tv); - } - - tv_stencil2 = div(tv_stencil2, new Double(tv_stencil2_shifts.size() + 1)); - - auto tv_out = tv_stencil2; - - fusion.addOutput(tv_out); - - auto tv0_cache = tv0->cache_after(); - - std::vector split_factor({16, 16}); - - tv_out->split(-1, split_factor[1]); - tv_out->split(0, split_factor[0]); - tv_out->reorder({{1, 2}, {2, 1}}); - - tv0->computeAt(tv_out, 2); - - // Inline completely all inputs to the first stencil output, except for the - // tv0 cache - for (auto tv : tv_stencil1_shifts) { - tv->computeAt(tv_stencil1, -1); - } - - // Inline completely all inputs to the second stencil output, except - // for the first stencil output - for (auto tv : tv_stencil2_shifts) { - tv->computeAt(tv_stencil2, -1); - } - - tv_out->axis(1)->parallelize(ParallelType::BIDx); - tv_out->axis(0)->parallelize(ParallelType::BIDy); - - auto all_values = DependencyCheck::getAllValsBetween( - {fusion.inputs().begin(), fusion.inputs().end()}, fusion.outputs()); - for (auto tv : ir_utils::filterByType(all_values)) { - tv->axis(-1)->parallelize(ParallelType::TIDx); - tv->axis(-2)->parallelize(ParallelType::TIDy); - } - - tv0_cache->setMemoryType(MemoryType::Shared); - tv_stencil1->setMemoryType(MemoryType::Shared); - - // tv0_cache: (split_factor + 4) * (split_factor + 4) - // tv_stencil1: (split_factor + 2) * 
(split_factor + 2) - GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { - auto tensor_name = alloc->buffer()->name(); - if (tensor_name == tv0_cache->name() || - tensor_name == tv_stencil1->name()) { - TORCH_CHECK(alloc->shape().size() == 2); - for (int i = 0; i < 2; ++i) { - auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor[i]); - if (tensor_name == tv0_cache->name()) { - TORCH_CHECK(rhs_value == 4); - } else if (tensor_name == tv_stencil1->name()) { - TORCH_CHECK(rhs_value == 2); - } - } - } - } - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto stencil1 = t0; - for (const auto& offset : offsets) { - stencil1 = stencil1 + shift(t0, offset); - } - stencil1 = stencil1 / int(offsets.size() + 1); - auto stencil2 = stencil1; - for (const auto& offset : offsets) { - stencil2 = stencil2 + shift(stencil1, offset); - } - stencil2 = stencil2 / int(offsets.size() + 1); - auto ref = stencil2; - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -// Shift a reduced tensor -TEST(NVFuserTest, FusionShiftReduction1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = sum(tv1, {1}); - auto tv3 = shift(tv2, {1}); - fusion.addOutput(tv3); - - tv3->split(0, 4); - tv0->computeAt(tv3, 1); - tv0->computeAt(tv2, -1); - - const int numel_x = 9; - const int numel_y = 11; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = sum(t1, {1}); - auto t3 = shift(t2, {1}); - auto ref = t3; - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -// Parallelized version of FusionShiftReduction1 -TEST(NVFuserTest, FusionShiftReduction2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = sum(tv1, {1}); - auto tv3 = shift(tv2, {1}); - fusion.addOutput(tv3); - - tv3->split(0, 4); - tv0->computeAt(tv3, 1); - - tv2->split(-1, 32); - tv0->computeAt(tv2, -1); - - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - tv2->setMemoryType(MemoryType::Shared); - - const int numel_x = 201; - const int numel_y = 301; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = sum(t1, {1}); - auto t3 = shift(t2, {1}); - auto ref = t3; - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftRfactor1_CUDA) { - Fusion fusion; - 
FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = sum(tv1, {1}); - auto tv3 = shift(tv2, {1}); - fusion.addOutput(tv3); - - tv3->split(0, 4); - tv0->computeAt(tv3, 1); - - tv2->split(-1, 32); - auto rf = tv2->rFactor({-2}); - tv0->computeAt(tv2, -1); - tv0->computeAt(rf, -1); - - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - tv2->setMemoryType(MemoryType::Shared); - - const int numel_x = 201; - const int numel_y = 301; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = sum(t1, {1}); - auto t3 = shift(t2, {1}); - auto ref = t3; - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftBcast1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - auto tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - auto tv2 = broadcast(tv0, {false, true}); - auto tv3 = shift(tv2, {0, 1}); - auto tv4 = add(tv3, tv1); - fusion.addOutput(tv4); - - tv0->computeAt(tv4, -1); - tv1->computeAt(tv4, -1); - - const int numel_x = 9; - const int numel_y = 11; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x}, options); - at::Tensor t1 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t4 = t0.unsqueeze(-1).expand({numel_x, numel_y}) + t1; - auto ref = t4; - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftBcast2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - auto tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - auto tv2 = broadcast(tv0, {false, true}); - auto tv3 = shift(tv2, {1, 0}); - auto tv4 = add(tv3, tv1); - fusion.addOutput(tv4); - - tv4->split(0, 4); - tv0->computeAt(tv4, 1); - - const int numel_x = 9; - const int numel_y = 11; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x}, options); - at::Tensor t1 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t2 = t0.unsqueeze(-1).expand({numel_x, numel_y}); - auto t3 = shift(t2, {1, 0}); - auto ref = t3 + t1; - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -// Combine ShiftBcast1 and ShiftBcast2 with parallelization -TEST(NVFuserTest, FusionShiftBcast3_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - auto tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - auto tv2 = broadcast(tv0, {false, true}); - auto tv3 = shift(tv2, {1, 0}); - auto tv4 = shift(tv2, {0, 1}); - auto tv5 = shift(tv2, {-1, -1}); - auto tv6 = add(tv3, tv4); - auto tv7 = add(tv6, tv5); - auto tv8 = add(tv7, tv1); - fusion.addOutput(tv8); - - tv8->split(0, 4); - tv8->split(-1, 4); - tv0->computeAt(tv8, 1); - - tv8->axis(-1)->parallelize(ParallelType::TIDx); - for (auto tv : {tv8, tv7, tv6, tv5, tv4, tv3, tv2}) { - tv->axis(1)->parallelize(ParallelType::TIDy); - } 
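// Note on the reference computation below: an offset along the broadcast
// axis is effectively dropped, so t4 reuses t2 unshifted and t5 applies only
// the {-1, 0} component of the {-1, -1} offset requested for tv5.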
- - tv2->setMemoryType(MemoryType::Shared); - - const int numel_x = 101; - const int numel_y = 201; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x}, options); - at::Tensor t1 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t2 = t0.unsqueeze(-1).expand({numel_x, numel_y}); - auto t3 = shift(t2, {1, 0}); - auto t4 = t2; - auto t5 = shift(t2, {-1, 0}); - auto ref = t3 + t4 + t5 + t1; - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -// See issue #893 -TEST(NVFuserTest, FusionShiftSyncPlacement1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv0, new Double(2)); - auto tv3 = add(tv1, tv2); - auto tv4 = shift(tv3, {0, 1}); - fusion.addOutput(tv4); - - tv4->split(1, 8); - tv0->computeAt(tv4, 2); - - tv2->computeAt(tv3, -1); - - tv1->setMemoryType(MemoryType::Shared); - tv3->setMemoryType(MemoryType::Shared); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = t0 + 2; - auto t3 = add(t1, t2); - auto t4 = shift(t3, {0, 1}); - - testValidate(&fusion, outputs, inputs, {t4}, __LINE__, __FILE__); -} - -// See issue #893. Top-level placement. -TEST(NVFuserTest, FusionShiftSyncPlacement2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv0, new Double(2)); - auto tv3 = add(tv1, tv2); - auto tv4 = shift(tv3, {1}); - fusion.addOutput(tv4); - - tv2->computeAt(tv3, -1); - - tv1->setMemoryType(MemoryType::Shared); - tv3->setMemoryType(MemoryType::Shared); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = t0 + 2; - auto t3 = add(t1, t2); - auto t4 = shift(t3, {1}); - - testValidate(&fusion, outputs, inputs, {t4}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftSyncPlacement3_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(2)); - auto tv3 = shift(tv2, {1}); - fusion.addOutput(tv3); - - // This doesn't work. syncthreads is needed between tv1 and tv2, but - // both the loop extent of both tv1 and tv2 has halo, so the loop is - // not eliminated even though it is parallelized. Moving syncthreads - // out of the loop would make it placed before tv1, which would make - // it meaningless. 
- // Ideally, an exception should be thrown at this computeAt, but at - // this point, the fusion is not yet parallelized, nor memory type - // is set, so this computeAt itself is not an error yet. - tv1->computeAt(tv2, -1); - - tv1->setMemoryType(MemoryType::Shared); - tv2->setMemoryType(MemoryType::Shared); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - // The error should be detected when the fusion is lowered. - ASSERT_ANY_THROW(fusion.printKernel()); -} - -// Based on original CUDA provided by Vishal Mehta. -// Major differences with the original version: -// - The original version uses additional 2 warps to load the halos -// along the Y dimension. The other 10 warps are used to load a 32x10 -// tile, and all warps will do coalesced loads. No such optimization -// is done in the fuser version. -TEST(NVFuserTest, FusionHdiff_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto inp = makeSymbolicTensor(3); - fusion.addInput(inp); - auto coeff = makeSymbolicTensor(3); - fusion.addInput(coeff); - - std::vector> offsets{ - {0, 1, 0}, {0, -1, 0}, {0, 0, 1}, {0, 0, -1}}; - - // T2, T3, T4, T5 - std::vector inp_neighbors; - for (const auto& offset : offsets) { - inp_neighbors.push_back(shift(inp, offset, false)); - } - - // T8 - TensorView* sum_of_neighbors = nullptr; - for (auto inp_neighbor : inp_neighbors) { - if (sum_of_neighbors == nullptr) { - sum_of_neighbors = inp_neighbor; - } else { - sum_of_neighbors = add(sum_of_neighbors, inp_neighbor); - } - } - - // T9 = T0 * 4 - // T10 = T9 - T8 - auto lap = sub(mul(inp, new Double(4)), sum_of_neighbors); - - // T11 = shift(T10) - // T12 = T11 - T10 - auto flx = sub(shift(lap, {0, 0, -1}, false), lap); - // T14 = T13 - T0 - // T15 = T12 * T14 - // T16 = T15 > 0 - // T17 = T16 ? 0 : T12 - auto flx_cond = - gt(mul(flx, sub(shift(inp, {0, 0, -1}, false), inp)), new Double(0)); - auto flx0 = where(flx_cond, new Double(0), flx); - - // T18 = shift(T10) - // T19 = T18 - T10 - auto fly = sub(shift(lap, {0, -1, 0}, false), lap); - // T20 = shift(T0) - // T21 = T20 - T0 - // T22 = T19 * T21 - // T23 = T22 > 0 - auto fly_cond = - gt(mul(fly, sub(shift(inp, {0, -1, 0}, false), inp)), new Double(0)); - // T24 = T23 ? 
0 : T19 - auto fly0 = where(fly_cond, new Double(0), fly); - - // T25 = shift(flx0) - // T26 = T17 - T25 - // T27 = shift(fly0) - // T28 = T24 - T27 - // T29 = T26 + T28 - // T30 = T1 * T29 - // T31 = T0 - T30 - auto out = - sub(inp, - mul(coeff, - add(sub(flx0, shift(flx0, {0, 0, 1}, false)), - sub(fly0, shift(fly0, {0, 1, 0}, false))))); - - fusion.addOutput(out); - - ///////////////////////////////// - // Scheduling - ///////////////////////////////// - - out->setContiguity(false); - - // Step 1: 2D Tiling - - const int tile_x = 32; - const int tile_y = 8; - - out->split(-1, tile_x); - out->split(-3, tile_y); - out->reorder({{-2, -3}}); - inp->computeAt(out, -3); - coeff->computeAt(out, -3); - - // Step 2: Inlining - - // Inline inputs to lap - auto lap_vals = DependencyCheck::getAllValsBetween({inp}, {lap}); - for (auto val : ir_utils::filterByType(lap_vals)) { - if (val != lap && val != inp) { - val->computeAt(lap, -1); - } - } - - // Inline inputs to flx0 - auto flx0_vals = DependencyCheck::getAllValsBetween({lap, inp}, {flx0}); - for (auto val : ir_utils::filterByType(flx0_vals)) { - if (val != lap && val != flx0 && val != inp) { - val->computeAt(flx0, -1); - } - } - - // Inline inputs to fly0 - auto flxy_vals = DependencyCheck::getAllValsBetween({lap, inp}, {fly0}); - for (auto val : ir_utils::filterByType(flxy_vals)) { - if (val != lap && val != fly0 && val != inp) { - val->computeAt(fly0, -1); - } - } - - // Inline inputs to out - auto out_vals = DependencyCheck::getAllValsBetween({flx0, fly0}, {out}); - for (auto val : ir_utils::filterByType(out_vals)) { - if (val != flx0 && val != fly0 && val != out) { - val->computeAt(out, -1); - } - } - - // Step 3: Parallelization - - // Block parallelization - out->axis(0)->parallelize(ParallelType::BIDz); - out->axis(1)->parallelize(ParallelType::BIDy); - out->axis(2)->parallelize(ParallelType::BIDx); - // Thread parallelization - out->axis(3)->parallelize(ParallelType::TIDy); - out->axis(4)->parallelize(ParallelType::TIDx); - // Apply the same parallelization to all other tensors - scheduler_utils::parallelizeAllLike(out, ir_utils::allTvs(&fusion)); - - // Store intermediate stencil results on smem so that they can be - // accessed by threads - for (auto tv : {flx0, fly0, lap}) { - tv->setMemoryType(MemoryType::Shared); - } - - ///////////////////////////////// - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 101; - int numel_y = 99; - int numel_z = 10; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor inp_at = at::randn({numel_z, numel_y, numel_x}, options); - at::Tensor coeff_at = at::randn({numel_z, numel_y, numel_x}, options); - std::vector inputs = {inp_at, coeff_at}; - auto fuser_output = fe.runFusion(inputs)[0]; - // Trim the outer rim - std::vector indices{ - at::indexing::Slice(0, at::indexing::None), - at::indexing::Slice(2, -2), - at::indexing::Slice(2, -2)}; - fuser_output = fuser_output.index(indices); - - { - at::Tensor zeros = at::zeros({numel_z, numel_y, numel_x}, options); - auto lap = inp_at * 4 - - (shift(inp_at, {0, 1, 0}) + shift(inp_at, {0, -1, 0}) + - shift(inp_at, {0, 0, 1}) + shift(inp_at, {0, 0, -1})); - auto flx = shift(lap, {0, 0, -1}) - lap; - auto flx_cond = (flx * (shift(inp_at, {0, 0, -1}) - inp_at)) > 0; - auto flx0 = at::where(flx_cond, zeros, flx); - auto fly = shift(lap, {0, -1, 0}) - lap; - auto fly_cond = (fly * (shift(inp_at, {0, -1, 0}) - inp_at)) > 0; - auto fly0 = at::where(fly_cond, zeros, fly); - - auto ref = inp_at - - coeff_at * - 
((flx0 - shift(flx0, {0, 0, 1})) + (fly0 - shift(fly0, {0, 1, 0}))); - ref = ref.index(indices); - - testValidate(&fusion, {fuser_output}, inputs, {ref}, __LINE__, __FILE__); - } -} - -TEST(NVFuserTest, FusionHdiffPartialSplitUnswitch_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto inp = makeSymbolicTensor(3); - fusion.addInput(inp); - auto coeff = makeSymbolicTensor(3); - fusion.addInput(coeff); - - std::vector> offsets{ - {0, 1, 0}, {0, -1, 0}, {0, 0, 1}, {0, 0, -1}}; - - // T2, T3, T4, T5 - std::vector inp_neighbors; - for (const auto& offset : offsets) { - inp_neighbors.push_back(shift(inp, offset, false)); - } - - // T8 - TensorView* sum_of_neighbors = nullptr; - for (auto inp_neighbor : inp_neighbors) { - if (sum_of_neighbors == nullptr) { - sum_of_neighbors = inp_neighbor; - } else { - sum_of_neighbors = add(sum_of_neighbors, inp_neighbor); - } - } - - // T9 = T0 * 4 - // T10 = T9 - T8 - auto lap = sub(mul(inp, new Double(4)), sum_of_neighbors); - - // T11 = shift(T10) - // T12 = T11 - T10 - auto flx = sub(shift(lap, {0, 0, -1}, false), lap); - // T14 = T13 - T0 - // T15 = T12 * T14 - // T16 = T15 > 0 - // T17 = T16 ? 0 : T12 - auto flx_cond = - gt(mul(flx, sub(shift(inp, {0, 0, -1}, false), inp)), new Double(0)); - auto flx0 = where(flx_cond, new Double(0), flx); - - // T18 = shift(T10) - // T19 = T18 - T10 - auto fly = sub(shift(lap, {0, -1, 0}, false), lap); - // T20 = shift(T0) - // T21 = T20 - T0 - // T22 = T19 * T21 - // T23 = T22 > 0 - auto fly_cond = - gt(mul(fly, sub(shift(inp, {0, -1, 0}, false), inp)), new Double(0)); - // T24 = T23 ? 0 : T19 - auto fly0 = where(fly_cond, new Double(0), fly); - - // T25 = shift(flx0) - // T26 = T17 - T25 - // T27 = shift(fly0) - // T28 = T24 - T27 - // T29 = T26 + T28 - // T30 = T1 * T29 - // T31 = T0 - T30 - auto out = - sub(inp, - mul(coeff, - add(sub(flx0, shift(flx0, {0, 0, 1}, false)), - sub(fly0, shift(fly0, {0, 1, 0}, false))))); - - fusion.addOutput(out); - - out->setContiguity(false); - - ///////////////////////////////// - // Scheduling - ///////////////////////////////// - - const auto all_vals = fusion.usedMathVals(); - const std::vector all_tensors( - {ir_utils::filterByType(all_vals).begin(), - ir_utils::filterByType(all_vals).end()}); - - // Step 1: Blocking - // - Thread block size: (tile_x, tile_y) - // - Each thread computes a vertical column of length tile_z along the Z - // axis. 
- // - Grid dize: (NX / block_x, NY / block_y, NZ / tile_z) - - const int tile_x = 32; - const int tile_y = 8; - const int tile_z = 16; - - out->split(0, tile_z); - out->split(-1, tile_x, true, true); - out->split(-3, tile_y, true, true); - // out: [NZ/tz, tz, NY/by, by, NX/bx, bx] - out->reorder({{1, 3}, {2, 1}, {3, 4}, {4, 2}}); - // out: [NZ/tz, NY/by, NX/bx, tz, by, bx] - - TransformPropagator::from(out); - - inp->computeAt(out, 4); - - // Step 2: Inlining - - // Inline inputs to lap - auto lap_vals = DependencyCheck::getAllValsBetween({inp}, {lap}); - for (auto val : ir_utils::filterByType(lap_vals)) { - if (val != lap && val != inp) { - val->computeAt(lap, -1); - } - } - - // Inline inputs to flx0 - auto flx0_vals = DependencyCheck::getAllValsBetween({lap, inp}, {flx0}); - for (auto val : ir_utils::filterByType(flx0_vals)) { - if (val != lap && val != flx0 && val != inp) { - val->computeAt(flx0, -1); - } - } - - // Inline inputs to fly0 - auto flxy_vals = DependencyCheck::getAllValsBetween({lap, inp}, {fly0}); - for (auto val : ir_utils::filterByType(flxy_vals)) { - if (val != lap && val != fly0 && val != inp) { - val->computeAt(fly0, -1); - } - } - - // Inline inputs to out - auto out_vals = DependencyCheck::getAllValsBetween({flx0, fly0}, {out}); - for (auto val : ir_utils::filterByType(out_vals)) { - if (val != flx0 && val != fly0 && val != out) { - val->computeAt(out, -1); - } - } - - // Step 3: Parallelization - - // Block parallelization - out->axis(0)->parallelize(ParallelType::BIDz); - out->axis(1)->parallelize(ParallelType::BIDy); - out->axis(2)->parallelize(ParallelType::BIDx); - out->axis(4)->parallelize(ParallelType::TIDy); - out->axis(5)->parallelize(ParallelType::TIDx); - // Unswitch at the tz axis - out->axis(3)->parallelize(ParallelType::Unswitch); - - scheduler_utils::parallelizeAllLike(out, all_tensors); - - // These need to be on smem - for (auto tv : {flx0, fly0, lap}) { - tv->setMemoryType(MemoryType::Shared); - } - - ///////////////////////////////// - FusionExecutor fe; - fe.compileFusion(&fusion); - - const int halo_extent = 2; - const int numel_x = 64 + halo_extent * 2; - const int numel_y = 64 + halo_extent * 2; - const int numel_z = 32; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor inp_at = at::randn({numel_z, numel_y, numel_x}, options); - at::Tensor coeff_at = at::randn({numel_z, numel_y, numel_x}, options); - std::vector inputs = {inp_at, coeff_at}; - auto fuser_output = fe.runFusion(inputs)[0]; - // Trim the outer rim - std::vector indices{ - at::indexing::Slice(0, at::indexing::None), - at::indexing::Slice(2, -2), - at::indexing::Slice(2, -2)}; - fuser_output = fuser_output.index(indices); - - { - at::Tensor zeros = at::zeros({numel_z, numel_y, numel_x}, options); - auto lap = inp_at * 4 - - (shift(inp_at, {0, 1, 0}) + shift(inp_at, {0, -1, 0}) + - shift(inp_at, {0, 0, 1}) + shift(inp_at, {0, 0, -1})); - auto flx = shift(lap, {0, 0, -1}) - lap; - auto flx_cond = (flx * (shift(inp_at, {0, 0, -1}) - inp_at)) > 0; - auto flx0 = at::where(flx_cond, zeros, flx); - auto fly = shift(lap, {0, -1, 0}) - lap; - auto fly_cond = (fly * (shift(inp_at, {0, -1, 0}) - inp_at)) > 0; - auto fly0 = at::where(fly_cond, zeros, fly); - - auto ref = inp_at - - coeff_at * - ((flx0 - shift(flx0, {0, 0, 1})) + (fly0 - shift(fly0, {0, 1, 0}))); - ref = ref.index(indices); - - testValidate(&fusion, {fuser_output}, inputs, {ref}, __LINE__, __FILE__); - } -} - -// 3x3 max pooling -TEST(NVFuserTest, FusionMaxPooling_CUDA) { - Fusion 
fusion; - FusionGuard fg(&fusion); - - // Format: CHW - auto inp = makeSymbolicTensor(3); - fusion.addInput(inp); - - // 3x3 pooling of the HW spatial domain - std::vector> offsets; - for (int i = -1; i <= 1; ++i) { - for (int j = -1; j <= 1; ++j) { - if (i == 0 && j == 0) { - continue; - } - offsets.push_back({i, j}); - } - } - - std::vector inp_tile({inp}); - for (auto offset : offsets) { - offset.insert(offset.begin(), 0); - inp_tile.push_back(shift(inp, offset)); - } - - TensorView* max_tensor = nullptr; - for (auto tv : inp_tile) { - if (max_tensor == nullptr) { - max_tensor = tv; - } else { - max_tensor = binaryOp(BinaryOpType::Max, max_tensor, tv); - } - } - - fusion.addOutput(max_tensor); - - //////////////////////////////////// - - // Cache the input and weight tensors - auto inp_cache = inp->cache_after(); - - // Tiling the spatial domain - const int tile_x = 32; - const int tile_y = 8; - - max_tensor->split(-2, tile_y); - max_tensor->axis(-2)->parallelize(ParallelType::TIDy); - max_tensor->split(-1, tile_x); - max_tensor->axis(-1)->parallelize(ParallelType::TIDx); - max_tensor->reorder({{-3, -2}}); - - inp_cache->computeAt(max_tensor, 3); - inp_cache->axis(-2)->parallelize(ParallelType::TIDy); - inp_cache->axis(-1)->parallelize(ParallelType::TIDx); - inp_cache->setMemoryType(MemoryType::Shared); - - auto max_tensor_dep = - DependencyCheck::getAllValsBetween({inp_cache}, {max_tensor}); - for (auto tv : ir_utils::filterByType(max_tensor_dep)) { - if (tv == inp_cache || tv == max_tensor) { - continue; - } - tv->computeAt(max_tensor, -1); - } - - max_tensor->axis(0)->parallelize(ParallelType::BIDx); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - const int hw = 50; - const int num_channels = 20; - const int pooling_window = 3; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_inp = at::randn({num_channels, hw, hw}, options); - // shift always pads by zero, so if all surrounding values are - // negative, max pooling would pick a padded value, which isn't the - // correct behavior. We need to be able to choose the value of - // padding. In this case, padding by the minimum value would not - // have this problem. For now, avoid the problem by making sure all - // values are not negative. 
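// A standalone, hedged sketch of the zero-padding behavior described above,
// assuming the convention out[i] = in[i - offset] with out-of-range reads
// producing 0. shift1d below is illustrative only; it is not the helper that
// these tests actually call.
#include <ATen/ATen.h>

at::Tensor shift1d(const at::Tensor& in, int64_t offset) {
  auto out = at::zeros_like(in);
  const int64_t n = in.size(0);
  const int64_t len = n - (offset >= 0 ? offset : -offset);
  if (len <= 0) {
    return out; // everything shifted out of range; the result is all padding
  }
  if (offset >= 0) {
    // out[offset:] = in[:n - offset]
    out.narrow(0, offset, len).copy_(in.narrow(0, 0, len));
  } else {
    // out[:n + offset] = in[-offset:]
    out.narrow(0, 0, len).copy_(in.narrow(0, -offset, len));
  }
  return out;
}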
- aten_inp = at::abs(aten_inp); - std::vector inputs = {aten_inp}; - - auto outputs = fe.runFusion(inputs); - - auto ref = at::max_pool2d( - aten_inp, {pooling_window, pooling_window}, {1, 1}, {1, 1}); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionGatherPadding1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - const std::vector window_shape = {1, 3}; - const std::vector> padding_width = {{0, 0}, {1, 1}}; - - auto tv1 = gather(tv0, window_shape, padding_width); - - fusion.addOutput(tv1); - - const int s1 = 11; - const int s2 = 13; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({s1, s2}, options); - - auto ref = gather(t0, window_shape, padding_width); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t0}); - - TORCH_CHECK(ref.equal(outputs[0])); -} - -TEST(NVFuserTest, FusionGatherPadding2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const std::vector window_shape = {1, 3}; - const std::vector> padding_width = {{0, 0}, {1, 1}}; - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - - auto tv2 = gather(tv1, window_shape, padding_width); - - auto tv3 = sum(tv2, {-1}); - - fusion.addOutput(tv3); - - tv3->split(1, 32); - tv0->computeAt(tv3, 2); - tv2->computeAt(tv3, -1); - - tv3->axis(0)->parallelize(ParallelType::BIDy); - tv3->axis(1)->parallelize(ParallelType::BIDx); - tv3->axis(2)->parallelize(ParallelType::TIDx); - tv1->axis(2)->parallelize(ParallelType::TIDx); - - tv1->setMemoryType(MemoryType::Shared); - - const int s1 = 99; - const int s2 = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({s1, s2}, options); - std::vector inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = gather(t1, window_shape, padding_width); - auto ref = sum(t2, {-1}); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionConv2DStatic_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Input: [C, H, W] - auto inp = makeSymbolicTensor(3); - fusion.addInput(inp); - - // Weights: [K, C, 3, 3] - auto w = makeSymbolicTensor(4); - fusion.addInput(w); - - // Gather a neighbor tile of [3, 3] with padding size of 1 for each - // side of the spatial dimensions - auto inp_tile = gather(inp, {1, 3, 3}, {{0, 0}, {1, 1}, {1, 1}}); - // inp_tile: [C, H, W, 1, 3, 3] - - auto inp_bc = - broadcast(inp_tile, {true, false, false, false, false, false, false}); - auto w_bc = broadcast(w, {false, false, true, true, true, false, false}); - - auto inp_times_w = mul(inp_bc, w_bc); - - // Reduce the channel and neighbor tile dimensions - auto out = sum(inp_times_w, {1, 4, 5, 6}); - - fusion.addOutput(out); - - //////////////////////////////////// - - // Cache the input and weight tensors - auto inp_cache = inp->cache_after(); - - // Blocking the spatial dimensions - const int block_w = 16; - const int block_h = 4; - // Blocking the channel dimension - const int block_c = 8; - - out->split(2, block_h); - out->split(4, block_w); - out->reorder({{3, 4}}); - // out: [K, C, Ho, Wo, Hi, Wi, 1, 3, 3] - - out->split(1, block_c); - // out: [K, Co, Ci, Ho, Wo, Hi, Wi, 1, 3, 3] - - auto out_rf = out->rFactor({1, -3, -2, -1}); - // out_rf: [K, rCo, Ci, Ho, Wo, Hi, Wi, 1, 3, 3] - // 
out_rf: [K, Ci, Ho, Wo, Hi, Wi] - - // Create a [block_x, block_y] tile on smem - inp_cache->computeAt(out, 4); - // inp_cache: [Co, Ho, Wo, Ci, Hi, Wi] - inp_cache->setMemoryType(MemoryType::Shared); - - // Move Ci forward - out_rf->reorder({{-4, -6}, {-5, -4}, {-6, -5}}); - inp_cache->computeAt(out_rf, 5); - - inp_tile->computeAt(out_rf, -1); - w->computeAt(out_rf, -1); - - out->axis(0)->parallelize(ParallelType::BIDx); - out->axis(1)->parallelize(ParallelType::TIDz); - out->axis(4)->parallelize(ParallelType::TIDy); - out->axis(5)->parallelize(ParallelType::TIDx); - - scheduler_utils::parallelizeAllLike(out, {inp_cache, out_rf}); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - const int dim_h = 99; - const int dim_w = 101; - const int dim_c = 10; - const int dim_f = 20; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor at_inp = at::randn({dim_c, dim_h, dim_w}, options); - at::Tensor at_w = at::randn({dim_f, dim_c, 3, 3}, options); - std::vector inputs = {at_inp, at_w}; - - auto cg_outputs = fe.runFusion(inputs); - - at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis - auto at_out = at::conv2d(at_inp, at_w, {}, 1, 1); - at_out = at_out.squeeze(0); // drop the N axis - - testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__); -} - -// Mostly the same as the static conv test, but the shape of the weights, -// 3x3 in this case, is given dynamically -TEST(NVFuserTest, FusionConv2DDynamic_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Input: [C, H, W] - auto inp = makeSymbolicTensor(3); - fusion.addInput(inp); - - // Weights: [K, C, S, T] - auto w = makeSymbolicTensor(4); - fusion.addInput(w); - - auto w_h = new Int(); - fusion.addInput(w_h); - auto w_w = new Int(); - fusion.addInput(w_w); - - auto pad_h = new Int(); - fusion.addInput(pad_h); - auto pad_w = new Int(); - fusion.addInput(pad_w); - - // Gather a neighbor tile of [w_dim_h, w_dim_w] with padding - auto inp_tile = gather( - inp, - {new Int(1), w_h, w_w}, - {{new Int(0), new Int(0)}, {pad_h, pad_h}, {pad_w, pad_w}}); - // inp_tile: [C, 1, H - w_h + 1, W - w_w + 1, w_h, w_w] - - auto inp_bc = - broadcast(inp_tile, {true, false, false, false, false, false, false}); - auto w_bc = broadcast(w, {false, false, true, true, true, false, false}); - - auto inp_times_w = mul(inp_bc, w_bc); - - // Reduce the channel and neighbor tile dimensions - auto out = sum(inp_times_w, {1, 4, 5, 6}); - - fusion.addOutput(out); - - //////////////////////////////////// - // Cache the input and weight tensors - auto inp_cache = inp->cache_after(); - - // Blocking the spatial dimensions - const int block_w = 16; - const int block_h = 4; - // Blocking the channel dimension - const int block_c = 8; - - out->split(2, block_h); - out->split(4, block_w); - out->reorder({{3, 4}}); - // out: [K, C, Ho, Wo, Hi, Wi, 1, 3, 3] - - out->split(1, block_c); - // out: [K, Co, Ci, Ho, Wo, Hi, Wi, 1, 3, 3] - - auto out_rf = out->rFactor({1, -3, -2, -1}); - // out_rf: [K, rCo, Ci, Ho, Wo, Hi, Wi, 1, 3, 3] - // out_rf: [K, Ci, Ho, Wo, Hi, Wi] - - // Create a [block_x, block_y] tile on smem - inp_cache->computeAt(out, 4); - // inp_cache: [Co, Ho, Wo, Ci, Hi, Wi] - inp_cache->setMemoryType(MemoryType::Shared); - - // Move Ci forward - out_rf->reorder({{-4, -6}, {-5, -4}, {-6, -5}}); - inp_cache->computeAt(out_rf, 5); - - inp_tile->computeAt(out_rf, -1); - w->computeAt(out_rf, -1); - - out->axis(0)->parallelize(ParallelType::BIDx); - 
out->axis(1)->parallelize(ParallelType::TIDz); - out->axis(4)->parallelize(ParallelType::TIDy); - out->axis(5)->parallelize(ParallelType::TIDx); - - scheduler_utils::parallelizeAllLike(out, {inp_cache, out_rf}); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - const int dim_h = 99; - const int dim_w = 101; - const int dim_c = 10; - const int dim_f = 20; - const int dim_w_h = 3; - const int dim_w_w = 3; - const int dim_pad_h = (dim_w_h - 1) / 2; - const int dim_pad_w = (dim_w_w - 1) / 2; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor at_inp = at::randn({dim_c, dim_h, dim_w}, options); - at::Tensor at_w = at::randn({dim_f, dim_c, dim_w_h, dim_w_w}, options); - std::vector inputs = { - at_inp, at_w, dim_w_h, dim_w_w, dim_pad_h, dim_pad_w}; - - auto cg_outputs = fe.runFusion(inputs); - - at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis - auto at_out = at::conv2d(at_inp, at_w, {}, 1, 1); - at_out = at_out.squeeze(0); // drop the N axis - - testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__); -} - -// 5x5 followed by 3x3 -TEST(NVFuserTest, FusionConv2DDynamicChain_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Input: [K1, H, W] - auto inp = makeSymbolicTensor(3); - fusion.addInput(inp); - - // Weights: [K2, K1, S1, T1] - auto w1 = makeSymbolicTensor(4); - fusion.addInput(w1); - - // Weights: [K3, K2, S2, T2] - auto w2 = makeSymbolicTensor(4); - fusion.addInput(w2); - - auto w1_h = new Int(); - fusion.addInput(w1_h); - auto w1_w = new Int(); - fusion.addInput(w1_w); - - auto w2_h = new Int(); - fusion.addInput(w2_h); - auto w2_w = new Int(); - fusion.addInput(w2_w); - - auto pad_h1 = new Int(); - fusion.addInput(pad_h1); - auto pad_w1 = new Int(); - fusion.addInput(pad_w1); - - auto pad_h2 = new Int(); - fusion.addInput(pad_h2); - auto pad_w2 = new Int(); - fusion.addInput(pad_w2); - - // Gather a neighbor tile of [w1_h, w1_w] with padding - auto inp_tile = gather( - inp, - {new Int(1), w1_h, w1_w}, - {{new Int(0), new Int(0)}, {pad_h1, pad_h1}, {pad_w1, pad_w1}}); - // inp_tile: [C, 1, H - w1_h + 1, W - w1_w + 1, w1_h, w1_w] - - auto inp_bc = - broadcast(inp_tile, {true, false, false, false, false, false, false}); - auto w1_bc = broadcast(w1, {false, false, true, true, true, false, false}); - - auto inp_times_w1 = mul(inp_bc, w1_bc); - - // Reduce the channel and neighbor tile dimensions - auto out1 = sum(inp_times_w1, {1, 4, 5, 6}); - - // Second conv - auto out1_tile = gather( - out1, - {new Int(1), w2_h, w2_w}, - {{new Int(0), new Int(0)}, {pad_h2, pad_h2}, {pad_w2, pad_w2}}); - - auto out1_bc = - broadcast(out1_tile, {true, false, false, false, false, false, false}); - auto w2_bc = broadcast(w2, {false, false, true, true, true, false, false}); - - auto out1_times_w2 = mul(out1_bc, w2_bc); - - auto out2 = sum(out1_times_w2, {1, 4, 5, 6}); - - fusion.addOutput(out2); - - //////////////////////////////////// - // Cache the input and weight tensors - auto inp_cache = inp->cache_after(); - - // Blocking the spatial dimensions - const int block_w = 16; - const int block_h = 4; - - out2->split(2, block_h); - out2->split(4, block_w); - out2->reorder({{3, 4}}); - // out2: [K3, K2, Ho, Wo, Hi, Wi, 1, 3, 3] - - // Create a [block_x, block_y] tile on smem - inp_cache->computeAt(out2, 4); - // inp_cache: [Co, Ho, Wo, Ci, Hi, Wi] - inp_cache->setMemoryType(MemoryType::Shared); - - // Move Ci forward - out1->reorder({{5, 3}, {3, 4}, {4, 5}}); - out1->setMemoryType(MemoryType::Shared); - - 
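// Like the stencil intermediates staged in shared memory earlier, out1 is
// likely placed on smem here so that the gather feeding the second
// convolution can read first-stage results produced by neighboring threads.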
inp_cache->computeAt(out1, 4); - - inp_tile->computeAt(out1, -1); - w1->computeAt(out1, -1); - - out1_tile->computeAt(out2, -1); - w2->computeAt(out2, -1); - - out2->axis(0)->parallelize(ParallelType::BIDx); - out2->axis(4)->parallelize(ParallelType::TIDy); - out2->axis(5)->parallelize(ParallelType::TIDx); - - scheduler_utils::parallelizeAllLike(out2, {inp_cache, out1}); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - const int dim_h = 99; - const int dim_w = 101; - const int dim_k1 = 3; - const int dim_k2 = 5; - const int dim_k3 = 7; - const int dim_w1_h = 5; - const int dim_w1_w = 5; - const int dim_pad1_h = (dim_w1_h - 1) / 2; - const int dim_pad1_w = (dim_w1_w - 1) / 2; - const int dim_w2_h = 3; - const int dim_w2_w = 3; - const int dim_pad2_h = (dim_w2_h - 1) / 2; - const int dim_pad2_w = (dim_w2_w - 1) / 2; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor at_inp = at::randn({dim_k1, dim_h, dim_w}, options); - at::Tensor at_w1 = at::randn({dim_k2, dim_k1, dim_w1_h, dim_w1_w}, options); - at::Tensor at_w2 = at::randn({dim_k3, dim_k2, dim_w2_h, dim_w2_w}, options); - std::vector inputs = { - at_inp, - at_w1, - at_w2, - dim_w1_h, - dim_w1_w, - dim_w2_h, - dim_w2_w, - dim_pad1_h, - dim_pad1_w, - dim_pad2_h, - dim_pad2_w}; - - auto cg_outputs = fe.runFusion(inputs); - - at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis - auto at_out1 = at::conv2d(at_inp, at_w1, {}, 1, 2); - auto at_out2 = at::conv2d(at_out1, at_w2, {}, 1, 1); - at_out2 = at_out2.squeeze(0); // drop the N axis - - testValidate(&fusion, cg_outputs, inputs, {at_out2}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionConv2DStaticEvenSizedWindow_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Input: [C, H, W] - auto inp = makeSymbolicTensor(3); - fusion.addInput(inp); - - // Weights: [K, C, 2, 2] - auto w = makeSymbolicTensor(4); - fusion.addInput(w); - - // Gather a neighbor tile of [2, 2] with padding size of 1 only for - // the right side of the spatial dimensions. The left padding is - // zero so that the output axis stays the same. 
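// The padding arithmetic the comments in these gather-based tests rely on,
// as a standalone sketch (the formula is assumed, not quoted from the fuser
// sources): a window of size w with (pad_left, pad_right) padding leaves the
// gathered axis with extent n - w + 1 + pad_left + pad_right.
constexpr int gatheredExtent(int n, int w, int pad_left, int pad_right) {
  return n - w + 1 + pad_left + pad_right;
}
static_assert(
    gatheredExtent(99, 2, 0, 1) == 99,
    "even window, right-only padding keeps the extent");
static_assert(
    gatheredExtent(99, 3, 1, 1) == 99,
    "odd window, symmetric padding keeps the extent");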
- auto inp_tile = gather(inp, {1, 2, 2}, {{0, 0}, {0, 1}, {0, 1}}); - // inp_tile: [C, H, W, 1, 2, 2] - - auto inp_bc = - broadcast(inp_tile, {true, false, false, false, false, false, false}); - auto w_bc = broadcast(w, {false, false, true, true, true, false, false}); - - auto inp_times_w = mul(inp_bc, w_bc); - - // Reduce the channel and neighbor tile dimensions - auto out = sum(inp_times_w, {1, 4, 5, 6}); - - fusion.addOutput(out); - - //////////////////////////////////// - - // Cache the input and weight tensors - auto inp_cache = inp->cache_after(); - - // Blocking the spatial dimensions - const int block_w = 16; - const int block_h = 4; - // Blocking the channel dimension - const int block_c = 8; - - out->split(2, block_h); - out->split(4, block_w); - out->reorder({{3, 4}}); - // out: [K, C, Ho, Wo, Hi, Wi, 1, 2, 2] - - out->split(1, block_c); - // out: [K, Co, Ci, Ho, Wo, Hi, Wi, 1, 2, 2] - - auto out_rf = out->rFactor({1, -3, -2, -1}); - // out_rf: [K, rCo, Ci, Ho, Wo, Hi, Wi, 1, 2, 2] - // out_rf: [K, Ci, Ho, Wo, Hi, Wi] - - // Create a [block_x, block_y] tile on smem - inp_cache->computeAt(out, 4); - // inp_cache: [Co, Ho, Wo, Ci, Hi, Wi] - inp_cache->setMemoryType(MemoryType::Shared); - - // Move Ci forward - out_rf->reorder({{-4, -6}, {-5, -4}, {-6, -5}}); - inp_cache->computeAt(out_rf, 5); - - inp_tile->computeAt(out_rf, -1); - w->computeAt(out_rf, -1); - - out->axis(0)->parallelize(ParallelType::BIDx); - out->axis(1)->parallelize(ParallelType::TIDz); - out->axis(4)->parallelize(ParallelType::TIDy); - out->axis(5)->parallelize(ParallelType::TIDx); - - scheduler_utils::parallelizeAllLike(out, {inp_cache, out_rf}); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - const int dim_h = 99; - const int dim_w = 101; - const int dim_c = 10; - const int dim_f = 20; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor at_inp = at::randn({dim_c, dim_h, dim_w}, options); - at::Tensor at_w = at::randn({dim_f, dim_c, 2, 2}, options); - std::vector inputs = {at_inp, at_w}; - - auto cg_outputs = fe.runFusion(inputs); - - at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis - auto at_out = at::conv2d(at_inp, at_w, {}, 1, 1); - at_out = at_out.squeeze(0); // drop the N axis - // The shape of the spatial domain is (dim_h+1)x(dim_w+1), whereas - // the fuser output has dim_h*dim_w. Drop the first elements to make - // it match with the fuser output. 
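// (For stride-1 at::conv2d, H_out = H + pad_top + pad_bottom - K + 1, so the
// 2x2 kernel with symmetric padding of 1 yields dim_h + 1 by dim_w + 1, one
// larger per axis than the right-padded fuser output; hence the trimming
// below.)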
- std::vector indices{ - at::indexing::Slice(0, at::indexing::None), - at::indexing::Slice(1, at::indexing::None), - at::indexing::Slice(1, at::indexing::None)}; - at_out = at_out.index(indices); - - testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__); -} - -// POC implementation of im2col for 3-by-3 kernels -TEST(NVFuserTest, FusionIm2Col_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Input: [N, C, H, W] - auto inp = makeSymbolicTensor(4); - fusion.addInput(inp); - - // Gather a neighbor tile of [3, 3] with padding size of 1 for each - // side of the spatial dimensions - auto inp_tile = gather(inp, {1, 1, 3, 3}, {{0, 0}, {0, 0}, {1, 1}, {1, 1}}); - // inp_tile: [N, C, H, W, 1, 1, 3, 3] - - auto inp_col = transpose(inp_tile, {{1, 3}, {2, 1}, {3, 2}}); - // inp_col: [N, H, W, C, 1, 1, 3, 3] - - fusion.addOutput(inp_col); - - //////////////////////////////////// - - // Cache the input tensor - auto inp_cache = inp->cache_after(); - - // Blocking the spatial dimensions - const int block_w = 16; - const int block_h = 4; - - auto out = inp_col; - - out->split(1, block_h); - out->split(3, block_w); - out->reorder({{2, 3}}); - // out: [N, Ho, Wo, Hi, Wi, C, 1, 1, 3, 3] - // Move the C axis out of Hi*Wi - out->reorder({{5, 3}, {3, 4}, {4, 5}}); - // out: [N, Ho, Wo, C, Hi, Wi, 1, 1, 3, 3] - - // Create a [block_x, block_y] tile on smem - inp_cache->computeAt(out, 4); - inp_cache->setMemoryType(MemoryType::Shared); - // Fully inline inp_tile - inp_tile->computeAt(out, -1); - - out->axis(0)->parallelize(ParallelType::BIDz); - out->axis(1)->parallelize(ParallelType::BIDy); - out->axis(2)->parallelize(ParallelType::BIDx); - out->axis(4)->parallelize(ParallelType::TIDy); - out->axis(5)->parallelize(ParallelType::TIDx); - - scheduler_utils::parallelizeAllLike(out, {inp_cache, inp_tile}); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - const int dim_h = 31; - const int dim_w = 33; - const int dim_c = 5; - const int dim_n = 3; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor at_inp = at::randn({dim_n, dim_c, dim_h, dim_w}, options); - std::vector inputs = {at_inp}; - - auto cg_outputs = fe.runFusion(inputs); - - auto at_out = at::im2col(at_inp, {3, 3}, {1, 1}, {1, 1}, {1, 1}); - - // at::im2col outputs [N, C*3*3, N*H] - at_out = at::transpose(at_out, 1, 2); - at_out = at::reshape(at_out, {dim_n, dim_h, dim_w, dim_c, 1, 1, 3, 3}); - - testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftNoPadding1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = shift(tv1, {1, -1}, false); - auto tv3 = shift(tv1, {-1, 1}, false); - auto tv4 = add(tv2, tv3); - auto tv5 = sum(tv4, {0, 1}); - - fusion.addOutput(tv5); - - tv1->setMemoryType(MemoryType::Shared); - - tv5->split(0, 4); - tv5->split(-1, 8); - tv5->reorder({{1, 2}}); - - TransformPropagator::from(tv5); - - tv2->computeAt(tv5, -1); - tv3->computeAt(tv5, -1); - - tv5->axis(-1)->parallelize(ParallelType::TIDx); - tv5->axis(-2)->parallelize(ParallelType::TIDy); - scheduler_utils::parallelizeAllLike(tv5, ir_utils::allTvs(&fusion)); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector 
inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = shift(t1, {1, -1}); - auto t3 = shift(t1, {-1, 1}); - auto t4 = t2 + t3; - std::vector indices{ - at::indexing::Slice(1, -1), at::indexing::Slice(1, -1)}; - t4 = t4.index(indices); - auto ref = t4.sum(at::ArrayRef{0, 1}); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -// Split and merge -TEST(NVFuserTest, FusionShiftNoPadding2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = shift(tv1, {1, -1}, false); - auto tv3 = shift(tv1, {-1, 1}, false); - auto tv4 = add(tv2, tv3); - auto tv5 = sum(tv4, {0, 1}); - - fusion.addOutput(tv5); - - tv1->setMemoryType(MemoryType::Shared); - - tv5->split(0, 4); - tv5->split(-1, 8); - tv5->reorder({{1, 2}}); - tv5->merge(-2, -1); - - TransformPropagator::from(tv5); - - tv2->computeAt(tv5, -1); - tv3->computeAt(tv5, -1); - - tv5->axis(-1)->parallelize(ParallelType::TIDx); - scheduler_utils::parallelizeAllLike(tv5, ir_utils::allTvs(&fusion)); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = shift(t1, {1, -1}); - auto t3 = shift(t1, {-1, 1}); - auto t4 = t2 + t3; - std::vector indices{ - at::indexing::Slice(1, -1), at::indexing::Slice(1, -1)}; - t4 = t4.index(indices); - auto ref = t4.sum(at::ArrayRef{0, 1}); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -// Split and merge, then welford -TEST(NVFuserTest, FusionShiftNoPadding3_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = shift(tv1, {1, -1}, false); - auto tv3 = shift(tv1, {-1, 1}, false); - auto tv4 = add(tv2, tv3); - auto tvs = Welford(tv4, {0, 1}); - auto tv_avg = tvs.avg; - auto tv_M2 = tvs.var_sum; - auto tv_N = tvs.n; - - fusion.addOutput(tv_avg); - fusion.addOutput(tv_M2); - fusion.addOutput(tv_N); - - tv1->setMemoryType(MemoryType::Shared); - - tv_avg->split(0, 4); - tv_avg->split(-1, 8); - tv_avg->reorder({{1, 2}}); - tv_avg->merge(-2, -1); - - TransformPropagator::from(tv_avg); - - tv2->computeAt(tv_avg, -1); - tv3->computeAt(tv_avg, -1); - - tv_avg->axis(-1)->parallelize(ParallelType::TIDx); - scheduler_utils::parallelizeAllLike(tv_avg, ir_utils::allTvs(&fusion)); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - outputs[1] /= (numel_x - 2) * (numel_y - 2); - - auto t1 = t0 + 1; - auto t2 = shift(t1, {1, -1}); - auto t3 = shift(t1, {-1, 1}); - auto t4 = t2 + t3; - std::vector indices{ - at::indexing::Slice(1, -1), at::indexing::Slice(1, -1)}; - t4 = t4.index(indices); - auto ref_avg = t4.mean(at::ArrayRef{0, 1}); - auto ref_M2 = t4.var(at::ArrayRef{0, 1}, false); - auto ref_N = at::ones({}, options_int) * (numel_x - 2) * (numel_y - 2); - - testValidate( - &fusion, outputs, 
inputs, {ref_avg, ref_M2, ref_N}, __LINE__, __FILE__); -} - -// Shift indexing and predication with contiguous merge -TEST(NVFuserTest, FusionShiftNoPaddingContigMerge_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = shift(tv1, {1, -1}, true); - auto tv3 = shift(tv1, {-1, 1}, false); - auto tv4 = add(tv2, tv3); - fusion.addOutput(tv4); - - tv2->merge(0); - tv3->merge(0); - tv4->merge(0); - - tv1->setMemoryType(MemoryType::Global); - tv2->setMemoryType(MemoryType::Global); - tv3->setMemoryType(MemoryType::Global); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 9; - int numel_y = 11; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - std::vector indices{ - at::indexing::Slice(1, -1), at::indexing::Slice(1, -1)}; - - auto fuser_out = outputs[0].index(indices); - - auto t1 = t0 + 1; - auto t2 = shift(t1, {1, -1}); - auto t3 = shift(t1, {-1, 1}); - auto ref = t2 + t3; - - ref = ref.index(indices); - - testValidate(&fusion, {fuser_out}, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftNoPaddingChain_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = shift(tv1, {1, -1}, false); - auto tv3 = shift(tv2, {1, -1}, false); - auto tv4 = sum(tv3, {0, 1}); - fusion.addOutput(tv4); - - tv1->setMemoryType(MemoryType::Shared); - tv2->setMemoryType(MemoryType::Shared); - - tv4->split(0, 4); - tv4->split(-1, 8); - tv4->reorder({{1, 2}}); - - tv1->computeAt(tv4, 2); - - tv4->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(-2)->parallelize(ParallelType::TIDy); - - tv4->axis(0)->parallelize(ParallelType::BIDy); - tv4->axis(1)->parallelize(ParallelType::BIDx); - - scheduler_utils::parallelizeAllLike(tv4, {tv1, tv2, tv3}); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = shift(t1, {1, -1}); - auto t3 = shift(t2, {1, -1}); - std::vector indices{ - at::indexing::Slice(2, at::indexing::None), at::indexing::Slice(0, -2)}; - t3 = t3.index(indices); - auto ref = t3.sum(at::ArrayRef{0, 1}); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -// Rfactor is not allowed with partial domains -TEST(NVFuserTest, FusionShiftNoPaddingRfactor_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = shift(tv1, {1, -1}, false); - auto tv3 = sum(tv2, {0, 1}); - fusion.addOutput(tv3); - - tv3->split(0, 4); - tv3->split(-1, 8); - tv3->reorder({{1, 2}}); - - ASSERT_ANY_THROW(tv3->rFactor({-2})); -} - -TEST(NVFuserTest, FusionPartialSplit1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - // [I] - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(0)); - // [I] - auto tv2 = shift(tv1, {1}, false); - // [1:I] - auto tv3 = shift(tv1, {-1}, 
false); - // [0:I-1] - auto tv4 = add(tv2, tv3); - // [1:I-1] - fusion.addOutput(tv4); - - // Partial split of tv4. Split only the valid range, which is - // [1:-1]. - tv4->split(0, 8, true, true); - // [(I-2)/8, 8] - - // Propagates the partial split back to tv1. This means that all of - // the other tensors are also shaped as [(I-2)/8, 8], which appears - // to mean only the sub region of ((I-2)/8 * 8) is - // computed for tv1, tv2 and tv3. It's fine for the tv2 and tv3 - // tensors as only that sub region is used by tv4. It's also fine - // for tv1 since it has halo of size one at each side, so the whole - // region is actually calculated for tv1. - tv1->computeAt(tv4, 1); - - tv4->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(-2)->parallelize(ParallelType::BIDx); - scheduler_utils::parallelizeAllLike(tv4, {tv1, tv2, tv3}); - - tv1->setMemoryType(MemoryType::Shared); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - // gridDim.x is ceilDiv(numel_x - 2, 8), not ceilDiv(numel_x, 8), - // so it's going to be just 2 rather than 3. - const int numel_x = 18; - - ExpressionEvaluator evaluator(&fusion); - auto root_extent = tv4->getRootDomain()[0]->extent(); - evaluator.bind(root_extent, numel_x); - auto extent_eval = evaluator.evaluate(tv4->axis(0)->extent()); - TORCH_CHECK( - extent_eval.has_value(), - "Invalid evaluation of outer domain extent of partial split"); - TORCH_CHECK( - extent_eval.value() == (numel_x - 2) / 8, - "Invalid extent of outer domain of partial split"); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({numel_x}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - std::vector indices{at::indexing::Slice(1, -1)}; - - outputs[0] = outputs[0].index(indices); - - auto ref = (shift(t0, {1}) + shift(t0, {-1})).index(indices); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionPartialSplit2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(0)); - auto tv2 = shift(tv1, {1}, false); - auto tv3 = shift(tv1, {-1}, false); - auto tv4 = add(tv2, tv3); - fusion.addOutput(tv4); - - auto tv5 = add(tv1, new Double(1)); - auto tv6 = add(tv5, new Double(1)); - fusion.addOutput(tv6); - - tv4->split(0, 4, true, true); - - // This causes tv5 and tv6 also to be split with the same partial - // offsets, however, since they need to be calculated entirely, the - // resulting code would be invalid. It should be detected as part of - // initial fusion validation during lowering. - tv1->computeAt(tv4, 1); - - // Validation should throw an error due to tv5 and tv6. 
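// A standalone sketch of the extent arithmetic checked earlier in
// FusionPartialSplit1: with a partial split, the outer domain tiles only the
// valid [1, I-1) range, so its extent is ceilDiv(I - 2, 8), not ceilDiv(I, 8).
constexpr int ceilDiv(int a, int b) {
  return (a + b - 1) / b;
}
static_assert(ceilDiv(18 - 2, 8) == 2, "partial split tiles only the interior");
static_assert(ceilDiv(18, 8) == 3, "a full split would launch one extra block");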
- ASSERT_ANY_THROW(fusion.printKernel()); -} - -// 2D version of PartialSplit1 -TEST(NVFuserTest, FusionPartialSplit3_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(0)); - auto tv2 = shift(tv1, {1, 2}, false); - auto tv3 = shift(tv1, {-2, -1}, false); - auto tv4 = add(tv2, tv3); - fusion.addOutput(tv4); - - tv4->split(1, 8, true, true); - tv4->split(0, 4, true, true); - tv4->reorder({{1, 2}, {2, 1}}); - - tv1->computeAt(tv4, 2); - - tv4->axis(0)->parallelize(ParallelType::BIDy); - tv4->axis(1)->parallelize(ParallelType::BIDx); - tv4->axis(2)->parallelize(ParallelType::TIDy); - tv4->axis(3)->parallelize(ParallelType::TIDx); - scheduler_utils::parallelizeAllLike(tv4, {tv1, tv2, tv3}); - - tv1->setMemoryType(MemoryType::Shared); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - const int numel_x = 32 + 3; - const int numel_y = 32 + 3; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - std::vector indices{ - at::indexing::Slice(1, -2), at::indexing::Slice(2, -1)}; - - outputs[0] = outputs[0].index(indices); - - auto ref = (shift(t0, {1, 2}) + shift(t0, {-2, -1})).index(indices); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -// Almost same fusion with Shift5ptStencilChain but non-padded shift -// and partial split. -TEST(NVFuserTest, FusionPartialSplit4_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - std::vector> offsets = {{-1, 0}, {1, 0}, {0, -1}, {0, 1}}; - - // First stencil: 5pt stencil - // stencil1 = (tv0 + tv0[+1][0] + tv0[-1][0] + tv0[0][+1] + tv0[0][-1]) / 5 - std::vector tv_stencil1_shifts; - for (const auto& offset : offsets) { - tv_stencil1_shifts.push_back(shift(tv0, offset, false)); - } - - auto tv_stencil1 = tv0; - for (auto tv : tv_stencil1_shifts) { - tv_stencil1 = add(tv_stencil1, tv); - } - - tv_stencil1 = div(tv_stencil1, new Double(tv_stencil1_shifts.size() + 1)); - - // Second stencil: Same 5pt stencil - std::vector tv_stencil2_shifts; - for (const auto& offset : offsets) { - tv_stencil2_shifts.push_back(shift(tv_stencil1, offset, false)); - } - - auto tv_stencil2 = tv_stencil1; - for (auto tv : tv_stencil2_shifts) { - tv_stencil2 = add(tv_stencil2, tv); - } - - tv_stencil2 = div(tv_stencil2, new Double(tv_stencil2_shifts.size() + 1)); - - auto tv_out = tv_stencil2; - - fusion.addOutput(tv_out); - - auto tv0_cache = tv0->cache_after(); - - std::vector split_factor({16, 16}); - - tv_out->split(-1, split_factor[1], true, true); - tv_out->split(0, split_factor[0], true, true); - tv_out->reorder({{1, 2}, {2, 1}}); - - tv0->computeAt(tv_out, 2); - - // Inline completely all inputs to the first stencil output, except for the - // tv0 cache - for (auto tv : tv_stencil1_shifts) { - tv->computeAt(tv_stencil1, -1); - } - - // Inline completely all inputs to the second stencil output, except - // for the first stencil output - for (auto tv : tv_stencil2_shifts) { - tv->computeAt(tv_stencil2, -1); - } - - tv_out->axis(0)->parallelize(ParallelType::BIDy); - tv_out->axis(1)->parallelize(ParallelType::BIDx); - tv_out->axis(2)->parallelize(ParallelType::TIDy); - tv_out->axis(3)->parallelize(ParallelType::TIDx); - - auto 
all_values = DependencyCheck::getAllValsBetween( - {fusion.inputs().begin(), fusion.inputs().end()}, fusion.outputs()); - for (auto tv : ir_utils::filterByType(all_values)) { - scheduler_utils::parallelizeAllLike(tv_out, {tv}); - } - - tv0_cache->setMemoryType(MemoryType::Shared); - tv_stencil1->setMemoryType(MemoryType::Shared); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - // Input matrix size is 68x68, and the output is 64x64. Both - // gridDim.x and gridim.y should be ceilDiv(numel - 4, - // split_factor), which is 4. If full split is used, the grid - // dimension would be 5. - const int numel_x = 64 + 4; - const int numel_y = 64 + 4; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - std::vector indices{ - at::indexing::Slice(2, -2), at::indexing::Slice(2, -2)}; - - outputs[0] = outputs[0].index(indices); - - auto stencil1 = t0; - for (const auto& offset : offsets) { - stencil1 = stencil1 + shift(t0, offset); - } - stencil1 = stencil1 / int(offsets.size() + 1); - auto stencil2 = stencil1; - for (const auto& offset : offsets) { - stencil2 = stencil2 + shift(stencil1, offset); - } - stencil2 = stencil2 / int(offsets.size() + 1); - auto ref = stencil2.index(indices); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionPartialSplit5_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int numel_x = 10; - const int numel_y = 11; - - // auto tv0 = makeSymbolicTensor(2); - auto tv0 = makeConcreteTensor({numel_x, numel_y}); - fusion.addInput(tv0); - - auto tv1 = shift(tv0, {0, 1}, false); - auto tv2 = add(tv1, new Double(1)); - - fusion.addOutput(tv2); - - // Partially split tv2 but not tv1. Producer indexing with tv2 as a consumer - // requires adjustment of the index to account for the difference of split - // offsets. 
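  // (Illustrative note, assuming the concrete sizes above) With numel_y = 11
  // and a split factor of 4, the partial split of tv2 only covers the valid
  // range [1, 11) produced by the non-padded shift, while the plain split of
  // tv1 covers the full [0, 11). The consumer loop therefore starts one
  // element later than the producer's, and producer indexing appears to have
  // to add that start offset back when reading tv1 through tv2's loop nest.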
- tv2->split(1, 4, true, true); - tv1->split(1, 4); - - tv1->computeAt(tv2, 1); - - tv2->axis(1)->parallelize(ParallelType::TIDx); - tv1->axis(1)->parallelize(ParallelType::TIDx); - - tv1->setMemoryType(MemoryType::Shared); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - std::vector indices{ - at::indexing::Slice(0, at::indexing::None), - at::indexing::Slice(1, at::indexing::None)}; - - outputs[0] = outputs[0].index(indices); - - auto ref = (shift(t0, {0, 1}) + 1).index(indices); - - testValidate(&fusion, outputs, {t0}, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionPartialSplit6_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int numel_x = 9; - - auto tv0 = makeConcreteTensor({numel_x}); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = shift(tv1, {1}, false); - auto tv3 = add(tv2, new Double(1)); - - fusion.addOutput(tv3); - - // Another mix of partial and non-partial split - tv1->split(0, 4); - tv2->split(0, 4, true, true); - tv3->split(0, 4); - - // Just make it easier for compute-sanitizer to flag invalid memory accesses - tv1->setMemoryType(MemoryType::Shared); - tv2->setMemoryType(MemoryType::Shared); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - std::vector indices{ - at::indexing::Slice(1, at::indexing::None)}; - - outputs[0] = outputs[0].index(indices); - - auto ref = (shift(t0 + 1, {1}) + 1).index(indices); - - testValidate(&fusion, outputs, {t0}, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftUnswitch1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = shift(tv0, {-1, 0}); - fusion.addOutput(tv1); - - auto tv2 = shift(tv0, {0, 1}); - fusion.addOutput(tv2); - - auto tv3 = shift(tv0, {2, 2}); - fusion.addOutput(tv3); - - auto tv4 = shift(tv0, {-2, -2}); - fusion.addOutput(tv4); - - auto tv5 = add(tv0, new Double(1)); - auto tv6 = shift(tv5, {0, -1}); - fusion.addOutput(tv6); - - tv1->axis(1)->parallelize(ParallelType::Unswitch); - tv2->axis(1)->parallelize(ParallelType::Unswitch); - tv3->axis(0)->parallelize(ParallelType::Unswitch); - tv4->axis(0)->parallelize(ParallelType::Unswitch); - - tv5->axis(1)->parallelize(ParallelType::TIDx); - tv6->axis(1)->parallelize(ParallelType::TIDx); - tv5->axis(0)->parallelize(ParallelType::Unswitch); - tv5->setMemoryType(MemoryType::Shared); - - int numel_x = 9; - int numel_y = 11; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t1 = shift(t0, {-1, 0}); - TORCH_CHECK(t1.equal(outputs[0])); - - auto t2 = shift(t0, {0, 1}); - TORCH_CHECK(t2.equal(outputs[1])); - - auto t3 = shift(t0, {2, 2}); - TORCH_CHECK(t3.equal(outputs[2])); - - auto t4 = shift(t0, {-2, -2}); - TORCH_CHECK(t4.equal(outputs[3])); - - auto t6 = shift(t0 + 1, {0, -1}); - TORCH_CHECK(t6.equal(outputs[4])); -} - -TEST(NVFuserTest, FusionGatherUnswitch1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto 
tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1_gather_param = new Int(); - fusion.addInput(tv1_gather_param); - auto tv1_gather_pad_param = new Int(); - fusion.addInput(tv1_gather_pad_param); - auto tv1 = gather( - tv0, {tv1_gather_param}, {{tv1_gather_pad_param, tv1_gather_pad_param}}); - fusion.addOutput(tv1); - - auto tv2_gather_param = new Int(); - fusion.addInput(tv2_gather_param); - auto tv2_gather_pad_param = new Int(); - fusion.addInput(tv2_gather_pad_param); - auto tv2 = gather( - tv0, {tv2_gather_param}, {{tv2_gather_pad_param, tv2_gather_pad_param}}); - fusion.addOutput(tv2); - - // Static gather - auto tv3 = gather(tv0, {3}, {{1, 1}}); - fusion.addOutput(tv3); - - // Static gather - auto tv4 = gather(tv0, {5}, {{2, 2}}); - fusion.addOutput(tv4); - - auto tv0_cache = tv0->cache_after(); - tv0_cache->setMemoryType(MemoryType::Shared); - - tv4->split(0, 32); - - tv0->computeAt(tv4, 1); - - tv4->axis(0)->parallelize(ParallelType::Unswitch); - tv4->axis(1)->parallelize(ParallelType::TIDx); - - const int numel_x = 100; - const int tv1_gather = 3; - const int tv1_gather_pad = 1; - const int tv2_gather = 5; - const int tv2_gather_pad = 2; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x}, options); - std::vector inputs = { - t0, tv1_gather, tv1_gather_pad, tv2_gather, tv2_gather_pad}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t1 = gather(t0, {tv1_gather}, {{tv1_gather_pad, tv1_gather_pad}}); - TORCH_CHECK(t1.equal(outputs[0])); - - auto t2 = gather(t0, {tv2_gather}, {{tv2_gather_pad, tv2_gather_pad}}); - TORCH_CHECK(t2.equal(outputs[1])); - - auto t3 = gather(t0, {3}, {{1, 1}}); - TORCH_CHECK(t3.equal(outputs[2])); - - auto t4 = gather(t0, {5}, {{2, 2}}); - TORCH_CHECK(t4.equal(outputs[3])); -} - -TEST(NVFuserTest, FusionGatherStrided1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - const std::vector window_shape = {1, 3}; - const std::vector> padding_width = {{0, 0}, {1, 1}}; - - const std::vector strides = {1, 3}; - - auto tv1 = gather(tv0, window_shape, padding_width, strides); - - fusion.addOutput(tv1); - - const int s1 = 11; - const int s2 = 13; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({s1, s2}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t0}); - - // tv1 has a stride dimension, so its number of dimensions should be - // input_ndims + window_ndims + stride. - TORCH_CHECK(tv1->nDims() == tv0->nDims() * 2 + 1); - - // However, the number of dimensions of the Aten tensor should still - // be just the twice of the number of dimensions of the input - // tensor. - auto fuser_out = outputs[0]; - TORCH_CHECK( - fuser_out.ndimension() == tv0->nDims() * 2, - "Invalid dimensionality of output tensor: ", - fuser_out.ndimension()); - - // Each output dimension should be: ceilDiv(input_size + padding_width - - // window, stride). - for (const auto i : c10::irange(window_shape.size())) { - auto valid_dim = ceilDiv( - t0.size(i) + padding_width[i][0] + padding_width[i][1] - - window_shape[i] + 1, - strides[i]); - auto actual_dim = outputs[0].size(i); - TORCH_CHECK( - valid_dim == actual_dim, - "Invalid output size at dimension ", - i, - ". 
Expected: ", - valid_dim, - ", actual: ", - actual_dim); - } - - auto ref = gather(t0, window_shape, padding_width, strides); - - TORCH_CHECK(ref.equal(outputs[0])); -} - -// Split strided domain -TEST(NVFuserTest, FusionGatherStrided2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const std::vector window_shape = {3}; - const std::vector> padding_width = {{1, 1}}; - const std::vector strides = {3}; - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - - auto tv2 = gather(tv1, window_shape, padding_width, strides); - - auto tv3 = sum(tv2, {-1}); - - fusion.addOutput(tv3); - - // Split the strided domain - tv3->split(0, 4); - - // Propagate the split by 4 of the tv3 domain to pre-stride domains, - // making them split by 4 * 3 - tv0->computeAt(tv3, 1); - - tv2->computeAt(tv3, -1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv3->axis(1)->parallelize(ParallelType::TIDx); - scheduler_utils::parallelizeAllLike(tv3, {tv1, tv2}); - - tv1->setMemoryType(MemoryType::Shared); - - const int s1 = 100; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({s1}, options); - std::vector inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = gather(t1, window_shape, padding_width, strides); - auto ref = sum(t2, {-1}); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -// Outer split -TEST(NVFuserTest, FusionGatherStrided3_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const std::vector window_shape = {3}; - const std::vector> padding_width = {{1, 1}}; - const std::vector strides = {3}; - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - - auto tv2 = gather(tv1, window_shape, padding_width, strides); - - auto tv3 = sum(tv2, {-1}); - fusion.addOutput(tv3); - - // Outer split - tv3->split(0, 2, false); - - tv0->computeAt(tv3, 1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv3->axis(1)->parallelize(ParallelType::TIDx); - scheduler_utils::parallelizeAllLike(tv3, {tv1, tv2}); - - tv1->setMemoryType(MemoryType::Shared); - tv2->setMemoryType(MemoryType::Shared); - - const int s1 = 100; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({s1}, options); - std::vector inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = gather(t1, window_shape, padding_width, strides); - auto ref = sum(t2, {-1}); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionGatherStrided4_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const std::vector window_shape = {3}; - const std::vector> padding_width = {{1, 1}}; - const std::vector strides = {3}; - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - - // Test propagation of split from one gather output to another - auto tv2 = gather(tv1, window_shape, padding_width, strides); - auto tv3 = gather(tv1, window_shape, padding_width, strides); - - auto tv4 = sum(tv2, {-1}); - fusion.addOutput(tv4); - - auto tv5 = sum(tv3, {-1}); - fusion.addOutput(tv5); - - tv4->split(0, 2); - - // Test forward computeAt propagation from tv1 to tv3 - tv0->computeAt(tv4, 1); - - const int s1 = 101; - - auto options = 
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({s1}, options); - std::vector inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = gather(t1, window_shape, padding_width, strides); - auto ref = sum(t2, {-1}); - - testValidate(&fusion, outputs, inputs, {ref, ref}, __LINE__, __FILE__); -} - -// Same as GatherStrided1 but with stride != window -TEST(NVFuserTest, FusionGatherStrided5_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - const std::vector window_shape = {1, 3}; - const std::vector> padding_width = {{0, 0}, {1, 1}}; - - const std::vector strides = {1, 2}; - - auto tv1 = gather(tv0, window_shape, padding_width, strides); - - fusion.addOutput(tv1); - - const int s1 = 11; - const int s2 = 13; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({s1, s2}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t0}); - - auto ref = gather(t0, window_shape, padding_width, strides); - - TORCH_CHECK(ref.equal(outputs[0])); -} - -// Same as GatherStrided2 but with stride != window -TEST(NVFuserTest, FusionGatherStrided6_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const std::vector window_shape = {3}; - const std::vector> padding_width = {{1, 1}}; - const std::vector strides = {2}; - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - - auto tv2 = gather(tv1, window_shape, padding_width, strides); - - auto tv3 = sum(tv2, {-1}); - - fusion.addOutput(tv3); - - // Split the strided domain - tv3->split(0, 4); - - // Propagate the split by 4 of the tv3 domain to pre-stride domains, - // making them split by 4 * 2 - tv0->computeAt(tv3, 1); - - tv2->computeAt(tv3, -1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv3->axis(1)->parallelize(ParallelType::TIDx); - scheduler_utils::parallelizeAllLike(tv3, {tv1, tv2}); - - tv1->setMemoryType(MemoryType::Shared); - - const int s1 = 100; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({s1}, options); - std::vector inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = gather(t1, window_shape, padding_width, strides); - auto ref = sum(t2, {-1}); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -// Same as GatherStrided4 but different strides -TEST(NVFuserTest, FusionGatherStrided7_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const std::vector window_shape = {3}; - const std::vector> padding_width = {{1, 1}}; - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - - // Use different strides - auto tv2 = gather(tv1, window_shape, padding_width, {3}); - auto tv3 = gather(tv1, window_shape, padding_width, {2}); - - auto tv4 = sum(tv2, {-1}); - fusion.addOutput(tv4); - - auto tv5 = sum(tv3, {-1}); - fusion.addOutput(tv5); - - tv4->split(0, 2); - - // Since tv3 has a different stride factor, this should fail. 
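  // (Hedged note, following the stride-propagation pattern spelled out in
  // GatherStrided2 and GatherStrided6 above) Propagating tv4's split by 2
  // back through the stride-3 gather would require tv1 to be split by
  // 2 * 3 = 6, while the stride-2 gather feeding tv3/tv5 would require a
  // split by 2 * 2 = 4. The two requirements cannot both hold, which is why
  // the computeAt below is expected to throw.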
- ASSERT_ANY_THROW(tv0->computeAt(tv4, 1)); -} - -// Same as GatherStrided2 but with unswitch -TEST(NVFuserTest, FusionGatherStrided8_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const std::vector window_shape = {3}; - const std::vector> padding_width = {{1, 1}}; - const std::vector strides = {3}; - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - - auto tv2 = gather(tv1, window_shape, padding_width, strides); - - auto tv3 = sum(tv2, {-1}); - - fusion.addOutput(tv3); - - const int tidx = 32; - - // Split the strided domain - tv3->split(0, tidx); - - // Split for unswitch - tv3->split(0, 1); - - tv0->computeAt(tv3, 2); - - tv2->computeAt(tv3, -1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv3->axis(1)->parallelize(ParallelType::Unswitch); - tv3->axis(2)->parallelize(ParallelType::TIDx); - scheduler_utils::parallelizeAllLike(tv3, {tv1, tv2}); - - tv1->setMemoryType(MemoryType::Shared); - - const int s1 = 1023; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({s1}, options); - std::vector inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = gather(t1, window_shape, padding_width, strides); - auto ref = sum(t2, {-1}); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -// Chained strided gather. Not supported yet. -TEST(NVFuserTest, FusionGatherStridedChain_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const std::vector window_shape = {3}; - const std::vector> padding_width = {{1, 1}}; - const std::vector strides = {3}; - // const std::vector strides = {1}; - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - - auto tv2 = gather(tv1, window_shape, padding_width, strides); - // Reduce gathered window - auto tv3 = sum(tv2, {-1}); - - // Repeat - auto tv4 = gather(tv3, window_shape, padding_width, strides); - auto tv5 = sum(tv4, {-1}); - auto out = tv5; - - fusion.addOutput(out); - - // This should throw an error at HaloInfo::build. 
- ASSERT_ANY_THROW(GpuLower gpulw(&fusion)); -} - -TEST(NVFuserTest, FusionMaxPoolingStrided_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } - Fusion fusion; - FusionGuard fg(&fusion); - - // Input: CHW - // Pooling window: 3x3 - // Strides: 3 - // Padding: 1 at each end of the inner 2 dimensions - - // [C, H, W] - auto inp = makeSymbolicTensor(3); - fusion.addInput(inp); - - // [C, H/3, W/3, 1, 3, 3] - auto inp_tile = gather(inp, {1, 3, 3}, {{0, 0}, {1, 1}, {1, 1}}, {1, 3, 3}); - - // [C, H/3, W/3] - auto max_tensor = reductionOp( - BinaryOpType::Max, - {-3, -2, -1}, - new Double(std::numeric_limits::lowest()), - inp_tile); - fusion.addOutput(max_tensor); - - //////////////////////////////////// - - // Cache the input and weight tensors - auto inp_cache = inp->cache_after(); - - // Tiling the spatial domain - const int tile_x = 32; - const int tile_y = 8; - - max_tensor->split(1, tile_y); - max_tensor->split(3, tile_x); - max_tensor->reorder({{2, 3}}); - // [C, H/tile_y, W/tile_x, tile_y, tile_x] - max_tensor->split(2, 1); - // [C, H/tile_y, W/tile_x, 1, tile_y, tile_x] - - inp->computeAt(max_tensor, 4); - - max_tensor->axis(0)->parallelize(ParallelType::BIDx); - max_tensor->axis(3)->parallelize(ParallelType::Unswitch); - max_tensor->axis(4)->parallelize(ParallelType::TIDy); - max_tensor->axis(5)->parallelize(ParallelType::TIDx); - - scheduler_utils::parallelizeAllLike(max_tensor, ir_utils::allTvs(&fusion)); - - inp_cache->setMemoryType(MemoryType::Shared); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - const int hw = 50; - const int num_channels = 20; - const int pooling_window = 3; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_inp = at::randn({num_channels, hw, hw}, options); - // We always pad inputs by zero, so if all surrounding values are - // negative, max pooling would pick a padded value, which isn't the - // correct behavior. We need to be able to choose the value of - // padding. In this case, padding by the minimum value would not - // have this problem. For now, avoid the problem by making sure all - // values are not negative. 
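  // (Illustrative sketch, not part of the original scheduling) Because the
  // fusion pads with zeros, the value it computes is a zero-padded windowed
  // max, roughly:
  //   auto padded = at::constant_pad_nd(aten_inp, {1, 1, 1, 1});
  //   auto windows = padded.unfold(1, 3, 3).unfold(2, 3, 3); // [C, ~H/3, ~W/3, 3, 3]
  //   auto zero_pad_max = windows.amax({-2, -1});
  // at::max_pool2d ignores its padding instead, so the two only agree when no
  // window's maximum would come from the padding, hence the abs() below.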
- aten_inp = at::abs(aten_inp); - std::vector inputs = {aten_inp}; - - auto outputs = fe.runFusion(inputs); - - auto ref = at::max_pool2d( - aten_inp, {pooling_window, pooling_window}, {3, 3}, {1, 1}); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionConv2DStaticStrided_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } - Fusion fusion; - FusionGuard fg(&fusion); - - // Input: [C, H, W] - auto inp = makeSymbolicTensor(3); - fusion.addInput(inp); - - // Weights: [K, C, 3, 3] - auto w = makeSymbolicTensor(4); - fusion.addInput(w); - - // Gather a neighbor tile of [3, 3] with padding size of 1 for each - // side of the spatial dimensions - auto inp_tile = gather(inp, {1, 3, 3}, {{0, 0}, {1, 1}, {1, 1}}, {1, 3, 3}); - // inp_tile: [C, H/3, s3, W/3, s3, 1, 3, 3] - - auto inp_bc = - broadcast(inp_tile, {true, false, false, false, false, false, false}); - auto w_bc = broadcast(w, {false, false, true, true, true, false, false}); - - auto inp_times_w = mul(inp_bc, w_bc); - - // Reduce the channel and neighbor tile dimensions - auto out = sum(inp_times_w, {1, 4, 5, 6}); - - fusion.addOutput(out); - - //////////////////////////////////// - - // Cache the input and weight tensors - auto inp_cache = inp->cache_after(); - - // Blocking the spatial dimensions - const int block_w = 16; - const int block_h = 4; - const int block_c = 2; - - // [K, C, H/s, W/s, 1, 3, 3] - out->split(2, block_h); - // [K, C, H/s/block_h, block_h, W/s, 1, 3, 3] - out->split(4, block_w); - // [K, C, H/s/block_h, block_h, W/s/block_w, block_w, 1, 3, 3] - out->reorder({{3, 4}}); - // [K, C, H/s/block_h, W/s/block_w, block_h, block_w, 1, 3, 3] - out->split(1, block_c); - // [K, C/block_c, block_c, H/s/block_h, W/s/block_w, block_h, block_w, 1, 3, - // 3] - out->split(4, 1); - // [K, C/block_c, block_c, H/s/block_h, W/s/block_w, 1, block_h, block_w, 1, - // 3, 3] - - auto out_rf = out->rFactor({1, -3, -2, -1}); - // [K, C/block_c, block_c, H/s/block_h, W/s/block_w, 1, block_h, block_w, 1, - // 3, 3] - - // out: [K, block_c, H/s/block_h, W/s/block_w, 1, block_h, block_w] - - inp_cache->computeAt(out, 5); - inp_cache->setMemoryType(MemoryType::Shared); - // [K, block_c, H/s/block_h, W/s/block_w, 1, block_h, block_w, C/block_c, 1, - // 3, 3] - - // Move C/block_c before block_h/2 and share the domain from - // inp_cache to out_rf - out_rf->reorder({{7, 5}, {5, 6}, {6, 7}}); - inp_cache->computeAt(out_rf, 6); - - inp_tile->computeAt(out_rf, -1); - w->computeAt(out_rf, -1); - - out->axis(0)->parallelize(ParallelType::BIDx); - out->axis(1)->parallelize(ParallelType::TIDz); - out->axis(4)->parallelize(ParallelType::Unswitch); - out->axis(5)->parallelize(ParallelType::TIDy); - out->axis(6)->parallelize(ParallelType::TIDx); - - scheduler_utils::parallelizeAllLike(out, {inp_cache, out_rf}); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - const int dim_h = 99; - const int dim_w = 101; - const int dim_c = 10; - const int dim_f = 20; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor at_inp = at::randn({dim_c, dim_h, dim_w}, options); - at::Tensor at_w = at::randn({dim_f, dim_c, 3, 3}, options); - std::vector inputs = {at_inp, at_w}; - - auto cg_outputs = fe.runFusion(inputs); - - at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis - auto at_out = at::conv2d(at_inp, at_w, {}, 3, 1); - at_out = at_out.squeeze(0); // drop the N axis - - testValidate(&fusion, cg_outputs, inputs, {at_out}, 
__LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionNonDivisibleHalo1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = shift(tv1, {-1}); - fusion.addOutput(tv2); - - // [I] - tv2->split(0, 8); - // [I/8, 8] - tv2->split(1, 3); - // [I/8, 3, 3] - - tv0->computeAt(tv2, -2); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({24}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({t0}); - - auto ref = shift((t0 + 1), {-1}); - - testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionNonDivisibleHalo2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = gather(tv0, {3, 3}, {{1, 1}, {1, 1}}); - auto tv2 = sum(tv1, {-2, -1}); - auto tv3 = add(tv0, tv2); - auto tv4 = sum(tv3, {0, 1}); - fusion.addOutput(tv4); - - const int gy = 50; - const int gx = 50; - const int by = 8; - const int bx = 16; - - auto tv5 = tv0->cache_after(); - - // [I, J] - tv4->split(0, gy); - // [I/gy, gy, J] - tv4->split(1, by); - // [I/gy, gy/by, by, J] - tv4->split(-1, gx); - // [I/gy, gy/by, by, J/gx, gx] - tv4->split(-1, bx); - // [I/gy, gy/by, by, J/gx, gx/bx, bx] - tv4->reorder({{3, 1}, {1, 2}, {4, 3}, {2, 4}}); - // [I/gy, J/gx, gy/by, gx/bx, by, bx] - - auto tv6 = tv4->rFactor({2, 3}); - - tv0->computeAt(tv6, 4); - - tv4->axis(0)->parallelize(ParallelType::BIDy); - tv4->axis(1)->parallelize(ParallelType::BIDx); - tv4->axis(2)->parallelize(ParallelType::TIDy); - tv4->axis(3)->parallelize(ParallelType::TIDx); - - scheduler_utils::parallelizeAllLike(tv4, {tv1, tv2, tv3, tv5, tv6}); - - tv5->setMemoryType(MemoryType::Shared); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({111, 222}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({t0}); - - auto t1 = gather(t0, {3, 3}, {{1, 1}, {1, 1}}); - auto t2 = t1.sum({-2, -1}); - auto t3 = t0 + t2; - auto t4 = t3.sum({-2, -1}); - - testValidate(&fusion, cg_outputs, {t0}, {t4}, __LINE__, __FILE__); -} - -} // namespace jit -} // namespace torch -#endif // #if defined(USE_CUDA) diff --git a/test/cpp/jit/test_graph_iterator.cpp b/test/cpp/jit/test_graph_iterator.cpp index 75edac875b19..00d1f9a6a28c 100644 --- a/test/cpp/jit/test_graph_iterator.cpp +++ b/test/cpp/jit/test_graph_iterator.cpp @@ -62,7 +62,7 @@ void assert_ordering( ASSERT_EQ(expected.size(), actual.size()) << "Got " << actual.size() << " elements (" << actual << ")" << " expected " << expected.size() << " elements (" << expected << ")"; - for (int i = 0; i < expected.size(); i++) { + for (unsigned i = 0; i < expected.size(); i++) { ASSERT_EQ(expected[i], actual[i]) << "Difference at index " << i << " in " << actual << " (expected " << actual << ")"; diff --git a/test/cpp/jit/test_lite_interpreter.cpp b/test/cpp/jit/test_lite_interpreter.cpp index 0e40e48514d1..d01c611bbaec 100644 --- a/test/cpp/jit/test_lite_interpreter.cpp +++ b/test/cpp/jit/test_lite_interpreter.cpp @@ -571,20 +571,35 @@ namespace { void compareModelOutput( c10::ArrayRef actual_result_list, - const std::vector& expect_result_list) { + const std::vector& expect_result_list) { AT_ASSERT(actual_result_list.size() == expect_result_list.size()); - AT_ASSERT(actual_result_list[0].toTensor().equal(expect_result_list[0])); 
AT_ASSERT( - actual_result_list[1].toTensor().dim() == expect_result_list[1].dim()); - AT_ASSERT(actual_result_list[2].toTensor().equal(expect_result_list[2])); - AT_ASSERT(actual_result_list[3].toTensor().equal(expect_result_list[3])); + actual_result_list[0].toTensor().equal(expect_result_list[0].toTensor())); + AT_ASSERT( + actual_result_list[1].toTensor().dim() == + expect_result_list[1].toTensor().dim()); + AT_ASSERT( + actual_result_list[2].toTensor().equal(expect_result_list[2].toTensor())); + AT_ASSERT( + actual_result_list[3].toTensor().equal(expect_result_list[3].toTensor())); + ASSERT_EQ( + actual_result_list[4].toStringRef(), expect_result_list[4].toStringRef()); + ASSERT_EQ(actual_result_list[5].toBool(), expect_result_list[5].toBool()); + ASSERT_EQ(actual_result_list[6].toBool(), expect_result_list[6].toBool()); + ASSERT_EQ(actual_result_list[7].toBool(), expect_result_list[7].toBool()); + AT_ASSERT( + actual_result_list[8].toTensor().equal(expect_result_list[8].toTensor())); + ASSERT_EQ( + actual_result_list[9].toStringRef(), expect_result_list[9].toStringRef()); + ASSERT_EQ(actual_result_list[10].toInt(), expect_result_list[10].toInt()); + ASSERT_EQ(actual_result_list[11].toBool(), expect_result_list[11].toBool()); } void runAndCheckTorchScriptModel( std::stringstream& input_model_stream, const std::vector& input_data, - const std::vector& expect_result_list, - const int64_t expect_version) { + const std::vector& expect_result_list, + const uint64_t expect_version) { auto actual_version = _get_model_bytecode_version(input_model_stream); AT_ASSERT(actual_version == expect_version); @@ -600,8 +615,8 @@ void runAndCheckTorchScriptModel( void runAndCheckBytecodeModel( std::stringstream& input_model_stream, const std::vector& input_data, - const std::vector& expect_result_list, - const int64_t expect_version) { + const std::vector& expect_result_list, + const uint64_t expect_version) { auto actual_version = _get_model_bytecode_version(input_model_stream); AT_ASSERT(actual_version == expect_version); @@ -618,14 +633,15 @@ void runAndCheckBytecodeModel( void backportAllVersionCheck( std::stringstream& test_model_file_stream, std::vector& input_data, - std::vector& expect_result_list, - const int64_t expect_from_version) { + std::vector& expect_result_list, + const uint64_t expect_from_version) { auto from_version = _get_model_bytecode_version(test_model_file_stream); AT_ASSERT(from_version == expect_from_version); + AT_ASSERT(from_version > 0); // Backport script_module_v5.ptl to an older version constexpr int64_t minimum_to_version = 4; - int64_t current_to_version = from_version - 1; + auto current_to_version = from_version - 1; // Verify all candidate to_version work as expected. All backport to version // larger than minimum_to_version should success. 
@@ -641,12 +657,14 @@ void backportAllVersionCheck( // Check backport model version auto backport_version = _get_model_bytecode_version(oss); + backport_version = _get_model_bytecode_version(oss); AT_ASSERT(backport_version == current_to_version); // Load and run the backport model, then compare the result with expect // result runAndCheckBytecodeModel( oss, input_data, expect_result_list, current_to_version); + oss.seekg(0, oss.beg); runAndCheckTorchScriptModel( oss, input_data, expect_result_list, current_to_version); @@ -668,6 +686,9 @@ TEST(LiteInterpreterTest, BackPortByteCodeModelAllVersions) { // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) module.register_parameter("bias", torch::ones({20}), false); module.define(R"( + def fn(self, x:float=1.0): + return x + def forward(self, input): x1 = torch.zeros(2, 2) x2 = torch.empty_like(torch.empty(2, 2)) @@ -677,21 +698,52 @@ TEST(LiteInterpreterTest, BackPortByteCodeModelAllVersions) { x = 2 * torch.ones(1) h = torch.ones(1) torch.add(x, h, out=x) - return (x1, x2, x3, x) - )"); + device = torch.ones(1, 1).cpu().device.type + is_cuda = x1.is_cuda + bool_val = True + check_is = [] is None + check_is_not = [1] is not None + check_not = not bool_val + num_to_tensor = torch.tensor([self.fn()]) + d = {"a": "abc"} + check_dict_index = d["a"] + check_dim = x1.dim() + return ( + x1, x2, x3, x, device, is_cuda, check_is, + check_is_not, num_to_tensor, check_dict_index, + check_dim, check_not + ) + )"); torch::jit::Module module_freeze = freeze(module); std::stringstream input_model_stream; +#if defined(ENABLE_FLATBUFFER) + module_freeze._save_for_mobile( + input_model_stream, + /*extra_files=*/{}, + /*save_mobile_debug_info=*/false, + /*use_flatbuffer=*/true); +#else module_freeze._save_for_mobile(input_model_stream); +#endif std::vector input_data = std::vector({torch::ones({1, 1, 28, 28})}); - std::vector expect_result_list; + std::vector expect_result_list; expect_result_list.emplace_back(at::ones({2, 2}, ScalarType::Float) * 0); expect_result_list.emplace_back(at::ones({2, 2}, ScalarType::Float)); expect_result_list.emplace_back( at::ones({1, 20, 24, 24}, ScalarType::Float) * 26); expect_result_list.emplace_back(3 * at::ones({1})); + // "cpu" False, False, True, tensor(1), "abc", 2, False) + expect_result_list.emplace_back(c10::IValue("cpu")); + expect_result_list.emplace_back(c10::IValue(false)); + expect_result_list.emplace_back(c10::IValue(false)); + expect_result_list.emplace_back(c10::IValue(true)); + expect_result_list.emplace_back(c10::IValue(at::ones({1}))); + expect_result_list.emplace_back(c10::IValue("abc")); + expect_result_list.emplace_back(c10::IValue(2)); + expect_result_list.emplace_back(c10::IValue(false)); backportAllVersionCheck( input_model_stream, @@ -950,7 +1002,6 @@ TEST(LiteInterpreterTest, ExtraFiles) { module->_save_for_mobile(oss, extra_files); std::istringstream iss(oss.str()); - caffe2::serialize::IStreamAdapter adapter{&iss}; std::unordered_map loaded_extra_files; loaded_extra_files["metadata.json"] = ""; torch::jit::_load_for_mobile(iss, torch::kCPU, loaded_extra_files); @@ -965,7 +1016,7 @@ TEST(LiteInterpreterTest, ExtraFiles) { loaded_extra_files[file_name.substr(6)] = ""; } } - + iss.seekg(0, iss.beg); torch::jit::_load_for_mobile(iss, torch::kCPU, loaded_extra_files); ASSERT_EQ(loaded_extra_files["metadata.json"], "abc"); ASSERT_EQ(loaded_extra_files["mobile_info.json"], "{\"key\": 23}"); @@ -1145,7 +1196,6 @@ TEST(RunTimeTest, ParseOperator) { function.get()); parseOperators( 
std::move(*c10::ivalue::Tuple::create(operators)).elements(), - model_version, 1, function.get()); const size_t rsize = 5; @@ -1528,7 +1578,6 @@ TEST(RunTimeTest, RuntimeCall) { foo.get()); parseOperators( std::move(*c10::ivalue::Tuple::create(operatorsFoo)).elements(), - model_version, 1, foo.get()); parseConstants( @@ -1545,7 +1594,6 @@ TEST(RunTimeTest, RuntimeCall) { call.get()); parseOperators( std::move(*c10::ivalue::Tuple::create(operatorsCall)).elements(), - model_version, 1, call.get()); parseConstants( @@ -2043,16 +2091,14 @@ TEST(LiteInterpreterUpgraderTest, Upgrader) { std::vector upgrader_functions; for (auto& byteCodeFunctionWithOperator : getUpgraderBytecodeList()) { + byteCodeFunctionWithOperator.function.initialize_operators(true); ASSERT_EQ( byteCodeFunctionWithOperator.function.get_code().operators_.size(), byteCodeFunctionWithOperator.function.get_code().op_names_.size()); if (byteCodeFunctionWithOperator.function.get_code().operators_.empty()) { for (const auto& op : byteCodeFunctionWithOperator.operators) { byteCodeFunctionWithOperator.function.append_operator( - op.name, - op.overload_name, - op.num_specified_args, - caffe2::serialize::kMaxSupportedFileFormatVersion); + op.name, op.overload_name, op.num_specified_args); } } upgrader_functions.push_back(byteCodeFunctionWithOperator.function); diff --git a/test/cpp/jit/test_lite_trainer.cpp b/test/cpp/jit/test_lite_trainer.cpp index cf3040f4fba4..ede1c3a8355b 100644 --- a/test/cpp/jit/test_lite_trainer.cpp +++ b/test/cpp/jit/test_lite_trainer.cpp @@ -158,6 +158,139 @@ TEST(MobileTest, SaveLoadParametersEmpty) { AT_ASSERT(mobile_params.size() == 0); } +TEST(MobileTest, SaveParametersDefaultsToZip) { + // Save some empty parameters. + std::map empty_parameters; + std::stringstream ss_data; + _save_parameters(empty_parameters, ss_data); + + // Verify that parameters were serialized to a ZIP container. + EXPECT_GE(ss_data.str().size(), 4); + EXPECT_EQ(ss_data.str()[0], 'P'); + EXPECT_EQ(ss_data.str()[1], 'K'); + EXPECT_EQ(ss_data.str()[2], '\x03'); + EXPECT_EQ(ss_data.str()[3], '\x04'); +} + +#if defined(ENABLE_FLATBUFFER) +TEST(MobileTest, SaveParametersCanUseFlatbuffer) { + // Save some empty parameters using flatbuffer. + std::map empty_parameters; + std::stringstream ss_data; + _save_parameters(empty_parameters, ss_data, /*use_flatbuffer=*/true); + + // Verify that parameters were serialized to a flatbuffer. The flatbuffer + // magic bytes should be at offsets 4..7. The first four bytes contain an + // offset to the actual flatbuffer data. + EXPECT_GE(ss_data.str().size(), 8); + EXPECT_EQ(ss_data.str()[4], 'P'); + EXPECT_EQ(ss_data.str()[5], 'T'); + EXPECT_EQ(ss_data.str()[6], 'M'); + EXPECT_EQ(ss_data.str()[7], 'F'); +} +#else // !defined(ENABLE_FLATBUFFER) +TEST(MobileTest, SaveParametersThrowsWithoutFlatbufferSupport) { + // Some empty parameters to try saving. + std::map empty_parameters; + std::stringstream ss_data; + + // Save using flatbuffers should fail when support isn't compiled in. Make + // sure we get the exception that explicitly mentions the lack of flatbuffer + // support. 
+ try { + _save_parameters(empty_parameters, ss_data, /*use_flatbuffer=*/true); + FAIL() << "_save_parameters should have thrown"; + } catch (const ::c10::Error& e) { + static const std::string kExpectedSubstring = + "build hasn't enabled flatbuffer"; + EXPECT_TRUE( + std::string(e.msg()).find(kExpectedSubstring) != std::string::npos) + << "Exception message does not contain expected substring \"" + << kExpectedSubstring << "\": actual message \"" << e.msg() << "\""; + } catch (...) { + FAIL() << "Unexpected exception type"; + } +} +#endif // !defined(ENABLE_FLATBUFFER) + +#if defined(ENABLE_FLATBUFFER) +TEST(MobileTest, SaveLoadParametersUsingFlatbuffers) { + // Create some simple parameters to save. + std::map input_params; + input_params["four_by_ones"] = 4 * torch::ones({}); + input_params["three_by_ones"] = 3 * torch::ones({}); + + // Serialize them using flatbuffers. + std::stringstream data; + _save_parameters(input_params, data, /*use_flatbuffer=*/true); + + // The flatbuffer magic bytes should be at offsets 4..7. + EXPECT_EQ(data.str()[4], 'P'); + EXPECT_EQ(data.str()[5], 'T'); + EXPECT_EQ(data.str()[6], 'M'); + EXPECT_EQ(data.str()[7], 'F'); + + // Read them back and check that they survived the trip. + auto output_params = _load_parameters(data); + EXPECT_EQ(output_params.size(), 2); + { + auto four_by_ones = 4 * torch::ones({}); + EXPECT_EQ( + output_params["four_by_ones"].item(), four_by_ones.item()); + } + { + auto three_by_ones = 3 * torch::ones({}); + EXPECT_EQ( + output_params["three_by_ones"].item(), three_by_ones.item()); + } +} +#else // !defined(ENABLE_FLATBUFFER) +TEST(MobileTest, LoadParametersFailsWithoutFlatbufferSupport) { + // Create some data that looks like a flatbuffer header. + std::stringstream data; + data << "abcd" + << "PTMF" // Flatbuffer magic + << "ijkl"; + + // Loading the "flatbuffer" data should fail. Make sure we see the expected + // exception, not just any exception; since this isn't properly-formed + // flatbuffer data, any attempt to parse it might throw a different error type + // or message, but we don't expect anyone to try parsing it. + try { + _load_parameters(data); + FAIL() << "_load_parameters should have thrown"; + } catch (const ::c10::Error& e) { + static const std::string kExpectedSubstring = + "build hasn't enabled flatbuffer"; + EXPECT_TRUE( + std::string(e.msg()).find(kExpectedSubstring) != std::string::npos) + << "Exception message does not contain expected substring \"" + << kExpectedSubstring << "\": actual message \"" << e.msg() << "\""; + } catch (...) { + FAIL() << "Unexpected exception type"; + } +} +#endif // !defined(ENABLE_FLATBUFFER) + +TEST(MobileTest, LoadParametersUnexpectedFormatShouldThrow) { + // Manually create some data that doesn't look like a ZIP or Flatbuffer file. + // Make sure it's longer than 8 bytes, since getFileFormat() needs that much + // data to detect the type. + std::stringstream bad_data; + bad_data << "abcd" + << "efgh" + << "ijkl"; + + // Loading parameters from it should throw an exception. + EXPECT_ANY_THROW(_load_parameters(bad_data)); +} + +TEST(MobileTest, LoadParametersEmptyDataShouldThrow) { + // Loading parameters from an empty data stream should throw an exception. 
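  // (Hedged aside) Taken together, the checks above amount to a simple format
  // probe over the first 8 bytes of the stream, roughly:
  //   bool looks_like_zip = header.compare(0, 4, "PK\x03\x04") == 0;
  //   bool looks_like_flatbuffer = header.compare(4, 4, "PTMF") == 0;
  // Streams shorter than 8 bytes, or matching neither pattern, are rejected.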
+ std::stringstream empty; + EXPECT_ANY_THROW(_load_parameters(empty)); +} + TEST(LiteTrainerTest, SGD) { Module m("m"); m.register_parameter("foo", torch::ones({1}, at::requires_grad()), false); diff --git a/test/cpp/jit/test_misc.cpp b/test/cpp/jit/test_misc.cpp index 099588d90d45..88d447fdf2d7 100644 --- a/test/cpp/jit/test_misc.cpp +++ b/test/cpp/jit/test_misc.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -42,15 +43,18 @@ #include #include #include +#include #include #include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -1379,6 +1383,39 @@ TEST(ThreadLocalDebugInfoTest, Basic) { } } +TEST(TestSymIntArrayRef, BasicConversion) { + const size_t X = 2, Y = 4, Z = 5; + std::vector tgt_size_v{2, 4, 5}; + std::vector tgt_size({SymInt(X), SymInt(Y), SymInt(Z)}); + auto a = at::randn({1, 4, 1}, at::kCPU); + auto b = a.expand(tgt_size); + auto c = a.expand(tgt_size_v); + ASSERT_TRUE(torch::allclose(b, c)); +} + +TEST(TestSymInt, NarrowCopyWithSymbolicInt) { + static const size_t LENGTH = 5; + auto a = at::randn({10}, at::kCPU); + c10::SymInt si(LENGTH); + auto b = a.narrow_copy(0, 0, si); + auto c = a.narrow(0, 0, LENGTH); + ASSERT_TRUE(torch::allclose(b, c)); +} + +TEST(TestSymInt, NarrowCopy) { + static const size_t LENGTH = 5; + auto a = at::randn({10}, at::kCPU); + auto b = a.narrow_copy(0, 0, LENGTH); + auto c = a.narrow(0, 0, LENGTH); + ASSERT_TRUE(torch::allclose(b, c)); +} + +TEST(TestSymInt, AddSymbolicInt) { + c10::SymInt a(5); + c10::SymInt b(3); + ASSERT_TRUE((a + b).expect_int() == 8); +} + TEST(FallbackGraphsTest, Basic) { static const auto nestGraphIntoFallbackGraph = [](const std::shared_ptr& graph) { @@ -2867,6 +2904,33 @@ graph(%x.1 : Tensor): testing::FileCheck().check_not("aten::relu_")->run(*graph); } +TEST(TestRegisterShapeOp, Basic) { + auto graph = std::make_shared(); + std::unordered_map vmap; + parseIR( + R"IR( +graph(): + %2 : int = prim::Constant[value=5]() + %3: int[] = prim::ListConstruct(%2, %2) + return (%3))IR", + &*graph, + vmap); + + auto g2 = std::make_shared(); + parseIR( + R"IR( +graph(): + %2 : Tensor = prim::MakeTestTensor() + return (%2))IR", + &*g2, + vmap); + + const FunctionSchema& schema = g2->nodes().begin()->schema(); + torch::jit::RegisterShapeComputeGraphForSchema(schema, graph); + PropagateShapesOnGraph(g2); + testing::FileCheck().check("5, 5")->run(*g2); +} + TEST(TestFunctionalToInplaceActivation, Basic) { auto graph = std::make_shared(); std::unordered_map vmap; @@ -2884,6 +2948,70 @@ graph(%x.1 : Tensor): testing::FileCheck().check_not("aten::relu(")->run(*graph); } +TEST(TestFunctionExecutor, SimpleExecutorTest) { + auto graph = std::make_shared(); + parseIR( + R"IR( +graph(%x.1 : Tensor): + %2 : int = prim::Constant[value=1]() + %x.3 : Tensor = aten::add(%x.1, %2, %2) + %y : Tensor = aten::relu(%x.3) + return (%y))IR", + &*graph); + { + auto func = torch::make_unique( + "name", graph, [](GraphFunction&) {}, ExecutorExecutionMode::PROFILING); + auto a = at::rand({2, 2, 2}, TensorOptions(kCPU).dtype(at::kFloat)); + Stack stack = {a}; + func->run(stack); + auto g = lastExecutedOptimizedGraph(); + testing::FileCheck() + .check("prim::profile") + ->check("aten::add") + ->check("aten::relu") + ->run(*g); + } + { + auto func = torch::make_unique( + "name", graph, [](GraphFunction&) {}, ExecutorExecutionMode::SIMPLE); + auto a = at::rand({2, 2, 2}, TensorOptions(kCPU).dtype(at::kFloat)); + Stack stack = {a}; + func->run(stack); + auto g = 
func->getDebugState().graph; + testing::FileCheck() + .check_not("prim::profile") + ->check("aten::add") + ->check("aten::relu") + ->run(*g); + } +} + +TEST(TestFunctionExecutor, RunDecompositionTest) { + static auto* func = torch::jit::GetDecompositionExecutor( + "aten::var(Tensor self, bool unbiased=True) -> Tensor"); + for (bool unbiased : {true, false}) { + auto input = at::rand({4, 4}); + Stack stack = {input, unbiased}; + func->run(stack); + at::Tensor out = pop(stack).toTensor(); + ASSERT_TRUE(at::allclose(out, input.var(unbiased))); + } +} + +TEST(TestShapeGraphLinting, Basic) { + auto schemas = RegisteredShapeComputeSchemas(); + for (const auto& schema : schemas) { + // arange does not acually support complex, leave as + // union[int, float] for now + if (schema->name() == "aten::arange") { + continue; + } + auto g = shapeComputeGraphForSchema(*schema); + TORCH_INTERNAL_ASSERT(g); + LintShapeComputeGraph(schema, *g); + } +} + // TODO: move to test_kernel when global settings are explicit // fusion parameters class Composed : public ::testing::Test { diff --git a/test/cpp/jit/test_save_load.cpp b/test/cpp/jit/test_save_load.cpp index 88bff7ea93e8..6ecf67917ec0 100644 --- a/test/cpp/jit/test_save_load.cpp +++ b/test/cpp/jit/test_save_load.cpp @@ -3,7 +3,9 @@ #include #include +#include #include +#include #include #include #include @@ -13,6 +15,32 @@ namespace torch { namespace jit { +namespace { + +Module roundtripThroughMobile(const Module& m) { + ExtraFilesMap files; + std::vector constants; + jitModuleToPythonCodeAndConstants(m, &files, &constants); + CompilationOptions options; + mobile::Module mobilem = jitModuleToMobile(m, options); + return jitModuleFromSourceAndConstants( + mobilem._ivalue(), files, constants, 8); +} + +template +inline void expectThrowsEq(Functor&& functor, const char* expectedMessage) { + try { + std::forward(functor)(); + } catch (const Error& e) { + EXPECT_STREQ(e.what_without_backtrace(), expectedMessage); + return; + } + ADD_FAILURE() << "Expected to throw exception with message \"" + << expectedMessage << "\" but didn't throw"; +} + +} // namespace + TEST(SerializationTest, ExtraFilesHookPreference) { // Tests that an extra file written explicitly has precedence over // extra files written by a hook @@ -149,5 +177,87 @@ TEST(SerializationTest, TestJitStream_CUDA) { // Check if both the output tensors are equal ASSERT_TRUE(op.equal(c)); } + +TEST(TestSourceRoundTrip, UpsampleNearest2d) { + Module m("m"); + m.define(R"( + def forward(self, input: Tensor, scale:float): + return torch.upsample_nearest2d(input, [1, 1], float(scale), float(scale)) + )"); + + std::vector inputs; + inputs.emplace_back(torch::rand({1, 3, 128, 128})); + inputs.emplace_back(at::Scalar(2.0)); + auto ref = m.forward(inputs); + + Module m2 = roundtripThroughMobile(m); + auto res = m2.forward(inputs); + + auto resd = res.toTensor(); + auto refd = ref.toTensor(); + ASSERT_TRUE(resd.equal(refd)); +} + +TEST(TestSourceRoundTrip, CheckAttrAccess) { + Module m("m"); + m.register_attribute("mobile_optimized", BoolType::get(), true); + Module m2 = roundtripThroughMobile(m); + bool mobile_optimized = m2.attr("mobile_optimized", false).toBool(); + AT_ASSERT(mobile_optimized); +} + +TEST(TestSourceRoundTrip, + MethodInvocation) { // NOLINT (use =delete in gtest) + const std::vector test_programs{ + // test invoking a method with default parameter + R"( + def test_func(self, x, b : int = 4): + return self.foo + x + b + )", + // inner method call with default parameter (gets inlined) + R"( + 
def add_with_default_arg(self, x, b : int = 4): + return self.foo + x + b + def test_func(self, x): + return self.add_with_default_arg(x) # invoke method w/ default arg + )", + // simple method call + R"( + def test_func(self, x): + b = 4 + return self.foo + x + b + )", + }; + for (const auto& test_program : test_programs) { + Module m("m"); + m.register_parameter("foo", torch::ones({}), false); + m.define(test_program); + + const int fortyTwo = 42; // (keep linter happy) + auto minput = fortyTwo * torch::ones({}); + auto ref = m.run_method("test_func", minput); + + Module m2 = roundtripThroughMobile(m); + const auto& test_func = m2.get_method("test_func"); + IValue res; + for (int i = 0; i < 3; ++i) { + res = test_func({minput}); + } + + auto resd = res.toTensor().item(); + auto refd = ref.toTensor().item(); + AT_ASSERT(resd == refd); + } +} + +TEST(SerializationTest, ParentDirNotExist) { + expectThrowsEq( + []() { + auto t = torch::nn::Linear(5, 5); + torch::save(t, "./doesnotexist/file.pt"); + }, + "Parent directory ./doesnotexist does not exist."); +} + } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_shape_analysis.cpp b/test/cpp/jit/test_shape_analysis.cpp index baf9f16e6e79..15f41da22952 100644 --- a/test/cpp/jit/test_shape_analysis.cpp +++ b/test/cpp/jit/test_shape_analysis.cpp @@ -8,6 +8,8 @@ #include #include #include +#include +#include #include #include #include @@ -30,7 +32,6 @@ Node* findNode(std::shared_ptr& g, Symbol k) { } TORCH_INTERNAL_ASSERT(false, "Couldn't find node"); } - } // namespace TEST(ShapeAnalysisTest, DynamicShapesFusion) { @@ -169,7 +170,7 @@ TEST(ShapeAnalysisTest, DynamicShapesFusion) { /* Test guard behaves correctly at runtime and symbolic shapes are computed - correctly. As we don't have have TE Kernel support for dynamic shapes we're + correctly. 
As we don't have TE Kernel support for dynamic shapes we're going to return all of the computed runtime symbolic dimensions as outputs of the graph on guard success, and return None on guard failure */ @@ -292,5 +293,191 @@ TEST(ShapeAnalysisTest, MovingConstantOutOfFusionGroups) { ->run(*g); } +namespace { + +// NOLINTNEXTLINE(bugprone-easily-swappable-parameters) +void assertShapeEqual(c10::SymbolicShape& a, c10::SymbolicShape& e) { + auto a_canonical = CanonicalizedSymbolicShape(a); + auto e_canonical = CanonicalizedSymbolicShape(e); + EXPECT_EQ(a_canonical, e_canonical); +} + +void assertShapeEqual( + c10::optional>& actual, + std::vector> expected) { + ASSERT_TRUE(actual.has_value()); + ASSERT_EQ(actual->size(), 1); + + auto symb_expected = c10::SymbolicShape(expected); + assertShapeEqual(actual->at(0), symb_expected); +} + +const FunctionSchema* getSchema(const char* name) { + return &(getOperatorForLiteral(name)->schema()); +} +} // namespace + +TEST(ShapeAnalysisTest, SymbolicShapeAPI) { + // Figure out how to fetch a function schema + + // Ask someone else how to create a function schema / operator in C++ + auto schema = getSchema( + "aten::sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor"); + + c10::IValue const_size_1 = std::vector{64, 56, 56}; + c10::IValue const_size_2 = std::vector{1, 56, 56}; + + // Check vector initializer list syntax + c10::optional sym_dim = c10::nullopt; + c10::SymbolicShape ss_concrete = + std::vector>{1, 56, 56}; + c10::SymbolicShape ss1 = std::vector>{sym_dim, 56, 56}; + c10::SymbolicShape ss2 = + std::vector>{64, sym_dim, sym_dim}; + c10::SymbolicShape ss3 = + std::vector>{sym_dim, sym_dim, sym_dim, sym_dim}; + + auto res = calculateSymbolicShapesOnOp( + schema, std::vector{const_size_1, const_size_1}); + assertShapeEqual(res, {64, 56, 56}); + + res = calculateSymbolicShapesOnOp( + schema, std::vector{const_size_1, const_size_2}); + assertShapeEqual(res, {64, 56, 56}); + + res = calculateSymbolicShapesOnOp( + schema, std::vector{const_size_1, ss1}); + assertShapeEqual(res, {64, 56, 56}); + + res = calculateSymbolicShapesOnOp( + schema, std::vector{const_size_2, ss1}); + assertShapeEqual(res, {sym_dim, 56, 56}); + + res = calculateSymbolicShapesOnOp( + schema, std::vector{ss_concrete, ss2}); + assertShapeEqual(res, {64, 56, 56}); + + res = calculateSymbolicShapesOnOp(schema, std::vector{ss2, ss3}); + assertShapeEqual(res, {sym_dim, 64, sym_dim, sym_dim}); +} + +TEST(ShapeAnalysisTest, SymbolicShapeCaching) { + clear_shape_cache(); + auto schema = getSchema("aten::mm(Tensor self, Tensor mat2) -> Tensor"); + + c10::IValue const_size_1 = std::vector{64, 56}; + c10::IValue const_size_2 = std::vector{64, 56}; + c10::IValue const_size_3 = std::vector{64, 20}; + + c10::optional sym_dim = c10::nullopt; + c10::SymbolicShape ss1 = c10::SymbolicShape({sym_dim, 64}); + c10::SymbolicShape ss2 = c10::SymbolicShape({sym_dim, 64}); + c10::SymbolicShape ss3 = c10::SymbolicShape({sym_dim, sym_dim}); + + auto res = calculateSymbolicShapesOnOp(schema, {ss1, const_size_1}); + assertShapeEqual(res, {sym_dim, 56}); + auto res1_val = res->at(0); + + // The exact same arguments should return the exact same result + res = calculateSymbolicShapesOnOp(schema, {ss1, const_size_1}); + auto res2_val = res->at(0); + EXPECT_EQ(res1_val, res2_val); + EXPECT_EQ(get_shape_cache_size(), 1); + + // Same shape but different symbols should return same shape + // but different symbolic indicies + res = calculateSymbolicShapesOnOp(schema, {ss2, const_size_2}); + auto 
res3_val = res->at(0); + + assertShapeEqual(res3_val, res2_val); + EXPECT_NE(res3_val, res2_val); + EXPECT_EQ(get_shape_cache_size(), 1); + + // Different concrete shape should be cached separately + res = calculateSymbolicShapesOnOp(schema, {ss1, const_size_3}); + assertShapeEqual(res, {sym_dim, 20}); + EXPECT_EQ(get_shape_cache_size(), 2); + + res = calculateSymbolicShapesOnOp(schema, {ss3, const_size_3}); + assertShapeEqual(res, {sym_dim, 20}); + EXPECT_EQ(get_shape_cache_size(), 3); + + res = calculateSymbolicShapesOnOp(schema, {ss3, ss3}); + assertShapeEqual(res, {sym_dim, sym_dim}); + EXPECT_EQ(get_shape_cache_size(), 4); +} + +TEST(ShapeAnalysisTest, ShapeCacheMultipleFns) { + clear_shape_cache(); + + auto squeeze_op = + getSchema("aten::squeeze.dim(Tensor(a) self, int dim) -> Tensor(a)"); + auto mul_tensor = + getSchema("aten::mul.Tensor(Tensor self, Tensor other) -> Tensor"); + auto mul_scalar = + getSchema("aten::mul.Scalar(Tensor self, Scalar other) -> Tensor"); + auto div_tensor = + getSchema("aten::div.Tensor(Tensor self, Tensor other) -> Tensor"); + auto matmul = getSchema("aten::mm(Tensor self, Tensor mat2) -> Tensor"); + + c10::IValue const_int = 1; + + c10::optional sym_dim = c10::nullopt; + c10::SymbolicShape ss1 = c10::SymbolicShape({sym_dim, 64}); + + auto res = calculateSymbolicShapesOnOp(squeeze_op, {ss1, const_int}); + assertShapeEqual(res, {sym_dim, 64}); + + // Show that cache can handle multiple functions + res = calculateSymbolicShapesOnOp(mul_scalar, {ss1, const_int}); + assertShapeEqual(res, {sym_dim, 64}); + EXPECT_EQ(get_shape_cache_size(), 2); + + res = calculateSymbolicShapesOnOp(mul_tensor, {ss1, ss1}); + assertShapeEqual(res, {sym_dim, 64}); + EXPECT_EQ(get_shape_cache_size(), 3); + + // Even when the expected outcome is the same, should not collide + res = calculateSymbolicShapesOnOp(div_tensor, {ss1, ss1}); + assertShapeEqual(res, {sym_dim, 64}); + EXPECT_EQ(get_shape_cache_size(), 4); + + // Don't lose cached objects + res = calculateSymbolicShapesOnOp(mul_scalar, {ss1, const_int}); + assertShapeEqual(res, {sym_dim, 64}); + EXPECT_EQ(get_shape_cache_size(), 4); + + res = calculateSymbolicShapesOnOp(matmul, {ss1, ss1}); + // SSA can infer that sym_dim is 64 as both tensors + // use the same sym_dim + assertShapeEqual(res, {64, 64}); + EXPECT_EQ(get_shape_cache_size(), 5); +} + +TEST(ShapeAnalysisTest, TestShapeMultipleReturns) { + clear_shape_cache(); + + auto max_dim_op = getSchema( + "aten::max.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)"); + c10::IValue const_int = 1; + c10::IValue false_ival = false; + + c10::optional sym_dim = c10::nullopt; + c10::SymbolicShape ss1 = c10::SymbolicShape({sym_dim, 64}); + c10::SymbolicShape ss2 = c10::SymbolicShape({sym_dim, 64}); + + auto res = + calculateSymbolicShapesOnOp(max_dim_op, {ss1, const_int, false_ival}); + c10::SymbolicShape expected_res = c10::SymbolicShape({sym_dim}); + assertShapeEqual(res->at(0), expected_res); + // res0 and res1 should share the same symbolic symbol + EXPECT_EQ(res->at(0), res->at(1)); + + // Also test that the shape cache also returns consistent result shapes + res = calculateSymbolicShapesOnOp(max_dim_op, {ss2, const_int, false_ival}); + assertShapeEqual(res->at(0), expected_res); + EXPECT_EQ(res->at(0), res->at(1)); + EXPECT_EQ(get_shape_cache_size(), 1); +} } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_utils.h b/test/cpp/jit/test_utils.h index 1a1e1b82b10e..a1b1a76c851c 100644 --- a/test/cpp/jit/test_utils.h +++ 
b/test/cpp/jit/test_utils.h @@ -17,37 +17,33 @@ static inline void trim(std::string& s) { [](unsigned char ch) { return !std::isspace(ch); }) .base(), s.end()); - for (int64_t i = 0; i < s.size(); ++i) { - if (s[i] == '\n') { + for (size_t i = 0; i < s.size(); ++i) { + while (i < s.size() && s[i] == '\n') { s.erase(i, 1); - i--; } } - for (int64_t i = 0; i < s.size(); ++i) { + for (size_t i = 0; i < s.size(); ++i) { if (s[i] == ' ') { - for (int64_t j = i + 1; j < s.size(); j++) { - if (s[j] == ' ') { - s.erase(j, 1); - j--; - } else { - break; - } + while (i + 1 < s.size() && s[i + 1] == ' ') { + s.erase(i + 1, 1); } } } } } // namespace -#define ASSERT_THROWS_WITH_MESSAGE(statement, substring) \ - try { \ - (void)statement; \ - FAIL(); \ - } catch (const std::exception& e) { \ - std::string substring_s(substring); \ - trim(substring_s); \ - auto exception_string = std::string(e.what()); \ - trim(exception_string); \ - ASSERT_NE(exception_string.find(substring_s), std::string::npos); \ +#define ASSERT_THROWS_WITH_MESSAGE(statement, substring) \ + try { \ + (void)statement; \ + FAIL(); \ + } catch (const std::exception& e) { \ + std::string substring_s(substring); \ + trim(substring_s); \ + auto exception_string = std::string(e.what()); \ + trim(exception_string); \ + ASSERT_NE(exception_string.find(substring_s), std::string::npos) \ + << " Error was: \n" \ + << exception_string; \ } namespace torch { diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_float_v2.ptl b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_float_v2.ptl index be67cecf9705..ddee6be4c35a 100644 Binary files a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_float_v2.ptl and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_float_v2.ptl differ diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_float_v2.ptl.ff b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_float_v2.ptl.ff new file mode 100644 index 000000000000..4f62dbfbeb80 Binary files /dev/null and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_float_v2.ptl.ff differ diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_float_v2.ptl b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_float_v2.ptl index e5663224ac76..cb36f9aeba8b 100644 Binary files a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_float_v2.ptl and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_float_v2.ptl differ diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_float_v2.ptl.ff b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_float_v2.ptl.ff new file mode 100644 index 000000000000..01891bc9e4a9 Binary files /dev/null and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_float_v2.ptl.ff differ diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_int_v2.ptl b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_int_v2.ptl index 8698001427a9..443074fe7130 100644 Binary files a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_int_v2.ptl and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_int_v2.ptl differ diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_int_v2.ptl.ff b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_int_v2.ptl.ff new file mode 100644 index 000000000000..f932d478d0ab Binary files /dev/null and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_int_v2.ptl.ff differ diff --git 
a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_int_v2.ptl b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_int_v2.ptl index c52d92b29f44..ac8b1b918de7 100644 Binary files a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_int_v2.ptl and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_int_v2.ptl differ diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_int_v2.ptl.ff b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_int_v2.ptl.ff new file mode 100644 index 000000000000..d20ba9bf4820 Binary files /dev/null and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_int_v2.ptl.ff differ diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_float_v2.ptl b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_float_v2.ptl index 749614fa5309..323aa42dde4e 100644 Binary files a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_float_v2.ptl and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_float_v2.ptl differ diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_float_v2.ptl.ff b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_float_v2.ptl.ff new file mode 100644 index 000000000000..7299062135c9 Binary files /dev/null and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_float_v2.ptl.ff differ diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_int_v2.ptl b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_int_v2.ptl index b20c456058be..6d06dea6b589 100644 Binary files a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_int_v2.ptl and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_int_v2.ptl differ diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_int_v2.ptl.ff b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_int_v2.ptl.ff new file mode 100644 index 000000000000..700a0e5bae11 Binary files /dev/null and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_int_v2.ptl.ff differ diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_scalar_v2.ptl b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_scalar_v2.ptl index f33f3a8cf8de..4fd551d073ae 100644 Binary files a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_scalar_v2.ptl and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_scalar_v2.ptl differ diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_scalar_v2.ptl.ff b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_scalar_v2.ptl.ff new file mode 100644 index 000000000000..0b1200312851 Binary files /dev/null and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_scalar_v2.ptl.ff differ diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_tensor_inplace_v2.ptl b/test/cpp/jit/upgrader_models/test_versioned_div_tensor_inplace_v2.ptl index ac7cc7479e79..9680713a83e2 100644 Binary files a/test/cpp/jit/upgrader_models/test_versioned_div_tensor_inplace_v2.ptl and b/test/cpp/jit/upgrader_models/test_versioned_div_tensor_inplace_v2.ptl differ diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_tensor_inplace_v2.ptl.ff b/test/cpp/jit/upgrader_models/test_versioned_div_tensor_inplace_v2.ptl.ff new file mode 100644 index 000000000000..ce5daf444635 Binary files /dev/null and b/test/cpp/jit/upgrader_models/test_versioned_div_tensor_inplace_v2.ptl.ff differ diff --git 
a/test/cpp/jit/upgrader_models/test_versioned_div_tensor_out_v2.ptl b/test/cpp/jit/upgrader_models/test_versioned_div_tensor_out_v2.ptl index 0b70614b0936..0381636677b5 100644 Binary files a/test/cpp/jit/upgrader_models/test_versioned_div_tensor_out_v2.ptl and b/test/cpp/jit/upgrader_models/test_versioned_div_tensor_out_v2.ptl differ diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_tensor_out_v2.ptl.ff b/test/cpp/jit/upgrader_models/test_versioned_div_tensor_out_v2.ptl.ff new file mode 100644 index 000000000000..46b57c83fe78 Binary files /dev/null and b/test/cpp/jit/upgrader_models/test_versioned_div_tensor_out_v2.ptl.ff differ diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_tensor_v2.ptl b/test/cpp/jit/upgrader_models/test_versioned_div_tensor_v2.ptl index 5f6ae1a90b1e..21792d35b892 100644 Binary files a/test/cpp/jit/upgrader_models/test_versioned_div_tensor_v2.ptl and b/test/cpp/jit/upgrader_models/test_versioned_div_tensor_v2.ptl differ diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_tensor_v2.ptl.ff b/test/cpp/jit/upgrader_models/test_versioned_div_tensor_v2.ptl.ff new file mode 100644 index 000000000000..963070db5149 Binary files /dev/null and b/test/cpp/jit/upgrader_models/test_versioned_div_tensor_v2.ptl.ff differ diff --git a/test/cpp/lazy/CMakeLists.txt b/test/cpp/lazy/CMakeLists.txt index ede4308816cf..4d98400323fb 100644 --- a/test/cpp/lazy/CMakeLists.txt +++ b/test/cpp/lazy/CMakeLists.txt @@ -9,9 +9,16 @@ set(LAZY_TEST_SRCS ${LAZY_TEST_ROOT}/test_misc.cpp ${LAZY_TEST_ROOT}/test_permutation_util.cpp ${LAZY_TEST_ROOT}/test_shape.cpp - ${LAZY_TEST_ROOT}/test_tensor_impl.cpp + ${LAZY_TEST_ROOT}/test_symbolic_shape.cpp + ${LAZY_TEST_ROOT}/test_trie_cache.cpp ${LAZY_TEST_ROOT}/test_util.cpp ) +if(BUILD_LAZY_TS_BACKEND) + list(APPEND LAZY_TEST_SRCS + ${LAZY_TEST_ROOT}/test_lazy_ops.cpp + ${LAZY_TEST_ROOT}/test_lazy_ops_util.cpp + ) +endif() add_executable(test_lazy ${TORCH_ROOT}/test/cpp/common/main.cpp diff --git a/test/cpp/lazy/test_backend_device.cpp b/test/cpp/lazy/test_backend_device.cpp index b75f0512d387..f8ce49b9e287 100644 --- a/test/cpp/lazy/test_backend_device.cpp +++ b/test/cpp/lazy/test_backend_device.cpp @@ -74,9 +74,13 @@ TEST(BackendDeviceTest, FromAten) { auto device = c10::Device(c10::kCPU); EXPECT_THROW(atenDeviceToBackendDevice(device), c10::Error); - // TODO(alanwaketan): Update the following test once we have TorchScript backend upstreamed. device = c10::Device(c10::kLazy); +#ifndef FBCODE_CAFFE2 + auto backend_device = atenDeviceToBackendDevice(device); +#else + // Lazy Tensor is disabled in FBCODE until addressing non-virtual methods (e.g. 
sizes) in TensorImpl EXPECT_THROW(atenDeviceToBackendDevice(device), c10::Error); +#endif // FBCODE_CAFFE2 } TEST(BackendDeviceTest, ToAten) { diff --git a/test/cpp/lazy/test_cache.cpp b/test/cpp/lazy/test_cache.cpp index 033b6c21b1e7..ddbf6611d36a 100644 --- a/test/cpp/lazy/test_cache.cpp +++ b/test/cpp/lazy/test_cache.cpp @@ -4,6 +4,8 @@ #include #include #include +#include +#include namespace torch { namespace lazy { @@ -11,7 +13,8 @@ namespace lazy { class CacheNode : public Node { public: explicit CacheNode(const std::string& str) - : Node(OpKind(), /* num_outputs */ 1, /* hash_seed */ Hash(str)), + : Node(OpKind(), /* num_outputs */ 1), + hash_(Hash(str)), str_(str) {} ~CacheNode() override = default; @@ -23,7 +26,10 @@ class CacheNode : public Node { TORCH_INTERNAL_ASSERT(false, "Can't access operand[i] of test node"); } + hash_t hash() const override { return hash_; } + hash_t shapeHash() const override { return hash_; } private: + hash_t hash_; std::string str_; }; @@ -33,30 +39,57 @@ TEST(CacheTest, BasicTest) { std::shared_ptr c = std::make_shared("c"); Cache cache(2); - cache.Add(a->node_hash(), a); - EXPECT_EQ(cache.Get(a->node_hash()), a); - EXPECT_EQ(cache.Get(b->node_hash()), nullptr); - EXPECT_EQ(cache.Get(c->node_hash()), nullptr); + cache.Add(a->hash(), a); + EXPECT_EQ(cache.Get(a->hash()), a); + EXPECT_EQ(cache.Get(b->hash()), nullptr); + EXPECT_EQ(cache.Get(c->hash()), nullptr); - cache.Add(b->node_hash(), b); - EXPECT_EQ(cache.Get(a->node_hash()), a); - EXPECT_EQ(cache.Get(b->node_hash()), b); - EXPECT_EQ(cache.Get(c->node_hash()), nullptr); + cache.Add(b->hash(), b); + EXPECT_EQ(cache.Get(a->hash()), a); + EXPECT_EQ(cache.Get(b->hash()), b); + EXPECT_EQ(cache.Get(c->hash()), nullptr); - cache.Add(c->node_hash(), c); - EXPECT_EQ(cache.Get(a->node_hash()), nullptr); // a has been evicted - EXPECT_EQ(cache.Get(b->node_hash()), b); - EXPECT_EQ(cache.Get(c->node_hash()), c); + cache.Add(c->hash(), c); + EXPECT_EQ(cache.Get(a->hash()), nullptr); // a has been evicted + EXPECT_EQ(cache.Get(b->hash()), b); + EXPECT_EQ(cache.Get(c->hash()), c); - cache.Erase(c->node_hash()); - EXPECT_EQ(cache.Get(a->node_hash()), nullptr); - EXPECT_EQ(cache.Get(b->node_hash()), b); - EXPECT_EQ(cache.Get(c->node_hash()), nullptr); // c has been removed + cache.Erase(c->hash()); + EXPECT_EQ(cache.Get(a->hash()), nullptr); + EXPECT_EQ(cache.Get(b->hash()), b); + EXPECT_EQ(cache.Get(c->hash()), nullptr); // c has been removed cache.Clear(); - EXPECT_EQ(cache.Get(a->node_hash()), nullptr); - EXPECT_EQ(cache.Get(b->node_hash()), nullptr); - EXPECT_EQ(cache.Get(c->node_hash()), nullptr); + EXPECT_EQ(cache.Get(a->hash()), nullptr); + EXPECT_EQ(cache.Get(b->hash()), nullptr); + EXPECT_EQ(cache.Get(c->hash()), nullptr); +} + +class CacheNodeWithShape : public TsNode { + public: + explicit CacheNodeWithShape(const Shape& shape) + : TsNode(OpKind(), shape, /* num_outputs */ 1, /* seed */ 0){} +}; + +TEST(CacheTest, ShapeCacheTestForDynamicShape) { + // enable dynamic shape + FLAGS_ltc_enable_dynamic_shapes = true; + + CacheNodeWithShape nodes[] = { + CacheNodeWithShape(Shape(c10::kFloat, {2, 4})), + CacheNodeWithShape(Shape(c10::kFloat, {4, 2})) }; + + /* + * Make sure the cached shape for node (2, 4) is not used for node (4, 2) + */ + for (auto& node : nodes) { + EXPECT_EQ(node.shape(), node.computeShape([&]() { + return node.shape(); + })); + } + + // reset the flag + FLAGS_ltc_enable_dynamic_shapes = false; } } // namespace lazy diff --git a/test/cpp/lazy/test_ir.cpp b/test/cpp/lazy/test_ir.cpp index 
78b94618c7fd..1ce666164a64 100644 --- a/test/cpp/lazy/test_ir.cpp +++ b/test/cpp/lazy/test_ir.cpp @@ -1,18 +1,29 @@ #include +#include #include #include #include +#include +#include #include #include +#include +#include +#include namespace torch { namespace lazy { class TestLeafNode : public Node { public: + static OpKind ClassOpKind() { + return OpKind(); + } + explicit TestLeafNode(size_t param) - : Node(OpKind(), /* num_outputs */ 1, /* hash_seed */ Hash(param)), + : Node(ClassOpKind(), /* num_outputs */ 1), + hash_(Hash(param)), param_(param) {} ~TestLeafNode() override = default; @@ -24,7 +35,10 @@ class TestLeafNode : public Node { TORCH_INTERNAL_ASSERT(false, "Can't access operand[i] of leaf node"); } + hash_t hash() const override { return hash_; } + hash_t shapeHash() const override { return hash_; } private: + hash_t hash_; size_t param_; }; @@ -35,7 +49,7 @@ TEST(IrTest, BasicTest) { EXPECT_EQ(node1->num_outputs(), 1); - const TestLeafNode* leafptr = NodeCast(node1.get(), OpKind()); + const TestLeafNode* leafptr = NodeCast(node1.get()); EXPECT_TRUE(leafptr != nullptr); } @@ -51,22 +65,22 @@ TEST(IrTest, MetaDataTest) { node = MakeNode(1); auto metaWithEmptyDebug = node->metadata(); EXPECT_EQ(metaWithEmptyDebug.scope.size(), 0); - EXPECT_EQ(metaWithEmptyDebug.frame_info.size(), 0); + EXPECT_EQ(metaWithEmptyDebug.frame_info.size(), 1); { ScopePusher scope("TestScope"); node = MakeNode(1); auto metaWithScope = node->metadata(); EXPECT_EQ(metaWithScope.scope, "TestScope.1"); - EXPECT_EQ(metaWithScope.frame_info.size(), 0); + EXPECT_EQ(metaWithScope.frame_info.size(), 1); } SourceLocation dummySourceLocation; dummySourceLocation.file = "file"; dummySourceLocation.function = "function"; dummySourceLocation.line = 10; - RegisterGetFrameInfo( - [&]() -> std::vector { return {dummySourceLocation}; }); + GetPythonFramesFunction() = + [&]() -> std::vector { return {dummySourceLocation}; }; node = MakeNode(1); auto metaWithSourceLoc = node->metadata(); EXPECT_EQ(metaWithSourceLoc.scope.size(), 0); @@ -77,7 +91,7 @@ TEST(IrTest, MetaDataTest) { FLAGS_torch_lazy_ir_debug = restore_FLAGS_torch_lazy_ir_debug; } -TEST(IrTest, TsNode) { +TEST(IrTest, TsNodeTest) { NodePtr node1 = MakeNode( OpKind(at::aten::view), Shape(), @@ -92,9 +106,32 @@ TEST(IrTest, TsNode) { EXPECT_EQ(node1->num_outputs(), 1); - const TsNode* leafptr = NodeCast(node1.get(), OpKind(at::aten::view)); + const TsNode* leafptr = dynamic_cast(node1.get()); EXPECT_TRUE(leafptr != nullptr); } +TEST(IrTest, DimensionNodeTest) { + + const size_t DIM0 = 5; + const size_t DIM1 = 8; + NodePtr node1 = MakeNode( + OpKind(at::aten::view), + Shape(c10::kFloat, {DIM0, DIM1}), + /*num_outputs*/ 1, + /*hash_seed*/ kHashSeed); + + auto size0 = std::dynamic_pointer_cast(MakeNode(Value{node1}, 0)); + auto size1 = std::dynamic_pointer_cast(MakeNode(Value{node1}, 1)); + + ASSERT_EQ(DIM0, size0->getStaticValue()); + ASSERT_EQ(DIM1, size1->getStaticValue()); + + auto add_dim = std::dynamic_pointer_cast(MakeNode(Value{size0}, Value{size1})); + ASSERT_EQ(DIM0 + DIM1, add_dim->getStaticValue()); + + auto mul_dim = std::dynamic_pointer_cast(MakeNode(Value{size0}, Value{size1})); + ASSERT_EQ(DIM0 * DIM1, mul_dim->getStaticValue()); +} + } // namespace lazy } // namespace torch diff --git a/test/cpp/lazy/test_ir_util.cpp b/test/cpp/lazy/test_ir_util.cpp index 5c216258f9ac..ad951956db7d 100644 --- a/test/cpp/lazy/test_ir_util.cpp +++ b/test/cpp/lazy/test_ir_util.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -12,7 +13,7 @@ 
namespace lazy { class IrUtilNode : public Node { public: explicit IrUtilNode() - : Node(OpKind(), /* num_outputs */ 1, /* hash_seed */ Hash(0)) {} + : Node(OpKind(), /* num_outputs */ 1), hash_(Hash(0)) {} ~IrUtilNode() override = default; void AddOperand(Value v) { @@ -23,17 +24,10 @@ class IrUtilNode : public Node { operands_.push_back(std::move(v.node)); } - const std::vector& operands() const override { - return operands_as_outputs_; - } - - const Output& operand(size_t i) const override { - return operands_as_outputs_.at(i); - } - + hash_t hash() const override { return hash_; } + hash_t shapeHash() const override { return hash_; } private: - std::vector operands_; - std::vector operands_as_outputs_; + hash_t hash_; }; /* a diff --git a/test/cpp/lazy/test_lazy_ops.cpp b/test/cpp/lazy/test_lazy_ops.cpp new file mode 100644 index 000000000000..f12d357760e6 --- /dev/null +++ b/test/cpp/lazy/test_lazy_ops.cpp @@ -0,0 +1,10773 @@ +#include +#include +#include "c10/core/DeviceType.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace torch { +namespace lazy { + + +// Lazy Tensor is disabled in FBCODE until addressing non-virtual methods (e.g. sizes) in TensorImpl +#ifndef FBCODE_CAFFE2 + +namespace { + // This registers the torchscript backend, without which lazy device won't work +static bool inline init_backend(){ + torch::lazy::InitTorchScriptBackend(); + return true; +} +static const bool backend_initialized = init_backend(); + +} + +class LazyTsTest : public ::testing::Test { + protected: + void SetUp() override; + + void TearDown() override; + + static void CommonSetup() {} + + void ExpectCounterNotChanged( + const std::string& counter_regex, + const std::unordered_set* ignore_set) {} + + void ExpectCounterChanged(const std::string& counter_regex, + const std::unordered_set* ignore_set) { + } + + void ResetCounters() {} + + private: + void MakeEndSnapshot() {} +}; + +class LazyOpsTestBase : public LazyTsTest { + protected: + static void SetUpTestCase() {} +}; + +void LazyTsTest::SetUp() { + (void)backend_initialized; // avoid unused parameter warning + at::manual_seed(42); + torch::lazy::LazyGraphExecutor::Get()->SetRngSeed(torch::lazy::BackendDevice(), 42); +} + +void LazyTsTest::TearDown() {} + +namespace { +using torch::lazy::DebugUtil; + +class LazyOpsTest : public LazyOpsTestBase {}; + +static inline bool IsCuda() { + return torch::lazy::getBackend()->EagerFallbackDeviceType() == at::kCUDA; +} + +static inline at::DeviceType DefaultDevice() { + return torch::lazy::getBackend()->EagerFallbackDeviceType(); +} + + +} // namespace + +TEST(LazyDynamicOpsTest, NarrowCopy) { + auto x = torch::rand({5, 10, 10}).to(kLazy); + const size_t Y_DIM = 3; + const size_t X_DIM_INDEX = 2; + auto y = torch::rand({Y_DIM}).to(kLazy); + auto ly = torch::lazy::TryGetLtcTensor(y); + auto dim_node = MakeNode(ly->GetIrValue(), 0); + auto lmn = std::make_shared(dim_node); + auto z = x.narrow_copy(X_DIM_INDEX, 0, lmn->toSymInt()); + AllClose(z.cpu(), x.cpu().narrow_copy(X_DIM_INDEX, 0, Y_DIM)); +} + +TEST_F(LazyOpsTest, TestScalarTensor) { + torch::Tensor scalar_tensor = torch::scalar_tensor( + 1., torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_scalar_tensor = torch::scalar_tensor( + 1., torch::TensorOptions(torch::kFloat).device(torch::kLazy)); + AllClose(scalar_tensor, lazy_scalar_tensor); + }); +} + +TEST_F(LazyOpsTest, TestClone) { + 
ForEachDevice([&](const torch::Device& device) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = lazy_a.clone(); + AllClose(a, lazy_b); + lazy_a.add_(1.0); + AllClose(a, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestTo) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_a = CopyToDevice(a, device); + AllClose(a, lazy_a); + }); +} + +TEST_F(LazyOpsTest, TestIsFloatingPoint) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_a = CopyToDevice(a, device); + bool is_float = torch::is_floating_point(a); + bool lazy_is_float = torch::is_floating_point(lazy_a); + EXPECT_EQ(is_float, lazy_is_float); + }); +} + +TEST_F(LazyOpsTest, TestIsSigned) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_a = CopyToDevice(a, device); + bool is_signed = torch::is_signed(a); + bool lazy_is_signed = torch::is_signed(lazy_a); + EXPECT_EQ(is_signed, lazy_is_signed); + }); +} + +TEST_F(LazyOpsTest, TestCastByte) { + torch::Tensor a = + torch::rand({2, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Tensor b = torch::_cast_Byte(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::_cast_Byte(lazy_a); + AllEqual(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestCastChar) { + torch::Tensor a = + torch::rand({2, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Tensor b = torch::_cast_Char(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::_cast_Char(lazy_a); + AllEqual(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestCastShort) { + torch::Tensor a = + torch::rand({2, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Tensor b = torch::_cast_Short(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::_cast_Short(lazy_a); + AllEqual(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestCastInt) { + torch::Tensor a = + torch::rand({2, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Tensor b = torch::_cast_Int(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::_cast_Int(lazy_a); + AllEqual(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestCastLong) { + torch::Tensor a = + torch::rand({2, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Tensor b = torch::_cast_Long(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::_cast_Long(lazy_a); + AllEqual(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestCastFloat) { + torch::Tensor a = + torch::rand({2, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Tensor b = torch::_cast_Float(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor 
lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::_cast_Float(lazy_a); + AllEqual(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestRetainType) { + torch::Tensor lazy_a = torch::zeros( + {2, 2}, torch::TensorOptions(torch::kByte).device(torch::kLazy)); + torch::Tensor lazy_b = torch::ones( + {2, 2}, torch::TensorOptions(torch::kByte).device(torch::kLazy)); + torch::Tensor lazy_c = lazy_a + lazy_b; + EXPECT_EQ(lazy_c.scalar_type(), torch::ScalarType::Byte); +} + +TEST_F(LazyOpsTest, TestLogicalTypeWithInterop) { + torch::Tensor query = + torch::rand({2, 12, 20, 64}, + torch::TensorOptions(torch::kFloat).device(torch::kLazy)); + torch::Tensor key = + torch::rand({2, 12, 64, 20}, + torch::TensorOptions(torch::kFloat).device(torch::kLazy)); + torch::Tensor scores = + torch::matmul(query, key) / + torch::scalar_tensor( + 8, torch::TensorOptions(torch::kDouble).device(torch::kLazy)); + torch::Tensor p_attn = torch::softmax(scores, /*dim=*/-1); + EXPECT_EQ(p_attn.scalar_type(), torch::ScalarType::Float); +} + +TEST_F(LazyOpsTest, TestAdd) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::add(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::add(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestAddHalf) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kHalf).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {2, 2}, torch::TensorOptions(torch::kHalf).device(DefaultDevice())); + torch::Tensor c = torch::add(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::add(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestAddMixedPrecision) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {2, 2}, torch::TensorOptions(torch::kHalf).device(DefaultDevice())); + torch::Tensor c = torch::add(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::add(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestAddInPlace) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor b = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor c = a.add_(b); + torch::Tensor lazy_c = lazy_a.add_(lazy_b); + AllClose(a, lazy_a); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestAddScalar) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar b(1); + torch::Tensor c = torch::add(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_c = torch::add(lazy_a, b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, 
TestAddScalarInPlace) { + torch::Scalar b(1); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor c = a.add_(b); + torch::Tensor lazy_c = lazy_a.add_(b); + AllClose(a, lazy_a); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestAddZeroSizeDim) { + torch::Tensor a = torch::rand( + {0, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {1, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::add(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::add(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestSub) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::sub(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::sub(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestSubInPlace) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor b = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor c = a.sub_(b); + torch::Tensor lazy_c = lazy_a.sub_(lazy_b); + AllClose(a, lazy_a); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestSubScalar) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar b(1); + torch::Tensor c = torch::sub(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_c = torch::sub(lazy_a, b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestSubScalarInPlace) { + torch::Scalar b(1); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor c = a.sub_(b); + torch::Tensor lazy_c = lazy_a.sub_(b); + AllClose(a, lazy_a); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestMul) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::mul(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::mul(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestMulInPlace) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor b = torch::rand( + {2, 
2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor c = a.mul_(b); + torch::Tensor lazy_c = lazy_a.mul_(lazy_b); + AllClose(a, lazy_a); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestMulScalar) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar b(3); + torch::Tensor c = torch::mul(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_c = torch::mul(lazy_a, b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestMulScalarInPlace) { + torch::Scalar b(3); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor c = a.mul_(b); + torch::Tensor lazy_c = lazy_a.mul_(b); + AllClose(a, lazy_a); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestDiv) { + for (torch::ScalarType scalar_type1 : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor a = + isFloatingType(scalar_type1) + ? torch::rand({3, 4}, torch::TensorOptions(scalar_type1)) + : torch::randint(0, 100, {3, 4}, + torch::TensorOptions(scalar_type1)); + for (torch::ScalarType scalar_type2 : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor b = + isFloatingType(scalar_type2) + ? torch::rand({3, 4}, torch::TensorOptions(scalar_type2)) + : torch::randint(1, 100, {3, 4}, + torch::TensorOptions(scalar_type2)); + torch::Tensor c = torch::div(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::div(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); + } + } +} + +TEST_F(LazyOpsTest, TestDivWithRoundingMode) { + c10::optional rounding_modes[] = {"trunc", "floor", + c10::nullopt}; + for (const auto& rounding_mode : rounding_modes) { + for (torch::ScalarType scalar_type1 : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + int lower_bound = (scalar_type1 == torch::kByte) ? 0 : -100; + torch::Tensor a = + isFloatingType(scalar_type1) + ? torch::rand({3, 4}, torch::TensorOptions(scalar_type1)) + : torch::randint(lower_bound, 50, {3, 4}, + torch::TensorOptions(scalar_type1)); + for (torch::ScalarType scalar_type2 : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, + torch::kInt, torch::kLong}) { + torch::Tensor b = + isFloatingType(scalar_type2) + ? torch::rand({3, 4}, torch::TensorOptions(scalar_type2)) + : torch::randint(51, 100, {3, 4}, + torch::TensorOptions(scalar_type2)); + torch::Tensor c = torch::div(a, b, rounding_mode); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::div(lazy_a, lazy_b, rounding_mode); + AllClose(c, lazy_c); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestDivInPlace) { + for (torch::ScalarType scalar_type1 : {torch::kFloat}) { + torch::Tensor a = + isFloatingType(scalar_type1) + ? 
torch::rand({3, 4}, torch::TensorOptions(scalar_type1)) + : torch::randint(0, 100, {3, 4}, + torch::TensorOptions(scalar_type1)); + for (torch::ScalarType scalar_type2 : {torch::kFloat}) { + torch::Tensor b = + isFloatingType(scalar_type2) + ? torch::rand({3, 4}, torch::TensorOptions(scalar_type2)) + : torch::randint(1, 100, {3, 4}, + torch::TensorOptions(scalar_type2)); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor c = a.div_(b); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = lazy_a.div_(lazy_b); + ; + AllClose(c, lazy_c); + }); + } + } +} + +TEST_F(LazyOpsTest, TestDivInPlaceWithRoundingMode) { + c10::optional rounding_modes[] = {"trunc", "floor", + c10::nullopt}; + for (const auto& rounding_mode : rounding_modes) { + for (torch::ScalarType scalar_type1 : {torch::kFloat}) { + torch::Tensor a = + isFloatingType(scalar_type1) + ? torch::rand({3, 4}, torch::TensorOptions(scalar_type1)) + : torch::randint(-100, 100, {3, 4}, + torch::TensorOptions(scalar_type1)); + for (torch::ScalarType scalar_type2 : {torch::kFloat}) { + torch::Tensor b = + isFloatingType(scalar_type2) + ? torch::rand({3, 4}, torch::TensorOptions(scalar_type2)) + : torch::randint(1, 100, {3, 4}, + torch::TensorOptions(scalar_type2)); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor c = a.div_(b, rounding_mode); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = lazy_a.div_(lazy_b, rounding_mode); + AllClose(c, lazy_c); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestDivScalar) { + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor a = + isFloatingType(scalar_type) + ? torch::rand( + {3, 4}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 1, 100, {3, 4}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + for (bool is_float : {true, false}) { + torch::Scalar b = is_float ? torch::Scalar(3.0) : torch::Scalar(3); + torch::Tensor c = torch::div(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_c = torch::div(lazy_a, b); + AllClose(c, lazy_c); + }); + } + } +} + +TEST_F(LazyOpsTest, TestDivScalarInPlace) { + for (torch::ScalarType scalar_type : {torch::kFloat}) { + torch::Tensor a = + isFloatingType(scalar_type) + ? torch::rand( + {3, 4}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 1, 100, {3, 4}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + for (bool is_float : {true, false}) { + torch::Scalar b = is_float ? 
torch::Scalar(3.0) : torch::Scalar(3); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor c = a.div_(b); + torch::Tensor lazy_c = lazy_a.div_(b); + AllClose(c, lazy_c); + }); + } + } +} + +TEST_F(LazyOpsTest, TestDivOut) { + for (torch::ScalarType scalar_type : {torch::kFloat, torch::kDouble}) { + torch::Tensor a = torch::rand( + {3, 4}, torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {3, 4}, torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor c = torch::empty( + {3, 4}, torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::div_out(c, a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::empty({3, 4}, lazy_b.options()); + torch::div_out(lazy_c, lazy_a, lazy_b); + AllClose(c, lazy_c); + }); + } +} + +TEST_F(LazyOpsTest, TestRsubScalar) { + torch::Tensor input = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar other(1.5); + torch::Scalar alpha(2.5); + torch::Tensor result = torch::rsub(input, other, alpha); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::rsub(lazy_input, other, alpha); + AllClose(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestNe) { + torch::Tensor a = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::ne(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::ne(lazy_a, lazy_b); + AllEqual(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestNeInplace) { + torch::Tensor a = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor a_copy = a.clone(); + torch::Tensor b = a.clone(); + b[0] += 1; + a.ne_(b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a_copy, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + lazy_a.ne_(lazy_b); + AllClose(a, lazy_a); + }); +} + +TEST_F(LazyOpsTest, TestEq) { + torch::Tensor a = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = a.clone(); + torch::Tensor c = torch::eq(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::eq(lazy_a, lazy_b); + AllEqual(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestEqInplace) { + torch::Tensor a = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = a.clone(); + b[0] += 1; + torch::Tensor a_copy = a.clone(); + a.eq_(b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a_copy, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + lazy_a.eq_(lazy_b); + AllClose(lazy_a, a); + }); +} + +TEST_F(LazyOpsTest, TestGe) { + torch::Tensor a = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = a.clone(); + torch::Tensor c = torch::ge(a, b); + 
ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::ge(lazy_a, lazy_b); + AllEqual(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestGeInplace) { + torch::Tensor a = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = a.clone(); + b[0] += 1; + b[1] -= 1; + torch::Tensor a_copy = a.clone(); + a.ge_(b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a_copy, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + lazy_a.ge_(lazy_b); + AllClose(lazy_a, a); + }); +} + +TEST_F(LazyOpsTest, TestLe) { + torch::Tensor a = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = a.clone(); + torch::Tensor c = torch::le(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::le(lazy_a, lazy_b); + AllEqual(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestLeInplace) { + torch::Tensor a = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = a.clone(); + b[0] += 1; + b[1] -= 1; + torch::Tensor a_copy = a.clone(); + a.le_(b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a_copy, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + lazy_a.le_(lazy_b); + AllClose(lazy_a, a); + }); +} + +TEST_F(LazyOpsTest, TestGt) { + torch::Tensor a = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::add(a.clone(), torch::ones_like(a)); + torch::Tensor c = torch::gt(b, a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::gt(lazy_b, lazy_a); + AllEqual(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestGtInplace) { + torch::Tensor a = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = a.clone(); + b[0] += 1; + b[1] -= 1; + torch::Tensor a_copy = a.clone(); + a.gt_(b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a_copy, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + lazy_a.gt_(lazy_b); + AllClose(lazy_a, a); + }); +} + +TEST_F(LazyOpsTest, TestLt) { + torch::Tensor a = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::add(a.clone(), torch::ones_like(a)); + torch::Tensor c = torch::lt(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::lt(lazy_a, lazy_b); + AllEqual(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestLtInplace) { + torch::Tensor a = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = a.clone(); + b[0] += 1; + b[1] -= 1; + torch::Tensor a_copy = a.clone(); + a.lt_(b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a_copy, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + lazy_a.lt_(lazy_b); + AllClose(lazy_a, a); + }); +} + +TEST_F(LazyOpsTest, TestNeScalar) { + torch::Tensor input = torch::ones({2, 3}); + 
torch::Scalar other(float(0)); + torch::Tensor result = torch::ne(input, other); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::ne(lazy_input, other); + AllEqual(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestEqScalar) { + torch::Tensor input = torch::ones({2, 3}); + torch::Scalar other(float(1)); + torch::Tensor result = torch::eq(input, other); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::eq(lazy_input, other); + AllEqual(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestGeScalar) { + torch::Tensor input = torch::ones({2, 3}); + torch::Scalar other(float(1)); + torch::Tensor result = torch::ge(input, other); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::ge(lazy_input, other); + AllEqual(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestGeScalarInplace) { + torch::Tensor input = torch::arange( + -1., 1.5, 0.5, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar other(float(0)); + torch::Tensor input_copy = input.clone(); + input.ge_(other); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input_copy, device); + lazy_input.ge_(other); + AllClose(lazy_input, input); + }); +} + +TEST_F(LazyOpsTest, TestLeScalar) { + torch::Tensor input = torch::ones({2, 3}); + torch::Scalar other(float(1)); + torch::Tensor result = torch::le(input, other); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::le(lazy_input, other); + AllEqual(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestLeScalarInplace) { + torch::Tensor input = torch::arange( + -1., 1.5, 0.5, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar other(float(0)); + torch::Tensor input_copy = input.clone(); + input.le_(other); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input_copy, device); + lazy_input.le_(other); + AllClose(lazy_input, input); + }); +} + +TEST_F(LazyOpsTest, TestGtScalar) { + torch::Tensor input = torch::ones({2, 3}); + torch::Scalar other(float(0.5)); + torch::Tensor result = torch::gt(input, other); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::gt(lazy_input, other); + AllEqual(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestGtScalarInplace) { + torch::Tensor input = torch::arange( + -1., 1.5, 0.5, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar other(float(0)); + torch::Tensor input_copy = input.clone(); + input.gt_(other); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input_copy, device); + lazy_input.gt_(other); + AllClose(lazy_input, input); + }); +} + +TEST_F(LazyOpsTest, TestLtScalar) { + torch::Tensor input = torch::ones({2, 3}); + torch::Scalar other(float(1.5)); + torch::Tensor result = torch::lt(input, other); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::lt(lazy_input, other); + AllEqual(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, 
TestLtScalarInplace) { + torch::Tensor input = torch::arange( + -1., 1.5, 0.5, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar other(float(0)); + torch::Tensor input_copy = input.clone(); + input.lt_(other); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input_copy, device); + lazy_input.lt_(other); + AllClose(lazy_input, input); + }); +} + +TEST_F(LazyOpsTest, TestIntegerAdd) { + std::vector types( + {torch::kByte, torch::kChar, torch::kShort, torch::kInt, torch::kLong}); + + ForEachDevice([&](const torch::Device& device) { + for (auto type : types) { + torch::Tensor a = + torch::randint(0, 63, {2, 2}, torch::TensorOptions(type)); + torch::Tensor b = + torch::randint(0, 63, {2, 2}, torch::TensorOptions(type)); + torch::Scalar one = + isIntegralType(type) ? torch::Scalar(1) : torch::Scalar(1.0); + torch::Tensor c = torch::add(b, one); + + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::add(lazy_b, one); + + AllEqual(c, lazy_c); + } + }); +} + +TEST_F(LazyOpsTest, TestSVD) { + static const int dims[] = {4, 7}; + for (auto m : dims) { + for (auto n : dims) { + torch::Tensor a = torch::rand( + {m, n}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + auto b = torch::svd(a, /*some=*/true, /*compute_uv=*/true); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + auto lazy_b = torch::svd(lazy_a, /*some=*/true, /*compute_uv=*/true); + // The U and V matrices might have different sign for column vectors, so + // cannot be compared if not by absolute value. + AllClose(std::get<0>(b).abs(), std::get<0>(lazy_b).abs(), /*rtol=*/1e-3, + /*atol=*/1e-4); + torch::Tensor diag = std::get<1>(b); + torch::Tensor lazy_diag = std::get<1>(lazy_b); + ASSERT_EQ(diag.sizes(), lazy_diag.sizes()); + AllClose(diag, lazy_diag, /*rtol=*/1e-3, + /*atol=*/1e-4); + AllClose(std::get<2>(b).abs(), std::get<2>(lazy_b).abs(), /*rtol=*/1e-3, + /*atol=*/1e-4); + }); + } + } +} + +TEST_F(LazyOpsTest, TestQR) { + static const int dims[] = {4, 7}; + for (auto m : dims) { + for (auto n : dims) { + torch::Tensor a = torch::rand( + {m, n}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + auto b = torch::qr(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + auto lazy_b = torch::qr(lazy_a); + AllClose(std::get<0>(b).abs(), std::get<0>(lazy_b).abs(), /*rtol=*/1e-3, + /*atol=*/1e-4); + AllClose(std::get<1>(b).abs(), std::get<1>(lazy_b).abs(), /*rtol=*/1e-3, + /*atol=*/1e-4); + }); + } + } +} + +TEST_F(LazyOpsTest, TestSymEig) { + static const int dims[] = {4, 7}; + for (auto m : dims) { + for (bool eigenvectors : {true, false}) { + for (bool upper : {true, false}) { + torch::Tensor a = torch::rand( + {m, m}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor sym_a = a.mm(a.t()); + auto b = torch::symeig(sym_a, eigenvectors, upper); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(sym_a, device); + auto lazy_b = torch::symeig(lazy_a, eigenvectors, upper); + AllClose(std::get<0>(b), std::get<0>(lazy_b), /*rtol=*/3e-2, + /*atol=*/1e-2); + if (eigenvectors) { + AllClose(std::get<1>(b).abs(), std::get<1>(lazy_b).abs(), + /*rtol=*/3e-2, + /*atol=*/1e-2); + } else { + EXPECT_EQ(std::get<1>(b).sizes(), std::get<1>(lazy_b).sizes()); + } + }); + } + } + } +} + +TEST_F(LazyOpsTest, 
TestCholesky) { + static const int dims[] = {4, 7}; + for (auto m : dims) { + for (bool upper : {true, false}) { + torch::Tensor a = torch::rand( + {3, m, m}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor pd_a = + torch::matmul(a, torch::transpose(a, 1, 2)) + + torch::eye( + m, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + auto b = torch::cholesky(pd_a, upper); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(pd_a, device); + auto lazy_b = torch::cholesky(lazy_a, upper); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-4); + }); + } + } +} + +TEST_F(LazyOpsTest, TestLogDet) { + static const int dims[] = {4, 7}; + for (auto m : dims) { + torch::Tensor a = torch::rand( + {3, m, m}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor pd_a = + torch::matmul(a, torch::transpose(a, 1, 2)) + + torch::eye(m, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::logdet(pd_a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(pd_a, device); + torch::Tensor lazy_b = torch::logdet(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-4); + }); + } +} + +TEST_F(LazyOpsTest, TestTriangularSolve) { + static const int dims[] = {4, 7}; + for (bool batched_a : {true, false}) { + for (bool batched_b : {true, false}) { + for (auto m : dims) { + for (auto n : dims) { + for (bool upper : {true, false}) { + for (bool transpose : {true, false}) { + for (bool unitriangular : {true, false}) { + torch::Tensor a = + torch::randn({m, m}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice())); + torch::Tensor b = + torch::randn({m, n}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice())); + a = batched_a ? a.expand({3, m, m}).clone() : a; + b = batched_b ? 
b.expand({3, m, n}).clone() : b; + auto result = torch::triangular_solve( + b, a, /*upper=*/upper, /*transpose=*/transpose, + /*unitriangular=*/unitriangular); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + auto lazy_result = torch::triangular_solve( + lazy_b, lazy_a, /*upper=*/upper, /*transpose=*/transpose, + /*unitriangular=*/unitriangular); + AllClose(std::get<0>(result), std::get<0>(lazy_result), + /*rtol=*/1e-3, /*atol=*/1e-4); + AllClose(std::get<1>(result), std::get<1>(lazy_result), + /*rtol=*/1e-3, /*atol=*/1e-4); + }); + } + } + } + } + } + } + } +} + +TEST_F(LazyOpsTest, TestKthValue) { + torch::Tensor a = torch::rand( + {4, 5, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int k = 1; k <= 3; ++k) { + int rank = a.dim(); + for (int dim = -rank; dim < rank; ++dim) { + for (bool keepdim : {false, true}) { + auto b = torch::kthvalue(a, k, dim, keepdim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + auto lazy_b = torch::kthvalue(lazy_a, k, dim, keepdim); + AllClose(std::get<0>(b), std::get<0>(lazy_b)); + AllEqual(std::get<1>(b), std::get<1>(lazy_b)); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestTopK) { + torch::Tensor a = torch::rand( + {4, 5, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int k = 1; k <= 3; ++k) { + int rank = a.dim(); + for (int dim = -rank; dim < rank; ++dim) { + for (bool largest : {false, true}) { + auto b = torch::topk(a, k, dim, largest, /*sorted=*/true); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + auto lazy_b = torch::topk(lazy_a, k, dim, largest, /*sorted=*/true); + AllClose(std::get<0>(b), std::get<0>(lazy_b)); + AllEqual(std::get<1>(b), std::get<1>(lazy_b)); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestSort) { + torch::Tensor a = torch::rand( + {4, 5, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int k = 1; k <= 3; ++k) { + for (int dim = 0; dim < 3; ++dim) { + for (bool descending : {false, true}) { + auto b = torch::sort(a, dim, descending); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + auto lazy_b = torch::sort(lazy_a, dim, descending); + AllClose(std::get<0>(b), std::get<0>(lazy_b)); + AllEqual(std::get<1>(b), std::get<1>(lazy_b)); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestSortDescWithMinValue) { + std::vector values{-128, 100}; + torch::Tensor input = + torch::tensor(values, torch::TensorOptions(torch::kChar)); + auto output = torch::sort(input, /*dim=*/0, /*descending=*/true); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + auto lazy_output = torch::sort(lazy_input, /*dim=*/0, /*descending=*/true); + AllEqual(std::get<0>(output), std::get<0>(lazy_output)); + AllEqual(std::get<1>(output), std::get<1>(lazy_output)); + }); +} + +TEST_F(LazyOpsTest, TestArgSort) { + torch::Tensor a = torch::rand( + {4, 5, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int k = 1; k <= 3; ++k) { + for (int dim = 0; dim < 3; ++dim) { + for (bool descending : {false, true}) { + torch::Tensor b = torch::argsort(a, dim, descending); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::argsort(lazy_a, dim, descending); + AllEqual(b, lazy_b); + }); 
+ } + } + } +} + +TEST_F(LazyOpsTest, TestMin) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::min(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::min(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestMax) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::max(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::max(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestUnaryMin) { + torch::Tensor input = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::min(input); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::min(lazy_input); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestUnaryMax) { + torch::Tensor input = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::max(input); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::max(lazy_input); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestAll) { + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor a = + isFloatingType(scalar_type) + ? 
torch::rand( + {3, 4}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {3, 4}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor b = torch::all(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::all(lazy_a); + EqualValues(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestAllDim) { + torch::Tensor a = torch::randint( + 0, 5, {2, 3, 4}, + torch::TensorOptions(torch::kByte).device(DefaultDevice())); + int rank = a.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor b = torch::all(a, dim, /*keepdim=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::all(lazy_a, dim, /*keepdim=*/false); + EqualValues(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestAllDimKeep) { + torch::Tensor a = torch::randint( + 0, 5, {2, 3, 4}, + torch::TensorOptions(torch::kByte).device(DefaultDevice())); + int rank = a.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor b = torch::all(a, dim, /*keepdim=*/true); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::all(lazy_a, dim, /*keepdim=*/true); + EqualValues(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestAmax) { + torch::Tensor input = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = input.dim(); + for (bool keepdim : {false, true}) { + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor values = torch::amax(input, {dim}, /*keepdim=*/keepdim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_values = + torch::amax(lazy_input, {dim}, /*keepdim=*/keepdim); + AllClose(values, lazy_values); + }); + } + for (int dim1 = -rank; dim1 < rank; ++dim1) { + for (int dim2 = -rank; dim2 < rank; ++dim2) { + if ((dim1 == dim2) || (dim1 == rank + dim2) || (dim2 == rank + dim1)) + continue; + torch::Tensor values = + torch::amax(input, {dim1, dim2}, /*keepdim=*/keepdim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_values = + torch::amax(lazy_input, {dim1, dim2}, /*keepdim=*/keepdim); + AllClose(values, lazy_values); + }); + } + } + } + ExpectCounterNotChanged("aten::.*", GetIgnoredCounters()); + ExpectCounterChanged("xla::amax", GetIgnoredCounters()); +} + +TEST_F(LazyOpsTest, TestAmin) { + torch::Tensor input = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = input.dim(); + for (bool keepdim : {false, true}) { + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor values = torch::amin(input, {dim}, /*keepdim=*/keepdim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_values = + torch::amin(lazy_input, {dim}, /*keepdim=*/keepdim); + AllClose(values, lazy_values); + }); + } + for (int dim1 = -rank; dim1 < rank; ++dim1) { + for (int dim2 = -rank; dim2 < rank; ++dim2) { + if ((dim1 == dim2) || (dim1 == rank + dim2) || (dim2 == rank + dim1)) + continue; + torch::Tensor values = + torch::amin(input, {dim1, dim2}, /*keepdim=*/keepdim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_values = 
+ torch::amin(lazy_input, {dim1, dim2}, /*keepdim=*/keepdim); + AllClose(values, lazy_values); + }); + } + } + } + ExpectCounterNotChanged("aten::.*", GetIgnoredCounters()); + ExpectCounterChanged("xla::amin", GetIgnoredCounters()); +} + +TEST_F(LazyOpsTest, TestAny) { + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor a = + isFloatingType(scalar_type) + ? torch::rand( + {3, 4}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {3, 4}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor b = torch::any(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::any(lazy_a); + EqualValues(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestAnyDim) { + torch::Tensor a = torch::randint( + 0, 5, {2, 3, 4}, + torch::TensorOptions(torch::kByte).device(DefaultDevice())); + int rank = a.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor b = torch::any(a, dim, /*keepdim=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::any(lazy_a, dim, /*keepdim=*/false); + EqualValues(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestAnyDimKeep) { + torch::Tensor a = torch::randint( + 0, 5, {2, 3, 4}, + torch::TensorOptions(torch::kByte).device(DefaultDevice())); + int rank = a.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor b = torch::any(a, dim, /*keepdim=*/true); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::any(lazy_a, dim, /*keepdim=*/true); + EqualValues(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestMean) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::mean(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::mean(lazy_a); + ASSERT_EQ(b.sizes(), lazy_b.sizes()); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestMeanCast) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::mean(a, torch::kDouble); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::mean(lazy_a, torch::kDouble); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestMeanInDim) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = a.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor b = torch::mean(a, {dim}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::mean(lazy_a, {dim}); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestMeanInDims) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (auto dims : std::vector>{{0, 1}, {-3, -2}}) { + torch::Tensor b = torch::mean(a, dims); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::mean(lazy_a, dims); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, 
TestMeanInDimsKeepCast) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (auto dims : std::vector>{{0, 1}, {-3, -2}}) { + torch::Tensor b = torch::mean(a, dims, true, torch::kDouble); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::mean(lazy_a, dims, true, torch::kDouble); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestMeanInDimOut) { + torch::Tensor a = torch::rand( + {4, 4, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = a.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor b = torch::empty( + {4, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::mean_out(b, a, {dim}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::empty({4, 4}, lazy_a.options()); + torch::mean_out(lazy_b, lazy_a, {dim}); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestStd) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (auto unbiased : {true, false}) { + torch::Tensor b = torch::std(a, unbiased); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::std(lazy_a, unbiased); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestStdInDim) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = a.dim(); + for (auto unbiased : {true, false}) { + for (auto keepdim : {true, false}) { + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor b = torch::std(a, {dim}, unbiased, keepdim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::std(lazy_a, {dim}, unbiased, keepdim); + AllClose(b, lazy_b); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestStdWithCorrection) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + // int rank = a.dim(); + c10::optional corrections[] = {1, 2, c10::nullopt}; + for (const auto& correction : corrections) { + for (auto keepdim : {true, false}) { + for (const auto& dim : + std::vector>{{0, 1}, {-3, -2}}) { + torch::Tensor b = torch::std(a, dim, correction, keepdim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::std(lazy_a, dim, correction, keepdim); + AllClose(b, lazy_b); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestStdMeanWithCorrection) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + // int rank = a.dim(); + c10::optional corrections[] = {1, 2, c10::nullopt}; + for (const auto& correction : corrections) { + for (auto keepdim : {true, false}) { + for (const auto& dim : + std::vector>{{0, 1}, {-3, -2}}) { + auto b = torch::std_mean(a, dim, correction, keepdim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + auto lazy_b = torch::std_mean(lazy_a, dim, correction, keepdim); + AllClose(std::get<0>(b), std::get<0>(lazy_b)); + AllClose(std::get<1>(b), std::get<1>(lazy_b)); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestSum) { + torch::Tensor a = torch::rand( + {4, 3, 4}, 
torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::sum(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::sum(lazy_a); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestSumCast) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::sum(a, torch::kDouble); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::sum(lazy_a, torch::kDouble); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestSumU8) { + torch::Tensor a = torch::ones( + {256}, torch::TensorOptions(torch::kByte).device(DefaultDevice())); + torch::Tensor b = torch::sum(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::sum(lazy_a); + AllEqual(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestSumInDim) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = a.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor b = torch::sum(a, {dim}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::sum(lazy_a, {dim}); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestSumInDims) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (auto dims : std::vector>{{0, 1}, {-3, -2}}) { + torch::Tensor b = torch::sum(a, dims); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::sum(lazy_a, dims); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestSumInDimsKeep) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (auto dims : std::vector>{{0, 1}, {-3, -2}}) { + torch::Tensor b = torch::sum(a, dims, /*keepdim=*/true); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::sum(lazy_a, dims, /*keepdim=*/true); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestSumInDimsKeepCast) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (auto dims : std::vector>{{0, 1}, {-3, -2}}) { + torch::Tensor b = torch::sum(a, dims, /*keepdim=*/true, torch::kDouble); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = + torch::sum(lazy_a, dims, /*keepdim=*/true, torch::kDouble); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestVar) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (bool unbiased : {true, false}) { + torch::Tensor b = torch::var(a, unbiased); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::var(lazy_a, unbiased); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestVarWithDim) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (auto dims : std::vector>{{0, 1}, {-3, -2}}) { + for (bool keepDim : {true, false}) { + for (bool unbiased : 
{true, false}) { + torch::Tensor b = torch::var(a, dims, unbiased, keepDim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::var(lazy_a, dims, unbiased, keepDim); + AllClose(b, lazy_b); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestVarWithCorrection) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + c10::optional corrections[] = {1, 2, c10::nullopt}; + for (const auto& dim : std::vector>{{0, 1}, {-3, -2}}) { + for (bool keepDim : {true, false}) { + for (const auto& correction : corrections) { + torch::Tensor b = torch::var(a, dim, correction, keepDim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::var(lazy_a, dim, correction, keepDim); + AllClose(b, lazy_b); + }); + } + } + } + ExpectCounterNotChanged("aten::.*", GetIgnoredCounters()); + ExpectCounterChanged("lazy::var", GetIgnoredCounters()); +} + +TEST_F(LazyOpsTest, TestVarMeanWithCorrection) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + c10::optional corrections[] = {1, 2, c10::nullopt}; + for (const auto& dim : std::vector>{{0, 1}, {-3, -2}}) { + for (const auto& correction : corrections) { + for (auto keepdim : {true, false}) { + auto b = torch::var_mean(a, dim, correction, keepdim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + auto lazy_b = torch::var_mean(lazy_a, dim, correction, keepdim); + AllClose(std::get<0>(b), std::get<0>(lazy_b)); + AllClose(std::get<1>(b), std::get<1>(lazy_b)); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestMaxInDim) { + torch::Tensor input = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + for (bool keepdim : {false, true}) { + auto values_indices = torch::max(input, dim, /*keepdim=*/keepdim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + auto lazy_values_indices = + torch::max(lazy_input, dim, /*keepdim=*/keepdim); + AllClose(std::get<0>(values_indices), std::get<0>(lazy_values_indices)); + AllEqual(std::get<1>(values_indices), std::get<1>(lazy_values_indices)); + }); + } + } +} + +TEST_F(LazyOpsTest, TestMinInDim) { + torch::Tensor input = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + for (bool keepdim : {false, true}) { + auto values_indices = torch::min(input, dim, /*keepdim=*/keepdim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + auto lazy_values_indices = + torch::min(lazy_input, dim, /*keepdim=*/keepdim); + AllClose(std::get<0>(values_indices), std::get<0>(lazy_values_indices)); + AllEqual(std::get<1>(values_indices), std::get<1>(lazy_values_indices)); + }); + } + } +} + +TEST_F(LazyOpsTest, TestNorm) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::norm(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::norm(lazy_a); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestNormInDim) { + torch::Tensor a = torch::rand( + 
{4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int dim : {1, -2}) { + torch::Tensor b = torch::norm(a, 2, {dim}, /*keepdim=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::norm(lazy_a, 2, {dim}, /*keepdim=*/false); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestNormInDims) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (auto dims : std::vector>{{1, 2}, {-2, -1}}) { + torch::Tensor b = torch::norm(a, 2, dims, /*keepdim=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::norm(lazy_a, 2, dims, /*keepdim=*/false); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestNormInDimsKeep) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (auto dims : std::vector>{{1, 2}, {-2, -1}}) { + torch::Tensor b = torch::norm(a, 2, dims, /*keepdim=*/true); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::norm(lazy_a, 2, dims, /*keepdim=*/true); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestNormalTwoTensor) { + at::Tensor mean = at::zeros({10, 10, 10}, at::dtype(at::kFloat)); + at::Tensor std = at::ones({10, 10, 10}, at::dtype(at::kFloat)); + ForEachDevice([&](const torch::Device& device) { + at::Tensor lazy_mean = CopyToDevice(mean, device); + at::Tensor lazy_std = CopyToDevice(std, device); + at::Tensor lazy_normal = at::normal(lazy_mean, lazy_std); + double res_mean = lazy_normal.mean().item().toDouble(); + double res_std = lazy_normal.std().item().toDouble(); + EXPECT_GT(res_mean, -0.06); + EXPECT_LT(res_mean, 0.06); + EXPECT_GT(res_std, 0.94); + EXPECT_LT(res_std, 1.06); + }); +} + +TEST_F(LazyOpsTest, TestNormalDoubleMean) { + at::Tensor std = at::ones({10, 10, 10}, at::dtype(at::kFloat)); + ForEachDevice([&](const torch::Device& device) { + at::Tensor lazy_std = CopyToDevice(std, device); + at::Tensor lazy_normal = at::normal(0, lazy_std); + double res_mean = lazy_normal.mean().item().toDouble(); + double res_std = lazy_normal.std().item().toDouble(); + EXPECT_GT(res_mean, -0.06); + EXPECT_LT(res_mean, 0.06); + EXPECT_GT(res_std, 0.94); + EXPECT_LT(res_std, 1.06); + }); +} + +TEST_F(LazyOpsTest, TestNormalDoubleStd) { + at::Tensor mean = at::zeros({10, 10, 10}, at::dtype(at::kFloat)); + ForEachDevice([&](const torch::Device& device) { + at::Tensor lazy_mean = CopyToDevice(mean, device); + at::Tensor lazy_normal = at::normal(lazy_mean, 1); + double res_mean = lazy_normal.mean().item().toDouble(); + double res_std = lazy_normal.std().item().toDouble(); + EXPECT_GT(res_mean, -0.06); + EXPECT_LT(res_mean, 0.06); + EXPECT_GT(res_std, 0.94); + EXPECT_LT(res_std, 1.06); + }); +} + +TEST_F(LazyOpsTest, TestNormalInPlace) { + at::Tensor a = at::zeros({10, 10, 10}, at::dtype(at::kFloat)); + ForEachDevice([&](const torch::Device& device) { + at::Tensor lazy_a = CopyToDevice(a, device); + lazy_a.normal_(/*mean=*/0, /*std=*/1); + double res_mean = lazy_a.mean().item().toDouble(); + double res_std = lazy_a.std().item().toDouble(); + EXPECT_GT(res_mean, -0.06); + EXPECT_LT(res_mean, 0.06); + EXPECT_GT(res_std, 0.94); + EXPECT_LT(res_std, 1.06); + }); +} + +TEST_F(LazyOpsTest, TestUniformInPlace) { + const double eps = 1e-3; + at::Tensor a = 
at::zeros({10, 10, 10}, at::dtype(at::kFloat)); + ForEachDevice([&](const torch::Device& device) { + at::Tensor lazy_a = CopyToDevice(a, device); + lazy_a.uniform_(/*from=*/0, /*to=*/1); + at::Tensor cpu_a = ToCpuTensor(lazy_a); + double res_min = cpu_a.min().item().toDouble(); + double res_max = cpu_a.max().item().toDouble(); + EXPECT_GT(res_min, 0.0 - eps); + EXPECT_LT(res_max, 1.0 + eps); + }); +} + +TEST_F(LazyOpsTest, TestRandomInPlace) { + for (auto dtype : {torch::kFloat, torch::kDouble, torch::kByte, torch::kChar, + torch::kShort, torch::kInt, torch::kLong}) { + const double eps = 0.2; + torch::Tensor a = torch::zeros({10, 10, 10}, torch::TensorOptions(dtype)); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + lazy_a.random_(/*from=*/0, /*to=*/10); + double res_mean = lazy_a.sum().item().toDouble() / a.numel(); + double res_min = lazy_a.min().item().toDouble(); + double res_max = lazy_a.max().item().toDouble(); + EXPECT_GT(res_mean, 4.5 - eps); + EXPECT_LT(res_mean, 4.5 + eps); + EXPECT_EQ(res_min, 0.0); + EXPECT_EQ(res_max, 9.0); + }); + } +} + +TEST_F(LazyOpsTest, TestRandomInPlaceDefaultFrom) { + for (auto dtype : {torch::kFloat, torch::kDouble, torch::kByte, torch::kChar, + torch::kShort, torch::kInt, torch::kLong}) { + const double eps = 0.2; + torch::Tensor a = torch::zeros({10, 10, 10}, torch::TensorOptions(dtype)); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + lazy_a.random_(/*to=*/10); + double res_mean = lazy_a.sum().item().toDouble() / a.numel(); + double res_min = lazy_a.min().item().toDouble(); + double res_max = lazy_a.max().item().toDouble(); + EXPECT_GT(res_mean, 4.5 - eps); + EXPECT_LT(res_mean, 4.5 + eps); + EXPECT_EQ(res_min, 0.0); + EXPECT_EQ(res_max, 9.0); + }); + } +} + +TEST_F(LazyOpsTest, TestRandomInPlaceDefault) { + for (auto dtype : {torch::kFloat, torch::kDouble, torch::kByte, torch::kChar, + torch::kShort, torch::kInt, torch::kLong}) { + auto input = torch::zeros({10}, torch::TensorOptions(dtype)); + ForEachDevice([&](const torch::Device& device) { + auto lazyInput = CopyToDevice(input, device); + lazyInput.random_(); + auto output = ToCpuTensor(lazyInput); + EXPECT_TRUE(torch::all(output.ne(input)).item()); + }); + } +} + +TEST_F(LazyOpsTest, TestNormGeneral) { + torch::Tensor a = torch::randn( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::norm(a, 3.5); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::norm(lazy_a, 3.5); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestNormNuclear) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::norm(a, 1); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::norm(lazy_a, 1); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestFrobeniusNorm) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::frobenius_norm(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::frobenius_norm(lazy_a); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestFrobeniusNormInDim) { + torch::Tensor a = torch::rand( + {4, 3, 4}, 
torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int dim : {1, -2}) { + torch::Tensor b = torch::frobenius_norm(a, {dim}, /*keepdim=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = + torch::frobenius_norm(lazy_a, {dim}, /*keepdim=*/false); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestFrobeniusNormInDims) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (auto dims : std::vector>{{1, 2}, {-2, -1}}) { + torch::Tensor b = torch::frobenius_norm(a, dims, /*keepdim=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = + torch::frobenius_norm(lazy_a, dims, /*keepdim=*/false); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestGroupNorm) { + int num_channels = 6; + torch::Tensor input = + torch::rand({20, num_channels, 10, 10}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor weight = + torch::rand({num_channels}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor bias = + torch::rand({num_channels}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + double eps = 1e-05; + for (int num_groups : {3, 6, 1}) { + torch::Tensor output = + torch::group_norm(input, num_groups, weight, bias, eps, + /*cudnn_enabled=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_weight = CopyToDevice(weight, device); + torch::Tensor lazy_bias = CopyToDevice(bias, device); + torch::Tensor lazy_output = + torch::group_norm(lazy_input, num_groups, lazy_weight, lazy_bias, eps, + /*cudnn_enabled=*/false); + AllClose(output, lazy_output, /*rtol=*/1e-3, /*atol=*/1e-5); + }); + } +} + +TEST_F(LazyOpsTest, TestGroupNormBackward) { + int num_channels = 6; + torch::Tensor input = + torch::rand({2, num_channels, 5, 5}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor weight = + torch::rand({num_channels}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor bias = + torch::rand({num_channels}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + double eps = 1e-05; + for (bool undef_weight : {true, false}) { + for (int num_groups : {3, 6, 1}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::group_norm( + /*input=*/inputs[0], num_groups, inputs[1], inputs[2], + /*eps=*/eps, + /*cudnn_enabled=*/false); + }; + torch::Tensor undef; + ForEachDevice([&](const torch::Device& device) { + TestBackward( + {input, undef_weight ? undef : weight, undef_weight ? 
undef : bias}, + device, testfn, + /*rtol=*/1e-3, /*atol=*/1e-3, + /*derivative_level=*/2); + }); + } + } +} + +TEST_F(LazyOpsTest, TestInstanceNorm) { + int batch = 5; + int num_channels = 20; + torch::Tensor input = + torch::rand({batch, num_channels, 10, 10}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor weight = + torch::rand({num_channels}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor bias = + torch::rand({num_channels}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor running_mean = + torch::zeros({num_channels}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor running_var = + torch::ones({num_channels}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + double momentum = 0.1; + double eps = 1e-05; + torch::Tensor output = torch::instance_norm( + input, weight, bias, running_mean, running_var, + /*use_input_stats=*/true, momentum, eps, /*cudnn_enabled=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_weight = CopyToDevice(weight, device); + torch::Tensor lazy_bias = CopyToDevice(bias, device); + torch::Tensor lazy_running_mean = CopyToDevice(running_mean, device); + torch::Tensor lazy_running_var = CopyToDevice(running_var, device); + torch::Tensor lazy_output = torch::instance_norm( + lazy_input, lazy_weight, lazy_bias, lazy_running_mean, lazy_running_var, + /*use_input_stats=*/true, momentum, eps, /*cudnn_enabled=*/false); + AllClose(output, lazy_output, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestLayerNorm) { + torch::Tensor input = + torch::rand({20, 10, 10, 10}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + double eps = 1e-05; + torch::Tensor undef; + for (bool undef_weight : {true, false}) { + for (int64_t normalized_size : {2, 3}) { + std::vector normalized_shape(normalized_size, 10); + torch::Tensor weight = torch::rand( + normalized_shape, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor bias = torch::rand( + normalized_shape, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::layer_norm(input, normalized_shape, + undef_weight ? undef : weight, + undef_weight ? undef : bias, eps, + /*cudnn_enabled=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_weight = + undef_weight ? undef : CopyToDevice(weight, device); + torch::Tensor lazy_bias = + undef_weight ? 
undef : CopyToDevice(bias, device); + torch::Tensor lazy_output = torch::layer_norm( + lazy_input, normalized_shape, lazy_weight, lazy_bias, eps, + /*cudnn_enabled=*/false); + AllClose(output, lazy_output, /*rtol=*/1e-3, /*atol=*/1e-5); + }); + } + } +} + +TEST_F(LazyOpsTest, TestLayerNormBackward) { + torch::Tensor input = + torch::rand({2, 3, 3, 3}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + double eps = 1e-05; + for (bool undef_weight : {true, false}) { + for (int64_t normalized_size : {2, 3}) { + std::vector normalized_shape(normalized_size, 3); + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::layer_norm( + /*input=*/inputs[0], normalized_shape, inputs[1], inputs[2], + /*eps=*/eps, + /*cudnn_enabled=*/false); + }; + torch::Tensor weight = + torch::rand(normalized_shape, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor bias = + torch::rand(normalized_shape, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor undef; + ForEachDevice([&](const torch::Device& device) { + TestBackward( + {input, undef_weight ? undef : weight, undef_weight ? undef : bias}, + device, testfn, + /*rtol=*/1e-3, /*atol=*/1e-4, /*derivative_level=*/2); + }); + } + } +} + +TEST_F(LazyOpsTest, TestNuclearNorm) { + torch::Tensor a = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::nuclear_norm(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::nuclear_norm(lazy_a); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestPairwiseDistance) { + torch::Tensor x1 = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor x2 = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + double eps = 1e-6; + for (bool keepdim : {false, true}) { + for (double p : {1, 2, 3, 4}) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor output = + torch::pairwise_distance(x1, x2, p, eps, keepdim); + torch::Tensor lazy_x1 = CopyToDevice(x1, device); + torch::Tensor lazy_x2 = CopyToDevice(x2, device); + torch::Tensor lazy_output = + torch::pairwise_distance(lazy_x1, lazy_x2, p, eps, keepdim); + AllClose(output, lazy_output, /*rtol=*/1e-5, /*atol=*/1e-5); + }); + } + } +} + +TEST_F(LazyOpsTest, TestCosineSimilarity) { + torch::Tensor x1 = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor x2 = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + double eps = 1e-8; + int rank = x1.dim(); + for (int dim = -rank; dim < rank; ++dim) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor output = torch::cosine_similarity(x1, x2, dim, eps); + torch::Tensor lazy_x1 = CopyToDevice(x1, device); + torch::Tensor lazy_x2 = CopyToDevice(x2, device); + torch::Tensor lazy_output = + torch::cosine_similarity(lazy_x1, lazy_x2, dim, eps); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestCosineEmbeddingLoss) { + torch::Tensor input1 = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor input2 = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor target = torch::rand( + {4}, 
torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (torch::Reduction::Reduction reduction : + {torch::Reduction::Mean, torch::Reduction::Sum}) { + for (double margin : {0., 0.2}) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor output = torch::cosine_embedding_loss( + input1, input2, target, margin, reduction); + torch::Tensor lazy_input1 = CopyToDevice(input1, device); + torch::Tensor lazy_input2 = CopyToDevice(input2, device); + torch::Tensor lazy_target = CopyToDevice(target, device); + torch::Tensor lazy_output = torch::cosine_embedding_loss( + lazy_input1, lazy_input2, lazy_target, margin, reduction); + AllClose(output, lazy_output); + }); + } + } +} + +TEST_F(LazyOpsTest, TestHingeEmbeddingLoss) { + torch::Tensor input = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor target = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (torch::Reduction::Reduction reduction : + {torch::Reduction::Mean, torch::Reduction::Sum}) { + for (double margin : {0., 0.2}) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor output = + torch::hinge_embedding_loss(input, target, margin, reduction); + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_target = CopyToDevice(target, device); + torch::Tensor lazy_output = torch::hinge_embedding_loss( + lazy_input, lazy_target, margin, reduction); + AllClose(output, lazy_output); + }); + } + } +} + +TEST_F(LazyOpsTest, TestTripletMarginLoss) { + torch::Tensor anchor = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor positive = torch::abs(torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()))); + torch::Tensor negative = torch::neg(torch::abs(torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())))); + double eps = 1e-6; + for (double margin : {0., 0.2}) { + for (double p : {1, 2, 3, 4}) { + for (bool swap : {false, true}) { + for (torch::Reduction::Reduction reduction : + {torch::Reduction::Mean, torch::Reduction::Sum}) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor output = torch::triplet_margin_loss( + anchor, positive, negative, margin, p, eps, swap, reduction); + torch::Tensor lazy_anchor = CopyToDevice(anchor, device); + torch::Tensor lazy_positive = CopyToDevice(positive, device); + torch::Tensor lazy_negative = CopyToDevice(negative, device); + torch::Tensor lazy_output = torch::triplet_margin_loss( + lazy_anchor, lazy_positive, lazy_negative, margin, p, eps, swap, + reduction); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestBinaryCrossEntropy) { + int batch = 10; + int classes = 5; + torch::Tensor input = + torch::rand({batch, classes}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor target = + torch::rand({batch, classes}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor weight = + torch::rand({batch, classes}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor undef; + for (torch::Reduction::Reduction reduction : + {torch::Reduction::Mean, torch::Reduction::Sum, + torch::Reduction::None}) { + for (bool undef_weight : {false, true}) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor output = torch::binary_cross_entropy( + input, target, undef_weight ? 
undef : weight, reduction); + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_target = CopyToDevice(target, device); + torch::Tensor lazy_weight = + undef_weight ? undef : CopyToDevice(weight, device); + torch::Tensor lazy_output = torch::binary_cross_entropy( + lazy_input, lazy_target, lazy_weight, reduction); + AllClose(output, lazy_output, /*rtol=*/1e-4, /*atol=*/1e-5); + }); + } + } +} + +TEST_F(LazyOpsTest, TestMarginRankingLoss) { + torch::Tensor input1 = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor input2 = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor target = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (torch::Reduction::Reduction reduction : + {torch::Reduction::Mean, torch::Reduction::Sum}) { + for (double margin : {0., 0.2}) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor output = torch::margin_ranking_loss( + input1, input2, target, margin, reduction); + torch::Tensor lazy_input1 = CopyToDevice(input1, device); + torch::Tensor lazy_input2 = CopyToDevice(input2, device); + torch::Tensor lazy_target = CopyToDevice(target, device); + torch::Tensor lazy_output = torch::margin_ranking_loss( + lazy_input1, lazy_input2, lazy_target, margin, reduction); + AllClose(output, lazy_output); + }); + } + } +} + +TEST_F(LazyOpsTest, TestBCEWithLogits) { + int batch = 10; + int classes = 5; + torch::Tensor input = + torch::rand({batch, classes}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor target = + torch::rand({batch, classes}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor weight = torch::rand( + {classes}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor pos_weight = torch::rand( + {classes}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor undef; + for (torch::Reduction::Reduction reduction : + {torch::Reduction::Mean, torch::Reduction::Sum}) { + for (bool undef_weight : {false, true}) { + for (bool undef_pos_weight : {false, true}) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor output = torch::binary_cross_entropy_with_logits( + input, target, undef_weight ? undef : weight, + undef_pos_weight ? undef : pos_weight, reduction); + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_target = CopyToDevice(target, device); + torch::Tensor lazy_weight = + undef_weight ? undef : CopyToDevice(weight, device); + torch::Tensor lazy_pos_weight = + undef_pos_weight ? 
undef : CopyToDevice(pos_weight, device); + torch::Tensor lazy_output = torch::binary_cross_entropy_with_logits( + lazy_input, lazy_target, lazy_weight, lazy_pos_weight, reduction); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestKlDiv) { + torch::Tensor input = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor target = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (bool log_target : {true, false}) { + for (torch::Reduction::Reduction reduction : + {torch::Reduction::Mean, torch::Reduction::Sum}) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor output = + torch::kl_div(input, target, reduction, log_target); + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_target = CopyToDevice(target, device); + torch::Tensor lazy_output = + torch::kl_div(lazy_input, lazy_target, reduction, log_target); + AllClose(output, lazy_output); + }); + } + } +} + +TEST_F(LazyOpsTest, TestProd) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::prod(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::prod(lazy_a); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestProdCast) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::prod(a, torch::kDouble); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::prod(lazy_a, torch::kDouble); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestProdInDim) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = a.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor b = torch::prod(a, dim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::prod(lazy_a, dim); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestProdInDimKeepCast) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = a.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor b = torch::prod(a, dim, /*keepdim=*/true, torch::kDouble); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = + torch::prod(lazy_a, dim, /*keepdim=*/true, torch::kDouble); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestProdInDimKeep) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = a.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor b = torch::prod(a, dim, /*keepdim=*/true); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::prod(lazy_a, dim, /*keepdim=*/true); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestCumSum) { + torch::Tensor input = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor result = torch::cumsum(input, dim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = 
CopyToDevice(input, device); + torch::Tensor lazy_result = torch::cumsum(lazy_input, dim); + AllClose(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestCumSumCast) { + torch::Tensor input = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor result = torch::cumsum(input, dim, torch::kDouble); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::cumsum(lazy_input, dim, torch::kDouble); + AllClose(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestCumSumLong) { + torch::Tensor input = torch::randint( + 1000, {4, 3, 4}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor result = torch::cumsum(input, dim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::cumsum(lazy_input, dim); + AllEqual(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestCumSumCastLong) { + torch::Tensor input = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor result = torch::cumsum(input, dim, torch::kLong); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::cumsum(lazy_input, dim, torch::kLong); + AllEqual(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestCumProd) { + torch::Tensor input = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor result = torch::cumprod(input, dim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::cumprod(lazy_input, dim); + AllClose(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestCumProdCast) { + torch::Tensor input = torch::mul( + torch::rand({4, 3, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())), + 10); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor result = torch::cumprod(input, dim, torch::kDouble); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::cumprod(lazy_input, dim, torch::kDouble); + AllClose(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestCumProdLong) { + torch::Tensor input = torch::randint( + 7, {2, 3}, torch::TensorOptions(torch::kLong).device(DefaultDevice())); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor result = torch::cumsum(input, dim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::cumsum(lazy_input, dim); + AllEqual(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestCumProdCastLong) { + torch::Tensor input = + torch::rand({2, 3}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 7; + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor result = torch::cumsum(input, dim, torch::kLong); + ForEachDevice([&](const 
torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::cumsum(lazy_input, dim, torch::kLong); + AllEqual(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestArgMin) { + torch::Tensor a = torch::rand( + {4, 4, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::argmin(a, c10::nullopt, /*keepdim=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::argmin(lazy_a, c10::nullopt, /*keepdim=*/false); + AllEqual(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestArgMinDim) { + torch::Tensor a = torch::rand( + {4, 4, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int dim : {1, -2}) { + torch::Tensor b = torch::argmin(a, dim, /*keepdim=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::argmin(lazy_a, dim, /*keepdim=*/false); + AllEqual(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestArgMinDimKeep) { + torch::Tensor a = torch::rand( + {4, 4, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int dim : {1, -2}) { + torch::Tensor b = torch::argmin(a, dim, /*keepdim=*/true); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::argmin(lazy_a, dim, /*keepdim=*/true); + AllEqual(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestArgMinSameValue) { + torch::Tensor a = torch::ones( + {4, 4, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::argmin(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::argmin(lazy_a); + AllEqual(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestArgMinWrapper) { + torch::Tensor a = torch::rand( + {4, 4, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int dim : {1, -2}) { + torch::Tensor b = torch::argmin(a, dim, /*keepdim=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::argmin(lazy_a, dim, /*keepdim=*/false); + AllEqual(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestArgMax) { + torch::Tensor a = torch::rand( + {4, 4, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::argmax(a, c10::nullopt, /*keepdim=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::argmax(lazy_a, c10::nullopt, /*keepdim=*/false); + AllEqual(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestArgMaxDim) { + torch::Tensor a = torch::rand( + {4, 4, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int dim : {1, -2}) { + torch::Tensor b = torch::argmax(a, dim, /*keepdim=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::argmax(lazy_a, dim, /*keepdim=*/false); + AllEqual(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestArgMaxDimKeep) { + torch::Tensor a = torch::rand( + {4, 4, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int dim : {1, -2}) { + torch::Tensor b = torch::argmax(a, dim, /*keepdim=*/true); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = 
CopyToDevice(a, device); + torch::Tensor lazy_b = torch::argmax(lazy_a, dim, /*keepdim=*/true); + AllEqual(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestArgMaxSameValue) { + torch::Tensor a = torch::ones( + {4, 4, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::argmax(a, c10::nullopt, /*keepdim=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::argmax(lazy_a, c10::nullopt, /*keepdim=*/false); + AllEqual(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestArgMaxWrapper) { + torch::Tensor a = torch::rand( + {4, 4, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int dim : {1, -2}) { + torch::Tensor b = torch::argmax(a, dim, /*keepdim=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::argmax(lazy_a, dim, /*keepdim=*/false); + AllEqual(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestAsin) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::asin(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::asin(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestAsinh) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::asinh(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::asinh(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestAsinhInPlace) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor b = torch::asinh_(a); + torch::Tensor lazy_b = torch::asinh_(lazy_a); + AllClose(a, lazy_a, /*rtol=*/1e-3, /*atol=*/1e-5); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestSin) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::sin(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::sin(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestSinh) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::sinh(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::sinh(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestAcos) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::acos(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::acos(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestAcosh) { + torch::Tensor a = + torch::rand({2, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100; + 
torch::Tensor b = torch::acosh(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::acosh(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestAcoshInPlace) { + torch::Tensor a = + torch::rand({2, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor b = torch::acosh_(a); + torch::Tensor lazy_b = torch::acosh_(lazy_a); + AllClose(a, lazy_a, /*rtol=*/1e-3, /*atol=*/1e-5); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestCos) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::cos(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::cos(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestCosh) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::cosh(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::cosh(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestAtan) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::atan(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::atan(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestAtanh) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::atanh(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::atanh(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestAtanhInPlace) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor b = torch::atanh_(a); + torch::Tensor lazy_b = torch::atanh_(lazy_a); + AllClose(a, lazy_a, /*rtol=*/1e-3, /*atol=*/1e-5); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestAtan2) { + torch::Tensor a = torch::randn( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::randn( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::atan2(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::atan2(lazy_a, lazy_b); + AllClose(c, lazy_c, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestTan) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::tan(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = 
torch::tan(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestTanh) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::tanh(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::tanh(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestClampMinMax) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar min_val(0.311); + torch::Scalar max_val(0.409); + torch::Tensor b = torch::clamp(a, min_val, max_val); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::clamp(lazy_a, min_val, max_val); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestClampMin) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar min_val(0.311); + torch::Tensor b = torch::clamp(a, min_val, c10::nullopt); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::clamp(lazy_a, min_val, c10::nullopt); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestClampMax) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar max_val(0.409); + torch::Tensor b = torch::clamp(a, c10::nullopt, max_val); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::clamp(lazy_a, c10::nullopt, max_val); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestClampMinExplicit) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar min_val(0.311); + torch::Tensor b = torch::clamp_min(a, min_val); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::clamp_min(lazy_a, min_val); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestClampMaxExplicit) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar max_val(0.409); + torch::Tensor b = torch::clamp_max(a, max_val); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::clamp_max(lazy_a, max_val); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestClampMinExplicitInPlace) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar min_val(0.311); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor b = torch::clamp_min_(a, min_val); + torch::Tensor lazy_b = torch::clamp_min_(lazy_a, min_val); + AllClose(a, lazy_a); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestClampMaxExplicitInPlace) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar max_val(0.409); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor b = torch::clamp_max_(a, max_val); + torch::Tensor lazy_b = torch::clamp_max_(lazy_a, max_val); + AllClose(a, lazy_a); + 
AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestCeil) { + torch::Tensor a = + torch::randn( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Tensor b = torch::ceil(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::ceil(lazy_a); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestFloor) { + torch::Tensor a = + torch::randn( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Tensor b = torch::floor(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::floor(lazy_a); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestRound) { + torch::Tensor a = torch::cat( + {torch::randn( + {8}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0, + // Special case: 0.5, -0.5. lazy::Round impl rounds to -1/1 whereas + // lazy::RoundToEven properly implements bankers rounding. + torch::tensor( + {-0.5, 0.5}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice()))}, + 0); + torch::Tensor b = torch::round(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::round(lazy_a); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestTrunc) { + torch::Tensor a = + torch::randn( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Tensor b = torch::trunc(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::trunc(lazy_a); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestFrac) { + torch::Tensor a = + torch::randn( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Tensor b = torch::frac(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::frac(lazy_a); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestNeg) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::neg(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::neg(lazy_a); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestBitwiseNot) { + std::vector<torch::ScalarType> types( + {torch::kByte, torch::kChar, torch::kShort, torch::kInt, torch::kLong}); + + ForEachDevice([&](const torch::Device& device) { + for (auto type : types) { + torch::Tensor a = + torch::randint(0, 63, {2, 2}, torch::TensorOptions(type)); + torch::Tensor b = torch::bitwise_not(a); + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::bitwise_not(lazy_a); + AllEqual(b, lazy_b); + } + }); +} + +TEST_F(LazyOpsTest, TestBitwiseNotInPlace) { + std::vector<torch::ScalarType> types( + {torch::kByte, torch::kChar, torch::kShort, torch::kInt, torch::kLong}); + + ForEachDevice([&](const torch::Device& device) { + for (auto type : types) { + torch::Tensor a = + torch::randint(0, 63, {2, 2}, torch::TensorOptions(type)); + torch::Tensor lazy_a = CopyToDevice(a, device); + a.bitwise_not_(); + lazy_a.bitwise_not_(); + AllEqual(a, lazy_a); + } + }); +} + +TEST_F(LazyOpsTest, TestSign) { + torch::Tensor a = + torch::randn( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + 
torch::Tensor b = torch::sign(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::sign(lazy_a); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestSignByte) { + torch::Tensor a = torch::randint( + 256, {2, 2}, torch::TensorOptions(torch::kByte).device(DefaultDevice())); + torch::Tensor b = torch::sign(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::sign(lazy_a); + AllEqual(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestAbs) { + torch::Tensor a = torch::randn( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::abs(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::abs(lazy_a); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestAbsByte) { + torch::Tensor a = torch::randint( + 256, {2, 2}, torch::TensorOptions(torch::kByte).device(DefaultDevice())); + torch::Tensor b = torch::abs(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::abs(lazy_a); + AllEqual(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestEmptyLike) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::empty_like(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::empty_like(lazy_a); + EXPECT_EQ(b.sizes(), lazy_b.sizes()); + }); +} + +TEST_F(LazyOpsTest, TestEmptyLikeOptions) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::empty_like( + a, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::empty_like( + lazy_a, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + EXPECT_EQ(b.sizes(), lazy_b.sizes()); + }); +} + +TEST_F(LazyOpsTest, TestEmpty) { + torch::Tensor a = torch::zeros( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = torch::empty( + {2, 2}, torch::TensorOptions(torch::kFloat).device(device)); + EXPECT_EQ(a.sizes(), lazy_a.sizes()); + }); +} + +TEST_F(LazyOpsTest, TestZeroInPlace) { + torch::Tensor input = torch::ones( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazyInput = CopyToDevice(input, device); + auto& output = torch::zero_(input); + auto& lazyOutput = torch::zero_(lazyInput); + AllClose(output, lazyOutput); + }); +} + +TEST_F(LazyOpsTest, TestZerosLike) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::zeros_like(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::zeros_like(lazy_a); + AllClose(a, lazy_a); + }); +} + +TEST_F(LazyOpsTest, TestZerosLikeOptions) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::zeros_like( + a, 
torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::zeros_like( + lazy_a, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + AllClose(a, lazy_a); + }); +} + +TEST_F(LazyOpsTest, TestZeros) { + torch::Tensor a = torch::zeros( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = torch::zeros( + {2, 2}, torch::TensorOptions(torch::kFloat).device(device)); + AllClose(a, lazy_a); + }); +} + +TEST_F(LazyOpsTest, TestOnes) { + torch::Tensor a = torch::ones( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = + torch::ones({2, 2}, torch::TensorOptions(torch::kFloat).device(device)); + AllClose(a, lazy_a); + }); +} + +TEST_F(LazyOpsTest, TestOnesLike) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::ones_like(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::ones_like(lazy_a); + AllClose(a, lazy_a); + }); +} + +TEST_F(LazyOpsTest, TestOnesLikeOptions) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::ones_like( + a, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::ones_like( + lazy_a, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + AllClose(a, lazy_a); + }); +} + +TEST_F(LazyOpsTest, TestFull) { + torch::Tensor a = + torch::full({2, 2}, 3.1165, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = torch::full( + {2, 2}, 3.1165, torch::TensorOptions(torch::kFloat).device(device)); + AllClose(a, lazy_a); + }); +} + +TEST_F(LazyOpsTest, TestFullLike) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::full_like(a, 3.1165); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::full_like(lazy_a, 3.1165); + AllClose(a, lazy_a); + }); +} + +TEST_F(LazyOpsTest, TestFullLikeOptions) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::full_like( + a, 3.1165, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::full_like( + lazy_a, 3.1165, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + AllClose(a, lazy_a); + }); +} + +TEST_F(LazyOpsTest, TestARange) { + for (auto& ranges : std::vector<std::vector<double>>{{0.0, 100.0, 0.5}, + {0.0, -100.0, -0.5}}) { + torch::Tensor a = torch::arange( + ranges[0], ranges[1], ranges[2], + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = + torch::arange(ranges[0], ranges[1], ranges[2], + torch::TensorOptions(torch::kFloat).device(device)); + AllClose(a, lazy_a); + }); + } +} + 
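The {-0.5, 0.5} special case appended in TestRound above exists because torch::round follows round-half-to-even (bankers rounding), which the in-code comment contrasts with a lazy::Round lowering that sends exact halves to -1/1. A minimal standalone sketch of that halfway-case behaviour is below; it is illustrative only, not part of this change, and the sample values are assumptions rather than values taken from the test:

#include <torch/torch.h>
#include <iostream>

// Standalone sketch: torch::round uses round-half-to-even ("bankers rounding"),
// so exact halves round to the nearest even integer rather than away from zero.
int main() {
  torch::Tensor halves = torch::tensor({-1.5, -0.5, 0.5, 1.5, 2.5});
  // Expected under bankers rounding: {-2, -0, 0, 2, 2};
  // round-half-away-from-zero would instead give {-2, -1, 1, 2, 3},
  // which is the mismatch the TestRound special case guards against.
  std::cout << torch::round(halves) << std::endl;
  return 0;
}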
+TEST_F(LazyOpsTest, TestARangeOut) { + torch::Tensor a = torch::randn( + {4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (auto& ranges : std::vector<std::vector<double>>{{0.0, 100.0, 0.5}, + {0.0, -100.0, -0.5}}) { + torch::Tensor b = torch::arange_out(a, ranges[0], ranges[1], ranges[2]); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = + torch::arange_out(lazy_a, ranges[0], ranges[1], ranges[2]); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestDimARange) { + torch::Tensor like = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor a = torch::_dim_arange(like, 1); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_like = CopyToDevice(like, device); + torch::Tensor lazy_a = torch::_dim_arange(lazy_like, 1); + AllClose(a, lazy_a); + }); +} + +TEST_F(LazyOpsTest, TestBartlettWindow) { + int window_length = 10; + for (bool periodic : {false, true}) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor output = torch::bartlett_window( + window_length, periodic, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + + torch::Tensor lazy_output = torch::bartlett_window( + window_length, periodic, + torch::TensorOptions(torch::kFloat).device(device)); + AllClose(output, lazy_output, /*rtol=*/1e-5, /*atol=*/1e-7); + }); + } +} + +TEST_F(LazyOpsTest, TestBlackmanWindow) { + int window_length = 10; + for (bool periodic : {false, true}) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor output = torch::blackman_window( + window_length, periodic, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_output = torch::blackman_window( + window_length, periodic, + torch::TensorOptions(torch::kFloat).device(device)); + AllClose(output, lazy_output, /*rtol=*/1e-5, /*atol=*/1e-7); + }); + } +} + +TEST_F(LazyOpsTest, TestHammingWindow) { + double alpha = 0.54; + double beta = 0.46; + int window_length = 10; + for (bool periodic : {false, true}) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor output = torch::hamming_window( + window_length, periodic, alpha, beta, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_output = torch::hamming_window( + window_length, periodic, alpha, beta, + torch::TensorOptions(torch::kFloat).device(device)); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestHannWindow) { + int window_length = 10; + for (bool periodic : {false, true}) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor output = torch::hann_window( + window_length, periodic, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_output = torch::hann_window( + window_length, periodic, + torch::TensorOptions(torch::kFloat).device(device)); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestLogSigmoid) { + torch::Tensor a = torch::empty( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + a.uniform_(-1.0, 1.0); + torch::Tensor b = torch::log_sigmoid(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::log_sigmoid(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestLogSigmoidForward) { + torch::Tensor a = torch::empty( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
+ a.uniform_(-1.0, 1.0); + auto tuple = torch::log_sigmoid_forward(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + auto lazy_tuple = torch::log_sigmoid_forward(lazy_a); + AllClose(std::get<0>(tuple), std::get<0>(lazy_tuple), + /*rtol=*/1e-3, /*atol=*/1e-5); + AllClose(std::get<1>(tuple), std::get<1>(lazy_tuple), + /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestLogsumexp) { + torch::Tensor a = torch::rand( + {3, 4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (auto dims : std::vector<std::vector<int64_t>>{{0, 1}, {-3, -2}}) { + for (bool keepdim : {false, true}) { + torch::Tensor b = torch::logsumexp(a, dims, keepdim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::logsumexp(lazy_a, dims, keepdim); + AllClose(b, lazy_b); + }); + } + } +} + +TEST_F(LazyOpsTest, TestSiLU) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::silu(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::silu(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); + ExpectCounterChanged("lazy::silu_out", GetIgnoredCounters()); +} + +TEST_F(LazyOpsTest, TestSigmoid) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::sigmoid(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::sigmoid(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestMatmul_1x1) { + torch::Tensor a = torch::rand( + {4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::matmul(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::matmul(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestMatmul_2x1) { + torch::Tensor a = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::matmul(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::matmul(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestMatmul_1x2) { + torch::Tensor a = torch::rand( + {4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::matmul(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::matmul(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestMatmul_2x2) { + torch::Tensor a = torch::rand( + {2, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {4, 3}, 
torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::matmul(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::matmul(lazy_a, lazy_b); + AllClose(c, lazy_c, /*rtol=*/1e-3, /*atol=*/1e-4); + }); +} + +TEST_F(LazyOpsTest, TestMatmulBcast) { + torch::Tensor a = + torch::rand({4, 2, 3, 2, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = + torch::rand({2, 1, 4, 3}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::matmul(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::matmul(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestDot) { + torch::Tensor a = torch::rand( + {4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::dot(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::dot(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestTensorDot) { + torch::Tensor a = torch::rand( + {6, 4, 8}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {4, 7, 8}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector<int64_t> dims_a = {1, 2}; + std::vector<int64_t> dims_b = {0, 2}; + torch::Tensor c = torch::tensordot(a, b, dims_a, dims_b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::tensordot(lazy_a, lazy_b, dims_a, dims_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestGer) { + torch::Tensor a = torch::rand( + {4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::ger(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::ger(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestMv) { + torch::Tensor a = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::mv(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::mv(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestMvOut) { + torch::Tensor a = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::empty( + {4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::mv_out(c, a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = 
CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::empty({4}, lazy_b.options()); + torch::mv_out(lazy_c, lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestBatchAddBatchMatMul) { + torch::Tensor a = torch::rand( + {3, 6, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {3, 6, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::rand( + {3, 4, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar alpha = 0.5; + torch::Scalar beta = 1.5; + torch::Tensor d = torch::baddbmm(a, b, c, beta, alpha); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = torch::baddbmm(lazy_a, lazy_b, lazy_c, beta, alpha); + AllClose(d, lazy_d, /*rtol=*/1e-3, /*atol=*/1e-4); + }); +} + +TEST_F(LazyOpsTest, TestBatchAddBatchMatMulInPlace) { + torch::Tensor a = torch::rand( + {3, 6, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {3, 6, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::rand( + {3, 4, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar alpha = 0.5; + torch::Scalar beta = 1.5; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor d = a.baddbmm_(b, c, beta, alpha); + torch::Tensor lazy_d = lazy_a.baddbmm_(lazy_b, lazy_c, beta, alpha); + AllClose(d, lazy_d, /*rtol=*/1e-3, /*atol=*/1e-4); + AllClose(a, lazy_a, /*rtol=*/1e-3, /*atol=*/1e-4); + }); +} + +TEST_F(LazyOpsTest, TestBatchMatMul) { + torch::Tensor a = torch::rand( + {3, 6, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {3, 4, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::bmm(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::bmm(lazy_a, lazy_b); + AllClose(c, lazy_c, /*rtol=*/1e-3, /*atol=*/1e-4); + }); +} + +TEST_F(LazyOpsTest, TestChainMatMul) { + torch::Tensor a = torch::rand( + {5, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {4, 6}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::rand( + {6, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor d = torch::rand( + {2, 7}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor result = torch::chain_matmul({a, b, c, d}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = CopyToDevice(d, device); + torch::Tensor lazy_result = + torch::chain_matmul({lazy_a, lazy_b, lazy_c, lazy_d}); + AllClose(result, lazy_result, /*rtol=*/1e-3, /*atol=*/1e-4); + }); +} + +TEST_F(LazyOpsTest, TestLinear) { + torch::Tensor input = torch::rand( + {2, 4}, 
torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor weight = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor bias = torch::rand( + {3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor result = torch::linear(input, weight); + torch::Tensor result_with_bias = torch::linear(input, weight, bias); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_weight = CopyToDevice(weight, device); + torch::Tensor lazy_bias = CopyToDevice(bias, device); + torch::Tensor lazy_result = torch::linear(lazy_input, lazy_weight); + torch::Tensor lazy_result_with_bias = + torch::linear(lazy_input, lazy_weight, lazy_bias); + AllClose(result, lazy_result, /*rtol=*/1e-2, /*atol=*/1e-4); + AllClose(result_with_bias, lazy_result_with_bias, /*rtol=*/1e-2, + /*atol=*/1e-4); + }); +} + +TEST_F(LazyOpsTest, TestPinverse) { + torch::Tensor input = torch::rand( + {4, 6}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor result = torch::pinverse(input); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::pinverse(lazy_input); + AllClose(result, lazy_result, /*rtol=*/1e-4); + }); +} + +TEST_F(LazyOpsTest, TestEinsumOuter) { + torch::Tensor a = torch::rand( + {5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::string equation = "i,j->ij"; + torch::Tensor c = torch::einsum(equation, {a, b}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::einsum(equation, {lazy_a, lazy_b}); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestEinsumOuterBackward) { + torch::Tensor a = torch::rand({5}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor b = torch::rand({5}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + std::string equation = "i,j->ij"; + auto testfn = [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor { + return torch::einsum(equation, inputs); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({a, b}, device, testfn, /*rtol=*/1e-3, /*atol=*/1e-4); + }); +} + +TEST_F(LazyOpsTest, TestEinsumBatchMatMul) { + torch::Tensor a = torch::rand( + {3, 2, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {3, 5, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::string equation = "bij,bjk->bik"; + torch::Tensor c = torch::einsum(equation, {a, b}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::einsum(equation, {lazy_a, lazy_b}); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestEinsumPyTorchLowerBilinear) { + torch::Tensor a = torch::rand( + {3, 5, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor l = torch::rand( + {2, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor r = torch::rand( + {2, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::string equation = 
"bn,anm,bm->ba"; + torch::Tensor c = torch::einsum(equation, {l, a, r}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_l = CopyToDevice(l, device); + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_r = CopyToDevice(r, device); + torch::Tensor lazy_c = torch::einsum(equation, {lazy_l, lazy_a, lazy_r}); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestEinsumPyTorchLowerDiagonal) { + torch::Tensor input = torch::rand( + {3, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::string equation = "ii->i"; + torch::Tensor result = torch::einsum(equation, {input}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::einsum(equation, {lazy_input}); + AllClose(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestEinsumPyTorchLowerBatchDiagonal) { + torch::Tensor input = torch::rand( + {4, 3, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::string equation = "...ii->...i"; + torch::Tensor result = torch::einsum(equation, {input}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::einsum(equation, {lazy_input}); + AllClose(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestEinsumPyTorchLowerBatchPermute) { + torch::Tensor input = + torch::rand({2, 3, 4, 5}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::string equation = "...ij->...ji"; + torch::Tensor result = torch::einsum(equation, {input}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::einsum(equation, {lazy_input}); + AllClose(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestEinsumPyTorchLowerRepeatedAxis) { + torch::Tensor x = torch::rand( + {2, 3, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor y = torch::rand( + {4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::string equation = "ijj,k->ik"; + torch::Tensor result = torch::einsum(equation, {x, y}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_x = CopyToDevice(x, device); + torch::Tensor lazy_y = CopyToDevice(y, device); + torch::Tensor lazy_result = torch::einsum(equation, {lazy_x, lazy_y}); + AllClose(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestBilinear) { + int batch_size = 16; + int in1_features = 4; + int in2_features = 6; + int out_features = 8; + torch::Tensor input1 = + torch::rand({batch_size, in1_features}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor input2 = + torch::rand({batch_size, in2_features}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor weight = + torch::rand({out_features, in1_features, in2_features}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor bias = + torch::rand({out_features}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input1 = CopyToDevice(input1, device); + torch::Tensor lazy_input2 = CopyToDevice(input2, device); + torch::Tensor lazy_weight = CopyToDevice(weight, device); + torch::Tensor lazy_bias = CopyToDevice(bias, device); + torch::Tensor result = torch::bilinear(input1, input2, weight, bias); + torch::Tensor lazy_result = + 
torch::bilinear(lazy_input1, lazy_input2, lazy_weight, lazy_bias); + AllClose(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestUpsampleNearest2D) { + int batch_size = 2; + int h = 5; + int w = 5; + int uh = 8; + int uw = 8; + int chans = 2; + torch::Tensor input = + torch::rand({batch_size, chans, h, w}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor result = torch::upsample_nearest2d(input, {uh, uw}); + torch::Tensor lazy_result = torch::upsample_nearest2d(lazy_input, {uh, uw}); + AllClose(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestUpsampleNearest2DBackward) { + int batch_size = 2; + int h = 5; + int w = 5; + int uh = 8; + int uw = 8; + int chans = 2; + auto testfn = [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor { + return torch::upsample_nearest2d(inputs[0], {uh, uw}); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({batch_size, chans, h, w}, + torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); +} + +TEST_F(LazyOpsTest, TestUpsampleNearest2DWithScale) { + int batch_size = 2; + int h = 5; + int w = 5; + int chans = 2; + double scale_h = 2.5; + double scale_w = 3.4; + torch::Tensor input = + torch::rand({batch_size, chans, h, w}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor result = torch::upsample_nearest2d( + input, c10::nullopt, at::ArrayRef<double>{scale_h, scale_w}); + torch::Tensor lazy_result = torch::upsample_nearest2d( + lazy_input, c10::nullopt, at::ArrayRef<double>{scale_h, scale_w}); + AllClose(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestUpsampleNearest2DBackwardWithScale) { + int batch_size = 2; + int h = 5; + int w = 5; + int chans = 2; + double scale_h = 2.5; + double scale_w = 3.4; + auto testfn = [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor { + return torch::upsample_nearest2d(inputs[0], c10::nullopt, + at::ArrayRef<double>{scale_h, scale_w}); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({batch_size, chans, h, w}, + torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); +} + +TEST_F(LazyOpsTest, TestUpsampleBilinear2D) { + int batch_size = 2; + int h = 5; + int w = 5; + int uh = 8; + int uw = 8; + int chans = 2; + for (bool align_corners : {true, false}) { + torch::Tensor input = torch::rand( + {batch_size, chans, h, w}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor result = + torch::upsample_bilinear2d(input, {uh, uw}, align_corners); + torch::Tensor lazy_result = + torch::upsample_bilinear2d(lazy_input, {uh, uw}, align_corners); + AllClose(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestUpsampleBilinear2DBackward) { + int batch_size = 2; + int h = 5; + int w = 5; + int uh = 8; + int uw = 8; + int chans = 2; + for (bool align_corners : {true, false}) { + auto testfn = + [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor { + return torch::upsample_bilinear2d(inputs[0], {uh, uw}, align_corners); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({batch_size, chans, h, w}, + 
torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); + } +} + +TEST_F(LazyOpsTest, TestAddCMul) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor d = torch::addcmul(a, b, c, 3.1165); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = torch::addcmul(lazy_a, lazy_b, lazy_c, 3.1165); + AllClose(d, lazy_d); + }); +} + +TEST_F(LazyOpsTest, TestAddCDiv) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = + torch::abs(torch::rand( + {2, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice()))) + + 1.0; + torch::Tensor d = torch::addcdiv(a, b, c, 3.1165); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = torch::addcdiv(lazy_a, lazy_b, lazy_c, 3.1165); + AllClose(d, lazy_d); + }); +} + +TEST_F(LazyOpsTest, TestAddCDivWithBroadcast) { + torch::Tensor a = torch::rand( + {1, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {3, 1}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = + torch::abs(torch::rand( + {1, 3}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice()))) + + 1.0; + torch::Tensor d = torch::addcdiv(a, b, c, 3.1165); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = torch::addcdiv(lazy_a, lazy_b, lazy_c, 3.1165); + AllClose(d, lazy_d); + }); +} + +TEST_F(LazyOpsTest, TestSize) { + torch::Tensor input = + torch::rand({2, 1, 4, 6}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = input.dim(); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + for (int dim = -rank; dim < rank; ++dim) { + EXPECT_EQ(torch::size(input, dim), torch::size(lazy_input, dim)); + } + }); +} + +TEST_F(LazyOpsTest, TestSelect) { + std::vector<int64_t> input_sizes = {14, 24, 8}; + int rank = input_sizes.size(); + for (int dim = -rank; dim < rank; ++dim) { + auto testfn = + [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor { + return torch::select(inputs[0], dim, 0); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand(input_sizes, torch::TensorOptions(torch::kFloat) + .requires_grad(true))}, + device, testfn); + }); + }; +} + +TEST_F(LazyOpsTest, TestBernoulliScalarProb) { + torch::Tensor input = torch::zeros( + 1000, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::bernoulli(lazy_input, 0.1); + double frac = 
lazy_output.sum().item().toDouble() / input.numel(); + EXPECT_GT(frac, 0.06); + EXPECT_LT(frac, 0.14); + }); +} + +TEST_F(LazyOpsTest, TestBernoulliTensorProb) { + std::vector<float> prob_values(1000, 0.1); + torch::Tensor input = torch::tensor( + prob_values, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::bernoulli(lazy_input); + double frac = lazy_output.sum().item().toDouble() / input.numel(); + EXPECT_GT(frac, 0.06); + EXPECT_LT(frac, 0.14); + }); +} + +TEST_F(LazyOpsTest, TestBernoulliScalarProbInPlace) { + torch::Tensor input = torch::zeros( + 1000, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + lazy_input.bernoulli_(0.1); + double frac = lazy_input.sum().item().toDouble() / input.numel(); + EXPECT_GT(frac, 0.06); + EXPECT_LT(frac, 0.14); + }); +} + +TEST_F(LazyOpsTest, TestBernoulliTensorProbInPlace) { + torch::Tensor input = torch::zeros( + 1000, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor prob = torch::scalar_tensor( + 0.1, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_prob = CopyToDevice(prob, device); + lazy_input.bernoulli_(lazy_prob); + double frac = lazy_input.sum().item().toDouble() / input.numel(); + EXPECT_GT(frac, 0.06); + EXPECT_LT(frac, 0.14); + }); +} + +TEST_F(LazyOpsTest, TestDropout) { + torch::Tensor a = torch::rand( + {17, 21}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::dropout(lazy_a, 0.1, /*train=*/true); + double prob = + static_cast<double>(lazy_b.cpu().ne(0.0f).sum().item().toDouble()) / + a.numel(); + EXPECT_GT(prob, 0.86); + EXPECT_LT(prob, 0.94); + }); +} + +TEST_F(LazyOpsTest, TestDropoutInPlace) { + torch::Tensor a = torch::rand( + {17, 21}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::dropout_(lazy_a, 0.1, /*train=*/true); + double prob = + static_cast<double>(lazy_a.cpu().ne(0.0f).sum().item().toDouble()) / + a.numel(); + EXPECT_GT(prob, 0.85); + EXPECT_LT(prob, 0.94); + }); +} + +TEST_F(LazyOpsTest, TestRandperm) { + unsigned n = 5; + torch::Tensor shuffle = torch::randperm( + n, torch::TensorOptions(torch::kLong).device(torch::kLazy)); + torch::Tensor shuffle_cpu = CopyToDevice(shuffle, torch::kCPU); + std::vector<int64_t> shuffle_data(shuffle_cpu.data_ptr<int64_t>(), + shuffle_cpu.data_ptr<int64_t>() + n); + EXPECT_TRUE(shuffle_data.size() == n && + torch::lazy::IsPermutation(shuffle_data)); +} + +TEST_F(LazyOpsTest, TestSlice) { + torch::Tensor a = + torch::rand({32, 24, 16}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::slice(a, 1, 0, 16, 1); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::slice(lazy_a, 1, 0, 16, 1); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestTake) { + torch::Tensor a = torch::rand( + {4, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::randint( + 16, {5}, 
torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor c = torch::take(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::take(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestTakeBackward) { + auto testfn = [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor { + return torch::take(inputs[0], inputs[1]); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward( + {torch::rand({4, 4}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)), + torch::randint( + 16, {5}, + torch::TensorOptions(torch::kLong).device(DefaultDevice()))}, + device, testfn); + }); +} + +TEST_F(LazyOpsTest, TestStack) { + torch::Tensor a = torch::rand( + {2, 4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {2, 4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::rand( + {2, 4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = a.dim() + 1; + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor d = torch::stack({a, b, c}, dim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = torch::stack({lazy_a, lazy_b, lazy_c}, dim); + AllClose(d, lazy_d); + }); + } +} + +TEST_F(LazyOpsTest, TestCat) { + torch::Tensor a = torch::rand( + {2, 1, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {2, 2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::rand( + {2, 3, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int dim : {1, -2}) { + torch::Tensor d = torch::cat({a, b, c}, dim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = torch::cat({lazy_a, lazy_b, lazy_c}, dim); + EXPECT_TRUE(d.sizes() == lazy_d.sizes() && d.dtype() == lazy_d.dtype()); + AllClose(d, lazy_d); + }); + } +} + +TEST_F(LazyOpsTest, TestUnbind) { + torch::Tensor input = torch::rand( + {4, 3, 7}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + std::vector<torch::Tensor> output = torch::unbind(input, dim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + std::vector<torch::Tensor> lazy_output = torch::unbind(lazy_input, dim); + ASSERT_EQ(output.size(), lazy_output.size()); + for (size_t i = 0; i < output.size(); ++i) { + AllClose(output[i], lazy_output[i]); + } + }); + } +} + +TEST_F(LazyOpsTest, TestRepeat) { + std::vector<std::vector<int64_t>> repeats_list = {{4, 2}, {4, 2, 3}}; + std::vector<std::vector<int64_t>> input_size_list = {{3}, {2, 4}}; + for (const auto& repeats : repeats_list) { + for (const auto& input_size : input_size_list) { + torch::Tensor input = torch::rand( + input_size, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = input.repeat(repeats); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = lazy_input.repeat(repeats); + 
AllClose(output, lazy_output); + }); + } + } +} + +TEST_F(LazyOpsTest, TestGather) { + torch::Tensor a = torch::rand( + {3, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::empty( + {3, 3}, torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 3; j++) { + b[i][j] = (i + j) % 3; + } + } + for (bool sparse_grad : {false, true}) { + torch::Tensor c = torch::gather(a, 1, b, sparse_grad); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::gather(lazy_a, 1, lazy_b, sparse_grad); + AllClose(c, lazy_c); + }); + } +} + +TEST_F(LazyOpsTest, TestScatter) { + torch::Tensor a = torch::rand( + {3, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {3, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::empty( + {3, 5}, torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (int dim = 0; dim < 2; ++dim) { + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 5; j++) { + c[i][j] = (i + j) % c.sizes()[dim]; + } + } + torch::Tensor d = torch::scatter(a, dim, c, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = torch::scatter(lazy_a, dim, lazy_c, lazy_b); + AllClose(d, lazy_d); + }); + } +} + +TEST_F(LazyOpsTest, TestScatterR1) { + torch::Tensor a = torch::rand( + {5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::empty( + {2}, torch::TensorOptions(torch::kLong).device(DefaultDevice())); + c[0] = 1; + c[1] = 3; + torch::Tensor d = torch::scatter(a, 0, c, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = torch::scatter(lazy_a, 0, lazy_c, lazy_b); + AllClose(d, lazy_d); + }); +} + +TEST_F(LazyOpsTest, TestScatterR3) { + torch::Tensor a = torch::rand( + {3, 5, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {3, 4, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::empty( + {3, 4, 2}, torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 4; j++) { + for (int k = 0; k < 2; k++) { + c[i][j][k] = (i + j + k) % 4; + } + } + } + torch::Tensor d = torch::scatter(a, 1, c, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = torch::scatter(lazy_a, 1, lazy_c, lazy_b); + AllClose(d, lazy_d); + }); +} + +TEST_F(LazyOpsTest, TestScatterBiggerSource) { + torch::Tensor a = torch::rand( + {4, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {8, 8}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::empty( + {4, 4}, torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for 
(int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + c[i][j] = (i + j) % 4; + } + } + for (int dim = 0; dim < 2; ++dim) { + torch::Tensor d = torch::scatter(a, dim, c, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = torch::scatter(lazy_a, dim, lazy_c, lazy_b); + AllClose(d, lazy_d); + }); + } +} + +TEST_F(LazyOpsTest, TestScatterScalar) { + torch::Tensor a = torch::rand( + {4, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar b = 1.0f; + torch::Tensor c = torch::empty( + {4, 4}, torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + c[i][j] = (i + j) % 4; + } + } + for (int dim = 0; dim < 2; ++dim) { + torch::Tensor d = torch::scatter(a, dim, c, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = torch::scatter(lazy_a, dim, lazy_c, b); + AllClose(d, lazy_d); + }); + } +} + +TEST_F(LazyOpsTest, TestScatterReduceAdd) { + torch::Tensor a = torch::rand( + {3, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {3, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::empty( + {3, 5}, torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (int dim = 0; dim < 2; ++dim) { + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 5; j++) { + c[i][j] = (i + j) % c.sizes()[dim]; + } + } + torch::Tensor d = torch::scatter(a, dim, c, b, "add"); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = torch::scatter(lazy_a, dim, lazy_c, lazy_b, "add"); + AllClose(d, lazy_d); + }); + } + + ExpectCounterNotChanged("aten::.*", GetIgnoredCounters()); + ExpectCounterChanged("lazy::scatter_out", GetIgnoredCounters()); +} + +TEST_F(LazyOpsTest, TestScatterAdd) { + torch::Tensor a = torch::rand( + {3, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {3, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::empty( + {3, 5}, torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (int dim = 0; dim < 2; ++dim) { + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 5; j++) { + c[i][j] = (i + j) % c.sizes()[dim]; + } + } + torch::Tensor d = torch::scatter_add(a, dim, c, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = torch::scatter_add(lazy_a, dim, lazy_c, lazy_b); + AllClose(d, lazy_d); + }); + } +} + +TEST_F(LazyOpsTest, TestScatterAddInPlace) { + torch::Tensor b = torch::rand( + {4, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::empty( + {4, 4}, torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + c[i][j] = (i + j) % 4; + } + } + for (int dim = 0; dim < 2; ++dim) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor a = 
torch::rand( + {4, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor d = a.scatter_add_(dim, c, b); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = lazy_a.scatter_add_(dim, lazy_c, lazy_b); + AllClose(d, lazy_d); + AllClose(a, lazy_a); + }); + } +} + +TEST_F(LazyOpsTest, TestIndexSelect) { + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor a = + isFloatingType(scalar_type) + ? torch::rand( + {3, 4}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {3, 4}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + for (torch::ScalarType index_scalar_type : {torch::kInt, torch::kLong}) { + torch::Tensor b = torch::empty( + {2}, torch::TensorOptions(index_scalar_type).device(DefaultDevice())); + b[0] = 0; + b[1] = 2; + for (auto offset : {-2, 0}) { + torch::Tensor c0 = torch::index_select(a, 0 + offset, b); + torch::Tensor c1 = torch::index_select(a, 1 + offset, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c0 = torch::index_select(lazy_a, 0 + offset, lazy_b); + torch::Tensor lazy_c1 = torch::index_select(lazy_a, 1 + offset, lazy_b); + AllEqual(c0, lazy_c0); + AllEqual(c1, lazy_c1); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestIndexSelectRank0) { + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor a = + isFloatingType(scalar_type) + ? 
torch::rand( + {3, 4}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {3, 4}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor b = torch::scalar_tensor( + 2, torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor c0 = torch::index_select(a, 0, b); + torch::Tensor c1 = torch::index_select(a, 1, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c0 = torch::index_select(lazy_a, 0, lazy_b); + torch::Tensor lazy_c1 = torch::index_select(lazy_a, 1, lazy_b); + AllEqual(c0, lazy_c0); + AllEqual(c1, lazy_c1); + }); + } +} + +TEST_F(LazyOpsTest, TestInverse) { + if (IsCuda()) { + // TODO(whc) debug failure on cuda, lazy_b comes back transposed + GTEST_SKIP(); + } + torch::Tensor a = torch::randn( + {5, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::inverse(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::inverse(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-4); + }); +} + +TEST_F(LazyOpsTest, TestIsnan) { + torch::Tensor a = torch::tensor( + {1.0, 2.0, std::nan("1"), 4.0}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::isnan(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::isnan(lazy_a); + AllEqual(b, lazy_b); + }); + ExpectCounterNotChanged("aten::.*", GetIgnoredCounters()); + ExpectCounterChanged("lazy::isnan", GetIgnoredCounters()); +} + +TEST_F(LazyOpsTest, TestExpand) { + torch::Tensor a = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = a.expand({2, 3, 4}, /*implicit=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = lazy_a.expand({2, 3, 4}, /*implicit=*/false); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestExpandBack) { + torch::Tensor a = torch::rand( + {3, 1}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = a.expand({3, 4}, /*implicit=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = lazy_a.expand({3, 4}, /*implicit=*/false); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestExpandAs) { + torch::Tensor a = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {2, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::native::expand_as(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::native::expand_as(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestEye) { + int n = 5; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor out = torch::eye( + n, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_out = + torch::eye(n, torch::TensorOptions(torch::kFloat).device(device)); + AllClose(out, lazy_out); + }); +} + +TEST_F(LazyOpsTest, TestEyeWide) { + int lines = 3; + int cols = 5; + ForEachDevice([&](const torch::Device& 
device) { + torch::Tensor out = + torch::eye(lines, cols, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_out = torch::eye( + lines, cols, torch::TensorOptions(torch::kFloat).device(device)); + AllClose(out, lazy_out); + }); +} + +TEST_F(LazyOpsTest, TestEyeNarrow) { + int lines = 5; + int cols = 3; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor out = + torch::eye(lines, cols, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_out = torch::eye( + lines, cols, torch::TensorOptions(torch::kFloat).device(device)); + AllClose(out, lazy_out); + }); +} + +TEST_F(LazyOpsTest, TestBroadcastTensors) { + torch::Tensor a = torch::rand( + {2, 1, 1}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {2, 1}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector<torch::Tensor> c = torch::broadcast_tensors({a, b}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + std::vector<torch::Tensor> lazy_c = torch::broadcast_tensors({lazy_a, lazy_b}); + ASSERT_EQ(c.size(), lazy_c.size()); + for (size_t i = 0; i < c.size(); ++i) { + AllClose(c[i], lazy_c[i]); + } + }); +} + +TEST_F(LazyOpsTest, TestOneIndex) { + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor params = + isFloatingType(scalar_type) + ? torch::rand( + {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor indices = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor result = torch::index(params, {indices}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_params = CopyToDevice(params, device); + torch::Tensor lazy_indices = CopyToDevice(indices, device); + torch::Tensor lazy_result = torch::index(lazy_params, {lazy_indices}); + AllEqual(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestOneIndexTransfer) { + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor params = + isFloatingType(scalar_type) + ? 
torch::rand( + {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor indices = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor result = torch::index(params, {indices}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_params = CopyToDevice(params, device); + torch::Tensor lazy_result = torch::index(lazy_params, {indices}); + AllEqual(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestNonzero) { + torch::Tensor a = torch::zeros( + {4, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + a[0][1] = 1.0; + a[1][0] = 2.0; + a[3][1] = 3.0; + torch::Tensor b = torch::nonzero(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::nonzero(lazy_a); + AllClose(b, lazy_b); + + if (DebugUtil::ExperimentEnabled("nonzero")) { + // If the nonzero support is enabled, we must not see any aten:: calls. + ExpectCounterNotChanged("aten::.*", GetIgnoredCounters()); + } + ResetCounters(); + }); +} + +TEST_F(LazyOpsTest, TestMaskedSelect) { + torch::Tensor a = torch::rand( + {3, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::randint( + 0, 2, {5}, torch::TensorOptions(torch::kBool).device(DefaultDevice())); + torch::Tensor c = torch::masked_select(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::masked_select(lazy_a, lazy_b); + AllClose(c, lazy_c); + + if (DebugUtil::ExperimentEnabled("masked_select")) { + // If the masked_select support is enabled, we must not see any aten:: + // calls. + ExpectCounterNotChanged("aten::.*", GetIgnoredCounters()); + } + ResetCounters(); + }); +} + +TEST_F(LazyOpsTest, TestMaskedScatter) { + torch::Tensor a = torch::rand( + {3, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::randint( + 0, 2, {3, 5}, torch::TensorOptions(torch::kBool).device(DefaultDevice())); + torch::Tensor c = torch::rand( + {15}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor d = torch::masked_scatter(a, b, c); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = torch::masked_scatter(lazy_a, lazy_b, lazy_c); + AllClose(d, lazy_d); + + if (DebugUtil::ExperimentEnabled("masked_scatter")) { + // If the masked_scatter support is enabled, we must not see any aten:: + // calls. + ExpectCounterNotChanged("aten::.*", GetIgnoredCounters()); + } + ResetCounters(); + }); +} + +TEST_F(LazyOpsTest, TestMultiIndexHeadNull) { + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor params = + isFloatingType(scalar_type) + ? 
torch::rand( + {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor indices_null; + torch::Tensor indices_0 = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor indices_1 = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor result = + torch::index(params, {indices_null, indices_0, indices_1}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_params = CopyToDevice(params, device); + torch::Tensor lazy_indices_0 = CopyToDevice(indices_0, device); + torch::Tensor lazy_indices_1 = CopyToDevice(indices_1, device); + torch::Tensor lazy_result = torch::index( + lazy_params, {indices_null, lazy_indices_0, lazy_indices_1}); + AllEqual(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestMultiIndexMiddleNull) { + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor params = + isFloatingType(scalar_type) + ? torch::rand( + {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor indices_0 = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor indices_null; + torch::Tensor indices_1 = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor result = + torch::index(params, {indices_0, indices_null, indices_1}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_params = CopyToDevice(params, device); + torch::Tensor lazy_indices_0 = CopyToDevice(indices_0, device); + torch::Tensor lazy_indices_1 = CopyToDevice(indices_1, device); + torch::Tensor lazy_result = torch::index( + lazy_params, {lazy_indices_0, indices_null, lazy_indices_1}); + AllEqual(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestMultiIndexTailNull) { + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor params = + isFloatingType(scalar_type) + ? 
torch::rand( + {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor indices_0 = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor indices_null; + torch::Tensor indices_1 = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor result = + torch::index(params, {indices_0, indices_1, indices_null}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_params = CopyToDevice(params, device); + torch::Tensor lazy_indices_0 = CopyToDevice(indices_0, device); + torch::Tensor lazy_indices_1 = CopyToDevice(indices_1, device); + torch::Tensor lazy_result = torch::index( + lazy_params, {lazy_indices_0, lazy_indices_1, indices_null}); + AllEqual(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestMultiIndexMiddleBroadcast) { + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor params = + isFloatingType(scalar_type) + ? torch::rand( + {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor indices_0 = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor indices_1 = torch::randint( + -3, 3, {2, 1, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor result = torch::index(params, {indices_0, indices_1}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_params = CopyToDevice(params, device); + torch::Tensor lazy_indices_0 = CopyToDevice(indices_0, device); + torch::Tensor lazy_indices_1 = CopyToDevice(indices_1, device); + torch::Tensor lazy_result = + torch::index(lazy_params, {lazy_indices_0, lazy_indices_1}); + AllEqual(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestMultiIndexTailBroadcast) { + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor params = + isFloatingType(scalar_type) + ? torch::rand( + {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor indices_0 = torch::randint( + -3, 3, {2, 1, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor indices_1 = torch::randint( + -3, 3, {2, 1}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor result = torch::index(params, {indices_0, indices_1}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_params = CopyToDevice(params, device); + torch::Tensor lazy_indices_0 = CopyToDevice(indices_0, device); + torch::Tensor lazy_indices_1 = CopyToDevice(indices_1, device); + torch::Tensor lazy_result = + torch::index(lazy_params, {lazy_indices_0, lazy_indices_1}); + AllEqual(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestMaskIndex) { + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor params = + isFloatingType(scalar_type) + ? 
torch::rand( + {2, 2}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {2, 2}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor indices = torch::randint( + 0, 2, {2, 2}, + torch::TensorOptions(torch::kBool).device(DefaultDevice())); + torch::Tensor result = torch::index(params, {indices}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_params = CopyToDevice(params, device); + torch::Tensor lazy_indices = CopyToDevice(indices, device); + torch::Tensor lazy_result = torch::index(lazy_params, {lazy_indices}); + AllEqual(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestOneIndexPut) { + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor params = + isFloatingType(scalar_type) + ? torch::rand( + {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor indices = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor values = + isFloatingType(scalar_type) + ? torch::rand( + {3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + for (bool accumulate : {false, true}) { + if (accumulate && IsCuda()) { + GTEST_SKIP(); + } + torch::Tensor result = + torch::index_put(params, {indices}, values, accumulate); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_params = CopyToDevice(params, device); + torch::Tensor lazy_indices = CopyToDevice(indices, device); + torch::Tensor lazy_values = CopyToDevice(values, device); + torch::Tensor lazy_result = + torch::index_put(lazy_params, {lazy_indices}, lazy_values, accumulate); + AllEqual(result, lazy_result); + }); + } + } +} + +TEST_F(LazyOpsTest, TestOneIndexPutInPlace) { + torch::Tensor indices = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor values = + torch::ones({3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + for (bool accumulate : {false, true}) { + if (accumulate && IsCuda()) { + GTEST_SKIP(); + } + ForEachDevice([&](const torch::Device& device) { + torch::Tensor params = + isFloatingType(scalar_type) + ? 
torch::rand( + {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint(100, {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type) + .device(DefaultDevice())); + torch::Tensor lazy_params = CopyToDevice(params.clone(), device); + torch::Tensor result = + torch::index_put_(params, {indices}, values, accumulate); + torch::Tensor lazy_indices = CopyToDevice(indices, device); + torch::Tensor lazy_values = CopyToDevice(values, device); + torch::Tensor lazy_result = torch::index_put_(lazy_params, {lazy_indices}, + lazy_values, accumulate); + AllEqual(result, lazy_result); + AllEqual(params, lazy_params); + }); + } + } +} + +TEST_F(LazyOpsTest, TestOneIndexPutTransfer) { + torch::Tensor indices = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor params = + isFloatingType(scalar_type) + ? torch::rand( + {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor values = + torch::ones({3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + for (bool accumulate : {false, true}) { + if (accumulate && IsCuda()) { + GTEST_SKIP(); + } + torch::Tensor result = + torch::index_put(params, {indices}, values, accumulate); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_params = CopyToDevice(params, device); + torch::Tensor lazy_values = CopyToDevice(values, device); + torch::Tensor lazy_result = + torch::index_put(lazy_params, {indices}, lazy_values, accumulate); + AllEqual(result, lazy_result); + }); + } + } +} + +TEST_F(LazyOpsTest, TestMultiIndexPut) { + torch::Tensor indices_0 = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor indices_1 = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor params = + isFloatingType(scalar_type) + ? 
torch::rand( + {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor values = torch::ones( + {5, 6, 7}, torch::TensorOptions(scalar_type).device(DefaultDevice())); + for (bool accumulate : {false, true}) { + if (accumulate && IsCuda()) { + GTEST_SKIP(); + } + torch::Tensor result = + torch::index_put(params, {indices_0, indices_1}, values, accumulate); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_params = CopyToDevice(params, device); + torch::Tensor lazy_indices_0 = CopyToDevice(indices_0, device); + torch::Tensor lazy_indices_1 = CopyToDevice(indices_1, device); + torch::Tensor lazy_values = CopyToDevice(values, device); + torch::Tensor lazy_result = torch::index_put( + lazy_params, {lazy_indices_0, lazy_indices_1}, lazy_values, accumulate); + AllEqual(result, lazy_result); + }); + } + } +} + +TEST_F(LazyOpsTest, TestMultiIndexPutHeadNull) { + torch::Tensor indices_0 = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor indices_null; + torch::Tensor indices_1 = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor params = + isFloatingType(scalar_type) + ? torch::rand( + {4, 3, 3, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {4, 3, 3, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor values = torch::ones( + {3, 6, 7}, torch::TensorOptions(scalar_type).device(DefaultDevice())); + for (bool accumulate : {false, true}) { + if (accumulate && IsCuda()) { + GTEST_SKIP(); + } + torch::Tensor result = torch::index_put( + params, {indices_null, indices_0, indices_1}, values, accumulate); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_params = CopyToDevice(params, device); + torch::Tensor lazy_indices_0 = CopyToDevice(indices_0, device); + torch::Tensor lazy_indices_1 = CopyToDevice(indices_1, device); + torch::Tensor lazy_values = CopyToDevice(values, device); + torch::Tensor lazy_result = torch::index_put( + lazy_params, {indices_null, lazy_indices_0, lazy_indices_1}, + lazy_values, accumulate); + AllEqual(result, lazy_result); + }); + } + } +} + +TEST_F(LazyOpsTest, TestMultiIndexPutMiddleNull) { + torch::Tensor indices_0 = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor indices_null; + torch::Tensor indices_1 = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor params = + isFloatingType(scalar_type) + ? 
torch::rand( + {4, 3, 3, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {4, 3, 3, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor values = torch::ones( + {3, 6, 7}, torch::TensorOptions(scalar_type).device(DefaultDevice())); + for (bool accumulate : {false, true}) { + if (accumulate && IsCuda()) { + GTEST_SKIP(); + } + torch::Tensor result = torch::index_put( + params, {indices_0, indices_null, indices_1}, values, accumulate); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_params = CopyToDevice(params, device); + torch::Tensor lazy_indices_0 = CopyToDevice(indices_0, device); + torch::Tensor lazy_indices_1 = CopyToDevice(indices_1, device); + torch::Tensor lazy_values = CopyToDevice(values, device); + torch::Tensor lazy_result = torch::index_put( + lazy_params, {lazy_indices_0, indices_null, lazy_indices_1}, + lazy_values, accumulate); + AllEqual(result, lazy_result); + }); + } + } +} + +TEST_F(LazyOpsTest, TestMultiIndexPutTailNull) { + torch::Tensor indices_0 = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor indices_1 = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor indices_null; + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor params = + isFloatingType(scalar_type) + ? torch::rand( + {4, 3, 3, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {4, 3, 3, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor values = torch::ones( + {3, 6, 7}, torch::TensorOptions(scalar_type).device(DefaultDevice())); + for (bool accumulate : {false, true}) { + if (accumulate && IsCuda()) { + GTEST_SKIP(); + } + torch::Tensor result = torch::index_put( + params, {indices_0, indices_1, indices_null}, values, accumulate); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_params = CopyToDevice(params, device); + torch::Tensor lazy_indices_0 = CopyToDevice(indices_0, device); + torch::Tensor lazy_indices_1 = CopyToDevice(indices_1, device); + torch::Tensor lazy_values = CopyToDevice(values, device); + torch::Tensor lazy_result = torch::index_put( + lazy_params, {lazy_indices_0, lazy_indices_1, indices_null}, + lazy_values, accumulate); + AllEqual(result, lazy_result); + }); + } + } +} + +TEST_F(LazyOpsTest, TestMultiIndexPutMiddleBroadcast) { + torch::Tensor indices_0 = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor indices_1 = torch::randint( + -3, 3, {2, 1, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor params = + isFloatingType(scalar_type) + ? 
torch::rand( + {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor values = torch::ones( + {5, 6, 7}, torch::TensorOptions(scalar_type).device(DefaultDevice())); + for (bool accumulate : {false, true}) { + if (accumulate && IsCuda()) { + GTEST_SKIP(); + } + torch::Tensor result = + torch::index_put(params, {indices_0, indices_1}, values, accumulate); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_params = CopyToDevice(params, device); + torch::Tensor lazy_indices_0 = CopyToDevice(indices_0, device); + torch::Tensor lazy_indices_1 = CopyToDevice(indices_1, device); + torch::Tensor lazy_values = CopyToDevice(values, device); + torch::Tensor lazy_result = torch::index_put( + lazy_params, {lazy_indices_0, lazy_indices_1}, lazy_values, accumulate); + AllEqual(result, lazy_result); + }); + } + } +} + +TEST_F(LazyOpsTest, TestMultiIndexPutTailBroadcast) { + torch::Tensor indices_0 = torch::randint( + -3, 3, {2, 1, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor indices_1 = torch::randint( + -3, 3, {2, 1}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor params = + isFloatingType(scalar_type) + ? torch::rand( + {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor values = torch::ones( + {5, 6, 7}, torch::TensorOptions(scalar_type).device(DefaultDevice())); + for (bool accumulate : {false, true}) { + if (accumulate && IsCuda()) { + GTEST_SKIP(); + } + torch::Tensor result = + torch::index_put(params, {indices_0, indices_1}, values, accumulate); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_params = CopyToDevice(params, device); + torch::Tensor lazy_indices_0 = CopyToDevice(indices_0, device); + torch::Tensor lazy_indices_1 = CopyToDevice(indices_1, device); + torch::Tensor lazy_values = CopyToDevice(values, device); + torch::Tensor lazy_result = torch::index_put( + lazy_params, {lazy_indices_0, lazy_indices_1}, lazy_values, accumulate); + AllEqual(result, lazy_result); + }); + } + } +} + +TEST_F(LazyOpsTest, TestMaskIndexPut) { + torch::Tensor indices = + torch::tensor({0, 1}, + torch::TensorOptions(torch::kByte).device(DefaultDevice())) + .to(torch::kBool); + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor params = + isFloatingType(scalar_type) + ? 
torch::rand( + {2, 2}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {2, 2}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor values = torch::ones( + {2}, torch::TensorOptions(scalar_type).device(DefaultDevice())); + for (bool accumulate : {false, true}) { + torch::Tensor result = + torch::index_put(params, {indices}, values, accumulate); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_params = CopyToDevice(params, device); + torch::Tensor lazy_indices = CopyToDevice(indices, device); + torch::Tensor lazy_values = CopyToDevice(values, device); + torch::Tensor lazy_result = + torch::index_put(lazy_params, {lazy_indices}, lazy_values, accumulate); + AllEqual(result, lazy_result); + }); + } + } +} + +TEST_F(LazyOpsTest, TestIndexPutImpl) { + torch::Tensor indices = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor values = + torch::ones({3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + for (bool accumulate : {false, true}) { + if (accumulate && IsCuda()) { + GTEST_SKIP(); + } + ForEachDevice([&](const torch::Device& device) { + torch::Tensor params = + isFloatingType(scalar_type) + ? torch::rand( + {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint(100, {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type) + .device(DefaultDevice())); + torch::Tensor lazy_params = CopyToDevice(params.clone(), device); + torch::Tensor result = torch::_index_put_impl_( + params, {indices}, values, accumulate, /*unsafe=*/true); + torch::Tensor lazy_indices = CopyToDevice(indices, device); + torch::Tensor lazy_values = CopyToDevice(values, device); + torch::Tensor lazy_result = torch::_index_put_impl_( + lazy_params, {lazy_indices}, lazy_values, accumulate, /*unsafe=*/true); + AllEqual(result, lazy_result); + AllEqual(params, lazy_params); + }); + } + } +} + +TEST_F(LazyOpsTest, TestIndexFillWithScalar) { + torch::Tensor index = torch::tensor( + {0, 2}, torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Scalar value = 42; + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor base = + isFloatingType(scalar_type) + ? 
torch::rand( + {3, 4, 5}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {3, 4, 5}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + int rank = base.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor result = torch::index_fill(base, dim, index, value); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_base = CopyToDevice(base, device); + torch::Tensor lazy_index = CopyToDevice(index, device); + torch::Tensor lazy_result = + torch::index_fill(lazy_base, dim, lazy_index, value); + AllEqual(result, lazy_result); + }); + } + } +} + +TEST_F(LazyOpsTest, TestIndexFillWithScalarInPlace) { + torch::Tensor index = torch::tensor( + {0, 2}, torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Scalar value = 42; + int rank = 3; + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + for (int dim = -rank; dim < rank; ++dim) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor base = + isFloatingType(scalar_type) + ? torch::rand( + {3, 4, 5}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint(100, {3, 4, 5}, + torch::TensorOptions(scalar_type) + .device(DefaultDevice())); + torch::Tensor lazy_base = CopyToDevice(base.clone(), device); + torch::Tensor result = base.index_fill_(dim, index, value); + torch::Tensor lazy_index = CopyToDevice(index, device); + torch::Tensor lazy_result = lazy_base.index_fill_(dim, lazy_index, value); + AllEqual(result, lazy_result); + AllEqual(base, lazy_base); + }); + } + } +} + +TEST_F(LazyOpsTest, TestIndexFillWithTensor) { + torch::Tensor index = torch::tensor( + {0, 2}, torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor base = + isFloatingType(scalar_type) + ? torch::rand( + {3, 4, 5}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {3, 4, 5}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor value = torch::scalar_tensor( + 42, torch::TensorOptions(scalar_type).device(DefaultDevice())); + int rank = base.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor result = torch::index_fill(base, dim, index, value); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_base = CopyToDevice(base, device); + torch::Tensor lazy_index = CopyToDevice(index, device); + torch::Tensor lazy_value = CopyToDevice(value, device); + torch::Tensor lazy_result = + torch::index_fill(lazy_base, dim, lazy_index, lazy_value); + AllEqual(result, lazy_result); + }); + } + } +} + +TEST_F(LazyOpsTest, TestIndexFillWithTensorInPlace) { + torch::Tensor index = torch::tensor( + {0, 2}, torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor value = torch::scalar_tensor( + 42, torch::TensorOptions(scalar_type).device(DefaultDevice())); + int rank = 3; + for (int dim = -rank; dim < rank; ++dim) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor base = + isFloatingType(scalar_type) + ? 
torch::rand( + {3, 4, 5}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint(100, {3, 4, 5}, + torch::TensorOptions(scalar_type) + .device(DefaultDevice())); + torch::Tensor lazy_base = CopyToDevice(base.clone(), device); + torch::Tensor result = base.index_fill_(dim, index, value); + torch::Tensor lazy_index = CopyToDevice(index, device); + torch::Tensor lazy_value = CopyToDevice(value, device); + torch::Tensor lazy_result = + lazy_base.index_fill_(dim, lazy_index, lazy_value); + AllEqual(result, lazy_result); + AllEqual(base, lazy_base); + }); + } + } +} + +TEST_F(LazyOpsTest, TestIndexFillRank0) { + torch::Tensor index = torch::scalar_tensor( + 2, torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor base = + isFloatingType(scalar_type) + ? torch::rand( + {3, 4, 5}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {3, 4, 5}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor value = torch::scalar_tensor( + 42, torch::TensorOptions(scalar_type).device(DefaultDevice())); + int rank = base.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor result = torch::index_fill(base, dim, index, value); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_base = CopyToDevice(base, device); + torch::Tensor lazy_index = CopyToDevice(index, device); + torch::Tensor lazy_value = CopyToDevice(value, device); + torch::Tensor lazy_result = + torch::index_fill(lazy_base, dim, lazy_index, lazy_value); + AllEqual(result, lazy_result); + }); + } + } +} + +TEST_F(LazyOpsTest, TestIndexAdd) { + int index_size = 10; + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor base = + isFloatingType(scalar_type) + ? torch::rand( + {5, 3, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {5, 3, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + int rank = base.dim(); + for (int dim = -rank; dim < rank; ++dim) { + for (torch::ScalarType index_scalar_type : {torch::kInt, torch::kLong}) { + torch::Tensor index = torch::randint( + 0, base.size(dim), {index_size}, + torch::TensorOptions(index_scalar_type).device(DefaultDevice())); + std::vector<int64_t> value_sizes(base.sizes().begin(), + base.sizes().end()); + int canonical_dim = dim < 0 ? dim + rank : dim; + value_sizes[canonical_dim] = index_size; + torch::Tensor value = + isFloatingType(scalar_type) + ? 
torch::rand( + value_sizes, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint(100, value_sizes, + torch::TensorOptions(scalar_type) + .device(DefaultDevice())); + torch::Tensor result = torch::index_add(base, dim, index, value); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_base = CopyToDevice(base, device); + torch::Tensor lazy_index = CopyToDevice(index, device); + torch::Tensor lazy_value = CopyToDevice(value, device); + torch::Tensor lazy_result = + torch::index_add(lazy_base, dim, lazy_index, lazy_value); + AllClose(result, lazy_result); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestIndexAddInPlace) { + int index_size = 10; + int rank = 3; + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + for (int dim = -rank; dim < rank; ++dim) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor base = + isFloatingType(scalar_type) + ? torch::rand( + {5, 3, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint(100, {5, 3, 7}, + torch::TensorOptions(scalar_type) + .device(DefaultDevice())); + torch::Tensor index = torch::randint( + 0, base.size(dim), {index_size}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + std::vector<int64_t> value_sizes(base.sizes().begin(), + base.sizes().end()); + int canonical_dim = dim < 0 ? dim + rank : dim; + value_sizes[canonical_dim] = index_size; + torch::Tensor value = + isFloatingType(scalar_type) + ? torch::rand( + value_sizes, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint(100, value_sizes, + torch::TensorOptions(scalar_type) + .device(DefaultDevice())); + torch::Tensor lazy_base = CopyToDevice(base.clone(), device); + torch::Tensor result = base.index_add_(dim, index, value); + torch::Tensor lazy_index = CopyToDevice(index, device); + torch::Tensor lazy_value = CopyToDevice(value, device); + torch::Tensor lazy_result = + lazy_base.index_add_(dim, lazy_index, lazy_value); + AllClose(result, lazy_result); + AllClose(base, lazy_base); + }); + } + } +} + +TEST_F(LazyOpsTest, TestIndexAddRank0) { + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor base = + isFloatingType(scalar_type) + ? torch::rand( + {5, 3, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {5, 3, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + int rank = base.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor index = torch::randint( + 0, base.size(dim), at::IntArrayRef{}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + std::vector<int64_t> value_sizes(base.sizes().begin(), + base.sizes().end()); + int canonical_dim = dim < 0 ? dim + rank : dim; + value_sizes[canonical_dim] = 1; + torch::Tensor value = + isFloatingType(scalar_type) + ? 
torch::rand( + value_sizes, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, value_sizes, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor result = torch::index_add(base, dim, index, value); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_base = CopyToDevice(base, device); + torch::Tensor lazy_index = CopyToDevice(index, device); + torch::Tensor lazy_value = CopyToDevice(value, device); + torch::Tensor lazy_result = + torch::index_add(lazy_base, dim, lazy_index, lazy_value); + AllEqual(result, lazy_result); + }); + } + } +} + +TEST_F(LazyOpsTest, TestIndexCopy) { + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor base = + isFloatingType(scalar_type) + ? torch::rand( + {5, 3, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {5, 3, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + int rank = base.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor index = torch::randperm( + base.size(dim), + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor value = + isFloatingType(scalar_type) + ? torch::rand( + base.sizes(), + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, base.sizes(), + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor result = torch::index_copy(base, dim, index, value); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_base = CopyToDevice(base, device); + torch::Tensor lazy_index = CopyToDevice(index, device); + torch::Tensor lazy_value = CopyToDevice(value, device); + torch::Tensor lazy_result = + torch::index_copy(lazy_base, dim, lazy_index, lazy_value); + AllEqual(result, lazy_result); + }); + } + } +} + +TEST_F(LazyOpsTest, TestIndexCopyInPlace) { + if (IsCuda()) { + GTEST_SKIP(); + } + int index_size = 10; + int rank = 3; + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + for (int dim = -rank; dim < rank; ++dim) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor base = + isFloatingType(scalar_type) + ? torch::rand( + {5, 3, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint(100, {5, 3, 7}, + torch::TensorOptions(scalar_type) + .device(DefaultDevice())); + torch::Tensor index = torch::randint( + 0, base.size(dim), {index_size}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + std::vector<int64_t> value_sizes(base.sizes().begin(), + base.sizes().end()); + int canonical_dim = dim < 0 ? dim + rank : dim; + value_sizes[canonical_dim] = index_size; + torch::Tensor value = + isFloatingType(scalar_type) + ? 
torch::rand( + value_sizes, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint(100, value_sizes, + torch::TensorOptions(scalar_type) + .device(DefaultDevice())); + torch::Tensor lazy_base = CopyToDevice(base.clone(), device); + torch::Tensor result = base.index_copy_(dim, index, value); + torch::Tensor lazy_index = CopyToDevice(index, device); + torch::Tensor lazy_value = CopyToDevice(value, device); + torch::Tensor lazy_result = + lazy_base.index_copy_(dim, lazy_index, lazy_value); + AllEqual(result, lazy_result); + AllEqual(base, lazy_base); + }); + } + } +} + +TEST_F(LazyOpsTest, TestIndexCopyRank0) { + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor base = + isFloatingType(scalar_type) + ? torch::rand( + {5, 3, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {5, 3, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + int rank = base.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor index = torch::randint( + 0, base.size(dim), at::IntArrayRef{}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + std::vector<int64_t> value_sizes(base.sizes().begin(), + base.sizes().end()); + int canonical_dim = dim < 0 ? dim + rank : dim; + value_sizes[canonical_dim] = 1; + torch::Tensor value = + isFloatingType(scalar_type) + ? torch::rand( + value_sizes, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, value_sizes, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor result = torch::index_copy(base, dim, index, value); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_base = CopyToDevice(base, device); + torch::Tensor lazy_index = CopyToDevice(index, device); + torch::Tensor lazy_value = CopyToDevice(value, device); + torch::Tensor lazy_result = + torch::index_copy(lazy_base, dim, lazy_index, lazy_value); + AllEqual(result, lazy_result); + }); + } + } +} + +TEST_F(LazyOpsTest, TestRelu) { + torch::Tensor input = + torch::rand({2, 1, 4, 6}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::relu(input); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::relu(lazy_input); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestReluInPlace) { + torch::Tensor input = + torch::rand({2, 1, 4, 6}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = torch::relu_(input); + torch::Tensor lazy_output = torch::relu_(lazy_input); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestHardshrink) { + torch::Tensor input = torch::randn( + {10}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::hardshrink(input); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::hardshrink(lazy_input); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestHardSigmoid) { + torch::Tensor input = torch::randn( + {10}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::hardsigmoid(input); + ForEachDevice([&](const 
torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::hardsigmoid(lazy_input); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestHardSigmoidInPlace) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor input = torch::randn( + {10}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = torch::hardsigmoid_(input); + torch::Tensor lazy_output = torch::hardsigmoid_(lazy_input); + AllClose(input, lazy_input); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestHardSigmoidBackward) { + auto testfn = [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor { + return torch::hardsigmoid(inputs[0]); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::randn({10}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); +} + +TEST_F(LazyOpsTest, TestSoftshrink) { + torch::Tensor input = torch::randn( + {10}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::softshrink(input); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::softshrink(lazy_input); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestHardtanh) { + torch::Tensor input = torch::randn( + {10}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::hardtanh(input); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::hardtanh(lazy_input); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestHardtanhInPlace) { + torch::Tensor input = torch::randn( + {10}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = torch::hardtanh_(input); + torch::Tensor lazy_output = torch::hardtanh_(lazy_input); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestLeakyRelu) { + torch::Tensor input = + torch::rand({2, 1, 4, 6}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + double negative_slope = 0.01; + torch::Tensor output = torch::leaky_relu(input, negative_slope); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::leaky_relu(lazy_input, negative_slope); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestLeakyReluInPlace) { + torch::Tensor input = + torch::rand({2, 1, 4, 6}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + double negative_slope = 0.01; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = torch::leaky_relu_(input, negative_slope); + torch::Tensor lazy_output = torch::leaky_relu_(lazy_input, negative_slope); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestExp) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::exp(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + 
torch::Tensor lazy_b = torch::exp(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestExpm1) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::expm1(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::expm1(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestLog) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::log(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::log(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestLog2) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::log2(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::log2(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestLog10) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::log10(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::log10(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestLog1p) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::log1p(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::log1p(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestErf) { + torch::Tensor a = torch::randn( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::erf(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::erf(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestErfc) { + torch::Tensor a = torch::randn( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::erfc(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::erfc(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestErfinv) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::erfinv(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::erfinv(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestSqrt) { + torch::Tensor a = torch::abs(torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()))); + torch::Tensor b = torch::sqrt(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::sqrt(lazy_a); + AllClose(b, lazy_b, 
/*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestRsqrt) { + torch::Tensor a = torch::abs(torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()))); + torch::Tensor b = torch::rsqrt(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::rsqrt(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestReciprocal) { + torch::Tensor a = torch::randn( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::reciprocal(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::reciprocal(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestPowTensorScalar) { + torch::Tensor base = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar exponent = 4.09; + torch::Tensor result = torch::pow(base, exponent); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_base = CopyToDevice(base, device); + torch::Tensor lazy_result = torch::pow(lazy_base, exponent); + AllClose(result, lazy_result, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestPowTensorScalarInPlace) { + torch::Tensor base = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar exponent = 4.09; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_base = CopyToDevice(base.clone(), device); + torch::Tensor result = base.pow_(exponent); + torch::Tensor lazy_result = lazy_base.pow_(exponent); + AllClose(result, lazy_result, /*rtol=*/1e-3, /*atol=*/1e-5); + AllClose(base, lazy_base, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestPowTensorTensor) { + torch::Tensor base = torch::abs(torch::rand( + {4, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()))); + torch::Tensor exponent = torch::rand( + {4, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor result = torch::pow(base, exponent); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_base = CopyToDevice(base, device); + torch::Tensor lazy_exponent = CopyToDevice(exponent, device); + torch::Tensor lazy_result = torch::pow(lazy_base, lazy_exponent); + AllClose(result, lazy_result, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestPowTensorTensorInPlace) { + torch::Tensor base = torch::abs(torch::rand( + {4, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()))); + torch::Tensor exponent = torch::rand( + {4, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_base = CopyToDevice(base.clone(), device); + torch::Tensor result = base.pow_(exponent); + torch::Tensor lazy_exponent = CopyToDevice(exponent, device); + torch::Tensor lazy_result = lazy_base.pow_(lazy_exponent); + AllClose(result, lazy_result, /*rtol=*/1e-3, /*atol=*/1e-5); + AllClose(base, lazy_base, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestPowTensorTensorBroadcast) { + torch::Tensor base = torch::abs(torch::rand( + {4, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()))); + torch::Tensor exponent = torch::rand( + {4, 1}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor result = torch::pow(base, exponent); + 
ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_base = CopyToDevice(base, device); + torch::Tensor lazy_exponent = CopyToDevice(exponent, device); + torch::Tensor lazy_result = torch::pow(lazy_base, lazy_exponent); + AllClose(result, lazy_result, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestPowScalarTensor) { + torch::Scalar base = 3.5; + torch::Tensor exponent = torch::rand({4, 2}); + torch::Tensor result = torch::pow(base, exponent); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_exponent = CopyToDevice(exponent, device); + torch::Tensor lazy_result = torch::pow(base, lazy_exponent); + AllClose(result, lazy_result, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestPowIntExponent) { + torch::Tensor base = torch::abs(torch::rand( + {4, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()))); + torch::Scalar exponent = 3; + torch::Tensor result = torch::pow(base, exponent); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_base = CopyToDevice(base, device); + torch::Tensor lazy_result = torch::pow(lazy_base, exponent); + AllClose(result, lazy_result, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestFmodScalar) { + torch::Tensor a = + torch::rand({2, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Scalar divisor = 2.0; + torch::Tensor b = torch::fmod(a, divisor); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::fmod(lazy_a, divisor); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestFmodScalarInPlace) { + torch::Scalar divisor = 2.0; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor a = + torch::rand( + {2, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor b = a.fmod_(divisor); + torch::Tensor lazy_b = lazy_a.fmod_(divisor); + AllClose(b, lazy_b); + AllClose(a, lazy_a); + }); +} + +TEST_F(LazyOpsTest, TestFmodTensor) { + torch::Tensor a = + torch::rand({2, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Tensor b = + torch::rand({2, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 10.0; + torch::Tensor c = torch::fmod(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::fmod(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestFmodTensorInPlace) { + torch::Tensor b = + torch::rand({2, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 10.0; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor a = + torch::rand( + {2, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor c = a.fmod_(b); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = lazy_a.fmod_(lazy_b); + AllClose(c, lazy_c); + AllClose(a, lazy_a); + }); +} + +TEST_F(LazyOpsTest, TestRemainderScalar) { + torch::Tensor a = + torch::randn( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Scalar divisor = -2.0; + torch::Tensor b = torch::remainder(a, divisor); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + 
torch::Tensor lazy_b = torch::remainder(lazy_a, divisor); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestRemainderScalarInPlace) { + torch::Scalar divisor = -2.0; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor a = + torch::randn( + {2, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor b = a.remainder_(divisor); + torch::Tensor lazy_b = lazy_a.remainder_(divisor); + AllClose(b, lazy_b); + AllClose(a, lazy_a); + }); +} + +TEST_F(LazyOpsTest, TestRemainderTensor) { + torch::Tensor a = + torch::randn( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Tensor b = + torch::randn( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 10.0; + torch::Tensor c = torch::remainder(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::remainder(lazy_a, lazy_b); + AllClose(c, lazy_c, /*rtol=*/1e-4, /*atol=*/1e-6); + }); +} + +TEST_F(LazyOpsTest, TestRemainderTensorInPlace) { + torch::Tensor b = + torch::randn( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 10.0; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor a = + torch::randn( + {2, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor c = a.remainder_(b); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = lazy_a.remainder_(lazy_b); + AllClose(c, lazy_c, /*rtol=*/1e-4, /*atol=*/1e-6); + AllClose(a, lazy_a, /*rtol=*/1e-4, /*atol=*/1e-6); + }); +} + +TEST_F(LazyOpsTest, TestWhere) { + torch::Tensor a = torch::rand( + {3, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {3, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::empty( + {3, 3}, torch::TensorOptions(torch::kByte).device(DefaultDevice())); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + c[i][j] = i == j; + } + } + torch::Tensor d = torch::where(c, a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = torch::where(lazy_c, lazy_a, lazy_b); + AllClose(d, lazy_d); + }); +} + +TEST_F(LazyOpsTest, TestWhereBroadcast) { + torch::Tensor a = torch::rand( + {3, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::zeros( + {}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::empty( + {3, 3}, torch::TensorOptions(torch::kByte).device(DefaultDevice())); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + c[i][j] = i == j; + } + } + torch::Tensor d = torch::where(c, a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = torch::where(lazy_c, lazy_a, lazy_b); + AllClose(d, lazy_d); + }); +} + +TEST_F(LazyOpsTest, TestThreshold) { + torch::Tensor input = + torch::rand({2, 1, 4, 6}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + float threshold = 0.4; + float value 
= 20; + torch::Tensor output = torch::threshold(input, threshold, value); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::threshold(lazy_input, threshold, value); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestThresholdBackward) { + float threshold = 0.4; + float value = 20; + + auto testFunction = [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor { + return torch::threshold(inputs[0], threshold, value); + }; + + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({2, 1, 4, 6}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testFunction); + }); +} + +TEST_F(LazyOpsTest, TestThresholdInPlace) { + torch::Tensor input = + torch::rand({2, 1, 4, 6}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = input.clone(); + float threshold = 0.4; + float value = 20; + torch::threshold_(output, threshold, value); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_output = CopyToDevice(input, device); + torch::threshold_(lazy_output, threshold, value); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestElu) { + torch::Tensor input = + torch::rand({2, 1, 4, 6}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar alpha = 0.5; + torch::Scalar scale = 2.5; + torch::Scalar input_scale = 1.5; + torch::Tensor output = torch::elu(input, alpha, scale, input_scale); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::elu(lazy_input, alpha, scale, input_scale); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestEluInPlace) { + torch::Tensor input = + torch::rand({2, 1, 4, 6}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar alpha = 0.5; + torch::Scalar scale = 2.5; + torch::Scalar input_scale = 1.5; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = torch::elu_(input, alpha, scale, input_scale); + torch::Tensor lazy_output = + torch::elu_(lazy_input, alpha, scale, input_scale); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestSelu) { + torch::Tensor input = + torch::rand({2, 1, 4, 6}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::selu(input); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::selu(lazy_input); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestSeluInPlace) { + torch::Tensor input = + torch::rand({2, 1, 4, 6}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = torch::selu_(input); + torch::Tensor lazy_output = torch::selu_(lazy_input); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestCelu) { + torch::Tensor input = + torch::rand({2, 1, 4, 6}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar alpha = 2.5; + torch::Tensor output = torch::celu(input, alpha); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = 
CopyToDevice(input, device); + torch::Tensor lazy_output = torch::celu(lazy_input, alpha); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestCeluInPlace) { + torch::Tensor input = + torch::rand({2, 1, 4, 6}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar alpha = 2.5; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = torch::celu_(input, alpha); + torch::Tensor lazy_output = torch::celu_(lazy_input, alpha); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestGelu) { + torch::Tensor input = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::gelu(input); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::gelu(lazy_input); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestAddMatMul) { + int in_channels = 32; + int out_channels = 320; + int labels = 50; + torch::Tensor input = + torch::rand({in_channels, out_channels}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor weight = + torch::rand({out_channels, labels}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor bias = torch::rand( + {labels}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + // Test beta != 1. through the CPU interop. + for (double beta : {1., 2.}) { + torch::Tensor output = torch::addmm(bias, input, weight, /*beta=*/beta); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_weight = CopyToDevice(weight, device); + torch::Tensor lazy_bias = CopyToDevice(bias, device); + torch::Tensor lazy_output = + torch::addmm(lazy_bias, lazy_input, lazy_weight, /*beta=*/beta); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestEmbedding) { + torch::Tensor a = torch::rand( + {32, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor i = torch::randint( + 0, 31, {3, 4}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor b = + torch::embedding(a, i, /*padding_idx=*/0, /*scale_grad_by_freq=*/false, + /*sparse=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_i = CopyToDevice(i, device); + torch::Tensor lazy_b = torch::embedding(lazy_a, lazy_i, /*padding_idx=*/0, + /*scale_grad_by_freq=*/false, + /*sparse=*/false); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestOneHot) { + int num_classes = 5; + torch::Tensor input = torch::randint( + 0, num_classes, {10}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor output = torch::one_hot(input, num_classes); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::one_hot(lazy_input, num_classes); + AllEqual(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestTranspose) { + torch::Tensor input = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::t(input); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::t(lazy_input); + AllClose(output, 
lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestTransposeInPlace) { + torch::Tensor input = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = input.t_(); + torch::Tensor lazy_output = lazy_input.t_(); + EXPECT_EQ(lazy_output.sizes(), output.sizes()); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestReshape) { + torch::Tensor input = + torch::rand({32, 20, 4, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::reshape(input, {-1, 320}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::reshape(lazy_input, {-1, 320}); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestResize) { + // Testing a resize_() with target size bigger than original size is not + // possible, as we fill with zeros, while pytorch fills with random garbage. + torch::Tensor input = torch::rand( + {2, 2, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor saved_input = input.clone(); + input.resize_({3, 3}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(saved_input, device); + lazy_input.resize_({3, 3}); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestViewResize) { + torch::Tensor input = torch::zeros( + {8, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor saved_input = input.clone(); + torch::Tensor output = input.view({4, 4}); + output.resize_({3, 3}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(saved_input, device); + torch::Tensor lazy_output = lazy_input.view({4, 4}); + lazy_output.resize_({3, 3}); + AllClose(input, lazy_input); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestView) { + torch::Tensor input = + torch::rand({32, 20, 4, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = input.view({-1, 320}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = lazy_input.view({-1, 320}); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestViewMod) { + torch::Tensor input = + torch::zeros({32, 20, 4, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor one = torch::tensor( + 1.0, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = input.view({-1, 320}); + output.add_(one, 1.0); + input.add_(one, 1.0); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor xinput = torch::zeros( + {32, 20, 4, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_input = CopyToDevice(xinput, device); + torch::Tensor lazy_one = CopyToDevice(one, device); + torch::Tensor lazy_output = lazy_input.view({-1, 320}); + lazy_output.add_(lazy_one, 1.0); + lazy_input.add_(lazy_one, 1.0); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestViewModComplex) { + torch::Tensor input = + torch::zeros({32, 20, 4, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor one = torch::tensor( + 1.0, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); 
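+ // Note: output1 and output2 below are two views of the same base tensor, so each in-place add must be reflected through the shared storage in both views.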
+ torch::Tensor output1 = input.view({-1, 320}); + output1.add_(one, 1.0); + torch::Tensor output2 = input.view({-1, 160}); + output2.add_(one, 1.0); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor xinput = torch::zeros( + {32, 20, 4, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_input = CopyToDevice(xinput, device); + torch::Tensor lazy_one = CopyToDevice(one, device); + torch::Tensor lazy_output1 = lazy_input.view({-1, 320}); + lazy_output1.add_(lazy_one, 1.0); + torch::Tensor lazy_output2 = lazy_input.view({-1, 160}); + lazy_output2.add_(lazy_one, 1.0); + AllClose(output1, lazy_output1); + AllClose(output2, lazy_output2); + }); +} + +TEST_F(LazyOpsTest, TestViewOfViewMod) { + torch::Tensor input = + torch::zeros({32, 20, 4, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor one = torch::tensor( + 1.0, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output1 = input.view({-1, 320}); + output1.add_(one, 1.0); + torch::Tensor output2 = output1.view({-1, 160}); + output2.add_(one, 1.0); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor xinput = torch::zeros( + {32, 20, 4, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_input = CopyToDevice(xinput, device); + torch::Tensor lazy_one = CopyToDevice(one, device); + torch::Tensor lazy_output1 = lazy_input.view({-1, 320}); + lazy_output1.add_(lazy_one, 1.0); + torch::Tensor lazy_output2 = lazy_output1.view({-1, 160}); + lazy_output2.add_(lazy_one, 1.0); + AllClose(output1, lazy_output1); + AllClose(output2, lazy_output2); + }); +} + +TEST_F(LazyOpsTest, TestViewSqueezeAddInPlace) { + torch::Tensor input = torch::zeros( + {2, 3, 1}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector<int64_t> view_size = {2, 3, 1, 1}; + int squeeze_dim = 2; + torch::Tensor one = torch::tensor( + 1.0, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = input.view(view_size); + output.squeeze_(squeeze_dim); + output.add_(one, 1.0); + torch::Tensor lazy_one = CopyToDevice(one, device); + torch::Tensor lazy_output = lazy_input.view(view_size); + lazy_output.squeeze_(squeeze_dim); + lazy_output.add_(lazy_one, 1.0); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestUnsafeView) { + torch::Tensor input = + torch::rand({32, 20, 4, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::_unsafe_view(input, {-1, 320}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::_unsafe_view(lazy_input, {-1, 320}); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestNarrow) { + torch::Tensor a = + torch::rand({8, 10, 4, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int64_t dim : {1, -3}) { + for (int64_t start : {2, -8}) { + torch::Tensor b = a.narrow(dim, start, 6); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = lazy_a.narrow(dim, start, 6); + AllClose(b, lazy_b); + }); + } + } +} + +TEST_F(LazyOpsTest, TestNarrowUpdate) { + for (int64_t dim : {1, -2}) { + for (int64_t start : {2, -6}) { + torch::Tensor a = torch::rand( + {3, 8, 3}, + 
torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor a_copy = a.clone(); + torch::Tensor b = torch::rand( + {3, 4, 3}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = a.narrow(dim, start, 4); + c.add_(b, 1.0); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a_copy, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = lazy_a.narrow(dim, start, 4); + lazy_c.add_(lazy_b, 1.0); + AllClose(c, lazy_c); + }); + } + } +} + +TEST_F(LazyOpsTest, TestNarrowUpdateBaseCheck) { + for (int64_t dim : {0, -2}) { + for (int64_t start : {2, -6}) { + torch::Tensor a = torch::zeros( + {8, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor a_copy = a.clone(); + torch::Tensor b = torch::ones( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = a.narrow(dim, start, 4); + c.add_(b, 1.0); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a_copy, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = lazy_a.narrow(dim, start, 4); + lazy_c.add_(lazy_b, 1.0); + AllClose(a, lazy_a); + }); + } + } +} + +TEST_F(LazyOpsTest, TestNarrowUpdateTwoSlices) { + for (int64_t dim : {0, -2}) { + for (int64_t start0 : {2, -6}) { + for (int64_t start1 : {6, -2}) { + torch::Tensor a = torch::zeros( + {8, 3}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor a_copy = a.clone(); + torch::Tensor b = torch::ones( + {2, 3}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = b + 1; + torch::Tensor d = a.narrow(dim, start0, 2); + torch::Tensor e = a.narrow(dim, start1, 2); + d.add_(b, 1.0); + e.add_(c, 1.0); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a_copy, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = lazy_a.narrow(dim, start0, 2); + torch::Tensor lazy_e = lazy_a.narrow(dim, start1, 2); + lazy_d.add_(lazy_b, 1.0); + lazy_e.add_(lazy_c, 1.0); + AllClose(d, lazy_d); + AllClose(e, lazy_e); + AllClose(a, lazy_a); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestNarrowUpdateView) { + for (int64_t dim : {0, -3}) { + for (int64_t start : {2, -6}) { + torch::Tensor a = torch::rand( + {8, 2, 3}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor a_copy = a.clone(); + torch::Tensor b = torch::rand( + {4, 6}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = a.narrow(dim, start, 4); + torch::Tensor d = c.view({4, 6}); + d.add_(b, 1.0); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a_copy, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = lazy_a.narrow(dim, start, 4); + torch::Tensor lazy_d = lazy_c.view({4, 6}); + lazy_d.add_(lazy_b, 1.0); + AllClose(d, lazy_d); + }); + } + } +} + +TEST_F(LazyOpsTest, TestNarrowInNarrowUpdate) { + for (int64_t dim : {1, -2}) { + for (int64_t start0 : {1, -7}) { + for (int64_t start1 : {1, -5}) { + torch::Tensor a = torch::rand( + {3, 8, 3}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor a_copy = a.clone(); + torch::Tensor b = torch::rand( + {3, 2, 3}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = a.narrow(dim, start0, 6); + torch::Tensor d = 
c.narrow(dim, start1, 2); + d.add_(b, 1.0); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a_copy, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = lazy_a.narrow(dim, start0, 6); + torch::Tensor lazy_d = lazy_c.narrow(dim, start1, 2); + lazy_d.add_(lazy_b, 1.0); + AllClose(a, lazy_a); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestNarrowCopy) { + for (int64_t dim : {1, -3}) { + for (int64_t start : {2, -8}) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor input = torch::rand( + {8, 10, 4, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor result = input.narrow_copy(dim, start, 6); + input.add_(1); + torch::Tensor lazy_result = lazy_input.narrow_copy(dim, start, 6); + lazy_input.add_(1); + AllClose(result, lazy_result); + }); + } + } +} + +TEST_F(LazyOpsTest, TestViewAs) { + torch::Tensor input = + torch::rand({32, 20, 4, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor empty = torch::empty({32, 320}); + torch::Tensor output = input.view_as(empty); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_empty = CopyToDevice(empty, device); + torch::Tensor lazy_output = lazy_input.view_as(lazy_empty); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestLogSoftmax) { + torch::Tensor input = + torch::rand({5, 3, 4, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor output = torch::log_softmax(input, dim); + torch::Tensor lazy_output = torch::log_softmax(lazy_input, dim); + AllClose(output, lazy_output, /*rtol=*/1e-3); + } + }); +} + +TEST_F(LazyOpsTest, TestLogSoftmaxCast) { + torch::Tensor input = + torch::rand({5, 3, 4, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor output = torch::log_softmax(input, dim, torch::kDouble); + torch::Tensor lazy_output = + torch::log_softmax(lazy_input, dim, torch::kDouble); + AllClose(output, lazy_output, /*rtol=*/1e-3); + } + }); +} + +TEST_F(LazyOpsTest, TestLogSoftmaxWrapper) { + torch::Tensor input = + torch::rand({10, 2, 6, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor output = + torch::_log_softmax(input, dim, /*half_to_float=*/false); + torch::Tensor lazy_output = + torch::_log_softmax(lazy_input, dim, /*half_to_float=*/false); + AllClose(output, lazy_output, /*rtol=*/1e-3); + } + }); +} + +TEST_F(LazyOpsTest, TestSoftmax) { + torch::Tensor input = + torch::rand({10, 2, 6, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor output = torch::softmax(input, dim); + torch::Tensor lazy_output = 
torch::softmax(lazy_input, dim); + AllClose(output, lazy_output, /*rtol=*/1e-3); + } + }); +} + +TEST_F(LazyOpsTest, TestSoftmaxCast) { + torch::Tensor input = + torch::rand({10, 2, 6, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor output = torch::softmax(input, dim, torch::kDouble); + torch::Tensor lazy_output = torch::softmax(lazy_input, dim, torch::kDouble); + AllClose(output, lazy_output, /*rtol=*/1e-3); + } + }); +} + +TEST_F(LazyOpsTest, TestSoftmaxWrapper) { + torch::Tensor input = + torch::rand({10, 2, 6, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor output = + torch::_softmax(input, dim, /*half_to_float=*/false); + torch::Tensor lazy_output = + torch::_softmax(lazy_input, dim, /*half_to_float=*/false); + AllClose(output, lazy_output, /*rtol=*/1e-3); + } + }); +} + +TEST_F(LazyOpsTest, TestSoftplus) { + torch::Tensor input = + torch::rand({2, 1, 4, 6}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::softplus(input); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::softplus(lazy_input); + AllClose(output, lazy_output, /*rtol=*/1e-4); + }); +} + +TEST_F(LazyOpsTest, TestMaxPool1D) { + torch::Tensor input = torch::rand( + {1, 16, 56}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 3; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + // Test dilation through the CPU interop. + for (int dilation = 1; dilation <= 2; ++dilation) { + torch::Tensor output = + torch::max_pool1d(input, /*kernel_size=*/{kernel_size}, + /*stride=*/{stride}, + /*padding=*/{padding}, /*dilation=*/{dilation}, + /*ceil_mode=*/ceil_mode); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = + torch::max_pool1d(lazy_input, + /*kernel_size=*/{kernel_size}, + /*stride=*/{stride}, + /*padding=*/{padding}, + /*dilation=*/{dilation}, + /*ceil_mode=*/ceil_mode); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestMaxPool2D) { + torch::Tensor input = + torch::rand({1, 4, 14, 14}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 3; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + // Test dilation through the CPU interop. 
+ for (int dilation = 1; dilation <= 2; ++dilation) { + torch::Tensor output = torch::max_pool2d( + input, /*kernel_size=*/{kernel_size, kernel_size}, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, /*dilation=*/{dilation, dilation}, + /*ceil_mode=*/ceil_mode); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = + torch::max_pool2d(lazy_input, + /*kernel_size=*/{kernel_size, kernel_size}, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, + /*dilation=*/{dilation, dilation}, + /*ceil_mode=*/ceil_mode); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestMaxPool2DWithIndices) { + torch::Tensor input = + torch::rand({1, 4, 14, 14}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 3; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + // Test dilation through the CPU interop. + for (int dilation = 1; dilation <= 2; ++dilation) { + auto outputs = torch::max_pool2d_with_indices( + input, /*kernel_size=*/{kernel_size, kernel_size}, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, /*dilation=*/{dilation, dilation}, + /*ceil_mode=*/ceil_mode); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + auto lazy_outputs = torch::max_pool2d_with_indices( + lazy_input, + /*kernel_size=*/{kernel_size, kernel_size}, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, + /*dilation=*/{dilation, dilation}, + /*ceil_mode=*/ceil_mode); + AllClose(std::get<0>(outputs), std::get<0>(lazy_outputs)); + AllClose(std::get<1>(outputs), std::get<1>(lazy_outputs)); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestMaxPool2DNonSquare) { + torch::Tensor input = + torch::rand({1, 4, 14, 14}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 4; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + // Test dilation through the CPU interop. + for (int dilation = 1; dilation <= 2; ++dilation) { + torch::Tensor output = torch::max_pool2d( + input, /*kernel_size=*/{kernel_size, kernel_size + 1}, + /*stride=*/{stride, stride + 1}, + /*padding=*/{padding, padding + 1}, + /*dilation=*/{dilation, dilation}, + /*ceil_mode=*/ceil_mode); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::max_pool2d( + lazy_input, + /*kernel_size=*/{kernel_size, kernel_size + 1}, + /*stride=*/{stride, stride + 1}, + /*padding=*/{padding, padding + 1}, + /*dilation=*/{dilation, dilation}, + /*ceil_mode=*/ceil_mode); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestMaxPool3D) { + torch::Tensor input = + torch::rand({1, 1, 8, 8, 8}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 3; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + // Test dilation through the CPU interop. 
+ for (int dilation = 1; dilation <= 2; ++dilation) { + torch::Tensor output = torch::max_pool3d( + input, /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}, + /*dilation=*/{dilation, dilation, dilation}, + /*ceil_mode=*/ceil_mode); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::max_pool3d( + lazy_input, + /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}, + /*dilation=*/{dilation, dilation, dilation}, + /*ceil_mode=*/ceil_mode); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestMaxPool3DWithIndices) { + torch::Tensor input = + torch::rand({1, 1, 8, 8, 8}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 3; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + // Test dilation through the CPU interop. + for (int dilation = 1; dilation <= 2; ++dilation) { + auto outputs = torch::max_pool3d_with_indices( + input, /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}, + /*dilation=*/{dilation, dilation, dilation}, + /*ceil_mode=*/ceil_mode); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + auto lazy_outputs = torch::max_pool3d_with_indices( + lazy_input, + /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}, + /*dilation=*/{dilation, dilation, dilation}, + /*ceil_mode=*/ceil_mode); + + AllClose(std::get<0>(outputs), std::get<0>(lazy_outputs)); + AllClose(std::get<1>(outputs), std::get<1>(lazy_outputs)); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestMaxPool3DIncompleteAttributes) { + torch::Tensor input = + torch::rand({1, 1, 8, 8, 8}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 3; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + // Test dilation through the CPU interop. + for (int dilation = 1; dilation <= 2; ++dilation) { + torch::Tensor output = torch::max_pool3d( + input, /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{}, + /*padding=*/{padding}, + /*dilation=*/{dilation, dilation, dilation}, + /*ceil_mode=*/ceil_mode); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::max_pool3d( + lazy_input, + /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{}, + /*padding=*/{padding}, + /*dilation=*/{dilation, dilation, dilation}, + /*ceil_mode=*/ceil_mode); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestMaxPool3DNonSquare) { + torch::Tensor input = + torch::rand({1, 1, 8, 8, 8}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 4; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. 
+ for (bool ceil_mode : {false, true}) { + // Test dilation through the CPU interop. + for (int dilation = 1; dilation <= 2; ++dilation) { + torch::Tensor output = torch::max_pool3d( + input, + /*kernel_size=*/{kernel_size, kernel_size + 1, kernel_size}, + /*stride=*/{stride, stride + 1, stride}, + /*padding=*/{padding, padding + 1, padding}, + /*dilation=*/{dilation, dilation, dilation}, + /*ceil_mode=*/ceil_mode); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::max_pool3d( + lazy_input, + /*kernel_size=*/{kernel_size, kernel_size + 1, kernel_size}, + /*stride=*/{stride, stride + 1, stride}, + /*padding=*/{padding, padding + 1, padding}, + /*dilation=*/{dilation, dilation, dilation}, + /*ceil_mode=*/ceil_mode); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestMaxPool2DNoBatch) { + torch::Tensor input = torch::rand( + {4, 14, 14}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 3; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + // Test dilation through the CPU interop. + for (int dilation = 1; dilation <= 2; ++dilation) { + torch::Tensor output = torch::max_pool2d( + input, /*kernel_size=*/{kernel_size, kernel_size}, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, /*dilation=*/{dilation, dilation}, + /*ceil_mode=*/ceil_mode); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = + torch::max_pool2d(lazy_input, + /*kernel_size=*/{kernel_size, kernel_size}, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, + /*dilation=*/{dilation, dilation}, + /*ceil_mode=*/ceil_mode); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestMaxPool3DNoBatch) { + torch::Tensor input = + torch::rand({1, 8, 8, 8}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 3; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + // Test dilation through the CPU interop. + for (int dilation = 1; dilation <= 2; ++dilation) { + torch::Tensor output = torch::max_pool3d( + input, /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}, + /*dilation=*/{dilation, dilation, dilation}, + /*ceil_mode=*/ceil_mode); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::max_pool3d( + lazy_input, + /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}, + /*dilation=*/{dilation, dilation, dilation}, + /*ceil_mode=*/ceil_mode); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestAvgPool1D) { + torch::Tensor input = torch::rand( + {4, 1, 28}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 2; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + for (bool count_include_pad : {true, false}) { + // Test ceil_mode=true through the CPU interop. 
+ for (bool ceil_mode : {false, true}) { + torch::Tensor output = + torch::avg_pool1d(input, /*kernel_size=*/{kernel_size}, + /*stride=*/{stride}, + /*padding=*/{padding}, /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = + torch::avg_pool1d(lazy_input, + /*kernel_size=*/{kernel_size}, + /*stride=*/{stride}, + /*padding=*/{padding}, + /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestAvgPool2D) { + torch::Tensor input = + torch::rand({2, 1, 14, 14}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 2; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + for (bool count_include_pad : {true, false}) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + torch::Tensor output = torch::avg_pool2d( + input, /*kernel_size=*/{kernel_size, kernel_size}, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + ForEachDevice([&](const torch::Device& device) { + // torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = + torch::avg_pool2d(lazy_input, + /*kernel_size=*/{kernel_size, kernel_size}, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, + /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + AllClose(output, lazy_output.to(torch::kCPU)); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestAvgPool2DNonSquare) { + torch::Tensor input = + torch::rand({2, 1, 14, 14}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 4; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + for (bool count_include_pad : {true, false}) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + torch::Tensor output = torch::avg_pool2d( + input, /*kernel_size=*/{kernel_size, kernel_size + 1}, + /*stride=*/{stride, stride + 1}, + /*padding=*/{padding, padding + 1}, /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::avg_pool2d( + lazy_input, + /*kernel_size=*/{kernel_size, kernel_size + 1}, + /*stride=*/{stride, stride + 1}, + /*padding=*/{padding, padding + 1}, + /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestAvgPool3D) { + torch::Tensor input = + torch::rand({1, 1, 7, 7, 7}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 2; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + for (bool count_include_pad : {true, false}) { + // Test ceil_mode=true through the CPU interop. 
+ for (bool ceil_mode : {false, true}) { + torch::Tensor output = torch::avg_pool3d( + input, /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}, /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::avg_pool3d( + lazy_input, + /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}, + /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestAvgPool3DIncompleteAttributes) { + torch::Tensor input = + torch::rand({1, 1, 7, 7, 7}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 2; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + for (bool count_include_pad : {true, false}) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + torch::Tensor output = torch::avg_pool3d( + input, /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{}, + /*padding=*/{padding, padding, padding}, /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::avg_pool3d( + lazy_input, + /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{}, + /*padding=*/{padding, padding, padding}, + /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestAvgPool3DNonSquare) { + torch::Tensor input = + torch::rand({1, 1, 7, 7, 7}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 4; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + for (bool count_include_pad : {true, false}) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + torch::Tensor output = torch::avg_pool3d( + input, + /*kernel_size=*/{kernel_size, kernel_size + 1, kernel_size}, + /*stride=*/{stride, stride + 1, stride}, + /*padding=*/{padding, padding + 1, padding}, + /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::avg_pool3d( + lazy_input, + /*kernel_size=*/{kernel_size, kernel_size + 1, kernel_size}, + /*stride=*/{stride, stride + 1, stride}, + /*padding=*/{padding, padding + 1, padding}, + /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestAvgPool2DNoBatch) { + torch::Tensor input = torch::rand( + {1, 7, 7}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 2; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + for (bool count_include_pad : {true, false}) { + // Test ceil_mode=true through the CPU interop. 
+ for (bool ceil_mode : {false, true}) { + torch::Tensor output = torch::avg_pool2d( + input, /*kernel_size=*/{kernel_size, kernel_size}, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = + torch::avg_pool2d(lazy_input, + /*kernel_size=*/{kernel_size, kernel_size}, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, + /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestAvgPool3DNoBatch) { + torch::Tensor input = + torch::rand({1, 7, 7, 7}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 2; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + for (bool count_include_pad : {true, false}) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + torch::Tensor output = torch::avg_pool3d( + input, /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}, /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::avg_pool3d( + lazy_input, + /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}, + /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestAdaptiveAvgPool2D) { + torch::Tensor input = + torch::rand({4, 1, 28, 28}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int64_t output_size : {7, 4}) { + torch::Tensor output = + torch::adaptive_avg_pool2d(input, {output_size, output_size}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = + torch::adaptive_avg_pool2d(lazy_input, {output_size, output_size}); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestAdaptiveAvgPool3D) { + torch::Tensor input = + torch::rand({9, 4, 56, 28, 28}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int64_t output_size : {7, 4}) { + torch::Tensor output = torch::adaptive_avg_pool3d( + input, {output_size, output_size, output_size}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::adaptive_avg_pool3d( + lazy_input, {output_size, output_size, output_size}); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestAdaptiveAvgPool3DNoBatch) { + torch::Tensor input = + torch::rand({3, 56, 28, 28}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int64_t output_size : {7, 4}) { + torch::Tensor output = torch::adaptive_avg_pool3d( + input, {output_size, output_size, output_size}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::adaptive_avg_pool3d( + lazy_input, {output_size, output_size, output_size}); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, 
TestAdaptiveAvgPool2DNoBatch) { + torch::Tensor input = torch::rand( + {1, 56, 56}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int64_t output_size : {7, 8}) { + torch::Tensor output = + torch::adaptive_avg_pool2d(input, {output_size, output_size}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = + torch::adaptive_avg_pool2d(lazy_input, {output_size, output_size}); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestMaxUnpool2D) { + int kernel_size = 2; + torch::Tensor input = + torch::rand({2, 2, 8, 8}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + // Test dilation through the CPU interop. + for (int dilation = 1; dilation <= 2; ++dilation) { + torch::Tensor output; + torch::Tensor indices; + std::tie(output, indices) = torch::max_pool2d_with_indices( + input, /*kernel_size=*/{kernel_size, kernel_size}, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, /*dilation=*/{dilation, dilation}, + /*ceil_mode=*/ceil_mode); + + std::vector<int64_t> output_size({input.size(2), input.size(3)}); + at::Tensor utensor = + torch::max_unpool2d(output, indices, output_size); + + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_output = CopyToDevice(output, device); + torch::Tensor lazy_indices = CopyToDevice(indices, device); + at::Tensor lazy_utensor = + torch::max_unpool2d(lazy_output, lazy_indices, output_size); + AllClose(utensor, lazy_utensor); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestMaxUnpool3D) { + int kernel_size = 2; + torch::Tensor input = + torch::rand({1, 1, 4, 4, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + // Test dilation through the CPU interop. + for (int dilation = 1; dilation <= 2; ++dilation) { + torch::Tensor output; + torch::Tensor indices; + std::tie(output, indices) = torch::max_pool3d_with_indices( + input, /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}, + /*dilation=*/{dilation, dilation, dilation}, + /*ceil_mode=*/ceil_mode); + + std::vector<int64_t> output_size( + {input.size(2), input.size(3), input.size(4)}); + at::Tensor utensor = torch::max_unpool3d( + output, indices, output_size, /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}); + + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_output = CopyToDevice(output, device); + torch::Tensor lazy_indices = CopyToDevice(indices, device); + at::Tensor lazy_utensor = + torch::max_unpool3d(lazy_output, lazy_indices, output_size, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}); + AllClose(utensor, lazy_utensor); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestNllLoss) { + + // TODO(whc) debug divide-by-zero failure under ASAN + GTEST_SKIP(); + + int batch = 6; + int classes = 2; + // TODO(asuhan): Fix the torch::kDouble case. 
+ for (auto dtype : {torch::kFloat}) { + for (int ignore_index : {-1, 0, 1, 5}) { + for (bool def_weight : {false, true}) { + torch::Tensor input = + torch::rand({batch, classes}, + torch::TensorOptions(dtype).device(DefaultDevice())); + torch::Tensor target = torch::randint( + std::min(ignore_index, 0), classes, {batch}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor weight; + if (def_weight) { + weight = torch::rand( + {classes}, torch::TensorOptions(dtype).device(DefaultDevice())); + } + for (torch::Reduction::Reduction reduction : + {torch::Reduction::Mean, torch::Reduction::Sum, + torch::Reduction::None}) { + torch::Tensor output = + torch::nll_loss(/*self=*/input, /*target=*/target, + /*weight=*/weight, + /*reduction=*/reduction, + /*ignore_index=*/ignore_index); + + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_target = CopyToDevice(target, device); + torch::Tensor lazy_weight = + def_weight ? CopyToDevice(weight, device) : torch::Tensor(); + torch::Tensor lazy_output = torch::nll_loss( + /*self=*/lazy_input, /*target=*/lazy_target, + /*weight=*/lazy_weight, + /*reduction=*/reduction, /*ignore_index=*/ignore_index); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestNllLoss2d) { + int batch = 6; + int classes = 2; + int height = 3; + int width = 3; + // TODO(asuhan): Fix the torch::kDouble case. + for (auto dtype : {torch::kFloat}) { + for (int ignore_index : {-1, 0, 1, 5}) { + for (bool def_weight : {false, true}) { + torch::Tensor input = + torch::rand({batch, classes, height, width}, + torch::TensorOptions(dtype).device(DefaultDevice())); + torch::Tensor target = torch::randint( + std::min(ignore_index, 0), classes, {batch, height, width}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor weight; + if (def_weight) { + weight = torch::rand( + {classes}, torch::TensorOptions(dtype).device(DefaultDevice())); + } + for (torch::Reduction::Reduction reduction : + {torch::Reduction::Mean, torch::Reduction::Sum, + torch::Reduction::None}) { + torch::Tensor output = + torch::nll_loss2d(/*self=*/input, /*target=*/target, + /*weight=*/weight, + /*reduction=*/reduction, + /*ignore_index=*/ignore_index); + + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_target = CopyToDevice(target, device); + torch::Tensor lazy_weight = + def_weight ? 
CopyToDevice(weight, device) : torch::Tensor(); + torch::Tensor lazy_output = torch::nll_loss2d( + /*self=*/lazy_input, /*target=*/lazy_target, + /*weight=*/lazy_weight, + /*reduction=*/reduction, /*ignore_index=*/ignore_index); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestSmoothL1Loss) { + torch::Tensor input = torch::randn( + {2, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor target = torch::randn( + {2, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (torch::Reduction::Reduction reduction : + {torch::Reduction::None, torch::Reduction::Mean, + torch::Reduction::Sum}) { + for (double beta : {0.25, 1.}) { + torch::Tensor output = + torch::smooth_l1_loss(input, target, reduction, beta); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_target = CopyToDevice(target, device); + torch::Tensor lazy_output = + torch::smooth_l1_loss(lazy_input, lazy_target, reduction, beta); + AllClose(output, lazy_output); + }); + } + } +} + +TEST_F(LazyOpsTest, TestL1Loss) { + torch::Tensor input = torch::randn( + {2, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor target = torch::randn( + {2, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (torch::Reduction::Reduction reduction : + {torch::Reduction::None, torch::Reduction::Mean, + torch::Reduction::Sum}) { + torch::Tensor output = torch::l1_loss(input, target, reduction); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_target = CopyToDevice(target, device); + torch::Tensor lazy_output = + torch::l1_loss(lazy_input, lazy_target, reduction); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestL1LossBackward) { + for (torch::Reduction::Reduction reduction : + {torch::Reduction::None, torch::Reduction::Mean, + torch::Reduction::Sum}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::l1_loss(inputs[0], inputs[1], reduction); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({2, 4}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)), + torch::rand({2, 4}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()))}, + device, testfn); + }); + } +} + +TEST_F(LazyOpsTest, TestMseLoss) { + torch::Tensor input = torch::randn( + {2, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor target = torch::randn( + {2, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (torch::Reduction::Reduction reduction : + {torch::Reduction::None, torch::Reduction::Mean, + torch::Reduction::Sum}) { + torch::Tensor output = torch::mse_loss(input, target, reduction); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_target = CopyToDevice(target, device); + torch::Tensor lazy_output = + torch::mse_loss(lazy_input, lazy_target, reduction); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestMseLossBackward) { + for (torch::Reduction::Reduction reduction : + {torch::Reduction::None, torch::Reduction::Mean, + torch::Reduction::Sum}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::mse_loss(inputs[0], inputs[1], reduction); + }; + ForEachDevice([&](const torch::Device& 
device) { + TestBackward({torch::rand({2, 4}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)), + torch::rand({2, 4}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()))}, + device, testfn); + }); + } +} + +TEST_F(LazyOpsTest, TestBatchNorm1D) { + int num_features = 3; + torch::Tensor input = + torch::rand({2, num_features, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor weight = + torch::rand({num_features}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor bias = + torch::rand({num_features}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor running_mean = + torch::zeros({num_features}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor running_var = + torch::ones({num_features}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + double momentum = 0.1; + double eps = 0.5; + torch::Tensor undef; + for (bool training : {true, false}) { + for (bool undef_weight_bias : {false, true}) { + torch::Tensor output = torch::batch_norm( + /*input=*/input, /*weight=*/undef_weight_bias ? undef : weight, + /*bias=*/undef_weight_bias ? undef : bias, + /*running_mean=*/running_mean, /*running_var=*/running_var, + /*training=*/training, /*momentum=*/momentum, /*eps=*/eps, + /*cudnn_enabled=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_weight = + undef_weight_bias ? undef : CopyToDevice(weight, device); + torch::Tensor lazy_bias = + undef_weight_bias ? undef : CopyToDevice(bias, device); + torch::Tensor lazy_running_mean = CopyToDevice(running_mean, device); + torch::Tensor lazy_running_var = CopyToDevice(running_var, device); + torch::Tensor lazy_output = torch::batch_norm( + /*input=*/lazy_input, /*weight=*/lazy_weight, /*bias=*/lazy_bias, + /*running_mean=*/lazy_running_mean, /*running_var=*/lazy_running_var, + /*training=*/training, /*momentum=*/momentum, /*eps=*/eps, + /*cudnn_enabled=*/false); + AllClose(output, lazy_output, /*rtol=*/1e-3, /*atol=*/1e-5); + }); + } + } +} + +TEST_F(LazyOpsTest, TestBatchNorm2D) { + int num_features = 3; + torch::Tensor input = + torch::rand({2, num_features, 4, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor weight = + torch::rand({num_features}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor bias = + torch::rand({num_features}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor running_mean = + torch::zeros({num_features}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor running_var = + torch::ones({num_features}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + double momentum = 0.1; + double eps = 0.5; + torch::Tensor undef; + for (bool training : {true, false}) { + for (bool undef_weight_bias : {false, true}) { + torch::Tensor output = torch::batch_norm( + /*input=*/input, /*weight=*/undef_weight_bias ? undef : weight, + /*bias=*/undef_weight_bias ? undef : bias, + /*running_mean=*/running_mean, /*running_var=*/running_var, + /*training=*/training, /*momentum=*/momentum, /*eps=*/eps, + /*cudnn_enabled=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_weight = + undef_weight_bias ? 
undef : CopyToDevice(weight, device); + torch::Tensor lazy_bias = + undef_weight_bias ? undef : CopyToDevice(bias, device); + torch::Tensor lazy_running_mean = CopyToDevice(running_mean, device); + torch::Tensor lazy_running_var = CopyToDevice(running_var, device); + torch::Tensor lazy_output = torch::batch_norm( + /*input=*/lazy_input, /*weight=*/lazy_weight, /*bias=*/lazy_bias, + /*running_mean=*/lazy_running_mean, /*running_var=*/lazy_running_var, + /*training=*/training, /*momentum=*/momentum, /*eps=*/eps, + /*cudnn_enabled=*/false); + AllClose(output, lazy_output, /*rtol=*/1e-3, /*atol=*/1e-5); + }); + } + } +} + +TEST_F(LazyOpsTest, TestDim) { + torch::Tensor input = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + EXPECT_EQ(input.dim(), lazy_input.dim()); + }); +} + +TEST_F(LazyOpsTest, TestContiguous) { + torch::Tensor input = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::native::contiguous(input); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::native::contiguous(lazy_input); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestSqueezeAll) { + torch::Tensor input = + torch::rand({2, 1, 3, 1}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::squeeze(input); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::squeeze(lazy_input); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestSqueezeAllInPlace) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor input = torch::rand( + {2, 1, 3, 1}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = input.squeeze_(); + torch::Tensor lazy_output = lazy_input.squeeze_(); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + ASSERT_EQ(input.dim(), lazy_input.dim()); + for (int64_t dim_idx = 0; dim_idx < input.dim(); ++dim_idx) { + ASSERT_EQ(input.size(dim_idx), lazy_input.size(dim_idx)); + } + }); +} + +TEST_F(LazyOpsTest, TestSqueezeOne) { + torch::Tensor input = + torch::rand({2, 1, 3, 1}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor output = torch::squeeze(input, dim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::squeeze(lazy_input, dim); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestSqueezeOneInPlace) { + int rank = 4; + for (int dim = -rank; dim < rank; ++dim) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor input = torch::rand( + {2, 1, 3, 1}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = input.squeeze_(dim); + torch::Tensor lazy_output = lazy_input.squeeze_(dim); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + ASSERT_EQ(input.dim(), lazy_input.dim()); + for (int64_t dim_idx = 0; dim_idx < input.dim(); ++dim_idx) { + ASSERT_EQ(input.size(dim_idx), lazy_input.size(dim_idx)); 
+ } + }); + } +} + +TEST_F(LazyOpsTest, TestUnsqueeze) { + torch::Tensor input = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = input.dim() + 1; + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor output = torch::unsqueeze(input, dim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::unsqueeze(lazy_input, dim); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestUnsqueezeInPlace) { + torch::Tensor input = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = input.dim() + 1; + for (int dim = -rank; dim < rank; ++dim) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = input.unsqueeze_(dim); + torch::Tensor lazy_output = lazy_input.unsqueeze_(dim); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + ASSERT_EQ(input.dim(), lazy_input.dim()); + for (int64_t dim_idx = 0; dim_idx < input.dim(); ++dim_idx) { + ASSERT_EQ(input.size(dim_idx), lazy_input.size(dim_idx)); + } + }); + } +} + +TEST_F(LazyOpsTest, TestMaskedFill) { + torch::Tensor input = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor mask = torch::randint( + 0, 2, {2, 3}, torch::TensorOptions(torch::kBool).device(DefaultDevice())); + torch::Scalar value(42); + torch::Tensor result = torch::masked_fill(input, mask, value); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_mask = CopyToDevice(mask, device); + torch::Tensor lazy_result = torch::masked_fill(lazy_input, lazy_mask, value); + AllClose(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestMaskedFillInPlace) { + torch::Scalar value(42); + torch::Tensor mask = torch::randint( + 0, 2, {2, 3}, torch::TensorOptions(torch::kBool).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor input = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_mask = CopyToDevice(mask, device); + torch::Tensor result = input.masked_fill_(mask, value); + torch::Tensor lazy_result = lazy_input.masked_fill_(lazy_mask, value); + AllClose(result, lazy_result); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestMaskedFillBroadcast) { + torch::Tensor input = + torch::rand({2, 5, 4, 3}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor mask = torch::randint( + 0, 2, {4, 1}, torch::TensorOptions(torch::kBool).device(DefaultDevice())); + torch::Scalar value(42); + torch::Tensor result = torch::masked_fill(input, mask, value); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_mask = CopyToDevice(mask, device); + torch::Tensor lazy_result = torch::masked_fill(lazy_input, lazy_mask, value); + AllClose(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestFill) { + torch::Scalar value(42); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor input = torch::empty( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor result = torch::fill_(input, value); + torch::Tensor lazy_result = 
torch::fill_(lazy_input, value); + AllClose(result, lazy_result); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestFillWithRank0) { + torch::Tensor value = torch::scalar_tensor(42); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor input = torch::empty( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor result = torch::fill_(input, value); + torch::Tensor lazy_value = CopyToDevice(value, device); + torch::Tensor lazy_result = torch::fill_(lazy_input, value); + AllClose(result, lazy_result); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestPermute) { + torch::Tensor input = torch::rand( + {2, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector> dims_permutations = { + {0, 1, 2}, {0, 2, 1}, {1, 0, 2}, {1, 2, 0}, {2, 0, 1}, {2, 1, 0}}; + int rank = input.dim(); + for (std::vector dims_permutation : dims_permutations) { + for (bool negative_dims : {false, true}) { + if (negative_dims) { + std::for_each(dims_permutation.begin(), dims_permutation.end(), + [rank](int64_t& dim) { dim -= rank; }); + } + torch::Tensor output = input.permute(dims_permutation); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = lazy_input.permute(dims_permutation); + AllClose(output, lazy_output); + }); + } + } +} + +TEST_F(LazyOpsTest, TestPermuteMod) { + std::vector> dims_permutations = { + {0, 1, 2}, {0, 2, 1}, {1, 0, 2}, {1, 2, 0}, {2, 0, 1}, {2, 1, 0}}; + std::vector input_sizes = {2, 3, 4}; + int rank = input_sizes.size(); + for (std::vector dims_permutation : dims_permutations) { + for (bool negative_dims : {false, true}) { + if (negative_dims) { + std::for_each(dims_permutation.begin(), dims_permutation.end(), + [rank](int64_t& dim) { dim -= rank; }); + } + torch::Tensor input = torch::zeros( + input_sizes, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor one = torch::tensor( + 1.0, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = input.permute(dims_permutation); + output.add_(one, 1.0); + input.add_(one, 1.0); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor xinput = torch::zeros( + input_sizes, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_input = CopyToDevice(xinput, device); + torch::Tensor lazy_one = CopyToDevice(one, device); + torch::Tensor lazy_output = lazy_input.permute(dims_permutation); + lazy_output.add_(lazy_one, 1.0); + lazy_input.add_(lazy_one, 1.0); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + }); + } + } +} + +TEST_F(LazyOpsTest, TestFlip) { + torch::Tensor input = torch::rand( + {2, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector> dim_powerset = { + {0}, {1}, {2}, {0, 1}, {1, 2}, {2, 0}, {0, 1, 2}}; + for (std::vector flip_dims : dim_powerset) { + for (bool negative_dims : {false, true}) { + if (negative_dims) { + std::for_each(flip_dims.begin(), flip_dims.end(), + [](int64_t& dim) { dim -= 3; }); + } + torch::Tensor output = torch::flip(input, flip_dims); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::flip(lazy_input, flip_dims); + AllClose(output, lazy_output); + }); + } + } +} + +TEST_F(LazyOpsTest, TestPixelShuffle) { + torch::Tensor input = + 
torch::rand({5, 18, 4, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int upscale_factor = 3; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = torch::pixel_shuffle(input, upscale_factor); + torch::Tensor lazy_output = torch::pixel_shuffle(lazy_input, upscale_factor); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestSumToSize) { + torch::Tensor input = + torch::rand({4, 6, 3, 7}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector out_size = {4, 1, 1, 7}; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = input.sum_to_size(out_size); + torch::Tensor lazy_output = lazy_input.sum_to_size(out_size); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestTransposeDims) { + torch::Tensor input = torch::rand( + {2, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int dim0 = 0; + int dim1 = 2; + torch::Tensor output = torch::transpose(input, dim0, dim1); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::transpose(lazy_input, dim0, dim1); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestTransposeDimsMod) { + std::vector input_sizes = {2, 3, 4}; + int dim0 = 0; + int dim1 = 2; + torch::Tensor input = torch::zeros( + input_sizes, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor one = torch::tensor( + 1.0, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::transpose(input, dim0, dim1); + output.add_(one, 1.0); + input.add_(one, 1.0); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor xinput = torch::zeros( + input_sizes, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_input = CopyToDevice(xinput, device); + torch::Tensor lazy_one = CopyToDevice(one, device); + torch::Tensor lazy_output = torch::transpose(lazy_input, dim0, dim1); + lazy_output.add_(lazy_one, 1.0); + lazy_input.add_(lazy_one, 1.0); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestTransposeDimsInPlace) { + torch::Tensor input = torch::rand( + {2, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int dim0 = 0; + int dim1 = 2; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = input.transpose_(dim0, dim1); + torch::Tensor lazy_output = lazy_input.transpose_(dim0, dim1); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestSplit) { + torch::Tensor input = torch::rand( + {7, 8, 9}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = input.dim(); + for (int split_size : {2, 3}) { + for (int dim = -rank; dim < rank; ++dim) { + std::vector outputs = torch::split(input, split_size, dim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + std::vector lazy_outputs = + torch::split(lazy_input, split_size, dim); + ASSERT_EQ(outputs.size(), lazy_outputs.size()); + for (size_t i = 0; i < outputs.size(); ++i) { + AllClose(outputs[i], lazy_outputs[i]); + } + }); + } + } +} + +TEST_F(LazyOpsTest, TestSplitEmpty) { + torch::Tensor input = torch::rand( + {0}, 
torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int split_size = 0; + int dim = 0; + std::vector outputs = torch::split(input, split_size, dim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + std::vector lazy_outputs = + torch::split(lazy_input, split_size, dim); + ASSERT_EQ(outputs.size(), lazy_outputs.size()); + for (size_t i = 0; i < outputs.size(); ++i) { + AllClose(outputs[i], lazy_outputs[i]); + } + }); +} + +TEST_F(LazyOpsTest, TestSplitWithSizes) { + torch::Tensor input = + torch::rand({15, 15, 15}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + std::vector outputs = + torch::split_with_sizes(input, {4, 5, 6}, dim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + std::vector lazy_outputs = + torch::split_with_sizes(lazy_input, {4, 5, 6}, dim); + ASSERT_EQ(outputs.size(), lazy_outputs.size()); + for (size_t i = 0; i < outputs.size(); ++i) { + AllClose(outputs[i], lazy_outputs[i]); + } + }); + } +} + +TEST_F(LazyOpsTest, TestCrossImplicitDim) { + std::vector> dim_sizes = { + {4, 5, 3}, {4, 3, 5}, {3, 4, 5}}; + for (auto dim_size : dim_sizes) { + torch::Tensor input = torch::rand( + dim_size, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor other = torch::rand( + dim_size, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor result = torch::cross(input, other); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_other = CopyToDevice(other, device); + torch::Tensor lazy_result = torch::cross(lazy_input, lazy_other); + AllClose(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestCrossExplicitDim) { + std::vector dim_size = {3, 3}; + torch::Tensor input = torch::rand( + dim_size, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor other = torch::rand( + dim_size, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = dim_size.size(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor result = torch::cross(input, other, dim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_other = CopyToDevice(other, device); + torch::Tensor lazy_result = torch::cross(lazy_input, lazy_other, dim); + AllClose(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestCrossZeroDim) { + torch::Tensor input = + torch::rand({0, 1, 3, 0}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor result = torch::cross(input, input); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::cross(lazy_input, lazy_input); + AllClose(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestTriu) { + int size = 5; + torch::Tensor input = + torch::rand({size, size}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + // Test all diagonals and out of bounds (must be no-op). 
+ for (int diagonal = -size; diagonal <= size; ++diagonal) { + torch::Tensor output = torch::triu(input, diagonal); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::triu(lazy_input, diagonal); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestTriuNonSquare) { + int size = 5; + torch::Tensor input = + torch::rand({size, size + 1}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + // Test all diagonals and out of bounds (must be no-op). + for (int diagonal = -size; diagonal <= size; ++diagonal) { + torch::Tensor output = torch::triu(input, diagonal); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::triu(lazy_input, diagonal); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestTriuBatch) { + int size = 5; + int batch_size = 3; + torch::Tensor input = + torch::rand({batch_size, size, size}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + // Test all diagonals and out of bounds (must be no-op). + for (int diagonal = -size; diagonal <= size; ++diagonal) { + torch::Tensor output = torch::triu(input, diagonal); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::triu(lazy_input, diagonal); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestTril) { + int size = 5; + torch::Tensor input = + torch::rand({size, size}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + // Test all diagonals and out of bounds (must be no-op). + for (int diagonal = -size; diagonal <= size; ++diagonal) { + torch::Tensor output = torch::tril(input, diagonal); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::tril(lazy_input, diagonal); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestTrilNonSquare) { + int size = 5; + torch::Tensor input = + torch::rand({size, size + 1}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + // Test all diagonals and out of bounds (must be no-op). + for (int diagonal = -size; diagonal <= size; ++diagonal) { + torch::Tensor output = torch::tril(input, diagonal); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::tril(lazy_input, diagonal); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestTrilBatch) { + int size = 5; + int batch_size = 3; + torch::Tensor input = + torch::rand({batch_size, size, size}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + // Test all diagonals and out of bounds (must be no-op). + for (int diagonal = -size; diagonal <= size; ++diagonal) { + torch::Tensor output = torch::tril(input, diagonal); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::tril(lazy_input, diagonal); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestTriuInPlace) { + int size = 5; + // Test all diagonals and out of bounds (must be no-op). 
+ for (int diagonal = -size; diagonal <= size; ++diagonal) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor input = torch::rand( + {size, size}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = input.triu_(diagonal); + torch::Tensor lazy_output = lazy_input.triu_(diagonal); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + }); + } +} + +TEST_F(LazyOpsTest, TestTrilInPlace) { + int size = 5; + // Test all diagonals and out of bounds (must be no-op). + for (int diagonal = -size; diagonal <= size; ++diagonal) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor input = torch::rand( + {size, size}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = input.tril_(diagonal); + torch::Tensor lazy_output = lazy_input.tril_(diagonal); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + }); + } +} + +TEST_F(LazyOpsTest, TestTrace) { + int n = 5; + torch::Tensor input = torch::rand( + {n, n}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::trace(input); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::trace(lazy_input); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestTraceWide) { + int lines = 3; + int cols = 5; + torch::Tensor input = + torch::rand({lines, cols}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::trace(input); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::trace(lazy_input); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestTraceNarrow) { + int lines = 5; + int cols = 3; + torch::Tensor input = + torch::rand({lines, cols}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::trace(input); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::trace(lazy_input); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestDiagRank1) { + int size = 7; + torch::Tensor input = torch::rand( + {size}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + // Test all diagonals and out of bounds (must be no-op). + for (int diagonal = -2 * size; diagonal <= 2 * size; ++diagonal) { + torch::Tensor output = torch::diag(input, diagonal); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::diag(lazy_input, diagonal); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestDiagRank2) { + int size = 7; + torch::Tensor input = + torch::rand({size, size}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + // Test all diagonals and out of bounds (must be no-op). 
+ for (int diagonal = -size; diagonal <= size; ++diagonal) { + torch::Tensor output = torch::diag(input, diagonal); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::diag(lazy_input, diagonal); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestDiagFlat) { + torch::Tensor input = + torch::rand({4, 3, 6, 7}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int diagonal = -10; diagonal < 10; ++diagonal) { + torch::Tensor output = torch::diagflat(input, diagonal); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::diagflat(lazy_input, diagonal); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestDiagonal) { + int size = 5; + torch::Tensor input = + torch::rand({size, size}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + // Test all diagonals and out of bounds (must be no-op). + for (int diagonal = -size; diagonal <= size; ++diagonal) { + torch::Tensor output = torch::diagonal(input, diagonal); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::diagonal(lazy_input, diagonal); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestDiagonalUpdate) { + int size = 5; + // Test all diagonals and out of bounds (must be no-op). + for (int diagonal = -size; diagonal <= size; ++diagonal) { + auto input = torch::rand({size, size}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + auto input_clone = input.clone(); + auto output = torch::diagonal(input, diagonal); + output.add_(1); + + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input_clone, device); + torch::Tensor lazy_output = torch::diagonal(lazy_input, diagonal); + lazy_output.add_(1); + + AllClose(output, lazy_output); + AllClose(input, lazy_input); + }); + } +} + +TEST_F(LazyOpsTest, TestDiagonalNonSquare) { + int size = 5; + torch::Tensor input = + torch::rand({size, size + 1}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + // Test all diagonals and out of bounds (must be no-op). + for (int diagonal = -size; diagonal <= size; ++diagonal) { + torch::Tensor output = torch::diagonal(input, diagonal); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::diagonal(lazy_input, diagonal); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestDiagonalBatch) { + int size = 5; + int batch_size = 3; + int dim1 = 1; + int dim2 = 2; + torch::Tensor input = + torch::rand({batch_size, size, size}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + // Test all diagonals and out of bounds (must be no-op). 
+ for (int diagonal = -size; diagonal <= size; ++diagonal) { + torch::Tensor output = + torch::diagonal(input, diagonal, /*dim1=*/dim1, /*dim1=*/dim2); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = + torch::diagonal(lazy_input, diagonal, /*dim1=*/dim1, /*dim1=*/dim2); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestFlatten) { + torch::Tensor input = torch::rand({4, 7, 5, 3}); + int rank = input.dim(); + for (int pos_start_dim = 0; pos_start_dim < rank; ++pos_start_dim) { + for (int pos_end_dim = pos_start_dim; pos_end_dim < rank; ++pos_end_dim) { + for (bool negative_start_dim : {false, true}) { + for (bool negative_end_dim : {false, true}) { + int start_dim = + negative_start_dim ? pos_start_dim - rank : pos_start_dim; + int end_dim = negative_end_dim ? pos_end_dim - rank : pos_end_dim; + torch::Tensor output = torch::flatten(input, start_dim, end_dim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = + torch::flatten(lazy_input, start_dim, end_dim); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestLogicalAnd) { + for (torch::ScalarType scalar_type1 : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor lhs = + isFloatingType(scalar_type1) + ? torch::rand({3, 4}, torch::TensorOptions(scalar_type1)) + : torch::randint(0, 100, {3, 4}, + torch::TensorOptions(scalar_type1)); + for (torch::ScalarType scalar_type2 : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor rhs = + isFloatingType(scalar_type2) + ? 
torch::rand({3, 4}, torch::TensorOptions(scalar_type2)) + : torch::randint(1, 100, {3, 4}, + torch::TensorOptions(scalar_type2)); + torch::Tensor result = torch::logical_and(lhs, rhs); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_lhs = CopyToDevice(lhs, device); + torch::Tensor lazy_rhs = CopyToDevice(rhs, device); + torch::Tensor lazy_result = torch::logical_and(lazy_lhs, lazy_rhs); + AllEqual(result, lazy_result); + }); + } + } + + ExpectCounterNotChanged("aten::.*", GetIgnoredCounters()); + ExpectCounterChanged("xla::logical_and_out", GetIgnoredCounters()); +} + +TEST_F(LazyOpsTest, TestBitwiseAnd) { + torch::Tensor lhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + torch::Tensor rhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + torch::Tensor result = lhs.__and__(rhs); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_lhs = CopyToDevice(lhs, device); + torch::Tensor lazy_rhs = CopyToDevice(rhs, device); + torch::Tensor lazy_result = lazy_lhs.__and__(lazy_rhs); + AllEqual(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestBitwiseAndInPlace) { + torch::Tensor lhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + torch::Tensor rhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_lhs = CopyToDevice(lhs, device); + torch::Tensor result = lhs.__iand__(rhs); + torch::Tensor lazy_rhs = CopyToDevice(rhs, device); + torch::Tensor lazy_result = lazy_lhs.__iand__(lazy_rhs); + AllEqual(result, lazy_result); + AllEqual(lhs, lazy_lhs); + }); +} + +TEST_F(LazyOpsTest, TestBitwiseAndScalar) { + torch::Tensor lhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + torch::Scalar rhs(123456789); + torch::Tensor result = lhs.__and__(rhs); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_lhs = CopyToDevice(lhs, device); + torch::Tensor lazy_result = lazy_lhs.__and__(rhs); + AllEqual(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestBitwiseAndScalarInPlace) { + torch::Tensor lhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + torch::Scalar rhs(123456789); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_lhs = CopyToDevice(lhs, device); + torch::Tensor result = lhs.__iand__(rhs); + torch::Tensor lazy_result = lazy_lhs.__iand__(rhs); + AllEqual(result, lazy_result); + AllEqual(lhs, lazy_lhs); + }); +} + +TEST_F(LazyOpsTest, TestBitwiseAndPromotion) { + torch::Tensor input = torch::rand( + {4, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor view = input.reshape(-1); + torch::Tensor result = torch::__and__(view.gt(0), view.ne(0)); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_view = lazy_input.reshape(-1); + torch::Tensor lazy_result = torch::__and__(lazy_view.gt(0), lazy_view.ne(0)); + AllEqual(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestBitwiseOr) { + torch::Tensor lhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + torch::Tensor rhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + torch::Tensor result = lhs.__or__(rhs); + 
ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_lhs = CopyToDevice(lhs, device); + torch::Tensor lazy_rhs = CopyToDevice(rhs, device); + torch::Tensor lazy_result = lazy_lhs.__or__(lazy_rhs); + AllEqual(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestBitwiseOrInPlace) { + torch::Tensor lhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + torch::Tensor rhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_lhs = CopyToDevice(lhs, device); + torch::Tensor result = lhs.__ior__(rhs); + torch::Tensor lazy_rhs = CopyToDevice(rhs, device); + torch::Tensor lazy_result = lazy_lhs.__ior__(lazy_rhs); + AllEqual(result, lazy_result); + AllEqual(lhs, lazy_lhs); + }); +} + +TEST_F(LazyOpsTest, TestBitwiseOrScalar) { + torch::Tensor lhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + torch::Scalar rhs(123456789); + torch::Tensor result = lhs.__or__(rhs); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_lhs = CopyToDevice(lhs, device); + torch::Tensor lazy_result = lazy_lhs.__or__(rhs); + AllEqual(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestBitwiseOrScalarInPlace) { + torch::Tensor lhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + torch::Scalar rhs(123456789); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_lhs = CopyToDevice(lhs, device); + torch::Tensor result = lhs.__ior__(rhs); + torch::Tensor lazy_result = lazy_lhs.__ior__(rhs); + AllEqual(result, lazy_result); + AllEqual(lhs, lazy_lhs); + }); +} + +TEST_F(LazyOpsTest, TestBitwiseXor) { + torch::Tensor lhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + torch::Tensor rhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + torch::Tensor result = lhs.__xor__(rhs); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_lhs = CopyToDevice(lhs, device); + torch::Tensor lazy_rhs = CopyToDevice(rhs, device); + torch::Tensor lazy_result = lazy_lhs.__xor__(lazy_rhs); + AllEqual(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestBitwiseXorInPlace) { + torch::Tensor lhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + torch::Tensor rhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_lhs = CopyToDevice(lhs, device); + torch::Tensor result = lhs.__ixor__(rhs); + torch::Tensor lazy_rhs = CopyToDevice(rhs, device); + torch::Tensor lazy_result = lazy_lhs.__ixor__(lazy_rhs); + AllEqual(result, lazy_result); + AllEqual(lhs, lazy_lhs); + }); +} + +TEST_F(LazyOpsTest, TestBitwiseXorScalar) { + torch::Tensor lhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + torch::Scalar rhs(123456789); + torch::Tensor result = lhs.__xor__(rhs); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_lhs = CopyToDevice(lhs, device); + torch::Tensor lazy_result = lazy_lhs.__xor__(rhs); + AllEqual(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestBitwiseXorScalarInPlace) { + torch::Tensor lhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + 
torch::Scalar rhs(123456789); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_lhs = CopyToDevice(lhs, device); + torch::Tensor result = lhs.__ixor__(rhs); + torch::Tensor lazy_result = lazy_lhs.__ixor__(rhs); + AllEqual(result, lazy_result); + AllEqual(lhs, lazy_lhs); + }); +} + +TEST_F(LazyOpsTest, TestLshift) { + torch::Tensor input = torch::ones( + {4, 2}, torch::TensorOptions(torch::kInt32).device(DefaultDevice())); + torch::Tensor shift_amount = torch::randint( + 16, + input.sizes(), + torch::TensorOptions(torch::kInt32).device(DefaultDevice())); + torch::Tensor result = torch::__lshift__(input, shift_amount); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_shift_amount = CopyToDevice(shift_amount, device); + torch::Tensor lazy_result = + torch::__lshift__(lazy_input, lazy_shift_amount); + AllClose(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestLshiftInPlace) { + torch::Tensor input = torch::ones( + {4, 2}, torch::TensorOptions(torch::kInt32).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor shift_amount = torch::randint( + 16, + input.sizes(), + torch::TensorOptions(torch::kInt32).device(DefaultDevice())); + torch::Tensor result = input.__ilshift__(shift_amount); + torch::Tensor lazy_shift_amount = CopyToDevice(shift_amount, device); + torch::Tensor lazy_result = lazy_input.__ilshift__(lazy_shift_amount); + AllClose(result, lazy_result); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestLshiftScalar) { + torch::Tensor input = torch::ones( + {4, 2}, torch::TensorOptions(torch::kInt32).device(DefaultDevice())); + torch::Scalar shift_amount = 3; + torch::Tensor result = torch::__lshift__(input, shift_amount); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::__lshift__(lazy_input, shift_amount); + AllClose(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestLshiftScalarInPlace) { + torch::Tensor input = torch::ones( + {4, 2}, torch::TensorOptions(torch::kInt32).device(DefaultDevice())); + torch::Scalar shift_amount = 3; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor result = input.__ilshift__(shift_amount); + torch::Tensor lazy_result = lazy_input.__ilshift__(shift_amount); + AllClose(result, lazy_result); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestRshift) { + torch::Tensor input = torch::ones( + {4, 2}, torch::TensorOptions(torch::kInt32).device(DefaultDevice())); + torch::Tensor shift_amount = torch::randint( + 16, + input.sizes(), + torch::TensorOptions(torch::kInt32).device(DefaultDevice())); + torch::Tensor result = torch::__rshift__(input, shift_amount); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_shift_amount = CopyToDevice(shift_amount, device); + torch::Tensor lazy_result = + torch::__rshift__(lazy_input, lazy_shift_amount); + AllClose(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestRshiftInPlace) { + torch::Tensor input = torch::ones( + {4, 2}, torch::TensorOptions(torch::kInt32).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor shift_amount = 
torch::randint( + 16, + input.sizes(), + torch::TensorOptions(torch::kInt32).device(DefaultDevice())); + torch::Tensor result = input.__irshift__(shift_amount); + torch::Tensor lazy_shift_amount = CopyToDevice(shift_amount, device); + torch::Tensor lazy_result = lazy_input.__irshift__(lazy_shift_amount); + AllClose(result, lazy_result); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestRshiftScalar) { + torch::Tensor input = torch::ones( + {4, 2}, torch::TensorOptions(torch::kInt32).device(DefaultDevice())); + torch::Scalar shift_amount = 3; + torch::Tensor result = torch::__rshift__(input, shift_amount); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::__rshift__(lazy_input, shift_amount); + AllClose(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestRshiftScalarInPlace) { + torch::Tensor input = torch::ones( + {4, 2}, torch::TensorOptions(torch::kInt32).device(DefaultDevice())); + torch::Scalar shift_amount = 3; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor result = input.__irshift__(shift_amount); + torch::Tensor lazy_result = lazy_input.__irshift__(shift_amount); + AllClose(result, lazy_result); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestMeshgrid) { + torch::Tensor a = torch::rand( + {3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::rand( + {4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + auto d = torch::meshgrid({a, b, c}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + auto lazy_d = torch::meshgrid({lazy_a, lazy_b, lazy_c}); + EXPECT_EQ(d.size(), lazy_d.size()); + for (size_t i = 0; i < d.size(); ++i) { + AllClose(d[i], lazy_d[i]); + } + }); +} + +TEST_F(LazyOpsTest, TestConstantPad) { + torch::Tensor input = torch::rand( + {4, 2, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector pad{1, 2, 3, 4, 5, 6}; + float pad_value = 5; + torch::Tensor output = torch::constant_pad_nd(input, pad, pad_value); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = + torch::constant_pad_nd(lazy_input, pad, pad_value); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestConstantPadIncomplete) { + torch::Tensor input = torch::rand( + {4, 2, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector pad{1, 2}; + float pad_value = 5; + torch::Tensor output = torch::constant_pad_nd(input, pad, pad_value); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = + torch::constant_pad_nd(lazy_input, pad, pad_value); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestReflectionPad2dRank3) { + torch::Tensor input = torch::rand( + {2, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector pad{2, 2, 2, 2}; + torch::Tensor output = torch::reflection_pad2d(input, pad); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor 
lazy_output = torch::reflection_pad2d(lazy_input, pad); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestReflectionPad2dRank4) { + torch::Tensor input = + torch::rand({2, 2, 3, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector pad{2, 2, 2, 2}; + torch::Tensor output = torch::reflection_pad2d(input, pad); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::reflection_pad2d(lazy_input, pad); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestReflectionPad2dBackward) { + std::vector pad{2, 3, 1, 2}; + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return torch::reflection_pad2d(inputs[0], pad); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({1, 2, 4, 4}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); +} + +TEST_F(LazyOpsTest, TestReplicationPad1d) { + torch::Tensor input = torch::rand( + {1, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector pad{1, 2}; + torch::Tensor output = torch::replication_pad1d(input, pad); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::replication_pad1d(lazy_input, pad); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestReplicationPad1dZeroPad) { + torch::Tensor input = torch::rand( + {1, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector pad{1, 0}; + torch::Tensor output = torch::replication_pad1d(input, pad); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::replication_pad1d(lazy_input, pad); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestReplicationPad1dBackward) { + std::vector pad{2, 3}; + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return torch::replication_pad1d(inputs[0], pad); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({2, 4}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); +} + +TEST_F(LazyOpsTest, TestReplicationPad2d) { + torch::Tensor input = torch::rand( + {1, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector pad{1, 2, 2, 1}; + torch::Tensor output = torch::replication_pad2d(input, pad); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::replication_pad2d(lazy_input, pad); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestReplicationPad2dZeroPad) { + torch::Tensor input = torch::rand( + {1, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector pad{1, 0, 0, 1}; + torch::Tensor output = torch::replication_pad2d(input, pad); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::replication_pad2d(lazy_input, pad); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestReplicationPad2dBackward) { + std::vector pad{2, 3, 1, 1}; + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return torch::replication_pad2d(inputs[0], pad); + }; + ForEachDevice([&](const 
torch::Device& device) { + TestBackward({torch::rand({2, 3, 4}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); +} + +TEST_F(LazyOpsTest, TestAsStrided) { + torch::Tensor input = torch::rand( + {128, 320}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector size = {128, 20, 4, 4}; + std::vector stride = {320, 16, 4, 1}; + torch::Tensor output = + torch::as_strided(input, /*size=*/size, /*stride=*/stride); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = + torch::as_strided(lazy_input, /*size=*/size, /*stride=*/stride); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestAsStridedInPlace) { + torch::Tensor input = torch::rand( + {128, 320}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector size = {128, 20, 4, 4}; + std::vector stride = {320, 16, 4, 1}; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = + torch::as_strided_(input, /*size=*/size, /*stride=*/stride); + torch::Tensor lazy_output = + torch::as_strided_(lazy_input, /*size=*/size, /*stride=*/stride); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestAsStridedWithOffset) { + torch::Tensor input = torch::rand( + {4, 8, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector size = {4, 4, 2}; + std::vector stride = {8, 2, 1}; + int64_t storage_offset = 4; + torch::Tensor output = + torch::as_strided(input, /*size=*/size, /*stride=*/stride, + /*storage_offset=*/storage_offset); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = + torch::as_strided(lazy_input, /*size=*/size, /*stride=*/stride, + /*storage_offset=*/storage_offset); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestAsStridedWithInplaceCopy) { + torch::Tensor grad = torch::ones( + {4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector size = {4}; + std::vector stride = {1}; + torch::Tensor output = torch::zeros({4}, grad.options()); + output.as_strided(size, stride).copy_(grad); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_grad = CopyToDevice(grad, device); + torch::Tensor lazy_output = torch::zeros({4}, lazy_grad.options()); + lazy_output.as_strided(size, stride).copy_(lazy_grad); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestEmptyStrided) { + std::vector size = {4, 4, 2}; + std::vector stride = {8, 2, 1}; + torch::Tensor output = torch::empty_strided(/*size=*/size, /*stride=*/stride); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_output = + torch::empty_strided(/*size=*/size, /*stride=*/stride); + EXPECT_EQ(output.sizes(), lazy_output.sizes()); + EXPECT_EQ(output.strides(), lazy_output.strides()); + }); +} + +TEST_F(LazyOpsTest, TestAvgPool2DBackward) { + int kernel_size = 2; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + for (bool count_include_pad : {true, false}) { + // Test ceil_mode=true through the CPU interop. 
+ for (bool ceil_mode : {false, true}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::avg_pool2d(inputs[0], + /*kernel_size=*/{kernel_size, kernel_size}, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, + /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + }; + + ForEachDevice([&](const torch::Device& device) { + TestBackward( + {torch::rand({1, 1, 7, 7}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestAvgPool3DBackward) { + int kernel_size = 2; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + for (bool count_include_pad : {true, false}) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::avg_pool3d( + inputs[0], + /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}, + /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + }; + + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({1, 1, 7, 7, 7}, + torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestAvgPool2DNoBatchBackward) { + int kernel_size = 2; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + for (bool count_include_pad : {true, false}) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::avg_pool2d(inputs[0], + /*kernel_size=*/{kernel_size, kernel_size}, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, + /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + }; + + ForEachDevice([&](const torch::Device& device) { + TestBackward( + {torch::rand({1, 7, 7}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestAvgPool3DNoBatchBackward) { + int kernel_size = 2; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + for (bool count_include_pad : {true, false}) { + // Test ceil_mode=true through the CPU interop. 
+ for (bool ceil_mode : {false, true}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::avg_pool3d( + inputs[0], + /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}, + /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + }; + + ForEachDevice([&](const torch::Device& device) { + TestBackward( + {torch::rand({1, 7, 7, 7}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestAdaptiveAvgPool3DNoBatchBackward) { + if (IsCuda()) { + GTEST_SKIP(); + } + for (int64_t output_size : {7, 4}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::adaptive_avg_pool3d( + inputs[0], {output_size, output_size, output_size}); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward( + {torch::rand({1, 56, 28, 28}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); + } +} + +TEST_F(LazyOpsTest, TestAdaptiveAvgPool3DBackward) { + if (IsCuda()) { + GTEST_SKIP(); + } + for (int64_t output_size : {7, 4}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::adaptive_avg_pool3d( + inputs[0], {output_size, output_size, output_size}); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward( + {torch::rand({4, 1, 56, 28, 28}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); + } +} + +TEST_F(LazyOpsTest, TestAdaptiveAvgPool2DBackward) { + for (int64_t output_size : {7, 8}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::adaptive_avg_pool2d(inputs[0], {output_size, output_size}); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward( + {torch::rand({4, 1, 56, 56}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); + } +} + +TEST_F(LazyOpsTest, TestAdaptiveAvgPool2DNoBatchBackward) { + for (int64_t output_size : {7, 8}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::adaptive_avg_pool2d(inputs[0], {output_size, output_size}); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({1, 56, 56}, torch::TensorOptions(torch::kFloat) + .requires_grad(true))}, + device, testfn); + }); + } +} + +TEST_F(LazyOpsTest, TestConv2D) { + int in_channels = 4; + int out_channels = 4; + int kernel_size = 3; + for (int stride = 1; stride <= 3; ++stride) { + for (int padding = 0; padding <= 2; ++padding) { + for (bool with_bias : {true, false}) { + for (int dilation = 1; dilation <= 3; ++dilation) { + for (int groups : + {1, 2, 4}) { // covers normal, grouped, depthwise conv. + ForEachDevice([&](const torch::Device& device) { + torch::Tensor input = torch::rand( + {1, in_channels, 7, 7}, + torch::TensorOptions(torch::kDouble).device(DefaultDevice())); + torch::Tensor weight = torch::rand( + {out_channels, in_channels / groups, kernel_size, + kernel_size}, + torch::TensorOptions(torch::kDouble).device(DefaultDevice())); + torch::Tensor bias = + with_bias ? 
torch::rand({out_channels}, + torch::TensorOptions(torch::kDouble) + .device(DefaultDevice())) + : torch::Tensor(); + + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_weight = CopyToDevice(weight, device); + torch::Tensor lazy_bias = + with_bias ? CopyToDevice(bias, device) : torch::Tensor(); + + torch::Tensor output = + torch::conv2d(input, weight, bias, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, + /*dilation=*/{dilation, dilation}, groups); + torch::Tensor lazy_output = + torch::conv2d(lazy_input, lazy_weight, lazy_bias, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, + /*dilation=*/{dilation, dilation}, groups); + AllClose(output, lazy_output); + }); + } + } + } + } + } +} + +TEST_F(LazyOpsTest, TestConv2DBackward) { + int in_channels = 4; + int out_channels = 4; + int kernel_size = 3; + for (int stride = 1; stride <= 3; ++stride) { + for (int padding = 0; padding <= 2; ++padding) { + for (bool with_bias : {true, false}) { + for (int dilation = 1; dilation <= 3; ++dilation) { + for (int groups : + {1, 2, 4}) { // covers normal, grouped, depthwise conv. + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::conv2d(inputs[0], inputs[1], inputs[2], + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, + /*dilation=*/{dilation, dilation}, groups); + }; + + ForEachDevice([&](const torch::Device& device) { + torch::Tensor bias = + with_bias ? torch::rand({out_channels}, + torch::TensorOptions(torch::kDouble) + .device(DefaultDevice())) + : torch::Tensor(); + TestBackward({torch::rand({1, in_channels, 7, 7}, + torch::TensorOptions(torch::kDouble) + .device(DefaultDevice()) + .requires_grad(true)), + torch::rand({out_channels, in_channels / groups, + kernel_size, kernel_size}, + torch::TensorOptions(torch::kDouble) + .device(DefaultDevice()) + .requires_grad(true)), + bias}, + device, testfn); + }); + } + }; + } + } + } +} + +TEST_F(LazyOpsTest, TestTransposedConv2DBackward) { + int in_channels = 4; + int out_channels = 4; + int kernel_size = 3; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + for (int dilation = 1; dilation <= 2; ++dilation) { + for (int output_padding = 0; + output_padding < std::max(stride, dilation); ++output_padding) { + for (bool with_bias : {true, false}) { + for (int groups : + {1, 2, 4}) { // covers normal, grouped, depthwise conv. + auto testfn = [&](const std::vector& inputs) + -> torch::Tensor { + return torch::conv_transpose2d( + inputs[0], inputs[1], inputs[2], + /*stride=*/{stride, stride + 1}, + /*padding=*/{padding, padding + 1}, + /*output_padding=*/output_padding, + /*groups=*/groups, + /*dilation=*/{dilation, dilation + 1}); + }; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor input = torch::rand( + {4, out_channels, 7, 7}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor weight = + torch::rand({out_channels, in_channels / groups, + kernel_size, kernel_size}, + torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor bias = + with_bias ? 
torch::rand({in_channels}, + torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)) + : torch::Tensor(); + TestBackward({input, weight, bias}, device, testfn, + /*rtol=*/1e-5, /*atol=*/1e-5); + }); + } + }; + } + } + } + } +} + +TEST_F(LazyOpsTest, TestConv3DBackward) { + int in_channels = 4; + int out_channels = 4; + int kernel_size = 3; + for (int stride = 1; stride <= 3; ++stride) { + for (int padding = 1; padding <= 2; ++padding) { + for (bool with_bias : {true, false}) { + for (int dilation = 1; dilation <= 2; ++dilation) { + for (int groups : + {1, 2, 4}) { // covers normal, grouped, depthwise conv. + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::conv3d(inputs[0], inputs[1], inputs[2], + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}, + /*dilation=*/{dilation, dilation, dilation}, + groups); + }; + + ForEachDevice([&](const torch::Device& device) { + torch::Tensor bias = + with_bias ? torch::rand({out_channels}, + torch::TensorOptions(torch::kDouble) + .device(DefaultDevice())) + : torch::Tensor(); + TestBackward({torch::rand({4, in_channels, 7, 7, 7}, + torch::TensorOptions(torch::kDouble) + .device(DefaultDevice()) + .requires_grad(true)), + torch::rand({out_channels, in_channels / groups, + kernel_size, kernel_size, kernel_size}, + torch::TensorOptions(torch::kDouble) + .device(DefaultDevice()) + .requires_grad(true)), + bias}, + device, testfn); + }); + } + }; + } + } + } +} + +TEST_F(LazyOpsTest, TestTransposedConv3DBackward) { + int in_channels = 4; + int out_channels = 4; + int kernel_size = 3; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + for (int dilation = 1; dilation <= 2; ++dilation) { + for (int output_padding = 0; + output_padding < std::max(stride, dilation); ++output_padding) { + for (bool with_bias : {true, false}) { + for (int groups : + {1, 2, 4}) { // covers normal, grouped, depthwise conv. + auto testfn = [&](const std::vector& inputs) + -> torch::Tensor { + return torch::conv_transpose3d( + inputs[0], inputs[1], inputs[2], + /*stride=*/{stride, stride + 1, stride}, + /*padding=*/{padding, padding + 1, stride}, + /*output_padding=*/output_padding, + /*groups=*/groups, + /*dilation=*/{dilation, dilation + 1, dilation}); + }; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor input = + torch::rand({4, out_channels, 7, 7, 7}, + torch::TensorOptions(torch::kDouble) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor weight = + torch::rand({out_channels, in_channels / groups, + kernel_size, kernel_size, kernel_size}, + torch::TensorOptions(torch::kDouble) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor bias = + with_bias ? torch::rand({in_channels}, + torch::TensorOptions(torch::kDouble) + .device(DefaultDevice()) + .requires_grad(true)) + : torch::Tensor(); + TestBackward({input, weight, bias}, device, testfn); + }); + } + }; + } + } + } + } +} + +TEST_F(LazyOpsTest, TestMaxPool2DBackward) { + int kernel_size = 3; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. 
+ for (bool ceil_mode : {false, true}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::max_pool2d( + inputs[0], /*kernel_size=*/{kernel_size, kernel_size}, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, /*dilation=*/{1, 1}, + /*ceil_mode=*/ceil_mode); + }; + + ForEachDevice([&](const torch::Device& device) { + TestBackward( + {torch::rand({1, 2, 8, 8}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestMaxPool3DBackward) { + int kernel_size = 3; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::max_pool3d( + inputs[0], + /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}, /*dilation=*/{1, 1, 1}, + /*ceil_mode=*/ceil_mode); + }; + + ForEachDevice([&](const torch::Device& device) { + TestBackward( + {torch::rand({1, 2, 4, 4, 4}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestMaxPool2DNoBatchBackward) { + int kernel_size = 3; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::max_pool2d( + inputs[0], /*kernel_size=*/{kernel_size, kernel_size}, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, /*dilation=*/{1, 1}, + /*ceil_mode=*/ceil_mode); + }; + + ForEachDevice([&](const torch::Device& device) { + TestBackward( + {torch::rand({2, 8, 8}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestMaxPool3DNoBatchBackward) { + int kernel_size = 3; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::max_pool3d( + inputs[0], + /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}, /*dilation=*/{1, 1, 1}, + /*ceil_mode=*/ceil_mode); + }; + + ForEachDevice([&](const torch::Device& device) { + TestBackward( + {torch::rand({2, 4, 4, 4}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestMaxUnpool2DBackward) { + int kernel_size = 2; + torch::Tensor input = + torch::rand({2, 2, 8, 8}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. 
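+        // max_unpool2d consumes the indices produced by max_pool2d_with_indices
+        // below; output_size restores the spatial dimensions of the original input.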
+ for (bool ceil_mode : {false, true}) { + for (int dilation = 1; dilation <= 2; ++dilation) { + torch::Tensor output; + torch::Tensor indices; + std::tie(output, indices) = torch::max_pool2d_with_indices( + input, /*kernel_size=*/{kernel_size, kernel_size}, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, /*dilation=*/{dilation, dilation}, + /*ceil_mode=*/ceil_mode); + + std::vector output_size({input.size(2), input.size(3)}); + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::max_unpool2d(inputs[0], inputs[1], output_size); + }; + + ForEachDevice([&](const torch::Device& device) { + TestBackward({output.requires_grad_(true), indices}, device, + testfn); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestMaxUnpool3DBackward) { + int kernel_size = 2; + torch::Tensor input = + torch::rand({1, 1, 4, 4, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + for (int dilation = 1; dilation <= 2; ++dilation) { + torch::Tensor output; + torch::Tensor indices; + std::tie(output, indices) = torch::max_pool3d_with_indices( + input, /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}, + /*dilation=*/{dilation, dilation, dilation}, + /*ceil_mode=*/ceil_mode); + + std::vector output_size( + {input.size(2), input.size(3), input.size(4)}); + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::max_unpool3d(inputs[0], inputs[1], output_size, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}); + }; + + ForEachDevice([&](const torch::Device& device) { + TestBackward({output.requires_grad_(true), indices}, device, + testfn); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestTanhBackward) { + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return torch::tanh(inputs[0]); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({2, 2}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestSigmoidBackward) { + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return torch::sigmoid(inputs[0]); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({2, 2}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); +} + +TEST_F(LazyOpsTest, TestLogSigmoidBackward) { + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return torch::log_sigmoid(inputs[0]); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({2, 2}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestLogSoftmaxBackward) { + for (int dim = -4; dim < 4; ++dim) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::log_softmax(inputs[0], dim); + }; + + ForEachDevice([&](const torch::Device& device) { + TestBackward( + {torch::rand({5, 3, 4, 2}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn, /*rtol=*/1e-3, /*atol=*/1e-4); + }); + } +} + 
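+// The *Backward tests in this file use TestBackward (see test_lazy_ops_util.cpp
+// later in this patch): it runs testfn on an eager copy and a lazy copy of the
+// inputs and compares both the outputs and the autograd gradients via AllClose.
+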
+TEST_F(LazyOpsTest, TestSoftmaxBackward) { + for (int dim = -4; dim < 4; ++dim) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::softmax(inputs[0], dim); + }; + + ForEachDevice([&](const torch::Device& device) { + TestBackward( + {torch::rand({5, 3, 4, 2}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn, /*rtol=*/1e-3, /*atol=*/1e-4); + }); + } +} + +TEST_F(LazyOpsTest, TestSoftplusBackward) { + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return torch::softplus(inputs[0]); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({2, 1, 4, 6}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn, /*rtol=*/1e-4); + }); +} + +TEST_F(LazyOpsTest, TestReluBackward) { + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return torch::relu(inputs[0]); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({2, 1, 4, 6}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); +} + +TEST_F(LazyOpsTest, TestRreluBackward) { + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return torch::rrelu(inputs[0]); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({2, 1, 4, 6}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); +} + +TEST_F(LazyOpsTest, TestHardshrinkBackward) { + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return torch::hardshrink(inputs[0]); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::randn({100}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); +} + +TEST_F(LazyOpsTest, TestSoftshrinkBackward) { + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return torch::softshrink(inputs[0]); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::randn({100}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); +} + +TEST_F(LazyOpsTest, TestHardtanhBackward) { + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return torch::hardtanh(inputs[0]); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::randn({100}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); +} + +TEST_F(LazyOpsTest, TestEluBackward) { + torch::Scalar alpha = 0.5; + torch::Scalar scale = 2.5; + torch::Scalar input_scale = 1.5; + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return torch::elu(inputs[0], alpha, scale, input_scale); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({2, 1, 4, 6}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); +} + +TEST_F(LazyOpsTest, TestGeluBackward) { + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return torch::gelu(inputs[0]); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({2, 3}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); + ExpectCounterChanged("lazy::gelu_backward", GetIgnoredCounters()); +} + +TEST_F(LazyOpsTest, 
TestLeakyReluBackward) { + double negative_slope = 0.01; + auto testfn = [=](const std::vector& inputs) -> torch::Tensor { + return torch::leaky_relu(inputs[0], negative_slope); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({2, 1, 4, 6}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); +} + +TEST_F(LazyOpsTest, TestTransposeBackward) { + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return torch::t(inputs[0]); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({2, 3}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); +} + +TEST_F(LazyOpsTest, TestAddMatMulBackward) { + int in_channels = 32; + int out_channels = 320; + int labels = 50; + // Test beta != 1. through the CPU interop. + for (double beta : {1., 2.}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::addmm(inputs[0], inputs[1], inputs[2], /*beta=*/beta); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({labels}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)), + torch::rand({in_channels, out_channels}, + torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)), + torch::rand({out_channels, labels}, + torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); + } +} + +TEST_F(LazyOpsTest, TestBinaryCrossEntropyBackward) { + int batch = 6; + int classes = 2; + // TODO(asuhan): Fix the torch::kDouble case. + for (auto dtype : {torch::kFloat}) { + for (bool def_weight : {false, true}) { + torch::Tensor input = torch::rand( + {batch, classes}, torch::TensorOptions(dtype).requires_grad(true)); + torch::Tensor target = + torch::rand({batch, classes}, torch::TensorOptions(dtype)); + torch::Tensor weight; + if (def_weight) { + weight = torch::rand({batch, classes}, torch::TensorOptions(dtype)); + } + for (torch::Reduction::Reduction reduction : + {torch::Reduction::Mean, torch::Reduction::Sum, + torch::Reduction::None}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::binary_cross_entropy( + /*self=*/inputs[0], /*target=*/inputs[1], + /*weight=*/inputs[2], + /*reduction=*/reduction); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({input, target, weight}, device, testfn, /*rtol=*/1e-4, + /*atol=*/1e-7); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestNllLossBackward) { + // TODO(whc) debug divide-by-zero failure under ASAN + GTEST_SKIP(); + + int batch = 6; + int classes = 2; + // TODO(asuhan): Fix the torch::kDouble case. 
+ for (auto dtype : {torch::kFloat}) { + for (int ignore_index : {-1, 0, 1, 5}) { + for (bool def_weight : {false, true}) { + torch::Tensor input = + torch::rand({batch, classes}, torch::TensorOptions(dtype) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor target = torch::randint( + std::min(ignore_index, 0), classes, {batch}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor weight; + if (def_weight) { + weight = torch::rand( + {classes}, torch::TensorOptions(dtype).device(DefaultDevice())); + } + for (torch::Reduction::Reduction reduction : + {torch::Reduction::Mean, torch::Reduction::Sum, + torch::Reduction::None}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::nll_loss( + /*self=*/inputs[0], /*target=*/inputs[1], + /*weight=*/inputs[2], + /*reduction=*/reduction, /*ignore_index=*/ignore_index); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({input, target, weight}, device, testfn, /*rtol=*/1e-5, + /*atol=*/1e-8); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestNllLoss2dBackward) { + int batch = 6; + int classes = 2; + int height = 3; + int width = 3; + // TODO(asuhan): Fix the torch::kDouble case. + for (auto dtype : {torch::kFloat}) { + for (int ignore_index : {-1, 0, 1, 5}) { + for (bool def_weight : {false, true}) { + torch::Tensor input = torch::rand({batch, classes, height, width}, + torch::TensorOptions(dtype) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor target = torch::randint( + std::min(ignore_index, 0), classes, {batch, height, width}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor weight; + if (def_weight) { + weight = torch::rand( + {classes}, torch::TensorOptions(dtype).device(DefaultDevice())); + } + for (torch::Reduction::Reduction reduction : + {torch::Reduction::Mean, torch::Reduction::Sum, + torch::Reduction::None}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::nll_loss2d( + /*self=*/inputs[0], /*target=*/inputs[1], + /*weight=*/inputs[2], + /*reduction=*/reduction, /*ignore_index=*/ignore_index); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({input, target, weight}, device, testfn, /*rtol=*/1e-5, + /*atol=*/1e-8); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestSmoothL1LossBackward) { + torch::Tensor input = torch::randn({2, 4}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor target = torch::randn( + {2, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (torch::Reduction::Reduction reduction : + {torch::Reduction::None, torch::Reduction::Mean, + torch::Reduction::Sum}) { + for (double beta : {0.25, 1.}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::smooth_l1_loss(/*input=*/inputs[0], /*target=*/inputs[1], + /*reduction=*/reduction, /*beta=*/beta); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({input, target}, device, testfn, /*rtol=*/1e-5, + /*atol=*/1e-8); + }); + } + } +} + +TEST_F(LazyOpsTest, TestViewBackward) { + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return inputs[0].view({-1, 320}); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward( + {torch::rand({32, 20, 4, 4}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); +} + +TEST_F(LazyOpsTest, TestBatchNorm2DBackward) { + double 
momentum = 0.1; + double eps = 0.5; + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return torch::batch_norm( + /*input=*/inputs[0], /*weight=*/inputs[1], /*bias=*/inputs[2], + /*running_mean=*/inputs[3], /*running_var=*/inputs[4], + /*training=*/true, /*momentum=*/momentum, /*eps=*/eps, + /*cudnn_enabled=*/false); + }; + int num_features = 3; + torch::Tensor undef; + for (bool undef_weight_bias : {false, true}) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor input = torch::rand({2, num_features, 4, 4}, + torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor weight = + undef_weight_bias + ? undef + : torch::rand({num_features}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor bias = + undef_weight_bias + ? undef + : torch::rand({num_features}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor running_mean = torch::zeros( + {num_features}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor running_var = torch::ones( + {num_features}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + TestBackward({input, weight, bias, running_mean, running_var}, device, + testfn, + /*rtol=*/1e-3, /*atol=*/1e-4); + }); + } +} + +TEST_F(LazyOpsTest, TestBatchNorm3DBackward) { + double momentum = 0.1; + double eps = 0.5; + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return torch::batch_norm( + /*input=*/inputs[0], /*weight=*/inputs[1], /*bias=*/inputs[2], + /*running_mean=*/inputs[3], /*running_var=*/inputs[4], + /*training=*/true, /*momentum=*/momentum, /*eps=*/eps, + /*cudnn_enabled=*/false); + }; + int num_features = 3; + torch::Tensor undef; + for (bool undef_weight_bias : {false, true}) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor input = torch::rand({2, num_features, 4, 4, 2}, + torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor weight = + undef_weight_bias + ? undef + : torch::rand({num_features}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor bias = + undef_weight_bias + ? 
undef + : torch::rand({num_features}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor running_mean = torch::zeros( + {num_features}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor running_var = torch::ones( + {num_features}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + TestBackward({input, weight, bias, running_mean, running_var}, device, + testfn, + /*rtol=*/1e-3, /*atol=*/1e-3); + }); + } +} + +TEST_F(LazyOpsTest, TestBCEWithLogitsBackward) { + int batch = 10; + int classes = 5; + torch::Tensor undef; + for (torch::Reduction::Reduction reduction : + {torch::Reduction::None, torch::Reduction::Mean, + torch::Reduction::Sum}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::binary_cross_entropy_with_logits( + /*input=*/inputs[0], /*target=*/inputs[1], /*weight=*/inputs[2], + /*pos_weight=*/inputs[3], + /*reduction=*/reduction); + }; + for (bool undef_weight : {false, true}) { + for (bool undef_pos_weight : {false, true}) { + torch::Tensor input = + torch::rand({batch, classes}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor target = + torch::rand({batch, classes}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor weight = + undef_weight + ? undef + : torch::rand({classes}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice())); + torch::Tensor pos_weight = + undef_pos_weight + ? undef + : torch::rand({classes}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + TestBackward({input, target, weight, pos_weight}, device, testfn, + /*rtol=*/1e-3, /*atol=*/1e-5); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestKlDivBackward) { + torch::Tensor input = torch::rand({4, 3}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor target = torch::rand({4, 3}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + for (torch::Reduction::Reduction reduction : + {torch::Reduction::Mean, torch::Reduction::Sum, + torch::Reduction::None}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::kl_div(/*self=*/inputs[0], /*target=*/inputs[1], reduction); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({input, target}, device, testfn, /*rtol=*/1e-4, + /*atol=*/1e-5); + }); + } +} + +TEST_F(LazyOpsTest, TestEmbeddingBackward) { + int num_weights = 32; + for (int padding_idx = -1; padding_idx < num_weights; ++padding_idx) { + for (bool scale_grad_by_freq : {false, true}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::embedding(inputs[0], inputs[1], + /*padding_idx=*/padding_idx, + /*scale_grad_by_freq=*/scale_grad_by_freq, + /*sparse=*/false); + }; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor weight = + torch::rand({num_weights, 7}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor indices = torch::randint( + num_weights, {3, 9, 4}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + TestBackward({weight, indices}, device, testfn, /*rtol=*/1e-5, + /*atol=*/1e-8); + }); + } + } +} + +TEST_F(LazyOpsTest, TestAmpForeachNonFiniteCheckAndUnscale) { + if (IsCuda()) { + // TODO(whc) debug failure on cuda + GTEST_SKIP(); + } + + 
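+  // As exercised below, the in-place op scales each grad by inv_scale (compare
+  // against grads_output0) and raises found_inf to 1.0 once a non-finite value,
+  // such as the NaN in grads1, is encountered.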
torch::Tensor grads0 = torch::tensor( + {1, 2, 3, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor grads1 = torch::tensor( + {1.0, 2.0, std::nan("1"), 4.0}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor inv_scale = torch::scalar_tensor( + 0.2, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor found_inf = torch::scalar_tensor( + 0, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor grads_output0 = grads0 * inv_scale; + torch::Tensor found_inf_output0 = torch::scalar_tensor( + 0, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor found_inf_output1 = torch::scalar_tensor( + 1, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + if (grads0.device() == at::kCPU) { + GTEST_SKIP(); + } + torch::Tensor lazy_grads0 = CopyToDevice(grads0, device); + torch::Tensor lazy_inv_scale = CopyToDevice(inv_scale, device); + torch::Tensor lazy_found_inf = CopyToDevice(found_inf, device); + torch::_amp_foreach_non_finite_check_and_unscale_(lazy_grads0, lazy_found_inf, + lazy_inv_scale); + AllClose(grads_output0, lazy_grads0, /*rtol=*/1e-2, /*atol=*/1e-4); + AllEqual(found_inf_output0, lazy_found_inf); + + torch::Tensor lazy_grads1 = CopyToDevice(grads1, device); + torch::_amp_foreach_non_finite_check_and_unscale_(lazy_grads1, lazy_found_inf, + lazy_inv_scale); + AllEqual(found_inf_output1, lazy_found_inf); + }); +} + +TEST_F(LazyOpsTest, TestAmpUpdateScale) { + torch::Tensor growth_tracker = torch::scalar_tensor( + 0, torch::TensorOptions(torch::kInt32).device(DefaultDevice())); + torch::Tensor current_scale = torch::scalar_tensor( + 4, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor found_inf = torch::scalar_tensor( + 1, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor not_found_inf = torch::scalar_tensor( + 0, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + float scale_growth_factor = 2.0; + float scale_backoff_factor = 0.5; + int growth_interval = 3; + + torch::Tensor growth_tracker_result0 = torch::scalar_tensor( + 1, torch::TensorOptions(torch::kInt32).device(DefaultDevice())); + torch::Tensor current_scale_result0 = torch::scalar_tensor( + 4, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor growth_tracker_result1 = torch::scalar_tensor( + 2, torch::TensorOptions(torch::kInt32).device(DefaultDevice())); + torch::Tensor current_scale_result1 = torch::scalar_tensor( + 4, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor growth_tracker_result2 = torch::scalar_tensor( + 0, torch::TensorOptions(torch::kInt32).device(DefaultDevice())); + torch::Tensor current_scale_result2 = torch::scalar_tensor( + 8, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor growth_tracker_result3 = torch::scalar_tensor( + 0, torch::TensorOptions(torch::kInt32).device(DefaultDevice())); + torch::Tensor current_scale_result3 = torch::scalar_tensor( + 4, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + + ForEachDevice([&](const torch::Device& device) { + if (growth_tracker.device() == at::kCPU) { + GTEST_SKIP(); + } + torch::Tensor lazy_growth_tracker = CopyToDevice(growth_tracker, device); + torch::Tensor lazy_current_scale = CopyToDevice(current_scale, device); + torch::Tensor lazy_found_inf = CopyToDevice(found_inf, device); + torch::Tensor 
lazy_not_found_inf = CopyToDevice(not_found_inf, device); + + torch::_amp_update_scale_(lazy_current_scale, lazy_growth_tracker, + lazy_not_found_inf, scale_growth_factor, + scale_backoff_factor, growth_interval); + AllClose(current_scale_result0, lazy_current_scale, /*rtol=*/1e-2, + /*atol=*/1e-4); + AllEqual(growth_tracker_result0, lazy_growth_tracker); + + torch::_amp_update_scale_(lazy_current_scale, lazy_growth_tracker, + lazy_not_found_inf, scale_growth_factor, + scale_backoff_factor, growth_interval); + AllClose(current_scale_result1, lazy_current_scale, /*rtol=*/1e-2, + /*atol=*/1e-4); + AllEqual(growth_tracker_result1, lazy_growth_tracker); + + // torch::_amp_update_scale_ returns the reference of current_scale + lazy_current_scale = torch::_amp_update_scale_( + lazy_current_scale, lazy_growth_tracker, lazy_not_found_inf, + scale_growth_factor, scale_backoff_factor, growth_interval); + AllClose(current_scale_result2, lazy_current_scale, /*rtol=*/1e-2, + /*atol=*/1e-4); + AllEqual(growth_tracker_result2, lazy_growth_tracker); + + lazy_current_scale = torch::_amp_update_scale_( + lazy_current_scale, lazy_growth_tracker, lazy_found_inf, + scale_growth_factor, scale_backoff_factor, growth_interval); + AllClose(current_scale_result3, lazy_current_scale, /*rtol=*/1e-2, + /*atol=*/1e-4); + AllEqual(growth_tracker_result3, lazy_growth_tracker); + }); + ExpectCounterNotChanged("aten::.*", GetIgnoredCounters()); + ExpectCounterChanged("lazy::_amp_update_scale_", + GetIgnoredCounters()); +} + +TEST_F(LazyOpsTest, TestEarlySyncLiveTensors) { + torch::Tensor scalar_tensor = torch::scalar_tensor( + 1., torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar scalar1 = scalar_tensor.item(); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_scalar_tensor = CopyToDevice(scalar_tensor, device); + torch::Scalar scalar2 = lazy_scalar_tensor.item(); + ASSERT_EQ(scalar1.to(), scalar2.to()); + }); + if (DebugUtil::ExperimentEnabled("early_sync")) { + ExpectCounterChanged("EarlySyncLiveTensorsCount", + GetIgnoredCounters()); + } else { + ExpectCounterNotChanged("EarlySyncLiveTensorsCount", + GetIgnoredCounters()); + } + ExpectCounterChanged("aten::_local_scalar_dense", + GetIgnoredCounters()); +} + +TEST_F(LazyOpsTest, TestLerp) { + torch::Tensor start = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor end = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor weight = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor res = torch::lerp(start, end, weight); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_start = CopyToDevice(start, device); + torch::Tensor lazy_end = CopyToDevice(end, device); + torch::Tensor lazy_weight = CopyToDevice(weight, device); + torch::Tensor lazy_res = torch::lerp(lazy_start, lazy_end, lazy_weight); + AllClose(res, lazy_res); + }); + ExpectCounterNotChanged("aten::.*", GetIgnoredCounters()); + ExpectCounterChanged("lazy::lerp", GetIgnoredCounters()); +} + +TEST_F(LazyOpsTest, TestLerpScalar) { + torch::Tensor start = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor end = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar weight = torch::Scalar(3.0); + torch::Tensor res = torch::lerp(start, end, weight); + ForEachDevice([&](const torch::Device& device) { + 
torch::Tensor lazy_start = CopyToDevice(start, device); + torch::Tensor lazy_end = CopyToDevice(end, device); + torch::Tensor lazy_res = torch::lerp(lazy_start, lazy_end, weight); + AllClose(res, lazy_res); + }); + ExpectCounterNotChanged("aten::.*", GetIgnoredCounters()); + ExpectCounterChanged("lazy::lerp", GetIgnoredCounters()); +} + +TEST_F(LazyOpsTest, TestLerpInplace) { + torch::Tensor input = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor end = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor weight = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor input_copy = input.clone(); + input.lerp_(end, weight); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input_copy, device); + torch::Tensor lazy_end = CopyToDevice(end, device); + torch::Tensor lazy_weight = CopyToDevice(weight, device); + lazy_input.lerp_(lazy_end, lazy_weight); + AllClose(lazy_input, input); + }); + ExpectCounterNotChanged("aten::.*", GetIgnoredCounters()); + ExpectCounterChanged("lazy::lerp", GetIgnoredCounters()); +} + +TEST_F(LazyOpsTest, TestLerpScalarInplace) { + torch::Tensor input = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor end = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar weight = torch::Scalar(3.0); + torch::Tensor input_copy = input.clone(); + input.lerp_(end, weight); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input_copy, device); + torch::Tensor lazy_end = CopyToDevice(end, device); + lazy_input.lerp_(lazy_end, weight); + AllClose(lazy_input, input); + }); + ExpectCounterNotChanged("aten::.*", GetIgnoredCounters()); + ExpectCounterChanged("lazy::lerp", GetIgnoredCounters()); +} + +TEST_F(LazyOpsTest, TestLerpOut) { + torch::Tensor start = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor end = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor weight = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor res = torch::empty( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ; + torch::lerp_out(res, start, end, weight); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_start = CopyToDevice(start, device); + torch::Tensor lazy_end = CopyToDevice(end, device); + torch::Tensor lazy_weight = CopyToDevice(weight, device); + torch::Tensor lazy_res = torch::empty({3, 4}, lazy_start.options()); + torch::lerp_out(lazy_res, lazy_start, lazy_end, lazy_weight); + AllClose(res, lazy_res); + }); + ExpectCounterNotChanged("aten::.*", GetIgnoredCounters()); + ExpectCounterChanged("lazy::lerp", GetIgnoredCounters()); +} + +TEST_F(LazyOpsTest, TestLerpScalarOut) { + torch::Tensor start = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor end = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar weight = torch::Scalar(3.0); + torch::Tensor res = torch::empty( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::lerp_out(res, start, end, weight); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_start = CopyToDevice(start, 
device);
+    torch::Tensor lazy_end = CopyToDevice(end, device);
+    torch::Tensor lazy_res = torch::empty({3, 4}, lazy_start.options());
+    torch::lerp_out(lazy_res, lazy_start, lazy_end, weight);
+    AllClose(res, lazy_res);
+  });
+  ExpectCounterNotChanged("aten::.*", GetIgnoredCounters());
+  ExpectCounterChanged("lazy::lerp", GetIgnoredCounters());
+}
+
+TEST_F(LazyOpsTest, IsAliasOf) {
+  auto a = torch::empty(4, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
+  auto b = torch::empty(4, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
+
+  ForEachDevice([&](const torch::Device& device) {
+    auto lazy_a = CopyToDevice(a, device);
+    auto lazy_b = CopyToDevice(b, device);
+    EXPECT_EQ(!a.is_alias_of(b), !lazy_a.is_alias_of(lazy_b));
+
+    auto c = a.view({2, 2});
+    auto lazy_c = lazy_a.view({2, 2});
+    EXPECT_EQ(a.is_alias_of(c), lazy_a.is_alias_of(lazy_c));
+
+    auto d = c.view({1, 4});
+    auto lazy_d = lazy_c.view({1, 4});
+    EXPECT_EQ(d.is_alias_of(c), lazy_d.is_alias_of(lazy_c));
+    EXPECT_EQ(d.is_alias_of(a), lazy_d.is_alias_of(lazy_a));
+  });
+}
+
+#endif // FBCODE_CAFFE2
+
+} // namespace lazy
+} // namespace torch
diff --git a/test/cpp/lazy/test_lazy_ops_util.cpp b/test/cpp/lazy/test_lazy_ops_util.cpp
new file mode 100644
index 000000000000..91c9b653e041
--- /dev/null
+++ b/test/cpp/lazy/test_lazy_ops_util.cpp
@@ -0,0 +1,194 @@
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+
+namespace torch {
+namespace lazy {
+namespace {
+
+bool IsLtcTensor(const at::Tensor& tensor) {
+  return dynamic_cast<LTCTensorImpl*>(tensor.unsafeGetTensorImpl());
+}
+
+std::unordered_set<std::string>* CreateIgnoredCounters() {
+  std::unordered_set<std::string>* icounters =
+      new std::unordered_set<std::string>();
+  // Add below the counters whose names need to be ignored when doing
+  // is-any-counter-changed assertions.
+  icounters->insert("aten::rand");
+  return icounters;
+}
+
+} // namespace
+
+const std::unordered_set<std::string>* GetIgnoredCounters() {
+  static const std::unordered_set<std::string>* icounters =
+      CreateIgnoredCounters();
+  return icounters;
+}
+
+at::Tensor ToCpuTensor(const at::Tensor& tensor) {
+  // tensor.to() implicitly triggers a sync if t.device=torch::kLazy.
+ return tensor.to(torch::kCPU); +} + +torch::Tensor CopyToDevice(const torch::Tensor& tensor, + const torch::Device& device) { + return tensor.clone().to(device, /*non_blocking=*/false, /*copy=*/true); +} + +bool EqualValues(at::Tensor tensor1, at::Tensor tensor2) { + tensor1 = ToCpuTensor(tensor1); + tensor2 = ToCpuTensor(tensor2); + if (torch::isnan(tensor1).any().item()) { + EXPECT_TRUE(EqualValues(torch::isnan(tensor1), torch::isnan(tensor2))); + tensor1.nan_to_num_(); + tensor2.nan_to_num_(); + } + if (tensor1.sizes() != tensor2.sizes() || + tensor1.dtype() != tensor2.dtype()) { + std::cerr << "Different shape:\n" + << tensor1.dtype() << " " << tensor1.sizes() << "\n-vs-\n" + << tensor2.dtype() << " " << tensor2.sizes() << "\n"; + return false; + } + at::ScalarType type1 = tensor1.scalar_type(); + at::ScalarType type2 = tensor2.scalar_type(); + if (type1 != type2) { + tensor1 = tensor1.toType(type2); + } + bool equal = tensor1.equal(tensor2); + return equal; +} + +bool EqualValuesNoElementTypeCheck(at::Tensor tensor1, at::Tensor tensor2) { + tensor1 = ToCpuTensor(tensor1); + tensor2 = ToCpuTensor(tensor2); + if (tensor1.sizes() != tensor2.sizes()) { + std::cerr << "Different shape:\n" + << tensor1.dtype() << " " << tensor1.sizes() << "\n-vs-\n" + << tensor2.dtype() << " " << tensor2.sizes() << "\n"; + return false; + } + at::ScalarType type1 = tensor1.scalar_type(); + at::ScalarType type2 = tensor2.scalar_type(); + if (type1 != type2) { + tensor1 = tensor1.toType(type2); + } + bool equal = tensor1.equal(tensor2); + return equal; +} + +void ForEachDevice(const std::function& devfn) { + // Currently TorchScript backend only supports one type of hardware per process, + // which is set by env. And the ordinal is always 0 given distributed training/ + // multi-device is not supported yet. 
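+  // Hence a default-constructed BackendDevice denotes the single available
+  // device; it is converted to the corresponding ATen device for the callback.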
+ auto device = torch::lazy::BackendDevice(); + torch::Device torch_device = torch::lazy::backendDeviceToAtenDevice(device); + devfn(torch_device); +} + +bool CloseValues(at::Tensor tensor1, at::Tensor tensor2, double rtol, + double atol) { + tensor1 = ToCpuTensor(tensor1); + tensor2 = ToCpuTensor(tensor2); + if (torch::isnan(tensor1).any().item()) { + EXPECT_TRUE(EqualValues(torch::isnan(tensor1), torch::isnan(tensor2))); + tensor1.nan_to_num_(); + tensor2.nan_to_num_(); + } + if (tensor1.sizes() != tensor2.sizes() || + tensor1.dtype() != tensor2.dtype()) { + std::cerr << "Different shape:\n" + << tensor1.dtype() << " " << tensor1.sizes() << "\n-vs-\n" + << tensor2.dtype() << " " << tensor2.sizes() << "\n"; + return false; + } + bool equal = tensor1.allclose(tensor2, rtol, atol); + return equal; +} + +std::string GetTensorTextGraph(at::Tensor tensor) { + torch::lazy::LazyTensorPtr lazy_tensor = torch::lazy::TryGetLtcTensor(tensor); + return torch::lazy::DumpUtil::ToText({lazy_tensor->GetIrValue().node.get()}); +} + +std::string GetTensorDotGraph(at::Tensor tensor) { + torch::lazy::LazyTensorPtr lazy_tensor = torch::lazy::TryGetLtcTensor(tensor); + return torch::lazy::DumpUtil::ToDot({lazy_tensor->GetIrValue().node.get()}); +} + +void TestBackward( + const std::vector& inputs, const torch::Device& device, + const std::function&)>& + testfn, + double rtol, double atol, int derivative_level) { + std::vector input_vars; + std::vector xinput_vars; + std::vector inputs_w_grad; + std::vector xinputs_w_grad; + for (size_t i = 0; i < inputs.size(); ++i) { + const torch::Tensor& input = inputs[i]; + if (input.defined()) { + torch::Tensor oinput = + input.clone().detach().set_requires_grad(input.requires_grad()); + input_vars.push_back(oinput); + + torch::Tensor xinput = CopyToDevice(input, device) + .detach() + .set_requires_grad(input.requires_grad()); + xinput_vars.push_back(xinput); + if (input.requires_grad()) { + inputs_w_grad.push_back(oinput); + xinputs_w_grad.push_back(xinput); + } + } else { + input_vars.emplace_back(); + xinput_vars.emplace_back(); + } + } + + torch::Tensor output = testfn(input_vars); + torch::Tensor xoutput = testfn(xinput_vars); + torch::lazy::AllClose(output, xoutput, rtol, atol); + + std::vector outs = {output}; + std::vector xouts = {xoutput}; + for (int d = 1; d <= derivative_level; ++d) { + // Check grad of sum(outs) w.r.t inputs_w_grad. 
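+    // Sum all differentiable outputs into one scalar so a single
+    // torch::autograd::grad call yields the gradients; create_graph stays true
+    // for every level except the last so higher-order derivatives can be taken.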
+ torch::Tensor sum = torch::zeros_like(outs[0]).sum(); + torch::Tensor xsum = torch::zeros_like(xouts[0]).sum(); + for (size_t i = 0; i < outs.size(); ++i) { + if (outs[i].requires_grad()) { + sum += outs[i].sum(); + xsum += xouts[i].sum(); + } + } + // Calculating higher order derivative requires create_graph=true + bool create_graph = d != derivative_level; + outs = torch::autograd::grad({sum}, inputs_w_grad, /*grad_outputs=*/{}, + /*retain_graph=*/c10::nullopt, + /*create_graph=*/create_graph, + /*allow_unused=*/true); + xouts = torch::autograd::grad({xsum}, xinputs_w_grad, /*grad_outputs=*/{}, + /*retain_graph=*/c10::nullopt, + /*create_graph=*/create_graph, + /*allow_unused=*/true); + for (size_t i = 0; i < outs.size(); ++i) { + ASSERT_EQ(outs[i].defined(), xouts[i].defined()); + if (outs[i].defined()) { + AllClose(outs[i], xouts[i], rtol, atol); + } + } + } +} + +} // namespace lazy +} // namespace torch diff --git a/test/cpp/lazy/test_lazy_ops_util.h b/test/cpp/lazy/test_lazy_ops_util.h new file mode 100644 index 000000000000..6dc26b48be95 --- /dev/null +++ b/test/cpp/lazy/test_lazy_ops_util.h @@ -0,0 +1,68 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace torch { +namespace lazy { + +const std::unordered_set* GetIgnoredCounters(); + +// Converts an at::Tensor(device=torch::kLazy) to at::Tensor(device=torch::kCPU) +// This at::Tensor can be torch::Tensor which is a Variable, or at::Tensor which +// know nothing about autograd. If the input tensor is already a CPU tensor, it +// will be returned. Needed because EqualValues and AllClose require CPU tensors +// on both sides. +at::Tensor ToCpuTensor(const at::Tensor& tensor); + +// Helper function to copy a tensor to device. 
+torch::Tensor CopyToDevice(const torch::Tensor& tensor, + const torch::Device& device); + +bool EqualValues(at::Tensor tensor1, at::Tensor tensor2); + +bool EqualValuesNoElementTypeCheck(at::Tensor tensor1, at::Tensor tensor2); + +bool CloseValues(at::Tensor tensor1, at::Tensor tensor2, double rtol = 1e-5, + double atol = 1e-8); + +static inline void AllClose(at::Tensor tensor, at::Tensor xla_tensor, + double rtol = 1e-5, double atol = 1e-8) { + EXPECT_TRUE(CloseValues(tensor, xla_tensor, rtol, atol)); +} + +static inline void AllClose(at::Tensor tensor, torch::lazy::LazyTensor& xla_tensor, + double rtol = 1e-5, double atol = 1e-8) { + EXPECT_TRUE( + CloseValues(tensor, xla_tensor.ToTensor(/*detached=*/false), rtol, atol)); +} + +static inline void AllEqual(at::Tensor tensor, at::Tensor xla_tensor) { + EXPECT_TRUE(EqualValues(tensor, xla_tensor)); +} + +void ForEachDevice(const std::function& devfn); + +std::string GetTensorTextGraph(at::Tensor tensor); + +std::string GetTensorDotGraph(at::Tensor tensor); + +std::string GetTensorHloGraph(at::Tensor tensor); + +void TestBackward( + const std::vector& inputs, const torch::Device& device, + const std::function&)>& + testfn, + double rtol = 1e-5, double atol = 1e-8, int derivative_level = 1); + +} // namespace lazy +} // namespace torch diff --git a/test/cpp/lazy/test_misc.cpp b/test/cpp/lazy/test_misc.cpp index 45b54fd2824b..b2f941c42dd6 100644 --- a/test/cpp/lazy/test_misc.cpp +++ b/test/cpp/lazy/test_misc.cpp @@ -71,6 +71,11 @@ TEST(HashTest, Sanity) { auto b = std::vector({1, 1, 2, 3, 5, 8, 12}); test_hash_repeatable_sensitive(a, b); test_hash_repeatable_sensitive(c10::ArrayRef(a), c10::ArrayRef(b)); + + // vector is a special case bc it is implemented as vector + auto bool_a = std::vector({true, false, false, true}); + auto bool_b = std::vector({true, true, false, true}); + test_hash_repeatable_sensitive(bool_a, bool_b); } } // namespace lazy diff --git a/test/cpp/lazy/test_symbolic_shape.cpp b/test/cpp/lazy/test_symbolic_shape.cpp new file mode 100644 index 000000000000..b2224aec0d1c --- /dev/null +++ b/test/cpp/lazy/test_symbolic_shape.cpp @@ -0,0 +1,159 @@ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace torch { +namespace lazy { + +// Lazy Tensor is disabled in FBCODE until addressing non-virtual methods (e.g. 
+// sizes) in TensorImpl +#ifndef FBCODE_CAFFE2 + +namespace { +// This registers the torchscript backend, without which lazy device won't work +torch::lazy::BackendRegistrar g_registrar(GetTSBackendImpl()); + +static inline at::DeviceType DefaultDevice() { + return torch::lazy::getBackend()->EagerFallbackDeviceType(); +} + +std::vector getIsSymbolic(at::Tensor& lazy_tensor) { + auto ltc_tensor = GetLtcTensor(lazy_tensor); + Value ir_val = ltc_tensor->GetIrValue(); + const Shape& shape = ir_val->shape(); + return shape.is_symbolic().value(); +} + +class LazyShapeTest : public ::testing::Test { + protected: + static void SetUpTestCase() {} + void SetUp() override { + at::manual_seed(42); + torch::lazy::LazyGraphExecutor::Get()->SetRngSeed( + torch::lazy::BackendDevice(), 42); + FLAGS_ltc_enable_symbolic_shapes = true; + } + void TearDown() override { + FLAGS_ltc_enable_symbolic_shapes = false; + } +}; + +class DynamicInputShapeNode : public Node { + public: + explicit DynamicInputShapeNode(Shape& shape) + : Node(OpKind(), /* num_outputs */ 1), + hash_(0), + shape_(shape) {} + ~DynamicInputShapeNode() override = default; + + const std::vector& operands() const override { + TORCH_INTERNAL_ASSERT(false, "Can't access operands of test node"); + } + + const Output& operand(size_t i) const override { + TORCH_INTERNAL_ASSERT(false, "Can't access operand[i] of test node"); + } + const Shape& shape(size_t i) const override { + return shape_; + } + c10::ArrayRef shapes() const override { + return {shape_}; + } + + hash_t hash() const override { return hash_; } + hash_t shapeHash() const override { return hash_; } + + private: + hash_t hash_; + Shape shape_; +}; + +} // namespace + +Tensor tensorWithSymbolicShape( + const std::vector& sizes, + const std::vector& is_symbolic) { + Shape shape = Shape(torch::kFloat32, sizes); + Shape shape_with_symbolic = shape.with_symbolic_dims(is_symbolic); + auto n = torch::lazy::MakeNode(shape_with_symbolic); + auto device = BackendDevice(); + auto lt = torch::lazy::LazyTensor::Create(n, device); + return torch::lazy::CreateAtenFromLtcTensor(lt); +} + +TEST_F(LazyShapeTest, TestMulBasic) { + // Basic propagation + torch::Tensor a = tensorWithSymbolicShape({2, 2}, {true, false}); + torch::Tensor b = tensorWithSymbolicShape({2, 2}, {true, false}); + torch::Tensor res = torch::mul(a, b); + + std::vector expected = {true, false}; + EXPECT_EQ(getIsSymbolic(res), expected); + + // Test when some inputs are symbolic + a = tensorWithSymbolicShape({2, 2}, {true, true}); + b = tensorWithSymbolicShape({2, 2}, {true, false}); + res = torch::mul(a, b); + + // This is not {true, false}, as the SSA shape propagation + // is not able to simplify + // expandedSizes.append(sizeB if sizeA == 1 else sizeA) + // in broadcast() in shape_functions_1.h + // due to sizeA being symbolic + expected = {true, true}; + EXPECT_EQ(getIsSymbolic(res), expected); + + // Test correct handling of broadcasting dim + a = tensorWithSymbolicShape({2, 2}, {false, true}); + b = tensorWithSymbolicShape({2, 1}, {true, false}); + res = torch::mul(a, b); + + expected = {false, true}; + EXPECT_EQ(getIsSymbolic(res), expected); + + // Test correct handling of scalar values + a = tensorWithSymbolicShape({2, 2}, {false, true}); + res = torch::mul(a, 3); + expected = {false, true}; + EXPECT_EQ(getIsSymbolic(res), expected); +}; + +TEST_F(LazyShapeTest, TestCatBasic) { + // Basic propagation + torch::Tensor a = tensorWithSymbolicShape({2, 2}, {true, false}); + torch::Tensor b = tensorWithSymbolicShape({2, 2}, {true, 
+  torch::Tensor c = tensorWithSymbolicShape({2, 2}, {true, false});
+
+  auto res = torch::cat({a, b, c}, 1);
+  std::vector<bool> expected = {true, false};
+  EXPECT_EQ(getIsSymbolic(res), expected);
+
+  torch::Tensor d = tensorWithSymbolicShape({2, 2}, {false, true});
+  res = torch::cat({a, d}, 0);
+  expected = {true, false};
+  EXPECT_EQ(getIsSymbolic(res), expected);
+
+  // Test handling of symbolic dims of unequal sizes. This currently crashes,
+  // as we can't handle cases where the upper-bound dims are not equal.
+  /*
+  torch::Tensor e = tensorWithSymbolicShape({2, 2}, {true, false});
+  torch::Tensor f = tensorWithSymbolicShape({2, 3}, {false, true});
+  res = torch::cat({e, f}, 0);
+  expected = {true, false};
+  EXPECT_EQ(getIsSymbolic(res), expected);
+  */
+}
+#endif // FBCODE_CAFFE2
+} // namespace lazy
+} // namespace torch
diff --git a/test/cpp/lazy/test_tensor_impl.cpp b/test/cpp/lazy/test_tensor_impl.cpp
index 2a7f2893c724..8d968f620b6b 100644
--- a/test/cpp/lazy/test_tensor_impl.cpp
+++ b/test/cpp/lazy/test_tensor_impl.cpp
@@ -6,12 +6,14 @@ namespace torch {
 namespace lazy {
-// TODO(alanwaketan): Update the following unit tests once the TorchScript backend is merged.
+#ifdef FBCODE_CAFFE2
+// Lazy Tensor is disabled in FBCODE until addressing non-virtual methods (e.g. sizes) in TensorImpl
 TEST(LazyTensorImplTest, BasicThrow) {
   EXPECT_THROW({
     auto input = torch::rand(
         {0, 1, 3, 0}, torch::TensorOptions(torch::kFloat).device("lazy"));
   }, ::c10::Error);
 }
+#endif // FBCODE_CAFFE2
 } // namespace lazy
 } // namespace torch
diff --git a/test/cpp/lazy/test_trie_cache.cpp b/test/cpp/lazy/test_trie_cache.cpp
new file mode 100644
index 000000000000..df7d578b94b4
--- /dev/null
+++ b/test/cpp/lazy/test_trie_cache.cpp
@@ -0,0 +1,92 @@
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace torch {
+namespace lazy {
+
+class TrieCacheNode : public Node {
+ public:
+  static OpKind ClassOpKind() {
+    return OpKind();
+  }
+
+  explicit TrieCacheNode(size_t id)
+      : Node(ClassOpKind(), /* num_outputs */ 1), id_(id), hash_(Hash(id_)) {}
+  ~TrieCacheNode() override = default;
+
+  bool CanBeReused(size_t id) const {
+    return (id_ == id);
+  }
+
+  void AddOperand(Value v) {
+    if (!v.node) {
+      return;
+    }
+    operands_as_outputs_.emplace_back(v.node.get(), v.index);
+    operands_.push_back(std::move(v.node));
+  }
+
+  hash_t hash() const override { return hash_; }
+  hash_t shapeHash() const override { return hash_; }
+ private:
+  size_t id_;
+  hash_t hash_;
+};
+
+TEST(TrieCacheTest, TestSinglePath) {
+  FLAGS_torch_lazy_reuse_ir = true;
+  TrieCache::Get()->Clear();
+
+  NodePtr a = ReuseOrMakeNode<TrieCacheNode>(0);
+  NodePtr b = ReuseOrMakeNode<TrieCacheNode>(1);
+  NodePtr c = ReuseOrMakeNode<TrieCacheNode>(2);
+  TrieCache::Get()->ResetCurrent(); // MarkStep
+
+  EXPECT_EQ(ReuseOrMakeNode<TrieCacheNode>(0).get(), a.get());
+  EXPECT_EQ(ReuseOrMakeNode<TrieCacheNode>(1).get(), b.get());
+  EXPECT_EQ(ReuseOrMakeNode<TrieCacheNode>(2).get(), c.get());
+  TrieCache::Get()->ResetCurrent(); // MarkStep
+}
+
+/*
+*    0
+*    |
+*    1
+*   / \
+*  2   3
+*/
+TEST(TrieCacheTest, TestTwoPaths) {
+  FLAGS_torch_lazy_reuse_ir = true;
+  TrieCache::Get()->Clear();
+
+  NodePtr a = ReuseOrMakeNode<TrieCacheNode>(0);
+  NodePtr b = ReuseOrMakeNode<TrieCacheNode>(1);
+  NodePtr c = ReuseOrMakeNode<TrieCacheNode>(2);
+  TrieCache::Get()->ResetCurrent(); // MarkStep
+
+  EXPECT_EQ(ReuseOrMakeNode<TrieCacheNode>(0).get(), a.get());
+  EXPECT_EQ(ReuseOrMakeNode<TrieCacheNode>(1).get(), b.get());
+  NodePtr d = ReuseOrMakeNode<TrieCacheNode>(3);
+  EXPECT_NE(d.get(), c.get());
+  TrieCache::Get()->ResetCurrent(); // MarkStep
+
+  EXPECT_EQ(ReuseOrMakeNode<TrieCacheNode>(0).get(), a.get());
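+  // Replaying the 0 -> 1 -> 3 path should now hit the branch cached in the
+  // previous step, so `b` and `d` are reused rather than recreated.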
EXPECT_EQ(ReuseOrMakeNode(1).get(), b.get()); + EXPECT_EQ(ReuseOrMakeNode(3).get(), d.get()); + TrieCache::Get()->ResetCurrent(); // MarkStep + + EXPECT_EQ(ReuseOrMakeNode(0).get(), a.get()); + EXPECT_EQ(ReuseOrMakeNode(1).get(), b.get()); + EXPECT_EQ(ReuseOrMakeNode(2).get(), c.get()); + TrieCache::Get()->ResetCurrent(); // MarkStep +} + +} // namespace lazy +} // namespace torch diff --git a/test/cpp/lite_interpreter_runtime/CMakeLists.txt b/test/cpp/lite_interpreter_runtime/CMakeLists.txt index 503203d7be08..6a2e6db6eaa9 100644 --- a/test/cpp/lite_interpreter_runtime/CMakeLists.txt +++ b/test/cpp/lite_interpreter_runtime/CMakeLists.txt @@ -23,6 +23,10 @@ target_include_directories( target_link_libraries(test_lite_interpreter_runtime PRIVATE torch gtest backend_with_compiler_runtime) +if(LINUX) + target_link_libraries(test_lite_interpreter_runtime PRIVATE "-Wl,--no-as-needed,$,--as-needed") +endif() + if(INSTALL_TEST) install(TARGETS test_lite_interpreter_runtime DESTINATION bin) # Install PDB files for MSVC builds diff --git a/test/cpp/profiler/containers.cpp b/test/cpp/profiler/containers.cpp new file mode 100644 index 000000000000..60e6d0f238b1 --- /dev/null +++ b/test/cpp/profiler/containers.cpp @@ -0,0 +1,76 @@ +#include +#include +#include +#include + +#include + +#include +#include +#include + +TEST(ProfilerTest, AppendOnlyList) { + const int n = 4096; + torch::profiler::impl::AppendOnlyList list; + for (const auto i : c10::irange(n)) { + list.emplace_back(i); + ASSERT_EQ(list.size(), i + 1); + } + + int expected = 0; + for (const auto i : list) { + ASSERT_EQ(i, expected++); + } + ASSERT_EQ(expected, n); + + list.clear(); + ASSERT_EQ(list.size(), 0); +} + +TEST(ProfilerTest, AppendOnlyList_ref) { + const int n = 512; + torch::profiler::impl::AppendOnlyList, 64> list; + std::vector*> refs; + for (const auto _ : c10::irange(n)) { + refs.push_back(list.emplace_back()); + } + + for (const auto i : c10::irange(n)) { + *refs.at(i) = {i, 0}; + } + + int expected = 0; + for (const auto& i : list) { + ASSERT_EQ(i.first, expected++); + } +} + +// Test that we can convert TSC measurements back to wall clock time. +TEST(ProfilerTest, clock_converter) { + const int n = 10001; + torch::profiler::impl::ApproximateClockToUnixTimeConverter converter; + std::vector pairs; + for (const auto i : c10::irange(n)) { + pairs.push_back(torch::profiler::impl::ApproximateClockToUnixTimeConverter::measurePair()); + } + auto count_to_ns = converter.makeConverter(); + std::vector deltas; + for (const auto& i : pairs) { + deltas.push_back(i.t_ - count_to_ns(i.approx_t_)); + } + std::sort(deltas.begin(), deltas.end()); + + // In general it's not a good idea to put clocks in unit tests as it leads + // to flakiness. We mitigate this by: + // 1) Testing the clock itself. While the time to complete a task may + // vary, two clocks measuring the same time should be much more + // consistent. + // 2) Only testing the interquartile range. Context switches between + // calls to the two timers do occur and can result in hundreds of + // nanoseconds of noise, but such switches are only a few percent + // of cases. + // 3) We're willing to accept a somewhat large bias which can emerge from + // differences in the cost of calling each clock. 
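+  // `deltas` is sorted above, so deltas[n / 2] is the median error and the
+  // [n / 4, 3 * n / 4] span checked below is the interquartile range; both
+  // thresholds are expressed in nanoseconds.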
+ EXPECT_LT(std::abs(deltas[n / 2]), 200); + EXPECT_LT(deltas[n * 3 / 4] - deltas[n / 4], 50); +} diff --git a/test/cpp/profiler/record_function.cpp b/test/cpp/profiler/record_function.cpp new file mode 100644 index 000000000000..ba76c5af5888 --- /dev/null +++ b/test/cpp/profiler/record_function.cpp @@ -0,0 +1,307 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +// Test that we can add and remove callbacks (both global and thread local.) +TEST(RecordFunctionTest, AddRemove) { + at::clearCallbacks(); + ASSERT_FALSE(at::hasCallbacks()); + + auto start_callback = + [](const at::RecordFunction& fn) -> std::unique_ptr { + return nullptr; + }; + auto end_callback = [](const at::RecordFunction& fn, at::ObserverContext*) {}; + + auto handle = at::addThreadLocalCallback( + at::RecordFunctionCallback(start_callback, end_callback)); + + ASSERT_TRUE(at::hasCallbacks()); + ASSERT_TRUE(at::hasThreadLocalCallbacks()); + ASSERT_FALSE(at::hasGlobalCallbacks()); + + at::removeCallback(handle); + ASSERT_FALSE(at::hasCallbacks()); + + handle = at::addGlobalCallback( + at::RecordFunctionCallback(start_callback, end_callback)); + + ASSERT_TRUE(at::hasCallbacks()); + ASSERT_FALSE(at::hasThreadLocalCallbacks()); + ASSERT_TRUE(at::hasGlobalCallbacks()); + + at::removeCallback(handle); + ASSERT_FALSE(at::hasCallbacks()); +} + +// Test that the callbacks that we register are actually run. +TEST(RecordFunctionTest, ThreadLocalState) { + at::clearCallbacks(); + ASSERT_FALSE(at::hasCallbacks()); + + static int tls_test_start_counter; + static int tls_test_end_counter; + tls_test_start_counter = 0; + tls_test_end_counter = 0; + + auto start_callback = + [](const at::RecordFunction&) -> std::unique_ptr { + ++tls_test_start_counter; + return nullptr; + }; + auto end_callback = [](const at::RecordFunction&, at::ObserverContext*) { + ++tls_test_end_counter; + }; + + auto handle = at::addThreadLocalCallback( + at::RecordFunctionCallback(start_callback, end_callback)); + + { + at::RecordFunction guard(at::RecordScope::USER_SCOPE); + guard.before("Test"); + EXPECT_EQ(tls_test_start_counter, 1); + EXPECT_EQ(tls_test_end_counter, 0); + } + EXPECT_EQ(tls_test_start_counter, 1); + EXPECT_EQ(tls_test_end_counter, 1); + + { + tls_test_start_counter = 0; + tls_test_end_counter = 0; + at::DisableRecordFunctionGuard no_profile_guard; + at::RecordFunction guard(at::RecordScope::USER_SCOPE); + guard.before("Test"); + EXPECT_EQ(tls_test_start_counter, 0); + EXPECT_EQ(tls_test_end_counter, 0); + } + EXPECT_EQ(tls_test_start_counter, 0); + EXPECT_EQ(tls_test_end_counter, 0); + + { + tls_test_start_counter = 0; + tls_test_end_counter = 0; + RECORD_FUNCTION("Test", {}); + EXPECT_EQ(tls_test_start_counter, 1); + EXPECT_EQ(tls_test_end_counter, 0); + } + EXPECT_EQ(tls_test_start_counter, 1); + EXPECT_EQ(tls_test_end_counter, 1); + + at::removeCallback(handle); + ASSERT_FALSE(at::hasCallbacks()); +} + +// Test that callbacks are run in the order that they are registered. 
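+// Nested RecordFunction guards end in LIFO order, so the expected sequence
+// below is: starts in registration order (outer, then inner), then the inner
+// ends fire before the outer ends.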
+TEST(RecordFunctionTest, CallOrder) { + at::clearCallbacks(); + ASSERT_FALSE(at::hasCallbacks()); + + static int current_index; + current_index = 0; + + static std::array expected_order = { + "Start Callback 0 Outer", + "Start Callback 1 Outer", + "Start Callback 0 Inner", + "Start Callback 1 Inner", + "End Callback 0 Inner", + "End Callback 1 Inner", + "End Callback 0 Outer", + "End Callback 1 Outer", + }; + +#define REGISTER_CALLBACK(index) \ + at::addThreadLocalCallback( \ + at::RecordFunctionCallback( \ + [](const at::RecordFunction& fn) \ + -> std::unique_ptr { \ + EXPECT_EQ( \ + fmt::format("Start Callback {} {}", index, fn.name()), \ + expected_order[current_index++]); \ + return nullptr; \ + }, \ + [](const at::RecordFunction& fn, at::ObserverContext*) { \ + EXPECT_EQ( \ + fmt::format("End Callback {} {}", index, fn.name()), \ + expected_order[current_index++]); \ + }) \ + .scopes({at::RecordScope::FUNCTION})) + + REGISTER_CALLBACK(0); + REGISTER_CALLBACK(1); +#undef REGISTER_CALLBACK + + RECORD_FUNCTION("Outer", {}); + { RECORD_FUNCTION("Inner", {}); } + + at::clearCallbacks(); + ASSERT_FALSE(at::hasCallbacks()); +} + +// Make sure TLS migrates when tasks are launched. +TEST(RecordFunctionTest, ThreadMigration) { + at::clearCallbacks(); + ASSERT_FALSE(at::hasCallbacks()); + + static int call_count; + call_count = 0; + + auto handle = at::addThreadLocalCallback( + at::RecordFunctionCallback( + [](const at::RecordFunction&) + -> std::unique_ptr { return nullptr; }, + [](const at::RecordFunction&, at::ObserverContext*) { + ++call_count; + }) + .scopes({at::RecordScope::FUNCTION})); + + EXPECT_EQ(call_count, 0); + + std::condition_variable cv; + std::mutex lock; + at::launch([&cv]() { + RECORD_FUNCTION("Test", {}); + cv.notify_all(); + }); + auto guard = std::unique_lock(lock); + cv.wait(guard, []{ return call_count > 0; }); + + EXPECT_EQ(call_count, 1); + + at::removeCallback(handle); + ASSERT_FALSE(at::hasCallbacks()); +} + +// Test sampling logic and validate that callbacks fire at the correct times. +TEST(RecordFunctionTest, Sampling) { + at::clearCallbacks(); + ASSERT_FALSE(at::hasCallbacks()); + + static int sample_test_counter; + sample_test_counter = 0; + + uint32_t seed = 12345; + double p = 0.25; + + at::set_record_function_seed_for_testing(seed); + std::mt19937 generator; + generator.seed(seed); + auto dist = std::geometric_distribution(p); + + // Make sure we know which steps should fire. 
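+  // The hard-coded draws below depend on the standard library's
+  // std::geometric_distribution implementation; the ASSERT_EQ loop that
+  // follows verifies them before expected_counts is built from them.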
+ auto outcomes = std::array{7, 0, 0, 6, 2}; + for (const auto i : c10::irange(outcomes.size())) { + ASSERT_EQ(dist(generator), outcomes[i]); + } + + std::vector expected_counts; + int running_count = 0; + for (const auto i : c10::irange(outcomes.size())) { + for (const auto j : c10::irange(outcomes[i])) { + expected_counts.push_back(running_count); + } + expected_counts.push_back(++running_count); + } + + auto start_callback = + [](const at::RecordFunction& fn) -> std::unique_ptr { + ++sample_test_counter; + return nullptr; + }; + auto end_callback = [](const at::RecordFunction& fn, at::ObserverContext*) {}; + + auto handle = at::addThreadLocalCallback( + at::RecordFunctionCallback(start_callback, end_callback) + .samplingProb(p) + .scopes({at::RecordScope::FUNCTION})); + + for (const auto i : c10::irange(expected_counts.size())) { + RECORD_FUNCTION("Test", {}); + EXPECT_EQ(sample_test_counter, expected_counts[i]); + } + + at::removeCallback(handle); + ASSERT_FALSE(at::hasCallbacks()); +} + +// Validate sampling against a simple reference implementation for a complex set +// of registered callbacks. +TEST(RecordFunctionTest, MultipleCallbacks) { + at::clearCallbacks(); + ASSERT_FALSE(at::hasCallbacks()); + + uint32_t seed = 54321; + + std::mt19937 generator; + generator.seed(seed); + + auto sample = [&](double p) { + return (p < 1.0 ? std::geometric_distribution(p)(generator) : 0) + 1; + }; + + std::array probabilities{0.1, 1.0, 1.0, 0.3}; + std::array next_call; + std::array counts; + static std::array counts_from_rec_fn; + counts_from_rec_fn.fill(0); + + auto start_callback_0 = + [](const at::RecordFunction& fn) -> std::unique_ptr { + ++counts_from_rec_fn[0]; + return nullptr; + }; + + auto end_callback = [](const at::RecordFunction& fn, at::ObserverContext*) {}; + +#define REGISTER_CALLBACK(register_fn, index) \ + register_fn(at::RecordFunctionCallback( \ + [](const at::RecordFunction& fn) \ + -> std::unique_ptr { \ + ++counts_from_rec_fn[index]; \ + return nullptr; \ + }, \ + end_callback) \ + .samplingProb(probabilities[index]) \ + .scopes({at::RecordScope::FUNCTION})) + + REGISTER_CALLBACK(at::addGlobalCallback, 0); + REGISTER_CALLBACK(at::addGlobalCallback, 1); + REGISTER_CALLBACK(at::addThreadLocalCallback, 2); + + // The RecordFunction machinery will rebuild callbacks whenever a new observer + // is registered, so we need to wait until the last callback to seed the + // random number generator. 
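+  // Seeding with the same `seed` as the reference `generator` above keeps the
+  // sampler and the reference draws in lockstep, so counts[j] can be compared
+  // exactly with counts_from_rec_fn[j] in the loop below.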
+ at::set_record_function_seed_for_testing(seed); + REGISTER_CALLBACK(at::addThreadLocalCallback, 3); +#undef REGISTER_CALLBACK + + for (const auto i : c10::irange(probabilities.size())) { + next_call[i] = sample(probabilities[i]); + } + + for (const auto i : c10::irange(50)) { + RECORD_FUNCTION("Test", {}); + for (const auto j : c10::irange(next_call.size())) { + if (!(--next_call[j])) { + ++counts[j]; + next_call[j] = sample(probabilities[j]); + } + EXPECT_EQ(counts[j], counts_from_rec_fn[j]); + } + } + + at::clearCallbacks(); + ASSERT_FALSE(at::hasCallbacks()); +} diff --git a/test/cpp/tensorexpr/CMakeLists.txt b/test/cpp/tensorexpr/CMakeLists.txt index 8fc5a0a18331..7dff70630d3e 100644 --- a/test/cpp/tensorexpr/CMakeLists.txt +++ b/test/cpp/tensorexpr/CMakeLists.txt @@ -23,6 +23,7 @@ set(TENSOREXPR_TEST_SRCS ${TENSOREXPR_TEST_ROOT}/test_simplify.cpp ${TENSOREXPR_TEST_ROOT}/test_te_fuser_pass.cpp ${TENSOREXPR_TEST_ROOT}/test_type.cpp + ${TENSOREXPR_TEST_ROOT}/test_type_specializations.cpp ) if(USE_CUDA) diff --git a/test/cpp/tensorexpr/test_base.h b/test/cpp/tensorexpr/test_base.h index 4a8e667de3ac..510cad450012 100644 --- a/test/cpp/tensorexpr/test_base.h +++ b/test/cpp/tensorexpr/test_base.h @@ -78,7 +78,7 @@ static void assertAllEqual(const std::vector& vec, const T& val) { template static void assertAllEqual(const std::vector& v1, const std::vector& v2) { ASSERT_EQ(v1.size(), v2.size()); - for (int i = 0; i < v1.size(); i++) { + for (size_t i = 0; i < v1.size(); ++i) { ASSERT_EQ(v1[i], v2[i]); } } diff --git a/test/cpp/tensorexpr/test_boundsinference.cpp b/test/cpp/tensorexpr/test_boundsinference.cpp index 7cabee0ce55e..a7df88b8ab99 100644 --- a/test/cpp/tensorexpr/test_boundsinference.cpp +++ b/test/cpp/tensorexpr/test_boundsinference.cpp @@ -49,8 +49,7 @@ TEST(BoundsInference, _1) { // {{b, kStore, 0, 99}, {a, kLoad, 0, 99}} ExprHandle n(100); BufHandle a("a", {n}, kFloat); - Tensor b = - Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); }); + Tensor b = Compute("b", {n}, [&](const VarHandle& i) { return a.load(i); }); LoopNest l({b}); auto bounds_info = inferBounds(l.root_stmt()); @@ -73,8 +72,7 @@ TEST(BoundsInference, _2) { // {{b, kStore, 0, n-1}, {a, kLoad, 0, n-1}} VarHandle n("n", kInt); BufHandle a("a", {n}, kFloat); - Tensor b = - Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); }); + Tensor b = Compute("b", {n}, [&](const VarHandle& i) { return a.load(i); }); LoopNest l({b}); auto bounds_info = inferBounds(l.root_stmt()); @@ -97,9 +95,8 @@ TEST(BoundsInference, _3) { // {{b, kStore, 0, 99}, {a, kLoad, 0, 109}} ExprHandle n(100); BufHandle a("a", {n + 10}, kFloat); - Tensor b = Compute("b", {{n, "i"}}, [&](const VarHandle& i) { - return a.load(i) * a.load(i + 10); - }); + Tensor b = Compute( + "b", {n}, [&](const VarHandle& i) { return a.load(i) * a.load(i + 10); }); LoopNest l({b}); auto bounds_info = inferBounds(l.root_stmt()); @@ -126,14 +123,12 @@ TEST(BoundsInference, _4) { ExprHandle W(320); ExprHandle H(200); BufHandle a("a", {H, W}, kFloat); - Tensor b = Compute( - "b", {{H, "y"}, {W, "x"}}, [&](const VarHandle& y, const VarHandle& x) { - return x * y; - }); - Tensor c = Compute( - "c", {{H, "y"}, {W, "x"}}, [&](const VarHandle& y, const VarHandle& x) { - return a.load(y, x) * b.load(y, x); - }); + Tensor b = Compute("b", {H, W}, [&](const VarHandle& y, const VarHandle& x) { + return x * y; + }); + Tensor c = Compute("c", {H, W}, [&](const VarHandle& y, const VarHandle& x) { + return a.load(y, x) * b.load(y, x); + }); 
LoopNest l({c}); std::vector loops = l.getLoopStmtsFor(c); StmtPtr body = l.getLoopBodyFor(c); @@ -204,8 +199,7 @@ TEST(BoundsInference, _5) { // b[i_tail + (100/16)*16] = a[i_tail + (100/16)*16]; ExprHandle n(100); BufHandle a("a", {n}, kFloat); - Tensor b = - Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); }); + Tensor b = Compute("b", {n}, [&](const VarHandle& i) { return a.load(i); }); LoopNest l({b}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -258,12 +252,11 @@ TEST(BoundsInference, _6) { ExprHandle CW(32); ExprHandle CH(20); BufHandle a("a", {H, W}, kFloat); - Tensor b = Compute( - "b", {{H, "y"}, {W, "x"}}, [&](const VarHandle& y, const VarHandle& x) { - return x * y; - }); - Tensor c = Compute( - "c", {{CH, "y"}, {CW, "x"}}, [&](const VarHandle& y, const VarHandle& x) { + Tensor b = Compute("b", {H, W}, [&](const VarHandle& y, const VarHandle& x) { + return x * y; + }); + Tensor c = + Compute("c", {CH, CW}, [&](const VarHandle& y, const VarHandle& x) { return a.load(y + 100, x + 100) * b.load(y * 2, x * 5); }); LoopNest l({c}); @@ -325,10 +318,9 @@ TEST(BoundsInference, _6) { TEST(BoundsInference, Adjacent) { ExprHandle H(6); BufHandle a("a", {20}, kFloat); - Tensor b = - Compute("b", {{H, "x"}}, [&](const VarHandle& x) { return a.load(x); }); - Tensor c = Compute( - "c", {{H, "x"}}, [&](const VarHandle& x) { return a.load(x + H); }); + Tensor b = Compute("b", {H}, [&](const VarHandle& x) { return a.load(x); }); + Tensor c = + Compute("c", {H}, [&](const VarHandle& x) { return a.load(x + H); }); LoopNest l({b, c}); std::vector loops = NodeFinder::find(l.root_stmt()); @@ -383,12 +375,11 @@ TEST(BoundsInference, Adjacent) { TEST(BoundsInference, MultipleTopLoopLoad) { BufHandle a("a", {100}, kFloat); - Tensor b = - Compute("b", {{64, "x"}}, [&](const VarHandle& x) { return a.load(x); }); - Tensor c = Compute( - "c", {{32, "x"}}, [&](const VarHandle& x) { return a.load(x + 10); }); - Tensor d = Compute( - "d", {{96, "x"}}, [&](const VarHandle& x) { return a.load(x + 2); }); + Tensor b = Compute("b", {64}, [&](const VarHandle& x) { return a.load(x); }); + Tensor c = + Compute("c", {32}, [&](const VarHandle& x) { return a.load(x + 10); }); + Tensor d = + Compute("d", {96}, [&](const VarHandle& x) { return a.load(x + 2); }); LoopNest l({b, c, d}); auto bounds_info = inferBounds(l.root_stmt()); @@ -496,16 +487,15 @@ TEST(BoundsInference, MultipleTopLoopStore) { } TEST(BoundsInference, CacheReads) { - Tensor A = Compute( - "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { - return i * j; - }); - Tensor B = Compute( - "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { + Tensor A = Compute("A", {64, 64}, [](const VarHandle& i, const VarHandle& j) { + return i * j; + }); + Tensor B = + Compute("B", {20, 10}, [&](const VarHandle& i, const VarHandle& j) { return A.load(i + 30, j + 3); }); - Tensor C = Compute( - "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { + Tensor C = + Compute("C", {20, 10}, [&](const VarHandle& i, const VarHandle& j) { return A.load(i + 10, j + 20) + A.load(i + 30, j + 40); }); @@ -562,7 +552,7 @@ TEST(BoundsInference, CacheReads) { TEST(BoundsInference, Flattened) { Tensor b = Compute( "b", - {{3, "z"}, {4, "y"}, {5, "x"}}, + {3, 4, 5}, [&](const VarHandle& z, const VarHandle& y, const VarHandle& x) { return x * y + z; }); @@ -637,14 +627,12 @@ TEST(BoundsInference, GetPotentialHazards) { } TEST(BoundsInference, GetPotentialHazardsLoopNoHazard) { - Tensor A = 
Compute( - "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { - return i * j; - }); - Tensor B = Compute( - "B", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { - return (i + 1) * (j + 1); - }); + Tensor A = Compute("A", {64, 64}, [](const VarHandle& i, const VarHandle& j) { + return i * j; + }); + Tensor B = Compute("B", {64, 64}, [](const VarHandle& i, const VarHandle& j) { + return (i + 1) * (j + 1); + }); LoopNest l({A, B}); @@ -663,12 +651,11 @@ TEST(BoundsInference, GetPotentialHazardsLoopNoHazard) { } TEST(BoundsInference, GetPotentialHazardsLoopCall) { - Tensor A = Compute( - "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { - return i * j; - }); - Tensor B = Compute( - "B", {{64, "i"}, {64, "j"}}, [&](const VarHandle& i, const VarHandle& j) { + Tensor A = Compute("A", {64, 64}, [](const VarHandle& i, const VarHandle& j) { + return i * j; + }); + Tensor B = + Compute("B", {64, 64}, [&](const VarHandle& i, const VarHandle& j) { return A.load(i, j) + 5; }); @@ -688,10 +675,9 @@ TEST(BoundsInference, GetPotentialHazardsLoopCall) { } TEST(BoundsInference, GetPotentialHazardsLoopSplit) { - Tensor A = Compute( - "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { - return i * j; - }); + Tensor A = Compute("A", {64, 64}, [](const VarHandle& i, const VarHandle& j) { + return i * j; + }); LoopNest l({A}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) diff --git a/test/cpp/tensorexpr/test_conv.cpp b/test/cpp/tensorexpr/test_conv.cpp index 4f43e4f8621c..cf458af02095 100644 --- a/test/cpp/tensorexpr/test_conv.cpp +++ b/test/cpp/tensorexpr/test_conv.cpp @@ -191,7 +191,7 @@ TEST(Conv, Conv2D) { te::Tensor conv = te::Reduce( "conv", - {{N, "n"}, {K, "k"}, {OH, "oh"}, {OW, "ow"}}, + {N, K, OH, OW}, te::Sum(), // FIXME: We have to use a `std::vector` parameter here and then unpack // it, because we don't have an overload allowing for an arbitrary number @@ -211,7 +211,7 @@ TEST(Conv, Conv2D) { }, // FIXME: If you forget one of the reduction dims, you get a segfault. // Could that be caught by a verifier? - {{C, "c"}, {R, "r"}, {S, "s"}}); + {C, R, S}); // FIXME: It'd be nice to have a single header that pulls in things like // LoopNest, IRSimplifier, etc. 
diff --git a/test/cpp/tensorexpr/test_cuda.cpp b/test/cpp/tensorexpr/test_cuda.cpp index feca646a657c..cc945834d7a5 100644 --- a/test/cpp/tensorexpr/test_cuda.cpp +++ b/test/cpp/tensorexpr/test_cuda.cpp @@ -37,9 +37,9 @@ static void testCudaTestVectorAdd01_impl() { Tensor c = Compute( "c", { - {num_iter, "n"}, - {block_count, "b_id"}, - {block_size, "t_id"}, + num_iter, + block_count, + block_size, }, [&](const VarHandle& n, const VarHandle& b_id, const VarHandle& t_id) { return a_buf.load(n, b_id, t_id) + b_buf.load(n, b_id, t_id); @@ -101,9 +101,9 @@ TEST(Cuda, Sigmoid_CUDA) { Tensor c = Compute( "c", { - {num_iter, "n"}, - {block_count, "b_id"}, - {block_size, "t_id"}, + num_iter, + block_count, + block_size, }, [&](const VarHandle& n, const VarHandle& b_id, const VarHandle& t_id) { return sigmoid(sigmoid(a_buf.load(n, b_id, t_id))); @@ -163,12 +163,9 @@ TEST(Cuda, TestVectorAdd01_CUDA) { static void testCudaTestVectorAdd02_impl(int64_t N, int64_t block_size) { BufHandle a_buf("a", {N}, kFloat); BufHandle b_buf("b", {N}, kFloat); - Tensor c = Compute( - "c", - { - {N, "N"}, - }, - [&](const VarHandle& n) { return a_buf.load(n) + b_buf.load(n); }); + Tensor c = Compute("c", {N}, [&](const VarHandle& n) { + return a_buf.load(n) + b_buf.load(n); + }); LoopNest l({c}); ForPtr n_inner; std::vector loops = l.getLoopStmtsFor(c); @@ -222,7 +219,7 @@ TEST(Cuda, TestVectorAdd02_CUDA) { TEST(Cuda, HalfCast_CUDA) { auto half = ToDtype(); BufHandle a("a", {4}, half); - Tensor b = Compute("b", {{4, "n"}}, [&](const VarHandle& i) { + Tensor b = Compute("b", {4}, [&](const VarHandle& i) { return Cast::make(kFloat, a.load(i)); }); @@ -263,8 +260,8 @@ TEST(Cuda, DynamicShape2D_CUDA) { VarHandle n("n", kInt); BufHandle a("a", {m, n}, kFloat); BufHandle b("b", {m, n}, kFloat); - Tensor c = Compute( - "c", {{m, "m"}, {n, "n"}}, [&](const VarHandle& i, const VarHandle& j) { + Tensor c = + Compute("c", {m, n}, [&](const VarHandle& i, const VarHandle& j) { return a.load(i, j) + b.load(i, j); }); LoopNest l({c}); @@ -326,9 +323,9 @@ TEST(Cuda, TestRand01_CUDA) { Tensor c = Compute( "c", { - {num_iter, "n"}, - {block_count, "b_id"}, - {block_size, "t_id"}, + num_iter, + block_count, + block_size, }, [&](const VarHandle& n, const VarHandle& b_id, const VarHandle& t_id) { return Intrinsics::make(IntrinsicsOp::kRand, kFloat); @@ -381,8 +378,8 @@ TEST(Cuda, DynamicShapeSplit_CUDA) { constexpr int64_t N = 4096; VarHandle n("n", kLong); BufHandle a("a", {n}, kFloat); - Tensor b = Compute( - "b", {{n, "n"}}, [&](const VarHandle& i) { return a.load(i) * 2.0f; }); + Tensor b = + Compute("b", {n}, [&](const VarHandle& i) { return a.load(i) * 2.0f; }); LoopNest l({b}); ForPtr inner; std::vector loops = l.getLoopStmtsFor(b); @@ -914,15 +911,15 @@ TEST(Cuda, LocalMemReduce_1_CUDA) { TEST(Cuda, HalfSupport_CUDA) { auto half = ToDtype(); BufHandle a("a", {4}, half); - Tensor b = Compute("b", {{4, "n"}}, [&](const VarHandle& i) { + Tensor b = Compute("b", {4}, [&](const VarHandle& i) { return Cast::make(half, ExprHandle(2.0f) * a.load(i)); }); - Tensor c = Compute("c", {{4, "n"}}, [&](const VarHandle& i) { + Tensor c = Compute("c", {4}, [&](const VarHandle& i) { return Cast::make(kFloat, Cast::make(half, ExprHandle(42)) + b.load(i)); }); - Tensor d = Compute("d", {{4, "n"}}, [&](const VarHandle& i) { + Tensor d = Compute("d", {4}, [&](const VarHandle& i) { return Cast::make(half, c.load(i)); }); @@ -971,7 +968,7 @@ TEST(Cuda, HalfSupport_CUDA) { TEST(Cuda, HalfPropagation_CUDA) { auto half = ToDtype(); BufHandle a("a", {4}, 
half); - Tensor relu = Compute("relu", {{4, "n"}}, [&](const VarHandle& i) { + Tensor relu = Compute("relu", {4}, [&](const VarHandle& i) { return Max::make(a.load(i), ExprHandle(alloc(0)), true); }); @@ -987,8 +984,8 @@ TEST(Cuda, HalfPropagation_CUDA) { const std::string& verification_pattern = R"IR( # CHECK: for ( -# CHECK: float v = float(a[n]); -# CHECK: relu[n] = half(Max(v, 0.f +# CHECK: float v = float(a[i]); +# CHECK: relu[i] = half(Max(v, 0.f # CHECK: })IR"; torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); @@ -1020,7 +1017,7 @@ TEST(Cuda, UnusedHalfArgument_CUDA) { BufHandle a("a", {4}, kFloat); auto half = ToDtype(); BufHandle b("b", {4}, half); - Tensor relu = Compute("relu", {{4, "n"}}, [&](const VarHandle& i) { + Tensor relu = Compute("relu", {4}, [&](const VarHandle& i) { return Max::make(a.load(i), ExprHandle(alloc(0)), true); }); @@ -1036,8 +1033,8 @@ TEST(Cuda, UnusedHalfArgument_CUDA) { const std::string& verification_pattern = R"IR( # CHECK: for ( -# CHECK: float v = a[n]; -# CHECK: relu[n] = Max(v, 0.f +# CHECK: float v = a[i]; +# CHECK: relu[i] = Max(v, 0.f # CHECK: })IR"; torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); @@ -1150,10 +1147,9 @@ TEST(Cuda, MaskBlockDim_CUDA) { int B_SIZE = 50; BufHandle a_buf("a", {A_SIZE}, kFloat); BufHandle b_buf("b", {B_SIZE}, kFloat); - Tensor c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { - return a_buf.load(i) + 10; - }); - Tensor d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) { + Tensor c = Compute( + "c", {A_SIZE}, [&](const VarHandle& i) { return a_buf.load(i) + 10; }); + Tensor d = Compute("d", {B_SIZE}, [&](const VarHandle& i) { return a_buf.load(i) + b_buf.load(i); }); @@ -1242,10 +1238,9 @@ TEST(Cuda, MaskThreadDim_CUDA) { int B_SIZE = 100; BufHandle a_buf("a", {A_SIZE}, kFloat); BufHandle b_buf("b", {B_SIZE}, kFloat); - Tensor c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { - return a_buf.load(i) + 10; - }); - Tensor d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) { + Tensor c = Compute( + "c", {A_SIZE}, [&](const VarHandle& i) { return a_buf.load(i) + 10; }); + Tensor d = Compute("d", {B_SIZE}, [&](const VarHandle& i) { return a_buf.load(i / 2) + b_buf.load(i); }); @@ -1336,10 +1331,9 @@ TEST(Cuda, MaskMultiBlockDim_CUDA) { int B_SIZE = 50; BufHandle a_buf("a", {A_SIZE}, kFloat); BufHandle b_buf("b", {B_SIZE}, kFloat); - Tensor c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { - return a_buf.load(i) + 10; - }); - Tensor d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) { + Tensor c = Compute( + "c", {A_SIZE}, [&](const VarHandle& i) { return a_buf.load(i) + 10; }); + Tensor d = Compute("d", {B_SIZE}, [&](const VarHandle& i) { return a_buf.load(i) + b_buf.load(i); }); @@ -1429,10 +1423,9 @@ TEST(Cuda, MaskBlockAndThreadDim_CUDA) { int B_SIZE = 50; BufHandle a_buf("a", {A_SIZE}, kFloat); BufHandle b_buf("b", {B_SIZE}, kFloat); - Tensor c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { - return a_buf.load(i) + 10; - }); - Tensor d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) { + Tensor c = Compute( + "c", {A_SIZE}, [&](const VarHandle& i) { return a_buf.load(i) + 10; }); + Tensor d = Compute("d", {B_SIZE}, [&](const VarHandle& i) { return a_buf.load(i) + b_buf.load(i); }); @@ -1522,15 +1515,11 @@ TEST(Cuda, MaskMultiDim_CUDA) { BufHandle a_buf("a", {OUTER_SIZE, A_SIZE}, kFloat); BufHandle b_buf("b", {OUTER_SIZE, B_SIZE}, kFloat); Tensor c = Compute( - "C", - {{OUTER_SIZE, "i"}, {A_SIZE, "j"}}, - 
[&](const VarHandle& i, const VarHandle& j) { + "C", {OUTER_SIZE, A_SIZE}, [&](const VarHandle& i, const VarHandle& j) { return ExprHandle(2) * a_buf.load(i, j); }); Tensor d = Compute( - "D", - {{OUTER_SIZE, "i"}, {B_SIZE, "j"}}, - [&](const VarHandle& i, const VarHandle& j) { + "D", {OUTER_SIZE, B_SIZE}, [&](const VarHandle& i, const VarHandle& j) { return c.load(i, j * 2) + b_buf.load(i, j); }); @@ -1651,15 +1640,11 @@ TEST(Cuda, MaskMultiDimSymbolic_CUDA) { BufHandle a_buf("a", {OUTER_SIZE, A_SIZE}, kFloat); BufHandle b_buf("b", {OUTER_SIZE, B_SIZE}, kFloat); Tensor c = Compute( - "C", - {{OUTER_SIZE, "i"}, {A_SIZE, "j"}}, - [&](const VarHandle& i, const VarHandle& j) { + "C", {OUTER_SIZE, A_SIZE}, [&](const VarHandle& i, const VarHandle& j) { return ExprHandle(2) * a_buf.load(i, j); }); Tensor d = Compute( - "D", - {{OUTER_SIZE, "i"}, {B_SIZE, "j"}}, - [&](const VarHandle& i, const VarHandle& j) { + "D", {OUTER_SIZE, B_SIZE}, [&](const VarHandle& i, const VarHandle& j) { return c.load(i, j * 2) + b_buf.load(i, j); }); @@ -2062,15 +2047,11 @@ TEST(Cuda, MaskMultiDimMultiAxis_CUDA) { BufHandle a_buf("a", {OUTER_SIZE, A_SIZE}, kFloat); BufHandle b_buf("b", {OUTER_SIZE, B_SIZE}, kFloat); Tensor c = Compute( - "C", - {{OUTER_SIZE, "i"}, {A_SIZE, "j"}}, - [&](const VarHandle& i, const VarHandle& j) { + "C", {OUTER_SIZE, A_SIZE}, [&](const VarHandle& i, const VarHandle& j) { return ExprHandle(2) * a_buf.load(i, j); }); Tensor d = Compute( - "D", - {{OUTER_SIZE, "i"}, {B_SIZE, "j"}}, - [&](const VarHandle& i, const VarHandle& j) { + "D", {OUTER_SIZE, B_SIZE}, [&](const VarHandle& i, const VarHandle& j) { return c.load(i, j * 2) + b_buf.load(i, j); }); @@ -2192,15 +2173,11 @@ TEST(Cuda, MaskMultiDimMultiLevel_CUDA) { BufHandle a_buf("a", {OUTER_A_SIZE, A_SIZE}, kFloat); BufHandle b_buf("b", {OUTER_B_SIZE, B_SIZE}, kFloat); Tensor c = Compute( - "C", - {{OUTER_A_SIZE, "i"}, {A_SIZE, "j"}}, - [&](const VarHandle& i, const VarHandle& j) { + "C", {OUTER_A_SIZE, A_SIZE}, [&](const VarHandle& i, const VarHandle& j) { return ExprHandle(2) * a_buf.load(i, j); }); Tensor d = Compute( - "D", - {{OUTER_B_SIZE, "i"}, {B_SIZE, "j"}}, - [&](const VarHandle& i, const VarHandle& j) { + "D", {OUTER_B_SIZE, B_SIZE}, [&](const VarHandle& i, const VarHandle& j) { return c.load(i, j * 2) + b_buf.load(i, j); }); diff --git a/test/cpp/tensorexpr/test_dynamic_shapes.cpp b/test/cpp/tensorexpr/test_dynamic_shapes.cpp index 46b55272ddf7..07b9872fb832 100644 --- a/test/cpp/tensorexpr/test_dynamic_shapes.cpp +++ b/test/cpp/tensorexpr/test_dynamic_shapes.cpp @@ -1,5 +1,7 @@ #include +#include +#include #include #include #include @@ -10,6 +12,7 @@ #include #include #include +#include namespace torch { namespace jit { @@ -626,5 +629,73 @@ TEST(DynamicShapes, GraphFromModel) { #endif } +TEST(DynamicShapes, MultiThreadedExecution) { +#ifdef TORCH_ENABLE_LLVM + const auto graph_template = R"IR( + graph(%x : Float(SS(-2), SS(-3), requires_grad=0, device=${device}), + %y : Float(SS(-2), SS(-3), requires_grad=0, device=${device}), + %SS_2 : int, + %SS_3 : int): + %3 : Float(SS(-2), SS(-3), requires_grad=0, device=${device}) = aten::tanh(%x) + %4 : Float(SS(-2), SS(-3), requires_grad=0, device=${device}) = aten::erf(%3) + %5 : Float(SS(-2), SS(-3), requires_grad=0, device=${device}) = aten::mul(%4, %y) + return (%5))IR"; + for (bool use_cuda : {false, true}) { + if (!torch::cuda::is_available() && use_cuda) { + continue; + } + auto device = use_cuda ? at::kCUDA : at::kCPU; + at::jit::TemplateEnv env; + env.s("device", use_cuda ? 
"cuda:0" : "cpu"); + const auto graph_string = format(graph_template, env); + std::shared_ptr graph = std::make_shared(); + torch::jit::parseIR(graph_string, graph.get()); + + std::vector symbolic_shape_inputs = {-2, -3}; + + std::vector input_desc = { + torch::jit::StrideInput::TENSOR_CONT}; + std::unordered_map< + const torch::jit::Value*, + std::vector> + symbolic_strides; + symbolic_strides[graph->inputs().at(0)] = input_desc; + symbolic_strides[graph->inputs().at(1)] = input_desc; + symbolic_strides[graph->outputs().at(0)] = input_desc; + + TensorExprKernel kernel( + graph, {}, symbolic_shape_inputs, false, symbolic_strides); + + auto run_kernel = [&](int dim1, int dim2) { + auto a = + at::rand({dim1, dim2}, at::TensorOptions(device).dtype(at::kFloat)); + auto b = + at::rand({dim1, dim2}, at::TensorOptions(device).dtype(at::kFloat)); + + auto ref = at::mul(at::erf(at::tanh(a)), b); + + std::vector stack = fmap(std::vector({a, b})); + stack.emplace_back(dim1); + stack.emplace_back(dim2); + kernel.run(stack); + + auto o = stack[0].toTensor(); + ASSERT_TRUE(at::allclose(o, ref)); + }; + + // Run the kernel in parallel to ensure that the run() method calls in + // TensorExprKernel are not changing any state. + constexpr size_t kNumThreads = 4; + std::vector threads; + for (size_t id = 0; id < kNumThreads; ++id) { + threads.emplace_back(run_kernel, id + 5, id + 20); + } + for (auto& t : threads) { + t.join(); + } + } +#endif +} + } // namespace jit } // namespace torch diff --git a/test/cpp/tensorexpr/test_expr.cpp b/test/cpp/tensorexpr/test_expr.cpp index 1b2a393fea7b..6a7a7e75704d 100644 --- a/test/cpp/tensorexpr/test_expr.cpp +++ b/test/cpp/tensorexpr/test_expr.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +42,133 @@ TEST(Expr, BasicValueTest02) { ASSERT_EQ(eval.value(), -4.0f); } +TEST(Expr, IsChannelsLastContiguous) { + std::vector vars = { + VarHandle("var1", kLong), + VarHandle("var2", kLong), + VarHandle("var3", kLong), + VarHandle("var4", kLong), + VarHandle("var5", kLong)}; + + // { + // key: ndims, + // value: [ + // ... + // [dim_2, dim_1, ..., dim_n] + // ] + // } + using shapGenInfo = std::unordered_map>>; + + // { + // size: [ExprHandle_1, ExprHandle_2, ..., ExprHandle_n], + // strides: [ + // ... 
+ // [ExprHandle_x, ExprHandle_y, ..., ExprHandle_z] + // ] + // } + using shapeInfo = + std::pair, std::vector>>; + + std::vector dims = {3, 4, 5}; + + std::unordered_map> dims_expr_vec_conf = { + {3, std::vector(vars.begin(), vars.begin() + 2)}, + {4, std::vector(vars.begin(), vars.begin() + 3)}, + {5, std::vector(vars.begin(), vars.begin() + 4)}, + }; + + shapGenInfo channels_last_cont_shape_conf = { + {3, {{1, 2, 0}}}, {4, {{1, 3, 2, 0}}}, {5, {{1, 4, 3, 2, 0}}}}; + shapGenInfo channels_last_non_cont_shape_conf = { + {3, {{2, 1, 0}, {1, 0, 2}}}, + {4, {{3, 1, 2, 0}, {1, 2, 3, 0}, {1, 0, 2, 3}}}, + {5, {{4, 3, 2, 1, 0}, {1, 3, 2, 4, 0}, {1, 4, 3, 2, 0}}}}; + + shapGenInfo cont_shape_conf = { + {3, {{0, 1, 2}}}, {4, {{0, 1, 2, 3}}}, {5, {{0, 1, 2, 3, 4}}}}; + + auto shape_gen_fn = [dims_expr_vec_conf]( + int ndims, shapGenInfo shape_gen_info) -> shapeInfo { + auto dims_expr_vec = dims_expr_vec_conf.at(ndims); + std::vector> strides_expr_vec; + for (size_t i = 0; i < strides_expr_vec.size(); i++) { + strides_expr_vec[i].resize(ndims); + } + + auto stride_gen_fn = [](int indicator, ExprHandle a, ExprHandle b) { + if (indicator % 2 == 0) { + return a * b; + } else { + return b * a; + } + }; + + auto stride_order_vec = shape_gen_info.at(ndims); + for (size_t i = 0; i < strides_expr_vec.size(); i++) { + auto stride_order = stride_order_vec[i]; + + strides_expr_vec[i][stride_order[0]] = 1; + for (size_t j = 1; j < stride_order.size(); j++) { + auto cur_dim_idx = stride_order[j]; + auto adjacent_dim_idx = stride_order[j - 1]; + + strides_expr_vec[i][cur_dim_idx] = stride_gen_fn( + i, + dims_expr_vec[adjacent_dim_idx], + strides_expr_vec[i][adjacent_dim_idx]); + } + } + + return {dims_expr_vec, strides_expr_vec}; + }; + + auto check_channels_last_fn = [](int ndims, BufHandle buf_handle) -> bool { + if (ndims == 3) { + return buf_handle.is_channels_last_1d_contiguous(); + } else if (ndims == 4) { + return buf_handle.is_contiguous(at::MemoryFormat::ChannelsLast); + } else { + return buf_handle.is_contiguous(at::MemoryFormat::ChannelsLast3d); + } + }; + + // channels-last contigous + for (size_t i = 0; i < dims.size(); i++) { + auto shape_info = shape_gen_fn(dims[i], channels_last_cont_shape_conf); + for (size_t j = 0; j < shape_info.second.size(); j++) { + BufHandle buf_handle("a", shape_info.first, shape_info.second[j], kFloat); + ASSERT_EQ(check_channels_last_fn(dims[i], buf_handle), true); + } + } + + // channels-last non-contigous + for (size_t i = 0; i < dims.size(); i++) { + auto shape_info = shape_gen_fn(dims[i], channels_last_non_cont_shape_conf); + for (size_t j = 0; j < shape_info.second.size(); j++) { + BufHandle buf_handle("a", shape_info.first, shape_info.second[j], kFloat); + ASSERT_EQ(check_channels_last_fn(dims[i], buf_handle), false); + } + } + + // contiguous + for (size_t i = 0; i < dims.size(); i++) { + auto shape_info = shape_gen_fn(dims[i], cont_shape_conf); + for (size_t j = 0; j < shape_info.second.size(); j++) { + BufHandle buf_handle("a", shape_info.first, shape_info.second[j], kFloat); + ASSERT_EQ(buf_handle.is_contiguous(), true); + } + } + + // non-contiguous + for (size_t i = 0; i < dims.size(); i++) { + auto shape_info = shape_gen_fn(dims[i], channels_last_cont_shape_conf); + for (size_t j = 0; j < shape_info.second.size(); j++) { + BufHandle buf_handle("a", shape_info.first, shape_info.second[j], kFloat); + ASSERT_EQ(buf_handle.is_contiguous(), false); + } + } +} + TEST(Expr, LetTest01) { VarHandle x("x", kFloat); ExprHandle body = ExprHandle(2.f) + (x * 
ExprHandle(3.f) + ExprHandle(4.f)); diff --git a/test/cpp/tensorexpr/test_external_calls.cpp b/test/cpp/tensorexpr/test_external_calls.cpp index b814ae344df3..88b75667b654 100644 --- a/test/cpp/tensorexpr/test_external_calls.cpp +++ b/test/cpp/tensorexpr/test_external_calls.cpp @@ -2,8 +2,17 @@ #include +#include +#include +#include +#include +#include + #include +#include +#include #include +#include #include #include #include @@ -11,6 +20,9 @@ #include #include +#include +#include + #include #include #include @@ -777,14 +789,14 @@ TEST(ExternalCall, ComputeInterop) { Tensor Input = Compute( "Input", - {{1, "n"}, {16, "c"}, {32, "h"}, {32, "w"}}, + {1, 16, 32, 32}, [&](const VarHandle& n, const VarHandle& c, const VarHandle& h, const VarHandle& w) { return FloatImm::make(5.0f); }); Tensor Weight = Compute( "Weight", - {{16, "n"}, {16, "c"}, {1, "kh"}, {1, "kw"}}, + {16, 16, 1, 1}, [&](const VarHandle& n, const VarHandle& c, const VarHandle& h, @@ -806,7 +818,7 @@ TEST(ExternalCall, ComputeInterop) { {})); Tensor Result = Compute( "Result", - {{1, "n"}, {16, "c"}, {32, "h"}, {32, "w"}}, + {1, 16, 32, 32}, [&](const VarHandle& n, const VarHandle& c, const VarHandle& h, @@ -866,14 +878,12 @@ TEST(ExternalCall, Inlining) { BufHandle MatmulResultBuf("MatmulResult", {8, 8}, kFloat); - Tensor A = Compute( - "A", {{8, "i"}, {8, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return FloatImm::make(5.0f); - }); - Tensor B = Compute( - "B", {{8, "i"}, {8, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return FloatImm::make(4.0f); - }); + Tensor A = Compute("A", {8, 8}, [&](const VarHandle& i, const VarHandle& j) { + return FloatImm::make(5.0f); + }); + Tensor B = Compute("B", {8, 8}, [&](const VarHandle& i, const VarHandle& j) { + return FloatImm::make(4.0f); + }); Tensor MatmulResult = Tensor( MatmulResultBuf.node(), ExternalCall::make( @@ -881,14 +891,12 @@ TEST(ExternalCall, Inlining) { "nnc_aten_matmul", {BufHandle(A.buf()), BufHandle(B.buf())}, {})); - Tensor Result = Compute( - "Result", - {{8, "i"}, {8, "j"}}, - [&](const VarHandle& i, const VarHandle& j) { + Tensor Result = + Compute("Result", {8, 8}, [&](const VarHandle& i, const VarHandle& j) { return MatmulResult.load(i, j) + FloatImm::make(3.0f); }); - StmtPtr root_stmt = alloc(std::vector( + StmtPtr root_stmt = alloc(std::vector( {A.stmt(), B.stmt(), MatmulResult.stmt(), Result.stmt()})); LoopNest l(root_stmt, {Result.buf()}); @@ -927,5 +935,131 @@ TEST(ExternalCall, Inlining) { ASSERT_TRUE(at::allclose(nnc_result, ref)); } +TEST(ExternalCall, JitCustomFusionOp) { + const char* custom_op_schema_literal = + "nnc_custom::add_mul(Tensor a, Tensor b, Tensor c) -> Tensor"; + const char* external_func_name = "nnc_add_mul"; + + auto add_mul_lowering_func = + [external_func_name]( + const std::vector& inputs, + const std::vector& output_shape, + const std::vector& output_strides, + const c10::optional& output_type, + at::Device device) { + auto output_dtype = Dtype(*output_type); + torch::jit::tensorexpr::BufHandle result_buf( + "nnc_add_mul_res_buf", output_shape, output_dtype); + const torch::jit::tensorexpr::BufHandle& a = + c10::get(inputs[0]); + const torch::jit::tensorexpr::BufHandle& b = + c10::get(inputs[1]); + const torch::jit::tensorexpr::BufHandle& c = + c10::get(inputs[1]); + torch::jit::tensorexpr::StmtPtr s = + torch::jit::tensorexpr::ExternalCall::make( + result_buf, external_func_name, {a, b, c}, {}); + return Tensor(result_buf.node(), s); + }; + + auto add_mul_external_func = [](int64_t bufs_num, + void** 
buf_data, + int64_t* buf_ranks, + int64_t* buf_dims, + int64_t* buf_strides, + int8_t* buf_dtypes, + int64_t args_num, + int64_t* extra_args) {}; + + torch::jit::RegisterOperators reg({Operator( + custom_op_schema_literal, + [](const Node* node) -> Operation { + return [](Stack& _stack) { + auto a = std::move(peek(_stack, 0, 3)).toTensor(); + auto b = std::move(peek(_stack, 1, 3)).toTensor(); + auto c = std::move(peek(_stack, 2, 3)).toTensor(); + drop(_stack, 3); + auto result = (a + b) * c; + pack(_stack, std::move(result)); + return 0; + }; + }, + c10::AliasAnalysisKind::FROM_SCHEMA)}); + + auto& custom_operator_set = torch::jit::tensorexpr::getCustomOperatorSet(); + custom_operator_set.insert({custom_op_schema_literal}); + + auto& te_lowering_registry = torch::jit::tensorexpr::getNNCLoweringRegistry(); + te_lowering_registry.insert( + parseSchema(custom_op_schema_literal), add_mul_lowering_func); + + auto& te_nnc_func_registry = torch::jit::tensorexpr::getNNCFunctionRegistry(); + te_nnc_func_registry[external_func_name] = add_mul_external_func; + + std::string graph_string = R"IR( + graph(%a : Float(10, 20, strides=[20, 1], device=cpu), + %b : Float(10, 20, strides=[20, 1], device=cpu), + %c : Float(10, 20, strides=[20, 1], device=cpu)): + %res : Float(10, 20, strides=[20, 1], device=cpu) = nnc_custom::add_mul(%a, %b, %c) + return (%res))IR"; + + auto graph = std::make_shared(); + torch::jit::parseIR(graph_string, graph.get()); + + std::string shape_compute_python_string = R"PY( + def computOutput(a: List[int], b: List[int], c: List[int]): + expandedSizes: List[int] = [] + dimsA = len(a) + dimsB = len(b) + dimsC = len(c) + ndim = max(dimsA, dimsB, dimsC) + for i in range(ndim): + offset = ndim - 1 - i + dimA = dimsA - 1 - offset + dimB = dimsB - 1 - offset + dimC = dimsC - 1 - offset + sizeA = a[dimA] if (dimA >= 0) else 1 + sizeB = b[dimB] if (dimB >= 0) else 1 + sizeC = a[dimC] if (dimC >= 0) else 1 + + if sizeA != sizeB and sizeB != sizeC and sizeA != 1 and sizeB != 1 and sizeC != 1: + # TODO: only assertion error is bound in C++ compilation right now + raise AssertionError( + "The size of tensor a {} must match the size of tensor b (" + "{} and c {}) at non-singleton dimension {}".format(sizeA, sizeB, sizeC, i) + ) + + expandedSizes.append(max(sizeA, sizeB, sizeC)) + + return expandedSizes + )PY"; + auto cu_ptr = torch::jit::compile(shape_compute_python_string); + torch::jit::GraphFunction* gf = + (torch::jit::GraphFunction*)&cu_ptr->get_function("computOutput"); + ASSERT_TRUE(gf); + +#ifdef TORCH_ENABLE_LLVM + auto static_graph_case = graph->copy(); + FuseTensorExprs(static_graph_case, 1); + torch::jit::testing::FileCheck() + .check("prim::TensorExprGroup_") + ->check("nnc_custom::add_mul") + ->run(*static_graph_case); + + auto dynamic_graph_case = graph->copy(); + auto custom_op = torch::jit::getOperatorForLiteral(custom_op_schema_literal); + ASSERT_TRUE(custom_op); + torch::jit::RegisterShapeComputeGraphForSchema( + custom_op->schema(), gf->graph()); + FuseTensorExprs(dynamic_graph_case, 1, false, true); + torch::jit::testing::FileCheck() + .check("prim::TensorExprGroup_") + ->check("nnc_custom::add_mul") + ->run(*dynamic_graph_case); +#else + torch::jit::testing::FileCheck().check("nnc_custom::add_mul")->run(*graph); +#endif +} + } // namespace jit } // namespace torch diff --git a/test/cpp/tensorexpr/test_ir_printer.cpp b/test/cpp/tensorexpr/test_ir_printer.cpp index 820f12689acc..2c98e093afcc 100644 --- a/test/cpp/tensorexpr/test_ir_printer.cpp +++ 
b/test/cpp/tensorexpr/test_ir_printer.cpp @@ -53,42 +53,36 @@ TEST(IRPrinter, FunctionName) { int N = 20; Tensor producer = Compute( - "producer", - {{M, "m"}, {N, "n"}}, - [&](const ExprHandle& m, const ExprHandle& n) { return m * n; }); + "producer", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { + return m * n; + }); Tensor chunk_0 = Compute( - "chunk", - {{M, "m"}, {N / 2, "n"}}, - [&](const ExprHandle& m, const ExprHandle& n) { + "chunk_0", {M, N / 2}, [&](const ExprHandle& m, const ExprHandle& n) { return producer.load(m, n); }); Tensor chunk_1 = Compute( - "chunk", - {{M, "m"}, {N / 2, "n"}}, - [&](const ExprHandle& m, const ExprHandle& n) { + "chunk_1", {M, N / 2}, [&](const ExprHandle& m, const ExprHandle& n) { return producer.load(m, n + ExprHandle(N / 2)); }); Tensor consumer = Compute( - "consumer", - {{M, "i"}, {N / 2, "j"}}, - [&](const ExprHandle& i, const ExprHandle& j) { + "consumer", {M, N / 2}, [&](const ExprHandle& i, const ExprHandle& j) { return i * chunk_1.load(i, j); }); LoopNest l({chunk_0, chunk_1, consumer}); - auto body = l.root_stmt(); + auto body = LoopNest::sanitizeNames(l.root_stmt()); std::stringstream ss; ss << *body; const std::string& verification_pattern = R"IR( - # CHECK: for (int i - # CHECK: for (int j - # CHECK: consumer[i, j] = i * (chunk_1[i, j])IR"; + # CHECK: for (int i_2 + # CHECK: for (int j_2 + # CHECK: consumer[i_2, j_2] = i_2 * (chunk_1[i_2, j_2])IR"; torch::jit::testing::FileCheck().run(verification_pattern, ss.str()); } diff --git a/test/cpp/tensorexpr/test_kernel.cpp b/test/cpp/tensorexpr/test_kernel.cpp index 2ae99ef58b22..fc755ed6caae 100644 --- a/test/cpp/tensorexpr/test_kernel.cpp +++ b/test/cpp/tensorexpr/test_kernel.cpp @@ -647,6 +647,7 @@ TEST_F(Kernel, CatWithEmptyInputs) { } TEST_F(Kernel, CatWoConditionals) { + bool old_cat_wo_conditionals = getCatWoConditionals(); getCatWoConditionals() = true; const auto graph_string = R"IR( graph(%a : Float(5, 3, 2, strides=[6, 2, 1], device=cpu), @@ -702,7 +703,7 @@ TEST_F(Kernel, CatWoConditionals) { for (const auto i : c10::irange(num_el)) { CHECK_EQ(((float*)o.data_ptr())[i], ((float*)ref.data_ptr())[i]); } - getCatWoConditionals() = false; + getCatWoConditionals() = old_cat_wo_conditionals; } TEST_F(Kernel, OptimizeConditionals) { @@ -1597,12 +1598,14 @@ TEST_F(Kernel, CodegenInspection) { Tensor lowerNanToNum( const std::vector& inputs, const std::vector& outputShape, + const std::vector& outputStrides, const c10::optional& outputType, at::Device device) { auto input_buf = c10::get(inputs[0]); auto e = Compute( "custom_nan_to_num", - c10::fmap(outputShape), + outputShape, + outputStrides, [&](const std::vector& axes) { std::vector indices(axes.begin(), axes.end()); auto load = input_buf.load(indices); @@ -1836,5 +1839,228 @@ graph(%x : int, ASSERT_TRUE(at::equal(stack[3].toTensor(), xt * yt)); } +TEST_F(Kernel, FuseLoopsWithVariableBounds) { +#ifdef TORCH_ENABLE_LLVM + bool old_cat_wo_conditionals = getCatWoConditionals(); + getCatWoConditionals() = true; + const auto graph_string = R"IR( + graph(%a : Float(SS(-2), 3, SS(-3), requires_grad=0, device=cpu), + %b : Float(SS(-2), 7, SS(-3), requires_grad=0, device=cpu), + %c : Float(SS(-2), 9, SS(-3), requires_grad=0, device=cpu), + %SS_2 : int, + %SS_3 : int): + %dim : int = prim::Constant[value=1]() + %inputs : Tensor[] = prim::ListConstruct(%a, %b, %c) + %r : Float(SS(-2), 19, SS(-3), requires_grad=0, device=cpu) = aten::cat(%inputs, %dim) # new size: [5,19,2] + return (%r))IR"; + std::shared_ptr graph = std::make_shared(); + 
torch::jit::parseIR(graph_string, graph.get()); + + std::vector symbolic_shape_inputs = {-2, -3}; + + std::vector input_desc = { + torch::jit::StrideInput::TENSOR_CONT}; + std::unordered_map< + const torch::jit::Value*, + std::vector> + symbolic_strides; + symbolic_strides[graph->inputs().at(0)] = input_desc; + symbolic_strides[graph->inputs().at(1)] = input_desc; + symbolic_strides[graph->inputs().at(2)] = input_desc; + symbolic_strides[graph->outputs().at(0)] = input_desc; + + TensorExprKernel kernel( + graph, {}, symbolic_shape_inputs, false, symbolic_strides); + + std::ostringstream oss; + oss << *kernel.getCodeGenStmt(); + const std::string& verification_pattern = + R"IR( +# CHECK: for (int64_t i +# CHECK-NEXT: for (int64_t j +# CHECK-NEXT: for (int64_t k +# CHECK: for (int64_t j +# CHECK-NEXT: for (int64_t k +# CHECK: for (int64_t j +# CHECK-NEXT: for (int64_t k +# CHECK-NOT: for (int64_t i + )IR"; + torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); + + auto run_kernel = [&](int dim1, int dim2) { + auto a = + at::rand({dim1, 3, dim2}, at::TensorOptions(kCPU).dtype(at::kFloat)); + auto b = + at::rand({dim1, 7, dim2}, at::TensorOptions(kCPU).dtype(at::kFloat)); + auto c = + at::rand({dim1, 9, dim2}, at::TensorOptions(kCPU).dtype(at::kFloat)); + + auto ref = at::cat({a, b, c}, 1); + + std::vector stack = + fmap(std::vector({a, b, c})); + stack.emplace_back(dim1); + stack.emplace_back(dim2); + kernel.run(stack); + + auto o = stack[0].toTensor(); + ASSERT_TRUE(at::allclose(o, ref)); + }; + + run_kernel(10, 20); + getCatWoConditionals() = old_cat_wo_conditionals; +#endif +} + +TEST_F(Kernel, FuseLoopsWithVariableConcatDim) { +#ifdef TORCH_ENABLE_LLVM + bool old_cat_wo_conditionals = getCatWoConditionals(); + getCatWoConditionals() = true; + const auto graph_string = R"IR( + graph(%a : Float(SS(-2), SS(-4), SS(-3), requires_grad=0, device=cpu), + %b : Float(SS(-2), SS(-4), SS(-3), requires_grad=0, device=cpu), + %c : Float(SS(-2), SS(-4), SS(-3), requires_grad=0, device=cpu), + %SS_2 : int, + %SS_3 : int, + %SS_4 : int, + %SS_5 : int): + %dim : int = prim::Constant[value=1]() + %inputs : Tensor[] = prim::ListConstruct(%a, %b, %c) + %r : Float(SS(-2), SS(-5), SS(-3), requires_grad=0, device=cpu) = aten::cat(%inputs, %dim) # new size: [5,19,2] + return (%r))IR"; + std::shared_ptr graph = std::make_shared(); + torch::jit::parseIR(graph_string, graph.get()); + + std::vector symbolic_shape_inputs = {-2, -3, -4, -5}; + + std::vector input_desc = { + torch::jit::StrideInput::TENSOR_CONT}; + std::unordered_map< + const torch::jit::Value*, + std::vector> + symbolic_strides; + symbolic_strides[graph->inputs().at(0)] = input_desc; + symbolic_strides[graph->inputs().at(1)] = input_desc; + symbolic_strides[graph->inputs().at(2)] = input_desc; + symbolic_strides[graph->outputs().at(0)] = input_desc; + + TensorExprKernel kernel( + graph, {}, symbolic_shape_inputs, false, symbolic_strides); + + std::ostringstream oss; + oss << *kernel.getCodeGenStmt(); + const std::string& verification_pattern = + R"IR( +# CHECK: for (int64_t i +# CHECK-NEXT: for (int64_t j +# CHECK-NEXT: for (int64_t k +# CHECK: for (int64_t j +# CHECK-NEXT: for (int64_t k +# CHECK: for (int64_t j +# CHECK-NEXT: for (int64_t k +# CHECK-NOT: for (int64_t i + )IR"; + torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); + + auto run_kernel = [&](int dim1, int dim2, int dim3) { + auto a = + at::rand({dim1, dim3, dim2}, at::TensorOptions(kCPU).dtype(at::kFloat)); + auto b = + at::rand({dim1, dim3, dim2}, 
at::TensorOptions(kCPU).dtype(at::kFloat)); + auto c = + at::rand({dim1, dim3, dim2}, at::TensorOptions(kCPU).dtype(at::kFloat)); + + auto ref = at::cat({a, b, c}, 1); + + std::vector stack = + fmap(std::vector({a, b, c})); + stack.emplace_back(dim1); + stack.emplace_back(dim2); + stack.emplace_back(dim3); + stack.emplace_back(3 * dim3); + kernel.run(stack); + + auto o = stack[0].toTensor(); + ASSERT_TRUE(at::allclose(o, ref)); + }; + + run_kernel(10, 20, 15); + getCatWoConditionals() = old_cat_wo_conditionals; +#endif +} + +TEST_F(Kernel, DoNotFuseLoopsWithMismatchingVariableDims) { +#ifdef TORCH_ENABLE_LLVM + bool old_cat_wo_conditionals = getCatWoConditionals(); + getCatWoConditionals() = true; + const auto graph_string = R"IR( + graph(%a : Float(SS(-2), SS(-4), SS(-3), requires_grad=0, device=cpu), + %b : Float(SS(-2), SS(-5), SS(-3), requires_grad=0, device=cpu), + %SS_2 : int, + %SS_3 : int, + %SS_4 : int, + %SS_5 : int, + %SS_6 : int): + %dim : int = prim::Constant[value=1]() + %inputs : Tensor[] = prim::ListConstruct(%a, %b) + %r : Float(SS(-2), SS(-6), SS(-3), requires_grad=0, device=cpu) = aten::cat(%inputs, %dim) # new size: [5,19,2] + return (%r))IR"; + std::shared_ptr graph = std::make_shared(); + torch::jit::parseIR(graph_string, graph.get()); + + std::vector symbolic_shape_inputs = {-2, -3, -4, -5, -6}; + + std::vector input_desc = { + torch::jit::StrideInput::TENSOR_CONT}; + std::unordered_map< + const torch::jit::Value*, + std::vector> + symbolic_strides; + symbolic_strides[graph->inputs().at(0)] = input_desc; + symbolic_strides[graph->inputs().at(1)] = input_desc; + symbolic_strides[graph->outputs().at(0)] = input_desc; + + TensorExprKernel kernel( + graph, {}, symbolic_shape_inputs, false, symbolic_strides); + + std::ostringstream oss; + oss << *kernel.getCodeGenStmt(); + const std::string& verification_pattern = + R"IR( +# CHECK: for (int64_t i +# CHECK-NEXT: for (int64_t j +# CHECK-NEXT: for (int64_t k +# CHECK: for (int64_t j +# CHECK-NEXT: for (int64_t k +# CHECK-NOT: for (int64_t j +# CHECK-NOT: for (int64_t i + )IR"; + torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); + + auto run_kernel = [&](int dim2, int dim3, int dim4, int dim5) { + auto a = + at::rand({dim2, dim4, dim3}, at::TensorOptions(kCPU).dtype(at::kFloat)); + auto b = + at::rand({dim2, dim5, dim3}, at::TensorOptions(kCPU).dtype(at::kFloat)); + + auto ref = at::cat({a, b}, 1); + + std::vector stack = fmap(std::vector({a, b})); + stack.emplace_back(dim2); + stack.emplace_back(dim3); + stack.emplace_back(dim4); + stack.emplace_back(dim5); + stack.emplace_back(dim4 + dim5); + kernel.run(stack); + + auto o = stack[0].toTensor(); + ASSERT_TRUE(at::allclose(o, ref)); + }; + + run_kernel(10, 20, 15, 8); + getCatWoConditionals() = old_cat_wo_conditionals; +#endif +} + } // namespace jit } // namespace torch diff --git a/test/cpp/tensorexpr/test_llvm.cpp b/test/cpp/tensorexpr/test_llvm.cpp index 52464a6d0afa..520ae6301ceb 100644 --- a/test/cpp/tensorexpr/test_llvm.cpp +++ b/test/cpp/tensorexpr/test_llvm.cpp @@ -584,8 +584,7 @@ DOUBLE_INTRINSICS_TEST(lgamma, 4) TEST(LLVM, VectorizerLoadStoreTest) { BufHandle a("A", {1}, kInt); - Tensor c = - Compute("c", {{4, "i"}}, [&](const VarHandle& i) { return a.load(i); }); + Tensor c = Compute("c", {4}, [&](const VarHandle& i) { return a.load(i); }); BufHandle c_buf(c.buf()); LoopNest l({c}); @@ -606,7 +605,7 @@ TEST(LLVM, VectorizerLoadStoreTest) { TEST(LLVM, VectorizeBitCast) { BufHandle a("A", {128}, kInt); - Tensor c = Compute("c", {{128, "i"}}, 
[&](const VarHandle& i) { + Tensor c = Compute("c", {128}, [&](const VarHandle& i) { return bitcast(a.load(i)); }); @@ -1186,9 +1185,8 @@ TEST(LLVM, StoreFloat) { TEST(LLVM, SimpleMath01) { const int N = 1024; - Tensor tensor = Compute("f", {{N, "i"}}, [](const VarHandle& i) { - return cast(i * i + 1); - }); + Tensor tensor = Compute( + "f", {N}, [](const VarHandle& i) { return cast(i * i + 1); }); LoopNest l({tensor}); StmtPtr stmt = l.root_stmt(); BufHandle f_buf(tensor.buf()); @@ -1209,9 +1207,8 @@ TEST(LLVM, ComputeMul) { const int N = 1024; BufHandle a("a", {N}, kFloat); BufHandle b("b", {N}, kFloat); - Tensor c = Compute("c", {{N, "i"}}, [&](const VarHandle& i) { - return a.load(i) * b.load(i); - }); + Tensor c = Compute( + "c", {N}, [&](const VarHandle& i) { return a.load(i) * b.load(i); }); BufHandle c_buf(c.buf()); LoopNest l({c}); @@ -1232,10 +1229,9 @@ TEST(LLVM, BroadcastAdd) { const int N = 1024; BufHandle a("a", {M, N}, kFloat); BufHandle b("b", {N}, kFloat); - Tensor c = Compute( - "c", {{M, "i"}, {N, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return a.load(i, j) + b.load(j); - }); + Tensor c = Compute("c", {M, N}, [&](const VarHandle& i, const VarHandle& j) { + return a.load(i, j) + b.load(j); + }); BufHandle c_buf(c.buf()); LoopNest l({c}); @@ -1333,9 +1329,8 @@ TEST(LLVM, TensorDynamicShapeAdd) { VarHandle n("n", kInt); BufHandle a("a", {n}, kFloat); BufHandle b("b", {n}, kFloat); - Tensor c = Compute("c", {{n, "n"}}, [&](const VarHandle& i) { - return a.load(i) + b.load(i); - }); + Tensor c = Compute( + "c", {n}, [&](const VarHandle& i) { return a.load(i) + b.load(i); }); LoopNest l({c}); StmtPtr s = l.root_stmt(); LLVMCodeGen cg(s, {a, b, c, n}); @@ -1356,8 +1351,8 @@ TEST(LLVM, DynamicShape2D) { VarHandle n("n", kInt); BufHandle a("a", {m, n}, kFloat); BufHandle b("b", {m, n}, kFloat); - Tensor c = Compute( - "c", {{m, "m"}, {n, "n"}}, [&](const VarHandle& i, const VarHandle& j) { + Tensor c = + Compute("c", {m, n}, [&](const VarHandle& i, const VarHandle& j) { return a.load(i, j) + b.load(i, j); }); LoopNest l({c}); @@ -1386,7 +1381,7 @@ TEST(LLVM, EmptyStmt) { TEST(LLVM, EliminatedStmt) { BufHandle a("a", {1}, kFloat); - Tensor c = Compute("c", {{0, "m"}}, [&](const VarHandle& m) { return m; }); + Tensor c = Compute("c", {0}, [&](const VarHandle& m) { return m; }); LoopNest l({c}); l.prepareForCodegen(); @@ -1405,10 +1400,7 @@ TEST(LLVM, SimpleReduction) { BufHandle a("a", {1, M, N}, kFloat); - // TODO: why doesn't implicit vector work? - std::vector axis = {DimArg(1)}; - std::vector reduce_axis = {DimArg(M), DimArg(N)}; - Tensor b = Reduce("sum", axis, Sum(), a, reduce_axis); + Tensor b = Reduce("sum", {1}, Sum(), a, {M, N}); LoopNest loop({b}); loop.prepareForCodegen(); @@ -1442,10 +1434,7 @@ TEST(LLVM, RFactorReduction) { BufHandle a("a", {1, M, N}, kFloat); - // TODO: why doesn't implicit vector work? 
- std::vector axis = {DimArg(1)}; - std::vector reduce_axis = {DimArg(M), DimArg(N)}; - Tensor b = Reduce("sum", axis, Sum(), a, reduce_axis); + Tensor b = Reduce("sum", {1}, Sum(), a, {M, N}); LoopNest loop({b}); std::vector loops = loop.getLoopStmtsFor(b); @@ -1490,7 +1479,7 @@ TEST(LLVM, RFactorVectorizedReduction) { BufHandle a("a", {1, M, N}, kFloat); - Tensor b = Reduce("sum", {{1, "K"}}, Sum(), a, {{M, "M"}, {N, "N"}}); + Tensor b = Reduce("sum", {1}, Sum(), a, {M, N}); LoopNest loopnest({b}); std::vector loops = loopnest.getLoopStmtsFor(b); // Reorder n and m loops @@ -1536,10 +1525,9 @@ static void testSimpleParallel() { // parallel or sequential. const int M = 4; const int N = 6; - Tensor f = Compute( - "f", {{M, "m"}, {N, "n"}}, [](const VarHandle& m, const VarHandle& n) { - return cast(m + n); - }); + Tensor f = Compute("f", {M, N}, [](const VarHandle& m, const VarHandle& n) { + return cast(m + n); + }); LoopNest loop_nest({f}); auto const& loops = loop_nest.getLoopStmtsFor(f); ForPtr m = loops[0]; @@ -1588,20 +1576,14 @@ TEST(LLVM, CompositeParallel) { for (const auto test_cfg : c10::irange(test_count)) { int M = 5; int N = 7; - Tensor t1 = - Compute("t1", {{M, "M"}}, [](const VarHandle& m) { return m + 1.f; }); - Tensor t2 = - Compute("t2", {{N, "N"}}, [](const VarHandle& n) { return n + 2.f; }); - Tensor t3 = Compute( - "t3", - {{M, "M"}, {N, "N"}}, - [=](const VarHandle& m, const VarHandle& n) { + Tensor t1 = Compute("t1", {M}, [](const VarHandle& m) { return m + 1.f; }); + Tensor t2 = Compute("t2", {N}, [](const VarHandle& n) { return n + 2.f; }); + Tensor t3 = + Compute("t3", {M, N}, [=](const VarHandle& m, const VarHandle& n) { return t1.load(m) * t2.load(n); }); - Tensor t4 = Compute( - "t4", - {{M, "M"}, {N, "N"}}, - [=](const VarHandle& m, const VarHandle& n) { + Tensor t4 = + Compute("t4", {M, N}, [=](const VarHandle& m, const VarHandle& n) { return t3.load(m, n) + m + n; }); LoopNest loop_nest({t4}, {t1, t2, t3, t4}); @@ -1657,12 +1639,12 @@ TEST(LLVM, VectorizedGEMM) { BufHandle BP("B", {K, N}, kFloat); Tensor CT = Reduce( "gemm", - {{M, "M"}, {N, "N"}}, + {M, N}, Sum(), [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); }, - {{K, "K"}}); + {K}); LoopNest loop({CT}); { @@ -1735,10 +1717,9 @@ TEST(LLVM, CallRaw) { VarHandle N("N", kInt); BufHandle a("a", {M, N}, kFloat); BufHandle b("b", {N}, kFloat); - Tensor c = Compute( - "c", {{M, "i"}, {N, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return a.load(i, j) + b.load(j); - }); + Tensor c = Compute("c", {M, N}, [&](const VarHandle& i, const VarHandle& j) { + return a.load(i, j) + b.load(j); + }); LoopNest l({c}); l.prepareForCodegen(); @@ -1776,7 +1757,7 @@ TEST(LLVM, CustomTarget) { BufHandle a("a", {M}, kFloat); BufHandle b("b", {M}, kFloat); BufHandle c("c", {M}, kFloat); - Tensor d = Compute("d", {{M, "m"}}, [&](const VarHandle& m) { + Tensor d = Compute("d", {M}, [&](const VarHandle& m) { return a.load(m) * b.load(m) + c.load(m); }); LoopNest nest({d}); diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp index becf3bdffbac..f2609b0f4166 100644 --- a/test/cpp/tensorexpr/test_loopnest.cpp +++ b/test/cpp/tensorexpr/test_loopnest.cpp @@ -41,8 +41,8 @@ void checkExprIR(const ExprHandle& e, const std::string& pattern) { } TEST(LoopNest, ExprSimple01) { - Tensor tensor = Compute( - "f", {{16, "X"}, {5, "y"}}, [](const VarHandle& x, const VarHandle& y) { + Tensor tensor = + Compute("f", {16, 5}, [](const VarHandle& 
x, const VarHandle& y) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y; }); LoopNest l({tensor}); @@ -53,8 +53,8 @@ TEST(LoopNest, ExprSimple01) { } TEST(LoopNest, ExprLower01) { - Tensor tensor = Compute( - "f", {{16, "x"}, {5, "y"}}, [](const VarHandle& x, const VarHandle& y) { + Tensor tensor = + Compute("f", {16, 5}, [](const VarHandle& x, const VarHandle& y) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y; }); LoopNest l({tensor}); @@ -69,7 +69,7 @@ TEST(LoopNest, ExprSimple02) { auto func = [](const ExprHandle& x, const ExprHandle& y) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y; }; - Tensor tensor = Compute("f", {{26, "x"}, {5, "y"}}, func); + Tensor tensor = Compute("f", {26, 5}, func); LoopNest l({tensor}); std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); @@ -83,10 +83,10 @@ TEST(LoopNest, ExprSimple02) { { // Compare to a reference loop structure structure. - VarHandle x_outer("x_outer", kInt); - VarHandle x_inner("x_inner", kInt); - VarHandle y("y", kInt); - VarHandle x_tail("x_tail", kInt); + VarHandle x_outer("i_outer", kInt); + VarHandle x_inner("i_inner", kInt); + VarHandle y("i", kInt); + VarHandle x_tail("i_tail", kInt); BufHandle f("f", {26, 5}, kFloat); ExprHandle x_1 = x_outer * 4 + x_inner; ExprHandle x_outer_end = (ExprHandle(26) - 0) / 4; @@ -162,7 +162,7 @@ TEST(LoopNest, ExprSliceHeadWithLoopOptions) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {10}, func); LoopNest l({tensor}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; @@ -185,7 +185,7 @@ TEST(LoopNest, ExprSliceTailWithLoopOptions) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {10}, func); LoopNest l({tensor}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; @@ -217,7 +217,7 @@ TEST(LoopNest, ExprSliceHeadWhenFactorEqualsSize) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {10}, func); LoopNest l({tensor}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; @@ -237,7 +237,7 @@ TEST(LoopNest, ExprSliceHeadWhenFactorLargerThanSize) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {10}, func); LoopNest l({tensor}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; @@ -257,7 +257,7 @@ TEST(LoopNest, ExprSliceHead) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {10}, func); LoopNest l({tensor}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; @@ -279,7 +279,7 @@ TEST(LoopNest, ExprSliceHeadWithNonZeroStart) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {10}, func); LoopNest l({tensor}); std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); @@ -305,7 +305,7 @@ TEST(LoopNest, ExprSliceTailWhenFactorEqualsSize) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {10}, func); LoopNest l({tensor}); // 
NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; @@ -327,7 +327,7 @@ TEST(LoopNest, ExprSliceTailWhenFactorLargerThanSize) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {10}, func); LoopNest l({tensor}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; @@ -347,7 +347,7 @@ TEST(LoopNest, ExprSliceTail) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {10}, func); LoopNest l({tensor}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; @@ -372,7 +372,7 @@ TEST(LoopNest, ExprSplitAndSlice) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor tensor = Compute("f", {{100, "x"}}, func); + Tensor tensor = Compute("f", {100}, func); LoopNest l({tensor}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -424,7 +424,7 @@ TEST(LoopNest, ExprSliceAndNormalize) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {10}, func); LoopNest l({tensor}); std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); @@ -455,7 +455,7 @@ TEST(LoopNest, ExprSliceWithVariableDimension) { const std::vector>& expected_for_ranges) { VarHandle dim("dim", kInt); Tensor tensor = - Compute("f", {{dim, "x"}}, [](const ExprHandle& x) { return x; }); + Compute("f", {dim}, [](const ExprHandle& x) { return x; }); LoopNest l({tensor}); std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); @@ -492,7 +492,7 @@ TEST(LoopNest, ExprSplitWithTail) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor tensor = Compute("f", {{199, "x"}}, func); + Tensor tensor = Compute("f", {199}, func); LoopNest l({tensor}); std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) @@ -521,7 +521,7 @@ TEST(LoopNest, ExprSplitWithTailNone) { auto func = [](const ExprHandle& x, const ExprHandle& y) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y; }; - Tensor tensor = Compute("f", {{24, "x"}, {5, "y"}}, func); + Tensor tensor = Compute("f", {24, 5}, func); LoopNest l({tensor}); std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::splitWithTail(loops[0], 4); @@ -534,10 +534,10 @@ TEST(LoopNest, ExprSplitWithTailNone) { { // Compare to a reference loop structure structure. 
- VarHandle x_outer("x_outer", kInt); - VarHandle x_inner("x_inner", kInt); - VarHandle y("y", kInt); - VarHandle x_tail("x_tail", kInt); + VarHandle x_outer("i_outer", kInt); + VarHandle x_inner("i_inner", kInt); + VarHandle y("i", kInt); + VarHandle x_tail("i_tail", kInt); // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks,cppcoreguidelines-avoid-magic-numbers) BufHandle f("f", {24, 5}, kFloat); ExprHandle x_1 = x_outer * 4 + x_inner; @@ -579,8 +579,8 @@ TEST(LoopNest, ExprSplitWithMask01) { const int N = 5; BufHandle a_buf("a", {M, N}, kFloat); BufHandle b_buf("b", {M, N}, kFloat); - Tensor tensor = Compute( - "f", {{M, "m"}, {N, "n"}}, [&](const ExprHandle& m, const ExprHandle& n) { + Tensor tensor = + Compute("f", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { return a_buf.load(m, n) + b_buf.load(m, n) + 1.0f; }); @@ -613,7 +613,7 @@ TEST(LoopNest, ExprSplitWithMaskRepeatedNoMask) { const int M = 64; BufHandle a_buf("a", {M}, kFloat); BufHandle b_buf("b", {M}, kFloat); - Tensor tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) { + Tensor tensor = Compute("f", {M}, [&](const ExprHandle& m) { return a_buf.load(m) + b_buf.load(m) + 1.0f; }); @@ -697,8 +697,8 @@ TEST(LoopNest, TileSimple) { const int M = 64, N = 64; BufHandle a_buf("a", {M, N}, kFloat); BufHandle b_buf("b", {M, N}, kFloat); - Tensor tensor = Compute( - "f", {{M, "m"}, {N, "n"}}, [&](const ExprHandle& m, const ExprHandle& n) { + Tensor tensor = + Compute("f", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { return a_buf.load({m, n}) + b_buf.load({m, n}) + 1.0f; }); @@ -710,13 +710,13 @@ TEST(LoopNest, TileSimple) { // IR check StmtPtr stmt = IRSimplifier::simplify(l.root_stmt()); checkIR(stmt, R"IR( -# CHECK: for (int m_outer -# CHECK: for (int n_outer -# CHECK: for (int m_inner -# CHECK: for (int n_inner +# CHECK: for (int i_outer +# CHECK: for (int i_outer_1 +# CHECK: for (int i_inner +# CHECK: for (int i_inner_1 # CHECK: f[ -# CHECK-NOT: for (int n_tail -# CHECK-NOT: for (int m_tail)IR"); +# CHECK-NOT: for (int i_tail +# CHECK-NOT: for (int i_tail)IR"); // Correctness check PaddedBuffer a_v(M, N, "a"); @@ -742,8 +742,8 @@ TEST(LoopNest, TileWithTails) { const int M = 64, N = 64; BufHandle a_buf("a", {M, N}, kFloat); BufHandle b_buf("b", {M, N}, kFloat); - Tensor tensor = Compute( - "f", {{M, "m"}, {N, "n"}}, [&](const ExprHandle& m, const ExprHandle& n) { + Tensor tensor = + Compute("f", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { return a_buf.load({m, n}) + b_buf.load({m, n}) + 1.0f; }); @@ -755,14 +755,14 @@ TEST(LoopNest, TileWithTails) { // IR check StmtPtr stmt = IRSimplifier::simplify(l.root_stmt()); checkIR(stmt, R"IR( -# CHECK: for (int m_outer -# CHECK: for (int n_outer -# CHECK: for (int m_inner -# CHECK: for (int n_inner +# CHECK: for (int i_outer +# CHECK: for (int i_outer_1 +# CHECK: for (int i_inner +# CHECK: for (int i_inner_1 # CHECK: f[ -# CHECK: for (int m_inner +# CHECK: for (int i_inner # CHECK: f[ -# CHECK: for (int m_tail)IR"); +# CHECK: for (int i_tail)IR"); // Correctness check PaddedBuffer a_v(M, N, "a"); @@ -790,7 +790,7 @@ TEST(LoopNest, TileInMiddle) { BufHandle b_buf("b", {M, N, L, K}, kFloat); Tensor tensor = Compute( "f", - {{M, "m"}, {N, "n"}, {L, "l"}, {K, "k"}}, + {M, N, L, K}, [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& l, @@ -807,18 +807,18 @@ TEST(LoopNest, TileInMiddle) { // IR check StmtPtr stmt = IRSimplifier::simplify(nest.root_stmt()); checkIR(stmt, R"IR( -# CHECK: for (int m -# CHECK: for (int n_outer -# CHECK: for 
(int l_outer -# CHECK: for (int n_inner -# CHECK: for (int l_inner -# CHECK: for (int k +# CHECK: for (int i +# CHECK: for (int i_outer +# CHECK: for (int i_outer_1 +# CHECK: for (int i_inner +# CHECK: for (int i_inner_1 +# CHECK: for (int i_1 # CHECK: f[ -# CHECK: for (int l_tail -# CHECK: for (int n_inner -# CHECK: for (int k +# CHECK: for (int i_tail_1 +# CHECK: for (int i_inner_1 +# CHECK: for (int i_1 # CHECK: f[ -# CHECK: for (int n_tail)IR"); +# CHECK: for (int i_tail)IR"); // Correctness check PaddedBuffer a_v(M, N, L, K, "a"); @@ -847,7 +847,7 @@ TEST(LoopNest, SplitWithTailWithLoopOptions) { const int M = 21; BufHandle a_buf("a", {M}, kFloat); BufHandle b_buf("b", {M}, kFloat); - Tensor tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) { + Tensor tensor = Compute("f", {M}, [&](const ExprHandle& m) { return a_buf.load(m) + b_buf.load(m) + 1.0f; }); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -877,7 +877,7 @@ TEST(LoopNest, SplitWithMaskWithLoopOptions) { const int M = 21; BufHandle a_buf("a", {M}, kFloat); BufHandle b_buf("b", {M}, kFloat); - Tensor tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) { + Tensor tensor = Compute("f", {M}, [&](const ExprHandle& m) { return a_buf.load(m) + b_buf.load(m) + 1.0f; }); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -905,7 +905,7 @@ TEST(LoopNest, ScheduleBroadcastAddBuffer) { BufHandle b_buf("b", {N, K}, kFloat); Tensor c = Compute( "broadcast_add", - {{M, "m"}, {N, "n"}, {K, "k"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) + b_buf.load(n, k); }); @@ -953,13 +953,13 @@ TEST(LoopNest, ScheduleFunctionCall01) { BufHandle b_buf("b", {N, K}, kFloat); Tensor c = Compute( "broadcast_add", - {{M, "m"}, {N, "n"}, {K, "k"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) + b_buf.load(n, k); }); Tensor d = Compute( "d", - {{M, "m"}, {N, "n"}, {K, "k"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return c.load(m, n, k) + 1; }); @@ -1012,13 +1012,13 @@ TEST(LoopNest, ScheduleInlineSimple) { Tensor x = Compute( "x", - {{M, "m1"}, {N, "n1"}, {K, "k1"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) * b_buf.load(n, k); }); Tensor y = Compute( "y", - {{M, "m2"}, {N, "n2"}, {K, "k2"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return c_buf.load(m, n) * d_buf.load(m, k) + x.load(m, n, k); }); @@ -1092,19 +1092,19 @@ void InlineFunc01Helper(const std::vector& inline_order) { Tensor x = Compute( "x", - {{M, "m1"}, {N, "n1"}, {K, "k1"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) * b_buf.load(n, k); }); Tensor y = Compute( "y", - {{M, "m2"}, {N, "n2"}, {K, "k2"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return c_buf.load(m, n) * d_buf.load(m, k) + x.load(m, n, k); }); Tensor z = Compute( "z", - {{M, "m3"}, {N, "n3"}, {K, "k3"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return x.load(m, n, k) + y.load(m, n, k); }); @@ -1171,7 +1171,7 @@ void InlineFunc01Helper(const std::vector& inline_order) { if (inline_order.size() == 2) { Tensor z2 = Compute( "z", - {{M, "m3"}, {N, "n3"}, {K, "k3"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) * b_buf.load(n, k) + (c_buf.load(m, n) * d_buf.load(m, k) + 
@@ -1206,13 +1206,13 @@ TEST(LoopNest, ScheduleInlineRandom) { Tensor x = Compute( "x", - {{M, "m1"}, {N, "n1"}, {K, "k1"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return Mod::make(Intrinsics::make(kRand, kInt), 5); }); Tensor y = Compute( "y", - {{M, "m2"}, {N, "n2"}, {K, "k2"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return x.load(m, n, k) + x.load(m, n, k); }); @@ -1226,11 +1226,11 @@ TEST(LoopNest, ScheduleInlineRandom) { // Check the IR we produced checkIR(stmt1, R"IR( -# CHECK: for (int m2 = 0; m2 < 4; m2++) -# CHECK: for (int n2 = 0; n2 < 5; n2++) -# CHECK: for (int k2 = 0; k2 < 6; k2++) +# CHECK: for (int i = 0; i < 4; i++) +# CHECK: for (int i_1 = 0; i_1 < 5; i_1++) +# CHECK: for (int i_2 = 0; i_2 < 6; i_2++) # CHECK: int x = rand(); -# CHECK: y[m2, n2, k2] = 2 * (x % 5);)IR"); +# CHECK: y[i, i_1, i_2] = 2 * (x % 5);)IR"); } // Make sure we don't cache random vars that are not being inlined. @@ -1241,13 +1241,13 @@ TEST(LoopNest, ScheduleInlineRandomUnrelated) { Tensor x = Compute( "x", - {{M, "m1"}, {N, "n1"}, {K, "k1"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return m * n * k; }); Tensor y = Compute( "y", - {{M, "m2"}, {N, "n2"}, {K, "k2"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return x.load(m, n, k) + Intrinsics::make(kRand, kInt) + Intrinsics::make(kRand, kInt); @@ -1262,10 +1262,10 @@ TEST(LoopNest, ScheduleInlineRandomUnrelated) { // Check the IR we produced checkIR(stmt1, R"IR( -# CHECK: for (int m2 = 0; m2 < 4; m2++) -# CHECK: for (int n2 = 0; n2 < 5; n2++) -# CHECK: for (int k2 = 0; k2 < 6; k2++) -# CHECK: y[m2, n2, k2] = ((k2 * m2) * n2 + (rand())) + (rand());)IR"); +# CHECK: for (int i = 0; i < 4; i++) +# CHECK: for (int i_1 = 0; i_1 < 5; i_1++) +# CHECK: for (int i_2 = 0; i_2 < 6; i_2++) +# CHECK: y[i, i_1, i_2] = ((i * i_1) * i_2 + (rand())) + (rand());)IR"); } // Make sure we generate the right number of random values == the dimensionality @@ -1275,12 +1275,12 @@ TEST(LoopNest, ScheduleInlineRandomLowerDimensions) { const int N = 5; const int K = 6; - Tensor x = Compute("x", {{M, "m1"}}, [&](const VarHandle& m) { + Tensor x = Compute("x", {M}, [&](const VarHandle& m) { return Mod::make(Intrinsics::make(kRand, kInt), 5); }); Tensor y = Compute( "y", - {{M, "m2"}, {N, "n2"}, {K, "k2"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return x.load(m) + x.load(m); }); @@ -1294,11 +1294,11 @@ TEST(LoopNest, ScheduleInlineRandomLowerDimensions) { // Check the IR we produced checkIR(stmt1, R"IR( -# CHECK: for (int m2 = 0; m2 < 4; m2++) +# CHECK: for (int i = 0; i < 4; i++) # CHECK: int x = rand(); -# CHECK: for (int n2 = 0; n2 < 5; n2++) -# CHECK: for (int k2 = 0; k2 < 6; k2++) -# CHECK: y[m2, n2, k2] = 2 * (x % 5);)IR"); +# CHECK: for (int i_1 = 0; i_1 < 5; i_1++) +# CHECK: for (int i_2 = 0; i_2 < 6; i_2++) +# CHECK: y[i, i_1, i_2] = 2 * (x % 5);)IR"); } // Make sure we don't screw up intrinsics thinking they're rand. 
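Across these hunks the change is mechanical: Compute (and, further down, Reduce) now takes bare dimension extents such as {M, N, K} instead of {extent, "name"} pairs, so the lowered loops carry compiler-generated index names (i, i_1, i_2, ...) and every FileCheck pattern is updated to match. A minimal sketch of the new call style, written as a hypothetical test that assumes the includes and helpers already present in test_loopnest.cpp; the test name is illustrative and not part of the patch:

// Illustrative only: dims-only Compute, lowered and printed.
TEST(LoopNest, SketchDimsOnlyCompute) {
  const int M = 4, N = 5, K = 6;
  BufHandle a_buf("a", {M, N}, kFloat);
  BufHandle b_buf("b", {N, K}, kFloat);

  // Extents only; no per-axis name strings.
  Tensor x = Compute(
      "x",
      {M, N, K},
      [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
        return a_buf.load(m, n) * b_buf.load(n, k);
      });

  LoopNest l({x});
  l.prepareForCodegen();
  StmtPtr s = IRSimplifier::simplify(l.root_stmt());

  // The printed loops use generated index names (e.g. i, i_1, i_2),
  // which is what the updated CHECK patterns in this file key on.
  std::ostringstream oss;
  oss << *s;
}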
@@ -1311,13 +1311,13 @@ TEST(LoopNest, ScheduleInlineIntrinsics) { Tensor x = Compute( "x", - {{M, "m1"}, {N, "n1"}, {K, "k1"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) * b_buf.load(n, k); }); Tensor y = Compute( "y", - {{M, "m2"}, {N, "n2"}, {K, "k2"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return Intrinsics::make(kSqrt, x.load(m, n, k)); }); @@ -1369,13 +1369,13 @@ TEST(LoopNest, ScheduleInlineRandWithIntrinsics) { Tensor x = Compute( "x", - {{M, "m1"}, {N, "n1"}, {K, "k1"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return Intrinsics::make(kRand, kFloat); }); Tensor y = Compute( "y", - {{M, "m2"}, {N, "n2"}, {K, "k2"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return Intrinsics::make(kSqrt, x.load(m, n, k)); }); @@ -1387,20 +1387,18 @@ TEST(LoopNest, ScheduleInlineRandWithIntrinsics) { // Check the IR we produced checkIR(stmt1, R"IR( -# CHECK: for (int m2 = 0; m2 < 4; m2++) -# CHECK: for (int n2 = 0; n2 < 5; n2++) -# CHECK: for (int k2 = 0; k2 < 6; k2++) +# CHECK: for (int i = 0; i < 4; i++) +# CHECK: for (int i_1 = 0; i_1 < 5; i_1++) +# CHECK: for (int i_2 = 0; i_2 < 6; i_2++) # CHECK: float x = rand(); -# CHECK: y[m2, n2, k2] = sqrt(x);)IR"); +# CHECK: y[i, i_1, i_2] = sqrt(x);)IR"); } // Split a Compute then inline it into another compute. TEST(LoopNest, ScheduleSplitAThenInline) { - Tensor a = - Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) { - return a.load(j + ExprHandle(8)); - }); + Tensor a = Compute("a", {18}, [&](const VarHandle& i) { return i * i; }); + Tensor b = Compute( + "b", {2}, [&](const VarHandle& j) { return a.load(j + ExprHandle(8)); }); LoopNest l({b}, {a, b}); std::vector loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0); @@ -1410,11 +1408,9 @@ TEST(LoopNest, ScheduleSplitAThenInline) { // Split a Compute then inline another Compute into it. TEST(LoopNest, ScheduleSplitBThenInline) { - Tensor a = - Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { - return a.load(j + ExprHandle(8)); - }); + Tensor a = Compute("a", {18}, [&](const VarHandle& i) { return i * i; }); + Tensor b = Compute( + "b", {6}, [&](const VarHandle& j) { return a.load(j + ExprHandle(8)); }); LoopNest l({b}, {a, b}); std::vector loops = l.getAllLoopNestsWritingToBuf(b.buf()).at(0); @@ -1434,11 +1430,9 @@ TEST(LoopNest, ScheduleSplitBThenInline) { // Split a Compute twice then inline it. TEST(LoopNest, ScheduleSplitTwiceThenInline) { - Tensor a = - Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) { - return a.load(j + ExprHandle(8)); - }); + Tensor a = Compute("a", {18}, [&](const VarHandle& i) { return i * i; }); + Tensor b = Compute( + "b", {2}, [&](const VarHandle& j) { return a.load(j + ExprHandle(8)); }); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr i_inner; @@ -1451,11 +1445,9 @@ TEST(LoopNest, ScheduleSplitTwiceThenInline) { // Inline a Compute, then split. 
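That is the recipe the next few Schedule*Inline* tests exercise; condensed into a single hypothetical sketch (assuming the includes already in test_loopnest.cpp, and not part of the patch):

// Illustrative only: inline producer `a` into consumer `b`, then split b's loop.
TEST(LoopNest, SketchInlineThenSplit) {
  Tensor a = Compute("a", {18}, [&](const VarHandle& i) { return i * i; });
  Tensor b = Compute(
      "b", {6}, [&](const VarHandle& j) { return a.load(j + ExprHandle(8)); });

  LoopNest l({b}, {a, b});    // b is the output, a is an intermediate
  l.computeInline(a.buf());   // substitute a's body into b

  auto loops = l.getAllLoopNestsWritingToBuf(b.buf()).at(0);
  LoopNest::splitWithTail(loops[0], 4);  // 6 iterations = one chunk of 4 plus a tail of 2

  l.prepareForCodegen();
  StmtPtr s = IRSimplifier::simplify(l.root_stmt());
}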
TEST(LoopNest, ScheduleInlineThenSplit) { - Tensor a = - Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { - return a.load(j + ExprHandle(8)); - }); + Tensor a = Compute("a", {18}, [&](const VarHandle& i) { return i * i; }); + Tensor b = Compute( + "b", {6}, [&](const VarHandle& j) { return a.load(j + ExprHandle(8)); }); LoopNest l({b}, {a, b}); l.computeInline(a.buf()); @@ -1475,11 +1467,9 @@ TEST(LoopNest, ScheduleInlineThenSplit) { // Split a Compute, inline it, then split the result. TEST(LoopNest, ScheduleSplitInlineThenSplit) { - Tensor a = - Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor b = Compute("b", {{16, "j"}}, [&](const VarHandle& j) { - return a.load(j + ExprHandle(8)); - }); + Tensor a = Compute("a", {18}, [&](const VarHandle& i) { return i * i; }); + Tensor b = Compute( + "b", {16}, [&](const VarHandle& j) { return a.load(j + ExprHandle(8)); }); LoopNest l({b}, {a, b}); auto loops = NodeFinder::find(l.root_stmt()); @@ -1501,12 +1491,11 @@ TEST(LoopNest, ScheduleSplitInlineThenSplit) { // Oversplit a loop that is simplified out after inlining. TEST(LoopNest, ScheduleSplitInlineSimplify) { - Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) { + Tensor a = Compute("a", {18}, [&](const VarHandle& i) { return ExprHandle(4) * i - ExprHandle(2) * i; }); - Tensor b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) { - return a.load(j) - ExprHandle(1); - }); + Tensor b = Compute( + "b", {2}, [&](const VarHandle& j) { return a.load(j) - ExprHandle(1); }); LoopNest l({b}, {a, b}); std::vector loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0); @@ -1516,15 +1505,12 @@ TEST(LoopNest, ScheduleSplitInlineSimplify) { // Inline a Compute with two consumers. TEST(LoopNest, ScheduleInlineThreeMixedOnce) { - Tensor a = - Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { - return a.load(j + ExprHandle(8)); + Tensor a = Compute("a", {18}, [&](const VarHandle& i) { return i * i; }); + Tensor b = Compute( + "b", {6}, [&](const VarHandle& j) { return a.load(j + ExprHandle(8)); }); + Tensor c = Compute("c", {4, 3}, [&](const VarHandle& k, const VarHandle& l) { + return a.load(k) * b.load(l); }); - Tensor c = Compute( - "c", {{4, "k"}, {3, "l"}}, [&](const VarHandle& k, const VarHandle& l) { - return a.load(k) * b.load(l); - }); LoopNest l({c}, {a, b, c}); std::vector loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0); @@ -1545,15 +1531,12 @@ TEST(LoopNest, ScheduleInlineThreeMixedOnce) { // Inline Compute A into B, then inline B into C. 
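The ThreeMixed variants below add a second consumer on top of that setup; a condensed hypothetical sketch of the two-step inline, under the same assumptions as the sketch above:

// Illustrative only: producer a, mid-level tensor b, output c; a feeds both b and c.
TEST(LoopNest, SketchInlineThroughTwoConsumers) {
  Tensor a = Compute("a", {18}, [&](const VarHandle& i) { return i * i; });
  Tensor b = Compute(
      "b", {6}, [&](const VarHandle& j) { return a.load(j + ExprHandle(8)); });
  Tensor c = Compute("c", {4, 3}, [&](const VarHandle& k, const VarHandle& l) {
    return a.load(k) * b.load(l);  // a is consumed by both b and c
  });

  LoopNest l({c}, {a, b, c});
  l.computeInline(a.buf());   // a folds into b and c
  l.computeInline(b.buf());   // then b folds into c
  l.prepareForCodegen();
  StmtPtr s = IRSimplifier::simplify(l.root_stmt());
}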
TEST(LoopNest, ScheduleInlineThreeMixedTwice) { - Tensor a = - Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { - return a.load(j + ExprHandle(8)); + Tensor a = Compute("a", {18}, [&](const VarHandle& i) { return i * i; }); + Tensor b = Compute( + "b", {6}, [&](const VarHandle& j) { return a.load(j + ExprHandle(8)); }); + Tensor c = Compute("c", {4, 3}, [&](const VarHandle& k, const VarHandle& l) { + return a.load(k) * b.load(l); }); - Tensor c = Compute( - "c", {{4, "k"}, {3, "l"}}, [&](const VarHandle& k, const VarHandle& l) { - return a.load(k) * b.load(l); - }); LoopNest l({c}, {a, b, c}); std::vector loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0); @@ -1575,15 +1558,12 @@ TEST(LoopNest, ScheduleInlineThreeMixedTwice) { // Inline a Compute that is both a producer and consumer. TEST(LoopNest, ScheduleInlineThreeMixedInner) { - Tensor a = - Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { - return a.load(j + ExprHandle(8)); + Tensor a = Compute("a", {18}, [&](const VarHandle& i) { return i * i; }); + Tensor b = Compute( + "b", {6}, [&](const VarHandle& j) { return a.load(j + ExprHandle(8)); }); + Tensor c = Compute("c", {4, 3}, [&](const VarHandle& k, const VarHandle& l) { + return a.load(k) * b.load(l); }); - Tensor c = Compute( - "c", {{4, "k"}, {3, "l"}}, [&](const VarHandle& k, const VarHandle& l) { - return a.load(k) * b.load(l); - }); LoopNest l({c}, {a, b, c}); std::vector loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0); @@ -1604,15 +1584,12 @@ TEST(LoopNest, ScheduleInlineThreeMixedInner) { // Split 3 Computes, then inline the first two into the last. TEST(LoopNest, ScheduleInlineThreeMixedSplit) { - Tensor a = - Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { - return a.load(j + ExprHandle(8)); + Tensor a = Compute("a", {18}, [&](const VarHandle& i) { return i * i; }); + Tensor b = Compute( + "b", {6}, [&](const VarHandle& j) { return a.load(j + ExprHandle(8)); }); + Tensor c = Compute("c", {4, 3}, [&](const VarHandle& k, const VarHandle& l) { + return a.load(k) * b.load(l); }); - Tensor c = Compute( - "c", {{4, "k"}, {3, "l"}}, [&](const VarHandle& k, const VarHandle& l) { - return a.load(k) * b.load(l); - }); LoopNest l({c}, {a, b, c}); std::vector loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0); @@ -1633,13 +1610,13 @@ TEST(LoopNest, ScheduleInlineOutputTensors) { Tensor x = Compute( "x", - {{M, "m1"}, {N, "n1"}, {K, "k1"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return m * n * k; }); Tensor y = Compute( "y", - {{M, "m2"}, {N, "n2"}, {K, "k2"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return x.load(m, n, k) + m; }); @@ -1653,14 +1630,14 @@ TEST(LoopNest, ScheduleInlineOutputTensors) { // Check the IR we produced checkIR(stmt1, R"IR( -# CHECK: for (int m1 = 0; m1 < 4; m1++) -# CHECK: for (int n1 = 0; n1 < 5; n1++) -# CHECK: for (int k1 = 0; k1 < 6; k1++) -# CHECK: x[m1, n1, k1] = (k1 * m1) * n1; -# CHECK: for (int m2 = 0; m2 < 4; m2++) -# CHECK: for (int n2 = 0; n2 < 5; n2++) -# CHECK: for (int k2 = 0; k2 < 6; k2++) -# CHECK: y[m2, n2, k2] = (k2 * m2) * n2 + m2;)IR"); +# CHECK: for (int i = 0; i < 4; i++) +# CHECK: for (int i_1 = 0; i_1 < 5; i_1++) +# CHECK: for (int i_2 = 0; i_2 < 6; i_2++) +# CHECK: x[i, i_1, i_2] = (i * i_1) * 
i_2; +# CHECK: for (int i_3 = 0; i_3 < 4; i_3++) +# CHECK: for (int i_4 = 0; i_4 < 5; i_4++) +# CHECK: for (int i_5 = 0; i_5 < 6; i_5++) +# CHECK: y[i_3, i_4, i_5] = i_3 + (i_3 * i_4) * i_5;)IR"); } TEST(LoopNest, ScheduleInlineWithCompoundIndices) { @@ -1790,13 +1767,13 @@ TEST(LoopNest, ScheduleFuserStyle) { BufHandle a_buf("A", {ExprHandle(kTotalSize)}, kFloat); - Tensor b = Compute( - "f", {{kTotalSize, "i"}}, [&](const std::vector& axes) { + Tensor b = + Compute("f", {kTotalSize}, [&](const std::vector& axes) { return a_buf.load(axes[0]) + 11.0f; }); - Tensor c = Compute( - "g", {{kTotalSize, "i"}}, [&](const std::vector& axes) { + Tensor c = + Compute("g", {kTotalSize}, [&](const std::vector& axes) { return b.load(axes[0]) + 1.0f; }); @@ -1825,13 +1802,13 @@ TEST(LoopNest, ScheduleFuserThreeArg) { BufHandle c("C", {ExprHandle(kTotalSize)}, kFloat); BufHandle d("D", {ExprHandle(kTotalSize)}, kFloat); - Tensor e = Compute("e", {{kTotalSize, "i"}}, [&](const VarHandle& i) { + Tensor e = Compute("e", {kTotalSize}, [&](const VarHandle& i) { return a.load(i) + b.load(i); }); - Tensor f = Compute("f", {{kTotalSize, "i"}}, [&](const VarHandle& i) { + Tensor f = Compute("f", {kTotalSize}, [&](const VarHandle& i) { return e.load(i) + c.load(i); }); - Tensor g = Compute("g", {{kTotalSize, "i"}}, [&](const VarHandle& i) { + Tensor g = Compute("g", {kTotalSize}, [&](const VarHandle& i) { return f.load(i) + d.load(i); }); @@ -1859,8 +1836,8 @@ TEST(LoopNest, ScheduleDynamicShape2D) { VarHandle n("n", kInt); BufHandle a("a", {m, n}, kFloat); BufHandle b("b", {m, n}, kFloat); - Tensor c = Compute( - "c", {{m, "m"}, {n, "n"}}, [&](const VarHandle& i, const VarHandle& j) { + Tensor c = + Compute("c", {m, n}, [&](const VarHandle& i, const VarHandle& j) { return a.load(i, j) + b.load(i, j); }); LoopNest l({c}); @@ -1893,10 +1870,9 @@ TEST(LoopNest, LoopNestComputeAt_1) { // should not be inlined into B. Instead, it should be computed into the temp, // and the temp should be used in B. VarHandle N("N", kInt); - Tensor A = Compute( - "A", {{N, "i_a"}}, [&](const VarHandle& i_a) { return i_a * i_a; }); - Tensor B = Compute( - "B", {{N, "i_b"}}, [&](const VarHandle& i_b) { return A.load(i_b); }); + Tensor A = Compute("A", {N}, [&](const VarHandle& i_a) { return i_a * i_a; }); + Tensor B = + Compute("B", {N}, [&](const VarHandle& i_b) { return A.load(i_b); }); LoopNest l({B}, {A, B}); std::vector loops = l.getAllLoopNestsWritingToBuf(B.buf()).at(0); LoopNest::computeAt(l.getLoopBodyFor(A), loops[0]); @@ -1906,10 +1882,10 @@ TEST(LoopNest, LoopNestComputeAt_1) { checkIR(s, R"IR( # CHECK: Allocate(temp); // dtype=int, dims=[1] -# CHECK: for (int i_b = 0; i_b < N; i_b++) +# CHECK: for (int i = 0; i < N; i++) # CHECK: temp[ # CHECK-NOT: A[ -# CHECK: B[i_b] = temp[0] +# CHECK: B[i_1] = temp[0] # CHECK: Free(temp))IR"); // Now check that the loop still produces the correct result. 
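The LoopNestComputeAt hunks that follow all check the same transformation: the producer is computed into a small temp buffer allocated inside the consumer's loop instead of being inlined into its expression. A condensed hypothetical sketch of that call sequence, again assuming the file's existing includes:

// Illustrative only: compute producer A at the consumer loop of B.
TEST(LoopNest, SketchComputeAt) {
  VarHandle N("N", kInt);
  Tensor A = Compute("A", {N}, [&](const VarHandle& i) { return i * i; });
  Tensor B = Compute("B", {N}, [&](const VarHandle& i) { return A.load(i); });

  LoopNest l({B}, {A, B});
  auto loops = l.getAllLoopNestsWritingToBuf(B.buf()).at(0);
  // A's values are now stored to a temp allocated inside B's loop,
  // rather than A being inlined into B's expression.
  LoopNest::computeAt(l.getLoopBodyFor(A), loops[0]);

  l.prepareForCodegen();
  SimpleIREvaluator cg(l.root_stmt(), {B, N});
}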
@@ -1942,13 +1918,11 @@ TEST(LoopNest, LoopNestComputeAt_2) { VarHandle W("W", kInt); VarHandle H("H", kInt); Tensor p = Compute( - "prod", - {{H + 1, "py"}, {W + 1, "px"}}, - [&](const VarHandle& py, const VarHandle& px) { return px * py; }); - Tensor c = Compute( - "cons", - {{H, "cy"}, {W, "cx"}}, - [&](const VarHandle& y, const VarHandle& x) { + "prod", {H + 1, W + 1}, [&](const VarHandle& py, const VarHandle& px) { + return px * py; + }); + Tensor c = + Compute("cons", {H, W}, [&](const VarHandle& y, const VarHandle& x) { return p.load(y, x) + p.load(y + 1, x) + p.load(y, x + 1) + p.load(y + 1, x + 1); }); @@ -1973,10 +1947,10 @@ TEST(LoopNest, LoopNestComputeAt_2) { // Check the IR we produced checkIR(s, R"IR( # CHECK: Allocate(temp); // dtype=int, dims=[2, W + 1] -# CHECK: for (int cy = 0; cy < H; cy++) +# CHECK: for (int i_2 = 0; i_2 < H; i_2++) # CHECK: for # CHECK: for -# CHECK: for (int cx = 0; cx < W; cx++) +# CHECK: for (int i_3 = 0; i_3 < W; i_3++) # CHECK-NOT: prod[ # CHECK: cons[ # CHECK: Free(temp))IR"); @@ -1999,8 +1973,8 @@ TEST(LoopNest, LoopNestComputeAt_2) { // Check the IR we produced checkIR(s, R"IR( # CHECK: Allocate(temp); // dtype=int, dims=[2, 2] -# CHECK: for (int cy = 0; cy < H; cy++) -# CHECK: for (int cx = 0; cx < W; cx++) +# CHECK: for (int i_2 = 0; i_2 < H; i_2++) +# CHECK: for (int i_3 = 0; i_3 < W; i_3++) # CHECK: for # CHECK: for # CHECK-NOT: prod[ @@ -2029,23 +2003,19 @@ TEST(LoopNest, LoopNestComputeAt_3) { VarHandle W("W", kInt); VarHandle H("H", kInt); Tensor A = Compute( - "A", - {{H + 1, "ay"}, {W + 1, "ax"}}, - [&](const VarHandle& ay, const VarHandle& ax) { return ax * ay; }); + "A", {H + 1, W + 1}, [&](const VarHandle& ay, const VarHandle& ax) { + return ax * ay; + }); Tensor B = Compute( - "B", - {{H + 1, "by"}, {W + 1, "bx"}}, - [&](const VarHandle& by, const VarHandle& bx) { return A.load(by, bx); }); - Tensor C = Compute( - "C", - {{H, "cy"}, {W, "cx"}}, - [&](const VarHandle& cy, const VarHandle& cx) { + "B", {H + 1, W + 1}, [&](const VarHandle& by, const VarHandle& bx) { + return A.load(by, bx); + }); + Tensor C = + Compute("C", {H, W}, [&](const VarHandle& cy, const VarHandle& cx) { return B.load(cy, cx + 1); }); - Tensor D = Compute( - "D", - {{H, "dy"}, {W, "dx"}}, - [&](const VarHandle& dy, const VarHandle& dx) { + Tensor D = + Compute("D", {H, W}, [&](const VarHandle& dy, const VarHandle& dx) { return A.load(dy + 1, dx) + C.load(dy, dx); }); @@ -2069,17 +2039,17 @@ TEST(LoopNest, LoopNestComputeAt_3) { // Check the IR we produced checkIR(s, R"IR( # CHECK: Allocate(temp); // dtype=int, dims=[1, W] -# CHECK: for (int ay = 0; ay < H + 1; ay++) -# CHECK: for (int ax = 0; ax < W + 1; ax++) +# CHECK: for (int i = 0; i < H + 1; i++) +# CHECK: for (int i_1 = 0; i_1 < W + 1; i_1++) # CHECK: A[ -# CHECK: for (int by = 0; by < H + 1; by++) -# CHECK: for (int bx = 0; bx < W + 1; bx++) +# CHECK: for (int i_2 = 0; i_2 < H + 1; i_2++) +# CHECK: for (int i_3 = 0; i_3 < W + 1; i_3++) # CHECK: B[ -# CHECK: for (int cy = 0; cy < H; cy++) -# CHECK: for (int cx = 0; cx < W; cx++) +# CHECK: for (int i_4 = 0; i_4 < H; i_4++) +# CHECK: for (int i_5 = 0; i_5 < W; i_5++) # CHECK: C[ -# CHECK: for (int dy = 0; dy < H; dy++) -# CHECK: for (int dx = 0; dx < W; dx++) +# CHECK: for (int i_6 = 0; i_6 < H; i_6++) +# CHECK: for (int i_7 = 0; i_7 < W; i_7++) # CHECK-NOT: A[)IR"); // Now check that the loop still produces the correct result. 
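Reduce gets the matching signature change in the reduction tests a little further down (Reduce2dComputeAt, DISABLED_Conv1d_NH, FlattenReductionLoopNestFromTensor): output extents and reduction extents are now plain lists such as {H, W} and {2, 2}. A minimal hypothetical sketch, assuming the file's existing includes and not part of the patch:

// Illustrative only: row-sum with the dims-only Reduce signature.
TEST(LoopNest, SketchDimsOnlyReduce) {
  const int M = 4;
  const int N = 6;
  BufHandle b("b", {M, N}, kFloat);

  // Output dims {M}, reduction dims {N}; no axis-name strings anywhere.
  Tensor c = Reduce("sum", {M}, Sum(), b, {N});

  LoopNest loop({c});
  loop.prepareForCodegen();
  StmtPtr s = IRSimplifier::simplify(loop.root_stmt());
  SimpleIREvaluator cg(s, {b, c});
}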
@@ -2100,17 +2070,17 @@ TEST(LoopNest, LoopNestComputeAt_3) { // Check the IR we produced checkIR(s, R"IR( # CHECK: Allocate(temp); // dtype=int, dims=[1, 1] -# CHECK: for (int ay = 0; ay < H + 1; ay++) -# CHECK: for (int ax = 0; ax < W + 1; ax++) +# CHECK: for (int i = 0; i < H + 1; i++) +# CHECK: for (int i_1 = 0; i_1 < W + 1; i_1++) # CHECK: A[ -# CHECK: for (int by = 0; by < H + 1; by++) -# CHECK: for (int bx = 0; bx < W + 1; bx++) +# CHECK: for (int i_2 = 0; i_2 < H + 1; i_2++) +# CHECK: for (int i_3 = 0; i_3 < W + 1; i_3++) # CHECK: B[ -# CHECK: for (int cy = 0; cy < H; cy++) -# CHECK: for (int cx = 0; cx < W; cx++) +# CHECK: for (int i_4 = 0; i_4 < H; i_4++) +# CHECK: for (int i_5 = 0; i_5 < W; i_5++) # CHECK: C[ -# CHECK: for (int dy = 0; dy < H; dy++) -# CHECK: for (int dx = 0; dx < W; dx++) +# CHECK: for (int i_6 = 0; i_6 < H; i_6++) +# CHECK: for (int i_7 = 0; i_7 < W; i_7++) # CHECK-NOT: A[)IR"); // Now check that the loop still produces the correct result. @@ -2128,16 +2098,14 @@ TEST(LoopNest, Reduce2dComputeAt) { VarHandle W("W", kInt); VarHandle H("H", kInt); - Tensor p = - Compute("prod", {{H + 1, "py"}, {W + 1, "px"}}, [&](Axis py, Axis px) { - return px * py; - }); + Tensor p = Compute( + "prod", {H + 1, W + 1}, [&](Axis py, Axis px) { return px * py; }); Tensor c = Reduce( "cons", - {{H, "cy"}, {W, "cx"}}, + {H, W}, Sum(), [&](Axis y, Axis x, Axis r, Axis s) { return p.load(y + r, x + s); }, - {{2, "r"}, {2, "s"}}); + {2, 2}); std::vector c_ref(kW * kH, 0); for (int y = 0; y < kH; y++) { @@ -2147,17 +2115,17 @@ TEST(LoopNest, Reduce2dComputeAt) { } LoopNest orig_loopnest({c}, {p, c}); checkIR(orig_loopnest.root_stmt(), R"IR( -# CHECK: for (int py = 0; py < H + 1; py++) { -# CHECK: for (int px = 0; px < W + 1; px++) { -# CHECK: prod[py, px] = px * py; +# CHECK: for (int i = 0; i < H + 1; i++) { +# CHECK: for (int i_1 = 0; i_1 < W + 1; i_1++) { +# CHECK: prod[i, i_1] = i_1 * i; # CHECK: } # CHECK: } -# CHECK: for (int cy = 0; cy < H; cy++) { -# CHECK: for (int cx = 0; cx < W; cx++) { -# CHECK: cons[cy, cx] = int(0); -# CHECK: for (int r = 0; r < 2; r++) { -# CHECK: for (int s = 0; s < 2; s++) { -# CHECK: cons[cy, cx] = ReduceOp((cons[cy, cx]) + (prod[cy + r, cx + s]), reduce_args={r, s}); +# CHECK: for (int i_2 = 0; i_2 < H; i_2++) { +# CHECK: for (int i_3 = 0; i_3 < W; i_3++) { +# CHECK: cons[i_2, i_3] = int(0); +# CHECK: for (int i_4 = 0; i_4 < 2; i_4++) { +# CHECK: for (int i_5 = 0; i_5 < 2; i_5++) { +# CHECK: cons[i_2, i_3] = ReduceOp((cons[i_2, i_3]) + (prod[i_2 + i_4, i_3 + i_5]), reduce_args={i_4, i_5}); # CHECK: } # CHECK: } # CHECK: } @@ -2177,17 +2145,17 @@ TEST(LoopNest, Reduce2dComputeAt) { SimpleIREvaluator cg(l.root_stmt(), {c, W, H}); checkIR(cg.stmt(), R"IR( # CHECK: Allocate(temp); // dtype=int, dims=[2, W + 1] -# CHECK: for (int cy = 0; cy < H; cy++) { +# CHECK: for (int i = 0; i < H; i++) { # CHECK: for (int idx0 = 0; idx0 < 2; idx0++) { # CHECK: for (int idx1 = 0; idx1 < W + 1; idx1++) { -# CHECK: temp[(0 + idx0 * (1 * (W + 1))) + idx1 * 1] = (idx0 + cy) * (idx1 + 0); +# CHECK: temp[(0 + idx0 * (1 * (W + 1))) + idx1 * 1] = (idx0 + i) * (idx1 + 0); # CHECK: } # CHECK: } -# CHECK: for (int cx = 0; cx < W; cx++) { -# CHECK: cons[(0 + cy * (1 * W)) + cx * 1] = int(0); -# CHECK: for (int r = 0; r < 2; r++) { -# CHECK: for (int s = 0; s < 2; s++) { -# CHECK: cons[(0 + cy * (1 * W)) + cx * 1] = (cons[(0 + cy * (1 * W)) + cx * 1]) + (temp[(0 + r * (1 * (W + 1))) + (cx + s) * 1]); +# CHECK: for (int i_1 = 0; i_1 < W; i_1++) { +# CHECK: cons[(0 + i * (1 * W)) + 
i_1 * 1] = int(0); +# CHECK: for (int i_2 = 0; i_2 < 2; i_2++) { +# CHECK: for (int i_3 = 0; i_3 < 2; i_3++) { +# CHECK: cons[(0 + i * (1 * W)) + i_1 * 1] = (cons[(0 + i * (1 * W)) + i_1 * 1]) + (temp[(0 + i_2 * (1 * (W + 1))) + (i_1 + i_3) * 1]); # CHECK: } # CHECK: } # CHECK: } @@ -2211,17 +2179,17 @@ TEST(LoopNest, Reduce2dComputeAt) { SimpleIREvaluator cg(l.root_stmt(), {c, W, H}); checkIR(cg.stmt(), R"IR( # CHECK: Allocate(temp); // dtype=int, dims=[2, 2] -# CHECK: for (int cy = 0; cy < H; cy++) { -# CHECK: for (int cx = 0; cx < W; cx++) { +# CHECK: for (int i = 0; i < H; i++) { +# CHECK: for (int i_1 = 0; i_1 < W; i_1++) { # CHECK: for (int idx0 = 0; idx0 < 2; idx0++) { # CHECK: for (int idx1 = 0; idx1 < 2; idx1++) { -# CHECK: temp[(0 + idx0 * (1 * 2)) + idx1 * 1] = (cy + idx0) * (cx + idx1); +# CHECK: temp[(0 + idx0 * (1 * 2)) + idx1 * 1] = (i + idx0) * (i_1 + idx1); # CHECK: } # CHECK: } -# CHECK: cons[(0 + cy * (1 * W)) + cx * 1] = 0; -# CHECK: for (int r = 0; r < 2; r++) { -# CHECK: for (int s = 0; s < 2; s++) { -# CHECK: cons[(0 + cy * (1 * W)) + cx * 1] = (cons[(0 + cy * (1 * W)) + cx * 1]) + (temp[(0 + r * (1 * 2)) + s * 1]); +# CHECK: cons[(0 + i * (1 * W)) + i_1 * 1] = 0; +# CHECK: for (int i_2 = 0; i_2 < 2; i_2++) { +# CHECK: for (int i_3 = 0; i_3 < 2; i_3++) { +# CHECK: cons[(0 + i * (1 * W)) + i_1 * 1] = (cons[(0 + i * (1 * W)) + i_1 * 1]) + (temp[(0 + i_2 * (1 * 2)) + i_3 * 1]); # CHECK: } # CHECK: } # CHECK: } @@ -2247,18 +2215,17 @@ TEST(LoopNest, DISABLED_Conv1d_NH) { int Pad = 1; BufHandle IP("input", {H}, kFloat); - Tensor A = - Compute("A", {{N, "np"}, {H + 2 * Pad, "hp"}}, [&](Axis n, Axis h) { - auto cond = CompareSelect::make(h, Pad, 1, 0, kLT); - cond = CompareSelect::make(h, H + Pad, 1, cond, kGE); - return ifThenElse(cond, 0.f, IP.load(n, h - Pad)); - }); + Tensor A = Compute("A", {N, H + 2 * Pad}, [&](Axis n, Axis h) { + auto cond = CompareSelect::make(h, Pad, 1, 0, kLT); + cond = CompareSelect::make(h, H + Pad, 1, cond, kGE); + return ifThenElse(cond, 0.f, IP.load(n, h - Pad)); + }); Tensor B = Reduce( "B", - {{N, "n"}, {H, "h"}}, + {N, H}, Sum(), [&](Axis n, Axis h, Axis r) { return A.load(n, h + r); }, - {{R, "r"}}); + {R}); LoopNest l({B}); checkIR(l.root_stmt(), R"IR( # CHECK: for (int np = 0; np < 4; np++) { @@ -2333,12 +2300,12 @@ class LoopOrderHelper : public IRVisitor { }; TEST(LoopNest, LoopNestReorderAxis1) { - Tensor tensor = Compute( - "f", {{2, "x"}, {3, "y"}}, [](const VarHandle& x, const VarHandle& y) { + Tensor tensor = + Compute("f", {2, 3}, [](const VarHandle& x, const VarHandle& y) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y; }); LoopNest l({tensor}); - StmtPtr stmt1 = Stmt::clone(l.root_stmt()); + StmtPtr stmt1 = LoopNest::sanitizeNames(Stmt::clone(l.root_stmt())); std::vector stmt1_output(6, 0); SimpleIREvaluator cg(stmt1, {tensor}); @@ -2346,15 +2313,15 @@ TEST(LoopNest, LoopNestReorderAxis1) { auto loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::reorderAxis(loops[0], loops[1]); - StmtPtr stmt2 = Stmt::clone(l.root_stmt()); + StmtPtr stmt2 = LoopNest::sanitizeNames(Stmt::clone(l.root_stmt())); ASSERT_NE(stmt1, stmt2); LoopOrderHelper loopOrderHelper; std::string order1 = loopOrderHelper.getOrder(stmt1); std::string order2 = loopOrderHelper.getOrder(stmt2); - ASSERT_EQ(order1, "x,y,"); - ASSERT_EQ(order2, "y,x,"); + ASSERT_EQ(order1, "j,i,"); + ASSERT_EQ(order2, "i,j,"); std::vector stmt2_output(6, 0); SimpleIREvaluator cg2(stmt2, {tensor}); @@ -2383,7 +2350,7 @@ TEST(LoopNest, LoopNestReorderAxis1) { 
TEST(LoopNest, LoopNestReorderPartialAxes) { Tensor tensor = Compute( "f", - {{2, "x"}, {3, "y"}, {4, "z"}}, + {2, 3, 4}, [](const VarHandle& x, const VarHandle& y, const VarHandle& z) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y + cast(z) * z; @@ -2391,8 +2358,8 @@ TEST(LoopNest, LoopNestReorderPartialAxes) { LoopNest l({tensor}); LoopOrderHelper loopOrderHelper; - StmtPtr stmt1 = Stmt::clone(l.root_stmt()); - ASSERT_EQ(loopOrderHelper.getOrder(stmt1), "x,y,z,"); + StmtPtr stmt1 = LoopNest::sanitizeNames(Stmt::clone(l.root_stmt())); + ASSERT_EQ(loopOrderHelper.getOrder(stmt1), "i,j,k,"); std::vector stmt1_output(24, 0); SimpleIREvaluator cg(stmt1, {tensor}); @@ -2400,7 +2367,7 @@ TEST(LoopNest, LoopNestReorderPartialAxes) { auto loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::reorderAxis(loops[0], loops[1]); - ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "y,x,z,"); + ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "j,i,k,"); StmtPtr stmt2 = Stmt::clone(l.root_stmt()); @@ -2414,7 +2381,7 @@ TEST(LoopNest, LoopNestReorderPartialAxes) { loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::reorderAxis(loops[1], loops[2]); - ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "y,z,x,"); + ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "j,k,i,"); StmtPtr stmt3 = Stmt::clone(l.root_stmt()); @@ -2430,7 +2397,7 @@ TEST(LoopNest, LoopNestReorderPartialAxes) { TEST(LoopNest, LoopNestReorderInternalAxis) { Tensor tensor = Compute( "f", - {{1, "w"}, {2, "x"}, {3, "y"}, {4, "z"}}, + {1, 2, 3, 4}, [](const VarHandle& w, const VarHandle& x, const VarHandle& y, @@ -2441,8 +2408,8 @@ TEST(LoopNest, LoopNestReorderInternalAxis) { LoopNest l({tensor}); LoopOrderHelper loopOrderHelper; - StmtPtr stmt1 = Stmt::clone(l.root_stmt()); - ASSERT_EQ(loopOrderHelper.getOrder(stmt1), "w,x,y,z,"); + StmtPtr stmt1 = LoopNest::sanitizeNames(Stmt::clone(l.root_stmt())); + ASSERT_EQ(loopOrderHelper.getOrder(stmt1), "i,j,k,l,"); std::vector stmt1_output(24, 0); SimpleIREvaluator cg(stmt1, {tensor}); @@ -2450,7 +2417,7 @@ TEST(LoopNest, LoopNestReorderInternalAxis) { auto loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::reorderAxis(loops[2], loops[1]); - ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "w,y,x,z,"); + ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "i,k,j,l,"); StmtPtr stmt2 = l.root_stmt(); @@ -2466,7 +2433,7 @@ TEST(LoopNest, LoopNestReorderInternalAxis) { TEST(LoopNest, LoopNestReorderEnclosingAxis) { Tensor tensor = Compute( "f", - {{1, "w"}, {2, "x"}, {3, "y"}, {4, "z"}}, + {1, 2, 3, 4}, [](const VarHandle& w, const VarHandle& x, const VarHandle& y, @@ -2477,7 +2444,7 @@ TEST(LoopNest, LoopNestReorderEnclosingAxis) { LoopNest l({tensor}); LoopOrderHelper loopOrderHelper; - StmtPtr stmt1 = Stmt::clone(l.root_stmt()); + StmtPtr stmt1 = LoopNest::sanitizeNames(Stmt::clone(l.root_stmt())); std::vector stmt1_output(24, 0); SimpleIREvaluator cg(stmt1, {tensor}); @@ -2485,7 +2452,7 @@ TEST(LoopNest, LoopNestReorderEnclosingAxis) { auto loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::reorderAxis(loops[0], loops[3]); - ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "z,x,y,w,"); + ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "l,j,k,i,"); StmtPtr stmt2 = l.root_stmt(); @@ -2499,8 +2466,8 @@ TEST(LoopNest, LoopNestReorderEnclosingAxis) { } TEST(LoopNest, LoopNestReorderSameAxis) { - Tensor tensor = Compute( - "f", {{2, "x"}, {3, "y"}}, [](const VarHandle& x, const VarHandle& y) { + Tensor tensor = + 
Compute("f", {2, 3}, [](const VarHandle& x, const VarHandle& y) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y; }); LoopNest l({tensor}); @@ -2518,18 +2485,18 @@ TEST(LoopNest, LoopNestReorderSameAxis) { TEST(LoopNest, LoopNestReorderExtraStatements) { /* We're going for a structure like this: - * for x in ... + * for i in ... * Stmt 1 - * for y in ... + * for j in ... * Stmt 2 - * for z in ... + * for k in ... * Stmt 3 * Stmt 4 */ Tensor tensor = Compute( "f", - {{2, "x"}, {3, "y"}, {4, "z"}}, + {2, 3, 4}, [](const VarHandle& x, const VarHandle& y, const VarHandle& z) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y + cast(z) * z; @@ -2542,15 +2509,15 @@ TEST(LoopNest, LoopNestReorderExtraStatements) { VarHandle i = VarHandle(loops[0]->var()); - StmtPtr store_1 = Store::make(extra, {i, 0}, ExprHandle(1.f)); - StmtPtr store_2 = Store::make(extra, {i, 1}, ExprHandle(2.f)); + StmtPtr store_1 = Store::make(extra, {i, 0}, 1.f); + StmtPtr store_2 = Store::make(extra, {i, 1}, 2.f); // stmt 3 is the Function body. - StmtPtr store_3 = Store::make(extra, {i, 2}, ExprHandle(4.f)); + StmtPtr store_3 = Store::make(extra, {i, 2}, 4.f); loops[0]->body()->prepend_stmt(store_1); loops[1]->body()->prepend_stmt(store_2); loops[1]->body()->append_stmt(store_3); - StmtPtr stmt1 = Stmt::clone(l.root_stmt()); + StmtPtr stmt1 = LoopNest::sanitizeNames(Stmt::clone(l.root_stmt())); std::vector extra1(6, 0); std::vector res1(24, 0); @@ -2559,14 +2526,14 @@ TEST(LoopNest, LoopNestReorderExtraStatements) { /* Then we reorder loop y and z, we want it to look like: * - * for x in ... + * for i in ... * Stmt 1 - * for y in ... + * for j in ... * Stmt 2 - * for z in ... - * for y in ... + * for j_1 in ... + * for k in ... * Stmt 3 - * for y in ... + * for j_2 in ... * Stmt 4 * * We need extra loops because we don't have dependency info about stmt 3 @@ -2575,19 +2542,19 @@ TEST(LoopNest, LoopNestReorderExtraStatements) { */ LoopNest::reorderAxis(loops[1], loops[2]); - StmtPtr stmt2 = Stmt::clone(l.root_stmt()); + StmtPtr stmt2 = LoopNest::sanitizeNames(Stmt::clone(l.root_stmt())); // Check the IR we produced checkIR(stmt2, R"IR( -# CHECK: for (int x -# CHECK: res[x, 0] = 1 -# CHECK: for (int y -# CHECK: res[x, 1] = 2 -# CHECK: for (int z -# CHECK: for (int y +# CHECK: for +# CHECK: res[i, 0] = 1 +# CHECK: for +# CHECK: res[i, 1] = 2 +# CHECK: for +# CHECK: for # CHECK: f[ -# CHECK: for (int y -# CHECK: res[x, 2] = 4 +# CHECK: for +# CHECK: res[i, 2] = 4 )IR"); std::vector extra2(6, 0); @@ -2623,21 +2590,21 @@ TEST(LoopNest, LoopNestReorderExtraStatements) { */ loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::reorderAxis(loops[0], loops[2]); - StmtPtr stmt3 = Stmt::clone(l.root_stmt()); + StmtPtr stmt3 = LoopNest::sanitizeNames(Stmt::clone(l.root_stmt())); // Check the IR we produced checkIR(stmt3, R"IR( -# CHECK: for (int x -# CHECK: res[x, 0] = 1 -# CHECK: for (int y -# CHECK: res[x, 1] = 2 -# CHECK: for (int y -# CHECK: for (int z -# CHECK: for (int x +# CHECK: for +# CHECK: res[i, 0] = 1 +# CHECK: for +# CHECK: res[i, 1] = 2 +# CHECK: for +# CHECK: for +# CHECK: for # CHECK: f[ -# CHECK: for (int x -# CHECK: for (int y -# CHECK: res[x, 2] = 4 +# CHECK: for +# CHECK: for +# CHECK: res[i_2, 2] = 4 )IR"); std::vector extra3(6, 0); @@ -2659,9 +2626,7 @@ void LoopNestReorderTestHelper( int index1, int index2) { Tensor c = Compute( - "5d", - {{2, "a"}, {3, "b"}, {2, "c"}, {3, "d"}, {2, "e"}}, - [](const std::vector&) { return -1; }); + "5d", {2, 3, 2, 3, 2}, [](const std::vector&) { return -1; 
}); LoopNest l({c}); BufHandle extra("extra", {5}, kInt); @@ -2783,34 +2748,26 @@ TEST(LoopNest, LoopNestReorderInternalLoopNest) { Tensor x = Compute( "x", - {{M, "m1"}, {N, "n1"}, {K, "k1"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) * b_buf.load(n, k); }); Tensor y = Compute( "y", - {{M, "m2"}, {N, "n2"}, {K, "k2"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return c_buf.load(m, n) * d_buf.load(m, k) + x.load(m, n, k); }); Tensor z = Compute( "z", - {{M, "m3"}, {N, "n3"}, {K, "k3"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return x.load(m, n, k) + y.load(m, n, k); }); LoopNest l({z}, {x, y, z}); - ForPtr a = nullptr; - ForPtr b = nullptr; - auto fors = NodeFinder::find(l.root_stmt()); - for (auto f : fors) { - if (f->var()->name_hint() == "m2") { - a = f; - } else if (f->var()->name_hint() == "k2") { - b = f; - } - } + ForPtr a = l.getAllLoopNestsWritingToBuf(y.buf())[0][2]; + ForPtr b = l.getAllLoopNestsWritingToBuf(y.buf())[0][0]; LoopNest::reorderAxis(a, b); l.prepareForCodegen(); @@ -2819,15 +2776,15 @@ TEST(LoopNest, LoopNestReorderInternalLoopNest) { // Check the IR we produced has the 3 nests in the right order, but k and m // swapped in the middle. checkIR(stmt, R"IR( -# CHECK: for (int m1 -# CHECK: for (int n1 -# CHECK: for (int k1 -# CHECK: for (int k2 -# CHECK: for (int n2 -# CHECK: for (int m2 -# CHECK: for (int m3 -# CHECK: for (int n3 -# CHECK: for (int k3)IR"); +# CHECK: < 4 +# CHECK: < 5 +# CHECK: < 6 +# CHECK: < 6 +# CHECK: < 5 +# CHECK: < 4 +# CHECK: < 4 +# CHECK: < 5 +# CHECK: < 6)IR"); { PaddedBuffer a_v(M, N); @@ -2873,8 +2830,8 @@ TEST(LoopNest, LoopNestReorderInternalLoopNest) { } TEST(LoopNest, OuterLoopVectorization) { - Tensor tensor = Compute( - "f", {{8, "X"}, {8, "y"}}, [](const VarHandle& x, const VarHandle& y) { + Tensor tensor = + Compute("f", {8, 8}, [](const VarHandle& x, const VarHandle& y) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y; }); LoopNest l({tensor}); @@ -2924,8 +2881,8 @@ namespace { std::string constantUpperBoundLoopIR(int upper_bound_val) { ExprHandle upper_bound(upper_bound_val); - Tensor A = Compute( - "A", {{upper_bound, "x"}}, [&](const VarHandle& x) { return x * 2; }); + Tensor A = + Compute("A", {upper_bound}, [&](const VarHandle& x) { return x * 2; }); LoopNest l({A}); std::vector loops = l.getAllLoopNestsWritingToBuf(A.buf())[0]; StmtPtr unrolled = nullptr; @@ -2953,21 +2910,21 @@ TEST(LoopNest, UnrollOuter) { ExprHandle inner_bound(4); Tensor A = Compute( "A", - {{outer_bound, "x"}, {inner_bound, "y"}}, + {outer_bound, inner_bound}, [&](const VarHandle& x, const VarHandle& y) { return x + y; }); LoopNest l({A}); std::vector loops = l.getAllLoopNestsWritingToBuf(A.buf())[0]; StmtPtr unrolled = nullptr; LoopNest::fullUnroll(loops[0], &unrolled); checkIR(unrolled, R"IR( -# CHECK: for (int y = 0; y < 4; y++) { -# CHECK: A[0, y] = y; +# CHECK: for (int i = 0; i < 4; i++) { +# CHECK: A[0, i] = i; # CHECK: } -# CHECK: for (int y = 0; y < 4; y++) { -# CHECK: A[1, y] = y + 1; +# CHECK: for (int i = 0; i < 4; i++) { +# CHECK: A[1, i] = i + 1; # CHECK: } -# CHECK: for (int y = 0; y < 4; y++) { -# CHECK: A[2, y] = y + 2; +# CHECK: for (int i = 0; i < 4; i++) { +# CHECK: A[2, i] = i + 2; # CHECK: })IR"); } @@ -2976,7 +2933,7 @@ TEST(LoopNest, UnrollInner) { ExprHandle inner_bound(4); Tensor A = Compute( "A", - {{outer_bound, "x"}, {inner_bound, "y"}}, + {outer_bound, inner_bound}, [&](const 
VarHandle& x, const VarHandle& y) { return x + y; }); LoopNest l({A}); std::vector loops = l.getAllLoopNestsWritingToBuf(A.buf())[0]; @@ -2984,11 +2941,11 @@ TEST(LoopNest, UnrollInner) { LoopNest::fullUnroll( static_to(loops[0]->body()->stmts().front()), &unrolled); checkIR(loops[0], R"IR( -# CHECK: for (int x = 0; x < 3; x++) { -# CHECK: A[x, 0] = x; -# CHECK: A[x, 1] = x + 1; -# CHECK: A[x, 2] = x + 2; -# CHECK: A[x, 3] = x + 3; +# CHECK: for (int i = 0; i < 3; i++) { +# CHECK: A[i, 0] = i; +# CHECK: A[i, 1] = i + 1; +# CHECK: A[i, 2] = i + 2; +# CHECK: A[i, 3] = i + 3; # CHECK: })IR"); } @@ -3174,8 +3131,8 @@ TEST(LoopNest, UnrollEmpty) { TEST(LoopNest, NoUnroll) { VarHandle upper_bound("N", kInt); - Tensor A = Compute( - "A", {{upper_bound, "x"}}, [&](const VarHandle& x) { return x * 2; }); + Tensor A = + Compute("A", {upper_bound}, [&](const VarHandle& x) { return x * 2; }); LoopNest l({A}); std::vector loops = l.getAllLoopNestsWritingToBuf(A.buf())[0]; StmtPtr unrolled = nullptr; @@ -3439,8 +3396,7 @@ TEST(LoopNest, NormalizeAndSplitWithTail) { // Create a dummy tensor to construct LoopNest. ExprHandle n(100); BufHandle a("a", {n}, kFloat); - Tensor b = - Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); }); + Tensor b = Compute("b", {n}, [&](const VarHandle& i) { return a.load(i); }); LoopNest l({b}); // Input IR: @@ -3486,8 +3442,7 @@ TEST(LoopNest, NotNormalizeAndSplitWithTail) { // Create a dummy tensor to construct LoopNest. ExprHandle n(100); BufHandle a("a", {n}, kFloat); - Tensor b = - Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); }); + Tensor b = Compute("b", {n}, [&](const VarHandle& i) { return a.load(i); }); LoopNest l({b}); // Input IR: @@ -3760,7 +3715,7 @@ TEST(LoopNest, FlattenReductionLoopNestFromTensor) { VarHandle m("m", kInt); VarHandle n("n", kInt); BufHandle b("b", {m, n}, kFloat); - Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}}); + Tensor c = Reduce("sum", {M}, Sum(), b, {N}); LoopNest loop({c}); HashProvider hasher; auto hash_before = hasher.hash(loop.root_stmt()); @@ -3815,28 +3770,26 @@ TEST(LoopNest, DetectInlineRankMismatch) { const int kTotalSize = 8; BufHandle a_buf("A", {ExprHandle(kTotalSize)}, kFloat); - Tensor a = Compute("a", {{kTotalSize, "i"}}, [&](const VarHandle& i) { - return a_buf.load(i); - }); + Tensor a = Compute( + "a", {kTotalSize}, [&](const VarHandle& i) { return a_buf.load(i); }); Tensor reshape = Compute( "reshape", - {{kTotalSize / 2, "i"}, {2, "j"}}, + {kTotalSize / 2, 2}, [&](const VarHandle& i, const VarHandle& j) { return a.load(i, j); }); LoopNest l({reshape}, {a, reshape}); ASSERT_FALSE(l.computeInline(l.getLoopBodyFor(a))); } TEST(LoopNest, CacheReadsSimple) { - Tensor A = Compute( - "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { - return i * j; - }); - Tensor B = Compute( - "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { + Tensor A = Compute("A", {64, 64}, [](const VarHandle& i, const VarHandle& j) { + return i * j; + }); + Tensor B = + Compute("B", {20, 10}, [&](const VarHandle& i, const VarHandle& j) { return A.load(i + 30, j + 3); }); - Tensor C = Compute( - "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { + Tensor C = + Compute("C", {20, 10}, [&](const VarHandle& i, const VarHandle& j) { return A.load(i + 10, j + 20) + A.load(i + 30, j + 40); }); @@ -3845,7 +3798,8 @@ TEST(LoopNest, CacheReadsSimple) { LoopNest::cacheAccesses(A.buf(), "A_local", j_loop); l.prepareForCodegen(); - StmtPtr 
result = IRSimplifier::simplify(l.root_stmt()); + StmtPtr result = + LoopNest::sanitizeNames(IRSimplifier::simplify(l.root_stmt())); SimpleIREvaluator cg(result, {B, C}); result = cg.stmt(); @@ -3894,16 +3848,15 @@ TEST(LoopNest, CacheReadsSimple) { } TEST(LoopNest, CacheReadsOuter) { - Tensor A = Compute( - "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { - return i * j; - }); - Tensor B = Compute( - "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { + Tensor A = Compute("A", {64, 64}, [](const VarHandle& i, const VarHandle& j) { + return i * j; + }); + Tensor B = + Compute("B", {20, 10}, [&](const VarHandle& i, const VarHandle& j) { return A.load(i + 30, j + 40) + A.load(i + 31, j + 41); }); - Tensor C = Compute( - "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { + Tensor C = + Compute("C", {20, 10}, [&](const VarHandle& i, const VarHandle& j) { return A.load(i + 10, j + 20) + A.load(i + 30, j + 40); }); @@ -3912,7 +3865,8 @@ TEST(LoopNest, CacheReadsOuter) { LoopNest::cacheAccesses(A.buf(), "A_local", i_loop); l.prepareForCodegen(); - StmtPtr result = IRSimplifier::simplify(l.root_stmt()); + StmtPtr result = + LoopNest::sanitizeNames(IRSimplifier::simplify(l.root_stmt())); SimpleIREvaluator cg(result, {B, C}); result = cg.stmt(); @@ -3941,16 +3895,15 @@ TEST(LoopNest, CacheReadsOuter) { } TEST(LoopNest, CacheReadsInternal) { - Tensor A = Compute( - "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { - return i * j; - }); - Tensor B = Compute( - "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { + Tensor A = Compute("A", {64, 64}, [](const VarHandle& i, const VarHandle& j) { + return i * j; + }); + Tensor B = + Compute("B", {20, 10}, [&](const VarHandle& i, const VarHandle& j) { return A.load(i + 30, j + 40) + A.load(i + 31, j + 41); }); - Tensor C = Compute( - "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { + Tensor C = + Compute("C", {20, 10}, [&](const VarHandle& i, const VarHandle& j) { return A.load(i + 10, j + 20) + A.load(i + 30, j + 40); }); @@ -3958,13 +3911,14 @@ TEST(LoopNest, CacheReadsInternal) { StmtPtr j_loop = l.getAllLoopNestsWritingToBuf(B.buf())[0][1]; LoopNest::cacheAccesses(A.buf(), "A_local", j_loop); l.prepareForCodegen(); - StmtPtr result = IRSimplifier::simplify(l.root_stmt()); + StmtPtr result = + LoopNest::sanitizeNames(IRSimplifier::simplify(l.root_stmt())); SimpleIREvaluator cg(result, {B, C}); result = cg.stmt(); checkIR(result, R"IR( #CHECK: Allocate(A_local); // dtype=int, dims=[2, 11] -#CHECK: A_local[j_1 + 11 * i_2] = +#CHECK: A_local[k + 11 * j_1] = #CHECK: B[j_2 + 10 * i_1] = (A_local[j_2 + 12]) + (A_local[j_2]); )IR"); @@ -3987,17 +3941,16 @@ TEST(LoopNest, CacheReadsInternal) { } TEST(LoopNest, CacheReadsInner) { - Tensor A = Compute( - "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { - return i * j; - }); + Tensor A = Compute("A", {64, 64}, [](const VarHandle& i, const VarHandle& j) { + return i * j; + }); // note im changing the offset of the first arg of the first call to A. 
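// (Shifting that offset changes how much of A a single iteration touches: with
// the loads A.load(i + 34, j + 40) and A.load(i + 30, j + 41) below, one
// (i, j) iteration of B spans 5 rows and 2 columns of A, so the cacheAccesses
// call on B's body allocates A_local with dims=[5, 2], versus the dims=[2, 11]
// cache in CacheReadsInternal, where the whole inner j loop of B is cached.)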
- Tensor B = Compute( - "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { + Tensor B = + Compute("B", {20, 10}, [&](const VarHandle& i, const VarHandle& j) { return A.load(i + 34, j + 40) + A.load(i + 30, j + 41); }); - Tensor C = Compute( - "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { + Tensor C = + Compute("C", {20, 10}, [&](const VarHandle& i, const VarHandle& j) { return A.load(i + 10, j + 20) + A.load(i + 30, j + 40); }); @@ -4005,13 +3958,14 @@ TEST(LoopNest, CacheReadsInner) { StmtPtr body = l.getLoopBodyFor(B); LoopNest::cacheAccesses(A.buf(), "A_local", body); l.prepareForCodegen(); - StmtPtr result = IRSimplifier::simplify(l.root_stmt()); + StmtPtr result = + LoopNest::sanitizeNames(IRSimplifier::simplify(l.root_stmt())); SimpleIREvaluator cg(result, {B, C}); result = cg.stmt(); checkIR(result, R"IR( #CHECK: Allocate(A_local); // dtype=int, dims=[5, 2] -#CHECK: A_local[j_2 + 2 * i_2] = +#CHECK: A_local[l + 2 * k] = #CHECK: B[j_1 + 10 * i_1] = (A_local[1]) + (A_local[8]); )IR"); @@ -4034,16 +3988,15 @@ TEST(LoopNest, CacheReadsInner) { } TEST(LoopNest, CacheWritesSimple) { - Tensor A = Compute( - "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { - return i * j; - }); - Tensor B = Compute( - "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { + Tensor A = Compute("A", {64, 64}, [](const VarHandle& i, const VarHandle& j) { + return i * j; + }); + Tensor B = + Compute("B", {20, 10}, [&](const VarHandle& i, const VarHandle& j) { return A.load(i + 30, j + 40) + A.load(i + 31, j + 41); }); - Tensor C = Compute( - "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { + Tensor C = + Compute("C", {20, 10}, [&](const VarHandle& i, const VarHandle& j) { return A.load(i + 10, j + 20) + A.load(i + 30, j + 40); }); @@ -4052,7 +4005,8 @@ TEST(LoopNest, CacheWritesSimple) { LoopNest::cacheAccesses(A.buf(), "A_local", a_loop); l.prepareForCodegen(); - StmtPtr result = IRSimplifier::simplify(l.root_stmt()); + StmtPtr result = + LoopNest::sanitizeNames(IRSimplifier::simplify(l.root_stmt())); SimpleIREvaluator cg(result, {B, C}); result = cg.stmt(); @@ -4212,13 +4166,13 @@ TEST(LoopNest, InlineConstantIndex) { BufHandle x_buf("a", {1, N, 1}, kFloat); Tensor y = Compute( "f", - {{1, "m"}, {N, "n"}, {1, "o"}}, + {1, N, 1}, [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& o) { return x_buf.load(m, n, o); }); Tensor z = Compute( "f", - {{1, "m"}, {N, "n"}, {1, "o"}}, + {1, N, 1}, [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& o) { return y.load(m, n, o); }); @@ -4244,10 +4198,9 @@ TEST(LoopNest, CompoundTensorUsed) { BlockPtr body = Block::make({outer_for1, outer_for2}); Tensor A = Tensor(a_buf.node(), body); - Tensor B = Compute( - "B", {{10, "i"}, {3, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return A.load(i, j + 1) + A.load(i, j + 2); - }); + Tensor B = Compute("B", {10, 3}, [&](const VarHandle& i, const VarHandle& j) { + return A.load(i, j + 1) + A.load(i, j + 2); + }); LoopNest l({B}, {A, B}); ASSERT_FALSE(l.computeInline(A.buf())); @@ -4485,7 +4438,7 @@ TEST(LoopNest, OptimizeConditionalsMultipleStoresInOneLoop) { R"IR( # CHECK: for (int i = 0; i < 5 # CHECK-NEXT: A[i] = B[i] -# CHECK-NEXT: B[i] = IfThenElse(i<30 ? 1 : 0, C[i], D[i]) +# CHECK-NEXT: B[i] = C[i] # CHECK: for (int i = 0; i < 45 # CHECK-NEXT: A[i + 5] = C[i] # CHECK-NEXT: B[i + 5] = IfThenElse(i + 5<30 ? 
1 : 0, C[i + 5], D[i + 5]) @@ -4813,11 +4766,11 @@ static std::pair colReduce(int M, int N) { BufHandle a("a", {M, N}, kFloat); Tensor t = Reduce( "b", - {{N, "n"}}, + {N}, Sum(), [&](const VarHandle& n, const VarHandle& m) { return a.load(m, n); }, - {{M, "m"}}); - return {a, t}; + {M}); + return {a, Tensor(t.buf(), LoopNest::sanitizeNames(t.stmt()))}; } static StmtPtr splitTailReorder(Tensor b) { @@ -4827,23 +4780,23 @@ static StmtPtr splitTailReorder(Tensor b) { nest.splitWithTail(loops[0], kVectorWidth); // Now the loopnests will look like: // - // for (int n_outer = 0; ... - // for (int n_inner = 0; ... - // b[n_outer * 8 + n_inner] = float(0); - // for (int m = 0; ... - // b[n_outer * 8 + n_inner] = ReduceOp(...); + // for (int i_outer = 0; ... + // for (int i_inner = 0; ... + // b[i_outer * 8 + i_inner] = float(0); + // for (int j = 0; ... + // b[i_outer * 8 + i_inner] = ReduceOp(...); // - // for (int n_tail = 0; ... - // b[n_tail + ((100 - 0) / 8) * 8] = float(0); - // for (int m = 0; ... - // b[n_tail + ((100 - 0) / 8) * 8] = ReduceOp(...); + // for (int i_tail = 0; ... + // b[i_tail + ((100 - 0) / 8) * 8] = float(0); + // for (int j = 0; ... + // b[i_tail + ((100 - 0) / 8) * 8] = ReduceOp(...); // // Since there are 4 writes to b, we will get 4 loopnests from the // call to `getAllLoopNestsWritingToBuf` below. // - // Write #2: "b[n_outer * 8 + n_inner] = ReduceOp(...)" - // Loopnest #2: {n_outer, n_inner, m}; - // We will have to reorder n_inner and m. + // Write #2: "b[i_outer * 8 + i_inner] = ReduceOp(...)" + // Loopnest #2: {i_outer, i_inner, j}; + // We will have to reorder i_inner and j. auto loopnests = nest.getAllLoopNestsWritingToBuf(b.buf()); LoopNest::reorderAxis(loopnests[1][1], loopnests[1][2]); nest.prepareForCodegen(); @@ -4891,11 +4844,11 @@ TEST(LoopNest, ColReduceSplitTailEvenReorder) { oss << *s; const std::string& verification_pattern = R"IR( -# CHECK: for (int n_outer -# CHECK-NEXT: for (int n_inner +# CHECK: for (int i_outer +# CHECK-NEXT: for (int i_inner # CHECK-NEXT: b[ -# CHECK: for (int m -# CHECK-NEXT: for (int n_inner +# CHECK: for (int j +# CHECK-NEXT: for (int i_inner # CHECK-NEXT: b[ # CHECK-NOT: for ( )IR"; @@ -4913,15 +4866,15 @@ TEST(LoopNest, ColReduceSplitTailUnevenReorder) { oss << *s; const std::string& verification_pattern = R"IR( -# CHECK: for (int n_outer -# CHECK-NEXT: for (int n_inner +# CHECK: for (int i_outer +# CHECK-NEXT: for (int i_inner # CHECK-NEXT: b[ -# CHECK: for (int m -# CHECK-NEXT: for (int n_inner +# CHECK: for (int j +# CHECK-NEXT: for (int i_inner # CHECK-NEXT: b[ -# CHECK: for (int n_tail +# CHECK: for (int i_tail # CHECK-NEXT: b[ -# CHECK-NEXT: for (int m +# CHECK-NEXT: for (int j # CHECK-NEXT: b[ )IR"; torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); @@ -4985,10 +4938,10 @@ TEST(LoopNest, ReorderAxisWithMultipleConds) { TEST(LoopNest, VectorizeUse) { constexpr int N = 8; BufHandle a("a", {N}, kFloat); - Tensor b = Compute( - "b", {{N, "n"}}, [&](const VarHandle& n) { return a.load(n) + 1.0f; }); - Tensor c = Compute( - "c", {{N, "n"}}, [&](const VarHandle& n) { return b.load(n) + 2.0f; }); + Tensor b = + Compute("b", {N}, [&](const VarHandle& n) { return a.load(n) + 1.0f; }); + Tensor c = + Compute("c", {N}, [&](const VarHandle& n) { return b.load(n) + 2.0f; }); LoopNest nest({c}, {b, c}); auto loops = nest.getAllLoopNestsWritingToBuf(b.buf())[0]; ASSERT_TRUE(LoopNest::vectorize(loops[0])); @@ -5007,8 +4960,8 @@ TEST(LoopNest, VectorizeUse) { } const char* int64Loop = R"IR( -# CHECK: for 
(int64_t n = 0ll; n < 12ll; n++) { -# CHECK: b[n] = (a[n]) + 1ll; +# CHECK: for (int64_t i = 0ll; i < 12ll; i++) { +# CHECK: b[i] = (a[i]) + 1ll; # CHECK: } )IR"; @@ -5016,7 +4969,7 @@ TEST(LoopNest, Int64Direct) { constexpr int64_t N = 12; BufHandle a("a", {N}, kLong); BufHandle b("b", {N}, kLong); - VarHandle n("n", kLong); + VarHandle n("i", kLong); StmtPtr s = For::make( n, LongImm::make(0l), N, b.store({n}, a.load({n}) + LongImm::make(1l))); s = IRSimplifier::simplify(s); @@ -5028,7 +4981,7 @@ TEST(LoopNest, Int64Direct) { TEST(LoopNest, Int64Compute) { constexpr int64_t N = 12; BufHandle a("a", {N}, kLong); - Tensor b = Compute("b", {{N, "n"}}, [&](const VarHandle& n) { + Tensor b = Compute("b", {N}, [&](const VarHandle& n) { return a.load(n) + LongImm::make(1l); }); LoopNest nest({b}); @@ -6935,15 +6888,15 @@ TEST(LoopNest, compressMultipleBuffers) { } TEST(LoopNest, sanitizeNames) { - std::vector dim_args; + std::vector dim_args; // Let's pick names that would overlap with default index names if not // sanitized properly: - dim_args.emplace_back(ExprHandle(alloc("i", kInt)), ""); - dim_args.emplace_back(ExprHandle(alloc("N:2", kInt)), ""); + dim_args.emplace_back(ExprHandle(alloc("i", kInt))); + dim_args.emplace_back(ExprHandle(alloc("N:2", kInt))); // Now let's create a many dimensions so that we had to use the same letter // for different loops for (int i = 0; i < 10; i++) { - dim_args.emplace_back(ExprHandle(alloc("N", kInt)), ""); + dim_args.emplace_back(ExprHandle(alloc("N", kInt))); } // Now create two Computes with conflicting after sanitization names: diff --git a/test/cpp/tensorexpr/test_memdependency.cpp b/test/cpp/tensorexpr/test_memdependency.cpp index 7019353937b7..03ea24a87afd 100644 --- a/test/cpp/tensorexpr/test_memdependency.cpp +++ b/test/cpp/tensorexpr/test_memdependency.cpp @@ -76,6 +76,134 @@ TEST(MemDependency, BoundOverlap) { ASSERT_EQ(ContainedOrEqual, boundOverlap(CB(15, 15), CB(2, 15))); } +TEST(MemDependency, BoundComparison) { + using namespace analysis; + + auto CB = [](int s, int e) { + return Bound(alloc(s), alloc(e)); + }; + + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(10, 100), CB(10, 100), CompareSelectOperation::kEQ)); + ASSERT_EQ( + CmpEvalResult::TRUE, + compareBound(CB(10, 10), CB(10, 10), CompareSelectOperation::kEQ)); + ASSERT_EQ( + CmpEvalResult::FALSE, + compareBound(CB(10, 20), CB(30, 40), CompareSelectOperation::kEQ)); + ASSERT_EQ( + CmpEvalResult::FALSE, + compareBound(CB(30, 40), CB(10, 20), CompareSelectOperation::kEQ)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(30, 40), CB(40, 50), CompareSelectOperation::kEQ)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(30, 40), CB(20, 30), CompareSelectOperation::kEQ)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(30, 45), CB(40, 50), CompareSelectOperation::kEQ)); + + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(10, 100), CB(10, 100), CompareSelectOperation::kNE)); + ASSERT_EQ( + CmpEvalResult::FALSE, + compareBound(CB(10, 10), CB(10, 10), CompareSelectOperation::kNE)); + ASSERT_EQ( + CmpEvalResult::TRUE, + compareBound(CB(10, 20), CB(30, 40), CompareSelectOperation::kNE)); + ASSERT_EQ( + CmpEvalResult::TRUE, + compareBound(CB(30, 40), CB(10, 20), CompareSelectOperation::kNE)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(30, 40), CB(40, 50), CompareSelectOperation::kNE)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(30, 40), CB(20, 30), 
CompareSelectOperation::kEQ)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(30, 45), CB(40, 50), CompareSelectOperation::kNE)); + + ASSERT_EQ( + CmpEvalResult::TRUE, + compareBound(CB(10, 20), CB(30, 40), CompareSelectOperation::kLT)); + ASSERT_EQ( + CmpEvalResult::FALSE, + compareBound(CB(30, 40), CB(10, 20), CompareSelectOperation::kLT)); + ASSERT_EQ( + CmpEvalResult::FALSE, + compareBound(CB(30, 40), CB(10, 30), CompareSelectOperation::kLT)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(10, 100), CB(10, 100), CompareSelectOperation::kLT)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(30, 40), CB(40, 50), CompareSelectOperation::kLT)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(30, 45), CB(40, 50), CompareSelectOperation::kLT)); + + ASSERT_EQ( + CmpEvalResult::FALSE, + compareBound(CB(10, 20), CB(30, 40), CompareSelectOperation::kGE)); + ASSERT_EQ( + CmpEvalResult::TRUE, + compareBound(CB(30, 40), CB(10, 20), CompareSelectOperation::kGE)); + ASSERT_EQ( + CmpEvalResult::TRUE, + compareBound(CB(30, 40), CB(10, 30), CompareSelectOperation::kGE)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(10, 100), CB(10, 100), CompareSelectOperation::kGE)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(30, 40), CB(40, 50), CompareSelectOperation::kGE)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(30, 45), CB(40, 50), CompareSelectOperation::kGE)); + + ASSERT_EQ( + CmpEvalResult::FALSE, + compareBound(CB(10, 20), CB(30, 40), CompareSelectOperation::kGT)); + ASSERT_EQ( + CmpEvalResult::FALSE, + compareBound(CB(30, 40), CB(40, 50), CompareSelectOperation::kGT)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(10, 100), CB(10, 100), CompareSelectOperation::kGT)); + ASSERT_EQ( + CmpEvalResult::TRUE, + compareBound(CB(30, 40), CB(10, 20), CompareSelectOperation::kGT)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(30, 40), CB(10, 30), CompareSelectOperation::kGT)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(30, 45), CB(40, 50), CompareSelectOperation::kGT)); + + ASSERT_EQ( + CmpEvalResult::TRUE, + compareBound(CB(10, 20), CB(30, 40), CompareSelectOperation::kLE)); + ASSERT_EQ( + CmpEvalResult::TRUE, + compareBound(CB(30, 40), CB(40, 50), CompareSelectOperation::kLE)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(10, 100), CB(10, 100), CompareSelectOperation::kLE)); + ASSERT_EQ( + CmpEvalResult::FALSE, + compareBound(CB(30, 40), CB(10, 20), CompareSelectOperation::kLE)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(30, 40), CB(10, 30), CompareSelectOperation::kLE)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(30, 45), CB(40, 50), CompareSelectOperation::kLE)); +} + TEST(MemDependency, BoundOverlapSymbolic) { VarHandle x("x", kInt); VarHandle y("y", kInt); @@ -274,7 +402,7 @@ TEST(MemDependency, BoundSubtractMultiDim) { if (x.size() != y.size()) { return false; } - for (auto i = 0; i < x.size(); ++i) { + for (auto i = 0U; i < x.size(); ++i) { if (!indexBoundsEquals(x[i], y[i])) { return false; } @@ -338,7 +466,7 @@ TEST(MemDependency, BoundSubtractMultiDimSymbolic) { if (x.size() != y.size()) { return false; } - for (auto i = 0; i < x.size(); ++i) { + for (auto i = 0U; i < x.size(); ++i) { if (!indexBoundsEquals(x[i], y[i])) { return false; } @@ -543,8 +671,7 @@ TEST(MemDependency, MemDependencyCheckerLoopReduce) { */ StorePtr aInit = Store::make(a, {0}, 0); - 
ExprHandle reduce = - ExprHandle(Sum()(a.node(), ExprHandle(1), {x.node()}, {x.node()})); + ExprHandle reduce = Sum()(a, 1, {x}, {x}); StorePtr aReduce = Store::make(a, {0}, reduce); StmtPtr loop = For::make(x, 0, 10, aReduce); StorePtr bStore = Store::make(b, {0}, Load::make(a, {0})); @@ -2697,13 +2824,13 @@ TEST(MemDependency, MemDependencyCheckerComputeAPI) { BufHandle b_buf("b", {5, 6}, kFloat); Tensor c = Compute( "broadcast_add", - {{4, "m"}, {5, "n"}, {6, "k"}}, + {4, 5, 6}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) + b_buf.load(n, k); }); Tensor d = Compute( "d", - {{4, "m"}, {5, "n"}, {6, "k"}}, + {4, 5, 6}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return c.load(m, n, k) + 1; }); @@ -2742,13 +2869,13 @@ TEST(MemDependency, MemDependencyCheckerComputeInline) { BufHandle b_buf("b", {5, 6}, kFloat); Tensor c = Compute( "broadcast_add", - {{4, "m"}, {5, "n"}, {6, "k"}}, + {4, 5, 6}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) + b_buf.load(n, k); }); Tensor d = Compute( "d", - {{4, "m"}, {5, "n"}, {6, "k"}}, + {4, 5, 6}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return c.load(m, n, k) + 1; }); @@ -2777,7 +2904,7 @@ TEST(MemDependency, MemDependencyCheckerComputeSplit) { BufHandle b_buf("b", {5, 6}, kFloat); Tensor c = Compute( "broadcast_add", - {{4, "m"}, {5, "n"}, {6, "k"}}, + {4, 5, 6}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) + b_buf.load(n, k); }); @@ -2823,7 +2950,7 @@ TEST(MemDependency, MemDependencyCheckerComputeReorder) { BufHandle b_buf("b", {5, 6}, kFloat); Tensor c = Compute( "broadcast_add", - {{4, "m"}, {5, "n"}, {6, "k"}}, + {4, 5, 6}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) + b_buf.load(n, k); }); @@ -2889,11 +3016,11 @@ TEST(MemDependency, MemDependencyCheckerComputeReduce) { Tensor c = Compute( "scale", - {{2, "l2"}, {3, "n1"}, {6, "m1"}}, + {2, 3, 6}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor d = Reduce("sum", {{2, "l1"}}, Sum(), c, {{3, "n1"}, {6, "m1"}}); + Tensor d = Reduce("sum", {2}, Sum(), c, {3, 6}); LoopNest l({d}, {c, d}); MemDependencyChecker analyzer({a.node(), b.node()}, {d.buf()}); @@ -2925,12 +3052,12 @@ TEST(MemDependency, MemDependencyCheckerComputeGEMM) { BufHandle BP("B", {K, N}, kFloat); Tensor CT = Reduce( "gemm", - {{M, "M"}, {N, "N"}}, + {M, N}, Sum(), [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); }, - {{K, "K"}}); + {K}); LoopNest loop({CT}); { diff --git a/test/cpp/tensorexpr/test_memplanning.cpp b/test/cpp/tensorexpr/test_memplanning.cpp index ec58aa8f6668..f5ee8747650f 100644 --- a/test/cpp/tensorexpr/test_memplanning.cpp +++ b/test/cpp/tensorexpr/test_memplanning.cpp @@ -1,6 +1,8 @@ #include #include +#include +#include #include #include #include @@ -85,6 +87,232 @@ TEST(BufLiveRange, MulRangeLine) { ASSERT_TRUE(std::get<1>(range_b) == 1); } +TEST(MemPlanning, MemReuseWithTypeCast) { + int M = 4; + int N = 4; + int K = 4; + + BufHandle AP("A", {M, K}, kFloat); + BufHandle BP("B", {K, N}, kFloat); + + Tensor CT = Reduce( + "gemm", + {M, N}, + Sum(), + [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& k) { + return AP.load(m, k) * BP.load(k, n); + }, + {K}); + Tensor DT = + Compute("relu", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { + return 
CompareSelect::make( + CT.load(m, n), 0.0f, 0.0f, CT.load(m, n), kLT); + }); + Tensor ET = + Compute("E", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { + return Cast::make(kQUInt8, DT.load(m, n) + DT.load(m, n)); + }); + Tensor FT = + Compute("F", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { + return ET.load(m, n); + }); + StmtPtr stmt = + tensorexpr::Block::make({CT.stmt(), DT.stmt(), ET.stmt(), FT.stmt()}); + + // Constructed stmt: + // Intermediate buffers and their liveness ranges: gemm [0, 1], relu [1, 2], + // E [2, 3]. The dimensions of 'gemm' and 'E' are the same but their types are + // different: 'E' type quint8 < 'gemm' type float. We'll reuse 'gemm' for 'E' + // with typecasting. + //{ + // for (int i = 0; i < 4; i++) { + // for (int i_1 = 0; i_1 < 4; i_1++) { + // gemm[i, i_1] = float(0); + // for (int i_2 = 0; i_2 < 4; i_2++) { + // gemm[i, i_1] = ReduceOp((gemm[i, i_1]) + (A[i, i_2]) * (B[i_2, + // i_1]), reduce_args={i_2}); + // } + // } + // } + // for (int i_3 = 0; i_3 < 4; i_3++) { + // for (int i_4 = 0; i_4 < 4; i_4++) { + // relu[i_3, i_4] = (gemm[i_3, i_4])<0.f ? 0.f : (gemm[i_3, i_4]); + // } + // } + // for (int i_5 = 0; i_5 < 4; i_5++) { + // for (int i_6 = 0; i_6 < 4; i_6++) { + // E[i_5, i_6] = quint8((relu[i_5, i_6]) + (relu[i_5, i_6])); + // } + // } + // for (int i_7 = 0; i_7 < 4; i_7++) { + // for (int i_8 = 0; i_8 < 4; i_8++) { + // F[i_7, i_8] = E[i_7, i_8]; + // } + // } + //} + + LoopNest l(stmt, {FT.buf()}); + l.prepareForCodegen(); + SimpleIREvaluator cg(Stmt::clone(l.root_stmt()), {AP, BP, FT}); + + checkIR(cg.stmt(), R"IR( +# CHECK: Allocate(gemm); // dtype=float, dims=[4, 4] +# CHECK: Allocate(relu); // dtype=float, dims=[4, 4] +# CHECK: Alias(E,gemm); +# CHECK: Free(relu); +# CHECK: Free(gemm))IR"); + + PaddedBuffer a_v(M, K, "a"); + PaddedBuffer b_v(K, N, "b"); + PaddedBuffer o1(M, N, "e_before"); + PaddedBuffer o2(M, N, "e_after"); + + for (const auto m : c10::irange(M)) { + for (const auto k : c10::irange(K)) { + a_v(m, k) = at::randn({1}).item().to(); + } + } + + for (const auto k : c10::irange(K)) { + for (const auto n : c10::irange(N)) { + b_v(k, n) = at::randn({1}).item().to(); + } + } + + cg.call({a_v, b_v, o1}); + +#ifdef TORCH_ENABLE_LLVM + LLVMCodeGen cg_llvm(Stmt::clone(l.root_stmt()), {AP, BP, FT}); + + checkIR(cg_llvm.stmt(), R"IR( +# CHECK: Allocate(gemm); // dtype=float, dims=[4, 4] +# CHECK: Allocate(relu); // dtype=float, dims=[4, 4] +# CHECK: Alias(E,gemm); +# CHECK: Free(relu); +# CHECK: Free(gemm))IR"); + + cg_llvm.call({a_v, b_v, o2}); + + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + ExpectAllNear(o1, o2, 1e-5); +#endif +} + +TEST(MemPlanning, NoMemReuseForLargerType) { + int M = 4; + int N = 4; + int K = 4; + + BufHandle AP("A", {M, K}, kShort); + BufHandle BP("B", {K, N}, kShort); + + Tensor CT = Reduce( + "gemm", + {M, N}, + Sum(), + [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& k) { + return AP.load(m, k) * BP.load(k, n); + }, + {K}); + auto zero = Cast::make(CT.buf()->dtype(), 0); + Tensor DT = + Compute("relu", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { + return CompareSelect::make( + CT.load(m, n), zero, zero, CT.load(m, n), kLT); + }); + Tensor ET = + Compute("E", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { + return Cast::make(kFloat, DT.load(m, n) + DT.load(m, n)); + }); + Tensor FT = + Compute("F", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { + return ET.load(m, n); + }); + StmtPtr stmt = + tensorexpr::Block::make({CT.stmt(), 
DT.stmt(), ET.stmt(), FT.stmt()}); + + // Constructed stmt: + // Intermediate buffers and their liveness ranges: gemm [0, 1], relu [1, 2], + // E [2, 3]. The dimensions of 'gemm' and 'E' are the same but their types are + // different: 'E' type float > 'gemm' type int16. We won't reuse 'gemm' for + // 'E'. + //{ + // for (int i = 0; i < 4; i++) { + // for (int i_1 = 0; i_1 < 4; i_1++) { + // gemm[i, i_1] = int16_t(0); + // for (int i_2 = 0; i_2 < 4; i_2++) { + // gemm[i, i_1] = ReduceOp((gemm[i, i_1]) + (A[i, i_2]) * (B[i_2, + // i_1]), reduce_args={i_2}); + // } + // } + // } + // for (int i_3 = 0; i_3 < 4; i_3++) { + // for (int i_4 = 0; i_4 < 4; i_4++) { + // relu[i_3, i_4] = (gemm[i_3, i_4]) a_v(M, K, "a"); + PaddedBuffer b_v(K, N, "b"); + PaddedBuffer o1(M, N, "e_before"); + PaddedBuffer o2(M, N, "e_after"); + + for (const auto m : c10::irange(M)) { + for (const auto k : c10::irange(K)) { + a_v(m, k) = at::randn({1}).item().to(); + } + } + + for (const auto k : c10::irange(K)) { + for (const auto n : c10::irange(N)) { + b_v(k, n) = at::randn({1}).item().to(); + } + } + + cg.call({a_v, b_v, o1}); + +#ifdef TORCH_ENABLE_LLVM + LLVMCodeGen cg_llvm(Stmt::clone(l.root_stmt()), {AP, BP, FT}); + + checkIR(cg_llvm.stmt(), R"IR( +# CHECK: Allocate(gemm); // dtype=int16_t, dims=[4, 4] +# CHECK: Allocate(relu); // dtype=int16_t, dims=[4, 4] +# CHECK: Allocate(E); // dtype=float, dims=[4, 4] +# CHECK: Free(E); +# CHECK: Free(relu); +# CHECK: Free(gemm))IR"); + + cg_llvm.call({a_v, b_v, o2}); + + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + ExpectAllNear(o1, o2, 1e-5); +#endif +} + TEST(MemPlanning, SameBufSizeMemReuse) { int M = 1024; int N = 1024; @@ -95,30 +323,24 @@ TEST(MemPlanning, SameBufSizeMemReuse) { Tensor CT = Reduce( "gemm", - {{M, "M"}, {N, "N"}}, + {M, N}, Sum(), [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); }, - {{K, "K"}}); - Tensor DT = Compute( - "relu", - {{M, "M"}, {N, "N"}}, - [&](const ExprHandle& m, const ExprHandle& n) { + {K}); + Tensor DT = + Compute("relu", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { auto zero = Cast::make(CT.buf()->dtype(), 0); return CompareSelect::make( CT.load(m, n), zero, zero, CT.load(m, n), kLT); }); - Tensor ET = Compute( - "add", - {{M, "M"}, {N, "N"}}, - [&](const ExprHandle& m, const ExprHandle& n) { + Tensor ET = + Compute("add", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { return DT.load(m, n) + DT.load(m, n); }); - Tensor FT = Compute( - "mul", - {{M, "M"}, {N, "N"}}, - [&](const ExprHandle& m, const ExprHandle& n) { + Tensor FT = + Compute("mul", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { return ET.load(m, n) * ET.load(m, n); }); auto stmt = Block::make({CT.stmt(), DT.stmt(), ET.stmt(), FT.stmt()}); @@ -188,36 +410,28 @@ TEST(MemPlanning, SameBufSizeMultiMemReuses) { Tensor CT = Reduce( "gemm", - {{M, "M"}, {N, "N"}}, + {M, N}, Sum(), [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); }, - {{K, "K"}}); - Tensor DT = Compute( - "relu", - {{M, "M"}, {N, "N"}}, - [&](const ExprHandle& m, const ExprHandle& n) { + {K}); + Tensor DT = + Compute("relu", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { auto zero = Cast::make(CT.buf()->dtype(), 0); return CompareSelect::make( CT.load(m, n), zero, zero, CT.load(m, n), kLT); }); - Tensor ET = Compute( - "add", - {{M, "M"}, {N, "N"}}, - [&](const ExprHandle& m, const ExprHandle& n) { + Tensor ET = + Compute("add", {M, N}, 
[&](const ExprHandle& m, const ExprHandle& n) { return DT.load(m, n) + DT.load(m, n); }); - Tensor FT = Compute( - "mul", - {{M, "M"}, {N, "N"}}, - [&](const ExprHandle& m, const ExprHandle& n) { + Tensor FT = + Compute("mul", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { return ET.load(m, n) * ET.load(m, n); }); - Tensor GT = Compute( - "sub", - {{M, "M"}, {N, "N"}}, - [&](const ExprHandle& m, const ExprHandle& n) { + Tensor GT = + Compute("sub", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { return FT.load(m, n) - ET.load(m, n); }); @@ -296,42 +510,32 @@ TEST(MemPlanning, SameBufSizeMultiMemReusesOfOneBuf) { Tensor CT = Reduce( "gemm", - {{M, "M"}, {N, "N"}}, + {M, N}, Sum(), [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); }, - {{K, "K"}}); - Tensor DT = Compute( - "relu", - {{M, "M"}, {N, "N"}}, - [&](const ExprHandle& m, const ExprHandle& n) { + {K}); + Tensor DT = + Compute("relu", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { auto zero = Cast::make(CT.buf()->dtype(), 0); return CompareSelect::make( CT.load(m, n), zero, zero, CT.load(m, n), kLT); }); - Tensor ET = Compute( - "add", - {{M, "M"}, {N, "N"}}, - [&](const ExprHandle& m, const ExprHandle& n) { + Tensor ET = + Compute("add", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { return DT.load(m, n) + DT.load(m, n); }); - Tensor FT = Compute( - "mul", - {{M, "M"}, {N, "N"}}, - [&](const ExprHandle& m, const ExprHandle& n) { + Tensor FT = + Compute("mul", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { return ET.load(m, n) * ET.load(m, n); }); - Tensor GT = Compute( - "sub", - {{M, "M"}, {N, "N"}}, - [&](const ExprHandle& m, const ExprHandle& n) { + Tensor GT = + Compute("sub", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { return FT.load(m, n) - 1; }); - Tensor HT = Compute( - "div", - {{M, "M"}, {N, "N"}}, - [&](const ExprHandle& m, const ExprHandle& n) { + Tensor HT = + Compute("div", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { return GT.load(m, n) / 2; }); @@ -418,30 +622,24 @@ TEST(MemPlanning, SmallerBufSizeNonMemReuse) { Tensor CT = Reduce( "gemm", - {{M, "M"}, {N, "N"}}, + {M, N}, Sum(), [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); }, - {{K, "K"}}); - Tensor DT = Compute( - "relu", - {{M, "M"}, {N, "N"}}, - [&](const ExprHandle& m, const ExprHandle& n) { + {K}); + Tensor DT = + Compute("relu", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { auto zero = Cast::make(CT.buf()->dtype(), 0); return CompareSelect::make( CT.load(m, n), zero, zero, CT.load(m, n), kLT); }); Tensor ET = Compute( - "add", - {{M * 2, "EM"}, {N * 2, "EN"}}, - [&](const ExprHandle& em, const ExprHandle& en) { + "add", {M * 2, N * 2}, [&](const ExprHandle& em, const ExprHandle& en) { return DT.load(em / 2, en / 2) + DT.load(em / 2, en / 2); }); Tensor FT = Compute( - "mul", - {{M * 2, "FM"}, {N * 2, "FN"}}, - [&](const ExprHandle& fm, const ExprHandle& fn) { + "mul", {M * 2, N * 2}, [&](const ExprHandle& fm, const ExprHandle& fn) { return ET.load(fm, fn) * ET.load(fm, fn); }); auto stmt = Block::make({CT.stmt(), DT.stmt(), ET.stmt(), FT.stmt()}); diff --git a/test/cpp/tensorexpr/test_ops.cpp b/test/cpp/tensorexpr/test_ops.cpp index e4c9155ff60c..379c901968d5 100644 --- a/test/cpp/tensorexpr/test_ops.cpp +++ b/test/cpp/tensorexpr/test_ops.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -24,12 +25,15 @@ TEST(Ops, Sum) { constexpr int N = 
16; std::vector testDims = {{0}, {1}, {0, 1}}; std::vector> outputShapes = {{N}, {M}, {}}; - for (int idx = 0; idx < testDims.size(); idx++) { + for (unsigned idx = 0; idx < testDims.size(); idx++) { const auto& dims = testDims[idx]; const auto& outShape = outputShapes[idx]; BufHandle a("a", {M, N}, kFloat); - Tensor b = computeSum({a, dims, false}, outShape, c10::kFloat, at::kCPU); + std::vector outStrides = + c10::fmap(make_contiguous_strides(outShape)); + Tensor b = computeSum( + {a, dims, false}, outShape, outStrides, c10::kFloat, at::kCPU); auto cg = compile({a}, {b}); auto at = at::arange(M * N, at::kFloat).view({M, N}); @@ -41,3 +45,34 @@ TEST(Ops, Sum) { ASSERT_TRUE(at::allclose(bt, ref)); } } + +TEST(Ops, ChannelsLastSum) { + constexpr int A = 2; + constexpr int B = 3; + constexpr int C = 4; + constexpr int D = 5; + constexpr int E = 6; + std::vector testDims = {{0}, {1}, {0, 1}}; + + std::vector> outputShapes = { + {B, C, D, E}, {A, C, D, E}, {C, D, E}}; + for (unsigned idx = 0; idx < testDims.size(); idx++) { + const auto& dims = testDims[idx]; + const auto& outShape = outputShapes[idx]; + + BufHandle a("a", {A, B, C, D, E}, kFloat); + std::vector outStrides = + c10::fmap(make_channels_last_strides(outShape)); + Tensor b = computeSum( + {a, dims, false}, outShape, outStrides, c10::kFloat, at::kCPU); + auto cg = compile({a}, {b}); + + auto at = at::arange(A * B * C * D * E, at::kFloat).view({A, B, C, D, E}); + auto ref = at::sum(at, dims); + auto bt = at::empty_like(ref); + + cg->call({at.data_ptr(), bt.data_ptr()}); + + ASSERT_TRUE(at::allclose(bt, ref)); + } +} diff --git a/test/cpp/tensorexpr/test_quantization.cpp b/test/cpp/tensorexpr/test_quantization.cpp index f6643c86846f..82eb8573cff5 100644 --- a/test/cpp/tensorexpr/test_quantization.cpp +++ b/test/cpp/tensorexpr/test_quantization.cpp @@ -1,6 +1,6 @@ #include -#include +#include #include #include #include @@ -90,6 +90,38 @@ TEST_F(Quantization, QuantDequantUInt8) { CHECK_EQ(check, 1); } +TEST_F(Quantization, QuantDequantUInt8_NLC) { + const auto graph_string = R"IR( + graph(%x.1 : Float(1, 2, 2, strides=[4, 1, 2], device=cpu)): + %2 : int = prim::Constant[value=13]() + %3 : int = prim::Constant[value=122]() + %4 : float = prim::Constant[value=0.1]() + %q.1 : QUInt8(1, 2, 2) = aten::quantize_per_tensor(%x.1, %4, %3, %2) + %6 : Float(1, 2, 2) = aten::dequantize(%q.1) + return (%6))IR"; + auto graph = std::make_shared(); + parseIR(graph_string, &*graph); + + auto x = 2 * at::rand({1, 2, 2}, TensorOptions(kCPU).dtype(at::kFloat)); + x.unsafeGetTensorImpl()->set_sizes_and_strides({1, 2, 2}, {4, 1, 2}); + auto q = at::quantize_per_tensor(x, 0.1f, 122, at::kQUInt8); + auto y_expected = at::dequantize(q); + TensorExprKernel k(graph); + std::vector inputs = {x}; + StmtPtr s = k.getCodeGenStmt(); + + std::vector stack = fmap(inputs); + k.run(stack); + auto y = stack[0].toTensor(); + bool check = at::allclose(y_expected, y); + if (!check) { + std::cout << "x:\n" << x << std::endl; + std::cout << "y_expected:\n" << y_expected << std::endl; + std::cout << "y:\n" << y << std::endl; + } + CHECK_EQ(check, 1); +} + at::Tensor quantized_add( at::Tensor x1, at::Tensor x2, @@ -189,7 +221,99 @@ TEST_F(Quantization, QuantAddDequantUInt8) { CHECK_EQ(check, 1); } -TEST_F(Quantization, QuantUpsampleNearest2dDequantUInt8) { +TEST_F(Quantization, QuantSigmoidDequantUInt8) { + const auto graph_string = R"IR( + graph(%x1 : Float(2, 2, strides=[2, 1], device=cpu)): + %2 : int = prim::Constant[value=13]() + %qz1 : int = prim::Constant[value=13]() + 
%qs1 : float = prim::Constant[value=0.1]() + %q1 : QUInt8(2, 2) = aten::quantize_per_tensor(%x1, %qs1, %qz1, %2) + %qa : QUInt8(2, 2) = aten::sigmoid(%q1) + %6 : Float(2, 2) = aten::dequantize(%qa) + return (%6))IR"; + auto graph = std::make_shared(); + parseIR(graph_string, &*graph); + + auto x1 = at::rand({2, 2}, TensorOptions(kCPU).dtype(at::kFloat)); + auto q1 = at::quantize_per_tensor(x1, 0.1f, 13, at::kQUInt8); + auto qs = at::sigmoid(q1); + auto y_expected = at::dequantize(qs); + + TensorExprKernel k(graph); + std::vector inputs = {x1}; + StmtPtr s = k.getCodeGenStmt(); + + std::vector stack = fmap(inputs); + k.run(stack); + auto y = stack[0].toTensor(); + bool check = at::allclose(y_expected, y); + if (!check) { + std::cout << "x1:\n" << x1 << std::endl; + std::cout << "q1:\n" << q1 << std::endl; + std::cout << "qs:\n" << qs << std::endl; + std::cout << "y_expected:\n" << y_expected << std::endl; + std::cout << "y:\n" << y << std::endl; + } + CHECK_EQ(check, 1); +} + +at::Tensor quantized_mul( + at::Tensor x1, + at::Tensor x2, + double scale, + int64_t zero) { + const auto op = + c10::Dispatcher::singleton() + .findSchemaOrThrow("quantized::mul", "") + .typed(); + return op.call(x1, x2, scale, zero); +} + +TEST_F(Quantization, QuantMulDequantUInt8) { + const auto graph_string = R"IR( + graph(%x1 : Float(2, 2, strides=[2, 1], device=cpu), %x2 : Float(2, 2, strides=[2, 1], device=cpu)): + %2 : int = prim::Constant[value=13]() + %qz1 : int = prim::Constant[value=13]() + %qs1 : float = prim::Constant[value=0.1]() + %qz2 : int = prim::Constant[value=13]() + %qs2 : float = prim::Constant[value=0.1]() + %qza : int = prim::Constant[value=13]() + %qsa : float = prim::Constant[value=0.1]() + %q1 : QUInt8(2, 2) = aten::quantize_per_tensor(%x1, %qs1, %qz1, %2) + %q2 : QUInt8(2, 2) = aten::quantize_per_tensor(%x2, %qs2, %qz2, %2) + %qa : QUInt8(2, 2) = quantized::mul(%q1, %q2, %qsa, %qza) + %6 : Float(2, 2) = aten::dequantize(%qa) + return (%6))IR"; + auto graph = std::make_shared(); + parseIR(graph_string, &*graph); + + auto x1 = at::rand({2, 2}, TensorOptions(kCPU).dtype(at::kFloat)); + auto x2 = at::rand({2, 2}, TensorOptions(kCPU).dtype(at::kFloat)); + auto q1 = at::quantize_per_tensor(x1, 0.1f, 13, at::kQUInt8); + auto q2 = at::quantize_per_tensor(x2, 0.1f, 13, at::kQUInt8); + auto qa = quantized_mul(q1, q2, 0.1f, 13); + auto y_expected = at::dequantize(qa); + + TensorExprKernel k(graph); + std::vector inputs = {x1, x2}; + StmtPtr s = k.getCodeGenStmt(); + + std::vector stack = fmap(inputs); + k.run(stack); + auto y = stack[0].toTensor(); + bool check = at::allclose(y_expected, y); + if (!check) { + std::cout << "x1:\n" << x1 << std::endl; + std::cout << "q1:\n" << q1 << std::endl; + std::cout << "x2:\n" << x2 << std::endl; + std::cout << "q2:\n" << q2 << std::endl; + std::cout << "y_expected:\n" << y_expected << std::endl; + std::cout << "y:\n" << y << std::endl; + } + CHECK_EQ(check, 1); +} + +TEST_F(Quantization, QuantUpsampleNearst2dDequantUInt8) { const auto graph_string = R"IR( graph(%x : Float(1, 1, 4, 4, strides=[16, 16, 4, 1], device=cpu)): %2 : int = prim::Constant[value=13]() diff --git a/test/cpp/tensorexpr/test_reductions.cpp b/test/cpp/tensorexpr/test_reductions.cpp index dc0ecceb980e..5d3c44574234 100644 --- a/test/cpp/tensorexpr/test_reductions.cpp +++ b/test/cpp/tensorexpr/test_reductions.cpp @@ -35,7 +35,7 @@ TEST(Reductions, ReduceSum0D_1) { std::vector out(M, -1.f); - Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {}); + Tensor c = Reduce("sum", {M}, Sum(), b, {}); 
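// As with Compute, Reduce now takes plain extents: the first list gives the
// output (non-reduced) dimensions and the last one the reduction dimensions.
// For example, the 2D case further below changes from
//
//   Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}});
//
// to
//
//   Tensor c = Reduce("sum", {M}, Sum(), b, {N});
//
// In this 0D test the reduction list is empty, so "sum" effectively passes the
// M input values through the Sum reducer unchanged.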
LoopNest loop({c}); loop.prepareForCodegen(); StmtPtr s = loop.root_stmt(); @@ -80,7 +80,7 @@ TEST(Reductions, ReduceSum1D) { std::vector out(1, -1.f); - Tensor c = Reduce("sum", {}, Sum(), b, {{10, "m"}}); + Tensor c = Reduce("sum", {}, Sum(), b, {10}); LoopNest loop({c}); loop.prepareForCodegen(); StmtPtr s = loop.root_stmt(); @@ -109,7 +109,7 @@ TEST(Reductions, ReduceSum2D) { std::vector out(M, -1.f); - Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}}); + Tensor c = Reduce("sum", {M}, Sum(), b, {N}); LoopNest loop({c}); loop.prepareForCodegen(); StmtPtr s = loop.root_stmt(); @@ -138,7 +138,7 @@ TEST(Reductions, ReduceSum3D) { BufHandle b("b", {2, 3, m}, kFloat); - Tensor c = Reduce("sum", {{2, "l"}, {3, "n"}}, Sum(), b, {{m, "m"}}); + Tensor c = Reduce("sum", {2, 3}, Sum(), b, {m}); LoopNest loop({c}); loop.prepareForCodegen(); StmtPtr s = loop.root_stmt(); @@ -168,7 +168,7 @@ TEST(Reductions, ReduceSum3D) { ASSERT_EQ(cData[i], expected); } - Tensor d = Reduce("sum2", {{2, "l"}}, Sum(), b, {{3, "n"}, {m, "m"}}); + Tensor d = Reduce("sum2", {2}, Sum(), b, {3, m}); LoopNest loop2({d}); loop2.prepareForCodegen(); StmtPtr s2 = loop2.root_stmt(); @@ -186,7 +186,7 @@ TEST(Reductions, ReduceSum3D) { // This is the same as just reducing the original result across that axis. BufHandle c_buf(c.buf()); - Tensor e = Reduce("sum3", {{2, "l"}}, Sum(), c_buf, {{3, "m"}}); + Tensor e = Reduce("sum3", {2}, Sum(), c_buf, {3}); LoopNest loop3({e}); loop3.prepareForCodegen(); StmtPtr s3 = loop3.root_stmt(); @@ -210,12 +210,7 @@ TEST(Reductions, ReduceSum10D) { std::vector in(InputSize, 1.f); std::vector out(OutputSize, -1.f); - Tensor c = Reduce( - "sum", - {{2, "a"}, {3, "b"}, {2, "c"}, {3, "d"}, {2, "e"}}, - Sum(), - in_, - {{3, "f"}, {2, "g"}, {3, "h"}, {2, "i"}, {3, "j"}}); + Tensor c = Reduce("sum", {2, 3, 2, 3, 2}, Sum(), in_, {3, 2, 3, 2, 3}); LoopNest loop({c}); loop.prepareForCodegen(); StmtPtr s = loop.root_stmt(); @@ -250,7 +245,7 @@ TEST(Reductions, ReduceProduct) { Reducer product( ExprHandle(1.f), [](ExprHandle a, ExprHandle b) { return a * b; }); - Tensor c = Reduce("product", {{M, "m"}}, product, b, {{N, "n"}}); + Tensor c = Reduce("product", {M}, product, b, {N}); LoopNest loop({c}); loop.prepareForCodegen(); StmtPtr s = loop.root_stmt(); @@ -281,7 +276,7 @@ TEST(Reductions, ReduceMax) { in[j] = j; } - Tensor dm1 = Reduce("max", {}, Maximum(kFloat), in_, {{10, "m"}}); + Tensor dm1 = Reduce("max", {}, Maximum(kFloat), in_, {10}); LoopNest loop({dm1}); loop.prepareForCodegen(); @@ -296,7 +291,7 @@ TEST(Reductions, ReduceMax) { BufHandle in2_("b", {2, 5}, kFloat); std::vector out2(2, -1.f); - Tensor m2d = Reduce("max", {{2, "n"}}, Maximum(kFloat), in2_, {{5, "m"}}); + Tensor m2d = Reduce("max", {2}, Maximum(kFloat), in2_, {5}); LoopNest loop2({m2d}); loop2.prepareForCodegen(); @@ -326,7 +321,7 @@ TEST(Reductions, ReduceMinCustomInitializer) { {}, Minimum(ExprHandle(minInit)), [&](ParameterList& v) { return in_.load(v); }, - {{10, "m"}}); + {10}); LoopNest loop({min}); loop.prepareForCodegen(); @@ -357,12 +352,12 @@ TEST(Reductions, ReduceAnyAll) { Tensor any = Reduce( "anyEqual", - {{4, "i"}}, + {4}, anyEqSV, [&](const auto& i, const auto& j) { return CompareSelect::make(b.load(i, j), searchValue, kEQ); }, - {{10, "j"}}); + {10}); LoopNest loop({any}); loop.prepareForCodegen(); @@ -400,12 +395,12 @@ TEST(Reductions, ReduceAnyAll) { Tensor allGreaterThan = Reduce( "allGreaterThan", - {{4, "i"}}, + {4}, allGTSV, [&](const auto& i, const auto& j) { return CompareSelect::make(b.load(i, 
j), searchValue, kGT); }, - {{10, "j"}}); + {10}); LoopNest loop2({allGreaterThan}); loop2.prepareForCodegen(); @@ -448,12 +443,12 @@ TEST(Reductions, ReduceMatmul2D) { Tensor mm = Reduce( "mm", - {{3, "m"}, {3, "n"}}, + {3, 3}, Sum(), [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& k) { return tA.load(m, k) * tB.load(k, n); }, - {{2, "k"}}); + {2}); LoopNest loop({mm}); loop.prepareForCodegen(); @@ -480,10 +475,10 @@ TEST(Reductions, ReduceRfactorLike) { std::vector in_rf_(10, -2.f); std::vector out(1, -1.f); - Tensor l1 = Reduce("l1", {{10, "i"}}, Sum(), in, {{10, "j"}}); + Tensor l1 = Reduce("l1", {10}, Sum(), in, {10}); BufHandle in_rf(l1.buf()); - Tensor l2 = Reduce("l2", {}, Sum(), in_rf, {{10, "i"}}); + Tensor l2 = Reduce("l2", {}, Sum(), in_rf, {10}); LoopNest loop({l1, l2}); loop.prepareForCodegen(); @@ -503,11 +498,9 @@ TEST(Reductions, ReduceAsProducer) { BufHandle a("a", {2, 3}, kFloat); BufHandle b("b", {2, 3, m}, kFloat); - Tensor c = Reduce("sum", {{2, "l1"}, {3, "n1"}}, Sum(), b, {{m, "m1"}}); - Tensor d = Compute( - "scale", - {{2, "l2"}, {3, "n1"}}, - [&](const VarHandle& l, const VarHandle& n) { + Tensor c = Reduce("sum", {2, 3}, Sum(), b, {m}); + Tensor d = + Compute("scale", {2, 3}, [&](const VarHandle& l, const VarHandle& n) { return c.load(l, n) * a.load(l, n); }); LoopNest loop({d}, {c, d}); @@ -548,11 +541,11 @@ TEST(Reductions, ReduceAsConsumer) { Tensor c = Compute( "scale", - {{2, "l2"}, {3, "n1"}, {m, "m1"}}, + {2, 3, m}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor d = Reduce("sum", {{2, "l1"}}, Sum(), c, {{3, "n1"}, {m, "m1"}}); + Tensor d = Reduce("sum", {2}, Sum(), c, {3, m}); LoopNest loop({d}, {c, d}); loop.prepareForCodegen(); StmtPtr s = loop.root_stmt(); @@ -599,7 +592,7 @@ TEST(Reductions, SplitReduceAxis) { } std::vector out(16, -1.f); - Tensor tensor = Reduce("sum", {{16, "m"}}, Sum(), in, {{8, "n"}}); + Tensor tensor = Reduce("sum", {16}, Sum(), in, {8}); LoopNest l({tensor}); std::vector loops = l.getLoopStmtsFor(tensor); LoopNest::splitWithTail(loops[1], 2); @@ -627,7 +620,7 @@ TEST(Reductions, SplitNonReduceAxis) { } } std::vector out(16, -1.f); - Tensor tensor = Reduce("sum", {{16, "m"}}, Sum(), in, {{8, "n"}}); + Tensor tensor = Reduce("sum", {16}, Sum(), in, {8}); LoopNest l({tensor}); std::vector loops = l.getLoopStmtsFor(tensor); LoopNest::splitWithTail(loops[0], 2); @@ -657,14 +650,14 @@ TEST(Reductions, ReorderedReductionInitializer) { BufHandle in("in", {1, 12, 6}, kFloat); std::vector in_(12 * 6, 1.f); - Tensor tensor_ = Reduce("sum", {{1, "k"}, {12, "n"}}, Sum(), in, {{6, "m"}}); + Tensor tensor_ = Reduce("sum", {1, 12}, Sum(), in, {6}); LoopNest l_({tensor_}); l_.prepareForCodegen(); StmtPtr s_ = Stmt::clone(l_.root_stmt()); s_ = IRSimplifier::simplify(s_); - Tensor tensor = Reduce("sum", {{1, "k"}, {12, "n"}}, Sum(), in, {{6, "m"}}); + Tensor tensor = Reduce("sum", {1, 12}, Sum(), in, {6}); LoopNest l({tensor}); auto loops = l.getLoopStmtsFor(tensor); @@ -709,7 +702,7 @@ TEST(Reductions, ReduceRfactor) { std::vector out(1, -1.f); - Tensor c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}}); + Tensor c = Reduce("sum", {}, Sum(), b, {m, n}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); auto c_body = loop.getAllWritesToBuf(c.buf())[1]; @@ -742,7 +735,7 @@ TEST(Reductions, Reduce3DRfactorInner) { std::vector out(1, -1.f); - Tensor c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}}); + Tensor c = Reduce("sum", {}, 
Sum(), b, {m, n, k}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); auto c_body = loop.getAllWritesToBuf(c.buf())[1]; @@ -775,7 +768,7 @@ TEST(Reductions, Reduce3DRfactorOuter) { std::vector out(1, -1.f); - Tensor c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}}); + Tensor c = Reduce("sum", {}, Sum(), b, {m, n, k}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); auto c_body = loop.getAllWritesToBuf(c.buf())[1]; @@ -799,12 +792,7 @@ TEST(Reductions, ReduceRepeatedInternalRfactor) { std::vector out(1, -1.f); std::vector ref(1, -1.f); - Tensor c = Reduce( - "sum", - {}, - Sum(), - in_, - {{2, "a"}, {3, "b"}, {4, "c"}, {5, "d"}, {6, "e"}}); + Tensor c = Reduce("sum", {}, Sum(), in_, {2, 3, 4, 5, 6}); LoopNest orig_loop({c}); // Try rfactoring N outer loops @@ -850,7 +838,7 @@ TEST(Reductions, ReduceSplitTail) { for (const auto i : c10::irange(3)) { std::vector out(M, -1.f); - Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); + Tensor c = Reduce("sum", {M}, Sum(), b, {N, K}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); LoopNest::splitWithTail(loops[i], 8); @@ -880,7 +868,7 @@ TEST(Reductions, ReduceSplitNoTail) { for (const auto i : c10::irange(3)) { std::vector out(M, -1.f); - Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); + Tensor c = Reduce("sum", {M}, Sum(), b, {N, K}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); LoopNest::splitWithTail(loops[i], 5); @@ -912,7 +900,7 @@ TEST(Reductions, ReduceOverSplitTail) { for (const auto i : c10::irange(3)) { std::vector out(M, -1.f); - Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); + Tensor c = Reduce("sum", {M}, Sum(), b, {N, K}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); LoopNest::splitWithTail(loops[i], 16); @@ -943,7 +931,7 @@ TEST(Reductions, ReduceSplitMask) { for (const auto i : c10::irange(3)) { std::vector out(M, -1.f); - Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); + Tensor c = Reduce("sum", {M}, Sum(), b, {N, K}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); LoopNest::splitWithMask(loops[i], 8); @@ -973,7 +961,7 @@ TEST(Reductions, ReduceSplitNoMask) { for (const auto i : c10::irange(3)) { std::vector out(M, -1.f); - Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); + Tensor c = Reduce("sum", {M}, Sum(), b, {N, K}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); LoopNest::splitWithMask(loops[i], 5); @@ -1004,7 +992,7 @@ TEST(Reductions, ReduceOverSplitMask) { for (const auto i : c10::irange(3)) { std::vector out(M, -1.f); - Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); + Tensor c = Reduce("sum", {M}, Sum(), b, {N, K}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); LoopNest::splitWithMask(loops[i], 16); @@ -1038,7 +1026,7 @@ TEST(Reductions, ReduceSplitRfactor) { std::vector out(M, -1.f); - Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); + Tensor c = Reduce("sum", {M}, Sum(), b, {N, K}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); LoopNest::splitWithTail(loops[2], SPLIT_FACTOR); @@ -1078,7 +1066,7 @@ TEST(Reductions, ReduceOverSplitRfactor) { std::vector out(1, -1.f); - Tensor c = Reduce("sum", {}, Sum(), b, {{N, "n"}, {K, "k"}}); + Tensor c = Reduce("sum", {}, Sum(), b, {N, K}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ 
-1128,10 +1116,9 @@ TEST(Reductions, ReduceInlineReduction) { BufHandle a_buf("a", {M}, kFloat); BufHandle b_buf("b", {M, N, K}, kFloat); - Tensor x = Reduce("x", {{M, "m1"}}, Sum(), b_buf, {{N, "n1"}, {K, "k1"}}); - Tensor y = Compute("y", {{M, "m2"}}, [&](const VarHandle& m) { - return a_buf.load(m) + x.load(m); - }); + Tensor x = Reduce("x", {M}, Sum(), b_buf, {N, K}); + Tensor y = Compute( + "y", {M}, [&](const VarHandle& m) { return a_buf.load(m) + x.load(m); }); PaddedBuffer a_v(M); PaddedBuffer b_v(M, N, K); @@ -1162,11 +1149,11 @@ TEST(Reductions, ReduceInlineConsumer) { Tensor x = Compute( "x", - {{M, "m1"}, {N, "n1"}, {K, "k1"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n, k) + b_buf.load(m, n, k); }); - Tensor y = Reduce("y", {{M, "m2"}}, Sum(), x, {{N, "n2"}, {K, "k2"}}); + Tensor y = Reduce("y", {M}, Sum(), x, {N, K}); PaddedBuffer a_v(M, N, K); PaddedBuffer b_v(M, N, K); @@ -1215,7 +1202,7 @@ TEST(Reductions, ReduceInlineReducerInternal) { Tensor x = Compute( "x", - {{M, "m1"}, {N, "n1"}, {K, "k1"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n, k) + b_buf.load(m, n, k); }); @@ -1223,7 +1210,7 @@ TEST(Reductions, ReduceInlineReducerInternal) { Reducer minimum(ExprHandle(0.f), [&](ExprHandle a, ExprHandle b) { return Add::make(ExprHandle(1.f), Min::make(a, b, false)); }); - Tensor y = Reduce("y", {{M, "m2"}}, minimum, x, {{N, "n2"}, {K, "k2"}}); + Tensor y = Reduce("y", {M}, minimum, x, {N, K}); PaddedBuffer a_v(M, N, K); PaddedBuffer b_v(M, N, K); @@ -1272,26 +1259,28 @@ TEST(Reductions, ReductionCacheAccessesOperatorAxis) { Tensor c = Compute( "scale", - {{L, "l2"}, {N, "n1"}, {M, "m1"}}, + {L, N, M}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor d = Reduce("sum", {{L, "l1"}}, Sum(), c, {{N, "n1"}, {M, "m1"}}); + Tensor d = Reduce("sum", {L}, Sum(), c, {N, M}); - Tensor e = Compute("scale", {{L, "l"}}, [&](const VarHandle& l) { + Tensor e = Compute("scale", {L}, [&](const VarHandle& l) { return b.load(0, 0, l) * d.load(l); }); LoopNest l({e}, {c, d, e}); LoopNest l_before(l); l_before.prepareForCodegen(); - SimpleIREvaluator cg_before(l_before.root_stmt(), {a, b, e}); + SimpleIREvaluator cg_before( + LoopNest::sanitizeNames(l_before.root_stmt()), {a, b, e}); StmtPtr d_loop = l.getLoopStmtsFor(d)[0]; l.cacheAccesses(d.buf(), "d_local", d_loop); l.prepareForCodegen(); - StmtPtr result = IRSimplifier::simplify(l.root_stmt()); + StmtPtr result = + LoopNest::sanitizeNames(IRSimplifier::simplify(l.root_stmt())); SimpleIREvaluator cg_after(result, {a, b, e}); std::ostringstream oss; @@ -1299,16 +1288,16 @@ TEST(Reductions, ReductionCacheAccessesOperatorAxis) { const std::string& expected_ir = R"IR( #CHECK: Allocate(d_local); // dtype=float, dims=[4] -#CHECK: for (int l1 -#CHECK: d_local[l1] = 0.f -#CHECK: for (int n1 -#CHECK: for (int m1 -#CHECK: d_local[l1] = (d_local[l1]) + (scale[ +#CHECK: for (int i_2 +#CHECK: d_local[i_2] = 0.f +#CHECK: for (int +#CHECK: for (int +#CHECK: d_local[i_2] = (d_local[i_2]) + (scale[ #CHECK: } #CHECK: } #CHECK: } -#CHECK: for (int i -#CHECK: sum[i] = d_local[i] +#CHECK: for (int i_3 +#CHECK: sum[i_3] = d_local[i_3] #CHECK: Free(d_local); #CHECK-NOT: d_local )IR"; @@ -1347,13 +1336,13 @@ TEST(Reductions, ReductionCacheAccessesOuterReduceAxis) { Tensor c = Compute( "scale", - {{L, "l2"}, {N, "n1"}, {M, "m1"}}, + {L, N, M}, [&](const VarHandle& l, const VarHandle& 
n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor d = Reduce("sum", {{L, "l1"}}, Sum(), c, {{N, "n1"}, {M, "m1"}}); + Tensor d = Reduce("sum", {L}, Sum(), c, {N, M}); - Tensor e = Compute("scale", {{L, "l"}}, [&](const VarHandle& l) { + Tensor e = Compute("scale", {L}, [&](const VarHandle& l) { return b.load(0, 0, l) * d.load(l); }); @@ -1366,7 +1355,8 @@ TEST(Reductions, ReductionCacheAccessesOuterReduceAxis) { l.cacheAccesses(d.buf(), "d_local", d_loop); l.prepareForCodegen(); - StmtPtr result = IRSimplifier::simplify(l.root_stmt()); + StmtPtr result = + LoopNest::sanitizeNames(IRSimplifier::simplify(l.root_stmt())); SimpleIREvaluator cg_after(result, {a, b, e}); std::ostringstream oss; @@ -1374,14 +1364,14 @@ TEST(Reductions, ReductionCacheAccessesOuterReduceAxis) { const std::string& expected_ir = R"IR( #CHECK: Allocate(d_local); // dtype=float, dims=[1] -#CHECK: sum[l1] = 0 -#CHECK: d_local[0] = sum[l1] -#CHECK: for (int n1 -#CHECK: for (int m1 +#CHECK: sum[i_1] = 0 +#CHECK: d_local[0] = sum[i_1] +#CHECK: for (int j_1 +#CHECK: for (int k_1 #CHECK: d_local[0] = (d_local[0]) + (scale[ #CHECK: } #CHECK: } -#CHECK: sum[l1] = d_local[0] +#CHECK: sum[i_1] = d_local[0] #CHECK: Free(d_local); #CHECK-NOT: d_local )IR"; @@ -1420,13 +1410,13 @@ TEST(Reductions, ReductionCacheAccessesInnerReduceAxis) { Tensor c = Compute( "scale", - {{L, "l2"}, {N, "n1"}, {M, "m1"}}, + {L, N, M}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor d = Reduce("sum", {{L, "l1"}}, Sum(), c, {{N, "n1"}, {M, "m1"}}); + Tensor d = Reduce("sum", {L}, Sum(), c, {N, M}); - Tensor e = Compute("scale", {{L, "l"}}, [&](const VarHandle& l) { + Tensor e = Compute("scale", {L}, [&](const VarHandle& l) { return b.load(0, 0, l) * d.load(l); }); @@ -1439,7 +1429,8 @@ TEST(Reductions, ReductionCacheAccessesInnerReduceAxis) { l.cacheAccesses(d.buf(), "d_local", d_loop); l.prepareForCodegen(); - StmtPtr result = IRSimplifier::simplify(l.root_stmt()); + StmtPtr result = + LoopNest::sanitizeNames(IRSimplifier::simplify(l.root_stmt())); SimpleIREvaluator cg_after(result, {a, b, e}); std::ostringstream oss; @@ -1447,13 +1438,13 @@ TEST(Reductions, ReductionCacheAccessesInnerReduceAxis) { const std::string& expected_ir = R"IR( #CHECK: Allocate(d_local); // dtype=float, dims=[1] -#CHECK: sum[l1] = 0 -#CHECK: for (int n1 +#CHECK: sum[i_1] = 0 +#CHECK: for (int #CHECK: d_local[0] = 0 -#CHECK: for (int m1 +#CHECK: for (int #CHECK: d_local[0] = (d_local[0]) + (scale[ #CHECK: } -#CHECK: sum[l1] = (sum[l1]) + (d_local[0]) +#CHECK: sum[i_1] = (sum[i_1]) + (d_local[0]) #CHECK: } #CHECK: Free(d_local); #CHECK-NOT: d_local @@ -1489,13 +1480,13 @@ TEST(Reductions, ReductionCacheBodyAccess) { Tensor c = Compute( "scale", - {{24, "l2"}, {32, "n1"}, {12, "m1"}}, + {24, 32, 12}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}}); + Tensor d = Reduce("sum", {24}, Sum(), c, {32, 12}); - Tensor e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) { + Tensor e = Compute("scale", {24}, [&](const VarHandle& l) { return b.load(0, 0, l) * d.load(l); }); @@ -1505,7 +1496,8 @@ TEST(Reductions, ReductionCacheBodyAccess) { l.cacheAccesses(c.buf(), "scale_local", d_loop); l.prepareForCodegen(); - StmtPtr result = IRSimplifier::simplify(l.root_stmt()); + StmtPtr result = + 
LoopNest::sanitizeNames(IRSimplifier::simplify(l.root_stmt())); SimpleIREvaluator cg(result, {a, b, e}); std::ostringstream oss; @@ -1513,11 +1505,11 @@ TEST(Reductions, ReductionCacheBodyAccess) { const std::string& expected_ir = R"IR( #CHECK: Allocate(scale_local); // dtype=float, dims=[1, 32, 12] -#CHECK: for (int j = 0; j < 32; j++) { -#CHECK: for (int k = 0; k < 12; k++) { -#CHECK: scale_local[k + 12 * j] = scale[(k + 12 * j) + 384 * l1]; -#CHECK: sum[l1] = (sum[l1]) + (scale_local[m1_1 + 12 * n1_1]); -#CHECK: scale_1[l] = (b[l]) * (sum[l]); +#CHECK: for (int j_1 = 0; j_1 < 32; j_1++) { +#CHECK: for (int k_1 = 0; k_1 < 12; k_1++) { +#CHECK: scale_local[k_1 + 12 * j_1] = scale[(k_1 + 12 * j_1) + 384 * i_1]; +#CHECK: sum[i_1] = (sum[i_1]) + (scale_local[k_2 + 12 * j_2]); +#CHECK: scale_1[i_2] = (b[i_2]) * (sum[i_2]); #CHECK: Free(scale_local); )IR"; torch::jit::testing::FileCheck().run(expected_ir, oss.str()); @@ -1529,13 +1521,13 @@ TEST(Reductions, ReductionCacheConsumerAccess) { Tensor c = Compute( "scale", - {{24, "l2"}, {32, "n1"}, {12, "m1"}}, + {24, 32, 12}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}}); + Tensor d = Reduce("sum", {24}, Sum(), c, {32, 12}); - Tensor e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) { + Tensor e = Compute("scale", {24}, [&](const VarHandle& l) { return b.load(0, 0, l) * d.load(l); }); @@ -1547,7 +1539,8 @@ TEST(Reductions, ReductionCacheConsumerAccess) { l.cacheAccesses(d.buf(), "sum_local", e_loop); l.prepareForCodegen(); - StmtPtr result = IRSimplifier::simplify(l.root_stmt()); + StmtPtr result = + LoopNest::sanitizeNames(IRSimplifier::simplify(l.root_stmt())); SimpleIREvaluator cg(result, {a, b, e}); std::ostringstream oss; @@ -1555,10 +1548,10 @@ TEST(Reductions, ReductionCacheConsumerAccess) { const std::string& expected_ir = R"IR( #CHECK: Alias(sum_local,scale); -#CHECK: sum[l1] = (sum[l1]) + (scale[ -#CHECK: for (int i = 0; i < 4 -#CHECK: sum_local[i] = sum[i + 4 * l_outer]; -#CHECK: scale_1[l_inner + 4 * l_outer] = (b[l_inner + 4 * l_outer]) * (sum_local[l_inner]); +#CHECK: sum[i_1] = (sum[i_1]) + (scale[ +#CHECK: for (int j_2 = 0; j_2 < 4 +#CHECK: sum_local[j_2] = sum[j_2 + 4 * i_2]; +#CHECK: scale_1[j_3 + 4 * i_2] = (b[j_3 + 4 * i_2]) * (sum_local[j_3]); )IR"; torch::jit::testing::FileCheck().run(expected_ir, oss.str()); } @@ -1569,13 +1562,13 @@ TEST(Reductions, ReductionSplitCacheConsumerAccess) { Tensor c = Compute( "scale", - {{24, "l2"}, {32, "n1"}, {12, "m1"}}, + {24, 32, 12}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}}); + Tensor d = Reduce("sum", {24}, Sum(), c, {32, 12}); - Tensor e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) { + Tensor e = Compute("scale", {24}, [&](const VarHandle& l) { return b.load(0, 0, l) * d.load(l); }); @@ -1593,7 +1586,8 @@ TEST(Reductions, ReductionSplitCacheConsumerAccess) { l.cacheAccesses(d.buf(), "sum_local", inner); l.prepareForCodegen(); - StmtPtr result = IRSimplifier::simplify(l.root_stmt()); + StmtPtr result = + LoopNest::sanitizeNames(IRSimplifier::simplify(l.root_stmt())); SimpleIREvaluator cg(result, {a, b, e}); // reduction changes but cache does not. 
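// That is, relative to ReductionCacheConsumerAccess above, splitting the
// output axis of "sum" shows up in the reduction body as the store
// sum[j_1 + 4 * i_1], while the consumer-side cache created by
// l.cacheAccesses(d.buf(), "sum_local", inner) is untouched: each of the 6
// outer iterations of "scale_1" still stages 4 values of sum into sum_local
// (covering all 24 outputs) before consuming them, as the expected IR in the
// next hunk shows.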
@@ -1602,10 +1596,12 @@ TEST(Reductions, ReductionSplitCacheConsumerAccess) { const std::string& expected_ir = R"IR( #CHECK: Alias(sum_local,scale); -#CHECK: sum[l1_inner + 4 * l1_outer] = (sum[l1_inner + 4 * l1_outer]) + (scale[((m1_1 + 12 * n1_1) + 1536 * l1_outer) + 384 * l1_inner]); -#CHECK: for (int i = 0; i < 4 -#CHECK: sum_local[i] = sum[i + 4 * l_outer]; -#CHECK: scale_1[l_inner + 4 * l_outer] = (b[l_inner + 4 * l_outer]) * (sum_local[l_inner]); +#CHECK: sum[j_1 + 4 * i_1] = (sum[j_1 + 4 * i_1]) + (scale[((l + 12 * k_1) + 1536 * i_1) + 384 * j_1]); +#CHECK: for (int i_2 = 0; i_2 < 6 +#CHECK: for (int j_2 = 0; j_2 < 4 +#CHECK: sum_local[j_2] = sum[j_2 + 4 * i_2]; +#CHECK: for (int j_3 = 0; j_3 < 4 +#CHECK: scale_1[j_3 + 4 * i_2] = (b[j_3 + 4 * i_2]) * (sum_local[j_3]); )IR"; torch::jit::testing::FileCheck().run(expected_ir, oss.str()); } @@ -1616,13 +1612,13 @@ TEST(Reductions, ReductionReorderCacheConsumerAccess) { Tensor c = Compute( "scale", - {{24, "l2"}, {32, "n1"}, {12, "m1"}}, + {24, 32, 12}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}}); + Tensor d = Reduce("sum", {24}, Sum(), c, {32, 12}); - Tensor e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) { + Tensor e = Compute("scale", {24}, [&](const VarHandle& l) { return b.load(0, 0, l) * d.load(l); }); @@ -1641,7 +1637,8 @@ TEST(Reductions, ReductionReorderCacheConsumerAccess) { l.cacheAccesses(d.buf(), "sum_local", inner); l.prepareForCodegen(); - StmtPtr result = IRSimplifier::simplify(l.root_stmt()); + StmtPtr result = + LoopNest::sanitizeNames(IRSimplifier::simplify(l.root_stmt())); SimpleIREvaluator cg(result, {a, b, e}); // neither reduction body not cache changes. 
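For the consumer-cache variants above, the interesting call is cacheAccesses on the reduction buffer: NNC allocates a small local buffer ("sum_local"), fills it from "sum" once per outer iteration, and rewrites the consumer to read the local copy, which is what the updated CHECK lines assert (a fill loop over j_2 followed by a use loop over j_3 once the consumer loop is split). Below is a sketch of that schedule, assuming the same headers as the sketch above; the LoopNest constructor and the way the consumer loop is fetched are assumptions, since that part of the test lies outside these hunks, while the scheduling calls mirror the diff.

void cacheConsumerSketch() {
  BufHandle a("a", {24, 32, 12}, kFloat);
  BufHandle b("b", {24, 32, 12}, kFloat);
  Tensor c = Compute(
      "scale", {24, 32, 12},
      [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) {
        return b.load(l, n, m) * a.load(l, n, m);
      });
  Tensor d = Reduce("sum", {24}, Sum(), c, {32, 12});
  Tensor e = Compute("scale", {24}, [&](const VarHandle& l) {
    return b.load(0, 0, l) * d.load(l);
  });
  // Assumed constructor form: outputs first, then every tensor to compute.
  LoopNest l({e}, {c, d, e});
  // Assumed way to grab the loop that consumes d; the tests pick this loop
  // (or its split inner loop) around the second "scale" computation.
  auto e_loop = l.getLoopStmtsFor(e)[0];
  // Cache reads of the reduction result inside the consumer loop.
  l.cacheAccesses(d.buf(), "sum_local", e_loop);
  l.prepareForCodegen();
  StmtPtr result =
      LoopNest::sanitizeNames(IRSimplifier::simplify(l.root_stmt()));
  std::cout << *result << std::endl;
}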
@@ -1649,10 +1646,12 @@ TEST(Reductions, ReductionReorderCacheConsumerAccess) { oss << *cg.stmt(); const std::string& expected_ir = R"IR( -#CHECK: sum[l1] = (sum[l1]) + (scale[(m1_1 + 12 * n1_1) + 384 * l1]); -#CHECK: for (int i = 0; i < 4 -#CHECK: sum_local[i] = sum[i + 4 * l_outer]; -#CHECK: scale_1[l_inner + 4 * l_outer] = (b[l_inner + 4 * l_outer]) * (sum_local[l_inner]); +#CHECK: sum[j_1] = (sum[j_1]) + (scale[(k_1 + 12 * i_2) + 384 * j_1]); +#CHECK: for (int i_3 = 0; i_3 < 6; +#CHECK: for (int j_2 = 0; j_2 < 4; +#CHECK: sum_local[j_2] = sum[j_2 + 4 * i_3]; +#CHECK: for (int j_3 = 0; j_3 < 4; +#CHECK: scale_1[j_3 + 4 * i_3] = (b[j_3 + 4 * i_3]) * (sum_local[j_3]); )IR"; torch::jit::testing::FileCheck().run(expected_ir, oss.str()); } @@ -1673,7 +1672,7 @@ TEST(Reductions, ReductionRfactorCacheTempOuter) { std::vector out(1, -1.f); - Tensor c = Reduce("sum", {}, Sum(), b, {{m, "a"}, {n, "b"}, {k, "c"}}); + Tensor c = Reduce("sum", {}, Sum(), b, {m, n, k}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); @@ -1693,7 +1692,7 @@ TEST(Reductions, ReductionRfactorCacheTempOuter) { LoopNest::cacheAccesses(rfac_buf, "tmp", all_loops[1][1]); loop.simplify(); loop.prepareForCodegen(); - StmtPtr s = loop.root_stmt(); + StmtPtr s = LoopNest::sanitizeNames(loop.root_stmt()); SimpleIREvaluator cg(s, {b, c, m, n, k}); std::ostringstream oss; @@ -1702,17 +1701,17 @@ TEST(Reductions, ReductionRfactorCacheTempOuter) { R"IR( #CHECK: Allocate(sum_rfac); // dtype=float, dims=[n] #CHECK: Allocate(tmp); // dtype=float, dims=[n] -#CHECK: for (int a = 0; a < m -#CHECK: for (int i = 0; i < n -#CHECK: tmp[i] = 0 +#CHECK: for (int i_1 = 0; i_1 < m +#CHECK: for (int j = 0; j < n +#CHECK: tmp[j] = 0 #CHECK: } -#CHECK: for (int b = 0; b < n -#CHECK: for (int c -#CHECK: tmp[b] = (tmp[b]) + (B[ +#CHECK: for (int j_1 = 0; j_1 < n +#CHECK: for (int k +#CHECK: tmp[j_1] = (tmp[j_1]) + (B[ #CHECK: } #CHECK: } -#CHECK: for (int i = 0; i < n -#CHECK: sum_rfac[i] = (sum_rfac[i]) + (tmp[i]); +#CHECK: for (int j_2 = 0; j_2 < n +#CHECK: sum_rfac[j_2] = (sum_rfac[j_2]) + (tmp[j_2]); #CHECK: } #CHECK: Free(tmp); #CHECK-NOT: tmp @@ -1739,7 +1738,7 @@ TEST(Reductions, ReductionRfactorCacheTempInner) { std::vector out(1, -1.f); - Tensor c = Reduce("sum", {}, Sum(), b, {{m, "a"}, {n, "b"}, {k, "c"}}); + Tensor c = Reduce("sum", {}, Sum(), b, {m, n, k}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); auto c_body = loop.getAllWritesToBuf(c.buf())[1]; @@ -1759,7 +1758,7 @@ TEST(Reductions, ReductionRfactorCacheTempInner) { LoopNest::cacheAccesses(rfac_buf, "tmp", all_loops[1][2]); loop.prepareForCodegen(); loop.simplify(); - StmtPtr s = loop.root_stmt(); + StmtPtr s = LoopNest::sanitizeNames(loop.root_stmt()); SimpleIREvaluator cg(s, {b, c, m, n, k}); std::ostringstream oss; @@ -1768,13 +1767,13 @@ TEST(Reductions, ReductionRfactorCacheTempInner) { R"IR( #CHECK: Allocate(sum_rfac); // dtype=float, dims=[n] #CHECK: Allocate(tmp); // dtype=float, dims=[1] -#CHECK: for (int a = 0; a < m -#CHECK: for (int b = 0; b < n +#CHECK: for (int i_1 = 0; i_1 < m +#CHECK: for (int j = 0; j < n #CHECK: tmp[0] = 0 -#CHECK: for (int c +#CHECK: for (int k #CHECK: tmp[0] = (tmp[0]) + (B[ #CHECK: } -#CHECK: sum_rfac[b] = (sum_rfac[b]) + (tmp[0]); +#CHECK: sum_rfac[j] = (sum_rfac[j]) + (tmp[0]); #CHECK: Free(tmp); #CHECK-NOT: tmp )IR"; @@ -1796,7 +1795,7 @@ TEST(Reductions, ReductionVectorize) { BufHandle in("in", {8, 8}, kFloat); - Tensor tensor = Reduce("sum", {{8, "m"}}, Sum(), in, {{8, "n"}}); + Tensor tensor = 
Reduce("sum", {8}, Sum(), in, {8}); LoopNest l_before({tensor}); LoopNest l(l_before); l_before.prepareForCodegen(); @@ -1806,15 +1805,15 @@ TEST(Reductions, ReductionVectorize) { ASSERT_TRUE(LoopNest::vectorize(l.getLoopStmtsFor(tensor)[0])); StmtPtr s = l.root_stmt(); - s = IRSimplifier::simplify(s); + s = LoopNest::sanitizeNames(IRSimplifier::simplify(s)); std::ostringstream oss; oss << *s; const std::string& expected_ir = R"IR( #CHECK: sum[Ramp(0, 1, 8)] = Broadcast(0.f, 8); -#CHECK: for (int n = 0; n < 8; n++) { -#CHECK: sum[Ramp(0, 1, 8)] = ReduceOp((sum[Ramp(0, 1, 8)]) + (in[Ramp(n, 8, 8)]), reduce_args={n}); +#CHECK: for (int i = 0; i < 8; i++) { +#CHECK: sum[Ramp(0, 1, 8)] = ReduceOp((sum[Ramp(0, 1, 8)]) + (in[Ramp(i, 8, 8)]), reduce_args={i}); #CHECK: } )IR"; torch::jit::testing::FileCheck().run(expected_ir, oss.str()); @@ -1832,7 +1831,7 @@ TEST(Reductions, ReductionVectorize) { TEST(Reductions, ReductionVectorizeInner) { BufHandle in("in", {8, 8}, kFloat); - Tensor tensor = Reduce("sum", {{8, "m"}}, Sum(), in, {{8, "n"}}); + Tensor tensor = Reduce("sum", {8}, Sum(), in, {8}); LoopNest l({tensor}); ASSERT_FALSE(LoopNest::vectorize(l.getLoopStmtsFor(tensor)[1])); @@ -1850,7 +1849,7 @@ TEST(Reductions, ReductionVectorizeRfactor) { BufHandle in("in", {8, 8}, kFloat); - Tensor tensor = Reduce("sum", {}, Sum(), in, {{8, "m"}, {8, "n"}}); + Tensor tensor = Reduce("sum", {}, Sum(), in, {8, 8}); LoopNest l_before({tensor}); LoopNest l(l_before); @@ -1875,21 +1874,21 @@ TEST(Reductions, ReductionVectorizeRfactor) { ASSERT_TRUE(LoopNest::vectorize(rfac_loops[1][0])); l.simplify(); - StmtPtr s = l.root_stmt(); + StmtPtr s = LoopNest::sanitizeNames(l.root_stmt()); std::ostringstream oss; oss << *s; const std::string& expected_ir = R"IR( #CHECK: sum = 0.f; -#CHECK: for (int n = 0; n < 8; n++) { -#CHECK: sum_rfac[n] = 0.f; +#CHECK: for (int i = 0; i < 8; i++) { +#CHECK: sum_rfac[i] = 0.f; #CHECK: } -#CHECK: for (int m = 0; m < 8; m++) { -#CHECK: sum_rfac[Ramp(0, 1, 8)] = ReduceOp((sum_rfac[Ramp(0, 1, 8)]) + (in[Ramp(8 * m, 1, 8)]), reduce_args={m}); +#CHECK: for (int i_1 = 0; i_1 < 8; i_1++) { +#CHECK: sum_rfac[Ramp(0, 1, 8)] = ReduceOp((sum_rfac[Ramp(0, 1, 8)]) + (in[Ramp(8 * i_1, 1, 8)]), reduce_args={i_1}); #CHECK: } -#CHECK: for (int n = 0; n < 8; n++) { -#CHECK: sum = ReduceOp((sum) + (sum_rfac[n]), reduce_args={n}); +#CHECK: for (int i_2 = 0; i_2 < 8; i_2++) { +#CHECK: sum = ReduceOp((sum) + (sum_rfac[i_2]), reduce_args={i_2}); #CHECK: } )IR"; torch::jit::testing::FileCheck().run(expected_ir, oss.str()); @@ -1910,22 +1909,22 @@ TEST(Reductions, InitFunction) { BufHandle B("B", {N}, kFloat); Tensor C = Reduce( "C", - {{N, "n"}}, + {N}, Sum(), [&](const std::vector& v) { return B.load(v[0]); }, [&](const std::vector& v) { return A.load(v[1], v[0]); }, - {{M, "m"}}); + {M}); LoopNest nest({C}); nest.prepareForCodegen(); - StmtPtr s = IRSimplifier::simplify(nest.root_stmt()); + StmtPtr s = LoopNest::sanitizeNames(IRSimplifier::simplify(nest.root_stmt())); std::ostringstream oss; oss << *s << "\n"; const std::string& expected_ir = R"IR( -#CHECK: for (int n = 0; n < 16; n++) { -#CHECK: C[n] = B[n]; -#CHECK: for (int m = 0; m < 32; m++) { -#CHECK: C[n] = (C[n]) + (A[n + 16 * m]); +#CHECK: for (int i = 0; i < 16; i++) { +#CHECK: C[i] = B[i]; +#CHECK: for (int j = 0; j < 32; j++) { +#CHECK: C[i] = (C[i]) + (A[i + 16 * j]); #CHECK: } #CHECK: } )IR"; diff --git a/test/cpp/tensorexpr/test_simplify.cpp b/test/cpp/tensorexpr/test_simplify.cpp index 21f85ce160af..2a4322a64f9c 100644 --- 
a/test/cpp/tensorexpr/test_simplify.cpp +++ b/test/cpp/tensorexpr/test_simplify.cpp @@ -3858,26 +3858,25 @@ TEST(Simplify, SimplifyForCleansUp) { BufHandle a("a", {1, 12, 1}, kFloat); VarHandle x("x", kInt); Tensor b = Compute( - // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) "x", - {{1, "i"}, {12, "m"}, {1, "n"}}, + {1, 12, 1}, [](const VarHandle& i, const VarHandle& m, const VarHandle& n) { return i + m + n; }); LoopNest l({b}); l.prepareForCodegen(); - StmtPtr body = l.root_stmt(); + StmtPtr body = LoopNest::sanitizeNames(l.root_stmt()); StmtPtr simplified = IRSimplifier::simplify(body); BlockPtr block = to(simplified); IS_NODE_WITH_NAME(For, block->front(), for_); // for is over "m". - IS_VAR_WITH_NAME(for_->var(), "m"); + IS_VAR_WITH_NAME(for_->var(), "j"); // x[m] = m; IS_NODE_WITH_NAME(Store, for_->body()->front(), store); - IS_VAR_WITH_NAME(store->flat_index(), "m"); - IS_VAR_WITH_NAME(store->value(), "m"); + IS_VAR_WITH_NAME(store->flat_index(), "j"); + IS_VAR_WITH_NAME(store->value(), "j"); } } @@ -4118,7 +4117,7 @@ TEST(Simplify, SimplifyReorderForCond) { 0, 4, Cond::make( - CompareSelect::make(i, 10, CompareSelectOperation::kLT), + CompareSelect::make(i, 2, CompareSelectOperation::kEQ), Store::make(c, {i}, Load::make(a, {i})), nullptr)); @@ -4235,7 +4234,7 @@ TEST(Simplify, SimplifyReorderForCond) { CompareSelect::make( Load::make(a, {0}), 10, CompareSelectOperation::kLT), Cond::make( - CompareSelect::make(i, 10, CompareSelectOperation::kEQ), + CompareSelect::make(i, 3, CompareSelectOperation::kEQ), Store::make(c, {0}, Load::make(a, {i})), nullptr), nullptr)); @@ -4825,7 +4824,739 @@ TEST(Simplify, SimplifyBroadcastTermExpander) { } } -TEST(Simplify, DISABLED_CompareSelectCondAlwaysInLoopBounds) { +TEST(Simplify, CompareSelectLoopBounds) { + constexpr int N = 8; + BufHandle b("b", {N}, kFloat); + VarHandle n("n", kInt); + VarHandle m("m", kInt); + VarHandle var_N("var_N", kInt); + VarHandle var_M("var_M", kInt); + + auto test_case_fn = [](const VarHandle& n, + const BufHandle& b, + const ExprHandle& start, + const ExprHandle& stop, + const int& cmp_val, + const CompareSelectOperation& cmp_op, + const std::string& check_string) { + StmtPtr s = For::make( + n, + start, + stop, + b.store({n}, CompareSelect::make(n, cmp_val, 0.f, 1.0f, cmp_op))); + s = IRSimplifier::simplify(s); + std::ostringstream oss; + oss << *s; + std::string target_string = "# CHECK: "; + target_string += check_string; + torch::jit::testing::FileCheck().run(target_string, oss.str()); + }; + + auto test_case_nest_loops_fn = [](const VarHandle& n, + const VarHandle& m, + const BufHandle& b, + const ExprHandle& n_start, + const ExprHandle& n_stop, + const ExprHandle& m_start, + const ExprHandle& m_stop, + const CompareSelectOperation& cmp_op, + const std::string& check_string) { + StmtPtr s = For::make( + m, + m_start, + m_stop, + b.store({n, m}, CompareSelect::make(n, m, 0.f, 1.0f, cmp_op))); + StmtPtr root_s = For::make(n, n_start, n_stop, s); + root_s = IRSimplifier::simplify(root_s); + std::ostringstream oss; + oss << *root_s; + std::string target_string = "# CHECK: "; + target_string += check_string; + torch::jit::testing::FileCheck().run(target_string, oss.str()); + }; + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n < 1 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = 1.f; + // } + test_case_fn(n, b, 1, N, 1, kLT, "b[n] = 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n <= 1 ? 
0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n <= 1 ? 0.f : 1.f; + // } + test_case_fn(n, b, 1, N, 1, kLE, "b[n] = n<=1 ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n <= 0 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = 1.f; + // } + test_case_fn(n, b, 1, N, 0, kLE, "b[n] = 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n < 0 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = 1.f; + // } + test_case_fn(n, b, 1, N, 0, kLT, "b[n] = 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n < 8 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = 0.f; + // } + test_case_fn(n, b, 1, N, N, kLT, "b[n] = 0.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n <= 7 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = 0.f; + // } + test_case_fn(n, b, 1, N, N - 1, kLE, "b[n] = 0.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n <= 8 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = 0.f; + // } + test_case_fn(n, b, 1, N, N, kLE, "b[n] = 0.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n < 7 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n < 7 ? 0.f : 1.f; + // } + test_case_fn(n, b, 1, N, N - 1, kLT, "b[n] = n<7 ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n > 0 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = 0.f; + // } + test_case_fn(n, b, 1, N, 0, kGT, "b[n] = 0.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n > 1 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n > 1 ? 0.f : 1.f; + // } + test_case_fn(n, b, 1, N, 1, kGT, "b[n] = n>1 ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n >= 1 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = 0.f; + // } + test_case_fn(n, b, 1, N, 1, kGE, "b[n] = 0.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n > 7 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = 1.f; + // } + test_case_fn(n, b, 1, N, N - 1, kGT, "b[n] = 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n >= 7 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n >= 7 ? 0.f : 1.f; + // } + test_case_fn(n, b, 1, N, N - 1, kGE, "b[n] = n>=7 ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n > 5 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n > 5 ? 0.f : 1.f; + // } + test_case_fn(n, b, 1, N, 5, kGT, "b[n] = n>5 ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n >= 5 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n >= 5 ? 0.f : 1.f; + // } + test_case_fn(n, b, 1, N, 5, kGE, "b[n] = n>=5 ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n > 8 ? 
0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = 1.f; + // } + test_case_fn(n, b, 1, N, N, kGT, "b[n] = 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n >= 8 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = 1.f; + // } + test_case_fn(n, b, 1, N, N, kGE, "b[n] = 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, 2)) { + // b[n] = n == 1 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, 2)) { + // b[1] = 0.f; + // } + test_case_fn(n, b, 1, 2, 1, kEQ, "b[1] = 0.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n == 1 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n == 1 ? 0.f : 1.f; + // } + test_case_fn(n, b, 1, N, 1, kEQ, "b[n] = n==1 ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n == 0 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = 1.f; + // } + test_case_fn(n, b, 1, N, 0, kEQ, "b[n] = 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n == 7 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n == 7 ? 0.f : 1.f; + // } + test_case_fn(n, b, 1, N, N - 1, kEQ, "b[n] = n==7 ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n == 8 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = 1.f; + // } + test_case_fn(n, b, 1, N, N, kEQ, "b[n] = 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n != 1 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n != 1 ? 0.f : 1.f; + // } + test_case_fn(n, b, 1, N, 1, kNE, "b[n] = n!=1 ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n != 7 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n != 7 ? 0.f : 1.f; + // } + test_case_fn(n, b, 1, N, N - 1, kNE, "b[n] = n!=7 ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n != 5 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n != 5 ? 0.f : 1.f; + // } + test_case_fn(n, b, 1, N, 5, kNE, "b[n] = n!=5 ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n != 0 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = 0.f; + // } + test_case_fn(n, b, 1, N, 0, kNE, "b[n] = 0.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n != 8 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = 0.f; + // } + test_case_fn(n, b, 1, N, N, kNE, "b[n] = 0.f;"); + + // Before: + // for (const auto n : c10::irange(10, 20)) { + // for(const auto m : c10::irange(30, 40)) { + // b[n, m] = (n != m) ? 0.f : 1.f; + // } + // } + // After: + // for (const auto n : c10::irange(10, 20)) { + // for(const auto m : c10::irange(30, 40)) { + // b[n, m] = 0.f; + // } + // } + test_case_nest_loops_fn(n, m, b, 10, 20, 30, 40, kNE, "b[n, m] = 0.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 10, + var_N + 20, + var_N + 30, + var_N + 40, + kNE, + "b[n, m] = 0.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 10, + var_N + 20, + var_M + 30, + var_M + 40, + kNE, + "b[n, m] = n!=m ? 
0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(30, 40)) { + // for(const auto m : c10::irange(10, 20)) { + // b[n, m] = (n != m) ? 0.f : 1.f; + // } + // } + // After: + // for (const auto n : c10::irange(30, 40)) { + // for(const auto m : c10::irange(10, 20)) { + // b[n, m] = 0.f; + // } + // } + test_case_nest_loops_fn(n, m, b, 30, 40, 10, 20, kNE, "b[n, m] = 0.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 30, + var_N + 40, + var_N + 10, + var_N + 20, + kNE, + "b[n, m] = 0.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 30, + var_N + 40, + var_M + 10, + var_M + 20, + kNE, + "b[n, m] = n!=m ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(30, 40)) { + // for(const auto m : c10::irange(10, 31)) { + // b[n, m] = (n != m) ? 0.f : 1.f; + // } + // } + // After: + // for (const auto n : c10::irange(30, 40)) { + // for(const auto m : c10::irange(10, 31)) { + // b[n, m] = (n != m) ? 0.f : 1.f; + // } + // } + test_case_nest_loops_fn( + n, m, b, 30, 40, 10, 31, kNE, "b[n, m] = n!=m ? 0.f : 1.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 30, + var_N + 40, + var_N + 10, + var_N + 31, + kNE, + "b[n, m] = n!=m ? 0.f : 1.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 30, + var_N + 40, + var_M + 10, + var_M + 31, + kNE, + "b[n, m] = n!=m ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(10, 31)) { + // for(const auto m : c10::irange(30, 40)) { + // b[n, m] = (n != m) ? 0.f : 1.f; + // } + // } + // After: + // for (const auto n : c10::irange(10, 31)) { + // for(const auto m : c10::irange(30, 40)) { + // b[n, m] = (n != m) ? 0.f : 1.f; + // } + // } + test_case_nest_loops_fn( + n, m, b, 10, 31, 30, 40, kNE, "b[n, m] = n!=m ? 0.f : 1.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 10, + var_N + 31, + var_N + 30, + var_N + 40, + kNE, + "b[n, m] = n!=m ? 0.f : 1.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 10, + var_N + 31, + var_M + 30, + var_M + 40, + kNE, + "b[n, m] = n!=m ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(10, 20)) { + // for(const auto m : c10::irange(30, 40)) { + // b[n, m] = (n < m) ? 0.f : 1.f; + // } + // } + // After: + // for (const auto n : c10::irange(10, 20)) { + // for(const auto m : c10::irange(30, 40)) { + // b[n, m] = 0.f; + // } + // } + test_case_nest_loops_fn(n, m, b, 10, 20, 30, 40, kLT, "b[n, m] = 0.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 10, + var_N + 20, + var_N + 30, + var_N + 40, + kLT, + "b[n, m] = 0.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 10, + var_N + 20, + var_M + 30, + var_M + 40, + kLT, + "b[n, m] = n m) ? 0.f : 1.f; + // } + // } + // After: + // for (const auto n : c10::irange(30, 40)) { + // for(const auto m : c10::irange(10, 20)) { + // b[n, m] = 0.f; + // } + // } + test_case_nest_loops_fn(n, m, b, 30, 40, 10, 20, kGT, "b[n, m] = 0.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 30, + var_N + 40, + var_N + 10, + var_N + 20, + kGT, + "b[n, m] = 0.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 30, + var_N + 40, + var_M + 10, + var_M + 20, + kGT, + "b[n, m] = n>m ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(10, 31)) { + // for(const auto m : c10::irange(30, 40)) { + // b[n, m] = (n > m) ? 
0.f : 1.f; + // } + // } + // After: + // for (const auto n : c10::irange(10, 31)) { + // for(const auto m : c10::irange(30, 40)) { + // b[n, m] = 1.f; + // } + // } + test_case_nest_loops_fn(n, m, b, 10, 31, 30, 40, kGT, "b[n, m] = 1.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 10, + var_N + 31, + var_N + 30, + var_N + 40, + kGT, + "b[n, m] = 1.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 10, + var_N + 31, + var_M + 30, + var_M + 40, + kGT, + "b[n, m] = n>m ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(30, 40)) { + // for(const auto m : c10::irange(10, 31)) { + // b[n, m] = (n >= m) ? 0.f : 1.f; + // } + // } + // After: + // for (const auto n : c10::irange(30, 40)) { + // for(const auto m : c10::irange(10, 31)) { + // b[n, m] = 0.f; + // } + // } + test_case_nest_loops_fn(n, m, b, 30, 40, 10, 31, kGE, "b[n, m] = 0.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 30, + var_N + 40, + var_N + 10, + var_N + 31, + kGE, + "b[n, m] = 0.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 30, + var_N + 40, + var_M + 10, + var_M + 31, + kGE, + "b[n, m] = n>=m ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(10, 20)) { + // for(const auto m : c10::irange(30, 40)) { + // b[n, m] = (n >= m) ? 0.f : 1.f; + // } + // } + // After: + // for (const auto n : c10::irange(10, 20)) { + // for(const auto m : c10::irange(30, 40)) { + // b[n, m] = 1.f; + // } + // } + test_case_nest_loops_fn(n, m, b, 10, 20, 30, 40, kGE, "b[n, m] = 1.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 10, + var_N + 20, + var_N + 30, + var_N + 40, + kGE, + "b[n, m] = 1.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 10, + var_N + 20, + var_M + 30, + var_M + 40, + kGE, + "b[n, m] = n>=m ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(10, 31)) { + // for(const auto m : c10::irange(30, 40)) { + // b[n, m] = (n <= m) ? 0.f : 1.f; + // } + // } + // After: + // for (const auto n : c10::irange(10, 31)) { + // for(const auto m : c10::irange(30, 40)) { + // b[n, m] = 0.f; + // } + // } + test_case_nest_loops_fn(n, m, b, 10, 31, 30, 40, kLE, "b[n, m] = 0.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 10, + var_N + 31, + var_N + 30, + var_N + 40, + kLE, + "b[n, m] = 0.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 10, + var_N + 31, + var_M + 30, + var_M + 40, + kLE, + "b[n, m] = n<=m ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(30, 40)) { + // for(const auto m : c10::irange(10, 20)) { + // b[n, m] = (n <= m) ? 0.f : 1.f; + // } + // } + // After: + // for (const auto n : c10::irange(30, 40)) { + // for(const auto m : c10::irange(10, 20)) { + // b[n, m] = 0.f; + // } + // } + test_case_nest_loops_fn(n, m, b, 30, 40, 10, 20, kLE, "b[n, m] = 1.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 30, + var_N + 40, + var_N + 10, + var_N + 20, + kLE, + "b[n, m] = 1.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 30, + var_N + 40, + var_M + 10, + var_M + 20, + kLE, + "b[n, m] = n<=m ? 0.f : 1.f;"); +} + +TEST(Simplify, CompareSelectCondAlwaysInLoopBounds) { // Before: // for (const auto n : c10::irange(1, N)) { // b[n] = n < 1 ? 0.f : 1.f; @@ -4849,7 +5580,7 @@ TEST(Simplify, DISABLED_CompareSelectCondAlwaysInLoopBounds) { oss.str()); } -TEST(Simplify, DISABLED_IfThenCondAlwaysInLoopBounds) { +TEST(Simplify, IfThenCondAlwaysInLoopBounds) { // Before: // for (const auto n : c10::irange(1, N)) { // b[n] = IfThenElse(n < 1 ? 
1 : 0, 0.f, 1.f); @@ -4873,7 +5604,7 @@ TEST(Simplify, DISABLED_IfThenCondAlwaysInLoopBounds) { oss.str()); } -TEST(Simplify, DISABLED_MultiClauseCondAlwaysInLoopBounds) { +TEST(Simplify, MultiClauseCondAlwaysInLoopBounds) { // This test mimics the unpadded region of a conv2d. We want to remove any // conditional that is provably satisfied (or unsatisfied) by the entire loop // range. @@ -4902,7 +5633,7 @@ TEST(Simplify, DISABLED_MultiClauseCondAlwaysInLoopBounds) { oss << *s; torch::jit::testing::FileCheck().run( R"IR( -# CHECK: b[n] = 1.f; +# CHECK: b[i, j] = 1.f; )IR", oss.str()); } diff --git a/test/cpp/tensorexpr/test_te_fuser_pass.cpp b/test/cpp/tensorexpr/test_te_fuser_pass.cpp index d3e91784fb56..56535de914e4 100644 --- a/test/cpp/tensorexpr/test_te_fuser_pass.cpp +++ b/test/cpp/tensorexpr/test_te_fuser_pass.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -350,5 +351,52 @@ TEST(TEFuserPass, FuserPass_WhereList) { testing::FileCheck().check_not("prim::TensorExprGroup")->run(*g); } +TEST(TEFuserPass, DynamicShapeFusion) { + WithCPUFuser cf; + const auto graph_string = R"IR( + graph(%0 : Float(10, 5, strides=[5, 1], device=cpu), + %1 : Float(10, 5, strides=[5, 1], device=cpu)): + %2 : Float(10, 5, strides=[5, 1], device=cpu) = aten::mul(%0, %1) + %3 : Float(10, 5, strides=[5, 1], device=cpu) = aten::mul(%2, %1) + return (%3))IR"; + auto g = std::make_shared(); + torch::jit::parseIR(graph_string, g.get()); + + g->lint(); + FuseTensorExprs( + g, + /* min_group_size = */ 2, + /* add_composed_op = */ true, + /* fuse_to_dynamic_shapes = */ true); + Code code(g, ""); + + testing::FileCheck() + .check("prim::TensorExprDynamicGroup_") + ->check("prim::TensorExprDynamicGuard") + ->check("prim::TensorExprGroup_") + ->run(*g); + + auto run_and_compare = [&](const std::vector& inputs) { + TORCH_INTERNAL_ASSERT(inputs.size() == 2); + + auto ref = at::mul(at::mul(inputs[0], inputs[1]), inputs[1]); + + InterpreterState interp(code); + Stack stack(inputs.begin(), inputs.end()); + interp.run(stack); + at::Tensor out = pop(stack).toTensor(); + ASSERT_TRUE(at::allclose(out, ref)); + }; + + std::vector inputs = {at::rand({10, 5}), at::rand({10, 5})}; + run_and_compare(inputs); + + std::vector inputs2 = {at::rand({20, 5}), at::rand({20, 5})}; + run_and_compare(inputs2); + + std::vector inputs3 = {at::rand({25, 60}), at::rand({25, 60})}; + run_and_compare(inputs3); +} + } // namespace jit } // namespace torch diff --git a/test/cpp/tensorexpr/test_type_specializations.cpp b/test/cpp/tensorexpr/test_type_specializations.cpp new file mode 100644 index 000000000000..5d2e9462e4aa --- /dev/null +++ b/test/cpp/tensorexpr/test_type_specializations.cpp @@ -0,0 +1,75 @@ +#include + +#include +#include +#include +#include +#include +#include +#include + +// Test that tensor type specializations are availabie in +// the custom passes + +namespace torch { +namespace jit { + +namespace { + +bool hasTensorTypeSpecializations(torch::jit::Block* block) { + for (Value* v : block->inputs()) { + if (hasTensorTypeSpecialization(v)) + return true; + } + for (Node* n : block->nodes()) { + for (torch::jit::Block* b : n->blocks()) { + if (hasTensorTypeSpecializations(b)) + return true; + } + for (Value* v : n->outputs()) { + if (hasTensorTypeSpecialization(v)) + return true; + } + } + return false; +} + +static bool hasSpecializations = false; +void detectTTSpecializationPass(std::shared_ptr& graph) { + GRAPH_DUMP("In detectTTSpecialization Custom Post Pass: ", graph); + hasSpecializations = 
hasTensorTypeSpecializations(graph->block()); +} + +} // namespace + +TEST(SpecializationsInCustomPasses, Basic) { + RegisterPass p(detectTTSpecializationPass); + hasSpecializations = false; + std::shared_ptr graph = std::make_shared(); + parseIR( + R"IR( +graph(%a.1 : Tensor, + %b.1 : Tensor): + %c.1 : Tensor = aten::mul(%a.1, %b.1) # misc/test_specializations.py:5:8 + %d.1 : Tensor = aten::mul(%c.1, %b.1) # misc/test_specializations.py:6:8 + return (%d.1) + )IR", + &*graph); + + IValue ival = IValue(torch::randn({22}, at::kCPU)); + std::vector stack = {ival, ival}; + auto run = [&](std::shared_ptr& graph, std::vector stack) { + GraphExecutor executor(graph, ""); + executor.run(stack); + return stack; + }; + run(graph, stack); + + // Priofiling mode will not be run with simple executor + if (!getExecutorMode()) { + EXPECT_TRUE(hasSpecializations); + } +} + +} // namespace jit +} // namespace torch diff --git a/test/cpp/tensorexpr/tutorial.cpp b/test/cpp/tensorexpr/tutorial.cpp index b89fcc3396df..e34d980cf708 100644 --- a/test/cpp/tensorexpr/tutorial.cpp +++ b/test/cpp/tensorexpr/tutorial.cpp @@ -186,10 +186,10 @@ int main(int argc, char* argv[]) { // structure is simply a pair of a buffer that was created to represent the // result of the computation (BufPtr) and a statement representing the // computation itself (StmtPtr). - Tensor C = Compute( - "C", - {{64, "i"}, {32, "j"}}, - [&](const VarHandle& i, const VarHandle& j) { return i * j; }); + Tensor C = + Compute("C", {64, 32}, [&](const VarHandle& i, const VarHandle& j) { + return i * j; + }); std::cout << "Stmt produced by 'Compute' API: " << std::endl << *C.stmt() << std::endl; // Prints: @@ -209,7 +209,7 @@ int main(int argc, char* argv[]) { {}, Sum(), [&](const VarHandle& i, const VarHandle& j) { return C.load(i, j); }, - {{64, "i"}, {32, "j"}}); + {64, 32}); std::cout << "Stmt produced by 'Reduce' API: " << std::endl << *D.stmt() << std::endl; } @@ -223,15 +223,13 @@ int main(int argc, char* argv[]) { // Let's look at a couple of transformations that are used in NNC. We will // begin with constructing a Block statement like we did before. 
- Tensor C = Compute( - "C", - {{64, "i"}, {32, "j"}}, - [&](const VarHandle& i, const VarHandle& j) { return i * (j + 1); }); + Tensor C = + Compute("C", {64, 32}, [&](const VarHandle& i, const VarHandle& j) { + return i * (j + 1); + }); BufHandle c_buf(C.buf()); - Tensor D = Compute( - "D", - {{64, "i"}, {32, "j"}}, - [&](const VarHandle& i, const VarHandle& j) { + Tensor D = + Compute("D", {64, 32}, [&](const VarHandle& i, const VarHandle& j) { return c_buf.load(i, j) - i; }); StmtPtr block = Block::make({C.stmt(), D.stmt()}); @@ -353,10 +351,8 @@ int main(int argc, char* argv[]) { // Let's start by constructing a simple computation for us to work with: BufHandle A("A", {64, 32}, kInt); BufHandle B("B", {64, 32}, kInt); - Tensor X = Compute( - "X", - {{64, "i"}, {32, "j"}}, - [&](const VarHandle& i, const VarHandle& j) { + Tensor X = + Compute("X", {64, 32}, [&](const VarHandle& i, const VarHandle& j) { return A.load(i, j) + B.load(i, j); }); diff --git a/test/cpp_extensions/setup.py b/test/cpp_extensions/setup.py index 3b25f1e60bb9..df5339417304 100644 --- a/test/cpp_extensions/setup.py +++ b/test/cpp_extensions/setup.py @@ -51,15 +51,18 @@ # todo(mkozuki): Figure out the root cause if (not IS_WINDOWS) and torch.cuda.is_available() and CUDA_HOME is not None: + # malfet: One shoudl not assume that PyTorch re-exports CUDA dependencies cublas_extension = CUDAExtension( name='torch_test_cpp_extension.cublas_extension', - sources=['cublas_extension.cpp'] + sources=['cublas_extension.cpp'], + libraries=['cublas'] if torch.version.hip is None else [], ) ext_modules.append(cublas_extension) cusolver_extension = CUDAExtension( name='torch_test_cpp_extension.cusolver_extension', - sources=['cusolver_extension.cpp'] + sources=['cusolver_extension.cpp'], + libraries=['cusolver'] if torch.version.hip is None else [], ) ext_modules.append(cusolver_extension) diff --git a/test/create_dummy_torchscript_model.py b/test/create_dummy_torchscript_model.py new file mode 100644 index 000000000000..ffd869e27f0b --- /dev/null +++ b/test/create_dummy_torchscript_model.py @@ -0,0 +1,28 @@ +# Usage: python create_dummy_model.py +import sys +import torch +from torch import nn + + +class NeuralNetwork(nn.Module): + + def __init__(self): + super(NeuralNetwork, self).__init__() + self.flatten = nn.Flatten() + self.linear_relu_stack = nn.Sequential( + nn.Linear(28 * 28, 512), + nn.ReLU(), + nn.Linear(512, 512), + nn.ReLU(), + nn.Linear(512, 10), + ) + + def forward(self, x): + x = self.flatten(x) + logits = self.linear_relu_stack(x) + return logits + + +if __name__ == '__main__': + jit_module = torch.jit.script(NeuralNetwork()) + torch.jit.save(jit_module, sys.argv[1]) diff --git a/test/custom_backend/CMakeLists.txt b/test/custom_backend/CMakeLists.txt index 96322e397d63..71f83442e085 100644 --- a/test/custom_backend/CMakeLists.txt +++ b/test/custom_backend/CMakeLists.txt @@ -2,6 +2,10 @@ cmake_minimum_required(VERSION 3.1 FATAL_ERROR) project(custom_backend) +if(USE_ROCM) +include(utils) +include(LoadHIP) +endif() find_package(Torch REQUIRED) add_library(custom_backend SHARED custom_backend.cpp) diff --git a/test/custom_operator/CMakeLists.txt b/test/custom_operator/CMakeLists.txt index 883424e36da9..47c1c9d45e81 100644 --- a/test/custom_operator/CMakeLists.txt +++ b/test/custom_operator/CMakeLists.txt @@ -2,6 +2,10 @@ cmake_minimum_required(VERSION 3.1 FATAL_ERROR) project(custom_ops) +if(USE_ROCM) +include(utils) +include(LoadHIP) +endif() find_package(Torch REQUIRED) add_library(custom_ops SHARED op.cpp) diff 
--git a/test/distributed/_shard/checkpoint/test_checkpoint.py b/test/distributed/_shard/checkpoint/test_checkpoint.py new file mode 100644 index 000000000000..4816b0c38b34 --- /dev/null +++ b/test/distributed/_shard/checkpoint/test_checkpoint.py @@ -0,0 +1,517 @@ +# Owner(s): ["oncall: distributed"] + +import random +import sys +from typing import Optional, List, Union +from torch.distributed._shard.checkpoint import ( + StorageReader, + StorageWriter, + CheckpointException, + load_state_dict, + save_state_dict, +) + +import torch +import torch.distributed as dist +import torch.nn +import torch.futures +from torch.futures import Future +from torch.testing._internal.common_utils import TestCase + +from torch.distributed._shard.checkpoint.resharding import ( + _prepare_sharded_tensor_write, + _create_storage_key +) + +from torch.distributed._shard import sharded_tensor +from torch.distributed._shard.checkpoint.state_dict_loader import ( + validate_metadata, +) + +from torch.distributed._shard.checkpoint.state_dict_saver import ( + _prepare, +) + +from torch.distributed._shard.checkpoint.metadata import ( + Metadata, + BytesReadRequest, + BytesWriteRequest, + TensorReadRequest, + TensorWriteRequest, +) + +from torch.distributed._shard.sharded_tensor import ( + state_dict_hook, + ShardedTensor, +) +from torch.distributed._shard.sharding_spec import ChunkShardingSpec +from torch.testing._internal.common_distributed import ( + requires_nccl, + skip_if_lt_x_gpu, +) +from torch.testing._internal.distributed._shard.sharded_tensor import ( + ShardedTensorTestBase, + with_comms, +) + +from torch.testing._internal.common_utils import ( + TEST_WITH_DEV_DBG_ASAN, + run_tests, +) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + + +class TestModule(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.sharded: ShardedTensor = sharded_tensor.zeros(self.spec(), 4, 4) + self.regular = torch.nn.Parameter(torch.ones(4, 4)) + self.extra_sharded: Optional[ShardedTensor] = None + self.extra_param: Optional[torch.nn.Parameter] = None + self._register_state_dict_hook(state_dict_hook) + + def spec(self) -> ChunkShardingSpec: + # pyre-fixme [28]: Unexpected keyword argument `dim` to call `dist._sharding_spec.api.ChunkShardingSpec.__init__`. 
+ return ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + ], + ) + + +class TestDistributedCheckpointing(ShardedTensorTestBase): + @property + def world_size(self) -> int: + return 2 + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(2) + @requires_nccl() + def test_validate_metadata(self) -> None: + module = TestModule() + + metadata, _, _ = _prepare(module.state_dict(), True) + self.assertTrue( + "regular" in metadata.state_dict_metadata, + f"keys: {metadata.state_dict_metadata.keys()}", + ) + + module = TestModule() + validate_metadata(module.state_dict(), metadata) + + module = TestModule() + module.extra_param = torch.nn.Parameter(torch.zeros(2, 2)) + with self.assertRaisesRegex(ValueError, "Could not find Tensor metadata"): + validate_metadata(module.state_dict(), metadata) + + module = TestModule() + module.regular = torch.nn.Parameter(torch.zeros(2, 4)) + + with self.assertRaisesRegex(ValueError, "Incompatible tensor size"): + validate_metadata(module.state_dict(), metadata) + + module = TestModule() + module.extra_sharded = sharded_tensor.zeros(module.spec(), 4, 2) + with self.assertRaisesRegex(ValueError, "Could not find ShardedTensor metadata"): + validate_metadata(module.state_dict(), metadata) + + module = TestModule() + module.sharded = sharded_tensor.zeros(module.spec(), 4, 2) + with self.assertRaisesRegex(ValueError, "Incompatible ShardedTensor size"): + validate_metadata(module.state_dict(), metadata) + + def gen_metadata(self) -> Metadata: + module = TestModule() + # compute the default saved metadata (must pass include_non_replicated_tensors or we'll get incomplete MD) + metadata, _, _ = _prepare(module.state_dict(), True) + + # _prepare only produc + metadata = [metadata] + dist.broadcast_object_list(metadata) + + return metadata[0] + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(2) + @requires_nccl() + def test_checkpoint_has_shard_too_small(self) -> None: + metadata = self.gen_metadata() + + # we make the first stored shard smaller + self.assertTrue( + ".sharded" in metadata.state_dict_metadata, + f"keys: {metadata.state_dict_metadata.keys()}", + ) + + sizes = ( + metadata.state_dict_metadata[".sharded"] + .storage_metadata[0] + .shard_metadata.shard_sizes + ) + for i in range(len(sizes)): + sizes[i] = 1 + + module = TestModule() + with self.assertRaisesRegex(ValueError, "only has 1 available"): + validate_metadata(module.state_dict(), metadata) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(2) + @requires_nccl() + def test_checkpoint_has_shard_overlap(self) -> None: + metadata = self.gen_metadata() + + # we make the first stored shard smaller + self.assertTrue( + ".sharded" in metadata.state_dict_metadata, + f"keys: {metadata.state_dict_metadata.keys()}", + ) + + sizes = ( + metadata.state_dict_metadata[".sharded"] + .storage_metadata[0] + .shard_metadata.shard_sizes + ) + for i in range(len(sizes)): + sizes[i] += 1 + + module = TestModule() + with self.assertRaisesRegex(ValueError, "overlap"): + validate_metadata(module.state_dict(), metadata) + + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(2) + @requires_nccl() + def test_checkpoint_has_storage_type_mismatch(self) -> None: + module = TestModule() + + metadata = self.gen_metadata() + regular = metadata.state_dict_metadata["regular"] + metadata.state_dict_metadata[".sharded"] = regular + with self.assertRaisesRegex(ValueError, "ShardedTensorStorageMetadata but found"): + validate_metadata(module.state_dict(), metadata) + + metadata = self.gen_metadata() + sharded = 
metadata.state_dict_metadata[".sharded"] + metadata.state_dict_metadata["regular"] = sharded + with self.assertRaisesRegex(ValueError, "TensorStorageMetadata but found"): + validate_metadata(module.state_dict(), metadata) + + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(2) + @requires_nccl() + def test_tensor_metadata_with_missing_rank_spec(self) -> None: + spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:1/cuda:1", + ], + ) + + st = sharded_tensor.zeros(spec, 4, 4, dtype=torch.float64) + mapping = dict() + + (_, md) = _prepare_sharded_tensor_write(st, "tensor", mapping) + + self.assertEqual(1, len(md.storage_metadata)) + self.assertEqual(4 * 4 * 8, md.storage_metadata[0].length) + self.assertEqual(1, len(mapping)) + + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(2) + @requires_nccl() + def test_storage_key_mapping(self) -> None: + device = f"cuda:{dist.get_rank()}" + spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + ], + ) + + state_dict = { + 'sharded': sharded_tensor.rand(spec, (10, 10, )), + 'replicated': torch.rand(4, device=device), + 'bytes': [1, 2, 3, 4], + } + + metadata, bytes_reqs, tensor_reqs = _prepare(state_dict, write_replicated_data=self.rank == 0) + + if self.rank == 0: + self.assertEqual(1, len(bytes_reqs)) + self.assertEqual(2, len(tensor_reqs)) + + self.assertTrue('bytes' in metadata.state_dict_metadata) + self.assertEqual(bytes_reqs[0].storage_key, metadata.state_dict_metadata['bytes'].storage_key) + + # tensor ordering is unspecified + if len(tensor_reqs[0].tensor.size()) == 1: + replicated = tensor_reqs[0] + shard = tensor_reqs[1] + else: + replicated = tensor_reqs[1] + shard = tensor_reqs[0] + + self.assertTrue('replicated' in metadata.state_dict_metadata) + self.assertEqual(replicated.storage_key, metadata.state_dict_metadata['replicated'].storage_key) + else: + self.assertEqual(0, len(bytes_reqs)) + self.assertEqual(1, len(tensor_reqs)) + shard = tensor_reqs[0] + + self.assertTrue('sharded' in metadata.state_dict_metadata) + shard_keys = [sm.storage_key for sm in metadata.state_dict_metadata['sharded'].storage_metadata] + self.assertTrue(shard.storage_key in shard_keys) + +class TestStorageKeys(TestCase): + def test_create_key_handles_collision(self): + keys = dict() + key0 = _create_storage_key(keys, "foo") + key1 = _create_storage_key(keys, "foo") + self.assertNotEqual(key0, key1) + + + + +class TestStorageBase: + def __init__( + self, + fail_conf + ): + self.fail_conf = fail_conf + self.rank = 0 if not dist.is_initialized() else dist.get_rank() + + def _get_ranks(self, name): + return self.fail_conf[name] if name in self.fail_conf else None + + def _fail_rank(self, name): + ranks = self._get_ranks(name) + if ranks is not None and self.rank in ranks: + raise ValueError(f"rank fail {self.rank} for {name}") + + def _fail_rank_async(self, name): + ranks = self._get_ranks(name) + fut = Future() + if ranks is not None and self.rank in ranks: + fut.set_exception(ValueError(f"async rank fail {self.rank} for {name}")) + else: + fut.set_result(None) + return fut + + +class FaultyStorageWriter(TestStorageBase, StorageWriter): + def __init__( + self, + fail_conf + ): + super(FaultyStorageWriter, self).__init__(fail_conf) + + def prepare(self) -> None: + self._fail_rank("fail_prepare") + + def write_bytes(self, requests: List[BytesWriteRequest]) -> Future[None]: + self._fail_rank("fail_write_bytes_on_ranks") + return self._fail_rank_async("fail_write_bytes_on_ranks_async") + + def write_tensors(self, requests: 
List[TensorWriteRequest]) -> Future[None]: + self._fail_rank("fail_write_tensors_on_ranks") + return self._fail_rank_async("fail_write_tensors_on_ranks_async") + + def finish(self, metadata: Metadata) -> None: + self._fail_rank("fail_finish") + + def prepare_storage(self, storage_writes: List[Union[TensorWriteRequest, BytesWriteRequest]]) -> None: + self._fail_rank("fail_prepare_storage") + +class FaultyStorageReader(TestStorageBase, StorageReader): + def __init__( + self, + metadata, + fail_conf + ): + super(FaultyStorageReader, self).__init__(fail_conf) + self.metadata = metadata + + def read_bytes(self, requests: List[BytesReadRequest]) -> Future[None]: + self._fail_rank("fail_read_bytes") + bad_ranks = self._get_ranks("fail_deser_bytes") + for r in requests: + if bad_ranks is not None and self.rank in bad_ranks: + # this is not "guaranteed" to fail, but hard to beat + rand = random.Random(1237) + r.bytes.write(rand.randbytes(32)) + else: + torch.save([1, 2, 3], r.bytes) + + return self._fail_rank_async("fail_read_bytes_async") + + def read_tensors(self, requests: List[TensorReadRequest]) -> Future[None]: + self._fail_rank("fail_read_tensors") + return self._fail_rank_async("fail_read_tensors_async") + + def read_metadata(self) -> Metadata: + self._fail_rank("fail_read_metadata") + return self.metadata + +class TestDistributedFailure(ShardedTensorTestBase): + def get_spec(self): + return ChunkShardingSpec( + dim=0, + placements=[ + f"rank:{r}/cuda:{r}" for r in range(dist.get_world_size()) + ] + ) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(2) + @requires_nccl() + def test_dummy_writer_works(self) -> None: + state_dict = { + 'sharded': sharded_tensor.rand(self.get_spec(), 20, 20), + 'replicated': torch.rand(10, 10), + 'bytes': [1, 2, 3, 4] + } + + save_state_dict(state_dict, FaultyStorageWriter({})) + + + def _test_dist_failure(self, callback, kwargs): + bad_ranks = list(kwargs.values())[0] if len(kwargs) > 0 else [] + + # Empty bad_ranks means it must work + if len(bad_ranks) == 0: + callback() + else: + with self.assertRaises(CheckpointException) as cm: + callback() + e = cm.exception + for rank, ex in e.failures.items(): + self.assertTrue(rank in bad_ranks, msg=f"{rank} did not fail") + if not kwargs.get("ignore_exception_type", False): + self.assertEqual(ValueError, type(ex), str(ex)) + + failed_ranks = e.failures.keys() + for rank in bad_ranks: + self.assertTrue(rank in failed_ranks, msg=f"{rank} was supposed to fail was fine") + + + def _test_save(self, state_dict, coordinator=0, **kwargs): + no_dist = not dist.is_initialized() + + def _save(): + save_state_dict( + state_dict, + storage_writer=FaultyStorageWriter(kwargs), + coordinator_rank=coordinator, + no_dist=no_dist, + ) + self._test_dist_failure(_save, kwargs) + + def _test_load(self, state_dict, coordinator=0, **kwargs): + no_dist = not dist.is_initialized() + write_replicated = dist.is_initialized() and dist.get_rank() == coordinator + + def _load(): + metadata, _, _ = _prepare(state_dict, write_replicated) + load_state_dict( + state_dict, + storage_reader=FaultyStorageReader(metadata, kwargs), + coordinator_rank=coordinator, + no_dist=no_dist, + ) + + self._test_dist_failure(_load, kwargs) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(4) + @requires_nccl() + def test_save_error_handling(self) -> None: + state_dict = { + 'sharded': sharded_tensor.rand(self.get_spec(), 20, 20), + 'replicated': torch.rand(10, 10), + 'bytes': [1, 2, 3, 4] + } + + self._test_save(state_dict, fail_prepare=[0]) + 
self._test_save(state_dict, fail_finish=[0]) + + self._test_save(state_dict, fail_prepare_storage=[0]) + self._test_save(state_dict, fail_write_tensors_on_ranks=[1]) + self._test_save(state_dict, fail_write_tensors_on_ranks_async=[2]) + self._test_save(state_dict, fail_write_bytes_on_ranks=[3]) + self._test_save(state_dict, fail_write_bytes_on_ranks_async=[1]) + + self._test_save(state_dict, fail_write_tensors_on_ranks_async=[1, 3]) + + self._test_save(state_dict, coordinator=1, fail_prepare=[1]) + self._test_save(state_dict, coordinator=1, fail_finish=[1]) + + + def test_save_error_handling_no_dist(self) -> None: + state_dict = { + 'replicated': torch.rand(10, 10), + 'bytes': [1, 2, 3, 4] + } + + self.assertFalse(dist.is_initialized()) + + self._test_save(state_dict, fail_prepare=[0]) + self._test_save(state_dict, fail_finish=[0]) + + self._test_save(state_dict, fail_prepare_storage=[0]) + self._test_save(state_dict, fail_write_tensors_on_ranks=[0]) + self._test_save(state_dict, fail_write_tensors_on_ranks_async=[0]) + self._test_save(state_dict, fail_write_bytes_on_ranks=[0]) + self._test_save(state_dict, fail_write_bytes_on_ranks_async=[0]) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(4) + @requires_nccl() + def test_load_error_handling(self) -> None: + state_dict = { + 'sharded': sharded_tensor.rand(self.get_spec(), 20, 20), + 'replicated': torch.rand(10, 10), + 'bytes': [1, 2, 3, 4] + } + + self._test_load(state_dict) + self._test_load(state_dict, fail_read_metadata=[0]) + self._test_load(state_dict, fail_read_bytes=[1]) + self._test_load(state_dict, fail_read_bytes_async=[2]) + self._test_load(state_dict, fail_read_tensors=[3]) + self._test_load(state_dict, fail_read_tensors_async=[1]) + # We don't want to depend on the actual exception raised by pickle + self._test_load(state_dict, fail_deser_bytes=[2], ignore_exception_type=True) + + self._test_load(state_dict, coordinator=1, fail_read_metadata=[3]) + self._test_load(state_dict, coordinator=2, fail_read_bytes=[0]) + self._test_load(state_dict, coordinator=3, fail_read_tensors_async=[2]) + + + def test_load_error_handling_no_dist(self) -> None: + state_dict = { + 'replicated': torch.rand(10, 10), + 'bytes': [1, 2, 3, 4] + } + self._test_load(state_dict) + self._test_load(state_dict, fail_read_metadata=[0]) + self._test_load(state_dict, fail_read_bytes=[0]) + self._test_load(state_dict, fail_read_bytes_async=[0]) + self._test_load(state_dict, fail_read_tensors=[0]) + self._test_load(state_dict, fail_read_tensors_async=[0]) + self._test_load(state_dict, fail_deser_bytes=[0], ignore_exception_type=True) +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/_shard/checkpoint/test_file_system_checkpoint.py b/test/distributed/_shard/checkpoint/test_file_system_checkpoint.py new file mode 100644 index 000000000000..ca0c121b3638 --- /dev/null +++ b/test/distributed/_shard/checkpoint/test_file_system_checkpoint.py @@ -0,0 +1,466 @@ +# Owner(s): ["oncall: distributed"] + +import sys +import os +import shutil +import tempfile +from typing import Dict, cast + +import torch +import torch.distributed as dist +from torch import Tensor +from torch.distributed._shard import sharded_tensor +from torch.distributed._shard.sharded_tensor import ShardedTensor, state_dict_hook +from torch.distributed._shard.sharding_spec import ( + ChunkShardingSpec, + EnumerableShardingSpec, + ShardingSpec, + ShardMetadata, +) +from torch.testing._internal.common_distributed import requires_nccl, skip_if_lt_x_gpu +from 
torch.testing._internal.common_utils import TestCase +from torch.testing._internal.distributed._shard.sharded_tensor import ( + ShardedTensorTestBase, + with_comms, +) +from torch.testing._internal.distributed._shard.sharded_tensor._test_st_common import ( + MyShardedModel1 +) + + +from torch.testing._internal.common_utils import ( + TEST_WITH_DEV_DBG_ASAN, + run_tests, +) + +from torch.distributed._shard.checkpoint import ( + FileSystemReader, + FileSystemWriter, + load_state_dict, + save_state_dict, +) + + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + +def _sharded_tensor_gather( + self, + dst=0, + out=None, +): + """ + This is a reimplementation of ST:gather using gather instead of gather_object. + The later hangs on CI inside NCCL. + """ + + def shard_size(shard_md): + res = 1 + for s in shard_md.shard_sizes: + res *= s + return res + rank = dist.get_rank(self._process_group) + full_size = self.metadata().size + + world_size = dist.get_world_size(self._process_group) + rank_sizes = [0 for _ in range(world_size)] + max_rank_size = 0 + shard_placement = dict() + local_shards_placement = [] + # collect sizes + for shard_idx, shard_md in enumerate(self.metadata().shards_metadata): + shard_rank = shard_md.placement.rank() + shard_placement[shard_idx] = (shard_rank, rank_sizes[shard_rank]) + if shard_rank == rank: + local_shards_placement.append((shard_md, rank_sizes[shard_rank],)) + + rank_sizes[shard_rank] += shard_size(shard_md) + max_rank_size = max(max_rank_size, rank_sizes[shard_rank]) + + + if rank == dst: + gather_list = [torch.empty((max_rank_size,), device=out.device) for _ in range(world_size)] + else: + gather_list = None + + # FIXME is a rank allowed to not have any data? 
+ with torch.no_grad(): + # XXX we can fastpath this to torch.cat if max_rank_size == rank_sizes[rank] + data = torch.empty(max_rank_size, device=self.local_shards()[0].tensor.device) + for shard in self.local_shards(): + for placement in local_shards_placement: + if placement[0] == shard.metadata: + src = shard.tensor.flatten() + data[placement[1]: placement[1] + src.numel()].copy_(src) + break + + dist.gather( + tensor=data, + gather_list=gather_list, + dst=dst, + group=self._process_group, + ) + if rank != dst: + return + if out is None: + raise ValueError("`out` Tensor must be provided on dst rank!") + + full_size = self.metadata().size + dims = len(full_size) + + + for shard_idx, shard_md in enumerate(self.metadata().shards_metadata): + placement = shard_placement[shard_idx] + tensor = gather_list[placement[0]] + tensor = tensor[placement[1] : placement[1] + shard_size(shard_md)] + tensor = tensor.view(shard_md.shard_sizes) + + out_narrow_view = out + for dim in range(dims): + out_narrow_view = out_narrow_view.narrow( + dim, + shard_md.shard_offsets[dim], + shard_md.shard_sizes[dim], + ) + + out_narrow_view.copy_(tensor) + + +def assert_state_dict_equal( + self: TestCase, + state_dict_1: Dict[str, torch.Tensor], + state_dict_2: Dict[str, torch.Tensor], +) -> bool: + self.assertEqual( + len(state_dict_1), len(state_dict_2), "state_dict must be the same size" + ) + self.assertEqual( + set(state_dict_1.keys()), + set(state_dict_2.keys()), + "state_dict keys do not match", + ) + + for key, value_1 in state_dict_1.items(): + value_2 = state_dict_2[key] + if isinstance(value_1, torch.Tensor): + self.assertTrue( + torch.equal(value_1, value_2), f"Key {key}'s tensor does not match" + ) + elif isinstance(value_1, ShardedTensor): + for local_shard_1, local_shard_2 in zip( + value_1.local_shards(), value_2.local_shards() + ): + self.assertTrue( + torch.equal(local_shard_1.tensor, local_shard_1.tensor), + f"Key {key}'s shard does not match", + ) + + return True + + +class MyTestModule(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.linear_1 = torch.nn.Linear(5, 5) + self.linear_2 = torch.nn.Linear(5, 1) + self.emb = torch.nn.EmbeddingBag(5, 10) + + +# The ShardedModels are borrowed from test/distributed/_sharded_tensor/test_sharded_tensor.py +class MyShardedModel3(torch.nn.Module): + def __init__( + self, + spec: ShardingSpec, + ) -> None: + super(MyShardedModel3, self).__init__() + self.sharded_tensor: ShardedTensor = sharded_tensor.rand( + spec, 10, 20, init_rrefs=False + ) + + +class TestDistributedStateDictSaveLoad(TestCase): + def test_read_write_only_tensor(self) -> None: + with tempfile.TemporaryDirectory() as path: + state_dict_to_save = MyTestModule().state_dict() + + fs_writer = FileSystemWriter(path=path) + save_state_dict(state_dict=state_dict_to_save, storage_writer=fs_writer, no_dist=True) + + state_dict_to_load_to = MyTestModule().state_dict() + + with self.assertRaises(AssertionError): + assert_state_dict_equal(self, state_dict_to_load_to, state_dict_to_save) + + # Load from file without any resharding + fs_reader = FileSystemReader(path=path) + load_state_dict(state_dict=state_dict_to_load_to, storage_reader=fs_reader, no_dist=True) + + assert_state_dict_equal(self, state_dict_to_load_to, state_dict_to_save) + + +class TestDistributedStateDictSaveLoadWithSharedTensor(ShardedTensorTestBase): + @property + def world_size(self) -> int: + return 2 + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(2) + @requires_nccl() + def 
test_read_write_shard_tensor(self) -> None: + paths = [tempfile.mkdtemp()] + dist.broadcast_object_list(paths) + + path = paths[0] + + # pyre-fixme [28]: Unexpected keyword argument `dim` to call `dist._sharding_spec.api.ChunkShardingSpec.__init__`. + spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + ], + ) + + model_to_save = MyShardedModel1(spec, init_rrefs=False) + + # Test save + model_to_save._register_state_dict_hook(state_dict_hook) + state_dict_to_save = model_to_save.state_dict() + + fs_writer = FileSystemWriter(path=path) + save_state_dict(state_dict=state_dict_to_save, storage_writer=fs_writer) + + dist.barrier() + + # Create a new model + model_to_load = MyShardedModel1(spec, init_rrefs=False) + # This is not the correct hook for loading the state dict + # model_to_load._register_load_state_dict_pre_hook(pre_load_state_dict_hook, True) + model_to_load._register_state_dict_hook(state_dict_hook) + state_dict_to_load_to = model_to_load.state_dict() + + dist.barrier() + + with self.assertRaises(AssertionError): + assert_state_dict_equal(self, state_dict_to_load_to, state_dict_to_save) + + # Test load. + fs_reader = FileSystemReader(path=path) + load_state_dict(state_dict=state_dict_to_load_to, storage_reader=fs_reader) + + assert_state_dict_equal(self, state_dict_to_load_to, state_dict_to_save) + dist.barrier() + + +class TestDistributedReshardOnLoad(ShardedTensorTestBase): + @property + def world_size(self) -> int: + return 2 + + def get_file_path(self) -> str: + paths = [tempfile.mkdtemp()] if dist.get_rank() == 0 else [None] + dist.broadcast_object_list(paths) + return paths[0] + + def load_tensor(self, tensor: ShardedTensor) -> torch.Tensor: + res = torch.zeros(tensor.shape, device="cuda:0") if dist.get_rank() == 0 else None + _sharded_tensor_gather(tensor, out=res) + return cast(Tensor, res) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(2) + @requires_nccl() + def test_load_with_different_shard_plan(self) -> None: + path = self.get_file_path() + + # We hardcode the assumption of how many shards are around + self.assertEqual(self.world_size, dist.get_world_size()) + + specs = [ + # pyre-fixme [28]: Unexpected keyword argument `dim` to call `dist._sharding_spec.api.ChunkShardingSpec.__init__`. + ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + ], + ), + # pyre-fixme [28]: Unexpected keyword argument `dim` to call `dist._sharding_spec.api.ChunkShardingSpec.__init__`. 
+ ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + "rank:1/cuda:1", + "rank:0/cuda:0", + ], + ), + # This requires the tensors to be [10, 20] + EnumerableShardingSpec( + shards=[ + ShardMetadata( + shard_offsets=[0, 0], + shard_sizes=[2, 20], + placement="rank:0/cuda:0", + ), + ShardMetadata( + shard_offsets=[2, 0], + shard_sizes=[1, 20], + placement="rank:1/cuda:1", + ), + ShardMetadata( + shard_offsets=[3, 0], + shard_sizes=[3, 20], + placement="rank:0/cuda:0", + ), + ShardMetadata( + shard_offsets=[6, 0], + shard_sizes=[3, 20], + placement="rank:1/cuda:1", + ), + ShardMetadata( + shard_offsets=[9, 0], + shard_sizes=[1, 20], + placement="rank:0/cuda:0", + ), + ] + ), + # This requires the tensors to be [10, 20] + EnumerableShardingSpec( + shards=[ + ShardMetadata( + shard_offsets=[0, 0], + shard_sizes=[8, 20], + placement="rank:1/cuda:1", + ), + ShardMetadata( + shard_offsets=[8, 0], + shard_sizes=[2, 20], + placement="rank:0/cuda:0", + ), + ] + ), + ] + + for s0 in specs: + for s1 in specs: + if s0 == s1: + continue + + if dist.get_rank() == 0: + shutil.rmtree(path, ignore_errors=True) + os.makedirs(path) + dist.barrier() + + model_to_save = MyShardedModel3(s0) + model_to_save._register_state_dict_hook(state_dict_hook) + state_dict_to_save = model_to_save.state_dict() + + fs_writer = FileSystemWriter(path=path) + save_state_dict(state_dict=state_dict_to_save, storage_writer=fs_writer) + + dist.barrier() + + model_to_load = MyShardedModel3(s1) + model_to_load._register_state_dict_hook(state_dict_hook) + state_dict_to_load_to = model_to_load.state_dict() + dist.barrier() + + fs_reader = FileSystemReader(path=path) + load_state_dict( + state_dict=state_dict_to_load_to, storage_reader=fs_reader + ) + + dist.barrier() + store_tensor = self.load_tensor(model_to_save.sharded_tensor) + dist.barrier() + load_tensor = self.load_tensor(model_to_load.sharded_tensor) + + if dist.get_rank() == 0: + self.assertTrue( + torch.allclose(store_tensor, load_tensor), msg=f"{s0} vs {s1}" + ) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(2) + @requires_nccl() + def test_load_rowwise_to_colwise(self) -> None: + path = self.get_file_path() + self.assertEqual(self.world_size, dist.get_world_size()) + + # pyre-fixme [28]: Unexpected keyword argument `dim` to call `dist._sharding_spec.api.ChunkShardingSpec.__init__`. + src_spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + ], + ) + + # pyre-fixme [28]: Unexpected keyword argument `dim` to call `dist._sharding_spec.api.ChunkShardingSpec.__init__`. 
+ dst_spec = ChunkShardingSpec( + dim=1, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + ], + ) + + if dist.get_rank() == 0: + shutil.rmtree(path, ignore_errors=True) + os.makedirs(path) + + model_to_save = MyShardedModel3(src_spec).cuda(dist.get_rank()) + model_to_save._register_state_dict_hook(state_dict_hook) + state_dict_to_save = model_to_save.state_dict() + + fs_writer = FileSystemWriter(path=path) + save_state_dict(state_dict=state_dict_to_save, storage_writer=fs_writer) + + model_to_load = MyShardedModel3(dst_spec).cuda(dist.get_rank()) + model_to_load._register_state_dict_hook(state_dict_hook) + state_dict_to_load_to = model_to_load.state_dict() + + fs_reader = FileSystemReader(path=path) + + load_state_dict(state_dict=state_dict_to_load_to, storage_reader=fs_reader) + + # We can't use torch.allclose since each ST has a different sharding spec + store_tensor = self.load_tensor(model_to_save.sharded_tensor) + load_tensor = self.load_tensor(model_to_load.sharded_tensor) + + if dist.get_rank() == 0: + self.assertTrue(torch.allclose(store_tensor, load_tensor)) + + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(2) + @requires_nccl() + def test_save_load_bytes(self) -> None: + path = self.get_file_path() + + state_dict_to_save = { + 'bytes0': [1], + 'bytes1': 'string' + } + + fs_writer = FileSystemWriter(path=path) + save_state_dict(state_dict=state_dict_to_save, storage_writer=fs_writer) + + state_dict_to_load = { + 'bytes0': [2], + 'bytes1': 'other' + } + + fs_reader = FileSystemReader(path=path) + load_state_dict(state_dict=state_dict_to_load, storage_reader=fs_reader) + + self.assertEqual([1], state_dict_to_load['bytes0']) + self.assertEqual('string', state_dict_to_load['bytes1']) + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/_shard/sharded_optim/test_sharded_optim.py b/test/distributed/_shard/sharded_optim/test_sharded_optim.py index 085c928985eb..d3f1468aea3c 100644 --- a/test/distributed/_shard/sharded_optim/test_sharded_optim.py +++ b/test/distributed/_shard/sharded_optim/test_sharded_optim.py @@ -2,7 +2,10 @@ import torch import torch.optim as optim -import torch.distributed._shard.sharded_tensor +from torch.distributed._shard import ( + sharded_tensor, + shard_parameter +) from copy import deepcopy from torch.distributed._shard.sharding_spec import ( @@ -77,8 +80,8 @@ def shard_parameter(self): ], ) - sharded_tensor.shard_parameter(self.linear1, "weight", rowwise_sharding_spec) - sharded_tensor.shard_parameter(self.linear2, "weight", colwise_sharding_spec) + shard_parameter(self.linear1, "weight", rowwise_sharding_spec) + shard_parameter(self.linear2, "weight", colwise_sharding_spec) def forward(self, inp): return self.linear2(self.gelu(self.linear1(inp))) diff --git a/test/distributed/_shard/sharded_tensor/ops/test_binary_cmp.py b/test/distributed/_shard/sharded_tensor/ops/test_binary_cmp.py index c20727169523..33fc49f81c0f 100644 --- a/test/distributed/_shard/sharded_tensor/ops/test_binary_cmp.py +++ b/test/distributed/_shard/sharded_tensor/ops/test_binary_cmp.py @@ -106,11 +106,17 @@ def _test_common_failures(self, cmp_op): pg = dist.new_group([1, 0, 3, 2]) st1, st2 = self.get_random_tensors(spec, spec, 10, 10, pg2=pg) - self.assertFalse(cmp_op(st1, st2)) + with self.assertRaisesRegex( + RuntimeError, "All distributed tensors should use the same ProcessGroup" + ): + cmp_op(st1, st2) pg = dist.new_group([0, 1, 2, 3]) st1, st2 = self.get_random_tensors(spec, spec, 10, 10, pg2=pg) - self.assertFalse(cmp_op(st1, st2)) + with 
self.assertRaisesRegex( + RuntimeError, "All distributed tensors should use the same ProcessGroup" + ): + cmp_op(st1, st2) @with_comms @skip_if_lt_x_gpu(4) diff --git a/test/distributed/_shard/sharded_tensor/ops/test_chunk.py b/test/distributed/_shard/sharded_tensor/ops/test_chunk.py new file mode 100644 index 000000000000..f0dcd4d7aad8 --- /dev/null +++ b/test/distributed/_shard/sharded_tensor/ops/test_chunk.py @@ -0,0 +1,90 @@ +# Owner(s): ["oncall: distributed"] + +import sys + +import torch +from torch.distributed._shard import sharded_tensor, _shard_tensor +from torch.testing._internal.common_distributed import ( + requires_nccl, + skip_if_lt_x_gpu, +) +from torch.testing._internal.common_utils import ( + TEST_WITH_DEV_DBG_ASAN, + run_tests, +) +from torch.testing._internal.distributed._shard.sharded_tensor import ( + TEST_GPU_NUM, + ShardedTensorTestBase, + with_comms, +) +from torch.testing._internal.distributed._shard.sharded_tensor._test_ops_common import ( + generate_chunk_sharding_specs_for_test, + generate_enumerable_sharding_specs_for_test, +) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + +class TestShardedTensorChunkOps(ShardedTensorTestBase): + def _compare_chunk_result(self, chunked_list, chunked_st_list): + self.assertEqual(len(chunked_list), len(chunked_st_list)) + for idx, chunked_st in enumerate(chunked_st_list): + tensor = chunked_list[idx] + st = _shard_tensor(tensor.contiguous(), chunked_st.sharding_spec()) + # _shard_tensor generate sharded tensor with metadata ranked by # of rank. + st._metadata.shards_metadata.sort( + key=lambda x: x.shard_offsets[chunked_st.sharding_spec().dim], + ) + self.assertTrue(torch.allclose(chunked_st, st)) + + def _run_sharded_chunk_test(self, local_tensor_size, shard_spec, chunk_num): + torch.manual_seed(0) + local_tensor = torch.rand(*local_tensor_size).cuda(self.rank) + st_tensor = _shard_tensor(local_tensor.clone().detach(), shard_spec) + local_tensor_chunked = torch.chunk(local_tensor, chunk_num, dim=-1) + chunked_st = torch.chunk(st_tensor, chunk_num, dim=-1) + self._compare_chunk_result(local_tensor_chunked, chunked_st) + chunked_st = st_tensor.chunk(chunk_num, dim=-1) + self._compare_chunk_result(local_tensor_chunked, chunked_st) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_chunk(self): + sharding_dims = [0] + specs = [] + for dim in sharding_dims: + specs.extend(generate_chunk_sharding_specs_for_test(dim)) + for spec in specs: + self._run_sharded_chunk_test([17, 14], spec, 3) + self._run_sharded_chunk_test([17, 15, 20], spec, 5) + self._run_sharded_chunk_test([17, 16], spec, 2) + # Large matrix case. + self._run_sharded_chunk_test([128, 512], spec, 8) + self._run_sharded_chunk_test([1024, 2048], spec, 4) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_chunk_error(self): + chunk_spec = generate_chunk_sharding_specs_for_test(-1) + with self.assertRaisesRegex( + NotImplementedError, "Chunk by sharding dim is not supported." + ): + st = sharded_tensor.rand(chunk_spec[0], [17, 24]) + torch.chunk(st, 5, dim=-1) + enumerable_spec = generate_enumerable_sharding_specs_for_test() + with self.assertRaisesRegex( + NotImplementedError, "Only ChunkShardingSpec is supported for chunk." 
+ ): + st = sharded_tensor.rand(enumerable_spec[0], [10, 10]) + torch.chunk(st, 5, dim=-1) + + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/_shard/sharded_tensor/ops/test_elementwise_ops.py b/test/distributed/_shard/sharded_tensor/ops/test_elementwise_ops.py index 50f880b55b3a..382af65ab0f5 100644 --- a/test/distributed/_shard/sharded_tensor/ops/test_elementwise_ops.py +++ b/test/distributed/_shard/sharded_tensor/ops/test_elementwise_ops.py @@ -30,14 +30,18 @@ class TestShardedTensorElementWiseOps(ShardedTensorTestBase): - def _run_sharded_elementwise_ops(self, spec, input_size, op): + def _run_sharded_elementwise_ops( + self, spec, input_size, op, reset_seed=None, **kwargs + ): torch.manual_seed(self.rank) st = sharded_tensor.rand(spec, *input_size) - new_st = op(st) + reset_seed() if reset_seed else None + new_st = op(st, **kwargs) local_shard = st.local_tensor() new_st_local_shard = new_st.local_tensor() + reset_seed() if reset_seed else None self.assertEqual( - op(local_shard), + op(local_shard, **kwargs), new_st_local_shard, ) @@ -67,6 +71,37 @@ def test_sharded_relu(self): self._run_sharded_elementwise_ops(spec, [17, 23], torch.nn.functional.relu) self._run_sharded_elementwise_ops(spec, [14, 15], torch.nn.functional.relu) + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_dropout(self): + def _reset_random_seed(): + torch.manual_seed(self.rank + 4) + + specs = generate_chunk_sharding_specs_for_test( + 0 + ) + generate_chunk_sharding_specs_for_test(1) + for spec in specs: + self._run_sharded_elementwise_ops( + spec, + [12, 17], + torch.nn.functional.dropout, + p=0.4, + reset_seed=_reset_random_seed, + ) + self._run_sharded_elementwise_ops( + spec, + [18, 21], + torch.nn.functional.dropout, + p=0.5, + reset_seed=_reset_random_seed, + ) + _reset_random_seed() + dropout = torch.nn.Dropout(p=0.8) + self._run_sharded_elementwise_ops( + spec, [17, 23], dropout, reset_seed=_reset_random_seed + ) + if __name__ == "__main__": run_tests() diff --git a/test/distributed/_shard/sharded_tensor/ops/test_linear.py b/test/distributed/_shard/sharded_tensor/ops/test_linear.py index f08797cb7b23..67e7dd2cb774 100644 --- a/test/distributed/_shard/sharded_tensor/ops/test_linear.py +++ b/test/distributed/_shard/sharded_tensor/ops/test_linear.py @@ -5,15 +5,17 @@ import torch import torch.distributed as dist -from torch.distributed._shard import shard_parameter +from torch.distributed._shard.api import ( + shard_parameter, + _collect_local_shard, + _reshard_output, +) from torch.distributed._shard.sharded_optim import ( ShardedOptimizer, named_params_with_sharded_tensor, ) from torch.distributed._shard.sharded_tensor import ( empty, - _collect_local_shard, - _reshard_output, ) from torch.distributed._shard.sharding_spec import ( ChunkShardingSpec, @@ -68,6 +70,7 @@ def _run_sharded_linear( inp = torch.rand(*input_size).cuda(self.rank) reshard_spec = copy.deepcopy(spec) reshard_spec.dim = 0 + reshard_spec.placements.sort(key=lambda placement: placement.rank()) sharded_linear = _collect_local_shard( _reshard_output(sharded_linear, reshard_spec) ) @@ -241,7 +244,10 @@ def test_sharded_linear_errors(self): ]) fc6.weight = empty(enumerable_spec, 10, 10) - with self.assertRaisesRegex(ValueError, 'Only ChunkShardingSpec supported for ShardedTensor ops!'): + # Sharded Tensor metadata has parenthesis imbalance issue when using re.compile + error_msg = r"torch function 'linear', with args: (?s).* " + r"and kwargs: None not supported for 
ShardedTensor!" + with self.assertRaisesRegex(RuntimeError, error_msg): fc6(torch.rand(10, 10).cuda(self.rank)) fc7 = torch.nn.Linear(10, 80).cuda(self.rank) diff --git a/test/distributed/_shard/sharded_tensor/ops/test_math_ops.py b/test/distributed/_shard/sharded_tensor/ops/test_math_ops.py new file mode 100644 index 000000000000..e080a6387515 --- /dev/null +++ b/test/distributed/_shard/sharded_tensor/ops/test_math_ops.py @@ -0,0 +1,186 @@ +# Owner(s): ["oncall: distributed"] + +import torch +from torch.distributed._shard import _shard_tensor +import torch.distributed._shard.sharded_tensor as sharded_tensor +import torch.distributed as dist + +from torch.distributed._shard.sharding_spec import ( + ChunkShardingSpec, + EnumerableShardingSpec, + ShardMetadata +) +from torch.testing._internal.common_distributed import ( + requires_nccl, + skip_if_lt_x_gpu, +) + +from torch.testing._internal.distributed._shard.sharded_tensor import ( + TEST_GPU_NUM, + ShardedTensorTestBase, + with_comms, +) + +from torch.testing._internal.distributed._shard.sharded_tensor._test_ops_common import ( + gen_binary_op_func, + generate_chunk_sharding_specs_for_test, +) + +class TestMathOps(ShardedTensorTestBase): + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_basic_math_ops(self): + ops = ["torch.add", "torch.sub", "torch.mul", "torch.div", "+", "-", "*", "/"] + + spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + "rank:2/cuda:2", + "rank:3/cuda:3", + ], + ) + + sharded_lhs = sharded_tensor.rand(spec, (12, 3)) + sharded_rhs = sharded_tensor.rand(spec, (12, 3)) + current_rank = dist.get_rank() + global_lhs = torch.empty((12, 3), device=current_rank) if current_rank == 0 else None + global_rhs = torch.empty((12, 3), device=current_rank) if current_rank == 0 else None + sharded_lhs.gather(dst=0, out=global_lhs) + sharded_rhs.gather(dst=0, out=global_rhs) + + for op in ops: + binary_op = gen_binary_op_func(op) + binary_op_ = gen_binary_op_func(op, inplace=True) + # test basic math ops between ShardedTensors + sharded_output = binary_op(sharded_lhs, sharded_rhs) + output = torch.empty((12, 3), device=current_rank) if current_rank == 0 else None + sharded_output.gather(dst=0, out=output) + + if current_rank == 0: + global_output = binary_op(global_lhs, global_rhs) + + self.assertEqual(output, global_output) + + # test basic math ops between ShardedTensor and scalar + scalars = [3, 1.8] + for scalar in scalars: + sharded_output_lhs = binary_op(sharded_lhs, scalar) + + sharded_output_lhs_ = binary_op_(sharded_lhs, scalar) + self.assertTrue(torch.allclose(sharded_output_lhs, sharded_output_lhs_)) + output_lhs = torch.empty((12, 3), device=current_rank) if current_rank == 0 else None + sharded_output_lhs.gather(dst=0, out=output_lhs) + + sharded_output_rhs = binary_op(scalar, sharded_lhs) + output_rhs = torch.empty((12, 3), device=current_rank) if current_rank == 0 else None + sharded_output_rhs.gather(dst=0, out=output_rhs) + + if current_rank == 0: + global_output_lhs = binary_op(global_lhs, scalar) + global_output_rhs = binary_op(scalar, global_lhs) + + self.assertEqual(output_lhs, global_output_lhs) + self.assertEqual(output_rhs, global_output_rhs) + + + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_math_ops_errors(self): + spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + "rank:2/cuda:2", + "rank:3/cuda:3", + ], + ) + sharded_lhs = 
sharded_tensor.rand(spec, (20, 3)) + sharded_rhs = sharded_tensor.rand(spec, (12, 3)) + + with self.assertRaisesRegex(RuntimeError, 'Implicit broadcasting not supported'): + torch.add(sharded_lhs, sharded_rhs) + + spec = EnumerableShardingSpec([ + ShardMetadata( + shard_offsets=[0, 0], + shard_sizes=[5, 5], + placement="rank:0/cuda:0", + ), + ShardMetadata( + shard_offsets=[0, 5], + shard_sizes=[5, 5], + placement="rank:1/cuda:1", + ), + ShardMetadata( + shard_offsets=[5, 0], + shard_sizes=[5, 5], + placement="rank:2/cuda:2", + ), + ShardMetadata( + shard_offsets=[5, 5], + shard_sizes=[5, 5], + placement="rank:3/cuda:3", + ) + ]) + + st = sharded_tensor.rand(spec, 10, 10) + + with self.assertRaisesRegex(RuntimeError, 'not supported'): + torch.add(st, sharded_rhs) + + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_bmm(self): + for spec in generate_chunk_sharding_specs_for_test(0): + lhs = torch.rand(15, 4, 5).cuda(self.rank) + rhs = torch.rand(15, 5, 6).cuda(self.rank) + tensor = lhs.bmm(rhs) + st_lhs = _shard_tensor(lhs, spec) + st_rhs = _shard_tensor(rhs, spec) + st_expected = _shard_tensor(tensor, spec) + self.assertTrue(torch.allclose(torch.bmm(st_lhs, st_rhs), st_expected)) + self.assertTrue(torch.allclose(st_lhs.bmm(st_rhs), st_expected)) + + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_bmm_errors(self): + specs = generate_chunk_sharding_specs_for_test(0) + st_lhs = sharded_tensor.rand(specs[0], (15, 5, 6)) + st_rhs = sharded_tensor.rand(specs[1], (15, 5, 6)) + with self.assertRaisesRegex( + NotImplementedError, + 'Both st and st2 need to have same placements for bmm', + ): + torch.bmm(st_lhs, st_rhs) + for spec in specs: + st_lhs = sharded_tensor.rand(spec, (20, 3)) + st_rhs = sharded_tensor.rand(spec, (20, 3)) + with self.assertRaisesRegex( + TypeError, + 'both st and st2 need to be a 3D ShardedTensor', + ): + torch.bmm(st_lhs, st_rhs) + rhs = torch.rand(15, 5, 6).cuda(self.rank) + with self.assertRaisesRegex( + TypeError, + 'st2 needs to be a ShardedTensor for torch.bmm', + ): + torch.bmm(st_lhs, rhs) + spec.dim = 1 + st_lhs = sharded_tensor.rand(spec, (15, 5, 6)) + st_rhs = sharded_tensor.rand(spec, (15, 5, 6)) + with self.assertRaisesRegex( + NotImplementedError, + 'Only support performing bmm on tensors sharded on dim 0 now', + ): + torch.bmm(st_lhs, st_rhs) diff --git a/test/distributed/_shard/sharded_tensor/ops/test_matrix_ops.py b/test/distributed/_shard/sharded_tensor/ops/test_matrix_ops.py new file mode 100644 index 000000000000..dd074f324df4 --- /dev/null +++ b/test/distributed/_shard/sharded_tensor/ops/test_matrix_ops.py @@ -0,0 +1,294 @@ +# Owner(s): ["oncall: distributed"] + +import copy +import itertools +import sys + +import torch +from torch.distributed._shard import sharded_tensor, _shard_tensor +from torch.testing._internal.common_distributed import ( + requires_nccl, + skip_if_lt_x_gpu, +) +from torch.testing._internal.common_utils import ( + TEST_WITH_DEV_DBG_ASAN, + run_tests, +) +from torch.testing._internal.distributed._shard.sharded_tensor import ( + TEST_GPU_NUM, + ShardedTensorTestBase, + with_comms, +) +from torch.testing._internal.distributed._shard.sharded_tensor._test_ops_common import ( + generate_enumerable_sharding_specs_for_test, +) +from torch.testing._internal.distributed._shard.sharded_tensor._test_st_common import ( + _chunk_sharding_specs_list_for_test, +) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + 
multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + +class TestShardedTensorMatrixOps(ShardedTensorTestBase): + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_tensor_contiguous(self): + specs = _chunk_sharding_specs_list_for_test([0], seed=7) + for spec in specs: + st = sharded_tensor.rand(spec, 10, 22, 5, init_rrefs=False) + st = st.transpose(1, 0) + st = st.contiguous() + self.assertTrue(st.is_contiguous()) + self.assertTrue(st.local_tensor().is_contiguous()) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_tensor_type_as(self): + specs = _chunk_sharding_specs_list_for_test([0], seed=7) + for spec in specs: + st = sharded_tensor.rand( + spec, 16, 30, 5, init_rrefs=False, dtype=torch.double + ) + st_2 = sharded_tensor.rand( + spec, 16, 30, 5, init_rrefs=False, dtype=torch.float + ) + st_3 = st.type_as(st_2) + self.assertEqual(torch.float, st_3.dtype) + self.assertEqual(torch.float, st_3.local_tensor().dtype) + st_3 = st.type_as(torch.zeros(10).type(torch.BoolTensor).cuda()) + self.assertEqual(torch.bool, st_3.dtype) + self.assertEqual(torch.bool, st_3.local_tensor().dtype) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_tensor_transpose(self): + specs = _chunk_sharding_specs_list_for_test([0, 1, 2], seed=7) + for spec in specs: + tensor = torch.rand(15, 27, 16).cuda(self.rank) + tensor_t = tensor.transpose(0, 1).contiguous() + spec_n = copy.deepcopy(spec) + if spec_n.dim in (0, 1): + spec_n.dim = 1 - spec_n.dim + st_expected = _shard_tensor(tensor_t, spec_n) + self.assertTrue( + torch.allclose( + torch.transpose(_shard_tensor(tensor, spec), 0, 1), st_expected + ) + ) + tensor_t = torch.transpose(tensor, 1, 2).contiguous() + spec_n = copy.deepcopy(spec) + if spec_n.dim in (1, 2): + spec_n.dim = 3 - spec_n.dim + st_expected = _shard_tensor(tensor_t, spec_n) + self.assertTrue( + torch.allclose(_shard_tensor(tensor, spec).transpose(1, 2), st_expected) + ) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_tensor_transpose_error(self): + enumerable_spec = generate_enumerable_sharding_specs_for_test()[0] + st = sharded_tensor.rand( + enumerable_spec, 10, 10, init_rrefs=False, dtype=torch.double + ) + with self.assertRaisesRegex( + NotImplementedError, + "Only ChunkShardingSpec supported for 'transpose'", + ): + st.transpose(1, 0) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_tensor_softmax(self): + specs = _chunk_sharding_specs_list_for_test([0, 2], seed=17) + for spec in specs: + tensor = torch.rand(15, 27, 16).cuda(self.rank) + tensor_n = torch.nn.functional.softmax(tensor, dim=1, dtype=torch.float32) + st_expected = _shard_tensor(tensor_n, spec) + self.assertTrue( + torch.allclose( + torch.nn.functional.softmax( + _shard_tensor(tensor, spec), dim=1, dtype=torch.float32 + ), + st_expected, + ) + ) + + def _test_masked_fill_with_sizes(self, mask_size, broadcast_style=False): + specs = _chunk_sharding_specs_list_for_test([0, 1, 2], seed=7) + for spec in specs: + tensor = torch.rand(35, 17, 26).cuda(self.rank) + mask = torch.randint(0, 2, mask_size).type(torch.BoolTensor).cuda(self.rank) + if broadcast_style: + mask = mask.unsqueeze(1) + tensor_m = tensor.masked_fill(mask, 25.0) + st_expected = _shard_tensor(tensor_m, spec) + self.assertTrue( + torch.allclose( + _shard_tensor(tensor, 
spec).masked_fill(mask, 25.0), + st_expected, + ) + ) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_tensor_masked_fill(self): + self._test_masked_fill_with_sizes((35, 17, 26)) + self._test_masked_fill_with_sizes((17, 26)) + self._test_masked_fill_with_sizes((35, 26), broadcast_style=True) + self._test_masked_fill_with_sizes((26,)) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_tensor_masked_fill_error(self): + specs = _chunk_sharding_specs_list_for_test([1, 2], seed=7) + for spec in specs: + st = sharded_tensor.rand( + spec, 35, 17, 26, init_rrefs=False, dtype=torch.double + ) + mask = ( + torch.randint(0, 2, (2, 35, 17, 26)) + .type(torch.BoolTensor) + .cuda(self.rank) + ) + with self.assertRaisesRegex( + ValueError, + "mask dim must not greater than the dim of the sharded tensor.", + ): + st.masked_fill(mask, 25.0) + mask = torch.randint(0, 2, (16, 26)).type(torch.BoolTensor).cuda(self.rank) + with self.assertRaisesRegex( + ValueError, + "The size of mask 0 must match the size of sharded tensor 1 " + "at non-singleton dimension 0", + ): + st.masked_fill(mask, 25.0) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_tensor_view(self): + specs = _chunk_sharding_specs_list_for_test([0, 0, -3], seed=10) + for spec in specs: + tensor = torch.rand(16, 35, 26).cuda(self.rank) + tensor_v = tensor.view(16, 35, 26).view(4, 4, 35, 26) + new_spec = copy.deepcopy(spec) + if new_spec.dim < 0: + new_spec.dim -= 1 + st_expected = _shard_tensor(tensor_v, new_spec) + self.assertTrue( + torch.allclose( + _shard_tensor(tensor, spec).view(4, 4, 35, 26), + st_expected, + ) + ) + st_expected = _shard_tensor(tensor, spec) + self.assertTrue( + torch.allclose( + _shard_tensor(tensor_v, new_spec).view(16, 35, 26), + st_expected, + ) + ) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_tensor_view_error(self): + for spec in _chunk_sharding_specs_list_for_test([2], seed=7): + st = sharded_tensor.rand( + spec, 35, 17, 26, init_rrefs=False, dtype=torch.double + ) + with self.assertRaisesRegex( + NotImplementedError, + "Shape having dim 2 is not supported " + "for sharded tensor sharded on dim 2.", + ): + st.view(35 * 17, 26) + with self.assertRaisesRegex( + ValueError, + r"Shape '\[5, 7, 35, 17, 26\]' is invalid for sharded tensor size 15470.", + ): + st.view(5, 7, 35, 17, 26) + with self.assertRaisesRegex( + ValueError, + "Only one dimension can be inferred for sharded view op.", + ): + st.view(5, 7, -1, -1) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_tensor_layer_norm(self): + specs = _chunk_sharding_specs_list_for_test([1, 2], seed=10) + flags = [True, False] + for spec, flag in itertools.product(specs, flags): + tensor = torch.rand(16, 35, 26).cuda(self.rank) + layer_norm = torch.nn.LayerNorm((35, 26), elementwise_affine=flag).cuda( + self.rank + ) + st = layer_norm(_shard_tensor(tensor, spec)) + with torch.no_grad(): + tensor_normed = layer_norm(tensor) + st_expected = _shard_tensor(tensor_normed, spec) + self.assertEqual( + st.local_tensor(), + st_expected.local_tensor(), + ) + self.assertTrue( + torch.allclose( + st, + st_expected, + atol=1e-6, + ) + ) + st_expected = torch.nn.functional.layer_norm( + _shard_tensor(tensor, spec), + (35, 26), + weight=layer_norm.weight, + bias=layer_norm.bias, + ) + self.assertTrue( + torch.allclose( 
+ st, + st_expected, + atol=1e-6, + ) + ) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_tensor_layer_norm_error(self): + specs = _chunk_sharding_specs_list_for_test([2], seed=10) + for spec in specs: + tensor = torch.rand(16, 35, 26).cuda(self.rank) + with self.assertRaisesRegex( + ValueError, + "normalized_shape dim must not be greater " + "than the dim of the sharded tensor.", + ): + layer_norm = torch.nn.LayerNorm((14, 55, 35, 26)).cuda(self.rank) + layer_norm(_shard_tensor(tensor, spec)) + with self.assertRaisesRegex( + ValueError, + r"Given normalized_shape=\[35\], expected input with shape " + r"\[\*, 35\], but got input of size \[16, 35, 26\].", + ): + layer_norm = torch.nn.LayerNorm((35)).cuda(self.rank) + layer_norm(_shard_tensor(tensor, spec)) + + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/_shard/sharded_tensor/ops/test_softmax.py b/test/distributed/_shard/sharded_tensor/ops/test_softmax.py new file mode 100644 index 000000000000..f55ca9391d9f --- /dev/null +++ b/test/distributed/_shard/sharded_tensor/ops/test_softmax.py @@ -0,0 +1,57 @@ +# Owner(s): ["oncall: distributed"] + +import sys +import torch +from torch.testing._internal.common_utils import ( + TEST_WITH_DEV_DBG_ASAN, + run_tests, +) +from torch.testing._internal.distributed._shard.sharded_tensor import ( + TEST_GPU_NUM, + ShardedTensorTestBase, + with_comms, +) +from torch.testing._internal.common_distributed import ( + requires_nccl, + skip_if_lt_x_gpu, +) +from torch.distributed._shard.sharding_spec import ChunkShardingSpec +from torch.distributed._shard import _shard_tensor + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + +class TestShardedSoftmax(ShardedTensorTestBase): + + def _test_sharded_softmax(self, softmax_dim, sharding_dim): + torch.manual_seed(0) + local_tensor = torch.rand(10, 10, device=self.rank) + local_softmax = torch.nn.functional.softmax(local_tensor, softmax_dim) + + spec = ChunkShardingSpec(dim=sharding_dim, placements=[f'rank:{idx}/cuda:{idx}' for idx in range(self.world_size)]) + st = _shard_tensor(local_tensor, spec) + sharded_softmax = torch.nn.functional.softmax(st, softmax_dim) + + self.assertEqual(local_softmax.chunk(self.world_size, dim=sharding_dim)[self.rank], sharded_softmax.local_tensor()) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_softmax_basic(self): + self._test_sharded_softmax(0, 1) + self._test_sharded_softmax(-2, 1) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_softmax_on_sharding_dim(self): + self._test_sharded_softmax(1, 1) + self._test_sharded_softmax(-1, 1) + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/_shard/sharded_tensor/ops/test_tensor_ops.py b/test/distributed/_shard/sharded_tensor/ops/test_tensor_ops.py new file mode 100644 index 000000000000..3f9bec1f38f5 --- /dev/null +++ b/test/distributed/_shard/sharded_tensor/ops/test_tensor_ops.py @@ -0,0 +1,115 @@ +# Owner(s): ["oncall: distributed"] + +import copy + +import torch.distributed._shard.sharded_tensor as sharded_tensor + +from torch.distributed._shard.sharding_spec import ( + ChunkShardingSpec, +) +from torch.testing._internal.common_distributed import ( + requires_nccl, + skip_if_lt_x_gpu, +) + +from torch.testing._internal.distributed._shard.sharded_tensor import ( + TEST_GPU_NUM, + 
ShardedTensorTestBase, + with_comms, +) +from torch.testing._internal.common_utils import ( + run_tests, +) + +class TestTensorOps(ShardedTensorTestBase): + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_deep_copy(self): + spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + "rank:2/cuda:2", + "rank:3/cuda:3", + ], + ) + st = sharded_tensor.rand(spec, (12, 5)) + copied_st = copy.deepcopy(st) + self.assertTrue(type(copied_st) is type(st)) + self.assertEqual(copied_st.local_tensor(), st.local_tensor()) + self.assertFalse(copied_st is st) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_clone(self): + spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + "rank:2/cuda:2", + "rank:3/cuda:3", + ], + ) + st = sharded_tensor.rand(spec, (12, 5)) + copied_st = st.clone() + self.assertTrue(type(copied_st) is type(st)) + self.assertEqual(copied_st.local_tensor(), st.local_tensor()) + self.assertFalse(copied_st is st) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_detach(self): + spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + "rank:2/cuda:2", + "rank:3/cuda:3", + ], + ) + st = sharded_tensor.rand(spec, (12, 5), requires_grad=True) + local_shards = st.local_shards() + # created with requires_grad=True, so all local shards should require grads + for local_shard in local_shards: + self.assertTrue(local_shard.tensor.requires_grad) + + detached_st = st.detach() + self.assertFalse(detached_st.requires_grad) + + for local_shard in detached_st.local_shards(): + self.assertFalse(local_shard.tensor.requires_grad) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_set_requires_grad(self): + spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + "rank:2/cuda:2", + "rank:3/cuda:3", + ], + ) + st = sharded_tensor.rand(spec, (12, 5)) + local_shards = st.local_shards() + # before set requires_grad, all local shards should not require grads + for local_shard in local_shards: + self.assertFalse(local_shard.tensor.requires_grad) + + st.requires_grad_() + self.assertTrue(st.requires_grad) + + for local_shard in local_shards: + self.assertTrue(local_shard.tensor.requires_grad) + + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py b/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py index 2b11c49d9589..cff259aad8a9 100644 --- a/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py +++ b/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py @@ -9,12 +9,10 @@ ShardedOptimizer, named_params_with_sharded_tensor, ) -from torch.distributed._shard import ( +from torch.distributed._shard.api import ( shard_parameter, -) -from torch.distributed._shard.sharded_tensor import ( - _collect_local_shard, _reshard_output, + _collect_local_shard ) from torch.testing._internal.common_distributed import ( requires_nccl, @@ -34,6 +32,7 @@ generate_chunk_sharding_specs_for_test, generate_local_weight_sharding_params_for_test, ) +from torch.testing._internal.distributed._shard.test_common import SimpleMegatronLM if TEST_WITH_DEV_DBG_ASAN: print( @@ -44,19 +43,6 @@ class TestShardedTensorMegatronLinear(ShardedTensorTestBase): - class SimpleMegatronLM(torch.nn.Module): - def __init__(self, linear_size, rank=None):
- super().__init__() - self.fc1 = torch.nn.Linear(*linear_size[0]) - self.gelu = torch.nn.GELU() - self.fc2 = torch.nn.Linear(*linear_size[1]) - if rank: - self.fc1.cuda(rank) - self.fc2.cuda(rank) - - def forward(self, inp): - return self.fc2(self.gelu(self.fc1(inp))) - def _run_megatron_linear(self, spec, input_size, linear_size): def _weight_override(module_dst, module_src): module_dst.fc1.weight = clone_module_parameter(module_src.fc1, "weight") @@ -68,30 +54,12 @@ def _shard_parameter(module, spec): shard_parameter(module.fc1, "weight", spec[0]) shard_parameter(module.fc2, "weight", spec[1]) - def _get_weight_grad(module): - return (module.fc1.weight.grad, module.fc2.weight.grad) - - def _get_bias_grad(module): - return (module.fc1.bias.grad, module.fc2.bias.grad) - - def _get_weights(module): - return (module.fc1.weight, module.fc2.weight) - - def _get_bias(module): - return (module.fc1.bias, module.fc2.bias) - - def _get_weight_local_shard(module): - return ( - module.fc1.weight.local_tensor(), - module.fc2.weight.local_tensor(), - ) - # Use same seed. torch.manual_seed(0) - local_megatron_lm = self.SimpleMegatronLM(linear_size, rank=self.rank).cuda( + local_megatron_lm = SimpleMegatronLM(linear_size, rank=self.rank).cuda( self.rank ) - sharded_megatron_lm = self.SimpleMegatronLM(linear_size) + sharded_megatron_lm = SimpleMegatronLM(linear_size) _weight_override(sharded_megatron_lm, local_megatron_lm) # Shard the parameter. First col-wise sharding and then row-wise @@ -121,15 +89,15 @@ def _get_weight_local_shard(module): ( local_weight_grad_fc1, local_weight_grad_fc2, - ) = _get_weight_grad(local_megatron_lm) - local_bias_grad_fc1, local_bias_grad_fc2 = _get_bias_grad(local_megatron_lm) + ) = local_megatron_lm.get_weight_grads() + local_bias_grad_fc1, local_bias_grad_fc2 = local_megatron_lm.get_bias_grads() # Verify that weights in both layers and biases in the sharded linear has non-None grad. ( sharded_weight_fc1, sharded_weight_fc2, - ) = _get_weight_local_shard(sharded_megatron_lm) - bias_grad_fc1, bias_grad_fc2 = _get_bias_grad(sharded_megatron_lm) + ) = sharded_megatron_lm.get_weights() + bias_grad_fc1, bias_grad_fc2 = sharded_megatron_lm.get_bias_grads() self.assertNotEqual(sharded_weight_fc1.grad, None) self.assertNotEqual(sharded_weight_fc2.grad, None) self.assertNotEqual(bias_grad_fc1, None) @@ -140,7 +108,7 @@ def _get_weight_local_shard(module): dist.all_reduce(local_weight_grad_fc2) dist.all_reduce(local_bias_grad_fc1) dist.all_reduce(local_bias_grad_fc2) - local_weight_fc1, local_weight_fc2 = _get_weights(local_megatron_lm) + local_weight_fc1, local_weight_fc2 = local_megatron_lm.get_weights() ( start_pos_fc1, chunk_size_fc1, @@ -167,8 +135,8 @@ def _get_weight_local_shard(module): self.assertEqual(bias_grad_fc2, local_bias_grad_fc2) # Test optimizer. - bias_fc1, bias_fc2 = _get_bias(sharded_megatron_lm) - local_bias_fc1, local_bias_fc2 = _get_bias(local_megatron_lm) + bias_fc1, bias_fc2 = sharded_megatron_lm.get_biases() + local_bias_fc1, local_bias_fc2 = local_megatron_lm.get_biases() self.assertEqual(bias_fc1, local_bias_fc1) self.assertEqual(bias_fc2, local_bias_fc2) self.assertEqual(bias_fc1.grad, local_bias_fc1.grad) @@ -201,7 +169,7 @@ def _get_weight_local_shard(module): self.assertEqual(sharded_weight_fc2, local_weight_fc2_narrowed) # Test bias value after optimizer. 
- local_bias_fc1, local_bias_fc2 = _get_bias(local_megatron_lm) + local_bias_fc1, local_bias_fc2 = local_megatron_lm.get_biases() self.assertNotEqual(previous_bias_fc1, bias_fc1) self.assertEqual(bias_fc1, local_bias_fc1) self.assertNotEqual(previous_bias_fc2, bias_fc2) diff --git a/test/distributed/_shard/sharded_tensor/test_partial_tensor.py b/test/distributed/_shard/sharded_tensor/test_partial_tensor.py deleted file mode 100644 index 18418f8fb517..000000000000 --- a/test/distributed/_shard/sharded_tensor/test_partial_tensor.py +++ /dev/null @@ -1,108 +0,0 @@ -# Owner(s): ["oncall: distributed"] - -import sys - -import torch -import torch.distributed as dist -from torch.distributed._shard.sharded_tensor import ( - _PartialTensor, -) -from torch.distributed._shard.sharding_spec import ( - EnumerableShardingSpec, - ShardMetadata, -) -from torch.testing._internal.common_distributed import ( - requires_nccl, - skip_if_lt_x_gpu, -) -from torch.testing._internal.common_utils import ( - TEST_WITH_DEV_DBG_ASAN, -) -from torch.testing._internal.distributed._shard.sharded_tensor import ( - ShardedTensorTestBase, - with_comms, -) -from torch.testing._internal.distributed._shard.sharded_tensor._test_st_common import ( - _chunk_sharding_specs_list_for_test, -) - -if TEST_WITH_DEV_DBG_ASAN: - print( - "Skip dev-asan as torch + multiprocessing spawn have known issues", - file=sys.stderr, - ) - sys.exit(0) - - -class TestPartialTensorReshard(ShardedTensorTestBase): - def _run_partial_tensor_n_reshard( - self, reshard_spec, input_size, world_size, reduce_op, dtype=torch.float - ): - results = [] - results_compare = [] - for _ in range(0, world_size): - tensor = torch.rand(*input_size, dtype=dtype).cuda(self.rank) - results.append(tensor) - results_compare.append(tensor.clone().detach()) - pg = dist.distributed_c10d._get_default_group() - parital_tensor = _PartialTensor(torch.cat(results), pg, reduce_op=reduce_op) - local_sharded_result = parital_tensor.reshard(reshard_spec) - local_shards = local_sharded_result.local_shards() - local_result_compare = torch.empty_like(results_compare[0]) - dist.reduce_scatter(local_result_compare, results_compare, op=reduce_op) - self.assertEqual(1, len(local_shards)) - self.assertEqual(local_shards[0].tensor, local_result_compare) - - @with_comms(init_rpc=False) - @skip_if_lt_x_gpu(4) - @requires_nccl() - def test_partial_tensor_reshard(self): - specs = _chunk_sharding_specs_list_for_test([0], seed=7) - spec = specs[0] - self._run_partial_tensor_n_reshard(spec, [13, 21], 4, dist.ReduceOp.SUM) - self._run_partial_tensor_n_reshard(spec, [12, 22], 4, dist.ReduceOp.MAX) - - @with_comms(init_rpc=False) - @skip_if_lt_x_gpu(4) - @requires_nccl() - def test_partial_tensor_reshard_errors(self): - enumerable_sharding_spec = EnumerableShardingSpec( - [ - ShardMetadata( - shard_offsets=[0, 0], - shard_sizes=[5, 5], - placement="rank:0/cuda:0", - ), - ShardMetadata( - shard_offsets=[5, 0], - shard_sizes=[5, 5], - placement="rank:1/cuda:1", - ), - ] - ) - with self.assertRaisesRegex( - NotImplementedError, "Only ChunkShardingSpec supported for reshard." - ): - self._run_partial_tensor_n_reshard( - enumerable_sharding_spec, [13, 21], 4, dist.ReduceOp.SUM - ) - self._run_partial_tensor_n_reshard( - enumerable_sharding_spec, [12, 22], 4, dist.ReduceOp.MAX - ) - specs = _chunk_sharding_specs_list_for_test([0], seed=7) - spec = specs[0] - with self.assertRaisesRegex( - NotImplementedError, "Only real partial tensor supported for reshard." 
- ): - self._run_partial_tensor_n_reshard( - spec, [13, 21], 4, dist.ReduceOp.SUM, dtype=torch.cfloat - ) - self._run_partial_tensor_n_reshard( - spec, [12, 22], 4, dist.ReduceOp.MAX, dtype=torch.cfloat - ) - with self.assertRaisesRegex( - ValueError, "World size need to divide the length of the dimension." - ): - self._run_partial_tensor_n_reshard( - spec, [13, 21], 3, dist.ReduceOp.SUM, dtype=torch.cfloat - ) diff --git a/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py b/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py index cbad9458ae4f..ae00f47cecff 100644 --- a/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py +++ b/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py @@ -9,25 +9,29 @@ import torch import torch.distributed as dist from torch.distributed import rpc -from torch.distributed._shard import ( +from torch.distributed import distributed_c10d +from torch.distributed._shard import sharded_tensor +from torch.distributed._shard.api import ( shard_parameter, - sharded_tensor, _shard_tensor, + load_with_process_group, + _collect_local_shard, + _reshard_output, ) from torch.distributed._shard.sharded_tensor import ( sharded_op_impl, - load_with_process_group, pre_load_state_dict_hook, state_dict_hook, ShardedTensor, - _collect_local_shard, - _reshard_output, ) from torch.distributed._shard.sharding_spec import ( ChunkShardingSpec, EnumerableShardingSpec, ShardMetadata, ) +from torch.distributed._shard.sharded_tensor.utils import ( + _parse_and_validate_remote_device +) from torch.distributed._shard.sharded_tensor.api import ( TensorProperties, _create_tensor_from_params, @@ -35,6 +39,7 @@ from torch.testing._internal.common_distributed import ( requires_nccl, skip_if_lt_x_gpu, + tp_transports, ) from torch.testing._internal.common_utils import ( TestCase, @@ -49,32 +54,13 @@ from torch.distributed.remote_device import _remote_device from torch.testing._internal.distributed._shard.sharded_tensor._test_st_common import ( _chunk_sharding_specs_list_for_test, + MyShardedModel1, ) if TEST_WITH_DEV_DBG_ASAN: print("Skip dev-asan as torch + multiprocessing spawn have known issues", file=sys.stderr) sys.exit(0) -class MyShardedModel2(torch.nn.Module): - def __init__(self, spec=None, group=None): - super(MyShardedModel2, self).__init__() - if spec is not None: - self.sharded_tensor2 = sharded_tensor.empty(spec, 10, 20, process_group=group, init_rrefs=True) - else: - self.sharded_tensor2 = None - self.random_tensor2 = torch.nn.Parameter(torch.rand(2, 2)) - - -class MyShardedModel1(torch.nn.Module): - def __init__(self, spec=None, group=None): - super(MyShardedModel1, self).__init__() - if spec is not None: - self.sharded_tensor1 = sharded_tensor.empty(spec, 10, 20, process_group=group, init_rrefs=True) - else: - self.sharded_tensor1 = None - self.random_tensor1 = torch.nn.Parameter(torch.rand(2, 2)) - self.submodule = MyShardedModel2(spec, group) - class TestShardedTensorMetadata(TestCase): def test_serialize_and_deserialize(self): shard_metadatas = [ @@ -188,7 +174,7 @@ def test_shard_parameter_errors(self): with self.assertRaisesRegex(ValueError, 'does not match with src_rank'): shard_parameter(fc, 'weight', spec, src_rank=self.rank) - with self.assertRaisesRegex(ValueError, 'does not have parameter'): + with self.assertRaisesRegex(AttributeError, 'Linear have no attribute'): shard_parameter(fc, 'foo', spec) with self.assertRaisesRegex(ValueError, 'Expected Linear.bias to be a Tensor, but found str'): @@ -224,9 +210,7 @@ def 
test_shard_parameter_errors(self): placement="rank:1/cuda:1", ), ]) - with self.assertRaisesRegex( - NotImplementedError, 'Only ChunkShardingspec is supported.' - ): + with self.assertRaisesRegex(NotImplementedError, 'not implemented yet!'): shard_parameter(fc, 'weight', spec) @@ -301,7 +285,7 @@ def test_shard_tensor_errors(self): ), ]) with self.assertRaisesRegex( - NotImplementedError, 'Only ChunkShardingspec is supported.' + NotImplementedError, 'not implemented yet!' ): _shard_tensor(tensor, spec) @@ -634,7 +618,7 @@ def test_create_sharded_tensor_with_zeros(self): @skip_if_lt_x_gpu(4) @requires_nccl() def test_create_sharded_tensor_with_rand(self): - """ Test sharded_tensor.rand(...) """ + """ Test sharded_tensor.rand(...)/randn(...) """ spec = ChunkShardingSpec( dim=0, @@ -652,6 +636,7 @@ def test_create_sharded_tensor_with_rand(self): expected_device = torch.device(f"cuda:{self.rank}") dtype = torch.double torch.manual_seed(seed) + # Test sharded_tensor.rand creation expected = torch.rand(expected_h, w, device=expected_device, dtype=dtype) # reset seed to ensure the same random numbers are generated torch.manual_seed(seed) @@ -665,6 +650,20 @@ def test_create_sharded_tensor_with_rand(self): self.assertEqual((expected_h, w), local_shard.size()) self.assertEqual(expected, local_shard) + # Test sharded_tensor.randn creation + torch.manual_seed(seed) + expected_randn = torch.randn(expected_h, w, device=expected_device, dtype=dtype) + # reset seed to ensure the same random numbers are generated + torch.manual_seed(seed) + st_randn = sharded_tensor.randn(spec, h, w, dtype=dtype) + + # Validate local shard is initialized with torch.randn + local_shards = st_randn.local_shards() + self.assertEqual(1, len(local_shards)) + local_shard = local_shards[0].tensor + self.assertEqual(expected_device, local_shard.device) + self.assertEqual((expected_h, w), local_shard.size()) + self.assertEqual(expected_randn, local_shard) @with_comms @skip_if_lt_x_gpu(4) @@ -696,6 +695,52 @@ def test_create_sharded_tensor_with_full(self): self.assertEqual(local_shard, torch.full(size=(expected_h, w), fill_value=fill_value, dtype=torch.int32)) + @with_comms + @skip_if_lt_x_gpu(4) + @requires_nccl() + def test_create_sharded_tensor_like(self): + """ Test tensor like methods, i.e. torch.zeros_like(...), torch.full_like, etc. 
""" + + spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + "rank:2/cuda:2", + "rank:3/cuda:3", + ], + ) + h, w = 8, 8 + expected_h = 2 + seed = 1234 + dtype = torch.double + expected_device = torch.device(f"cuda:{self.rank}") + st = sharded_tensor.rand(spec, (h, w), dtype=dtype) + tensor_like_ops = { + torch.zeros_like: torch.zeros, + torch.ones_like: torch.ones, + torch.rand_like: torch.rand, + torch.randn_like: torch.randn, + torch.empty_like: torch.empty, + torch.full_like: torch.full + } + for op, expect_local_op in tensor_like_ops.items(): + if op == torch.full_like: + # special handle full/full_like as it needs to have additional fill_value arg + expect_tensor = expect_local_op((expected_h, w), 8.8, device=expected_device, dtype=dtype) + new_op_st = op(st, 8.8, dtype=dtype) + self.assertEqual(new_op_st.local_tensor(), expect_tensor) + elif op == torch.empty_like: + # empty/empty_like we only compare the shape + expect_tensor = expect_local_op(expected_h, w, device=expected_device, dtype=dtype) + new_op_st = op(st, dtype=dtype) + self.assertEqual(new_op_st.local_tensor().shape, expect_tensor.shape) + else: + torch.manual_seed(seed) + expect_tensor = expect_local_op(expected_h, w, device=expected_device, dtype=dtype) + torch.manual_seed(seed) + new_op_st = op(st, dtype=dtype) + self.assertEqual(new_op_st.local_tensor(), expect_tensor) @with_comms @skip_if_lt_x_gpu(4) @@ -885,8 +930,8 @@ def test_sharding_columns(self): def test_invalid_sharding(self): self.init_pg() - spec = ChunkShardingSpec(dim='H', placements=["rank:1/cuda:1"]) - with self.assertRaisesRegex(ValueError, 'needs to be an integer'): + with self.assertRaisesRegex(NotImplementedError, 'does not support named dimension'): + spec = ChunkShardingSpec(dim='H', placements=["rank:1/cuda:1"]) sharded_tensor.empty(spec, 10, 20) for dim in [2, 3, 4, -3, -4, -5]: @@ -901,7 +946,7 @@ def test_invalid_sharding(self): spec = ChunkShardingSpec(dim=0, placements=["rank:0/cuda:1"]) st = sharded_tensor.empty(spec, 10, 20) tensor = torch.empty(10, 20) - with self.assertRaisesRegex(RuntimeError, "not supported for ShardedTensor!"): + with self.assertRaisesRegex(RuntimeError, "not supported yet for ShardedTensor!"): torch.add(st, tensor) spec = ChunkShardingSpec(dim=0, placements=["rank:0/cuda:1"]) @@ -935,7 +980,7 @@ def test_invalid_pg_rpc_ranks(self): self.init_pg() # Init RPC with different ranks. 
- rpc_backend_options = rpc.TensorPipeRpcBackendOptions() + rpc_backend_options = rpc.TensorPipeRpcBackendOptions(_transports=tp_transports()) rpc_backend_options.init_method = f"file://{self.file_name}" rank = (self.rank + 1) % self.world_size rpc.init_rpc( @@ -1025,9 +1070,9 @@ def test_sharded_tensor_sizes(self): # Test with invalid input st = sharded_tensor.empty(spec, (10, 20), init_rrefs=True) - with self.assertRaisesRegex(ValueError, 'must be within the range of tensor dimensions \\[0, 2\\)'): - st.size(-1) - with self.assertRaisesRegex(ValueError, 'must be within the range of tensor dimensions \\[0, 2\\)'): + with self.assertRaisesRegex(ValueError, 'must be within the range of tensor dimensions \\[-2, 2\\)'): + st.size(-3) + with self.assertRaisesRegex(ValueError, 'must be within the range of tensor dimensions \\[-2, 2\\)'): st.size(2) with self.assertRaises(TypeError): @@ -1463,6 +1508,92 @@ def test_gather_uneven(self) -> None: else: self.assertIsNone(full_tensor) + @with_comms + @skip_if_lt_x_gpu(4) + @requires_nccl() + def test_sharded_tensor_to_cpu(self): + cpu_spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cpu", + "rank:1/cpu", + "rank:2/cpu", + "rank:3/cpu", + ], + ) + spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + "rank:2/cuda:2", + "rank:3/cuda:3", + ], + ) + h, w = 10, 20 + gloo_pg = dist.new_group(backend="gloo") + + # CPU sharded tensor should return the same instance (no copy) + st_cpu = sharded_tensor.zeros(cpu_spec, h, w, process_group=gloo_pg) + new_st_cpu = st_cpu.cpu() + self.assertEqual(st_cpu, new_st_cpu) + + # GPU sharded tensor to cpu + st = sharded_tensor.zeros(spec, h, w) + # test ability to move st to CPU + spec_before_move = st.sharding_spec() + new_st = st.cpu(process_group=gloo_pg) + # return a copy of orginal st + self.assertNotEqual(st, new_st) + # check the spec is still ChunkShardingSpec + spec_after_move = new_st.sharding_spec() + self.assertIsInstance(spec_after_move, ChunkShardingSpec) + # now it should be ProcessGroupGloo since it's on CPU + self.assertIsInstance(new_st._process_group, distributed_c10d.ProcessGroupGloo) + # test specs before and after the move almost the same except placement device + self.assertEqual(spec_before_move.dim, spec_after_move.dim) + self.assertEqual(len(spec_before_move.placements), len(spec_after_move.placements)) + for i, remote_device_after in enumerate(spec_after_move.placements): + remote_device_before = spec_before_move.placements[i] + self.assertEqual(remote_device_before.rank(), remote_device_after.rank()) + self.assertEqual(str(remote_device_after.device()), "cpu") + + # ensure metdata also get changed to CPU + metas = new_st.metadata().shards_metadata + for meta in metas: + self.assertEqual(str(meta.placement.device()), "cpu") + + # Test if a mixed sharded tensor (ShardedTensor with different devices) to cpu + mixed_spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cpu", + "rank:1/cpu", + "rank:2/cuda:2", + "rank:3/cuda:3", + ], + ) + + st = sharded_tensor.zeros(mixed_spec, h, w, process_group=gloo_pg) + new_st = st.cpu() + # return a copy of orginal st + self.assertNotEqual(st, new_st) + # check the spec is still ChunkShardingSpec + spec_after_move = new_st.sharding_spec() + self.assertIsInstance(spec_after_move, ChunkShardingSpec) + # test specs before and after the move almost the same except placement device + self.assertEqual(mixed_spec.dim, spec_after_move.dim) + self.assertEqual(len(mixed_spec.placements), 
len(spec_after_move.placements)) + for i, remote_device_after in enumerate(spec_after_move.placements): + remote_device_before = mixed_spec.placements[i] + self.assertEqual(remote_device_before.rank(), remote_device_after.rank()) + self.assertEqual(str(remote_device_after.device()), "cpu") + + # ensure metdata also get changed to CPU + metas = new_st.metadata().shards_metadata + for meta in metas: + self.assertEqual(str(meta.placement.device()), "cpu") + @skip_if_lt_x_gpu(4) @requires_nccl() def test_uneven_shards(self): @@ -1781,6 +1912,116 @@ def test_with_rpc_names(self): self.assertEqual((5, 5), shard.tensor.size()) +class TestShardedTensorFromLocalTensor(ShardedTensorTestBase): + def _generate_st_from_chunk_local_tensor(self, st_size, sharding_spec): + tensor_meta = sharding_spec.build_metadata(st_size, TensorProperties()) + pg = dist.distributed_c10d._get_default_group() + + local_tensor = None + local_shard_metadata = None + rank_to_metadata = {} + for shard_metadata in tensor_meta.shards_metadata: + rank, device = _parse_and_validate_remote_device(pg, shard_metadata.placement) + rank_to_metadata[rank] = shard_metadata + if rank == self.rank: + local_tensor = torch.rand(shard_metadata.shard_sizes).cuda(device) + local_shard_metadata = shard_metadata + + # TODO: figure out what the API should behave when some rank have no shard + # see https://github.com/pytorch/pytorch/issues/73133 + assert local_tensor is not None + st = ShardedTensor._init_from_local_tensor( + local_tensor, + sharding_spec, + st_size, + init_rrefs=True, + ) + self.assertEqual(tuple(st_size), st.size()) + self.assertEqual(1, len(st.local_shards())) + + # Verify local shard. + local_shard = st.local_shards()[0] + self.assertEqual(st.local_tensor(), local_tensor) + self.assertEqual(torch.device(f"cuda:{self.rank}"), local_shard.tensor.device) + + # Verify local shard metadata. + self.assertEqual( + local_shard_metadata.shard_offsets, local_shard.metadata.shard_offsets + ) + self.assertEqual( + local_shard_metadata.shard_sizes, local_shard.metadata.shard_sizes + ) + self.assertEqual(local_shard_metadata.placement, local_shard.metadata.placement) + + # Verify global metadata. + st_shards_metadata = st.metadata().shards_metadata + self.assertEqual(self.world_size, len(st_shards_metadata)) + self.assertEqual(tensor_meta.shards_metadata, st_shards_metadata) + + # Validate remote shards. + remote_shards = st.remote_shards() + self.assertEqual(self.world_size - 1, len(remote_shards)) + for rpc_rank, shards in remote_shards.items(): + self.assertEqual(1, len(shards)) + for remote_shard in shards: + self.assertEqual(rpc_rank, remote_shard.owner().id) + # If remote shard does not exist, to_here() will throw exception. 
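+                    # rank_to_metadata (filled in from the sharding spec above) gives the
+                    # expected shard size for each rank, which the fetched shard is checked against.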
+ if tensor_meta.shards_metadata[rpc_rank]: + shard = remote_shard.to_here() + self.assertEqual( + rank_to_metadata[rpc_rank].shard_sizes, shard.tensor.size() + ) + + @with_comms + @skip_if_lt_x_gpu(4) + @requires_nccl() + def test_init_from_local_tensor(self): + chunk_specs = _chunk_sharding_specs_list_for_test([0, 1, 1, 0], seed=31) + for spec in chunk_specs: + self._generate_st_from_chunk_local_tensor([20, 10], spec) + self._generate_st_from_chunk_local_tensor([21, 11], spec) + self._generate_st_from_chunk_local_tensor([23, 16], spec) + self._generate_st_from_chunk_local_tensor([44, 16, 8], spec) + + @with_comms + @skip_if_lt_x_gpu(4) + @requires_nccl() + def test_init_from_local_tensor_errors(self): + enumerable_sharding_spec = EnumerableShardingSpec( + [ + ShardMetadata( + shard_offsets=[0, 0], + shard_sizes=[5, 5], + placement="rank:0/cuda:0", + ), + ShardMetadata( + shard_offsets=[5, 0], + shard_sizes=[5, 5], + placement="rank:1/cuda:1", + ), + ] + ) + st_size = [24, 12] + local_tensor = torch.rand(*st_size).cuda(self.rank) + with self.assertRaisesRegex( + ValueError, "do not cover the entire tensor" + ): + ShardedTensor._init_from_local_tensor( + local_tensor, + enumerable_sharding_spec, + st_size, + ) + chunk_specs = _chunk_sharding_specs_list_for_test([0], seed=31) + with self.assertRaisesRegex( + ValueError, "local_tensor is not a contiguous Tensor." + ): + ShardedTensor._init_from_local_tensor( + local_tensor.t(), + chunk_specs[0], + st_size, + ) + + class TestShardedTensorFromLocalShards(ShardedTensorTestBase): @with_comms(init_rpc=False) @@ -2247,8 +2488,10 @@ def test_custom_op_override(self): t = torch.rand(10, 10).cuda(self.rank) - @sharded_op_impl(torch.nn.functional.linear) - def my_sharded_linear(types, args, kwargs, process_group): + from torch.distributed._shard.sharding_spec.api import custom_sharding_spec_op + + @custom_sharding_spec_op(ChunkShardingSpec, torch.nn.functional.linear) + def my_sharded_linear(types, args, kwargs): return t spec = ChunkShardingSpec( diff --git a/test/distributed/_shard/sharded_tensor/test_sharded_tensor_reshard.py b/test/distributed/_shard/sharded_tensor/test_sharded_tensor_reshard.py index 1a106772e673..ec053c95b47a 100644 --- a/test/distributed/_shard/sharded_tensor/test_sharded_tensor_reshard.py +++ b/test/distributed/_shard/sharded_tensor/test_sharded_tensor_reshard.py @@ -18,6 +18,7 @@ ) from torch.testing._internal.common_utils import ( TEST_WITH_DEV_DBG_ASAN, + run_tests, ) from torch.testing._internal.distributed._shard.sharded_tensor import ( ShardedTensorTestBase, @@ -44,6 +45,7 @@ def _run_sharded_tensor_reshard(self, sharding_spec, reshard_spec, input_size): st.reshard(reshard_spec) self.assertEqual(1, len(st.local_shards())) self.assertEqual(1, len(st_compare.local_shards())) + st_compare._metadata.shards_metadata.sort(key=lambda metadata: metadata.placement.rank()) self.assertEqual(st._metadata, st_compare._metadata) self.assertEqual(st.local_tensor(), st_compare.local_tensor()) self.assertEqual( @@ -95,3 +97,7 @@ def test_sharded_tensor_reshard_errors(self): NotImplementedError, "Only single local shard supported for reshard." 
): st.reshard(reshard_spec) + + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/_shard/sharding_plan/test_sharding_plan.py b/test/distributed/_shard/sharding_plan/test_sharding_plan.py new file mode 100644 index 000000000000..e9c907c9d6d4 --- /dev/null +++ b/test/distributed/_shard/sharding_plan/test_sharding_plan.py @@ -0,0 +1,331 @@ + +# Owner(s): ["oncall: distributed"] +import sys +import copy + +import torch +import torch.nn as nn +import torch.distributed as dist +from torch.distributed._shard.sharded_optim import ( + ShardedOptimizer, + named_params_with_sharded_tensor, +) +from torch.testing._internal.common_distributed import ( + requires_nccl, + skip_if_lt_x_gpu, +) +from torch.distributed._shard import shard_module +from torch.distributed._shard.sharding_plan import ShardingPlan, ShardingPlanner +from torch.distributed._shard.sharding_spec import ChunkShardingSpec +from torch.distributed._shard.sharded_tensor import ShardedTensor + +from torch.testing._internal.common_utils import TEST_WITH_DEV_DBG_ASAN +from torch.testing._internal.distributed._shard.sharded_tensor import ( + TEST_GPU_NUM, + ShardedTensorTestBase, + with_comms, +) +from torch.testing._internal.distributed._shard.sharded_tensor._test_ops_common import ( + generate_chunk_sharding_specs_for_test, + generate_local_weight_sharding_params_for_test, +) +from torch.testing._internal.distributed._shard.test_common import SimpleMegatronLM + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + +# Example ShardingPlanner that chunks every parameter in the module +# to all available devices defined. +class ChunkAllShardingPlanner(ShardingPlanner): + dim = 0 + devices = [] + + def __init__(self, chunk_dim=0, device_count=0): + self.dim = chunk_dim + self.devices = [f"rank:{i}/cuda:{i}" for i in range(device_count)] + + def build_plan(self, module: nn.Module) -> ShardingPlan: + named_params = module.named_parameters() + plan = {} + for name, param in named_params: + plan[name] = ChunkShardingSpec(self.dim, placements=self.devices) + + return ShardingPlan(plan=plan) + + +class TestShardingPlan(ShardedTensorTestBase): + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharding_plan_simple_megatron(self): + colwise_sharding_spec = generate_chunk_sharding_specs_for_test(0) + rowwise_sharding_spec = generate_chunk_sharding_specs_for_test(1) + for spec in zip(colwise_sharding_spec, rowwise_sharding_spec): + # test each sharding spec pair and see if we can apply sharding + reshard_spec = copy.deepcopy(spec[1]) + reshard_spec.placements.sort(key=lambda placement: placement.rank()) + reshard_spec.dim = 0 + + sharding_plan = ShardingPlan( + plan={ + "fc1.weight": spec[0], + "fc2.weight": spec[1] + }, + output_plan={ + "": reshard_spec + }, + return_local_tensor=[""]) + + # Use same seed. 
+ torch.manual_seed(0) + local_megatron_lm = SimpleMegatronLM([[17, 12], [12, 29]]).cuda(self.rank) + megatron_lm = copy.deepcopy(local_megatron_lm) + + # shard the module with the provided sharding plan + shard_module(megatron_lm, sharding_plan) + + # check to make sure the module already been sharded + self.assertTrue(isinstance(megatron_lm.fc1.weight, ShardedTensor)) + self.assertTrue(isinstance(megatron_lm.fc2.weight, ShardedTensor)) + self.assertEqual(megatron_lm.fc1.weight.sharding_spec(), spec[0]) + self.assertEqual(megatron_lm.fc2.weight.sharding_spec(), spec[1]) + + # make sure we can run sharded computation + input = torch.rand(22, 17).cuda(self.rank) + sharded_output = megatron_lm(input) + local_output = local_megatron_lm(input) + + # verify and make sure local and sharded output matches + self.assertEqual(local_output, sharded_output) + + # Compute loss and run backward pass. + local_output.sum().backward() + sharded_output.sum().backward() + ( + local_weight_grad_fc1, + local_weight_grad_fc2, + ) = local_megatron_lm.get_weight_grads() + local_bias_grad_fc1, local_bias_grad_fc2 = local_megatron_lm.get_bias_grads() + + # Verify that weights in both layers and biases in the sharded linear has non-None grad. + ( + sharded_weight_fc1, + sharded_weight_fc2, + ) = megatron_lm.get_weights() + bias_grad_fc1, bias_grad_fc2 = megatron_lm.get_bias_grads() + self.assertNotEqual(sharded_weight_fc1.grad, None) + self.assertNotEqual(sharded_weight_fc2.grad, None) + self.assertNotEqual(bias_grad_fc1, None) + self.assertNotEqual(bias_grad_fc2, None) + + # Shard the local linear's weight grad so that we can compare. + dist.all_reduce(local_weight_grad_fc1) + dist.all_reduce(local_weight_grad_fc2) + dist.all_reduce(local_bias_grad_fc1) + dist.all_reduce(local_bias_grad_fc2) + local_weight_fc1, local_weight_fc2 = local_megatron_lm.get_weights() + ( + start_pos_fc1, + chunk_size_fc1, + ) = generate_local_weight_sharding_params_for_test( + local_weight_fc1, 0, TEST_GPU_NUM, spec[0], self.rank + ) + local_grad_narrowed_fc1 = local_weight_grad_fc1.narrow( + 0, start_pos_fc1, chunk_size_fc1 + ) + ( + start_pos_fc2, + chunk_size_fc2, + ) = generate_local_weight_sharding_params_for_test( + local_weight_fc2, 1, TEST_GPU_NUM, spec[1], self.rank + ) + local_grad_narrowed_fc2 = local_weight_grad_fc2.narrow( + 1, start_pos_fc2, chunk_size_fc2 + ) + + # Test backward gradient calculation. + self.assertEqual(sharded_weight_fc1.grad, local_grad_narrowed_fc1) + self.assertEqual(sharded_weight_fc2.grad, local_grad_narrowed_fc2) + self.assertEqual(bias_grad_fc1, local_bias_grad_fc1) + self.assertEqual(bias_grad_fc2, local_bias_grad_fc2) + + # Test optimizer. 
+ bias_fc1, bias_fc2 = megatron_lm.get_biases() + local_bias_fc1, local_bias_fc2 = local_megatron_lm.get_biases() + self.assertEqual(bias_fc1, local_bias_fc1) + self.assertEqual(bias_fc2, local_bias_fc2) + self.assertEqual(bias_fc1.grad, local_bias_fc1.grad) + self.assertEqual(bias_fc2.grad, local_bias_fc2.grad) + previous_sharded_weight_fc1 = sharded_weight_fc1.clone() + previous_sharded_weight_fc2 = sharded_weight_fc2.clone() + previous_bias_fc1 = bias_fc1.clone() + previous_bias_fc2 = bias_fc2.clone() + optim = torch.optim.SGD(local_megatron_lm.parameters(), lr=0.1) + optim.step() + sharded_optim = ShardedOptimizer( + dict(named_params_with_sharded_tensor(megatron_lm)), + torch.optim.SGD, + lr=0.1, + ) + sharded_optim.step() + local_weight_fc1_narrowed = local_weight_fc1.narrow( + 0, start_pos_fc1, chunk_size_fc1 + ) + local_weight_fc2_narrowed = local_weight_fc2.narrow( + 1, start_pos_fc2, chunk_size_fc2 + ) + + # Test weight value after optimizer. + self.assertEqual(sharded_weight_fc1.size(), local_weight_fc1_narrowed.size()) + self.assertEqual(sharded_weight_fc2.size(), local_weight_fc2_narrowed.size()) + self.assertNotEqual(previous_sharded_weight_fc1, sharded_weight_fc1) + self.assertNotEqual(previous_sharded_weight_fc2, sharded_weight_fc2) + self.assertEqual(sharded_weight_fc1, local_weight_fc1_narrowed) + self.assertEqual(sharded_weight_fc2, local_weight_fc2_narrowed) + + # Test bias value after optimizer. + local_bias_fc1, local_bias_fc2 = local_megatron_lm.get_biases() + self.assertNotEqual(previous_bias_fc1, bias_fc1) + self.assertEqual(bias_fc1, local_bias_fc1) + self.assertNotEqual(previous_bias_fc2, bias_fc2) + self.assertEqual(bias_fc2, local_bias_fc2) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_reshard_to_ddp_sharding_plan(self): + colwise_sharding_spec = generate_chunk_sharding_specs_for_test(0)[0] + rowwise_sharding_spec = generate_chunk_sharding_specs_for_test(1)[0] + + # test each sharding spec pair and see if we can apply sharding + output_spec = copy.deepcopy(rowwise_sharding_spec) + output_spec.placements.sort(key=lambda placement: placement.rank()) + output_spec.dim = 0 + + # new module with megatron as submodule + class MyModule(nn.Module): + def __init__(self, rank=None): + super().__init__() + self.megatron = SimpleMegatronLM([[17, 12], [12, 29]], rank=rank) + self.relu = nn.ReLU() + + def forward(self, input): + return self.relu(self.megatron(input)) + + sharding_plan = ShardingPlan( + plan={ + "megatron.fc1.weight": colwise_sharding_spec, + "megatron.fc2.weight": rowwise_sharding_spec, + }, + output_plan={ + "megatron": output_spec + }, + return_local_tensor=[ + "megatron" + ] + ) + + # Use same seed. 
+ torch.manual_seed(0) + local_module = MyModule().cuda(self.rank) + sharded_module = copy.deepcopy(local_module) + + # shard the module with the provided sharding plan + shard_module(sharded_module, sharding_plan) + + # check to make sure the module already been sharded + self.assertTrue(isinstance(sharded_module.megatron.fc1.weight, ShardedTensor)) + self.assertTrue(isinstance(sharded_module.megatron.fc2.weight, ShardedTensor)) + self.assertEqual(sharded_module.megatron.fc1.weight.sharding_spec(), colwise_sharding_spec) + self.assertEqual(sharded_module.megatron.fc2.weight.sharding_spec(), rowwise_sharding_spec) + + # make sure we can run sharded computation + input = torch.rand(22, 17).cuda(self.rank) + sharded_output = sharded_module(input) + local_output = local_module(input) + + # verify and make sure local and sharded output matches + self.assertEqual(local_output, sharded_output) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharding_plan_errors(self): + rowwise_sharding_spec = generate_chunk_sharding_specs_for_test(1)[0] + sharding_plan_wrong_plan = ShardingPlan( + plan={ + "fc1.weight": torch.randn(3, 4), + }, + output_plan={ + "": rowwise_sharding_spec + }, + ) + + megatron_lm = SimpleMegatronLM([[17, 12], [12, 29]]).cuda(self.rank) + + with self.assertRaisesRegex( + TypeError, "Only `ShardingSpec` and `Sharder` are supported to shard" + ): + # shard the module with the provided sharding plan + shard_module(megatron_lm, sharding_plan_wrong_plan) + + sharding_plan_wrong_output_plan = ShardingPlan( + plan={ + "fc1.weight": rowwise_sharding_spec, + }, + output_plan={ + "": torch.randn(3, 4) + }, + ) + + with self.assertRaisesRegex( + TypeError, "Only `ShardingSpec` is supported as output_plan" + ): + # shard the module with the provided sharding plan + shard_module(megatron_lm, sharding_plan_wrong_output_plan) + + sharding_plan_wrong_module_path = ShardingPlan( + plan={ + "fc3.weight": rowwise_sharding_spec, + }, + ) + with self.assertRaisesRegex( + AttributeError, "has no attribute" + ): + # shard the module with the provided sharding plan + shard_module(megatron_lm, sharding_plan_wrong_module_path) + + sharding_plan_wrong_param_path = ShardingPlan( + plan={ + "fc1.biass": rowwise_sharding_spec, + }, + ) + with self.assertRaisesRegex( + AttributeError, "has no attribute" + ): + # shard the module with the provided sharding plan + shard_module(megatron_lm, sharding_plan_wrong_param_path) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_custom_sharding_planner(self): + megatron_lm = SimpleMegatronLM([[17, 12], [12, 29]], rank=self.rank).cuda( + self.rank + ) + planner = ChunkAllShardingPlanner(device_count=TEST_GPU_NUM) + sharding_plan = planner.build_plan(megatron_lm) + + shard_module(megatron_lm, sharding_plan) + + # check to make sure the module already been sharded + self.assertTrue(isinstance(megatron_lm.fc1.weight, ShardedTensor)) + self.assertTrue(isinstance(megatron_lm.fc2.weight, ShardedTensor)) + self.assertTrue(isinstance(megatron_lm.fc1.bias, ShardedTensor)) + self.assertTrue(isinstance(megatron_lm.fc2.bias, ShardedTensor)) diff --git a/test/distributed/_shard/sharding_spec/test_sharding_spec.py b/test/distributed/_shard/sharding_spec/test_sharding_spec.py index d760b5499fd8..a0e13d80d93e 100644 --- a/test/distributed/_shard/sharding_spec/test_sharding_spec.py +++ b/test/distributed/_shard/sharding_spec/test_sharding_spec.py @@ -1,12 +1,27 @@ # Owner(s): ["oncall: 
distributed"] +from typing import List, Union +from dataclasses import dataclass +import copy import torch from torch.testing._internal.common_utils import TestCase +from torch.testing._internal.common_distributed import ( + requires_nccl, + skip_if_lt_x_gpu, +) +from torch.distributed._shard import sharded_tensor, _shard_tensor from torch.distributed._shard.sharding_spec import ( + ShardingSpec, ChunkShardingSpec, DevicePlacementSpec, EnumerableShardingSpec, ShardMetadata, + _infer_sharding_spec_from_shards_metadata, +) +from torch.distributed._shard.sharded_tensor import ( + TensorProperties, + ShardedTensor, + ShardedTensorMetadata, ) from torch.distributed._shard.sharding_spec._internals import ( check_tensor, @@ -19,6 +34,13 @@ run_tests, sandcastle_skip_if, ) +from torch.testing._internal.distributed._shard.sharded_tensor._test_st_common import ( + _chunk_sharding_specs_list_for_test, +) +from torch.testing._internal.distributed._shard.sharded_tensor import ( + ShardedTensorTestBase, + with_comms, +) class TestShardingSpec(TestCase): @@ -46,18 +68,21 @@ def test_device_placement(self): def test_chunked_sharding_spec(self): # Test valid specs. ChunkShardingSpec(0, [torch.device(0), torch.device(1)]) - # Named dimension. - ChunkShardingSpec("N", ["cuda:0", "cuda:1"]) ChunkShardingSpec(0, [torch.device("cuda:0"), torch.device("cuda:1")]) ChunkShardingSpec(-1, ["cuda:0", "cuda:1"]) ChunkShardingSpec(0, ["rank:0/cuda:0", "rank:0/cuda:1"]) ChunkShardingSpec(0, ["rank:0", "rank:1"]) ChunkShardingSpec(0, ["rank:0/cpu", "rank:1/cpu"]) + # Test unimplemented error + with self.assertRaisesRegex(NotImplementedError, "not support named dimension"): + # Named dimension. + ChunkShardingSpec("N", ["cuda:0", "cuda:1"]) + # Test invalid specs - with self.assertRaisesRegex(ValueError, "int or str"): + with self.assertRaisesRegex(ValueError, "needs to be an integer"): ChunkShardingSpec(None, ["cuda:0", "cuda:1"]) - with self.assertRaisesRegex(ValueError, "int or str"): + with self.assertRaisesRegex(ValueError, "needs to be an integer"): ChunkShardingSpec({}, ["cuda:0", "cuda:1"]) with self.assertRaisesRegex(ValueError, "Could not parse remote_device"): ChunkShardingSpec(0, ["random:0", "cuda:1"]) @@ -276,5 +301,224 @@ def test_get_chunk_sharding_params(self): self.assertEqual(0, result[0]) self.assertEqual(6, result[1]) + def _infer_enum_sharding_spec_case(self): + shards_metadata = [ + ShardMetadata( + shard_offsets=[0, 0], + shard_sizes=[5, 5], + placement="cuda:0", + ), + ShardMetadata( + shard_offsets=[5, 0], + shard_sizes=[10, 5], + placement="cuda:1", + ) + ] + spec = _infer_sharding_spec_from_shards_metadata(shards_metadata) + self.assertTrue(isinstance(spec, EnumerableShardingSpec)) + self.assertEqual(spec.shards, shards_metadata) + + shards_metadata = [ + ShardMetadata( + shard_offsets=[0], + shard_sizes=[16], + placement="cuda:0", + ), + ShardMetadata( + shard_offsets=[16], + shard_sizes=[9], + placement="cuda:1", + ) + ] + spec = _infer_sharding_spec_from_shards_metadata(shards_metadata) + self.assertTrue(isinstance(spec, EnumerableShardingSpec)) + self.assertEqual(spec.shards, shards_metadata) + + shards_metadata = [ + ShardMetadata( + shard_offsets=[0, 0], + shard_sizes=[5, 5], + placement="rank:0/cuda:0", + ), + ShardMetadata( + shard_offsets=[5, 0], + shard_sizes=[5, 5], + placement="rank:1/cuda:1", + ), + ShardMetadata( + shard_offsets=[0, 5], + shard_sizes=[5, 5], + placement="rank:2/cuda:2", + ), + ShardMetadata( + shard_offsets=[5, 5], + shard_sizes=[5, 5], + 
placement="rank:3/cuda:3", + ), + ] + spec = _infer_sharding_spec_from_shards_metadata(shards_metadata) + self.assertTrue(isinstance(spec, EnumerableShardingSpec)) + self.assertEqual(spec.shards, shards_metadata) + + def _infer_chunk_sharding_spec_case(self, placements, sharding_dim, st_size): + world_size = len(placements) + split_size = get_split_size(st_size[sharding_dim], world_size) + shards_metadata = [None] * world_size + for idx, placement in enumerate(placements): + shard_size = copy.deepcopy(st_size) + offsets = [0] * len(st_size) + offsets[sharding_dim] = split_size * idx + shard_size[sharding_dim] = get_chunked_dim_size(st_size[sharding_dim], split_size, idx) + shards_metadata[placement.rank()] = ShardMetadata( + shard_offsets=offsets, + shard_sizes=shard_size, + placement=placement, + ) + + spec = _infer_sharding_spec_from_shards_metadata(shards_metadata) + self.assertTrue(isinstance(spec, ChunkShardingSpec)) + self.assertEqual(spec.dim, sharding_dim) + self.assertEqual(spec.placements, placements) + + def test_infer_sharding_spec_from_shards_metadata(self): + self._infer_enum_sharding_spec_case() + chunk_specs = _chunk_sharding_specs_list_for_test([0, 0, 1, 1], seed=31) + for spec in chunk_specs: + self._infer_chunk_sharding_spec_case(spec.placements, 0, [4, 16]) + self._infer_chunk_sharding_spec_case(spec.placements, 0, [5, 15, 16]) + self._infer_chunk_sharding_spec_case(spec.placements, 1, [12, 16]) + self._infer_chunk_sharding_spec_case(spec.placements, 2, [4, 18, 15]) + self._infer_chunk_sharding_spec_case(spec.placements, 3, [7, 12, 16, 37]) + self._infer_chunk_sharding_spec_case(spec.placements, 4, [50, 4, 18, 15, 77]) + +# Custom ShardingSpec, an simple example to do grid sharding +@dataclass +class GridShardingSpec(ShardingSpec): + grid_size: int + placements: List[Union[torch.distributed._remote_device, str]] + + def __post_init__(self): + for i, remote_device in enumerate(self.placements): + if not isinstance(remote_device, torch.distributed._remote_device): + self.placements[i] = torch.distributed._remote_device(remote_device) + + def build_metadata(self, + tensor_sizes: torch.Size, + tensor_properties: TensorProperties, + ) -> ShardedTensorMetadata: + tensor_num_dim = len(tensor_sizes) + assert tensor_num_dim == 2, "only support 2-dim tensor for grid sharding" + shards_metadata = [] + + def chunk_num(dim_size, grid_size): + assert dim_size % grid_size == 0, "only support dim_size mod grid_size == 0" + return dim_size // grid_size + + row_chunks = chunk_num(tensor_sizes[0], self.grid_size) + col_chunks = chunk_num(tensor_sizes[1], self.grid_size) + + assert row_chunks * col_chunks == len(self.placements) + for row_idx in range(row_chunks): + for col_idx in range(col_chunks): + shards_metadata.append( + ShardMetadata( + shard_offsets=[row_idx * self.grid_size, col_idx * self.grid_size], + shard_sizes=[self.grid_size, self.grid_size], + placement=self.placements[row_idx * row_chunks + col_idx] + ) + ) + return ShardedTensorMetadata( + shards_metadata=shards_metadata, + size=tensor_sizes, + tensor_properties=tensor_properties + ) + + + def shard(self, + tensor: torch.Tensor, + src_rank: int = 0, + process_group=None) -> ShardedTensor: + + raise NotImplementedError("GridShardingSpec.shard not implemented yet!") + +class TestCustomShardingSpec(ShardedTensorTestBase): + def test_custom_sharding_spec(self): + ranks = [ + "rank:0/cuda:0", + "rank:1/cuda:1", + "rank:2/cuda:2", + "rank:3/cuda:3", + ] + + grid_spec = GridShardingSpec( + grid_size=4, + placements=ranks + ) + + 
tensor_properties = TensorProperties( + dtype=torch.get_default_dtype(), + layout=torch.strided, + requires_grad=False, + memory_format=torch.contiguous_format, + pin_memory=False, + ) + + meta = grid_spec.build_metadata(torch.Size((8, 8)), tensor_properties) + check_tensor(meta.shards_metadata, torch.Size((8, 8))) + + @with_comms + @skip_if_lt_x_gpu(4) + @requires_nccl() + def test_custom_sharding_spec_tensor_ctor(self): + """ Test sharded_tensor.ones(...) with the custom + grid sharding spec. + """ + + ranks = [ + "rank:0/cuda:0", + "rank:1/cuda:1", + "rank:2/cuda:2", + "rank:3/cuda:3", + ] + + grid_spec = GridShardingSpec( + grid_size=2, + placements=ranks + ) + + st = sharded_tensor.ones(grid_spec, 4, 4) + + # Validate local shard is initialized with torch.ones + local_shards = st.local_shards() + self.assertEqual(1, len(local_shards)) + local_shard = local_shards[0].tensor + self.assertEqual(torch.device(f"cuda:{self.rank}"), local_shard.device) + self.assertEqual((2, 2), local_shard.size()) + self.assertEqual(local_shard, torch.ones(2, 2)) + + @with_comms + @skip_if_lt_x_gpu(4) + @requires_nccl() + def test_custom_sharding_spec_shard_tensor(self): + """ Test custom spec can be invoked from the + _shard_tensor callsite. + """ + + ranks = [ + "rank:0/cuda:0", + "rank:1/cuda:1", + "rank:2/cuda:2", + "rank:3/cuda:3", + ] + + grid_spec = GridShardingSpec( + grid_size=2, + placements=ranks + ) + + with self.assertRaisesRegex(NotImplementedError, 'not implemented'): + _shard_tensor(torch.randn(8, 8), grid_spec) + + if __name__ == '__main__': run_tests() diff --git a/test/distributed/_shard/test_partial_tensor.py b/test/distributed/_shard/test_partial_tensor.py new file mode 100644 index 000000000000..fd0b58a4aabb --- /dev/null +++ b/test/distributed/_shard/test_partial_tensor.py @@ -0,0 +1,174 @@ +# Owner(s): ["oncall: distributed"] + +import sys + +import torch +import torch.distributed as dist +from torch.distributed._shard.partial_tensor import ( + _PartialTensor, +) +from torch.distributed._shard.sharding_spec import ( + EnumerableShardingSpec, + ShardMetadata, +) +from torch.testing._internal.common_distributed import ( + requires_nccl, + skip_if_lt_x_gpu, +) +from torch.testing._internal.common_utils import ( + TEST_WITH_DEV_DBG_ASAN, + run_tests, +) +from torch.testing._internal.distributed._shard.sharded_tensor import ( + ShardedTensorTestBase, + with_comms, + TEST_GPU_NUM +) +from torch.testing._internal.distributed._shard.sharded_tensor._test_st_common import ( + _chunk_sharding_specs_list_for_test, +) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + +class TestPartialTensorReshard(ShardedTensorTestBase): + def _run_partial_tensor_n_reshard( + self, reshard_spec, input_size, world_size, reduce_op, dtype=torch.float + ): + results_compare = [] + local_result = [] + pg = dist.distributed_c10d._get_default_group() + for rank in range(pg.size()): + torch.manual_seed(rank) + results = [] + for _ in range(world_size): + tensor = torch.rand(*input_size, dtype=dtype).cuda(self.rank) + results.append(tensor) + if self.rank == rank: + local_result.append(tensor.clone().detach()) + results_compare.append(torch.cat(results)) + parital_tensor = _PartialTensor( + torch.cat(local_result), pg, reduce_op=reduce_op + ) + local_sharded_result = parital_tensor.reshard(reshard_spec) + local_shards = local_sharded_result.local_shards() + results_compare = torch.stack(results_compare) + if reduce_op == 
dist.ReduceOp.SUM: + results_compare = torch.sum(results_compare, dim=0) + else: + results_compare = torch.max(results_compare, dim=0).values + rank_idx = None + for idx, placement in enumerate(reshard_spec.placements): + if placement.rank() == self.rank: + rank_idx = idx + local_result_compare = results_compare.chunk(pg.size())[rank_idx] + self.assertEqual(1, len(local_shards)) + self.assertEqual(local_shards[0].tensor, local_result_compare) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_partial_tensor_reshard(self): + specs = _chunk_sharding_specs_list_for_test([0], seed=7) + spec = specs[0] + self._run_partial_tensor_n_reshard(spec, [13, 21], 4, dist.ReduceOp.SUM) + self._run_partial_tensor_n_reshard(spec, [12, 22], 4, dist.ReduceOp.MAX) + self._run_partial_tensor_n_reshard(spec, [13, 21], 3, dist.ReduceOp.SUM) + self._run_partial_tensor_n_reshard(spec, [17, 21], 2, dist.ReduceOp.MAX) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_partial_tensor_reshard_errors(self): + enumerable_sharding_spec = EnumerableShardingSpec( + [ + ShardMetadata( + shard_offsets=[0, 0], + shard_sizes=[5, 5], + placement="rank:0/cuda:0", + ), + ShardMetadata( + shard_offsets=[5, 0], + shard_sizes=[5, 5], + placement="rank:1/cuda:1", + ), + ] + ) + with self.assertRaisesRegex( + NotImplementedError, "Only ChunkShardingSpec supported for reshard." + ): + self._run_partial_tensor_n_reshard( + enumerable_sharding_spec, [13, 21], 4, dist.ReduceOp.SUM + ) + self._run_partial_tensor_n_reshard( + enumerable_sharding_spec, [12, 22], 4, dist.ReduceOp.MAX + ) + specs = _chunk_sharding_specs_list_for_test([0], seed=7) + spec = specs[0] + with self.assertRaisesRegex( + NotImplementedError, "Only real partial tensor supported for reshard." 
+ ): + self._run_partial_tensor_n_reshard( + spec, [13, 21], 4, dist.ReduceOp.SUM, dtype=torch.cfloat + ) + self._run_partial_tensor_n_reshard( + spec, [12, 22], 4, dist.ReduceOp.MAX, dtype=torch.cfloat + ) + +class TestPartialTensorOps(ShardedTensorTestBase): + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_transpose(self): + partial_tensor = _PartialTensor(torch.rand(5, 10)) + partial_tensor = partial_tensor.transpose(0, 1) + self.assertEqual(partial_tensor.size(), torch.Size((10, 5))) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_cat(self): + t1 = torch.rand(5, 10) + t2 = torch.rand(3, 10) + t3 = torch.rand(4, 10) + partial_tensors = [_PartialTensor(t1), _PartialTensor(t2), _PartialTensor(t3)] + partial_concat = torch.cat(partial_tensors) + local_concat = torch.cat([t1, t2, t3]) + self.assertEqual(local_concat.size(), partial_concat.size()) + + # Test dim kwarg + t1 = torch.rand(5, 10) + t2 = torch.rand(5, 12) + t3 = torch.rand(5, 11) + partial_tensors = [_PartialTensor(t1), _PartialTensor(t2), _PartialTensor(t3)] + partial_concat = torch.cat(partial_tensors, dim=1) + local_concat = torch.cat([t1, t2, t3], dim=1) + self.assertEqual(local_concat.size(), partial_concat.size()) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_cat_errors(self): + with self.assertRaisesRegex( + RuntimeError, 'All inputs need to be an instance of _PartialTensor' + ): + torch.cat([_PartialTensor(torch.rand(10)), torch.rand(10)]) + + with self.assertRaisesRegex( + RuntimeError, 'reduce_ops need to be the same' + ): + torch.cat([_PartialTensor(torch.rand(10)), _PartialTensor(torch.rand(10), reduce_op=dist.ReduceOp.MAX)]) + + with self.assertRaisesRegex( + RuntimeError, '"out" kwarg is not supported' + ): + torch.cat([_PartialTensor(torch.rand(10)), _PartialTensor(torch.rand(10))], out=torch.rand(10)) + + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/_shard/test_replicated_tensor.py b/test/distributed/_shard/test_replicated_tensor.py new file mode 100644 index 000000000000..9dfdd8703588 --- /dev/null +++ b/test/distributed/_shard/test_replicated_tensor.py @@ -0,0 +1,336 @@ +# Owner(s): ["oncall: distributed"] +import io + +import torch +import torch.distributed._shard.sharded_tensor as sharded_tensor + +import torch.distributed as dist +from torch.nn.parallel import DistributedDataParallel as DDP + +from torch.distributed._shard import _shard_tensor +from torch.distributed._shard.replicated_tensor import ReplicatedTensor +from torch.distributed._shard.sharding_spec import ChunkShardingSpec +from torch.testing._internal.common_distributed import ( + requires_nccl, + skip_if_lt_x_gpu, +) + +from torch.testing._internal.distributed._shard.sharded_tensor import ( + ShardedTensorTestBase, + with_comms, +) +from torch.testing._internal.distributed._shard.sharded_tensor._test_ops_common import ( + gen_binary_op_func +) +from torch.testing._internal.distributed._shard.sharded_tensor import TEST_GPU_NUM + + +class TestReplicatedTensor(ShardedTensorTestBase): + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_replicated_tensor_basics(self): + local_tensor = torch.ones(3, 3, device=f"cuda:{self.rank}") * 4 + replica_tensor = ReplicatedTensor(local_tensor) + # validate it's a replicated tensor by checking values on all rank + validated = replica_tensor.validate() + self.assertEqual(validated, True) + res = 
replica_tensor + 2 + self.assertIsInstance(res, torch.Tensor) + self.assertNotIsInstance(res, ReplicatedTensor) + self.assertEqual(res, torch.ones(3, 3) * 6) + + # modify local tensor on certain rank, and test if validation raise + if self.rank == 2: + local_tensor += 3 + + with self.assertRaisesRegex(ValueError, 'have different values'): + replica_tensor.validate() + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_replicated_tensor_inter_op_replicated_tensor(self): + local_tensor = torch.ones(3, 3, device=f"cuda:{self.rank}") + replica_tensor1 = ReplicatedTensor(local_tensor * 4) + replica_tensor2 = ReplicatedTensor(local_tensor * 6) + + new_tensor = replica_tensor1 * replica_tensor2 + self.assertIsInstance(new_tensor, ReplicatedTensor) + self.assertEqual(new_tensor, torch.ones(3, 3) * 24) + + # test replicated tensor inter-op with different pgs + new_pg = dist.new_group(ranks=[1, 2, 3]) + replica_tensor_new_group = ReplicatedTensor(local_tensor * 3, process_group=new_pg) + + with self.assertRaisesRegex(RuntimeError, 'must be in the same'): + replica_tensor_new_group * replica_tensor1 + + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_replicated_tensor_inter_op_tensor(self): + local_tensor = torch.ones(3, 3, device=f"cuda:{self.rank}") * 4 + replica_tensor = ReplicatedTensor(local_tensor) + + local_rand_tensor = torch.randn(3, 3, device=f"cuda:{self.rank}") + + new_tensor = replica_tensor + local_rand_tensor + self.assertIsInstance(new_tensor, torch.Tensor) + self.assertNotIsInstance(new_tensor, ReplicatedTensor) + + self.assertEqual(new_tensor, local_tensor + local_rand_tensor) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_replicated_tensor_inter_op_sharded_tensor(self): + torch.manual_seed(self.rank) + + local_tensor1 = torch.rand(12, 3, device=f"cuda:{self.rank}") * 4 + local_tensor2 = torch.ones(12, 3, device=f"cuda:{self.rank}") * 4 + + spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + "rank:2/cuda:2", + "rank:3/cuda:3", + ], + ) + + st = _shard_tensor(local_tensor1, spec, src_rank=0) + replica_tensor = ReplicatedTensor(local_tensor2) + + ops = ["torch.add", "torch.sub", "torch.mul", "torch.div", "+", "-", "*", "/"] + + for op in ops: + binary_op = gen_binary_op_func(op) + res = binary_op(st, replica_tensor) + self.assertIsInstance(res, sharded_tensor.ShardedTensor) + self.assertNotIsInstance(res, ReplicatedTensor) + output = torch.empty((12, 3), device=self.rank) if self.rank == 0 else None + res.gather(dst=0, out=output) + + if self.rank == 0: + local_output = binary_op(local_tensor1, local_tensor2) + self.assertEqual(output, local_output) + + # reflective + reflect_res = binary_op(replica_tensor, st) + self.assertIsInstance(reflect_res, sharded_tensor.ShardedTensor) + self.assertNotIsInstance(reflect_res, ReplicatedTensor) + reflect_output = torch.empty((12, 3), device=self.rank) if self.rank == 0 else None + reflect_res.gather(dst=0, out=reflect_output) + + if self.rank == 0: + reflect_local_output = binary_op(local_tensor2, local_tensor1) + self.assertEqual(reflect_output, reflect_local_output) + + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_replicated_tensor_implicit_broadcasting(self): + # use same seed + torch.manual_seed(self.rank) + + # test implicit broadcasting + local_tensor1 = torch.rand(12, 3, device=f"cuda:{self.rank}") * 4 + # we use size 
(3) to trigger the implicit broadcasting logic + # and it will fail if implicit broadcasting not happen. + local_tensor2 = torch.ones(3, device=f"cuda:{self.rank}") + + spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + "rank:2/cuda:2", + "rank:3/cuda:3", + ], + ) + + st = _shard_tensor(local_tensor1, spec, src_rank=0) + replica_tensor = ReplicatedTensor(local_tensor2) + + ops = ["torch.add", "torch.sub", "torch.mul", "torch.div", "+", "-", "*", "/"] + + for op in ops: + binary_op = gen_binary_op_func(op) + # replicated tensor should automatically broadcasted + res = binary_op(st, replica_tensor) + + self.assertIsInstance(res, sharded_tensor.ShardedTensor) + output = torch.empty((12, 3), device=self.rank) if self.rank == 0 else None + res.gather(dst=0, out=output) + + if self.rank == 0: + local_output = binary_op(local_tensor1, local_tensor2) + self.assertEqual(output, local_output) + + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_replicated_tensor_inter_op_sharded_tensor_errors(self): + local_tensor = torch.ones(3, 3, device=f"cuda:{self.rank}") * 4 + replica_tensor = ReplicatedTensor(local_tensor) + + torch.manual_seed(self.rank) + spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + "rank:2/cuda:2", + "rank:3/cuda:3", + ], + ) + + st1 = sharded_tensor.rand(spec, (20, 3, 3)) + st2 = sharded_tensor.rand(spec, (30, 3, 3)) + + with self.assertRaisesRegex(RuntimeError, 'Implicit broadcasting'): + st1 + st2 + + with self.assertRaisesRegex(RuntimeError, 'not supported for ShardedTensor'): + st1 % replica_tensor + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_with_ddp(self): + # Test Replicated params for DDP + replica_tensor = ReplicatedTensor(torch.rand(4, 8, device=self.rank)) + model = torch.nn.Linear(8, 2).cuda(self.rank) + optim = torch.optim.SGD(model.parameters(), lr=0.1) + ddp = DDP(model) + + # Test module.parameters. + params = list(ddp.parameters()) + self.assertEqual(2, len(params)) + self.assertEqual(ddp.module.weight, params[0]) + self.assertEqual(ddp.module.bias, params[1]) + + params = list(model.parameters()) + self.assertEqual(2, len(params)) + self.assertEqual(model.weight, params[0]) + self.assertEqual(model.bias, params[1]) + + # Validate output + out = ddp(replica_tensor) + self.assertIsInstance(out, ReplicatedTensor) + + # Test backward and optimizer. + + # Validate backward. + out.sum().backward() + self.assertIsNotNone(model.weight.grad) + self.assertIsNotNone(model.bias.grad) + self.assertIsNotNone(ddp.module.weight.grad) + self.assertIsNotNone(ddp.module.bias.grad) + + original_params = [] + for param_group in optim.param_groups: + for original_param in param_group['params']: + self.assertIsNotNone(original_param.grad) + original_params.append(original_param) + + self.assertEqual(model.weight.grad, original_params[0].grad) + self.assertEqual(model.bias.grad, original_params[1].grad) + self.assertEqual(model.weight.grad, ddp.module.weight.grad) + self.assertEqual(model.bias.grad, ddp.module.bias.grad) + + # Validate optimizer. 
+ optim.step() + self.assertEqual(model.weight, ddp.module.weight) + self.assertEqual(model.weight, original_params[0]) + + self.assertEqual(model.bias, ddp.module.bias) + self.assertEqual(model.bias, original_params[1]) + + # Validate zero_grad + optim.zero_grad() + self.assertEqual(model.weight.grad, torch.zeros_like(model.weight.grad)) + self.assertEqual(model.weight.grad, ddp.module.weight.grad) + self.assertEqual(model.weight.grad, original_params[0].grad) + + self.assertEqual(model.bias.grad, torch.zeros_like(model.bias.grad)) + self.assertEqual(model.bias.grad, ddp.module.bias.grad) + self.assertEqual(model.bias.grad, original_params[1].grad) + + # Validate zero_grad set_to_none + optim.zero_grad(set_to_none=True) + self.assertIsNone(model.weight.grad) + self.assertEqual(model.weight.grad, ddp.module.weight.grad) + self.assertEqual(model.weight.grad, original_params[0].grad) + + self.assertIsNone(model.bias.grad) + self.assertEqual(model.bias.grad, ddp.module.bias.grad) + self.assertEqual(model.bias.grad, original_params[1].grad) + + # Multiple forward passes. + for _ in range(5): + out = ddp(replica_tensor) + self.assertIsInstance(out, ReplicatedTensor) + + # Test with context manager. + from torch.nn.parallel._replicated_tensor_ddp_utils import _ddp_replicated_tensor + with _ddp_replicated_tensor(False): + for _ in range(5): + with _ddp_replicated_tensor(True): + ddp = DDP(model) + out = ddp(replica_tensor) + self.assertIsInstance(out, ReplicatedTensor) + + # Test save and load. + with _ddp_replicated_tensor(False): + ddp = DDP(model) + expected_state_dict = ddp.state_dict() + buffer = io.BytesIO() + torch.save(ddp, buffer) + + buffer.seek(0) + obj = torch.load(buffer) + self.assertEqual(expected_state_dict, obj.state_dict()) + + with _ddp_replicated_tensor(True): + ddp = DDP(model) + buffer = io.BytesIO() + torch.save(ddp, buffer) + + buffer.seek(0) + obj = torch.load(buffer) + self.assertEqual(expected_state_dict, obj.state_dict()) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_unsqueeze(self): + local_tensor = torch.rand(3, 3, device=self.rank) + replicated_tensor = ReplicatedTensor(local_tensor) + + unsqueezed_replicated_tensor = replicated_tensor.unsqueeze(0) + unsqueezed_local_tensor = local_tensor.unsqueeze(0) + + self.assertIsInstance(unsqueezed_replicated_tensor, ReplicatedTensor) + self.assertIsInstance(torch.unsqueeze(replicated_tensor, 0), ReplicatedTensor) + self.assertEqual(unsqueezed_local_tensor, unsqueezed_replicated_tensor) + self.assertEqual(torch.unsqueeze(replicated_tensor, 0), unsqueezed_replicated_tensor) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_getitem(self): + local_tensor = torch.rand(3, 3, device=self.rank) + replicated_tensor = ReplicatedTensor(local_tensor) + + replicated_tensor_view = replicated_tensor[0] + local_tensor_view = local_tensor[0] + + self.assertIsInstance(replicated_tensor_view, ReplicatedTensor) + self.assertEqual(local_tensor_view, replicated_tensor_view) diff --git a/test/distributed/_shard/test_sharder.py b/test/distributed/_shard/test_sharder.py new file mode 100644 index 000000000000..d6e0b799ec6e --- /dev/null +++ b/test/distributed/_shard/test_sharder.py @@ -0,0 +1,165 @@ + +# Owner(s): ["oncall: distributed"] +import sys +import copy + +import torch +import torch.nn as nn +from torch.testing._internal.common_distributed import ( + requires_nccl, + skip_if_lt_x_gpu, +) +from torch.distributed._shard import shard_module +from 
torch.distributed._shard.sharding_plan import ShardingPlan +from torch.distributed._shard.sharder import Sharder +from torch.distributed._shard.sharding_spec import ChunkShardingSpec +from torch.distributed._shard.sharded_tensor import ShardedTensor + +from torch.testing._internal.common_utils import TEST_WITH_DEV_DBG_ASAN +from torch.testing._internal.distributed._shard.sharded_tensor import ( + TEST_GPU_NUM, + ShardedTensorTestBase, + with_comms, +) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + +# a simple collection of embedding bag implementation +class CustomEmbeddingBagCollection(nn.Module): + def __init__(self, num_bags, num_embeddings_per_bag, num_dims): + super().__init__() + self.num_bags = num_bags + self.embedding_bags: nn.ModuleDict = nn.ModuleDict() + + for i in range(num_bags): + self.embedding_bags[f"embedding_bag_{i}"] = nn.EmbeddingBag( + num_embeddings_per_bag, + num_dims, + mode="sum") + + def forward(self, inputs): + outputs = [] + for bag in self.embedding_bags.values(): + outputs.append(bag(inputs)) + return torch.cat(outputs) + +# a simple sharded version of EBC +class CustomShardedEBC(nn.Module): + def __init__(self, ebc, split_idx, specs): + super().__init__() + self.split_idx = split_idx + row_spec, col_spec = specs + + # create embedding bags base on the spec + self.embedding_bags: nn.ModuleDict = nn.ModuleDict() + + assert self.split_idx < ebc.num_bags + for i in range(ebc.num_bags): + bag_key = f"embedding_bag_{i}" + if i < self.split_idx: + shard_module(ebc, plan=ShardingPlan(plan={f"embedding_bags.{bag_key}.weight": row_spec})) + else: + shard_module(ebc, plan=ShardingPlan(plan={f"embedding_bags.{bag_key}.weight": col_spec})) + + self.embedding_bags[bag_key] = ebc.embedding_bags[bag_key] + + +class CustomSharder(Sharder): + def __init__(self, devices, split_sharding_idx): + self.devices = devices + self.split_sharding_idx = split_sharding_idx + self.rowwise_spec = ChunkShardingSpec(dim=0, placements=devices) + self.colwise_spec = ChunkShardingSpec(dim=1, placements=devices) + + def shard(self, ebc: nn.Module) -> nn.Module: + if not isinstance(ebc, CustomEmbeddingBagCollection): + raise RuntimeError("The custom sharder only supports CustomEmbeddingBagCollection") + + return CustomShardedEBC(ebc, self.split_sharding_idx, (self.rowwise_spec, self.colwise_spec)) + + +class TestCustomSharder(ShardedTensorTestBase): + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_custom_sharder(self): + class MyModule(nn.Module): + def __init__(self): + super().__init__() + self.ebc = CustomEmbeddingBagCollection(10, 10, 8) + + def forward(self, inputs): + return self.ebc(inputs) + + custom_sharder = CustomSharder( + devices=[f"rank:{i}/cuda:{i}" for i in range(TEST_GPU_NUM)], + split_sharding_idx=TEST_GPU_NUM // 2 + ) + + sharding_plan = ShardingPlan( + plan={ + "ebc": custom_sharder, + }) + + local_model = MyModule().cuda(self.rank) + sharded_model = copy.deepcopy(local_model) + + # shard the module with the provided sharding plan + shard_module(sharded_model, sharding_plan) + + # check to make sure the module already been sharded + emb_bags = sharded_model.ebc.embedding_bags + self.assertTrue(isinstance(emb_bags["embedding_bag_0"].weight, ShardedTensor)) + self.assertTrue(isinstance(emb_bags["embedding_bag_9"].weight, ShardedTensor)) + self.assertEqual(emb_bags["embedding_bag_0"].weight.sharding_spec(), 
custom_sharder.rowwise_spec) + self.assertEqual(emb_bags["embedding_bag_9"].weight.sharding_spec(), custom_sharder.colwise_spec) + + # make sure we can run sharded computation and compare outputs + # with the local model version + input = torch.arange(8).reshape((2, 4)).cuda(self.rank) + local_output = local_model(input) + sharded_output = sharded_model(input) + + self.assertEqual(local_output, sharded_output) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_custom_sharder_errors(self): + custom_sharder = CustomSharder( + devices=[f"rank:{i}/cuda:{i}" for i in range(TEST_GPU_NUM)], + split_sharding_idx=TEST_GPU_NUM // 2 + ) + + sharding_plan = ShardingPlan( + plan={ + "": custom_sharder, + }) + + sharded_model = CustomEmbeddingBagCollection(10, 10, 8).cuda(self.rank) + + with self.assertRaisesRegex( + KeyError, "path must not be empty for custom sharder!" + ): + # shard the module with the provided sharding plan + shard_module(sharded_model, sharding_plan) + + # test conflicted sharding plan + spec = ChunkShardingSpec(dim=0, placements=["rank:0/cuda:0", "rank:1/cuda:1"]) + sharding_plan = ShardingPlan( + plan={ + "embedding_bags.embedding_bag_0.weight": spec, + "embedding_bags": custom_sharder, + }) + + with self.assertRaisesRegex( + RuntimeError, "should not conflict with the submodule tree" + ): + # shard the module with the provided sharding plan + shard_module(sharded_model, sharding_plan) diff --git a/test/distributed/elastic/agent/server/test/api_test.py b/test/distributed/elastic/agent/server/test/api_test.py index 7388c55db197..9f382a97f6ab 100644 --- a/test/distributed/elastic/agent/server/test/api_test.py +++ b/test/distributed/elastic/agent/server/test/api_test.py @@ -12,7 +12,7 @@ import unittest import uuid from typing import Any, Dict -from unittest.mock import call, patch +from unittest.mock import call, patch, MagicMock import torch.distributed.elastic.rendezvous.registry as rdzv_registry from torch.distributed.elastic.agent.server.api import ( @@ -497,8 +497,8 @@ def verify_worker_ranks( ) self.assertEqual(expected_role_ranks, [worker.role_rank for worker in workers]) - @patch("torch.distributed.elastic.utils.store.get_all") - def test_share_and_gather(self, store_mock): + @patch("torch.distributed.elastic.utils.store.synchronize") + def test_share_and_gather(self, sync_mock): # when the state is unknown we exit immediately; no retries spec = self._get_worker_spec(max_restarts=100, monitor_interval=0.1) agent = TestAgent(spec) @@ -508,26 +508,15 @@ def test_share_and_gather(self, store_mock): _RoleInstanceInfo("validator", 2, 10), ] - store_mock.return_value = [obj.serialize() for obj in expected_agent_infos] - - class DummyStore: - def __init__(self): - self.key = None - self.value = None - - def set(self, key, value): - self.key = key - self.value = value - - def set_timeout(self, timeout): - pass - - store = DummyStore() - agent._share_and_gather(store, 1, 3, spec) - self.assertEquals("torchelastic/role_info1", store.key) - expected_info = _RoleInstanceInfo(spec.role, 1, spec.local_world_size) - self.assertEquals(expected_info.serialize(), store.value) - store_mock.assert_called_once() + sync_mock.return_value = [obj.serialize() for obj in expected_agent_infos] + result = agent._share_and_gather(MagicMock(), 1, 3, spec) + sync_mock.assert_called_once() + for expected_role_info, actual_role_info in zip(expected_agent_infos, result): + self.assertEqual(expected_role_info.role, actual_role_info.role) + 
self.assertEqual(expected_role_info.rank, actual_role_info.rank) + self.assertEqual( + expected_role_info.local_world_size, actual_role_info.local_world_size + ) def test_get_event(self): spec = self._get_worker_spec(max_restarts=1) diff --git a/test/distributed/elastic/agent/server/test/local_elastic_agent_test.py b/test/distributed/elastic/agent/server/test/local_elastic_agent_test.py index a931f3ef1d4e..9c5a39505490 100644 --- a/test/distributed/elastic/agent/server/test/local_elastic_agent_test.py +++ b/test/distributed/elastic/agent/server/test/local_elastic_agent_test.py @@ -38,8 +38,8 @@ from torch.distributed.rpc.backend_registry import BackendType from torch.testing._internal.common_utils import ( TEST_WITH_DEV_DBG_ASAN, - sandcastle_skip_if, TEST_WITH_TSAN, + sandcastle_skip_if, ) @@ -170,11 +170,26 @@ def _check_env_function(): "TORCHELASTIC_MAX_RESTARTS", "TORCHELASTIC_RUN_ID", "TORCHELASTIC_USE_AGENT_STORE", + "NCCL_ASYNC_ERROR_HANDLING", ] for var in env_vars: _ = os.environ[var] +def _check_env_value(key: str, expected: str): + # checks if the env var ``key`` matches ``value`` + # this function is intended to be used as the entrypoint to the elastic run + if key not in os.environ: + raise RuntimeError(f"Environment variable {key} not found in os.environ") + else: + actual = os.getenv(key) + if expected != actual: + raise RuntimeError( + f"os.environ['{key}']={actual}" + f" does not equal the expected value: {expected}" + ) + + def acquire_available_port(): """ Uses sockets to acquire an available port from the os for use. @@ -184,10 +199,7 @@ def acquire_available_port(): the port as quickly as possible. """ addrs = socket.getaddrinfo( - host="localhost", - port=None, - family=socket.AF_UNSPEC, - type=socket.SOCK_STREAM + host="localhost", port=None, family=socket.AF_UNSPEC, type=socket.SOCK_STREAM ) for addr in addrs: @@ -398,7 +410,6 @@ def run_test_with_backend(self, backend: str, test_to_run: Callable): test_to_run() - def dummy_compute(self): res = self.run_agent(Conf(entrypoint=dummy_compute, local_world_size=2)) self.assertFalse(res.is_failed()) @@ -406,21 +417,15 @@ def dummy_compute(self): self.assertIsInstance(return_value, torch.Tensor) self.assertEqual((100, 100), return_value.shape) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_dummy_compute_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.dummy_compute) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_dummy_compute_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.dummy_compute) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_dummy_compute_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.dummy_compute) @@ -430,23 +435,19 @@ def run_happy_function(self): self.assertIsNone(res.return_values[0]) self.assertIsNone(res.return_values[1]) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_run_happy_function_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.run_happy_function) - 
@sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_run_happy_function_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.run_happy_function) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_run_happy_function_etcd_v2(self): - self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_happy_function) + self.run_test_with_backend( + backend="etcd-v2", test_to_run=self.run_happy_function + ) def check_master_addr_port_override(self): master_addr = "test_host" @@ -463,17 +464,17 @@ def check_master_addr_port_override(self): self.assertFalse(res.is_failed()) self.assertIsNone(res.return_values[0]) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_check_master_addr_port_override_etcd(self): - self.run_test_with_backend(backend="etcd", test_to_run=self.check_master_addr_port_override) + self.run_test_with_backend( + backend="etcd", test_to_run=self.check_master_addr_port_override + ) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_check_master_addr_port_override_etcd_v2(self): - self.run_test_with_backend(backend="etcd-v2", test_to_run=self.check_master_addr_port_override) + self.run_test_with_backend( + backend="etcd-v2", test_to_run=self.check_master_addr_port_override + ) def run_check_env_function(self): # just checks that all env vars that we need to set on the user script @@ -481,11 +482,47 @@ def run_check_env_function(self): res = self.run_agent(Conf(entrypoint=_check_env_function, local_world_size=1)) self.assertFalse(res.is_failed()) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + def run_check_nccl_async_error_handling_env(self): + # make sure NCCL_ASYNC_ERROR_HANDLING set in os.environ is honored + with patch.dict(os.environ, {"NCCL_ASYNC_ERROR_HANDLING": "0"}): + res = self.run_agent( + Conf( + entrypoint=_check_env_value, + local_world_size=1, + args=("NCCL_ASYNC_ERROR_HANDLING", "0"), + ) + ) + self.assertFalse(res.is_failed()) + + def run_check_nccl_async_error_handling_env_default(self): + # if not present in env var it should default to 1 + res = self.run_agent( + Conf( + entrypoint=_check_env_value, + local_world_size=1, + args=("NCCL_ASYNC_ERROR_HANDLING", "1"), + ) + ) + self.assertFalse(res.is_failed()) + + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_run_check_env_function_etcd(self): - self.run_test_with_backend(backend="etcd", test_to_run=self.run_check_env_function) + self.run_test_with_backend( + backend="etcd", test_to_run=self.run_check_env_function + ) + + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") + def test_run_check_nccl_async_error_handling_env_c10d(self): + self.run_test_with_backend( + backend="c10d", test_to_run=self.run_check_nccl_async_error_handling_env + ) + + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") + def test_run_check_nccl_async_error_handling_env_default_c10d(self): + self.run_test_with_backend( + backend="c10d", + 
test_to_run=self.run_check_nccl_async_error_handling_env_default, + ) def run_function_with_return_value(self): res = self.run_agent(Conf(entrypoint=_echo, args=("foo",), local_world_size=2)) @@ -493,44 +530,38 @@ def run_function_with_return_value(self): self.assertEqual("foo", res.return_values[0]) self.assertEqual("foo", res.return_values[1]) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_run_function_with_return_value_c10d(self): - self.run_test_with_backend(backend="c10d", test_to_run=self.run_function_with_return_value) + self.run_test_with_backend( + backend="c10d", test_to_run=self.run_function_with_return_value + ) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_run_function_with_return_value_etcd(self): - self.run_test_with_backend(backend="etcd", test_to_run=self.run_function_with_return_value) + self.run_test_with_backend( + backend="etcd", test_to_run=self.run_function_with_return_value + ) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_run_function_with_return_value_etcd_v2(self): - self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_function_with_return_value) + self.run_test_with_backend( + backend="etcd-v2", test_to_run=self.run_function_with_return_value + ) def simple_dist_sum(self): res = self.run_agent(Conf(entrypoint=_dist_sum, local_world_size=2)) self.assertFalse(res.is_failed()) # _dist_sum internally checks that the sum computed is valid - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_simple_dist_sum_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.simple_dist_sum) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_simple_dist_sum_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.simple_dist_sum) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_simple_dist_sum_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.simple_dist_sum) @@ -556,21 +587,27 @@ def run_distributed_sum_homogeneous(self): "test incompatible with dev/dbg asan or tsan", ) def test_run_distributed_sum_homogeneous_c10d(self): - self.run_test_with_backend(backend="c10d", test_to_run=self.run_distributed_sum_homogeneous) + self.run_test_with_backend( + backend="c10d", test_to_run=self.run_distributed_sum_homogeneous + ) @unittest.skipIf( TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with dev/dbg asan or tsan", ) def test_run_distributed_sum_homogeneous_etcd(self): - self.run_test_with_backend(backend="etcd", test_to_run=self.run_distributed_sum_homogeneous) + self.run_test_with_backend( + backend="etcd", test_to_run=self.run_distributed_sum_homogeneous + ) @unittest.skipIf( TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with dev/dbg asan or tsan", ) def 
test_run_distributed_sum_homogeneous_etcd_v2(self): - self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_distributed_sum_homogeneous) + self.run_test_with_backend( + backend="etcd-v2", test_to_run=self.run_distributed_sum_homogeneous + ) def run_distributed_sum_heterogeneous(self): # sums all ranks on 3 agents; each running 1, 2, 3 workers respectively @@ -593,23 +630,23 @@ def run_distributed_sum_heterogeneous(self): ranks.update(run_results.return_values.keys()) self.assertSetEqual(set(range(1 + 2 + 3)), ranks) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_run_distributed_sum_heterogeneous_c10d(self): - self.run_test_with_backend(backend="c10d", test_to_run=self.run_distributed_sum_heterogeneous) + self.run_test_with_backend( + backend="c10d", test_to_run=self.run_distributed_sum_heterogeneous + ) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_run_distributed_sum_heterogeneous_etcd(self): - self.run_test_with_backend(backend="etcd", test_to_run=self.run_distributed_sum_heterogeneous) + self.run_test_with_backend( + backend="etcd", test_to_run=self.run_distributed_sum_heterogeneous + ) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_run_distributed_sum_heterogeneous_etcd_v2(self): - self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_distributed_sum_heterogeneous) + self.run_test_with_backend( + backend="etcd-v2", test_to_run=self.run_distributed_sum_heterogeneous + ) def run_sad_function(self): """ @@ -632,21 +669,15 @@ def run_sad_function(self): self.assertEqual(data["message"], failure_data["message"]) self.assertEqual(int(data["extraInfo"]["timestamp"]), failure.timestamp) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_run_sad_function_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.run_sad_function) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_run_sad_function_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.run_sad_function) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_run_sad_function_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_sad_function) @@ -663,23 +694,23 @@ def run_bipolar_function(self): self.assertEqual(WorkerState.FAILED, agent.get_worker_group().state) self.assertTrue(agent._total_execution_time > 0) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_run_bipolar_function_c10d(self): - self.run_test_with_backend(backend="c10d", test_to_run=self.run_bipolar_function) + self.run_test_with_backend( + backend="c10d", test_to_run=self.run_bipolar_function + ) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test 
incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_run_bipolar_function_etcd(self): - self.run_test_with_backend(backend="etcd", test_to_run=self.run_bipolar_function) + self.run_test_with_backend( + backend="etcd", test_to_run=self.run_bipolar_function + ) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_run_bipolar_function_etcd_v2(self): - self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_bipolar_function) + self.run_test_with_backend( + backend="etcd-v2", test_to_run=self.run_bipolar_function + ) def correct_rank_assignment_heterogeneous(self): node_configs = [ @@ -710,14 +741,18 @@ def correct_rank_assignment_heterogeneous(self): "test incompatible with dev/dbg asan or tsan", ) def test_correct_rank_assignment_heterogeneous_etcd(self): - self.run_test_with_backend(backend="etcd", test_to_run=self.correct_rank_assignment_heterogeneous) + self.run_test_with_backend( + backend="etcd", test_to_run=self.correct_rank_assignment_heterogeneous + ) @unittest.skipIf( TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with dev/dbg asan or tsan", ) def test_correct_rank_assignment_heterogeneous_etcd_v2(self): - self.run_test_with_backend(backend="etcd-v2", test_to_run=self.correct_rank_assignment_heterogeneous) + self.run_test_with_backend( + backend="etcd-v2", test_to_run=self.correct_rank_assignment_heterogeneous + ) def correct_rank_assignment_homogeneous(self): node_configs = [ @@ -744,14 +779,18 @@ def correct_rank_assignment_homogeneous(self): "test incompatible with dev/dbg asan or tsan", ) def test_correct_rank_assignment_homogeneous_etcd(self): - self.run_test_with_backend(backend="etcd", test_to_run=self.correct_rank_assignment_homogeneous) + self.run_test_with_backend( + backend="etcd", test_to_run=self.correct_rank_assignment_homogeneous + ) @unittest.skipIf( TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with dev/dbg asan or tsan", ) def test_correct_rank_assignment_homogeneous_etcd_v2(self): - self.run_test_with_backend(backend="etcd-v2", test_to_run=self.correct_rank_assignment_homogeneous) + self.run_test_with_backend( + backend="etcd-v2", test_to_run=self.correct_rank_assignment_homogeneous + ) def assert_rank_consistency( self, @@ -853,14 +892,18 @@ def double_agent_fault_tolerance(self): "test incompatible with dev/dbg asan or tsan", ) def test_double_agent_fault_tolerance_etcd(self): - self.run_test_with_backend(backend="etcd", test_to_run=self.double_agent_fault_tolerance) + self.run_test_with_backend( + backend="etcd", test_to_run=self.double_agent_fault_tolerance + ) @unittest.skipIf( TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with dev/dbg asan or tsan", ) def test_double_agent_fault_tolerance_etcd_v2(self): - self.run_test_with_backend(backend="etcd-v2", test_to_run=self.double_agent_fault_tolerance) + self.run_test_with_backend( + backend="etcd-v2", test_to_run=self.double_agent_fault_tolerance + ) def double_agent_elastic(self): """ @@ -907,21 +950,27 @@ def double_agent_elastic(self): "test incompatible with dev/dbg asan or tsan", ) def test_double_agent_elastic_c10d(self): - self.run_test_with_backend(backend="c10d", test_to_run=self.double_agent_elastic) + self.run_test_with_backend( + backend="c10d", test_to_run=self.double_agent_elastic + ) @unittest.skipIf( TEST_WITH_DEV_DBG_ASAN or 
TEST_WITH_TSAN, "test incompatible with dev/dbg asan or tsan", ) def test_double_agent_elastic_etcd(self): - self.run_test_with_backend(backend="etcd", test_to_run=self.double_agent_elastic) + self.run_test_with_backend( + backend="etcd", test_to_run=self.double_agent_elastic + ) @unittest.skipIf( TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with dev/dbg asan or tsan", ) def test_double_agent_elastic_etcd_v2(self): - self.run_test_with_backend(backend="etcd-v2", test_to_run=self.double_agent_elastic) + self.run_test_with_backend( + backend="etcd-v2", test_to_run=self.double_agent_elastic + ) def torch_rpc(self): """ @@ -1056,21 +1105,15 @@ def barrier_failed(self, barrier_mock): self.assertFalse(res.is_failed()) barrier_mock.assert_called_once() - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_barrier_failed_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.barrier_failed) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_barrier_failed_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.barrier_failed) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_barrier_failed_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.barrier_failed) @@ -1089,20 +1132,14 @@ def shutdown_called(self, start_processes_mock): agent.run("worker") pcontext_mock.close.assert_called_once() - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_shutdown_called_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.shutdown_called) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_shutdown_called_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.shutdown_called) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_shutdown_called_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.shutdown_called) diff --git a/test/distributed/elastic/multiprocessing/api_test.py b/test/distributed/elastic/multiprocessing/api_test.py index 6f10dc8e386c..915a848a160a 100644 --- a/test/distributed/elastic/multiprocessing/api_test.py +++ b/test/distributed/elastic/multiprocessing/api_test.py @@ -17,7 +17,6 @@ from itertools import product from typing import Callable, Dict, List, Union from unittest import mock -from unittest.mock import patch import torch import torch.multiprocessing as mp @@ -31,7 +30,7 @@ _wrap, to_map, ) -from torch.distributed.elastic.multiprocessing.errors.error_handler import _write_error +from torch.distributed.elastic.multiprocessing.errors import ErrorHandler from torch.testing._internal.common_utils import ( IS_IN_CI, IS_MACOS, @@ -40,9 +39,9 @@ TEST_WITH_ASAN, TEST_WITH_DEV_DBG_ASAN, TEST_WITH_TSAN, + TestCase, run_tests, sandcastle_skip_if, - TestCase ) @@ -65,27 +64,29 @@ 
def test_is_failed(self): pr_fail = RunProcsResult(failures={0: fail0}) self.assertTrue(pr_fail.is_failed()) - @patch("torch.distributed.elastic.multiprocessing.errors.log") - def test_get_failures(self, log_mock): - with mock.patch("time.time", side_effect=[3, 2, 1]): - error_file0 = os.path.join(self.test_dir, "error0.json") - error_file1 = os.path.join(self.test_dir, "error1.json") - _write_error(RuntimeError("error 0"), error_file0) - _write_error(RuntimeError("error 1"), error_file1) + def test_get_failures(self): - fail0 = ProcessFailure( - local_rank=0, pid=997, exitcode=1, error_file=error_file0 - ) - fail1 = ProcessFailure( - local_rank=1, pid=998, exitcode=3, error_file=error_file1 - ) - fail2 = ProcessFailure( - local_rank=2, pid=999, exitcode=15, error_file="no_exist.json" - ) + error_file0 = os.path.join(self.test_dir, "error0.json") + error_file1 = os.path.join(self.test_dir, "error1.json") + eh = ErrorHandler() + with mock.patch.dict(os.environ, {"TORCHELASTIC_ERROR_FILE": error_file0}): + eh.record_exception(RuntimeError("error 0")) + + with mock.patch.dict(os.environ, {"TORCHELASTIC_ERROR_FILE": error_file0}): + eh.record_exception(RuntimeError("error 1")) + + fail0 = ProcessFailure( + local_rank=0, pid=997, exitcode=1, error_file=error_file0 + ) + fail1 = ProcessFailure( + local_rank=1, pid=998, exitcode=3, error_file=error_file1 + ) + fail2 = ProcessFailure( + local_rank=2, pid=999, exitcode=15, error_file="no_exist.json" + ) - self.assertEqual(3, fail0.timestamp) - self.assertEqual(2, fail1.timestamp) - self.assertEqual(1, fail2.timestamp) + self.assertLessEqual(fail0.timestamp, fail1.timestamp) + self.assertLessEqual(fail1.timestamp, fail2.timestamp) class StdTest(TestCase): diff --git a/test/distributed/elastic/multiprocessing/errors/api_test.py b/test/distributed/elastic/multiprocessing/errors/api_test.py index 7868624b8603..a9590bea313d 100644 --- a/test/distributed/elastic/multiprocessing/errors/api_test.py +++ b/test/distributed/elastic/multiprocessing/errors/api_test.py @@ -14,7 +14,7 @@ ProcessFailure, record, ) -from torch.distributed.elastic.multiprocessing.errors.error_handler import _write_error +from torch.distributed.elastic.multiprocessing.errors.error_handler import ErrorHandler class SentinelError(Exception): @@ -36,7 +36,8 @@ def good_fn(): @record def raise_child_failure_error_fn(name, child_error_file=""): if child_error_file: - _write_error(SentinelError("foobar"), child_error_file) + with mock.patch.dict(os.environ, {"TORCHELASTIC_ERROR_FILE": child_error_file}): + ErrorHandler().record_exception(SentinelError("foobar")) pf = ProcessFailure(local_rank=0, pid=997, exitcode=1, error_file=child_error_file) raise ChildFailedError(name, {0: pf}) @@ -64,7 +65,10 @@ def test_failure_incorrect_reply_file(self): ) def failure_with_error_file(self, exception): - _write_error(exception, self.test_error_file) + with mock.patch.dict( + os.environ, {"TORCHELASTIC_ERROR_FILE": self.test_error_file} + ): + ErrorHandler().record_exception(exception) return ProcessFailure( local_rank=0, pid=997, exitcode=1, error_file=self.test_error_file ) diff --git a/test/distributed/elastic/multiprocessing/errors/error_handler_test.py b/test/distributed/elastic/multiprocessing/errors/error_handler_test.py index 9905859a6aa7..6adf97de9a27 100644 --- a/test/distributed/elastic/multiprocessing/errors/error_handler_test.py +++ b/test/distributed/elastic/multiprocessing/errors/error_handler_test.py @@ -9,7 +9,7 @@ import unittest from unittest.mock import patch -from 
torch.distributed.elastic.multiprocessing.errors.error_handler import ErrorHandler, _write_error +from torch.distributed.elastic.multiprocessing.errors.error_handler import ErrorHandler from torch.distributed.elastic.multiprocessing.errors.handlers import get_error_handler @@ -78,15 +78,15 @@ def test_record_exception_no_error_file(self): def test_dump_error_file(self): src_error_file = os.path.join(self.test_dir, "src_error.json") - _write_error(RuntimeError("foobar"), src_error_file) + eh = ErrorHandler() + with patch.dict(os.environ, {"TORCHELASTIC_ERROR_FILE": src_error_file}): + eh.record_exception(RuntimeError("foobar")) with patch.dict(os.environ, {"TORCHELASTIC_ERROR_FILE": self.test_error_file}): - eh = ErrorHandler() eh.dump_error_file(src_error_file) self.assertTrue(filecmp.cmp(src_error_file, self.test_error_file)) with patch.dict(os.environ, {}): - eh = ErrorHandler() eh.dump_error_file(src_error_file) # just validate that dump_error_file works when # my error file is not set @@ -95,10 +95,13 @@ def test_dump_error_file(self): def test_dump_error_file_overwrite_existing(self): dst_error_file = os.path.join(self.test_dir, "dst_error.json") src_error_file = os.path.join(self.test_dir, "src_error.json") - _write_error(RuntimeError("foo"), dst_error_file) - _write_error(RuntimeError("bar"), src_error_file) + eh = ErrorHandler() + with patch.dict(os.environ, {"TORCHELASTIC_ERROR_FILE": dst_error_file}): + eh.record_exception(RuntimeError("foo")) + + with patch.dict(os.environ, {"TORCHELASTIC_ERROR_FILE": src_error_file}): + eh.record_exception(RuntimeError("bar")) with patch.dict(os.environ, {"TORCHELASTIC_ERROR_FILE": dst_error_file}): - eh = ErrorHandler() eh.dump_error_file(src_error_file) self.assertTrue(filecmp.cmp(src_error_file, dst_error_file)) diff --git a/test/distributed/elastic/utils/util_test.py b/test/distributed/elastic/utils/util_test.py index f6f29d7c6438..fefe40537a8f 100644 --- a/test/distributed/elastic/utils/util_test.py +++ b/test/distributed/elastic/utils/util_test.py @@ -7,48 +7,77 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
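The rewritten error-handler tests all follow the same pattern: point TORCHELASTIC_ERROR_FILE at a per-case file and record the exception through the public ErrorHandler API instead of the removed _write_error helper. A minimal sketch of that pattern as a reusable test helper (the helper name is ours; only the calls already shown in the tests above are assumed):

    import os
    from unittest import mock
    from torch.distributed.elastic.multiprocessing.errors.error_handler import ErrorHandler

    def write_error_file(exc: BaseException, error_file: str) -> None:
        # ErrorHandler.record_exception() writes to the path named by
        # TORCHELASTIC_ERROR_FILE, so each test patches that variable per file.
        with mock.patch.dict(os.environ, {"TORCHELASTIC_ERROR_FILE": error_file}):
            ErrorHandler().record_exception(exc)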
+from unittest import mock + import torch.distributed.elastic.utils.store as store_util from torch.distributed.elastic.utils.logging import get_logger from torch.testing._internal.common_utils import run_tests, TestCase -class TestStore: - def get(self, key: str): - return f"retrieved:{key}" - - class StoreUtilTest(TestCase): - def test_get_data(self): - store = TestStore() - data = store_util.get_all(store, "test/store", 10) - for idx in range(0, 10): - self.assertEqual(f"retrieved:test/store{idx}", data[idx]) + def test_get_all_rank_0(self): + store = mock.MagicMock() + world_size = 3 + store_util.get_all(store, 0, "test/store", world_size) + # omit empty kwargs, get only key + actual_set_call_args = [ + call_args[0][0] for call_args in store.set.call_args_list + ] + self.assertListEqual(["test/store0.FIN"], actual_set_call_args) + + actual_get_call_args = [call_args[0] for call_args in store.get.call_args_list] + expected_get_call_args = [ + ("test/store0",), + ("test/store1",), + ("test/store2",), + ("test/store0.FIN",), + ("test/store1.FIN",), + ("test/store2.FIN",), + ] + self.assertListEqual(expected_get_call_args, actual_get_call_args) + + def test_get_all_rank_n(self): + store = mock.MagicMock() + world_size = 3 + store_util.get_all(store, 1, "test/store", world_size) + # omit empty kwargs, get only key + actual_set_call_args = [ + call_args[0][0] for call_args in store.set.call_args_list + ] + self.assertListEqual(["test/store1.FIN"], actual_set_call_args) + + actual_get_call_args = [call_args[0] for call_args in store.get.call_args_list] + expected_get_call_args = [ + ("test/store0",), + ("test/store1",), + ("test/store2",), + ] + self.assertListEqual(expected_get_call_args, actual_get_call_args) def test_synchronize(self): - class DummyStore: - def __init__(self): - self._data = { - "torchelastic/test0": "data0".encode(encoding="UTF-8"), - "torchelastic/test1": "data1".encode(encoding="UTF-8"), - "torchelastic/test2": "data2".encode(encoding="UTF-8"), - } - - def set(self, key, value): - self._data[key] = value - - def get(self, key): - return self._data[key] - - def set_timeout(self, timeout): - pass - + store_mock = mock.MagicMock() data = "data0".encode(encoding="UTF-8") - store = DummyStore() - res = store_util.synchronize(store, data, 0, 3, key_prefix="torchelastic/test") - self.assertEqual(3, len(res)) - for idx, res_data in enumerate(res): - actual_str = res_data.decode(encoding="UTF-8") - self.assertEqual(f"data{idx}", actual_str) + store_util.synchronize(store_mock, data, 0, 3, key_prefix="torchelastic/test") + actual_set_call_args = store_mock.set.call_args_list + # omit empty kwargs + actual_set_call_args = [call_args[0] for call_args in actual_set_call_args] + expected_set_call_args = [ + ("torchelastic/test0", b"data0"), + ("torchelastic/test0.FIN", b"FIN"), + ] + self.assertListEqual(expected_set_call_args, actual_set_call_args) + + expected_get_call_args = [ + ("torchelastic/test0",), + ("torchelastic/test1",), + ("torchelastic/test2",), + ("torchelastic/test0.FIN",), + ("torchelastic/test1.FIN",), + ("torchelastic/test2.FIN",), + ] + actual_get_call_args = store_mock.get.call_args_list + actual_get_call_args = [call_args[0] for call_args in actual_get_call_args] + self.assertListEqual(expected_get_call_args, actual_get_call_args) class UtilTest(TestCase): diff --git a/test/distributed/fsdp/test_distributed_checkpoint.py b/test/distributed/fsdp/test_distributed_checkpoint.py new file mode 100644 index 000000000000..ef95973764c4 --- /dev/null +++ 
b/test/distributed/fsdp/test_distributed_checkpoint.py @@ -0,0 +1,105 @@ +# Owner(s): ["oncall: distributed"] + +import sys +import tempfile + +import torch +from torch import distributed as dist +from torch.distributed._shard.checkpoint import ( + FileSystemReader, + FileSystemWriter, + save_state_dict, + load_state_dict, +) +from torch.distributed.fsdp import ( + FullyShardedDataParallel as FSDP, + StateDictType, +) +from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel +from torch.distributed.fsdp.wrap import enable_wrap, wrap +from torch.testing._internal.common_distributed import skip_if_lt_x_gpu +from torch.testing._internal.common_fsdp import ( + FSDPTest, + SkipModel, +) +from torch.testing._internal.common_utils import ( + instantiate_parametrized_tests, + parametrize, + run_tests, + TEST_WITH_DEV_DBG_ASAN, +) + + +if not dist.is_available(): + print("Distributed not available, skipping tests", file=sys.stderr) + sys.exit(0) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + +_DISTRIBUTED_STATE_DICT_IMPLS = { + StateDictType.LOCAL_STATE_DICT, + StateDictType.SHARDED_STATE_DICT, +} + + +class TestDistributedCheckpoint(FSDPTest): + @property + def world_size(self): + return 2 + + @skip_if_lt_x_gpu(2) + @parametrize("state_dict_type", _DISTRIBUTED_STATE_DICT_IMPLS) + def test_distributed_checkpoint(self, state_dict_type) -> None: + with enable_wrap(wrapper_cls=FSDP): + torch.manual_seed(100) + model = wrap(SkipModel(double_nest=True)) + torch.manual_seed(200) + new_model = wrap(SkipModel(double_nest=True)) + + with FullyShardedDataParallel.summon_full_params( + model + ), FullyShardedDataParallel.summon_full_params(new_model): + params = list(model.parameters()) + new_params = list(new_model.parameters()) + self.assertNotEqual(params, new_params) + + with tempfile.TemporaryDirectory() as path: + paths = [path] + dist.broadcast_object_list(paths) + path = paths[0] + writer = FileSystemWriter(path) + reader = FileSystemReader(path) + with FSDP.state_dict_type( + model, state_dict_type + ), FSDP.state_dict_type(new_model, state_dict_type): + state_dict = model.state_dict() + + save_state_dict(state_dict, writer) + + with FSDP.state_dict_type( + model, state_dict_type + ), FSDP.state_dict_type(new_model, state_dict_type): + state_dict = new_model.state_dict() + load_state_dict(state_dict, reader) + new_model.load_state_dict(state_dict) + + with FullyShardedDataParallel.summon_full_params( + model + ), FullyShardedDataParallel.summon_full_params(new_model): + params = list(model.parameters()) + new_params = list(new_model.parameters()) + self.assertEqual(params, new_params) + + # TODO: add resharding test case. 
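The new checkpoint test saves a sharded FSDP state dict to a temporary directory and reloads it into a second model. Condensed to its essentials, the round trip exercised above looks like the sketch below (names taken from the test; process-group setup, broadcasting of the path, and the assertions are omitted):

    writer = FileSystemWriter(path)
    reader = FileSystemReader(path)

    with FSDP.state_dict_type(model, state_dict_type):
        save_state_dict(model.state_dict(), writer)    # each rank writes its shards

    with FSDP.state_dict_type(new_model, state_dict_type):
        state_dict = new_model.state_dict()
        load_state_dict(state_dict, reader)            # shards are read back in place
        new_model.load_state_dict(state_dict)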
+ + +instantiate_parametrized_tests(TestDistributedCheckpoint) + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/fsdp/test_flatten_params_wrapper.py b/test/distributed/fsdp/test_flatten_params_wrapper.py index c4a7eb657078..69c78ee6dde7 100644 --- a/test/distributed/fsdp/test_flatten_params_wrapper.py +++ b/test/distributed/fsdp/test_flatten_params_wrapper.py @@ -198,7 +198,7 @@ def _test(kwargs, expected, exception=None, regex=None): expected, msg=f"{flat_p.shard_metadata()}, {expected}", ) - self.assertEqual(flat_p._num_padded, kwargs["num_padded"]) + self.assertEqual(flat_p.num_padded, kwargs["num_padded"]) _test( kwargs={"start": -1, "end": -1, "num_padded": 0}, diff --git a/test/distributed/fsdp/test_fsdp_apply.py b/test/distributed/fsdp/test_fsdp_apply.py new file mode 100644 index 000000000000..7870804d78fc --- /dev/null +++ b/test/distributed/fsdp/test_fsdp_apply.py @@ -0,0 +1,104 @@ +# Owner(s): ["oncall: distributed"] + +import sys + +import torch +import torch.distributed as dist +import torch.nn as nn +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.testing._internal.common_distributed import ( + skip_if_lt_x_gpu, +) +from torch.testing._internal.common_fsdp import ( + FSDPTest, + NestedWrappedModule, +) +from torch.testing._internal.common_utils import ( + TEST_WITH_DEV_DBG_ASAN, + run_tests, +) + +if not dist.is_available(): + print("Distributed not available, skipping tests", file=sys.stderr) + sys.exit(0) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + +class TestApply(FSDPTest): + @property + def world_size(self): + return 2 + + @torch.no_grad() + def _init_linear_weights(self, m): + if type(m) == nn.Linear: + m.weight.fill_(1.0) + m.bias.fill_(1.0) + + @property + def process_group(self): + return dist.distributed_c10d._get_default_group() + + def check_weights(self, fsdp, expected_tensor_fn, check): + with fsdp.summon_full_params(fsdp, recurse=True): + linear_modules = [ + module for module in fsdp.modules() if type(module) == nn.Linear + ] + for module in linear_modules: + for param in module.parameters(): + expected = expected_tensor_fn(param) + check(param, expected, f"Got {param} but expected {expected}") + + def _check_apply(self, fsdp): + # Assert linear weights are not all 1.0 + self.check_weights( + fsdp, lambda param: torch.empty_like(param).fill_(1.0), self.assertNotEqual + ) + + fsdp.apply(self._init_linear_weights) + + # Ensure all weights are 1.0 + self.check_weights( + fsdp, lambda param: torch.empty_like(param).fill_(1.0), self.assertEqual + ) + + @skip_if_lt_x_gpu(2) + def test_nested_module_apply(self): + """ + Checks apply() modifies weights appropriately on a nested FSDP instance. + """ + nested_module = NestedWrappedModule( + self.process_group, wrap_fsdp=True, wrap_everything=True + ) + fsdp_module = FSDP(nested_module, self.process_group).cuda(self.rank) + self._check_apply(fsdp_module) + + @skip_if_lt_x_gpu(2) + def test_transformer_module_apply(self): + """ + Checks apply() modifies weights appropriately on a wrapped Transformer + module. + """ + transformer = self._get_wrapped_model(group=self.process_group).cuda(self.rank) + self._check_apply(transformer) + + @skip_if_lt_x_gpu(2) + def test_apply_in_summon_raises_error(self): + """ + Ensures that if user calls apply() on FSDP instance within full param + summon context, appropriate error is raised. 
+ """ + transformer = self._get_wrapped_model(group=self.process_group).cuda(self.rank) + with transformer.summon_full_params(transformer, recurse=True): + with self.assertRaisesRegex(ValueError, "expected to be in states"): + transformer.apply(self._init_linear_weights) + + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/fsdp/test_fsdp_checkpoint.py b/test/distributed/fsdp/test_fsdp_checkpoint.py index e3dd483eaf69..1b3510e2b55e 100644 --- a/test/distributed/fsdp/test_fsdp_checkpoint.py +++ b/test/distributed/fsdp/test_fsdp_checkpoint.py @@ -10,7 +10,7 @@ FullyShardedDataParallel as FSDP, CPUOffload, ) -from torch.distributed.algorithms._checkpoint._checkpoint_wrapper import ( +from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( checkpoint_wrapper, ) from torch.testing._internal.common_distributed import ( @@ -115,7 +115,7 @@ def test_checkpoint_fsdp_wrapping(self, cpu_offload, offload_activations): models = [ckpt_sequential_wrapped_fsdp, inner_ckpt, baseline] - offload_to_cpu_event = "Memcpy DtoH" + offload_to_cpu_event = "Memcpy DtoH" if torch.version.cuda else "CopyDeviceToHost" for i in range(2): losses = [] @@ -177,7 +177,7 @@ def test_basic_checkpoint_end_to_end(self, cpu_offload, offload_activations): fsdp_call_checkpoint, ] - offload_to_cpu_event = "Memcpy DtoH" + offload_to_cpu_event = "Memcpy DtoH" if torch.version.cuda else "CopyDeviceToHost" for i in range(6): losses = [] diff --git a/test/distributed/fsdp/test_fsdp_clip_grad_norm.py b/test/distributed/fsdp/test_fsdp_clip_grad_norm.py new file mode 100644 index 000000000000..9e39254ec423 --- /dev/null +++ b/test/distributed/fsdp/test_fsdp_clip_grad_norm.py @@ -0,0 +1,105 @@ +# Owner(s): ["oncall: distributed"] + +import sys +from math import inf + +import torch +from torch import distributed as dist +from torch.distributed.fsdp.fully_sharded_data_parallel import ( + FullyShardedDataParallel as FSDP, + CPUOffload, + _calc_grad_norm, +) +from torch.nn import utils as nn_utils +from torch.testing._internal.common_distributed import skip_if_lt_x_gpu +from torch.testing._internal.common_fsdp import ( + DeterministicModel, + FSDPTest, + _collect_total_grad_norm_fsdp, + _collect_total_grad_norm_local, +) +from torch.testing._internal.common_utils import ( + TEST_WITH_DEV_DBG_ASAN, + run_tests, + parametrize, + instantiate_parametrized_tests, +) + + +if not dist.is_available(): + print("Distributed not available, skipping tests", file=sys.stderr) + sys.exit(0) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + +class TestClipGradNorm(FSDPTest): + def _run_fsdp_one_iteration(self, norm_type, nested_fsdp, cpu_offload): + """Test FSDP with clip grad norm.""" + fsdp_model = DeterministicModel(nested_fsdp, cpu_offload=cpu_offload) + local_model = DeterministicModel(False) + input = torch.rand(14, 2, device=self.rank) + fsdp_model = FSDP(fsdp_model, cpu_offload=cpu_offload) + self.assertTrue(len(input) >= self.world_size) + out = local_model(input[: self.world_size]) + out.sum().backward() + in_data = torch.tensor(input[self.rank], device=self.rank) + out_fsdp = fsdp_model(in_data) + out_fsdp.sum().backward() + total_norms_fsdp = _collect_total_grad_norm_fsdp( + fsdp_model, norm_type, self.rank + ) + total_norms_local = _collect_total_grad_norm_local(local_model, norm_type) + total_norms_local /= self.world_size + norm_cap = total_norms_fsdp / 2.0 + self.assertEqual(total_norms_local, 
total_norms_fsdp) + fsdp_model.clip_grad_norm_(norm_cap, norm_type=norm_type) + nn_utils.clip_grad_norm_( + local_model.parameters(), norm_cap, norm_type=norm_type + ) + total_norms_after_clip_fsdp = _collect_total_grad_norm_fsdp( + fsdp_model, norm_type, self.rank + ) + total_norms_after_clip_local = _collect_total_grad_norm_local( + local_model, norm_type + ) + self.assertTrue(total_norms_after_clip_fsdp <= norm_cap) + self.assertEqual(total_norms_after_clip_local, total_norms_after_clip_fsdp) + + @skip_if_lt_x_gpu(2) + @parametrize("norm_type", [2.0, inf]) + @parametrize("nested_fsdp", [True, False]) + @parametrize( + "cpu_offload", + [CPUOffload(offload_params=True), CPUOffload(offload_params=False)], + ) + def test_fsdp_clip_grad_norm(self, norm_type, nested_fsdp, cpu_offload): + """Test FSDP with clip grad norm.""" + self._run_fsdp_one_iteration(norm_type, nested_fsdp, cpu_offload) + + +class TestCalcuGradNorm(FSDPTest): + @skip_if_lt_x_gpu(2) + @parametrize("norm_type", [2.0, inf, 1.3, 2.5]) + @parametrize("nested_fsdp", [True, False]) + def test_fsdp_calc_grad_norm(self, norm_type, nested_fsdp): + """Test grad norm cal API.""" + model = FSDP(DeterministicModel(nested_fsdp)) + input = torch.rand(15, 2, device=self.rank) + out = model(input) + out.sum().backward() + total_norm = _calc_grad_norm(model.params_with_grad, norm_type) + total_norm_expected = _collect_total_grad_norm_local(model, norm_type) + self.assertEqual(total_norm, total_norm_expected) + + +instantiate_parametrized_tests(TestClipGradNorm) +instantiate_parametrized_tests(TestCalcuGradNorm) + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/fsdp/test_fsdp_comm.py b/test/distributed/fsdp/test_fsdp_comm.py new file mode 100644 index 000000000000..c527ca7aebc8 --- /dev/null +++ b/test/distributed/fsdp/test_fsdp_comm.py @@ -0,0 +1,253 @@ +# Owner(s): ["oncall: distributed"] + +import sys +from contextlib import suppress +from enum import Enum, auto +from typing import Optional +from unittest.mock import patch + +import torch +from torch import distributed as dist +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp.fully_sharded_data_parallel import ShardingStrategy +from torch.testing._internal.common_distributed import skip_if_lt_x_gpu +from torch.testing._internal.common_fsdp import FSDPTest, NestedWrappedModule +from torch.testing._internal.common_utils import ( + TEST_WITH_DEV_DBG_ASAN, + instantiate_parametrized_tests, + parametrize, + run_tests, +) + +if not dist.is_available(): + print("Distributed not available, skipping tests", file=sys.stderr) + sys.exit(0) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + +class PassType(Enum): + __order__ = "FWD BWD" + FWD = auto() + BWD = auto() + + +class TestCommunication(FSDPTest): + """Tests ``FullyShardedDataParallel``'s collective communication usage.""" + def _init_model( + self, + nested_model: bool, + sharding_strategy: ShardingStrategy, + device: torch.device, + ): + group = dist.distributed_c10d._get_default_group() + if nested_model: + model = NestedWrappedModule( + group, wrap_fsdp=True, sharding_strategy=sharding_strategy, + ) + fsdp_model: FSDP = FSDP( + model, group, sharding_strategy=sharding_strategy, + ).to(device) + else: + fsdp_model: FSDP = self._get_wrapped_model( + group, + cuda_first=False, + config={"sharding_strategy": sharding_strategy}, + ) + return fsdp_model + + def _run_iter(self, 
fsdp_model, batch, use_no_sync: bool): + """Runs an iteration inside or outside the ``no_sync()`` context.""" + context = fsdp_model.no_sync() if use_no_sync else suppress() + with context: + output = fsdp_model(*batch) + loss = fsdp_model.module.get_loss(batch, output) + loss.backward() + + def _get_ref_num_reduce_scatters( + self, + num_fsdp: int, + in_no_sync: bool, + ) -> int: + """Returns the reference number of reduce-scatters for an iteration + in the ``no_sync()`` context.""" + return num_fsdp if not in_no_sync else 0 + + def _get_ref_num_all_gathers( + self, + num_fsdp: int, + sharding_strategy: Optional[ShardingStrategy], + is_first_iter: bool, + is_last_iter_no_sync: bool, + ) -> int: + """Returns the reference number of all-gathers in an iteration, summing + over the forward and backward passes.""" + return sum( + self._get_ref_num_all_gathers_in_pass( + num_fsdp, + sharding_strategy, + pass_type, + is_first_iter, + is_last_iter_no_sync, + ) for pass_type in PassType + ) + + def _get_ref_num_all_gathers_in_pass( + self, + num_fsdp: int, + sharding_strategy: Optional[ShardingStrategy], + pass_type: PassType, + is_first_iter: bool, + is_last_iter_no_sync: bool, + ): + """Returns the reference number of all-gathers for a given setting.""" + if sharding_strategy is None: + sharding_strategy = ShardingStrategy.FULL_SHARD # default + # Forward pass: + if pass_type == PassType.FWD and \ + sharding_strategy == ShardingStrategy.SHARD_GRAD_OP and \ + is_last_iter_no_sync: + # Modules do not free the full parameters in the last + # iteration's backward pass if it was in `no_sync()` + num_all_gathers = 0 + elif pass_type == PassType.FWD: + # Otherwise, all modules all-gather the full parameters in the + # forward pass + num_all_gathers = num_fsdp + # Backward pass: + elif pass_type == PassType.BWD and \ + sharding_strategy == ShardingStrategy.FULL_SHARD: + # Root does not free the full parameters at the end of the + # forward pass + num_all_gathers = num_fsdp - 1 + elif pass_type == PassType.BWD and \ + sharding_strategy == ShardingStrategy.SHARD_GRAD_OP: + # Modules do not free the full parameters at the end of the + # forward pass + num_all_gathers = 0 + else: + assert 0, f"Unsupported: add a branch for pass_type={pass_type} " \ + f"is_first_iter={is_first_iter} " \ + f"is_last_iter_no_sync={is_last_iter_no_sync} " \ + f"sharding_strategy={sharding_strategy}" + if is_first_iter and pass_type == PassType.FWD: + # With execution order validation, on the first iteration, we have + # an additional all-gather before every actual all-gather in the + # forward pass + num_all_gathers *= 2 + return num_all_gathers + + def _print_ref_num_all_gathers_in_pass( + self, + num_fsdp: int, + sharding_strategy: ShardingStrategy, + pass_type: PassType, + is_first_iter: bool, + is_last_iter_no_sync: bool, + ): + """Helper method for printing the number of all-gathers for a specific + setting. 
This may be helpful since the branching is complex.""" + if self.rank != 0: + return # only print on one rank + num_all_gathers = self._get_ref_num_all_gathers_in_pass( + num_fsdp, sharding_strategy, pass_type, is_first_iter, + is_last_iter_no_sync, + ) + print( + f"Pass: {pass_type}\n" + f"Is First Iteration: {is_first_iter}\n" + f"Sharding Strategy: {sharding_strategy}\n" + f"Last iteration in `no_sync()`: {is_last_iter_no_sync}\n" + f"Number of all-gathers: {num_all_gathers}" + ) + + @skip_if_lt_x_gpu(2) + @parametrize("nested_model", [False, True]) + @parametrize("use_no_sync", [False, True]) + @parametrize("sharding_strategy", [ShardingStrategy.SHARD_GRAD_OP, None]) + def test_communication( + self, + nested_model: bool, + use_no_sync: bool, + sharding_strategy: Optional[ShardingStrategy], + ): + """ + Tests FSDP's communication cost in terms of calls to collective + communication primitives (i.e. all-gather and reduce-scatter). + + Arguments: + nested_model (bool): If ``True``, uses ``NestedWrappedModule``, + which has nested FSDP instances; if ``False``, uses the default + model, which does not have nested FSDP instances. + use_no_sync (bool): If ``True``, runs some iterations inside the + ``no_sync()`` context manager to accumulate gradients, followed + by some iterations outside the context manager; if ``False``, + only runs some iterations outside the context manager. + sharding_strategy (Optional[ShardingStrategy]): Configures the + FSDP algorithm. + """ + # Initialize the model and inputs + device = torch.device("cuda") + fsdp_model = self._init_model(nested_model, sharding_strategy, device) + batch = fsdp_model.module.get_input(device) + + # Count the number of FSDP instances that manage parameters since the + # number of collectives are a function of this number + num_fsdp = sum( + (isinstance(m, FSDP) and len(m.params) > 0) + for m in fsdp_model.modules() + ) + + # If `use_no_sync=True`, we run `num_iters` iterations inside + # `no_sync()` followed by `num_iters` iterations outside `no_sync()`, + # and if `use_no_sync=False`, we only run `num_iters` iterations + # outside `no_sync()` + num_iters = 3 + with patch("torch.distributed._all_gather_base") as mock_all_gather, \ + patch("torch.distributed._reduce_scatter_base") as mock_reduce_scatter: + def reset_mocks(): + mock_all_gather.reset_mock() + mock_reduce_scatter.reset_mock() + # Check the communication cost when using `no_sync()` + if use_no_sync: + for i in range(num_iters): + reset_mocks() + self._run_iter(fsdp_model, batch, use_no_sync=True) + num_all_gathers = mock_all_gather.call_count + num_reduce_scatters = mock_reduce_scatter.call_count + ref_num_all_gathers = self._get_ref_num_all_gathers( + num_fsdp, sharding_strategy, is_first_iter=i == 0, + is_last_iter_no_sync=i > 0, + ) + ref_num_reduce_scatters = self._get_ref_num_reduce_scatters( + num_fsdp, in_no_sync=True, + ) + self.assertEqual(num_all_gathers, ref_num_all_gathers) + self.assertEqual(num_reduce_scatters, ref_num_reduce_scatters) + # Check the normal communication cost (when not using `no_sync()`) + for i in range(num_iters): + reset_mocks() + self._run_iter(fsdp_model, batch, use_no_sync=False) + num_all_gathers = mock_all_gather.call_count + num_reduce_scatters = mock_reduce_scatter.call_count + ref_num_all_gathers = self._get_ref_num_all_gathers( + num_fsdp, sharding_strategy, + is_first_iter=not use_no_sync and i == 0, + is_last_iter_no_sync=use_no_sync and i == 0, + ) + ref_num_reduce_scatters = self._get_ref_num_reduce_scatters( + num_fsdp, 
in_no_sync=False, + ) + self.assertEqual(num_all_gathers, ref_num_all_gathers) + self.assertEqual(num_reduce_scatters, ref_num_reduce_scatters) + + +instantiate_parametrized_tests(TestCommunication) + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/fsdp/test_fsdp_core.py b/test/distributed/fsdp/test_fsdp_core.py index ef91d4db0836..38e29fc29e34 100644 --- a/test/distributed/fsdp/test_fsdp_core.py +++ b/test/distributed/fsdp/test_fsdp_core.py @@ -1,6 +1,7 @@ # Owner(s): ["oncall: distributed"] import functools +import itertools import sys from unittest import mock @@ -18,6 +19,7 @@ NestedWrappedModule, NestedWrappedModuleWithDelay, TransformerWithSharedParams, + subtest_name ) from torch.testing._internal.common_utils import ( TEST_WITH_DEV_DBG_ASAN, @@ -26,8 +28,8 @@ run_tests, ) -from torch.distributed.fsdp import CPUOffload -from torch.distributed.fsdp.fully_sharded_data_parallel import BackwardPrefetch +from torch.distributed.fsdp import CPUOffload, MixedPrecision +from torch.distributed.fsdp.fully_sharded_data_parallel import BackwardPrefetch, ShardingStrategy if not dist.is_available(): @@ -41,6 +43,24 @@ ) sys.exit(0) +params = "cpu_offload,backward_prefetch,sharding_strategy" +cpu_offload_config = [CPUOffload(offload_params=True), CPUOffload(offload_params=False)] +backward_prefetch_config = [BackwardPrefetch.BACKWARD_PRE, BackwardPrefetch.BACKWARD_POST, None] +sharding_strategy_config = [ShardingStrategy.SHARD_GRAD_OP, None, ShardingStrategy.NO_SHARD] +configs = list(itertools.product(cpu_offload_config, + backward_prefetch_config, + sharding_strategy_config)) +test_name_mapping = { + str(CPUOffload(offload_params=True)): "offload_true", + str(CPUOffload(offload_params=False)): "offload_false", + str(BackwardPrefetch.BACKWARD_PRE): "prefetch_pre", + str(BackwardPrefetch.BACKWARD_POST): "prefetch_post", + str(ShardingStrategy.SHARD_GRAD_OP): "shard_grad_op", + str(ShardingStrategy.NO_SHARD): "no_shard", +} + +subtest_name = functools.partial(subtest_name, test_name_mapping) + class TestParityWithDDP(FSDPTest): """ @@ -63,15 +83,8 @@ def _get_init_modes_for_test(self, cpu_offload): return modes @skip_if_lt_x_gpu(2) - @parametrize( - "cpu_offload", - [CPUOffload(offload_params=True), CPUOffload(offload_params=False)] - ) - @parametrize( - "backward_prefetch", - [BackwardPrefetch.BACKWARD_PRE, BackwardPrefetch.BACKWARD_POST, None] - ) - def test_nested_wrapped_model(self, cpu_offload, backward_prefetch): + @parametrize(params, configs, subtest_name) + def test_nested_wrapped_model(self, cpu_offload, backward_prefetch, sharding_strategy): init_modes = self._get_init_modes_for_test(cpu_offload) for fsdp_init_mode in init_modes: with self.subTest(fsdp_init_mode=fsdp_init_mode): @@ -80,18 +93,43 @@ def test_nested_wrapped_model(self, cpu_offload, backward_prefetch): fsdp_init_mode=fsdp_init_mode, cpu_offload=cpu_offload, backward_prefetch=backward_prefetch, + sharding_strategy=sharding_strategy, ) @skip_if_lt_x_gpu(2) - @parametrize( - "cpu_offload", - [CPUOffload(offload_params=True), CPUOffload(offload_params=False)] - ) - @parametrize( - "backward_prefetch", - [BackwardPrefetch.BACKWARD_PRE, BackwardPrefetch.BACKWARD_POST, None] - ) - def test_nested_all_wrapped_model(self, cpu_offload, backward_prefetch): + @parametrize("cpu_offload", cpu_offload_config) + @parametrize("sharding_strategy", sharding_strategy_config) + @parametrize("mixed_precision", [True, False]) + def test_nested_wrapped_model_single_iteration_mixed_precision( + self, + cpu_offload, + 
sharding_strategy, + mixed_precision + ): + init_modes = self._get_init_modes_for_test(cpu_offload) + mixed_precision = MixedPrecision( + param_dtype=torch.float16, + buffer_dtype=torch.float16, + reduce_dtype=torch.float16, + ) if mixed_precision else None + for fsdp_init_mode in init_modes: + with self.subTest(fsdp_init_mode=fsdp_init_mode): + self._test_identical_outputs( + NestedWrappedModule, + # Only run one step for comparison, as usually grad scaler + # is needed to avoid NaN after first step. + num_steps=1, + fsdp_init_mode=fsdp_init_mode, + cpu_offload=cpu_offload, + sharding_strategy=sharding_strategy, + mixed_precision=mixed_precision, + ) + + + @skip_if_lt_x_gpu(2) + @parametrize(params, configs, subtest_name) + @parametrize("clip_norm_type", [2.0, None]) + def test_nested_all_wrapped_model(self, cpu_offload, backward_prefetch, sharding_strategy, clip_norm_type): init_modes = self._get_init_modes_for_test(cpu_offload) for fsdp_init_mode in init_modes: with self.subTest(fsdp_init_mode=fsdp_init_mode): @@ -101,18 +139,14 @@ def test_nested_all_wrapped_model(self, cpu_offload, backward_prefetch): fsdp_init_mode=fsdp_init_mode, cpu_offload=cpu_offload, backward_prefetch=backward_prefetch, + norm_type=clip_norm_type, + sharding_strategy=sharding_strategy, ) @skip_if_lt_x_gpu(2) - @parametrize( - "cpu_offload", - [CPUOffload(offload_params=True), CPUOffload(offload_params=False)] - ) - @parametrize( - "backward_prefetch", - [BackwardPrefetch.BACKWARD_PRE, BackwardPrefetch.BACKWARD_POST, None] - ) - def test_transformer_parameterized(self, cpu_offload, backward_prefetch): + @parametrize(params, configs, subtest_name) + @parametrize("clip_norm_type", [2.0, None]) + def test_transformer_parameterized(self, cpu_offload, backward_prefetch, sharding_strategy, clip_norm_type): init_modes = self._get_init_modes_for_test(cpu_offload) for fsdp_init_mode in init_modes: with self.subTest(fsdp_init_mode=fsdp_init_mode): @@ -121,18 +155,13 @@ def test_transformer_parameterized(self, cpu_offload, backward_prefetch): fsdp_init_mode=fsdp_init_mode, cpu_offload=cpu_offload, backward_prefetch=backward_prefetch, + norm_type=clip_norm_type, + sharding_strategy=sharding_strategy, ) @skip_if_lt_x_gpu(2) - @parametrize( - "cpu_offload", - [CPUOffload(offload_params=True), CPUOffload(offload_params=False)] - ) - @parametrize( - "backward_prefetch", - [BackwardPrefetch.BACKWARD_PRE, BackwardPrefetch.BACKWARD_POST, None] - ) - def test_delayed_optim_step(self, cpu_offload, backward_prefetch): + @parametrize(params, configs, subtest_name) + def test_delayed_optim_step(self, cpu_offload, backward_prefetch, sharding_strategy): # We use a model with a long CUDA delay right before the optimizer step. # This tests our streams logic, and that we don't start the allgather # until after the optimization step completes. 
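These tests now share one combined parametrization instead of stacking separate cpu_offload and backward_prefetch decorators. For orientation, the cross product defined earlier in this file expands each decorated test into 2 x 3 x 3 = 18 subtests; a quick sanity check, assuming the configs list built above:

    # 2 CPUOffload options x 3 BackwardPrefetch options x 3 ShardingStrategy options
    assert len(configs) == 18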
@@ -147,18 +176,12 @@ def test_delayed_optim_step(self, cpu_offload, backward_prefetch): fsdp_init_mode=fsdp_init_mode, cpu_offload=cpu_offload, backward_prefetch=backward_prefetch, + sharding_strategy=sharding_strategy, ) @skip_if_lt_x_gpu(2) - @parametrize( - "cpu_offload", - [CPUOffload(offload_params=True), CPUOffload(offload_params=False)] - ) - @parametrize( - "backward_prefetch", - [BackwardPrefetch.BACKWARD_PRE, BackwardPrefetch.BACKWARD_POST, None] - ) - def test_delayed_reduce_scatter(self, cpu_offload, backward_prefetch): + @parametrize(params, configs, subtest_name) + def test_delayed_reduce_scatter(self, cpu_offload, backward_prefetch, sharding_strategy): # We insert a delay in the torch.distributed._reduce_scatter_base op, so that # the post_backward_stream takes much longer than the backward pass. # This tests that we properly block at the end of the backward pass for @@ -174,21 +197,16 @@ def test_delayed_reduce_scatter(self, cpu_offload, backward_prefetch): fsdp_init_mode=fsdp_init_mode, cpu_offload=cpu_offload, backward_prefetch=backward_prefetch, + sharding_strategy=sharding_strategy, ) def _dummy_ddp_fn(self, model): return DummyDDP(model) @skip_if_lt_x_gpu(2) - @parametrize( - "cpu_offload", - [CPUOffload(offload_params=True), CPUOffload(offload_params=False)] - ) - @parametrize( - "backward_prefetch", - [BackwardPrefetch.BACKWARD_PRE, BackwardPrefetch.BACKWARD_POST, None] - ) - def test_mixture_of_experts(self, cpu_offload, backward_prefetch): + @parametrize(params, configs, subtest_name) + @parametrize("clip_norm_type", [2.0, None]) + def test_mixture_of_experts(self, cpu_offload, backward_prefetch, sharding_strategy, clip_norm_type): init_modes = self._get_init_modes_for_test(cpu_offload) for fsdp_init_mode in init_modes: with self.subTest(fsdp_init_mode=fsdp_init_mode): @@ -200,18 +218,13 @@ def test_mixture_of_experts(self, cpu_offload, backward_prefetch): fsdp_init_mode=fsdp_init_mode, cpu_offload=cpu_offload, backward_prefetch=backward_prefetch, + norm_type=clip_norm_type, + sharding_strategy=sharding_strategy, ) @skip_if_lt_x_gpu(2) - @parametrize( - "cpu_offload", - [CPUOffload(offload_params=True), CPUOffload(offload_params=False)] - ) - @parametrize( - "backward_prefetch", - [BackwardPrefetch.BACKWARD_PRE, BackwardPrefetch.BACKWARD_POST, None] - ) - def test_mixture_of_experts_with_delay_before_free(self, cpu_offload, backward_prefetch): + @parametrize(params, configs, subtest_name) + def test_mixture_of_experts_with_delay_before_free(self, cpu_offload, backward_prefetch, sharding_strategy): init_modes = self._get_init_modes_for_test(cpu_offload) for fsdp_init_mode in init_modes: with self.subTest(fsdp_init_mode=fsdp_init_mode): @@ -222,15 +235,21 @@ def test_mixture_of_experts_with_delay_before_free(self, cpu_offload, backward_p fsdp_init_mode=fsdp_init_mode, cpu_offload=cpu_offload, backward_prefetch=backward_prefetch, + sharding_strategy=sharding_strategy, ) class TestParamInit(FSDPTest): @skip_if_lt_x_gpu(2) - def test_param_change_after_init(self): + @parametrize("mixed_precision", [True, False]) + def test_param_change_after_init(self, mixed_precision): group = dist.distributed_c10d._get_default_group() # Establish reference behavior. 
- model = self._get_wrapped_model(group, cuda_first=False) + mixed_precision = MixedPrecision() if mixed_precision else None + config = {"mixed_precision": mixed_precision} + model = self._get_wrapped_model( + group, mixed_precision=mixed_precision, cuda_first=False + ) model.eval() # no dropout for this test input = model.module.get_input(torch.device("cuda")) ref_output = model(*input) @@ -284,10 +303,15 @@ def _test_output_backward_hooks(self, model): @skip_if_lt_x_gpu(2) @parametrize("cuda_first", [False, True]) - def test_register_functions_called(self, cuda_first): + @parametrize("mixed_precision", [True, False]) + def test_register_functions_called(self, cuda_first, mixed_precision): """Tests that _register_{pre|post}_backward_hooks called during forward.""" group = dist.distributed_c10d._get_default_group() - model = self._get_wrapped_model(group, cuda_first=cuda_first) + mixed_precision = MixedPrecision() if mixed_precision else None + config = {"mixed_precision": mixed_precision} + model = self._get_wrapped_model( + group, mixed_precision=mixed_precision, cuda_first=cuda_first + ) input = model.module.get_input(torch.device("cuda")) model._register_post_backward_hooks = mock.MagicMock(return_value=None) model._register_pre_backward_hooks = mock.MagicMock(return_value=None) @@ -300,11 +324,23 @@ def test_register_functions_called(self, cuda_first): class TestNoGrad(FSDPTest): @skip_if_lt_x_gpu(2) - def test_transformer_no_grad(self): + @parametrize("mixed_precision", [True, False]) + def test_transformer_no_grad(self, mixed_precision): group = dist.distributed_c10d._get_default_group() - model = self._get_wrapped_model(group, cuda_first=False) + mixed_precision = MixedPrecision( + param_dtype=torch.float16, + reduce_dtype=torch.float16, + buffer_dtype=torch.float16, + ) if mixed_precision else None + config = {"mixed_precision": mixed_precision} + model = self._get_wrapped_model(group, config=config, cuda_first=False) # Train model for a step - self._train_for_several_steps(model, num_steps=1, autocast=False) + self._train_for_several_steps( + model, + num_steps=1, + autocast=False, + mixed_precision=config["mixed_precision"] + ) model.eval() # no dropout for this test @@ -321,6 +357,8 @@ def test_transformer_no_grad(self): instantiate_parametrized_tests(TestHooks) instantiate_parametrized_tests(TestParityWithDDP) +instantiate_parametrized_tests(TestNoGrad) +instantiate_parametrized_tests(TestParamInit) if __name__ == "__main__": run_tests() diff --git a/test/distributed/fsdp/test_fsdp_exec_order.py b/test/distributed/fsdp/test_fsdp_exec_order.py new file mode 100644 index 000000000000..14a704b53f78 --- /dev/null +++ b/test/distributed/fsdp/test_fsdp_exec_order.py @@ -0,0 +1,194 @@ +# Owner(s): ["oncall: distributed"] + +import sys +import warnings +from contextlib import suppress + +import torch +from torch import distributed as dist +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp.fully_sharded_data_parallel import ShardingStrategy +from torch.testing._internal.common_distributed import skip_if_lt_x_gpu +from torch.testing._internal.common_fsdp import FSDPTest +from torch.testing._internal.common_utils import ( + TEST_WITH_DEV_DBG_ASAN, + instantiate_parametrized_tests, + parametrize, + run_tests, +) + +if not dist.is_available(): + print("Distributed not available, skipping tests", file=sys.stderr) + sys.exit(0) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + 
file=sys.stderr, + ) + sys.exit(0) + + +class Model(torch.nn.Module): + """ + Model that supports two computation paths: `layer0` -> `layer1` and + `layer0` -> `layer2`. Notably, both `layer1` and `layer2` have 36 elements + when flattened, which means that their corresponding all-gathers and + reduce-scatters may be silently matched if we do not perform any checks. + """ + def __init__(self) -> None: + super().__init__() + self.layer0 = torch.nn.Linear(5, 6) + self.layer1 = torch.nn.Linear(6, 6, bias=False) + self.layer2 = torch.nn.Sequential( + torch.nn.Linear(6, 3, bias=False), + torch.nn.ReLU(), + torch.nn.Linear(3, 6, bias=False), + ) + self.relu = torch.nn.ReLU() + self.use_alt_path = False + for param in self.layer2.parameters(): + param.requires_grad = False + + def forward(self, x): + # `layer0` -> `layer1` (normal) + # `layer0` -> `layer2` (alternate) + z = self.relu(self.layer0(x)) + z = self.relu(self.layer2(z)) if self.use_alt_path \ + else self.relu(self.layer1(z)) + return z + + def get_input(self, device: torch.device): + return (torch.randn((8, 5)).to(device),) + + def get_loss(self, input, output): + return output.sum() + + def run_backward(self, loss): + loss.backward() + + def flip_path(self): + params_to_freeze = self.layer2.parameters() if self.use_alt_path \ + else self.layer1.parameters() + params_to_unfreeze = self.layer1.parameters() if self.use_alt_path \ + else self.layer2.parameters() + for param in params_to_freeze: + param.requires_grad = False + for param in params_to_unfreeze: + param.requires_grad = True + self.use_alt_path = not self.use_alt_path + + @staticmethod + def wrap(sharding_strategy: ShardingStrategy, device: torch.device): + model = Model() + model.layer1 = FSDP(model.layer1, sharding_strategy=sharding_strategy) + model.layer2 = FSDP(model.layer2, sharding_strategy=sharding_strategy) + fsdp_model = FSDP(model, sharding_strategy=sharding_strategy) + return fsdp_model.to(device) + + +class TestFSDPExecOrder(FSDPTest): + @property + def device(self): + return torch.device("cuda") + + @skip_if_lt_x_gpu(2) + @parametrize( + "sharding_strategy", + [ShardingStrategy.FULL_SHARD, ShardingStrategy.SHARD_GRAD_OP], + ) + def test_invalid_first_iter_order( + self, + sharding_strategy: ShardingStrategy, + ): + """Tests that FSDP errors if the all-gather order differs across ranks + in the first iteration.""" + # Rank 0 runs the forward pass in one order and all other ranks run in + # different order + fsdp_model = Model.wrap(sharding_strategy, self.device) + if self.rank != 0: + fsdp_model.flip_path() + inp = fsdp_model.module.get_input(self.device) + # Match the error message with the following prefix + error_regex = "^(Forward order differs across ranks)" + with self.assertRaisesRegex(RuntimeError, error_regex): + fsdp_model(*inp) + + @skip_if_lt_x_gpu(2) + @parametrize( + "sharding_strategy", + [ShardingStrategy.FULL_SHARD, ShardingStrategy.SHARD_GRAD_OP], + ) + @parametrize("iters_before_path_change", [1, 3]) + def test_invalid_later_iter_order( + self, + sharding_strategy: ShardingStrategy, + iters_before_path_change: int, + ): + """Tests that FSDP warns the user if the all-gather order changes after + the first iteration.""" + # On the first iteration, all ranks run the same order, and on the next + # iteration, all but rank 0 run in a different order + fsdp_model = Model.wrap(sharding_strategy, self.device) + for _ in range(iters_before_path_change): + inp = fsdp_model.module.get_input(self.device) + output = fsdp_model(*inp) + loss = 
fsdp_model.module.get_loss(inp, output).to(self.device) + fsdp_model.module.run_backward(loss) + # Match the warning message with the following prefix + regex = "^(Forward order differs from that of the first iteration " \ + f"on rank {self.rank} -- collectives are unchecked and may give " \ + "incorrect results or hang)" + context = self.assertWarnsRegex( + expected_warning=UserWarning, expected_regex=regex, + ) if self.rank != 0 else suppress() + if self.rank != 0: + fsdp_model.flip_path() + inp = fsdp_model.module.get_input(self.device) + # Expect a warning for the forward pass all-gather + with context: # warning for forward pass all-gather + output = fsdp_model(*inp) + loss = fsdp_model.module.get_loss(inp, output).to(self.device) + fsdp_model.module.run_backward(loss) + # Run an additional iteration to check that there are no more warnings + inp = fsdp_model.module.get_input(self.device) + output = fsdp_model(*inp) + loss = fsdp_model.module.get_loss(inp, output).to(self.device) + fsdp_model.module.run_backward(loss) + + @skip_if_lt_x_gpu(2) + @parametrize( + "sharding_strategy", + [ShardingStrategy.FULL_SHARD, ShardingStrategy.SHARD_GRAD_OP], + ) + def test_train_eval(self, sharding_strategy: ShardingStrategy): + fsdp_model = Model.wrap(sharding_strategy, self.device) + NUM_ITERS = 3 + NUM_EPOCHS = 2 + with warnings.catch_warnings(record=True) as w: # records warnings to `w` + for _ in range(NUM_EPOCHS): + fsdp_model.train() + for _ in range(NUM_ITERS): + inp = fsdp_model.module.get_input(self.device) + output = fsdp_model(*inp) + loss = fsdp_model.module.get_loss(inp, output).to(self.device) + fsdp_model.module.run_backward(loss) + fsdp_model.eval() + for _ in range(NUM_ITERS): + inp = fsdp_model.module.get_input(self.device) + output = fsdp_model(*inp) + fsdp_model.module.get_loss(inp, output).to(self.device) + # Check that the order validation warning was not issued (errors do not + # need to be checked since they will be directly reported) + warning_prefix = "Forward order differs" + for warning in w: + if str(warning.message).startswith(warning_prefix): + raise AssertionError(f"Warning was incorrectly issued: {warning.message}") + # If we still validate the forward execution order in eval mode, then + # an `AssertionError` will be raised above for both sharding strategies + + +instantiate_parametrized_tests(TestFSDPExecOrder) + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/fsdp/test_fsdp_freezing_weights.py b/test/distributed/fsdp/test_fsdp_freezing_weights.py index 6a45ff9039db..9a92d9d0f546 100644 --- a/test/distributed/fsdp/test_fsdp_freezing_weights.py +++ b/test/distributed/fsdp/test_fsdp_freezing_weights.py @@ -143,7 +143,7 @@ def _dist_train( optimizer.step() if with_fsdp: - get_full_params(model) + return get_full_params(model) return list(model.parameters()) diff --git a/test/distributed/fsdp/test_fsdp_grad_acc.py b/test/distributed/fsdp/test_fsdp_grad_acc.py new file mode 100644 index 000000000000..f2569266c347 --- /dev/null +++ b/test/distributed/fsdp/test_fsdp_grad_acc.py @@ -0,0 +1,261 @@ +# Owner(s): ["oncall: distributed"] + +import contextlib +import itertools +import sys +from dataclasses import dataclass +from typing import List, Optional, Tuple + +import torch +from torch import distributed as dist +from torch.distributed.fsdp import CPUOffload +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp.fully_sharded_data_parallel import BackwardPrefetch +from 
torch.testing._internal.common_distributed import skip_if_lt_x_gpu +from torch.testing._internal.common_fsdp import FSDPTest +from torch.testing._internal.common_utils import ( + TEST_WITH_DEV_DBG_ASAN, + instantiate_parametrized_tests, + parametrize, + run_tests, +) + +if not dist.is_available(): + print("Distributed not available, skipping tests", file=sys.stderr) + sys.exit(0) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + +@dataclass +class _GradAccConfig: + """ + This configures how gradients are accumulated in :meth:`_test_grad_acc`. + Each instance of this class represents ``num_iters``-many consecutive + iterations, where the ``no_sync()`` context manager is used or not as given + by ``use_no_sync``. + + Attributes: + use_no_sync (bool): Indicates whether to use the ``no_sync()`` context + manager as the way to accumulate gradients. + num_iters (int): Number of iterations to accumulate gradients. + """ + use_no_sync: bool + num_iters: int + + def __repr__(self) -> str: + # Override to remove any spaces in the string to appease the internal + # build's test name parser + return ( + f"(use_no_sync={self.use_no_sync}," + f"num_iters={self.num_iters})" + ) + + +@dataclass +class _GradAccConfigs: + """ + This wraps a :class:`list` of :class:`_GradAccConfig` instances with the + sole purpose of overriding :meth:`__repr__` to remove spaces. + """ + configs: List[_GradAccConfig] + + def __repr__(self) -> str: + # Override to remove any spaces in the string to appease the internal + # build's test name parser + return ( + "[" + ",".join(config.__repr__() for config in self.configs) + "]" + ) + + +class TestGradAcc(FSDPTest): + """Tests ``FullyShardedDataParallel``'s gradient accumulation via both its + ``no_sync()`` context manager and without the context manager.""" + + def _test_grad_acc( + self, + batch_dim: int, + configs: List[_GradAccConfig], + cpu_offload: CPUOffload, + backward_prefetch: Optional[BackwardPrefetch], + ): + """ + Tests gradient accumulation by comparing a run that trains sequentially + through some batches while accumulating gradients with a run that + trains on the concatenation of those batches in a single iteration. + + The last iteration always synchronizes gradients regardless of what is + specified by the last element of ``configs``. + + Arguments: + batch_dim (int): Batch dimension in the input tensor to be passed + into the model for the forward pass. + configs (List[_GradAccConfig]): :class:`list` of configurations + specifying how gradients are accumulated; for example, a list + corresponding to [(False, 2), (True, 2), (False, 2)] indicates + to accumulate over 2 + 2 + 2 = 6 total iterations, where the + first two do not use ``no_sync()``, the middle two do use + ``no_sync()``, and the final two again do not use + ``no_sync()``. + cpu_offload (CPUOffload): Configures CPU offloading. + backward_prefetch (Optional[BackwardPrefetch]): Specifies at which + point to prefetch the next layer's full parameters during the + backward pass, if at all. 
+ """ + # Gradient accumulation outside `no_sync()` is not currently compatible + # with CPU offloading + if cpu_offload.offload_params and \ + any(not config.use_no_sync for config in configs): + return + old_allow_tf32 = torch.backends.cuda.matmul.allow_tf32 + try: + # Disable TF32 to prevent floating point drift + torch.backends.cuda.matmul.allow_tf32 = False + + # Initialize the FSDP model and optimizer + group = dist.distributed_c10d._get_default_group() + fsdp_model: FSDP = self._get_wrapped_model( + group, cuda_first=False, add_bn=False, + config={ + "cpu_offload": cpu_offload, + "backward_prefetch": backward_prefetch, + }, + ) # disable BN since the test uses varying batch sizes + fsdp_model.eval() # disable dropout + device = torch.device("cuda") + optim = torch.optim.SGD( + fsdp_model.parameters(), lr=0.01, momentum=0.9, + ) + + # Generate the sequence of batches, each containing the same data + # but permuted + def permute_tensor(x: torch.Tensor): + return x.view(-1)[torch.randperm(x.numel())].view_as(x) + + batch: Tuple[torch.Tensor, ...] = \ + fsdp_model.module.get_input(device) + batches: List[Tuple[torch.Tensor, ...]] = [batch] + num_iters_to_acc = sum(config.num_iters for config in configs) + for _ in range(num_iters_to_acc - 1): + batches.append(tuple(permute_tensor(t) for t in batch)) + for (batch1, batch2) in itertools.combinations(batches, r=2): + for t1, t2 in zip(batch1, batch2): + assert not torch.all(t1 == t2), \ + "Check the test to make sure that batches are distinct" + + # Concatenate the batches along the given batch dimension + concat_batch: Tuple[torch.Tensor, ...] = tuple( + torch.cat(ts, dim=batch_dim) for ts in zip(*batches) + ) + + # Establish reference gradients using the concatenated batch + fsdp_model.zero_grad() + output = fsdp_model(*concat_batch) + ref_loss = fsdp_model.module.get_loss(concat_batch, output) + ref_loss.backward() + ref_grads = [ + p.grad.detach().clone() for p in fsdp_model.parameters() + ] + + # Compute and accumulate the gradients + fsdp_model.zero_grad() + losses = [] + batch_idx = 0 + for config in configs: + sync_context = fsdp_model.no_sync() if config.use_no_sync \ + else contextlib.suppress() + with sync_context: + for _ in range(config.num_iters): + if batch_idx == num_iters_to_acc - 1: + break # always sync on the last iteration + batch = batches[batch_idx] + batch_idx += 1 + output = fsdp_model(*batch) + loss = fsdp_model.module.get_loss(batch, output) + loss.backward() + losses.append(loss) + output = fsdp_model(*batches[-1]) + loss = fsdp_model.module.get_loss(batches[-1], output) + loss.backward() + losses.append(loss) + acc_loss = sum(losses) + acc_grads = [ + p.grad.detach().clone() for p in fsdp_model.parameters() + ] + + # Compare the losses and gradients + torch.testing.assert_close(ref_loss, acc_loss) + self.assertEqual(len(ref_grads), len(acc_grads)) + for ref_grad, acc_grad in zip(ref_grads, acc_grads): + self.assertEqual(ref_grad.device, acc_grad.device) + self.assertEqual(ref_grad.size(), acc_grad.size()) + self.assertEqual(ref_grad.dtype, acc_grad.dtype) + torch.testing.assert_close(ref_grad, acc_grad) + + # Check that the optimizer step does not error + optim.step() + finally: + torch.backends.cuda.matmul.allow_tf32 = old_allow_tf32 + + @skip_if_lt_x_gpu(2) + @parametrize( + "configs", + [ + _GradAccConfigs([ + _GradAccConfig(use_no_sync=True, num_iters=3), + _GradAccConfig(use_no_sync=False, num_iters=3), + _GradAccConfig(use_no_sync=True, num_iters=3), + ]), + _GradAccConfigs([ + 
_GradAccConfig(use_no_sync=False, num_iters=3), + _GradAccConfig(use_no_sync=True, num_iters=3), + _GradAccConfig(use_no_sync=False, num_iters=3), + ]), + ] + ) + @parametrize( + "cpu_offload", + [CPUOffload(offload_params=False), CPUOffload(offload_params=True)], + ) + @parametrize( + "backward_prefetch", + [BackwardPrefetch.BACKWARD_PRE, BackwardPrefetch.BACKWARD_POST, None], + ) + def test_grad_acc( + self, + configs: _GradAccConfigs, + cpu_offload: CPUOffload, + backward_prefetch: Optional[BackwardPrefetch], + ): + """ + Tests gradient accumulation. + + This exercises gradient accumulation inside and outside the + ``no_sync()`` context manager, in particular by interleaving the two. + It tests both interleaving starting with (and ending with, resp.) + inside versus outside ``no_sync()`` to ensure that initial conditions + (and final conditions, resp.) do not affect the correctness. This test + also checks for compatibility with the CPU offload and backward + prefetch options. + + NOTE: Gradient accumulation without using the ``no_sync()`` context + manager is not currently compatible with CPU offloading, so those tests + are vacuous. + """ + self._test_grad_acc( + batch_dim=1, + configs=configs.configs, + cpu_offload=cpu_offload, + backward_prefetch=backward_prefetch, + ) + + +instantiate_parametrized_tests(TestGradAcc) + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/fsdp/test_fsdp_ignored_modules.py b/test/distributed/fsdp/test_fsdp_ignored_modules.py new file mode 100644 index 000000000000..6c653b92ece4 --- /dev/null +++ b/test/distributed/fsdp/test_fsdp_ignored_modules.py @@ -0,0 +1,136 @@ +# Owner(s): ["oncall: distributed"] + +import sys + +import torch +from torch import distributed as dist +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.testing._internal.common_distributed import skip_if_lt_x_gpu +from torch.testing._internal.common_fsdp import FSDPTest +from torch.testing._internal.common_utils import ( + TEST_WITH_DEV_DBG_ASAN, + instantiate_parametrized_tests, + run_tests, +) + +if not dist.is_available(): + print("Distributed not available, skipping tests", file=sys.stderr) + sys.exit(0) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + +class Model(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.layer0 = torch.nn.Linear(3, 5) + self.layer1 = torch.nn.Sequential( + torch.nn.Linear(5, 5), + torch.nn.Linear(5, 4), + torch.nn.Linear(4, 4), + ) + self.layer2 = torch.nn.Linear(4, 1) + + def forward(self, x): + return self.layer2(self.layer1(self.layer0(x))) + + def get_input(self, device): + return (torch.randn((8, 3)).to(device),) + + def get_loss(self, input, output): + return output.sum() + + def run_backward(self, loss): + loss.backward() + +class TestFSDPIgnoredModules(FSDPTest): + @skip_if_lt_x_gpu(2) + def test_ignored_modules_transformer(self): + """Tests that ignored modules' parameters are not flattened for a + transformer model with shared parameters.""" + # Initialize an FSDP-wrapped transformer model that has FSDP ignore + # the `nn.Transformer` module's parameters + group = dist.distributed_c10d._get_default_group() + wrapped_model = self._get_wrapped_model(group, ignore_modules=True) + # Check that the wrapped model's flattened parameter does not include + # the ignored transformer module's parameters + nonwrapped_model = self._get_nonwrapped_model(group) + total_numel = 
sum(p.numel() for p in nonwrapped_model.parameters()) + ignored_numel = sum( + p.numel() for p in nonwrapped_model.transformer.parameters() + ) + nonignored_numel = total_numel - ignored_numel + with FSDP.summon_full_params(wrapped_model): + flat_param_numel = wrapped_model.params[0].numel() + self.assertEqual(flat_param_numel, nonignored_numel) + # Check that we can run a few iterations + device = torch.device("cuda") + optim = torch.optim.Adam(wrapped_model.parameters(), lr=1e-3) + for _ in range(3): + inp = wrapped_model.module.get_input(device) + output = wrapped_model(*inp) + loss = wrapped_model.module.get_loss(inp, output).to(device) + wrapped_model.module.run_backward(loss) + optim.step() + + @skip_if_lt_x_gpu(2) + def test_ignored_modules_nested(self): + """Tests that passing a module with nested FSDP modules does not + error and still ignores non-FSDP modules' parameters.""" + # Initialize an FSDP-wrapped nested model that first wraps the nested + # sequential's middle linear layer (`layer1[1]`) and then wraps the + # overall model while ignoring the nested sequential (`layer1`) + model = Model().cuda() + model.layer1[1] = FSDP(model.layer1[1]) + wrapped_model = FSDP(model, ignored_modules=[model.layer1]) + # Check that the wrapped model's flattened parameter does not include + # the ignored nested sequential's parameters + nonwrapped_model = Model() + total_numel = sum(p.numel() for p in nonwrapped_model.parameters()) + ignored_numel = sum( + p.numel() for p in nonwrapped_model.layer1.parameters() + ) + nonignored_numel = total_numel - ignored_numel + with FSDP.summon_full_params(wrapped_model): + flat_param_numel = wrapped_model.params[0].numel() + self.assertEqual(flat_param_numel, nonignored_numel) + # Check that we can run a few iterations + device = torch.device("cuda") + optim = torch.optim.Adam(wrapped_model.parameters(), lr=1e-3) + for _ in range(3): + inp = wrapped_model.get_input(device) + output = wrapped_model(*inp) + loss = wrapped_model.get_loss(inp, output).to(device) + wrapped_model.run_backward(loss) + optim.step() + + @skip_if_lt_x_gpu(2) + def test_ignored_modules_invalid(self): + """Tests that passing an FSDP module as an ignored module or the + top-level module itself errors.""" + model = Model() + model.layer1 = FSDP(model.layer1) + # Passing an FSDP module as an ignored module should error + with self.assertRaises( + ValueError, + msg="`ignored_modules` should not include FSDP modules", + ): + FSDP(model, ignored_modules=[model.layer1]) + with self.assertRaises( + ValueError, + msg="Trying to ignore the top-level module passed into the FSDP " + "constructor itself will result in all parameters being ignored " + "and is not supported", + ): + FSDP(model, ignored_modules=[model]) + + +instantiate_parametrized_tests(TestFSDPIgnoredModules) + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/fsdp/test_fsdp_meta.py b/test/distributed/fsdp/test_fsdp_meta.py new file mode 100644 index 000000000000..1aa426800db6 --- /dev/null +++ b/test/distributed/fsdp/test_fsdp_meta.py @@ -0,0 +1,328 @@ +# Owner(s): ["oncall: distributed"] + +import sys + +import torch +import torch.distributed as dist +import torch.nn as nn +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp.wrap import always_wrap_policy as always_wrap +from torch.distributed.fsdp.wrap import wrap, enable_wrap +from torch.testing._internal.common_fsdp import ( + FSDPTest, +) +from torch.testing._internal.common_utils import ( + 
TEST_WITH_DEV_DBG_ASAN, + run_tests, + parametrize, + instantiate_parametrized_tests, + sandcastle_skip_if, +) +from torch.testing._internal.common_distributed import ( + skip_if_lt_x_gpu, +) + +_TORCHDISTX_AVAIL = True +try: + from torchdistx import deferred_init +except ImportError: + _TORCHDISTX_AVAIL = False + + +if not dist.is_available(): + print("Distributed not available, skipping tests", file=sys.stderr) + sys.exit(0) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + +def _reset_params_if_meta(is_meta, model): + # For torchdistX init, we don't need to call reset_params, as + # deferred_init(model).materialize() is equivalent to model(). + if is_meta: + model.reset_parameters() + +class MyLinear(nn.Linear): + """ + Linear layer with deterministic reset_parameters for testing. + """ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def reset_parameters(self, *args, **kwargs): + with torch.no_grad(): + self.weight.fill_(1) + +class MyModel(nn.Module): + def __init__(self, device): + super().__init__() + self.lin1 = MyLinear(2, 2, bias=False, device=device) + self.lin2 = MyLinear(2, 2, bias=False, device=device) + + def forward(self, x): + return self.lin2(self.lin1(x)) + + def reset_parameters(self, *args, **kwargs): + for m in [self.lin1, self.lin2]: + if not isinstance(m, FSDP): + m.reset_parameters() + + +class NestedModel(nn.Module): + def __init__(self, device): + super().__init__() + self.lin1 = MyLinear(2, 2, bias=False, device=device) + self.lin1 = wrap(self.lin1) + self.lin2 = MyLinear(2, 2, bias=False, device=device) + self.l3 = MyModel(device=device) + self.l3 = wrap(self.l3) + + def forward(self, x): + return self.l3(self.lin2(self.lin1(x))) + + def reset_parameters(self): + for m in [self.lin1, self.lin2, self.l3]: + if not isinstance(m, FSDP): + m.reset_parameters() + +def _init_with_reset_params(module): + """ + to_empty + reset_parameters() init function example for modules + initialized with device="meta" + """ + is_meta = any(t.is_meta for t in module.parameters()) + if is_meta: + module.to_empty(device=torch.cuda.current_device()) + with torch.no_grad(): + module.reset_parameters() + +def _init_with_torchdistX(module): + """ + torchdistX-based deferred module initialization function example + using ``materialize_module``. + """ + assert _TORCHDISTX_AVAIL + + def check_fn(k): + return not isinstance(k, FSDP) + + deferred_init.materialize_module(module, check_fn=check_fn) + +class TestFSDPWithMetaDevice(FSDPTest): + @property + def world_size(self): + return 2 + + @property + def process_group(self): + return dist.distributed_c10d._get_default_group() + + def _compare_fsdp(self, fsdp1, fsdp2): + with FSDP.summon_full_params(fsdp1): + with FSDP.summon_full_params(fsdp2): + for p1, p2 in zip(fsdp1.parameters(), fsdp2.parameters()): + self.assertTrue(torch.allclose(p1, p2), f"{p1} vs {p2}") + + def _test_simple_model_with_meta_device(self, meta_module_fn, init_fn=None): + # Create model on meta device and wrap with FSDP. + model = meta_module_fn() + is_meta = next(model.parameters()).is_meta + fsdp_meta = FSDP( + model, + auto_wrap_policy=always_wrap, + param_init_fn=init_fn, + ) + + meta_opt = torch.optim.SGD(fsdp_meta.parameters(), lr=1e-3) + + # Test to make sure it is the same model parameters as regular FSDP + # approach.
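# [Editor's note] Illustrative sketch, not part of the original diff: the
# meta-device pattern this test exercises, written as a user would. It assumes
# an initialized process group and reuses `MyModel`, `always_wrap`, and
# `_init_with_reset_params` from this file; the helper name is hypothetical.
def _editor_sketch_meta_device_init() -> FSDP:
    # Constructing on the meta device allocates no real storage; FSDP then
    # materializes and initializes the parameters via `param_init_fn` before
    # sharding them.
    meta_model = MyModel(device="meta")
    return FSDP(
        meta_model,
        auto_wrap_policy=always_wrap,
        param_init_fn=_init_with_reset_params,
    )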
+ regular = MyModel(device="cuda") + _reset_params_if_meta(is_meta, regular) + fsdp_regular = FSDP(regular, auto_wrap_policy=always_wrap) + regular_opt = torch.optim.SGD(fsdp_regular.parameters(), lr=1e-3) + + self._compare_fsdp(fsdp_meta, fsdp_regular) + inp = torch.randn(10, 2, device='cuda') + fsdp_meta(inp).sum().backward() + fsdp_regular(inp).sum().backward() + meta_opt.step() + regular_opt.step() + self._compare_fsdp(fsdp_meta, fsdp_regular) + + # Test that meta init works if all submodules are contained in only a + # single FSDP unit. + model = meta_module_fn() + fsdp_meta = FSDP(model, param_init_fn=init_fn) + meta_opt = torch.optim.SGD(fsdp_meta.parameters(), lr=1e-3) + regular = MyModel(device="cuda") + _reset_params_if_meta(is_meta, regular) + fsdp_regular = FSDP(regular, auto_wrap_policy=always_wrap) + regular_opt = torch.optim.SGD(fsdp_regular.parameters(), lr=1e-3) + + # Run a forward + backward pass + optimizer step + fsdp_meta(inp).sum().backward() + fsdp_regular(inp).sum().backward() + meta_opt.step() + regular_opt.step() + self._compare_fsdp(fsdp_meta, fsdp_regular) + + @skip_if_lt_x_gpu(2) + def test_simple_model_with_meta_device_reset_params(self): + def meta_module_fn(): + return MyModel(device="meta") + self._test_simple_model_with_meta_device( + meta_module_fn, _init_with_reset_params + ) + + @skip_if_lt_x_gpu(2) + def test_simple_model_with_meta_device_default_init(self): + def meta_module_fn(): + return MyModel(device="meta") + self._test_simple_model_with_meta_device(meta_module_fn) + + @skip_if_lt_x_gpu(2) + @sandcastle_skip_if( + not _TORCHDISTX_AVAIL, "Test requires torchdistX: https://github.com/pytorch/torchdistX" + ) + def test_simple_model_with_torchdistX_default_init(self): + def meta_module_fn(): + return deferred_init.deferred_init(MyModel, device="cuda") + + self._test_simple_model_with_meta_device(meta_module_fn) + + @skip_if_lt_x_gpu(2) + @sandcastle_skip_if( + not _TORCHDISTX_AVAIL, "Test requires torchdistX: https://github.com/pytorch/torchdistX" + ) + def test_simple_model_with_torchdistX_init_fn(self): + def meta_module_fn(): + return deferred_init.deferred_init(MyModel, device="cuda") + + self._test_simple_model_with_meta_device(meta_module_fn, init_fn=_init_with_torchdistX) + + def _test_nested_model_with_meta_device(self, auto_wrap, meta_module_fn, init_fn=None): + if auto_wrap: + module = meta_module_fn() + is_meta = next(module.parameters()).is_meta + fsdp_meta = FSDP( + module, + auto_wrap_policy=always_wrap, + param_init_fn=init_fn, + ) + meta_opt = torch.optim.SGD(fsdp_meta.parameters(), lr=1e-3) + module_regular = NestedModel(device="cuda") + _reset_params_if_meta(is_meta, module_regular) + fsdp_regular = FSDP( + module_regular, + auto_wrap_policy=always_wrap, + ) + regular_opt = torch.optim.SGD(fsdp_regular.parameters(), lr=1e-3) + else: + with enable_wrap( + wrapper_cls=FSDP, param_init_fn=init_fn, + ): + module = meta_module_fn() + is_meta = next(module.parameters()).is_meta + # Non FSDP modules will still be initialized because they bubble up + # to be part of a larger FSDP unit. + fsdp_meta = wrap(module) + meta_opt = torch.optim.SGD(fsdp_meta.parameters(), lr=1e-3) + + # Init and reset parameters before wrapping so that reset_params + # matches up with meta device's initialization. 
+ module_regular = NestedModel(device="cuda") + _reset_params_if_meta(is_meta, module_regular) + with enable_wrap(wrapper_cls=FSDP): + module_regular.lin1 = wrap(module_regular.lin1) + module_regular.l3 = wrap(module_regular.l3) + fsdp_regular = wrap(module_regular) + regular_opt = torch.optim.SGD(fsdp_regular.parameters(), lr=1e-3) + + # Compare it before training + self._compare_fsdp(fsdp_meta, fsdp_regular) + inp = torch.randn(10, 2, device='cuda') + fsdp_meta(inp).sum().backward() + fsdp_regular(inp).sum().backward() + meta_opt.step() + regular_opt.step() + self._compare_fsdp(fsdp_meta, fsdp_regular) + + @skip_if_lt_x_gpu(2) + @parametrize("auto_wrap", [True, False]) + def test_nested_model_with_meta_device_reset_params(self, auto_wrap): + def meta_module_fn(): + return NestedModel(device="meta") + + self._test_nested_model_with_meta_device( + auto_wrap=auto_wrap, meta_module_fn=meta_module_fn, init_fn=_init_with_reset_params + ) + + @skip_if_lt_x_gpu(2) + @parametrize("auto_wrap", [True, False]) + def test_nested_model_with_meta_device_default_init(self, auto_wrap): + def meta_module_fn(): + return NestedModel(device="meta") + + self._test_nested_model_with_meta_device( + auto_wrap=auto_wrap, meta_module_fn=meta_module_fn, + ) + + @skip_if_lt_x_gpu(2) + @sandcastle_skip_if( + not _TORCHDISTX_AVAIL, "Test requires torchdistX: https://github.com/pytorch/torchdistX" + ) + @parametrize("auto_wrap", [True, False]) + def test_nested_model_with_torchdistX_default_init(self, auto_wrap): + def meta_module_fn(): + return deferred_init.deferred_init(NestedModel, device="cuda") + + self._test_nested_model_with_meta_device( + auto_wrap=auto_wrap, meta_module_fn=meta_module_fn + ) + + @skip_if_lt_x_gpu(2) + @sandcastle_skip_if( + not _TORCHDISTX_AVAIL, "Test requires torchdistX: https://github.com/pytorch/torchdistX" + ) + @parametrize("auto_wrap", [True, False]) + def test_nested_model_with_torchdistX_init_fn(self, auto_wrap): + def meta_module_fn(): + return deferred_init.deferred_init(NestedModel, device="cuda") + + self._test_nested_model_with_meta_device( + auto_wrap=auto_wrap, meta_module_fn=meta_module_fn, init_fn=_init_with_torchdistX, + ) + + def _test_bad_arg(self, meta_module_fn): + mod = meta_module_fn() + with self.assertRaisesRegex(ValueError, "to be callable"): + FSDP(mod, param_init_fn=42) + + @skip_if_lt_x_gpu(2) + @sandcastle_skip_if( + not _TORCHDISTX_AVAIL, "Test requires torchdistX: https://github.com/pytorch/torchdistX" + ) + def test_bad_arg_torchdistx(self): + def meta_module_fn(): + return deferred_init.deferred_init(NestedModel, "cuda") + + self._test_bad_arg(meta_module_fn) + + @skip_if_lt_x_gpu(2) + def test_bad_arg_meta(self): + def meta_module_fn(): + return NestedModel(device="meta") + + self._test_bad_arg(meta_module_fn) + + +instantiate_parametrized_tests(TestFSDPWithMetaDevice) + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/fsdp/test_fsdp_misc.py b/test/distributed/fsdp/test_fsdp_misc.py new file mode 100644 index 000000000000..4d486d7b0407 --- /dev/null +++ b/test/distributed/fsdp/test_fsdp_misc.py @@ -0,0 +1,253 @@ +# Owner(s): ["oncall: distributed"] + +import sys +from contextlib import suppress +import functools + +import torch +import torch.distributed as dist +import torch.nn as nn +from torch.nn import TransformerEncoderLayer, TransformerDecoderLayer +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.testing._internal.common_distributed import ( + skip_if_lt_x_gpu, +) +from 
torch.testing._internal.common_fsdp import ( + FSDPTest, + NestedWrappedModule, + FSDPInitMode, + TransformerWithSharedParams, + _validate, +) +from torch.testing._internal.common_utils import ( + instantiate_parametrized_tests, + parametrize, + TEST_WITH_DEV_DBG_ASAN, + run_tests, +) + +from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy + +if not dist.is_available(): + print("Distributed not available, skipping tests", file=sys.stderr) + sys.exit(0) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + +class TestFSDPMisc(FSDPTest): + @property + def world_size(self): + return 2 + + @property + def process_group(self): + return dist.distributed_c10d._get_default_group() + + @skip_if_lt_x_gpu(2) + def test_device_id_auto_wrap(self): + """ + Test auto wrapping propagates the device id. + """ + model = TransformerWithSharedParams(group=self.process_group) + my_auto_wrap_policy = functools.partial( + transformer_auto_wrap_policy, + transformer_layer_cls={TransformerEncoderLayer, TransformerDecoderLayer} + ) + wrapped = FSDP( + model, + auto_wrap_policy=my_auto_wrap_policy, + device_id=torch.cuda.current_device() + ) + # All FSDP instances should have device_id set + for m in FSDP.fsdp_modules(wrapped): + self.assertEqual(m.device_id, torch.device("cuda", torch.cuda.current_device())) + + @skip_if_lt_x_gpu(2) + @parametrize("use_index", [True, False]) + def test_fsdp_device_id(self, use_index): + """ + If CPU module is passed into FSDP with device_id + argument, it is moved to the GPU with that device_id. + """ + dev_id = ( + torch.cuda.current_device() if use_index + else torch.device("cuda", torch.cuda.current_device()) + ) + + def _check_device_matches(fsdp, dev_id): + devices = {p.device for p in fsdp.parameters()} + self.assertEqual(1, len(devices)) + found_dev = devices.pop() + if use_index and not isinstance(dev_id, torch.device): + dev_id = torch.device("cuda", dev_id) + self.assertEqual(found_dev, dev_id) + + mod = NestedWrappedModule( + group=self.process_group, + wrap_fsdp=True, + wrap_everything=True, + fsdp_init_mode=FSDPInitMode.CUDA_NEVER, + device_id=dev_id + ) + fsdp = FSDP(mod, device_id=dev_id) + # Check FSDP parameters are moved. + _check_device_matches(fsdp, dev_id) + # device_id matching module device before FSDP construction + # should not throw errors. + mod = NestedWrappedModule( + group=self.process_group, + wrap_fsdp=True, + wrap_everything=True, + fsdp_init_mode=FSDPInitMode.CUDA_BEFORE, + device_id=dev_id + ) + fsdp = FSDP(mod, device_id=dev_id) + _check_device_matches(fsdp, dev_id) + # Passing in torch.device("cuda") should work. + regex = "does not have explicit index" + context = self.assertWarnsRegex( + expected_warning=UserWarning, expected_regex=regex + ) + with context: + mod = NestedWrappedModule( + group=self.process_group, + wrap_fsdp=True, + wrap_everything=True, + fsdp_init_mode=FSDPInitMode.CUDA_BEFORE, + device_id=torch.device("cuda") + ) + fsdp = FSDP(mod, device_id=torch.device("cuda")) + _check_device_matches(fsdp, torch.device("cuda", torch.cuda.current_device())) + + @skip_if_lt_x_gpu(2) + def test_module_device_mismatches_device_id(self): + """ + FSDP raises errors when module is on a GPU that does + not match device_id. 
+ """ + context = ( + self.assertRaisesRegex( + RuntimeError, + f"on rank {self.rank}.*cuda:0, but is on cuda:{self.rank}" + ) if self.rank != 0 else suppress() + ) + with context: + mod = NestedWrappedModule( + group=self.process_group, + wrap_fsdp=True, + wrap_everything=True, + # Would move module to current cuda device before + # wrapping with FSDP + fsdp_init_mode=FSDPInitMode.CUDA_BEFORE, + # Rank 1 is given device id 0, but model is on cuda:1, + # should throw errors. + device_id=0 + ) + + @skip_if_lt_x_gpu(2) + def test_multi_device_not_supported(self): + """ + FSDP throws appropriate error when we wrap multi-device module. + """ + class MyModule(nn.Module): + def __init__(self): + super().__init__() + self.a = nn.Linear(1, 1).cuda() + self.b = nn.Linear(1, 1) + + with self.assertRaisesRegex( + RuntimeError, "FSDP only supports single device modules" + ): + FSDP(MyModule()) + + @skip_if_lt_x_gpu(2) + def test_no_params(self): + """ + Test that device_id and cpu init work if module has no params + (they are effective noops, but ensure FSDP does not assume module + has parameters during init) + """ + # Test CPU + no_params = nn.ReLU() + module = FSDP(no_params) + # Test CUDA + no_params = nn.ReLU().cuda() + module = FSDP(no_params) + # Test CPU + device_id + no_params = nn.ReLU() + module = FSDP(no_params, device_id=torch.cuda.current_device()) + # For modules with no params, wrong device_id will raise error about + # inconsistency between compute_device and device_id, since compute_device + # is computed as torch.cuda.current_device when there are no params. + no_params = nn.ReLU().cuda() + context = ( + self.assertRaisesRegex( + AssertionError, + f"Inconsistent.*cuda:{self.rank} vs cuda:0" + ) + ) if self.rank != 0 else suppress() + with context: + module = FSDP(no_params, device_id=0) + + @skip_if_lt_x_gpu(2) + def test_fsdp_cpu_init_stays_on_cpu(self): + """ + Ensure that CPU model input stays on CPU + after FSDP init even though sharding, flattening + is run on GPU. + """ + torch.cuda.set_device(self.rank) + regex = "Module is input on CPU" + context = self.assertWarnsRegex( + expected_warning=UserWarning, expected_regex=regex + ) + with context: + mod = NestedWrappedModule( + group=self.process_group, + wrap_fsdp=True, + wrap_everything=True, + fsdp_init_mode=FSDPInitMode.CUDA_NEVER, + ) + fsdp = FSDP(mod) + devices = {p.device for p in fsdp.parameters()} + self.assertEqual(1, len(devices)) + self.assertEqual(torch.device("cpu"), devices.pop()) + fsdp = fsdp.cuda() + # Ensure fwd + backward can be performed after moving to CUDA. + # CPU input also tests that input is correctly moved to appropriate + # CUDA device. + inp = mod.get_input(device=torch.device("cpu")) + fsdp(inp[0]).sum().backward() + + @skip_if_lt_x_gpu(2) + def test_fsdp_same_model_across_ranks(self): + """ + FSDP broadcasts model from rank 0 to ensure it starts off with the same + values. + """ + class MyModel(nn.Module): + def __init__(self, rank): + super().__init__() + # Seed via rank to make model different across ranks + torch.manual_seed(rank) + torch.cuda.manual_seed(rank) + self.lin = nn.Linear(10, 10, bias=False) + self.register_buffer("buffer", torch.ones(1) * rank) + + m = MyModel(self.rank).cuda() + _validate(m, process_group=self.process_group, assert_fn=self.assertNotEqual) + # Passing sync_module_states into FSDP makes model the same during init. 
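# [Editor's note] Minimal sketch, not part of the original diff, of how
# `sync_module_states=True` is used outside of this test. It assumes a default
# process group has already been initialized (e.g. via dist.init_process_group);
# the helper name is hypothetical.
def _editor_sketch_sync_module_states(module: nn.Module) -> FSDP:
    # During FSDP construction, rank 0's parameters and buffers are broadcast
    # to all other ranks, so every rank starts from identical module states.
    return FSDP(module, sync_module_states=True)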
+ fsdp = FSDP(m, sync_module_states=True) + with fsdp.summon_full_params(fsdp): + _validate(fsdp, process_group=self.process_group, assert_fn=self.assertEqual) + +instantiate_parametrized_tests(TestFSDPMisc) + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/fsdp/test_fsdp_mixed_precision.py b/test/distributed/fsdp/test_fsdp_mixed_precision.py new file mode 100644 index 000000000000..f0bac76fd1d0 --- /dev/null +++ b/test/distributed/fsdp/test_fsdp_mixed_precision.py @@ -0,0 +1,665 @@ +# Owner(s): ["oncall: distributed"] + +import sys +import contextlib +from functools import partial +from itertools import product + +import torch +import torch.cuda.nccl as nccl +import torch.nn as nn +import torch.nn.functional as F +from torch import distributed as dist +from torch.distributed.fsdp import ( + FullyShardedDataParallel as FSDP, + CPUOffload, + MixedPrecision, + BackwardPrefetch, + ShardingStrategy, +) +from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy +from torch.nn.modules.batchnorm import _BatchNorm +from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler +from torch.testing._internal.common_distributed import skip_if_lt_x_gpu +from torch.testing._internal.common_fsdp import ( + FSDPTest, + subtest_name, +) +from torch.testing._internal.common_utils import ( + instantiate_parametrized_tests, + parametrize, + run_tests, + TEST_WITH_DEV_DBG_ASAN, + sandcastle_skip_if, +) +from torch.testing._internal.common_cuda import CUDA11OrLater + +try: + import torchvision + HAS_TORCHVISION = True +except ImportError: + HAS_TORCHVISION = False + +skipIfNoTorchVision = sandcastle_skip_if(not HAS_TORCHVISION, "no torchvision") + + +if not dist.is_available(): + print("Distributed not available, skipping tests", file=sys.stderr) + sys.exit(0) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + +# Various mixed precision configs to test under. +default_mp = MixedPrecision( + param_dtype=torch.float16, + buffer_dtype=torch.float16, + reduce_dtype=torch.float16, +) + +# Params and buffers are not cast, comm only happens +# in reduced precision. +mp_only_reduce = MixedPrecision(reduce_dtype=torch.float16) + +# Only parameters are cast (thus comm should happen in the param_dtype precision) +mp_only_param_and_buf = MixedPrecision(param_dtype=torch.float16, buffer_dtype=torch.float16) + +# Nothing is cast (thus param, comm, grad, and buffer should be in the full precision) +mp_no_mixed_precision = MixedPrecision() + +nccl_supports_bf16 = ( + CUDA11OrLater and dist.is_nccl_available() and nccl.version() >= (2, 10) +) + +mp_configs = [default_mp, mp_only_reduce, mp_only_param_and_buf, mp_no_mixed_precision] +if nccl_supports_bf16: + mp_diff_buffer_and_reduce = MixedPrecision( + param_dtype=torch.float16, + buffer_dtype=torch.bfloat16, + reduce_dtype=torch.float32 + ) + mp_configs.extend([mp_diff_buffer_and_reduce]) + +# Buffer original dtype, which can differ from model params. 
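# [Editor's note] Illustrative sketch, not part of the original diff: how the
# `MixedPrecision` configs defined above are passed to FSDP, mirroring the
# usage later in this file. The wrapped module here is a hypothetical example,
# and a CUDA device plus an initialized process group are assumed. The
# `_BUFFER_ORIG_DTYPE` constant that the comment above refers to follows
# immediately after this sketch.
def _editor_sketch_mixed_precision_usage() -> FSDP:
    # Compute in fp16, keep buffers in fp16, and reduce gradients in fp16.
    mp = MixedPrecision(
        param_dtype=torch.float16,
        buffer_dtype=torch.float16,
        reduce_dtype=torch.float16,
    )
    return FSDP(nn.Linear(10, 10).cuda(), mixed_precision=mp)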
+_BUFFER_ORIG_DTYPE = torch.float64 + +params = "mp_config,cpu_offload,backward_prefetch,full_precision_param_dtype,sharded_grad_scaler" +cpu_offload_config = [ + CPUOffload(offload_params=True), CPUOffload(offload_params=False) +] +backward_prefetch_config = [ + BackwardPrefetch.BACKWARD_PRE, BackwardPrefetch.BACKWARD_POST +] +full_precision_param_dtype_config = [torch.float32, torch.float64] +sharded_grad_scaler = ["enable_sharded_grad_scaler", None] + +configs = list(product( + mp_configs, + cpu_offload_config, + backward_prefetch_config, + full_precision_param_dtype_config, + sharded_grad_scaler, +)) + +test_name_mapping = { + str(CPUOffload(offload_params=True)): "offload_true", + str(CPUOffload(offload_params=False)): "offload_false", + str(BackwardPrefetch.BACKWARD_PRE): "prefetch_pre", + str(BackwardPrefetch.BACKWARD_POST): "prefetch_post", + str(default_mp): "mp_fp16", + str(mp_only_reduce): "mp_only_reduce", + str(mp_only_param_and_buf): "mp_only_param_and_buf", + str(mp_no_mixed_precision): "mp_no_mp", + str(torch.float32): "fp32", + str(torch.float64): "fp64", + "enable_sharded_grad_scaler": "sharded_grad_scaler" +} + +if nccl_supports_bf16: + test_name_mapping.update({ + str(mp_diff_buffer_and_reduce): "mp_diff_buffer_reduce", + }) + +subtest_name = partial(subtest_name, test_name_mapping) + +_CURRENT_FULL_PRECISION_PARAM_DTYPE = None + +@contextlib.contextmanager +def patch_reduce_scatter(new_reduce_scatter, full_precision_param_dtype): + """ + Patches dist._reduce_scatter_base with a new reduce_scatter_base and + restores upon exiting. Used for validation of mixed precision + """ + orig_reduce_scatter = dist._reduce_scatter_base + dist._reduce_scatter_base = new_reduce_scatter + global _CURRENT_FULL_PRECISION_PARAM_DTYPE + _CURRENT_FULL_PRECISION_PARAM_DTYPE = full_precision_param_dtype + try: + yield + finally: + dist._reduce_scatter_base = orig_reduce_scatter + _CURRENT_FULL_PRECISION_PARAM_DTYPE = None + +class LinearMixedPrecision(nn.Module): + """ + A linear module with extra checks for mixed precision training. + """ + def __init__(self, param_dtype): + super().__init__() + self.lin = nn.Linear(10, 10, bias=False).to(param_dtype) + self.register_buffer('buffer', torch.randn((1, 2), dtype=_BUFFER_ORIG_DTYPE)) + self._orig_param_type = param_dtype + self._orig_buffer_dtype = _BUFFER_ORIG_DTYPE + + def forward(self, tup): + # Param and input should be the mixed precision type + inp, cls, fsdp, mp_config, full_precision_param_dtype = tup + expected_param_type = ( + mp_config.param_dtype if mp_config.param_dtype is not None + else self._orig_param_type + ) + expected_buffer_type = ( + mp_config.buffer_dtype if mp_config.buffer_dtype is not None + else self._orig_buffer_dtype + ) + cls.assertEqual(inp.dtype, expected_param_type) + # Buffer should be in specified precision as well. + cls.assertEqual(self.buffer.dtype, expected_buffer_type) + + # In FSDP, self.params should point to the right type. + num_active_fsdp = 0 + for fsdp_module in FSDP.fsdp_modules(fsdp): + fsdp_managed_params = fsdp_module.params + # Single param assumption + cls.assertEqual(1, len(fsdp_managed_params)) + for param in fsdp_managed_params: + # FSDP unit is currently active if it is not using the param + # local shard. This supports both FULL_SHARD and SHARD_GRAD_OP + # cases. In FULL_SHARD, we have the additional property that + # param._full_param_padded has not been freed. 
+ is_fsdp_unit_active = ( + param._is_sharded and + (param.data.data_ptr() != param._local_shard.data_ptr()) + ) + if is_fsdp_unit_active: + num_active_fsdp += 1 + # This FSDP unit is active; verify the param points to the mixed precision dtype + cls.assertEqual(param.dtype, expected_param_type) + # _rebuild_full_param should have also freed the fp16 shard. + # Shard is never allocated if param_dtype mixed precision is not + # enabled. + if mp_config.param_dtype is not None: + cls.assertEqual(0, param._mp_shard.storage().size()) + else: + cls.assertFalse(hasattr(param, '_mp_shard')) + elif param._is_sharded: + # This FSDP unit is not active as full param has been + # freed or not yet allocated. Ensure param points to full + # precision param. + cls.assertEqual(param.dtype, full_precision_param_dtype) + # We should have gotten at least one active FSDP unit for sharded + # (world size > 1) cases. For cases where param is not sharded + # (i.e. world_size == 1) it is a bit hard to check if FSDP unit is active + # as we'd always point to the local shard, so we rely on the forward + # pass self.lin(inp) working well and inp being reduced precision to + # implicitly validate that the param is indeed in the reduced precision. + if cls.world_size > 1: + cls.assertGreater(num_active_fsdp, 0) + + return (self.lin(inp), cls, fsdp, mp_config, full_precision_param_dtype) + + +class TestFSDPMixedPrecision(FSDPTest): + @property + def world_size(self): + raise ValueError("To be implemented by child classes") + + def _get_simple_nested_model(self, param_dtype, *fsdp_args, **fsdp_kwargs): + model = FSDP( + nn.Sequential( + FSDP(LinearMixedPrecision(param_dtype).cuda(), *fsdp_args, **fsdp_kwargs), + LinearMixedPrecision(param_dtype).cuda(), + ), + *fsdp_args, + **fsdp_kwargs, + ) + return model + + def _get_simple_model(self, param_dtype, *fsdp_args, **fsdp_kwargs): + model = FSDP(LinearMixedPrecision(param_dtype).cuda(), *fsdp_args, **fsdp_kwargs) + return model + + def _validate_no_mp_shard(self, fsdp_model): + """ + Validates that there is no mixed precision _mp_shard allocated + when it is not expected to be. + """ + fsdp_units = FSDP.fsdp_modules(fsdp_model) + for fsdp in fsdp_units: + for param in fsdp.params: + self.assertFalse(hasattr(param, '_mp_shard')) + + def _validate_mp_shard_freed(self, fsdp_model): + """ + Ensures that the mixed precision shard is freed for all FSDP units. + """ + fsdp_units = FSDP.fsdp_modules(fsdp_model) + for fsdp in fsdp_units: + for param in fsdp.params: + self.assertEqual(0, param._mp_shard.storage().size()) + + def _reduce_scatter_base_validate_mp( + self, + orig_reduce_scatter, + mp_config, + *args, + **kwargs + ): + """ + Performs dist._reduce_scatter_base but first verifies the mixed precision + settings. This is to test that mixed precision is working as expected + during the backward pass. In particular, it ensures that the gradients were + cast to the right type and that the communication happens in the right type. + """ + tensors = [] + for x in args: + if isinstance(x, torch.Tensor): + tensors.append(x) + for _, x in kwargs.items(): + if isinstance(x, torch.Tensor): + tensors.append(x) + + # reduce_dtype has higher priority than param_dtype, because mixed_precision + # supports overriding param_dtype with reduce_dtype to control the + # reduction precision. In the case where reduce_dtype == param_dtype + # this tests that gradients are in the expected precision as well. + # If reduce_dtype is not specified (is None) we comm. in the param_dtype + # if that is specified, otherwise full precision dtype.
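# [Editor's note] Worked illustration of the precedence described in the
# comment above (an editorial addition, not part of the original diff):
#   MixedPrecision(param_dtype=torch.float16, reduce_dtype=torch.float32)
#       -> gradients are reduced in float32
#   MixedPrecision(param_dtype=torch.float16)
#       -> gradients are reduced in float16 (falls back to param_dtype)
#   MixedPrecision()
#       -> gradients are reduced in the full precision parameter dtype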
+ expected_dtype = ( + mp_config.reduce_dtype if mp_config.reduce_dtype is not None + else ( + mp_config.param_dtype if mp_config.param_dtype is not None + else _CURRENT_FULL_PRECISION_PARAM_DTYPE + ) + ) + + # for t in tensors: + # print(f"tensor type {t.dtype} expected {expected_dtype}") + for t in tensors: + self.assertEqual(expected_dtype, t.dtype) + + return orig_reduce_scatter(*args, **kwargs) + + def _run_test_mixed_precision_e2e( + self, + mp_config, + cpu_offload, + backward_prefetch, + full_precision_param_dtype, + sharding_strategy, + sharded_grad_scaler, + ): + torch.cuda.set_device(self.rank) + fsdp_models = [ + self._get_simple_model( + param_dtype=full_precision_param_dtype, + sharding_strategy=sharding_strategy, + cpu_offload=cpu_offload, + mixed_precision=mp_config, + backward_prefetch=backward_prefetch + ), + self._get_simple_nested_model( + param_dtype=full_precision_param_dtype, + sharding_strategy=sharding_strategy, + cpu_offload=cpu_offload, + mixed_precision=mp_config, + backward_prefetch=backward_prefetch + ), + ] + for model in fsdp_models: + if not cpu_offload.offload_params: + model.cuda() + + # Patch reduce_scatter to add validation for mixed precision types. + orig_reduce_scatter = dist._reduce_scatter_base + test_reduce_scatter = partial( + self._reduce_scatter_base_validate_mp, orig_reduce_scatter, mp_config, + ) + with patch_reduce_scatter(test_reduce_scatter, full_precision_param_dtype): + scaler = ShardedGradScaler(enabled=sharded_grad_scaler) + optim = torch.optim.Adam(model.parameters()) + + for _ in range(3): + inp = torch.randn(3, 10, device='cuda', dtype=full_precision_param_dtype) + # Forward pass of LinearMixedPrecision check casting of + # inputs, params, buffers. + act, *_ = model( + (inp, self, model, mp_config, full_precision_param_dtype) + ) + # Buffers should be casted. + for buf in model.buffers(): + if mp_config.buffer_dtype is not None: + self.assertEqual(buf.dtype, mp_config.buffer_dtype) + else: + self.assertEqual(buf.dtype, _BUFFER_ORIG_DTYPE) + # p._mp_shard should be freed. + if model.params[0]._is_sharded: # i.e. world_size > 1 + # TODO: free the mixed precision shard after forward + # when world_size == 1 as well, currently when + # world_size == 1 it is only freed after backward. + if mp_config.param_dtype is not None: + self._validate_mp_shard_freed(model) + else: + # We never should have allocated an _mp_shard. + self._validate_no_mp_shard(model) + + loss = act.sum() + loss = scaler.scale(loss) + if mp_config.param_dtype is not None: + self.assertEqual(loss.dtype, mp_config.param_dtype) + else: + self.assertEqual(loss.dtype, full_precision_param_dtype) + # Will run patched reduce scatter that validates mixed_precision + # types in backward. + loss.backward() + # Buffers stay casted even after backwards. + for buf in model.buffers(): + if mp_config.buffer_dtype is not None: + self.assertEqual(buf.dtype, mp_config.buffer_dtype) + else: + self.assertEqual(buf.dtype, _BUFFER_ORIG_DTYPE) + # p._mp_shard should be freed. + if mp_config.param_dtype is not None: + self._validate_mp_shard_freed(model) + else: + self._validate_no_mp_shard(model) + + # Ensure params and grads are in full precision, + # as after fwd/backward we maintain full precision shards. 
+ for param in model.parameters(): + self.assertEqual(param.dtype, full_precision_param_dtype) + if param.grad is not None: + self.assertEqual(param.grad.dtype, full_precision_param_dtype) + + # Unscale the gradients and step + scaler.step(optim) + # Update the scale factor + scaler.update() + + # Summon full params should be in full precision + with model.summon_full_params(model): + # It is not expected for summon_full_params to allocate + # a mixed precision shard. + if mp_config.param_dtype is not None: + self._validate_mp_shard_freed(model) + else: + self._validate_no_mp_shard(model) + params = list(model.parameters()) + for p in params: + self.assertEqual(p.dtype, full_precision_param_dtype) + + # Note that buffers are cast only once and only restored + # to the original buffer dtype in state_dict, so + # summon_full_params is not expected to restore buffer + # types to their original. + named_buffers = dict(model.named_buffers()) + for v in named_buffers.values(): + if mp_config.buffer_dtype is not None: + self.assertEqual(v.dtype, mp_config.buffer_dtype) + else: + self.assertEqual(v.dtype, _BUFFER_ORIG_DTYPE) + + # state_dict should be in full precision + state_dict = {k: v.clone() for k, v in model.state_dict().items()} + for name, tensor in state_dict.items(): + # Parameters and buffers are checkpointed in their + # original dtypes, which may be different. + if name in named_buffers.keys(): + self.assertEqual(tensor.dtype, _BUFFER_ORIG_DTYPE) + else: + self.assertEqual( + tensor.dtype, full_precision_param_dtype, + f"{name}: {tensor.dtype} vs {full_precision_param_dtype}" + ) + + # After state_dict, buffer's dtype should have been restored + # to the mixed precision one. + for buf in model.buffers(): + if mp_config.buffer_dtype is not None: + self.assertEqual(buf.dtype, mp_config.buffer_dtype) + else: + self.assertEqual(buf.dtype, _BUFFER_ORIG_DTYPE) + + +class TestFSDPMixedPrecisionSharded(TestFSDPMixedPrecision): + + @property + def world_size(self): + return 2 + + @skip_if_lt_x_gpu(2) + def test_mixed_precision_no_reshard_after_forward(self): + # Note that we don't exercise all possible different configs so as to + # not increase test TTS too much. + mp = default_mp if not nccl_supports_bf16 else mp_diff_buffer_and_reduce + self._run_test_mixed_precision_e2e( + mp_config=mp, + cpu_offload=CPUOffload(offload_params=True), + backward_prefetch=None, + full_precision_param_dtype=torch.float64, + sharding_strategy=ShardingStrategy.SHARD_GRAD_OP, + sharded_grad_scaler=False, + ) + + @skip_if_lt_x_gpu(2) + @parametrize(params, configs, subtest_name) + def test_mixed_precision_e2e_full_shard( + self, + mp_config, + cpu_offload, + backward_prefetch, + full_precision_param_dtype, + sharded_grad_scaler, + ): + self._run_test_mixed_precision_e2e( + mp_config, + cpu_offload, + backward_prefetch, + full_precision_param_dtype, + ShardingStrategy.FULL_SHARD, + sharded_grad_scaler, + ) + + def _test_mixed_precision_embedding_table(self, mp_config): + # Basic test to ensure int inputs are not casted which would break + # modules such as embedding tables. 
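# [Editor's note] Illustrative sketch, not part of the original diff: why the
# integer inputs must be left uncast. `nn.Embedding` requires integral (Long)
# indices, so casting the input batch to `param_dtype` would raise an error.
# The helper below is hypothetical and only demonstrates the constraint.
def _editor_sketch_embedding_needs_long_inputs():
    emb = nn.Embedding(num_embeddings=10, embedding_dim=4)
    idx = torch.randint(0, 10, (3,))  # dtype=torch.int64; must stay integral
    return emb(idx)  # casting `idx` to torch.float16 here would fail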
+ param_dtype = mp_config.param_dtype or torch.float32 + orig_reduce_scatter = dist._reduce_scatter_base + test_reduce_scatter = partial( + self._reduce_scatter_base_validate_mp, orig_reduce_scatter, mp_config, + ) + with patch_reduce_scatter(test_reduce_scatter, param_dtype): + model = self._get_wrapped_model( + group=torch.distributed.distributed_c10d._get_default_group(), + config={"mixed_precision": mp_config} + ) + optim = torch.optim.SGD(model.parameters(), lr=0.1) + for _ in range(6): + inp = model.module.get_input(torch.device("cuda")) + # This would fail if we casted integer module inputs such as for + # embedding tables. + output = model(*inp) + loss = model.module.get_loss(inp, output).cuda() + self.assertEqual(loss.dtype, param_dtype) + model.module.run_backward(loss) + optim.step() + + @skip_if_lt_x_gpu(2) + def test_mp_embedding_reduce(self): + self._test_mixed_precision_embedding_table( + mp_config=MixedPrecision(reduce_dtype=torch.float16) + ) + + @skip_if_lt_x_gpu(2) + def test_mp_embedding_only_params_and_bufs(self): + self._test_mixed_precision_embedding_table( + mp_config=MixedPrecision( + param_dtype=torch.float16, + buffer_dtype=torch.float16, + ) + ) + + @skip_if_lt_x_gpu(2) + def test_mp_embedding_default(self): + default_mp_config = MixedPrecision( + param_dtype=torch.float16, + buffer_dtype=torch.float16, + reduce_dtype=torch.float16, + ) + self._test_mixed_precision_embedding_table(mp_config=default_mp_config) + + @skip_if_lt_x_gpu(2) + def test_mp_embedding_params_and_reduce_diff(self): + params_and_reduce_different = MixedPrecision( + param_dtype=torch.float16, + reduce_dtype=torch.float32, + buffer_dtype=torch.float16 + ) + self._test_mixed_precision_embedding_table(mp_config=params_and_reduce_different) + + @skip_if_lt_x_gpu(2) + @skipIfNoTorchVision + def test_mixed_precision_resnet(self): + """ + End to end test to ensure mixed precision + auto_wrap works + for ResNet model. + """ + resnet_model = torchvision.models.resnet50().cuda() + resnet_model = nn.SyncBatchNorm.convert_sync_batchnorm( + resnet_model, + process_group=dist.distributed_c10d._get_default_group() + ) + n_bn = sum(1 if isinstance(x, _BatchNorm) else 0 for x in resnet_model.modules()) + inp = torch.ones(1, 3, 1000, 1000, device='cuda') + mp_config = MixedPrecision( + param_dtype=torch.float16, + reduce_dtype=torch.float16, + buffer_dtype=torch.float16, + ) + fsdp = FSDP( + resnet_model, + auto_wrap_policy=size_based_auto_wrap_policy, + mixed_precision=mp_config + ) + # Batchnorm units should be wrapped individually. Validate this by + # ensuring there are equal no. of FSDP units that are BN as BN units + # in original resnet model. + fsdp_bn = 0 + for module in fsdp.fsdp_modules(fsdp): + wrapped_module = module.module.module + if isinstance(wrapped_module, _BatchNorm): + fsdp_bn += 1 + + self.assertEqual(fsdp_bn, n_bn) + # Would throw type mismatch issue without mixed precision autowrapping. 
+ loss = fsdp(inp).sum() + loss.backward() + + @skip_if_lt_x_gpu(2) + @parametrize("convert_sync_bn", [True, False]) + def test_mp_batchnorm(self, convert_sync_bn): + class BatchNormNet(nn.Module): + def __init__(self, affine=True): + super(BatchNormNet, self).__init__() + self.fc1 = nn.Linear(2, 40, bias=False) + self.bn = nn.BatchNorm1d(4, affine=affine) + self.fc2 = nn.Linear(40, 4, bias=False) + + def forward(self, x): + x = torch.reshape(self.fc1(x), (-1, 4, 10)) + x = self.bn(x) + x = torch.reshape(x, (-1, 40)) + x = self.fc2(x) + return F.softmax(x, dim=1) + + def never_wrap_policy(*args, **kwargs): + return False + + net = BatchNormNet().cuda() + if convert_sync_bn: + net = nn.SyncBatchNorm.convert_sync_batchnorm(net) + # FSDP detects that mixed precision + batchnorm will cause issues + # and thus wrap batchnorm in a distinct FSDP unit that does not + # use mixed precision. + mp_config = MixedPrecision( + param_dtype=torch.float16, + reduce_dtype=torch.float16, + buffer_dtype=torch.float16, + ) + with self.assertWarnsRegex( + expected_warning=UserWarning, + expected_regex="BatchNorm units will be wrapped as a separate" + ): + model = FSDP( + net, + mixed_precision=mp_config, + auto_wrap_policy=never_wrap_policy, + ) + + bn = model.bn + self.assertTrue(isinstance(bn, FSDP)) + # policy should not have wrapped any other submodules + self.assertFalse(isinstance(model.fc1, FSDP)) + self.assertFalse(isinstance(model.fc2, FSDP)) + self.assertEqual(None, bn.mixed_precision) + self.assertNotEqual(None, model.mixed_precision) + + inp = torch.randn((1, 2), device='cuda') + # Without FSDP BN mixed precision fix, this would result in + # RuntimeError: Expected counts to have type Half but got Float + # for syncBN + model(inp).sum().backward() + + +class TestFSDPMixedPrecisionUnsharded(TestFSDPMixedPrecision): + """ + Smaller test suite for unshared param (i.e. world_size == 1) case. + """ + @property + def world_size(self): + return 1 + + @skip_if_lt_x_gpu(1) + def test_mixed_precision_no_reshard_after_forward(self): + # Note that we don't exercise all possible different configs so as to + # not increase test TTS too much. 
+ mp = default_mp if not nccl_supports_bf16 else mp_diff_buffer_and_reduce + self._run_test_mixed_precision_e2e( + mp_config=mp, + cpu_offload=CPUOffload(offload_params=True), + backward_prefetch=None, + full_precision_param_dtype=torch.float64, + sharding_strategy=ShardingStrategy.SHARD_GRAD_OP, + sharded_grad_scaler=False, + ) + + @skip_if_lt_x_gpu(1) + def test_mixed_precision_e2e_full_shard(self): + mp = default_mp if not nccl_supports_bf16 else mp_diff_buffer_and_reduce + self._run_test_mixed_precision_e2e( + mp_config=mp, + cpu_offload=CPUOffload(offload_params=True), + backward_prefetch=None, + full_precision_param_dtype=torch.float64, + sharding_strategy=ShardingStrategy.FULL_SHARD, + sharded_grad_scaler=False, + ) + +instantiate_parametrized_tests(TestFSDPMixedPrecisionSharded) + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/fsdp/test_fsdp_multiple_forward.py b/test/distributed/fsdp/test_fsdp_multiple_forward.py index e0a3ccea16c9..c9afbd465f28 100644 --- a/test/distributed/fsdp/test_fsdp_multiple_forward.py +++ b/test/distributed/fsdp/test_fsdp_multiple_forward.py @@ -66,7 +66,7 @@ def _dist_train(self, wrap_fsdp): optim.zero_grad() if wrap_fsdp: - get_full_params(model) + return get_full_params(model) return list(model.parameters()) diff --git a/test/distributed/fsdp/test_fsdp_optim_state.py b/test/distributed/fsdp/test_fsdp_optim_state.py new file mode 100644 index 000000000000..9a51405cfaeb --- /dev/null +++ b/test/distributed/fsdp/test_fsdp_optim_state.py @@ -0,0 +1,774 @@ +# Owner(s): ["oncall: distributed"] + +import bisect +import sys +from enum import Enum, auto +from typing import Any, Dict, List, Tuple, Type + +import torch +from torch import distributed as dist +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp.fully_sharded_data_parallel import ( + OptimStateKeyType, +) +from torch.testing._internal.common_distributed import skip_if_lt_x_gpu +from torch.testing._internal.common_fsdp import FSDPTest +from torch.testing._internal.common_utils import ( + TEST_WITH_DEV_DBG_ASAN, + instantiate_parametrized_tests, + parametrize, + run_tests, +) + +if not dist.is_available(): + print("Distributed not available, skipping tests", file=sys.stderr) + sys.exit(0) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + +class _OSDCommMethod(Enum): + """Method for communicating the optimizer state dict for internal tests.""" + BROADCAST_OBJECT_LIST = auto() + SCATTER_FULL_OSD = auto() + + +class Bias(torch.nn.Module): + """This module applies a 1D additive bias with dimension ``dim``.""" + def __init__(self, dim: int) -> None: + super().__init__() + assert dim > 0 + torch.manual_seed(0) + self.bias = torch.nn.Parameter(torch.randn((dim,))) + + def forward(self, x): + return x + self.bias + + +class BlockA(torch.nn.Module): + """ + Used to define interesting nested structure for FSDP wrapping. 
+ BlockA + Bias0 + bias + weight + Bias1 + bias + """ + def __init__(self, in_dim: int, out_dim: int) -> None: + super().__init__() + assert all(v > 0 for v in (in_dim, out_dim)) + torch.manual_seed(0) + self.bias_module0 = Bias(out_dim) + self.weight = torch.nn.Parameter(torch.randn((in_dim, out_dim))) + self.bias_module1 = Bias(out_dim) + self.relu = torch.nn.ReLU() + + def forward(self, x): + x = x @ self.weight + x = self.bias_module0(x) + x = self.relu(x) # ensure biases have different gradients + x = self.bias_module1(x) + return x + +class BlockB(torch.nn.Module): + """ + Used to define interesting nested structure for FSDP wrapping. + BlockB + weight + Bias + bias + Bias + bias + """ + def __init__(self, in_dim: int, out_dim: int) -> None: + super().__init__() + assert all(v > 0 for v in (in_dim, out_dim)) + torch.manual_seed(0) + self.weight = torch.nn.Parameter(torch.randn((in_dim, out_dim))) + self.bias_module0 = Bias(out_dim) + self.bias_module1 = Bias(out_dim) + self.relu = torch.nn.ReLU() + + def forward(self, x): + x = x @ self.weight + x = self.bias_module0(x) + x = self.relu(x) # ensure biases have different gradients + x = self.bias_module1(x) + return x + + +class NestedModel(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.block0 = BlockB(5, 7) + self.block1 = BlockB(7, 7) + self.bias = torch.nn.Parameter(torch.randn((5,))) + self.block2 = torch.nn.Sequential( + BlockA(7, 9), + BlockA(9, 9), + BlockB(9, 5), + ) + self.relu = torch.nn.ReLU() + + def forward(self, x) -> torch.Tensor: + x = self.relu(self.block0(x)) + x = self.relu(self.block1(x)) + x = self.relu(self.block2(x)) + x = x + self.bias + return x + + def get_input(self, device): + BATCH_SIZE = 8 + return (torch.randn((BATCH_SIZE, 5)).to(device),) + + def get_loss(self, inp, output): + return output.sum() + + def run_backward(self, loss): + loss.backward() + + @staticmethod + def wrap(model, group=None) -> torch.nn.Module: + # Flatten Bias0; then flatten weight and Bias1 together into `block1` + model.block1.bias_module0 = FSDP( + model.block1.bias_module0, process_group=group, + ) + model.block1 = FSDP(model.block1, process_group=group) + # Flatten Bias0; flatten Bias1; then flatten weight into `block2[1]` + model.block2[1].bias_module0 = FSDP( + model.block2[1].bias_module0, process_group=group, + ) + model.block2[1].bias_module1 = FSDP( + model.block2[1].bias_module1, process_group=group, + ) + model.block2[1] = FSDP(model.block2[1], process_group=group) + # Flatten weight, Bias, bias into `block2[2]` + model.block2[2] = FSDP(model.block2[2], process_group=group) + return model + + @staticmethod + def wrap_alt(model, group=None) -> torch.nn.Module: + model.block0.bias_module0 = FSDP( + model.block0.bias_module0, process_group=group, + ) + model.block0 = FSDP(model.block0, process_group=group) + return model + + @staticmethod + def wrap_with_unmanaged_params( + model, + add_to_fsdp_module: bool, + group=None, + ) -> Tuple[torch.nn.Module, List[torch.nn.Parameter]]: + """Registers unmanaged parameters before wrapping with :meth:`wrap`.""" + device = next(model.parameters()).device + unmanaged_param = torch.nn.Parameter(torch.randn(5, 5, device=device)) + # Either register the parameter to a module to be wrapped with FSDP + # (`model.block2[2]`) or a module not to be wrapped with FSDP (`model`) + register_module = model.block2[2] if add_to_fsdp_module else model + register_module.register_parameter( + "unmanaged_param", unmanaged_param, + ) + # For simplicity, we only add a single 
unmanaged parameter, but should + # be easy to generalize if needed + return NestedModel.wrap(model, group), [unmanaged_param] + + @staticmethod + def add_unmanaged_param_entry(osd, unmanaged_param, step) -> None: + """Adds an entry for the unmanaged parameter ``unmanaged_param`` + assuming Adam optimizer and a single parameter group.""" + # The unmanaged parameters should be passed to this method in + # `model.parameters()` order since their parameter IDs will be assigned + # in order of the skipped IDs + # Assign a parameter ID to the unmanaged parameter + unmanaged_param_id = -1 + param_ids = osd["param_groups"][0]["params"] + for i in range(1, len(param_ids)): + diff = param_ids[i] - param_ids[i - 1] + if diff != 1: + assert diff > 1, f"Invalid IDs: {param_ids[i - 1]} {param_ids[i]}" + unmanaged_param_id = param_ids[i - 1] + 1 + break + if unmanaged_param_id == -1: + unmanaged_param_id = len(param_ids) # last ID skipped + assert unmanaged_param_id >= 0, "One parameter ID should be skipped" + # Add a state entry for the unmanaged parameter + state_device = next(iter(next(iter(osd["state"].values())).values())).device + osd["state"][unmanaged_param_id] = { + "step": torch.tensor(float(step), device=state_device), + "exp_avg": torch.randn(unmanaged_param.shape, device=state_device), + "exp_avg_sq": torch.randn(unmanaged_param.shape, device=state_device), + } + # Insert the ID into the parameter group in order + bisect.insort(osd["param_groups"][0]["params"], unmanaged_param_id) + + # NOTE: We exclude `self.bias` from either parameter group to test the + # case where the optimizer input does not include all model parameters + def param_group0(self) -> List[torch.nn.Parameter]: + # Use `block1`'s parameters for the first parameter group to deviate + # from the `model.parameters()` order + return list(self.block1.parameters()) + + def param_group1(self) -> List[torch.nn.Parameter]: + # Deviate from the `model.parameters()` order further by rearranging + # `block2`'s parameters to be before `block0`'s parameters + return list(self.block2.parameters()) + \ + list(self.block0.parameters()) + + +class TestFSDPOptimState(FSDPTest): + def _init_nested_model( + self, + wrap: bool, + wrap_alt: bool = False, # ignored if `wrap=False` + device: torch.device = torch.device("cuda"), + group=None, + optim_class: Type[torch.optim.Optimizer] = torch.optim.Adam, + use_multiple_param_groups: bool = False, + ): + model = NestedModel().to(device) + if wrap: + model = NestedModel.wrap_alt(model, group) if wrap_alt \ + else NestedModel.wrap(model, group) + if not use_multiple_param_groups: + optim_input = list(model.parameters()) + else: + optim_input = [ + {"params": model.param_group0()}, + {"params": model.param_group1(), "weight_decay": 0.9} + ] + optim = optim_class(optim_input, lr=0.01) + return model, optim, optim_input + + def _init_transformer_model( + self, + wrap: bool, + device: torch.device = torch.device("cuda"), + group=None, + optim_class: Type[torch.optim.Optimizer] = torch.optim.Adam, + use_multiple_param_groups: bool = False, + ): + assert not use_multiple_param_groups, \ + "Multiple parameter groups for the transformer is not implemented" + if group is None: + group = dist.distributed_c10d._get_default_group() + model = self._get_wrapped_model(group=group).to(device) if wrap \ + else self._get_nonwrapped_model(group=group).to(device) + model.eval() # disable dropout for determinism + optim = optim_class(model.parameters(), lr=0.01) + return model, optim, None + + def _step_model( + self, + 
model: torch.nn.Module, + optim: torch.optim.Optimizer, + device: torch.device = torch.device("cuda"), + num_iters: int = 1, + ) -> List[float]: + """Performs a forward pass, backward pass, and optimizer step + ``num_iters``-many times, and returns the per-iteration losses.""" + torch.manual_seed(0) # set seed for determinism + losses = [] + module = model.module if hasattr(model, "module") else model + for _ in range(num_iters): + inp = module.get_input(device) + output = model(*inp) + loss = module.get_loss(inp, output).to(device) + losses.append(loss.item()) + module.run_backward(loss) + optim.step() + return losses + + def _broadcast_full_osd(self, full_osd: Dict[str, Any], group=None): + """Broadcasts the full optimizer state dict in place of using + ``torch.save()`` and ``torch.load()`` so that all ranks can have it.""" + obj_list = [full_osd] + dist.broadcast_object_list( + obj_list, src=0, group=group, + ) + full_osd = obj_list[0] + return full_osd + + def _are_equal_states( + self, + state1: Dict[str, Any], + state2: Dict[str, Any], + ) -> bool: + """Checks if ``state1`` and ``state2`` contain the same mappings.""" + if set(state1.keys()) != set(state2.keys()): + return False + for state_name, value1 in state1.items(): + value2 = state2[state_name] + if type(value1) != type(value2): + return False + if torch.is_tensor(value1): # tensor state + assert torch.is_tensor(value2) + # Check the values on CPU to be device-agnostic + value1 = value1.cpu() + value2 = value2.cpu() + if value1.shape != value2.shape or \ + not torch.all(torch.isclose(value1, value2)): + return False + else: # non-tensor state + if value1 != value2: + return False + return True + + def _check_same_state( + self, + full_osd, + ref_osd, + check_same_param_keys: bool, + ): + """Checks that ``full_osd`` and ``ref_osd`` have the same "state" part. + If ``check_same_param_keys=True``, then checks that the parameter keys + match (e.g. when both should be parameter names), and does not check + the parameter keys otherwise.""" + assert "state" in ref_osd + self.assertTrue("state" in full_osd) + ref_osd_state = ref_osd["state"] + full_osd_state = full_osd["state"] + if check_same_param_keys: + # Check parameter keys are the same + ref_osd_param_ids = set(ref_osd_state.keys()) + full_osd_param_ids = set(full_osd_state.keys()) + self.assertTrue(ref_osd_param_ids == full_osd_param_ids) + for param_id, param_state in full_osd_state.items(): + for state_name, value in param_state.items(): + ref_value = ref_osd_state[param_id][state_name] + self.assertEqual(value, ref_value) + return + # Otherwise, only require the parameter keys to be isomorphic (e.g. + # between IDs and names) + ref_osd_states = list(ref_osd["state"].values()) + full_osd_states = list(full_osd["state"].values()) + assert len(ref_osd_states) == len(full_osd_states) + # Use brute-force quadratic-time comparison since it is hard to + # hash a tensor by value instead of by object + for full_osd_state in full_osd_states: + # Check for at least one match (may be > 1 in toy edge cases, e.g. + # multiple biases); nonetheless, each having >= 1 match and the two + # lists having equal length imply that the list contents are equal + self.assertTrue(any( + self._are_equal_states(full_osd_state, ref_osd_state) + for ref_osd_state in ref_osd_states + )) + + def _check_same_param_groups( + self, + full_osd, + ref_osd, + check_same_param_keys: bool, + ): + """Checks that ``full_osd`` and ``ref_osd`` have the same + "param_groups" part. 
If ``check_same_param_keys=True`, then checks that + the parameter keys match (e.g. when both should be parameter names), + and does not check the parameter keys otherwise.""" + assert "param_groups" in ref_osd + self.assertTrue("param_groups" in full_osd) + ref_osd_param_groups = ref_osd["param_groups"] + full_osd_param_groups = full_osd["param_groups"] + self.assertTrue(len(full_osd_param_groups), len(ref_osd_param_groups)) + if self.rank == 0: + for full_osd_pg, ref_osd_pg in zip( + full_osd_param_groups, ref_osd_param_groups, + ): + self.assertEqual( + set(full_osd_pg.keys()), set(ref_osd_pg.keys()), + ) + for name, full_osd_value in full_osd_pg.items(): + if name == "params" and not check_same_param_keys: + continue + self.assertEqual(full_osd_value, ref_osd_pg[name]) + + def _check_state_device(self, osd: Dict[str, Any], on_gpu: bool): + """Checks that all tensors in ``osd["state"]`` are on GPU if + ``on_gpu=True`` and on CPU if ``on_gpu=False``.""" + for param_state in osd["state"].values(): + for value in param_state.values(): + if torch.is_tensor(value): + if on_gpu: + self.assertTrue(value.is_cuda) + else: + self.assertFalse(value.is_cuda) + + @skip_if_lt_x_gpu(2) + @parametrize("use_multiple_param_groups", [False, True]) + @parametrize("rank0_only", [False, True]) + def test_full_optim_state_dict_nested( + self, + use_multiple_param_groups: bool, + rank0_only: bool, + ) -> None: + """ + Tests :meth:`full_optim_state_dict` by comparing the returned dict for + an FSDP-wrapped model with that of an equivalent non-wrapped model. + + The parameter groups in the "param_groups" part and the values in the + "state" part should be the same, but the parameter keys may be + different (e.g. the full optimizer state dict uses parameter names + while the non-wrapped equivalent uses parameter IDs). 
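+        Losses are also compared across iterations to rule out model drift as a
+        source of error. A minimal sketch of the call under test, mirroring the
+        test body below (``fsdp_model`` and ``optim`` are placeholder names)::
+
+            full_osd = FSDP.full_optim_state_dict(fsdp_model, optim, optim_input)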
+ """ + NUM_ITERS = 3 + model1, optim1, optim_input = self._init_nested_model( + wrap=True, use_multiple_param_groups=use_multiple_param_groups, + ) + losses1 = self._step_model(model1, optim1, num_iters=NUM_ITERS) + full_osd = FSDP.full_optim_state_dict( + model1, optim1, optim_input, rank0_only=rank0_only, + ) + # Non-target ranks get an empty state dict + if rank0_only and self.rank != 0: + self.assertEqual(len(full_osd), 0) + return + model2, optim2, _ = self._init_nested_model( + wrap=False, use_multiple_param_groups=use_multiple_param_groups, + ) + losses2 = self._step_model(model2, optim2, num_iters=NUM_ITERS) + ref_osd = optim2.state_dict() + # Check the losses to eliminate model drift as a source of error + for i, (l1, l2) in enumerate(zip(losses1, losses2)): + assert l1 == l2, f"Losses differ on iter {i}: {l1:.5f} {l2:.5f}" + # Do not check the parameter keys since the full optimizer state dict + # uses parameter names, while the non-wrapped equivalent uses parameter + # IDs + check_same_param_keys = False + self._check_same_param_groups( + full_osd, ref_osd, check_same_param_keys=check_same_param_keys, + ) + self._check_same_state( + full_osd, ref_osd, check_same_param_keys=check_same_param_keys, + ) + + # Require 4 GPUs since we test halving the world size + @skip_if_lt_x_gpu(4) + @parametrize("use_multiple_param_groups", [False, True]) + @parametrize("wrap_alt", [False, True]) + @parametrize("halve_world_size", [False, True]) + def test_shard_full_optim_state_dict_nested( + self, + use_multiple_param_groups: bool, + wrap_alt: bool, + halve_world_size: bool, + ): + """Tests :meth:`shard_full_optim_state_dict` for a non-FSDP-root model + with nested FSDP instances.""" + self._test_shard_full_optim_state( + model_class="nested", + use_multiple_param_groups=use_multiple_param_groups, + halve_world_size=halve_world_size, + osd_comm_method=_OSDCommMethod.BROADCAST_OBJECT_LIST, + wrap_alt=wrap_alt, + ) + + # Require 4 GPUs since we test halving the world size + @skip_if_lt_x_gpu(4) + def test_shard_full_optim_state_dict_transformer(self) -> None: + """Tests :meth:`shard_full_optim_state_dict` for an FSDP-root + transformer model with shared parameters.""" + self._test_shard_full_optim_state( + model_class="transformer", use_multiple_param_groups=False, + halve_world_size=True, + osd_comm_method=_OSDCommMethod.BROADCAST_OBJECT_LIST, + ) + + # Require 4 GPUs since we test halving the world size + @skip_if_lt_x_gpu(4) + @parametrize("use_multiple_param_groups", [False, True]) + @parametrize("wrap_alt", [False, True]) + @parametrize("halve_world_size", [False, True]) + def test_scatter_full_optim_state_dict_nested( + self, + use_multiple_param_groups: bool, + wrap_alt: bool, + halve_world_size: bool, + ): + """Tests :meth:`scatter_full_optim_state_dict` for a non-FSDP-root + model with nested FSDP instances.""" + self._test_shard_full_optim_state( + model_class="nested", + use_multiple_param_groups=use_multiple_param_groups, + halve_world_size=halve_world_size, + osd_comm_method=_OSDCommMethod.SCATTER_FULL_OSD, + wrap_alt=wrap_alt, + ) + + # Require 4 GPUs since we test halving the world size + @skip_if_lt_x_gpu(4) + def test_scatter_full_optim_state_dict_transformer(self) -> None: + """Tests :meth:`scatter_full_optim_state_dict` for an FSDP-root + transformer model with shared parameters.""" + self._test_shard_full_optim_state( + model_class="transformer", use_multiple_param_groups=False, + halve_world_size=True, + osd_comm_method=_OSDCommMethod.SCATTER_FULL_OSD, + ) + + def 
_test_shard_full_optim_state( + self, + model_class: str, + use_multiple_param_groups: bool, + halve_world_size: bool, + osd_comm_method: _OSDCommMethod, + **new_model_kwargs, + ): + """ + (1) Runs a model with full world size for K iterations to generate a + full optimizer state dict; + (2) initializes a model with halved world size and possibly different + FSDP wrapping scheme (based on ``new_model_kwargs``); + (3) shards the full optimizer state dict from (1) according to the + halved-world-size model; + (4) runs the halved-world-size model for K iterations; and + (5) checks that the sharded optimizer state dict from (3) matches the + halved-world-size model's local optimizer state dict, meaning that the + former could have equivalently been loaded into the local optimizer. + """ + NUM_ITERS = 3 + initializer = self._init_nested_model if model_class == "nested" \ + else self._init_transformer_model if model_class == "transformer" \ + else None + assert initializer is not None, f"Unsupported model: {model_class}" + # First, run a wrapped model with full world size for a few iterations + model1, optim1, optim_input1 = initializer( + wrap=True, use_multiple_param_groups=use_multiple_param_groups, + ) + self._step_model(model1, optim1, num_iters=NUM_ITERS) + full_osd1 = FSDP.full_optim_state_dict(model1, optim1, optim_input1) + if halve_world_size: + # Create a new process group with halved world size + new_group_ranks = [r for r in range(self.world_size) if r % 2 == 0] + new_group = dist.new_group(ranks=new_group_ranks) + if self.rank not in new_group_ranks: + return + else: + # Continue using the same group and hence world size + new_group = dist.distributed_c10d._get_default_group() + # Second, run a wrapped model with (possibly) halved world size + model2, optim2, optim_input2 = initializer( + wrap=True, group=new_group, + use_multiple_param_groups=use_multiple_param_groups, + **new_model_kwargs, # specify `wrap_alt` to change wrapping + ) + self._step_model(model2, optim2, num_iters=NUM_ITERS) + full_osd2 = FSDP.full_optim_state_dict(model2, optim2, optim_input2) + # Compute two sharded optim state dicts: (1) for the first model + # according to the second model and (2) for the second model according + # to the second model + if osd_comm_method == _OSDCommMethod.BROADCAST_OBJECT_LIST: + full_osd1 = self._broadcast_full_osd(full_osd1, group=new_group) + sharded_osd1 = FSDP.shard_full_optim_state_dict( + full_osd1, model2, optim_input2, + ) + full_osd2 = self._broadcast_full_osd(full_osd2, group=new_group) + sharded_osd2 = FSDP.shard_full_optim_state_dict( + full_osd2, model2, optim_input2, + ) + elif osd_comm_method == _OSDCommMethod.SCATTER_FULL_OSD: + sharded_osd1 = FSDP.scatter_full_optim_state_dict( + full_osd1 if self.rank == 0 else None, model2, optim_input2, + group=new_group, + ) + sharded_osd2 = FSDP.scatter_full_optim_state_dict( + full_osd2 if self.rank == 0 else None, model2, optim_input2, + group=new_group, + ) + self._check_state_device(sharded_osd1, on_gpu=True) + self._check_state_device(sharded_osd2, on_gpu=True) + # As a sanity check, check that sharding the second model's full + # optimizer state dict according to itself is equivalent to its local + # optimizer's state dict + local_osd2 = optim2.state_dict() + check_same_param_keys = True # should all have matching parameter IDs + self._check_same_param_groups( + sharded_osd2, local_osd2, + check_same_param_keys=check_same_param_keys, + ) + self._check_same_state( + sharded_osd2, local_osd2, + 
check_same_param_keys=check_same_param_keys, + ) + # Check that sharding the first model's full optimizer state dict + # according to the second model is equivalent to the second model's + # local optimizer state dict + self._check_same_param_groups( + sharded_osd1, local_osd2, + check_same_param_keys=check_same_param_keys, + ) + self._check_same_state( + sharded_osd1, local_osd2, + check_same_param_keys=check_same_param_keys, + ) + # As a sanity check, check that we can load and run a few iterations + optim2.load_state_dict(sharded_osd1) + self._step_model(model2, optim2, num_iters=NUM_ITERS) + + @skip_if_lt_x_gpu(2) + @parametrize("add_to_fsdp_module", [False, True]) + def test_shard_full_optim_state_dict_unmanaged_params( + self, + add_to_fsdp_module: bool, + ): + """ + Tests :meth:`shard_full_optim_state_dict` when there are unmanaged + parameters. + - If ``add_to_fsdp_module=True``, then the unmanaged parameters are + added to a module to be wrapped with FSDP, in which case there should + be an error since we require that all unflattened parameter + comprising a flattened parameter have the same scalar state (e.g. + Adam "step") but the added parameter is missing its entry. + - If ``add_to_fsdp_module=False``, then the unmanaged parameters are + added to a module not to be wrapped with FSDP, in which case there + should be no error (emulating model parallel use cases where some + parameters may be managed externally to FSDP). + We do not separately test unmanaged parameters for + :meth:`scatter_full_optim_state_dict` to save CI cost since it calls + into the same subroutine :meth:`_flatten_full_optim_state_dict`. + """ + NUM_ITERS = 1 + # Create a normal wrapped model + model, optim, optim_input = self._init_nested_model(wrap=True) + self._step_model(model, optim, num_iters=NUM_ITERS) + full_osd = FSDP.full_optim_state_dict( + model, optim, optim_input, rank0_only=False, + ) # save on all ranks to avoid having to broadcast from rank 0 + # Create a new model with the same structure but additional unmanaged + # parameters, representing the model for which we want to load + device = torch.device("cuda") + model = NestedModel().to(device) + model, unmanaged_params = NestedModel.wrap_with_unmanaged_params( + model, add_to_fsdp_module, + ) + optim_input = list(model.parameters()) + if add_to_fsdp_module: + # If we add the unmanaged parameters to a module wrapped with FSDP, + # then the flattened parameter will be comprised of some + # unflattened parameters with zero-dimensional tensor state (i.e. + # Adam "step") and others without (i.e. 
the unmanaged parameters), + # which triggers an error that we have to ensure correctness + error_prefix = "^(All unflattened parameters comprising a " \ + "single flattened parameter must have scalar state with the " \ + "same value and dtype)" + with self.assertRaisesRegex(ValueError, error_prefix): + FSDP.shard_full_optim_state_dict( + full_osd, model, optim_input, + ) + else: + # If we add the unmanaged parameters to a module not wrapped with + # FSDP, then we simply ignore them without erroring to enable + # model parallelism use cases, where some parameters are managed + # externally to FSDP + sharded_osd = FSDP.shard_full_optim_state_dict( + full_osd, model, optim_input, + ) + # Add entries for the unmanaged parameters to be able to load + for unmanaged_param in unmanaged_params: + NestedModel.add_unmanaged_param_entry( + sharded_osd, unmanaged_param, NUM_ITERS, + ) + # Check that we can load the optimizer state dict + optim = torch.optim.Adam(optim_input, lr=1e-3) + optim.load_state_dict(sharded_osd) + + @skip_if_lt_x_gpu(2) + @parametrize("use_multiple_param_groups", [False, True]) + def test_rekey_optim_state_dict_to_ids( + self, + use_multiple_param_groups: bool, + ): + """Tests :meth:`rekey_optim_state_dict` with the new keys being + parameter IDs by checking that a wrapped model (i.e. with FSDP modules) + can rekey its optimizer state dict to match that of an equivalent + non-wrapped model (i.e. without FSDP modules).""" + NUM_ITERS = 3 + # Run a wrapped model for a few iterations + model1, optim1, optim_input1 = self._init_nested_model( + wrap=True, use_multiple_param_groups=use_multiple_param_groups, + ) + self._step_model(model1, optim1, num_iters=NUM_ITERS) + full_osd = FSDP.full_optim_state_dict(model1, optim1, optim_input1) + # Broadcast instead of `torch.save()`/`torch.load()` so that all ranks + # have the full state dict + full_osd = self._broadcast_full_osd(full_osd) + # Run a non-wrapped model for a few iterations + model2, optim2, optim_input2 = self._init_nested_model( + wrap=False, use_multiple_param_groups=use_multiple_param_groups, + ) + self._step_model(model2, optim2, num_iters=NUM_ITERS) + # Re-key the wrapped model's optimizer state dict using parameter IDs + # according to the non-wrapped model + rekeyed_osd = FSDP.rekey_optim_state_dict( + full_osd, OptimStateKeyType.PARAM_ID, model2, optim_input2, + ) + # Check that the re-keyed dict and actual dict are the same + osd = optim2.state_dict() + check_same_param_keys = True + self._check_same_param_groups( + rekeyed_osd, osd, check_same_param_keys=check_same_param_keys, + ) + self._check_same_state( + rekeyed_osd, osd, check_same_param_keys=check_same_param_keys, + ) + # As a sanity check, check that we can load and run a few iterations + optim2.load_state_dict(rekeyed_osd) + self._step_model(model2, optim2, num_iters=NUM_ITERS) + + @skip_if_lt_x_gpu(2) + @parametrize("use_multiple_param_groups", [False]) + def test_rekey_optim_state_dict_to_names( + self, + use_multiple_param_groups: bool, + ): + """Tests :meth:`rekey_optim_state_dict` with the new keys being + parameter names by checking that a non-wrapped model (i.e. without FSDP + modules) can rekey its optimizer state dict to match the expected + output of :meth:`full_optim_state_dict`, hence be sharded using + :meth:`shard_full_optim_state_dict`, and finally match the per-rank + optimizer state dict of a wrapped model (i.e. 
with FSDP modules).""" + NUM_ITERS = 3 + # Run a wrapped model for a few iterations + model1, optim1, optim_input1 = self._init_nested_model( + wrap=True, use_multiple_param_groups=use_multiple_param_groups, + ) + self._step_model(model1, optim1, num_iters=NUM_ITERS) + # Run a non-wrapped model for a few iterations + model2, optim2, optim_input2 = self._init_nested_model( + wrap=False, use_multiple_param_groups=use_multiple_param_groups, + ) + self._step_model(model2, optim2, num_iters=NUM_ITERS) + # Re-key the non-wrapped model's optimizer state dict using parameter + # names (still according to itself) + osd2 = optim2.state_dict() + rekeyed_osd = FSDP.rekey_optim_state_dict( + osd2, OptimStateKeyType.PARAM_NAME, model2, optim_input2, + ) + # Shard the non-wrapped model's re-keyed optimizer state dict, which + # maps back to (flattened) parameter IDs + sharded_osd = FSDP.shard_full_optim_state_dict( + rekeyed_osd, model1, optim_input1, + ) + # Check that this sharded optimizer state dict matches the wrapped + # model's per-rank optimizer state dict + osd1 = optim1.state_dict() + check_same_param_keys = True + self._check_same_param_groups( + sharded_osd, osd1, check_same_param_keys=check_same_param_keys, + ) + self._check_same_state( + sharded_osd, osd1, check_same_param_keys=check_same_param_keys, + ) + # As a sanity check, check that we can load and run a few iterations + optim1.load_state_dict(sharded_osd) + self._step_model(model1, optim1, num_iters=NUM_ITERS) + + +instantiate_parametrized_tests(TestFSDPOptimState) + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/fsdp/test_fsdp_pure_fp16.py b/test/distributed/fsdp/test_fsdp_pure_fp16.py index 4d98fbfa8e2c..82648ea457a8 100644 --- a/test/distributed/fsdp/test_fsdp_pure_fp16.py +++ b/test/distributed/fsdp/test_fsdp_pure_fp16.py @@ -5,13 +5,13 @@ import torch from torch import distributed as dist from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, CPUOffload -from torch.nn import Linear, Module from torch.nn.parallel import DistributedDataParallel from torch.optim import SGD from torch.testing._internal.common_distributed import skip_if_lt_x_gpu from torch.testing._internal.common_fsdp import ( FSDPTest, get_full_params, + DeterministicModel, ) from torch.testing._internal.common_utils import ( instantiate_parametrized_tests, @@ -33,21 +33,6 @@ sys.exit(0) -class Model(Module): - def __init__(self, wrap_fsdp, cpu_offload=CPUOffload(offload_params=False)): - super().__init__() - # keep everything deterministic for model initialization - torch.manual_seed(0) - self.inner = Linear(2, 2).cuda() - if wrap_fsdp: - self.inner = FSDP(self.inner, cpu_offload=cpu_offload) - self.outer = Linear(2, 2).cuda() - - def forward(self, x): - y = self.inner(x) - return self.outer(y) - - # Test pure fp16 training, also testing the case when the parameter's data type is # changed after FSDP wrapping and before training loop starts. 
# Only run one step for comparision, as usually grad scaler is needed to avoid NaN value @@ -57,7 +42,7 @@ def _dist_train(self, wrap_fsdp, cpu_offload=CPUOffload(offload_params=False)): # keep everything deterministic for input data torch.manual_seed(0) - model = Model(wrap_fsdp, cpu_offload) + model = DeterministicModel(wrap_fsdp, cpu_offload) if wrap_fsdp: model = FSDP(model, cpu_offload=cpu_offload) else: @@ -74,7 +59,9 @@ def _dist_train(self, wrap_fsdp, cpu_offload=CPUOffload(offload_params=False)): optim.zero_grad() if wrap_fsdp: - get_full_params(model) + full_params = get_full_params(model) + torch.cuda.synchronize() + return full_params return list(model.parameters()) diff --git a/test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py b/test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py new file mode 100644 index 000000000000..44b8815a9a4b --- /dev/null +++ b/test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py @@ -0,0 +1,159 @@ +# Owner(s): ["oncall: distributed"] + +import functools +import itertools +import sys +import torch +import unittest + +from torch import distributed as dist +from torch.cuda.amp.common import amp_definitely_not_available +from torch.distributed.fsdp.fully_sharded_data_parallel import ShardingStrategy +from torch.distributed.fsdp import MixedPrecision, CPUOffload +from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler +from torch.testing._internal.common_fsdp import DummyProcessGroup, subtest_name, FSDPInitMode, NestedWrappedModule, FSDPTest +from torch.testing._internal.common_distributed import skip_if_lt_x_gpu +from torch.testing._internal.common_utils import ( + TestCase, run_tests, + instantiate_parametrized_tests, + parametrize, + TEST_WITH_DEV_DBG_ASAN, +) + + +if not dist.is_available(): + print("Distributed not available, skipping tests", file=sys.stderr) + sys.exit(0) + + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + +params = "cpu_offload,sharding_strategy,mixed_precision" +cpu_offload_config = [CPUOffload(offload_params=True), CPUOffload(offload_params=False)] +sharding_strategy_config = [ShardingStrategy.SHARD_GRAD_OP, None] +mixed_precision = ["enable_mixed_precision", None] + +configs = list(itertools.product(cpu_offload_config, + sharding_strategy_config, + mixed_precision)) +test_name_mapping = { + str(CPUOffload(offload_params=True)): "offload_true", + str(CPUOffload(offload_params=False)): "offload_false", + str(ShardingStrategy.SHARD_GRAD_OP): "shard_grad_op", + "enable_mixed_precision": "mixed_precision" +} + +subtest_name = functools.partial(subtest_name, test_name_mapping) + + +class TestShardGradScaler(TestCase): + @unittest.skipIf(amp_definitely_not_available(), "no supported device (cuda, xla) found") + def test_grad_scaling(self): + pg = DummyProcessGroup(0, 1) + scaler = ShardedGradScaler(init_scale=2.0, process_group=pg, enabled=True) + t0 = torch.full((1,), 4.0, dtype=torch.float32, device="cpu") + t1 = torch.full((1,), 8.0, dtype=torch.float32, device="cpu") + outputs = [t1.clone(), (t0.clone(), t1.clone()), [t0.clone(), t1.clone()]] + outputs = scaler.scale(outputs) + self.assertTrue(outputs[0] == 16.0 and outputs[1][0] == 8.0 and outputs[1][1] == 16.0) + self.assertTrue(outputs[2][0] == 8.0 and outputs[2][1] == 16.0) + self.assertTrue(scaler._scale.device == t1.device) + + @unittest.skipIf(amp_definitely_not_available(), "no supported device (cuda, xla) found") + def 
test_scaling_unscaling_sparse(self): + pg = DummyProcessGroup(0, 1) + scaler = ShardedGradScaler(init_scale=2.0, process_group=pg, enabled=True) + inv_scale = torch.full((1,), 0.5, dtype=torch.float, device="cpu") + found_inf = torch.full((1,), 0, dtype=torch.float, device="cpu") + + i = torch.tensor([[0, 1, 1], + [2, 0, 2]], device="cpu", dtype=torch.int64) + v = torch.tensor([16.0, 32.0, 64.0], dtype=torch.float, device="cpu") + s = torch.sparse_coo_tensor(i, v, torch.Size([2, 3]), device="cpu", dtype=torch.float) + + # unscale sparse tensors + s1 = s.clone() + s1.grad = s.clone() + opt = torch.optim.SGD([s1], lr=1.0) + found_inf.zero_() + found_inf = scaler._unscale_grads_(opt, inv_scale, found_inf)[s1.device] + self.assertEqual(found_inf, 0.0) + self.assertEqual(s1.grad.to_dense(), (s / 2).to_dense()) + + # unscale sparse tensor: inf + v = torch.tensor([16.0, 32.0, float('inf')], dtype=torch.float, device="cpu") + s1.grad = torch.sparse_coo_tensor(i, v, torch.Size([2, 3]), device="cpu", dtype=torch.float) + found_inf.zero_() + found_inf = scaler._unscale_grads_(opt, inv_scale, found_inf)[s1.device] + self.assertEqual(found_inf, 1.0) + + # unscale sparse tensor: overflow (marked as inf) + i = torch.tensor([[1, 1, 1], + [0, 0, 2]], device="cpu", dtype=torch.int64) + # coalescing sparse tensor here will cause the value to be Inf + v = torch.tensor([2**15, 2**15, 1.0], dtype=torch.float16, device="cpu") + s1 = torch.sparse_coo_tensor(i, v, torch.Size([2, 3]), device="cpu", dtype=torch.float16) + s1.grad = s1.clone() + found_inf.zero_() + found_inf = scaler._unscale_grads_(opt, inv_scale, found_inf)[s1.device] + self.assertEqual(found_inf, 1.0) + + @unittest.skipIf(amp_definitely_not_available(), "no supported device (cuda, xla) found") + def test_inf_gradients_skip_optim_step(self): + pg = DummyProcessGroup(0, 1) + scaler = ShardedGradScaler(init_scale=2.0, process_group=pg, enabled=True) + loss = torch.full((1,), 4.0, dtype=torch.float32, device="cpu") + t0 = torch.tensor([float('inf')], dtype=torch.float32, device="cpu") + t0.grad = t0.clone() + opt = torch.optim.SGD([t0], lr=1.0) + scaler.scale(loss) + ret_val = scaler.step(opt) + self.assertTrue(ret_val is None) + + +class TestShardedGradScalerParityWithDDP(FSDPTest): + def _get_init_modes_for_test(self, cpu_offload): + modes = [ + FSDPInitMode.CUDA_AFTER, + FSDPInitMode.CUDA_BEFORE + ] + # Note that FSDPInitMode.CUDA_NEVER works currently only with CPU + # offload as we explicitly bring the param back to CUDA device. In + # general, it will not work since we try to all_gather p.data which is + # on CPU but NCCL only supports GPU. 
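+        # Hence CUDA_NEVER is only added to the list when parameters are
+        # CPU-offloaded.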
+ if cpu_offload.offload_params: + modes.append(FSDPInitMode.CUDA_NEVER) + + return modes + + @skip_if_lt_x_gpu(2) + @parametrize(params, configs, subtest_name) + def test_scaler_enabled(self, cpu_offload, sharding_strategy, mixed_precision): + init_modes = self._get_init_modes_for_test(cpu_offload) + mp = MixedPrecision( + param_dtype=torch.float16, + reduce_dtype=torch.float16, + buffer_dtype=torch.float16, + ) if mixed_precision else None + for fsdp_init_mode in init_modes: + self._test_identical_outputs( + NestedWrappedModule, + fsdp_init_mode=fsdp_init_mode, + cpu_offload=cpu_offload, + sharding_strategy=sharding_strategy, + mixed_precision=mp, + enable_sharded_grad_scaler=True, + ) + + +instantiate_parametrized_tests(TestShardGradScaler) +instantiate_parametrized_tests(TestShardedGradScalerParityWithDDP) + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/fsdp/test_fsdp_state_dict.py b/test/distributed/fsdp/test_fsdp_state_dict.py new file mode 100644 index 000000000000..6d8b9959efb5 --- /dev/null +++ b/test/distributed/fsdp/test_fsdp_state_dict.py @@ -0,0 +1,734 @@ +# Owner(s): ["oncall: distributed"] + +import sys +from contextlib import suppress +from copy import deepcopy +from functools import partial +from typing import Any, Dict + +import torch +import torch.nn as nn +from torch.nn import TransformerEncoderLayer, TransformerDecoderLayer +from torch import distributed as dist +from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import checkpoint_wrapper +from torch.distributed.fsdp import ( + FullyShardedDataParallel as FSDP, + StateDictType, + FullStateDictConfig, + LocalStateDictConfig, + CPUOffload, + MixedPrecision, +) +from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel +from torch.distributed.fsdp.shard_utils import _gather_state_dict +from torch.distributed.fsdp.wrap import enable_wrap, wrap, transformer_auto_wrap_policy +from torch.nn import Linear, Module +from torch.nn.parallel import DistributedDataParallel +from torch.optim import SGD +from torch.testing._internal.common_distributed import skip_if_lt_x_gpu +from torch.testing._internal.common_fsdp import ( + FSDPTest, + get_full_params, + _get_full_detached_param, + _get_state_dict, + SkipModel, + _zero_model, + TransformerWithSharedParams, + _validate, +) +from torch.testing._internal.common_utils import ( + instantiate_parametrized_tests, + parametrize, + run_tests, + TEST_WITH_DEV_DBG_ASAN, +) + + +if not dist.is_available(): + print("Distributed not available, skipping tests", file=sys.stderr) + sys.exit(0) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + +INNER_SHAPE = [4, 4] +OUTER_SHAPE = [4, 5] +BUFFER_SHAPE = [5, 5] + +NON_ROOT_FSDP_PREFIX = 'non_fsdp_lin' + +_UNFLATTENED_STATE_DICT_IMPLS = ["state_dict", "sharded_state_dict"] +_FLATTENED_STATE_DICT_IMPLS = ["local_state_dict"] +_SUPPORTED_STATE_DICT_IMPLS = ( + _UNFLATTENED_STATE_DICT_IMPLS + _FLATTENED_STATE_DICT_IMPLS +) + +STATE_DICT_MAPPING = { + "state_dict": StateDictType.FULL_STATE_DICT, + "local_state_dict": StateDictType.LOCAL_STATE_DICT, + "sharded_state_dict": StateDictType.SHARDED_STATE_DICT, +} + + +class Model(Module): + def __init__(self, wrap_fsdp, register_buffers=False): + super().__init__() + self.inner = Linear(*INNER_SHAPE) + if register_buffers: + self.inner.register_buffer("buffer", torch.randn(BUFFER_SHAPE)) + if wrap_fsdp: + self.inner = FSDP(self.inner) + 
self.outer = Linear(*OUTER_SHAPE) + if register_buffers: + self.outer.register_buffer("buffer", torch.randn(BUFFER_SHAPE)) + + def forward(self, x): + # Forward twice. + i = self.inner(x) + j = self.inner(x) + return self.outer(i + j) + + +class TestFSDPStateDict(FSDPTest): + @property + def world_size(self): + return 2 + + def _broadcast_state_dict(self, state_dict): + olist = [state_dict if self.rank == 0 else None] + dist.broadcast_object_list(olist) + return olist[0] + + def _compare_models(self, model, model_new, assert_fn, check_fp16=False): + with FullyShardedDataParallel.summon_full_params(model): + with FullyShardedDataParallel.summon_full_params(model_new): + params = list(model.parameters()) + params_new = list(model_new.parameters()) + assert_fn(params, params_new) + if check_fp16: + for tensor in model_new.parameters(): + self.assertEqual(tensor.dtype, torch.float16) + + def _get_simple_nested_model(self, *fsdp_args, wrap=True, checkpoint_wrap=False, **fsdp_kwargs): + if wrap: + lin1 = nn.Linear(10, 10, bias=False).cuda() + lin2 = nn.Linear(10, 10, bias=False).cuda() + if checkpoint_wrap: + lin1 = checkpoint_wrapper(lin1) + lin2 = checkpoint_wrapper(lin2) + seq = nn.Sequential(FSDP(lin1, *fsdp_args, **fsdp_kwargs), lin2) + if checkpoint_wrap: + seq = checkpoint_wrapper(seq) + model = FSDP(seq, *fsdp_args, **fsdp_kwargs) + else: + model = nn.Sequential( + nn.Linear(10, 10, bias=False).cuda(), nn.Linear(10, 10, bias=False).cuda() + ) + return model + + def _get_simple_model(self, *fsdp_args, checkpoint_wrap=False, **fsdp_kwargs): + lin = nn.Linear(10, 10, bias=False).cuda() + if checkpoint_wrap: + lin = checkpoint_wrapper(lin) + model = FSDP(lin, *fsdp_args, **fsdp_kwargs) + return model + + def _get_non_fsdp_root_module(self, *fsdp_args, wrap=True, **fsdp_kwargs): + class FSDPContainer(nn.Module): + def __init__(self, fsdp_1, fsdp_2): + super().__init__() + self.non_fsdp_lin = nn.Linear(10, 10, bias=False).cuda() + self.fsdp_1 = fsdp_1 + self.fsdp_2 = fsdp_2 + + def forward(self, x): + x = self.non_fsdp_lin(x) + x = self.fsdp_1(x) + x = self.fsdp_2(x) + return x + + return FSDPContainer( + self._get_simple_nested_model(*fsdp_args, wrap=wrap, **fsdp_kwargs), + self._get_simple_nested_model(*fsdp_args, wrap=wrap, **fsdp_kwargs), + ) + + def _get_state_dict_mgr(self, model, state_dict_type, state_dict_rank0_and_offload): + _state_dict_type = STATE_DICT_MAPPING[state_dict_type] + if state_dict_type == "state_dict": + config = FullStateDictConfig( + rank0_only=state_dict_rank0_and_offload, + offload_to_cpu=state_dict_rank0_and_offload, + ) + else: + config = None + return FSDP.state_dict_type(model, _state_dict_type, config) + + def _validate_state_dict_contents( + self, model, fsdp_state_dict, state_dict_rank0_and_offload, ignore_keys=None + ): + if state_dict_rank0_and_offload: + if self.rank == 0: + self.assertNotEqual(fsdp_state_dict, {}) + for key, tensor in fsdp_state_dict.items(): + if ignore_keys and key in ignore_keys: + continue + self.assertEqual( + tensor.device, + torch.device("cpu"), + f"{key} is unexpectedly on device {tensor.device}", + ) + else: + # For non-FSDP roots, the non FSDP portion can still have parameters on rank 0, + # so bypass the check for now. + if isinstance(model, FSDP): + self.assertEqual(fsdp_state_dict, {}) + + @skip_if_lt_x_gpu(2) + def test_load_activation_checkpointed_module(self): + # TODO: move this tests to checkpoint_wrapper tests once there is a dedicated + # test suite for them: https://github.com/pytorch/pytorch/issues/77478. 
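+        # Round-trip a state_dict between a checkpoint-wrapped linear and a
+        # plain one in both directions; this only works if checkpoint_wrapper's
+        # state_dict pre/post hooks handle its wrapper prefix correctly.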
+ lin = nn.Linear(10, 10, bias=False).cuda() + lin = checkpoint_wrapper(lin) + state_dict = deepcopy(lin.state_dict()) + # Load into non-checkpoint wrapped linear module + lin_new = nn.Linear(10, 10, bias=False).cuda() + lin_new.load_state_dict(state_dict) + for p1, p2 in zip(lin.parameters(), lin_new.parameters()): + self.assertEqual(p1, p2) + + # Load non-checkpoint wrapped module into checkpoint wrapped one + # Make params different + for p in lin_new.parameters(): + with torch.no_grad(): + p.add_(0.5) + + state_dict = deepcopy(lin_new.state_dict()) + # Verify checkpoint wrapped linear can load unwrapped linear + lin.load_state_dict(state_dict) + print(type(lin)) + for p1, p2 in zip(lin.parameters(), lin_new.parameters()): + self.assertEqual(p1, p2) + + @skip_if_lt_x_gpu(2) + @parametrize("checkpoint_wrap", ["first", "second", "both"]) + def test_fsdp_state_dict_with_activation_checkpoint(self, checkpoint_wrap): + for model_call in [ + partial(self._get_simple_model), + partial(self._get_simple_nested_model) + ]: + model = model_call(checkpoint_wrap=(checkpoint_wrap in ["first", "both"])) + state_dict = _get_state_dict(model, False, False) + # Possibly wrap new model in activation checkpoint wrapper to test save/ + # load with this wrapper + model_new = model_call(checkpoint_wrap=(checkpoint_wrap in ["second", "both"])) + _zero_model(model_new) + self._compare_models(model, model_new, self.assertNotEqual) + # Would fail if checkpoint_wrapper did not correctly implement state_dict pre/post hooks + model_new.load_state_dict(state_dict) + self._compare_models(model, model_new, self.assertEqual) + + @skip_if_lt_x_gpu(2) + def test_state_dict_rank0_offload_save_load_flow(self): + # Test taking checkpoint on rank 0 only, and reload + # without redundant CPU memories. + model = TransformerWithSharedParams(group=dist.distributed_c10d._get_default_group()) + my_auto_wrap_policy = partial( + transformer_auto_wrap_policy, + transformer_layer_cls={TransformerEncoderLayer, TransformerDecoderLayer} + ) + model = FSDP(model, auto_wrap_policy=my_auto_wrap_policy) + ctx = self._get_state_dict_mgr( + model, "state_dict", True + ) + with ctx: + state_dict = deepcopy(_get_state_dict(model)) + + # All ranks initialize non-FSDP model + grp = dist.distributed_c10d._get_default_group() + model_new = TransformerWithSharedParams(group=grp) + for p in model_new.parameters(): + with torch.no_grad(): + p.zero_() + # Only rank 0 loads the checkpoint + if self.rank == 0: + model_new.load_state_dict(state_dict) + + # TransformerWithSharedParams has a buffer of zeros, so can't pass in + # self.assertNotEqual since the buffers would be equal. So just checking that + # there is some difference in the model across ranks before state_dict is + # broadcasted. + with self.assertRaisesRegex(AssertionError, "Tensor-likes are not close"): + _validate(model_new, process_group=grp, assert_fn=self.assertEqual) + # FSDP with sync_module_states=True broadcasts the checkpointed states. 
+ model_new = FSDP( + model_new, + device_id=torch.cuda.current_device(), + auto_wrap_policy=my_auto_wrap_policy, + sync_module_states=True + ) + # After wrapping with FSDP models are equal across ranks, and have loaded the checkpoint + with FSDP.summon_full_params(model_new): + _validate(model_new, process_group=grp, assert_fn=self.assertEqual) + + with FullyShardedDataParallel.summon_full_params(model): + with FullyShardedDataParallel.summon_full_params(model_new): + params = list(model.parameters()) + params_new = list(model_new.parameters()) + self.assertEqual(params, params_new) + + @skip_if_lt_x_gpu(2) + @parametrize("state_dict_type", _SUPPORTED_STATE_DICT_IMPLS) + @parametrize( + "cpu_offload", + [CPUOffload(offload_params=True), CPUOffload(offload_params=False)], + ) + @parametrize("fp16", [True, False]) + @parametrize("state_dict_rank0_and_offload", [True, False]) + def test_basic_save_and_load_state_dict( + self, state_dict_type, cpu_offload, fp16, state_dict_rank0_and_offload + ): + """ + Tests that we can save a state_dict and load it into a blank model + with various configs such as fp16 and cpu offload and parameters + match as expected. + """ + if state_dict_rank0_and_offload and state_dict_type != "state_dict": + return + for model_call in [ + partial(self._get_non_fsdp_root_module, cpu_offload=cpu_offload), + partial(self._get_simple_nested_model, cpu_offload=cpu_offload), + partial(self._get_simple_model, cpu_offload=cpu_offload), + ]: + model = model_call() + + ctx = self._get_state_dict_mgr( + model, state_dict_type, state_dict_rank0_and_offload + ) + with ctx: + fsdp_state_dict = _get_state_dict( + model, cpu_offload.offload_params, fp16 + ) + + # if self.rank == 0: + # print(f"FSDP keys {fsdp_state_dict.keys()}") + + ignore_keys = [k for k in fsdp_state_dict.keys() if NON_ROOT_FSDP_PREFIX in k] + + self._validate_state_dict_contents( + model, fsdp_state_dict, state_dict_rank0_and_offload, ignore_keys=ignore_keys, + ) + if fp16: + # Verify fp16 is the type + for tensor in fsdp_state_dict.values(): + self.assertEqual(tensor.dtype, torch.float16) + + model_new = model_call() + if not cpu_offload.offload_params: + model_new = model_new.cuda() + if fp16: + model_new.half() + + # zero the model to ensure parameters are different. + _zero_model(model_new) + self._compare_models(model, model_new, self.assertNotEqual) + + # Verify parameters are the same in the new model. + if state_dict_rank0_and_offload: + # Broadcast the state dict and move it back to GPU in + # preparation for loading. + if not isinstance(model, FSDP): + # Move everything to CPU to avoid running into + # https://github.com/pytorch/pytorch/issues/77113, some params + # will still be on GPU for non FSDP root modules. 
+ for k in fsdp_state_dict.keys(): + fsdp_state_dict[k] = fsdp_state_dict[k].cpu() + fsdp_state_dict = self._broadcast_state_dict(fsdp_state_dict) + for key in fsdp_state_dict.keys(): + fsdp_state_dict[key] = fsdp_state_dict[key].cuda() + with FSDP.state_dict_type(model_new, STATE_DICT_MAPPING[state_dict_type]): + model_new.load_state_dict(fsdp_state_dict) + + self._compare_models(model, model_new, self.assertEqual, check_fp16=fp16) + + @skip_if_lt_x_gpu(2) + @parametrize("state_dict_type", _SUPPORTED_STATE_DICT_IMPLS) + @parametrize("mixed_precision", [True, False]) + @parametrize("state_dict_rank0_and_offload", [True, False]) + def test_save_and_load_after_forward_state_dict( + self, state_dict_type, mixed_precision, state_dict_rank0_and_offload + ): + """ + Test that saving after some training results in params being updated as + expected. + """ + if state_dict_rank0_and_offload and state_dict_type != "state_dict": + return + torch.cuda.set_device(self.rank) + mixed_precision = ( + MixedPrecision( + param_dtype=torch.float16, + reduce_dtype=torch.float16, + buffer_dtype=torch.float16, + ) + if mixed_precision + else None + ) + model = self._get_simple_nested_model(mixed_precision=mixed_precision) + optim = torch.optim.SGD(model.parameters(), lr=0.1) + initial_params = _get_full_detached_param(model) + for _ in range(6): + inp = torch.randn(1, 10, device=torch.cuda.current_device()) + output = model(*inp) + loss = output.sum() + expected_dtype = torch.float32 if mixed_precision is None else torch.float16 + self.assertEqual(expected_dtype, loss.dtype) + loss.backward() + optim.step() + + trained_params = _get_full_detached_param(model) + # Ensure some training occured + self.assertNotEqual(initial_params, trained_params) + # Save a copy of the state_dict + fsd_mgr = self._get_state_dict_mgr( + model, state_dict_type, state_dict_rank0_and_offload + ) + with fsd_mgr: + state_dict = model.state_dict() + if state_dict_type == "state_dict": + state_dict = {k: v.clone() for k, v in state_dict.items()} + else: + for sharded_tensor in state_dict.values(): + shard = sharded_tensor._local_shards[0] + shard.tensor = shard.tensor.clone().detach_() + self._validate_state_dict_contents(model, state_dict, state_dict_rank0_and_offload) + _zero_model(model) + + # Ensure checkpointed params have the full param dtype + for tensor in state_dict.values(): + self.assertEqual(tensor.dtype, torch.float32) + + # Load state_dict into zeroed model + if state_dict_rank0_and_offload: + # Broadcast the state dict and move it back to GPU in + # preparation for loading. 
+ state_dict = self._broadcast_state_dict(state_dict) + for key in state_dict.keys(): + state_dict[key] = state_dict[key].cuda() + + with FSDP.state_dict_type(model, STATE_DICT_MAPPING[state_dict_type]): + model.load_state_dict(state_dict) + loaded_params = _get_full_detached_param(model) + self.assertEqual(loaded_params, trained_params) + + def _initialize_model( + self, + wrap_fsdp: bool, + wrap_ddp: bool = True, + register_buffers: bool = False, + ): + # keep everything deterministic for input data + torch.manual_seed(0) + + model = Model(wrap_fsdp, register_buffers=register_buffers).cuda() + if wrap_fsdp: + model = FSDP(model) + elif wrap_ddp: + model = DistributedDataParallel(model, device_ids=[self.rank]) + return model + + @staticmethod + def _state_dict(model: Module, state_dict_type: str): + try: + enum_val = STATE_DICT_MAPPING[state_dict_type] + except KeyError: + raise ValueError(f"No state_dict type for {state_dict_type}") + + with FSDP.state_dict_type(model, enum_val): + return model.state_dict() + + @staticmethod + def _load_state_dict( + model: Module, state_dict_type: str, state_dict: Dict[str, Any] + ): + try: + enum_val = STATE_DICT_MAPPING[state_dict_type] + except KeyError: + raise ValueError(f"No state_dict for {state_dict_type}") + + with FSDP.state_dict_type(model, enum_val): + return model.load_state_dict(state_dict) + + def _dist_train(self, wrap_fsdp: bool, state_dict_type: str = ""): + # TODO: Move this test to common_fsdp. + model = self._initialize_model(wrap_fsdp) + optim = SGD(model.parameters(), lr=0.1) + + in_data = torch.rand(64, 4, requires_grad=True, device=torch.device("cuda")) + for _ in range(3): + out = model(in_data) + out.sum().backward() + optim.step() + optim.zero_grad() + + if wrap_fsdp: + blank_model = FSDP(Model(True).cuda()) + _zero_model(blank_model) + state_dict = self._state_dict(model, state_dict_type) + self._load_state_dict(blank_model, state_dict_type, state_dict) + return get_full_params(blank_model) + else: + return list(model.parameters()) + + @skip_if_lt_x_gpu(2) + @parametrize("state_dict_type", _SUPPORTED_STATE_DICT_IMPLS) + def test_state_dict_save_load_flow(self, state_dict_type): + fsdp_params = self._dist_train(wrap_fsdp=True, state_dict_type=state_dict_type) + ddp_params = self._dist_train(wrap_fsdp=False) + self.assertEqual(ddp_params, fsdp_params) + + @skip_if_lt_x_gpu(2) + @parametrize("state_dict_type", _SUPPORTED_STATE_DICT_IMPLS) + def test_fsdp_state_dict_keys(self, state_dict_type): + state_dict = self._state_dict(self._initialize_model(True), state_dict_type) + if state_dict_type == "local_state_dict": + self.assertEqual(set(["flat_param", "inner.flat_param"]), state_dict.keys()) + elif state_dict_type in ("state_dict", "sharded_state_dict"): + # Keys should match local model. + local_model = self._initialize_model(wrap_fsdp=False, wrap_ddp=False) + local_keys = local_model.state_dict().keys() + self.assertEqual(state_dict.keys(), local_keys) + else: + raise NotImplementedError(f"No test for {state_dict_type}!") + + @skip_if_lt_x_gpu(2) + @parametrize("state_dict_type", _UNFLATTENED_STATE_DICT_IMPLS) + @parametrize("state_dict_rank0_and_offload", [True, False]) + @parametrize("fsdp_root", [True, False]) + def test_state_dict_load_into_local_module( + self, state_dict_type, state_dict_rank0_and_offload, fsdp_root, + ): + """ + Tests that FSDP's state_dict can be loaded into a local model. 
+ """ + if state_dict_rank0_and_offload and state_dict_type != "state_dict": + return + if not fsdp_root: + model = self._get_non_fsdp_root_module() + else: + model = self._initialize_model(wrap_fsdp=True, register_buffers=True) + optim = SGD(model.parameters(), lr=0.1) + if not fsdp_root: + in_data = torch.randn(1, 10, requires_grad=True, device=torch.device("cuda")) + else: + in_data = torch.rand(64, 4, requires_grad=True, device=torch.device("cuda")) + for _ in range(3): + out = model(in_data) + out.sum().backward() + optim.step() + optim.zero_grad() + + with FullyShardedDataParallel.summon_full_params(model): + fsdp_params = deepcopy(list(model.parameters())) + + # get FSDP state_dict. Note that by default we return full_state_dict. + sd_mgr = self._get_state_dict_mgr( + model, state_dict_type, state_dict_rank0_and_offload + ) + with sd_mgr: + fsdp_state_dict = model.state_dict() + + ignore_keys = [k for k in fsdp_state_dict.keys() if NON_ROOT_FSDP_PREFIX in k] + self._validate_state_dict_contents( + model, fsdp_state_dict, state_dict_rank0_and_offload, ignore_keys=ignore_keys, + ) + # Create zeroed local model + if not fsdp_root: + blank_local_model = self._get_non_fsdp_root_module(wrap=False) + else: + blank_local_model = self._initialize_model( + wrap_fsdp=False, wrap_ddp=False, register_buffers=True + ) + + # Nothing should be FSDP + for mod in blank_local_model.modules(): + self.assertFalse(isinstance(mod, FSDP)) + + for param in blank_local_model.parameters(): + with torch.no_grad(): + param.zero_() + + fsdp_state_dict = _gather_state_dict(fsdp_state_dict) + + # Load fsdp's full state dict into the local and verify params are as + # expected. + if state_dict_rank0_and_offload: + # Broadcast + CUDA state_dict + if not isinstance(model, FSDP): + # Some portions of the model on rank 0 might not be on CPU, + # move everything to CPU to avoid running into + # https://github.com/pytorch/pytorch/issues/77113. + for k, t in fsdp_state_dict.items(): + if t.device != torch.device("cpu"): + fsdp_state_dict[k] = t.cpu() + fsdp_state_dict = self._broadcast_state_dict(fsdp_state_dict) + for key in fsdp_state_dict.keys(): + fsdp_state_dict[key] = fsdp_state_dict[key].cuda() + + # if self.rank == 0: + blank_local_model.load_state_dict(fsdp_state_dict) + local_params = list(blank_local_model.parameters()) + for fsdp_param, local_param in zip(fsdp_params, local_params): + self.assertEqual(fsdp_param, local_param) + + @skip_if_lt_x_gpu(2) + @parametrize("state_dict_type", _SUPPORTED_STATE_DICT_IMPLS) + @parametrize("double_nest", [True]) + def test_state_dict_skip_module(self, state_dict_type, double_nest): + torch.cuda.set_device(self.rank) + + def _create_module(wrap_fsdp=True): + LINEAR_SKIP = "linear_skip" + ctx = enable_wrap(wrapper_cls=FSDP) if wrap_fsdp else suppress() + with ctx: + module = SkipModel(double_nest=double_nest) + # Full name of linear_skip param tensors in SkipModel, as would be + # stored in checkpoint. 
+ linear_skip_tensor_names = [ + k + for k in dict(module.named_parameters()).keys() + if LINEAR_SKIP in k + ] + # skip SkipModule + linear_skip = getattr(module, LINEAR_SKIP) + delattr(module, LINEAR_SKIP) + # Wrap FSDP + fsdp = wrap(module) + # reattach + setattr(module, LINEAR_SKIP, linear_skip) + return fsdp, linear_skip_tensor_names + + fsdp, linear_skip_tensor_names = _create_module() + # Run a forward pass + inp = torch.randn((1, 10), device=torch.cuda.current_device()) + loss = fsdp(inp) + loss.sum().backward() + + with FSDP.state_dict_type(fsdp, STATE_DICT_MAPPING[state_dict_type]): + state_dict = fsdp.state_dict() + if self.rank == 0 and state_dict_type != "local_state_dict": + sd_keys = list(state_dict.keys()) + expected = list(SkipModel(double_nest=False).state_dict().keys()) + self.assertEqual(sorted(sd_keys), sorted(expected)) + # TODO: parameters in linear_skip_tensor_names should not be handled + # by FSDP.state_dict(). Have a check once this is implemented in + # FSDP.state_dict(). + + # Check that it can be loaded into FSDP. + new_fsdp, _ = _create_module() + _zero_model(new_fsdp) + for (p1, p2) in zip(fsdp.parameters(), new_fsdp.parameters()): + self.assertNotEqual(p1, p2) + with FSDP.state_dict_type(new_fsdp, STATE_DICT_MAPPING[state_dict_type]): + if state_dict_type != "local_state_dict": + # FlatParameter has not supported deepcopy yet. + state_dict = deepcopy(state_dict) + new_fsdp.load_state_dict(state_dict) + for (p1, p2) in zip(fsdp.parameters(), new_fsdp.parameters()): + self.assertEqual(p1, p2) + + # Test that the checkpoint can be loaded into a local model. + local, _ = _create_module(wrap_fsdp=False) + for param in local.parameters(): + with torch.no_grad(): + param.zero_() + + with fsdp.summon_full_params(fsdp): + for (p1, p2) in zip(fsdp.parameters(), local.parameters()): + self.assertNotEqual(p1, p2) + + if state_dict_type == "local_state_dict": + return + state_dict = _gather_state_dict(state_dict) + with fsdp.summon_full_params(fsdp): + if self.rank == 0: + local.load_state_dict(state_dict) + for (p1, p2) in zip(fsdp.parameters(), local.parameters()): + self.assertEqual(p1, p2) + + @skip_if_lt_x_gpu(2) + def test_wrong_state_dict_config(self): + model = FSDP(Model(wrap_fsdp=True).cuda()) + with self.assertRaisesRegex(RuntimeError, "Expected state_dict_config of type"): + with model.state_dict_type( + model, StateDictType.FULL_STATE_DICT, LocalStateDictConfig() + ): + pass + + @skip_if_lt_x_gpu(2) + def test_state_dict_with_ignored_modules(self): + # Initialize an FSDP-wrapped model with an ignored module that includes + # both parameters and a buffer + model = Model(wrap_fsdp=True, register_buffers=True).cuda() + ignored_modules = [model.outer] + ignored_tensor_to_tensor_name = { + model.outer.bias: "outer.bias", + model.outer.weight: "outer.weight", + model.outer.buffer: "outer.buffer", + } + buffer_to_buffer_name = { + model.inner.buffer: "inner.buffer", model.outer.buffer: "outer.buffer", + } + fsdp_model = FSDP(model, ignored_modules=ignored_modules) + with FSDP.state_dict_type(fsdp_model, StateDictType.FULL_STATE_DICT): + sd1 = fsdp_model.state_dict() + with FSDP.summon_full_params(fsdp_model): + fsdp_params = deepcopy(list(fsdp_model.parameters())) + # Check that the ignored parameters and all buffers are not cloned + for tensor, tensor_name in { + **ignored_tensor_to_tensor_name, + **buffer_to_buffer_name, + }.items(): + self.assertTrue(tensor_name in sd1) + self.assertEqual(tensor.data_ptr(), sd1[tensor_name].data_ptr()) + # Check that the state 
dict can be loaded into a non-wrapped version of + # the model + nonwrapped_model = Model(wrap_fsdp=False, register_buffers=True).cuda() + for param in nonwrapped_model.parameters(): + with torch.no_grad(): + param.zero_() + nonwrapped_model.load_state_dict(sd1) + local_params = list(nonwrapped_model.parameters()) + for fsdp_param, local_param in zip(fsdp_params, local_params): + self.assertEqual(fsdp_param, local_param) + # Check that if we save a state dict again, the ignored parameters and + # buffer still have the same data pointer + with FSDP.state_dict_type(fsdp_model, StateDictType.FULL_STATE_DICT): + sd2 = fsdp_model.state_dict() + for tensor, tensor_name in { + **ignored_tensor_to_tensor_name, + **buffer_to_buffer_name, + }.items(): + self.assertTrue(tensor_name in sd1) # check again just in case + self.assertTrue(tensor_name in sd2) + self.assertEqual(tensor.data_ptr(), sd2[tensor_name].data_ptr()) + self.assertEqual(sd1[tensor_name].data_ptr(), sd2[tensor_name].data_ptr()) + + @skip_if_lt_x_gpu(2) + def test_state_dict_type(self): + module = SkipModel(double_nest=True) + with enable_wrap(wrapper_cls=FSDP): + fsdp = wrap(module) + with FSDP.state_dict_type(fsdp, StateDictType.LOCAL_STATE_DICT): + pass + for module in FSDP.fsdp_modules(fsdp): + self.assertEqual(module._state_dict_type, StateDictType.FULL_STATE_DICT) + + +instantiate_parametrized_tests(TestFSDPStateDict) + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/fsdp/test_fsdp_summon_full_params.py b/test/distributed/fsdp/test_fsdp_summon_full_params.py index e2dee05178a2..dbacacc1450e 100644 --- a/test/distributed/fsdp/test_fsdp_summon_full_params.py +++ b/test/distributed/fsdp/test_fsdp_summon_full_params.py @@ -1,15 +1,23 @@ # Owner(s): ["oncall: distributed"] -import sys +import itertools import math +import sys +from copy import deepcopy import torch import torch.nn as nn from torch import distributed as dist +from torch.distributed.fsdp import CPUOffload, MixedPrecision +from torch.distributed.fsdp import FlatParameter from torch.distributed.fsdp import FullyShardedDataParallel as FSDP -from torch.distributed.fsdp import CPUOffload +from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel +from torch.distributed.fsdp.wrap import wrap, enable_wrap from torch.testing._internal.common_distributed import skip_if_lt_x_gpu from torch.testing._internal.common_fsdp import ( + FSDPInitMode, FSDPTest, + NestedWrappedModule, + DeterministicModel, ) from torch.testing._internal.common_utils import ( TEST_WITH_DEV_DBG_ASAN, @@ -31,6 +39,62 @@ sys.exit(0) +def _run_test_summon_full_param_writeback( + cls, writeback, modify_outer, *fsdp_args, **fsdp_kwargs +): + with enable_wrap(wrapper_cls=FSDP, *fsdp_args, **fsdp_kwargs): + lin1 = wrap(nn.Linear(5, 5, bias=False).cuda(cls.rank)) + lin2 = nn.Linear(5, 3, bias=False).cuda(cls.rank) + model = wrap(nn.Sequential(lin1, lin2)) + + # set the value + outer_param = model.get_parameter("_fsdp_wrapped_module.flat_param") + inner_param = model.get_parameter( + "_fsdp_wrapped_module._fpw_module.0._fsdp_wrapped_module.flat_param" + ) + p = outer_param if modify_outer else inner_param + + with torch.no_grad(): + # This sets the local shard value + p[0] = cls.rank + 2 + + with model.summon_full_params(model, writeback=writeback): + with torch.no_grad(): + p.copy_(torch.zeros_like(p)) + + if writeback or cls.world_size == 1: + # When world_size = 1, FSDP does not shard and parameter is not set to + # a local shard, so write is always 
reflected. + cls.assertEqual(p.cpu()[0], 0) + else: + cls.assertEqual(p.cpu()[0], cls.rank + 2) + + +class TestSummonFullParamsNoShard(FSDPTest): + @property + def world_size(self): + return 1 # does not shard + + @skip_if_lt_x_gpu(2) + @parametrize("writeback", [True, False]) + @parametrize("modify_outer", [True, False]) + @parametrize("mixed_precision", [True, False]) + # TODO: CPUOffload summon + writeback does not + # work when param is not sharded + # (currently when world_size == 1) + def test_summon_full_param_writeback( + self, writeback, modify_outer, mixed_precision + ): + mixed_precision = MixedPrecision() if mixed_precision else None + return _run_test_summon_full_param_writeback( + self, + writeback, + modify_outer=modify_outer, + cpu_offload=CPUOffload(offload_params=False), + mixed_precision=mixed_precision, + ) + + class TestSummonFullParams(FSDPTest): @property def world_size(self): @@ -44,50 +108,34 @@ def get_expected_sharded_size(self, global_size): return int(math.ceil(global_size / self.world_size)) @skip_if_lt_x_gpu(2) - @parametrize( - "writeback", - [True, False] - ) + @parametrize("writeback", [True, False]) @parametrize( "cpu_offload", - [CPUOffload(offload_params=True), CPUOffload(offload_params=False)] - ) - @parametrize( - "modify_outer", - [True, False] + [CPUOffload(offload_params=True), CPUOffload(offload_params=False)], ) - def test_summon_full_param_writeback(self, writeback, cpu_offload, modify_outer): - model = FSDP(nn.Sequential( - FSDP(nn.Linear(5, 5, bias=False)), - nn.Linear(5, 3, bias=False) - )).cuda(self.rank) - - # set the value - outer_param = model.get_parameter("_fsdp_wrapped_module.flat_param") - inner_param = model.get_parameter("_fsdp_wrapped_module._fpw_module.0._fsdp_wrapped_module.flat_param") - p = outer_param if modify_outer else inner_param - - with torch.no_grad(): - # This sets the local shard value - p[0] = self.rank + 2 - - with model._summon_full_params(writeback=writeback): - with torch.no_grad(): - p.copy_(torch.zeros_like(p)) - - if writeback: - self.assertEqual(p.cpu()[0], 0) - else: - self.assertEqual(p.cpu()[0], self.rank + 2) + @parametrize("mixed_precision", [True, False]) + @parametrize("modify_outer", [True, False]) + def test_summon_full_param_writeback( + self, writeback, cpu_offload, mixed_precision, modify_outer + ): + mixed_precision = MixedPrecision() if mixed_precision else None + return _run_test_summon_full_param_writeback( + self, + writeback, + modify_outer, + cpu_offload=cpu_offload, + mixed_precision=mixed_precision, + ) @skip_if_lt_x_gpu(2) - def test_summon_full_param_shard_value(self): - + @parametrize("mixed_precision", [True, False]) + def test_summon_full_param_shard_value(self, mixed_precision): + mixed_precision = MixedPrecision() if mixed_precision else None raw_model = nn.Linear(10, 11) raw_model_size = self.get_model_param_count(raw_model) expected_shard_size = self.get_expected_sharded_size(raw_model_size) - model = FSDP(raw_model.cuda(self.rank)) + model = FSDP(raw_model.cuda(self.rank), mixed_precision=mixed_precision) self.assertEqual(expected_shard_size, self.get_model_param_count(model)) # we're assuming a single flatenned param @@ -95,29 +143,31 @@ def test_summon_full_param_shard_value(self): my_shard = torch.clone(next(model.parameters())) - with model._summon_full_params(): + with model.summon_full_params(model): self.assertEqual(raw_model_size, self.get_model_param_count(model)) - all_shards = next(model.parameters()) + parameters = list(model.parameters()) + all_shards = 
FlatParameter(parameters, requires_grad=False) my_slice = torch.chunk(all_shards, self.world_size)[self.rank] # shards are padded but the full_param tensor is not - a, b = my_shard[0: my_slice.numel()], my_slice - self.assertTrue(torch.equal(my_shard[0: my_slice.numel()].cpu(), my_slice.cpu())) + a, b = my_shard[0 : my_slice.numel()], my_slice + self.assertTrue( + torch.equal(my_shard[0 : my_slice.numel()].cpu(), my_slice.cpu()) + ) @skip_if_lt_x_gpu(2) - @parametrize( - "recurse", - [True, False] - ) - @parametrize( - "summon_outer", - [True, False] - ) - def test_summon_full_param_recursive(self, recurse, summon_outer): - model = FSDP(nn.Sequential( - FSDP(nn.Linear(5, 5, bias=False)), - nn.Linear(5, 3, bias=False) - )).cuda(self.rank) + @parametrize("recurse", [True, False]) + @parametrize("summon_outer", [True, False]) + @parametrize("mixed_precision", [True, False]) + def test_summon_full_param_recursive(self, recurse, summon_outer, mixed_precision): + mixed_precision = MixedPrecision() if mixed_precision else None + model = FSDP( + nn.Sequential( + FSDP(nn.Linear(5, 5, bias=False), mixed_precision=mixed_precision), + nn.Linear(5, 3, bias=False), + ), + mixed_precision=mixed_precision, + ).cuda(self.rank) global_inner_numel = self.get_model_param_count(nn.Linear(5, 5, bias=False)) global_outer_numel = self.get_model_param_count(nn.Linear(5, 3, bias=False)) @@ -126,7 +176,9 @@ def test_summon_full_param_recursive(self, recurse, summon_outer): shard_outer_numel = int(math.ceil(global_outer_numel / self.world_size)) outer_param = model.get_parameter("_fsdp_wrapped_module.flat_param") - inner_param = model.get_parameter("_fsdp_wrapped_module._fpw_module.0._fsdp_wrapped_module.flat_param") + inner_param = model.get_parameter( + "_fsdp_wrapped_module._fpw_module.0._fsdp_wrapped_module.flat_param" + ) self.assertEqual(shard_outer_numel, outer_param.numel()) self.assertEqual(shard_inner_numel, inner_param.numel()) @@ -135,9 +187,11 @@ def test_summon_full_param_recursive(self, recurse, summon_outer): expected_outer_numel = global_outer_numel if summon_outer else shard_outer_numel # inner is summoned if _summon_full_param is called with recursion or on the inner FSDP module - expected_inner_numel = global_inner_numel if recurse or not summon_outer else shard_inner_numel + expected_inner_numel = ( + global_inner_numel if recurse or not summon_outer else shard_inner_numel + ) - with model_to_summon._summon_full_params(recurse=recurse): + with model_to_summon.summon_full_params(model_to_summon, recurse=recurse): self.assertEqual(expected_outer_numel, outer_param.numel()) self.assertEqual(expected_inner_numel, inner_param.numel()) @@ -149,14 +203,15 @@ def __init__(self): self.a = nn.Parameter(torch.zeros(5)) def forward(self, fsdp_module): - with fsdp_module._summon_full_params(): + with fsdp_module.summon_full_params(fsdp_module): pass model = FSDP(MyModule()).cuda(self.rank) - with self.assertRaisesRegex(ValueError, "current state is TrainingState_.FORWARD"): + with self.assertRaisesRegex( + ValueError, "current state is TrainingState_.FORWARD" + ): model(model) - @skip_if_lt_x_gpu(2) def test_cannot_summon_full_params_from_backward(self): model = FSDP(nn.Linear(2, 1)).cuda(self.rank) @@ -164,39 +219,50 @@ def test_cannot_summon_full_params_from_backward(self): output = model(torch.ones(2).cuda(self.rank)) def bad_backwards_hook(tensor): - with model._summon_full_params(): + with model.summon_full_params(model): pass return None self.assertTrue(output.requires_grad) 
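        # The registered hook calls summon_full_params() during backward(), which should
        # raise because FSDP is then in the BACKWARD_PRE training state (asserted below).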
output.register_hook(bad_backwards_hook) - with self.assertRaisesRegex(ValueError, "current state is TrainingState_.BACKWARD_PRE"): + with self.assertRaisesRegex( + ValueError, "current state is TrainingState_.BACKWARD_PRE" + ): output.backward() - @skip_if_lt_x_gpu(2) - def test_summon_full_params_respects_reshard_after_forward(self): - model = FSDP(nn.Sequential( - FSDP(nn.Linear(5, 5, bias=False)), - nn.Linear(5, 3, bias=False) - )).cuda(self.rank) + @parametrize("mixed_precision", [True, False]) + def test_summon_full_params_respects_reshard_after_forward(self, mixed_precision): + mixed_precision = MixedPrecision() if mixed_precision else None + model = FSDP( + nn.Sequential( + FSDP(nn.Linear(5, 5, bias=False), mixed_precision=mixed_precision), + nn.Linear(5, 3, bias=False), + ), + mixed_precision=mixed_precision, + ).cuda(self.rank) outer_param = model.get_parameter("_fsdp_wrapped_module.flat_param") - inner_param = model.get_parameter("_fsdp_wrapped_module._fpw_module.0._fsdp_wrapped_module.flat_param") + inner_param = model.get_parameter( + "_fsdp_wrapped_module._fpw_module.0._fsdp_wrapped_module.flat_param" + ) outer_full_param_size = outer_param.numel() * self.world_size # trigger lazy init model(torch.zeros(5).cuda(self.rank)) - # the root FSDP module keeps all params around - self.assertEqual(outer_full_param_size, outer_param._full_param_padded.storage().size()) + self.assertEqual( + outer_full_param_size, outer_param._full_param_padded.storage().size() + ) self.assertEqual(0, inner_param._full_param_padded.storage().size()) - # similarly _summon_full_params should have the same behavior - with model._summon_full_params(): + # similarly summon_full_params should have the same behavior + with model.summon_full_params(model): pass - self.assertEqual(outer_full_param_size, outer_param._full_param_padded.storage().size()) + self.assertEqual( + outer_full_param_size, outer_param._full_param_padded.storage().size() + ) self.assertEqual(0, inner_param._full_param_padded.storage().size()) @skip_if_lt_x_gpu(2) @@ -210,7 +276,7 @@ def test_summon_single_param(self): # This sets the local shard value p[0] = self.rank + 2 - with model._summon_full_params(writeback=True): + with model.summon_full_params(model, writeback=True): self.assertEqual(1, p.numel()) with torch.no_grad(): p.copy_(torch.zeros_like(p)) @@ -222,21 +288,97 @@ def test_summon_single_param(self): self.assertEqual(self.rank + 2, p[0]) @skip_if_lt_x_gpu(2) - def test_reshard_outside_forward_backward_iteration(self): - model = FSDP(nn.Sequential( - FSDP(nn.Linear(5, 5, bias=False)), - nn.Linear(5, 1, bias=False) - )).cuda(self.rank) + @parametrize("rank0_only", [True, False]) + @parametrize("offload_to_cpu", [True, False]) + def test_summon_full_params_equivalence(self, rank0_only, offload_to_cpu): + offload = CPUOffload(offload_params=True) + model = FSDP( + DeterministicModel(wrap_fsdp=True, cpu_offload=offload), cpu_offload=offload + ) + local_model = DeterministicModel(wrap_fsdp=False) + + dev = ( + torch.device("cpu") + if offload_to_cpu + else torch.device("cuda", torch.cuda.current_device()) + ) + + params_to_compare = ( + [p.clone() for p in model.parameters()] + if rank0_only and self.rank != 0 + else list(local_model.parameters()) + ) + + with model.summon_full_params( + model, + recurse=True, + rank0_only=rank0_only, + writeback=not rank0_only, + offload_to_cpu=offload_to_cpu, + ): + # Below sleep causes failures without stream synchronization in + # summon_full_params fix. 
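+                # torch.cuda._sleep busy-waits the GPU stream for the given number of cycles,
+                # giving any missing stream synchronization a chance to surface.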
+ torch.cuda._sleep(1000000) + # FSDP param deepcopy() of params has issues + fsdp_params = [p.clone() for p in model.parameters()] + + self.assertEqual(fsdp_params, params_to_compare) + + @skip_if_lt_x_gpu(2) + def test_summon_from_non_fsdp(self): + class FSDPContainer(nn.Module): + def __init__(self, fsdp_1, fsdp_2, fsdp_3): + super().__init__() + self.fsdp_1 = fsdp_1 + self.fsdp_2 = fsdp_2 + self.fsdp_3 = fsdp_3 + + model_fsdp = FSDPContainer( + FSDP(DeterministicModel(wrap_fsdp=True)), + FSDP(DeterministicModel(wrap_fsdp=True)), + DeterministicModel(wrap_fsdp=False), + ) + model_no_fsdp = FSDPContainer( + DeterministicModel(wrap_fsdp=False), + DeterministicModel(wrap_fsdp=False), + DeterministicModel(wrap_fsdp=False), + ) + + params_to_compare = list(model_no_fsdp.parameters()) + with FullyShardedDataParallel.summon_full_params(model_fsdp): + fsdp_params = [p.clone() for p in model_fsdp.parameters()] + + self.assertEqual(params_to_compare, fsdp_params) + + @skip_if_lt_x_gpu(2) + @parametrize("rank0_only", [True, False]) + @parametrize("offload_to_cpu", [True, False]) + @parametrize("mixed_precision", [True, False]) + def test_reshard_outside_forward_backward_iteration( + self, rank0_only, offload_to_cpu, mixed_precision + ): + mixed_precision = MixedPrecision() if mixed_precision else None + model = FSDP( + nn.Sequential( + FSDP(nn.Linear(5, 5, bias=False), mixed_precision=mixed_precision), + nn.Linear(5, 1, bias=False), + ), + mixed_precision=mixed_precision, + ).cuda(self.rank) outer_param = model.get_parameter("_fsdp_wrapped_module.flat_param") - inner_param = model.get_parameter("_fsdp_wrapped_module._fpw_module.0._fsdp_wrapped_module.flat_param") + inner_param = model.get_parameter( + "_fsdp_wrapped_module._fpw_module.0._fsdp_wrapped_module.flat_param" + ) outer_full_param_size = outer_param.numel() * self.world_size # First lets validate our assumption about resharding output = model(torch.zeros(5).cuda(self.rank)) # the root FSDP module keeps all params around - self.assertEqual(outer_full_param_size, outer_param._full_param_padded.storage().size()) + self.assertEqual( + outer_full_param_size, outer_param._full_param_padded.storage().size() + ) self.assertEqual(0, inner_param._full_param_padded.storage().size()) output.backward() @@ -247,31 +389,171 @@ def test_reshard_outside_forward_backward_iteration(self): # now lets repeat it with summon done in between output = model(torch.zeros(5).cuda(self.rank)) - with model._summon_full_params(): + self.assertEqual( + outer_full_param_size, outer_param._full_param_padded.storage().size() + ) + self.assertEqual(0, inner_param._full_param_padded.storage().size()) + with model.summon_full_params( + model, + rank0_only=rank0_only, + writeback=not rank0_only, + offload_to_cpu=offload_to_cpu, + ): pass - self.assertEqual(outer_full_param_size, outer_param._full_param_padded.storage().size()) + self.assertEqual( + outer_full_param_size, outer_param._full_param_padded.storage().size() + ) self.assertEqual(0, inner_param._full_param_padded.storage().size()) output.backward() - with model._summon_full_params(): + with model.summon_full_params( + model, + rank0_only=rank0_only, + writeback=not rank0_only, + offload_to_cpu=offload_to_cpu, + ): pass self.assertEqual(0, outer_param._full_param_padded.storage().size()) self.assertEqual(0, inner_param._full_param_padded.storage().size()) + @skip_if_lt_x_gpu(2) + @parametrize("rank0_only", [True, False]) + @parametrize("offload_to_cpu", [True, False]) + @parametrize("mixed_precision", [True, False]) + 
def test_params_are_unflattenned(self, rank0_only, offload_to_cpu, mixed_precision): + layer_shape = (10, 12) + model = nn.Linear(*layer_shape, bias=False).cuda(self.rank) + mixed_precision = MixedPrecision() if mixed_precision else None + fsdp_model = FSDP(deepcopy(model), mixed_precision=mixed_precision).cuda( + self.rank + ) + + def _get_flat_param(): + return fsdp_model.get_parameter("_fsdp_wrapped_module.flat_param") + + flattened_param = _get_flat_param() + self.assertEqual(layer_shape[0] * layer_shape[1] / 2, flattened_param.numel()) + + with fsdp_model.summon_full_params( + fsdp_model, + rank0_only=rank0_only, + writeback=not rank0_only, + offload_to_cpu=offload_to_cpu, + ): + if self.rank == 0 or not rank0_only: + self.assertEqual(fsdp_model.weight.shape, model.weight.shape) + expected_device = ( + torch.device("cpu") + if offload_to_cpu + else torch.device("cuda", torch.cuda.current_device()) + ) + self.assertTrue(expected_device == fsdp_model.weight.device) + else: + # Nonzero rank with rank0_only maintains original params. + flat_within_ctx = _get_flat_param() + self.assertEqual(flat_within_ctx, flattened_param) + self.assertEqual( + flat_within_ctx.device, torch.device(torch.cuda.current_device()) + ) + + # CPU offload should restore the param device + param = next(fsdp_model.parameters()) + self.assertTrue( + param.device == torch.device("cuda", torch.cuda.current_device()) + ) + + @skip_if_lt_x_gpu(2) + @parametrize("rank0_only", [True, False]) + @parametrize("offload_to_cpu", [True, False]) + @parametrize("mixed_precision", [True, False]) + def test_params_count_and_value(self, rank0_only, offload_to_cpu, mixed_precision): + mixed_precision = MixedPrecision() if mixed_precision else None + fsdp_model = FSDP( + NestedWrappedModule( + group=dist.distributed_c10d._get_default_group(), + wrap_fsdp=True, + fsdp_init_mode=FSDPInitMode.CUDA_BEFORE, + mixed_precision=mixed_precision, + ), + mixed_precision=mixed_precision, + ) + model = NestedWrappedModule( + group=dist.distributed_c10d._get_default_group(), + wrap_fsdp=False, + fsdp_init_mode=FSDPInitMode.CUDA_BEFORE, + ) + + dev = ( + torch.device("cpu") + if offload_to_cpu + else torch.device("cuda", torch.cuda.current_device()) + ) + + params_to_compare = ( + [p.to(dev) for p in model.module.parameters()] + if not rank0_only or self.rank == 0 + else list(p.clone() for p in fsdp_model.parameters()) + ) + with fsdp_model.summon_full_params( + fsdp_model, rank0_only=rank0_only, writeback=not rank0_only + ): + for p1, p2 in itertools.zip_longest( + fsdp_model.parameters(), params_to_compare + ): + self.assertEqual(p1, p2) + + # CPU offload should restore the param device + param = next(fsdp_model.parameters()) + self.assertTrue( + param.device == torch.device("cuda", torch.cuda.current_device()) + ) @skip_if_lt_x_gpu(2) - def test_params_are_unflatenned(self): - model = FSDP(nn.Linear(self.world_size, 1, bias=False)).cuda(self.rank) + def test_raises_rank0_with_writeback(self): + fsdp_model = FSDP( + NestedWrappedModule( + group=dist.distributed_c10d._get_default_group(), + wrap_fsdp=True, + fsdp_init_mode=FSDPInitMode.CUDA_BEFORE, + ) + ) + + with self.assertRaisesRegex(ValueError, "is not supported"): + with fsdp_model.summon_full_params( + fsdp_model, rank0_only=True, writeback=True + ): + pass - flattened_param = model.get_parameter("_fsdp_wrapped_module.flat_param") - self.assertEqual(1, flattened_param.numel()) + @skip_if_lt_x_gpu(2) + @parametrize("prefix", ["", "test_prefix"]) + @parametrize("recurse", [False, True]) + def 
test_named_parameters_buffers(self, prefix: str, recurse: bool): + fsdp_model = FSDP( + NestedWrappedModule( + group=dist.distributed_c10d._get_default_group(), + wrap_fsdp=True, + fsdp_init_mode=FSDPInitMode.CUDA_BEFORE, + ) + ) + fsdp_model.register_buffer("buffer", torch.ones(1)) + model = NestedWrappedModule( + group=dist.distributed_c10d._get_default_group(), + wrap_fsdp=False, + fsdp_init_mode=FSDPInitMode.CUDA_BEFORE, + ) + model.register_buffer("buffer", torch.ones(1)) + with fsdp_model.summon_full_params(fsdp_model): + for call in ["named_parameters", "named_buffers"]: + for (n1, p1), (n2, p2) in itertools.zip_longest( + getattr(fsdp_model, call)(prefix=prefix, recurse=recurse), + getattr(model, call)(prefix=prefix, recurse=recurse), + ): + self.assertEqual(n1, n2) + self.assertEqual(p1, p2) - with model._summon_full_params(): - a = model.weight.flatten().detach() - b = flattened_param.detach() - self.assertTrue(torch.equal(a, b)) instantiate_parametrized_tests(TestSummonFullParams) +instantiate_parametrized_tests(TestSummonFullParamsNoShard) if __name__ == "__main__": diff --git a/test/distributed/fsdp/test_fsdp_traversal.py b/test/distributed/fsdp/test_fsdp_traversal.py new file mode 100644 index 000000000000..69ceca082441 --- /dev/null +++ b/test/distributed/fsdp/test_fsdp_traversal.py @@ -0,0 +1,57 @@ +# Owner(s): ["oncall: distributed"] + +import sys + +from torch import distributed as dist +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.testing._internal.common_distributed import skip_if_lt_x_gpu +from torch.testing._internal.common_fsdp import ( + FSDPTest, + NestedWrappedModule, +) +from torch.testing._internal.common_utils import ( + TEST_WITH_DEV_DBG_ASAN, + run_tests, +) + + +if not dist.is_available(): + print("Distributed not available, skipping tests", file=sys.stderr) + sys.exit(0) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + +class TestTraversal(FSDPTest): + @property + def world_size(self): + return 2 + + @skip_if_lt_x_gpu(2) + def test_fsdp_modules(self): + group = dist.distributed_c10d._get_default_group() + model = NestedWrappedModule(group, wrap_fsdp=True) + modules = FSDP.fsdp_modules(model) + self.assertEquals( + modules, [ + model.module.get_submodule("1"), + model.module.get_submodule("1").get_submodule("0"), + model.module.get_submodule("2"), + ] + ) + modules = FSDP.fsdp_modules(model, root_only=True) + self.assertEqual( + modules, [ + model.module.get_submodule("1"), + model.module.get_submodule("2"), + ] + ) + + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/fsdp/test_fsdp_uneven.py b/test/distributed/fsdp/test_fsdp_uneven.py index 59b111d6d3cd..93b89f547e1f 100644 --- a/test/distributed/fsdp/test_fsdp_uneven.py +++ b/test/distributed/fsdp/test_fsdp_uneven.py @@ -10,7 +10,6 @@ from torch.testing._internal.common_distributed import skip_if_lt_x_gpu from torch.testing._internal.common_fsdp import ( FSDPTest, - get_full_params, ) from torch.testing._internal.common_utils import TEST_WITH_DEV_DBG_ASAN, run_tests @@ -61,11 +60,13 @@ def test_one_iteration(self): out.float().sum().backward() optim.step() optim.zero_grad() - get_full_params(model) - weight_out = model.module.weight.T.clone() - self.assertEqual(ref_forward_output_my_rank, out) - self.assertEqual(ref_weight_out, weight_out) + with model.summon_full_params(model): + torch.cuda.synchronize() # TODO: This is here because it was + # 
originally part of get_full_params(), debug why it is needed here. + weight_out = model.module.weight.T.clone() + self.assertEqual(ref_forward_output_my_rank, out) + self.assertEqual(ref_weight_out, weight_out) if __name__ == "__main__": diff --git a/test/distributed/fsdp/test_shard_utils.py b/test/distributed/fsdp/test_shard_utils.py new file mode 100644 index 000000000000..1d24b2e3c681 --- /dev/null +++ b/test/distributed/fsdp/test_shard_utils.py @@ -0,0 +1,187 @@ +# Owner(s): ["oncall: distributed"] + +import torch +from torch.distributed._shard.sharded_tensor import ( + init_from_local_shards, + Shard, + ShardMetadata, +) +from torch.distributed._shard.sharding_spec import ( + ChunkShardingSpec, + EnumerableShardingSpec, +) +from torch.distributed.distributed_c10d import _get_default_group +from torch.distributed.fsdp.shard_utils import ( + _offsets_to_split_sizes, + _reshard_flatten_tensor, +) +from torch.testing._internal.common_distributed import skip_if_lt_x_gpu +from torch.testing._internal.common_fsdp import FSDPTest +from torch.testing._internal.common_utils import TestCase + + +class TestShardUtils(TestCase): + def test_offsets_to_split_sizes(self): + tensor_numel = 40 + + def _get_and_check_split_sizes( + world_size, + in_offsets, + out_offsets, + in_split_sizes, + ): + + for my_rank in range(world_size): + _in_split_sizes = in_split_sizes[my_rank] + _out_split_sizes = [ + in_split_sizes[i][my_rank] for i in range(world_size) + ] + res_in_split_sizes, res_out_split_sizes = _offsets_to_split_sizes( + in_offsets, out_offsets, tensor_numel, world_size, my_rank + ) + self.assertEqual(_in_split_sizes, res_in_split_sizes) + self.assertEqual(_out_split_sizes, res_out_split_sizes) + + # The tensor size can be evenly divided by the world size. + world_size = 4 + in_offsets = [0, 10, 20, 30] + out_offsets = [0, 10, 20, 30] + in_split_sizes = [ + [10, 0, 0, 0], + [0, 10, 0, 0], + [0, 0, 10, 0], + [0, 0, 0, 10], + ] + _get_and_check_split_sizes(world_size, in_offsets, out_offsets, in_split_sizes) + + world_size = 4 + in_offsets = [0, 3, 17, 18] + out_offsets = [0, 10, 20, 30] + in_split_sizes = [ + [3, 0, 0, 0], + [7, 7, 0, 0], + [0, 1, 0, 0], + [0, 2, 10, 10], + ] + _get_and_check_split_sizes(world_size, in_offsets, out_offsets, in_split_sizes) + + world_size = 4 + in_offsets = [0, 10, 20, 30] + out_offsets = [0, 3, 17, 18] + in_split_sizes = [ + [3, 7, 0, 0], + [0, 7, 1, 2], + [0, 0, 0, 10], + [0, 0, 0, 10], + ] + _get_and_check_split_sizes(world_size, in_offsets, out_offsets, in_split_sizes) + + world_size = 4 + in_offsets = [0, 7, 11, 25] + out_offsets = [0, 10, 17, 18] + in_split_sizes = [ + [7, 0, 0, 0], + [3, 1, 0, 0], + [0, 6, 1, 7], + [0, 0, 0, 15], + ] + _get_and_check_split_sizes(world_size, in_offsets, out_offsets, in_split_sizes) + + # The tensor size cannot be evenly divided by the world size. 
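+        # With tensor_numel = 40 and world_size = 6, ranks 0-4 each hold 7 elements and
+        # rank 5 holds the remaining 5, matching the offsets below.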
+ world_size = 6 + in_offsets = [0, 7, 14, 21, 28, 35] + out_offsets = [0, 7, 14, 21, 28, 35] + in_split_sizes = [ + [7, 0, 0, 0, 0, 0], + [0, 7, 0, 0, 0, 0], + [0, 0, 7, 0, 0, 0], + [0, 0, 0, 7, 0, 0], + [0, 0, 0, 0, 7, 0], + [0, 0, 0, 0, 0, 5], + ] + _get_and_check_split_sizes(world_size, in_offsets, out_offsets, in_split_sizes) + + world_size = 6 + in_offsets = [0, 0, 10, 11, 28, 40] + out_offsets = [0, 7, 14, 21, 28, 35] + in_split_sizes = [ + [0, 0, 0, 0, 0, 0], + [7, 3, 0, 0, 0, 0], + [0, 1, 0, 0, 0, 0], + [0, 3, 7, 7, 0, 0], + [0, 0, 0, 0, 7, 5], + [0, 0, 0, 0, 0, 0], + ] + _get_and_check_split_sizes(world_size, in_offsets, out_offsets, in_split_sizes) + + +class TestShardUtilsDistributed(FSDPTest): + @property + def world_size(self): + return 2 + + def _create_local_chunk(self, tensor): + chunk = tensor.chunk(2)[self.rank] + offsets = [0] if self.rank == 0 else [tensor.shape[0] - chunk.shape[0]] + shard = Shard.from_tensor_and_offsets(chunk, offsets, self.rank) + return init_from_local_shards([shard], tensor.numel()) + + def _create_enumerate_spec(self, tensor): + # Since placement is not used, always set placement to rank0 to mimic + # the actual usage. + metadata = [ + ShardMetadata([0], [101], placement="rank0/cuda:0"), + ShardMetadata([101], [900], placement="rank0/cuda:0"), + ] + return EnumerableShardingSpec(metadata) + + def _create_chunk_spec(self): + return ChunkShardingSpec(dim=0, placements=["rank0/cuda:0"]) + + def _create_tensor(self): + # Keep everything deterministic. + torch.manual_seed(0) + return torch.rand(1001).cuda() + + @skip_if_lt_x_gpu(2) + def test_reshard_flatten_tensor(self): + def get_offsets(tensor, shard): + if self.rank == 0: + return [0] + else: + return [tensor.shape[0] - shard.shape[0]] + + tensor = self._create_tensor() + + shard = _reshard_flatten_tensor( + self._create_local_chunk(tensor), + self._create_enumerate_spec(tensor), + self.world_size, + self.rank, + tensor.device, + _get_default_group(), + ) + offsets = [0] if self.rank == 0 else [tensor.shape[0] - shard.shape[0]] + shard = Shard.from_tensor_and_offsets(shard, offsets, self.rank) + uneven_sharded_tensor = init_from_local_shards([shard], tensor.numel()) + + shard = _reshard_flatten_tensor( + uneven_sharded_tensor, + self._create_chunk_spec(), + self.world_size, + self.rank, + tensor.device, + _get_default_group(), + ) + offsets = [0] if self.rank == 0 else [tensor.shape[0] - shard.shape[0]] + shard = Shard.from_tensor_and_offsets(shard, offsets, self.rank) + even_sharded_tensor = init_from_local_shards([shard], tensor.numel()) + + output = torch.empty(tensor.shape).cuda() if self.rank == 0 else None + even_sharded_tensor.gather(0, output) + if self.rank == 0: + self.assertEqual(tensor, output) + output = torch.empty(tensor.shape).cuda() if self.rank == 0 else None + uneven_sharded_tensor.gather(0, output) + if self.rank == 0: + self.assertEqual(tensor, output) diff --git a/test/distributed/fsdp/test_utils.py b/test/distributed/fsdp/test_utils.py index 2fdede0e4a05..2326c7137c30 100644 --- a/test/distributed/fsdp/test_utils.py +++ b/test/distributed/fsdp/test_utils.py @@ -1,14 +1,17 @@ # Owner(s): ["oncall: distributed"] +from collections import OrderedDict import random import sys import unittest import torch +import torch.nn as nn from torch import distributed as dist -from torch.distributed.fsdp.utils import ( +from torch.distributed.fsdp._utils import ( _apply_to_tensors, ) +from torch.distributed.utils import _replace_by_prefix from torch.testing._internal.common_utils import ( 
TEST_WITH_DEV_DBG_ASAN, instantiate_parametrized_tests, @@ -57,7 +60,7 @@ def get_a_tensor(): data.append({"key1": get_a_tensor(), "key2": {1: get_a_tensor()}, "key3": 3}) data.insert(0, set(["x", get_a_tensor(), get_a_tensor()])) data.append(([1], get_a_tensor(), (1), [get_a_tensor()], set((1, 2)))) - od = dict() + od = OrderedDict() od["k"] = "value" data.append(od) @@ -73,6 +76,39 @@ def fn(t): for i, v in enumerate(data): self.assertEqual(type(new_data[i]), type(v)) + def test_replace_by_prefix(self): + state_dict = { + "layer.a": torch.tensor(1), + "abc.layer.def": torch.tensor(2), + "layer.b": torch.tensor(3), + } + original_state_dict = state_dict.copy() + _replace_by_prefix(state_dict, "layer.", "module.layer.") + assert state_dict == { + "module.layer.a": torch.tensor(1), + "abc.layer.def": torch.tensor(2), + "module.layer.b": torch.tensor(3), + } + _replace_by_prefix(state_dict, "module.layer.", "layer.") + assert state_dict == original_state_dict + + + def test_packed_sequence(self): + """Test to ensure RNN packed sequences are modified correctly.""" + rnn = nn.RNN(5, 5) + + x = torch.rand((5, 1, 5), dtype=torch.float) + seq_length = torch.tensor([4], dtype=torch.int) + + def fill_fn(x): + x.fill_(0) + + x = nn.utils.rnn.pack_padded_sequence(x, seq_length) + x, h = rnn(x) + x = _apply_to_tensors(fill_fn, x) + x, _ = nn.utils.rnn.pad_packed_sequence(x) + self.assertEqual(torch.sum(x), 0) + instantiate_parametrized_tests(TestUtils) diff --git a/test/distributed/fsdp/test_wrap.py b/test/distributed/fsdp/test_wrap.py index 0b4c1f8acc6c..2a72860d1f5b 100644 --- a/test/distributed/fsdp/test_wrap.py +++ b/test/distributed/fsdp/test_wrap.py @@ -5,7 +5,6 @@ import os import tempfile import unittest - import torch import torch.nn as nn import torch.nn.functional as F @@ -15,9 +14,13 @@ BackwardPrefetch, ) from torch.distributed.fsdp.wrap import ( - default_auto_wrap_policy, + always_wrap_policy, + size_based_auto_wrap_policy, enable_wrap, + _or_policy, wrap, + _wrap_batchnorm_individually, + transformer_auto_wrap_policy, ) from torch.testing._internal.common_distributed import ( skip_if_lt_x_gpu, @@ -27,6 +30,7 @@ FSDPTest, FSDPInitMode, _maybe_cuda, + TransformerWithSharedParams, ) from torch.testing._internal.common_utils import ( FILE_SCHEMA, @@ -36,6 +40,16 @@ parametrize, instantiate_parametrized_tests, ) +from torch.nn import TransformerEncoderLayer, TransformerDecoderLayer + +class BatchNormNet(nn.Module): + def __init__(self): + super().__init__() + self.lin = nn.Linear(10, 10, bias=False) + self.bn1 = nn.BatchNorm1d(10) + self.bn2 = nn.BatchNorm2d(10) + self.bn3 = nn.BatchNorm3d(10) + self.sync_bn = nn.SyncBatchNorm(10) class WrapMethod(Enum): FSDP_CTOR = auto() @@ -67,6 +81,15 @@ def get_model(cuda=True): sequential = sequential.cuda() return sequential + @staticmethod + def verify_model_all_wrapped(cls, model): + cls.assertTrue(isinstance(model, FSDP)) + cls.assertTrue(isinstance(model.module[0], FSDP)) + cls.assertTrue(isinstance(model.module[1], FSDP)) + cls.assertTrue(isinstance(model.module[2], FSDP)) + cls.assertTrue(isinstance(model.module[2].module[0], FSDP)) + cls.assertTrue(isinstance(model.module[2].module[1], FSDP)) + @staticmethod def verify_model(cls, model): cls.assertTrue(isinstance(model, FSDP)) @@ -123,7 +146,76 @@ def test_error_already_wrapped(self, nested, fsdp_init_mode): wrapped_fsdp = wrapped_fsdp.cuda() with self.assertRaisesRegex(ValueError, "to NOT be FullyShardedDataParallel"): - mod = FSDP(wrapped_fsdp, fsdp_auto_wrap_policy=default_auto_wrap_policy) 
+ mod = FSDP(wrapped_fsdp, auto_wrap_policy=size_based_auto_wrap_policy) + + @skip_if_lt_x_gpu(2) + @parametrize("use_or_policy", [True, False]) + def test_wrap_batchnorm_individually(self, use_or_policy): + def never_wrap_policy(*args, **kwargs): + return False + + policy = ( + functools.partial( + _or_policy, + policies=[never_wrap_policy, _wrap_batchnorm_individually] + ) if use_or_policy else _wrap_batchnorm_individually + ) + model = BatchNormNet() + fsdp = FSDP(model, auto_wrap_policy=policy) + # Batchnorms should be wrapped + for layer in [fsdp.bn1, fsdp.bn2, fsdp.bn3, fsdp.sync_bn]: + self.assertTrue(isinstance(layer, FSDP)) + + self.assertFalse(isinstance(fsdp.lin, FSDP)) + + @skip_if_lt_x_gpu(2) + def test_bn_always_wrapped_individually(self): + """ + Ensures that by using _or_policy with _wrap_batchnorm_individually, even + if the other policy results in a module containing a BN unit being + wrapped, the contained BN unit will still be individually wrapped. + """ + class MyModule(nn.Module): + def __init__(self): + super().__init__() + self.bn_container = BatchNormNet() + + def wrap_bn_container(module, recurse, *args, **kwargs): + if recurse: + return True + return isinstance(module, BatchNormNet) + + my_policy = functools.partial( + _or_policy, + policies=[wrap_bn_container, _wrap_batchnorm_individually] + ) + mod = MyModule() + fsdp = FSDP(mod, auto_wrap_policy=my_policy) + + # Wrapping should be FSDP(FSDP(BatchNormNet(FSDP(BN)))) + # and not FSDP(FSDP(BatchNormNet(BN))) (in the latter the inner + # BN is not individually wrapped.) + + for bn in [ + fsdp.bn_container.bn1, + fsdp.bn_container.bn2, + fsdp.bn_container.bn3, + fsdp.bn_container.sync_bn + ]: + self.assertTrue(isinstance(bn, FSDP)) + + # if we just wrapped BN container, individual batchnorms are not + # wrapped. 
+ mod = MyModule() + fsdp = FSDP(mod, auto_wrap_policy=wrap_bn_container) + self.assertTrue(isinstance(mod.bn_container, FSDP)) + for bn in [ + fsdp.bn_container.bn1, + fsdp.bn_container.bn2, + fsdp.bn_container.bn3, + fsdp.bn_container.sync_bn + ]: + self.assertFalse(isinstance(bn, FSDP)) @skip_if_lt_x_gpu(2) @parametrize( @@ -168,8 +260,8 @@ def forward(self, input): model = MyModel() wrapped_model = FSDP( model, - fsdp_auto_wrap_policy=functools.partial( - default_auto_wrap_policy, + auto_wrap_policy=functools.partial( + size_based_auto_wrap_policy, min_num_params=0, # wrap all modules ), cpu_offload=cpu_offload, @@ -216,6 +308,7 @@ def setUp(self) -> None: # For all the tests here, we use a fake group self.process_group = DummyProcessGroup(rank=0, size=1) + @unittest.skipIf(torch.cuda.device_count() < 2, "Requires at least 2 GPUs") @parametrize("wrap_method", [WrapMethod.FSDP_CTOR, WrapMethod.WRAP_API]) def test_wrap(self, wrap_method): if wrap_method == WrapMethod.WRAP_API: @@ -226,12 +319,13 @@ def test_wrap(self, wrap_method): layer = FSDP( nn.Linear(5, 5), process_group=self.process_group, - fsdp_auto_wrap_policy=functools.partial(default_auto_wrap_policy, min_num_params=1) + auto_wrap_policy=functools.partial(size_based_auto_wrap_policy, min_num_params=1) ) self.assertTrue(isinstance(layer, FSDP)) self.assertEqual(layer.rank, self.process_group.rank()) self.assertEqual(layer.world_size, self.process_group.size()) + @unittest.skipIf(torch.cuda.device_count() < 2, "Requires at least 2 GPUs") def test_wrap_disabled_outside_context(self): pg = self.process_group @@ -248,6 +342,7 @@ def __init__(self): self.assertFalse(isinstance(model.lin, FSDP)) self.assertTrue(isinstance(model.lin, nn.Linear)) + @unittest.skipIf(torch.cuda.device_count() < 2, "Requires at least 2 GPUs") def test_wrap_override_defaults(self): new_process_group = DummyProcessGroup(rank=0, size=2) with enable_wrap(wrapper_cls=FSDP, process_group=self.process_group): @@ -257,6 +352,35 @@ def test_wrap_override_defaults(self): self.assertEqual(layer.rank, 0) self.assertEqual(layer.world_size, 2) + @unittest.skipIf(not torch.cuda.is_available(), "Test Requires CUDA") + def test_always_wrap(self): + """ + Test to ensure that if `always_wrap_policy` is + passed into FSDP, all submodules are wrapped. + """ + seq = TestFSDPWrap.NestedSequentialModel.get_model(cuda=True) + model = FSDP(seq, process_group=self.process_group, auto_wrap_policy=always_wrap_policy) + TestFSDPWrap.NestedSequentialModel.verify_model_all_wrapped(self, model) + + @unittest.skipIf(torch.cuda.device_count() < 2, "Requires at least 2 GPUs") + def test_transformer_auto_wrap_policy(self): + model = TransformerWithSharedParams(group=self.process_group) + my_auto_wrap_policy = functools.partial( + transformer_auto_wrap_policy, + transformer_layer_cls={TransformerEncoderLayer, TransformerDecoderLayer} + ) + fsdp_model = FSDP( + model, + process_group=self.process_group, + auto_wrap_policy=my_auto_wrap_policy + ) + self.assertTrue(isinstance(fsdp_model, FSDP)) + for layer in fsdp_model.module.module.transformer.encoder.layers: + self.assertTrue(isinstance(layer, FSDP)) + for layer in fsdp_model.module.module.transformer.decoder.layers: + self.assertTrue(isinstance(layer, FSDP)) + + @unittest.skipIf(torch.cuda.device_count() < 2, "Requires at least 2 GPUs") def test_auto_wrap_api(self): """ Test to ensure with auto wrap, we wrap child modules correctly based on the min_num_params. 
@@ -264,37 +388,38 @@ def test_auto_wrap_api(self): """ sequential = TestFSDPWrap.NestedSequentialModel.get_model(cuda=False) my_auto_wrap_policy = functools.partial( - default_auto_wrap_policy, min_num_params=40 + size_based_auto_wrap_policy, min_num_params=40 ) model = FSDP( sequential, process_group=self.process_group, - fsdp_auto_wrap_policy=my_auto_wrap_policy + auto_wrap_policy=my_auto_wrap_policy ) TestFSDPWrap.NestedSequentialModel.verify_model(self, model) - + @unittest.skipIf(torch.cuda.device_count() < 2, "Requires at least 2 GPUs") def test_auto_wrap_preset_exclude_wrap(self): """ Test to ensure excluded modules are not wrapped, regardless if the total param size is greater than the - min_num_params. the default_auto_wrap_policy excludes wrapping for {nn.ModuleList, nn.ModuleDict} + min_num_params. the size_based_auto_wrap_policy excludes wrapping for {nn.ModuleList, nn.ModuleDict} """ sequential = nn.ModuleList([nn.Linear(5, 5), nn.Linear(5, 5)]) my_auto_wrap_policy = functools.partial( - default_auto_wrap_policy, min_num_params=40 + size_based_auto_wrap_policy, min_num_params=40 ) model = FSDP( sequential, process_group=self.process_group, - fsdp_auto_wrap_policy=my_auto_wrap_policy + auto_wrap_policy=my_auto_wrap_policy ) self.assertTrue(isinstance(model, FSDP)) self.assertTrue(isinstance(model[0], nn.Linear)) self.assertTrue(isinstance(model[1], nn.Linear)) + @unittest.skipIf(torch.cuda.device_count() < 2, "Requires at least 2 GPUs") def test_auto_wrap_preset_exclude_wrap_include_children(self): """ Test to ensure excluded modules are not wrapped, but children are if param size is greater than @@ -302,43 +427,45 @@ def test_auto_wrap_preset_exclude_wrap_include_children(self): """ sequential = nn.ModuleList([nn.Linear(10, 10)]) my_auto_wrap_policy = functools.partial( - default_auto_wrap_policy, min_num_params=40 + size_based_auto_wrap_policy, min_num_params=40 ) - model = FSDP(sequential, process_group=self.process_group, fsdp_auto_wrap_policy=my_auto_wrap_policy) + model = FSDP(sequential, process_group=self.process_group, auto_wrap_policy=my_auto_wrap_policy) self.assertTrue(isinstance(model, FSDP)) self.assertTrue(isinstance(model[0], FSDP)) + @unittest.skipIf(torch.cuda.device_count() < 2, "Requires at least 2 GPUs") def test_auto_wrap_preset_force_leaf(self): """ Test to ensure force-leaf modules are not wrapped, and children are not wrapped. The - default_auto_wrap_policy forces leaf modules of type {nn.MultiheadAttention} to not be wrapped + size_based_auto_wrap_policy forces leaf modules of type {nn.MultiheadAttention} to not be wrapped """ sequential = nn.Sequential(nn.Linear(10, 10), nn.MultiheadAttention(100, 1)) my_auto_wrap_policy = functools.partial( - default_auto_wrap_policy, min_num_params=40 + size_based_auto_wrap_policy, min_num_params=40 ) - model = FSDP(sequential, process_group=self.process_group, fsdp_auto_wrap_policy=my_auto_wrap_policy) + model = FSDP(sequential, process_group=self.process_group, auto_wrap_policy=my_auto_wrap_policy) self.assertTrue(isinstance(model.module[0], FSDP)) # Assert children of multihead attention are not wrapped self.assertTrue(isinstance(model.module[1], nn.MultiheadAttention)) self.assertTrue(isinstance(model.module[1].out_proj, nn.Linear)) + @unittest.skipIf(torch.cuda.device_count() < 2, "Requires at least 2 GPUs") def test_auto_wrap_preset_force_leaf_custom(self): """ Test to ensure force-leaf modules are not wrapped. 
""" my_auto_wrap_policy = functools.partial( - default_auto_wrap_policy, + size_based_auto_wrap_policy, min_num_params=40, - force_leaf_modules=default_auto_wrap_policy.FORCE_LEAF_MODULES.union( + force_leaf_modules=size_based_auto_wrap_policy.FORCE_LEAF_MODULES.union( {nn.Linear} ), ) sequential = nn.Sequential( nn.Linear(10, 10), nn.ModuleList([nn.Linear(10, 10)]) ) - model = FSDP(sequential, process_group=self.process_group, fsdp_auto_wrap_policy=my_auto_wrap_policy) + model = FSDP(sequential, process_group=self.process_group, auto_wrap_policy=my_auto_wrap_policy) # Model was wrapped in FSDP as no inner modules were wrapped. self.assertTrue(isinstance(model, FSDP)) self.assertTrue(isinstance(model.module[0], nn.Linear)) @@ -350,7 +477,8 @@ def test_auto_wrap_preset_force_leaf_custom(self): "cpu_offload", [CPUOffload(offload_params=False), CPUOffload(offload_params=True)] ) - def test_auto_wrap_smoke_test(self, fsdp_init_mode, cpu_offload): + @parametrize("use_device_id", [True, False]) + def test_auto_wrap_smoke_test(self, fsdp_init_mode, cpu_offload, use_device_id): # CPU offload and CUDA after don't work together as expected. if ( cpu_offload.offload_params and fsdp_init_mode == FSDPInitMode.CUDA_AFTER @@ -359,6 +487,9 @@ def test_auto_wrap_smoke_test(self, fsdp_init_mode, cpu_offload): device = torch.device("cuda") torch.cuda.set_device(0) + device_id = ( + torch.device("cuda", torch.cuda.current_device()) if use_device_id else None + ) # Random port in case the next test run quickly, same port would cause conflict. os.environ["MASTER_ADDR"] = "localhost" @@ -378,9 +509,11 @@ def test_auto_wrap_smoke_test(self, fsdp_init_mode, cpu_offload): try: sequential = TestFSDPWrap.NestedSequentialModel.get_model(cuda=(not cuda_after_init)) my_auto_wrap_policy = functools.partial( - default_auto_wrap_policy, min_num_params=40 + size_based_auto_wrap_policy, min_num_params=40 + ) + model = FSDP( + sequential, cpu_offload=cpu_offload, auto_wrap_policy=my_auto_wrap_policy, device_id=device_id ) - model = FSDP(sequential, cpu_offload=cpu_offload, fsdp_auto_wrap_policy=my_auto_wrap_policy) TestFSDPWrap.NestedSequentialModel.verify_model(self, model) if cuda_after_init: model = model.cuda() @@ -396,6 +529,62 @@ def test_auto_wrap_smoke_test(self, fsdp_init_mode, cpu_offload): except FileNotFoundError: pass + @unittest.skipIf(torch.cuda.device_count() < 2, "Requires at least 2 GPUs") + @parametrize("wrap_method", [WrapMethod.FSDP_CTOR, WrapMethod.WRAP_API]) + def test_always_wrap_with_ignored_modules(self, wrap_method: WrapMethod): + sequential = TestFSDPWrap.NestedSequentialModel.get_model(cuda=False) + ignored_modules = [sequential[1], sequential[2][0]] + fsdp_kwargs = { + "process_group": self.process_group, + "auto_wrap_policy": always_wrap_policy, + "ignored_modules": ignored_modules, + } + if wrap_method == WrapMethod.FSDP_CTOR: + model = FSDP(sequential, **fsdp_kwargs) + elif wrap_method == WrapMethod.WRAP_API: + with enable_wrap(wrapper_cls=FSDP, **fsdp_kwargs): + model = wrap(sequential) + else: + assert 0, f"Unsupported wrap method: {wrap_method}" + # All non-ignored modules should be wrapped with FSDP + self.assertTrue(isinstance(model, FSDP)) + self.assertTrue(isinstance(model.module[0], FSDP)) + self.assertTrue(isinstance(model.module[1], nn.Linear)) + self.assertTrue(isinstance(model.module[2], FSDP)) + self.assertTrue(isinstance(model.module[2].module[0], nn.Linear)) + self.assertTrue(isinstance(model.module[2].module[1], FSDP)) + + @unittest.skipIf(torch.cuda.device_count() < 2, 
"Requires at least 2 GPUs") + @parametrize("wrap_method", [WrapMethod.FSDP_CTOR, WrapMethod.WRAP_API]) + def test_auto_wrap_with_ignored_modules(self, wrap_method: WrapMethod): + sequential = TestFSDPWrap.NestedSequentialModel.get_model(cuda=False) + ignored_modules = [sequential[1], sequential[2][0]] + my_auto_wrap_policy = functools.partial( + size_based_auto_wrap_policy, min_num_params=40, + ) + fsdp_kwargs = { + "process_group": self.process_group, + "auto_wrap_policy": my_auto_wrap_policy, + "ignored_modules": ignored_modules, + } + if wrap_method == WrapMethod.FSDP_CTOR: + model = FSDP(sequential, **fsdp_kwargs) + elif wrap_method == WrapMethod.WRAP_API: + with enable_wrap(wrapper_cls=FSDP, **fsdp_kwargs): + model = wrap(sequential) + else: + assert 0, f"Unsupported wrap method: {wrap_method}" + # Since the 2nd linear (`sequential[1]`) is ignored, the wrapping + # policy does not exceed the parameter threshold before the inner + # sequential (`sequential[2]`) anymore; hence, it flattens + # `sequential[0]` and `sequential[2][0]` into `model` and leaves + # `sequential[1]` and `sequential[2][1]` as-is since they are ignored + self.assertTrue(isinstance(model, FSDP)) + self.assertTrue(isinstance(model.module[0], nn.Linear)) + self.assertTrue(isinstance(model.module[1], nn.Linear)) + self.assertTrue(isinstance(model.module[2], nn.Sequential)) + self.assertTrue(isinstance(model.module[2][0], nn.Linear)) + self.assertTrue(isinstance(model.module[2][1], nn.Linear)) instantiate_parametrized_tests(TestFSDPWrap) diff --git a/test/distributed/optim/test_zero_redundancy_optimizer.py b/test/distributed/optim/test_zero_redundancy_optimizer.py index de8ea511b636..ec7db75d49a1 100644 --- a/test/distributed/optim/test_zero_redundancy_optimizer.py +++ b/test/distributed/optim/test_zero_redundancy_optimizer.py @@ -6,19 +6,17 @@ # LICENSE file in the root directory of this source tree. 
import copy -import itertools import os import sys +import unittest from contextlib import suppress -from typing import Any, List, Type, cast +from typing import Any, List, cast import numpy as np import torch import torch.distributed as dist -import unittest - if not dist.is_available(): print("Distributed not available, skipping tests", file=sys.stderr) sys.exit(0) @@ -33,16 +31,17 @@ from torch.distributed.optim import ZeroRedundancyOptimizer from torch.distributed.optim.zero_redundancy_optimizer import _broadcast_object from torch.nn.parallel import DistributedDataParallel as DDP -from torch.optim import SGD -from torch.testing._internal import common_distributed, common_utils +from torch.optim import SGD, AdamW +from torch.testing._internal import common_distributed from torch.testing._internal.common_utils import ( + IS_WINDOWS, TEST_WITH_ASAN, TEST_WITH_DEV_DBG_ASAN, - sandcastle_skip_if, + instantiate_parametrized_tests, + parametrize, + run_tests, ) -from torch.testing._internal.common_utils import IS_WINDOWS - try: import torchvision HAS_TORCHVISION = True @@ -60,30 +59,19 @@ def _get_backend_for_tests(): BACKEND = _get_backend_for_tests() -DEVICE = "cuda" if torch.cuda.is_available() else "cpu" - - -def check_same_model_params(model_a: torch.nn.Module, model_b: torch.nn.Module, message: str = "") -> None: - for p_a, p_b in zip(model_a.parameters(), model_b.parameters()): - assert torch.allclose(p_a, p_b, atol=1e-3), f"Model parameters differ\n{p_a} {p_b}\n" + message - - for b_a, b_b in zip(model_a.buffers(), model_b.buffers()): - assert torch.allclose(b_a, b_b), f"Model buffers differ {b_a} - {b_b}\n" + message - - @unittest.skipIf( - TEST_WITH_ASAN or TEST_WITH_DEV_DBG_ASAN, "CUDA + ASAN doesnt work." + TEST_WITH_ASAN or TEST_WITH_DEV_DBG_ASAN, "CUDA + ASAN does not work." ) class TestZeroRedundancyOptimizer(common_distributed.MultiProcessTestCase): def setUp(self): super(TestZeroRedundancyOptimizer, self).setUp() os.environ["WORLD_SIZE"] = str(self.world_size) - self._spawn_processes() @property def device(self): - return torch.device(self.rank) if torch.cuda.is_available() else torch.device("cpu") + return torch.device("cuda") if torch.cuda.is_available() \ + else torch.device("cpu") @property def world_size(self): @@ -94,7 +82,6 @@ def tearDown(self): torch.distributed.destroy_process_group() except AssertionError: pass - try: os.remove(self.file_name) except OSError: @@ -104,75 +91,94 @@ def dist_init(self, rank, world_size=-1, backend=BACKEND): if (world_size < 1): world_size = self.world_size store = dist.FileStore(self.file_name, world_size) - return dist.init_process_group(backend=backend, store=store, rank=rank, world_size=world_size) + return dist.init_process_group( + backend=backend, store=store, rank=rank, world_size=world_size, + ) # TODO: sandcastle_skip_if does not work here. @unittest.skipIf( - TEST_WITH_ASAN or TEST_WITH_DEV_DBG_ASAN, "CUDA + ASAN doesnt work." + TEST_WITH_ASAN or TEST_WITH_DEV_DBG_ASAN, "CUDA + ASAN does not work." ) class TestZeroRedundancyOptimizerSingleRank(TestZeroRedundancyOptimizer): def test_state_dict(self): - """Check that the ZeroRedundancyOptimizer exposes the expected state dict interface, - irrespective of the sharding. 
- """ + """Check that ZeroRedundancyOptimizer exposes the expected state dict + interface, irrespective of the sharding.""" self.dist_init(self.rank) - x = torch.tensor([1.0], device=DEVICE, requires_grad=True) - o = ZeroRedundancyOptimizer([x], optimizer_class=SGD, lr=0.1, momentum=0.9) + LR1 = 0.1 + LR2 = 0.01 + MOMENTUM = 0.9 + RECIPIENT_RANK = 0 # rank 0 is the only rank since the world size is 1 + x = torch.tensor([1.0], device=self.device, requires_grad=True) + o = ZeroRedundancyOptimizer( + [x], optimizer_class=SGD, lr=LR1, momentum=MOMENTUM, + ) x.backward() o.step() - self.assertEqual(x, torch.tensor([0.9], device=DEVICE)) - self.assertEqual(o.optim.state[x]["momentum_buffer"], torch.tensor([1.0], device=DEVICE)) + self.assertEqual(x, torch.tensor([0.9], device=self.device)) + self.assertEqual( + o.optim.state[x]["momentum_buffer"], + torch.tensor([1.0], device=self.device), + ) o.zero_grad() - o.consolidate_state_dict() # Sync state dict in between replicas - even if there are none + o.consolidate_state_dict(to=RECIPIENT_RANK) state_dict = o.state_dict() - # Check that the state dict is pytorch-compliant key wise + # Check that the state dict has keys compliant with PyTorch self.assertIn("param_groups", state_dict.keys()) self.assertIn("state", state_dict.keys()) - # Check that the pulled state is what we expect, and that we have all the expected keys + # Check that the state has the expected keys self.assertEqual(state_dict["param_groups"][0]["lr"], 0.1) self.assertEqual(state_dict["param_groups"][0]["momentum"], 0.9) self.assertFalse(state_dict["param_groups"][0]["nesterov"]) self.assertEqual(state_dict["param_groups"][0]["weight_decay"], 0.0) self.assertEqual(state_dict["param_groups"][0]["dampening"], 0.0) - # Check that the pulled state and the .param_groups attribute are in sync - for k in state_dict["param_groups"][0].keys(): + # Check that the state and the `param_groups` attribute are in sync + for k in state_dict["param_groups"][0]: if k != "params": - self.assertEqual(state_dict["param_groups"][0][k], o.param_groups[0][k]) + self.assertEqual( + state_dict["param_groups"][0][k], + o.param_groups[0][k], + ) - # Check that it's correctly loaded - o = ZeroRedundancyOptimizer([x], optimizer_class=SGD, lr=0.01) + # Check that the state is reloaded with the correct values and device + o = ZeroRedundancyOptimizer([x], optimizer_class=SGD, lr=LR2) o.load_state_dict(state_dict) + self.assertEqual( + o.optim.state[x]["momentum_buffer"], + torch.tensor([1.0], device=self.device), + ) - # Check that state is correct and on proper device - self.assertEqual(o.optim.state[x]["momentum_buffer"], torch.tensor([1.0], device=DEVICE)) - - # We should now be using a lr of 0.1, both within the optimizer - # and as exposed by the .param_groups attribute - assert o.param_groups[0]["lr"] == 0.1 + # We should we using `LR1` and not `LR2` after reloading, both within + # the optimizer and as exposed by the `param_groups` attribute + self.assertEqual(o.param_groups[0]["lr"], LR1) x.backward() o.step() - self.assertEqual(x, torch.tensor([0.71], device=DEVICE)) - self.assertEqual(o.optim.state[x]["momentum_buffer"], torch.tensor([1.9], device=DEVICE)) + self.assertEqual(x, torch.tensor([0.71], device=self.device)) + self.assertEqual( + o.optim.state[x]["momentum_buffer"], + torch.tensor([1.9], device=self.device), + ) - # Check that the exposed param_groups are on the proper device + # Check that the exposed `param_groups`` are on the proper device 
self.assertEqual(o.param_groups[0]["params"][0].device, x.device) def test_lr_scheduler(self): - """ Check that a normal torch lr_scheduler is usable with ZeroRedundancyOptimizer""" - + """Check that a normal PyTorch ``lr_scheduler`` is usable with + ZeroRedundancyOptimizer.""" self.dist_init(self.rank) - x = torch.tensor([1.0], device=DEVICE, requires_grad=True) - x2 = torch.tensor([1.0], device=DEVICE, requires_grad=True) - o = ZeroRedundancyOptimizer([x], optimizer_class=SGD, lr=0.01) - o2 = torch.optim.SGD([x2], lr=0.01) + NUM_ITERS = 5 + LR = 0.01 + x = torch.tensor([1.0], device=self.device, requires_grad=True) + x2 = torch.tensor([1.0], device=self.device, requires_grad=True) + o = ZeroRedundancyOptimizer([x], optimizer_class=SGD, lr=LR) + o2 = torch.optim.SGD([x2], lr=LR) s = torch.optim.lr_scheduler.StepLR(o, 1) s2 = torch.optim.lr_scheduler.StepLR(o2, 1) - for _ in range(5): + for _ in range(NUM_ITERS): x.backward() o.zero_grad() o.step() @@ -184,8 +190,9 @@ def test_lr_scheduler(self): self.assertEqual(x, x2) def test_step_with_kwargs(self): - """ Check that the `step(**kwargs)` interface is properly exposed""" + """Check that the ``step(**kwargs)`` interface is properly exposed.""" self.dist_init(self.rank) + LR = 0.1 class SGDWithStepKWArg(torch.optim.SGD): def step(self, closure=None, kwarg=None): @@ -193,18 +200,21 @@ def step(self, closure=None, kwarg=None): kwarg.append(5) kwarg: List[Any] = [] - x = torch.tensor([1.0], device=DEVICE, requires_grad=True) - o = ZeroRedundancyOptimizer([x], optimizer_class=SGDWithStepKWArg, lr=0.1) + x = torch.tensor([1.0], device=self.device, requires_grad=True) + o = ZeroRedundancyOptimizer( + [x], optimizer_class=SGDWithStepKWArg, lr=LR, + ) x.backward() o.step(0, kwarg=kwarg) self.assertEqual(kwarg, [5]) - self.assertEqual(x, torch.tensor([0.9], device=DEVICE)) + self.assertEqual(x, torch.tensor([0.9], device=self.device)) def test_step_with_extra_inner_key(self): - """Check that an optimizer adding extra keys to the param_groups - is properly handled, in that the new key is exposed to the user - """ + """Check that ZeroRedundancyOptimizer wrapping an optimizer that adds + extra keys to ``param_groups`` exposes those keys through ZeRO's own + ``param_groups``.""" self.dist_init(self.rank) + LR = 0.1 class SGDWithNewKey(torch.optim.SGD): # Dummy optimizer which adds a new key to the param groups @@ -212,33 +222,38 @@ def step(self, closure=None): super().step() self.param_groups[0]["new_key"] = 0.1 - x = torch.tensor([1.0], device=DEVICE, requires_grad=True) - o = ZeroRedundancyOptimizer([x], optimizer_class=SGDWithNewKey, lr=0.1) + x = torch.tensor([1.0], device=self.device, requires_grad=True) + o = ZeroRedundancyOptimizer([x], optimizer_class=SGDWithNewKey, lr=LR) x.backward() o.step() self.assertEqual(o.param_groups[0]["new_key"], 0.1) - self.assertEqual(x, torch.tensor([0.9], device=DEVICE)) + self.assertEqual(x, torch.tensor([0.9], device=self.device)) def test_step_without_closure(self): - """Check that the step() method (without closure) is handlded as expected""" + """Check that the ``step()`` method (without closure) is handled as + expected.""" self.dist_init(self.rank) + LR = 0.1 class SGDWithoutClosure(torch.optim.SGD): def step(self): return super().step() - x = torch.tensor([1.0], device=DEVICE, requires_grad=True) - o = ZeroRedundancyOptimizer([x], optimizer_class=SGDWithoutClosure, lr=0.1) + x = torch.tensor([1.0], device=self.device, requires_grad=True) + o = ZeroRedundancyOptimizer( + [x], 
optimizer_class=SGDWithoutClosure, lr=LR, + ) x.backward() o.step() - self.assertEqual(x, torch.tensor([0.9], device=DEVICE)) + self.assertEqual(x, torch.tensor([0.9], device=self.device)) def test_zero_grad(self): - """Check that the zero_grad attribute is properly handled""" + """Check that the ``zero_grad`` method is properly handled.""" self.dist_init(self.rank) + LR = 0.01 x = torch.rand(1) m = torch.nn.Linear(1, 1) - o = ZeroRedundancyOptimizer(m.parameters(), optimizer_class=SGD, lr=0.1) + o = ZeroRedundancyOptimizer(m.parameters(), optimizer_class=SGD, lr=LR) y = m(x) y.backward(x) self.assertNotEqual(m.weight.grad, torch.zeros_like(m.weight)) @@ -249,27 +264,51 @@ def test_zero_grad(self): def test_constructor(self): """Check the robustness of the ZeroRedundancyOptimizer constructor by - passing different values for `params`""" + passing different values for the ``params`` argument.""" self.dist_init(self.rank) - - m = torch.nn.Linear(1, 1) - # (input, expected error) - inputs = [ - ([], ValueError), # empty parameter list - (torch.randn(1), TypeError), # non-iterable: `torch.Tensor` - (1.2, TypeError), # non-iterable: `float` - ([{"params": m.parameters()}], TypeError), # iterable of dict - (list(m.parameters()) + [42], TypeError), # iterable containing non-`torch.Tensor` - (m.parameters(), None), # `params` as a generator - (list(m.parameters()), None) # `params` as a list + LR = 0.01 + m = torch.nn.Sequential( + torch.nn.Linear(5, 10), + torch.nn.Linear(10, 10), + torch.nn.Linear(10, 10), + ) + # Test various constructor inputs in the form: (input, expected error) + ctor_inputs = [ + ([], ValueError), # empty parameter list + (torch.randn(1), TypeError), # non-iterable: `torch.Tensor` + (1.2, TypeError), # non-iterable: `float` + ([ + {"params": [l.weight for l in m]}, + {"params": [l.bias for l in m]}, + ], None), # iterable of dict + (list(m.parameters()) + [42], TypeError), # iterable containing invalid type + (m.parameters(), None), # `params` as a generator + (list(m.parameters()), None) # `params` as a list ] + for ctor_input, error in ctor_inputs: + context = self.assertRaises(error) if error else suppress() + with context: + ZeroRedundancyOptimizer( + ctor_input, optimizer_class=SGD, lr=LR, + ) - for input, error in inputs: - if (error): - with self.assertRaises(error): - ZeroRedundancyOptimizer(input, optimizer_class=SGD, lr=0.1) - else: - ZeroRedundancyOptimizer(input, optimizer_class=SGD, lr=0.1) + # Test constructing with multiple parameter groups more thoroughly + WD = 0.01 + BETAS = (0.9, 0.999) + EPS = 1e-8 + params = [ + {"params": [l.weight for l in m], "weight_decay": 0.}, + {"params": [l.bias for l in m], "weight_decay": WD}, + ] + o = ZeroRedundancyOptimizer( + params, optimizer_class=AdamW, + lr=LR, betas=BETAS, eps=EPS, + ) + assert len(o.param_groups) == 2, \ + f"Expected 2 ZeRO param groups, but got {len(o.param_groups)}" + assert len(o.optim.param_groups) == 2, \ + "Expected 2 local optimizer param groups, but got " \ + f"{len(o.optim.param_groups)}" def test_same_dense_param_type(self): """Check that ZeroRedundancyOptimizer raises an exception if the input @@ -279,7 +318,7 @@ def test_same_dense_param_type(self): and varying parameter types is added. 
""" self.dist_init(self.rank) - + LR = 0.01 inputs = [ [torch.sparse_coo_tensor(size=(2, 3))], [torch.FloatTensor(1), torch.DoubleTensor(1)], @@ -288,37 +327,63 @@ def test_same_dense_param_type(self): ] for input in inputs: with self.assertRaises(ValueError): - ZeroRedundancyOptimizer(input, optimizer_class=SGD, lr=0.1) + ZeroRedundancyOptimizer(input, optimizer_class=SGD, lr=LR) class TestZeroRedundancyOptimizerDistributed(TestZeroRedundancyOptimizer): + @property + def device(self): + return torch.device(self.rank) if torch.cuda.is_available() \ + else torch.device("cpu") + @property def world_size(self): return min(4, max(2, torch.cuda.device_count())) - @common_distributed.skip_if_rocm - def test_step(self): - """ Check that the ZeroRedundancyOptimizer wrapper properly exposes the `.step()` interface""" + @property + def context(self): + return suppress() if not torch.cuda.is_available() \ + else torch.cuda.device(self.rank) - if self.rank >= self.world_size or (torch.cuda.is_available() and torch.cuda.device_count() < 2): - return + def _check_same_model_params( + self, + model_a: torch.nn.Module, + model_b: torch.nn.Module, + message: str = "", + ) -> None: + # Check that model parameters match + for p_a, p_b in zip(model_a.parameters(), model_b.parameters()): + torch.testing.assert_close( + p_a, p_b, atol=1e-3, rtol=1e-5, + msg=f"Model parameters differ:\n{p_a} {p_b}\n" + message, + ) + # Check that model buffers match + for b_a, b_b in zip(model_a.buffers(), model_b.buffers()): + torch.testing.assert_close( + b_a, b_b, + msg=f"Model buffers differ:\n{b_a} {b_b}\n" + message, + ) + @common_distributed.skip_if_no_gpu + @common_distributed.skip_if_rocm + def test_step(self): + """Check that ZeroRedundancyOptimizer properly exposes the ``step()`` + interface.""" self.dist_init(self.rank, world_size=self.world_size) + LR = 0.01 - context = suppress() if not torch.cuda.is_available() else torch.cuda.device(self.rank) - - with context: + with self.context: x = torch.tensor([float(self.rank + 1)], device=self.device) m = torch.nn.Linear(1, 1) m.weight.data = torch.tensor([[1.0]]) m.bias.data = torch.tensor([2.0]) - m_zero = copy.deepcopy(m) - m.to(self.device) - m_zero.to(self.device) + m = m.to(self.device) + m_zero = copy.deepcopy(m).to(self.device) - lr = 0.1 - o = SGD(m.parameters(), lr=lr) - o_zero = ZeroRedundancyOptimizer(m_zero.parameters(), optimizer_class=SGD, lr=lr) + o = SGD(m.parameters(), lr=LR) + o_zero = ZeroRedundancyOptimizer( + m_zero.parameters(), optimizer_class=SGD, lr=LR, + ) y = m(x) y.backward(x) @@ -337,24 +402,23 @@ def test_step(self): self.assertEqual(m.weight, m_zero.weight) self.assertEqual(m.bias, m_zero.bias) + @common_distributed.skip_if_no_gpu @common_distributed.skip_if_rocm def test_step_with_closure(self): - """ Check that the ZeroRedundancyOptimizer wrapper properly exposes the `.step(closure)` interface""" - - if self.rank >= self.world_size or (torch.cuda.is_available() and torch.cuda.device_count() < 2): - return - + """Check that ZeroRedundancyOptimizer properly exposes the + ``step(closure)`` interface.""" self.dist_init(self.rank, world_size=self.world_size) - context = suppress() if not torch.cuda.is_available() else torch.cuda.device(self.rank) - - with context: + with self.context: for bucket_view in [False, True]: x_val = self.rank + 1 weight = 1.0 bias = 2.0 error = 1.0 - target = torch.tensor([x_val * weight + bias + error], device=self.device) + target = torch.tensor( + [x_val * weight + bias + error], + device=self.device, + ) loss_fn = 
torch.nn.L1Loss() x = torch.tensor([float(x_val)], device=self.device) @@ -389,32 +453,62 @@ def closure(): self.assertEqual(m.weight, torch.tensor([[1.1]])) self.assertEqual(m.bias, torch.tensor([2.1])) + @common_distributed.skip_if_no_gpu + def test_lr_scheduler(self): + """Check that a normal PyTorch ``lr_scheduler`` is usable with + ZeroRedundancyOptimizer.""" + self.dist_init(self.rank) + x = torch.tensor([1.0], device=self.device, requires_grad=True) + x2 = torch.tensor([1.0], device=self.device, requires_grad=True) + o = ZeroRedundancyOptimizer([x], optimizer_class=SGD, lr=0.01) + o2 = torch.optim.SGD([x2], lr=0.01) + s = torch.optim.lr_scheduler.StepLR(o, 1) + s2 = torch.optim.lr_scheduler.StepLR(o2, 1) + for _ in range(5): + x.backward() + o.zero_grad() + o.step() + s.step() + x2.backward() + o2.zero_grad() + o2.step() + s2.step() + self.assertEqual(x, x2) + def test_sharding(self): - """ Check the sharding at construction time + """ + Check ZeroRedundancyOptimizer's parameter sharding at construction + time. NOTE: The correctness of this test depends on the ZeRO implementation using the sorted-greedy partitioning algorithm. For details, see - `ZeroRedundancyOptimizer._partition_parameters()` in - `zero_redundancy_optimizer.py`. + ``ZeroRedundancyOptimizer._partition_parameters()`` in + zero_redundancy_optimizer.py. """ self.dist_init(self.rank) + LR = 0.01 sizes = [9, 7, 5, 3] params = [] for size in sizes * self.world_size: params.append(torch.rand(size, 1)) - o = ZeroRedundancyOptimizer(params, optimizer_class=SGD, lr=0.1) - self.assertEqual(sum([x.numel() for x in o.optim.param_groups[0]["params"]]), sum(sizes)) + o = ZeroRedundancyOptimizer(params, optimizer_class=SGD, lr=LR) + self.assertEqual( + sum([x.numel() for x in o.optim.param_groups[0]["params"]]), + sum(sizes), + ) def test_add_param_group(self): - """Check that ZeroRedundancyOptimizer properly handles adding a new param_group a posteriori, - and that all ranks get a shard + """Check that ZeroRedundancyOptimizer properly handles adding a new + parameter group a posteriori and that all ranks get a shard of the + contained parameters. NOTE: The correctness of this test depends on the ZeRO implementation using the sorted-greedy partitioning algorithm. For details, see - `ZeroRedundancyOptimizer._partition_parameters()` in - `zero_redundancy_optimizer.py`. + ``ZeroRedundancyOptimizer._partition_parameters()`` in + zero_redundancy_optimizer.py. """ self.dist_init(self.rank) + LR = 0.01 # Test with all parameters trainable to begin with def all_trainable(): @@ -424,19 +518,26 @@ def all_trainable(): for size in sizes_world[:-1]: params.append(torch.rand(size, 1)) - # Make sure that the params are trainable, enforces size-based partitioning + # Make sure that the params are trainable so that they are factored + # into the size-based parameter partitioning for p in params: p.requires_grad = True - o = ZeroRedundancyOptimizer(params, optimizer_class=SGD, lr=0.1) - - assert len(o.param_groups) == 1 + o = ZeroRedundancyOptimizer(params, optimizer_class=SGD, lr=LR) + self.assertEqual(len(o.param_groups), 1) o.add_param_group({"params": [torch.rand(3, 1)]}) - - assert len(o.param_groups) == 2 - # Verify that added group is added to the correct partition making all have the same elements. 
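As a quick illustration of the sharding invariants that `test_sharding` and `test_add_param_group` check, here is a single-rank sketch (port and tensor sizes are placeholders). With only one rank the greedy partitioning is not visible, but the `o.optim` local view and `add_param_group` behave as in the tests.

import os

import torch
import torch.distributed as dist
from torch.distributed.optim import ZeroRedundancyOptimizer
from torch.optim import SGD

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29501")
dist.init_process_group("gloo", rank=0, world_size=1)

sizes = [9, 7, 5, 3]
params = [torch.rand(size, 1, requires_grad=True) for size in sizes]
o = ZeroRedundancyOptimizer(params, optimizer_class=SGD, lr=0.01)

# `o.param_groups` is the full (unsharded) view; `o.optim` is the wrapped
# local optimizer holding only this rank's shard. With world_size == 1 the
# shard covers everything, so the two element counts match.
full = sum(p.numel() for g in o.param_groups for p in g["params"])
local = sum(p.numel() for g in o.optim.param_groups for p in g["params"])
assert full == local == sum(sizes)

# Groups added after construction are partitioned across ranks as well.
o.add_param_group({"params": [torch.rand(3, 1, requires_grad=True)]})
assert len(o.param_groups) == 2 and len(o.optim.param_groups) == 2

dist.destroy_process_group()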
- assert sum([x.numel() for g in o.optim.param_groups for x in g["params"]]) == sum(sizes) - assert len(o.optim.param_groups) == 2 + # Verify that new group is added to the correct partition, making + # all partitions have the same elements + self.assertEqual(len(o.param_groups), 2) + self.assertEqual( + sum([ + x.numel() + for g in o.optim.param_groups + for x in g["params"] + ]), + sum(sizes), + ) + self.assertEqual(len(o.optim.param_groups), 2) # Test a pathological config with a first big non-trainable param def some_trainable(): @@ -444,40 +545,108 @@ def some_trainable(): for size in [100, 3, 5, 2, 6, 4]: params.append(torch.rand(size, 1)) - # Make sure that the params are trainable, enforces size-based partitioning + # Make sure that all but the first param are trainable so that they + # are factored into the size-based parameter partitioning for p in params[1:]: p.requires_grad = True - o = ZeroRedundancyOptimizer(params, optimizer_class=SGD, lr=0.1) - - assert len(o.param_groups) == 1 + o = ZeroRedundancyOptimizer(params, optimizer_class=SGD, lr=LR) + self.assertEqual(len(o.param_groups), 1) o.add_param_group({"params": [torch.rand(3, 1)]}) - - assert len(o.param_groups) == 2 - assert len(o.optim.param_groups) == 2 + self.assertEqual(len(o.param_groups), 2) + self.assertEqual(len(o.optim.param_groups), 2) all_trainable() some_trainable() - @common_distributed.skip_if_lt_x_gpu(2) - def test_collect_shards(self): - """ Check the state consolidation mechanism, and the state dict exposed by ZeroRedundancyOptimizer""" + @common_distributed.skip_if_no_gpu + def test_multiple_param_groups(self): + """ + Check parity between constructing ZeRO with multiple parameter groups + upfront versus adding parameter groups to ZeRO after construction + versus a non-sharded optimizer. 
+ """ self.dist_init(self.rank) - RECIPIENT_RANK = 0 - - # Run a dummy step so that the optimizer state dict exists - batch, input_width, hidden, target_width = 3, 20, 10, 5 - target = torch.rand((batch, target_width), device=self.device) - inputs = torch.rand((batch, input_width), device=self.device) - - model = torch.nn.Sequential(torch.nn.Linear(input_width, hidden), torch.nn.Linear(hidden, target_width)) - model.to(self.device) + BATCH_SIZE, NUM_ITERS = 8, 3 + INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM = 5, 10, 5 + WD, LR = 0.01, 0.01 + model1 = torch.nn.Sequential( + torch.nn.Linear(INPUT_DIM, HIDDEN_DIM), + torch.nn.Linear(HIDDEN_DIM, HIDDEN_DIM), + torch.nn.Linear(HIDDEN_DIM, OUTPUT_DIM), + ) + model2 = copy.deepcopy(model1) + model3 = copy.deepcopy(model1) + model1 = model1.to(self.device) + model2 = model2.to(self.device) + model3 = model3.to(self.device) + inputs = [ + torch.randn(BATCH_SIZE, INPUT_DIM).to(self.device) + for _ in range(NUM_ITERS) + ] + # Construct `optim1` with both parameter groups upfront + optim1 = ZeroRedundancyOptimizer( + [ + {"params": [l.weight for l in model1], "weight_decay": 0.}, + {"params": [l.bias for l in model1], "weight_decay": WD}, + ], + optimizer_class=AdamW, lr=LR, + ) + # Construct `optim2` by adding the second parameter after + optim2 = ZeroRedundancyOptimizer( + [l.weight for l in model2], + optimizer_class=AdamW, lr=LR, weight_decay=0., + ) + optim2.add_param_group( + {"params": [l.bias for l in model2], "weight_decay": WD} + ) + # Construct `optim3` as a non-sharded optimizer + optim3 = AdamW( + [ + {"params": [l.weight for l in model3], "weight_decay": 0.}, + {"params": [l.bias for l in model3], "weight_decay": WD}, + ], lr=LR, + ) + # Check parity over a few iterations + for input in inputs: + for model, optim in ( + (model1, optim1), (model2, optim2), (model3, optim3), + ): + optim.zero_grad() + out = model(input) + loss = out.sum() + loss.backward() + optim.step() + for layer1, layer2, layer3 in zip(model1, model2, model3): + torch.testing.assert_close(layer1.weight, layer2.weight) + torch.testing.assert_close(layer1.weight, layer3.weight) + torch.testing.assert_close(layer1.bias, layer2.bias) + torch.testing.assert_close(layer1.bias, layer3.bias) + @common_distributed.skip_if_no_gpu + @common_distributed.skip_if_rocm + def test_collect_shards(self): + """Check the state consolidation mechanism and the state dict exposed + by ZeroRedundancyOptimizer.""" + self.dist_init(self.rank) + LR = 1e-3 + MOMENTUM = 0.99 + BATCH_SIZE, INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM = 3, 20, 10, 5 + REFERENCE_RANK = 0 + target = torch.rand((BATCH_SIZE, OUTPUT_DIM), device=self.device) + inputs = torch.rand((BATCH_SIZE, INPUT_DIM), device=self.device) + model = torch.nn.Sequential( + torch.nn.Linear(INPUT_DIM, HIDDEN_DIM), + torch.nn.Linear(HIDDEN_DIM, OUTPUT_DIM), + ).to(self.device) loss_fn = torch.nn.L1Loss() loss_fn.to(self.device) - - # With SGD, Momentum is required to get a state to shard - optimizer = ZeroRedundancyOptimizer(model.parameters(), optimizer_class=SGD, lr=0.1, momentum=0.99) + optimizer = ZeroRedundancyOptimizer( + model.parameters(), + optimizer_class=SGD, + lr=LR, + momentum=MOMENTUM, # ensure there exists state to shard + ) def closure(): optimizer.zero_grad() @@ -486,56 +655,78 @@ def closure(): loss.backward() return loss + # Run a dummy step so that the optimizer state dict exists _ = optimizer.step(closure=closure) - # Update the optimizer state on the reference rank - optimizer.consolidate_state_dict(to=RECIPIENT_RANK) - - # Fetch the state 
on the reference rank - # - check that it has the correct size - # - load it again - if self.rank == RECIPIENT_RANK: + # Get the optimizer state on the reference rank + optimizer.consolidate_state_dict(to=REFERENCE_RANK) + if self.rank == REFERENCE_RANK: + # Check that the state has the correct size optimizer_state_dict = optimizer.state_dict() - self.assertEqual(len(optimizer_state_dict["state"]), len(list(model.parameters()))) + self.assertEqual( + len(optimizer_state_dict["state"]), + len(list(model.parameters())), + ) else: optimizer_state_dict = {} + # Load the optimizer state on all ranks without any exceptions optimizer_state_dict = _broadcast_object( optimizer_state_dict, - src_rank=RECIPIENT_RANK, + src_rank=REFERENCE_RANK, group=dist.group.WORLD, device=self.device, ) - - # Load the optimizer state dict, check that no exception is raised optimizer.load_state_dict(optimizer_state_dict) - @sandcastle_skip_if( - IS_WINDOWS, - "Test is flaky on windows: https://github.com/pytorch/pytorch/issues/66059" - ) - def test_multiple_groups(self): - """ Check that the ZeroRedundancyOptimizer handles working with multiple process groups""" - self.dist_init(self.rank, self.world_size, dist.Backend.GLOO) - - # Only work with the even ranks, to check that the global_rank indexing is properly used - sub_group_ranks = list(filter(lambda x: x % 2 == 0, range(self.world_size))) - process_group = torch.distributed.new_group(ranks=sub_group_ranks, backend="gloo") + def test_nondefault_process_group(self): + """Check that ZeroRedundancyOptimizer works with a non-default process + group consisting only of even ranks.""" + # Skip the test if below the minimum world size since then the test is + # trivial + MIN_WORLD_SIZE = 4 + if self.world_size < MIN_WORLD_SIZE: + common_distributed.logger.info( + "Skipping `test_nondefault_process_group()` since world size " + f"of {self.world_size} is less than {MIN_WORLD_SIZE}" + ) + return + BACKEND = dist.Backend.GLOO + self.dist_init(self.rank, self.world_size, BACKEND) + # Use GPU if enough are available, or fall back to CPU otherwise, which + # is fine since Gloo backend supports both + if torch.cuda.is_available() and \ + torch.cuda.device_count() >= self.world_size: + device = torch.device(self.rank) + else: + device = torch.device("cpu") + # Create a new process group consisting of the even ranks to exercise + # the case where the global and local ranks do not necessarily match + subgroup_ranks = [r for r in range(self.world_size) if r % 2 == 0] + process_group = dist.new_group( + ranks=subgroup_ranks, backend=BACKEND, + ) + # Ranks not participating in the new process group are no longer needed + if self.rank not in subgroup_ranks: + return - # Make sure that all the ranks get different training data - # So that the sync check in between their models is meaningful + # Set different seeds across ranks so that each rank gets different + # training data and hence the model sync check is meaningful torch.manual_seed(self.rank) np.random.seed(self.rank) - # Standard deep learning setup - epochs, batch, input_width, hidden, target_width = 5, 3, 20, 10, 5 - loss_fn = torch.nn.L1Loss().to(self.device) + EPOCHS, BATCH_SIZE, INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM = 5, 3, 20, 10, 5 + LR = 1e-3 + MOMENTUM = 0.99 + REFERENCE_RANK = 0 + assert REFERENCE_RANK in subgroup_ranks, \ + "Reference rank must be in the new process group" + loss_fn = torch.nn.L1Loss().to(device) def check(optimizer): - # Just run a couple of epochs, check that the model is properly updated - for _ in 
range(epochs): - target = torch.rand((batch, target_width), device=self.device) - inputs = torch.rand((batch, input_width), device=self.device) + for _ in range(EPOCHS): + target = torch.rand((BATCH_SIZE, OUTPUT_DIM), device=device) + inputs = torch.rand((BATCH_SIZE, INPUT_DIM), device=device) def closure(): optimizer.zero_grad() @@ -543,167 +734,189 @@ def closure(): loss = loss_fn(output, target) loss /= self.world_size loss.backward() - dist.all_reduce(loss, group=process_group) # Not strictly needed for the test below - + dist.all_reduce(loss, group=process_group) return loss _ = optimizer.step(closure=closure) - # Check that all the params are the same on all ranks + # Check that the parameters match across ranks after a step for pg in optimizer.param_groups: for p in pg["params"]: - receptacle = [p.clone() for _ in sub_group_ranks] if self.rank == 0 else [] - dist.gather(p, receptacle, dst=0, group=process_group) - if self.rank == 0: - for sync_p in receptacle[1:]: - assert torch.all(torch.eq(receptacle[0], sync_p)), "Models differ in between ranks" - - if self.rank in sub_group_ranks: - # Model fitting in the broadcast bucket - model = torch.nn.Sequential( - torch.nn.Linear(input_width, hidden), - torch.nn.Linear(hidden, target_width), - ).to(self.device) + receptacle = [ + p.clone() for _ in subgroup_ranks + ] if self.rank == REFERENCE_RANK else [] + dist.gather( + p, receptacle, dst=REFERENCE_RANK, + group=process_group, + ) + if self.rank == REFERENCE_RANK: + reference_param = receptacle[0] + for param in receptacle[1:]: + torch.testing.assert_close( + reference_param, + param, + msg="Models differ between ranks", + ) - # With SGD, Momentum is required to get a state to shard - optimizer = ZeroRedundancyOptimizer( - model.parameters(), optimizer_class=SGD, lr=0.1, momentum=0.99, process_group=process_group - ) - check(optimizer) + model = torch.nn.Sequential( + torch.nn.Linear(INPUT_DIM, HIDDEN_DIM), + torch.nn.Linear(HIDDEN_DIM, OUTPUT_DIM), + ).to(device) + optimizer = ZeroRedundancyOptimizer( + model.parameters(), + optimizer_class=SGD, + lr=LR, + momentum=MOMENTUM, # ensure there exists state to shard + process_group=process_group, + ) + check(optimizer) - # Model not-fitting in the broadcast bucket + @common_distributed.skip_if_no_gpu + @parametrize( + "optimizer_class_str", + ["Adam", "AdamW", "SGD"], + # Use string to appease the internal test name parser + ) + @parametrize( + "maximize", + [False, True], + ) + def test_local_optimizer_parity( + self, + optimizer_class_str: str, + maximize: bool, + ): + """When combined with DDP, check that a local optimizer gives the same + results as wrapping that optimizer with ZeroRedundancyOptimizer.""" + self.dist_init(self.rank) + BATCHES = 20 + BATCH_SIZE = 64 + LR = 1e-3 + INPUT_DIM = 2 + HIDDEN_DIM = 3 + OUTPUT_DIM = 3 + torch.manual_seed(self.rank) + np.random.seed(self.rank) + if optimizer_class_str == "Adam": + optimizer_class = torch.optim.Adam + elif optimizer_class_str == "AdamW": + optimizer_class = torch.optim.AdamW + elif optimizer_class_str == "SGD": + optimizer_class = torch.optim.SGD + else: + assert 0, f"Unsupported optimizer class: {optimizer_class_str}" + + with self.context: + # Define a base model with a different buffer for each rank model = torch.nn.Sequential( - torch.nn.Linear(input_width, hidden), - torch.nn.Linear(hidden, target_width), + torch.nn.Linear(INPUT_DIM, HIDDEN_DIM), + torch.nn.Linear(HIDDEN_DIM, HIDDEN_DIM), + torch.nn.Linear(HIDDEN_DIM, OUTPUT_DIM), ).to(self.device) - - # With SGD, Momentum is 
required to get a state to shard - optimizer = ZeroRedundancyOptimizer( - model.parameters(), - optimizer_class=SGD, - lr=0.1, - momentum=0.99, - process_group=process_group, + model.register_buffer( + "test_buffer", torch.ones((1), device=self.device) * self.rank, + ) + # Define models/optimizers for DDP with ZeRO and DDP with local + # optimizer + defaults = {"maximize": True} if maximize else {} + sharded_optimizer = ZeroRedundancyOptimizer( + params=model.parameters(), optimizer_class=optimizer_class, + lr=LR, **defaults, + ) + sharded_ddp_model = DDP( + module=model, device_ids=[self.rank], + broadcast_buffers=True, find_unused_parameters=True, + ) + local_model = copy.deepcopy(model).to(self.device) + ddp_optimizer = optimizer_class( + local_model.parameters(), lr=LR, **defaults, + ) + ddp_model = DDP( + local_model, device_ids=[self.rank], + broadcast_buffers=True, find_unused_parameters=True, + ) + # Check that the model is properly synchronized between ranks + # at construction time + self._check_same_model_params( + sharded_ddp_model, ddp_model, + "Models differ from the start", ) - check(optimizer) - - @common_distributed.skip_if_no_gpu - def test_local_optimizer_parity(self): - """When combined with DDP, check that ZeroRedundancyOptimizer(optimizer) and the same monolithic optimizer - give the exact same results - """ - self.dist_init(self.rank) - BATCHS = 20 - - with torch.cuda.device(self.rank): - torch.manual_seed(self.rank) - np.random.seed(self.rank) - - def check_optimizer_equivalence(optimizer: Type[torch.optim.Optimizer], maximize: bool = False): - # Any model works. Add one different buffer per rank - model = torch.nn.Sequential( - torch.nn.Linear(2, 3), - torch.nn.Linear(3, 3), - torch.nn.Linear(3, 3), - ) - model.register_buffer("test_buffer", torch.ones((1)) * self.rank) - model.to(self.device) + def check_step(): + input_tensor = torch.rand((BATCH_SIZE, INPUT_DIM)) - defaults = dict() + def closure_ddp(input_tensor=input_tensor): + ddp_optimizer.zero_grad() + ddp_loss = ddp_model(input_tensor).abs().sum() + ddp_loss.backward() + return ddp_loss - if maximize: - defaults['maximize'] = True + def closure_sharded(input_tensor=input_tensor): + sharded_optimizer.zero_grad() + sharded_loss = sharded_ddp_model(input_tensor).abs().sum() + sharded_loss.backward() + return sharded_loss - sharded_optimizer = ZeroRedundancyOptimizer( - params=model.parameters(), optimizer_class=optimizer, lr=1e-3, **defaults + loss_ddp = cast( + torch.Tensor, ddp_optimizer.step(closure=closure_ddp), ) - sharded_ddp_model = DDP( - module=model, device_ids=[self.rank], broadcast_buffers=True, find_unused_parameters=True + loss_sharded_optim = cast( + torch.Tensor, + sharded_optimizer.step(closure=closure_sharded), ) - - ddp_model_single = copy.deepcopy(model) - ddp_model_single.to(self.device) - - ddp_optimizer = optimizer(ddp_model_single.parameters(), lr=1e-3, **defaults) - ddp_model = DDP( - ddp_model_single, device_ids=[self.rank], broadcast_buffers=True, find_unused_parameters=True + torch.testing.assert_close( + loss_ddp, loss_sharded_optim, + msg="Losses differ between local optimizer and ZeRO", + ) + self._check_same_model_params( + sharded_ddp_model, ddp_model, + "Models differ after a step", ) - # The model should be synchronized in between the ranks at construction time, check that - check_same_model_params(sharded_ddp_model, ddp_model, "Models differ from the start") - - def check_step(): - input_tensor = torch.rand((64, 2)) - - def closure_ddp(input_tensor=input_tensor): - 
ddp_optimizer.zero_grad() - ddp_loss = ddp_model(input_tensor).abs().sum() - ddp_loss.backward() - return ddp_loss - - def closure_sharded(input_tensor=input_tensor): - sharded_optimizer.zero_grad() - sharded_loss = sharded_ddp_model(input_tensor).abs().sum() - sharded_loss.backward() - return sharded_loss - - loss_ddp = cast(torch.Tensor, ddp_optimizer.step(closure=closure_ddp)) - loss_sharded_optim = cast(torch.Tensor, sharded_optimizer.step(closure=closure_sharded)) - - assert torch.allclose( - loss_ddp, loss_sharded_optim - ), "Losses differ in between Pytorch optim and ZeroRedundancyOptimizer" - - check_same_model_params(sharded_ddp_model, ddp_model, "Models differ after a step") - - # The models should stay the same in between the ranks - for i in range(BATCHS): - check_step() - - # Change the models trainability, check that parity is maintained - # only check after a couple of constant batchs to go through both regimes - if i > BATCHS // 2: - next(ddp_model.parameters()).requires_grad = bool(i % 2) - next(sharded_ddp_model.parameters()).requires_grad = bool(i % 2) - - # Check that the checkpoints are compatible - reference_rank = 0 - # - get states - ddp_state_dict = ddp_optimizer.state_dict() - sharded_optimizer.consolidate_state_dict(to=reference_rank) - sharded_optim_state_dict = [sharded_optimizer.state_dict() if self.rank == reference_rank else {}] - dist.broadcast_object_list(sharded_optim_state_dict, src=reference_rank, group=dist.group.WORLD) - sharded_optim_state_dict = sharded_optim_state_dict[0] - - # - cross load the states - # run one step and check that the models are still the same - ddp_state_dict_ref = copy.deepcopy(ddp_state_dict) # OSS will remove some states - ddp_optimizer.load_state_dict(sharded_optim_state_dict) # mixup on purpose ! 
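For reference, the consolidation pattern that `test_collect_shards` and the checkpoint cross-loading above rely on, written as a small helper. It assumes the default process group is already initialized on every rank (for example via torchrun); the model shape, learning rate, and momentum are placeholder values.

import torch
import torch.distributed as dist
from torch.distributed.optim import ZeroRedundancyOptimizer
from torch.optim import SGD

def checkpoint_zero_state(reference_rank: int = 0) -> dict:
    rank = dist.get_rank()
    model = torch.nn.Linear(20, 5)
    opt = ZeroRedundancyOptimizer(
        model.parameters(), optimizer_class=SGD, lr=1e-3, momentum=0.99,
    )
    # Take one step so there is momentum state to shard and consolidate.
    model(torch.randn(3, 20)).sum().backward()
    opt.step()
    # Each rank only holds its own shard, so the full optimizer state must be
    # gathered onto one rank before it can be saved or broadcast.
    opt.consolidate_state_dict(to=reference_rank)
    state = [opt.state_dict() if rank == reference_rank else None]
    dist.broadcast_object_list(state, src=reference_rank)
    # Every rank can now load the same consolidated state dict.
    opt.load_state_dict(state[0])
    return state[0]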
- sharded_optimizer.load_state_dict(ddp_state_dict) - check_step() - - # - self load, rewind, check no problem - # run one step and check that the models are still the same - ddp_optimizer.load_state_dict(ddp_state_dict_ref) - sharded_optimizer.load_state_dict(sharded_optim_state_dict) + # Check that parity is maintained + for i in range(BATCHES): check_step() + # For the second half of batches, change the parameter + # trainability to further test parity + if i > BATCHES // 2: + next(ddp_model.parameters()).requires_grad = bool(i % 2) + next(sharded_ddp_model.parameters()).requires_grad = bool(i % 2) + + # Check that the `state_dict` checkpoints are compatible between + # the local optimizer and ZeRO + REFERENCE_RANK = 0 + # - Get states + ddp_state_dict = ddp_optimizer.state_dict() + sharded_optimizer.consolidate_state_dict(to=REFERENCE_RANK) + sharded_optim_state_dict = [ + sharded_optimizer.state_dict() + if self.rank == REFERENCE_RANK else {} + ] + dist.broadcast_object_list( + sharded_optim_state_dict, src=REFERENCE_RANK, + group=dist.group.WORLD, + ) + sharded_optim_state_dict = sharded_optim_state_dict[0] - for opt in [torch.optim.Adam, torch.optim.AdamW, torch.optim.SGD]: - for maximize in (True, False): - check_optimizer_equivalence(opt, maximize=maximize) + # - Cross-load the states + # Run one step and check that the models are still the same + ddp_state_dict_ref = copy.deepcopy(ddp_state_dict) + ddp_optimizer.load_state_dict(sharded_optim_state_dict) + sharded_optimizer.load_state_dict(ddp_state_dict) + check_step() + # - Reload their respective states + # Run one step and check that the models are still the same + ddp_optimizer.load_state_dict(ddp_state_dict_ref) + sharded_optimizer.load_state_dict(sharded_optim_state_dict) + check_step() def _test_zero_join(self, device): - r""" - Check that the ZeRO join hook allows training with uneven inputs when using the given device. - - Arguments: - device (torch.device): device used to store parameters and perform - collective communications. 
- """ + """Check that the ZeRO join hook allows training with uneven inputs + when using the given device.""" NUM_INPUTS = 3 NUM_EPOCHS = 2 + LR = 0.01 torch.manual_seed(0) torch.cuda.manual_seed(0) @@ -712,8 +925,6 @@ def _test_zero_join(self, device): is_gpu = device.type == "cuda" backend = _get_backend_for_tests() if is_gpu else dist.Backend.GLOO self.dist_init(rank, world_size, backend) - if is_gpu: - torch.cuda.set_device(self.device) model = torch.nn.Sequential( torch.nn.Linear(2, 3), @@ -726,14 +937,18 @@ def _test_zero_join(self, device): # local optimizers on uneven inputs should be equivalent to ZeRO on # uneven inputs with gradients being manually set ddp_model = DDP(model, device_ids=[rank]) if is_gpu else DDP(model) - local_optim = torch.optim.Adam(ddp_model.parameters(), lr=0.01) + local_optim = torch.optim.Adam(ddp_model.parameters(), lr=LR) zero_model = copy.deepcopy(model) zero_model.to(device) - zero_optim = ZeroRedundancyOptimizer(zero_model.parameters(), torch.optim.Adam, lr=0.01) + zero_optim = ZeroRedundancyOptimizer( + zero_model.parameters(), torch.optim.Adam, lr=LR, + ) loss_fn = torch.nn.MSELoss() # Use uneven inputs: rank i has i extra inputs - inputs = [torch.randn(20, 2).to(device) for _ in range(NUM_INPUTS + rank)] + inputs = [ + torch.randn(20, 2).to(device) for _ in range(NUM_INPUTS + rank) + ] labels = torch.randn(20, 3).to(device) # Save the gradients and parameters from DDP as the ground truth; do @@ -760,7 +975,10 @@ def _test_zero_join(self, device): # Broadcast the saved gradients and parameters to all of the other # ranks (which joined early) grads_and_params = [grads_at_each_iter, params_at_each_iter] - grads_and_params = _broadcast_object(grads_and_params, src_rank=world_size - 1, group=dist.group.WORLD, device=device) + grads_and_params = _broadcast_object( + grads_and_params, src_rank=world_size - 1, group=dist.group.WORLD, + device=device, + ) grads_at_each_iter = grads_and_params[0] params_at_each_iter = grads_and_params[1] # TODO: Replace this `_broadcast_object` with `broadcast_object_list` @@ -781,8 +999,9 @@ def __init__(self, zero_optim, grads): super().__init__() def main_hook(self): - grads = self.zero._join_grad_info.grads[self.zero._join_grad_info.index] - self.zero._join_grad_info.index += 1 + join_grad_info = self.zero._join_grad_info + grads = self.zero._join_grad_info.grads[join_grad_info.index] + join_grad_info.index += 1 for p, grad in zip(self.zero._all_params, grads): p.grad = grad.detach().clone().to(device) @@ -809,39 +1028,48 @@ def join_process_group(self): grads = grads_at_each_iter[-num_grads_after_joining:] gradient_setter = _GradientSetter() iter = 0 - with Join([gradient_setter, zero_optim], zero_optim=zero_optim, grads=grads): + with Join( + [gradient_setter, zero_optim], zero_optim=zero_optim, grads=grads, + ): for _ in range(NUM_EPOCHS): for input in inputs: # Notify join context that this process has not joined Join.notify_join_context(gradient_setter) - # Set gradients manually - for p, grad in zip(zero_model.parameters(), grads_at_each_iter[iter]): + for p, grad in zip( + zero_model.parameters(), grads_at_each_iter[iter], + ): p.grad = grad.detach().clone().to(device) - # Perform optimizer step and check parity zero_optim.step() - for p, ddp_p in zip(zero_model.parameters(), params_at_each_iter[iter]): - assert torch.allclose(p, ddp_p), \ - "Parameters differ between using ZeRO and local optimizer" + for p, ddp_p in zip( + zero_model.parameters(), params_at_each_iter[iter], + ): + torch.testing.assert_close( + p, 
ddp_p, + msg="Parameters differ between using ZeRO and " + "local optimizer", + ) iter += 1 @common_distributed.requires_nccl() - @common_distributed.skip_if_lt_x_gpu(2) + @common_distributed.skip_if_no_gpu def test_zero_join_gpu(self): - """Check that the ZeRO join hook allows training with uneven inputs on GPU.""" + """Check that the ZeRO join hook allows training with uneven inputs + on GPU.""" self._test_zero_join(self.device) @common_distributed.requires_gloo() def test_zero_join_cpu(self): - """Check that the ZeRO join hook allows training with uneven inputs on CPU.""" + """Check that the ZeRO join hook allows training with uneven inputs + on CPU.""" self._test_zero_join(torch.device("cpu")) def _test_zero_model_parallel(self, parameters_as_bucket_view: bool): # Use two processes each with two GPUs assert self.rank < 2 - NUM_EPOCHS = 3 - NUM_INPUTS = 5 + NUM_EPOCHS = 2 + NUM_INPUTS = 4 LR = 0.01 torch.manual_seed(0) torch.cuda.manual_seed(0) @@ -871,17 +1099,20 @@ def __init__(self): def forward(self, x): return self.net1(self.relu(self.net0(x))) - dev0 = 2 * self.rank - dev1 = 2 * self.rank + 1 + dev0 = torch.device(2 * self.rank) + dev1 = torch.device(2 * self.rank + 1) mp_model = ModelParallelModel(dev0, dev1) ddp_model = DDP(mp_model) - local_model = LocalModel() - cpu_device = torch.device("cpu") + local_model = LocalModel().to(dev0) + # Ensure the parameters are the same across the two models - local_model.net0.weight = torch.nn.Parameter(mp_model.net0.weight.detach().clone().to(cpu_device)) - local_model.net0.bias = torch.nn.Parameter(mp_model.net0.bias.detach().clone().to(cpu_device)) - local_model.net1.weight = torch.nn.Parameter(mp_model.net1.weight.detach().clone().to(cpu_device)) - local_model.net1.bias = torch.nn.Parameter(mp_model.net1.bias.detach().clone().to(cpu_device)) + def copy_param(p): + return torch.nn.Parameter(p.detach().clone().to(dev0)) + + local_model.net0.weight = copy_param(mp_model.net0.weight) + local_model.net0.bias = copy_param(mp_model.net0.bias) + local_model.net1.weight = copy_param(mp_model.net1.weight) + local_model.net1.bias = copy_param(mp_model.net1.bias) # Compare parity between DDP with model parallelism using ZeRO and # a local model using a local optimizer @@ -889,10 +1120,10 @@ def forward(self, x): ddp_model.parameters(), optimizer_class=torch.optim.Adam, parameters_as_bucket_view=parameters_as_bucket_view, - lr=LR + lr=LR, ) local_optim = torch.optim.Adam(local_model.parameters(), lr=LR) - inputs = [torch.randn(20, 10) for _ in range(NUM_INPUTS)] + inputs = [torch.randn(20, 10).to(dev0) for _ in range(NUM_INPUTS)] for _ in range(NUM_EPOCHS): for input in inputs: @@ -908,40 +1139,42 @@ def closure_ddp(): ddp_loss.backward() return ddp_loss - local_loss = cast(torch.Tensor, local_optim.step(closure=closure_local)) - ddp_loss = cast(torch.Tensor, zero_optim.step(closure=closure_ddp)).to(cpu_device) - - # Increased tolerances are needed to pass test when using TensorFloat32 - # see https://github.com/pytorch/pytorch/issues/67764 - assert torch.allclose( - local_loss, ddp_loss, rtol=1e-03 - ), "Losses differ between local optim and ZeroRedundancyOptimizer" + local_loss = cast( + torch.Tensor, local_optim.step(closure=closure_local) + ) + ddp_loss = cast( + torch.Tensor, zero_optim.step(closure=closure_ddp) + ) - for local_p, ddp_p in zip(local_model.parameters(), ddp_model.parameters()): - ddp_p = ddp_p.to(cpu_device) - assert torch.allclose(local_p, ddp_p, rtol=1e-03, atol=1e-04), "Models differ after a step" + # Increased tolerances are 
needed to pass when using TF32 + # See: https://github.com/pytorch/pytorch/issues/67764 + torch.testing.assert_close( + local_loss.cpu(), ddp_loss.cpu(), rtol=1e-03, atol=1e-08, + ), "Losses differ between local optimizer and ZeRO" - @common_distributed.skip_if_lt_x_gpu(4) - def test_zero_model_parallel_with_bucket_view(self): - """ - Check that ZeRO works with model parallelism where layers are sharded - across devices when ``parameters_as_bucket_view=True``. - """ - if self.rank >= 2: - return - self.dist_init(self.rank, world_size=2) - self._test_zero_model_parallel(parameters_as_bucket_view=True) + for local_p, ddp_p in zip( + local_model.parameters(), + ddp_model.parameters() + ): + torch.testing.assert_close( + local_p.cpu(), ddp_p.cpu(), rtol=1e-03, atol=1e-04, + ), "Models differ after a step" @common_distributed.skip_if_lt_x_gpu(4) - def test_zero_model_parallel_without_bucket_view(self): - """ - Check that ZeRO works with model parallelism where layers are sharded - across devices when ``parameters_as_bucket_view=False``. - """ + @parametrize( + "parameters_as_bucket_view", + [False, True], + ) + def test_zero_model_parallel( + self, + parameters_as_bucket_view: bool, + ): + """Check that ZeRO works with model parallelism where the model's + layers are assigned to different devices.""" if self.rank >= 2: return self.dist_init(self.rank, world_size=2) - self._test_zero_model_parallel(parameters_as_bucket_view=False) + self._test_zero_model_parallel(parameters_as_bucket_view) def _test_ddp_zero_overlap( self, @@ -962,22 +1195,21 @@ def _test_ddp_zero_overlap( is_gpu = device.type == "cuda" if is_gpu: torch.cuda.set_device(device) - models_to_test = [ - ( - torch.nn.Sequential( - torch.nn.Linear(1000, 2000), - torch.nn.Linear(2000, 500) - ), - [torch.randn(1, 1000).to(device) for _ in range(NUM_INPUTS)] + models_to_test = [( + torch.nn.Sequential( + torch.nn.Linear(1000, 2000), + torch.nn.Linear(2000, 500), ), - ] + [torch.randn(1, 1000).to(device) for _ in range(NUM_INPUTS)], + )] if HAS_TORCHVISION: - models_to_test.append( - ( - torchvision.models.resnet50(), - [torch.randn(1, 3, 3, 1000).to(device) for _ in range(NUM_INPUTS)] - ) - ) + models_to_test.append(( + torchvision.models.resnet50(), + [ + torch.randn(1, 3, 3, 1000).to(device) + for _ in range(NUM_INPUTS) + ] + )) for (model, inputs) in models_to_test: # Enable determinism in cudnn operators with torch.backends.cudnn.flags( @@ -1002,7 +1234,10 @@ def _test_ddp_zero_overlap( ) ddp_model_overlap.register_comm_hook( None, - hook_constructor(allreduce_hook, ddp_model_overlap, zero_optim, **kwargs) + hook_constructor( + allreduce_hook, ddp_model_overlap, zero_optim, + **kwargs, + ) ) # Set up the DDP model with local optimizer @@ -1067,120 +1302,73 @@ def _test_ddp_zero_overlap( self.assertEqual(p1, p2) # Check that the parameters were updated - self.assertNotEqual(init_params_overlap, list(ddp_model_overlap.parameters())) + self.assertNotEqual( + init_params_overlap, list(ddp_model_overlap.parameters()), + ) # Ensure that this test runs independently dist.barrier() + # NOTE: The test is skipped if using Windows since functional optimizers + # are not currently supported. @common_distributed.skip_if_win32() @common_distributed.requires_nccl() @common_distributed.skip_if_no_gpu @common_distributed.skip_if_rocm - def test_ddp_with_zero_step_parity_gpu(self): - r""" - Check that overlapping DDP with ZeRO using ``hook_with_zero_step()`` - achieves parity with DDP using a local optimizer when running on GPU. 
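To make the join-hook tests above (`_test_zero_join` and friends) concrete, here is a sketch of the user-facing pattern they validate: both DDP and ZeroRedundancyOptimizer are joinable, so wrapping the training loop in `Join` lets ranks with fewer inputs shadow the collectives of the ranks still working. It assumes an initialized default process group (e.g. under torchrun); the model, batch shape, and learning rate are placeholders.

import torch
import torch.distributed as dist
from torch.distributed.algorithms.join import Join
from torch.distributed.optim import ZeroRedundancyOptimizer
from torch.nn.parallel import DistributedDataParallel as DDP

def train_with_uneven_inputs(device: torch.device = torch.device("cpu")) -> None:
    rank = dist.get_rank()
    model = torch.nn.Linear(2, 3).to(device)
    # Pass device_ids=[rank] instead when using one GPU per process.
    ddp_model = DDP(model)
    zero_optim = ZeroRedundancyOptimizer(
        ddp_model.parameters(), optimizer_class=torch.optim.Adam, lr=0.01,
    )
    # Rank i deliberately gets i extra batches, as in `_test_zero_join`.
    inputs = [torch.randn(20, 2, device=device) for _ in range(3 + rank)]
    with Join([ddp_model, zero_optim]):
        for inp in inputs:
            zero_optim.zero_grad()
            ddp_model(inp).sum().backward()
            zero_optim.step()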
- - NOTE: The test is skipped if using Windows since functional optimizers - are not currently supported. + @parametrize( + "use_gpu", + [True], + # Add `False` once the Gloo sync issue causing hangs is fixed + # See: https://github.com/pytorch/pytorch/issues/62300 + ) + @parametrize( + "use_interleaved_hook", + [False, True], + ) + @parametrize( + "gradient_as_bucket_view", + [False, True], + ) + @parametrize( + "static_graph", + [False, True], + ) + @parametrize( + "shard_buckets", + [False, True], + ) + def test_ddp_zero_overlap( + self, + use_gpu: bool, + use_interleaved_hook: bool, + gradient_as_bucket_view: bool, + static_graph: bool, + shard_buckets: bool, + ): """ - self.dist_init(self.rank, self.world_size, dist.Backend.NCCL) - for gradient_as_bucket_view, static_graph in itertools.product( - [True, False], - [True, False] - ): - self._test_ddp_zero_overlap( - torch.device(self.rank), - hook_with_zero_step, - gradient_as_bucket_view, - static_graph - ) - # TODO: Add `test_ddp_with_zero_step_parity_cpu()` once the Gloo - # synchronization issue causing hangs is fixed. - - @common_distributed.skip_if_win32() - @common_distributed.requires_nccl() - @common_distributed.skip_if_no_gpu - @common_distributed.skip_if_rocm - def test_ddp_with_zero_step_interleaved_parity_gpu(self): - r""" - Check that overlapping DDP with ZeRO using - ``hook_with_zero_step_interleaved()`` achieves parity with DDP using a - local optimizer when running on GPU. - - NOTE: The test is skipped if using Windows since functional optimizers - are not currently supported. + Check that overlapping DDP with ZeRO using the given method determined + by ``hook_constructor`` and ``shard_buckets`` and using the given ZeRO + and DDP arguments achieves parity with DDP using a local optimizer. """ - self.dist_init(self.rank, self.world_size, dist.Backend.NCCL) - for gradient_as_bucket_view, static_graph in itertools.product( - [True, False], - [True, False] - ): + device = torch.device(self.rank) if use_gpu else torch.device("cpu") + backend = _get_backend_for_tests() + self.dist_init(self.rank, self.world_size, backend) + hook_constructor = hook_with_zero_step if not use_interleaved_hook \ + else hook_with_zero_step_interleaved + + # Disable DDP + ReplicatedTensor since ZeroRedundancyOptimizer + # modifies the model parameters in place. + from torch.nn.parallel._replicated_tensor_ddp_utils import _ddp_replicated_tensor + with _ddp_replicated_tensor(False): self._test_ddp_zero_overlap( - torch.device(self.rank), - hook_with_zero_step_interleaved, - gradient_as_bucket_view, - static_graph + device, hook_constructor, gradient_as_bucket_view, static_graph, + shard_buckets=shard_buckets, ) - # TODO: Add `test_ddp_with_zero_step_interleaved_parity_cpu()` once the - # Gloo synchronization issue causing hangs is fixed. - @common_distributed.skip_if_win32() - @common_distributed.requires_nccl() - @common_distributed.skip_if_no_gpu - @common_distributed.skip_if_rocm - def test_ddp_with_zero_step_uniform_parity_gpu(self): - r""" - Check that overlapping DDP with ZeRO using - ``hook_with_zero_step()`` with ``shard_buckets=True`` - achieves parity with DDP using a local optimizer when running on GPU. - - NOTE: The test is skipped if using Windows since functional optimizers - are not currently supported. 
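A sketch of the overlap setup that `test_ddp_zero_overlap` parametrizes, pieced together from the hook registration shown above. The `overlap_with_ddp=True` flag is my reading of what these hooks require and is not shown in this hunk, so treat it as an assumption; the layer sizes and learning rate are placeholders, and the full training-loop mechanics follow `_test_ddp_zero_overlap`, which is only partially visible here.

import torch
import torch.distributed as dist
from torch.distributed.algorithms.ddp_comm_hooks.ddp_zero_hook import (
    hook_with_zero_step,
)
from torch.distributed.algorithms.ddp_comm_hooks.default_hooks import (
    allreduce_hook,
)
from torch.distributed.optim import ZeroRedundancyOptimizer
from torch.nn.parallel import DistributedDataParallel as DDP

def build_overlapped_ddp(rank: int):
    # Assumes an NCCL process group is initialized and this process owns
    # GPU `rank` (the tests above require NCCL and a GPU per rank).
    model = torch.nn.Sequential(
        torch.nn.Linear(1000, 2000), torch.nn.Linear(2000, 500),
    ).to(rank)
    ddp_model = DDP(model, device_ids=[rank])
    # Assumption: `overlap_with_ddp=True` defers building the local optimizer
    # so that its step can be fused into the communication hook.
    zero_optim = ZeroRedundancyOptimizer(
        ddp_model.parameters(),
        optimizer_class=torch.optim.SGD,
        overlap_with_ddp=True,
        lr=0.01,
    )
    # Same registration pattern as `_test_ddp_zero_overlap`: the hook runs the
    # usual allreduce and then this rank's shard of the optimizer step as
    # gradient buckets become ready during backward.
    ddp_model.register_comm_hook(
        None, hook_with_zero_step(allreduce_hook, ddp_model, zero_optim),
    )
    return ddp_model, zero_optim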
- """ - self.dist_init(self.rank, self.world_size, dist.Backend.NCCL) - for gradient_as_bucket_view, static_graph in itertools.product( - [True, False], - [True, False] - ): - self._test_ddp_zero_overlap( - torch.device(self.rank), - hook_with_zero_step, - gradient_as_bucket_view, - static_graph, - shard_buckets=True, - ) - # TODO: Add `test_ddp_with_zero_step_uniform_parity_cpu()` once the Gloo - # synchronization issue causing hangs is fixed. - @common_distributed.skip_if_win32() - @common_distributed.requires_nccl() - @common_distributed.skip_if_no_gpu - @common_distributed.skip_if_rocm - def test_ddp_with_zero_step_interleaved_uniform_parity_gpu(self): - r""" - Check that overlapping DDP with ZeRO using - ``hook_with_zero_step()`` with ``shard_buckets=True`` - achieves parity with DDP using a local optimizer when running on GPU. - - NOTE: The test is skipped if using Windows since functional optimizers - are not currently supported. - """ - self.dist_init(self.rank, self.world_size, dist.Backend.NCCL) - for gradient_as_bucket_view, static_graph in itertools.product( - [True, False], - [True, False] - ): - self._test_ddp_zero_overlap( - torch.device(self.rank), - hook_with_zero_step_interleaved, - gradient_as_bucket_view, - static_graph, - shard_buckets=True, - ) - # TODO: Add `test_ddp_with_zero_step_interleaved_uniform_parity_cpu()` once - # the Gloo synchronization issue causing hangs is fixed. +instantiate_parametrized_tests(TestZeroRedundancyOptimizerSingleRank) +instantiate_parametrized_tests(TestZeroRedundancyOptimizerDistributed) if __name__ == "__main__": # ! unittest should not be used here, else the tests are not properly registered - common_utils.run_tests() + run_tests() diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py index 822cd3b09d3a..5c29f1fd448d 100644 --- a/test/distributed/test_c10d_common.py +++ b/test/distributed/test_c10d_common.py @@ -9,6 +9,7 @@ from datetime import timedelta from itertools import product from sys import platform +from contextlib import suppress import torch import torch.distributed as dist @@ -18,6 +19,7 @@ sys.exit(0) import torch.distributed.distributed_c10d as c10d +from torch.utils.checkpoint import checkpoint import torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook as powerSGD import torch.nn.functional as F import torch.testing._internal.common_utils as common @@ -25,12 +27,16 @@ from torch.nn.parallel import DistributedDataParallel from torch.testing._internal.common_distributed import ( MultiProcessTestCase, + skip_if_lt_x_gpu, ) + from torch.testing._internal.common_utils import ( TestCase, load_tests, run_tests, TEST_WITH_DEV_DBG_ASAN, + instantiate_parametrized_tests, + parametrize ) if TEST_WITH_DEV_DBG_ASAN: @@ -238,7 +244,7 @@ def forward(self, x): return F.softmax(self.embedding(x), dim=1) -class AbstractDistributedDataParallelTest(object): +class CommonDistributedDataParallelTest(object): def tearDown(self): # DistributedDataParallel test doesn't seem to call FileStore destructor # TODO: investigate this test and the test is known to have issues @@ -307,6 +313,363 @@ def _prepare_multi_device_module( return model, ddp_model, input, target + def _get_store(self): + return dist.FileStore(self.file_name, self.world_size) + + def _get_process_group(self): + raise NotImplementedError("To be implemented by child class") + + def _train_model(self, model, input_var, target, loss, run_checkpoint=False, use_reentrant=True): + model.train() + if run_checkpoint: + output = checkpoint(model, 
input_var, use_reentrant=use_reentrant) + else: + output = model(input_var) + l = loss(output, target) + l.backward() + + def _test_ddp_checkpointing( + self, + input_model, + process_group, + use_bucket_view, + find_unused_parameters=False, + static_graph=False, + run_checkpoint=False, + use_reentrant=True, + allow_none_grads=False, + ): + # to reproduce the same training results + torch.cuda.set_device(self.rank) + torch.manual_seed(31415) + model = copy.deepcopy(input_model).cuda() + ddp_model = copy.deepcopy(input_model).cuda() + ddp_model = nn.parallel.DistributedDataParallel( + ddp_model, + bucket_cap_mb=1, + gradient_as_bucket_view=use_bucket_view, + device_ids=[self.rank], + process_group=process_group, + find_unused_parameters=find_unused_parameters, + static_graph=static_graph, + ) + self.assertEqual( + ddp_model._get_ddp_logging_data().get("static_graph", 0), static_graph + ) + input, ddp_input, target, ddp_target = self._prepare_dummy_data() + loss = nn.MSELoss() + n_iters = 5 + for i in range(n_iters): + model.zero_grad(set_to_none=False) + ddp_model.zero_grad(set_to_none=False) + self._train_model(model, input, target, loss, run_checkpoint=run_checkpoint, use_reentrant=use_reentrant) + self._train_model( + ddp_model, ddp_input, ddp_target, loss, run_checkpoint=run_checkpoint, use_reentrant=use_reentrant + ) + for i, j in zip(model.parameters(), ddp_model.parameters()): + if not allow_none_grads: + self.assertTrue(i.grad is not None) + self.assertTrue(j.grad is not None) + self.assertEqual(i.grad, j.grad, rtol=1.3e-06, atol=5e-5) + + # A list of tests for ddp with activation checkpointing + # when gradient_as_bucket_view=True, False. + # Most of the tests are referred to + # https://github.com/facebookresearch/fairscale/blob/main/tests/nn/pipe/test_checkpoint_ddp.py + class CheckpointOnceModule(nn.Module): + """ + Runs checkpoint for a single layer in the model. + """ + def __init__(self, use_reentrant=True): + super().__init__() + self.l1 = nn.Linear(20, 20) + self.l2 = nn.Linear(20, 20) + self.use_reentrant = use_reentrant + + def forward(self, inp): + x = self.l1(inp) + x = checkpoint(self.l2, x, use_reentrant=self.use_reentrant) + return x + + class CheckpointTwiceModule(CheckpointOnceModule): + """ + Runs checkpoint for the same layer twice in a model. This simulates use + cases such as pipeline parallel where the same layer can be checkpointed + more than one time. + """ + def __init__(self, use_reentrant=True): + super().__init__(use_reentrant=use_reentrant) + + def forward(self, inp): + x = self.l1(inp) + x = checkpoint(self.l2, x, use_reentrant=self.use_reentrant) + x = checkpoint(self.l2, x, use_reentrant=self.use_reentrant) + return x + + class CheckpointTwiceModuleWeightSharing(CheckpointTwiceModule): + """ + Similar to CheckpointTwiceModule but the weights are shared. 
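The checkpointing modules above reduce to a small user-facing pattern: call `torch.utils.checkpoint.checkpoint` on a layer inside `forward` and wrap the module in DDP. A minimal CPU sketch using a single-rank gloo group (address/port and layer sizes are placeholders); the non-reentrant implementation is the one the later tests rely on for unused parameters and repeated checkpointing.

import os

import torch
import torch.distributed as dist
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.checkpoint import checkpoint

class CheckpointOnce(nn.Module):
    # Same shape as `CheckpointOnceModule` above: only the second layer is
    # recomputed in backward instead of storing its activations.
    def __init__(self, use_reentrant: bool = False):
        super().__init__()
        self.l1 = nn.Linear(20, 20)
        self.l2 = nn.Linear(20, 20)
        self.use_reentrant = use_reentrant

    def forward(self, x):
        return checkpoint(self.l2, self.l1(x), use_reentrant=self.use_reentrant)

if __name__ == "__main__":
    # Single-rank gloo group so the sketch runs on CPU without torchrun.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29502")
    dist.init_process_group("gloo", rank=0, world_size=1)
    model = DDP(CheckpointOnce(use_reentrant=False))
    loss = model(torch.randn(8, 20)).sum()
    loss.backward()
    dist.destroy_process_group()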
+ """ + def __init__(self, use_reentrant=True): + super().__init__(use_reentrant=use_reentrant) + # Share weights + self.l1.weight = self.l2.weight + + def forward(self, inp): + x = self.l1(inp) + x = checkpoint(self.l2, x, use_reentrant=self.use_reentrant) + x = checkpoint(self.l2, x, use_reentrant=self.use_reentrant) + return x + + + class DynamicCheckpointTwiceModule(CheckpointTwiceModule): + def __init__(self, use_reentrant=True): + super().__init__(use_reentrant=use_reentrant) + self.count = 0 + + def forward(self, inp): + if self.count % 2: + x = checkpoint(self.l1, inp, use_reentrant=self.use_reentrant) + else: + x = checkpoint(self.l2, inp, use_reentrant=self.use_reentrant) + + self.count += 1 + return x + + class DynamicCheckpointTwiceModuleWeightSharing(DynamicCheckpointTwiceModule): + def __init__(self, use_reentrant=True): + super().__init__(use_reentrant=use_reentrant) + # Share weights + self.l1.weight = self.l2.weight + + + def _prepare_dummy_data(self): + ddp_bs = 16 + bs = ddp_bs * self.world_size + input = torch.rand((bs, 20), device="cuda", requires_grad=True) + target = torch.randn((bs, 20), device="cuda") + offset = self.rank * ddp_bs + ddp_input = input[offset : offset + ddp_bs] + ddp_target = target[offset : offset + ddp_bs] + return input, ddp_input, target, ddp_target + + + @skip_if_lt_x_gpu(2) + @parametrize("use_reentrant", [True, False]) + def test_ddp_checkpointing_once(self, use_reentrant): + """ + DDP works as expected when layer is checkpointed only once. + """ + process_group = self._get_process_group() + for use_bucket_view, static_graph in product((False, True), (False, True)): + self._test_ddp_checkpointing( + self.CheckpointOnceModule(use_reentrant=use_reentrant), + process_group=process_group, + use_bucket_view=use_bucket_view, + static_graph=static_graph, + ) + if static_graph: + # find_unused_parameters does not make a difference, since it is + # ignored for static graph. + self._test_ddp_checkpointing( + self.CheckpointOnceModule(), + process_group=process_group, + use_bucket_view=use_bucket_view, + static_graph=static_graph, + find_unused_parameters=True, + ) + + @skip_if_lt_x_gpu(2) + @parametrize("use_reentrant", [True, False]) + def test_ddp_checkpointing_unused_params(self, use_reentrant): + """ + With reentrant autograd checkpointing impl, DDP will fail when there are + unused params in the model and no static graph training. With + non-reentrant checkpointing implementation, this works as expected. + """ + process_group = self._get_process_group() + for use_bucket_view in (True, False): + err_ctx = ( + suppress() if not use_reentrant else + self.assertRaisesRegex( + RuntimeError, + "Expected to mark a variable ready only once." + ) + ) + with err_ctx: + model = self._test_ddp_checkpointing( + self.CheckpointOnceModule(use_reentrant=use_reentrant), + process_group=process_group, + use_bucket_view=use_bucket_view, + find_unused_parameters=True, + ) + # test passes when static_graph is true + model = self._test_ddp_checkpointing( + self.CheckpointOnceModule(use_reentrant=use_reentrant), + process_group=process_group, + use_bucket_view=use_bucket_view, + find_unused_parameters=True, + static_graph=True, + ) + + @skip_if_lt_x_gpu(2) + @parametrize("use_reentrant", [True, False]) + def test_ddp_checkpointing_twice(self, use_reentrant): + """ + Checkpoitning twice fails for non-static graph with reentrant checkpoint + implementation, succeeds with non-reentrant checkpoint implementation. 
+ """ + process_group = self._get_process_group() + for use_bucket_view in (True, False): + err_ctx = ( + suppress() if not use_reentrant else + self.assertRaisesRegex( + RuntimeError, + "Expected to mark a variable ready only once." + ) + ) + with err_ctx: + model = self._test_ddp_checkpointing( + self.CheckpointTwiceModule(use_reentrant=use_reentrant), + process_group=process_group, + use_bucket_view=use_bucket_view, + static_graph=False, + ) + + with err_ctx: + model = self._test_ddp_checkpointing( + self.CheckpointTwiceModule(use_reentrant=use_reentrant), + process_group=process_group, + use_bucket_view=use_bucket_view, + static_graph=False, + find_unused_parameters=True, + ) + + @skip_if_lt_x_gpu(2) + @parametrize("use_reentrant", [True, False]) + def test_ddp_checkpointing_twice_static_graph(self, use_reentrant): + """ + Regardless of reentrant or non-reentrant checkpointing impl, + checkpointing twice works with static graph enabled. + """ + process_group = self._get_process_group() + for use_bucket_view in (True, False): + # Test passes when static_graph=True. + model = self._test_ddp_checkpointing( + self.CheckpointTwiceModule(use_reentrant=use_reentrant), + process_group=process_group, + use_bucket_view=use_bucket_view, + static_graph=True, + ) + + @skip_if_lt_x_gpu(2) + def test_ddp_checkpointing_dynamic_module(self): + """ + Dynamic module can be checkpointed, multiple times, with non-reentrant + checkpointing implementation. + """ + process_group = self._get_process_group() + for use_bucket_view in (True, False): + model = self._test_ddp_checkpointing( + self.DynamicCheckpointTwiceModule(use_reentrant=False), + process_group=process_group, + use_bucket_view=use_bucket_view, + static_graph=False, + find_unused_parameters=True, + # Grads can be none sometimes due to dynamic module not using + # all params. + allow_none_grads=True + ) + + @skip_if_lt_x_gpu(2) + def test_ddp_checkpointing_dynamic_weight_sharing(self): + """ + Dynamic module can be checkpointed multiple times with weight sharing + using non-reentrant checkpointing implementation. + """ + process_group = self._get_process_group() + for use_bucket_view in (True, False): + model = self._test_ddp_checkpointing( + self.DynamicCheckpointTwiceModuleWeightSharing(use_reentrant=False), + process_group=process_group, + use_bucket_view=use_bucket_view, + static_graph=False, + find_unused_parameters=True, + # Grads can be none sometimes due to dynamic module not using + # all params. + allow_none_grads=True + ) + + # DDP works as expected if there is weight sharing among layers + @skip_if_lt_x_gpu(2) + @parametrize("use_reentrant", [True, False]) + def test_ddp_checkpointing_weight_sharing(self, use_reentrant): + """ + Test that checkpointing with weight sharing works. + """ + process_group = self._get_process_group() + torch.cuda.set_device(self.rank) + for use_bucket_view, static_graph in product((False, True), (False, True)): + torch.manual_seed(31415) + l1 = nn.Linear(20, 20) + l2 = nn.Linear(20, 20) + l1.weight = l2.weight + model = nn.Sequential(l1, l2) + # TODO: non-reentrant based checkpointing of DDP module with + # static_graph runs into the below issue, see + # https://github.com/pytorch/pytorch/issues/70865 and + # https://github.com/pytorch/pytorch/issues/58111 for details. 
+ err_ctx = ( + self.assertRaisesRegex( + RuntimeError, + "Your training graph has changed in this iteration" + ) if static_graph and not use_reentrant else suppress() + ) + with err_ctx: + self._test_ddp_checkpointing( + model, + process_group=process_group, + use_bucket_view=use_bucket_view, + static_graph=static_graph, + run_checkpoint=True, + use_reentrant=use_reentrant, + ) + + @skip_if_lt_x_gpu(2) + def test_ddp_checkpointing_twice_weight_sharing(self): + """ + Checkpointing should work with static graph in the case of checkpointing + same layer twice and having weights shared acrosss layers. + """ + process_group = self._get_process_group() + torch.cuda.set_device(self.rank) + for use_bucket_view in (True, False): + model = self._test_ddp_checkpointing( + self.CheckpointTwiceModuleWeightSharing(), + process_group=process_group, + use_bucket_view=use_bucket_view, + static_graph=True, + ) + + def test_invalid_powerSGD_state(self): + for start_powerSGD_iter, use_error_feedback, warm_start in product( + [0, 1], [True, False], [True, False] + ): + if not use_error_feedback and not warm_start: + continue + with self.assertRaisesRegex( + ValueError, + "Expect `start_powerSGD_iter` > 1 if `use_error_feedback` or `warm_start` is enabled, " + "because PowerSGD can only be applied after the first two iterations in DDP.", + ): + state = powerSGD.PowerSGDState( + process_group=None, + matrix_approximation_rank=1, + start_powerSGD_iter=start_powerSGD_iter, + use_error_feedback=use_error_feedback, + warm_start=warm_start, + ) + def _test_ddp_with_process_group( self, process_group, @@ -443,33 +806,101 @@ def fut_then(fut): return fut.then(fut_then) + def _test_not_nan(self, model, x): + y = model(x) + self.assertFalse(y.isnan().any().item()) + y.sum().backward() + for p in model.parameters(): + self.assertFalse(p.grad.isnan().any().item()) + + @skip_if_lt_x_gpu(2) + def test_sync_batch_norm_only_empty_input(self): + pg = self._get_process_group() + + model = torch.nn.Sequential( + nn.BatchNorm2d(2), + ).to(device=self.rank) + model = DistributedDataParallel( + model, + device_ids=[self.rank], + process_group=pg, + ) + model = nn.SyncBatchNorm.convert_sync_batchnorm( + model, + process_group=pg, + ) -class DistributedDataParallelTest( - AbstractDistributedDataParallelTest, MultiProcessTestCase -): - def setUp(self): - super(DistributedDataParallelTest, self).setUp() - self._spawn_processes() + model.train() - def test_invalid_powerSGD_state(self): - for start_powerSGD_iter, use_error_feedback, warm_start in product( - [0, 1], [True, False], [True, False] - ): - if not use_error_feedback and not warm_start: - continue - with self.assertRaisesRegex( - ValueError, - "Expect `start_powerSGD_iter` > 1 if `use_error_feedback` or `warm_start` is enabled, " - "because PowerSGD can only be applied after the first two iterations in DDP.", - ): - state = powerSGD.PowerSGDState( - process_group=None, - matrix_approximation_rank=1, - start_powerSGD_iter=start_powerSGD_iter, - use_error_feedback=use_error_feedback, - warm_start=warm_start, - ) + # only rank 0 receives empty inputs + x = torch.zeros( + (1 if self.rank != 0 else 0, 2, 11, 13), + dtype=torch.float32, + device=self.rank + ) + # input requires grad, this will trigger the collective communication + # in the backward pass + x.requires_grad = True + self._test_not_nan(model, x) + + # input does not requires grad + x.requires_grad = False + self._test_not_nan(model, x) + + # all ranks receive empty inputs + x = torch.zeros( + (0, 2, 11, 13), + 
dtype=torch.float32, + device=self.rank + ) + + # input requires grad, this will trigger the collective communication + # in the backward pass + x.requires_grad = True + self._test_not_nan(model, x) + + # input does not requires grad + x.requires_grad = False + self._test_not_nan(model, x) + + @skip_if_lt_x_gpu(2) + def test_sync_batch_norm_empty_input(self): + pg = self._get_process_group() + + model = torch.nn.Sequential( + nn.Conv2d(2, 2, 3), + nn.BatchNorm2d(2), + nn.Linear(28, 2), + ).to(device=self.rank) + model = DistributedDataParallel( + model, + device_ids=[self.rank], + process_group=pg, + ) + model = nn.SyncBatchNorm.convert_sync_batchnorm( + model, + process_group=pg, + ) + + model.train() + # only rank 0 receives empty inputs + x = torch.zeros( + (3 if self.rank != 0 else 0, 2, 30, 30), + dtype=torch.float32, + device=self.rank + ) + + self._test_not_nan(model, x) + + # all ranks receive empty inputs + x = torch.zeros( + (0, 2, 30, 30), + dtype=torch.float32, + device=self.rank + ) + + self._test_not_nan(model, x) class ComputeBucketAssignmentTest(TestCase): def test_single_limit_single_dtype(self): @@ -698,20 +1129,33 @@ def tearDown(self): except OSError: pass - def test_distributed_debug_mode(self): + def test_debug_level(self): + try: + del os.environ["TORCH_DISTRIBUTED_DEBUG"] + except KeyError: + pass + + dist.set_debug_level_from_env() # Default should be off - default_debug_mode = dist._get_debug_mode() - self.assertEqual(default_debug_mode, dist._DistributedDebugLevel.OFF) + default_debug_mode = dist.get_debug_level() + self.assertEqual(default_debug_mode, dist.DebugLevel.OFF) mapping = { - "OFF": dist._DistributedDebugLevel.OFF, - "INFO": dist._DistributedDebugLevel.INFO, - "DETAIL": dist._DistributedDebugLevel.DETAIL, + "OFF": dist.DebugLevel.OFF, + "off": dist.DebugLevel.OFF, + "oFf": dist.DebugLevel.OFF, + "INFO": dist.DebugLevel.INFO, + "info": dist.DebugLevel.INFO, + "INfO": dist.DebugLevel.INFO, + "DETAIL": dist.DebugLevel.DETAIL, + "detail": dist.DebugLevel.DETAIL, + "DeTaIl": dist.DebugLevel.DETAIL, } invalid_debug_modes = ["foo", 0, 1, -1] for mode in mapping.keys(): os.environ["TORCH_DISTRIBUTED_DEBUG"] = str(mode) - set_debug_mode = dist._get_debug_mode() + dist.set_debug_level_from_env() + set_debug_mode = dist.get_debug_level() self.assertEqual( set_debug_mode, mapping[mode], @@ -720,8 +1164,8 @@ def test_distributed_debug_mode(self): for mode in invalid_debug_modes: os.environ["TORCH_DISTRIBUTED_DEBUG"] = str(mode) - with self.assertRaisesRegex(RuntimeError, "to be one of"): - dist._get_debug_mode() + with self.assertRaisesRegex(RuntimeError, "The value of TORCH_DISTRIBUTED_DEBUG must"): + dist.set_debug_level_from_env() class DummyWork(dist._Work): @@ -879,6 +1323,8 @@ def test_send_recv(self): # user applications would explicitly that. 
+instantiate_parametrized_tests(CommonDistributedDataParallelTest) + if __name__ == "__main__": assert ( diff --git a/test/distributed/test_c10d_gloo.py b/test/distributed/test_c10d_gloo.py index 0594aae287fc..e49d65ea33d2 100644 --- a/test/distributed/test_c10d_gloo.py +++ b/test/distributed/test_c10d_gloo.py @@ -39,7 +39,6 @@ skip_if_win32, create_device, verify_ddp_error_logged, - skip_if_rocm, ) from torch.testing._internal.common_utils import ( TestCase, @@ -538,7 +537,6 @@ def test_allreduce_stress(self): self._test_allreduce_stress(inputs) @skip_if_lt_x_gpu(2) - @skip_if_rocm @requires_gloo() def test_allreduce_stress_cuda(self): inputs = [torch.tensor([i + self.rank]).cuda() for i in range(1000)] @@ -979,7 +977,7 @@ def _test_gather_basics(self, fn): futures.append(pg.gather([], input, opts).get_future()) # Wait for work to complete - expected = [torch.tensor([rank]) for rank in range(self.world_size)] + expected = [fn(torch.tensor([rank])) for rank in range(self.world_size)] for i in range(self.world_size): futures[i].wait() result = futures[i].value() @@ -995,6 +993,11 @@ def test_gather_basics(self): def test_gather_basics_cuda(self): self._test_gather_basics(lambda t: t.clone().cuda()) + @requires_gloo() + def test_gather_noncontiguous_input(self): + # Take a column of 2D tensor, such that memory is not dense + self._test_gather_basics(lambda t: t.expand(2, 2).contiguous()[:, 0]) + def _test_gather_stress(self, inputs, fn): store = c10d.FileStore(self.file_name, self.world_size) pg = self._create_process_group_gloo( @@ -1037,7 +1040,6 @@ def test_gather_stress(self): self._test_gather_stress(inputs, lambda t: t.clone()) @skip_if_lt_x_gpu(2) - @skip_if_rocm @requires_gloo() def test_gather_stress_cuda(self): inputs = [torch.tensor([i + self.rank]).cuda() for i in range(1000)] @@ -1103,7 +1105,7 @@ def _test_allgather_basics(self, fn): for _ in range(n) ] expected_output = [ - [torch.tensor([i]) for i in range(n * self.world_size)] + [fn(torch.tensor([i])) for i in range(n * self.world_size)] for _ in range(n) ] fut = pg.allgather(output, input).get_future() @@ -1122,6 +1124,11 @@ def test_allgather_basics(self): def test_allgather_basics_cuda(self): self._test_allgather_basics(lambda t: t.clone().cuda()) + @requires_gloo() + def test_allgather_noncontiguous_input(self): + # Take a column of 2D tensor, such that memory is not dense + self._test_allgather_basics(lambda t: t.expand(2, 2).contiguous()[:, 0]) + def _test_allgather_stress(self, inputs, fn): store = c10d.FileStore(self.file_name, self.world_size) pg = self._create_process_group_gloo( @@ -1136,8 +1143,14 @@ def _test_allgather_stress(self, inputs, fn): [[torch.tensor([i + j]) for j in range(self.world_size)]] for i in range(len(inputs)) ] + input_holder = {} for i in range(len(inputs)): - fut = pg.allgather(outputs[i], [fn(inputs[i])]).get_future() + # Note that this works around the data race discussed in + # https://github.com/pytorch/pytorch/issues/75529, but we should + # actually be able to pass the list directly into allgather when + # that race is fixed. 
+ input_holder[i] = [fn(inputs[i])] + fut = pg.allgather(outputs[i], input_holder[i]).get_future() future_handles.append(fut) for i, future_handle in enumerate(future_handles): @@ -1155,7 +1168,6 @@ def test_allgather_stress(self): self._test_allgather_stress(inputs, lambda t: t.clone()) @skip_if_lt_x_gpu(2) - @skip_if_rocm @requires_gloo() def test_allgather_stress_cuda(self): inputs = [torch.tensor([i + self.rank]).cuda() for i in range(1000)] @@ -1336,7 +1348,6 @@ def test_reduce_stress(self): self._test_reduce_stress(inputs) @skip_if_lt_x_gpu(2) - @skip_if_rocm @requires_gloo() def test_reduce_stress_cuda(self): inputs = [torch.tensor([i + self.rank]).cuda() for i in range(1000)] @@ -1457,12 +1468,16 @@ def create(num, prefix): class DistributedDataParallelTest( - test_c10d_common.AbstractDistributedDataParallelTest, MultiProcessTestCase + test_c10d_common.CommonDistributedDataParallelTest, MultiProcessTestCase ): def setUp(self): super(DistributedDataParallelTest, self).setUp() self._spawn_processes() + def _get_process_group(self): + store = self._get_store() + return c10d.ProcessGroupGloo(store, self.rank, self.world_size) + def _test_gloo_backend( self, devices, device_ids, multi_device=False, gradient_as_bucket_view=False ): @@ -1757,7 +1772,7 @@ def _run_and_verify_sparse_gradients(self, vanilla_model, ddp_model): # Check that the gradients are sparse and identical vanilla_parameter = next(vanilla_model.parameters()) ddp_parameter = next(ddp_model.parameters()) - self.assertEqual(vanilla_parameter.grad, ddp_parameter.grad) + self.assertEqual(vanilla_parameter.grad.coalesce(), ddp_parameter.grad.coalesce()) @requires_gloo() @skip_if_lt_x_gpu(2) diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py index 1c8b224776a0..5ceadc43b265 100644 --- a/test/distributed/test_c10d_nccl.py +++ b/test/distributed/test_c10d_nccl.py @@ -9,7 +9,7 @@ import tempfile import threading import time -from contextlib import contextmanager, suppress +from contextlib import contextmanager from datetime import timedelta from itertools import product from unittest import mock @@ -49,11 +49,8 @@ TEST_WITH_DEV_DBG_ASAN, TEST_WITH_ROCM, sandcastle_skip, - instantiate_parametrized_tests, - parametrize, sandcastle_skip_if, ) -from torch.utils.checkpoint import checkpoint if TEST_WITH_DEV_DBG_ASAN: print( @@ -949,7 +946,7 @@ def allreduce(tensors): class DistributedDataParallelTest( - test_c10d_common.AbstractDistributedDataParallelTest, MultiProcessTestCase + test_c10d_common.CommonDistributedDataParallelTest, MultiProcessTestCase ): def setUp(self): super(DistributedDataParallelTest, self).setUp() @@ -958,6 +955,10 @@ def setUp(self): os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1" self._spawn_processes() + def _get_process_group(self): + store = self._get_store() + return c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + def _test_nccl_backend( self, devices, device_ids, multi_device=False, gradient_as_bucket_view=False ): @@ -1350,7 +1351,7 @@ def test_find_unused_parameters( # Only one such parameter in model.fc3, since bias=False break - if dist._get_debug_mode() != dist._DistributedDebugLevel.OFF: + if dist.get_debug_level() != dist.DebugLevel.OFF: unused_index_str += f" with name {unused_fqn}" self.assertTrue(unused_index_str in str(ex)) @@ -2013,12 +2014,15 @@ def _test_powerSGD_ddp_comm_hook_nccl(self, gradient_as_bucket_view=False): # Get GPU model with the hook registered. # Test the hook with different algorithmic configs. 
- for use_error_feedback, warm_start in product([True, False], [True, False]): + for use_error_feedback, warm_start, batch_tensors_with_same_shape in product( + [True, False], [True, False], [True, False], + ): state = powerSGD.PowerSGDState( process_group=process_group, matrix_approximation_rank=1, use_error_feedback=use_error_feedback, warm_start=warm_start, + batch_tensors_with_same_shape=batch_tensors_with_same_shape, ) for hook in [powerSGD.powerSGD_hook, powerSGD.batched_powerSGD_hook]: gpu_model = self._gpu_model_with_ddp_comm_hook( @@ -2216,349 +2220,6 @@ def test_ddp_weight_sharing(self): ), ) - # A list of tests for ddp with activation checkpointing - # when gradient_as_bucket_view=True, False. - # Most of the tests are referred to - # https://github.com/facebookresearch/fairscale/blob/main/tests/nn/pipe/test_checkpoint_ddp.py - class CheckpointOnceModule(nn.Module): - """ - Runs checkpoint for a single layer in the model. - """ - def __init__(self, use_reentrant=True): - super().__init__() - self.l1 = nn.Linear(20, 20) - self.l2 = nn.Linear(20, 20) - self.use_reentrant = use_reentrant - - def forward(self, inp): - x = self.l1(inp) - x = checkpoint(self.l2, x, use_reentrant=self.use_reentrant) - return x - - class CheckpointTwiceModule(CheckpointOnceModule): - """ - Runs checkpoint for the same layer twice in a model. This simulates use - cases such as pipeline parallel where the same layer can be checkpointed - more than one time. - """ - def __init__(self, use_reentrant=True): - super().__init__(use_reentrant=use_reentrant) - - def forward(self, inp): - x = self.l1(inp) - x = checkpoint(self.l2, x, use_reentrant=self.use_reentrant) - x = checkpoint(self.l2, x, use_reentrant=self.use_reentrant) - return x - - class CheckpointTwiceModuleWeightSharing(CheckpointTwiceModule): - """ - Similar to CheckpointTwiceModule but the weights are shared. 
- """ - def __init__(self, use_reentrant=True): - super().__init__(use_reentrant=use_reentrant) - - def forward(self, inp): - x = self.l1(inp) - x = checkpoint(self.l2, x, use_reentrant=self.use_reentrant) - x = checkpoint(self.l2, x, use_reentrant=self.use_reentrant) - return x - - - class DynamicCheckpointTwiceModule(CheckpointTwiceModule): - def __init__(self, use_reentrant=True): - super().__init__(use_reentrant=use_reentrant) - self.count = 0 - - def forward(self, inp): - if self.count % 2: - x = checkpoint(self.l1, inp, use_reentrant=self.use_reentrant) - else: - x = checkpoint(self.l2, inp, use_reentrant=self.use_reentrant) - - self.count += 1 - return x - - class DynamicCheckpointTwiceModuleWeightSharing(DynamicCheckpointTwiceModule): - def __init__(self, use_reentrant=True): - super().__init__(use_reentrant=use_reentrant) - self.l1.weight = self.l2.weight - - - def _prepare_dummy_data(self): - ddp_bs = 16 - bs = ddp_bs * self.world_size - input = torch.rand((bs, 20), device="cuda", requires_grad=True) - target = torch.randn((bs, 20), device="cuda") - offset = self.rank * ddp_bs - ddp_input = input[offset : offset + ddp_bs] - ddp_target = target[offset : offset + ddp_bs] - return input, ddp_input, target, ddp_target - - def _train_model(self, model, input_var, target, loss, run_checkpoint=False, use_reentrant=True): - model.train() - if run_checkpoint: - output = checkpoint(model, input_var, use_reentrant=use_reentrant) - else: - output = model(input_var) - l = loss(output, target) - l.backward() - - def _test_ddp_checkpointing( - self, - input_model, - process_group, - use_bucket_view, - find_unused_parameters=False, - static_graph=False, - run_checkpoint=False, - use_reentrant=True, - allow_none_grads=False, - ): - # to reproduce the same training results - torch.cuda.set_device(self.rank) - torch.manual_seed(31415) - model = copy.deepcopy(input_model).cuda() - ddp_model = copy.deepcopy(input_model).cuda() - ddp_model = nn.parallel.DistributedDataParallel( - ddp_model, - bucket_cap_mb=1, - gradient_as_bucket_view=use_bucket_view, - device_ids=[self.rank], - process_group=process_group, - find_unused_parameters=find_unused_parameters, - static_graph=static_graph, - ) - self.assertEqual( - ddp_model._get_ddp_logging_data().get("static_graph", 0), static_graph - ) - input, ddp_input, target, ddp_target = self._prepare_dummy_data() - loss = nn.MSELoss() - n_iters = 5 - for i in range(n_iters): - model.zero_grad(set_to_none=False) - ddp_model.zero_grad(set_to_none=False) - self._train_model(model, input, target, loss, run_checkpoint=run_checkpoint, use_reentrant=use_reentrant) - self._train_model( - ddp_model, ddp_input, ddp_target, loss, run_checkpoint=run_checkpoint, use_reentrant=use_reentrant - ) - for i, j in zip(model.parameters(), ddp_model.parameters()): - if not allow_none_grads: - self.assertTrue(i.grad is not None) - self.assertTrue(j.grad is not None) - self.assertEqual(i.grad, j.grad, rtol=1.3e-06, atol=5e-5) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - @parametrize("use_reentrant", [True, False]) - def test_ddp_checkpointing_once(self, use_reentrant): - """ - DDP works as expected when layer is checkpointed only once. 
- """ - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - for use_bucket_view, static_graph in product((False, True), (False, True)): - self._test_ddp_checkpointing( - self.CheckpointOnceModule(use_reentrant=use_reentrant), - process_group=process_group, - use_bucket_view=use_bucket_view, - static_graph=static_graph, - ) - if static_graph: - # find_unused_parameters does not make a difference, since it is - # ignored for static graph. - self._test_ddp_checkpointing( - self.CheckpointOnceModule(), - process_group=process_group, - use_bucket_view=use_bucket_view, - static_graph=static_graph, - find_unused_parameters=True, - ) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - @parametrize("use_reentrant", [True, False]) - def test_ddp_checkpointing_unused_params(self, use_reentrant): - """ - With reentrant autograd checkpointing impl, DDP will fail when there are - unused params in the model and no static graph training. With - non-reentrant checkpointing implementation, this works as expected. - """ - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - for use_bucket_view in (True, False): - err_ctx = ( - suppress() if not use_reentrant else - self.assertRaisesRegex( - RuntimeError, - "Expected to mark a variable ready only once." - ) - ) - with err_ctx: - model = self._test_ddp_checkpointing( - self.CheckpointOnceModule(use_reentrant=use_reentrant), - process_group=process_group, - use_bucket_view=use_bucket_view, - find_unused_parameters=True, - ) - # test passes when static_graph is true - model = self._test_ddp_checkpointing( - self.CheckpointOnceModule(use_reentrant=use_reentrant), - process_group=process_group, - use_bucket_view=use_bucket_view, - find_unused_parameters=True, - static_graph=True, - ) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - @parametrize("use_reentrant", [True, False]) - def test_ddp_checkpointing_twice(self, use_reentrant): - """ - Checkpoitning twice fails for non-static graph with reentrant checkpoint - implementation, succeeds with non-reentrant checkpoint implementation. - """ - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - for use_bucket_view in (True, False): - err_ctx = ( - suppress() if not use_reentrant else - self.assertRaisesRegex( - RuntimeError, - "Expected to mark a variable ready only once." - ) - ) - with err_ctx: - model = self._test_ddp_checkpointing( - self.CheckpointTwiceModule(use_reentrant=use_reentrant), - process_group=process_group, - use_bucket_view=use_bucket_view, - static_graph=False, - ) - - with err_ctx: - model = self._test_ddp_checkpointing( - self.CheckpointTwiceModule(use_reentrant=use_reentrant), - process_group=process_group, - use_bucket_view=use_bucket_view, - static_graph=False, - find_unused_parameters=True, - ) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - @parametrize("use_reentrant", [True, False]) - def test_ddp_checkpointing_twice_static_graph(self, use_reentrant): - """ - Regardless of reentrant or non-reentrant checkpointing impl, - checkpointing twice works with static graph enabled. - """ - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - for use_bucket_view in (True, False): - # Test passes when static_graph=True. 
- model = self._test_ddp_checkpointing( - self.CheckpointTwiceModule(use_reentrant=use_reentrant), - process_group=process_group, - use_bucket_view=use_bucket_view, - static_graph=True, - ) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_ddp_checkpointing_dynamic_module(self): - """ - Dynamic module can be checkpointed, multiple times, with non-reentrant - checkpointing implementation. - """ - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - for use_bucket_view in (True, False): - model = self._test_ddp_checkpointing( - self.DynamicCheckpointTwiceModule(use_reentrant=False), - process_group=process_group, - use_bucket_view=use_bucket_view, - static_graph=False, - find_unused_parameters=True, - # Grads can be none sometimes due to dynamic module not using - # all params. - allow_none_grads=True - ) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_ddp_checkpointing_dynamic_weight_sharing(self): - """ - Dynamic module can be checkpointed multiple times with weight sharing - using non-reentrant checkpointing implementation. - """ - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - for use_bucket_view in (True, False): - model = self._test_ddp_checkpointing( - self.DynamicCheckpointTwiceModuleWeightSharing(use_reentrant=False), - process_group=process_group, - use_bucket_view=use_bucket_view, - static_graph=False, - find_unused_parameters=True, - # Grads can be none sometimes due to dynamic module not using - # all params. - allow_none_grads=True - ) - - # DDP works as expected if there is weight sharing among layers - @requires_nccl() - @skip_if_lt_x_gpu(2) - @parametrize("use_reentrant", [True, False]) - def test_ddp_checkpointing_weight_sharing(self, use_reentrant): - """ - Test that checkpointing with weight sharing works. - """ - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - torch.cuda.set_device(self.rank) - for use_bucket_view, static_graph in product((False, True), (False, True)): - torch.manual_seed(31415) - l1 = nn.Linear(20, 20) - l2 = nn.Linear(20, 20) - l1.weight = l2.weight - model = nn.Sequential(l1, l2) - # TODO: non-reentrant based checkpointing of DDP module with - # static_graph runs into the below issue, see - # https://github.com/pytorch/pytorch/issues/70865 and - # https://github.com/pytorch/pytorch/issues/58111 for details. - err_ctx = ( - self.assertRaisesRegex( - RuntimeError, - "Your training graph has changed in this iteration" - ) if static_graph and not use_reentrant else suppress() - ) - with err_ctx: - self._test_ddp_checkpointing( - model, - process_group=process_group, - use_bucket_view=use_bucket_view, - static_graph=static_graph, - run_checkpoint=True, - use_reentrant=use_reentrant, - ) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_ddp_checkpointing_twice_weight_sharing(self): - """ - Checkpointing should work with static graph in the case of checkpointing - same layer twice and having weights shared acrosss layers. 
- """ - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - torch.cuda.set_device(self.rank) - for use_bucket_view in (True, False): - model = self._test_ddp_checkpointing( - self.CheckpointTwiceModuleWeightSharing(), - process_group=process_group, - use_bucket_view=use_bucket_view, - static_graph=True, - ) class NcclErrorHandlingTest(MultiProcessTestCase): @@ -3053,8 +2714,6 @@ def test_nccl_warn_not_in_group_debug_info(self): def test_nccl_warn_not_in_group_debug_off(self): self._test_warn_not_in_group(backend="nccl") -instantiate_parametrized_tests(DistributedDataParallelTest) - if __name__ == "__main__": assert ( not torch.cuda._initialized diff --git a/test/distributed/test_data_parallel.py b/test/distributed/test_data_parallel.py index 92ce8ccc56e5..c1720344e49d 100644 --- a/test/distributed/test_data_parallel.py +++ b/test/distributed/test_data_parallel.py @@ -383,7 +383,7 @@ def test_data_parallel_sparse(self): self.assertEqual(out.get_device(), dev_id[0]) self.assertEqual(out, expected_out) for expected, param in zip(expected_grads, l.parameters()): - self.assertEqual(param.grad, expected) + self.assertEqual(param.grad.coalesce(), expected.coalesce()) # Check for None device_ids l = l.cuda() diff --git a/test/distributed/test_store.py b/test/distributed/test_store.py index 02484585c68e..bcff510bfe0c 100644 --- a/test/distributed/test_store.py +++ b/test/distributed/test_store.py @@ -1,7 +1,6 @@ # Owner(s): ["oncall: distributed"] import os -import random import sys import tempfile import time @@ -248,7 +247,7 @@ def test_numkeys_delkeys(self): self._test_numkeys_delkeys(self._create_store()) def _create_client(self, index, addr, port, world_size): - client_store = dist.TCPStore(addr, port, world_size, timeout=timedelta(seconds=10)) + client_store = dist.TCPStore(addr, port, world_size=world_size, timeout=timedelta(seconds=10)) self.assertEqual("value".encode(), client_store.get("key")) client_store.set(f"new_key{index}", f"new_value{index}") self.assertEqual(f"next_value{index}".encode(), @@ -259,15 +258,16 @@ def _multi_worker_helper(self, world_size): server_store = create_tcp_store(addr, world_size, wait_for_workers=False) server_store.set("key", "value") port = server_store.port - world_size = random.randint(5, 10) if world_size == -1 else world_size - for i in range(world_size): + + num_indices = world_size if world_size else 1 + for i in range(num_indices): self._create_client(i, addr, port, world_size) def test_multi_worker_with_fixed_world_size(self): self._multi_worker_helper(5) def test_multi_worker_with_nonfixed_world_size(self): - self._multi_worker_helper(-1) + self._multi_worker_helper(None) class PrefixTCPStoreTest(TestCase, StoreTestBase): def setUp(self): @@ -404,6 +404,14 @@ def test_common_errors(self): gen = dist.rendezvous("tcp://127.0.0.1:23456?rank=0") next(gen) + def test_dns_timeout(self): + with self.assertRaisesRegex(TimeoutError, "client socket has timed out after.*dnsnotexist"): + gen = dist.rendezvous( + "tcp://dnsnotexist:23456?world_size=2&rank=0", + timeout=timedelta(seconds=1), + ) + next(gen) + @retry_on_connect_failures def test_nominal(self): url = self.create_tcp_url() diff --git a/test/distributions/test_distributions.py b/test/distributions/test_distributions.py index 1855c8434bec..c8b5551c8937 100644 --- a/test/distributions/test_distributions.py +++ b/test/distributions/test_distributions.py @@ -34,6 +34,7 @@ from collections import namedtuple from itertools 
import product from random import shuffle +from packaging import version import torch @@ -41,7 +42,7 @@ # Distributions tests use double as the default dtype torch.set_default_dtype(torch.double) -from torch._six import inf +from torch._six import inf, nan from torch.testing._internal.common_utils import \ (TestCase, run_tests, set_rng_seed, TEST_WITH_UBSAN, load_tests, gradcheck) @@ -480,27 +481,27 @@ def is_all_nan(tensor): Example(Wishart, [ { 'covariance_matrix': torch.tensor([[2.0, 0.3], [0.3, 0.25]], requires_grad=True), - 'df': torch.tensor([4.], requires_grad=True), + 'df': torch.tensor([3.], requires_grad=True), }, { 'precision_matrix': torch.tensor([[2.0, 0.1, 0.0], [0.1, 0.25, 0.0], [0.0, 0.0, 0.3]], requires_grad=True), - 'df': torch.tensor([2.5, 3], requires_grad=True), + 'df': torch.tensor([5., 4], requires_grad=True), }, { 'scale_tril': torch.tensor([[[2.0, 0.0], [-0.5, 0.25]], [[2.0, 0.0], [0.3, 0.25]], [[5.0, 0.0], [-0.5, 1.5]]], requires_grad=True), - 'df': torch.tensor([5., 3.5, 2], requires_grad=True), + 'df': torch.tensor([5., 3.5, 3], requires_grad=True), }, { 'covariance_matrix': torch.tensor([[5.0, -0.5], [-0.5, 1.5]]), - 'df': torch.tensor([2.0]), + 'df': torch.tensor([3.0]), }, { 'covariance_matrix': torch.tensor([[5.0, -0.5], [-0.5, 1.5]]), - 'df': 2.0, + 'df': 3.0, }, ]), Example(MixtureSameFamily, [ @@ -866,9 +867,15 @@ def _check_sampler_discrete(self, torch_dist, ref_dist, message, torch_samples = torch_samples.cpu().numpy() unique, counts = np.unique(torch_samples, return_counts=True) pmf = ref_dist.pmf(unique) + pmf = pmf / pmf.sum() # renormalize to 1.0 for chisq test msk = (counts > 5) & ((pmf * num_samples) > 5) self.assertGreater(pmf[msk].sum(), 0.9, "Distribution is too sparse for test; try increasing num_samples") - chisq, p = scipy.stats.chisquare(counts[msk], pmf[msk] * num_samples) + # Add a remainder bucket that combines counts for all values + # below threshold, if such values exist (i.e. mask has False entries). 
+ if not msk.all(): + counts = np.concatenate([counts[msk], np.sum(counts[~msk], keepdims=True)]) + pmf = np.concatenate([pmf[msk], np.sum(pmf[~msk], keepdims=True)]) + chisq, p = scipy.stats.chisquare(counts, pmf * num_samples) self.assertGreater(p, failure_rate, message) def _check_enumerate_support(self, dist, examples): @@ -2214,39 +2221,42 @@ def test_multivariate_normal_moments(self): # We applied same tests in Multivariate Normal distribution for Wishart distribution def test_wishart_shape(self): - df = (torch.rand(5, requires_grad=True) + 1) * 10 - df_no_batch = (torch.rand([], requires_grad=True) + 1) * 10 - df_multi_batch = (torch.rand(6, 5, requires_grad=True) + 1) * 10 + set_rng_seed(0) # see Note [Randomized statistical tests] + ndim = 3 + + df = torch.rand(5, requires_grad=True) + ndim + df_no_batch = torch.rand([], requires_grad=True) + ndim + df_multi_batch = torch.rand(6, 5, requires_grad=True) + ndim # construct PSD covariance - tmp = torch.randn(3, 10) + tmp = torch.randn(ndim, 10) cov = (torch.matmul(tmp, tmp.t()) / tmp.size(-1)).requires_grad_() prec = cov.inverse().requires_grad_() scale_tril = torch.linalg.cholesky(cov).requires_grad_() # construct batch of PSD covariances - tmp = torch.randn(6, 5, 3, 10) + tmp = torch.randn(6, 5, ndim, 10) cov_batched = (tmp.unsqueeze(-2) * tmp.unsqueeze(-3)).mean(-1).requires_grad_() prec_batched = cov_batched.inverse() scale_tril_batched = torch.linalg.cholesky(cov_batched) # ensure that sample, batch, event shapes all handled correctly - self.assertEqual(Wishart(df, cov).sample().size(), (5, 3, 3)) - self.assertEqual(Wishart(df_no_batch, cov).sample().size(), (3, 3)) - self.assertEqual(Wishart(df_multi_batch, cov).sample().size(), (6, 5, 3, 3)) - self.assertEqual(Wishart(df, cov).sample((2,)).size(), (2, 5, 3, 3)) - self.assertEqual(Wishart(df_no_batch, cov).sample((2,)).size(), (2, 3, 3)) - self.assertEqual(Wishart(df_multi_batch, cov).sample((2,)).size(), (2, 6, 5, 3, 3)) - self.assertEqual(Wishart(df, cov).sample((2, 7)).size(), (2, 7, 5, 3, 3)) - self.assertEqual(Wishart(df_no_batch, cov).sample((2, 7)).size(), (2, 7, 3, 3)) - self.assertEqual(Wishart(df_multi_batch, cov).sample((2, 7)).size(), (2, 7, 6, 5, 3, 3)) - self.assertEqual(Wishart(df, cov_batched).sample((2, 7)).size(), (2, 7, 6, 5, 3, 3)) - self.assertEqual(Wishart(df_no_batch, cov_batched).sample((2, 7)).size(), (2, 7, 6, 5, 3, 3)) - self.assertEqual(Wishart(df_multi_batch, cov_batched).sample((2, 7)).size(), (2, 7, 6, 5, 3, 3)) - self.assertEqual(Wishart(df, precision_matrix=prec).sample((2, 7)).size(), (2, 7, 5, 3, 3)) - self.assertEqual(Wishart(df, precision_matrix=prec_batched).sample((2, 7)).size(), (2, 7, 6, 5, 3, 3)) - self.assertEqual(Wishart(df, scale_tril=scale_tril).sample((2, 7)).size(), (2, 7, 5, 3, 3)) - self.assertEqual(Wishart(df, scale_tril=scale_tril_batched).sample((2, 7)).size(), (2, 7, 6, 5, 3, 3)) + self.assertEqual(Wishart(df, cov).sample().size(), (5, ndim, ndim)) + self.assertEqual(Wishart(df_no_batch, cov).sample().size(), (ndim, ndim)) + self.assertEqual(Wishart(df_multi_batch, cov).sample().size(), (6, 5, ndim, ndim)) + self.assertEqual(Wishart(df, cov).sample((2,)).size(), (2, 5, ndim, ndim)) + self.assertEqual(Wishart(df_no_batch, cov).sample((2,)).size(), (2, ndim, ndim)) + self.assertEqual(Wishart(df_multi_batch, cov).sample((2,)).size(), (2, 6, 5, ndim, ndim)) + self.assertEqual(Wishart(df, cov).sample((2, 7)).size(), (2, 7, 5, ndim, ndim)) + self.assertEqual(Wishart(df_no_batch, cov).sample((2, 7)).size(), (2, 7, ndim, ndim)) + 
self.assertEqual(Wishart(df_multi_batch, cov).sample((2, 7)).size(), (2, 7, 6, 5, ndim, ndim)) + self.assertEqual(Wishart(df, cov_batched).sample((2, 7)).size(), (2, 7, 6, 5, ndim, ndim)) + self.assertEqual(Wishart(df_no_batch, cov_batched).sample((2, 7)).size(), (2, 7, 6, 5, ndim, ndim)) + self.assertEqual(Wishart(df_multi_batch, cov_batched).sample((2, 7)).size(), (2, 7, 6, 5, ndim, ndim)) + self.assertEqual(Wishart(df, precision_matrix=prec).sample((2, 7)).size(), (2, 7, 5, ndim, ndim)) + self.assertEqual(Wishart(df, precision_matrix=prec_batched).sample((2, 7)).size(), (2, 7, 6, 5, ndim, ndim)) + self.assertEqual(Wishart(df, scale_tril=scale_tril).sample((2, 7)).size(), (2, 7, 5, ndim, ndim)) + self.assertEqual(Wishart(df, scale_tril=scale_tril_batched).sample((2, 7)).size(), (2, 7, 6, 5, ndim, ndim)) # check gradients # Modified and applied the same tests for multivariate_normal @@ -2272,14 +2282,21 @@ def gradcheck_func(samples, nu, sigma, prec, scale_tril): wishart_log_prob_gradcheck(df_no_batch, None, None, scale_tril_batched) def test_wishart_stable_with_precision_matrix(self): - x = torch.randn(10) + set_rng_seed(0) # see Note [Randomized statistical tests] + ndim = 10 + x = torch.randn(ndim) P = torch.exp(-(x - x.unsqueeze(-1)) ** 2) # RBF kernel - Wishart(torch.tensor(10), precision_matrix=P) + Wishart(torch.tensor(ndim), precision_matrix=P) @unittest.skipIf(not TEST_NUMPY, "Numpy not found") def test_wishart_log_prob(self): - df = (torch.rand([], requires_grad=True) + 1) * 10 - tmp = torch.randn(3, 10) + set_rng_seed(0) # see Note [Randomized statistical tests] + ndim = 3 + df = torch.rand([], requires_grad=True) + ndim - 1 + # SciPy allowed ndim - 1 < df < ndim for Wishart distribution after version 1.7.0 + if version.parse(scipy.__version__) < version.parse("1.7.0"): + df += 1. + tmp = torch.randn(ndim, 10) cov = (torch.matmul(tmp, tmp.t()) / tmp.size(-1)).requires_grad_() prec = cov.inverse().requires_grad_() scale_tril = torch.linalg.cholesky(cov).requires_grad_() @@ -2291,7 +2308,7 @@ def test_wishart_log_prob(self): dist3 = Wishart(df, scale_tril=scale_tril) ref_dist = scipy.stats.wishart(df.item(), cov.detach().numpy()) - x = dist1.sample((10,)) + x = dist1.sample((1000,)) expected = ref_dist.logpdf(x.transpose(0, 2).numpy()) self.assertEqual(0.0, np.mean((dist1.log_prob(x).detach().numpy() - expected)**2), atol=1e-3, rtol=0) @@ -2299,14 +2316,17 @@ def test_wishart_log_prob(self): self.assertEqual(0.0, np.mean((dist3.log_prob(x).detach().numpy() - expected)**2), atol=1e-3, rtol=0) # Double-check that batched versions behave the same as unbatched - df = (torch.rand(5, requires_grad=True) + 1) * 3 - tmp = torch.randn(5, 3, 10) + df = torch.rand(5, requires_grad=True) + ndim - 1 + # SciPy allowed ndim - 1 < df < ndim for Wishart distribution after version 1.7.0 + if version.parse(scipy.__version__) < version.parse("1.7.0"): + df += 1. 
+ tmp = torch.randn(5, ndim, 10) cov = (tmp.unsqueeze(-2) * tmp.unsqueeze(-3)).mean(-1).requires_grad_() dist_batched = Wishart(df, cov) dist_unbatched = [Wishart(df[i], cov[i]) for i in range(df.size(0))] - x = dist_batched.sample((10,)) + x = dist_batched.sample((1000,)) batched_prob = dist_batched.log_prob(x) unbatched_prob = torch.stack([dist_unbatched[i].log_prob(x[:, i]) for i in range(5)]).t() @@ -2316,28 +2336,36 @@ def test_wishart_log_prob(self): @unittest.skipIf(not TEST_NUMPY, "NumPy not found") def test_wishart_sample(self): set_rng_seed(0) # see Note [Randomized statistical tests] - df = (torch.rand([], requires_grad=True) + 1) * 3 - tmp = torch.randn(3, 10) + ndim = 3 + df = torch.rand([], requires_grad=True) + ndim - 1 + # SciPy allowed ndim -1 < df < ndim for Wishar distribution after version 1.7.0 + if version.parse(scipy.__version__) < version.parse("1.7.0"): + df += 1. + tmp = torch.randn(ndim, 10) cov = (torch.matmul(tmp, tmp.t()) / tmp.size(-1)).requires_grad_() prec = cov.inverse().requires_grad_() scale_tril = torch.linalg.cholesky(cov).requires_grad_() + ref_dist = scipy.stats.wishart(df.item(), cov.detach().numpy()) + self._check_sampler_sampler(Wishart(df, cov), - scipy.stats.wishart(df.item(), cov.detach().numpy()), + ref_dist, 'Wishart(df={}, covariance_matrix={})'.format(df, cov), multivariate=True) self._check_sampler_sampler(Wishart(df, precision_matrix=prec), - scipy.stats.wishart(df.item(), cov.detach().numpy()), + ref_dist, 'Wishart(df={}, precision_matrix={})'.format(df, prec), multivariate=True) self._check_sampler_sampler(Wishart(df, scale_tril=scale_tril), - scipy.stats.wishart(df.item(), cov.detach().numpy()), + ref_dist, 'Wishart(df={}, scale_tril={})'.format(df, scale_tril), multivariate=True) def test_wishart_properties(self): - df = (torch.rand([]) + 1) * 5 - scale_tril = transform_to(constraints.lower_cholesky)(torch.randn(5, 5)) + set_rng_seed(0) # see Note [Randomized statistical tests] + ndim = 5 + df = torch.rand([]) + ndim - 1 + scale_tril = transform_to(constraints.lower_cholesky)(torch.randn(ndim, ndim)) m = Wishart(df=df, scale_tril=scale_tril) self.assertEqual(m.covariance_matrix, m.scale_tril.mm(m.scale_tril.t())) self.assertEqual(m.covariance_matrix.mm(m.precision_matrix), torch.eye(m.event_shape[0])) @@ -2345,14 +2373,15 @@ def test_wishart_properties(self): def test_wishart_moments(self): set_rng_seed(0) # see Note [Randomized statistical tests] - df = (torch.rand([]) + 1) * 3 - scale_tril = transform_to(constraints.lower_cholesky)(torch.randn(3, 3)) + ndim = 3 + df = torch.rand([]) + ndim - 1 + scale_tril = transform_to(constraints.lower_cholesky)(torch.randn(ndim, ndim)) d = Wishart(df=df, scale_tril=scale_tril) - samples = d.rsample((100000,)) + samples = d.rsample((ndim * ndim * 100000,)) empirical_mean = samples.mean(0) - self.assertEqual(d.mean, empirical_mean, atol=5, rtol=0) + self.assertEqual(d.mean, empirical_mean, atol=0.5, rtol=0) empirical_var = samples.var(0) - self.assertEqual(d.variance, empirical_var, atol=5, rtol=0) + self.assertEqual(d.variance, empirical_var, atol=0.5, rtol=0) def test_exponential(self): rate = torch.randn(5, 5).abs().requires_grad_() @@ -2727,6 +2756,18 @@ def test_dirichlet_sample(self): 'Dirichlet(alpha={})'.format(list(alpha)), multivariate=True) + def test_dirichlet_mode(self): + # Test a few edge cases for the Dirichlet distribution mode. This also covers beta distributions. 
+ concentrations_and_modes = [ + ([2, 2, 1], [.5, .5, 0.]), + ([3, 2, 1], [2 / 3, 1 / 3, 0]), + ([.5, .2, .2], [1., 0., 0.]), + ([1, 1, 1], [nan, nan, nan]), + ] + for concentration, mode in concentrations_and_modes: + dist = Dirichlet(torch.tensor(concentration)) + self.assertEqual(dist.mode, torch.tensor(mode)) + def test_beta_shape(self): con1 = torch.randn(2, 3).exp().requires_grad_() con0 = torch.randn(2, 3).exp().requires_grad_() @@ -2922,6 +2963,14 @@ def test_cdf_icdf_inverse(self): 'icdf(cdf(x)) = {}'.format(actual), ])) + @unittest.skipIf(not TEST_NUMPY, "NumPy not found") + def test_gamma_log_prob_at_boundary(self): + for concentration, log_prob in [(.5, inf), (1, 0), (2, -inf)]: + dist = Gamma(concentration, 1) + scipy_dist = scipy.stats.gamma(concentration) + self.assertAlmostEqual(dist.log_prob(0), log_prob) + self.assertAlmostEqual(dist.log_prob(0), scipy_dist.logpdf(0)) + def test_cdf_log_prob(self): # Tests if the differentiation of the CDF gives the PDF at a given value for Dist, params in EXAMPLES: @@ -3105,18 +3154,88 @@ def test_invalid_parameter_broadcasting(self): 'alpha': torch.tensor([1, 1, 1]) }), (StudentT, { - 'df': torch.tensor([1, 1]), - 'scale': torch.tensor([1, 1, 1]) + 'df': torch.tensor([1., 1.]), + 'scale': torch.tensor([1., 1., 1.]) }), (StudentT, { - 'df': torch.tensor([1, 1]), - 'loc': torch.tensor([1, 1, 1]) + 'df': torch.tensor([1., 1.]), + 'loc': torch.tensor([1., 1., 1.]) }) ] for dist, kwargs in invalid_examples: self.assertRaises(RuntimeError, dist, **kwargs) + def _test_discrete_distribution_mode(self, dist, sanitized_mode, batch_isfinite): + # We cannot easily check the mode for discrete distributions, but we can look left and right + # to ensure the log probability is smaller than at the mode. + for step in [-1, 1]: + log_prob_mode = dist.log_prob(sanitized_mode) + if isinstance(dist, OneHotCategorical): + idx = (dist._categorical.mode + 1) % dist.probs.shape[-1] + other = torch.nn.functional.one_hot(idx, num_classes=dist.probs.shape[-1]).to(dist.mode) + else: + other = dist.mode + step + mask = batch_isfinite & dist.support.check(other) + self.assertTrue(mask.any() or dist.mode.unique().numel() == 1) + # Add a dimension to the right if the event shape is not a scalar, e.g. OneHotCategorical. + other = torch.where(mask[..., None] if mask.ndim < other.ndim else mask, other, dist.sample()) + log_prob_other = dist.log_prob(other) + delta = log_prob_mode - log_prob_other + self.assertTrue((-1e-12 < delta[mask].detach()).all()) # Allow up to 1e-12 rounding error. + + def _test_continuous_distribution_mode(self, dist, sanitized_mode, batch_isfinite): + if isinstance(dist, Wishart): + return + # We perturb the mode in the unconstrained space and expect the log probability to decrease. + num_points = 10 + transform = transform_to(dist.support) + unconstrained_mode = transform.inv(sanitized_mode) + perturbation = 1e-5 * (torch.rand((num_points,) + unconstrained_mode.shape) - 0.5) + perturbed_mode = transform(perturbation + unconstrained_mode) + log_prob_mode = dist.log_prob(sanitized_mode) + log_prob_other = dist.log_prob(perturbed_mode) + delta = log_prob_mode - log_prob_other + + # We pass the test with a small tolerance to allow for rounding and manually set the + # difference to zero if both log probs are infinite with the same sign. + both_infinite_with_same_sign = (log_prob_mode == log_prob_other) & (log_prob_mode.abs() == inf) + delta[both_infinite_with_same_sign] = 0. 
+ ordering = (delta > -1e-12).all(axis=0) + self.assertTrue(ordering[batch_isfinite].all()) + + def test_mode(self): + discrete_distributions = ( + Bernoulli, Binomial, Categorical, Geometric, NegativeBinomial, OneHotCategorical, Poisson, + ) + no_mode_available = ( + ContinuousBernoulli, LKJCholesky, LogisticNormal, MixtureSameFamily, Multinomial, + RelaxedBernoulli, RelaxedOneHotCategorical, + ) + + for dist_cls, params in EXAMPLES: + for param in params: + dist = dist_cls(**param) + if isinstance(dist, no_mode_available) or type(dist) is TransformedDistribution: + with self.assertRaises(NotImplementedError): + dist.mode + continue + + # Check that either all or no elements in the event shape are nan: the mode cannot be + # defined for part of an event. + isfinite = dist.mode.isfinite().reshape(dist.batch_shape + (dist.event_shape.numel(),)) + batch_isfinite = isfinite.all(axis=-1) + self.assertTrue((batch_isfinite | ~isfinite.any(axis=-1)).all()) + + # We sanitize undefined modes by sampling from the distribution. + sanitized_mode = torch.where(~dist.mode.isnan(), dist.mode, dist.sample()) + if isinstance(dist, discrete_distributions): + self._test_discrete_distribution_mode(dist, sanitized_mode, batch_isfinite) + else: + self._test_continuous_distribution_mode(dist, sanitized_mode, batch_isfinite) + + self.assertFalse(dist.log_prob(sanitized_mode).isnan().any()) + # These tests are only needed for a few distributions that implement custom # reparameterized gradients. Most .rsample() implementations simply rely on @@ -4617,8 +4736,16 @@ def setUp(self): scipy.stats.weibull_min(c=positive_var2[0], scale=positive_var[0]) ), ( - Wishart(20 + positive_var[0], cov_tensor), # scipy var for Wishart only supports scalars - scipy.stats.wishart(20 + positive_var[0].item(), cov_tensor), + # scipy var for Wishart only supports scalars + # SciPy allowed ndim -1 < df < ndim for Wishar distribution after version 1.7.0 + Wishart( + (20 if version.parse(scipy.__version__) < version.parse("1.7.0") else 19) + positive_var[0], + cov_tensor, + ), + scipy.stats.wishart( + (20 if version.parse(scipy.__version__) < version.parse("1.7.0") else 19) + positive_var[0].item(), + cov_tensor, + ), ), ] @@ -4894,7 +5021,7 @@ def _examples(self): def _perturb_tensor(self, value, constraint): if isinstance(constraint, constraints._IntegerGreaterThan): return value + 1 - if isinstance(constraint, constraints._PositiveDefinite): + if isinstance(constraint, constraints._PositiveDefinite) or isinstance(constraint, constraints._PositiveSemidefinite): return value + torch.eye(value.shape[-1]) if value.dtype in [torch.float, torch.double]: transform = transform_to(constraint) diff --git a/test/distributions/test_transforms.py b/test/distributions/test_transforms.py index 40f636c53f7e..da645e0e5036 100644 --- a/test/distributions/test_transforms.py +++ b/test/distributions/test_transforms.py @@ -12,7 +12,7 @@ ExpTransform, IndependentTransform, LowerCholeskyTransform, PowerTransform, ReshapeTransform, SigmoidTransform, TanhTransform, - SoftmaxTransform, StickBreakingTransform, + SoftmaxTransform, SoftplusTransform, StickBreakingTransform, identity_transform, Transform, _InverseTransform) from torch.distributions.utils import tril_matrix_to_vec, vec_to_tril_matrix @@ -38,6 +38,7 @@ def get_transforms(cache_size): torch.randn(4, 5), cache_size=cache_size), SoftmaxTransform(cache_size=cache_size), + SoftplusTransform(cache_size=cache_size), StickBreakingTransform(cache_size=cache_size), 
LowerCholeskyTransform(cache_size=cache_size), CorrCholeskyTransform(cache_size=cache_size), diff --git a/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect b/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect index 32b8be2f5cd7..f01221172b70 100644 --- a/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect +++ b/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect @@ -1,6 +1,6 @@ torch.fx._symbolic_trace.ProxyableClassMeta [] torch.fx._symbolic_trace.Tracer ['call_module', 'create_arg', 'create_args_for_root', 'is_leaf_module', 'path_of_module', 'trace'] -torch.fx.graph.Graph ['call_function', 'call_method', 'call_module', 'create_node', 'eliminate_dead_code', 'erase_node', 'flatten_inps', 'get_attr', 'graph_copy', 'inserting_after', 'inserting_before', 'lint', 'node_copy', 'nodes', 'on_generate_code', 'output', 'owning_module', 'placeholder', 'print_tabular', 'python_code', 'unflatten_outs'] +torch.fx.graph.Graph ['call_function', 'call_method', 'call_module', 'create_node', 'eliminate_dead_code', 'erase_node', 'get_attr', 'graph_copy', 'inserting_after', 'inserting_before', 'lint', 'node_copy', 'nodes', 'on_generate_code', 'output', 'owning_module', 'placeholder', 'print_tabular', 'process_inputs', 'process_outputs', 'python_code', 'set_codegen'] torch.fx.graph.PythonCode [] torch.fx.graph_module.GraphModule ['add_submodule', 'code', 'delete_all_unused_submodules', 'delete_submodule', 'graph', 'recompile', 'to_folder'] torch.fx.immutable_collections.immutable_dict ['clear', 'pop', 'popitem', 'update'] @@ -15,5 +15,5 @@ torch.fx.proxy.Attribute ['node'] torch.fx.proxy.GraphAppendingTracer [] torch.fx.proxy.Proxy ['keys'] torch.fx.proxy.TraceError [] -torch.fx.proxy.TracerBase ['check_mutable_operations', 'create_arg', 'create_node', 'create_proxy', 'iter', 'keys', 'proxy', 'record_stack_traces', 'to_bool', 'trace_asserts'] +torch.fx.proxy.TracerBase ['check_mutable_operations', 'create_arg', 'create_node', 'create_proxy', 'iter', 'keys', 'proxy', 'proxy_buffer_attributes', 'record_stack_traces', 'to_bool', 'trace_asserts', 'traced_func_name'] torch.fx.subgraph_rewriter.Match ['anchor', 'nodes_map'] \ No newline at end of file diff --git a/test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect b/test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect index a7a2a37d98d0..bd8c0e63a52c 100644 --- a/test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect +++ b/test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect @@ -6,7 +6,7 @@ torch.fx._symbolic_trace.Tracer.path_of_module(self, mod: torch.nn.modules.modul torch.fx._symbolic_trace.Tracer.trace(self, root: Union[torch.nn.modules.module.Module, Callable[..., Any]], concrete_args: Optional[Dict[str, Any]] = None) -> torch.fx.graph.Graph torch.fx._symbolic_trace.symbolic_trace(root: Union[torch.nn.modules.module.Module, Callable[..., Any]], concrete_args: Optional[Dict[str, Any]] = None) -> torch.fx.graph_module.GraphModule torch.fx._symbolic_trace.wrap(fn_or_name: Union[str, Callable]) -torch.fx.graph.Graph.__init__(self, owning_module: Optional[GraphModule] = None, tracer_cls: Optional[Type[Tracer]] = None) 
+torch.fx.graph.Graph.__init__(self, owning_module: Optional[GraphModule] = None, tracer_cls: Optional[Type[Tracer]] = None, tracer_extras: Optional[Dict[str, Any]] = None) torch.fx.graph.Graph.call_function(self, the_function: Callable[..., Any], args: Optional[Tuple[Argument, ...]] = None, kwargs: Optional[Dict[str, Argument]] = None, type_expr: Optional[Any] = None) -> torch.fx.node.Node torch.fx.graph.Graph.call_method(self, method_name: str, args: Optional[Tuple[Argument, ...]] = None, kwargs: Optional[Dict[str, Argument]] = None, type_expr: Optional[Any] = None) -> torch.fx.node.Node torch.fx.graph.Graph.call_module(self, module_name: str, args: Optional[Tuple[Argument, ...]] = None, kwargs: Optional[Dict[str, Argument]] = None, type_expr: Optional[Any] = None) -> torch.fx.node.Node @@ -41,7 +41,7 @@ torch.fx.interpreter.Interpreter.get_attr(self, target: 'Target', args: Tuple[to torch.fx.interpreter.Interpreter.map_nodes_to_values(self, args: torch.fx.node.Argument, n: torch.fx.node.Node) -> torch.fx.node.Argument torch.fx.interpreter.Interpreter.output(self, target: 'Target', args: Tuple[torch.fx.node.Argument, ...], kwargs: Dict[str, Any]) -> Any torch.fx.interpreter.Interpreter.placeholder(self, target: 'Target', args: Tuple[torch.fx.node.Argument, ...], kwargs: Dict[str, Any]) -> Any -torch.fx.interpreter.Interpreter.run(self, *args, initial_env: Optional[Dict[torch.fx.node.Node, Any]] = None) -> Any +torch.fx.interpreter.Interpreter.run(self, *args, initial_env: Optional[Dict[torch.fx.node.Node, Any]] = None, enable_io_processing: bool = True) -> Any torch.fx.interpreter.Interpreter.run_node(self, n: torch.fx.node.Node) -> Any torch.fx.interpreter.Transformer.__init__(self, module) torch.fx.interpreter.Transformer.call_function(self, target: 'Target', args: Tuple[torch.fx.node.Argument, ...], kwargs: Dict[str, Any]) -> Any @@ -53,13 +53,13 @@ torch.fx.node.Node.__init__(self, graph: 'Graph', name: str, op: str, target: 'T torch.fx.node.Node.append(self, x: 'Node') -> None torch.fx.node.Node.format_node(self, placeholder_names: Optional[List[str]] = None, maybe_return_typename: Optional[List[str]] = None) -> Optional[str] torch.fx.node.Node.prepend(self, x: 'Node') -> None -torch.fx.node.Node.replace_all_uses_with(self, replace_with: 'Node') -> List[Node] +torch.fx.node.Node.replace_all_uses_with(self, replace_with: 'Node', delete_user_cb: Callable[[Node], bool] = >) -> List[Node] torch.fx.node.Node.replace_input_with(self, old_input: 'Node', new_input: 'Node') torch.fx.node.Node.update_arg(self, idx: int, arg: torch.fx.node.Argument) -> None torch.fx.node.Node.update_kwarg(self, key: str, arg: torch.fx.node.Argument) -> None torch.fx.node.map_aggregate(a: torch.fx.node.Argument, fn: Callable[[torch.fx.node.Argument], torch.fx.node.Argument]) -> torch.fx.node.Argument torch.fx.node.map_arg(a: torch.fx.node.Argument, fn: Callable[[torch.fx.node.Node], torch.fx.node.Argument]) -> torch.fx.node.Argument -torch.fx.passes.split_module.split_module(m: torch.fx.graph_module.GraphModule, root_m: torch.nn.modules.module.Module, split_callback: Callable[[torch.fx.node.Node], int]) +torch.fx.passes.split_module.split_module(m: torch.fx.graph_module.GraphModule, root_m: torch.nn.modules.module.Module, split_callback: Callable[[torch.fx.node.Node], int], qualname_map: Optional[Dict[str, str]] = None) torch.fx.proxy.Attribute.__init__(self, root: torch.fx.proxy.Proxy, attr: str) torch.fx.proxy.Proxy.__init__(self, node: torch.fx.node.Node, tracer: 'Optional[TracerBase]' = None) 
torch.fx.proxy.Proxy.keys(self) diff --git a/test/expect/TestPytorchExportModes.test_aten_fallback.expect b/test/expect/TestPytorchExportModes.test_aten_fallback.expect index 41059587af0b..83c481fd7e9b 100644 --- a/test/expect/TestPytorchExportModes.test_aten_fallback.expect +++ b/test/expect/TestPytorchExportModes.test_aten_fallback.expect @@ -11,7 +11,7 @@ ModelProto { nodes: [ Node {type: "Add", inputs: [0,1], outputs: [2], attributes: []}, Node {type: "Constant", inputs: [], outputs: [3], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, - Node {type: "ATen", inputs: [2,3], outputs: [4,5], attributes: [{ name: 'operator', type: string, value: 'qr'}]} + Node {type: "ATen", domain: "org.pytorch.aten", inputs: [2,3], outputs: [4,5], attributes: [{ name: 'operator', type: string, value: 'qr'}, { name: 'overload_name', type: string, value: ''}]} ] } opset_import: [OperatorSetIdProto { domain: }OperatorSetIdProto { domain: org.pytorch.aten}], diff --git a/test/expect/TestPytorchExportModes.test_onnx_aten.expect b/test/expect/TestPytorchExportModes.test_onnx_aten.expect index 22f1c57f9570..3c2960f91f96 100644 --- a/test/expect/TestPytorchExportModes.test_onnx_aten.expect +++ b/test/expect/TestPytorchExportModes.test_onnx_aten.expect @@ -9,7 +9,7 @@ ModelProto { outputs: [{name: "2", type:Tensor dims: 3 4}] initializers: [] nodes: [ - Node {type: "ATen", inputs: [0,1], outputs: [2], attributes: [{ name: 'operator', type: string, value: 'fmod'}]} + Node {type: "ATen", domain: "org.pytorch.aten", inputs: [0,1], outputs: [2], attributes: [{ name: 'operator', type: string, value: 'fmod'}, { name: 'overload_name', type: string, value: ''}]} ] } opset_import: [OperatorSetIdProto { domain: }OperatorSetIdProto { domain: org.pytorch.aten}], diff --git a/test/expect/TestScript.test_listconstruct_erasure.expect b/test/expect/TestScript.test_listconstruct_erasure.expect index 0f7d470b0709..8172b3fe0c76 100644 --- a/test/expect/TestScript.test_listconstruct_erasure.expect +++ b/test/expect/TestScript.test_listconstruct_erasure.expect @@ -13,7 +13,7 @@ ModelProto { Node {type: "Less", inputs: [0,1], outputs: [2], attributes: []}, Node {type: "Cast", inputs: [2], outputs: [3], attributes: [{ name: 'to', type: int, value: 2}]}, Node {type: "Cast", inputs: [3], outputs: [4], attributes: [{ name: 'to', type: int, value: 9}]}, - Node {type: "ATen", inputs: [0,4], outputs: [5], attributes: [{ name: 'operator', type: string, value: 'index'}]} + Node {type: "ATen", domain: "org.pytorch.aten", inputs: [0,4], outputs: [5], attributes: [{ name: 'operator', type: string, value: 'index'}, { name: 'overload_name', type: string, value: ''}]} ] } opset_import: [OperatorSetIdProto { domain: }OperatorSetIdProto { domain: org.pytorch.aten}], diff --git a/test/expect/TestSparseCSRCPU.test_sparse_csr_print_cpu.expect b/test/expect/TestSparseCSRCPU.test_sparse_csr_print_cpu.expect deleted file mode 100644 index a30958d09d97..000000000000 --- a/test/expect/TestSparseCSRCPU.test_sparse_csr_print_cpu.expect +++ /dev/null @@ -1,176 +0,0 @@ -# shape: torch.Size([10, 10]) -# nnz: 10 -# crow_indices shape: torch.Size([11]) -# col_indices shape: torch.Size([10]) -# values_shape: torch.Size([10]) -########## torch.float32/torch.int32 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, - layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], dtype=torch.int32) -# _col_indices -tensor([0, 1, 0, 1], 
dtype=torch.int32) -# _values -tensor([1., 2., 3., 4.]) - -########## torch.float64/torch.int32 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, - dtype=torch.float64, layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], dtype=torch.int32) -# _col_indices -tensor([0, 1, 0, 1], dtype=torch.int32) -# _values -tensor([1., 2., 3., 4.], dtype=torch.float64) - - -########## torch.float32/torch.int64 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, - layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4]) -# _col_indices -tensor([0, 1, 0, 1]) -# _values -tensor([1., 2., 3., 4.]) - -########## torch.float64/torch.int64 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, - dtype=torch.float64, layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4]) -# _col_indices -tensor([0, 1, 0, 1]) -# _values -tensor([1., 2., 3., 4.], dtype=torch.float64) - - -# shape: torch.Size([100, 10]) -# nnz: 10 -# crow_indices shape: torch.Size([101]) -# col_indices shape: torch.Size([10]) -# values_shape: torch.Size([10]) -########## torch.float32/torch.int32 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, - layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], dtype=torch.int32) -# _col_indices -tensor([0, 1, 0, 1], dtype=torch.int32) -# _values -tensor([1., 2., 3., 4.]) - -########## torch.float64/torch.int32 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, - dtype=torch.float64, layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], dtype=torch.int32) -# _col_indices -tensor([0, 1, 0, 1], dtype=torch.int32) -# _values -tensor([1., 2., 3., 4.], dtype=torch.float64) - - -########## torch.float32/torch.int64 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, - layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4]) -# _col_indices -tensor([0, 1, 0, 1]) -# _values -tensor([1., 2., 3., 4.]) - -########## torch.float64/torch.int64 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, - dtype=torch.float64, layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4]) -# _col_indices -tensor([0, 1, 0, 1]) -# _values -tensor([1., 2., 3., 4.], dtype=torch.float64) - - -# shape: torch.Size([1000, 10]) -# nnz: 10 -# crow_indices shape: torch.Size([1001]) -# col_indices shape: torch.Size([10]) -# values_shape: torch.Size([10]) -########## torch.float32/torch.int32 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, - layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], dtype=torch.int32) -# _col_indices -tensor([0, 1, 0, 1], dtype=torch.int32) -# _values -tensor([1., 2., 3., 4.]) - -########## torch.float64/torch.int32 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - 
values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, - dtype=torch.float64, layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], dtype=torch.int32) -# _col_indices -tensor([0, 1, 0, 1], dtype=torch.int32) -# _values -tensor([1., 2., 3., 4.], dtype=torch.float64) - - -########## torch.float32/torch.int64 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, - layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4]) -# _col_indices -tensor([0, 1, 0, 1]) -# _values -tensor([1., 2., 3., 4.]) - -########## torch.float64/torch.int64 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, - dtype=torch.float64, layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4]) -# _col_indices -tensor([0, 1, 0, 1]) -# _values -tensor([1., 2., 3., 4.], dtype=torch.float64) - diff --git a/test/expect/TestSparseCSRCUDA.test_sparse_csr_print_cuda.expect b/test/expect/TestSparseCSRCUDA.test_sparse_csr_print_cuda.expect deleted file mode 100644 index 551092b4a56e..000000000000 --- a/test/expect/TestSparseCSRCUDA.test_sparse_csr_print_cuda.expect +++ /dev/null @@ -1,176 +0,0 @@ -# shape: torch.Size([10, 10]) -# nnz: 10 -# crow_indices shape: torch.Size([11]) -# col_indices shape: torch.Size([10]) -# values_shape: torch.Size([10]) -########## torch.float32/torch.int32 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, - layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], device='cuda:0', dtype=torch.int32) -# _col_indices -tensor([0, 1, 0, 1], device='cuda:0', dtype=torch.int32) -# _values -tensor([1., 2., 3., 4.], device='cuda:0') - -########## torch.float64/torch.int32 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, - dtype=torch.float64, layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], device='cuda:0', dtype=torch.int32) -# _col_indices -tensor([0, 1, 0, 1], device='cuda:0', dtype=torch.int32) -# _values -tensor([1., 2., 3., 4.], device='cuda:0', dtype=torch.float64) - - -########## torch.float32/torch.int64 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, - layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], device='cuda:0') -# _col_indices -tensor([0, 1, 0, 1], device='cuda:0') -# _values -tensor([1., 2., 3., 4.], device='cuda:0') - -########## torch.float64/torch.int64 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, - dtype=torch.float64, layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], device='cuda:0') -# _col_indices -tensor([0, 1, 0, 1], device='cuda:0') -# _values -tensor([1., 2., 3., 4.], device='cuda:0', dtype=torch.float64) - - -# shape: torch.Size([100, 10]) -# nnz: 10 -# crow_indices shape: torch.Size([101]) -# col_indices shape: torch.Size([10]) -# values_shape: torch.Size([10]) -########## torch.float32/torch.int32 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 
2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, - layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], device='cuda:0', dtype=torch.int32) -# _col_indices -tensor([0, 1, 0, 1], device='cuda:0', dtype=torch.int32) -# _values -tensor([1., 2., 3., 4.], device='cuda:0') - -########## torch.float64/torch.int32 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, - dtype=torch.float64, layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], device='cuda:0', dtype=torch.int32) -# _col_indices -tensor([0, 1, 0, 1], device='cuda:0', dtype=torch.int32) -# _values -tensor([1., 2., 3., 4.], device='cuda:0', dtype=torch.float64) - - -########## torch.float32/torch.int64 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, - layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], device='cuda:0') -# _col_indices -tensor([0, 1, 0, 1], device='cuda:0') -# _values -tensor([1., 2., 3., 4.], device='cuda:0') - -########## torch.float64/torch.int64 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, - dtype=torch.float64, layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], device='cuda:0') -# _col_indices -tensor([0, 1, 0, 1], device='cuda:0') -# _values -tensor([1., 2., 3., 4.], device='cuda:0', dtype=torch.float64) - - -# shape: torch.Size([1000, 10]) -# nnz: 10 -# crow_indices shape: torch.Size([1001]) -# col_indices shape: torch.Size([10]) -# values_shape: torch.Size([10]) -########## torch.float32/torch.int32 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, - layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], device='cuda:0', dtype=torch.int32) -# _col_indices -tensor([0, 1, 0, 1], device='cuda:0', dtype=torch.int32) -# _values -tensor([1., 2., 3., 4.], device='cuda:0') - -########## torch.float64/torch.int32 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, - dtype=torch.float64, layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], device='cuda:0', dtype=torch.int32) -# _col_indices -tensor([0, 1, 0, 1], device='cuda:0', dtype=torch.int32) -# _values -tensor([1., 2., 3., 4.], device='cuda:0', dtype=torch.float64) - - -########## torch.float32/torch.int64 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, - layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], device='cuda:0') -# _col_indices -tensor([0, 1, 0, 1], device='cuda:0') -# _values -tensor([1., 2., 3., 4.], device='cuda:0') - -########## torch.float64/torch.int64 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, - dtype=torch.float64, layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], device='cuda:0') -# _col_indices -tensor([0, 1, 0, 1], device='cuda:0') -# _values -tensor([1., 2., 3., 4.], device='cuda:0', 
dtype=torch.float64) - diff --git a/test/expect/TestSparseCompressedCPU.test_print_SparseBSC_cpu.expect b/test/expect/TestSparseCompressedCPU.test_print_SparseBSC_cpu.expect new file mode 100644 index 000000000000..bcffa8293c93 --- /dev/null +++ b/test/expect/TestSparseCompressedCPU.test_print_SparseBSC_cpu.expect @@ -0,0 +1,907 @@ +########## torch.float32/torch.int32/batch_shape=()/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]), size=(2, 2), nnz=4, + layout=torch.sparse_bsc) +# _ccol_indices +tensor([0, 2, 4], dtype=torch.int32) +# _row_indices +tensor([0, 1, 0, 1], dtype=torch.int32) +# _values +tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]) + +########## torch.float32/torch.int32/batch_shape=()/block_shape=(0, 0) ########## +# sparse tensor +tensor(ccol_indices=tensor([0]), + row_indices=tensor([], size=(0,)), + values=tensor([], size=(1, 0, 0)), size=(0, 0), nnz=0, + layout=torch.sparse_bsc) +# _ccol_indices +tensor([0], dtype=torch.int32) +# _row_indices +tensor([], dtype=torch.int32) +# _values +tensor([], size=(1, 0, 0)) + +########## torch.float32/torch.int32/batch_shape=(2,)/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + row_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]), size=(2, 2, 2), nnz=4, + layout=torch.sparse_bsc) +# _ccol_indices +tensor([[0, 2, 4], + [0, 2, 4]], dtype=torch.int32) +# _row_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], dtype=torch.int32) +# _values +tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]) + +########## torch.float32/torch.int32/batch_shape=(2, 3)/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + row_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]), size=(2, 3, 2, 2), nnz=4, + layout=torch.sparse_bsc) +# _ccol_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], dtype=torch.int32) +# _row_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], dtype=torch.int32) +# _values +tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 
22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]) + + +########## torch.float64/torch.int32/batch_shape=()/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]), size=(2, 2), nnz=4, dtype=torch.float64, + layout=torch.sparse_bsc) +# _ccol_indices +tensor([0, 2, 4], dtype=torch.int32) +# _row_indices +tensor([0, 1, 0, 1], dtype=torch.int32) +# _values +tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=()/block_shape=(0, 0) ########## +# sparse tensor +tensor(ccol_indices=tensor([0]), + row_indices=tensor([], size=(0,)), + values=tensor([], size=(1, 0, 0)), size=(0, 0), nnz=0, + dtype=torch.float64, layout=torch.sparse_bsc) +# _ccol_indices +tensor([0], dtype=torch.int32) +# _row_indices +tensor([], dtype=torch.int32) +# _values +tensor([], size=(1, 0, 0), dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=(2,)/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + row_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]), size=(2, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_bsc) +# _ccol_indices +tensor([[0, 2, 4], + [0, 2, 4]], dtype=torch.int32) +# _row_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], dtype=torch.int32) +# _values +tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=(2, 3)/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + row_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]), size=(2, 3, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_bsc) +# _ccol_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], dtype=torch.int32) +# _row_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], dtype=torch.int32) +# _values +tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]], dtype=torch.float64) + + +########## torch.float32/torch.int64/batch_shape=()/block_shape=(1, 
2) ########## +# sparse tensor +tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]), size=(2, 2), nnz=4, + layout=torch.sparse_bsc) +# _ccol_indices +tensor([0, 2, 4]) +# _row_indices +tensor([0, 1, 0, 1]) +# _values +tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]) + +########## torch.float32/torch.int64/batch_shape=()/block_shape=(0, 0) ########## +# sparse tensor +tensor(ccol_indices=tensor([0]), + row_indices=tensor([], size=(0,)), + values=tensor([], size=(1, 0, 0)), size=(0, 0), nnz=0, + layout=torch.sparse_bsc) +# _ccol_indices +tensor([0]) +# _row_indices +tensor([], dtype=torch.int64) +# _values +tensor([], size=(1, 0, 0)) + +########## torch.float32/torch.int64/batch_shape=(2,)/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + row_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]), size=(2, 2, 2), nnz=4, + layout=torch.sparse_bsc) +# _ccol_indices +tensor([[0, 2, 4], + [0, 2, 4]]) +# _row_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]) +# _values +tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]) + +########## torch.float32/torch.int64/batch_shape=(2, 3)/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + row_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]), size=(2, 3, 2, 2), nnz=4, + layout=torch.sparse_bsc) +# _ccol_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]) +# _row_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]) +# _values +tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]) + + +########## torch.float64/torch.int64/batch_shape=()/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]), size=(2, 2), nnz=4, dtype=torch.float64, + layout=torch.sparse_bsc) +# _ccol_indices +tensor([0, 2, 4]) +# _row_indices +tensor([0, 1, 0, 1]) +# _values +tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 
44.]]], dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=()/block_shape=(0, 0) ########## +# sparse tensor +tensor(ccol_indices=tensor([0]), + row_indices=tensor([], size=(0,)), + values=tensor([], size=(1, 0, 0)), size=(0, 0), nnz=0, + dtype=torch.float64, layout=torch.sparse_bsc) +# _ccol_indices +tensor([0]) +# _row_indices +tensor([], dtype=torch.int64) +# _values +tensor([], size=(1, 0, 0), dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=(2,)/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + row_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]), size=(2, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_bsc) +# _ccol_indices +tensor([[0, 2, 4], + [0, 2, 4]]) +# _row_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]) +# _values +tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=(2, 3)/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + row_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]), size=(2, 3, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_bsc) +# _ccol_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]) +# _row_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]) +# _values +tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]], dtype=torch.float64) + diff --git a/test/expect/TestSparseCompressedCPU.test_print_SparseBSR_cpu.expect b/test/expect/TestSparseCompressedCPU.test_print_SparseBSR_cpu.expect new file mode 100644 index 000000000000..9f74cd7eb53f --- /dev/null +++ b/test/expect/TestSparseCompressedCPU.test_print_SparseBSR_cpu.expect @@ -0,0 +1,907 @@ +########## torch.float32/torch.int32/batch_shape=()/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]), size=(2, 2), nnz=4, + layout=torch.sparse_bsr) +# _crow_indices +tensor([0, 2, 4], dtype=torch.int32) +# _col_indices +tensor([0, 1, 0, 1], dtype=torch.int32) +# 
_values +tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]) + +########## torch.float32/torch.int32/batch_shape=()/block_shape=(0, 0) ########## +# sparse tensor +tensor(crow_indices=tensor([0]), + col_indices=tensor([], size=(0,)), + values=tensor([], size=(1, 0, 0)), size=(0, 0), nnz=0, + layout=torch.sparse_bsr) +# _crow_indices +tensor([0], dtype=torch.int32) +# _col_indices +tensor([], dtype=torch.int32) +# _values +tensor([], size=(1, 0, 0)) + +########## torch.float32/torch.int32/batch_shape=(2,)/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + col_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]), size=(2, 2, 2), nnz=4, + layout=torch.sparse_bsr) +# _crow_indices +tensor([[0, 2, 4], + [0, 2, 4]], dtype=torch.int32) +# _col_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], dtype=torch.int32) +# _values +tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]) + +########## torch.float32/torch.int32/batch_shape=(2, 3)/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + col_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]), size=(2, 3, 2, 2), nnz=4, + layout=torch.sparse_bsr) +# _crow_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], dtype=torch.int32) +# _col_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], dtype=torch.int32) +# _values +tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]) + + +########## torch.float64/torch.int32/batch_shape=()/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]), size=(2, 2), nnz=4, dtype=torch.float64, + layout=torch.sparse_bsr) +# _crow_indices +tensor([0, 2, 4], dtype=torch.int32) +# _col_indices +tensor([0, 1, 0, 1], dtype=torch.int32) +# _values +tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=()/block_shape=(0, 0) ########## +# sparse tensor +tensor(crow_indices=tensor([0]), + 
col_indices=tensor([], size=(0,)), + values=tensor([], size=(1, 0, 0)), size=(0, 0), nnz=0, + dtype=torch.float64, layout=torch.sparse_bsr) +# _crow_indices +tensor([0], dtype=torch.int32) +# _col_indices +tensor([], dtype=torch.int32) +# _values +tensor([], size=(1, 0, 0), dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=(2,)/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + col_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]), size=(2, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_bsr) +# _crow_indices +tensor([[0, 2, 4], + [0, 2, 4]], dtype=torch.int32) +# _col_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], dtype=torch.int32) +# _values +tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=(2, 3)/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + col_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]), size=(2, 3, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_bsr) +# _crow_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], dtype=torch.int32) +# _col_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], dtype=torch.int32) +# _values +tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]], dtype=torch.float64) + + +########## torch.float32/torch.int64/batch_shape=()/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]), size=(2, 2), nnz=4, + layout=torch.sparse_bsr) +# _crow_indices +tensor([0, 2, 4]) +# _col_indices +tensor([0, 1, 0, 1]) +# _values +tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]) + +########## torch.float32/torch.int64/batch_shape=()/block_shape=(0, 0) ########## +# sparse tensor +tensor(crow_indices=tensor([0]), + col_indices=tensor([], size=(0,)), + values=tensor([], size=(1, 0, 0)), size=(0, 0), nnz=0, + layout=torch.sparse_bsr) +# _crow_indices +tensor([0]) +# _col_indices +tensor([], dtype=torch.int64) +# 
_values +tensor([], size=(1, 0, 0)) + +########## torch.float32/torch.int64/batch_shape=(2,)/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + col_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]), size=(2, 2, 2), nnz=4, + layout=torch.sparse_bsr) +# _crow_indices +tensor([[0, 2, 4], + [0, 2, 4]]) +# _col_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]) +# _values +tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]) + +########## torch.float32/torch.int64/batch_shape=(2, 3)/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + col_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]), size=(2, 3, 2, 2), nnz=4, + layout=torch.sparse_bsr) +# _crow_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]) +# _col_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]) +# _values +tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]) + + +########## torch.float64/torch.int64/batch_shape=()/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]), size=(2, 2), nnz=4, dtype=torch.float64, + layout=torch.sparse_bsr) +# _crow_indices +tensor([0, 2, 4]) +# _col_indices +tensor([0, 1, 0, 1]) +# _values +tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=()/block_shape=(0, 0) ########## +# sparse tensor +tensor(crow_indices=tensor([0]), + col_indices=tensor([], size=(0,)), + values=tensor([], size=(1, 0, 0)), size=(0, 0), nnz=0, + dtype=torch.float64, layout=torch.sparse_bsr) +# _crow_indices +tensor([0]) +# _col_indices +tensor([], dtype=torch.int64) +# _values +tensor([], size=(1, 0, 0), dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=(2,)/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + col_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], 
+ + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]), size=(2, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_bsr) +# _crow_indices +tensor([[0, 2, 4], + [0, 2, 4]]) +# _col_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]) +# _values +tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=(2, 3)/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + col_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]), size=(2, 3, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_bsr) +# _crow_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]) +# _col_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]) +# _values +tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]], dtype=torch.float64) + diff --git a/test/expect/TestSparseCompressedCPU.test_print_SparseCSC_cpu.expect b/test/expect/TestSparseCompressedCPU.test_print_SparseCSC_cpu.expect new file mode 100644 index 000000000000..a449883a3fe2 --- /dev/null +++ b/test/expect/TestSparseCompressedCPU.test_print_SparseCSC_cpu.expect @@ -0,0 +1,379 @@ +########## torch.float32/torch.int32/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, + layout=torch.sparse_csc) +# _ccol_indices +tensor([0, 2, 4], dtype=torch.int32) +# _row_indices +tensor([0, 1, 0, 1], dtype=torch.int32) +# _values +tensor([1., 2., 3., 4.]) + +########## torch.float32/torch.int32/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([0]), + row_indices=tensor([], size=(0,)), + values=tensor([], size=(0,)), size=(0, 0), nnz=0, + layout=torch.sparse_csc) +# _ccol_indices +tensor([0], dtype=torch.int32) +# _row_indices +tensor([], dtype=torch.int32) +# _values +tensor([]) + +########## torch.float32/torch.int32/batch_shape=(2,)/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + row_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]), size=(2, 2, 2), nnz=4, + layout=torch.sparse_csc) +# _ccol_indices +tensor([[0, 2, 4], + [0, 2, 4]], dtype=torch.int32) +# _row_indices 
+tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], dtype=torch.int32) +# _values +tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]) + +########## torch.float32/torch.int32/batch_shape=(2, 3)/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + row_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]), size=(2, 3, 2, 2), nnz=4, + layout=torch.sparse_csc) +# _ccol_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], dtype=torch.int32) +# _row_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], dtype=torch.int32) +# _values +tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]) + + +########## torch.float64/torch.int32/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csc) +# _ccol_indices +tensor([0, 2, 4], dtype=torch.int32) +# _row_indices +tensor([0, 1, 0, 1], dtype=torch.int32) +# _values +tensor([1., 2., 3., 4.], dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([0]), + row_indices=tensor([], size=(0,)), + values=tensor([], size=(0,)), size=(0, 0), nnz=0, dtype=torch.float64, + layout=torch.sparse_csc) +# _ccol_indices +tensor([0], dtype=torch.int32) +# _row_indices +tensor([], dtype=torch.int32) +# _values +tensor([], dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=(2,)/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + row_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]), size=(2, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csc) +# _ccol_indices +tensor([[0, 2, 4], + [0, 2, 4]], dtype=torch.int32) +# _row_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], dtype=torch.int32) +# _values +tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]], dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=(2, 3)/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + row_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]), size=(2, 3, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csc) +# _ccol_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], dtype=torch.int32) +# _row_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], dtype=torch.int32) +# _values +tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]], dtype=torch.float64) + + +########## 
torch.float32/torch.int64/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, + layout=torch.sparse_csc) +# _ccol_indices +tensor([0, 2, 4]) +# _row_indices +tensor([0, 1, 0, 1]) +# _values +tensor([1., 2., 3., 4.]) + +########## torch.float32/torch.int64/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([0]), + row_indices=tensor([], size=(0,)), + values=tensor([], size=(0,)), size=(0, 0), nnz=0, + layout=torch.sparse_csc) +# _ccol_indices +tensor([0]) +# _row_indices +tensor([], dtype=torch.int64) +# _values +tensor([]) + +########## torch.float32/torch.int64/batch_shape=(2,)/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + row_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]), size=(2, 2, 2), nnz=4, + layout=torch.sparse_csc) +# _ccol_indices +tensor([[0, 2, 4], + [0, 2, 4]]) +# _row_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]) +# _values +tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]) + +########## torch.float32/torch.int64/batch_shape=(2, 3)/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + row_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]), size=(2, 3, 2, 2), nnz=4, + layout=torch.sparse_csc) +# _ccol_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]) +# _row_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]) +# _values +tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]) + + +########## torch.float64/torch.int64/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csc) +# _ccol_indices +tensor([0, 2, 4]) +# _row_indices +tensor([0, 1, 0, 1]) +# _values +tensor([1., 2., 3., 4.], dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([0]), + row_indices=tensor([], size=(0,)), + values=tensor([], size=(0,)), size=(0, 0), nnz=0, dtype=torch.float64, + layout=torch.sparse_csc) +# _ccol_indices +tensor([0]) +# _row_indices +tensor([], dtype=torch.int64) +# _values +tensor([], dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=(2,)/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + row_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]), size=(2, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csc) +# _ccol_indices +tensor([[0, 2, 4], + [0, 2, 4]]) +# _row_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]) +# _values +tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]], dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=(2, 3)/block_shape=() ########## +# sparse tensor 
+tensor(ccol_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + row_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]), size=(2, 3, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csc) +# _ccol_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]) +# _row_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]) +# _values +tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]], dtype=torch.float64) + diff --git a/test/expect/TestSparseCompressedCPU.test_print_SparseCSR_cpu.expect b/test/expect/TestSparseCompressedCPU.test_print_SparseCSR_cpu.expect new file mode 100644 index 000000000000..02476652e4b7 --- /dev/null +++ b/test/expect/TestSparseCompressedCPU.test_print_SparseCSR_cpu.expect @@ -0,0 +1,379 @@ +########## torch.float32/torch.int32/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, + layout=torch.sparse_csr) +# _crow_indices +tensor([0, 2, 4], dtype=torch.int32) +# _col_indices +tensor([0, 1, 0, 1], dtype=torch.int32) +# _values +tensor([1., 2., 3., 4.]) + +########## torch.float32/torch.int32/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([0]), + col_indices=tensor([], size=(0,)), + values=tensor([], size=(0,)), size=(0, 0), nnz=0, + layout=torch.sparse_csr) +# _crow_indices +tensor([0], dtype=torch.int32) +# _col_indices +tensor([], dtype=torch.int32) +# _values +tensor([]) + +########## torch.float32/torch.int32/batch_shape=(2,)/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + col_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]), size=(2, 2, 2), nnz=4, + layout=torch.sparse_csr) +# _crow_indices +tensor([[0, 2, 4], + [0, 2, 4]], dtype=torch.int32) +# _col_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], dtype=torch.int32) +# _values +tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]) + +########## torch.float32/torch.int32/batch_shape=(2, 3)/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + col_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]), size=(2, 3, 2, 2), nnz=4, + layout=torch.sparse_csr) +# _crow_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], dtype=torch.int32) +# _col_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], dtype=torch.int32) +# _values +tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]) + + +########## torch.float64/torch.int32/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([0, 2, 4]), + 
col_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csr) +# _crow_indices +tensor([0, 2, 4], dtype=torch.int32) +# _col_indices +tensor([0, 1, 0, 1], dtype=torch.int32) +# _values +tensor([1., 2., 3., 4.], dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([0]), + col_indices=tensor([], size=(0,)), + values=tensor([], size=(0,)), size=(0, 0), nnz=0, dtype=torch.float64, + layout=torch.sparse_csr) +# _crow_indices +tensor([0], dtype=torch.int32) +# _col_indices +tensor([], dtype=torch.int32) +# _values +tensor([], dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=(2,)/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + col_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]), size=(2, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csr) +# _crow_indices +tensor([[0, 2, 4], + [0, 2, 4]], dtype=torch.int32) +# _col_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], dtype=torch.int32) +# _values +tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]], dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=(2, 3)/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + col_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]), size=(2, 3, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csr) +# _crow_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], dtype=torch.int32) +# _col_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], dtype=torch.int32) +# _values +tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]], dtype=torch.float64) + + +########## torch.float32/torch.int64/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, + layout=torch.sparse_csr) +# _crow_indices +tensor([0, 2, 4]) +# _col_indices +tensor([0, 1, 0, 1]) +# _values +tensor([1., 2., 3., 4.]) + +########## torch.float32/torch.int64/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([0]), + col_indices=tensor([], size=(0,)), + values=tensor([], size=(0,)), size=(0, 0), nnz=0, + layout=torch.sparse_csr) +# _crow_indices +tensor([0]) +# _col_indices +tensor([], dtype=torch.int64) +# _values +tensor([]) + +########## torch.float32/torch.int64/batch_shape=(2,)/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + col_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]), size=(2, 2, 2), nnz=4, + layout=torch.sparse_csr) +# _crow_indices +tensor([[0, 2, 4], + [0, 2, 4]]) +# _col_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]) +# _values +tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]) + +########## torch.float32/torch.int64/batch_shape=(2, 3)/block_shape=() 
########## +# sparse tensor +tensor(crow_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + col_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]), size=(2, 3, 2, 2), nnz=4, + layout=torch.sparse_csr) +# _crow_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]) +# _col_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]) +# _values +tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]) + + +########## torch.float64/torch.int64/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csr) +# _crow_indices +tensor([0, 2, 4]) +# _col_indices +tensor([0, 1, 0, 1]) +# _values +tensor([1., 2., 3., 4.], dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([0]), + col_indices=tensor([], size=(0,)), + values=tensor([], size=(0,)), size=(0, 0), nnz=0, dtype=torch.float64, + layout=torch.sparse_csr) +# _crow_indices +tensor([0]) +# _col_indices +tensor([], dtype=torch.int64) +# _values +tensor([], dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=(2,)/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + col_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]), size=(2, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csr) +# _crow_indices +tensor([[0, 2, 4], + [0, 2, 4]]) +# _col_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]) +# _values +tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]], dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=(2, 3)/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + col_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]), size=(2, 3, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csr) +# _crow_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]) +# _col_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]) +# _values +tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]], dtype=torch.float64) + diff --git a/test/expect/TestSparseCompressedCUDA.test_print_SparseBSC_cuda.expect b/test/expect/TestSparseCompressedCUDA.test_print_SparseBSC_cuda.expect new file mode 100644 index 000000000000..df75cb3a4f61 --- /dev/null +++ b/test/expect/TestSparseCompressedCUDA.test_print_SparseBSC_cuda.expect @@ -0,0 +1,907 @@ +########## torch.float32/torch.int32/batch_shape=()/block_shape=(1, 2) ########## +# sparse tensor 
+tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]), device='cuda:0', size=(2, 2), nnz=4, + layout=torch.sparse_bsc) +# _ccol_indices +tensor([0, 2, 4], device='cuda:0', dtype=torch.int32) +# _row_indices +tensor([0, 1, 0, 1], device='cuda:0', dtype=torch.int32) +# _values +tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], device='cuda:0') + +########## torch.float32/torch.int32/batch_shape=()/block_shape=(0, 0) ########## +# sparse tensor +tensor(ccol_indices=tensor([0]), + row_indices=tensor([], size=(0,)), + values=tensor([], size=(1, 0, 0)), device='cuda:0', size=(0, 0), nnz=0, + layout=torch.sparse_bsc) +# _ccol_indices +tensor([0], device='cuda:0', dtype=torch.int32) +# _row_indices +tensor([], device='cuda:0', dtype=torch.int32) +# _values +tensor([], device='cuda:0', size=(1, 0, 0)) + +########## torch.float32/torch.int32/batch_shape=(2,)/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + row_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]), device='cuda:0', size=(2, 2, 2), nnz=4, + layout=torch.sparse_bsc) +# _ccol_indices +tensor([[0, 2, 4], + [0, 2, 4]], device='cuda:0', dtype=torch.int32) +# _row_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], device='cuda:0', dtype=torch.int32) +# _values +tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], device='cuda:0') + +########## torch.float32/torch.int32/batch_shape=(2, 3)/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + row_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]), device='cuda:0', size=(2, 3, 2, 2), + nnz=4, layout=torch.sparse_bsc) +# _ccol_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], device='cuda:0', dtype=torch.int32) +# _row_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], device='cuda:0', dtype=torch.int32) +# _values +tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]], device='cuda:0') + + +########## torch.float64/torch.int32/batch_shape=()/block_shape=(1, 2) ########## +# sparse tensor 
+tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]), device='cuda:0', size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_bsc) +# _ccol_indices +tensor([0, 2, 4], device='cuda:0', dtype=torch.int32) +# _row_indices +tensor([0, 1, 0, 1], device='cuda:0', dtype=torch.int32) +# _values +tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=()/block_shape=(0, 0) ########## +# sparse tensor +tensor(ccol_indices=tensor([0]), + row_indices=tensor([], size=(0,)), + values=tensor([], size=(1, 0, 0)), device='cuda:0', size=(0, 0), nnz=0, + dtype=torch.float64, layout=torch.sparse_bsc) +# _ccol_indices +tensor([0], device='cuda:0', dtype=torch.int32) +# _row_indices +tensor([], device='cuda:0', dtype=torch.int32) +# _values +tensor([], device='cuda:0', size=(1, 0, 0), dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=(2,)/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + row_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]), device='cuda:0', size=(2, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_bsc) +# _ccol_indices +tensor([[0, 2, 4], + [0, 2, 4]], device='cuda:0', dtype=torch.int32) +# _row_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], device='cuda:0', dtype=torch.int32) +# _values +tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=(2, 3)/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + row_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]), device='cuda:0', size=(2, 3, 2, 2), + nnz=4, dtype=torch.float64, layout=torch.sparse_bsc) +# _ccol_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], device='cuda:0', dtype=torch.int32) +# _row_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], device='cuda:0', dtype=torch.int32) +# _values +tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 
4., 44.]]]]], device='cuda:0', dtype=torch.float64) + + +########## torch.float32/torch.int64/batch_shape=()/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]), device='cuda:0', size=(2, 2), nnz=4, + layout=torch.sparse_bsc) +# _ccol_indices +tensor([0, 2, 4], device='cuda:0') +# _row_indices +tensor([0, 1, 0, 1], device='cuda:0') +# _values +tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], device='cuda:0') + +########## torch.float32/torch.int64/batch_shape=()/block_shape=(0, 0) ########## +# sparse tensor +tensor(ccol_indices=tensor([0]), + row_indices=tensor([], size=(0,)), + values=tensor([], size=(1, 0, 0)), device='cuda:0', size=(0, 0), nnz=0, + layout=torch.sparse_bsc) +# _ccol_indices +tensor([0], device='cuda:0') +# _row_indices +tensor([], device='cuda:0', dtype=torch.int64) +# _values +tensor([], device='cuda:0', size=(1, 0, 0)) + +########## torch.float32/torch.int64/batch_shape=(2,)/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + row_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]), device='cuda:0', size=(2, 2, 2), nnz=4, + layout=torch.sparse_bsc) +# _ccol_indices +tensor([[0, 2, 4], + [0, 2, 4]], device='cuda:0') +# _row_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], device='cuda:0') +# _values +tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], device='cuda:0') + +########## torch.float32/torch.int64/batch_shape=(2, 3)/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + row_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]), device='cuda:0', size=(2, 3, 2, 2), + nnz=4, layout=torch.sparse_bsc) +# _ccol_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], device='cuda:0') +# _row_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], device='cuda:0') +# _values +tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]], device='cuda:0') + + +########## torch.float64/torch.int64/batch_shape=()/block_shape=(1, 2) ########## +# 
sparse tensor +tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]), device='cuda:0', size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_bsc) +# _ccol_indices +tensor([0, 2, 4], device='cuda:0') +# _row_indices +tensor([0, 1, 0, 1], device='cuda:0') +# _values +tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=()/block_shape=(0, 0) ########## +# sparse tensor +tensor(ccol_indices=tensor([0]), + row_indices=tensor([], size=(0,)), + values=tensor([], size=(1, 0, 0)), device='cuda:0', size=(0, 0), nnz=0, + dtype=torch.float64, layout=torch.sparse_bsc) +# _ccol_indices +tensor([0], device='cuda:0') +# _row_indices +tensor([], device='cuda:0', dtype=torch.int64) +# _values +tensor([], device='cuda:0', size=(1, 0, 0), dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=(2,)/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + row_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]), device='cuda:0', size=(2, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_bsc) +# _ccol_indices +tensor([[0, 2, 4], + [0, 2, 4]], device='cuda:0') +# _row_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], device='cuda:0') +# _values +tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=(2, 3)/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + row_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]), device='cuda:0', size=(2, 3, 2, 2), + nnz=4, dtype=torch.float64, layout=torch.sparse_bsc) +# _ccol_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], device='cuda:0') +# _row_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], device='cuda:0') +# _values +tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]], device='cuda:0', dtype=torch.float64) + diff --git 
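The BSC expect fixture above (and the BSR one that follows) pins down the printed form of the new block-compressed sparse layouts on CUDA. For orientation only, here is a minimal sketch of building such a tensor by hand; it is not part of the patch or of the test that generates these fixtures, it assumes a recent PyTorch where the torch.sparse_bsc_tensor / torch.sparse_bsr_tensor factories and the ccol_indices()/row_indices() accessors are available, and it uses a small shape that satisfies the layout invariants rather than reproducing the fixture values exactly.

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

# A dense 2x4 matrix stored as 1x2 blocks in BSC (compressed-column) form:
# two block columns, each holding blocks for block rows 0 and 1 -> 4 blocks.
ccol_indices = torch.tensor([0, 2, 4], dtype=torch.int32)
row_indices = torch.tensor([0, 1, 0, 1], dtype=torch.int32)
values = torch.tensor([[[1., 11.]], [[2., 22.]], [[3., 33.]], [[4., 44.]]])
bsc = torch.sparse_bsc_tensor(ccol_indices, row_indices, values,
                              size=(2, 4), device=device)
print(bsc)                 # repr in the same style as the "# sparse tensor" blocks
print(bsc.ccol_indices())  # compressed column pointers
print(bsc.row_indices())   # block row indices
print(bsc.values())        # one (1, 2) block per specified entry

# The BSR fixture that follows is the row-compressed mirror image:
# torch.sparse_bsr_tensor(crow_indices, col_indices, values, size=...).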
a/test/expect/TestSparseCompressedCUDA.test_print_SparseBSR_cuda.expect b/test/expect/TestSparseCompressedCUDA.test_print_SparseBSR_cuda.expect new file mode 100644 index 000000000000..5ab909227272 --- /dev/null +++ b/test/expect/TestSparseCompressedCUDA.test_print_SparseBSR_cuda.expect @@ -0,0 +1,907 @@ +########## torch.float32/torch.int32/batch_shape=()/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]), device='cuda:0', size=(2, 2), nnz=4, + layout=torch.sparse_bsr) +# _crow_indices +tensor([0, 2, 4], device='cuda:0', dtype=torch.int32) +# _col_indices +tensor([0, 1, 0, 1], device='cuda:0', dtype=torch.int32) +# _values +tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], device='cuda:0') + +########## torch.float32/torch.int32/batch_shape=()/block_shape=(0, 0) ########## +# sparse tensor +tensor(crow_indices=tensor([0]), + col_indices=tensor([], size=(0,)), + values=tensor([], size=(1, 0, 0)), device='cuda:0', size=(0, 0), nnz=0, + layout=torch.sparse_bsr) +# _crow_indices +tensor([0], device='cuda:0', dtype=torch.int32) +# _col_indices +tensor([], device='cuda:0', dtype=torch.int32) +# _values +tensor([], device='cuda:0', size=(1, 0, 0)) + +########## torch.float32/torch.int32/batch_shape=(2,)/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + col_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]), device='cuda:0', size=(2, 2, 2), nnz=4, + layout=torch.sparse_bsr) +# _crow_indices +tensor([[0, 2, 4], + [0, 2, 4]], device='cuda:0', dtype=torch.int32) +# _col_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], device='cuda:0', dtype=torch.int32) +# _values +tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], device='cuda:0') + +########## torch.float32/torch.int32/batch_shape=(2, 3)/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + col_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]), device='cuda:0', size=(2, 3, 2, 2), + nnz=4, layout=torch.sparse_bsr) +# _crow_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], device='cuda:0', dtype=torch.int32) +# _col_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], device='cuda:0', dtype=torch.int32) +# _values +tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], 
+ + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]], device='cuda:0') + + +########## torch.float64/torch.int32/batch_shape=()/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]), device='cuda:0', size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_bsr) +# _crow_indices +tensor([0, 2, 4], device='cuda:0', dtype=torch.int32) +# _col_indices +tensor([0, 1, 0, 1], device='cuda:0', dtype=torch.int32) +# _values +tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=()/block_shape=(0, 0) ########## +# sparse tensor +tensor(crow_indices=tensor([0]), + col_indices=tensor([], size=(0,)), + values=tensor([], size=(1, 0, 0)), device='cuda:0', size=(0, 0), nnz=0, + dtype=torch.float64, layout=torch.sparse_bsr) +# _crow_indices +tensor([0], device='cuda:0', dtype=torch.int32) +# _col_indices +tensor([], device='cuda:0', dtype=torch.int32) +# _values +tensor([], device='cuda:0', size=(1, 0, 0), dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=(2,)/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + col_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]), device='cuda:0', size=(2, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_bsr) +# _crow_indices +tensor([[0, 2, 4], + [0, 2, 4]], device='cuda:0', dtype=torch.int32) +# _col_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], device='cuda:0', dtype=torch.int32) +# _values +tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=(2, 3)/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + col_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]), device='cuda:0', size=(2, 3, 2, 2), + nnz=4, dtype=torch.float64, layout=torch.sparse_bsr) +# _crow_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], device='cuda:0', dtype=torch.int32) +# _col_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], device='cuda:0', dtype=torch.int32) +# _values +tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + 
+ [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]], device='cuda:0', dtype=torch.float64) + + +########## torch.float32/torch.int64/batch_shape=()/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]), device='cuda:0', size=(2, 2), nnz=4, + layout=torch.sparse_bsr) +# _crow_indices +tensor([0, 2, 4], device='cuda:0') +# _col_indices +tensor([0, 1, 0, 1], device='cuda:0') +# _values +tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], device='cuda:0') + +########## torch.float32/torch.int64/batch_shape=()/block_shape=(0, 0) ########## +# sparse tensor +tensor(crow_indices=tensor([0]), + col_indices=tensor([], size=(0,)), + values=tensor([], size=(1, 0, 0)), device='cuda:0', size=(0, 0), nnz=0, + layout=torch.sparse_bsr) +# _crow_indices +tensor([0], device='cuda:0') +# _col_indices +tensor([], device='cuda:0', dtype=torch.int64) +# _values +tensor([], device='cuda:0', size=(1, 0, 0)) + +########## torch.float32/torch.int64/batch_shape=(2,)/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + col_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]), device='cuda:0', size=(2, 2, 2), nnz=4, + layout=torch.sparse_bsr) +# _crow_indices +tensor([[0, 2, 4], + [0, 2, 4]], device='cuda:0') +# _col_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], device='cuda:0') +# _values +tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], device='cuda:0') + +########## torch.float32/torch.int64/batch_shape=(2, 3)/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + col_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]), device='cuda:0', size=(2, 3, 2, 2), + nnz=4, layout=torch.sparse_bsr) +# _crow_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], device='cuda:0') +# _col_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], device='cuda:0') +# _values +tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 
22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]], device='cuda:0') + + +########## torch.float64/torch.int64/batch_shape=()/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]), device='cuda:0', size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_bsr) +# _crow_indices +tensor([0, 2, 4], device='cuda:0') +# _col_indices +tensor([0, 1, 0, 1], device='cuda:0') +# _values +tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=()/block_shape=(0, 0) ########## +# sparse tensor +tensor(crow_indices=tensor([0]), + col_indices=tensor([], size=(0,)), + values=tensor([], size=(1, 0, 0)), device='cuda:0', size=(0, 0), nnz=0, + dtype=torch.float64, layout=torch.sparse_bsr) +# _crow_indices +tensor([0], device='cuda:0') +# _col_indices +tensor([], device='cuda:0', dtype=torch.int64) +# _values +tensor([], device='cuda:0', size=(1, 0, 0), dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=(2,)/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + col_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]), device='cuda:0', size=(2, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_bsr) +# _crow_indices +tensor([[0, 2, 4], + [0, 2, 4]], device='cuda:0') +# _col_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], device='cuda:0') +# _values +tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=(2, 3)/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + col_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]), device='cuda:0', size=(2, 3, 2, 2), + nnz=4, dtype=torch.float64, layout=torch.sparse_bsr) +# _crow_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], device='cuda:0') +# _col_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], device='cuda:0') +# _values +tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 
22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]], device='cuda:0', dtype=torch.float64) + diff --git a/test/expect/TestSparseCompressedCUDA.test_print_SparseCSC_cuda.expect b/test/expect/TestSparseCompressedCUDA.test_print_SparseCSC_cuda.expect new file mode 100644 index 000000000000..4292bfcd2199 --- /dev/null +++ b/test/expect/TestSparseCompressedCUDA.test_print_SparseCSC_cuda.expect @@ -0,0 +1,379 @@ +########## torch.float32/torch.int32/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, + layout=torch.sparse_csc) +# _ccol_indices +tensor([0, 2, 4], device='cuda:0', dtype=torch.int32) +# _row_indices +tensor([0, 1, 0, 1], device='cuda:0', dtype=torch.int32) +# _values +tensor([1., 2., 3., 4.], device='cuda:0') + +########## torch.float32/torch.int32/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([0]), + row_indices=tensor([], size=(0,)), + values=tensor([], size=(0,)), device='cuda:0', size=(0, 0), nnz=0, + layout=torch.sparse_csc) +# _ccol_indices +tensor([0], device='cuda:0', dtype=torch.int32) +# _row_indices +tensor([], device='cuda:0', dtype=torch.int32) +# _values +tensor([], device='cuda:0') + +########## torch.float32/torch.int32/batch_shape=(2,)/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + row_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]), device='cuda:0', size=(2, 2, 2), + nnz=4, layout=torch.sparse_csc) +# _ccol_indices +tensor([[0, 2, 4], + [0, 2, 4]], device='cuda:0', dtype=torch.int32) +# _row_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], device='cuda:0', dtype=torch.int32) +# _values +tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]], device='cuda:0') + +########## torch.float32/torch.int32/batch_shape=(2, 3)/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + row_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]), device='cuda:0', size=(2, 3, 2, 2), + nnz=4, layout=torch.sparse_csc) +# _ccol_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], device='cuda:0', dtype=torch.int32) +# _row_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], device='cuda:0', dtype=torch.int32) +# _values +tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]], device='cuda:0') + + +########## torch.float64/torch.int32/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csc) +# _ccol_indices +tensor([0, 2, 4], device='cuda:0', dtype=torch.int32) +# _row_indices +tensor([0, 1, 0, 
1], device='cuda:0', dtype=torch.int32) +# _values +tensor([1., 2., 3., 4.], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([0]), + row_indices=tensor([], size=(0,)), + values=tensor([], size=(0,)), device='cuda:0', size=(0, 0), nnz=0, + dtype=torch.float64, layout=torch.sparse_csc) +# _ccol_indices +tensor([0], device='cuda:0', dtype=torch.int32) +# _row_indices +tensor([], device='cuda:0', dtype=torch.int32) +# _values +tensor([], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=(2,)/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + row_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]), device='cuda:0', size=(2, 2, 2), + nnz=4, dtype=torch.float64, layout=torch.sparse_csc) +# _ccol_indices +tensor([[0, 2, 4], + [0, 2, 4]], device='cuda:0', dtype=torch.int32) +# _row_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], device='cuda:0', dtype=torch.int32) +# _values +tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=(2, 3)/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + row_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]), device='cuda:0', size=(2, 3, 2, 2), + nnz=4, dtype=torch.float64, layout=torch.sparse_csc) +# _ccol_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], device='cuda:0', dtype=torch.int32) +# _row_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], device='cuda:0', dtype=torch.int32) +# _values +tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]], device='cuda:0', dtype=torch.float64) + + +########## torch.float32/torch.int64/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, + layout=torch.sparse_csc) +# _ccol_indices +tensor([0, 2, 4], device='cuda:0') +# _row_indices +tensor([0, 1, 0, 1], device='cuda:0') +# _values +tensor([1., 2., 3., 4.], device='cuda:0') + +########## torch.float32/torch.int64/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([0]), + row_indices=tensor([], size=(0,)), + values=tensor([], size=(0,)), device='cuda:0', size=(0, 0), nnz=0, + layout=torch.sparse_csc) +# _ccol_indices +tensor([0], device='cuda:0') +# _row_indices +tensor([], device='cuda:0', dtype=torch.int64) +# _values +tensor([], device='cuda:0') + +########## torch.float32/torch.int64/batch_shape=(2,)/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + row_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]), device='cuda:0', size=(2, 2, 2), + nnz=4, layout=torch.sparse_csc) +# _ccol_indices +tensor([[0, 2, 4], + [0, 2, 4]], device='cuda:0') +# _row_indices 
+tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], device='cuda:0') +# _values +tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]], device='cuda:0') + +########## torch.float32/torch.int64/batch_shape=(2, 3)/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + row_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]), device='cuda:0', size=(2, 3, 2, 2), + nnz=4, layout=torch.sparse_csc) +# _ccol_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], device='cuda:0') +# _row_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], device='cuda:0') +# _values +tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]], device='cuda:0') + + +########## torch.float64/torch.int64/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csc) +# _ccol_indices +tensor([0, 2, 4], device='cuda:0') +# _row_indices +tensor([0, 1, 0, 1], device='cuda:0') +# _values +tensor([1., 2., 3., 4.], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([0]), + row_indices=tensor([], size=(0,)), + values=tensor([], size=(0,)), device='cuda:0', size=(0, 0), nnz=0, + dtype=torch.float64, layout=torch.sparse_csc) +# _ccol_indices +tensor([0], device='cuda:0') +# _row_indices +tensor([], device='cuda:0', dtype=torch.int64) +# _values +tensor([], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=(2,)/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + row_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]), device='cuda:0', size=(2, 2, 2), + nnz=4, dtype=torch.float64, layout=torch.sparse_csc) +# _ccol_indices +tensor([[0, 2, 4], + [0, 2, 4]], device='cuda:0') +# _row_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], device='cuda:0') +# _values +tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=(2, 3)/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + row_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]), device='cuda:0', size=(2, 3, 2, 2), + nnz=4, dtype=torch.float64, layout=torch.sparse_csc) +# _ccol_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], device='cuda:0') +# _row_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], device='cuda:0') +# _values +tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 
2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]], device='cuda:0', dtype=torch.float64) + diff --git a/test/expect/TestSparseCompressedCUDA.test_print_SparseCSR_cuda.expect b/test/expect/TestSparseCompressedCUDA.test_print_SparseCSR_cuda.expect new file mode 100644 index 000000000000..918f2570807f --- /dev/null +++ b/test/expect/TestSparseCompressedCUDA.test_print_SparseCSR_cuda.expect @@ -0,0 +1,379 @@ +########## torch.float32/torch.int32/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, + layout=torch.sparse_csr) +# _crow_indices +tensor([0, 2, 4], device='cuda:0', dtype=torch.int32) +# _col_indices +tensor([0, 1, 0, 1], device='cuda:0', dtype=torch.int32) +# _values +tensor([1., 2., 3., 4.], device='cuda:0') + +########## torch.float32/torch.int32/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([0]), + col_indices=tensor([], size=(0,)), + values=tensor([], size=(0,)), device='cuda:0', size=(0, 0), nnz=0, + layout=torch.sparse_csr) +# _crow_indices +tensor([0], device='cuda:0', dtype=torch.int32) +# _col_indices +tensor([], device='cuda:0', dtype=torch.int32) +# _values +tensor([], device='cuda:0') + +########## torch.float32/torch.int32/batch_shape=(2,)/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + col_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]), device='cuda:0', size=(2, 2, 2), + nnz=4, layout=torch.sparse_csr) +# _crow_indices +tensor([[0, 2, 4], + [0, 2, 4]], device='cuda:0', dtype=torch.int32) +# _col_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], device='cuda:0', dtype=torch.int32) +# _values +tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]], device='cuda:0') + +########## torch.float32/torch.int32/batch_shape=(2, 3)/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + col_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]), device='cuda:0', size=(2, 3, 2, 2), + nnz=4, layout=torch.sparse_csr) +# _crow_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], device='cuda:0', dtype=torch.int32) +# _col_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], device='cuda:0', dtype=torch.int32) +# _values +tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]], device='cuda:0') + + +########## torch.float64/torch.int32/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csr) +# _crow_indices +tensor([0, 2, 4], device='cuda:0', dtype=torch.int32) +# _col_indices +tensor([0, 1, 0, 1], device='cuda:0', dtype=torch.int32) +# _values +tensor([1., 2., 3., 4.], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=()/block_shape=() ########## +# 
sparse tensor +tensor(crow_indices=tensor([0]), + col_indices=tensor([], size=(0,)), + values=tensor([], size=(0,)), device='cuda:0', size=(0, 0), nnz=0, + dtype=torch.float64, layout=torch.sparse_csr) +# _crow_indices +tensor([0], device='cuda:0', dtype=torch.int32) +# _col_indices +tensor([], device='cuda:0', dtype=torch.int32) +# _values +tensor([], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=(2,)/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + col_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]), device='cuda:0', size=(2, 2, 2), + nnz=4, dtype=torch.float64, layout=torch.sparse_csr) +# _crow_indices +tensor([[0, 2, 4], + [0, 2, 4]], device='cuda:0', dtype=torch.int32) +# _col_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], device='cuda:0', dtype=torch.int32) +# _values +tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=(2, 3)/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + col_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]), device='cuda:0', size=(2, 3, 2, 2), + nnz=4, dtype=torch.float64, layout=torch.sparse_csr) +# _crow_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], device='cuda:0', dtype=torch.int32) +# _col_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], device='cuda:0', dtype=torch.int32) +# _values +tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]], device='cuda:0', dtype=torch.float64) + + +########## torch.float32/torch.int64/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, + layout=torch.sparse_csr) +# _crow_indices +tensor([0, 2, 4], device='cuda:0') +# _col_indices +tensor([0, 1, 0, 1], device='cuda:0') +# _values +tensor([1., 2., 3., 4.], device='cuda:0') + +########## torch.float32/torch.int64/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([0]), + col_indices=tensor([], size=(0,)), + values=tensor([], size=(0,)), device='cuda:0', size=(0, 0), nnz=0, + layout=torch.sparse_csr) +# _crow_indices +tensor([0], device='cuda:0') +# _col_indices +tensor([], device='cuda:0', dtype=torch.int64) +# _values +tensor([], device='cuda:0') + +########## torch.float32/torch.int64/batch_shape=(2,)/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + col_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]), device='cuda:0', size=(2, 2, 2), + nnz=4, layout=torch.sparse_csr) +# _crow_indices +tensor([[0, 2, 4], + [0, 2, 4]], device='cuda:0') +# _col_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], device='cuda:0') +# _values +tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]], device='cuda:0') + +########## torch.float32/torch.int64/batch_shape=(2, 
3)/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + col_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]), device='cuda:0', size=(2, 3, 2, 2), + nnz=4, layout=torch.sparse_csr) +# _crow_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], device='cuda:0') +# _col_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], device='cuda:0') +# _values +tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]], device='cuda:0') + + +########## torch.float64/torch.int64/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csr) +# _crow_indices +tensor([0, 2, 4], device='cuda:0') +# _col_indices +tensor([0, 1, 0, 1], device='cuda:0') +# _values +tensor([1., 2., 3., 4.], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([0]), + col_indices=tensor([], size=(0,)), + values=tensor([], size=(0,)), device='cuda:0', size=(0, 0), nnz=0, + dtype=torch.float64, layout=torch.sparse_csr) +# _crow_indices +tensor([0], device='cuda:0') +# _col_indices +tensor([], device='cuda:0', dtype=torch.int64) +# _values +tensor([], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=(2,)/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + col_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]), device='cuda:0', size=(2, 2, 2), + nnz=4, dtype=torch.float64, layout=torch.sparse_csr) +# _crow_indices +tensor([[0, 2, 4], + [0, 2, 4]], device='cuda:0') +# _col_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], device='cuda:0') +# _values +tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=(2, 3)/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + col_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]), device='cuda:0', size=(2, 3, 2, 2), + nnz=4, dtype=torch.float64, layout=torch.sparse_csr) +# _crow_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], device='cuda:0') +# _col_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], device='cuda:0') +# _values +tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]], device='cuda:0', dtype=torch.float64) + diff --git 
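The CSC and CSR expect files above cover the plain (non-blocked) compressed layouts across float32/float64 values, int32/int64 indices, and batch shapes (), (2,), and (2, 3); the batched fixtures simply stack the index and value tensors along the leading batch dimensions. As a point of reference only, a minimal sketch of the unbatched case, assuming torch.sparse_csr_tensor / torch.sparse_csc_tensor are available (this snippet is illustrative and does not generate the fixtures):

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

# Fully populated 2x2 matrix in CSR form. Default int64 indices print without
# an explicit dtype, which is why the int32 and int64 fixtures above differ.
crow_indices = torch.tensor([0, 2, 4])
col_indices = torch.tensor([0, 1, 0, 1])
values = torch.tensor([1., 2., 3., 4.])
csr = torch.sparse_csr_tensor(crow_indices, col_indices, values,
                              size=(2, 2), device=device)
print(csr)
print(csr.crow_indices(), csr.col_indices(), csr.values())

# CSC swaps the compressed dimension: columns are compressed, rows are plain.
ccol_indices = torch.tensor([0, 2, 4])
row_indices = torch.tensor([0, 1, 0, 1])
csc = torch.sparse_csc_tensor(ccol_indices, row_indices, values,
                              size=(2, 2), device=device)
print(csc)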
a/test/forward_backward_compatibility/check_forward_backward_compatibility.py b/test/forward_backward_compatibility/check_forward_backward_compatibility.py index b5e3343489b8..b8927d0cfc70 100644 --- a/test/forward_backward_compatibility/check_forward_backward_compatibility.py +++ b/test/forward_backward_compatibility/check_forward_backward_compatibility.py @@ -34,9 +34,11 @@ ("prim::MKLDNNRelu6", datetime.date(9999, 1, 1)), ("prim::MKLDNNRelu6_", datetime.date(9999, 1, 1)), ("prim::Concat", datetime.date(9999, 1, 1)), + ("prim::is_mlc", datetime.date(2022, 5, 20)), # Internal, profiler-specific ops ("profiler::_call_end_callbacks_on_jit_fut*", datetime.date(9999, 1, 1)), ("profiler::_record_function_enter", datetime.date(9999, 1, 1)), + ("aten::_sparse_addmm", datetime.date(2022, 6, 30)), ("aten::linalg_matrix_rank", datetime.date(2021, 10, 30)), ("aten::linalg_pinv", datetime.date(2021, 10, 30)), ("aten::_cholesky_helper", datetime.date(9999, 1, 1)), @@ -50,12 +52,8 @@ ("aten::adaptive_avg_pool3d_backward", datetime.date(9999, 1, 1)), ("aten::_embedding_bag_dense_backward", datetime.date(9999, 1, 1)), ("aten::randperm", datetime.date(9999, 1, 1)), - ("aten::_conv_depthwise2d_backward", datetime.date(2022, 1, 31)), - ("aten::conv_depthwise3d_backward", datetime.date(2022, 1, 31)), - ("aten::cudnn_convolution.deprecated", datetime.date(2022, 1, 31)), - ("aten::cudnn_convolution.deprecated2", datetime.date(2022, 1, 31)), - ("aten::cudnn_convolution_transpose.deprecated", datetime.date(2022, 1, 31)), - ("aten::cudnn_convolution_transpose.deprecated2", datetime.date(2022, 1, 31)), + ("aten::gelu", datetime.date(2022, 3, 1)), + ("aten::gelu_backward", datetime.date(2022, 3, 1)), ("aten::cudnn_convolution_backward", datetime.date(2022, 1, 31)), ("aten::cudnn_convolution_backward_input", datetime.date(2022, 1, 31)), ("aten::cudnn_convolution_backward_weight", datetime.date(2022, 1, 31)), @@ -78,16 +76,23 @@ ("aten::slow_conv_transpose2d_backward", datetime.date(2022, 1, 31)), ("aten::slow_conv_transpose3d", datetime.date(2022, 1, 31)), ("aten::slow_conv_transpose3d_backward", datetime.date(2022, 1, 31)), + ("aten::solve", datetime.date(9999, 1, 1)), + ("aten::solve.solution", datetime.date(9999, 1, 1)), + ("aten::_solve_helper", datetime.date(9999, 1, 1)), ("aten::_index_copy_", datetime.date(2022, 5, 31)), ("aten::_svd_helper", datetime.date(2022, 3, 31)), ("aten::linalg_svdvals", datetime.date(2022, 3, 31)), ("aten::linalg_svdvals_out", datetime.date(2022, 3, 31)), ("aten::linalg_svd", datetime.date(2022, 3, 31)), ("aten::linalg_svd_out", datetime.date(2022, 3, 31)), + ("aten::linalg_qr_out", datetime.date(2022, 5, 31)), + ("aten::linalg_qr", datetime.date(2022, 5, 31)), ("aten::_max_pool1d_cpu_forward", datetime.date(2022, 2, 8)), + ("aten::max_unpool2d_backward", datetime.date(2022, 5, 15)), + ("aten::max_unpool2d_backward.grad_input", datetime.date(2022, 5, 15)), + ("aten::max_unpool3d_backward", datetime.date(2022, 5, 15)), + ("aten::max_unpool3d_backward.grad_input", datetime.date(2022, 5, 15)), ("aten::_convolution_nogroup", datetime.date(9999, 1, 1)), - ("aten::linspace", datetime.date(2022, 3, 1)), # TODO this will be removed soon - ("aten::logspace", datetime.date(2022, 3, 1)), # TODO this will be removed soon ("aten::miopen_convolution_backward", datetime.date(9999, 1, 1)), ("aten::miopen_convolution_backward_bias", datetime.date(9999, 1, 1)), ("aten::miopen_convolution_backward_input", datetime.date(9999, 1, 1)), @@ -98,6 +103,8 @@ ("aten::miopen_depthwise_convolution_backward", 
datetime.date(9999, 1, 1)), ("aten::miopen_depthwise_convolution_backward_input", datetime.date(9999, 1, 1)), ("aten::miopen_depthwise_convolution_backward_weight", datetime.date(9999, 1, 1)), + ("aten::is_mlc", datetime.date(2022, 5, 20)), + ("aten::_nested_tensor", datetime.date(9999, 1, 1)), ("caffe2::", datetime.date(2021, 10, 23)), ("prepacked::unpack_prepacked_sizes_conv2d", datetime.date(9999, 1, 1)), ("prepacked::unpack_prepacked_sizes_linear", datetime.date(9999, 1, 1)), @@ -112,6 +119,25 @@ ("aten::_scatter_reduce", datetime.date(2022, 1, 31)), ("aten::native_multi_head_self_attention", datetime.date(9999, 1, 1)), ("aten::_native_multi_head_self_attention", datetime.date(9999, 1, 1)), + ("aten::grid_sampler_3d_backward", datetime.date(9999, 1, 1)), + ("aten::_transform_bias_rescale_qkv", datetime.date(9999, 1, 1)), + ("aten::scatter_reduce.two", datetime.date(2022, 4, 15)), + ("aten::_s_where", datetime.date(2022, 9, 30)), + ("quantized::conv2d_cudnn", datetime.date(2022, 3, 22)), + ("quantized::conv2d_relu_cudnn", datetime.date(2022, 3, 22)), + ("prim::infer_squeeze_size.dim", datetime.date(9999, 1, 1)), + ("prim::infer_squeeze_size", datetime.date(9999, 1, 1)), + ("aten::_cat", datetime.date(2022, 5, 15)), + ("aten::nansum", datetime.date(2022, 5, 15)), + ("aten::zero", datetime.date(2022, 5, 15)), + ("aten::_validate_sparse_compressed_tensor_args", datetime.date(2022, 5, 15)), + ("aten::stft", datetime.date(2022, 5, 23)), + ("aten::linalg_lu_solve", datetime.date(2022, 5, 23)), + ("aten::linalg_lu_solve.out", datetime.date(2022, 5, 23)), + ("aten::_index_reduce", datetime.date(2022, 5, 15)), + ("aten::_csr_to_block_csr", datetime.date(2022, 5, 20)), + ("aten::_weight_norm_cuda_interface", datetime.date(9999, 1, 1)), + ("aten::_weight_norm_cuda_interface_backward", datetime.date(9999, 1, 1)), ] ALLOW_LIST_COMPILED = [ @@ -140,6 +166,33 @@ def allow_listed(schema): ("dist_c10d", datetime.date(2099, 9, 17)), ] +def has_valid_upgraders(schema, version_map): + # we want to parse through the map to find if + # the schema has valid upgraders. Since the + # version map has entry for each overload + # we need to do some ugly parsing. + + # the name of the operator + schema_name = schema.name + + if schema_name not in version_map: + return False + + entries = version_map[schema_name] + + possible_overloads = [] + possible_schemas = [] + for key, upgrader_schema_entries in entries.items(): + possible_overloads.append(key) + possible_schemas.extend(upgrader_schema_entries) + + # let's make sure this existing schema is part of possible + # schemas + for old_schema in possible_schemas: + if old_schema == schema: + return True + + return False def dont_parse(schema_line): for item in dont_parse_list: @@ -158,14 +211,33 @@ def load_schemas_to_dict(): new_schema_dict[s.name].append(s) return new_schema_dict +def process_version_map(version_map): + # version map maps full schema name to + # list of upgraders. Since we only have + # the name of the schema (aka no overload) + # we want to first process the map to make + # the key lookup easier. 
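# (Illustrative usage sketch, not taken from the patch; the operator name is
# only an example.) Once the map is processed, the lookup performed by
# has_valid_upgraders() is a plain dict access on schema.name:
#
#   vmap = process_version_map(torch._C._get_operator_version_map())
#   old = parse_schema("aten::div.Tensor(Tensor self, Tensor other) -> Tensor")
#   has_valid_upgraders(old, vmap)  # True only if this exact old schema has a
#                                   # registered upgrader, which lets check_bc()
#                                   # skip it instead of flagging a BC break.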
After this it will be: + # Dict[schema_name, Dict[overload, List[schema]]] + + output = defaultdict(dict) + for (key, entries) in version_map.items(): + operator_name = key.split(".")[0] + schema_entries = [parse_schema(entry.old_schema) for entry in entries] + output[operator_name][key] = schema_entries + return output + def check_bc(existing_schemas): new_schema_dict = load_schemas_to_dict() + version_map = process_version_map(torch._C._get_operator_version_map()) is_bc = True broken_ops = [] for existing_schema in existing_schemas: if allow_listed(existing_schema): print("schema: ", str(existing_schema), " found on allowlist, skipping") continue + if has_valid_upgraders(existing_schema, version_map): + print("schema: ", str(existing_schema), " has valid upgrader, skipping") + continue print("processing existing schema: ", str(existing_schema)) matching_new_schemas = new_schema_dict.get(existing_schema.name, []) found = False diff --git a/test/fx/test_fx_const_fold.py b/test/fx/test_fx_const_fold.py index 0d178e956c47..80198c2baeaa 100644 --- a/test/fx/test_fx_const_fold.py +++ b/test/fx/test_fx_const_fold.py @@ -5,7 +5,6 @@ import torch import torch.fx from torch.fx.experimental import const_fold -from torch.fx.experimental.fx_acc import acc_tracer, acc_ops from torch.testing._internal.common_utils import TestCase @@ -610,14 +609,14 @@ def forward(self, x): mod = ConstFoldTestModule() in_x = torch.randn(2, 4) - gm = acc_tracer.trace(mod, in_x) + gm = torch.fx.symbolic_trace(mod) def skip_folding_quant_dequant(node: torch.fx.Node): - if node.target != acc_ops.quantize_per_tensor: + if node.target != torch.quantize_per_tensor: return False # If quantize_per_node -> dequantize, then skip folding. for user in node.users: - if user.target == acc_ops.dequantize: + if user.target == torch.dequantize: return True return False diff --git a/test/fx_acc/test_acc_tracer.py b/test/fx_acc/test_acc_tracer.py deleted file mode 100644 index f16eef8e5286..000000000000 --- a/test/fx_acc/test_acc_tracer.py +++ /dev/null @@ -1,2104 +0,0 @@ -# Owner(s): ["oncall: fx"] - -import unittest -from typing import Callable, List - -import numpy as np -import torch -import torch.fx.experimental.fx_acc.acc_normalizer as acc_normalizer -import torch.fx.experimental.fx_acc.acc_ops as acc_ops -import torch.fx.experimental.fx_acc.acc_tracer as acc_tracer -import torch.fx.experimental.fx_acc.acc_utils as acc_utils -import torch.nn as nn -import torchvision -from parameterized import parameterized, param - -torch.manual_seed(0) - - -class AccTracerTest(unittest.TestCase): - def _make_model_unit_test( - self, - model, - *args, - input_shape=None, - enable_allclose=False, - **kwargs, - ): - """ - Test that the model can be traced correctly and is producing correct - result. - """ - if input_shape is None: - input_shape = [1, 3, 224, 224] - input = torch.randn(input_shape) - traced = acc_tracer.trace(model, [input]) - if enable_allclose: - torch.testing.assert_allclose(model(input), traced(input)) - else: - self.assertTrue(torch.equal(model(input), traced(input))) - traced_again = acc_tracer.trace(traced, [input]) - if enable_allclose: - torch.testing.assert_allclose(model(input), traced_again(input)) - else: - self.assertTrue(torch.equal(model(input), traced_again(input))) - - def _make_acc_op_function_test( - self, - acc_op: Callable, - torch_op, - *args, - input_shape=(2, 3), - validate_same_kwargs=True, - enable_allclose=False, - **kwargs, - ): - """ - Test that acc_op is traced somewhat. 
- """ - - class TestModule(torch.nn.Module): - def __init__(self, torch_op, args, kwargs): - super().__init__() - self._torch_op = torch_op - self._args = args - self._kwargs = kwargs - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return self._torch_op(a, *self._args, **self._kwargs) - m = TestModule(torch_op, args, kwargs) - m.eval() - a = torch.randn(*input_shape) - traced = acc_tracer.trace(m, [a]) - ph_a = acc_op_node = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "a": - ph_a = node - elif node.op == "call_function": - self.assertEqual(node.target, acc_op) - self.assertEqual(node.kwargs["input"], ph_a) - if validate_same_kwargs: - for key, value in kwargs.items(): - self.assertEqual(node.kwargs[key], value) - acc_op_node = node - elif node.op == "output": - if acc_op is None: - # If we expect no new acc_op after graph building - # and found we have only output in traced graph - continue - self.assertEqual(acc_op_node, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - ref_outputs = m(a) - outputs = traced(a) - traced_again = acc_tracer.trace(traced, [a]) - outputs_again = traced_again(a) - if isinstance(ref_outputs, torch.Tensor): - ref_outputs = [ref_outputs] - outputs = [outputs] - outputs_again = [outputs_again] - - for ref_output, output, output_again in zip( - ref_outputs, outputs, outputs_again - ): - if enable_allclose: - torch.testing.assert_allclose( - torch.nan_to_num(ref_output), torch.nan_to_num(output) - ) - torch.testing.assert_allclose( - torch.nan_to_num(ref_output), torch.nan_to_num(output_again) - ) - else: - self.assertTrue( - torch.equal(torch.nan_to_num(ref_output), torch.nan_to_num(output)) - ) - self.assertTrue( - torch.equal( - torch.nan_to_num(ref_output), torch.nan_to_num(output_again) - ) - ) - - def test_sum(self): - self._make_acc_op_function_test(acc_ops.sum, torch.sum) - self._make_acc_op_function_test(acc_ops.sum, torch.sum, dim=(1,), keepdim=True) - - def test_prod(self): - self._make_acc_op_function_test(acc_ops.prod, torch.prod) - self._make_acc_op_function_test(acc_ops.prod, torch.prod, dim=1, keepdim=True) - - def test_mean(self): - self._make_acc_op_function_test(acc_ops.mean, torch.mean) - self._make_acc_op_function_test( - acc_ops.mean, torch.mean, dim=(1,), keepdim=True - ) - - def test_pad(self): - self._make_acc_op_function_test( - acc_ops.pad, torch.nn.functional.pad, pad=(2, 0) - ) - - def test_max(self): - def torch_max(x, *args, **kwargs): - return x.max(*args, **kwargs) - - self._make_acc_op_function_test(acc_ops.max_full_reduce, torch_max) - self._make_acc_op_function_test( - acc_ops.max_dim_reduce, torch_max, dim=1, keepdim=True - ) - self._make_acc_op_function_test( - acc_ops.max_dim_reduce, torch_max, input_shape=(1, 4), dim=1, keepdim=True - ) - self._make_acc_op_function_test( - acc_ops.max_dim_reduce, torch_max, input_shape=(3, 4, 3), dim=2 - ) - - @parameterized.expand( - [ - param("max_maximum", orig_op=torch.max, expected_op=acc_ops.maximum), - param( - "maximum_maximum", orig_op=torch.maximum, expected_op=acc_ops.maximum - ), - param("min_minimum", orig_op=torch.min, expected_op=acc_ops.minimum), - param( - "minimum_minimum", orig_op=torch.minimum, expected_op=acc_ops.minimum - ), - ] - ) - def test_maximum_minimum(self, _: str, orig_op, expected_op): - class TestModule(torch.nn.Module): - def __init__(self, orig_op): - super().__init__() - self.orig_op = orig_op - - def forward(self, input: torch.Tensor, other: torch.Tensor) -> 
torch.Tensor: - return self.orig_op(input, other) - - m = TestModule(orig_op) - input, other = torch.randn(2, 2), torch.randn(2, 2) - traced = acc_tracer.trace(m, [input, other]) - - ph_in = ph_oth = mxm = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "other": - ph_oth = node - else: - self.assertTrue(str(node.target) == "input") - ph_in = node - elif node.op == "call_function": - if node.target == expected_op: - self.assertEqual(node.kwargs["input"], ph_in) - self.assertEqual(node.kwargs["other"], ph_oth) - mxm = node - elif node.op == "output": - self.assertEqual(mxm, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(input, other), traced(input, other))) - - def test_conv(self): - """ - Test that a conv is traced as expected. - """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.conv = nn.Conv2d(8, 7, 3, stride=2) - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return self.conv(a) - - m = TestModule() - input = torch.randn(3, 8, 10, 10) - traced = acc_tracer.trace(m, [input]) - - ph = weight_attr = bias_attr = conv = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertEqual(str(node.target), "a") - ph = node - elif node.op == "get_attr" and node.target == "conv.weight": - weight_attr = node - elif node.op == "get_attr" and node.target == "conv.bias": - bias_attr = node - elif node.op == "call_function": - self.assertEqual(node.target, acc_ops.conv2d) - self.assertEqual(node.kwargs["input"], ph) - self.assertEqual(node.kwargs["weight"], weight_attr) - self.assertEqual(node.kwargs["bias"], bias_attr) - self.assertEqual(node.kwargs["stride"], (2, 2)) - self.assertEqual(node.kwargs["padding"], (0, 0)) - self.assertEqual(node.kwargs["dilation"], (1, 1)) - self.assertEqual(node.kwargs["groups"], 1) - conv = node - elif node.op == "output": - self.assertEqual(conv, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(input), traced(input))) - - def test_quantized_conv2d(self): - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.conv = nn.quantized.Conv2d(3, 3, 1) - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return self.conv(a) - - m = TestModule() - input = torch.quantize_per_tensor( - torch.randn(1, 3, 1, 1), scale=0.01, zero_point=3, dtype=torch.quint8 - ) - traced = acc_tracer.trace(m, [input]) - print(traced.graph) - ph = weight_attr = bias_attr = conv = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertEqual(str(node.target), "a") - ph = node - elif node.op == "get_attr" and node.target == "conv_weight": - weight_attr = node - elif node.op == "get_attr" and node.target == "conv_bias": - bias_attr = node - elif node.op == "call_function": - self.assertEqual(node.target, acc_ops.quantized_conv2d) - self.assertEqual(node.kwargs["input"], ph) - self.assertEqual(node.kwargs["weight"], weight_attr) - self.assertEqual(node.kwargs["bias"], bias_attr) - conv = node - elif node.op == "output": - self.assertEqual(conv, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(input), traced(input))) - - def test_quantized_convrelu2d(self): - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.conv = nn.intrinsic.quantized.ConvReLU2d(3, 3, 1) - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return 
self.conv(a) - - m = TestModule() - input = torch.quantize_per_tensor( - torch.randn(1, 3, 1, 1), scale=0.01, zero_point=3, dtype=torch.quint8 - ) - traced = acc_tracer.trace(m, [input]) - ph = weight_attr = bias_attr = conv = relu = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertEqual(str(node.target), "a") - ph = node - elif node.op == "get_attr" and node.target == "conv_weight": - weight_attr = node - elif node.op == "get_attr" and node.target == "conv_bias": - bias_attr = node - elif node.op == "call_function" and node.target == acc_ops.quantized_conv2d: - self.assertEqual(node.target, acc_ops.quantized_conv2d) - self.assertEqual(node.kwargs["input"], ph) - self.assertEqual(node.kwargs["weight"], weight_attr) - self.assertEqual(node.kwargs["bias"], bias_attr) - conv = node - elif node.op == "call_function" and node.target == acc_ops.relu: - self.assertEqual(node.target, acc_ops.relu) - self.assertEqual(node.kwargs["input"], conv) - relu = node - elif node.op == "output": - self.assertEqual(relu, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(input), traced(input))) - - def test_embedding_bag(self): - """ - Test that an embedding_bag is traced as expected. - """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.eb = nn.EmbeddingBag(10, 3, mode="sum", include_last_offset=True) - - def forward(self, inp: torch.Tensor, offsets: torch.Tensor) -> torch.Tensor: - return self.eb(inp, offsets) - - m = TestModule() - inp = torch.LongTensor([1, 2, 4, 5, 4, 3, 2, 9]) - offsets = torch.LongTensor([0, 4]) - traced = acc_tracer.trace(m, [inp, offsets]) - - inp_node = offsets_node = weight_attr = eb_node = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "inp": - inp_node = node - elif str(node.target) == "offsets": - offsets_node = node - else: - self.fail(f"Unexpected placeholder {node.target}.") - continue - elif node.op == "get_attr" and node.target == "eb.weight": - weight_attr = node - elif node.op == "call_function": - self.assertEqual(node.target, acc_ops.embedding_bag) - # Note: Normalization called from acc_tracer means we use all kwargs. - self.assertEqual(node.kwargs["input"], inp_node) - self.assertEqual(node.kwargs["offsets"], offsets_node) - self.assertEqual(node.kwargs["weight"], weight_attr) - self.assertEqual(node.kwargs["mode"], "sum") - self.assertEqual(node.kwargs["include_last_offset"], True) - # The rest of these were unspecified, so verify they fell back - # to their respective default values thanks to normalization. - self.assertEqual(node.kwargs["max_norm"], None) - self.assertEqual(node.kwargs["norm_type"], 2.0) - self.assertEqual(node.kwargs["scale_grad_by_freq"], False) - self.assertEqual(node.kwargs["sparse"], False) - self.assertEqual(node.kwargs["per_sample_weights"], None) - eb_node = node - elif node.op == "output": - self.assertEqual(eb_node, node.args[0]) - - self.assertTrue(torch.equal(m(inp, offsets), traced(inp, offsets))) - - def test_embedding_bag_byte_and_4bit_rowwise_offsets(self): - """ - Test that 4 bit quantized embedding_bag is traced as expected. 
- """ - - class TestModule(nn.Module): - def __init__( - self, - op, - q_weights, - per_index_weights, - ): - super().__init__() - self.emb = op - self.q_weights = q_weights - self.per_index_weights = per_index_weights - - def forward( - self, - indices, - offsets, - ): - return self.emb( - self.q_weights, - indices, - offsets, - mode=0, - per_sample_weights=self.per_index_weights, - include_last_offset=True, - ) - - def run_embedding_bag_test(is_4bit, use_weights): - # generate random indices, offsets, and weights. - num_embeddings = 16 - embedding_dim = 32 - num_lengths = 10 - - weights = torch.from_numpy( - (np.random.random_sample((num_embeddings, embedding_dim)) + 1).astype( - np.float32 - ) - ) - q_weights = ( - torch.ops.quantized.embedding_bag_4bit_prepack(weights) - if is_4bit - else torch.ops.quantized.embedding_bag_byte_prepack(weights) - ) - np_lengths = np.random.randint(0, num_lengths, size=10).astype(np.int32) - - num_lengths = np.sum(np_lengths) - indices = torch.from_numpy( - np.random.randint(low=0, high=num_embeddings, size=num_lengths) - ).int() - - lengths = torch.from_numpy(np_lengths) - offsets = torch.cat([torch.zeros([1]), torch.cumsum(lengths, 0)]).int() - - weights = torch.randint(low=0, high=4, size=indices.size()) - per_sample_weights = weights.to(torch.float32) - - indices = indices.to(torch.int32) - offsets = offsets.to(torch.int32) - inputs = [ - indices, - offsets, - ] - - op = ( - torch.ops.quantized.embedding_bag_4bit_rowwise_offsets - if is_4bit - else torch.ops.quantized.embedding_bag_byte_rowwise_offsets - ) - - m = TestModule( - op, - q_weights, - per_sample_weights, - ) - - traced = acc_tracer.trace(m, inputs) - print(traced.graph) - - expected_target = ( - acc_ops.embedding_bag_4bit_rowwise_offsets - if is_4bit - else acc_ops.embedding_bag_byte_rowwise_offsets - ) - - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "indices": - inp_node = node - elif str(node.target) == "offsets": - offsets_node = node - else: - self.fail(f"Unexpected placeholder {node.target}.") - continue - elif node.op == "get_attr" and node.target == "q_weights": - weight_attr = node - elif node.op == "call_function": - self.assertEqual(node.target, expected_target) - # Note: Normalization called from acc_tracer means we use all kwargs. - self.assertEqual(node.kwargs["indices"], inp_node) - self.assertEqual(node.kwargs["offsets"], offsets_node) - self.assertEqual(node.kwargs["weight"], weight_attr) - self.assertEqual(node.kwargs["mode"], 0) - self.assertEqual(node.kwargs["include_last_offset"], True) - # The rest of these were unspecified, so verify they fell back - # to their respective default values thanks to normalization. 
- eb_node = node - elif node.op == "output": - self.assertEqual(eb_node, node.args[0]) - self.assertTrue(torch.equal(m(indices, offsets), traced(indices, offsets))) - - # test 8-bit - run_embedding_bag_test(is_4bit=False, use_weights=True) - # test 4-bit - run_embedding_bag_test(is_4bit=True, use_weights=True) - - def test_quantized_batch_norm2d(self): - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.bn = nn.quantized.BatchNorm2d(3) - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return self.bn(a) - - m = TestModule() - m.eval() - input = torch.quantize_per_tensor( - torch.randn(1, 3, 1, 1), scale=0.01, zero_point=3, dtype=torch.quint8 - ) - traced = acc_tracer.trace(m, [input]) - ph = weight_attr = bias_attr = bn_mean = bn_var = bn = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertEqual(str(node.target), "a") - ph = node - elif node.op == "get_attr" and node.target == "bn.weight": - weight_attr = node - elif node.op == "get_attr" and node.target == "bn.bias": - bias_attr = node - elif node.op == "get_attr" and node.target == "bn.running_mean": - bn_mean = node - elif node.op == "get_attr" and node.target == "bn.running_var": - bn_var = node - elif node.op == "get_attr" and node.target == "bn.scale": - bn_scale = node - elif node.op == "get_attr" and node.target == "bn.zero_point": - bn_zero_point = node - elif node.op == "call_function": - self.assertEqual(node.target, acc_ops.quantized_batch_norm2d) - self.assertEqual(node.kwargs["input"], ph) - self.assertEqual(node.kwargs["weight"], weight_attr) - self.assertEqual(node.kwargs["bias"], bias_attr) - self.assertEqual(node.kwargs["running_mean"], bn_mean) - self.assertEqual(node.kwargs["running_var"], bn_var) - self.assertEqual(node.kwargs["acc_out_ty"][6]["scale"], bn_scale) - self.assertEqual( - node.kwargs["acc_out_ty"][6]["zero_point"], bn_zero_point - ) - bn = node - elif node.op == "output": - self.assertEqual(bn, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(input), traced(input))) - - def test_linear(self): - """ - Test that a linear is traced as expected, i.e. to the functional level and with - kwarg normalization. Also verify that symbolic shape inference worked as part of - the acc_tracer. 
- """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.linear = nn.Linear(3, 5, bias=True) - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return self.linear(a) - - m = TestModule() - test_input = torch.randn(1, 3) - traced = acc_tracer.trace(m, test_input) - ph = weight_attr = bias_attr = linear = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertEqual(str(node.target), "a") - ph = node - elif node.op == "get_attr" and node.target == "linear.weight": - weight_attr = node - elif node.op == "get_attr" and node.target == "linear.bias": - bias_attr = node - elif node.op == "call_function": - self.assertEqual(node.target, acc_ops.linear) - self.assertEqual(node.kwargs["input"], ph) - self.assertEqual(node.kwargs["weight"], weight_attr) - self.assertEqual(node.kwargs["bias"], bias_attr) - linear = node - elif node.op == "output": - self.assertEqual(linear, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - self.assertTrue(torch.equal(m(test_input), traced(test_input))) - - def test_quantized_linear(self): - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.linear = nn.quantized.Linear(3, 5) - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return self.linear(a) - - m = TestModule() - input = torch.quantize_per_tensor( - torch.randn(2, 3), scale=0.01, zero_point=3, dtype=torch.quint8 - ) - traced = acc_tracer.trace(m, [input]) - ph = weight_attr = bias_attr = linear = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertEqual(str(node.target), "a") - ph = node - elif node.op == "get_attr" and node.target == "linear_weight": - weight_attr = node - elif node.op == "get_attr" and node.target == "linear_bias": - bias_attr = node - elif node.op == "call_function": - self.assertEqual(node.target, acc_ops.quantized_linear) - self.assertEqual(node.kwargs["input"], ph) - self.assertEqual(node.kwargs["weight"], weight_attr) - self.assertEqual(node.kwargs["bias"], bias_attr) - linear = node - elif node.op == "output": - self.assertEqual(linear, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(input), traced(input))) - - @parameterized.expand( - [ - param("remove_exceptions_false", remove_exceptions=False), - param("remove_exceptions_true", remove_exceptions=True), - ] - ) - def test_batch_norm(self, _, remove_exceptions): - """ - Test that a batch norm is traced as expected, i.e. to the functional level - and with kwarg normalization. Note that we also expect to see a - ConditionalExceptionWrapper in the graph that the AST rewriter converted - from `if x: raise y`. - - """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.bn = torch.nn.BatchNorm2d(2) - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return self.bn(a) - - m = TestModule() - input = torch.randn(2, 2, 1, 1) - # Note: Explicitly not removing exceptions so that we can check they - # were found and exist below. 
- traced = acc_tracer.trace( - m, - [input], - remove_exceptions=remove_exceptions, - ) - - ph = exception_wrapper = weight = bias = mean = var = bn = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertEqual(str(node.target), "a") - ph = node - elif node.op == "get_attr" and node.target == "bn.weight": - weight = node - elif node.op == "get_attr" and node.target == "bn.bias": - bias = node - elif node.op == "get_attr" and node.target == "bn.running_mean": - mean = node - elif node.op == "get_attr" and node.target == "bn.running_var": - var = node - elif node.op == "call_function" and node.target == acc_ops.batch_norm: - # Note: Normalization called from acc_tracer means we use - # all kwargs. - self.assertEqual(node.kwargs["input"], ph) - self.assertEqual(node.kwargs["weight"], weight) - self.assertEqual(node.kwargs["bias"], bias) - self.assertEqual(node.kwargs["running_mean"], mean) - self.assertEqual(node.kwargs["running_var"], var) - bn = node - elif ( - node.op == "call_module" - and node.target == "bn._conditional_exception_wrapper_ValueError" - ): - exception_wrapper = node - elif node.op == "output": - self.assertEqual(bn, node.args[0]) - - self.assertTrue(remove_exceptions or exception_wrapper is not None) - - self.assertTrue(torch.equal(m(input), traced(input))) - - def test_remove_asserts(self): - """ - Test that a Module with asserts has the asserts automatically removed, as - well as calls to a class method that should be dead. - """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - - def _test_method(self, a): - return a - - def forward(self, a: torch.Tensor) -> torch.Tensor: - assert torch.equal(self._test_method(a), a) - return a - - m = TestModule() - input = torch.randn(10) - traced = acc_tracer.trace(m, [input], ast_rewriter_allow_list={TestModule}) - # Check we have no call_functions. If remove asserts didn't work - # correctly we would see a call to torch._assert, _test_method, and - # torch.equal. - for node in traced.graph.nodes: - self.assertFalse(node.op == "call_function") - - self.assertTrue(torch.equal(m(input), traced(input))) - - def test_no_rewrite_leaf_module(self): - """ - Test that when we supply a leaf module, we don't rewrite it - """ - - class TestChildModule(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return a.relu() - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.child = TestChildModule() - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return self.child(a) + self.child(a) - - m = TestModule() - input = torch.randn(10) - traced = acc_tracer.trace(m, [input], leaf_module_list={TestChildModule}) - # trace it again just in case - traced = acc_tracer.trace(traced, [input], leaf_module_list={TestChildModule}) - - for _, m in traced.named_children(): - self.assertFalse("__AccRewrittenModule" in str(type(m)), str(type(m))) - - def test_sequential(self): - """ - Test that the tracer works for torch.nn.Sequential. 
- """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.model = nn.Sequential(nn.Sigmoid(), nn.ReLU()) - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return self.model(a) - - m = TestModule() - input = torch.randn(10) - traced = acc_tracer.trace(m, [input]) - - for node in traced.graph.nodes: - if node.op == "call_function": - is_sigmoid = node.target == acc_ops.sigmoid - is_relu = node.target == acc_ops.relu - self.assertTrue(is_sigmoid or is_relu) - else: - self.assertTrue(node.op == "placeholder" or node.op == "output") - - self.assertTrue(torch.equal(m(input), traced(input))) - - def test_unsqueeze(self): - """ - Test that torch.unsqueeze is traced correctly. - """ - self._make_acc_op_function_test( - acc_ops.unsqueeze, - torch.unsqueeze, - validate_same_kwargs=False, - dim=1, - ) - - def test_stack(self): - """ - Test that torch.stack is traced correctly. - """ - - class TestModule(torch.nn.Module): - def forward(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: - return torch.stack((a, b), dim=1) - - a, b = torch.randn(4, 5, 6), torch.randn(4, 5, 6) - mod = TestModule() - traced = acc_tracer.trace(mod, [a, b]) - self.assertTrue(torch.equal(mod(a, b), traced(a, b))) - - ph_a = ph_b = unsqueeze_a = unsqueeze_b = cat_node = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "a": - ph_a = node - else: - self.assertTrue(str(node.target) == "b") - ph_b = node - elif node.op == "call_function": - if node.target == acc_ops.unsqueeze: - if node.kwargs["input"] is ph_a: - unsqueeze_a = node - else: - self.assertEqual(node.kwargs["input"], ph_b) - unsqueeze_b = node - else: - self.assertEqual(node.target, acc_ops.cat) - self.assertEqual(node.kwargs["tensors"], [unsqueeze_a, unsqueeze_b]) - cat_node = node - elif node.op == "output": - self.assertEqual(cat_node, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - def test_no_raise(self): - """ - self that we can trace `if x: raise y(msg)` when the raise isn't executed. - """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, a, b): - if torch.equal(a, b): - raise AssertionError("a equaled b!") - return a - - m = TestModule() - in_a, in_b = torch.randn(5), torch.randn(5) - traced = acc_tracer.trace( - m, - [in_a, in_b], - remove_exceptions=False, - use_acc_normalization=False, - ast_rewriter_allow_list={TestModule}, - ) - - # Verify the structure of the graph, including the existence of the - # exception_wrapper. - ph_a = exception_wrapper = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "a": - ph_a = node - else: - self.assertTrue(str(node.target) == "b") - elif node.op == "call_module": - self.assertEqual( - node.target, "_conditional_exception_wrapper_AssertionError" - ) - exception_wrapper = node - elif node.op == "output": - self.assertEqual(ph_a, node.args[0]) - - self.assertTrue(exception_wrapper is not None) - - self.assertTrue(torch.equal(m(in_a, in_b), traced(in_a, in_b))) - - def test_yes_raise(self): - """ - Test that we can trace `if x: raise y(msg)` when the raise is executed. - """ - err_str = "a equaled b!" 
- - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.err_str = err_str - - def forward(self, a, b): - if torch.equal(a, b): - raise RuntimeError(self.err_str) - return a - - m = TestModule() - # Note: We must use different inputs here in order for shape_prop to work, as - # otherwise the exception is thrown (as expected/checked below). - in_a, in_b = torch.randn(5), torch.randn(5) - traced = acc_tracer.trace( - m, - [in_a, in_b], - remove_exceptions=False, - ast_rewriter_allow_list={TestModule}, - ) - - # Verify the structure of the graph, including the existence of the - # exception_wrapper. - ph_a = exception_wrapper = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "a": - ph_a = node - else: - self.assertTrue(str(node.target) == "b") - elif node.op == "call_module": - self.assertEqual( - node.target, "_conditional_exception_wrapper_RuntimeError" - ) - exception_wrapper = node - elif node.op == "output": - self.assertEqual(ph_a, node.args[0]) - - self.assertTrue(exception_wrapper is not None) - - def test(mod): - try: - # Note: Use the same input here to ensure the exception is thrown. - mod(in_a, in_a) - self.fail("Shouldn't get here because exception should be thrown.") - except RuntimeError as e: - self.assertEqual(err_str, str(e)) - - test(m) - test(traced) - - def test_remove_raise(self): - """ - Test that we can trace `if x: raise y(msg)` and then remove the exception_wrapper. - """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, a, b): - if torch.equal(a, b): - raise AssertionError("a equaled b!") - return a - - m = TestModule() - in_a, in_b = torch.randn(5), torch.randn(5) - traced = acc_tracer.trace( - m, - [in_a, in_b], - remove_exceptions=True, - ast_rewriter_allow_list={TestModule}, - ) - - # Verify the structure of the graph, including the existence of the - # exception_wrapper. - ph_a = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "a": - ph_a = node - else: - self.assertTrue(str(node.target) == "b") - elif node.op == "output": - self.assertEqual(ph_a, node.args[0]) - else: - # Should not encounter any call_modules, e.g. to the - # exception_wrapper. - self.assertFalse(node.op == "call_module") - - # Note: Using input in_a twice for the tracer version, which would - # trigger the raise if it was still there. - self.assertTrue(torch.equal(m(in_a, in_b), traced(in_a, in_a))) - - def test_raise_no_message(self): - """ - Test that we can trace `if x: raise y` when `y` has no message. - """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, a, b): - if torch.equal(a, b): - raise AssertionError - return a - - m = TestModule() - in_a, in_b = torch.randn(5), torch.randn(5) - traced = acc_tracer.trace( - m, - [in_a, in_b], - remove_exceptions=False, - use_acc_normalization=False, - ast_rewriter_allow_list={TestModule}, - ) - - # Verify the structure of the graph, including the existence of the - # exception_wrapper. 
- ph_a = exception_wrapper = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "a": - ph_a = node - else: - self.assertTrue(str(node.target) == "b") - elif node.op == "call_module": - self.assertEqual( - node.target, "_conditional_exception_wrapper_AssertionError" - ) - exception_wrapper = node - elif node.op == "output": - self.assertEqual(ph_a, node.args[0]) - - self.assertTrue(exception_wrapper is not None) - self.assertTrue(torch.equal(m(in_a, in_b), traced(in_a, in_b))) - - def test_quantized_add(self): - """ - Test that a quantized_add and acc_ops.quantize_per_tensor are traced as expected, - verifying the acc_out_tys are set as expected. - """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.q_input = torch.nn.quantized.Quantize( - scale=1.0 / 128, zero_point=5, dtype=torch.quint8 - ) - self.q_other = torch.nn.quantized.Quantize( - scale=1.0 / 128, zero_point=10, dtype=torch.quint8 - ) - - def forward(self, input: torch.Tensor, other: torch.Tensor) -> torch.Tensor: - return torch.ops.quantized.add( - self.q_input(input), - self.q_other(other), - scale=0.05, - zero_point=1, - ) - - m = TestModule() - input, other = torch.randn(2, 3, 4), torch.randn(2, 3, 4) - traced = acc_tracer.trace(m, [input, other]) - - input_ph = other_ph = q_input = q_other = q_add = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "input": - input_ph = node - else: - self.assertTrue(str(node.target) == "other") - other_ph = node - elif ( - node.op == "call_function" - and node.target == acc_ops.quantize_per_tensor - ): - qparams = { - "scale": 1.0 / 128, - "zero_point": 5, - } - expected_md = acc_utils.build_raw_tensor_meta( - dtype=torch.quint8, - qparams=qparams, - ) - if node.kwargs["input"] == input_ph: - q_input = node - else: - self.assertTrue(node.kwargs["input"] == other_ph) - q_other = node - qparams_copy = qparams.copy() - qparams_copy["zero_point"] = 10 - expected_md = expected_md._replace(qparams=qparams_copy) - self.assertEqual(node.kwargs["acc_out_ty"], expected_md) - elif node.op == "call_function" and node.target == acc_ops.quantized_add: - self.assertEqual(node.kwargs["input"], q_input) - self.assertEqual(node.kwargs["other"], q_other) - qparams = { - "scale": 0.05, - "zero_point": 1, - } - expected_md = acc_utils.build_raw_tensor_meta(qparams=qparams) - self.assertEqual(node.kwargs["acc_out_ty"], expected_md) - q_add = node - elif node.op == "output": - self.assertEqual(q_add, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(input, other), traced(input, other))) - - def test_quantized_mul(self): - """ - Test that a quantized_mul and acc_ops.quantize_per_tensor are traced as expected, - verifying the acc_out_tys are set as expected. 
- """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.q_input = torch.nn.quantized.Quantize( - scale=1.0 / 128, zero_point=5, dtype=torch.quint8 - ) - self.q_other = torch.nn.quantized.Quantize( - scale=1.0 / 128, zero_point=10, dtype=torch.quint8 - ) - - def forward(self, input: torch.Tensor, other: torch.Tensor) -> torch.Tensor: - return torch.ops.quantized.mul( - self.q_input(input), - self.q_other(other), - scale=0.05, - zero_point=1, - ) - - m = TestModule() - input, other = torch.randn(2, 3, 4), torch.randn(2, 3, 4) - traced = acc_tracer.trace(m, [input, other]) - - input_ph = other_ph = q_input = q_other = q_add = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "input": - input_ph = node - else: - self.assertTrue(str(node.target) == "other") - other_ph = node - elif ( - node.op == "call_function" - and node.target == acc_ops.quantize_per_tensor - ): - qparams = { - "scale": 1.0 / 128, - "zero_point": 5, - } - expected_md = acc_utils.build_raw_tensor_meta( - dtype=torch.quint8, - qparams=qparams, - ) - if node.kwargs["input"] == input_ph: - q_input = node - else: - self.assertTrue(node.kwargs["input"] == other_ph) - q_other = node - qparams_copy = qparams.copy() - qparams_copy["zero_point"] = 10 - expected_md = expected_md._replace(qparams=qparams_copy) - self.assertEqual(node.kwargs["acc_out_ty"], expected_md) - elif node.op == "call_function" and node.target == acc_ops.quantized_mul: - self.assertEqual(node.kwargs["input"], q_input) - self.assertEqual(node.kwargs["other"], q_other) - qparams = { - "scale": 0.05, - "zero_point": 1, - } - expected_md = acc_utils.build_raw_tensor_meta(qparams=qparams) - self.assertEqual(node.kwargs["acc_out_ty"], expected_md) - q_add = node - elif node.op == "output": - self.assertEqual(q_add, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(input, other), traced(input, other))) - - def test_cat(self): - """ - Test that torch.cat is traced correctly. - """ - - class TestModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: - return torch.cat([a, a, b], 0) - - m = TestModule() - a, b = torch.randn(2, 2), torch.randn(2, 2) - traced = acc_tracer.trace(m, (a, b)) - - ph_a = ph_b = cat = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "a": - ph_a = node - else: - self.assertTrue(str(node.target) == "b") - ph_b = node - elif node.op == "call_function": - self.assertEqual(node.target, acc_ops.cat) - self.assertEqual(node.kwargs["tensors"][0], ph_a) - self.assertEqual(node.kwargs["tensors"][1], ph_a) - self.assertEqual(node.kwargs["tensors"][2], ph_b) - self.assertEqual(node.kwargs["dim"], 0) - cat = node - elif node.op == "output": - self.assertEqual(cat, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(a, b), traced(a, b))) - - def test_square(self): - """ - Test that torch.square is traced correctly. - """ - self._make_acc_op_function_test(acc_ops.mul, torch.square) - - def test_reshape(self): - """ - Test that torch.reshape is traced correctly. 
- """ - self._make_acc_op_function_test(acc_ops.reshape, torch.reshape, (1, -1)) - # arg = (1, -1) - self._make_acc_op_function_test(acc_ops.reshape, lambda x: x.reshape(1, -1)) - # arg = ((1, -1)) - self._make_acc_op_function_test(acc_ops.reshape, lambda x: x.reshape((1, -1))) - - def test_transpose(self): - """ - Test that torch.transpose is traced correctly. - """ - self._make_acc_op_function_test( - acc_ops.permute, lambda x: torch.transpose(x, 1, 0) - ) - - def test_permute(self): - """ - Test that torch.permute is traced correctly. - """ - - def torch_permute(a, *dim): - return a.permute(*dim) - - self._make_acc_op_function_test(acc_ops.permute, torch_permute, 1, 0) - - def test_min_full_reduce(self): - """ - Test that test_min_full_reduce is traced correctly. - """ - self._make_acc_op_function_test(acc_ops.min_full_reduce, torch.min) - - def test_matmul(self): - """ - Test that torch.matmul is traced correctly. - """ - - class TestModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: - return torch.matmul(a, b) - - m = TestModule() - a, b = torch.randn(2, 2), torch.randn(2, 2) - traced = acc_tracer.trace(m, [a, b]) - - ph_a = ph_b = matmul = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "a": - ph_a = node - else: - self.assertTrue(str(node.target) == "b") - ph_b = node - elif node.op == "call_function": - self.assertEqual(node.target, acc_ops.matmul) - self.assertEqual(node.kwargs["input"], ph_a) - self.assertEqual(node.kwargs["other"], ph_b) - matmul = node - elif node.op == "output": - self.assertEqual(matmul, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(a, b), traced(a, b))) - - def test_bmm(self): - self._make_acc_op_function_test( - acc_ops.matmul, lambda x: torch.bmm(x, x), input_shape=(2, 4, 4) - ) - - def test_tile(self): - return self._make_acc_op_function_test( - acc_ops.tile, lambda x: torch.tile(x, (2, 1, 2)), input_shape=(1, 2) - ) - - def test_dropout(self): - self._make_acc_op_function_test( - None, - lambda x: nn.functional.dropout(x, training=False), - input_shape=(1, 2, 3), - ) - - def test_stochastic_depth(self): - self._make_acc_op_function_test( - None, - lambda x, p, mode, training: torchvision.ops.stochastic_depth( - x, p=p, mode=mode, training=training - ), - input_shape=(1, 2, 3), - p=0.5, - mode="row", - training=False, - ) - - def test_hardsigmoid(self): - self._make_acc_op_function_test( - acc_ops.hardsigmoid, - lambda x: nn.functional.hardsigmoid(x), - input_shape=(3, 4, 5), - ) - - def test_hardtanh(self): - self._make_acc_op_function_test( - acc_ops.hardtanh, - lambda x: nn.functional.hardtanh(x), - input_shape=(3, 4, 5), - ) - - def test_hardswish(self): - class TestModule(nn.Module): - def forward(self, x: torch.Tensor) -> torch.Tensor: - y = nn.functional.hardswish(x) - return y - - m = TestModule() - x = torch.randn(3, 4, 5) - traced = acc_tracer.trace(m, x) - ph_x = hardsigmoid_y = res_y = None - for node in traced.graph.nodes: - if node.op == "placeholder": - ph_x = node - elif node.op == "call_function" and node.target == acc_ops.hardsigmoid: - hardsigmoid_y = node - self.assertEqual(node.kwargs["input"], ph_x) - elif node.op == "call_function" and node.target == acc_ops.mul: - res_y = node - self.assertEqual(node.kwargs["input"], hardsigmoid_y) - self.assertEqual(node.kwargs["other"], ph_x) - elif node.op == "output": - 
self.assertEqual(node.args[0], res_y) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - ref = m(x) - res = traced(x) - torch.testing.assert_allclose(ref, res) - - def test_add_with_alpha(self): - """ - Test that normalization works for torch add with alpha, which requires special - normalization handling. - """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: - a1 = torch.add(a, b) - a2 = torch.add(a, b, alpha=1.0) - a3 = torch.add(a, b, alpha=0.5) - return a1, a2, a3 - - m = TestModule() - input_a = torch.randn(2, 3) - input_b = torch.randn(2, 3) - traced = acc_tracer.trace(m, [input_a, input_b]) - - ph_a = ph_b = add_1 = add_2 = add_3 = mul = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "a": - ph_a = node - elif str(node.target) == "b": - ph_b = node - else: - self.fail(f"Unexpected placeholder {node.target}.") - elif node.op == "call_function" and node.target == acc_ops.mul: - mul = node - self.assertEqual(node.kwargs["input"], ph_b) - self.assertEqual(node.kwargs["other"], 0.5) - elif node.op == "call_function" and node.target == acc_ops.add: - if add_1 is None: - add_1 = node - self.assertEqual(node.kwargs["input"], ph_a) - self.assertEqual(node.kwargs["other"], ph_b) - elif add_2 is None: - add_2 = node - self.assertEqual(node.kwargs["input"], ph_a) - self.assertEqual(node.kwargs["other"], ph_b) - elif add_3 is None: - add_3 = node - self.assertEqual(node.kwargs["input"], ph_a) - self.assertEqual(node.kwargs["other"], mul) - else: - self.fail(f"Unexpected add: {node.format_node()}") - elif node.op == "output": - self.assertEqual(node.args[0][0], add_1) - self.assertEqual(node.args[0][1], add_2) - self.assertEqual(node.args[0][2], add_3) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - ref = m(input_a, input_b) - res = traced(input_a, input_b) - self.assertTrue(torch.equal(ref[0], res[0])) - self.assertTrue(torch.equal(ref[1], res[1])) - self.assertTrue(torch.equal(ref[2], res[2])) - - def test_leaf_module_list(self): - """ - Test leaf_module_list is working properly. 
- """ - - class LeafModule(nn.Module): - def forward(self, x): - return x - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.mod = LeafModule() - - def forward(self, x): - return self.mod(x) - - x = torch.randn(1, 1) - mod = TestModule() - acc_mod = acc_tracer.trace( - mod, - [x], - leaf_module_list={LeafModule}, - ) - ph = leaf_module = None - for node in acc_mod.graph.nodes: - if node.op == "placeholder": - ph = node - elif node.op == "call_module": - leaf_module = node - self.assertEqual(leaf_module.target, "mod") - self.assertEqual(leaf_module.args[0], ph) - elif node.op == "output": - self.assertEqual(node.args[0], leaf_module) - else: - self.fail(f"Unexpected node: {node.format_node()}") - self.assertTrue(torch.equal(mod(x), acc_mod(x))) - - def test_sign(self): - self._make_acc_op_function_test(acc_ops.sign, torch.sign) - - def test_relu(self): - self._make_acc_op_function_test(acc_ops.relu, torch.relu) - - def test_leaky_relu(self): - self._make_acc_op_function_test( - acc_ops.leaky_relu, torch.nn.functional.leaky_relu - ) - - def test_elu(self): - self._make_acc_op_function_test(acc_ops.elu, torch.nn.functional.elu) - - def test_selu(self): - self._make_acc_op_function_test(acc_ops.selu, torch.nn.functional.selu) - - def test_softsign(self): - self._make_acc_op_function_test(acc_ops.softsign, torch.nn.functional.softsign) - - def test_sigmoid(self): - self._make_acc_op_function_test(acc_ops.sigmoid, torch.sigmoid) - - def test_sin(self): - self._make_acc_op_function_test(acc_ops.sin, torch.sin) - - def test_cos(self): - self._make_acc_op_function_test(acc_ops.cos, torch.cos) - - def test_tan(self): - self._make_acc_op_function_test(acc_ops.tan, torch.tan) - - def test_sinh(self): - self._make_acc_op_function_test(acc_ops.sinh, torch.sinh) - - def test_cosh(self): - self._make_acc_op_function_test(acc_ops.cosh, torch.cosh) - - def test_tanh(self): - self._make_acc_op_function_test(acc_ops.tanh, torch.tanh) - - def test_asin(self): - self._make_acc_op_function_test(acc_ops.asin, torch.asin) - - def test_acos(self): - self._make_acc_op_function_test(acc_ops.acos, torch.acos) - - def test_atan(self): - self._make_acc_op_function_test(acc_ops.atan, torch.atan) - - def test_exp(self): - self._make_acc_op_function_test(acc_ops.exp, torch.exp) - - def test_log(self): - self._make_acc_op_function_test(acc_ops.log, torch.log) - - def test_sqrt(self): - self._make_acc_op_function_test(acc_ops.sqrt, torch.sqrt) - - def test_reciprocal(self): - self._make_acc_op_function_test(acc_ops.reciprocal, torch.reciprocal) - - def test_abs(self): - self._make_acc_op_function_test(acc_ops.abs, torch.abs) - - def test_neg(self): - self._make_acc_op_function_test(acc_ops.neg, torch.neg) - - def test_floor(self): - self._make_acc_op_function_test(acc_ops.floor, torch.floor) - - def test_ceil(self): - self._make_acc_op_function_test(acc_ops.ceil, torch.ceil) - - def test_softmax(self): - self._make_acc_op_function_test(acc_ops.softmax, torch.nn.functional.softmax) - - def test_tensor_squeeze(self): - self._make_acc_op_function_test(acc_ops.squeeze, lambda x: x.squeeze()) - - def test_torch_squeeze(self): - self._make_acc_op_function_test(acc_ops.squeeze, lambda x: torch.squeeze(x)) - - def test_operator_mul(self): - self._make_acc_op_function_test(acc_ops.mul, lambda x: x * 7) - - def test_torch_mul(self): - self._make_acc_op_function_test(acc_ops.mul, lambda x: torch.mul(x, 7)) - - def test_div(self): - self._make_acc_op_function_test(acc_ops.div, lambda x: torch.div(x, 2)) - 
self._make_acc_op_function_test(acc_ops.div, lambda x: x / 2) - - def test_floor_div(self): - self._make_acc_op_function_test( - acc_ops.floor_div, lambda x: torch.div(x, 2, rounding_mode="floor") - ) - - def test_trunc_div(self): - self._make_acc_op_function_test( - acc_ops.trunc_div, lambda x: torch.div(x, 2, rounding_mode="trunc") - ) - self._make_acc_op_function_test( - acc_ops.trunc_div, lambda x: torch.floor_divide(x, 2) - ) - - def test_view(self): - """ - Test that Tensor.view is traced correctly. - """ - - self._make_acc_op_function_test(acc_ops.reshape, lambda x: x.view(1, -1)) - - def test_narrow(self): - """ - Test that torch.narrow is traced correctly. - """ - return self._make_acc_op_function_test( - acc_ops.slice_tensor, - torch.narrow, - validate_same_kwargs=False, - dim=1, - start=1, - length=2, - ) - - def test_pow(self): - self._make_acc_op_function_test(acc_ops.pow, torch.pow, exponent=2) - - def test_size(self): - class TestModule(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, a): - idx = a.size(1) - return a.shape[idx] - - m = TestModule() - a = torch.randn(2, 1, 4) - traced = acc_tracer.trace(m, [a]) - - ph_a = size_1 = size_2 = getitem_1 = getitem_2 = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertTrue(node.target == "a") - ph_a = node - elif node.op == "call_function" and node.target == acc_ops.size: - if size_1: - size_2 = node - self.assertTrue(size_2.kwargs["input"] is ph_a) - else: - size_1 = node - self.assertTrue(size_1.kwargs["input"] is ph_a) - elif node.op == "call_function" and node.target == acc_ops.getitem: - if getitem_1: - getitem_2 = node - self.assertTrue(getitem_2.kwargs["idx"] == getitem_1) - self.assertTrue(getitem_2.kwargs["input"] == size_2) - else: - getitem_1 = node - self.assertTrue(getitem_1.kwargs["idx"] == 1) - self.assertTrue(getitem_1.kwargs["input"] == size_1) - elif node.op == "output": - self.assertEqual(node.args[0], getitem_2) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - ref = m(a) - res = traced(a) - self.assertEqual(ref, res) - - def test_flatten(self): - """ - Test that torch.flatten is traced correctly. - """ - self._make_acc_op_function_test( - acc_ops.flatten, torch.flatten, start_dim=1, end_dim=1 - ) - self._make_acc_op_function_test(acc_ops.flatten, lambda x: x.flatten()) - - def test_topk_multi_output(self): - """ - Test that torch.topk multi outputs work. 
- """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return torch.topk(a, 3)[1] - - m = TestModule() - input_a = torch.randn(10) - traced = acc_tracer.trace(m, [input_a]) - - ph_a = topk = getitem = None - for node in traced.graph.nodes: - if node.op == "placeholder" and str(node.target) == "a": - ph_a = node - elif node.op == "call_function" and node.target == acc_ops.topk: - topk = node - self.assertEqual(node.kwargs["input"], ph_a) - self.assertEqual(node.kwargs["k"], 3) - elif node.op == "call_function" and node.target == acc_ops.getitem: - getitem = node - self.assertEqual(node.kwargs["input"], topk) - self.assertEqual(node.kwargs["idx"], 1) - elif node.op == "output": - self.assertEqual(node.args[0], getitem) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(input_a), traced(input_a))) - - def test_addmm_with_alpha_beta(self): - class TestModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward( - self, input: torch.Tensor, a: torch.Tensor, b: torch.Tensor - ) -> torch.Tensor: - return torch.addmm(input, a, b, alpha=1.2, beta=1.1) - - m = TestModule() - input, a, b = torch.randn(2, 2), torch.randn(2, 2), torch.randn(2, 2) - traced = acc_tracer.trace(m, [input, a, b]) - - ph_in = ph_a = ph_b = mm = add = mm_mul = add_mul = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "a": - ph_a = node - elif str(node.target) == "b": - ph_b = node - else: - self.assertTrue(str(node.target) == "input") - ph_in = node - elif node.op == "call_function": - if node.target == acc_ops.matmul: - self.assertEqual(node.kwargs["input"], ph_a) - self.assertEqual(node.kwargs["other"], ph_b) - mm = node - elif node.target == acc_ops.add: - self.assertEqual(node.kwargs["input"], mm_mul) - self.assertEqual(node.kwargs["other"], add_mul) - add = node - elif mm_mul: - self.assertEqual(node.kwargs["input"], ph_in) - self.assertEqual(node.kwargs["other"], 1.1) - add_mul = node - else: - self.assertEqual(node.kwargs["input"], mm) - self.assertEqual(node.kwargs["other"], 1.2) - mm_mul = node - elif node.op == "output": - self.assertEqual(add, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - torch.testing.assert_allclose(m(input, a, b), traced(input, a, b)) - - def test_log1p(self): - class TestModule(torch.nn.Module): - def forward(self, input: torch.Tensor) -> torch.Tensor: - return torch.log1p(input) - - m = TestModule().eval() - input = torch.tensor([[1.2, 0.3, -0.4]]) - traced = acc_tracer.trace(m, [input]) - - ph_in = add = log = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertTrue(str(node.target) == "input") - ph_in = node - elif node.op == "call_function": - if node.target == acc_ops.add: - self.assertEqual(node.kwargs["input"], ph_in) - self.assertEqual(node.kwargs["other"], 1) - add = node - else: - self.assertEqual(node.target, acc_ops.log) - self.assertEqual(node.kwargs["input"], add) - log = node - elif node.op == "output": - self.assertEqual(log, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - torch.testing.assert_allclose(m(input), traced(input)) - - def test_addmm(self): - class TestModule(torch.nn.Module): - def forward( - self, input: torch.Tensor, a: torch.Tensor, b: torch.Tensor - ) -> torch.Tensor: - return torch.addmm(input, a, b) - - m = TestModule() - input, a, b = torch.randn(2, 2), 
torch.randn(2, 2), torch.randn(2, 2) - traced = acc_tracer.trace(m, [input, a, b]) - - ph_in = ph_a = ph_b = mm = add = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "a": - ph_a = node - elif str(node.target) == "b": - ph_b = node - else: - self.assertTrue(str(node.target) == "input") - ph_in = node - elif node.op == "call_function": - if node.target == acc_ops.matmul: - self.assertEqual(node.kwargs["input"], ph_a) - self.assertEqual(node.kwargs["other"], ph_b) - mm = node - else: - self.assertEqual(node.target, acc_ops.add) - self.assertEqual(node.kwargs["input"], mm) - self.assertEqual(node.kwargs["other"], ph_in) - add = node - elif node.op == "output": - self.assertEqual(add, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(input, a, b), traced(input, a, b))) - - def test_gelu(self): - return self._make_acc_op_function_test(acc_ops.gelu, torch.nn.functional.gelu) - - @parameterized.expand( - [ - (1, True), - (1, False), - (None, False), - ] - ) - def test_argmin(self, dim, keepdim): - class TestModule(torch.nn.Module): - def __init__(self, dim, keepdim): - super().__init__() - self.dim = dim - self.keepdim = keepdim - - def forward(self, input: torch.Tensor) -> torch.Tensor: - return torch.argmin(input, dim=self.dim, keepdim=self.keepdim) - - m = TestModule(dim, keepdim) - input = torch.randn(2, 2) - traced = acc_tracer.trace(m, [input]) - - ph_in = flatten = topk = getitem = squeeze = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertTrue(str(node.target) == "input") - ph_in = node - elif node.op == "call_function": - if node.target == acc_ops.flatten: - self.assertEqual(node.kwargs["input"], ph_in) - flatten = node - elif node.target == acc_ops.topk: - self.assertEqual( - node.kwargs["input"], flatten if flatten else ph_in - ) - topk = node - elif node.target == acc_ops.getitem: - self.assertEqual(node.kwargs["input"], topk) - getitem = node - elif node.target == acc_ops.squeeze: - self.assertEqual(node.kwargs["input"], getitem) - squeeze = node - elif node.op == "output": - self.assertEqual(squeeze if squeeze else getitem, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - if dim is None: - self.assertTrue(flatten is not None) - if not keepdim: - self.assertTrue(squeeze is not None) - self.assertTrue(torch.equal(m(input), traced(input))) - - def test_t(self): - """ - Test Tensor.t() is traced correctly. 
- """ - self._make_acc_op_function_test(acc_ops.permute, lambda x: x.t()) - self._make_acc_op_function_test( - acc_ops.permute, lambda x: x.t(), input_shape=(3,) - ) - - def test_split_size(self): - self._make_acc_op_function_test( - acc_ops.split, - torch.split, - validate_same_kwargs=False, - split_size_or_sections=2, - dim=1, - ) - - def test_split_sections(self): - class TestModule(torch.nn.Module): - def forward(self, input: torch.Tensor) -> torch.Tensor: - return torch.split(input, [2, 5, 3], 1) - - m = TestModule() - input = torch.randn(1, 10) - traced = acc_tracer.trace(m, [input]) - - ph_in = slice_node_0 = slice_node_1 = slice_node_2 = None - tuple_construct_node = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertTrue(str(node.target) == "input") - ph_in = node - elif node.op == "call_function": - if node.target == acc_ops.slice_tensor: - self.assertEqual(node.kwargs["input"], ph_in) - if slice_node_0: - if slice_node_1: - slice_node_2 = node - else: - slice_node_1 = node - else: - slice_node_0 = node - else: - self.assertEqual(node.target, acc_ops.tuple_construct) - self.assertEqual( - node.kwargs["tensors"], - (slice_node_0, slice_node_1, slice_node_2), - ) - tuple_construct_node = node - elif node.op == "output": - self.assertEqual(tuple_construct_node, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - ref_output = m(input) - output = traced(input) - for i, j in zip(ref_output, output): - self.assertTrue(torch.equal(i, j)) - - @parameterized.expand( - [ - ("neg_1", -1, 1, 3), - ("neg_2", -2, 1, 3), - ("neg_4", -4, 1, 1), - ] - ) - def test_negative_slicing(self, _, dim, start, length): - """ - Test that slicing with negative dims works. - """ - self._make_acc_op_function_test( - acc_ops.slice_tensor, - torch.narrow, - input_shape=(2, 3, 4, 5), - validate_same_kwargs=False, - dim=dim, - start=start, - length=length, - ) - - def test_list_input(self): - """ - Test that list inputs are traced correctly. - """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, a: List[torch.Tensor]) -> torch.Tensor: - return a[0] + a[1] - - m = TestModule() - input = [torch.randn(2, 3), torch.randn(2, 3)] - traced = acc_tracer.trace(m, [input]) - - ph = getitem_0 = getitem_1 = add = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertEqual(str(node.target), "a") - ph = node - elif node.op == "call_function" and node.target == acc_ops.getitem: - self.assertTrue(node.kwargs["idx"] == 0 or node.kwargs["idx"] == 1) - if node.kwargs["idx"] == 0: - getitem_0 = node - else: - getitem_1 = node - elif node.op == "call_function": - self.assertEqual(node.target, acc_ops.add) - self.assertEqual(node.kwargs["input"], getitem_0) - self.assertEqual(node.kwargs["other"], getitem_1) - add = node - elif node.op == "output": - self.assertEqual(add, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - # Check the tensor metadatas are correct given the input is a list. - self.assertTrue(isinstance(ph.meta["tensor_meta"], list)) - self.assertEqual(len(ph.meta["tensor_meta"]), 2) - self.assertEqual(getitem_0.meta["tensor_meta"], ph.meta["tensor_meta"][0]) - self.assertEqual(getitem_1.meta["tensor_meta"], ph.meta["tensor_meta"][1]) - - self.assertTrue(torch.equal(m(input), traced(input))) - - def test_mobilenet_v3(self): - """ - Test that we can trace mobilenet v3 small and run/compare against the untraced version. 
- """ - m = torchvision.models.mobilenet_v3_small(pretrained=True) - self._make_model_unit_test(m, enable_allclose=True) - - def test_mobilenet_v2(self): - """ - Test that we can trace mobilenet v2 small and run/compare against the untraced version. - """ - m = torchvision.models.mobilenet_v2(pretrained=True) - self._make_model_unit_test(m) - - def test_vgg16(self): - """ - Test that we can trace vgg16 and run/compare against the untraced version. - """ - m = torchvision.models.vgg16(pretrained=True) - self._make_model_unit_test(m) - - def test_resnet18(self): - """ - Test that we can trace resnet18 and run/compare against the untraced version. - """ - m = torchvision.models.resnet18(pretrained=True) - self._make_model_unit_test(m) - - def test_resnext50_32x4d(self): - """ - Test that we can trace resnext and run/compare against the untraced version. - """ - m = torchvision.models.resnext50_32x4d(pretrained=True) - self._make_model_unit_test(m) - - def test_cumsum(self): - self._make_acc_op_function_test(acc_ops.cumsum, torch.cumsum, dim=1) - self._make_acc_op_function_test( - acc_ops.cumsum, torch.cumsum, dim=1, dtype=torch.float - ) - - def test_chunk(self): - self._make_acc_op_function_test(acc_ops.chunk, torch.chunk, chunks=2, dim=0) - - def test_retrace_reshape(self): - """ - Retrace reshape to verify it's retraceable. - """ - - class TestModule(torch.nn.Module): - def forward(self, a: torch.Tensor) -> torch.Tensor: - return a.reshape(a.size()[0], 1, 2) - - m = TestModule() - a = torch.randn(2, 2) - gm = acc_tracer.trace(m, [a]) - self.assertTrue(torch.equal(m(a), gm(a))) - gm_retrace = acc_tracer.trace(gm, [a]) - self.assertTrue(torch.equal(m(a), gm_retrace(a))) - - def test_all_acc_ops_registered(self): - self.assertEqual( - acc_normalizer._acc_ops, - { - acc_ops.linear, - acc_ops.max_pool2d, - acc_ops.flatten, - acc_ops.adaptive_avg_pool2d, - acc_ops.avg_pool2d, - acc_ops.add, - acc_ops.min_full_reduce, - acc_ops.min_dim_reduce, - acc_ops.minimum, - acc_ops.cat, - acc_ops.softmax, - acc_ops.sign, - acc_ops.permute, - acc_ops.matmul, - acc_ops.quantize_per_tensor, - acc_ops.quantize_per_channel, - acc_ops.quantized_add, - acc_ops.quantized_mul, - acc_ops.dequantize, - acc_ops.sub, - acc_ops.mul, - acc_ops.div, - acc_ops.floor_div, - acc_ops.trunc_div, - acc_ops.pow, - acc_ops.relu, - acc_ops.leaky_relu, - acc_ops.elu, - acc_ops.selu, - acc_ops.softsign, - acc_ops.tuple_construct, - acc_ops.unsqueeze, - acc_ops.sigmoid, - acc_ops.sum, - acc_ops.prod, - acc_ops.max_full_reduce, - acc_ops.max_dim_reduce, - acc_ops.maximum, - acc_ops.sinh, - acc_ops.cosh, - acc_ops.tanh, - acc_ops.asin, - acc_ops.acos, - acc_ops.atan, - acc_ops.exp, - acc_ops.log, - acc_ops.sqrt, - acc_ops.reciprocal, - acc_ops.abs, - acc_ops.neg, - acc_ops.floor, - acc_ops.ceil, - acc_ops.size, - acc_ops.split, - acc_ops.conv2d, - acc_ops.batch_norm, - acc_ops.embedding_bag, - acc_ops.embedding_bag_byte_rowwise_offsets, - acc_ops.embedding_bag_4bit_rowwise_offsets, - acc_ops.contiguous, - acc_ops.pad, - acc_ops.sin, - acc_ops.cos, - acc_ops.tan, - acc_ops.topk, - acc_ops.getitem, - acc_ops.squeeze, - acc_ops.tile, - acc_ops.reshape, - acc_ops.quantized_linear, - acc_ops.quantized_conv2d, - acc_ops.quantized_batch_norm2d, - acc_ops.to_dtype, - acc_ops.clamp, - acc_ops.layer_norm, - acc_ops.linalg_norm, - acc_ops.slice_tensor, - acc_ops.hardsigmoid, - acc_ops.mean, - acc_ops.hardtanh, - acc_ops.gelu, - acc_ops.cumsum, - acc_ops.chunk, - acc_ops.rescale_quantize_per_tensor, - acc_ops.rescale_quantize_per_channel, - 
acc_ops.nan_to_num, - }, - ) diff --git a/test/jit/fixtures/test_versioned_gelu_out_v9.ptl b/test/jit/fixtures/test_versioned_gelu_out_v9.ptl new file mode 100644 index 000000000000..208ae5100757 Binary files /dev/null and b/test/jit/fixtures/test_versioned_gelu_out_v9.ptl differ diff --git a/test/jit/fixtures/test_versioned_gelu_v9.ptl b/test/jit/fixtures/test_versioned_gelu_v9.ptl new file mode 100644 index 000000000000..5e4ffb20f823 Binary files /dev/null and b/test/jit/fixtures/test_versioned_gelu_v9.ptl differ diff --git a/test/jit/fixtures_srcs/fixtures_src.py b/test/jit/fixtures_srcs/fixtures_src.py index 545152b6a3a0..dff23702311a 100644 --- a/test/jit/fixtures_srcs/fixtures_src.py +++ b/test/jit/fixtures_srcs/fixtures_src.py @@ -42,3 +42,18 @@ def __init__(self): def forward(self, a: Union[int, float, complex], b: Union[int, float, complex], out: torch.Tensor): return torch.logspace(a, b, out=out) + +class TestVersionedGeluV9(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return torch._C._nn.gelu(x) + +class TestVersionedGeluOutV9(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + out = torch.zeros_like(x) + return torch._C._nn.gelu(x, out=out) diff --git a/test/jit/fixtures_srcs/generate_models.py b/test/jit/fixtures_srcs/generate_models.py index 36b6b5ffe684..e00153745138 100644 --- a/test/jit/fixtures_srcs/generate_models.py +++ b/test/jit/fixtures_srcs/generate_models.py @@ -52,7 +52,7 @@ def div_Tensor_0_3(self: Tensor, other: Tensor) -> Tensor: fbcode/caffe2/torch/csrc/jit/mobile/upgrader_mobile.cpp ``` -python pytorch/tools/codegen/operator_versions/gen_mobile_upgraders.py +python pytorch/torchgen/operator_versions/gen_mobile_upgraders.py ``` 4. Generate the test to cover upgrader. @@ -94,6 +94,8 @@ def div_Tensor_0_3(self: Tensor, other: Tensor) -> Tensor: TestVersionedLinspaceOutV7(): "aten::linspace.out", TestVersionedLogspaceV8(): "aten::logspace", TestVersionedLogspaceOutV8(): "aten::logspace.out", + TestVersionedGeluV9(): "aten::gelu", + TestVersionedGeluOutV9(): "aten::gelu.out", } """ diff --git a/test/jit/myexception.py b/test/jit/myexception.py new file mode 100644 index 000000000000..5937bd3c91b7 --- /dev/null +++ b/test/jit/myexception.py @@ -0,0 +1,8 @@ +r""" +Define exceptions used in test_exception.py. We define them in a +separate file on purpose to make sure the fully qualified exception class name +is captured correctly in such cases. 
+""" +class MyKeyError(KeyError): + def __init__(self, msg): + super(KeyError, self).__init__(msg) diff --git a/test/jit/test_alias_analysis.py b/test/jit/test_alias_analysis.py index 00c015ccfab2..2f8216eaaf9a 100644 --- a/test/jit/test_alias_analysis.py +++ b/test/jit/test_alias_analysis.py @@ -42,3 +42,52 @@ def foo(x): output = next(graph.outputs()) self.assertTrue(alias_db.may_contain_alias(ten_construct, output)) self.assertFalse(alias_db.may_contain_alias(next(graph.inputs()), ten_construct)) + + def test_recursive_calls(self): + @torch.jit.script + def foo(x, y): + x.add_(1) + return x + y + + @torch.jit.script + def caller(): + a = torch.rand([2, 2]) + b = torch.ones([2, 2]) + out1 = foo(a, b) + c = torch.rand([1]) + d = torch.ones([2]) + out2 = foo(d, c) + return out1, out2 + + isFrozen = False + descend_function_calls = True + alias_db = caller.graph.alias_db(isFrozen, descend_function_calls) + func_calls = caller.graph.findAllNodes("prim::CallFunction") + self.assertEqual(len(func_calls), 2) + for node in func_calls: + inps = list(node.inputs()) + self.assertTrue(alias_db.has_writers(inps[1])) + self.assertFalse(alias_db.has_writers(inps[2])) + + class Mod(torch.nn.Module): + def forward(self): + a = torch.rand([2, 2]) + b = torch.ones([2, 2]) + out1 = self.foo2(a, b) + c = torch.rand([1]) + d = torch.ones([2]) + out2 = self.foo2(d, c) + return out1, out2 + + def foo2(self, x, y): + x.add_(1) + return x + y + + mod = torch.jit.script(Mod()) + alias_db = mod.graph.alias_db(isFrozen, descend_function_calls) + func_calls = mod.graph.findAllNodes("prim::CallMethod") + self.assertEqual(len(func_calls), 2) + for node in func_calls: + inps = list(node.inputs()) + self.assertTrue(alias_db.has_writers(inps[1])) + self.assertFalse(alias_db.has_writers(inps[2])) diff --git a/test/jit/test_autodiff.py b/test/jit/test_autodiff.py new file mode 100644 index 000000000000..518826f602e1 --- /dev/null +++ b/test/jit/test_autodiff.py @@ -0,0 +1,51 @@ +# Owner(s): ["oncall: jit"] + +import torch + +from torch.testing._internal.jit_utils import JitTestCase +from typing import List + +class TestAutodiffJit(JitTestCase): + def test_undefined_tensor_lists(self): + def fn(tensor_list: List[torch.Tensor], add_tensor): + cat = torch.cat(tensor_list, dim=1) + r = torch.sin(cat + add_tensor) + return r + + fn_s = torch.jit.script(fn) + + a = torch.rand((3, 6), requires_grad=True) + b = torch.rand((3, 10), requires_grad=True) + x = [a, b] + y = torch.rand((3, 16), requires_grad=True) + + ret = fn_s(x, y) + ret.sum().backward() + ret = fn_s(x, y) + ret.sum().backward() + + ret = fn_s(x, y) + s = ret.sum() + + # backward_fn expects 2 inputs: (grad_output, current_grad_r) + # current_grad_r is provided because we need to add this contribution + # to grad_r when we return it. 
+ backward_fn = s.grad_fn.next_functions[0][0] + + # check behavior with defined tensor + grad_out = torch.rand((3, 16)) + grad_inputs = backward_fn(grad_out, None) + + # expect 3 tensors: grad_y, grad_a, grad_b + self.assertEqual(3, len(grad_inputs)) + for x in grad_inputs: + self.assertTrue(isinstance(x, torch.Tensor)) + + # now test with undefined grad_out + grad_inputs = backward_fn(None, None) + + # expect all of them to be None + self.assertEqual(3, len(grad_inputs)) + for x in grad_inputs: + if x is not None: + self.assertEqual(0, torch.max(torch.abs(x)).item()) diff --git a/test/jit/test_autodiff_subgraph_slicing.py b/test/jit/test_autodiff_subgraph_slicing.py index 8454f786edb8..4b72fc6f4561 100644 --- a/test/jit/test_autodiff_subgraph_slicing.py +++ b/test/jit/test_autodiff_subgraph_slicing.py @@ -447,7 +447,7 @@ def test_aliased_outputs(self): %0 : int[] = prim::Constant[value=[2, 2, 1]]() %1 : int = prim::Constant[value=0]() %2 : Tensor = aten::t(%b) - %3 : Tensor = aten::gelu(%2) + %3 : Tensor = aten::relu(%2) %4 : (Tensor, Tensor, Tensor[]) = prim::TupleConstruct(%b, %3, %2) return (%4) """ @@ -471,7 +471,7 @@ def test_aliased_outputs(self): %1 : int = prim::Constant[value=0]() %d : Tensor = aten::t(%c) %2 : Tensor = aten::t(%b) - %3 : Tensor = aten::gelu(%2) + %3 : Tensor = aten::relu(%2) %4 : (Tensor, Tensor, Tensor[]) = prim::TupleConstruct(%3, %2, %d, %b, %c, %b) return (%4) """ diff --git a/test/jit/test_backends.py b/test/jit/test_backends.py index 086a44eee3f1..0ed7d0c19b2d 100644 --- a/test/jit/test_backends.py +++ b/test/jit/test_backends.py @@ -81,7 +81,7 @@ def setUp(self): # Subclasses are expected to set up three variables in their setUp methods: # module - a regular, Python version of the module being tested # scripted_module - a scripted version of module - # lowered_modle - a version of module lowered to a backend + # lowered_module - a version of module lowered to a backend def check_function(self, function_name, input): """ @@ -498,7 +498,7 @@ def setUp(self): # Subclasses are expected to set up four variables in their setUp methods: # module - a regular, Python version of the module being tested # scripted_module - a scripted version of module - # lowered_modle - a version of module lowered to a backend + # lowered_module - a version of module lowered to a backend # mobile_module - a module with a format that Pytorch Mobile can execute def check_forward(self, input): diff --git a/test/jit/test_class_type.py b/test/jit/test_class_type.py index 56c3831341ee..09a58b3cd735 100644 --- a/test/jit/test_class_type.py +++ b/test/jit/test_class_type.py @@ -1430,8 +1430,8 @@ def __init__(self, val): class Mod(nn.Module): def __init__(self): super(Mod, self).__init__() - self.mod1 = ValHolder(1) - self.mod2 = ValHolder(2) + self.mod1 = ValHolder("1") + self.mod2 = ValHolder("2") def forward(self, cond: bool): if cond: diff --git a/test/jit/test_complex.py b/test/jit/test_complex.py index e4137067a8ea..3b7d34427167 100644 --- a/test/jit/test_complex.py +++ b/test/jit/test_complex.py @@ -328,3 +328,33 @@ def tensor_imag(x): t = torch.randn(2, 3, dtype=torch.cdouble) self.checkScript(tensor_real, (t, )) self.checkScript(tensor_imag, (t, )) + + def test_binary_op_complex_tensor(self): + def mul(x: complex, y: torch.Tensor): + return x * y + + def add(x: complex, y: torch.Tensor): + return x + y + + def eq(x: complex, y: torch.Tensor): + return x == y + + def ne(x: complex, y: torch.Tensor): + return x != y + + def sub(x: complex, y: torch.Tensor): + return x - y + + def 
div(x: complex, y: torch.Tensor): + return x - y + + ops = [mul, add, eq, ne, sub, div] + + for shape in [(1, ), (2, 2)]: + x = 0.71 + 0.71j + y = torch.randn(shape, dtype=torch.cfloat) + for op in ops: + eager_result = op(x, y) + scripted = torch.jit.script(op) + jit_result = scripted(x, y) + self.assertEqual(eager_result, jit_result) diff --git a/test/jit/test_custom_operators.py b/test/jit/test_custom_operators.py index cdb973590cb4..feb3b8eb8fb6 100644 --- a/test/jit/test_custom_operators.py +++ b/test/jit/test_custom_operators.py @@ -50,10 +50,6 @@ def test_default_arguments_are_used(self): output = torch.ops._test.leaky_relu(torch.tensor([-1.0, 1.0])) self.assertEqual(output, torch.tensor([-0.01, 1])) - def test_only_kwargs(self): - output = torch.ops._test.leaky_relu(self=torch.tensor(-1.0)) - self.assertEqual(output, torch.tensor(-0.01)) - def test_passing_too_many_args(self): with self.assertRaisesRegexWithHighlight( RuntimeError, @@ -78,14 +74,6 @@ def test_passing_one_positional_but_not_the_second(self): ): torch.ops.aten.type_as(torch.ones(5, 5)) - def test_passing_an_argument_both_as_positional_and_kwarg(self): - with self.assertRaisesRegexWithHighlight( - RuntimeError, - "Argument 'self' specified both as positional and keyword argument", - "" - ): - torch.ops._test.leaky_relu(torch.ones(5), self=torch.ones(5)) - def test_passing_unknown_kwargs(self): with self.assertRaisesRegexWithHighlight( RuntimeError, diff --git a/test/jit/test_device_analysis.py b/test/jit/test_device_analysis.py index efdc2fc92e6c..3ce42e171b65 100644 --- a/test/jit/test_device_analysis.py +++ b/test/jit/test_device_analysis.py @@ -6,6 +6,7 @@ import torch from torch.testing._internal.common_utils import TEST_CUDA from torch.testing._internal.jit_utils import JitTestCase +from torch.jit._passes._property_propagation import apply_input_props_using_example try: from torchvision import models @@ -19,40 +20,6 @@ "instead." ) -# TODO: Delete this when PR #67786 is merged. -def apply_input_props_using_example(graph, example_input): - """ - Applies properties for each tensor in the graph inputs - using the example supplied. - """ - graph_inputs = list(graph.inputs()) - if len(graph_inputs) == 0: - return - - # Strip self args off for methods - in_0 = graph_inputs[0] - if isinstance(in_0.type(), torch._C.ClassType) and in_0.debugName() == "self": - graph_inputs = graph_inputs[1:] - - if not len(graph_inputs) == len(example_input): - raise RuntimeError( - "Number of inputs in graph does not match number of inputs in the example" - ) - - for i, (graph_i, example_i) in enumerate(zip(graph_inputs, example_input)): - if example_i is None: - continue # Skip the type check - - if isinstance(example_i, torch.Tensor) != isinstance( - graph_i.type(), torch.TensorType - ): - raise RuntimeError( - f"Input {i} does not match type of example", graph_i, example_i - ) - - if isinstance(example_i, torch.Tensor): - graph_i.setType(torch.TensorType.create_from_tensor(example_i)) # type: ignore[arg-type] - class TestDeviceAnalysis(JitTestCase): @classmethod diff --git a/test/jit/test_exception.py b/test/jit/test_exception.py new file mode 100644 index 000000000000..dce38e3be892 --- /dev/null +++ b/test/jit/test_exception.py @@ -0,0 +1,176 @@ +# Owner(s): ["oncall: jit"] +from torch.testing._internal.common_utils import TestCase +import torch +from torch import nn + +r""" +Test TorchScript exception handling. 
+""" +class TestException(TestCase): + def test_pyop_exception_message(self): + class Foo(torch.jit.ScriptModule): + def __init__(self): + super(Foo, self).__init__() + self.conv = nn.Conv2d(1, 10, kernel_size=5) + + @torch.jit.script_method + def forward(self, x): + return self.conv(x) + foo = Foo() + # testing that the correct error message propagates + with self.assertRaisesRegex(RuntimeError, r"Expected 3D \(unbatched\) or 4D \(batched\) input to conv2d"): + foo(torch.ones([123])) # wrong size + + def test_builtin_error_messsage(self): + with self.assertRaisesRegex(RuntimeError, "Arguments for call are not valid"): + @torch.jit.script + def close_match(x): + return x.masked_fill(True) + + with self.assertRaisesRegex(RuntimeError, "This op may not exist or may not be currently " + "supported in TorchScript"): + @torch.jit.script + def unknown_op(x): + torch.set_anomaly_enabled(True) + return x + + def test_exceptions(self): + cu = torch.jit.CompilationUnit(''' + def foo(cond): + if bool(cond): + raise ValueError(3) + return 1 + ''') + + cu.foo(torch.tensor(0)) + with self.assertRaisesRegex(torch.jit.Error, "3"): + cu.foo(torch.tensor(1)) + + def foo(cond): + a = 3 + if bool(cond): + raise ArbitraryError(a, "hi") + if 1 == 2: + raise ArbitraryError + return a + + with self.assertRaisesRegex(RuntimeError, "undefined value ArbitraryError"): + torch.jit.script(foo) + + def exception_as_value(): + a = Exception() + print(a) + + with self.assertRaisesRegex(RuntimeError, "cannot be used as a value"): + torch.jit.script(exception_as_value) + + @torch.jit.script + def foo_no_decl_always_throws(): + raise RuntimeError("Hi") + + # function that has no declared type but always throws set to None + output_type = next(foo_no_decl_always_throws.graph.outputs()).type() + self.assertTrue(str(output_type) == "NoneType") + + @torch.jit.script + def foo_decl_always_throws(): + # type: () -> Tensor + raise Exception("Hi") + + output_type = next(foo_decl_always_throws.graph.outputs()).type() + self.assertTrue(str(output_type) == "Tensor") + + def foo(): + raise 3 + 4 + + with self.assertRaisesRegex(RuntimeError, "must derive from BaseException"): + torch.jit.script(foo) + + # a escapes scope + @torch.jit.script + def foo(): + if 1 == 1: + a = 1 + else: + if 1 == 1: + raise Exception("Hi") + else: + raise Exception("Hi") + return a + self.assertEqual(foo(), 1) + + @torch.jit.script + def tuple_fn(): + raise RuntimeError("hello", "goodbye") + + with self.assertRaisesRegex(torch.jit.Error, "hello, goodbye"): + tuple_fn() + + @torch.jit.script + def no_message(): + raise RuntimeError + + with self.assertRaisesRegex(torch.jit.Error, "RuntimeError"): + no_message() + + def test_assertions(self): + cu = torch.jit.CompilationUnit(''' + def foo(cond): + assert bool(cond), "hi" + return 0 + ''') + + cu.foo(torch.tensor(1)) + with self.assertRaisesRegex(torch.jit.Error, "AssertionError: hi"): + cu.foo(torch.tensor(0)) + + @torch.jit.script + def foo(cond): + assert bool(cond), "hi" + + foo(torch.tensor(1)) + # we don't currently validate the name of the exception + with self.assertRaisesRegex(torch.jit.Error, "AssertionError: hi"): + foo(torch.tensor(0)) + + def test_python_op_exception(self): + @torch.jit.ignore + def python_op(x): + raise Exception("bad!") + + @torch.jit.script + def fn(x): + return python_op(x) + + with self.assertRaisesRegex(RuntimeError, "operation failed in the TorchScript interpreter"): + fn(torch.tensor(4)) + + def test_dict_expansion_raises_error(self): + def fn(self): + d = {"foo": 1, "bar": 
2, "baz": 3} + return {**d} + + with self.assertRaisesRegex(torch.jit.frontend.NotSupportedError, + "Dict expansion "): + torch.jit.script(fn) + + def test_custom_python_exception(self): + class MyValueError(ValueError): + def __init__(self, msg): + super(MyValueError, self).__init__(msg) + + @torch.jit.script + def fn(): + raise MyValueError("test custom exception") + + with self.assertRaisesRegex(torch.jit.Error, "jit.test_exception.MyValueError: test custom exception"): + fn() + + def test_custom_python_exception_defined_elsewhere(self): + from jit.myexception import MyKeyError + + @torch.jit.script + def fn(): + raise MyKeyError("This is a user defined key error") + with self.assertRaisesRegex(torch.jit.Error, "jit.myexception.MyKeyError: This is a user defined key error"): + fn() diff --git a/test/jit/test_export_modes.py b/test/jit/test_export_modes.py index 70d2193201a3..dbf10cddc059 100644 --- a/test/jit/test_export_modes.py +++ b/test/jit/test_export_modes.py @@ -15,7 +15,7 @@ pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) sys.path.append(pytorch_test_dir) from torch.testing._internal.jit_utils import JitTestCase -from torch.testing._internal.common_utils import skipIfNoLapack +from torch.testing._internal.common_utils import skipIfNoLapack, skipIfCaffe2, skipIfNoCaffe2 if __name__ == '__main__': raise RuntimeError("This test file is not meant to be run directly, use:\n\n" @@ -68,8 +68,9 @@ def foo(a): x = torch.ones(3) torch.onnx._export(foo, (x,), f) + @skipIfNoCaffe2 @skipIfNoLapack - def test_aten_fallback(self): + def test_caffe2_aten_fallback(self): class ModelWithAtenNotONNXOp(nn.Module): def forward(self, x, y): abcd = x + y @@ -84,6 +85,25 @@ def forward(self, x, y): do_constant_folding=False, operator_export_type=OperatorExportTypes.ONNX_ATEN_FALLBACK) + @skipIfCaffe2 + @skipIfNoLapack + def test_aten_fallback(self): + class ModelWithAtenNotONNXOp(nn.Module): + def forward(self, x, y): + abcd = x + y + defg = torch.linalg.qr(abcd) + return defg + + x = torch.rand(3, 4) + y = torch.rand(3, 4) + torch.onnx.export_to_pretty_string( + ModelWithAtenNotONNXOp(), (x, y), + add_node_names=False, + do_constant_folding=False, + operator_export_type=OperatorExportTypes.ONNX_ATEN_FALLBACK, + # support for linalg.qr was added in later op set versions. + opset_version=9) + # torch.fmod is using to test ONNX_ATEN. # If you plan to remove fmod from aten, or found this test failed. # please contact @Rui. diff --git a/test/jit/test_freezing.py b/test/jit/test_freezing.py index b4cc5a10a075..599ada43f1b2 100644 --- a/test/jit/test_freezing.py +++ b/test/jit/test_freezing.py @@ -1678,6 +1678,36 @@ def make_prediction(self, x): scripted_mod = torch.jit.freeze(scripted_mod, preserved_attrs=["make_prediction", "amt"]) FileCheck().check("conv").check_not("aten::batch_norm").run(scripted_mod.make_prediction.graph) + @unittest.skipIf(True, "Caching allocator leak sometimes causes failures") + @unittest.skipIf(not TEST_CUDA, "Optimization currently only run for GPU") + def test_conv_bn_folding_autocast_scenario_cuda(self): + # CUDA conv takes input tensors which must all be the same dtype, + # which can cause issues if folding produces inputs of different dtypes. 
+ + class ConvBN(torch.nn.Module): + def __init__(self, in_channels, out_channels, **kwargs): + super(ConvBN, self).__init__() + self.conv = torch.nn.Conv2d(in_channels, out_channels, bias=False, dtype=torch.half, **kwargs) + self.bn = torch.nn.BatchNorm2d(out_channels, eps=0.001, dtype=torch.float) + + def forward(self, x): + return self.bn(self.conv(x)) + + mod_eager = ConvBN(3, 32, kernel_size=3, stride=2).cuda().eval() + scripted_mod = torch.jit.script(mod_eager) + scripted_mod = torch.jit.freeze(scripted_mod) + FileCheck().check("conv").check_not("aten::batch_norm").run(scripted_mod.graph) + conv_node = scripted_mod.graph.findNode("aten::conv2d", True) + self.assertTrue(conv_node is not None) + bias_input = conv_node.namedInput("bias") + self.assertTrue(bias_input is not None) + self.assertTrue(bias_input.type().dtype() == torch.half) + + x = torch.rand((3, 3, 32, 32), dtype=torch.half).cuda() + + self.assertEqual(mod_eager(x), scripted_mod(x), atol=1e-2, rtol=1e-2) + self.assertEqual(mod_eager(x), scripted_mod(x), atol=1e-2, rtol=1e-2) + def test_conv_add_folding(self): @torch.no_grad() @@ -1760,7 +1790,32 @@ def forward(self, x): # add with different dtype test_conv_fusion(use_bias, nn.Conv2d, False, pytorch_op, False, - add_tensor=torch.rand(1).to(torch.int), expect_success=False) + add_tensor=torch.tensor([2]).to(torch.int), expect_success=True) + + def test_conv_mul_add_bn(self): + class Conv_Mul_Add_Bn(nn.Module): + + def __init__(self, in_channels, out_channels, **kwargs): + super(Conv_Mul_Add_Bn, self).__init__() + self.conv = nn.Conv2d(in_channels, out_channels, **kwargs) + self.bn = nn.BatchNorm2d(out_channels, eps=0.001) + self.tensor1 = torch.tensor(2.2) + self.tensor2 = torch.tensor(2) + + def forward(self, x): + return self.bn(torch.add(torch.mul(self.conv(x), self.tensor1), self.tensor2)) + + input = torch.randn(8, 3, 64, 64) + model = Conv_Mul_Add_Bn(3, 32, kernel_size=3, stride=1).eval() + + with torch.no_grad(): + result = model(input) + traced_model = torch.jit.trace(model, input).eval() + traced_model = torch.jit.freeze(traced_model) + tresult = traced_model(input) + self.assertEqual(result, tresult) + FileCheck().check("conv").check_not("aten::batch_norm").run(traced_model.graph) + FileCheck().check("conv").check_not("aten::add").run(traced_model.graph) @unittest.skipIf(not TEST_CUDA, "Optimization currently only run for GPU") def test_linear_concat(self): diff --git a/test/jit/test_if_hoisting.py b/test/jit/test_if_hoisting.py deleted file mode 100644 index 939ceda3c56c..000000000000 --- a/test/jit/test_if_hoisting.py +++ /dev/null @@ -1,214 +0,0 @@ -# Owner(s): ["oncall: jit"] - -import torch -from torch.testing import FileCheck -from torch.testing._internal.jit_utils import JitTestCase - -if __name__ == "__main__": - raise RuntimeError( - "This test file is not meant to be run directly, use:\n\n" - "\tpython test/test_jit.py TESTNAME\n\n" - "instead." 
- ) - - -class TestIfHoisting(JitTestCase): - def test_if_hoist_basic(self): - def fn(x: bool, y: int): - if x: - z = y + 3 - else: - z = y + 3 - return z - - - fn_script = torch.jit.script(fn) - op_graph = fn_script.graph - self.run_pass("common_expression_hoisting", op_graph) - self.run_pass("dce", op_graph) - FileCheck().check_count("prim::If", 0, exactly=True).run(op_graph) - FileCheck().check_count("aten::add", 1, exactly=True).run(op_graph) - self.assertEqual(fn(True, 1), fn_script(True, 1)) - - def test_if_hoist_transposed_expr(self): - """ - Making sure that we can properly eliminate - an expression even if it is not at the start - of a block - """ - def fn(x: bool, y: int): - if x: - a = y + 3 - b = y * 2 - else: - b = y * 2 - a = y + 3 - return a, b - - fn_script = torch.jit.script(fn) - op_graph = fn_script.graph - self.run_pass("common_expression_hoisting", op_graph) - self.run_pass("dce", op_graph) - - FileCheck().check_count("prim::If", 0, exactly=True).run(op_graph) - FileCheck().check_count("aten::add", 1, exactly=True).run(op_graph) - - self.assertEqual(fn(True, 1), fn_script(True, 1)) - self.assertEqual(fn(False, 5), fn_script(False, 5)) - - def test_if_hoist_swapped_expr(self): - """ - Making sure that the if statement - doesn't get fully eliminated here - """ - def fn(x: bool, y: int): - if x: - a = y + 3 - b = y * 2 - else: - a = y * 2 - b = y + 3 - return a, b - - fn_script = torch.jit.script(fn) - op_graph = fn_script.graph - self.run_pass("common_expression_hoisting", op_graph) - self.run_pass("dce", op_graph) - - FileCheck().check_count("prim::If", 1, exactly=True).run(op_graph) - FileCheck().check_count("aten::add", 1, exactly=True).run(op_graph) - - self.assertEqual(fn(True, 1), fn_script(True, 1)) - self.assertEqual(fn(False, 5), fn_script(False, 5)) - - def test_if_hoist_reused_var(self): - """ - Making sure that cases where the python variable is reused - is handled correctly - """ - def fn(x: bool, y: int): - b = 6 - if x: - a = y + 3 - a = y * 2 - else: - a = y * 2 - b = y + 3 - return a, b - - fn_script = torch.jit.script(fn) - op_graph = fn_script.graph - self.run_pass("common_expression_hoisting", op_graph) - self.run_pass("dce", op_graph) - - FileCheck().check_count("prim::If", 1, exactly=True).run(op_graph) - FileCheck().check_count("aten::add", 1, exactly=True).run(op_graph) - FileCheck().check_count("aten::mul", 1, exactly=True).run(op_graph) - - self.assertEqual(fn(True, 1), fn_script(True, 1)) - self.assertEqual(fn(False, 5), fn_script(False, 5)) - - def test_no_hoist(self): - """ - Nothing should happen here, expressions are different - """ - def fn(x: bool, y: int, z: int): - if x: - a = y + 3 - else: - a = z + 3 - return a - - fn_script = torch.jit.script(fn) - op_graph = fn_script.graph - self.run_pass("common_expression_hoisting", op_graph) - self.run_pass("dce", op_graph) - - FileCheck().check_count("prim::If", 1, exactly=True).run(op_graph) - FileCheck().check_count("aten::add", 2, exactly=True).run(op_graph) - - self.assertEqual(fn(True, 1, 3), fn_script(True, 1, 3)) - self.assertEqual(fn(False, 5, 10), fn_script(False, 5, 10)) - - def test_mutate_before(self): - """ - Make sure that if there is a mutation before the common - op, the hoist doesn't happen - """ - def fn(x: bool, y: torch.Tensor): - if x: - y.add_(8) - a = y + 3 - else: - a = y + 3 - return a - - fn_script = torch.jit.script(fn) - op_graph = fn_script.graph - self.run_pass("common_expression_hoisting", op_graph) - self.run_pass("dce", op_graph) - - 
FileCheck().check_count("prim::If", 1, exactly=True).run(op_graph) - FileCheck().check_count("aten::add", 2, exactly=True).run(op_graph) - FileCheck().check_count("aten::add_", 1, exactly=True).run(op_graph) - - t1 = torch.Tensor([1]) - t2 = torch.Tensor([5, 6]) - self.assertEqual(fn(True, t1), fn_script(True, t1)) - self.assertEqual(fn(False, t2), fn_script(False, t2)) - - def test_mutate_after(self): - """ - Check that the hoist can happen properly, and - that the output is still correct. - """ - def fn(x: bool, y: torch.Tensor): - if x: - b = 1 - a = y + 3 - y.add_(8) - else: - b = 2 - a = y + 3 - c = b + a - return a - - fn_script = torch.jit.script(fn) - op_graph = fn_script.graph - self.run_pass("common_expression_hoisting", op_graph) - self.run_pass("dce", op_graph) - - FileCheck().check_count("prim::If", 1, exactly=True).run(op_graph) - FileCheck().check_count("aten::add", 2, exactly=True).run(op_graph) - - t1 = torch.Tensor([1]) - t2 = torch.Tensor([5, 6]) - self.assertEqual(fn(True, t1.clone()), fn_script(True, t1.clone())) - self.assertEqual(fn(False, t2.clone()), fn_script(False, t2.clone())) - - def test_multiple_hoists(self): - """ - test that hoists that depend on other hoists are done correctly - """ - def fn(x: bool, y: torch.Tensor): - if x: - a = y + 3 - b = a + y - else: - a = y + 3 - b = a + y - c = b * 2 - return c - - fn_script = torch.jit.script(fn) - op_graph = fn_script.graph - self.run_pass("common_expression_hoisting", op_graph) - self.run_pass("dce", op_graph) - - FileCheck().check_count("prim::If", 0, exactly=True).run(op_graph) - FileCheck().check_count("aten::add", 2, exactly=True).run(op_graph) - - t1 = torch.Tensor([1]) - t2 = torch.Tensor([5, 6]) - self.assertEqual(fn(True, t1), fn_script(True, t1)) - self.assertEqual(fn(False, t2), fn_script(False, t2)) diff --git a/test/jit/test_misc.py b/test/jit/test_misc.py index bf3c3c3e71c1..4d10ad37aa65 100644 --- a/test/jit/test_misc.py +++ b/test/jit/test_misc.py @@ -12,6 +12,7 @@ import torch import torch.testing._internal.jit_utils import torch.nn as nn +from torch.testing._internal.common_utils import freeze_rng_state # Make the helper files in test/ importable pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) @@ -171,6 +172,22 @@ def if_function(inp: torch.Tensor) -> Any: self.checkScript(if_function, (torch.randn(5),)) + def test_hacked_twin(self): + + def gen_data(): + with freeze_rng_state(): + return torch.randn(10), torch.randint(10, (20,)), torch.randn(20) + + input, index, value, = gen_data() + input1, index1, value1, = gen_data() + out1 = torch.ops.aten.index_put.hacked_twin(input, [index], value, accumulate=False) + out2 = torch.index_put(input1, [index1], value1, accumulate=False) + self.assertEqual(out1, out2) + + torch.ops.aten.index_put_.hacked_twin(input, [index], value, accumulate=False) + torch.index_put_(input1, [index1], value1, accumulate=False) + self.assertEqual(input, input1) + def test_export_opnames_interface(self): @torch.jit.interface @@ -228,6 +245,91 @@ def use_module_interface(mod_list: List[OneTwoModule], x: torch.Tensor): self.assertTrue(set(['aten::add.Tensor', 'aten::mul.Scalar']).issubset( set(torch.jit.export_opnames(scripted_M_mod)))) + def test_math_inf(self): + from math import inf + + def foo(): + return inf + + self.checkScript(foo, ()) + + def test_list_literal_infer(self): + def expects_intlist(x: List[int]): + x.append(3) + return x + + def foo(): + return expects_intlist([]) + + self.checkScript(foo, ()) + + def annotated_list_fail(): 
+ return expects_intlist(torch.jit.annotate([], List[Tensor])) + + with self.assertRaises(RuntimeError): + torch.jit.script(annotated_list_fail) + + def non_temporary_fail(): + a = [] + return expects_intlist(a) + + with self.assertRaises(RuntimeError): + torch.jit.script(non_temporary_fail) + + + @torch.jit.script + def test_return(): + return [] + + FileCheck().check("Tensor[] = prim::ListConstruct").run(test_return.graph) + + def test_legacy_tensor_constructor(self): + # testing PyObject overload + def test_all_dtypes(): + return ( + torch.BoolTensor([2]), + torch.LongTensor([3]), + torch.ByteTensor([4]), + torch.CharTensor([5]), + torch.DoubleTensor([6]), + torch.FloatTensor([7]), + torch.IntTensor([8]), + torch.ShortTensor([1]), + torch.HalfTensor([1]), + ) + + self.checkScript(test_all_dtypes, ()) + + # now test empty overload + def empty_overload(): + return torch.LongTensor(2, 3, 4) + + eager = empty_overload() + jit = torch.jit.script(empty_overload)() + eager[:] = 1 + jit[:] = 1 + self.assertEqual(eager, jit) + + def no_inputs(): + return torch.DoubleTensor() + + self.checkScript(no_inputs, ()) + + # bad schema + def multiple_args(): + return torch.LongTensor(1, [2]) + + with self.assertRaisesRegex(RuntimeError, "multiple positional arguments that were not all integers"): + torch.jit.script(multiple_args) + + # kwarg bad schema + def bad_kwarg(): + return torch.LongTensor(hello="1") + + with self.assertRaisesRegex(RuntimeError, "hello"): + torch.jit.script(bad_kwarg) + + def test_broadcasting_list(self): """ Test BroadcastingList and torch.nn._size_N_t alias @@ -243,3 +345,38 @@ def sum_f(x: BroadcastingList2[float]) -> float: self.assertTrue(torch.jit.script(sum_i)(4) == 8) self.assertTrue(torch.jit.script(sum_f)(4.5) == 9.) + + def test_parse_ir_annotate(self): + ir = """ + graph(): + %3 : int[] = prim::Constant[value=annotate(List[int], [])]() + return (%3) + """ + graph = torch._C.parse_ir(ir, True) + func = torch._C._create_function_from_graph("forward", graph) + ret = func() + self.assertTrue(ret == []) + + def test_parse_ir_single_element_tensor_positive(self): + ir = """ + graph(): + %7 : Long(1, strides=[1], requires_grad=0, device=cpu) = prim::Constant[value={0}]() + return (%7) + """ + graph = torch._C.parse_ir(ir, True) + func = torch._C._create_function_from_graph("forward", graph) + ret = func() + self.assertTrue(ret.numel() == 1) + self.assertTrue(len(ret.size()) == 1) + + def test_parse_ir_single_element_tensor_negative(self): + ir = """ + graph(): + %7 : Long(1, strides=[1], requires_grad=0, device=cpu) = prim::Constant[value={-17}]() + return (%7) + """ + graph = torch._C.parse_ir(ir, True) + func = torch._C._create_function_from_graph("forward", graph) + ret = func() + self.assertTrue(ret.numel() == 1) + self.assertTrue(len(ret.size()) == 1) diff --git a/test/jit/test_models.py b/test/jit/test_models.py index 8cab53168236..2f67e27cb1d7 100644 --- a/test/jit/test_models.py +++ b/test/jit/test_models.py @@ -41,7 +41,7 @@ def __init__(self): def forward(self, x): x = F.relu(F.max_pool2d(self.conv1(x), 2)) x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) - x = x.view(-1, 320) + x = x.reshape(-1, 320) x = F.relu(self.fc1(x)) x = F.dropout(x, training=self.training) x = self.fc2(x) diff --git a/test/jit/test_module_containers.py b/test/jit/test_module_containers.py index a4892aa6ea3a..f253c2453b3b 100644 --- a/test/jit/test_module_containers.py +++ b/test/jit/test_module_containers.py @@ -663,3 +663,43 @@ def forward(self, x): # Check that ignored method 
is still intact. self.assertEqual(inp, n.ignored_method(inp)) + + def test_parameterlist_script_getitem(self): + class MyModule(nn.Module): + def __init__(self): + super().__init__() + self.module_list = nn.ModuleList([nn.Linear(1, 1) for _ in range(10)]) + self.parameter_list = nn.ParameterList([nn.Parameter(torch.zeros(1)) for _ in range(10)]) + + def forward(self, x): + self.module_list[0] + self.parameter_list[0] + return x + + self.checkModule(MyModule(), (torch.zeros(1))) + + def test_parameterlist_script_iter(self): + class MyModule(nn.Module): + def __init__(self): + super().__init__() + self.module_list = nn.ModuleList([nn.Linear(1, 1) for _ in range(10)]) + self.parameter_list = nn.ParameterList([nn.Parameter(torch.zeros(1)) for _ in range(10)]) + + def forward(self, x): + r = x + for i, p in enumerate(self.parameter_list): + r = r + p + i + return r + + self.checkModule(MyModule(), (torch.zeros(1),)) + + def test_parameterdict_script_getitem(self): + class MyModule(nn.Module): + def __init__(self): + super().__init__() + self.parameter_dict = nn.ParameterDict({k: nn.Parameter(torch.zeros(1)) for k in ['a', 'b', 'c']}) + + def forward(self, x): + return self.parameter_dict['a'] * x + self.parameter_dict['b'] * self.parameter_dict['c'] + + self.checkModule(MyModule(), (torch.ones(1),)) diff --git a/test/jit/test_op_decompositions.py b/test/jit/test_op_decompositions.py new file mode 100644 index 000000000000..6b4569cd6e39 --- /dev/null +++ b/test/jit/test_op_decompositions.py @@ -0,0 +1,38 @@ +# Owner(s): ["oncall: jit"] + +import torch +from torch.testing import FileCheck +from torch.testing._internal.jit_utils import JitTestCase + +if __name__ == '__main__': + raise RuntimeError("This test file is not meant to be run directly, use:\n\n" + "\tpython test/test_jit.py TESTNAME\n\n" + "instead.") + +class TestOpDecompositions(JitTestCase): + def test_op_decomposition(self): + def foo(x): + return torch.var(x, unbiased=True) + + # TODO: more robust testing + foo_s = torch.jit.script(foo) + FileCheck().check("aten::var").run(foo_s.graph) + torch._C._jit_pass_run_decompositions(foo_s.graph) + inp = torch.rand([10, 10]) + self.assertEqual(foo(inp), foo_s(inp)) + FileCheck().check_not("aten::var").run(foo_s.graph) + + def test_registered_decomposition(self): + @torch.jit.script + def foo(x): + return torch.square(x) + + @torch.jit.script + def square_decomp(x): + return torch.pow(x, 2) + + torch.jit._register_decomposition(torch.ops.aten.square.default, square_decomp.graph) + torch._C._jit_pass_run_decompositions(foo.graph) + FileCheck().check_not("aten::square").check("aten::pow").run(foo.graph) + x = torch.rand([4]) + self.assertEqual(foo(x), torch.square(x)) diff --git a/test/jit/test_optimize_for_mobile_preserve_debug_info.py b/test/jit/test_optimize_for_mobile_preserve_debug_info.py index b0a14f56d41f..a6527a3ffdff 100644 --- a/test/jit/test_optimize_for_mobile_preserve_debug_info.py +++ b/test/jit/test_optimize_for_mobile_preserve_debug_info.py @@ -2,9 +2,9 @@ import torch import torch._C -import torch.backends.xnnpack import torch.nn.functional as F from torch.testing._internal.jit_utils import JitTestCase +from torch.testing._internal.common_utils import skipIfNoXNNPACK class TestOptimizeForMobilePreserveDebugInfo(JitTestCase): def check_replacement( @@ -36,6 +36,7 @@ def check_replacement( original_source_ranges[replacements[node.kind()]], ) + @skipIfNoXNNPACK def test_replace_conv1d_with_conv2d(self): class TestConv1d(torch.nn.Module): def __init__(self, weight, bias): @@ -63,6 
+64,7 @@ def forward(self, x): jit_pass=torch._C._jit_pass_transform_conv1d_to_conv2d, ) + @skipIfNoXNNPACK def test_insert_pre_packed_linear_before_inline_and_conv_2d_op(self): class TestPrepackedLinearBeforeInlineAndConv2dOp(torch.nn.Module): def __init__( @@ -139,6 +141,7 @@ def forward(self, x): jit_pass=torch._C._jit_pass_insert_prepacked_ops, ) + @skipIfNoXNNPACK def test_insert_pre_packed_linear_op(self): self.check_replacement( model=torch.jit.trace(torch.nn.Linear(5, 4), torch.rand(3, 2, 5)), @@ -230,6 +233,7 @@ def forward(self, x): jit_pass=torch._C._jit_pass_fuse_clamp_w_prepacked_linear_conv, ) + @skipIfNoXNNPACK def test_fuse_activation_with_pack_ops_linear_conv2d_1(self): self.run_test_fuse_activation_with_pack_ops_linear_conv2d( linear_activation=F.hardtanh, @@ -238,6 +242,7 @@ def test_fuse_activation_with_pack_ops_linear_conv2d_1(self): conv2d_activation_kind="aten::hardtanh_" ) + @skipIfNoXNNPACK def test_fuse_activation_with_pack_ops_linear_conv2d_2(self): self.run_test_fuse_activation_with_pack_ops_linear_conv2d( linear_activation=F.hardtanh_, @@ -246,6 +251,7 @@ def test_fuse_activation_with_pack_ops_linear_conv2d_2(self): conv2d_activation_kind="aten::hardtanh" ) + @skipIfNoXNNPACK def test_fuse_activation_with_pack_ops_linear_conv2d_3(self): self.run_test_fuse_activation_with_pack_ops_linear_conv2d( linear_activation=F.relu, @@ -254,6 +260,7 @@ def test_fuse_activation_with_pack_ops_linear_conv2d_3(self): conv2d_activation_kind="aten::relu_" ) + @skipIfNoXNNPACK def test_fuse_activation_with_pack_ops_linear_conv2d_4(self): self.run_test_fuse_activation_with_pack_ops_linear_conv2d( linear_activation=F.relu_, diff --git a/test/jit/test_profiler.py b/test/jit/test_profiler.py index 74f85dc22deb..81df055f55b7 100644 --- a/test/jit/test_profiler.py +++ b/test/jit/test_profiler.py @@ -18,7 +18,7 @@ class TestProfiler(JitTestCase): def setUp(self): self.prev_exec = torch._C._jit_set_profiling_executor(True) - self.prev_profiling = torch._C._jit_set_profiling_mode(True) + self.prev_profiling = torch._C._get_graph_executor_optimize(True) self.inline_autodiff = torch._C._debug_set_autodiff_subgraph_inlining(False) self.texpr_fuser_state = torch._C._jit_texpr_fuser_enabled() self.can_fuse_on_cpu = torch._C._jit_can_fuse_on_cpu() @@ -34,7 +34,7 @@ def setUp(self): def tearDown(self): torch._C._jit_set_profiling_executor(self.prev_exec) - torch._C._jit_set_profiling_mode(self.prev_profiling) + torch._C._get_graph_executor_optimize(self.prev_profiling) torch._C._debug_set_autodiff_subgraph_inlining(self.inline_autodiff) torch._C._jit_set_texpr_fuser_enabled(self.texpr_fuser_state) torch._C._jit_override_can_fuse_on_cpu(self.can_fuse_on_cpu) @@ -232,6 +232,24 @@ def foo(a, b): g = torch.jit.last_executed_optimized_graph() FileCheck().check_count("aten::add", 2, exactly=True).run(g) + def test_local_fusion_strategy(self): + @torch.jit.script + def foo(x): + return x + x + x + + torch.jit.set_fusion_strategy([("STATIC", 1)]) + for _ in range(3): + foo(torch.rand([10])) + + torch.jit.set_fusion_strategy([("STATIC", 10)]) + + for i in range(10): + foo(torch.rand([i])) + foo(torch.rand([i])) + + g = torch.jit.last_executed_optimized_graph() + FileCheck().check_count(":TensorExprGroup", 2, exactly=True).run(g) + def test_iterative_fusion(self): @torch.jit.script def foo(a, b, c, d): diff --git a/test/jit/test_python_bindings.py b/test/jit/test_python_bindings.py index 2f086feaa904..37c2ef7f85af 100644 --- a/test/jit/test_python_bindings.py +++ b/test/jit/test_python_bindings.py @@ -1,6 
+1,7 @@ # Owner(s): ["oncall: jit"] import torch +from torch.testing import FileCheck from torch.testing._internal.jit_utils import JitTestCase if __name__ == "__main__": @@ -82,3 +83,28 @@ def test_graph_create(self): gr = torch._C.Graph() with self.assertRaises(ValueError): gr.create("prim::Constant", [None]) + + def test_canonicalize(self): + ir = """ +graph(%p207 : Tensor, + %1 : Tensor, + %p407 : int): + %11 : Tensor = aten::view_expand_placeholder(%1) + %12 : Tensor = aten::pointwise_placeholder(%11, %p207, %p407) + %13 : Tensor = aten::view_expand_placeholder(%12) + %14 : Tensor = aten::pointwise_placeholder(%13) + return (%14) + """ + + graph1 = torch._C.parse_ir(ir) + graph1 = torch._C._jit_pass_canonicalize(graph1, True) + + graph2 = torch._C.parse_ir(ir) + graph2 = torch._C._jit_pass_canonicalize(graph2) + + self.assertEqual(str(graph1), str(graph2)) + FileCheck().check("%p207").check_not("%14").run(graph1) + + graph3 = torch._C.parse_ir(ir) + graph3 = torch._C._jit_pass_canonicalize(graph3, False) + FileCheck().check_not("%p207").run(graph3) diff --git a/test/jit/test_remove_mutation.py b/test/jit/test_remove_mutation.py index 0544a039286e..4c393a7f1a0f 100644 --- a/test/jit/test_remove_mutation.py +++ b/test/jit/test_remove_mutation.py @@ -146,16 +146,16 @@ def test_successful(): # full_like is not implemented for a tensor fill value - def test_unsuccessful(): + def test_successful(): x = torch.tensor([2, 2]) y = torch.tensor([2, 4]) x.fill_(y) return x + x - fn = torch.jit.script(test_unsuccessful) + fn = torch.jit.script(test_successful) graph = fn.graph self.run_pass('remove_mutation', graph) - FileCheck().check('aten::fill_').run(graph) + FileCheck().check_not('aten::fill_').run(graph) def normal(): return torch.rand(2, 1, 3, 4).normal_() diff --git a/test/jit/test_save_load.py b/test/jit/test_save_load.py index fbc1443024cb..47cbc0fd9b3a 100644 --- a/test/jit/test_save_load.py +++ b/test/jit/test_save_load.py @@ -1,20 +1,22 @@ # Owner(s): ["oncall: jit"] -from typing import NamedTuple, Optional import io import os import pathlib import sys +import unittest +from typing import NamedTuple, Optional +import torch from torch import Tensor from torch.testing._internal.common_utils import TemporaryFileName -import torch # Make the helper files in test/ importable pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) sys.path.append(pytorch_test_dir) -from torch.testing._internal.jit_utils import (JitTestCase, - clear_class_registry) +from torch.testing._internal.jit_utils import JitTestCase, clear_class_registry + +ENABLE_FLATBUFFER = os.environ.get("ENABLE_FLATBUFFER", "0") == "1" if __name__ == "__main__": raise RuntimeError( @@ -23,12 +25,14 @@ "instead." ) + class TestSaveLoad(JitTestCase): def test_different_modules(self): """ Exercise the situation where we have the same qualified name in two different CompilationUnits on save/load. """ + class Foo(torch.nn.Module): def __init__(self): super(Foo, self).__init__() @@ -64,7 +68,8 @@ def forward(self, x): clear_class_registry() self.assertEqual( - first_script_module._c.qualified_name, second_script_module._c.qualified_name + first_script_module._c.qualified_name, + second_script_module._c.qualified_name, ) class ContainsBoth(torch.nn.Module): @@ -89,6 +94,7 @@ def test_different_functions(self): Exercise the situation where we have the same qualified name in two different CompilationUnits on save/load. 
""" + def lol(x): return x @@ -118,7 +124,8 @@ def forward(self, x): clear_class_registry() self.assertEqual( - first_script_module._c.qualified_name, second_script_module._c.qualified_name + first_script_module._c.qualified_name, + second_script_module._c.qualified_name, ) class ContainsBoth(torch.nn.Module): @@ -143,6 +150,7 @@ def test_different_interfaces(self): Exercise the situation where we have the same qualified name in two different CompilationUnits on save/load. """ + @torch.jit.interface class MyInterface(object): def bar(self, x: Tensor) -> Tensor: @@ -204,7 +212,8 @@ def forward(self, x): clear_class_registry() self.assertEqual( - first_script_module._c.qualified_name, second_script_module._c.qualified_name + first_script_module._c.qualified_name, + second_script_module._c.qualified_name, ) class ContainsBoth(torch.nn.Module): @@ -261,7 +270,6 @@ def forward(self, x): return x, MyCoolNamedTuple(a=5) - first_script_module = torch.jit.script(Foo()) first_saved_module = io.BytesIO() torch.jit.save(first_script_module, first_saved_module) @@ -310,7 +318,8 @@ def forward(self, x): clear_class_registry() self.assertEqual( - first_script_module._c.qualified_name, second_script_module._c.qualified_name + first_script_module._c.qualified_name, + second_script_module._c.qualified_name, ) class ContainsBoth(torch.nn.Module): @@ -340,44 +349,44 @@ def forward(self, a): value = b"bar\x00\xffbaz" expected_extra_files = {} - expected_extra_files['foo'] = value + expected_extra_files["foo"] = value # verify that str to bytes conversion also works - expected_extra_files['foo2'] = "bar" + expected_extra_files["foo2"] = "bar" m = MyMod() # Save to file. with TemporaryFileName() as fname: m.save(fname, _extra_files=expected_extra_files) # values don't matter - extra_files = {'foo': '', 'foo2': None} + extra_files = {"foo": "", "foo2": None} torch.jit.load(fname, _extra_files=extra_files) - self.assertEqual(value, extra_files['foo']) + self.assertEqual(value, extra_files["foo"]) # results come back always as bytes - self.assertEqual(b"bar", extra_files['foo2']) + self.assertEqual(b"bar", extra_files["foo2"]) # Use torch.jit API torch.jit.save(m, fname, _extra_files=expected_extra_files) - extra_files['foo'] = '' + extra_files["foo"] = "" torch.jit.load(fname, _extra_files=extra_files) - self.assertEqual(value, extra_files['foo']) + self.assertEqual(value, extra_files["foo"]) # Save to buffer. 
buffer = io.BytesIO(m.save_to_buffer(_extra_files=expected_extra_files)) - extra_files = {'foo': ''} + extra_files = {"foo": ""} torch.jit.load(buffer, _extra_files=extra_files) - self.assertEqual(value, extra_files['foo']) + self.assertEqual(value, extra_files["foo"]) # Use torch.jit API buffer = io.BytesIO() torch.jit.save(m, buffer, _extra_files=expected_extra_files) buffer.seek(0) - extra_files = {'foo': ''} + extra_files = {"foo": ""} torch.jit.load(buffer, _extra_files=extra_files) - self.assertEqual(value, extra_files['foo']) + self.assertEqual(value, extra_files["foo"]) # Non-existent file 'bar' with self.assertRaises(RuntimeError): - extra_files['bar'] = '' + extra_files["bar"] = "" torch.jit.load(buffer, _extra_files=extra_files) def test_save_load_using_pathlib(self): @@ -394,7 +403,7 @@ def forward(self, a): m.save(path) m2 = torch.jit.load(path) - x = torch.tensor([1., 2., 3., 4.]) + x = torch.tensor([1.0, 2.0, 3.0, 4.0]) self.assertTrue(torch.equal(m(x), m2(x))) def test_save_nonexit_file(self): @@ -455,7 +464,9 @@ class TestModule(torch.nn.Module): def __init__(self): super().__init__() self.add_module("submodule_a", Submodule()) - self.register_parameter("parameter_a", torch.nn.Parameter(torch.randn(4))) + self.register_parameter( + "parameter_a", torch.nn.Parameter(torch.randn(4)) + ) self.register_buffer("buffer", torch.randn(4)) self.t = torch.rand(4) # not buffer @@ -466,7 +477,9 @@ def __init__(self): m_loaded = self.getExportImportCopy(torch.jit.script(m)) # Check submodules. - self.assertEqual(len(list(m.named_modules())), len(list(m_loaded.named_modules()))) + self.assertEqual( + len(list(m.named_modules())), len(list(m_loaded.named_modules())) + ) for m_s, loaded_s in zip(m.named_modules(), m_loaded.named_modules()): m_name, _ = m_s loaded_name, _ = loaded_s @@ -478,7 +491,504 @@ def __init__(self): self.assertEqual(m_p, loaded_p) # Check buffers. - self.assertEqual(len(list(m.named_buffers())), len(list(m_loaded.named_buffers()))) + self.assertEqual( + len(list(m.named_buffers())), len(list(m_loaded.named_buffers())) + ) + for m_b, loaded_b in zip(m.named_buffers(), m_loaded.named_buffers()): + m_name, m_buffer = m_b + loaded_name, loaded_buffer = loaded_b + self.assertEqual(m_name, loaded_name) + self.assertEqual(m_buffer, loaded_buffer) + + def test_save_load_meta_tensors(self): + """ + Check that parameters, buffers, and submodules are the same after loading + for a module with parameters and buffers that are meta tensors + """ + + class Foo(torch.nn.Module): + def __init__(self): + super(Foo, self).__init__() + self.foo = torch.nn.Linear(2, 3, device="meta") + self.bar = torch.nn.Linear(3, 4) + self.register_buffer("buffer", torch.randn(4, device="meta")) + + def forward(self, x): + x = self.foo(x) + x = self.bar(x) + return x + + m = Foo() + m_loaded = self.getExportImportCopy(torch.jit.script(m)) + # Check submodules. + self.assertEqual( + len(list(m.named_modules())), len(list(m_loaded.named_modules())) + ) + self.assertEqual( + set(name for name, _ in m.named_modules()), + set(name for name, _ in m_loaded.named_modules()), + ) + # Check parameters. + m_params = dict(m.named_parameters()) + m_loaded_params = dict(m_loaded.named_parameters()) + self.assertEqual(len(m_params), len(m_loaded_params)) + self.assertEqual(m_params, m_loaded_params) + # Check buffers. 
+ m_buffers = dict(m.named_buffers()) + m_loaded_buffers = dict(m_loaded.named_buffers()) + self.assertEqual(len(m_buffers), len(m_loaded_buffers)) + self.assertEqual(m_buffers, m_loaded_buffers) + # Check params and buffers that are/are not meta tensors + self.assertTrue(m_params["foo.weight"].is_meta) + self.assertTrue(m_loaded_params["foo.weight"].is_meta) + self.assertTrue(m_params["foo.bias"].is_meta) + self.assertTrue(m_loaded_params["foo.bias"].is_meta) + self.assertFalse(m_params["bar.weight"].is_meta) + self.assertFalse(m_loaded_params["bar.weight"].is_meta) + self.assertFalse(m_params["bar.bias"].is_meta) + self.assertFalse(m_loaded_params["bar.bias"].is_meta) + self.assertTrue(m_buffers["buffer"].is_meta) + self.assertTrue(m_loaded_buffers["buffer"].is_meta) + + +def script_module_to_buffer(script_module): + module_buffer = io.BytesIO( + script_module._save_to_buffer_for_lite_interpreter(_use_flatbuffer=True) + ) + module_buffer.seek(0) + return module_buffer + + +@unittest.skipIf( + not ENABLE_FLATBUFFER, "Need to enable flatbuffer to run the below tests" +) +class TestSaveLoadFlatbuffer(JitTestCase): + def test_different_modules(self): + """ + Exercise the situation where we have the same qualified name + in two different CompilationUnits on save/load. + """ + + class Foo(torch.nn.Module): + def __init__(self): + super(Foo, self).__init__() + self.foo = torch.nn.Linear(2, 2) + self.bar = torch.nn.Linear(2, 2) + + def forward(self, x): + x = self.foo(x) + x = self.bar(x) + return x + + first_script_module = torch.jit.script(Foo()) + first_saved_module = script_module_to_buffer(first_script_module) + + clear_class_registry() + + class Foo(torch.nn.Module): + def __init__(self): + super(Foo, self).__init__() + self.foo = torch.nn.Linear(2, 2) + + def forward(self, x): + x = self.foo(x) + return x + + second_script_module = torch.jit.script(Foo()) + second_saved_module = script_module_to_buffer(second_script_module) + + clear_class_registry() + + self.assertEqual( + first_script_module._c.qualified_name, + second_script_module._c.qualified_name, + ) + + class ContainsBoth(torch.nn.Module): + def __init__(self): + super().__init__() + self.add_module( + "second", torch.jit.load(second_saved_module) + ) + self.add_module( + "first", torch.jit.load(first_saved_module) + ) + + def forward(self, x): + x = self.first(x) + x = self.second(x) + return x + + sm = torch.jit.script(ContainsBoth()) + contains_both = script_module_to_buffer(sm) + sm = torch.jit.load(contains_both) + + def test_different_functions(self): + """ + Exercise the situation where we have the same qualified name + in two different CompilationUnits on save/load. 
+ """ + + def lol(x): + return x + + class Foo(torch.nn.Module): + def forward(self, x): + return lol(x) + + first_script_module = torch.jit.script(Foo()) + first_saved_module = script_module_to_buffer(first_script_module) + clear_class_registry() + + def lol(x): # noqa: F811 + return "hello" + + class Foo(torch.nn.Module): + def forward(self, x): + return lol(x) + + second_script_module = torch.jit.script(Foo()) + second_saved_module = script_module_to_buffer(second_script_module) + + clear_class_registry() + + self.assertEqual( + first_script_module._c.qualified_name, + second_script_module._c.qualified_name, + ) + + class ContainsBoth(torch.nn.Module): + def __init__(self): + super().__init__() + self.add_module( + "second", torch.jit.load(second_saved_module) + ) + self.add_module( + "first", torch.jit.load(first_saved_module) + ) + + def forward(self, x): + x = self.first(x) + x = self.second(x) + return x + + sm = torch.jit.script(ContainsBoth()) + contains_both = script_module_to_buffer(sm) + sm = torch.jit.load(contains_both) + + def test_different_interfaces(self): + """ + Exercise the situation where we have the same qualified name + in two different CompilationUnits on save/load. + """ + + @torch.jit.interface + class MyInterface(object): + def bar(self, x: Tensor) -> Tensor: + pass + + @torch.jit.script + class ImplementInterface(object): + def __init__(self): + pass + + def bar(self, x): + return x + + class Foo(torch.nn.Module): + __annotations__ = {"interface": MyInterface} + + def __init__(self): + super().__init__() + self.interface = ImplementInterface() + + def forward(self, x): + return self.interface.bar(x) + + first_script_module = torch.jit.script(Foo()) + first_saved_module = script_module_to_buffer(first_script_module) + clear_class_registry() + + @torch.jit.interface + class MyInterface(object): + def not_bar(self, x: Tensor) -> Tensor: + pass + + @torch.jit.script # noqa: F811 + class ImplementInterface(object): # noqa: F811 + def __init__(self): + pass + + def not_bar(self, x): + return x + + class Foo(torch.nn.Module): + __annotations__ = {"interface": MyInterface} + + def __init__(self): + super().__init__() + self.interface = ImplementInterface() + + def forward(self, x): + return self.interface.not_bar(x) + + second_script_module = torch.jit.script(Foo()) + second_saved_module = script_module_to_buffer(second_script_module) + + clear_class_registry() + + self.assertEqual( + first_script_module._c.qualified_name, + second_script_module._c.qualified_name, + ) + + class ContainsBoth(torch.nn.Module): + def __init__(self): + super().__init__() + self.add_module( + "second", torch.jit.load(second_saved_module) + ) + self.add_module( + "first", torch.jit.load(first_saved_module) + ) + + def forward(self, x): + x = self.first(x) + x = self.second(x) + return x + + sm = torch.jit.script(ContainsBoth()) + contains_both = script_module_to_buffer(sm) + sm = torch.jit.load(contains_both) + + def test_many_collisions(self): + class MyCoolNamedTuple(NamedTuple): + a: int + + @torch.jit.interface + class MyInterface(object): + def bar(self, x: Tensor) -> Tensor: + pass + + @torch.jit.script + class ImplementInterface(object): + def __init__(self): + pass + + def bar(self, x): + return x + + def lol(x): + return x + + class Foo(torch.nn.Module): + interface: MyInterface + + def __init__(self): + super().__init__() + self.foo = torch.nn.Linear(2, 2) + self.bar = torch.nn.Linear(2, 2) + self.interface = ImplementInterface() + + def forward(self, x): + x = self.foo(x) + x = 
self.bar(x) + x = lol(x) + x = self.interface.bar(x) + + return x, MyCoolNamedTuple(a=5) + + first_script_module = torch.jit.script(Foo()) + first_saved_module = script_module_to_buffer(first_script_module) + + clear_class_registry() + + @torch.jit.interface + class MyInterface(object): + def not_bar(self, x: Tensor) -> Tensor: + pass + + @torch.jit.script # noqa: F811 + class ImplementInterface(object): # noqa: F811 + def __init__(self): + pass + + def not_bar(self, x): + return x + + def lol(x): # noqa: F811 + return "asdofij" + + class MyCoolNamedTuple(NamedTuple): # noqa: F811 + a: str + + class Foo(torch.nn.Module): + interface: MyInterface + + def __init__(self): + super().__init__() + self.foo = torch.nn.Linear(2, 2) + self.interface = ImplementInterface() + + def forward(self, x): + x = self.foo(x) + self.interface.not_bar(x) + x = lol(x) + return x, MyCoolNamedTuple(a="hello") + + second_script_module = torch.jit.script(Foo()) + second_saved_module = script_module_to_buffer(second_script_module) + + clear_class_registry() + + self.assertEqual( + first_script_module._c.qualified_name, + second_script_module._c.qualified_name, + ) + + class ContainsBoth(torch.nn.Module): + def __init__(self): + super().__init__() + self.add_module( + "second", torch.jit.load(second_saved_module) + ) + self.add_module( + "first", torch.jit.load(first_saved_module) + ) + + def forward(self, x): + x, named_tuple_1 = self.first(x) + x, named_tuple_2 = self.second(x) + return len(x + named_tuple_2.a) + named_tuple_1.a + + sm = torch.jit.script(ContainsBoth()) + contains_both = script_module_to_buffer(sm) + sm = torch.jit.load(contains_both) + + def test_save_load_using_pathlib(self): + class MyMod(torch.jit.ScriptModule): + @torch.jit.script_method + def forward(self, a): + return 2 * a + + m = MyMod() + + # Save then load. + with TemporaryFileName() as fname: + path = pathlib.Path(fname) + torch.jit.save_jit_module_to_flatbuffer(m, path) + m2 = torch.jit.load(path) + + x = torch.tensor([1.0, 2.0, 3.0, 4.0]) + self.assertTrue(torch.equal(m(x), m2(x))) + + def test_save_namedtuple_input_only(self): + """ + Even if a NamedTuple is only used as an input argument, saving and + loading should work correctly. + """ + global FooTuple # see [local resolution in python] + + class FooTuple(NamedTuple): + a: int + + class MyModule(torch.nn.Module): + def forward(self, x: FooTuple) -> torch.Tensor: + return torch.tensor(3) + + m_loaded = self.getExportImportCopy(torch.jit.script(MyModule())) + output = m_loaded(FooTuple(a=5)) + self.assertEqual(output, torch.tensor(3)) + + def test_save_namedtuple_output_only(self): + """ + Even if a NamedTuple is only used as an output argument, saving and + loading should work correctly. 
+ """ + global FooTuple # see [local resolution in python] + + class FooTuple(NamedTuple): + a: int + + class MyModule(torch.nn.Module): + def forward(self) -> Optional[FooTuple]: + return None + + m_loaded = self.getExportImportCopy(torch.jit.script(MyModule())) + output = m_loaded() + self.assertEqual(output, None) + + def test_module_info_flatbuffer(self): + class Foo(torch.nn.Module): + def __init__(self): + super(Foo, self).__init__() + self.foo = torch.nn.Linear(2, 2) + self.bar = torch.nn.Linear(2, 2) + + def forward(self, x): + x = self.foo(x) + x = self.bar(x) + return x + + first_script_module = torch.jit.script(Foo()) + first_saved_module = io.BytesIO() + torch.jit.save_jit_module_to_flatbuffer( + first_script_module, first_saved_module) + first_saved_module.seek(0) + expected = { + 'bytecode_version': 4, + 'operator_version': 4, + 'function_names': {'__torch__.___torch_mangle_0.Foo.forward'}, + 'type_names': set(), + 'opname_to_num_args': {'aten::linear': 3}} + self.assertEqual( + torch.jit._serialization.get_flatbuffer_module_info(first_saved_module), + expected) + + + def test_save_load_params_buffers_submodules(self): + """ + Check that parameters, buffers, and submodules are the same after loading. + """ + + class Submodule(torch.nn.Module): + def __init__(self): + super().__init__() + + class TestModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.add_module("submodule_a", Submodule()) + self.register_parameter( + "parameter_a", torch.nn.Parameter(torch.randn(4)) + ) + self.register_buffer("buffer", torch.randn(4)) + self.t = torch.rand(4) # not buffer + + self.parameter_b = torch.nn.Parameter(torch.randn(4)) + self.submodule_b = Submodule() + + m = TestModule() + m_loaded = self.getExportImportCopy(torch.jit.script(m)) + + # Check submodules. + self.assertEqual( + len(list(m.named_modules())), len(list(m_loaded.named_modules())) + ) + for m_s, loaded_s in zip(m.named_modules(), m_loaded.named_modules()): + m_name, _ = m_s + loaded_name, _ = loaded_s + self.assertEqual(m_name, loaded_name) + + # Check parameters. + self.assertEqual(len(list(m.parameters())), len(list(m_loaded.parameters()))) + for m_p, loaded_p in zip(m.parameters(), m_loaded.parameters()): + self.assertEqual(m_p, loaded_p) + + # Check buffers. 
+ self.assertEqual( + len(list(m.named_buffers())), len(list(m_loaded.named_buffers())) + ) for m_b, loaded_b in zip(m.named_buffers(), m_loaded.named_buffers()): m_name, m_buffer = m_b loaded_name, loaded_buffer = loaded_b diff --git a/test/jit/test_symbolic_shape_analysis.py b/test/jit/test_symbolic_shape_analysis.py index cd25caa92b2b..e756cdb67889 100644 --- a/test/jit/test_symbolic_shape_analysis.py +++ b/test/jit/test_symbolic_shape_analysis.py @@ -12,6 +12,7 @@ ) from torch.testing._internal.common_utils import make_tensor from torch.testing._internal.jit_utils import JitTestCase, execWrapper +from typing import List, Any if __name__ == '__main__': raise RuntimeError("This test file is not meant to be run directly, use:\n\n" @@ -498,3 +499,37 @@ def test_shape_function_includes(self): m2_shape = [20, 10] res = torch.jit._shapes.matmul(m1_shape, m2_shape) self.assertEqual(res, [10, 10]) + + def test_register_function_error_checking(self): + # this will error before registering on global map, so + # no issue in overwriting schema mappings + @torch.jit.script + def foo(x, y): + return x + y + + node = foo.graph.findNode("aten::add") + + @torch.jit.script + def wrong_input_types(x, y): + x: List[int] = [] + return x + with self.assertRaisesRegex(RuntimeError, "Expected supertype of int"): + torch._C._jit_register_shape_compute_graph_for_node(node, wrong_input_types.graph) + + @torch.jit.script + def wrong_output_types(x: List[int], y: List[int]): + x: List[Tensor] = [] + return x + + with self.assertRaisesRegex(RuntimeError, "but got graph_type"): + torch._C._jit_register_shape_compute_graph_for_node(node, wrong_output_types.graph) + + @torch.jit.script + def too_many_inputs(x: List[int], y: List[int], z: Any, z2: Any): + x: List[int] = [] + return x + + with self.assertRaises(RuntimeError) as error: + torch._C._jit_register_shape_compute_graph_for_node(node, too_many_inputs.graph) + + self.assertTrue("fewer arguments than schema" in str(error.exception)) diff --git a/test/jit/test_tensor_methods.py b/test/jit/test_tensor_methods.py new file mode 100644 index 000000000000..c761a3884c92 --- /dev/null +++ b/test/jit/test_tensor_methods.py @@ -0,0 +1,39 @@ +# Owner(s): ["oncall: jit"] + +import os +import sys + +import torch + +# Make the helper files in test/ importable +pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) +sys.path.append(pytorch_test_dir) +from torch.testing._internal.jit_utils import JitTestCase +from torch.testing import FileCheck + +if __name__ == "__main__": + raise RuntimeError( + "This test file is not meant to be run directly, use:\n\n" + "\tpython test/test_jit.py TESTNAME\n\n" + "instead." 
+ ) + +class TestTensorMethods(JitTestCase): + def test_getitem(self): + def tensor_getitem(inp: torch.Tensor): + indices = torch.tensor([0, 2], dtype=torch.long) + return inp.__getitem__(indices) + + inp = torch.rand(3, 4) + self.checkScript(tensor_getitem, (inp, )) + + scripted = torch.jit.script(tensor_getitem) + FileCheck().check("aten::index").run(scripted.graph) + + def test_getitem_invalid(self): + def tensor_getitem_invalid(inp: torch.Tensor): + return inp.__getitem__() + + with self.assertRaisesRegexWithHighlight( + RuntimeError, "expected exactly 1 argument", "inp.__getitem__"): + torch.jit.script(tensor_getitem_invalid) diff --git a/test/jit/test_tracer.py b/test/jit/test_tracer.py index 9d6849829240..99d078dd4ad1 100644 --- a/test/jit/test_tracer.py +++ b/test/jit/test_tracer.py @@ -17,7 +17,7 @@ sys.path.append(pytorch_test_dir) from torch.testing._internal.common_utils import suppress_warnings, \ skipIfCompiledWithoutNumpy, enable_profiling_mode_for_profiling_tests, \ - IS_SANDCASTLE, TemporaryFileName + IS_SANDCASTLE, TemporaryFileName, skipIfCrossRef from torch.testing._internal.jit_utils import JitTestCase, enable_cpu_fuser, \ _tmp_donotuse_dont_inline_everything, _trace, RUN_CUDA, \ RUN_CUDA_MULTI_GPU, make_global @@ -377,6 +377,17 @@ def test_trace_size(self): def test_trace_size_with_grad(self): self.do_trace_size(True) + def test_trace_numel(self): + def fn(x): + return x.numel() + + x = torch.randn(2, 3, 4) + y = torch.randn(4, 5, 6) + + traced_fn = torch.jit.trace(fn, x) + self.assertEqual(traced_fn(y), fn(y)) + self.assertEqual(traced_fn(x), fn(x)) + def do_trace_arange(self, requires_grad): def arange(x): return torch.arange(x.shape[0]) @@ -500,6 +511,7 @@ def to_tensor(x, y): self.assertEqual(to_tensor_trace(x, y), to_tensor(x, y)) @skipIfCompiledWithoutNumpy + @skipIfCrossRef def test_trace_warn(self): def fn(x): int(x) # Warning 1. 
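The shape-function registration checks in test_symbolic_shape_analysis.py above only exercise the failure paths of `torch._C._jit_register_shape_compute_graph_for_node`. For orientation, a well-formed registration might look like the sketch below. This is an illustration inferred from the error messages exercised above (Tensor arguments arrive in the shape graph as `List[int]` shapes, and the graph may declare fewer arguments than the operator schema); it is not code from this PR, and `add_shape` is a simplified stand-in that ignores broadcasting.

```python
import torch
from typing import List

@torch.jit.script
def foo(x, y):
    return x + y

# Grab the aten::add node whose shape function we want to override.
node = foo.graph.findNode("aten::add")

@torch.jit.script
def add_shape(self: List[int], other: List[int]) -> List[int]:
    # Simplified: assumes both inputs already have the same shape.
    return self

# Registering overwrites the existing mapping for this schema, so a real test
# would save and restore the original shape function around this call.
torch._C._jit_register_shape_compute_graph_for_node(node, add_shape.graph)
```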
@@ -836,7 +848,7 @@ def forward(self, x): def test_trace_c10_ops(self): try: _ = torch.ops._caffe2.GenerateProposals - except RuntimeError: + except AttributeError: self.skipTest("Skip the test since c2 ops are not registered.") class MyModel(torch.nn.Module): @@ -1768,6 +1780,7 @@ def forward(self, x): torch.jit.trace(Mod(), (torch.rand(3, 4),)) + @skipIfCrossRef def test_trace_records_names(self): def foo(bar, baz): baz = bar + 3 diff --git a/test/jit/test_types.py b/test/jit/test_types.py index 9fadbedb272b..ca3da3c17c8c 100644 --- a/test/jit/test_types.py +++ b/test/jit/test_types.py @@ -39,7 +39,7 @@ def fn(x: torch.Tensor) -> Tuple[Tuple[torch.Tensor], Dict[str, int]]: expected = fn(x) scripted = torch.jit.script(fn)(x) - self.assertEquals(expected, scripted) + self.assertEqual(expected, scripted) def test_types_as_values(self): def fn(m: torch.Tensor) -> torch.device: diff --git a/test/jit/test_typing.py b/test/jit/test_typing.py index e2800e0119e9..e0932d40ebde 100644 --- a/test/jit/test_typing.py +++ b/test/jit/test_typing.py @@ -591,4 +591,4 @@ def foo(x): with self.assertRaisesRegex(RuntimeError, r'aka NamedTuple\(logits, aux_logits2, aux_logits1\)'): - out = foo(_GoogLeNetOutputs(logits=3, aux_logits2=4, aux_logits1=5)) + out = foo(_GoogLeNetOutputs(logits="3", aux_logits2="4", aux_logits1="5")) diff --git a/test/jit/test_upgraders.py b/test/jit/test_upgraders.py index 8b180c43b989..ab1ee534531f 100644 --- a/test/jit/test_upgraders.py +++ b/test/jit/test_upgraders.py @@ -133,11 +133,53 @@ def test_func(): traced_func = torch.jit.trace(test_func, ()) buffer = io.BytesIO() torch.jit.save(traced_func, buffer) + + current_flag_value = torch._C._get_version_calculator_flag() + # calculate based on old version + torch._C._calculate_package_version_based_on_upgraders(False) + buffer.seek(0) + loaded_func = torch.jit.load(buffer) + version = self._load_model_version(loaded_func) + self.assertTrue(version == 4) + + # calculate based on new version + torch._C._calculate_package_version_based_on_upgraders(True) buffer.seek(0) loaded_func = torch.jit.load(buffer) version = self._load_model_version(loaded_func) self.assertTrue(version == 4) + # make sure we preserve old behaviour + torch._C._calculate_package_version_based_on_upgraders(current_flag_value) + + @unittest.skipIf(not _is_upgraders_enabled(), "Skipping because upgraders are not enabled") + def test_aten_full_other_variants(self): + def test_func(): + a = torch.full([4, 5, 6], 4, names=["a", "b", "c"], dtype=torch.int64) + return a + + scripted_func = torch.jit.script(test_func) + buffer = io.BytesIO() + torch.jit.save(scripted_func, buffer) + + current_flag_value = torch._C._get_version_calculator_flag() + # calculate based on old version + torch._C._calculate_package_version_based_on_upgraders(False) + buffer.seek(0) + loaded_func = torch.jit.load(buffer) + version = self._load_model_version(loaded_func) + self.assertTrue(version == 5) + + # calculate based on new version + torch._C._calculate_package_version_based_on_upgraders(True) + buffer.seek(0) + loaded_func = torch.jit.load(buffer) + version = self._load_model_version(loaded_func) + self.assertTrue(version == 5) + + # make sure we preserve old behaviour + torch._C._calculate_package_version_based_on_upgraders(current_flag_value) + @unittest.skipIf(not _is_upgraders_enabled(), "Skipping because upgraders are not enabled") + def test_aten_linspace(self): + model_path = pytorch_test_dir + "/jit/fixtures/test_versioned_linspace_v7.ptl" @@ -248,7 +290,7 @@ def
test_aten_div_scalar_at_3(self): torch.jit.save(loaded_model, buffer) buffer.seek(0) version = self._load_model_version(loaded_model) - self.assertTrue(version == 4) + self.assertEqual(version, 4) loaded_model_twice = torch.jit.load(buffer) self.assertEqual(loaded_model(torch.Tensor([5.0, 3.0]), 2.0), diff --git a/test/jit/test_with.py b/test/jit/test_with.py index b56324093ce1..bd09a36c6860 100644 --- a/test/jit/test_with.py +++ b/test/jit/test_with.py @@ -621,7 +621,7 @@ def with_rf(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: function_events = p.function_events # Event with name "foo" should be recorded. rf_events = [evt for evt in function_events if evt.name == "foo"] - self.assertTrue(len(rf_events), 1) + self.assertEqual(len(rf_events), 1) rf_event = rf_events[0] child_events = rf_event.cpu_children # Ensure we find nested record_function event diff --git a/test/jit_hooks/CMakeLists.txt b/test/jit_hooks/CMakeLists.txt index be29bb463390..546a3040f49b 100644 --- a/test/jit_hooks/CMakeLists.txt +++ b/test/jit_hooks/CMakeLists.txt @@ -2,6 +2,10 @@ cmake_minimum_required(VERSION 3.1 FATAL_ERROR) project(jit_hooks) +if(USE_ROCM) +include(utils) +include(LoadHIP) +endif() find_package(Torch REQUIRED) add_executable(test_jit_hooks test_jit_hooks.cpp) diff --git a/tools/codegen/__init__.py b/test/lazy/__init__.py similarity index 100% rename from tools/codegen/__init__.py rename to test/lazy/__init__.py diff --git a/test/lazy/test_bindings.py b/test/lazy/test_bindings.py new file mode 100644 index 000000000000..57151d408560 --- /dev/null +++ b/test/lazy/test_bindings.py @@ -0,0 +1,7 @@ +# Owner(s): ["oncall: jit"] + +import torch._lazy.metrics + +def test_metrics(): + names = torch._lazy.metrics.counter_names() + assert len(names) == 0, f"Expected no counter names, but got {names}" diff --git a/test/lazy/test_extract_compiled_graph.py b/test/lazy/test_extract_compiled_graph.py new file mode 100644 index 000000000000..f4152d0af68b --- /dev/null +++ b/test/lazy/test_extract_compiled_graph.py @@ -0,0 +1,195 @@ +# Owner(s): ["oncall: jit"] + +import unittest + +from torch._lazy.ts_backend import init as init_ts_backend +init_ts_backend() +from torch._lazy import config +from torch._lazy.extract_compiled_graph import extract_compiled_graph +import torch +from torch import nn +import dis +import inspect +from torch import fx +import re +from contextlib import contextmanager +import copy + +class ModuleConstScale(nn.Module): + def __init__(self): + super(ModuleConstScale, self).__init__() + + def forward(self, a): + return a * 2 + +class ModuleSub(nn.Module): + def __init__(self): + super(ModuleSub, self).__init__() + + def forward(self, a, b): + return a - b + +class ModuleAddcmul(nn.Module): + """ + addcmul function takes a at::Scalar which results in a special TSData containing a Scalar rather than a Tensor. + """ + def __init__(self): + super(ModuleAddcmul, self).__init__() + + def forward(self, a, b, c): + return torch.addcmul(a, b, c, value=5) + +class ModuleReturnMulti(nn.Module): + def __init__(self): + super(ModuleReturnMulti, self).__init__() + + def forward(self, a, b): + return (b + 1, a - 1) + +# The default fx tracer will convert torch.randn to a constant.. We may need +# a custom tracer. 
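As the comment above notes, torch.fx's default tracer executes calls like `torch.randn` eagerly when none of their inputs are `Proxy` objects, so the resulting tensor is baked into the traced graph as a constant; that is why the `ModuleEagerTensor` and `ModuleReturnEagerTensorOnDefaultDevice` cases below stay commented out. A minimal sketch of that behavior, using only the public `torch.fx` API (the `EagerRandn` module is illustrative and not part of this test suite):

```python
import torch
from torch import fx, nn

class EagerRandn(nn.Module):
    def forward(self, a):
        b = torch.randn(2, 3)  # runs eagerly at trace time: no Proxy inputs
        return a + b

gm = fx.symbolic_trace(EagerRandn())
# The graph contains a get_attr node holding the baked-in random tensor
# instead of a call to torch.randn.
print(gm.graph)
```

A custom `fx.Tracer` would be needed to keep such calls symbolic, which is what the comment above alludes to.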
+# class ModuleEagerTensor(nn.Module): +# def __init__(self): +# super(ModuleEagerTensor, self).__init__() +# +# def forward(self, a): +# b = torch.randn(2, 3, device="cpu") # eager device +# return a + b + +# The module was planned to cover the case that an Fx graph returns an eager +# tensor on the default device. It's harder than ModuleEagerTensor because +# we cannot just override the device argument to Lazy since there is no +# explicit device argument. +# +# Unfortunately, the default fx tracer converts the return value of the forward +# method to a constant. Commented out for now. +# class ModuleReturnEagerTensorOnDefaultDevice(nn.Module): +# def __init__(self): +# super(ModuleReturnEagerTensorOnDefaultDevice, self).__init__() +# +# def forward(self): +# return torch.tensor((2, 3), dtype=torch.float32) + +class ModuleReturnDupTensor(nn.Module): + """ + Handle the corner case that the same tensor appears multiple times in the + returned tuple. torchbench models like drq hit this corner case when running + through torchdynamo. + """ + def __init__(self): + super(ModuleReturnDupTensor, self).__init__() + + def forward(self, a, b): + c = a + b + return a - b, c, a + 1, c + +class ModuleInplaceUpdate(nn.Module): + def __init__(self): + super(ModuleInplaceUpdate, self).__init__() + + def forward(self, a, b): + a.sub_(b) + return b - 1, b + 1 + +@contextmanager +def force_fallback_ctx_mgr(fallback_op): + oldconfig = config.get_force_fallback() + config.set_force_fallback(fallback_op) + try: + yield None + finally: + config.set_force_fallback(oldconfig) + +@contextmanager +def nop_ctx_mgr(): + try: + yield None + finally: + pass + +def gen_rand_args(mod): + args = [] + for _ in range(len(inspect.signature(mod.forward).parameters)): + args.append(torch.randn(2, 3)) + return args + +def allclose(expected, actual): + def unwrap(cont): + if isinstance(cont, (list, tuple)) and len(cont) == 1: + return cont[0] + return cont + expected = unwrap(expected) + actual = unwrap(actual) + + if isinstance(expected, torch.Tensor) and isinstance(actual, torch.Tensor): + return torch.allclose(expected, actual) + elif isinstance(expected, (tuple, list)) and isinstance(actual, (tuple, list)): + return len(expected) == len(actual) and all(torch.allclose(a, b) for a, b in zip(expected, actual)) + else: + raise RuntimeError("Unexpected types") + +def verify_reusing_compiled_graph(mod, exception_msg_pattern, ncase=10): + args = gen_rand_args(mod) + out = mod(*args) + + dis.dis(mod.forward) + + try: + optimized_mod = extract_compiled_graph(fx.symbolic_trace(mod), args) + except RuntimeError as e: + if exception_msg_pattern is None: + raise e # reraise the exception + exception_message = str(e) + if not re.search(exception_msg_pattern, exception_message): + raise RuntimeError(f"Exception message does not match the required pattern: {exception_message}") + else: + # We are done for the test case that expects an exception + return + + if exception_msg_pattern is not None: + raise RuntimeError(f"Expected an exception matching pattern {exception_msg_pattern}") + print("return value of optimized_mod", optimized_mod(*args)) + + # check correctness + failed_index = [] + for i in range(ncase): + rand_args = gen_rand_args(mod) + rand_args_copy = copy.deepcopy(rand_args) + expected = mod(*rand_args) + actual = optimized_mod(*rand_args_copy) + + if not allclose(expected, actual): + print(f"Incorrect results. 
expected {expected}, actual {actual}") + failed_index.append(i) + continue + + # make sure arguments match after calling the model forward method to handle inplace + # updates. + if not allclose(rand_args, rand_args_copy): + print(f"Incorrect updated arguments. expected {rand_args}, actual {rand_args_copy}") + failed_index.append(i) + continue + + if len(failed_index) > 0: + raise RuntimeError(f"Failed {len(failed_index)}/{ncase} cases") + +def maketest(module_cls, exception_msg_pattern=None, ctxmgr=None): + def wrapper(self): + nonlocal ctxmgr + if not ctxmgr: + ctxmgr = nop_ctx_mgr() + with ctxmgr: + verify_reusing_compiled_graph(module_cls(), exception_msg_pattern) + + return wrapper + +class OptimizeTest(unittest.TestCase): + test_sub = maketest(ModuleSub) + # Same as test_sub but force aten::sub to fallback + # We expect an exception caught because of the LTC fallback. + test_ltc_fallback = maketest(ModuleSub, exception_msg_pattern="fallback.*aten::sub", ctxmgr=force_fallback_ctx_mgr("aten::sub")) + test_const_scale = maketest(ModuleConstScale) + test_addcmul = maketest(ModuleAddcmul) + test_return_multi = maketest(ModuleReturnMulti) + test_return_dup_tensor = maketest(ModuleReturnDupTensor) + test_inplace_update = maketest(ModuleInplaceUpdate) diff --git a/test/lazy/test_reuse_ir.py b/test/lazy/test_reuse_ir.py new file mode 100644 index 000000000000..9a8c1400a4a1 --- /dev/null +++ b/test/lazy/test_reuse_ir.py @@ -0,0 +1,106 @@ +# Owner(s): ["oncall: jit"] + +import torch +import torch._lazy +import torch._lazy.config +import torch._lazy.ir_cache +import torch._lazy.ts_backend +import torch._lazy.metrics as metrics +from torch.testing._internal.common_utils import IS_WINDOWS, run_tests, TestCase +import os +import unittest + +torch._lazy.ts_backend.init() +torch._lazy.config.set_reuse_ir(True) + +def get_test_device(): + return 'cuda' if 'LTC_TS_CUDA' in os.environ else 'cpu' + +@unittest.skipIf(IS_WINDOWS, "To be fixed") +class TestLazyReuseIr(TestCase): + def testAdd(self): + device = get_test_device() + x = torch.randn(2, 3, 4, device=device) + y = torch.randn(2, 3, 4, device=device) + z = torch.zeros(2, 3, 4, device=device) + + device = 'lazy' + x_lazy = x.detach().clone().to(device=device) + y_lazy = y.detach().clone().to(device=device) + z_lazy = z.detach().clone().to(device=device) + + for i in range(10): + z += (x + y) + + for i in range(10): + z_lazy += (x_lazy + y_lazy) + torch._lazy.mark_step() + + torch.testing.assert_close(z.cpu(), z_lazy.cpu()) + assert metrics.counter_value("IrNodeReused_torch::lazy::AddTensor") >= 16 + metrics.reset() + torch._lazy.ir_cache.reset() + + def testAddSub(self): + device = get_test_device() + x = torch.randn(2, 3, 4, device=device) + y = torch.randn(2, 3, 4, device=device) + z = torch.zeros(2, 3, 4, device=device) + + device = 'lazy' + x_lazy = x.detach().clone().to(device=device) + y_lazy = y.detach().clone().to(device=device) + z_lazy = z.detach().clone().to(device=device) + + for i in range(10): + if i < 5: + z += (x + y) + else: + z += (x - y) + + for i in range(10): + if i < 5: + z_lazy += (x_lazy + y_lazy) + else: + z_lazy += (x_lazy - y_lazy) + torch._lazy.mark_step() + + torch.testing.assert_close(z.cpu(), z_lazy.cpu()) + assert metrics.counter_value("IrNodeReused_torch::lazy::AddTensor") >= 10 + assert metrics.counter_value("IrNodeReused_torch::lazy::AddTensor") >= 4 + metrics.reset() + torch._lazy.ir_cache.reset() + + def testAddSubFallback(self): + torch._lazy.config.set_force_fallback("aten::sub") + device = get_test_device() + x = 
torch.randn(2, 3, 4, device=device) + y = torch.randn(2, 3, 4, device=device) + z = torch.zeros(2, 3, 4, device=device) + + device = 'lazy' + x_lazy = x.detach().clone().to(device=device) + y_lazy = y.detach().clone().to(device=device) + z_lazy = z.detach().clone().to(device=device) + + for i in range(10): + if i < 5: + z += (x + y) + else: + z += (x - y) + + for i in range(10): + if i < 5: + z_lazy += (x_lazy + y_lazy) + else: + z_lazy += (x_lazy - y_lazy) + torch._lazy.mark_step() + + torch.testing.assert_close(z.cpu(), z_lazy.cpu()) + assert metrics.counter_value("IrNodeReused_torch::lazy::AddTensor") >= 11 + metrics.reset() + torch._lazy.ir_cache.reset() + torch._lazy.config.set_force_fallback("") + +if __name__ == '__main__': + run_tests() diff --git a/test/lazy/test_ts_opinfo.py b/test/lazy/test_ts_opinfo.py new file mode 100644 index 000000000000..c14483cf6308 --- /dev/null +++ b/test/lazy/test_ts_opinfo.py @@ -0,0 +1,232 @@ +# Owner(s): ["oncall: jit"] + +from typing import Sequence +import torch +import functools + +from torch.testing._internal.common_utils import run_tests, TestCase +from torch.testing._internal.jit_utils import JitTestCase +from torch.testing._internal.common_methods_invocations import op_db +from torch.testing._internal.common_device_type import ops, instantiate_device_type_tests +import torch._lazy +import torch._lazy.config +import torch._lazy.metrics +import torch._lazy.ir_cache +import torch._lazy.ts_backend +import itertools +import yaml +import os +import pathlib + +torch._lazy.ts_backend.init() + +def get_test_device(): + return 'cuda' if 'LTC_TS_CUDA' in os.environ else 'cpu' + +def remove_suffixes(l): + return [x.split(".")[0] for x in l] + +def init_lists(): + path_to_script = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) + TS_NATIVE_FUNCTIONS_PATH = path_to_script.parent.parent / "aten/src/ATen/native/ts_native_functions.yaml" + with open(TS_NATIVE_FUNCTIONS_PATH) as f: + yaml_ts = yaml.load(f, yaml.Loader) + LAZY_OPS_LIST = set(remove_suffixes(itertools.chain(yaml_ts["full_codegen"], yaml_ts["supported"], yaml_ts["autograd"]))) + FALLBACK_LIST = set(["clamp"]) + SKIP_RUNTIME_ERROR_LIST = set([ + 'index_select', # Empty output_sizes is not supported + 'clone', # is clone decomposed? + + # General ASAN Failure due to related to generating bool values. + # https://github.com/pytorch/pytorch/issues/74519 + # https://github.com/pytorch/pytorch/issues/63034 + 'nonzero', # ASAN failure (paste: P501906539) + 'all', # ASAN failure + 'any', # ASAN failure + 'logdet', # ASAN failure + ]) + SKIP_INCORRECT_RESULTS_LIST = set([ + 'squeeze', # Value out of range + 't', # Value out of range + 'transpose', # Value out of range + 'bernoulli', # incorrect results + 'pow', # incorrect results + 'addcdiv', # incorrect results (on CI not locally?) 
+ ]) + + return (LAZY_OPS_LIST, FALLBACK_LIST, SKIP_RUNTIME_ERROR_LIST, SKIP_INCORRECT_RESULTS_LIST) + +(LAZY_OPS_LIST, FALLBACK_LIST, SKIP_RUNTIME_ERROR_LIST, SKIP_INCORRECT_RESULTS_LIST) = init_lists() + +torch.manual_seed(42) + +def clone_move(t): + dev = 'lazy' + copy_t = t.detach().clone().requires_grad_(True).to(device=dev) + return copy_t + +class TestLazyTensor(JitTestCase): + def testConvolutionBackward(self): + test_device = get_test_device() + inp = torch.rand(1, 3, 128, 128, device=test_device, requires_grad=True) + inp_copy = clone_move(inp) + grad = torch.rand(1, 32, 121, 121, device=test_device) # no requires_grad + grad_copy = clone_move(grad) + weight = torch.rand(32, 3, 8, 8, device=test_device, requires_grad=True) + weight_copy = clone_move(weight) + bias = torch.rand(32, device=test_device, requires_grad=True) + bias_copy = clone_move(bias) + + # run eager + conv_out = torch.nn.functional.conv2d(inp, weight, bias) + (inp_grad, weight_grad, bias_grad) = torch.autograd.grad([conv_out], [inp, weight, bias], [grad]) + + # run lazy + conv_copy_out = torch.nn.functional.conv2d(inp_copy, weight_copy, bias_copy) + (inp_copy_grad, weight_copy_grad, bias_copy_grad) = torch.autograd.grad( + [conv_copy_out], [inp_copy, weight_copy, bias_copy], [grad_copy]) + + # check numerics + torch.testing.assert_close(bias_copy_grad.cpu(), bias_grad.cpu()) + + torch.testing.assert_close(weight_copy_grad.cpu(), weight_grad.cpu()) + torch.testing.assert_close(inp_copy_grad.cpu(), inp_grad.cpu()) + +class TestLazyOpInfo(TestCase): + + @ops([op for op in op_db if op.name in LAZY_OPS_LIST and op.name not in SKIP_RUNTIME_ERROR_LIST], allowed_dtypes=(torch.float,)) + def test_dispatched_to_lazy(self, device, dtype, op): + def get_name(op): + l = [op.name] + if op.variant_test_name != '': + l.append(op.variant_test_name) + return '.'.join(l) + + global FALLBACK_LIST + samples = op.sample_inputs("lazy", dtype, requires_grad=False) + sample = list(samples)[0] + args = [sample.input] + list(sample.args) + kwargs = sample.kwargs + torch._lazy.mark_step() + torch._lazy.wait_device_ops() + torch._lazy.metrics.reset() + + r = op(*args, **kwargs) + torch._lazy.mark_step() + torch._lazy.wait_device_ops() + prefix = "aten" if op.name in FALLBACK_LIST else "lazy" + found = f"{prefix}::{op.name}" in remove_suffixes(torch._lazy.metrics.counter_names()) + # check aliases + if not found: + for alias in op.aliases: + alias_found = f"{prefix}::{alias.name}" in remove_suffixes(torch._lazy.metrics.counter_names()) + found = found or alias_found + if found: + break + self.assertTrue(found) + + + @ops([op for op in op_db if op.name in LAZY_OPS_LIST and op.name not in SKIP_RUNTIME_ERROR_LIST | SKIP_INCORRECT_RESULTS_LIST], allowed_dtypes=(torch.float,)) # noqa: B950 + def test_correctness(self, device, dtype, op): + + test_device = get_test_device() + + def clone_to_device(input, dev): + if isinstance(input, torch.Tensor): + return input.detach().clone().to(device=dev) + if isinstance(input, Sequence) and not isinstance(input, str): + return tuple(map(functools.partial(clone_to_device, dev=dev), input)) + return input + + def assert_allclose_rec(t): + a, b = t + self.assertEqual(type(a), type(b)) + if isinstance(a, torch.Tensor): + self.assertTrue(torch.allclose(clone_to_device(a, test_device), b, atol=1e-4)) + + if isinstance(a, Sequence): + map(assert_allclose_rec, zip(a, b)) + + samples = op.sample_inputs("lazy", dtype, requires_grad=False) + for sample in samples: + args = [sample.input] + list(sample.args) + kwargs = 
sample.kwargs + copy_args = clone_to_device(args, test_device) + + r_exp = op(*copy_args, **kwargs) + r_actual = op(*args, **kwargs) + + assert_allclose_rec((r_actual, r_exp)) + + @ops([op for op in op_db if op.name in LAZY_OPS_LIST and op.name not in SKIP_RUNTIME_ERROR_LIST | SKIP_INCORRECT_RESULTS_LIST], allowed_dtypes=(torch.float,)) # noqa: B950 + def test_correctness_with_reusing_ir(self, device, dtype, op): + torch._lazy.config.set_reuse_ir(True) + test_device = get_test_device() + + def clone_to_device(input, dev): + if isinstance(input, torch.Tensor): + return input.detach().clone().to(device=dev) + if isinstance(input, Sequence) and not isinstance(input, str): + return tuple(map(functools.partial(clone_to_device, dev=dev), input)) + return input + + def assert_allclose_rec(t): + a, b = t + self.assertEqual(type(a), type(b)) + if isinstance(a, torch.Tensor): + self.assertTrue(torch.allclose(clone_to_device(a, test_device), b, atol=1e-4)) + + if isinstance(a, Sequence): + map(assert_allclose_rec, zip(a, b)) + + samples = op.sample_inputs("lazy", dtype, requires_grad=False) + for sample in samples: + args = [sample.input] + list(sample.args) + kwargs = sample.kwargs + copy_args = clone_to_device(args, test_device) + + r_exp = op(*copy_args, **kwargs) + r_actual = op(*args, **kwargs) + + torch._lazy.mark_step() + assert_allclose_rec((r_actual, r_exp)) + + torch._lazy.ir_cache.reset() + torch._lazy.config.set_reuse_ir(False) + + + +# TODO: after we move to master, add Lazy as a new Device here: +# https://github.com/pytorch/pytorch/blob/master/torch/testing/_internal/common_device_type.py#L532 +instantiate_device_type_tests(TestLazyOpInfo, globals(), only_for="cpu") + + +class TestLazyDynamicOps(TestCase): + @classmethod + def setUpClass(cls) -> None: + # Setup the dynamic shape mode + cls.old_ssa_mode = torch._C._lazy._get_symbolic_shape_mode() + torch._C._lazy._set_symbolic_shape_mode(True) + return super().setUpClass() + + @classmethod + def tearDownClass(cls) -> None: + torch._C._lazy._set_symbolic_shape_mode(cls.old_ssa_mode) + return super().tearDownClass() + + def test_nonzero_dynamic(self): + # Test that nonzero gives upper bounds sizes when symbolic shape mode is enabled + test_device = get_test_device() + x1 = torch.tensor([[0, 1.0, 2.0], [3.0, 0, 0]], device=test_device, requires_grad=True) + x1_lazy = clone_move(x1) + x2_lazy = torch.nonzero(x1_lazy) + print(x2_lazy.size()) + self.assertEqual(tuple(x2_lazy.size()), (6, 2)) + + # We should still be able to instantiate it and get the actual result + x2_eager = x2_lazy.cpu() + self.assertEqual(tuple(x2_eager.size()), (3, 2)) + + +if __name__ == '__main__': + run_tests() diff --git a/test/load_torchscript_model.py b/test/load_torchscript_model.py new file mode 100644 index 000000000000..dc8d4159d7ff --- /dev/null +++ b/test/load_torchscript_model.py @@ -0,0 +1,6 @@ +import sys +import torch + +if __name__ == '__main__': + print(torch.jit.load(sys.argv[1])) + sys.exit(0) diff --git a/test/mobile/lightweight_dispatch/CMakeLists.txt b/test/mobile/lightweight_dispatch/CMakeLists.txt new file mode 100644 index 000000000000..5ab3232f6a44 --- /dev/null +++ b/test/mobile/lightweight_dispatch/CMakeLists.txt @@ -0,0 +1,23 @@ +cmake_minimum_required(VERSION 3.1) + +set(TORCH_ROOT ${CMAKE_CURRENT_LIST_DIR}/../../..) 
+set(TEST_ROOT ${TORCH_ROOT}/test/mobile/lightweight_dispatch) + +add_executable(test_codegen_unboxing + ${TEST_ROOT}/test_lightweight_dispatch.cpp + ${TEST_ROOT}/test_codegen_unboxing.cpp +) + +target_include_directories(test_codegen_unboxing PRIVATE ${ATen_CPU_INCLUDE}) + +target_compile_definitions(test_codegen_unboxing PRIVATE USE_GTEST) + +set(TEST_UNBOXING_DEPENDENCIES torch gtest) + +target_link_libraries(test_codegen_unboxing PRIVATE + ${TEST_UNBOXING_DEPENDENCIES} +) + +if(INSTALL_TEST) + install(TARGETS test_codegen_unboxing DESTINATION bin) +endif() diff --git a/test/mobile/lightweight_dispatch/build.sh b/test/mobile/lightweight_dispatch/build.sh new file mode 100755 index 000000000000..13de97d55829 --- /dev/null +++ b/test/mobile/lightweight_dispatch/build.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# This script should be called from .jenkins/pytorch/build.sh. Assuming we are at pytorch source root directory. + +# Required environment variable: $BUILD_ENVIRONMENT +# (This is set by default in the Docker images we build, so you don't +# need to set it yourself. + +set -ex -o pipefail + +# shellcheck disable=SC2034 +echo "Build lite interpreter with lightweight dispatch." + +CUSTOM_TEST_ARTIFACT_BUILD_DIR=${CUSTOM_TEST_ARTIFACT_BUILD_DIR:-${PWD}/../} +mkdir -pv "${CUSTOM_TEST_ARTIFACT_BUILD_DIR}" + +BUILD_LIBTORCH_PY="$PWD/tools/build_libtorch.py" +TEST_SRC_ROOT="$PWD/test/mobile/lightweight_dispatch" + +pushd "$CUSTOM_TEST_ARTIFACT_BUILD_DIR" + +# prepare test +python "$TEST_SRC_ROOT/tests_setup.py" setup + +export USE_DISTRIBUTED=0 +export USE_LIGHTWEIGHT_DISPATCH=1 +export STATIC_DISPATCH_BACKEND="CPU" +export BUILD_LITE_INTERPRETER=1 + +python "${BUILD_LIBTORCH_PY}" +ret=$? + +if [ "$ret" -ne 0 ]; then + echo "Lite interpreter build failed!" + exit "$ret" +fi + + +# run test +if ! build/bin/test_codegen_unboxing; then + echo "test_codegen_unboxing has failure!" + exit 1 +fi + +# shutdown test +python "$TEST_SRC_ROOT/tests_setup.py" shutdown + +# run lite interpreter tests +if ! build/bin/test_lite_interpreter_runtime; then + echo "test_lite_interpreter_runtime has failure!" + exit 1 +fi + +popd + +exit 0 diff --git a/test/mobile/lightweight_dispatch/test_codegen_unboxing.cpp b/test/mobile/lightweight_dispatch/test_codegen_unboxing.cpp new file mode 100644 index 000000000000..07a845d6008b --- /dev/null +++ b/test/mobile/lightweight_dispatch/test_codegen_unboxing.cpp @@ -0,0 +1,219 @@ +#include +#include +#include +#include +#include +#include +#include +// Cover codegen'd unboxing logic for these types: +//'Device', +//'Device?', +//'Dimname', +//'Dimname[1]', +//'Dimname[]', +//'Dimname[]?', +//'Generator?', +//'Layout?', +//'MemoryFormat', +//'MemoryFormat?', +//'Scalar', +//'Scalar?', +//'ScalarType', +//'ScalarType?', +//'Scalar[]', +//'Storage', +//'Stream', +//'Tensor', +//'Tensor(a!)', +//'Tensor(a!)[]', +//'Tensor(a)', +//'Tensor(b!)', +//'Tensor(c!)', +//'Tensor(d!)', +//'Tensor?', +//'Tensor?[]', +//'Tensor[]', +//'bool', +//'bool?', +//'bool[2]', +//'bool[3]', +//'bool[4]', +//'float', +//'float?', +//'float[]?', +//'int', +//'int?', +//'int[1]', +//'int[1]?', +//'int[2]', +//'int[2]?', +//'int[3]', +//'int[4]', +//'int[5]', +//'int[6]', +//'int[]', +//'int[]?', +//'str', +//'str?' +namespace torch { +namespace jit { +namespace mobile { +// covers int[], ScalarType?, Layout?, Device?, bool? 
+TEST(LiteInterpreterTest, Ones) { + // Load check in model: ones.ptl + auto testModelFile = "ones.ptl"; + + // class Model(torch.nn.Module): + // def forward(self, x: int): + // a = torch.ones([3, x], dtype=torch.int64, layout=torch.strided, device="cpu") + // return a + Module bc = _load_for_mobile(testModelFile); + std::vector input{c10::IValue(4)}; + const auto result = bc.forward(input); + ASSERT_EQ(result.toTensor().size(0), 3); + ASSERT_EQ(result.toTensor().size(1), 4); +} + +TEST(LiteInterpreterTest, Index) { + // Load check in model: index.ptl + auto testModelFile = "index.ptl"; + + // class Model(torch.nn.Module): + // def forward(self, index): + // a = torch.zeros(2, 2) + // a[0][1] = 1 + // a[1][0] = 2 + // a[1][1] = 3 + // return a[index] + Module bc = _load_for_mobile(testModelFile); + int64_t ind_1 = 0; + + const auto result_1 = bc.forward({at::tensor(ind_1)}); + + at::Tensor expected = at::empty({1, 2}, c10::TensorOptions(c10::ScalarType::Float)); + expected[0][0] = 0; + expected[0][1] = 1; + + AT_ASSERT(result_1.toTensor().equal(expected)); +} + +TEST(LiteInterpreterTest, Gradient) { + // Load check in model: gradient.ptl + auto testModelFile = "gradient.ptl"; + + // class Model(torch.nn.Module): + // def forward(self, a: int): + // values = torch.tensor([4., 1., 1., 16.], ) + // if a == 0: + // return torch.gradient(values, spacing=torch.scalar_tensor(2., dtype=torch.float64)) + // elif a == 1: + // return torch.gradient(values, spacing=[torch.tensor(1.).item()]) + Module bc = _load_for_mobile(testModelFile); + + const auto result_1 = bc.forward({0}); + at::Tensor expected_1 = at::tensor({-1.5, -0.75, 3.75, 7.5}, c10::TensorOptions(c10::ScalarType::Float)); + AT_ASSERT(result_1.toList().get(0).toTensor().equal(expected_1)); + + const auto result_2 = bc.forward({1}); + at::Tensor expected_2 = at::tensor({-3.0, -1.5, 7.5, 15.0}, c10::TensorOptions(c10::ScalarType::Float)); + AT_ASSERT(result_2.toList().get(0).toTensor().equal(expected_2)); +} + +TEST(LiteInterpreterTest, Upsample) { + // Load check in model: upsample.ptl + auto testModelFile = "upsample.ptl"; + + // model = torch.nn.Upsample(scale_factor=(2.0,), mode="linear") + Module bc = _load_for_mobile(testModelFile); + + const auto result_1 = bc.forward({at::ones({1, 2, 3})}); + at::Tensor expected_1 = at::ones({1, 2, 6}, c10::TensorOptions(c10::ScalarType::Float)); + AT_ASSERT(result_1.toTensor().equal(expected_1)); +} + +TEST(LiteInterpreterTest, IndexTensor) { + // Load check in model: Index_Tensor.ptl + auto testModelFile = "index_Tensor.ptl"; + + // class Model(torch.nn.Module): + // def forward(self, index): + // values = torch.tensor([4., 1., 1., 16.], ) + // return values[[index, torch.tensor(0)]] + Module bc = _load_for_mobile(testModelFile); + const auto result_1 = bc.forward({at::tensor({1}, c10::TensorOptions(c10::ScalarType::Long))}); + + at::Tensor expected_1 = at::tensor({1.}, c10::TensorOptions(c10::ScalarType::Float)); + AT_ASSERT(result_1.toTensor().equal(expected_1)); +} + +TEST(LiteInterpreterTest, Conv2d) { + // Load check in model: conv2d.ptl + auto testModelFile = "conv2d.ptl"; + + // model = torch.nn.Conv2d(1, 2, (2, 2), stride=(1, 1), padding=(1, 1)) + Module bc = _load_for_mobile(testModelFile); + const auto result_1 = bc.forward({at::ones({1, 1, 1, 1})}); + + ASSERT_EQ(result_1.toTensor().sizes(), c10::IntArrayRef ({1,2,2,2})); +} + +TEST(LiteInterpreterTest, AddTensor) { + // Load check in model: add_Tensor.ptl + auto testModelFile = "add_Tensor.ptl"; + + // class Model(torch.nn.Module): + 
// def forward(self, a): + // values = torch.ones(size=[2, 3], names=['N', 'C']) + // values[0][0] = a[0] + // return values + Module bc = _load_for_mobile(testModelFile); + const auto result_1 = bc.forward({at::tensor({1, 2, 3}, c10::TensorOptions(c10::ScalarType::Long))}); + + at::Tensor expected_1 = at::tensor({2, 3, 4}, c10::TensorOptions(c10::ScalarType::Long)); + AT_ASSERT(result_1.toTensor().equal(expected_1)); +} + +TEST(LiteInterpreterTest, DivideTensor) { + // Load check in model: add_Tensor.ptl + auto testModelFile = "divide_Tensor.ptl"; + + // class Model(torch.nn.Module): + // def forward(self, b): + // a = torch.tensor(3, dtype=torch.int64) + // out = torch.empty(size=[1], dtype=torch.float) + // torch.div(b, a, out=out) + // return [torch.div(b, a, rounding_mode='trunc'), out] + Module bc = _load_for_mobile(testModelFile); + const auto result_1 = bc.forward({at::tensor({-12}, c10::TensorOptions(c10::ScalarType::Long))}); + + at::Tensor expected_1 = at::tensor({-4}, c10::TensorOptions(c10::ScalarType::Long)); + at::Tensor expected_2 = at::tensor({-4.}, c10::TensorOptions(c10::ScalarType::Float)); + AT_ASSERT(result_1.toList().get(0).toTensor().equal(expected_1)); + AT_ASSERT(result_1.toList().get(1).toTensor().equal(expected_2)); +} + +TEST(LiteInterpreterTest, MultipleOps) { + // Load check in model: multiple_ops.ptl + auto testModelFile = "multiple_ops.ptl"; + + // class Model(torch.nn.Module): + // def __init__(self): + // super(Model, self).__init__() + // self.ops = torch.nn.Sequential( + // torch.nn.ReLU(), + // torch.nn.Flatten(), + // ) + // def forward(self, x): + // x[1] = -2 + // return self.ops(x) + + Module bc = _load_for_mobile(testModelFile); + auto b = at::ones({2, 2, 2, 2}); + const auto result = bc.forward({b}); + + at::Tensor expected = torch::tensor({{1, 1, 1, 1, 1, 1, 1, 1}, {0, 0, 0, 0, 0, 0, 0, 0}}, c10::TensorOptions(c10::ScalarType::Float)); + AT_ASSERT(result.toTensor().equal(expected)); +} +} // namespace mobile +} // namespace jit +} // namespace torch diff --git a/test/mobile/lightweight_dispatch/test_lightweight_dispatch.cpp b/test/mobile/lightweight_dispatch/test_lightweight_dispatch.cpp new file mode 100644 index 000000000000..5c5cabccaaaa --- /dev/null +++ b/test/mobile/lightweight_dispatch/test_lightweight_dispatch.cpp @@ -0,0 +1,18 @@ +#include + +std::string add_negative_flag(const std::string& flag) { + std::string filter = ::testing::GTEST_FLAG(filter); + if (filter.find('-') == std::string::npos) { + filter.push_back('-'); + } else { + filter.push_back(':'); + } + filter += flag; + return filter; +} +int main(int argc, char* argv[]) { + ::testing::InitGoogleTest(&argc, argv); + ::testing::GTEST_FLAG(filter) = add_negative_flag("*_CUDA:*_MultiCUDA"); + + return RUN_ALL_TESTS(); +} diff --git a/test/mobile/lightweight_dispatch/tests_setup.py b/test/mobile/lightweight_dispatch/tests_setup.py new file mode 100644 index 000000000000..91af29796b9d --- /dev/null +++ b/test/mobile/lightweight_dispatch/tests_setup.py @@ -0,0 +1,203 @@ +import os +import sys + +import torch + + +class Setup(object): + def setup(self): + raise NotImplementedError() + + def shutdown(self): + raise NotImplementedError() + + +class FileSetup(object): + path = None + + def shutdown(self): + if os.path.exists(self.path): + os.remove(self.path) + pass + + +class ModelWithDTypeDeviceLayoutPinMemory(FileSetup): + path = 'ones.ptl' + + def setup(self): + class Model(torch.nn.Module): + def forward(self, x: int): + a = torch.ones(size=[3, x], dtype=torch.int64, 
layout=torch.strided, device="cpu", pin_memory=False) + return a + + model = Model() + + # Script the model and save + script_model = torch.jit.script(model) + script_model._save_for_lite_interpreter(self.path) + + +class ModelWithTensorOptional(FileSetup): + path = 'index.ptl' + + def setup(self): + class Model(torch.nn.Module): + def forward(self, index): + a = torch.zeros(2, 2) + a[0][1] = 1 + a[1][0] = 2 + a[1][1] = 3 + return a[index] + + model = Model() + + # Script the model and save + script_model = torch.jit.script(model) + script_model._save_for_lite_interpreter(self.path) + + +# gradient.scalarrayint(Tensor self, *, Scalar[] spacing, int? dim=None, int edge_order=1) -> Tensor[] +class ModelWithScalarList(FileSetup): + path = 'gradient.ptl' + + def setup(self): + + class Model(torch.nn.Module): + def forward(self, a: int): + values = torch.tensor([4., 1., 1., 16.], ) + if a == 0: + return torch.gradient(values, spacing=torch.scalar_tensor(2., dtype=torch.float64)) + elif a == 1: + return torch.gradient(values, spacing=[torch.tensor(1.).item()]) + + model = Model() + + # Script the model and save + script_model = torch.jit.script(model) + script_model._save_for_lite_interpreter(self.path) + + +# upsample_linear1d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor +class ModelWithFloatList(FileSetup): + path = 'upsample.ptl' + + def setup(self): + model = torch.nn.Upsample(scale_factor=(2.0,), mode="linear", align_corners=False, recompute_scale_factor=True) + + # Script the model and save + script_model = torch.jit.script(model) + script_model._save_for_lite_interpreter(self.path) + + +# index.Tensor(Tensor self, Tensor?[] indices) -> Tensor +class ModelWithListOfOptionalTensors(FileSetup): + path = 'index_Tensor.ptl' + + def setup(self): + class Model(torch.nn.Module): + def forward(self, index): + values = torch.tensor([[4., 1., 1., 16.]]) + return values[torch.tensor(0), index] + + model = Model() + # Script the model and save + script_model = torch.jit.script(model) + script_model._save_for_lite_interpreter(self.path) + + +# conv2d(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, +# int groups=1) -> Tensor +class ModelWithArrayOfInt(FileSetup): + path = 'conv2d.ptl' + + def setup(self): + model = torch.nn.Conv2d(1, 2, (2, 2), stride=(1, 1), padding=(1, 1)) + # Script the model and save + script_model = torch.jit.script(model) + script_model._save_for_lite_interpreter(self.path) + + +# add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor +# ones_like(Tensor self, *, ScalarType?, dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, +# MemoryFormat? 
memory_format=None) -> Tensor +class ModelWithTensors(FileSetup): + path = 'add_Tensor.ptl' + + def setup(self): + class Model(torch.nn.Module): + def forward(self, a): + b = torch.ones_like(a) + return a + b + model = Model() + # Script the model and save + script_model = torch.jit.script(model) + script_model._save_for_lite_interpreter(self.path) + + +class ModelWithStringOptional(FileSetup): + path = 'divide_Tensor.ptl' + + def setup(self): + class Model(torch.nn.Module): + def forward(self, b): + a = torch.tensor(3, dtype=torch.int64) + out = torch.empty(size=[1], dtype=torch.float) + torch.div(b, a, out=out) + return [torch.div(b, a, rounding_mode='trunc'), out] + model = Model() + # Script the model and save + script_model = torch.jit.script(model) + script_model._save_for_lite_interpreter(self.path) + + +class ModelWithMultipleOps(FileSetup): + path = 'multiple_ops.ptl' + + def setup(self): + class Model(torch.nn.Module): + def __init__(self): + super(Model, self).__init__() + self.ops = torch.nn.Sequential( + torch.nn.ReLU(), + torch.nn.Flatten(), + ) + + def forward(self, x): + x[1] = -2 + return self.ops(x) + + model = Model() + # Script the model and save + script_model = torch.jit.script(model) + script_model._save_for_lite_interpreter(self.path) + + +tests = [ + ModelWithDTypeDeviceLayoutPinMemory(), + ModelWithTensorOptional(), + ModelWithScalarList(), + ModelWithFloatList(), + ModelWithListOfOptionalTensors(), + ModelWithArrayOfInt(), + ModelWithTensors(), + ModelWithStringOptional(), + ModelWithMultipleOps(), +] + + +def setup(): + for test in tests: + test.setup() + + +def shutdown(): + for test in tests: + test.shutdown() + + +if __name__ == "__main__": + command = sys.argv[1] + if command == "setup": + setup() + elif command == "shutdown": + shutdown() diff --git a/test/mobile/model_test/README.md b/test/mobile/model_test/README.md new file mode 100644 index 000000000000..49b21051c655 --- /dev/null +++ b/test/mobile/model_test/README.md @@ -0,0 +1,87 @@ +## What is this? +Python scripts in this folder are used to generate lite interpreter models for Android and iOS simulator tests. The goal of these tests is to detect changes that would break existing mobile models used in production (usually they are generated by earlier PyTorch versions). 
These scripts are based on PyTorch public API (https://pytorch.org/docs/stable/), and are grouped in a similar way: +- math_ops (https://pytorch.org/docs/stable/torch.html#math-operations) + - pointwise_ops + - reduction_ops + - comparison_ops + - spectral_ops + - other_math_ops + - blas_lapack_ops +- sampling_ops (https://pytorch.org/docs/stable/torch.html#random-sampling) +- tensor ops (https://pytorch.org/docs/stable/torch.html#tensors) + - tensor_general_ops + - tensor_creation_ops + - tensor_indexing_ops + - tensor_typing_ops + - tensor_view_ops +- nn ops (https://pytorch.org/docs/stable/nn.html) + - convolution_ops + - pooling_ops + - padding_ops + - activation_ops + - normalization_ops + - recurrent_ops + - transformer_ops + - linear_ops + - dropout_ops + - sparse_ops + - distance_function_ops + - loss_function_ops + - vision_function_ops + - shuffle_ops + - nn_utils_ops +- quantization ops (https://pytorch.org/docs/stable/quantization.html) + - general_quant_ops + - dynamic_quant_ops + - static_quant_ops + - fused_quant_ops +- TorchScript builtin ops (https://pytorch.org/docs/stable/jit_builtin_functions.html) + - torchscript_builtin_ops + - torchscript_collection_ops +- torchvision_models (https://pytorch.org/vision/stable/models.html) + - mobilenet_v2 + +The generated models are located at +https://github.com/pytorch/pytorch/tree/master/android/pytorch_android/src/androidTest/assets (Android) +https://github.com/pytorch/pytorch/tree/master/ios/TestApp/models/ (iOS) + +These test models will be executed in Android and iOS simulator tests. Note that we only check whether the model executes without error; we don't check the correctness of the model output. + +## Checked-in models and on-the-fly models +Each test model has a checked-in version and an on-the-fly version. The checked-in versions are stored in this repo (see the model paths above) and will only be updated when necessary. The on-the-fly version will be generated during the simulator test, with a "_temp" suffix, e.g., "reduction_ops_temp.ptl". Do not commit them. + +NOTE: currently the Android simulator test does not generate on-the-fly models; only the iOS test does. + +## Diagnose failed test +If the simulator test is failing, the current change may break a production model, so be careful. The detailed error message can be found in the test log. If the change has to be made, make sure it doesn't break existing production models, and update the failed test model as appropriate (see the next section). + +You can also run these tests locally; see the instructions in the android and ios folders. Remember to generate the on-the-fly test models if you want to test locally (but don't commit these models with the _temp suffix). +``` +python test/mobile/model_test/gen_test_model.py ios-test +``` + +## Update test model +If for any reason a test model needs to be updated, run this script: +``` +python test/mobile/model_test/gen_test_model.py +``` +For example, +``` +python test/mobile/model_test/gen_test_model.py reduction_ops +python test/mobile/model_test/gen_test_model.py mobilenet_v2 +``` + +You can also update all test models for Android and iOS: +``` +python test/mobile/model_test/gen_test_model.py android +python test/mobile/model_test/gen_test_model.py ios +``` + +## Test Coverage +The test coverage is based on the number of root ops tested in these test models. 
The full list of generated ops can be found in: +https://github.com/pytorch/pytorch/blob/master/test/mobile/model_test/coverage.yaml + +In addition, the simulator tests will report the percentage of Meta's production ops that are covered. The list of production ops changes over time, so a Meta employee needs to regularly update it using +``` +python test/mobile/model_test/update_production_ops.py ~/fbsource/xplat/pytorch_models/build/all_mobile_model_configs.yaml +``` diff --git a/test/mobile/model_test/android_api_module.py b/test/mobile/model_test/android_api_module.py new file mode 100644 index 000000000000..109e3aa963e8 --- /dev/null +++ b/test/mobile/model_test/android_api_module.py @@ -0,0 +1,128 @@ +from typing import Dict, List, Tuple, Optional + +import torch +from torch import Tensor + + +class AndroidAPIModule(torch.jit.ScriptModule): + def __init__(self): + super(AndroidAPIModule, self).__init__() + + @torch.jit.script_method + def forward(self, input): + return None + + @torch.jit.script_method + def eqBool(self, input: bool) -> bool: + return input + + @torch.jit.script_method + def eqInt(self, input: int) -> int: + return input + + @torch.jit.script_method + def eqFloat(self, input: float) -> float: + return input + + @torch.jit.script_method + def eqStr(self, input: str) -> str: + return input + + @torch.jit.script_method + def eqTensor(self, input: Tensor) -> Tensor: + return input + + @torch.jit.script_method + def eqDictStrKeyIntValue(self, input: Dict[str, int]) -> Dict[str, int]: + return input + + @torch.jit.script_method + def eqDictIntKeyIntValue(self, input: Dict[int, int]) -> Dict[int, int]: + return input + + @torch.jit.script_method + def eqDictFloatKeyIntValue(self, input: Dict[float, int]) -> Dict[float, int]: + return input + + @torch.jit.script_method + def listIntSumReturnTuple(self, input: List[int]) -> Tuple[List[int], int]: + sum = 0 + for x in input: + sum += x + return (input, sum) + + @torch.jit.script_method + def listBoolConjunction(self, input: List[bool]) -> bool: + res = True + for x in input: + res = res and x + return res + + @torch.jit.script_method + def listBoolDisjunction(self, input: List[bool]) -> bool: + res = False + for x in input: + res = res or x + return res + + @torch.jit.script_method + def tupleIntSumReturnTuple( + self, input: Tuple[int, int, int] + ) -> Tuple[Tuple[int, int, int], int]: + sum = 0 + for x in input: + sum += x + return (input, sum) + + @torch.jit.script_method + def optionalIntIsNone(self, input: Optional[int]) -> bool: + return input is None + + @torch.jit.script_method + def intEq0None(self, input: int) -> Optional[int]: + if input == 0: + return None + return input + + @torch.jit.script_method + def str3Concat(self, input: str) -> str: + return input + input + input + + @torch.jit.script_method + def newEmptyShapeWithItem(self, input): + return torch.tensor([int(input.item())])[0] + + @torch.jit.script_method + def testAliasWithOffset(self) -> List[Tensor]: + x = torch.tensor([100, 200]) + a = [x[0], x[1]] + return a + + @torch.jit.script_method + def testNonContiguous(self): + x = torch.tensor([100, 200, 300])[::2] + assert not x.is_contiguous() + assert x[0] == 100 + assert x[1] == 300 + return x + + @torch.jit.script_method + def conv2d(self, x: Tensor, w: Tensor, toChannelsLast: bool) -> Tensor: + r = torch.nn.functional.conv2d(x, w) + if toChannelsLast: + r = r.contiguous(memory_format=torch.channels_last) + else: + r = r.contiguous() + return r + + @torch.jit.script_method + def 
contiguous(self, x: Tensor) -> Tensor: + return x.contiguous() + + @torch.jit.script_method + def contiguousChannelsLast(self, x: Tensor) -> Tensor: + return x.contiguous(memory_format=torch.channels_last) + + @torch.jit.script_method + def contiguousChannelsLast3d(self, x: Tensor) -> Tensor: + return x.contiguous(memory_format=torch.channels_last_3d) diff --git a/test/mobile/model_test/builtin_ops.py b/test/mobile/model_test/builtin_ops.py new file mode 100644 index 000000000000..75b57f7b0613 --- /dev/null +++ b/test/mobile/model_test/builtin_ops.py @@ -0,0 +1,125 @@ +import torch + + +# https://pytorch.org/docs/stable/jit_builtin_functions.html#builtin-functions + + +class TSBuiltinOpsModule(torch.nn.Module): + def __init__(self): + super(TSBuiltinOpsModule, self).__init__() + + def forward(self): + x = torch.tensor(1) + y = torch.tensor(0.5) + b = float(1) + s = "abcde" + l = ["1", "2", "test", "a{}b"] + d = {"key": 1} + d2 = {0: 100} + return len( + # type + bool(x), + bool(x.item()), + int(y), + int(y.item()), + float(x), + float(x.item()), + # math + x & x, + bool(x) & bool(x), + int(x) & int(x), + x | x, + bool(x) | bool(x), + int(x) | int(x), + x << x, + int(x) << int(x), + x >> x, + int(x) >> int(x), + x ^ x, + bool(x) ^ bool(x), + int(x) ^ int(x), + b * float(x), + b * int(x), + b + float(x), + b - float(x), + x.item() + y.item(), + x.item() - y.item(), + x.item() * y.item(), + x.item() / y.item(), + float(x) < float(y), + float(x) <= float(y), + float(x) > float(y), + float(x) > int(y), + float(x) >= float(y), + float(x) >= int(y), + float(x) == float(y), + float(x) == int(y), + float(x) != float(y), + int(x) != float(y), + float(x) / float(y), + int(x) / int(y), + max(x), + max(x.item(), y.item()), + max(int(x), int(y)), + max(float(x), float(y)), + min(x), + min(x.item(), y.item()), + min(int(x), int(y)), + min(float(x), float(y)), + int(l[0]), + float(l[0]), + # string + str(torch.tensor(1)), + l[2].find("t"), + l[2].replace("t", "x"), + l[2].lower(), + l[2].startswith("t"), + l[2].split("t"), + l[2].strip(), + l[2].rstrip(), + l[2].lstrip(), + l[2][slice(2)], + l[3].format("x"), + ord(l[2][0]), + len(torch.randn(3)), + len(l), + len(l[2]), + len(d), + len(d2), + ) + + +class TSCollectionOpsModule(torch.nn.Module): + def __init__(self): + super(TSCollectionOpsModule, self).__init__() + + def forward(self): + s = "abcde" + # list + l = ["1", "2", "test"] + l.reverse() + l.reverse() + l[1] = "3" + l.extend(["4"]) + # str dict + d = {"key": 1} + d.clear() + d.update({"key": 0}) + if "key" in d: + d["key"] = 2 + # int dict + d2 = {0: 100} + if 0 in d2: + d2.clear() + d2[0] = 100 + + return len( + s[torch.tensor(1)], + d["key"], + d2[0], + d.keys(), + d.items(), + d.values(), + d2.values(), + l.pop(), + ) diff --git a/test/mobile/model_test/coverage.yaml b/test/mobile/model_test/coverage.yaml new file mode 100644 index 000000000000..5433fea4df10 --- /dev/null +++ b/test/mobile/model_test/coverage.yaml @@ -0,0 +1,1094 @@ +_coverage: 87.53 +_covered_ops: 344 +_generated_ops: 693 +_production_ops: 393 +_uncovered_ops: 49 +all_generated_ops: +- aten::Bool.Tensor +- aten::Bool.int +- aten::Float.Scalar +- aten::Float.Tensor +- aten::Float.str +- aten::FloatImplicit +- aten::Int.Scalar +- aten::Int.Tensor +- aten::Int.float +- aten::Int.str +- aten::IntImplicit +- aten::ScalarImplicit +- aten::__and__.Tensor +- aten::__and__.bool +- aten::__and__.int +- aten::__contains__.int +- aten::__contains__.int_list +- aten::__contains__.str +- aten::__contains__.str_list +- aten::__derive_index 
+- aten::__getitem__.str +- aten::__getitem__.t +- aten::__lshift__.Tensor +- aten::__lshift__.int +- aten::__or__.Tensor +- aten::__or__.bool +- aten::__or__.int +- aten::__range_length +- aten::__rshift__.Tensor +- aten::__rshift__.int +- aten::__xor__.Tensor +- aten::__xor__.bool +- aten::__xor__.int +- aten::_infer_size +- aten::_set_item.int +- aten::_set_item.str +- aten::_set_item.t +- aten::_shape_as_tensor +- aten::_unique2 +- aten::abs +- aten::acos +- aten::acosh +- aten::adaptive_avg_pool1d +- aten::adaptive_avg_pool2d +- aten::adaptive_avg_pool3d +- aten::adaptive_max_pool1d +- aten::adaptive_max_pool2d +- aten::adaptive_max_pool3d +- aten::add +- aten::add.Scalar +- aten::add.Tensor +- aten::add.float +- aten::add.int +- aten::add.out +- aten::add.str +- aten::add.t +- aten::add_.Scalar +- aten::add_.Tensor +- aten::add_.t +- aten::addbmm +- aten::addcdiv +- aten::addcmul +- aten::addmm +- aten::addmv +- aten::addr +- aten::all +- aten::allclose +- aten::alpha_dropout +- aten::alpha_dropout_ +- aten::amax +- aten::amin +- aten::aminmax +- aten::angle +- aten::any +- aten::append.t +- aten::arange +- aten::arange.start +- aten::arange.start_step +- aten::argmax +- aten::argmin +- aten::argsort +- aten::as_strided +- aten::as_tensor.list +- aten::asin +- aten::asinh +- aten::atan +- aten::atan2 +- aten::atanh +- aten::atleast_1d +- aten::atleast_2d +- aten::atleast_3d +- aten::avg_pool1d +- aten::avg_pool2d +- aten::avg_pool3d +- aten::baddbmm +- aten::bartlett_window +- aten::batch_norm +- aten::bernoulli +- aten::bernoulli_.float +- aten::bilinear +- aten::binary_cross_entropy +- aten::binary_cross_entropy_with_logits +- aten::bincount +- aten::bitwise_and.Tensor +- aten::bitwise_not +- aten::bitwise_or.Tensor +- aten::bitwise_xor.Tensor +- aten::blackman_window +- aten::block_diag +- aten::bmm +- aten::broadcast_tensors +- aten::broadcast_to +- aten::bucketize.Tensor +- aten::cartesian_prod +- aten::cat +- aten::cauchy_ +- aten::cdist +- aten::ceil +- aten::ceil.Scalar +- aten::ceil.float +- aten::celu +- aten::chain_matmul +- aten::channel_shuffle +- aten::chunk +- aten::clamp +- aten::clamp_ +- aten::clamp_min +- aten::clear.int +- aten::clear.str +- aten::clone +- aten::coalesce +- aten::col2im +- aten::column_stack +- aten::combinations +- aten::complex +- aten::conj +- aten::constant_pad_nd +- aten::contiguous +- aten::conv1d +- aten::conv2d +- aten::conv3d +- aten::conv_transpose1d +- aten::conv_transpose2d.input +- aten::conv_transpose3d.input +- aten::copy_ +- aten::copy_.float +- aten::copy_.int +- aten::copysign.Scalar +- aten::copysign.Tensor +- aten::corrcoef +- aten::cos +- aten::cosh +- aten::cosine_embedding_loss +- aten::cosine_similarity +- aten::count_nonzero +- aten::cpu +- aten::cross +- aten::cross_entropy_loss +- aten::ctc_loss.Tensor +- aten::cummax +- aten::cummin +- aten::cumprod +- aten::cumsum +- aten::cumulative_trapezoid.x +- aten::deg2rad +- aten::dense_dim +- aten::dequantize.self +- aten::detach +- aten::detach_ +- aten::diag +- aten::diag_embed +- aten::diagflat +- aten::diagonal +- aten::diagonal_scatter +- aten::diff +- aten::digamma +- aten::dist +- aten::div +- aten::div.Scalar +- aten::div.Tensor +- aten::div.Tensor_mode +- aten::div.float +- aten::div.int +- aten::div_.Tensor +- aten::dot +- aten::dropout +- aten::dropout_ +- aten::dsplit.array +- aten::dstack +- aten::einsum +- aten::element_size +- aten::elu +- aten::embedding +- aten::embedding_bag.padding_idx +- aten::empty.memory_format +- aten::empty_like +- aten::empty_strided +- 
aten::eq.Scalar +- aten::eq.Tensor +- aten::eq.float +- aten::eq.float_int +- aten::eq.int +- aten::eq.int_list +- aten::eq.str +- aten::equal +- aten::erf +- aten::erfc +- aten::erfinv +- aten::exp +- aten::exp.float +- aten::exp2 +- aten::expand +- aten::expand_as +- aten::expm1 +- aten::exponential_ +- aten::extend.t +- aten::eye +- aten::fake_quantize_per_channel_affine +- aten::fake_quantize_per_tensor_affine +- aten::feature_alpha_dropout +- aten::feature_alpha_dropout_ +- aten::feature_dropout +- aten::feature_dropout_ +- aten::fill_.Scalar +- aten::fill_diagonal_ +- aten::find +- aten::flatten.using_ints +- aten::flip +- aten::fliplr +- aten::flipud +- aten::float_power.Tensor_Scalar +- aten::float_power.Tensor_Tensor +- aten::floor +- aten::floor.float +- aten::floor_divide +- aten::floor_divide.Scalar +- aten::floordiv.int +- aten::fmax +- aten::fmin +- aten::fmod.Scalar +- aten::frac +- aten::fractional_max_pool2d +- aten::fractional_max_pool3d +- aten::frobenius_norm.dim +- aten::frobenius_norm.out +- aten::full +- aten::full_like +- aten::gather +- aten::gcd +- aten::ge.Scalar +- aten::ge.Tensor +- aten::ge.float +- aten::ge.float_int +- aten::ge.int +- aten::gelu +- aten::geometric_ +- aten::glu +- aten::grid_sampler +- aten::group_norm +- aten::gru.input +- aten::gru_cell +- aten::gt.Scalar +- aten::gt.Tensor +- aten::gt.float +- aten::gt.float_int +- aten::gt.int +- aten::hamming_window +- aten::hann_window +- aten::hardshrink +- aten::hardsigmoid +- aten::hardsigmoid_ +- aten::hardswish +- aten::hardswish_ +- aten::hardtanh +- aten::hardtanh_ +- aten::heaviside +- aten::hinge_embedding_loss +- aten::histc +- aten::histogram.bin_ct +- aten::hsplit.array +- aten::hstack +- aten::huber_loss +- aten::hypot +- aten::i0 +- aten::igamma +- aten::igammac +- aten::im2col +- aten::imag +- aten::index.Tensor +- aten::index_fill.int_Scalar +- aten::index_put.hacked_twin +- aten::index_put_.hacked_twin +- aten::index_select +- aten::inner +- aten::instance_norm +- aten::is_coalesced +- aten::is_complex +- aten::is_conj +- aten::is_contiguous +- aten::is_floating_point +- aten::is_leaf +- aten::is_nonzero +- aten::is_pinned +- aten::is_set_to +- aten::is_signed +- aten::isclose +- aten::isfinite +- aten::isin.Tensor_Tensor +- aten::isinf +- aten::isnan +- aten::isneginf +- aten::isposinf +- aten::isreal +- aten::istft +- aten::item +- aten::items.str +- aten::kaiser_window +- aten::keys.str +- aten::kl_div +- aten::kron +- aten::kthvalue +- aten::l1_loss +- aten::layer_norm +- aten::lcm +- aten::ldexp.Tensor +- aten::le.Scalar +- aten::le.Tensor +- aten::le.float +- aten::le.int +- aten::leaky_relu +- aten::leaky_relu_ +- aten::len.Dict_int +- aten::len.Dict_str +- aten::len.Tensor +- aten::len.str +- aten::len.t +- aten::lerp.Scalar +- aten::lerp.Tensor +- aten::lgamma +- aten::linalg_matrix_exp +- aten::linalg_matrix_power +- aten::linear +- aten::linspace +- aten::list.t +- aten::log +- aten::log10 +- aten::log1p +- aten::log2 +- aten::log_normal_ +- aten::log_sigmoid +- aten::log_softmax.int +- aten::logaddexp +- aten::logaddexp2 +- aten::logcumsumexp +- aten::logical_and +- aten::logical_and.out +- aten::logical_not +- aten::logical_not.out +- aten::logical_or +- aten::logical_or.out +- aten::logical_xor +- aten::logical_xor.out +- aten::logit +- aten::logspace +- aten::logsumexp +- aten::lower +- aten::lstm.input +- aten::lstm_cell +- aten::lstrip +- aten::lt.Scalar +- aten::lt.Tensor +- aten::lt.float +- aten::lt.int +- aten::margin_ranking_loss +- aten::masked_fill.Scalar +- 
aten::masked_fill_.Scalar +- aten::masked_select +- aten::matmul +- aten::max +- aten::max.dim +- aten::max.other +- aten::max_pool1d +- aten::max_pool2d +- aten::max_pool3d +- aten::maximum +- aten::mean +- aten::mean.dim +- aten::median +- aten::meshgrid +- aten::meshgrid.indexing +- aten::min +- aten::min.dim +- aten::min.other +- aten::minimum +- aten::mish +- aten::mm +- aten::mode +- aten::movedim.int +- aten::mse_loss +- aten::msort +- aten::mul +- aten::mul.Scalar +- aten::mul.Tensor +- aten::mul.float +- aten::mul.float_int +- aten::mul.int +- aten::mul.int_float +- aten::mul.left_t +- aten::mul.out +- aten::mul_.Scalar +- aten::mul_.Tensor +- aten::multi_margin_loss +- aten::multilabel_margin_loss +- aten::multinomial +- aten::mv +- aten::mvlgamma +- aten::nan_to_num +- aten::nan_to_num_ +- aten::nanmean +- aten::nanmedian +- aten::nanquantile +- aten::nansum +- aten::narrow +- aten::ne.Scalar +- aten::ne.Tensor +- aten::ne.float +- aten::ne.int +- aten::ne.int_float +- aten::ne.int_list +- aten::ne.str +- aten::neg +- aten::neg.int +- aten::new_empty +- aten::new_full +- aten::new_ones +- aten::new_zeros +- aten::nll_loss_nd +- aten::nonzero +- aten::norm.Scalar +- aten::norm.ScalarOpt_dim +- aten::norm.ScalarOpt_dim_dtype +- aten::norm.dtype_out +- aten::norm.out +- aten::normal.float_float +- aten::normal_ +- aten::nuclear_norm +- aten::nuclear_norm.dim +- aten::nuclear_norm.dim_out +- aten::nuclear_norm.out +- aten::numel +- aten::one_hot +- aten::ones +- aten::ones_like +- aten::ord +- aten::outer +- aten::pad_sequence +- aten::pairwise_distance +- aten::pdist +- aten::permute +- aten::pixel_shuffle +- aten::pixel_unshuffle +- aten::poisson +- aten::poisson_nll_loss +- aten::polar +- aten::polygamma +- aten::pop.t +- aten::pow.Tensor_Scalar +- aten::pow.Tensor_Tensor +- aten::pow.int_float +- aten::prelu +- aten::prod +- aten::quantile +- aten::quantile.scalar +- aten::quantize_per_channel +- aten::quantize_per_tensor +- aten::quantize_per_tensor.tensor_qparams +- aten::quantized_gru.input +- aten::quantized_lstm.input +- aten::rad2deg +- aten::rand +- aten::rand_like +- aten::randint +- aten::randint.low +- aten::randint_like +- aten::randn +- aten::randn_like +- aten::random_ +- aten::randperm +- aten::range.step +- aten::ravel +- aten::real +- aten::reciprocal +- aten::reflection_pad1d +- aten::reflection_pad2d +- aten::reflection_pad3d +- aten::relu +- aten::relu_ +- aten::remainder.Scalar +- aten::remainder.int +- aten::renorm +- aten::repeat +- aten::repeat_interleave.Tensor +- aten::replace +- aten::replication_pad1d +- aten::replication_pad2d +- aten::replication_pad3d +- aten::requires_grad_ +- aten::reshape +- aten::resize_as_ +- aten::resolve_conj +- aten::resolve_neg +- aten::reverse.t +- aten::rnn_tanh.input +- aten::rnn_tanh_cell +- aten::roll +- aten::rot90 +- aten::round +- aten::round.Scalar +- aten::rrelu +- aten::rsqrt +- aten::rstrip +- aten::scatter.src +- aten::scatter_.src +- aten::scatter_add +- aten::scatter_add_ +- aten::searchsorted.Tensor +- aten::select.int +- aten::select_scatter +- aten::selu +- aten::sgn +- aten::sigmoid +- aten::sign +- aten::signbit +- aten::silu +- aten::sin +- aten::sinc +- aten::sinh +- aten::size +- aten::size.int +- aten::slice.Tensor +- aten::slice.str +- aten::slice.t +- aten::slice_scatter +- aten::smooth_l1_loss +- aten::soft_margin_loss +- aten::softmax.int +- aten::softplus +- aten::softshrink +- aten::sort +- aten::split +- aten::split.Tensor +- aten::split.str +- aten::sqrt +- aten::sqrt.int +- aten::square +- 
aten::squeeze.dim +- aten::squeeze_.dim +- aten::stack +- aten::startswith +- aten::std +- aten::std_mean +- aten::stft +- aten::str +- aten::strip +- aten::sub +- aten::sub.Scalar +- aten::sub.Tensor +- aten::sub.float +- aten::sub.int +- aten::sub_.Tensor +- aten::sum +- aten::sum.dim_IntList +- aten::sum.int +- aten::t +- aten::take +- aten::take_along_dim +- aten::tan +- aten::tanh +- aten::tensor +- aten::tensor.float +- aten::tensor.int +- aten::tensor_split.indices +- aten::tensor_split.sections +- aten::tensordot +- aten::tensordot.out +- aten::tile +- aten::to.device +- aten::to.dtype +- aten::to.dtype_layout +- aten::to.prim_Device +- aten::topk +- aten::trace +- aten::transpose.int +- aten::trapezoid.x +- aten::trapz.x +- aten::tril +- aten::tril_indices +- aten::triplet_margin_loss +- aten::triu +- aten::triu_indices +- aten::trunc +- aten::trunc_ +- aten::type_as +- aten::unbind.int +- aten::unflatten.int +- aten::unfold +- aten::uniform_ +- aten::unique_consecutive +- aten::unique_dim +- aten::unsqueeze +- aten::unsqueeze_ +- aten::update.str +- aten::upsample_bicubic2d.vec +- aten::upsample_bilinear2d.vec +- aten::upsample_linear1d.vec +- aten::upsample_nearest1d.vec +- aten::upsample_nearest2d.vec +- aten::upsample_nearest3d.vec +- aten::upsample_trilinear3d.vec +- aten::values.int +- aten::values.str +- aten::vander +- aten::var +- aten::var_mean +- aten::vdot +- aten::view +- aten::view_as +- aten::view_as_complex +- aten::view_as_real +- aten::vsplit.array +- aten::vstack +- aten::where +- aten::where.ScalarOther +- aten::where.self +- aten::xlogy.Scalar_Other +- aten::xlogy.Scalar_Self +- aten::xlogy.Tensor +- aten::zeros +- aten::zeros.out +- aten::zeros_like +- prepacked::conv2d_clamp_run +- prepacked::linear_clamp_run +- prim::TupleUnpack +- prim::is_meta +- prim::is_quantized +- prim::is_sparse +- prim::max +- prim::max.float +- prim::max.int +- prim::max.self_int +- prim::min +- prim::min.float +- prim::min.int +- prim::min.self_int +- prim::unchecked_cast +- quantized::add +- quantized::add_relu +- quantized::add_scalar +- quantized::batch_norm2d +- quantized::batch_norm3d +- quantized::cat +- quantized::conv1d +- quantized::conv1d_prepack +- quantized::conv1d_relu +- quantized::conv1d_unpack +- quantized::conv2d.new +- quantized::conv2d_prepack +- quantized::conv2d_relu.new +- quantized::conv2d_unpack +- quantized::conv3d.new +- quantized::conv3d_prepack +- quantized::conv3d_relu.new +- quantized::conv3d_unpack +- quantized::conv_transpose1d +- quantized::conv_transpose1d_prepack +- quantized::conv_transpose1d_unpack +- quantized::conv_transpose2d +- quantized::conv_transpose2d_prepack +- quantized::conv_transpose3d_prepack +- quantized::embedding_4bit +- quantized::embedding_byte +- quantized::hardswish +- quantized::instance_norm +- quantized::leaky_relu +- quantized::linear +- quantized::linear_dynamic +- quantized::linear_dynamic_fp16 +- quantized::linear_relu +- quantized::mul +- quantized::mul_scalar +- quantized::quantized_gru_cell_dynamic +- quantized::quantized_lstm_cell_dynamic +- quantized::quantized_rnn_tanh_cell_dynamic +covered_ops: + aten::Bool.Tensor: 19 + aten::Bool.int: 7 + aten::Float.Scalar: 18 + aten::Float.Tensor: 11 + aten::Float.str: 6 + aten::FloatImplicit: 2 + aten::Int.Scalar: 19 + aten::Int.Tensor: 35 + aten::Int.float: 6 + aten::Int.str: 12 + aten::IntImplicit: 11 + aten::ScalarImplicit: 3 + aten::__and__.Tensor: 13 + aten::__and__.bool: 11 + aten::__and__.int: 2 + aten::__contains__.int: 5 + aten::__contains__.int_list: 17 + 
aten::__contains__.str: 22 + aten::__contains__.str_list: 5 + aten::__derive_index: 24 + aten::__getitem__.str: 20 + aten::__getitem__.t: 178 + aten::__lshift__.int: 2 + aten::__range_length: 23 + aten::__rshift__.int: 2 + aten::__xor__.bool: 10 + aten::_infer_size: 7 + aten::_set_item.int: 7 + aten::_set_item.str: 163 + aten::_set_item.t: 8 + aten::_shape_as_tensor: 10 + aten::adaptive_avg_pool1d: 1 + aten::adaptive_avg_pool2d: 33 + aten::adaptive_avg_pool3d: 1 + aten::add.Scalar: 33 + aten::add.Tensor: 63 + aten::add.float: 5 + aten::add.int: 49 + aten::add.out: 2 + aten::add.str: 29 + aten::add.t: 11 + aten::add_.Scalar: 15 + aten::add_.Tensor: 29 + aten::addcmul: 2 + aten::addmm: 7 + aten::all: 6 + aten::allclose: 1 + aten::any: 14 + aten::append.t: 59 + aten::arange: 16 + aten::arange.start: 6 + aten::arange.start_step: 16 + aten::argmax: 2 + aten::as_strided: 10 + aten::as_tensor.list: 4 + aten::atan: 4 + aten::avg_pool1d: 6 + aten::avg_pool2d: 7 + aten::batch_norm: 15 + aten::binary_cross_entropy: 15 + aten::binary_cross_entropy_with_logits: 3 + aten::bitwise_not: 13 + aten::bmm: 16 + aten::broadcast_tensors: 1 + aten::cat: 90 + aten::ceil: 3 + aten::ceil.float: 7 + aten::chunk: 19 + aten::clamp: 36 + aten::clamp_: 12 + aten::clamp_min: 3 + aten::clear.str: 2 + aten::clone: 26 + aten::coalesce: 2 + aten::conj: 1 + aten::constant_pad_nd: 17 + aten::contiguous: 113 + aten::conv1d: 12 + aten::conv2d: 10 + aten::conv_transpose2d.input: 5 + aten::copy_: 15 + aten::copy_.int: 1 + aten::cos: 4 + aten::count_nonzero: 4 + aten::ctc_loss.Tensor: 1 + aten::cumsum: 13 + aten::dequantize.self: 30 + aten::detach: 34 + aten::div: 9 + aten::div.Scalar: 8 + aten::div.Tensor: 71 + aten::div.Tensor_mode: 7 + aten::div.float: 3 + aten::div.int: 7 + aten::div_.Tensor: 7 + aten::dropout: 41 + aten::embedding: 16 + aten::embedding_bag.padding_idx: 2 + aten::empty.memory_format: 11 + aten::empty_like: 11 + aten::empty_strided: 3 + aten::eq.Scalar: 24 + aten::eq.Tensor: 6 + aten::eq.int: 57 + aten::eq.int_list: 20 + aten::eq.str: 43 + aten::exp: 18 + aten::exp.float: 4 + aten::expand: 26 + aten::expand_as: 3 + aten::extend.t: 38 + aten::feature_dropout: 1 + aten::fill_.Scalar: 17 + aten::find: 3 + aten::flatten.using_ints: 45 + aten::flip: 1 + aten::floor: 5 + aten::floor.float: 2 + aten::floor_divide: 4 + aten::floor_divide.Scalar: 7 + aten::floordiv.int: 21 + aten::full: 10 + aten::full_like: 10 + aten::gather: 10 + aten::ge.Scalar: 4 + aten::ge.Tensor: 6 + aten::ge.int: 29 + aten::gelu: 12 + aten::glu: 18 + aten::grid_sampler: 3 + aten::gt.Scalar: 16 + aten::gt.float: 16 + aten::gt.float_int: 3 + aten::gt.int: 52 + aten::hardsigmoid: 3 + aten::hardsigmoid_: 2 + aten::hardswish_: 4 + aten::hardtanh: 3 + aten::hardtanh_: 3 + aten::hstack: 2 + aten::index.Tensor: 23 + aten::index_fill.int_Scalar: 15 + aten::index_select: 31 + aten::is_coalesced: 2 + aten::is_floating_point: 9 + aten::isnan: 1 + aten::item: 40 + aten::items.str: 3 + aten::keys.str: 15 + aten::layer_norm: 26 + aten::le.Scalar: 1 + aten::le.Tensor: 10 + aten::le.float: 2 + aten::le.int: 17 + aten::leaky_relu: 1 + aten::leaky_relu_: 5 + aten::len.Dict_int: 5 + aten::len.Tensor: 19 + aten::len.str: 23 + aten::len.t: 177 + aten::linear: 46 + aten::linspace: 3 + aten::list.t: 24 + aten::log: 18 + aten::log10: 4 + aten::log1p: 5 + aten::log_softmax.int: 31 + aten::logical_and: 1 + aten::logical_not: 10 + aten::logit: 7 + aten::lower: 10 + aten::lstm.input: 4 + aten::lt.Scalar: 8 + aten::lt.Tensor: 1 + aten::lt.float: 16 + aten::lt.int: 46 + 
aten::masked_fill.Scalar: 16 + aten::matmul: 12 + aten::max: 18 + aten::max.dim: 30 + aten::max.other: 7 + aten::max_pool2d: 10 + aten::maximum: 4 + aten::mean: 10 + aten::mean.dim: 16 + aten::meshgrid.indexing: 2 + aten::min: 2 + aten::min.dim: 4 + aten::min.other: 17 + aten::minimum: 4 + aten::mse_loss: 1 + aten::mul.Scalar: 26 + aten::mul.Tensor: 90 + aten::mul.float: 5 + aten::mul.float_int: 3 + aten::mul.int: 26 + aten::mul.int_float: 4 + aten::mul.left_t: 15 + aten::mul.out: 1 + aten::mul_.Scalar: 11 + aten::mul_.Tensor: 5 + aten::nan_to_num: 3 + aten::nan_to_num_: 10 + aten::narrow: 10 + aten::ne.Scalar: 14 + aten::ne.Tensor: 5 + aten::ne.int: 44 + aten::ne.int_float: 2 + aten::ne.int_list: 20 + aten::ne.str: 3 + aten::neg: 29 + aten::neg.int: 19 + aten::new_zeros: 6 + aten::nll_loss_nd: 3 + aten::nonzero: 4 + aten::norm.Scalar: 1 + aten::norm.ScalarOpt_dim: 4 + aten::numel: 8 + aten::one_hot: 2 + aten::ones: 38 + aten::ones_like: 16 + aten::ord: 20 + aten::permute: 43 + aten::pop.t: 7 + aten::pow.Tensor_Scalar: 3 + aten::pow.int_float: 2 + aten::quantile.scalar: 1 + aten::quantize_per_tensor: 66 + aten::quantize_per_tensor.tensor_qparams: 1 + aten::rand: 25 + aten::randint.low: 2 + aten::randn_like: 17 + aten::reciprocal: 1 + aten::reflection_pad2d: 1 + aten::relu: 82 + aten::relu_: 9 + aten::remainder.Scalar: 2 + aten::remainder.int: 22 + aten::repeat: 16 + aten::replace: 1 + aten::replication_pad1d: 1 + aten::replication_pad2d: 2 + aten::replication_pad3d: 1 + aten::requires_grad_: 4 + aten::reshape: 36 + aten::resize_as_: 1 + aten::resolve_conj: 1 + aten::resolve_neg: 1 + aten::reverse.t: 2 + aten::round.Scalar: 4 + aten::rstrip: 1 + aten::scatter_.src: 6 + aten::scatter_add_: 10 + aten::select.int: 57 + aten::selu: 2 + aten::sigmoid: 93 + aten::sin: 4 + aten::size: 66 + aten::size.int: 66 + aten::slice.Tensor: 75 + aten::slice.str: 12 + aten::slice.t: 43 + aten::softmax.int: 63 + aten::softplus: 2 + aten::sort: 18 + aten::split.str: 10 + aten::sqrt: 1 + aten::squeeze.dim: 26 + aten::stack: 30 + aten::startswith: 10 + aten::str: 16 + aten::strip: 3 + aten::sub: 8 + aten::sub.Scalar: 26 + aten::sub.Tensor: 94 + aten::sub.int: 52 + aten::sub_.Tensor: 4 + aten::sum: 17 + aten::sum.dim_IntList: 19 + aten::sum.int: 1 + aten::t: 3 + aten::tanh: 26 + aten::tensor: 51 + aten::tensor.float: 28 + aten::tensor.int: 34 + aten::tensor_split.indices: 4 + aten::to.device: 11 + aten::to.dtype: 23 + aten::to.dtype_layout: 27 + aten::to.prim_Device: 23 + aten::topk: 10 + aten::transpose.int: 33 + aten::triu: 10 + aten::trunc_: 3 + aten::type_as: 6 + aten::unbind.int: 24 + aten::unique_consecutive: 2 + aten::unsqueeze: 34 + aten::unsqueeze_: 6 + aten::update.str: 4 + aten::upsample_bicubic2d.vec: 1 + aten::upsample_bilinear2d.vec: 8 + aten::upsample_linear1d.vec: 1 + aten::upsample_nearest1d.vec: 2 + aten::upsample_nearest2d.vec: 30 + aten::upsample_nearest3d.vec: 2 + aten::upsample_trilinear3d.vec: 1 + aten::values.int: 3 + aten::view: 61 + aten::vstack: 1 + aten::where.ScalarOther: 4 + aten::where.self: 10 + aten::zeros: 75 + aten::zeros.out: 1 + aten::zeros_like: 7 + prepacked::conv2d_clamp_run: 32 + prepacked::linear_clamp_run: 26 + prim::TupleUnpack: 120 + prim::max.float: 7 + prim::max.int: 14 + prim::max.self_int: 17 + prim::min: 4 + prim::min.int: 35 + prim::min.self_int: 25 + prim::unchecked_cast: 100 + quantized::add: 58 + quantized::add_relu: 1 + quantized::batch_norm2d: 1 + quantized::cat: 4 + quantized::conv1d: 1 + quantized::conv2d.new: 55 + quantized::conv2d_prepack: 14 + 
quantized::conv2d_relu.new: 50 + quantized::conv_transpose2d: 2 + quantized::embedding_4bit: 1 + quantized::embedding_byte: 14 + quantized::hardswish: 1 + quantized::instance_norm: 1 + quantized::leaky_relu: 2 + quantized::linear: 27 + quantized::linear_dynamic: 21 + quantized::linear_dynamic_fp16: 18 + quantized::linear_relu: 2 + quantized::mul: 4 +uncovered_ops: + aten::__getitem__.Dict_int: 4 + aten::__getitem__.Dict_str: 39 + aten::__is__: 83 + aten::__isnot__: 81 + aten::__not__: 32 + aten::_aminmax: 4 + aten::_convolution: 12 + aten::_convolution.deprecated: 3 + aten::_make_per_tensor_quantized_tensor: 2 + aten::_pack_padded_sequence: 10 + aten::_pad_packed_sequence: 10 + aten::_reshape_from_tensor: 10 + aten::backward: 23 + aten::copy_.Tensor: 27 + aten::dequantize.list: 1 + aten::dequantize.tensor: 36 + aten::dim: 36 + aten::format: 58 + aten::get.default_str: 14 + aten::index_put_: 16 + aten::lstm.data: 8 + aten::nll_loss: 1 + aten::nll_loss2d: 1 + aten::quantized_lstm.data: 2 + aten::rsub.Scalar: 5 + aten::sparse_coo_tensor.indices: 1 + aten::sparse_resize_and_clear_: 1 + aten::to.prim_dtype: 38 + aten::true_divide.Tensor: 2 + aten::upsample_nearest2d: 7 + prepacked::conv2d_clamp_prepack: 2 + prepacked::conv2d_transpose_clamp_prepack: 1 + prepacked::conv2d_transpose_clamp_run: 1 + prim::ModuleContainerIndex.list: 2 + prim::NumToTensor.Scalar: 15 + prim::Print: 1 + prim::RaiseException: 103 + prim::TupleIndex: 157 + prim::Uninitialized: 80 + prim::device: 46 + prim::dtype: 45 + prim::is_cuda: 1 + quantized::conv2d: 4 + quantized::conv_prepack: 5 + quantized::linear_prepack: 29 + quantized::linear_prepack_fp16: 25 + quantized::linear_unpack: 4 + quantized::linear_unpack_fp16: 4 + quantized::mul.Scalar: 1 diff --git a/test/mobile/model_test/gen_test_model.py b/test/mobile/model_test/gen_test_model.py new file mode 100644 index 000000000000..e9e3908630be --- /dev/null +++ b/test/mobile/model_test/gen_test_model.py @@ -0,0 +1,243 @@ +import io +import sys +import torch +import yaml +from android_api_module import AndroidAPIModule +from builtin_ops import ( + TSBuiltinOpsModule, + TSCollectionOpsModule, +) +from math_ops import ( + PointwiseOpsModule, + ReductionOpsModule, + ComparisonOpsModule, + OtherMathOpsModule, + SpectralOpsModule, + BlasLapackOpsModule, +) +from nn_ops import ( + NNConvolutionModule, + NNPoolingModule, + NNPaddingModule, + NNNormalizationModule, + NNActivationModule, + NNRecurrentModule, + NNTransformerModule, + NNLinearModule, + NNDropoutModule, + NNSparseModule, + NNDistanceModule, + NNLossFunctionModule, + NNVisionModule, + NNShuffleModule, + NNUtilsModule, +) +from quantization_ops import ( + GeneralQuantModule, + DynamicQuantModule, + StaticQuantModule, + FusedQuantModule, +) +from sampling_ops import SamplingOpsModule +from tensor_ops import ( + TensorOpsModule, + TensorCreationOpsModule, + TensorIndexingOpsModule, + TensorTypingOpsModule, + TensorViewOpsModule, +) +from torch.jit.mobile import _load_for_lite_interpreter +from torchvision_models import MobileNetV2Module + +test_path_ios = "ios/TestApp/models/" +test_path_android = "android/pytorch_android/src/androidTest/assets/" + +production_ops_path = "test/mobile/model_test/model_ops.yaml" +coverage_out_path = "test/mobile/model_test/coverage.yaml" + +all_modules = { + # math ops + "pointwise_ops": PointwiseOpsModule(), + "reduction_ops": ReductionOpsModule(), + "comparison_ops": ComparisonOpsModule(), + "spectral_ops": SpectralOpsModule(), + "other_math_ops": OtherMathOpsModule(), + "blas_lapack_ops": 
BlasLapackOpsModule(), + # sampling + "sampling_ops": SamplingOpsModule(), + # tensor ops + "tensor_general_ops": TensorOpsModule(), + "tensor_creation_ops": TensorCreationOpsModule(), + "tensor_indexing_ops": TensorIndexingOpsModule(), + "tensor_typing_ops": TensorTypingOpsModule(), + "tensor_view_ops": TensorViewOpsModule(), + # nn ops + "convolution_ops": NNConvolutionModule(), + "pooling_ops": NNPoolingModule(), + "padding_ops": NNPaddingModule(), + "activation_ops": NNActivationModule(), + "normalization_ops": NNNormalizationModule(), + "recurrent_ops": NNRecurrentModule(), + "transformer_ops": NNTransformerModule(), + "linear_ops": NNLinearModule(), + "dropout_ops": NNDropoutModule(), + "sparse_ops": NNSparseModule(), + "distance_function_ops": NNDistanceModule(), + "loss_function_ops": NNLossFunctionModule(), + "vision_function_ops": NNVisionModule(), + "shuffle_ops": NNShuffleModule(), + "nn_utils_ops": NNUtilsModule(), + # quantization ops + "general_quant_ops": GeneralQuantModule(), + "dynamic_quant_ops": DynamicQuantModule(), + "static_quant_ops": StaticQuantModule(), + "fused_quant_ops": FusedQuantModule(), + # TorchScript builtin ops + "torchscript_builtin_ops": TSBuiltinOpsModule(), + "torchscript_collection_ops": TSCollectionOpsModule(), + # vision + "mobilenet_v2": MobileNetV2Module(), + # android api module + "android_api_module": AndroidAPIModule(), +} + +models_need_trace = [ + "static_quant_ops", +] + + +def calcOpsCoverage(ops): + with open(production_ops_path) as input_yaml_file: + production_ops_dict = yaml.safe_load(input_yaml_file) + + production_ops = set(production_ops_dict["root_operators"].keys()) + all_generated_ops = set(ops) + covered_ops = production_ops.intersection(all_generated_ops) + uncovered_ops = production_ops - covered_ops + coverage = round(100 * len(covered_ops) / len(production_ops), 2) + + # weighted coverage (take op occurrences into account) + total_occurrences = sum(production_ops_dict["root_operators"].values()) + covered_ops_dict = {op: production_ops_dict["root_operators"][op] for op in covered_ops} + uncovered_ops_dict = {op: production_ops_dict["root_operators"][op] for op in uncovered_ops} + covered_occurrences = sum(covered_ops_dict.values()) + occurrences_coverage = round(100 * covered_occurrences / total_occurrences, 2) + + print(f"\n{len(uncovered_ops)} uncovered ops: {uncovered_ops}\n") + print(f"Generated {len(all_generated_ops)} ops") + print(f"Covered {len(covered_ops)}/{len(production_ops)} ({coverage}%) production ops") + print(f"Covered {covered_occurrences}/{total_occurrences} ({occurrences_coverage}%) occurrences") + print(f"pytorch ver {torch.__version__}\n") + + with open(coverage_out_path, "w") as f: + yaml.safe_dump( + { + "_covered_ops": len(covered_ops), + "_production_ops": len(production_ops), + "_generated_ops": len(all_generated_ops), + "_uncovered_ops": len(uncovered_ops), + "_coverage": round(coverage, 2), + "uncovered_ops": uncovered_ops_dict, + "covered_ops": covered_ops_dict, + "all_generated_ops": sorted(list(all_generated_ops)), + }, + f, + ) + + +def getModuleFromName(model_name): + if model_name not in all_modules: + print("Cannot find test model for " + model_name) + return None, [] + + module = all_modules[model_name] + if not isinstance(module, torch.nn.Module): + module = module.getModule() + + has_bundled_inputs = False # module.find_method("get_all_bundled_inputs") + + if model_name in models_need_trace: + module = torch.jit.trace(module, []) + else: + module = torch.jit.script(module) + + ops = 
torch.jit.export_opnames(module) + print(ops) + + # try to run the model + runModule(module) + + return module, ops + + +def runModule(module): + buffer = io.BytesIO(module._save_to_buffer_for_lite_interpreter()) + buffer.seek(0) + lite_module = _load_for_lite_interpreter(buffer) + if lite_module.find_method("get_all_bundled_inputs"): + # run with the first bundled input + input = lite_module.run_method("get_all_bundled_inputs")[0] + lite_module.forward(*input) + else: + # assuming model has no input + lite_module() + + +# generate all models in the given folder. +# If it's "on the fly" mode, add "_temp" suffix to the model file. +def generateAllModels(folder, on_the_fly=False): + all_ops = [] + for name in all_modules: + module, ops = getModuleFromName(name) + all_ops = all_ops + ops + path = folder + name + ("_temp.ptl" if on_the_fly else ".ptl") + module._save_for_lite_interpreter(path) + print("model saved to " + path) + calcOpsCoverage(all_ops) + + +# generate/update a given model for storage +def generateModel(name): + module, ops = getModuleFromName(name) + if module is None: + return + path_ios = test_path_ios + name + ".ptl" + path_android = test_path_android + name + ".ptl" + module._save_for_lite_interpreter(path_ios) + module._save_for_lite_interpreter(path_android) + print("model saved to " + path_ios + " and " + path_android) + + +def main(argv): + if argv is None or len(argv) != 1: + print( + """ +This script generates models for mobile tests. For each model we have a "storage" version +and an "on-the-fly" version. The "on-the-fly" version will be generated during test, and +should not be committed to the repo. +The "storage" version is for backward compatibility testing (a model generated today should +run on master branch in the next 6 months). We can use this script to update a model that +is no longer supported. 
+- use 'python gen_test_model.py android-test' to generate on-the-fly models for android +- use 'python gen_test_model.py ios-test' to generate on-the-fly models for ios +- use 'python gen_test_model.py android' to generate checked-in models for android +- use 'python gen_test_model.py ios' to generate checked-in models for ios +- use 'python gen_test_model.py <model_name>' to update the given storage model +""" + ) + return + + if argv[0] == "android": + generateAllModels(test_path_android, on_the_fly=False) + elif argv[0] == "ios": + generateAllModels(test_path_ios, on_the_fly=False) + elif argv[0] == "android-test": + generateAllModels(test_path_android, on_the_fly=True) + elif argv[0] == "ios-test": + generateAllModels(test_path_ios, on_the_fly=True) + else: + generateModel(argv[0]) + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/test/mobile/model_test/math_ops.py b/test/mobile/model_test/math_ops.py new file mode 100644 index 000000000000..f89e3bca70d6 --- /dev/null +++ b/test/mobile/model_test/math_ops.py @@ -0,0 +1,469 @@ +# https://pytorch.org/docs/stable/torch.html#math-operations + +import math + +import torch + + +class PointwiseOpsModule(torch.nn.Module): + def __init__(self): + super(PointwiseOpsModule, self).__init__() + + def forward(self): + return self.pointwise_ops() + + def pointwise_ops(self): + a = torch.randn(4) + b = torch.randn(4) + t = torch.tensor([-1, -2, 3], dtype=torch.int8) + r = torch.tensor([0, 1, 10, 0], dtype=torch.int8) + t = torch.tensor([-1, -2, 3], dtype=torch.int8) + s = torch.tensor([4, 0, 1, 0], dtype=torch.int8) + f = torch.zeros(3) + g = torch.tensor([-1, 0, 1]) + w = torch.tensor([0.3810, 1.2774, -0.2972, -0.3719, 0.4637]) + return len( + torch.abs(torch.tensor([-1, -2, 3])), + torch.absolute(torch.tensor([-1, -2, 3])), + torch.acos(a), + torch.arccos(a), + torch.acosh(a.uniform_(1.0, 2.0)), + torch.add(a, 20), + torch.add(a, b, out=a), + b.add(a), + b.add(a, out=b), + b.add_(a), + b.add(1), + torch.add(a, torch.randn(4, 1), alpha=10), + torch.addcdiv( + torch.randn(1, 3), torch.randn(3, 1), torch.randn(1, 3), value=0.1 + ), + torch.addcmul( + torch.randn(1, 3), torch.randn(3, 1), torch.randn(1, 3), value=0.1 + ), + torch.angle(a), + torch.asin(a), + torch.arcsin(a), + torch.asinh(a), + torch.arcsinh(a), + torch.atan(a), + torch.arctan(a), + torch.atanh(a.uniform_(-1.0, 1.0)), + torch.arctanh(a.uniform_(-1.0, 1.0)), + torch.atan2(a, a), + torch.bitwise_not(t), + torch.bitwise_and(t, torch.tensor([1, 0, 3], dtype=torch.int8)), + torch.bitwise_or(t, torch.tensor([1, 0, 3], dtype=torch.int8)), + torch.bitwise_xor(t, torch.tensor([1, 0, 3], dtype=torch.int8)), + torch.ceil(a), + torch.ceil(float(torch.tensor(0.5))), + torch.ceil(torch.tensor(0.5).item()), + torch.clamp(a, min=-0.5, max=0.5), + torch.clamp(a, min=0.5), + torch.clamp(a, max=0.5), + torch.clip(a, min=-0.5, max=0.5), + torch.conj(a), + torch.copysign(a, 1), + torch.copysign(a, b), + torch.cos(a), + torch.cosh(a), + torch.deg2rad( + torch.tensor([[180.0, -180.0], [360.0, -360.0], [90.0, -90.0]]) + ), + torch.div(a, b), + a.div(b), + a.div(1), + a.div_(b), + torch.divide(a, b, rounding_mode="trunc"), + torch.divide(a, b, rounding_mode="floor"), + torch.digamma(torch.tensor([1.0, 0.5])), + torch.erf(torch.tensor([0.0, -1.0, 10.0])), + torch.erfc(torch.tensor([0.0, -1.0, 10.0])), + torch.erfinv(torch.tensor([0.0, 0.5, -1.0])), + torch.exp(torch.tensor([0.0, math.log(2.0)])), + torch.exp(float(torch.tensor(1))), + torch.exp2(torch.tensor([0.0, math.log(2.0), 3.0, 4.0])), + 
torch.expm1(torch.tensor([0.0, math.log(2.0)])), + torch.fake_quantize_per_channel_affine( + torch.randn(2, 2, 2), + (torch.randn(2) + 1) * 0.05, + torch.zeros(2), + 1, + 0, + 255, + ), + torch.fake_quantize_per_tensor_affine(a, 0.1, 0, 0, 255), + torch.float_power(torch.randint(10, (4,)), 2), + torch.float_power(torch.arange(1, 5), torch.tensor([2, -3, 4, -5])), + torch.floor(a), + torch.floor(float(torch.tensor(1))), + torch.floor_divide(torch.tensor([4.0, 3.0]), torch.tensor([2.0, 2.0])), + torch.floor_divide(torch.tensor([4.0, 3.0]), 1.4), + torch.fmod(torch.tensor([-3, -2, -1, 1, 2, 3]), 2), + torch.fmod(torch.tensor([1, 2, 3, 4, 5]), 1.5), + torch.frac(torch.tensor([1.0, 2.5, -3.2])), + torch.randn(4, dtype=torch.cfloat).imag, + torch.ldexp(torch.tensor([1.0]), torch.tensor([1])), + torch.ldexp(torch.tensor([1.0]), torch.tensor([1, 2, 3, 4])), + torch.lerp(torch.arange(1.0, 5.0), torch.empty(4).fill_(10), 0.5), + torch.lerp( + torch.arange(1.0, 5.0), + torch.empty(4).fill_(10), + torch.full_like(torch.arange(1.0, 5.0), 0.5), + ), + torch.lgamma(torch.arange(0.5, 2, 0.5)), + torch.log(torch.arange(5) + 10), + torch.log10(torch.rand(5)), + torch.log1p(torch.randn(5)), + torch.log2(torch.rand(5)), + torch.logaddexp(torch.tensor([-1.0]), torch.tensor([-1, -2, -3])), + torch.logaddexp( + torch.tensor([-100.0, -200.0, -300.0]), torch.tensor([-1, -2, -3]) + ), + torch.logaddexp( + torch.tensor([1.0, 2000.0, 30000.0]), torch.tensor([-1, -2, -3]) + ), + torch.logaddexp2(torch.tensor([-1.0]), torch.tensor([-1, -2, -3])), + torch.logaddexp2( + torch.tensor([-100.0, -200.0, -300.0]), torch.tensor([-1, -2, -3]) + ), + torch.logaddexp2( + torch.tensor([1.0, 2000.0, 30000.0]), torch.tensor([-1, -2, -3]) + ), + torch.logical_and(r, s), + torch.logical_and(r.double(), s.double()), + torch.logical_and(r.double(), s), + torch.logical_and(r, s, out=torch.empty(4, dtype=torch.bool)), + torch.logical_not(torch.tensor([0, 1, -10], dtype=torch.int8)), + torch.logical_not(torch.tensor([0.0, 1.5, -10.0], dtype=torch.double)), + torch.logical_not( + torch.tensor([0.0, 1.0, -10.0], dtype=torch.double), + out=torch.empty(3, dtype=torch.int16), + ), + torch.logical_or(r, s), + torch.logical_or(r.double(), s.double()), + torch.logical_or(r.double(), s), + torch.logical_or(r, s, out=torch.empty(4, dtype=torch.bool)), + torch.logical_xor(r, s), + torch.logical_xor(r.double(), s.double()), + torch.logical_xor(r.double(), s), + torch.logical_xor(r, s, out=torch.empty(4, dtype=torch.bool)), + torch.logit(torch.rand(5), eps=1e-6), + torch.hypot(torch.tensor([4.0]), torch.tensor([3.0, 4.0, 5.0])), + torch.i0(torch.arange(5, dtype=torch.float32)), + torch.igamma(a, b), + torch.igammac(a, b), + torch.mul(torch.randn(3), 100), + b.mul(a), + b.mul(5), + b.mul(a, out=b), + b.mul_(a), + b.mul_(5), + torch.multiply(torch.randn(4, 1), torch.randn(1, 4)), + torch.mvlgamma(torch.empty(2, 3).uniform_(1.0, 2.0), 2), + torch.tensor([float("nan"), float("inf"), -float("inf"), 3.14]), + torch.nan_to_num(w), + torch.nan_to_num_(w), + torch.nan_to_num(w, nan=2.0), + torch.nan_to_num(w, nan=2.0, posinf=1.0), + torch.neg(torch.randn(5)), + # torch.nextafter(torch.tensor([1, 2]), torch.tensor([2, 1])) == torch.tensor([eps + 1, 2 - eps]), + torch.polygamma(1, torch.tensor([1.0, 0.5])), + torch.polygamma(2, torch.tensor([1.0, 0.5])), + torch.polygamma(3, torch.tensor([1.0, 0.5])), + torch.polygamma(4, torch.tensor([1.0, 0.5])), + torch.pow(a, 2), + torch.pow(2, float(torch.tensor(0.5))), + torch.pow(torch.arange(1.0, 5.0), torch.arange(1.0, 
5.0)), + torch.rad2deg( + torch.tensor([[3.142, -3.142], [6.283, -6.283], [1.570, -1.570]]) + ), + torch.randn(4, dtype=torch.cfloat).real, + torch.reciprocal(a), + torch.remainder(torch.tensor([-3.0, -2.0]), 2), + torch.remainder(torch.tensor([1, 2, 3, 4, 5]), 1.5), + torch.round(a), + torch.round(torch.tensor(0.5).item()), + torch.rsqrt(a), + torch.sigmoid(a), + torch.sign(torch.tensor([0.7, -1.2, 0.0, 2.3])), + torch.sgn(a), + torch.signbit(torch.tensor([0.7, -1.2, 0.0, 2.3])), + torch.sin(a), + torch.sinc(a), + torch.sinh(a), + torch.sqrt(a), + torch.square(a), + torch.sub(torch.tensor((1, 2)), torch.tensor((0, 1)), alpha=2), + b.sub(a), + b.sub_(a), + b.sub(5), + torch.sum(5), + torch.tan(a), + torch.tanh(a), + torch.true_divide(a, a), + torch.trunc(a), + torch.trunc_(a), + torch.xlogy(f, g), + torch.xlogy(f, g), + torch.xlogy(f, 4), + torch.xlogy(2, g), + ) + + +class ReductionOpsModule(torch.nn.Module): + def __init__(self): + super(ReductionOpsModule, self).__init__() + + def forward(self): + return self.reduction_ops() + + def reduction_ops(self): + a = torch.randn(4) + b = torch.randn(4) + c = torch.tensor(0.5) + return len( + torch.argmax(a), + torch.argmin(a), + torch.amax(a), + torch.amin(a), + torch.aminmax(a), + torch.all(a), + torch.any(a), + torch.max(a), + a.max(a), + torch.max(a, 0), + torch.min(a), + a.min(a), + torch.min(a, 0), + torch.dist(a, b), + torch.logsumexp(a, 0), + torch.mean(a), + torch.mean(a, 0), + torch.nanmean(a), + torch.median(a), + torch.nanmedian(a), + torch.mode(a), + torch.norm(a), + a.norm(2), + torch.norm(a, dim=0), + torch.norm(c, torch.tensor(2)), + torch.nansum(a), + torch.prod(a), + torch.quantile(a, torch.tensor([0.25, 0.5, 0.75])), + torch.quantile(a, 0.5), + torch.nanquantile(a, torch.tensor([0.25, 0.5, 0.75])), + torch.std(a), + torch.std_mean(a), + torch.sum(a), + torch.unique(a), + torch.unique_consecutive(a), + torch.var(a), + torch.var_mean(a), + torch.count_nonzero(a), + ) + + +class ComparisonOpsModule(torch.nn.Module): + def __init__(self): + super(ComparisonOpsModule, self).__init__() + + def forward(self): + a = torch.tensor(0) + b = torch.tensor(1) + return len( + torch.allclose(a, b), + torch.argsort(a), + torch.eq(a, b), + torch.eq(a, 1), + torch.equal(a, b), + torch.ge(a, b), + torch.ge(a, 1), + torch.greater_equal(a, b), + torch.greater_equal(a, 1), + torch.gt(a, b), + torch.gt(a, 1), + torch.greater(a, b), + torch.isclose(a, b), + torch.isfinite(a), + torch.isin(a, b), + torch.isinf(a), + torch.isposinf(a), + torch.isneginf(a), + torch.isnan(a), + torch.isreal(a), + torch.kthvalue(a, 1), + torch.le(a, b), + torch.le(a, 1), + torch.less_equal(a, b), + torch.lt(a, b), + torch.lt(a, 1), + torch.less(a, b), + torch.maximum(a, b), + torch.minimum(a, b), + torch.fmax(a, b), + torch.fmin(a, b), + torch.ne(a, b), + torch.ne(a, 1), + torch.not_equal(a, b), + torch.sort(a), + torch.topk(a, 1), + torch.msort(a), + ) + + +class OtherMathOpsModule(torch.nn.Module): + def __init__(self): + super(OtherMathOpsModule, self).__init__() + + def forward(self): + return self.other_ops() + + def other_ops(self): + a = torch.randn(4) + b = torch.randn(4) + c = torch.randint(0, 8, (5,), dtype=torch.int64) + e = torch.randn(4, 3) + f = torch.randn(4, 4, 4) + size = [0, 1] + dims = [0, 1] + return len( + torch.atleast_1d(a), + torch.atleast_2d(a), + torch.atleast_3d(a), + torch.bincount(c), + torch.block_diag(a), + torch.broadcast_tensors(a), + torch.broadcast_to(a, (4)), + # torch.broadcast_shapes(a), + torch.bucketize(a, b), + 
torch.cartesian_prod(a), + torch.cdist(e, e), + torch.clone(a), + torch.combinations(a), + torch.corrcoef(a), + # torch.cov(a), + torch.cross(e, e), + torch.cummax(a, 0), + torch.cummin(a, 0), + torch.cumprod(a, 0), + torch.cumsum(a, 0), + torch.diag(a), + torch.diag_embed(a), + torch.diagflat(a), + torch.diagonal(e), + torch.diff(a), + torch.einsum("iii", f), + torch.flatten(a), + torch.flip(e, dims), + torch.fliplr(e), + torch.flipud(e), + torch.kron(a, b), + torch.rot90(e), + torch.gcd(c, c), + torch.histc(a), + torch.histogram(a), + torch.meshgrid(a), + torch.meshgrid(a, indexing="xy"), + torch.lcm(c, c), + torch.logcumsumexp(a, 0), + torch.ravel(a), + torch.renorm(e, 1, 0, 5), + torch.repeat_interleave(c), + torch.roll(a, 1, 0), + torch.searchsorted(a, b), + torch.tensordot(e, e), + torch.trace(e), + torch.tril(e), + torch.tril_indices(3, 3), + torch.triu(e), + torch.triu_indices(3, 3), + torch.vander(a), + torch.view_as_real(torch.randn(4, dtype=torch.cfloat)), + torch.view_as_complex(torch.randn(4, 2)).real, + torch.resolve_conj(a), + torch.resolve_neg(a), + ) + + +class SpectralOpsModule(torch.nn.Module): + def __init__(self): + super(SpectralOpsModule, self).__init__() + + def forward(self): + return self.spectral_ops() + + def spectral_ops(self): + a = torch.randn(10) + b = torch.randn(10, 8, 4, 2) + return len( + torch.stft(a, 8), + torch.stft(a, torch.tensor(8)), + torch.istft(b, 8), + torch.bartlett_window(2, dtype=torch.float), + torch.blackman_window(2, dtype=torch.float), + torch.hamming_window(4, dtype=torch.float), + torch.hann_window(4, dtype=torch.float), + torch.kaiser_window(4, dtype=torch.float), + ) + + +class BlasLapackOpsModule(torch.nn.Module): + def __init__(self): + super(BlasLapackOpsModule, self).__init__() + + def forward(self): + return self.blas_lapack_ops() + + def blas_lapack_ops(self): + m = torch.randn(3, 3) + a = torch.randn(10, 3, 4) + b = torch.randn(10, 4, 3) + v = torch.randn(3) + return len( + torch.addbmm(m, a, b), + torch.addmm(torch.randn(2, 3), torch.randn(2, 3), torch.randn(3, 3)), + torch.addmv(torch.randn(2), torch.randn(2, 3), torch.randn(3)), + torch.addr(torch.zeros(3, 3), v, v), + torch.baddbmm(m, a, b), + torch.bmm(a, b), + torch.chain_matmul(torch.randn(3, 3), torch.randn(3, 3), torch.randn(3, 3)), + # torch.cholesky(a), # deprecated + # torch.cholesky_inverse(torch.randn(3, 3)), # had some error + # torch.cholesky_solve(torch.randn(3, 3), torch.randn(3, 3)), + torch.dot(v, v), + # torch.linalg.eig(m), # not build with lapack + # torch.geqrf(a), + torch.ger(v, v), + torch.inner(m, m), + # torch.inverse(m), + # torch.det(m), + # torch.logdet(m), + # torch.slogdet(m), + # torch.lstsq(m, m), + # torch.lu(m), + # torch.lu_solve(m, *torch.lu(m)), + # torch.lu_unpack(*torch.lu(m)), + torch.matmul(m, m), + torch.matrix_power(m, 2), + # torch.matrix_rank(m), + torch.matrix_exp(m), + torch.mm(m, m), + torch.mv(m, v), + # torch.orgqr(a, m), + # torch.ormqr(a, m, v), + torch.outer(v, v), + # torch.pinverse(m), + # torch.qr(a), + # torch.solve(m, m), + # torch.svd(a), + # torch.svd_lowrank(a), + # torch.pca_lowrank(a), + # torch.symeig(a), # deprecated + # torch.lobpcg(a, b), # not supported + torch.trapz(m, m), + torch.trapezoid(m, m), + torch.cumulative_trapezoid(m, m), + # torch.triangular_solve(m, m), + torch.vdot(v, v), + ) diff --git a/test/mobile/model_test/model_ops.yaml b/test/mobile/model_test/model_ops.yaml new file mode 100644 index 000000000000..06a3640e4cbe --- /dev/null +++ b/test/mobile/model_test/model_ops.yaml @@ -0,0 +1,752 @@ 
+root_operators: + aten::Bool.Tensor: 19 + aten::Bool.int: 7 + aten::Float.Scalar: 18 + aten::Float.Tensor: 11 + aten::Float.str: 6 + aten::FloatImplicit: 2 + aten::Int.Scalar: 19 + aten::Int.Tensor: 35 + aten::Int.float: 6 + aten::Int.str: 12 + aten::IntImplicit: 11 + aten::ScalarImplicit: 3 + aten::__and__.Tensor: 13 + aten::__and__.bool: 11 + aten::__and__.int: 2 + aten::__contains__.int: 5 + aten::__contains__.int_list: 17 + aten::__contains__.str: 22 + aten::__contains__.str_list: 5 + aten::__derive_index: 24 + aten::__getitem__.Dict_int: 4 + aten::__getitem__.Dict_str: 39 + aten::__getitem__.str: 20 + aten::__getitem__.t: 178 + aten::__is__: 83 + aten::__isnot__: 81 + aten::__lshift__.int: 2 + aten::__not__: 32 + aten::__range_length: 23 + aten::__rshift__.int: 2 + aten::__xor__.bool: 10 + aten::_aminmax: 4 + aten::_convolution: 12 + aten::_convolution.deprecated: 3 + aten::_infer_size: 7 + aten::_make_per_tensor_quantized_tensor: 2 + aten::_pack_padded_sequence: 10 + aten::_pad_packed_sequence: 10 + aten::_reshape_from_tensor: 10 + aten::_set_item.int: 7 + aten::_set_item.str: 163 + aten::_set_item.t: 8 + aten::_shape_as_tensor: 10 + aten::adaptive_avg_pool1d: 1 + aten::adaptive_avg_pool2d: 33 + aten::adaptive_avg_pool3d: 1 + aten::add.Scalar: 33 + aten::add.Tensor: 63 + aten::add.float: 5 + aten::add.int: 49 + aten::add.out: 2 + aten::add.str: 29 + aten::add.t: 11 + aten::add_.Scalar: 15 + aten::add_.Tensor: 29 + aten::addcmul: 2 + aten::addmm: 7 + aten::all: 6 + aten::allclose: 1 + aten::any: 14 + aten::append.t: 59 + aten::arange: 16 + aten::arange.start: 6 + aten::arange.start_step: 16 + aten::argmax: 2 + aten::as_strided: 10 + aten::as_tensor.list: 4 + aten::atan: 4 + aten::avg_pool1d: 6 + aten::avg_pool2d: 7 + aten::backward: 23 + aten::batch_norm: 15 + aten::binary_cross_entropy: 15 + aten::binary_cross_entropy_with_logits: 3 + aten::bitwise_not: 13 + aten::bmm: 16 + aten::broadcast_tensors: 1 + aten::cat: 90 + aten::ceil: 3 + aten::ceil.float: 7 + aten::chunk: 19 + aten::clamp: 36 + aten::clamp_: 12 + aten::clamp_min: 3 + aten::clear.str: 2 + aten::clone: 26 + aten::coalesce: 2 + aten::conj: 1 + aten::constant_pad_nd: 17 + aten::contiguous: 113 + aten::conv1d: 12 + aten::conv2d: 10 + aten::conv_transpose2d.input: 5 + aten::copy_: 15 + aten::copy_.Tensor: 27 + aten::copy_.int: 1 + aten::cos: 4 + aten::count_nonzero: 4 + aten::ctc_loss.Tensor: 1 + aten::cumsum: 13 + aten::dequantize.list: 1 + aten::dequantize.self: 30 + aten::dequantize.tensor: 36 + aten::detach: 34 + aten::dim: 36 + aten::div: 9 + aten::div.Scalar: 8 + aten::div.Tensor: 71 + aten::div.Tensor_mode: 7 + aten::div.float: 3 + aten::div.int: 7 + aten::div_.Tensor: 7 + aten::dropout: 41 + aten::embedding: 16 + aten::embedding_bag.padding_idx: 2 + aten::empty.memory_format: 11 + aten::empty_like: 11 + aten::empty_strided: 3 + aten::eq.Scalar: 24 + aten::eq.Tensor: 6 + aten::eq.int: 57 + aten::eq.int_list: 20 + aten::eq.str: 43 + aten::exp: 18 + aten::exp.float: 4 + aten::expand: 26 + aten::expand_as: 3 + aten::extend.t: 38 + aten::feature_dropout: 1 + aten::fill_.Scalar: 17 + aten::find: 3 + aten::flatten.using_ints: 45 + aten::flip: 1 + aten::floor: 5 + aten::floor.float: 2 + aten::floor_divide: 4 + aten::floor_divide.Scalar: 7 + aten::floordiv.int: 21 + aten::format: 58 + aten::full: 10 + aten::full_like: 10 + aten::gather: 10 + aten::ge.Scalar: 4 + aten::ge.Tensor: 6 + aten::ge.int: 29 + aten::gelu: 12 + aten::get.default_str: 14 + aten::glu: 18 + aten::grid_sampler: 3 + aten::gt.Scalar: 16 + aten::gt.float: 16 + 
aten::gt.float_int: 3 + aten::gt.int: 52 + aten::hardsigmoid: 3 + aten::hardsigmoid_: 2 + aten::hardswish_: 4 + aten::hardtanh: 3 + aten::hardtanh_: 3 + aten::hstack: 2 + aten::index.Tensor: 23 + aten::index_fill.int_Scalar: 15 + aten::index_put_: 16 + aten::index_select: 31 + aten::is_coalesced: 2 + aten::is_floating_point: 9 + aten::isnan: 1 + aten::item: 40 + aten::items.str: 3 + aten::keys.str: 15 + aten::layer_norm: 26 + aten::le.Scalar: 1 + aten::le.Tensor: 10 + aten::le.float: 2 + aten::le.int: 17 + aten::leaky_relu: 1 + aten::leaky_relu_: 5 + aten::len.Dict_int: 5 + aten::len.Tensor: 19 + aten::len.str: 23 + aten::len.t: 177 + aten::linear: 46 + aten::linspace: 3 + aten::list.t: 24 + aten::log: 18 + aten::log10: 4 + aten::log1p: 5 + aten::log_softmax.int: 31 + aten::logical_and: 1 + aten::logical_not: 10 + aten::logit: 7 + aten::lower: 10 + aten::lstm.data: 8 + aten::lstm.input: 4 + aten::lt.Scalar: 8 + aten::lt.Tensor: 1 + aten::lt.float: 16 + aten::lt.int: 46 + aten::masked_fill.Scalar: 16 + aten::matmul: 12 + aten::max: 18 + aten::max.dim: 30 + aten::max.other: 7 + aten::max_pool2d: 10 + aten::maximum: 4 + aten::mean: 10 + aten::mean.dim: 16 + aten::meshgrid.indexing: 2 + aten::min: 2 + aten::min.dim: 4 + aten::min.other: 17 + aten::minimum: 4 + aten::mse_loss: 1 + aten::mul.Scalar: 26 + aten::mul.Tensor: 90 + aten::mul.float: 5 + aten::mul.float_int: 3 + aten::mul.int: 26 + aten::mul.int_float: 4 + aten::mul.left_t: 15 + aten::mul.out: 1 + aten::mul_.Scalar: 11 + aten::mul_.Tensor: 5 + aten::nan_to_num: 3 + aten::nan_to_num_: 10 + aten::narrow: 10 + aten::ne.Scalar: 14 + aten::ne.Tensor: 5 + aten::ne.int: 44 + aten::ne.int_float: 2 + aten::ne.int_list: 20 + aten::ne.str: 3 + aten::neg: 29 + aten::neg.int: 19 + aten::new_zeros: 6 + aten::nll_loss: 1 + aten::nll_loss2d: 1 + aten::nll_loss_nd: 3 + aten::nonzero: 4 + aten::norm.Scalar: 1 + aten::norm.ScalarOpt_dim: 4 + aten::numel: 8 + aten::one_hot: 2 + aten::ones: 38 + aten::ones_like: 16 + aten::ord: 20 + aten::permute: 43 + aten::pop.t: 7 + aten::pow.Tensor_Scalar: 3 + aten::pow.int_float: 2 + aten::quantile.scalar: 1 + aten::quantize_per_tensor: 66 + aten::quantize_per_tensor.tensor_qparams: 1 + aten::quantized_lstm.data: 2 + aten::rand: 25 + aten::randint.low: 2 + aten::randn_like: 17 + aten::reciprocal: 1 + aten::reflection_pad2d: 1 + aten::relu: 82 + aten::relu_: 9 + aten::remainder.Scalar: 2 + aten::remainder.int: 22 + aten::repeat: 16 + aten::replace: 1 + aten::replication_pad1d: 1 + aten::replication_pad2d: 2 + aten::replication_pad3d: 1 + aten::requires_grad_: 4 + aten::reshape: 36 + aten::resize_as_: 1 + aten::resolve_conj: 1 + aten::resolve_neg: 1 + aten::reverse.t: 2 + aten::round.Scalar: 4 + aten::rstrip: 1 + aten::rsub.Scalar: 5 + aten::scatter_.src: 6 + aten::scatter_add_: 10 + aten::select.int: 57 + aten::selu: 2 + aten::sigmoid: 93 + aten::sin: 4 + aten::size: 66 + aten::size.int: 66 + aten::slice.Tensor: 75 + aten::slice.str: 12 + aten::slice.t: 43 + aten::softmax.int: 63 + aten::softplus: 2 + aten::sort: 18 + aten::sparse_coo_tensor.indices: 1 + aten::sparse_resize_and_clear_: 1 + aten::split.str: 10 + aten::sqrt: 1 + aten::squeeze.dim: 26 + aten::stack: 30 + aten::startswith: 10 + aten::str: 16 + aten::strip: 3 + aten::sub: 8 + aten::sub.Scalar: 26 + aten::sub.Tensor: 94 + aten::sub.int: 52 + aten::sub_.Tensor: 4 + aten::sum: 17 + aten::sum.dim_IntList: 19 + aten::sum.int: 1 + aten::t: 3 + aten::tanh: 26 + aten::tensor: 51 + aten::tensor.float: 28 + aten::tensor.int: 34 + aten::tensor_split.indices: 4 + 
aten::to.device: 11 + aten::to.dtype: 23 + aten::to.dtype_layout: 27 + aten::to.prim_Device: 23 + aten::to.prim_dtype: 38 + aten::topk: 10 + aten::transpose.int: 33 + aten::triu: 10 + aten::true_divide.Tensor: 2 + aten::trunc_: 3 + aten::type_as: 6 + aten::unbind.int: 24 + aten::unique_consecutive: 2 + aten::unsqueeze: 34 + aten::unsqueeze_: 6 + aten::update.str: 4 + aten::upsample_bicubic2d.vec: 1 + aten::upsample_bilinear2d.vec: 8 + aten::upsample_linear1d.vec: 1 + aten::upsample_nearest1d.vec: 2 + aten::upsample_nearest2d: 7 + aten::upsample_nearest2d.vec: 30 + aten::upsample_nearest3d.vec: 2 + aten::upsample_trilinear3d.vec: 1 + aten::values.int: 3 + aten::view: 61 + aten::vstack: 1 + aten::where.ScalarOther: 4 + aten::where.self: 10 + aten::zeros: 75 + aten::zeros.out: 1 + aten::zeros_like: 7 + prepacked::conv2d_clamp_prepack: 2 + prepacked::conv2d_clamp_run: 32 + prepacked::conv2d_transpose_clamp_prepack: 1 + prepacked::conv2d_transpose_clamp_run: 1 + prepacked::linear_clamp_run: 26 + prim::ModuleContainerIndex.list: 2 + prim::NumToTensor.Scalar: 15 + prim::Print: 1 + prim::RaiseException: 103 + prim::TupleIndex: 157 + prim::TupleUnpack: 120 + prim::Uninitialized: 80 + prim::device: 46 + prim::dtype: 45 + prim::is_cuda: 1 + prim::max.float: 7 + prim::max.int: 14 + prim::max.self_int: 17 + prim::min: 4 + prim::min.int: 35 + prim::min.self_int: 25 + prim::unchecked_cast: 100 + quantized::add: 58 + quantized::add_relu: 1 + quantized::batch_norm2d: 1 + quantized::cat: 4 + quantized::conv1d: 1 + quantized::conv2d: 4 + quantized::conv2d.new: 55 + quantized::conv2d_prepack: 14 + quantized::conv2d_relu.new: 50 + quantized::conv_prepack: 5 + quantized::conv_transpose2d: 2 + quantized::embedding_4bit: 1 + quantized::embedding_byte: 14 + quantized::hardswish: 1 + quantized::instance_norm: 1 + quantized::leaky_relu: 2 + quantized::linear: 27 + quantized::linear_dynamic: 21 + quantized::linear_dynamic_fp16: 18 + quantized::linear_prepack: 29 + quantized::linear_prepack_fp16: 25 + quantized::linear_relu: 2 + quantized::linear_unpack: 4 + quantized::linear_unpack_fp16: 4 + quantized::mul: 4 + quantized::mul.Scalar: 1 +traced_operators: + aten::__and__.Tensor: 13 + aten::__iand__.Tensor: 1 + aten::__ior__.Tensor: 1 + aten::_adaptive_avg_pool2d: 23 + aten::_aminmax: 4 + aten::_batch_norm_impl_index: 15 + aten::_cat: 95 + aten::_coalesce: 2 + aten::_coalesced_: 3 + aten::_convolution: 34 + aten::_convolution.deprecated: 3 + aten::_ctc_loss: 1 + aten::_embedding_bag: 2 + aten::_embedding_bag_backward: 1 + aten::_embedding_bag_sparse_backward: 1 + aten::_empty_affine_quantized: 87 + aten::_empty_per_channel_affine_quantized: 28 + aten::_index_put_impl_: 16 + aten::_indices: 4 + aten::_local_scalar_dense: 188 + aten::_log_softmax: 28 + aten::_log_softmax_backward_data: 4 + aten::_make_per_tensor_quantized_tensor: 2 + aten::_nnz: 3 + aten::_pack_padded_sequence: 10 + aten::_pack_padded_sequence_backward: 3 + aten::_pad_packed_sequence: 10 + aten::_reshape_alias: 93 + aten::_reshape_from_tensor: 10 + aten::_s_where: 15 + aten::_shape_as_tensor: 10 + aten::_slow_conv2d_backward.output_mask: 3 + aten::_slow_conv2d_forward: 33 + aten::_softmax: 63 + aten::_sparse_coo_tensor_unsafe: 4 + aten::_sparse_coo_tensor_with_dims_and_tensors: 5 + aten::_to_copy: 188 + aten::_unsafe_view: 28 + aten::_values: 4 + aten::abs: 1 + aten::abs.out: 1 + aten::adaptive_avg_pool2d: 29 + aten::add.Scalar: 30 + aten::add.Tensor: 72 + aten::add.out: 2 + aten::add_.Scalar: 11 + aten::add_.Tensor: 48 + aten::addmm: 41 + aten::alias: 
14 + aten::all: 8 + aten::allclose: 1 + aten::aminmax: 4 + aten::any: 14 + aten::any.dim: 1 + aten::arange: 10 + aten::arange.start: 26 + aten::arange.start_out: 28 + aten::arange.start_step: 8 + aten::argmax: 2 + aten::as_strided: 188 + aten::as_strided_: 39 + aten::atan: 4 + aten::atleast_1d.Sequence: 2 + aten::atleast_2d.Sequence: 1 + aten::avg_pool2d: 7 + aten::batch_norm: 15 + aten::bernoulli_.float: 2 + aten::binary_cross_entropy: 13 + aten::binary_cross_entropy_backward: 12 + aten::binary_cross_entropy_with_logits: 3 + aten::binary_cross_entropy_with_logits_backward: 2 + aten::bitwise_and.Tensor: 13 + aten::bitwise_and_.Tensor: 1 + aten::bitwise_not: 13 + aten::bitwise_or_.Tensor: 1 + aten::bmm: 18 + aten::broadcast_tensors: 1 + aten::cat: 95 + aten::ceil: 4 + aten::ceil_: 1 + aten::chunk: 20 + aten::clamp: 38 + aten::clamp_: 12 + aten::clamp_min: 73 + aten::clamp_min.out: 74 + aten::clamp_min_: 4 + aten::clone: 134 + aten::coalesce: 2 + aten::conj: 1 + aten::constant_pad_nd: 14 + aten::contiguous: 139 + aten::conv1d: 12 + aten::conv2d: 7 + aten::conv_transpose2d.input: 5 + aten::convolution: 19 + aten::convolution_backward: 3 + aten::copy_: 188 + aten::copy_sparse_to_sparse_: 3 + aten::cos: 4 + aten::count_nonzero: 4 + aten::count_nonzero.dim_IntList: 4 + aten::ctc_loss.Tensor: 1 + aten::cudnn_is_acceptable: 12 + aten::cumsum: 14 + aten::dense_dim: 3 + aten::dequantize.self: 63 + aten::dequantize.tensors: 1 + aten::detach: 49 + aten::div.Scalar: 188 + aten::div.Tensor: 188 + aten::div.Tensor_mode: 8 + aten::div_.Scalar: 27 + aten::div_.Tensor: 34 + aten::dropout: 41 + aten::elu: 2 + aten::embedding: 16 + aten::embedding_backward: 4 + aten::embedding_bag.padding_idx: 2 + aten::embedding_dense_backward: 4 + aten::embedding_sparse_backward: 1 + aten::empty.memory_format: 188 + aten::empty_like: 162 + aten::empty_strided: 188 + aten::eq.Scalar: 25 + aten::eq.Tensor: 188 + aten::exp: 15 + aten::exp_: 3 + aten::expand: 63 + aten::expand_as: 17 + aten::feature_dropout: 1 + aten::fill_.Scalar: 188 + aten::flatten.using_ints: 42 + aten::flip: 1 + aten::floor: 6 + aten::floor_divide: 7 + aten::floor_divide.Scalar: 7 + aten::full: 21 + aten::full_like: 10 + aten::gather: 11 + aten::ge.Scalar: 2 + aten::gelu: 12 + aten::glu: 18 + aten::grid_sampler: 3 + aten::grid_sampler_2d: 3 + aten::gt.Scalar: 16 + aten::hardsigmoid: 3 + aten::hardsigmoid_: 2 + aten::hardswish_: 4 + aten::hardtanh: 3 + aten::hstack: 2 + aten::index.Tensor: 20 + aten::index_add_: 4 + aten::index_fill.int_Scalar: 1 + aten::index_fill_.int_Scalar: 1 + aten::index_put_: 16 + aten::index_select: 28 + aten::index_select_backward: 3 + aten::is_coalesced: 3 + aten::is_floating_point: 8 + aten::isclose: 1 + aten::isfinite: 1 + aten::isnan: 1 + aten::item: 188 + aten::layer_norm: 26 + aten::le.Scalar: 2 + aten::le.Tensor: 1 + aten::leaky_relu: 1 + aten::leaky_relu_: 5 + aten::lerp_.Tensor: 1 + aten::linear: 51 + aten::linspace: 3 + aten::linspace.out: 3 + aten::log: 15 + aten::log10: 4 + aten::log1p: 5 + aten::log_: 3 + aten::log_softmax.int: 28 + aten::logical_and: 1 + aten::logical_and.out: 2 + aten::logical_and_: 1 + aten::logit: 7 + aten::lstm.data: 8 + aten::lstm.input: 4 + aten::lt.Scalar: 8 + aten::lt.Tensor: 1 + aten::masked_fill.Scalar: 3 + aten::masked_fill_.Scalar: 18 + aten::matmul: 31 + aten::max: 27 + aten::max.dim: 31 + aten::max.other: 4 + aten::max_pool2d: 7 + aten::maximum: 4 + aten::mean: 16 + aten::mean.dim: 26 + aten::meshgrid.indexing: 2 + aten::min: 25 + aten::min.dim: 5 + aten::min.other: 4 + aten::minimum: 5 
+ aten::mm: 40 + aten::mul.Scalar: 31 + aten::mul.Tensor: 103 + aten::mul.out: 12 + aten::mul_.Scalar: 11 + aten::mul_.Tensor: 7 + aten::nan_to_num: 3 + aten::nan_to_num.out: 13 + aten::nan_to_num_: 10 + aten::narrow: 188 + aten::native_batch_norm: 15 + aten::native_layer_norm: 26 + aten::native_layer_norm_backward: 1 + aten::ne.Scalar: 15 + aten::ne.Tensor: 6 + aten::neg: 29 + aten::new_empty_strided: 188 + aten::nll_loss: 4 + aten::nll_loss_backward: 4 + aten::nll_loss_forward: 4 + aten::nll_loss_nd: 3 + aten::nonzero: 16 + aten::norm.Scalar: 1 + aten::norm.ScalarOpt_dim: 5 + aten::normal_: 17 + aten::one_hot: 2 + aten::ones: 188 + aten::ones_like: 25 + aten::permute: 44 + aten::pow.Tensor_Scalar: 3 + aten::q_per_channel_scales: 28 + aten::q_per_channel_zero_points: 28 + aten::q_scale: 65 + aten::q_zero_point: 85 + aten::qscheme: 85 + aten::quantile.scalar: 1 + aten::quantize_per_tensor: 84 + aten::quantize_per_tensor.tensor_qparams: 1 + aten::quantized_lstm.data: 2 + aten::quantized_max_pool2d: 3 + aten::rand: 25 + aten::randint.low: 2 + aten::randn_like: 17 + aten::random_.from: 2 + aten::reciprocal: 1 + aten::reflection_pad2d: 1 + aten::relu: 79 + aten::relu_: 4 + aten::remainder.Scalar: 2 + aten::remainder.Tensor: 2 + aten::repeat: 14 + aten::replication_pad2d: 2 + aten::requires_grad_: 2 + aten::reshape: 69 + aten::resize_: 188 + aten::resize_as_: 18 + aten::resolve_conj: 70 + aten::resolve_neg: 1 + aten::result_type.Scalar: 3 + aten::rsub.Scalar: 5 + aten::scalar_tensor: 1 + aten::scatter_.src: 6 + aten::scatter_.value: 2 + aten::scatter_add_: 10 + aten::select.int: 77 + aten::select_backward: 1 + aten::selu: 2 + aten::set_.source_Storage: 186 + aten::set_.source_Storage_storage_offset: 186 + aten::sigmoid: 90 + aten::sigmoid_: 14 + aten::sigmoid_backward: 17 + aten::sin: 4 + aten::slice.Tensor: 188 + aten::slice_backward: 4 + aten::slow_conv_transpose2d: 6 + aten::softmax.int: 63 + aten::softplus: 2 + aten::sort: 20 + aten::sparse_coo_tensor.indices: 1 + aten::sparse_dim: 3 + aten::sparse_resize_and_clear_: 1 + aten::split.Tensor: 20 + aten::sqrt: 1 + aten::squeeze: 13 + aten::squeeze.dim: 38 + aten::squeeze_.dim: 36 + aten::stack: 39 + aten::sub.Scalar: 23 + aten::sub.Tensor: 105 + aten::sub_.Scalar: 1 + aten::sub_.Tensor: 7 + aten::sum: 18 + aten::sum.IntList_out: 29 + aten::sum.dim_IntList: 41 + aten::t: 49 + aten::tanh: 40 + aten::tanh_: 14 + aten::tanh_backward: 5 + aten::tensor_split.indices: 4 + aten::thnn_conv2d: 33 + aten::threshold_backward: 17 + aten::to.device: 35 + aten::to.dtype: 188 + aten::to.dtype_layout: 184 + aten::topk: 10 + aten::transpose.int: 73 + aten::triu: 10 + aten::true_divide.Tensor: 2 + aten::trunc_: 4 + aten::type_as: 6 + aten::unbind.int: 38 + aten::unfold: 14 + aten::uniform_: 25 + aten::unique_consecutive: 2 + aten::unsafe_chunk: 14 + aten::unsafe_split.Tensor: 14 + aten::unsqueeze: 56 + aten::unsqueeze_: 31 + aten::upsample_bilinear2d: 7 + aten::upsample_bilinear2d.vec: 7 + aten::upsample_nearest2d: 31 + aten::upsample_nearest2d.vec: 27 + aten::value_selecting_reduction_backward: 3 + aten::view: 95 + aten::vstack: 1 + aten::where.ScalarOther: 4 + aten::where.self: 15 + aten::zero_: 188 + aten::zeros: 188 + aten::zeros.out: 1 + aten::zeros_like: 6 + prepacked::conv2d_clamp_prepack: 1 + prepacked::conv2d_clamp_run: 32 + prepacked::conv2d_transpose_clamp_run: 1 + prepacked::linear_clamp_run: 26 + quantized::add: 58 + quantized::add_relu: 1 + quantized::batch_norm2d: 1 + quantized::cat: 4 + quantized::conv1d: 1 + quantized::conv2d: 4 + 
quantized::conv2d.new: 55 + quantized::conv2d_prepack: 14 + quantized::conv2d_relu.new: 50 + quantized::conv_prepack: 5 + quantized::conv_transpose2d: 2 + quantized::embedding_byte: 14 + quantized::hardswish: 1 + quantized::instance_norm: 1 + quantized::leaky_relu: 2 + quantized::linear: 27 + quantized::linear_dynamic: 21 + quantized::linear_prepack: 29 + quantized::linear_relu: 2 + quantized::mul: 4 + quantized::mul.Scalar: 1 diff --git a/test/mobile/model_test/nn_ops.py b/test/mobile/model_test/nn_ops.py new file mode 100644 index 000000000000..338359c96408 --- /dev/null +++ b/test/mobile/model_test/nn_ops.py @@ -0,0 +1,427 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +# https://pytorch.org/docs/stable/nn.html +class NNConvolutionModule(torch.nn.Module): + def __init__(self): + super(NNConvolutionModule, self).__init__() + self.input1d = torch.randn(1, 4, 36) + self.input2d = torch.randn(1, 4, 30, 10) + self.input3d = torch.randn(1, 4, 10, 4, 4) + self.module1d = nn.ModuleList( + [ + nn.Conv1d(4, 33, 3), + nn.ConvTranspose1d(4, 33, 3), + nn.Fold(output_size=(5, 10), kernel_size=(2, 2)), + ] + ) + self.module2d = nn.ModuleList( + [ + nn.Conv2d(4, 33, 3), + nn.ConvTranspose2d(4, 33, 3), + nn.Unfold(kernel_size=3), + ] + ) + self.module3d = nn.ModuleList( + [ + nn.Conv3d(4, 33, 2), + nn.ConvTranspose3d(4, 33, 3), + ] + ) + + def forward(self): + return len(( + [module(self.input1d) for i, module in enumerate(self.module1d)], + [module(self.input2d) for i, module in enumerate(self.module2d)], + [module(self.input3d) for i, module in enumerate(self.module3d)], + )) + + +class NNPoolingModule(torch.nn.Module): + def __init__(self): + super(NNPoolingModule, self).__init__() + self.input1d = torch.randn(1, 16, 50) + self.module1d = nn.ModuleList( + [ + nn.MaxPool1d(3, stride=2), + nn.AvgPool1d(3, stride=2), + nn.LPPool1d(2, 3, stride=2), + nn.AdaptiveMaxPool1d(3), + nn.AdaptiveAvgPool1d(3), + ] + ) + + self.input2d = torch.randn(1, 16, 30, 10) + self.module2d = nn.ModuleList( + [ + nn.MaxPool2d((3, 2), stride=(2, 1)), + nn.AvgPool2d((3, 2), stride=(2, 1)), + nn.FractionalMaxPool2d(3, output_ratio=(0.5, 0.5)), + nn.LPPool2d(2, 3, stride=(2, 1)), + nn.AdaptiveMaxPool2d((5, 7)), + nn.AdaptiveAvgPool2d((7)), + ] + ) + + self.input3d = torch.randn(1, 16, 20, 4, 4) + self.module3d = nn.ModuleList( + [ + nn.MaxPool3d(2), + nn.AvgPool3d(2), + nn.FractionalMaxPool3d(2, output_ratio=(0.5, 0.5, 0.5)), + nn.AdaptiveMaxPool3d((5, 7, 9)), + nn.AdaptiveAvgPool3d((5, 7, 9)), + ] + ) + # TODO max_unpool + + def forward(self): + return len(( + [module(self.input1d) for i, module in enumerate(self.module1d)], + [module(self.input2d) for i, module in enumerate(self.module2d)], + [module(self.input3d) for i, module in enumerate(self.module3d)], + )) + + +class NNPaddingModule(torch.nn.Module): + def __init__(self): + super(NNPaddingModule, self).__init__() + self.input1d = torch.randn(1, 4, 50) + self.module1d = nn.ModuleList( + [ + nn.ReflectionPad1d(2), + nn.ReplicationPad1d(2), + nn.ConstantPad1d(2, 3.5), + ] + ) + + self.input2d = torch.randn(1, 4, 30, 10) + self.module2d = nn.ModuleList( + [ + nn.ReflectionPad2d(2), + nn.ReplicationPad2d(2), + nn.ZeroPad2d(2), + nn.ConstantPad2d(2, 3.5), + ] + ) + + self.input3d = torch.randn(1, 4, 10, 4, 4) + self.module3d = nn.ModuleList( + [ + nn.ReflectionPad3d(1), + nn.ReplicationPad3d(3), + nn.ConstantPad3d(3, 3.5), + ] + ) + + def forward(self): + return len(( + [module(self.input1d) for i, module in enumerate(self.module1d)], + 
[module(self.input2d) for i, module in enumerate(self.module2d)], + [module(self.input3d) for i, module in enumerate(self.module3d)], + )) + + +class NNNormalizationModule(torch.nn.Module): + def __init__(self): + super(NNNormalizationModule, self).__init__() + self.input1d = torch.randn(1, 4, 50) + self.module1d = nn.ModuleList( + [ + nn.BatchNorm1d(4), + nn.InstanceNorm1d(4), + ] + ) + + self.input2d = torch.randn(1, 4, 30, 10) + self.module2d = nn.ModuleList( + [ + nn.BatchNorm2d(4), + nn.GroupNorm(4, 4), + nn.InstanceNorm2d(4), + nn.LayerNorm([4, 30, 10]), + nn.LocalResponseNorm(2), + ] + ) + + self.input3d = torch.randn(1, 4, 10, 4, 4) + self.module3d = nn.ModuleList( + [ + nn.BatchNorm3d(4), + nn.InstanceNorm3d(4), + nn.ChannelShuffle(2), + ] + ) + + def forward(self): + return len(( + [module(self.input1d) for i, module in enumerate(self.module1d)], + [module(self.input2d) for i, module in enumerate(self.module2d)], + [module(self.input3d) for i, module in enumerate(self.module3d)], + )) + + +class NNActivationModule(torch.nn.Module): + def __init__(self): + super(NNActivationModule, self).__init__() + self.activations = nn.ModuleList( + [ + nn.ELU(), + nn.Hardshrink(), + nn.Hardsigmoid(), + nn.Hardtanh(), + nn.Hardswish(), + nn.LeakyReLU(), + nn.LogSigmoid(), + # nn.MultiheadAttention(), + nn.PReLU(), + nn.ReLU(), + nn.ReLU6(), + nn.RReLU(), + nn.SELU(), + nn.CELU(), + nn.GELU(), + nn.Sigmoid(), + nn.SiLU(), + nn.Mish(), + nn.Softplus(), + nn.Softshrink(), + nn.Softsign(), + nn.Tanh(), + nn.Tanhshrink(), + # nn.Threshold(0.1, 20), + nn.GLU(), + nn.Softmin(), + nn.Softmax(), + nn.Softmax2d(), + nn.LogSoftmax(), + # nn.AdaptiveLogSoftmaxWithLoss(), + ] + ) + + def forward(self): + input = torch.randn(2, 3, 4) + return len(( + [module(input) for i, module in enumerate(self.activations)], + )) + + +class NNRecurrentModule(torch.nn.Module): + def __init__(self): + super(NNRecurrentModule, self).__init__() + self.rnn = nn.ModuleList( + [ + nn.RNN(4, 8, 2), + nn.RNNCell(4, 8), + ] + ) + self.gru = nn.ModuleList([nn.GRU(4, 8, 2), nn.GRUCell(4, 8)]) + self.lstm = nn.ModuleList( + [ + nn.LSTM(4, 8, 2), + nn.LSTMCell(4, 8), + ] + ) + + def forward(self): + input = torch.randn(5, 3, 4) + h = torch.randn(2, 3, 8) + c = torch.randn(2, 3, 8) + r = self.rnn[0](input, h) + r = self.rnn[1](input[0], h[0]) + r = self.gru[0](input, h) + r = self.gru[1](input[0], h[0]) + r = self.lstm[0](input, (h, c)) + r = self.lstm[1](input[0], (h[0], c[0])) + return len(r) + + +class NNTransformerModule(torch.nn.Module): + def __init__(self): + super(NNTransformerModule, self).__init__() + self.transformers = nn.ModuleList( + [ + nn.Transformer( + d_model=2, nhead=2, num_encoder_layers=1, num_decoder_layers=1 + ), + nn.TransformerEncoder( + nn.TransformerEncoderLayer(d_model=2, nhead=2), num_layers=1 + ), + nn.TransformerDecoder( + nn.TransformerDecoderLayer(d_model=2, nhead=2), num_layers=1 + ), + ] + ) + + def forward(self): + input = torch.rand(1, 16, 2) + tgt = torch.rand((1, 16, 2)) + r = self.transformers[0](input, tgt) + r = self.transformers[1](input) + r = self.transformers[2](input, tgt) + return len(r) + + +class NNLinearModule(torch.nn.Module): + def __init__(self): + super(NNLinearModule, self).__init__() + self.linears = nn.ModuleList( + [ + nn.Identity(54), + nn.Linear(20, 20), + nn.Bilinear(20, 20, 40), + # nn.LazyLinear(20, 30), + ] + ) + + def forward(self): + input = torch.randn(32, 20) + r = self.linears[0](input) + r = self.linears[1](input) + r = self.linears[2](input, input) + return len(r) + 
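Each wrapper module above follows the same pattern: the test inputs are built in __init__ (or inside forward), forward takes no arguments, and the result is collapsed with len() so that scripting the module exercises every covered operator without having to compare numerical outputs. As a rough illustration of how one of these coverage modules might be consumed, the sketch below scripts a module and round-trips it through the lite interpreter, reusing the _save_to_buffer_for_lite_interpreter and _load_for_lite_interpreter calls that appear later in this patch; the driver that generates the checked-in mobile test models is not part of this hunk, so treat this as an assumed usage pattern rather than the actual harness.

import io

import torch
from torch.jit.mobile import _load_for_lite_interpreter

# Script one coverage module and serialize it for the lite interpreter.
scripted = torch.jit.script(NNConvolutionModule())
buffer = io.BytesIO(scripted._save_to_buffer_for_lite_interpreter())
buffer.seek(0)

# Reload on the lite-interpreter side and run both copies; each returns
# the number of sub-module groups (1d/2d/3d) that were executed.
mobile_module = _load_for_lite_interpreter(buffer)
assert scripted() == mobile_module()
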
+ +class NNDropoutModule(torch.nn.Module): + def __init__(self): + super(NNDropoutModule, self).__init__() + + def forward(self): + a = torch.randn(8, 4) + b = torch.randn(8, 4, 4, 4) + c = torch.randn(8, 4, 4, 4, 4) + return len( + F.dropout(a), + F.dropout2d(b), + F.dropout3d(c), + F.alpha_dropout(a), + F.feature_alpha_dropout(c), + ) + + +class NNSparseModule(torch.nn.Module): + def __init__(self): + super(NNSparseModule, self).__init__() + + def forward(self): + input = torch.tensor([[1, 2, 4, 5], [4, 3, 2, 9]]) + input2 = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9]) + embedding_matrix = torch.rand(10, 3) + offsets = torch.tensor([0, 4]) + return len( + F.embedding(input, embedding_matrix), + F.embedding_bag(input2, embedding_matrix, offsets), + F.one_hot(torch.arange(0, 5) % 3, num_classes=5), + ) + + +class NNDistanceModule(torch.nn.Module): + def __init__(self): + super(NNDistanceModule, self).__init__() + + def forward(self): + a = torch.randn(8, 4) + b = torch.randn(8, 4) + return len( + F.pairwise_distance(a, b), + F.cosine_similarity(a, b), + F.pdist(a), + ) + + +class NNLossFunctionModule(torch.nn.Module): + def __init__(self): + super(NNLossFunctionModule, self).__init__() + self.x = torch.FloatTensor([[0.1, 0.2, 0.4, 0.8]]) + self.y = torch.LongTensor([[3, 0, -1, 1]]) + + def forward(self): + a = torch.randn(3, 2) + b = torch.rand(3, 2) + c = torch.rand(3) + log_probs = torch.randn(50, 16, 20).log_softmax(2).detach() + targets = torch.randint(1, 20, (16, 30), dtype=torch.long) + input_lengths = torch.full((16,), 50, dtype=torch.long) + target_lengths = torch.randint(10, 30, (16,), dtype=torch.long) + return len( + F.binary_cross_entropy(torch.sigmoid(a), b), + F.binary_cross_entropy_with_logits(torch.sigmoid(a), b), + F.poisson_nll_loss(a, b), + F.cosine_embedding_loss(a, b, c), + F.cross_entropy(a, b), + F.ctc_loss(log_probs, targets, input_lengths, target_lengths), + # F.gaussian_nll_loss(a, b, torch.ones(5, 1)), # ENTER is not supported in mobile module + F.hinge_embedding_loss(a, b), + F.kl_div(a, b), + F.l1_loss(a, b), + F.mse_loss(a, b), + F.margin_ranking_loss(c, c, c), + F.multilabel_margin_loss(self.x, self.y), + F.multilabel_soft_margin_loss(self.x, self.y), + F.multi_margin_loss(self.x, torch.tensor([3])), + F.nll_loss(a, torch.tensor([1, 0, 1])), + F.huber_loss(a, b), + F.smooth_l1_loss(a, b), + F.soft_margin_loss(a, b), + F.triplet_margin_loss(a, b, -b), + # F.triplet_margin_with_distance_loss(a, b, -b), # can't take variable number of arguments + ) + + +class NNVisionModule(torch.nn.Module): + def __init__(self): + super(NNVisionModule, self).__init__() + self.input = torch.randn(1, 4, 9, 9) + self.vision_modules = nn.ModuleList( + [ + nn.PixelShuffle(2), + nn.PixelUnshuffle(3), + nn.Upsample(scale_factor=2, mode="nearest"), + nn.Upsample(scale_factor=2, mode="bilinear"), + nn.Upsample(scale_factor=2, mode="bicubic"), + nn.UpsamplingNearest2d(scale_factor=2), + nn.UpsamplingBilinear2d(scale_factor=2), + ] + ) + self.linear_sample = nn.Upsample(scale_factor=2, mode="linear") + self.trilinear_sample = nn.Upsample(scale_factor=2, mode="trilinear") + + def forward(self): + input = torch.randn(1, 3, 16, 16) + for i, module in enumerate(self.vision_modules): + r = module(self.input) + return len( + r, + self.linear_sample(torch.randn(4, 9, 9)), + self.trilinear_sample(torch.randn(1, 3, 4, 9, 9)), + F.grid_sample(input, torch.ones(1, 4, 4, 2)), + ) + + +class NNShuffleModule(torch.nn.Module): + def __init__(self): + super(NNShuffleModule, self).__init__() + self.shuffle = 
nn.ChannelShuffle(2) + + def forward(self): + return len(self.shuffle(torch.randn(1, 4, 2, 2)),) + + +class NNUtilsModule(torch.nn.Module): + def __init__(self): + super(NNUtilsModule, self).__init__() + self.flatten = nn.Sequential( + nn.Linear(50, 50), + nn.Unflatten(1, (2, 5, 5)) + ) + + def forward(self): + a = [torch.tensor([1, 2, 3]), torch.tensor([3, 4])] + b = nn.utils.rnn.pad_sequence(a, batch_first=True) + # c = nn.utils.rnn.pack_padded_sequence(b, batch_first=True, lengths=torch.tensor([3, 2])) + input = torch.randn(2, 50) + return len( + self.flatten(input), + b, + ) diff --git a/test/mobile/model_test/quantization_ops.py b/test/mobile/model_test/quantization_ops.py new file mode 100644 index 000000000000..d0fdb346545e --- /dev/null +++ b/test/mobile/model_test/quantization_ops.py @@ -0,0 +1,227 @@ +import torch +import torch.nn as nn + + +class GeneralQuantModule(torch.nn.Module): + def __init__(self): + super(GeneralQuantModule, self).__init__() + self.embedding = torch.nn.quantized.Embedding( + num_embeddings=10, embedding_dim=12 + ) + self.embedding_input = torch.tensor([9, 6, 5, 7, 8, 8, 9, 2, 8]) + self.func = torch.nn.quantized.QFunctional() + self.conv1 = torch.nn.quantized.ConvTranspose1d(16, 33, 3, stride=2) + self.conv2 = torch.nn.quantized.ConvTranspose2d(16, 33, 3, stride=2) + self.conv3 = torch.nn.quantized.ConvTranspose3d(16, 33, 3, stride=2) + + def forward(self): + a = torch.quantize_per_tensor(torch.tensor([3.0]), 1.0, 0, torch.qint32) + b = torch.quantize_per_tensor(torch.tensor(4.0), 1.0, 0, torch.qint32) + c = torch.quantize_per_tensor( + torch.tensor([3.0]), torch.tensor(1.0), torch.tensor(0), torch.qint32 + ) + input1 = torch.randn(1, 16, 4) + input2 = torch.randn(1, 16, 4, 4) + input3 = torch.randn(1, 16, 4, 4, 4) + return len( + self.func.add(a, b), + self.func.cat((a, a), 0), + self.func.mul(a, b), + self.func.add_relu(a, b), + self.func.add_scalar(a, b), + self.func.mul_scalar(a, b), + self.embedding(self.embedding_input), + self.conv1( + torch.quantize_per_tensor( + input1, scale=1.0, zero_point=0, dtype=torch.quint8 + ) + ), + self.conv2( + torch.quantize_per_tensor( + input2, scale=1.0, zero_point=0, dtype=torch.quint8 + ) + ), + c, + # self.conv3(torch.quantize_per_tensor(input3, scale=1.0, zero_point=0, dtype=torch.quint8)), # failed on iOS + ) + + +class DynamicQuantModule: + def __init__(self): + super(DynamicQuantModule, self).__init__() + self.module = self.M() + + def getModule(self): + return torch.quantization.quantize_dynamic(self.module, dtype=torch.qint8) + + class M(torch.nn.Module): + def __init__(self): + super(DynamicQuantModule.M, self).__init__() + self.rnn = nn.RNN(4, 8, 2) + self.rnncell = nn.RNNCell(4, 8) + self.gru = nn.GRU(4, 8, 2) + self.grucell = nn.GRUCell(4, 8) + self.lstm = nn.LSTM(4, 8, 2) + self.lstmcell = nn.LSTMCell(4, 8) + self.linears = nn.ModuleList( + [ + nn.Identity(54), + nn.Linear(20, 20), + nn.Bilinear(20, 20, 40), + ] + ) + self.transformers = nn.ModuleList( + [ + nn.Transformer( + d_model=2, nhead=2, num_encoder_layers=1, num_decoder_layers=1 + ), + nn.TransformerEncoder( + nn.TransformerEncoderLayer(d_model=2, nhead=2), num_layers=1 + ), + nn.TransformerDecoder( + nn.TransformerDecoderLayer(d_model=2, nhead=2), num_layers=1 + ), + ] + ) + # self.a = torch.nn.utils.rnn.pad_sequence([torch.tensor([1,2,3]), torch.tensor([3,4])], batch_first=True) + + def forward(self): + input = torch.randn(5, 3, 4) + h = torch.randn(2, 3, 8) + c = torch.randn(2, 3, 8) + linear_input = torch.randn(32, 20) + trans_input = 
torch.randn(1, 16, 2) + tgt = torch.rand(1, 16, 2) + + return len(( + self.rnn(input, h), + self.rnncell(input[0], h[0]), + self.gru(input, h), + self.grucell(input[0], h[0]), + self.lstm(input, (h, c)), + # self.lstm(torch.nn.utils.rnn.pack_padded_sequence(self.a, lengths=torch.tensor([3,2,1])), (h, c)), + self.lstmcell(input[0], (h[0], c[0])), + self.transformers[0](trans_input, tgt), + self.transformers[1](trans_input), + self.transformers[2](trans_input, tgt), + self.linears[0](linear_input), + self.linears[1](linear_input), + self.linears[2](linear_input, linear_input), + )) + + +class StaticQuantModule: + def __init__(self): + super(StaticQuantModule, self).__init__() + + def getModule(self): + model_fp32 = self.M() + model_fp32.eval() + model_fp32.qconfig = torch.quantization.get_default_qconfig("qnnpack") + model_fp32_prepared = torch.quantization.prepare(model_fp32) + model_int8 = torch.quantization.convert(model_fp32_prepared) + return model_int8 + + class M(torch.nn.Module): + def __init__(self): + super(StaticQuantModule.M, self).__init__() + self.quant = torch.quantization.QuantStub() + self.input1d = torch.randn(4, 2, 2) + self.input2d = torch.randn((4, 2, 4, 4)) + self.input3d = torch.randn(4, 2, 2, 4, 4) + self.linear_input = torch.randn(32, 20) + + self.layer1 = nn.Sequential( + nn.Conv1d(2, 2, 1), nn.InstanceNorm1d(1), nn.Hardswish() + ) + self.layer2 = nn.Sequential( + nn.Conv2d(2, 2, 1), + nn.BatchNorm2d(2), + nn.InstanceNorm2d(1), + nn.LeakyReLU(), + ) + self.layer3 = nn.Sequential( + nn.Conv3d(2, 2, 1), nn.BatchNorm3d(2), nn.InstanceNorm3d(1), nn.ReLU() + ) + self.layer4 = nn.Sequential(nn.Linear(4, 3)) + self.dequant = torch.quantization.DeQuantStub() + + def forward(self): + x = self.quant(self.input1d) + x = self.layer1(x) + x = self.dequant(x) + + y = self.input2d + y = self.quant(y) + y = self.layer2(y) + y = self.layer4(y) + y = self.dequant(y) + + z = self.quant(self.input3d) + z = self.layer3(z) + z = self.dequant(z) + + return (x, y, z) + + +class FusedQuantModule: + def __init__(self): + super(FusedQuantModule, self).__init__() + + def getModule(self): + model_fp32 = self.M() + model_fp32.eval() + model_fp32.qconfig = torch.quantization.get_default_qconfig("qnnpack") + model_fp32_fused = torch.quantization.fuse_modules( + model_fp32, + [ + ["conv1d", "relu1"], + ["conv2d", "relu2"], + ["conv3d", "relu3"], + ["linear", "relu4"], + ], + ) + model_fp32_prepared = torch.quantization.prepare(model_fp32_fused) + model_int8 = torch.quantization.convert(model_fp32_prepared) + return model_int8 + + class M(torch.nn.Module): + def __init__(self): + super(FusedQuantModule.M, self).__init__() + self.quant = torch.quantization.QuantStub() + self.input1d = torch.randn(4, 2, 2) + self.input2d = torch.randn((4, 2, 4, 4)) + self.input3d = torch.randn(4, 2, 2, 4, 4) + self.conv1d = nn.Conv1d(2, 2, 1) + self.conv2d = nn.Conv2d(2, 2, 1) + self.conv3d = nn.Conv3d(2, 2, 1) + self.linear = nn.Linear(4, 2) + self.relu1 = nn.ReLU() + self.relu2 = nn.ReLU() + self.relu3 = nn.ReLU() + self.relu4 = nn.ReLU() + self.dequant = torch.quantization.DeQuantStub() + + def forward(self): + x = self.input1d + y = self.input2d + z = self.input3d + + x = self.quant(x) + x = self.conv1d(x) + x = self.relu1(x) + x = self.dequant(x) + + y = self.quant(y) + y = self.conv2d(y) + y = self.relu2(y) + y = self.dequant(y) + + z = self.quant(z) + z = self.conv3d(z) + z = self.relu3(z) + z = self.linear(z) + z = self.relu4(z) + z = self.dequant(z) + + return (x, y, z) diff --git 
a/test/mobile/model_test/sampling_ops.py b/test/mobile/model_test/sampling_ops.py new file mode 100644 index 000000000000..a1ac71a3a319 --- /dev/null +++ b/test/mobile/model_test/sampling_ops.py @@ -0,0 +1,37 @@ +import torch + + +# https://pytorch.org/docs/stable/torch.html#random-sampling + +class SamplingOpsModule(torch.nn.Module): + def __init__(self): + super(SamplingOpsModule, self).__init__() + + def forward(self): + a = torch.empty(3, 3).uniform_(0.0, 1.0) + size = (1, 4) + weights = torch.tensor([0, 10, 3, 0], dtype=torch.float) + return len( + # torch.seed(), + # torch.manual_seed(0), + torch.bernoulli(a), + # torch.initial_seed(), + torch.multinomial(weights, 2), + torch.normal(2.0, 3.0, size), + torch.poisson(a), + torch.rand(2, 3), + torch.rand_like(a), + torch.randint(10, size), + torch.randint_like(a, 4), + torch.rand(4), + torch.randn_like(a), + torch.randperm(4), + a.bernoulli_(), + a.cauchy_(), + a.exponential_(), + a.geometric_(0.5), + a.log_normal_(), + a.normal_(), + a.random_(), + a.uniform_(), + ) diff --git a/test/mobile/model_test/tensor_ops.py b/test/mobile/model_test/tensor_ops.py new file mode 100644 index 000000000000..9e04c6703d27 --- /dev/null +++ b/test/mobile/model_test/tensor_ops.py @@ -0,0 +1,279 @@ +import torch + + +class TensorOpsModule(torch.nn.Module): + def __init__(self): + super(TensorOpsModule, self).__init__() + + def forward(self): + return self.tensor_general_ops() + + def tensor_general_ops(self): + a = torch.randn(4) + b = torch.tensor([1.5]) + x = torch.ones((2,)) + c = torch.randn(4, dtype=torch.cfloat) + w = torch.rand(4, 4, 4, 4) + v = torch.rand(4, 4, 4, 4) + return len( + # torch.is_tensor(a), + # torch.is_storage(a), + torch.is_complex(a), + torch.is_conj(a), + torch.is_floating_point(a), + torch.is_nonzero(b), + # torch.set_default_dtype(torch.float32), + # torch.get_default_dtype(), + # torch.set_default_tensor_type(torch.DoubleTensor), + torch.numel(a), + # torch.set_printoptions(), + # torch.set_flush_denormal(False), + # https://pytorch.org/docs/stable/tensors.html#tensor-class-reference + # x.new_tensor([[0, 1], [2, 3]]), + x.new_full((3, 4), 3.141592), + x.new_empty((2, 3)), + x.new_ones((2, 3)), + x.new_zeros((2, 3)), + x.is_cuda, + x.is_quantized, + x.is_meta, + x.device, + x.dim(), + c.real, + c.imag, + # x.backward(), + x.clone(), + w.contiguous(), + w.contiguous(memory_format=torch.channels_last), + w.copy_(v), + w.copy_(1), + w.copy_(0.5), + x.cpu(), + # x.cuda(), + # x.data_ptr(), + x.dense_dim(), + w.fill_diagonal_(0), + w.element_size(), + w.exponential_(), + w.fill_(0), + w.geometric_(0.5), + a.index_fill(0, torch.tensor([0, 2]), 1), + a.index_put_([torch.argmax(a)], torch.tensor(1.0)), + a.index_put([torch.argmax(a)], torch.tensor(1.0)), + w.is_contiguous(), + c.is_complex(), + w.is_conj(), + w.is_floating_point(), + w.is_leaf, + w.is_pinned(), + w.is_set_to(w), + # w.is_shared, + w.is_coalesced(), + w.coalesce(), + w.is_signed(), + w.is_sparse, + torch.tensor([1]).item(), + x.log_normal_(), + # x.masked_scatter_(), + # x.masked_scatter(), + # w.normal(), + w.numel(), + # w.pin_memory(), + # w.put_(0, torch.tensor([0, 1], w)), + x.repeat(4, 2), + a.clamp_(0), + a.clamp(0), + a.clamp_min(0), + a.hardsigmoid_(), + a.hardsigmoid(), + a.hardswish_(), + a.hardswish(), + a.hardtanh_(), + a.hardtanh(), + a.leaky_relu_(), + a.leaky_relu(), + a.relu_(), + a.relu(), + a.resize_as_(a), + a.type_as(a), + a._shape_as_tensor(), + a.requires_grad_(False), + ) + + +class TensorCreationOpsModule(torch.nn.Module): + def 
__init__(self): + super(TensorCreationOpsModule, self).__init__() + + def forward(self): + return self.tensor_creation_ops() + + def tensor_creation_ops(self): + i = torch.tensor([[0, 1, 1], [2, 0, 2]]) + v = torch.tensor([3, 4, 5], dtype=torch.float32) + real = torch.tensor([1, 2], dtype=torch.float32) + imag = torch.tensor([3, 4], dtype=torch.float32) + inp = torch.tensor([-1.5, 0.0, 2.0]) + values = torch.tensor([0.5]) + quantized = torch.quantize_per_channel( + torch.tensor([[-1.0, 0.0], [1.0, 2.0]]), + torch.tensor([0.1, 0.01]), + torch.tensor([10, 0]), + 0, + torch.quint8, + ) + return len( + torch.tensor([[0.1, 1.2], [2.2, 3.1], [4.9, 5.2]]), + # torch.sparse_coo_tensor(i, v, [2, 3]), # not work for iOS + torch.as_tensor([1, 2, 3]), + torch.as_strided(torch.randn(3, 3), (2, 2), (1, 2)), + torch.zeros(2, 3), + torch.zeros((2, 3)), + torch.zeros([2, 3], out=i), + torch.zeros(5), + torch.zeros_like(torch.empty(2, 3)), + torch.ones(2, 3), + torch.ones((2, 3)), + torch.ones([2, 3]), + torch.ones(5), + torch.ones_like(torch.empty(2, 3)), + torch.arange(5), + torch.arange(1, 4), + torch.arange(1, 2.5, 0.5), + torch.range(1, 4), + torch.range(1, 4, 0.5), + torch.linspace(3.0, 3.0, steps=1), + torch.logspace(start=2, end=2, steps=1, base=2.0), + torch.eye(3), + torch.empty(2, 3), + torch.empty_like(torch.empty(2, 3), dtype=torch.int64), + torch.empty_strided((2, 3), (1, 2)), + torch.full((2, 3), 3.141592), + torch.full_like(torch.full((2, 3), 3.141592), 2.71828), + torch.quantize_per_tensor( + torch.tensor([-1.0, 0.0, 1.0, 2.0]), 0.1, 10, torch.quint8 + ), + torch.dequantize(quantized), + torch.complex(real, imag), + torch.polar(real, imag), + torch.heaviside(inp, values), + ) + + +class TensorIndexingOpsModule(torch.nn.Module): + def __init__(self): + super(TensorIndexingOpsModule, self).__init__() + + def forward(self): + return self.tensor_indexing_ops() + + def tensor_indexing_ops(self): + x = torch.randn(2, 4) + y = torch.randn(4, 4) + t = torch.tensor([[0, 0], [1, 0]]) + mask = x.ge(0.5) + i = [0, 1] + return len( + torch.cat((x, x, x), 0), + torch.concat((x, x, x), 0), + torch.conj(x), + torch.chunk(x, 2), + torch.dsplit(torch.randn(2, 2, 4), i), + torch.column_stack((x, x)), + torch.dstack((x, x)), + torch.gather(x, 0, t), + torch.hsplit(x, i), + torch.hstack((x, x)), + torch.index_select(x, 0, torch.tensor([0, 1])), + x.index(t), + torch.masked_select(x, mask), + torch.movedim(x, 1, 0), + torch.moveaxis(x, 1, 0), + torch.narrow(x, 0, 0, 2), + torch.nonzero(x), + torch.permute(x, (0, 1)), + torch.reshape(x, (-1,)), + torch.row_stack((x, x)), + torch.select(x, 0, 0), + torch.scatter(x, 0, t, x), + x.scatter(0, t, x.clone()), + torch.diagonal_scatter(y, torch.ones(4)), + torch.select_scatter(y, torch.ones(4), 0, 0), + torch.slice_scatter(x, x), + torch.scatter_add(x, 0, t, x), + x.scatter_(0, t, y), + x.scatter_add_(0, t, y), + # torch.scatter_reduce(x, 0, t, reduce="sum"), + torch.split(x, 1), + torch.squeeze(x, 0), + torch.stack([x, x]), + torch.swapaxes(x, 0, 1), + torch.swapdims(x, 0, 1), + torch.t(x), + torch.take(x, t), + torch.take_along_dim(x, torch.argmax(x)), + torch.tensor_split(x, 1), + torch.tensor_split(x, [0, 1]), + torch.tile(x, (2, 2)), + torch.transpose(x, 0, 1), + torch.unbind(x), + torch.unsqueeze(x, -1), + torch.vsplit(x, i), + torch.vstack((x, x)), + torch.where(x), + torch.where(t > 0, t, 0), + torch.where(t > 0, t, t), + ) + + +class TensorTypingOpsModule(torch.nn.Module): + def __init__(self): + super(TensorTypingOpsModule, self).__init__() + + def 
forward(self): + return self.tensor_typing_ops() + + def tensor_typing_ops(self): + x = torch.randn(1, 3, 4, 4) + return len( + x.to(torch.float), + x.to(torch.double), + x.to(torch.cfloat), + x.to(torch.cdouble), + x.to(torch.half), + x.to(torch.bfloat16), + x.to(torch.uint8), + x.to(torch.int8), + x.to(torch.short), + x.to(torch.int), + x.to(torch.long), + x.to(torch.bool), + x.to(torch.device("cpu")), + x.to(device="cpu", dtype=torch.float), + x.to(memory_format=torch.channels_last), + ) + + +class TensorViewOpsModule(torch.nn.Module): + def __init__(self): + super(TensorViewOpsModule, self).__init__() + + def forward(self): + return self.tensor_view_ops() + + def tensor_view_ops(self): + x = torch.randn(4, 4, 1) + y = torch.randn(4, 4, 2) + return len( + x[0, 2:], + x.detach(), + x.detach_(), + x.diagonal(), + x.expand(-1, -1, 3), + x.expand_as(y), + x.select(0, 1), + x.unflatten(1, (2, 2)), + x.unfold(1, 2, 2), + x.view(16), + x.view_as(torch.randn(16)), + ) diff --git a/test/mobile/model_test/torchvision_models.py b/test/mobile/model_test/torchvision_models.py new file mode 100644 index 000000000000..232afbc54b1e --- /dev/null +++ b/test/mobile/model_test/torchvision_models.py @@ -0,0 +1,24 @@ +import torch +import torchvision +from torch.utils.bundled_inputs import augment_model_with_bundled_inputs +from torch.utils.mobile_optimizer import optimize_for_mobile + + +class MobileNetV2Module: + def __init__(self): + super(MobileNetV2Module, self).__init__() + + def getModule(self): + model = torchvision.models.mobilenet_v2(pretrained=True) + model.eval() + example = torch.zeros(1, 3, 224, 224) + traced_script_module = torch.jit.trace(model, example) + optimized_module = optimize_for_mobile(traced_script_module) + augment_model_with_bundled_inputs( + optimized_module, + [ + (example, ), + ], + ) + optimized_module(example) + return optimized_module diff --git a/test/mobile/model_test/update_production_ops.py b/test/mobile/model_test/update_production_ops.py new file mode 100644 index 000000000000..6bb685e6296d --- /dev/null +++ b/test/mobile/model_test/update_production_ops.py @@ -0,0 +1,35 @@ +""" +This is a script to aggregate production ops from xplat/pytorch_models/build/all_mobile_model_configs.yaml. +Specify the file path in the first argument. The results will be dump to model_ops.yaml +""" + +import sys +import yaml + +root_operators = {} +traced_operators = {} +kernel_metadata = {} + +with open(sys.argv[1]) as input_yaml_file: + model_infos = yaml.safe_load(input_yaml_file) + for info in model_infos: + for op in info["root_operators"]: + # aggregate occurance per op + root_operators[op] = 1 + (root_operators[op] if op in root_operators else 0) + for op in info["traced_operators"]: + # aggregate occurance per op + traced_operators[op] = 1 + (traced_operators[op] if op in traced_operators else 0) + # merge dtypes for each kernel + for kernal, dtypes in info["kernel_metadata"].items(): + new_dtypes = dtypes + (kernel_metadata[kernal] if kernal in kernel_metadata else []) + kernel_metadata[kernal] = list(set(new_dtypes)) + + +# Only test these built-in ops. No custom ops or non-CPU ops. 
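The all_mobile_model_configs.yaml referenced in the docstring lives outside this repository, so its contents are not part of the patch; the hedged example below only illustrates the entry shape the aggregation loop above expects and the per-model occurrence counts it produces (model entries and op names are invented for illustration). The namespace filter and YAML dump that follow then reduce the result to the built-in ops recorded in model_ops.yaml.

# Hypothetical input entries in the shape consumed by the loop above:
example_model_infos = [
    {
        "root_operators": ["aten::add.Tensor", "aten::mm"],
        "traced_operators": ["aten::add.Tensor", "prim::TupleConstruct"],
        "kernel_metadata": {"add_kernel": ["Float"]},
    },
    {
        "root_operators": ["aten::add.Tensor", "custom::my_op"],
        "traced_operators": ["aten::add.Tensor"],
        "kernel_metadata": {"add_kernel": ["Int"]},
    },
]
# Aggregating these two entries would yield
#   root_operators == {"aten::add.Tensor": 2, "aten::mm": 1, "custom::my_op": 1}
#   kernel_metadata == {"add_kernel": ["Float", "Int"]}
# and the namespace filter below drops "custom::my_op" before the dump.
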
+namespaces = ["aten", "prepacked", "prim", "quantized"] +root_operators = {x: root_operators[x] for x in root_operators if x.split("::")[0] in namespaces} +traced_operators = {x: traced_operators[x] for x in traced_operators if x.split("::")[0] in namespaces} + +out_path = "test/mobile/model_test/model_ops.yaml" +with open(out_path, "w") as f: + yaml.safe_dump({"root_operators": root_operators}, f) diff --git a/test/mobile/nnc/test_nnc_backend.cpp b/test/mobile/nnc/test_nnc_backend.cpp index f7adcb62459f..35bf60f2cca7 100644 --- a/test/mobile/nnc/test_nnc_backend.cpp +++ b/test/mobile/nnc/test_nnc_backend.cpp @@ -23,7 +23,9 @@ c10::Dict create_compile_spec( const std::string& method_name, const std::string& model_name, const std::string& input_shapes, - const std::string& input_types) { + const std::string& input_types, + const std::string& memory_formats, + const std::string& dynamic_sizes) { c10::Dict method_spec( c10::StringType::get(), c10::AnyType::get()); @@ -33,6 +35,8 @@ c10::Dict create_compile_spec( method_spec.insert("model_version", "v1"); method_spec.insert("asmfile", "fake_nnc_model.s"); method_spec.insert("arch", "x86-64"); + method_spec.insert("memory_formats", memory_formats); + method_spec.insert("dynamic_sizes", dynamic_sizes); c10::Dict compile_spec( c10::StringType::get(), c10::AnyType::get()); @@ -63,7 +67,7 @@ REGISTER_NNC_KERNEL( TEST(NNCBackendTest, AOTCompileThenExecute) { torch::jit::Module m("m"); - auto param = torch::ones({}); + auto param = torch::ones({1}); m.register_parameter("param", param, false); m.define(R"( def forward(self, input): @@ -77,7 +81,7 @@ TEST(NNCBackendTest, AOTCompileThenExecute) { // Compile the model with NNC. auto compile_spec = create_compile_spec( - "forward", "_add_kernel_nnc_fake_model", "4,4", "float"); + "forward", "_add_kernel_nnc_fake_model", "4,4", "float", "", ""); auto any_dict_ty = c10::DictType::create(c10::StringType::get(), c10::AnyType::get()); auto frozen_m = torch::jit::freeze_module(m.clone()); diff --git a/test/mobile/test_lite_script_module.py b/test/mobile/test_lite_script_module.py index 90abdab4ceea..638ac37eb88b 100644 --- a/test/mobile/test_lite_script_module.py +++ b/test/mobile/test_lite_script_module.py @@ -522,6 +522,49 @@ def forward(self, x): input = torch.randn(4, 1, 4, 4) self._compare_script_and_mobile(model=model_int8, input=input) + def test_bundled_input_with_dynamic_type(self): + class Model(torch.nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward( + self, + x: Dict[int, torch.Tensor], + y: Dict[int, torch.Tensor], + z: Dict[int, torch.Tensor], + ): + return x + + model = Model() + script_module = torch.jit.script(model) + + sample_input = { + script_module.forward: [ + ( + {0: torch.ones(1)}, + {1: torch.ones(1)}, + {2: torch.ones(1)}, + ) + ] + } + + bundled_model = torch.utils.bundled_inputs.bundle_inputs( + script_module, sample_input + ) + + buf = bundled_model._save_to_buffer_for_lite_interpreter() + mobile_module = _load_for_lite_interpreter(io.BytesIO(buf)) + + i = mobile_module.run_method("get_all_bundled_inputs") + + self.assertEqual( + i[0], + ( + {0: torch.ones(1)}, + {1: torch.ones(1)}, + {2: torch.ones(1)}, + ), + ) if __name__ == '__main__': run_tests() diff --git a/test/mobile/test_lite_script_type.py b/test/mobile/test_lite_script_type.py index 53db16ce9031..67d6ac859683 100644 --- a/test/mobile/test_lite_script_type.py +++ b/test/mobile/test_lite_script_type.py @@ -3,7 +3,7 @@ import torch import torch.utils.bundled_inputs import io -from typing 
import List, NamedTuple +from typing import Dict, List, NamedTuple from torch.jit.mobile import _load_for_lite_interpreter from torch.testing._internal.common_utils import TestCase, run_tests @@ -33,6 +33,69 @@ def forward(self, a: torch.Tensor): mobile_module_result ) + + def test_typing_dict_with_namedtuple(self): + class Foo(NamedTuple): + id: torch.Tensor + + class Bar(torch.nn.Module): + def __init__(self): + super(Bar, self).__init__() + self.foo = Foo(torch.tensor(1)) + + def forward(self, a: torch.Tensor): + self.foo = Foo(a) + re: Dict[str, Foo] = dict() + re["test"] = Foo(a) + return self.foo, re["test"] + + # The corresponding bytecode is + # (8, + # ('__torch__.___torch_mangle_2.Bar.forward', + # (('instructions', + # (('STOREN', 1, 2), + # ('DROPR', 1, 0), + # ('DICT_CONSTRUCT', 0, 0), + # ('STORE', 3, 0), + # ('LOAD', 3, 0), + # ('LOADC', 1, 0), + # ('MOVE', 2, 0), + # ('NAMED_TUPLE_CONSTRUCT', 1, 1), + # ('OP', 0, 0), + # ('MOVE', 3, 0), + # ('LOADC', 1, 0), + # ('DICT_INDEX', 0, 0), + # ('LOADC', 0, 0), + # ('TUPLE_INDEX', 0, 0), + # ('RET', 0, 0))), + # ('operators', (('aten::_set_item', 'str', 3),)), + # ('constants', (0, 'test')), + # ('types', + # ('Dict[str,__torch__.Foo[NamedTuple, [[id, Tensor]]]]', + # '__torch__.Foo[NamedTuple, [[id, Tensor]]]')), + # ('register_size', 3)), + # (('arguments', + # ((('name', 'self'), + # ('type', '__torch__.___torch_mangle_2.Bar'), + # ('default_value', None)), + # (('name', 'a'), ('type', 'Tensor'), ('default_value', None)))), + # ('returns', + # ((('name', ''), ('type', 'Tensor'), ('default_value', None)),))))) + + sample_input = torch.tensor(5) + script_module = torch.jit.script(Bar()) + + script_module_result = script_module(sample_input) + + buffer_mobile = io.BytesIO(script_module._save_to_buffer_for_lite_interpreter()) + buffer_mobile.seek(0) + mobile_module = _load_for_lite_interpreter(buffer_mobile) + mobile_module_result = mobile_module(sample_input) + torch.testing.assert_allclose( + script_module_result, + mobile_module_result + ) + def test_typing_namedtuple_custom_classtype(self): class Foo(NamedTuple): id: torch.Tensor diff --git a/test/mobile/test_upgrader_codegen.py b/test/mobile/test_upgrader_codegen.py index 5a09ad8a877d..5ccf9a020a5b 100644 --- a/test/mobile/test_upgrader_codegen.py +++ b/test/mobile/test_upgrader_codegen.py @@ -2,7 +2,7 @@ from torch.testing._internal.common_utils import TestCase, run_tests -from tools.codegen.operator_versions.gen_mobile_upgraders import ( +from torchgen.operator_versions.gen_mobile_upgraders import ( sort_upgrader, write_cpp, ) diff --git a/test/onnx/autograd_helper.py b/test/onnx/autograd_helper.py new file mode 100644 index 000000000000..4a3a3eca3844 --- /dev/null +++ b/test/onnx/autograd_helper.py @@ -0,0 +1,19 @@ +# Owner(s): ["module: onnx"] + +import torch + + +# Autograd funtion that is a replica of the autograd funtion in +# test_utility_funs.py (test_autograd_module_name) +class CustomFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, input): + ctx.save_for_backward(input) + return input.clamp(min=0) + + @staticmethod + def backward(ctx, grad_output): + (input,) = ctx.saved_tensors + grad_input = grad_output.clone() + grad_input[input < 0] = 0 + return grad_input diff --git a/test/onnx/debug_embed_params.py b/test/onnx/debug_embed_params.py index 8499a1d8d216..7fe40a5906dc 100644 --- a/test/onnx/debug_embed_params.py +++ b/test/onnx/debug_embed_params.py @@ -1,13 +1,12 @@ import sys -import torch -import torch.jit -from torch.autograd import 
Variable - import onnx -import caffe2.python.onnx.backend as c2 from test_pytorch_common import flatten +import caffe2.python.onnx.backend as c2 +import torch +import torch.jit +from torch.autograd import Variable torch.set_default_tensor_type("torch.FloatTensor") try: diff --git a/test/onnx/expect/TestOperators.test_acos.expect b/test/onnx/expect/TestOperators.test_acos.expect index 0d978b1e3687..40fc61e29b7f 100644 --- a/test/onnx/expect/TestOperators.test_acos.expect +++ b/test/onnx/expect/TestOperators.test_acos.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -8,7 +8,7 @@ graph { name: "Acos_0" op_type: "Acos" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Acos_0" type { @@ -43,5 +43,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_add_broadcast.expect b/test/onnx/expect/TestOperators.test_add_broadcast.expect index 455df1b03a27..569b2400df88 100644 --- a/test/onnx/expect/TestOperators.test_add_broadcast.expect +++ b/test/onnx/expect/TestOperators.test_add_broadcast.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -9,7 +9,7 @@ graph { name: "Add_0" op_type: "Add" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Add_0" type { @@ -57,5 +57,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_add_left_broadcast.expect b/test/onnx/expect/TestOperators.test_add_left_broadcast.expect index c1dd2341ba8f..ffa632ca475b 100644 --- a/test/onnx/expect/TestOperators.test_add_left_broadcast.expect +++ b/test/onnx/expect/TestOperators.test_add_left_broadcast.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -9,7 +9,7 @@ graph { name: "Add_0" op_type: "Add" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Add_0" type { @@ -57,5 +57,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_add_size1_broadcast.expect b/test/onnx/expect/TestOperators.test_add_size1_broadcast.expect index 9219269bc5e2..9917880a8a22 100644 --- a/test/onnx/expect/TestOperators.test_add_size1_broadcast.expect +++ b/test/onnx/expect/TestOperators.test_add_size1_broadcast.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -9,7 +9,7 @@ graph { name: "Add_0" op_type: "Add" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Add_0" type { @@ -60,5 +60,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_add_size1_right_broadcast.expect b/test/onnx/expect/TestOperators.test_add_size1_right_broadcast.expect index 455df1b03a27..569b2400df88 100644 --- a/test/onnx/expect/TestOperators.test_add_size1_right_broadcast.expect +++ b/test/onnx/expect/TestOperators.test_add_size1_right_broadcast.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -9,7 +9,7 @@ graph { name: "Add_0" op_type: "Add" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Add_0" type { @@ -57,5 +57,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_add_size1_singleton_broadcast.expect 
b/test/onnx/expect/TestOperators.test_add_size1_singleton_broadcast.expect index 45cd1d21faf4..96d2dca59325 100644 --- a/test/onnx/expect/TestOperators.test_add_size1_singleton_broadcast.expect +++ b/test/onnx/expect/TestOperators.test_add_size1_singleton_broadcast.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -9,7 +9,7 @@ graph { name: "Add_0" op_type: "Add" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Add_0" type { @@ -60,5 +60,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_addconstant.expect b/test/onnx/expect/TestOperators.test_addconstant.expect index 684f08eaf9be..0e1570eb62da 100644 --- a/test/onnx/expect/TestOperators.test_addconstant.expect +++ b/test/onnx/expect/TestOperators.test_addconstant.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -22,7 +22,7 @@ graph { name: "Add_1" op_type: "Add" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Add_0" type { @@ -57,5 +57,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_addmm.expect b/test/onnx/expect/TestOperators.test_addmm.expect index f700dcd41ed3..1ef0a81e2a90 100644 --- a/test/onnx/expect/TestOperators.test_addmm.expect +++ b/test/onnx/expect/TestOperators.test_addmm.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -38,7 +38,7 @@ graph { type: FLOAT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Gemm_0" type { @@ -102,5 +102,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_arange_dynamic.expect b/test/onnx/expect/TestOperators.test_arange_dynamic.expect index 6de30ee46c49..09d75955ca26 100644 --- a/test/onnx/expect/TestOperators.test_arange_dynamic.expect +++ b/test/onnx/expect/TestOperators.test_arange_dynamic.expect @@ -16,7 +16,7 @@ graph { type: TENSOR } } - name: "torch-jit-export" + name: "torch_jit" output { name: "1" type { diff --git a/test/onnx/expect/TestOperators.test_argmax.expect b/test/onnx/expect/TestOperators.test_argmax.expect index 36822ef07ff7..38add716ff36 100644 --- a/test/onnx/expect/TestOperators.test_argmax.expect +++ b/test/onnx/expect/TestOperators.test_argmax.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -17,8 +17,13 @@ graph { i: 0 type: INT } + attribute { + name: "select_last_index" + i: 0 + type: INT + } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::ArgMax_0" type { @@ -50,5 +55,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_asin.expect b/test/onnx/expect/TestOperators.test_asin.expect index aa584d51c854..f5a44b850eb1 100644 --- a/test/onnx/expect/TestOperators.test_asin.expect +++ b/test/onnx/expect/TestOperators.test_asin.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -8,7 +8,7 @@ graph { name: "Asin_0" op_type: "Asin" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Asin_0" type { @@ -43,5 +43,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_at_op.expect 
b/test/onnx/expect/TestOperators.test_at_op.expect index a0f13b754247..8890f6535756 100644 --- a/test/onnx/expect/TestOperators.test_at_op.expect +++ b/test/onnx/expect/TestOperators.test_at_op.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -13,8 +13,14 @@ graph { s: "add" type: STRING } + attribute { + name: "overload_name" + s: "" + type: STRING + } + domain: "org.pytorch.aten" } - name: "torch-jit-export" + name: "torch_jit" input { name: "x" type { @@ -49,5 +55,9 @@ graph { } } opset_import { - version: 9 + version: 13 +} +opset_import { + domain: "org.pytorch.aten" + version: 1 } diff --git a/test/onnx/expect/TestOperators.test_atan.expect b/test/onnx/expect/TestOperators.test_atan.expect index 72ff4ba536bc..c8d189e1415e 100644 --- a/test/onnx/expect/TestOperators.test_atan.expect +++ b/test/onnx/expect/TestOperators.test_atan.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -8,7 +8,7 @@ graph { name: "Atan_0" op_type: "Atan" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Atan_0" type { @@ -43,5 +43,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_aten_embedding_1.expect b/test/onnx/expect/TestOperators.test_aten_embedding_1.expect index cb4454337751..25a4fb256e2e 100644 --- a/test/onnx/expect/TestOperators.test_aten_embedding_1.expect +++ b/test/onnx/expect/TestOperators.test_aten_embedding_1.expect @@ -16,7 +16,7 @@ graph { type: TENSOR } } - name: "torch-jit-export" + name: "torch_jit" output { name: "3" type { diff --git a/test/onnx/expect/TestOperators.test_aten_embedding_2.expect b/test/onnx/expect/TestOperators.test_aten_embedding_2.expect index 2b175cf621b2..98779b99d98d 100644 --- a/test/onnx/expect/TestOperators.test_aten_embedding_2.expect +++ b/test/onnx/expect/TestOperators.test_aten_embedding_2.expect @@ -6,19 +6,24 @@ graph { input: "emb.weight" input: "input_1" output: "onnx::Add_3" - name: "ATenOp_0" - op_type: "ATenOp" + name: "ATen_0" + op_type: "ATen" attribute { name: "custom_attributes_json" s: "{\"padding_idx\":-1,\"scale_grad_by_freq\":false,\"sparse\":false}" type: STRING } attribute { - name: "name" - s: "aten::embedding" + name: "operator" + s: "embedding" type: STRING } - domain: "com.microsoft" + attribute { + name: "overload_name" + s: "" + type: STRING + } + domain: "org.pytorch.aten" } node { input: "onnx::Add_3" @@ -95,7 +100,7 @@ graph { type: TENSOR } } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 4 dims: 8 @@ -145,27 +150,11 @@ graph { } } } - value_info { - name: "onnx::Add_3" - type { - tensor_type { - elem_type: 1 - shape { - dim { - dim_param: "ATenOponnx::Add_3_dim_0" - } - dim { - dim_param: "ATenOponnx::Add_3_dim_1" - } - } - } - } - } } opset_import { version: 12 } opset_import { - domain: "com.microsoft" + domain: "org.pytorch.aten" version: 1 } diff --git a/test/onnx/expect/TestOperators.test_avg_pool2d.expect b/test/onnx/expect/TestOperators.test_avg_pool2d.expect index 5647d2b36ff4..344022ec2688 100644 --- a/test/onnx/expect/TestOperators.test_avg_pool2d.expect +++ b/test/onnx/expect/TestOperators.test_avg_pool2d.expect @@ -1,40 +1,43 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { + node { + output: "onnx::Pad_1" + name: "Constant_0" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 8 + data_type: 7 + raw_data: 
"\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" + } + type: TENSOR + } + } node { input: "onnx::Pad_0" - output: "onnx::AveragePool_1" - name: "Pad_0" + input: "onnx::Pad_1" + output: "onnx::AveragePool_2" + name: "Pad_1" op_type: "Pad" attribute { name: "mode" s: "constant" type: STRING } - attribute { - name: "pads" - ints: 0 - ints: 0 - ints: 0 - ints: 0 - ints: 0 - ints: 0 - ints: 0 - ints: 0 - type: INTS - } - attribute { - name: "value" - f: 0 - type: FLOAT - } } node { - input: "onnx::AveragePool_1" - output: "2" - name: "AveragePool_1" + input: "onnx::AveragePool_2" + output: "3" + name: "AveragePool_2" op_type: "AveragePool" + attribute { + name: "ceil_mode" + i: 0 + type: INT + } attribute { name: "kernel_shape" ints: 3 @@ -56,7 +59,7 @@ graph { type: INTS } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Pad_0" type { @@ -80,7 +83,7 @@ graph { } } output { - name: "2" + name: "3" type { tensor_type { elem_type: 1 @@ -103,5 +106,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_baddbmm.expect b/test/onnx/expect/TestOperators.test_baddbmm.expect index c021baac505d..fc7eb0f8295e 100644 --- a/test/onnx/expect/TestOperators.test_baddbmm.expect +++ b/test/onnx/expect/TestOperators.test_baddbmm.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -30,7 +30,7 @@ graph { name: "Add_3" op_type: "Add" } - name: "torch-jit-export" + name: "torch_jit" initializer { data_type: 1 name: "onnx::Mul_11" @@ -119,5 +119,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_basic.expect b/test/onnx/expect/TestOperators.test_basic.expect index 280b8114034c..3d151aefabdb 100644 --- a/test/onnx/expect/TestOperators.test_basic.expect +++ b/test/onnx/expect/TestOperators.test_basic.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -34,7 +34,7 @@ graph { name: "Neg_4" op_type: "Neg" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Add_0" type { @@ -76,5 +76,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_batchnorm.expect b/test/onnx/expect/TestOperators.test_batchnorm.expect index 5071995d8b34..d9c9ec338c8c 100644 --- a/test/onnx/expect/TestOperators.test_batchnorm.expect +++ b/test/onnx/expect/TestOperators.test_batchnorm.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -22,7 +22,7 @@ graph { type: FLOAT } } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 2 data_type: 1 @@ -145,5 +145,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_batchnorm_1d.expect b/test/onnx/expect/TestOperators.test_batchnorm_1d.expect index 136fc681ecfc..a4d2e1f10249 100644 --- a/test/onnx/expect/TestOperators.test_batchnorm_1d.expect +++ b/test/onnx/expect/TestOperators.test_batchnorm_1d.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -22,7 +22,7 @@ graph { type: FLOAT } } - name: "torch-jit-export" + name: "torch_jit" initializer { 
dims: 2 data_type: 1 @@ -133,5 +133,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_batchnorm_noaffine.expect b/test/onnx/expect/TestOperators.test_batchnorm_noaffine.expect index 7ca2c910c7ab..a421443cdcda 100644 --- a/test/onnx/expect/TestOperators.test_batchnorm_noaffine.expect +++ b/test/onnx/expect/TestOperators.test_batchnorm_noaffine.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -50,7 +50,7 @@ graph { type: FLOAT } } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 128 data_type: 1 @@ -135,5 +135,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_batchnorm_onnx_irv4.expect b/test/onnx/expect/TestOperators.test_batchnorm_onnx_irv4.expect index 68ae47d8d243..a556e38c7198 100644 --- a/test/onnx/expect/TestOperators.test_batchnorm_onnx_irv4.expect +++ b/test/onnx/expect/TestOperators.test_batchnorm_onnx_irv4.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -22,7 +22,7 @@ graph { type: FLOAT } } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 2 data_type: 1 @@ -93,5 +93,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_batchnorm_training.expect b/test/onnx/expect/TestOperators.test_batchnorm_training.expect index a0bc171ed9e5..5e8f2049e143 100644 --- a/test/onnx/expect/TestOperators.test_batchnorm_training.expect +++ b/test/onnx/expect/TestOperators.test_batchnorm_training.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -26,7 +26,7 @@ graph { type: FLOAT } } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 2 data_type: 1 @@ -149,5 +149,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_bitshift.expect b/test/onnx/expect/TestOperators.test_bitshift.expect index 3b2affd5c36f..10199d03efcd 100644 --- a/test/onnx/expect/TestOperators.test_bitshift.expect +++ b/test/onnx/expect/TestOperators.test_bitshift.expect @@ -3,48 +3,22 @@ producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - output: "onnx::Pow_4" - name: "Constant_0" - op_type: "Constant" - attribute { - name: "value" - t { - data_type: 1 - raw_data: "\000\000\000@" - } - type: TENSOR - } - } - node { - input: "onnx::Pow_4" - input: "onnx::Pow_11" - output: "onnx::Cast_5" - name: "Pow_1" - op_type: "Pow" - } - node { - input: "onnx::Cast_5" - output: "onnx::Div_6" - name: "Cast_2" - op_type: "Cast" + input: "onnx::BitShift_0" + input: "onnx::BitShift_7" + output: "3" + name: "BitShift_0" + op_type: "BitShift" attribute { - name: "to" - i: 1 - type: INT + name: "direction" + s: "RIGHT" + type: STRING } } node { - input: "onnx::Div_0" - input: "onnx::Div_6" - output: "7" - name: "Div_3" - op_type: "Div" - } - node { - input: "onnx::BitShift_1" - input: "onnx::BitShift_12" - output: "10" - name: "BitShift_4" + input: "onnx::BitShift_0" + input: "onnx::BitShift_8" + output: "6" + name: "BitShift_1" op_type: "BitShift" attribute { name: "direction" @@ -52,38 +26,19 @@ graph { type: STRING } } - name: "torch-jit-export" + name: "torch_jit" initializer { - data_type: 1 - name: "onnx::Pow_11" - raw_data: "\000\000\200?" 
+ data_type: 2 + name: "onnx::BitShift_7" + raw_data: "\001" } initializer { data_type: 2 - name: "onnx::BitShift_12" + name: "onnx::BitShift_8" raw_data: "\002" } input { - name: "onnx::Div_0" - type { - tensor_type { - elem_type: 1 - shape { - dim { - dim_value: 3 - } - dim { - dim_value: 4 - } - dim { - dim_value: 2 - } - } - } - } - } - input { - name: "onnx::BitShift_1" + name: "onnx::BitShift_0" type { tensor_type { elem_type: 2 @@ -102,10 +57,10 @@ graph { } } output { - name: "7" + name: "3" type { tensor_type { - elem_type: 1 + elem_type: 2 shape { dim { dim_value: 3 @@ -121,7 +76,7 @@ graph { } } output { - name: "10" + name: "6" type { tensor_type { elem_type: 2 diff --git a/test/onnx/expect/TestOperators.test_c2_op.expect b/test/onnx/expect/TestOperators.test_c2_op.expect index 941cde493661..bd525b881aee 100644 --- a/test/onnx/expect/TestOperators.test_c2_op.expect +++ b/test/onnx/expect/TestOperators.test_c2_op.expect @@ -63,7 +63,7 @@ graph { } domain: "org.pytorch._caffe2" } - name: "torch-jit-export" + name: "torch_jit" input { name: "_caffe2::GenerateProposals_0" type { diff --git a/test/onnx/expect/TestOperators.test_chunk.expect b/test/onnx/expect/TestOperators.test_chunk.expect index d623c913aeec..575245c807eb 100644 --- a/test/onnx/expect/TestOperators.test_chunk.expect +++ b/test/onnx/expect/TestOperators.test_chunk.expect @@ -1,28 +1,158 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - input: "onnx::Split_0" - output: "1" - output: "2" - name: "Split_0" - op_type: "Split" + input: "onnx::Shape_0" + output: "onnx::Gather_1" + name: "Shape_0" + op_type: "Shape" + } + node { + output: "onnx::Gather_2" + name: "Constant_1" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + input: "onnx::Gather_1" + input: "onnx::Gather_2" + output: "onnx::Add_3" + name: "Gather_2" + op_type: "Gather" attribute { name: "axis" i: 0 type: INT } + } + node { + output: "onnx::Slice_4" + name: "Constant_3" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + output: "onnx::Add_5" + name: "Constant_4" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\001\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + input: "onnx::Add_3" + input: "onnx::Add_5" + output: "onnx::Div_6" + name: "Add_5" + op_type: "Add" + } + node { + output: "onnx::Div_7" + name: "Constant_6" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\002\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + input: "onnx::Div_6" + input: "onnx::Div_7" + output: "onnx::Mul_8" + name: "Div_7" + op_type: "Div" + } + node { + output: "onnx::Mul_9" + name: "Constant_8" + op_type: "Constant" attribute { - name: "split" - ints: 2 - ints: 1 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\001\000\000\000\000\000\000\000" + } + type: TENSOR } } - name: "torch-jit-export" + node { + input: "onnx::Mul_8" + input: "onnx::Mul_9" + output: "onnx::Slice_10" + name: "Mul_9" + op_type: "Mul" + } + node { + input: "onnx::Shape_0" + input: "onnx::Slice_4" + input: "onnx::Slice_10" + input: "onnx::Gather_2" + output: "11" + name: "Slice_10" + op_type: "Slice" + } + node { + output: "onnx::Mul_12" + name: "Constant_11" + 
op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\002\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + input: "onnx::Mul_8" + input: "onnx::Mul_12" + output: "onnx::Slice_13" + name: "Mul_12" + op_type: "Mul" + } + node { + input: "onnx::Shape_0" + input: "onnx::Slice_10" + input: "onnx::Slice_13" + input: "onnx::Gather_2" + output: "14" + name: "Slice_13" + op_type: "Slice" + } + name: "torch_jit" input { - name: "onnx::Split_0" + name: "onnx::Shape_0" type { tensor_type { elem_type: 1 @@ -35,7 +165,7 @@ graph { } } output { - name: "1" + name: "11" type { tensor_type { elem_type: 1 @@ -48,7 +178,7 @@ graph { } } output { - name: "2" + name: "14" type { tensor_type { elem_type: 1 @@ -62,5 +192,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_clip.expect b/test/onnx/expect/TestOperators.test_clip.expect index d2b0febe45c2..81606851e785 100644 --- a/test/onnx/expect/TestOperators.test_clip.expect +++ b/test/onnx/expect/TestOperators.test_clip.expect @@ -1,24 +1,26 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { input: "onnx::Clip_0" - output: "1" + input: "onnx::Clip_6" + input: "onnx::Clip_7" + output: "5" name: "Clip_0" op_type: "Clip" - attribute { - name: "max" - f: 0.5 - type: FLOAT - } - attribute { - name: "min" - f: -0.5 - type: FLOAT - } } - name: "torch-jit-export" + name: "torch_jit" + initializer { + data_type: 1 + name: "onnx::Clip_6" + raw_data: "\000\000\000\277" + } + initializer { + data_type: 1 + name: "onnx::Clip_7" + raw_data: "\000\000\000?" + } input { name: "onnx::Clip_0" type { @@ -36,7 +38,7 @@ graph { } } output { - name: "1" + name: "5" type { tensor_type { elem_type: 1 @@ -53,5 +55,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_clip_max.expect b/test/onnx/expect/TestOperators.test_clip_max.expect index 0a254a516e5c..ceb89b3048c6 100644 --- a/test/onnx/expect/TestOperators.test_clip_max.expect +++ b/test/onnx/expect/TestOperators.test_clip_max.expect @@ -1,19 +1,21 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { input: "onnx::Clip_0" - output: "1" + input: "" + input: "onnx::Clip_7" + output: "5" name: "Clip_0" op_type: "Clip" - attribute { - name: "max" - f: 0.1 - type: FLOAT - } } - name: "torch-jit-export" + name: "torch_jit" + initializer { + data_type: 1 + name: "onnx::Clip_7" + raw_data: "\315\314\314=" + } input { name: "onnx::Clip_0" type { @@ -37,22 +39,22 @@ graph { } } output { - name: "1" + name: "5" type { tensor_type { elem_type: 1 shape { dim { - dim_value: 1 + dim_param: "Clip5_dim_0" } dim { - dim_value: 2 + dim_param: "Clip5_dim_1" } dim { - dim_value: 3 + dim_param: "Clip5_dim_2" } dim { - dim_value: 4 + dim_param: "Clip5_dim_3" } } } @@ -60,5 +62,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_clip_min.expect b/test/onnx/expect/TestOperators.test_clip_min.expect index d54354d6f212..22826be3fd54 100644 --- a/test/onnx/expect/TestOperators.test_clip_min.expect +++ b/test/onnx/expect/TestOperators.test_clip_min.expect @@ -1,19 +1,21 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { input: "onnx::Clip_0" - output: "1" + input: "onnx::Clip_7" + input: "" + output: "5" name: "Clip_0" op_type: "Clip" - attribute { - name: "min" - 
f: -0.1 - type: FLOAT - } } - name: "torch-jit-export" + name: "torch_jit" + initializer { + data_type: 1 + name: "onnx::Clip_7" + raw_data: "\315\314\314\275" + } input { name: "onnx::Clip_0" type { @@ -37,22 +39,22 @@ graph { } } output { - name: "1" + name: "5" type { tensor_type { elem_type: 1 shape { dim { - dim_value: 1 + dim_param: "Clip5_dim_0" } dim { - dim_value: 2 + dim_param: "Clip5_dim_1" } dim { - dim_value: 3 + dim_param: "Clip5_dim_2" } dim { - dim_value: 4 + dim_param: "Clip5_dim_3" } } } @@ -60,5 +62,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_concat2.expect b/test/onnx/expect/TestOperators.test_concat2.expect index a5f3481305ae..f5b6aec0c229 100644 --- a/test/onnx/expect/TestOperators.test_concat2.expect +++ b/test/onnx/expect/TestOperators.test_concat2.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -14,7 +14,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Concat_0" type { @@ -65,5 +65,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_conv.expect b/test/onnx/expect/TestOperators.test_conv.expect index 9f63f64dae0a..f1078cef39c1 100644 --- a/test/onnx/expect/TestOperators.test_conv.expect +++ b/test/onnx/expect/TestOperators.test_conv.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -40,7 +40,7 @@ graph { type: INTS } } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 13 dims: 16 @@ -118,5 +118,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_conv_onnx_irv4.expect b/test/onnx/expect/TestOperators.test_conv_onnx_irv4.expect index 2347dc7da914..18e3c683e9bc 100644 --- a/test/onnx/expect/TestOperators.test_conv_onnx_irv4.expect +++ b/test/onnx/expect/TestOperators.test_conv_onnx_irv4.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -40,7 +40,7 @@ graph { type: INTS } } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 13 dims: 16 @@ -96,5 +96,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_conv_onnx_irv4_opset8.expect b/test/onnx/expect/TestOperators.test_conv_onnx_irv4_opset8.expect index 3737ed4a571d..94ad47523905 100644 --- a/test/onnx/expect/TestOperators.test_conv_onnx_irv4_opset8.expect +++ b/test/onnx/expect/TestOperators.test_conv_onnx_irv4_opset8.expect @@ -40,7 +40,7 @@ graph { type: INTS } } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 4 dims: 2 diff --git a/test/onnx/expect/TestOperators.test_convtranspose.expect b/test/onnx/expect/TestOperators.test_convtranspose.expect index b1f9bda52040..0beedca2f292 100644 --- a/test/onnx/expect/TestOperators.test_convtranspose.expect +++ b/test/onnx/expect/TestOperators.test_convtranspose.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -46,7 +46,7 @@ graph { type: INTS } } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 3 dims: 3 @@ -124,5 +124,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_cos.expect b/test/onnx/expect/TestOperators.test_cos.expect index c0821c91e8aa..1185bca62c59 100644 --- 
a/test/onnx/expect/TestOperators.test_cos.expect +++ b/test/onnx/expect/TestOperators.test_cos.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -8,7 +8,7 @@ graph { name: "Cos_0" op_type: "Cos" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Cos_0" type { @@ -43,5 +43,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_cumsum.expect b/test/onnx/expect/TestOperators.test_cumsum.expect index 0f8c5981a222..19d4d97d0817 100644 --- a/test/onnx/expect/TestOperators.test_cumsum.expect +++ b/test/onnx/expect/TestOperators.test_cumsum.expect @@ -22,7 +22,7 @@ graph { name: "CumSum_1" op_type: "CumSum" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::CumSum_0" type { diff --git a/test/onnx/expect/TestOperators.test_det.expect b/test/onnx/expect/TestOperators.test_det.expect index 8495b85fc50d..a15b0e2d32de 100644 --- a/test/onnx/expect/TestOperators.test_det.expect +++ b/test/onnx/expect/TestOperators.test_det.expect @@ -8,7 +8,7 @@ graph { name: "Det_0" op_type: "Det" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Det_0" type { diff --git a/test/onnx/expect/TestOperators.test_dict.expect b/test/onnx/expect/TestOperators.test_dict.expect index 26b7031d7760..e041d535d768 100644 --- a/test/onnx/expect/TestOperators.test_dict.expect +++ b/test/onnx/expect/TestOperators.test_dict.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -9,7 +9,7 @@ graph { name: "Add_0" op_type: "Add" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Add_0" type { @@ -60,5 +60,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_dict_str.expect b/test/onnx/expect/TestOperators.test_dict_str.expect index 0bdfa7638616..eaab2752fb7d 100644 --- a/test/onnx/expect/TestOperators.test_dict_str.expect +++ b/test/onnx/expect/TestOperators.test_dict_str.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -22,7 +22,7 @@ graph { name: "Add_1" op_type: "Add" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Add_0" type { @@ -63,5 +63,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_dim.expect b/test/onnx/expect/TestOperators.test_dim.expect index 4b1f6a3881d1..59e910a646ca 100644 --- a/test/onnx/expect/TestOperators.test_dim.expect +++ b/test/onnx/expect/TestOperators.test_dim.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -15,7 +15,7 @@ graph { type: TENSOR } } - name: "torch-jit-export" + name: "torch_jit" output { name: "1" type { @@ -28,5 +28,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_dropout.expect b/test/onnx/expect/TestOperators.test_dropout.expect index 6cea69c5c17f..27aab5c71821 100644 --- a/test/onnx/expect/TestOperators.test_dropout.expect +++ b/test/onnx/expect/TestOperators.test_dropout.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -13,7 +13,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "x" type { @@ -42,5 +42,5 @@ graph { } } opset_import { - version: 9 + version: 13 } 
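The Clip hunks above illustrate what these regenerated fixtures capture: at opset 13 the Clip node takes its min/max bounds as optional inputs (materialized here as initializers such as onnx::Clip_6 and onnx::Clip_7) instead of float attributes, the graph name becomes torch_jit, and ir_version moves from 4 to 7. A minimal Python sketch of how a comparable graph can be produced and inspected; the module, input shape, and variable names are illustrative and not taken from the test suite:

import io

import onnx
import torch


class ClipModel(torch.nn.Module):
    # clamp(min, max) is exported as a single ONNX Clip node
    def forward(self, x):
        return x.clamp(min=-0.5, max=0.5)


buf = io.BytesIO()
# Exporting at opset 13 emits Clip with its bounds wired in as inputs
# (constant initializers), matching the regenerated .expect files above.
torch.onnx.export(ClipModel(), torch.randn(1, 2, 3, 4), buf, opset_version=13)
graph = onnx.load_from_string(buf.getvalue()).graph
print(graph)  # protobuf text form, comparable to the fixture contents
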
diff --git a/test/onnx/expect/TestOperators.test_dropout_default.expect b/test/onnx/expect/TestOperators.test_dropout_default.expect index a8fce1dd2745..89c0e988aacb 100644 --- a/test/onnx/expect/TestOperators.test_dropout_default.expect +++ b/test/onnx/expect/TestOperators.test_dropout_default.expect @@ -1,23 +1,46 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - input: "x" - output: "onnx::ReduceMax_1" - output: "2" - name: "Dropout_0" - op_type: "Dropout" + output: "onnx::Dropout_1" + name: "Constant_0" + op_type: "Constant" + attribute { + name: "value" + t { + data_type: 1 + raw_data: "\000\000\000?" + } + type: TENSOR + } + } + node { + output: "onnx::Dropout_2" + name: "Constant_1" + op_type: "Constant" attribute { - name: "ratio" - f: 0.5 - type: FLOAT + name: "value" + t { + data_type: 9 + raw_data: "\001" + } + type: TENSOR } } node { - input: "onnx::ReduceMax_1" - output: "3" - name: "ReduceMax_1" + input: "x" + input: "onnx::Dropout_1" + input: "onnx::Dropout_2" + output: "onnx::ReduceMax_3" + output: "4" + name: "Dropout_2" + op_type: "Dropout" + } + node { + input: "onnx::ReduceMax_3" + output: "5" + name: "ReduceMax_3" op_type: "ReduceMax" attribute { name: "keepdims" @@ -25,7 +48,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "x" type { @@ -43,7 +66,7 @@ graph { } } output { - name: "3" + name: "5" type { tensor_type { elem_type: 1 @@ -54,5 +77,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_dropout_opset12.expect b/test/onnx/expect/TestOperators.test_dropout_opset12.expect index b2f908d4b1c2..af5738700bc3 100644 --- a/test/onnx/expect/TestOperators.test_dropout_opset12.expect +++ b/test/onnx/expect/TestOperators.test_dropout_opset12.expect @@ -13,7 +13,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "x" type { diff --git a/test/onnx/expect/TestOperators.test_dropout_training.expect b/test/onnx/expect/TestOperators.test_dropout_training.expect index a8fce1dd2745..89c0e988aacb 100644 --- a/test/onnx/expect/TestOperators.test_dropout_training.expect +++ b/test/onnx/expect/TestOperators.test_dropout_training.expect @@ -1,23 +1,46 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - input: "x" - output: "onnx::ReduceMax_1" - output: "2" - name: "Dropout_0" - op_type: "Dropout" + output: "onnx::Dropout_1" + name: "Constant_0" + op_type: "Constant" + attribute { + name: "value" + t { + data_type: 1 + raw_data: "\000\000\000?" 
+ } + type: TENSOR + } + } + node { + output: "onnx::Dropout_2" + name: "Constant_1" + op_type: "Constant" attribute { - name: "ratio" - f: 0.5 - type: FLOAT + name: "value" + t { + data_type: 9 + raw_data: "\001" + } + type: TENSOR } } node { - input: "onnx::ReduceMax_1" - output: "3" - name: "ReduceMax_1" + input: "x" + input: "onnx::Dropout_1" + input: "onnx::Dropout_2" + output: "onnx::ReduceMax_3" + output: "4" + name: "Dropout_2" + op_type: "Dropout" + } + node { + input: "onnx::ReduceMax_3" + output: "5" + name: "ReduceMax_3" op_type: "ReduceMax" attribute { name: "keepdims" @@ -25,7 +48,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "x" type { @@ -43,7 +66,7 @@ graph { } } output { - name: "3" + name: "5" type { tensor_type { elem_type: 1 @@ -54,5 +77,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_dropout_training_opset12.expect b/test/onnx/expect/TestOperators.test_dropout_training_opset12.expect index 657f7bd38817..7effb1e17421 100644 --- a/test/onnx/expect/TestOperators.test_dropout_training_opset12.expect +++ b/test/onnx/expect/TestOperators.test_dropout_training_opset12.expect @@ -48,7 +48,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "x" type { diff --git a/test/onnx/expect/TestOperators.test_dynamic_axes_add.expect b/test/onnx/expect/TestOperators.test_dynamic_axes_add.expect index 7ad54ca1031c..cf6c6b358037 100644 --- a/test/onnx/expect/TestOperators.test_dynamic_axes_add.expect +++ b/test/onnx/expect/TestOperators.test_dynamic_axes_add.expect @@ -9,7 +9,7 @@ graph { name: "Add_0" op_type: "Add" } - name: "torch-jit-export" + name: "torch_jit" input { name: "input_1" type { diff --git a/test/onnx/expect/TestOperators.test_dynamic_axes_add_inputs_same_symbolic_shape.expect b/test/onnx/expect/TestOperators.test_dynamic_axes_add_inputs_same_symbolic_shape.expect index 9698a3285217..8fd558eda82f 100644 --- a/test/onnx/expect/TestOperators.test_dynamic_axes_add_inputs_same_symbolic_shape.expect +++ b/test/onnx/expect/TestOperators.test_dynamic_axes_add_inputs_same_symbolic_shape.expect @@ -9,7 +9,7 @@ graph { name: "Add_0" op_type: "Add" } - name: "torch-jit-export" + name: "torch_jit" input { name: "input_1" type { diff --git a/test/onnx/expect/TestOperators.test_dynamic_axes_matmul.expect b/test/onnx/expect/TestOperators.test_dynamic_axes_matmul.expect index dd0943e8ece4..ee4c3f82c7b4 100644 --- a/test/onnx/expect/TestOperators.test_dynamic_axes_matmul.expect +++ b/test/onnx/expect/TestOperators.test_dynamic_axes_matmul.expect @@ -9,7 +9,7 @@ graph { name: "MatMul_0" op_type: "MatMul" } - name: "torch-jit-export" + name: "torch_jit" input { name: "input_1" type { diff --git a/test/onnx/expect/TestOperators.test_dynamic_axes_reduce_mean.expect b/test/onnx/expect/TestOperators.test_dynamic_axes_reduce_mean.expect index c41a46d54519..673e34cc4dc4 100644 --- a/test/onnx/expect/TestOperators.test_dynamic_axes_reduce_mean.expect +++ b/test/onnx/expect/TestOperators.test_dynamic_axes_reduce_mean.expect @@ -18,7 +18,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "input" type { diff --git a/test/onnx/expect/TestOperators.test_dynamic_axes_unchange.expect b/test/onnx/expect/TestOperators.test_dynamic_axes_unchange.expect index a3931b6ebd1d..2dbf0d186ccc 100644 --- a/test/onnx/expect/TestOperators.test_dynamic_axes_unchange.expect +++ b/test/onnx/expect/TestOperators.test_dynamic_axes_unchange.expect @@ 
-37,7 +37,7 @@ graph { type: INTS } } - name: "torch-jit-export" + name: "torch_jit" input { name: "input" type { diff --git a/test/onnx/expect/TestOperators.test_elu.expect b/test/onnx/expect/TestOperators.test_elu.expect index 5dc0dc6db48b..9fc2d5aab1fe 100644 --- a/test/onnx/expect/TestOperators.test_elu.expect +++ b/test/onnx/expect/TestOperators.test_elu.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -13,7 +13,7 @@ graph { type: FLOAT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "input" type { @@ -60,5 +60,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_embedding_bags.expect b/test/onnx/expect/TestOperators.test_embedding_bags.expect index 630fee8ba8e0..eb4a94b75590 100644 --- a/test/onnx/expect/TestOperators.test_embedding_bags.expect +++ b/test/onnx/expect/TestOperators.test_embedding_bags.expect @@ -1,43 +1,354 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - input: "weight" - input: "input" - input: "offsets" - output: "3" - output: "4" output: "5" - output: "6" - op_type: "ATen" + name: "Constant_0" + op_type: "Constant" attribute { - name: "include_last_offset" - i: 0 - type: INT + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR } + } + node { + input: "input" + output: "onnx::Gather_6" + name: "Shape_1" + op_type: "Shape" + } + node { + output: "onnx::Gather_7" + name: "Constant_2" + op_type: "Constant" attribute { - name: "mode" - i: 1 + name: "value" + t { + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + input: "onnx::Gather_6" + input: "onnx::Gather_7" + output: "onnx::Unsqueeze_8" + name: "Gather_3" + op_type: "Gather" + attribute { + name: "axis" + i: 0 type: INT } + } + node { + output: "onnx::Unsqueeze_9" + name: "Constant_4" + op_type: "Constant" attribute { - name: "operator" - s: "embedding_bag" - type: STRING + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR } + } + node { + input: "onnx::Unsqueeze_8" + input: "onnx::Unsqueeze_9" + output: "onnx::Concat_10" + name: "Unsqueeze_5" + op_type: "Unsqueeze" + } + node { + input: "offsets" + input: "onnx::Concat_10" + output: "onnx::Slice_11" + name: "Concat_6" + op_type: "Concat" attribute { - name: "scale_grad_by_freq" + name: "axis" i: 0 type: INT } + } + node { + output: "onnx::Slice_12" + name: "Constant_7" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + output: "onnx::Slice_13" + name: "Constant_8" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\001\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + output: "onnx::Slice_14" + name: "Constant_9" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\377\377\377\377\377\377\377\177" + } + type: TENSOR + } + } + node { + output: "onnx::Slice_15" + name: "Constant_10" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\001\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + input: "onnx::Slice_11" + input: "onnx::Slice_13" + input: "onnx::Slice_14" + input: "onnx::Slice_12" + input: "onnx::Slice_15" 
+ output: "onnx::Shape_16" + name: "Slice_11" + op_type: "Slice" + } + node { + input: "onnx::Shape_16" + output: "onnx::Gather_17" + name: "Shape_12" + op_type: "Shape" + } + node { + output: "onnx::Gather_18" + name: "Constant_13" + op_type: "Constant" + attribute { + name: "value" + t { + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + input: "onnx::Gather_17" + input: "onnx::Gather_18" + output: "onnx::Loop_19" + name: "Gather_14" + op_type: "Gather" attribute { - name: "sparse" + name: "axis" i: 0 type: INT } } - name: "torch-jit-export" + node { + input: "onnx::Loop_19" + input: "onnx::Loop_33" + output: "20" + name: "Loop_15" + op_type: "Loop" + attribute { + name: "body" + g { + node { + input: "onnx::Slice_11" + input: "21" + output: "23" + name: "Gather_16" + op_type: "Gather" + attribute { + name: "axis" + i: 0 + type: INT + } + } + node { + input: "onnx::Shape_16" + input: "21" + output: "24" + name: "Gather_17" + op_type: "Gather" + attribute { + name: "axis" + i: 0 + type: INT + } + } + node { + output: "25" + name: "Constant_18" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + input: "23" + input: "25" + output: "26" + name: "Unsqueeze_19" + op_type: "Unsqueeze" + } + node { + output: "27" + name: "Constant_20" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + input: "24" + input: "27" + output: "28" + name: "Unsqueeze_21" + op_type: "Unsqueeze" + } + node { + input: "input" + input: "26" + input: "28" + input: "5" + output: "29" + name: "Slice_22" + op_type: "Slice" + } + node { + input: "weight" + input: "29" + output: "30" + name: "Gather_23" + op_type: "Gather" + attribute { + name: "axis" + i: 0 + type: INT + } + } + node { + input: "30" + output: "31" + name: "ReduceMean_24" + op_type: "ReduceMean" + attribute { + name: "axes" + ints: 0 + type: INTS + } + attribute { + name: "keepdims" + i: 0 + type: INT + } + } + node { + input: "onnx::Loop_33" + output: "32" + name: "Cast_25" + op_type: "Cast" + attribute { + name: "to" + i: 9 + type: INT + } + } + name: "torch_jit1" + input { + name: "21" + type { + tensor_type { + elem_type: 7 + shape { + } + } + } + } + input { + name: "22" + type { + tensor_type { + elem_type: 9 + shape { + } + } + } + } + output { + name: "32" + type { + tensor_type { + elem_type: 9 + shape { + } + } + } + } + output { + name: "31" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_param: "Loop20_dim_1" + } + } + } + } + } + } + type: GRAPH + } + } + name: "torch_jit" initializer { dims: 10 dims: 8 @@ -45,6 +356,11 @@ graph { name: "weight" raw_data: 
"\264\314\344\275\017A\376\276\313\374&>J\266a\277s\306\\=\212\032+?\211[t\275\344[\357\276Dk\\\276OKb?\234\'B\277A\334\274\2767N\257\276\320s\263\277\371+\244>:\314\202\277K\200L??\001\275\275\236u4\2774\032\315\277\214\004\224>Z\320\372>\267B\305\276\346G6\277N\265.\276\343\316\272\277t\364a>\201)|>p\223\251\277Qm2?\346\275)\277\354\235\233?\027X\277\277\253\206a?\354\335\226\277L\032o\277\251J\021\277\311\360\215\276\312\274\013\300\252\320\273>\220\"p?\267\020\000\222\233\314?\334\360?\275|t\303\277\214\351\000\300\3065\302\2775\206\306>X\251\227\277x\2160?U^\251?d\221\350?\237F.?\rp9?9X\004=/c\324\277SL\360\277\'\274\332\356\226\275\211\035\241>*\271\204\277>\025W>\036K\035?\036\233\200=\035\313\250\276\017\003\346\277\374p_?\313WD?!\006\351\275\232\\q\277\230\007A?" } + initializer { + data_type: 9 + name: "onnx::Loop_33" + raw_data: "\001" + } input { name: "input" type { @@ -87,17 +403,27 @@ graph { } } } + input { + name: "onnx::Loop_33" + type { + tensor_type { + elem_type: 9 + shape { + } + } + } + } output { - name: "3" + name: "20" type { tensor_type { elem_type: 1 shape { dim { - dim_value: 1 + dim_param: "Loop20_dim_0" } dim { - dim_value: 8 + dim_param: "Loop20_dim_1" } } } @@ -105,5 +431,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_empty_like.expect b/test/onnx/expect/TestOperators.test_empty_like.expect index 27ac22983187..e4f6c6ede2ca 100644 --- a/test/onnx/expect/TestOperators.test_empty_like.expect +++ b/test/onnx/expect/TestOperators.test_empty_like.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -17,7 +17,7 @@ graph { type: TENSOR } } - name: "torch-jit-export" + name: "torch_jit" output { name: "1" type { @@ -36,5 +36,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_empty_like_opset7.expect b/test/onnx/expect/TestOperators.test_empty_like_opset7.expect index 095d3dc89bdb..504162493a00 100644 --- a/test/onnx/expect/TestOperators.test_empty_like_opset7.expect +++ b/test/onnx/expect/TestOperators.test_empty_like_opset7.expect @@ -29,7 +29,7 @@ graph { type: FLOAT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Shape_0" type { diff --git a/test/onnx/expect/TestOperators.test_equal.expect b/test/onnx/expect/TestOperators.test_equal.expect index 9da730440ef5..5a9877d484f8 100644 --- a/test/onnx/expect/TestOperators.test_equal.expect +++ b/test/onnx/expect/TestOperators.test_equal.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -9,7 +9,7 @@ graph { name: "Equal_0" op_type: "Equal" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Equal_0" type { @@ -72,5 +72,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_erf.expect b/test/onnx/expect/TestOperators.test_erf.expect index 023f051c05e6..f8f70c37598d 100644 --- a/test/onnx/expect/TestOperators.test_erf.expect +++ b/test/onnx/expect/TestOperators.test_erf.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -8,7 +8,7 @@ graph { name: "Erf_0" op_type: "Erf" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Erf_0" type { @@ -55,5 +55,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_exp.expect 
b/test/onnx/expect/TestOperators.test_exp.expect index f51786608e44..49d9f74cb20d 100644 --- a/test/onnx/expect/TestOperators.test_exp.expect +++ b/test/onnx/expect/TestOperators.test_exp.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -8,7 +8,7 @@ graph { name: "Exp_0" op_type: "Exp" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Exp_0" type { @@ -43,5 +43,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_expand.expect b/test/onnx/expect/TestOperators.test_expand.expect index 2e0fd4fdf725..6634173a0a63 100644 --- a/test/onnx/expect/TestOperators.test_expand.expect +++ b/test/onnx/expect/TestOperators.test_expand.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -87,7 +87,7 @@ graph { name: "Expand_7" op_type: "Expand" } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 1 data_type: 7 @@ -131,5 +131,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_flatten.expect b/test/onnx/expect/TestOperators.test_flatten.expect index e44e542c4138..12160e8b9e66 100644 --- a/test/onnx/expect/TestOperators.test_flatten.expect +++ b/test/onnx/expect/TestOperators.test_flatten.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -9,29 +9,59 @@ graph { op_type: "Shape" } node { - input: "onnx::Slice_1" - output: "onnx::Concat_2" - name: "Slice_1" - op_type: "Slice" + output: "onnx::Slice_2" + name: "Constant_1" + op_type: "Constant" attribute { - name: "axes" - ints: 0 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR } + } + node { + output: "onnx::Slice_3" + name: "Constant_2" + op_type: "Constant" attribute { - name: "ends" - ints: 0 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR } + } + node { + output: "onnx::Slice_4" + name: "Constant_3" + op_type: "Constant" attribute { - name: "starts" - ints: 0 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR } } node { - output: "onnx::Concat_3" - name: "Constant_2" + input: "onnx::Slice_1" + input: "onnx::Slice_3" + input: "onnx::Slice_4" + input: "onnx::Slice_2" + output: "onnx::Concat_5" + name: "Slice_4" + op_type: "Slice" + } + node { + output: "onnx::Concat_6" + name: "Constant_5" op_type: "Constant" attribute { name: "value" @@ -44,10 +74,10 @@ graph { } } node { - input: "onnx::Concat_2" - input: "onnx::Concat_3" - output: "onnx::Reshape_4" - name: "Concat_3" + input: "onnx::Concat_5" + input: "onnx::Concat_6" + output: "onnx::Reshape_7" + name: "Concat_6" op_type: "Concat" attribute { name: "axis" @@ -57,12 +87,12 @@ graph { } node { input: "onnx::Shape_0" - input: "onnx::Reshape_4" - output: "5" - name: "Reshape_4" + input: "onnx::Reshape_7" + output: "8" + name: "Reshape_7" op_type: "Reshape" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Shape_0" type { @@ -86,7 +116,7 @@ graph { } } output { - name: "5" + name: "8" type { tensor_type { elem_type: 1 @@ -100,5 +130,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_flatten2D.expect 
b/test/onnx/expect/TestOperators.test_flatten2D.expect index c8f2e1c16045..f60b1ba7066f 100644 --- a/test/onnx/expect/TestOperators.test_flatten2D.expect +++ b/test/onnx/expect/TestOperators.test_flatten2D.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -13,7 +13,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Flatten_0" type { @@ -54,5 +54,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_fmod.expect b/test/onnx/expect/TestOperators.test_fmod.expect index 91c8f93ee004..a93ed8980ce2 100644 --- a/test/onnx/expect/TestOperators.test_fmod.expect +++ b/test/onnx/expect/TestOperators.test_fmod.expect @@ -14,7 +14,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Mod_0" type { diff --git a/test/onnx/expect/TestOperators.test_frobenius_norm.expect b/test/onnx/expect/TestOperators.test_frobenius_norm.expect index 5b759b291c82..fba4585b18b8 100644 --- a/test/onnx/expect/TestOperators.test_frobenius_norm.expect +++ b/test/onnx/expect/TestOperators.test_frobenius_norm.expect @@ -1,38 +1,52 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { + node { + output: "onnx::ReduceSum_1" + name: "Constant_0" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 2 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000" + } + type: TENSOR + } + } node { input: "x" input: "x" - output: "onnx::ReduceSum_1" - name: "Mul_0" + output: "onnx::ReduceSum_2" + name: "Mul_1" op_type: "Mul" } node { + input: "onnx::ReduceSum_2" input: "onnx::ReduceSum_1" - output: "onnx::Sqrt_2" - name: "ReduceSum_1" + output: "onnx::Sqrt_3" + name: "ReduceSum_2" op_type: "ReduceSum" - attribute { - name: "axes" - ints: 0 - ints: 1 - type: INTS - } attribute { name: "keepdims" i: 1 type: INT } + attribute { + name: "noop_with_empty_axes" + i: 0 + type: INT + } } node { - input: "onnx::Sqrt_2" - output: "3" - name: "Sqrt_2" + input: "onnx::Sqrt_3" + output: "4" + name: "Sqrt_3" op_type: "Sqrt" } - name: "torch-jit-export" + name: "torch_jit" input { name: "x" type { @@ -53,7 +67,7 @@ graph { } } output { - name: "3" + name: "4" type { tensor_type { elem_type: 1 @@ -73,5 +87,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_full.expect b/test/onnx/expect/TestOperators.test_full.expect index a832bd8e2c58..fc8acf5ee80d 100644 --- a/test/onnx/expect/TestOperators.test_full.expect +++ b/test/onnx/expect/TestOperators.test_full.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -17,7 +17,7 @@ graph { type: TENSOR } } - name: "torch-jit-export" + name: "torch_jit" output { name: "1" type { @@ -36,5 +36,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_full_like.expect b/test/onnx/expect/TestOperators.test_full_like.expect index a832bd8e2c58..fc8acf5ee80d 100644 --- a/test/onnx/expect/TestOperators.test_full_like.expect +++ b/test/onnx/expect/TestOperators.test_full_like.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -17,7 +17,7 @@ graph { type: TENSOR } } - name: "torch-jit-export" + name: "torch_jit" output { name: "1" type { @@ -36,5 +36,5 @@ graph { } } 
opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_gather.expect b/test/onnx/expect/TestOperators.test_gather.expect index 97658076b969..609f89853ac6 100644 --- a/test/onnx/expect/TestOperators.test_gather.expect +++ b/test/onnx/expect/TestOperators.test_gather.expect @@ -1,114 +1,22 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - output: "onnx::OneHot_2" - name: "Constant_0" - op_type: "Constant" - attribute { - name: "value" - t { - dims: 2 - data_type: 7 - raw_data: "\000\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000" - } - type: TENSOR - } - } - node { - output: "onnx::Gather_3" - name: "Constant_1" - op_type: "Constant" - attribute { - name: "value" - t { - dims: 1 - data_type: 7 - raw_data: "\001\000\000\000\000\000\000\000" - } - type: TENSOR - } - } - node { - input: "onnx::Shape_0" - output: "onnx::Gather_4" - name: "Shape_2" - op_type: "Shape" - } - node { - input: "onnx::Gather_4" - input: "onnx::Gather_3" - output: "onnx::OneHot_5" - name: "Gather_3" - op_type: "Gather" - attribute { - name: "axis" - i: 0 - type: INT - } - } - node { - input: "onnx::OneHot_1" - input: "onnx::OneHot_5" - input: "onnx::OneHot_2" - output: "onnx::Cast_6" - name: "OneHot_4" - op_type: "OneHot" + input: "onnx::GatherElements_0" + input: "onnx::GatherElements_1" + output: "2" + name: "GatherElements_0" + op_type: "GatherElements" attribute { name: "axis" i: 1 type: INT } } - node { - input: "onnx::Cast_6" - output: "onnx::Mul_7" - name: "Cast_5" - op_type: "Cast" - attribute { - name: "to" - i: 1 - type: INT - } - } - node { - input: "onnx::Shape_0" - output: "onnx::Mul_8" - name: "Unsqueeze_6" - op_type: "Unsqueeze" - attribute { - name: "axes" - ints: 2 - type: INTS - } - } - node { - input: "onnx::Mul_8" - input: "onnx::Mul_7" - output: "onnx::ReduceSum_9" - name: "Mul_7" - op_type: "Mul" - } - node { - input: "onnx::ReduceSum_9" - output: "10" - name: "ReduceSum_8" - op_type: "ReduceSum" - attribute { - name: "axes" - ints: 1 - type: INTS - } - attribute { - name: "keepdims" - i: 0 - type: INT - } - } - name: "torch-jit-export" + name: "torch_jit" input { - name: "onnx::Shape_0" + name: "onnx::GatherElements_0" type { tensor_type { elem_type: 1 @@ -127,7 +35,7 @@ graph { } } input { - name: "onnx::OneHot_1" + name: "onnx::GatherElements_1" type { tensor_type { elem_type: 7 @@ -146,7 +54,7 @@ graph { } } output { - name: "10" + name: "2" type { tensor_type { elem_type: 1 @@ -166,5 +74,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_gather_opset11.expect b/test/onnx/expect/TestOperators.test_gather_opset11.expect index 42d94992e7ea..88c8a4542f45 100644 --- a/test/onnx/expect/TestOperators.test_gather_opset11.expect +++ b/test/onnx/expect/TestOperators.test_gather_opset11.expect @@ -14,7 +14,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::GatherElements_0" type { diff --git a/test/onnx/expect/TestOperators.test_ge.expect b/test/onnx/expect/TestOperators.test_ge.expect index 9d338b1e2ae1..8d578a4d25bd 100644 --- a/test/onnx/expect/TestOperators.test_ge.expect +++ b/test/onnx/expect/TestOperators.test_ge.expect @@ -1,23 +1,17 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - input: "onnx::Less_0" - input: "onnx::Less_1" - output: "onnx::Not_2" - name: "Less_0" - op_type: "Less" + input: "onnx::GreaterOrEqual_0" + 
input: "onnx::GreaterOrEqual_1" + output: "2" + name: "GreaterOrEqual_0" + op_type: "GreaterOrEqual" } - node { - input: "onnx::Not_2" - output: "3" - name: "Not_1" - op_type: "Not" - } - name: "torch-jit-export" + name: "torch_jit" input { - name: "onnx::Less_0" + name: "onnx::GreaterOrEqual_0" type { tensor_type { elem_type: 6 @@ -33,7 +27,7 @@ graph { } } input { - name: "onnx::Less_1" + name: "onnx::GreaterOrEqual_1" type { tensor_type { elem_type: 6 @@ -49,7 +43,7 @@ graph { } } output { - name: "3" + name: "2" type { tensor_type { elem_type: 9 @@ -66,5 +60,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_gelu.expect b/test/onnx/expect/TestOperators.test_gelu.expect index 65265bc4f860..dfc7d1d88468 100644 --- a/test/onnx/expect/TestOperators.test_gelu.expect +++ b/test/onnx/expect/TestOperators.test_gelu.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -75,7 +75,7 @@ graph { name: "Mul_7" op_type: "Mul" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Div_0" type { @@ -122,5 +122,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_gt.expect b/test/onnx/expect/TestOperators.test_gt.expect index 08dec7abe29d..5aab77798bf6 100644 --- a/test/onnx/expect/TestOperators.test_gt.expect +++ b/test/onnx/expect/TestOperators.test_gt.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -9,7 +9,7 @@ graph { name: "Greater_0" op_type: "Greater" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Greater_0" type { @@ -72,5 +72,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_hardtanh.expect b/test/onnx/expect/TestOperators.test_hardtanh.expect index 3648a367d050..1268a4c14cfd 100644 --- a/test/onnx/expect/TestOperators.test_hardtanh.expect +++ b/test/onnx/expect/TestOperators.test_hardtanh.expect @@ -1,24 +1,42 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - input: "input" - output: "1" - name: "Clip_0" - op_type: "Clip" + output: "onnx::Clip_1" + name: "Constant_0" + op_type: "Constant" attribute { - name: "max" - f: 0.5 - type: FLOAT + name: "value" + t { + data_type: 1 + raw_data: "\000\000\000\277" + } + type: TENSOR } + } + node { + output: "onnx::Clip_2" + name: "Constant_1" + op_type: "Constant" attribute { - name: "min" - f: -0.5 - type: FLOAT + name: "value" + t { + data_type: 1 + raw_data: "\000\000\000?" 
+ } + type: TENSOR } } - name: "torch-jit-export" + node { + input: "input" + input: "onnx::Clip_1" + input: "onnx::Clip_2" + output: "3" + name: "Clip_2" + op_type: "Clip" + } + name: "torch_jit" input { name: "input" type { @@ -36,7 +54,7 @@ graph { } } output { - name: "1" + name: "3" type { tensor_type { elem_type: 1 @@ -53,5 +71,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_implicit_expand.expect b/test/onnx/expect/TestOperators.test_implicit_expand.expect index 9d64c5d16568..3c94edc85b4b 100644 --- a/test/onnx/expect/TestOperators.test_implicit_expand.expect +++ b/test/onnx/expect/TestOperators.test_implicit_expand.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -22,7 +22,7 @@ graph { name: "Add_1" op_type: "Add" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Add_0" type { @@ -57,5 +57,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_index.expect b/test/onnx/expect/TestOperators.test_index.expect index c65cffea5c2b..330d2de0d7fc 100644 --- a/test/onnx/expect/TestOperators.test_index.expect +++ b/test/onnx/expect/TestOperators.test_index.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -27,7 +27,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Gather_0" type { @@ -59,5 +59,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_isnan.expect b/test/onnx/expect/TestOperators.test_isnan.expect index b0f390e4d6f6..198d3bdb2387 100644 --- a/test/onnx/expect/TestOperators.test_isnan.expect +++ b/test/onnx/expect/TestOperators.test_isnan.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -8,7 +8,7 @@ graph { name: "IsNaN_0" op_type: "IsNaN" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::IsNaN_0" type { @@ -37,5 +37,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_layer_norm_aten.expect b/test/onnx/expect/TestOperators.test_layer_norm_aten.expect index 41a1ae695019..071437686117 100644 --- a/test/onnx/expect/TestOperators.test_layer_norm_aten.expect +++ b/test/onnx/expect/TestOperators.test_layer_norm_aten.expect @@ -1,36 +1,106 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { input: "input" - input: "weight" - input: "bias" - output: "3" - op_type: "ATen" + output: "onnx::Sub_3" + name: "ReduceMean_0" + op_type: "ReduceMean" attribute { - name: "cudnn_enable" - i: 1 - type: INT + name: "axes" + ints: -2 + ints: -1 + type: INTS } + } + node { + input: "input" + input: "onnx::Sub_3" + output: "onnx::Pow_4" + name: "Sub_1" + op_type: "Sub" + } + node { + output: "onnx::Pow_5" + name: "Constant_2" + op_type: "Constant" attribute { - name: "eps" - f: 1e-05 - type: FLOAT + name: "value" + t { + data_type: 1 + raw_data: "\000\000\000@" + } + type: TENSOR } + } + node { + input: "onnx::Pow_4" + input: "onnx::Pow_5" + output: "onnx::ReduceMean_6" + name: "Pow_3" + op_type: "Pow" + } + node { + input: "onnx::ReduceMean_6" + output: "onnx::Add_7" + name: "ReduceMean_4" + op_type: "ReduceMean" attribute { - name: "normalized_shape" - ints: 10 - ints: 10 + name: "axes" + ints: -2 + ints: -1 type: 
INTS } + } + node { + output: "onnx::Add_8" + name: "Constant_5" + op_type: "Constant" attribute { - name: "operator" - s: "layer_norm" - type: STRING + name: "value" + t { + data_type: 1 + raw_data: "\254\305\'7" + } + type: TENSOR } } - name: "torch-jit-export" + node { + input: "onnx::Add_7" + input: "onnx::Add_8" + output: "onnx::Sqrt_9" + name: "Add_6" + op_type: "Add" + } + node { + input: "onnx::Sqrt_9" + output: "onnx::Div_10" + name: "Sqrt_7" + op_type: "Sqrt" + } + node { + input: "onnx::Pow_4" + input: "onnx::Div_10" + output: "onnx::Mul_11" + name: "Div_8" + op_type: "Div" + } + node { + input: "onnx::Mul_11" + input: "weight" + output: "onnx::Add_12" + name: "Mul_9" + op_type: "Mul" + } + node { + input: "onnx::Add_12" + input: "bias" + output: "13" + name: "Add_10" + op_type: "Add" + } + name: "torch_jit" initializer { dims: 10 dims: 10 @@ -100,7 +170,7 @@ graph { } } output { - name: "3" + name: "13" type { tensor_type { elem_type: 1 @@ -123,5 +193,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_le.expect b/test/onnx/expect/TestOperators.test_le.expect index a29a46f89ebb..374a0d0e0d52 100644 --- a/test/onnx/expect/TestOperators.test_le.expect +++ b/test/onnx/expect/TestOperators.test_le.expect @@ -1,23 +1,17 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - input: "onnx::Greater_0" - input: "onnx::Greater_1" - output: "onnx::Not_2" - name: "Greater_0" - op_type: "Greater" + input: "onnx::LessOrEqual_0" + input: "onnx::LessOrEqual_1" + output: "2" + name: "LessOrEqual_0" + op_type: "LessOrEqual" } - node { - input: "onnx::Not_2" - output: "3" - name: "Not_1" - op_type: "Not" - } - name: "torch-jit-export" + name: "torch_jit" input { - name: "onnx::Greater_0" + name: "onnx::LessOrEqual_0" type { tensor_type { elem_type: 6 @@ -33,7 +27,7 @@ graph { } } input { - name: "onnx::Greater_1" + name: "onnx::LessOrEqual_1" type { tensor_type { elem_type: 6 @@ -49,7 +43,7 @@ graph { } } output { - name: "3" + name: "2" type { tensor_type { elem_type: 9 @@ -66,5 +60,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_linear.expect b/test/onnx/expect/TestOperators.test_linear.expect index 806f1cf83eed..71c64dfe5a50 100644 --- a/test/onnx/expect/TestOperators.test_linear.expect +++ b/test/onnx/expect/TestOperators.test_linear.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -25,7 +25,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 5 dims: 4 @@ -102,5 +102,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_log_sigmoid.expect b/test/onnx/expect/TestOperators.test_log_sigmoid.expect index 528952692684..2681f1193102 100644 --- a/test/onnx/expect/TestOperators.test_log_sigmoid.expect +++ b/test/onnx/expect/TestOperators.test_log_sigmoid.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -14,7 +14,7 @@ graph { name: "Log_1" op_type: "Log" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Sigmoid_0" type { @@ -61,5 +61,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_logsoftmax.expect b/test/onnx/expect/TestOperators.test_logsoftmax.expect index 4ae37e23b95e..1c4de89b6402 100644 --- 
a/test/onnx/expect/TestOperators.test_logsoftmax.expect +++ b/test/onnx/expect/TestOperators.test_logsoftmax.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -13,7 +13,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "input" type { @@ -60,5 +60,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_lstm_none_sequence_lens.expect b/test/onnx/expect/TestOperators.test_lstm_none_sequence_lens.expect index db2fcb4dcda3..94c9c72db8e6 100644 --- a/test/onnx/expect/TestOperators.test_lstm_none_sequence_lens.expect +++ b/test/onnx/expect/TestOperators.test_lstm_none_sequence_lens.expect @@ -18,7 +18,7 @@ graph { type: TENSOR } } - name: "torch-jit-export" + name: "torch_jit" output { name: "7" type { diff --git a/test/onnx/expect/TestOperators.test_lt.expect b/test/onnx/expect/TestOperators.test_lt.expect index 29be0d629e82..2dbcc07cd9e1 100644 --- a/test/onnx/expect/TestOperators.test_lt.expect +++ b/test/onnx/expect/TestOperators.test_lt.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -9,7 +9,7 @@ graph { name: "Less_0" op_type: "Less" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Less_0" type { @@ -72,5 +72,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_master_opset.expect b/test/onnx/expect/TestOperators.test_master_opset.expect index b9a30f4b545c..f468400e7c6a 100644 --- a/test/onnx/expect/TestOperators.test_master_opset.expect +++ b/test/onnx/expect/TestOperators.test_master_opset.expect @@ -9,7 +9,7 @@ graph { name: "Add_0" op_type: "Add" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Add_0" type { diff --git a/test/onnx/expect/TestOperators.test_max.expect b/test/onnx/expect/TestOperators.test_max.expect index 29476215fd34..d9fcc0fb5f7a 100644 --- a/test/onnx/expect/TestOperators.test_max.expect +++ b/test/onnx/expect/TestOperators.test_max.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -9,7 +9,7 @@ graph { name: "Max_0" op_type: "Max" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Max_0" type { @@ -60,5 +60,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_maxpool.expect b/test/onnx/expect/TestOperators.test_maxpool.expect index 4def8b60c6dd..f43712bbfc58 100644 --- a/test/onnx/expect/TestOperators.test_maxpool.expect +++ b/test/onnx/expect/TestOperators.test_maxpool.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -7,6 +7,11 @@ graph { output: "1" name: "MaxPool_0" op_type: "MaxPool" + attribute { + name: "ceil_mode" + i: 0 + type: INT + } attribute { name: "kernel_shape" ints: 3 @@ -24,7 +29,7 @@ graph { type: INTS } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::MaxPool_0" type { @@ -65,5 +70,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_maxpool_dilations.expect b/test/onnx/expect/TestOperators.test_maxpool_dilations.expect index b0cfe51b0545..2d07fc6fadc7 100644 --- a/test/onnx/expect/TestOperators.test_maxpool_dilations.expect +++ b/test/onnx/expect/TestOperators.test_maxpool_dilations.expect @@ -34,7 +34,7 @@ graph 
{ type: INTS } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::MaxPool_0" type { diff --git a/test/onnx/expect/TestOperators.test_maxpool_indices.expect b/test/onnx/expect/TestOperators.test_maxpool_indices.expect index 9b999cb6bf8d..46c23e3a4cae 100644 --- a/test/onnx/expect/TestOperators.test_maxpool_indices.expect +++ b/test/onnx/expect/TestOperators.test_maxpool_indices.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -8,6 +8,11 @@ graph { output: "onnx::Sub_2" name: "MaxPool_0" op_type: "MaxPool" + attribute { + name: "ceil_mode" + i: 0 + type: INT + } attribute { name: "kernel_shape" ints: 3 @@ -43,34 +48,64 @@ graph { } } node { - input: "onnx::Slice_4" - output: "onnx::Sub_5" - name: "Slice_2" - op_type: "Slice" + output: "onnx::Slice_5" + name: "Constant_2" + op_type: "Constant" attribute { - name: "axes" - ints: 2 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\002\000\000\000\000\000\000\000" + } + type: TENSOR } + } + node { + output: "onnx::Slice_6" + name: "Constant_3" + op_type: "Constant" attribute { - name: "ends" - ints: 1 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR } + } + node { + output: "onnx::Slice_7" + name: "Constant_4" + op_type: "Constant" attribute { - name: "starts" - ints: 0 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\001\000\000\000\000\000\000\000" + } + type: TENSOR } } + node { + input: "onnx::Slice_4" + input: "onnx::Slice_6" + input: "onnx::Slice_7" + input: "onnx::Slice_5" + output: "onnx::Sub_8" + name: "Slice_5" + op_type: "Slice" + } node { input: "onnx::Sub_2" - input: "onnx::Sub_5" - output: "6" - name: "Sub_3" + input: "onnx::Sub_8" + output: "9" + name: "Sub_6" op_type: "Sub" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::MaxPool_0" type { @@ -110,7 +145,7 @@ graph { } } output { - name: "6" + name: "9" type { tensor_type { elem_type: 7 @@ -130,5 +165,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_mean.expect b/test/onnx/expect/TestOperators.test_mean.expect index 6a3e26dc6d2d..b53b8c2f1248 100644 --- a/test/onnx/expect/TestOperators.test_mean.expect +++ b/test/onnx/expect/TestOperators.test_mean.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -13,7 +13,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::ReduceMean_0" type { @@ -48,5 +48,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_mean_dtype.expect b/test/onnx/expect/TestOperators.test_mean_dtype.expect index acac724e1c16..92ce0ae3aa99 100644 --- a/test/onnx/expect/TestOperators.test_mean_dtype.expect +++ b/test/onnx/expect/TestOperators.test_mean_dtype.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -24,7 +24,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Cast_0" type { @@ -59,5 +59,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_meshgrid.expect b/test/onnx/expect/TestOperators.test_meshgrid.expect index 7f9321046a8c..05b9de875d94 100644 --- a/test/onnx/expect/TestOperators.test_meshgrid.expect +++ 
b/test/onnx/expect/TestOperators.test_meshgrid.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -219,7 +219,7 @@ graph { name: "Expand_21" op_type: "Expand" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Reshape_0" type { @@ -318,5 +318,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_min.expect b/test/onnx/expect/TestOperators.test_min.expect index 13c08b82a548..28ca14779f71 100644 --- a/test/onnx/expect/TestOperators.test_min.expect +++ b/test/onnx/expect/TestOperators.test_min.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -9,7 +9,7 @@ graph { name: "Min_0" op_type: "Min" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Min_0" type { @@ -60,5 +60,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_mm.expect b/test/onnx/expect/TestOperators.test_mm.expect index 97f7761b985f..9492d651fd9e 100644 --- a/test/onnx/expect/TestOperators.test_mm.expect +++ b/test/onnx/expect/TestOperators.test_mm.expect @@ -1,27 +1,12 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { - node { - output: "onnx::Gemm_2" - name: "Constant_0" - op_type: "Constant" - attribute { - name: "value" - t { - dims: 1 - data_type: 1 - raw_data: "\000\000\200?" - } - type: TENSOR - } - } node { input: "onnx::Gemm_0" input: "onnx::Gemm_1" - input: "onnx::Gemm_2" - output: "3" - name: "Gemm_1" + output: "2" + name: "Gemm_0" op_type: "Gemm" attribute { name: "alpha" @@ -34,7 +19,7 @@ graph { type: FLOAT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Gemm_0" type { @@ -68,7 +53,7 @@ graph { } } output { - name: "3" + name: "2" type { tensor_type { elem_type: 1 @@ -85,5 +70,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_narrow.expect b/test/onnx/expect/TestOperators.test_narrow.expect index 70b4ef8b35c8..a7b13c89a646 100644 --- a/test/onnx/expect/TestOperators.test_narrow.expect +++ b/test/onnx/expect/TestOperators.test_narrow.expect @@ -1,29 +1,35 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { input: "onnx::Slice_0" - output: "1" + input: "onnx::Slice_14" + input: "onnx::Slice_15" + input: "onnx::Slice_16" + output: "12" name: "Slice_0" op_type: "Slice" - attribute { - name: "axes" - ints: 0 - type: INTS - } - attribute { - name: "ends" - ints: 2 - type: INTS - } - attribute { - name: "starts" - ints: 0 - type: INTS - } } - name: "torch-jit-export" + name: "torch_jit" + initializer { + dims: 1 + data_type: 7 + name: "onnx::Slice_14" + raw_data: "\000\000\000\000\000\000\000\000" + } + initializer { + dims: 1 + data_type: 7 + name: "onnx::Slice_15" + raw_data: "\002\000\000\000\000\000\000\000" + } + initializer { + dims: 1 + data_type: 7 + name: "onnx::Slice_16" + raw_data: "\000\000\000\000\000\000\000\000" + } input { name: "onnx::Slice_0" type { @@ -41,7 +47,7 @@ graph { } } output { - name: "1" + name: "12" type { tensor_type { elem_type: 1 @@ -58,5 +64,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_ne.expect b/test/onnx/expect/TestOperators.test_ne.expect index 6849f2711765..ab053fbcf67e 100644 --- a/test/onnx/expect/TestOperators.test_ne.expect +++ 
b/test/onnx/expect/TestOperators.test_ne.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -15,7 +15,7 @@ graph { name: "Not_1" op_type: "Not" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Equal_0" type { @@ -78,5 +78,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_nonzero.expect b/test/onnx/expect/TestOperators.test_nonzero.expect index 48a57dc61587..cfcb1f505f87 100644 --- a/test/onnx/expect/TestOperators.test_nonzero.expect +++ b/test/onnx/expect/TestOperators.test_nonzero.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -20,7 +20,7 @@ graph { type: INTS } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::NonZero_0" type { @@ -58,5 +58,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_norm_p1.expect b/test/onnx/expect/TestOperators.test_norm_p1.expect index 519819695b20..ec5e12b90a16 100644 --- a/test/onnx/expect/TestOperators.test_norm_p1.expect +++ b/test/onnx/expect/TestOperators.test_norm_p1.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -18,7 +18,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::ReduceL1_0" type { @@ -62,5 +62,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_norm_p2.expect b/test/onnx/expect/TestOperators.test_norm_p2.expect index 68d1762f4174..0388ec620821 100644 --- a/test/onnx/expect/TestOperators.test_norm_p2.expect +++ b/test/onnx/expect/TestOperators.test_norm_p2.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -18,7 +18,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::ReduceL2_0" type { @@ -62,5 +62,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_ones_like.expect b/test/onnx/expect/TestOperators.test_ones_like.expect index ec25d269f79d..fafec789b174 100644 --- a/test/onnx/expect/TestOperators.test_ones_like.expect +++ b/test/onnx/expect/TestOperators.test_ones_like.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -17,7 +17,7 @@ graph { type: TENSOR } } - name: "torch-jit-export" + name: "torch_jit" output { name: "1" type { @@ -36,5 +36,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_pad.expect b/test/onnx/expect/TestOperators.test_pad.expect index db2cdefe75cb..293877ab834a 100644 --- a/test/onnx/expect/TestOperators.test_pad.expect +++ b/test/onnx/expect/TestOperators.test_pad.expect @@ -1,33 +1,192 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - input: "input" - output: "1" - name: "Pad_0" - op_type: "Pad" + input: "onnx::ConstantOfShape_27" + output: "onnx::Concat_10" + name: "ConstantOfShape_0" + op_type: "ConstantOfShape" attribute { - name: "mode" - s: "reflect" - type: STRING + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR } + } + node { + input: "onnx::Concat_28" + input: "onnx::Concat_10" + output: 
"onnx::Reshape_11" + name: "Concat_1" + op_type: "Concat" attribute { - name: "pads" - ints: 0 - ints: 0 - ints: 0 - ints: 2 - ints: 0 - ints: 0 + name: "axis" + i: 0 + type: INT + } + } + node { + output: "onnx::Reshape_12" + name: "Constant_2" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 2 + data_type: 7 + raw_data: "\377\377\377\377\377\377\377\377\002\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + input: "onnx::Reshape_11" + input: "onnx::Reshape_12" + output: "onnx::Slice_13" + name: "Reshape_3" + op_type: "Reshape" + } + node { + output: "onnx::Slice_14" + name: "Constant_4" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + output: "onnx::Slice_15" + name: "Constant_5" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\377\377\377\377\377\377\377\377" + } + type: TENSOR + } + } + node { + output: "onnx::Slice_16" + name: "Constant_6" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\001\000\000\000\000\000\000\200" + } + type: TENSOR + } + } + node { + output: "onnx::Slice_17" + name: "Constant_7" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\377\377\377\377\377\377\377\377" + } + type: TENSOR + } + } + node { + input: "onnx::Slice_13" + input: "onnx::Slice_15" + input: "onnx::Slice_16" + input: "onnx::Slice_14" + input: "onnx::Slice_17" + output: "onnx::Transpose_18" + name: "Slice_8" + op_type: "Slice" + } + node { + input: "onnx::Transpose_18" + output: "onnx::Reshape_19" + name: "Transpose_9" + op_type: "Transpose" + attribute { + name: "perm" ints: 1 - ints: 3 + ints: 0 type: INTS } } - name: "torch-jit-export" + node { + output: "onnx::Reshape_20" + name: "Constant_10" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\377\377\377\377\377\377\377\377" + } + type: TENSOR + } + } + node { + input: "onnx::Reshape_19" + input: "onnx::Reshape_20" + output: "onnx::Cast_21" + name: "Reshape_11" + op_type: "Reshape" + } + node { + input: "onnx::Cast_21" + output: "onnx::Pad_22" + name: "Cast_12" + op_type: "Cast" + attribute { + name: "to" + i: 7 + type: INT + } + } + node { + input: "onnx::Pad_0" + input: "onnx::Pad_22" + output: "23" + name: "Pad_13" + op_type: "Pad" + attribute { + name: "mode" + s: "reflect" + type: STRING + } + } + name: "torch_jit" + initializer { + dims: 1 + data_type: 7 + name: "onnx::ConstantOfShape_27" + raw_data: "\004\000\000\000\000\000\000\000" + } + initializer { + dims: 4 + data_type: 7 + name: "onnx::Concat_28" + raw_data: "\002\000\000\000\000\000\000\000\003\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000" + } input { - name: "input" + name: "onnx::Pad_0" type { tensor_type { elem_type: 1 @@ -49,22 +208,22 @@ graph { } } output { - name: "1" + name: "23" type { tensor_type { elem_type: 1 shape { dim { - dim_value: 1 + dim_param: "Pad23_dim_0" } dim { - dim_value: 1 + dim_param: "Pad23_dim_1" } dim { - dim_value: 3 + dim_param: "Pad23_dim_2" } dim { - dim_value: 9 + dim_param: "Pad23_dim_3" } } } @@ -72,5 +231,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_params.expect b/test/onnx/expect/TestOperators.test_params.expect index e12dc3843b25..67064d8087ae 100644 --- 
a/test/onnx/expect/TestOperators.test_params.expect +++ b/test/onnx/expect/TestOperators.test_params.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -34,7 +34,7 @@ graph { name: "Neg_4" op_type: "Neg" } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 2 dims: 2 @@ -92,5 +92,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_params_onnx_irv4.expect b/test/onnx/expect/TestOperators.test_params_onnx_irv4.expect index 3ee76556d819..8dbc34a20640 100644 --- a/test/onnx/expect/TestOperators.test_params_onnx_irv4.expect +++ b/test/onnx/expect/TestOperators.test_params_onnx_irv4.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -34,7 +34,7 @@ graph { name: "Neg_4" op_type: "Neg" } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 2 dims: 2 @@ -76,5 +76,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_permute2.expect b/test/onnx/expect/TestOperators.test_permute2.expect index f98608d97962..7f7b6afd9d2d 100644 --- a/test/onnx/expect/TestOperators.test_permute2.expect +++ b/test/onnx/expect/TestOperators.test_permute2.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -18,7 +18,7 @@ graph { type: INTS } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Transpose_0" type { @@ -77,5 +77,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_pixel_shuffle.expect b/test/onnx/expect/TestOperators.test_pixel_shuffle.expect index 1fe12cb04a1a..c5b5a8008d51 100644 --- a/test/onnx/expect/TestOperators.test_pixel_shuffle.expect +++ b/test/onnx/expect/TestOperators.test_pixel_shuffle.expect @@ -18,7 +18,7 @@ graph { type: STRING } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::DepthToSpace_0" type { diff --git a/test/onnx/expect/TestOperators.test_pow.expect b/test/onnx/expect/TestOperators.test_pow.expect index 5bdefab8bbab..f20fd9555090 100644 --- a/test/onnx/expect/TestOperators.test_pow.expect +++ b/test/onnx/expect/TestOperators.test_pow.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -9,7 +9,7 @@ graph { name: "Pow_0" op_type: "Pow" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Pow_0" type { @@ -78,5 +78,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_prelu.expect b/test/onnx/expect/TestOperators.test_prelu.expect index 36e318f858f0..f2bcb50ef777 100644 --- a/test/onnx/expect/TestOperators.test_prelu.expect +++ b/test/onnx/expect/TestOperators.test_prelu.expect @@ -1,21 +1,21 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { input: "onnx::PRelu_0" - input: "onnx::PRelu_4" - output: "3" + input: "onnx::PRelu_5" + output: "4" name: "PRelu_0" op_type: "PRelu" } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 2 dims: 1 dims: 1 data_type: 1 - name: "onnx::PRelu_4" + name: "onnx::PRelu_5" raw_data: "\000\000\200>\000\000\200>" } input { @@ -41,7 +41,7 @@ graph { } } input { - name: "onnx::PRelu_4" + name: "onnx::PRelu_5" type { tensor_type { elem_type: 1 @@ -60,7 +60,7 @@ graph { } } output { - name: "3" + 
name: "4" type { tensor_type { elem_type: 1 @@ -83,5 +83,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_prod.expect b/test/onnx/expect/TestOperators.test_prod.expect index 5c4960f49285..0cfeafa4da32 100644 --- a/test/onnx/expect/TestOperators.test_prod.expect +++ b/test/onnx/expect/TestOperators.test_prod.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -13,7 +13,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::ReduceProd_0" type { @@ -48,5 +48,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_prod_dtype.expect b/test/onnx/expect/TestOperators.test_prod_dtype.expect index ec46842ed037..26a63ac840ad 100644 --- a/test/onnx/expect/TestOperators.test_prod_dtype.expect +++ b/test/onnx/expect/TestOperators.test_prod_dtype.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -24,7 +24,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Cast_0" type { @@ -59,5 +59,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_rand.expect b/test/onnx/expect/TestOperators.test_rand.expect index 76f59f55f556..b4d2dbd6cb19 100644 --- a/test/onnx/expect/TestOperators.test_rand.expect +++ b/test/onnx/expect/TestOperators.test_rand.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -22,7 +22,7 @@ graph { name: "Add_1" op_type: "Add" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Add_0" type { @@ -69,5 +69,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_randn.expect b/test/onnx/expect/TestOperators.test_randn.expect index 919e8252474c..bc2d0b23dd7b 100644 --- a/test/onnx/expect/TestOperators.test_randn.expect +++ b/test/onnx/expect/TestOperators.test_randn.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -22,7 +22,7 @@ graph { name: "Add_1" op_type: "Add" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Add_0" type { @@ -69,5 +69,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_reduce_sum_negative_indices.expect b/test/onnx/expect/TestOperators.test_reduce_sum_negative_indices.expect index c3db24de6651..7e5fefad2eb7 100644 --- a/test/onnx/expect/TestOperators.test_reduce_sum_negative_indices.expect +++ b/test/onnx/expect/TestOperators.test_reduce_sum_negative_indices.expect @@ -1,24 +1,34 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - input: "onnx::ReduceSum_0" - output: "1" - name: "ReduceSum_0" - op_type: "ReduceSum" + output: "onnx::ReduceSum_1" + name: "Constant_0" + op_type: "Constant" attribute { - name: "axes" - ints: -1 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\377\377\377\377\377\377\377\377" + } + type: TENSOR } + } + node { + input: "onnx::ReduceSum_0" + input: "onnx::ReduceSum_1" + output: "2" + name: "ReduceSum_1" + op_type: "ReduceSum" attribute { name: "keepdims" i: 0 type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::ReduceSum_0" type { @@ -36,7 +46,7 @@ 
graph { } } output { - name: "1" + name: "2" type { tensor_type { elem_type: 1 @@ -50,5 +60,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_reduced_mean.expect b/test/onnx/expect/TestOperators.test_reduced_mean.expect index e06b21babdd5..ce69ab65a6a6 100644 --- a/test/onnx/expect/TestOperators.test_reduced_mean.expect +++ b/test/onnx/expect/TestOperators.test_reduced_mean.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -18,7 +18,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::ReduceMean_0" type { @@ -62,5 +62,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_reduced_mean_dtype.expect b/test/onnx/expect/TestOperators.test_reduced_mean_dtype.expect index bef20c43d0ba..71d9d296aecd 100644 --- a/test/onnx/expect/TestOperators.test_reduced_mean_dtype.expect +++ b/test/onnx/expect/TestOperators.test_reduced_mean_dtype.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -29,7 +29,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Cast_0" type { @@ -73,5 +73,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_reduced_mean_keepdim.expect b/test/onnx/expect/TestOperators.test_reduced_mean_keepdim.expect index d421af653a72..98bb26aaea36 100644 --- a/test/onnx/expect/TestOperators.test_reduced_mean_keepdim.expect +++ b/test/onnx/expect/TestOperators.test_reduced_mean_keepdim.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -19,7 +19,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::ReduceMean_0" type { @@ -66,5 +66,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_reduced_prod.expect b/test/onnx/expect/TestOperators.test_reduced_prod.expect index d43679dce55b..cdfbc0f5fbb6 100644 --- a/test/onnx/expect/TestOperators.test_reduced_prod.expect +++ b/test/onnx/expect/TestOperators.test_reduced_prod.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -18,7 +18,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::ReduceProd_0" type { @@ -62,5 +62,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_reduced_prod_dtype.expect b/test/onnx/expect/TestOperators.test_reduced_prod_dtype.expect index 6f10f754eaf2..641d21cb9c79 100644 --- a/test/onnx/expect/TestOperators.test_reduced_prod_dtype.expect +++ b/test/onnx/expect/TestOperators.test_reduced_prod_dtype.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -29,7 +29,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Cast_0" type { @@ -73,5 +73,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_reduced_prod_keepdim.expect b/test/onnx/expect/TestOperators.test_reduced_prod_keepdim.expect index 01405e24d99a..62befc2cf1cf 100644 --- a/test/onnx/expect/TestOperators.test_reduced_prod_keepdim.expect +++ 
b/test/onnx/expect/TestOperators.test_reduced_prod_keepdim.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -18,7 +18,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::ReduceProd_0" type { @@ -65,5 +65,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_reduced_sum.expect b/test/onnx/expect/TestOperators.test_reduced_sum.expect index 1d43496bf517..e03a204a3f99 100644 --- a/test/onnx/expect/TestOperators.test_reduced_sum.expect +++ b/test/onnx/expect/TestOperators.test_reduced_sum.expect @@ -1,25 +1,34 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - input: "onnx::ReduceSum_0" - output: "1" - name: "ReduceSum_0" - op_type: "ReduceSum" + output: "onnx::ReduceSum_1" + name: "Constant_0" + op_type: "Constant" attribute { - name: "axes" - ints: 1 - ints: 2 - type: INTS + name: "value" + t { + dims: 2 + data_type: 7 + raw_data: "\001\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000" + } + type: TENSOR } + } + node { + input: "onnx::ReduceSum_0" + input: "onnx::ReduceSum_1" + output: "2" + name: "ReduceSum_1" + op_type: "ReduceSum" attribute { name: "keepdims" i: 0 type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::ReduceSum_0" type { @@ -43,7 +52,7 @@ graph { } } output { - name: "1" + name: "2" type { tensor_type { elem_type: 1 @@ -60,5 +69,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_reduced_sum_dtype.expect b/test/onnx/expect/TestOperators.test_reduced_sum_dtype.expect index d8a839d4c747..e8ffa49295a5 100644 --- a/test/onnx/expect/TestOperators.test_reduced_sum_dtype.expect +++ b/test/onnx/expect/TestOperators.test_reduced_sum_dtype.expect @@ -1,11 +1,25 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - input: "onnx::Cast_0" output: "onnx::ReduceSum_1" - name: "Cast_0" + name: "Constant_0" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + input: "onnx::Cast_0" + output: "onnx::ReduceSum_2" + name: "Cast_1" op_type: "Cast" attribute { name: "to" @@ -14,22 +28,18 @@ graph { } } node { + input: "onnx::ReduceSum_2" input: "onnx::ReduceSum_1" - output: "2" - name: "ReduceSum_1" + output: "3" + name: "ReduceSum_2" op_type: "ReduceSum" - attribute { - name: "axes" - ints: 0 - type: INTS - } attribute { name: "keepdims" i: 0 type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Cast_0" type { @@ -53,7 +63,7 @@ graph { } } output { - name: "2" + name: "3" type { tensor_type { elem_type: 11 @@ -73,5 +83,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_reduced_sum_keepdim.expect b/test/onnx/expect/TestOperators.test_reduced_sum_keepdim.expect index e6711d19a7e8..7d05fdc26041 100644 --- a/test/onnx/expect/TestOperators.test_reduced_sum_keepdim.expect +++ b/test/onnx/expect/TestOperators.test_reduced_sum_keepdim.expect @@ -1,24 +1,34 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - input: "onnx::ReduceSum_0" - output: "1" - name: "ReduceSum_0" - op_type: "ReduceSum" + output: "onnx::ReduceSum_1" + name: "Constant_0" + op_type: "Constant" 
attribute { - name: "axes" - ints: 2 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\002\000\000\000\000\000\000\000" + } + type: TENSOR } + } + node { + input: "onnx::ReduceSum_0" + input: "onnx::ReduceSum_1" + output: "2" + name: "ReduceSum_1" + op_type: "ReduceSum" attribute { name: "keepdims" i: 1 type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::ReduceSum_0" type { @@ -42,7 +52,7 @@ graph { } } output { - name: "1" + name: "2" type { tensor_type { elem_type: 1 @@ -65,5 +75,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_reducemax.expect b/test/onnx/expect/TestOperators.test_reducemax.expect index c5df8d7ddc34..bbd770761f3a 100644 --- a/test/onnx/expect/TestOperators.test_reducemax.expect +++ b/test/onnx/expect/TestOperators.test_reducemax.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -13,7 +13,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::ReduceMax_0" type { @@ -48,5 +48,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_reducemin.expect b/test/onnx/expect/TestOperators.test_reducemin.expect index 5beb9eef96a7..a555fac90f0a 100644 --- a/test/onnx/expect/TestOperators.test_reducemin.expect +++ b/test/onnx/expect/TestOperators.test_reducemin.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -13,7 +13,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::ReduceMin_0" type { @@ -48,5 +48,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_remainder.expect b/test/onnx/expect/TestOperators.test_remainder.expect index aa75e08f4ba2..ecf44141260e 100644 --- a/test/onnx/expect/TestOperators.test_remainder.expect +++ b/test/onnx/expect/TestOperators.test_remainder.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -29,7 +29,7 @@ graph { name: "Sub_3" op_type: "Sub" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Div_0" type { @@ -89,5 +89,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_repeat.expect b/test/onnx/expect/TestOperators.test_repeat.expect index 01d061c19c63..5206bce0d88f 100644 --- a/test/onnx/expect/TestOperators.test_repeat.expect +++ b/test/onnx/expect/TestOperators.test_repeat.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -45,7 +45,7 @@ graph { name: "Tile_3" op_type: "Tile" } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 1 data_type: 7 @@ -98,5 +98,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect b/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect index e29b825e2a18..2dbb3a436d42 100644 --- a/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect +++ b/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -45,7 +45,7 @@ graph { name: "Tile_3" op_type: "Tile" } - name: "torch-jit-export" + name: "torch_jit" initializer { 
dims: 1 data_type: 7 @@ -92,5 +92,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_round.expect b/test/onnx/expect/TestOperators.test_round.expect index 069fb7efc7f7..07809e1abdb8 100644 --- a/test/onnx/expect/TestOperators.test_round.expect +++ b/test/onnx/expect/TestOperators.test_round.expect @@ -8,7 +8,7 @@ graph { name: "Round_0" op_type: "Round" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Round_0" type { diff --git a/test/onnx/expect/TestOperators.test_rrelu.expect b/test/onnx/expect/TestOperators.test_rrelu.expect index ed5f0c5b865c..3fb75ab0bb4a 100644 --- a/test/onnx/expect/TestOperators.test_rrelu.expect +++ b/test/onnx/expect/TestOperators.test_rrelu.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -25,7 +25,7 @@ graph { name: "PRelu_1" op_type: "PRelu" } - name: "torch-jit-export" + name: "torch_jit" input { name: "input" type { @@ -72,5 +72,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_rsqrt.expect b/test/onnx/expect/TestOperators.test_rsqrt.expect index 45c1468d5331..32e4df543ae9 100644 --- a/test/onnx/expect/TestOperators.test_rsqrt.expect +++ b/test/onnx/expect/TestOperators.test_rsqrt.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -28,7 +28,7 @@ graph { name: "Div_2" op_type: "Div" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Sqrt_0" type { @@ -63,5 +63,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_rsub.expect b/test/onnx/expect/TestOperators.test_rsub.expect index 21a031c72ded..75344bfc68de 100644 --- a/test/onnx/expect/TestOperators.test_rsub.expect +++ b/test/onnx/expect/TestOperators.test_rsub.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -22,7 +22,7 @@ graph { name: "Sub_1" op_type: "Sub" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Sub_0" type { @@ -57,5 +57,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_scatter_add.expect b/test/onnx/expect/TestOperators.test_scatter_add.expect index 19302ffcb396..fd7514e30630 100644 --- a/test/onnx/expect/TestOperators.test_scatter_add.expect +++ b/test/onnx/expect/TestOperators.test_scatter_add.expect @@ -1,9 +1,9 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - output: "onnx::Scatter_3" + output: "onnx::ScatterElements_3" name: "Constant_0" op_type: "Constant" attribute { @@ -18,12 +18,12 @@ graph { } } node { - input: "onnx::Scatter_3" - input: "onnx::Scatter_1" - input: "onnx::Scatter_2" + input: "onnx::ScatterElements_3" + input: "onnx::ScatterElements_1" + input: "onnx::ScatterElements_2" output: "onnx::Add_4" - name: "Scatter_1" - op_type: "Scatter" + name: "ScatterElements_1" + op_type: "ScatterElements" attribute { name: "axis" i: 1 @@ -37,7 +37,7 @@ graph { name: "Add_2" op_type: "Add" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Add_0" type { @@ -55,7 +55,7 @@ graph { } } input { - name: "onnx::Scatter_1" + name: "onnx::ScatterElements_1" type { tensor_type { elem_type: 7 @@ -71,7 +71,7 @@ graph { } } input { - name: "onnx::Scatter_2" + name: "onnx::ScatterElements_2" type { 
tensor_type { elem_type: 1 @@ -104,5 +104,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_scatter_add_opset11.expect b/test/onnx/expect/TestOperators.test_scatter_add_opset11.expect index 1695e9db8120..bc4fabc15ddb 100644 --- a/test/onnx/expect/TestOperators.test_scatter_add_opset11.expect +++ b/test/onnx/expect/TestOperators.test_scatter_add_opset11.expect @@ -37,7 +37,7 @@ graph { name: "Add_2" op_type: "Add" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Add_0" type { diff --git a/test/onnx/expect/TestOperators.test_selu.expect b/test/onnx/expect/TestOperators.test_selu.expect index bbfedf15051c..7cdc4dc8bac4 100644 --- a/test/onnx/expect/TestOperators.test_selu.expect +++ b/test/onnx/expect/TestOperators.test_selu.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -8,7 +8,7 @@ graph { name: "Selu_0" op_type: "Selu" } - name: "torch-jit-export" + name: "torch_jit" input { name: "input" type { @@ -55,5 +55,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_shape_value_map.expect b/test/onnx/expect/TestOperators.test_shape_value_map.expect index 0962d8b8cc83..174551f9a7c5 100644 --- a/test/onnx/expect/TestOperators.test_shape_value_map.expect +++ b/test/onnx/expect/TestOperators.test_shape_value_map.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -34,23 +34,33 @@ graph { } } node { - input: "onnx::Unsqueeze_3" - output: "onnx::Concat_7" - name: "Unsqueeze_3" - op_type: "Unsqueeze" + output: "onnx::Unsqueeze_7" + name: "Constant_3" + op_type: "Constant" attribute { - name: "axes" - ints: 0 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR } } node { - input: "onnx::Concat_7" - input: "onnx::Concat_21" - input: "onnx::Concat_22" - input: "onnx::Concat_23" - output: "onnx::Reshape_11" - name: "Concat_4" + input: "onnx::Unsqueeze_3" + input: "onnx::Unsqueeze_7" + output: "onnx::Concat_8" + name: "Unsqueeze_4" + op_type: "Unsqueeze" + } + node { + input: "onnx::Concat_8" + input: "onnx::Concat_26" + input: "onnx::Concat_27" + input: "onnx::Concat_28" + output: "onnx::Reshape_15" + name: "Concat_5" op_type: "Concat" attribute { name: "axis" @@ -60,66 +70,62 @@ graph { } node { input: "x" - input: "onnx::Reshape_11" - output: "onnx::Transpose_12" - name: "Reshape_5" + input: "onnx::Reshape_15" + output: "onnx::Transpose_16" + name: "Reshape_6" op_type: "Reshape" } node { - input: "onnx::Transpose_12" - output: "onnx::Softmax_13" - name: "Transpose_6" + input: "onnx::Transpose_16" + output: "x.1" + name: "Transpose_7" op_type: "Transpose" attribute { name: "perm" ints: 0 - ints: 3 - ints: 1 ints: 2 + ints: 1 + ints: 3 type: INTS } } node { - input: "onnx::Softmax_13" - output: "onnx::Transpose_14" - name: "Softmax_7" + input: "x.1" + output: "onnx::Reshape_18" + name: "Softmax_8" op_type: "Softmax" attribute { name: "axis" - i: 3 + i: 1 type: INT } } node { - input: "onnx::Transpose_14" - output: "onnx::Reshape_15" - name: "Transpose_8" - op_type: "Transpose" + output: "onnx::Unsqueeze_20" + name: "Constant_9" + op_type: "Constant" attribute { - name: "perm" - ints: 0 - ints: 3 - ints: 2 - ints: 1 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR } } node { 
input: "onnx::Unsqueeze_3" - output: "onnx::Concat_17" - name: "Unsqueeze_9" + input: "onnx::Unsqueeze_20" + output: "onnx::Concat_21" + name: "Unsqueeze_10" op_type: "Unsqueeze" - attribute { - name: "axes" - ints: 0 - type: INTS - } } node { - input: "onnx::Concat_17" - input: "onnx::Concat_24" - output: "onnx::Reshape_19" - name: "Concat_10" + input: "onnx::Concat_21" + input: "onnx::Concat_29" + output: "onnx::Reshape_24" + name: "Concat_11" op_type: "Concat" attribute { name: "axis" @@ -128,35 +134,35 @@ graph { } } node { - input: "onnx::Reshape_15" - input: "onnx::Reshape_19" - output: "20" - name: "Reshape_11" + input: "onnx::Reshape_18" + input: "onnx::Reshape_24" + output: "25" + name: "Reshape_12" op_type: "Reshape" } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 1 data_type: 7 - name: "onnx::Concat_21" + name: "onnx::Concat_26" raw_data: "\001\000\000\000\000\000\000\000" } initializer { dims: 1 data_type: 7 - name: "onnx::Concat_22" + name: "onnx::Concat_27" raw_data: "\002\000\000\000\000\000\000\000" } initializer { dims: 1 data_type: 7 - name: "onnx::Concat_23" + name: "onnx::Concat_28" raw_data: "\377\377\377\377\377\377\377\377" } initializer { dims: 1 data_type: 7 - name: "onnx::Concat_24" + name: "onnx::Concat_29" raw_data: "\377\377\377\377\377\377\377\377" } input { @@ -182,7 +188,7 @@ graph { } } output { - name: "20" + name: "25" type { tensor_type { elem_type: 1 @@ -199,5 +205,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_sign.expect b/test/onnx/expect/TestOperators.test_sign.expect index baeb2b1505eb..6cb9200dc073 100644 --- a/test/onnx/expect/TestOperators.test_sign.expect +++ b/test/onnx/expect/TestOperators.test_sign.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -8,7 +8,7 @@ graph { name: "Sign_0" op_type: "Sign" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Sign_0" type { @@ -43,5 +43,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_sin.expect b/test/onnx/expect/TestOperators.test_sin.expect index 4852ac6060f7..4ca6284c48d9 100644 --- a/test/onnx/expect/TestOperators.test_sin.expect +++ b/test/onnx/expect/TestOperators.test_sin.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -8,7 +8,7 @@ graph { name: "Sin_0" op_type: "Sin" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Sin_0" type { @@ -43,5 +43,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_slice.expect b/test/onnx/expect/TestOperators.test_slice.expect index 1524b308a951..15aa37bc2f7e 100644 --- a/test/onnx/expect/TestOperators.test_slice.expect +++ b/test/onnx/expect/TestOperators.test_slice.expect @@ -1,29 +1,74 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - input: "onnx::Slice_0" - output: "1" - name: "Slice_0" - op_type: "Slice" + output: "onnx::Slice_1" + name: "Constant_0" + op_type: "Constant" attribute { - name: "axes" - ints: 1 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\001\000\000\000\000\000\000\000" + } + type: TENSOR } + } + node { + output: "onnx::Slice_2" + name: "Constant_1" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: 
"\001\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + output: "onnx::Slice_3" + name: "Constant_2" + op_type: "Constant" attribute { - name: "ends" - ints: 2 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\002\000\000\000\000\000\000\000" + } + type: TENSOR } + } + node { + output: "onnx::Slice_4" + name: "Constant_3" + op_type: "Constant" attribute { - name: "starts" - ints: 1 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\001\000\000\000\000\000\000\000" + } + type: TENSOR } } - name: "torch-jit-export" + node { + input: "onnx::Slice_0" + input: "onnx::Slice_2" + input: "onnx::Slice_3" + input: "onnx::Slice_1" + input: "onnx::Slice_4" + output: "5" + name: "Slice_4" + op_type: "Slice" + } + name: "torch_jit" input { name: "onnx::Slice_0" type { @@ -41,7 +86,7 @@ graph { } } output { - name: "1" + name: "5" type { tensor_type { elem_type: 1 @@ -58,5 +103,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_slice_dynamic.expect b/test/onnx/expect/TestOperators.test_slice_dynamic.expect index f954b3a11128..5a47c596d3f5 100644 --- a/test/onnx/expect/TestOperators.test_slice_dynamic.expect +++ b/test/onnx/expect/TestOperators.test_slice_dynamic.expect @@ -93,7 +93,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Slice_0" type { diff --git a/test/onnx/expect/TestOperators.test_softmaxcrossentropy.expect b/test/onnx/expect/TestOperators.test_softmaxcrossentropy.expect index 6282b2b4016d..a7d7237e2212 100644 --- a/test/onnx/expect/TestOperators.test_softmaxcrossentropy.expect +++ b/test/onnx/expect/TestOperators.test_softmaxcrossentropy.expect @@ -19,7 +19,7 @@ graph { type: STRING } } - name: "torch-jit-export" + name: "torch_jit" input { name: "input" type { diff --git a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d.expect b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d.expect index 6d35c3f3bc30..99870e60c6b7 100644 --- a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d.expect +++ b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d.expect @@ -19,7 +19,7 @@ graph { type: STRING } } - name: "torch-jit-export" + name: "torch_jit" input { name: "input" type { diff --git a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d_none.expect b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d_none.expect index 3eba0f943ac1..bad2ffc222be 100644 --- a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d_none.expect +++ b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d_none.expect @@ -19,7 +19,7 @@ graph { type: STRING } } - name: "torch-jit-export" + name: "torch_jit" input { name: "input" type { diff --git a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_4d.expect b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_4d.expect index 223f8d3d3219..198f2b568912 100644 --- a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_4d.expect +++ b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_4d.expect @@ -19,7 +19,7 @@ graph { type: STRING } } - name: "torch-jit-export" + name: "torch_jit" input { name: "input" type { diff --git a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_ignore_index.expect b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_ignore_index.expect index 21c0a3540cda..4b861c407122 100644 --- a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_ignore_index.expect +++ 
b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_ignore_index.expect @@ -19,7 +19,7 @@ graph { type: STRING } } - name: "torch-jit-export" + name: "torch_jit" input { name: "input" type { diff --git a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_weights.expect b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_weights.expect index ede001c6b8ce..830de3396953 100644 --- a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_weights.expect +++ b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_weights.expect @@ -20,7 +20,7 @@ graph { type: STRING } } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 5 data_type: 1 diff --git a/test/onnx/expect/TestOperators.test_split.expect b/test/onnx/expect/TestOperators.test_split.expect index 5566c1bfa3bb..e1616e4a52cd 100644 --- a/test/onnx/expect/TestOperators.test_split.expect +++ b/test/onnx/expect/TestOperators.test_split.expect @@ -1,28 +1,36 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { + node { + output: "onnx::Split_1" + name: "Constant_0" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 3 + data_type: 7 + raw_data: "\002\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000" + } + type: TENSOR + } + } node { input: "tensor" - output: "1" + input: "onnx::Split_1" output: "2" output: "3" - name: "Split_0" + output: "4" + name: "Split_1" op_type: "Split" attribute { name: "axis" i: 1 type: INT } - attribute { - name: "split" - ints: 2 - ints: 2 - ints: 2 - type: INTS - } } - name: "torch-jit-export" + name: "torch_jit" input { name: "tensor" type { @@ -40,7 +48,7 @@ graph { } } output { - name: "1" + name: "2" type { tensor_type { elem_type: 1 @@ -56,7 +64,7 @@ graph { } } output { - name: "2" + name: "3" type { tensor_type { elem_type: 1 @@ -72,7 +80,7 @@ graph { } } output { - name: "3" + name: "4" type { tensor_type { elem_type: 1 @@ -89,5 +97,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_split_with_sizes.expect b/test/onnx/expect/TestOperators.test_split_with_sizes.expect index addd7dba3425..964ba363a56e 100644 --- a/test/onnx/expect/TestOperators.test_split_with_sizes.expect +++ b/test/onnx/expect/TestOperators.test_split_with_sizes.expect @@ -1,28 +1,36 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { + node { + output: "onnx::Split_1" + name: "Constant_0" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 3 + data_type: 7 + raw_data: "\002\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000\003\000\000\000\000\000\000\000" + } + type: TENSOR + } + } node { input: "tensor" - output: "1" + input: "onnx::Split_1" output: "2" output: "3" - name: "Split_0" + output: "4" + name: "Split_1" op_type: "Split" attribute { name: "axis" i: 1 type: INT } - attribute { - name: "split" - ints: 2 - ints: 1 - ints: 3 - type: INTS - } } - name: "torch-jit-export" + name: "torch_jit" input { name: "tensor" type { @@ -40,7 +48,7 @@ graph { } } output { - name: "1" + name: "2" type { tensor_type { elem_type: 1 @@ -56,7 +64,7 @@ graph { } } output { - name: "2" + name: "3" type { tensor_type { elem_type: 1 @@ -72,7 +80,7 @@ graph { } } output { - name: "3" + name: "4" type { tensor_type { elem_type: 1 @@ -89,5 +97,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_sqrt.expect 
b/test/onnx/expect/TestOperators.test_sqrt.expect index d46c5b7272c6..91fc7bac0b77 100644 --- a/test/onnx/expect/TestOperators.test_sqrt.expect +++ b/test/onnx/expect/TestOperators.test_sqrt.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -8,7 +8,7 @@ graph { name: "Sqrt_0" op_type: "Sqrt" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Sqrt_0" type { @@ -43,5 +43,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_std.expect b/test/onnx/expect/TestOperators.test_std.expect index adf8398352a2..69df37b90452 100644 --- a/test/onnx/expect/TestOperators.test_std.expect +++ b/test/onnx/expect/TestOperators.test_std.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -144,7 +144,7 @@ graph { name: "Sqrt_13" op_type: "Sqrt" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::ReduceMean_0" type { @@ -185,5 +185,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_sum.expect b/test/onnx/expect/TestOperators.test_sum.expect index 75195a79a7a5..6722064ace20 100644 --- a/test/onnx/expect/TestOperators.test_sum.expect +++ b/test/onnx/expect/TestOperators.test_sum.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -13,7 +13,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::ReduceSum_0" type { @@ -48,5 +48,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_sum_dtype.expect b/test/onnx/expect/TestOperators.test_sum_dtype.expect index 3e149b422bf9..2b5f417b0eee 100644 --- a/test/onnx/expect/TestOperators.test_sum_dtype.expect +++ b/test/onnx/expect/TestOperators.test_sum_dtype.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -24,7 +24,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Cast_0" type { @@ -59,5 +59,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_tan.expect b/test/onnx/expect/TestOperators.test_tan.expect index e6f6e855f8e3..84bc3e9420df 100644 --- a/test/onnx/expect/TestOperators.test_tan.expect +++ b/test/onnx/expect/TestOperators.test_tan.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -8,7 +8,7 @@ graph { name: "Tan_0" op_type: "Tan" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Tan_0" type { @@ -43,5 +43,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_topk.expect b/test/onnx/expect/TestOperators.test_topk.expect index 0310fe86367a..25c206668f87 100644 --- a/test/onnx/expect/TestOperators.test_topk.expect +++ b/test/onnx/expect/TestOperators.test_topk.expect @@ -36,7 +36,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::TopK_0" type { diff --git a/test/onnx/expect/TestOperators.test_topk_smallest_unsorted.expect b/test/onnx/expect/TestOperators.test_topk_smallest_unsorted.expect index b76aaf172a6a..f94c62abcbed 100644 --- a/test/onnx/expect/TestOperators.test_topk_smallest_unsorted.expect +++ 
b/test/onnx/expect/TestOperators.test_topk_smallest_unsorted.expect @@ -46,7 +46,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::TopK_0" type { diff --git a/test/onnx/expect/TestOperators.test_transpose.expect b/test/onnx/expect/TestOperators.test_transpose.expect index 1a30352aeb65..f1350a1b2623 100644 --- a/test/onnx/expect/TestOperators.test_transpose.expect +++ b/test/onnx/expect/TestOperators.test_transpose.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -8,7 +8,7 @@ graph { name: "Identity_0" op_type: "Identity" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Identity_0" type { @@ -43,5 +43,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_type_as.expect b/test/onnx/expect/TestOperators.test_type_as.expect index c9480c168ddf..31803483edbd 100644 --- a/test/onnx/expect/TestOperators.test_type_as.expect +++ b/test/onnx/expect/TestOperators.test_type_as.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -8,7 +8,7 @@ graph { name: "Identity_0" op_type: "Identity" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Identity_0" type { @@ -37,5 +37,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_unfold.expect b/test/onnx/expect/TestOperators.test_unfold.expect index 164c5caeecc4..9b5e20281d20 100644 --- a/test/onnx/expect/TestOperators.test_unfold.expect +++ b/test/onnx/expect/TestOperators.test_unfold.expect @@ -1,76 +1,156 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - input: "onnx::Slice_0" - output: "onnx::Unsqueeze_1" - name: "Slice_0" - op_type: "Slice" + output: "onnx::Slice_1" + name: "Constant_0" + op_type: "Constant" attribute { - name: "axes" - ints: 2 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\002\000\000\000\000\000\000\000" + } + type: TENSOR } + } + node { + output: "onnx::Slice_2" + name: "Constant_1" + op_type: "Constant" attribute { - name: "ends" - ints: 2 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR } + } + node { + output: "onnx::Slice_3" + name: "Constant_2" + op_type: "Constant" attribute { - name: "starts" - ints: 0 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\002\000\000\000\000\000\000\000" + } + type: TENSOR } } node { input: "onnx::Slice_0" - output: "onnx::Unsqueeze_2" - name: "Slice_1" + input: "onnx::Slice_2" + input: "onnx::Slice_3" + input: "onnx::Slice_1" + output: "onnx::Unsqueeze_4" + name: "Slice_3" op_type: "Slice" + } + node { + output: "onnx::Slice_5" + name: "Constant_4" + op_type: "Constant" attribute { - name: "axes" - ints: 2 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\002\000\000\000\000\000\000\000" + } + type: TENSOR } + } + node { + output: "onnx::Slice_6" + name: "Constant_5" + op_type: "Constant" attribute { - name: "ends" - ints: 4 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\002\000\000\000\000\000\000\000" + } + type: TENSOR } + } + node { + output: "onnx::Slice_7" + name: "Constant_6" + op_type: "Constant" attribute { - name: "starts" - ints: 2 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: 
"\004\000\000\000\000\000\000\000" + } + type: TENSOR } } node { - input: "onnx::Unsqueeze_1" - output: "onnx::Concat_3" - name: "Unsqueeze_2" - op_type: "Unsqueeze" + input: "onnx::Slice_0" + input: "onnx::Slice_6" + input: "onnx::Slice_7" + input: "onnx::Slice_5" + output: "onnx::Unsqueeze_8" + name: "Slice_7" + op_type: "Slice" + } + node { + output: "onnx::Unsqueeze_9" + name: "Constant_8" + op_type: "Constant" attribute { - name: "axes" - ints: 2 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\002\000\000\000\000\000\000\000" + } + type: TENSOR } } node { - input: "onnx::Unsqueeze_2" - output: "onnx::Concat_4" - name: "Unsqueeze_3" + input: "onnx::Unsqueeze_4" + input: "onnx::Unsqueeze_9" + output: "onnx::Concat_10" + name: "Unsqueeze_9" op_type: "Unsqueeze" + } + node { + output: "onnx::Unsqueeze_11" + name: "Constant_10" + op_type: "Constant" attribute { - name: "axes" - ints: 2 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\002\000\000\000\000\000\000\000" + } + type: TENSOR } } node { - input: "onnx::Concat_3" - input: "onnx::Concat_4" - output: "5" - name: "Concat_4" + input: "onnx::Unsqueeze_8" + input: "onnx::Unsqueeze_11" + output: "onnx::Concat_12" + name: "Unsqueeze_11" + op_type: "Unsqueeze" + } + node { + input: "onnx::Concat_10" + input: "onnx::Concat_12" + output: "13" + name: "Concat_12" op_type: "Concat" attribute { name: "axis" @@ -78,7 +158,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Slice_0" type { @@ -99,7 +179,7 @@ graph { } } output { - name: "5" + name: "13" type { tensor_type { elem_type: 1 @@ -122,5 +202,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_unique.expect b/test/onnx/expect/TestOperators.test_unique.expect index 1ceb90528b28..55e6e2909a3f 100644 --- a/test/onnx/expect/TestOperators.test_unique.expect +++ b/test/onnx/expect/TestOperators.test_unique.expect @@ -21,7 +21,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "input" type { diff --git a/test/onnx/expect/TestOperators.test_unsqueeze.expect b/test/onnx/expect/TestOperators.test_unsqueeze.expect index 32cfe697e3dd..49a61c2b8451 100644 --- a/test/onnx/expect/TestOperators.test_unsqueeze.expect +++ b/test/onnx/expect/TestOperators.test_unsqueeze.expect @@ -1,19 +1,29 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - input: "onnx::Unsqueeze_0" - output: "1" - name: "Unsqueeze_0" - op_type: "Unsqueeze" + output: "onnx::Unsqueeze_1" + name: "Constant_0" + op_type: "Constant" attribute { - name: "axes" - ints: 2 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\002\000\000\000\000\000\000\000" + } + type: TENSOR } } - name: "torch-jit-export" + node { + input: "onnx::Unsqueeze_0" + input: "onnx::Unsqueeze_1" + output: "2" + name: "Unsqueeze_1" + op_type: "Unsqueeze" + } + name: "torch_jit" input { name: "onnx::Unsqueeze_0" type { @@ -31,7 +41,7 @@ graph { } } output { - name: "1" + name: "2" type { tensor_type { elem_type: 1 @@ -51,5 +61,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_upsample_nearest_scale.expect b/test/onnx/expect/TestOperators.test_upsample_nearest_scale.expect index 198d2367ad08..e1f31dc406a0 100644 --- a/test/onnx/expect/TestOperators.test_upsample_nearest_scale.expect +++ 
b/test/onnx/expect/TestOperators.test_upsample_nearest_scale.expect @@ -1,24 +1,40 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { input: "x" - input: "onnx::Upsample_5" - output: "4" - name: "Upsample_0" - op_type: "Upsample" + input: "" + input: "onnx::Resize_6" + output: "5" + name: "Resize_0" + op_type: "Resize" + attribute { + name: "coordinate_transformation_mode" + s: "asymmetric" + type: STRING + } + attribute { + name: "cubic_coeff_a" + f: -0.75 + type: FLOAT + } attribute { name: "mode" s: "nearest" type: STRING } + attribute { + name: "nearest_mode" + s: "floor" + type: STRING + } } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 4 data_type: 1 - name: "onnx::Upsample_5" + name: "onnx::Resize_6" raw_data: "\000\000\200?\000\000\200?\000\000\000@\000\000\000@" } input { @@ -44,22 +60,22 @@ graph { } } output { - name: "4" + name: "5" type { tensor_type { elem_type: 1 shape { dim { - dim_value: 1 + dim_param: "Resize5_dim_0" } dim { - dim_value: 2 + dim_param: "Resize5_dim_1" } dim { - dim_value: 6 + dim_param: "Resize5_dim_2" } dim { - dim_value: 8 + dim_param: "Resize5_dim_3" } } } @@ -67,5 +83,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_upsample_nearest_scale_default_scale_factor.expect b/test/onnx/expect/TestOperators.test_upsample_nearest_scale_default_scale_factor.expect index 198d2367ad08..e1f31dc406a0 100644 --- a/test/onnx/expect/TestOperators.test_upsample_nearest_scale_default_scale_factor.expect +++ b/test/onnx/expect/TestOperators.test_upsample_nearest_scale_default_scale_factor.expect @@ -1,24 +1,40 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { input: "x" - input: "onnx::Upsample_5" - output: "4" - name: "Upsample_0" - op_type: "Upsample" + input: "" + input: "onnx::Resize_6" + output: "5" + name: "Resize_0" + op_type: "Resize" + attribute { + name: "coordinate_transformation_mode" + s: "asymmetric" + type: STRING + } + attribute { + name: "cubic_coeff_a" + f: -0.75 + type: FLOAT + } attribute { name: "mode" s: "nearest" type: STRING } + attribute { + name: "nearest_mode" + s: "floor" + type: STRING + } } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 4 data_type: 1 - name: "onnx::Upsample_5" + name: "onnx::Resize_6" raw_data: "\000\000\200?\000\000\200?\000\000\000@\000\000\000@" } input { @@ -44,22 +60,22 @@ graph { } } output { - name: "4" + name: "5" type { tensor_type { elem_type: 1 shape { dim { - dim_value: 1 + dim_param: "Resize5_dim_0" } dim { - dim_value: 2 + dim_param: "Resize5_dim_1" } dim { - dim_value: 6 + dim_param: "Resize5_dim_2" } dim { - dim_value: 8 + dim_param: "Resize5_dim_3" } } } @@ -67,5 +83,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_upsample_nearest_size.expect b/test/onnx/expect/TestOperators.test_upsample_nearest_size.expect index dc30ada92252..cbd32608d2ae 100644 --- a/test/onnx/expect/TestOperators.test_upsample_nearest_size.expect +++ b/test/onnx/expect/TestOperators.test_upsample_nearest_size.expect @@ -1,34 +1,112 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - output: "onnx::Upsample_1" - name: "Constant_0" + input: "x" + output: "onnx::Slice_2" + name: "Shape_0" + op_type: "Shape" + } + node { + output: "onnx::Slice_3" + name: "Constant_1" + op_type: "Constant" + attribute { + 
name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + output: "onnx::Slice_4" + name: "Constant_2" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + output: "onnx::Slice_5" + name: "Constant_3" op_type: "Constant" attribute { name: "value" t { - dims: 4 - data_type: 1 - raw_data: "\000\000\200?\000\000\200?\253\252\252@\000\000\200@" + dims: 1 + data_type: 7 + raw_data: "\002\000\000\000\000\000\000\000" } type: TENSOR } } + node { + input: "onnx::Slice_2" + input: "onnx::Slice_4" + input: "onnx::Slice_5" + input: "onnx::Slice_3" + output: "onnx::Concat_6" + name: "Slice_4" + op_type: "Slice" + } + node { + input: "onnx::Concat_6" + input: "onnx::Concat_12" + output: "onnx::Resize_8" + name: "Concat_5" + op_type: "Concat" + attribute { + name: "axis" + i: 0 + type: INT + } + } node { input: "x" - input: "onnx::Upsample_1" - output: "2" - name: "Upsample_1" - op_type: "Upsample" + input: "" + input: "" + input: "onnx::Resize_8" + output: "11" + name: "Resize_6" + op_type: "Resize" + attribute { + name: "coordinate_transformation_mode" + s: "asymmetric" + type: STRING + } + attribute { + name: "cubic_coeff_a" + f: -0.75 + type: FLOAT + } attribute { name: "mode" s: "nearest" type: STRING } + attribute { + name: "nearest_mode" + s: "floor" + type: STRING + } + } + name: "torch_jit" + initializer { + dims: 2 + data_type: 7 + name: "onnx::Concat_12" + raw_data: "\020\000\000\000\000\000\000\000\020\000\000\000\000\000\000\000" } - name: "torch-jit-export" input { name: "x" type { @@ -52,22 +130,22 @@ graph { } } output { - name: "2" + name: "11" type { tensor_type { elem_type: 1 shape { dim { - dim_value: 1 + dim_param: "Resize11_dim_0" } dim { - dim_value: 2 + dim_param: "Resize11_dim_1" } dim { - dim_value: 16 + dim_param: "Resize11_dim_2" } dim { - dim_value: 16 + dim_param: "Resize11_dim_3" } } } @@ -75,5 +153,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_view.expect b/test/onnx/expect/TestOperators.test_view.expect index c7a1eb6adbd6..097625822969 100644 --- a/test/onnx/expect/TestOperators.test_view.expect +++ b/test/onnx/expect/TestOperators.test_view.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -23,7 +23,7 @@ graph { name: "Reshape_1" op_type: "Reshape" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Reshape_0" type { @@ -55,5 +55,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_view_flatten.expect b/test/onnx/expect/TestOperators.test_view_flatten.expect index 2465f1d0032d..ac814160d5bd 100644 --- a/test/onnx/expect/TestOperators.test_view_flatten.expect +++ b/test/onnx/expect/TestOperators.test_view_flatten.expect @@ -1,19 +1,19 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { input: "onnx::Reshape_0" - input: "onnx::Reshape_9" - output: "6" + input: "onnx::Reshape_11" + output: "8" name: "Reshape_0" op_type: "Reshape" } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 2 data_type: 7 - name: "onnx::Reshape_9" + name: "onnx::Reshape_11" raw_data: "\001\000\000\000\000\000\000\000\030\000\000\000\000\000\000\000" } input { @@ -39,7 +39,7 @@ graph { } } output { - name: "6" + name: "8" type { 
tensor_type { elem_type: 1 @@ -56,5 +56,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_zeros_like.expect b/test/onnx/expect/TestOperators.test_zeros_like.expect index 27ac22983187..e4f6c6ede2ca 100644 --- a/test/onnx/expect/TestOperators.test_zeros_like.expect +++ b/test/onnx/expect/TestOperators.test_zeros_like.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -17,7 +17,7 @@ graph { type: TENSOR } } - name: "torch-jit-export" + name: "torch_jit" output { name: "1" type { @@ -36,5 +36,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/export_onnx_tests_filter.py b/test/onnx/export_onnx_tests_filter.py index 0cb42cd439d6..9b781fa53c8f 100644 --- a/test/onnx/export_onnx_tests_filter.py +++ b/test/onnx/export_onnx_tests_filter.py @@ -1,25 +1,30 @@ import argparse import glob -import onnx.backend.test import os import shutil -from test_caffe2_common import run_generated_test +import traceback + import google.protobuf.text_format +import onnx.backend.test import test_onnx_common -import traceback +from test_caffe2_common import run_generated_test from torch.testing._internal.common_device_type import get_all_device_types -_fail_test_dir = os.path.join(os.path.dirname( - os.path.realpath(__file__)), "fail", "generated") +_fail_test_dir = os.path.join( + os.path.dirname(os.path.realpath(__file__)), "fail", "generated" +) -_expect_dir = os.path.join(os.path.dirname( - os.path.realpath(__file__)), "expect") +_expect_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "expect") -def collect_generated_testcases(root_dir=test_onnx_common.pytorch_converted_dir, - verbose=False, fail_dir=None, expect=True): +def collect_generated_testcases( + root_dir=test_onnx_common.pytorch_converted_dir, + verbose=False, + fail_dir=None, + expect=True, +): total_pass = 0 total_fail = 0 for d in os.listdir(root_dir): @@ -33,13 +38,16 @@ def collect_generated_testcases(root_dir=test_onnx_common.pytorch_converted_dir, for device in get_all_device_types(): run_generated_test(model_file, data_dir, device) if expect: - expect_file = os.path.join(_expect_dir, - "PyTorch-generated-{}.expect".format(d)) + expect_file = os.path.join( + _expect_dir, "PyTorch-generated-{}.expect".format(d) + ) with open(expect_file, "w") as text_file: model = onnx.load(model_file) onnx.checker.check_model(model) onnx.helper.strip_doc_string(model) - text_file.write(google.protobuf.text_format.MessageToString(model)) + text_file.write( + google.protobuf.text_format.MessageToString(model) + ) total_pass += 1 except Exception as e: if verbose: @@ -53,17 +61,28 @@ def collect_generated_testcases(root_dir=test_onnx_common.pytorch_converted_dir, shutil.rmtree(target_dir) shutil.move(dir_name, target_dir) total_fail += 1 - print("Successfully generated/updated {} test cases from PyTorch.".format(total_pass)) + print( + "Successfully generated/updated {} test cases from PyTorch.".format(total_pass) + ) if expect: print("Expected pbtxt files are generated in {}.".format(_expect_dir)) print("Failed {} testcases are moved to {}.".format(total_fail, _fail_test_dir)) if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Check and filter the failed test cases.") + parser = argparse.ArgumentParser( + description="Check and filter the failed test cases." 
+ ) parser.add_argument("-v", action="store_true", default=False, help="verbose") - parser.add_argument("--delete", action="store_true", default=False, help="delete failed test cases") - parser.add_argument("--no-expect", action="store_true", default=False, help="generate expect txt files") + parser.add_argument( + "--delete", action="store_true", default=False, help="delete failed test cases" + ) + parser.add_argument( + "--no-expect", + action="store_true", + default=False, + help="generate expect txt files", + ) args = parser.parse_args() verbose = args.v delete = args.delete @@ -77,5 +96,9 @@ def collect_generated_testcases(root_dir=test_onnx_common.pytorch_converted_dir, collect_generated_testcases(verbose=verbose, fail_dir=fail_dir, expect=expect) # We already generate the expect files for test_operators.py. - collect_generated_testcases(root_dir=test_onnx_common.pytorch_operator_dir, - verbose=verbose, fail_dir=fail_dir, expect=False) + collect_generated_testcases( + root_dir=test_onnx_common.pytorch_operator_dir, + verbose=verbose, + fail_dir=fail_dir, + expect=False, + ) diff --git a/test/onnx/export_onnx_tests_generator.py b/test/onnx/export_onnx_tests_generator.py index b658a19f6530..2a949af80000 100644 --- a/test/onnx/export_onnx_tests_generator.py +++ b/test/onnx/export_onnx_tests_generator.py @@ -1,17 +1,17 @@ -from torch.autograd import Variable -from onnx import numpy_helper - import io -import onnx import os import shutil -import torch import traceback +import onnx import test_onnx_common -from torch.testing._internal.common_nn import module_tests +from onnx import numpy_helper from test_nn import new_module_tests +import torch +from torch.autograd import Variable +from torch.testing._internal.common_nn import module_tests + # Take a test case (a dict) as input, return the test name. def get_test_name(testcase): @@ -27,7 +27,11 @@ def get_test_name(testcase): # Take a test case (a dict) as input, return the input for the module. 
def gen_input(testcase): if "input_size" in testcase: - if testcase["input_size"] == () and "desc" in testcase and testcase["desc"][-6:] == "scalar": + if ( + testcase["input_size"] == () + and "desc" in testcase + and testcase["desc"][-6:] == "scalar" + ): testcase["input_size"] = (1,) return Variable(torch.randn(*testcase["input_size"])) elif "input_fn" in testcase: @@ -54,11 +58,11 @@ def print_stats(FunctionalModule_nums, nn_module): unsupported = [] not_fully_supported = [] for key, value in nn_module.items(): - if (value == 1): + if value == 1: supported.append(key) - elif (value == 2): + elif value == 2: unsupported.append(key) - elif (value == 3): + elif value == 3: not_fully_supported.append(key) def fun(info, l): @@ -69,12 +73,14 @@ def fun(info, l): # Fully Supported Ops: All related test cases of these ops have been exported # Semi-Supported Ops: Part of related test cases of these ops have been exported # Unsupported Ops: None of related test cases of these ops have been exported - for info, l in [["{} Fully Supported Operators:".format(len(supported)), - supported], - ["{} Semi-Supported Operators:".format(len(not_fully_supported)), - not_fully_supported], - ["{} Unsupported Operators:".format(len(unsupported)), - unsupported]]: + for info, l in [ + ["{} Fully Supported Operators:".format(len(supported)), supported], + [ + "{} Semi-Supported Operators:".format(len(not_fully_supported)), + not_fully_supported, + ], + ["{} Unsupported Operators:".format(len(unsupported)), unsupported], + ]: fun(info, l) @@ -87,16 +93,20 @@ def convert_tests(testcases, sets=1): test_name = get_test_name(t) module = gen_module(t) module_name = str(module).split("(")[0] - if (module_name == "FunctionalModule"): + if module_name == "FunctionalModule": FunctionalModule_nums += 1 else: - if (module_name not in nn_module): + if module_name not in nn_module: nn_module[module_name] = 0 try: input = gen_input(t) f = io.BytesIO() - torch.onnx._export(module, input, f, - operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK) + torch.onnx._export( + module, + input, + f, + operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, + ) onnx_model = onnx.load_from_string(f.getvalue()) onnx.checker.check_model(onnx_model) onnx.helper.strip_doc_string(onnx_model) @@ -115,26 +125,38 @@ def convert_tests(testcases, sets=1): for index, var in enumerate([input]): tensor = numpy_helper.from_array(var.data.numpy()) - with open(os.path.join(data_dir, "input_{}.pb".format(index)), "wb") as file: + with open( + os.path.join(data_dir, "input_{}.pb".format(index)), "wb" + ) as file: file.write(tensor.SerializeToString()) for index, var in enumerate([output]): tensor = numpy_helper.from_array(var.data.numpy()) - with open(os.path.join(data_dir, "output_{}.pb".format(index)), "wb") as file: + with open( + os.path.join(data_dir, "output_{}.pb".format(index)), "wb" + ) as file: file.write(tensor.SerializeToString()) input = gen_input(t) - if (module_name != "FunctionalModule"): + if module_name != "FunctionalModule": nn_module[module_name] |= 1 except: # noqa: E722,B001 traceback.print_exc() - if (module_name != "FunctionalModule"): + if module_name != "FunctionalModule": nn_module[module_name] |= 2 failed += 1 - print("Collect {} test cases from PyTorch repo, failed to export {} cases.".format( - len(testcases), failed)) - print("PyTorch converted cases are stored in {}.".format(test_onnx_common.pytorch_converted_dir)) + print( + "Collect {} test cases from PyTorch repo, failed to export {} 
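For context, the `input_*.pb` / `output_*.pb` files written above are plain serialized `TensorProto`s. A hedged round-trip sketch (the file name mirrors the naming used above; the working directory is illustrative) showing how the generator's output is later read back on the verification side:

```python
import numpy as np
import onnx
import torch
from onnx import numpy_helper

var = torch.randn(2, 3)
tensor = numpy_helper.from_array(var.data.numpy())

# Write the protobuf exactly as the generator does for input_0.pb.
with open("input_0.pb", "wb") as f:
    f.write(tensor.SerializeToString())

# Read it back the way the Caffe2-side helpers do.
loaded = onnx.TensorProto()
with open("input_0.pb", "rb") as f:
    loaded.ParseFromString(f.read())

np.testing.assert_allclose(var.numpy(), numpy_helper.to_array(loaded))
```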
cases.".format( + len(testcases), failed + ) + ) + print( + "PyTorch converted cases are stored in {}.".format( + test_onnx_common.pytorch_converted_dir + ) + ) print_stats(FunctionalModule_nums, nn_module) + if __name__ == "__main__": testcases = module_tests + new_module_tests convert_tests(testcases) diff --git a/test/onnx/model_defs/__init__.py b/test/onnx/model_defs/__init__.py index 07967b0df70d..7bfa2c833cf3 100644 --- a/test/onnx/model_defs/__init__.py +++ b/test/onnx/model_defs/__init__.py @@ -1,4 +1,4 @@ -from .squeezenet import * # noqa: F403 -from .super_resolution import * # noqa: F403 from .op_test import * # noqa: F403 +from .squeezenet import * # noqa: F403 from .srresnet import * # noqa: F403 +from .super_resolution import * # noqa: F403 diff --git a/test/onnx/model_defs/dcgan.py b/test/onnx/model_defs/dcgan.py index b65cd10106cc..5054835ca13f 100644 --- a/test/onnx/model_defs/dcgan.py +++ b/test/onnx/model_defs/dcgan.py @@ -1,7 +1,6 @@ import torch import torch.nn as nn - # configurable bsz = 64 imgsz = 64 @@ -14,9 +13,9 @@ # custom weights initialization called on netG and netD def weights_init(m): classname = m.__class__.__name__ - if classname.find('Conv') != -1: + if classname.find("Conv") != -1: m.weight.data.normal_(0.0, 0.02) - elif classname.find('BatchNorm') != -1: + elif classname.find("BatchNorm") != -1: m.weight.data.normal_(1.0, 0.02) m.bias.data.fill_(0) @@ -78,7 +77,7 @@ def __init__(self, ngpu): nn.LeakyReLU(0.2, inplace=True), # state size. (ndf*8) x 4 x 4 nn.Conv2d(ndf * 8, 1, 4, 1, 0, bias=False), - nn.Sigmoid() + nn.Sigmoid(), ) def forward(self, input): diff --git a/test/onnx/model_defs/emb_seq.py b/test/onnx/model_defs/emb_seq.py index 09a289aaf821..5200aa4f5888 100644 --- a/test/onnx/model_defs/emb_seq.py +++ b/test/onnx/model_defs/emb_seq.py @@ -1,4 +1,3 @@ - import torch.nn as nn @@ -17,15 +16,10 @@ def forward(self, input): class EmbeddingNetwork2(nn.Module): - def __init__(self, in_space=10, dim=3): super(EmbeddingNetwork2, self).__init__() self.embedding = nn.Embedding(in_space, dim) - self.seq = nn.Sequential( - self.embedding, - nn.Linear(dim, 1), - nn.Sigmoid() - ) + self.seq = nn.Sequential(self.embedding, nn.Linear(dim, 1), nn.Sigmoid()) def forward(self, indices): return self.seq(indices) diff --git a/test/onnx/model_defs/lstm_flattening_result.py b/test/onnx/model_defs/lstm_flattening_result.py index dbbc07ed21f5..62e8450eff92 100644 --- a/test/onnx/model_defs/lstm_flattening_result.py +++ b/test/onnx/model_defs/lstm_flattening_result.py @@ -7,27 +7,39 @@ def forward(self, input, *fargs, **fkwargs): output, (hidden, cell) = nn.LSTM.forward(self, input, *fargs, **fkwargs) return output, hidden, cell + class LstmFlatteningResultWithSeqLength(nn.Module): def __init__(self, input_size, hidden_size, layers, bidirect, dropout, batch_first): super(LstmFlatteningResultWithSeqLength, self).__init__() self.batch_first = batch_first - self.inner_model = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=layers, - bidirectional=bidirect, dropout=dropout, - batch_first=batch_first) + self.inner_model = nn.LSTM( + input_size=input_size, + hidden_size=hidden_size, + num_layers=layers, + bidirectional=bidirect, + dropout=dropout, + batch_first=batch_first, + ) def forward(self, input: PackedSequence, hx=None): output, (hidden, cell) = self.inner_model.forward(input, hx) return output, hidden, cell + class LstmFlatteningResultWithoutSeqLength(nn.Module): def __init__(self, input_size, hidden_size, layers, bidirect, dropout, batch_first): 
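The `weights_init` hunk above is the classic DCGAN initializer and is intended to be applied recursively via `Module.apply`. A self-contained sketch with a toy network (the network itself is invented for the example):

```python
import torch.nn as nn


def weights_init(m):
    # Classic DCGAN init: normal(0, 0.02) for conv weights,
    # normal(1, 0.02) plus zero bias for batch-norm layers.
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        m.weight.data.normal_(0.0, 0.02)
    elif classname.find("BatchNorm") != -1:
        m.weight.data.normal_(1.0, 0.02)
        m.bias.data.fill_(0)


net = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8), nn.ReLU())
net.apply(weights_init)  # visits every submodule, then the Sequential itself
```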
super(LstmFlatteningResultWithoutSeqLength, self).__init__() self.batch_first = batch_first - self.inner_model = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=layers, - bidirectional=bidirect, dropout=dropout, - batch_first=batch_first) + self.inner_model = nn.LSTM( + input_size=input_size, + hidden_size=hidden_size, + num_layers=layers, + bidirectional=bidirect, + dropout=dropout, + batch_first=batch_first, + ) def forward(self, input, hx=None): output, (hidden, cell) = self.inner_model.forward(input, hx) diff --git a/test/onnx/model_defs/mnist.py b/test/onnx/model_defs/mnist.py index a8a0b3fe4231..176822852c94 100644 --- a/test/onnx/model_defs/mnist.py +++ b/test/onnx/model_defs/mnist.py @@ -3,7 +3,6 @@ class MNIST(nn.Module): - def __init__(self): super(MNIST, self).__init__() self.conv1 = nn.Conv2d(1, 10, kernel_size=5) diff --git a/test/onnx/model_defs/op_test.py b/test/onnx/model_defs/op_test.py index d223c071bec7..774f3070824c 100644 --- a/test/onnx/model_defs/op_test.py +++ b/test/onnx/model_defs/op_test.py @@ -5,13 +5,12 @@ class DummyNet(nn.Module): - def __init__(self, num_classes=1000): super(DummyNet, self).__init__() self.features = nn.Sequential( nn.LeakyReLU(0.02), nn.BatchNorm2d(3), - nn.AvgPool2d(kernel_size=3, stride=2, padding=1, ceil_mode=False) + nn.AvgPool2d(kernel_size=3, stride=2, padding=1, ceil_mode=False), ) def forward(self, x): @@ -20,7 +19,6 @@ def forward(self, x): class ConcatNet(nn.Module): - def __init__(self): super(ConcatNet, self).__init__() @@ -29,7 +27,6 @@ def forward(self, inputs): class PermuteNet(nn.Module): - def __init__(self): super(PermuteNet, self).__init__() @@ -38,7 +35,6 @@ def forward(self, input): class PReluNet(nn.Module): - def __init__(self): super(PReluNet, self).__init__() self.features = nn.Sequential( @@ -49,6 +45,7 @@ def forward(self, x): output = self.features(x) return output + class FakeQuantNet(nn.Module): def __init__(self): super(FakeQuantNet, self).__init__() diff --git a/test/onnx/model_defs/rnn_model_with_packed_sequence.py b/test/onnx/model_defs/rnn_model_with_packed_sequence.py index b0288baeb33e..153d9b7da5e7 100644 --- a/test/onnx/model_defs/rnn_model_with_packed_sequence.py +++ b/test/onnx/model_defs/rnn_model_with_packed_sequence.py @@ -16,6 +16,7 @@ def forward(self, input, *args): ret, _ = rnn_utils.pad_packed_sequence(ret, self.batch_first) return tuple([ret] + list(rets)) + class RnnModelWithPackedSequenceWithoutState(nn.Module): def __init__(self, model, batch_first): super(RnnModelWithPackedSequenceWithoutState, self).__init__() @@ -29,6 +30,7 @@ def forward(self, input, seq_lengths): ret, _ = rnn_utils.pad_packed_sequence(ret, self.batch_first) return list([ret] + list(rets)) + class RnnModelWithPackedSequenceWithState(nn.Module): def __init__(self, model, batch_first): super(RnnModelWithPackedSequenceWithState, self).__init__() diff --git a/test/onnx/model_defs/squeezenet.py b/test/onnx/model_defs/squeezenet.py index 984f724c1562..acf4dc5e2375 100644 --- a/test/onnx/model_defs/squeezenet.py +++ b/test/onnx/model_defs/squeezenet.py @@ -4,35 +4,37 @@ class Fire(nn.Module): - - def __init__(self, inplanes, squeeze_planes, - expand1x1_planes, expand3x3_planes): + def __init__(self, inplanes, squeeze_planes, expand1x1_planes, expand3x3_planes): super(Fire, self).__init__() self.inplanes = inplanes self.squeeze = nn.Conv2d(inplanes, squeeze_planes, kernel_size=1) self.squeeze_activation = nn.ReLU(inplace=True) - self.expand1x1 = nn.Conv2d(squeeze_planes, expand1x1_planes, - kernel_size=1) + 
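The packed-sequence wrappers above all follow the same flow; a standalone sketch with illustrative sizes showing how a padded batch is packed, run through an `nn.LSTM`, and padded back:

```python
import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils

lstm = nn.LSTM(input_size=4, hidden_size=8, num_layers=1, batch_first=True)

batch = torch.randn(3, 5, 4)          # (batch, seq, feature), already padded
lengths = torch.tensor([5, 3, 2])     # valid lengths, sorted descending
packed = rnn_utils.pack_padded_sequence(batch, lengths, batch_first=True)

output, (hidden, cell) = lstm(packed)  # output is itself a PackedSequence
padded, _ = rnn_utils.pad_packed_sequence(output, batch_first=True)
print(padded.shape)                    # torch.Size([3, 5, 8])
```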
self.expand1x1 = nn.Conv2d(squeeze_planes, expand1x1_planes, kernel_size=1) self.expand1x1_activation = nn.ReLU(inplace=True) - self.expand3x3 = nn.Conv2d(squeeze_planes, expand3x3_planes, - kernel_size=3, padding=1) + self.expand3x3 = nn.Conv2d( + squeeze_planes, expand3x3_planes, kernel_size=3, padding=1 + ) self.expand3x3_activation = nn.ReLU(inplace=True) def forward(self, x): x = self.squeeze_activation(self.squeeze(x)) - return torch.cat([ - self.expand1x1_activation(self.expand1x1(x)), - self.expand3x3_activation(self.expand3x3(x)) - ], 1) + return torch.cat( + [ + self.expand1x1_activation(self.expand1x1(x)), + self.expand3x3_activation(self.expand3x3(x)), + ], + 1, + ) class SqueezeNet(nn.Module): - def __init__(self, version=1.0, num_classes=1000, ceil_mode=False): super(SqueezeNet, self).__init__() if version not in [1.0, 1.1]: - raise ValueError("Unsupported SqueezeNet version {version}:" - "1.0 or 1.1 expected".format(version=version)) + raise ValueError( + "Unsupported SqueezeNet version {version}:" + "1.0 or 1.1 expected".format(version=version) + ) self.num_classes = num_classes if version == 1.0: self.features = nn.Sequential( @@ -69,10 +71,7 @@ def __init__(self, version=1.0, num_classes=1000, ceil_mode=False): # Final convolution is initialized differently from the rest final_conv = nn.Conv2d(512, self.num_classes, kernel_size=1) self.classifier = nn.Sequential( - nn.Dropout(p=0.5), - final_conv, - nn.ReLU(inplace=True), - nn.AvgPool2d(13) + nn.Dropout(p=0.5), final_conv, nn.ReLU(inplace=True), nn.AvgPool2d(13) ) for m in self.modules(): diff --git a/test/onnx/model_defs/srresnet.py b/test/onnx/model_defs/srresnet.py index 0328d39f7a34..65795471293f 100644 --- a/test/onnx/model_defs/srresnet.py +++ b/test/onnx/model_defs/srresnet.py @@ -14,10 +14,14 @@ def _initialize_orthogonal(conv): class ResidualBlock(nn.Module): def __init__(self, n_filters): super(ResidualBlock, self).__init__() - self.conv1 = nn.Conv2d(n_filters, n_filters, kernel_size=3, padding=1, bias=False) + self.conv1 = nn.Conv2d( + n_filters, n_filters, kernel_size=3, padding=1, bias=False + ) self.bn1 = nn.BatchNorm2d(n_filters) self.prelu = nn.PReLU(n_filters) - self.conv2 = nn.Conv2d(n_filters, n_filters, kernel_size=3, padding=1, bias=False) + self.conv2 = nn.Conv2d( + n_filters, n_filters, kernel_size=3, padding=1, bias=False + ) self.bn2 = nn.BatchNorm2d(n_filters) # Orthogonal initialisation @@ -33,7 +37,9 @@ def forward(self, x): class UpscaleBlock(nn.Module): def __init__(self, n_filters): super(UpscaleBlock, self).__init__() - self.upscaling_conv = nn.Conv2d(n_filters, 4 * n_filters, kernel_size=3, padding=1) + self.upscaling_conv = nn.Conv2d( + n_filters, 4 * n_filters, kernel_size=3, padding=1 + ) self.upscaling_shuffler = nn.PixelShuffle(2) self.upscaling = nn.PReLU(n_filters) _initialize_orthogonal(self.upscaling_conv) @@ -54,14 +60,21 @@ def __init__(self, rescale_factor, n_filters, n_blocks): for residual_block_num in range(1, n_blocks + 1): residual_block = ResidualBlock(self.n_filters) - self.add_module('residual_block' + str(residual_block_num), nn.Sequential(residual_block)) - - self.skip_conv = nn.Conv2d(n_filters, n_filters, kernel_size=3, padding=1, bias=False) + self.add_module( + "residual_block" + str(residual_block_num), + nn.Sequential(residual_block), + ) + + self.skip_conv = nn.Conv2d( + n_filters, n_filters, kernel_size=3, padding=1, bias=False + ) self.skip_bn = nn.BatchNorm2d(n_filters) for upscale_block_num in range(1, self.rescale_levels + 1): upscale_block = 
UpscaleBlock(self.n_filters) - self.add_module('upscale_block' + str(upscale_block_num), nn.Sequential(upscale_block)) + self.add_module( + "upscale_block" + str(upscale_block_num), nn.Sequential(upscale_block) + ) self.output_conv = nn.Conv2d(n_filters, 3, kernel_size=9, padding=4) @@ -74,8 +87,8 @@ def forward(self, x): x_init = self.prelu1(self.conv1(x)) x = self.residual_block1(x_init) for residual_block_num in range(2, self.n_blocks + 1): - x = getattr(self, 'residual_block' + str(residual_block_num))(x) + x = getattr(self, "residual_block" + str(residual_block_num))(x) x = self.skip_bn(self.skip_conv(x)) + x_init for upscale_block_num in range(1, self.rescale_levels + 1): - x = getattr(self, 'upscale_block' + str(upscale_block_num))(x) + x = getattr(self, "upscale_block" + str(upscale_block_num))(x) return self.output_conv(x) diff --git a/test/onnx/model_defs/super_resolution.py b/test/onnx/model_defs/super_resolution.py index 958d2f95b62e..dc84ec4192ee 100644 --- a/test/onnx/model_defs/super_resolution.py +++ b/test/onnx/model_defs/super_resolution.py @@ -10,7 +10,7 @@ def __init__(self, upscale_factor): self.conv1 = nn.Conv2d(1, 64, (5, 5), (1, 1), (2, 2)) self.conv2 = nn.Conv2d(64, 64, (3, 3), (1, 1), (1, 1)) self.conv3 = nn.Conv2d(64, 32, (3, 3), (1, 1), (1, 1)) - self.conv4 = nn.Conv2d(32, upscale_factor ** 2, (3, 3), (1, 1), (1, 1)) + self.conv4 = nn.Conv2d(32, upscale_factor**2, (3, 3), (1, 1), (1, 1)) self.pixel_shuffle = nn.PixelShuffle(upscale_factor) self._initialize_weights() @@ -23,7 +23,7 @@ def forward(self, x): return x def _initialize_weights(self): - init.orthogonal_(self.conv1.weight, init.calculate_gain('relu')) - init.orthogonal_(self.conv2.weight, init.calculate_gain('relu')) - init.orthogonal_(self.conv3.weight, init.calculate_gain('relu')) + init.orthogonal_(self.conv1.weight, init.calculate_gain("relu")) + init.orthogonal_(self.conv2.weight, init.calculate_gain("relu")) + init.orthogonal_(self.conv3.weight, init.calculate_gain("relu")) init.orthogonal_(self.conv4.weight) diff --git a/test/onnx/model_defs/word_language_model.py b/test/onnx/model_defs/word_language_model.py index 2b500d93eff4..e4ad3bf51976 100644 --- a/test/onnx/model_defs/word_language_model.py +++ b/test/onnx/model_defs/word_language_model.py @@ -1,28 +1,43 @@ # The model is from here: # https://github.com/pytorch/examples/blob/master/word_language_model/model.py +from typing import Optional, Tuple + import torch import torch.nn as nn from torch import Tensor -from typing import Tuple, Optional + class RNNModel(nn.Module): """Container module with an encoder, a recurrent module, and a decoder.""" - def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, - dropout=0.5, tie_weights=False, batchsize=2): + def __init__( + self, + rnn_type, + ntoken, + ninp, + nhid, + nlayers, + dropout=0.5, + tie_weights=False, + batchsize=2, + ): super(RNNModel, self).__init__() self.drop = nn.Dropout(dropout) self.encoder = nn.Embedding(ntoken, ninp) - if rnn_type in ['LSTM', 'GRU']: + if rnn_type in ["LSTM", "GRU"]: self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout) else: try: - nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type] + nonlinearity = {"RNN_TANH": "tanh", "RNN_RELU": "relu"}[rnn_type] except KeyError: - raise ValueError("""An invalid option for `--model` was supplied, - options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""") from None - self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout) + raise ValueError( + """An invalid option 
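Both `UpscaleBlock` and `SuperResolutionNet` above rely on sub-pixel convolution: a convolution expands the channel count by `r**2`, and `nn.PixelShuffle(r)` then trades those channels for spatial resolution. A shape-level sketch with arbitrary sizes:

```python
import torch
import torch.nn as nn

r = 2  # illustrative upscale factor
conv = nn.Conv2d(32, 32 * r**2, kernel_size=3, padding=1)
shuffle = nn.PixelShuffle(r)

x = torch.randn(1, 32, 16, 16)
y = shuffle(conv(x))
print(y.shape)  # torch.Size([1, 32, 32, 32]) -- channels traded for resolution
```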
for `--model` was supplied, + options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""" + ) from None + self.rnn = nn.RNN( + ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout + ) self.decoder = nn.Linear(nhid, ntoken) # Optionally tie weights as in: @@ -33,7 +48,9 @@ def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, # https://arxiv.org/abs/1611.01462 if tie_weights: if nhid != ninp: - raise ValueError('When using the tied flag, nhid must be equal to emsize') + raise ValueError( + "When using the tied flag, nhid must be equal to emsize" + ) self.decoder.weight = self.encoder.weight self.init_weights() @@ -61,20 +78,26 @@ def forward(self, input, hidden): emb = self.drop(self.encoder(input)) output, hidden = self.rnn(emb, hidden) output = self.drop(output) - decoded = self.decoder(output.view(output.size(0) * output.size(1), output.size(2))) + decoded = self.decoder( + output.view(output.size(0) * output.size(1), output.size(2)) + ) self.hidden = RNNModel.repackage_hidden(hidden) return decoded.view(output.size(0), output.size(1), decoded.size(1)) def init_hidden(self, bsz): weight = next(self.parameters()).data - if self.rnn_type == 'LSTM': - return (weight.new(self.nlayers, bsz, self.nhid).zero_(), - weight.new(self.nlayers, bsz, self.nhid).zero_()) + if self.rnn_type == "LSTM": + return ( + weight.new(self.nlayers, bsz, self.nhid).zero_(), + weight.new(self.nlayers, bsz, self.nhid).zero_(), + ) else: return weight.new(self.nlayers, bsz, self.nhid).zero_() + class RNNModelWithTensorHidden(RNNModel): """Supports GRU scripting.""" + @staticmethod def repackage_hidden(h): """Detach hidden states from their history.""" @@ -84,12 +107,16 @@ def forward(self, input: Tensor, hidden: Tensor): emb = self.drop(self.encoder(input)) output, hidden = self.rnn(emb, hidden) output = self.drop(output) - decoded = self.decoder(output.view(output.size(0) * output.size(1), output.size(2))) + decoded = self.decoder( + output.view(output.size(0) * output.size(1), output.size(2)) + ) self.hidden = RNNModelWithTensorHidden.repackage_hidden(hidden) return decoded.view(output.size(0), output.size(1), decoded.size(1)) + class RNNModelWithTupleHidden(RNNModel): """Supports LSTM scripting.""" + @staticmethod def repackage_hidden(h: Tuple[Tensor, Tensor]): """Detach hidden states from their history.""" @@ -99,6 +126,8 @@ def forward(self, input: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None) emb = self.drop(self.encoder(input)) output, hidden = self.rnn(emb, hidden) output = self.drop(output) - decoded = self.decoder(output.view(output.size(0) * output.size(1), output.size(2))) + decoded = self.decoder( + output.view(output.size(0) * output.size(1), output.size(2)) + ) self.hidden = self.repackage_hidden(tuple(hidden)) return decoded.view(output.size(0), output.size(1), decoded.size(1)) diff --git a/test/onnx/pytorch_helper.py b/test/onnx/pytorch_helper.py index a07652b4d06f..38e7f7b672a1 100644 --- a/test/onnx/pytorch_helper.py +++ b/test/onnx/pytorch_helper.py @@ -1,9 +1,10 @@ import io -import torch.onnx + import onnx -from caffe2.python.onnx.backend import Caffe2Backend -from caffe2.python.core import BlobReference, Net +import torch.onnx +from caffe2.python.core import BlobReference, Net +from caffe2.python.onnx.backend import Caffe2Backend _next_idx = 0 # Clone net takes a dict instead of a lambda @@ -54,19 +55,23 @@ def PyTorchModule(helper, model, sample_arguments, caffe2_inputs, prefix_name=No # TODO: handle the case where model cannot be exported # and embed as a Python op in 
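The `nhid != ninp` check above exists because weight tying simply aliases the two parameter tensors, which only works when their shapes coincide; a minimal illustration with made-up sizes:

```python
import torch.nn as nn

ntoken, ninp, nhid = 100, 16, 16      # illustrative sizes with ninp == nhid
encoder = nn.Embedding(ntoken, ninp)  # weight shape (100, 16)
decoder = nn.Linear(nhid, ntoken)     # weight shape (100, 16)
decoder.weight = encoder.weight       # tying is only legal because shapes match
```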
Caffe2 f = io.BytesIO() - torch.onnx.export( - model, sample_arguments, f, export_params=True) + torch.onnx.export(model, sample_arguments, f, export_params=True) onnx_model = onnx.load(io.BytesIO(f.getvalue())) - init_net, predict_net = Caffe2Backend.onnx_graph_to_caffe2_net( - onnx_model) + init_net, predict_net = Caffe2Backend.onnx_graph_to_caffe2_net(onnx_model) initialized = set([x.name for x in onnx_model.graph.initializer]) - uninitialized_inputs = {x.name: i for i, x in enumerate( - onnx_model.graph.input) if x.name not in initialized} + uninitialized_inputs = { + x.name: i + for i, x in enumerate(onnx_model.graph.input) + if x.name not in initialized + } - if(len(uninitialized_inputs) != len(caffe2_inputs)): - raise ValueError("Expected {} inputs but found {}".format( - len(uninitialized_inputs), len(caffe2_inputs))) + if len(uninitialized_inputs) != len(caffe2_inputs): + raise ValueError( + "Expected {} inputs but found {}".format( + len(uninitialized_inputs), len(caffe2_inputs) + ) + ) def remap_blob_name(name): if name in uninitialized_inputs: @@ -80,6 +85,10 @@ def remap_blob_name(name): init_net = Net(init_net).Clone("anon", _FakeDict(remap_blob_name)) helper.param_init_net.AppendNet(init_net) - results = tuple([BlobReference(remap_blob_name(x.name), helper.net) - for x in onnx_model.graph.output]) + results = tuple( + [ + BlobReference(remap_blob_name(x.name), helper.net) + for x in onnx_model.graph.output + ] + ) return results diff --git a/test/onnx/test_caffe2_common.py b/test/onnx/test_caffe2_common.py index 52dc6363760e..9f7f288e0e7c 100644 --- a/test/onnx/test_caffe2_common.py +++ b/test/onnx/test_caffe2_common.py @@ -1,12 +1,14 @@ # Owner(s): ["module: onnx"] import glob +import os + import numpy as np import onnx.backend.test -import caffe2.python.onnx.backend as c2 -import os from onnx import numpy_helper +import caffe2.python.onnx.backend as c2 + def load_tensor_as_numpy_array(f): tensor = onnx.TensorProto() @@ -26,13 +28,23 @@ def run_generated_test(model_file, data_dir, device="CPU"): input_num = len(glob.glob(os.path.join(data_dir, "input_*.pb"))) inputs = [] for i in range(input_num): - inputs.append(numpy_helper.to_array(load_tensor_as_numpy_array( - os.path.join(data_dir, "input_{}.pb".format(i))))) + inputs.append( + numpy_helper.to_array( + load_tensor_as_numpy_array( + os.path.join(data_dir, "input_{}.pb".format(i)) + ) + ) + ) output_num = len(glob.glob(os.path.join(data_dir, "output_*.pb"))) outputs = [] for i in range(output_num): - outputs.append(numpy_helper.to_array(load_tensor_as_numpy_array( - os.path.join(data_dir, "output_{}.pb".format(i))))) + outputs.append( + numpy_helper.to_array( + load_tensor_as_numpy_array( + os.path.join(data_dir, "output_{}.pb".format(i)) + ) + ) + ) prepared = c2.prepare(model, device=device) c2_outputs = prepared.run(inputs) assert_similar(outputs, c2_outputs) diff --git a/test/onnx/test_custom_ops.py b/test/onnx/test_custom_ops.py index f356317a5b81..b04d5df34a17 100644 --- a/test/onnx/test_custom_ops.py +++ b/test/onnx/test_custom_ops.py @@ -1,20 +1,19 @@ # Owner(s): ["module: onnx"] import unittest -import torch -import torch.utils.cpp_extension - -import onnx -import caffe2.python.onnx.backend as c2 import numpy as np - +import onnx from test_pytorch_onnx_caffe2 import do_export from test_pytorch_onnx_onnxruntime import run_model_test + +import caffe2.python.onnx.backend as c2 +import torch +import torch.utils.cpp_extension from torch.onnx.symbolic_helper import _unimplemented -class 
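The initializer bookkeeping in `PyTorchModule` above separates weights baked into the ONNX graph from the inputs a caller must still feed at run time. A hedged, standalone sketch of that computation on a throwaway model (the model and shapes are invented):

```python
import io

import onnx
import torch


class Affine(torch.nn.Module):  # hypothetical example model
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(4, 2)

    def forward(self, x):
        return self.linear(x)


f = io.BytesIO()
torch.onnx.export(Affine(), torch.randn(1, 4), f, export_params=True)
onnx_model = onnx.load(io.BytesIO(f.getvalue()))

initialized = {x.name for x in onnx_model.graph.initializer}
uninitialized_inputs = {
    x.name: i
    for i, x in enumerate(onnx_model.graph.input)
    if x.name not in initialized
}
print(uninitialized_inputs)  # only the real data input should remain
```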
TestCustomOps(unittest.TestCase): +class TestCustomOps(unittest.TestCase): def test_custom_add(self): op_source = """ #include @@ -42,7 +41,10 @@ def symbolic_custom_add(g, self, other): return g.op("Add", self, other) from torch.onnx import register_custom_op_symbolic - register_custom_op_symbolic("custom_namespace::custom_add", symbolic_custom_add, 9) + + register_custom_op_symbolic( + "custom_namespace::custom_add", symbolic_custom_add, 9 + ) x = torch.randn(2, 3, 4, requires_grad=False) y = torch.randn(2, 3, 4, requires_grad=False) @@ -62,7 +64,6 @@ class TestCustomAutogradFunction(unittest.TestCase): def test_symbolic(self): class MyClip(torch.autograd.Function): - @staticmethod def forward(ctx, input, scalar): ctx.save_for_backward(input) @@ -83,18 +84,16 @@ def forward(self, x): x = torch.randn(2, 3, 4, requires_grad=True) model = MyModule() - run_model_test(self, model, input=(x, )) + run_model_test(self, model, input=(x,)) def test_register_custom_op(self): class MyClip(torch.autograd.Function): - @staticmethod def forward(ctx, input, scalar): ctx.save_for_backward(input) return input.clamp(min=scalar) class MyRelu(torch.autograd.Function): - @staticmethod def forward(ctx, input): ctx.save_for_backward(input) @@ -111,21 +110,24 @@ def forward(self, x): h = self.relu(h) return h - def symbolic_pythonop(g, n, *args, **kwargs): + def symbolic_pythonop(ctx: torch.onnx.SymbolicContext, g, *args, **kwargs): + n = ctx.cur_node name = kwargs["name"] if name == "MyClip": - return g.op("Clip", args[0], min_f=args[1]) + return g.op("Clip", args[0], min_f=args[1], outputs=n.outputsSize()) elif name == "MyRelu": - return g.op("Relu", args[0]) + return g.op("Relu", args[0], outputs=n.outputsSize()) else: return _unimplemented("prim::PythonOp", "unknown node kind: " + name) from torch.onnx import register_custom_op_symbolic + register_custom_op_symbolic("prim::PythonOp", symbolic_pythonop, 1) x = torch.randn(2, 3, 4, requires_grad=True) model = MyModule() - run_model_test(self, model, input=(x, )) + run_model_test(self, model, input=(x,)) + class TestExportAsContribOps(unittest.TestCase): opset_version = 14 @@ -136,7 +138,7 @@ def test_contrib_op_with_loop(self): class M(torch.nn.Module): def __init__(self): super().__init__() - self.gelu = torch.nn.GELU() + self.gelu = torch.nn.GELU(approximate="none") def forward(self, x): res = [] @@ -149,15 +151,17 @@ def forward(self, x): res.append(x[0]) return torch.stack(res), torch.stack(res2) - def symbolic_custom_gelu(g, input): + def symbolic_custom_gelu(g, input, approximate): return g.op("com.microsoft::Gelu", input).setType(input.type()) from torch.onnx import register_custom_op_symbolic + register_custom_op_symbolic("::gelu", symbolic_custom_gelu, 1) x = torch.randn(3, 3, 4, requires_grad=True) model = torch.jit.script(M()) - run_model_test(self, model, input=(x, )) + run_model_test(self, model, input=(x,)) + if __name__ == "__main__": unittest.main() diff --git a/test/onnx/test_models.py b/test/onnx/test_models.py index 5d22c255f832..dc849528842a 100644 --- a/test/onnx/test_models.py +++ b/test/onnx/test_models.py @@ -1,90 +1,104 @@ # Owner(s): ["module: onnx"] +import unittest + +from model_defs.dcgan import _netD, _netG, bsz, imgsz, nz, weights_init +from model_defs.emb_seq import EmbeddingNetwork1, EmbeddingNetwork2 +from model_defs.mnist import MNIST +from model_defs.op_test import ( + ConcatNet, + DummyNet, + FakeQuantNet, + PermuteNet, + PReluNet, +) +from model_defs.squeezenet import SqueezeNet +from model_defs.srresnet import SRResNet 
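The custom-symbolic registrations above all follow one pattern: `register_custom_op_symbolic` maps an ATen (or custom-namespace) op to a graph-building function that emits the desired ONNX node. A condensed sketch of the contrib-Gelu case, mirroring the test rather than adding anything new; note that the registration is process-global until it is unregistered:

```python
import io

import torch
from torch.onnx import register_custom_op_symbolic


def symbolic_custom_gelu(g, input, approximate):
    # Emit a com.microsoft contrib Gelu node and preserve the input's type.
    return g.op("com.microsoft::Gelu", input).setType(input.type())


register_custom_op_symbolic("::gelu", symbolic_custom_gelu, 1)

f = io.BytesIO()
torch.onnx.export(
    torch.nn.GELU(approximate="none"), torch.randn(2, 3), f, opset_version=14
)
```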
+from model_defs.super_resolution import SuperResolutionNet +from test_pytorch_common import ( + TestCase, + run_tests, + skipIfNoLapack, + skipIfUnsupportedMinOpsetVersion, + skipScriptTest, +) +from torchvision.models import shufflenet_v2_x1_0 from torchvision.models.alexnet import alexnet -from torchvision.models.inception import inception_v3 from torchvision.models.densenet import densenet121 -from torchvision.models.resnet import resnet50 -from torchvision.models.vgg import vgg16, vgg16_bn, vgg19, vgg19_bn from torchvision.models.googlenet import googlenet +from torchvision.models.inception import inception_v3 from torchvision.models.mnasnet import mnasnet1_0 from torchvision.models.mobilenet import mobilenet_v2 -from torchvision.models import shufflenet_v2_x1_0 -from torchvision.models.segmentation import fcn_resnet101, deeplabv3_resnet101 -from torchvision.models.video import r3d_18, mc3_18, r2plus1d_18 - -from model_defs.mnist import MNIST -from model_defs.squeezenet import SqueezeNet -from model_defs.super_resolution import SuperResolutionNet -from model_defs.srresnet import SRResNet -from model_defs.dcgan import _netD, _netG, weights_init, bsz, imgsz, nz -from model_defs.op_test import DummyNet, ConcatNet, PermuteNet, PReluNet, FakeQuantNet -from model_defs.emb_seq import EmbeddingNetwork1, EmbeddingNetwork2 - -from test_pytorch_common import TestCase, run_tests, skipIfNoLapack, skipIfUnsupportedMinOpsetVersion, disableScriptTest +from torchvision.models.resnet import resnet50 +from torchvision.models.segmentation import deeplabv3_resnet101, fcn_resnet101 +from torchvision.models.vgg import vgg16, vgg16_bn, vgg19, vgg19_bn +from torchvision.models.video import mc3_18, r2plus1d_18, r3d_18 +from verify import verify +import caffe2.python.onnx.backend as backend import torch import torch.onnx import torch.onnx.utils +from torch import quantization from torch.autograd import Variable from torch.onnx import OperatorExportTypes -from torch import quantization - -import unittest - -import caffe2.python.onnx.backend as backend - -from verify import verify if torch.cuda.is_available(): + def toC(x): return x.cuda() + else: + def toC(x): return x + BATCH_SIZE = 2 class TestModels(TestCase): + opset_version = 9 # Caffe2 doesn't support the default. 
keep_initializers_as_inputs = False - from torch.onnx.symbolic_helper import _export_onnx_opset_version - opset_version = _export_onnx_opset_version def exportTest(self, model, inputs, rtol=1e-2, atol=1e-7): with torch.onnx.select_model_mode_for_export(model, None): graph = torch.onnx.utils._trace(model, inputs, OperatorExportTypes.ONNX) torch._C._jit_pass_lint(graph) - verify(model, inputs, backend, rtol=rtol, atol=atol) + verify( + model, + inputs, + backend, + rtol=rtol, + atol=atol, + opset_version=self.opset_version, + ) def test_ops(self): - x = Variable( - torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0) - ) + x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest(toC(DummyNet()), toC(x)) def test_prelu(self): - x = Variable( - torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0) - ) + x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest(PReluNet(), x) - @disableScriptTest() + @skipScriptTest() def test_concat(self): input_a = Variable(torch.randn(BATCH_SIZE, 3)) input_b = Variable(torch.randn(BATCH_SIZE, 3)) - inputs = ((toC(input_a), toC(input_b)), ) + inputs = ((toC(input_a), toC(input_b)),) self.exportTest(toC(ConcatNet()), inputs) def test_permute(self): x = Variable(torch.randn(BATCH_SIZE, 3, 10, 12)) self.exportTest(PermuteNet(), x) - @disableScriptTest() + @skipScriptTest() def test_embedding_sequential_1(self): x = Variable(torch.randint(0, 10, (BATCH_SIZE, 3))) self.exportTest(EmbeddingNetwork1(), x) - @disableScriptTest() + @skipScriptTest() def test_embedding_sequential_2(self): x = Variable(torch.randint(0, 10, (BATCH_SIZE, 3))) self.exportTest(EmbeddingNetwork2(), x) @@ -92,19 +106,17 @@ def test_embedding_sequential_2(self): @unittest.skip("This model takes too much memory") def test_srresnet(self): x = Variable(torch.randn(1, 3, 224, 224).fill_(1.0)) - self.exportTest(toC(SRResNet(rescale_factor=4, n_filters=64, n_blocks=8)), toC(x)) + self.exportTest( + toC(SRResNet(rescale_factor=4, n_filters=64, n_blocks=8)), toC(x) + ) @skipIfNoLapack def test_super_resolution(self): - x = Variable( - torch.randn(BATCH_SIZE, 1, 224, 224).fill_(1.0) - ) + x = Variable(torch.randn(BATCH_SIZE, 1, 224, 224).fill_(1.0)) self.exportTest(toC(SuperResolutionNet(upscale_factor=3)), toC(x), atol=1e-6) def test_alexnet(self): - x = Variable( - torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0) - ) + x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest(toC(alexnet()), toC(x)) def test_mnist(self): @@ -140,7 +152,7 @@ def test_resnet(self): x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest(toC(resnet50()), toC(x), atol=1e-6) - @disableScriptTest() # None type in outputs + @skipScriptTest(min_opset_version=15) # None type in outputs def test_inception(self): x = Variable(torch.randn(BATCH_SIZE, 3, 299, 299)) self.exportTest(toC(inception_v3()), toC(x)) @@ -163,14 +175,14 @@ def test_densenet(self): x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest(toC(densenet121()), toC(x), rtol=1e-2, atol=1e-5) - @disableScriptTest() + @skipScriptTest() def test_dcgan_netD(self): netD = _netD(1) netD.apply(weights_init) input = Variable(torch.empty(bsz, 3, imgsz, imgsz).normal_(0, 1)) self.exportTest(toC(netD), toC(input)) - @disableScriptTest() + @skipScriptTest() def test_dcgan_netG(self): netG = _netG(1) netG.apply(weights_init) @@ -190,7 +202,9 @@ def test_qat_resnet_pertensor(self): # Use per tensor for weight. 
Per channel support will come with opset 13 qat_resnet50.qconfig = quantization.QConfig( - activation=quantization.default_fake_quant, weight=quantization.default_fake_quant) + activation=quantization.default_fake_quant, + weight=quantization.default_fake_quant, + ) quantization.prepare_qat(qat_resnet50, inplace=True) qat_resnet50.apply(torch.ao.quantization.enable_observer) qat_resnet50.apply(torch.ao.quantization.enable_fake_quant) @@ -211,7 +225,8 @@ def test_qat_resnet_per_channel(self): qat_resnet50.qconfig = quantization.QConfig( activation=quantization.default_fake_quant, - weight=quantization.default_per_channel_weight_fake_quant) + weight=quantization.default_per_channel_weight_fake_quant, + ) quantization.prepare_qat(qat_resnet50, inplace=True) qat_resnet50.apply(torch.ao.quantization.enable_observer) qat_resnet50.apply(torch.ao.quantization.enable_fake_quant) @@ -224,7 +239,7 @@ def test_qat_resnet_per_channel(self): self.exportTest(toC(qat_resnet50), toC(x)) - @disableScriptTest() # None type in outputs + @skipScriptTest(min_opset_version=15) # None type in outputs def test_googlenet(self): x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest(toC(googlenet()), toC(x), rtol=1e-3, atol=1e-5) @@ -237,7 +252,7 @@ def test_mobilenet(self): x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest(toC(mobilenet_v2()), toC(x), rtol=1e-3, atol=1e-5) - @disableScriptTest() # prim_data + @skipScriptTest() # prim_data def test_shufflenet(self): x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest(toC(shufflenet_v2_x1_0()), toC(x), rtol=1e-3, atol=1e-5) @@ -245,12 +260,22 @@ def test_shufflenet(self): @skipIfUnsupportedMinOpsetVersion(11) def test_fcn(self): x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) - self.exportTest(toC(fcn_resnet101()), toC(x), rtol=1e-3, atol=1e-5) + self.exportTest( + toC(fcn_resnet101(pretrained=False, pretrained_backbone=False)), + toC(x), + rtol=1e-3, + atol=1e-5, + ) @skipIfUnsupportedMinOpsetVersion(11) def test_deeplab(self): x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) - self.exportTest(toC(deeplabv3_resnet101()), toC(x), rtol=1e-3, atol=1e-5) + self.exportTest( + toC(deeplabv3_resnet101(pretrained=False, pretrained_backbone=False)), + toC(x), + rtol=1e-3, + atol=1e-5, + ) def test_r3d_18_video(self): x = Variable(torch.randn(1, 3, 4, 112, 112).fill_(1.0)) diff --git a/test/onnx/test_models_onnxruntime.py b/test/onnx/test_models_onnxruntime.py index 62006fd8a068..a4f4295fad65 100644 --- a/test/onnx/test_models_onnxruntime.py +++ b/test/onnx/test_models_onnxruntime.py @@ -1,10 +1,11 @@ # Owner(s): ["module: onnx"] import unittest -import onnxruntime # noqa: F401 +import onnxruntime # noqa: F401 from test_models import TestModels from test_pytorch_onnx_onnxruntime import run_model_test + import torch @@ -14,29 +15,33 @@ def exportTest(self, model, inputs, rtol=1e-2, atol=1e-7, opset_versions=None): for opset_version in opset_versions: self.opset_version = opset_version self.onnx_shape_inference = True - run_model_test(self, model, False, - input=inputs, rtol=rtol, atol=atol) + run_model_test(self, model, False, input=inputs, rtol=rtol, atol=atol) if self.is_script_test_enabled and opset_version > 11: script_model = torch.jit.script(model) - run_model_test(self, script_model, False, - input=inputs, rtol=rtol, atol=atol) + run_model_test( + self, script_model, False, input=inputs, rtol=rtol, atol=atol + ) -TestModels = type(str("TestModels"), - 
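The QAT tests above share a preparation recipe; a sketch of the same steps on a small stand-in model follows. The model and input sizes are invented, and the final `disable_observer` call is an assumption about how one would freeze the observed ranges before evaluation or export.

```python
import torch
from torch import quantization


class SmallNet(torch.nn.Module):  # small stand-in for torchvision's resnet50
    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv2d(3, 8, 3, padding=1)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        return self.relu(self.conv(x))


model = SmallNet().train()  # prepare_qat expects a model in training mode
model.qconfig = quantization.QConfig(
    activation=quantization.default_fake_quant,
    weight=quantization.default_fake_quant,
)
quantization.prepare_qat(model, inplace=True)
model.apply(torch.ao.quantization.enable_observer)
model.apply(torch.ao.quantization.enable_fake_quant)

model(torch.randn(1, 3, 16, 16))  # one pass so the observers record ranges
model.apply(torch.ao.quantization.disable_observer)  # assumption: freeze ranges
model.eval()
```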
(unittest.TestCase,), - dict(TestModels.__dict__, - is_script_test_enabled=False, - exportTest=exportTest)) +TestModels = type( + str("TestModels"), + (unittest.TestCase,), + dict(TestModels.__dict__, is_script_test_enabled=False, exportTest=exportTest), +) # model tests for scripting with new JIT APIs and shape inference -TestModels_new_jit_API = type(str("TestModels_new_jit_API"), - (unittest.TestCase,), - dict(TestModels.__dict__, - exportTest=exportTest, - is_script_test_enabled=True, - onnx_shape_inference=True)) +TestModels_new_jit_API = type( + str("TestModels_new_jit_API"), + (unittest.TestCase,), + dict( + TestModels.__dict__, + exportTest=exportTest, + is_script_test_enabled=True, + onnx_shape_inference=True, + ), +) if __name__ == "__main__": diff --git a/test/onnx/test_onnx_common.py b/test/onnx/test_onnx_common.py index dabc10fb649f..2e5a907a361a 100644 --- a/test/onnx/test_onnx_common.py +++ b/test/onnx/test_onnx_common.py @@ -2,10 +2,16 @@ import os - -onnx_model_dir = os.path.join(os.path.dirname( - os.path.realpath(__file__)), os.pardir, "repos", "onnx", "onnx", - "backend", "test", "data") +onnx_model_dir = os.path.join( + os.path.dirname(os.path.realpath(__file__)), + os.pardir, + "repos", + "onnx", + "onnx", + "backend", + "test", + "data", +) pytorch_converted_dir = os.path.join(onnx_model_dir, "pytorch-converted") diff --git a/test/onnx/test_onnx_export.py b/test/onnx/test_onnx_export.py new file mode 100644 index 000000000000..6e955d1d0b98 --- /dev/null +++ b/test/onnx/test_onnx_export.py @@ -0,0 +1,143 @@ +# Owner(s): ["module: onnx"] + +import contextlib +import io +import itertools +import os +import sys +import unittest.mock +from typing import Callable, Iterable, Optional, Tuple, Union + +import onnx +from test_pytorch_common import TestCase + +import torch +from torch.onnx import OperatorExportTypes, symbolic_registry +from torch.onnx._globals import GLOBALS +from torch.onnx.symbolic_helper import _onnx_unsupported +from torch.testing._internal.common_utils import custom_op, skipIfCaffe2 + +# Make the helper files in test/ importable +pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) +sys.path.append(pytorch_test_dir) + + +def export_to_onnx( + model: Union[torch.nn.Module, torch.jit.ScriptFunction], + input: Tuple[torch.Tensor], + custom_ops: Optional[ + Iterable[ + Union[contextlib.AbstractContextManager, contextlib.ContextDecorator], + ] + ] = None, + mocks: Optional[Iterable] = None, + operator_export_type: OperatorExportTypes = OperatorExportTypes.ONNX, + opset_version: int = GLOBALS.export_onnx_opset_version, +) -> onnx.ModelProto: + """Exports `model(input)` to ONNX and returns it. + + Custom operators and/or unittest patches can be used help reproducing specific behaviors. 
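The `type(...)`-based construction above clones a test class's body into a fresh `unittest.TestCase` subclass with a few attributes overridden, so the same test methods run under a different configuration. A toy version of the pattern, with invented class names:

```python
import unittest


class BaseChecks(unittest.TestCase):
    flag = False

    def test_flag_type(self):
        self.assertIsInstance(self.flag, bool)


# Same test bodies, different configuration -- mirroring how TestModels is
# re-instantiated above with exportTest / is_script_test_enabled overridden.
FlagOnChecks = type(
    "FlagOnChecks",
    (unittest.TestCase,),
    dict(BaseChecks.__dict__, flag=True),
)
```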
+ + Args: + model: model to export + input: model input with same format as `torch.onnx.export(..,args,...)` + custom_ops: list of custom operators to use during export + mocks: list of mocks to use during export + operator_export_type: export type as described by `torch.onnx.export(...operator_export_type,...)` + opset_version: ONNX opset version as described by `torch.onnx.export(...opset_version,...)` + Returns: + A valid ONNX model (`onnx.ModelProto`) + """ + custom_ops = custom_ops or [] + mocks = mocks or [] + with contextlib.ExitStack() as stack: + for ctx in itertools.chain(custom_ops, mocks): + stack.enter_context(ctx) + + f = io.BytesIO() + torch.onnx.export( + model, + input, + f, + operator_export_type=operator_export_type, + opset_version=opset_version, + ) + + # Validate ONNX graph before returning it + onnx_model = onnx.load_from_string(f.getvalue()) + onnx.checker.check_model(onnx_model) + return onnx_model + + +class TestONNXExport(TestCase): + @skipIfCaffe2 + def test_clip_aten_fallback_due_exception(self): + def bad_clamp(g, self, min, max): + return _onnx_unsupported("Bad boy!") + + class MyClip(torch.nn.Module): + def forward(self, x): + return torch.clamp(x, min=-0.5, max=0.5) + + onnx_model = export_to_onnx( + MyClip(), + torch.randn(3, 4, requires_grad=True), + custom_ops=[custom_op("aten::clamp", bad_clamp, 9)], + operator_export_type=OperatorExportTypes.ONNX_ATEN_FALLBACK, + ) + self.assertAtenOp(onnx_model, "clamp", "Tensor") + + @skipIfCaffe2 + def test_clip_aten_fallback_explicit_request(self): + class MyClip(torch.nn.Module): + def forward(self, x): + return torch.clamp(x, min=-0.5, max=0.5) + + def break_is_registered_op_api(opname, domain, version): + fake_missing_symbolics = ("clamp",) + if opname in fake_missing_symbolics: + return False + return ( + (domain, version) in symbolic_registry._registry + and opname in symbolic_registry._registry[(domain, version)] + ) + + # Force missing symbolic for well-known op using a mock + onnx_model = export_to_onnx( + MyClip(), + torch.randn(3, 4, requires_grad=True), + mocks=[ + unittest.mock.patch( + "torch.onnx.symbolic_registry.is_registered_op", + side_effect=break_is_registered_op_api, + ) + ], + operator_export_type=OperatorExportTypes.ONNX_ATEN_FALLBACK, + ) + self.assertAtenOp(onnx_model, "clamp", "Tensor") + + def _helper_test_to_(self, cast_fn: Callable[[torch.Tensor], torch.Tensor]): + """Helper to test aten::to(device) variants. + + `cast_fn` is converted into a `torch.jit.script`. It wraps `aten::to` + during export to preventing the devices to be hard-coded. 
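The mocked registry lookup above leans on `unittest.mock.patch(...)` used as a context manager with a `side_effect` replacement. A tiny illustration against a hypothetical target (`math.sqrt`, nothing to do with the ONNX symbolic registry):

```python
import math
import unittest.mock


def always_42(x):
    return 42.0


with unittest.mock.patch("math.sqrt", side_effect=always_42):
    print(math.sqrt(9))  # 42.0 while the patch is active

print(math.sqrt(9))      # 3.0 again once the context exits
```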
+ + Needed by detectron2 after https://github.com/facebookresearch/detectron2/pull/4132/ + """ + cast_fn = torch.jit.script(cast_fn) + onnx_model = export_to_onnx(cast_fn, torch.zeros([1, 3, 32, 32])) + for n in onnx_model.graph.node: + self.assertNotEqual(n.op_type, "To") + self.assertNotEqual(n.op_type, "Cast") + + def test_to__cpu_string(self): + def cast_cpu_string(src: torch.Tensor) -> torch.Tensor: + return src.to("cpu") + + self._helper_test_to_(cast_cpu_string) + + def test_to__device_cpu_string(self): + def cast_device_cpu_string(src: torch.Tensor) -> torch.Tensor: + return src.to(device="cpu") + + self._helper_test_to_(cast_device_cpu_string) diff --git a/test/onnx/test_onnx_opset.py b/test/onnx/test_onnx_opset.py index c6b13f8693bd..cd672ac0dc3a 100644 --- a/test/onnx/test_onnx_opset.py +++ b/test/onnx/test_onnx_opset.py @@ -1,25 +1,27 @@ # Owner(s): ["module: onnx"] +import io +import itertools + +import onnx from test_pytorch_common import TestCase, run_tests import torch import torch.onnx from torch.nn import Module - -import onnx - -import io - -from torch.onnx.symbolic_helper import _export_onnx_opset_version from torch.onnx import producer_name, producer_version +from torch.onnx._globals import GLOBALS -def check_onnx_opset_operator(model, ops, opset_version=_export_onnx_opset_version): +def check_onnx_opset_operator( + model, ops, opset_version=GLOBALS.export_onnx_opset_version +): # check_onnx_components assert ( - model.producer_name == producer_name and - model.producer_version == producer_version and - model.opset_import[0].version == opset_version) + model.producer_name == producer_name + and model.producer_version == producer_version + and model.opset_import[0].version == opset_version + ) # check the schema with the onnx checker onnx.checker.check_model(model) @@ -34,36 +36,48 @@ def check_onnx_opset_operator(model, ops, opset_version=_export_onnx_opset_versi assert len(ops) == len(graph.node) for i in range(0, len(ops)): assert graph.node[i].op_type == ops[i]["op_name"] - if "attributes" in ops[i] : + if "attributes" in ops[i]: attributes = ops[i]["attributes"] assert len(attributes) == len(graph.node[i].attribute) for j in range(0, len(attributes)): for attribute_field in attributes[j].keys(): - assert attributes[j][attribute_field] == getattr(graph.node[i].attribute[j], attribute_field) - - -def check_onnx_opsets_operator(module, x, ops, opset_versions, training=torch.onnx.TrainingMode.EVAL, - input_names=None, dynamic_axes=None): + assert attributes[j][attribute_field] == getattr( + graph.node[i].attribute[j], attribute_field + ) + + +def check_onnx_opsets_operator( + module, + x, + ops, + opset_versions, + training=torch.onnx.TrainingMode.EVAL, + input_names=None, + dynamic_axes=None, +): for opset_version in opset_versions: f = io.BytesIO() - torch.onnx.export(module, x, f, - opset_version=opset_version, - training=training, - input_names=input_names, - dynamic_axes=dynamic_axes) + torch.onnx.export( + module, + x, + f, + opset_version=opset_version, + training=training, + input_names=input_names, + dynamic_axes=dynamic_axes, + ) model = onnx.load(io.BytesIO(f.getvalue())) check_onnx_opset_operator(model, ops[opset_version], opset_version) class TestONNXOpset(TestCase): - def test_opset_fallback(self): class MyModule(Module): def forward(self, x): return torch.isnan(x) - ops = [{"op_name" : "IsNaN"}] - ops = {9 : ops, 10 : ops} + ops = [{"op_name": "IsNaN"}] + ops = {9: ops, 10: ops} x = torch.tensor([1.0, float("nan"), 2.0]) 
check_onnx_opsets_operator(MyModule(), x, ops, opset_versions=[9, 10]) @@ -72,11 +86,20 @@ class MyModule(Module): def forward(self, x): return torch.topk(x, 3) - ops_9 = [{"op_name": "TopK", "attributes": [{"name": "axis", "i": -1, "type": 2}, - {"name": "k", "i": 3, "type": 2}]}] - ops_10 = [{"op_name": "TopK", "attributes": [{"name": "axis", "i": -1, "type": 2}]}] + ops_9 = [ + { + "op_name": "TopK", + "attributes": [ + {"name": "axis", "i": -1, "type": 2}, + {"name": "k", "i": 3, "type": 2}, + ], + } + ] + ops_10 = [ + {"op_name": "TopK", "attributes": [{"name": "axis", "i": -1, "type": 2}]} + ] ops = {9: ops_9, 10: ops_10} - x = torch.arange(1., 6., requires_grad=True) + x = torch.arange(1.0, 6.0, requires_grad=True) check_onnx_opsets_operator(MyModule(), x, ops, opset_versions=[9, 10]) # test with dynamic k @@ -85,45 +108,61 @@ class MyModuleDynamic(torch.jit.ScriptModule): def forward(self, input, k): return torch.topk(input, k) - ops_10 = [{"op_name": "Constant", "attributes": [{"name": "value", "type": 4}]}, - {"op_name": "Reshape"}, - {"op_name": "TopK", "attributes": [{"name": "axis", "i": -1, "type": 2}]}] + ops_10 = [ + {"op_name": "Constant", "attributes": [{"name": "value", "type": 4}]}, + {"op_name": "Reshape"}, + {"op_name": "TopK", "attributes": [{"name": "axis", "i": -1, "type": 2}]}, + ] ops = {10: ops_10} - x = torch.arange(1., 6., requires_grad=True) + x = torch.arange(1.0, 6.0, requires_grad=True) k = torch.tensor(3) module = MyModuleDynamic() - check_onnx_opsets_operator(module, [x, k], ops, - opset_versions=[10]) + check_onnx_opsets_operator(module, [x, k], ops, opset_versions=[10]) def test_maxpool(self): module = torch.nn.MaxPool1d(2, stride=1) - ops_9 = [{"op_name" : "MaxPool", - "attributes" : - [{"name": "kernel_shape", "ints": [2], "type": 7}, - {"name": "pads", "ints": [0, 0], "type": 7}, - {"name": "strides", "ints": [1], "type": 7}]}] - ops_10 = [{"op_name" : "MaxPool", - "attributes" : - [{"name": "ceil_mode", "i": 0, "type": 2}, + ops_9 = [ + { + "op_name": "MaxPool", + "attributes": [ + {"name": "kernel_shape", "ints": [2], "type": 7}, + {"name": "pads", "ints": [0, 0], "type": 7}, + {"name": "strides", "ints": [1], "type": 7}, + ], + } + ] + ops_10 = [ + { + "op_name": "MaxPool", + "attributes": [ + {"name": "ceil_mode", "i": 0, "type": 2}, {"name": "kernel_shape", "ints": [2], "type": 7}, {"name": "pads", "ints": [0, 0], "type": 7}, - {"name": "strides", "ints": [1], "type": 7}]}] - ops = {9 : ops_9, 10 : ops_10} + {"name": "strides", "ints": [1], "type": 7}, + ], + } + ] + ops = {9: ops_9, 10: ops_10} x = torch.randn(20, 16, 50) check_onnx_opsets_operator(module, x, ops, opset_versions=[9, 10]) # add test with dilations module = torch.nn.MaxPool1d(2, stride=1, dilation=2) - ops_10 = [{"op_name" : "MaxPool", - "attributes" : - [{"name": "ceil_mode", "i": 0, "type": 2}, + ops_10 = [ + { + "op_name": "MaxPool", + "attributes": [ + {"name": "ceil_mode", "i": 0, "type": 2}, {"name": "dilations", "ints": [2], "type": 7}, {"name": "kernel_shape", "ints": [2], "type": 7}, {"name": "pads", "ints": [0, 0], "type": 7}, - {"name": "strides", "ints": [1], "type": 7}]}] - ops = {10 : ops_10} + {"name": "strides", "ints": [1], "type": 7}, + ], + } + ] + ops = {10: ops_10} x = torch.randn(20, 16, 50) check_onnx_opsets_operator(module, x, ops, opset_versions=[10]) @@ -138,11 +177,23 @@ def forward(self, x): return torch.nn.functional.interpolate(x, size=size, mode="nearest") module = MyModule() - ops8 = [{"op_name" : "Upsample", "attributes" : [{"name": "mode", "s": 
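The TopK expectations above encode an opset difference: at opset 9 `k` is a node attribute, while from opset 10 onward it becomes a tensor input (an initializer when `k` is constant, or the extra Constant/Reshape nodes when it is dynamic, as the second test shows). A hedged sketch that simply prints the resulting node lists:

```python
import io

import onnx
import torch


class TopK3(torch.nn.Module):  # illustrative, mirrors the static-k test
    def forward(self, x):
        return torch.topk(x, 3)


x = torch.arange(1.0, 6.0)
for opset in (9, 10):
    f = io.BytesIO()
    torch.onnx.export(TopK3(), x, f, opset_version=opset)
    graph = onnx.load_from_string(f.getvalue()).graph
    print(opset, [n.op_type for n in graph.node])
```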
("nearest").encode(), "type": 3}, - {"name": "scales", "floats": [1.0, 1.0, 2.0, 2.0], "type": 6}]}] - ops9 = [{"op_name" : "Constant"}, - {"op_name" : "Upsample", "attributes" : [{"name": "mode", "s": ("nearest").encode(), "type": 3}]}] - ops = {8 : ops8, 9 : ops9} + ops8 = [ + { + "op_name": "Upsample", + "attributes": [ + {"name": "mode", "s": ("nearest").encode(), "type": 3}, + {"name": "scales", "floats": [1.0, 1.0, 2.0, 2.0], "type": 6}, + ], + } + ] + ops9 = [ + {"op_name": "Constant"}, + { + "op_name": "Upsample", + "attributes": [{"name": "mode", "s": ("nearest").encode(), "type": 3}], + }, + ] + ops = {8: ops8, 9: ops9} x = torch.randn(2, 2, 2, 2) check_onnx_opsets_operator(module, x, ops, opset_versions=[8, 9]) @@ -155,11 +206,13 @@ def forward(self, x): return x - 1 module = MyModule() - ops_8 = [{"op_name" : "Constant"}, - {"op_name" : "Cast", "attributes": [{"name": "to", "i": 7, "type": 2}]}, - {"op_name" : "Sub"}] - ops_9 = [{"op_name" : "Constant"}, {"op_name" : "Sub"}] - ops = {8 : ops_8, 9 : ops_9} + ops_8 = [ + {"op_name": "Constant"}, + {"op_name": "Cast", "attributes": [{"name": "to", "i": 7, "type": 2}]}, + {"op_name": "Sub"}, + ] + ops_9 = [{"op_name": "Constant"}, {"op_name": "Sub"}] + ops = {8: ops_8, 9: ops_9} x = torch.ones(5, 6, dtype=torch.long) check_onnx_opsets_operator(module, x, ops, opset_versions=[8, 9]) @@ -168,48 +221,63 @@ class MyModule(Module): def forward(self, x): return x[0:1] - ops_9 = [{"op_name" : "Slice", - "attributes" : - [{"name": "axes", "ints": [0], "type": 7}, - {"name": "ends", "ints": [1], "type": 7}, - {"name": "starts", "ints": [0], "type": 7}]}] - ops_10 = [{"op_name" : "Constant"}, - {"op_name" : "Constant"}, - {"op_name" : "Constant"}, - {"op_name" : "Constant"}, - {"op_name" : "Slice", - "attributes" : []}] - ops = {9 : ops_9, 10 : ops_10} + ops_9 = [ + { + "op_name": "Slice", + "attributes": [ + {"name": "axes", "ints": [0], "type": 7}, + {"name": "ends", "ints": [1], "type": 7}, + {"name": "starts", "ints": [0], "type": 7}, + ], + } + ] + ops_10 = [ + {"op_name": "Constant"}, + {"op_name": "Constant"}, + {"op_name": "Constant"}, + {"op_name": "Constant"}, + {"op_name": "Slice", "attributes": []}, + ] + ops = {9: ops_9, 10: ops_10} x = torch.randn(3) check_onnx_opsets_operator(MyModule(), x, ops, opset_versions=[9, 10]) class DynamicSliceModel(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, x): - return x[1:x.size(0)] + return x[1 : x.size(0)] module = DynamicSliceModel() x = torch.rand(1, 2) - ops_10 = [{"op_name" : "Shape"}, - {"op_name" : "Constant"}, - {"op_name" : "Gather", - "attributes" : [{"name" : "axis", "i" : 0, "type" : 2}]}, - {"op_name" : "Unsqueeze", - "attributes" : [{"name" : "axes", "i" : 0, "type" : 7}]}, - {"op_name": "Constant"}, - {"op_name" : "Slice", - "attributes" : []}] - ops = {10 : ops_10} - check_onnx_opsets_operator(module, x, ops, opset_versions=[10], - input_names=['x'], dynamic_axes={"x": [0, 1]}) - - ops_10 = [{"op_name" : "Constant"}, - {"op_name" : "Constant"}, - {"op_name" : "Constant"}, - {"op_name" : "Constant"}, - {"op_name" : "Slice", - "attributes" : []}] - ops = {10 : ops_10} + ops_10 = [ + {"op_name": "Shape"}, + {"op_name": "Constant"}, + {"op_name": "Gather", "attributes": [{"name": "axis", "i": 0, "type": 2}]}, + { + "op_name": "Unsqueeze", + "attributes": [{"name": "axes", "i": 0, "type": 7}], + }, + {"op_name": "Constant"}, + {"op_name": "Slice", "attributes": []}, + ] + ops = {10: ops_10} + check_onnx_opsets_operator( + module, + x, + ops, + 
opset_versions=[10], + input_names=["x"], + dynamic_axes={"x": [0, 1]}, + ) + + ops_10 = [ + {"op_name": "Constant"}, + {"op_name": "Constant"}, + {"op_name": "Constant"}, + {"op_name": "Constant"}, + {"op_name": "Slice", "attributes": []}, + ] + ops = {10: ops_10} check_onnx_opsets_operator(module, x, ops, opset_versions=[10]) def test_flip(self): @@ -217,14 +285,16 @@ class MyModule(Module): def forward(self, x): return torch.flip(x, dims=[0]) - ops_10 = [{"op_name" : "Constant"}, - {"op_name" : "Constant"}, - {"op_name" : "Constant"}, - {"op_name" : "Constant"}, - {"op_name" : "Slice", - "attributes" : []}] - ops = {10 : ops_10} + ops_10 = [ + {"op_name": "Constant"}, + {"op_name": "Constant"}, + {"op_name": "Constant"}, + {"op_name": "Constant"}, + {"op_name": "Slice", "attributes": []}, + ] + ops = {10: ops_10} import numpy + x = torch.tensor(numpy.arange(6.0).reshape(2, 3)) check_onnx_opsets_operator(MyModule(), x, ops, opset_versions=[10]) @@ -242,110 +312,145 @@ def forward(self, x): # we should only export the onnx Dropout op in training mode; test both modes # test training mode - ops = [{"op_name" : "Dropout", "attributes" : [{"name" : "ratio", "f" : 0.5, "type" : 1}]}] - ops = {9 : ops, 10 : ops} - check_onnx_opsets_operator(MyModule(), x, ops, opset_versions=[9, 10], training=torch.onnx.TrainingMode.TRAINING) + ops = [ + { + "op_name": "Dropout", + "attributes": [{"name": "ratio", "f": 0.5, "type": 1}], + } + ] + ops = {9: ops, 10: ops} + check_onnx_opsets_operator( + MyModule(), + x, + ops, + opset_versions=[9, 10], + training=torch.onnx.TrainingMode.TRAINING, + ) # test eval mode - ops = [{"op_name" : "Identity"}] - ops = {9 : ops, 10 : ops} - check_onnx_opsets_operator(MyModule(), x, ops, opset_versions=[9, 10], training=torch.onnx.TrainingMode.EVAL) + ops = [{"op_name": "Identity"}] + ops = {9: ops, 10: ops} + check_onnx_opsets_operator( + MyModule(), + x, + ops, + opset_versions=[9, 10], + training=torch.onnx.TrainingMode.EVAL, + ) def test_full(self): class MyModule(Module): def forward(self, x): return torch.full((3, 4), x) - ops = [{"op_name" : "Constant"}, - {"op_name" : "ConstantOfShape"}, - {"op_name" : "Add"}] - ops = {9 : ops, 10 : ops} - x = torch.tensor(12.) 
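The dropout test above checks that the exported graph depends on the training mode; a hedged sketch using a bare `nn.Dropout` as a stand-in for the test's module (the expected Dropout-vs-Identity outcome is what the test asserts, printed here rather than asserted):

```python
import io

import onnx
import torch

module = torch.nn.Dropout(0.5)  # stand-in for the test's module
x = torch.randn(1, 2, 3)

for mode in (torch.onnx.TrainingMode.TRAINING, torch.onnx.TrainingMode.EVAL):
    f = io.BytesIO()
    torch.onnx.export(module, x, f, opset_version=10, training=mode)
    graph = onnx.load_from_string(f.getvalue()).graph
    print(mode, [n.op_type for n in graph.node])  # expected: Dropout vs. Identity
```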
+ ops = [ + {"op_name": "Constant"}, + {"op_name": "ConstantOfShape"}, + {"op_name": "Add"}, + ] + ops = {9: ops, 10: ops} + x = torch.tensor(12.0) check_onnx_opsets_operator(MyModule(), x, ops, opset_versions=[9, 10]) def test_interpolate(self): class MyModel(torch.nn.Module): def forward(self, x): size = [v * 2 for v in x.size()[2:]] - return torch.nn.functional.interpolate(x, - size=size, - mode="nearest") - ops_9 = [{"op_name" : "Shape"}, - {"op_name" : "Constant"}, - {"op_name" : "Gather"}, - {"op_name" : "Shape"}, - {"op_name" : "Constant"}, - {"op_name" : "Gather"}, - {"op_name" : "Constant"}, - {"op_name" : "Mul"}, - {"op_name" : "Constant"}, - {"op_name" : "Mul"}, - {"op_name" : "Unsqueeze"}, - {"op_name" : "Unsqueeze"}, - {"op_name" : "Concat"}, - {"op_name" : "Constant"}, - {"op_name" : "Cast"}, - {"op_name" : "Shape"}, - {"op_name" : "Slice"}, - {"op_name" : "Cast"}, - {"op_name" : "Div"}, - {"op_name" : "Concat"}, - {"op_name" : "Upsample", - "attributes" : - [{"name": "mode", "s": ("nearest").encode(), "type": 3}]}] - ops_10 = [{"op_name" : "Shape"}, - {"op_name" : "Constant"}, - {"op_name" : "Gather"}, - {"op_name" : "Shape"}, - {"op_name" : "Constant"}, - {"op_name" : "Gather"}, - {"op_name" : "Constant"}, - {"op_name" : "Mul"}, - {"op_name" : "Constant"}, - {"op_name" : "Mul"}, - {"op_name" : "Unsqueeze"}, - {"op_name" : "Unsqueeze"}, - {"op_name" : "Concat"}, - {"op_name" : "Constant"}, - {"op_name" : "Cast"}, - {"op_name" : "Shape"}, - {"op_name" : "Constant"}, - {"op_name" : "Constant"}, - {"op_name" : "Constant"}, - {"op_name" : "Slice"}, - {"op_name" : "Cast"}, - {"op_name" : "Div"}, - {"op_name" : "Concat"}, - {"op_name" : "Resize", - "attributes" : - [{"name": "mode", "s": ("nearest").encode(), "type": 3}]}] - - ops = {9 : ops_9, 10 : ops_10} + return torch.nn.functional.interpolate(x, size=size, mode="nearest") + + ops_9 = [ + {"op_name": "Shape"}, + {"op_name": "Constant"}, + {"op_name": "Gather"}, + {"op_name": "Shape"}, + {"op_name": "Constant"}, + {"op_name": "Gather"}, + {"op_name": "Constant"}, + {"op_name": "Mul"}, + {"op_name": "Constant"}, + {"op_name": "Mul"}, + {"op_name": "Unsqueeze"}, + {"op_name": "Unsqueeze"}, + {"op_name": "Concat"}, + {"op_name": "Cast"}, + {"op_name": "Shape"}, + {"op_name": "Slice"}, + {"op_name": "Cast"}, + {"op_name": "Div"}, + {"op_name": "Constant"}, + {"op_name": "Concat"}, + { + "op_name": "Upsample", + "attributes": [{"name": "mode", "s": ("nearest").encode(), "type": 3}], + }, + ] + ops_10 = [ + {"op_name": "Shape"}, + {"op_name": "Constant"}, + {"op_name": "Gather"}, + {"op_name": "Shape"}, + {"op_name": "Constant"}, + {"op_name": "Gather"}, + {"op_name": "Constant"}, + {"op_name": "Mul"}, + {"op_name": "Constant"}, + {"op_name": "Mul"}, + {"op_name": "Unsqueeze"}, + {"op_name": "Unsqueeze"}, + {"op_name": "Concat"}, + {"op_name": "Cast"}, + {"op_name": "Shape"}, + {"op_name": "Constant"}, + {"op_name": "Constant"}, + {"op_name": "Constant"}, + {"op_name": "Slice"}, + {"op_name": "Cast"}, + {"op_name": "Div"}, + {"op_name": "Constant"}, + {"op_name": "Concat"}, + { + "op_name": "Resize", + "attributes": [{"name": "mode", "s": ("nearest").encode(), "type": 3}], + }, + ] + + ops = {9: ops_9, 10: ops_10} x = torch.randn(1, 2, 3, 4, requires_grad=True) - check_onnx_opsets_operator(MyModel(), x, ops, opset_versions=[9, 10], - input_names=["x"], dynamic_axes={"x": [0, 1, 2, 3]}) - - ops_9 = [{"op_name" : "Constant"}, - {"op_name" : "Shape"}, - {"op_name" : "Slice"}, - {"op_name" : "Cast"}, - {"op_name" : "Div"}, - {"op_name" : 
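The long node lists above reduce to one behavioral change: nearest-neighbor resizing exports as ONNX `Upsample` at opset 9 and as `Resize` from opset 10 on. A compact sketch (module and shapes are illustrative) that prints the op types for both opsets:

```python
import io

import onnx
import torch


class Upscale(torch.nn.Module):  # illustrative, fixed scale factor
    def forward(self, x):
        return torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")


x = torch.randn(1, 2, 3, 4)
for opset in (9, 10):
    f = io.BytesIO()
    torch.onnx.export(Upscale(), x, f, opset_version=opset)
    graph = onnx.load_from_string(f.getvalue()).graph
    print(opset, [n.op_type for n in graph.node])  # Upsample at 9, Resize at 10+
```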
"Concat"}, - {"op_name" : "Upsample", - "attributes" : - [{"name": "mode", "s": ("nearest").encode(), "type": 3}]}] - ops_10 = [{"op_name" : "Constant"}, - {"op_name" : "Shape"}, - {"op_name" : "Constant"}, - {"op_name" : "Constant"}, - {"op_name" : "Constant"}, - {"op_name" : "Slice"}, - {"op_name" : "Cast"}, - {"op_name" : "Div"}, - {"op_name" : "Concat"}, - {"op_name" : "Resize"}] - - ops = {9 : ops_9, 10 : ops_10} + check_onnx_opsets_operator( + MyModel(), + x, + ops, + opset_versions=[9, 10], + input_names=["x"], + dynamic_axes={"x": [0, 1, 2, 3]}, + ) + + ops_9 = [ + {"op_name": "Shape"}, + {"op_name": "Slice"}, + {"op_name": "Cast"}, + {"op_name": "Div"}, + {"op_name": "Constant"}, + {"op_name": "Concat"}, + { + "op_name": "Upsample", + "attributes": [{"name": "mode", "s": ("nearest").encode(), "type": 3}], + }, + ] + ops_10 = [ + {"op_name": "Shape"}, + {"op_name": "Constant"}, + {"op_name": "Constant"}, + {"op_name": "Constant"}, + {"op_name": "Slice"}, + {"op_name": "Cast"}, + {"op_name": "Div"}, + {"op_name": "Constant"}, + {"op_name": "Concat"}, + {"op_name": "Resize"}, + ] + + ops = {9: ops_9, 10: ops_10} x = torch.randn(1, 2, 3, 4, requires_grad=True) check_onnx_opsets_operator(MyModel(), x, ops, opset_versions=[9, 10]) @@ -354,21 +459,64 @@ def forward(self, x): size = [v * 2 for v in x.size()[2:]] # work around for now: turn the dynamic sizes into constant size = [int(i) for i in size] - return torch.nn.functional.interpolate(x, - size=size, - mode="nearest") - ops_9 = [{"op_name" : "Constant"}, - {"op_name" : "Upsample", - "attributes" : - [{"name": "mode", "s": ("nearest").encode(), "type": 3}]}] - ops_10 = [{"op_name" : "Constant"}, - {"op_name" : "Resize", - "attributes" : - [{"name": "mode", "s": ("nearest").encode(), "type": 3}]}] - ops = {9 : ops_9, 10 : ops_10} + return torch.nn.functional.interpolate(x, size=size, mode="nearest") + + ops_9 = [ + {"op_name": "Constant"}, + { + "op_name": "Upsample", + "attributes": [{"name": "mode", "s": ("nearest").encode(), "type": 3}], + }, + ] + ops_10 = [ + {"op_name": "Constant"}, + { + "op_name": "Resize", + "attributes": [{"name": "mode", "s": ("nearest").encode(), "type": 3}], + }, + ] + ops = {9: ops_9, 10: ops_10} x = torch.randn(20, 16, 50) check_onnx_opsets_operator(MyDynamicModel(), x, ops, opset_versions=[9, 10]) + def test_grid_sample(self): + n, c, h_in, w_in, h_out, w_out = 1, 1, 3, 2, 2, 4 + ops = {16: [{"op_name": "GridSample"}]} + + class MyModule(Module): + def forward(self, x, grid, mode, padding_mode, align_corers): + return torch.nn.functional.grid_sample( + x, grid, mode, padding_mode, align_corners + ) + + for mode, padding_mode, align_corners in itertools.product( + ("bilinear", "nearest", "bicubic"), + ("zeros", "border", "reflection"), + (True, False), + ): + + args = ( + torch.randn(n, c, h_in, w_in), # x + torch.randn(n, h_out, w_out, 2), # grid, + mode, + padding_mode, + align_corners, + ) + check_onnx_opsets_operator( + MyModule(), + args, + ops, + opset_versions=[16], + training=torch.onnx.TrainingMode.TRAINING, + ) + check_onnx_opsets_operator( + MyModule(), + args, + ops, + opset_versions=[16], + training=torch.onnx.TrainingMode.EVAL, + ) + if __name__ == "__main__": run_tests() diff --git a/test/onnx/test_operators.py b/test/onnx/test_operators.py index ca69f0fb0306..1cded5a9b9f2 100644 --- a/test/onnx/test_operators.py +++ b/test/onnx/test_operators.py @@ -1,31 +1,52 @@ # Owner(s): ["module: onnx"] -from test_pytorch_common import TestCase, run_tests, flatten, skipIfNoLapack, \ - BATCH_SIZE, 
RNN_SEQUENCE_LENGTH, RNN_INPUT_SIZE, RNN_HIDDEN_SIZE - -import torch -import torch.onnx -from torch.onnx.symbolic_helper import parse_args, _get_tensor_dim_size, _get_tensor_sizes -from torch.onnx import register_custom_op_symbolic, unregister_custom_op_symbolic -from torch.autograd import Variable, Function -from torch.nn import Module, functional -import torch.nn as nn -import torch.nn.functional as F - -import itertools -import io -import inspect import glob +import inspect +import io +import itertools import os import shutil import tempfile -import torch.testing._internal.common_utils as common -'''Usage: python test/onnx/test_operators.py [--no-onnx] [--produce-onnx-test-data] +from test_pytorch_common import ( + BATCH_SIZE, + RNN_HIDDEN_SIZE, + RNN_INPUT_SIZE, + RNN_SEQUENCE_LENGTH, + TestCase, + flatten, + run_tests, + skipIfNoLapack, +) + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.onnx +import torch.testing._internal.common_utils as common +from torch.autograd import Function, Variable +from torch.nn import Module, functional +from torch.onnx import ( + register_custom_op_symbolic, + unregister_custom_op_symbolic, +) +from torch.onnx.symbolic_helper import ( + _get_tensor_dim_size, + _get_tensor_sizes, + parse_args, +) +from torch.testing._internal.common_utils import skipIfCaffe2 + +"""Usage: python test/onnx/test_operators.py [--no-onnx] [--produce-onnx-test-data] --no-onnx: no onnx python dependence --produce-onnx-test-data: generate onnx test data --accept: accept onnx updates and overwrite models -''' +""" + +# Full diff for expect files +import unittest + +unittest.TestCase.maxDiff = None _onnx_test = False # flag to produce onnx test cases. _onnx_dep = True # flag to import onnx package. @@ -33,7 +54,8 @@ def export_to_pbtxt(model, inputs, *args, **kwargs): return torch.onnx.export_to_pretty_string( - model, inputs, google_printer=True, *args, **kwargs) + model, inputs, google_printer=True, *args, **kwargs + ) def export_to_pb(model, inputs, *args, **kwargs): @@ -56,7 +78,6 @@ def forward(self, *args): class TestOperators(TestCase): - def assertONNX(self, f, args, params=None, **kwargs): if params is None: params = () @@ -74,16 +95,21 @@ def assertONNX(self, f, args, params=None, **kwargs): import onnx.checker import onnx.numpy_helper import test_onnx_common + model_def = onnx.ModelProto.FromString(onnx_model_pb) onnx.checker.check_model(model_def) if _onnx_test: test_function = inspect.stack()[1][0].f_code.co_name test_name = test_function[0:4] + "_operator" + test_function[4:] - output_dir = os.path.join(test_onnx_common.pytorch_operator_dir, test_name) + output_dir = os.path.join( + test_onnx_common.pytorch_operator_dir, test_name + ) # Assume: # 1) the old test should be delete before the test. # 2) only one assertONNX in each test, otherwise will override the data. 
- assert not os.path.exists(output_dir), "{} should not exist!".format(output_dir) + assert not os.path.exists(output_dir), "{} should not exist!".format( + output_dir + ) os.makedirs(output_dir) with open(os.path.join(output_dir, "model.onnx"), "wb") as file: file.write(model_def.SerializeToString()) @@ -93,14 +119,18 @@ def assertONNX(self, f, args, params=None, **kwargs): args = (args,) for index, var in enumerate(flatten(args)): tensor = onnx.numpy_helper.from_array(var.data.numpy()) - with open(os.path.join(data_dir, "input_{}.pb".format(index)), "wb") as file: + with open( + os.path.join(data_dir, "input_{}.pb".format(index)), "wb" + ) as file: file.write(tensor.SerializeToString()) outputs = m(*args) if isinstance(outputs, Variable): outputs = (outputs,) for index, var in enumerate(flatten(outputs)): tensor = onnx.numpy_helper.from_array(var.data.numpy()) - with open(os.path.join(data_dir, "output_{}.pb".format(index)), "wb") as file: + with open( + os.path.join(data_dir, "output_{}.pb".format(index)), "wb" + ) as file: file.write(tensor.SerializeToString()) def assertONNXRaises(self, err, f, args, params=None, **kwargs): @@ -181,11 +211,15 @@ def test_chunk(self): self.assertONNX(lambda x: x.chunk(2), x) def test_split(self): - x = torch.tensor([[0.0, 1.0, 1.0, 0.0, 2.0, 2.0], [2.0, 3.0, 3.0, 2.0, 1.0, 1.0]]) + x = torch.tensor( + [[0.0, 1.0, 1.0, 0.0, 2.0, 2.0], [2.0, 3.0, 3.0, 2.0, 1.0, 1.0]] + ) self.assertONNX(lambda x: torch.split(x, 2, 1), x) def test_split_with_sizes(self): - x = torch.tensor([[0.0, 1.0, 1.0, 0.0, 2.0, 2.0], [2.0, 3.0, 3.0, 2.0, 1.0, 1.0]]) + x = torch.tensor( + [[0.0, 1.0, 1.0, 0.0, 2.0, 2.0], [2.0, 3.0, 3.0, 2.0, 1.0, 1.0]] + ) self.assertONNX(lambda x: torch.split(x, [2, 1, 3], 1), x) def test_concat2(self): @@ -202,27 +236,39 @@ def test_addmm(self): m1 = torch.randn(2, 3, requires_grad=True) m2 = torch.randn(3, 4, requires_grad=True) m3 = torch.randn(4, requires_grad=True) - self.assertONNX(lambda x, y, z: torch.addmm(torch.addmm(z, x, y), x, y), (m1, m2, m3)) + self.assertONNX( + lambda x, y, z: torch.addmm(torch.addmm(z, x, y), x, y), (m1, m2, m3) + ) def test_permute2(self): x = torch.tensor([[[[[[0.0]]]]]], requires_grad=True) self.assertONNX(lambda x: x.permute(0, 1, 4, 2, 5, 3), x) def test_pad(self): - x = torch.tensor([[[[0.0, 1.0, 1.0, 1.0], [2.0, 3.0, 7.0, 7.0]]]], requires_grad=True) + x = torch.tensor( + [[[[0.0, 1.0, 1.0, 1.0], [2.0, 3.0, 7.0, 7.0]]]], requires_grad=True + ) self.assertONNX(nn.ReflectionPad2d((2, 3, 0, 1)), x) def test_params(self): x = torch.tensor([[1.0, 2.0], [3.0, 4.0]], requires_grad=True) y = nn.Parameter(torch.tensor([[1.0, 2.0], [3.0, 4.0]], requires_grad=True)) - self.assertONNX(lambda x, y: -torch.sigmoid(torch.tanh(x * (x + y))), x, params=(y, ), - keep_initializers_as_inputs=True) + self.assertONNX( + lambda x, y: -torch.sigmoid(torch.tanh(x * (x + y))), + x, + params=(y,), + keep_initializers_as_inputs=True, + ) def test_params_onnx_irv4(self): x = torch.tensor([[1.0, 2.0], [3.0, 4.0]], requires_grad=True) y = nn.Parameter(torch.tensor([[1.0, 2.0], [3.0, 4.0]], requires_grad=True)) - self.assertONNX(lambda x, y: -torch.sigmoid(torch.tanh(x * (x + y))), x, params=(y, ), - keep_initializers_as_inputs=False) + self.assertONNX( + lambda x, y: -torch.sigmoid(torch.tanh(x * (x + y))), + x, + params=(y,), + keep_initializers_as_inputs=False, + ) def test_symbolic_mismatch(self): class MyFun(Function): @@ -258,11 +304,18 @@ def test_batchnorm_1d(self): def test_batchnorm_training(self): x = torch.ones(2, 2, 2, 2, 
requires_grad=True) - self.assertONNX(nn.BatchNorm2d(2), x, training=torch.onnx.TrainingMode.TRAINING, keep_initializers_as_inputs=True) + self.assertONNX( + nn.BatchNorm2d(2), + x, + training=torch.onnx.TrainingMode.TRAINING, + keep_initializers_as_inputs=True, + ) def test_conv(self): x = torch.ones(20, 16, 50, 40, requires_grad=True) - self.assertONNX(nn.Conv2d(16, 13, 3, bias=False), x, keep_initializers_as_inputs=True) + self.assertONNX( + nn.Conv2d(16, 13, 3, bias=False), x, keep_initializers_as_inputs=True + ) def test_conv_onnx_irv4(self): x = torch.ones(20, 16, 50, 40, requires_grad=True) @@ -276,35 +329,67 @@ def test_conv_onnx_irv4_opset8(self): x = torch.ones(1, 2, 5, 7, requires_grad=True) conv_node = nn.Conv2d(2, 4, 3, bias=False) conv_node.weight.data.fill_(1.0) - self.assertONNX(conv_node, x, opset_version=8, keep_initializers_as_inputs=False) + self.assertONNX( + conv_node, x, opset_version=8, keep_initializers_as_inputs=False + ) def test_conv_variable_length(self): x = torch.ones(5, 3, 6, 6, requires_grad=True) model = torch.nn.Conv2d(3, 2, 3) - dynamic_axes = {"input_1": [0, 2, 3], "output_1": {0: "output_1_variable_dim_0", 1: "output_1_variable_dim_1"}} + dynamic_axes = { + "input_1": [0, 2, 3], + "output_1": {0: "output_1_variable_dim_0", 1: "output_1_variable_dim_1"}, + } model_proto_file = tempfile.NamedTemporaryFile() - torch.onnx.export(model, x, model_proto_file.name, verbose=True, input_names=["input_1"], output_names=["output_1"], - dynamic_axes=dynamic_axes) + torch.onnx.export( + model, + x, + model_proto_file.name, + verbose=True, + input_names=["input_1"], + output_names=["output_1"], + dynamic_axes=dynamic_axes, + ) import onnx + onnx_model = onnx.load(model_proto_file.name) onnx.checker.check_model(onnx_model) # Asserting the default dynamic axes names are generated when custom names are not provided - assert(onnx_model.graph.input[0].type.tensor_type.shape.dim[0].dim_param == "input_1_dynamic_axes_1") - assert(onnx_model.graph.input[0].type.tensor_type.shape.dim[2].dim_param == "input_1_dynamic_axes_2") - assert(onnx_model.graph.input[0].type.tensor_type.shape.dim[3].dim_param == "input_1_dynamic_axes_3") + assert ( + onnx_model.graph.input[0].type.tensor_type.shape.dim[0].dim_param + == "input_1_dynamic_axes_1" + ) + assert ( + onnx_model.graph.input[0].type.tensor_type.shape.dim[2].dim_param + == "input_1_dynamic_axes_2" + ) + assert ( + onnx_model.graph.input[0].type.tensor_type.shape.dim[3].dim_param + == "input_1_dynamic_axes_3" + ) # Asserting the custom names are applied when provided - assert(onnx_model.graph.output[0].type.tensor_type.shape.dim[0].dim_param == "output_1_variable_dim_0") - assert(onnx_model.graph.output[0].type.tensor_type.shape.dim[1].dim_param == "output_1_variable_dim_1") + assert ( + onnx_model.graph.output[0].type.tensor_type.shape.dim[0].dim_param + == "output_1_variable_dim_0" + ) + assert ( + onnx_model.graph.output[0].type.tensor_type.shape.dim[1].dim_param + == "output_1_variable_dim_1" + ) def test_convtranspose(self): x = torch.ones(2, 3, 4, 5, requires_grad=True) - self.assertONNX(nn.ConvTranspose2d(3, 3, 3, stride=3, bias=False, - padding=1, output_padding=2), x, - keep_initializers_as_inputs=True) + self.assertONNX( + nn.ConvTranspose2d( + 3, 3, 3, stride=3, bias=False, padding=1, output_padding=2 + ), + x, + keep_initializers_as_inputs=True, + ) def test_maxpool(self): x = torch.randn(20, 16, 50) @@ -322,11 +407,11 @@ def test_maxpool_indices(self): x = torch.randn(20, 16, 50) self.assertONNX(nn.MaxPool1d(3, 
stride=2, return_indices=True), x) + @skipIfCaffe2 def test_at_op(self): x = torch.randn(3, 4) class MyFun(Function): - @staticmethod def symbolic(g, x): return g.at("add", x, x) @@ -339,7 +424,11 @@ class MyModule(Module): def forward(self, x): return MyFun.apply(x) - self.assertONNX(MyModule(), x) + self.assertONNX( + MyModule(), + x, + operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, + ) def test_clip(self): x = torch.randn(3, 4, requires_grad=True) @@ -359,7 +448,7 @@ def test_hardtanh(self): def test_full(self): x = torch.randn(3, 4, requires_grad=True) - self.assertONNX(lambda x: torch.full(x.shape, 2.), x) + self.assertONNX(lambda x: torch.full(x.shape, 2.0), x) def test_full_like(self): x = torch.randn(3, 4, requires_grad=True) @@ -498,7 +587,7 @@ def test_slice(self): def test_slice_dynamic(self): x = torch.rand(3, 4, requires_grad=True) - self.assertONNX(lambda x: x[x.size(0):, x.size(1) - 3], x, opset_version=10) + self.assertONNX(lambda x: x[x.size(0) :, x.size(1) - 3], x, opset_version=10) def test_sign(self): x = torch.rand(3, 4, requires_grad=True) @@ -567,17 +656,24 @@ def test_norm_p2(self): def test_upsample_nearest_scale(self): x = torch.randn(1, 2, 3, 4, requires_grad=True) - self.assertONNX(lambda x: nn.functional.interpolate(x, scale_factor=2., - mode="nearest", recompute_scale_factor=False), x) + self.assertONNX( + lambda x: nn.functional.interpolate( + x, scale_factor=2.0, mode="nearest", recompute_scale_factor=False + ), + x, + ) def test_upsample_nearest_scale_default_scale_factor(self): x = torch.randn(1, 2, 3, 4, requires_grad=True) - self.assertONNX(lambda x: nn.functional.interpolate(x, scale_factor=2., - mode="nearest"), x) + self.assertONNX( + lambda x: nn.functional.interpolate(x, scale_factor=2.0, mode="nearest"), x + ) def test_upsample_nearest_size(self): x = torch.randn(1, 2, 3, 4, requires_grad=True) - self.assertONNX(lambda x: nn.functional.interpolate(x, size=16, mode="nearest"), x) + self.assertONNX( + lambda x: nn.functional.interpolate(x, size=16, mode="nearest"), x + ) def test_unsqueeze(self): x = torch.randn(3, 4, requires_grad=True) @@ -585,15 +681,23 @@ def test_unsqueeze(self): def test_batchnorm_noaffine(self): x = torch.randn(128, 128, 1, 1, requires_grad=True) - self.assertONNX(nn.BatchNorm2d(128, affine=False, momentum=0.3), x, - keep_initializers_as_inputs=True) + self.assertONNX( + nn.BatchNorm2d(128, affine=False, momentum=0.3), + x, + keep_initializers_as_inputs=True, + ) + @skipIfCaffe2 def test_embedding_bags(self): emb_bag = nn.EmbeddingBag(10, 8) input = torch.tensor([1, 2, 3, 4]).long() offset = torch.tensor([0]).long() - self.assertONNX(emb_bag, (input, offset), keep_initializers_as_inputs=True, - operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK) + self.assertONNX( + emb_bag, + (input, offset), + keep_initializers_as_inputs=True, + operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, + ) def test_implicit_expand(self): x = torch.randn(3, 4, requires_grad=True) @@ -625,8 +729,9 @@ def test_log_sigmoid(self): def test_linear(self): x = torch.randn(3, 4) - self.assertONNX(torch.nn.Linear(4, 5, bias=True), x, - keep_initializers_as_inputs=True) + self.assertONNX( + torch.nn.Linear(4, 5, bias=True), x, keep_initializers_as_inputs=True + ) def test_empty_like(self): x = torch.randn(5, 8, requires_grad=True) @@ -671,22 +776,44 @@ def test_dropout(self): def test_dropout_default(self): x = torch.randn(3, 4, requires_grad=True) - self.assertONNX(lambda x: 
torch.max(functional.dropout(x,)), x) + self.assertONNX( + lambda x: torch.max( + functional.dropout( + x, + ) + ), + x, + ) def test_dropout_training(self): x = torch.randn(3, 4, requires_grad=True) - self.assertONNX(lambda x: torch.max(functional.dropout(x)), x, training=torch.onnx.TrainingMode.TRAINING) + self.assertONNX( + lambda x: torch.max(functional.dropout(x)), + x, + training=torch.onnx.TrainingMode.TRAINING, + ) def test_dropout_opset12(self): x = torch.randn(3, 4, requires_grad=True) - self.assertONNX(lambda x: torch.max(functional.dropout(x, training=False)), x, opset_version=12) + self.assertONNX( + lambda x: torch.max(functional.dropout(x, training=False)), + x, + opset_version=12, + ) def test_dropout_training_opset12(self): x = torch.randn(3, 4, requires_grad=True) - self.assertONNX(lambda x: torch.max(functional.dropout(x)), x, opset_version=12, training=torch.onnx.TrainingMode.TRAINING) + self.assertONNX( + lambda x: torch.max(functional.dropout(x)), + x, + opset_version=12, + training=torch.onnx.TrainingMode.TRAINING, + ) def test_nonzero(self): - x = torch.tensor([[[2., 2.], [1., 0.]], [[0., 0.], [1., 1.]]], requires_grad=True) + x = torch.tensor( + [[[2.0, 2.0], [1.0, 0.0]], [[0.0, 0.0], [1.0, 1.0]]], requires_grad=True + ) self.assertONNX(lambda x: torch.nonzero(x), x) def test_gather(self): @@ -697,19 +824,28 @@ def test_gather(self): def test_gather_opset11(self): data = torch.randn(3, 4, 3, requires_grad=True) index = torch.tensor([2, 0]).view(1, 2, 1).expand(3, 2, 3) - self.assertONNX(lambda data, index: data.gather(1, index), (data, index), opset_version=11) + self.assertONNX( + lambda data, index: data.gather(1, index), (data, index), opset_version=11 + ) def test_scatter_add(self): - data = torch.tensor([[0., 0., 0.], [0., 0., 0.], [0., 0., 0.]]) + data = torch.tensor([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]) indices = torch.tensor([[1, 0], [0, 1], [0, 1]], dtype=torch.int64) values = torch.tensor([[1.0, 1.1], [2.0, 2.1], [3.0, 3.1]]) - self.assertONNX(lambda data, index: data.scatter_add(1, indices, values), (data, (indices, values))) + self.assertONNX( + lambda data, index: data.scatter_add(1, indices, values), + (data, (indices, values)), + ) def test_scatter_add_opset11(self): - data = torch.tensor([[0., 0., 0.], [0., 0., 0.], [0., 0., 0.]]) + data = torch.tensor([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]) indices = torch.tensor([[1, 0], [0, 1], [0, 1]], dtype=torch.int64) values = torch.tensor([[1.0, 1.1], [2.0, 2.1], [3.0, 3.1]]) - self.assertONNX(lambda data, index: data.scatter_add(1, indices, values), (data, (indices, values)), opset_version=11) + self.assertONNX( + lambda data, index: data.scatter_add(1, indices, values), + (data, (indices, values)), + opset_version=11, + ) def test_master_opset(self): x = torch.randn(2, 3).float() @@ -718,54 +854,58 @@ def test_master_opset(self): def test_std(self): x = torch.randn(2, 3, 4).float() - self.assertONNX(lambda x: torch.std(x, dim=(0, 1), unbiased=True, keepdim=True), x) + self.assertONNX( + lambda x: torch.std(x, dim=(0, 1), unbiased=True, keepdim=True), x + ) def test_cumsum(self): x = torch.randn(2, 3, 4, requires_grad=True) self.assertONNX(lambda x: torch.cumsum(x, dim=1), x, opset_version=11) -# Github Issue: https://github.com/pytorch/pytorch/issues/71095 -# def test_c2_op(self): -# class MyModel(torch.nn.Module): -# def __init__(self): -# super(MyModel, self).__init__() -# -# def forward(self, scores, bbox_deltas, im_info, anchors): -# a, b = 
torch.ops._caffe2.GenerateProposals( -# (scores), (bbox_deltas), (im_info), (anchors), -# 2.0, 6000, 300, 0.7, 16, True, -90, 90, 1.0, True, -# ) -# return a, b -# -# model = MyModel() -# A = 4 -# H = 10 -# W = 8 -# img_count = 3 -# scores = torch.ones(img_count, A, H, W, dtype=torch.float32) -# bbox_deltas = torch.linspace(0, 10, steps=img_count * 4 * A * H * W, -# dtype=torch.float32) -# bbox_deltas = bbox_deltas.view(img_count, 4 * A, H, W) -# im_info = torch.ones(img_count, 3, dtype=torch.float32) -# anchors = torch.ones(A, 4, dtype=torch.float32) -# inputs = (scores, bbox_deltas, im_info, anchors) -# self.assertONNX(model, inputs, custom_opsets={"org.pytorch._caffe2": 0}) + # Github Issue: https://github.com/pytorch/pytorch/issues/71095 + # def test_c2_op(self): + # class MyModel(torch.nn.Module): + # def __init__(self): + # super(MyModel, self).__init__() + # + # def forward(self, scores, bbox_deltas, im_info, anchors): + # a, b = torch.ops._caffe2.GenerateProposals( + # (scores), (bbox_deltas), (im_info), (anchors), + # 2.0, 6000, 300, 0.7, 16, True, -90, 90, 1.0, True, + # ) + # return a, b + # + # model = MyModel() + # A = 4 + # H = 10 + # W = 8 + # img_count = 3 + # scores = torch.ones(img_count, A, H, W, dtype=torch.float32) + # bbox_deltas = torch.linspace(0, 10, steps=img_count * 4 * A * H * W, + # dtype=torch.float32) + # bbox_deltas = bbox_deltas.view(img_count, 4 * A, H, W) + # im_info = torch.ones(img_count, 3, dtype=torch.float32) + # anchors = torch.ones(A, 4, dtype=torch.float32) + # inputs = (scores, bbox_deltas, im_info, anchors) + # self.assertONNX(model, inputs, custom_opsets={"org.pytorch._caffe2": 0}) def test_dict(self): class MyModel(torch.nn.Module): def forward(self, x_in): x_out = {} - x_out["test_key_out"] = torch.add(x_in[list(x_in.keys())[0]], list(x_in.keys())[0]) + x_out["test_key_out"] = torch.add( + x_in[list(x_in.keys())[0]], list(x_in.keys())[0] + ) return x_out - x = {torch.tensor(1.): torch.randn(1, 2, 3)} + x = {torch.tensor(1.0): torch.randn(1, 2, 3)} self.assertONNX(MyModel(), (x, {})) def test_dict_str(self): class MyModel(torch.nn.Module): def forward(self, x_in): x_out = {} - x_out["test_key_out"] = torch.add(x_in["test_key_in"], 2.) 
+ x_out["test_key_out"] = torch.add(x_in["test_key_in"], 2.0) return x_out x = {"test_key_in": torch.randn(1, 2, 3)} @@ -781,21 +921,27 @@ def forward(self, input): def test_bitshift(self): class BitshiftModel(torch.nn.Module): - def forward(self, input, input2): - return input >> 1, input2 >> 2 - input = torch.arange(24, dtype=torch.float32).reshape(3, 4, 2) - input2 = torch.arange(24, dtype=torch.uint8).reshape(3, 4, 2) - self.assertONNX(BitshiftModel(), (input, input2), opset_version=11) + def forward(self, input): + return input >> 1, input >> 2 + + input = torch.arange(24, dtype=torch.uint8).reshape(3, 4, 2) + self.assertONNX(BitshiftModel(), input, opset_version=11) + @skipIfCaffe2 def test_layer_norm_aten(self): model = torch.nn.LayerNorm([10, 10]) x = torch.randn(20, 5, 10, 10) - self.assertONNX(model, x, - operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK) + self.assertONNX( + model, + x, + operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, + ) def test_pixel_shuffle(self): x = torch.randn(2, 8, 3, 4).float() - self.assertONNX(lambda x: torch.pixel_shuffle(x, upscale_factor=2), x, opset_version=11) + self.assertONNX( + lambda x: torch.pixel_shuffle(x, upscale_factor=2), x, opset_version=11 + ) def test_frobenius_norm(self): x = torch.randn(2, 3, 4).float() @@ -821,8 +967,13 @@ def test_gelu(self): def test_unique(self): x = torch.randint(3, (2, 3, 4, 5)).float() - self.assertONNX(lambda x: torch.unique(x, dim=0, sorted=True, return_inverse=False, return_counts=True), x, - opset_version=11) + self.assertONNX( + lambda x: torch.unique( + x, dim=0, sorted=True, return_inverse=False, return_counts=True + ), + x, + opset_version=11, + ) def test_meshgrid(self): x = torch.ones(3, requires_grad=True) @@ -831,14 +982,18 @@ def test_meshgrid(self): self.assertONNX(lambda x, y, z: torch.meshgrid(x, y, z), (x, y, z)) def test_topk(self): - x = torch.arange(1., 6., requires_grad=True) + x = torch.arange(1.0, 6.0, requires_grad=True) k = torch.tensor(3) self.assertONNX(lambda x, k: torch.topk(x, k), (x, k), opset_version=10) def test_topk_smallest_unsorted(self): - x = torch.arange(1., 6., requires_grad=True) + x = torch.arange(1.0, 6.0, requires_grad=True) k = torch.tensor(3) - self.assertONNX(lambda x, k: torch.topk(x, k, largest=False, sorted=False), (x, k), opset_version=11) + self.assertONNX( + lambda x, k: torch.topk(x, k, largest=False, sorted=False), + (x, k), + opset_version=11, + ) def test_baddbmm(self): x = torch.randn(10, 3, 5) @@ -868,12 +1023,16 @@ def test_softmaxcrossentropy(self): def test_softmaxcrossentropy_ignore_index(self): x = torch.randn(3, 5) y = torch.empty(3, dtype=torch.long).random_(5) - self.assertONNX(torch.nn.CrossEntropyLoss(ignore_index=1), (x, y), opset_version=12) + self.assertONNX( + torch.nn.CrossEntropyLoss(ignore_index=1), (x, y), opset_version=12 + ) def test_softmaxcrossentropy_weights(self): x = torch.randn(3, 5) y = torch.empty(3, dtype=torch.long).random_(5) - self.assertONNX(torch.nn.CrossEntropyLoss(weight=torch.randn(5)), (x, y), opset_version=12) + self.assertONNX( + torch.nn.CrossEntropyLoss(weight=torch.randn(5)), (x, y), opset_version=12 + ) def test_softmaxcrossentropy_3d(self): x = torch.randn(3, 5, 2) @@ -883,7 +1042,9 @@ def test_softmaxcrossentropy_3d(self): def test_softmaxcrossentropy_3d_none(self): x = torch.randn(3, 5, 2) y = torch.empty(3, 2, dtype=torch.long).random_(5) - self.assertONNX(torch.nn.CrossEntropyLoss(reduction="none"), (x, y), opset_version=12) + self.assertONNX( + 
torch.nn.CrossEntropyLoss(reduction="none"), (x, y), opset_version=12 + ) def test_softmaxcrossentropy_4d(self): x = torch.randn(3, 5, 2, 1) @@ -899,66 +1060,96 @@ def test_lstm_none_sequence_lens(self): class LSTMModel(torch.nn.Module): def __init__(self): super().__init__() - self.rnn = torch.nn.LSTM(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 1, bidirectional=False) + self.rnn = torch.nn.LSTM( + RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 1, bidirectional=False + ) def forward(self, x, h0, c0): a, b = self.rnn(x, (h0, c0)) return torch.ones(b[0].shape) - self.assertONNX(LSTMModel(), - (input, h0, c0), input_names=["x", "y"], - dynamic_axes={"x" : {0: 'batch'}}, opset_version=12) + self.assertONNX( + LSTMModel(), + (input, h0, c0), + input_names=["x", "y"], + dynamic_axes={"x": {0: "batch"}}, + opset_version=12, + ) def test_dynamic_axes_add(self): m1 = torch.randn(2, 3, requires_grad=True) m2 = torch.randn(2, 1, requires_grad=True) - self.assertONNX(lambda x, y: torch.add(x, y), (m1, m2), input_names=["input_1", "input_2"], - dynamic_axes={"input_1": {1: "dim_1"}, "input_2": {1: "dim_2"}}, - opset_version=12) + self.assertONNX( + lambda x, y: torch.add(x, y), + (m1, m2), + input_names=["input_1", "input_2"], + dynamic_axes={"input_1": {1: "dim_1"}, "input_2": {1: "dim_2"}}, + opset_version=12, + ) def test_dynamic_axes_add_inputs_same_symbolic_shape(self): m1 = torch.randn(2, 3, requires_grad=True) - self.assertONNX(lambda x: torch.add(x, x), (m1,), input_names=["input_1"], - dynamic_axes={"input_1": {1: "dim_1"}}, - opset_version=12) + self.assertONNX( + lambda x: torch.add(x, x), + (m1,), + input_names=["input_1"], + dynamic_axes={"input_1": {1: "dim_1"}}, + opset_version=12, + ) def test_dynamic_axes_matmul(self): m1 = torch.randn(2, 2, 4, requires_grad=True) m2 = torch.randn(2, 4, 3, requires_grad=True) - self.assertONNX(lambda x, y: torch.matmul(x, y), (m1, m2), input_names=["input_1", "input_2"], - dynamic_axes={"input_1": {1: "dim_0"}, "input_2": {2: "dim_1"}}, - opset_version=12) + self.assertONNX( + lambda x, y: torch.matmul(x, y), + (m1, m2), + input_names=["input_1", "input_2"], + dynamic_axes={"input_1": {1: "dim_0"}, "input_2": {2: "dim_1"}}, + opset_version=12, + ) def test_dynamic_axes_reduce_mean(self): m1 = torch.randn(2, 3, 4, requires_grad=True) - self.assertONNX(lambda x: torch.mean(x, dim=1), (m1), input_names=["input"], - dynamic_axes={"input": {1: "dim_1", 2: "dim_2"}}, - opset_version=12) + self.assertONNX( + lambda x: torch.mean(x, dim=1), + (m1), + input_names=["input"], + dynamic_axes={"input": {1: "dim_1", 2: "dim_2"}}, + opset_version=12, + ) def test_dynamic_axes_unchange(self): """Test ProcessUnchangeNode in symbolic shape inference.""" m1 = torch.randn(2, 3, requires_grad=True) - self.assertONNX(lambda x: torch.softmax(x, dim=0), (m1,), input_names=["input"], - dynamic_axes={"input": {1: "dim_1"}}, - opset_version=12) + self.assertONNX( + lambda x: torch.softmax(x, dim=0), + (m1,), + input_names=["input"], + dynamic_axes={"input": {1: "dim_1"}}, + opset_version=12, + ) def test_aten_embedding_1(self): _onnx_opset_version = 12 - @parse_args('v', 'v', 'i', 'b', 'b') + @parse_args("v", "v", "i", "b", "b") def embedding(g, weight, indices, padding_idx, scale_grad_by_freq, sparse): custom_attributes_json = ( - '{' + "{" f'"padding_idx":{str(padding_idx)},' f'"scale_grad_by_freq":{str(scale_grad_by_freq).lower()},' f'"sparse":{str(sparse).lower()}' - '}' + "}" + ) + output = g.at( + "embedding", + weight, + indices, + custom_attributes_json_s=custom_attributes_json, ) - output = 
g.op("com.microsoft::ATenOp", weight, indices, name_s='aten::embedding', - custom_attributes_json_s=custom_attributes_json) return output - register_custom_op_symbolic('::embedding', embedding, _onnx_opset_version) + register_custom_op_symbolic("::embedding", embedding, _onnx_opset_version) class Model(torch.nn.Module): def __init__(self): @@ -975,32 +1166,39 @@ def forward(self, x, y): y = torch.randn(1, 8) self.assertONNX(model, (x, y), opset_version=_onnx_opset_version) - unregister_custom_op_symbolic('::embedding', _onnx_opset_version) + unregister_custom_op_symbolic("::embedding", _onnx_opset_version) # This is test_aten_embedding_1 with shape inference on custom symbolic aten::embedding. + @skipIfCaffe2 def test_aten_embedding_2(self): _onnx_opset_version = 12 - @parse_args('v', 'v', 'i', 'b', 'b') + @parse_args("v", "v", "i", "b", "b") def embedding(g, weight, indices, padding_idx, scale_grad_by_freq, sparse): custom_attributes_json = ( - '{' + "{" f'"padding_idx":{str(padding_idx)},' f'"scale_grad_by_freq":{str(scale_grad_by_freq).lower()},' f'"sparse":{str(sparse).lower()}' - '}' + "}" + ) + output = g.at( + "embedding", + weight, + indices, + custom_attributes_json_s=custom_attributes_json, ) - output = g.op("com.microsoft::ATenOp", weight, indices, name_s='aten::embedding', - custom_attributes_json_s=custom_attributes_json) # do shape inference and set it via setType indices_shape = _get_tensor_sizes(indices) - if indices_shape is not None and hasattr(weight.type(), 'with_sizes'): - output_type = weight.type().with_sizes(indices_shape + [_get_tensor_dim_size(weight, 1)]) + if indices_shape is not None and hasattr(weight.type(), "with_sizes"): + output_type = weight.type().with_sizes( + indices_shape + [_get_tensor_dim_size(weight, 1)] + ) output.setType(output_type) return output - register_custom_op_symbolic('::embedding', embedding, _onnx_opset_version) + register_custom_op_symbolic("::embedding", embedding, _onnx_opset_version) class Model(torch.nn.Module): def __init__(self): @@ -1015,10 +1213,17 @@ def forward(self, x, y): model = Model() x = torch.ones(32, dtype=torch.long) y = torch.randn(1, 8) - self.assertONNX(model, (x, y), opset_version=_onnx_opset_version, input_names=['input_1', 'input_2'], - dynamic_axes={"input_1": {0: "dim_0"}, 'input_2': {0: "dim_1", 1: "dim_2"}}) - - unregister_custom_op_symbolic('::embedding', _onnx_opset_version) + self.assertONNX( + model, + (x, y), + opset_version=_onnx_opset_version, + input_names=["input_1", "input_2"], + dynamic_axes={"input_1": {0: "dim_0"}, "input_2": {0: "dim_1", 1: "dim_2"}}, + keep_initializers_as_inputs=False, + operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, + ) + + unregister_custom_op_symbolic("::embedding", _onnx_opset_version) # Without shapeValueMap, the onnx graph looks like: # graph(%0 : Float(*, 1, 128, 1, strides=[128, 128, 1, 1], requires_grad=0, device=cpu)): @@ -1048,12 +1253,17 @@ def forward(self, x): x = F.softmax(x, dim=1) x = x.reshape(batch, -1) return x + radix = 2 cardinality = 1 x = torch.randn(10, 1, 128, 1) - self.assertONNX(RSoftMax(radix, cardinality), (x,), - input_names=["x"], - dynamic_axes={"x": {0: "dim_0"}}) + self.assertONNX( + RSoftMax(radix, cardinality), + (x,), + input_names=["x"], + dynamic_axes={"x": {0: "dim_0"}}, + ) + if __name__ == "__main__": no_onnx_dep_flag = "--no-onnx" @@ -1067,6 +1277,9 @@ def forward(self, x): if _onnx_test: _onnx_dep = True import test_onnx_common - for d in glob.glob(os.path.join(test_onnx_common.pytorch_operator_dir, 
"test_operator_*")): + + for d in glob.glob( + os.path.join(test_onnx_common.pytorch_operator_dir, "test_operator_*") + ): shutil.rmtree(d) run_tests() diff --git a/test/onnx/test_pytorch_common.py b/test/onnx/test_pytorch_common.py index 13b4585a5def..44ccc303cff7 100644 --- a/test/onnx/test_pytorch_common.py +++ b/test/onnx/test_pytorch_common.py @@ -2,8 +2,9 @@ import functools import os -import unittest import sys +import unittest + import torch import torch.autograd.function as function @@ -29,18 +30,19 @@ def wrapper(*args, **kwargs): if condition(): raise unittest.SkipTest(reason) return f(*args, **kwargs) + return wrapper + return decorator -skipIfNoCuda = _skipper(lambda: not torch.cuda.is_available(), - "CUDA is not available") +skipIfNoCuda = _skipper(lambda: not torch.cuda.is_available(), "CUDA is not available") -skipIfTravis = _skipper(lambda: os.getenv("TRAVIS"), - "Skip In Travis") +skipIfTravis = _skipper(lambda: os.getenv("TRAVIS"), "Skip In Travis") -skipIfNoBFloat16Cuda = _skipper(lambda: not torch.cuda.is_bf16_supported(), - "BFloat16 CUDA is not available") +skipIfNoBFloat16Cuda = _skipper( + lambda: not torch.cuda.is_bf16_supported(), "BFloat16 CUDA is not available" +) # skips tests for all versions below min_opset_version. # if exporting the op is only supported after a specific version, @@ -50,48 +52,55 @@ def skipIfUnsupportedMinOpsetVersion(min_opset_version): def skip_dec(func): def wrapper(self): if self.opset_version < min_opset_version: - raise unittest.SkipTest("Skip verify test for unsupported opset_version") + raise unittest.SkipTest( + f"Unsupported opset_version: {self.opset_version} < {min_opset_version}" + ) return func(self) + return wrapper + return skip_dec -# skips tests for all versions above min_opset_version. -def skipIfUnsupportedMaxOpsetVersion(min_opset_version): + +# skips tests for all versions above max_opset_version. +def skipIfUnsupportedMaxOpsetVersion(max_opset_version): def skip_dec(func): def wrapper(self): - if self.opset_version > min_opset_version: - raise unittest.SkipTest("Skip verify test for unsupported opset_version") + if self.opset_version > max_opset_version: + raise unittest.SkipTest( + f"Unsupported opset_version: {self.opset_version} > {max_opset_version}" + ) return func(self) + return wrapper + return skip_dec + # skips tests for all opset versions. def skipForAllOpsetVersions(): def skip_dec(func): def wrapper(self): if self.opset_version: - raise unittest.SkipTest("Skip verify test for unsupported opset_version") + raise unittest.SkipTest( + "Skip verify test for unsupported opset_version" + ) return func(self) - return wrapper - return skip_dec -# Enables tests for scripting, instead of only tracing the model. -def enableScriptTest(): - def script_dec(func): - def wrapper(self): - self.is_script_test_enabled = True - return func(self) return wrapper - return script_dec + return skip_dec -# Disable tests for scripting. -def disableScriptTest(): + +# skips tests for scripting. 
+def skipScriptTest(min_opset_version=float("inf")): def script_dec(func): def wrapper(self): - self.is_script_test_enabled = False + self.is_script_test_enabled = self.opset_version >= min_opset_version return func(self) + return wrapper + return script_dec @@ -102,19 +111,15 @@ def skipIfUnsupportedOpsetVersion(unsupported_opset_versions): def skip_dec(func): def wrapper(self): if self.opset_version in unsupported_opset_versions: - raise unittest.SkipTest("Skip verify test for unsupported opset_version") + raise unittest.SkipTest( + "Skip verify test for unsupported opset_version" + ) return func(self) - return wrapper - return skip_dec -def skipIfONNXShapeInference(onnx_shape_inference): - def skip_dec(func): - def wrapper(self): - if self.onnx_shape_inference is onnx_shape_inference: - raise unittest.SkipTest("Skip verify test for unsupported opset_version") - return func(self) return wrapper + return skip_dec + def flatten(x): return tuple(function._iter_filter(lambda o: isinstance(o, torch.Tensor))(x)) diff --git a/test/onnx/test_pytorch_helper.py b/test/onnx/test_pytorch_helper.py index 3ffd88746ff3..ca6ad876b13b 100644 --- a/test/onnx/test_pytorch_helper.py +++ b/test/onnx/test_pytorch_helper.py @@ -1,24 +1,23 @@ # Owner(s): ["module: onnx"] # Some standard imports -import numpy as np -from torch import nn -import torch.onnx -import torch.nn.init as init -from caffe2.python.model_helper import ModelHelper -from pytorch_helper import PyTorchModule import unittest -from caffe2.python.core import workspace +import numpy as np +from pytorch_helper import PyTorchModule from test_pytorch_common import skipIfNoLapack +import torch.nn.init as init +import torch.onnx +from caffe2.python.core import workspace +from caffe2.python.model_helper import ModelHelper +from torch import nn -class TestCaffe2Backend(unittest.TestCase): +class TestCaffe2Backend(unittest.TestCase): @skipIfNoLapack @unittest.skip("test broken because Lapack was always missing.") def test_helper(self): - class SuperResolutionNet(nn.Module): def __init__(self, upscale_factor, inplace=False): super(SuperResolutionNet, self).__init__() @@ -27,7 +26,7 @@ def __init__(self, upscale_factor, inplace=False): self.conv1 = nn.Conv2d(1, 64, (5, 5), (1, 1), (2, 2)) self.conv2 = nn.Conv2d(64, 64, (3, 3), (1, 1), (1, 1)) self.conv3 = nn.Conv2d(64, 32, (3, 3), (1, 1), (1, 1)) - self.conv4 = nn.Conv2d(32, upscale_factor ** 2, (3, 3), (1, 1), (1, 1)) + self.conv4 = nn.Conv2d(32, upscale_factor**2, (3, 3), (1, 1), (1, 1)) self.pixel_shuffle = nn.PixelShuffle(upscale_factor) self._initialize_weights() @@ -53,7 +52,7 @@ def _initialize_weights(self): helper = ModelHelper(name="test_model") start = helper.Sigmoid(["the_input"]) # Embed the ONNX-converted pytorch net inside it - toutput, = PyTorchModule(helper, torch_model, (fake_input,), [start]) + (toutput,) = PyTorchModule(helper, torch_model, (fake_input,), [start]) output = helper.Sigmoid(toutput) workspace.RunNetOnce(helper.InitProto()) diff --git a/test/onnx/test_pytorch_jit_onnx.py b/test/onnx/test_pytorch_jit_onnx.py new file mode 100644 index 000000000000..aaa842d171a3 --- /dev/null +++ b/test/onnx/test_pytorch_jit_onnx.py @@ -0,0 +1,97 @@ +# Owner(s): ["module: onnx"] +import unittest + +import onnxruntime +from test_pytorch_onnx_onnxruntime import ort_compare_with_pytorch, run_ort + +import torch +from torch._C import parse_ir + + +def _jit_graph_to_onnx_model(graph, operator_export_type, opset_version): + r""" + This function exports torch::jit::Graph object + to serialized ONNX 
ModelProto. + This function is for testing purpose. + It only keeps the essential parts for IR graph conversions. + It also does not interact with actual PyTorch modules nor + PyTorch tensor inputs. + """ + from torch.onnx.symbolic_helper import ( + _set_onnx_shape_inference, + _set_opset_version, + ) + from torch.onnx.utils import _optimize_graph + + # Shape inference is required because some ops' symbolic functions + # generate sub-graphs based on inputs' types. + _set_onnx_shape_inference(True) + _set_opset_version(opset_version) + graph = _optimize_graph(graph, operator_export_type, params_dict={}) + proto, _, _, _ = graph._export_onnx( + {}, + opset_version, + {}, + False, + operator_export_type, + False, + False, + {}, + True, + "", + {}, + ) + return proto + + +class _TestJITIRToONNX: + """Abstract base class for test cases. + + Intentionally not a sub-class of unittest.TestCase so that unittest / pytest + don't run it directly. unitest.TestCase is mixed in as another base class when + creating concrete sub-types. See MakeTestCase(). + """ + + opset_version = -1 # Sub-classes must override + ort_providers = ["CPUExecutionProvider"] + + def run_test(self, graph_ir, example_inputs): + graph = parse_ir(graph_ir) + jit_outs = torch._C._jit_interpret_graph(graph, example_inputs) + + onnx_proto = _jit_graph_to_onnx_model( + graph, torch.onnx.OperatorExportTypes.ONNX, self.opset_version + ) + ort_sess = onnxruntime.InferenceSession( + onnx_proto, providers=self.ort_providers + ) + ort_outs = run_ort(ort_sess, example_inputs) + + ort_compare_with_pytorch(ort_outs, jit_outs, rtol=1e-3, atol=1e-7) + + def test_example_ir(self): + graph_ir = """ + graph(%1 : Float(2, 3), + %2 : Float(2, 3)): + %3 : int = prim::Constant[value=1]() + %4 : Float(2, 3) = aten::add(%1, %2, %3) + return (%4) + """ + a = torch.randn(2, 3) + b = torch.randn(2, 3) + self.run_test(graph_ir, (a, b)) + + +def MakeTestCase(opset_version: int) -> type: + name = f"TestJITIRToONNX_opset{opset_version}" + return type( + str(name), + (unittest.TestCase,), + dict(_TestJITIRToONNX.__dict__, opset_version=opset_version), + ) + + +TestJITIRToONNX_opset14 = MakeTestCase(14) + +if __name__ == "__main__": + unittest.main() diff --git a/test/onnx/test_pytorch_onnx_caffe2.py b/test/onnx/test_pytorch_onnx_caffe2.py index 77c2b85f27f0..79ae0a36f37b 100644 --- a/test/onnx/test_pytorch_onnx_caffe2.py +++ b/test/onnx/test_pytorch_onnx_caffe2.py @@ -1,21 +1,37 @@ # Owner(s): ["module: onnx"] -from typing import Tuple import io import itertools import sys import unittest +from typing import Tuple +import model_defs.dcgan as dcgan +import model_defs.word_language_model as word_language_model import numpy as np - +import onnx +import verify from debug_embed_params import run_embed_params -from torch import nn -from torch.autograd import Variable, function -from torch.nn.utils import rnn as rnn_utils -from torch.onnx import ExportTypes -import torch.onnx -import torch.onnx.operators -import torch.utils.model_zoo as model_zoo +from model_defs.lstm_flattening_result import LstmFlatteningResult +from model_defs.mnist import MNIST +from model_defs.rnn_model_with_packed_sequence import ( + RnnModelWithPackedSequence, +) +from model_defs.squeezenet import SqueezeNet +from model_defs.srresnet import SRResNet +from model_defs.super_resolution import SuperResolutionNet +from test_pytorch_common import ( + BATCH_SIZE, + RNN_BATCH_SIZE, + RNN_HIDDEN_SIZE, + RNN_INPUT_SIZE, + RNN_SEQUENCE_LENGTH, + skipIfNoCuda, + skipIfNoLapack, + skipIfTravis, + 
skipIfUnsupportedMinOpsetVersion, + skipIfUnsupportedOpsetVersion, +) # Import various models for testing from torchvision.models.alexnet import alexnet @@ -24,24 +40,18 @@ from torchvision.models.resnet import resnet50 from torchvision.models.vgg import vgg16, vgg16_bn, vgg19, vgg19_bn -from model_defs.squeezenet import SqueezeNet -from model_defs.super_resolution import SuperResolutionNet -from model_defs.srresnet import SRResNet -import model_defs.dcgan as dcgan -import model_defs.word_language_model as word_language_model -from model_defs.mnist import MNIST -from model_defs.lstm_flattening_result import LstmFlatteningResult -from model_defs.rnn_model_with_packed_sequence import RnnModelWithPackedSequence -from caffe2.python.operator_test.torch_integration_test import (generate_rois_rotated, - create_bbox_transform_inputs) - -import onnx import caffe2.python.onnx.backend as c2 - -from test_pytorch_common import skipIfTravis, skipIfNoLapack, skipIfNoCuda -from test_pytorch_common import BATCH_SIZE, RNN_BATCH_SIZE, RNN_SEQUENCE_LENGTH, RNN_INPUT_SIZE, RNN_HIDDEN_SIZE -from test_pytorch_common import skipIfUnsupportedOpsetVersion, skipIfUnsupportedMinOpsetVersion -import verify +import torch.onnx +import torch.onnx.operators +import torch.utils.model_zoo as model_zoo +from caffe2.python.operator_test.torch_integration_test import ( + create_bbox_transform_inputs, + generate_rois_rotated, +) +from torch import nn +from torch.autograd import Variable, function +from torch.nn.utils import rnn as rnn_utils +from torch.onnx import ExportTypes skip = unittest.skip @@ -51,15 +61,19 @@ def wrapper(self): if self.embed_params: raise unittest.SkipTest("Skip embed_params verify test") return func(self) + return wrapper + def skipIfNoEmbed(func): def wrapper(self): if not self.embed_params: raise unittest.SkipTest("Skip debug embed_params test") return func(self) + return wrapper + # def import_model(proto, input, workspace=None, use_gpu=True): # model_def = onnx.ModelProto.FromString(proto) # onnx.checker.check_model(model_def) @@ -117,8 +131,7 @@ def do_export(model, inputs, *args, **kwargs): class TestCaffe2Backend_opset9(unittest.TestCase): - from torch.onnx.symbolic_helper import _export_onnx_opset_version - opset_version = _export_onnx_opset_version + opset_version = 9 embed_params = False def setUp(self): @@ -132,12 +145,20 @@ def convert_cuda(self, model, input): # input might be nested - we want to move everything to GPU cuda_input = function._nested_map( lambda o: isinstance(o, Variable) or isinstance(o, torch.Tensor), - lambda o: o.cuda())(input) + lambda o: o.cuda(), + )(input) return cuda_model, cuda_input - def run_debug_test(self, model, train, batch_size, state_dict=None, - input=None, use_gpu=True, - operator_export_type=torch.onnx.OperatorExportTypes.ONNX): + def run_debug_test( + self, + model, + train, + batch_size, + state_dict=None, + input=None, + use_gpu=True, + operator_export_type=torch.onnx.OperatorExportTypes.ONNX, + ): """ # TODO: remove this from the final release version This test is for our debugging only for the case where @@ -154,12 +175,17 @@ def run_debug_test(self, model, train, batch_size, state_dict=None, if use_gpu: model, input = self.convert_cuda(model, input) - onnxir, torch_out = do_export(model, input, export_params=self.embed_params, verbose=False, - do_constant_folding=False, - opset_version=self.opset_version, - keep_initializers_as_inputs=True, - add_node_names=False, - operator_export_type=operator_export_type) + onnxir, torch_out = do_export( + model, 
+ input, + export_params=self.embed_params, + verbose=False, + do_constant_folding=False, + opset_version=self.opset_version, + keep_initializers_as_inputs=True, + add_node_names=False, + operator_export_type=operator_export_type, + ) if isinstance(torch_out, torch.autograd.Variable): torch_out = (torch_out,) @@ -167,12 +193,22 @@ def run_debug_test(self, model, train, batch_size, state_dict=None, for _, (x, y) in enumerate(zip(torch_out, caffe2_out)): np.testing.assert_almost_equal(x.data.cpu().numpy(), y, decimal=3) - def run_actual_test(self, model, train, batch_size, state_dict=None, - input=None, use_gpu=True, rtol=0.001, atol=1e-7, - do_constant_folding=True, - operator_export_type=torch.onnx.OperatorExportTypes.ONNX, - input_names=None, dynamic_axes=None, - remained_onnx_input_idx=None): + def run_actual_test( + self, + model, + train, + batch_size, + state_dict=None, + input=None, + use_gpu=True, + rtol=0.001, + atol=1e-7, + do_constant_folding=True, + operator_export_type=torch.onnx.OperatorExportTypes.ONNX, + input_names=None, + dynamic_axes=None, + remained_onnx_input_idx=None, + ): """ This is what the user facing version will look like """ @@ -191,37 +227,68 @@ def run_actual_test(self, model, train, batch_size, state_dict=None, model, input = self.convert_cuda(model, input) # Verify the model runs the same in Caffe2 - verify.verify(model, input, c2, rtol=rtol, atol=atol, - do_constant_folding=do_constant_folding, - opset_version=self.opset_version, - keep_initializers_as_inputs=True, - operator_export_type=operator_export_type, - input_names=input_names, - dynamic_axes=dynamic_axes, - remained_onnx_input_idx=remained_onnx_input_idx) - - def run_model_test(self, model, train, batch_size, state_dict=None, - input=None, use_gpu=True, rtol=0.001, atol=1e-7, - do_constant_folding=True, - operator_export_type=torch.onnx.OperatorExportTypes.ONNX, - input_names=None, dynamic_axes=None, - remained_onnx_input_idx=None): + verify.verify( + model, + input, + c2, + rtol=rtol, + atol=atol, + do_constant_folding=do_constant_folding, + opset_version=self.opset_version, + keep_initializers_as_inputs=True, + operator_export_type=operator_export_type, + input_names=input_names, + dynamic_axes=dynamic_axes, + remained_onnx_input_idx=remained_onnx_input_idx, + ) + + def run_model_test( + self, + model, + train, + batch_size, + state_dict=None, + input=None, + use_gpu=True, + rtol=0.001, + atol=1e-7, + do_constant_folding=True, + operator_export_type=torch.onnx.OperatorExportTypes.ONNX, + input_names=None, + dynamic_axes=None, + remained_onnx_input_idx=None, + ): use_gpu_ = torch.cuda.is_available() and use_gpu # NOTE: do_constant_folding is turned on only when model has # parameters embedded (which are needed for constant folding), # i.e. for self.embed_params=True case. self.embed_params=True # for the TestCaffe2BackendEmbed class defined at the bottom. 
if self.embed_params: - self.run_actual_test(model, train, batch_size, state_dict, input, - use_gpu=use_gpu_, rtol=rtol, atol=atol, - do_constant_folding=do_constant_folding, - operator_export_type=operator_export_type, - input_names=input_names, - dynamic_axes=dynamic_axes, - remained_onnx_input_idx=remained_onnx_input_idx) + self.run_actual_test( + model, + train, + batch_size, + state_dict, + input, + use_gpu=use_gpu_, + rtol=rtol, + atol=atol, + do_constant_folding=do_constant_folding, + operator_export_type=operator_export_type, + input_names=input_names, + dynamic_axes=dynamic_axes, + remained_onnx_input_idx=remained_onnx_input_idx, + ) else: - self.run_debug_test(model, train, batch_size, state_dict, input, - use_gpu=use_gpu_, operator_export_type=operator_export_type) + self.run_debug_test( + model, + train, + batch_size, + state_dict, + input, + use_gpu=use_gpu_, + operator_export_type=operator_export_type, + ) def test_linear(self): class MyModel(torch.nn.Module): @@ -259,9 +326,15 @@ def forward(self, input): # Note that the export call explicitly sets the names of not just the input, # but also the parameters. This test checks that the model can be loaded and # executed in Caffe2 backend correctly. - torch.onnx._export(model, input, f, verbose=True, export_type=ExportTypes.ZIP_ARCHIVE, - input_names=["input1", "parameter1", "parameter2"], - keep_initializers_as_inputs=True) + torch.onnx._export( + model, + input, + f, + verbose=True, + export_type=ExportTypes.ZIP_ARCHIVE, + input_names=["input1", "parameter1", "parameter2"], + keep_initializers_as_inputs=True, + ) f.seek(0) model_c2 = c2.prepare_zip_archive(f) @@ -286,9 +359,15 @@ def forward(self, input): # But note that the target first parameter name is the same as the second parameter name. # This test checks that given this edge condition, the model can be loaded and executed # in Caffe2 backend correctly. 
- torch.onnx._export(model, input, f, verbose=True, export_type=ExportTypes.ZIP_ARCHIVE, - input_names=["input1", "fc1.bias"], - keep_initializers_as_inputs=True) + torch.onnx._export( + model, + input, + f, + verbose=True, + export_type=ExportTypes.ZIP_ARCHIVE, + input_names=["input1", "fc1.bias"], + keep_initializers_as_inputs=True, + ) f.seek(0) model_c2 = c2.prepare_zip_archive(f) @@ -300,13 +379,21 @@ def test_lstm_cell(self): input = torch.randn(BATCH_SIZE, RNN_INPUT_SIZE) h0 = torch.randn(BATCH_SIZE, RNN_HIDDEN_SIZE) c0 = torch.randn(BATCH_SIZE, RNN_HIDDEN_SIZE) - self.run_model_test(model, train=False, batch_size=BATCH_SIZE, input=(input, (h0, c0)), use_gpu=False) + self.run_model_test( + model, + train=False, + batch_size=BATCH_SIZE, + input=(input, (h0, c0)), + use_gpu=False, + ) def test_gru_cell(self): model = nn.GRUCell(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE) input = torch.randn(BATCH_SIZE, RNN_INPUT_SIZE) h0 = torch.randn(BATCH_SIZE, RNN_HIDDEN_SIZE) - self.run_model_test(model, train=False, batch_size=BATCH_SIZE, input=(input, h0), use_gpu=False) + self.run_model_test( + model, train=False, batch_size=BATCH_SIZE, input=(input, h0), use_gpu=False + ) def _dispatch_rnn_test(self, name, *args, **kwargs): if name == "elman": @@ -316,15 +403,25 @@ def _dispatch_rnn_test(self, name, *args, **kwargs): if name == "gru": self._gru_test(*args, **kwargs) - def _elman_rnn_test(self, layers, nonlinearity, bidirectional, - initial_state, packed_sequence, dropout): + def _elman_rnn_test( + self, + layers, + nonlinearity, + bidirectional, + initial_state, + packed_sequence, + dropout, + ): batch_first = True if packed_sequence == 2 else False - model = nn.RNN(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, - layers, - nonlinearity=nonlinearity, - bidirectional=bidirectional, - dropout=dropout, - batch_first=batch_first) + model = nn.RNN( + RNN_INPUT_SIZE, + RNN_HIDDEN_SIZE, + layers, + nonlinearity=nonlinearity, + bidirectional=bidirectional, + dropout=dropout, + batch_first=batch_first, + ) if packed_sequence == 1: model = RnnModelWithPackedSequence(model, False) @@ -352,24 +449,41 @@ def make_input(batch_size): return input input = make_input(RNN_BATCH_SIZE) - self.run_model_test(model, train=False, batch_size=RNN_BATCH_SIZE, input=input, use_gpu=False, atol=1e-7) + self.run_model_test( + model, + train=False, + batch_size=RNN_BATCH_SIZE, + input=input, + use_gpu=False, + atol=1e-7, + ) # test that the model still runs with a different batch size # (save the model with a batch_size of 1 with rnn with a variable batch size, # otherwise expand will fail) variable_batch_size_init_input = make_input(1) # Constant folding works when model has parameters embedded. 
For this case, we need to disable it - onnxir, _ = do_export(model, variable_batch_size_init_input, keep_initializers_as_inputs=True, - do_constant_folding=False) + onnxir, _ = do_export( + model, + variable_batch_size_init_input, + keep_initializers_as_inputs=True, + do_constant_folding=False, + ) other_input = make_input(RNN_BATCH_SIZE + 1) _ = run_embed_params(onnxir, model, other_input, use_gpu=False) - def _lstm_test(self, layers, bidirectional, initial_state, - packed_sequence, dropout): + def _lstm_test( + self, layers, bidirectional, initial_state, packed_sequence, dropout + ): batch_first = True if packed_sequence == 2 else False model = LstmFlatteningResult( - RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, layers, - bidirectional=bidirectional, dropout=dropout, batch_first=batch_first) + RNN_INPUT_SIZE, + RNN_HIDDEN_SIZE, + layers, + bidirectional=bidirectional, + dropout=dropout, + batch_first=batch_first, + ) if packed_sequence == 1: model = RnnModelWithPackedSequence(model, False) if packed_sequence == 2: @@ -397,23 +511,34 @@ def make_input(batch_size): return input input = make_input(RNN_BATCH_SIZE) - self.run_model_test(model, train=False, batch_size=RNN_BATCH_SIZE, input=input, use_gpu=False) + self.run_model_test( + model, train=False, batch_size=RNN_BATCH_SIZE, input=input, use_gpu=False + ) # test that the model still runs with a different batch size # (save the model with a batch_size of 1 with rnn with a variable batch size, # otherwise expand will fail) variable_batch_size_init_input = make_input(1) # Constant folding works when model has parameters embedded. For this case, we need to disable it - onnxir, _ = do_export(model, variable_batch_size_init_input, keep_initializers_as_inputs=True, - do_constant_folding=False) + onnxir, _ = do_export( + model, + variable_batch_size_init_input, + keep_initializers_as_inputs=True, + do_constant_folding=False, + ) other_input = make_input(RNN_BATCH_SIZE + 1) _ = run_embed_params(onnxir, model, other_input, use_gpu=False) - def _gru_test(self, layers, bidirectional, initial_state, - packed_sequence, dropout): + def _gru_test(self, layers, bidirectional, initial_state, packed_sequence, dropout): batch_first = True if packed_sequence == 2 else False - model = nn.GRU(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, layers, - bidirectional=bidirectional, dropout=dropout, batch_first=batch_first) + model = nn.GRU( + RNN_INPUT_SIZE, + RNN_HIDDEN_SIZE, + layers, + bidirectional=bidirectional, + dropout=dropout, + batch_first=batch_first, + ) if packed_sequence == 1: model = RnnModelWithPackedSequence(model, False) if packed_sequence == 2: @@ -440,15 +565,21 @@ def make_input(batch_size): return input input = make_input(RNN_BATCH_SIZE) - self.run_model_test(model, train=False, batch_size=RNN_BATCH_SIZE, input=input, use_gpu=False) + self.run_model_test( + model, train=False, batch_size=RNN_BATCH_SIZE, input=input, use_gpu=False + ) # test that the model still runs with a different batch size # (save the model with a batch_size of 1 with rnn with a variable batch size, # otherwise expand will fail) variable_batch_size_init_input = make_input(1) # Constant folding works when model has parameters embedded. 
For this case, we need to disable it - onnxir, _ = do_export(model, variable_batch_size_init_input, keep_initializers_as_inputs=True, - do_constant_folding=False) + onnxir, _ = do_export( + model, + variable_batch_size_init_input, + keep_initializers_as_inputs=True, + do_constant_folding=False, + ) other_input = make_input(RNN_BATCH_SIZE + 1) _ = run_embed_params(onnxir, model, other_input, use_gpu=False) @@ -463,9 +594,15 @@ def test_rnn_init_predict_split(self): # Test that we are correctly splitting between init and # predict net. When we embed parameters, there should be more # ops in the init net. - mp = onnx.ModelProto.FromString(do_export(model, input, export_params=self.embed_params, - keep_initializers_as_inputs=True, - do_constant_folding=False)[0]) + mp = onnx.ModelProto.FromString( + do_export( + model, + input, + export_params=self.embed_params, + keep_initializers_as_inputs=True, + do_constant_folding=False, + )[0] + ) prepared = c2.prepare(mp, device="CPU") if self.embed_params: assert len(prepared.init_net.op) == 950 @@ -476,8 +613,13 @@ def test_rnn_init_predict_split(self): def test_alexnet(self): state_dict = model_zoo.load_url(model_urls["alexnet"], progress=False) - self.run_model_test(alexnet(), train=False, batch_size=BATCH_SIZE, - state_dict=state_dict, atol=1e-3) + self.run_model_test( + alexnet(), + train=False, + batch_size=BATCH_SIZE, + state_dict=state_dict, + atol=1e-3, + ) @skipIfNoCuda def test_dcgan(self): @@ -490,23 +632,35 @@ def test_dcgan(self): netD = dcgan._netD(1) netD.apply(dcgan.weights_init) input = torch.randn(BATCH_SIZE, 3, dcgan.imgsz, dcgan.imgsz) - self.run_model_test(netD, train=False, batch_size=BATCH_SIZE, - input=input) + self.run_model_test(netD, train=False, batch_size=BATCH_SIZE, input=input) netG = dcgan._netG(1) netG.apply(dcgan.weights_init) state_dict = model_zoo.load_url(model_urls["dcgan_b"], progress=False) # state_dict = model_zoo.load_url(model_urls["dcgan_f"], progress=False) noise = torch.randn(BATCH_SIZE, dcgan.nz, 1, 1).normal_(0, 1) - self.run_model_test(netG, train=False, batch_size=BATCH_SIZE, - input=noise, state_dict=state_dict, rtol=1e-2, atol=1e-6) + self.run_model_test( + netG, + train=False, + batch_size=BATCH_SIZE, + input=noise, + state_dict=state_dict, + rtol=1e-2, + atol=1e-6, + ) - @unittest.skipIf(not torch.cuda.is_available(), - "model on net has cuda in it, awaiting fix") + @unittest.skipIf( + not torch.cuda.is_available(), "model on net has cuda in it, awaiting fix" + ) def test_densenet(self): state_dict = model_zoo.load_url(model_urls["densenet121"], progress=False) - self.run_model_test(densenet121(), train=False, batch_size=BATCH_SIZE, - state_dict=state_dict, atol=1e-7) + self.run_model_test( + densenet121(), + train=False, + batch_size=BATCH_SIZE, + state_dict=state_dict, + atol=1e-7, + ) @skip("doesn't match exactly...") # TODO: figure out the numerical instabilities @@ -514,33 +668,48 @@ def test_inception(self): x = torch.randn(BATCH_SIZE, 3, 299, 299, requires_grad=True) # state_dict = model_zoo.load_url(model_urls["inception_v3_google"], progress=False) state_dict = None - self.run_model_test(inception_v3(), train=False, batch_size=BATCH_SIZE, - state_dict=state_dict, input=x) + self.run_model_test( + inception_v3(), + train=False, + batch_size=BATCH_SIZE, + state_dict=state_dict, + input=x, + ) @skipIfNoEmbed def test_resnet(self): state_dict = model_zoo.load_url(model_urls["resnet50"], progress=False) - self.run_model_test(resnet50(), train=False, batch_size=BATCH_SIZE, - 
state_dict=state_dict, atol=1e-5) + self.run_model_test( + resnet50(), + train=False, + batch_size=BATCH_SIZE, + state_dict=state_dict, + atol=1e-5, + ) def test_squeezenet(self): sqnet_v1_1 = SqueezeNet(version=1.1) state_dict = model_zoo.load_url(model_urls["squeezenet1_1"], progress=False) # state_dict = model_zoo.load_url(model_urls["squeezenet1_0"], progress=False) - self.run_model_test(sqnet_v1_1, train=False, batch_size=BATCH_SIZE, - state_dict=state_dict) + self.run_model_test( + sqnet_v1_1, train=False, batch_size=BATCH_SIZE, state_dict=state_dict + ) # @skip("takes long to run, LAPACK needed for gpu") @skipIfNoLapack @unittest.skip("This model takes too much memory") def test_srresnet(self): - super_resolution_net = SRResNet( - rescale_factor=4, n_filters=64, n_blocks=8) + super_resolution_net = SRResNet(rescale_factor=4, n_filters=64, n_blocks=8) state_dict = model_zoo.load_url(model_urls["srresNet"], progress=False) x = torch.randn(1, 3, 224, 224, requires_grad=True) - self.run_model_test(super_resolution_net, train=False, - batch_size=1, state_dict=state_dict, - input=x, use_gpu=False) + self.run_model_test( + super_resolution_net, + train=False, + batch_size=1, + state_dict=state_dict, + input=x, + use_gpu=False, + ) @skipIfTravis @skipIfNoLapack @@ -549,31 +718,37 @@ def test_super_resolution(self): super_resolution_net = SuperResolutionNet(upscale_factor=3) state_dict = model_zoo.load_url(model_urls["super_resolution"], progress=False) x = torch.randn(1, 1, 224, 224, requires_grad=True) - self.run_model_test(super_resolution_net, train=False, - batch_size=BATCH_SIZE, state_dict=state_dict, - input=x, use_gpu=False, atol=1e-6) + self.run_model_test( + super_resolution_net, + train=False, + batch_size=BATCH_SIZE, + state_dict=state_dict, + input=x, + use_gpu=False, + atol=1e-6, + ) @unittest.skip("This model takes too much memory") def test_vgg16(self): state_dict = model_zoo.load_url(model_urls["vgg16"], progress=False) - self.run_model_test(vgg16(), train=False, batch_size=BATCH_SIZE, - state_dict=state_dict) + self.run_model_test( + vgg16(), train=False, batch_size=BATCH_SIZE, state_dict=state_dict + ) @skip("disable to run tests faster...") def test_vgg16_bn(self): - self.run_model_test(vgg16_bn(), train=False, - batch_size=BATCH_SIZE) + self.run_model_test(vgg16_bn(), train=False, batch_size=BATCH_SIZE) @skip("disable to run tests faster...") def test_vgg19(self): state_dict = model_zoo.load_url(model_urls["vgg19"], progress=False) - self.run_model_test(vgg19(), train=False, batch_size=BATCH_SIZE, - state_dict=state_dict) + self.run_model_test( + vgg19(), train=False, batch_size=BATCH_SIZE, state_dict=state_dict + ) @skip("disable to run tests faster...") def test_vgg19_bn(self): - self.run_model_test(vgg19_bn(), train=False, - batch_size=BATCH_SIZE) + self.run_model_test(vgg19_bn(), train=False, batch_size=BATCH_SIZE) def run_word_language_model(self, model_name): ntokens = 50 @@ -583,13 +758,18 @@ def run_word_language_model(self, model_name): dropout = 0.2 tied = False batchsize = 5 - model = word_language_model.RNNModel(model_name, ntokens, emsize, - nhid, nlayers, dropout, tied, - batchsize) + model = word_language_model.RNNModel( + model_name, ntokens, emsize, nhid, nlayers, dropout, tied, batchsize + ) x = torch.arange(0, ntokens).long().view(-1, batchsize) # Only support CPU version, since tracer is not working in GPU RNN. 
- self.run_model_test(model, train=False, input=(x, model.hidden), - batch_size=batchsize, use_gpu=False) + self.run_model_test( + model, + train=False, + input=(x, model.hidden), + batch_size=batchsize, + use_gpu=False, + ) @unittest.skip("Disabled due to onnx optimizer deprecation") @skipIfUnsupportedOpsetVersion([10]) @@ -705,20 +885,49 @@ def test_tensor_index_newaxis(self): def test_tensor_index_advanced_indexing(self): self._test_index_generic( - lambda input: input[:, torch.tensor([[0, 2], [1, 1]]), :, torch.tensor([2, 1]), torch.tensor([0, 3])]) + lambda input: input[ + :, + torch.tensor([[0, 2], [1, 1]]), + :, + torch.tensor([2, 1]), + torch.tensor([0, 3]), + ] + ) @skipIfUnsupportedOpsetVersion([10]) def test_tensor_index_advanced_indexing_with_slice(self): - self._test_index_generic(lambda input: input[:, torch.tensor([0, 2]), None, 2:4, torch.tensor([[1, 3], [4, 0]])]) - self._test_index_generic(lambda input: input[:, torch.tensor([0, 2]), torch.tensor([1]), 2:4, torch.tensor([[1], [4]])]) + self._test_index_generic( + lambda input: input[ + :, torch.tensor([0, 2]), None, 2:4, torch.tensor([[1, 3], [4, 0]]) + ] + ) + self._test_index_generic( + lambda input: input[ + :, + torch.tensor([0, 2]), + torch.tensor([1]), + 2:4, + torch.tensor([[1], [4]]), + ] + ) def test_tensor_index_advanced_indexing_consecutive(self): - self._test_index_generic(lambda input: input[:, torch.tensor([0, 2]), torch.tensor([[1, 3], [4, 0]]), None]) + self._test_index_generic( + lambda input: input[ + :, torch.tensor([0, 2]), torch.tensor([[1, 3], [4, 0]]), None + ] + ) @skipIfUnsupportedMinOpsetVersion(9) def test_tensor_index_advanced_indexing_masked(self): self._test_index_generic( - lambda input: input[:, torch.tensor([1, 0, 1, 0], dtype=torch.uint8), torch.tensor([[1, 3], [4, 0]]), None]) + lambda input: input[ + :, + torch.tensor([1, 0, 1, 0], dtype=torch.uint8), + torch.tensor([[1, 3], [4, 0]]), + None, + ] + ) def test_chunk(self): class MyModel(torch.nn.Module): @@ -729,6 +938,7 @@ def forward(self, input): # TODO: Why index? This returns a tuple and test runner doesn't # support tuple comparison. return input.chunk(8, dim=2)[-1] + self.run_model_test(MyModel(), train=False, batch_size=BATCH_SIZE) def test_sqrt(self): @@ -738,6 +948,7 @@ def __init__(self): def forward(self, input): return input.sqrt() + input = torch.empty(BATCH_SIZE, 10, 10).uniform_(4, 9) self.run_model_test(MyModel(), train=False, input=input, batch_size=BATCH_SIZE) @@ -756,6 +967,7 @@ def __init__(self): def forward(self, input): return input.log() + input = torch.empty(BATCH_SIZE, 10, 10).uniform_(4, 9) self.run_model_test(MyModel(), train=False, input=input, batch_size=BATCH_SIZE) @@ -767,6 +979,7 @@ def __init__(self): def forward(self, input): return input.erf() + input = torch.empty(BATCH_SIZE, 10, 10).uniform_(4, 9) self.run_model_test(MyModel(), train=False, input=input, batch_size=BATCH_SIZE) @@ -778,8 +991,11 @@ def __init__(self): def forward(self, input): return getattr(input, name)() + input = torch.empty(BATCH_SIZE, 10, 10).uniform_() - self.run_model_test(MyModel(), train=False, input=input, batch_size=BATCH_SIZE) + self.run_model_test( + MyModel(), train=False, input=input, batch_size=BATCH_SIZE + ) test_func("cos") test_func("sin") @@ -797,6 +1013,7 @@ def forward(self, input): # TODO: Why index? This returns a tuple and test runner doesn't # support tuple comparison. 
return input + 1 + self.run_model_test(MyModel(), train=False, batch_size=BATCH_SIZE) def test_subconstant(self): @@ -808,6 +1025,7 @@ def forward(self, input): # TODO: Why index? This returns a tuple and test runner doesn't # support tuple comparison. return input - 1 + self.run_model_test(MyModel(), train=False, batch_size=BATCH_SIZE) def test_arithmetic(self): @@ -820,7 +1038,9 @@ def forward(self, x): return x x = torch.randn(2, 3, 4) - self.run_model_test(ArithmeticModule(), input=x, train=False, batch_size=BATCH_SIZE) + self.run_model_test( + ArithmeticModule(), input=x, train=False, batch_size=BATCH_SIZE + ) def test_embedding(self): model = nn.Embedding(10, 3, padding_idx=-1) @@ -938,17 +1158,20 @@ def test_adaptive_max_pool3D(self): def test_weight_norm(self): model = nn.utils.weight_norm(nn.Conv1d(1, 1, 3)) input = torch.randn(1, 1, 5, requires_grad=True) - self.run_model_test( - model, train=True, batch_size=0, input=input, use_gpu=False - ) + self.run_model_test(model, train=True, batch_size=0, input=input, use_gpu=False) def test_mnist(self): model = MNIST() input = torch.randn(BATCH_SIZE, 1, 28, 28) state_dict = None # TODO: test with state_dict - self.run_model_test(model, train=False, input=input, batch_size=BATCH_SIZE, - state_dict=state_dict) + self.run_model_test( + model, + train=False, + input=input, + batch_size=BATCH_SIZE, + state_dict=state_dict, + ) def test_mm(self): class MyModel(torch.nn.Module): @@ -957,9 +1180,12 @@ def __init__(self): def forward(self, m1, m2): return torch.mm(m1, m2) + m1 = torch.randn(3, 4) m2 = torch.randn(4, 5) - self.run_model_test(MyModel(), train=False, input=(m1, m2), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + MyModel(), train=False, input=(m1, m2), batch_size=BATCH_SIZE, use_gpu=False + ) def test_addmm(self): class MyModel(torch.nn.Module): @@ -968,10 +1194,17 @@ def __init__(self): def forward(self, ma, m1, m2): return torch.addmm(ma, m1, m2) + ma = torch.randn(5) m1 = torch.randn(3, 4) m2 = torch.randn(4, 5) - self.run_model_test(MyModel(), train=False, input=(ma, m1, m2), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + MyModel(), + train=False, + input=(ma, m1, m2), + batch_size=BATCH_SIZE, + use_gpu=False, + ) def test_fuse_addmm(self): class AddmmModel(torch.nn.Module): @@ -979,7 +1212,9 @@ def forward(self, x): return torch.mm(x, x) + x x = torch.randn(3, 3) - self.run_model_test(AddmmModel(), train=False, input=x, batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + AddmmModel(), train=False, input=x, batch_size=BATCH_SIZE, use_gpu=False + ) def test_scalar_type(self): class ArithmeticModel(torch.nn.Module): @@ -987,14 +1222,18 @@ def forward(self, x): return x.size(0) * 2 * x x = torch.ones(2, 3, dtype=torch.float32) - self.run_model_test(ArithmeticModel(), input=x, train=False, batch_size=BATCH_SIZE) + self.run_model_test( + ArithmeticModel(), input=x, train=False, batch_size=BATCH_SIZE + ) class ReciprocalModel(torch.nn.Module): def forward(self, x): return torch.reciprocal(x) x = torch.tensor([2.0, 4.0], dtype=torch.double) - self.run_model_test(ReciprocalModel(), input=x, train=False, batch_size=BATCH_SIZE) + self.run_model_test( + ReciprocalModel(), input=x, train=False, batch_size=BATCH_SIZE + ) class ComparisonModel(torch.nn.Module): def forward(self, x, y): @@ -1002,7 +1241,9 @@ def forward(self, x, y): x = torch.ones(2, 3, dtype=torch.int32) y = torch.ones(2, 3, dtype=torch.float32) - self.run_model_test(ComparisonModel(), input=(x, y), train=False, batch_size=BATCH_SIZE) 
+ self.run_model_test( + ComparisonModel(), input=(x, y), train=False, batch_size=BATCH_SIZE + ) class MatMulModel(torch.nn.Module): def forward(self, x, y): @@ -1010,7 +1251,9 @@ def forward(self, x, y): x = torch.ones(3, 4) y = torch.ones(4, 5) - self.run_model_test(MatMulModel(), input=(x, y), train=False, batch_size=BATCH_SIZE) + self.run_model_test( + MatMulModel(), input=(x, y), train=False, batch_size=BATCH_SIZE + ) class AddMMModel(torch.nn.Module): def forward(self, x): @@ -1027,41 +1270,61 @@ def __init__(self): def forward(self, x): return x.transpose(1, 2).transpose(2, 3) + x = torch.randn(5, 6, 7, 8) - self.run_model_test(MyModel(), train=False, input=x, batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + MyModel(), train=False, input=x, batch_size=BATCH_SIZE, use_gpu=False + ) def test_sum(self): shape = (3, 4, 5) for params in [{}] + [{"dim": i} for i in range(len(shape))]: + class MyModel(torch.nn.Module): def __init__(self): super(MyModel, self).__init__() def forward(self, x): return torch.sum(x, **params) + x = torch.randn(*shape) - self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False + ) def test_cumsum(self): shape = (3, 4, 5) for params in [{"dim": i} for i in range(len(shape))]: + class MyModel(torch.nn.Module): def __init__(self): super(MyModel, self).__init__() def forward(self, x): return torch.cumsum(x, **params) + x = torch.randn(*shape) - self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False, - operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK) + self.run_model_test( + MyModel(), + train=False, + input=(x), + batch_size=BATCH_SIZE, + use_gpu=False, + operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, + ) def test_cosine_similarity(self): shape = (100, 128) x = torch.randn(*shape) y = torch.randn(*shape) - self.run_model_test(torch.nn.CosineSimilarity(dim=1, eps=1e-6), train=False, - input=(x, y), batch_size=BATCH_SIZE, use_gpu=False, - operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK) + self.run_model_test( + torch.nn.CosineSimilarity(dim=1, eps=1e-6), + train=False, + input=(x, y), + batch_size=BATCH_SIZE, + use_gpu=False, + operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, + ) @unittest.skip("Disabled due to onnx optimizer deprecation") @skipIfUnsupportedOpsetVersion([10]) @@ -1069,13 +1332,16 @@ def test_lstm_constant_folding(self): class LstmNet(nn.Module): def __init__(self, input_size, hidden_size, num_layers, bidirectional): super(LstmNet, self).__init__() - self.lstm = nn.LSTM(input_size, hidden_size, num_layers, bidirectional=bidirectional) + self.lstm = nn.LSTM( + input_size, hidden_size, num_layers, bidirectional=bidirectional + ) def forward(self, input, initial_state): return self.lstm(input, initial_state) - def get_LstmNet_model_and_inputs(input_size, hidden_size, num_layers, batch_size, - seq_len, bidirectional): + def get_LstmNet_model_and_inputs( + input_size, hidden_size, num_layers, batch_size, seq_len, bidirectional + ): num_directions = 2 if bidirectional else 1 model = LstmNet(input_size, hidden_size, num_layers, bidirectional) input = torch.randn(seq_len, batch_size, input_size) @@ -1085,11 +1351,25 @@ def get_LstmNet_model_and_inputs(input_size, hidden_size, num_layers, batch_size batch_size1 = 3 model1, input1 = get_LstmNet_model_and_inputs(7, 3, 2, batch_size1, 5, 
True) - self.run_actual_test(model1, train=False, batch_size=batch_size1, input=input1, use_gpu=False, do_constant_folding=True) + self.run_actual_test( + model1, + train=False, + batch_size=batch_size1, + input=input1, + use_gpu=False, + do_constant_folding=True, + ) batch_size2 = 4 model2, input2 = get_LstmNet_model_and_inputs(5, 4, 3, batch_size2, 7, False) - self.run_actual_test(model2, train=False, batch_size=batch_size2, input=input2, use_gpu=False, do_constant_folding=True) + self.run_actual_test( + model2, + train=False, + batch_size=batch_size2, + input=input2, + use_gpu=False, + do_constant_folding=True, + ) @unittest.skip("Disabled due to onnx optimizer deprecation") @skipIfUnsupportedOpsetVersion([10]) @@ -1097,14 +1377,17 @@ def test_gru_constant_folding(self): class GruNet(nn.Module): def __init__(self, input_size, hidden_size, num_layers, bidirectional): super(GruNet, self).__init__() - self.mygru = nn.GRU(input_size, hidden_size, num_layers, bidirectional=bidirectional) + self.mygru = nn.GRU( + input_size, hidden_size, num_layers, bidirectional=bidirectional + ) def forward(self, input, initial_state): out = self.mygru(input, initial_state) return out - def get_GruNet_model_and_inputs(input_size, hidden_size, num_layers, batch_size, - seq_len, bidirectional): + def get_GruNet_model_and_inputs( + input_size, hidden_size, num_layers, batch_size, seq_len, bidirectional + ): num_directions = 2 if bidirectional else 1 model = GruNet(input_size, hidden_size, num_layers, bidirectional) input = torch.randn(seq_len, batch_size, input_size) @@ -1113,11 +1396,25 @@ def get_GruNet_model_and_inputs(input_size, hidden_size, num_layers, batch_size, batch_size1 = 3 model1, input1 = get_GruNet_model_and_inputs(7, 3, 2, batch_size1, 5, True) - self.run_actual_test(model1, train=False, batch_size=batch_size1, input=input1, use_gpu=False, do_constant_folding=True) + self.run_actual_test( + model1, + train=False, + batch_size=batch_size1, + input=input1, + use_gpu=False, + do_constant_folding=True, + ) batch_size2 = 4 model2, input2 = get_GruNet_model_and_inputs(5, 4, 3, batch_size2, 7, False) - self.run_actual_test(model2, train=False, batch_size=batch_size2, input=input2, use_gpu=False, do_constant_folding=True) + self.run_actual_test( + model2, + train=False, + batch_size=batch_size2, + input=input2, + use_gpu=False, + do_constant_folding=True, + ) def test_repeat(self): class MyModel(torch.nn.Module): @@ -1128,14 +1425,17 @@ def forward(self, x): return x.repeat(1, 2, 3, 4) x = torch.randn(4, 3, 2, 1, requires_grad=True) - self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False + ) @skipIfUnsupportedOpsetVersion([10]) def test_upsample(self): x = torch.randn(1, 2, 3, 4, requires_grad=True) model = nn.Upsample(size=[v * 2 for v in x.size()[2:]], mode="nearest") - self.run_model_test(model, train=False, input=(x), - batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + model, train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False + ) @skipIfUnsupportedOpsetVersion([10]) def test_interpolate_upsample(self): @@ -1147,14 +1447,13 @@ def forward(self, x): size = [v * 2 for v in x.size()[2:]] # work around for now: turn the dynamic sizes into constant size = [int(i) for i in size] - return nn.functional.interpolate(x, - size=size, - mode="nearest") + return nn.functional.interpolate(x, size=size, mode="nearest") x = torch.randn(1, 2, 3, 4, 
requires_grad=True) model = MyModel() - self.run_model_test(model, train=False, input=(x), - batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + model, train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False + ) @skipIfUnsupportedOpsetVersion([7, 8, 10]) def test_interpolate_upsample_dynamic_sizes(self): @@ -1164,14 +1463,13 @@ def __init__(self): def forward(self, x): size = [v * 2 for v in x.size()[2:]] - return nn.functional.interpolate(x, - size=size, - mode="nearest") + return nn.functional.interpolate(x, size=size, mode="nearest") x = torch.randn(1, 2, 3, 4, requires_grad=True) model = MyModel() - self.run_model_test(model, train=False, input=(x), - batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + model, train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False + ) def test_repeat_dim_overflow(self): class MyModel(torch.nn.Module): @@ -1182,7 +1480,9 @@ def forward(self, x): return x.repeat(1, 2, 3, 4) x = torch.randn(1, 2, requires_grad=True) - self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False + ) def test_repeat_dynamic(self): class MyModel(torch.nn.Module): @@ -1194,21 +1494,39 @@ def forward(self, x, y): x = torch.randn(1, 2, requires_grad=True) y = torch.randn(2, 4, requires_grad=True) - self.run_model_test(MyModel(), train=False, input=(x, y), batch_size=BATCH_SIZE, use_gpu=False, - input_names=["x", "y"], dynamic_axes={"x": [0, 1], "y": [0, 1]}) - self.run_model_test(MyModel(), train=False, input=(x, y), batch_size=BATCH_SIZE, use_gpu=False, remained_onnx_input_idx=[0]) + self.run_model_test( + MyModel(), + train=False, + input=(x, y), + batch_size=BATCH_SIZE, + use_gpu=False, + input_names=["x", "y"], + dynamic_axes={"x": [0, 1], "y": [0, 1]}, + ) + self.run_model_test( + MyModel(), + train=False, + input=(x, y), + batch_size=BATCH_SIZE, + use_gpu=False, + remained_onnx_input_idx=[0], + ) def test_mean(self): shape = (3, 4, 5) for params in [{}] + [{"dim": i} for i in range(len(shape))]: + class MyModel(torch.nn.Module): def __init__(self): super(MyModel, self).__init__() def forward(self, x): return torch.mean(x, **params) + x = torch.randn(*shape) - self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False + ) # TODO: Add test cases for prod once Caffe2 has support for ReduceProd def test_softmax(self): @@ -1217,7 +1535,9 @@ def test_softmax(self): model = nn.Softmax(dim=d) dims = [2] * (i - 2) + [3, 4] input = torch.ones(*dims, requires_grad=True) - self.run_model_test(model, train=False, batch_size=BATCH_SIZE, input=input) + self.run_model_test( + model, train=False, batch_size=BATCH_SIZE, input=input + ) def test_softmax_dtype(self): class SoftmaxModel(torch.nn.Module): @@ -1246,8 +1566,15 @@ def test_randn(self): class MyModule(torch.nn.Module): def forward(self, x): return (torch.randn(1, 2, 3, 4) + x).shape - self.run_model_test(MyModule(), train=False, input=(x), - batch_size=BATCH_SIZE, use_gpu=False, remained_onnx_input_idx=[]) + + self.run_model_test( + MyModule(), + train=False, + input=(x), + batch_size=BATCH_SIZE, + use_gpu=False, + remained_onnx_input_idx=[], + ) def test_rand(self): x = torch.randn(1, 2, 3, 4) @@ -1255,11 +1582,20 @@ def test_rand(self): class MyModule(torch.nn.Module): def forward(self, x): return (torch.rand(1, 2, 3, 4) + x).shape - 
self.run_model_test(MyModule(), train=False, input=(x), - batch_size=BATCH_SIZE, use_gpu=False, remained_onnx_input_idx=[]) + + self.run_model_test( + MyModule(), + train=False, + input=(x), + batch_size=BATCH_SIZE, + use_gpu=False, + remained_onnx_input_idx=[], + ) def test_convtranspose(self): - model = nn.ConvTranspose2d(3, 3, 3, stride=3, bias=False, padding=1, output_padding=2) + model = nn.ConvTranspose2d( + 3, 3, 3, stride=3, bias=False, padding=1, output_padding=2 + ) self.run_model_test(model, train=False, batch_size=BATCH_SIZE, atol=1e-7) def test_unsqueeze(self): @@ -1273,8 +1609,11 @@ def __init__(self): def forward(self, x): return x.unsqueeze(dim) + x = torch.randn(*shape) - self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, atol=1e-7) + self.run_model_test( + MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, atol=1e-7 + ) def test_squeeze(self): shape = (1, 1, 1) @@ -1287,8 +1626,11 @@ def __init__(self): def forward(self, x): return x.squeeze(dim) + x = torch.randn(*shape) - self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, atol=1e-7) + self.run_model_test( + MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, atol=1e-7 + ) # NB: InstanceNorm model includes unused weights, so skip this in TestCaffe2BackendEmbed # TODO: We should have another pass to eliminate the unused initializers in ONNX models. @@ -1301,10 +1643,10 @@ def test_instance_norm(self): def test_pixel_shuffle(self): underlying = nn.PixelShuffle(4) shape = (1, 32, 5, 5) - input = Variable(torch.randn(*shape), - requires_grad=True) - self.run_model_test(underlying, train=False, input=(input), - batch_size=BATCH_SIZE) + input = Variable(torch.randn(*shape), requires_grad=True) + self.run_model_test( + underlying, train=False, input=(input), batch_size=BATCH_SIZE + ) def test_dynamic_sizes(self): class MyModel(torch.nn.Module): @@ -1315,8 +1657,11 @@ def forward(self, x): shape = torch.onnx.operators.shape_as_tensor(x) new_shape = torch.cat((torch.LongTensor([-1]), shape[0].view(1))) return torch.onnx.operators.reshape_from_tensor_shape(x, new_shape) + x = torch.randn(3, 5, 7) - self.run_model_test(MyModel(), train=False, input=x, batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + MyModel(), train=False, input=x, batch_size=BATCH_SIZE, use_gpu=False + ) def test_advanced_broadcast(self): class MyModel(torch.nn.Module): @@ -1325,9 +1670,12 @@ def __init__(self): def forward(self, x, y): return torch.mul(x, y) + x = torch.randn(1, 5, 10) y = torch.randn(1, 5, 1) - self.run_model_test(MyModel(), train=False, input=(x, y), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + MyModel(), train=False, input=(x, y), batch_size=BATCH_SIZE, use_gpu=False + ) def test_int8_export(self): class MyModel(torch.nn.Module): @@ -1339,15 +1687,24 @@ def forward(self, x): return x * self.param.float() import io + f = io.BytesIO() from torch.onnx import ExportTypes - torch.onnx._export(MyModel(), (torch.rand(3, 4),), f, verbose=True, export_type=ExportTypes.ZIP_ARCHIVE, - keep_initializers_as_inputs=True) + + torch.onnx._export( + MyModel(), + (torch.rand(3, 4),), + f, + verbose=True, + export_type=ExportTypes.ZIP_ARCHIVE, + keep_initializers_as_inputs=True, + ) X = np.random.rand(3, 4).astype(np.float32) f.seek(0) import caffe2.python.onnx.backend as c2 + model = c2.prepare_zip_archive(f) model.run(X) @@ -1358,7 +1715,9 @@ def forward(self, x): return x[-1, :, :] x = torch.randn(3, 4, 5) - self.run_model_test(NegSlice(), train=False, 
input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + NegSlice(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False + ) @skipIfUnsupportedOpsetVersion([10]) def test_neg_slice_large(self): @@ -1367,7 +1726,9 @@ def forward(self, x): return x[:, :, :, :, -3] x = torch.randn(3, 4, 5, 6, 7) - self.run_model_test(NegSlice(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + NegSlice(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False + ) @unittest.skip("https://github.com/pytorch/pytorch/issues/10984") @skipIfUnsupportedOpsetVersion([10]) @@ -1377,7 +1738,9 @@ def forward(self, x): return x[:, :, :, :, -1] x = torch.randn(3, 4, 5, 6, 7) - self.run_model_test(NegSlice(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + NegSlice(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False + ) @skipIfUnsupportedMinOpsetVersion(11) def test_dynamic_slice(self): @@ -1385,22 +1748,34 @@ class DynamicSliceExportMod(torch.nn.Module): def forward(self, x): results = [] for i in range(4): - results.append(x[:x.size(0) - i, i:x.size(2), i:3]) + results.append(x[: x.size(0) - i, i : x.size(2), i:3]) return tuple(results) x = torch.rand(5, 5, 5) - self.run_model_test(DynamicSliceExportMod(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + DynamicSliceExportMod(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + ) @skipIfUnsupportedMinOpsetVersion(11) def test_dynamic_slice_script(self): class DynamicSliceModel(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, x): - return x[1:x.size(0)] + return x[1 : x.size(0)] + module = DynamicSliceModel() x = torch.rand(1, 2) - self.run_model_test(DynamicSliceModel(), train=False, input=(x,), - batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + DynamicSliceModel(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + ) @skipIfUnsupportedMinOpsetVersion(11) def test_dynamic_slice_to_the_end(self): @@ -1412,7 +1787,13 @@ def forward(self, x): return tuple(results) x = torch.rand(5, 5, 5) - self.run_model_test(DynamicSliceExportMod(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + DynamicSliceExportMod(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + ) def test_unbind(self): class UnbindModel(torch.nn.Module): @@ -1420,7 +1801,9 @@ def forward(self, input): return input.unbind() x = torch.randn(3, 4, 5) - self.run_model_test(UnbindModel(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + UnbindModel(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False + ) class UnbindModel2(torch.nn.Module): def forward(self, input): @@ -1428,7 +1811,13 @@ def forward(self, input): return out x = torch.randn(3, 4, 5) - self.run_model_test(UnbindModel2(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + UnbindModel2(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + ) @skipIfUnsupportedMinOpsetVersion(9) def test_inplace_zero(self): @@ -1437,9 +1826,23 @@ def forward(self, x): return x.zero_() x = torch.randn(2, 3, 4) - self.run_model_test(Zero_(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False, - input_names=['x'], dynamic_axes={'x': [0, 1, 2]}) - self.run_model_test(Zero_(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False, 
remained_onnx_input_idx=[]) + self.run_model_test( + Zero_(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + input_names=["x"], + dynamic_axes={"x": [0, 1, 2]}, + ) + self.run_model_test( + Zero_(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + remained_onnx_input_idx=[], + ) @skipIfUnsupportedMinOpsetVersion(9) def test_inplace_fill(self): @@ -1448,9 +1851,23 @@ def forward(self, x): return x.fill_(3) x = torch.randn(2, 3, 4) - self.run_model_test(Fill_(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False, - input_names=['x'], dynamic_axes={'x': [0, 1, 2]}) - self.run_model_test(Fill_(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False, remained_onnx_input_idx=[]) + self.run_model_test( + Fill_(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + input_names=["x"], + dynamic_axes={"x": [0, 1, 2]}, + ) + self.run_model_test( + Fill_(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + remained_onnx_input_idx=[], + ) # ConstantFill is a deprecated experimental op (used in opsets < 9). # Shape inference does not cover this op. @@ -1467,9 +1884,9 @@ def forward(self): x = torch.ones(2, 3, 4) y = torch.ones(2, 3, 4) * 2 - self.run_model_test(Arithmetic(), - train=False, input=(), batch_size=BATCH_SIZE, - use_gpu=False) + self.run_model_test( + Arithmetic(), train=False, input=(), batch_size=BATCH_SIZE, use_gpu=False + ) def test_tensor_factories(self): class TensorFactory(torch.nn.Module): @@ -1477,37 +1894,88 @@ def forward(self, x): return torch.zeros(x.size()) + torch.ones(x.size()) x = torch.randn(2, 3, 4) - self.run_model_test(TensorFactory(), train=False, input=(x,), batch_size=BATCH_SIZE, - use_gpu=False, input_names=['x'], dynamic_axes={'x': [0, 1, 2]}) - self.run_model_test(TensorFactory(), train=False, input=(x,), batch_size=BATCH_SIZE, - use_gpu=False, remained_onnx_input_idx=[]) + self.run_model_test( + TensorFactory(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + input_names=["x"], + dynamic_axes={"x": [0, 1, 2]}, + ) + self.run_model_test( + TensorFactory(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + remained_onnx_input_idx=[], + ) def test_tensor_factories_script(self): class TensorFactory(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, x): - return torch.zeros(x.shape, dtype=torch.float) + torch.ones(x.shape, dtype=torch.float) + return torch.zeros(x.shape, dtype=torch.float) + torch.ones( + x.shape, dtype=torch.float + ) x = torch.randn(2, 3, 4) - self.run_model_test(TensorFactory(), train=False, input=(x,), batch_size=BATCH_SIZE, - use_gpu=False, input_names=['x'], dynamic_axes={'x': [0, 1, 2]}) - self.run_model_test(TensorFactory(), train=False, input=(x,), batch_size=BATCH_SIZE, - use_gpu=False, remained_onnx_input_idx=[]) + self.run_model_test( + TensorFactory(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + input_names=["x"], + dynamic_axes={"x": [0, 1, 2]}, + ) + self.run_model_test( + TensorFactory(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + remained_onnx_input_idx=[], + ) def test_tensor_like_factories_script(self): class TensorFactory(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, x): - zeros = torch.zeros_like(x, dtype=torch.float, layout=torch.strided, device=torch.device("cpu")) - ones = torch.ones_like(x, dtype=torch.float, layout=torch.strided, device=torch.device("cpu")) + zeros = 
torch.zeros_like( + x, + dtype=torch.float, + layout=torch.strided, + device=torch.device("cpu"), + ) + ones = torch.ones_like( + x, + dtype=torch.float, + layout=torch.strided, + device=torch.device("cpu"), + ) return zeros + ones x = torch.randn(2, 3, 4) - self.run_model_test(TensorFactory(), train=False, input=(x,), batch_size=BATCH_SIZE, - use_gpu=False, input_names=['x'], dynamic_axes={'x': [0, 1, 2]}) + self.run_model_test( + TensorFactory(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + input_names=["x"], + dynamic_axes={"x": [0, 1, 2]}, + ) remained_onnx_input_idx = None if self.opset_version < 9 else [] - self.run_model_test(TensorFactory(), train=False, input=(x,), batch_size=BATCH_SIZE, - use_gpu=False, remained_onnx_input_idx=remained_onnx_input_idx) + self.run_model_test( + TensorFactory(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + remained_onnx_input_idx=remained_onnx_input_idx, + ) def test_full(self): class FullModel(torch.nn.Module): @@ -1515,8 +1983,9 @@ def forward(self, x): return torch.full((3, 4), x, dtype=torch.long) x = torch.tensor(12) - self.run_model_test(FullModel(), train=False, input=(x,), batch_size=BATCH_SIZE, - use_gpu=False) + self.run_model_test( + FullModel(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False + ) def test_full_script(self): class FullClass(torch.jit.ScriptModule): @@ -1525,7 +1994,9 @@ def forward(self, x): return torch.full((4, 5), x, dtype=torch.long) x = torch.tensor(12) - self.run_model_test(FullClass(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + FullClass(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False + ) def test_clamp(self): class ClampModel(torch.nn.Module): @@ -1533,21 +2004,27 @@ def forward(self, x): return x.clamp(-0.5, 0.5) x = torch.randn(3, 4) - self.run_model_test(ClampModel(), train=False, input=(x,), batch_size=BATCH_SIZE) + self.run_model_test( + ClampModel(), train=False, input=(x,), batch_size=BATCH_SIZE + ) class ClampMinModel(torch.nn.Module): def forward(self, x): return x.clamp(min=-0.5) x = torch.randn(3, 4) - self.run_model_test(ClampMinModel(), train=False, input=(x,), batch_size=BATCH_SIZE) + self.run_model_test( + ClampMinModel(), train=False, input=(x,), batch_size=BATCH_SIZE + ) class ClampMaxModel(torch.nn.Module): def forward(self, x): return x.clamp(max=0.5) x = torch.randn(3, 4) - self.run_model_test(ClampMaxModel(), train=False, input=(x,), batch_size=BATCH_SIZE) + self.run_model_test( + ClampMaxModel(), train=False, input=(x,), batch_size=BATCH_SIZE + ) @skipIfUnsupportedMinOpsetVersion(9) def test_where_functional(self): @@ -1556,7 +2033,13 @@ def forward(self, x): return torch.where(x > 2.0, x, torch.neg(x)) x = torch.randn(3, 4) - self.run_model_test(WhereFunctional(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + WhereFunctional(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + ) @skipIfUnsupportedMinOpsetVersion(9) def test_where_method(self): @@ -1565,15 +2048,25 @@ def forward(self, x): return x.where(x > 2.0, torch.neg(x)) x = torch.randn(3, 4) - self.run_model_test(WhereMethod(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + WhereMethod(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False + ) def test_data_dependent_zeros_factory(self): class ZerosFactory(torch.nn.Module): def forward(self, input): - return torch.cat([input, 
torch.zeros(input.size(0), 1).type_as(input)], dim=1) + return torch.cat( + [input, torch.zeros(input.size(0), 1).type_as(input)], dim=1 + ) x = torch.zeros(3, 4) - self.run_model_test(ZerosFactory(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + ZerosFactory(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + ) def test_implicit_expand(self): class ImplicitExpandExportMod(torch.nn.Module): @@ -1581,7 +2074,13 @@ def forward(self, x): return x + 1 x = torch.randn(3, 4) - self.run_model_test(ImplicitExpandExportMod(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + ImplicitExpandExportMod(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + ) def test_reduce_sum(self): class ReduceSumNegativeIndices(torch.nn.Module): @@ -1589,7 +2088,13 @@ def forward(self, x): return x.sum(-1) x = torch.randn(2, 3, 4) - self.run_model_test(ReduceSumNegativeIndices(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + ReduceSumNegativeIndices(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + ) def test_reduce_sum_multi_dim(self): class ReduceSumMultipleAxes(torch.nn.Module): @@ -1597,7 +2102,13 @@ def forward(self, x): return x.sum(dim=(2, 3), keepdim=True) x = torch.randn(16, 3, 256, 256) - self.run_model_test(ReduceSumMultipleAxes(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + ReduceSumMultipleAxes(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + ) # InstanceNorm model (used in the subgraph) includes unused weights, # so skip this in TestCaffe2BackendEmbed @@ -1621,8 +2132,9 @@ def forward(self, x): return 1 - x x = torch.randn(1, 2) - self.run_model_test(RsubModel(), train=False, input=(x,), - batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + RsubModel(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False + ) @skipIfUnsupportedMinOpsetVersion(9) def test_isnan(self): @@ -1631,7 +2143,9 @@ def forward(self, input): return torch.isnan(input) x = torch.tensor([1.0, float("nan"), 2.0]) - self.run_model_test(IsNaNModel(), train=False, input=x, batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + IsNaNModel(), train=False, input=x, batch_size=BATCH_SIZE, use_gpu=False + ) @skipIfUnsupportedMinOpsetVersion(9) def test_scatter(self): @@ -1642,21 +2156,36 @@ def forward(self, input, indices, values): input = torch.tensor([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]) indices = torch.tensor([[1, 0], [0, 2], [0, 1]], dtype=torch.int64) values = torch.tensor([[1.0, 1.1], [2.0, 2.1], [3.0, 3.1]]) - self.run_model_test(ScatterModel(), train=False, input=(input, indices, values), - batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + ScatterModel(), + train=False, + input=(input, indices, values), + batch_size=BATCH_SIZE, + use_gpu=False, + ) input = torch.zeros(3, 4, 5, 6) indices = torch.tensor([[1, 0], [0, 2], [0, 1]], dtype=torch.int64) indices = indices.view(3, 2, 1, 1).expand(3, 2, 5, 6) values = torch.arange(3 * 2 * 5 * 6, dtype=torch.float32).view(3, 2, 5, 6) - self.run_model_test(ScatterModel(), train=False, input=(input, indices, values), - batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + ScatterModel(), + train=False, + input=(input, indices, values), + batch_size=BATCH_SIZE, + use_gpu=False, + ) input = torch.zeros(3, 4, 2) indices = torch.tensor([[[1, 0], [0, 2]], [[1, 1], [0, 1]], 
[[2, 1], [2, 2]]]) values = torch.arange(3 * 2 * 2, dtype=torch.float32).view(3, 2, 2) - self.run_model_test(ScatterModel(), train=False, input=(input, indices, values), - batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + ScatterModel(), + train=False, + input=(input, indices, values), + batch_size=BATCH_SIZE, + use_gpu=False, + ) @skipIfUnsupportedOpsetVersion([10]) def test_flatten(self): @@ -1698,7 +2227,9 @@ def forward(self, input, other): x = torch.randn(4, 4, requires_grad=True) y = torch.randn(4, 4, requires_grad=True) - self.run_model_test(MaxModel(), train=False, input=(x, y), batch_size=BATCH_SIZE) + self.run_model_test( + MaxModel(), train=False, input=(x, y), batch_size=BATCH_SIZE + ) def test_min(self): class MinModel(torch.nn.Module): @@ -1755,7 +2286,9 @@ def forward(self, input): return input.reshape_as(y) x = torch.randn(2, 3, requires_grad=True) - self.run_model_test(ReshapeAsModel(), train=False, input=x, batch_size=BATCH_SIZE) + self.run_model_test( + ReshapeAsModel(), train=False, input=x, batch_size=BATCH_SIZE + ) @skipIfUnsupportedOpsetVersion([10]) def test_narrow(self): @@ -1795,11 +2328,24 @@ def __init__(self): def forward(self, feature, im_info, anchors): bbox_deltas = self.conv(feature) a, b = torch.ops._caffe2.GenerateProposals( - feature, bbox_deltas, im_info, anchors, - 2.0, 6000, 300, 0.7, 16, True, -90, 90, 1.0, True, + feature, + bbox_deltas, + im_info, + anchors, + 2.0, + 6000, + 300, + 0.7, + 16, + True, + -90, + 90, + 1.0, + True, ) output = torch.ops._caffe2.RoIAlign( - feature, a, + feature, + a, order="NCHW", spatial_scale=1.0, pooled_h=3, @@ -1816,7 +2362,9 @@ def forward(self, feature, im_info, anchors): model = MyModel() with torch.no_grad(): - self.run_model_test(MyModel(), train=False, input=inputs, batch_size=BATCH_SIZE) + self.run_model_test( + MyModel(), train=False, input=inputs, batch_size=BATCH_SIZE + ) def test_c2_roi_align(self): class MyModel(torch.nn.Module): @@ -1825,8 +2373,14 @@ def __init__(self): def forward(self, feature, rois): roi_feature = torch.ops._caffe2.RoIAlign( - feature, rois, order="NCHW", spatial_scale=1.0, - pooled_h=3, pooled_w=3, sampling_ratio=3, aligned=False, + feature, + rois, + order="NCHW", + spatial_scale=1.0, + pooled_h=3, + pooled_w=3, + sampling_ratio=3, + aligned=False, ) return roi_feature @@ -1852,8 +2406,20 @@ def __init__(self): def forward(self, scores, bbox_deltas, im_info, anchors): a, b = torch.ops._caffe2.GenerateProposals( - scores, bbox_deltas, im_info, anchors, - 2.0, 6000, 300, 0.7, 16, True, -90, 90, 1.0, True, + scores, + bbox_deltas, + im_info, + anchors, + 2.0, + 6000, + 300, + 0.7, + 16, + True, + -90, + 90, + 1.0, + True, ) return a, b @@ -1862,8 +2428,9 @@ def forward(self, scores, bbox_deltas, im_info, anchors): W = 8 img_count = 3 scores = torch.ones(img_count, A, H, W, dtype=torch.float32) - bbox_deltas = torch.linspace(0, 10, steps=img_count * 4 * A * H * W, - dtype=torch.float32) + bbox_deltas = torch.linspace( + 0, 10, steps=img_count * 4 * A * H * W, dtype=torch.float32 + ) bbox_deltas = bbox_deltas.view(img_count, 4 * A, H, W) im_info = torch.ones(img_count, 3, dtype=torch.float32) anchors = torch.ones(A, 4, dtype=torch.float32) @@ -1880,7 +2447,7 @@ def forward(self, rois, deltas, im_info): rois, deltas, im_info, - weights=[1., 1., 1., 1.], + weights=[1.0, 1.0, 1.0, 1.0], apply_scale=False, rotated=True, angle_bound_on=True, @@ -1905,7 +2472,9 @@ def forward(self, rois, deltas, im_info): im_info[:, 2] = 1.0 im_info = torch.zeros((batch_size, 3)) inputs = 
(torch.tensor(rois), torch.tensor(deltas), torch.tensor(im_info)) - self.run_model_test(MyModel(), train=False, input=inputs, batch_size=3, use_gpu=False) + self.run_model_test( + MyModel(), train=False, input=inputs, batch_size=3, use_gpu=False + ) # BoxWithNMSLimits has requirements for the inputs, so randomly generated inputs # in Caffe2BackendTestEmbed doesn't work with this op. @@ -1964,8 +2533,14 @@ def forward(self, class_prob, pred_bbox, batch_splits): ) return a, b, c, d, e, f - inputs = (torch.tensor(class_prob), torch.tensor(pred_bbox), torch.tensor(batch_splits)) - self.run_model_test(MyModel(), train=False, input=inputs, batch_size=3, use_gpu=False) + inputs = ( + torch.tensor(class_prob), + torch.tensor(pred_bbox), + torch.tensor(batch_splits), + ) + self.run_model_test( + MyModel(), train=False, input=inputs, batch_size=3, use_gpu=False + ) def test_c2_inference_lstm(self): num_layers = 4 @@ -1998,39 +2573,54 @@ def forward(self, lstm_in): bias=has_bias, num_layers=num_layers, ) - lstm_in = [ - torch.from_numpy(inputs), - torch.from_numpy(hx), - torch.from_numpy(hx), - ] + [param.detach() for param in torch_lstm._flat_weights] + lstm_in = ( + [ + torch.from_numpy(inputs), + torch.from_numpy(hx), + torch.from_numpy(hx), + ] + + [param.detach() for param in torch_lstm._flat_weights], + ) - self.run_model_test(MyModel(), train=False, input=lstm_in, batch_size=3, use_gpu=False) + self.run_model_test( + MyModel(), train=False, input=lstm_in, batch_size=3, use_gpu=False + ) def test_tuple_input_output(self): class TupleModel(torch.jit.ScriptModule): @torch.jit.script_method - def forward(self, a: Tuple[torch.Tensor, torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]: + def forward( + self, a: Tuple[torch.Tensor, torch.Tensor] + ) -> Tuple[torch.Tensor, torch.Tensor]: return a x = (torch.randn(3, 4), torch.randn(4, 3)) - self.run_model_test(TupleModel(), train=False, input=(x,), batch_size=BATCH_SIZE) + self.run_model_test( + TupleModel(), train=False, input=(x,), batch_size=BATCH_SIZE + ) def test_nested_tuple_input_output(self): class NestedTupleModel(torch.jit.ScriptModule): @torch.jit.script_method - def forward(self, a: torch.Tensor, b: Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]) -> torch.Tensor: + def forward( + self, + a: torch.Tensor, + b: Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], + ) -> torch.Tensor: return a + b[0] + b[1][0] + b[1][1] x = torch.randn(4, 5) y = (torch.randn(4, 5), (torch.randn(4, 5), torch.randn(4, 5))) - self.run_model_test(NestedTupleModel(), train=False, input=(x, y), batch_size=BATCH_SIZE) + self.run_model_test( + NestedTupleModel(), train=False, input=(x, y), batch_size=BATCH_SIZE + ) def test_topk(self): class TopKModel(torch.nn.Module): def forward(self, input): return torch.topk(input, 3) - x = torch.arange(1., 6.) 
+ x = torch.arange(1.0, 6.0) self.run_model_test(TopKModel(), train=False, input=x, batch_size=BATCH_SIZE) def test_topk_script(self): @@ -2065,8 +2655,13 @@ def forward(self, input): return torch._dim_arange(input, 1) x = torch.ones(5, 6) - self.run_model_test(DimArange(), train=False, input=x, batch_size=BATCH_SIZE, - operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK) + self.run_model_test( + DimArange(), + train=False, + input=x, + batch_size=BATCH_SIZE, + operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, + ) @skipIfUnsupportedMinOpsetVersion(9) def test_arange_end(self): @@ -2076,13 +2671,17 @@ def forward(self, a): return torch.arange(a.size(0), dtype=torch.float).view(-1, 1) + a x = torch.randn(3, 4, requires_grad=True) - self.run_model_test(ArangeScript(), train=False, input=(x,), batch_size=BATCH_SIZE) + self.run_model_test( + ArangeScript(), train=False, input=(x,), batch_size=BATCH_SIZE + ) class ArangeModel(torch.nn.Module): def forward(self, a): return torch.arange(a.size(0), dtype=torch.float).view(-1, 1) + a - self.run_model_test(ArangeModel(), train=False, input=(x,), batch_size=BATCH_SIZE) + self.run_model_test( + ArangeModel(), train=False, input=(x,), batch_size=BATCH_SIZE + ) @skipIfUnsupportedMinOpsetVersion(9) def test_arange_start_end(self): @@ -2092,29 +2691,47 @@ def forward(self, a): return torch.arange(2, a.size(0) + 2, dtype=torch.float).view(-1, 1) + a x = torch.randn(3, 4, requires_grad=True) - self.run_model_test(ArangeScript(), train=False, input=(x,), batch_size=BATCH_SIZE) + self.run_model_test( + ArangeScript(), train=False, input=(x,), batch_size=BATCH_SIZE + ) class ArangeModel(torch.nn.Module): def forward(self, a): return torch.arange(2, a.size(0) + 2, dtype=torch.float).view(-1, 1) + a - self.run_model_test(ArangeModel(), train=False, input=(x,), batch_size=BATCH_SIZE) + self.run_model_test( + ArangeModel(), train=False, input=(x,), batch_size=BATCH_SIZE + ) @skipIfUnsupportedMinOpsetVersion(9) def test_arange_start_end_step(self): class ArangeScript(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, a): - return torch.arange(2, a.size(0) * a.size(1) + 2, a.size(1), dtype=torch.float).view(-1, 1) + a + return ( + torch.arange( + 2, a.size(0) * a.size(1) + 2, a.size(1), dtype=torch.float + ).view(-1, 1) + + a + ) x = torch.randn(3, 4, requires_grad=True) - self.run_model_test(ArangeScript(), train=False, input=(x,), batch_size=BATCH_SIZE) + self.run_model_test( + ArangeScript(), train=False, input=(x,), batch_size=BATCH_SIZE + ) class ArangeModel(torch.nn.Module): def forward(self, a): - return torch.arange(2, a.size(0) * a.size(1) + 2, a.size(1), dtype=torch.float).view(-1, 1) + a + return ( + torch.arange( + 2, a.size(0) * a.size(1) + 2, a.size(1), dtype=torch.float + ).view(-1, 1) + + a + ) - self.run_model_test(ArangeModel(), train=False, input=(x,), batch_size=BATCH_SIZE) + self.run_model_test( + ArangeModel(), train=False, input=(x,), batch_size=BATCH_SIZE + ) @skipIfUnsupportedMinOpsetVersion(9) def test_size(self): @@ -2123,9 +2740,21 @@ def forward(self, input): return torch.arange(input.size(0)), torch.arange(input.size(-1)) x = torch.randn(5, 3, 2) - self.run_model_test(SizeModel(), train=False, input=(x,), batch_size=BATCH_SIZE, - input_names=['x'], dynamic_axes={'x': [0, 1, 2]}) - self.run_model_test(SizeModel(), train=False, input=(x,), batch_size=BATCH_SIZE, remained_onnx_input_idx=[]) + self.run_model_test( + SizeModel(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + 
input_names=["x"], + dynamic_axes={"x": [0, 1, 2]}, + ) + self.run_model_test( + SizeModel(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + remained_onnx_input_idx=[], + ) def test_log2(self): class Log2Model(torch.nn.Module): @@ -2142,8 +2771,12 @@ def forward(self, input): x = torch.randn(2, 3, 4, requires_grad=False) model = DirichletModel() - onnxir, _ = do_export(model, x, keep_initializers_as_inputs=True, - operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK) + onnxir, _ = do_export( + model, + x, + keep_initializers_as_inputs=True, + operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, + ) onnx_model = onnx.ModelProto.FromString(onnxir) prepared = c2.prepare(onnx_model) caffe2_out = prepared.run(inputs=[x.cpu().numpy()]) @@ -2156,8 +2789,12 @@ def forward(self, input): x = torch.randn(2, 3, 4, requires_grad=False) model = GammaModel() - onnxir, _ = do_export(model, x, keep_initializers_as_inputs=True, - operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK) + onnxir, _ = do_export( + model, + x, + keep_initializers_as_inputs=True, + operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, + ) onnx_model = onnx.ModelProto.FromString(onnxir) prepared = c2.prepare(onnx_model) caffe2_out = prepared.run(inputs=[x.cpu().numpy()]) @@ -2176,8 +2813,12 @@ def forward(self, weight): return torch.multinomial(weight, 1) weight = torch.tensor([[0, 10, 0, 0], [0, 0, 100, 0]], dtype=torch.float) - self.run_model_test(Multinomial(), train=False, input=weight, batch_size=BATCH_SIZE) - self.run_model_test(MultinomialNoReplacement(), train=False, input=weight, batch_size=BATCH_SIZE) + self.run_model_test( + Multinomial(), train=False, input=weight, batch_size=BATCH_SIZE + ) + self.run_model_test( + MultinomialNoReplacement(), train=False, input=weight, batch_size=BATCH_SIZE + ) def test_prim_shape(self): x = torch.randn(4, 5, requires_grad=True) @@ -2189,7 +2830,10 @@ def view_by_prim_shape(x): class PrimShapeModel(torch.nn.Module): def forward(self, input): return view_by_prim_shape(input) - self.run_model_test(PrimShapeModel(), train=False, input=x, batch_size=BATCH_SIZE) + + self.run_model_test( + PrimShapeModel(), train=False, input=x, batch_size=BATCH_SIZE + ) def test_and(self): class AndModel(torch.nn.Module): @@ -2198,7 +2842,9 @@ def forward(self, x, y): x = torch.randint(0, 1, (3, 5), dtype=torch.bool) y = torch.randint(0, 1, (3, 5), dtype=torch.bool) - self.run_model_test(AndModel(), train=False, input=(x, y), batch_size=BATCH_SIZE) + self.run_model_test( + AndModel(), train=False, input=(x, y), batch_size=BATCH_SIZE + ) def test_or(self): class OrModel(torch.nn.Module): @@ -2233,16 +2879,21 @@ def forward(self, x): model = WhileModel() inputs = torch.zeros(1, 2, 3, dtype=torch.long) - self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE,) + self.run_model_test( + model, + train=False, + input=(inputs,), + batch_size=BATCH_SIZE, + ) def test_while_cond(self): class WhileModel(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, x, a): - b = (a < 4) + b = a < 4 while b: a += b.to(torch.long) - b = (a < 4) + b = a < 4 return x + a model = WhileModel() @@ -2293,7 +2944,12 @@ def forward(self, x): model = NestedLoopsModel() inputs = torch.zeros(1, 2, 3, dtype=torch.long) - self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE,) + self.run_model_test( + model, + train=False, + input=(inputs,), + batch_size=BATCH_SIZE, + ) def test_select(self): class 
SelectModel(torch.nn.Module): @@ -2302,7 +2958,7 @@ def forward(self, x): model = SelectModel() inputs = torch.randn(3, 2, 1) - self.run_model_test(model, train=False, input=(inputs, ), batch_size=BATCH_SIZE) + self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE) def test_std(self): class StandardDeviation(torch.nn.Module): @@ -2330,14 +2986,18 @@ def forward(self, x): return x.masked_fill(mask, 2) x = torch.zeros(4, 2, 3, requires_grad=True) - self.run_model_test(MaskedFillModel(), input=(x, ), train=False, batch_size=BATCH_SIZE) + self.run_model_test( + MaskedFillModel(), input=(x,), train=False, batch_size=BATCH_SIZE + ) class MaskedFillModel2(torch.nn.Module): def forward(self, x): return x.masked_fill(x > 3, -1) x = torch.arange(16).view(2, 2, 4).to(torch.float32) - self.run_model_test(MaskedFillModel2(), input=(x, ), train=False, batch_size=BATCH_SIZE) + self.run_model_test( + MaskedFillModel2(), input=(x,), train=False, batch_size=BATCH_SIZE + ) @skipIfUnsupportedMinOpsetVersion(8) def test_meshgrid(self): @@ -2368,22 +3028,42 @@ def forward(self, input): inputs = torch.randint(10, (2, 3)) model = RemainderModel() - self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE,) + self.run_model_test( + model, + train=False, + input=(inputs,), + batch_size=BATCH_SIZE, + ) def test_baddbmm(self): class MyModule(torch.nn.Module): def forward(self, input, batch1, batch2): - return torch.baddbmm(input, batch1, batch2, alpha=torch.tensor(5), beta=3.5) + return torch.baddbmm( + input, batch1, batch2, alpha=torch.tensor(5), beta=3.5 + ) + x = torch.randn(10, 3, 5) batch1 = torch.randn(10, 3, 4) batch2 = torch.randn(10, 4, 5) - self.run_model_test(MyModule(), input=(x, batch1, batch2), train=False, batch_size=BATCH_SIZE) + self.run_model_test( + MyModule(), input=(x, batch1, batch2), train=False, batch_size=BATCH_SIZE + ) @skipIfUnsupportedMinOpsetVersion(9) def test_gelu(self): class GeluModel(torch.nn.Module): def forward(self, x): - return torch.nn.functional.gelu(x) + return torch.nn.functional.gelu(x, approximate="none") + + model = GeluModel() + inputs = torch.randn(2, 4, 5, 6, requires_grad=True) + self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE) + + @skipIfUnsupportedMinOpsetVersion(9) + def test_tanh_gelu(self): + class GeluModel(torch.nn.Module): + def forward(self, x): + return torch.nn.functional.gelu(x, approximate="tanh") model = GeluModel() inputs = torch.randn(2, 4, 5, 6, requires_grad=True) @@ -2397,7 +3077,9 @@ def forward(self, input): return input.index_fill(2, index, -1) x = torch.randn(3, 4, 5, requires_grad=True) - self.run_model_test(IndexFillModel(), input=(x, ), train=False, batch_size=BATCH_SIZE) + self.run_model_test( + IndexFillModel(), input=(x,), train=False, batch_size=BATCH_SIZE + ) @skipIfUnsupportedMinOpsetVersion(9) def test_index_copy(self): @@ -2408,19 +3090,37 @@ def forward(self, input): return input.index_copy(1, index, source) x = torch.randn(3, 4, 5, requires_grad=True) - self.run_model_test(IndexCopyModel(), input=(x, ), train=False, batch_size=BATCH_SIZE) + self.run_model_test( + IndexCopyModel(), input=(x,), train=False, batch_size=BATCH_SIZE + ) + # a bit of metaprogramming to set up all the rnn tests -def make_test(name, base, layer, bidirectional, initial_state, - variable_length, dropout, - **extra_kwargs): - test_name = str("_".join([ - "test", name, layer[1], - bidirectional[1], initial_state[1], - variable_length[1], dropout[1] - ])) +def make_test( + name, + base, 
+ layer, + bidirectional, + initial_state, + variable_length, + dropout, + **extra_kwargs +): + test_name = str( + "_".join( + [ + "test", + name, + layer[1], + bidirectional[1], + initial_state[1], + variable_length[1], + dropout[1], + ] + ) + ) @unittest.skip("Disabled due to onnx optimizer deprecation") @skipIfUnsupportedOpsetVersion([10]) @@ -2433,53 +3133,54 @@ def f(self): initial_state=initial_state[0], packed_sequence=variable_length[0], dropout=dropout[0], - **extra_kwargs) + **extra_kwargs + ) f.__name__ = test_name setattr(TestCaffe2Backend_opset9, f.__name__, f) def setup_rnn_tests(): - layers_opts = [ - (1, "unilayer"), - (3, "trilayer") - ] - bidirectional_opts = [ - (False, "forward"), - (True, "bidirectional") - ] - initial_state_opts = [ - (True, "with_initial_state"), - (False, "no_initial_state") - ] + layers_opts = [(1, "unilayer"), (3, "trilayer")] + bidirectional_opts = [(False, "forward"), (True, "bidirectional")] + initial_state_opts = [(True, "with_initial_state"), (False, "no_initial_state")] variable_length_opts = [ (0, "without_sequence_lengths"), (1, "with_variable_length_sequences"), - (2, "with_batch_first_sequence_lengths") - ] - dropout_opts = [ - (0.2, "with_dropout"), - (0.0, "without_dropout") + (2, "with_batch_first_sequence_lengths"), ] + dropout_opts = [(0.2, "with_dropout"), (0.0, "without_dropout")] test_count = 0 - for (layer, bidirectional, initial_state, variable_length, dropout) in \ - itertools.product( - layers_opts, - bidirectional_opts, - initial_state_opts, - variable_length_opts, - dropout_opts, + for ( + layer, + bidirectional, + initial_state, + variable_length, + dropout, + ) in itertools.product( + layers_opts, + bidirectional_opts, + initial_state_opts, + variable_length_opts, + dropout_opts, ): for base, name, extra_kwargs in ( - ("elman", "elman_relu", {"nonlinearity": u"relu"}), - ("elman", "elman_tanh", {"nonlinearity": u"tanh"}), - ("lstm", "lstm", {}), - ("gru", "gru", {}) + ("elman", "elman_relu", {"nonlinearity": "relu"}), + ("elman", "elman_tanh", {"nonlinearity": "tanh"}), + ("lstm", "lstm", {}), + ("gru", "gru", {}), ): - make_test(name, base, layer, bidirectional, initial_state, - variable_length, dropout, - **extra_kwargs) + make_test( + name, + base, + layer, + bidirectional, + initial_state, + variable_length, + dropout, + **extra_kwargs + ) test_count += 1 # sanity check that a representative example does exist @@ -2488,47 +3189,62 @@ def setup_rnn_tests(): # make sure no one accidentally disables all the tests without # noticing assert test_count == 192, test_count + + setup_rnn_tests() # add the same test suite as above, but switch embed_params=False # to embed_params=True -TestCaffe2BackendEmbed_opset9 = type(str("TestCaffe2BackendEmbed_opset9"), - (unittest.TestCase,), - dict(TestCaffe2Backend_opset9.__dict__, embed_params=True)) +TestCaffe2BackendEmbed_opset9 = type( + str("TestCaffe2BackendEmbed_opset9"), + (unittest.TestCase,), + dict(TestCaffe2Backend_opset9.__dict__, embed_params=True), +) # opset 7 tests -TestCaffe2Backend_opset7 = type(str("TestCaffe2Backend_opset7"), - (unittest.TestCase,), - dict(TestCaffe2Backend_opset9.__dict__, opset_version=7)) -TestCaffe2BackendEmbed_opset7 = type(str("TestCaffe2BackendEmbed_opset7"), - (unittest.TestCase,), - dict(TestCaffe2Backend_opset9.__dict__, - embed_params=True, opset_version=7)) +TestCaffe2Backend_opset7 = type( + str("TestCaffe2Backend_opset7"), + (unittest.TestCase,), + dict(TestCaffe2Backend_opset9.__dict__, opset_version=7), +) 
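# A minimal, self-contained sketch of the type()-based idiom preserved by the
# reformatted hunks above: one TestCase subclass is cloned via its class dict,
# overriding attributes such as opset_version / embed_params so the same tests
# run under several export configurations. Class names below are hypothetical,
# not the ones from the PyTorch suite.
import unittest


class BackendTests_opset9(unittest.TestCase):
    opset_version = 9
    embed_params = False

    def test_configuration_is_set(self):
        # Each generated class sees its own overridden class attributes.
        self.assertIsInstance(self.opset_version, int)


# Clone the suite: copy the class dict and override selected attributes.
BackendTests_opset7 = type(
    "BackendTests_opset7",
    (unittest.TestCase,),
    dict(BackendTests_opset9.__dict__, opset_version=7),
)
BackendTestsEmbed_opset7 = type(
    "BackendTestsEmbed_opset7",
    (unittest.TestCase,),
    dict(BackendTests_opset9.__dict__, opset_version=7, embed_params=True),
)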
+TestCaffe2BackendEmbed_opset7 = type( + str("TestCaffe2BackendEmbed_opset7"), + (unittest.TestCase,), + dict(TestCaffe2Backend_opset9.__dict__, embed_params=True, opset_version=7), +) # opset 8 tests -TestCaffe2Backend_opset8 = type(str("TestCaffe2Backend_opset8"), - (unittest.TestCase,), - dict(TestCaffe2Backend_opset9.__dict__, opset_version=8)) -TestCaffe2BackendEmbed_opset8 = type(str("TestCaffe2BackendEmbed_opset8"), - (unittest.TestCase,), - dict(TestCaffe2Backend_opset9.__dict__, - embed_params=True, opset_version=8)) +TestCaffe2Backend_opset8 = type( + str("TestCaffe2Backend_opset8"), + (unittest.TestCase,), + dict(TestCaffe2Backend_opset9.__dict__, opset_version=8), +) +TestCaffe2BackendEmbed_opset8 = type( + str("TestCaffe2BackendEmbed_opset8"), + (unittest.TestCase,), + dict(TestCaffe2Backend_opset9.__dict__, embed_params=True, opset_version=8), +) # opset 10 tests -TestCaffe2Backend_opset10 = type(str("TestCaffe2Backend_opset10"), - (unittest.TestCase,), - dict(TestCaffe2Backend_opset9.__dict__, opset_version=10)) - -TestCaffe2BackendEmbed_opset10 = type(str("TestCaffe2BackendEmbed_opset10"), - (unittest.TestCase,), - dict(TestCaffe2Backend_opset9.__dict__, - embed_params=True, opset_version=10)) +TestCaffe2Backend_opset10 = type( + str("TestCaffe2Backend_opset10"), + (unittest.TestCase,), + dict(TestCaffe2Backend_opset9.__dict__, opset_version=10), +) + +TestCaffe2BackendEmbed_opset10 = type( + str("TestCaffe2BackendEmbed_opset10"), + (unittest.TestCase,), + dict(TestCaffe2Backend_opset9.__dict__, embed_params=True, opset_version=10), +) # add the same test suite as above, but switch embed_params=False # to embed_params=True -TestCaffe2BackendEmbed_opset9_new_jit_API = type(str("TestCaffe2BackendEmbed_opset9_new_jit_API"), - (unittest.TestCase,), - dict(TestCaffe2Backend_opset9.__dict__, embed_params=True)) +TestCaffe2BackendEmbed_opset9_new_jit_API = type( + str("TestCaffe2BackendEmbed_opset9_new_jit_API"), + (unittest.TestCase,), + dict(TestCaffe2Backend_opset9.__dict__, embed_params=True), +) if __name__ == "__main__": unittest.main() diff --git a/test/onnx/test_pytorch_onnx_caffe2_quantized.py b/test/onnx/test_pytorch_onnx_caffe2_quantized.py index b427b85a2b56..2bd8ac54941f 100644 --- a/test/onnx/test_pytorch_onnx_caffe2_quantized.py +++ b/test/onnx/test_pytorch_onnx_caffe2_quantized.py @@ -1,19 +1,21 @@ # Owner(s): ["module: unknown"] -import numpy as np -import unittest -import torch.onnx -import torch.nn as nn -import torch.nn.quantized as nnq import io +import unittest +import numpy as np import onnx -import caffe2.python.onnx.backend as c2 -class TestQuantizedOps(unittest.TestCase): +import caffe2.python.onnx.backend as c2 +import torch.nn as nn +import torch.nn.quantized as nnq +import torch.onnx - def generic_test(self, model, sample_inputs, input_names=None, decimal=3, relaxed_check=False): +class TestQuantizedOps(unittest.TestCase): + def generic_test( + self, model, sample_inputs, input_names=None, decimal=3, relaxed_check=False + ): torch.backends.quantized.engine = "qnnpack" pt_inputs = tuple(torch.from_numpy(x) for x in sample_inputs) model.qconfig = torch.ao.quantization.get_default_qconfig("qnnpack") @@ -30,8 +32,15 @@ def generic_test(self, model, sample_inputs, input_names=None, decimal=3, relaxe output = q_model(*pt_inputs) f = io.BytesIO() - torch.onnx.export(q_model, pt_inputs, f, input_names=input_names, - operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK) + torch.onnx.export( + q_model, + pt_inputs, + f, + 
input_names=input_names, + operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, + # Caffe2 doesn't support newer opset versions + opset_version=9, + ) f.seek(0) onnx_model = onnx.load(f) caffe_res = c2.run_model(onnx_model, dict(zip(input_names, sample_inputs)))[0] @@ -45,10 +54,13 @@ def generic_test(self, model, sample_inputs, input_names=None, decimal=3, relaxe # This check had to be changed to account for changes in # qnnpack's requant logic. - np.testing.assert_(max_diff <= 1, "Maximum absolute difference must be less than 1") + np.testing.assert_( + max_diff <= 1, "Maximum absolute difference must be less than 1" + ) else: - np.testing.assert_almost_equal(output.detach().numpy(), caffe_res, decimal=decimal) - + np.testing.assert_almost_equal( + output.detach().numpy(), caffe_res, decimal=decimal + ) def generic_unary_test(self, op): class QModule(torch.nn.Module): @@ -65,7 +77,6 @@ def forward(self, x): x = np.random.random((1, 2)).astype("float32") self.generic_test(QModule(op), (x,), input_names=["x"]) - def test_quantized_add(self): class QAddModule(torch.nn.Module): def __init__(self): @@ -93,8 +104,15 @@ def export_to_onnx(self, model, input, input_names): model = torch.jit.load(buf) f = io.BytesIO() - torch.onnx.export(model, input, f, input_names=input_names, - operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK) + torch.onnx.export( + model, + input, + f, + input_names=input_names, + operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, + # Caffe2 doesn't support newer opset versions + opset_version=9, + ) f.seek(0) onnx_model = onnx.load(f) @@ -105,7 +123,9 @@ class LinearModel(torch.nn.Module): def __init__(self): super(LinearModel, self).__init__() self.qconfig = torch.ao.quantization.default_qconfig - self.fc1 = torch.ao.quantization.QuantWrapper(torch.nn.Linear(5, 10).to(dtype=torch.float)) + self.fc1 = torch.ao.quantization.QuantWrapper( + torch.nn.Linear(5, 10).to(dtype=torch.float) + ) def forward(self, x): x = self.fc1(x) @@ -131,18 +151,23 @@ def forward(self, x): # Permute pytorch output to NHWC # This check had to be changed to account for changes in # qnnpack's requant logic. - np.testing.assert_(max_diff <= 1, "Maximum absolute difference must be less than 1") + np.testing.assert_( + max_diff <= 1, "Maximum absolute difference must be less than 1" + ) def test_qconv_model(self): class ConvModel(torch.nn.Module): def __init__(self): super(ConvModel, self).__init__() self.qconfig = torch.ao.quantization.default_qconfig - self.fc1 = torch.ao.quantization.QuantWrapper(torch.nn.Conv2d(3, 5, 2, bias=True).to(dtype=torch.float)) + self.fc1 = torch.ao.quantization.QuantWrapper( + torch.nn.Conv2d(3, 5, 2, bias=True).to(dtype=torch.float) + ) def forward(self, x): x = self.fc1(x) return x + torch.backends.quantized.engine = "qnnpack" qconfig = torch.ao.quantization.default_qconfig model = ConvModel() @@ -164,7 +189,9 @@ def forward(self, x): # Permute pytorch output to NHWC # This check had to be changed to account for changes in # qnnpack's requant logic. 
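# Relaxed check: only require the maximum absolute difference between the
# PyTorch and Caffe2 outputs to stay within 1.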
- np.testing.assert_(max_diff <= 1, "Maximum absolute difference must be less than 1") + np.testing.assert_( + max_diff <= 1, "Maximum absolute difference must be less than 1" + ) def test_upsample(self): class QUpsampleModule(torch.nn.Module): @@ -174,7 +201,9 @@ def __init__(self): self.dequant = torch.ao.quantization.DeQuantStub() def forward(self, x): - res = torch.nn.quantized.functional.interpolate(self.quant1(x), size=[6, 8], mode="nearest") + res = torch.nn.quantized.functional.interpolate( + self.quant1(x), size=[6, 8], mode="nearest" + ) return self.dequant(res) x = np.random.rand(1, 2, 3, 4).astype("float32") @@ -188,11 +217,15 @@ def __init__(self): self.dequant = torch.ao.quantization.DeQuantStub() def forward(self, x): - res = torch.nn.functional.avg_pool2d(self.quant1(x), kernel_size=2, stride=1, padding=0) + res = torch.nn.functional.avg_pool2d( + self.quant1(x), kernel_size=2, stride=1, padding=0 + ) return self.dequant(res) x = np.random.rand(1, 2, 8, 8).astype("float32") - self.generic_test(QAvgPool2dModule(), (x,), input_names=["x"], relaxed_check=True) + self.generic_test( + QAvgPool2dModule(), (x,), input_names=["x"], relaxed_check=True + ) def test_reshape(self): class QReshapeModule(torch.nn.Module): @@ -231,12 +264,21 @@ def __init__(self): self.dequant = torch.ao.quantization.DeQuantStub() def forward(self, x, y): - res = torch.ops.quantized.cat([self.quant1(x), self.quant1(y)], dim=1, scale=1.0, zero_point=0) + res = torch.ops.quantized.cat( + [self.quant1(x), self.quant1(y)], dim=1, scale=1.0, zero_point=0 + ) return self.dequant(res) x = np.random.rand(1, 2, 3, 4).astype("float32") y = np.random.rand(1, 4, 3, 4).astype("float32") - self.generic_test(QConcatModule(), (x, y,), input_names=["x", "y"]) + self.generic_test( + QConcatModule(), + ( + x, + y, + ), + input_names=["x", "y"], + ) def test_max_pool2d(self): class QMaxPool2dModule(torch.nn.Module): @@ -246,7 +288,9 @@ def __init__(self): self.dequant = torch.ao.quantization.DeQuantStub() def forward(self, x): - res = torch.nn.functional.max_pool2d(self.quant1(x), kernel_size=2, stride=1, padding=0) + res = torch.nn.functional.max_pool2d( + self.quant1(x), kernel_size=2, stride=1, padding=0 + ) return self.dequant(res) x = np.random.rand(1, 2, 8, 8).astype("float32") @@ -288,7 +332,7 @@ def __init__(self): super().__init__( nn.Conv2d(3, 3, 1, 1, bias=False), nn.BatchNorm2d(3), - nn.ReLU(inplace=False) + nn.ReLU(inplace=False), ) class ModelWithClassifierHead(nn.Module): @@ -318,14 +362,20 @@ def forward(self, x): return x model = ModelWithClassifierHead().eval() - torch.ao.quantization.fuse_modules(model, [["conv1", "relu1"] , - ["features.0.0", "features.0.1", "features.0.2"], - ["features.1.0", "features.1.1", "features.1.2"], - ["features.2.0", "features.2.1", "features.2.2"]], inplace=True) - + torch.ao.quantization.fuse_modules( + model, + [ + ["conv1", "relu1"], + ["features.0.0", "features.0.1", "features.0.2"], + ["features.1.0", "features.1.1", "features.1.2"], + ["features.2.0", "features.2.1", "features.2.2"], + ], + inplace=True, + ) x = np.random.rand(1, 3, 10, 10).astype("float32") self.generic_test(model, (x,), input_names=["x"], relaxed_check=True) + if __name__ == "__main__": unittest.main() diff --git a/test/onnx/test_pytorch_onnx_no_runtime.py b/test/onnx/test_pytorch_onnx_no_runtime.py new file mode 100644 index 000000000000..429e3ba82ed6 --- /dev/null +++ b/test/onnx/test_pytorch_onnx_no_runtime.py @@ -0,0 +1,114 @@ +# Owner(s): ["module: onnx"] + +"""Tests for onnx export that don't run 
the exported model.""" + +import io +import unittest +from typing import Optional, Type + +import onnx + +import torch +from torch import Tensor +from torch.onnx import symbolic_helper +from torch.testing._internal.common_utils import ( + instantiate_parametrized_tests, + parametrize, +) + + +class TestOptionalOutput(unittest.TestCase): + # TODO: Move these tests to test_pytorch_onnx_onnxruntime once + # ONNX Runtime 1.11 is released and supports opset 16. + + class IfNoneInput(torch.nn.Module): + def forward(self, x) -> Optional[Tensor]: + y: Optional[Tensor] = None + if x.size(0) > 1: + y = x + return y + + class IfNoneOutput(torch.nn.Module): + def forward(self, x) -> Optional[Tensor]: + y: Optional[Tensor] = x + if x.size(0) > 1: + y = None + return y + + class LoopNoneInput(torch.nn.Module): + def forward(self, x) -> Optional[Tensor]: + y: Optional[Tensor] = None + for _ in range(x.size(0)): + y = x + return y + + class LoopNoneOutput(torch.nn.Module): + def forward(self, x) -> Optional[Tensor]: + y: Optional[Tensor] = x + for _ in range(x.size(0)): + y = None + return y + + @parametrize( + "module_class", + (IfNoneInput, IfNoneOutput, LoopNoneInput, LoopNoneOutput), + name_fn=lambda module_class: module_class.__name__, + ) + @parametrize("x_size", (0, 1), name_fn=lambda x_size: str(x_size)) + def test_optional_output(self, module_class: Type[torch.nn.Module], x_size: int): + # Need scripting to preserve control flow for this test to be meaningful. + model = torch.jit.script(module_class()) + f = io.BytesIO() + x = torch.ones(x_size) + dynamic_axis_name = "condition" + torch.onnx.export( + model, + (x,), + f, + opset_version=15, + # Ensure condition is not constant + dynamic_axes={"x": {0: dynamic_axis_name}}, + input_names=["x"], + ) + exported = onnx.load_from_string(f.getvalue()) + expected_elem_type = symbolic_helper.scalar_type_to_onnx[ + symbolic_helper.scalar_type_to_pytorch_type.index(x.dtype) + ].value + expected_output_type = onnx.helper.make_optional_type_proto( + onnx.helper.make_tensor_type_proto(expected_elem_type, (dynamic_axis_name,)) + ) + self.assertEqual(expected_output_type, exported.graph.output[0].type) + for node in exported.graph.node: + # Both branches output types should match. 
+ if node.op_type == "If": + for attr in node.attribute: + if attr.name in ("then_branch", "else_branch"): + self.assertEqual(expected_output_type, attr.g.output[0].type) + + def test_uninitialized_optional(self): + class Module(torch.nn.Module): + def forward(self, y: Optional[Tensor]) -> Optional[Tensor]: + if y is not None: + if y.shape[1] < 5: + if y.size(0) == 1: + y = y + 4 + else: + return y + return y + + y = torch.ones((3, 4), dtype=torch.int) + torch.onnx.export( + torch.jit.script(Module()), + y, + io.BytesIO(), + opset_version=15, + dynamic_axes={"y": {0: "y0", 1: "y1"}}, + input_names=["y"], + ) + + +instantiate_parametrized_tests(TestOptionalOutput) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index af4c6b9ec5bb..a3bb26571c59 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -1,65 +1,84 @@ # Owner(s): ["module: onnx"] -import unittest -import onnxruntime -import torch -import torchvision - -import numpy as np +import copy import io import itertools -import copy import os import random +import unittest +from collections import OrderedDict +from typing import Dict, List, Optional, Tuple, Union import model_defs.word_language_model as word_language_model +import numpy as np import onnx - -import torch.nn.functional as F -from torch.nn.utils import rnn as rnn_utils -from model_defs.lstm_flattening_result import (LstmFlatteningResultWithSeqLength, - LstmFlatteningResultWithoutSeqLength) -from model_defs.rnn_model_with_packed_sequence import (RnnModelWithPackedSequence, - RnnModelWithPackedSequenceWithState, - RnnModelWithPackedSequenceWithoutState) -from test_pytorch_common import (skipIfUnsupportedMinOpsetVersion, skipIfUnsupportedOpsetVersion, - skipIfNoLapack, disableScriptTest, skipIfONNXShapeInference, - skipIfUnsupportedMaxOpsetVersion, skipForAllOpsetVersions) -from test_pytorch_common import BATCH_SIZE -from test_pytorch_common import RNN_BATCH_SIZE, RNN_SEQUENCE_LENGTH, RNN_INPUT_SIZE, RNN_HIDDEN_SIZE -from typing import List, Tuple, Optional, Dict -from torch import Tensor - +import onnxruntime +import torchvision +from model_defs.lstm_flattening_result import ( + LstmFlatteningResultWithoutSeqLength, + LstmFlatteningResultWithSeqLength, +) +from model_defs.rnn_model_with_packed_sequence import ( + RnnModelWithPackedSequence, + RnnModelWithPackedSequenceWithoutState, + RnnModelWithPackedSequenceWithState, +) +from test_pytorch_common import ( + BATCH_SIZE, + RNN_BATCH_SIZE, + RNN_HIDDEN_SIZE, + RNN_INPUT_SIZE, + RNN_SEQUENCE_LENGTH, + skipIfNoLapack, + skipIfUnsupportedMaxOpsetVersion, + skipIfUnsupportedMinOpsetVersion, + skipIfUnsupportedOpsetVersion, + skipScriptTest, +) from torchvision import ops +from torchvision.models.detection.faster_rcnn import FastRCNNPredictor, TwoMLPHead from torchvision.models.detection.image_list import ImageList -from torchvision.models.detection.transform import GeneralizedRCNNTransform -from torchvision.models.detection.rpn import AnchorGenerator, RPNHead, RegionProposalNetwork from torchvision.models.detection.roi_heads import RoIHeads -from torchvision.models.detection.faster_rcnn import FastRCNNPredictor, TwoMLPHead -from collections import OrderedDict +from torchvision.models.detection.rpn import ( + AnchorGenerator, + RegionProposalNetwork, + RPNHead, +) +from torchvision.models.detection.transform import GeneralizedRCNNTransform +import torch +import torch.nn.functional 
as F +from torch import Tensor +from torch.nn.utils import rnn as rnn_utils from torch.nn.utils.rnn import PackedSequence -from torch.onnx import CheckerError, register_custom_op_symbolic, unregister_custom_op_symbolic +from torch.onnx import ( + CheckerError, + register_custom_op_symbolic, + unregister_custom_op_symbolic, +) from torch.onnx.symbolic_helper import _unimplemented +from torch.onnx.utils import unpack_quantized_tensor + +_ORT_PROVIDERS = ["CPUExecutionProvider"] def flatten_tuples(elem): - tup = [] + flattened = [] for t in elem: - if isinstance(t, (tuple)): - tup += flatten_tuples(t) + if isinstance(t, tuple): + flattened.extend(flatten_tuples(t)) else: - tup += [t] - return tup + flattened.append(t) + return flattened def to_numpy(elem): - if isinstance(elem, torch.Tensor): + if isinstance(elem, Tensor): if elem.requires_grad: return elem.detach().cpu().numpy() else: return elem.cpu().numpy() - elif isinstance(elem, list) or isinstance(elem, tuple): + elif isinstance(elem, (list, tuple)): return [to_numpy(inp) for inp in elem] elif isinstance(elem, bool): return np.array(elem, dtype=bool) @@ -68,72 +87,123 @@ def to_numpy(elem): elif isinstance(elem, float): return np.array(elem, dtype=float) elif isinstance(elem, dict): - dict_ = [] + flattened = [] for k in elem: - dict_ += [to_numpy(k)] + [to_numpy(elem[k])] - return dict_ - else: - return RuntimeError("Input has unknown type.") - - -def convert_to_onnx(model, input=None, opset_version=9, do_constant_folding=True, - keep_initializers_as_inputs=True, dynamic_axes=None, - input_names=None, output_names=None, - fixed_batch_size=False, training=None, - onnx_shape_inference=True): - # export the model to ONNX + flattened += [to_numpy(k)] + [to_numpy(elem[k])] + return flattened + return elem + + +def convert_to_onnx( + model, + input=None, + opset_version=9, + do_constant_folding=True, + keep_initializers_as_inputs=True, + dynamic_axes=None, + input_names=None, + output_names=None, + fixed_batch_size=False, + training=None, + verbose=False, +): f = io.BytesIO() input_copy = copy.deepcopy(input) - torch.onnx._export(model, input_copy, f, - opset_version=opset_version, - do_constant_folding=do_constant_folding, - keep_initializers_as_inputs=keep_initializers_as_inputs, - dynamic_axes=dynamic_axes, - input_names=input_names, output_names=output_names, - fixed_batch_size=fixed_batch_size, training=training, - onnx_shape_inference=onnx_shape_inference) + torch.onnx._export( + model, + input_copy, + f, + opset_version=opset_version, + do_constant_folding=do_constant_folding, + keep_initializers_as_inputs=keep_initializers_as_inputs, + dynamic_axes=dynamic_axes, + input_names=input_names, + output_names=output_names, + fixed_batch_size=fixed_batch_size, + training=training, + verbose=verbose, + ) # compute onnxruntime output prediction so = onnxruntime.SessionOptions() # suppress ort warnings. # 0:Verbose, 1:Info, 2:Warning. 3:Error, 4:Fatal. Default is 2. 
so.log_severity_level = 3 - ort_sess = onnxruntime.InferenceSession(f.getvalue(), so) + ort_sess = onnxruntime.InferenceSession(f.getvalue(), so, providers=_ORT_PROVIDERS) return ort_sess def inline_flatten_list(inputs, res_list): for i in inputs: - res_list.append(i) if not isinstance(i, (list, tuple)) else inline_flatten_list(i, res_list) + res_list.append(i) if not isinstance(i, (list, tuple)) else inline_flatten_list( + i, res_list + ) return res_list -def run_ort(ort_sess, input): - input = flatten_tuples(input) - input = to_numpy(input) - ort_inputs = dict((ort_sess.get_inputs()[i].name, input) for i, input in enumerate(input)) +def unpack_to_numpy(values): + value_unpacked = [] + for value in values: + value_unpacked.extend(unpack_quantized_tensor(value)) + return [to_numpy(v) for v in value_unpacked] + + +def run_ort(ort_sess, inputs): + kw_inputs = {} + if inputs and isinstance(inputs[-1], dict): + kw_inputs = inputs[-1] + inputs = inputs[:-1] + inputs = unpack_to_numpy(flatten_tuples(inputs)) + ort_inputs = {} + for input_name, input in kw_inputs.items(): + ort_inputs[input_name] = to_numpy(input) + inputs = to_numpy(inputs) + ort_sess_inputs = ort_sess.get_inputs() + for i, input in enumerate(inputs): + if i == len(ort_sess_inputs) or ort_sess_inputs[i].name in ort_inputs: + raise ValueError( + f"got too many positional inputs. inputs: {inputs}. kw_inputs: {kw_inputs}" + ) + ort_inputs[ort_sess_inputs[i].name] = input ort_outs = ort_sess.run(None, ort_inputs) return inline_flatten_list(ort_outs, []) def ort_compare_with_pytorch(ort_outs, output, rtol, atol): output, _ = torch.jit._flatten(output) - outputs = [to_numpy(outp) for outp in output] + outputs = unpack_to_numpy(output) # compare onnxruntime and PyTorch results assert len(outputs) == len(ort_outs), "number of outputs differ" # compare onnxruntime and PyTorch results - [np.testing.assert_allclose(out, ort_out, rtol=rtol, atol=atol) for out, ort_out in zip(outputs, ort_outs)] + [ + np.testing.assert_allclose(out, ort_out, rtol=rtol, atol=atol) + for out, ort_out in zip(outputs, ort_outs) + ] -def run_model_test(self, model, batch_size=2, state_dict=None, - input=None, use_gpu=True, rtol=0.001, atol=1e-7, - do_constant_folding=True, dynamic_axes=None, - test_with_inputs=None, input_names=None, - output_names=None, fixed_batch_size=False, - dict_check=True, training=None, - remained_onnx_input_idx=None, flatten=True): +def run_model_test( + self, + model, + batch_size=2, + state_dict=None, + input=None, + use_gpu=True, + rtol=0.001, + atol=1e-7, + do_constant_folding=True, + dynamic_axes=None, + test_with_inputs=None, + input_names=None, + output_names=None, + fixed_batch_size=False, + dict_check=True, + training=None, + remained_onnx_input_idx=None, + flatten=True, + verbose=False, +): if training is not None and training == torch.onnx.TrainingMode.TRAINING: model.train() elif training is None or training == torch.onnx.TrainingMode.EVAL: @@ -141,12 +211,10 @@ def run_model_test(self, model, batch_size=2, state_dict=None, if input is None: input = torch.randn(batch_size, 3, 224, 224, requires_grad=True) with torch.no_grad(): - if isinstance(input, torch.Tensor): + if isinstance(input, (Tensor, dict)): input = (input,) # In-place operators will update input tensor data as well. # Thus inputs are replicated before every forward call. 
- if isinstance(input, dict): - input = (input,) input_args = copy.deepcopy(input) input_kwargs = {} if dict_check and isinstance(input_args[-1], dict): @@ -157,18 +225,25 @@ def run_model_test(self, model, batch_size=2, state_dict=None, output = model_copy(*input_args, **input_kwargs) except Exception: output = model(*input_args, **input_kwargs) - if isinstance(output, torch.Tensor): + if isinstance(output, Tensor): output = (output,) if not dict_check and isinstance(input[-1], dict): input = input + ({},) - ort_sess = convert_to_onnx(model, input=input, opset_version=self.opset_version, - do_constant_folding=do_constant_folding, - keep_initializers_as_inputs=self.keep_initializers_as_inputs, - dynamic_axes=dynamic_axes, input_names=input_names, - output_names=output_names, fixed_batch_size=fixed_batch_size, training=training, - onnx_shape_inference=self.onnx_shape_inference) + ort_sess = convert_to_onnx( + model, + input=input, + opset_version=self.opset_version, + do_constant_folding=do_constant_folding, + keep_initializers_as_inputs=self.keep_initializers_as_inputs, + dynamic_axes=dynamic_axes, + input_names=input_names, + output_names=output_names, + fixed_batch_size=fixed_batch_size, + training=training, + verbose=verbose, + ) # compute onnxruntime output prediction if remained_onnx_input_idx is not None: input_onnx = [] @@ -179,20 +254,21 @@ def run_model_test(self, model, batch_size=2, state_dict=None, input_copy = copy.deepcopy(input) if flatten: input_copy, _ = torch.jit._flatten(input_copy) - + elif input_copy and input_copy[-1] == {}: + # Handle empty kwargs (normally removed by flatten). + input_copy = input_copy[:-1] ort_outs = run_ort(ort_sess, input_copy) ort_compare_with_pytorch(ort_outs, output, rtol, atol) - # if additional test inputs are provided run the onnx # model with these inputs and check the outputs if test_with_inputs is not None: for test_input in test_with_inputs: - if isinstance(test_input, torch.Tensor): + if isinstance(test_input, Tensor): test_input = (test_input,) test_input_copy = copy.deepcopy(test_input) output = model(*test_input_copy) - if isinstance(output, torch.Tensor): + if isinstance(output, Tensor): output = (output,) if remained_onnx_input_idx is not None: test_input_onnx = [] @@ -230,13 +306,20 @@ def _init_test_rpn(): rpn_score_thresh = 0.0 rpn = RegionProposalNetwork( - rpn_anchor_generator, rpn_head, - rpn_fg_iou_thresh, rpn_bg_iou_thresh, - rpn_batch_size_per_image, rpn_positive_fraction, - rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_nms_thresh, - score_thresh=rpn_score_thresh) + rpn_anchor_generator, + rpn_head, + rpn_fg_iou_thresh, + rpn_bg_iou_thresh, + rpn_batch_size_per_image, + rpn_positive_fraction, + rpn_pre_nms_top_n, + rpn_post_nms_top_n, + rpn_nms_thresh, + score_thresh=rpn_score_thresh, + ) return rpn + def _init_test_roi_heads_faster_rcnn(): out_channels = 256 num_classes = 91 @@ -251,39 +334,74 @@ def _init_test_roi_heads_faster_rcnn(): box_detections_per_img = 100 box_roi_pool = ops.MultiScaleRoIAlign( - featmap_names=["0", "1", "2", "3"], - output_size=7, - sampling_ratio=2) + featmap_names=["0", "1", "2", "3"], output_size=7, sampling_ratio=2 + ) resolution = box_roi_pool.output_size[0] representation_size = 1024 - box_head = TwoMLPHead( - out_channels * resolution ** 2, - representation_size) + box_head = TwoMLPHead(out_channels * resolution**2, representation_size) representation_size = 1024 - box_predictor = FastRCNNPredictor( - representation_size, - num_classes) + box_predictor = 
FastRCNNPredictor(representation_size, num_classes) roi_heads = RoIHeads( - box_roi_pool, box_head, box_predictor, - box_fg_iou_thresh, box_bg_iou_thresh, - box_batch_size_per_image, box_positive_fraction, + box_roi_pool, + box_head, + box_predictor, + box_fg_iou_thresh, + box_bg_iou_thresh, + box_batch_size_per_image, + box_positive_fraction, bbox_reg_weights, - box_score_thresh, box_nms_thresh, box_detections_per_img) + box_score_thresh, + box_nms_thresh, + box_detections_per_img, + ) return roi_heads + +def _construct_tensor_for_quantization_test( + shape: Tuple[int, ...], + offset: Optional[Union[int, float]] = None, + max_val: Optional[Union[int, float]] = None, +) -> torch.Tensor: + """Helper function to generate weights and test inputs in a deterministic way. + + Due to difference in implementation details between PyTorch and ONNXRuntime, randomly generated + test data for quantization tests can be flaky. To help stablize the test, this helper function is + used to generate weights and test inputs in a deterministic way. + + Args: + shape (Tuple[int]): Shape for tensor to construct. + offset (Optional[Union[int, float]]): Offset to be added to the generated tensor. + max_val (Optional[Union[int, float]]): If any element within tensor has a larger absolute value than + max_val, the tensor will be scaled by max_val / tensor.abs().max(). This step is done after + applying offset. + """ + tensor = torch.arange(np.prod(shape), dtype=torch.float).view(shape) + if offset is not None: + tensor = tensor + offset + if max_val is not None and tensor.abs().max() > max_val: + tensor = tensor * max_val / tensor.abs().max() + return tensor + + def set_rng_seed(seed): torch.manual_seed(seed) random.seed(seed) np.random.seed(seed) -class TestONNXRuntime(unittest.TestCase): - from torch.onnx.symbolic_helper import _export_onnx_opset_version - opset_version = _export_onnx_opset_version + +class _TestONNXRuntime: + """Abstract base class for test cases. + + Intentionally not a sub-class of unittest.TestCase so that unittest / pytest + don't run it directly. unitest.TestCase is mixed in as another base class when + creating concrete sub-types. See MakeTestCase(). + """ + + opset_version = -1 # Sub-classes must override keep_initializers_as_inputs = True # For IR version 3 type export. - onnx_shape_inference = True def setUp(self): torch.manual_seed(0) @@ -298,37 +416,78 @@ def setUp(self): # This mostly happens in unit test, where we widely use torch.size or torch.shape. # So the output is only dependent on the input shape, not value. # remained_onnx_input_idx is used to indicate which pytorch model input idx is remained in ONNX model. 
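# run_test exports the model twice where possible: once after torch.jit.script
# (when script tests are enabled) and once via tracing, comparing ONNX Runtime
# results against eager PyTorch in both cases.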
- def run_test(self, model, input, rtol=1e-3, atol=1e-7, do_constant_folding=True, - batch_size=2, use_gpu=True, dynamic_axes=None, test_with_inputs=None, - input_names=None, output_names=None, fixed_batch_size=False, dict_check=True, - training=None, remained_onnx_input_idx=None): + def run_test( + self, + model, + input, + rtol=1e-3, + atol=1e-7, + do_constant_folding=True, + batch_size=2, + use_gpu=True, + dynamic_axes=None, + test_with_inputs=None, + input_names=None, + output_names=None, + fixed_batch_size=False, + dict_check=True, + training=None, + remained_onnx_input_idx=None, + verbose=False, + ): def _run_test(m, remained_onnx_input_idx, flatten=True): - return run_model_test(self, m, batch_size=batch_size, - input=input, use_gpu=use_gpu, rtol=rtol, atol=atol, - do_constant_folding=do_constant_folding, - dynamic_axes=dynamic_axes, test_with_inputs=test_with_inputs, - input_names=input_names, output_names=output_names, - fixed_batch_size=fixed_batch_size, dict_check=dict_check, - training=training, remained_onnx_input_idx=remained_onnx_input_idx, - flatten=flatten) + return run_model_test( + self, + m, + batch_size=batch_size, + input=input, + use_gpu=use_gpu, + rtol=rtol, + atol=atol, + do_constant_folding=do_constant_folding, + dynamic_axes=dynamic_axes, + test_with_inputs=test_with_inputs, + input_names=input_names, + output_names=output_names, + fixed_batch_size=fixed_batch_size, + dict_check=dict_check, + training=training, + remained_onnx_input_idx=remained_onnx_input_idx, + flatten=flatten, + verbose=verbose, + ) if isinstance(remained_onnx_input_idx, dict): - scripting_remained_onnx_input_idx = remained_onnx_input_idx['scripting'] - tracing_remained_onnx_input_idx = remained_onnx_input_idx['tracing'] + scripting_remained_onnx_input_idx = remained_onnx_input_idx["scripting"] + tracing_remained_onnx_input_idx = remained_onnx_input_idx["tracing"] else: scripting_remained_onnx_input_idx = remained_onnx_input_idx tracing_remained_onnx_input_idx = remained_onnx_input_idx - if self.is_script_test_enabled and not isinstance(model, torch.jit.ScriptModule): - script_model = torch.jit.script(model) - _run_test(script_model, scripting_remained_onnx_input_idx, flatten=False) + is_script = isinstance( + model, (torch.jit.ScriptModule, torch.jit.ScriptFunction) + ) - _run_test(model, tracing_remained_onnx_input_idx) + if self.is_script_test_enabled: + script_model = model if is_script else torch.jit.script(model) + _run_test(script_model, scripting_remained_onnx_input_idx, flatten=False) - def run_model_test_with_external_data(self, model, input, rtol=0.001, atol=1e-7, - do_constant_folding=True, dynamic_axes=None, - input_names=None, output_names=None, - ort_optim_on=True, training=None): + if not is_script: + _run_test(model, tracing_remained_onnx_input_idx) + + def run_model_test_with_external_data( + self, + model, + input, + rtol=0.001, + atol=1e-7, + do_constant_folding=True, + dynamic_axes=None, + input_names=None, + output_names=None, + ort_optim_on=True, + training=None, + ): import os import tempfile @@ -337,41 +496,51 @@ def run_model_test_with_external_data(self, model, input, rtol=0.001, atol=1e-7, elif training is None or training == torch.onnx.TrainingMode.EVAL: model.eval() with torch.no_grad(): - if isinstance(input, torch.Tensor): + if isinstance(input, Tensor): input = (input,) # In-place operators will update input tensor data as well. # Thus inputs are replicated before every forward call. 
input_copy = copy.deepcopy(input) output = model(*input_copy) - if isinstance(output, torch.Tensor): + if isinstance(output, Tensor): output = (output,) # export the model to ONNX with tempfile.TemporaryDirectory() as tmpdirname: model_file_name = os.path.join(tmpdirname, "model.onnx") input_copy = copy.deepcopy(input) - torch.onnx.export(model, input_copy, model_file_name, - opset_version=self.opset_version, - verbose=False, - do_constant_folding=do_constant_folding, - keep_initializers_as_inputs=self.keep_initializers_as_inputs, - dynamic_axes=dynamic_axes, - input_names=input_names, output_names=output_names) + torch.onnx.export( + model, + input_copy, + model_file_name, + opset_version=self.opset_version, + verbose=False, + do_constant_folding=do_constant_folding, + keep_initializers_as_inputs=self.keep_initializers_as_inputs, + dynamic_axes=dynamic_axes, + input_names=input_names, + output_names=output_names, + ) # compute onnxruntime output prediction ort_sess_opt = onnxruntime.SessionOptions() - ort_sess_opt.graph_optimization_level = \ - onnxruntime.GraphOptimizationLevel.ORT_ENABLE_EXTENDED if ort_optim_on else \ - onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL + ort_sess_opt.graph_optimization_level = ( + onnxruntime.GraphOptimizationLevel.ORT_ENABLE_EXTENDED + if ort_optim_on + else onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL + ) # suppress ort warnings. # 0:Verbose, 1:Info, 2:Warning. 3:Error, 4:Fatal. Default is 2. ort_sess_opt.log_severity_level = 3 - ort_sess = onnxruntime.InferenceSession(model_file_name, sess_options=ort_sess_opt) + ort_sess = onnxruntime.InferenceSession( + model_file_name, sess_options=ort_sess_opt, providers=_ORT_PROVIDERS + ) input_copy = copy.deepcopy(input) ort_outs = run_ort(ort_sess, input_copy) ort_compare_with_pytorch(ort_outs, output, rtol, atol) - - @skipIfUnsupportedMinOpsetVersion(9) # Because external data format was released with Opset 9. + @skipIfUnsupportedMinOpsetVersion( + 9 + ) # Because external data format was released with Opset 9. def test_embedding_model_with_external_data(self): class LargeModel(torch.nn.Module): def __init__(self): @@ -392,13 +561,15 @@ def forward(self, input): x = torch.tensor([2], dtype=torch.long) self.run_model_test_with_external_data(model, x) - @skipIfUnsupportedMinOpsetVersion(9) # Because external data format was released with Opset 9. + @skipIfUnsupportedMinOpsetVersion( + 9 + ) # Because external data format was released with Opset 9. def test_large_model_with_external_data(self): class LargeModel(torch.nn.Module): def __init__(self): super(LargeModel, self).__init__() dim = 5 - n = 40 * 4 * 10 ** 6 + n = 40 * 4 * 10**6 self.emb = torch.nn.Embedding(n, dim) self.lin1 = torch.nn.Linear(dim, 1) self.seq = torch.nn.Sequential( @@ -412,13 +583,15 @@ def forward(self, input): x = torch.tensor([2], dtype=torch.long) self.run_model_test_with_external_data(LargeModel(), x) - @skipIfUnsupportedMinOpsetVersion(9) # Because external data format was released with Opset 9. + @skipIfUnsupportedMinOpsetVersion( + 9 + ) # Because external data format was released with Opset 9. 
def test_large_model_with_non_str_file(self): class LargeModel(torch.nn.Module): def __init__(self): super(LargeModel, self).__init__() dim = 5 - n = 40 * 4 * 10 ** 6 + n = 40 * 4 * 10**6 self.emb = torch.nn.Embedding(n, dim) self.lin1 = torch.nn.Linear(dim, 1) self.seq = torch.nn.Sequential( @@ -431,9 +604,11 @@ def forward(self, input): x = torch.tensor([2], dtype=torch.long) f = io.BytesIO() - err_msg = ("The serialized model is larger than the 2GiB limit imposed by the protobuf library. " - "Therefore the output file must be a file path, so that the ONNX external data can be written to " - "the same directory. Please specify the output file name.") + err_msg = ( + "The serialized model is larger than the 2GiB limit imposed by the protobuf library. " + "Therefore the output file must be a file path, so that the ONNX external data can be written to " + "the same directory. Please specify the output file name." + ) with self.assertRaisesRegex(RuntimeError, err_msg): torch.onnx.export(LargeModel(), x, f) @@ -456,7 +631,9 @@ def test_fuse_conv_bn2d(self): class Fuse(torch.nn.Module): def __init__(self): super(Fuse, self).__init__() - self.conv = torch.nn.Conv2d(3, 2, kernel_size=1, stride=2, padding=3, bias=False) + self.conv = torch.nn.Conv2d( + 3, 2, kernel_size=1, stride=2, padding=3, bias=False + ) self.bn = torch.nn.BatchNorm2d(2) def forward(self, x): @@ -471,7 +648,9 @@ def test_fuse_conv_bn3d(self): class Fuse(torch.nn.Module): def __init__(self): super(Fuse, self).__init__() - self.conv = torch.nn.Conv3d(3, 2, (3, 5, 2), stride=(2, 1, 1), padding=(3, 2, 0), bias=False) + self.conv = torch.nn.Conv3d( + 3, 2, (3, 5, 2), stride=(2, 1, 1), padding=(3, 2, 0), bias=False + ) self.bn = torch.nn.BatchNorm3d(2) def forward(self, x): @@ -492,7 +671,7 @@ def __init__(self): kernel_size=3, stride=1, padding=2, - dilation=1 + dilation=1, ) self.bn = torch.nn.BatchNorm1d(5) @@ -510,9 +689,14 @@ def forward(self, x): model = Fuse() x = torch.randn(2, 5, 9, requires_grad=True) - self.run_test(torch.jit.script(model), (x,), - input_names=['x'], dynamic_axes={'x': [0, 2]}, - rtol=1e-3, atol=1e-6) + self.run_test( + torch.jit.script(model), + (x,), + input_names=["x"], + dynamic_axes={"x": [0, 2]}, + rtol=1e-3, + atol=1e-6, + ) def test_conv_tbc(self): from torch.nn.modules.utils import _single @@ -526,9 +710,9 @@ def __init__(self, in_channels, out_channels, kernel_size, padding=0): self.padding = _single(padding) self.weight = torch.nn.Parameter( - torch.Tensor(self.kernel_size[0], in_channels, out_channels) + Tensor(self.kernel_size[0], in_channels, out_channels) ) - self.bias = torch.nn.Parameter(torch.Tensor(out_channels)) + self.bias = torch.nn.Parameter(Tensor(out_channels)) self.reset_parameters() def reset_parameters(self): @@ -552,7 +736,9 @@ def forward(self, input): def test_reshape_constant_fold(self): class Reshape(torch.nn.Module): - def __init__(self, ): + def __init__( + self, + ): super(Reshape, self).__init__() self.register_buffer("weight", torch.ones(5)) @@ -572,23 +758,24 @@ def run_word_language_model(self, model_name): tied = False batchsize = 5 if model_name == "GRU": - model = word_language_model.RNNModelWithTensorHidden(model_name, ntokens, emsize, - nhid, nlayers, dropout, tied, - batchsize) + model = word_language_model.RNNModelWithTensorHidden( + model_name, ntokens, emsize, nhid, nlayers, dropout, tied, batchsize + ) elif model_name == "LSTM": - model = word_language_model.RNNModelWithTupleHidden(model_name, ntokens, emsize, - nhid, nlayers, dropout, tied, - batchsize) 
+ model = word_language_model.RNNModelWithTupleHidden( + model_name, ntokens, emsize, nhid, nlayers, dropout, tied, batchsize + ) else: - model = word_language_model.RNNModel(model_name, ntokens, emsize, - nhid, nlayers, dropout, tied, - batchsize) + model = word_language_model.RNNModel( + model_name, ntokens, emsize, nhid, nlayers, dropout, tied, batchsize + ) x = torch.arange(0, ntokens).long().view(-1, batchsize) # Only support CPU version, since tracer is not working in GPU RNN. self.run_test(model, (x, model.hidden)) - def get_image(self, rel_path: str, size: Tuple[int, int]) -> torch.Tensor: + def get_image(self, rel_path: str, size: Tuple[int, int]) -> Tensor: import os + from PIL import Image from torchvision import transforms @@ -598,29 +785,53 @@ def get_image(self, rel_path: str, size: Tuple[int, int]) -> torch.Tensor: return transforms.ToTensor()(image) - def get_test_images(self) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: - return ([self.get_image("grace_hopper_517x606.jpg", (100, 320))], - [self.get_image("rgb_pytorch.png", (250, 380))]) + def get_test_images(self) -> Tuple[List[Tensor], List[Tensor]]: + return ( + [self.get_image("grace_hopper_517x606.jpg", (100, 320))], + [self.get_image("rgb_pytorch.png", (250, 380))], + ) @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() # Faster RCNN model is not scriptable + @skipScriptTest() # Faster RCNN model is not scriptable def test_faster_rcnn(self): - model = torchvision.models.detection.faster_rcnn.fasterrcnn_resnet50_fpn(pretrained=False, min_size=200, - max_size=300) + model = torchvision.models.detection.faster_rcnn.fasterrcnn_resnet50_fpn( + pretrained=False, pretrained_backbone=True, min_size=200, max_size=300 + ) model.eval() x1 = torch.randn(3, 200, 300, requires_grad=True) x2 = torch.randn(3, 200, 300, requires_grad=True) self.run_test(model, ([x1, x2],), rtol=1e-3, atol=1e-5) - self.run_test(model, ([x1, x2],), input_names=["images_tensors"], output_names=["outputs"], - dynamic_axes={"images_tensors": [0, 1, 2], "outputs": [0, 1, 2]}, rtol=1e-3, atol=1e-5) + self.run_test( + model, + ([x1, x2],), + input_names=["images_tensors"], + output_names=["outputs"], + dynamic_axes={"images_tensors": [0, 1, 2], "outputs": [0, 1, 2]}, + rtol=1e-3, + atol=1e-5, + ) dummy_image = [torch.ones(3, 100, 100) * 0.3] images, test_images = self.get_test_images() - self.run_test(model, (images,), test_with_inputs=[(images, ), (test_images, ), (dummy_image, )], - input_names=["images_tensors"], output_names=["outputs"], - dynamic_axes={"images_tensors": [0, 1, 2], "outputs": [0, 1, 2]}, rtol=1e-3, atol=1e-5) - self.run_test(model, (dummy_image,), test_with_inputs=[(dummy_image, ), (images, )], - input_names=["images_tensors"], output_names=["outputs"], - dynamic_axes={"images_tensors": [0, 1, 2], "outputs": [0, 1, 2]}, rtol=1e-3, atol=1e-5) + self.run_test( + model, + (images,), + test_with_inputs=[(images,), (test_images,), (dummy_image,)], + input_names=["images_tensors"], + output_names=["outputs"], + dynamic_axes={"images_tensors": [0, 1, 2], "outputs": [0, 1, 2]}, + rtol=1e-3, + atol=1e-5, + ) + self.run_test( + model, + (dummy_image,), + test_with_inputs=[(dummy_image,), (images,)], + input_names=["images_tensors"], + output_names=["outputs"], + dynamic_axes={"images_tensors": [0, 1, 2], "outputs": [0, 1, 2]}, + rtol=1e-3, + atol=1e-5, + ) def test_paste_mask_in_image(self): masks = torch.rand(10, 1, 26, 26) @@ -629,12 +840,15 @@ def test_paste_mask_in_image(self): boxes *= 50 o_im_s = (100, 100) from 
torchvision.models.detection.roi_heads import paste_masks_in_image + out = paste_masks_in_image(masks, boxes, o_im_s) - jit_trace = torch.jit.trace(paste_masks_in_image, - (masks, boxes, - [torch.tensor(o_im_s[0]), - torch.tensor(o_im_s[1])])) - out_trace = jit_trace(masks, boxes, [torch.tensor(o_im_s[0]), torch.tensor(o_im_s[1])]) + jit_trace = torch.jit.trace( + paste_masks_in_image, + (masks, boxes, [torch.tensor(o_im_s[0]), torch.tensor(o_im_s[1])]), + ) + out_trace = jit_trace( + masks, boxes, [torch.tensor(o_im_s[0]), torch.tensor(o_im_s[1])] + ) assert torch.all(out.eq(out_trace)) @@ -644,35 +858,76 @@ def test_paste_mask_in_image(self): boxes2 *= 100 o_im_s2 = (200, 200) from torchvision.models.detection.roi_heads import paste_masks_in_image + out2 = paste_masks_in_image(masks2, boxes2, o_im_s2) - out_trace2 = jit_trace(masks2, boxes2, [torch.tensor(o_im_s2[0]), torch.tensor(o_im_s2[1])]) + out_trace2 = jit_trace( + masks2, boxes2, [torch.tensor(o_im_s2[0]), torch.tensor(o_im_s2[1])] + ) assert torch.all(out2.eq(out_trace2)) @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() + @skipScriptTest() def test_mask_rcnn(self): - model = torchvision.models.detection.mask_rcnn.maskrcnn_resnet50_fpn(pretrained=False, min_size=200, - max_size=300) + model = torchvision.models.detection.mask_rcnn.maskrcnn_resnet50_fpn( + pretrained=False, pretrained_backbone=True, min_size=200, max_size=300 + ) images, test_images = self.get_test_images() self.run_test(model, (images,), rtol=1e-3, atol=1e-5) - self.run_test(model, (images,), input_names=["images_tensors"], output_names=["boxes", "labels", "scores", "masks"], - dynamic_axes={"images_tensors": [0, 1, 2], "boxes": [0, 1], "labels": [0], - "scores": [0], "masks": [0, 1, 2]}, rtol=1e-3, atol=1e-5) + self.run_test( + model, + (images,), + input_names=["images_tensors"], + output_names=["boxes", "labels", "scores", "masks"], + dynamic_axes={ + "images_tensors": [0, 1, 2], + "boxes": [0, 1], + "labels": [0], + "scores": [0], + "masks": [0, 1, 2], + }, + rtol=1e-3, + atol=1e-5, + ) dummy_image = [torch.ones(3, 100, 100) * 0.3] - self.run_test(model, (images,), test_with_inputs=[(images,), (test_images,), (dummy_image,)], - input_names=["images_tensors"], output_names=["boxes", "labels", "scores", "masks"], - dynamic_axes={"images_tensors": [0, 1, 2], "boxes": [0, 1], "labels": [0], - "scores": [0], "masks": [0, 1, 2]}, rtol=1e-3, atol=1e-5) - self.run_test(model, (dummy_image,), test_with_inputs=[(dummy_image,), (images,)], - input_names=["images_tensors"], output_names=["boxes", "labels", "scores", "masks"], - dynamic_axes={"images_tensors": [0, 1, 2], "boxes": [0, 1], "labels": [0], - "scores": [0], "masks": [0, 1, 2]}, rtol=1e-3, atol=1e-5) + self.run_test( + model, + (images,), + test_with_inputs=[(images,), (test_images,), (dummy_image,)], + input_names=["images_tensors"], + output_names=["boxes", "labels", "scores", "masks"], + dynamic_axes={ + "images_tensors": [0, 1, 2], + "boxes": [0, 1], + "labels": [0], + "scores": [0], + "masks": [0, 1, 2], + }, + rtol=1e-3, + atol=1e-5, + ) + self.run_test( + model, + (dummy_image,), + test_with_inputs=[(dummy_image,), (images,)], + input_names=["images_tensors"], + output_names=["boxes", "labels", "scores", "masks"], + dynamic_axes={ + "images_tensors": [0, 1, 2], + "boxes": [0, 1], + "labels": [0], + "scores": [0], + "masks": [0, 1, 2], + }, + rtol=1e-3, + atol=1e-5, + ) def test_heatmaps_to_keypoints(self): maps = torch.rand(10, 1, 26, 26) rois = torch.rand(10, 4) from 
torchvision.models.detection.roi_heads import heatmaps_to_keypoints + out = heatmaps_to_keypoints(maps, rois) jit_trace = torch.jit.trace(heatmaps_to_keypoints, (maps, rois)) out_trace = jit_trace(maps, rois) @@ -683,6 +938,7 @@ def test_heatmaps_to_keypoints(self): maps2 = torch.rand(20, 2, 21, 21) rois2 = torch.rand(20, 4) from torchvision.models.detection.roi_heads import heatmaps_to_keypoints + out2 = heatmaps_to_keypoints(maps2, rois2) out_trace2 = jit_trace(maps2, rois2) @@ -691,46 +947,125 @@ def test_heatmaps_to_keypoints(self): @unittest.skip("Failing, see https://github.com/pytorch/pytorch/issues/66528") @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() + @skipScriptTest() def test_keypoint_rcnn(self): - model = torchvision.models.detection.keypoint_rcnn.keypointrcnn_resnet50_fpn(pretrained=False, min_size=200, - max_size=300) + model = torchvision.models.detection.keypoint_rcnn.keypointrcnn_resnet50_fpn( + pretrained=False, pretrained_backbone=False, min_size=200, max_size=300 + ) images, test_images = self.get_test_images() self.run_test(model, (images,), rtol=1e-3, atol=1e-5) - self.run_test(model, (images,), input_names=["images_tensors"], - output_names=["outputs1", "outputs2", "outputs3", "outputs4"], - dynamic_axes={"images_tensors": [0, 1, 2]}, - rtol=1e-3, atol=1e-5) + self.run_test( + model, + (images,), + input_names=["images_tensors"], + output_names=["outputs1", "outputs2", "outputs3", "outputs4"], + dynamic_axes={"images_tensors": [0, 1, 2]}, + rtol=1e-3, + atol=1e-5, + ) dummy_images = [torch.ones(3, 100, 100) * 0.3] - self.run_test(model, (images,), test_with_inputs=[(images, ), (test_images, ), (dummy_images, )], - input_names=["images_tensors"], output_names=["outputs1", "outputs2", "outputs3", "outputs4"], - dynamic_axes={"images_tensors": [0, 1, 2]}, - rtol=5e-3, atol=1e-5) - self.run_test(model, (dummy_images,), test_with_inputs=[(dummy_images, ), (test_images, )], - input_names=["images_tensors"], output_names=["outputs1", "outputs2", "outputs3", "outputs4"], - dynamic_axes={"images_tensors": [0, 1, 2]}, - rtol=5e-3, atol=1e-5) + self.run_test( + model, + (images,), + test_with_inputs=[(images,), (test_images,), (dummy_images,)], + input_names=["images_tensors"], + output_names=["outputs1", "outputs2", "outputs3", "outputs4"], + dynamic_axes={"images_tensors": [0, 1, 2]}, + rtol=5e-3, + atol=1e-5, + ) + self.run_test( + model, + (dummy_images,), + test_with_inputs=[(dummy_images,), (test_images,)], + input_names=["images_tensors"], + output_names=["outputs1", "outputs2", "outputs3", "outputs4"], + dynamic_axes={"images_tensors": [0, 1, 2]}, + rtol=5e-3, + atol=1e-5, + ) @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() + @skipScriptTest() def test_shufflenet_v2_dynamic_axes(self): model = torchvision.models.shufflenet_v2_x0_5(pretrained=False) dummy_input = torch.randn(1, 3, 224, 224, requires_grad=True) test_inputs = torch.randn(3, 3, 224, 224, requires_grad=True) - self.run_test(model, (dummy_input,), test_with_inputs=[(dummy_input,), (test_inputs,)], - input_names=["input_images"], output_names=["outputs"], - dynamic_axes={"input_images": {0: "batch_size"}, "output": {0: "batch_size"}}, - rtol=1e-3, atol=1e-5) + self.run_test( + model, + (dummy_input,), + test_with_inputs=[(dummy_input,), (test_inputs,)], + input_names=["input_images"], + output_names=["outputs"], + dynamic_axes={ + "input_images": {0: "batch_size"}, + "output": {0: "batch_size"}, + }, + rtol=1e-3, + atol=1e-5, + ) + + @skipScriptTest() + def test_mobilenet_v3(self): 
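# Exercises the float (unquantized) variant of the torchvision quantizable
# model; the quantized path is covered by test_mobilenet_v3_quant below.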
+ model = torchvision.models.quantization.mobilenet_v3_large(pretrained=False) + dummy_input = torch.randn(1, 3, 224, 224) + self.run_test(model, (dummy_input,)) + + @unittest.skip( + "Unstable loading pretrained quantized mobilenet v3: https://github.com/pytorch/vision/issues/5303" + ) + @skipIfUnsupportedMinOpsetVersion(10) + @skipScriptTest() + def test_mobilenet_v3_quant(self): + model = torchvision.models.quantization.mobilenet_v3_large( + pretrained=True, quantize=True + ) + from PIL import Image + from torchvision import transforms + + data_dir = os.path.join(os.path.dirname(__file__), "assets") + path = os.path.join(data_dir, "grace_hopper_517x606.jpg") + input_image = Image.open(path) + # Based on example from https://pytorch.org/hub/pytorch_vision_resnet/ + preprocess = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ), + ] + ) + input_tensor = preprocess(input_image).unsqueeze(0) + + # Due to precision error from quantization, check only that the top prediction matches. + class TopPredictor(torch.nn.Module): + def __init__(self, mobilenet): + super().__init__() + self.mobilenet = mobilenet + + def forward(self, x): + x = self.mobilenet(x) + _, topk_catid = torch.topk(x[0], 1) + return topk_catid - @disableScriptTest() + # Currently, we need convert the model to ScriptModule before export. + # The reason is that PackedParams contains int (not tensor). + # Then it fails when the exporter calls _trace_and_get_graph_from_model(). + # TODO: https://msdata.visualstudio.com/Vienna/_workitems/edit/1547858 + model = torch.jit.trace(TopPredictor(model), input_tensor) + self.run_test(model, (input_tensor,)) + + @skipScriptTest() def test_word_language_model_RNN_TANH(self): self.run_word_language_model("RNN_TANH") - @disableScriptTest() + @skipScriptTest() def test_word_language_model_RNN_RELU(self): self.run_word_language_model("RNN_RELU") - @disableScriptTest() # scripting prim::unchecked_cast prim::setattr + @skipScriptTest() # scripting prim::unchecked_cast prim::setattr def test_word_language_model_LSTM(self): self.run_word_language_model("LSTM") @@ -805,34 +1140,36 @@ def forward(self, input): m1 = torch.randn(3, 4, 5, 6, 7) self.run_test(MyModel(), m1) - @disableScriptTest() + @skipScriptTest() def test_dict(self): class MyModel(torch.nn.Module): def forward(self, x_in): x_out = {} - x_out["test_key_out"] = torch.add(x_in[list(x_in.keys())[0]], list(x_in.keys())[0]) + x_out["test_key_out"] = torch.add( + x_in[list(x_in.keys())[0]], list(x_in.keys())[0] + ) return x_out - x = {torch.tensor(1.): torch.randn(1, 2, 3)} + x = {torch.tensor(1.0): torch.randn(1, 2, 3)} self.run_test(MyModel(), (x, {})) - @disableScriptTest() + @skipScriptTest() def test_dict_str(self): class MyModel(torch.nn.Module): def forward(self, x_in): x_out = {} - x_out["test_key_out"] = torch.add(x_in["test_key_in"], 2.) 
+ x_out["test_key_out"] = torch.add(x_in["test_key_in"], 2.0) return x_out x = {"test_key_in": torch.randn(1, 2, 3)} self.run_test(MyModel(), (x, {})) - @disableScriptTest() # User-defined class not supported + @skipScriptTest() # User-defined class not supported def test_dict_output(self): class DictModelOutput(OrderedDict): - tensor_out: torch.Tensor - tuple_out: Optional[Tuple[torch.Tensor]] = None - list_out: Optional[List[torch.Tensor]] = None + tensor_out: Tensor + tuple_out: Optional[Tuple[Tensor]] = None + list_out: Optional[List[Tensor]] = None class MyModel(torch.nn.Module): def forward(self, a, b, c, d): @@ -872,7 +1209,7 @@ def forward(self, a, b, c, d): def test_tuple_input(self): class TupleModel(torch.nn.Module): - def forward(self, a: Tuple[torch.Tensor, torch.Tensor]): + def forward(self, a: Tuple[Tensor, Tensor]): return a x = (torch.randn(3, 4), torch.randn(4, 3)) @@ -880,7 +1217,7 @@ def forward(self, a: Tuple[torch.Tensor, torch.Tensor]): def test_tuple_primitive_input(self): class TupleModel(torch.nn.Module): - def forward(self, a: Tuple[int, torch.Tensor], b): + def forward(self, a: Tuple[int, Tensor], b): return a[0], a[1] + b x = (3, torch.randn(4, 3)) @@ -889,30 +1226,27 @@ def forward(self, a: Tuple[int, torch.Tensor], b): def test_nested_tuple_input(self): class NestedTupleModel(torch.nn.Module): - def forward(self, a, b: Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]): + def forward(self, a, b: Tuple[Tensor, Tuple[Tensor, Tensor]]): return a + b[0] + b[1][0] + b[1][1] x = torch.randn(4, 5) y = (torch.randn(4, 5), (torch.randn(1, 5), torch.randn(4, 1))) self.run_test(NestedTupleModel(), input=(x, y)) - @disableScriptTest() - def test_optional_inputs_with_no_optionals(self): - class NoOptionalModel(torch.nn.Module): + def test_empty_kwargs(self): + class IdentityModel(torch.nn.Module): def forward(self, input): return input - # Without empty optional arguments dictionary - x = torch.randn(2, 3) - self.run_test(NoOptionalModel(), (x,)) - # With empty optional arguments dictionary - y = torch.randn(2, 3) - self.run_test(NoOptionalModel(), (y, {})) + self.run_test(IdentityModel(), (torch.randn(2, 3), {})) - @disableScriptTest() # ScriptModule could not be exported without the Input Descriptor for optional inputs - def test_optional_inputs_with_mixed_optionals(self): - class MixedModel(torch.nn.Module): - def forward(self, x, y=None, z=None): + @skipScriptTest() # Needs https://github.com/pytorch/rfcs/pull/21 + @skipIfUnsupportedMinOpsetVersion(15) + def test_mixed_optional_default_none(self): + class Model(torch.nn.Module): + def forward( + self, x, y: Optional[Tensor] = None, z: Optional[Tensor] = None + ): if y is not None: return x + y if z is not None: @@ -922,45 +1256,49 @@ def forward(self, x, y=None, z=None): x = torch.randn(2, 3) y = torch.randn(2, 3) z = torch.randn(2, 3) - # Without optional arguments dictionary - self.run_test(MixedModel(), (x, y, None)) - self.run_test(MixedModel(), (x, None, z)) - # With optional arguments dictionary - self.run_test(MixedModel(), (x, {"y": y, "z": None})) - self.run_test(MixedModel(), (x, {"y": None, "z": z})) - self.run_test(MixedModel(), (x, {"z": z})) - self.run_test(MixedModel(), (x, {"y": y})) - - @disableScriptTest() # ScriptModule could not be exported without the Input Descriptor for optional inputs - def test_optional_inputs_with_all_optionals(self): - class AllOptionalModel(torch.nn.Module): - def forward(self, y=None, z=None): + model = Model() + # Without kwargs dict. 
+ self.run_test(model, (x, y, None)) + self.run_test(model, (x, None, z)) + # With kwargs dict. + self.run_test(model, (x, {"y": y, "z": None})) + self.run_test(model, (x, {"y": None, "z": z})) + self.run_test(model, (x, {"z": z})) + self.run_test(model, (x, {"y": y})) + + @skipScriptTest() # tracing eliminates None inputs so it works differently. See _script version below. + @skipIfUnsupportedMinOpsetVersion(15) + def test_mixed_optional_default_tensor(self): + class Model(torch.nn.Module): + def forward( + self, + x, + y: Optional[Tensor] = torch.ones(2, 3), + z: Optional[Tensor] = torch.zeros(2, 3), + ): if y is not None: - return y + return x + y if z is not None: - return z - - y = torch.randn(2, 3) - # Without optional arguments dictionary - self.run_test(AllOptionalModel(), (y, None)) - # With optional arguments dictionary - self.run_test(AllOptionalModel(), {"y": y, "z": None}) - - @disableScriptTest() - def test_input_names_with_optional_args(self): - class NoOptionalModel(torch.nn.Module): - def forward(self, input): - return input + return x + z + return x - # Without empty optional arguments dictionary x = torch.randn(2, 3) - self.run_test(NoOptionalModel(), (x,), input_names=["input_x"]) - # With empty optional arguments dictionary y = torch.randn(2, 3) - self.run_test(NoOptionalModel(), (y, {})) + z = torch.randn(2, 3) + model = Model() + + self.run_test(model, (x, y, None)) + self.run_test(model, (x, None, z)) - class MixedModel(torch.nn.Module): - def forward(self, x, y=None, z=None): + @skipIfUnsupportedMinOpsetVersion(15) + def test_mixed_optional_default_tensor_script(self): + class Model(torch.nn.Module): + def forward( + self, + x, + y: Optional[Tensor] = torch.ones(2, 3), + z: Optional[Tensor] = torch.zeros(2, 3), + ): if y is not None: return x + y if z is not None: @@ -970,54 +1308,128 @@ def forward(self, x, y=None, z=None): x = torch.randn(2, 3) y = torch.randn(2, 3) z = torch.randn(2, 3) - # Without optional arguments dictionary - self.run_test(MixedModel(), (x, y, None), input_names=["input_x", "input_y"]) - self.run_test(MixedModel(), (x, None, z), input_names=["input_x", "input_z"]) + model = torch.jit.script(Model()) + + self.run_test(model, (x, y, z), input_names=("x", "y", "z")) + self.run_test(model, (x, {"y": y, "z": z}), input_names=("x", "y", "z")) - # With optional arguments dictionary - self.run_test(MixedModel(), (x, {"y": y, "z": None}), input_names=["input_x", "input_y"]) - self.run_test(MixedModel(), (x, {"y": None, "z": z}), input_names=["input_x", "input_z"]) + # Requires input_names to be set so that we can feed the inputs properly into ORT. + # TODO: Export default values as ONNX initializers, then this should not raise. + # https://msdata.visualstudio.com/Vienna/_workitems/edit/969268 + # Default values are accessible via FunctionSchema. + with self.assertRaisesRegex( + ValueError, "Model requires 3 inputs. Input Feed contains 2" + ): + self.run_test(model, (x, {"y": y}), input_names=("x", "y")) - class AllOptionalModel(torch.nn.Module): - def forward(self, y=None, z=None): + for example_inputs in ( + (x, y, None), + (x, None, z), + (x, {"y": y, "z": None}), + (x, {"y": None, "z": z}), + ): + with self.assertRaisesRegex( + ValueError, "args contained 1 None's after flattening." 
+ ): + self.run_test(model, example_inputs, input_names=("x", "y", "z")) + + @skipScriptTest() # Needs https://github.com/pytorch/rfcs/pull/21 + @skipIfUnsupportedMinOpsetVersion(15) + def test_all_optional_default_none(self): + class Model(torch.nn.Module): + def forward(self, x: Optional[Tensor] = None, y: Optional[Tensor] = None): + if x is not None: + return x if y is not None: return y - if z is not None: - return z + else: + return torch.tensor(-1.0) + + x = torch.randn(2, 3) + model = Model() + self.run_test(model, (x, None)) + self.run_test( + model, + ({"x": x, "y": None},), + # y disappears in tracing. + input_names=("x",), + ) + + @skipScriptTest() # tracing eliminates None inputs so it works differently. See _script version below. + @skipIfUnsupportedMinOpsetVersion(15) + def test_all_optional_default_tensor(self): + class Model(torch.nn.Module): + def forward( + self, + x: Optional[Tensor] = torch.ones(2, 3), + y: Optional[Tensor] = torch.zeros(2, 3), + ): + if x is not None: + return x + elif y is not None: + return y + else: + return torch.tensor(-1.0) + x = torch.randn(2, 3) y = torch.randn(2, 3) - z = torch.randn(2, 3) - # Without optional arguments dictionary - self.run_test(AllOptionalModel(), (y, None), input_names=["input_y"]) - self.run_test(AllOptionalModel(), (None, z), input_names=["input_z"]) - # With optional arguments dictionary - self.run_test(AllOptionalModel(), {"y": y, "z": None}, input_names=["input_y"]) - self.run_test(AllOptionalModel(), {"y": None, "z": z}, input_names=["input_z"]) - - def test_input_as_output(self): + model = Model() + self.run_test(model, (x, None)) + self.run_test(model, (None, y)) + # tracing means y is never used so it's removed from the exported model inputs, + # and we fail when trying to run ORT. + with self.assertRaisesRegex(ValueError, "got too many positional inputs"): + self.run_test(model, (x, y)) + + @skipIfUnsupportedMinOpsetVersion(15) + def test_all_optional_default_tensor_script(self): class Model(torch.nn.Module): - def forward(self, x, y): - return x, y + def forward( + self, + x: Optional[Tensor] = torch.ones(2, 3), + y: Optional[Tensor] = torch.zeros(2, 3), + ): + if x is not None: + return x + elif y is not None: + return y + else: + return torch.tensor(-1.0) x = torch.randn(2, 3) - y = torch.randn(3, 4) - self.run_test(Model(), (x, y), input_names=["x", "y"], output_names=["x_out", "y_out"]) + y = torch.randn(2, 3) + model = torch.jit.script(Model()) + + # TODO: Export default values as ONNX initializers, then this should not raise. + # https://msdata.visualstudio.com/Vienna/_workitems/edit/969268 + # Default values are accessible via FunctionSchema. + with self.assertRaisesRegex( + ValueError, "Model requires 2 inputs. 
Input Feed contains 1" + ): + self.run_test(model, (x,)) + self.run_test(model, ({"y": y},)) + self.run_test(model, (x, y)) + self.run_test(model, ({"x": x, "y": y},), input_names=("x", "y")) - @disableScriptTest() - def test_none_as_input(self): + @skipScriptTest() # Needs https://github.com/pytorch/rfcs/pull/21 + @skipIfUnsupportedMinOpsetVersion(15) + def test_mixed_optional(self): class Model(torch.nn.Module): - def forward(self, x, y): + def forward(self, x, y: Optional[Tensor]): if y is not None: return x + y return x x = torch.randn(2, 3) - self.run_test(Model(), (x, None)) + model = Model() + self.run_test(model, (x, None)) + self.run_test(model, (x, x)) - @disableScriptTest() # ScriptModule could not be exported without the Input Descriptor for optional inputs - def test_none_as_tuple_input(self): + @skipScriptTest() # Needs https://github.com/pytorch/rfcs/pull/21 + @skipIfUnsupportedMinOpsetVersion(15) + def test_tuple_of_optional(self): class Model(torch.nn.Module): - def forward(self, x, y): + def forward(self, x, y: Tuple[Optional[Tensor], Optional[Tensor]]): if y[0] is not None: return x + y[0] if y[1] is not None: @@ -1025,28 +1437,67 @@ def forward(self, x, y): return x x = torch.randn(2, 3) - y = torch.randn(2, 3) - self.run_test(Model(), (x, (None, y))) + y1 = torch.randn(2, 3) + self.run_test(Model(), (x, (None, y1))) - @disableScriptTest() # ScriptModule could not be exported without the Input Descriptor for optional inputs - def test_none_as_named_input(self): + @skipScriptTest() # tracing eliminates None inputs so it works differently. See _script version below. + @skipIfUnsupportedMinOpsetVersion(15) + def test_tuple_of_optional_default_tensor(self): class Model(torch.nn.Module): - def forward(self, x, y=None, z=None): - if y is not None: - return x + y - if z is not None: - return x + z + def forward( + self, + x, + y: Tuple[Optional[Tensor], Optional[Tensor]] = ( + torch.zeros(2, 3), + torch.zeros(2, 3), + ), + ): + y0, y1 = y + if y0 is not None: + return x + y0 + if y1 is not None: + return x + y1 return x x = torch.randn(2, 3) - z = torch.randn(2, 3) - self.run_test(Model(), (x, None, z)) + y1 = torch.randn(2, 3) + self.run_test(Model(), (x, (None, y1))) - def test_primitive_input_integer(self): + @skipIfUnsupportedMinOpsetVersion(15) + def test_tuple_of_optional_default_tensor_script(self): class Model(torch.nn.Module): - def __init__(self): - super().__init__() + def forward( + self, + x, + y: Tuple[Optional[Tensor], Optional[Tensor]] = ( + torch.zeros(2, 3), + torch.zeros(2, 3), + ), + ): + y0, y1 = y + if y0 is not None: + return x + y0 + if y1 is not None: + return x + y1 + return x + + x = torch.randn(2, 3) + y0 = torch.randn(2, 3) + y1 = torch.randn(2, 3) + model = torch.jit.script(Model()) + with self.assertRaisesRegex( + ValueError, "args contained 1 None's after flattening." + ): + self.run_test(model, (x, (None, y1))) + self.run_test(model, (x, (y0, y1))) + # export succeeds, but running ORT through run_test would fail because the exported model + # has the inputs flattened into 3 inputs. 
+ torch.onnx.export( + model, (x, {"y": (y0, y1)}), io.BytesIO(), opset_version=self.opset_version + ) + def test_primitive_input_integer(self): + class Model(torch.nn.Module): def forward(self, x: int, y): return x + y @@ -1087,7 +1538,9 @@ def test_cste_script(self): class MyModel(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, x): - return torch.zeros(x.size(0)), torch.ones((x.size(1), x.size(0)), dtype=torch.int64) + return torch.zeros(x.size(0)), torch.ones( + (x.size(1), x.size(0)), dtype=torch.int64 + ) x = torch.randn(3, 4) self.run_test(MyModel(), x, input_names=["x"], dynamic_axes={"x": [0, 1]}) @@ -1096,15 +1549,20 @@ def forward(self, x): def test_scalar_tensor(self): class test(torch.nn.Module): def forward(self, input): - return torch.scalar_tensor(input.size(0)), \ - torch.scalar_tensor(input.size(1), dtype=torch.int64) + return torch.scalar_tensor(input.size(0)), torch.scalar_tensor( + input.size(1), dtype=torch.int64 + ) x = torch.randn(2, 3, 4) y = torch.randn(7, 8, 9) model = test() - self.run_test(model, x, test_with_inputs=[y], - input_names=["input_1"], - dynamic_axes={"input_1": [0, 1, 2]}) + self.run_test( + model, + x, + test_with_inputs=[y], + input_names=["input_1"], + dynamic_axes={"input_1": [0, 1, 2]}, + ) def test_tensor(self): class ScalarInputModel(torch.jit.ScriptModule): @@ -1113,7 +1571,9 @@ def forward(self, input): return torch.tensor(input.shape[1]) x = torch.randn(3, 4) - self.run_test(ScalarInputModel(), x, input_names=["x"], dynamic_axes={"x": [0, 1]}) + self.run_test( + ScalarInputModel(), x, input_names=["x"], dynamic_axes={"x": [0, 1]} + ) self.run_test(ScalarInputModel(), x, remained_onnx_input_idx=[]) class TensorInputModel(torch.jit.ScriptModule): @@ -1122,7 +1582,9 @@ def forward(self, input): return torch.tensor([input.shape[0], input.shape[1]]) x = torch.randn(3, 4) - self.run_test(TensorInputModel(), x, input_names=["x"], dynamic_axes={"x": [0, 1]}) + self.run_test( + TensorInputModel(), x, input_names=["x"], dynamic_axes={"x": [0, 1]} + ) self.run_test(TensorInputModel(), x, remained_onnx_input_idx=[]) class FloatInputModel(torch.jit.ScriptModule): @@ -1139,7 +1601,9 @@ def forward(self, input): return torch.tensor(input.shape[1], dtype=torch.long) x = torch.randn(3, 4) - self.run_test(InputWithDtypeModel(), x, input_names=["x"], dynamic_axes={"x": [0, 1]}) + self.run_test( + InputWithDtypeModel(), x, input_names=["x"], dynamic_axes={"x": [0, 1]} + ) self.run_test(InputWithDtypeModel(), x, remained_onnx_input_idx=[]) class MixedInputModel(torch.jit.ScriptModule): @@ -1261,7 +1725,6 @@ def forward(self, x): x = torch.arange(16).view(4, 4).float() self.run_test(ClampMaxModel(), x) - class ClampMinModel(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, x): @@ -1336,8 +1799,12 @@ class TraceModel(torch.nn.Module): def __init__(self): super(TraceModel, self).__init__() self.conv1 = torch.nn.Conv1d(16, 33, 3, stride=2) - self.conv2 = torch.nn.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1)) - self.conv3 = torch.nn.Conv3d(16, 33, (3, 5, 2), stride=(2, 1, 1), padding=(4, 2, 0)) + self.conv2 = torch.nn.Conv2d( + 16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1) + ) + self.conv3 = torch.nn.Conv3d( + 16, 33, (3, 5, 2), stride=(2, 1, 1), padding=(4, 2, 0) + ) def forward(self, input1, input2, input3): return self.conv1(input1), self.conv2(input2), self.conv3(input3) @@ -1352,23 +1819,29 @@ def test_conv_shape_inference(self): class Model(torch.nn.Module): def __init__(self): 
super(Model, self).__init__() - self.conv2 = torch.nn.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1)) + self.conv2 = torch.nn.Conv2d( + 16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1) + ) def forward(self, input): return self.conv2(input) + 2 x = torch.randn(20, 16, 50, 100) - self.run_test(Model(), x, atol=10e-5, - input_names=["x"], - dynamic_axes={"x": [0]}) + self.run_test( + Model(), x, atol=10e-5, input_names=["x"], dynamic_axes={"x": [0]} + ) def test_conv_transpose(self): class TraceModel(torch.nn.Module): def __init__(self): super(TraceModel, self).__init__() self.conv1 = torch.nn.ConvTranspose1d(16, 33, 3, stride=2) - self.conv2 = torch.nn.ConvTranspose2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1)) - self.conv3 = torch.nn.ConvTranspose3d(16, 33, (3, 5, 2), stride=(2, 1, 1), padding=(4, 2, 0)) + self.conv2 = torch.nn.ConvTranspose2d( + 16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1) + ) + self.conv3 = torch.nn.ConvTranspose3d( + 16, 33, (3, 5, 2), stride=(2, 1, 1), padding=(4, 2, 0) + ) def forward(self, input1, input2, input3): return self.conv1(input1), self.conv2(input2), self.conv3(input3) @@ -1381,7 +1854,6 @@ def forward(self, input1, input2, input3): # Conversion of Transpose depends on input shape to be known. # The following test only works when onnx shape inference is enabled. - @skipIfONNXShapeInference(False) def test_transpose_infer_shape(self): class TransposeModule(torch.jit.ScriptModule): def __init__(self): @@ -1395,9 +1867,13 @@ def forward(self, x): x = torch.randn(32, 3, 64, 64) y = torch.randn(16, 3, 8, 64) - self.run_test(TransposeModule(), x, input_names=["x"], - dynamic_axes={"x": [0, 2]}, - test_with_inputs=[y]) + self.run_test( + TransposeModule(), + x, + input_names=["x"], + dynamic_axes={"x": [0, 2]}, + test_with_inputs=[y], + ) def squeeze_model_tests(self, d, x1, x2): class Squeeze(torch.nn.Module): @@ -1413,9 +1889,13 @@ def forward(self, x): x2 = [] if x2 is None else [x2] if len(x2) > 0: - self.run_test(Squeeze(d), x1, - input_names=["input"], dynamic_axes={"input": {0: "0", 1: "1", 2: "2"}}, - test_with_inputs=x2) + self.run_test( + Squeeze(d), + x1, + input_names=["input"], + dynamic_axes={"input": {0: "0", 1: "1", 2: "2"}}, + test_with_inputs=x2, + ) else: self.run_test(Squeeze(d), x1) @@ -1471,6 +1951,16 @@ def forward(self, x): x = torch.randn(2, 1, 4) self.run_test(Squeeze(), x) + @skipIfUnsupportedMinOpsetVersion(13) + def test_squeeze_dynamic_dim(self): + class Squeeze(torch.nn.Module): + def forward(self, x, dim: int): + return torch.squeeze(x, dim) + + x = torch.randn(2, 1, 4) + dim = 1 + self.run_test(Squeeze(), (x, dim)) + def test_unsqueeze(self): class Unsqueeze(torch.nn.Module): def forward(self, x): @@ -1479,6 +1969,16 @@ def forward(self, x): x = torch.randn(2, 3, 4) self.run_test(Unsqueeze(), x) + @skipIfUnsupportedMinOpsetVersion(13) + def test_unsqueeze_dynamic_dim(self): + class Unsqueeze(torch.nn.Module): + def forward(self, x, dim: int): + return torch.unsqueeze(x, dim) + + x = torch.randn(2, 1, 4) + dim = -1 + self.run_test(Unsqueeze(), (x, dim)) + def test_maxpool_default_stride(self): class MaxPoolModel(torch.nn.Module): def forward(self, x): @@ -1493,9 +1993,9 @@ def test_maxpool_adaptive(self): model = torch.nn.AdaptiveMaxPool1d((5), return_indices=False) x = torch.randn(20, 16, 50, requires_grad=True) y = torch.randn(32, 16, 50, requires_grad=True) - self.run_test(model, x, input_names=["x"], - dynamic_axes={"x" : [0]}, - test_with_inputs=[y]) + 
self.run_test( + model, x, input_names=["x"], dynamic_axes={"x": [0]}, test_with_inputs=[y] + ) def test_maxpool_2d(self): model = torch.nn.MaxPool2d(5, padding=(1, 2)) @@ -1557,9 +2057,13 @@ def test_avgpool_3d_ceil(self): model = torch.nn.AvgPool3d(3, 2, ceil_mode=True) x = torch.randn(20, 16, 50, 44, 31) y = torch.randn(32, 8, 50, 44, 31) - self.run_test(model, x, input_names=["x"], - dynamic_axes={"x" : [0, 1]}, - test_with_inputs=[y]) + self.run_test( + model, + x, + input_names=["x"], + dynamic_axes={"x": [0, 1]}, + test_with_inputs=[y], + ) @skipIfUnsupportedMinOpsetVersion(9) def test_floating_point(self): @@ -1571,7 +2075,9 @@ def forward(self, x): return x.new_zeros(x.shape) x = torch.randn(2, 3, 4) - self.run_test(FloatingPoint(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]}) + self.run_test( + FloatingPoint(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]} + ) self.run_test(FloatingPoint(), x, remained_onnx_input_idx=[]) class FloatingPoint(torch.jit.ScriptModule): @@ -1589,7 +2095,6 @@ def forward(self, x): # Operator rank mismatch between outputs of two branches for opsets below 11. @skipIfUnsupportedMinOpsetVersion(11) - @skipIfONNXShapeInference(False) def test_floating_point_infer_dtype(self): class FloatingPoint(torch.jit.ScriptModule): @torch.jit.script_method @@ -1602,7 +2107,9 @@ def forward(self, x): return x x = torch.randn(2, 3, 4) - self.run_test(FloatingPoint(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]}) + self.run_test( + FloatingPoint(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]} + ) self.run_test(FloatingPoint(), x, remained_onnx_input_idx=[]) class FloatingPoint(torch.jit.ScriptModule): @@ -1621,9 +2128,11 @@ def forward(self, x): @skipIfUnsupportedMinOpsetVersion(12) def test_prim_min(self): @torch.jit.script - def list_append(boxes: List[torch.Tensor]): + def list_append(boxes: List[Tensor]): temp = [] - for i, b in enumerate(boxes): # enumerate is creating a prim::min op in torch graph + for i, b in enumerate( + boxes + ): # enumerate is creating a prim::min op in torch graph temp.append(torch.full_like(b[:, 1], i)) return temp[0] @@ -1700,7 +2209,6 @@ def forward(self, x): x = torch.randn(2, 3, 4) self.run_test(ArithmeticModule(), x, remained_onnx_input_idx=[]) - @skipIfUnsupportedMinOpsetVersion(9) # https://github.com/microsoft/onnxruntime/issues/9663 def test_arithmetic_prim_bool(self): class ArithmeticModule(torch.nn.Module): def forward(self, x, y: int, z: bool, t: float): @@ -1725,19 +2233,21 @@ def forward(self, x: int, y: int): y = 2 self.run_test(ArithmeticModule(), (x, y)) - @disableScriptTest() + # In tracing, None outputs are removed. In scripting they're kept but + # we don't know Optional.elem_type, so we can't construct a valid Optional. + # Tests for Optional outputs (control flow with None in one branch, + # not-None in another) are in test_pytorch_onnx_no_runtime.py. + @skipScriptTest() def test_tuple_with_none_outputs(self): class TupleModel(torch.nn.Module): def forward(self, x): - l = (x, None, (x, None)) - return (x, l) + return (x, (x, None, (x, None))) x = torch.randn(3, 4) self.run_test(TupleModel(), (x,)) # In scripting the first transpose node do not carry shape and dtype info. # The following test only works when onnx shape inference is enabled. 
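# The two comments above describe a general property of exporting scripted
# modules: intermediate values (here, the output of the first transpose) carry
# no static shape or dtype in the TorchScript graph, and ONNX shape inference
# reconstructs that information after export. A standalone sketch of the idea,
# using an illustrative module that is not part of this test suite:
import io

import onnx
import torch


class TransposeAdd(torch.nn.Module):
    def forward(self, x):
        # In the scripted graph the transpose output has no recorded shape/dtype.
        return x.transpose(0, 1) + 1.0


buffer = io.BytesIO()
torch.onnx.export(torch.jit.script(TransposeAdd()), torch.randn(2, 3), buffer)
model_proto = onnx.load_model_from_string(buffer.getvalue())
# ONNX shape inference fills in value_info for the intermediate Transpose output.
inferred = onnx.shape_inference.infer_shapes(model_proto)
print([value_info.name for value_info in inferred.graph.value_info])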
- @skipIfONNXShapeInference(False) def test_arithmetic_infer_dtype(self): class ArithmeticModule(torch.jit.ScriptModule): @torch.jit.script_method @@ -1755,12 +2265,20 @@ def forward(self, x): def test_floor_div(self): class FloorDivModule(torch.nn.Module): def forward(self, x, y): - return x // 3, x // 2., \ - x.to(dtype=torch.float64) // 3, x.to(dtype=torch.float64) // 2., \ - x.to(dtype=torch.int64) // 3, x.to(dtype=torch.int64) // 2., \ - x // (y + 1.).to(dtype=torch.int64), x // y, \ - x.to(dtype=torch.float64) // y.to(dtype=torch.int64), x.to(dtype=torch.float64) // y.to(dtype=torch.float64), \ - x.to(dtype=torch.int64) // y.to(dtype=torch.int64), x.to(dtype=torch.int64) // y + return ( + x // 3, + x // 2.0, + x.to(dtype=torch.float64) // 3, + x.to(dtype=torch.float64) // 2.0, + x.to(dtype=torch.int64) // 3, + x.to(dtype=torch.int64) // 2.0, + x // (y + 1.0).to(dtype=torch.int64), + x // y, + x.to(dtype=torch.float64) // y.to(dtype=torch.int64), + x.to(dtype=torch.float64) // y.to(dtype=torch.float64), + x.to(dtype=torch.int64) // y.to(dtype=torch.int64), + x.to(dtype=torch.int64) // y, + ) x = torch.arange(-2, 4).reshape(2, 3, 1) y = torch.arange(1, 2 * 3 * 4 + 1).reshape(2, 3, 4) @@ -1770,7 +2288,7 @@ def test_floor_div_script(self): class FloorDivModule(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, x, y): - return x // 3, x // 2., x // y + return x // 3, x // 2.0, x // y x = torch.arange(-2, 4).reshape(2, 3, 1) y = torch.randn(2, 3, 4) @@ -1783,7 +2301,9 @@ def forward(self, x): return x.new_zeros(x.size(2) // x.size(1)) x = torch.randn(2, 3, 4) - self.run_test(FloordivModule(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]}) + self.run_test( + FloordivModule(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]} + ) self.run_test(FloordivModule(), (x,), remained_onnx_input_idx=[]) def test_div(self): @@ -1820,7 +2340,6 @@ def forward(self, x, y): # In scripting x, y do not carry shape and dtype info. # The following test only works when onnx shape inference is enabled. 
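# As background for the division tests in this area: torch.div with the default
# rounding_mode performs true division, so integer inputs are promoted to the
# default floating dtype, while rounding_mode="floor" keeps integer inputs
# integral; the exporter has to reproduce that promotion once shape inference
# has recovered the missing dtype information. A small eager-mode illustration
# (the tensors below are arbitrary examples, not taken from the test suite):
import torch

a = torch.tensor([3, 7], dtype=torch.int64)
b = torch.tensor([2, 2], dtype=torch.int64)

true_quotient = torch.div(a, b)  # promoted to torch.float32: tensor([1.5000, 3.5000])
floor_quotient = torch.div(a, b, rounding_mode="floor")  # stays torch.int64: tensor([1, 3])

print(true_quotient.dtype, floor_quotient.dtype)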
- @skipIfONNXShapeInference(False) def test_div_promotion_script(self): class DivModule(torch.nn.Module): def forward(self, x, y): @@ -1857,18 +2376,24 @@ def forward(self, x, y): def test_div_rounding_mode(self): class TrueDivModule(torch.nn.Module): def forward(self, x, y): - return (x.div(y, rounding_mode=None), - torch.div(x, y, rounding_mode=None)) + return ( + x.div(y, rounding_mode=None), + torch.div(x, y, rounding_mode=None), + ) class TruncDivModule(torch.nn.Module): def forward(self, x, y): - return (x.div(y, rounding_mode="trunc"), - torch.div(x, y, rounding_mode="trunc")) + return ( + x.div(y, rounding_mode="trunc"), + torch.div(x, y, rounding_mode="trunc"), + ) class FloorDivModule(torch.nn.Module): def forward(self, x, y): - return (x.div(y, rounding_mode="floor"), - torch.div(x, y, rounding_mode="floor")) + return ( + x.div(y, rounding_mode="floor"), + torch.div(x, y, rounding_mode="floor"), + ) modules = [TrueDivModule(), TruncDivModule(), FloorDivModule()] @@ -1924,7 +2449,7 @@ def forward(self, x): def test_slice_with_input_index(self): class InputIndexSlice(torch.nn.Module): def forward(self, x, y): - x[:y.size(0), 0, :] = y + x[: y.size(0), 0, :] = y return x x = torch.zeros((56, 6, 256)) @@ -1932,29 +2457,32 @@ def forward(self, x, y): self.run_test(InputIndexSlice(), (x, y)) @skipIfUnsupportedMinOpsetVersion(10) - @disableScriptTest() # scripting tuple/list append + @skipScriptTest() # scripting tuple/list append def test_slice_dynamic(self): class DynamicSliceExportMod(torch.nn.Module): def forward(self, x): results = [] for i in range(4): - results.append(x[:x.size(0) - i, i:x.size(2), i:3]) + results.append(x[: x.size(0) - i, i : x.size(2), i:3]) return tuple(results) x = torch.rand(5, 5, 5) y = torch.randn(6, 7, 8) - self.run_test(DynamicSliceExportMod(), x, test_with_inputs=[y], - input_names=["input_1"], - output_names=["output_1"], - dynamic_axes={"input_1": [0, 1, 2], - "output_1": [0, 1, 2]}) + self.run_test( + DynamicSliceExportMod(), + x, + test_with_inputs=[y], + input_names=["input_1"], + output_names=["output_1"], + dynamic_axes={"input_1": [0, 1, 2], "output_1": [0, 1, 2]}, + ) @skipIfUnsupportedMinOpsetVersion(10) def test_slice_dynamic_script(self): class DynamicSliceModel(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, x): - return x[1:x.size(1)] + return x[1 : x.size(1)] x = torch.rand(1, 2) self.run_test(DynamicSliceModel(), x) @@ -1963,14 +2491,16 @@ def forward(self, x): def test_slice_dynamic_shape_script(self): class DynamicSliceModel(torch.nn.Module): def forward(self, x): - return x.new_zeros(x.shape[1:x.size(2)]) + return x.new_zeros(x.shape[1 : x.size(2)]) x = torch.rand(1, 2, 3, 4) - self.run_test(DynamicSliceModel(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2, 3]}) + self.run_test( + DynamicSliceModel(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2, 3]} + ) self.run_test(DynamicSliceModel(), x, remained_onnx_input_idx=[]) @skipIfUnsupportedMinOpsetVersion(10) - @disableScriptTest() # scripting tuple/list append + @skipScriptTest() # scripting tuple/list append def test_slice_dynamic_to_end(self): class DynamicSliceExportMod(torch.nn.Module): def forward(self, x): @@ -1980,9 +2510,11 @@ def forward(self, x): return tuple(results) x = torch.rand(5, 5, 5) - self.run_test(DynamicSliceExportMod(), x, - dynamic_axes={"input_1": [0, 1, 2], - "output_1": [0, 1, 2]}) + self.run_test( + DynamicSliceExportMod(), + x, + dynamic_axes={"input_1": [0, 1, 2], "output_1": [0, 1, 2]}, + ) def test_square(self): class 
Square(torch.nn.Module): @@ -1996,22 +2528,30 @@ def forward(self, x): def test_arange_dynamic(self): class ArangeModel(torch.nn.Module): def forward(self, input): - return torch.arange(input.shape[0]), \ - torch.arange(12), \ - torch.arange(start=input.shape[0], end=input.shape[0] + 5) + return ( + torch.arange(input.shape[0]), + torch.arange(12), + torch.arange(start=input.shape[0], end=input.shape[0] + 5), + ) x = torch.randn(5, 3, 2) y = torch.randn(8, 3, 2) - self.run_test(ArangeModel(), x, test_with_inputs=[y], - input_names=["input_1"], - output_names=["output_1", "output_2", "output_3"], - dynamic_axes={"input_1": [0], - "output_1": [0]}) - self.run_test(torch.jit.script(ArangeModel()), x, - test_with_inputs=[y], input_names=["input_1"], - output_names=["output_1", "output_2", "output_3"], - dynamic_axes={"input_1": [0], - "output_1": [0]}) + self.run_test( + ArangeModel(), + x, + test_with_inputs=[y], + input_names=["input_1"], + output_names=["output_1", "output_2", "output_3"], + dynamic_axes={"input_1": [0], "output_1": [0]}, + ) + self.run_test( + torch.jit.script(ArangeModel()), + x, + test_with_inputs=[y], + input_names=["input_1"], + output_names=["output_1", "output_2", "output_3"], + dynamic_axes={"input_1": [0], "output_1": [0]}, + ) @skipIfUnsupportedMinOpsetVersion(9) def test_dynamic_arange_out(self): @@ -2032,8 +2572,12 @@ def forward(self, start, end): x = torch.randn(2, 3, 4) y = torch.tensor(8) - self.run_test(ArangeStartOutModel(), (x, y), - input_names=["x", "y"], dynamic_axes={"x": [0, 1, 2]}) + self.run_test( + ArangeStartOutModel(), + (x, y), + input_names=["x", "y"], + dynamic_axes={"x": [0, 1, 2]}, + ) self.run_test(ArangeStartOutModel(), (x, y), remained_onnx_input_idx=[1]) @skipIfUnsupportedMinOpsetVersion(9) @@ -2047,6 +2591,17 @@ def forward(self, start, end, steps): z = torch.tensor(5, dtype=torch.int) self.run_test(LinspaceModel(), (x, y, z)) + @skipIfUnsupportedMinOpsetVersion(9) + def test_linspace_negative_start(self): + class LinspaceModel(torch.nn.Module): + def forward(self, start, end, steps): + return torch.linspace(start, end, steps) + + x = torch.tensor(-1, dtype=torch.float) + y = torch.tensor(1, dtype=torch.float) + z = torch.tensor(6, dtype=torch.int) + self.run_test(LinspaceModel(), (x, y, z)) + @skipIfUnsupportedMinOpsetVersion(9) def test_arange_with_floats_out(self): class ArangeModelEnd(torch.nn.Module): @@ -2064,8 +2619,12 @@ def forward(self, start, end): x = torch.randn(2, 3, 4) y = torch.tensor(8.5, dtype=torch.float) - self.run_test(ArangeModelStep(), (x, y), input_names=["x", "y"], - dynamic_axes={"x": [0, 1, 2]}) + self.run_test( + ArangeModelStep(), + (x, y), + input_names=["x", "y"], + dynamic_axes={"x": [0, 1, 2]}, + ) self.run_test(ArangeModelStep(), (x, y), remained_onnx_input_idx=[1]) @skipIfUnsupportedMinOpsetVersion(9) @@ -2083,8 +2642,12 @@ def forward(self, start, end): x = torch.randn(2, 3, 4) y = torch.tensor(8.5, dtype=torch.float) - self.run_test(ArangeModelStep(), (x, y), input_names=["x", "y"], - dynamic_axes={"x": [0, 1, 2]}) + self.run_test( + ArangeModelStep(), + (x, y), + input_names=["x", "y"], + dynamic_axes={"x": [0, 1, 2]}, + ) self.run_test(ArangeModelStep(), (x, y), remained_onnx_input_idx=[1]) class ArangeModelStepNeg(torch.nn.Module): @@ -2093,8 +2656,12 @@ def forward(self, start, end): x = torch.randn(2, 3, 4) y = torch.tensor(8.5, dtype=torch.float) - self.run_test(ArangeModelStepNeg(), (x, y), input_names=["x", "y"], - dynamic_axes={"x": [0, 1, 2]}) + self.run_test( + ArangeModelStepNeg(), + (x, 
y), + input_names=["x", "y"], + dynamic_axes={"x": [0, 1, 2]}, + ) self.run_test(ArangeModelStepNeg(), (x, y), remained_onnx_input_idx=[1]) class ArangeModelStart(torch.nn.Module): @@ -2103,8 +2670,12 @@ def forward(self, start, end): x = torch.randn(2, 3, 4) y = torch.tensor(8.5, dtype=torch.float) - self.run_test(ArangeModelStart(), (x, y), input_names=["x", "y"], - dynamic_axes={"x": [0, 1, 2]}) + self.run_test( + ArangeModelStart(), + (x, y), + input_names=["x", "y"], + dynamic_axes={"x": [0, 1, 2]}, + ) self.run_test(ArangeModelStart(), (x, y), remained_onnx_input_idx=[1]) @skipIfUnsupportedMinOpsetVersion(9) @@ -2122,8 +2693,12 @@ def forward(self, start, end): x = torch.randn(2, 3, 4) y = torch.tensor(8.5, dtype=torch.float) - self.run_test(ArangeModelStep(), (x, y), input_names=["x", "y"], - dynamic_axes={"x": [0, 1, 2]}) + self.run_test( + ArangeModelStep(), + (x, y), + input_names=["x", "y"], + dynamic_axes={"x": [0, 1, 2]}, + ) self.run_test(ArangeModelStep(), (x, y), remained_onnx_input_idx=[1]) @skipIfUnsupportedMinOpsetVersion(11) @@ -2145,16 +2720,19 @@ def forward(self, start, end): x = torch.randn(2, 3, 4) y = torch.tensor(8.5, dtype=torch.float) - self.run_test(ArangeStartOutModel(), (x, y), input_names=["x", "y"], - dynamic_axes={"x": [0, 1, 2]}) + self.run_test( + ArangeStartOutModel(), + (x, y), + input_names=["x", "y"], + dynamic_axes={"x": [0, 1, 2]}, + ) self.run_test(ArangeStartOutModel(), (x, y), remained_onnx_input_idx=[1]) @skipIfUnsupportedMinOpsetVersion(11) def test_arange_no_type(self): class ArangeModel(torch.nn.Module): def forward(self, end): - return torch.arange(end), \ - torch.arange(0, end) + return torch.arange(end), torch.arange(0, end) x = torch.tensor(6.2, dtype=torch.float) self.run_test(ArangeModel(), x) @@ -2163,14 +2741,18 @@ def forward(self, end): def test_size(self): class SizeModel(torch.nn.Module): def forward(self, input): - return torch.arange(input.size(0)), torch.arange(input.size(-1)), torch.ones(input.shape) + return ( + torch.arange(input.size(0)), + torch.arange(input.size(-1)), + torch.ones(input.shape), + ) x = torch.randn(5, 3, 2) self.run_test(SizeModel(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]}) self.run_test(SizeModel(), x, remained_onnx_input_idx=[]) @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() # x.stride() not scriptable + @skipScriptTest() # x.stride() not scriptable def test_as_strided(self): class Model(torch.nn.Module): def forward(self, x): @@ -2178,12 +2760,14 @@ def forward(self, x): chunk_size[1] = chunk_size[1] * 2 - 1 chunk_stride = list(x.stride()) chunk_stride[1] = chunk_stride[1] // 2 - return x.as_strided((3, 3, 3), (1, 4, 2), storage_offset=2), x.as_strided(chunk_size, chunk_stride) + return x.as_strided( + (3, 3, 3), (1, 4, 2), storage_offset=2 + ), x.as_strided(chunk_size, chunk_stride) x = torch.randn(5, 8, 7) self.run_test(Model(), x) - @disableScriptTest() # Ellipses followed by tensor indexing not scriptable + @skipScriptTest() # Ellipses followed by tensor indexing not scriptable def test_tensor_index_advanced_indexing_ellipsis(self): class MyModel(torch.nn.Module): def forward(self, input): @@ -2195,27 +2779,43 @@ def forward(self, input): def test_tensor_index_advanced_indexing(self): class MyModel(torch.nn.Module): def forward(self, input): - return input[:, torch.tensor([[0, 2], [1, 1]]), :, torch.tensor([2, 1]), torch.tensor([0, 3])] + return input[ + :, + torch.tensor([[0, 2], [1, 1]]), + :, + torch.tensor([2, 1]), + torch.tensor([0, 3]), + ] m1 = torch.randn(3, 4, 5, 
6, 7) self.run_test(MyModel(), (m1,)) class MyModel(torch.nn.Module): def forward(self, input): - return input[:, torch.tensor([0, 2]), None, 2:4, torch.tensor([[1, 3], [4, 0]])] + return input[ + :, torch.tensor([0, 2]), None, 2:4, torch.tensor([[1, 3], [4, 0]]) + ] self.run_test(MyModel(), (m1,)) class MyModel(torch.nn.Module): def forward(self, input): - return input[:, torch.tensor([0, 2]), torch.tensor([1]), 2:4, torch.tensor([[1], [4]])] + return input[ + :, + torch.tensor([0, 2]), + torch.tensor([1]), + 2:4, + torch.tensor([[1], [4]]), + ] self.run_test(MyModel(), (m1,)) def test_tensor_index_advanced_indexing_consecutive(self): class MyModel(torch.nn.Module): def forward(self, input): - return input[:, torch.tensor([0, 2]), torch.tensor([[1, 3], [4, 0]]), None] + return input[ + :, torch.tensor([0, 2]), torch.tensor([[1, 3], [4, 0]]), None + ] m1 = torch.randn(3, 4, 5, 6, 7) self.run_test(MyModel(), (m1,)) @@ -2256,7 +2856,7 @@ def forward(self, mask, indices): def test_index_put_accumulate(self): class IndexPutModel(torch.nn.Module): def forward(self, x, ind, update): - return x.index_put((ind, ), update, accumulate=True) + return x.index_put((ind,), update, accumulate=True) x = torch.randn(3, 4) ind = torch.tensor([2], dtype=torch.long) @@ -2359,7 +2959,7 @@ def forward(self, x, ind, update): self.run_test(IndexPutModel10(), (x, ind, update)) @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() # Ellipses followed by tensor indexing not scriptable + @skipScriptTest() # Ellipses followed by tensor indexing not scriptable def test_index_put_ellipsis(self): class IndexPutModel(torch.nn.Module): def forward(self, x, update): @@ -2382,8 +2982,12 @@ def forward(self, x, update): @skipIfUnsupportedMinOpsetVersion(11) def test_index_put_loop(self): @torch.jit.script - def ngram_attention_bias(sequence_length: int, ngram: int, device: torch.device, dtype: torch.dtype): - bias = torch.ones((ngram, sequence_length), device=device, dtype=dtype) * float("-inf") + def ngram_attention_bias( + sequence_length: int, ngram: int, device: torch.device, dtype: torch.dtype + ): + bias = torch.ones( + (ngram, sequence_length), device=device, dtype=dtype + ) * float("-inf") for stream_idx in range(ngram): for i in range(sequence_length): bias = bias * 2 @@ -2406,15 +3010,23 @@ def __init__(self): def forward(self, hidden_states): seq_length, batch_size = hidden_states.shape[:2] predict_causal_mask = ngram_attention_bias( - self.max_target_positions, self.ngram, hidden_states.device, hidden_states.dtype + self.max_target_positions, + self.ngram, + hidden_states.device, + hidden_states.dtype, ) predict_causal_mask = predict_causal_mask[:, :seq_length] return predict_causal_mask x = torch.randn(6, 2) y = torch.randn(4, 1) - self.run_test(ScriptModel(), x, input_names=["x"], - dynamic_axes={"x": {0: "seq_length", 1: "batch_size"}}, test_with_inputs=[y]) + self.run_test( + ScriptModel(), + x, + input_names=["x"], + dynamic_axes={"x": {0: "seq_length", 1: "batch_size"}}, + test_with_inputs=[y], + ) @skipIfUnsupportedMinOpsetVersion(11) def test_copy_(self): @@ -2479,7 +3091,7 @@ def forward(self, x, mask): self.run_test(CopyModel5(), (x, mask)) @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() # Model not scriptable (output with shape doesn't match the broadcast shape) + @skipScriptTest() # Model not scriptable (output with shape doesn't match the broadcast shape) def test_copy_tracing(self): class CopyModel(torch.nn.Module): def forward(self, x, data): @@ -2580,14 +3192,18 @@ def 
forward(self, x): def test_random_like_dtype(self): class RandNLike(torch.nn.Module): def forward(self, x): - return torch.mul(x.to(torch.double), torch.randn_like(x, dtype=torch.double).size(0)) + return torch.mul( + x.to(torch.double), torch.randn_like(x, dtype=torch.double).size(0) + ) x = torch.randn(2, 3, 4) self.run_test(RandNLike(), x) class RandLike(torch.nn.Module): def forward(self, x): - return torch.mul(x.to(torch.double), torch.rand_like(x, dtype=torch.double).size(0)) + return torch.mul( + x.to(torch.double), torch.rand_like(x, dtype=torch.double).size(0) + ) x = torch.randn(2, 3, 4) self.run_test(RandLike(), x) @@ -2603,8 +3219,7 @@ def forward(self, x): x = torch.empty(2, 3, 3, dtype=torch.double).uniform_(0, 1) self.run_test(Bernoulli(), x) - # Enable test when fix for allowzero is in ORT - @skipForAllOpsetVersions() + @unittest.skip("Bug in ORT, skip test until rel-1.11.") @skipIfUnsupportedMinOpsetVersion(14) def test_reshape_allowzero(self): class ReshapeModel(torch.nn.Module): @@ -2626,7 +3241,16 @@ def forward(self, x): def _interpolate(self, x, mode, use_size, is_upsample, align_corners=False): class MyModel(torch.nn.Module): - __constants__ = ["mode", "use_size", "is_upsample", "size", "scale", "size_array", "scale_array", "align_corners"] + __constants__ = [ + "mode", + "use_size", + "is_upsample", + "size", + "scale", + "size_array", + "scale_array", + "align_corners", + ] def __init__(self, mode, use_size, is_upsample, align_corners): super(MyModel, self).__init__() @@ -2649,19 +3273,39 @@ def __init__(self, mode, use_size, is_upsample, align_corners): def forward(self, x): if self.use_size: if self.align_corners: - return torch.nn.functional.interpolate(x, mode=self.mode, size=self.size, align_corners=True), \ - torch.nn.functional.interpolate(x, mode=self.mode, size=self.size_array, align_corners=True) - return torch.nn.functional.interpolate(x, mode=self.mode, size=self.size), \ - torch.nn.functional.interpolate(x, mode=self.mode, size=self.size_array) + return torch.nn.functional.interpolate( + x, mode=self.mode, size=self.size, align_corners=True + ), torch.nn.functional.interpolate( + x, mode=self.mode, size=self.size_array, align_corners=True + ) + return torch.nn.functional.interpolate( + x, mode=self.mode, size=self.size + ), torch.nn.functional.interpolate( + x, mode=self.mode, size=self.size_array + ) if self.align_corners: - return torch.nn.functional.interpolate(x, mode=self.mode, - scale_factor=self.scale, recompute_scale_factor=False), \ - torch.nn.functional.interpolate(x, mode=self.mode, - scale_factor=self.scale_array, recompute_scale_factor=False) - return torch.nn.functional.interpolate(x, mode=self.mode, - scale_factor=self.scale, recompute_scale_factor=False), \ - torch.nn.functional.interpolate(x, mode=self.mode, - scale_factor=self.scale_array, recompute_scale_factor=False) + return torch.nn.functional.interpolate( + x, + mode=self.mode, + scale_factor=self.scale, + recompute_scale_factor=False, + ), torch.nn.functional.interpolate( + x, + mode=self.mode, + scale_factor=self.scale_array, + recompute_scale_factor=False, + ) + return torch.nn.functional.interpolate( + x, + mode=self.mode, + scale_factor=self.scale, + recompute_scale_factor=False, + ), torch.nn.functional.interpolate( + x, + mode=self.mode, + scale_factor=self.scale_array, + recompute_scale_factor=False, + ) model = MyModel(mode, use_size, is_upsample, align_corners) self.run_test(model, x, atol=1e-6) @@ -2672,9 +3316,11 @@ def _interpolate_tests(self, is_upsample): modes = 
["nearest", "linear", "bicubic"] if self.opset_version < 11: modes = ["nearest"] - x = [torch.randn(1, 2, 6, requires_grad=True), - torch.randn(1, 2, 4, 6, requires_grad=True), - torch.randn(1, 2, 4, 4, 6, requires_grad=True)] + x = [ + torch.randn(1, 2, 6, requires_grad=True), + torch.randn(1, 2, 4, 6, requires_grad=True), + torch.randn(1, 2, 4, 4, 6, requires_grad=True), + ] for mode in modes: for xi in x: @@ -2711,7 +3357,7 @@ def test_interpolate_upsample(self): self._interpolate_tests(True) @skipIfUnsupportedMaxOpsetVersion(8) - @disableScriptTest() # Scripting supported for opsets > 8. See test_interpolate_upsample + @skipScriptTest() # Scripting supported for opsets > 8. See test_interpolate_upsample def test_interpolate_upsample_trace(self): self._interpolate_tests(True) @@ -2720,7 +3366,9 @@ def test_interpolate_function_substitution(self): class ScriptModel(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, x): - return torch.nn.functional.interpolate(x, mode="nearest", scale_factor=2.) + return torch.nn.functional.interpolate( + x, mode="nearest", scale_factor=2.0 + ) class ScriptModule(torch.jit.ScriptModule): def __init__(self): @@ -2736,7 +3384,7 @@ def forward(self, input): @torch.jit.script def script_method(x): - return torch.nn.functional.interpolate(x, mode="nearest", scale_factor=2.) + return torch.nn.functional.interpolate(x, mode="nearest", scale_factor=2.0) class TracingModule(torch.nn.Module): def forward(self, x): @@ -2754,16 +3402,25 @@ class MyModel(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, x, y): x = torch.add(x, x) - out1 = torch.nn.functional.interpolate(x, mode="bilinear", size=(16, 16), align_corners=False) - out2 = torch.nn.functional.interpolate(x, mode="nearest", size=(int(y.size(0)), int(y.size(1)))) + out1 = torch.nn.functional.interpolate( + x, mode="bilinear", size=(16, 16), align_corners=False + ) + out2 = torch.nn.functional.interpolate( + x, mode="nearest", size=(int(y.size(0)), int(y.size(1))) + ) return out1, out2 x = torch.randn(1, 2, 4, 4, requires_grad=True) y = torch.randn(16, 16, requires_grad=True) - self.run_test(MyModel(), (x, y), input_names=["x", "y"], dynamic_axes={"x": [0, 1, 2, 3], "y": [0, 1]}) + self.run_test( + MyModel(), + (x, y), + input_names=["x", "y"], + dynamic_axes={"x": [0, 1, 2, 3], "y": [0, 1]}, + ) self.run_test(MyModel(), (x, y), remained_onnx_input_idx=[0]) - @disableScriptTest() # scripting throws the ONNXRuntimeError + @skipScriptTest() # scripting raises OnnxRuntimeError def test_interpolate_adaptive_pooling_error(self): x = torch.randn(1, 2, 6, requires_grad=True) with self.assertRaises(RuntimeError) as cm: @@ -2772,7 +3429,6 @@ def test_interpolate_adaptive_pooling_error(self): with self.assertRaises(RuntimeError) as cm: self._interpolate(x, "area", False, True) - @skipIfUnsupportedMinOpsetVersion(9) # https://github.com/microsoft/onnxruntime/issues/9663 def test_groupnorm(self): model = torch.nn.GroupNorm(3, 6, 0.002) x = torch.randn(4, 6, 180, 180, 180) @@ -2786,7 +3442,6 @@ def test_groupnorm(self): x = torch.randn(4, 6, 180, 180) self.run_test(model, x) - @skipIfUnsupportedMinOpsetVersion(9) # https://github.com/microsoft/onnxruntime/issues/9663 def test_groupnorm_noaffine(self): model = torch.nn.GroupNorm(4, 8, 0.002, affine=False) x = torch.randn(3, 8, 224, 224) @@ -2819,7 +3474,9 @@ def forward(self, x): return x.new_zeros((a, b)) x = torch.randn(2, 3, 4, 5) - self.run_test(ListUnpackSlice(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2, 3]}) + self.run_test( 
+ ListUnpackSlice(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2, 3]} + ) self.run_test(ListUnpackSlice(), x, remained_onnx_input_idx=[]) def test_pow(self): @@ -2969,7 +3626,8 @@ def forward(self, x): y = torch.zeros(()) y += x return y - x = torch.tensor(42.) + + x = torch.tensor(42.0) self.run_test(Zeros(), x) class Ones(torch.nn.Module): @@ -2977,15 +3635,17 @@ def forward(self, x): y = torch.ones(()) y += x return y - x = torch.tensor(42.) + + x = torch.tensor(42.0) self.run_test(Ones(), x) class Full(torch.nn.Module): def forward(self, x): - y = torch.full((), 1.) + y = torch.full((), 1.0) y += x return y - x = torch.tensor(42.) + + x = torch.tensor(42.0) self.run_test(Full(), x) class Empty(torch.nn.Module): @@ -2993,7 +3653,8 @@ def forward(self, x): y = torch.empty(()).fill_(0) y += x return y - x = torch.tensor(42.) + + x = torch.tensor(42.0) self.run_test(Empty(), x) def test_std(self): @@ -3267,18 +3928,15 @@ def forward(self, input): self.run_test(model, x) def test_bitshift(self): - class BitshiftModel(torch.nn.Module): - def forward(self, input, input2): - return input >> 1, input << 3.1, \ - input2 >> torch.tensor([1, 2]), input2 << 4.2 - input = torch.arange(24, dtype=torch.float32).reshape(3, 4, 2) - input2 = torch.arange(24, dtype=torch.int64).reshape(3, 4, 2) - self.run_test(BitshiftModel(), (input, input2)) - - def test_bitshift_other_fp(self): class BitshiftModel(torch.nn.Module): def forward(self, input): - return input << 2.4 + return ( + input >> 1, + input << 3, + input >> torch.tensor([1, 2]), + input << 4, + ) + input = torch.arange(24, dtype=torch.int64).reshape(3, 4, 2) self.run_test(BitshiftModel(), input) @@ -3288,8 +3946,13 @@ def forward(self, input): def test_bitshift_uint8(self): class BitshiftModel(torch.nn.Module): def forward(self, input, input2): - return input >> 1, input << 3., \ - input2 >> torch.tensor([1, 2], dtype=torch.uint8), input2 << 4. 
+ return ( + input >> 1, + input << 3, + input2 >> torch.tensor([1, 2], dtype=torch.uint8), + input2 << 4, + ) + input = torch.arange(24, dtype=torch.uint8).reshape(3, 4, 2) input2 = torch.arange(24, dtype=torch.uint8).reshape(3, 4, 2) self.run_test(BitshiftModel(), (input, input2)) @@ -3353,6 +4016,7 @@ class IndexSelectScalerIndexModel(torch.nn.Module): def forward(self, x): index = 2 return torch.index_select(x, 1, torch.tensor(index)) + x = torch.randn(3, 4) self.run_test(IndexSelectScalerIndexModel(), x) @@ -3365,6 +4029,7 @@ def __init__(self, index_base): def forward(self, x, index_offset): index = self.index_base + index_offset return torch.index_select(x, 1, index) + x = torch.randn(3, 4) offset = 2 index_offset = torch.tensor(offset) @@ -3385,9 +4050,19 @@ class MyModule(torch.nn.Module): def forward(self, x): return torch.topk(x, 3) - x = torch.arange(1., 6., requires_grad=True) + x = torch.arange(1.0, 6.0, requires_grad=True) self.run_test(MyModule(), x) + @skipIfUnsupportedMinOpsetVersion(10) + def test_topk_int32_k(self): + class Model(torch.nn.Module): + def forward(self, x, k): + return torch.topk(x, k) + + x = torch.arange(1.0, 6.0) + k = torch.tensor(3, dtype=torch.int32) + self.run_test(Model(), (x, k)) + @skipIfUnsupportedMinOpsetVersion(11) def test_topk_smallest_unsorted(self): class MyModule(torch.nn.Module): @@ -3398,7 +4073,7 @@ def forward(self, x, k): topk_sorted = torch.topk(x, k, largest=False, sorted=True) return topk_sorted, torch.sort(topk_unsorted.values).values - x = torch.arange(1., 6., requires_grad=True) + x = torch.arange(1.0, 6.0, requires_grad=True) k = torch.tensor(3) self.run_test(MyModule(), (x, k)) @@ -3409,11 +4084,11 @@ class MyModuleDynamic(torch.jit.ScriptModule): def forward(self, x, k): return torch.topk(x, k) - x = torch.arange(1., 6., requires_grad=True) + x = torch.arange(1.0, 6.0, requires_grad=True) k = torch.tensor(3) self.run_test(MyModuleDynamic(), [x, k]) - @disableScriptTest() # Python builtin apply of FunctionMeta object is currently not supported in Torchscript. + @skipScriptTest() # Python builtin apply of FunctionMeta object is currently not supported in Torchscript. @skipIfUnsupportedMinOpsetVersion(11) # Clip op min is an input since opset 11. 
def test_auto_grad(self): class MyClip(torch.autograd.Function): @@ -3428,12 +4103,15 @@ def forward(ctx, input): ctx.save_for_backward(input) return input.clamp(min=0) - def symbolic_python_op(g: torch._C.Graph, n: torch._C.Node, *args, **kwargs): + def symbolic_python_op( + ctx: torch.onnx.SymbolicContext, g: torch._C.Graph, *args, **kwargs + ): + n = ctx.cur_node name = kwargs["name"] if name == "MyClip": - return g.op("Clip", args[0], args[1]) + return g.op("Clip", args[0], args[1], outputs=n.outputsSize()) elif name == "MyRelu": - return g.op("Relu", args[0]) + return g.op("Relu", args[0], outputs=n.outputsSize()) else: return _unimplemented("prim::PythonOp", "unknown node kind: " + name) @@ -3443,6 +4121,7 @@ def symbolic_python_op(g: torch._C.Graph, n: torch._C.Node, *args, **kwargs): class MyClipModule(torch.nn.Module): def forward(self, x, min): return MyClip.apply(x, min) + x = torch.randn(3, 3) min = torch.tensor([0.0]) self.run_test(MyClipModule(), (x, min)) @@ -3450,9 +4129,41 @@ def forward(self, x, min): class MyReluModule(torch.nn.Module): def forward(self, x): return MyRelu.apply(x) + x = torch.randn(3, 3) self.run_test(MyReluModule(), x) + def test_clip_int(self): + class MyClipInt(torch.nn.Module): + def forward(self, x): + return torch.clamp(x, 0, 1) + + self.run_test(MyClipInt(), torch.randn(3, 3).to(torch.int64)) + + def test_relu_int(self): + self.run_test(torch.nn.ReLU(), torch.randn(3, 3).to(torch.int32)) + + def test_pad_int(self): + class MyPadInt(torch.nn.Module): + def forward(self, x): + return torch.nn.functional.pad(x, (1, 1)) + + self.run_test(MyPadInt(), torch.randn(3, 3).to(torch.int32)) + + def test_min_int(self): + class MyMinInt(torch.nn.Module): + def forward(self, x): + return torch.min(x, x + 1) + + self.run_test(MyMinInt(), torch.randn(3, 3).to(torch.int32)) + + def test_max_int(self): + class MyMaxnInt(torch.nn.Module): + def forward(self, x): + return torch.max(x, x + 1) + + self.run_test(MyMaxnInt(), torch.randn(3, 3).to(torch.int32)) + @skipIfUnsupportedOpsetVersion([7]) def test_normalize(self): class Model(torch.nn.Module): @@ -3483,7 +4194,6 @@ def test_batchnorm1d_noaffine(self): x = torch.randn(10, 10, 128) self.run_test(model, x) - @skipIfUnsupportedMinOpsetVersion(9) # https://github.com/microsoft/onnxruntime/issues/9663 def test_batchnorm1d_norunningstats(self): x = torch.randn(10, 10) model = torch.nn.BatchNorm1d(10, track_running_stats=False) @@ -3502,7 +4212,6 @@ def test_batchnorm2d_noaffine(self): model = torch.nn.BatchNorm2d(3, affine=False) self.run_test(model, x) - @skipIfUnsupportedMinOpsetVersion(9) # https://github.com/microsoft/onnxruntime/issues/9663 def test_batchnorm2d_norunningstats(self): x = torch.randn(10, 3, 128, 128) model = torch.nn.BatchNorm2d(3, track_running_stats=False) @@ -3518,7 +4227,9 @@ def test_batchnorm3d_noaffine(self): model = torch.nn.BatchNorm3d(3, affine=False) self.run_test(model, x) - @skipIfUnsupportedMinOpsetVersion(9) # Because ConstantOfShape op is not supported for opset < 9 + @skipIfUnsupportedMinOpsetVersion( + 9 + ) # Because ConstantOfShape op is not supported for opset < 9 def test_instancenorm1d_runningstats(self): x = torch.randn(10, 5, 128) model = torch.nn.InstanceNorm1d(5, affine=True, track_running_stats=True) @@ -3527,7 +4238,6 @@ def test_instancenorm1d_runningstats(self): model = torch.nn.InstanceNorm1d(5, affine=False, track_running_stats=True) self.run_test(model, x) - @skipIfUnsupportedMinOpsetVersion(9) # https://github.com/microsoft/onnxruntime/issues/9663 def 
test_instancenorm1d_norunningstats(self): x = torch.randn(10, 5, 128) model = torch.nn.InstanceNorm1d(5, affine=True, track_running_stats=False) @@ -3536,7 +4246,9 @@ def test_instancenorm1d_norunningstats(self): model = torch.nn.InstanceNorm1d(5, affine=False, track_running_stats=False) self.run_test(model, x) - @skipIfUnsupportedMinOpsetVersion(9) # Because ConstantOfShape op is not supported for opset < 9 + @skipIfUnsupportedMinOpsetVersion( + 9 + ) # Because ConstantOfShape op is not supported for opset < 9 def test_instancenorm2d_runningstats(self): x = torch.randn(10, 3, 128, 128) model = torch.nn.InstanceNorm2d(3, affine=True, track_running_stats=True) @@ -3545,7 +4257,6 @@ def test_instancenorm2d_runningstats(self): model = torch.nn.InstanceNorm2d(3, affine=False, track_running_stats=True) self.run_test(model, x) - @skipIfUnsupportedMinOpsetVersion(9) # https://github.com/microsoft/onnxruntime/issues/9663 def test_instancenorm2d_norunningstats(self): x = torch.randn(10, 3, 128, 128) model = torch.nn.InstanceNorm2d(3, affine=True, track_running_stats=False) @@ -3554,7 +4265,9 @@ def test_instancenorm2d_norunningstats(self): model = torch.nn.InstanceNorm2d(3, affine=False, track_running_stats=False) self.run_test(model, x) - @skipIfUnsupportedMinOpsetVersion(9) # Because ConstantOfShape op is not supported for opset < 9 + @skipIfUnsupportedMinOpsetVersion( + 9 + ) # Because ConstantOfShape op is not supported for opset < 9 def test_instancenorm3d_runningstats(self): x = torch.randn(10, 3, 128, 128, 128) model = torch.nn.InstanceNorm3d(3, affine=True, track_running_stats=True) @@ -3563,7 +4276,6 @@ def test_instancenorm3d_runningstats(self): model = torch.nn.InstanceNorm3d(3, affine=False, track_running_stats=True) self.run_test(model, x) - @skipIfUnsupportedMinOpsetVersion(9) # https://github.com/microsoft/onnxruntime/issues/9663 def test_instancenorm3d_norunningstats(self): x = torch.randn(10, 3, 128, 128, 128) model = torch.nn.InstanceNorm3d(3, affine=True, track_running_stats=False) @@ -3579,7 +4291,9 @@ def forward(self, input, indices): values = 1.0 return input.scatter(1, indices, values) - input = torch.tensor([[0., 0., 0.], [0., 0., 0.], [0., 0., 0.]], dtype=torch.float64) + input = torch.tensor( + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], dtype=torch.float64 + ) indices = torch.tensor([[1, 0], [0, 1], [0, 1]], dtype=torch.int64) self.run_test(ScatterModel(), input=(input, indices)) @@ -3593,7 +4307,9 @@ def forward(self, input, indices): values = 1.0 return input.scatter(1, indices, values) - input = torch.tensor([[0., 0., 0.], [0., 0., 0.], [0., 0., 0.]], dtype=torch.float32) + input = torch.tensor( + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], dtype=torch.float32 + ) indices = torch.tensor([[1, 0], [0, 1], [0, 1]], dtype=torch.int64) self.run_test(ScatterModel(), input=(input, indices)) @@ -3603,7 +4319,7 @@ class ScatterModel(torch.nn.Module): def forward(self, input, indices, values): return input.scatter(1, indices, values) - input = torch.tensor([[0., 0., 0.], [0., 0., 0.], [0., 0., 0.]]) + input = torch.tensor([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]) indices = torch.tensor([[1, 0], [0, 1], [0, 1]], dtype=torch.int64) values = torch.tensor([[1.0, 1.1], [2.0, 2.1], [3.0, 3.1]]) self.run_test(ScatterModel(), input=(input, indices, values)) @@ -3636,7 +4352,7 @@ def forward(self, input, indices, values): self.run_test(ScatterModel(), input=(input, indices, values)) @torch.jit.script - def scatter_sum(src: torch.Tensor, index: torch.Tensor): 
+ def scatter_sum(src: Tensor, index: Tensor): size = src.size() out = torch.zeros(size, dtype=src.dtype) return out.scatter_add_(1, index, src) @@ -3649,6 +4365,18 @@ def forward(self, src, index): index = torch.tensor([[0, 1], [0, 1], [0, 1]], dtype=torch.int64) self.run_test(ScatterModel(), (src, index)) + @skipIfUnsupportedMinOpsetVersion(9) + def test_bucketize(self): + class BucketModel(torch.nn.Module): + def forward(self, input, boundaries): + return torch.bucketize(input, boundaries), torch.bucketize( + input, boundaries, right=True + ) + + input = torch.tensor([[2, 5, 10], [6, 8, 3]]) + boundaries = torch.tensor([1, 5, 7, 8, 10]) + self.run_test(BucketModel(), (input, boundaries)) + @skipIfUnsupportedMinOpsetVersion(9) def test_one_hot(self): class OneHot(torch.nn.Module): @@ -3677,11 +4405,11 @@ class GatherModel(torch.nn.Module): def forward(self, input, indices): return input.gather(1, indices) - input = torch.tensor([[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]]) + input = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]) indices = torch.tensor([[1, 0], [0, 1], [0, 1]], dtype=torch.int64) self.run_test(GatherModel(), input=(input, indices)) - @disableScriptTest() # Scripting error: Cannot instantiate nn module + @skipScriptTest() # Scripting error: Cannot instantiate nn module def test_gather_constant_fold(self): class GatherModule(torch.nn.Module): def __init__(self): @@ -3727,10 +4455,16 @@ def forward(self, x): return x x = torch.randn(1, 3, 224, 224) - self.run_test(GatherModule(), (x,), - dynamic_axes={"input": {0: "batch", 2: "height", 3: "width"}, - "output": {0: "batch", 1: "class", 2: "height", 3: "width"}}, - input_names=['input'], output_names=['output']) + self.run_test( + GatherModule(), + (x,), + dynamic_axes={ + "input": {0: "batch", 2: "height", 3: "width"}, + "output": {0: "batch", 1: "class", 2: "height", 3: "width"}, + }, + input_names=["input"], + output_names=["output"], + ) @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) @@ -3753,7 +4487,9 @@ class ExpandTensorSizeModel(torch.nn.Module): def forward(self, input, size): return input.expand(size) - input = torch.randn(3,) + input = torch.randn( + 3, + ) size = torch.tensor(-1) self.run_test(ExpandTensorSizeModel(), input=(input, size)) @@ -3761,27 +4497,33 @@ def forward(self, input, size): def test_dynamic_expand_as(self): class Model(torch.nn.Module): def forward(self, x): - x[:, x.size(0):] = 0 + x[:, x.size(0) :] = 0 return x x = torch.ones(2, 5) x2 = torch.randn(3, 4) - self.run_test(Model(), (x, ), - input_names=["x"], - dynamic_axes={"x": [0, 1]}, - test_with_inputs=[x2]) + self.run_test( + Model(), + (x,), + input_names=["x"], + dynamic_axes={"x": [0, 1]}, + test_with_inputs=[x2], + ) class Model(torch.nn.Module): def forward(self, x): - x[:, x.size(0):] = torch.tensor([1, 2, 3]) + x[:, x.size(0) :] = torch.tensor([1, 2, 3]) return x x = torch.ones(2, 5, 3) x2 = torch.randn(3, 4, 3) - self.run_test(Model(), (x, ), - input_names=["x"], - dynamic_axes={"x": [0, 1, 2]}, - test_with_inputs=[x2]) + self.run_test( + Model(), + (x,), + input_names=["x"], + dynamic_axes={"x": [0, 1, 2]}, + test_with_inputs=[x2], + ) def test_multinomial(self): class Multinomial(torch.nn.Module): @@ -3856,6 +4598,7 @@ def test_reduced_min_max(self): class ReducedMinMaxModule(torch.nn.Module): def forward(self, input): return torch.min(input, dim=-1)[0], torch.max(input, dim=0)[0] + x = torch.randint(10, (4, 4), dtype=torch.int32) self.run_test(ReducedMinMaxModule(), x) @@ -3893,7 
+4636,9 @@ def forward(self, x): self.run_test(model, input) def test_softmax_large_values(self): - input = torch.tensor([[-1e12, -1e12, -1e12], [1e12, 0.0, -5.0], [3.0, 4.0, 5.0]]) + input = torch.tensor( + [[-1e12, -1e12, -1e12], [1e12, 0.0, -5.0], [3.0, 4.0, 5.0]] + ) for i in range(-2, 1): model = torch.nn.Softmax(dim=i) self.run_test(model, input) @@ -3984,7 +4729,9 @@ def test_lstm(self): class LSTMModel(torch.nn.Module): def __init__(self): super().__init__() - self.rnn = torch.nn.LSTM(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 1, bidirectional=False) + self.rnn = torch.nn.LSTM( + RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 1, bidirectional=False + ) def forward(self, x, h0, c0): return self.rnn(x, (h0, c0)) @@ -3999,7 +4746,9 @@ def test_lstm_cell(self): class LSTMCellModel(torch.nn.Module): def __init__(self, bias): super().__init__() - self.lstm_cell = torch.nn.LSTMCell(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, bias=bias) + self.lstm_cell = torch.nn.LSTMCell( + RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, bias=bias + ) def forward(self, x, h0, c0): return self.lstm_cell(x, (h0, c0)) @@ -4015,7 +4764,9 @@ def test_lstm_default_init_state(self): class LSTMModel(torch.nn.Module): def __init__(self): super().__init__() - self.rnn = torch.nn.LSTM(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 1, bidirectional=False) + self.rnn = torch.nn.LSTM( + RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 1, bidirectional=False + ) def forward(self, x): return self.rnn(x) @@ -4028,7 +4779,9 @@ def test_lstm_fixed_batch_size(self): class LSTMModel(torch.nn.Module): def __init__(self): super(LSTMModel, self).__init__() - self.lstm = torch.nn.LSTM(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 1, bidirectional=False) + self.lstm = torch.nn.LSTM( + RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 1, bidirectional=False + ) self.RNN_HIDDEN_SIZE = RNN_HIDDEN_SIZE def forward(self, input): @@ -4040,14 +4793,18 @@ def forward(self, input): input = torch.randn(RNN_SEQUENCE_LENGTH, BATCH_SIZE, RNN_INPUT_SIZE) # verify with different input of same batch size input2 = torch.randn(RNN_SEQUENCE_LENGTH, BATCH_SIZE, RNN_INPUT_SIZE) - self.run_test(LSTMModel(), input, fixed_batch_size=True, test_with_inputs=[input2]) + self.run_test( + LSTMModel(), input, fixed_batch_size=True, test_with_inputs=[input2] + ) @skipIfUnsupportedMinOpsetVersion(9) def test_lstm_post_fix_init_state(self): class LSTMModel(torch.nn.Module): def __init__(self): super(LSTMModel, self).__init__() - self.lstm = torch.nn.LSTM(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 1, bidirectional=False) + self.lstm = torch.nn.LSTM( + RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 1, bidirectional=False + ) self.RNN_HIDDEN_SIZE = RNN_HIDDEN_SIZE def forward(self, input): @@ -4060,20 +4817,28 @@ def forward(self, input): input = torch.randn(RNN_SEQUENCE_LENGTH, 1, RNN_INPUT_SIZE) # verify with different input of different batch size input2 = torch.randn(RNN_SEQUENCE_LENGTH, BATCH_SIZE, RNN_INPUT_SIZE) - self.run_test(model, input, input_names=["input.1"], dynamic_axes={"input.1" : {0 : "seq", 1 : "batch"}}, - test_with_inputs=[input2]) + self.run_test( + model, + input, + input_names=["input.1"], + dynamic_axes={"input.1": {0: "seq", 1: "batch"}}, + test_with_inputs=[input2], + ) def test_lstm_constant_folding(self): class LstmNet(torch.nn.Module): def __init__(self, input_size, hidden_size, num_layers, bidirectional): super(LstmNet, self).__init__() - self.lstm = torch.nn.LSTM(input_size, hidden_size, num_layers, bidirectional=bidirectional) + self.lstm = torch.nn.LSTM( + input_size, hidden_size, num_layers, bidirectional=bidirectional + ) - def forward(self, input, 
initial_state: Tuple[torch.Tensor, torch.Tensor]): + def forward(self, input, initial_state: Tuple[Tensor, Tensor]): return self.lstm(input, initial_state) - def get_LstmNet_model_and_inputs(input_size, hidden_size, num_layers, batch_size, - seq_len, bidirectional): + def get_LstmNet_model_and_inputs( + input_size, hidden_size, num_layers, batch_size, seq_len, bidirectional + ): num_directions = 2 if bidirectional else 1 model = LstmNet(input_size, hidden_size, num_layers, bidirectional) input = torch.randn(seq_len, batch_size, input_size) @@ -4094,9 +4859,15 @@ def test_lstm_no_bias(self): class LstmNet(torch.nn.Module): def __init__(self, num_layers, bidirectional): super(LstmNet, self).__init__() - self.lstm = torch.nn.LSTM(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, num_layers, bias=False, bidirectional=bidirectional) + self.lstm = torch.nn.LSTM( + RNN_INPUT_SIZE, + RNN_HIDDEN_SIZE, + num_layers, + bias=False, + bidirectional=bidirectional, + ) - def forward(self, input, initial_state: Tuple[torch.Tensor, torch.Tensor]): + def forward(self, input, initial_state: Tuple[Tensor, Tensor]): return self.lstm(input, initial_state) def get_LstmNet_model_and_inputs(num_layers, bidirectional): @@ -4109,16 +4880,54 @@ def get_LstmNet_model_and_inputs(num_layers, bidirectional): num_layers = [1, 1, 2, 3] bidirectional = [True, False, True, False] - models_and_inputs = [get_LstmNet_model_and_inputs(n, b) for n, b in zip(num_layers, bidirectional)] + models_and_inputs = [ + get_LstmNet_model_and_inputs(n, b) + for n, b in zip(num_layers, bidirectional) + ] for model, input in models_and_inputs: self.run_test(model, input) - @disableScriptTest() + @skipIfUnsupportedMinOpsetVersion(9) + def test_lstm_sequence(self): + class LstmNet(torch.nn.Module): + def __init__(self): + super().__init__() + self.rnn1 = torch.nn.LSTM(8, 8, bidirectional=True, batch_first=True) + self.linear1 = torch.nn.Linear(8 * 2, 8) + self.rnn2 = torch.nn.LSTM(8, 8, bidirectional=True, batch_first=True) + self.linear2 = torch.nn.Linear(8 * 2, 8) + + def forward(self, input): + rnn_output1, _ = self.rnn1(input) + linear_output1 = self.linear1(rnn_output1) + rnn_output2, _ = self.rnn2(linear_output1) + linear_output2 = self.linear2(rnn_output2) + return linear_output2 + + input = torch.zeros((1, 100, 8), dtype=torch.float32) + self.run_test( + LstmNet(), + input, + input_names=["input"], + output_names=["output"], + dynamic_axes={ + "input": {0: "batch_size", 1: "w", 2: "h"}, + "output": {0: "batch_size", 1: "w", 2: "h"}, + }, + ) + + @skipScriptTest() def test_rnn_no_bias(self): def make_model(layers, packed_sequence): batch_first = True if packed_sequence == 2 else False - model = torch.nn.RNN(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, layers, bidirectional=False, - batch_first=batch_first, bias=False) + model = torch.nn.RNN( + RNN_INPUT_SIZE, + RNN_HIDDEN_SIZE, + layers, + bidirectional=False, + batch_first=batch_first, + bias=False, + ) if packed_sequence == 1: model = RnnModelWithPackedSequence(model, False) @@ -4147,7 +4956,9 @@ def make_input(batch_size, layers, packed_sequence): layers = [1, 3, 1, 3, 1, 3] packed_sequence = [0, 0, 1, 1, 2, 2] models = [make_model(l, p) for l, p in zip(layers, packed_sequence)] - inputs = [make_input(RNN_BATCH_SIZE, l, p) for l, p in zip(layers, packed_sequence)] + inputs = [ + make_input(RNN_BATCH_SIZE, l, p) for l, p in zip(layers, packed_sequence) + ] for model, input in zip(models, inputs): self.run_test(model, input, batch_size=RNN_BATCH_SIZE) @@ -4156,14 +4967,21 @@ def test_gru_no_bias(self): class 
GruNet(torch.nn.Module): def __init__(self, input_size, hidden_size, num_layers, bidirectional): super(GruNet, self).__init__() - self.mygru = torch.nn.GRU(input_size, hidden_size, num_layers, bidirectional=bidirectional, bias=False) + self.mygru = torch.nn.GRU( + input_size, + hidden_size, + num_layers, + bidirectional=bidirectional, + bias=False, + ) def forward(self, input, initial_state): out = self.mygru(input, initial_state) return out - def get_GruNet_model_and_inputs(input_size, hidden_size, num_layers, batch_size, - seq_len, bidirectional): + def get_GruNet_model_and_inputs( + input_size, hidden_size, num_layers, batch_size, seq_len, bidirectional + ): num_directions = 2 if bidirectional else 1 model = GruNet(input_size, hidden_size, num_layers, bidirectional) input = torch.randn(seq_len, batch_size, input_size) @@ -4176,8 +4994,12 @@ def get_GruNet_model_and_inputs(input_size, hidden_size, num_layers, batch_size, batch_size = [3, 4] seq_len = [5, 7] bidirectional = [True, False] - models_and_inputs = [get_GruNet_model_and_inputs(i, h, n, b, s, bi) - for i, h, n, b, s, bi in zip(input_size, hidden_size, num_layers, batch_size, seq_len, bidirectional)] + models_and_inputs = [ + get_GruNet_model_and_inputs(i, h, n, b, s, bi) + for i, h, n, b, s, bi in zip( + input_size, hidden_size, num_layers, batch_size, seq_len, bidirectional + ) + ] for model, input in models_and_inputs: self.run_test(model, input, do_constant_folding=True) @@ -4185,14 +5007,17 @@ def test_gru_constant_folding(self): class GruNet(torch.nn.Module): def __init__(self, input_size, hidden_size, num_layers, bidirectional): super(GruNet, self).__init__() - self.mygru = torch.nn.GRU(input_size, hidden_size, num_layers, bidirectional=bidirectional) + self.mygru = torch.nn.GRU( + input_size, hidden_size, num_layers, bidirectional=bidirectional + ) def forward(self, input, initial_state): out = self.mygru(input, initial_state) return out - def get_GruNet_model_and_inputs(input_size, hidden_size, num_layers, batch_size, - seq_len, bidirectional): + def get_GruNet_model_and_inputs( + input_size, hidden_size, num_layers, batch_size, seq_len, bidirectional + ): num_directions = 2 if bidirectional else 1 model = GruNet(input_size, hidden_size, num_layers, bidirectional) input = torch.randn(seq_len, batch_size, input_size) @@ -4218,6 +5043,28 @@ def forward(self, input, other): y = torch.randn(4, 1, requires_grad=True) self.run_test(model, (x, y)) + def test_amax_amin(self): + class Model(torch.nn.Module): + def forward(self, x): + return torch.amax(x, dim=0, keepdim=True), torch.amin( + x, dim=[0, 1], keepdim=False + ) + + model = Model() + x = torch.randn(4, 4) + self.run_test(model, x) + + def test_aminmax(self): + class Model(torch.nn.Module): + def forward(self, x): + return torch.aminmax(x, dim=1, keepdim=True), torch.aminmax( + x, keepdim=False + ) + + model = Model() + x = torch.randn(3, 4) + self.run_test(model, x) + @skipIfUnsupportedMinOpsetVersion(9) def test_arange_end(self): class ArangeScript(torch.jit.ScriptModule): @@ -4291,14 +5138,24 @@ def test_arange_start_end_step(self): class ArangeScript(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, a): - return torch.arange(2, a.size(0) * a.size(1) + 2, a.size(1), dtype=torch.float).view(-1, 1) + a + return ( + torch.arange( + 2, a.size(0) * a.size(1) + 2, a.size(1), dtype=torch.float + ).view(-1, 1) + + a + ) x = torch.randn(3, 4, requires_grad=True) self.run_test(ArangeScript(), x) class ArangeModel(torch.nn.Module): def forward(self, a): - 
return torch.arange(2, a.size(0) * a.size(1) + 2, a.size(1), dtype=torch.float).view(-1, 1) + a + return ( + torch.arange( + 2, a.size(0) * a.size(1) + 2, a.size(1), dtype=torch.float + ).view(-1, 1) + + a + ) self.run_test(ArangeModel(), x) @@ -4307,14 +5164,20 @@ def test_arange_start_end_step_notype(self): class ArangeScript(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, a): - return torch.arange(2.7, a.size(0) * a.size(1) + 2, a.size(1)).view(-1, 1) + a + return ( + torch.arange(2.7, a.size(0) * a.size(1) + 2, a.size(1)).view(-1, 1) + + a + ) x = torch.randn(3, 4, requires_grad=True) self.run_test(ArangeScript(), x) class ArangeModel(torch.nn.Module): def forward(self, a): - return torch.arange(2.7, a.size(0) * a.size(1) + 2, a.size(1)).view(-1, 1) + a + return ( + torch.arange(2.7, a.size(0) * a.size(1) + 2, a.size(1)).view(-1, 1) + + a + ) self.run_test(ArangeModel(), x) @@ -4424,12 +5287,14 @@ def test_eq(self): class EqualModel(torch.nn.Module): def forward(self, input, other): return input == other + self._test_compare_ops(EqualModel(), 2) def test_gt(self): class GreaterModel(torch.nn.Module): def forward(self, input, other): return input > other + self._test_compare_ops(GreaterModel(), 2) @skipIfUnsupportedMinOpsetVersion(9) @@ -4437,37 +5302,41 @@ def test_ge(self): class GreaterOrEqualModel(torch.nn.Module): def forward(self, input, other): return input >= other + self._test_compare_ops(GreaterOrEqualModel(), 2) def test_gt_scalar(self): class GreaterModel(torch.nn.Module): def forward(self, input): return input > 1 + self._test_compare_ops(GreaterModel(), 1) def test_gt_primitive(self): class GreaterModel(torch.nn.Module): def __init__(self): super().__init__() - self.y : int = 2 + self.y: int = 2 def forward(self, x: int): return self.y > x x = 3 - self.run_test(GreaterModel(), (x, )) + self.run_test(GreaterModel(), (x,)) @skipIfUnsupportedMinOpsetVersion(9) def test_ge_scalar(self): class GreaterOrEqualModel(torch.nn.Module): def forward(self, input): return input >= 1 + self._test_compare_ops(GreaterOrEqualModel(), 1) def test_lt(self): class LessModel(torch.nn.Module): def forward(self, input, other): return input > other + self._test_compare_ops(LessModel(), 2) @skipIfUnsupportedMinOpsetVersion(9) @@ -4475,12 +5344,14 @@ def test_le(self): class LessOrEqualModel(torch.nn.Module): def forward(self, input, other): return input <= other + self._test_compare_ops(LessOrEqualModel(), 2) def test_lt_scalar(self): class LessModel(torch.nn.Module): def forward(self, input): return input < 1 + self._test_compare_ops(LessModel(), 1) @skipIfUnsupportedMinOpsetVersion(9) @@ -4488,6 +5359,7 @@ def test_le_scalar(self): class LessOrEqualModel(torch.nn.Module): def forward(self, input): return input <= 1 + self._test_compare_ops(LessOrEqualModel(), 1) def test_matmul(self): @@ -4519,10 +5391,12 @@ def forward(self, input, other): def _argmin_argmax_model(self, input): class ArgminArgmaxModel(torch.nn.Module): def forward(self, input): - return torch.argmin(input), \ - torch.argmax(input), \ - torch.argmin(input, keepdim=True), \ - torch.argmax(input, keepdim=True) + return ( + torch.argmin(input), + torch.argmax(input), + torch.argmin(input, keepdim=True), + torch.argmax(input, keepdim=True), + ) self.run_test(ArgminArgmaxModel(), input) @@ -4535,8 +5409,7 @@ def test_argmin_argmax(self): # same value appears multiple times in the tensor @skipIfUnsupportedMinOpsetVersion(12) def test_argmin_argmax_select_last_index(self): - input = torch.tensor([[1., 2., 3.], - [1., 
1., 2.]]) + input = torch.tensor([[1.0, 2.0, 3.0], [1.0, 1.0, 2.0]]) self._argmin_argmax_model(input) input = torch.ones(7, 3, 5) @@ -4593,6 +5466,15 @@ def forward(self, x): x = torch.tensor([[1, 2], [3, 4]]) self.run_test(RepeatsDimsModel2(), (x,)) + @skipIfUnsupportedMinOpsetVersion(9) + def test_repeat_interleave_noop(self): + class Model(torch.nn.Module): + def forward(self, x): + return x.repeat_interleave(1, dim=1) + + x = torch.randn(4, 1, 8) + self.run_test(Model(), (x,)) + @skipIfUnsupportedMinOpsetVersion(13) def test_dynamic_repeat_interleave(self): class SingleDynamicModel(torch.nn.Module): @@ -4602,8 +5484,13 @@ def forward(self, x): x = torch.tensor([[1, 2, 4], [3, 4, 7]]) another_x = torch.tensor([[7, 8], [5, 6]]) - self.run_test(SingleDynamicModel(), x, test_with_inputs=[another_x], - input_names=["input_1"], dynamic_axes={"input_1" : {1 : "w"}}) + self.run_test( + SingleDynamicModel(), + x, + test_with_inputs=[another_x], + input_names=["input_1"], + dynamic_axes={"input_1": {1: "w"}}, + ) class NegDynamicModel(torch.nn.Module): def forward(self, x): @@ -4612,8 +5499,13 @@ def forward(self, x): x = torch.tensor([[1, 2, 4], [3, 4, 7]]) another_x = torch.tensor([[7, 8], [5, 6]]) - self.run_test(NegDynamicModel(), x, test_with_inputs=[another_x], - input_names=["input_1"], dynamic_axes={"input_1" : {1 : "w"}}) + self.run_test( + NegDynamicModel(), + x, + test_with_inputs=[another_x], + input_names=["input_1"], + dynamic_axes={"input_1": {1: "w"}}, + ) class SingleDynamicModelFloat(torch.nn.Module): def forward(self, x): @@ -4622,8 +5514,13 @@ def forward(self, x): x = torch.tensor([[1.1, 2.1], [3.1, 4.1]]) another_x = torch.tensor([[7.1, 8.1], [5.1, 6.1]]) - self.run_test(SingleDynamicModelFloat(), x, test_with_inputs=[another_x], - input_names=["input_1"], dynamic_axes={"input_1" : {0 : "h"}}) + self.run_test( + SingleDynamicModelFloat(), + x, + test_with_inputs=[another_x], + input_names=["input_1"], + dynamic_axes={"input_1": {0: "h"}}, + ) class DynamicRepeatsModel(torch.nn.Module): def forward(self, x, repeats): @@ -4633,9 +5530,13 @@ def forward(self, x, repeats): another_x = torch.tensor([[7, 8], [5, 6]]) repeats = torch.tensor([2]) another_repeats = torch.tensor([4]) - self.run_test(DynamicRepeatsModel(), (x, repeats), test_with_inputs=[(another_x, another_repeats)], - input_names=["input_1", "repeats_1"], - dynamic_axes={"input_1" : {1 : "w"}, "repeats_1" : {0 : "r"}}) + self.run_test( + DynamicRepeatsModel(), + (x, repeats), + test_with_inputs=[(another_x, another_repeats)], + input_names=["input_1", "repeats_1"], + dynamic_axes={"input_1": {1: "w"}, "repeats_1": {0: "r"}}, + ) class DynamicRepeatsModel2(torch.nn.Module): def forward(self, x, repeats): @@ -4644,9 +5545,13 @@ def forward(self, x, repeats): x = torch.tensor([[1, 2, 4], [3, 4, 7]]) repeats = torch.tensor([2]) another_repeats = torch.tensor([4]) - self.run_test(DynamicRepeatsModel2(), (x, repeats), test_with_inputs=[(x, another_repeats)], - input_names=["input_1", "repeats_1"], - dynamic_axes={"repeats_1" : {0 : "r"}}) + self.run_test( + DynamicRepeatsModel2(), + (x, repeats), + test_with_inputs=[(x, another_repeats)], + input_names=["input_1", "repeats_1"], + dynamic_axes={"repeats_1": {0: "r"}}, + ) @skipIfUnsupportedMinOpsetVersion(13) def test_multiple_dynamic_repeat_interleave(self): @@ -4657,9 +5562,13 @@ def forward(self, x, repeats): x = torch.tensor([[1, 2, 4], [3, 4, 7]]) repeats = torch.tensor([2, 3, 4]) another_repeats = torch.tensor([4, 3, 2]) - self.run_test(DynamicRepeatsModel(), (x, 
repeats), test_with_inputs=[(x, another_repeats)], - input_names=["input_1", "repeats_1"], - dynamic_axes={"repeats_1" : {0 : "r"}}) + self.run_test( + DynamicRepeatsModel(), + (x, repeats), + test_with_inputs=[(x, another_repeats)], + input_names=["input_1", "repeats_1"], + dynamic_axes={"repeats_1": {0: "r"}}, + ) class DynamicRepeatsModel2(torch.nn.Module): def forward(self, x, repeats): @@ -4668,9 +5577,13 @@ def forward(self, x, repeats): x = torch.tensor([[1, 2, 4], [3, 4, 7]]) repeats = torch.tensor([2, 3]) another_repeats = torch.tensor([4, 3]) - self.run_test(DynamicRepeatsModel2(), (x, repeats), test_with_inputs=[(x, another_repeats)], - input_names=["input_1", "repeats_1"], - dynamic_axes={"repeats_1" : {0 : "r"}}) + self.run_test( + DynamicRepeatsModel2(), + (x, repeats), + test_with_inputs=[(x, another_repeats)], + input_names=["input_1", "repeats_1"], + dynamic_axes={"repeats_1": {0: "r"}}, + ) def test_view(self): class ViewModel(torch.nn.Module): @@ -4687,8 +5600,12 @@ def forward(self, input, other): x = torch.randn(2, 3, 4) shape = torch.randn(6, 4) - self.run_test(ViewModel(), (x, shape), - input_names=["x", "shape"], dynamic_axes={"x": [0, 1, 2], "shape": [0, 1]}) + self.run_test( + ViewModel(), + (x, shape), + input_names=["x", "shape"], + dynamic_axes={"x": [0, 1, 2], "shape": [0, 1]}, + ) self.run_test(ViewModel(), (x, shape), remained_onnx_input_idx=[0]) def test_view_dynamic_zero_dim(self): @@ -4699,8 +5616,17 @@ def forward(self, input): x = torch.ones(2) another_x = torch.empty((0,)) - self.run_test(ViewModel(), x, test_with_inputs=[another_x], - input_names=["input_1"], dynamic_axes={"input_1": [0, ]}) + self.run_test( + ViewModel(), + x, + test_with_inputs=[another_x], + input_names=["input_1"], + dynamic_axes={ + "input_1": [ + 0, + ] + }, + ) def test_view_as(self): class ViewModel(torch.nn.Module): @@ -4741,7 +5667,7 @@ def forward(self, input, weight, bias): z = torch.randn(1) self.run_test(LinearModel(), (x, y, z)) - @disableScriptTest() + @skipScriptTest() def test_weight_norm(self): # addmm for 3-d inputs converts to onnx::MatMul model = torch.nn.utils.weight_norm(torch.nn.Linear(5, 10), dim=1) @@ -4765,7 +5691,7 @@ def test_weight_norm(self): x = torch.randn(3, 3, 5, requires_grad=True) self.run_test(model, x) - @disableScriptTest() + @skipScriptTest() def test_weight_norm_nodim(self): # addmm for 3-d inputs converts to onnx::MatMul model = torch.nn.utils.weight_norm(torch.nn.Linear(5, 10), dim=None) @@ -4785,6 +5711,9 @@ def forward(self, input): x = torch.randint(10, (1, 2, 3, 4)) self.run_test(FlattenModel(), x) + x = torch.randn(4) + self.run_test(FlattenModel(), x) + def test_flatten2d(self): class FlattenModel(torch.nn.Module): def forward(self, input): @@ -4796,7 +5725,11 @@ def forward(self, input): def test_flatten2d_neg(self): class FlattenModel(torch.nn.Module): def forward(self, x): - return torch.flatten(x, 1, -1), torch.flatten(x, 0, -2), torch.flatten(x, 1, -2) + return ( + torch.flatten(x, 1, -1), + torch.flatten(x, 0, -2), + torch.flatten(x, 1, -2), + ) x = torch.randint(10, (1, 2, 3, 4)) self.run_test(FlattenModel(), x) @@ -4811,11 +5744,14 @@ def forward(self, x): x = torch.randn(batch_size, 5, 4, 5) y = torch.randn(5, 5, 4, 5) model = MyModule() - self.run_test(model, x, test_with_inputs=[y], - input_names=["input"], - output_names=["output"], - dynamic_axes={"input" : {0 : "batch_size"}, - "output" : {0 : "batch_size"}}) + self.run_test( + model, + x, + test_with_inputs=[y], + input_names=["input"], + output_names=["output"], + 
dynamic_axes={"input": {0: "batch_size"}, "output": {0: "batch_size"}}, + ) @skipIfUnsupportedMinOpsetVersion(11) def test_getitem(self): @@ -4845,7 +5781,7 @@ def forward(self, x, y, i: int): i = 3 self.run_test(torch.jit.script(M()), (x, y, i)) - @disableScriptTest() # torch.nonzero(x, as_tuple=True) is not scriptable. + @skipScriptTest() # torch.nonzero(x, as_tuple=True) is not scriptable. @skipIfUnsupportedMinOpsetVersion(9) def test_nonzero(self): class NonzeroModel(torch.nn.Module): @@ -4888,8 +5824,13 @@ def forward(self, input): return len(input.unbind()) + input x = torch.randn(4, 5) - self.run_test(LenModel(), x, input_names=["input"], dynamic_axes={"input": {0: "seq"}}, - test_with_inputs=(torch.randn(5, 5),)) + self.run_test( + LenModel(), + x, + input_names=["input"], + dynamic_axes={"input": {0: "seq"}}, + test_with_inputs=(torch.randn(5, 5),), + ) @skipIfUnsupportedMinOpsetVersion(9) def test_len_list(self): @@ -4919,7 +5860,7 @@ def forward(self, input): x = torch.randn(3, 4, 5) self.run_test(UnbindModel2(), x) - @disableScriptTest() # scripting tests run for opsets > 11. See: test_split_script + @skipScriptTest() # scripting tests run for opsets > 11. See: test_split_script def test_split(self): class SplitModel(torch.nn.Module): def forward(self, input): @@ -4966,12 +5907,12 @@ def forward(self, input): self.run_test(SplitModel3(), x) @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() + @skipScriptTest() def test_split_size_as_list(self): class SplitModel(torch.nn.Module): def forward(self, input, split_sizes: List[int]): out = [] - split_list: List[torch.Tensor] = input.split(split_sizes) + split_list: List[Tensor] = input.split(split_sizes) for ob in split_list: out.append(ob) @@ -4992,8 +5933,12 @@ def forward(self, x, y, t): x = torch.randn(2, 3) y = torch.randn(2, 4) t = torch.randn(2, 7) - self.run_test(SplitModule(), (x, y, t), input_names=["x", "y", "t"], - dynamic_axes={"x": [0, 1], "y": [0, 1], "t": [0, 1]}) + self.run_test( + SplitModule(), + (x, y, t), + input_names=["x", "y", "t"], + dynamic_axes={"x": [0, 1], "y": [0, 1], "t": [0, 1]}, + ) self.run_test(SplitModule(), (x, y, t), remained_onnx_input_idx=[2]) @skipIfUnsupportedMinOpsetVersion(11) @@ -5022,8 +5967,12 @@ def forward(self, x): x = torch.randn(4, 384, 2) input_names = ["logits"] - self.run_test(Split(), x, input_names=input_names, - dynamic_axes={input_names[0]: {0: 'batch'}}) + self.run_test( + Split(), + x, + input_names=input_names, + dynamic_axes={input_names[0]: {0: "batch"}}, + ) @skipIfUnsupportedMinOpsetVersion(11) def test_chunk(self): @@ -5043,13 +5992,21 @@ def forward(self, x): for dim_size_ in range(13, 16): y = torch.randn(1, dim_size_) - self.run_test(model, x, test_with_inputs=[y], - input_names=["x"], - dynamic_axes={"x": {0: "batch_size", 1: "dims"}}) - - self.run_test(model_neg_dim, x, test_with_inputs=[y], - input_names=["x"], - dynamic_axes={"x": {0: "batch_size", 1: "dims"}}) + self.run_test( + model, + x, + test_with_inputs=[y], + input_names=["x"], + dynamic_axes={"x": {0: "batch_size", 1: "dims"}}, + ) + + self.run_test( + model_neg_dim, + x, + test_with_inputs=[y], + input_names=["x"], + dynamic_axes={"x": {0: "batch_size", 1: "dims"}}, + ) @skipIfUnsupportedMinOpsetVersion(11) def test_dynamic_chunk(self): @@ -5069,13 +6026,21 @@ def forward(self, x): for dim_size_ in range(13, 16): y = torch.randn(3, dim_size_) - self.run_test(model, x, test_with_inputs=[y], - input_names=["x"], - dynamic_axes={"x": {0: "batch_size", 1: "dims"}}) - - 
self.run_test(model_neg_dim, x, test_with_inputs=[y], - input_names=["x"], - dynamic_axes={"x": {0: "batch_size", 1: "dims"}}) + self.run_test( + model, + x, + test_with_inputs=[y], + input_names=["x"], + dynamic_axes={"x": {0: "batch_size", 1: "dims"}}, + ) + + self.run_test( + model_neg_dim, + x, + test_with_inputs=[y], + input_names=["x"], + dynamic_axes={"x": {0: "batch_size", 1: "dims"}}, + ) def test_concat(self): class ConcatModel(torch.nn.Module): @@ -5168,7 +6133,6 @@ def forward(self, x): inputs = torch.randn(16) self.run_test(model, inputs) - @skipIfONNXShapeInference(False) @skipIfUnsupportedMinOpsetVersion(11) def test_loop_transpose(self): class LoopModel(torch.nn.Module): @@ -5262,7 +6226,7 @@ def forward(self, x): model = torch.jit.script(ListModel()) x = torch.randn(4, 4, 3, 4) - self.run_test(model, (x, )) + self.run_test(model, (x,)) @skipIfUnsupportedMinOpsetVersion(13) def test_list_append_nested_mixed_dtype(self): @@ -5383,7 +6347,9 @@ def forward(self, x): return torch.zeros(x.size()) + torch.ones(x.size()) x = torch.randn(2, 3, 4) - self.run_test(TensorFactory(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]}) + self.run_test( + TensorFactory(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]} + ) self.run_test(TensorFactory(), x, remained_onnx_input_idx=[]) @skipIfUnsupportedMinOpsetVersion(9) @@ -5391,10 +6357,14 @@ def test_tensor_factories_script(self): class TensorFactory(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, x): - return torch.zeros(x.shape, dtype=torch.float) + torch.ones(x.shape, dtype=torch.float) + return torch.zeros(x.shape, dtype=torch.float) + torch.ones( + x.shape, dtype=torch.float + ) x = torch.randn(2, 3, 4) - self.run_test(TensorFactory(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]}) + self.run_test( + TensorFactory(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]} + ) self.run_test(TensorFactory(), x, remained_onnx_input_idx=[]) @skipIfUnsupportedMinOpsetVersion(9) @@ -5402,26 +6372,47 @@ def test_tensor_like_factories_script(self): class TensorFactory(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, x): - zeros = torch.zeros_like(x, dtype=torch.float, layout=torch.strided, device=torch.device("cpu")) - ones = torch.ones_like(x, dtype=torch.float, layout=torch.strided, device=torch.device("cpu")) + zeros = torch.zeros_like( + x, + dtype=torch.float, + layout=torch.strided, + device=torch.device("cpu"), + ) + ones = torch.ones_like( + x, + dtype=torch.float, + layout=torch.strided, + device=torch.device("cpu"), + ) return zeros + ones x = torch.randn(2, 3, 4) - self.run_test(TensorFactory(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]}) + self.run_test( + TensorFactory(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]} + ) self.run_test(TensorFactory(), x, remained_onnx_input_idx=[]) @skipIfUnsupportedMinOpsetVersion(9) def test_eye(self): class TensorFactory(torch.nn.Module): def forward(self, x): - return torch.eye(x.size()[1], 3), torch.eye(4, 4, dtype=torch.long), \ - torch.eye(x.size()[1], 2, dtype=torch.long), torch.eye(x.shape[0]), \ - torch.eye(x.shape[0], dtype=torch.float64) + return ( + torch.eye(x.size()[1], 3), + torch.eye(4, 4, dtype=torch.long), + torch.eye(x.size()[1], 2, dtype=torch.long), + torch.eye(x.shape[0]), + torch.eye(x.shape[0], dtype=torch.float64), + ) x = torch.randn(2, 3, 4) another_x = torch.randn(5, 6, 7) - self.run_test(TensorFactory(), x, test_with_inputs=[another_x], - input_names=["input_1"], dynamic_axes={"input_1": [0, 1, 2]}) + 
self.run_test( + TensorFactory(), + x, + test_with_inputs=[another_x], + input_names=["input_1"], + dynamic_axes={"input_1": [0, 1, 2]}, + ) @skipIfUnsupportedMinOpsetVersion(13) def test_diagonal(self): @@ -5432,9 +6423,13 @@ def forward(self, x): x = torch.randn(2, 4, 5, 2) # Other test inputs to test dynamic behavior another_x = torch.randn(5, 6, 7, 8) - self.run_test(DiagonalModel(), x, test_with_inputs=[another_x], - input_names=["input_1"], - dynamic_axes={"input_1": [0, 1, 2, 3]}) + self.run_test( + DiagonalModel(), + x, + test_with_inputs=[another_x], + input_names=["input_1"], + dynamic_axes={"input_1": [0, 1, 2, 3]}, + ) class DiagonalModelNegOffset(torch.nn.Module): def forward(self, x): @@ -5443,9 +6438,13 @@ def forward(self, x): x = torch.randn(2, 4, 5, 2) # Other test inputs to test dynamic behavior another_x = torch.randn(5, 6, 7, 8) - self.run_test(DiagonalModelNegOffset(), x, test_with_inputs=[another_x], - input_names=["input_1"], - dynamic_axes={"input_1": [0, 1, 2, 3]}) + self.run_test( + DiagonalModelNegOffset(), + x, + test_with_inputs=[another_x], + input_names=["input_1"], + dynamic_axes={"input_1": [0, 1, 2, 3]}, + ) class DiagonalModelPosOffset(torch.nn.Module): def forward(self, x): @@ -5454,9 +6453,13 @@ def forward(self, x): x = torch.randn(2, 4, 5, 2) # Other test inputs to test dynamic behavior another_x = torch.randn(5, 6, 7, 8) - self.run_test(DiagonalModelPosOffset(), x, test_with_inputs=[another_x], - input_names=["input_1"], - dynamic_axes={"input_1": [0, 1, 2, 3]}) + self.run_test( + DiagonalModelPosOffset(), + x, + test_with_inputs=[another_x], + input_names=["input_1"], + dynamic_axes={"input_1": [0, 1, 2, 3]}, + ) class DiagonalModelWithDims(torch.nn.Module): def forward(self, x): @@ -5465,9 +6468,13 @@ def forward(self, x): x = torch.randn(2, 4, 5, 2) # Other test inputs to test dynamic behavior another_x = torch.randn(5, 6, 7, 8) - self.run_test(DiagonalModelWithDims(), x, test_with_inputs=[another_x], - input_names=["input_1"], - dynamic_axes={"input_1": [0, 1, 2, 3]}) + self.run_test( + DiagonalModelWithDims(), + x, + test_with_inputs=[another_x], + input_names=["input_1"], + dynamic_axes={"input_1": [0, 1, 2, 3]}, + ) class DiagonalModelOffsetOverrun(torch.nn.Module): def forward(self, x): @@ -5476,9 +6483,13 @@ def forward(self, x): x = torch.randn(2, 4, 5, 2) # Other test inputs to test dynamic behavior another_x = torch.randn(5, 6, 7, 8) - self.run_test(DiagonalModelOffsetOverrun(), x, test_with_inputs=[another_x], - input_names=["input_1"], - dynamic_axes={"input_1": [0, 1, 2, 3]}) + self.run_test( + DiagonalModelOffsetOverrun(), + x, + test_with_inputs=[another_x], + input_names=["input_1"], + dynamic_axes={"input_1": [0, 1, 2, 3]}, + ) @skipIfUnsupportedMinOpsetVersion(9) def test_inplace_zero(self): @@ -5494,7 +6505,9 @@ def forward(self, x): def test_new_zeros(self): class Zero_(torch.nn.Module): def forward(self, x): - return x.new_zeros(x.shape[1:2]), x.new_zeros(x.shape[2:], dtype=torch.long) + return x.new_zeros(x.shape[1:2]), x.new_zeros( + x.shape[2:], dtype=torch.long + ) x = torch.randn(2, 3, 4) self.run_test(Zero_(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]}) @@ -5504,25 +6517,33 @@ def forward(self, x): def test_new_ones(self): class OnesModel(torch.nn.Module): def forward(self, x): - return x.new_ones(x.shape[1:2]), x.new_ones(x.shape[2:], dtype=torch.long) + return x.new_ones(x.shape[1:2]), x.new_ones( + x.shape[2:], dtype=torch.long + ) x = torch.randn(2, 3, 4) self.run_test(OnesModel(), x, input_names=["x"], 
dynamic_axes={"x": [0, 1, 2]}) self.run_test(OnesModel(), x, remained_onnx_input_idx=[]) - @skipIfONNXShapeInference(True) + @skipIfUnsupportedMinOpsetVersion(9) + @skipScriptTest() # torch.zeros/torch.ones with size tensor of dim != 0 not scriptable. + def test_zeros_ones_with_tensor_input(self): + class ZeroAndOnes(torch.nn.Module): + def forward(self, x): + return torch.zeros(x, 1), torch.ones(x, 1) + + x = torch.tensor([2]) + self.run_test(ZeroAndOnes(), (x,)) + @skipIfUnsupportedMinOpsetVersion(9) def test_tolist(self): class List(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, input): - cur_shape = torch._shape_as_tensor(input) - final_shape: List[int] = cur_shape.tolist() - pad_tensor = torch.zeros([1, 2] + final_shape) - return pad_tensor + res: List[int] = input.tolist() + return res - x = torch.randn(2, 3) - self.run_test(List(), (x,)) + self.run_test(List(), (torch.randint(100, (1,)),)) @skipIfUnsupportedMinOpsetVersion(9) def test_list_pass(self): @@ -5532,8 +6553,12 @@ def forward(self, x, y): x = torch.randn(2, 3, 4, 5) y = torch.randn(1, 2, 3, 4) - self.run_test(Slice(), (x, y), input_names=["x", "y"], - dynamic_axes={"x": [0, 1, 2, 3], "y": [0, 1, 2, 3]}) + self.run_test( + Slice(), + (x, y), + input_names=["x", "y"], + dynamic_axes={"x": [0, 1, 2, 3], "y": [0, 1, 2, 3]}, + ) self.run_test(Slice(), (x, y), remained_onnx_input_idx=[]) class Size(torch.nn.Module): @@ -5542,8 +6567,12 @@ def forward(self, x, y): x = torch.randn(2, 3, 4) y = torch.randn(1, 2, 3) - self.run_test(Size(), (x, y), input_names=["x", "y"], - dynamic_axes={"x": [0, 1, 2], "y": [0, 1, 2]}) + self.run_test( + Size(), + (x, y), + input_names=["x", "y"], + dynamic_axes={"x": [0, 1, 2], "y": [0, 1, 2]}, + ) self.run_test(Size(), (x, y), remained_onnx_input_idx=[]) class Array(torch.nn.Module): @@ -5554,8 +6583,12 @@ def forward(self, x, y): x = torch.randn(2, 3, 4) y = torch.randn(1, 2, 3) - self.run_test(Array(), (x, y), input_names=["x", "y"], - dynamic_axes={"x": [0, 1, 2], "y": [0, 1, 2]}) + self.run_test( + Array(), + (x, y), + input_names=["x", "y"], + dynamic_axes={"x": [0, 1, 2], "y": [0, 1, 2]}, + ) self.run_test(Array(), (x, y), remained_onnx_input_idx=[]) class List(torch.nn.Module): @@ -5566,15 +6599,22 @@ def forward(self, x, y): x = torch.randn(2, 3, 4) y = torch.randn(1, 2, 3) - self.run_test(List(), (x, y), input_names=["x", "y"], - dynamic_axes={"x": [0, 1, 2], "y": [0, 1, 2]}) + self.run_test( + List(), + (x, y), + input_names=["x", "y"], + dynamic_axes={"x": [0, 1, 2], "y": [0, 1, 2]}, + ) self.run_test(List(), (x, y), remained_onnx_input_idx=[]) @skipIfUnsupportedMinOpsetVersion(9) def test_new_empty(self): class Emtpy(torch.nn.Module): def forward(self, x): - return x.new_empty(x.shape[0]).fill_(0), x.new_empty(x.shape[0], dtype=torch.long) * 0 + return ( + x.new_empty(x.shape[0]).fill_(0), + x.new_empty(x.shape[0], dtype=torch.long) * 0, + ) x = torch.randn(2, 3, 4) self.run_test(Emtpy(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]}) @@ -5584,7 +6624,9 @@ def forward(self, x): def test_new_full(self): class Full(torch.nn.Module): def forward(self, x): - return x.new_full(x.shape[1:2], 5), x.new_full(x.shape[0:1], 1.3, dtype=torch.long) + return x.new_full(x.shape[1:2], 5), x.new_full( + x.shape[0:1], 1.3, dtype=torch.long + ) x = torch.randn(2, 3, 4) self.run_test(Full(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]}) @@ -5599,8 +6641,12 @@ def forward(self, x, y): x = torch.randn(2, 3) y = torch.randn(2, 3) - self.run_test(Arithmetic(), (x, y), 
input_names=["x", "y"], - dynamic_axes={"x": [0, 1], "y": [0, 1]}) + self.run_test( + Arithmetic(), + (x, y), + input_names=["x", "y"], + dynamic_axes={"x": [0, 1], "y": [0, 1]}, + ) self.run_test(Arithmetic(), (x, y), remained_onnx_input_idx=[0]) @skipIfUnsupportedMinOpsetVersion(9) @@ -5643,21 +6689,33 @@ def forward(self, x, y): def test_inplace_with_loop(self): class M(torch.nn.Module): def forward(self, x): - a = torch.ones(12,) + a = torch.ones( + 12, + ) for i in range(10): - a.add_(torch.ones(12,)) + a.add_( + torch.ones( + 12, + ) + ) return a + x m = M() - x = torch.randn(12,) + x = torch.randn( + 12, + ) self.run_test(torch.jit.script(M()), (x)) @skipIfUnsupportedMinOpsetVersion(9) def test_inplace_with_loop_2(self): class M(torch.nn.Module): def forward(self, x): - _bias = torch.ones(12,) - a = torch.ones(12,) # used in loop, altered. + _bias = torch.ones( + 12, + ) + a = torch.ones( + 12, + ) # used in loop, altered. a_ref = a # not used in loop, should be altered. b = x.clone() # used in loop, not be altered. b_ref = b # not used in loop, should not be altered. @@ -5665,18 +6723,32 @@ def forward(self, x): if i == 3: for j in range(5): a += _bias - _bias.add_(torch.ones(12,)) - b = b + torch.ones(12,) - - _bias.add_(torch.ones(12,)) + _bias.add_( + torch.ones( + 12, + ) + ) + b = b + torch.ones( + 12, + ) + + _bias.add_( + torch.ones( + 12, + ) + ) a += _bias # TODO: value for a_ref is incorrect. # a_ref += torch.ones(12,) - b_ref += torch.ones(12,) + b_ref += torch.ones( + 12, + ) return _bias + x, a, b, b_ref m = M() - x = torch.zeros(12,) + x = torch.zeros( + 12, + ) self.run_test(torch.jit.script(M()), (x)) @skipIfUnsupportedMinOpsetVersion(11) @@ -5684,18 +6756,26 @@ def test_inplace_attr_with_loop(self): class M(torch.nn.Module): def __init__(self): super().__init__() - self._bias = torch.arange(12,) + self._bias = torch.arange( + 12, + ) def forward(self, x): - self._bias = torch.arange(12,) + self._bias = torch.arange( + 12, + ) for i in range(10): if i == 3: for j in range(5): - self._bias += torch.arange(12,) + self._bias += torch.arange( + 12, + ) return self._bias + x m = M() - x = torch.zeros(12,) + x = torch.zeros( + 12, + ) self.run_test(torch.jit.script(M()), (x)) @skipIfUnsupportedMinOpsetVersion(11) @@ -5703,27 +6783,47 @@ def test_inplace_attr_copy_with_loop(self): class M(torch.nn.Module): def __init__(self): super().__init__() - self._bias = torch.arange(12,) + self._bias = torch.arange( + 12, + ) def forward(self, x): - self._bias = torch.arange(12,) + self._bias = torch.arange( + 12, + ) for i in range(10): if i == 3: for j in range(5): - self._bias.copy_(torch.arange(12,)) - self._bias.copy_(self._bias + torch.arange(12,)) - - self._bias.copy_(self._bias + torch.arange(12,)) + self._bias.copy_( + torch.arange( + 12, + ) + ) + self._bias.copy_( + self._bias + + torch.arange( + 12, + ) + ) + + self._bias.copy_( + self._bias + + torch.arange( + 12, + ) + ) return self._bias + x m = M() - x = torch.zeros(12,) + x = torch.zeros( + 12, + ) self.run_test(torch.jit.script(M()), (x)) @skipIfUnsupportedMinOpsetVersion(14) # Need onnx::Identity of sequence in opset 14 def test_inplace_sequence_with_loop(self): class M(torch.nn.Module): - def process(self, beam_hyps: List[torch.Tensor], done: torch.Tensor, x): + def process(self, beam_hyps: List[Tensor], done: Tensor, x): batch_size = x.shape[0] for i in range(batch_size): if done[i]: @@ -5742,7 +6842,7 @@ def process(self, beam_hyps: List[torch.Tensor], done: torch.Tensor, x): return beam_hyps, done def 
forward(self, x): - beam_hyps: List[torch.Tensor] = [] + beam_hyps: List[Tensor] = [] batch_size = x.shape[0] cur_len = 0 max_len = x.shape[1] @@ -5757,8 +6857,7 @@ def forward(self, x): x = torch.randn(8, 4, 3) self.run_test(torch.jit.script(M()), (x)) - - @disableScriptTest() # Sort with dynamic dim not supported in ONNX + @skipScriptTest() # Sort with dynamic dim not supported in ONNX def test_sort(self): class SortModel(torch.nn.Module): def forward(self, x): @@ -5771,7 +6870,7 @@ def forward(self, x): self.run_test(SortModel(), x) @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() # Sort with dynamic dim not supported in ONNX + @skipScriptTest() # Sort with dynamic dim not supported in ONNX def test_sort_ascending(self): class SortModel(torch.nn.Module): def forward(self, x): @@ -5802,7 +6901,6 @@ def forward(self, x): @skipIfUnsupportedMinOpsetVersion(9) def test_masked_fill_inplace(self): - class MaskedFillModel(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, x): @@ -5872,7 +6970,32 @@ def forward(self, x): return torch.pixel_shuffle(x, upscale_factor=2) x = torch.randn(2, 16, 4, 3, requires_grad=True) + y = torch.randn(4, 32, 8, 4, requires_grad=True) self.run_test(PixelShuffle(), x) + self.run_test( + PixelShuffle(), + x, + input_names=["x"], + dynamic_axes={"x": [0, 1, 2, 3]}, + test_with_inputs=[y], + ) + + @skipIfUnsupportedMinOpsetVersion(9) + def test_pixel_unshuffle(self): + class PixelUnshuffle(torch.nn.Module): + def forward(self, x): + return torch.pixel_unshuffle(x, downscale_factor=2) + + x = torch.randn(2, 16, 4, 6, requires_grad=True) + y = torch.randn(4, 32, 8, 4, requires_grad=True) + self.run_test(PixelUnshuffle(), x) + self.run_test( + PixelUnshuffle(), + x, + input_names=["x"], + dynamic_axes={"x": [0, 1, 2, 3]}, + test_with_inputs=[y], + ) @skipIfUnsupportedMinOpsetVersion(9) def test_reciprocal(self): @@ -5895,7 +7018,6 @@ def forward(self, x): x = torch.ones(2, 3, dtype=torch.float32) self.run_test(ArithmeticModel(), x) - class ComparisonModel(torch.nn.Module): def forward(self, x, y): a = torch.tensor([12.0]) @@ -5907,7 +7029,7 @@ def forward(self, x, y): class MatMulModel(torch.nn.Module): def forward(self, x): - return (torch.mm(x, x) + x + torch.mm(x, x) + x) + return torch.mm(x, x) + x + torch.mm(x, x) + x x = torch.ones(3, 3) self.run_test(MatMulModel(), x) @@ -5923,9 +7045,19 @@ class FullModel(torch.nn.Module): # add is used for exporting full def forward(self, x): return torch.full((3, 4), x) - x = torch.tensor(12.) 
+ + x = torch.tensor(12.0) self.run_test(FullModel(), x) + class CatModel(torch.nn.Module): + def forward(self, fp16, fp32): + return torch.cat([fp16, fp32]) + + fp16 = Tensor([0.5]) + fp16 = fp16.half() + fp32 = Tensor([1.5]) + self.run_test(CatModel(), (fp16, fp32)) + @skipIfUnsupportedMinOpsetVersion(9) def test_full_like(self): class FullLikeModel(torch.nn.Module): @@ -5985,12 +7117,14 @@ def forward(self, x): x = torch.randn(4, 2, 3, requires_grad=True) y = torch.randn(2, 1, 3, requires_grad=True) - self.run_test(UnfoldModel(), x, - dynamic_axes={"x": [0, 1]}, - input_names=["x"], - test_with_inputs=[y]) + self.run_test( + UnfoldModel(), + x, + dynamic_axes={"x": [0, 1]}, + input_names=["x"], + test_with_inputs=[y], + ) - @skipIfONNXShapeInference(False) def test_unfold_infer_shape(self): class UnfoldModule(torch.jit.ScriptModule): def __init__(self): @@ -6032,7 +7166,7 @@ def forward(self, input, other): self.run_test(MatmulModel(), (x, y)) x = torch.randint(10, (4, 5)) - y = torch.randint(10, (5, )) + y = torch.randint(10, (5,)) self.run_test(MatmulModel(), (x, y)) @skipIfUnsupportedMinOpsetVersion(9) # MatMul long inputs is added in ONNX opset 9. @@ -6045,16 +7179,16 @@ def forward(self, input, other): y = torch.randn(5, requires_grad=True) self.run_test(MatmulModel(), (x, y)) - x = torch.randint(10, (5, )) - y = torch.randint(10, (5, )) + x = torch.randint(10, (5,)) + y = torch.randint(10, (5,)) self.run_test(MatmulModel(), (x, y)) - @disableScriptTest() # SpectralNorm not TorchScript compatible. + @skipScriptTest() # SpectralNorm not TorchScript compatible. def test_spectral_norm(self): m = torch.nn.utils.spectral_norm(torch.nn.Linear(2, 4)) x = torch.randn(6, 2) - self.run_test(m, (x, )) + self.run_test(m, (x,)) def test_prelu(self): class PReluModel(torch.nn.Module): @@ -6067,9 +7201,17 @@ def forward(self, x): x = torch.randn(2, 3, 4) y = torch.randn(2, 4, 5) - self.run_test(PReluModel(), x, input_names=["x"], - dynamic_axes={"x": [1, 2]}, - test_with_inputs=[y]) + self.run_test( + PReluModel(), + x, + input_names=["x"], + dynamic_axes={"x": [1, 2]}, + test_with_inputs=[y], + ) + + def test_prelu_scalar(self): + x = torch.scalar_tensor(1.0) + self.run_test(torch.nn.PReLU(), x, input_names=["x"]) def test_relu6(self): class Relu6Model(torch.nn.Module): @@ -6082,9 +7224,13 @@ def forward(self, x): x = torch.randn(2, 3, 4) * 100.0 y = torch.randn(2, 4, 5) * 100.0 - self.run_test(Relu6Model(), x, input_names=['x'], - dynamic_axes={'x': [1, 2]}, - test_with_inputs=[y]) + self.run_test( + Relu6Model(), + x, + input_names=["x"], + dynamic_axes={"x": [1, 2]}, + test_with_inputs=[y], + ) def test_silu(self): class SiLUModel(torch.nn.Module): @@ -6225,7 +7371,16 @@ def forward(self, x): def test_gelu(self): class GeluModel(torch.nn.Module): def forward(self, x): - return torch.nn.functional.gelu(x) + return torch.nn.functional.gelu(x, approximate="none") + + x = torch.randn(2, 4, 5, 6, requires_grad=True) + self.run_test(GeluModel(), x) + + @skipIfUnsupportedMinOpsetVersion(9) + def test_tanh_gelu(self): + class GeluModel(torch.nn.Module): + def forward(self, x): + return torch.nn.functional.gelu(x, approximate="tanh") x = torch.randn(2, 4, 5, 6, requires_grad=True) self.run_test(GeluModel(), x) @@ -6239,6 +7394,16 @@ def forward(self, x): x = torch.randn(4, 2, 3, requires_grad=True) self.run_test(InplaceAddModel(), x) + def test_addcmul(self): + class AddcmulModel(torch.nn.Module): + def forward(self, x, t1, t2): + return torch.addcmul(x, t1, t2), torch.addcmul(x, t1, t2, value=2.2) + 
+ x = torch.randn(1, 3) + t1 = torch.randn(3, 1) + t2 = torch.randn(1, 3) + self.run_test(AddcmulModel(), (x, t1, t2)) + def test_rsqrt(self): class RsqrtModel(torch.nn.Module): def forward(self, x): @@ -6251,6 +7416,7 @@ def test_rsqrt_zeros(self): class RsqrtModel(torch.nn.Module): def forward(self, x): return x.rsqrt() + x = torch.zeros(4, 2, 3, requires_grad=True, dtype=torch.float64) self.run_test(RsqrtModel(), x) @@ -6258,7 +7424,9 @@ def forward(self, x): def test_unique(self): class UniqueModel(torch.nn.Module): def forward(self, x): - return torch.unique(x, sorted=True, return_inverse=False, return_counts=True) + return torch.unique( + x, sorted=True, return_inverse=False, return_counts=True + ) x = torch.tensor([1, 3, 2, 3], dtype=torch.long) self.run_test(UniqueModel(), x) @@ -6267,7 +7435,9 @@ def forward(self, x): def test_unique_along_dim(self): class UniqueModel(torch.nn.Module): def forward(self, x): - return torch.unique(x, dim=0, sorted=True, return_inverse=True, return_counts=False) + return torch.unique( + x, dim=0, sorted=True, return_inverse=True, return_counts=False + ) x = torch.tensor([1, 3, 2, 3], dtype=torch.long) self.run_test(UniqueModel(), x) @@ -6277,6 +7447,7 @@ def test_cumsum(self): class CumSum(torch.nn.Module): def forward(self, input): return torch.cumsum(input, dim=0) + x = torch.randn(2, 3, 4) model = CumSum() self.run_test(model, x) @@ -6293,7 +7464,7 @@ def forward(self, input): x = torch.tensor([False, True, True]) self.run_test(model, x) - @disableScriptTest() # error in propagate as assign input shape + @skipScriptTest() # error in propagate as assign input shape @skipIfUnsupportedMinOpsetVersion(10) def test_embedding_bag(self): model = torch.nn.EmbeddingBag(10, 5, mode="sum", scale_grad_by_freq=True) @@ -6314,12 +7485,19 @@ def test_embedding_bag(self): def test_embedding_bag_1d_per_sample_weights(self): class EmbeddingModel(torch.nn.Module): def forward(self, embedding_matrix, input, offset, weights): - return torch.nn.functional.embedding_bag(input, embedding_matrix, offsets=offset, - mode="sum", per_sample_weights=weights) + return torch.nn.functional.embedding_bag( + input, + embedding_matrix, + offsets=offset, + mode="sum", + per_sample_weights=weights, + ) model = EmbeddingModel() x = torch.randint(7, (6,)) - w = torch.randn(6, ) + w = torch.randn( + 6, + ) offset = torch.tensor([0, 2, 5]) embedding_matrix = torch.rand(10, 15) self.run_test(model, (embedding_matrix, x, offset, w)) @@ -6328,43 +7506,78 @@ def forward(self, embedding_matrix, input, offset, weights): def test_embedding_bag_2d_per_sample_weights(self): class EmbeddingModel(torch.nn.Module): def forward(self, embedding_matrix, input, weights): - return torch.nn.functional.embedding_bag(input, embedding_matrix, - mode="sum", per_sample_weights=weights) + return torch.nn.functional.embedding_bag( + input, embedding_matrix, mode="sum", per_sample_weights=weights + ) embedding_matrix = torch.rand(10, 15) model = EmbeddingModel() x = torch.randint(7, (2, 3)) w = torch.randn(2, 3) - self.run_test(model, (embedding_matrix, x, w)) - @disableScriptTest() # scripting prim::Uninitialized, prim::dtype, prim::unchecked_cast + x2 = torch.randint(7, (4, 3)) + w2 = torch.randn(4, 3) + self.run_test( + model, + (embedding_matrix, x, w), + input_names=["embed", "x", "w"], + dynamic_axes={"x": [0], "w": [0]}, + test_with_inputs=[(embedding_matrix, x2, w2)], + ) + + @skipScriptTest() # scripting prim::Uninitialized, prim::dtype, prim::unchecked_cast @skipIfUnsupportedMinOpsetVersion(11) - 
@unittest.skip("Due to ONNX Loop shape inference issue. " - "https://msdata.visualstudio.com/Vienna/_workitems/edit/1352001") + @unittest.skip( + "Due to ONNX Loop shape inference issue. " + "https://msdata.visualstudio.com/Vienna/_workitems/edit/1352001" + ) def test_embedding_bag_dynamic_input(self): class EmbeddingModel1D(torch.nn.Module): def forward(self, embedding_matrix, input, weights, offsets): - return torch.nn.functional.embedding_bag(input, embedding_matrix, offsets=offsets, - mode="sum", per_sample_weights=weights) + return torch.nn.functional.embedding_bag( + input, + embedding_matrix, + offsets=offsets, + mode="sum", + per_sample_weights=weights, + ) model = EmbeddingModel1D() x = torch.randint(7, (6,)) - w = torch.randn(6, ) + w = torch.randn( + 6, + ) offsets = torch.tensor([0, 2, 5], dtype=torch.long) embedding_matrix = torch.rand(10, 15) x2 = torch.randint(7, (2,)) - w2 = torch.randn(2, ) + w2 = torch.randn( + 2, + ) embedding_matrix2 = torch.rand(12, 25) - offsets2 = torch.tensor([0, ], dtype=torch.long) - self.run_test(model, (embedding_matrix, x, w, offsets), - test_with_inputs=[(embedding_matrix2, x2, w2, offsets2)], - input_names=["embedding_matrix", "x", "offsets", "w"], - dynamic_axes={"embedding_matrix": [0, 1], "x": [0], "offsets": [0], "w": [0]}) + offsets2 = torch.tensor( + [ + 0, + ], + dtype=torch.long, + ) + self.run_test( + model, + (embedding_matrix, x, w, offsets), + test_with_inputs=[(embedding_matrix2, x2, w2, offsets2)], + input_names=["embedding_matrix", "x", "offsets", "w"], + dynamic_axes={ + "embedding_matrix": [0, 1], + "x": [0], + "offsets": [0], + "w": [0], + }, + ) class EmbeddingModel2D(torch.nn.Module): def forward(self, embedding_matrix, input, weights): - return torch.nn.functional.embedding_bag(input, embedding_matrix, - mode="sum", per_sample_weights=weights) + return torch.nn.functional.embedding_bag( + input, embedding_matrix, mode="sum", per_sample_weights=weights + ) model = EmbeddingModel2D() x = torch.randint(7, (2, 3)) @@ -6373,10 +7586,13 @@ def forward(self, embedding_matrix, input, weights): x2 = torch.randint(7, (3, 5)) w2 = torch.randn(3, 5) embedding_matrix2 = torch.rand(12, 25) - self.run_test(model, (embedding_matrix, x, w), - test_with_inputs=[(embedding_matrix2, x2, w2)], - input_names=["embedding_matrix", "x", "w"], - dynamic_axes={"embedding_matrix": [0, 1], "x": [0, 1], "w": [0, 1]}) + self.run_test( + model, + (embedding_matrix, x, w), + test_with_inputs=[(embedding_matrix2, x2, w2)], + input_names=["embedding_matrix", "x", "w"], + dynamic_axes={"embedding_matrix": [0, 1], "x": [0, 1], "w": [0, 1]}, + ) @skipIfUnsupportedMinOpsetVersion(8) def test_meshgrid(self): @@ -6405,7 +7621,10 @@ def forward(self, x, y, z): def test_baddbmm(self): class MyModule(torch.nn.Module): def forward(self, input, batch1, batch2): - return torch.baddbmm(input, batch1, batch2, alpha=torch.tensor(5), beta=3.5) + return torch.baddbmm( + input, batch1, batch2, alpha=torch.tensor(5), beta=3.5 + ) + x = torch.randn(10, 3, 5) batch1 = torch.randn(10, 3, 4) batch2 = torch.randn(10, 4, 5) @@ -6416,6 +7635,7 @@ def test_baddbmm_dynamic(self): class MyModule(torch.nn.Module): def forward(self, input, batch1, batch2, alpha, beta): return torch.baddbmm(input, batch1, batch2, alpha=alpha, beta=beta) + x = torch.randn(10, 3, 5) batch1 = torch.randn(10, 3, 4) batch2 = torch.randn(10, 4, 5) @@ -6425,24 +7645,36 @@ def forward(self, input, batch1, batch2, alpha, beta): self.run_test(model, (x, batch1, batch2, alpha, beta)) def test_numel(self): - class 
MyModule(torch.jit.ScriptModule): - @torch.jit.script_method + class MyModule(torch.nn.Module): def forward(self, input): return input.numel() * input x = torch.randn(2, 3, 5) + x2 = torch.randn(4, 5, 6) model = MyModule() - self.run_test(model, (x,)) + self.run_test( + model, + (x,), + input_names=["x"], + dynamic_axes={"x": [0, 1, 2]}, + test_with_inputs=[(x2,)], + ) def test_numel_empty(self): - class MyModule(torch.jit.ScriptModule): - @torch.jit.script_method + class MyModule(torch.nn.Module): def forward(self, input): return input.numel() * input x = torch.randn(0) + x2 = torch.randn(4) model = MyModule() - self.run_test(model, (x,)) + self.run_test( + model, + (x,), + input_names=["x"], + dynamic_axes={"x": [0]}, + test_with_inputs=[(x2,)], + ) def test_dtype(self): class MyModel(torch.jit.ScriptModule): @@ -6530,6 +7762,7 @@ def test_log(self): class Log(torch.nn.Module): def forward(self, input): return torch.log(input) + x = torch.rand(2, 3, 4) model = Log() self.run_test(model, x) @@ -6538,6 +7771,7 @@ def test_log1p(self): class Log1p(torch.nn.Module): def forward(self, input): return torch.log1p(input) + x = torch.rand(2, 3, 4) model = Log1p() self.run_test(model, x) @@ -6546,6 +7780,7 @@ def test_log10(self): class Log10(torch.nn.Module): def forward(self, input): return torch.log10(input) + x = torch.rand(2, 3, 4) model = Log10() self.run_test(model, x) @@ -6580,29 +7815,30 @@ def forward(self, x, pad: List[int]): y = pad = [2, 4] self.run_test(Pad(), (x, y)) - y = pad = [torch.tensor(2, dtype=torch.int64), torch.tensor(4, dtype=torch.int64)] + y = pad = [ + torch.tensor(2, dtype=torch.int64), + torch.tensor(4, dtype=torch.int64), + ] self.run_test(Pad(), (x, y)) - @skipIfUnsupportedMaxOpsetVersion(10) + @skipScriptTest() # TODO: the logic in symbolic_opset9 doesn't handle script def test_unsupported_pad(self): class Pad(torch.nn.Module): - def forward(self, x, pad): + def forward(self, x, pad: List[int]): return torch.nn.functional.pad(x, pad) - def run(): - x = torch.randn(2, 2, 4, 4) - y = pad = (torch.tensor(2, dtype=torch.int32), torch.tensor(4, dtype=torch.int32)) - p = Pad() - f = io.BytesIO() - torch.onnx._export(p, (x, y), f) - - with self.assertRaises(RuntimeError) as cm: - run() - - the_exception = cm.exception - self.assertEqual("Unsupported: ONNX export of Pad in opset 9. The sizes of the padding must be constant. 
" + - "Please try opset version 11.", the_exception.args[0]) + x = torch.randn(2, 2, 4, 4) + y = [2, 4] + + with self.assertRaisesRegex( + RuntimeError, + ( + "Unsupported: ONNX export of Pad.*" + + "The sizes of the padding must be constant" + ), + ): + self.run_test(Pad(), (x, y)) @skipIfUnsupportedMinOpsetVersion(9) def test_if_fold(self): @@ -6614,6 +7850,7 @@ def forward(self, y): else: y = y - 1 return y + x = torch.ones((3, 4), dtype=torch.int) self.run_test(IfFoldModel(), x) @@ -6723,7 +7960,6 @@ def forward(self, x, y): self.run_test(IfFoldModel(), (x, y)) @skipIfUnsupportedMinOpsetVersion(11) - @skipIfONNXShapeInference(False) def test_uninitialized(self): class UninitializedModel(torch.nn.Module): def forward(self, y): @@ -6738,7 +7974,6 @@ def forward(self, y): self.run_test(UninitializedModel(), x) @skipIfUnsupportedMinOpsetVersion(11) - @skipIfONNXShapeInference(False) def test_uninitialized_dynamic(self): class UninitializedModel(torch.nn.Module): def forward(self, y): @@ -6751,13 +7986,16 @@ def forward(self, y): x = torch.ones((3, 4), dtype=torch.int) y = torch.ones((6, 7), dtype=torch.int) - self.run_test(UninitializedModel(), x, test_with_inputs=[y], - input_names=["input_1"], - dynamic_axes={"input_1": [0, 1]}) + self.run_test( + UninitializedModel(), + x, + test_with_inputs=[y], + input_names=["input_1"], + dynamic_axes={"input_1": [0, 1]}, + ) # onnx::Identity of sequence supported for ONNX opset >= 14 @skipIfUnsupportedMinOpsetVersion(14) - @skipIfONNXShapeInference(False) def test_uninitialized_tensorList(self): class UninitializedTensorListModel(torch.nn.Module): def forward(self, x): @@ -6773,7 +8011,6 @@ def forward(self, x): # onnx::Identity of sequence supported for ONNX opset >= 14 @skipIfUnsupportedMinOpsetVersion(14) - @skipIfONNXShapeInference(False) def test_uninitialized_tensorList_dynamic(self): class UninitializedTensorListModel(torch.nn.Module): def forward(self, x): @@ -6785,12 +8022,15 @@ def forward(self, x): return list(x) x = torch.ones((3, 4), dtype=torch.double) - self.run_test(torch.jit.script(UninitializedTensorListModel()), x, input_names=["input_1"], - dynamic_axes={"input_1": [0, 1]}) + self.run_test( + torch.jit.script(UninitializedTensorListModel()), + x, + input_names=["input_1"], + dynamic_axes={"input_1": [0, 1]}, + ) # onnx::Identity of sequence supported for ONNX opset >= 14 @skipIfUnsupportedMinOpsetVersion(14) - @skipIfONNXShapeInference(False) def test_uninitialized_intList(self): class UninitializedListModel(torch.nn.Module): def forward(self, x): @@ -6804,12 +8044,15 @@ def forward(self, x): return y x = torch.ones((3, 4), dtype=torch.int) - self.run_test(torch.jit.script(UninitializedListModel()), x, input_names=["input_1"], - dynamic_axes={"input_1": [0, 1]}) + self.run_test( + torch.jit.script(UninitializedListModel()), + x, + input_names=["input_1"], + dynamic_axes={"input_1": [0, 1]}, + ) # onnx::Identity of sequence supported for ONNX opset >= 14 @skipIfUnsupportedMinOpsetVersion(14) - @skipIfONNXShapeInference(False) def test_uninitialized_tensorList_shape(self): class UninitializedModel(torch.nn.Module): def forward(self, x): @@ -6824,9 +8067,13 @@ def forward(self, x): x = torch.ones((3, 4), dtype=torch.int) y = torch.ones((4, 6), dtype=torch.int) - self.run_test(torch.jit.script(UninitializedModel()), x, test_with_inputs=[y], - input_names=["input_1"], - dynamic_axes={"input_1": [0, 1]}) + self.run_test( + torch.jit.script(UninitializedModel()), + x, + test_with_inputs=[y], + input_names=["input_1"], + 
dynamic_axes={"input_1": [0, 1]}, + ) # Sequence type as loop-carried dependencies only supported for ONNX opset >= 13 @skipIfUnsupportedMinOpsetVersion(13) @@ -6863,9 +8110,17 @@ def test_replication_pad(self): def test_im2col(self): class Unfold(torch.nn.Module): def forward(self, input): - return torch.nn.functional.unfold(input, kernel_size=(10, 15), dilation=2, padding=5, stride=3), \ - torch.nn.functional.unfold(input, kernel_size=(2, 2), dilation=1, padding=0, stride=3), \ - torch.nn.functional.unfold(input, kernel_size=(1, 1), dilation=5, padding=2, stride=3) + return ( + torch.nn.functional.unfold( + input, kernel_size=(10, 15), dilation=2, padding=5, stride=3 + ), + torch.nn.functional.unfold( + input, kernel_size=(2, 2), dilation=1, padding=0, stride=3 + ), + torch.nn.functional.unfold( + input, kernel_size=(1, 1), dilation=5, padding=2, stride=3 + ), + ) x = torch.rand(1, 1, 200, 100) self.run_test(Unfold(), x) @@ -6880,6 +8135,142 @@ def forward(self, x): x = torch.randn(2, 3, 5, 5) self.run_test(Det(), x) + def test_linalg_norm(self): + class LinalgSingleDimModel(torch.nn.Module): + def __init__(self, ord_val): + super(LinalgSingleDimModel, self).__init__() + self.ord = ord_val + + def forward(self, x): + return torch.linalg.norm(x, ord=self.ord, dim=1) + + x = torch.randn(2, 3, 5, 5) + self.run_test(LinalgSingleDimModel(None), x) + self.run_test(LinalgSingleDimModel(2), x) + self.run_test(LinalgSingleDimModel(float("inf")), x) + self.run_test(LinalgSingleDimModel(-float("inf")), x) + self.run_test(LinalgSingleDimModel(-4), x) + self.run_test(LinalgSingleDimModel(1.5), x) + + class LinalgMultiDimModel(torch.nn.Module): + def __init__(self, ord_val): + super(LinalgMultiDimModel, self).__init__() + self.ord = ord_val + + def forward(self, x): + return torch.linalg.norm(x, ord=self.ord, dim=(0, 2)) + + x = torch.randn(2, 3, 5, 5) + self.run_test(LinalgMultiDimModel("fro"), x) + self.run_test(LinalgMultiDimModel(float("inf")), x) + self.run_test(LinalgMultiDimModel(-float("inf")), x) + self.run_test(LinalgMultiDimModel(1), x) + self.run_test(LinalgMultiDimModel(-1), x) + + class LinalgNoDimNoOrdModel(torch.nn.Module): + def forward(self, x): + return torch.linalg.norm(x) + + x = torch.randn(2, 3, 5, 5) + self.run_test(LinalgNoDimNoOrdModel(), x) + y = torch.randn(2, 3) + self.run_test(LinalgNoDimNoOrdModel(), y) + z = torch.randn(2) + self.run_test(LinalgNoDimNoOrdModel(), z) + + class LinalgNoDim1DModel(torch.nn.Module): + def __init__(self, ord_val): + super(LinalgNoDim1DModel, self).__init__() + self.ord = ord_val + + def forward(self, x): + return torch.linalg.norm(x, ord=self.ord) + + x = torch.randn(2) + self.run_test(LinalgNoDim1DModel(None), x) + self.run_test(LinalgNoDim1DModel(2), x) + self.run_test(LinalgNoDim1DModel(float("inf")), x) + self.run_test(LinalgNoDim1DModel(-float("inf")), x) + self.run_test(LinalgNoDim1DModel(-4), x) + self.run_test(LinalgNoDim1DModel(1.5), x) + + class LinalgNoDim2DModel(torch.nn.Module): + def __init__(self, ord_val): + super(LinalgNoDim2DModel, self).__init__() + self.ord = ord_val + + def forward(self, x): + return torch.linalg.norm(x, ord=self.ord) + + x = torch.randn(2, 3) + self.run_test(LinalgNoDim2DModel("fro"), x) + self.run_test(LinalgNoDim2DModel(float("inf")), x) + self.run_test(LinalgNoDim2DModel(-float("inf")), x) + self.run_test(LinalgNoDim2DModel(1), x) + self.run_test(LinalgNoDim2DModel(-1), x) + + @skipIfUnsupportedMinOpsetVersion(11) + def test_linalg_vector_norm_zero(self): + class 
LinalgVectorNormModel(torch.nn.Module): + def __init__(self, ord_val): + super(LinalgVectorNormModel, self).__init__() + self.ord = ord_val + + def forward(self, x): + return torch.linalg.vector_norm(x, ord=self.ord) + + x = torch.randn(2, 3, 5, 5) + self.run_test(LinalgVectorNormModel(0), x) + + def test_linalg_vector_norm(self): + class LinalgVectorNormModel(torch.nn.Module): + def __init__(self, ord_val, dim_info): + super(LinalgVectorNormModel, self).__init__() + self.ord = ord_val + self.dim, self.keepdim = dim_info + + def forward(self, x): + return torch.linalg.vector_norm( + x, ord=self.ord, dim=self.dim, keepdim=self.keepdim + ) + + x = torch.randn(2, 3, 5, 5) + ord_options = [2, float("inf"), -float("inf"), -4, 1.5] + dim_options = [(None, False), (1, False), ((1, 2), False), ((1, 2), True)] + for ord_val in ord_options: + for dim_info in dim_options: + self.run_test(LinalgVectorNormModel(ord_val, dim_info), x) + + def test_linalg_matrix_norm(self): + class LinalgMatrixNormModel(torch.nn.Module): + def __init__(self, ord_val, dim_val=(-2, -1), keepdim_val=False): + super(LinalgMatrixNormModel, self).__init__() + self.ord = ord_val + self.dim = dim_val + self.keepdim = keepdim_val + + def forward(self, x): + return torch.linalg.matrix_norm( + x, ord=self.ord, dim=self.dim, keepdim=self.keepdim + ) + + x = torch.randn(2, 3, 5, 5) + ord_options = ["fro", float("inf"), -float("inf"), 1, -1] + for ord_val in ord_options: + self.run_test(LinalgMatrixNormModel(ord_val), x) + self.run_test(LinalgMatrixNormModel(ord_val, (0, 2)), x) + self.run_test(LinalgMatrixNormModel(ord_val, (0, 2), True), x) + + @skipIfUnsupportedMinOpsetVersion(9) + def test_linalg_cross(self): + class Cross(torch.nn.Module): + def forward(self, x, y): + return torch.linalg.cross(x, y, dim=1), torch.linalg.cross(x, y) + + x = torch.randn(5, 3, 2, 3) + y = torch.randn(1, 3, 1, 3) + self.run_test(Cross(), input=(x, y)) + # This test checks output scalar type in the ONNX graph should not be null # https://github.com/pytorch/pytorch/issues/28607 @skipIfUnsupportedMinOpsetVersion(10) @@ -6931,7 +8322,7 @@ def forward(self, poses): return batch_boxes dummy_inputs = torch.rand(2, 2, 3) - self.run_test(M(), (dummy_inputs, ), input_names=['x'], dynamic_axes={"x": [0]}) + self.run_test(M(), (dummy_inputs,), input_names=["x"], dynamic_axes={"x": [0]}) @skipIfUnsupportedMinOpsetVersion(12) def test_outer(self): @@ -6991,6 +8382,38 @@ def forward(self, x): for x in [torch.randn(3, 4), torch.randn(3, 4).to(dtype=torch.bool)]: self.run_test(EinsumModelTranspose(), input=(x,)) + @skipIfUnsupportedMinOpsetVersion(9) + def test_cosine_similarity(self): + x = torch.randn(5, 3, 2) + y = torch.randn(5, 3, 2) + self.run_test(torch.nn.CosineSimilarity(dim=2), input=(x, y)) + + @skipIfUnsupportedMinOpsetVersion(9) + def test_pairwise_distance(self): + x = torch.randn(5, 3, 2) + y = torch.randn(5, 3, 2) + self.run_test(torch.nn.PairwiseDistance(p=2.0), input=(x, y)) + + @skipIfUnsupportedMinOpsetVersion(9) + def test_cross(self): + class Cross(torch.nn.Module): + def forward(self, x, y): + return torch.cross(x, y, dim=3), torch.cross(x, y) + + x = torch.randn(5, 3, 2, 3) + y = torch.randn(5, 3, 2, 3) + self.run_test(Cross(), input=(x, y)) + + @skipIfUnsupportedMinOpsetVersion(9) + def test_cdist(self): + class Cdist(torch.nn.Module): + def forward(self, x, y): + return torch.cdist(x, y) + + x = torch.randn(5, 3, 3) + y = torch.randn(5, 2, 3) + self.run_test(Cdist(), input=(x, y)) + @skipIfUnsupportedMinOpsetVersion(12) def 
test_crossentropyloss(self): for ignore_index in [-100, 1]: @@ -7017,7 +8440,9 @@ def __init__(self, ignore_index): if ignore_index == -100: self.loss = torch.nn.CrossEntropyLoss(reduction="none") else: - self.loss = torch.nn.CrossEntropyLoss(reduction="none", ignore_index=ignore_index) + self.loss = torch.nn.CrossEntropyLoss( + reduction="none", ignore_index=ignore_index + ) def forward(self, input, target): return self.loss(input, target) @@ -7028,9 +8453,15 @@ class CrossEntropyLossNoneWeight(torch.nn.Module): def __init__(self, ignore_index): super(CrossEntropyLossNoneWeight, self).__init__() if ignore_index == -100: - self.loss = torch.nn.CrossEntropyLoss(reduction="none", weight=torch.randn(5)) + self.loss = torch.nn.CrossEntropyLoss( + reduction="none", weight=torch.randn(5) + ) else: - self.loss = torch.nn.CrossEntropyLoss(reduction="none", weight=torch.randn(5), ignore_index=ignore_index) + self.loss = torch.nn.CrossEntropyLoss( + reduction="none", + weight=torch.randn(5), + ignore_index=ignore_index, + ) def forward(self, input, target): return self.loss(input, target) @@ -7043,7 +8474,9 @@ def __init__(self, ignore_index): if ignore_index == -100: self.loss = torch.nn.CrossEntropyLoss(reduction="sum") else: - self.loss = torch.nn.CrossEntropyLoss(reduction="sum", ignore_index=ignore_index) + self.loss = torch.nn.CrossEntropyLoss( + reduction="sum", ignore_index=ignore_index + ) def forward(self, input, target): return self.loss(input, target) @@ -7054,9 +8487,15 @@ class CrossEntropyLossSumWeight(torch.nn.Module): def __init__(self, ignore_index): super(CrossEntropyLossSumWeight, self).__init__() if ignore_index == -100: - self.loss = torch.nn.CrossEntropyLoss(reduction="sum", weight=torch.randn(5)) + self.loss = torch.nn.CrossEntropyLoss( + reduction="sum", weight=torch.randn(5) + ) else: - self.loss = torch.nn.CrossEntropyLoss(reduction="sum", weight=torch.randn(5), ignore_index=ignore_index) + self.loss = torch.nn.CrossEntropyLoss( + reduction="sum", + weight=torch.randn(5), + ignore_index=ignore_index, + ) def forward(self, input, target): return self.loss(input, target) @@ -7082,7 +8521,9 @@ def __init__(self, ignore_index): if ignore_index == -100: self.loss = torch.nn.CrossEntropyLoss(weight=torch.randn(5)) else: - self.loss = torch.nn.CrossEntropyLoss(weight=torch.randn(5), ignore_index=ignore_index) + self.loss = torch.nn.CrossEntropyLoss( + weight=torch.randn(5), ignore_index=ignore_index + ) def forward(self, input, target): return self.loss(input, target) @@ -7148,7 +8589,9 @@ def forward(self, input, target): class KLDivLossMiniBatchMean(torch.nn.Module): def __init__(self): super(KLDivLossMiniBatchMean, self).__init__() - self.loss = torch.nn.KLDivLoss(reduction="batchmean", size_average=False, log_target=True) + self.loss = torch.nn.KLDivLoss( + reduction="batchmean", size_average=False, log_target=True + ) def forward(self, input, target): return self.loss(input, target) @@ -7284,11 +8727,17 @@ def test_nllloss_dynamic_ignore_index(self): def linear_combination(x, y, epsilon): return epsilon * x + (1 - epsilon) * y - def reduce_loss(loss, reduction='mean'): - return loss.mean() if reduction == 'mean' else loss.sum() if reduction == 'sum' else loss + def reduce_loss(loss, reduction="mean"): + return ( + loss.mean() + if reduction == "mean" + else loss.sum() + if reduction == "sum" + else loss + ) class LabelSmoothingCrossEntropy(torch.nn.Module): - def __init__(self, epsilon: float = 0.1, reduction='mean'): + def __init__(self, epsilon: float = 0.1, 
reduction="mean"): super().__init__() self.epsilon = epsilon self.reduction = reduction @@ -7297,7 +8746,12 @@ def forward(self, preds, target, start_position): n = preds.size()[-1] log_preds = F.log_softmax(preds, dim=-1) ignore_index = start_position.size(1) - nll = F.nll_loss(log_preds, target, reduction=self.reduction, ignore_index=ignore_index) + nll = F.nll_loss( + log_preds, + target, + reduction=self.reduction, + ignore_index=ignore_index, + ) return nll + start_position.float() N = 5 @@ -7311,7 +8765,9 @@ def test_nllloss_2d_mean_ignore_index_weights(self): class NLLModel(torch.nn.Module): def __init__(self): super(NLLModel, self).__init__() - self.loss = torch.nn.NLLLoss(reduction="mean", weight=torch.randn(C), ignore_index=1) + self.loss = torch.nn.NLLLoss( + reduction="mean", weight=torch.randn(C), ignore_index=1 + ) self.conv = torch.nn.Conv2d(16, C, (3, 3)) self.m = torch.nn.LogSoftmax(dim=1) @@ -7349,80 +8805,116 @@ def test_binary_cross_entropy_with_logits(self): def _bce_logits(self, x, y): class BCEWithLogitsLossNone(torch.nn.Module): def forward(self, input, target): - return torch.nn.functional.binary_cross_entropy_with_logits(input, target, reduction="none") + return torch.nn.functional.binary_cross_entropy_with_logits( + input, target, reduction="none" + ) self.run_test(BCEWithLogitsLossNone(), input=(x, y)) class BCEWithLogitsLossMean(torch.nn.Module): def forward(self, input, target): - return torch.nn.functional.binary_cross_entropy_with_logits(input, target, reduction="mean") + return torch.nn.functional.binary_cross_entropy_with_logits( + input, target, reduction="mean" + ) self.run_test(BCEWithLogitsLossMean(), input=(x, y)) class BCEWithLogitsLossSum(torch.nn.Module): def forward(self, input, target): - return torch.nn.functional.binary_cross_entropy_with_logits(input, target, reduction="sum") + return torch.nn.functional.binary_cross_entropy_with_logits( + input, target, reduction="sum" + ) self.run_test(BCEWithLogitsLossSum(), input=(x, y)) def _bce_logits_wegiht(self, x, y, weight): class BCEWithLogitsLossWegihtNone(torch.nn.Module): def forward(self, input, target, weight): - return torch.nn.functional.binary_cross_entropy_with_logits(input, target, weight=weight, reduction="none") + return torch.nn.functional.binary_cross_entropy_with_logits( + input, target, weight=weight, reduction="none" + ) + self.run_test(BCEWithLogitsLossWegihtNone(), input=(x, y, weight)) class BCEWithLogitsLossWegihtMean(torch.nn.Module): def forward(self, input, target, weight): - return torch.nn.functional.binary_cross_entropy_with_logits(input, target, weight=weight, reduction="mean") + return torch.nn.functional.binary_cross_entropy_with_logits( + input, target, weight=weight, reduction="mean" + ) self.run_test(BCEWithLogitsLossWegihtMean(), input=(x, y, weight)) class BCEWithLogitsLossWegihtSum(torch.nn.Module): def forward(self, input, target, weight): - return torch.nn.functional.binary_cross_entropy_with_logits(input, target, weight=weight, reduction="sum") + return torch.nn.functional.binary_cross_entropy_with_logits( + input, target, weight=weight, reduction="sum" + ) self.run_test(BCEWithLogitsLossWegihtSum(), input=(x, y, weight)) def _bce_logits_posweight(self, x, y, pos_weight): class BCEWithLogitsLossPosWegihtNone(torch.nn.Module): def forward(self, input, target, pos_weight): - return torch.nn.functional.binary_cross_entropy_with_logits(input, target, pos_weight=pos_weight, reduction="none") + return torch.nn.functional.binary_cross_entropy_with_logits( + input, 
target, pos_weight=pos_weight, reduction="none" + ) + self.run_test(BCEWithLogitsLossPosWegihtNone(), input=(x, y, pos_weight)) class BCEWithLogitsLossPosWegihtMean(torch.nn.Module): def forward(self, input, target, pos_weight): - return torch.nn.functional.binary_cross_entropy_with_logits(input, target, pos_weight=pos_weight, reduction="mean") + return torch.nn.functional.binary_cross_entropy_with_logits( + input, target, pos_weight=pos_weight, reduction="mean" + ) self.run_test(BCEWithLogitsLossPosWegihtMean(), input=(x, y, pos_weight)) class BCEWithLogitsLossPosWegihtSum(torch.nn.Module): def forward(self, input, target, pos_weight): - return torch.nn.functional.binary_cross_entropy_with_logits(input, target, pos_weight=pos_weight, reduction="sum") + return torch.nn.functional.binary_cross_entropy_with_logits( + input, target, pos_weight=pos_weight, reduction="sum" + ) self.run_test(BCEWithLogitsLossPosWegihtSum(), input=(x, y, pos_weight)) def _bce_logits_loss_weight_posweight(self, x, y, weight, pos_weight): class BCEWithLogitsLossWeightPosweightNone(torch.nn.Module): def forward(self, input, target, weight, pos_weight): - return torch.nn.functional.binary_cross_entropy_with_logits(input, target, weight=weight, - pos_weight=pos_weight, reduction="none") + return torch.nn.functional.binary_cross_entropy_with_logits( + input, + target, + weight=weight, + pos_weight=pos_weight, + reduction="none", + ) - self.run_test(BCEWithLogitsLossWeightPosweightNone(), input=(x, y, weight, pos_weight)) + self.run_test( + BCEWithLogitsLossWeightPosweightNone(), input=(x, y, weight, pos_weight) + ) class BCEWithLogitsLossWeightPosweightMean(torch.nn.Module): def forward(self, input, target, weight, pos_weight): - return torch.nn.functional.binary_cross_entropy_with_logits(input, target, weight=weight, - pos_weight=pos_weight, reduction="mean") + return torch.nn.functional.binary_cross_entropy_with_logits( + input, + target, + weight=weight, + pos_weight=pos_weight, + reduction="mean", + ) - self.run_test(BCEWithLogitsLossWeightPosweightMean(), input=(x, y, weight, pos_weight)) + self.run_test( + BCEWithLogitsLossWeightPosweightMean(), input=(x, y, weight, pos_weight) + ) class BCEWithLogitsLossWeightPosweightSum(torch.nn.Module): def forward(self, input, target, weight, pos_weight): - return torch.nn.functional.binary_cross_entropy_with_logits(input, target, weight=weight, - pos_weight=pos_weight, reduction="sum") - - self.run_test(BCEWithLogitsLossWeightPosweightSum(), input=(x, y, weight, pos_weight)) + return torch.nn.functional.binary_cross_entropy_with_logits( + input, target, weight=weight, pos_weight=pos_weight, reduction="sum" + ) + self.run_test( + BCEWithLogitsLossWeightPosweightSum(), input=(x, y, weight, pos_weight) + ) def test_torch_mm(self): class M(torch.nn.Module): @@ -7434,7 +8926,9 @@ def forward(self, mat1, mat2): mat2 = torch.randn(3, 3) self.run_test(M(), input=(mat1, mat2)) - @skipIfUnsupportedMinOpsetVersion(9) # Because where op is not supported for opset < 9. + @skipIfUnsupportedMinOpsetVersion( + 9 + ) # Because where op is not supported for opset < 9. def test_where_with_bool_tensor(self): class M(torch.nn.Module): def forward(self, mat1, mat2): @@ -7445,7 +8939,9 @@ def forward(self, mat1, mat2): mat2 = torch.ones(2, 3) self.run_test(M(), input=(mat1, mat2)) - @skipIfUnsupportedMinOpsetVersion(9) # Because where op is not supported for opset < 9. + @skipIfUnsupportedMinOpsetVersion( + 9 + ) # Because where op is not supported for opset < 9. 
def test_where_with_byte_tensor(self): class M(torch.nn.Module): def forward(self, cond, mat1, mat2): @@ -7465,7 +8961,7 @@ def forward(self, x): return x.isinf() x = torch.tensor([[1, 2, float("inf")], [2, float("nan"), float("inf")]]) - self.run_test(M(), (x, )) + self.run_test(M(), (x,)) @skipIfUnsupportedMinOpsetVersion(10) def test_isfinite(self): @@ -7474,7 +8970,7 @@ def forward(self, x): return x.isfinite() x = torch.tensor([[1, 2, float("inf")], [2, float("nan"), -float("inf")]]) - self.run_test(M(), (x, )) + self.run_test(M(), (x,)) @skipIfUnsupportedMinOpsetVersion(9) # ONNX IsNaN op is added in opset 9. def test_isnan(self): @@ -7483,7 +8979,61 @@ def forward(self, x): return x.isnan() x = torch.tensor([[1, 2, float("inf")], [2, float("nan"), float("inf")]]) - self.run_test(M(), (x, )) + self.run_test(M(), (x,)) + + @skipIfUnsupportedMinOpsetVersion( + 10 + ) # ONNX IsNaN, IsInf op is added in opset 9, 10 respectively. + def test_nan_to_num(self): + class NoParams(torch.nn.Module): + def forward(self, x): + return x.nan_to_num() + + x = torch.tensor([[1, 2, float("inf")], [2, float("nan"), -float("inf")]]) + xint = torch.ones((2, 4), dtype=torch.int) + xhalf = torch.ones((2, 4), dtype=torch.half) + self.run_test(NoParams(), (x,)) + self.run_test(NoParams(), (xint,)) + self.run_test(NoParams(), (xhalf,)) + + class WithParams(torch.nn.Module): + def forward(self, x): + return x.nan_to_num(nan=2.3, posinf=4.5, neginf=6.7) + + x = torch.tensor([[1, 2, float("inf")], [2, float("nan"), -float("inf")]]) + self.run_test(WithParams(), (x,)) + + @skipIfUnsupportedMinOpsetVersion(9) + def test_maximum_minimum(self): + class ModelWithNan(torch.nn.Module): + def forward(self, x, y): + return torch.maximum(x, y), torch.minimum(x, y) + + x = torch.tensor([-2, -2, float("nan")]) + y = torch.rand(1, 3) + self.run_test(ModelWithNan(), (x, y)) + + @skipIfUnsupportedMinOpsetVersion(12) + def test_minimum_dtypes(self): + class MinimumModel(torch.nn.Module): + def forward(self, x, y): + return torch.minimum(x, y) + + x = torch.randn((5, 5), dtype=torch.float16) + y = torch.randn((5, 5), dtype=torch.float) + self.run_test(MinimumModel(), (x, y)) + + x = torch.randn((5, 5), dtype=torch.float16) + y = torch.randint(10, (5, 5), dtype=torch.int16) + self.run_test(MinimumModel(), (x, y)) + + x = torch.randint(10, (5, 5), dtype=torch.int16) + y = torch.randint(10, (5, 5), dtype=torch.int32) + self.run_test(MinimumModel(), (x, y)) + + x = torch.randint(10, (5, 5), dtype=torch.int) + y = torch.full_like(x, True) + self.run_test(MinimumModel(), (x, y)) @skipIfUnsupportedMinOpsetVersion(9) def test_any(self): @@ -7492,21 +9042,21 @@ def forward(self, x): return x.any() x = torch.tensor([[True, False], [False, False]]) - self.run_test(M(), (x, )) + self.run_test(M(), (x,)) class MDim(torch.nn.Module): def forward(self, x): return x.any(dim=1) x = torch.rand(3, 4).bool() - self.run_test(MDim(), (x, )) + self.run_test(MDim(), (x,)) class MKeepdim(torch.nn.Module): def forward(self, x): return x.any(dim=1, keepdim=True) x = torch.rand(3, 4).bool() - self.run_test(MKeepdim(), (x, )) + self.run_test(MKeepdim(), (x,)) @skipIfUnsupportedMinOpsetVersion(9) def test_all(self): @@ -7515,21 +9065,21 @@ def forward(self, x): return x.all() x = torch.tensor([[True, False], [False, False]]) - self.run_test(M(), (x, )) + self.run_test(M(), (x,)) class MDim(torch.nn.Module): def forward(self, x): return x.all(dim=1) x = torch.rand(3, 4).bool() - self.run_test(MDim(), (x, )) + self.run_test(MDim(), (x,)) class 
MKeepdim(torch.nn.Module): def forward(self, x): return x.all(dim=1, keepdim=True) x = torch.rand(3, 4).bool() - self.run_test(MKeepdim(), (x, )) + self.run_test(MKeepdim(), (x,)) def test_dropout(self): class M(torch.nn.Module): @@ -7588,7 +9138,7 @@ def test_celu_alpha(self): class Celu(torch.nn.Module): def __init__(self): super(Celu, self).__init__() - self.celu = torch.nn.CELU(alpha=2.) + self.celu = torch.nn.CELU(alpha=2.0) def forward(self, input): return self.celu(input) @@ -7611,8 +9161,7 @@ def forward(self, input): def test_lower_tuple(self): class TupleModule(torch.nn.Module): - def forward(self, input1, input2, input3): - # type: (Tensor, Tensor, Tensor) -> Tensor + def forward(self, input1: Tensor, input2: Tensor, input3: Tensor) -> Tensor: a = (input1, input2) b = a c = (input1, input2, input3) @@ -7640,8 +9189,7 @@ def forward(self, input1, input2, input3): def test_lower_tuple_2(self): class TupleModule(torch.nn.Module): - def forward(self, input1, input2): - # type: (Tensor, Tensor) -> Tuple[Tensor, Tensor] + def forward(self, input1: Tensor, input2: Tensor) -> Tuple[Tensor, Tensor]: a = (input1, input2) for x in range(5): c, d = a @@ -7654,8 +9202,11 @@ def forward(self, input1, input2): def test_lower_tuple_3(self): class TupleModule(torch.nn.Module): - def forward(self, input1, input2): - # type: (Tuple[Tensor, Tensor], Tuple[Tensor, Tensor]) + def forward( + self, + input1: Tuple[Tensor, Tensor], + input2: Tuple[Tensor, Tensor], + ) -> Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, Tensor]]: a = input1 b = input2 for x in range(5): @@ -7667,7 +9218,7 @@ def forward(self, input1, input2): f = f + d a = (e, f) b = (c, d) - return a , b + return a, b input1 = (torch.randn(2), torch.randn(2)) input2 = (torch.randn(2), torch.randn(2)) @@ -7685,7 +9236,7 @@ def forward(self, cond, input, other): self.run_test(Model(), (x, y, z)) @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() # scripting tests run for opsets > 11. See: test_where_condition_script + @skipScriptTest() # scripting tests run for opsets > 11. See: test_where_condition_script def test_where_condition(self): class Model1(torch.nn.Module): def forward(self, input): @@ -7740,7 +9291,7 @@ def forward(self, input): @skipIfUnsupportedMinOpsetVersion(11) def test_derive_index_scripting(self): class MyModule(torch.nn.Module): - def forward(self, x: torch.Tensor): + def forward(self, x: Tensor): j = [] for idx in range(len(x) - 1, -len(x), -2): y = x[idx] @@ -7751,7 +9302,7 @@ def forward(self, x: torch.Tensor): self.run_test(MyModule(), x) class MyModule(torch.nn.Module): - def forward(self, x: torch.Tensor): + def forward(self, x: Tensor): j = [] for idx in range(-len(x), len(x) - 1, 2): y = x[idx] @@ -7762,7 +9313,7 @@ def forward(self, x: torch.Tensor): self.run_test(MyModule(), x) class MyModule(torch.nn.Module): - def forward(self, x: torch.Tensor): + def forward(self, x: Tensor): j = [] for idx in range(len(x) - 1, -len(x), -3): y = x[idx] @@ -7772,7 +9323,7 @@ def forward(self, x: torch.Tensor): self.run_test(MyModule(), x) class MyModule(torch.nn.Module): - def forward(self, x: torch.Tensor): + def forward(self, x: Tensor): j = [] for idx in range(-len(x), len(x) - 1, 3): y = x[idx] @@ -7781,10 +9332,10 @@ def forward(self, x: torch.Tensor): self.run_test(MyModule(), x) - @disableScriptTest() # Scripting fails for add lists for opsets < 11. Chek test_derive_index_scripting + @skipScriptTest() # Scripting fails for add lists for opsets < 11. 
Chek test_derive_index_scripting def test_derive_index(self): class MyModule(torch.nn.Module): - def forward(self, x: torch.Tensor): + def forward(self, x: Tensor): j = [] for idx in range(len(x) - 1, -len(x), -2): y = x[idx] @@ -7795,7 +9346,7 @@ def forward(self, x: torch.Tensor): self.run_test(MyModule(), x) class MyModule(torch.nn.Module): - def forward(self, x: torch.Tensor): + def forward(self, x: Tensor): j = [] for idx in range(-len(x), len(x) - 1, 2): y = x[idx] @@ -7806,7 +9357,7 @@ def forward(self, x: torch.Tensor): self.run_test(MyModule(), x) class MyModule(torch.nn.Module): - def forward(self, x: torch.Tensor): + def forward(self, x: Tensor): j = [] for idx in range(len(x) - 1, -len(x), -3): y = x[idx] @@ -7816,7 +9367,7 @@ def forward(self, x: torch.Tensor): self.run_test(MyModule(), x) class MyModule(torch.nn.Module): - def forward(self, x: torch.Tensor): + def forward(self, x: Tensor): j = [] for idx in range(-len(x), len(x) - 1, 3): y = x[idx] @@ -7825,7 +9376,6 @@ def forward(self, x: torch.Tensor): self.run_test(MyModule(), x) - @skipIfONNXShapeInference(False) @skipIfUnsupportedMinOpsetVersion(11) def test_if_transpose(self): class IfModel(torch.nn.Module): @@ -7837,11 +9387,13 @@ def forward(self, x): return x x = torch.randn(2, 3) - self.run_test(torch.jit.script(IfModel()), x, - output_names=["output_1"], - dynamic_axes={"output_1": [0, 1]}) + self.run_test( + torch.jit.script(IfModel()), + x, + output_names=["output_1"], + dynamic_axes={"output_1": [0, 1]}, + ) - @skipIfONNXShapeInference(False) @skipIfUnsupportedMinOpsetVersion(13) def test_if_list(self): class IfModel(torch.nn.Module): @@ -7872,9 +9424,12 @@ def forward(self, x, y, cond): x = torch.randn(2, 16, 2, 2) y = torch.randn(2, 16, 8) cond = torch.tensor(1, dtype=torch.bool) - self.run_test(torch.jit.script(IfModel()), (x, y, cond), - output_names=["output_1"], - dynamic_axes={"output_1": [1]}) + self.run_test( + torch.jit.script(IfModel()), + (x, y, cond), + output_names=["output_1"], + dynamic_axes={"output_1": [1]}, + ) def test_onnx_proto_checker(self): class Model(torch.nn.Module): @@ -7895,16 +9450,7 @@ def check_proto(): self.assertRaises(RuntimeError, check_proto) - @skipIfUnsupportedMinOpsetVersion(11) - def test_split_tensor_scalar_scripting(self): - class SplitModel(torch.nn.Module): - def forward(self, x): - return torch.split(x, x.size(1)) - - x = torch.randn(1, 2, 3, requires_grad=True) - self.run_test(SplitModel(), x) - - @disableScriptTest() # Scripting fails to export dynamic split for opsets < 11 + @skipScriptTest(min_opset_version=11) # dynamic split support addded in 11 def test_split_tensor_scalar(self): class SplitModel(torch.nn.Module): def forward(self, x): @@ -7941,7 +9487,9 @@ def forward(self, input, emb): x[2] = 1 x[0][1] = 1 self.run_test(model, (x, embedding_matrix)) - self.run_test(model, (x, embedding_matrix), training=torch.onnx.TrainingMode.TRAINING) + self.run_test( + model, (x, embedding_matrix), training=torch.onnx.TrainingMode.TRAINING + ) class EmbedModelWithoutPaddingIdx(torch.nn.Module): def forward(self, input, emb): @@ -7986,6 +9534,17 @@ def forward(self, input): x = torch.randint(4, (4, 3, 2)) self.run_test(model, (x,)) + @skipIfUnsupportedMinOpsetVersion(11) + def test_embedding_renorm(self): + n, d = 7, 5 + embedding = torch.nn.Embedding(n, d, max_norm=0.2) + idx = torch.tensor([2, 1]) + self.run_test(embedding, idx) + + embedding = torch.nn.Embedding(n, d, max_norm=0.5, norm_type=1.0) + idx = torch.tensor([4, 3, 4, 2]) + self.run_test(embedding, idx) + def 
_dispatch_rnn_test(self, name, *args, **kwargs): if name == "elman": self._elman_rnn_test(*args, **kwargs) @@ -7994,16 +9553,29 @@ def _dispatch_rnn_test(self, name, *args, **kwargs): if name == "gru": self._gru_test(*args, **kwargs) - def _elman_rnn_test(self, layers, nonlinearity, bidirectional, - initial_state, packed_sequence, dropout): - + def _elman_rnn_test( + self, + layers, + nonlinearity, + bidirectional, + initial_state, + packed_sequence, + dropout, + ): class ElmanWithStateModel(torch.nn.Module): def __init__(self, layers, nonlinearity, bidirect, dropout, batch_first): super(ElmanWithStateModel, self).__init__() self.batch_first = batch_first - self.inner_model = torch.nn.RNN(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, layers, nonlinearity=nonlinearity, - bidirectional=bidirectional, dropout=dropout, batch_first=batch_first) + self.inner_model = torch.nn.RNN( + RNN_INPUT_SIZE, + RNN_HIDDEN_SIZE, + layers, + nonlinearity=nonlinearity, + bidirectional=bidirectional, + dropout=dropout, + batch_first=batch_first, + ) def forward(self, input: PackedSequence, hx=None): return self.inner_model(input, hx) @@ -8012,8 +9584,15 @@ class ElmanWithoutStateModel(torch.nn.Module): def __init__(self, layers, nonlinearity, bidirect, dropout, batch_first): super(ElmanWithoutStateModel, self).__init__() self.batch_first = batch_first - self.inner_model = torch.nn.RNN(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, layers, nonlinearity=nonlinearity, - bidirectional=bidirectional, dropout=dropout, batch_first=batch_first) + self.inner_model = torch.nn.RNN( + RNN_INPUT_SIZE, + RNN_HIDDEN_SIZE, + layers, + nonlinearity=nonlinearity, + bidirectional=bidirectional, + dropout=dropout, + batch_first=batch_first, + ) def forward(self, input: PackedSequence): return self.inner_model(input) @@ -8021,14 +9600,23 @@ def forward(self, input: PackedSequence): batch_first = packed_sequence == 2 if initial_state: - model = ElmanWithStateModel(layers=layers, bidirect=bidirectional, nonlinearity=nonlinearity, - dropout=dropout, batch_first=batch_first) + model = ElmanWithStateModel( + layers=layers, + bidirect=bidirectional, + nonlinearity=nonlinearity, + dropout=dropout, + batch_first=batch_first, + ) if packed_sequence: model = RnnModelWithPackedSequenceWithState(model, batch_first) else: - model = ElmanWithStateModel(layers=layers, bidirect=bidirectional, - nonlinearity=nonlinearity, dropout=dropout, - batch_first=batch_first) + model = ElmanWithStateModel( + layers=layers, + bidirect=bidirectional, + nonlinearity=nonlinearity, + dropout=dropout, + batch_first=batch_first, + ) if packed_sequence: model = RnnModelWithPackedSequenceWithoutState(model, batch_first) @@ -8059,20 +9647,33 @@ def make_input(batch_size): other_input = make_input(RNN_BATCH_SIZE + 1) self.run_test(model, other_input, batch_size=RNN_BATCH_SIZE + 1) - def _lstm_test(self, layers, bidirectional, initial_state, - packed_sequence, dropout): + def _lstm_test( + self, layers, bidirectional, initial_state, packed_sequence, dropout + ): batch_first = packed_sequence == 2 if packed_sequence: - model = LstmFlatteningResultWithSeqLength(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, layers, - bidirectional, dropout, batch_first) + model = LstmFlatteningResultWithSeqLength( + RNN_INPUT_SIZE, + RNN_HIDDEN_SIZE, + layers, + bidirectional, + dropout, + batch_first, + ) if initial_state: model = RnnModelWithPackedSequenceWithState(model, batch_first) else: model = RnnModelWithPackedSequenceWithoutState(model, batch_first) else: - model = LstmFlatteningResultWithoutSeqLength(RNN_INPUT_SIZE, 
RNN_HIDDEN_SIZE, layers, - bidirectional, dropout, batch_first) + model = LstmFlatteningResultWithoutSeqLength( + RNN_INPUT_SIZE, + RNN_HIDDEN_SIZE, + layers, + bidirectional, + dropout, + batch_first, + ) def make_input(batch_size): seq_lengths = np.random.randint(1, RNN_SEQUENCE_LENGTH + 1, size=batch_size) @@ -8102,17 +9703,20 @@ def make_input(batch_size): other_input = make_input(RNN_BATCH_SIZE + 1) self.run_test(model, other_input, batch_size=RNN_BATCH_SIZE + 1) - def _gru_test(self, layers, bidirectional, initial_state, - packed_sequence, dropout): - + def _gru_test(self, layers, bidirectional, initial_state, packed_sequence, dropout): class GRUWithStateModel(torch.nn.Module): def __init__(self, layers, bidirect, dropout, batch_first): super(GRUWithStateModel, self).__init__() self.batch_first = batch_first - self.inner_model = torch.nn.GRU(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, num_layers=layers, - bidirectional=bidirectional, dropout=dropout, - batch_first=batch_first) + self.inner_model = torch.nn.GRU( + RNN_INPUT_SIZE, + RNN_HIDDEN_SIZE, + num_layers=layers, + bidirectional=bidirectional, + dropout=dropout, + batch_first=batch_first, + ) def forward(self, input: PackedSequence, hx): return self.inner_model(input, hx) @@ -8121,9 +9725,14 @@ class GRUWithoutStateModel(torch.nn.Module): def __init__(self, layers, bidirect, dropout, batch_first): super(GRUWithoutStateModel, self).__init__() self.batch_first = batch_first - self.inner_model = torch.nn.GRU(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, num_layers=layers, - bidirectional=bidirectional, dropout=dropout, - batch_first=batch_first) + self.inner_model = torch.nn.GRU( + RNN_INPUT_SIZE, + RNN_HIDDEN_SIZE, + num_layers=layers, + bidirectional=bidirectional, + dropout=dropout, + batch_first=batch_first, + ) def forward(self, input: PackedSequence): return self.inner_model(input) @@ -8132,9 +9741,14 @@ class GRUNoSeqLengthWithoutStateModel(torch.nn.Module): def __init__(self, layers, bidirect, dropout, batch_first): super(GRUNoSeqLengthWithoutStateModel, self).__init__() self.batch_first = batch_first - self.inner_model = torch.nn.GRU(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, num_layers=layers, - bidirectional=bidirectional, dropout=dropout, - batch_first=batch_first) + self.inner_model = torch.nn.GRU( + RNN_INPUT_SIZE, + RNN_HIDDEN_SIZE, + num_layers=layers, + bidirectional=bidirectional, + dropout=dropout, + batch_first=batch_first, + ) def forward(self, input): return self.inner_model(input) @@ -8143,9 +9757,14 @@ class GRUNoSeqLengthWithStateModel(torch.nn.Module): def __init__(self, layers, bidirect, dropout, batch_first): super(GRUNoSeqLengthWithStateModel, self).__init__() self.batch_first = batch_first - self.inner_model = torch.nn.GRU(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, num_layers=layers, - bidirectional=bidirectional, dropout=dropout, - batch_first=batch_first) + self.inner_model = torch.nn.GRU( + RNN_INPUT_SIZE, + RNN_HIDDEN_SIZE, + num_layers=layers, + bidirectional=bidirectional, + dropout=dropout, + batch_first=batch_first, + ) def forward(self, input, hx): return self.inner_model(input, hx) @@ -8154,20 +9773,36 @@ def forward(self, input, hx): if packed_sequence: if initial_state: - model = GRUWithStateModel(layers=layers, bidirect=bidirectional, dropout=dropout, - batch_first=batch_first) + model = GRUWithStateModel( + layers=layers, + bidirect=bidirectional, + dropout=dropout, + batch_first=batch_first, + ) model = RnnModelWithPackedSequenceWithState(model, batch_first) else: - model = GRUWithoutStateModel(layers=layers, 
bidirect=bidirectional, dropout=dropout, - batch_first=batch_first) + model = GRUWithoutStateModel( + layers=layers, + bidirect=bidirectional, + dropout=dropout, + batch_first=batch_first, + ) model = RnnModelWithPackedSequenceWithoutState(model, batch_first) else: if initial_state: - model = GRUNoSeqLengthWithStateModel(layers=layers, bidirect=bidirectional, - dropout=dropout, batch_first=batch_first) + model = GRUNoSeqLengthWithStateModel( + layers=layers, + bidirect=bidirectional, + dropout=dropout, + batch_first=batch_first, + ) else: - model = GRUNoSeqLengthWithoutStateModel(layers=layers, bidirect=bidirectional, - dropout=dropout, batch_first=batch_first) + model = GRUNoSeqLengthWithoutStateModel( + layers=layers, + bidirect=bidirectional, + dropout=dropout, + batch_first=batch_first, + ) def make_input(batch_size): seq_lengths = np.random.randint(1, RNN_SEQUENCE_LENGTH + 1, size=batch_size) @@ -8196,9 +9831,9 @@ def make_input(batch_size): other_input = make_input(RNN_BATCH_SIZE + 1) self.run_test(model, other_input, batch_size=RNN_BATCH_SIZE + 1) - @disableScriptTest() # TODO: RuntimeError: Exporting the operator __is_ to ONNX is not supported + @skipScriptTest() # TODO: https://msdata.visualstudio.com/Vienna/_workitems/edit/1253950 def test_transformer_encoder(self): - from torch.nn import TransformerEncoderLayer, TransformerEncoder + from torch.nn import TransformerEncoder, TransformerEncoderLayer class MyModule(torch.nn.Module): def __init__(self, ninp, nhead, nhid, dropout, nlayers): @@ -8210,36 +9845,84 @@ def forward(self, input): return self.transformer_encoder(input) x = torch.rand(10, 32, 512) - self.run_test(MyModule(512, 8, 2048 , 0., 3), (x,), atol=1e-6) + self.run_test(MyModule(512, 8, 2048, 0.0, 3), (x,), atol=1e-6) @skipIfUnsupportedMinOpsetVersion(10) def test_fake_quantize_per_tensor(self): class FakeQuantizePerTensorModel(torch.nn.Module): def forward(self, input): - scale = 1. / 127 + scale = 1.0 / 127 zero_point = 0 quant_min = -128 quant_max = 127 - return torch.fake_quantize_per_tensor_affine(input, scale, zero_point, quant_min, quant_max) + return torch.fake_quantize_per_tensor_affine( + input, scale, zero_point, quant_min, quant_max + ) x = torch.randn(6, 4, 3, 3) self.run_test(FakeQuantizePerTensorModel(), (x)) + @skipIfUnsupportedMinOpsetVersion(13) + def test_fake_quantize_per_tensor_dynamic_scale_zeropoint(self): + class FakeQuantizePerTensorModel(torch.nn.Module): + def forward(self, input, scale, zero_point): + quant_min = -128 + quant_max = 127 + return torch.fake_quantize_per_tensor_affine( + input, scale, zero_point, quant_min, quant_max + ) + + x = torch.randn(6, 4, 3, 3) + scale = torch.tensor(1.0 / 127) + zero_point = torch.tensor(0) + self.run_test(FakeQuantizePerTensorModel(), (x, scale, zero_point)) + @skipIfUnsupportedMinOpsetVersion(13) def test_fake_quantize_per_channel(self): class FakeQuantizePerChannelModel(torch.nn.Module): def forward(self, input): amax = torch.ones(4) - scale = amax / 127. 
+ scale = amax / 127.0 zero_point = torch.zeros_like(amax, dtype=torch.int) # Quantize twice to test differnet branches - y = torch.fake_quantize_per_channel_affine(input, scale, zero_point, 1, 0, 255) - return torch.fake_quantize_per_channel_affine(y, scale, zero_point, 1, -128, 127) + y = torch.fake_quantize_per_channel_affine( + input, scale, zero_point, 1, 0, 255 + ) + return torch.fake_quantize_per_channel_affine( + y, scale, zero_point, 1, -128, 127 + ) x = torch.randn(6, 4, 3, 3) self.run_test(FakeQuantizePerChannelModel(), (x)) - @skipIfUnsupportedMinOpsetVersion(9) # https://github.com/microsoft/onnxruntime/issues/9663 + @skipIfUnsupportedMinOpsetVersion(13) + @skipScriptTest() # RuntimeError: Can't redefine method: forward on class: __torch__.torch.nn.modules.linear.Linear + def test_fake_quantize_activation(self): + from torch import quantization + + m = torch.nn.Linear(1, 1) + m.qconfig = quantization.QConfig( + activation=quantization.default_fake_quant, + weight=quantization.default_per_channel_weight_fake_quant, + ) + quantization.prepare_qat(m.train(), inplace=True) + m.apply(quantization.enable_observer) + m.apply(quantization.enable_fake_quant) + for module in m.modules(): + if isinstance(module, quantization.FakeQuantize): + module.calculate_qparams() + + m.apply(quantization.disable_observer) + m.eval() + + # Fake quantize activation is a special case, as it restricts quantized range to be (0, 127), + # while standard 8bit quantization range is (-128, 127) or (0, 255). + # Set fixed weight, bias and inputs to test if ONNX handles the overflow correctly. + m.weight = torch.nn.Parameter(torch.tensor([[1.0], [1.0], [1.0]])) + m.bias = torch.nn.Parameter(torch.tensor([0.0])) + x = torch.tensor([[150.0], [127.0], [-5.0]]) + self.run_test(m, x) + def test_batchnorm_training(self): class MyModule(torch.nn.Module): def __init__(self): @@ -8260,11 +9943,22 @@ def forward(self, x): x = torch.randn(10, 3, 20, 20) * 2 model_export = MyModule() - self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.TRAINING, rtol=1e-3, atol=1e-5) + self.run_test( + model_export, + (x,), + training=torch.onnx.TrainingMode.TRAINING, + rtol=1e-3, + atol=1e-5, + ) model_export.train() - self.run_test(model_export, (x, ), training=torch.onnx.TrainingMode.PRESERVE, rtol=1e-3, atol=1e-5) + self.run_test( + model_export, + (x,), + training=torch.onnx.TrainingMode.PRESERVE, + rtol=1e-3, + atol=1e-5, + ) - @skipIfUnsupportedMinOpsetVersion(9) # https://github.com/microsoft/onnxruntime/issues/9663 def test_batchnorm_training_mode_fix_layer(self): class MyModule(torch.nn.Module): def __init__(self): @@ -8286,9 +9980,21 @@ def forward(self, x): x = torch.randn(10, 3, 128, 128) model_export = MyModule() - self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.TRAINING, rtol=1e-3, atol=1e-5) + self.run_test( + model_export, + (x,), + training=torch.onnx.TrainingMode.TRAINING, + rtol=1e-3, + atol=1e-5, + ) model_export.train() - self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.PRESERVE, rtol=1e-3, atol=1e-5) + self.run_test( + model_export, + (x,), + training=torch.onnx.TrainingMode.PRESERVE, + rtol=1e-3, + atol=1e-5, + ) def test_batchnorm_eval_mode_train_layer(self): class MyModule(torch.nn.Module): @@ -8311,11 +10017,22 @@ def forward(self, x): x = torch.randn(10, 3, 128, 128) model_export = MyModule() - self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.EVAL, rtol=1e-3, atol=1e-5) + self.run_test( + model_export, + (x,), + 
training=torch.onnx.TrainingMode.EVAL, + rtol=1e-3, + atol=1e-5, + ) model_export.eval() - self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.PRESERVE, rtol=1e-3, atol=1e-5) + self.run_test( + model_export, + (x,), + training=torch.onnx.TrainingMode.PRESERVE, + rtol=1e-3, + atol=1e-5, + ) - @skipIfUnsupportedMinOpsetVersion(9) # https://github.com/microsoft/onnxruntime/issues/9663 def test_instancenorm_training(self): class MyModule(torch.nn.Module): def __init__(self): @@ -8336,11 +10053,22 @@ def forward(self, x): x = torch.randn(10, 3, 128, 128) model_export = MyModule() - self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.TRAINING, rtol=1e-3, atol=1e-5) + self.run_test( + model_export, + (x,), + training=torch.onnx.TrainingMode.TRAINING, + rtol=1e-3, + atol=1e-5, + ) model_export.train() - self.run_test(model_export, (x, ), training=torch.onnx.TrainingMode.PRESERVE, rtol=1e-3, atol=1e-5) + self.run_test( + model_export, + (x,), + training=torch.onnx.TrainingMode.PRESERVE, + rtol=1e-3, + atol=1e-5, + ) - @skipIfUnsupportedMinOpsetVersion(9) # https://github.com/microsoft/onnxruntime/issues/9663 def test_instancenorm_training_mode_fix_layer(self): class MyModule(torch.nn.Module): def __init__(self): @@ -8362,11 +10090,22 @@ def forward(self, x): x = torch.randn(10, 3, 128, 128) model_export = MyModule() - self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.TRAINING, rtol=1e-3, atol=1e-5) + self.run_test( + model_export, + (x,), + training=torch.onnx.TrainingMode.TRAINING, + rtol=1e-3, + atol=1e-5, + ) model_export.train() - self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.PRESERVE, rtol=1e-3, atol=1e-5) + self.run_test( + model_export, + (x,), + training=torch.onnx.TrainingMode.PRESERVE, + rtol=1e-3, + atol=1e-5, + ) - @skipIfUnsupportedMinOpsetVersion(9) # https://github.com/microsoft/onnxruntime/issues/9663 def test_instancenorm_eval_mode_train_layer(self): class MyModule(torch.nn.Module): def __init__(self): @@ -8388,9 +10127,21 @@ def forward(self, x): x = torch.randn(10, 8, 128, 128) model_export = MyModule() - self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.EVAL, rtol=1e-3, atol=1e-5) + self.run_test( + model_export, + (x,), + training=torch.onnx.TrainingMode.EVAL, + rtol=1e-3, + atol=1e-5, + ) model_export.eval() - self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.PRESERVE, rtol=1e-3, atol=1e-5) + self.run_test( + model_export, + (x,), + training=torch.onnx.TrainingMode.PRESERVE, + rtol=1e-3, + atol=1e-5, + ) @skipIfUnsupportedMinOpsetVersion(12) def test_dropout_training(self): @@ -8407,16 +10158,24 @@ def forward(self, x): x = torch.randn(10) model.train() - ort_sess = convert_to_onnx(model, input=(x,), opset_version=self.opset_version, - training=torch.onnx.TrainingMode.TRAINING) - ort_outs = run_ort(ort_sess, input=(x,)) + ort_sess = convert_to_onnx( + model, + input=(x,), + opset_version=self.opset_version, + training=torch.onnx.TrainingMode.TRAINING, + ) + ort_outs = run_ort(ort_sess, (x,)) assert not torch.all(torch.eq(x, torch.from_numpy(ort_outs[0]))) script_model = torch.jit.script(model) output = model(x) - ort_sess = convert_to_onnx(script_model, input=(x,), opset_version=self.opset_version, - training=torch.onnx.TrainingMode.TRAINING) - ort_outs = run_ort(ort_sess, input=(x,)) + ort_sess = convert_to_onnx( + script_model, + input=(x,), + opset_version=self.opset_version, + training=torch.onnx.TrainingMode.TRAINING, + ) + ort_outs = run_ort(ort_sess, (x,)) assert not 
torch.all(torch.eq(x, torch.from_numpy(ort_outs[0]))) @skipIfUnsupportedMinOpsetVersion(12) @@ -8440,9 +10199,13 @@ def forward(self, x): nb_elements = torch.numel(input) model.train() - ort_sess = convert_to_onnx(model, input=(x,), opset_version=self.opset_version, - training=torch.onnx.TrainingMode.TRAINING) - ort_outs = run_ort(ort_sess, input=(x,)) + ort_sess = convert_to_onnx( + model, + input=(x,), + opset_version=self.opset_version, + training=torch.onnx.TrainingMode.TRAINING, + ) + ort_outs = run_ort(ort_sess, (x,)) y = model(input) output = y.cpu().numpy() @@ -8457,9 +10220,13 @@ def forward(self, x): script_model = torch.jit.script(model) y = model(input) output = y.cpu().numpy() - ort_sess = convert_to_onnx(script_model, input=(x,), opset_version=self.opset_version, - training=torch.onnx.TrainingMode.TRAINING) - ort_outs = run_ort(ort_sess, input=(x,)) + ort_sess = convert_to_onnx( + script_model, + input=(x,), + opset_version=self.opset_version, + training=torch.onnx.TrainingMode.TRAINING, + ) + ort_outs = run_ort(ort_sess, (x,)) ort_mask = np.where(ort_outs[0] != 0, 1, 0) pyt_mask = np.where(output != 0, 1, 0) @@ -8468,12 +10235,13 @@ def forward(self, x): np.testing.assert_allclose(ratio_pytorch, ratio_ort, rtol=0.01, atol=0.01) - @skipIfUnsupportedMinOpsetVersion(9) # https://github.com/microsoft/onnxruntime/issues/9663 def test_conv_bn(self): class MyModule(torch.nn.Module): def __init__(self): super(MyModule, self).__init__() - self.conv = torch.nn.Conv2d(3, 16, kernel_size=1, stride=2, padding=3, bias=True) + self.conv = torch.nn.Conv2d( + 3, 16, kernel_size=1, stride=2, padding=3, bias=True + ) self.bn = torch.nn.BatchNorm2d(16, affine=True) def forward(self, x): @@ -8484,16 +10252,27 @@ def forward(self, x): model_export = MyModule() x = torch.randn(10, 3, 128, 128) self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.EVAL) - self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.TRAINING, rtol=1e-3, atol=1e-5) + self.run_test( + model_export, + (x,), + training=torch.onnx.TrainingMode.TRAINING, + rtol=1e-3, + atol=1e-5, + ) - @skipIfUnsupportedMinOpsetVersion(9) # https://github.com/microsoft/onnxruntime/issues/9663 def test_multiple_conv_bn(self): class MyModule(torch.nn.Module): def __init__(self): super(MyModule, self).__init__() - self.conv1 = torch.nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) - self.conv2 = torch.nn.Conv2d(64, 2, kernel_size=1, stride=1, padding=0, bias=False) - self.conv3 = torch.nn.Conv2d(2, 2, kernel_size=3, stride=1, padding=1, bias=False) + self.conv1 = torch.nn.Conv2d( + 3, 64, kernel_size=7, stride=2, padding=3, bias=False + ) + self.conv2 = torch.nn.Conv2d( + 64, 2, kernel_size=1, stride=1, padding=0, bias=False + ) + self.conv3 = torch.nn.Conv2d( + 2, 2, kernel_size=3, stride=1, padding=1, bias=False + ) self.bn = torch.nn.BatchNorm2d(64) self.bn2 = torch.nn.BatchNorm2d(2) self.relu = torch.nn.ReLU(inplace=True) @@ -8514,19 +10293,26 @@ def forward(self, x): model_export = MyModule() x = torch.randn(2, 3, 224, 224) - self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.TRAINING, rtol=1e-3, atol=1e-5) + self.run_test( + model_export, + (x,), + training=torch.onnx.TrainingMode.TRAINING, + rtol=1e-3, + atol=1e-5, + ) self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.EVAL) def test_script_custom_class_error(self): class BoxCoder(object): - def __init__(self, bbox_xform_clip: float): - # type: (float) -> None + def __init__(self, bbox_xform_clip: float) -> None: 
self.bbox_xform_clip = bbox_xform_clip - def decode(self, rel_codes, boxes): - # type: (Tensor, List[Tensor]) -> Tensor + def decode(self, rel_codes: Tensor, boxes: List[Tensor]) -> Tensor: boxes = torch.cat(boxes, dim=0) - pred_ctr_x = torch.clamp(rel_codes[:, 0::4], max=self.bbox_xform_clip) * boxes[:, 2] + pred_ctr_x = ( + torch.clamp(rel_codes[:, 0::4], max=self.bbox_xform_clip) + * boxes[:, 2] + ) return pred_ctr_x class MyModule(torch.nn.Module): @@ -8538,7 +10324,7 @@ def __init__(self): super(MyModule, self).__init__() self.box_coder = BoxCoder(1.4) - def forward(self, box_regression: torch.Tensor, proposals: List[torch.Tensor]): + def forward(self, box_regression: Tensor, proposals: List[Tensor]): return self.box_coder.decode(box_regression, proposals) model = torch.jit.script(MyModule()) @@ -8572,20 +10358,28 @@ def forward(self, x): loaded_model = onnx.load_from_string(f.getvalue()) actual_list = [p.name for p in loaded_model.graph.initializer] - assert actual_list == state_dict_list, \ - "Initializers' sequence is not as same as state_dict(). Expected: (" \ - + ", ".join(state_dict_list) + "). Actual:(" + ", ".join(actual_list) + ")." - assert actual_list == named_params_list, \ - "Initializers' sequence is not as same as named_parameters(). Expected: (" \ - + ", ".join(named_params_list) + "). Actual:(" + ", ".join(actual_list) + ")." + assert actual_list == state_dict_list, ( + "Initializers' sequence is not as same as state_dict(). Expected: (" + + ", ".join(state_dict_list) + + "). Actual:(" + + ", ".join(actual_list) + + ")." + ) + assert actual_list == named_params_list, ( + "Initializers' sequence is not as same as named_parameters(). Expected: (" + + ", ".join(named_params_list) + + "). Actual:(" + + ", ".join(actual_list) + + ")." + ) def test_initializer_sequence_script_model(self): def list_is_expected(short_list, long_list) -> bool: - if (len(short_list) > len(long_list)): + if len(short_list) > len(long_list): return False for i in range(len(short_list)): - if (short_list[i] not in long_list[i]): + if short_list[i] not in long_list[i]: return False return True @@ -8621,12 +10415,20 @@ def forward(self, x, y): loaded_model = onnx.load_from_string(f.getvalue()) actual_list = [p.name for p in loaded_model.graph.initializer] - assert list_is_expected(state_dict_list, actual_list), \ - "ScriptModel - Initializers' sequence is not as same as state_dict(). Expected: (" \ - + ", ".join(state_dict_list) + "). Actual:(" + ", ".join(actual_list) + ")." - assert list_is_expected(named_params_list, actual_list), \ - "ScriptModel - Initializers' sequence is not as same as named_parameters(). Expected: (" \ - + ", ".join(named_params_list) + "). Actual:(" + ", ".join(actual_list) + ")." + assert list_is_expected(state_dict_list, actual_list), ( + "ScriptModel - Initializers' sequence is not as same as state_dict(). Expected: (" + + ", ".join(state_dict_list) + + "). Actual:(" + + ", ".join(actual_list) + + ")." + ) + assert list_is_expected(named_params_list, actual_list), ( + "ScriptModel - Initializers' sequence is not as same as named_parameters(). Expected: (" + + ", ".join(named_params_list) + + "). Actual:(" + + ", ".join(actual_list) + + ")." 
+ ) @skipIfUnsupportedMinOpsetVersion(11) def test_nms(self): @@ -8635,7 +10437,6 @@ def test_nms(self): boxes[:, 2:] += boxes[:, :2] scores = torch.randn(num_boxes) - class Module(torch.nn.Module): def forward(self, boxes, scores): return ops.nms(boxes, scores, 0.5) @@ -8657,6 +10458,7 @@ def forward(self, boxes, scores, idxs): self.run_test(Module(), (boxes, scores, idxs)) @skipIfUnsupportedMinOpsetVersion(11) + @skipScriptTest() def test_clip_boxes_to_image(self): boxes = torch.randn(5, 4) * 500 boxes[:, 2:] += boxes[:, :2] @@ -8669,23 +10471,28 @@ def forward(self, boxes, size): shape = (size.shape[0], size.shape[1]) return ops.boxes.clip_boxes_to_image(boxes, shape) - self.run_test(Module(), (boxes, size), - input_names=["boxes", "size"], - dynamic_axes={"size": [0, 1]}, - test_with_inputs=[(boxes, size), (boxes, size_2)]) + self.run_test( + Module(), + (boxes, size), + input_names=["boxes", "size"], + dynamic_axes={"size": [0, 1]}, + test_with_inputs=[(boxes, size), (boxes, size_2)], + ) + @skipIfUnsupportedMaxOpsetVersion(15) # TODO: Opset 16 RoiAlign result mismatch @skipIfUnsupportedMinOpsetVersion(11) def test_roi_align(self): x = torch.rand(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 0, 0, 4, 4]], dtype=torch.float32) - model = ops.RoIAlign((5, 5), 1., 2) + model = ops.RoIAlign((5, 5), 1.0, 2) self.run_test(model, (x, single_roi)) + @skipIfUnsupportedMaxOpsetVersion(15) # TODO: Opset 16 RoiAlign result mismatch @skipIfUnsupportedMinOpsetVersion(11) def test_roi_align_aligned(self): x = torch.rand(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 1.5, 1.5, 3, 3]], dtype=torch.float32) - model1 = ops.RoIAlign((5, 5), 1., 2, aligned=True) + model1 = ops.RoIAlign((5, 5), 1.0, 2, aligned=True) self.run_test(model1, (x, single_roi)) x = torch.rand(1, 1, 10, 10, dtype=torch.float32) @@ -8709,7 +10516,7 @@ def test_roi_pool(self): rois = torch.tensor([[0, 0, 0, 4, 4]], dtype=torch.float32) pool_h = 5 pool_w = 5 - model = ops.RoIPool((pool_h, pool_w), 2.) 
+ model = ops.RoIPool((pool_h, pool_w), 2.0) self.run_test(model, (x, rois)) @skipIfUnsupportedMinOpsetVersion(11) @@ -8724,25 +10531,30 @@ def forward(self, images): input = torch.rand(3, 10, 20) input_test = torch.rand(3, 100, 150) - self.run_test(TransformModule(), (input,), - input_names=["input1"], dynamic_axes={"input1": [0, 1, 2]}, - test_with_inputs=[(input,), (input_test,)]) + self.run_test( + TransformModule(), + (input,), + input_names=["input1"], + dynamic_axes={"input1": [0, 1, 2]}, + test_with_inputs=[(input,), (input_test,)], + ) @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() + @skipScriptTest() def test_transform_images(self): - class TransformModule(torch.nn.Module): def __init__(self): super(TransformModule, self).__init__() self.transform = _init_test_generalized_rcnn_transform() - def forward(self, images: List[torch.Tensor]): + def forward(self, images: List[Tensor]): return self.transform(images)[0].tensors input = torch.rand(3, 100, 200), torch.rand(3, 200, 200) input_test = torch.rand(3, 100, 200), torch.rand(3, 200, 200) - self.run_test(TransformModule(), (input,), test_with_inputs=[(input,), (input_test,)]) + self.run_test( + TransformModule(), (input,), test_with_inputs=[(input,), (input_test,)] + ) def get_features(self, images): s0, s1 = images.shape[-2:] @@ -8757,7 +10569,7 @@ def get_features(self, images): return features @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() + @skipScriptTest() def test_rpn(self): set_rng_seed(0) @@ -8766,8 +10578,10 @@ def __init__(self): super(RPNModule, self).__init__() self.rpn = _init_test_rpn() - def forward(self, images, features: Dict[str, torch.Tensor]): - images_m = ImageList(images, [(i.shape[-1], i.shape[-2]) for i in images]) + def forward(self, images, features: Dict[str, Tensor]): + images_m = ImageList( + images, [(i.shape[-1], i.shape[-2]) for i in images] + ) return self.rpn(images_m, features) images = torch.rand(2, 3, 150, 150) @@ -8778,16 +10592,25 @@ def forward(self, images, features: Dict[str, torch.Tensor]): model = RPNModule() model.eval() model(images, features) - self.run_test(model, (images, features), - input_names=["input1", "input2", "input3", "input4", "input5", "input6"], - dynamic_axes={"input1": [0, 1, 2, 3], "input2": [0, 1, 2, 3], - "input3": [0, 1, 2, 3], "input4": [0, 1, 2, 3], - "input5": [0, 1, 2, 3], "input6": [0, 1, 2, 3]}, - test_with_inputs=[(images, features), (images2, test_features)], - dict_check=False) - + self.run_test( + model, + (images, features), + input_names=["input1", "input2", "input3", "input4", "input5", "input6"], + dynamic_axes={ + "input1": [0, 1, 2, 3], + "input2": [0, 1, 2, 3], + "input3": [0, 1, 2, 3], + "input4": [0, 1, 2, 3], + "input5": [0, 1, 2, 3], + "input6": [0, 1, 2, 3], + }, + test_with_inputs=[(images, features), (images2, test_features)], + dict_check=False, + ) + + @skipIfUnsupportedMaxOpsetVersion(15) # TODO: Opset 16 RoiAlign result mismatch @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() + @skipScriptTest() def test_multi_scale_roi_align(self): class TransformModule(torch.nn.Module): def __init__(self): @@ -8795,8 +10618,7 @@ def __init__(self): self.model = ops.MultiScaleRoIAlign(["feat1", "feat2"], 3, 2) self.image_sizes = [(512, 512)] - def forward(self, input, boxes): - # type: (Dict[str, torch.Tensor], List[torch.Tensor]) -> torch.Tensor + def forward(self, input: Dict[str, Tensor], boxes: List[Tensor]) -> Tensor: return self.model(input, boxes, self.image_sizes) i = OrderedDict() @@ -8811,10 +10633,26 @@ 
def forward(self, input, boxes): boxes1 = torch.rand(6, 4) * 256 boxes1[:, 2:] += boxes1[:, :2] - self.run_test(TransformModule(), (i, [boxes],), test_with_inputs=[(i, [boxes],), (i1, [boxes1],)]) + self.run_test( + TransformModule(), + ( + i, + [boxes], + ), + test_with_inputs=[ + ( + i, + [boxes], + ), + ( + i1, + [boxes1], + ), + ], + ) @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() + @skipScriptTest() def test_roi_heads(self): class RoiHeadsModule(torch.nn.Module): def __init__(self): @@ -8823,15 +10661,21 @@ def __init__(self): self.rpn = _init_test_rpn() self.roi_heads = _init_test_roi_heads_faster_rcnn() - def forward(self, images, features: Dict[str, torch.Tensor]): - original_image_sizes = [(img.shape[-1], img.shape[-2]) for img in images] + def forward(self, images, features: Dict[str, Tensor]): + original_image_sizes = [ + (img.shape[-1], img.shape[-2]) for img in images + ] - images_m = ImageList(images, [(i.shape[-1], i.shape[-2]) for i in images]) + images_m = ImageList( + images, [(i.shape[-1], i.shape[-2]) for i in images] + ) proposals, _ = self.rpn(images_m, features) - detections, _ = self.roi_heads(features, proposals, images_m.image_sizes) - detections = self.transform.postprocess(detections, - images_m.image_sizes, - original_image_sizes) + detections, _ = self.roi_heads( + features, proposals, images_m.image_sizes + ) + detections = self.transform.postprocess( + detections, images_m.image_sizes, original_image_sizes + ) return detections images = torch.rand(2, 3, 100, 100) @@ -8843,12 +10687,21 @@ def forward(self, images, features: Dict[str, torch.Tensor]): model.eval() model(images, features) - self.run_test(model, (images, features), - input_names=["input1", "input2", "input3", "input4", "input5", "input6"], - dynamic_axes={"input1": [0, 1, 2, 3], "input2": [0, 1, 2, 3], "input3": [0, 1, 2, 3], - "input4": [0, 1, 2, 3], "input5": [0, 1, 2, 3], "input6": [0, 1, 2, 3]}, - test_with_inputs=[(images, features), (images2, test_features)], - dict_check=False) + self.run_test( + model, + (images, features), + input_names=["input1", "input2", "input3", "input4", "input5", "input6"], + dynamic_axes={ + "input1": [0, 1, 2, 3], + "input2": [0, 1, 2, 3], + "input3": [0, 1, 2, 3], + "input4": [0, 1, 2, 3], + "input5": [0, 1, 2, 3], + "input6": [0, 1, 2, 3], + }, + test_with_inputs=[(images, features), (images2, test_features)], + dict_check=False, + ) def test_set_(self): class M(torch.nn.Module): @@ -8861,9 +10714,14 @@ def forward(self, x, y): self.run_test(M(), (x, y), remained_onnx_input_idx=[1]) y2 = torch.randn(5, 2) - self.run_test(M(), (x, y), remained_onnx_input_idx=[1], input_names=['x', 'y'], - dynamic_axes={'x': [0, 1], 'y': [0, 1]}, - test_with_inputs=[(y, y2)]) + self.run_test( + M(), + (x, y), + remained_onnx_input_idx=[1], + input_names=["x", "y"], + dynamic_axes={"x": [0, 1], "y": [0, 1]}, + test_with_inputs=[(y, y2)], + ) @skipIfUnsupportedMinOpsetVersion(9) def test_set_attr_modules(self): @@ -8877,10 +10735,12 @@ def __init__(self, embedding_dim): @staticmethod def get_embedding(embedding_dim: int): emb = 4 / ((embedding_dim // 2) - 1) - emb = torch.exp(torch.arange((embedding_dim // 2), dtype=torch.float) * -emb) + emb = torch.exp( + torch.arange((embedding_dim // 2), dtype=torch.float) * -emb + ) return emb - def forward(self, input, incremental_state: Optional[torch.Tensor] = None): + def forward(self, input, incremental_state: Optional[Tensor] = None): bsz, seq_len = input.shape[0], input.shape[1] self.const = 3 if self.weights is None: @@ 
-8890,9 +10750,9 @@ def forward(self, input, incremental_state: Optional[torch.Tensor] = None): if incremental_state is not None: pos = seq_len return self.weights[1 + pos, :].expand(bsz, 1, -1) - return ( - self.weights.index_select(0, torch.ones((bsz * seq_len), dtype=torch.int64)).view(bsz, seq_len, -1) - ) + return self.weights.index_select( + 0, torch.ones((bsz * seq_len), dtype=torch.int64) + ).view(bsz, seq_len, -1) class InnerModule(torch.nn.Module): def __init__(self, embedding_dim): @@ -8903,7 +10763,9 @@ def __init__(self, embedding_dim): @staticmethod def get_embedding(embedding_dim: int): emb = 4 / ((embedding_dim // 2) - 1) - emb = torch.exp(torch.arange((embedding_dim // 2), dtype=torch.float) * -emb) + emb = torch.exp( + torch.arange((embedding_dim // 2), dtype=torch.float) * -emb + ) return emb def forward(self, x): @@ -8918,8 +10780,8 @@ def forward(self, x): return self.module(x) x = torch.randn(3, 256) - self.run_test(Module(), (x, ), input_names=["x"], dynamic_axes={"x": [0, 1]}) - self.run_test(Module(), (x, ), remained_onnx_input_idx=[]) + self.run_test(Module(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1]}) + self.run_test(Module(), (x,), remained_onnx_input_idx=[]) @skipIfUnsupportedMinOpsetVersion(9) def test_set_attr_modules_2(self): @@ -8934,15 +10796,19 @@ def __init__(self, embedding_dim): @staticmethod def get_embedding(embedding_dim: int): emb = 4 / ((embedding_dim // 2) - 1) - emb = torch.exp(torch.arange((embedding_dim // 2), dtype=torch.float) * -emb) + emb = torch.exp( + torch.arange((embedding_dim // 2), dtype=torch.float) * -emb + ) return emb - def forward(self, input, incremental_state: Optional[torch.Tensor] = None): + def forward(self, input, incremental_state: Optional[Tensor] = None): bsz, seq_len = input.shape[0], input.shape[1] self.const = 1.5 self.weights = InnerModule.get_embedding(self.embedding_dim) return ( - self.weights.index_select(0, torch.ones((bsz * seq_len), dtype=torch.int64)).view(bsz, seq_len, -1) + self.weights.index_select( + 0, torch.ones((bsz * seq_len), dtype=torch.int64) + ).view(bsz, seq_len, -1) ) * self.const class Module(torch.nn.Module): @@ -8954,8 +10820,8 @@ def forward(self, x): return self.module(x) x = torch.randn(3, 256) - self.run_test(Module(), (x, ), input_names=["x"], dynamic_axes={"x": [0, 1]}) - self.run_test(Module(), (x, ), remained_onnx_input_idx=[]) + self.run_test(Module(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1]}) + self.run_test(Module(), (x,), remained_onnx_input_idx=[]) def test_set_attr(self): class MyModule(torch.nn.Module): @@ -8996,7 +10862,7 @@ def set_cell_anchors(self, anchors): self.conv.weight = torch.randn(3, 10) self.conv.bias = self.conv.weight[:] - def forward(self, anchors) -> Optional[torch.Tensor]: + def forward(self, anchors) -> Optional[Tensor]: self.set_cell_anchors(anchors) return self.conv.bias @@ -9020,7 +10886,7 @@ def set_cell_anchors(self, anchors, boxes): self.conv.weight = anchors + self.conv.weight boxes[:] = torch.zeros(2, 3) - def forward(self, anchors) -> Tuple[torch.Tensor, torch.Tensor]: + def forward(self, anchors) -> Tuple[Tensor, Tensor]: boxes = torch.ones(2, 2, 3) self.set_cell_anchors(anchors, boxes) if self.conv.bias is not None: @@ -9048,7 +10914,7 @@ def set_cell_anchors(self, anchors): else: self.conv.bias = torch.ones(3, 10, 3) - def forward(self, feature_maps, anchors) -> Tuple[torch.Tensor, torch.Tensor]: + def forward(self, feature_maps, anchors) -> Tuple[Tensor, Tensor]: self.set_cell_anchors(anchors) result = [] if self.conv.bias 
is not None: @@ -9111,7 +10977,7 @@ def set_cell_anchors(self, anchors, boxes): self.conv.weight = anchors * i boxes[j] += torch.ones(3, 3) - def forward(self, anchors) -> Tuple[torch.Tensor, torch.Tensor]: + def forward(self, anchors) -> Tuple[Tensor, Tensor]: boxes = torch.ones(10, 3, 3) self.set_cell_anchors(anchors, boxes) if self.conv.bias is not None: @@ -9130,7 +10996,9 @@ def __init__(self): self.conv = torch.nn.Conv1d(10, 3, 3) self.conv.weight = torch.nn.Parameter(torch.zeros(3, 10)) self.conv.bias = torch.nn.Parameter(torch.zeros(3, 10, 3)) - self.boxes : List[torch.Tensor] = [torch.ones(1)] # Workaround placeholder for TorchScript + self.boxes: List[Tensor] = [ + torch.ones(1) + ] # Workaround placeholder for TorchScript def set_cell_anchors(self, anchors): self.conv.weight = torch.randn(3, 10) @@ -9140,7 +11008,7 @@ def set_cell_anchors(self, anchors): self.conv.weight = anchors * i self.boxes.append(torch.ones(3, 3)) - def forward(self, anchors) -> Tuple[torch.Tensor, List[torch.Tensor]]: + def forward(self, anchors) -> Tuple[Tensor, List[Tensor]]: self.boxes = [] self.set_cell_anchors(anchors) if self.conv.bias is not None: @@ -9154,8 +11022,9 @@ def forward(self, anchors) -> Tuple[torch.Tensor, List[torch.Tensor]]: @skipIfUnsupportedMinOpsetVersion(11) def test_index_put_if(self): @torch.jit.script - def check_init(input_data, hidden_size, prev_state): - # type: (torch.Tensor, int, torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor] + def check_init( + input_data: Tensor, hidden_size: int, prev_state: Tensor + ) -> Tuple[Tensor, Tensor]: batch_size = input_data.size(0) spatial_size_0 = input_data.size(2) spatial_size_1 = input_data.size(3) @@ -9164,11 +11033,23 @@ def check_init(input_data, hidden_size, prev_state): state = torch.zeros(state_size, device=input_data.device) state_copy = torch.zeros(state_size, device=input_data.device) if prev_state.size(0) == 0: - state[:] = torch.zeros(batch_size, hidden_size, spatial_size_0, spatial_size_1) + state[:] - state_copy[:] = torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) * 2 - state_copy[:] = torch.zeros(batch_size, hidden_size, spatial_size_0, spatial_size_1) * 2 + state[:] = ( + torch.zeros(batch_size, hidden_size, spatial_size_0, spatial_size_1) + + state[:] + ) + state_copy[:] = ( + torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) + * 2 + ) + state_copy[:] = ( + torch.zeros(batch_size, hidden_size, spatial_size_0, spatial_size_1) + * 2 + ) else: - state[:] = torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) * 4 + state[:] = ( + torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) + * 4 + ) return state, state_copy class Example(torch.nn.Module): @@ -9183,16 +11064,20 @@ def forward(self, input_data, prev_state): model = Example(10) random_data = torch.rand((1, 5, 30, 30)) empty_tensor = torch.tensor([], dtype=torch.float).view(0, 0, 0, 0, 0) - self.run_test(model, (random_data, empty_tensor), - input_names=["random_data", "empty_tensor"], - dynamic_axes={"random_data": [0, 1, 2, 3], "empty_tensor": [0, 1, 2, 3, 4]}) + self.run_test( + model, + (random_data, empty_tensor), + input_names=["random_data", "empty_tensor"], + dynamic_axes={"random_data": [0, 1, 2, 3], "empty_tensor": [0, 1, 2, 3, 4]}, + ) self.run_test(model, (random_data, empty_tensor), remained_onnx_input_idx=[]) @skipIfUnsupportedMinOpsetVersion(11) def test_index_put_if_2(self): @torch.jit.script - def check_init(input_data, hidden_size, prev_state): - # type: (torch.Tensor, int, 
torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor] + def check_init( + input_data: Tensor, hidden_size: int, prev_state: Tensor + ) -> Tuple[Tensor, Tensor]: batch_size = input_data.size(0) spatial_size_0 = input_data.size(2) spatial_size_1 = input_data.size(3) @@ -9202,13 +11087,26 @@ def check_init(input_data, hidden_size, prev_state): state_copy = torch.zeros(state_size, device=input_data.device) if prev_state.size(0) == 0: for i in range(2): - state[:] = torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) * i - state_copy[:] = torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) * i + state[:] = ( + torch.ones( + batch_size, hidden_size, spatial_size_0, spatial_size_1 + ) + * i + ) + state_copy[:] = ( + torch.ones( + batch_size, hidden_size, spatial_size_0, spatial_size_1 + ) + * i + ) elif prev_state.size(0) == 1: s = state[:] state[:] = prev_state + s elif prev_state.size(0) == 2: - state[:] = torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) * 4 + state[:] = ( + torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) + * 4 + ) return state, state_copy class Example(torch.nn.Module): @@ -9224,22 +11122,29 @@ def forward(self, input_data, prev_state): random_data = torch.rand((1, 5, 30, 30)) empty_tensor = torch.tensor([], dtype=torch.float).view(0, 0, 0, 0, 0) random_state = torch.rand((1, 1, 10, 30, 30)) - self.run_test(model, (random_data, empty_tensor), - input_names=["data", "state"], - dynamic_axes={"data": [0, 1, 2], "state": [0, 1, 2, 3, 4]}, - test_with_inputs=[(random_data, random_state)]) - self.run_test(model, (random_data, empty_tensor), - input_names=["data", "state"], - dynamic_axes={"state": [0, 1, 2, 3, 4]}, - test_with_inputs=[(random_data, random_state)], - remained_onnx_input_idx=[1]) + self.run_test( + model, + (random_data, empty_tensor), + input_names=["data", "state"], + dynamic_axes={"data": [0, 1, 2], "state": [0, 1, 2, 3, 4]}, + test_with_inputs=[(random_data, random_state)], + ) + self.run_test( + model, + (random_data, empty_tensor), + input_names=["data", "state"], + dynamic_axes={"state": [0, 1, 2, 3, 4]}, + test_with_inputs=[(random_data, random_state)], + remained_onnx_input_idx=[1], + ) self.run_test(model, (random_data, empty_tensor), remained_onnx_input_idx=[]) @skipIfUnsupportedMinOpsetVersion(11) def test_index_put_if_3(self): @torch.jit.script - def check_init(input_data, hidden_size, prev_state): - # type: (torch.Tensor, int, torch.Tensor) -> torch.Tensor + def check_init( + input_data: Tensor, hidden_size: int, prev_state: Tensor + ) -> Tensor: batch_size = input_data.size(0) spatial_size_0 = input_data.size(2) spatial_size_1 = input_data.size(3) @@ -9249,7 +11154,12 @@ def check_init(input_data, hidden_size, prev_state): if prev_state.size(0) < 2: state = state * 3 if prev_state.size(0) == 0: - state[:] = torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) * 3 + state[:] = ( + torch.ones( + batch_size, hidden_size, spatial_size_0, spatial_size_1 + ) + * 3 + ) else: state = state + 2 @@ -9267,16 +11177,20 @@ def forward(self, input_data, prev_state): model = Example(4) random_data = torch.rand((1, 5, 4, 4)) empty_tensor = torch.tensor([], dtype=torch.float).view(0, 0, 0, 0, 0) - self.run_test(model, (random_data, empty_tensor), - input_names=["random_data", "empty_tensor"], - dynamic_axes={"random_data": [0, 1, 2, 3], "empty_tensor": [0, 1, 2, 3, 4]}) + self.run_test( + model, + (random_data, empty_tensor), + input_names=["random_data", "empty_tensor"], + 
dynamic_axes={"random_data": [0, 1, 2, 3], "empty_tensor": [0, 1, 2, 3, 4]}, + ) self.run_test(model, (random_data, empty_tensor), remained_onnx_input_idx=[]) @skipIfUnsupportedMinOpsetVersion(11) def test_index_put_if_4(self): @torch.jit.script - def check_init(input_data, hidden_size, prev_state): - # type: (torch.Tensor, int, torch.Tensor) -> torch.Tensor + def check_init( + input_data: Tensor, hidden_size: int, prev_state: Tensor + ) -> Tensor: batch_size = input_data.size(0) spatial_size_0 = input_data.size(2) spatial_size_1 = input_data.size(3) @@ -9285,9 +11199,15 @@ def check_init(input_data, hidden_size, prev_state): state = torch.zeros(state_size, device=input_data.device) if prev_state.size(0) == 0: state = state + 3 - state[:] = torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) * 3 + state[:] = ( + torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) + * 3 + ) state = state + 3 - state[:] = torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) * 4 + state[:] = ( + torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) + * 4 + ) else: state = state + 2 return state @@ -9304,17 +11224,20 @@ def forward(self, input_data, prev_state): model = Example(4) random_data = torch.rand((1, 5, 4, 4)) empty_tensor = torch.tensor([], dtype=torch.float).view(0, 0, 0, 0, 0) - self.run_test(model, (random_data, empty_tensor), - input_names=["random_data", "empty_tensor"], - dynamic_axes={"random_data": [0, 1, 2, 3], "empty_tensor": [0, 1, 2, 3, 4]}) + self.run_test( + model, + (random_data, empty_tensor), + input_names=["random_data", "empty_tensor"], + dynamic_axes={"random_data": [0, 1, 2, 3], "empty_tensor": [0, 1, 2, 3, 4]}, + ) self.run_test(model, (random_data, empty_tensor), remained_onnx_input_idx=[]) - @skipIfUnsupportedMinOpsetVersion(11) def test_index_put_if_5(self): @torch.jit.script - def check_init(input_data, hidden_size, prev_state): - # type: (torch.Tensor, int, torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor] + def check_init( + input_data: Tensor, hidden_size: int, prev_state: Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: batch_size = input_data.size(0) spatial_size_0 = input_data.size(2) spatial_size_1 = input_data.size(3) @@ -9323,9 +11246,15 @@ def check_init(input_data, hidden_size, prev_state): state = torch.zeros(state_size, device=input_data.device) state_ref = state if prev_state.size(0) == 0: - state[:] = torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) * 3 + state[:] = ( + torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) + * 3 + ) state = state + 3 - state[:] = torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) * 4 + state[:] = ( + torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) + * 4 + ) else: state = state + 2 return state, state_ref @@ -9336,15 +11265,20 @@ def __init__(self, hidden_size): self.hidden_size = hidden_size def forward(self, input_data, prev_state): - prev_state, state_ref = check_init(input_data, self.hidden_size, prev_state) + prev_state, state_ref = check_init( + input_data, self.hidden_size, prev_state + ) return prev_state, state_ref model = Example(4) random_data = torch.rand((1, 5, 4, 4)) empty_tensor = torch.tensor([], dtype=torch.float).view(0, 0, 0, 0, 0) - self.run_test(model, (random_data, empty_tensor), - input_names=["random_data", "empty_tensor"], - dynamic_axes={"random_data": [0, 1, 2, 3], "empty_tensor": [0, 1, 2, 3, 4]}) + self.run_test( + model, + (random_data, empty_tensor), + 
input_names=["random_data", "empty_tensor"], + dynamic_axes={"random_data": [0, 1, 2, 3], "empty_tensor": [0, 1, 2, 3, 4]}, + ) self.run_test(model, (random_data, empty_tensor), remained_onnx_input_idx=[]) @skipIfUnsupportedMinOpsetVersion(11) @@ -9396,7 +11330,6 @@ def forward(self, x, y): y = torch.randn(4, 5) self.run_test(model, (x, y)) - @skipIfUnsupportedMinOpsetVersion(13) def test_list_del_in_block(self): class ListModel(torch.nn.Module): @@ -9436,8 +11369,7 @@ def forward(self, x, y): @skipIfUnsupportedMinOpsetVersion(11) def test_index_put_inplace_ops(self): @torch.jit.script - def check_init(input_data, hidden_size): - # type: (torch.Tensor, int) -> torch.Tensor + def check_init(input_data: Tensor, hidden_size: int) -> Tensor: batch_size = input_data.size(0) spatial_size_0 = input_data.size(2) spatial_size_1 = input_data.size(3) @@ -9445,11 +11377,22 @@ def check_init(input_data, hidden_size): state_size = (2, batch_size, hidden_size, spatial_size_0, spatial_size_1) state = torch.zeros(state_size, device=input_data.device) if input_data.size(0) == 1: - state[1] += torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) * 2 - state[1] /= torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) * 3 + state[1] += ( + torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) + * 2 + ) + state[1] /= ( + torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) + * 3 + ) for i in range(input_data.size(0)): - state[1] += torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) - state[1] /= torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) * i + state[1] += torch.ones( + batch_size, hidden_size, spatial_size_0, spatial_size_1 + ) + state[1] /= ( + torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) + * i + ) return state class Example(torch.nn.Module): @@ -9463,8 +11406,12 @@ def forward(self, input_data): model = Example(10) random_data = torch.rand((1, 5, 30, 30)) - self.run_test(model, (random_data), input_names=["random_data"], - dynamic_axes={"random_data": [0, 1, 2, 3]}) + self.run_test( + model, + (random_data), + input_names=["random_data"], + dynamic_axes={"random_data": [0, 1, 2, 3]}, + ) self.run_test(model, (random_data), remained_onnx_input_idx=[]) @skipIfUnsupportedMinOpsetVersion(11) @@ -9472,9 +11419,9 @@ def test_input_mask_model(self): class InputMaskModel(torch.nn.Module): def __init__(self, output_size): super(InputMaskModel, self).__init__() - self.bias = torch.nn.Parameter(torch.empty( - output_size, - dtype=torch.float)) + self.bias = torch.nn.Parameter( + torch.empty(output_size, dtype=torch.float) + ) with torch.no_grad(): self.bias.zero_() @@ -9487,8 +11434,15 @@ def forward(self, model_input, y): output_size = 4 m = InputMaskModel(output_size) x = torch.tensor([0, 4, 24, 25], dtype=torch.int64) - y = torch.tensor([[0.1, 0.2, 0.3, 0.4], [0.1, 0.2, 0.3, 0.4], - [0.1, 0.2, 0.3, 0.4], [0.1, 0.2, 0.3, 0.4]], dtype=torch.float) + y = torch.tensor( + [ + [0.1, 0.2, 0.3, 0.4], + [0.1, 0.2, 0.3, 0.4], + [0.1, 0.2, 0.3, 0.4], + [0.1, 0.2, 0.3, 0.4], + ], + dtype=torch.float, + ) self.run_test(m, (x, y)) class InputMaskModel(torch.nn.Module): @@ -9505,11 +11459,18 @@ def forward(self, model_input_1, model_input_2, y): m = InputMaskModel(output_size) x1 = torch.tensor([0, 4, 24, 25], dtype=torch.int64) x2 = torch.tensor([0, 3, 12, 15], dtype=torch.int64) - y = torch.tensor([[0.1, 0.2, 0.3, 0.4], [0.1, 0.2, 0.3, 0.4], - [0.1, 0.2, 0.3, 0.4], [0.1, 0.2, 0.3, 0.4]], dtype=torch.float) + 
y = torch.tensor( + [ + [0.1, 0.2, 0.3, 0.4], + [0.1, 0.2, 0.3, 0.4], + [0.1, 0.2, 0.3, 0.4], + [0.1, 0.2, 0.3, 0.4], + ], + dtype=torch.float, + ) self.run_test(m, (x1, x2, y)) - @disableScriptTest() + @skipScriptTest() def test_unsafe_chunk(self): class ChunkModel(torch.nn.Module): def forward(self, x): @@ -9534,8 +11495,12 @@ def forward(self, x, y): model.eval() x = torch.ones(2, 3, 4, 5) y = torch.ones(3, 4, 5, 2) - self.run_test(model, (x, y), input_names=["x", "y"], - dynamic_axes={"x": [0, 1, 2, 3], "y": [0, 1, 2, 3]}) + self.run_test( + model, + (x, y), + input_names=["x", "y"], + dynamic_axes={"x": [0, 1, 2, 3], "y": [0, 1, 2, 3]}, + ) self.run_test(model, (x, y), remained_onnx_input_idx=[1]) class ViewModel(torch.nn.Module): @@ -9544,7 +11509,7 @@ def forward(self, x): model = ViewModel() model.eval() - x = torch.tensor(2.) + x = torch.tensor(2.0) self.run_test(model, (x,)) # test prim::ListConstruct for Reshape input 1 @@ -9585,8 +11550,13 @@ def forward(self, signal): y = torch.randint(5, (M, C + 1, K + 1, N + 1)) self.run_test(model, x, input_names=["x"], dynamic_axes={"x": [0, 1, 2, 3]}) self.run_test(model, x, remained_onnx_input_idx=[]) - self.run_test(model, x, input_names=["x"], - dynamic_axes={"x" : [0, 1, 2, 3]}, test_with_inputs=[(x,), (y,)]) + self.run_test( + model, + x, + input_names=["x"], + dynamic_axes={"x": [0, 1, 2, 3]}, + test_with_inputs=[(x,), (y,)], + ) @skipIfUnsupportedMinOpsetVersion(11) def test_symbolic_shape_inference_box(self): @@ -9604,8 +11574,13 @@ def forward(self, boxes): x = torch.ones(2, 4) y = torch.ones(3, 5) self.run_test(model, x) - self.run_test(model, x, input_names=["x"], - dynamic_axes={"x" : [0, 1]}, test_with_inputs=[(x,), (y,)]) + self.run_test( + model, + x, + input_names=["x"], + dynamic_axes={"x": [0, 1]}, + test_with_inputs=[(x,), (y,)], + ) @skipIfUnsupportedMinOpsetVersion(11) def test_symbolic_shape_inference_box_if(self): @@ -9632,41 +11607,61 @@ def test_symbolic_shape_inference_arange_2(self): class ArangeModel(torch.nn.Module): def forward(self, start): return torch.arange(start.size(0), 8.5, 1.5, dtype=torch.int64) + x = torch.randn(2, 3, 4) - self.run_test(ArangeModel(), (x,), input_names=['x'], dynamic_axes={"x": [0, 1, 2]}) + self.run_test( + ArangeModel(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1, 2]} + ) self.run_test(ArangeModel(), (x,), remained_onnx_input_idx=[]) class ArangeModel2(torch.nn.Module): def forward(self, start): return torch.arange(start.size(0), 8.5, 1.5, dtype=torch.double) + x = torch.randn(2, 3, 4) - self.run_test(ArangeModel2(), (x,), input_names=['x'], dynamic_axes={"x": [0, 1, 2]}) + self.run_test( + ArangeModel2(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1, 2]} + ) self.run_test(ArangeModel2(), (x,), remained_onnx_input_idx=[]) @skipIfUnsupportedMinOpsetVersion(9) def test_symbolic_shape_inference_nonzero(self): class OneLikeModel(torch.nn.Module): def forward(self, x): - ones = torch.ones_like(x, dtype=torch.float, layout=torch.strided, device=torch.device("cpu")) + ones = torch.ones_like( + x, + dtype=torch.float, + layout=torch.strided, + device=torch.device("cpu"), + ) return torch.nonzero(ones) x = torch.randn(2) - self.run_test(OneLikeModel(), x, input_names=['x'], dynamic_axes={"x": [0]}) + self.run_test(OneLikeModel(), x, input_names=["x"], dynamic_axes={"x": [0]}) self.run_test(OneLikeModel(), x, remained_onnx_input_idx=[]) x = torch.randn(2, 3, 4) - self.run_test(OneLikeModel(), x, input_names=['x'], dynamic_axes={"x": [0, 1, 2]}) + self.run_test( + OneLikeModel(), 
x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]} + ) self.run_test(OneLikeModel(), x, remained_onnx_input_idx=[]) class ZeroLikeModel(torch.nn.Module): def forward(self, x): - zeros = torch.zeros_like(x, dtype=torch.float, layout=torch.strided, device=torch.device("cpu")) + zeros = torch.zeros_like( + x, + dtype=torch.float, + layout=torch.strided, + device=torch.device("cpu"), + ) return torch.nonzero(zeros) x = torch.randn(2) - self.run_test(ZeroLikeModel(), x, input_names=['x'], dynamic_axes={"x": [0]}) + self.run_test(ZeroLikeModel(), x, input_names=["x"], dynamic_axes={"x": [0]}) self.run_test(ZeroLikeModel(), x, remained_onnx_input_idx=[]) x = torch.randn(2, 3, 4) - self.run_test(ZeroLikeModel(), x, input_names=['x'], dynamic_axes={"x": [0, 1, 2]}) + self.run_test( + ZeroLikeModel(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]} + ) self.run_test(ZeroLikeModel(), x, remained_onnx_input_idx=[]) @skipIfUnsupportedMinOpsetVersion(9) @@ -9674,25 +11669,30 @@ def test_symbolic_shape_inference_expand_1(self): class ExpandModel(torch.nn.Module): def forward(self, x): return x.expand(4, 6, 2) + x = torch.randn(6, 1, requires_grad=True) self.run_test(ExpandModel(), (x,)) @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() # Test code not scriptable + @skipScriptTest() # Test code not scriptable def test_symbolic_shape_inference_expand_2(self): class M(torch.nn.Module): def forward(self, x): input_shape = x.size() batch_size, seq_length = input_shape seq_ids = torch.arange(seq_length) - causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None] + causal_mask = ( + seq_ids[None, None, :].repeat(batch_size, seq_length, 1) + <= seq_ids[None, :, None] + ) return causal_mask.transpose(0, 1) + x = torch.randn(3, 16) self.run_test(M(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1]}) self.run_test(M(), (x,), remained_onnx_input_idx=[]) @skipIfUnsupportedMinOpsetVersion(10) - @disableScriptTest() # Test code not scriptable + @skipScriptTest() # Test code not scriptable def test_symbolic_shape_inference_slice(self): class M(torch.nn.Module): def forward(self, x, position_bias): @@ -9700,10 +11700,15 @@ def forward(self, x, position_bias): batch_size, seq_length = input_shape position_bias = position_bias[:, :, -seq_length:, :] return position_bias.transpose(0, 1) + x = torch.randn(3, 16) position_bias = torch.randn(1, 3, 20, 8) - self.run_test(M(), (x, position_bias), input_names=["x", "position_bias"], - dynamic_axes={"x": [0, 1], "position_bias": [0, 1, 2, 3]}) + self.run_test( + M(), + (x, position_bias), + input_names=["x", "position_bias"], + dynamic_axes={"x": [0, 1], "position_bias": [0, 1, 2, 3]}, + ) self.run_test(M(), (x, position_bias), remained_onnx_input_idx=[1]) def test_symbolic_shape_inference_slice_2(self): @@ -9711,24 +11716,37 @@ class M(torch.nn.Module): def forward(self, position_bias): position_bias = position_bias[:, :, -2:, :] return position_bias.transpose(0, 1) + position_bias = torch.randn(1, 3, 20, 8) self.run_test(M(), (position_bias,)) @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() + @skipScriptTest() def test_symbolic_shape_inference_time(self): input = torch.randn(RNN_SEQUENCE_LENGTH, BATCH_SIZE, RNN_INPUT_SIZE) h0 = torch.randn(1, BATCH_SIZE, RNN_HIDDEN_SIZE) c0 = torch.randn(1, BATCH_SIZE, RNN_HIDDEN_SIZE) - model_lstm = torch.nn.LSTM(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 1, bidirectional=False) - self.run_test(model_lstm, (input, (h0, c0)), input_names=["x", "y"], - dynamic_axes={"x" : [0, 1]}) - 
model_gru = torch.nn.GRU(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 1, bidirectional=False, bias=False) - self.run_test(model_gru, (input, h0), input_names=["x", "y"], - dynamic_axes={"x" : [0, 1]}) - model_rnn = torch.nn.RNN(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 1, bidirectional=False, bias=False) - self.run_test(model_rnn, (input, h0), input_names=["x", "y"], - dynamic_axes={"x" : [0, 1]}) + model_lstm = torch.nn.LSTM( + RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 1, bidirectional=False + ) + self.run_test( + model_lstm, + (input, (h0, c0)), + input_names=["x", "y"], + dynamic_axes={"x": [0, 1]}, + ) + model_gru = torch.nn.GRU( + RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 1, bidirectional=False, bias=False + ) + self.run_test( + model_gru, (input, h0), input_names=["x", "y"], dynamic_axes={"x": [0, 1]} + ) + model_rnn = torch.nn.RNN( + RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 1, bidirectional=False, bias=False + ) + self.run_test( + model_rnn, (input, h0), input_names=["x", "y"], dynamic_axes={"x": [0, 1]} + ) def test_symbolic_shape_inference_dynamic_axes(self): class M(torch.nn.Module): @@ -9736,9 +11754,14 @@ def forward(self, input_ids): input_shape = input_ids.size() input_ids = input_ids.view(-1, input_shape[-1]) return input_ids.transpose(0, 1) + x = torch.randn(3, 16) - self.run_test(M(), (x,), input_names=["input_ids"], - dynamic_axes={"input_ids": {0: "batch", 1: "sequence"}}) + self.run_test( + M(), + (x,), + input_names=["input_ids"], + dynamic_axes={"input_ids": {0: "batch", 1: "sequence"}}, + ) @skipIfUnsupportedMinOpsetVersion(9) def test_hann_window_periodic(self): @@ -9749,7 +11772,12 @@ def __init__(self): def forward(self, x, window_length: int): self.window_length = window_length - return torch.add(x, torch.hann_window(self.window_length, periodic=True, dtype=torch.float)) + return torch.add( + x, + torch.hann_window( + self.window_length, periodic=True, dtype=torch.float + ), + ) win_length = 100 x = torch.randn(win_length) @@ -9766,7 +11794,12 @@ def __init__(self): def forward(self, x, window_length: int): self.window_length = window_length - return torch.add(x, torch.hann_window(self.window_length, periodic=False, dtype=torch.float)) + return torch.add( + x, + torch.hann_window( + self.window_length, periodic=False, dtype=torch.float + ), + ) win_length = 100 x = torch.randn(win_length) @@ -9775,7 +11808,7 @@ def forward(self, x, window_length: int): self.run_test(module, (x, win_length)) @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() + @skipScriptTest() def test_hann_window_default_values(self): class HannWindowModule(torch.nn.Module): def __init__(self): @@ -9784,6 +11817,7 @@ def __init__(self): def forward(self, x, window_length: int): import torch.nn.functional as F + self.window_length = window_length return torch.add(x, F.relu(torch.hann_window(self.window_length))) @@ -9795,7 +11829,7 @@ def forward(self, x, window_length: int): self.run_test(module, (x, win_length)) @skipIfUnsupportedMinOpsetVersion(12) - @disableScriptTest() + @skipScriptTest() def test_tensordot_dim_count(self): class M(torch.nn.Module): def forward(self, x, y): @@ -9820,7 +11854,7 @@ def forward(self, x, y): self.run_test(M(), (x, y)) @skipIfUnsupportedMinOpsetVersion(12) - @disableScriptTest() + @skipScriptTest() def test_tensordot_dynamic_dim(self): class M(torch.nn.Module): def forward(self, x, y): @@ -9833,9 +11867,13 @@ def forward(self, x, y): new_x = torch.randint(6, (8, 6, 2, 5)) new_y = torch.randint(6, (2, 5, 3, 4)) - self.run_test(M(), (x, y), test_with_inputs=[(new_x, new_y)], - input_names=["input_x", 
"input_y"], - dynamic_axes={"input_x": [0, 1, 2, 3], "input_y": [0, 1, 2, 3]}) + self.run_test( + M(), + (x, y), + test_with_inputs=[(new_x, new_y)], + input_names=["input_x", "input_y"], + dynamic_axes={"input_x": [0, 1, 2, 3], "input_y": [0, 1, 2, 3]}, + ) @skipIfUnsupportedMinOpsetVersion(9) def test_to_device(self): @@ -9854,7 +11892,7 @@ def forward(self, x, y): self.run_test(M_ToDeviceDtype(), (x, y)) @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() + @skipScriptTest() def test_fill(self): class FillModule(torch.nn.Module): def forward(self, x, filled_value: int): @@ -9891,11 +11929,15 @@ def forward(self, x): index = torch.tensor([0, 2, 3, 1, 4]) self.run_test(M(0, index, updates), (x,)) - updates = torch.tensor([[[1, 5, 7], [2, 4, 5], [5, 5, 6], [2, 3, 4]]], dtype=torch.float) + updates = torch.tensor( + [[[1, 5, 7], [2, 4, 5], [5, 5, 6], [2, 3, 4]]], dtype=torch.float + ) index = torch.tensor([0, 2, 3, 1]) self.run_test(M(1, index, updates), (x,)) - updates = torch.tensor([[[1, 2, 3], [4, 5, 6], [7, 8, 9], [2, 3, 4]]], dtype=torch.float) + updates = torch.tensor( + [[[1, 2, 3], [4, 5, 6], [7, 8, 9], [2, 3, 4]]], dtype=torch.float + ) index = torch.tensor([0, 2, 1]) self.run_test(M(2, index, updates), (x,)) @@ -9933,9 +11975,11 @@ def forward(self, x): return x x = torch.ones(5, 4, 3) - updates = torch.tensor([[[1, 5, 7], [2, 4, 5], [5, 5, 6], [2, 3, 4]]], dtype=torch.float) + updates = torch.tensor( + [[[1, 5, 7], [2, 4, 5], [5, 5, 6], [2, 3, 4]]], dtype=torch.float + ) index = torch.tensor([0, 2, 3, 1]) - loop_count = torch.randint(20, (1, ))[0].item() + loop_count = torch.randint(20, (1,))[0].item() self.run_test(M(1, index, updates, loop_count), (x,)) @skipIfUnsupportedMinOpsetVersion(9) @@ -9956,11 +12000,15 @@ def forward(self, x, cond): return x x = torch.ones(5, 4, 3) - updates = torch.tensor([[[1, 5, 7], [2, 4, 5], [5, 5, 6], [2, 3, 4]]], dtype=torch.float) + updates = torch.tensor( + [[[1, 5, 7], [2, 4, 5], [5, 5, 6], [2, 3, 4]]], dtype=torch.float + ) index_true = torch.tensor([0, 2, 3, 1]) index_false = torch.tensor([1, 0, 2, 3]) cond = torch.tensor(1, dtype=torch.bool) - self.run_test(torch.jit.script(M(1, updates, index_true, index_false)), (x, cond)) + self.run_test( + torch.jit.script(M(1, updates, index_true, index_false)), (x, cond) + ) @skipIfUnsupportedMinOpsetVersion(9) def test_index_add_dynamic_axes(self): @@ -9977,12 +12025,18 @@ def forward(self, x): x = torch.ones(5, 4, 3) y = torch.ones(7, 8, 3) - updates = torch.tensor([[[1, 5, 7], [2, 4, 5], [5, 5, 6], [2, 3, 4]]], dtype=torch.float) + updates = torch.tensor( + [[[1, 5, 7], [2, 4, 5], [5, 5, 6], [2, 3, 4]]], dtype=torch.float + ) index = torch.tensor([0, 2, 3, 1]) - self.run_test(M(1, index, updates), (x,), test_with_inputs=[y], - input_names=['input_1'], - dynamic_axes={'input_1': [0, 1]}) + self.run_test( + M(1, index, updates), + (x,), + test_with_inputs=[y], + input_names=["input_1"], + dynamic_axes={"input_1": [0, 1]}, + ) def test_roll(self): class M(torch.nn.Module): @@ -10006,7 +12060,7 @@ def forward(self, x): return torch.sum(x) x = torch.ones(12, 3) - self.run_test(M(), (x,), input_names=['x'], dynamic_axes={'x': [0]}) + self.run_test(M(), (x,), input_names=["x"], dynamic_axes={"x": [0]}) def test_sum_empty_tensor(self): class M(torch.nn.Module): @@ -10044,7 +12098,7 @@ def forward(self, x, y): self.run_test(M(), (x, y)) - @disableScriptTest() + @skipScriptTest() @skipIfUnsupportedMinOpsetVersion(11) def test_dist_normal(self): class M(torch.nn.Module): @@ -10054,9 +12108,15 @@ def 
forward(self, x, y): self.run_test(M(), (torch.tensor([0.0]), torch.tensor([[1.0], [2.0]]))) self.run_test(M(), (torch.tensor([0.0]), torch.tensor([1.0]))) - self.run_test(M(), (torch.tensor([[[0.0], [10.0]], [[2.0], [8.0]], [[2.0], [8.0]]]), torch.tensor([[1.0], [3.0]]))) + self.run_test( + M(), + ( + torch.tensor([[[0.0], [10.0]], [[2.0], [8.0]], [[2.0], [8.0]]]), + torch.tensor([[1.0], [3.0]]), + ), + ) - @disableScriptTest() + @skipScriptTest() @skipIfUnsupportedMinOpsetVersion(11) def test_dist_normal_correctness(self): class M(torch.nn.Module): @@ -10068,31 +12128,39 @@ def forward(self, x, y): model_export = M() dummy_input = (torch.tensor([expected_mean]), torch.tensor([expected_std])) - ort_sess = convert_to_onnx(model_export, input=dummy_input, opset_version=self.opset_version, - training=torch.onnx.TrainingMode.EVAL) + ort_sess = convert_to_onnx( + model_export, + input=dummy_input, + opset_version=self.opset_version, + training=torch.onnx.TrainingMode.EVAL, + ) - ort_out = run_ort(ort_sess, input=dummy_input) + ort_out = run_ort(ort_sess, inputs=dummy_input) actual_std = np.std(ort_out) actual_mean = np.mean(ort_out) - assert abs(abs(actual_mean) - expected_mean) <= expected_mean * 0.1, \ - "the gap of mean between ort outputs and expected one is unacceptable." - assert abs(abs(actual_std) - expected_std) <= expected_std * 0.1, \ - "the gap of variance between ort outputs and expected one is unacceptable." + assert ( + abs(abs(actual_mean) - expected_mean) <= expected_mean * 0.1 + ), "the gap of mean between ort outputs and expected one is unacceptable." + assert ( + abs(abs(actual_std) - expected_std) <= expected_std * 0.1 + ), "the gap of variance between ort outputs and expected one is unacceptable." - @disableScriptTest() + @skipScriptTest() @skipIfUnsupportedMinOpsetVersion(11) def test_dist_uniform(self): class M(torch.nn.Module): def forward(self, x, y): - return torch.distributions.Uniform(x, y).sample().size(0), x , y + return torch.distributions.Uniform(x, y).sample().size(0), x, y self.run_test(M(), (torch.tensor([0.0]), torch.tensor([10.0]))) self.run_test(M(), (torch.tensor([[0.0], [6.0]]), torch.tensor([[1.0], [7.0]]))) - self.run_test(M(), (torch.tensor([1.0]), torch.tensor([[10.0], [7.0], [9.0], [20.0]]))) + self.run_test( + M(), (torch.tensor([1.0]), torch.tensor([[10.0], [7.0], [9.0], [20.0]])) + ) - @disableScriptTest() + @skipScriptTest() @skipIfUnsupportedMinOpsetVersion(11) def test_dist_uniform_correctness(self): class M(torch.nn.Module): @@ -10105,18 +12173,27 @@ def forward(self, x, y): model_export = M() dummy_input = (torch.tensor([expected_min]), torch.tensor([expected_max])) - ort_sess = convert_to_onnx(model_export, input=dummy_input, opset_version=self.opset_version, - training=torch.onnx.TrainingMode.EVAL) - - ort_out = run_ort(ort_sess, input=dummy_input) + ort_sess = convert_to_onnx( + model_export, + input=dummy_input, + opset_version=self.opset_version, + training=torch.onnx.TrainingMode.EVAL, + ) + + ort_out = run_ort(ort_sess, inputs=dummy_input) actual_min = np.min(ort_out) actual_max = np.max(ort_out) actual_mean = np.mean(ort_out) - assert actual_min >= expected_min, "the minimum value of ort outputs is out of scope." - assert actual_max <= expected_max, "the maximum value of ort outputs is out of scope." - assert abs(actual_mean - expected_mean) <= expected_mean * 0.05, \ - "the mean value of ort outputs is out of scope." + assert ( + actual_min >= expected_min + ), "the minimum value of ort outputs is out of scope." 
+ assert ( + actual_max <= expected_max + ), "the maximum value of ort outputs is out of scope." + assert ( + abs(actual_mean - expected_mean) <= expected_mean * 0.05 + ), "the mean value of ort outputs is out of scope." @skipIfUnsupportedMinOpsetVersion(13) def test_sequence_to_int(self): @@ -10132,7 +12209,9 @@ def forward(self, x): def test_sequence_to_float(self): class M(torch.nn.Module): def forward(self, x): - result = torch.tensor([1.1 for i in range(x.size()[0])], dtype=torch.float) + result = torch.tensor( + [1.1 for i in range(x.size()[0])], dtype=torch.float + ) return x, result x = torch.randn(10, 5) @@ -10142,7 +12221,9 @@ def forward(self, x): def test_sequence_to_bool(self): class M(torch.nn.Module): def forward(self, x): - result = torch.tensor([False for i in range(x.size()[0])], dtype=torch.bool) + result = torch.tensor( + [False for i in range(x.size()[0])], dtype=torch.bool + ) return x, result x = torch.randn(10, 5) @@ -10173,8 +12254,6 @@ def symbolic_custom_invalid_add(g, input, other, alpha=None): self.assertTrue(f.getvalue(), "ONNX graph was not exported.") loaded_model = onnx.load_from_string(f.getvalue()) - - @skipIfUnsupportedMinOpsetVersion(9) # https://github.com/microsoft/onnxruntime/issues/9663 def test_tuple_output_from_if_with_raised_exception(self): class M(torch.nn.Module): def __init__(self): @@ -10185,6 +12264,7 @@ def forward(self, t: Tensor) -> Tuple[Tensor, Tensor]: raise Exception("Negative input") else: return torch.zeros(5), torch.zeros(5) + x = torch.zeros(1) self.run_test(torch.jit.script(M()), (x,)) @@ -10201,22 +12281,455 @@ def forward(self, x): x = F.softmax(x, dim=1) x = x.reshape(batch, -1) return x + radix = 2 cardinality = 1 x = torch.randn(10, 1, 128, 1) f = io.BytesIO() - torch.onnx.export(RSoftMax(radix, cardinality), (x, ), f, input_names=["x"], dynamic_axes={"x": [0]}) + torch.onnx.export( + RSoftMax(radix, cardinality), + (x,), + f, + input_names=["x"], + dynamic_axes={"x": [0]}, + ) loaded_model = onnx.load_from_string(f.getvalue()) - self.assertEqual(loaded_model.graph.output[0].type.tensor_type.shape.dim[1].dim_value, 128) + self.assertEqual( + loaded_model.graph.output[0].type.tensor_type.shape.dim[1].dim_value, 128 + ) + + # NOTE: For quantization tests, choose scale and zero point carefully + # such that inputs and outputs do not always overflow/underflow. + # Otherwise test results could be inaccurate. + @skipIfUnsupportedMinOpsetVersion(10) + def test_quantized_linear(self): + model = torch.nn.quantized.Linear(4, 8) + # Set fixed weight to avoid flaky test. + weight = torch.quantize_per_tensor( + torch.arange(32, dtype=torch.float).view(8, 4), 0.5, 0, torch.qint8 + ) + # Set non-zero bias. + bias = torch.arange(8, dtype=torch.float) + model.set_weight_bias(weight, bias) + # Set fixed input to avoid flaky test. + input = torch.randn(4, 4) + input = torch.arange(16, dtype=torch.float).view(4, 4) - 8 + input_tensor = torch.quantize_per_tensor(input, 0.5, 128, torch.quint8) + self.run_test(model, input_tensor) + + @skipIfUnsupportedMinOpsetVersion(10) + def test_quantized_conv2d(self): + model = torch.nn.quantized.Conv2d(16, 33, 3, stride=2) + # Manually initialize model weight and bias to random numbers. + # By default all zeros. 
+ q_weight = torch.quantize_per_tensor( + torch.randn(33, 16, 3, 3), 0.5, 0, torch.qint8 + ) + bias = torch.arange(33).to(torch.float) - 16 + model.set_weight_bias(q_weight, bias) + input = torch.randn(3, 16, 32, 32) + q_input = torch.quantize_per_tensor(input, 0.5, 128, torch.quint8) + self.run_test(model, q_input) + + @skipIfUnsupportedMinOpsetVersion(10) + def test_quantized_adaptive_avg_pool2d(self): + model = torch.nn.AdaptiveAvgPool2d((5, 7)) + input = torch.randn(4, 3, 10, 14) + q_input = torch.quantize_per_tensor(input, 0.2, 128, torch.quint8) + self.run_test(model, q_input) + + @skipIfUnsupportedMinOpsetVersion(10) + def test_quantized_conv2d_relu(self): + model = torch.nn.intrinsic.quantized.ConvReLU2d(16, 33, 3, stride=2) + # Manually initialize model weight and bias to random numbers. + # By default all zeros. + q_weight = torch.quantize_per_tensor( + torch.randn(33, 16, 3, 3), 0.5, 0, torch.qint8 + ) + bias = torch.arange(33).to(torch.float) - 16 + model.set_weight_bias(q_weight, bias) + input = torch.randn(3, 16, 32, 32) + q_input = torch.quantize_per_tensor(input, 0.5, 128, torch.quint8) + self.run_test(model, q_input) + + @skipIfUnsupportedMinOpsetVersion(10) + def test_quantized_hardswish(self): + model = torch.nn.quantized.Hardswish(1.0, 0) + input = torch.randn(2, 6) + q_input = torch.quantize_per_tensor(input, 0.26, 128, torch.quint8) + self.run_test(model, q_input) + + @skipIfUnsupportedMinOpsetVersion(10) + def test_quantized_hardsigmoid(self): + model = torch.nn.Hardsigmoid() + input = torch.randn(2, 6) + q_input = torch.quantize_per_tensor(input, 0.26, 128, torch.quint8) + self.run_test(model, q_input) + + @skipIfUnsupportedMinOpsetVersion(10) + def test_quantized_flatten(self): + class FlattenModel(torch.nn.Module): + def forward(self, input): + return torch.flatten(input) + + x = torch.quantize_per_tensor(torch.randn(1, 2, 3, 4), 1, 0, torch.quint8) + self.run_test(FlattenModel(), x) + + @skipIfUnsupportedMinOpsetVersion(10) + @skipScriptTest() # torch.jit.frontend.FrontendError: Cannot instantiate class 'QFunctional' in a script function: + def test_quantized_arithmetic_qfunctional(self): + x = torch.quantize_per_tensor(torch.randn(3, 4), 0.2, 128, torch.quint8) + y = torch.quantize_per_tensor(torch.randn(3, 4), 0.2, 128, torch.quint8) + + class ArithmeticModel(torch.nn.Module): + def forward(self, x, y): + o = torch.nn.quantized.QFunctional().add(x, y) + o = torch.nn.quantized.QFunctional().mul(o, x) + return o + + self.run_test(ArithmeticModel(), (x, y)) + + @skipIfUnsupportedMinOpsetVersion(10) + def test_quantized_arithmetic(self): + x = torch.quantize_per_tensor(torch.randn(3, 4), 0.2, 128, torch.quint8) + y = torch.quantize_per_tensor(torch.randn(3, 4), 0.2, 128, torch.quint8) -def make_test(name, base, layer, bidirectional, initial_state, - variable_length, dropout, script_test_min_opset_version, - **extra_kwargs): - test_name = str("_".join([ - "test", name, layer[1], - bidirectional[1], initial_state[1], - variable_length[1], dropout[1] - ])) + class ArithmeticModel2(torch.nn.Module): + def forward(self, x, y): + o = torch.ops.quantized.add(x, y, 0.4, 100) + o = torch.ops.quantized.mul(o, x, 0.4, 100) + return o + + self.run_test(ArithmeticModel2(), (x, y)) + + @skipIfUnsupportedMinOpsetVersion(10) + def test_quantize_per_tensor(self): + class Module(torch.nn.Module): + def forward(self, x): + return ( + torch.quantize_per_tensor(x, 0.2, 0, torch.qint8), + torch.quantize_per_tensor(x, 0.2, 128, torch.quint8), + ) + + x = torch.randn(4, 6) + 
self.run_test(Module(), x) + + @skipIfUnsupportedMinOpsetVersion(10) + def test_dequantize(self): + class Module(torch.nn.Module): + def forward(self, x): + return torch.dequantize(x) + + x = torch.quantize_per_tensor(torch.randn(3, 4), 0.2, 0, torch.qint8) + self.run_test(Module(), x) + + @skipIfUnsupportedMinOpsetVersion(13) + def test_qat_linear_per_channel(self): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = torch.quantization.QuantStub() + self.linear = torch.nn.Linear(4, 3) + self.dequant = torch.quantization.DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.linear(x) + x = self.dequant(x) + return x + + model = M() + model.qconfig = torch.quantization.get_default_qconfig("fbgemm") + model = torch.quantization.prepare_qat(model) + # Set fixed weight and bias to avoid flaky test. + model.linear.weight = torch.nn.Parameter( + _construct_tensor_for_quantization_test((3, 4)) + ) + model.linear.bias = torch.nn.Parameter(torch.arange(3, dtype=torch.float)) + model = torch.quantization.convert(model) + + # Set fixed input to avoid flaky test. + input = _construct_tensor_for_quantization_test((4, 4), offset=-8) + self.run_test(model, input) + + @skipIfUnsupportedMinOpsetVersion(13) + def test_qat_relu(self): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = torch.quantization.QuantStub() + self.relu = torch.nn.ReLU() + self.dequant = torch.quantization.DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.relu(x) + x = self.dequant(x) + return x + + model = M() + model.qconfig = torch.quantization.get_default_qconfig("fbgemm") + model = torch.quantization.prepare_qat(model) + model = torch.quantization.convert(model) + input = torch.randn(8, 4) + self.run_test(model, input) + + @skipIfUnsupportedMinOpsetVersion(13) + def test_qat_conv2d(self): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = torch.quantization.QuantStub() + self.conv = torch.nn.Conv2d(2, 4, 3, stride=2) + self.dequant = torch.quantization.DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.conv(x) + x = self.dequant(x) + return x + + model = M() + model.qconfig = torch.quantization.get_default_qconfig("fbgemm") + model = torch.quantization.prepare_qat(model) + # Set fixed weight and bias to avoid flaky test. + model.conv.weight = torch.nn.Parameter( + _construct_tensor_for_quantization_test((2, 4, 3, 3), max_val=2) + ) + model.conv.bias = torch.nn.Parameter(torch.tensor([0.0, 1.0])) + model = torch.quantization.convert(model) + + # Set fixed input to avoid flaky test. + input = _construct_tensor_for_quantization_test( + (3, 4, 8, 8), offset=-384, max_val=12 + ) + self.run_test(model, input) + + @skipIfUnsupportedMinOpsetVersion(13) + def test_qat_conv2d_relu(self): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = torch.quantization.QuantStub() + self.conv = torch.nn.Conv2d(2, 4, 3, stride=2) + self.relu = torch.nn.ReLU() + self.dequant = torch.quantization.DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.conv(x) + x = self.relu(x) + x = self.dequant(x) + return x + + model = M() + model.qconfig = torch.quantization.get_default_qconfig("fbgemm") + model = torch.quantization.prepare_qat(model) + # Set fixed weight and bias to avoid flaky test. 
+ model.conv.weight = torch.nn.Parameter( + _construct_tensor_for_quantization_test((2, 4, 3, 3), max_val=2) + ) + model.conv.bias = torch.nn.Parameter(torch.tensor([0.0, 1.0])) + model = torch.quantization.convert(model) + + # Set fixed input to avoid flaky test. + input = _construct_tensor_for_quantization_test( + (3, 4, 8, 8), offset=-384, max_val=12 + ) + self.run_test(model, input) + + @skipIfUnsupportedMinOpsetVersion(13) + def test_qat_conv2d_relu_fused(self): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = torch.quantization.QuantStub() + self.conv = torch.nn.Conv2d(2, 4, 3, stride=2) + self.relu = torch.nn.ReLU() + self.dequant = torch.quantization.DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.conv(x) + x = self.relu(x) + x = self.dequant(x) + return x + + model = M() + model.qconfig = torch.quantization.get_default_qconfig("fbgemm") + model = torch.quantization.fuse_modules(model.eval(), [["conv", "relu"]]) + model = torch.quantization.prepare_qat(model.train()) + # Set fixed weight and bias to avoid flaky test. + model.conv.weight = torch.nn.Parameter( + _construct_tensor_for_quantization_test((2, 4, 3, 3), max_val=2) + ) + model.conv.bias = torch.nn.Parameter(torch.tensor([0.0, 1.0])) + model = torch.quantization.convert(model) + + # Set fixed input to avoid flaky test. + input = _construct_tensor_for_quantization_test( + (3, 4, 8, 8), offset=-384, max_val=12 + ) + self.run_test(model, input) + + @skipIfUnsupportedMinOpsetVersion(10) + def test_qat_maxpool2d(self): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = torch.quantization.QuantStub() + self.pool = torch.nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.dequant = torch.quantization.DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.pool(x) + x = self.dequant(x) + return x + + model = M() + model.qconfig = torch.quantization.get_default_qconfig("fbgemm") + model = torch.quantization.prepare_qat(model.train()) + model = torch.quantization.convert(model) + + # Set fixed input to avoid flaky test. 
+ input = _construct_tensor_for_quantization_test((4, 4, 3, 2)) + self.run_test(model, input) + + @skipIfUnsupportedMinOpsetVersion(9) + def test_convolution_allow_tf32(self): + class Module(torch.nn.Module): + def __init__(self, allow_tf32): + super().__init__() + + self.allow_tf32 = allow_tf32 + weight = torch.rand(32, 3, 3, 3) + self.weight = torch.nn.Parameter(weight) + + def forward(self, x): + if self.allow_tf32: + return torch._convolution( + x, + self.weight, + None, + [2, 2], + [0, 0], + [1, 1], + False, + [0, 0], + 1, + False, + False, + True, + True, + ) + else: + return torch._convolution( + x, + self.weight, + None, + [2, 2], + [0, 0], + [1, 1], + False, + [0, 0], + 1, + False, + False, + True, + ) + + x = torch.randn(1, 3, 224, 224) + self.run_test(Module(False), x, rtol=1e-3, atol=1e-6) + self.run_test(Module(True), x, rtol=1e-3, atol=1e-6) + + @skipIfUnsupportedMinOpsetVersion(16) + def test_grid_sample(self): + n, c, h_in, w_in, h_out, w_out = 1, 1, 3, 2, 2, 4 + + class GridSampleModule(torch.nn.Module): + def __init__(self, mode, padding_mode, align_corners) -> None: + super().__init__() + self.mode, self.padding_mode, self.align_corners = ( + mode, + padding_mode, + align_corners, + ) + + def forward(self, input, grid): + return torch.nn.functional.grid_sample( + input, grid, self.mode, self.padding_mode, self.align_corners + ) + + for mode, padding_mode, align_corners in itertools.product( + ("bilinear", "nearest", "bicubic"), + ("zeros", "border", "reflection"), + (True, False), + ): + atol_rtol = {} + if (mode, padding_mode) == ("bicubic", "border"): + if align_corners: + atol_rtol.update({"atol": 0.3, "rtol": 0.4}) + else: + atol_rtol.update({"atol": 0.02, "rtol": 0.02}) + input, grid = torch.randn(n, c, h_in, w_in), torch.randn(n, h_out, w_out, 2) + self.run_test( + GridSampleModule(mode, padding_mode, align_corners), + (input, grid), + **atol_rtol, + ) + + @skipIfUnsupportedMinOpsetVersion(9) + def test_device_eq(self): + class M(torch.nn.Module): + def forward(self, a): + # exercise both Tensor.device (prim::device) + # and torch.device (prim::Constant). + if a.device != torch.device("cpu"): + return a + return torch.zeros_like(a) + + mod = torch.jit.script(M()) # preserve control flow + + self.run_test( + mod, + # In order for the ONNX model behavior to match the torch model, we + # need to construct input that has the same device that is checked for + # in forward(). In ONNX there is no such thing as a device, so the if + # condition is always false. + torch.randn(3, 3, device="cpu"), + # Force dynamic axes so that the output shape depends on the input. + # Otherwise the entire model will just return a constant and not have + # any inputs. + input_names=["a"], + dynamic_axes={"a": {0: "a0"}}, + ) + + +def make_test( + name, + base, + layer, + bidirectional, + initial_state, + variable_length, + dropout, + script_test_min_opset_version, + **extra_kwargs, +): + test_name = str( + "_".join( + [ + "test", + name, + layer[1], + bidirectional[1], + initial_state[1], + variable_length[1], + dropout[1], + ] + ) + ) # Cannot export with older opsets because of "ConstantFill" op # ConstantFill was a temp op removed at opset 8. This is no longer supported by onnxruntime @@ -10226,10 +12739,12 @@ def make_test(name, base, layer, bidirectional, initial_state, # - https://msdata.visualstudio.com/Vienna/_workitems/edit/1055382 # Operator aten::_pack_padded_sequence is not supported by exporter yet. 
# - https://msdata.visualstudio.com/Vienna/_workitems/edit/1055384 - @disableScriptTest() + @skipScriptTest() @skipIfUnsupportedMinOpsetVersion(9) def f(self): - self.is_script_test_enabled = self.opset_version >= script_test_min_opset_version + self.is_script_test_enabled = ( + self.opset_version >= script_test_min_opset_version + ) self._dispatch_rnn_test( base, layers=layer[0], @@ -10237,153 +12752,126 @@ def f(self): initial_state=initial_state[0], packed_sequence=variable_length[0], dropout=dropout[0], - **extra_kwargs) + **extra_kwargs, + ) f.__name__ = test_name - setattr(TestONNXRuntime, f.__name__, f) + setattr(_TestONNXRuntime, f.__name__, f) + def setup_rnn_tests(): - layers_opts = [ - (1, "unilayer"), - (3, "trilayer") - ] - bidirectional_opts = [ - (False, "forward"), - (True, "bidirectional") - ] - initial_state_opts = [ - (True, "with_initial_state"), - (False, "no_initial_state") - ] + layers_opts = [(1, "unilayer"), (3, "trilayer")] + bidirectional_opts = [(False, "forward"), (True, "bidirectional")] + initial_state_opts = [(True, "with_initial_state"), (False, "no_initial_state")] variable_length_opts = [ (0, "without_sequence_lengths"), (1, "with_variable_length_sequences"), - (2, "with_batch_first_sequence_lengths") - ] - dropout_opts = [ - (0.2, "with_dropout"), - (0.0, "without_dropout") + (2, "with_batch_first_sequence_lengths"), ] + dropout_opts = [(0.2, "with_dropout"), (0.0, "without_dropout")] test_count = 0 - for (layer, bidirectional, initial_state, variable_length, dropout) in \ - itertools.product( - layers_opts, - bidirectional_opts, - initial_state_opts, - variable_length_opts, - dropout_opts,): + for ( + layer, + bidirectional, + initial_state, + variable_length, + dropout, + ) in itertools.product( + layers_opts, + bidirectional_opts, + initial_state_opts, + variable_length_opts, + dropout_opts, + ): for base, name, extra_kwargs in ( - ("elman", "elman_relu", {"nonlinearity": u"relu"}), - ("elman", "elman_tanh", {"nonlinearity": u"tanh"}), - ("lstm", "lstm", {}), - ("gru", "gru", {}) + ("elman", "elman_relu", {"nonlinearity": "relu"}), + ("elman", "elman_tanh", {"nonlinearity": "tanh"}), + ("lstm", "lstm", {}), + ("gru", "gru", {}), ): # Need Add between list of tensors script_test_min_opset_version = 11 - if ( # compiling in script mode fails with errors like: - # torch.jit.frontend.UnsupportedNodeError: annotated assignments - # without assigned value aren't supported - # https://msdata.visualstudio.com/Vienna/_workitems/edit/1160723 - base == 'elman' or - # compiling in script mode fails with errors like: - # RuntimeError: Arguments for call are not valid. - # https://msdata.visualstudio.com/Vienna/_workitems/edit/1160723 - base == 'lstm'): + if ( # compiling in script mode fails with errors like: + # torch.jit.frontend.UnsupportedNodeError: annotated assignments + # without assigned value aren't supported + # https://msdata.visualstudio.com/Vienna/_workitems/edit/1160723 + base == "elman" + or + # compiling in script mode fails with errors like: + # RuntimeError: Arguments for call are not valid. 
+ # https://msdata.visualstudio.com/Vienna/_workitems/edit/1160723 + base == "lstm" + ): script_test_min_opset_version = float("inf") - make_test(name, base, layer, bidirectional, initial_state, - variable_length, dropout, script_test_min_opset_version, - **extra_kwargs) + make_test( + name, + base, + layer, + bidirectional, + initial_state, + variable_length, + dropout, + script_test_min_opset_version, + **extra_kwargs, + ) test_count += 1 # sanity check that a representative example does exist - TestONNXRuntime.test_gru_trilayer_forward_with_initial_state_without_sequence_lengths_with_dropout + _TestONNXRuntime.test_gru_trilayer_forward_with_initial_state_without_sequence_lengths_with_dropout # make sure no one accidentally disables all the tests without # noticing if test_count != 192: raise ValueError("Expected 192 tests but found {}".format(test_count)) + setup_rnn_tests() -# opset 7 tests -TestONNXRuntime_opset7 = type(str("TestONNXRuntime_opset7"), - (unittest.TestCase,), - dict(TestONNXRuntime.__dict__, opset_version=7)) - -# opset 8 tests -TestONNXRuntime_opset8 = type(str("TestONNXRuntime_opset8"), - (unittest.TestCase,), - dict(TestONNXRuntime.__dict__, opset_version=8)) - - -# opset 10 tests -TestONNXRuntime_opset10 = type(str("TestONNXRuntime_opset10"), - (unittest.TestCase,), - dict(TestONNXRuntime.__dict__, opset_version=10)) - -# opset 11 tests -TestONNXRuntime_opset11 = type(str("TestONNXRuntime_opset11"), - (unittest.TestCase,), - dict(TestONNXRuntime.__dict__, opset_version=11)) - -# opset 12 tests -TestONNXRuntime_opset12 = type(str("TestONNXRuntime_opset12"), - (unittest.TestCase,), - dict(TestONNXRuntime.__dict__, opset_version=12)) - -# opset 9 tests, with keep_initializers_as_inputs=False for -# IR version 4 style export. -TestONNXRuntime_opset9_IRv4 = type(str("TestONNXRuntime_opset9_IRv4"), - (unittest.TestCase,), - dict(TestONNXRuntime.__dict__, - keep_initializers_as_inputs=False)) - - -# opset 10 tests, with keep_initializers_as_inputs=False for -# IR version 4 style export. -TestONNXRuntime_opset10_IRv4 = type(str("TestONNXRuntime_opset10_IRv4"), - (unittest.TestCase,), - dict(TestONNXRuntime.__dict__, opset_version=10, - keep_initializers_as_inputs=False)) - - -# opset 11 tests, with keep_initializers_as_inputs=False for -# IR version 4 style export. -TestONNXRuntime_opset11_IRv4 = type(str("TestONNXRuntime_opset11_IRv4"), - (unittest.TestCase,), - dict(TestONNXRuntime.__dict__, opset_version=11, - keep_initializers_as_inputs=False)) - -# opset 12 tests, with keep_initializers_as_inputs=False for -# IR version 4 style export. 
-TestONNXRuntime_opset12_IRv4 = type(str("TestONNXRuntime_opset12_IRv4"), - (unittest.TestCase,), - dict(TestONNXRuntime.__dict__, opset_version=12, - keep_initializers_as_inputs=False)) - -# opset 13 tests -TestONNXRuntime_opset13 = type(str("TestONNXRuntime_opset13"), - (unittest.TestCase,), - dict(TestONNXRuntime.__dict__, opset_version=13, - keep_initializers_as_inputs=False, - onnx_shape_inference=True)) - -# opset 14 tests -TestONNXRuntime_opset14 = type(str("TestONNXRuntime_opset14"), - (unittest.TestCase,), - dict(TestONNXRuntime.__dict__, opset_version=14, - keep_initializers_as_inputs=False, - onnx_shape_inference=True)) - -# opset 15 tests -TestONNXRuntime_opset15 = type(str("TestONNXRuntime_opset15"), - (unittest.TestCase,), - dict(TestONNXRuntime.__dict__, opset_version=15, - keep_initializers_as_inputs=False, - onnx_shape_inference=True)) +def MakeTestCase(opset_version: int, keep_initializers_as_inputs: bool = True) -> type: + name = f"TestONNXRuntime_opset{opset_version}" + if not keep_initializers_as_inputs: + name += "_IRv4" + return type( + str(name), + (unittest.TestCase,), + dict( + _TestONNXRuntime.__dict__, + opset_version=opset_version, + keep_initializers_as_inputs=keep_initializers_as_inputs, + ), + ) + + +TestONNXRuntime_opset7 = MakeTestCase(7) + +TestONNXRuntime_opset8 = MakeTestCase(8) + +TestONNXRuntime_opset9 = MakeTestCase(9) + +TestONNXRuntime_opset9_IRv4 = MakeTestCase(9, keep_initializers_as_inputs=False) + +TestONNXRuntime_opset10 = MakeTestCase(10) + +TestONNXRuntime_opset10_IRv4 = MakeTestCase(10, keep_initializers_as_inputs=False) + +TestONNXRuntime_opset11 = MakeTestCase(11) + +TestONNXRuntime_opset11_IRv4 = MakeTestCase(11, keep_initializers_as_inputs=False) + +TestONNXRuntime_opset12 = MakeTestCase(12) + +TestONNXRuntime_opset12_IRv4 = MakeTestCase(12, keep_initializers_as_inputs=False) + +TestONNXRuntime_opset13 = MakeTestCase(13, keep_initializers_as_inputs=False) + +TestONNXRuntime_opset14 = MakeTestCase(14, keep_initializers_as_inputs=False) + +TestONNXRuntime_opset15 = MakeTestCase(15, keep_initializers_as_inputs=False) + +TestONNXRuntime_opset16 = MakeTestCase(16, keep_initializers_as_inputs=False) if __name__ == "__main__": diff --git a/test/onnx/test_pytorch_onnx_onnxruntime_cuda.py b/test/onnx/test_pytorch_onnx_onnxruntime_cuda.py index 575d4caa16ce..38ac87d46d13 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime_cuda.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime_cuda.py @@ -1,19 +1,26 @@ # Owner(s): ["module: onnx"] import unittest + import onnxruntime # noqa: F401 -import torch +from test_pytorch_common import ( + skipIfNoBFloat16Cuda, + skipIfNoCuda, + skipIfUnsupportedMinOpsetVersion, + skipScriptTest, +) + +# TODO(justinchuby): Remove reference to other unit tests. 
+from test_pytorch_onnx_onnxruntime import TestONNXRuntime +import torch from torch.cuda.amp import autocast +from torch.onnx._globals import GLOBALS -from test_pytorch_common import disableScriptTest, skipIfUnsupportedMinOpsetVersion -from test_pytorch_common import skipIfNoCuda, skipIfNoBFloat16Cuda - -from test_pytorch_onnx_onnxruntime import TestONNXRuntime class TestONNXRuntime_cuda(unittest.TestCase): - from torch.onnx.symbolic_helper import _export_onnx_opset_version - opset_version = _export_onnx_opset_version + + opset_version = GLOBALS.export_onnx_opset_version keep_initializers_as_inputs = True onnx_shape_inference = True @@ -24,12 +31,20 @@ class GeluModel(torch.nn.Module): def forward(self, x): return torch.nn.functional.gelu(x) - x = torch.randn(2, 4, 5, 6, requires_grad=True, dtype=torch.float16, device=torch.device("cuda")) + x = torch.randn( + 2, + 4, + 5, + 6, + requires_grad=True, + dtype=torch.float16, + device=torch.device("cuda"), + ) self.run_test(GeluModel(), x, rtol=1e-3, atol=1e-5) @skipIfUnsupportedMinOpsetVersion(9) @skipIfNoCuda - @disableScriptTest() + @skipScriptTest() def test_layer_norm_fp16(self): class LayerNormModel(torch.nn.Module): def __init__(self): @@ -40,13 +55,20 @@ def __init__(self): def forward(self, x): return self.layer_norm(x) - x = torch.randn(20, 5, 10, 10, requires_grad=True, dtype=torch.float16, device=torch.device("cuda")) + x = torch.randn( + 20, + 5, + 10, + 10, + requires_grad=True, + dtype=torch.float16, + device=torch.device("cuda"), + ) self.run_test(LayerNormModel().cuda(), x, rtol=1e-3, atol=1e-5) - @skipIfUnsupportedMinOpsetVersion(12) @skipIfNoCuda - @disableScriptTest() + @skipScriptTest() def test_softmaxCrossEntropy_fusion_fp16(self): class FusionModel(torch.nn.Module): def __init__(self): @@ -61,14 +83,16 @@ def forward(self, input, target): N, C = 5, 4 input = torch.randn(N, 16, dtype=torch.float16, device=torch.device("cuda")) - target = torch.empty(N, dtype=torch.long, device=torch.device("cuda")).random_(0, C) + target = torch.empty(N, dtype=torch.long, device=torch.device("cuda")).random_( + 0, C + ) # using test data containing default ignore_index=-100 target[target == 1] = -100 self.run_test(FusionModel(), (input, target)) @skipIfNoCuda - @disableScriptTest() + @skipScriptTest() def test_apex_o2(self): class LinearModel(torch.nn.Module): def __init__(self): @@ -77,6 +101,7 @@ def __init__(self): def forward(self, x): return self.linear(x) + try: from apex import amp except Exception: @@ -94,11 +119,33 @@ class MyModule(torch.nn.Module): def forward(self, x): y = torch.ones(3, 4, dtype=torch.bfloat16, device=torch.device("cuda")) x = x.type_as(y) - return torch.mul(torch.add(x, y), torch.sub(x, y)).to(dtype=torch.float16) + return torch.mul(torch.add(x, y), torch.sub(x, y)).to( + dtype=torch.float16 + ) - x = torch.ones(3, 4, requires_grad=True, dtype=torch.float16, device=torch.device("cuda")) + x = torch.ones( + 3, 4, requires_grad=True, dtype=torch.float16, device=torch.device("cuda") + ) self.run_test(MyModule(), x, rtol=1e-3, atol=1e-5) + @skipIfNoCuda + def test_deduplicate_initializers_diff_devices(self): + class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.w = torch.nn.Parameter( + torch.ones(2, 3, device=torch.device("cpu")) + ) + self.b = torch.nn.Parameter(torch.ones(3, device=torch.device("cuda"))) + + def forward(self, x, y): + return torch.matmul(self.w, x), y + self.b + + x = torch.randn(3, 3, device=torch.device("cpu")) + y = torch.randn(3, 3, device=torch.device("cuda")) 
+ self.run_test(Model(), (x, y)) + + TestONNXRuntime_cuda.setUp = TestONNXRuntime.setUp TestONNXRuntime_cuda.run_test = TestONNXRuntime.run_test diff --git a/test/onnx/test_pytorch_onnx_shape_inference.py b/test/onnx/test_pytorch_onnx_shape_inference.py index a1eb4b7018d0..50e40ae95618 100644 --- a/test/onnx/test_pytorch_onnx_shape_inference.py +++ b/test/onnx/test_pytorch_onnx_shape_inference.py @@ -1,11 +1,14 @@ # Owner(s): ["module: onnx"] import unittest -import torch + import numpy as np -from torch.onnx.symbolic_helper import (_set_onnx_shape_inference, - _onnx_main_opset, - _set_opset_version) +from test_pytorch_common import skipIfUnsupportedMinOpsetVersion + +import torch +from torch.onnx import _constants +from torch.onnx.symbolic_helper import _set_onnx_shape_inference, _set_opset_version + def expect_tensor(scalar_type, shape=None): def verify(actual_type): @@ -14,12 +17,14 @@ def verify(actual_type): # np.testing.assert_equal(actual_type.sizes(), shape) if shape is not None: np.testing.assert_equal(actual_type.varyingSizes(), shape) + return verify + class TestONNXShapeInference(unittest.TestCase): def __init__(self, *args, **kwargs): unittest.TestCase.__init__(self, *args, **kwargs) - self.opset_version = _onnx_main_opset + self.opset_version = _constants.onnx_main_opset _set_onnx_shape_inference(True) _set_opset_version(self.opset_version) @@ -53,17 +58,23 @@ def test_constant_of_shape(self): constant = self.insert_tensor_constant(g, torch.ones(1, 2, 3, 4)) shape = g.op("Shape", constant) constant_of_shape = g.op("ConstantOfShape", shape, value_t=torch.tensor([2.0])) - self.run_test(g, constant_of_shape.node(), expect_tensor("Float", shape=(1, 2, 3, 4))) + self.run_test( + g, constant_of_shape.node(), expect_tensor("Float", shape=(1, 2, 3, 4)) + ) def test_constant_of_shape_static(self): # Test ConstantOfShape with input of prim::ListConstruct of static tensor rank = 4 g = self.create_empty_graph() - constants = [self.insert_tensor_constant(g, torch.tensor(i + 1)) for i in range(rank)] + constants = [ + self.insert_tensor_constant(g, torch.tensor(i + 1)) for i in range(rank) + ] shape = g.op("prim::ListConstruct", *constants) shape.setType(torch._C.ListType.ofInts()) constant_of_shape = g.op("ConstantOfShape", shape, value_t=torch.tensor([2.0])) - self.run_test(g, constant_of_shape.node(), expect_tensor("Float", shape=(1, 2, 3, 4))) + self.run_test( + g, constant_of_shape.node(), expect_tensor("Float", shape=(1, 2, 3, 4)) + ) def test_constant_of_shape_dynamic(self): # Test ConstantOfShape with input of prim::ListConstruct of dynamic tensor @@ -73,7 +84,34 @@ def test_constant_of_shape_dynamic(self): shape = g.op("prim::ListConstruct", *inputs) shape.setType(torch._C.ListType.ofInts()) constant_of_shape = g.op("ConstantOfShape", shape, value_t=torch.tensor([2.0])) - self.run_test(g, constant_of_shape.node(), expect_tensor("Float", shape=(None, None, None, None))) + self.run_test( + g, + constant_of_shape.node(), + expect_tensor("Float", shape=(None, None, None, None)), + ) + + def test_gather_dynamic_index(self): + g = self.create_empty_graph() + input = g.addInput() + input.setType( + input.type().with_dtype(torch.float).with_sizes([None, 3, 16, 16]) + ) + indices = g.addInput() + indices.setType(indices.type().with_dtype(torch.int64).with_sizes([None])) + output = g.op("Gather", input, indices, axis_i=1) + self.run_test( + g, output.node(), expect_tensor("Float", shape=([None, None, 16, 16])) + ) + + def test_gather_scalar_index(self): + g = self.create_empty_graph() + 
input = g.addInput() + input.setType( + input.type().with_dtype(torch.float).with_sizes([None, 3, 16, 16]) + ) + indices = self.insert_tensor_constant(g, torch.tensor(1)) + output = g.op("Gather", input, indices, axis_i=1) + self.run_test(g, output.node(), expect_tensor("Float", shape=([None, 16, 16]))) def test_reshape(self): g = self.create_empty_graph() @@ -94,6 +132,23 @@ def test_reshape(self): shape = g.op("Reshape", constant, constant_2) self.run_test(g, shape.node(), expect_tensor("Float", shape=(8, 16, 5))) + def test_reshape_symbolic(self): + g = self.create_empty_graph() + input = g.addInput() + input.setType(input.type().with_sizes([None, None, 2, 8])) + constant = self.insert_tensor_constant(g, torch.tensor([0, 0, -1])) + output = g.op("Reshape", input, constant) + self.run_test(g, output.node(), expect_tensor(None, shape=(None, None, 16))) + + @skipIfUnsupportedMinOpsetVersion(14) + def test_reshape_allowzero(self): + g = self.create_empty_graph() + input = g.addInput() + input.setType(input.type().with_sizes([3, 4, 0])) + constant = self.insert_tensor_constant(g, torch.tensor([0, 4, 3])) + output = g.op("Reshape", input, constant, allowzero_i=1) + self.run_test(g, output.node(), expect_tensor(None, shape=(0, 4, 3))) + def test_slice(self): g = self.create_empty_graph() input = g.addInput() @@ -106,5 +161,52 @@ def test_slice(self): slice = g.op("Slice", input, start_input, end, axis, step) self.run_test(g, slice.node(), expect_tensor(None, shape=(None, None))) -if __name__ == '__main__': + def test_broadcast_matmul(self): + g = self.create_empty_graph() + constant = self.insert_tensor_constant(g, torch.ones(5, 1, 2)) + constant_2 = self.insert_tensor_constant(g, torch.ones(3, 1, 2, 1)) + shape = g.op("MatMul", constant, constant_2) + self.run_test(g, shape.node(), expect_tensor("Float", shape=(3, 5, 1, 1))) + + # test when first input is of rank 1 + g = self.create_empty_graph() + constant = self.insert_tensor_constant(g, torch.ones(2)) + constant_2 = self.insert_tensor_constant(g, torch.ones(3, 1, 2, 1)) + shape = g.op("MatMul", constant, constant_2) + self.run_test(g, shape.node(), expect_tensor("Float", shape=(3, 1, 1))) + + # test when second input is of rank 1 + g = self.create_empty_graph() + constant = self.insert_tensor_constant(g, torch.ones(5, 1, 2)) + constant_2 = self.insert_tensor_constant(g, torch.ones(2)) + shape = g.op("MatMul", constant, constant_2) + self.run_test(g, shape.node(), expect_tensor("Float", shape=(5, 1))) + + # test when both inputs are of rank 1 + g = self.create_empty_graph() + constant = self.insert_tensor_constant(g, torch.ones(2)) + constant_2 = self.insert_tensor_constant(g, torch.ones(2)) + shape = g.op("MatMul", constant, constant_2) + self.run_test(g, shape.node(), expect_tensor("Float", shape=())) + + def test_expand(self): + g = self.create_empty_graph() + input = g.addInput() + constant = self.insert_tensor_constant(g, torch.ones(2, 4)) + input.setType(constant.type().with_sizes([None, None])) + shape = g.op("Shape", input) + expand = g.op("Expand", constant, shape) + self.run_test(g, expand.node(), expect_tensor("Float", shape=(None, None))) + + def test_pad(self): + g = self.create_empty_graph() + input = g.addInput() + input.setType(input.type().with_dtype(torch.float).with_sizes([3, 320, 100])) + constant = self.insert_tensor_constant(g, torch.ones(6, dtype=torch.long)) + none = g.op("prim::Constant").setType(torch.NoneType.get()) + pad = g.op("Pad", input, constant, none, mode_s="constant") + self.run_test(g, pad.node(), 
expect_tensor("Float", shape=(None, None, None))) + + +if __name__ == "__main__": unittest.main() diff --git a/test/onnx/test_utility_funs.py b/test/onnx/test_utility_funs.py index dca45fc5c311..9638e53d6c06 100644 --- a/test/onnx/test_utility_funs.py +++ b/test/onnx/test_utility_funs.py @@ -1,47 +1,55 @@ # Owner(s): ["module: onnx"] -from test_pytorch_common import TestCase, run_tests +import copy +import io + +import onnx +import torchvision +from autograd_helper import CustomFunction as CustomFunction2 +from test_pytorch_common import ( + TestCase, + run_tests, + skipIfNoCuda, + skipIfUnsupportedMaxOpsetVersion, + skipIfUnsupportedMinOpsetVersion, +) +from verify import verify import torch import torch.onnx -from torch.onnx import (utils, - OperatorExportTypes, - TrainingMode, - register_custom_op_symbolic, - unregister_custom_op_symbolic) -from torch.onnx.symbolic_helper import (_set_opset_version, - _set_operator_export_type, - _set_onnx_shape_inference) import torch.utils.cpp_extension -from test_pytorch_common import (skipIfUnsupportedMinOpsetVersion, - skipIfUnsupportedMaxOpsetVersion) -import caffe2.python.onnx.backend as backend -from verify import verify - -import torchvision - -import onnx - -import io -import copy -import unittest - -skip = unittest.skip +from torch.onnx import ( + OperatorExportTypes, + TrainingMode, + register_custom_op_symbolic, + unregister_custom_op_symbolic, + utils, +) +from torch.onnx.symbolic_helper import ( + _set_onnx_shape_inference, + _set_operator_export_type, + _set_opset_version, + _unpack_list, + parse_args, +) class _BaseTestCase(TestCase): - def setUp(self): torch.manual_seed(0) if torch.cuda.is_available(): torch.cuda.manual_seed_all(0) - def _model_to_graph(self, model, input, - do_constant_folding=True, - training=TrainingMode.EVAL, - operator_export_type=OperatorExportTypes.ONNX, - input_names=None, - dynamic_axes=None): + def _model_to_graph( + self, + model, + input, + do_constant_folding=True, + training=TrainingMode.EVAL, + operator_export_type=OperatorExportTypes.ONNX, + input_names=None, + dynamic_axes=None, + ): if training == torch.onnx.TrainingMode.TRAINING: model.train() elif training == torch.onnx.TrainingMode.EVAL: @@ -49,19 +57,21 @@ def _model_to_graph(self, model, input, # Need disable onnx_shape_inference for this test because it puts const node to initializers. 
_set_onnx_shape_inference(False) utils._validate_dynamic_axes(dynamic_axes, model, None, None) - graph, params_dict, torch_out = utils._model_to_graph(model, input, - do_constant_folding=do_constant_folding, - _disable_torch_constant_prop=True, - operator_export_type=operator_export_type, - training=training, - input_names=input_names, - dynamic_axes=dynamic_axes) + graph, params_dict, torch_out = utils._model_to_graph( + model, + input, + do_constant_folding=do_constant_folding, + _disable_torch_constant_prop=True, + operator_export_type=operator_export_type, + training=training, + input_names=input_names, + dynamic_axes=dynamic_axes, + ) _set_onnx_shape_inference(True) return graph, params_dict, torch_out class TestUtilityFuns_opset_independent(_BaseTestCase): - def test_unconvertible_ops(self): class MyModule(torch.nn.Module): def forward(self, x): @@ -100,18 +110,24 @@ def forward(self, x): def test_validate_dynamic_axes_invalid_input_output_name(self): import warnings + with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") - utils._validate_dynamic_axes({"input1": {}, "output": {}, - "invalid_name1": {}, "invalid_name2": {}}, - None, ["input1", "input2"], ["output"]) + utils._validate_dynamic_axes( + {"input1": {}, "output": {}, "invalid_name1": {}, "invalid_name2": {}}, + None, + ["input1", "input2"], + ["output"], + ) messages = [str(warning.message) for warning in w] self.assertIn( "Provided key invalid_name1 for dynamic axes is not a valid input/output name", - messages) + messages, + ) self.assertIn( "Provided key invalid_name2 for dynamic axes is not a valid input/output name", - messages) + messages, + ) self.assertEqual(len(messages), 2) @skipIfUnsupportedMinOpsetVersion(11) @@ -127,23 +143,28 @@ def forward(self, x, y, t): x = torch.randn(2, 3) y = torch.randn(2, 4) t = torch.randn(2, 7) - graph, _, _ = self._model_to_graph(SplitModule(), (x, y, t), input_names=["x", "y", "t"], - dynamic_axes={"x": [0, 1], "y": [0, 1], "t": [0, 1]}) + graph, _, _ = self._model_to_graph( + SplitModule(), + (x, y, t), + input_names=["x", "y", "t"], + dynamic_axes={"x": [0, 1], "y": [0, 1], "t": [0, 1]}, + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::SplitToSequence") def test_constant_fold_transpose(self): class TransposeModule(torch.nn.Module): def forward(self, x): - a = torch.tensor([[1., 2., 3.], [4., 5., 6.]]) + a = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) b = torch.transpose(a, 1, 0) return b + x _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) x = torch.ones(3, 2) - graph, _, __ = self._model_to_graph(TransposeModule(), (x, ), input_names=["x"], - dynamic_axes={"x": [0, 1]}) + graph, _, __ = self._model_to_graph( + TransposeModule(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1]} + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Transpose") @@ -154,15 +175,16 @@ def forward(self, x): def test_constant_fold_reduceL2(self): class ReduceModule(torch.nn.Module): def forward(self, x): - a = torch.tensor([[1., 2., 3.], [4., 5., 6.]]) + a = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) b = torch.norm(a, p=2, dim=-2, keepdim=False) return b + x _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) x = torch.ones(2, 3) - graph, _, __ = self._model_to_graph(ReduceModule(), (x, ), input_names=["x"], - dynamic_axes={"x": [0, 1]}) + graph, _, __ = self._model_to_graph( + ReduceModule(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1]} 
+ ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::ReduceL2") @@ -171,15 +193,16 @@ def forward(self, x): def test_constant_fold_reduceL1(self): class NormModule(torch.nn.Module): def forward(self, x): - a = torch.tensor([[1., 2., 3.], [4., 5., 6.]]) + a = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) b = torch.norm(a, p=1, dim=-2) return b + x _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) x = torch.ones(2, 3) - graph, _, __ = self._model_to_graph(NormModule(), (x, ), input_names=["x"], - dynamic_axes={"x": [0, 1]}) + graph, _, __ = self._model_to_graph( + NormModule(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1]} + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::ReduceL1") @@ -188,15 +211,16 @@ def forward(self, x): def test_constant_fold_slice(self): class NarrowModule(torch.nn.Module): def forward(self, x): - a = torch.tensor([[1., 2., 3.], [4., 5., 6.]]) + a = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) b = torch.narrow(a, 0, 0, 1) return b + x _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) x = torch.ones(1, 3) - graph, _, __ = self._model_to_graph(NarrowModule(), (x, ), input_names=["x"], - dynamic_axes={"x": [0, 1]}) + graph, _, __ = self._model_to_graph( + NarrowModule(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1]} + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Slice") @@ -207,15 +231,19 @@ def forward(self, x): def test_constant_fold_slice_index_exceeds_dim(self): class SliceIndexExceedsDimModule(torch.nn.Module): def forward(self, x): - a = torch.tensor([[1., 2., 3.], [4., 5., 6.]]) - b = a[1:10] # index exceeds dimension + a = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) + b = a[1:10] # index exceeds dimension return b + x _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) x = torch.ones(1, 3) - graph, _, __ = self._model_to_graph(SliceIndexExceedsDimModule(), (x, ), input_names=["x"], - dynamic_axes={"x": [0, 1]}) + graph, _, __ = self._model_to_graph( + SliceIndexExceedsDimModule(), + (x,), + input_names=["x"], + dynamic_axes={"x": [0, 1]}, + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Slice") @@ -226,8 +254,8 @@ def forward(self, x): def test_constant_fold_slice_negative_index(self): class SliceNegativeIndexModule(torch.nn.Module): def forward(self, x): - a = torch.tensor([[1., 2., 3.], [4., 5., 6.]]) - b = a[0:-1] # index relative to the end + a = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) + b = a[0:-1] # index relative to the end c = torch.select(a, dim=-1, index=-2) d = torch.select(a, dim=1, index=0) return b + x, c + d @@ -235,8 +263,12 @@ def forward(self, x): _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) x = torch.ones(1, 3) - graph, _, __ = self._model_to_graph(SliceNegativeIndexModule(), (x, ), input_names=["x"], - dynamic_axes={"x": [0, 1]}) + graph, _, __ = self._model_to_graph( + SliceNegativeIndexModule(), + (x,), + input_names=["x"], + dynamic_axes={"x": [0, 1]}, + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Slice") @@ -246,7 +278,7 @@ def forward(self, x): def test_constant_fold_gather(self): class GatherModule(torch.nn.Module): def forward(self, x): - a = torch.tensor([[1., 2., 3.], [4., 5., 6.]]) + a = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) b = torch.select(a, dim=1, index=-2) c = torch.index_select(a, dim=-2, index=torch.tensor([0, 
1])) return b + 1, c + x @@ -256,8 +288,9 @@ def forward(self, x): x = torch.ones(1, 3) model = GatherModule() model(x) - graph, _, __ = self._model_to_graph(GatherModule(), (x, ), input_names=["x"], - dynamic_axes={"x": [0, 1]}) + graph, _, __ = self._model_to_graph( + GatherModule(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1]} + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Gather") @@ -265,15 +298,16 @@ def forward(self, x): def test_constant_fold_unsqueeze(self): class UnsqueezeModule(torch.nn.Module): def forward(self, x): - a = torch.tensor([[1., 2., 3.], [4., 5., 6.]]) + a = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) b = torch.unsqueeze(a, -2) return b + x _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) x = torch.ones(1, 2, 3) - graph, _, __ = self._model_to_graph(UnsqueezeModule(), (x, ), input_names=["x"], - dynamic_axes={"x": [0, 1, 2]}) + graph, _, __ = self._model_to_graph( + UnsqueezeModule(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1, 2]} + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Unsqueeze") @@ -294,8 +328,9 @@ def forward(self, x): _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) x = torch.randn(2, 3, 4, 5, 8, 7) - graph, _, __ = self._model_to_graph(PReluModel(), x, input_names=["x"], - dynamic_axes={"x": [0, 1, 2, 3, 4, 5]}) + graph, _, __ = self._model_to_graph( + PReluModel(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2, 3, 4, 5]} + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Unsqueeze") @@ -306,14 +341,15 @@ def forward(self, x): def test_constant_fold_squeeze_without_axes(self): class SqueezeModule(torch.nn.Module): def forward(self, x): - a = torch.tensor([[[1., 2., 3.], [4., 5., 6.]]]) + a = torch.tensor([[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]]) return torch.squeeze(a) + x + torch.squeeze(a) _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) x = torch.ones(2, 3) - graph, _, __ = self._model_to_graph(SqueezeModule(), (x, ), input_names=["x"], - dynamic_axes={"x": [0, 1]}) + graph, _, __ = self._model_to_graph( + SqueezeModule(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1]} + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Squeeze") self.assertNotEqual(node.kind(), "onnx::Cast") @@ -323,14 +359,15 @@ def forward(self, x): def test_constant_fold_squeeze_with_axes(self): class SqueezeAxesModule(torch.nn.Module): def forward(self, x): - a = torch.tensor([[[1., 2., 3.], [4., 5., 6.]]]) + a = torch.tensor([[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]]) return torch.squeeze(a, dim=-3) + x _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) x = torch.ones(2, 3) - graph, _, __ = self._model_to_graph(SqueezeAxesModule(), (x, ), input_names=["x"], - dynamic_axes={"x": [0, 1]}) + graph, _, __ = self._model_to_graph( + SqueezeAxesModule(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1]} + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Squeeze") @@ -356,8 +393,8 @@ def forward(self, x): # # More commentary at # https://github.com/pytorch/pytorch/pull/18698/files#r340107552 - a = torch.tensor([[1., 2., 3.]]).to(torch.float) - b = torch.tensor([[4., 5., 6.]]).to(torch.float) + a = torch.tensor([[1.0, 2.0, 3.0]]).to(torch.float) + b = torch.tensor([[4.0, 5.0, 6.0]]).to(torch.float) c = torch.cat((a, b), 0) d = b + c return x + d @@ -365,8 +402,9 @@ def forward(self, x): 
_set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) x = torch.ones(2, 3) - graph, _, __ = self._model_to_graph(ConcatModule(), (x, ), input_names=["x"], - dynamic_axes={"x": [0, 1]}) + graph, _, __ = self._model_to_graph( + ConcatModule(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1]} + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Concat") @@ -387,8 +425,12 @@ def forward(self, input, initial_state): _set_operator_export_type(OperatorExportTypes.ONNX) input = torch.randn(5, 3, 7) h0 = torch.randn(1, 3, 3) - graph, _, __ = self._model_to_graph(GruNet(), (input, h0), input_names=["input", "h0"], - dynamic_axes={"input": [0, 1, 2], "h0": [0, 1, 2]}) + graph, _, __ = self._model_to_graph( + GruNet(), + (input, h0), + input_names=["input", "h0"], + dynamic_axes={"input": [0, 1, 2], "h0": [0, 1, 2]}, + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Slice") @@ -413,8 +455,9 @@ def forward(self, A): _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) A = torch.randn(2, 3) - graph, _, __ = self._model_to_graph(MatMulNet(), (A, ), - input_names=["A"], dynamic_axes={"A": [0, 1]}) + graph, _, __ = self._model_to_graph( + MatMulNet(), (A,), input_names=["A"], dynamic_axes={"A": [0, 1]} + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Transpose") @@ -422,7 +465,9 @@ def forward(self, A): def test_constant_fold_reshape(self): class ReshapeModule(torch.nn.Module): - def __init__(self, ): + def __init__( + self, + ): super(ReshapeModule, self).__init__() self.register_buffer("weight", torch.ones(5)) @@ -433,8 +478,9 @@ def forward(self, x): _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) x = torch.randn(4, 5) - graph, _, __ = self._model_to_graph(ReshapeModule(), (x, ), - input_names=["x"], dynamic_axes={"x": [0, 1]}) + graph, _, __ = self._model_to_graph( + ReshapeModule(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1]} + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Reshape") @@ -442,7 +488,9 @@ def forward(self, x): def test_constant_fold_div(self): class Module(torch.nn.Module): - def __init__(self, ): + def __init__( + self, + ): super(Module, self).__init__() self.register_buffer("weight", torch.ones(5)) @@ -453,8 +501,9 @@ def forward(self, x): x = torch.randn(2, 5) _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) - graph, _, __ = self._model_to_graph(Module(), (x, ), input_names=["x"], - dynamic_axes={"x": [0, 1]}) + graph, _, __ = self._model_to_graph( + Module(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1]} + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Div") @@ -462,7 +511,9 @@ def forward(self, x): def test_constant_fold_mul(self): class Module(torch.nn.Module): - def __init__(self, ): + def __init__( + self, + ): super(Module, self).__init__() self.register_buffer("weight", torch.ones(5)) @@ -473,8 +524,9 @@ def forward(self, x): x = torch.randn(2, 5) _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) - graph, _, __ = self._model_to_graph(Module(), (x, ), input_names=["x"], - dynamic_axes={"x": [0, 1]}) + graph, _, __ = self._model_to_graph( + Module(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1]} + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Mul") @@ -482,7 +534,9 @@ def forward(self, x): def test_constant_fold_add(self): class 
Module(torch.nn.Module): - def __init__(self, ): + def __init__( + self, + ): super(Module, self).__init__() self.register_buffer("weight", torch.ones(5)) @@ -494,9 +548,13 @@ def forward(self, x): _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) graph, params_dict, __ = self._model_to_graph( - Module(), (x, ), do_constant_folding=True, + Module(), + (x,), + do_constant_folding=True, operator_export_type=OperatorExportTypes.ONNX, - input_names=["x"], dynamic_axes={"x": [0, 1]}) + input_names=["x"], + dynamic_axes={"x": [0, 1]}, + ) for node in graph.nodes(): self.assertTrue(node.kind() != "onnx::Add") self.assertEqual(len(list(graph.nodes())), 1) @@ -508,7 +566,9 @@ def forward(self, x): def test_constant_fold_sub(self): class Module(torch.nn.Module): - def __init__(self, ): + def __init__( + self, + ): super(Module, self).__init__() self.register_buffer("weight", torch.ones(5)) @@ -520,8 +580,13 @@ def forward(self, x): _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) graph, params_dict, __ = self._model_to_graph( - Module(), (x, ), do_constant_folding=True, - operator_export_type=OperatorExportTypes.ONNX, input_names=["x"], dynamic_axes={"x": [0, 1]}) + Module(), + (x,), + do_constant_folding=True, + operator_export_type=OperatorExportTypes.ONNX, + input_names=["x"], + dynamic_axes={"x": [0, 1]}, + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Sub") self.assertEqual(len(list(graph.nodes())), 1) @@ -533,7 +598,9 @@ def forward(self, x): def test_constant_fold_sqrt(self): class Module(torch.nn.Module): - def __init__(self, ): + def __init__( + self, + ): super(Module, self).__init__() self.register_buffer("weight", torch.ones(5)) @@ -544,7 +611,9 @@ def forward(self, x): x = torch.randn(2, 5) _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) - graph, _, __ = self._model_to_graph(Module(), (x, ), input_names=["x"], dynamic_axes={"x": [0, 1]}) + graph, _, __ = self._model_to_graph( + Module(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1]} + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Sqrt") self.assertEqual(len(list(graph.nodes())), 1) @@ -562,7 +631,9 @@ def forward(self, x): x = torch.randn(2, 5) _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) - graph, _, __ = self._model_to_graph(ShapeModule(), (x, ), input_names=["x"], dynamic_axes={"x": [0, 1]}) + graph, _, __ = self._model_to_graph( + ShapeModule(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1]} + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Shape") @@ -572,14 +643,16 @@ def test_verbose(self): class MyModule(torch.nn.Module): def forward(self, input): return torch.exp(input) + x = torch.randn(3, 4) def is_model_stripped(f, verbose=None): if verbose is None: torch.onnx.export(MyModule(), x, f, opset_version=self.opset_version) else: - torch.onnx.export(MyModule(), x, f, verbose=verbose, - opset_version=self.opset_version) + torch.onnx.export( + MyModule(), x, f, verbose=verbose, opset_version=self.opset_version + ) model = onnx.load(io.BytesIO(f.getvalue())) model_strip = copy.copy(model) onnx.helper.strip_doc_string(model_strip) @@ -595,12 +668,55 @@ def test_error_on_data_parallel(self): model = torch.nn.DataParallel(torch.nn.ReflectionPad2d((1, 2, 3, 4))) x = torch.randn(1, 2, 3, 4) f = io.BytesIO() - with self.assertRaisesRegex(ValueError, - "torch.nn.DataParallel is not supported by 
ONNX " - "exporter, please use 'attribute' module to " - "unwrap model from torch.nn.DataParallel. Try "): + with self.assertRaisesRegex( + ValueError, + "torch.nn.DataParallel is not supported by ONNX " + "exporter, please use 'attribute' module to " + "unwrap model from torch.nn.DataParallel. Try ", + ): torch.onnx.export(model, x, f, opset_version=self.opset_version) + @skipIfUnsupportedMinOpsetVersion(11) + def test_sequence_dim(self): + class Module(torch.nn.Module): + def forward(self, x, y): + return [x, y] + + model = Module() + # Export with scripting to keep output as Sequence type. + # Tracing unpacks the list. + script_model = torch.jit.script(model) + x = torch.randn(2, 3) + + # Case 1: dynamic axis + f = io.BytesIO() + y = torch.randn(2, 3) + torch.onnx.export( + script_model, + (x, y), + f, + opset_version=self.opset_version, + input_names=["x", "y"], + dynamic_axes={"y": [1]}, + ) + onnx_model = onnx.load(io.BytesIO(f.getvalue())) + loop_output_value_info_proto = onnx_model.graph.output[0] + ref_value_info_proto = onnx.helper.make_tensor_sequence_value_info( + loop_output_value_info_proto.name, 1, [2, None] + ) + self.assertEqual(loop_output_value_info_proto, ref_value_info_proto) + + # Case 2: no dynamic axes. + f = io.BytesIO() + y = torch.randn(2, 3) + torch.onnx.export(script_model, (x, y), f, opset_version=self.opset_version) + onnx_model = onnx.load(io.BytesIO(f.getvalue())) + loop_output_value_info_proto = onnx_model.graph.output[0] + ref_value_info_proto = onnx.helper.make_tensor_sequence_value_info( + loop_output_value_info_proto.name, 1, [2, 3] + ) + self.assertEqual(loop_output_value_info_proto, ref_value_info_proto) + def test_export_mode(self): class MyModule(torch.nn.Module): def forward(self, x): @@ -614,16 +730,26 @@ def forward(self, x): # set mode to in inference mode and export in training mode model.eval() old_state = model.training - torch.onnx.export(model, (x,), f, - opset_version=self.opset_version, training=torch.onnx.TrainingMode.TRAINING) + torch.onnx.export( + model, + (x,), + f, + opset_version=self.opset_version, + training=torch.onnx.TrainingMode.TRAINING, + ) # verify that the model state is preserved self.assertEqual(model.training, old_state) # set mode to training mode and export in inference mode model.train() old_state = model.training - torch.onnx.export(model, (x,), f, - opset_version=self.opset_version, training=torch.onnx.TrainingMode.EVAL) + torch.onnx.export( + model, + (x,), + f, + opset_version=self.opset_version, + training=torch.onnx.TrainingMode.EVAL, + ) # verify that the model state is preserved self.assertEqual(model.training, old_state) @@ -641,7 +767,9 @@ class M(torch.nn.Module): def __init__(self, num_layers): super().__init__() self.num_layers = num_layers - self.lns = torch.nn.ModuleList([torch.nn.LayerNorm(3, eps=i) for i in range(num_layers)]) + self.lns = torch.nn.ModuleList( + [torch.nn.LayerNorm(3, eps=i) for i in range(num_layers)] + ) self.celu1 = torch.nn.CELU(1.0) self.celu2 = torch.nn.CELU(2.0) self.dropout = N(0.5) @@ -662,8 +790,17 @@ def forward(self, x, y, z): # Model export in inference mode will remove dropout node, # thus the dropout module no longer exist in graph. 
f = io.BytesIO() - torch.onnx.export(M(3), (x, y, z), f, opset_version=self.opset_version, - export_modules_as_functions={torch.nn.CELU, torch.nn.Dropout, torch.nn.LayerNorm}) + torch.onnx.export( + M(3), + (x, y, z), + f, + opset_version=self.opset_version, + export_modules_as_functions={ + torch.nn.CELU, + torch.nn.Dropout, + torch.nn.LayerNorm, + }, + ) onnx_model = onnx.load(io.BytesIO(f.getvalue())) @@ -672,11 +809,11 @@ def forward(self, x, y, z): celu_funcs = [f for f in funcs if f.name == "CELU"] self.assertEqual(len(celu_funcs), 1) self.assertEqual(celu_funcs[0].domain, "torch.nn.modules.activation") - self.assertEqual(len(celu_funcs[0].attribute), 1) + self.assertEqual(len(celu_funcs[0].attribute), 3) ln_funcs = [f for f in funcs if f.name == "LayerNorm"] self.assertEqual(len(ln_funcs), 1) self.assertEqual(ln_funcs[0].domain, "torch.nn.modules.normalization") - self.assertEqual(len(ln_funcs[0].attribute), 1) + self.assertEqual(len(ln_funcs[0].attribute), 3) # Check local function nodes nodes = onnx_model.graph.node @@ -684,15 +821,20 @@ def forward(self, x, y, z): ln_ns = [n for n in nodes if n.op_type == "LayerNorm"] self.assertEqual(len(celu_ns), 2) self.assertEqual(celu_ns[0].domain, "torch.nn.modules.activation") - self.assertEqual(len(celu_ns[0].attribute), 1) + self.assertEqual(len(celu_ns[0].attribute), 3) self.assertEqual(len(ln_ns), 3) self.assertEqual(ln_ns[0].domain, "torch.nn.modules.normalization") - self.assertEqual(len(ln_ns[0].attribute), 1) + self.assertEqual(len(ln_ns[0].attribute), 3) # Export specified modules. f = io.BytesIO() - torch.onnx.export(M(3), (x, y, z), f, opset_version=self.opset_version, - export_modules_as_functions={torch.nn.CELU}) + torch.onnx.export( + M(3), + (x, y, z), + f, + opset_version=self.opset_version, + export_modules_as_functions={torch.nn.CELU}, + ) onnx_model = onnx.load(io.BytesIO(f.getvalue())) funcs = onnx_model.functions @@ -701,8 +843,13 @@ def forward(self, x, y, z): # Export with empty specified modules. Normal export. f = io.BytesIO() - torch.onnx.export(M(3), (x, y, z), f, opset_version=self.opset_version, - export_modules_as_functions=set()) + torch.onnx.export( + M(3), + (x, y, z), + f, + opset_version=self.opset_version, + export_modules_as_functions=set(), + ) onnx_model = onnx.load(io.BytesIO(f.getvalue())) funcs = onnx_model.functions @@ -710,8 +857,13 @@ def forward(self, x, y, z): # Export all modules. Should contain {M, CELU, LayerNorm}. 
f = io.BytesIO() - torch.onnx.export(M(3), (x, y, z), f, opset_version=self.opset_version, - export_modules_as_functions=True) + torch.onnx.export( + M(3), + (x, y, z), + f, + opset_version=self.opset_version, + export_modules_as_functions=True, + ) onnx_model = onnx.load(io.BytesIO(f.getvalue())) funcs = onnx_model.functions @@ -741,8 +893,13 @@ def forward(self, x, y, z): z = torch.randn(2, 3) f = io.BytesIO() - torch.onnx.export(M(3), (x, y, z), f, opset_version=self.opset_version, - export_modules_as_functions={NWithOverloads}) + torch.onnx.export( + M(3), + (x, y, z), + f, + opset_version=self.opset_version, + export_modules_as_functions={NWithOverloads}, + ) onnx_model = onnx.load(io.BytesIO(f.getvalue())) funcs = onnx_model.functions @@ -763,13 +920,73 @@ def forward(self, x): x = torch.randn(4, 5) f = io.BytesIO() - torch.onnx.export(M(), (x,), f, export_modules_as_functions=True, - opset_version=self.opset_version, do_constant_folding=False) + torch.onnx.export( + M(), + (x,), + f, + export_modules_as_functions=True, + opset_version=self.opset_version, + do_constant_folding=False, + ) onnx_model = onnx.load(io.BytesIO(f.getvalue())) funcs = onnx_model.functions self.assertIn("M", [f.name for f in funcs]) + @skipIfUnsupportedMinOpsetVersion(15) + def test_local_function_predefined_attributes(self): + class M(torch.nn.Module): + num_layers: int + + def __init__(self, num_layers): + super().__init__() + self.num_layers = num_layers + self.lns = torch.nn.ModuleList( + [torch.nn.LayerNorm(3, eps=1e-4) for _ in range(num_layers)] + ) + + def forward(self, x): + for ln in self.lns: + x = ln(x) + return x + + x = torch.randn(2, 3) + f = io.BytesIO() + model = M(3) + torch.onnx.export( + model, + (x,), + f, + export_modules_as_functions=True, + opset_version=self.opset_version, + ) + + onnx_model = onnx.load(io.BytesIO(f.getvalue())) + funcs = onnx_model.functions + m_funcs = [fn for fn in funcs if fn.name == "M"] + self.assertEqual(m_funcs[0].attribute, ["num_layers"]) + ln_funcs = [fn for fn in funcs if fn.name == "LayerNorm"] + self.assertEqual(ln_funcs[0].attribute, ["eps", "elementwise_affine"]) + + from onnx import helper + + m_node = [n for n in onnx_model.graph.node if n.op_type == "M"] + self.assertEqual( + m_node[0].attribute[0], + helper.make_attribute("num_layers", model.num_layers), + ) + + ln_nodes = [n for n in m_funcs[0].node if n.op_type == "LayerNorm"] + expected_ln_attrs = [ + helper.make_attribute( + "elementwise_affine", model.lns[0].elementwise_affine + ), + helper.make_attribute("eps", model.lns[0].eps), + ] + for ln_node in ln_nodes: + self.assertIn(ln_node.attribute[0], expected_ln_attrs) + self.assertIn(ln_node.attribute[1], expected_ln_attrs) + def test_aten_fallthrough(self): # Test aten export of op with no symbolic class Module(torch.nn.Module): @@ -778,9 +995,13 @@ def forward(self, x): x = torch.randn(2, 3, 4) _set_opset_version(self.opset_version) - graph, _, __ = self._model_to_graph(Module(), (x, ), - operator_export_type=OperatorExportTypes.ONNX_FALLTHROUGH, - input_names=["x"], dynamic_axes={"x": [0, 1, 2]}) + graph, _, __ = self._model_to_graph( + Module(), + (x,), + operator_export_type=OperatorExportTypes.ONNX_FALLTHROUGH, + input_names=["x"], + dynamic_axes={"x": [0, 1, 2]}, + ) iter = graph.nodes() self.assertEqual(next(iter).kind(), "aten::erfc") @@ -812,25 +1033,33 @@ def forward(self, input, other): x = torch.randn(2, 3, 4, requires_grad=False) y = torch.randn(2, 3, 4, requires_grad=False) model = FooModel() - graph, _, __ = 
self._model_to_graph(model, (x, y), - operator_export_type=torch.onnx.OperatorExportTypes.ONNX_FALLTHROUGH, - input_names=["x", "y"], - dynamic_axes={"x": [0, 1, 2], "y": [0, 1, 2]}) + graph, _, __ = self._model_to_graph( + model, + (x, y), + operator_export_type=torch.onnx.OperatorExportTypes.ONNX_FALLTHROUGH, + input_names=["x", "y"], + dynamic_axes={"x": [0, 1, 2], "y": [0, 1, 2]}, + ) iter = graph.nodes() self.assertEqual(next(iter).kind(), "custom_namespace::custom_op") def test_custom_opsets_gelu(self): self.addCleanup(unregister_custom_op_symbolic, "::gelu", 1) - def gelu(g, self): + def gelu(g, self, approximate): return g.op("com.microsoft::Gelu", self).setType(self.type()) register_custom_op_symbolic("::gelu", gelu, 1) - model = torch.nn.GELU() + model = torch.nn.GELU(approximate="none") x = torch.randn(3, 3) f = io.BytesIO() - torch.onnx.export(model, (x, ), f, - opset_version=self.opset_version, custom_opsets={"com.microsoft": 1}) + torch.onnx.export( + model, + (x,), + f, + opset_version=self.opset_version, + custom_opsets={"com.microsoft": 1}, + ) graph = onnx.load(io.BytesIO(f.getvalue())) self.assertEqual(graph.graph.node[0].op_type, "Gelu") @@ -838,18 +1067,17 @@ def gelu(g, self): self.assertEqual(graph.opset_import[1].domain, "com.microsoft") self.assertEqual(graph.opset_import[1].version, 1) - def test_register_aten_custom_op_symbolic(self): self.addCleanup(unregister_custom_op_symbolic, "aten::gelu", 1) - def gelu(g, self): + def gelu(g, self, approximate): return g.op("com.microsoft::Gelu", self).setType(self.type()) register_custom_op_symbolic("aten::gelu", gelu, 1) - model = torch.nn.GELU() + model = torch.nn.GELU(approximate="none") x = torch.randn(3, 3) f = io.BytesIO() - torch.onnx.export(model, (x, ), f, opset_version=self.opset_version) + torch.onnx.export(model, (x,), f, opset_version=self.opset_version) graph = onnx.load(io.BytesIO(f.getvalue())) self.assertEqual(graph.graph.node[0].op_type, "Gelu") @@ -867,8 +1095,13 @@ def inverse(g, self): model = CustomInverse() x = torch.randn(2, 3, 3) f = io.BytesIO() - torch.onnx.export(model, (x, ), f, - opset_version=self.opset_version, custom_opsets={"com.microsoft": 1}) + torch.onnx.export( + model, + (x,), + f, + opset_version=self.opset_version, + custom_opsets={"com.microsoft": 1}, + ) graph = onnx.load(io.BytesIO(f.getvalue())) self.assertEqual(graph.graph.node[0].op_type, "Inverse") @@ -878,51 +1111,20 @@ def inverse(g, self): def test_onnx_fallthrough(self): # Test aten export of op with symbolic for aten - x = torch.randn(100, 128) - y = torch.randn(100, 128) - model = torch.nn.CosineSimilarity(dim=1, eps=1e-6) - - graph, _, __ = self._model_to_graph(model, (x, y), - operator_export_type=OperatorExportTypes.ONNX_FALLTHROUGH, - input_names=["x", "y"], - dynamic_axes={"x": [0, 1], "y": [0, 1]}) - iter = graph.nodes() - self.assertEqual(next(iter).kind(), "onnx::Constant") - self.assertEqual(next(iter).kind(), "onnx::Constant") - self.assertEqual(next(iter).kind(), "aten::cosine_similarity") - - def test_quantized_fallthrough(self): - # Test Quantized op - class QModule(torch.nn.Module): - def __init__(self): - super(QModule, self).__init__() - self.quant1 = torch.ao.quantization.QuantStub() - self.dequant = torch.ao.quantization.DeQuantStub() - + class Module(torch.nn.Module): def forward(self, x): - res = self.quant1(x) - return self.dequant(res) - - model = QModule() - torch.backends.quantized.engine = "qnnpack" - pt_inputs = (torch.randn(1, 2, 3, 4)) - model.qconfig = torch.ao.quantization.default_qconfig - 
q_model = torch.ao.quantization.prepare(model, inplace=False) - q_model = torch.ao.quantization.convert(q_model, inplace=False) - - q_model.eval() - - graph, _, __ = self._model_to_graph(q_model, pt_inputs, - operator_export_type=OperatorExportTypes.ONNX_FALLTHROUGH, - input_names=["pt_inputs"], - dynamic_axes={"pt_inputs": [0, 1, 2, 3]}) + return torch.digamma(x) + x = torch.randn(100, 128) + graph, _, __ = self._model_to_graph( + Module(), + (x,), + operator_export_type=OperatorExportTypes.ONNX_FALLTHROUGH, + input_names=["x"], + dynamic_axes={"x": [0, 1]}, + ) iter = graph.nodes() - self.assertEqual(next(iter).kind(), "onnx::Constant") - self.assertEqual(next(iter).kind(), "onnx::Constant") - self.assertEqual(next(iter).kind(), "onnx::Constant") - self.assertEqual(next(iter).kind(), "aten::quantize_per_tensor") - self.assertEqual(next(iter).kind(), "aten::dequantize") + self.assertEqual(next(iter).kind(), "aten::digamma") # prim::ListConstruct is exported as onnx::SequenceConstruct for opset >= 11 @skipIfUnsupportedMaxOpsetVersion(10) @@ -940,9 +1142,13 @@ def forward(self, x): x = torch.tensor([2]) model = PrimModule() model.eval() - graph, _, __ = self._model_to_graph(model, (x,), - operator_export_type=OperatorExportTypes.ONNX_FALLTHROUGH, - input_names=["x"], dynamic_axes={"x": [0]}) + graph, _, __ = self._model_to_graph( + model, + (x,), + operator_export_type=OperatorExportTypes.ONNX_FALLTHROUGH, + input_names=["x"], + dynamic_axes={"x": [0]}, + ) iter = graph.nodes() self.assertEqual(next(iter).kind(), "prim::ListConstruct") @@ -963,8 +1169,9 @@ def forward(self, input): model = Custom() batch = torch.FloatTensor(1, 3) - graph, _, _ = self._model_to_graph(model, batch, - input_names=["batch"], dynamic_axes={"batch": [0, 1]}) + graph, _, _ = self._model_to_graph( + model, batch, input_names=["batch"], dynamic_axes={"batch": [0, 1]} + ) iter = graph.nodes() self.assertEqual(next(iter).kind(), "CustomNamespace::Custom") @@ -977,7 +1184,7 @@ def forward(ctx, input): @staticmethod def backward(ctx, grad_output): - input, = ctx.saved_tensors + (input,) = ctx.saved_tensors grad_input = grad_output.clone() grad_input[input < 0] = 0 return grad_input @@ -989,17 +1196,54 @@ def forward(self, input): model = Custom() batch = torch.FloatTensor(1, 3) - graph, _, _ = self._model_to_graph(model, batch, - operator_export_type=OperatorExportTypes.ONNX_FALLTHROUGH, - input_names=["batch"], dynamic_axes={"batch": [0, 1]}) + graph, _, _ = self._model_to_graph( + model, + batch, + operator_export_type=OperatorExportTypes.ONNX_FALLTHROUGH, + input_names=["batch"], + dynamic_axes={"batch": [0, 1]}, + ) iter = graph.nodes() self.assertEqual(next(iter).kind(), "prim::PythonOp") + def test_autograd_module_name(self): + class CustomFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, input): + ctx.save_for_backward(input) + return input.clamp(min=0) + + @staticmethod + def backward(ctx, grad_output): + (input,) = ctx.saved_tensors + grad_input = grad_output.clone() + grad_input[input < 0] = 0 + return grad_input + + class Custom(torch.nn.Module): + def forward(self, input): + return CustomFunction.apply(input) + CustomFunction2.apply(input) + + model = Custom() + batch = torch.FloatTensor(1, 3) + + graph, _, _ = self._model_to_graph( + model, batch, input_names=["batch"], dynamic_axes={"batch": [0, 1]} + ) + iter = graph.nodes() + autograd1 = next(iter) + autograd2 = next(iter) + self.assertEqual(autograd1.kind(), "prim::PythonOp") + self.assertEqual(autograd2.kind(), "prim::PythonOp") + 
self.assertNotEqual(autograd1.s("module"), autograd2.s("module")) + def test_unused_initializers(self): class Model(torch.nn.Module): def __init__(self): super(Model, self).__init__() - self.conv2 = torch.nn.ConvTranspose2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(1, 1)) + self.conv2 = torch.nn.ConvTranspose2d( + 16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(1, 1) + ) self.k_proj = torch.nn.Linear(5, 5, bias=True) def forward(self, x): @@ -1009,10 +1253,14 @@ def forward(self, x): x = torch.randn(20, 16, 50, 100) _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) - _, params_dict, __ = self._model_to_graph(Model(), (x, ), do_constant_folding=False, - operator_export_type=OperatorExportTypes.ONNX, - input_names=["x"], - dynamic_axes={"x": [0, 1, 2, 3]}) + _, params_dict, __ = self._model_to_graph( + Model(), + (x,), + do_constant_folding=False, + operator_export_type=OperatorExportTypes.ONNX, + input_names=["x"], + dynamic_axes={"x": [0, 1, 2, 3]}, + ) self.assertEqual(len(params_dict), 2) @@ -1020,7 +1268,9 @@ def test_scripting_param(self): class MyModule(torch.nn.Module): def __init__(self): super(MyModule, self).__init__() - self.conv = torch.nn.Conv2d(3, 16, kernel_size=1, stride=2, padding=3, bias=True) + self.conv = torch.nn.Conv2d( + 3, 16, kernel_size=1, stride=2, padding=3, bias=True + ) self.bn = torch.nn.BatchNorm2d(16, affine=True) def forward(self, x): @@ -1032,16 +1282,23 @@ def forward(self, x): x = torch.randn(10, 3, 128, 128) _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) - graph, _, __ = self._model_to_graph(model, (x,), do_constant_folding=True, - operator_export_type=OperatorExportTypes.ONNX, - training=torch.onnx.TrainingMode.TRAINING, - input_names=["x"], dynamic_axes={"x": [0, 1, 2, 3]}) + graph, _, __ = self._model_to_graph( + model, + (x,), + do_constant_folding=True, + operator_export_type=OperatorExportTypes.ONNX, + training=torch.onnx.TrainingMode.TRAINING, + input_names=["x"], + dynamic_axes={"x": [0, 1, 2, 3]}, + ) graph_input_params = [param.debugName() for param in graph.inputs()] for item in dict(model.named_parameters()): self.assertIn( - item, graph_input_params, - "Graph parameter names does not match model parameters.") + item, + graph_input_params, + "Graph parameter names does not match model parameters.", + ) def test_modifying_params(self): class MyModel(torch.nn.Module): @@ -1055,13 +1312,19 @@ def forward(self, x): return y x = torch.tensor([1, 2]) + # Move import to local as caffe2 backend requires additional build flag, + # and is only used in this test case. 
+ import caffe2.python.onnx.backend as backend + verify(MyModel(), x, backend, do_constant_folding=False) def test_fuse_conv_bn(self): class Fuse(torch.nn.Module): def __init__(self): super(Fuse, self).__init__() - self.conv = torch.nn.Conv2d(3, 2, kernel_size=1, stride=2, padding=3, bias=True) + self.conv = torch.nn.Conv2d( + 3, 2, kernel_size=1, stride=2, padding=3, bias=True + ) self.bn = torch.nn.BatchNorm2d(2) def forward(self, x): @@ -1069,9 +1332,13 @@ def forward(self, x): return self.bn(out) x = torch.randn(2, 3, 2, 2, requires_grad=True) - graph, _, __ = self._model_to_graph(Fuse(), (x, ), - training=TrainingMode.EVAL, input_names=["x"], - dynamic_axes={"x": [0, 1, 2, 3]}) + graph, _, __ = self._model_to_graph( + Fuse(), + (x,), + training=TrainingMode.EVAL, + input_names=["x"], + dynamic_axes={"x": [0, 1, 2, 3]}, + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::BatchNormalization") self.assertEqual(node.kind(), "onnx::Conv") @@ -1081,17 +1348,20 @@ def forward(self, x): def test_fuse_resnet18(self): model = torchvision.models.resnet18(pretrained=False) x = torch.randn(2, 3, 224, 224, requires_grad=True) - graph, _, __ = self._model_to_graph(model, (x, ), - training=TrainingMode.EVAL, - input_names=["x"], dynamic_axes={"x": [0, 1, 2, 3]}) + graph, _, __ = self._model_to_graph( + model, + (x,), + training=TrainingMode.EVAL, + input_names=["x"], + dynamic_axes={"x": [0, 1, 2, 3]}, + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::BatchNormalization") def test_onnx_function_substitution_pass(self): - @torch.jit.script - def f(x : torch.Tensor, y : torch.Tensor): + def f(x: torch.Tensor, y: torch.Tensor): z = x - y return x + z @@ -1106,16 +1376,22 @@ def forward(self, x, y): input_2 = torch.tensor(12) _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) - graph, _, __ = self._model_to_graph(MyModule(), (input_1, input_2), do_constant_folding=True, - operator_export_type=OperatorExportTypes.ONNX, - input_names=["input_1", "input_2"], - dynamic_axes={"input_1": [0], "input_2": [0]}) + graph, _, __ = self._model_to_graph( + MyModule(), + (input_1, input_2), + do_constant_folding=True, + operator_export_type=OperatorExportTypes.ONNX, + input_names=["input_1", "input_2"], + dynamic_axes={"input_1": [0], "input_2": [0]}, + ) # Check that the prim::Constant node in the graph for representing the # scripted function `f` is removed and the following prim::CallFunction # is replced by inline graph, with onnx::Sub and onnx::Add nodes. for node in graph.nodes(): self.assertNotEqual(node.kind(), "prim::Constant") - self.assertEqual(len(list(graph.nodes())), 2) # onnx::Sub and onnx::Add nodes only. + self.assertEqual( + len(list(graph.nodes())), 2 + ) # onnx::Sub and onnx::Add nodes only. 
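# ---------------------------------------------------------------------------
# Hedged aside (not part of this diff): a minimal, self-contained sketch of
# the conv/batch-norm folding behavior that test_fuse_conv_bn and
# test_fuse_resnet18 above exercise. Assuming an environment with `torch` and
# `onnx` installed, exporting a Conv2d+BatchNorm2d pair in eval mode is
# expected to fold the BatchNorm into the Conv weights, so the serialized
# graph should contain no BatchNormalization nodes. The module and function
# names here (ConvBN, count_batch_norm_nodes) are illustrative only.
import io

import onnx
import torch


class ConvBN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv2d(3, 8, kernel_size=3, bias=True)
        self.bn = torch.nn.BatchNorm2d(8)

    def forward(self, x):
        return self.bn(self.conv(x))


def count_batch_norm_nodes() -> int:
    # Eval mode (the exporter default) enables the conv/bn fusion pass.
    model = ConvBN().eval()
    f = io.BytesIO()
    torch.onnx.export(model, (torch.randn(1, 3, 16, 16),), f)
    graph = onnx.load(io.BytesIO(f.getvalue())).graph
    # Expected to be 0 when fusion succeeds; the tests above assert the same
    # property by scanning graph.nodes() for "onnx::BatchNormalization".
    return sum(1 for node in graph.node if node.op_type == "BatchNormalization")
# ---------------------------------------------------------------------------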
def test_onnx_value_name(self): class MyModule(torch.nn.Module): @@ -1139,9 +1415,13 @@ def forward(self, x): f = io.BytesIO() model.eval() - torch.onnx.export(model, (x,), f, - opset_version=self.opset_version, - keep_initializers_as_inputs=True) + torch.onnx.export( + model, + (x,), + f, + opset_version=self.opset_version, + keep_initializers_as_inputs=True, + ) graph = onnx.load(io.BytesIO(f.getvalue())) self.assertEqual(graph.graph.input[1].name, "in_weight") self.assertEqual(graph.graph.input[2].name, "in_bias") @@ -1164,7 +1444,7 @@ def forward(self, x): module = RenamedIntermediateModule() - g, p, o = utils._model_to_graph(module, torch.ones(1, 10), output_names=['y']) + g, p, o = utils._model_to_graph(module, torch.ones(1, 10), output_names=["y"]) renamed_intermediate = 0 for n in g.nodes(): for v in n.inputs(): @@ -1172,6 +1452,101 @@ def forward(self, x): renamed_intermediate += 1 self.assertEqual(renamed_intermediate, 2) + def _test_deduplicate_initializers(self, torchscript=False): + class MyModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.layer1 = torch.nn.Linear(3, 3) + self.layer2 = torch.nn.Linear(3, 3) + + # Reusing layers. + self.layer3 = self.layer1 + + # Reusing parameters. + self.layer2.weight = self.layer1.weight + self.layer1.bias = self.layer2.bias + + # Parameter with different tensors equal in value. + self.param1 = torch.nn.Parameter(torch.tensor([1.0, 2.0, 3.0])) + self.param2 = torch.nn.Parameter(torch.tensor([1.0, 2.0, 3.0])) + + def forward(self, x): + return ( + self.layer3(self.layer2(self.layer1(x))) + self.param1 + self.param2 + ) + + model = torch.jit.script(MyModule()) if torchscript else MyModule() + + x = torch.randn(3, 3) + param_name_set = set([k for k, _ in model.named_parameters()]) + + # Test training mode. + model.train() + f = io.BytesIO() + torch.onnx.export( + model, + (x,), + f, + training=TrainingMode.TRAINING, + opset_version=self.opset_version, + ) + graph = onnx.load(io.BytesIO(f.getvalue())) + self.assertSetEqual( + set([i.name for i in graph.graph.initializer]), param_name_set + ) + + model.train() + f = io.BytesIO() + torch.onnx.export( + model, + (x,), + f, + training=TrainingMode.PRESERVE, + opset_version=self.opset_version, + ) + graph = onnx.load(io.BytesIO(f.getvalue())) + self.assertSetEqual( + set([i.name for i in graph.graph.initializer]), param_name_set + ) + + # Test eval mode. 
+ model.eval() + f = io.BytesIO() + torch.onnx.export(model, (x,), f, opset_version=self.opset_version) + graph = onnx.load(io.BytesIO(f.getvalue())) + param_name_set.remove("param2") + self.assertSetEqual( + set([i.name for i in graph.graph.initializer]), param_name_set + ) + + def test_deduplicate_initializers(self): + self._test_deduplicate_initializers(torchscript=False) + + def test_deduplicate_initializers_torchscript(self): + self._test_deduplicate_initializers(torchscript=True) + + @skipIfNoCuda + def test_deduplicate_initializers_diff_devices(self): + class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.w_cpu = torch.nn.Parameter( + torch.ones(3, device=torch.device("cpu")) + ) + self.w_cuda = torch.nn.Parameter( + torch.ones(3, device=torch.device("cuda")) + ) + + def forward(self, x, y): + return x + self.w_cpu, y + self.w_cuda + + x = torch.randn(3, 3, device=torch.device("cpu")) + y = torch.randn(3, 3, device=torch.device("cuda")) + f = io.BytesIO() + torch.onnx.export(Model(), (x, y), f, opset_version=self.opset_version) + graph = onnx.load(io.BytesIO(f.getvalue())) + self.assertSetEqual(set([i.name for i in graph.graph.initializer]), {"w_cpu"}) + def test_duplicated_output_node(self): class DuplicatedOutputNet(torch.nn.Module): def __init__(self, input_size, num_classes): @@ -1195,18 +1570,21 @@ def forward(self, input0, input1): "output-1": {0: "output-1_dim0", 1: "output-1_dim1"}, "output-2": {0: "output-2_dim0", 1: "output-2_dim1"}, "output-3": {0: "output-3_dim0", 1: "output-3_dim1"}, - "output-4": {0: "output-4_dim0", 1: "output-4_dim1"}} - - torch.onnx.export(pt_model, - (x, x), - f, - input_names=["input0", "input1"], - output_names=["output-0", "output-1", "output-2", "output-3", "output-4"], - do_constant_folding=False, - training=torch.onnx.TrainingMode.TRAINING, - dynamic_axes=dynamic_axes, - verbose=True, - keep_initializers_as_inputs=True) + "output-4": {0: "output-4_dim0", 1: "output-4_dim1"}, + } + + torch.onnx.export( + pt_model, + (x, x), + f, + input_names=["input0", "input1"], + output_names=["output-0", "output-1", "output-2", "output-3", "output-4"], + do_constant_folding=False, + training=torch.onnx.TrainingMode.TRAINING, + dynamic_axes=dynamic_axes, + verbose=True, + keep_initializers_as_inputs=True, + ) graph = onnx.load(io.BytesIO(f.getvalue())) self.assertEqual(graph.graph.input[0].name, "input0") @@ -1219,6 +1597,37 @@ def forward(self, input0, input1): self.assertEqual(graph.graph.node[3].op_type, "Gemm") self.assertEqual(graph.graph.node[4].op_type, "Identity") + def test_bad_symbolic_registration(self): + _onnx_opset_version = 9 + + @parse_args("v") + def cat(g, tensor_list, dim): + tensors = _unpack_list(tensor_list) + return g.op("Concat", *tensors, axis_i=dim) + + register_custom_op_symbolic("::cat", cat, _onnx_opset_version) + + class CatModel(torch.nn.Module): + def forward(self, x): + return torch.cat((x, x, x), 0) + + model = CatModel() + x = torch.randn(2, 3) + f = io.BytesIO() + self.assertExpectedRaisesInline( + AssertionError, + lambda: torch.onnx.export( + model, (x,), f, opset_version=_onnx_opset_version + ), + ( + "A mismatch between the number of arguments (2) and their descriptors (1) was found at symbolic function " + "'cat'. If you believe this is not due to custom symbolic implementation within your code or an external " + "library, please file an issue at https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml to " + "report this bug." 
+ ), + ) + unregister_custom_op_symbolic("::cat", _onnx_opset_version) + class TestUtilityFuns_opset10(TestUtilityFuns_opset9): opset_version = 10 diff --git a/test/onnx/test_verify.py b/test/onnx/test_verify.py index 2884fa86472c..083b76f3bfc8 100644 --- a/test/onnx/test_verify.py +++ b/test/onnx/test_verify.py @@ -1,12 +1,12 @@ # Owner(s): ["module: onnx"] +from test_pytorch_common import TestCase, run_tests +from verify import verify + +import caffe2.python.onnx.backend as backend import torch from torch.autograd import Function from torch.nn import Module, Parameter -import caffe2.python.onnx.backend as backend -from verify import verify - -from test_pytorch_common import TestCase, run_tests class TestVerify(TestCase): diff --git a/test/onnx/verify.py b/test/onnx/verify.py index 4897a8f43f4a..f8f7b73f4b11 100644 --- a/test/onnx/verify.py +++ b/test/onnx/verify.py @@ -1,14 +1,13 @@ -import torch -import torch.jit -import torch.onnx +import difflib +import io +import numpy as np import onnx import onnx.helper -import numpy as np - -import difflib -import io +import torch +import torch.jit +import torch.onnx def colonize(msg, sep=": "): @@ -39,6 +38,7 @@ def __init__(self, msg, rtol=1e-3, atol=1e-5): # can be used class ShortCircuit(Exception): pass + self.exc_class = ShortCircuit def requireAlmostEqual(self, x, y, msg=None): @@ -67,8 +67,9 @@ def almostEqualAndThen(self, x, y, msg, k): """ if isinstance(x, np.ndarray) and isinstance(y, np.ndarray): try: - np.testing.assert_allclose(x, y, rtol=self.rtol, atol=self.atol, - equal_nan=True, verbose=True) + np.testing.assert_allclose( + x, y, rtol=self.rtol, atol=self.atol, equal_nan=True, verbose=True + ) except AssertionError as e: raise k("{}{}".format(colonize(msg), str(e).lstrip())) @@ -117,8 +118,11 @@ def equalAndThen(self, x, y, msg, k): if len(sx) > 40 or len(sy) > 40 or "\n" in sx or "\n" in sy: # long form l = "=" * 50 - k("\n{}The value\n{}\n{}\n{}\n\ndoes not equal\n\n{}\n{}\n{}" - .format(colonize(msg, ":\n"), l, sx, l, l, sy, l)) + k( + "\n{}The value\n{}\n{}\n{}\n\ndoes not equal\n\n{}\n{}\n{}".format( + colonize(msg, ":\n"), l, sx, l, l, sy, l + ) + ) else: k("{}{} != {}".format(colonize(msg), sx, sy)) @@ -193,6 +197,7 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): if exc_type == parent_self.exc_class: return True + return Recover() def addErrCtxt(self, msg): @@ -212,6 +217,7 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): parent_self.context.pop() + return AddContext() def __enter__(self): @@ -225,12 +231,25 @@ def __exit__(self, exc_type, exc_value, traceback): if exc_type == self.exc_class: raise RuntimeError("ShortCircuit was raised, but no errors were recorded") -def verify(model, args, backend, verbose=False, training=torch.onnx.TrainingMode.EVAL, rtol=1e-3, atol=1e-7, - test_args=2, do_constant_folding=True, opset_version=None, - keep_initializers_as_inputs=True, add_node_names=False, - operator_export_type=torch.onnx.OperatorExportTypes.ONNX, - input_names=None, dynamic_axes=None, - remained_onnx_input_idx=None): + +def verify( + model, + args, + backend, + verbose=False, + training=torch.onnx.TrainingMode.EVAL, + rtol=1e-3, + atol=1e-7, + test_args=2, + do_constant_folding=True, + opset_version=None, + keep_initializers_as_inputs=True, + add_node_names=False, + operator_export_type=torch.onnx.OperatorExportTypes.ONNX, + input_names=None, + dynamic_axes=None, + remained_onnx_input_idx=None, +): """ Export a model into ONNX, import it into a specified ONNX 
backend, and then on a few random inputs verify that PyTorch and the backend produced the same @@ -279,6 +298,7 @@ def verify(model, args, backend, verbose=False, training=torch.onnx.TrainingMode dynamic_axes (dict of (string, list)): dynamic_axes. remained_onnx_input_idx (list of int, default None): The remained ONNX input index. """ + def _nested_map(condition, fn, condition_msg=None): def _map(obj): if condition(obj): @@ -288,11 +308,18 @@ def _map(obj): elif isinstance(obj, (list, tuple)): return type(obj)(_map(x) for x in obj) else: - raise ValueError("Auto nesting doesn't know how to process " - "an input object of type " + torch.typename(obj) + - (". Accepted types: " + condition_msg + - ", or lists/tuples of them" - if condition_msg else "")) + raise ValueError( + "Auto nesting doesn't know how to process " + "an input object of type " + + torch.typename(obj) + + ( + ". Accepted types: " + + condition_msg + + ", or lists/tuples of them" + if condition_msg + else "" + ) + ) return _map @@ -309,11 +336,18 @@ def _iter(obj): elif allow_unknown: yield obj else: - raise ValueError("Auto nesting doesn't know how to process " - "an input object of type " + torch.typename(obj) + - (". Accepted types: " + condition_msg + - ", or lists/tuples of them" - if condition_msg else "")) + raise ValueError( + "Auto nesting doesn't know how to process " + "an input object of type " + + torch.typename(obj) + + ( + ". Accepted types: " + + condition_msg + + ", or lists/tuples of them" + if condition_msg + else "" + ) + ) return _iter @@ -352,14 +386,19 @@ def load_bytes(b): with torch.onnx.select_model_mode_for_export(model, training): proto_bytes = io.BytesIO() - torch_out = torch.onnx._export(model, args, proto_bytes, verbose=verbose, - do_constant_folding=do_constant_folding, - opset_version=opset_version, - keep_initializers_as_inputs=keep_initializers_as_inputs, - add_node_names=add_node_names, - operator_export_type=operator_export_type, - input_names=input_names, - dynamic_axes=dynamic_axes) + torch_out = torch.onnx._export( + model, + args, + proto_bytes, + verbose=verbose, + do_constant_folding=do_constant_folding, + opset_version=opset_version, + keep_initializers_as_inputs=keep_initializers_as_inputs, + add_node_names=add_node_names, + operator_export_type=operator_export_type, + input_names=input_names, + dynamic_axes=dynamic_axes, + ) if isinstance(model, torch.jit.ScriptModule): torch_out = model(*args) proto = load_bytes(proto_bytes) @@ -367,14 +406,19 @@ def load_bytes(b): def run(args, remained_onnx_input_idx): alt_proto_bytes = io.BytesIO() - torch_out = torch.onnx._export(model, args, alt_proto_bytes, verbose=verbose, - do_constant_folding=do_constant_folding, - opset_version=opset_version, - keep_initializers_as_inputs=keep_initializers_as_inputs, - add_node_names=add_node_names, - operator_export_type=operator_export_type, - input_names=input_names, - dynamic_axes=dynamic_axes) + torch_out = torch.onnx._export( + model, + args, + alt_proto_bytes, + verbose=verbose, + do_constant_folding=do_constant_folding, + opset_version=opset_version, + keep_initializers_as_inputs=keep_initializers_as_inputs, + add_node_names=add_node_names, + operator_export_type=operator_export_type, + input_names=input_names, + dynamic_axes=dynamic_axes, + ) if isinstance(model, torch.jit.ScriptModule): torch_out = model(*args) alt_proto = load_bytes(alt_proto_bytes) @@ -386,26 +430,36 @@ def run(args, remained_onnx_input_idx): with Errors(msg, rtol=rtol, atol=atol) as errs: # First, check if we have the same 
number of parameters, and # that they"re the same order. If they don"t, something has *really* gone wrong. - initializer_order_hint = ("This is really strange! The second time I exported your model,\n" - "it had a different set of parameters. Are you assigning Parameters\n" - "in the forward() of your model definition?") + initializer_order_hint = ( + "This is really strange! The second time I exported your model,\n" + "it had a different set of parameters. Are you assigning Parameters\n" + "in the forward() of your model definition?" + ) with errs.addErrCtxt(initializer_order_hint): - errs.requireEqual([x.name for x in proto.graph.initializer], - [x.name for x in alt_proto.graph.initializer], - msg="Parameters list differs") + errs.requireEqual( + [x.name for x in proto.graph.initializer], + [x.name for x in alt_proto.graph.initializer], + msg="Parameters list differs", + ) # Now check if the embedded parameters are actually the same - initializer_hint = ("A difference in embedded parameters usually means that\n" - "your model is updating parameters/buffers even in inference\n" - "mode. Look for a buggy nn.Module which isn't respecting train().\n") + initializer_hint = ( + "A difference in embedded parameters usually means that\n" + "your model is updating parameters/buffers even in inference\n" + "mode. Look for a buggy nn.Module which isn't respecting train().\n" + ) with errs.recover(), errs.addErrCtxt(initializer_hint): - for x, y in zip(proto.graph.initializer, alt_proto.graph.initializer): + for x, y in zip( + proto.graph.initializer, alt_proto.graph.initializer + ): errs.checkEqual(x, y) # Next, check if the model structure lines up. - structure_hint = ("A difference in model structure usually means that\n" - "your model has dynamic control flow. These models are not\n" - "currently supported by the exporter.") + structure_hint = ( + "A difference in model structure usually means that\n" + "your model has dynamic control flow. These models are not\n" + "currently supported by the exporter." + ) with errs.recover(), errs.addErrCtxt(structure_hint): # Delete initializers since we already tested them stripped_proto = onnx.ModelProto() @@ -417,12 +471,16 @@ def run(args, remained_onnx_input_idx): del stripped_alt_proto.graph.initializer[:] # Compare the printable graph representations first - errs.requireMultiLineEqual(onnx.helper.printable_graph(stripped_proto.graph), - onnx.helper.printable_graph(stripped_alt_proto.graph)) + errs.requireMultiLineEqual( + onnx.helper.printable_graph(stripped_proto.graph), + onnx.helper.printable_graph(stripped_alt_proto.graph), + ) # Compare the actual protobuf text formats now (not # very user-friendly!) - errs.requireMultiLineEqual(str(stripped_proto), str(stripped_alt_proto)) + errs.requireMultiLineEqual( + str(stripped_proto), str(stripped_alt_proto) + ) # One last ditch effort, using built-in equality on # protobufs @@ -437,7 +495,9 @@ def run(args, remained_onnx_input_idx): # case. We EXPECT these requires to fail. If they don't, # that is a bug in verify errs.requireEqual(proto, alt_proto) - errs.requireEqual(proto_bytes.getvalue(), alt_proto_bytes.getvalue()) + errs.requireEqual( + proto_bytes.getvalue(), alt_proto_bytes.getvalue() + ) raise AssertionError() # TODO: test that the traced model also returns the same thing... 
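# --- editorial sketch, not part of the patch --------------------------------
# The core of verify() above is "export the same model twice and require the
# two protos to agree" before any backend is run.  A stripped-down standalone
# version of that idea; TwoLayer is a hypothetical module, and the real helper
# additionally diffs initializers and falls back to raw protobuf equality.
import io
import onnx
import onnx.helper
import torch

class TwoLayer(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = torch.nn.Linear(4, 4)
        self.fc2 = torch.nn.Linear(4, 2)

    def forward(self, x):
        return self.fc2(torch.relu(self.fc1(x)))

def export_proto(model, x):
    buf = io.BytesIO()
    torch.onnx.export(model, (x,), buf, opset_version=13)
    return onnx.load(io.BytesIO(buf.getvalue()))

model, x = TwoLayer().eval(), torch.randn(1, 4)
g1 = onnx.helper.printable_graph(export_proto(model, x).graph)
g2 = onnx.helper.printable_graph(export_proto(model, x).graph)
# A mismatch here usually means parameters are created inside forward() or the
# model has dynamic control flow -- the same hints verify() attaches above.
assert g1 == g2
# -----------------------------------------------------------------------------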
@@ -457,12 +517,18 @@ def run_helper(torch_out, args, remained_onnx_input_idx): torch_out, _ = torch._C._jit_flatten(torch_out) # NB: onnx backend NEVER returns bare numpy array msg = "ONNX backend returned different results from PyTorch" - result_hint = ("If you are not using trained parameters, a difference in results\n" - "could mean that your network is numerically unstable. Otherwise\n" - "it indicates a bug in PyTorch/ONNX; please file a bug report.") - with Errors(msg, rtol=rtol, atol=atol) as errs, errs.addErrCtxt(result_hint): + result_hint = ( + "If you are not using trained parameters, a difference in results\n" + "could mean that your network is numerically unstable. Otherwise\n" + "it indicates a bug in PyTorch/ONNX; please file a bug report." + ) + with Errors(msg, rtol=rtol, atol=atol) as errs, errs.addErrCtxt( + result_hint + ): for i, (x, y) in enumerate(zip(torch_out, backend_out)): - errs.checkAlmostEqual(x.data.cpu().numpy(), y, "In output {}".format(i)) + errs.checkAlmostEqual( + x.data.cpu().numpy(), y, "In output {}".format(i) + ) run_helper(torch_out, args, remained_onnx_input_idx) diff --git a/test/package/package_a/use_dunder_package.py b/test/package/package_a/use_dunder_package.py index 119cb4ee7b5c..4e0b2b3ebeac 100644 --- a/test/package/package_a/use_dunder_package.py +++ b/test/package/package_a/use_dunder_package.py @@ -3,7 +3,6 @@ def is_from_package(): return True - else: def is_from_package(): diff --git a/test/package/package_c/test_module.py b/test/package/package_c/test_module.py index 98fd7310eedc..c0d6f41839ea 100644 --- a/test/package/package_c/test_module.py +++ b/test/package/package_c/test_module.py @@ -14,7 +14,6 @@ def forward(self, x): x = a_non_torch_leaf(x, x) return torch.relu(x + 3.0) - except ImportError: pass diff --git a/test/package/package_e/test_nn_module.pt b/test/package/package_e/test_nn_module.pt new file mode 100644 index 000000000000..1c1a8964a8a4 Binary files /dev/null and b/test/package/package_e/test_nn_module.pt differ diff --git a/test/package/test_dependency_api.py b/test/package/test_dependency_api.py index be867528282d..9f1a9c9899e8 100644 --- a/test/package/test_dependency_api.py +++ b/test/package/test_dependency_api.py @@ -182,7 +182,7 @@ def test_pickle_mocked(self): obj2 = package_a.PackageAObject(obj) buffer = BytesIO() - with self.assertRaises(NotImplementedError): + with self.assertRaises(PackagingError): with PackageExporter(buffer) as he: he.mock(include="package_a.subpackage") he.intern("**") diff --git a/test/package/test_misc.py b/test/package/test_misc.py index 659355b62e59..480217b8feb3 100644 --- a/test/package/test_misc.py +++ b/test/package/test_misc.py @@ -2,12 +2,15 @@ # Owner(s): ["oncall: package/deploy"] import inspect +import platform from io import BytesIO +from pathlib import Path from textwrap import dedent +from unittest import skipIf from torch.package import PackageExporter, PackageImporter, is_from_package from torch.package.package_exporter import PackagingError -from torch.testing._internal.common_utils import run_tests +from torch.testing._internal.common_utils import IS_FBCODE, IS_SANDCASTLE, run_tests try: from .common import PackageTestCase @@ -31,6 +34,7 @@ def test_file_structure(self): """\ ├── .data │ ├── extern_modules + │ ├── python_version │ └── version ├── main │ └── main @@ -54,6 +58,7 @@ def test_file_structure(self): """\ ├── .data │ ├── extern_modules + │ ├── python_version │ └── version ├── main │ └── main @@ -99,6 +104,36 @@ def test_file_structure(self): 
import_exclude, ) + def test_python_version(self): + """ + Tests that the current python version is stored in the package and is available + via PackageImporter's python_version() method. + """ + buffer = BytesIO() + + with PackageExporter(buffer) as he: + from package_a.test_module import SimpleTest + + he.intern("**") + obj = SimpleTest() + he.save_pickle("obj", "obj.pkl", obj) + + buffer.seek(0) + hi = PackageImporter(buffer) + + self.assertEqual(hi.python_version(), platform.python_version()) + + @skipIf( + IS_FBCODE or IS_SANDCASTLE, + "Tests that use temporary files are disabled in fbcode", + ) + def test_load_python_version_from_package(self): + """Tests loading a package with a python version embdded""" + importer1 = PackageImporter( + f"{Path(__file__).parent}/package_e/test_nn_module.pt" + ) + self.assertEqual(importer1.python_version(), "3.9.7") + def test_file_structure_has_file(self): """ Test Directory's has_file() method. diff --git a/test/quantization/ao_migration/test_ao_migration.py b/test/quantization/ao_migration/test_ao_migration.py index 756507c26552..05931cc5f21b 100644 --- a/test/quantization/ao_migration/test_ao_migration.py +++ b/test/quantization/ao_migration/test_ao_migration.py @@ -111,8 +111,8 @@ def test_function_import_fake_quantize(self): 'FusedMovingAvgObsFakeQuantize', 'default_fake_quant', 'default_weight_fake_quant', - 'default_symmetric_fixed_qparams_fake_quant', - 'default_affine_fixed_qparams_fake_quant', + 'default_fixed_qparams_range_neg1to1_fake_quant', + 'default_fixed_qparams_range_0to1_fake_quant', 'default_per_channel_weight_fake_quant', 'default_histogram_fake_quant', 'default_fused_act_fake_quant', diff --git a/test/quantization/ao_migration/test_quantization.py b/test/quantization/ao_migration/test_quantization.py index 35ff8aedaaa7..89b69d1ef182 100644 --- a/test/quantization/ao_migration/test_quantization.py +++ b/test/quantization/ao_migration/test_quantization.py @@ -81,8 +81,8 @@ def test_function_import_fake_quantize(self): 'FusedMovingAvgObsFakeQuantize', 'default_fake_quant', 'default_weight_fake_quant', - 'default_symmetric_fixed_qparams_fake_quant', - 'default_affine_fixed_qparams_fake_quant', + 'default_fixed_qparams_range_neg1to1_fake_quant', + 'default_fixed_qparams_range_0to1_fake_quant', 'default_per_channel_weight_fake_quant', 'default_histogram_fake_quant', 'default_fused_act_fake_quant', diff --git a/test/quantization/ao_migration/test_quantization_fx.py b/test/quantization/ao_migration/test_quantization_fx.py index 23ee9c580005..223134724f7b 100644 --- a/test/quantization/ao_migration/test_quantization_fx.py +++ b/test/quantization/ao_migration/test_quantization_fx.py @@ -32,7 +32,7 @@ def test_function_import_fx(self): function_list = [ 'prepare', 'convert', - 'Fuser', + 'fuse', ] self._test_function_import('fx', function_list) @@ -155,9 +155,7 @@ def test_package_import_fx_fuse(self): self._test_package_import('fx.fuse') def test_function_import_fx_fuse(self): - function_list = [ - 'Fuser' - ] + function_list = ['fuse'] self._test_function_import('fx.fuse', function_list) def test_package_import_fx_fusion_patterns(self): @@ -170,15 +168,10 @@ def test_function_import_fx_fusion_patterns(self): ] self._test_function_import('fx.fusion_patterns', function_list) - def test_package_import_fx_quantization_types(self): - self._test_package_import('fx.quantization_types') - - def test_function_import_fx_quantization_types(self): - function_list = [ - 'Pattern', - 'QuantizerCls' - ] - 
self._test_function_import('fx.quantization_types', function_list) + # we removed matching test for torch.quantization.fx.quantization_types + # old: torch.quantization.fx.quantization_types + # new: torch.ao.quantization.quantization_types + # both are valid, but we'll deprecate the old path in the future def test_package_import_fx_utils(self): self._test_package_import('fx.utils') @@ -199,7 +192,7 @@ def test_function_import_fx_utils(self): 'create_qparam_nodes', 'all_node_args_have_no_tensors', 'node_return_type_is_int', - 'node_bool_tensor_arg_indexes', + 'get_non_observable_arg_indexes_and_types', 'is_get_tensor_info_node', 'maybe_get_next_module' ] diff --git a/test/quantization/core/test_quantized_module.py b/test/quantization/core/test_quantized_module.py index 613c237bdada..7cbab3be475e 100644 --- a/test/quantization/core/test_quantized_module.py +++ b/test/quantization/core/test_quantized_module.py @@ -6,6 +6,7 @@ import torch.nn.intrinsic.quantized as nniq import torch.nn.quantized as nnq import torch.nn.quantized.dynamic as nnqd +import torch.nn.quantized._reference as nnqr import torch.ao.quantization from torch.ao.quantization import ( @@ -26,6 +27,7 @@ override_quantized_engine, override_qengines, qengine_is_qnnpack, + qengine_is_onednn, ) from hypothesis import assume, given from hypothesis import strategies as st @@ -98,7 +100,9 @@ def _test_linear_api_impl(self, batch_size, in_features, out_features, use_bias, zero_points=zero_point_tensor, axis=0, dtype=torch.qint8) else: - W_q = torch.quantize_per_tensor(W, 0.1, 4, torch.qint8) + # ONEDNN only supports symmetric quantization of weight + W_zp = 0 if qengine_is_onednn() else 4 + W_q = torch.quantize_per_tensor(W, 0.1, W_zp, torch.qint8) X = torch.rand(batch_size, in_features).float() X_q = torch.quantize_per_tensor(X, 0.2, 10, torch.quint8) @@ -433,7 +437,7 @@ def test_conv1d_api(self): X_scale = 1.3 X_zero_point = 2 W_scale = [0.5] - W_zero_point = [3] + W_zero_point = [0] if qengine_is_onednn() else [3] Y_scale = 5.0 Y_zero_point = 4 if torch.backends.quantized.engine == 'qnnpack': @@ -500,7 +504,7 @@ def test_conv2d_api(self): X_scale = 1.3 X_zero_point = 2 W_scale = [0.5] - W_zero_point = [3] + W_zero_point = [0] if qengine_is_onednn() else [3] Y_scale = 5.0 Y_zero_point = 4 # use_fused -> quantized class @@ -569,7 +573,7 @@ def test_conv3d_api(self): X_scale = 1.3 X_zero_point = 2 W_scale = [0.5] - W_zero_point = [3] + W_zero_point = [0] if qengine_is_onednn() else [3] Y_scale = 5.0 Y_zero_point = 4 # use_fused -> quantized class @@ -1199,7 +1203,8 @@ def test_dynamic_convtranspose3d(self): def test_linear_api(self, batch_size, in_features, out_features, use_bias, use_default_observer): """test API functionality for nn.quantized.dynamic.Linear""" W = torch.rand(out_features, in_features).float() - W_scale, W_zp = _calculate_dynamic_qparams(W, torch.qint8) + qscheme = torch.per_tensor_symmetric if qengine_is_onednn() else torch.per_tensor_affine + W_scale, W_zp = _calculate_dynamic_qparams(W, torch.qint8, qscheme=qscheme) W_q = torch.quantize_per_tensor(W, W_scale, W_zp, torch.qint8) X = torch.rand(batch_size, in_features).float() B = torch.rand(out_features).float() if use_bias else None @@ -1310,8 +1315,8 @@ def test_lstm_api(self, dtype, bidirectional): bias_keys.append(key_name1) bias_keys.append(key_name2) - if not (dtype == torch.float16 and torch.backends.quantized.engine == "qnnpack"): - # fp16 dynamic quant is not supported for qnnpack + if not (dtype == torch.float16 and torch.backends.quantized.engine in 
("qnnpack", "onednn")): + # fp16 dynamic quant is not supported for qnnpack or onednn x = torch.randn(seq_len, batch, input_size) h = torch.randn(num_layers * (bidirectional + 1), batch, hidden_size) c = torch.randn(num_layers * (bidirectional + 1), batch, hidden_size) @@ -1361,8 +1366,8 @@ def test_gru_api(self): # instantiated for all engines and dtypes for dtype in [torch.qint8, torch.float16]: - if dtype == torch.float16 and torch.backends.quantized.engine == "qnnpack": - # fp16 dynamic quant is not supported for qnnpack + if dtype == torch.float16 and torch.backends.quantized.engine in ("qnnpack", "onednn"): + # fp16 dynamic quant is not supported for qnnpack or onednn continue # Test default instantiation seq_len = 4 @@ -1434,8 +1439,8 @@ def test_cell_api(self, dtype): 'RNNReLU': torch.ops.quantized.quantized_rnn_relu_cell_dynamic} for rnn_type in cell_dict.keys(): - if not (dtype == torch.float16 and torch.backends.quantized.engine == "qnnpack"): - # fp16 dynamic quant is not supported for qnnpack + if not (dtype == torch.float16 and torch.backends.quantized.engine in ("qnnpack", "onednn")): + # fp16 dynamic quant is not supported for qnnpack or onednn kwargs = {'input_size': input_size, 'hidden_size': hidden_size, 'bias': bias, 'dtype': dtype} if rnn_type == 'RNNReLU': kwargs['nonlinearity'] = "relu" @@ -1453,3 +1458,204 @@ def test_cell_api(self, dtype): bias_keys = ['bias_ih', 'bias_hh'] self.check_eager_serialization(cell_dq, cell_dict[rnn_type](**kwargs), [x]) self.check_weight_bias_api(cell_dq, weight_keys, bias_keys) + +class TestReferenceQuantizedModule(QuantizationTestCase): + def _quant_dequant_weight(self, weight, weight_qparams): + qscheme = weight_qparams["qscheme"] + scale = weight_qparams["scale"] + zero_point = weight_qparams["zero_point"] + dtype = weight_qparams["dtype"] + if qscheme == torch.per_tensor_affine: + weight = torch.quantize_per_tensor(weight, scale, zero_point, dtype) + else: + # per channel affine + axis = weight_qparams["axis"] + weight = torch.quantize_per_channel(weight, scale, zero_point, axis, dtype) + weight = weight.dequantize() + return weight + + # TODO: add tests for conv and linear + def test_rnn_cell(self): + """ Checks the rnn cell reference quantized modules has correct numerics + This includes LSTMCell, GRUCell, RNNCell + """ + batch = 7 + input_size = 3 + hidden_size = 7 + bias = True + + x = torch.rand(batch, input_size) + h = torch.rand(batch, hidden_size) + cell_dict = {'LSTMCell': torch.nn.LSTMCell, + 'GRUCell': torch.nn.GRUCell, + 'RNNTanh': torch.nn.RNNCell, + 'RNNReLU': torch.nn.RNNCell + } + state = {'LSTMCell': (h, h), + 'GRUCell': h, + 'RNNTanh': h, + 'RNNReLU': h} + + qfn_dict = {'LSTMCell': nnqr.LSTMCell, + 'GRUCell': nnqr.GRUCell, + 'RNNTanh': nnqr.RNNCell, + 'RNNReLU': nnqr.RNNCell} + + for rnn_type in cell_dict.keys(): + kwargs = {'input_size': input_size, 'hidden_size': hidden_size, 'bias': bias} + if rnn_type == 'RNNReLU': + kwargs['nonlinearity'] = "relu" + elif rnn_type == 'RNNTanh': + kwargs['nonlinearity'] = "tanh" + + fp_cell = cell_dict[rnn_type](**kwargs) + # initialize ref rnn cell module + weight_qparams = { + 'qscheme': torch.per_tensor_affine, + 'dtype': torch.quint8, + 'scale': 2.0, + 'zero_point': 5 + } + weight_qparams_dict = { + "weight_ih": weight_qparams, + "weight_hh": weight_qparams, + } + ref_kwargs = kwargs.copy() + ref_kwargs["weight_qparams_dict"] = weight_qparams_dict + ref_cell = qfn_dict[rnn_type](**ref_kwargs) + # reassign the weights from fp32 rnn cell modulea + ref_cell.weight_ih = 
fp_cell.weight_ih + ref_cell.weight_hh = fp_cell.weight_hh + ref_cell.bias_ih = fp_cell.bias_ih + ref_cell.bias_hh = fp_cell.bias_hh + + ref_res = ref_cell(x, state[rnn_type]) + + # change the weight of fp_res, we first want to run a quantie and + # dequantize on the weight + fp_cell.weight_ih = torch.nn.Parameter(self._quant_dequant_weight(fp_cell.weight_ih, weight_qparams_dict["weight_ih"])) + fp_cell.weight_hh = torch.nn.Parameter(self._quant_dequant_weight(fp_cell.weight_hh, weight_qparams_dict["weight_hh"])) + fp_res = fp_cell(x, state[rnn_type]) + self.assertEqual(ref_res[0], fp_res[0], msg="RNNCell module API failed") + self.assertEqual(ref_res[1], fp_res[1], msg="RNNCell module API failed") + + def test_rnn(self): + """ Checks the rnn reference quantized modules has correct numerics + This includes LSTM + """ + seq_len = 4 + batch = 2 + input_size = 3 + hidden_size = 7 + num_layers = 2 + bias = True + for bidirectional in [True, False]: + x = torch.randn(seq_len, batch, input_size) + h = torch.randn(num_layers * (bidirectional + 1), batch, hidden_size) + c = torch.randn(num_layers * (bidirectional + 1), batch, hidden_size) + fp32_rnn = torch.nn.LSTM( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + bias=bias, + batch_first=False, + dropout=0.0, + bidirectional=bidirectional) + # initialize ref rnn module + weight_qparams = { + 'qscheme': torch.per_tensor_affine, + 'dtype': torch.qint8, + 'scale': 2.0, + 'zero_point': 5 + } + weight_qparams_dict = {key: weight_qparams for key in fp32_rnn._flat_weights_names if key.startswith("weight")} + ref_rnn = nnqr.LSTM( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + bias=bias, + batch_first=False, + dropout=0.0, + bidirectional=bidirectional, + weight_qparams_dict=weight_qparams_dict) + for wn in fp32_rnn._flat_weights_names: + setattr(ref_rnn, wn, copy.deepcopy(getattr(fp32_rnn, wn))) + + ref_rnn._flat_weights = copy.deepcopy(fp32_rnn._flat_weights) + + # quantize and dequantize the weights for fp32_rnn module + flat_weights = [] + for wn in fp32_rnn._flat_weights_names: + if wn.startswith("weight"): + weight = self._quant_dequant_weight(getattr(fp32_rnn, wn), weight_qparams) + else: + weight = getattr(fp32_rnn, wn) + flat_weights.append(weight) + fp32_rnn._flat_weights = flat_weights + + fp32_res = fp32_rnn(x, (h, c)) + ref_res = ref_rnn(x, (h, c)) + self.assertEqual(fp32_res, ref_res) + + def test_sparse(self): + """ Embedding and EmbeddingBag + """ + num_embeddings = 10 + embedding_dim = 3 + # embedding input + ex = torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]]) + + # embedding bag input + ebx = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9], dtype=torch.long) + offsets = torch.tensor([0, 4], dtype=torch.long) + + fp_to_ref = { + nn.Embedding: (nnqr.Embedding, (ex,)), + nn.EmbeddingBag: (nnqr.EmbeddingBag, (ebx, offsets)), + } + + per_tensor_weight_qparams = { + 'qscheme': torch.per_tensor_affine, + 'dtype': torch.quint8, + 'scale': 2.0, + 'zero_point': 5, + } + + per_channel_weight_qparams = { + 'qscheme': torch.per_channel_affine, + 'dtype': torch.quint8, + 'scale': torch.randn(10), + 'zero_point': torch.randint(0, 255, (10,)), + 'axis': 0, + } + + per_channel_weight_qparams_quint4x2 = { + 'qscheme': torch.per_channel_affine_float_qparams, + 'dtype': torch.quint4x2, + 'scale': torch.randn(10), + 'zero_point': torch.randint(0, 255, (10,)), + 'axis': 0, + } + + weight_qparams_options = [ + per_tensor_weight_qparams, + per_channel_weight_qparams, + per_channel_weight_qparams_quint4x2, + ] 
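# --- editorial sketch, not part of the patch --------------------------------
# Every check in TestReferenceQuantizedModule leans on the trick implemented
# by _quant_dequant_weight above: fake-quantize the fp32 weight via a
# quantize -> dequantize round trip, then compare the fp32 module against the
# reference module run with the same qparams.  A minimal per-tensor example of
# that round trip (the scale/zero_point values here are arbitrary):
import torch

w = torch.randn(4, 4)
w_q = torch.quantize_per_tensor(w, scale=2.0, zero_point=5, dtype=torch.quint8)
w_dq = w_q.dequantize()            # fp32 again, but snapped to the quant grid
print((w - w_dq).abs().max())      # rounding error, roughly bounded by scale/2
# (the Embedding/EmbeddingBag sweep over the qparams options continues below)
# -----------------------------------------------------------------------------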
+ for fp_cls, weight_qparams in itertools.product([nn.Embedding, nn.EmbeddingBag], weight_qparams_options): + # TODO: torch.quint4x2 not supported in quantize_per_channel, need to add support + if weight_qparams['dtype'] == torch.quint4x2: + continue + ref_cls, args = fp_to_ref[fp_cls] + + fp32_embedding = fp_cls(num_embeddings, embedding_dim) + + ref_embedding = ref_cls(num_embeddings, embedding_dim, weight_qparams=weight_qparams) + ref_embedding.weight = fp32_embedding.weight + + # quantize and dequantize the weight for fp32 module + fp32_embedding.weight = torch.nn.Parameter(self._quant_dequant_weight(fp32_embedding.weight, weight_qparams)) + + fp32_res = fp32_embedding(*args) + ref_res = ref_embedding(*args) + self.assertEqual(fp32_res, ref_res) diff --git a/test/quantization/core/test_quantized_op.py b/test/quantization/core/test_quantized_op.py index be84e7bd4e81..935204a3de49 100644 --- a/test/quantization/core/test_quantized_op.py +++ b/test/quantization/core/test_quantized_op.py @@ -26,9 +26,13 @@ from torch.testing._internal.common_quantization import skipIfNoFBGEMM, skipIfNoQNNPACK from torch.testing._internal.common_quantized import _quantize, _dequantize, _calculate_dynamic_qparams, \ override_quantized_engine, supported_qengines, override_qengines, _snr -from torch.testing._internal.common_quantized import qengine_is_qnnpack +from torch.testing._internal.common_quantized import ( + qengine_is_qnnpack, + qengine_is_onednn, +) from torch.ao.quantization import PerChannelMinMaxObserver from torch.testing._internal.common_cuda import TEST_CUDNN +import torch.backends.xnnpack from typing import Optional @@ -71,7 +75,7 @@ def avoid_vpmaddubsw_overflow_linear( # Reference quantized Linear operator -def qlinear_ref(X_q, X_scale, X_zp, W_q, W_scale, W_zp, b_q, Y_scale, Y_zp): +def qlinear_ref(X_q, X_scale, X_zp, W_q, W_scale, W_zp, b_q, Y_scale, Y_zp, dtype=np.uint8): X_q = np.reshape(X_q, (-1, X_q.shape[X_q.ndim - 1])) row_offsets_ref = X_q.sum(axis=1).astype(np.int32).reshape((-1, 1)) col_offsets_ref = W_q.sum(axis=1).astype(np.int32).reshape((1, -1)) @@ -85,7 +89,7 @@ def qlinear_ref(X_q, X_scale, X_zp, W_q, W_scale, W_zp, b_q, Y_scale, Y_zp): ) if b_q is not None: Prod_XqWq_ref += b_q - Y_q_ref = _quantize(Prod_XqWq_ref, Y_scale / (X_scale * W_scale), Y_zp) + Y_q_ref = _quantize(Prod_XqWq_ref, Y_scale / (X_scale * W_scale), Y_zp, dtype=dtype) return Y_q_ref """Computes the output shape given pooling parameters.""" @@ -441,8 +445,9 @@ def test_qgelu(self): shapes = ((4,), (4, 4), (4, 4, 4), (4, 4, 4, 4)) dtypes = (torch.quint8, torch.qint8) memory_formats = (torch.channels_last, torch.contiguous_format) - test_cases = itertools.product(shapes, dtypes, memory_formats) - for shape, dtype, memory_format in test_cases: + approximation = ['none', 'tanh'] + test_cases = itertools.product(shapes, dtypes, memory_formats, approximation) + for shape, dtype, memory_format, approximate in test_cases: if memory_format == torch.channels_last and len(shape) != 4: continue X, scale, zero_point, torch_type = \ @@ -454,7 +459,7 @@ def test_qgelu(self): dqX = qX.dequantize() op = torch.nn.functional.gelu - dqY = op(dqX) + dqY = op(dqX, approximate=approximate) qY = torch.quantize_per_tensor(dqY, scale=scale, zero_point=zero_point, dtype=torch_type) qY_hat = op(qX) @@ -824,6 +829,76 @@ def test_qadd_relu_same_qparams(self): self.assertEqual(qCrelu_hat, qCrelu_out_hat, msg="AddReLU.out failed") + """Tests the correctness of the cudnn add and add_relu op + (Similar to test_qadd_relu_different_qparams, 
will probably merge in the future)""" + @unittest.skipIf(not TEST_CUDNN, "cudnn is not enabled.") + @unittest.skip("Local only - currently the test_qadd_relu_cudnn op is bulid " + "with USE_EXPERIMENTAL_CUDNN_V8_API, we can enable the test " + "after it is built by default") + def test_qadd_relu_cudnn(self): + dtype = torch.qint8 + add_relu = torch.ops.quantized.add_relu + add = torch.ops.quantized.add + + A = torch.arange(-128, 130, dtype=torch.float).to(torch.device("cuda")) + B = torch.arange(-128, 130, dtype=torch.float).to(torch.device("cuda")) + scale_A = 2.5 + scale_B = 6.3 + scale_C = 12.9 + zero_point = 0 + qA = torch.quantize_per_tensor(A, scale=scale_A, zero_point=zero_point, + dtype=dtype) + qB = torch.quantize_per_tensor(B, scale=scale_B, zero_point=zero_point, + dtype=dtype) + # Add ground truth + C = (qA.dequantize() + qB.dequantize()).to(device="cpu").numpy() + qC = _quantize(C, scale_C, zero_point, dtype=np_dtype[dtype]) + qC_hat = add(qA, qB, scale=scale_C, zero_point=zero_point).to(device="cpu") + np.testing.assert_equal(qC, qC_hat.int_repr(), + "Quantized addition failed.") + + # Add + ReLU ground truth + Crelu = C.copy() + Crelu[C < 0] = 0 + qCrelu = _quantize(Crelu, scale_C, zero_point, dtype=np_dtype[dtype]) + qCrelu_hat = add_relu(qA, qB, scale=scale_C, zero_point=zero_point).to(device="cpu") + np.testing.assert_equal(qCrelu, qCrelu_hat.int_repr(), + "Quantized addition with ReLU failed.") + + """Tests the correctness of the cudnn add and add_relu op for nhwc format""" + @unittest.skipIf(not TEST_CUDNN, "cudnn is not enabled.") + @unittest.skip("Local only - currently the test_qadd_relu_cudnn_nhwc op is bulid " + "with USE_EXPERIMENTAL_CUDNN_V8_API, we can enable the test " + "after it is built by default") + def test_qadd_relu_cudnn_nhwc(self): + dtype = torch.qint8 + add_relu = torch.ops.quantized.add_relu + add = torch.ops.quantized.add + + A = torch.rand(16, 8, 4, 12).to(device="cuda") + B = torch.rand(16, 8, 4, 12).to(device="cuda") + scale_A = 2.5 + scale_B = 6.3 + scale_C = 12.9 + zero_point = 0 + qA = torch.quantize_per_tensor(A, scale=scale_A, zero_point=zero_point, + dtype=dtype) + qB = torch.quantize_per_tensor(B, scale=scale_B, zero_point=zero_point, + dtype=dtype) + # Add ground truth + C = (qA.dequantize() + qB.dequantize()).to(device="cpu").numpy() + qC = _quantize(C, scale_C, zero_point, dtype=np_dtype[dtype]) + qC_hat = add(qA, qB, scale=scale_C, zero_point=zero_point).to(device="cpu") + np.testing.assert_equal(qC, qC_hat.int_repr(), + "Quantized addition failed.") + + # Add + ReLU ground truth + Crelu = C.copy() + Crelu[C < 0] = 0 + qCrelu = _quantize(Crelu, scale_C, zero_point, dtype=np_dtype[dtype]) + qCrelu_hat = add_relu(qA, qB, scale=scale_C, zero_point=zero_point).to(device="cpu") + np.testing.assert_equal(qCrelu, qCrelu_hat.int_repr(), + "Quantized addition with ReLU failed.") """Tests the correctness of the add and add_relu op.""" def test_qadd_relu_different_qparams(self): @@ -991,9 +1066,20 @@ def test_qmul_relu_different_qparams(self): msg="mulReLU.out failed") """Tests the correctness of the matmul op.""" - def test_qmatmul(self): - A = torch.randn(size=(3, 4), dtype=torch.float32) * 3 - B = torch.randn(size=(4, 5), dtype=torch.float32) * 3 + @given(num_dims=st.integers(2, 5), + outer_dims=st.lists(st.integers(2, 6), min_size=3, max_size=3), + m=st.integers(2, 6), + k=st.integers(2, 6), + n=st.integers(2, 6), + dtypes=st.sampled_from(((torch.qint8, np.int8), + (torch.quint8, np.uint8)))) + def test_qmatmul(self, num_dims, outer_dims, m, 
k, n, dtypes): + (torch_dtype, np_dtype) = dtypes + + size_a = outer_dims[:num_dims - 2] + [m, k] + size_b = outer_dims[:num_dims - 2] + [k, n] + A = torch.randn(size=size_a, dtype=torch.float32) * 3 + B = torch.randn(size=size_b, dtype=torch.float32) * 3 scale_A = 3.1 zero_point_A = 7 @@ -1003,15 +1089,22 @@ def test_qmatmul(self): scale_C = 1.3 zero_point_C = 5 - qA = torch.quantize_per_tensor(A, scale=scale_A, zero_point=zero_point_A, - dtype=torch.qint8) - qB = torch.quantize_per_tensor(B, scale=scale_B, zero_point=zero_point_B, - dtype=torch.qint8) + qA = torch.quantize_per_tensor(A, + scale=scale_A, + zero_point=zero_point_A, + dtype=torch_dtype) + qB = torch.quantize_per_tensor(B, + scale=scale_B, + zero_point=zero_point_B, + dtype=torch_dtype) # matmul ground truth C = torch.matmul(qA.dequantize(), qB.dequantize()).numpy() - qC = _quantize(C, scale_C, zero_point_C, dtype=np.int8) - qC_hat = torch.ops.quantized.matmul(qA, qB, scale=scale_C, zero_point=zero_point_C) + qC = _quantize(C, scale_C, zero_point_C, dtype=(np_dtype)) + qC_hat = torch.ops.quantized.matmul(qA, + qB, + scale=scale_C, + zero_point=zero_point_C) np.testing.assert_equal(qC, qC_hat.int_repr(), "Quantized multiplication failed.") @@ -1022,10 +1115,16 @@ def test_qmatmul(self): scales_B = torch.rand(size=(B.shape[axis],)) zero_points_B = torch.randint(low=0, high=5, size=(B.shape[axis],)) - qA = torch.quantize_per_channel(A, scales=scales_A, zero_points=zero_points_A, - axis=axis, dtype=torch.qint8) - qB = torch.quantize_per_channel(B, scales=scales_B, zero_points=zero_points_B, - axis=axis, dtype=torch.qint8) + qA = torch.quantize_per_channel(A, + scales=scales_A, + zero_points=zero_points_A, + axis=axis, + dtype=torch.qint8) + qB = torch.quantize_per_channel(B, + scales=scales_B, + zero_points=zero_points_B, + axis=axis, + dtype=torch.qint8) np.testing.assert_raises_regex(RuntimeError, ".*per-tensor.*", torch.ops.quantized.matmul, @@ -1034,6 +1133,53 @@ def test_qmatmul(self): scale_C, zero_point_C) + + """Tests the correctness of the quantized softmax op.""" + @given(dims=st.lists(st.integers(2, 5), min_size=5, max_size=5)) + def test_qsoftmax(self, dims): + for (num_dims, dim, memory_format) in [ + (2, 1, torch.contiguous_format), # 2d softmax over last dim + (4, 3, torch.contiguous_format), # >2 dims, softmax along last dim + (5, 2, torch.contiguous_format), # >2 dims, softmax along not last dim (requires permute) + (4, 3, torch.channels_last), # >2 dims, softmax along last dim, but not contiguous + (4, 1, torch.channels_last), # Channels Last, doesn't require permute + (5, 1, torch.channels_last_3d), # Channels Last 3D, doesn't require permute + ]: + size = dims[:num_dims] + torch_dtype = torch.quint8 + np_dtype = np.uint8 + + scale_X = 1.3 + zero_point_X = 5 + X = torch.rand(size=size, dtype=torch.float32) * 8 + zero_point_X + X = X.to(memory_format=memory_format) + + scale_Y = 1 / 256 + zero_point_Y = 0 + + qX = torch.quantize_per_tensor(X, + scale=scale_X, + zero_point=zero_point_X, + dtype=torch_dtype) + + + # softmax ground truth + Y = torch.softmax(qX.dequantize(), dim=dim).numpy() + qY = _quantize(Y, scale_Y, zero_point_Y, dtype=np_dtype) + qY_hat = torch.ops.quantized.softmax(qX, + dim=dim, + output_scale=scale_Y, + output_zero_point=zero_point_Y) + + np.testing.assert_equal(qY, qY_hat.int_repr(), + "Quantized softmax failed.") + + """Tests the correctness of the quantized softmax op using qnnpack.""" + @skipIfNoQNNPACK + def test_qsoftmax_qnnpack(self): + with override_quantized_engine('qnnpack'): + 
self.test_qsoftmax() + """Tests the correctness of the mul and mul_relu op.""" def test_qmul_broadcast(self): mul_relu = torch.ops.quantized.mul_relu @@ -1160,6 +1306,52 @@ def test_max_pool1d(self, X, kernel, stride, dilation, padding, ceil_mode): self.assertEqual(a_ref, a_hat.dequantize(), msg="ops.quantized.max_pool1d results are off") + # TODO: merge this test with test_max_pool2d when USE_EXPERIMENTAL_CUDNN_V8_API flag is enabled in CI + """Tests 2D cudnn max pool operation on quantized tensors.""" + @given(X=hu.tensor(shapes=hu.array_shapes(min_dims=3, max_dims=4, + min_side=1, max_side=10), + # cudnn's support for quantized pooling is limited to + # int8 currently + qparams=hu.qparams(dtypes=[torch.qint8])), + kernel=st.sampled_from((3, 5, 7)), + stride=st.sampled_from((None, 1, 2)), + # currently there is no support for dilation for cudnn + # pooling + dilation=st.integers(1, 1), + padding=st.integers(0, 2), + ceil_mode=st.booleans()) + @unittest.skipIf(not TEST_CUDNN, "cudnn is not enabled.") + @unittest.skip("Local only - currently the qconv2d_cudnn op is bulid " + "with USE_EXPERIMENTAL_CUDNN_V8_API, we can enable the test " + "after it is built by default") + def test_max_pool2d_cudnn(self, X, kernel, stride, dilation, padding, ceil_mode): + X, (scale, zero_point, torch_type) = X + assume(kernel // 2 >= padding) # Kernel cannot be overhanging! + iH, iW = X.shape[-2:] + oH = pool_output_shape(iH, kernel, padding, stride, dilation, ceil_mode) + assume(oH > 0) + oW = pool_output_shape(iW, kernel, padding, stride, dilation, ceil_mode) + assume(oW > 0) + + a = torch.from_numpy(X).to(device="cuda") + a_pool = torch.nn.functional.max_pool2d(a, kernel_size=kernel, + stride=stride, + padding=padding, dilation=dilation, + ceil_mode=ceil_mode) + a_ref = torch.quantize_per_tensor(a_pool, scale=scale, + zero_point=zero_point, dtype=torch_type) + a_ref = a_ref.dequantize() + qa = torch.quantize_per_tensor(a, scale=scale, zero_point=zero_point, + dtype=torch_type) + + # Test the ops.quantized separately, because None is not treated. + a_hat = torch.ops.quantized.max_pool2d( + qa, kernel_size=_pair(kernel), + stride=_pair(kernel if stride is None else stride), + padding=_pair(padding), dilation=_pair(dilation), ceil_mode=ceil_mode) + self.assertEqual(a_ref, a_hat.dequantize(), + msg="ops.quantized.max_pool2d results are off") + """Tests 2D max pool operation on quantized tensors.""" @given(X=hu.tensor(shapes=hu.array_shapes(min_dims=3, max_dims=4, min_side=1, max_side=10), @@ -1621,19 +1813,23 @@ def test_adaptive_avg_pool(self): error_message = r"Results are off for {}:\n\tExpected:\n{}\n\tGot:\n{}" for name, op in ops_under_test.items(): - qX_hat = op(qX, output_size=output_size) - # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 - self.assertEqualIgnoreType( - X_ref, qX_hat.int_repr(), atol=1.0, - rtol=0, msg=error_message.format(name, X_ref, qX_hat)) - self.assertEqual( - scale, qX_hat.q_scale(), - msg=error_message.format(name + '.scale', scale, - qX_hat.q_scale())) - self.assertEqual( - zero_point, qX_hat.q_zero_point(), - msg=error_message.format(name + '.zero_point', scale, - qX_hat.q_zero_point())) + # TODO: torch.cuda.is_available() should be swapped for a flag that checks if cudnn + # is enabled in the build when cudnn supports adaptive average pooling + devices = ["cpu", "cuda"] if (dim == 2 and torch.cuda.is_available()) else ["cpu"] + for device in devices: + qX_hat = op(qX.to(device=device), output_size=output_size) + # TODO(#38095): Replace assertEqualIgnoreType. 
See issue #38095 + self.assertEqualIgnoreType( + X_ref, qX_hat.int_repr(), atol=1.0, + rtol=0, msg=error_message.format(name, X_ref, qX_hat)) + self.assertEqual( + scale, qX_hat.q_scale(), + msg=error_message.format(name + '.scale', scale, + qX_hat.q_scale())) + self.assertEqual( + zero_point, qX_hat.q_zero_point(), + msg=error_message.format(name + '.zero_point', scale, + qX_hat.q_zero_point())) """Tests adaptive average pool operation on NHWC quantized tensors.""" def test_adaptive_avg_pool3d_ndhwc(self): @@ -2066,7 +2262,7 @@ def test_group_norm(self): torch_type, Y_scale, Y_zero_point, channels_last, \ affine = test_case num_channels = num_groups * channels_per_group - # minimum rank for for channels_last + # minimum rank for channels_last shapes = (batches, num_channels, elements_per_channel, 1) # In the FP kernel, sums and sums of squares are calculated in floating point. @@ -2632,7 +2828,7 @@ def forward( ] q_data = [] - reduce_range = (qengine == 'fbgemm') + reduce_range = (qengine in ('fbgemm', 'onednn')) for idx, x in enumerate(fp_data): scale, zero_point = _calculate_dynamic_qparams( x, dtype=dtype, reduce_range=reduce_range) @@ -2653,7 +2849,13 @@ def forward( mha.eval() # Prepare - mha.qconfig = torch.ao.quantization.get_default_qconfig(qengine) + if qengine_is_onednn(): + # `reduce_range` is False by default for ONEDNN backend + # but the test fails on earlier CPUs without VNNI. + # So we use a default qconfig with `reduce_range=True` here + mha.qconfig = torch.ao.quantization.get_default_qconfig() + else: + mha.qconfig = torch.ao.quantization.get_default_qconfig(qengine) mha_prepared = torch.ao.quantization.prepare( mha, prepare_custom_config_dict=custom_module_config) @@ -2746,7 +2948,7 @@ def test_qlinear(self, batch_size, input_channels, output_channels, (b_value_max - b_value_min) + b_value_min ).astype(np.int32) if use_bias else None - if torch.backends.quantized.engine == 'fbgemm': + if torch.backends.quantized.engine in ('fbgemm', 'onednn'): avoid_vpmaddubsw_overflow_linear( batch_size, input_channels, @@ -2879,6 +3081,19 @@ def test_qlinear_legacy(self, batch_size, input_channels, output_channels): self.assertEqual(Y_fp32, Y_fp32_ref, msg="torch.ops.quantized.fbgemm_linear_dynamic results are off") + @skipIfNoFBGEMM + @given( + input_channels=st.integers(16, 32), + output_channels=st.integers(4, 8), + exponent=st.integers(0, 8)) + def test_linear_prepack_fp16_numerics(self, input_channels, output_channels, exponent): + w = torch.randn(output_channels, input_channels) * 10**exponent + bias = None + w_packed_fp16 = torch.ops.quantized.linear_prepack_fp16(w, bias) + w_unpacked_fp16 = torch.ops.quantized.linear_unpack_fp16(w_packed_fp16) + w_fp16 = w.to(torch.float16).to(torch.float32) + self.assertTrue(torch.equal(w_fp16, w_unpacked_fp16[0])) + @skipIfNoFBGEMM def test_qlinear_dynamic_fp16(self): @@ -2970,8 +3185,8 @@ def test_qlstmGRU(self, num_batches, input_size, hidden_size, for rnn_type in ['LSTM', 'GRU']: for dtype in [torch.qint8, torch.float16]: - # Fp16 quantization is not supported for qnnpack - if torch.backends.quantized.engine == 'qnnpack' and dtype == torch.float16: + # Fp16 quantization is not supported for qnnpack or onednn + if torch.backends.quantized.engine in ('qnnpack', 'onednn') and dtype == torch.float16: continue if torch.backends.quantized.engine == 'qnnpack': @@ -3104,8 +3319,8 @@ def test_qrnncell(self, num_batches, input_size, hidden_size, per_channel_quant) for rnn_type in ['LSTMCell', 'GRUCell', 'RNNTanh', 'RNNReLU']: for dtype in 
[torch.qint8, torch.float16]: - # Fp16 quantization is not supported for qnnpack - if torch.backends.quantized.engine == 'qnnpack' and dtype == torch.float16: + # Fp16 quantization is not supported for qnnpack or onednn + if torch.backends.quantized.engine in ('qnnpack', 'onednn') and dtype == torch.float16: continue if torch.backends.quantized.engine == 'qnnpack': @@ -3246,6 +3461,7 @@ class TestQuantizedLinear(TestCase): def test_qlinear(self, batch_size, input_channels, output_channels, use_bias, use_relu, use_multi_dim_input, use_channelwise): decimal_val = 4 + dtypes = [torch.quint8] if torch.backends.quantized.engine == 'qnnpack': # QNNPACK supports uint8 in the kernels. In the op we shift the int8 # weight values to uint8 to be on par with fbgemm. However, this causes @@ -3253,24 +3469,164 @@ def test_qlinear(self, batch_size, input_channels, output_channels, use_bias, # off by one results. decimal_val = 0 + # only qnnpack qengine supports qint8 when xnnpack is available + if torch.backends.xnnpack.enabled: + dtypes.append(torch.qint8) + + for dtype in dtypes: + # No support for channelwise in xnnpack (int8) + # ONEDNN does not support qint8 + if dtype == torch.qint8 and (use_channelwise or qengine_is_onednn()): + return + + nptype = np_dtype[dtype] + qlinear_prepack = torch.ops.quantized.linear_prepack + if use_relu: + qlinear = torch.ops.quantized.linear_relu + else: + qlinear = torch.ops.quantized.linear + if use_multi_dim_input: + batch_size *= 3 # Test the multi-dim input tensor + X_scale = 1.5 + X_zp = 5 + X_value_min = -128 if dtype == torch.qint8 else 0 + X_value_max = 127 if dtype == torch.qint8 else 255 + X_q0 = np.round( + np.random.rand(batch_size, input_channels) * + (X_value_max - X_value_min) + + X_value_min + ).astype(nptype) + + W_scales = np.random.rand(output_channels) + # xnnpack forces W_zp to 0 when using symmetric quantization + # ONEDNN only supports symmetric quantization of weight + if dtype == torch.qint8 or qengine_is_onednn(): + W_zps = np.zeros(output_channels).astype(np.int) + else: + W_zps = np.round(np.random.rand(output_channels) * 100 - 50).astype(np.int) + # when using symmetric quantization + # special restriction for xnnpack fully connected op weight + # [-127, 127] instead of [-128, 127] + W_value_min = -127 if dtype == torch.qint8 else -128 + W_value_max = 127 + W_q0 = np.round( + np.random.rand(output_channels, input_channels) + * (W_value_max - W_value_min) + + W_value_min + ).astype(np.int8) # weight is always int8_t + b_value_min = -10 + b_value_max = 10 + b_q0 = np.round( + np.random.rand(output_channels) * + (b_value_max - b_value_min) + b_value_min + ).astype(np.int32) if use_bias else None + if torch.backends.quantized.engine in ('fbgemm', 'onednn'): + avoid_vpmaddubsw_overflow_linear( + batch_size, + input_channels, + output_channels, + X_q0, + X_value_min, + X_value_max, + W_q0, + W_value_min, + W_value_max, + ) + X = torch.from_numpy(_dequantize( + X_q0, X_scale, X_zp)).to(dtype=torch.float) + X_q = torch.quantize_per_tensor( + X, scale=X_scale, zero_point=X_zp, dtype=dtype) + if use_channelwise: + W = torch.from_numpy(_dequantize(W_q0, W_scales.reshape( + (-1, 1)), W_zps.reshape((-1, 1)))).to(dtype=torch.float) + W_q = torch.quantize_per_channel(W, scales=torch.from_numpy(W_scales), + zero_points=torch.from_numpy(W_zps), axis=0, dtype=torch.qint8) + b = torch.from_numpy(_dequantize( + b_q0, X_scale * W_scales, 0)).to(dtype=torch.float) if use_bias else None + b_q = torch.quantize_per_channel(b, scales=torch.from_numpy(X_scale * 
W_scales), + zero_points=torch.zeros(output_channels, dtype=torch.long), + axis=0, dtype=torch.qint32) if use_bias else None + else: + W = torch.from_numpy(_dequantize( + W_q0, W_scales[0], W_zps[0])).to(dtype=torch.float) + W_q = torch.quantize_per_tensor(W, scale=W_scales[0], zero_point=( + W_zps[0].astype(int).item()), dtype=torch.qint8) + b = torch.from_numpy(_dequantize( + b_q0, X_scale * (W_scales[0].item()), 0)).to(dtype=torch.float) if use_bias else None + b_q = torch.quantize_per_tensor( + b, scale=X_scale * (W_scales[0].item()), zero_point=0, dtype=torch.qint32) if use_bias else None + # Compare X_scale * W_scale * input_channels * X_value_max * W_value_max with + # Y_scale * 255 (max for uint8). + Y_scale = 125.1234 + Y_zp = 5 + # Weight prepacking operator for quantized Linear + float_bias = b if use_bias else None + W_prepack = qlinear_prepack(W_q, float_bias) + if use_multi_dim_input: + X_q = X_q.view(3, int(batch_size / 3), input_channels) + # Quantized Linear operator with prepacked weight + Y_q = qlinear(X_q, W_prepack, Y_scale, Y_zp) + if not use_channelwise: + # Test the per-tensor quantization only + # Reference quantized Linear operator + Y_q_ref = qlinear_ref(X_q0, X_scale, X_zp, W_q0, + W_scales[0], W_zps[0], b_q0, Y_scale, Y_zp, dtype=nptype) + if use_relu: + Y_q_ref[Y_q_ref < Y_zp] = Y_zp + if use_multi_dim_input: + Y_q_ref = np.reshape( + Y_q_ref, (3, int(batch_size / 3), output_channels)) + # Assert equal + np.testing.assert_array_almost_equal(Y_q_ref, Y_q.int_repr().numpy(), decimal=decimal_val) + # Test both per-tensor and per-channel quantization + # Reference quantized result from PyTorch Linear operator + W_fp32 = W_q.dequantize().to(dtype=torch.float) + X_fp32 = X_q.dequantize().to(dtype=torch.float) + b_fp32 = b_q.dequantize().to(dtype=torch.float) if use_bias else None + Y_fp32_ref = F.linear(X_fp32, W_fp32, b_fp32) + if use_relu: + Y_fp32_ref[Y_fp32_ref < 0.0] = 0.0 + Y_q_ref2 = torch.quantize_per_tensor( + Y_fp32_ref, Y_scale, Y_zp, dtype) + # Assert equal + np.testing.assert_array_almost_equal( + Y_q_ref2.int_repr().numpy(), Y_q.int_repr().numpy(), decimal=decimal_val) + + @given(batch_size=st.integers(1, 4), + # in cudnn v. 8.4.0, there is a limitation that input channels + # should be a multiple of 4 for int8 tensors. in cudnn v.8.3.3 + # this should be a multiple of 16 + input_channels=st.sampled_from([4, 8, 12, 16, 32]), + # constraints on output channels appear to be relax, as it seems we can use any positive integer here + # except 1. It is not clear why 1 will not work. 
TODO: check with Yang + output_channels=st.integers(2, 36), + use_bias=st.booleans(), + use_relu=st.booleans(), + use_multi_dim_input=st.booleans(), + use_channelwise=st.sampled_from([False])) # channelwise currently not supported for qlinear cudnn + @skipIfNoFBGEMM + @unittest.skipIf(not TEST_CUDNN, "cudnn is not enabled.") + @unittest.skip("Local only - currently the qlinear_cudnn op is bulid " + "with USE_EXPERIMENTAL_CUDNN_V8_API, we can enable the test " + "after it is built by default") + # TODO: check with yang regarding CUDNN flags + def test_qlinear_cudnn(self, batch_size, input_channels, output_channels, use_bias, + use_relu, use_multi_dim_input, use_channelwise): qlinear_prepack = torch.ops.quantized.linear_prepack if use_relu: - qlinear = torch.ops.quantized.linear_relu + qlinear_op = torch.ops.quantized.linear_relu else: - qlinear = torch.ops.quantized.linear - if use_multi_dim_input: - batch_size *= 3 # Test the multi-dim input tensor + qlinear_op = torch.ops.quantized.linear X_scale = 1.5 - X_zp = 5 - X_value_min = 0 - X_value_max = 225 + X_zp = 0 + X_value_min = -128 + X_value_max = 127 X_q0 = np.round( np.random.rand(batch_size, input_channels) * (X_value_max - X_value_min) - + X_value_min - ).astype(np.uint8) - W_scales = np.random.rand(output_channels) - W_zps = np.round(np.random.rand(output_channels) * 100 - 50).astype(np.int) + + X_value_min).astype(np.int8) + W_scale = 2.5 + W_zp = 0 W_value_min = -128 W_value_max = 127 W_q0 = np.round( @@ -3284,6 +3640,15 @@ def test_qlinear(self, batch_size, input_channels, output_channels, use_bias, np.random.rand(output_channels) * (b_value_max - b_value_min) + b_value_min ).astype(np.int32) if use_bias else None + if use_bias: + b_value_min = -10 + b_value_max = 10 + b_q0 = np.round( + np.random.rand(output_channels) * + (b_value_max - b_value_min) + b_value_min + ).astype(np.int32) + else: + bias = None avoid_vpmaddubsw_overflow_linear( batch_size, input_channels, @@ -3295,65 +3660,31 @@ def test_qlinear(self, batch_size, input_channels, output_channels, use_bias, W_value_min, W_value_max, ) + quant_dtype = torch.qint8 X = torch.from_numpy(_dequantize( - X_q0, X_scale, X_zp)).to(dtype=torch.float) + X_q0, X_scale, X_zp)).to(dtype=torch.float).to(device="cuda") X_q = torch.quantize_per_tensor( - X, scale=X_scale, zero_point=X_zp, dtype=torch.quint8) - if use_channelwise: - W = torch.from_numpy(_dequantize(W_q0, W_scales.reshape( - (-1, 1)), W_zps.reshape((-1, 1)))).to(dtype=torch.float) - W_q = torch.quantize_per_channel(W, scales=torch.from_numpy(W_scales), - zero_points=torch.from_numpy(W_zps), axis=0, dtype=torch.qint8) - b = torch.from_numpy(_dequantize( - b_q0, X_scale * W_scales, 0)).to(dtype=torch.float) if use_bias else None - b_q = torch.quantize_per_channel(b, scales=torch.from_numpy(X_scale * W_scales), - zero_points=torch.zeros(output_channels, dtype=torch.long), - axis=0, dtype=torch.qint32) if use_bias else None - else: - W = torch.from_numpy(_dequantize( - W_q0, W_scales[0], W_zps[0])).to(dtype=torch.float) - W_q = torch.quantize_per_tensor(W, scale=W_scales[0], zero_point=( - W_zps[0].astype(int).item()), dtype=torch.qint8) - b = torch.from_numpy(_dequantize( - b_q0, X_scale * (W_scales[0].item()), 0)).to(dtype=torch.float) if use_bias else None - b_q = torch.quantize_per_tensor( - b, scale=X_scale * (W_scales[0].item()), zero_point=0, dtype=torch.qint32) if use_bias else None - # Compare X_scale * W_scale * input_channels * X_value_max * W_value_max with - # Y_scale * 255 (max for uint8). 
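(Editor's aside, not part of the patch above or below.) For readers skimming these hunks, the flow that test_qlinear and test_qlinear_cudnn exercise is: quantize the activation, prepack the qint8 weight together with an optional float bias, run the quantized linear op at a chosen output scale and zero point, and compare against a float F.linear reference requantized with the same output qparams. A minimal, self-contained sketch of that flow, with arbitrary shapes and qparams, assuming a CPU build where the fbgemm or qnnpack engine is available:

import torch
import torch.nn.functional as F

X = torch.randn(4, 16)   # activation
W = torch.randn(8, 16)   # weight
b = torch.randn(8)       # float bias

# Quantize activation (quint8) and weight (qint8); values here are arbitrary.
X_q = torch.quantize_per_tensor(X, scale=0.05, zero_point=0, dtype=torch.quint8)
W_q = torch.quantize_per_tensor(W, scale=0.02, zero_point=0, dtype=torch.qint8)

# Prepack the quantized weight with the float bias, then run the quantized linear op.
W_prepack = torch.ops.quantized.linear_prepack(W_q, b)
Y_q = torch.ops.quantized.linear(X_q, W_prepack, 0.1, 0)  # output scale, output zero_point

# Reference: float linear on the dequantized operands, requantized with the same qparams.
Y_ref = torch.quantize_per_tensor(F.linear(X_q.dequantize(), W_q.dequantize(), b),
                                  0.1, 0, torch.quint8)
# Small differences between Y_q and Y_ref come from rounding and backend-specific behaviour.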
- Y_scale = 125.1234 - Y_zp = 5 + X, scale=X_scale, zero_point=X_zp, dtype=quant_dtype) + W = torch.from_numpy(_dequantize( + W_q0, W_scale, W_zp)).to(dtype=torch.float).to(device="cuda") + W_q = torch.quantize_per_tensor(W, scale=W_scale, zero_point=W_zp, dtype=quant_dtype) + b = torch.from_numpy(_dequantize( + b_q0, X_scale * (W_zp), 0)).to(dtype=torch.float).to(device="cuda") if use_bias else None + b_q = torch.quantize_per_tensor( + b, scale=X_scale * W_scale, zero_point=0, dtype=quant_dtype) if use_bias else None + Y_scale = 0.5 + Y_zp = 0 # Weight prepacking operator for quantized Linear float_bias = b if use_bias else None - W_prepack = qlinear_prepack(W_q, float_bias) - if use_multi_dim_input: - X_q = X_q.view(3, int(batch_size / 3), input_channels) + W_prepack = qlinear_prepack(W_q, float_bias if use_bias else None) # Quantized Linear operator with prepacked weight - Y_q = qlinear(X_q, W_prepack, Y_scale, Y_zp) - if not use_channelwise: - # Test the per-tensor quantization only - # Reference quantized Linear operator - Y_q_ref = qlinear_ref(X_q0, X_scale, X_zp, W_q0, - W_scales[0], W_zps[0], b_q0, Y_scale, Y_zp) - if use_relu: - Y_q_ref[Y_q_ref < Y_zp] = Y_zp - if use_multi_dim_input: - Y_q_ref = np.reshape( - Y_q_ref, (3, int(batch_size / 3), output_channels)) - # Assert equal - np.testing.assert_array_almost_equal(Y_q_ref, Y_q.int_repr().numpy(), decimal=decimal_val) - # Test both per-tensor and per-channel quantization - # Reference quantized result from PyTorch Linear operator - W_fp32 = W_q.dequantize().to(dtype=torch.float) - X_fp32 = X_q.dequantize().to(dtype=torch.float) - b_fp32 = b_q.dequantize().to(dtype=torch.float) if use_bias else None - Y_fp32_ref = F.linear(X_fp32, W_fp32, b_fp32) + Y_q = qlinear_op(X_q, W_prepack, Y_scale, Y_zp).to(device="cpu") + Y_q_ref = qlinear_ref(X_q0, X_scale, X_zp, W_q0, + W_scale, W_zp, b_q0, Y_scale, Y_zp, dtype=np.int8) if use_relu: - Y_fp32_ref[Y_fp32_ref < 0.0] = 0.0 - Y_q_ref2 = torch.quantize_per_tensor( - Y_fp32_ref, Y_scale, Y_zp, torch.quint8) - # Assert equal - np.testing.assert_array_almost_equal( - Y_q_ref2.int_repr().numpy(), Y_q.int_repr().numpy(), decimal=decimal_val) + Y_q_ref[Y_q_ref < Y_zp] = Y_zp + decimal_val = 0 + np.testing.assert_array_almost_equal(Y_q_ref, Y_q.int_repr().numpy(), decimal=decimal_val) """Tests the correctness of the quantized::linear_unpack op.""" @given(W=hu.tensor(shapes=hu.array_shapes(2, 2,), @@ -3370,6 +3701,13 @@ def test_qlinear_unpack(self, W, use_channelwise): qlinear_prepack = torch.ops.quantized.linear_prepack qlinear_unpack = torch.ops.quantized.linear_unpack + # ONEDNN only supports symmetric quantization of weight + if qengine_is_onednn(): + if use_channelwise: + W_zps = torch.zeros(output_channels).to(torch.int64) + else: + W_zp = 0 + W = torch.from_numpy(W) if use_channelwise: W_q = torch.quantize_per_channel( @@ -3833,6 +4171,10 @@ def _test_qconv_unpack_impl(self, qconv_prepack_fn, qconv_unpack_fn, inputs, if channelwise and transposed: # currently transposed conv and per-channel per quantization does not work return + # ONEDNN only supports symmetric quantization of weight and zero output padding + if qengine_is_onednn(): + W_zero_point = 0 + o_pads = len(o_pads) * [0] if o_pads is not None else None if channelwise: if transposed: output_channels = W.shape[1] # IC OC/G @@ -3971,6 +4313,9 @@ def _test_qconv_impl( weight_dtype=torch.qint8, output_dtype=torch.quint8, ): + # ONEDNN only supports symmetric quantization of weight + if qengine_is_onednn() and W_zero_point is not None: + 
W_zero_point = len(W_zero_point) * [0] (X, W), (X_q, W_q), bias_float = self._make_qconv_tensors( batch_size, input_channels_per_group, input_feature_map_shape, output_channels_per_group, groups, kernels, @@ -4055,7 +4400,7 @@ def _test_qconv_impl( Y_scale=st.floats(4.2, 5.6), Y_zero_point=st.integers(0, 4), use_bias=st.booleans(), - use_relu=st.sampled_from([False]), + use_relu=st.booleans(), use_channelwise=st.booleans()) @override_qengines def test_qconv2d( @@ -4103,26 +4448,33 @@ def test_qconv2d( dilations, groups, ) - self._test_qconv_impl( - qconv, qconv_prepack, conv_op, batch_size, - input_channels_per_group, (height, width), - output_channels_per_group, groups, kernels, strides, pads, None, - dilations, X_scale, X_zero_point, W_scale, W_zero_point, - Y_scale, Y_zero_point, use_bias, use_relu, use_channelwise, False) + act_qdtypes = [torch.quint8] + # Only qnnpack qengine supportes qint8 + if qengine_is_qnnpack() and torch.backends.xnnpack.enabled: + act_qdtypes.append(torch.qint8) + + for X_qdtype in act_qdtypes: + if X_qdtype == torch.qint8: + W_zero_point = [0 for i in range(len(W_zero_point))] + + self._test_qconv_impl( + qconv, qconv_prepack, conv_op, batch_size, + input_channels_per_group, (height, width), + output_channels_per_group, groups, kernels, strides, pads, None, + dilations, X_scale, X_zero_point, W_scale, W_zero_point, + Y_scale, Y_zero_point, use_bias, use_relu, use_channelwise, False, input_dtype=X_qdtype, output_dtype=X_qdtype) + + # TODO: merge this test with test_qconv2d when CUDNN runtime flags becomes available + """Tests the correctness of quantized 2D convolution cudnn op.""" @given(batch_size=st.integers(1, 3), - # only multiples of 16 are supported right now, might be fixed in - # next release of cudnn - # input_channels_per_group=st.sampled_from([2, 4, 5, 8, 16, 32]), - input_channels_per_group=st.sampled_from([16, 32]), + # cudnn only supports multiples of 4, but we have explicitly added padding on the backend + input_channels_per_group=st.integers(1, 32), height=st.integers(10, 16), width=st.integers(7, 14), - # only multiples of 16 are supported right now, might be fixed in - # next release of cudnn - # output_channels_per_group=st.sampled_from([2, 4, 5, 8, 16, 32]), - output_channels_per_group=st.sampled_from([16, 32]), - # groups=st.integers(1, 3), - groups=st.integers(1, 1), + # cudnn only supports multiples of 4, but we have explicitly added padding on the backend + output_channels_per_group=st.integers(1, 32), + groups=st.integers(1, 1), # currently padding only supports groups=1 kernel_h=st.integers(1, 7), kernel_w=st.integers(1, 7), stride_h=st.integers(1, 2), @@ -4131,6 +4483,8 @@ def test_qconv2d( pad_w=st.integers(0, 2), # result for dilation == 2 is not correct # dilation=st.integers(1, 2), + # currently cudnn has only been verified to work for dilation = 1 + # TODO: check backend works for dilation > 1 dilation=st.integers(1, 1), X_scale=st.floats(1.2, 1.6), X_zero_point=st.sampled_from([0]), @@ -4138,10 +4492,8 @@ def test_qconv2d( W_zero_point=st.lists(st.integers(0, 0), min_size=1, max_size=2), Y_scale=st.floats(4.2, 5.6), Y_zero_point=st.sampled_from([0]), - # TODO: enable bias - use_bias=st.sampled_from([False]), - # TODO: enable relu - use_relu=st.sampled_from([False]), + use_bias=st.booleans(), + use_relu=st.booleans(), # TODO: enable channelwise use_channelwise=st.sampled_from([False])) @skipIfNoFBGEMM @@ -4181,8 +4533,10 @@ def test_qconv2d_cudnn( pads = (pad_h, pad_w) dilations = (dilation, dilation) - qconv = 
torch.ops.quantized.conv2d_cudnn - assert not use_relu, "conv2d_relu_cudnn is not supported yet" + if use_relu: + qconv = torch.ops.quantized.conv2d_relu + else: + qconv = torch.ops.quantized.conv2d conv_op = torch.nn.Conv2d( input_channels, output_channels, @@ -4193,7 +4547,7 @@ def test_qconv2d_cudnn( groups, ).to(torch.device("cuda")) self._test_qconv_impl( - qconv, None, conv_op, batch_size, + qconv, torch.ops.quantized.conv2d_prepack, conv_op, batch_size, input_channels_per_group, (height, width), output_channels_per_group, groups, kernels, strides, pads, None, dilations, X_scale, X_zero_point, W_scale, W_zero_point, @@ -4269,13 +4623,14 @@ def trace_handler(p): weight_int8 = torch.quantize_per_tensor(weight, 1, 0, torch.qint8).contiguous(memory_format=torch.channels_last) scale = 1.0 zero_point = 0 - conv_op = torch.ops.quantized.conv2d_cudnn + conv_op = torch.ops.quantized.conv2d + weight_prepacked = torch.ops.quantized.conv2d_prepack(weight_int8, None, stride, padding, dilation, groups) with profile( activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], schedule=my_schedule, on_trace_ready=trace_handler) as prof: for i in range(30): - conv_op(input_int8, weight_int8, None, stride, padding, dilation, groups, scale, zero_point) + conv_op(input_int8, weight_prepacked, scale, zero_point) prof.step() print("int8 benchmark result:") @@ -4323,7 +4678,7 @@ def test_qconv_transpose1d( return # Currently only the QNNPACK is supported if qengine_is_qnnpack() and (IS_PPC or TEST_WITH_UBSAN): return # QNNPACK doesn't support these - assume(o_pad < stride or o_pad < dilation) + assume(o_pad < stride and o_pad < dilation) input_channels = input_channels_per_group * groups output_channels = output_channels_per_group * groups @@ -4346,40 +4701,51 @@ def test_qconv_transpose1d( dilation=dilations, bias=use_bias ) - X_q, W_q, bias_float = self._test_qconv_impl( - qconv, qconv_prepack, conv_op, batch_size, - input_channels_per_group, (width, ), - output_channels_per_group, groups, kernels, strides, pads, o_pads, - dilations, X_scale, X_zero_point, W_scale, W_zero_point, - Y_scale, Y_zero_point, use_bias, use_relu=False, - use_channelwise=False, use_transpose=True) - # check that this doesn't error - test_conv = torch.nn.quantized.ConvTranspose1d(input_channels, output_channels, 1) - test_conv(X_q) + act_qdtypes = [torch.quint8] + # Only qnnpack qengine supportes qint8 + if qengine_is_qnnpack() and torch.backends.xnnpack.enabled: + act_qdtypes.append(torch.qint8) - # Test the module implementation - qconv_op = torch.nn.quantized.ConvTranspose1d( - in_channels=input_channels, - out_channels=output_channels, - kernel_size=kernels, - stride=strides, - padding=pads, - output_padding=o_pads, - groups=groups, - dilation=dilations, - bias=use_bias - ) - qconv_op.scale = Y_scale - qconv_op.zero_point = Y_zero_point - qconv_op.set_weight_bias(W_q, bias_float) + for X_qdtype in act_qdtypes: + if X_qdtype == torch.qint8: + W_zero_point = [0 for i in range(len(W_zero_point))] - Y_dq_ref = conv_op(X_q.dequantize()) - Y_q_ref = torch.quantize_per_tensor(Y_dq_ref, scale=Y_scale, - zero_point=Y_zero_point, - dtype=torch.quint8) - Y_q = qconv_op(X_q) - self.assertEqual(Y_q_ref, Y_q) + X_q, W_q, bias_float = self._test_qconv_impl( + qconv, qconv_prepack, conv_op, batch_size, + input_channels_per_group, (width, ), + output_channels_per_group, groups, kernels, strides, pads, o_pads, + dilations, X_scale, X_zero_point, W_scale, W_zero_point, + Y_scale, Y_zero_point, use_bias, use_relu=False, + use_channelwise=False, 
use_transpose=True, input_dtype=X_qdtype, output_dtype=X_qdtype) + + # check that this doesn't error + test_conv = torch.nn.quantized.ConvTranspose1d(input_channels, output_channels, 1) + test_conv.scale = Y_scale + test_conv(X_q) + + # Test the module implementation + qconv_op = torch.nn.quantized.ConvTranspose1d( + in_channels=input_channels, + out_channels=output_channels, + kernel_size=kernels, + stride=strides, + padding=pads, + output_padding=o_pads, + groups=groups, + dilation=dilations, + bias=use_bias + ) + qconv_op.scale = Y_scale + qconv_op.zero_point = Y_zero_point + qconv_op.set_weight_bias(W_q, bias_float) + + Y_dq_ref = conv_op(X_q.dequantize()) + Y_q_ref = torch.quantize_per_tensor(Y_dq_ref, scale=Y_scale, + zero_point=Y_zero_point, + dtype=X_qdtype) + Y_q = qconv_op(X_q) + self.assertEqual(Y_q_ref, Y_q) """Tests the correctness of quantized convolution op.""" @@ -4432,8 +4798,11 @@ def test_qconv_transpose2d( use_bias): if qengine_is_qnnpack() and (IS_PPC or TEST_WITH_UBSAN): return # QNNPACK doesn't support these - assume(o_pad_h < stride_h or o_pad_h < dilation) - assume(o_pad_w < stride_w or o_pad_w < dilation) + # ONEDNN does not support output paddings + if qengine_is_onednn() and (o_pad_h, o_pad_w) != (0, 0): + return + assume(o_pad_h < stride_h and o_pad_h < dilation) + assume(o_pad_w < stride_w and o_pad_w < dilation) input_channels = input_channels_per_group * groups output_channels = output_channels_per_group * groups @@ -4456,40 +4825,50 @@ def test_qconv_transpose2d( dilation=dilations, bias=use_bias ) - X_q, W_q, bias_float = self._test_qconv_impl( - qconv, qconv_prepack, conv_op, batch_size, - input_channels_per_group, (height, width), - output_channels_per_group, groups, kernels, strides, pads, o_pads, - dilations, X_scale, X_zero_point, W_scale, W_zero_point, - Y_scale, Y_zero_point, use_bias, use_relu=False, - use_channelwise=False, use_transpose=True) + act_qdtypes = [torch.quint8] + # Only qnnpack qengine supportes qint8 + if qengine_is_qnnpack() and torch.backends.xnnpack.enabled: + act_qdtypes.append(torch.qint8) - # check that this doesn't error - test_conv = torch.nn.quantized.ConvTranspose2d(input_channels, output_channels, 1) - test_conv(X_q) + for X_qdtype in act_qdtypes: + if X_qdtype == torch.qint8: + W_zero_point = [0 for i in range(len(W_zero_point))] - # Test the module implementation - qconv_op = torch.nn.quantized.ConvTranspose2d( - in_channels=input_channels, - out_channels=output_channels, - kernel_size=kernels, - stride=strides, - padding=pads, - output_padding=o_pads, - groups=groups, - dilation=dilations, - bias=use_bias - ) - qconv_op.scale = Y_scale - qconv_op.zero_point = Y_zero_point - qconv_op.set_weight_bias(W_q, bias_float) + X_q, W_q, bias_float = self._test_qconv_impl( + qconv, qconv_prepack, conv_op, batch_size, + input_channels_per_group, (height, width), + output_channels_per_group, groups, kernels, strides, pads, o_pads, + dilations, X_scale, X_zero_point, W_scale, W_zero_point, + Y_scale, Y_zero_point, use_bias, use_relu=False, + use_channelwise=False, use_transpose=True, input_dtype=X_qdtype, output_dtype=X_qdtype) + + # check that this doesn't error + test_conv = torch.nn.quantized.ConvTranspose2d(input_channels, output_channels, 1) + test_conv.scale = Y_scale + test_conv(X_q) + + # Test the module implementation + qconv_op = torch.nn.quantized.ConvTranspose2d( + in_channels=input_channels, + out_channels=output_channels, + kernel_size=kernels, + stride=strides, + padding=pads, + output_padding=o_pads, + groups=groups, 
+ dilation=dilations, + bias=use_bias + ) + qconv_op.scale = Y_scale + qconv_op.zero_point = Y_zero_point + qconv_op.set_weight_bias(W_q, bias_float) - Y_dq_ref = conv_op(X_q.dequantize()) - Y_q_ref = torch.quantize_per_tensor(Y_dq_ref, scale=Y_scale, - zero_point=Y_zero_point, - dtype=torch.quint8) - Y_q = qconv_op(X_q) - self.assertEqual(Y_q_ref, Y_q) + Y_dq_ref = conv_op(X_q.dequantize()) + Y_q_ref = torch.quantize_per_tensor(Y_dq_ref, scale=Y_scale, + zero_point=Y_zero_point, + dtype=X_qdtype) + Y_q = qconv_op(X_q) + self.assertEqual(Y_q_ref, Y_q) """Tests the correctness of quantized convolution op.""" @given(batch_size=st.integers(1, 3), @@ -4551,6 +4930,9 @@ def test_qconv_transpose3d( use_bias): if qengine_is_qnnpack(): return # QNNPACK doesn't support this + # ONEDNN doesn't support output paddings + if qengine_is_onednn() and (o_pad_t, o_pad_h, o_pad_w) != (0, 0, 0): + return assume(o_pad_t < stride_t or o_pad_t < dilation) assume(o_pad_h < stride_h or o_pad_h < dilation) assume(o_pad_w < stride_w or o_pad_w < dilation) @@ -4586,6 +4968,7 @@ def test_qconv_transpose3d( # check that this doesn't error test_conv = torch.nn.quantized.ConvTranspose3d(input_channels, output_channels, 1) + test_conv.scale = Y_scale test_conv(X_q) # Test the module implementation @@ -4730,12 +5113,11 @@ def test_qconv1d( use_relu, use_channelwise, ): - input_channels = input_channels_per_group * groups output_channels = output_channels_per_group * groups if torch.backends.quantized.engine == 'qnnpack': use_channelwise = False - true_conv1d = torch.nn.Conv1d( + conv1d = torch.nn.Conv1d( input_channels, output_channels, kernel, @@ -4748,12 +5130,104 @@ def test_qconv1d( qconv = torch.ops.quantized.conv1d if use_relu: qconv = torch.ops.quantized.conv1d_relu + + act_qdtypes = [torch.quint8] + # Only qnnpack qengine supportes qint8 + if qengine_is_qnnpack() and torch.backends.xnnpack.enabled: + act_qdtypes.append(torch.qint8) + + for X_qdtype in act_qdtypes: + if X_qdtype == torch.qint8: + W_zero_point = [0 for i in range(len(W_zero_point))] + + self._test_qconv_impl( + qconv, qconv_prepack, conv1d, batch_size, + input_channels_per_group, (length, ), + output_channels_per_group, groups, kernel, [stride], [pad], None, + [dilation], X_scale, X_zero_point, W_scale, W_zero_point, + Y_scale, Y_zero_point, use_bias, use_relu, use_channelwise, False, + input_dtype=X_qdtype, output_dtype=X_qdtype) + + # TODO: merge this test with test_qconv1d when CUDNN runtime flags becomes available + """Tests the correctness of quantized 1D convolution cudnn op.""" + @given(batch_size=st.integers(1, 6), + # cudnn only supports multiples of 4, but we have explicitly added padding on the backend + input_channels_per_group=st.integers(1, 32), + # cudnn only supports multiples of 4, but we have explicitly added padding on the backend + output_channels_per_group=st.integers(1, 32), + groups=st.integers(1, 1), # currently padding only supports groups=1 + length=st.integers(4, 16), + kernel=st.integers(1, 7), + stride=st.integers(1, 2), + pad=st.integers(0, 2), + # currently cudnn has only been verified to work for dilation = 1 + # TODO: check backend works for dilation > 1 + dilation=st.integers(1, 1), + X_scale=st.floats(1.2, 1.6), + # currently conv cudnn backend is only implemented for int8 symmetric + X_zero_point=st.sampled_from([0]), + W_scale=st.lists(st.floats(0.2, 1.6), min_size=1, max_size=2), + # currently conv cudnn backend is only implemented for int8 symmetric + W_zero_point=st.lists(st.integers(0, 0), min_size=1, 
max_size=2), + Y_scale=st.floats(4.2, 5.6), + # currently conv cudnn backend is only implemented for int8 symmetric + Y_zero_point=st.sampled_from([0]), + use_bias=st.booleans(), + use_relu=st.booleans(), + # TODO: enable channelwise + use_channelwise=st.sampled_from([False])) + @skipIfNoFBGEMM + @unittest.skipIf(not TEST_CUDNN, "cudnn is not enabled.") + @unittest.skip("Local only - currently the qconv1d_cudnn op is bulid " + "with USE_EXPERIMENTAL_CUDNN_V8_API, we can enable the test " + "after it is built by default") + def test_qconv1d_cudnn( + self, + batch_size, + input_channels_per_group, + output_channels_per_group, + groups, + length, + kernel, + stride, + pad, + dilation, + X_scale, + X_zero_point, + W_scale, + W_zero_point, + Y_scale, + Y_zero_point, + use_bias, + use_relu, + use_channelwise, + ): + input_channels = input_channels_per_group * groups + output_channels = output_channels_per_group * groups + + conv1d = torch.nn.Conv1d( + input_channels, + output_channels, + kernel, + stride, + pad, + dilation, + groups, + ).to(torch.device("cuda")) + qconv_prepack = torch.ops.quantized.conv1d_prepack + if use_relu: + qconv = torch.ops.quantized.conv1d_relu + else: + qconv = torch.ops.quantized.conv1d + self._test_qconv_impl( - qconv, qconv_prepack, true_conv1d, batch_size, + qconv, qconv_prepack, conv1d, batch_size, input_channels_per_group, (length, ), output_channels_per_group, groups, kernel, [stride], [pad], None, [dilation], X_scale, X_zero_point, W_scale, W_zero_point, - Y_scale, Y_zero_point, use_bias, use_relu, use_channelwise, False) + Y_scale, Y_zero_point, use_bias, use_relu, use_channelwise, False, + device=torch.device("cuda"), + input_dtype=torch.qint8, weight_dtype=torch.qint8, output_dtype=torch.qint8) @given(batch_size=st.integers(1, 4), input_channels_per_group=st.sampled_from([2, 4, 5, 8, 16]), @@ -5088,7 +5562,7 @@ def test_qnnpack_sigmoid_sweep(self): """Tests the correctness of the quantized::add (qnnpack) op.""" @settings(suppress_health_check=(HealthCheck.filter_too_much,)) @given(A=hu.tensor(shapes=hu.array_shapes(1, 5, 1, 5), - qparams=hu.qparams(dtypes=torch.quint8)), + qparams=hu.qparams(dtypes=[torch.quint8, torch.qint8])), zero_point=st.sampled_from([0, 2, 5, 15, 127]), scale_A=st.sampled_from([0.001, 0.057, 0.889, 12.3]), scale_B=st.sampled_from([0.008, 0.0821, 0.67, 7]), @@ -5096,39 +5570,96 @@ def test_qnnpack_sigmoid_sweep(self): def test_qnnpack_add(self, A, zero_point, scale_A, scale_B, scale_C): with override_quantized_engine('qnnpack'): A_temp = A - A, (scale_a, zero_point_A, torch_type) = A_temp - B, (scale_b, zero_point_B, torch_type) = A_temp - A = torch.from_numpy(A) - B = torch.from_numpy(B) - - assume(scale_A // scale_C >= 2**-14) - assume(scale_A // scale_C < 2**8) - assume(scale_B // scale_C >= 2**-14) - assume(scale_B // scale_C < 2**8) - - zero_point_C = 127 - qA = torch.quantize_per_tensor(A, scale=scale_A, zero_point=zero_point, - dtype=torch.quint8) - qB = torch.quantize_per_tensor(B, scale=scale_B, zero_point=zero_point, - dtype=torch.quint8) + for channels_last in [True, False]: + if channels_last and len(A_temp[0].shape) != 4: + continue + A, (scale_a, zero_point_A, torch_type) = A_temp + B, (scale_b, zero_point_B, torch_type) = A_temp + A = torch.from_numpy(A) + B = torch.from_numpy(B) - # Add ground truth - C = (qA.dequantize() + qB.dequantize()).numpy() + if torch_type == torch.qint8 and not torch.backends.xnnpack.enabled: + continue - qC = _quantize(C, scale_C, zero_point_C) + if channels_last: + A = 
A.to(memory_format=torch.channels_last) + B = B.to(memory_format=torch.channels_last) + assume(scale_A // scale_C >= 2**-14) + assume(scale_A // scale_C < 2**8) + assume(scale_B // scale_C >= 2**-14) + assume(scale_B // scale_C < 2**8) - qC_qnnp = torch.ops.quantized.add(qA, qB, scale_C, zero_point_C) + zero_point_C = 127 + np_dtype = np.uint8 - np.testing.assert_equal(qC, qC_qnnp.int_repr(), - "Quantized addition failed.") + if torch_type == torch.qint8: + zero_point_C = 0 + np_dtype = np.int8 - Crelu = C.copy() - Crelu[C < 0] = 0 - qCrelu = torch.quantize_per_tensor(torch.from_numpy(Crelu), scale_C, - zero_point_C, dtype=torch.quint8) - qCrelu_hat = torch.ops.quantized.add_relu(qA, qB, scale=scale_C, zero_point=zero_point_C) - np.testing.assert_equal(qCrelu.int_repr().numpy(), qCrelu_hat.int_repr(), - "Quantized addition with ReLU failed.") + qA = torch.quantize_per_tensor(A, scale=scale_A, zero_point=zero_point, + dtype=torch_type) + qB = torch.quantize_per_tensor(B, scale=scale_B, zero_point=zero_point, + dtype=torch_type) + + # Add ground truth + C = (qA.dequantize() + qB.dequantize()).numpy() + + qC = _quantize(C, scale_C, zero_point_C, dtype=np_dtype) + + qC_qnnp = torch.ops.quantized.add(qA, qB, scale_C, zero_point_C) + + np.testing.assert_equal(qC, qC_qnnp.int_repr(), + "Quantized addition failed.") + + Crelu = C.copy() + Crelu[C < 0] = 0 + qCrelu = torch.quantize_per_tensor(torch.from_numpy(Crelu), scale_C, + zero_point_C, dtype=torch_type) + qCrelu_hat = torch.ops.quantized.add_relu(qA, qB, scale=scale_C, zero_point=zero_point_C) + np.testing.assert_equal(qCrelu.int_repr().numpy(), qCrelu_hat.int_repr(), + "Quantized addition with ReLU failed.") + + """Tests that quantized add works with broadcasting """ + def test_qnnpack_add_broadcast(self): + def _run_test(A, B): + qA = torch.quantize_per_tensor(A, 0.02, 0, dtype) + qB = torch.quantize_per_tensor(B, 0.04, 2, dtype) + + output_scale = 0.01 + output_zp = 1 + + # ground truth + C = qA.dequantize() + qB.dequantize() + qC = torch.quantize_per_tensor(C, output_scale, output_zp, dtype) + + # quantized + qC_hat_1 = torch.ops.quantized.add(qA, qB, output_scale, output_zp) + qC_hat_2 = torch.ops.quantized.add(qB, qA, output_scale, output_zp) + + self.assertTrue(torch.allclose(qC.dequantize(), qC_hat_1.dequantize())) + self.assertTrue(torch.allclose(qC.dequantize(), qC_hat_2.dequantize())) + + with override_quantized_engine("qnnpack"): + for dtype in (torch.qint8, torch.quint8): + if dtype == torch.qint8 and not torch.backends.xnnpack.enabled: + continue + + for channels_last in [True, False]: + # 4d + A = torch.randn(1, 3, 4, 4) + B = torch.randn(1, 1, 1, 1) + if channels_last: + A = A.to(memory_format=torch.channels_last) + B = B.to(memory_format=torch.channels_last) + _run_test(A, B) + + # 5d + C = torch.randn(1, 3, 4, 4, 4) + D = torch.randn(1, 1, 1, 1, 1) + if channels_last: + C = C.to(memory_format=torch.channels_last_3d) + D = D.to(memory_format=torch.channels_last_3d) + _run_test(C, D) """Tests the correctness of quantized::qnnpack_maxpool2d op.""" @given(A=hu.tensor(shapes=hu.array_shapes(4, 4, 3, 5), diff --git a/test/quantization/core/test_quantized_tensor.py b/test/quantization/core/test_quantized_tensor.py index 45931637eb68..61dda57268bd 100644 --- a/test/quantization/core/test_quantized_tensor.py +++ b/test/quantization/core/test_quantized_tensor.py @@ -140,6 +140,56 @@ def _compress_uniform_simplified(X, bit_rate, xmin, xmax, fp16_scale_bias=True): return Xq, loss class TestQuantizedTensor(TestCase): + def 
test_per_tensor_qtensor_to_memory_format(self): + n = np.random.randint(1, 10) + c = np.random.randint(2, 10) + h = np.random.randint(2, 10) + w = np.random.randint(2, 10) + x = torch.rand(n, c, h, w) + scale = np.random.uniform(0.1, 1.0) + zero_point = np.random.randint(0.0, 10) + qints = [torch.qint8, torch.quint8, torch.qint32] + dtype = qints[np.random.randint(0, len(qints))] + qx = torch.quantize_per_tensor(x, scale=scale, zero_point=zero_point, dtype=dtype) + x_nhwc = x.to(memory_format=torch.channels_last) + qx_nhwc_using_to = qx.to(memory_format=torch.channels_last) + qx_nhwc_using_contiguous = qx.contiguous(memory_format=torch.channels_last) + self.assertEqual(qx_nhwc_using_to.stride(), qx_nhwc_using_contiguous.stride()) + self.assertEqual(qx_nhwc_using_to.stride(), x_nhwc.stride()) + + # When the last two dimensions of a 4D tensor are both size 1 or if c == 1, we have a degenerate case + # see https://pytorch.org/tutorials/intermediate/memory_format_tutorial.html + # In this case, the output of torch.Tensor.to and torch.Tensor.contiguous should not be the same + x = torch.rand(10, 2, 1, 1) + qx = torch.quantize_per_tensor(x, scale=scale, zero_point=zero_point, dtype=dtype) + qx_nhwc_using_to = qx.to(memory_format=torch.channels_last) + qx_nhwc_using_contiguous = qx.contiguous(memory_format=torch.channels_last) + self.assertNotEqual(qx_nhwc_using_to.stride(), qx_nhwc_using_contiguous.stride()) + + x = torch.rand(10, 1, 2, 2) + qx = torch.quantize_per_tensor(x, scale=scale, zero_point=zero_point, dtype=dtype) + qx_nhwc_using_to = qx.to(memory_format=torch.channels_last) + qx_nhwc_using_contiguous = qx.contiguous(memory_format=torch.channels_last) + self.assertNotEqual(qx_nhwc_using_to.stride(), qx_nhwc_using_contiguous.stride()) + + def test_per_channel_qtensor_to_memory_format(self): + n = np.random.randint(1, 10) + c = np.random.randint(2, 10) + h = np.random.randint(2, 10) + w = np.random.randint(2, 10) + x = torch.rand(n, c, h, w) + x_nhwc = x.to(memory_format=torch.channels_last) + scale = np.random.uniform(0.1, 1.0) + zero_point = np.random.randint(0.0, 10) + qints = [torch.qint8, torch.quint8, torch.qint32] + dtype = qints[np.random.randint(0, len(qints))] + for axis in range(x.ndim): + scales = torch.rand(x.size(axis)) + 0.00001 + zero_points = torch.randint(low=0, high=10, size=(x.size(axis), )) + qx = torch.quantize_per_channel(x, scales=scales, zero_points=zero_points, dtype=dtype, axis=axis) + qx_nhwc_using_to = qx.to(memory_format=torch.channels_last) + self.assertEqual(qx_nhwc_using_to.stride(), x_nhwc.stride()) + @unittest.skipIf(not TEST_CUDA, "No gpu is available.") def test_qtensor_cuda(self): self._test_qtensor(torch.device('cuda')) @@ -304,25 +354,33 @@ def test_qtensor_float_assignment(self): # item scale = 1.0 zero_point = 2 - r = torch.ones(1, dtype=torch.float) - for dtype in [torch.qint8, torch.quint8, torch.qint32]: - qr = torch.quantize_per_tensor(r, scale, zero_point, dtype=dtype) - self.assertEqual(qr.item(), 1) - self.assertEqual(qr[0].item(), 1) - # assignment - self.assertTrue(qr[0].is_quantized) - qr[0] = 11.3 # float assignment - self.assertEqual(qr.item(), 11) - x = torch.ones(1, dtype=torch.float) * 15.3 - # Copying from a float Tensor - qr[:] = x - self.assertEqual(qr.item(), 15) - - dtype_msg = str(dtype) + ", " - self.assertEqual(' '.join(str(qr).split()), - "tensor([15.], size=(1,), dtype=" + dtype_msg + - "quantization_scheme=torch.per_tensor_affine, " + - "scale=1.0, zero_point=2)") + devices = ["cpu", "cuda"] if torch.cuda.is_available() 
else ["cpu"] + for device in devices: + r = torch.ones(1, dtype=torch.float).to(device=device) + for dtype in [torch.qint8, torch.quint8, torch.qint32]: + qr = torch.quantize_per_tensor(r, scale, zero_point, dtype=dtype) + self.assertEqual(qr.item(), 1) + self.assertEqual(qr[0].item(), 1) + # assignment + self.assertTrue(qr[0].is_quantized) + qr[0] = torch.Tensor([11.3]).to(device=device) # float assignment + self.assertEqual(qr.item(), 11) + x = torch.ones(1, dtype=torch.float).to(device=device) * 15.3 + # Copying from a float Tensor + qr[:] = x + self.assertEqual(qr.item(), 15) + + dtype_msg = str(dtype) + ", " + if device == "cuda": + self.assertEqual(' '.join(str(qr).split()), + "tensor([15.], device='" + str(qr.device) + "', size=(1,), dtype=" + dtype_msg + + "quantization_scheme=torch.per_tensor_affine, " + + "scale=1.0, zero_point=2)") + else: + self.assertEqual(' '.join(str(qr).split()), + "tensor([15.], size=(1,), dtype=" + dtype_msg + + "quantization_scheme=torch.per_tensor_affine, " + + "scale=1.0, zero_point=2)") def test_qtensor_quant_dequant(self): scale = 0.02 @@ -490,7 +548,7 @@ def test_per_channel_to_device(self): self.assertEqual('cpu', dqr_cuda.q_per_channel_scales().device.type) self.assertEqual('cpu', dqr_cuda.q_per_channel_zero_points().device.type) - @unittest.skipIf(not torch.cuda.is_available() or TEST_WITH_ROCM, 'CUDA is not available') + @unittest.skipIf(not torch.cuda.is_available(), 'CUDA is not available') def test_compare_per_tensor_device_numerics(self): dtypes = [ torch.quint8, @@ -511,7 +569,7 @@ def test_compare_per_tensor_device_numerics(self): self.assertEqual(qtr.int_repr(), qtr_cuda.int_repr()) self.assertTrue(np.allclose(dqtr, dqtr_cuda.cpu())) - @unittest.skipIf(not torch.cuda.is_available() or TEST_WITH_ROCM, 'CUDA is not available') + @unittest.skipIf(not torch.cuda.is_available(), 'CUDA is not available') def test_compare_per_channel_device_numerics(self): dtype_and_zero_types = [ (torch.quint8, torch.float), @@ -665,7 +723,7 @@ def _quantize_per_channel_sub_byte_ref(data, scales, zero_points, axis, bit_widt data = data.view(-1, dims[axis], np.prod(dims[axis + 1:])) qtensor_size = math.ceil(data.numel() / 2) res = torch.empty(qtensor_size, dtype=torch.uint8) - elem_per_byte = 8 / bit_width + elem_per_byte = 8 // bit_width quant_min, quant_max = _get_qranges(bit_width) for i in range(data.size()[0]): for j in range(data.size()[1]): @@ -1101,7 +1159,7 @@ def test_choose_qparams(self, X, reduce_range): np.testing.assert_array_almost_equal(X_scale, qparams[0], decimal=3) self.assertEqual(X_zp, qparams[1]) - @unittest.skipIf(not torch.cuda.is_available() or TEST_WITH_ROCM, 'CUDA is not available') + @unittest.skipIf(not torch.cuda.is_available(), 'CUDA is not available') def test_cuda_quantization_does_not_pin_memory(self): # Context - https://github.com/pytorch/pytorch/issues/41115 x = torch.randn(3) @@ -1114,7 +1172,7 @@ def test_cuda_quantization_does_not_pin_memory(self): self.assertEqual(x.is_pinned(), False) # There's no way to actually pin the memory of a quantized tensor - @unittest.skipIf(not torch.cuda.is_available() or TEST_WITH_ROCM, 'CUDA is not available') + @unittest.skipIf(not torch.cuda.is_available(), 'CUDA is not available') def test_quant_pin_memory(self): x = torch.randn(3).pin_memory() self.assertEqual(x.is_pinned(), True) diff --git a/test/quantization/core/test_workflow_module.py b/test/quantization/core/test_workflow_module.py index 5415e2b03dcb..f299026b3192 100644 --- a/test/quantization/core/test_workflow_module.py +++ 
b/test/quantization/core/test_workflow_module.py @@ -68,50 +68,70 @@ tolerance = 1e-6 class TestObserver(QuantizationTestCase): - @given(qdtype=st.sampled_from((torch.qint8, torch.quint8)), + @given(qdtype=st.sampled_from((torch.qint8, torch.quint8, torch.qint32)), qscheme=st.sampled_from((torch.per_tensor_affine, torch.per_tensor_symmetric)), reduce_range=st.booleans()) def test_per_tensor_observers(self, qdtype, qscheme, reduce_range): # reduce_range cannot be true for symmetric quantization with uint8 - if qdtype == torch.quint8 and qscheme == torch.per_tensor_symmetric: + if (qdtype == torch.quint8 and qscheme == torch.per_tensor_symmetric) or qdtype == torch.qint32: reduce_range = False ObserverList = [MinMaxObserver(dtype=qdtype, qscheme=qscheme, reduce_range=reduce_range), MovingAverageMinMaxObserver(averaging_constant=0.5, dtype=qdtype, qscheme=qscheme, reduce_range=reduce_range)] + + def _get_ref_params(reduce_range, qscheme, dtype, input_scale, min_val, max_val): + eps = torch.tensor([tolerance]) + if dtype == torch.qint8: + if reduce_range: + quant_min, quant_max = -64, 63 + else: + quant_min, quant_max = -128, 127 + elif dtype == torch.quint8: + if reduce_range: + quant_min, quant_max = 0, 127 + else: + quant_min, quant_max = 0, 255 + elif dtype == torch.qint32: + quant_min, quant_max = -1 * (2 ** 31), (2 ** 31) - 1 + + min_val_neg = torch.tensor([0.]) + max_val_pos = torch.tensor([input_scale * max_val]) if qdtype is torch.qint32 else torch.tensor([max_val]) + + scale, zero_point = 1.0, 0 + if qscheme == torch.per_tensor_symmetric or qscheme == torch.per_channel_symmetric: + scale = torch.max(-min_val_neg, max_val_pos) / (float(quant_max - quant_min) / 2) + scale = torch.max(scale, eps) + if dtype == torch.quint8: + zero_point = 128 + else: + scale = torch.max((max_val_pos - min_val_neg) / float(quant_max - quant_min), eps) + zero_point = quant_min - torch.round(min_val_neg / scale).to(torch.int) + zero_point = torch.clamp(zero_point, quant_min, quant_max) + return scale, zero_point + for myobs in ObserverList: # Calculate Qparams should return with a warning for observers with no data qparams = myobs.calculate_qparams() + input_scale = 2**16 if qdtype is torch.qint32 else 1 if type(myobs) == MinMaxObserver: - x = torch.tensor([1.0, 2.0, 2.0, 3.0, 4.0, 5.0, 6.0]) - y = torch.tensor([4.0, 5.0, 5.0, 6.0, 7.0, 8.0]) + x = torch.tensor([1.0, 2.0, 2.0, 3.0, 4.0, 5.0, 6.0]) * input_scale + y = torch.tensor([4.0, 5.0, 5.0, 6.0, 7.0, 8.0]) * input_scale else: # Moving average of min/max for x and y matches that of # extreme values for x/y used for minmax observer - x = torch.tensor([0.0, 2.0, 2.0, 3.0, 4.0, 5.0, 6.0]) - y = torch.tensor([2.0, 5.0, 5.0, 6.0, 7.0, 10.0]) + x = torch.tensor([0.0, 2.0, 2.0, 3.0, 4.0, 5.0, 6.0]) * input_scale + y = torch.tensor([2.0, 5.0, 5.0, 6.0, 7.0, 10.0]) * input_scale result = myobs(x) result = myobs(y) self.assertEqual(result, y) - self.assertEqual(myobs.min_val, 1.0) - self.assertEqual(myobs.max_val, 8.0) + self.assertEqual(myobs.min_val, 1.0 * input_scale) + self.assertEqual(myobs.max_val, 8.0 * input_scale) qparams = myobs.calculate_qparams() - if reduce_range: - if qscheme == torch.per_tensor_symmetric: - ref_scale = 0.062745 * 255 / 127 - ref_zero_point = 0 if qdtype is torch.qint8 else 128 - else: - ref_scale = 0.0313725 * 255 / 127 - ref_zero_point = -64 if qdtype is torch.qint8 else 0 - else: - if qscheme == torch.per_tensor_symmetric: - ref_scale = 0.062745 - ref_zero_point = 0 if qdtype is torch.qint8 else 128 - else: - ref_scale = 
0.0313725 - ref_zero_point = -128 if qdtype is torch.qint8 else 0 + ref_scale, ref_zero_point = _get_ref_params(reduce_range, qscheme, qdtype, input_scale, 1.0, 8.0) + self.assertEqual(qparams[1].item(), ref_zero_point) self.assertEqual(qparams[0].item(), ref_scale, atol=1e-5, rtol=0) state_dict = myobs.state_dict() @@ -380,7 +400,7 @@ def test_zero_numel(self): x = obs(x) def _test_memoryless(self, obs_class): - obs = obs_class(memoryless=True) + obs = obs_class(averaging_constant=1) x = torch.randn((3, 3)) obs(x) params = obs.calculate_qparams() @@ -391,10 +411,10 @@ def _test_memoryless(self, obs_class): self.assertEqual(params, obs.calculate_qparams()) def test_memoryless_minmaxobserver(self): - self._test_memoryless(MinMaxObserver) + self._test_memoryless(MovingAverageMinMaxObserver) def test_memoryless_perchannelminmaxobserver(self): - self._test_memoryless(PerChannelMinMaxObserver) + self._test_memoryless(MovingAveragePerChannelMinMaxObserver) # HistogramObserver that works like it does on master class _ReferenceHistogramObserver(HistogramObserver): @@ -555,10 +575,9 @@ def test_record_observer(self): self.assertEqual(observer_dict['fc1.module.activation_post_process'].get_tensor_value()[0], model(self.calib_data[0][0])) - @given(qdtype=st.sampled_from((torch.qint8, torch.quint8)), - qscheme=st.sampled_from((torch.per_tensor_affine, torch.per_tensor_symmetric))) - def test_observer_scriptable(self, qdtype, qscheme): - obs = RecordingObserver(dtype=qdtype, qscheme=qscheme) + @given(qdtype=st.sampled_from((torch.qint8, torch.quint8))) + def test_observer_scriptable(self, qdtype): + obs = RecordingObserver(dtype=qdtype) scripted = torch.jit.script(obs) x = torch.rand(3, 4) @@ -738,6 +757,17 @@ def test_fq_serializable_per_channel(self): for key in state_dict: self.assertEqual(state_dict[key], loaded_dict[key]) + def test_quant_min_max_override(self): + observer = default_per_channel_weight_observer + # test no override + fq_module = FakeQuantize(observer) + self.assertEqual(fq_module.activation_post_process.quant_min, -128) + self.assertEqual(fq_module.activation_post_process.quant_max, 127) + # test quant_min/quant_max override + fq_module = FakeQuantize(observer, quant_min=0, quant_max=127) + self.assertEqual(fq_module.activation_post_process.quant_min, 0) + self.assertEqual(fq_module.activation_post_process.quant_max, 127) + def _get_buffer_ids(module): """ Object addresses stay constant if and only if all modifications are in-place @@ -1124,9 +1154,8 @@ def test_fused_mod_per_channel(self): def test_fused_mod_reduce_range(self): obs = FusedMovingAvgObsFakeQuantize(quant_min=0, quant_max=255, dtype=torch.quint8, reduce_range=True) - - self.assertEqual(obs.quant_min, 0) - self.assertEqual(obs.quant_max, 127) + self.assertEqual(obs.activation_post_process.quant_min, 0) + self.assertEqual(obs.activation_post_process.quant_max, 127) def test_embedding_bag_qat_config(self): class Model(nn.Module): @@ -1241,16 +1270,19 @@ def forward(self, x): self.assertEqual(count_fake_quant, 3) if qengine == "fbgemm": - self.assertEqual(ref_model.quant.activation_post_process.quant_min, 0) - self.assertEqual(ref_model.quant.activation_post_process.quant_max, 127) - self.assertEqual(type(ref_model.module.linear.weight_fake_quant.activation_post_process), - MovingAveragePerChannelMinMaxObserver) - else: - self.assertEqual(ref_model.quant.activation_post_process.quant_min, 0) - self.assertEqual(ref_model.quant.activation_post_process.quant_max, 255) - 
self.assertEqual(type(ref_model.module.linear.weight_fake_quant.activation_post_process), - MovingAverageMinMaxObserver) + lower_bnd = 0 + upper_bnd = 127 + obs2match = MovingAveragePerChannelMinMaxObserver + else: + lower_bnd = 0 + upper_bnd = 255 + obs2match = MovingAverageMinMaxObserver + + self.assertEqual(ref_model.quant.activation_post_process.activation_post_process.quant_min, lower_bnd) + self.assertEqual(ref_model.quant.activation_post_process.activation_post_process.quant_max, upper_bnd) + self.assertEqual(type(ref_model.module.linear.weight_fake_quant.activation_post_process), + obs2match) if __name__ == '__main__': raise RuntimeError("This test file is not meant to be run directly, use:\n\n" diff --git a/test/quantization/core/test_workflow_ops.py b/test/quantization/core/test_workflow_ops.py index 8b4baf10d45d..4972a0324e2c 100644 --- a/test/quantization/core/test_workflow_ops.py +++ b/test/quantization/core/test_workflow_ops.py @@ -7,7 +7,7 @@ FakeQuantize, MovingAverageMinMaxObserver, default_observer, - default_affine_fixed_qparams_fake_quant, + default_fixed_qparams_range_0to1_fake_quant, ) from torch.ao.quantization._learnable_fake_quantize import _LearnableFakeQuantize @@ -544,7 +544,7 @@ def test_fq_module_per_tensor(self, device, X): def test_fixed_qparams_fq_module(self, device, X): X, (scale, zero_point, torch_type) = X X = to_tensor(X, device) - fq_module = default_affine_fixed_qparams_fake_quant() + fq_module = default_fixed_qparams_range_0to1_fake_quant() fq_module.to(device) fixed_scale = fq_module.scale.clone() fixed_zero_point = fq_module.zero_point.clone() diff --git a/test/quantization/dbr/test_quantize_dbr.py b/test/quantization/dbr/test_quantize_dbr.py index 30d3b64bdeb0..cd6dd6968ad0 100644 --- a/test/quantization/dbr/test_quantize_dbr.py +++ b/test/quantization/dbr/test_quantize_dbr.py @@ -22,6 +22,8 @@ from torch.quantization import ( ObserverBase, FakeQuantizeBase, + QConfig, + MinMaxObserver, ) from torch.quantization.quantize_fx import ( prepare_fx, @@ -33,6 +35,9 @@ import torch.ao.ns._numeric_suite_dbr as ns # TODO(future PR): move these utils out of the FX folder import torch.ao.ns._numeric_suite_fx as ns_fx +from torch.ao.quantization._dbr.torchscript_utils import ( + remove_redundant_aliases, +) def _allclose(a, b): if isinstance(a, tuple): @@ -248,9 +253,9 @@ def forward(self, x): x = torch.cat([x, x], dim=1) return x - m = M().eval() qconfig = torch.quantization.default_qconfig for dtype in (torch.int32, torch.int64): + m = M().eval() self._test_auto_tracing( m, qconfig, (torch.zeros(1, 1, 1, 1, dtype=dtype),), # FX graph mode quant does not support this yet @@ -422,6 +427,10 @@ def test_fusion_called_multiple_times(self): """ Tests that fusion works if the modules to fuse get called multiple times in the same forward. + + Currently, observers are not shared between successive calls of + the same module. 
+ TODO(future PR): make them shared (this is easy to detect) """ class M(torch.nn.Module): def __init__(self): @@ -437,7 +446,10 @@ def forward(self, x): m = M().eval() qconfig = torch.quantization.default_qconfig - self._test_auto_tracing(m, qconfig, (torch.randn(1, 1, 2, 2),)) + # fx graph mode quant doesn't support using a single module multiple times + # right now, so this would crash, we can handle this case later + # if it is needed + self._test_auto_tracing(m, qconfig, (torch.randn(1, 1, 2, 2),), do_fx_comparison=False) def test_fusion_functions(self): class M(torch.nn.Module): @@ -553,6 +565,7 @@ def forward(self, x): # test backprop does not crash inputs = torch.randn(1, 1, 1, 1) inputs.requires_grad = True + m = M(torch.randn(1, 1, 1, 1), torch.randn(1)).eval() mp = _quantize_dbr.prepare(m, {'': qconfig}, (inputs,)) output = mp(inputs) labels = torch.randn(1, 1, 1, 1) @@ -853,9 +866,10 @@ def forward(self, x): qconfig = torch.quantization.default_qconfig self._test_auto_tracing(model_fp32, qconfig, (torch.randn(1, 1, 2, 2),)) - @unittest.skip('this depends on unsupported syntax detection, currently disabled') def test_vovnet_sequential(self): - + # We cannot quantize SequentialAppendList directly because + # AutoQuantizationStateModuleDict would appear in self.items. + # However, we can wrap it and quantize the wrapper. class SequentialAppendList(nn.Sequential): def __init__(self, *args): super(SequentialAppendList, self).__init__(*args) @@ -870,7 +884,16 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: x = torch.cat(concat_list, dim=1) return x - m = SequentialAppendList(torch.nn.Conv2d(1, 1, 1)).eval() + class Wrapper(nn.Module): + def __init__(self, *args): + super().__init__() + self.append_list = SequentialAppendList(*args) + + def forward(self, x): + x = self.append_list(x) + return x + + m = Wrapper(torch.nn.Conv2d(1, 1, 1)).eval() qconfig = torch.quantization.default_qconfig self._test_auto_tracing(m, qconfig, (torch.randn(1, 1, 1, 1),)) @@ -922,10 +945,11 @@ def forward(self, x): model_fp32, qconfig, (torch.randn(1, 1, 2, 2),), fuse_modules=False) - # this is broken because AutoQuantizationState appears in self.items - @unittest.skip('TODO fix this') def test_module_calls_items(self): - class M(torch.nn.ModuleDict): + # We cannot quantize M1 directly because + # AutoQuantizationStateModuleDict would appear in self.items. + # However, we can wrap it and quantize the wrapper. 
+ class M1(torch.nn.ModuleDict): def __init__(self): super().__init__() for i in range(2): @@ -938,10 +962,22 @@ def forward(self, x): layers.append(layer(x)) return torch.cat(layers, dim=1) - model_fp32 = M().eval() + class M2(torch.nn.Module): + def __init__(self): + super().__init__() + self.m1 = M1() + + def forward(self, x): + x = self.m1(x) + return x + + model_fp32 = M2().eval() qconfig = torch.quantization.default_qconfig self._test_auto_tracing( - model_fp32, qconfig, (torch.randn(1, 1, 2, 2),)) + model_fp32, qconfig, (torch.randn(1, 1, 2, 2),), + # TODO(future PR): implement observer sharing for torch.cat + # in DBR quant, to ensure that numerical behavior matches + do_fx_comparison=False) def test_subclass_of_quantizeable_module(self): """ @@ -1280,6 +1316,52 @@ def forward(self, x): input_shape = (1, 1, 1, 1) self._test_serialization(M, input_shape) + def test_jit_tracing_removes_aliases(self): + m = nn.Sequential( + nn.Conv2d(1, 1, 1), + nn.Sequential( + nn.Conv2d(1, 1, 1), + ), + ) + qconfig_dict = {'': torch.quantization.default_qconfig} + example_args = (torch.randn(1, 1, 1, 1),) + mp = _quantize_dbr.prepare(m, qconfig_dict, example_args) + mq = _quantize_dbr.convert(mp) + mqs = torch.jit.trace(mq, example_args) + FileCheck().check_count("aten::alias", 5, exactly=True).run( + mqs.inlined_graph) + res1 = mqs(*example_args) + mqs = remove_redundant_aliases(mqs) + res2 = mqs(*example_args) + self.assertTrue(torch.allclose(res1, res2)) + # TODO(future PR): figure out why aliasing still appears in the inlined + # graph, and if that is fixed then just check the inlined graph. + for graph in ( + mqs.graph, + getattr(mqs, '1').graph, + getattr(getattr(mqs, '1'), '0').graph, + ): + FileCheck().check_count("aten::alias", 0, exactly=True).run(graph) + + def test_conv_int32_reference_model(self): + m = nn.Sequential(nn.Conv2d(1, 1, 1)).eval() + int32_obs_ctr = MinMaxObserver.with_args(dtype=torch.qint32) + int32_qconfig = QConfig(weight=int32_obs_ctr, activation=int32_obs_ctr) + qconfig_dict = {'': int32_qconfig} + mp = _quantize_dbr.prepare(m, qconfig_dict, (torch.randn(1, 1, 1, 1),)) + mp(torch.randn(1, 1, 1, 1)) + mq = _quantize_dbr.convert(mp) + res = mq(torch.randn(1, 1, 1, 1)) + mqt = torch.jit.trace(mq, (torch.randn(1, 1, 1, 1),)) + # verify the right ops are present: + # x0 -> quant -> (dequant -> conv_ref -> quant) -> dequant -> x1 + FileCheck()\ + .check_count("aten::quantize_per_tensor", 2, exactly=True)\ + .run(mqt.graph) + FileCheck()\ + .check_count("aten::dequantize", 2, exactly=True)\ + .run(mqt.graph) + @skipIfNoFBGEMM class TestQuantizeDBRMultipleOps(QuantizeDBRTestCase): """ @@ -1520,3 +1602,18 @@ def test_mobilenet_v2(self): m, qconfig, (torch.randn(1, 3, 224, 224),), # TODO fix this (reason TBD) do_torchscript_checks=False) + + @skip_if_no_torchvision + def test_mobilenet_v2_removes_aliases(self): + import torchvision + m = torchvision.models.__dict__['mobilenet_v2'](pretrained=False)\ + .eval().float() + qconfig_dict = {'': torch.quantization.default_qconfig} + example_args = (torch.randn(1, 3, 224, 224),) + mp = _quantize_dbr.prepare(m, qconfig_dict, example_args) + mq = _quantize_dbr.convert(mp) + mqs = torch.jit.trace(mq, example_args) + res1 = mqs(*example_args) + mqs = remove_redundant_aliases(mqs) + res2 = mqs(*example_args) + self.assertTrue(torch.allclose(res1, res2)) diff --git a/test/quantization/eager/test_numeric_suite_eager.py b/test/quantization/eager/test_numeric_suite_eager.py index 3bf969395c51..3714a1f28c67 100644 --- 
a/test/quantization/eager/test_numeric_suite_eager.py +++ b/test/quantization/eager/test_numeric_suite_eager.py @@ -19,6 +19,8 @@ compare_model_outputs, compare_model_stub, compare_weights, + prepare_model_outputs, + get_matching_activations, ) from torch.testing._internal.common_quantization import ( AnnotatedConvBnReLUModel, @@ -30,6 +32,7 @@ QuantizationTestCase, SingleLayerLinearDynamicModel, test_only_eval_fn, + skip_if_no_torchvision, ) from torch.testing._internal.common_quantized import override_qengines @@ -421,14 +424,12 @@ def test_compare_model_outputs_functional_static(self): q_model(self.img_data_2d[0][0]) q_model = convert(q_model) act_compare_dict = compare_model_outputs(model, q_model, self.img_data_2d[0][0]) - self.assertEqual(len(act_compare_dict), 7) + self.assertEqual(len(act_compare_dict), 5) expected_act_compare_dict_keys = { "mycat.stats", "myadd.stats", "mymul.stats", "myadd_relu.stats", - "my_scalar_add.stats", - "my_scalar_mul.stats", "quant.stats", } self.assertTrue(act_compare_dict.keys() == expected_act_compare_dict_keys) @@ -534,3 +535,50 @@ def test_shadow_logger(self): self.assertEqual(len(logger.stats["float"]), 2) self.assertEqual(len(logger.stats["quantized"]), 2) + + @skip_if_no_torchvision + def _test_vision_model(self, float_model): + float_model.to('cpu') + float_model.eval() + float_model.fuse_model() + float_model.qconfig = torch.quantization.default_qconfig + img_data = [(torch.rand(2, 3, 224, 224, dtype=torch.float), torch.randint(0, 1, (2,), dtype=torch.long)) for _ in range(2)] + qmodel = quantize(float_model, torch.quantization.default_eval_fn, [img_data], inplace=False) + + wt_compare_dict = compare_weights(float_model.state_dict(), qmodel.state_dict()) + + def compute_error(x, y): + Ps = torch.norm(x) + Pn = torch.norm(x - y) + return 20 * torch.log10(Ps / Pn) + + data = img_data[0][0] + # Take in floating point and quantized model as well as input data, and returns a dict, with keys + # corresponding to the quantized module names and each entry being a dictionary with two keys 'float' and + # 'quantized', containing the activations of floating point and quantized model at matching locations. 
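(Editor's aside, not part of the patch.) The compute_error helper above is the signal-to-quantization-noise ratio (SQNR) expressed in decibels, 20 * log10(||x|| / ||x - y||); larger values mean the quantized activations track the float ones more closely. A tiny standalone illustration, with an arbitrary scale and zero point:

import torch

x = torch.randn(1000)
xq = torch.quantize_per_tensor(x, scale=0.05, zero_point=0, dtype=torch.qint8)

# SQNR in dB: 20 * log10(||signal|| / ||signal - reconstruction||)
sqnr = 20 * torch.log10(torch.norm(x) / torch.norm(x - xq.dequantize()))
print(f"SQNR: {sqnr.item():.1f} dB")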
+ act_compare_dict = compare_model_outputs(float_model, qmodel, data) + + + for key in act_compare_dict: + compute_error(act_compare_dict[key]['float'][0], act_compare_dict[key]['quantized'][0].dequantize()) + + prepare_model_outputs(float_model, qmodel) + + for data in img_data: + float_model(data[0]) + qmodel(data[0]) + + # Find the matching activation between floating point and quantized modules, and return a dict with key + # corresponding to quantized module names and each entry being a dictionary with two keys 'float' + # and 'quantized', containing the matching floating point and quantized activations logged by the logger + act_compare_dict = get_matching_activations(float_model, qmodel) + + @skip_if_no_torchvision + def test_mobilenet_v2(self): + from torchvision.models.quantization import mobilenet_v2 + self._test_vision_model(mobilenet_v2(pretrained=True, quantize=False)) + + @skip_if_no_torchvision + def test_mobilenet_v3(self): + from torchvision.models.quantization import mobilenet_v3_large + self._test_vision_model(mobilenet_v3_large(pretrained=True, quantize=False)) diff --git a/test/quantization/eager/test_quantize_eager_ptq.py b/test/quantization/eager/test_quantize_eager_ptq.py index 6587740bdf9e..d06575c51bf2 100644 --- a/test/quantization/eager/test_quantize_eager_ptq.py +++ b/test/quantization/eager/test_quantize_eager_ptq.py @@ -62,6 +62,8 @@ override_qengines, ) from torch.testing._internal.jit_utils import JitTestCase +from torch.testing._internal.common_utils import skipIfNoCaffe2 + from hypothesis import given from hypothesis import strategies as st import torch.testing._internal.hypothesis_utils as hu @@ -74,6 +76,202 @@ import numpy as np class TestQuantizeEagerOps(QuantizationTestCase): + @override_qengines + def _test_reference_module_impl(self, + float_module_class, + quantized_module_class, + extra_module_kwargs, + input_size): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = float_module_class(**extra_module_kwargs) + self.quant = QuantStub() + self.dequant = DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.conv(x) + x = self.dequant(x) + return x + + class RefM(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = float_module_class(**extra_module_kwargs) + self.quant1 = QuantStub() + self.dequant1 = DeQuantStub() + self.quant2 = QuantStub() + self.dequant2 = DeQuantStub() + + def forward(self, x): + x = self.quant1(x) + x = self.dequant1(x) + x = self.conv(x) + x = self.quant2(x) + x = self.dequant2(x) + return x + + qengine = torch.backends.quantized.engine + if qengine not in supported_qengines or qengine == 'qnnpack': + return # qnnpack does not support nnq.ConvTranspose3d + + data = torch.randn(*input_size, dtype=torch.float) + original_m = M() + original_ref_m = RefM() + + original_ref_m.conv.weight = torch.nn.Parameter(original_m.conv.weight.detach()) + original_ref_m.conv.bias = torch.nn.Parameter(original_m.conv.bias.detach()) + + original_m.qconfig = torch.quantization.default_qconfig + + m = prepare(original_m) + # calibration + m(data) + m = convert(m) + # check if the module is properly quantized + self.assertEqual(type(m.quant), nnq.Quantize) + self.assertEqual(type(m.conv), quantized_module_class) + self.assertEqual(type(m.dequant), nnq.DeQuantize) + res = m(data) + + # quantize the reference model + original_ref_m.eval() + original_ref_m.qconfig = torch.quantization.default_qconfig + + ref_m = prepare(original_ref_m) + ref_m(data) + ref_m = convert(ref_m, 
is_reference=True) + ref_res = ref_m(data) + self.assertEqual(res, ref_res) + + def test_conv_1d(self): + self._test_reference_module_impl( + nn.Conv1d, + nnq.Conv1d, + {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, + (16, 1, 1) + ) + + def test_conv_2d(self): + self._test_reference_module_impl( + nn.Conv2d, + nnq.Conv2d, + {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, + (16, 1, 10, 10) + ) + + def test_conv_3d(self): + self._test_reference_module_impl( + nn.Conv3d, + nnq.Conv3d, + {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, + (16, 1, 10, 10, 10) + ) + + def test_conv_transpose_1d(self): + self._test_reference_module_impl( + nn.ConvTranspose1d, + nnq.ConvTranspose1d, + {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, + (16, 1, 1) + ) + + def test_conv_transpose_2d(self): + self._test_reference_module_impl( + nn.ConvTranspose2d, + nnq.ConvTranspose2d, + {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, + (16, 1, 10, 10) + ) + + def test_conv_transpose_3d(self): + self._test_reference_module_impl( + nn.ConvTranspose3d, + nnq.ConvTranspose3d, + {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, + (16, 1, 10, 10, 10) + ) + + def test_linear(self): + self._test_reference_module_impl( + nn.Linear, + nnq.Linear, + {'in_features': 5, 'out_features': 10}, + (16, 5) + ) + + @override_qengines + def test_int16_reference_module(self): + + class RefM(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.ConvTranspose2d(1, 1, 1) + self.quant1 = QuantStub() + self.dequant1 = DeQuantStub() + self.quant2 = QuantStub() + self.dequant2 = DeQuantStub() + + def forward(self, x): + x = self.quant1(x) + x = self.dequant1(x) + x = self.conv(x) + x = self.quant2(x) + x = self.dequant2(x) + return x + + + input_size = (16, 1, 10, 10) + data = torch.randn(*input_size, dtype=torch.float) + + original_ref_m = RefM() + rand_w = torch.randn_like(original_ref_m.conv.weight) + rand_b = torch.randn_like(original_ref_m.conv.bias) + original_ref_m.conv.weight = torch.nn.Parameter(rand_w, requires_grad=False) + original_ref_m.conv.bias = torch.nn.Parameter(rand_b, requires_grad=False) + + qengine = torch.backends.quantized.engine + if qengine not in supported_qengines: + return + from torch.ao.quantization.observer import MovingAverageMinMaxObserver + + weight_obs = MovingAverageMinMaxObserver.with_args( + dtype=torch.qint32, + # set qmin and qmax to represent qint16 + quant_min=-1 * (2 ** 15), + quant_max=(2 ** 15) - 1, + qscheme=torch.per_tensor_symmetric, + ) + act_obs = MovingAverageMinMaxObserver.with_args( + dtype=torch.qint32, + quant_min=-1 * (2 ** 15), + quant_max=(2 ** 15) - 1, + ) + custom_qconfig = QConfig(activation=act_obs, weight=weight_obs) + + # quantize the reference model + original_ref_m.eval() + original_ref_m.qconfig = custom_qconfig + + ref_m = prepare(original_ref_m) + # calibration + ref_m(torch.randn(*input_size, dtype=torch.float)) + + ref_m = convert(ref_m, is_reference=True) + + myobs = MovingAverageMinMaxObserver(averaging_constant=0.5, + dtype=torch.qint32, + # set qmin and qmax to represent qint16 + quant_min=-1 * (2 ** 15), + quant_max=(2 ** 15) - 1, + qscheme=torch.per_tensor_symmetric, + ) + result = myobs(rand_w) + qparams = myobs.calculate_qparams() + self.assertEqual(ref_m.conv.weight_scale, qparams[0]) + + def _test_activation_op_impl( self, float_module_class, quantized_module_class, extra_module_kwargs): """ Implementation for testing common activation ops like leaky relu @@ -815,6 +1013,19 @@ def 
test_convtranspose_per_channel_qconfig_none(self): m[0].qconfig = None mp = torch.ao.quantization.prepare(m) + @skipIfNoFBGEMM + def test_quantwrapper_attaches_qconfig_to_dequant(self): + qconfig = torch.ao.quantization.default_qconfig + + m = nn.Sequential(nn.Conv2d(1, 1, 1)).eval() + for i in range(len(m)): + m[i].qconfig = qconfig + m[i] = torch.ao.quantization.QuantWrapper(m[i]) + + mp = torch.ao.quantization.prepare(m) + mq = torch.ao.quantization.convert(mp) + self.assertTrue(isinstance(mq[0].dequant, nnq.DeQuantize)) + @skipIfNoFBGEMM class TestQuantizeEagerPTQDynamic(QuantizationTestCase): @@ -1250,10 +1461,12 @@ def export_to_onnx(model, input, input_names): model = torch.jit.load(buf) f = io.BytesIO() torch.onnx.export(model, input, f, input_names=input_names, - operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK) + operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, + opset_version=9) onnx_model = export_to_onnx(model, data, input_names) @skipIfNoFBGEMM + @skipIfNoCaffe2 def test_lower_graph_linear(self): model = torch.ao.quantization.QuantWrapper(torch.nn.Linear(5, 10, bias=True)).to(dtype=torch.float) data_numpy = np.random.rand(1, 2, 5).astype(np.float32) @@ -1261,6 +1474,7 @@ def test_lower_graph_linear(self): self._test_lower_graph_impl(model, data) @skipIfNoFBGEMM + @skipIfNoCaffe2 def test_lower_graph_conv2d(self): model = torch.ao.quantization.QuantWrapper(torch.nn.Conv2d(3, 5, 2, bias=True)).to(dtype=torch.float) data_numpy = np.random.rand(1, 3, 6, 6).astype(np.float32) diff --git a/test/quantization/eager/test_quantize_eager_qat.py b/test/quantization/eager/test_quantize_eager_qat.py index efb7882c2dc3..984e87dacbbc 100644 --- a/test/quantization/eager/test_quantize_eager_qat.py +++ b/test/quantization/eager/test_quantize_eager_qat.py @@ -1,5 +1,6 @@ # Owner(s): ["oncall: quantization"] +import copy import math import torch import torch.nn as nn @@ -10,6 +11,7 @@ import torch.nn.quantized as nnq import torch.nn.quantized.dynamic as nnqd import torch.nn.qat as nnqat +import torch.nn.intrinsic.qat as nniqat import torch.nn.qat.dynamic as nnqatd from torch.ao.quantization import ( prepare, @@ -21,6 +23,7 @@ default_qconfig, default_qat_qconfig, default_embedding_qat_qconfig, + default_symmetric_qnnpack_qat_qconfig, get_default_qat_qconfig, FixedQParamsFakeQuantize, FusedMovingAvgObsFakeQuantize, @@ -37,6 +40,7 @@ ManualDropoutQATModel, ManualLinearDynamicQATModel, ManualConvLinearQATModel, + ManualConvLinearSymmQATModel, ManualEmbeddingBagLinear, TwoLayerLinearModel, test_only_eval_fn, @@ -49,6 +53,8 @@ override_qengines, ) +from torch.testing._internal.common_utils import skipIfNoXNNPACK + from hypothesis import given from hypothesis import strategies as st import torch.testing._internal.hypothesis_utils as hu @@ -338,11 +344,45 @@ def checkQuantized(model): model = quantize_qat(model, test_only_train_fn, [self.img_data_2d_train]) checkQuantized(model) + @skipIfNoXNNPACK + def test_conv_linear_symm(self): + r"""Same as test_conv_linear but with Symmetric quantization. 
+ Supported only with qengine=qnnpack, which uses symmetric + kernels from xnnpack library.""" + for qengine in supported_qengines: + if qengine != 'qnnpack': + continue + with override_quantized_engine(qengine): + model = ManualConvLinearSymmQATModel() + + model = prepare_qat(model) + self.checkObservers(model) + + test_only_train_fn(model, self.img_data_2d_train) + model = convert(model) + + def checkQuantized(model): + self.assertEqual(type(model.conv), nnq.Conv2d) + self.assertEqual(type(model.fc1), nnq.Linear) + self.assertEqual(type(model.fc2), nnq.Linear) + test_only_eval_fn(model, self.img_data_2d) + self.checkScriptable(model, self.img_data_2d) + self.checkNoQconfig(model) + + checkQuantized(model) + + model = ManualConvLinearSymmQATModel() + model = quantize_qat(model, test_only_train_fn, [self.img_data_2d_train]) + checkQuantized(model) + def test_dynamic_qat_linear(self): for qengine in supported_qengines: with override_quantized_engine(qengine): # Dynamic QAT without memoryless observers should fail - with self.assertRaisesRegex(ValueError, "Dynamic QAT requires a memoryless observer"): + with self.assertRaisesRegex(ValueError, + "Dynamic QAT requires a memoryless observer." + + "This means a MovingAverage observer with averaging constant equal to 1" + ): model = ManualLinearDynamicQATModel(default_qat_qconfig) model = prepare_qat(model, mapping={torch.nn.Linear: nnqatd.Linear}) @@ -984,6 +1024,66 @@ def test_conv_bn_folded_vs_unfolded( qat_op_optim.step() qat_ref_op_optim.step() + @override_qengines + def test_linear_bn_numerics(self): + qengine = torch.backends.quantized.engine + m_ref = nn.Sequential( + nn.Linear(4, 4), + nn.BatchNorm1d(4), + ) + m_ref_copy = copy.deepcopy(m_ref) + m_ref_copy = torch.ao.quantization.fuse_modules_qat(m_ref_copy, [['0', '1']]) + qconfig = torch.ao.quantization.get_default_qat_qconfig(qengine) + m_ref_copy[0].qconfig = qconfig + m = nniqat.LinearBn1d.from_float(m_ref_copy[0]) + + # without fake_quants, fused QAT module should match fp32 module + m.apply(torch.quantization.disable_fake_quant) + data = torch.randn(4, 4) + r1 = m_ref(data) + r2 = m(data) + self.assertTrue(torch.allclose(r1, r2)) + + @skipIfNoXNNPACK + @override_qengines + def test_linear_bn_symm_numerics(self): + qengine = torch.backends.quantized.engine + if qengine != "qnnpack": + return # Only qnnpack support symmetric quantization + m_ref = nn.Sequential( + nn.Linear(4, 4), + nn.BatchNorm1d(4), + ) + m_ref_copy = copy.deepcopy(m_ref) + m_ref_copy = torch.ao.quantization.fuse_modules_qat(m_ref_copy, [['0', '1']]) + qconfig = default_symmetric_qnnpack_qat_qconfig + m_ref_copy[0].qconfig = qconfig + m = nniqat.LinearBn1d.from_float(m_ref_copy[0]) + + # without fake_quants, fused QAT module should match fp32 module + m.apply(torch.quantization.disable_fake_quant) + data = torch.randn(4, 4) + r1 = m_ref(data) + r2 = m(data) + self.assertTrue(torch.allclose(r1, r2)) + + @override_qengines + def test_linear_bn_workflow(self): + qengine = torch.backends.quantized.engine + m = nn.Sequential( + QuantStub(), + nn.Linear(4, 4), + nn.BatchNorm1d(4), + ) + data = torch.randn(4, 4) + m.qconfig = torch.ao.quantization.get_default_qat_qconfig(qengine) + m = torch.ao.quantization.fuse_modules_qat(m, [['1', '2']]) + mp = prepare_qat(m) + mp(data) + mq = convert(mp) + self.assertTrue(type(mq[1]) == nnq.Linear) + self.assertTrue(type(mq[2]) == nn.Identity) + if __name__ == '__main__': raise RuntimeError("This test file is not meant to be run directly, use:\n\n" "\tpython test/test_quantization.py 
TESTNAME\n\n" diff --git a/test/quantization/fx/test_numeric_suite_fx.py b/test/quantization/fx/test_numeric_suite_fx.py index 145b2af81b37..4559c6389be6 100644 --- a/test/quantization/fx/test_numeric_suite_fx.py +++ b/test/quantization/fx/test_numeric_suite_fx.py @@ -71,6 +71,8 @@ extract_shadow_logger_info, extend_logger_results_with_comparison, ) +from torch.ao.quantization.backend_config import get_native_backend_config_dict +from torch.ao.quantization.fx.backend_config_utils import get_pattern_to_quantize_handlers # Note: these models are not for use outside of this file. While it's good @@ -274,7 +276,19 @@ def _wrapped_sigmoid(x): def _wrapped_linear(x, w, b): return F.linear(x, w, b) - +def get_all_quant_patterns(): + """ we are in the process to migrate the frontend of fx graph mode quant + to use backend_config_dict, so some of the patterns are moved to backend_config_dict + this function will include these patterns so that we can still have all the patterns + """ + # TODO: we can remove this call, and get all patterns from backend_config_dict in + # the future when the frontend refactor is done in fx graph mode quantization + all_quant_patterns = get_default_quant_patterns() + # some of the patterns are moved to (native) backend_config_dict so we need to + # add them back here + for pattern, quantize_handler in get_pattern_to_quantize_handlers(get_native_backend_config_dict()).items(): + all_quant_patterns[pattern] = quantize_handler + return all_quant_patterns class TestFXGraphMatcher(QuantizationTestCase): @@ -463,7 +477,6 @@ def forward(self, x0): self.assert_types_for_matched_subgraph_pairs(results, expected_types, mp, mq) @skipIfNoFBGEMM - @unittest.skip("Broken by https://github.com/pytorch/pytorch/pull/62608, need dtype inference support") def test_nodes_with_equal_types_get_matched(self): class M(nn.Module): def __init__(self): @@ -510,13 +523,12 @@ def forward(self, x): conv_name_0: ((nn.Conv2d, torch.ao.quantization.MinMaxObserver), (nn.Conv2d, nn.Conv2d)), mul_name_0: ((torch.mul, torch.ao.quantization.MinMaxObserver), (toq.mul, toq.mul)), - relu_name_0: ((F.relu, torch.ao.quantization.MinMaxObserver), (F.relu, F.relu)), + relu_name_0: ((F.relu, torch.ao.quantization.FixedQParamsObserver), (F.relu, F.relu)), sigmoid_name_0: - ((torch.sigmoid, torch.sigmoid), (torch.sigmoid, torch.sigmoid)), + ((torch.sigmoid, torch.ao.quantization.FixedQParamsObserver), (torch.sigmoid, torch.sigmoid)), } self.assert_types_for_matched_subgraph_pairs(results, expected_types, mp, mq) - @unittest.skip("Broken by https://github.com/pytorch/pytorch/pull/62608, need dtype inference support") def test_methods(self): """ Verify that graph matching works on methods @@ -537,12 +549,11 @@ def forward(self, x): base_name_to_sets_of_related_ops, torch.sigmoid) + '_0' expected_types = { sigmoid_name_0: - (('sigmoid', 'sigmoid'), ('sigmoid', 'sigmoid')), + (('sigmoid', torch.ao.quantization.FixedQParamsObserver), ('sigmoid', torch.ao.quantization.FixedQParamsObserver)), } self.assert_types_for_matched_subgraph_pairs( results, expected_types, m1p, m2p) - def test_op_relationship_mapping(self): """ Tests that the mapping of op relationships is complete. 
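To make the merge performed by `get_all_quant_patterns` above concrete, here is a small sketch (using the two backend-config helpers imported above plus the `get_default_quant_patterns` already imported in this test file; variable names are illustrative only) of how one could inspect which patterns now come only from the native backend config:

from torch.ao.quantization.backend_config import get_native_backend_config_dict
from torch.ao.quantization.fx.backend_config_utils import get_pattern_to_quantize_handlers

legacy_patterns = get_default_quant_patterns()  # imported at the top of this test file
native_patterns = get_pattern_to_quantize_handlers(get_native_backend_config_dict())

# Patterns served only by backend_config_dict; these are the entries that
# get_all_quant_patterns adds back so the tests still see the full set.
migrated_only = set(native_patterns) - set(legacy_patterns)
print(f"{len(migrated_only)} patterns are provided only by the native backend config")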
@@ -559,6 +570,15 @@ def test_op_relationship_mapping(self): torch.ao.quantization.QuantStub, torch.ao.quantization.DeQuantStub, nnq.FloatFunctional, + # the ConvTranspose3d swap is not implemented in FX Graph + # mode quantization yet + nn.ConvTranspose3d, + # the GroupNorm swap is not implemented in FX Graph + # mode quantization yet + nn.GroupNorm, + # nnq.ReLU6 is no longer swapped, because nn.ReLU6 can + # take quantized inputs + nn.ReLU6, ) if fp32_type in types_to_skip: continue @@ -620,7 +640,7 @@ def _op_is_unmatchable(op): op in METHS_UNMATCHABLE ) - default_quant_patterns = get_default_quant_patterns() + default_quant_patterns = get_all_quant_patterns() for pattern, qhandler_cls in default_quant_patterns.items(): base_op = None if isinstance(pattern, tuple): @@ -664,9 +684,6 @@ def _op_is_unmatchable(op): # RNNDynamicQuantizeHandler pass elif qhandler_cls == qp.DefaultNodeQuantizeHandler: - # torch.sum does not have quantized equivalents - if base_op == torch.sum: - continue self.assertTrue( _op_in_base_sets_of_related_ops(base_op), f"{base_op} not in sets of related ops") @@ -682,8 +699,23 @@ def _op_is_unmatchable(op): _op_in_base_sets_of_related_ops(base_op), f"{base_op} not in sets of related ops") else: - raise AssertionError( - f"handing for {qhandler_cls} not implemented") + # torch.sum does not have quantized equivalents + if base_op in [ + torch.sum, + nn.GRUCell, + nn.GRU, + nn.LSTMCell, + nn.RNNCell, + ]: + continue + if isinstance(base_op, tuple): + # skip fusion patterns + continue + # didn't match explicit quantize handler class, we can check if the + # operator is in the related op set directly + if not (_op_in_base_sets_of_related_ops(base_op) or _op_is_unmatchable(base_op)): + raise AssertionError( + f"handling for {qhandler_cls} for op {base_op} not implemented") @skipIfNoFBGEMM def test_user_defined_function(self): @@ -1106,8 +1138,6 @@ def _test_add_shadow_loggers_mod_impl(self, prepare_fn=prepare_fx): prepare_fn=prepare_fn, qconfig_dict=qconfig_dict) @skipIfNoFBGEMM - @unittest.skip("Broken by https://github.com/pytorch/pytorch/pull/62608, enable after" - "dtype inference is supported") def test_add_shadow_loggers_mod_ptq(self): self._test_add_shadow_loggers_mod_impl(prepare_fn=prepare_fx) @@ -1133,8 +1163,6 @@ def test_add_shadow_loggers_fun_qat(self): self._test_add_shadow_loggers_fun_impl(prepare_fn=prepare_qat_fx) @skipIfNoFBGEMM - @unittest.skip("Broken by https://github.com/pytorch/pytorch/pull/62608, enable after" - "dtype inference is supported") def test_add_shadow_loggers_meth_ptq(self): """ Verify that add_loggers works on methods @@ -1147,34 +1175,10 @@ def forward(self, x): m = M().eval() res = self._test_match_shadow_activations( m, (torch.randn(4, 4),), - results_len=1) - - @skipIfNoFBGEMM - def test_add_shadow_loggers_multiple_dtype_casts(self): - """ - Verifies that for nodes where the first input arg is a list, - such as `cat`, we insert an individual dtype cast for each - arg of the list. 
- """ - class M(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - x = torch.cat([x, x, x], dim=0) - return x - - m = M().eval() - expected_occurrence = { - # 3 dequantize function calls from the 3 dtype casts for [x, x, x] - ns.call_module(torch.nn.Identity): 3, - # 1 dequantize method call for module output - ns.call_method("dequantize"): 1, - } - self._test_match_shadow_activations( - m, (torch.randn(4, 4),), - prepared_expected_node_occurrence=expected_occurrence, - results_len=1, compare_fp32_vs_fp32_prepared=False) + # For now, sigmoid is not supported for shadowing because the dtype + # inference for it is not implemented yet. So, this is just testing + # that shadowing models with method calls does not crash. + results_len=0) @skipIfNoFBGEMM def test_shadow_activations_fqn(self): @@ -1215,7 +1219,7 @@ def forward(self, x): m = M().eval() self._test_match_shadow_activations( m, (torch.randn(1, 1, 4, 4),), - results_len=2, + results_len=1, should_log_inputs=True) @skipIfNoFBGEMM @@ -1301,7 +1305,6 @@ def test_linear_fp16_vs_linear_fp16_shadow_activations(self): @skipIfNoFBGEMM - @unittest.skip("TODO: broken by https://github.com/pytorch/pytorch/pull/61687, will enable later") def test_op_with_either_fp32_or_int8_input(self): """ Verify that shadowing works with ops which accept either fp32 or @@ -1320,7 +1323,9 @@ def forward(self, x): m = M() res = self._test_match_shadow_activations( m, (torch.randn(4, 4),), - results_len=2) + # Note: shadowing relu by itself is currently not supported, + # this test is just testing that it does not crash + results_len=0) def _test_int8_shadows_int8_impl(self, m): """ @@ -1488,6 +1493,15 @@ def test_op_io_dtype_coverage(self): # makes sense nn.Embedding, nn.EmbeddingBag, + # the ConvTranspose3d swap is not implemented in FX Graph + # mode quantization yet + nn.ConvTranspose3d, + # the GroupNorm swap is not implemented in FX Graph + # mode quantization yet + nn.GroupNorm, + # nnq.ReLU6 is no longer swapped, because nn.ReLU6 can + # take quantized inputs + nn.ReLU6, ) if fp32_type in types_to_skip: continue @@ -1534,7 +1548,7 @@ def test_op_io_dtype_coverage(self): # 4. go through the ops mapped to each QuantizeHandler type, and verify # correctness. - default_quant_patterns = get_default_quant_patterns() + default_quant_patterns = get_all_quant_patterns() for pattern, qhandler_cls in default_quant_patterns.items(): base_op = None if isinstance(pattern, tuple): @@ -1585,14 +1599,35 @@ def test_op_io_dtype_coverage(self): self.assertTrue( (base_op in FUNS_IO_TYPE_FP32_OR_INT8) or (base_op in MODS_IO_TYPE_FP32_OR_INT8) or - (base_op in METHS_IO_TYPE_FP32_OR_INT8), + (base_op in METHS_IO_TYPE_FP32_OR_INT8) or + # Softmax has a different signature for the quantized + # version, so it does not fit into the cases above. 
+ (base_op is torch.nn.Softmax), f"missing IO type handling for {base_op}") elif qhandler_cls == qp.EmbeddingQuantizeHandler: # embedding shadowing is not implemented, for now continue else: - raise AssertionError( - f"handing for {qhandler_cls} not implemented") + if ( + base_op in FUNS_UNMATCHABLE or + base_op in MODS_UNMATCHABLE or + base_op in METHS_UNMATCHABLE + ): + continue + if qhandler_cls(None, {}).is_general_tensor_value_op(): + self.assertTrue( + (base_op in FUNS_IO_TYPE_FP32_OR_INT8) or + (base_op in MODS_IO_TYPE_FP32_OR_INT8) or + (base_op in METHS_IO_TYPE_FP32_OR_INT8), + f"missing IO type handling for {base_op} using {qhandler_cls}") + else: + self.assertTrue( + (base_op in FUNS_IO_TYPE_FP32_OR_INT8) or + (base_op in MODS_IO_TYPE_FP32_OR_INT8) or + (base_op in METHS_IO_TYPE_FP32_OR_INT8) or + (base_op in FUNS_IO_TYPE_FP32) or + (base_op in MODS_IO_TYPE_FP32), + f"missing IO type handling for {base_op} using {qhandler_cls}") @skipIfNoFBGEMM def test_user_defined_function(self): @@ -1689,8 +1724,6 @@ def forward(self, x): self.assert_ns_compare_dict_valid(act_compare_dict) @skipIfNoFBGEMM - @unittest.skip("Broken by https://github.com/pytorch/pytorch/pull/62608, enable after" - "dtype inference is supported") def test_layer_names(self): m = nn.Sequential( nn.Conv2d(1, 1, 1), @@ -1822,7 +1855,7 @@ def forward(self, x): mp_shadows_mq(torch.randn(1, 1, 1, 1)) act_compare_dict = extract_shadow_logger_info( mp_shadows_mq, OutputLogger, 'fp32') - self.assertTrue(len(act_compare_dict) == 4) + self.assertTrue(len(act_compare_dict) == 3) self.assert_ns_compare_dict_valid(act_compare_dict) @skipIfNoFBGEMM @@ -1905,6 +1938,58 @@ def test_add_shadow_loggers_cuda(self): extend_logger_results_with_comparison( act_compare_dict, 'a', 'b', compute_sqnr, 'sqnr') + def test_fp16_shadows_fp32(self): + m = LinearReluFunctional().eval() + qconfig_dict = {"": torch.ao.quantization.float16_static_qconfig} + mp = prepare_fx(copy.deepcopy(m), qconfig_dict) + mq = convert_fx(mp, is_reference=True) + mq_shadows_m = add_shadow_loggers('a', mq, 'b', m, OutputLogger) + + def test_mul_add_cat_stack_skips_shadowing(self): + class M(nn.Module): + def forward(self, x): + x = x * x + x = torch.mul(x, x) + x = x + x + x = torch.add(x, x) + x = torch.cat([x]) + x = torch.stack([x]) + return x + + m = M().eval() + self._test_match_shadow_activations( + m, (torch.randn(1, 1, 4, 4),), + results_len=0) + + def test_op_with_only_kwargs_skips_shadowing(self): + class M(nn.Module): + def forward(self, x): + x = torch.cat(tensors=[x]) + x = torch.stack(tensors=[x]) + return x + + m = M().eval() + self._test_match_shadow_activations( + m, (torch.randn(1, 1, 4, 4),), + results_len=0) + + def test_unsupported_op_copy_skips_shadowing(self): + """ + Copying a `call_function` node is not implemented; test that this + does not crash shadowing but instead skips the node.
+ """ + class M(nn.Module): + def forward(self, x): + # the second argument leads to attempting to copy a + # call_function node + x = F.layer_norm(x, x.shape[1:]) + return x + + m = M().eval() + self._test_match_shadow_activations( + m, (torch.randn(1, 1, 4, 4),), + results_len=0) + class TestFXNumericSuiteCoreAPIsModels(FXNumericSuiteQuantizationTestCase): """ @@ -2038,12 +2123,11 @@ def test_sparsenn_shadow(self): x = torch.randn(2, 4) self._test_match_shadow_activations( sparse_nn, (idx, offsets, x), - results_len=4, + results_len=3, should_log_inputs=should_log_inputs) @skip_if_no_torchvision @skipIfNoFBGEMM - @unittest.skip("TODO: broken by https://github.com/pytorch/pytorch/pull/61687, will enable later") def test_resnet18(self): import torchvision m = torchvision.models.quantization.resnet18(pretrained=False, quantize=False).eval() @@ -2055,7 +2139,6 @@ def test_resnet18(self): @skip_if_no_torchvision @skipIfNoFBGEMM - @unittest.skip("TODO: broken by https://github.com/pytorch/pytorch/pull/61687, will enable later") def test_mobilenet_v2(self): import torchvision m = torchvision.models.quantization.mobilenet_v2(pretrained=False, quantize=False).eval() diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py index 20bf20ea4027..27a83c5e7874 100644 --- a/test/quantization/fx/test_quantize_fx.py +++ b/test/quantization/fx/test_quantize_fx.py @@ -11,6 +11,7 @@ import torch.nn.intrinsic.quantized as nniq import torch.nn.intrinsic.quantized.dynamic as nniqd import torch.multiprocessing as mp +from torch.ao.quantization import is_activation_post_process # graph mode quantization based on fx from torch.ao.quantization.quantize_fx import ( @@ -40,6 +41,7 @@ default_qconfig, default_dynamic_qconfig, default_qat_qconfig, + default_reuse_input_qconfig, per_channel_dynamic_qconfig, float16_dynamic_qconfig, float16_static_qconfig, @@ -48,6 +50,7 @@ get_default_qconfig, get_default_qat_qconfig, get_default_qconfig_dict, + get_default_qat_qconfig_dict, fuse_modules, fuse_modules_qat, prepare, @@ -77,14 +80,16 @@ get_default_output_activation_post_process_map ) +from torch.ao.quantization.fx.utils import NodeInfo + from torch.ao.quantization.fake_quantize import ( - default_affine_fixed_qparams_fake_quant, - default_symmetric_fixed_qparams_fake_quant, + default_fixed_qparams_range_0to1_fake_quant, + default_fixed_qparams_range_neg1to1_fake_quant, ) from torch.ao.quantization.observer import ( - default_affine_fixed_qparams_observer, - default_symmetric_fixed_qparams_observer, + default_fixed_qparams_range_0to1_observer, + default_fixed_qparams_range_neg1to1_observer, ) # test utils @@ -130,7 +135,9 @@ import operator import unittest import io -from typing import Callable, Optional +from typing import Callable, Optional, List + + TEST_WITH_ROCM = os.getenv('PYTORCH_TEST_WITH_ROCM', '0') == '1' @@ -475,6 +482,195 @@ def forward(self, x): }) self.checkGraphModuleNodes(m, expected_node=ns.call_module(MyConvReLU)) + def test_fuse_custom_pattern(self): + class M(torch.nn.Module): + def __init__(self, use_torch_add=True): + super().__init__() + self.conv = torch.nn.Conv2d(3, 3, 3) + self.bn = torch.nn.BatchNorm2d(3) + self.relu = torch.nn.ReLU() + self.maxpool = torch.nn.MaxPool2d(3) + if use_torch_add: + self.add = torch.add + else: + self.add = operator.add + + def forward(self, x): + y = x + y = self.maxpool(x) + x = self.conv(x) + x = self.bn(x) + x = self.add(y, x) + x = self.relu(x) + return x + + for use_torch_add in [True, False]: + m = 
M(use_torch_add).eval() + + def fuse_conv_bn_relu(is_qat, relu, add_pattern): + _, _, bn_pattern = add_pattern + bn, conv = bn_pattern + return conv + + conv_bn_res_relu_config1 = { + "pattern": (nn.ReLU, (torch.add, MatchAllNode, (nn.BatchNorm2d, nn.Conv2d))), + "fuser_method": fuse_conv_bn_relu, + } + + conv_bn_res_relu_config2 = { + "pattern": (nn.ReLU, (operator.add, MatchAllNode, (nn.BatchNorm2d, nn.Conv2d))), + "fuser_method": fuse_conv_bn_relu, + } + + backend_config_dict = { + "configs": [conv_bn_res_relu_config1, conv_bn_res_relu_config2] + } + m = fuse_fx(m, backend_config_dict=backend_config_dict) + self.assertEqual(type(m.conv), torch.nn.Conv2d) + # check bn and relu are gone since we replaced the whole pattern with conv + self.assertFalse(hasattr(m, "bn")) + self.assertFalse(hasattr(m, "relu")) + + def test_fusion_pattern_with_multiple_inputs(self): + """ This test tests two keys in backend_config_dict: root_node_getter and + extra_inputs_getter. + root_node_getter is used to identify a "root" module in the node pattern, + the node that we'll keep after fusion. + extra_inputs_getter will return a list of nodes that need to be added to the + fused node as extra inputs. + """ + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(3, 3, 3) + self.bn = torch.nn.BatchNorm2d(3) + self.relu = torch.nn.ReLU() + self.maxpool = torch.nn.MaxPool2d(3) + + def forward(self, x): + y = x + y = self.maxpool(x) + x = self.conv(x) + x = self.bn(x) + x = torch.add(x, y) + x = self.relu(x) + return x + + m = M().eval() + + def fuse_conv_bn_relu(is_qat, relu, add_pattern): + _, bn_pattern, _ = add_pattern + bn, conv = bn_pattern + return conv + + def conv_bn_res_relu_root_node_getter(pattern): + relu, add_pattern = pattern + _, bn_pattern, _ = add_pattern + bn, conv = bn_pattern + return conv + + def conv_bn_res_relu_extra_inputs_getter(pattern): + """ get inputs pattern for extra inputs, inputs for root node + are assumed to be copied over from root node to the fused node + """ + relu, add_pattern = pattern + _, bn_pattern, extra_input = add_pattern + bn, conv = bn_pattern + return [extra_input] + + conv_bn_res_relu_config = { + "pattern": (nn.ReLU, (torch.add, (nn.BatchNorm2d, nn.Conv2d), MatchAllNode)), + "fuser_method": fuse_conv_bn_relu, + "root_node_getter": conv_bn_res_relu_root_node_getter, + "extra_inputs_getter": conv_bn_res_relu_extra_inputs_getter + } + + backend_config_dict = { + "configs": [conv_bn_res_relu_config], + } + m = fuse_fx(m, backend_config_dict=backend_config_dict) + self.assertEqual(type(m.conv), torch.nn.Conv2d) + # check bn and relu are gone since we replaced the whole pattern with conv + self.assertFalse(hasattr(m, "bn")) + self.assertFalse(hasattr(m, "relu")) + + # check conv module has two inputs + named_modules = dict(m.named_modules()) + for node in m.graph.nodes: + if node.op == "call_module" and type(named_modules[node.target]) == torch.nn.Conv2d: + self.assertTrue(len(node.args) == 2, "Expecting the fused op to have two arguments") + + def test_fusion_pattern_with_matchallnode(self): + """This test tests that the node matched by MatchAllNode will be regarded as an input + instead of a module to be fused. For instance, we have two patterns: + (nn.ReLU, (torch.add, MatchAllNode, nn.Conv2d)) + (nn.ReLU, nn.Conv2d) + And we want to fuse the following model + Conv2d -> ReLU + + Conv2d ------ Add -> ReLU + ReLU in the first row is matched as MatchAllNode in the residual pattern. But it won't be + fused as part of that pattern.
It needs to be properly fused with the upstream Conv2d. + """ + + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv1 = torch.nn.Conv2d(3, 3, 3) + self.relu1 = torch.nn.ReLU() + self.conv2 = torch.nn.Conv2d(3, 3, 3) + self.relu2 = torch.nn.ReLU() + + def forward(self, x): + y = self.conv1(x) + y = self.relu1(y) + + x = self.conv2(x) + x = torch.add(x, y) + x = self.relu2(x) + return x + + m = M().eval() + + def fuse_conv_relu(is_qat, relu, conv): + return conv + + def fuse_conv_res_relu(is_qat, relu, add_pattern): + _, conv, _ = add_pattern + return conv + + def conv_res_relu_root_node_getter(pattern): + relu, (_, conv, _) = pattern + return conv + + def conv_res_relu_extra_inputs_getter(pattern): + relu, (_, _, extra_input) = pattern + return [extra_input] + + conv_relu_config = { + "pattern": (nn.ReLU, nn.Conv2d), + "fuser_method": fuse_conv_relu, + } + conv_res_relu_config = { + "pattern": (nn.ReLU, (torch.add, nn.Conv2d, MatchAllNode)), + "fuser_method": fuse_conv_res_relu, + "root_node_getter": conv_res_relu_root_node_getter, + "extra_inputs_getter": conv_res_relu_extra_inputs_getter, + } + + backend_config_dict = { + "configs": [ + conv_relu_config, + conv_res_relu_config, + ], + } + m = fuse_fx(m, backend_config_dict=backend_config_dict) + self.assertEqual(type(m.conv1), torch.nn.Conv2d) + self.assertEqual(type(m.conv2), torch.nn.Conv2d) + # check relu are gone since we replaced the both patterns to conv + self.assertFalse(hasattr(m, "relu1")) + self.assertFalse(hasattr(m, "relu2")) + + @skipIfNoFBGEMM class TestQuantizeFx(QuantizationTestCase): def test_pattern_match(self): @@ -826,7 +1022,7 @@ def forward(self, x): qconfig_dict = {'': qconfig} prepared = prepare_fx(m, qconfig_dict) quantized = convert_fx(prepared, is_reference=True) - qparams = (quantized._input_scale_0, quantized._input_zero_point_0) + qparams = (quantized._scale_0, quantized._zero_point_0) weight_obs = qconfig.weight() weight_obs(quantized.weight) # Get the actual value to avoid tensor size mismatch error, torch.Size([]) vs torch.Size([1]) @@ -834,6 +1030,8 @@ def forward(self, x): self.assertEqual(qparams, ref_qparams) def test_conv_bn_relu(self): + """ Tests fusion and quantization for "Conv - Bn" and "Conv - Bn - ReLU" + """ convs = { 1: nn.Conv1d, 2: nn.Conv2d, @@ -874,8 +1072,7 @@ def forward(self, x): x = self.dequant(x) return x - # TODO: add 1d support - options = itertools.product([2, 3], [True, False], self.static_quant_types) + options = itertools.product([1, 2, 3], [True, False], self.static_quant_types) for dim, has_relu, quant_type in options: expected_node = ns.call_module( quantized_conv_relus[dim] if has_relu @@ -912,11 +1109,56 @@ def forward(self, x): fuse_modules(m_eager, fuse_list, inplace=True) m_eager.qconfig = qconfig m_eager = prepare_fn(m_eager) + prepared_fx = result_dict["prepared"] + m_eager(*self.img_data_dict[dim][0]) m_eager = convert(m_eager) result_eager = m_eager(*self.img_data_dict[dim][0]) self.assertEqual(result, result_eager) + def test_linear_bn(self): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(4, 4) + self.bn = nn.BatchNorm1d(4) + self.quant = QuantStub() + self.dequant = DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.linear(x) + x = self.bn(x) + x = self.dequant(x) + return x + + data = (torch.randn(4, 4),) + for quant_type in self.static_quant_types: + expected_node = ns.call_module(nnq.Linear) + m = M() + m_eager = copy.deepcopy(m) + result_dict = 
self.checkGraphModeFxOp(m, data, quant_type, expected_node=expected_node) + result = result_dict["quantized_output"] + + # check numerics vs eager mode + fuse_list = ["linear", "bn"] + qengine = torch.backends.quantized.engine + if quant_type == QuantType.STATIC: + m_eager.eval() + qconfig = get_default_qconfig(qengine) + prepare_fn = prepare + fuse_modules(m_eager, fuse_list, inplace=True) + else: + m_eager.train() + qconfig = get_default_qat_qconfig(qengine) + prepare_fn = prepare_qat + fuse_modules_qat(m_eager, fuse_list, inplace=True) + m_eager.qconfig = qconfig + m_eager = prepare_fn(m_eager) + m_eager(*data) + m_eager = convert(m_eager) + result_eager = m_eager(*data) + self.assertEqual(result, result_eager) @skipIfNoFBGEMM def test_dynamic_quant_fp16(self): @@ -1536,6 +1778,49 @@ def forward(self, x): self.checkGraphModuleNodes(m, expected_node_list=node_list) + def test_qconfig_dict_with_fused_modules(self): + class LinearReLUModel(torch.nn.Module): + def __init__(self, relu): + super(LinearReLUModel, self).__init__() + self.linear = torch.nn.Linear(3, 3) + self.relu = relu + + def forward(self, x): + x = self.linear(x) + x = self.relu(x) + return x + + class ConvReLUModel(torch.nn.Module): + def __init__(self, relu): + super(ConvReLUModel, self).__init__() + self.conv = torch.nn.Conv1d(3, 3, 3) + self.relu = relu + + def forward(self, x): + x = self.conv(x) + x = self.relu(x) + return x + + class ConvBnReLUModel(torch.nn.Module): + def __init__(self, relu): + super(ConvBnReLUModel, self).__init__() + self.conv = torch.nn.Conv1d(3, 3, 3) + self.bn = torch.nn.BatchNorm1d(3) + self.relu = relu + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x + + for model in [LinearReLUModel, ConvReLUModel, ConvBnReLUModel]: + for relu in [torch.nn.ReLU(), torch.nn.functional.relu, torch.relu]: + m = model(relu).eval() + qconfig_dict = torch.ao.quantization.get_default_qconfig_dict("fbgemm") + # should not crash as in https://github.com/pytorch/pytorch/issues/75825 + prepare_fx(m, qconfig_dict) + def test_qconfig_dict_validity(self): r""" Verifies that if a user passes an invalid key or makes a typo when @@ -1770,7 +2055,7 @@ def forward(self, x): def assertAttrPreserved(m): self.assertTrue(hasattr(m, "preserved_attr")) - self.assertTrue(m.preserved_attr, 3) + self.assertEqual(m.preserved_attr, 3) assertAttrPreserved(m) convert_custom_config_dict = { @@ -2004,6 +2289,88 @@ def forward(self, x): ref_res = ref_m(data) self.assertEqual(res, ref_res) + @skipIfNoFBGEMM + def test_custom_module_class_input_has_multiple_users(self): + """ Tests that the flow still works when the input of custom module + has multiple users + """ + class CustomModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(3, 3) + + def forward(self, x): + return self.linear(x) + + class ObservedCustomModule(torch.nn.Module): + def __init__(self, linear): + super().__init__() + self.linear = linear + + def forward(self, x): + return self.linear(x) + + @classmethod + def from_float(cls, float_module): + assert hasattr(float_module, 'qconfig') + observed = cls(float_module.linear) + observed.qconfig = float_module.qconfig + return observed + + class StaticQuantCustomModule(torch.nn.Module): + def __init__(self, linear): + super().__init__() + self.linear = linear + + def forward(self, x): + return self.linear(x) + + @classmethod + def from_observed(cls, observed_module): + assert hasattr(observed_module, 'qconfig') + assert hasattr(observed_module, 
'activation_post_process') + observed_module.linear.activation_post_process = \ + observed_module.activation_post_process + quantized = cls(nnq.Linear.from_float(observed_module.linear)) + return quantized + + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(3, 3) + self.custom = CustomModule() + + def forward(self, x0): + x1 = self.custom(x0) + x2 = self.linear(x0) + return x1 + x2 + + prepare_custom_config_dict = { + "float_to_observed_custom_module_class": { + "static": { + CustomModule: ObservedCustomModule + } + } + } + convert_custom_config_dict = { + "observed_to_quantized_custom_module_class": { + "static": { + ObservedCustomModule: StaticQuantCustomModule + } + } + } + m = M().eval() + m = prepare_fx( + m, + {"": default_qconfig}, + prepare_custom_config_dict=prepare_custom_config_dict) + # make sure it works + m = convert_fx( + m, + convert_custom_config_dict=convert_custom_config_dict) + # make sure it runs + m(torch.randn(3, 3)) + @skipIfNoFBGEMM def test_non_traceable_module(self): class NonTraceable(torch.nn.Module): @@ -2305,12 +2672,13 @@ def forward(self, x): self.assertTrue( set(scripted_keys) == set(non_packed_weight_keys), "Expected the scripted model to preserve the state_dict for non-packed weight attributes") + # TODO: probably don't want to hardcode the attribute names, since they are generated for attr_name in [ "mods1_0_input_scale_0", "mods1_0_input_zero_point_0", - "mods1_0_scale_0", "mods1_0_zero_point_0", - "mods1_1_scale_0", "mods1_1_zero_point_0", - "mods2_scale_0", "mods2_zero_point_0"]: - self.assertTrue(hasattr(m, attr_name)) + "mods1_0_scale_1", "mods1_0_zero_point_1", + "mods1_1_scale_1", "mods1_1_zero_point_1", + "mods2_scale_1", "mods2_zero_point_1"]: + self.assertTrue(hasattr(m, attr_name), attr_name + " not found.") @skipIfNoFBGEMM def test_packed_weight_fused_op(self): @@ -2423,6 +2791,234 @@ def forward(self, x): mp(torch.rand(4, 4, 4, 4)) mc = convert_fx(mp) + class _NonReferenceTestModel(nn.Module): + def __init__(self, func, lin_in, lin_out): + super().__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.lin = nn.Linear(lin_in, lin_out) + self.func = func + + def forward(self, x, y, z): + x = self.pool(F.relu(self.conv1(x))) + x = torch.flatten(x, 1) + x = self.func(x, y, z) + x = self.lin(x) + return x + + # This function looks at the node specified by the NodeInfo in the key of + # node_info_to_non_tensor_args and checks that the args at specified indices + # are not observed (since they are non tensors). If the args at those indices + # are a tuple/list (which do not show up as nodes) the function checks the + # individual elements of the tuple/list recursively. 
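For context, the mapping these helpers consume uses `NodeInfo(op, target)` records as keys and lists of argument indices as values; the indexed arguments are the non-float-tensor inputs that must not be wrapped in observers. A small illustrative mapping, mirroring the tests below, might look like this:

from torch.ao.quantization.fx.utils import NodeInfo  # same import as used at the top of this file

# For `x.masked_fill(mask, value)`, the boolean mask (arg 1) and the fill value (arg 2)
# are not float tensors, so neither should receive an observer during prepare_fx.
node_info_to_non_tensor_args = {
    NodeInfo("call_method", "masked_fill"): [1, 2],
    NodeInfo("call_method", "reshape"): [2],
}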
+ def _check_not_observed(self, model, node_info_to_non_tensor_args): + + # this is a helper function (for easier recursion) that checks whether + # arg_node is observed + def _check_node_not_observed(model, arg_node, node): + if isinstance(arg_node, tuple) or isinstance(arg_node, list): + for new_node in arg_node: + _check_node_not_observed(model, new_node, node) + elif arg_node.op == "call_module": + self.assertTrue( + not is_activation_post_process(getattr(model, arg_node.target)), + "Arg: {0} of node: {1} is observed but is not a float tensor".format( + arg_node, node + ), + ) + + for node in model.graph.nodes: + indices = node_info_to_non_tensor_args.get( + NodeInfo(node.op, node.target), [] + ) + for index in indices: + if index < len(node.args): + arg_node = node.args[index] + _check_node_not_observed(model, arg_node, node) + + # This test checks that the model gets prepared correct, doesn't have observers + # on specific ops (see _check_not_observed) and that the prepared model runs + def _test_dtype_propagation(self, model, node_info_to_non_tensor_args, *args): + model.eval() + qconfig_dict = {"": torch.ao.quantization.get_default_qconfig("fbgemm")} + prepared_model = prepare_fx(model, qconfig_dict) + self._check_not_observed(prepared_model, node_info_to_non_tensor_args) + prepared_model(*args) + + def test_masked_fill_nontensor_args_not_observed(self): + def func(x, y, z): + return x.masked_fill(y, z) + + model = self._NonReferenceTestModel(func, 1176, 1) + args = [torch.randn(5, 3, 32, 32), torch.randn(1176) > 0, 0.1] + node_info_to_non_tensor_args = {NodeInfo("call_method", "masked_fill"): [1, 2]} + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + + def test_permute_nontensor_args_not_observed(self): + def func(x, y, z): + return x.permute(y, z) + + model = self._NonReferenceTestModel(func, 1176, 1) + args = [torch.randn(5, 3, 32, 32), 0, 1] + node_info_to_non_tensor_args = {NodeInfo("call_method", "permute"): [1, 2]} + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + + def test_repeat_nontensor_args_not_observed(self): + def func(x, y, z): + return x.repeat(y, z) + + model = self._NonReferenceTestModel(func, 1176, 1) + args = [torch.randn(5, 3, 32, 32), 2, 1] + node_info_to_non_tensor_args = {NodeInfo("call_method", "repeat"): [1, 2]} + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + + def test_reshape_nontensor_args_not_observed(self): + def func(x, y, z): + return x.reshape(-1, y) + + model = self._NonReferenceTestModel(func, 5, 1) + args = [torch.randn(5, 3, 32, 32), 5, None] + node_info_to_non_tensor_args = {NodeInfo("call_method", "reshape"): [2]} + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + + def test_size_nontensor_args_not_observed(self): + def func(x, y, z): + return x.reshape((-1, x.size(y))) + + model = self._NonReferenceTestModel(func, 5, 1) + args = [torch.randn(5, 3, 32, 32), 0, None] + node_info_to_non_tensor_args = {NodeInfo("call_method", "size"): [1]} + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + + def test_transpose_nontensor_args_not_observed(self): + def func(x, y, z): + return x.transpose(y, z) + + model = self._NonReferenceTestModel(func, 5, 1) + args = [torch.randn(5, 3, 32, 32), 0, 1] + node_info_to_non_tensor_args = {NodeInfo("call_method", "transpose"): [1, 2]} + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + + def test_torch_transpose_nontensor_args_not_observed(self): + # TODO: 
make torch.transpose traceable by fx when using + # variable nontensor arguments + # func = lambda x, y, z: torch.transpose(x, y, z) # error + def func(x, y, z): + return torch.transpose(x, 0, 1) + + model = self._NonReferenceTestModel(func, 5, 1) + node_info_to_non_tensor_args = { + NodeInfo("call_method", torch.transpose): [1, 2] + } + args = [torch.randn(5, 3, 32, 32), 0, 1] + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + + def test_unsqueeze_nontensor_args_not_observed(self): + def func(x, y, z): + return x.unsqueeze(y) + + model = self._NonReferenceTestModel(func, 1176, 1) + args = [torch.randn(5, 3, 32, 32), 1, None] + node_info_to_non_tensor_args = {NodeInfo("call_method", "unsqueeze"): [1]} + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + + def test_unsqueeze__nontensor_args_not_observed(self): + def func(x, y, z): + return x.unsqueeze_(y) + + model = self._NonReferenceTestModel(func, 1176, 1) + args = [torch.randn(5, 3, 32, 32), 1, None] + node_info_to_non_tensor_args = {NodeInfo("call_method", "unsqueeze_"): [1]} + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + + def test_torch_unsqueeze_nontensor_args_not_observed(self): + # TODO: make torch.unsqueeze scriptable by fx when using + # variable nontensor arguments + # func = lambda x, y, z: torch.unsqueeze(x, y) # error + def func(x, y, z): + return torch.unsqueeze(x, 1) + + model = self._NonReferenceTestModel(func, 1176, 1) + args = [torch.randn(5, 3, 32, 32), 1, None] + node_info_to_non_tensor_args = {NodeInfo("call_method", torch.unsqueeze): [1]} + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + + def test_view_nontensor_args_not_observed(self): + def func(x, y, z): + return x.view(-1, y) + + model = self._NonReferenceTestModel(func, 5, 1) + args = [torch.randn(5, 3, 32, 32), 5, None] + node_info_to_non_tensor_args = {NodeInfo("call_method", "view"): [2]} + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + + def test_propagate_dtypes_for_known_nodes_list_args(self): + def func(x, y, z): + return x.reshape(y) + + model = self._NonReferenceTestModel(func, 5, 1) + args = [torch.randn(5, 3, 32, 32), [-1, 5], None] + node_info_to_non_tensor_args = {NodeInfo("call_method", "reshape"): [1]} + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + + def test_propagate_dtypes_for_known_nodes_split_list_args(self): + def func(x, y, z): + return x.reshape([y, z]) + + model = self._NonReferenceTestModel(func, 5, 1) + args = [torch.randn(5, 3, 32, 32), -1, 5] + node_info_to_non_tensor_args = {NodeInfo("call_method", "reshape"): [1]} + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + + def test_propagate_dtypes_for_known_nodes_tuple_args(self): + def func(x, y, z): + return x.reshape(y) + + model = self._NonReferenceTestModel(func, 5, 1) + args = [torch.randn(5, 3, 32, 32), (-1, 5), None] + node_info_to_non_tensor_args = {NodeInfo("call_method", "reshape"): [1]} + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + + def test_propagate_dtypes_for_known_nodes_split_tuple_args(self): + def func(x, y, z): + return x.reshape((y, z)) + + model = self._NonReferenceTestModel(func, 5, 1) + args = [torch.randn(5, 3, 32, 32), -1, 5] + node_info_to_non_tensor_args = {NodeInfo("call_method", "reshape"): [1]} + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + + def test_propagate_dtypes_for_known_nodes_dict_args(self): + def 
func(x, y, z): + return x.transpose(y["first"], y["second"]) + + model = self._NonReferenceTestModel(func, 5, 1) + args = [torch.randn(5, 3, 32, 32), {"first": 0, "second": 1}, None] + node_info_to_non_tensor_args = {NodeInfo("call_method", "transpose"): [1, 2]} + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + + def test_propagate_dtypes_for_known_nodes_dict_tuple_args(self): + class reshape_module(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y, z): + return x.reshape(y["shape"]) + + model = self._NonReferenceTestModel(reshape_module(), 5, 1) + args = [torch.randn(5, 3, 32, 32), {"shape": (-1, 5)}, None] + node_info_to_non_tensor_args = {NodeInfo("call_method", "reshape"): [1]} + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + + def test_propagate_dtypes_for_known_nodes_dict_split_tuple_args(self): + def func(x, y, z): + return x.reshape((y["first"], y["second"])) + + model = self._NonReferenceTestModel(func, 5, 1) + args = [torch.randn(5, 3, 32, 32), {"first": -1, "second": 5}, None] + node_info_to_non_tensor_args = {NodeInfo("call_method", "transpose"): [1]} + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + def test_assert_on_size_after_quant_layer(self): """ Verifies that calculating a size of a quantized tensor works @@ -2697,11 +3293,12 @@ def forward(self, x): m = convert_fx(m) keys = m.state_dict().keys() m(torch.randn(5, 5)) + # TODO: probably don't want to hardcode the attribute names, since they are generated for attr_name in [ "mods1_0_input_scale_0", "mods1_0_input_zero_point_0", "mods1_0_scale_0", "mods1_0_zero_point_0", "mods1_1_scale_0", "mods1_1_zero_point_0"]: - self.assertTrue(hasattr(m, attr_name)) + self.assertTrue(hasattr(m, attr_name), attr_name + " not found.") def test_no_obs_between_unmatched_node_and_copy_node(self): """ @@ -3033,7 +3630,6 @@ def forward(self, x): def test_preserve_tuple(self): """ Test tuple input type is preserved """ - from typing import List class LSTM(nn.Module): def __init__(self): @@ -3111,23 +3707,101 @@ def forward(self, x): x = self.relu(x) return x - model = M().eval() - dynamic_quantized_ops = { float16_dynamic_qconfig: torch.ops.quantized.linear_relu_dynamic_fp16, default_dynamic_qconfig: torch.ops.quantized.linear_relu_dynamic } - for config in [float16_dynamic_qconfig, default_dynamic_qconfig]: - qconfig = { - "": config + for qconfig in [float16_dynamic_qconfig, default_dynamic_qconfig]: + model = M().eval() + qconfig_dict = { + "": qconfig + } + m = prepare_fx(model, qconfig_dict) + m = convert_fx(m) + m(torch.rand(5, 5)) + node_list = [ + ns.call_module(nniqd.LinearReLU), + ns.call_module(nniqd.LinearReLU), + ns.call_function(dynamic_quantized_ops[qconfig]), + ] + self.checkGraphModuleNodes(m, expected_node_list=node_list) + + @skipIfNoFBGEMM + def test_dynamic_with_fusion_multiple_uses(self): + """ + Tests that dynamic quantization APIs work with Linear + Relu fusion + """ + class LinearRelu(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(5, 5) + self.relu = torch.nn.ReLU() + + def forward(self, x): + x = self.linear(x) + return self.relu(x) + + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear_relu = LinearRelu() + + def forward(self, x): + x = self.linear_relu(x) + x = self.linear_relu(x) + return x + + for qconfig in [float16_dynamic_qconfig, default_dynamic_qconfig]: + model = M().eval() + qconfig_dict = { + "": qconfig } - m = 
prepare_fx(model, qconfig) + m = prepare_fx(model, qconfig_dict) m = convert_fx(m) m(torch.rand(5, 5)) node_list = [ ns.call_module(nniqd.LinearReLU), ns.call_module(nniqd.LinearReLU), - ns.call_function(dynamic_quantized_ops[config]), + ] + self.checkGraphModuleNodes(m, expected_node_list=node_list) + + @skipIfNoFBGEMM + def test_dynamic_linear_input_multiple_use(self): + """ + Tests input for dynamic linear being used by multiple ops + """ + class LinearRelu(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(5, 5) + self.relu = torch.nn.ReLU() + + def forward(self, x): + x = self.linear(x) + return self.relu(x) + + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.mod1 = LinearRelu() + self.mod2 = LinearRelu() + + def forward(self, x): + y1 = self.mod1(x) + y2 = self.mod2(x) + return y1 + y2 + + for qconfig in [float16_dynamic_qconfig, default_dynamic_qconfig]: + model = M().eval() + qconfig_dict = { + "": qconfig + } + m = prepare_fx(model, qconfig_dict) + m = convert_fx(m) + m(torch.rand(5, 5, 5)) + node_list = [ + ns.call_module(nniqd.LinearReLU), + ns.call_module(nniqd.LinearReLU), ] self.checkGraphModuleNodes(m, expected_node_list=node_list) @@ -3379,6 +4053,7 @@ def forward(self, x): ns.call_function(torch.quantize_per_tensor): 1, ns.call_function(torch.ops.quantized.linear): 2, ns.call_function(torch.ops.quantized.add): 1, + ns.call_function(torch.mul): 1, ns.call_method("dequantize"): 1 } order_check = [ @@ -3387,6 +4062,7 @@ def forward(self, x): ns.call_function(torch.ops.quantized.linear), ns.call_function(torch.ops.quantized.add), ns.call_method("dequantize"), + ns.call_function(torch.mul), ns.call_module(nn.Linear), ] @@ -3400,19 +4076,6 @@ def forward(self, x): def _assertFixedQParamsFakeQuantizeEqual(self, fq1, fq2): self.assertEqual(fq1()._observer_ctr, fq2()._observer_ctr) - def test_fixed_qparams_patterns(self): - hard_sigmoid_keys = [torch.nn.Hardsigmoid, torch.nn.functional.hardsigmoid, "hardsigmoid", "hardsigmoid_"] - sigmoid_keys = [torch.nn.Sigmoid, torch.sigmoid, "sigmoid", "sigmoid_"] - tanh_keys = [torch.nn.Tanh, torch.tanh, "tanh", "tanh_"] - for k in hard_sigmoid_keys + sigmoid_keys: - self.assertEqual(DEFAULT_OUTPUT_OBSERVER_MAP[k], default_affine_fixed_qparams_observer) - self._assertFixedQParamsFakeQuantizeEqual(DEFAULT_OUTPUT_FAKE_QUANTIZE_MAP[k], - default_affine_fixed_qparams_fake_quant) - for k in tanh_keys: - self.assertEqual(DEFAULT_OUTPUT_OBSERVER_MAP[k], default_symmetric_fixed_qparams_observer) - self._assertFixedQParamsFakeQuantizeEqual(DEFAULT_OUTPUT_FAKE_QUANTIZE_MAP[k], - default_symmetric_fixed_qparams_fake_quant) - def test_register_patterns(self): @register_fusion_pattern("dummy_fusion") class DummyFusion(): @@ -3422,11 +4085,11 @@ class DummyFusion(): class DummyQuant(): pass - @register_quant_pattern("dummy_quant2", default_affine_fixed_qparams_observer) + @register_quant_pattern("dummy_quant2", default_fixed_qparams_range_0to1_observer) class DummyQuant2(): pass - @register_quant_pattern("dummy_quant3", default_symmetric_fixed_qparams_observer) + @register_quant_pattern("dummy_quant3", default_fixed_qparams_range_neg1to1_observer) class DummyQuant3(): pass @@ -3434,16 +4097,19 @@ class DummyQuant3(): self.assertEqual(DEFAULT_QUANTIZATION_PATTERNS["dummy_quant"], DummyQuant) self.assertEqual(DEFAULT_QUANTIZATION_PATTERNS["dummy_quant2"], DummyQuant2) self.assertEqual(DEFAULT_QUANTIZATION_PATTERNS["dummy_quant3"], DummyQuant3) - 
self.assertEqual(DEFAULT_OUTPUT_OBSERVER_MAP["dummy_quant2"], default_affine_fixed_qparams_observer) - self.assertEqual(DEFAULT_OUTPUT_OBSERVER_MAP["dummy_quant3"], default_symmetric_fixed_qparams_observer) + self.assertEqual(DEFAULT_OUTPUT_OBSERVER_MAP["dummy_quant2"], default_fixed_qparams_range_0to1_observer) + self.assertEqual(DEFAULT_OUTPUT_OBSERVER_MAP["dummy_quant3"], default_fixed_qparams_range_neg1to1_observer) self._assertFixedQParamsFakeQuantizeEqual(DEFAULT_OUTPUT_FAKE_QUANTIZE_MAP["dummy_quant2"], - default_affine_fixed_qparams_fake_quant) + default_fixed_qparams_range_0to1_fake_quant) self._assertFixedQParamsFakeQuantizeEqual(DEFAULT_OUTPUT_FAKE_QUANTIZE_MAP["dummy_quant3"], - default_symmetric_fixed_qparams_fake_quant) - self.assertTrue(get_default_output_activation_post_process_map(is_training=True) is - DEFAULT_OUTPUT_FAKE_QUANTIZE_MAP) - self.assertTrue(get_default_output_activation_post_process_map(is_training=False) is - DEFAULT_OUTPUT_OBSERVER_MAP) + default_fixed_qparams_range_neg1to1_fake_quant) + output_fake_quantize_map = get_default_output_activation_post_process_map(is_training=True) + output_observer_map = get_default_output_activation_post_process_map(is_training=False) + self.assertEqual(output_observer_map.get("dummy_quant3"), default_fixed_qparams_range_neg1to1_observer) + self._assertFixedQParamsFakeQuantizeEqual(output_fake_quantize_map.get("dummy_quant3"), + default_fixed_qparams_range_neg1to1_fake_quant) + + def test_reuse_input_qconfig(self): class M1(torch.nn.Module): @@ -3532,23 +4198,132 @@ def forward(self, x): break self.assertTrue(found_stack_trace, f"stack trace not found, node: {n.format_node()}, is_reference: False") - def test_stack_trace_preserved_subgraph_rewriter(self): - # a functional relu is taking the subgraph rewriter code path + def test_qat_skip_untraced(self): + class UnTraceableModuleClass(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(2, 2) + + def forward(self, x): + return self.linear(x) + + class UnTraceableModuleName(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(2, 2) + + def forward(self, x): + return self.linear(x) + class M(nn.Module): + def __init__(self): + super().__init__() + self.untraceable_module_class = UnTraceableModuleClass() + self.untraceable_module_name = UnTraceableModuleClass() + def forward(self, x): - x = F.relu(x) + x = self.untraceable_module_class(x) + x = self.untraceable_module_name(x) return x - m = M().eval() - mp = prepare_fx(m, get_default_qconfig_dict()) - mq = convert_fx(copy.deepcopy(mp), is_reference=False) - found_stack_trace = False - for n in mq.graph.nodes: - if n.op == 'call_function' and n.target == F.relu: - found_stack_trace = n.stack_trace is not None - break - self.assertTrue(found_stack_trace, f"stack trace not found, node: {n.format_node()}, is_reference: True") + mod = M() + + qconfig_dict = {"": torch.quantization.get_default_qat_qconfig()} + prepare_custom_config_dict = { + "non_traceable_module_class": [UnTraceableModuleClass], + "non_traceable_module_name": ["untraceable_module_name"], + } + mod_prep = torch.ao.quantization.quantize_fx.prepare_qat_fx( + mod.train(), qconfig_dict, prepare_custom_config_dict + ) + mod_prep = torch.ao.quantization.quantize_fx.prepare_qat_fx( + mod.train(), qconfig_dict, prepare_custom_config_dict + ) + self.assertTrue( + isinstance(mod_prep.untraceable_module_class.linear, torch.nn.Linear) + ) + self.assertTrue( + isinstance(mod_prep.untraceable_module_name.linear, 
torch.nn.Linear) + ) + self.assertTrue( + type(mod_prep.untraceable_module_class.linear) + is not torch.nn.qat.modules.linear.Linear, + "prepare_qat_fx should not convert anything inside untraced module classes", + ) + self.assertTrue( + type(mod_prep.untraceable_module_name.linear) + is not torch.nn.qat.modules.linear.Linear, + "prepare_qat_fx should not convert anything inside modules named in untraced_module_names", + ) + + def test_qconfig_dict_setup(self): + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + self.Conv1d = torch.nn.Conv1d(1, 1, 1) + self.Conv2d = torch.nn.Conv2d(1, 1, 1) + self.Conv3d = torch.nn.Conv3d(1, 1, 1) + self.ConvTranspose1d = torch.nn.ConvTranspose1d(1, 1, 1) + self.ConvTranspose2d = torch.nn.ConvTranspose2d(1, 1, 1) + self.ConvTranspose3d = torch.nn.ConvTranspose3d(1, 1, 1) + self.Linear = torch.nn.Linear(1, 1, 1) + + def forward(self, x): + x = self.Conv1d(x) + x = self.Conv2d(x) + x = self.Conv3d(x) + x = self.ConvTranspose1d(x) + x = self.ConvTranspose2d(x) + x = self.ConvTranspose3d(x) + x = self.Linear(x) + x = torch.nn.functional.conv1d(x, torch.rand(2, 2)) + x = torch.nn.functional.conv2d(x, torch.rand(2, 2)) + x = torch.nn.functional.conv3d(x, torch.rand(2, 2)) + x = torch.nn.functional.linear(x, torch.rand(2, 2)) + return x + + backends = ["qnnpack", "fbgemm"] + for func in [get_default_qconfig_dict, get_default_qat_qconfig_dict]: + for backend in backends: + m = M().eval() + qconfig_dict = func(backend) + m = prepare_fx(m, qconfig_dict) + for name, mod in m.named_modules(): + if is_activation_post_process(mod) and mod.dtype == torch.quint8: + if backend == "fbgemm": + lower_bnd = 0 + upper_bnd = 127 + else: + lower_bnd = 0 + upper_bnd = 255 + if issubclass(type(mod), FakeQuantize): + self.assertEqual(mod.activation_post_process.quant_min, lower_bnd) + self.assertEqual(mod.activation_post_process.quant_max, upper_bnd) + else: + self.assertEqual(mod.quant_min, lower_bnd) + self.assertEqual(mod.quant_max, upper_bnd) + + def test_prepare_mode(self): + class LinearModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(5, 10) + + def forward(self, x): + return self.linear(x) + + def _test(prepare_fn, qconfig_dict): + m = LinearModel() + m1 = copy.deepcopy(m) + m1.train() + prepare_fn(m1, qconfig_dict) + m2 = copy.deepcopy(m) + m2.eval() + prepare_fn(m2, qconfig_dict) + # Ensure prepare_fx and prepare_qat_fx work in both training and eval modes + _test(prepare_fx, get_default_qconfig_dict()) + _test(prepare_qat_fx, get_default_qat_qconfig_dict()) @skipIfNoFBGEMM class TestQuantizeFxOps(QuantizationTestCase): @@ -3590,41 +4365,64 @@ def setUp(self): """ @skipIfNoFBGEMM def test_linear_module(self): - class ModuleLinear(torch.nn.Module): - def __init__(self, has_relu=False, f_relu=False): - super(ModuleLinear, self).__init__() + class LinearModel(torch.nn.Module): + def __init__(self): + super(LinearModel, self).__init__() self.linear = torch.nn.Linear(30, 4).float() - if has_relu: - if f_relu: - self.relu = F.relu - else: - self.relu = torch.nn.ReLU() + + def forward(self, x): + return self.linear(x) + + class LinearReLUModel(torch.nn.Module): + def __init__(self, f_relu=False): + super(LinearReLUModel, self).__init__() + self.linear = torch.nn.Linear(30, 4).float() + if f_relu: + self.relu = F.relu else: - self.relu = torch.nn.Identity() + self.relu = torch.nn.ReLU() def forward(self, x): - return self.relu(self.linear(x)) + x = self.linear(x) + x = self.relu(x) + return x + class
LinearBnModel(torch.nn.Module): + def __init__(self): + super(LinearBnModel, self).__init__() + self.linear = torch.nn.Linear(4, 4).float() + self.bn = torch.nn.BatchNorm1d(4) + + def forward(self, x): + x = self.linear(x) + x = self.bn(x) + return x + + # Test linear data = (torch.rand((1, 30), dtype=torch.float),) - options = itertools.product( - [ModuleLinear(has_relu=False)], - self.all_quant_types) - quantized_nodes = { - # quant_type: - QuantType.DYNAMIC: ns.call_module(nnqd.Linear), - QuantType.STATIC: ns.call_module(nnq.Linear), - # note that we are checking the final result - QuantType.QAT: ns.call_module(nnq.Linear), - } - for model, quant_type in options: - self.checkGraphModeFxOp( - model, data, quant_type, quantized_nodes[quant_type]) + for quant_type in self.all_quant_types: + model = LinearModel() + quantized_module = nnqd.Linear if quant_type == QuantType.DYNAMIC else nnq.Linear + quantized_node = ns.call_module(quantized_module) + result_dict = self.checkGraphModeFxOp(model, data, quant_type, quantized_node) + if quant_type in self.static_quant_types: + self.assertEqual(result_dict["quantized_output"], result_dict["quantized_reference_output"]) + # TODO: enable test for dynamic quant + # Test linear-relu for f_relu, quant_type in itertools.product([True, False], [QuantType.STATIC, QuantType.QAT]): - for model, quantized_node in [ - (ModuleLinear(has_relu=True, f_relu=f_relu), ns.call_module(nniq.LinearReLU))]: - result_dict = self.checkGraphModeFxOp(model, data, quant_type, quantized_node) - self.assertEqual(result_dict["quantized_output"], result_dict["quantized_reference_output"]) + model = LinearReLUModel(f_relu) + quantized_node = ns.call_module(nniq.LinearReLU) + result_dict = self.checkGraphModeFxOp(model, data, quant_type, quantized_node) + self.assertEqual(result_dict["quantized_output"], result_dict["quantized_reference_output"]) + + # Test linear-bn + data = (torch.rand((4, 4), dtype=torch.float),) + for quant_type in self.static_quant_types: + model = LinearBnModel() + quantized_node = ns.call_module(nnq.Linear) + result_dict = self.checkGraphModeFxOp(model, data, quant_type, quantized_node) + self.assertEqual(result_dict["quantized_output"], result_dict["quantized_reference_output"]) @skipIfNoFBGEMM def test_functional_linear(self): @@ -3636,18 +4434,18 @@ def __init__(self, use_bias, has_relu, f_relu): self.use_bias = use_bias if has_relu: if f_relu: - self.relu = F.relu + self.relu_or_id = F.relu else: - self.relu = torch.nn.ReLU() + self.relu_or_id = torch.nn.ReLU() else: - self.relu = torch.nn.Identity() + self.relu_or_id = torch.nn.Identity() def forward(self, x): if self.use_bias: x = F.linear(x, self.w, self.b) else: x = F.linear(x, self.w) - x = self.relu(x) + x = self.relu_or_id(x) return x data = (torch.rand((1, 30), dtype=torch.float),) @@ -3675,7 +4473,10 @@ def forward(self, x): # it is a copy node, that's why we have extra observer/fake_quant # when has_relu is False quant_type_to_prepare_expected_node_occurrence = { - QuantType.DYNAMIC: {}, + QuantType.DYNAMIC: { + ns.call_module(torch.ao.quantization.PlaceholderObserver): 1, + ns.call_module(torch.ao.quantization.MinMaxObserver): 1, + }, # There should be 3 observers: after input, weight and activation. 
# one more observer for torch.nn.Identity when there is no relu QuantType.STATIC: { @@ -3693,17 +4494,29 @@ def forward(self, x): else: qlinear_fun = quant_type_to_qlinear_fun[quant_type] + if quant_type != QuantType.DYNAMIC: + num_dequantize = 1 + else: + # we will have an extra quantize_per_tensor_dynamic + dequantize for + # nn.Identity right now, but it will be fixed after we use + # backend_config_dict to configure the default pt backend + num_dequantize = int(not has_relu) + convert_node_occurrence = { ns.call_function(torch.quantize_per_tensor): 1 if quant_type != QuantType.DYNAMIC else 0, qlinear_fun: 1, - ns.call_method("dequantize"): 1 if quant_type != QuantType.DYNAMIC else 0 + ns.call_method("dequantize"): num_dequantize if quant_type != QuantType.DYNAMIC else 0, } prepare_expected_node_occurrence = \ quant_type_to_prepare_expected_node_occurrence[quant_type] - self.checkGraphModeFxOp( + result_dict = self.checkGraphModeFxOp( model, data, quant_type, qlinear_fun, prepare_expected_node_occurrence=prepare_expected_node_occurrence, expected_node_occurrence=convert_node_occurrence) + if quant_type != QuantType.DYNAMIC: + self.assertEqual(result_dict["quantized_output"], result_dict["quantized_reference_output"]) + # Ensure packed weights in lowered models are folded + self.assertIn("_packed_weight_0", result_dict["quantized"].state_dict().keys()) def test_linear_dynamic_fp16(self): class FuncLinear(torch.nn.Module): @@ -3745,8 +4558,8 @@ def forward(self, x): else: qlinear_fun = ns.call_function(torch.ops.quantized.linear_dynamic_fp16) prepare_node_occurrence = { - # weight - ns.call_module(torch.ao.quantization.PlaceholderObserver): 1 + # activation and weight + ns.call_module(torch.ao.quantization.PlaceholderObserver): 2 } convert_node_occurrence = { qlinear_fun: 1, @@ -3760,6 +4573,7 @@ def forward(self, x): prepare_expected_node_occurrence=prepare_node_occurrence, expected_node_occurrence=convert_node_occurrence) + # TODO: maybe remove this support def test_linear_static_fp16(self): class FuncLinear(torch.nn.Module): def __init__(self, use_bias, has_relu, f_relu): @@ -3943,10 +4757,14 @@ def forward(self, x): } prepare_expected_node_occurrence = \ quant_type_to_prepare_expected_node_occurrence[quant_type] - self.checkGraphModeFxOp( + result_dict = self.checkGraphModeFxOp( model, data, quant_type, qconv_fun, prepare_expected_node_occurrence=prepare_expected_node_occurrence, expected_node_occurrence=convert_node_occurrence) + if quant_type != QuantType.DYNAMIC: + self.assertEqual(result_dict["quantized_output"], result_dict["quantized_reference_output"]) + # Ensure packed weights in lowered models are folded + self.assertIn("_packed_weight_0", result_dict["quantized"].state_dict().keys()) @skipIfNoFBGEMM def test_quantized_conv_relu(self): @@ -4096,10 +4914,12 @@ def test_add(self): self._test_binary_op_float16_impl( operator.add, operator.iadd) + @unittest.skip("This is no longer needed right now, can enable later with new api") def test_sub(self): self._test_binary_op_float16_impl(operator.sub, operator.isub) self._test_binary_op_float16_impl(torch.sub, None) + @unittest.skip("This is no longer needed right now, can enable later with new api") def test_div(self): self._test_binary_op_float16_impl(operator.truediv, operator.itruediv) self._test_binary_op_float16_impl(torch.div, None) @@ -4110,6 +4930,7 @@ def test_mul(self): operator.mul, operator.imul, torch.ops.quantized.mul) self._test_binary_op_float16_impl(operator.mul, operator.imul) + @unittest.skip("This is no longer 
needed right now, can enable later with new api") def test_sum(self): class Sum(torch.nn.Module): def forward(self, x): @@ -4133,6 +4954,7 @@ def forward(self, x): expected_node_occurrence=node_occurrence, custom_qconfig_dict=custom_qconfig_dict) + @unittest.skip("This is no longer needed right now, can enable later with new api") def test_bmm(self): class BMMMethod(torch.nn.Module): def __init__(self): @@ -4174,6 +4996,39 @@ def test_add_relu(self): self._test_binary_op_relu_float16_impl( operator.add, operator.iadd) + @skipIfNoFBGEMM + def test_add_relu_multiple_uses_of_relu(self): + class Sub(torch.nn.Module): + def __init__(self): + super().__init__() + self.relu = torch.nn.ReLU(inplace=True) + + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.sub = Sub() + + def forward(self, x, y): + x = x + y + x = self.sub.relu(x) + x = x + y + x = self.sub.relu(x) + return x + + m = M().eval() + m = prepare_fx(m, {"": default_qconfig}) + m = convert_fx(m) + node_occurrence = { + ns.call_function(torch.quantize_per_tensor): 2, + ns.call_function(torch.ops.quantized.add_relu): 2, + ns.call_method("dequantize"): 1, + } + self.checkGraphModuleNodes(m, expected_node_occurrence=node_occurrence) + # check the model is scriptable + m = torch.jit.script(m) + # check the model is runnable + m(torch.randn(3), torch.randn(3)) + @skipIfNoFBGEMM def test_mul_relu(self): self._test_binary_op_relu_int8_impl( @@ -4206,7 +5061,7 @@ def forward(self, x): m = M() expected_node_occurrence = { - ns.call_module(torch.ao.quantization.FusedMovingAvgObsFakeQuantize): 6, + ns.call_module(torch.ao.quantization.FusedMovingAvgObsFakeQuantize): 5, } self._test_quantized_add_mul_qat(m, expected_node_occurrence) @@ -4222,14 +5077,13 @@ def forward(self, x): x = torch.mul(x, 1.0) x = self.conv1(x) x = torch.mul(x, 1.0) - # TODO: add support for add + torch.relu? 
x = torch.relu(x) x = self.conv2(x) return x m = M() expected_node_occurrence = { - ns.call_module(torch.ao.quantization.FusedMovingAvgObsFakeQuantize): 6, + ns.call_module(torch.ao.quantization.FusedMovingAvgObsFakeQuantize): 5, } self._test_quantized_add_mul_qat(m, expected_node_occurrence) @@ -4253,7 +5107,7 @@ def forward(self, x): m, {'': torch.ao.quantization.get_default_qat_qconfig('fbgemm')}, prepare_custom_config_dict={"input_quantized_idxs": [0]}) expected_node_occurrence = { - ns.call_module(torch.ao.quantization.FusedMovingAvgObsFakeQuantize): 0, + ns.call_module(torch.ao.quantization.FusedMovingAvgObsFakeQuantize): 1, } self.checkGraphModuleNodes( mp, expected_node_occurrence=expected_node_occurrence) @@ -4531,9 +5385,6 @@ def test_layer_norm(self): self._test_norm_impl( nn.LayerNorm, F.layer_norm, [[2, 5, 5]], data, nnq.LayerNorm, torch.ops.quantized.layer_norm) - self._test_norm_float16_impl( - nn.LayerNorm, F.layer_norm, [[2, 5, 5]], data) - def test_instance_norm(self): data_1d = (torch.rand((1, 4, 5), dtype=torch.float),) data_2d = (torch.rand((1, 4, 5, 1), dtype=torch.float),) @@ -4625,6 +5476,7 @@ def forward(self, x): self.checkGraphModuleNodes(m_quant, expected_node_list=node_list) + @unittest.skip("TODO: reenable with backend_config_dict api") def test_gelu_normal(self): module = torch.nn.GELU functional = torch.nn.functional.gelu @@ -4637,18 +5489,20 @@ def test_gelu_normal(self): self._test_default_node_quant_handler_ops( module, functional, qconfig, is_reference, node_list) + @unittest.skip("TODO: reenable with backend_config_dict api") def test_softmax_normal(self): module = torch.nn.Softmax functional = torch.nn.functional.softmax qconfig = torch.ao.quantization.get_default_qconfig("fbgemm") is_reference = False node_list = [ - ns.call_module(module), + ns.call_module(torch.nn.quantized.Softmax), ns.call_function(functional), ] self._test_default_node_quant_handler_ops( module, functional, qconfig, is_reference, node_list) + @unittest.skip("This is no longer needed right now, can enable later with new api") def test_gelu_reference(self): module = torch.nn.GELU functional = torch.nn.functional.gelu @@ -4664,6 +5518,7 @@ def test_gelu_reference(self): ns.call_function(torch.quantize_per_tensor), ns.call_method('dequantize') ] + # TODO: change these to use backend_config_dict additional_patterns = {torch.nn.GELU: DefaultNodeQuantizeHandler, torch.nn.functional.gelu: DefaultNodeQuantizeHandler} self._test_default_node_quant_handler_ops( @@ -4672,6 +5527,7 @@ def test_gelu_reference(self): self._test_default_node_quant_handler_ops(module, functional, self.custom_qconfig, is_reference, node_list, additional_quant_pattern_dict=self.common_quant_patterns) + @unittest.skip("This is no longer needed right now, can enable later with new api") def test_softmax_reference(self): module = torch.nn.Softmax functional = torch.nn.functional.softmax @@ -4695,6 +5551,7 @@ def test_softmax_reference(self): self._test_default_node_quant_handler_ops(module, functional, self.custom_qconfig, is_reference, node_list, additional_quant_pattern_dict=self.common_quant_patterns) + @unittest.skip("This is no longer needed right now, can enable later with new api") def test_silu_reference(self): module = torch.nn.SiLU functional = torch.nn.functional.silu @@ -4726,6 +5583,7 @@ def test_silu_reference(self): self._test_default_node_quant_handler_ops(module, functional, self.custom_qconfig, is_reference, node_list, additional_quant_pattern_dict=self.common_quant_patterns) + @unittest.skip("This 
is no longer needed right now, can enable later with new api") def test_mish_reference(self): module = torch.nn.Mish functional = torch.nn.functional.mish @@ -4868,7 +5726,7 @@ def forward(self, x): data = (torch.randn((2, 2, 2, 2), dtype=torch.float),) quant_type = QuantType.STATIC qconfig = torch.ao.quantization.QConfig( - activation=HistogramObserver.with_args(qscheme=torch.per_tensor_symmetric, dtype=torch.qint8), + activation=HistogramObserver.with_args(qscheme=torch.per_tensor_symmetric, dtype=torch.quint8), weight=default_weight_observer) qconfig_dict = {"": qconfig} node_occurrence = { @@ -4979,7 +5837,7 @@ def forward(self, x): # observers and also successfully fused two quantized::conv2d # patterns # one quantize_per_tensor for input - # check exact counts of quantize and dequantiz + # check exact counts of quantize and dequantize count_check = { # input of conv and two outputs of getitem ns.call_function(torch.quantize_per_tensor) : 2, @@ -5006,6 +5864,47 @@ def forward(self, x): quantized = convert_fx(prepared, is_reference=True) + + @skipIfNoFBGEMM + def test_ave_pool_with_custom_cfg(self): + """ A test that checks correct patterns are produced for + avg_pool2d with customized config + """ + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.avg_pool2d = torch.nn.AvgPool2d(3) + + + def forward(self, x): + x = self.avg_pool2d(x) + return x + + # The model contains a single AvgPool2d and its input is declared + # as already quantized via input_quantized_idxs + m = M().eval() + # nothing to fuse so skipping the fuse step + qconfig_dict = {'': default_qconfig} + prepared = prepare_fx(m, qconfig_dict, prepare_custom_config_dict={"input_quantized_idxs": [0]}) + + # not runnable + quantized = convert_fx(prepared) + + # This checks that the quantized input is propagated through avg_pool2d + # to the end, so that we don't insert extra + # observers + # check exact counts of quantize and dequantize + count_check = { + ns.call_method('dequantize') : 1 + } + order_check = [ + ns.call_module(nn.AvgPool2d), + ns.call_method('dequantize'), + ] + self.checkGraphModuleNodes( + quantized, + expected_node_occurrence=count_check, + expected_node_list=order_check) + @skipIfNoFBGEMM def test_general_value_ops(self): """ A test that checks correct patterns are produced for @@ -5074,6 +5973,21 @@ def forward(self, x): expected_node_occurrence=count_check, expected_node_list=order_check) + def test_copy_node_fp32_input(self): + """ CopyNode works for both fp32 and int8 inputs; this is a test to make + sure that a CopyNode can be successfully quantized in both cases + """ + class M(torch.nn.Module): + def forward(self, x): + x = x.relu() + return x + + m = M().eval() + m = prepare_fx(m, {"": default_reuse_input_qconfig}) + m = convert_fx(m) + # make sure it runs + m(torch.rand(1)) + def test_getitem(self): """ Make sure we only insert observer for getitem if the following node is matched or needs to be quantized @@ -5139,6 +6053,7 @@ def __init__(self): self.sigmoid = torch.nn.Sigmoid() self.hardsigmoid = torch.nn.Hardsigmoid() self.tanh = torch.nn.Tanh() + self.softmax = torch.nn.Softmax(dim=0) def forward(self, x): x = self.conv(x) @@ -5146,7 +6061,6 @@ def forward(self, x): x = self.sigmoid(x) x = torch.sigmoid(x) x = x.sigmoid() - x.sigmoid_() x = self.hardsigmoid(x) x = F.hardsigmoid(x) x = F.hardsigmoid(x, inplace=True) @@ -5154,7 +6068,8 @@ def forward(self, x): # F.tanh is deprecated x = torch.tanh(x) x = x.tanh() - x.tanh_() + # TODO(future PR): handle F.softmax + x = self.softmax(x) return x 
for eval_mode in [True, False]: @@ -5165,12 +6080,12 @@ def forward(self, x): m.eval() qconfig = default_qconfig prepare = prepare_fx - fq_count = 11 + fq_count = 10 else: m.train() qconfig = default_qat_qconfig prepare = prepare_qat_fx - fq_count = 11 + fq_count = 10 # nothing to fuse so skipping the fuse step m_copy = copy.deepcopy(m) @@ -5205,6 +6120,7 @@ def forward(self, x): ns.call_function(torch.quantize_per_tensor), ns.call_module(nnq.Conv2d), ns.call_module(nn.Sigmoid), + ns.call_module(nnq.Softmax), ns.call_method('dequantize'), ] self.checkGraphModuleNodes( @@ -5213,8 +6129,8 @@ def forward(self, x): expected_node_list=order_check) reference_count_check = { - ns.call_function(torch.quantize_per_tensor) : 13, - ns.call_method('dequantize') : 11 + ns.call_function(torch.quantize_per_tensor) : 12, + ns.call_method('dequantize') : 12 } reference_order_check = [ ns.call_function(torch.quantize_per_tensor), @@ -5225,12 +6141,18 @@ def forward(self, x): ns.call_module(nn.Sigmoid), ns.call_function(torch.quantize_per_tensor), ns.call_method('dequantize'), + ns.call_module(nn.Softmax), + ns.call_function(torch.quantize_per_tensor), + ns.call_method('dequantize'), ] self.checkGraphModuleNodes( quantized_reference, expected_node_occurrence=reference_count_check, expected_node_list=reference_order_check) + # Verify that softmax scale and zero_point are correct + self.assertTrue(quantized.softmax.scale - (1.0 / 256) <= 1e-8) + self.assertTrue(quantized.softmax.zero_point == 0) def test_float_functional(self): class TorchAdd(nn.Module): @@ -5627,6 +6549,7 @@ def forward(self, x): m, expected_node_occurrence=expected_occurrence) + @unittest.skip("This is no longer needed right now, can enable later with new api") def test_qmatmul(self): class M(torch.nn.Module): def forward(self, x, y): @@ -5634,7 +6557,7 @@ def forward(self, x, y): return z m = M().eval() - qconfig_dict = {"": torch.quantization.default_qconfig} + qconfig_dict = {"": torch.ao.quantization.default_qconfig} mp = prepare_fx(m, qconfig_dict) mp(torch.randn(2, 2), torch.randn(2, 2)) mq = convert_fx(mp) @@ -6025,15 +6948,7 @@ def forward(self, input: torch.Tensor, offsets: Optional[torch.Tensor] = None, model = EmbeddingBagLinear().train() prepared_fx_model = prepare_qat_fx(model, qconfig_dict) test_only_train_fn(prepared_fx_model, train_indices) - convert_custom_config_dict = { - "additional_object_mapping": { - "static": { - torch.nn.qat.EmbeddingBag: nn.quantized.EmbeddingBag, - } - } - } quant_model = convert_fx(prepared_fx_model, - convert_custom_config_dict=convert_custom_config_dict, qconfig_dict=qconfig_dict) def checkQuantized(model): @@ -6073,15 +6988,7 @@ def forward(self, input: torch.Tensor): model = EmbeddingLinear().train() prepared_fx_model = prepare_qat_fx(model, qconfig_dict) test_only_train_fn(prepared_fx_model, train_indices) - convert_custom_config_dict = { - "additional_object_mapping": { - "static": { - torch.nn.qat.Embedding: nn.quantized.Embedding, - } - } - } quant_model = convert_fx(prepared_fx_model, - convert_custom_config_dict=convert_custom_config_dict, qconfig_dict=qconfig_dict) def checkQuantized(model): diff --git a/test/quantization/jit/test_quantize_jit.py b/test/quantization/jit/test_quantize_jit.py index 025b031bb8d8..6648bcaa9afc 100644 --- a/test/quantization/jit/test_quantize_jit.py +++ b/test/quantization/jit/test_quantize_jit.py @@ -1416,6 +1416,38 @@ def forward(self, x, y, z): str(get_forward_graph(m.conv3d._c)) ) + def test_convtranspose_trace(self): + class M(torch.nn.Module): + def 
__init__(self): + super(M, self).__init__() + self.convtranspose1d = torch.nn.ConvTranspose1d(3, 3, 3).float() + self.convtranspose2d = torch.nn.ConvTranspose2d(3, 3, 3).float() + self.convtranspose3d = torch.nn.ConvTranspose3d(3, 3, 3).float() + + def forward(self, x, y, z): + a = self.convtranspose1d(x) + b = self.convtranspose2d(y) + c = self.convtranspose3d(z) + return (a, b, c) + + qconfig_dict = {"": default_qconfig} + inputs = ( + torch.rand((1, 3, 10), dtype=torch.float), + torch.rand((1, 3, 10, 10), dtype=torch.float), + torch.rand((1, 3, 10, 10, 10), dtype=torch.float), + ) + model = torch.jit.trace(M(), inputs).eval() + m = prepare_jit(model, qconfig_dict) + FileCheck().check("aten::conv_transpose1d").check_not("aten::_convolution").run( + str(get_forward_graph(m.convtranspose1d._c)) + ) + FileCheck().check("aten::conv_transpose2d").check_not("aten::_convolution").run( + str(get_forward_graph(m.convtranspose2d._c)) + ) + FileCheck().check("aten::conv_transpose3d").check_not("aten::_convolution").run( + str(get_forward_graph(m.convtranspose3d._c)) + ) + @unittest.skipUnless( "fbgemm" in torch.backends.quantized.supported_engines, " Quantized operations require FBGEMM. FBGEMM is only optimized for CPUs" diff --git a/test/quantization/serialized/TestSerialization.test_linear_relu_package_quantization_transforms.get_attr_targets.pt b/test/quantization/serialized/TestSerialization.test_linear_relu_package_quantization_transforms.get_attr_targets.pt index bb34a57f962a..6887e8c614a5 100644 Binary files a/test/quantization/serialized/TestSerialization.test_linear_relu_package_quantization_transforms.get_attr_targets.pt and b/test/quantization/serialized/TestSerialization.test_linear_relu_package_quantization_transforms.get_attr_targets.pt differ diff --git a/test/run_test.py b/test/run_test.py index 2b772bc6f368..c0ad0a55a02b 100644 --- a/test/run_test.py +++ b/test/run_test.py @@ -22,6 +22,7 @@ TEST_WITH_ROCM, shell, set_cwd, + parser as common_parser, ) import torch.distributed as dist from typing import Dict, Optional, List @@ -92,6 +93,7 @@ def skip_test_p(name: str) -> bool: 'onnx', 'package', # executed by test_package.py 'quantization', # executed by test_quantization.py + 'autograd', # executed by test_autograd.py ], blocklisted_tests=[ 'test_bundled_images', @@ -103,7 +105,6 @@ def skip_test_p(name: str) -> bool: 'test_kernel_launch_checks', 'test_metal', 'test_nnapi', - 'test_functionalization', 'test_segment_reductions', 'test_static_runtime', 'test_throughput_benchmark', @@ -132,6 +133,7 @@ def skip_test_p(name: str) -> bool: "distributed/elastic/utils/util_test", "distributed/elastic/utils/distributed_test", "distributed/elastic/multiprocessing/api_test", + "test_deploy", ] ) @@ -167,6 +169,7 @@ def skip_test_p(name: str) -> bool: "test_typing", "distributed/elastic/events/lib_test", "distributed/elastic/agent/server/test/api_test", + "test_deploy", ] WINDOWS_BLOCKLIST = [ @@ -199,17 +202,25 @@ def skip_test_p(name: str) -> bool: "distributed/elastic/agent/server/test/api_test", "distributed/elastic/multiprocessing/api_test", "distributed/_shard/sharding_spec/test_sharding_spec", + "distributed/_shard/sharding_plan/test_sharding_plan", "distributed/_shard/sharded_tensor/test_megatron_prototype", "distributed/_shard/sharded_tensor/test_sharded_tensor", "distributed/_shard/sharded_tensor/test_sharded_tensor_reshard", - "distributed/_shard/sharded_tensor/test_partial_tensor", + "distributed/_shard/sharded_tensor/ops/test_chunk", 
"distributed/_shard/sharded_tensor/ops/test_elementwise_ops", "distributed/_shard/sharded_tensor/ops/test_embedding", "distributed/_shard/sharded_tensor/ops/test_embedding_bag", "distributed/_shard/sharded_tensor/ops/test_binary_cmp", "distributed/_shard/sharded_tensor/ops/test_init", "distributed/_shard/sharded_tensor/ops/test_linear", + "distributed/_shard/sharded_tensor/ops/test_math_ops", + "distributed/_shard/sharded_tensor/ops/test_matrix_ops", + "distributed/_shard/sharded_tensor/ops/test_softmax", + "distributed/_shard/sharded_tensor/ops/test_tensor_ops", + "distributed/_shard/sharding_spec/test_sharding_spec", "distributed/_shard/sharded_optim/test_sharded_optim", + "distributed/_shard/test_partial_tensor", + "distributed/_shard/test_replicated_tensor", ] + FSDP_TEST ROCM_BLOCKLIST = [ @@ -217,23 +228,31 @@ def skip_test_p(name: str) -> bool: "distributed/rpc/test_faulty_agent", "distributed/rpc/test_tensorpipe_agent", "distributed/rpc/cuda/test_tensorpipe_agent", + "distributed/_shard/sharding_spec/test_sharding_spec", + "distributed/_shard/sharding_plan/test_sharding_plan", "distributed/_shard/sharded_tensor/test_megatron_prototype", "distributed/_shard/sharded_tensor/test_sharded_tensor", "distributed/_shard/sharded_tensor/test_sharded_tensor_reshard", - "distributed/_shard/sharded_tensor/test_partial_tensor", + "distributed/_shard/sharded_tensor/ops/test_chunk", "distributed/_shard/sharded_tensor/ops/test_elementwise_ops", "distributed/_shard/sharded_tensor/ops/test_embedding", "distributed/_shard/sharded_tensor/ops/test_embedding_bag", "distributed/_shard/sharded_tensor/ops/test_binary_cmp", "distributed/_shard/sharded_tensor/ops/test_init", "distributed/_shard/sharded_tensor/ops/test_linear", + "distributed/_shard/sharded_tensor/ops/test_math_ops", + "distributed/_shard/sharded_tensor/ops/test_matrix_ops", + "distributed/_shard/sharded_tensor/ops/test_softmax", + "distributed/_shard/sharded_tensor/ops/test_tensor_ops", + "distributed/_shard/sharding_spec/test_sharding_spec", "distributed/_shard/sharded_optim/test_sharded_optim", + "distributed/_shard/test_partial_tensor", + "distributed/_shard/test_replicated_tensor", "test_determination", - "test_multiprocessing", "test_jit_legacy", "test_type_hints", "test_openmp", -] + FSDP_TEST +] RUN_PARALLEL_BLOCKLIST = [ "test_cpp_extensions_jit", @@ -256,6 +275,8 @@ def skip_test_p(name: str) -> bool: "test_modules", "test_nn", "test_ops", + "test_ops_gradients", + "test_ops_jit", "test_torch" ] @@ -305,73 +326,17 @@ def skip_test_p(name: str) -> bool: ) JIT_EXECUTOR_TESTS = [ - "test_jit_cuda_fuser", "test_jit_profiling", "test_jit_legacy", "test_jit_fuser_legacy", ] -DISTRIBUTED_TESTS = [ - "distributed/test_data_parallel", - "distributed/test_launcher", - "distributed/nn/jit/test_instantiator", - "distributed/rpc/test_faulty_agent", - "distributed/rpc/test_tensorpipe_agent", - "distributed/rpc/cuda/test_tensorpipe_agent", - "distributed/test_c10d_common", - "distributed/test_c10d_gloo", - "distributed/test_c10d_nccl", - "distributed/test_c10d_spawn_gloo", - "distributed/test_c10d_spawn_nccl", - "distributed/test_store", - "distributed/test_pg_wrapper", - "distributed/algorithms/test_join", - "distributed/test_distributed_spawn", - "distributed/pipeline/sync/skip/test_api", - "distributed/pipeline/sync/skip/test_gpipe", - "distributed/pipeline/sync/skip/test_inspect_skip_layout", - "distributed/pipeline/sync/skip/test_leak", - "distributed/pipeline/sync/skip/test_portal", - "distributed/pipeline/sync/skip/test_stash_pop", - 
"distributed/pipeline/sync/skip/test_tracker", - "distributed/pipeline/sync/skip/test_verify_skippables", - "distributed/pipeline/sync/test_balance", - "distributed/pipeline/sync/test_bugs", - "distributed/pipeline/sync/test_checkpoint", - "distributed/pipeline/sync/test_copy", - "distributed/pipeline/sync/test_deferred_batch_norm", - "distributed/pipeline/sync/test_dependency", - "distributed/pipeline/sync/test_inplace", - "distributed/pipeline/sync/test_microbatch", - "distributed/pipeline/sync/test_phony", - "distributed/pipeline/sync/test_pipe", - "distributed/pipeline/sync/test_pipeline", - "distributed/pipeline/sync/test_stream", - "distributed/pipeline/sync/test_transparency", - "distributed/pipeline/sync/test_worker", - "distributed/optim/test_zero_redundancy_optimizer", - "distributed/elastic/timer/api_test", - "distributed/elastic/timer/local_timer_example", - "distributed/elastic/timer/local_timer_test", - "distributed/elastic/events/lib_test", - "distributed/elastic/metrics/api_test", - "distributed/elastic/utils/logging_test", - "distributed/elastic/utils/util_test", - "distributed/elastic/utils/distributed_test", - "distributed/elastic/multiprocessing/api_test", - "distributed/_shard/sharding_spec/test_sharding_spec", - "distributed/_shard/sharded_tensor/test_megatron_prototype", - "distributed/_shard/sharded_tensor/test_sharded_tensor", - "distributed/_shard/sharded_tensor/test_sharded_tensor_reshard", - "distributed/_shard/sharded_tensor/test_partial_tensor", - "distributed/_shard/sharded_tensor/ops/test_elementwise_ops", - "distributed/_shard/sharded_tensor/ops/test_embedding", - "distributed/_shard/sharded_tensor/ops/test_embedding_bag", - "distributed/_shard/sharded_tensor/ops/test_binary_cmp", - "distributed/_shard/sharded_tensor/ops/test_init", - "distributed/_shard/sharded_tensor/ops/test_linear", - "distributed/_shard/sharded_optim/test_sharded_optim", -] + [test for test in TESTS if test.startswith("distributed/fsdp")] +DISTRIBUTED_TESTS = [test for test in TESTS if test.startswith("distributed")] + +TESTS_REQUIRING_LAPACK = [ + "distributions/test_constraints", + "distributions/test_distributions", +] # Dictionary matching test modules (in TESTS) to lists of test cases (within that test_module) that would be run when # options.run_specified_test_cases is enabled. @@ -577,6 +542,7 @@ def test_distributed(test_module, test_directory, options): backend, with_init ) ) + old_environ = dict(os.environ) os.environ["TEMP_DIR"] = tmp_dir os.environ["BACKEND"] = backend os.environ["INIT_METHOD"] = "env://" @@ -627,6 +593,8 @@ def test_distributed(test_module, test_directory, options): return return_code finally: shutil.rmtree(tmp_dir) + os.environ.clear() + os.environ.update(old_environ) return 0 @@ -664,6 +632,7 @@ def parse_args(): description="Run the PyTorch unit test suite", epilog="where TESTS is any of: {}".format(", ".join(TESTS)), formatter_class=argparse.RawTextHelpFormatter, + parents=[common_parser] ) parser.add_argument( "-v", @@ -816,6 +785,11 @@ def parse_args(): " within a specified test module. For unspecified test modules with the bring-to-front " "option, all test cases will be run, as one may expect.", ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Only list the test that will run.", + ) return parser.parse_args() @@ -916,6 +890,10 @@ def get_selected_tests(options): if options.exclude_distributed_tests: options.exclude.extend(DISTRIBUTED_TESTS) + # these tests failing in CUDA 11.6 temporary disabling. 
issue https://github.com/pytorch/pytorch/issues/75375 + if torch.version.cuda is not None and LooseVersion(torch.version.cuda) == "11.6": + options.exclude.extend(["distributions/test_constraints"]) + selected_tests = exclude_tests(options.exclude, selected_tests) if sys.platform == "win32" and not options.ignore_win_blocklist: @@ -961,6 +939,11 @@ def get_selected_tests(options): selected_tests = exclude_tests(DISTRIBUTED_TESTS, selected_tests, "PyTorch is built without distributed support.") + # skip tests that require LAPACK when it's not available + if not torch._C.has_lapack: + selected_tests = exclude_tests(TESTS_REQUIRING_LAPACK, selected_tests, + "PyTorch is built without LAPACK support.") + return selected_tests @@ -1012,7 +995,10 @@ def main(): selected_tests = get_selected_tests(options) if options.verbose: - print_to_stderr("Selected tests: {}".format(", ".join(selected_tests))) + print_to_stderr("Selected tests:\n {}".format("\n ".join(selected_tests))) + + if options.dry_run: + return if options.coverage and not PYTORCH_COLLECT_COVERAGE: shell(["coverage", "erase"]) diff --git a/test/test_ao_sparsity.py b/test/test_ao_sparsity.py index 32b95973928e..6b5c8574c2e6 100644 --- a/test/test_ao_sparsity.py +++ b/test/test_ao_sparsity.py @@ -20,5 +20,8 @@ # Scheduler from ao.sparsity.test_scheduler import TestScheduler # noqa: F401 +# Composability +from ao.sparsity.test_composability import TestComposability # noqa: F401 + if __name__ == '__main__': run_tests() diff --git a/test/test_autocast.py b/test/test_autocast.py index aed0c3496223..bfbe46d08b89 100644 --- a/test/test_autocast.py +++ b/test/test_autocast.py @@ -104,8 +104,9 @@ def test_autocast_torch_bf16(self): self._run_autocast_outofplace(op, args, torch.bfloat16, add_kwargs=maybe_kwargs) def test_autocast_nn_bf16(self): - for op, args in self.autocast_lists.nn_bf16: - self._run_autocast_outofplace(op, args, torch.bfloat16, module=torch._C._nn) + for op_with_args in self.autocast_lists.nn_bf16: + op, args, maybe_kwargs = self.args_maybe_kwargs(op_with_args) + self._run_autocast_outofplace(op, args, torch.bfloat16, module=torch._C._nn, add_kwargs=maybe_kwargs) def test_autocast_torch_fp32(self): for op_with_args in self.autocast_lists.torch_fp32: diff --git a/test/test_autograd.py b/test/test_autograd.py index 2fe584ab0b68..7abf53148190 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -14,11 +14,12 @@ import uuid import warnings import operator +import subprocess from copy import deepcopy from collections import OrderedDict from itertools import product from operator import mul -from functools import reduce +from functools import reduce, partial import torch from torch import nn @@ -26,21 +27,21 @@ from torch.autograd.function import once_differentiable from torch.autograd.profiler import (profile, record_function, emit_nvtx) from torch.autograd.profiler_util import (_format_time, EventList, FunctionEvent, FunctionEventAvg) -import torch.autograd.functional as autogradF from torch.utils.checkpoint import checkpoint from torch.testing import make_tensor from torch.testing._internal.common_cuda import TEST_CUDA from torch.testing._internal.common_utils import ( TestCase, run_tests, skipIfNoLapack, slowTest, IS_WINDOWS, IS_MACOS, - disable_gc, gradcheck, gradgradcheck, parametrize, instantiate_parametrized_tests) + disable_gc, gradcheck, gradgradcheck, parametrize, + instantiate_parametrized_tests, skipIfMps) from torch.autograd import Variable, Function, detect_anomaly, kineto_available from 
torch.autograd.function import InplaceFunction import torch.autograd.forward_ad as fwAD from torch.testing._internal.common_methods_invocations import mask_not_all_zeros from torch.testing._internal.common_device_type import (instantiate_device_type_tests, skipCUDAIfRocm, onlyCPU, onlyCUDA, dtypes, dtypesIfCUDA, - deviceCountAtLeast, skipMeta) -from torch.testing._internal.common_dtype import get_all_dtypes + deviceCountAtLeast, skipMeta, dtypesIfMPS) +from torch.testing._internal.common_dtype import floating_types_and from torch.testing._internal.logging_tensor import no_dispatch import pickle @@ -389,8 +390,8 @@ def test_not_implemented_fwad(self): hint_msg = "Running forward AD for an OP that does not implement it should raise a NotImplementedError" with self.assertRaisesRegex(NotImplementedError, err_msg, msg=hint_msg): - # if forward AD ends up being implemented for torch.atan2, choose a different op - torch.atan2(dual_x, dual_x) + # if forward AD ends up being implemented for torch.igamma, choose a different op + torch.igamma(dual_x, dual_x) def test_accumulate_grad(self): grad_output = torch.ones(5, 5) @@ -2820,7 +2821,7 @@ def test_profiler(self): for evt in p.function_events: if evt.name in names: found_indices.add(names.index(evt.name)) - self.assertEquals(len(found_indices), len(names)) + self.assertEqual(len(found_indices), len(names)) def test_profiler_seq_nr(self): with profile(use_kineto=kineto_available()) as p: @@ -2931,6 +2932,21 @@ def test_record_function_callbacks(self): foo_event = [event for event in function_events if "foo" in event.name][0] self.assertEqual(foo_event.count, 1) + def test_record_function_new_signatures(self): + # Test the new _record_function ops work + # Note: Remove once record_function uses these directly + x = torch.randn(10, 10) + with profile(use_kineto=kineto_available()) as p: + record = torch.ops.profiler._record_function_enter_new("bar", None) + try: + y = x * 2 + 4 + finally: + torch.ops.profiler._record_function_exit(record) + + function_events = p.function_events + foo_event = [event for event in function_events if "bar" in event.name][0] + self.assertEqual(foo_event.count, 1) + def test_profiler_aggregation_fake(self): events = EventList() id = [0] @@ -3658,6 +3674,22 @@ def fn(sparse): check(fast_mode=True) check(fast_mode=False) + @unittest.expectedFailure + def test_gradcheck_sparse_csr_input(self): + def check(fast_mode): + def fn(sparse_csr): + return torch.clone(sparse_csr).to_dense() + + # Fails because gradcheck can't work with sparse csr inputs yet + gradcheck(fn, torch.rand(2, 2, dtype=torch.double).to_sparse_csr().requires_grad_(True), check_sparse_nnz=True, + check_batched_grad=False, fast_mode=fast_mode) + + with self.assertRaisesRegex(RuntimeError, 'gradcheck expects all tensor inputs are dense'): + gradcheck(fn, torch.rand(2, 2, dtype=torch.double).to_sparse_csr().requires_grad_(True), check_sparse_nnz=False, + check_batched_grad=False, fast_mode=fast_mode) + # check(fast_mode=True) # Segmentation fault + check(fast_mode=False) + def test_gradcheck_nondeterministic(self): class NonDetFunc(Function): @staticmethod @@ -4293,7 +4325,13 @@ def backward(ctx, grad): MyFunction.apply(v).backward() """ s = TestCase.runWithPytorchAPIUsageStderr(code) - self.assertRegex(s, "PYTORCH_API_USAGE torch.autograd.thread_shutdown") + # The autograd engine creates worker threads only when GPU devices are present. 
+ # So make sure that we do shut down threads when we're testing cuda and make sure + # that there is no thread to shut down when we're not using cuda. + if TEST_CUDA or torch.backends.mps.is_available(): + self.assertRegex(s, "PYTORCH_API_USAGE torch.autograd.thread_shutdown") + else: + self.assertNotRegex(s, "PYTORCH_API_USAGE torch.autograd.thread_shutdown") @unittest.skipIf(IS_MACOS, "Fails with SIGBUS on macOS; https://github.com/pytorch/pytorch/issues/25941") def test_deep_reentrant(self): @@ -4793,7 +4831,10 @@ def test_grad_fn_attr_bindings(self): self.assertIsInstance(out.grad_fn._saved_output_size[0], int) self.assertEqual(out.grad_fn._saved_align_corners, False) # bool -> bool self.assertIsInstance(out.grad_fn._saved_align_corners, bool) - self.assertIsNone(out.grad_fn._saved_scale_factors) # c10::optional<ArrayRef<double>> -> float[]? + if hasattr(out.grad_fn, '_saved_scale_factors'): + self.assertIsNone(out.grad_fn._saved_scale_factors) # c10::optional<ArrayRef<double>> -> float[]? + else: + self.assertIsNone(out.grad_fn._saved_scales) # c10::optional<ArrayRef<double>> -> float[]? out = torch.nn.functional.interpolate(a, scale_factor=0.5, mode="linear") self.assertIsNone(out.grad_fn._saved_output_size) @@ -6267,7 +6308,14 @@ def test(get_input, cuda, pin_memory): if y.is_sparse: y = y.to_dense() y.sum().backward() - self.assertEqual(2 * a, a.grad) + + actual = 2 * a + expected = a.grad + if a.is_sparse: + actual = actual.coalesce() + expected = expected.coalesce() + + self.assertEqual(actual, expected) for cuda in [False] + ([True] if torch.cuda.is_available() else []): for pin_memory in [True, False]: @@ -6311,1361 +6359,76 @@ def f(x): memory_with_hooks = torch.cuda.memory_allocated() self.assertEqual(memory_with_hooks, memory_without_grad) + def test_pynode_destruction_deadlock(self): + script = """ +import torch -def index_perm_variable(shape, max_indices): - if not isinstance(shape, tuple): - shape = (shape,) - - index = torch.randperm(max_indices).narrow(0, 0, reduce(mul, shape)).view(shape) - return index - -def bernoulli_scalar(): - return torch.tensor(0, dtype=torch.uint8).bernoulli_() - - -class TestAutogradFunctional(TestCase): - def _assert_same_struct(self, res, base): - # base and res should be Tensors or tuple of Tensors with the same size - if isinstance(base, torch.Tensor): - self.assertTrue(isinstance(res, torch.Tensor)) - self.assertEqual(base.size(), res.size()) - elif isinstance(base, tuple): - self.assertTrue(isinstance(res, tuple)) - self.assertEqual(len(base), len(res)) - for el_base, el_res in zip(base, res): - self.assertTrue(isinstance(el_base, torch.Tensor)) - self.assertTrue(isinstance(el_res, torch.Tensor)) - self.assertEqual(el_base.size(), el_res.size()) - else: - # Wrong base - raise RuntimeError("The base given to `_assert_same_struct` doesn't have" - " the right structure.") - - def _assert_interleaved_struct(self, res, base1, base2): - # base1 and base2 can be Tensors or tuples of Tensors. - # If they are tuples, res should be a tuple as well.
- # The indexing works as follows for base1, base2 being - # - tuple, tuple: res[i][j][k][l] = (base1[i][k], base2[j][l]) - # - tuple, Tensor: res[i][k][l] = (base1[i][k], base2[l]) - # - Tensor, tuple: res[i][j][l] = (base1[i], base2[j][l]) - # - Tensor, Tensor: res[k][l] = (base1[k], base2[l]) - if isinstance(base1, torch.Tensor) and isinstance(base2, torch.Tensor): - self.assertTrue(isinstance(res, torch.Tensor)) - self.assertEqual(res.size(), base1.size() + base2.size()) - elif isinstance(base1, tuple) and isinstance(base2, torch.Tensor): - self.assertTrue(isinstance(res, tuple)) - self.assertEqual(len(res), len(base1)) - for el_res, el_base1 in zip(res, base1): - self.assertTrue(isinstance(el_res, torch.Tensor)) - self.assertTrue(isinstance(el_base1, torch.Tensor)) - self.assertEqual(el_res.size(), el_base1.size() + base2.size()) - elif isinstance(base1, torch.Tensor) and isinstance(base2, tuple): - self.assertTrue(isinstance(res, tuple)) - self.assertEqual(len(res), len(base2)) - for el_res, el_base2 in zip(res, base2): - self.assertTrue(isinstance(el_res, torch.Tensor)) - self.assertTrue(isinstance(el_base2, torch.Tensor)) - self.assertEqual(el_res.size(), base1.size() + el_base2.size()) - elif isinstance(base1, tuple) and isinstance(base2, tuple): - self.assertTrue(isinstance(res, tuple)) - self.assertEqual(len(res), len(base1)) - for el_res, el_base1 in zip(res, base1): - self.assertTrue(isinstance(el_res, tuple)) - self.assertEqual(len(res), len(base2)) - for el_el_res, el_base2 in zip(el_res, base2): - self.assertTrue(isinstance(el_el_res, torch.Tensor)) - self.assertTrue(isinstance(el_base2, torch.Tensor)) - self.assertEqual(el_el_res.size(), el_base1.size() + el_base2.size()) - else: - # Wrong bases - raise RuntimeError("The bases given to `_assert_interleaved_struct` don't have" - " the right structure.") - - def test_vjp_err_check(self): - def foo(a): - return 3 * a.narrow(0, 0, 3) - - def bar(a): - return 3 * a.narrow(0, 0, 3), "bar" - - inp = torch.rand(4) - v = torch.ones(3) - with self.assertRaisesRegex(TypeError, "The inputs given to vjp must be either a Tensor"): - res = autogradF.vjp(foo, (inp, 2), v) - - with self.assertRaisesRegex(TypeError, "The outputs of the user-provided function given to vjp must"): - res = autogradF.vjp(bar, inp, v) - - with self.assertRaisesRegex(RuntimeError, "The vector v can only be None if the user-provided function returns"): - res = autogradF.vjp(foo, inp) - - with self.assertRaisesRegex(RuntimeError, "The given v should contain a single Tensor."): - res = autogradF.vjp(foo, inp, (torch.ones_like(inp), torch.ones_like(inp))) - - with self.assertRaisesRegex(RuntimeError, "v has invalid size: should be torch.Size"): - res = autogradF.vjp(foo, inp, v[:2]) - - res = autogradF.vjp(foo, inp, v)[1] - self._assert_same_struct(res, inp) - - def test_vjp_err_check_strict(self): - def foo(a): - return a.detach() - - def bar(a): - # Make a non-leaf Tensor that requires_grad but that is not connected to the input - return a.long().float().requires_grad_().clone() - - inp = torch.rand(4) - v = torch.rand(4) - with self.assertRaisesRegex(RuntimeError, "Output 0 of the user-provided function does not require gradients."): - res = autogradF.vjp(foo, inp, v, strict=True) - res = autogradF.vjp(foo, inp, v, strict=False) - self._assert_same_struct(res[1], inp) - self.assertEqual(res[1].abs().sum(), 0.) 
- - with self.assertRaisesRegex(RuntimeError, "The output of the user-provided function is independent of input 0"): - res = autogradF.vjp(bar, inp, v, strict=True) - res = autogradF.vjp(bar, inp, v, strict=False) - self._assert_same_struct(res[1], inp) - self.assertEqual(res[1].abs().sum(), 0.) - - # The Jacobian does not depend on the input - def foo(a): - return a.clone() - - inp.requires_grad_() - with self.assertRaisesRegex(RuntimeError, "jacobian of the user-provided function is independent of input 0."): - res = autogradF.vjp(foo, inp, v, create_graph=True, strict=True) - res = autogradF.vjp(foo, inp, v, create_graph=True, strict=False) - self._assert_same_struct(res[1], inp) - self.assertEqual(res[1], v) - - def test_vjp_no_grad(self): - def reducer(x): - return x.sum(dim=1) - inputs = torch.rand(4, 4) - v = torch.ones(4) - with torch.no_grad(): - res = autogradF.vjp(reducer, inputs, v) - self.assertIsNone(res[0].grad_fn) - self.assertIsNone(res[1].grad_fn) - self.assertNotEqual(res[1], torch.zeros(4, 4)) - - inputs.requires_grad_() - v.requires_grad_() - with torch.no_grad(): - res = autogradF.vjp(reducer, inputs, v, create_graph=True) - self.assertIsNotNone(res[0].grad_fn) - self.assertIsNotNone(res[1].grad_fn) - self.assertNotEqual(res[1], torch.zeros(4, 4)) - - def test_vjp_output(self): - def reducer(x): - return x.sum(dim=1) - inputs = torch.rand(4, 4) - v = torch.ones(4) - res = autogradF.vjp(reducer, inputs, v) - self._assert_same_struct(res[1], inputs) - self.assertIsNone(res[0].grad_fn) - self.assertIsNone(res[1].grad_fn) - - def adder(x, y): - return 2 * x + 3 * y - - inputs = (torch.rand(2), torch.rand(2)) - v = torch.ones(2) - out, vjp_val = autogradF.vjp(adder, inputs, v) - self._assert_same_struct(vjp_val, inputs) - self.assertIsNone(out.grad_fn) - self.assertIsNone(vjp_val[0].grad_fn) - self.assertIsNone(vjp_val[1].grad_fn) - - def adder(x, y): - return 2 * x + 3 * y, x + y - - inputs = (torch.rand(2), torch.rand(2)) - v = (torch.tensor([1., 0.]), torch.tensor([1., 0.])) - out, vjp_val = autogradF.vjp(adder, inputs, v) - self._assert_same_struct(vjp_val, inputs) - self.assertIsNone(out[0].grad_fn) - self.assertIsNone(out[1].grad_fn) - self.assertIsNone(vjp_val[0].grad_fn) - self.assertIsNone(vjp_val[1].grad_fn) - - def test_vjp_scalar(self): - def reducer(x): - return x.sum() - inputs = torch.rand(4, 4) - v = torch.ones([]) - res = autogradF.vjp(reducer, inputs, v) - self._assert_same_struct(res[0], v) - self._assert_same_struct(res[1], inputs) - - res = autogradF.vjp(reducer, inputs) - self._assert_same_struct(res[0], v) - self._assert_same_struct(res[1], inputs) - - def expander(x): - return x.unsqueeze(0).repeat(4) - inputs = torch.rand([]) - v = torch.ones(4) - res = autogradF.vjp(expander, inputs, v) - self._assert_same_struct(res[0], v) - self._assert_same_struct(res[1], inputs) - - def test_vjp_create_graph(self): - def reducer(x): - return x.sum(dim=1) - inputs = torch.rand(2, 2, dtype=torch.double) - v = torch.ones(2, dtype=torch.double) - - inputs.requires_grad_() - v.requires_grad_() - res = autogradF.vjp(reducer, inputs, v, create_graph=True) - self._assert_same_struct(res[1], inputs) - self.assertIsNotNone(res[0].grad_fn) - self.assertIsNotNone(res[1].grad_fn) - - gradcheck(lambda inp, v: autogradF.vjp(reducer, inputs, v, create_graph=True), (inputs, v)) - gradgradcheck(lambda inp, v: autogradF.vjp(reducer, inputs, v, create_graph=True), (inputs, v)) - - def adder(x, y): - return 2 * x + 3 * y, x * y - - inputs = (torch.rand(2, dtype=torch.double, 
requires_grad=True), - torch.rand(2, dtype=torch.double, requires_grad=True)) - v = (torch.tensor([1., 0.], dtype=torch.double, requires_grad=True), - torch.tensor([1., 0.], dtype=torch.double, requires_grad=True)) - - gradcheck(lambda *args: autogradF.vjp(adder, args[:2], args[2:], create_graph=True)[1], inputs + v) - gradgradcheck(lambda *args: autogradF.vjp(adder, args[:2], args[2:], create_graph=True)[1], inputs + v) - - def foo(*args): - x, y = args[:2] - v = args[2:] - - x = x.cos() - val, grad = autogradF.vjp(adder, (x, y), v, create_graph=True) - - return val[0].exp() + val[1].exp() + grad[0].exp() + grad[1].exp() + x.exp() + y.exp() - - gradcheck(foo, inputs + v) - gradgradcheck(foo, inputs + v) - - def test_jvp_err_check(self): - def foo(a): - return 3 * a.narrow(0, 0, 3) - - def bar(a): - return 3 * a.narrow(0, 0, 3), "bar" - - inp = torch.rand(4) - v = torch.rand(4) - with self.assertRaisesRegex(TypeError, "The inputs given to jvp must be either a Tensor"): - res = autogradF.jvp(foo, (inp, 2), v) - - with self.assertRaisesRegex(TypeError, "The outputs of the user-provided function given to jvp must"): - res = autogradF.jvp(bar, inp, v) - - with self.assertRaisesRegex(RuntimeError, "The vector v can only be None if the input to the user-provided function"): - res = autogradF.jvp(foo, inp) - - with self.assertRaisesRegex(RuntimeError, "The given v should contain a single Tensor."): - res = autogradF.jvp(foo, inp, (v, v)) - - with self.assertRaisesRegex(RuntimeError, "v has invalid size: should be torch.Size"): - res = autogradF.jvp(foo, inp, v[:2]) - - res = autogradF.jvp(foo, inp, v)[1] - self._assert_same_struct(res, foo(inp)) - - def test_jvp_err_check_strict(self): - def foo(a): - return a.detach() - - def bar(a): - # Make a non-leaf Tensor that requires_grad but that is not connected to the input - return a.long().float().requires_grad_().clone() - - inp = torch.rand(4) - v = torch.rand(4) - with self.assertRaisesRegex(RuntimeError, "Output 0 of the user-provided function does not require gradients."): - res = autogradF.jvp(foo, inp, v, strict=True) - res = autogradF.jvp(foo, inp, v, strict=False) - self._assert_same_struct(res[1], res[0]) - self.assertEqual(res[1].abs().sum(), 0.) - - with self.assertRaisesRegex(RuntimeError, "The output of the user-provided function is independent of input 0"): - res = autogradF.jvp(bar, inp, v, strict=True) - res = autogradF.jvp(bar, inp, v, strict=False) - self._assert_same_struct(res[1], res[0]) - self.assertEqual(res[1].abs().sum(), 0.) 
- - # The Jacobian does not depend on the input - def foo(a): - return a.clone() - - inp.requires_grad_() - with self.assertRaisesRegex(RuntimeError, "jacobian of the user-provided function is independent of input 0."): - res = autogradF.jvp(foo, inp, v, create_graph=True, strict=True) - res = autogradF.jvp(foo, inp, v, create_graph=True, strict=False) - self._assert_same_struct(res[1], inp) - self.assertEqual(res[1], v) - - def test_jvp_no_grad(self): - def reducer(x): - return x.sum(dim=1) - inputs = torch.rand(4, 4) - v = torch.ones(4, 4) - with torch.no_grad(): - res = autogradF.jvp(reducer, inputs, v) - self.assertIsNone(res[0].grad_fn) - self.assertIsNone(res[1].grad_fn) - self.assertNotEqual(res[1], torch.zeros(4, 4)) - - inputs.requires_grad_() - v.requires_grad_() - with torch.no_grad(): - res = autogradF.jvp(reducer, inputs, v, create_graph=True) - self.assertIsNotNone(res[0].grad_fn) - self.assertIsNotNone(res[1].grad_fn) - self.assertNotEqual(res[1], torch.zeros(4, 4)) - - def test_jvp_output(self): - def reducer(x): - return x.sum(dim=1) - inputs = torch.rand(4, 4) - v = torch.ones(4, 4) - res = autogradF.jvp(reducer, inputs, v) - self._assert_same_struct(res[1], res[0]) - self.assertIsNone(res[0].grad_fn) - self.assertIsNone(res[1].grad_fn) - - def adder(x, y): - return 2 * x + 3 * y - - inputs = (torch.rand(2), torch.rand(2)) - v = (torch.ones(2), torch.ones(2)) - out, jvp_val = autogradF.jvp(adder, inputs, v) - self._assert_same_struct(jvp_val, out) - self.assertIsNone(out.grad_fn) - self.assertIsNone(jvp_val[0].grad_fn) - self.assertIsNone(jvp_val[1].grad_fn) - - def adder(x, y): - return 2 * x + 3 * y, x + y - - inputs = (torch.rand(2), torch.rand(2)) - v = (torch.tensor([1., 0.]), torch.tensor([1., 0.])) - out, jvp_val = autogradF.jvp(adder, inputs, v) - self._assert_same_struct(jvp_val, out) - self.assertIsNone(out[0].grad_fn) - self.assertIsNone(out[1].grad_fn) - self.assertIsNone(jvp_val[0].grad_fn) - self.assertIsNone(jvp_val[1].grad_fn) - - def test_jvp_scalar(self): - def reducer(x): - return x.sum() - inputs = torch.rand(4, 4) - v = torch.ones(4, 4) - res = autogradF.jvp(reducer, inputs, v) - self._assert_same_struct(res[0], torch.zeros([])) - self._assert_same_struct(res[1], res[0]) - - def expander(x): - return x.unsqueeze(0).repeat(4) - inputs = torch.rand([]) - v = torch.ones([]) - res = autogradF.jvp(expander, inputs, v) - self._assert_same_struct(res[0], torch.zeros(4)) - self._assert_same_struct(res[1], res[0]) - - res = autogradF.jvp(expander, inputs) - self._assert_same_struct(res[0], torch.zeros(4)) - self._assert_same_struct(res[1], res[0]) - - def test_jvp_create_graph(self): - def reducer(x): - return x.sum(dim=1) - inputs = torch.rand(2, 2, dtype=torch.double) - v = torch.ones(2, 2, dtype=torch.double) - - inputs.requires_grad_() - v.requires_grad_() - res = autogradF.jvp(reducer, inputs, v, create_graph=True) - self._assert_same_struct(res[1], res[0]) - self.assertIsNotNone(res[0].grad_fn) - self.assertIsNotNone(res[1].grad_fn) - - gradcheck(lambda inp, v: autogradF.jvp(reducer, inp, v, create_graph=True), (inputs, v)) - gradgradcheck(lambda inp, v: autogradF.jvp(reducer, inp, v, create_graph=True), (inputs, v)) - - def adder(x, y): - return 2 * x + 3 * y, x * y - - inputs = (torch.rand(2, dtype=torch.double, requires_grad=True), - torch.rand(2, dtype=torch.double, requires_grad=True)) - v = (torch.tensor([1., 0.], dtype=torch.double, requires_grad=True), - torch.tensor([1., 0.], dtype=torch.double, requires_grad=True)) - - gradcheck(lambda *args: 
autogradF.jvp(adder, args[:2], args[2:], create_graph=True)[1], inputs + v) - gradgradcheck(lambda *args: autogradF.jvp(adder, args[:2], args[2:], create_graph=True)[1], inputs + v) - - def foo(*args): - x, y = args[:2] - v = args[2:] - - x = x.cos() - val, grad = autogradF.jvp(adder, (x, y), v, create_graph=True) - - return val[0].exp() + val[1].exp() + grad[0].exp() + grad[1].exp() + x.exp() + y.exp() - - gradcheck(foo, inputs + v) - gradgradcheck(foo, inputs + v) - - def _test_construct_standard_basis_for(self, inputs): - numels = tuple(tensor.numel() for tensor in inputs) - results = autogradF._construct_standard_basis_for(inputs, numels) - for result, inp in zip(results, inputs): - self.assertEqual(result.dtype, inp.dtype) - self.assertEqual(result.device, inp.device) - results = torch.cat([result.to(device='cpu', dtype=torch.float) - for result in results], dim=1) - expected = torch.eye(results[0].shape[0], dtype=torch.float) - self.assertEqual(results, expected) - - def test_construct_standard_basis_for(self): - test_cases = [ - (torch.randn(2, 3),), - (torch.randn(1),), - (torch.randn([]),), - (torch.randn(1), torch.randn([]), torch.randn([])), - (torch.randn(2), torch.randn(3), torch.randn([])), - (torch.randn(2), torch.randn([]), torch.randn(3)), - (torch.randn(2, 3), torch.randn(3), torch.randn(3, 4, 2)), - (torch.randn(2, dtype=torch.float64), torch.randn(3, dtype=torch.float32)), - ] - - for inputs in test_cases: - self._test_construct_standard_basis_for(inputs) - - @unittest.skipIf(not TEST_CUDA, "test requires CUDA") - def test_construct_standard_basis_for_cuda(self): - test_cases = [ - (torch.randn(2), torch.randn(3, device='cuda')), - (torch.randn(3, device='cuda'), torch.randn(2)), - ] - - for inputs in test_cases: - self._test_construct_standard_basis_for(inputs) - - def _test_vectorize_raises_no_warnings(self, api): - # vmap is an experimental prototype. When someone calls torch.vmap, - # it raises a python warning. This test checks that - # autogradF.{jacobian, hessian} don't raise that experimental prototype - # warning; it is not nice for a public-facing API to raise a warning - # no matter how it is called. 
- def foo(a): - return (a ** 2).sum() - - x = torch.randn(3) - with warnings.catch_warnings(record=True) as wa: - result = api(foo, x, vectorize=True) - self.assertEqual(len(wa), 0) - - def test_jacobian_vectorize_raises_no_warnings(self): - return self._test_vectorize_raises_no_warnings(autogradF.jacobian) - - def test_hessian_vectorize_raises_no_warnings(self): - return self._test_vectorize_raises_no_warnings(autogradF.hessian) - - def _test_jacobian_err_check(self, vectorize): - def foo(a): - return 3 * a.narrow(0, 0, 3) - - def bar(a): - return 3 * a.narrow(0, 0, 3), "bar" - - inp = torch.rand(4) - with self.assertRaisesRegex(TypeError, "The inputs given to jacobian must be either a Tensor"): - res = autogradF.jacobian(foo, (inp, 2), vectorize=vectorize) - - with self.assertRaisesRegex(TypeError, "The outputs of the user-provided function given to jacobian must"): - res = autogradF.jacobian(bar, inp, vectorize=vectorize) - - res = autogradF.jacobian(foo, inp, vectorize=vectorize) - self._assert_interleaved_struct(res, foo(inp), inp) - - def foo(a, b): - return b, 3 * a.narrow(0, 0, 3) - - inp = (torch.rand(4), torch.rand(5)) - - res = autogradF.jacobian(foo, inp, vectorize=vectorize) - self._assert_interleaved_struct(res, foo(*inp), inp) - - def test_jacobian_err_check(self): - return self._test_jacobian_err_check(vectorize=False) - - def test_jacobian_err_check_vectorize(self): - return self._test_jacobian_err_check(vectorize=True) - - def test_jacobian_err_check_strict(self): - def foo(a): - return a.detach() - - def bar(a): - # Make a non-leaf Tensor that requires_grad but that is not connected to the input - return a.long().float().requires_grad_().clone() - - inp = torch.rand(4) - with self.assertRaisesRegex(RuntimeError, "Output 0 of the user-provided function does not require gradients."): - res = autogradF.jacobian(foo, inp, strict=True) - res = autogradF.jacobian(foo, inp, strict=False) - self._assert_interleaved_struct(res, foo(inp), inp) - self.assertEqual(res.abs().sum(), 0.) - - with self.assertRaisesRegex(RuntimeError, "Output 0 of the user-provided function is independent of input 0."): - res = autogradF.jacobian(bar, inp, strict=True) - res = autogradF.jacobian(bar, inp, strict=False) - self._assert_interleaved_struct(res, foo(inp), inp) - self.assertEqual(res.abs().sum(), 0.) 
- - # The Jacobian does not depend on the input - def foo(a): - return a.clone() - - inp.requires_grad_() - with self.assertRaisesRegex(RuntimeError, "jacobian of the user-provided function is independent of input 0."): - res = autogradF.jacobian(foo, inp, create_graph=True, strict=True) - res = autogradF.jacobian(foo, inp, create_graph=True, strict=False) - self._assert_interleaved_struct(res, inp, inp) - self.assertEqual(res, torch.eye(4)) - - def test_jacobian_err_check_strict_vectorize(self): - def foo(x): - return x - - inp = torch.rand(4) - with self.assertRaisesRegex(RuntimeError, "not supported together"): - res = autogradF.jacobian(foo, inp, strict=True, vectorize=True) - - def test_jacobian_no_grad(self): - def exp_reducer(x): - return x.exp().sum(dim=1) - - inputs = torch.rand(4, 4) - with torch.no_grad(): - res = autogradF.jacobian(exp_reducer, inputs) - self.assertIsNone(res.grad_fn) - self.assertNotEqual(res, torch.zeros(4, 4)) - - with torch.no_grad(): - res = autogradF.jacobian(exp_reducer, inputs, create_graph=True) - self.assertIsNotNone(res.grad_fn) - self.assertNotEqual(res, torch.zeros(4, 4)) - - def _test_jacobian_output(self, vectorize): - def exp_reducer(x): - return x.exp().sum(dim=1) - - inputs = torch.rand(4, 4) - res = autogradF.jacobian(exp_reducer, inputs, vectorize=vectorize) - self._assert_interleaved_struct(res, exp_reducer(inputs), inputs) - self.assertIsNone(res.grad_fn) - - def identity(x): - return x.clone() - - inputs = torch.rand(4) - res = autogradF.jacobian(identity, inputs, vectorize=vectorize) - self._assert_interleaved_struct(res, identity(inputs), inputs) - self.assertIsNone(res.grad_fn) - self.assertEqual(res, torch.eye(4)) - - def add_exp_reducer(x, y): - return (x + y.exp()).sum(dim=1) - - inputs = (torch.rand(4, 4), torch.rand(4, 4)) - res = autogradF.jacobian(add_exp_reducer, inputs, vectorize=vectorize) - self._assert_interleaved_struct(res, add_exp_reducer(*inputs), inputs) - self.assertIsNone(res[0].grad_fn) - self.assertIsNone(res[1].grad_fn) - - def test_jacobian_output(self): - self._test_jacobian_output(vectorize=False) - - def test_jacobian_output_vectorize(self): - self._test_jacobian_output(vectorize=True) - - def _test_jacobian_scalar(self, vectorize): - def reducer(x): - return x.sum() - inputs = torch.rand(4, 4) - res = autogradF.jacobian(reducer, inputs, vectorize=vectorize) - self._assert_same_struct(res, inputs) - - def expander(x): - return x.unsqueeze(0).repeat(4) - inputs = torch.rand([]) - res = autogradF.jacobian(expander, inputs, vectorize=vectorize) - self._assert_same_struct(res, torch.zeros(4)) - - def test_jacobian_scalar(self): - self._test_jacobian_scalar(vectorize=False) - - def test_jacobian_scalar_vectorize(self): - self._test_jacobian_scalar(vectorize=True) - - def _test_jacobian_create_graph(self, vectorize): - def exp_reducer(x): - return x.exp().sum(dim=1) - - inputs = torch.rand(4, 4, dtype=torch.double, requires_grad=True) - res = autogradF.jacobian(exp_reducer, inputs, create_graph=True, vectorize=vectorize) - self._assert_interleaved_struct(res, exp_reducer(inputs), inputs) - self.assertIsNotNone(res.grad_fn) - - gradcheck(lambda inp: autogradF.jacobian(exp_reducer, inp, create_graph=True, vectorize=vectorize), inputs) - gradgradcheck(lambda inp: autogradF.jacobian(exp_reducer, inp, create_graph=True, vectorize=vectorize), inputs) - - def add_exp_reducer(x, y): - return (x + y).exp().sum(dim=1) - - inputs = (torch.rand(4, 4, dtype=torch.double, requires_grad=True), - torch.rand(4, 4, 
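The strict-mode tests above check the documented contract that strict=True raises when an output is independent of an input, while strict=False silently returns a zero Jacobian. A standalone sketch of that behaviour (the detached function is illustrative):

import torch
import torch.autograd.functional as autogradF

def detached(x):
    # detach() cuts the graph, so the output is independent of x for autograd
    return x.detach()

inp = torch.rand(4)
print(autogradF.jacobian(detached, inp).abs().sum())  # tensor(0.)
try:
    autogradF.jacobian(detached, inp, strict=True)
except RuntimeError as err:
    print("strict=True raised:", err)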
dtype=torch.double, requires_grad=True)) - res = autogradF.jacobian(add_exp_reducer, inputs, create_graph=True, vectorize=vectorize) - self._assert_interleaved_struct(res, add_exp_reducer(*inputs), inputs) - self.assertIsNotNone(res[0].grad_fn) - self.assertIsNotNone(res[1].grad_fn) - - gradcheck(lambda *inp: autogradF.jacobian(add_exp_reducer, inp, create_graph=True, vectorize=vectorize), inputs) - gradgradcheck(lambda *inp: autogradF.jacobian(add_exp_reducer, inp, create_graph=True, vectorize=vectorize), inputs) - - def foo(x, y): - x = x.cos() - val, jac = autogradF.jacobian(add_exp_reducer, (x, y), create_graph=True, vectorize=vectorize) - - res = val[0].exp().sum() + val[1].exp().sum() + jac[0].exp().sum() - res = res + jac[1].exp().sum() + x.exp().sum() + y.exp().sum() - return res - - gradcheck(foo, inputs) - gradgradcheck(foo, inputs) - - def test_jacobian_create_graph(self): - self._test_jacobian_create_graph(vectorize=False) - - def test_jacobian_create_graph_vectorize(self): - self._test_jacobian_create_graph(vectorize=True) - - def _check_jacobian_vectorize_correctness(self, f, inputs, test_forward_ad=True): - expected = autogradF.jacobian(f, inputs, vectorize=False) - result_backward_mode = autogradF.jacobian(f, inputs, vectorize=True) - self.assertEqual(result_backward_mode, expected) - - if test_forward_ad: - result_forward_mode = autogradF.jacobian(f, inputs, strategy="forward-mode", vectorize=True) - self.assertEqual(result_forward_mode, expected) - - def test_jacobian_vectorize_correctness_simple(self): - def f(x): - return 3 * x ** 2 - - x = torch.randn(2, 3, 5) - self._check_jacobian_vectorize_correctness(f, x) - - def test_jacobian_vectorize_correctness_multi_input(self): - def f(x, y): - return (x.cos() * x) @ y.sin() - - x = torch.randn(2, 3) - y = torch.randn(3, 5) - self._check_jacobian_vectorize_correctness(f, (x, y)) - - def test_jacobian_vectorize_correctness_multi_input_multi_output(self): - def f(x, y): - return (x * x) @ y, x @ (x.sum(1) * y), y.sum() - - x = torch.randn(5, 3) - y = torch.randn(3, 5) - self._check_jacobian_vectorize_correctness(f, (x, y)) - - def test_jacobian_vectorize_correctness_unrelated_outputs(self): - def f(x, y): - return x, y, x, y - - x = torch.randn(2) - y = torch.randn(3) - self._check_jacobian_vectorize_correctness(f, (x, y)) - - def test_jacobian_vectorize_correctness_zero_dim(self): - # zero-dim output - def f(x, y): - return x.sum(), y.sum(), x * y - - x = torch.randn(3) - y = torch.randn(3) - self._check_jacobian_vectorize_correctness(f, (x, y)) - - # zero-dim input - def g(x): - return torch.stack([x, x, x]) - - x = torch.randn([]) - self._check_jacobian_vectorize_correctness(g, x) - - # Mixed zero-dim input / zero-dim output - def h(x, y): - return y.sum(), x * y - - x = torch.randn([]) - y = torch.randn(1) - self._check_jacobian_vectorize_correctness(h, (x, y)) - - @unittest.skipIf(not TEST_CUDA, "test requires CUDA") - def test_jacobian_vectorize_correctness_different_devices(self): - def f(x, y): - return x * y, (x * y).cuda() - - x = torch.randn(3) - y = torch.randn(3) - self._check_jacobian_vectorize_correctness(f, (x, y)) - - def test_jacobian_vectorize_correctness_different_dtype(self): - def f(x, y): - return (x * y).float(), (x * y).double() - - x = torch.randn(3) - y = torch.randn(3) - # The Jacobian computed using forward AD has the dtype of the output - # but the Jacobian computed with reverse AD has dtype of input - self._check_jacobian_vectorize_correctness(f, (x, y), test_forward_ad=False) - - def 
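_check_jacobian_vectorize_correctness compares the default reverse-mode result against strategy="forward-mode", which is only supported together with vectorize=True. A small sketch of that comparison, assuming a PyTorch build with forward-mode AD available:

import torch
import torch.autograd.functional as autogradF

def f(x, y):
    return (x.cos() * x) @ y.sin()

x, y = torch.randn(2, 3), torch.randn(3, 5)
jac_rev = autogradF.jacobian(f, (x, y), vectorize=True)  # reverse-mode (default)
jac_fwd = autogradF.jacobian(f, (x, y), vectorize=True, strategy="forward-mode")
for r, fw in zip(jac_rev, jac_fwd):
    torch.testing.assert_close(r, fw)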
_check_hessian_vectorize_correctness(self, f, inputs): - expected = autogradF.hessian(f, inputs, vectorize=False) - result = autogradF.hessian(f, inputs, vectorize=True) - self.assertEqual(result, expected) - - result_forward_mode = autogradF.hessian(f, inputs, outer_jacobian_strategy="forward-mode", vectorize=True) - self.assertEqual(result_forward_mode, expected) - - def test_hessian_vectorize_correctness_simple(self): - def f(x): - return (3 * x ** 2).sum() - - x = torch.randn(2, 3, 5) - self._check_hessian_vectorize_correctness(f, x) - - def test_hessian_vectorize_correctness_multi_input(self): - def f(x, y, z): - return ((x.relu() * x) @ y.sin() @ z).sum() - - x = torch.randn(2, 3) - y = torch.randn(3, 5) - z = torch.randn(5, 5) - self._check_hessian_vectorize_correctness(f, (x, y, z)) - - def test_hessian_vectorize_correctness_unrelated_outputs(self): - # output unrelated to one input - def f(x, y): - return (x ** 2).sum() - - x = torch.randn(2) - y = torch.randn(3) - self._check_hessian_vectorize_correctness(f, (x, y)) - - # output unrelated to all inputs - def f(x, y): - return torch.ones([]) - - x = torch.randn(2) - y = torch.randn(3) - self._check_hessian_vectorize_correctness(f, (x, y)) - - def _test_hessian_err_check(self, vectorize): - def foo(a): - return 3 * a.narrow(0, 0, 3).exp().sum() - - def bar(a): - return 3 * a.narrow(0, 0, 3), "bar" - - def bar2(a): - return 3 * a.narrow(0, 0, 3) - - def bar3(a): - return 3 * a.narrow(0, 0, 3), 3 * a.narrow(0, 0, 3) - - inp = torch.rand(4) - with self.assertRaisesRegex(TypeError, "The inputs given to hessian must be either a Tensor"): - res = autogradF.hessian(foo, (inp, 2), vectorize=vectorize) - - with self.assertRaisesRegex(TypeError, "The outputs of the user-provided function given to hessian must"): - res = autogradF.hessian(bar, inp, vectorize=vectorize) - - err_msg_out = "The Tensor returned by the function given to hessian should contain a single element" - with self.assertRaisesRegex(RuntimeError, err_msg_out): - res = autogradF.hessian(bar2, inp, vectorize=vectorize) - - with self.assertRaisesRegex(RuntimeError, "The function given to hessian should return a single Tensor"): - res = autogradF.hessian(bar3, inp, vectorize=vectorize) - - res = autogradF.hessian(foo, inp, vectorize=vectorize) - self._assert_interleaved_struct(res, inp, inp) - - def foo(a, b): - return (3 * b.narrow(0, 0, 3) * a.narrow(0, 0, 3)).sum() - - inp = (torch.rand(4), torch.rand(5)) - - res = autogradF.hessian(foo, inp, vectorize=vectorize) - self._assert_interleaved_struct(res, inp, inp) - - def test_hessian_err_check(self): - self._test_hessian_err_check(vectorize=False) - - def test_hessian_err_check_vectorize(self): - self._test_hessian_err_check(vectorize=True) - - def test_hessian_err_check_strict(self): - def foo(a): - return a.detach().sum() - - def bar(a): - # Make a non-leaf Tensor that requires_grad but that is not connected to the input - return a.long().float().requires_grad_().clone().sum() - - def bar2(a): - # A Linear function for which the jacobian is independent of the input - return (3 * a).sum() - - inp = torch.rand(4) - with self.assertRaisesRegex(RuntimeError, "Output 0 of the user-provided function does not require gradients."): - res = autogradF.hessian(foo, inp, strict=True) - res = autogradF.hessian(foo, inp, strict=False) - self._assert_interleaved_struct(res, inp, inp) - self.assertEqual(res.abs().sum(), 0.) 
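Likewise, _check_hessian_vectorize_correctness exercises torch.autograd.functional.hessian with and without vectorize. A minimal sketch of the output layout for a scalar-valued function:

import torch
import torch.autograd.functional as autogradF

def pow_reducer(x):
    return x.pow(3).sum()

x = torch.rand(2, 2)
hess = autogradF.hessian(pow_reducer, x, vectorize=True)
# for an input of shape (2, 2) the Hessian has shape (2, 2, 2, 2);
# here only the "diagonal" entries hess[i, j, i, j] == 6 * x[i, j] are nonzero
print(hess.shape)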
- - with self.assertRaisesRegex(RuntimeError, "jacobian of the user-provided function with respect to input 0"): - res = autogradF.hessian(bar, inp, strict=True) - res = autogradF.hessian(bar, inp, strict=False) - self._assert_interleaved_struct(res, inp, inp) - self.assertEqual(res.abs().sum(), 0.) - - with self.assertRaisesRegex(RuntimeError, "jacobian of the user-provided function with respect to input 0 is"): - res = autogradF.hessian(bar2, inp, strict=True) - res = autogradF.hessian(bar2, inp, strict=False) - self._assert_interleaved_struct(res, inp, inp) - self.assertEqual(res.abs().sum(), 0.) - - def test_hessian_err_check_strict_vectorize(self): - def foo(x): - return (x ** 3).sum() - - inp = torch.rand(4) - with self.assertRaisesRegex(RuntimeError, "not supported together"): - res = autogradF.hessian(foo, inp, strict=True, vectorize=True) - - def test_hessian_no_grad(self): - def pow_reducer(x): - return x.pow(3).sum() - - inputs = torch.rand(2, 2) - with torch.no_grad(): - res = autogradF.hessian(pow_reducer, inputs) - self.assertIsNone(res[0][0].grad_fn) - self.assertIsNone(res[0][1].grad_fn) - self.assertIsNone(res[1][0].grad_fn) - self.assertIsNone(res[1][1].grad_fn) - self.assertNotEqual(res, torch.zeros(2, 2, 2)) - - with torch.no_grad(): - res = autogradF.hessian(pow_reducer, inputs, create_graph=True) - self.assertIsNotNone(res[0][0].grad_fn) - self.assertIsNotNone(res[0][1].grad_fn) - self.assertIsNotNone(res[1][0].grad_fn) - self.assertIsNotNone(res[1][1].grad_fn) - self.assertNotEqual(res, torch.zeros(2, 2, 2)) - - - def _test_hessian_output(self, vectorize): - def pow_reducer(x): - return x.pow(3).sum() - - inputs = torch.rand(2, 2) - res = autogradF.hessian(pow_reducer, inputs, vectorize=vectorize) - self._assert_interleaved_struct(res, inputs, inputs) - self.assertIsNone(res.grad_fn) - - def add_pow_reducer(x, y): - return (x + y).pow(3).sum() - - inputs = (torch.rand(2, 2), torch.rand(2, 2)) - res = autogradF.hessian(add_pow_reducer, inputs, vectorize=vectorize) - self._assert_interleaved_struct(res, inputs, inputs) - self.assertIsNone(res[0][0].grad_fn) - self.assertIsNone(res[0][1].grad_fn) - self.assertIsNone(res[1][0].grad_fn) - self.assertIsNone(res[1][1].grad_fn) - - def test_hessian_output(self): - self._test_hessian_output(vectorize=False) - - def test_hessian_output_vectorize(self): - self._test_hessian_output(vectorize=True) - - def _test_hessian_scalar(self, vectorize): - def reducer(x): - return x.sum() - inputs = torch.rand(4, 4) - res = autogradF.hessian(reducer, inputs, vectorize=vectorize) - self._assert_interleaved_struct(res, inputs, inputs) - - inputs = torch.rand([]) - res = autogradF.hessian(reducer, inputs, vectorize=vectorize) - self._assert_same_struct(res, inputs) - - def bad_reducer(x): - return x.sum().view(1, 1, 1) - inputs = torch.rand(4, 4) - res = autogradF.hessian(bad_reducer, inputs, vectorize=vectorize) - self._assert_interleaved_struct(res, inputs, inputs) - - def test_hessian_scalar(self): - return self._test_hessian_scalar(vectorize=False) - - def test_hessian_scalar_vectorize(self): - return self._test_hessian_scalar(vectorize=True) - - def _test_hessian_create_graph(self, vectorize): - def pow_reducer(x): - return x.pow(3).sum() - - inputs = torch.rand(2, 2, dtype=torch.double, requires_grad=True) - res = autogradF.hessian(pow_reducer, inputs, create_graph=True, vectorize=vectorize) - self._assert_interleaved_struct(res, inputs, inputs) - self.assertIsNotNone(res.grad_fn) - - gradcheck(lambda inp: autogradF.hessian(pow_reducer, 
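Several of the tests being moved wrap gradcheck/gradgradcheck around these APIs; both compare analytical gradients against finite differences and therefore expect double-precision inputs with requires_grad=True. A minimal standalone usage sketch:

import torch
from torch.autograd import gradcheck, gradgradcheck

def pow_reducer(x):
    return x.pow(3).sum()

inp = torch.rand(2, 2, dtype=torch.double, requires_grad=True)
assert gradcheck(pow_reducer, (inp,))      # checks first derivatives
assert gradgradcheck(pow_reducer, (inp,))  # checks second derivatives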
inp, create_graph=True, vectorize=vectorize), inputs) - gradgradcheck(lambda inp: autogradF.hessian(pow_reducer, inp, create_graph=True, vectorize=vectorize), inputs) - - def add_pow_reducer(x, y): - return (x + y).pow(3).sum() - - inputs = (torch.rand(2, 2, dtype=torch.double, requires_grad=True), - torch.rand(2, 2, dtype=torch.double, requires_grad=True)) - res = autogradF.hessian(add_pow_reducer, inputs, create_graph=True, vectorize=vectorize) - self._assert_interleaved_struct(res, inputs, inputs) - self.assertIsNotNone(res[0][0].grad_fn) - self.assertIsNotNone(res[0][1].grad_fn) - self.assertIsNotNone(res[1][0].grad_fn) - self.assertIsNotNone(res[1][1].grad_fn) - - def flatten(inp): - return tuple(el_lvl2 for el_lvl1 in inp for el_lvl2 in el_lvl1) - - gradcheck(lambda *inp: flatten(autogradF.hessian(add_pow_reducer, inp, create_graph=True, vectorize=vectorize)), inputs) - gradgradcheck(lambda *inp: flatten(autogradF.hessian(add_pow_reducer, inp, create_graph=True, vectorize=vectorize)), inputs) - - def foo(x, y): - x = x.cos() - val, hess = autogradF.hessian(add_pow_reducer, (x, y), create_graph=True, vectorize=vectorize) - - res = val[0].cos().sum() + val[1].cos().sum() + hess[0].cos().sum() - res = res + hess[1].cos().sum() + x.cos().sum() + y.cos().sum() - return res - - gradcheck(foo, inputs) - gradgradcheck(foo, inputs) - - def test_hessian_create_graph(self): - self._test_hessian_create_graph(vectorize=False) - - def test_hessian_create_graph_vectorize(self): - self._test_hessian_create_graph(vectorize=True) - - def test_vhp_err_check(self): - def foo(a): - return 3 * a.narrow(0, 0, 3).exp().sum() - - def bar(a): - return 3 * a.narrow(0, 0, 3), "bar" - - def bar2(a): - return 3 * a.narrow(0, 0, 3) - - inp = torch.rand(4) - v = torch.rand(4) - with self.assertRaisesRegex(TypeError, "The inputs given to vhp must be either a Tensor"): - res = autogradF.vhp(foo, (inp, 2), v) - - with self.assertRaisesRegex(TypeError, "The outputs of the user-provided function given to vhp must"): - res = autogradF.vhp(bar, inp, v) - - err_msg_out = "The Tensor returned by the function given to vhp should contain a single element" - with self.assertRaisesRegex(RuntimeError, err_msg_out): - res = autogradF.vhp(bar2, inp, v) - - with self.assertRaisesRegex(RuntimeError, "v has invalid size:"): - res = autogradF.vhp(foo, inp, torch.rand(5)) - - with self.assertRaisesRegex(TypeError, "The v given to vhp must be either a Tensor or a tuple of Tensors"): - res = autogradF.vhp(foo, inp, (v, 2)) - - res = autogradF.vhp(foo, inp, v) - self._assert_same_struct(res[1], inp) - - def foo(a, b): - return (3 * b.narrow(0, 0, 3) * a.narrow(0, 0, 3)).sum() - - inp = (torch.rand(4), torch.rand(5)) - v = (torch.rand(4), torch.rand(5)) - - res = autogradF.vhp(foo, inp, v) - self._assert_same_struct(res[1], inp) - - def test_vhp_err_check_strict(self): - def foo(a): - return a.detach().sum() - - def bar(a): - # Make a non-leaf Tensor that requires_grad but that is not connected to the input - return a.long().float().requires_grad_().clone().sum() - - def bar2(a): - # A Linear function for which the jacobian is independent of the input - return (3 * a).sum() +class Foo(torch.autograd.Function): + @staticmethod + def forward(ctx, x): + return x.clone() - inp = torch.rand(4) - v = torch.rand(4) - with self.assertRaisesRegex(RuntimeError, "Output 0 of the user-provided function does not require gradients."): - res = autogradF.vhp(foo, inp, v, strict=True) - res = autogradF.vhp(foo, inp, v, strict=False) - 
self._assert_same_struct(res[1], inp) - self.assertEqual(res[1].abs().sum(), 0.) + @staticmethod + def backward(ctx, gO): + return gO.clone() - with self.assertRaisesRegex(RuntimeError, "The output of the user-provided function is independent of input 0"): - res = autogradF.vhp(bar, inp, v, strict=True) - res = autogradF.vhp(bar, inp, v, strict=False) - self._assert_same_struct(res[1], inp) - self.assertEqual(res[1].abs().sum(), 0.) +def get_out(): + inp = torch.rand(2, requires_grad=True) - with self.assertRaisesRegex(RuntimeError, "jacobian of the user-provided function with respect to input 0 is"): - res = autogradF.vhp(bar2, inp, v, strict=True) - res = autogradF.vhp(bar2, inp, v, strict=False) - self._assert_same_struct(res[1], inp) - self.assertEqual(res[1].abs().sum(), 0.) + # The python function is first so that it runs + # last in the backward pass + right = Foo.apply(inp) - def test_vhp_no_grad(self): - def reducer(x): - return x.exp().sum() - inputs = torch.rand(4, 4) - v = torch.ones(4, 4) - with torch.no_grad(): - res = autogradF.vhp(reducer, inputs, v) - self.assertIsNone(res[0].grad_fn) - self.assertIsNone(res[1].grad_fn) - self.assertNotEqual(res[1], torch.zeros(4, 4)) + # An op that creates new memory + left1 = inp.clone() + # An op that saves its input + left2 = left1 ** 2 - with torch.no_grad(): - res = autogradF.vhp(reducer, inputs, v, create_graph=True) - self.assertIsNotNone(res[0].grad_fn) - self.assertIsNotNone(res[1].grad_fn) - self.assertNotEqual(res[1], torch.zeros(4, 4)) - - def test_vhp_output(self): - def foo(a): - return 3 * a.narrow(0, 0, 3).exp().sum() - - inputs = torch.rand(4, 4) - v = torch.ones(4, 4) - res = autogradF.vhp(foo, inputs, v) - self._assert_same_struct(res[1], inputs) - self.assertIsNone(res[0].grad_fn) - self.assertIsNone(res[1].grad_fn) - - def bar(a, b): - return (a + 3 * b.narrow(0, 0, 3)).exp().sum() - - inputs = (torch.rand(3), torch.rand(4)) - v = (torch.ones(3), torch.ones(4)) - out, vhp_val = autogradF.vhp(bar, inputs, v) - self._assert_same_struct(vhp_val, inputs) - self.assertIsNone(out.grad_fn) - self.assertIsNone(vhp_val[0].grad_fn) - self.assertIsNone(vhp_val[1].grad_fn) - - def test_vhp_scalar(self): - def reducer(x): - return x.sum() - inputs = torch.rand(4, 4) - v = torch.ones(4, 4) - res = autogradF.vhp(reducer, inputs, v) - self._assert_same_struct(res[1], inputs) - - inputs = torch.rand([]) - v = torch.rand([]) - res = autogradF.vhp(reducer, inputs, v) - self._assert_same_struct(res[1], inputs) - - res = autogradF.vhp(reducer, inputs) - self._assert_same_struct(res[1], inputs) - - def bad_reducer(x): - return x.sum().view(1, 1, 1) - inputs = torch.rand(4, 4) - v = torch.rand(4, 4) - res = autogradF.vhp(bad_reducer, inputs, v) - self._assert_same_struct(res[1], inputs) - - def test_vhp_create_graph(self): - def foo(a): - return 3 * a.narrow(0, 0, 3).exp().sum() - - inputs = torch.rand(4, 4, dtype=torch.double, requires_grad=True) - v = torch.ones(4, 4, dtype=torch.double, requires_grad=True) - res = autogradF.vhp(foo, inputs, v, create_graph=True) - self._assert_same_struct(res[1], inputs) - self.assertIsNotNone(res[0].grad_fn) - self.assertIsNotNone(res[1].grad_fn) - - gradcheck(lambda inp, v: autogradF.vhp(foo, inp, v, create_graph=True), (inputs, v)) - gradgradcheck(lambda inp, v: autogradF.vhp(foo, inp, v, create_graph=True), (inputs, v)) - - def bar(a, b): - return (a + 3 * b.narrow(0, 0, 3)).exp().sum() - - inputs = (torch.rand(3, dtype=torch.double, requires_grad=True), - torch.rand(4, dtype=torch.double,
requires_grad=True)) - v = (torch.ones(3, dtype=torch.double, requires_grad=True), - torch.ones(4, dtype=torch.double, requires_grad=True)) - out, vhp_val = autogradF.vhp(bar, inputs, v, create_graph=True) - self._assert_same_struct(vhp_val, inputs) - self.assertIsNotNone(out.grad_fn) - self.assertIsNotNone(vhp_val[0].grad_fn) - self.assertIsNotNone(vhp_val[1].grad_fn) - - gradcheck(lambda *args: autogradF.vhp(bar, args[:2], args[2:], create_graph=True)[1], inputs + v) - gradgradcheck(lambda *args: autogradF.vhp(bar, args[:2], args[2:], create_graph=True)[1], inputs + v) - - def foo(*args): - x, y = args[:2] - v = args[2:] - - x = x.cos() - val, grad = autogradF.vhp(bar, (x, y), v, create_graph=True) - - return val.cos() + grad[0].cos().sum() + grad[1].cos() + x.cos().sum() + y.cos() - - gradcheck(foo, inputs + v) - gradgradcheck(foo, inputs + v) - - def test_hvp_err_check(self): - def foo(a): - return 3 * a.narrow(0, 0, 3).exp().sum() - - def bar(a): - return 3 * a.narrow(0, 0, 3), "bar" - - def bar2(a): - return 3 * a.narrow(0, 0, 3) - - inp = torch.rand(4) - v = torch.rand(4) - res = autogradF.hvp(foo, inp, v) - with self.assertRaisesRegex(TypeError, "The inputs given to hvp must be either a Tensor"): - res = autogradF.hvp(foo, (inp, 2), v) - - with self.assertRaisesRegex(TypeError, "The outputs of the user-provided function given to hvp must"): - res = autogradF.hvp(bar, inp, v) - - err_msg_out = "The Tensor returned by the function given to hvp should contain a single element" - with self.assertRaisesRegex(RuntimeError, err_msg_out): - res = autogradF.hvp(bar2, inp, v) - - with self.assertRaisesRegex(RuntimeError, "v has invalid size:"): - res = autogradF.hvp(foo, inp, torch.rand(5)) - - with self.assertRaisesRegex(TypeError, "The v given to hvp must be either a Tensor or a tuple of Tensors"): - res = autogradF.hvp(foo, inp, (v, 2)) - - res = autogradF.hvp(foo, inp, v) - self._assert_same_struct(res[1], inp) - - def foo(a, b): - return (3 * b.narrow(0, 0, 3) * a.narrow(0, 0, 3)).sum() - - inp = (torch.rand(4), torch.rand(5)) - v = (torch.rand(4), torch.rand(5)) - - res = autogradF.hvp(foo, inp, v) - self._assert_same_struct(res[1], inp) - - def test_hvp_err_check_strict(self): - def foo(a): - return a.detach().sum() - - def bar(a): - # Make a non-leaf Tensor that requires_grad but that is not connected to the input - return a.long().float().requires_grad_().clone().sum() - - def bar2(a): - # A Linear function for which the jacobian is independent of the input - return (3 * a).sum() - - inp = torch.rand(4) - v = torch.rand(4) - with self.assertRaisesRegex(RuntimeError, "Output 0 of the user-provided function does not require gradients."): - res = autogradF.hvp(foo, inp, v, strict=True) - res = autogradF.hvp(foo, inp, v, strict=False) - self._assert_same_struct(res[1], inp) - self.assertEqual(res[1].abs().sum(), 0.) - - with self.assertRaisesRegex(RuntimeError, "The output of the user-provided function is independent of input 0"): - res = autogradF.hvp(bar, inp, v, strict=True) - res = autogradF.hvp(bar, inp, v, strict=False) - self._assert_same_struct(res[1], inp) - self.assertEqual(res[1].abs().sum(), 0.) - - with self.assertRaisesRegex(RuntimeError, "jacobian of the user-provided function with respect to input 0 is"): - res = autogradF.hvp(bar2, inp, v, strict=True) - res = autogradF.hvp(bar2, inp, v, strict=False) - self._assert_same_struct(res[1], inp) - self.assertEqual(res[1].abs().sum(), 0.) 
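The vhp/hvp tests being moved both contract the Hessian of a scalar function with a vector, from the left and from the right respectively; since the Hessian is symmetric the two agree. A minimal sketch, assuming a recent PyTorch:

import torch
import torch.autograd.functional as autogradF

def foo(x):
    return (3 * x).exp().sum()

inp = torch.rand(4)
v = torch.ones(4)
_, vhp_val = autogradF.vhp(foo, inp, v)  # v^T @ H
_, hvp_val = autogradF.hvp(foo, inp, v)  # H @ v
torch.testing.assert_close(vhp_val, hvp_val)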
- - def test_hvp_no_grad(self): - def reducer(x): - return x.exp().sum() - inputs = torch.rand(4, 4) - v = torch.ones(4, 4) - with torch.no_grad(): - res = autogradF.hvp(reducer, inputs, v) - self.assertIsNone(res[0].grad_fn) - self.assertIsNone(res[1].grad_fn) - self.assertNotEqual(res[1], torch.zeros(4, 4)) + # Inplace modify so that the backward for + # left2 always raises an error + left1 += 1 - with torch.no_grad(): - res = autogradF.hvp(reducer, inputs, v, create_graph=True) - self.assertIsNotNone(res[0].grad_fn) - self.assertIsNotNone(res[1].grad_fn) - self.assertNotEqual(res[1], torch.zeros(4, 4)) - - def test_hvp_output(self): - def foo(a): - return 3 * a.narrow(0, 0, 3).exp().sum() - - inputs = torch.rand(4, 4) - v = torch.ones(4, 4) - res = autogradF.hvp(foo, inputs, v) - self._assert_same_struct(res[1], inputs) - self.assertIsNone(res[0].grad_fn) - self.assertIsNone(res[1].grad_fn) - - def bar(a, b): - return (a + 3 * b.narrow(0, 0, 3)).exp().sum() - - inputs = (torch.rand(3), torch.rand(4)) - v = (torch.ones(3), torch.ones(4)) - out, hvp_val = autogradF.hvp(bar, inputs, v) - self._assert_same_struct(hvp_val, inputs) - self.assertIsNone(out.grad_fn) - self.assertIsNone(hvp_val[0].grad_fn) - self.assertIsNone(hvp_val[1].grad_fn) - - def test_hvp_scalar(self): - def reducer(x): - return x.exp().sum() - inputs = torch.rand(4, 4) - v = torch.ones(4, 4) - res = autogradF.hvp(reducer, inputs, v) - self._assert_same_struct(res[1], inputs) - - inputs = torch.rand([]) - v = torch.rand([]) - res = autogradF.hvp(reducer, inputs, v) - self._assert_same_struct(res[1], inputs) - - res = autogradF.hvp(reducer, inputs) - self._assert_same_struct(res[1], inputs) - - def bad_reducer(x): - return x.exp().sum().view(1, 1, 1) - inputs = torch.rand(4, 4) - v = torch.rand(4, 4) - res = autogradF.hvp(bad_reducer, inputs, v) - self._assert_same_struct(res[1], inputs) - - def test_hvp_create_graph(self): - def foo(a): - return 3 * a.narrow(0, 0, 3).exp().sum() - - inputs = torch.rand(4, 4, dtype=torch.double, requires_grad=True) - v = torch.ones(4, 4, dtype=torch.double, requires_grad=True) - res = autogradF.hvp(foo, inputs, v, create_graph=True) - self._assert_same_struct(res[1], inputs) - self.assertIsNotNone(res[0].grad_fn) - self.assertIsNotNone(res[1].grad_fn) - - gradcheck(lambda inp, v: autogradF.hvp(foo, inp, v, create_graph=True), (inputs, v)) - gradgradcheck(lambda inp, v: autogradF.hvp(foo, inp, v, create_graph=True), (inputs, v)) - - def bar(a, b): - return (a + 3 * b.narrow(0, 0, 3)).exp().sum() - - inputs = (torch.rand(3, dtype=torch.double, requires_grad=True), - torch.rand(4, dtype=torch.double, requires_grad=True)) - v = (torch.ones(3, dtype=torch.double, requires_grad=True), - torch.ones(4, dtype=torch.double, requires_grad=True)) - out, hvp_val = autogradF.hvp(bar, inputs, v, create_graph=True) - self._assert_same_struct(hvp_val, inputs) - self.assertIsNotNone(out.grad_fn) - self.assertIsNotNone(hvp_val[0].grad_fn) - self.assertIsNotNone(hvp_val[1].grad_fn) - - gradcheck(lambda *args: autogradF.hvp(bar, args[:2], args[2:], create_graph=True)[1], inputs + v) - gradgradcheck(lambda *args: autogradF.hvp(bar, args[:2], args[2:], create_graph=True)[1], inputs + v) - - def foo(*args): - x, y = args[:2] - v = args[2:] - - x = x.cos() - val, grad = autogradF.hvp(bar, (x, y), v, create_graph=True) - - return val.cos() + grad[0].cos().sum() + grad[1].cos() + x.cos().sum() + y.cos() - - gradcheck(foo, inputs + v) - gradgradcheck(foo, inputs + v) - - def test_jacobian_match_vjp_jvp(self): - def 
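The new subprocess test being added in this hunk relies on the standard autograd error for in-place modification of a saved tensor; the deadlock it guards against only appeared while that error was being raised. A minimal standalone sketch of the failure mode it provokes, outside the subprocess machinery:

import torch

a = torch.rand(2, requires_grad=True)
b = a.clone()
c = b ** 2   # pow saves its input for the backward pass
b += 1       # the in-place update bumps b's version counter
try:
    c.sum().backward()
except RuntimeError as err:
    # "one of the variables needed for gradient computation has been
    # modified by an inplace operation"
    print(err)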
foo(x): - return x ** 3 + x.sum() + # An op that takes both side as input. + # After running, both side's last op will be in + # the ready queue + # And the op for left will run first as it was + # executed last during the forward + out = left2 + right - inputs = torch.rand(4) - v = torch.rand(4) + return out - jac = autogradF.jacobian(foo, inputs) - jvp = autogradF.jvp(foo, inputs, v)[1] - vjp = autogradF.vjp(foo, inputs, v)[1] +# Nothing should be global variables here as, from what +# I can see, python leaks all the global objects +get_out().sum().backward() - self.assertEqual(jvp, torch.mm(jac, v.unsqueeze(1)).squeeze(1)) - self.assertEqual(vjp, torch.mm(v.unsqueeze(0), jac).squeeze(0)) +# This used to deadlock when the PyNode is being destroyed after +# the error is raised. +""" + try: + subprocess.check_output( + [sys.executable, '-c', script], + stderr=subprocess.STDOUT, + # On Windows, opening the subprocess with the default CWD makes `import torch` + # fail, so just set CWD to this script's directory + cwd=os.path.dirname(os.path.realpath(__file__)), + # It is ok to have an extra long timeout here as a timeout means the test failed + timeout=20) + except subprocess.TimeoutExpired as e: + self.fail(msg="Example code timed out! See the code sample in the test for details.") + except subprocess.CalledProcessError as e: + err_msg = "RuntimeError: one of the variables needed for gradient computation" + self.assertTrue(err_msg in e.output.decode("utf-8")) - def test_hessian_match_vhp_hvp(self): - def foo(a): - return 3 * a.narrow(0, 0, 3).exp().sum() +def index_perm_variable(shape, max_indices): + if not isinstance(shape, tuple): + shape = (shape,) - inputs = torch.rand(4) - v = torch.rand(4) + index = torch.randperm(max_indices).narrow(0, 0, reduce(mul, shape)).view(shape) + return index - hes = autogradF.hessian(foo, inputs) - hvp = autogradF.hvp(foo, inputs, v)[1] - vhp = autogradF.vhp(foo, inputs, v)[1] +def bernoulli_scalar(): + return torch.tensor(0, dtype=torch.uint8).bernoulli_() - self.assertEqual(hvp, torch.mm(hes, v.unsqueeze(1)).squeeze(1)) - self.assertEqual(vhp, torch.mm(v.unsqueeze(0), hes).squeeze(0)) class TestAutogradForwardModeBatchedGrad(TestCase): def test_out_of_place_basic(self): @@ -7814,6 +6577,18 @@ def test_metadata_check_checks_storage_numel(self): # as_strided runs without error dual.as_strided((5,), (1,), 0) + def test_metadata_check_checks_ignores_size_zero(self): + a = torch.ones(0).as_strided((0, 1,), (1, 1,), 0) + b = torch.ones(0).as_strided((0, 1,), (1, 0,), 0) + + with fwAD.dual_level(): + dual = fwAD.make_dual(a, b) + torch.diagonal(dual, offset=0) + + input = torch.rand([0, 1], dtype=torch.complex128, requires_grad=True) + func = partial(torch.diagonal, offset=0) + torch.autograd.gradcheck(func, (input,), check_forward_ad=True) + def test_metadata_check_when_primal_has_conj_bit(self): # Make sure the _has_same_storage_numel is a fallthrough, so that # conj bit does not materialize. 
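The surrounding forward-AD tests use torch.autograd.forward_ad duals: a dual packs a primal with a tangent, and each op propagates the tangent alongside the value. A minimal sketch of the API these tests assume:

import torch
import torch.autograd.forward_ad as fwAD

primal = torch.rand(3)
tangent = torch.ones(3)
with fwAD.dual_level():
    dual = fwAD.make_dual(primal, tangent)
    out = dual.sin()
    val, jvp = fwAD.unpack_dual(out)
# forward-mode derivative of sin applied to the tangent: cos(primal) * tangent
torch.testing.assert_close(jvp, primal.cos() * tangent)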
If it materializes it would @@ -7910,13 +6685,17 @@ class MySubclass(torch.Tensor): def __new__(cls, data=None): return torch.Tensor._make_subclass(cls, data) + __torch_function__ = torch._C._disabled_torch_function_impl + @classmethod def __torch_dispatch__(cls, func, types, args=(), kwargs=None): - if func == torch.ops.aten.alias: + if func.overloadpacket == torch.ops.aten.alias: counter[0] += 1 - with no_dispatch(): - return MySubclass(torch.ops.aten.alias(*args)) + # Make sure we can re-enable autograd here + with torch.overrides.enable_reentrant_dispatch(): + foo = torch.rand(1, requires_grad=True) + self.assertIsNotNone(foo.exp().grad_fn) with no_dispatch(): return func(*args, **kwargs) @@ -7925,10 +6704,11 @@ def __torch_dispatch__(cls, func, types, args=(), kwargs=None): s = MySubclass(a) with fwAD.dual_level(): + # Only the primal has "alias" called on it fwAD.make_dual(s, torch.rand_like(s)) self.assertEqual(counter[0], 1) fwAD.make_dual(torch.rand_like(s), s) - self.assertEqual(counter[0], 2) + self.assertEqual(counter[0], 1) def test_print(self): with fwAD.dual_level() as level: @@ -8299,6 +7079,35 @@ def test_min_max_median_backprops_to_all_values(self, device): self.assertEqual(x.grad.sum(), 1.) self.assertEqual((x.grad == 1 / 3).sum(), 3) + def test_scatter_index_reduce_amin_amax_backprops_to_all_values(self, device): + # tests that gradients are evenly distributed when there are multiple max/min values + # tested here instead of adding a SampleInput as the backward for this case is non-differentiable for gradgrad + # as is the case for test_min_max_median_backprops_to_all_values above + fns = (torch.scatter_reduce, torch.index_reduce) + reduces = ('amin', 'amax') + for fn, reduction in product(fns, reduces): + input = torch.randn((2, 3), device=device, dtype=torch.float64, requires_grad=True) + src = input.clone().detach_().requires_grad_(True) + idx = torch.arange(2).to(dtype=torch.long, device=device) + if fn == torch.scatter_reduce: + idx = idx.unsqueeze(-1).expand((2, 3)) + + gradcheck(fn, (input, 0, idx, src, reduction), check_batched_grad=False) + + def test_scatter_index_reduce_prod_gradgrad_error(self, device): + # test that double backward raises an error for the case where 2 zeros in src + # are scattered to the same position in self + input = torch.tensor([1.], device=device, dtype=torch.float64, requires_grad=True) + src = torch.tensor([0., 0.], device=device, dtype=torch.float64, requires_grad=True) + idx = torch.tensor([0, 0], device=device, dtype=torch.long) + + for fn in (torch.scatter_reduce, torch.index_reduce): + # check that this case passes on gradcheck + gradcheck(fn, (input, 0, idx, src, 'prod'), check_batched_grad=False) + with self.assertRaisesRegex(RuntimeError, "Double backward is unsupported for"): + gradgradcheck(fn, (input, 0, idx, src, 'prod')) + + @skipIfMps # the test doesn't work on MPS as double types are not supported def test_parameter_resize(self, device): asd = torch.nn.Parameter(torch.ones(16, dtype=torch.double, device=device)) @@ -8310,6 +7119,7 @@ def test_parameter_resize(self, device): m = torch.cat((asd, asd)) m.sum().backward() + @skipIfMps # the test doesn't work on MPS as double types are not supported @dtypes(torch.double, torch.cdouble) def test_sparse_ctor_getter_backward(self, device, dtype): # See NOTE [ Sparse: autograd and API ] on the expected behavior of this test @@ -8346,6 +7156,7 @@ def fn(v): _test(sparse_size + dense_size, len(sparse_size), nnz, device) @skipMeta + @skipIfMps @dtypes(torch.double, torch.cdouble) 
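The new scatter/index reduce tests assert that the amin/amax backward spreads the incoming gradient evenly over tied extremal values. A small sketch of the forward call they exercise, with illustrative values (assuming a PyTorch version that exposes torch.scatter_reduce):

import torch

inp = torch.zeros(1, dtype=torch.float64, requires_grad=True)
src = torch.tensor([1.0, 3.0, 3.0, 2.0], dtype=torch.float64, requires_grad=True)
idx = torch.zeros(4, dtype=torch.long)
out = torch.scatter_reduce(inp, 0, idx, src, "amax")
out.sum().backward()
print(out)       # tensor([3.], ...)
print(src.grad)  # the two tied 3.0 entries share the gradient equally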
def test_sparse_backward(self, device, dtype): class FixedGradientFunction(Function): @@ -8391,6 +7202,7 @@ def backward(ctx, grad_x): # autograd tests via common_method_invocations don't allow input tensors to # be sparse (RuntimeError: gradcheck expects all tensor inputs are dense when # check_sparse_nnz is set to False.) + @skipIfMps def test_sparse_mask_autograd(self, device): tensor = torch.randn(3, requires_grad=True, device=device) mask = torch.ones(3, device=device) @@ -8400,6 +7212,7 @@ def test_sparse_mask_autograd(self, device): converted.sum().backward() self.assertEqual(tensor.grad, mask.to_dense()) + @skipIfMps # the test doesn't work on MPS as double types are not supported def test_pyscalar_conversions(self, device): def _test_pyscalar_conversions(t, integral_conv): # integral -> integral @@ -8454,6 +7267,7 @@ def test_nonzero(tensor, value, expected): _test_pyscalar_conversions(lambda x: x.to(device), lambda x: int(x)) + @dtypesIfMPS(torch.float32) @dtypesIfCUDA(torch.half, torch.float, torch.double, torch.int8, torch.int16, torch.int32, torch.int64) @dtypes(torch.float, torch.double, torch.int8, torch.int16, torch.int32, torch.int64) def test_set_requires_grad_only_for_floats(self, device, dtype): @@ -8559,6 +7373,7 @@ def _get_cuda_memory_usage(): self.assertEqual(before, after) + @skipIfMps # the test doesn't work on MPS # TODO: see if these tests can be ported to OpInfos or moved to where's test suite def test_where_functional(self, device): x = torch.randn(5, 5, dtype=torch.double, device=device, requires_grad=True) @@ -8576,6 +7391,7 @@ def where(cond, x, y): gradcheck(where, [cond, x, y], raise_exception=True) gradgradcheck(where, [cond, x, y], [torch.randn(5, 5, 5, device=device)]) + @skipIfMps # the test doesn't work on MPS def test_where_scalar(self, device): x = torch.randn(5, 5, dtype=torch.double, device=device, requires_grad=True) scalar = 4. @@ -8641,6 +7457,7 @@ def test_rnn_backward_to_input_but_not_parameters(self, device): out.sum().backward() self.assertFalse(s.grad is None or s.grad.abs().sum().item() == 0) + @skipIfMps # the test doesn't work as randn is not supported with type long @deviceCountAtLeast(1) def test_grad_assignment(self, devices): x = torch.randn(5, 5, device=devices[0]) @@ -8678,6 +7495,7 @@ def test_grad_assignment(self, devices): with self.assertRaises(RuntimeError): x.grad = torch.randn(5, 5, device=devices[1]) + @dtypesIfMPS(torch.float32) @deviceCountAtLeast(1) @dtypes(torch.float, torch.double) def test_requires_grad_factory(self, devices, dtype): @@ -8731,7 +7549,7 @@ def test_copy_(self, device): # At the time of writing this test, copy_ is not generated from native_functions.yaml # there was a bug that bfloat16 was not recognized as floating. x = torch.randn(10, device=device, requires_grad=True) - floating_dt = [dt for dt in get_all_dtypes() if dt.is_floating_point] + floating_dt = floating_types_and(torch.half, torch.bfloat16) for dt in floating_dt: y = torch.empty(10, device=device, dtype=dt) y.copy_(x) @@ -8842,12 +7660,14 @@ def test_inplace_on_view_of_view(self, device): # modify view-of-view and backprop through base root = torch.randn(2, 2, device=device, requires_grad=True) x = root.clone() + v1 = x.narrow(0, 0, 1) v2 = v1.narrow(1, 1, 1) v2.mul_(2) x.sum().backward() self.assertEqual(root.grad.tolist(), [[1, 2], [1, 1]]) + @skipIfMps # the test doesn't work on MPS as double types are not supported def test_inplace_on_view_then_no_grad(self, device): # Perform an in-place operation on a view of a non-leaf variable. 
a = torch.ones(3, 1, dtype=torch.double, device=device, requires_grad=True) @@ -8861,6 +7681,7 @@ def test_inplace_on_view_then_no_grad(self, device): c.sum().backward() + @skipIfMps # the test doesn't work on MPS as double types are not supported def test_inplace_on_view_gradcheck(self, device): # gradcheck modifications to views a = torch.randn(4, 4, dtype=torch.double, device=device, requires_grad=True) @@ -8883,6 +7704,7 @@ def test_inplace_on_view_multiple_outputs(self, device): with self.assertRaises(RuntimeError): v1[0].mul_(2) + @skipIfMps # the test doesn't work on MPS as double types are not supported def test_inplace_on_view_of_multiple_output_view(self, device): a = torch.rand(10, dtype=torch.double, device=device, requires_grad=True).clone() b = a.unbind(0) @@ -8890,6 +7712,7 @@ def test_inplace_on_view_of_multiple_output_view(self, device): with self.assertRaises(RuntimeError): c.mul_(2) + @skipIfMps # MPS backend doesn't support double types def test_inplace_multiple_output_view_of_view(self, device): a = torch.rand(10, dtype=torch.double, device=device, requires_grad=True).clone() b = a.view_as(a) @@ -8897,6 +7720,7 @@ def test_inplace_multiple_output_view_of_view(self, device): with self.assertRaises(RuntimeError): c[0].mul_(2) + @skipIfMps # MPS backend doesn't support double types def test_inplace_on_view_makes_base_require_grad(self, device): # in-place modification to view makes base require grad a = torch.randn(4, 4, dtype=torch.double, device=device, requires_grad=False) @@ -8922,6 +7746,7 @@ def test_inplace_on_view_backprop_view(self, device): self.assertEqual(b.grad.tolist(), [5]) self.assertIsNone(a.grad) + @skipIfMps # the test doesn't work on MPS as double types are not supported def test_inplace_on_view_modify_base(self, device): # Test that an in-place operation on a base that forced it to require # grad also forces any previous views to require grad and backprop @@ -8940,6 +7765,7 @@ def fn(r): gradcheck(fn, [r]) gradgradcheck(fn, [r]) + @skipIfMps # the test doesn't work on MPS as double types are not supported def test_inplace_on_view_python(self, device): # in-place modifications of Python-autograd created view a = torch.randn(4, 4, dtype=torch.double, device=device, requires_grad=True) @@ -8996,6 +7822,7 @@ def test_inplace_on_view_multi_output_safe(self, device): with self.assertRaisesRegex(RuntimeError, error_msg): s1.mul_(s2) + @skipIfMps # the test doesn't work on MPS as double types are not supported def test_mv_grad_stride_0(self, device): # Reference: https://github.com/pytorch/pytorch/issues/38315 mat = torch.randn(2, 2, dtype=torch.double, device=device) @@ -9050,6 +7877,7 @@ def test_strided_leaf_grad_layout(self, device): (c * d).sum().backward() self.assertEqual(c.grad.stride(), (2, 1)) + @skipIfMps def test_copy_r_to_c(self, device): out_c = torch.empty(3, 2, dtype=torch.cdouble, device=device) inp_r = torch.randn(3, 2, dtype=torch.double, device=device, @@ -9062,6 +7890,16 @@ def do_test(): self.assertNotWarn(do_test) + def test_to_r_to_c(self, device): + def do_test(): + inp_r = torch.randn(3, 2, dtype=torch.double, device=device, + requires_grad=True) + out = inp_r.to(torch.complex128) + out.sum().backward() + self.assertEqual(inp_r.grad, torch.ones_like(inp_r)) + + self.assertNotWarn(do_test) + def test_non_differentiable_ops(self, device): # Just make sure the op doesn't raise an error # and resulting tensor has requires_grad=False. @@ -9693,6 +8531,7 @@ def fn(x1, x2): # the suppressions. 
from autograd.test_complex import TestAutogradComplex # noqa: F401 +from autograd.test_functional import TestAutogradFunctional # noqa: F401 # e.g., TestAutogradDeviceTypeCPU and TestAutogradDeviceTypeCUDA instantiate_device_type_tests( diff --git a/test/test_binary_ufuncs.py b/test/test_binary_ufuncs.py index 0bd2a9e4d527..a4d3db0ff82a 100644 --- a/test/test_binary_ufuncs.py +++ b/test/test_binary_ufuncs.py @@ -13,229 +13,74 @@ import operator from functools import partial +import torch.autograd.forward_ad as fwAD from torch._six import inf, nan from torch.testing._internal.common_utils import ( - TestCase, slowTest, iter_indices, TEST_WITH_ASAN, run_tests, gradcheck, - torch_to_numpy_dtype_dict, numpy_to_torch_dtype_dict, TEST_SCIPY, set_default_dtype) + TestCase, + slowTest, + iter_indices, + TEST_WITH_ASAN, + run_tests, + gradcheck, + torch_to_numpy_dtype_dict, + numpy_to_torch_dtype_dict, + TEST_SCIPY, + set_default_dtype, +) from torch.testing._internal.common_device_type import ( - expectedFailureMeta, instantiate_device_type_tests, onlyCUDA, onlyCPU, dtypes, dtypesIfCUDA, - dtypesIfCPU, deviceCountAtLeast, precisionOverride, onlyNativeDeviceTypes, - skipCUDAIfRocm, skipIf, ops, OpDTypes) + expectedFailureMeta, + instantiate_device_type_tests, + onlyCUDA, + onlyCPU, + dtypes, + dtypesIfCUDA, + dtypesIfCPU, + deviceCountAtLeast, + precisionOverride, + onlyNativeDeviceTypes, + skipIf, + ops, + OpDTypes, + skipMeta, +) from torch.testing import make_tensor from torch.testing._internal.common_dtype import ( - all_types_and_complex_and, integral_types_and, get_all_dtypes, get_all_int_dtypes, get_all_math_dtypes, - get_all_complex_dtypes, get_all_fp_dtypes, + all_types_and_complex_and, + all_types_and, + integral_types, + complex_types, + integral_types_and, + floating_types_and, + floating_and_complex_types, + get_all_math_dtypes, ) from torch.testing._internal.common_methods_invocations import ( - binary_ufuncs, _NOTHING) + binary_ufuncs, + binary_ufuncs_and_refs, + _NOTHING, + generate_elementwise_binary_tensors, + generate_elementwise_binary_small_value_tensors, + generate_elementwise_binary_large_value_tensors, + generate_elementwise_binary_extremal_value_tensors, + generate_elementwise_binary_broadcasting_tensors, + generate_elementwise_binary_with_scalar_samples, +) if TEST_SCIPY: import scipy.special import scipy.integrate -# TODO: remove this -def _generate_input(shape, dtype, device, with_extremal): - if shape == (): - x = torch.tensor((), dtype=dtype, device=device) - else: - if dtype.is_floating_point or dtype.is_complex: - # work around torch.randn not being implemented for bfloat16 - if dtype == torch.bfloat16: - x = torch.randn(*shape, device=device) * random.randint(30, 100) - x = x.to(torch.bfloat16) - else: - x = torch.randn(*shape, dtype=dtype, device=device) * random.randint(30, 100) - x[torch.randn(*shape) > 0.5] = 0 - if with_extremal and dtype.is_floating_point: - # Use extremal values - x[torch.randn(*shape) > 0.5] = float('nan') - x[torch.randn(*shape) > 0.5] = float('inf') - x[torch.randn(*shape) > 0.5] = float('-inf') - elif with_extremal and dtype.is_complex: - x[torch.randn(*shape) > 0.5] = complex('nan') - x[torch.randn(*shape) > 0.5] = complex('inf') - x[torch.randn(*shape) > 0.5] = complex('-inf') - elif dtype == torch.bool: - x = torch.zeros(shape, dtype=dtype, device=device) - x[torch.randn(*shape) > 0.5] = True - else: - x = torch.randint(15, 100, shape, dtype=dtype, device=device) - - return x - -# TODO: refactor this out -# Converts half/bfloat16 dtype 
to float when device is cpu -def _convert_t(dtype, device): - if device == 'cpu' and dtype in {torch.half, torch.bfloat16}: - return torch.float - return dtype - -# TODO: revise the tests to use make_tensor in common_utils.py instead -# Returns a tensor of the requested shape, dtype, and device -# Requesting a half CPU tensor returns a float CPU tensor with -# values representable by a half. -# Initialization uses randint for non-float types and randn for float types. -def _make_tensor(shape, dtype, device, fill_ones=False) -> torch.Tensor: - # Returns a tensor filled with ones - if fill_ones: - return torch.ones(*shape, dtype=_convert_t(dtype, device), device=device) - - # Returns a tensor with random integer values - if not (dtype.is_floating_point or dtype.is_complex): - t = torch.randint(0, 10, shape, device=device) - if dtype != torch.uint8: - t = t - 5 # generate negative values also - return t.to(_convert_t(dtype, device)) - - # Populates the CPU tensor with floats representable as half/bfloat16 - if dtype == torch.half and device == 'cpu': - return torch.randn(*shape, dtype=torch.float, device=device).half().float() - if dtype == torch.bfloat16 and device == 'cpu': - return torch.randn(*shape, dtype=torch.float, device=device).bfloat16().float() - - # Default: returns a tensor with random float values - return torch.randn(shape, dtype=dtype, device=device).to(dtype=dtype) - # TODO: update to use opinfos consistently class TestBinaryUfuncs(TestCase): # Generic tests for elementwise binary (AKA binary universal (u) functions (funcs)) # TODO: below contiguous tensor results are compared with a variety of noncontiguous results. # It would be interesting to have the lhs and rhs have different discontiguities. - # Returns a pair of iterables of contiguous tensors on the requested device - # and with the requested dtype. - # - # This function is intended to test the non-vectorized and vectorized code - # paths of unary functions, as well as their handling of odd tensor - # sizes (like zero-dim tensors and tensors with zero elements). - # - # Each iterable will include an a tensor with no elements, - # zero dim (scalar) tensors, small 1D tensors, a medium 1D tensor, and - # a large 2D tensor. - def _generate_numeric_tensors(self, op, *, device, dtype, lhs_kwargs, rhs_kwargs): - lhs_tensors = [] - rhs_tensors = [] - - shapes = ((0,), # tensors with no elements - (1, 0, 3), - # zero dim (scalar) tensor - (), - # small 1D tensor - (20,), - # medium 1D tensor - (812,), - # large 2D tensor - (1029, 917)) - - for kwargs, tensors in ((lhs_kwargs, lhs_tensors), (rhs_kwargs, rhs_tensors)): - for shape in shapes: - tensors.append(make_tensor(shape, device, dtype, **kwargs)) - - return lhs_tensors, rhs_tensors - - # Returns a pair of iterables of contiguous tensors on the requested device and with - # the requested dtype. - # - # Unlike the previous function, the values in these tensors are specified manually. 
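The hand-rolled _generate_input/_make_tensor helpers deleted above are superseded by torch.testing.make_tensor, which already handles dtype/device placement, value ranges, and noncontiguous layouts. A short usage sketch:

import torch
from torch.testing import make_tensor

t = make_tensor((5, 7), dtype=torch.float32, device="cpu", low=-9, high=9)
u = make_tensor((5, 7), dtype=torch.int64, device="cpu", noncontiguous=True)
print(t.dtype, t.min() >= -9, u.is_contiguous())  # torch.float32 tensor(True) False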
- def _generate_interesting_small_valued_tensors(self, device, dtype): - # defines interesting values - _unsigned_int_vals = (0, 1, 55, 127, 128, 190, 210, 220, 254, 255, 256) - _int_vals = (0, -1, 1, -55, 55, -127, 127, -128, 128) - _float_vals = (0., - -.001, .001, - -.25, .25, - -1., 1., - -math.pi / 2, math.pi / 2, - -math.pi + .00001, math.pi - .00001, - -math.pi, math.pi, - -math.pi - .00001, math.pi + .00001) - - l_vals = [] - r_vals = [] - - if dtype.is_floating_point: - prod = product(_float_vals, _float_vals) - elif dtype.is_complex: - complex_vals = product(_float_vals, _float_vals) - # Note the use of list is required here or the map generator will be - # emptied by the following product and it won't produce the desired cross-product - complex_vals = list(map(lambda x: complex(*x), complex_vals)) - prod = product(complex_vals, complex_vals) - elif dtype in (torch.int8, torch.int16, torch.int32, torch.int64): - prod = product(_int_vals, _int_vals) - elif dtype is torch.uint8: - prod = product(_unsigned_int_vals, _unsigned_int_vals) - else: - raise ValueError("Unsupported dtype!") - - for l, r in prod: - l_vals.append(l) - r_vals.append(r) - - lhs = torch.tensor(l_vals, device=device, dtype=dtype) - rhs = torch.tensor(r_vals, device=device, dtype=dtype) - - return lhs, rhs - - def _generate_interesting_large_valued_tensors(self, device, dtype): - _large_int_vals = (-1113, 1113, -10701, 10701) - _large_float16_vals = (-501, 501, -1001.2, 1001.2, -13437.7, 13437.7) - _large_float_vals = _large_float16_vals + (-4988429.2, 4988429.2, -1e20, 1e20) - - l_vals = [] - r_vals = [] - - if dtype == torch.float16: - prod = product(_large_float16_vals, _large_float16_vals) - elif dtype.is_floating_point: - prod = product(_large_float_vals, _large_float_vals) - elif dtype.is_complex: - complex_vals = product(_large_float_vals, _large_float_vals) - # Note the use of list is required here or the map generator will be - # emptied by the following product and it won't produce the desired cross-product - complex_vals = list(map(lambda x: complex(*x), complex_vals)) - prod = product(complex_vals, complex_vals) - elif dtype in (torch.int16, torch.int32, torch.int64): - prod = product(_large_int_vals, _large_int_vals) - else: - raise ValueError("Unsupported dtype!") - - for l, r in prod: - l_vals.append(l) - r_vals.append(r) - lhs = torch.tensor(l_vals, device=device, dtype=dtype) - rhs = torch.tensor(r_vals, device=device, dtype=dtype) - - return lhs, rhs - - def _generate_interesting_extremal_valued_tensors(self, device, dtype): - _float_extremals = (float('inf'), float('-inf'), float('nan')) - - l_vals = [] - r_vals = [] - - if dtype.is_floating_point: - prod = product(_float_extremals, _float_extremals) - elif dtype.is_complex: - complex_vals = product(_float_extremals, _float_extremals) - # Note the use of list is required here or the map generator will be - # emptied by the following product and it won't produce the desired cross-product - complex_vals = list(map(lambda x: complex(*x), complex_vals)) - prod = product(complex_vals, complex_vals) - else: - raise ValueError("Unsupported dtype!") - - for l, r in prod: - l_vals.append(l) - r_vals.append(r) - lhs = torch.tensor(l_vals, device=device, dtype=dtype) - rhs = torch.tensor(r_vals, device=device, dtype=dtype) - - return lhs, rhs - # Helper for comparing torch tensors and NumPy arrays # TODO: should this or assertEqual also validate that strides are equal? 
- def assertEqualHelper(self, actual, expected, msg, *, dtype, exact_dtype=True, **kwargs): + def assertEqualHelper( + self, actual, expected, msg, *, dtype, exact_dtype=True, **kwargs + ): assert isinstance(actual, torch.Tensor) # Some NumPy functions return scalars, not arrays @@ -249,71 +94,104 @@ def assertEqualHelper(self, actual, expected, msg, *, dtype, exact_dtype=True, * # Also ops like scipy.special.erf, scipy.special.erfc, etc, promote float16 # to float32 if expected.dtype == np.float32: - assert actual.dtype in (torch.float16, torch.bfloat16, torch.float32) + assert actual.dtype in ( + torch.float16, + torch.bfloat16, + torch.float32, + ) else: assert expected.dtype == torch_to_numpy_dtype_dict[actual.dtype] - self.assertEqual(actual, - torch.from_numpy(expected).to(actual.dtype), - msg, - exact_device=False, - **kwargs) + self.assertEqual( + actual, + torch.from_numpy(expected).to(actual.dtype), + msg, + exact_device=False, + **kwargs, + ) else: self.assertEqual(actual, expected, msg, exact_device=False, **kwargs) # Tests that the function and its (array-accepting) reference produce the same # values on given tensors - def _test_reference_numerics(self, dtype, op, tensor_pairs, equal_nan=True): - def _helper_reference_numerics(expected, actual, msg, exact_dtype, equal_nan=True): - if not torch.can_cast(numpy_to_torch_dtype_dict[expected.dtype.type], dtype): + def _test_reference_numerics(self, dtype, op, gen, equal_nan=True): + def _helper_reference_numerics( + expected, actual, msg, exact_dtype, equal_nan=True + ): + if not torch.can_cast( + numpy_to_torch_dtype_dict[expected.dtype.type], dtype + ): exact_dtype = False if dtype is torch.bfloat16 and expected.dtype == np.float32: # Ref: https://github.com/pytorch/pytorch/blob/master/torch/testing/_internal/common_utils.py#L1149 - self.assertEqualHelper(actual, expected, msg, dtype=dtype, - exact_dtype=exact_dtype, rtol=16e-3, atol=1e-5) + self.assertEqualHelper( + actual, + expected, + msg, + dtype=dtype, + exact_dtype=exact_dtype, + rtol=16e-3, + atol=1e-5, + ) else: - self.assertEqualHelper(actual, expected, msg, dtype=dtype, equal_nan=equal_nan, exact_dtype=exact_dtype) - - for l, r in tensor_pairs: - if dtype is torch.bfloat16: - l_numpy = l.cpu().to(torch.float32).numpy() - r_numpy = r.cpu().to(torch.float32).numpy() - else: - l_numpy = l.cpu().numpy() - r_numpy = r.cpu().numpy() + self.assertEqualHelper( + actual, + expected, + msg, + dtype=dtype, + equal_nan=equal_nan, + exact_dtype=exact_dtype, + ) + + for sample in gen: + # Each sample input acquired from the generator is just one lhs tensor + # and one rhs tensor + l = sample.input + r = sample.args[0] + + numpy_sample = sample.numpy() + l_numpy = numpy_sample.input + r_numpy = numpy_sample.args[0] actual = op(l, r) expected = op.ref(l_numpy, r_numpy) # Crafts a custom error message for smaller, printable tensors - if l.numel() < 10 and r.numel() < 10: - msg = ("Failed to produce expected results! Input lhs tensor was" - " {0}, rhs tensor was {1}, torch result is {2}, and reference result is" - " {3}.").format(l, r, actual, expected) + def _numel(x): + if isinstance(x, torch.Tensor): + return x.numel() + # Assumes x is a scalar + return 1 + + if _numel(l) < 10 and _numel(r) < 10: + msg = ( + "Failed to produce expected results! Input lhs tensor was" + " {0}, rhs tensor was {1}, torch result is {2}, and reference result is" + " {3}." 
+ ).format(l, r, actual, expected) else: msg = None exact_dtype = True if isinstance(actual, torch.Tensor): - _helper_reference_numerics(expected, actual, msg, exact_dtype, equal_nan) + _helper_reference_numerics( + expected, actual, msg, exact_dtype, equal_nan + ) else: for x, y in zip(expected, actual): # testing multi-outputs results _helper_reference_numerics(x, y, msg, exact_dtype, equal_nan) # The following tests only apply to elementwise binary operators with references - binary_ufuncs_with_references = list(filter(lambda op: op.ref is not None and op.ref is not _NOTHING, binary_ufuncs)) + binary_ufuncs_with_references = list( + filter(lambda op: op.ref is not None and op.ref is not _NOTHING, binary_ufuncs) + ) @ops(binary_ufuncs_with_references) def test_reference_numerics(self, device, dtype, op): - lhs_tensors, rhs_tensors = self._generate_numeric_tensors(op, - device=device, - dtype=dtype, - lhs_kwargs=op.lhs_make_tensor_kwargs, - rhs_kwargs=op.rhs_make_tensor_kwargs) - - self._test_reference_numerics(dtype, op, zip(lhs_tensors, rhs_tensors), equal_nan=True) + gen = generate_elementwise_binary_tensors(op, device=device, dtype=dtype) + self._test_reference_numerics(dtype, op, gen, equal_nan=True) # runtime error: 128 is outside the range of representable values of type 'signed char' @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") @@ -322,81 +200,84 @@ def test_reference_numerics_small_values(self, device, dtype, op): if dtype is torch.bool: self.skipTest("Doesn't support bool!") - lhs, rhs = self._generate_interesting_small_valued_tensors(device, dtype) - self._test_reference_numerics(dtype, op, ((lhs, rhs),), equal_nan=True) + gen = generate_elementwise_binary_small_value_tensors( + op, device=device, dtype=dtype + ) + self._test_reference_numerics(dtype, op, gen, equal_nan=True) # TODO: review if this skip is necessary @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") - @ops(binary_ufuncs_with_references, - allowed_dtypes=(torch.int16, torch.int32, torch.int64, torch.float16, - torch.bfloat16, torch.float32, torch.float64, torch.complex64, torch.complex128)) + @ops( + binary_ufuncs_with_references, + allowed_dtypes=( + torch.int16, + torch.int32, + torch.int64, + torch.float16, + torch.bfloat16, + torch.float32, + torch.float64, + torch.complex64, + torch.complex128, + ), + ) def test_reference_numerics_large_values(self, device, dtype, op): - lhs, rhs = self._generate_interesting_large_valued_tensors(device, dtype) - self._test_reference_numerics(dtype, op, ((lhs, rhs),), equal_nan=True) + gen = generate_elementwise_binary_large_value_tensors( + op, device=device, dtype=dtype + ) + self._test_reference_numerics(dtype, op, gen, equal_nan=True) # TODO: review if this skip is necessary @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") - @ops(binary_ufuncs_with_references, - allowed_dtypes=(torch.float16, torch.bfloat16, torch.float32, - torch.float64, torch.complex64, torch.complex128)) + @ops( + binary_ufuncs_with_references, + allowed_dtypes=( + torch.float16, + torch.bfloat16, + torch.float32, + torch.float64, + torch.complex64, + torch.complex128, + ), + ) def test_reference_numerics_extremal_values(self, device, dtype, op): - lhs, rhs = self._generate_interesting_extremal_valued_tensors(device, dtype) - self._test_reference_numerics(dtype, op, ((lhs, rhs),), equal_nan=True) + gen = generate_elementwise_binary_extremal_value_tensors( + op, device=device, dtype=dtype + ) + self._test_reference_numerics(dtype, op, gen, equal_nan=True) # tests broadcasting and 
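The reformatted helper above keeps the same reference-numerics idea: run the torch op and its NumPy reference on identical values and compare, with dtype slack where NumPy promotes. A minimal sketch of the pattern, outside the OpInfo machinery:

import numpy as np
import torch

lhs = torch.rand(16)
rhs = torch.rand(16)
actual = torch.add(lhs, rhs)
expected = np.add(lhs.numpy(), rhs.numpy())
torch.testing.assert_close(actual, torch.from_numpy(expected))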
noncontiguous broadcasting behavior - @ops(binary_ufuncs_with_references, allowed_dtypes=(torch.long, torch.float32,)) + @ops( + binary_ufuncs_with_references, + allowed_dtypes=( + torch.long, + torch.float32, + ), + ) def test_broadcasting(self, device, dtype, op): - shapes = ( - ((1,), ()), - ((2,), ()), - ((1,), (2,)), - ((2,), (2,)), - ((2, 1), (2,)), - ((1, 2), (2,)), - ((3, 2), (2,)), - ((3, 2), (3, 2)), - ((1, 3, 2), (2,)), - ((1, 3, 2), (3, 2)), - ((3, 1, 2), (3, 2)), - ((1, 3, 2), (1, 3, 2)), - ((2, 3, 2), ()), - ((2, 3, 2), (2, 3, 2)), - ((3, 1, 2), (1, 3, 2)), - ) - - for shape, noncontiguous in product(shapes, [True, False]): - shape_lhs, shape_rhs = shape - lhs = make_tensor(shape_lhs, device=device, dtype=dtype, - noncontiguous=noncontiguous, **op.lhs_make_tensor_kwargs) - rhs = make_tensor(shape_rhs, device=device, dtype=dtype, - noncontiguous=noncontiguous, **op.rhs_make_tensor_kwargs) - - actual = op(lhs, rhs) - expected = op.ref(lhs.cpu().numpy(), rhs.cpu().numpy()) - - self.assertEqual(actual, expected, exact_dtype=False) - - @ops(binary_ufuncs, allowed_dtypes=(torch.long, torch.float32,)) - def test_broadcast_python_scalar(self, device, dtype, op): - for shape_lhs in ((), (1,), (2,), (1, 2, 3),): - lhs = make_tensor(shape_lhs, device=device, dtype=dtype, **op.lhs_make_tensor_kwargs) - - rhs_tensor = make_tensor((), device=device, dtype=dtype, **op.rhs_make_tensor_kwargs) - rhs_expanded = rhs_tensor.expand_as(lhs) - rhs_scalar = rhs_tensor.item() - - expected = op(lhs, rhs_expanded) - - actual_tensor = op(lhs, rhs_tensor) - actual_scalar = op(lhs, rhs_scalar) - - self.assertEqual(actual_tensor, expected) - self.assertEqual(actual_scalar, expected) + gen = generate_elementwise_binary_broadcasting_tensors( + op, device=device, dtype=dtype + ) + self._test_reference_numerics(dtype, op, gen, equal_nan=True) + + @ops( + binary_ufuncs_with_references, + allowed_dtypes=(torch.long, torch.float32, torch.complex64), + ) + def test_scalar_support(self, device, dtype, op): + gen = generate_elementwise_binary_with_scalar_samples( + op, device=device, dtype=dtype + ) + self._test_reference_numerics(dtype, op, gen, equal_nan=True) @ops(binary_ufuncs) def test_contig_vs_every_other(self, device, dtype, op): - lhs = make_tensor((1026,), device=device, dtype=dtype, **op.lhs_make_tensor_kwargs) - rhs = make_tensor((1026,), device=device, dtype=dtype, **op.rhs_make_tensor_kwargs) + lhs = make_tensor( + (1026,), device=device, dtype=dtype, **op.lhs_make_tensor_kwargs + ) + rhs = make_tensor( + (1026,), device=device, dtype=dtype, **op.rhs_make_tensor_kwargs + ) lhs_non_contig = lhs[::2] rhs_non_contig = rhs[::2] @@ -413,8 +294,12 @@ def test_contig_vs_every_other(self, device, dtype, op): @ops(binary_ufuncs) def test_contig_vs_transposed(self, device, dtype, op): - lhs = make_tensor((789, 357), device=device, dtype=dtype, **op.lhs_make_tensor_kwargs) - rhs = make_tensor((789, 357), device=device, dtype=dtype, **op.rhs_make_tensor_kwargs) + lhs = make_tensor( + (789, 357), device=device, dtype=dtype, **op.lhs_make_tensor_kwargs + ) + rhs = make_tensor( + (789, 357), device=device, dtype=dtype, **op.rhs_make_tensor_kwargs + ) lhs_non_contig = lhs.T rhs_non_contig = rhs.T @@ -433,13 +318,21 @@ def test_contig_vs_transposed(self, device, dtype, op): def test_non_contig(self, device, dtype, op): shapes = ((5, 7), (1024,)) for shape in shapes: - lhs = make_tensor(shape, device, dtype, **op.lhs_make_tensor_kwargs) - rhs = make_tensor(shape, device, dtype, **op.rhs_make_tensor_kwargs) + lhs = 
make_tensor( + shape, dtype=dtype, device=device, **op.lhs_make_tensor_kwargs + ) + rhs = make_tensor( + shape, dtype=dtype, device=device, **op.rhs_make_tensor_kwargs + ) - lhs_non_contig = torch.empty(shape + (2,), device=device, dtype=dtype)[..., 0] + lhs_non_contig = torch.empty(shape + (2,), device=device, dtype=dtype)[ + ..., 0 + ] lhs_non_contig.copy_(lhs) - rhs_non_contig = torch.empty(shape + (2,), device=device, dtype=dtype)[..., 0] + rhs_non_contig = torch.empty(shape + (2,), device=device, dtype=dtype)[ + ..., 0 + ] rhs_non_contig.copy_(rhs) self.assertTrue(lhs.is_contiguous()) @@ -455,8 +348,12 @@ def test_non_contig(self, device, dtype, op): @ops(binary_ufuncs) def test_non_contig_index(self, device, dtype, op): shape = (2, 2, 1, 2) - lhs = make_tensor(shape, device, dtype, **op.lhs_make_tensor_kwargs) - rhs = make_tensor(shape, device, dtype, **op.rhs_make_tensor_kwargs) + lhs = make_tensor( + shape, dtype=dtype, device=device, **op.lhs_make_tensor_kwargs + ) + rhs = make_tensor( + shape, dtype=dtype, device=device, **op.rhs_make_tensor_kwargs + ) lhs_non_contig = lhs[:, 1, ...] lhs = lhs_non_contig.contiguous() @@ -478,8 +375,12 @@ def test_non_contig_index(self, device, dtype, op): def test_non_contig_expand(self, device, dtype, op): shapes = [(1, 3), (1, 7), (5, 7)] for shape in shapes: - lhs = make_tensor(shape, device, dtype, **op.lhs_make_tensor_kwargs) - rhs = make_tensor(shape, device, dtype, **op.rhs_make_tensor_kwargs) + lhs = make_tensor( + shape, dtype=dtype, device=device, **op.lhs_make_tensor_kwargs + ) + rhs = make_tensor( + shape, dtype=dtype, device=device, **op.rhs_make_tensor_kwargs + ) lhs_non_contig = lhs.clone().expand(3, -1, -1) rhs_non_contig = rhs.clone().expand(3, -1, -1) @@ -498,8 +399,12 @@ def test_non_contig_expand(self, device, dtype, op): @ops(binary_ufuncs) def test_contig_size1(self, device, dtype, op): shape = (5, 100) - lhs = make_tensor(shape, device, dtype, **op.lhs_make_tensor_kwargs) - rhs = make_tensor(shape, device, dtype, **op.rhs_make_tensor_kwargs) + lhs = make_tensor( + shape, dtype=dtype, device=device, **op.lhs_make_tensor_kwargs + ) + rhs = make_tensor( + shape, dtype=dtype, device=device, **op.rhs_make_tensor_kwargs + ) lhs = lhs[:1, :50] lhs_alt = torch.empty(lhs.size(), device=device, dtype=dtype) @@ -522,8 +427,12 @@ def test_contig_size1(self, device, dtype, op): @ops(binary_ufuncs) def test_contig_size1_large_dim(self, device, dtype, op): shape = (5, 2, 3, 1, 4, 5, 3, 2, 1, 2, 3, 4) - lhs = make_tensor(shape, device, dtype, **op.lhs_make_tensor_kwargs) - rhs = make_tensor(shape, device, dtype, **op.rhs_make_tensor_kwargs) + lhs = make_tensor( + shape, dtype=dtype, device=device, **op.lhs_make_tensor_kwargs + ) + rhs = make_tensor( + shape, dtype=dtype, device=device, **op.rhs_make_tensor_kwargs + ) lhs = lhs[:1, :, :, :, :, :, :, :, :, :, :, :] lhs_alt = torch.empty(lhs.size(), device=device, dtype=dtype) @@ -546,8 +455,12 @@ def test_contig_size1_large_dim(self, device, dtype, op): @ops(binary_ufuncs) def test_batch_vs_slicing(self, device, dtype, op): shape = (32, 512) - lhs = make_tensor(shape, device, dtype, **op.lhs_make_tensor_kwargs) - rhs = make_tensor(shape, device, dtype, **op.rhs_make_tensor_kwargs) + lhs = make_tensor( + shape, dtype=dtype, device=device, **op.lhs_make_tensor_kwargs + ) + rhs = make_tensor( + shape, dtype=dtype, device=device, **op.rhs_make_tensor_kwargs + ) expected = op(lhs, rhs) @@ -562,40 +475,63 @@ def test_batch_vs_slicing(self, device, dtype, op): # NOTE: because the cross-product of 
all possible type promotion tests is huge, this # just spot checks some handwritten cases. # NOTE: It may be possible to refactor this test into something simpler - @ops(binary_ufuncs, dtypes=OpDTypes.none) + @ops(binary_ufuncs_and_refs, dtypes=OpDTypes.none) def test_type_promotion(self, device, op): supported_dtypes = op.supported_dtypes(torch.device(device).type) + make_lhs = partial( + make_tensor, (5,), device=device, **op.lhs_make_tensor_kwargs + ) + make_rhs = partial( + make_tensor, (5,), device=device, **op.rhs_make_tensor_kwargs + ) + + make_lhs_scalar_tensor = partial( + make_tensor, (), device='cpu', **op.lhs_make_tensor_kwargs + ) + make_rhs_scalar_tensor = partial( + make_tensor, (), device='cpu', **op.rhs_make_tensor_kwargs + ) + def _supported(dtypes): return all(map(lambda x: x in supported_dtypes, dtypes)) # int x int type promotion if _supported((torch.int16, torch.int32, torch.int64)): - lhs_i16 = make_tensor((5,), device=device, dtype=torch.int16, **op.lhs_make_tensor_kwargs) - lhs_i32 = make_tensor((5,), device=device, dtype=torch.int32, **op.lhs_make_tensor_kwargs) - lhs_i64 = make_tensor((5,), device=device, dtype=torch.int64, **op.lhs_make_tensor_kwargs) - - rhs_i16 = make_tensor((5,), device=device, dtype=torch.int16, **op.rhs_make_tensor_kwargs) - rhs_i32 = make_tensor((5,), device=device, dtype=torch.int32, **op.rhs_make_tensor_kwargs) - rhs_i64 = make_tensor((5,), device=device, dtype=torch.int64, **op.rhs_make_tensor_kwargs) + lhs_i16 = make_lhs(dtype=torch.int16) + lhs_i32 = make_lhs(dtype=torch.int32) + lhs_i64 = make_lhs(dtype=torch.int64) + rhs_i16 = make_rhs(dtype=torch.int16) + rhs_i32 = make_rhs(dtype=torch.int32) + rhs_i64 = make_rhs(dtype=torch.int64) if op.promotes_int_to_float: default_dtype = torch.get_default_dtype() self.assertEqual(op(lhs_i16, rhs_i32).dtype, default_dtype) - self.assertEqual(op(lhs_i16, rhs_i32), op(lhs_i16.to(default_dtype), rhs_i32.to(default_dtype))) + self.assertEqual( + op(lhs_i16, rhs_i32), + op(lhs_i16.to(default_dtype), rhs_i32.to(default_dtype)), + ) self.assertEqual(op(lhs_i32, rhs_i64).dtype, default_dtype) - self.assertEqual(op(lhs_i32, rhs_i64), op(lhs_i32.to(default_dtype), rhs_i64.to(default_dtype))) + self.assertEqual( + op(lhs_i32, rhs_i64), + op(lhs_i32.to(default_dtype), rhs_i64.to(default_dtype)), + ) elif op.always_returns_bool: self.assertEqual(op(lhs_i16, rhs_i32).dtype, torch.bool) self.assertEqual(op(lhs_i32, rhs_i64).dtype, torch.bool) else: # standard type promotion self.assertEqual(op(lhs_i16, rhs_i32).dtype, torch.int32) - self.assertEqual(op(lhs_i16, rhs_i32), op(lhs_i16.to(torch.int32), rhs_i32)) + self.assertEqual( + op(lhs_i16, rhs_i32), op(lhs_i16.to(torch.int32), rhs_i32) + ) self.assertEqual(op(lhs_i32, rhs_i64).dtype, torch.int64) - self.assertEqual(op(lhs_i32, rhs_i64), op(lhs_i32.to(torch.int64), rhs_i64)) + self.assertEqual( + op(lhs_i32, rhs_i64), op(lhs_i32.to(torch.int64), rhs_i64) + ) if op.supports_out: if not op.promotes_int_to_float: @@ -606,7 +542,6 @@ def _supported(dtypes): out = torch.empty_like(lhs_i16) self.assertEqual(op(lhs_i32, rhs_i64, out=out).dtype, torch.int16) - self.assertEqual(op(lhs_i32, rhs_i64), out, exact_dtype=False) else: # Float outs cannot be safely cast to integer types with self.assertRaisesRegex(RuntimeError, "can't be cast"): @@ -615,16 +550,18 @@ def _supported(dtypes): if not op.always_returns_bool: # Neither integer nor float outs can be cast to bool with self.assertRaisesRegex(RuntimeError, "can't be cast"): - op(lhs_i16, rhs_i32, 
out=torch.empty_like(lhs_i64, dtype=torch.bool)) + op( + lhs_i16, + rhs_i32, + out=torch.empty_like(lhs_i64, dtype=torch.bool), + ) # All these output types can be cast to any float or complex type out = torch.empty_like(lhs_i64, dtype=torch.float16) self.assertEqual(op(lhs_i16, rhs_i32, out=out).dtype, torch.float16) - self.assertEqual(op(lhs_i16, rhs_i32), out, exact_dtype=False) out = torch.empty_like(lhs_i64, dtype=torch.bfloat16) self.assertEqual(op(lhs_i16, rhs_i32, out=out).dtype, torch.bfloat16) - self.assertEqual(op(lhs_i16, rhs_i32), out, exact_dtype=False) out = torch.empty_like(lhs_i64, dtype=torch.float32) self.assertEqual(op(lhs_i16, rhs_i32, out=out).dtype, torch.float32) @@ -636,23 +573,24 @@ def _supported(dtypes): # float x float type promotion if _supported((torch.float32, torch.float64)): - lhs_f32 = make_tensor((5,), device=device, dtype=torch.float32, **op.lhs_make_tensor_kwargs) - lhs_f64 = make_tensor((5,), device=device, dtype=torch.float64, **op.lhs_make_tensor_kwargs) + lhs_f32 = make_lhs(dtype=torch.float32) + lhs_f64 = make_lhs(dtype=torch.float64) - rhs_f32 = make_tensor((5,), device=device, dtype=torch.float32, **op.rhs_make_tensor_kwargs) - rhs_f64 = make_tensor((5,), device=device, dtype=torch.float64, **op.rhs_make_tensor_kwargs) + rhs_f32 = make_rhs(dtype=torch.float32) + rhs_f64 = make_rhs(dtype=torch.float64) if op.always_returns_bool: self.assertEqual(op(lhs_f32, rhs_f64).dtype, torch.bool) else: # normal float type promotion self.assertEqual(op(lhs_f32, rhs_f64).dtype, torch.float64) - self.assertEqual(op(lhs_f32, rhs_f64), op(lhs_f32.to(torch.float64), rhs_f64)) + self.assertEqual( + op(lhs_f32, rhs_f64), op(lhs_f32.to(torch.float64), rhs_f64) + ) if op.supports_out: # All these output types can be cast to any float or complex type out = torch.empty_like(lhs_f64, dtype=torch.float16) self.assertEqual(op(lhs_f32, rhs_f64, out=out).dtype, torch.float16) - self.assertEqual(op(lhs_f32, rhs_f64), out, exact_dtype=False) out = torch.empty_like(lhs_f64, dtype=torch.bfloat16) self.assertEqual(op(lhs_f32, rhs_f64, out=out).dtype, torch.bfloat16) @@ -669,7 +607,11 @@ def _supported(dtypes): if not op.always_returns_bool: # float outs can't be cast to an integer dtype with self.assertRaisesRegex(RuntimeError, "can't be cast"): - op(lhs_f32, rhs_f64, out=torch.empty_like(lhs_f64, dtype=torch.int64)) + op( + lhs_f32, + rhs_f64, + out=torch.empty_like(lhs_f64, dtype=torch.int64), + ) else: # bool outs can be cast to an integer dtype out = torch.empty_like(lhs_f64, dtype=torch.int64) @@ -678,35 +620,49 @@ def _supported(dtypes): # complex x complex type promotion if _supported((torch.complex64, torch.complex128)): - lhs_c64 = make_tensor((5,), device=device, dtype=torch.complex64, **op.lhs_make_tensor_kwargs) - lhs_c128 = make_tensor((5,), device=device, dtype=torch.complex128, **op.lhs_make_tensor_kwargs) + lhs_c64 = make_lhs(dtype=torch.complex64) + lhs_c128 = make_lhs(dtype=torch.complex128) - rhs_c64 = make_tensor((5,), device=device, dtype=torch.complex64, **op.rhs_make_tensor_kwargs) - rhs_c128 = make_tensor((5,), device=device, dtype=torch.complex128, **op.rhs_make_tensor_kwargs) + rhs_c64 = make_rhs(dtype=torch.complex64) + rhs_c128 = make_rhs(dtype=torch.complex128) if op.always_returns_bool: self.assertEqual(op(lhs_c64, lhs_c128).dtype, torch.bool) else: # normal complex type promotion self.assertEqual(op(lhs_c64, rhs_c128).dtype, torch.complex128) - self.assertEqual(op(lhs_c64, rhs_c128), op(lhs_c64.to(torch.complex128), rhs_c128)) + self.assertEqual( + 
op(lhs_c64, rhs_c128), op(lhs_c64.to(torch.complex128), rhs_c128) + ) if op.supports_out: # All these output types can be cast to any or complex type out = torch.empty_like(lhs_c64, dtype=torch.complex64) + self.assertEqual(op(lhs_c64, rhs_c128, out=out).dtype, torch.complex64) - self.assertEqual(op(lhs_c64, rhs_c128), out, exact_dtype=False) + result = op(lhs_c64, rhs_c128) + self.assertEqual(result, out.to(result.dtype)) if not op.always_returns_bool: # complex outs can't be cast to float types with self.assertRaisesRegex(RuntimeError, "can't be cast"): - op(lhs_c64, rhs_c128, out=torch.empty_like(lhs_c64, dtype=torch.float64)) + op( + lhs_c64, + rhs_c128, + out=torch.empty_like(lhs_c64, dtype=torch.float64), + ) # complex outs can't be cast to an integer dtype with self.assertRaisesRegex(RuntimeError, "can't be cast"): - op(lhs_c64, rhs_c128, out=torch.empty_like(lhs_c64, dtype=torch.int64)) + op( + lhs_c64, + rhs_c128, + out=torch.empty_like(lhs_c64, dtype=torch.int64), + ) else: # bool outs can be cast to a float type out = torch.empty_like(lhs_c64, dtype=torch.float64) - self.assertEqual(op(lhs_c64, rhs_c128, out=out).dtype, torch.float64) + self.assertEqual( + op(lhs_c64, rhs_c128, out=out).dtype, torch.float64 + ) self.assertEqual(op(lhs_c64, rhs_c128), out, exact_dtype=False) # bool outs can be cast to an integer dtype @@ -714,17 +670,139 @@ def _supported(dtypes): self.assertEqual(op(lhs_f32, rhs_f64, out=out).dtype, torch.int64) self.assertEqual(op(lhs_f32, rhs_f64), out, exact_dtype=False) + # int x float type promotion + # Note: float type is the result dtype + if _supported((torch.long, torch.float32)): + lhs_i64 = make_lhs(dtype=torch.int64) + rhs_f32 = make_rhs(dtype=torch.float32) + + result = op(lhs_i64, rhs_f32) + expected_dtype = torch.float32 if not op.always_returns_bool else torch.bool + self.assertEqual(result.dtype, expected_dtype) + + # float x complex type promotion + # Note: complex type with highest "value type" is the result dtype + if _supported((torch.float64, torch.complex64)): + lhs_f64 = make_lhs(dtype=torch.float64) + rhs_c64 = make_rhs(dtype=torch.complex64) + + result = op(lhs_f64, rhs_c64) + expected_dtype = ( + torch.complex128 if not op.always_returns_bool else torch.bool + ) + self.assertEqual(result.dtype, expected_dtype) + + # int x float scalar type promotion + # Note: default float dtype is the result dtype + if _supported((torch.int64, torch.float32)) and op.supports_rhs_python_scalar: + lhs_i64 = make_lhs(dtype=torch.int64) + rhs_f_scalar = 1.0 + + result = op(lhs_i64, rhs_f_scalar) + expected_dtype = ( + torch.get_default_dtype() if not op.always_returns_bool else torch.bool + ) + self.assertEqual(result.dtype, expected_dtype) + + # repeats with a scalar float tensor, which should set the dtype + rhs_f32_scalar_tensor = make_rhs_scalar_tensor(dtype=torch.float32) + result = op(lhs_i64, rhs_f32_scalar_tensor) + expected_dtype = torch.float32 if not op.always_returns_bool else torch.bool + self.assertEqual(result.dtype, expected_dtype) + + # Additional test with double + if _supported((torch.float64,)): + rhs_f64_scalar_tensor = make_rhs_scalar_tensor(dtype=torch.float64) + result = op(lhs_i64, rhs_f64_scalar_tensor) + expected_dtype = ( + torch.float64 if not op.always_returns_bool else torch.bool + ) + self.assertEqual(result.dtype, expected_dtype) + + # float x complex scalar type promotion + # Note: result dtype is complex with highest "value type" among all tensors + if ( + _supported((torch.float32, torch.complex64)) + and 
op.supports_rhs_python_scalar + ): + lhs_f32 = make_lhs(dtype=torch.float32) + rhs_c_scalar = complex(1, 1) + + result = op(lhs_f32, rhs_c_scalar) + expected_dtype = ( + torch.complex64 if not op.always_returns_bool else torch.bool + ) + self.assertEqual(result.dtype, expected_dtype) + + # repeats with a scalar complex tensor + rhs_c64_scalar_tensor = make_rhs_scalar_tensor(dtype=torch.complex64) + result = op(lhs_f32, rhs_c64_scalar_tensor) + expected_dtype = ( + torch.complex64 if not op.always_returns_bool else torch.bool + ) + self.assertEqual(result.dtype, expected_dtype) + + # Additional test with complexdouble + if _supported((torch.complex128,)): + rhs_c128_scalar_tensor = make_rhs_scalar_tensor(dtype=torch.complex128) + result = op(lhs_f32, rhs_c128_scalar_tensor) + expected_dtype = ( + torch.complex128 if not op.always_returns_bool else torch.bool + ) + self.assertEqual(result.dtype, expected_dtype) + + # float x float scalar tensor + # Note: result dtype is the type of the float tensor + if _supported((torch.float32, torch.float64)) and op.supports_rhs_python_scalar: + lhs_f32 = make_lhs(dtype=torch.float32) + rhs_f64_scalar_tensor = make_rhs_scalar_tensor(dtype=torch.float64) + + result = op(lhs_f32, rhs_f64_scalar_tensor) + expected_dtype = torch.float32 if not op.always_returns_bool else torch.bool + self.assertEqual(result.dtype, expected_dtype) + + # complex x complex scalar tensor + # Note: result dtype is the type of the complex tensor + if ( + _supported((torch.complex64, torch.complex128)) + and op.supports_rhs_python_scalar + ): + lhs_c64 = make_lhs(dtype=torch.complex64) + rhs_c128_scalar_tensor = make_rhs_scalar_tensor(dtype=torch.complex128) + + result = op(lhs_c64, rhs_c128_scalar_tensor) + expected_dtype = ( + torch.complex64 if not op.always_returns_bool else torch.bool + ) + self.assertEqual(result.dtype, expected_dtype) + + # scalar int x scalar float + # Note: result dtype is default float type + # TODO: FIXME: re-enable this, scalar x scalar type promotion is currently broken + # https://github.com/pytorch/pytorch/issues/76801 + # if op.supports_two_python_scalars and _supported((torch.long, torch.float32)): + # lhs_i_scalar = 1 + # rhs_f_scalar = 2. 
+ + # result = op(lhs_i_scalar, rhs_f_scalar) + # expected_dtype = torch.get_default_dtype() if not op.always_returns_bool else torch.bool + # self.assertEqual(result.dtype, expected_dtype) + # TODO: move to error input test @ops(binary_ufuncs, allowed_dtypes=(torch.float32,)) def test_not_broadcastable(self, device, dtype, op): for shape_lhs, shape_rhs in ( - ((2,), (3,)), - ((3, 1), (2, 1)), - ((1, 3, 2), (3,)), - ((3, 1, 2), (2, 1, 2)), + ((2,), (3,)), + ((3, 1), (2, 1)), + ((1, 3, 2), (3,)), + ((3, 1, 2), (2, 1, 2)), ): - lhs = make_tensor(shape_lhs, device=device, dtype=dtype, **op.lhs_make_tensor_kwargs) - rhs = make_tensor(shape_rhs, device=device, dtype=dtype, **op.rhs_make_tensor_kwargs) + lhs = make_tensor( + shape_lhs, device=device, dtype=dtype, **op.lhs_make_tensor_kwargs + ) + rhs = make_tensor( + shape_rhs, device=device, dtype=dtype, **op.rhs_make_tensor_kwargs + ) try: broadcasted_shape = op(lhs, rhs).shape @@ -739,27 +817,48 @@ def test_not_broadcastable(self, device, dtype, op): def test_add_broadcast_empty(self, device): # empty + empty - self.assertRaises(RuntimeError, lambda: torch.randn(5, 0, device=device) + torch.randn(0, 5, device=device)) - self.assertEqual(torch.randn(5, 0, device=device), torch.randn(0, device=device) + torch.randn(5, 0, device=device)) - self.assertEqual(torch.randn(5, 0, 0, device=device), torch.randn(0, device=device) + torch.randn(5, 0, 1, device=device)) + self.assertRaises( + RuntimeError, + lambda: torch.randn(5, 0, device=device) + torch.randn(0, 5, device=device), + ) + self.assertEqual( + torch.randn(5, 0, device=device), + torch.randn(0, device=device) + torch.randn(5, 0, device=device), + ) + self.assertEqual( + torch.randn(5, 0, 0, device=device), + torch.randn(0, device=device) + torch.randn(5, 0, 1, device=device), + ) # scalar + empty - self.assertEqual(torch.randn(5, 0, 6, device=device), torch.randn((), device=device) + torch.randn(5, 0, 6, device=device)) + self.assertEqual( + torch.randn(5, 0, 6, device=device), + torch.randn((), device=device) + torch.randn(5, 0, 6, device=device), + ) # non-empty, empty - self.assertEqual(torch.randn(0, device=device), torch.randn(0, device=device) + torch.randn(1, device=device)) - self.assertEqual(torch.randn(0, 7, 0, 6, 5, 0, 7, device=device), - torch.randn(0, 7, 0, 6, 5, 0, 1, device=device) + torch.randn(1, 1, 5, 1, 7, device=device)) - self.assertRaises(RuntimeError, lambda: torch.randn(7, 0, device=device) + torch.randn(2, 1, device=device)) + self.assertEqual( + torch.randn(0, device=device), + torch.randn(0, device=device) + torch.randn(1, device=device), + ) + self.assertEqual( + torch.randn(0, 7, 0, 6, 5, 0, 7, device=device), + torch.randn(0, 7, 0, 6, 5, 0, 1, device=device) + + torch.randn(1, 1, 5, 1, 7, device=device), + ) + self.assertRaises( + RuntimeError, + lambda: torch.randn(7, 0, device=device) + torch.randn(2, 1, device=device), + ) def test_addcmul_scalars_as_floats(self, device): # zero-dim variables that don't require grad should bind to scalar arguments - x = torch.tensor(2.) 
- y = torch.tensor(3., device=device) + x = torch.tensor(2.0) + y = torch.tensor(3.0, device=device) # 3 + (3 * 3) * 2 self.assertEqual(y.addcmul(y, y, value=x), 21) - x = torch.tensor(2., requires_grad=True) + x = torch.tensor(2.0, requires_grad=True) self.assertRaises(Exception, lambda: y.addcmul(y, y, value=x)) # TODO: update to work on CUDA, too @@ -796,8 +895,8 @@ def test_comparison_ops(self, device): def test_comparison_ops_device_computation(self, device): operands = ( torch.tensor(0), - torch.tensor(2, device='cuda'), - torch.tensor([0, 2], device='cuda') + torch.tensor(2, device="cuda"), + torch.tensor([0, 2], device="cuda"), ) # Checks that comparison operators compute the correct # output device, given a combination of devices @@ -811,38 +910,49 @@ def test_comparison_ops_device_computation(self, device): # TODO: update to work on CUDA, too @onlyCPU def test_comparison_ops_must_take_bool_output(self, device): - for op in [torch.lt, torch.le, torch.gt, torch.ge, torch.eq, torch.ne, - torch.logical_and, torch.logical_or, torch.logical_xor]: - self.assertEqual(op(torch.tensor([True]), torch.tensor([False])).dtype, torch.bool) + for op in [ + torch.lt, + torch.le, + torch.gt, + torch.ge, + torch.eq, + torch.ne, + torch.logical_and, + torch.logical_or, + torch.logical_xor, + ]: + self.assertEqual( + op(torch.tensor([True]), torch.tensor([False])).dtype, torch.bool + ) # TODO: update to work on CUDA, too @onlyCPU def test_comparison_ops_check_for_scalar_overflow(self, device): s = 1 << 20 t = torch.tensor([1 << 5], dtype=torch.uint8) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(t < s) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(s < t) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(t <= s) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(s <= t) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(t > s) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(s > t) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(t >= s) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(s >= t) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(t == s) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(s == t) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with 
self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(t != s) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(s != t) # TODO: update to work on CUDA, too @@ -852,29 +962,29 @@ def test_comparison_ops_check_for_zerodim_tensor_overflow(self, device): t2 = torch.tensor([1 << 30], dtype=torch.int32) ts1 = torch.tensor(1 << 20, dtype=torch.int32) ts2 = torch.tensor(1 << 40, dtype=torch.int64) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(t1 < ts1) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(ts2 < t2) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(t1 <= ts1) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(ts2 <= t2) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(t1 > ts1) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(ts2 > t2) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(t1 >= ts1) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(ts2 >= t2) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(t1 == ts1) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(ts2 == t2) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(t1 != ts1) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(ts2 != t2) # Tests that the binary operators and, or, and xor (as well as their reflected and inplace versions) @@ -882,7 +992,14 @@ def test_comparison_ops_check_for_zerodim_tensor_overflow(self, device): @dtypes(*integral_types_and(torch.bool)) def test_bitwise_ops(self, device, dtype): # Tensor x Tensor and Tensor x Scalar ops - ops = (operator.and_, operator.iand, operator.or_, operator.ior, operator.xor, operator.ixor) + ops = ( + operator.and_, + operator.iand, + operator.or_, + operator.ior, + operator.xor, + operator.ixor, + ) inplace_ops = (operator.iand, operator.ior, operator.ixor) shapes = ((5,), (15, 15), (500, 500)) @@ -896,12 +1013,12 @@ def test_bitwise_ops(self, device, dtype): # Tests 
tensor x scalar case a = make_tensor(shape, device=device, dtype=dtype) - b_scalar = make_tensor((), device='cpu', dtype=dtype).item() + b_scalar = make_tensor((), device="cpu", dtype=dtype).item() a_np = a.cpu().clone().numpy() self.assertEqual(op(a, b_scalar), op(a_np, b_scalar)) # Tests scalar x tensor case - a_scalar = make_tensor((), device='cpu', dtype=dtype).item() + a_scalar = make_tensor((), device="cpu", dtype=dtype).item() b = make_tensor(shape, device=device, dtype=dtype) b_np = b.cpu().clone().numpy() self.assertEqual(op(a_scalar, b), op(a_scalar, b_np)) @@ -919,7 +1036,7 @@ def test_bitwise_ops(self, device, dtype): # Tests tensor x scalar case a = make_tensor(shape, device=device, dtype=dtype) - b_scalar = make_tensor((), device='cpu', dtype=dtype).item() + b_scalar = make_tensor((), device="cpu", dtype=dtype).item() a_np = a.cpu().clone().numpy() op(a, b_scalar) op(a_np, b_scalar) @@ -932,7 +1049,7 @@ def test_inplace_division(self, device): id_after = id(t) self.assertEqual(id_before, id_after) - @dtypes(*get_all_dtypes(include_bool=False, include_complex=False)) + @dtypes(*all_types_and(torch.half, torch.bfloat16)) def test_div_rounding_modes(self, device, dtype): if dtype.is_floating_point: low, high = -10.0, 10.0 @@ -940,8 +1057,8 @@ def test_div_rounding_modes(self, device, dtype): info = torch.iinfo(dtype) low, high = info.min, info.max - a = make_tensor((100,), device, dtype, low=low, high=high) - b = make_tensor((100,), device, dtype, low=low, high=high) + a = make_tensor((100,), dtype=dtype, device=device, low=low, high=high) + b = make_tensor((100,), dtype=dtype, device=device, low=low, high=high) # Avoid division by zero so we can test (a / b) * b == a if dtype.is_floating_point: @@ -958,17 +1075,23 @@ def test_div_rounding_modes(self, device, dtype): self.assertTrue(d_true.is_floating_point()) self.assertEqual(d_true * b, a.to(d_true.dtype)) - d_floor = torch.divide(a, b, rounding_mode='floor') + d_floor = torch.divide(a, b, rounding_mode="floor") if dtype not in (torch.bfloat16, torch.half): self.assertEqual(d_floor * b + torch.remainder(a, b), a) else: - self.assertEqual(d_floor * b + torch.remainder(a.float(), b.float()), a, - exact_dtype=False) + self.assertEqual( + d_floor * b + torch.remainder(a.float(), b.float()), + a, + exact_dtype=False, + ) - d_trunc = torch.divide(a, b, rounding_mode='trunc') + d_trunc = torch.divide(a, b, rounding_mode="trunc") rounding_unsupported = ( - dtype == torch.half and device != 'cuda' or - dtype == torch.bfloat16 and device != 'cpu') + dtype == torch.half + and device != "cuda" + or dtype == torch.bfloat16 + and device != "cpu" + ) d_ref = d_true.float() if rounding_unsupported else d_true self.assertEqual(d_trunc, d_ref.trunc().to(dtype)) @@ -976,8 +1099,10 @@ def test_div_rounding_modes(self, device, dtype): def test_div_rounding_nonfinite(self, device, dtype): # Compare division of special floating point values against NumPy - num = torch.tensor([1.0, -1.0, 0, 0.1, -0.1, np.pi, -np.pi, np.inf, -np.inf, np.nan], - dtype=dtype) + num = torch.tensor( + [1.0, -1.0, 0, 0.1, -0.1, np.pi, -np.pi, np.inf, -np.inf, np.nan], + dtype=dtype, + ) # Divide by zero is tested seperately denom = num[num != 0] @@ -991,18 +1116,26 @@ def test_div_rounding_nonfinite(self, device, dtype): an, bn = a.float().cpu().numpy(), b.float().cpu().numpy() for mode, np_ref in ((None, np.true_divide), ("floor", np.floor_divide)): - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): expect = np_ref(an, bn) kwargs = 
dict(rounding_mode=mode) if mode is not None else {} with set_default_dtype(torch.double): actual = torch.divide(a, b, **kwargs) - self.assertEqual(actual, torch.from_numpy(expect), - exact_device=False, exact_dtype=exact_dtype) + self.assertEqual( + actual, + torch.from_numpy(expect), + exact_device=False, + exact_dtype=exact_dtype, + ) # Compare contiguous (likely vectorized) against non-contiguous (not vectorized) - a_noncontig = torch.empty([2 * i for i in a.shape], dtype=dtype, device=device)[::2, ::2] + a_noncontig = torch.empty([2 * i for i in a.shape], dtype=dtype, device=device)[ + ::2, ::2 + ] a_noncontig[:] = a - b_noncontig = torch.empty([2 * i for i in b.shape], dtype=dtype, device=device)[::2, ::2] + b_noncontig = torch.empty([2 * i for i in b.shape], dtype=dtype, device=device)[ + ::2, ::2 + ] b_noncontig[:] = b for rounding_mode in (None, "trunc", "floor"): @@ -1012,9 +1145,11 @@ def test_div_rounding_nonfinite(self, device, dtype): @dtypes(torch.bfloat16, torch.half, torch.float32, torch.float64) def test_divide_by_zero_rounding(self, device, dtype): - a = torch.tensor([1.0, -1.0, 0, 0.1, -0.1, np.pi, -np.pi, np.inf, -np.inf, np.nan], - dtype=dtype) - exact_dtype = (dtype != torch.bfloat16) + a = torch.tensor( + [1.0, -1.0, 0, 0.1, -0.1, np.pi, -np.pi, np.inf, -np.inf, np.nan], + dtype=dtype, + ) + exact_dtype = dtype != torch.bfloat16 if exact_dtype: an = a.cpu().numpy() else: @@ -1024,7 +1159,7 @@ def test_divide_by_zero_rounding(self, device, dtype): # NOTE: NumPy's floor_divide rounding changed in 1.20.0 to be consistent with divide expect = np.divide(an, 0) - for rounding_mode in (None, 'floor'): + for rounding_mode in (None, "floor"): # CPU scalar actual = torch.divide(a, 0, rounding_mode=rounding_mode) self.assertEqual(actual, expect, exact_dtype=exact_dtype) @@ -1032,16 +1167,14 @@ def test_divide_by_zero_rounding(self, device, dtype): actual = torch.divide(a, zero, rounding_mode=rounding_mode) self.assertEqual(actual, expect, exact_dtype=exact_dtype) - @dtypes(*get_all_dtypes( - include_bool=False, include_complex=False, include_bfloat16=False)) + @dtypes(*all_types_and(torch.half)) def test_div_rounding_numpy(self, device, dtype): - info = (torch.finfo(dtype) if dtype.is_floating_point - else torch.iinfo(dtype)) + info = torch.finfo(dtype) if dtype.is_floating_point else torch.iinfo(dtype) low, high = info.min, info.max # Compare division of random values against NumPy - a = make_tensor((4096,), device, dtype, low=low, high=high) - b = make_tensor((4096,), device, dtype, low=low, high=high) + a = make_tensor((4096,), dtype=dtype, device=device, low=low, high=high) + b = make_tensor((4096,), dtype=dtype, device=device, low=low, high=high) # Avoid division by zero which raises for integers and, for floats, # NumPy 1.20 changed floor_divide to follow IEEE rules for inf/nan @@ -1057,34 +1190,39 @@ def test_div_rounding_numpy(self, device, dtype): an, bn = a.float().cpu().numpy(), b.float().cpu().numpy() for mode, np_ref in ( - (None, np.true_divide), - ("floor", np.floor_divide), - ("trunc", lambda a, b: np.trunc(np.true_divide(a, b)).astype(a.dtype)) + (None, np.true_divide), + ("floor", np.floor_divide), + ("trunc", lambda a, b: np.trunc(np.true_divide(a, b)).astype(a.dtype)), ): - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): expect = torch.from_numpy(np_ref(an, bn)) kwargs = dict(rounding_mode=mode) if mode is not None else {} # Contiguous (likely vectorized) with set_default_dtype(torch.double): actual = torch.divide(a, b, **kwargs) - 
self.assertEqual(actual, expect, exact_device=False, exact_dtype=exact_dtype) + self.assertEqual( + actual, expect, exact_device=False, exact_dtype=exact_dtype + ) # Non-contiguous (not vectorized) expect = expect[::2] with set_default_dtype(torch.double): actual = torch.divide(a[::2], b[::2], **kwargs) - self.assertEqual(actual, expect, exact_device=False, exact_dtype=exact_dtype) + self.assertEqual( + actual, expect, exact_device=False, exact_dtype=exact_dtype + ) # Tests that trying to add, inplace, a CUDA tensor to a CPU tensor # throws the correct error message @onlyCUDA def test_cross_device_inplace_error_msg(self, device): - a = torch.tensor(2.) - b = torch.tensor(2., device=device) - with self.assertRaisesRegex(RuntimeError, - "Expected all tensors to be on the same device"): + a = torch.tensor(2.0) + b = torch.tensor(2.0, device=device) + with self.assertRaisesRegex( + RuntimeError, "Expected all tensors to be on the same device" + ): a += b # TODO: refactor this test into a more generic one, it's parked here currently @@ -1097,7 +1235,7 @@ def test_out_resize_warning(self, device): binary_inputs = (a, b) unary_ops = (torch.ceil, torch.exp) binary_ops = (torch.add, torch.sub) - for op in (unary_ops + binary_ops): + for op in unary_ops + binary_ops: with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") inputs = unary_inputs if op in unary_ops else binary_inputs @@ -1121,30 +1259,30 @@ def test_inplace_dunders(self, device): t -= 1 t *= 1 t /= 1 - with self.assertWarnsOnceRegex(UserWarning, 'floor_divide'): + t **= 1 + with self.assertWarnsOnceRegex(UserWarning, "floor_divide"): t //= 1 t %= 1 self.assertEqual(expected, t.data_ptr()) - def check_internal_mem_overlap(self, inplace_op, num_inputs, - dtype, device, - expected_failure=False): + def check_internal_mem_overlap( + self, inplace_op, num_inputs, dtype, device, expected_failure=False + ): if isinstance(inplace_op, str): inplace_op = getattr(torch.Tensor, inplace_op) input = torch.randn(1, dtype=dtype, device=device).expand(3, 3) - inputs = [input] + [torch.randn_like(input) - for i in range(num_inputs - 1)] + inputs = [input] + [torch.randn_like(input) for i in range(num_inputs - 1)] if not expected_failure: - with self.assertRaisesRegex(RuntimeError, 'single memory location'): + with self.assertRaisesRegex(RuntimeError, "single memory location"): inplace_op(*inputs) else: with self.assertRaises(AssertionError): - with self.assertRaisesRegex(RuntimeError, 'single memory location'): + with self.assertRaisesRegex(RuntimeError, "single memory location"): inplace_op(*inputs) - def unary_check_input_output_mem_overlap(self, data, sz, op, - expected_failure=False): - + def unary_check_input_output_mem_overlap( + self, data, sz, op, expected_failure=False + ): def _test(op, output, input): output_exp = torch.empty_like(output) op(input, out=output_exp) @@ -1153,93 +1291,114 @@ def _test(op, output, input): # output is identical to input: _test(op, output=data[0:sz], input=data[0:sz]) # output and input are independent: - _test(op, output=data[0:sz], input=data[sz:2 * sz]) + _test(op, output=data[0:sz], input=data[sz : 2 * sz]) # output partially overlaps with input: if not expected_failure: - with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): - _test(op, data[0:sz], data[1:sz + 1]) + with self.assertRaisesRegex(RuntimeError, "unsupported operation"): + _test(op, data[0:sz], data[1 : sz + 1]) else: with self.assertRaises(AssertionError): - with self.assertRaisesRegex(RuntimeError, 
'unsupported operation'): - _test(op, data[0:sz], data[1:sz + 1]) + with self.assertRaisesRegex(RuntimeError, "unsupported operation"): + _test(op, data[0:sz], data[1 : sz + 1]) - def binary_check_input_output_mem_overlap(self, op, device, - expected_failure=False): + def binary_check_input_output_mem_overlap(self, op, device, expected_failure=False): sz = 3 data = torch.randn(2 * sz, device=device) other = torch.randn(sz, device=device) self.unary_check_input_output_mem_overlap( - data, sz, lambda input, out: op(other, input, out=out), - expected_failure=expected_failure) + data, + sz, + lambda input, out: op(other, input, out=out), + expected_failure=expected_failure, + ) self.unary_check_input_output_mem_overlap( - data, sz, lambda input, out: op(input, other, out=out), - expected_failure=expected_failure) + data, + sz, + lambda input, out: op(input, other, out=out), + expected_failure=expected_failure, + ) @dtypes(torch.double) def test_binary_op_mem_overlap(self, device, dtype): ops = [ - ("add", True, True, 'cpu'), - ("add", True, True, 'cuda'), - ("mul", True, True, 'cpu'), - ("mul", True, True, 'cuda'), - ("sub", True, True, 'cpu'), - ("sub", True, True, 'cuda'), - ("div", True, True, 'cpu'), - ("div", True, True, 'cuda'), - ("pow", True, True, 'cpu'), - ("pow", True, True, 'cuda'), - ("fmod", True, True, 'cpu'), - ("fmod", True, True, 'cuda'), - ("atan2", True, True, 'cpu'), - ("atan2", True, True, 'cuda'), - ("hypot", True, True, 'cpu'), - ("hypot", True, True, 'cuda'), - ("igamma", True, True, 'cpu'), - ("igamma", True, True, 'cuda'), - ("igammac", True, True, 'cpu'), - ("igammac", True, True, 'cuda'), - ("nextafter", True, True, 'cpu'), - ("nextafter", True, True, 'cuda'), - ("le", True, True, 'cpu'), - ("le", True, True, 'cuda'), - ("lt", True, True, 'cpu'), - ("lt", True, True, 'cuda'), - ("ge", True, True, 'cpu'), - ("ge", True, True, 'cuda'), - ("gt", True, True, 'cpu'), - ("gt", True, True, 'cuda'), - ("eq", True, True, 'cpu'), - ("eq", True, True, 'cuda'), - ("ne", True, True, 'cpu'), - ("ne", True, True, 'cuda'), - ("logical_and", True, True, 'cpu'), - ("logical_and", True, True, 'cuda'), - ("logical_or", True, True, 'cpu'), - ("logical_or", True, True, 'cuda'), - ("logical_xor", True, True, 'cpu'), - ("logical_xor", True, True, 'cuda'), + ("add", True, True, "cpu"), + ("add", True, True, "cuda"), + ("mul", True, True, "cpu"), + ("mul", True, True, "cuda"), + ("sub", True, True, "cpu"), + ("sub", True, True, "cuda"), + ("div", True, True, "cpu"), + ("div", True, True, "cuda"), + ("pow", True, True, "cpu"), + ("pow", True, True, "cuda"), + ("fmod", True, True, "cpu"), + ("fmod", True, True, "cuda"), + ("atan2", True, True, "cpu"), + ("atan2", True, True, "cuda"), + ("hypot", True, True, "cpu"), + ("hypot", True, True, "cuda"), + ("igamma", True, True, "cpu"), + ("igamma", True, True, "cuda"), + ("igammac", True, True, "cpu"), + ("igammac", True, True, "cuda"), + ("nextafter", True, True, "cpu"), + ("nextafter", True, True, "cuda"), + ("le", True, True, "cpu"), + ("le", True, True, "cuda"), + ("lt", True, True, "cpu"), + ("lt", True, True, "cuda"), + ("ge", True, True, "cpu"), + ("ge", True, True, "cuda"), + ("gt", True, True, "cpu"), + ("gt", True, True, "cuda"), + ("eq", True, True, "cpu"), + ("eq", True, True, "cuda"), + ("ne", True, True, "cpu"), + ("ne", True, True, "cuda"), + ("logical_and", True, True, "cpu"), + ("logical_and", True, True, "cuda"), + ("logical_or", True, True, "cpu"), + ("logical_or", True, True, "cuda"), + ("logical_xor", True, True, "cpu"), + 
("logical_xor", True, True, "cuda"), ] - for (fn, has_input_output_mem_overlap_check, - has_internal_mem_overlap_check, dev) in ops: + for ( + fn, + has_input_output_mem_overlap_check, + has_internal_mem_overlap_check, + dev, + ) in ops: if dev != device: continue out_op = getattr(torch, fn) - inplace_op = getattr(torch.Tensor, fn + '_') + inplace_op = getattr(torch.Tensor, fn + "_") self.check_internal_mem_overlap( - inplace_op, 2, dtype, device, - expected_failure=not has_internal_mem_overlap_check) + inplace_op, + 2, + dtype, + device, + expected_failure=not has_internal_mem_overlap_check, + ) - self.binary_check_input_output_mem_overlap(out_op, device, - expected_failure=not has_input_output_mem_overlap_check) + self.binary_check_input_output_mem_overlap( + out_op, device, expected_failure=not has_input_output_mem_overlap_check + ) def _do_pow_for_exponents(self, m1, exponents, pow_fn, atol): for num in exponents: - if isinstance(num, int) and num < 0 and not m1.is_floating_point() and not m1.is_complex(): - with self.assertRaisesRegex(RuntimeError, - r'Integers to negative integer powers are not allowed\.'): + if ( + isinstance(num, int) + and num < 0 + and not m1.is_floating_point() + and not m1.is_complex() + ): + with self.assertRaisesRegex( + RuntimeError, + r"Integers to negative integer powers are not allowed\.", + ): torch.pow(m1[4], num) else: # base - tensor, exponent - number @@ -1262,7 +1421,9 @@ def _do_pow_for_exponents(self, m1, exponents, pow_fn, atol): # scalar ** tensor to enforce correct handling of dtypes for __rpow__(). expected_dtype = torch.result_type(num, m1) res1 = num ** m1[4] - res2 = torch.tensor(num, dtype=expected_dtype, device=m1.device) ** m1[4] + res2 = ( + torch.tensor(num, dtype=expected_dtype, device=m1.device) ** m1[4] + ) self.assertEqual(res1, res2) self.assertEqual(res1.dtype, expected_dtype) @@ -1270,14 +1431,27 @@ def _do_pow_for_exponents(self, m1, exponents, pow_fn, atol): def test_pow(self, device, dtype): m1 = torch.empty(0, dtype=dtype, device=device) if m1.is_floating_point() or m1.is_complex(): - m1 = make_tensor((100, 100), low=0, high=1, dtype=dtype, device=device) + 0.5 + m1 = ( + make_tensor((100, 100), low=0, high=1, dtype=dtype, device=device) + 0.5 + ) else: # math.pow will overflow and throw exceptions for large integers range_high = 4 if dtype in (torch.int8, torch.uint8) else 10 - m1 = make_tensor((100, 100), low=1, high=range_high, dtype=dtype, device=device) + m1 = make_tensor( + (100, 100), low=1, high=range_high, dtype=dtype, device=device + ) exponents = [-2.8, -2, -1, -0.5, 0, 0.5, 1, 2, 3, 4, 3.3] - complex_exponents = [-2.5j, -1.0j, 0j, 1.0j, 2.5j, 1.0 + 1.0j, -1.0 - 1.5j, 3.3j] + complex_exponents = [ + -2.5j, + -1.0j, + 0j, + 1.0j, + 2.5j, + 1.0 + 1.0j, + -1.0 - 1.5j, + 3.3j, + ] if m1.is_complex(): self._do_pow_for_exponents(m1, exponents + complex_exponents, pow, 10e-4) else: @@ -1311,7 +1485,11 @@ def to_np(value): try: np_res = np.power(to_np(base), to_np(np_exponent)) - expected = torch.from_numpy(np_res) if isinstance(np_res, np.ndarray) else torch.tensor(np_res, dtype=base.dtype) + expected = ( + torch.from_numpy(np_res) + if isinstance(np_res, np.ndarray) + else torch.tensor(np_res, dtype=base.dtype) + ) except ValueError as e: err_msg = "Integers to negative integer powers are not allowed." 
self.assertEqual(str(e), err_msg) @@ -1320,7 +1498,7 @@ def to_np(value): lambda: base.pow(exponent), lambda: base.pow_(exponent), lambda: torch.pow(base, exponent), - lambda: torch.pow(base, exponent, out=out) + lambda: torch.pow(base, exponent, out=out), ] for test_case in test_cases: self.assertRaisesRegex(RuntimeError, err_msg, test_case) @@ -1331,16 +1509,24 @@ def to_np(value): actual = base.clone() # When base is a 0-dim cpu tensor and exp is a cuda tensor, we exp `pow` to work but `pow_` to fail, since # `pow` will try to create the output tensor on a cuda device, but `pow_` needs to use the cpu tensor as the output - if (isinstance(exponent, torch.Tensor) and base.dim() == 0 and base.device.type == 'cpu' and - exponent.device.type == 'cuda'): - regex = 'Expected all tensors to be on the same device, but found at least two devices, cuda.* and cpu!' + if ( + isinstance(exponent, torch.Tensor) + and base.dim() == 0 + and base.device.type == "cpu" + and exponent.device.type == "cuda" + ): + regex = "Expected all tensors to be on the same device, but found at least two devices, cuda.* and cpu!" self.assertRaisesRegex(RuntimeError, regex, base.pow_, exponent) elif torch.can_cast(torch.result_type(base, exponent), base.dtype): actual2 = actual.pow_(exponent) self.assertEqual(actual, expected) self.assertEqual(actual2, expected) else: - self.assertRaisesRegex(RuntimeError, "Found dtype \\w+ but expected \\w+", lambda: actual.pow_(exponent)) + self.assertRaisesRegex( + RuntimeError, + "Found dtype \\w+ but expected \\w+", + lambda: actual.pow_(exponent), + ) actual = torch.pow(base, exponent) self.assertEqual(actual, expected.to(actual)) @@ -1354,13 +1540,16 @@ def to_np(value): # a lambada that switches the inputs, because we also want to test samples inputs # where the second input is a scalar. The wrapper would need some more logic. def test_pow_scalar_base(self, device): - a = torch.arange(1, 13, dtype=torch.double, device=device).view(3, 4).requires_grad_() + a = ( + torch.arange(1, 13, dtype=torch.double, device=device) + .view(3, 4) + .requires_grad_() + ) gradcheck(lambda a: torch.pow(2, a), (a,)) # Tests pow() for integral, floating-type tensors, with integral, floating-type # exponents (tensor or scalar), respectively. noncontiguous tensors are also tested. 
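The pow tests above lean on two behaviors worth keeping in mind while reading the diff: a Python scalar raised to a tensor is dispatched through __rpow__ with the result dtype given by torch.result_type, and integer tensors reject negative integer exponents. A minimal sketch of both (assumes the default float dtype is float32; illustrative only, not part of the patch):

import torch

t = torch.tensor([1, 2, 3])                 # int64 tensor

print(2 ** t)                               # __rpow__: tensor([2, 4, 8]), still int64
print(torch.result_type(2.5, t))            # torch.float32, the dtype 2.5 ** t will have
print((2.5 ** t).dtype)                     # torch.float32

try:
    torch.pow(t, -2)                        # negative integer exponent on an integer tensor
except RuntimeError as e:
    print(e)                                # "Integers to negative integer powers are not allowed."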
def test_int_and_float_pow(self, device): - def _test_int_and_float_pow(dt, low, high, dev): test_cases = ( ((4, 4), 0, (4, 1)), @@ -1372,23 +1561,59 @@ def _test_int_and_float_pow(dt, low, high, dev): ((), 2, ()), ) for base_shape, exp_scalar, exp_shape in test_cases: - base_tensor = make_tensor(base_shape, dtype=dt, device=dev, low=low, high=high) + base_tensor = make_tensor( + base_shape, dtype=dt, device=dev, low=low, high=high + ) # int tensors don't take negative exponents - if dt in [torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64]: - exp_tensor = make_tensor(exp_shape, dtype=dt, device=dev, low=0, high=high) + if dt in [ + torch.uint8, + torch.int8, + torch.int16, + torch.int32, + torch.int64, + ]: + exp_tensor = make_tensor( + exp_shape, dtype=dt, device=dev, low=0, high=high + ) else: - exp_tensor = make_tensor(exp_shape, dtype=dt, device=dev, low=low, high=high) + exp_tensor = make_tensor( + exp_shape, dtype=dt, device=dev, low=low, high=high + ) self._test_pow(base_tensor, exp_scalar) self._test_pow(base_tensor, exp_tensor) # test non-contiguous tensors as well - base_tensor = make_tensor(base_shape, dtype=dt, device=dev, low=low, high=high, - noncontiguous=True) - if dt in [torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64]: - exp_tensor = make_tensor(exp_shape, dtype=dt, device=dev, low=0, high=high, - noncontiguous=True) + base_tensor = make_tensor( + base_shape, + dtype=dt, + device=dev, + low=low, + high=high, + noncontiguous=True, + ) + if dt in [ + torch.uint8, + torch.int8, + torch.int16, + torch.int32, + torch.int64, + ]: + exp_tensor = make_tensor( + exp_shape, + dtype=dt, + device=dev, + low=0, + high=high, + noncontiguous=True, + ) else: - exp_tensor = make_tensor(exp_shape, dtype=dt, device=dev, low=low, high=high, - noncontiguous=True) + exp_tensor = make_tensor( + exp_shape, + dtype=dt, + device=dev, + low=low, + high=high, + noncontiguous=True, + ) self._test_pow(base_tensor, exp_scalar) self._test_pow(base_tensor, exp_tensor) @@ -1397,12 +1622,12 @@ def _test_int_and_float_pow(dt, low, high, dev): _test_int_and_float_pow(torch.int16, -5, 5, device) _test_int_and_float_pow(torch.int64, -10, 10, device) _test_int_and_float_pow(torch.int32, -10, 10, device) - _test_int_and_float_pow(torch.float16, 0., 5., device) - _test_int_and_float_pow(torch.float32, 0., 10., device) - _test_int_and_float_pow(torch.float64, 0., 10., device) + _test_int_and_float_pow(torch.float16, 0.0, 5.0, device) + _test_int_and_float_pow(torch.float32, 0.0, 10.0, device) + _test_int_and_float_pow(torch.float64, 0.0, 10.0, device) # pow's output would have some NaNs as well - _test_int_and_float_pow(torch.float32, -10., 10., device) - _test_int_and_float_pow(torch.float64, -10., 10., device) + _test_int_and_float_pow(torch.float32, -10.0, 10.0, device) + _test_int_and_float_pow(torch.float64, -10.0, 10.0, device) # Tests that a Runtime error occurs when a base tensor cannot be resized # by pow's inplace variant due to PyTorch's broadcasting semantics. 
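The next hunk reworks test_pow_inplace_resizing_exception, which checks that pow_ refuses to broadcast its base to a larger shape, since in-place ops never resize their destination. A small sketch of the behavior under test, using one of the shape pairs from the test (illustrative only, not part of the patch):

import torch

base = torch.rand(2, 1)
exponent = torch.rand(2, 2)

print(torch.pow(base, exponent).shape)      # torch.Size([2, 2]); out-of-place broadcasts freely

try:
    base.pow_(exponent)                     # would require resizing base from (2, 1) to (2, 2)
except RuntimeError as e:
    print(e)                                # "... doesn't match the broadcast shape ..."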
@@ -1413,19 +1638,33 @@ def test_pow_inplace_resizing_exception(self, device): ((2, 1), (2, 2)), ((2, 2), (2, 1, 1)), ) - test_inputs = list((make_tensor(base_size, dtype=torch.float64, device=device, - high=10., low=0.), - make_tensor(exp_size, dtype=torch.float64, device=device, - high=10., low=0.)) - for base_size, exp_size in test_cases) + test_inputs = list( + ( + make_tensor( + base_size, dtype=torch.float64, device=device, high=10.0, low=0.0 + ), + make_tensor( + exp_size, dtype=torch.float64, device=device, high=10.0, low=0.0 + ), + ) + for base_size, exp_size in test_cases + ) for base, exponent in test_inputs: regex = "doesn't match the broadcast shape" self.assertRaisesRegex(RuntimeError, regex, base.pow_, exponent) def test_int_tensor_pow_neg_ints(self, device): - ints = [torch.iinfo(torch.int32).min, - -3, -2, -1, 0, 1, 2, 3, - torch.iinfo(torch.int32).max] + ints = [ + torch.iinfo(torch.int32).min, + -3, + -2, + -1, + 0, + 1, + 2, + 3, + torch.iinfo(torch.int32).max, + ] neg_ints = [torch.iinfo(torch.int32).min, -3, -2, -1] tensor = torch.tensor(ints, dtype=torch.int32, device=device) for pow in neg_ints: @@ -1440,16 +1679,17 @@ def test_long_tensor_pow_floats(self, device): @dtypes(*[torch.float32, torch.float64]) def test_float_scalar_pow_float_tensor(self, device, dtype): - floats = [2.0, -3 / 2, -1.0, -1 / 2, -1 / 3, 0.0, - 1 / 3, 1 / 2, 1.0, 3 / 2, 2.0] + floats = [2.0, -3 / 2, -1.0, -1 / 2, -1 / 3, 0.0, 1 / 3, 1 / 2, 1.0, 3 / 2, 2.0] exponent_shapes = ( (1,), (2, 2), (2, 1), (2, 2, 2), ) - tensors = list(make_tensor(shape, dtype=dtype, device=device, low=0) - for shape in exponent_shapes) + tensors = list( + make_tensor(shape, dtype=dtype, device=device, low=0) + for shape in exponent_shapes + ) floats_tensor = torch.tensor(floats, dtype=dtype, device=device) for base in floats: self._test_pow(base, floats_tensor) @@ -1458,38 +1698,50 @@ def test_float_scalar_pow_float_tensor(self, device, dtype): @onlyCUDA def test_cuda_tensor_pow_scalar_tensor(self, device): - cuda_tensors = [torch.randn((3, 3), device=device), torch.tensor(3.0, device=device)] - scalar_tensors = [torch.tensor(5.0, device='cpu'), torch.tensor(-3), torch.tensor(1)] + cuda_tensors = [ + torch.randn((3, 3), device=device), + torch.tensor(3.0, device=device), + ] + scalar_tensors = [ + torch.tensor(5.0, device="cpu"), + torch.tensor(-3), + torch.tensor(1), + ] for base, exp in product(cuda_tensors, scalar_tensors): self._test_pow(base, exp) @onlyCUDA def test_cpu_tensor_pow_cuda_scalar_tensor(self, device): - cuda_tensors = [torch.tensor(5.0, device='cuda'), torch.tensor(-3, device='cuda')] + cuda_tensors = [ + torch.tensor(5.0, device="cuda"), + torch.tensor(-3, device="cuda"), + ] for exp in cuda_tensors: - base = torch.randn((3, 3), device='cpu') - regex = 'Expected all tensors to be on the same device, but found at least two devices, cuda.* and cpu!' + base = torch.randn((3, 3), device="cpu") + regex = "Expected all tensors to be on the same device, but found at least two devices, cuda.* and cpu!" 
self.assertRaisesRegex(RuntimeError, regex, torch.pow, base, exp) for exp in cuda_tensors: # Binary ops with a cpu + cuda tensor are allowed if the cpu tensor has 0 dimension - base = torch.tensor(3.0, device='cpu') + base = torch.tensor(3.0, device="cpu") self._test_pow(base, exp) @onlyCUDA @dtypes(torch.complex64, torch.complex128) def test_pow_cuda_complex_extremal_failing(self, device, dtype): - t = torch.tensor(complex(-1., float('inf')), dtype=dtype, device=device) + t = torch.tensor(complex(-1.0, float("inf")), dtype=dtype, device=device) with self.assertRaises(AssertionError): cuda_out = t.pow(2) cpu_out = t.cpu().pow(2) self.assertEqual(cpu_out, cuda_out) @onlyNativeDeviceTypes - @dtypes(*(get_all_dtypes(include_bool=False, include_bfloat16=False))) + @dtypes(*all_types_and_complex_and(torch.half)) def test_complex_scalar_pow_tensor(self, device, dtype): - complexes = [0.5j, 1. + 1.j, -1.5j, 2.2 - 1.6j, 1 + 0j] - first_exp = make_tensor((100,), device, dtype, low=-2, high=2) - second_exp = make_tensor((100,), device, dtype, low=-2, high=2, noncontiguous=True) + complexes = [0.5j, 1.0 + 1.0j, -1.5j, 2.2 - 1.6j, 1 + 0j] + first_exp = make_tensor((100,), dtype=dtype, device=device, low=-2, high=2) + second_exp = make_tensor( + (100,), dtype=dtype, device=device, low=-2, high=2, noncontiguous=True + ) first_exp[0] = first_exp[10] = first_exp[20] = 0 second_exp[0] = second_exp[10] = second_exp[20] = 0 for base in complexes: @@ -1497,20 +1749,32 @@ def test_complex_scalar_pow_tensor(self, device, dtype): self._test_pow(base, second_exp) @onlyNativeDeviceTypes + @skipMeta def test_pow_scalar_type_promotion(self, device): # Test against a scalar and non-scalar input inputs = [17, [17]] for input in inputs: # We expect the computation to be performed in uint8 (overflowing to 0), and then cast to int64 input_tensor_uint8 = torch.tensor(input, dtype=torch.uint8, device=device) - out_uint8_computation = torch.pow(2, input_tensor_uint8, out=torch.tensor(0, dtype=torch.int64, device=device)) + out_uint8_computation = torch.pow( + 2, + input_tensor_uint8, + out=torch.tensor(0, dtype=torch.int64, device=device), + ) # Computation should run in int64, and not overflow input_tensor_int64 = torch.tensor(input, dtype=torch.int64, device=device) - out_int64_computation = torch.pow(2, input_tensor_int64, out=torch.tensor(0, dtype=torch.int64, device=device)) + out_int64_computation = torch.pow( + 2, + input_tensor_int64, + out=torch.tensor(0, dtype=torch.int64, device=device), + ) self.assertNotEqual(out_uint8_computation, out_int64_computation) - self.assertEqual(out_uint8_computation.to(dtype=torch.uint8), out_int64_computation.to(dtype=torch.uint8)) + self.assertEqual( + out_uint8_computation.to(dtype=torch.uint8), + out_int64_computation.to(dtype=torch.uint8), + ) def test_tensor_pow_tensor(self, device): def rotate(l, n): @@ -1530,26 +1794,24 @@ def test_tensor_pow_tensor(values, torch_type, numpy_type): test_tensor_pow_tensor(ints, torch.int32, np.int32) test_tensor_pow_tensor(ints, torch.int64, np.int64) - floats = [-3.0, -2.0, -1.0, -1 / 2, -1 / 3, - 0.0, 1 / 3, 1 / 2, 1.0, 2.0, 3.0] + floats = [-3.0, -2.0, -1.0, -1 / 2, -1 / 3, 0.0, 1 / 3, 1 / 2, 1.0, 2.0, 3.0] test_tensor_pow_tensor(floats, torch.float16, np.float16) test_tensor_pow_tensor(floats, torch.float32, np.float32) test_tensor_pow_tensor(floats, torch.float64, np.float64) - def test_logical_xor_with_nontrivial_alignment(self, device): # test tensor that is not aligned to multiple of 16 bytes size = 128 - a = (torch.randn(size, 
device=device) > 0) - b = (torch.randn(size, device=device) > 0) - c = (torch.randn(size, device=device) > 0) + a = torch.randn(size, device=device) > 0 + b = torch.randn(size, device=device) > 0 + c = torch.randn(size, device=device) > 0 non_trivial_alignment = [1, 2, 4, 8, 15] for i in non_trivial_alignment: for j in non_trivial_alignment: for k in non_trivial_alignment: - a_ = a[i: 100 + i] - b_ = b[j: 100 + j] - c_ = c[k: 100 + k] + a_ = a[i : 100 + i] + b_ = b[j : 100 + j] + c_ = c[k : 100 + k] torch.logical_xor(a_, b_, out=c_) for x, y, z in zip(a_.tolist(), b_.tolist(), c_.tolist()): self.assertEqual(x ^ y, z) @@ -1572,7 +1834,7 @@ def test_add_with_tail(self, device, dtype): @deviceCountAtLeast(2) @onlyCUDA def test_cross_device_binary_ops(self, devices): - vals = (1., (2.,)) + vals = (1.0, (2.0,)) cpu_tensor = torch.randn(2, 2) def do_test(op, a, b): @@ -1585,11 +1847,18 @@ def do_test(op, a, b): with self.assertRaisesRegex(RuntimeError, "Expected all tensors.+"): op(cpu_tensor, a) - for op in (operator.add, torch.add, - operator.sub, torch.sub, - operator.mul, torch.mul, - operator.truediv, torch.true_divide, - operator.floordiv, torch.floor_divide): + for op in ( + operator.add, + torch.add, + operator.sub, + torch.sub, + operator.mul, + torch.mul, + operator.truediv, + torch.true_divide, + operator.floordiv, + torch.floor_divide, + ): for a, b in product(vals, vals): a = torch.tensor(a, device=devices[0]) b = torch.tensor(b, device=devices[1]) @@ -1602,7 +1871,7 @@ def do_test(op, a, b): @deviceCountAtLeast(2) @onlyCUDA def test_binary_op_scalar_device_unspecified(self, devices): - scalar_val = torch.tensor(1.) + scalar_val = torch.tensor(1.0) for default_device in devices: with torch.cuda.device(default_device): for device in devices: @@ -1621,7 +1890,7 @@ def test_div_and_floordiv_vs_python(self, device): # the quotient. See https://github.com/pytorch/pytorch/issues/43874. 
def _scalar_helper(python_op, torch_op): for a, b in product(range(-10, 10), range(-10, 10)): - for op in (lambda x: x * .5, lambda x: math.floor(x)): + for op in (lambda x: x * 0.5, lambda x: math.floor(x)): a = op(a) b = op(b) @@ -1648,7 +1917,7 @@ def _scalar_helper(python_op, torch_op): _scalar_helper(operator.truediv, operator.truediv) _scalar_helper(operator.truediv, torch.true_divide) - with self.assertWarnsOnceRegex(UserWarning, 'floor_divide'): + with self.assertWarnsOnceRegex(UserWarning, "floor_divide"): _scalar_helper(lambda a, b: math.trunc(a / b), operator.floordiv) _scalar_helper(lambda a, b: math.trunc(a / b), torch.floor_divide) @@ -1666,7 +1935,7 @@ def _wrapped_floordiv(a, b): scripted_div = torch.jit.script(_wrapped_div) scripted_floordiv = torch.jit.script(_wrapped_floordiv) for a, b in product(range(-10, 10), range(-10, 10)): - for op in (lambda x: x * .5, lambda x: math.floor(x)): + for op in (lambda x: x * 0.5, lambda x: math.floor(x)): a = op(a) b = op(b) @@ -1680,7 +1949,7 @@ def _wrapped_floordiv(a, b): b_t = torch.tensor(b, device=device) self.assertEqual(scripted_div(a_t, b_t), expected_div) - with self.assertWarnsOnceRegex(UserWarning, 'floor_divide'): + with self.assertWarnsOnceRegex(UserWarning, "floor_divide"): self.assertEqual(scripted_floordiv(a_t, b_t), expected_truncdiv) # Creates jitted functions of one tensor @@ -1705,13 +1974,13 @@ def _wrapped_rfloordiv_scalar(a): scripted_rfloordiv_scalar = torch.jit.script(_wrapped_rfloordiv_scalar) for a in range(-10, 10): - for op in (lambda x: x * .5, lambda x: math.floor(x)): + for op in (lambda x: x * 0.5, lambda x: math.floor(x)): a = op(a) a_t = torch.tensor(a, device=device) self.assertEqual(a / 5, scripted_div_scalar(a_t)) - with self.assertWarnsOnceRegex(UserWarning, 'floor_divide'): + with self.assertWarnsOnceRegex(UserWarning, "floor_divide"): self.assertEqual(math.trunc(a / 5), scripted_floordiv_scalar(a_t)) # Skips zero divisors @@ -1780,7 +2049,7 @@ def _wrapped_ifloordiv_scalar(a): scripted_floor_divide__scalar = torch.jit.script(_wrapped_floor_divide__scalar) for a, b in product(range(-10, 10), range(-10, 10)): - for op in (lambda x: x * .5, lambda x: math.floor(x)): + for op in (lambda x: x * 0.5, lambda x: math.floor(x)): a = op(a) b = op(b) @@ -1804,8 +2073,13 @@ def _wrapped_ifloordiv_scalar(a): self.assertEqual(tmp0.item(), expected_idiv) self.assertEqual(tmp1.item(), expected_idiv) - self.assertEqual(scripted_true_divide__tensor(a_t.clone(), b_t).item(), expected_idiv) - self.assertEqual(scripted_true_divide__scalar(a_t.clone()).item(), a / 5) + self.assertEqual( + scripted_true_divide__tensor(a_t.clone(), b_t).item(), + expected_idiv, + ) + self.assertEqual( + scripted_true_divide__scalar(a_t.clone()).item(), a / 5 + ) else: tmp = a_t.clone() with self.assertRaises(RuntimeError): @@ -1817,42 +2091,56 @@ def _wrapped_ifloordiv_scalar(a): with self.assertRaises(RuntimeError): scripted_true_divide__scalar(tmp) - if not a_t.is_floating_point() and b_t.is_floating_point(): # Inplace modification fails because a float tensor is required # if the divisor is a float tensor - with self.assertRaises(RuntimeError), self.assertWarnsOnceRegex(UserWarning, "floor_divide"): + with self.assertRaises(RuntimeError), self.assertWarnsOnceRegex( + UserWarning, "floor_divide" + ): a_t.clone().floor_divide_(b_t) - with self.assertRaises(RuntimeError), self.assertWarnsOnceRegex(UserWarning, "floor_divide"): + with self.assertRaises(RuntimeError), self.assertWarnsOnceRegex( + UserWarning, "floor_divide" + ): 
scripted_floor_divide_tensor(a_t.clone(), b_t) tmp = a_t.clone() - with self.assertRaises(RuntimeError), self.assertWarnsOnceRegex(UserWarning, "floor_divide"): + with self.assertRaises(RuntimeError), self.assertWarnsOnceRegex( + UserWarning, "floor_divide" + ): tmp //= b_t else: # Inplace modification is OK when both or neither tensor is # a float tensor with self.assertWarnsOnceRegex(UserWarning, "floor_divide"): - self.assertEqual(a_t.clone().floor_divide_(b_t).item(), expected_itruncdiv) - self.assertEqual(scripted_floor_divide__tensor(a_t.clone(), b_t).item(), expected_itruncdiv) + self.assertEqual( + a_t.clone().floor_divide_(b_t).item(), expected_itruncdiv + ) + self.assertEqual( + scripted_floor_divide__tensor(a_t.clone(), b_t).item(), + expected_itruncdiv, + ) tmp = a_t.clone() with self.assertWarnsOnceRegex(UserWarning, "floor_divide"): tmp //= b_t self.assertEqual(tmp.item(), expected_itruncdiv) with self.assertWarnsOnceRegex(UserWarning, "floor_divide"): - self.assertEqual(scripted_floor_divide__scalar(a_t), math.trunc(a / 5)) + self.assertEqual( + scripted_floor_divide__scalar(a_t), math.trunc(a / 5) + ) # Tests binary op equivalence with Python builtin ops # Also tests that reverse operations are equivalent to forward ops # NOTE: division ops are tested separately above def test_binary_ops_with_scalars(self, device): - for python_op, torch_op in ((operator.add, torch.add), - (operator.sub, torch.sub), - (operator.mul, torch.mul), - (operator.truediv, torch.div)): + for python_op, torch_op in ( + (operator.add, torch.add), + (operator.sub, torch.sub), + (operator.mul, torch.mul), + (operator.truediv, torch.div), + ): for a, b in product(range(-10, 10), range(-10, 10)): - for op in (lambda x: x * .5, lambda x: math.floor(x)): + for op in (lambda x: x * 0.5, lambda x: math.floor(x)): a = op(a) b = op(b) @@ -1869,28 +2157,56 @@ def test_binary_ops_with_scalars(self, device): for args in product(vals, vals): first, second = args - first_scalar = first if not isinstance(first, torch.Tensor) else first.item() - second_scalar = second if not isinstance(second, torch.Tensor) else second.item() + first_scalar = ( + first + if not isinstance(first, torch.Tensor) + else first.item() + ) + second_scalar = ( + second + if not isinstance(second, torch.Tensor) + else second.item() + ) expected = python_op(first_scalar, second_scalar) self.assertEqual(expected, python_op(first, second)) self.assertEqual(expected, torch_op(first, second)) - @dtypes(*product(get_all_dtypes(include_complex=False), get_all_dtypes(include_complex=False))) + @dtypes( + *product( + all_types_and(torch.half, torch.bfloat16, torch.bool), + all_types_and(torch.half, torch.bfloat16, torch.bool), + ) + ) def test_maximum_minimum_type_promotion(self, device, dtypes): a = torch.tensor((0, 1), device=device, dtype=dtypes[0]) b = torch.tensor((1, 0), device=device, dtype=dtypes[1]) - for op in (torch.maximum, torch.max, torch.fmax, torch.minimum, torch.min, torch.fmin): + for op in ( + torch.maximum, + torch.max, + torch.fmax, + torch.minimum, + torch.min, + torch.fmin, + ): result = op(a, b) self.assertEqual(result.dtype, torch.result_type(a, b)) - @dtypes(*(get_all_int_dtypes() + [torch.bool])) + @dtypes(*integral_types_and(torch.bool)) def test_maximum_minimum_int_and_bool(self, device, dtype): - ops = ((torch.maximum, torch.max, np.maximum), (torch.minimum, torch.min, np.minimum), - (torch.fmax, None, np.fmax), (torch.fmin, None, np.fmin)) + ops = ( + (torch.maximum, torch.max, np.maximum), + (torch.minimum, torch.min, 
np.minimum), + (torch.fmax, None, np.fmax), + (torch.fmin, None, np.fmin), + ) rng = np.random.default_rng() - a_np = np.array(rng.integers(-100, 100, size=10), dtype=torch_to_numpy_dtype_dict[dtype]) - b_np = np.array(rng.integers(-100, 100, size=10), dtype=torch_to_numpy_dtype_dict[dtype]) + a_np = np.array( + rng.integers(-100, 100, size=10), dtype=torch_to_numpy_dtype_dict[dtype] + ) + b_np = np.array( + rng.integers(-100, 100, size=10), dtype=torch_to_numpy_dtype_dict[dtype] + ) for torch_op, alias, numpy_op in ops: a_tensor = torch.from_numpy(a_np).to(device=device, dtype=dtype) @@ -1910,10 +2226,14 @@ def test_maximum_minimum_int_and_bool(self, device, dtype): self.assertEqual(out, numpy_result) @precisionOverride({torch.bfloat16: 1e-2}) - @dtypes(*(get_all_fp_dtypes())) + @dtypes(*(floating_types_and(torch.half, torch.bfloat16))) def test_maximum_minimum_float(self, device, dtype): - ops = ((torch.maximum, torch.max, np.maximum), (torch.minimum, torch.min, np.minimum), - (torch.fmax, None, np.fmax), (torch.fmin, None, np.fmin)) + ops = ( + (torch.maximum, torch.max, np.maximum), + (torch.minimum, torch.min, np.minimum), + (torch.fmax, None, np.fmax), + (torch.fmin, None, np.fmin), + ) if dtype == torch.bfloat16: a_np = np.random.randn(10).astype(np.float64) @@ -1938,14 +2258,36 @@ def test_maximum_minimum_float(self, device, dtype): self.assertEqual(tensor_result, numpy_result, exact_dtype=False) self.assertEqual(out, numpy_result, exact_dtype=False) - @dtypes(*(get_all_fp_dtypes())) + @dtypes(*(floating_types_and(torch.half, torch.bfloat16))) def test_maximum_minimum_float_nan_and_inf(self, device, dtype): # np.maximum and np.minimum functions compare input arrays element-wisely. # if one of the elements being compared is a NaN, then that element is returned. 
- ops = ((torch.maximum, torch.max, np.maximum), (torch.minimum, torch.min, np.minimum), - (torch.fmax, None, np.fmax), (torch.fmin, None, np.fmin)) - a_vals = (float('inf'), -float('inf'), float('nan'), float('inf'), float('nan'), float('nan'), 1, float('nan')) - b_vals = (-float('inf'), float('inf'), float('inf'), float('nan'), float('nan'), 0, float('nan'), -5) + ops = ( + (torch.maximum, torch.max, np.maximum), + (torch.minimum, torch.min, np.minimum), + (torch.fmax, None, np.fmax), + (torch.fmin, None, np.fmin), + ) + a_vals = ( + float("inf"), + -float("inf"), + float("nan"), + float("inf"), + float("nan"), + float("nan"), + 1, + float("nan"), + ) + b_vals = ( + -float("inf"), + float("inf"), + float("inf"), + float("nan"), + float("nan"), + 0, + float("nan"), + -5, + ) if dtype == torch.bfloat16: a_np = np.array(a_vals, dtype=np.float64) b_np = np.array(b_vals, dtype=np.float64) @@ -1974,16 +2316,32 @@ def test_maximum_minimum_float_nan_and_inf(self, device, dtype): self.assertEqual(tensor_result, numpy_result) self.assertEqual(out, numpy_result) - @dtypes(*product(get_all_complex_dtypes(), get_all_dtypes())) + @dtypes( + *product( + complex_types(), + all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool), + ) + ) def test_maximum_minimum_complex(self, device, dtypes): - for torch_op in (torch.maximum, torch.minimum, torch.max, torch.min, torch.fmax, torch.fmin): - with self.assertRaisesRegex(RuntimeError, '.+not implemented for.+'): - torch_op(torch.ones(1, device=device, dtype=dtypes[0]), - torch.ones(1, device=device, dtype=dtypes[1])) - - with self.assertRaisesRegex(RuntimeError, '.+not implemented for.+'): - torch_op(torch.ones(1, device=device, dtype=dtypes[1]), - torch.ones(1, device=device, dtype=dtypes[0])) + for torch_op in ( + torch.maximum, + torch.minimum, + torch.max, + torch.min, + torch.fmax, + torch.fmin, + ): + with self.assertRaisesRegex(RuntimeError, ".+not implemented for.+"): + torch_op( + torch.ones(1, device=device, dtype=dtypes[0]), + torch.ones(1, device=device, dtype=dtypes[1]), + ) + + with self.assertRaisesRegex(RuntimeError, ".+not implemented for.+"): + torch_op( + torch.ones(1, device=device, dtype=dtypes[1]), + torch.ones(1, device=device, dtype=dtypes[0]), + ) @onlyCUDA def test_maximum_minimum_cross_device(self, device): @@ -1992,12 +2350,14 @@ def test_maximum_minimum_cross_device(self, device): ops = (torch.maximum, torch.minimum) for torch_op in ops: - with self.assertRaisesRegex(RuntimeError, - "Expected all tensors to be on the same device"): + with self.assertRaisesRegex( + RuntimeError, "Expected all tensors to be on the same device" + ): torch_op(a, b) - with self.assertRaisesRegex(RuntimeError, - "Expected all tensors to be on the same device"): + with self.assertRaisesRegex( + RuntimeError, "Expected all tensors to be on the same device" + ): torch_op(b, a) # test cuda tensor and cpu scalar @@ -2016,7 +2376,12 @@ def test_maximum_minimum_cross_device(self, device): self.assertEqual(tensor_result_1, numpy_result_1) self.assertEqual(tensor_result_2, numpy_result_2) - @dtypes(*product(get_all_fp_dtypes(), get_all_fp_dtypes())) + @dtypes( + *product( + floating_types_and(torch.half, torch.bfloat16), + floating_types_and(torch.half, torch.bfloat16), + ) + ) def test_maximum_and_minimum_subgradient(self, device, dtypes): def run_test(f, a, b, expected_a_grad, expected_b_grad): a = torch.tensor(a, requires_grad=True, device=device, dtype=dtypes[0]) @@ -2026,8 +2391,47 @@ def run_test(f, a, b, expected_a_grad, expected_b_grad): 
self.assertEqual(a.grad, expected_a_grad) self.assertEqual(b.grad, expected_b_grad) - run_test(torch.maximum, [0., 1., 2.], [1., 1., 1.], [0., 0.5, 1.], [1., 0.5, 0.]) - run_test(torch.minimum, [0., 1., 2.], [1., 1., 1.], [1., 0.5, 0.], [0., 0.5, 1.]) + run_test( + torch.maximum, + [0.0, 1.0, 2.0], + [1.0, 1.0, 1.0], + [0.0, 0.5, 1.0], + [1.0, 0.5, 0.0], + ) + run_test( + torch.minimum, + [0.0, 1.0, 2.0], + [1.0, 1.0, 1.0], + [1.0, 0.5, 0.0], + [0.0, 0.5, 1.0], + ) + + def test_maximum_minimum_forward_ad_float32(self, device): + # TODO: This should really be covered by OpInfo but it isn't. The problem + # is that our gradient tests test using float64 but it should also test + # float32 + x = torch.randn(3, device=device, dtype=torch.float32) + y = torch.randn(3, device=device, dtype=torch.float32) + tx = torch.randn(3, device=device, dtype=torch.float32) + ty = torch.randn(3, device=device, dtype=torch.float32) + + with fwAD.dual_level(): + x_dual = fwAD.make_dual(x, tx) + y_dual = fwAD.make_dual(y, ty) + result = torch.maximum(x_dual, y_dual) + _, result_tangent = fwAD.unpack_dual(result) + + expected = torch.where(x > y, tx, ty) + self.assertEqual(result_tangent, expected) + + with fwAD.dual_level(): + x_dual = fwAD.make_dual(x, tx) + y_dual = fwAD.make_dual(y, ty) + result = torch.minimum(x_dual, y_dual) + _, result_tangent = fwAD.unpack_dual(result) + + expected = torch.where(x < y, tx, ty) + self.assertEqual(result_tangent, expected) # TODO: tests like this should be generic @dtypesIfCUDA(torch.half, torch.float, torch.double) @@ -2039,24 +2443,37 @@ def test_mul_intertype_scalar(self, device, dtype): self.assertEqual(x * y, 4.5) self.assertEqual(y * x, 4.5) - with self.assertRaisesRegex(RuntimeError, "can't be cast to the desired output type"): + with self.assertRaisesRegex( + RuntimeError, "can't be cast to the desired output type" + ): y *= x x *= y self.assertEqual(x, 4.5) @onlyCPU - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) def test_sub(self, device, dtype): - m1 = torch.tensor([2.34, 4.44], dtype=dtype, device=device) - m2 = torch.tensor([1.23, 2.33], dtype=dtype, device=device) + if dtype in integral_types(): + # Before Python 3.10, floats were implicitly converted to ints, but with + # DeprecationWarning: an integer is required (got type float). + # Implicit conversion to integers using __int__ is deprecated, + # and may be removed in a future version of Python. + # Since Python 3.10, that attempt gives an error. + m1 = torch.tensor([2, 4], dtype=dtype, device=device) + m2 = torch.tensor([1, 2], dtype=dtype, device=device) + diff = torch.tensor([1, 2], dtype=dtype) + else: + m1 = torch.tensor([2.34, 4.44], dtype=dtype, device=device) + m2 = torch.tensor([1.23, 2.33], dtype=dtype, device=device) + diff = torch.tensor([1.11, 2.11], dtype=dtype) if dtype == torch.bool: self.assertRaises(RuntimeError, lambda: m1 - m2) - elif (dtype == torch.bfloat16 or dtype == torch.half): + elif dtype == torch.bfloat16 or dtype == torch.half: # bfloat16 has a lower precision so we have to have a separate check for it - self.assertEqual(m1 - m2, torch.tensor([1.11, 2.11], dtype=dtype), atol=0.01, rtol=0) + self.assertEqual(m1 - m2, diff, atol=0.01, rtol=0) else: - self.assertEqual(m1 - m2, torch.tensor([1.11, 2.11], dtype=dtype)) + self.assertEqual(m1 - m2, diff) # TODO: what is this test testing? 
@onlyCPU @@ -2088,27 +2505,43 @@ def test_min_max_binary_op_nan(self, device, dtype): b = torch.rand(1000, dtype=dtype, device=device) # 0:250: a -- nan, b -- not nan - a[:250] = float('nan') + a[:250] = float("nan") # 250:500: a -- not nan, b -- nan - b[250:500] = float('nan') + b[250:500] = float("nan") # 500:750: a and b both nan - a[500:750] = float('nan') - b[500:750] = float('nan') + a[500:750] = float("nan") + b[500:750] = float("nan") # 750:1000: neither nan ma = torch.max(a, b) mi = torch.min(a, b) for i in range(750): - self.assertTrue(torch.isnan(ma[i]), "max(a, b): {}, a: {}, b: {}".format(ma[i], a[i], b[i])) - self.assertTrue(torch.isnan(mi[i]), "min(a, b): {}, a: {}, b: {}".format(mi[i], a[i], b[i])) + self.assertTrue( + torch.isnan(ma[i]), + "max(a, b): {}, a: {}, b: {}".format(ma[i], a[i], b[i]), + ) + self.assertTrue( + torch.isnan(mi[i]), + "min(a, b): {}, a: {}, b: {}".format(mi[i], a[i], b[i]), + ) for i in range(750, 1000): - self.assertFalse(torch.isnan(ma[i]), "max(a, b): {}, a: {}, b: {}".format(ma[i], a[i], b[i])) - self.assertFalse(torch.isnan(mi[i]), "min(a, b): {}, a: {}, b: {}".format(mi[i], a[i], b[i])) + self.assertFalse( + torch.isnan(ma[i]), + "max(a, b): {}, a: {}, b: {}".format(ma[i], a[i], b[i]), + ) + self.assertFalse( + torch.isnan(mi[i]), + "min(a, b): {}, a: {}, b: {}".format(mi[i], a[i], b[i]), + ) - @dtypes(*product(get_all_dtypes(include_complex=False), - get_all_dtypes(include_complex=False))) + @dtypes( + *product( + all_types_and(torch.half, torch.bfloat16, torch.bool), + all_types_and(torch.half, torch.bfloat16, torch.bool), + ) + ) def test_copysign(self, device, dtypes): def _test_copysign_numpy(a, b): torch_result = torch.copysign(a, b) @@ -2125,7 +2558,7 @@ def _test_copysign_numpy(a, b): expected = torch.from_numpy(np.copysign(np_a, np_b)) # To handle inconsistencies of type promotion between PyTorch and Numpy # Applied for both arguments having integral precision and bfloat16 - types = [torch.bool, torch.bfloat16] + get_all_int_dtypes() + types = integral_types_and(torch.bool, torch.bfloat16) if a.dtype in types or b.dtype in types: promoted_type = torch.promote_types(torch_result.dtype, expected.dtype) torch_result = torch_result.to(promoted_type) @@ -2140,8 +2573,10 @@ def _test_copysign_numpy(a, b): # Special case: NaN conversions between FP32 and FP16 is not bitwise # equivalent to pass this assertion. 
if a.dtype != torch.float16 and b.dtype != torch.float16: - self.assertEqual(torch.copysign(torch.tensor(1.0), torch_result), - torch.copysign(torch.tensor(1.0), expected)) + self.assertEqual( + torch.copysign(torch.tensor(1.0), torch_result), + torch.copysign(torch.tensor(1.0), expected), + ) # Compare Result with NumPy # Type promotion @@ -2159,52 +2594,76 @@ def _test_copysign_numpy(a, b): _test_copysign_numpy(a, b) # 0.0/-0.0/inf/-inf/nan - cases = [0.0, -0.0, float('inf'), float('-inf'), float('nan')] + cases = [0.0, -0.0, float("inf"), float("-inf"), float("nan")] # torch.bfloat16 can not hold '-nan' # torch.half can not hold '-nan' on CUDA types = [torch.float32, torch.float64] - if device == 'cpu': + if device == "cpu": types.append(torch.float16) if dtypes[0] in types: b = make_tensor((10, 10), device=device, dtype=dtypes[1], low=-9, high=9) for case in cases: - _test_copysign_numpy(torch.tensor([case], device=device, dtype=dtypes[0]), b) + _test_copysign_numpy( + torch.tensor([case], device=device, dtype=dtypes[0]), b + ) - if dtypes[1] in get_all_fp_dtypes(): + if dtypes[1] in floating_types_and(torch.half, torch.bfloat16): a = make_tensor((10, 10), device=device, dtype=dtypes[0], low=-9, high=9) for case in cases: - _test_copysign_numpy(a, torch.tensor([case], device=device, dtype=dtypes[1])) - - @dtypes(*product(get_all_fp_dtypes(), - get_all_fp_dtypes())) + _test_copysign_numpy( + a, torch.tensor([case], device=device, dtype=dtypes[1]) + ) + + @dtypes( + *product( + floating_types_and(torch.half, torch.bfloat16), + floating_types_and(torch.half, torch.bfloat16), + ) + ) def test_copysign_subgradient(self, device, dtypes): # Input is 0.0 - x = torch.tensor([0.0, 0.0, 0.0], dtype=dtypes[0], device=device, requires_grad=True) - y = torch.tensor([-1.0, 0.0, 1.0], dtype=dtypes[1], device=device, requires_grad=True) + x = torch.tensor( + [0.0, 0.0, 0.0], dtype=dtypes[0], device=device, requires_grad=True + ) + y = torch.tensor( + [-1.0, 0.0, 1.0], dtype=dtypes[1], device=device, requires_grad=True + ) out = torch.copysign(x, y) out.sum().backward() self.assertEqual(x.grad.tolist(), [0.0, 0.0, 0.0]) self.assertEqual(y.grad.tolist(), [0.0] * 3) # Input is -0.0 - x = torch.tensor([-0.0, -0.0, -0.0], dtype=dtypes[0], device=device, requires_grad=True) - y = torch.tensor([-1.0, 0.0, 1.0], dtype=dtypes[1], device=device, requires_grad=True) + x = torch.tensor( + [-0.0, -0.0, -0.0], dtype=dtypes[0], device=device, requires_grad=True + ) + y = torch.tensor( + [-1.0, 0.0, 1.0], dtype=dtypes[1], device=device, requires_grad=True + ) out = torch.copysign(x, y) out.sum().backward() self.assertEqual(x.grad.tolist(), [0.0, 0.0, 0.0]) self.assertEqual(y.grad.tolist(), [0.0] * 3) # Other is 0.0 - x = torch.tensor([-1.0, 0.0, 1.0], dtype=dtypes[0], device=device, requires_grad=True) - y = torch.tensor([0.0, 0.0, 0.0], dtype=dtypes[1], device=device, requires_grad=True) + x = torch.tensor( + [-1.0, 0.0, 1.0], dtype=dtypes[0], device=device, requires_grad=True + ) + y = torch.tensor( + [0.0, 0.0, 0.0], dtype=dtypes[1], device=device, requires_grad=True + ) out = torch.copysign(x, y) out.sum().backward() self.assertEqual(x.grad.tolist(), [-1.0, 0.0, 1.0]) self.assertEqual(y.grad.tolist(), [0.0] * 3) # Other is -0.0 - x = torch.tensor([-1.0, 0.0, 1.0], dtype=dtypes[0], device=device, requires_grad=True) - y = torch.tensor([-0.0, -0.0, -0.0], dtype=dtypes[1], device=device, requires_grad=True) + x = torch.tensor( + [-1.0, 0.0, 1.0], dtype=dtypes[0], device=device, requires_grad=True + ) + y = 
torch.tensor( + [-0.0, -0.0, -0.0], dtype=dtypes[1], device=device, requires_grad=True + ) out = torch.copysign(x, y) out.sum().backward() self.assertEqual(x.grad.tolist(), [1.0, 0.0, -1.0]) @@ -2212,9 +2671,10 @@ def test_copysign_subgradient(self, device, dtypes): @dtypes(torch.bfloat16, torch.float) def test_div(self, device, dtype): - for op, method, inplace in ((torch.div, torch.Tensor.div, torch.Tensor.div_), - (torch.true_divide, torch.Tensor.true_divide, - torch.Tensor.true_divide_)): + for op, method, inplace in ( + (torch.div, torch.Tensor.div, torch.Tensor.div_), + (torch.true_divide, torch.Tensor.true_divide, torch.Tensor.true_divide_), + ): m1 = torch.randn(10, 10, dtype=torch.float, device=device).to(dtype=dtype) res1 = m1.clone() inplace(res1[:, 3], 2) @@ -2225,40 +2685,48 @@ def test_div(self, device, dtype): if dtype == torch.bfloat16: a1 = torch.tensor([4.2, 6.2], dtype=dtype, device=device) - a2 = torch.tensor([2., 2.], dtype=dtype, device=device) - self.assertEqual(op(a1, a2), - torch.tensor([2.1, 3.1], dtype=dtype, device=device), - atol=0.01, rtol=0) + a2 = torch.tensor([2.0, 2.0], dtype=dtype, device=device) + self.assertEqual( + op(a1, a2), + torch.tensor([2.1, 3.1], dtype=dtype, device=device), + atol=0.01, + rtol=0, + ) self.assertEqual(method(a1, a2), op(a1, a2)) @dtypes(torch.bfloat16, torch.float) def test_true_divide_out(self, device, dtype): a1 = torch.tensor([4.2, 6.2], dtype=dtype, device=device) - a2 = torch.tensor([2., 2.], dtype=dtype, device=device) + a2 = torch.tensor([2.0, 2.0], dtype=dtype, device=device) res = torch.empty_like(a1) - self.assertEqual(torch.true_divide(a1, a2, out=res), - torch.tensor([2.1, 3.1], dtype=dtype, device=device), - atol=0.01, rtol=0) + self.assertEqual( + torch.true_divide(a1, a2, out=res), + torch.tensor([2.1, 3.1], dtype=dtype, device=device), + atol=0.01, + rtol=0, + ) @onlyCUDA @dtypes(torch.half) def test_divmul_scalar(self, device, dtype): - x = torch.tensor(100., device=device, dtype=dtype) + x = torch.tensor(100.0, device=device, dtype=dtype) x_ref = x.float() scale = 1e5 res = x.div(scale) expected = x_ref.div(scale) - self.assertEqual(res, expected.to(dtype), atol=0., rtol=0.) + self.assertEqual(res, expected.to(dtype), atol=0.0, rtol=0.0) x = torch.tensor(1e-5, device=device, dtype=dtype) x_ref = x.float() res = x.mul(scale) expected = x_ref.mul(scale) - self.assertEqual(res, expected.to(dtype), atol=0., rtol=0.) + self.assertEqual(res, expected.to(dtype), atol=0.0, rtol=0.0) res = scale * x - self.assertEqual(res, expected.to(dtype), atol=0., rtol=0.) 
+ self.assertEqual(res, expected.to(dtype), atol=0.0, rtol=0.0) - @dtypesIfCUDA(*set(get_all_math_dtypes('cuda')) - {torch.complex64, torch.complex128}) - @dtypes(*set(get_all_math_dtypes('cpu')) - {torch.complex64, torch.complex128}) + @dtypesIfCUDA( + *set(get_all_math_dtypes("cuda")) - {torch.complex64, torch.complex128} + ) + @dtypes(*set(get_all_math_dtypes("cpu")) - {torch.complex64, torch.complex128}) def test_floor_divide_tensor(self, device, dtype): x = torch.randn(10, device=device).mul(30).to(dtype) y = torch.arange(1, 11, dtype=dtype, device=device) @@ -2270,14 +2738,18 @@ def test_floor_divide_tensor(self, device, dtype): self.assertEqual(z.dtype, x.dtype) self.assertEqual(z, z_alt) - @dtypesIfCUDA(*set(get_all_math_dtypes('cuda')) - {torch.complex64, torch.complex128}) - @dtypes(*set(get_all_math_dtypes('cpu')) - {torch.complex64, torch.complex128}) + @dtypesIfCUDA( + *set(get_all_math_dtypes("cuda")) - {torch.complex64, torch.complex128} + ) + @dtypes(*set(get_all_math_dtypes("cpu")) - {torch.complex64, torch.complex128}) def test_floor_divide_scalar(self, device, dtype): x = torch.randn(100, device=device).mul(10).to(dtype) with self.assertWarnsOnceRegex(UserWarning, "__floordiv__"): z = x // 3 - z_alt = torch.tensor([math.trunc(v.item() / 3.) for v in x], dtype=x.dtype, device=device) + z_alt = torch.tensor( + [math.trunc(v.item() / 3.0) for v in x], dtype=x.dtype, device=device + ) self.assertEqual(z.dtype, x.dtype) self.assertEqual(z, z_alt) @@ -2304,7 +2776,7 @@ def test_floor_divide_out(self, device, dtype): self.assertEqual(o, torch.floor_divide(x.float(), y.float())) @onlyCPU - @dtypes(*get_all_math_dtypes('cpu')) + @dtypes(*get_all_math_dtypes("cpu")) def test_rdiv(self, device, dtype): if dtype is torch.float16: return @@ -2316,7 +2788,7 @@ def test_rdiv(self, device, dtype): z = torch.tensor([30 / v.item() for v in x], device=device) self.assertEqual(y, z, exact_dtype=False) - @dtypes(*get_all_fp_dtypes(include_bfloat16=False)) + @dtypes(*floating_types_and(torch.half)) def test_fmod_remainder_by_zero_float(self, device, dtype): fn_list = (torch.fmod, torch.remainder) for fn in fn_list: @@ -2327,8 +2799,7 @@ def test_fmod_remainder_by_zero_float(self, device, dtype): self.assertTrue(torch.all(fn(x, zero).isnan())) @onlyNativeDeviceTypes # Check Issue https://github.com/pytorch/pytorch/issues/48130 - @skipCUDAIfRocm # Error happens on both ROCM and XLA - @dtypes(*get_all_int_dtypes()) + @dtypes(*integral_types()) def test_fmod_remainder_by_zero_integral(self, device, dtype): fn_list = (torch.fmod, torch.remainder) for fn in fn_list: @@ -2336,16 +2807,19 @@ def test_fmod_remainder_by_zero_integral(self, device, dtype): x = make_tensor((10, 10), device=device, dtype=dtype, low=-9, high=9) zero = torch.zeros_like(x) # RuntimeError on CPU - if self.device_type == 'cpu': + if self.device_type == "cpu": with self.assertRaisesRegex(RuntimeError, "ZeroDivisionError"): fn(x, zero) - # Different value for different dtype on CUDA: - # Due to it's an undefined behavior, CUDA returns a pattern of all 1s - # for integral dividend (other than int64) divided by zero. For int64, - # CUDA returns all 1s for negative dividend, half 1s for positive dividend. 
- # uint8: 0xff -> 255 - # int32: 0xffffffff -> -1 + elif torch.version.hip is not None: + # ROCm behavior: x % 0 is a no-op; x is returned + self.assertEqual(fn(x, zero), x) else: + # CUDA behavior: Different value for different dtype + # Due to it's an undefined behavior, CUDA returns a pattern of all 1s + # for integral dividend (other than int64) divided by zero. For int64, + # CUDA returns all 1s for negative dividend, half 1s for positive dividend. + # uint8: 0xff -> 255 + # int32: 0xffffffff -> -1 if dtype == torch.int64: self.assertEqual(fn(x, zero) == 4294967295, x >= 0) self.assertEqual(fn(x, zero) == -1, x < 0) @@ -2353,7 +2827,7 @@ def test_fmod_remainder_by_zero_integral(self, device, dtype): value = 255 if dtype == torch.uint8 else -1 self.assertTrue(torch.all(fn(x, zero) == value)) - @dtypes(*get_all_dtypes(include_bfloat16=False, include_bool=False, include_complex=False)) + @dtypes(*all_types_and(torch.half)) def test_fmod_remainder(self, device, dtype): # Use numpy as reference def _helper(x, mod, fns_list): @@ -2377,9 +2851,12 @@ def _helper(x, mod, fns_list): inplace_fn(x, mod) self.assertEqual(x, exp, exact_dtype=False) except RuntimeError as e: - self.assertRegex(str(e), "result type (Half|Float|Double) " - "can't be cast to the desired output " - "type (Byte|Char|Short|Int|Long)") + self.assertRegex( + str(e), + "result type (Half|Float|Double) " + "can't be cast to the desired output " + "type (Byte|Char|Short|Int|Long)", + ) x = make_tensor((10, 10), device=device, dtype=dtype, low=-9, high=9) # mod with same dtype as x @@ -2390,21 +2867,31 @@ def _helper(x, mod, fns_list): # Mods: Integer, Float, Tensor, Non-contiguous Tensor mods = [3, 2.3, mod, mod.t()] # mod with floating-point dtype - if dtype in get_all_int_dtypes(): - mod_float = make_tensor((10, 10), device=device, dtype=torch.float, low=-9, high=9) + if dtype in integral_types(): + mod_float = make_tensor( + (10, 10), device=device, dtype=torch.float, low=-9, high=9 + ) mod[mod == 0] = 1 mods.append(mod_float) for dividend, mod in product([x, x.t()], mods): - _helper(dividend, mod, - ((torch.fmod, torch.Tensor.fmod_, np.fmod), - (torch.remainder, torch.Tensor.remainder_, np.remainder),)) + _helper( + dividend, + mod, + ( + (torch.fmod, torch.Tensor.fmod_, np.fmod), + (torch.remainder, torch.Tensor.remainder_, np.remainder), + ), + ) # Tests for torch.remainder(scalar, tensor) for dividend, mod in product([5, 3.14], mods): if torch.is_tensor(mod): - _helper(dividend, mod, - ((torch.remainder, torch.Tensor.remainder_, np.remainder),)) + _helper( + dividend, + mod, + ((torch.remainder, torch.Tensor.remainder_, np.remainder),), + ) @dtypes(torch.float, torch.double) def test_remainder_fmod_large_dividend(self, device, dtype): @@ -2416,23 +2903,45 @@ def test_remainder_fmod_large_dividend(self, device, dtype): b = torch.tensor([bvalue], dtype=dtype, device=device) c = torch.remainder(a, b) d = torch.fmod(a, b) - self.assertTrue((b[0] > 0) == (c[0] > 0)) # remainder has same sign as divisor - self.assertTrue((a[0] > 0) == (d[0] > 0)) # fmod has same sign as dividend - self.assertTrue(abs(c[0]) < abs(b[0])) # remainder is within range of divisor - self.assertTrue(abs(d[0]) < abs(b[0])) # fmod is within range of divisor - if ((a[0] > 0) == (b[0] > 0)): - self.assertTrue(c[0] == d[0]) # remainder is same as fmod + self.assertTrue( + (b[0] > 0) == (c[0] > 0) + ) # remainder has same sign as divisor + self.assertTrue( + (a[0] > 0) == (d[0] > 0) + ) # fmod has same sign as dividend + self.assertTrue( + abs(c[0]) < 
abs(b[0]) + ) # remainder is within range of divisor + self.assertTrue( + abs(d[0]) < abs(b[0]) + ) # fmod is within range of divisor + if (a[0] > 0) == (b[0] > 0): + self.assertTrue(c[0] == d[0]) # remainder is same as fmod else: - self.assertTrue(abs(c[0] - d[0]) == abs(b[0])) # differ by one divisor + self.assertTrue( + abs(c[0] - d[0]) == abs(b[0]) + ) # differ by one divisor @dtypesIfCPU(torch.bfloat16, torch.float32, torch.float64) @dtypes(torch.float32, torch.float64) def test_hypot(self, device, dtype): inputs = [ - (torch.randn(10, device=device).to(dtype), torch.randn(10, device=device).to(dtype)), - (torch.randn((3, 3, 3), device=device).to(dtype), torch.randn((3, 3, 3), device=device).to(dtype)), - (torch.randn((10, 1), device=device).to(dtype), torch.randn((10, 1), device=device).to(dtype).transpose(0, 1)), - (torch.randint(100, (10, ), device=device, dtype=torch.long), torch.randn(10, device=device).to(dtype)) + ( + torch.randn(10, device=device).to(dtype), + torch.randn(10, device=device).to(dtype), + ), + ( + torch.randn((3, 3, 3), device=device).to(dtype), + torch.randn((3, 3, 3), device=device).to(dtype), + ), + ( + torch.randn((10, 1), device=device).to(dtype), + torch.randn((10, 1), device=device).to(dtype).transpose(0, 1), + ), + ( + torch.randint(100, (10,), device=device, dtype=torch.long), + torch.randn(10, device=device).to(dtype), + ), ] for input in inputs: actual = torch.hypot(input[0], input[1]) @@ -2511,8 +3020,8 @@ def test_nextafter(self, device, dtype): @onlyNativeDeviceTypes @dtypes(torch.bfloat16) def test_nextafter_bfloat16(self, device, dtype): - nan = float('nan') - inf = float('inf') + nan = float("nan") + inf = float("inf") cases = ( # (from, to, expected) (0, 1, 9.183549615799121e-41), @@ -2528,7 +3037,7 @@ def test_nextafter_bfloat16(self, device, dtype): (20, -3000, 19.875), (3000, -20, 2992.0), (-3000, 20, -2992.0), - (65536, 0, 65280.0) , + (65536, 0, 65280.0), (65536, inf, 66048.0), (-65536, 0, -65280.0), (-65536, -inf, -66048.0), @@ -2537,11 +3046,11 @@ def test_nextafter_bfloat16(self, device, dtype): (nan, nan, nan), (nan, inf, nan), (inf, nan, nan), - (inf, -inf, 3.3895313892515355e+38), - (-inf, inf, -3.3895313892515355e+38), - (inf, 0, 3.3895313892515355e+38), + (inf, -inf, 3.3895313892515355e38), + (-inf, inf, -3.3895313892515355e38), + (inf, 0, 3.3895313892515355e38), (0, inf, 9.183549615799121e-41), - (-inf, 0, -3.3895313892515355e+38), + (-inf, 0, -3.3895313892515355e38), (0, -inf, -9.183549615799121e-41), ) @@ -2574,10 +3083,17 @@ def reference_implementation(res2): sm1 = m1[:, 4] sm2 = m2[:, 4] # view as sm1.size() - sm2.set_(sm2.storage(), sm2.storage_offset(), sm1.size(), (sm2.stride()[0] * 10, sm2.stride()[0])) + sm2.set_( + sm2.storage(), + sm2.storage_offset(), + sm1.size(), + (sm2.stride()[0] * 10, sm2.stride()[0]), + ) res1 = torchfn(sm1, sm2) # reference_implementation assumes 1-d sm2 - sm2.set_(sm2.storage(), sm2.storage_offset(), m2[:, 4].size(), m2[:, 4].stride()) + sm2.set_( + sm2.storage(), sm2.storage_offset(), m2[:, 4].size(), m2[:, 4].stride() + ) res2 = reference_implementation(res1.clone()) self.assertEqual(res1, res2) @@ -2599,29 +3115,69 @@ def test_cmul(self, device, dtype): @onlyCPU @dtypes(torch.float) def test_cpow(self, device, dtype): - self._test_cop(torch.pow, lambda x, y: nan if x < 0 else math.pow(x, y), dtype, device) + self._test_cop( + torch.pow, lambda x, y: nan if x < 0 else math.pow(x, y), dtype, device + ) @onlyCPU @dtypes(torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64) def 
test_floor_divide_zero(self, device, dtype): a = torch.tensor([0, 1], dtype=dtype, device=device) b = torch.tensor([0, 1], dtype=dtype, device=device) - with self.assertRaisesRegex(RuntimeError, 'ZeroDivisionError'): + with self.assertRaisesRegex(RuntimeError, "ZeroDivisionError"): with self.assertWarnsOnceRegex(UserWarning, "floor_divide"): a // b @unittest.skipIf(TEST_WITH_ASAN, "Integer overflows are not allowed under ASAN") - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) def test_muldiv_scalar(self, device, dtype): - x = make_tensor((10, 3), device, dtype, low=None, high=None) - s = make_tensor((1,), 'cpu', dtype, low=None, high=None).item() + x = make_tensor((10, 3), dtype=dtype, device=device, low=None, high=None) + s = make_tensor((1,), dtype=dtype, device="cpu", low=None, high=None).item() y = torch.full_like(x, s) self.assertEqual(x * s, x * y) self.assertEqual(s * x, y * x) self.assertEqual(x / s, x / y) self.assertEqual(s / x, y / x) - @dtypes(*tuple(itertools.combinations_with_replacement(get_all_dtypes(), 2))) + # TODO: update make_tensor to support extremal additions and remove this in favor of make_tensor + def _generate_input(self, shape, dtype, device, with_extremal): + if shape == (): + x = torch.tensor((), dtype=dtype, device=device) + else: + if dtype.is_floating_point or dtype.is_complex: + # work around torch.randn not being implemented for bfloat16 + if dtype == torch.bfloat16: + x = torch.randn(*shape, device=device) * random.randint(30, 100) + x = x.to(torch.bfloat16) + else: + x = torch.randn( + *shape, dtype=dtype, device=device + ) * random.randint(30, 100) + x[torch.randn(*shape) > 0.5] = 0 + if with_extremal and dtype.is_floating_point: + # Use extremal values + x[torch.randn(*shape) > 0.5] = float("nan") + x[torch.randn(*shape) > 0.5] = float("inf") + x[torch.randn(*shape) > 0.5] = float("-inf") + elif with_extremal and dtype.is_complex: + x[torch.randn(*shape) > 0.5] = complex("nan") + x[torch.randn(*shape) > 0.5] = complex("inf") + x[torch.randn(*shape) > 0.5] = complex("-inf") + elif dtype == torch.bool: + x = torch.zeros(shape, dtype=dtype, device=device) + x[torch.randn(*shape) > 0.5] = True + else: + x = torch.randint(15, 100, shape, dtype=dtype, device=device) + + return x + + @dtypes( + *tuple( + itertools.combinations_with_replacement( + all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool), 2 + ) + ) + ) def test_comparison_ops_type_promotion_and_broadcasting(self, device, dtypes): # issue #42660 # testing all combinations of broadcasting and type promotion @@ -2630,37 +3186,45 @@ def compare_with_numpy_bin_op(torch_fn, np_fn, x, y, out=None): # working around the fact that numpy doesn't support bfloat16 # by letting numpy treat them as float32's x_np = x if x.dtype != torch.bfloat16 else x.to(torch.float32) - y_np = y.cpu().numpy() if y.dtype != torch.bfloat16 else y.to(torch.float32).cpu().numpy() - self.compare_with_numpy(lambda inp: torch_fn(inp, y, out=out) if out else torch_fn(inp, y), - lambda inp: np_fn(inp, y_np, out=out) if out else np_fn(inp, y_np), - x_np) + y_np = ( + y.cpu().numpy() + if y.dtype != torch.bfloat16 + else y.to(torch.float32).cpu().numpy() + ) + self.compare_with_numpy( + lambda inp: torch_fn(inp, y, out=out) if out else torch_fn(inp, y), + lambda inp: np_fn(inp, y_np, out=out) if out else np_fn(inp, y_np), + x_np, + ) - complex_op_denylist = [torch.lt, torch.le, torch.gt, torch.ge] # complex not supported - input_sizes = [ - (1,), - (10,), - (10, 1), 
- (1, 10), - (4, 10), - (64, 10), - (12, 3)] - op_pairs = [(torch.lt, np.less), - (torch.le, np.less_equal), - (torch.gt, np.greater), - (torch.ge, np.greater_equal), - (torch.eq, np.equal), - (torch.ne, np.not_equal), - (torch.logical_and, np.logical_and), - (torch.logical_or, np.logical_or), - (torch.logical_xor, np.logical_xor)] + complex_op_denylist = [ + torch.lt, + torch.le, + torch.gt, + torch.ge, + ] # complex not supported + input_sizes = [(1,), (10,), (10, 1), (1, 10), (4, 10), (64, 10), (12, 3)] + op_pairs = [ + (torch.lt, np.less), + (torch.le, np.less_equal), + (torch.gt, np.greater), + (torch.ge, np.greater_equal), + (torch.eq, np.equal), + (torch.ne, np.not_equal), + (torch.logical_and, np.logical_and), + (torch.logical_or, np.logical_or), + (torch.logical_xor, np.logical_xor), + ] for size1 in input_sizes: size2 = (2,) + size1 # perform broadcasting for with_extremal in [False, True]: - a = _generate_input(size1, dtypes[0], device, with_extremal) - b = _generate_input(size2, dtypes[1], device, with_extremal) + a = self._generate_input(size1, dtypes[0], device, with_extremal) + b = self._generate_input(size2, dtypes[1], device, with_extremal) for torch_op, numpy_op in op_pairs: - if (dtypes[0].is_complex or dtypes[1].is_complex) and torch_op in complex_op_denylist: + if ( + dtypes[0].is_complex or dtypes[1].is_complex + ) and torch_op in complex_op_denylist: continue # functional version of op compare_with_numpy_bin_op(torch_op, numpy_op, a, b) @@ -2669,7 +3233,9 @@ def compare_with_numpy_bin_op(torch_fn, np_fn, x, y, out=None): self.assertEqual(torch_op(a, b).dtype, torch.bool) # out version of op - out = torch.zeros(1, dtype=torch.complex128) # all casts to complex128 are safe + out = torch.zeros( + 1, dtype=torch.complex128 + ) # all casts to complex128 are safe compare_with_numpy_bin_op(torch_op, numpy_op, a, b, out=out) @onlyNativeDeviceTypes @@ -2677,145 +3243,47 @@ def compare_with_numpy_bin_op(torch_fn, np_fn, x, y, out=None): def test_signed_shift(self, device, dtype): "Ensure that signed integer bit shifting works as expected." 
a = torch.tensor([-10, 10], device=device, dtype=dtype) # [11...1110110, 1010] - expected_l = torch.tensor([-40, 40], device=device, dtype=dtype) # [11...11011000, 101000] + expected_l = torch.tensor( + [-40, 40], device=device, dtype=dtype + ) # [11...11011000, 101000] self.assertEqual(a << 2, expected_l) self.compare_with_numpy(lambda x: x << 2, lambda x: np.left_shift(x, 2), a) - expected_r = torch.tensor([-5, 5], device=device, dtype=dtype) # [1111...111011, 101] + expected_r = torch.tensor( + [-5, 5], device=device, dtype=dtype + ) # [1111...111011, 101] self.assertEqual(a >> 1, expected_r) self.compare_with_numpy(lambda x: x >> 1, lambda x: np.right_shift(x, 1), a) - @dtypes(torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64) - def test_bitwise_and(self, device, dtype): - a = torch.tensor([1, -2, 3], dtype=dtype, device=device) - b = torch.tensor([2, 1, 3], dtype=dtype, device=device) - - a_np = a.cpu().numpy() - b_np = b.cpu().numpy() - - # Tensor x Tensor - self.assertEqual(torch.bitwise_and(a, b), torch.tensor(np.bitwise_and(a_np, b_np), device=device)) - # Tensor x int scaler - self.assertEqual(torch.bitwise_and(a, 2), torch.tensor(np.bitwise_and(a_np, 2), device=device)) - - self.assertEqual(torch.tensor([False, True, False], device=device), - torch.bitwise_and(torch.tensor([True, True, False], device=device), - torch.tensor([False, True, False], device=device))) - - # type promotion - c = torch.zeros(2) >= 1 - self.assertEqual(torch.bitwise_and(c, c.byte()), torch.bitwise_and(c.byte(), c)) - - def test_bitwise_or(self, device): - for dtype in (torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64): - a = torch.tensor([1, -2, 3], dtype=dtype, device=device) - b = torch.tensor([2, 1, 3], dtype=dtype, device=device) - expected_res = torch.tensor([3, -1, 3], dtype=dtype, device=device) - b_scalar = 2 - expected_res_scalar = torch.tensor([3, -2, 3], dtype=dtype, device=device) - - # standard version - self.assertEqual(torch.bitwise_or(a, b), expected_res) - self.assertEqual(torch.bitwise_or(a, b_scalar), expected_res_scalar) - - # out - c = torch.empty(0, dtype=dtype, device=device) - torch.bitwise_or(a, b, out=c) - self.assertEqual(c, expected_res) - torch.bitwise_or(a, b_scalar, out=c) - self.assertEqual(c, expected_res_scalar) - - # in-place - a1 = a.clone() - a1.bitwise_or_(b) - self.assertEqual(a1, expected_res) - a.bitwise_or_(b_scalar) - self.assertEqual(a, expected_res_scalar) - - self.assertEqual(torch.tensor([True, True, False], device=device), - torch.bitwise_or(torch.tensor([True, True, False], device=device), - torch.tensor([False, True, False], device=device))) - - def test_bitwise_xor(self, device): - for dtype in (torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64): - a = torch.tensor([1, -2, 3], dtype=dtype, device=device) - b = torch.tensor([2, 1, 3], dtype=dtype, device=device) - expected_res = torch.tensor([3, -1, 0], dtype=dtype, device=device) - b_scalar = 2 - expected_res_scalar = torch.tensor([3, -4, 1], dtype=dtype, device=device) - - # standard version - self.assertEqual(torch.bitwise_xor(a, b), expected_res) - self.assertEqual(torch.bitwise_xor(a, b_scalar), expected_res_scalar) - - # out - c = torch.empty(0, dtype=dtype, device=device) - torch.bitwise_xor(a, b, out=c) - self.assertEqual(c, expected_res) - torch.bitwise_xor(a, b_scalar, out=c) - self.assertEqual(c, expected_res_scalar) - - # in-place - a1 = a.clone() - a1.bitwise_xor_(b) - self.assertEqual(a1, expected_res) - a.bitwise_xor_(b_scalar) - self.assertEqual(a, 
expected_res_scalar) - - self.assertEqual(torch.tensor([True, False, False], device=device), - torch.bitwise_xor(torch.tensor([True, True, False], device=device), - torch.tensor([False, True, False], device=device))) - - @dtypes(torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64) - def test_bitwise_shift(self, device, dtype): - ops = [ - (torch.bitwise_left_shift, np.left_shift), - (operator.lshift, operator.lshift), - (torch.bitwise_right_shift, np.right_shift), - (operator.rshift, operator.rshift), - ] - for torch_op, numpy_op in ops: - a = torch.tensor([19, -20, -21, 22], dtype=dtype, device=device) - b = torch.tensor([2, 1, 3, 1], dtype=dtype, device=device) - a_np = a.cpu().numpy() - b_np = b.cpu().numpy() - - # Tensor x Tensor - self.assertEqual(torch_op(a, b), torch.tensor(numpy_op(a_np, b_np), device=device)) - # Tensor x int scalar - self.assertEqual(torch_op(a, 2), torch.tensor(numpy_op(a_np, 2), device=device)) - - def test_bitwise_shift_float(self, device): - ops = [ - (torch.bitwise_left_shift, lambda x, y: x * 2. ** y), - (operator.lshift, lambda x, y: x * 2. ** y), - (torch.bitwise_right_shift, lambda x, y: x / 2. ** y), - (operator.rshift, lambda x, y: x / 2. ** y), - ] - for torch_op, expected_op in ops: - # int tensor x float - a = torch.tensor([19, -20, -21, 22], dtype=torch.int64, device=device) - self.assertEqual(torch_op(a, 1.8), torch.floor(expected_op(a, 1)).to(a.dtype)) - # float tensor x int scalar - a = torch.tensor([19.1, -20.2, -21.3, 22.4], dtype=torch.float32, device=device) - self.assertEqual(torch_op(a, 2), expected_op(a, 2)) - # float tensor x float scalar - a = torch.tensor([19.1, -20.2, -21.3, 22.4], dtype=torch.float32, device=device) - self.assertEqual(torch_op(a, 2.2), expected_op(a, 2.2)) - @onlyNativeDeviceTypes - @dtypes(*list(product(get_all_dtypes(include_complex=False), - get_all_dtypes(include_complex=False)))) + @dtypes( + *list( + product( + all_types_and(torch.half, torch.bfloat16, torch.bool), + all_types_and(torch.half, torch.bfloat16, torch.bool), + ) + ) + ) def test_heaviside(self, device, dtypes): input_dtype = dtypes[0] values_dtype = dtypes[1] rng = np.random.default_rng() - input = np.array(rng.integers(-10, 10, size=10), - dtype=torch_to_numpy_dtype_dict[input_dtype if (input_dtype != torch.bfloat16) else torch.float64]) + input = np.array( + rng.integers(-10, 10, size=10), + dtype=torch_to_numpy_dtype_dict[ + input_dtype if (input_dtype != torch.bfloat16) else torch.float64 + ], + ) input[0] = input[3] = input[7] = 0 - values = np.array(rng.integers(-10, 10, size=10), - dtype=torch_to_numpy_dtype_dict[values_dtype if (values_dtype != torch.bfloat16) else torch.float64]) - np_result = torch.from_numpy(np.heaviside(input, values)).to(device=device, dtype=input_dtype) + values = np.array( + rng.integers(-10, 10, size=10), + dtype=torch_to_numpy_dtype_dict[ + values_dtype if (values_dtype != torch.bfloat16) else torch.float64 + ], + ) + np_result = torch.from_numpy(np.heaviside(input, values)).to( + device=device, dtype=input_dtype + ) input = torch.from_numpy(input).to(device=device, dtype=input_dtype) values = torch.from_numpy(values).to(device=device, dtype=values_dtype) @@ -2834,13 +3302,25 @@ def test_heaviside(self, device, dtypes): input.heaviside_(values) self.assertEqual(np_result, input) else: - with self.assertRaisesRegex(RuntimeError, 'heaviside is not yet implemented for tensors with different dtypes.'): + with self.assertRaisesRegex( + RuntimeError, + "heaviside is not yet implemented for tensors with 
different dtypes.", + ): torch.heaviside(input, values) - with self.assertRaisesRegex(RuntimeError, 'heaviside is not yet implemented for tensors with different dtypes.'): + with self.assertRaisesRegex( + RuntimeError, + "heaviside is not yet implemented for tensors with different dtypes.", + ): input.heaviside(values) - with self.assertRaisesRegex(RuntimeError, 'heaviside is not yet implemented for tensors with different dtypes.'): + with self.assertRaisesRegex( + RuntimeError, + "heaviside is not yet implemented for tensors with different dtypes.", + ): torch.heaviside(input, values, out=out) - with self.assertRaisesRegex(RuntimeError, 'heaviside is not yet implemented for tensors with different dtypes.'): + with self.assertRaisesRegex( + RuntimeError, + "heaviside is not yet implemented for tensors with different dtypes.", + ): input.heaviside_(values) @onlyCUDA @@ -2857,14 +3337,17 @@ def test_heaviside_cross_device(self, device): x = torch.tensor([-9, 5, 0, 6, -2, 2]) y = torch.tensor(0, device=device) - with self.assertRaisesRegex(RuntimeError, 'Expected all tensors to be on the same device'): + with self.assertRaisesRegex( + RuntimeError, "Expected all tensors to be on the same device" + ): torch.heaviside(x, y) - with self.assertRaisesRegex(RuntimeError, 'Expected all tensors to be on the same device'): + with self.assertRaisesRegex( + RuntimeError, "Expected all tensors to be on the same device" + ): torch.heaviside(y, x) - @dtypes(*list(product(get_all_complex_dtypes(), - get_all_complex_dtypes()))) + @dtypes(*list(product(complex_types(), complex_types()))) def test_heaviside_complex(self, device, dtypes): input_dtype = dtypes[0] values_dtype = dtypes[1] @@ -2875,13 +3358,21 @@ def test_heaviside_complex(self, device, dtypes): out = torch.empty_like(input) real = input.real - with self.assertRaisesRegex(RuntimeError, 'heaviside is not yet implemented for complex tensors.'): + with self.assertRaisesRegex( + RuntimeError, "heaviside is not yet implemented for complex tensors." + ): torch.heaviside(input, real) - with self.assertRaisesRegex(RuntimeError, 'heaviside is not yet implemented for complex tensors.'): + with self.assertRaisesRegex( + RuntimeError, "heaviside is not yet implemented for complex tensors." + ): real.heaviside(values) - with self.assertRaisesRegex(RuntimeError, 'heaviside is not yet implemented for complex tensors.'): + with self.assertRaisesRegex( + RuntimeError, "heaviside is not yet implemented for complex tensors." + ): input.heaviside_(values) - with self.assertRaisesRegex(RuntimeError, 'heaviside is not yet implemented for complex tensors.'): + with self.assertRaisesRegex( + RuntimeError, "heaviside is not yet implemented for complex tensors." 
+ ): torch.heaviside(real, real, out=out) def _test_logical(self, device, dtypes, op, a_, b_, expected_res_): @@ -2896,20 +3387,41 @@ def _test_logical(self, device, dtypes, op, a_, b_, expected_res_): getattr(torch, op)(a, b, out=c) self.assertEqual(expected_res.bool(), c) - getattr(a, op + '_')(b) + getattr(a, op + "_")(b) self.assertEqual(expected_res, a) - @dtypes(*product(get_all_dtypes(), get_all_dtypes())) + @dtypes( + *product( + all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool), + all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool), + ) + ) def test_logical_xor(self, device, dtypes): - self._test_logical(device, dtypes, 'logical_xor', [10, 0, 1, 0], [1, 0, 0, 10], [0, 0, 1, 1]) + self._test_logical( + device, dtypes, "logical_xor", [10, 0, 1, 0], [1, 0, 0, 10], [0, 0, 1, 1] + ) - @dtypes(*product(get_all_dtypes(), get_all_dtypes())) + @dtypes( + *product( + all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool), + all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool), + ) + ) def test_logical_and(self, device, dtypes): - self._test_logical(device, dtypes, 'logical_and', [10, 0, 1, 0], [1, 0, 0, 10], [1, 0, 0, 0]) + self._test_logical( + device, dtypes, "logical_and", [10, 0, 1, 0], [1, 0, 0, 10], [1, 0, 0, 0] + ) - @dtypes(*product(get_all_dtypes(), get_all_dtypes())) + @dtypes( + *product( + all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool), + all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool), + ) + ) def test_logical_or(self, device, dtypes): - self._test_logical(device, dtypes, 'logical_or', [10, 0, 1, 0], [1, 0, 0, 10], [1, 0, 1, 1]) + self._test_logical( + device, dtypes, "logical_or", [10, 0, 1, 0], [1, 0, 0, 10], [1, 0, 1, 1] + ) def test_remainder_overflow(self, device): # Check Integer Overflows @@ -2945,7 +3457,9 @@ def test_ldexp(self, device): self.assertEqual(np_outcome, mantissas) # test bounds - mantissas = torch.tensor([float('inf'), float('-inf'), float('inf'), float('nan')], device=device) + mantissas = torch.tensor( + [float("inf"), float("-inf"), float("inf"), float("nan")], device=device + ) exponents = torch.randint(0, 31, (4,), device=device, dtype=torch.int32) np_outcome = np.ldexp(mantissas.numpy(), exponents.numpy()) pt_outcome = torch.ldexp(mantissas, exponents) @@ -2954,12 +3468,17 @@ def test_ldexp(self, device): @dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble) def test_lerp(self, device, dtype): start_end_weight_shapes = [(), (5,), (5, 5)] - for shapes in product(start_end_weight_shapes, start_end_weight_shapes, start_end_weight_shapes): + for shapes in product( + start_end_weight_shapes, start_end_weight_shapes, start_end_weight_shapes + ): start = torch.randn(shapes[0], device=device, dtype=dtype) end = torch.randn(shapes[1], device=device, dtype=dtype) # Tensor weights - weights = [torch.randn(shapes[2], device=device, dtype=dtype), random.random()] + weights = [ + torch.randn(shapes[2], device=device, dtype=dtype), + random.random(), + ] if dtype.is_complex: weights += [complex(0, 1), complex(0.4, 1.2)] @@ -2967,12 +3486,29 @@ def test_lerp(self, device, dtype): actual = torch.lerp(start, end, weight) actual_method = start.lerp(end, weight) self.assertEqual(actual, actual_method) - actual_out = torch.tensor(1., dtype=dtype, device=device) + actual_out = torch.tensor(1.0, dtype=dtype, device=device) torch.lerp(start, end, weight, out=actual_out) self.assertEqual(actual, actual_out) expected = start + weight * (end - start) self.assertEqual(expected, 
actual) + @onlyCUDA + @dtypes(torch.half, torch.bfloat16) + def test_lerp_lowp(self, device, dtype): + ref_dtype = torch.float + xvals = (0.0, -30000.0) + yvals = (0.1, -20000.0) + xs = [torch.full((4,), xval, device=device, dtype=dtype) for xval in xvals] + ys = [torch.full((4,), yval, device=device, dtype=dtype) for yval in yvals] + weights = [70000, torch.full((4,), 8, device=device, dtype=dtype)] + for x, y, w in zip(xs, ys, weights): + xref = x.float() + yref = y.float() + wref = w.float() if isinstance(w, torch.Tensor) else w + actual = torch.lerp(x, y, w) + expected = torch.lerp(xref, yref, wref).to(dtype) + self.assertEqual(actual, expected, atol=0.0, rtol=0.0) + def _test_logaddexp(self, device, dtype, base2): if base2: ref_func = np.logaddexp2 @@ -3003,8 +3539,16 @@ def _test_helper(a, b): _test_helper(a, b) _test_helper(a[:3], b[:3]) - a = torch.tensor([float('inf'), float('-inf'), float('inf'), float("nan")], dtype=dtype, device=device) - b = torch.tensor([float('inf'), float('-inf'), float('-inf'), float("nan")], dtype=dtype, device=device) + a = torch.tensor( + [float("inf"), float("-inf"), float("inf"), float("nan")], + dtype=dtype, + device=device, + ) + b = torch.tensor( + [float("inf"), float("-inf"), float("-inf"), float("nan")], + dtype=dtype, + device=device, + ) _test_helper(a, b) @dtypes(torch.float32, torch.float64, torch.bfloat16) @@ -3016,7 +3560,7 @@ def test_logaddexp2(self, device, dtype): self._test_logaddexp(device, dtype, base2=True) def test_add(self, device): - dtypes = [torch.float, torch.double] + get_all_complex_dtypes() + dtypes = floating_and_complex_types() for dtype in dtypes: # [res] torch.add([res,] tensor1, tensor2) m1 = torch.randn(100, 100, dtype=dtype, device=device) @@ -3082,9 +3626,15 @@ def test_add(self, device): self.assertEqual(torch.add(one, 1).dtype, torch.uint8) # bool - m1 = torch.tensor([True, False, False, True, False, False], dtype=torch.bool, device=device) - m2 = torch.tensor([True, True, False, False, False, True], dtype=torch.bool, device=device) - expected = torch.tensor([True, True, False, True, False, True], dtype=torch.bool, device=device) + m1 = torch.tensor( + [True, False, False, True, False, False], dtype=torch.bool, device=device + ) + m2 = torch.tensor( + [True, True, False, False, False, True], dtype=torch.bool, device=device + ) + expected = torch.tensor( + [True, True, False, True, False, True], dtype=torch.bool, device=device + ) self.assertEqual(m1 + m2, expected) # fused multiply add @@ -3094,56 +3644,70 @@ def test_add(self, device): self.assertEqual(res, expected) # bfloat16 - m1 = torch.tensor([1., 2.], dtype=torch.bfloat16) - m2 = torch.tensor([3., 4.], dtype=torch.bfloat16) - self.assertEqual(m1 + m2, torch.tensor([4., 6.], dtype=torch.bfloat16)) + m1 = torch.tensor([1.0, 2.0], dtype=torch.bfloat16) + m2 = torch.tensor([3.0, 4.0], dtype=torch.bfloat16) + self.assertEqual(m1 + m2, torch.tensor([4.0, 6.0], dtype=torch.bfloat16)) # different alpha types m1 = torch.tensor([2 + 3j, 4 + 5j], dtype=torch.complex64, device=device) m2 = torch.tensor([4 + 5j, 2 + 3j], dtype=torch.complex64, device=device) # add complex numbers with float alpha res = torch.add(m1, m2, alpha=0.1) - expected = torch.tensor([2.4000 + 3.5000j, 4.2000 + 5.3000j], dtype=torch.complex64, device=device) + expected = torch.tensor( + [2.4000 + 3.5000j, 4.2000 + 5.3000j], dtype=torch.complex64, device=device + ) self.assertEqual(res, expected) # add complex numbers with complex alpha res = torch.add(m1, m2, alpha=complex(0.1, 0.2)) - expected = 
torch.tensor([1.4000 + 4.3000j, 3.6000 + 5.7000j], dtype=torch.complex64, device=device) + expected = torch.tensor( + [1.4000 + 4.3000j, 3.6000 + 5.7000j], dtype=torch.complex64, device=device + ) self.assertEqual(res, expected) # add complex numbers with integer alpha res = torch.add(m1, m2, alpha=2) - expected = torch.tensor([10. + 13.j, 8. + 11.j], dtype=torch.complex64, device=device) + expected = torch.tensor( + [10.0 + 13.0j, 8.0 + 11.0j], dtype=torch.complex64, device=device + ) self.assertEqual(res, expected) # mismatched alpha m1 = torch.tensor([1], dtype=torch.int8, device=device) m2 = torch.tensor([2], dtype=torch.int8, device=device) - self.assertRaisesRegex(RuntimeError, - r"Boolean alpha only supported for Boolean results\.", - lambda: torch.add(m1, m2, alpha=True)) - self.assertRaisesRegex(RuntimeError, - r"For integral input tensors, argument alpha must not be a floating point number\.", - lambda: torch.add(m1, m2, alpha=1.0)) + self.assertRaisesRegex( + RuntimeError, + r"Boolean alpha only supported for Boolean results\.", + lambda: torch.add(m1, m2, alpha=True), + ) + self.assertRaisesRegex( + RuntimeError, + r"For integral input tensors, argument alpha must not be a floating point number\.", + lambda: torch.add(m1, m2, alpha=1.0), + ) # mismatched alpha, float / double tensor and complex alpha msg = r"For non-complex input tensors, argument alpha must not be a complex number\." - m1 = torch.tensor([3., 4.], device=device) - m2 = torch.tensor([4., 3.], device=device) - self.assertRaisesRegex(RuntimeError, msg, - lambda: torch.add(m1, m2, alpha=complex(0.1, 0.2))) + m1 = torch.tensor([3.0, 4.0], device=device) + m2 = torch.tensor([4.0, 3.0], device=device) + self.assertRaisesRegex( + RuntimeError, msg, lambda: torch.add(m1, m2, alpha=complex(0.1, 0.2)) + ) - m1 = torch.tensor([3., 4.], dtype=torch.double, device=device) - m2 = torch.tensor([4., 3.], dtype=torch.double, device=device) - self.assertRaisesRegex(RuntimeError, msg, - lambda: torch.add(m1, m2, alpha=complex(0.1, 0.2))) + m1 = torch.tensor([3.0, 4.0], dtype=torch.double, device=device) + m2 = torch.tensor([4.0, 3.0], dtype=torch.double, device=device) + self.assertRaisesRegex( + RuntimeError, msg, lambda: torch.add(m1, m2, alpha=complex(0.1, 0.2)) + ) # complex m1 = torch.tensor((4.0000 + 4.0000j), dtype=torch.complex64) - m2 = torch.tensor(4., dtype=torch.float64) - self.assertRaisesRegex(RuntimeError, r"result type ComplexFloat can't be cast to the desired output type Double", - lambda: torch.add(m1, m1, out=m2)) - + m2 = torch.tensor(4.0, dtype=torch.float64) + self.assertRaisesRegex( + RuntimeError, + r"result type ComplexFloat can't be cast to the desired output type Double", + lambda: torch.add(m1, m1, out=m2), + ) @onlyCUDA def test_addsub_half_tensor(self, device): @@ -3158,30 +3722,44 @@ def test_addsub_half_tensor(self, device): self.assertTrue(not (actual.isnan() or actual.isinf())) def test_sub_typing(self, device): - m1 = torch.tensor([True, False, False, True, False, False], dtype=torch.bool, device=device) - m2 = torch.tensor([True, True, False, False, False, True], dtype=torch.bool, device=device) - self.assertRaisesRegex(RuntimeError, - r"Subtraction, the `\-` operator, with two bool tensors is not supported. " - r"Use the `\^` or `logical_xor\(\)` operator instead.", - lambda: m1 - m2) - self.assertRaisesRegex(RuntimeError, - r"Subtraction, the `\-` operator, with a bool tensor is not supported. 
" - r"If you are trying to invert a mask, use the `\~` or `logical_not\(\)` operator instead.", - lambda: 1 - m1) - self.assertRaisesRegex(RuntimeError, - r"Subtraction, the `\-` operator, with a bool tensor is not supported. " - r"If you are trying to invert a mask, use the `\~` or `logical_not\(\)` operator instead.", - lambda: m2 - 1) + m1 = torch.tensor( + [True, False, False, True, False, False], dtype=torch.bool, device=device + ) + m2 = torch.tensor( + [True, True, False, False, False, True], dtype=torch.bool, device=device + ) + self.assertRaisesRegex( + RuntimeError, + r"Subtraction, the `\-` operator, with two bool tensors is not supported. " + r"Use the `\^` or `logical_xor\(\)` operator instead.", + lambda: m1 - m2, + ) + self.assertRaisesRegex( + RuntimeError, + r"Subtraction, the `\-` operator, with a bool tensor is not supported. " + r"If you are trying to invert a mask, use the `\~` or `logical_not\(\)` operator instead.", + lambda: 1 - m1, + ) + self.assertRaisesRegex( + RuntimeError, + r"Subtraction, the `\-` operator, with a bool tensor is not supported. " + r"If you are trying to invert a mask, use the `\~` or `logical_not\(\)` operator instead.", + lambda: m2 - 1, + ) # mismatched alpha m1 = torch.tensor([1], dtype=torch.int8, device=device) m2 = torch.tensor([2], dtype=torch.int8, device=device) - self.assertRaisesRegex(RuntimeError, - r"Boolean alpha only supported for Boolean results\.", - lambda: torch.sub(m1, m2, alpha=True)) - self.assertRaisesRegex(RuntimeError, - r"For integral input tensors, argument alpha must not be a floating point number\.", - lambda: torch.sub(m1, m2, alpha=1.0)) + self.assertRaisesRegex( + RuntimeError, + r"Boolean alpha only supported for Boolean results\.", + lambda: torch.sub(m1, m2, alpha=True), + ) + self.assertRaisesRegex( + RuntimeError, + r"For integral input tensors, argument alpha must not be a floating point number\.", + lambda: torch.sub(m1, m2, alpha=1.0), + ) def test_mul(self, device): m1 = torch.randn(10, 10, device=device) @@ -3194,31 +3772,61 @@ def test_mul(self, device): a1 = torch.tensor([True, False, False, True], dtype=torch.bool, device=device) a2 = torch.tensor([True, False, True, False], dtype=torch.bool, device=device) - self.assertEqual(a1 * a2, torch.tensor([True, False, False, False], dtype=torch.bool, device=device)) + self.assertEqual( + a1 * a2, + torch.tensor([True, False, False, False], dtype=torch.bool, device=device), + ) - if device == 'cpu': + if device == "cpu": a1 = torch.tensor([0.1, 0.1], dtype=torch.bfloat16, device=device) a2 = torch.tensor([1.1, 0.1], dtype=torch.bfloat16, device=device) - self.assertEqual(a1 * a2, torch.tensor([0.11, 0.01], dtype=torch.bfloat16, device=device), atol=0.01, rtol=0) + self.assertEqual( + a1 * a2, + torch.tensor([0.11, 0.01], dtype=torch.bfloat16, device=device), + atol=0.01, + rtol=0, + ) self.assertEqual(a1.mul(a2), a1 * a2) def test_bool_tensor_comparison_ops(self, device): - a = torch.tensor([True, False, True, False, True, False], dtype=torch.bool, device=device) - b = torch.tensor([True, False, True, True, True, True], dtype=torch.bool, device=device) - self.assertEqual(a == b, torch.tensor([1, 1, 1, 0, 1, 0], dtype=torch.bool, device=device)) - self.assertEqual(a != b, torch.tensor([0, 0, 0, 1, 0, 1], dtype=torch.bool, device=device)) - self.assertEqual(a < b, torch.tensor([0, 0, 0, 1, 0, 1], dtype=torch.bool, device=device)) - self.assertEqual(a > b, torch.tensor([0, 0, 0, 0, 0, 0], dtype=torch.bool, device=device)) - self.assertEqual(a >= b, 
torch.tensor([1, 1, 1, 0, 1, 0], dtype=torch.bool, device=device)) - self.assertEqual(a <= b, torch.tensor([1, 1, 1, 1, 1, 1], dtype=torch.bool, device=device)) - self.assertEqual(a > False, torch.tensor([1, 0, 1, 0, 1, 0], dtype=torch.bool, device=device)) - self.assertEqual(a == torch.tensor(True, dtype=torch.bool, device=device), - torch.tensor([1, 0, 1, 0, 1, 0], dtype=torch.bool, device=device)) - self.assertEqual(a == torch.tensor(0, dtype=torch.bool, device=device), - torch.tensor([0, 1, 0, 1, 0, 1], dtype=torch.bool, device=device)) + a = torch.tensor( + [True, False, True, False, True, False], dtype=torch.bool, device=device + ) + b = torch.tensor( + [True, False, True, True, True, True], dtype=torch.bool, device=device + ) + self.assertEqual( + a == b, torch.tensor([1, 1, 1, 0, 1, 0], dtype=torch.bool, device=device) + ) + self.assertEqual( + a != b, torch.tensor([0, 0, 0, 1, 0, 1], dtype=torch.bool, device=device) + ) + self.assertEqual( + a < b, torch.tensor([0, 0, 0, 1, 0, 1], dtype=torch.bool, device=device) + ) + self.assertEqual( + a > b, torch.tensor([0, 0, 0, 0, 0, 0], dtype=torch.bool, device=device) + ) + self.assertEqual( + a >= b, torch.tensor([1, 1, 1, 0, 1, 0], dtype=torch.bool, device=device) + ) + self.assertEqual( + a <= b, torch.tensor([1, 1, 1, 1, 1, 1], dtype=torch.bool, device=device) + ) + self.assertEqual( + a > False, torch.tensor([1, 0, 1, 0, 1, 0], dtype=torch.bool, device=device) + ) + self.assertEqual( + a == torch.tensor(True, dtype=torch.bool, device=device), + torch.tensor([1, 0, 1, 0, 1, 0], dtype=torch.bool, device=device), + ) + self.assertEqual( + a == torch.tensor(0, dtype=torch.bool, device=device), + torch.tensor([0, 1, 0, 1, 0, 1], dtype=torch.bool, device=device), + ) self.assertFalse(a.equal(b)) - @dtypes(*get_all_dtypes(include_complex=False)) + @dtypes(*all_types_and(torch.half, torch.bfloat16, torch.bool)) def test_logical(self, device, dtype): if dtype != torch.bool: x = torch.tensor([1, 2, 3, 4], device=device, dtype=dtype) @@ -3252,10 +3860,20 @@ def _test_atan2_with_size(size, device): actual = a.atan2(b) x = a.view(-1) y = b.view(-1) - expected = torch.tensor([math.atan2(x[i].item(), y[i].item()) for i in range(x.numel())], - device=device, dtype=torch.double) + expected = torch.tensor( + [math.atan2(x[i].item(), y[i].item()) for i in range(x.numel())], + device=device, + dtype=torch.double, + ) self.assertEqual(expected, actual.view(-1), rtol=0, atol=0.02) + # bfloat16 + a_bf16 = a.bfloat16() + b_bf16 = b.bfloat16() + actual_bf16 = a_bf16.atan2(b_bf16) + self.assertEqual(actual_bf16, actual.bfloat16()) + self.assertEqual(expected, actual_bf16.view(-1), exact_dtype=False, rtol=0, atol=0.02) + _test_atan2_with_size((2, 2), device) _test_atan2_with_size((3, 3), device) _test_atan2_with_size((5, 5), device) @@ -3274,10 +3892,10 @@ def _test_atan2(x, y, expected, device, dtype): _test_atan2(0, -1, math.pi / -2, device, dtype) _test_atan2(-1, 0, math.pi, device, dtype) _test_atan2(1, 0, 0, device, dtype) - _test_atan2(-1, -1, math.pi * -3 / 4 , device, dtype) - _test_atan2(1, 1, math.pi / 4 , device, dtype) - _test_atan2(1, -1, math.pi / -4 , device, dtype) - _test_atan2(-1, 1, math.pi * 3 / 4 , device, dtype) + _test_atan2(-1, -1, math.pi * -3 / 4, device, dtype) + _test_atan2(1, 1, math.pi / 4, device, dtype) + _test_atan2(1, -1, math.pi / -4, device, dtype) + _test_atan2(-1, 1, math.pi * 3 / 4, device, dtype) def test_trapezoid(self, device): def test_dx(sizes, dim, dx, device): @@ -3300,7 +3918,9 @@ def test_x(sizes, dim, x, 
device): test_dx((0, 2), 0, 1.0, device) test_dx((0, 2), 1, 1.0, device) test_x((2, 3, 4), 1, [1.0, 2.0, 3.0], device) - test_x((10, 2), 0, [2.0, 3.0, 4.0, 7.0, 11.0, 14.0, 22.0, 26.0, 26.1, 30.3], device) + test_x( + (10, 2), 0, [2.0, 3.0, 4.0, 7.0, 11.0, 14.0, 22.0, 26.0, 26.1, 30.3], device + ) test_x((1, 10), 0, [1.0], device) test_x((0, 2), 0, [], device) test_x((0, 2), 1, [1.0, 2.0], device) @@ -3309,14 +3929,12 @@ def test_x(sizes, dim, x, device): test_x((2, 3, 4), 1, [1.0, 2.0, 3.0], device) test_x((2, 3, 4), 2, [1.0, 2.0, 3.0, 4.0], device) test_x((2, 2, 4), -1, [[1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0]], device) - with self.assertRaisesRegex( - IndexError, - 'Dimension out of range'): + with self.assertRaisesRegex(IndexError, "Dimension out of range"): test_x((2, 3), 2, [], device) test_dx((2, 3), 2, 1.0, device) with self.assertRaisesRegex( - RuntimeError, - 'There must be one `x` value for each sample point'): + RuntimeError, "There must be one `x` value for each sample point" + ): test_x((2, 3), 1, [1.0, 2.0], device) test_x((2, 3), 1, [1.0, 2.0, 3.0, 4.0], device) @@ -3325,7 +3943,7 @@ def test_cumulative_trapezoid(self, device): import scipy.integrate - if hasattr(scipy.integrate, 'cumulative_trapezoid'): + if hasattr(scipy.integrate, "cumulative_trapezoid"): scipy_cumulative_trapezoid = scipy.integrate.cumulative_trapezoid else: # Older version of SciPy uses a different name scipy_cumulative_trapezoid = scipy.integrate.cumtrapz @@ -3340,14 +3958,20 @@ def test_dx(sizes, dim, dx, device): def test_x(sizes, dim, x, device): t = torch.randn(sizes, device=device) - actual = torch.cumulative_trapezoid(t, x=torch.tensor(x, device=device), dim=dim) + actual = torch.cumulative_trapezoid( + t, x=torch.tensor(x, device=device), dim=dim + ) expected = scipy_cumulative_trapezoid(t.cpu().numpy(), x=x, axis=dim) self.assertEqual(expected.shape, actual.shape) - self.assertEqual(expected, actual.cpu(), exact_dtype=False, atol=1e-4, rtol=1e-4) + self.assertEqual( + expected, actual.cpu(), exact_dtype=False, atol=1e-4, rtol=1e-4 + ) def test_empty_x(sizes, dim, x, device): t = torch.randn(sizes, device=device) - actual = torch.cumulative_trapezoid(t, x=torch.tensor(x, device=device), dim=dim) + actual = torch.cumulative_trapezoid( + t, x=torch.tensor(x, device=device), dim=dim + ) self.assertEqual(torch.empty(actual.shape), actual) test_dx((2,), -1, 1, device) @@ -3364,7 +3988,9 @@ def test_empty_x(sizes, dim, x, device): test_x((2,), -1, [100, 50], device) test_x((4, 2), 0, [2, 3, 4, 5], device) test_x((2, 3, 4), 1, [1.0, 2.0, 3.0], device) - test_x((10, 2), 0, [2.0, 3.0, 4.0, 7.0, 11.0, 14.0, 22.0, 26.0, 26.1, 30.3], device) + test_x( + (10, 2), 0, [2.0, 3.0, 4.0, 7.0, 11.0, 14.0, 22.0, 26.0, 26.1, 30.3], device + ) test_x((1, 10), 0, [1.0], device) test_x((0, 2), 1, [1, 2], device) test_x((2, 3, 4), -1, [1.0, 2.0, 3.0, 4.0], device) @@ -3372,40 +3998,51 @@ def test_empty_x(sizes, dim, x, device): test_x((2, 3, 4), 1, [1.0, 2.0, 3.0], device) test_x((2, 3, 4), 2, [1.0, 2.0, 3.0, 4.0], device) - test_empty_x((0, 2), 0, [], device) # SciPy failing when x == [], but our version returns empty + test_empty_x( + (0, 2), 0, [], device + ) # SciPy failing when x == [], but our version returns empty - with self.assertRaisesRegex( - IndexError, - 'Dimension out of range'): + with self.assertRaisesRegex(IndexError, "Dimension out of range"): test_x((2, 3), 2, [], device) test_dx((2, 3), 2, 1.0, device) with self.assertRaisesRegex( - RuntimeError, - 'There must be one `x` value for each sample 
point'): + RuntimeError, "There must be one `x` value for each sample point" + ): test_x((2, 3), 1, [1.0, 2.0], device) test_x((0, 2), 0, [1.0, 2.0], device) test_x((2, 3), 1, [1.0, 2.0, 3.0, 4.0], device) with self.assertRaisesRegex( - RuntimeError, - 'Currently, we only support dx as a real number'): - test_dx((2, 2), -1, complex(1, 1) , device) + RuntimeError, "Currently, we only support dx as a real number" + ): + test_dx((2, 2), -1, complex(1, 1), device) with self.assertRaisesRegex( - TypeError, 'received an invalid combination of arguments'): - actual = torch.cumulative_trapezoid(torch.randn((3, 3)), x=torch.randn((3, 3)), dx=3) + TypeError, "received an invalid combination of arguments" + ): + actual = torch.cumulative_trapezoid( + torch.randn((3, 3)), x=torch.randn((3, 3)), dx=3 + ) + @skipMeta @dtypes(torch.double) def test_pow_scalar_overloads_mem_overlap(self, device, dtype): sz = 3 doubles = torch.randn(2 * sz, dtype=dtype, device=device) - self.check_internal_mem_overlap( - lambda t: t.pow_(42), 1, dtype, device) + self.check_internal_mem_overlap(lambda t: t.pow_(42), 1, dtype, device) self.unary_check_input_output_mem_overlap( - doubles, sz, lambda input, out: torch.pow(input, 42, out=out)) + doubles, sz, lambda input, out: torch.pow(input, 42, out=out) + ) self.unary_check_input_output_mem_overlap( - doubles, sz, lambda input, out: torch.pow(42, input, out=out)) + doubles, sz, lambda input, out: torch.pow(42, input, out=out) + ) - @dtypes(*list(product(get_all_dtypes(include_bool=False), - get_all_dtypes(include_bool=False)))) + @dtypes( + *list( + product( + all_types_and_complex_and(torch.half, torch.bfloat16), + all_types_and_complex_and(torch.half, torch.bfloat16), + ) + ) + ) def test_float_power(self, device, dtypes): def to_np(value): if isinstance(value, torch.Tensor) and value.dtype == torch.bfloat16: @@ -3414,25 +4051,43 @@ def to_np(value): base_dtype = dtypes[0] exp_dtype = dtypes[1] - out_dtype = torch.complex128 if base_dtype.is_complex or exp_dtype.is_complex else torch.float64 + out_dtype = ( + torch.complex128 + if base_dtype.is_complex or exp_dtype.is_complex + else torch.float64 + ) - base = make_tensor((30,), device, base_dtype, low=1, high=100) + base = make_tensor((30,), dtype=base_dtype, device=device, low=1, high=100) # Complex and real results do not agree between PyTorch and NumPy when computing negative and zero power of 0 # Related: https://github.com/pytorch/pytorch/issues/48000 # base[0] = base[3] = base[7] = 0 - exp = make_tensor((30,), device, exp_dtype, low=-2, high=2) + exp = make_tensor((30,), dtype=exp_dtype, device=device, low=-2, high=2) exp[0] = exp[4] = exp[6] = 0 expected = torch.from_numpy(np.float_power(to_np(base), to_np(exp))) exponents = [-2.8, -2, -1, -0.5, 0.5, 1, 2] - complex_exponents = exponents + [-2.5j, -1.0j, 1.0j, 2.5j, 1.0 + 1.0j, -1.0 - 1.5j, 3.3j] + complex_exponents = exponents + [ + -2.5j, + -1.0j, + 1.0j, + 2.5j, + 1.0 + 1.0j, + -1.0 - 1.5j, + 3.3j, + ] - for op in (torch.float_power, torch.Tensor.float_power, torch.Tensor.float_power_): + for op in ( + torch.float_power, + torch.Tensor.float_power, + torch.Tensor.float_power_, + ): # Case of Tensor x Tensor if op is torch.Tensor.float_power_ and base_dtype != out_dtype: - with self.assertRaisesRegex(RuntimeError, "operation's result requires dtype"): + with self.assertRaisesRegex( + RuntimeError, "operation's result requires dtype" + ): op(base.clone(), exp) else: result = op(base.clone(), exp) @@ -3445,24 +4100,39 @@ def to_np(value): # Case of Tensor x Scalar 
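+                # Note: torch.float_power computes in double precision, so the expected
+                # result dtype here is torch.complex128 when either the base tensor or the
+                # scalar exponent is complex, and torch.float64 otherwise; that is what
+                # out_dtype_scalar_exp captures below.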
for i in complex_exponents if exp_dtype.is_complex else exponents: - out_dtype_scalar_exp = torch.complex128 if base_dtype.is_complex or type(i) == complex else torch.float64 + out_dtype_scalar_exp = ( + torch.complex128 + if base_dtype.is_complex or type(i) == complex + else torch.float64 + ) expected_scalar_exp = torch.from_numpy(np.float_power(to_np(base), i)) - if op is torch.Tensor.float_power_ and base_dtype != out_dtype_scalar_exp: - with self.assertRaisesRegex(RuntimeError, "operation's result requires dtype"): + if ( + op is torch.Tensor.float_power_ + and base_dtype != out_dtype_scalar_exp + ): + with self.assertRaisesRegex( + RuntimeError, "operation's result requires dtype" + ): op(base.clone(), i) else: result = op(base.clone(), i) self.assertEqual(expected_scalar_exp, result) if op is torch.float_power: - out = torch.empty_like(base).to(device=device, dtype=out_dtype_scalar_exp) + out = torch.empty_like(base).to( + device=device, dtype=out_dtype_scalar_exp + ) op(base, i, out=out) self.assertEqual(expected_scalar_exp, out) # Case of Scalar x Tensor for i in complex_exponents if base_dtype.is_complex else exponents: - out_dtype_scalar_base = torch.complex128 if exp_dtype.is_complex or type(i) == complex else torch.float64 + out_dtype_scalar_base = ( + torch.complex128 + if exp_dtype.is_complex or type(i) == complex + else torch.float64 + ) expected_scalar_base = torch.from_numpy(np.float_power(i, to_np(exp))) result = torch.float_power(i, exp) @@ -3481,8 +4151,13 @@ def _promo_helper(x, y): return torch.complex128 return torch.double - test_cases = ((torch.tensor([-2, -1, 0, 1, 2], device=device), -.25), - (torch.tensor([-1.0j, 0j, 1.0j, 1.0 + 1.0j, -1.0 - 1.5j], device=device), 2.)) + test_cases = ( + (torch.tensor([-2, -1, 0, 1, 2], device=device), -0.25), + ( + torch.tensor([-1.0j, 0j, 1.0j, 1.0 + 1.0j, -1.0 - 1.5j], device=device), + 2.0, + ), + ) for base, exp in test_cases: for out_dtype in (torch.long, torch.float, torch.double, torch.cdouble): out = torch.empty(1, device=device, dtype=out_dtype) @@ -3491,18 +4166,25 @@ def _promo_helper(x, y): if out.dtype == required_dtype: torch.float_power(base, exp, out=out) else: - with self.assertRaisesRegex(RuntimeError, "operation's result requires dtype"): + with self.assertRaisesRegex( + RuntimeError, "operation's result requires dtype" + ): torch.float_power(base, exp, out=out) if base.dtype == required_dtype: torch.Tensor.float_power_(base.clone(), exp) else: - with self.assertRaisesRegex(RuntimeError, "operation's result requires dtype"): + with self.assertRaisesRegex( + RuntimeError, "operation's result requires dtype" + ): torch.Tensor.float_power_(base.clone(), exp) @skipIf(not TEST_SCIPY, "Scipy required for the test.") - @dtypes(*product(get_all_dtypes(include_complex=False, include_bfloat16=False), - get_all_dtypes(include_complex=False, include_bfloat16=False))) + @dtypes( + *product( + all_types_and(torch.half, torch.bool), all_types_and(torch.half, torch.bool) + ) + ) def test_xlogy_xlog1py(self, device, dtypes): x_dtype, y_dtype = dtypes @@ -3513,9 +4195,10 @@ def out_variant_helper(torch_fn, x, y): self.assertEqual(expected, out) def xlogy_inplace_variant_helper(x, y): - if x.dtype in get_all_int_dtypes() + [torch.bool]: - with self.assertRaisesRegex(RuntimeError, - "can't be cast to the desired output type"): + if x.dtype in integral_types_and(torch.bool): + with self.assertRaisesRegex( + RuntimeError, "can't be cast to the desired output type" + ): x.clone().xlogy_(y) else: expected = torch.empty_like(x) @@ 
-3527,9 +4210,15 @@ def test_helper(torch_fn, reference_fn, inputs, scalar=None): x, y, z = inputs torch_fn_partial = partial(torch_fn, x) reference_fn_partial = partial(reference_fn, x.cpu().numpy()) - self.compare_with_numpy(torch_fn_partial, reference_fn_partial, x, exact_dtype=False) - self.compare_with_numpy(torch_fn_partial, reference_fn_partial, y, exact_dtype=False) - self.compare_with_numpy(torch_fn_partial, reference_fn_partial, z, exact_dtype=False) + self.compare_with_numpy( + torch_fn_partial, reference_fn_partial, x, exact_dtype=False + ) + self.compare_with_numpy( + torch_fn_partial, reference_fn_partial, y, exact_dtype=False + ) + self.compare_with_numpy( + torch_fn_partial, reference_fn_partial, z, exact_dtype=False + ) val = scalar if scalar is not None else x out_variant_helper(torch_fn, val, x) @@ -3537,13 +4226,17 @@ def test_helper(torch_fn, reference_fn, inputs, scalar=None): out_variant_helper(torch_fn, val, z) # Tensor-Tensor Test (tensor of same and different shape) - x = make_tensor((3, 2, 4, 5), device, x_dtype, low=0.5, high=1000) - y = make_tensor((3, 2, 4, 5), device, y_dtype, low=0.5, high=1000) - z = make_tensor((4, 5), device, y_dtype, low=0.5, high=1000) + x = make_tensor((3, 2, 4, 5), dtype=x_dtype, device=device, low=0.5, high=1000) + y = make_tensor((3, 2, 4, 5), dtype=y_dtype, device=device, low=0.5, high=1000) + z = make_tensor((4, 5), dtype=y_dtype, device=device, low=0.5, high=1000) - x_1p = make_tensor((3, 2, 4, 5), device, x_dtype, low=-0.5, high=1000) - y_1p = make_tensor((3, 2, 4, 5), device, y_dtype, low=-0.5, high=1000) - z_1p = make_tensor((4, 5), device, y_dtype, low=-0.5, high=1000) + x_1p = make_tensor( + (3, 2, 4, 5), dtype=x_dtype, device=device, low=-0.5, high=1000 + ) + y_1p = make_tensor( + (3, 2, 4, 5), dtype=y_dtype, device=device, low=-0.5, high=1000 + ) + z_1p = make_tensor((4, 5), dtype=y_dtype, device=device, low=-0.5, high=1000) xlogy_fns = torch.xlogy, scipy.special.xlogy xlog1py_fns = torch.special.xlog1py, scipy.special.xlog1py @@ -3559,7 +4252,10 @@ def test_helper(torch_fn, reference_fn, inputs, scalar=None): test_helper(*xlog1py_fns, (x_1p, y_1p, z_1p), 3.14) # Special Values Tensor-Tensor - t = torch.tensor([-1., 0., 1., 2., float('inf'), -float('inf'), float('nan')], device=device) + t = torch.tensor( + [-1.0, 0.0, 1.0, 2.0, float("inf"), -float("inf"), float("nan")], + device=device, + ) zeros = torch.zeros(7, dtype=y_dtype, device=device) def test_zeros_special_helper(torch_fn, reference_fn, scalar=False): @@ -3567,7 +4263,9 @@ def test_zeros_special_helper(torch_fn, reference_fn, scalar=False): zeros_np = 0 if scalar else zeros.cpu().numpy() torch_fn_partial = partial(torch_fn, zeros_t) reference_fn_partial = partial(reference_fn, zeros_np) - self.compare_with_numpy(torch_fn_partial, reference_fn_partial, t, exact_dtype=False) + self.compare_with_numpy( + torch_fn_partial, reference_fn_partial, t, exact_dtype=False + ) out_variant_helper(torch_fn, zeros_t, t) test_zeros_special_helper(*xlogy_fns) @@ -3584,14 +4282,14 @@ def test_xlogy_xlog1py_scalar_type_promotion(self, device): t = torch.randn((), dtype=torch.float32, device=device) self.assertEqual(t.dtype, torch.xlogy(t, 5).dtype) - self.assertEqual(t.dtype, torch.xlogy(t, 5.).dtype) + self.assertEqual(t.dtype, torch.xlogy(t, 5.0).dtype) self.assertEqual(t.dtype, torch.special.xlog1py(t, 5).dtype) - self.assertEqual(t.dtype, torch.special.xlog1py(t, 5.).dtype) + self.assertEqual(t.dtype, torch.special.xlog1py(t, 5.0).dtype) self.assertEqual(t.dtype, 
torch.xlogy(5, t).dtype) - self.assertEqual(t.dtype, torch.xlogy(5., t).dtype) + self.assertEqual(t.dtype, torch.xlogy(5.0, t).dtype) self.assertEqual(t.dtype, torch.special.xlog1py(5, t).dtype) - self.assertEqual(t.dtype, torch.special.xlog1py(5., t).dtype) + self.assertEqual(t.dtype, torch.special.xlog1py(5.0, t).dtype) @skipIf(not TEST_SCIPY, "Scipy required for the test.") def test_xlogy_xlog1py_bfloat16(self, device): @@ -3605,13 +4303,17 @@ def _compare_helper(x, y, torch_fn, reference_fn): x_dtype, y_dtype = torch.bfloat16, torch.bfloat16 # Tensor-Tensor Test (tensor of same and different shape) - x = make_tensor((3, 2, 4, 5), device, x_dtype, low=0.5, high=1000) - y = make_tensor((3, 2, 4, 5), device, y_dtype, low=0.5, high=1000) - z = make_tensor((4, 5), device, y_dtype, low=0.5, high=1000) + x = make_tensor((3, 2, 4, 5), dtype=x_dtype, device=device, low=0.5, high=1000) + y = make_tensor((3, 2, 4, 5), dtype=y_dtype, device=device, low=0.5, high=1000) + z = make_tensor((4, 5), dtype=y_dtype, device=device, low=0.5, high=1000) - x_1p = make_tensor((3, 2, 4, 5), device, x_dtype, low=-0.8, high=1000) - y_1p = make_tensor((3, 2, 4, 5), device, y_dtype, low=-0.8, high=1000) - z_1p = make_tensor((4, 5), device, y_dtype, low=-0.8, high=1000) + x_1p = make_tensor( + (3, 2, 4, 5), dtype=x_dtype, device=device, low=-0.8, high=1000 + ) + y_1p = make_tensor( + (3, 2, 4, 5), dtype=y_dtype, device=device, low=-0.8, high=1000 + ) + z_1p = make_tensor((4, 5), dtype=y_dtype, device=device, low=-0.8, high=1000) xlogy_fns = torch.xlogy, scipy.special.xlogy xlog1py_fns = torch.special.xlog1py, scipy.special.xlog1py @@ -3631,19 +4333,19 @@ def _compare_helper(x, y, torch_fn, reference_fn): _compare_helper(z_1p, 3.14, *xlog1py_fns) # Special Values Tensor-Tensor - t = torch.tensor([-1., 0., 1., 2., float('inf'), -float('inf'), float('nan')], device=device) + t = torch.tensor( + [-1.0, 0.0, 1.0, 2.0, float("inf"), -float("inf"), float("nan")], + device=device, + ) zeros = torch.tensor(7, dtype=y_dtype, device=device) _compare_helper(t, zeros, *xlogy_fns) - _compare_helper(t, 0., *xlogy_fns) + _compare_helper(t, 0.0, *xlogy_fns) _compare_helper(t, zeros, *xlog1py_fns) - _compare_helper(t, 0., *xlog1py_fns) + _compare_helper(t, 0.0, *xlog1py_fns) - @dtypes(*product(get_all_dtypes(include_complex=False, - include_half=False, include_bfloat16=False), - get_all_dtypes(include_complex=False, - include_half=False, include_bfloat16=False))) + @dtypes(*product(all_types_and(torch.bool), all_types_and(torch.bool))) @skipIf(not TEST_SCIPY, "Scipy required for the test.") @slowTest def test_zeta(self, device, dtypes): @@ -3656,64 +4358,106 @@ def test_helper(x, q): actual = torch.special.zeta(x, q) rtol, atol = None, None - if self.device_type == 'cpu': + if self.device_type == "cpu": rtol, atol = 1e-6, 1e-6 self.assertEqual(expected, actual, rtol=rtol, atol=atol, exact_dtype=False) # x tensor - q tensor same size - x = make_tensor((2, 3, 4), device, x_dtype) - q = make_tensor((2, 3, 4), device, q_dtype) + x = make_tensor((2, 3, 4), dtype=x_dtype, device=device) + q = make_tensor((2, 3, 4), dtype=q_dtype, device=device) test_helper(x, q) # x tensor - q tensor broadcast lhs - x = make_tensor((2, 1, 4), device, x_dtype) - q = make_tensor((2, 3, 4), device, q_dtype) + x = make_tensor((2, 1, 4), dtype=x_dtype, device=device) + q = make_tensor((2, 3, 4), dtype=q_dtype, device=device) test_helper(x, q) # x tensor - q tensor broadcast rhs - x = make_tensor((2, 3, 4), device, x_dtype) - q = make_tensor((2, 1, 4), device, 
q_dtype) + x = make_tensor((2, 3, 4), dtype=x_dtype, device=device) + q = make_tensor((2, 1, 4), dtype=q_dtype, device=device) test_helper(x, q) # x tensor - q tensor broadcast all - x = make_tensor((2, 3, 1), device, x_dtype) - q = make_tensor((2, 1, 4), device, q_dtype) + x = make_tensor((2, 3, 1), dtype=x_dtype, device=device) + q = make_tensor((2, 1, 4), dtype=q_dtype, device=device) test_helper(x, q) # x scalar - q tensor for x in np.linspace(-5, 5, num=10).tolist(): if not q_dtype.is_floating_point: q_dtype = torch.get_default_dtype() - q = make_tensor((2, 3, 4), device, q_dtype) + q = make_tensor((2, 3, 4), dtype=q_dtype, device=device) test_helper(x, q) # x tensor - q scalar for q in np.linspace(-5, 5, num=10).tolist(): if not x_dtype.is_floating_point: x_dtype = torch.get_default_dtype() - x = make_tensor((2, 3, 4), device, x_dtype) + x = make_tensor((2, 3, 4), dtype=x_dtype, device=device) test_helper(x, q) + @onlyCUDA + @dtypes( + torch.chalf, + ) + def test_mul_chalf_tensor_and_cpu_scalar(self, device, dtype): + # Tests that Tensor and CPU Scalar work for `mul` for chalf. + # Ideally, this should be covered by `test_complex_half_reference_testing` + # from test_ops.py by checking reference_samples from the OpInfo. + # But currently that doesn't work as sample generation requires support of + # `index_select` which is not implemented for `complex32` at the + # time of writing this test. + # TODO: Remove this test once above issue is fixed. + # Ref: https://github.com/pytorch/pytorch/pull/76364 + x = make_tensor((2, 2), device=device, dtype=dtype) + self.assertEqual(x * 2.5, x * torch.tensor(2.5, device=device, dtype=dtype)) -tensor_binary_ops = [ - '__lt__', '__le__', - '__gt__', '__ge__', - '__eq__', '__ne__', - - '__add__', '__radd__', '__iadd__', - '__sub__', '__rsub__', '__isub__', - '__mul__', '__rmul__', '__imul__', - '__matmul__', '__rmatmul__', - '__truediv__', '__rtruediv__', '__itruediv__', - '__floordiv__', '__rfloordiv__', '__ifloordiv__', - '__mod__', '__rmod__', '__imod__', - '__pow__', '__rpow__', '__ipow__', - '__lshift__', '__rlshift__', '__ilshift__', - '__rshift__', '__rrshift__', '__irshift__', - '__and__', '__rand__', '__iand__', - '__xor__', '__rxor__', '__ixor__', - '__or__', '__ror__', '__ior__', +tensor_binary_ops = [ + "__lt__", + "__le__", + "__gt__", + "__ge__", + "__eq__", + "__ne__", + "__add__", + "__radd__", + "__iadd__", + "__sub__", + "__rsub__", + "__isub__", + "__mul__", + "__rmul__", + "__imul__", + "__matmul__", + "__rmatmul__", + "__truediv__", + "__rtruediv__", + "__itruediv__", + "__floordiv__", + "__rfloordiv__", + "__ifloordiv__", + "__mod__", + "__rmod__", + "__imod__", + "__pow__", + "__rpow__", + "__ipow__", + "__lshift__", + "__rlshift__", + "__ilshift__", + "__rshift__", + "__rrshift__", + "__irshift__", + "__and__", + "__rand__", + "__iand__", + "__xor__", + "__rxor__", + "__ixor__", + "__or__", + "__ror__", + "__ior__", # Unsupported operators # '__imatmul__', # '__divmod__', '__rdivmod__', '__idivmod__', @@ -3726,35 +4470,33 @@ class UnknownType: # TODO: refactor to inline these _types = [ - torch.half, torch.float, torch.double, - torch.int8, torch.short, torch.int, torch.long, - torch.uint8 + torch.half, + torch.float, + torch.double, + torch.int8, + torch.short, + torch.int, + torch.long, + torch.uint8, ] - # TODO: refactor to use make_tensor - def _small_2d(dtype, device, has_zeros=True, fill_ones=False, oneish=False): - t = _make_tensor((5, 5), dtype, device, fill_ones=fill_ones) - if oneish: - return 
t.clamp(min=_number(.99, 1, dtype), max=1.01) - if not has_zeros: - return t.clamp(min=(_number(_div_min, 1, dtype))) - return t - def create_test_func(op): @dtypes(*_types) def test(self, device, dtype): # Generate the inputs - tensor = _small_2d(dtype, device) + tensor = torch.empty((), device=device, dtype=dtype) # Runs the tensor op on the device result = getattr(tensor, op)(UnknownType()) self.assertEqual(result, NotImplemented) + return test for op in tensor_binary_ops: test_name = "test_{}_not_implemented".format(op) assert not hasattr(cls, test_name), "{0} already in {1}".format( - test_name, cls.__name__) + test_name, cls.__name__ + ) setattr(cls, test_name, create_test_func(op)) @@ -3762,5 +4504,5 @@ def test(self, device, dtype): generate_not_implemented_tests(TestBinaryUfuncs) instantiate_device_type_tests(TestBinaryUfuncs, globals()) -if __name__ == '__main__': +if __name__ == "__main__": run_tests() diff --git a/test/test_complex.py b/test/test_complex.py index 9f2e0ad32401..88404902631f 100644 --- a/test/test_complex.py +++ b/test/test_complex.py @@ -3,12 +3,12 @@ import torch from torch.testing._internal.common_device_type import instantiate_device_type_tests, dtypes from torch.testing._internal.common_utils import TestCase, run_tests -from torch.testing._internal.common_dtype import get_all_complex_dtypes +from torch.testing._internal.common_dtype import complex_types devices = (torch.device('cpu'), torch.device('cuda:0')) class TestComplexTensor(TestCase): - @dtypes(*get_all_complex_dtypes()) + @dtypes(*complex_types()) def test_to_list(self, device, dtype): # test that the complex float tensor has expected values and # there's no garbage value in the resultant list diff --git a/test/test_cpp_extensions_jit.py b/test/test_cpp_extensions_jit.py index 06b1133d887f..9875f4ee3567 100644 --- a/test/test_cpp_extensions_jit.py +++ b/test/test_cpp_extensions_jit.py @@ -10,15 +10,12 @@ import subprocess import glob -import textwrap -from multiprocessing import Process - import torch.testing._internal.common_utils as common import torch import torch.backends.cudnn import torch.utils.cpp_extension from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME -from torch.testing._internal.common_utils import gradcheck, TEST_WITH_ASAN, has_breakpad +from torch.testing._internal.common_utils import gradcheck TEST_CUDA = torch.cuda.is_available() and CUDA_HOME is not None @@ -869,80 +866,6 @@ def test_custom_compound_op_autograd(self): gradcheck(torch.ops.my.add, [a, b], eps=1e-2) - @staticmethod - def _crash_handler_test_process(stderr_file, destination): - # Code to enable dumps and trigger a segfault - if sys.platform == "win32": - destination = destination.replace("\\", "\\\\") - csrc = textwrap.dedent(f""" - #include - #include - #include - #include - #include - - int fail() {{ - std::wstring_convert> converter; - std::string narrow("{destination}"); - std::wstring wide = converter.from_bytes(narrow); - torch::crash_handler::enable_minidumps(wide.c_str()); - - volatile int* bad = nullptr; - return *bad; - }} - """) - else: - csrc = textwrap.dedent(f""" - #include - - int fail() {{ - torch::crash_handler::enable_minidumps("{destination}"); - - volatile int* bad = nullptr; - return *bad; - }} - """) - - # Some special stuff to overwrite stderr for a C++ extension - # Copied from: https://stackoverflow.com/questions/8804893/redirect-stdout-from-python-for-c-calls - sys.stdout.flush() - newstdout = os.dup(2) - devnull = os.open(stderr_file, os.O_WRONLY) - os.dup2(devnull, 2) - 
os.close(devnull) - sys.stdout = os.fdopen(newstdout, 'w') - - module = torch.utils.cpp_extension.load_inline( - name="segfault", - cpp_sources=csrc, - functions=["fail"], - ) - module.fail() - - @unittest.skipIf(TEST_WITH_ASAN, "ASAN disables the crash handler's signal handler") - @unittest.skipIf(not has_breakpad(), "Built without breakpad") - @unittest.skipIf(os.environ.get("TEST_CONFIG") == "force_on_cpu", "fails on force_on_cpu config, tracked w/ #65253") - def test_crash_handler(self): - with tempfile.TemporaryDirectory() as temp_dir, tempfile.NamedTemporaryFile(delete=not sys.platform == "win32") as stderr: - # Use multiprocessing to spin up a separate process to make catching - # the segfault easier - p = Process(target=self._crash_handler_test_process, args=(stderr.name, temp_dir)) - p.start() - p.join() - - with open(stderr.name) as f: - result = f.read().strip() - - # Check that the signal handler was called - self.assertTrue(result.startswith(f"Wrote minidump to {temp_dir}")) - - with open(result.replace("Wrote minidump to ", ""), "rb") as dump_file: - dump_bytes = dump_file.read() - - # Check that the file has the correct magic number - self.assertEqual(b"MDMP", dump_bytes[0:4]) - - if __name__ == "__main__": common.run_tests() diff --git a/test/test_cuda.py b/test/test_cuda.py index 7df9f637274c..c3d33224c361 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -7,6 +7,7 @@ import ctypes import gc import io +import os import pickle import queue import sys @@ -64,7 +65,7 @@ def make_sparse_tensor(t, n, *sizes): torch.cat([torch.LongTensor(1, n).random_(s) for s in sizes], 0)) v = tensor._values() v = v.new(n).copy_(torch.randn(n)) - return t(i, v, torch.Size(sizes)) + return t(i, v, torch.Size(sizes)).coalesce() _cycles_per_ms = None @@ -568,18 +569,40 @@ def test_serialization_array_with_storage(self): self.assertTrue(isinstance(q_copy[0], torch.cuda.FloatTensor)) self.assertTrue(isinstance(q_copy[1], torch.cuda.IntTensor)) self.assertTrue(isinstance(q_copy[2], torch.cuda.FloatTensor)) - self.assertTrue(isinstance(q_copy[3], torch.storage.TypedStorage)) - self.assertTrue(isinstance(q_copy[3]._storage, torch.cuda.UntypedStorage)) + self.assertTrue(isinstance(q_copy[3], torch.storage._TypedStorage)) + self.assertTrue(isinstance(q_copy[3]._storage, torch.cuda._UntypedStorage)) q_copy[1].fill_(10) self.assertEqual(q_copy[3], torch.cuda.IntStorage(10).fill_(10)) def test_cublas_allow_tf32_get_set(self): + skip_tf32_cublas = 'TORCH_ALLOW_TF32_CUBLAS_OVERRIDE' in os.environ and\ + int(os.environ['TORCH_ALLOW_TF32_CUBLAS_OVERRIDE']) + if skip_tf32_cublas: + self.assertTrue(torch.backends.cuda.matmul.allow_tf32) + return + orig = torch.backends.cuda.matmul.allow_tf32 self.assertEqual(torch._C._get_cublas_allow_tf32(), orig) torch.backends.cuda.matmul.allow_tf32 = not orig self.assertEqual(torch._C._get_cublas_allow_tf32(), not orig) torch.backends.cuda.matmul.allow_tf32 = orig + def test_float32_matmul_precision_get_set(self): + self.assertEqual(torch.get_float32_matmul_precision(), 'highest') + skip_tf32_cublas = 'TORCH_ALLOW_TF32_CUBLAS_OVERRIDE' in os.environ and\ + int(os.environ['TORCH_ALLOW_TF32_CUBLAS_OVERRIDE']) + if not skip_tf32_cublas: + self.assertFalse(torch.backends.cuda.matmul.allow_tf32) + for p in ('medium', 'high'): + torch.set_float32_matmul_precision(p) + self.assertEqual(torch.get_float32_matmul_precision(), p) + if not skip_tf32_cublas: + self.assertTrue(torch.backends.cuda.matmul.allow_tf32) + torch.set_float32_matmul_precision('highest') + 
self.assertEqual(torch.get_float32_matmul_precision(), 'highest') + if not skip_tf32_cublas: + self.assertFalse(torch.backends.cuda.matmul.allow_tf32) + def test_cublas_allow_fp16_reduced_precision_reduction_get_set(self): orig = torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction self.assertEqual(torch._C._get_cublas_allow_fp16_reduced_precision_reduction(), orig) @@ -1519,6 +1542,7 @@ def _spawn_test_multinomial_invalid_probs_cuda(self, probs): self.assertTrue(any([msg in out or msg in err for msg in expected_messages])) @slowTest + @unittest.skipIf(TEST_WITH_ROCM, "ROCm doesn't support device side asserts") @unittest.skipIf(NO_MULTIPROCESSING_SPAWN, "Disabled for environments that \ don't support multiprocessing with spawn start method") def test_multinomial_invalid_probs_cuda(self): @@ -1956,6 +1980,7 @@ def worker(rank): t2.start() """]) + @unittest.skipIf(TEST_WITH_ROCM, "ROCm doesn't support device side asserts") def test_fixed_cuda_assert_async(self): with self.assertRaisesRegex(RuntimeError, "Boolean value of Tensor with no values is ambiguous"): torch._assert_async(torch.tensor([], device="cuda")) @@ -3007,7 +3032,6 @@ def test_autocast_rnn(self): x = torch.randn((B, T, F), device="cuda", dtype=input_dtype) elif input_layout == "packed": batch_first = False - x = torch.randn((T, B, F), device="cuda", dtype=input_dtype) x = torch.nn.utils.rnn.pack_padded_sequence(torch.randn((T, B, F), device="cuda", dtype=input_dtype), lengths=(3, 2, 1, 3), @@ -3103,6 +3127,18 @@ def test_max_large_axis(self): def test_to_numpy(self): self.assertRaises(TypeError, lambda: torch.empty(1, device="cuda").numpy()) + def test_graph_is_current_stream_capturing(self): + self.assertFalse(torch.cuda.is_current_stream_capturing()) + + if (TEST_CUDA and (not TEST_WITH_ROCM) and int(torch.version.cuda.split(".")[0]) >= 11): + s = torch.cuda.Stream() + with torch.cuda.stream(s): + g = torch.cuda.CUDAGraph() + self.assertFalse(torch.cuda.is_current_stream_capturing()) + g.capture_begin() + self.assertTrue(torch.cuda.is_current_stream_capturing()) + g.capture_end() + @unittest.skipIf((not TEST_CUDA) or TEST_WITH_ROCM or int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs") @@ -3124,6 +3160,14 @@ def test_graph_capture_simple(self): self.assertTrue(b.sum().item() == 11000.) + @unittest.skipIf((not TEST_CUDA) or + TEST_WITH_ROCM or + int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs") + def test_graph_capture_oom(self): + with self.assertRaisesRegex(RuntimeError, "out of memory"): + with torch.cuda.graph(torch.cuda.CUDAGraph()): + torch.zeros(2 ** 40, device="cuda") + @unittest.skipIf((not TEST_CUDA) or TEST_WITH_ROCM or int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs") @@ -3813,6 +3857,41 @@ def get_max_used(): self.assertEqual(matmul_expand_mem, matmul_mem) self.assertEqual(bmm_mem, matmul_mem) + @unittest.skipIf(not TEST_WITH_ROCM, "ROCm-only test") + def test_rocm_backward_pass_guard(self): + # The test exercises a ROCm-specific feature. 
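+        # torch._C._rocm_is_backward_pass() should report False while the forward pass of
+        # MyFunction below runs and True during its backward pass; the asserts inside
+        # forward() and backward() verify both states across a full optimizer step.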
+ + class MyFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, tensor, constant): + self.assertFalse(torch._C._rocm_is_backward_pass()) + ctx.constant = constant + return tensor * constant + + @staticmethod + def backward(ctx, grad_output): + self.assertTrue(torch._C._rocm_is_backward_pass()) + return grad_output * ctx.constant, None + + class MyModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.a = torch.nn.Parameter(torch.randn(())) + + def forward(self, x): + return MyFunction.apply(x, self.a) + + model = MyModule() + criterion = torch.nn.MSELoss(reduction='sum') + optimizer = torch.optim.SGD(model.parameters(), lr=1e-6) + + x = torch.randn(5, 5) + result = model(x) + loss = criterion(result, x) + optimizer.zero_grad() + loss.backward() + optimizer.step() + class TestCudaComm(TestCase): def _test_broadcast(self, input): @@ -3940,7 +4019,7 @@ def _test_reduce_add_coalesced(self, tensors, buffer_size): r_tensors = [comm.reduce_add(t) for t in zip(*dup_tensors)] for r, t in zip(r_tensors, tensors): self.assertEqualTypeString(r, t) - self.assertEqual(r, t * 2) + self.assertEqual(r.coalesce() if r.is_sparse else r, t * 2) rc_tensors = comm.reduce_add_coalesced(dup_tensors, buffer_size=buffer_size) self.assertEqual(r_tensors, rc_tensors) diff --git a/test/test_dataloader.py b/test/test_dataloader.py index bb45ad244741..6cefc78c2ed9 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -23,6 +23,7 @@ DataLoader2, Dataset, IterableDataset, + IterDataPipe, Subset, TensorDataset, communication, @@ -35,7 +36,8 @@ from torch._utils import ExceptionWrapper from torch.testing._internal.common_utils import (TestCase, run_tests, TEST_NUMPY, IS_WINDOWS, IS_IN_CI, NO_MULTIPROCESSING_SPAWN, skipIfRocm, slowTest, - load_tests, TEST_WITH_ASAN, TEST_WITH_TSAN, IS_SANDCASTLE) + load_tests, TEST_WITH_ASAN, TEST_WITH_TSAN, IS_SANDCASTLE, + IS_MACOS) try: @@ -62,6 +64,14 @@ HAS_DILL = False skipIfNoDill = unittest.skipIf(not HAS_DILL, "no dill") + +try: + import numpy as np + HAS_NUMPY = True +except ImportError: + HAS_NUMPY = False +skipIfNoNumpy = unittest.skipIf(not HAS_NUMPY, "no NumPy") + # load_tests from torch.testing._internal.common_utils is used to automatically filter tests for # sharding on sandcastle. 
This line silences flake warnings load_tests = load_tests @@ -833,6 +843,21 @@ def __len__(self): return int(math.ceil(len(self.dataset) / float(self.batch_size))) +class TestMultiEpochDataset(IterableDataset): + def __init__(self, length): + self.length = length + + def __iter__(self): + worker_info = torch.utils.data.get_worker_info() + assert worker_info is not None + worker_id = worker_info.id + for idx in range(self.length // worker_info.num_workers): + yield worker_id + + def __len__(self): + return self.length + + class CustomList(list): pass @@ -841,6 +866,14 @@ class CustomDict(dict): pass +def row_processor(row): + return np.add(row, 1) + + +def filter_len(row): + return len(row) == 4 + + @unittest.skipIf( TEST_WITH_TSAN, "Fails with TSAN with the following error: starting new threads after multi-threaded " @@ -1343,6 +1376,7 @@ def test_chain_iterable_style_dataset(self): with self.assertRaisesRegex(AssertionError, "ChainDataset only supports IterableDataset"): list(iter(ChainDataset([dataset1, self.dataset]))) + @unittest.skipIf(IS_MACOS, "Not working on macos") def test_multiprocessing_contexts(self): reference = [ torch.arange(3), @@ -1366,6 +1400,30 @@ def test_multiprocessing_contexts(self): self.assertEqual( reference, list(self._get_data_loader(ds_cls(counting_ds_n), multiprocessing_context=ctx, **dl_common_args))) + @skipIfNoNumpy + def test_multiprocessing_iterdatapipe(self): + # Testing to make sure that function from global scope (e.g. imported from library) can be serialized + # and used with multiprocess DataLoader + + reference = [torch.as_tensor([[2, 3, 4, 5]], dtype=torch.int64), + torch.as_tensor([[2, 3, 4, 5]], dtype=torch.int64)] + datapipe: IterDataPipe = IterableWrapper([[1, 2, 3, 4], [1, 2, 3, 4, 5, 6]]) + datapipe = datapipe.map(row_processor) + datapipe = datapipe.filter(lambda row: len(row) == 4) if HAS_DILL else datapipe.filter(filter_len) + + dl_common_args = dict(num_workers=2, batch_size=2, shuffle=True, pin_memory=(not TEST_CUDA)) + for ctx in supported_multiprocessing_contexts: + self.assertEqual(reference, + [t.type(torch.int64) + for t in self._get_data_loader(datapipe, multiprocessing_context=ctx, **dl_common_args)]) + if ctx is not None: + # test ctx object + ctx = mp.get_context(ctx) + self.assertEqual(reference, + [t.type(torch.int64) + for t in + self._get_data_loader(datapipe, multiprocessing_context=ctx, **dl_common_args)]) + def test_worker_seed(self): num_workers = 6 batch_size = 1 @@ -1385,6 +1443,19 @@ def get_dataloader(): dataset = SynchronizedSeedDataset(num_workers, batch_size, num_workers) self.assertEqual(set(int(batch) for batch in get_dataloader()), set(int(batch) for batch in get_dataloader())) + def test_multi_epochs_reproducibility(self): + num_workers = 2 + batch_size = 10 + num_epochs = 3 + + dataset = TestMultiEpochDataset(batch_size * num_workers) + dataloader = self._get_data_loader(dataset, batch_size=batch_size, + shuffle=False, num_workers=num_workers) + + for ind in range(num_epochs): + for batch_idx, sample in enumerate(dataloader): + self.assertEqual(sample.tolist(), [batch_idx % num_workers] * batch_size) + def test_worker_init_fn(self): dataset = SeedDataset(4) dataloader = self._get_data_loader(dataset, batch_size=2, num_workers=2, @@ -2104,6 +2175,13 @@ def test_basics(self): self.assertEqual(list(dl), list(dl2)) self.assertEqual(list(dl), list(dl2_threading)) + class Sorter(IterDataPipe): + def __init__(self, datapipe): + self.datapipe = datapipe + + def __iter__(self): + return iter(sorted(self.datapipe)) + 
def test_shuffle(self): items = list(range(1000)) dp = IterableWrapper(items).sharding_filter().shuffle() @@ -2111,19 +2189,27 @@ def test_shuffle(self): dl = DataLoader2(dp, batch_size=None, num_workers=2, shuffle=False) self.assertEqual(items, list(dl)) - dl = DataLoader(dp, batch_size=None, num_workers=2, shuffle=False, - worker_init_fn=torch.utils.data.backward_compatibility.worker_init_fn) + dl = DataLoader2(dp, batch_size=None, num_workers=2, shuffle=False, + worker_init_fn=torch.utils.data.backward_compatibility.worker_init_fn) self.assertEqual(items, list(dl)) dl = DataLoader2(dp, batch_size=None, num_workers=2, shuffle=True) self.assertNotEqual(items, list(dl)) self.assertEqual(items, sorted(list(dl))) - dl = DataLoader(dp, batch_size=None, num_workers=2, shuffle=True, - worker_init_fn=torch.utils.data.backward_compatibility.worker_init_fn) + dl = DataLoader2(dp, batch_size=None, num_workers=2, shuffle=True, + worker_init_fn=torch.utils.data.backward_compatibility.worker_init_fn) self.assertNotEqual(items, list(dl)) self.assertEqual(items, sorted(list(dl))) + dl = DataLoader2(self.Sorter(dp), batch_size=None, num_workers=2, shuffle=True) + self.assertEqual(list(dl), items) + + dl = DataLoader2(self.Sorter(dp), batch_size=None, num_workers=2, shuffle=True, + worker_init_fn=torch.utils.data.backward_compatibility.worker_init_fn) + self.assertEqual(list(dl), items) + + @unittest.skipIf( TEST_WITH_TSAN, "Fails with TSAN with the following error: starting new threads after multi-threaded " @@ -2265,6 +2351,19 @@ def test_pin_memory(self): self.assertTrue(sample['a_tensor'].is_pinned()) self.assertTrue(sample['another_dict']['a_number'].is_pinned()) + @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + def test_pin_memory_device(self): + loader = DataLoader(self.dataset, batch_size=2, pin_memory=True, pin_memory_device='cuda') + for sample in loader: + self.assertTrue(sample['a_tensor'].is_pinned(device='cuda')) + self.assertTrue(sample['another_dict']['a_number'].is_pinned(device='cuda')) + + @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + def test_pin_memory_with_only_device(self): + loader = DataLoader(self.dataset, batch_size=2, pin_memory_device='cuda') + for sample in loader: + self.assertFalse(sample['a_tensor'].is_pinned(device='cuda')) + self.assertFalse(sample['another_dict']['a_number'].is_pinned(device='cuda')) class DummyDataset(torch.utils.data.Dataset): def __init__(self): diff --git a/test/test_datapipe.py b/test/test_datapipe.py index ab56f0b41eb9..7e76618f338f 100644 --- a/test/test_datapipe.py +++ b/test/test_datapipe.py @@ -53,9 +53,9 @@ from torch.utils.data.datapipes.dataframe import CaptureDataFrame from torch.utils.data.datapipes.dataframe import dataframe_wrapper as df_wrapper - try: import dill + # XXX: By default, dill writes the Pickler dispatch table to inject its # own logic there. This globally affects the behavior of the standard library # pickler for any user who transitively depends on this module! 
@@ -68,6 +68,7 @@ try: import pandas # type: ignore[import] # noqa: F401 F403 + HAS_PANDAS = True except ImportError: HAS_PANDAS = False @@ -234,16 +235,16 @@ def test_api(self): self.assertTrue(fd.closed) def test_pickle(self): - f = tempfile.TemporaryFile() - with self.assertRaises(TypeError) as ctx1: - pickle.dumps(f) + with tempfile.TemporaryFile() as f: + with self.assertRaises(TypeError) as ctx1: + pickle.dumps(f) - wrap_f = StreamWrapper(f) - with self.assertRaises(TypeError) as ctx2: - pickle.dumps(wrap_f) + wrap_f = StreamWrapper(f) + with self.assertRaises(TypeError) as ctx2: + pickle.dumps(wrap_f) - # Same exception when pickle - self.assertEqual(str(ctx1.exception), str(ctx2.exception)) + # Same exception when pickle + self.assertEqual(str(ctx1.exception), str(ctx2.exception)) fd = TestStreamWrapper._FakeFD("") wrap_fd = StreamWrapper(fd) @@ -254,9 +255,9 @@ def test_repr(self): wrap_fd = StreamWrapper(fd) self.assertEqual(str(wrap_fd), "StreamWrapper") - f = tempfile.TemporaryFile() - wrap_f = StreamWrapper(f) - self.assertEqual(str(wrap_f), "StreamWrapper<" + str(f) + ">") + with tempfile.TemporaryFile() as f: + wrap_f = StreamWrapper(f) + self.assertEqual(str(wrap_f), "StreamWrapper<" + str(f) + ">") class TestIterableDataPipeBasic(TestCase): @@ -310,7 +311,7 @@ def test_listdirfilesdeterministic_iterable_datapipe(self): # The output order should be always the same. self.assertEqual(list(datapipe), list(datapipe)) - def test_readfilesfromdisk_iterable_datapipe(self): + def test_openfilesfromdisk_iterable_datapipe(self): # test import datapipe class directly from torch.utils.data.datapipes.iter import ( FileLister, @@ -330,6 +331,22 @@ def test_readfilesfromdisk_iterable_datapipe(self): rec[1].close() self.assertEqual(count, len(self.temp_files)) + # functional API + datapipe3 = datapipe1.open_files(mode='b') + + count = 0 + for rec in datapipe3: + count = count + 1 + self.assertTrue(rec[0] in self.temp_files) + with open(rec[0], 'rb') as f: + self.assertEqual(rec[1].read(), f.read()) + rec[1].close() + self.assertEqual(count, len(self.temp_files)) + + # __len__ Test + with self.assertRaises(TypeError): + len(datapipe3) + def test_routeddecoder_iterable_datapipe(self): temp_dir = self.temp_dir.name temp_pngfile_pathname = os.path.join(temp_dir, "test_png.png") @@ -361,12 +378,14 @@ def _helper(prior_dp, dp, channel_first=False): self.assertTrue(inp[1].closed) cached = list(datapipe2) - datapipe3 = dp.iter.RoutedDecoder(cached, _png_decoder) + with warnings.catch_warnings(record=True) as wa: + datapipe3 = dp.iter.RoutedDecoder(cached, _png_decoder) datapipe3.add_handler(decoder_basichandlers) _helper(cached, datapipe3) cached = list(datapipe2) - datapipe4 = dp.iter.RoutedDecoder(cached, decoder_basichandlers) + with warnings.catch_warnings(record=True) as wa: + datapipe4 = dp.iter.RoutedDecoder(cached, decoder_basichandlers) datapipe4.add_handler(_png_decoder) _helper(cached, datapipe4, channel_first=True) @@ -415,7 +434,7 @@ def test_demux_mux_datapipe(self): numbers = NumbersDataset(10) n1, n2, n3 = numbers.demux(3, lambda x: x % 3) n = n1.mux(n2, n3) - self.assertEqual(list(range(10)), list(n)) + self.assertEqual(list(range(9)), list(n)) # Functional Test: Uneven DataPipes source_numbers = list(range(0, 10)) + [10, 12] @@ -424,7 +443,7 @@ def test_demux_mux_datapipe(self): self.assertEqual([0, 2, 4, 6, 8, 10, 12], list(n1)) self.assertEqual([1, 3, 5, 7, 9], list(n2)) n = n1.mux(n2) - self.assertEqual(source_numbers, list(n)) + self.assertEqual(list(range(10)), list(n)) 
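+        # The updated expectations above reflect that mux now stops yielding as soon as its
+        # shortest input DataPipe is exhausted, rather than draining every input.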
@suppress_warnings # Suppress warning for lambda fn def test_map_with_col_file_handle_datapipe(self): @@ -465,9 +484,11 @@ def operations(df): df['c'] = df.b + df['a'] * 7 # somehow swallows pandas UserWarning when `df.c = df.b + df['a'] * 7` return df + self.compare_capture_and_eager(operations) +@skipIf(True, "Fix DataFramePipes Tests") class TestDataFramesPipes(TestCase): """ Most of test will fail if pandas instaled, but no dill available. @@ -482,8 +503,8 @@ def _get_dataframes_pipe(self, range=10, dataframe_size=7): return NumbersDataset(range) \ .map(lambda i: (i, i % 3)) \ ._to_dataframes_pipe( - columns=['i', 'j'], - dataframe_size=dataframe_size) + columns=['i', 'j'], + dataframe_size=dataframe_size) @skipIfNoDataFrames @skipIfNoDill # TODO(VitalyFedyunin): Decouple tests from dill by avoiding lambdas in map @@ -549,59 +570,146 @@ def _fake_add(constant, data): def _fake_filter_fn(data): - return data >= 5 + return True +def _simple_filter_fn(data): + return data >= 5 + def _fake_filter_fn_constant(constant, data): return data >= constant +def _mul_10(x): + return x * 10 + + +def _mod_3_test(x): + return x % 3 == 1 + + def _worker_init_fn(worker_id): - random.seed(123) + info = torch.utils.data.get_worker_info() + num_workers = info.num_workers + datapipe = info.dataset + torch.utils.data.graph_settings.apply_sharding(datapipe, num_workers, worker_id) class TestFunctionalIterDataPipe(TestCase): + def _serialization_test_helper(self, datapipe, use_dill): + if use_dill: + serialized_dp = dill.dumps(datapipe) + deserialized_dp = dill.loads(serialized_dp) + else: + serialized_dp = pickle.dumps(datapipe) + deserialized_dp = pickle.loads(serialized_dp) + try: + self.assertEqual(list(datapipe), list(deserialized_dp)) + except AssertionError as e: + print(f"{datapipe} is failing.") + raise e + + def _serialization_test_for_single_dp(self, dp, use_dill=False): + # 1. Testing for serialization before any iteration starts + self._serialization_test_helper(dp, use_dill) + # 2. Testing for serialization after DataPipe is partially read + it = iter(dp) + _ = next(it) + self._serialization_test_helper(dp, use_dill) + # 3. Testing for serialization after DataPipe is fully read + it = iter(dp) + _ = list(it) + self._serialization_test_helper(dp, use_dill) + + def _serialization_test_for_dp_with_children(self, dp1, dp2, use_dill=False): + # 1. Testing for serialization before any iteration starts + self._serialization_test_helper(dp1, use_dill) + self._serialization_test_helper(dp2, use_dill) + + # 2. Testing for serialization after DataPipe is partially read + it1, it2 = iter(dp1), iter(dp2) + _, _ = next(it1), next(it2) + # Catch `fork`, `demux` "some child DataPipes are not exhausted" warning + with warnings.catch_warnings(record=True) as wa: + self._serialization_test_helper(dp1, use_dill) + self._serialization_test_helper(dp2, use_dill) + + # 2.5. Testing for serialization after one child DataPipe is fully read + # (Only for DataPipes with children DataPipes) + it1 = iter(dp1) + _ = list(it1) # fully read one child + # Catch `fork`, `demux` "some child DataPipes are not exhausted" warning + with warnings.catch_warnings(record=True) as wa: + self._serialization_test_helper(dp1, use_dill) + self._serialization_test_helper(dp2, use_dill) + + # 3. 
Testing for serialization after DataPipe is fully read + it2 = iter(dp2) + _ = list(it2) # fully read the other child + self._serialization_test_helper(dp1, use_dill) + self._serialization_test_helper(dp2, use_dill) + def test_serializable(self): - input_dp = dp.iter.IterableWrapper(range(10)) - picklable_datapipes: List[Tuple[Type[IterDataPipe], Tuple, Dict[str, Any]]] = [ - (dp.iter.Batcher, (3, True,), {}), - (dp.iter.Collator, (_fake_fn,), {}), - (dp.iter.Concater, (dp.iter.IterableWrapper(range(5)),), {}), - (dp.iter.Demultiplexer, (2, _fake_filter_fn), {}), - (dp.iter.FileLister, (), {}), - (dp.iter.FileOpener, (), {}), - (dp.iter.Filter, (_fake_filter_fn,), {}), - (dp.iter.Filter, (partial(_fake_filter_fn_constant, 5),), {}), - (dp.iter.Forker, (2,), {}), - (dp.iter.Grouper, (_fake_filter_fn,), {"group_size": 2}), - (dp.iter.IterableWrapper, (), {}), - (dp.iter.Mapper, (_fake_fn, ), {}), - (dp.iter.Mapper, (partial(_fake_add, 1), ), {}), - (dp.iter.Multiplexer, (input_dp,), {}), - (dp.iter.Sampler, (), {}), - (dp.iter.Shuffler, (), {}), - (dp.iter.StreamReader, (), {}), - (dp.iter.UnBatcher, (), {}), - (dp.iter.Zipper, (input_dp,), {}), + picklable_datapipes: List = [ + (dp.iter.Batcher, None, (3, True,), {}), + (dp.iter.Collator, None, (_fake_fn,), {}), + (dp.iter.Concater, None, (dp.iter.IterableWrapper(range(5)),), {}), + (dp.iter.Demultiplexer, None, (2, _simple_filter_fn), {}), + (dp.iter.FileLister, ".", (), {}), + (dp.iter.FileOpener, None, (), {}), + (dp.iter.Filter, None, (_fake_filter_fn,), {}), + (dp.iter.Filter, None, (partial(_fake_filter_fn_constant, 5),), {}), + (dp.iter.Forker, None, (2,), {}), + (dp.iter.Grouper, None, (_fake_filter_fn,), {"group_size": 2}), + (dp.iter.IterableWrapper, range(10), (), {}), + (dp.iter.Mapper, None, (_fake_fn,), {}), + (dp.iter.Mapper, None, (partial(_fake_add, 1),), {}), + (dp.iter.Multiplexer, None, (dp.iter.IterableWrapper(range(10)),), {}), + (dp.iter.Sampler, None, (), {}), + (dp.iter.Shuffler, dp.iter.IterableWrapper([0] * 10), (), {}), + (dp.iter.StreamReader, None, (), {}), + (dp.iter.UnBatcher, None, (0,), {}), + (dp.iter.Zipper, None, (dp.iter.IterableWrapper(range(10)),), {}), ] - for dpipe, dp_args, dp_kwargs in picklable_datapipes: - print(dpipe) - _ = pickle.dumps(dpipe(input_dp, *dp_args, **dp_kwargs)) # type: ignore[call-arg] + # Skipping comparison for these DataPipes + dp_skip_comparison = {dp.iter.FileOpener, dp.iter.StreamReader} + # These DataPipes produce multiple DataPipes as outputs and those should be compared + dp_compare_children = {dp.iter.Demultiplexer, dp.iter.Forker} + + for dpipe, custom_input, dp_args, dp_kwargs in picklable_datapipes: + if custom_input is None: + custom_input = dp.iter.IterableWrapper(range(10)) + if dpipe in dp_skip_comparison: # Merely make sure they are picklable and loadable (no value comparison) + datapipe = dpipe(custom_input, *dp_args, **dp_kwargs) # type: ignore[call-arg] + serialized_dp = pickle.dumps(datapipe) + _ = pickle.loads(serialized_dp) + elif dpipe in dp_compare_children: # DataPipes that have children + dp1, dp2 = dpipe(custom_input, *dp_args, **dp_kwargs) # type: ignore[call-arg] + self._serialization_test_for_dp_with_children(dp1, dp2) + else: # Single DataPipe that requires comparison + datapipe = dpipe(custom_input, *dp_args, **dp_kwargs) # type: ignore[call-arg] + self._serialization_test_for_single_dp(datapipe) def test_serializable_with_dill(self): - """Only for DataPipes that take in a function or buffer as argument""" + """Only for DataPipes that take in a 
function as argument""" input_dp = dp.iter.IterableWrapper(range(10)) unpicklable_datapipes: List[Tuple[Type[IterDataPipe], Tuple, Dict[str, Any]]] = [ (dp.iter.Collator, (lambda x: x,), {}), (dp.iter.Demultiplexer, (2, lambda x: x % 2,), {}), (dp.iter.Filter, (lambda x: x >= 5,), {}), (dp.iter.Grouper, (lambda x: x >= 5,), {}), - (dp.iter.Mapper, (lambda x: x, ), {}), + (dp.iter.Mapper, (lambda x: x,), {}), ] + dp_compare_children = {dp.iter.Demultiplexer} if HAS_DILL: for dpipe, dp_args, dp_kwargs in unpicklable_datapipes: - _ = pickle.dumps(dpipe(input_dp, *dp_args, **dp_kwargs)) # type: ignore[call-arg] + if dpipe in dp_compare_children: + dp1, dp2 = dpipe(input_dp, *dp_args, **dp_kwargs) # type: ignore[call-arg] + self._serialization_test_for_dp_with_children(dp1, dp2, use_dill=True) + else: + datapipe = dpipe(input_dp, *dp_args, **dp_kwargs) # type: ignore[call-arg] + self._serialization_test_for_single_dp(datapipe, use_dill=True) else: for dpipe, dp_args, dp_kwargs in unpicklable_datapipes: with warnings.catch_warnings(record=True) as wa: @@ -738,28 +846,38 @@ def test_fork_iterdatapipe(self): self.assertEqual(list(range(5)), output2) self.assertEqual(list(range(10)), output3) - # Reset Test: DataPipe doesn't reset if this pipe hasn't been read + # Reset Test: DataPipe resets when a new iterator is created, even if this datapipe hasn't been read dp1, dp2 = input_dp.fork(num_instances=2) - i1, i2 = iter(dp1), iter(dp2) + _ = iter(dp1) output2 = [] - for i, n2 in enumerate(i2): - output2.append(n2) - if i == 4: - i1 = iter(dp1) # Doesn't reset because i1 hasn't been read - self.assertEqual(list(range(10)), output2) + with self.assertRaisesRegex(RuntimeError, r"iterator has been invalidated"): + for i, n2 in enumerate(dp2): + output2.append(n2) + if i == 4: + with warnings.catch_warnings(record=True) as wa: + _ = iter(dp1) # This will reset all child DataPipes + self.assertEqual(len(wa), 1) + self.assertRegex(str(wa[0].message), r"child DataPipes are not exhausted") + self.assertEqual(list(range(5)), output2) - # Reset Test: DataPipe reset when some of it have been read + # Reset Test: DataPipe resets when some of it has been read dp1, dp2 = input_dp.fork(num_instances=2) - i1, i2 = iter(dp1), iter(dp2) output1, output2 = [], [] - for i, (n1, n2) in enumerate(zip(i1, i2)): + for i, (n1, n2) in enumerate(zip(dp1, dp2)): output1.append(n1) output2.append(n2) if i == 4: with warnings.catch_warnings(record=True) as wa: - i1 = iter(dp1) # Reset both all child DataPipe + _ = iter(dp1) # Reset both all child DataPipe self.assertEqual(len(wa), 1) self.assertRegex(str(wa[0].message), r"Some child DataPipes are not exhausted") + break + with warnings.catch_warnings(record=True) as wa: + for i, (n1, n2) in enumerate(zip(dp1, dp2)): + output1.append(n1) + output2.append(n2) + self.assertEqual(len(wa), 1) + self.assertRegex(str(wa[0].message), r"child DataPipes are not exhausted") self.assertEqual(list(range(5)) + list(range(10)), output1) self.assertEqual(list(range(5)) + list(range(10)), output2) @@ -814,7 +932,7 @@ def test_mux_iterdatapipe(self): input_dp2 = dp.iter.IterableWrapper([10]) input_dp3 = dp.iter.IterableWrapper([100, 200, 300]) output_dp = input_dp1.mux(input_dp2, input_dp3) - expected_output = [1, 10, 100, 2, 200, 3, 300, 4] + expected_output = [1, 10, 100] self.assertEqual(len(expected_output), len(output_dp)) self.assertEqual(expected_output, list(output_dp)) @@ -822,8 +940,8 @@ def test_mux_iterdatapipe(self): input_dp1 = dp.iter.IterableWrapper([0, 1, 2, 3]) input_dp2 = 
dp.iter.IterableWrapper([]) output_dp = input_dp1.mux(input_dp2) - self.assertEqual(len(input_dp1), len(output_dp)) - self.assertEqual(list(input_dp1), list(output_dp)) + self.assertEqual(len(input_dp2), len(output_dp)) + self.assertEqual(list(input_dp2), list(output_dp)) # __len__ Test: raises TypeError when __len__ is called and an input doesn't have __len__ input_dp1 = dp.iter.IterableWrapper(range(10)) @@ -886,18 +1004,21 @@ def test_demux_iterdatapipe(self): next(it) next(it) - # Reset Test: DataPipe doesn't reset when it has not been read + # Reset Test: DataPipe resets when a new iterator is created, even if this datapipe hasn't been read dp1, dp2 = input_dp.demux(num_instances=2, classifier_fn=lambda x: x % 2) - i1 = iter(dp1) + _ = iter(dp1) output2 = [] - i = 0 - for i, n2 in enumerate(dp2): - output2.append(n2) - if i == 4: - i1 = iter(dp1) + with self.assertRaisesRegex(RuntimeError, r"iterator has been invalidated"): + for i, n2 in enumerate(dp2): + output2.append(n2) + if i == 4: + with warnings.catch_warnings(record=True) as wa: + _ = iter(dp1) # This will reset all child DataPipes + self.assertEqual(len(wa), 1) + self.assertRegex(str(wa[0].message), r"child DataPipes are not exhausted") self.assertEqual(list(range(1, 10, 2)), output2) - # Reset Test: DataPipe reset when some of it has been read + # Reset Test: DataPipe resets when some of it has been read dp1, dp2 = input_dp.demux(num_instances=2, classifier_fn=lambda x: x % 2) output1, output2 = [], [] for n1, n2 in zip(dp1, dp2): @@ -909,11 +1030,13 @@ def test_demux_iterdatapipe(self): i1 = iter(dp1) # Reset all child DataPipes self.assertEqual(len(wa), 1) self.assertRegex(str(wa[0].message), r"Some child DataPipes are not exhausted") - for n1, n2 in zip(dp1, dp2): - output1.append(n1) - output2.append(n2) - self.assertEqual([0, 2, 4] + list(range(0, 10, 2)), output1) - self.assertEqual([1, 3, 5] + list(range(1, 10, 2)), output2) + for n1, n2 in zip(dp1, dp2): + output1.append(n1) + output2.append(n2) + self.assertEqual([0, 2, 4] + list(range(0, 10, 2)), output1) + self.assertEqual([1, 3, 5] + list(range(1, 10, 2)), output2) + self.assertEqual(len(wa), 1) + self.assertRegex(str(wa[0].message), r"child DataPipes are not exhausted") # Reset Test: DataPipe reset, even when not all child DataPipes are exhausted dp1, dp2 = input_dp.demux(num_instances=2, classifier_fn=lambda x: x % 2) @@ -964,7 +1087,8 @@ def test_demux_iterdatapipe(self): traverse(dp2) # This should not raise any error either def test_map_iterdatapipe(self): - input_dp = dp.iter.IterableWrapper(range(10)) + target_length = 10 + input_dp = dp.iter.IterableWrapper(range(target_length)) def fn(item, dtype=torch.float, *, sum=False): data = torch.tensor(item, dtype=dtype) @@ -972,21 +1096,21 @@ def fn(item, dtype=torch.float, *, sum=False): # Functional Test: apply to each element correctly map_dp = input_dp.map(fn) - self.assertEqual(len(input_dp), len(map_dp)) - for x, y in zip(map_dp, input_dp): + self.assertEqual(target_length, len(map_dp)) + for x, y in zip(map_dp, range(target_length)): self.assertEqual(x, torch.tensor(y, dtype=torch.float)) # Functional Test: works with partial function map_dp = input_dp.map(partial(fn, dtype=torch.int, sum=True)) - for x, y in zip(map_dp, input_dp): + for x, y in zip(map_dp, range(target_length)): self.assertEqual(x, torch.tensor(y, dtype=torch.int).sum()) # __len__ Test: inherits length from source DataPipe - self.assertEqual(len(input_dp), len(map_dp)) + self.assertEqual(target_length, len(map_dp)) - input_dp_nl = 
IDP_NoLen(range(10)) + input_dp_nl = IDP_NoLen(range(target_length)) map_dp_nl = input_dp_nl.map(lambda x: x) - for x, y in zip(map_dp_nl, input_dp_nl): + for x, y in zip(map_dp_nl, range(target_length)): self.assertEqual(x, torch.tensor(y, dtype=torch.float)) # __len__ Test: inherits length from source DataPipe - raises error when invalid @@ -1138,24 +1262,24 @@ def _collate_fn(batch, default_type=torch.float): # Functional Test: defaults to the default collate function when a custom one is not specified collate_dp = input_dp.collate() - for x, y in zip(input_dp, collate_dp): + for x, y in zip(arrs, collate_dp): self.assertEqual(torch.tensor(x), y) # Functional Test: custom collate function collate_dp = input_dp.collate(collate_fn=_collate_fn) - for x, y in zip(input_dp, collate_dp): + for x, y in zip(arrs, collate_dp): self.assertEqual(torch.tensor(sum(x), dtype=torch.float), y) # Functional Test: custom, partial collate function collate_dp = input_dp.collate(partial(_collate_fn, default_type=torch.int)) - for x, y in zip(input_dp, collate_dp): + for x, y in zip(arrs, collate_dp): self.assertEqual(torch.tensor(sum(x), dtype=torch.int), y) # Reset Test: reset the DataPipe and results are still correct n_elements_before_reset = 1 res_before_reset, res_after_reset = reset_after_n_next_calls(collate_dp, n_elements_before_reset) self.assertEqual([torch.tensor(6, dtype=torch.int)], res_before_reset) - for x, y in zip(input_dp, res_after_reset): + for x, y in zip(arrs, res_after_reset): self.assertEqual(torch.tensor(sum(x), dtype=torch.int), y) # __len__ Test: __len__ is inherited @@ -1166,7 +1290,7 @@ def _collate_fn(batch, default_type=torch.float): collate_dp_nl = input_dp_nl.collate() with self.assertRaisesRegex(TypeError, r"instance doesn't have valid length$"): len(collate_dp_nl) - for x, y in zip(input_dp_nl, collate_dp_nl): + for x, y in zip(arrs, collate_dp_nl): self.assertEqual(torch.tensor(x), y) def test_batch_iterdatapipe(self): @@ -1216,14 +1340,14 @@ def test_unbatch_iterdatapipe(self): input_dp = prebatch_dp.batch(3) unbatch_dp = input_dp.unbatch() self.assertEqual(len(list(unbatch_dp)), target_length) # __len__ is as expected - for i, res in zip(prebatch_dp, unbatch_dp): + for i, res in zip(range(target_length), unbatch_dp): self.assertEqual(i, res) # Functional Test: unbatch works for an input with nested levels input_dp = dp.iter.IterableWrapper([[0, 1, 2], [3, 4, 5]]) unbatch_dp = input_dp.unbatch() self.assertEqual(len(list(unbatch_dp)), target_length) - for i, res in zip(prebatch_dp, unbatch_dp): + for i, res in zip(range(target_length), unbatch_dp): self.assertEqual(i, res) input_dp = dp.iter.IterableWrapper([[[0, 1], [2, 3]], [[4, 5], [6, 7]]]) @@ -1232,8 +1356,8 @@ def test_unbatch_iterdatapipe(self): unbatch_dp = input_dp.unbatch() expected_dp = [[0, 1], [2, 3], [4, 5], [6, 7]] self.assertEqual(len(list(unbatch_dp)), 4) - for i, res in zip(expected_dp, unbatch_dp): - self.assertEqual(i, res) + for j, res in zip(expected_dp, unbatch_dp): + self.assertEqual(j, res) # Functional Test: unbatching multiple levels at the same time unbatch_dp = input_dp.unbatch(unbatch_level=2) @@ -1272,20 +1396,12 @@ def test_unbatch_iterdatapipe(self): def test_filter_datapipe(self): input_ds = dp.iter.IterableWrapper(range(10)) - def _filter_fn(data, val, clip=False): - if clip: - return data >= val - return True + def _filter_fn(data, val): + return data >= val # Functional Test: filter works with partial function filter_dp = input_ds.filter(partial(_filter_fn, val=5)) - for data, exp in 
zip(filter_dp, range(10)): - self.assertEqual(data, exp) - - # Functional Test: filter works with partial function with keyword args - filter_dp = input_ds.filter(partial(_filter_fn, val=5, clip=True)) - for data, exp in zip(filter_dp, range(5, 10)): - self.assertEqual(data, exp) + self.assertEqual(list(filter_dp), list(range(5, 10))) def _non_bool_fn(data): return 1 @@ -1295,12 +1411,26 @@ def _non_bool_fn(data): with self.assertRaises(ValueError): temp = list(filter_dp) + # Funtional Test: Specify input_col + tuple_input_ds = dp.iter.IterableWrapper([(d - 1, d, d + 1) for d in range(10)]) + + # Single input_col + input_col_1_dp = tuple_input_ds.filter(partial(_filter_fn, val=5), input_col=1) + self.assertEqual(list(input_col_1_dp), [(d - 1, d, d + 1) for d in range(5, 10)]) + + # Multiple input_col + def _mul_filter_fn(a, b): + return a + b < 10 + + input_col_2_dp = tuple_input_ds.filter(_mul_filter_fn, input_col=[0, 2]) + self.assertEqual(list(input_col_2_dp), [(d - 1, d, d + 1) for d in range(5)]) + # __len__ Test: DataPipe has no valid len with self.assertRaisesRegex(TypeError, r"has no len"): len(filter_dp) # Reset Test: DataPipe resets correctly - filter_dp = input_ds.filter(partial(_filter_fn, val=5, clip=True)) + filter_dp = input_ds.filter(partial(_filter_fn, val=5)) n_elements_before_reset = 3 res_before_reset, res_after_reset = reset_after_n_next_calls(filter_dp, n_elements_before_reset) self.assertEqual(list(range(5, 10))[:n_elements_before_reset], res_before_reset) @@ -1315,39 +1445,124 @@ def test_sampler_iterdatapipe(self): self.assertEqual(x, i) # RandomSampler - random_sampled_dp = dp.iter.Sampler(input_dp, sampler=RandomSampler, sampler_kwargs={'replacement': True}) # type: ignore[var-annotated] # noqa: B950 + random_sampled_dp = dp.iter.Sampler(input_dp, sampler=RandomSampler, sampler_kwargs={ + 'replacement': True}) # type: ignore[var-annotated] # noqa: B950 # Requires `__len__` to build SamplerDataPipe input_dp_nolen = IDP_NoLen(range(10)) with self.assertRaises(AssertionError): sampled_dp = dp.iter.Sampler(input_dp_nolen) + def test_stream_reader_iterdatapipe(self): + from io import StringIO + + input_dp = dp.iter.IterableWrapper([("f1", StringIO("abcde")), ("f2", StringIO("bcdef"))]) + expected_res = ["abcde", "bcdef"] + + # Functional Test: Read full chunk + dp1 = input_dp.read_from_stream() + self.assertEqual([d[1] for d in dp1], expected_res) + + # Functional Test: Read full chunk + dp2 = input_dp.read_from_stream(chunk=1) + self.assertEqual([d[1] for d in dp2], [c for s in expected_res for c in s]) + + # `__len__` Test + with self.assertRaises(TypeError): + len(dp1) + def test_shuffle_iterdatapipe(self): - exp = list(range(20)) + exp = list(range(100)) input_ds = dp.iter.IterableWrapper(exp) with self.assertRaises(AssertionError): shuffle_dp = input_ds.shuffle(buffer_size=0) - for bs in (5, 20, 25): - shuffle_dp = input_ds.shuffle(buffer_size=bs) - self.assertEqual(len(shuffle_dp), len(input_ds)) + def _create_dp(buffer_size): + input_ds = dp.iter.IterableWrapper(list(range(100))) + return input_ds.shuffle(buffer_size=bs).sharding_filter() - random.seed(123) + for bs in (5, 20, 33): + shuffle_dp = _create_dp(bs) + self.assertEqual(len(shuffle_dp), len(exp)) + + torch.manual_seed(123) res = list(shuffle_dp) self.assertEqual(sorted(res), exp) # Test Deterministic - for num_workers in (0, 1): - random.seed(123) - dl = DataLoader(shuffle_dp, num_workers=num_workers, worker_init_fn=_worker_init_fn, shuffle=True) - dl_res = list(dl) - self.assertEqual(res, dl_res) + 
for num_workers in (0, 1, 2): + dl_res = [] + mp_ctx = "spawn" if num_workers > 0 else None + dl = DataLoader( + shuffle_dp, + num_workers=num_workers, + shuffle=True, + multiprocessing_context=mp_ctx, + worker_init_fn=_worker_init_fn + ) + for epoch in range(2): + torch.manual_seed(123) + dl_res.append(list(dl)) + self.assertEqual(dl_res[0], dl_res[1]) + + # Different seeds + torch.manual_seed(321) + dl_res.append(list(dl)) + + self.assertEqual(len(dl_res[0]), len(dl_res[2])) + self.assertNotEqual(dl_res[0], dl_res[2]) + self.assertEqual(sorted(dl_res[0]), sorted(dl_res[2])) + + if num_workers == 0: + continue + + # Persistent workers + ps_dl_res = [] + for _ in range(2): + dl = DataLoader( + shuffle_dp, + num_workers=num_workers, + shuffle=True, + multiprocessing_context="spawn", + worker_init_fn=_worker_init_fn, + persistent_workers=True + ) + ps_res = [] + torch.manual_seed(123) + for epoch in range(2): + ps_res.extend(list(dl)) + ps_dl_res.append(ps_res) + self.assertEqual(ps_dl_res[0], ps_dl_res[1]) + + # Different Seeds + dl = DataLoader( + shuffle_dp, + num_workers=num_workers, + shuffle=True, + multiprocessing_context="spawn", + worker_init_fn=_worker_init_fn, + persistent_workers=True + ) + ps_res = [] + torch.manual_seed(321) + for epoch in range(2): + ps_res.extend(list(dl)) + ps_dl_res.append(ps_res) + + self.assertEqual(len(ps_dl_res[0]), len(ps_dl_res[2])) + self.assertNotEqual(ps_dl_res[0], ps_dl_res[2]) + self.assertEqual(sorted(ps_dl_res[0]), sorted(ps_dl_res[2])) + shuffle_dp_nl = IDP_NoLen(range(20)).shuffle(buffer_size=5) with self.assertRaisesRegex(TypeError, r"instance doesn't have valid length$"): len(shuffle_dp_nl) + # Test: deactivate shuffling via set_shuffle + unshuffled_dp = input_ds.shuffle().set_shuffle(False) + self.assertEqual(list(unshuffled_dp), list(input_ds)) + def test_zip_iterdatapipe(self): # Functional Test: raises TypeError when an input is not of type `IterDataPipe` @@ -1378,19 +1593,50 @@ def test_zip_iterdatapipe(self): class TestFunctionalMapDataPipe(TestCase): + + def _serialization_test_helper(self, datapipe, use_dill): + if use_dill: + serialized_dp = dill.dumps(datapipe) + deserialized_dp = dill.loads(serialized_dp) + else: + serialized_dp = pickle.dumps(datapipe) + deserialized_dp = pickle.loads(serialized_dp) + try: + self.assertEqual(list(datapipe), list(deserialized_dp)) + except AssertionError as e: + print(f"{datapipe} is failing.") + raise e + + def _serialization_test_for_single_dp(self, dp, use_dill=False): + # 1. Testing for serialization before any iteration starts + self._serialization_test_helper(dp, use_dill) + # 2. Testing for serialization after DataPipe is partially read + it = iter(dp) + _ = next(it) + self._serialization_test_helper(dp, use_dill) + # 3. 
Testing for serialization after DataPipe is fully read + _ = list(it) + self._serialization_test_helper(dp, use_dill) + def test_serializable(self): - input_dp = dp.map.SequenceWrapper(range(10)) - picklable_datapipes: List[ - Tuple[Type[MapDataPipe], Tuple, Dict[str, Any]] - ] = [ - (dp.map.Mapper, (), {}), - (dp.map.Mapper, (_fake_fn, ), {}), - (dp.map.Mapper, (partial(_fake_add, 1), ), {}), + picklable_datapipes: List = [ + (dp.map.Batcher, None, (2,), {}), + (dp.map.Concater, None, (dp.map.SequenceWrapper(range(10)),), {}), + (dp.map.Mapper, None, (), {}), + (dp.map.Mapper, None, (_fake_fn,), {}), + (dp.map.Mapper, None, (partial(_fake_add, 1),), {}), + (dp.map.SequenceWrapper, range(10), (), {}), + (dp.map.Shuffler, dp.map.SequenceWrapper([0] * 5), (), {}), + (dp.map.Zipper, None, (dp.map.SequenceWrapper(range(10)),), {}), ] - for dpipe, dp_args, dp_kwargs in picklable_datapipes: - _ = pickle.dumps(dpipe(input_dp, *dp_args, **dp_kwargs)) # type: ignore[call-arg] + for dpipe, custom_input, dp_args, dp_kwargs in picklable_datapipes: + if custom_input is None: + custom_input = dp.map.SequenceWrapper(range(10)) + datapipe = dpipe(custom_input, *dp_args, **dp_kwargs) # type: ignore[call-arg] + self._serialization_test_for_single_dp(datapipe) def test_serializable_with_dill(self): + """Only for DataPipes that take in a function as argument""" input_dp = dp.map.SequenceWrapper(range(10)) unpicklable_datapipes: List[ Tuple[Type[MapDataPipe], Tuple, Dict[str, Any]] @@ -1399,7 +1645,7 @@ def test_serializable_with_dill(self): ] if HAS_DILL: for dpipe, dp_args, dp_kwargs in unpicklable_datapipes: - _ = pickle.dumps(dpipe(input_dp, *dp_args, **dp_kwargs)) # type: ignore[call-arg] + _ = dill.dumps(dpipe(input_dp, *dp_args, **dp_kwargs)) # type: ignore[call-arg] else: for dpipe, dp_args, dp_kwargs in unpicklable_datapipes: with warnings.catch_warnings(record=True) as wa: @@ -1589,7 +1835,7 @@ class A(IterDataPipe[P]): @skipTyping def test_subtype(self): - from torch.utils.data._typing import issubtype + from torch.utils.data.datapipes._typing import issubtype basic_type = (int, str, bool, float, complex, list, tuple, dict, set, T_co) @@ -1637,7 +1883,7 @@ def test_subtype(self): @skipTyping def test_issubinstance(self): - from torch.utils.data._typing import issubinstance + from torch.utils.data.datapipes._typing import issubinstance basic_data = (1, '1', True, 1., complex(1., 0.)) basic_type = (int, str, bool, float, complex) @@ -1690,7 +1936,7 @@ def __iter__(self) -> Iterator[int]: # type: ignore[override] with self.assertRaisesRegex(TypeError, r"Expected return type of '__iter__'"): class InvalidDP3(IterDataPipe[Tuple[int, str]]): def __iter__(self) -> Iterator[tuple]: # type: ignore[override] - yield (0, ) + yield (0,) if _generic_namedtuple_allowed: with self.assertRaisesRegex(TypeError, r"is not supported by Python typing"): @@ -1707,14 +1953,14 @@ def __iter__(self) -> Iterator[Tuple[int, str]]: self.assertTrue(issubclass(DP1, IterDataPipe)) dp1 = DP1(10) - self.assertTrue(DP1.type.issubtype(dp1.type) and dp1.type.issubtype(DP1.type)) + self.assertTrue(DP1.type.issubtype(dp1.type) and dp1.type.issubtype(DP1.type)) # type: ignore[attr-defined] dp1_ = DP1(5) self.assertEqual(dp1.type, dp1_.type) with self.assertRaisesRegex(TypeError, r"is not a generic class"): class InvalidDP5(DP1[tuple]): # type: ignore[type-arg] def __iter__(self) -> Iterator[tuple]: # type: ignore[override] - yield (0, ) + yield (0,) class DP2(IterDataPipe[T_co]): def __iter__(self) -> Iterator[T_co]: @@ -1723,7 +1969,7 
@@ def __iter__(self) -> Iterator[T_co]: self.assertTrue(issubclass(DP2, IterDataPipe)) dp2 = DP2() # type: ignore[var-annotated] - self.assertTrue(DP2.type.issubtype(dp2.type) and dp2.type.issubtype(DP2.type)) + self.assertTrue(DP2.type.issubtype(dp2.type) and dp2.type.issubtype(DP2.type)) # type: ignore[attr-defined] dp2_ = DP2() # type: ignore[var-annotated] self.assertEqual(dp2.type, dp2_.type) @@ -1739,7 +1985,7 @@ def __iter__(self) -> Iterator[Tuple[T_co, str]]: self.assertTrue(issubclass(DP3, IterDataPipe)) dp3 = DP3(range(10)) # type: ignore[var-annotated] - self.assertTrue(DP3.type.issubtype(dp3.type) and dp3.type.issubtype(DP3.type)) + self.assertTrue(DP3.type.issubtype(dp3.type) and dp3.type.issubtype(DP3.type)) # type: ignore[attr-defined] dp3_ = DP3(5) # type: ignore[var-annotated] self.assertEqual(dp3.type, dp3_.type) @@ -1761,7 +2007,7 @@ def __iter__(self) -> Iterator[str]: self.assertTrue(issubclass(DP5, IterDataPipe)) dp5 = DP5() - from torch.utils.data._typing import issubtype + from torch.utils.data.datapipes._typing import issubtype self.assertTrue(issubtype(dp5.type.param, Any) and issubtype(Any, dp5.type.param)) class DP6(IterDataPipe[int]): @@ -1778,13 +2024,13 @@ class DP7(IterDataPipe[Awaitable[T_co]]): r""" DataPipe with abstract base class""" self.assertTrue(issubclass(DP7, IterDataPipe)) - self.assertTrue(DP7.type.param == Awaitable[T_co]) + self.assertTrue(DP7.type.param == Awaitable[T_co]) # type: ignore[attr-defined] class DP8(DP7[str]): r""" DataPipe subclass from a DataPipe with abc type""" self.assertTrue(issubclass(DP8, IterDataPipe)) - self.assertTrue(DP8.type.param == Awaitable[str]) + self.assertTrue(DP8.type.param == Awaitable[str]) # type: ignore[attr-defined] @skipTyping def test_construct_time(self): @@ -1918,10 +2164,171 @@ def test_traverse_forked(self): dp2: {dp2.main_datapipe: {dp2.main_datapipe.main_datapipe: {}}}}} self.assertEqual(expected, graph) + def test_traverse_mapdatapipe(self): + source_dp = dp.map.SequenceWrapper(range(10)) + map_dp = source_dp.map(partial(_fake_add, 1)) + graph = torch.utils.data.graph.traverse(map_dp) + expected: Dict[Any, Any] = {map_dp: {source_dp: {}}} + self.assertEqual(expected, graph) + + def test_traverse_mixdatapipe(self): + source_map_dp = dp.map.SequenceWrapper(range(10)) + iter_dp = dp.iter.IterableWrapper(source_map_dp) + graph = torch.utils.data.graph.traverse(iter_dp) + expected: Dict[Any, Any] = {iter_dp: {source_map_dp: {}}} + self.assertEqual(expected, graph) + + +def unbatch(x): + return x[0] + + +class TestSerialization(TestCase): + @skipIfNoDill + def test_spawn_lambdas_iter(self): + idp = dp.iter.IterableWrapper(range(3)).map(lambda x: x + 1) + dl = DataLoader(idp, num_workers=2, shuffle=True, + multiprocessing_context='spawn', collate_fn=unbatch, batch_size=1) + result = list(dl) + self.assertEquals([1, 1, 2, 2, 3, 3], sorted(result)) + + @skipIfNoDill + def test_spawn_lambdas_map(self): + mdp = dp.map.SequenceWrapper(range(6)).map(lambda x: x + 1) + dl = DataLoader(mdp, num_workers=2, shuffle=True, + multiprocessing_context='spawn', collate_fn=unbatch, batch_size=1) + result = list(dl) + self.assertEquals([1, 2, 3, 4, 5, 6], sorted(result)) + + +class TestCircularSerialization(TestCase): + class CustomIterDataPipe(IterDataPipe): + + @staticmethod + def add_one(x): + return x + 1 + + @classmethod + def classify(cls, x): + return 0 + + def add_v(self, x): + return x + self.v + + def __init__(self, fn, source_dp=None): + self.fn = fn + self.source_dp = source_dp if source_dp else 
dp.iter.IterableWrapper([1, 2, 4]) + self._dp = self.source_dp.map(self.add_one).map(self.add_v).demux(2, self.classify)[0] + self.v = 1 + + def __iter__(self): + yield from self._dp + + def test_circular_serialization_with_pickle(self): + from torch.utils.data.datapipes.iter.combining import _ChildDataPipe, _DemultiplexerIterDataPipe + + def _get_name(datapipe): + return datapipe.__name__ + + # Test for circular reference issue with pickle + source_dp = TestCircularSerialization.CustomIterDataPipe(fn=_fake_fn) + self.assertTrue(list(source_dp) == + list(pickle.loads(pickle.dumps(TestCircularSerialization.CustomIterDataPipe(fn=_fake_fn))))) + res1 = traverse(source_dp, only_datapipe=True) + res2 = traverse(source_dp, only_datapipe=False) + expected_str1 = str({source_dp: + {_get_name(dp.iter.IterableWrapper): {}, + _get_name(_ChildDataPipe): + {_get_name(_DemultiplexerIterDataPipe): + {_get_name(dp.iter.Mapper): + {_get_name(dp.iter.Mapper): + {_get_name(dp.iter.IterableWrapper): {}}}}}}} + ).replace("'", "") + expected_str2 = str({source_dp: + {_get_name(dp.iter.IterableWrapper): {}, + _get_name(_ChildDataPipe): + {_get_name(_DemultiplexerIterDataPipe): + {_get_name(dp.iter.Mapper): + {_get_name(dp.iter.Mapper): + {_get_name(dp.iter.IterableWrapper): {}}, + _get_name(dp.iter.IterableWrapper): {}}}}}} + ).replace("'", "") + # For simplicity, compare the resulting string instead of trying to recreate the object + self.assertEqual(expected_str1, str(res1)) + self.assertEqual(expected_str2, str(res2)) + + dp1 = TestCircularSerialization.CustomIterDataPipe(fn=_fake_fn) + dp2 = TestCircularSerialization.CustomIterDataPipe(fn=_fake_fn, source_dp=dp1) + self.assertTrue(list(dp2) == list(pickle.loads(pickle.dumps(dp2)))) + res3 = traverse(dp2, only_datapipe=True) + res4 = traverse(dp2, only_datapipe=False) + self.assertTrue(str(dp2) in str(res3)) # Quick check to ensure the result isn't blank + self.assertTrue(str(dp2) in str(res4)) + + class LambdaIterDataPipe(CustomIterDataPipe): + + def __init__(self, fn, source_dp=None): + super().__init__(fn, source_dp) + self.container = [lambda x: x + 1, ] + self.lambda_fn = lambda x: x + 1 + self._dp = self.source_dp.map(self.add_one).map(self.lambda_fn).map(self.add_v).demux(2, self.classify)[0] + + @skipIfNoDill + def test_circular_serialization_with_dill(self): + from torch.utils.data.datapipes.iter.combining import _ChildDataPipe, _DemultiplexerIterDataPipe + + def _get_name(datapipe): + return datapipe.__name__ + + # Test for circular reference issue with dill + self.assertTrue(list(TestCircularSerialization.LambdaIterDataPipe(lambda x: x + 1)) == + list(dill.loads(dill.dumps(TestCircularSerialization.LambdaIterDataPipe(lambda x: x + 1))))) + source_dp = TestCircularSerialization.LambdaIterDataPipe(fn=_fake_fn) + res1 = traverse(source_dp, only_datapipe=True) + res2 = traverse(source_dp, only_datapipe=False) + expected_str1 = str({source_dp: + {_get_name(dp.iter.IterableWrapper): {}, + _get_name(_ChildDataPipe): + {_get_name(_DemultiplexerIterDataPipe): + {_get_name(dp.iter.Mapper): + {_get_name(dp.iter.Mapper): + {_get_name(dp.iter.Mapper): + {_get_name(dp.iter.IterableWrapper): {}}}}}}}} + ).replace("'", "") + expected_str2 = str({source_dp: + {_get_name(dp.iter.IterableWrapper): {}, + _get_name(_ChildDataPipe): + {_get_name(_DemultiplexerIterDataPipe): + {_get_name(dp.iter.Mapper): + {_get_name(dp.iter.Mapper): + {_get_name(dp.iter.Mapper): + {_get_name(dp.iter.IterableWrapper): {}}}, + _get_name(dp.iter.IterableWrapper): {}}}}}} + ).replace("'", 
"") + # For simplicity, compare the resulting string instead of trying to recreate the object + self.assertEqual(expected_str1, str(res1)) + self.assertEqual(expected_str2, str(res2)) + + dp1 = TestCircularSerialization.LambdaIterDataPipe(fn=_fake_fn) + dp2 = TestCircularSerialization.LambdaIterDataPipe(fn=_fake_fn, source_dp=dp1) + self.assertTrue(list(dp2) == list(dill.loads(dill.dumps(dp2)))) + res3 = traverse(dp2, only_datapipe=True) + res4 = traverse(dp2, only_datapipe=False) + self.assertTrue(str(dp2) in str(res3)) # Quick check to ensure the result isn't blank + self.assertTrue(str(dp2) in str(res4)) + class TestSharding(TestCase): def _get_pipeline(self): + numbers_dp = NumbersDataset(size=10) + dp0, dp1 = numbers_dp.fork(num_instances=2) + dp0_upd = dp0.map(_mul_10) + dp1_upd = dp1.filter(_mod_3_test) + combined_dp = dp0_upd.mux(dp1_upd) + return combined_dp + + def _get_dill_pipeline(self): numbers_dp = NumbersDataset(size=10) dp0, dp1 = numbers_dp.fork(num_instances=2) dp0_upd = dp0.map(lambda x: x * 10) @@ -1929,20 +2336,18 @@ def _get_pipeline(self): combined_dp = dp0_upd.mux(dp1_upd) return combined_dp - @skipIfNoDill def test_simple_sharding(self): sharded_dp = self._get_pipeline().sharding_filter() torch.utils.data.graph_settings.apply_sharding(sharded_dp, 3, 1) items = list(sharded_dp) - self.assertEqual([1, 20, 40, 70], items) + self.assertEqual([1, 20], items) - all_items = list(self._get_pipeline()) + all_items = [0, 1, 10, 4, 20, 7] items = [] for i in range(3): sharded_dp = self._get_pipeline().sharding_filter() torch.utils.data.graph_settings.apply_sharding(sharded_dp, 3, i) items += list(sharded_dp) - self.assertEqual(sorted(all_items), sorted(items)) def test_sharding_length(self): @@ -1966,7 +2371,6 @@ def test_sharding_length(self): self.assertEqual(1, len(sharded_dp0)) self.assertEqual(0, len(sharded_dp1)) - @skipIfNoDill def test_old_dataloader(self): dp0 = self._get_pipeline() expected = list(dp0) @@ -1981,5 +2385,229 @@ def test_old_dataloader(self): self.assertEqual(sorted(expected), sorted(items)) +class TestIterDataPipeSingletonConstraint(TestCase): + + r""" + Each `IterDataPipe` can only have one active iterator. Whenever a new iterator is created, older + iterators are invalidated. These tests aim to ensure `IterDataPipe` follows this behavior. + """ + + def _check_single_iterator_invalidation_logic(self, source_dp: IterDataPipe): + r""" + Given a IterDataPipe, verifies that the iterator can be read, reset, and the creation of + a second iterator invalidates the first one. + """ + it1 = iter(source_dp) + self.assertEqual(list(range(10)), list(it1)) + it1 = iter(source_dp) + self.assertEqual(list(range(10)), list(it1)) # A fresh iterator can be read in full again + it1 = iter(source_dp) + self.assertEqual(0, next(it1)) + it2 = iter(source_dp) # This should invalidate `it1` + self.assertEqual(0, next(it2)) # Should read from the beginning again + with self.assertRaisesRegex(RuntimeError, "This iterator has been invalidated"): + next(it1) + + + def test_iterdatapipe_singleton_generator(self): + r""" + Testing for the case where IterDataPipe's `__iter__` is a generator function. 
+ """ + + # Functional Test: Check if invalidation logic is correct + source_dp: IterDataPipe = dp.iter.IterableWrapper(range(10)) + self._check_single_iterator_invalidation_logic(source_dp) + + # Functional Test: extend the test to a pipeline + dps = source_dp.map(_fake_fn).filter(_fake_filter_fn) + self._check_single_iterator_invalidation_logic(dps) + + # Functional Test: multiple simultaneous references to the same DataPipe fails + with self.assertRaisesRegex(RuntimeError, "This iterator has been invalidated"): + for _ in zip(source_dp, source_dp): + pass + + # Function Test: sequential references work + for _ in zip(list(source_dp), list(source_dp)): + pass + + def test_iterdatapipe_singleton_self_next(self): + r""" + Testing for the case where IterDataPipe's `__iter__` returns `self` and there is a `__next__` method + Note that the following DataPipe by is singleton by default (because `__iter__` returns `self`). + """ + class _CustomIterDP_Self(IterDataPipe): + def __init__(self, iterable): + self.source = iterable + self.iterable = iter(iterable) + + def __iter__(self): + self.reset() + return self + + def __next__(self): + return next(self.iterable) + + def reset(self): + self.iterable = iter(self.source) + + # Functional Test: Check that every `__iter__` call returns the same object + source_dp = _CustomIterDP_Self(range(10)) + res = list(source_dp) + it = iter(source_dp) + self.assertEqual(res, list(it)) + + # Functional Test: Check if invalidation logic is correct + source_dp = _CustomIterDP_Self(range(10)) + self._check_single_iterator_invalidation_logic(source_dp) + self.assertEqual(1, next(source_dp)) # `source_dp` is still valid and can be read + + # Functional Test: extend the test to a pipeline + source_dp = _CustomIterDP_Self(dp.iter.IterableWrapper(range(10)).map(_fake_fn).filter(_fake_filter_fn)) + self._check_single_iterator_invalidation_logic(source_dp) + self.assertEqual(1, next(source_dp)) # `source_dp` is still valid and can be read + + # Functional Test: multiple simultaneous references to the same DataPipe fails + with self.assertRaisesRegex(RuntimeError, "This iterator has been invalidated"): + for _ in zip(source_dp, source_dp): + pass + + def test_iterdatapipe_singleton_new_object(self): + r""" + Testing for the case where IterDataPipe's `__iter__` isn't a generator nor returns `self`, + and there isn't a `__next__` method. 
+ """ + class _CustomIterDP(IterDataPipe): + def __init__(self, iterable): + self.iterable = iter(iterable) + + def __iter__(self): # Note that this doesn't reset + return self.iterable # Intentionally not returning `self` + + # Functional Test: Check if invalidation logic is correct + source_dp = _CustomIterDP(range(10)) + it1 = iter(source_dp) + self.assertEqual(0, next(it1)) + it2 = iter(source_dp) + self.assertEqual(1, next(it2)) + with self.assertRaisesRegex(RuntimeError, "This iterator has been invalidated"): + next(it1) + + # Functional Test: extend the test to a pipeline + source_dp = _CustomIterDP(dp.iter.IterableWrapper(range(10)).map(_fake_fn).filter(_fake_filter_fn)) + it1 = iter(source_dp) + self.assertEqual(0, next(it1)) + it2 = iter(source_dp) + self.assertEqual(1, next(it2)) + with self.assertRaisesRegex(RuntimeError, "This iterator has been invalidated"): + next(it1) + + # Functional Test: multiple simultaneous references to the same DataPipe fails + with self.assertRaisesRegex(RuntimeError, "This iterator has been invalidated"): + for _ in zip(source_dp, source_dp): + pass + + def test_iterdatapipe_singleton_buggy(self): + r""" + Buggy test case case where IterDataPipe's `__iter__` returns a new object, but also has + a `__next__` method. + """ + class _CustomIterDP(IterDataPipe): + def __init__(self, iterable): + self.source = iterable + self.iterable = iter(iterable) + + def __iter__(self): + return iter(self.source) # Intentionally not returning `self` + + def __next__(self): + return next(self.iterable) + + # Functional Test: Check if invalidation logic is correct + source_dp = _CustomIterDP(range(10)) + self._check_single_iterator_invalidation_logic(source_dp) + self.assertEqual(0, next(source_dp)) # `__next__` is unrelated with `__iter__` + + # Functional Test: Special case to show `__next__` is unrelated with `__iter__` + source_dp = _CustomIterDP(range(10)) + self.assertEqual(0, next(source_dp)) + it1 = iter(source_dp) + self.assertEqual(0, next(it1)) + self.assertEqual(1, next(source_dp)) + it2 = iter(source_dp) # invalidates both `it1` + with self.assertRaisesRegex(RuntimeError, "This iterator has been invalidated"): + next(it1) + self.assertEqual(2, next(source_dp)) # not impacted by the creation of `it2` + self.assertEqual(list(range(10)), list(it2)) # `it2` still works because it is a new object + + def test_iterdatapipe_singleton_constraint_multiple_outputs(self): + r""" + Testing for the case where IterDataPipe has multiple child DataPipes as outputs. 
+ """ + # Functional Test: all previous related iterators should be invalidated when a new iterator + # is created from a ChildDataPipe + source_dp: IterDataPipe = dp.iter.IterableWrapper(range(10)) + cdp1, cdp2 = source_dp.fork(num_instances=2) + it1, it2 = iter(cdp1), iter(cdp2) + self.assertEqual(list(range(10)), list(it1)) + self.assertEqual(list(range(10)), list(it2)) + it1, it2 = iter(cdp1), iter(cdp2) + with warnings.catch_warnings(record=True) as wa: + it3 = iter(cdp1) # This should invalidate `it1` and `it2` + self.assertEqual(len(wa), 1) + self.assertRegex(str(wa[0].message), r"child DataPipes are not exhausted") + with self.assertRaisesRegex(RuntimeError, "This iterator has been invalidated"): + next(it1) + with self.assertRaisesRegex(RuntimeError, "This iterator has been invalidated"): + next(it2) + self.assertEqual(0, next(it3)) + # The next line should not invalidate anything, as there was no new iterator created + # for `cdp2` after `it2` was invalidated + it4 = iter(cdp2) + self.assertEqual(1, next(it3)) # An error shouldn't be raised here + self.assertEqual(list(range(10)), list(it4)) + + # Functional Test: invalidation when a new iterator is created from `source_dp` + source_dp = dp.iter.IterableWrapper(range(10)) + cdp1, cdp2 = source_dp.fork(num_instances=2) + it1, it2 = iter(cdp1), iter(cdp2) + self.assertEqual(list(range(10)), list(it1)) + self.assertEqual(list(range(10)), list(it2)) + it1, it2 = iter(cdp1), iter(cdp2) + self.assertEqual(0, next(it1)) + self.assertEqual(0, next(it2)) + it3 = iter(source_dp) # note that a new iterator is created from `source_dp` + self.assertEqual(0, next(it3)) # `it3` should invalidate `it1` and `it2` since they both use `source_dp` + with self.assertRaisesRegex(RuntimeError, "This iterator has been invalidated"): + next(it1) + self.assertEqual(1, next(it3)) + + # Function Test: Extending test to pipeline + source_dp = dp.iter.IterableWrapper(range(10)).map(_fake_fn).filter(_fake_filter_fn) + cdp1, cdp2 = source_dp.fork(num_instances=2) + it1, it2 = iter(cdp1), iter(cdp2) + self.assertEqual(list(range(10)), list(it1)) + self.assertEqual(list(range(10)), list(it2)) + it1, it2 = iter(cdp1), iter(cdp2) + with warnings.catch_warnings(record=True) as wa: + it3 = iter(cdp1) # This should invalidate `it1` and `it2` + self.assertEqual(len(wa), 1) + self.assertRegex(str(wa[0].message), r"child DataPipes are not exhausted") + with self.assertRaisesRegex(RuntimeError, "This iterator has been invalidated"): + next(it1) + with self.assertRaisesRegex(RuntimeError, "This iterator has been invalidated"): + next(it2) + with warnings.catch_warnings(record=True) as wa: + it1, it2 = iter(cdp1), iter(cdp2) + self.assertEqual(len(wa), 1) + self.assertRegex(str(wa[0].message), r"child DataPipes are not exhausted") + self.assertEqual(0, next(it1)) + self.assertEqual(0, next(it2)) + it3 = iter(source_dp) # note that a new iterator is created from `source_dp` + self.assertEqual(0, next(it3)) # `it3` should invalidate `it1` and `it2` since they both use `source_dp` + with self.assertRaisesRegex(RuntimeError, "This iterator has been invalidated"): + next(it1) + self.assertEqual(1, next(it3)) + if __name__ == '__main__': run_tests() diff --git a/test/test_decomp.py b/test/test_decomp.py new file mode 100644 index 000000000000..100859713a26 --- /dev/null +++ b/test/test_decomp.py @@ -0,0 +1,503 @@ +# Owner(s): ["module: primTorch"] + +from collections import defaultdict +from torch import Tensor +import torch.autograd +from torch.utils._python_dispatch import 
enable_torch_dispatch_mode +from torch._decomp import decomposition_table + +from torch.utils._pytree import tree_map, tree_flatten, tree_unflatten +from torch.testing._internal.logging_tensor import no_dispatch +from torch.testing._internal.common_utils import ( + is_iterable_of_tensors, + TestCase, + skipIfCrossRef, + suppress_warnings, + TEST_WITH_ASAN, + run_tests, +) +from torch.testing._internal.common_device_type import ( + onlyNativeDeviceTypes, + ops, + instantiate_device_type_tests, +) +from torch.testing._internal.common_methods_invocations import op_db + +import itertools +import functools +from functools import partial +import unittest + +aten = torch.ops.aten + + +# TODO: this isn't going to work with non-aten namespaces +def overload_to_aten_name(overload): + return overload._schema.name.split("::")[1] + + +# All operators that can have decomp tests +decomposition_names = {overload_to_aten_name(k) for k in decomposition_table} +_decomp_test_ops = [ + op + for op in op_db + if op.aten_name in decomposition_names + or op.aten_backward_name in decomposition_names +] + + +def diff_arg(arg, requires_grad=True): + def is_differentiable_arg(arg): + if requires_grad: + return arg.requires_grad + else: + return arg.is_floating_point() or arg.is_complex() + + if is_iterable_of_tensors(arg): + if all([is_differentiable_arg(a) for a in arg]): + return True + if all([not is_differentiable_arg(a) for a in arg]): + return False + raise RuntimeError("NYI: The test runner can't handle this") + return isinstance(arg, Tensor) and is_differentiable_arg(arg) + + +# Version of autograd.grad with some differences: +# - pytree inputs is allowed (but leaves of the pytree have to all +# be tensors) +# - if an input is not used as part of derivatives, we will return a +# zero-filled tensor for the result +def _autograd_grad( + outputs, inputs, grad_outputs=None, retain_graph=False, create_graph=True +): + inputs, inputs_spec = tree_flatten(inputs) + diff_inputs = tuple(inp for inp in inputs if inp.requires_grad) + if grad_outputs is None: + diff_outputs = tuple(out for out in outputs if out.requires_grad) + else: + diff_grad_outputs = [ + (out, go) for out, go in zip(outputs, grad_outputs) if out.requires_grad + ] + if len(diff_grad_outputs) == 0: + diff_outputs, grad_outputs = (), () + else: + diff_outputs, grad_outputs = zip(*diff_grad_outputs) + grad_inputs = torch.autograd.grad( + diff_outputs, + diff_inputs, + grad_outputs, + retain_graph=retain_graph, + create_graph=create_graph, + allow_unused=True, + ) + result = [] + grad_inputs_iter = iter(grad_inputs) + for inp in inputs: + if inp.requires_grad: + grad_input = next(grad_inputs_iter) + if grad_input is None: + result.append(torch.zeros_like(inp)) + else: + result.append(grad_input) + else: + result.append(torch.zeros_like(inp)) + return tree_unflatten(result, inputs_spec) + + +def _as_tuple(val): + if isinstance(val, tuple): + return val + return (val,) + + +def ref_vjp_no_create(f, *primals): + result = f(*primals) + + def wrapped(cotangents): + return _autograd_grad( + _as_tuple(result), primals, _as_tuple(cotangents), create_graph=False + ) + + return result, wrapped + + +dtype_precisions = { + torch.float16: (0.001, 1e-5), + torch.bfloat16: (0.016, 1e-4), + torch.float32: (1.3e-6, 1e-5), + torch.float64: (1e-7, 1e-7), + torch.complex32: (0.001, 1e-5), + torch.complex64: (1.3e-6, 1e-5), + torch.complex128: (1e-7, 1e-7), +} +# Returns the "default" rtol and atol for comparing scalars or +# tensors of the given dtypes. 
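# For instance, with the table above, comparing a float16 decomposition output
# against a float32 reference uses the looser of the two dtypes' tolerances:
#     _getDefaultRtolAndAtol(torch.float16, torch.float32)  # -> (0.001, 1e-05), i.e. rtol=1e-3, atol=1e-5
# (an illustrative reading of the helper defined below, not an extra guarantee).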
+ + +def _getDefaultRtolAndAtol(dtype0, dtype1): + rtol = max( + dtype_precisions.get(dtype0, (0, 0))[0], dtype_precisions.get(dtype1, (0, 0))[0] + ) + atol = max( + dtype_precisions.get(dtype0, (0, 0))[1], dtype_precisions.get(dtype1, (0, 0))[1] + ) + return rtol, atol + + +def op_assert_ref(test_case, op, orig, decomp, ref, args, kwargs): + assert orig.dtype == decomp.dtype, f"Operation: {op}" + if orig.numel() == 0 or decomp.numel() == 0: + assert orig.numel() == decomp.numel() + return + if ref.is_floating_point(): + orig_diff = (orig - ref).abs().max() + decomp_diff = (decomp - ref).abs().max() + atol = 1e-10 + if decomp_diff > orig_diff + atol: + raise RuntimeError( + f"Difference from float64 is larger with decomposition {op.__name__}" + f" than original. Original max diff: {orig_diff}, Decomp max diff: {decomp_diff}\n" + f"args = {args}\n" + f"kwargs = {kwargs}" + ) + else: + test_case.assertEqual( + orig, decomp, msg=f"{op.__name__}\nargs = {args}\nkwargs = {kwargs}" + ) + + +def op_assert_equal(test_case, op, orig, decomp, args, kwargs): + test_case.assertEqual( + orig.dtype, decomp.dtype, f"Operation: {op}, orig.dtype: {orig.dtype}, decomp.dtype: {decomp.dtype}, {args}, {kwargs}") + # Before adding an entry to this table, make sure your decomposition is right :) + tol_table = { + # Due to strange epsilon behaviors, see https://github.com/pytorch/pytorch/issues/73161 + (torch.float32, torch.ops.aten.native_layer_norm.default): (1e-3, 1e-3), + (torch.float32, torch.ops.aten.native_layer_norm_backward.default): ( + 1e-3, + 1e-3, + ), + } + if (decomp.dtype, op) in tol_table: + rtol, atol = tol_table[(decomp.dtype, op)] + else: + rtol, atol = _getDefaultRtolAndAtol(orig.dtype, decomp.dtype) + + test_case.assertEqual(orig, decomp, rtol=rtol, atol=atol, msg=f"{op.__name__}\nargs = {args}\nkwargs = {kwargs}") + + +# Given f, returns an f' such that: +# - f' takes only positional arguments +# - All arguments to f' are floating-point Tensors +# - All outputs of f' are floating-point Tensors +def normalize_op_input_output2( + f, args, kwargs, output_process_fn_grad=None, requires_grad=True +): + flat_args, args_spec = tree_flatten(args) + diff_argnums = tuple( + i + for i, arg in enumerate(flat_args) + if diff_arg(arg, requires_grad=requires_grad) + ) + assert len(diff_argnums) > 0 + primals = tuple(flat_args[i] for i in diff_argnums) + + @functools.wraps(f) + def wrapped(*primals): + _args = list(flat_args) + for num, arg in zip(diff_argnums, primals): + _args[num] = arg + _args = tree_unflatten(_args, args_spec) + result = f(*_args, **kwargs) + if output_process_fn_grad is not None: + result = output_process_fn_grad(result) + if isinstance(result, tuple): + # TODO: Remove the following hack for namedtuples + result = tuple(result) + result = tuple( + r + for r in result + if isinstance(r, Tensor) and (r.is_floating_point() or r.is_complex()) + ) + assert len(result) > 0 + return result + + return wrapped, primals + + +# NB: This also upcasts dtype arguments + + +def upcast_tensor(func, x, dtype=torch.float32): + # Some functions take a dtype as argument, so we need to + # manually change that dtype in order to run it with a + # higher precision + dtype_arg_table = { + torch.ops.aten._softmax_backward_data.default, + torch.ops.aten._log_softmax_backward_data.default, + } + + if isinstance(x, Tensor) and x.dtype.is_floating_point: + return x.to(dtype=dtype) + elif ( + isinstance(x, torch.dtype) + and func in dtype_arg_table + and x in [torch.float16, torch.bfloat16] + ): + return 
torch.float64 + else: + return x + + +def normalize_op_input_output(f, sample, requires_grad=True): + args = tuple([sample.input] + list(sample.args)) + return normalize_op_input_output2( + f, + args, + sample.kwargs, + sample.output_process_fn_grad, + requires_grad=requires_grad, + ) + + +CROSS_REF_EXCLUDE_SET = { + # CUBLAS_STATUS_NOT_SUPPORTED when calling + # `cublasGemmStridedBatchedExFix(handle, opa, opb, (int)m, (int)n, (int)k, + # (void*)&falpha, a, CUDA_R_16BF, (int)lda, stridea, b, CUDA_R_16BF, + # (int)ldb, strideb, (void*)&fbeta, c, CUDA_R_16BF, (int)ldc, stridec, + # (int)num_batches, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)` + ("cuda", torch.bfloat16, "nn.functional.bilinear"), + # randomness + ("cuda", torch.float16, "nn.functional.dropout"), + ("cuda", torch.bfloat16, "nn.functional.dropout"), + ("cuda", torch.float64, "nn.functional.dropout"), + ("cuda", torch.float32, "nn.functional.dropout"), + # decomp has problem even with opmath + ("cuda", torch.bfloat16, "nn.functional.layer_norm"), + ("cuda", torch.float16, "nn.functional.layer_norm"), + ("cuda", torch.bfloat16, "nn.functional.batch_norm"), + ("cuda", torch.float16, "nn.functional.batch_norm"), + ("cuda", torch.bfloat16, "nn.functional.instance_norm"), + ("cuda", torch.float16, "nn.functional.instance_norm"), + # doesn't work + ("cuda", torch.bfloat16, "nn.functional.embedding"), + +} + +all_decomposed = set() +all_called = defaultdict(int) + +# Helpful snippet for testing coverage +""" +import atexit +def check_coverage(): + print("missing coverage:") + print("\n".join(map(str, decomposition_table.keys() - all_decomposed))) +atexit.register(check_coverage) +""" + +# Helpful snippet for Horace to create his google sheet :) +""" +import atexit +def dump_ops(): + with open('run_ops.txt', 'w') as f, open('count_ops.txt', 'w') as g: + for op, count in sorted(all_called.items(), key=lambda x: x[0].__name__): + f.write(f'{op.__name__}\n') + g.write(f'{count}\n') + with open('run_decompositions.txt', 'w') as f: + for op in sorted([i.__name__ for i in all_decomposed]): + f.write(f'{op}\n') + +atexit.register(dump_ops) +""" + + +def any_unsupported(args, kwargs): + def test_unsupported(t): + if type(t) is torch.Tensor or type(t) is torch.nn.Parameter: + # These are all things that we haven't coded decompositions + # to handle correctly. Maybe they should. 
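                # For example, a tensor produced by torch.quantize_per_tensor(...)
                # would be flagged below via `t.is_quantized`, so the decomposition
                # cross-ref check is bypassed for that call (illustrative note only).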
+ return any([ + t.is_sparse_csr, t.is_sparse, t.is_mkldnn, t.is_quantized, + t.is_nested, torch._is_functional_tensor(t), + ]) + elif torch.overrides.is_tensor_like(t): + # Decompositions will generally change the behavior of Tensor-like + # subclasses, so bypass tests in this case too + return True + else: + return False + + flat_args, _ = tree_flatten(args) + flat_kwargs, _ = tree_flatten(kwargs) + return any(test_unsupported(x) for x in itertools.chain(flat_args, flat_kwargs)) + + +class TestDecomp(TestCase): + longMessage = True + + # NB: This actually overlaps with test_comprehensive, but it only + # runs on things that are definitely decomposed so it's a lot faster + # to run + @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") + @onlyNativeDeviceTypes + @skipIfCrossRef + @suppress_warnings + @ops(_decomp_test_ops) + def test_quick(self, device, dtype, op): + self.do_cross_ref(device, dtype, op, run_all=False) + + @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") + @onlyNativeDeviceTypes + @skipIfCrossRef + @suppress_warnings + @ops(op_db) + def test_comprehensive(self, device, dtype, op): + self.do_cross_ref(device, dtype, op, run_all=True) + + def do_cross_ref(self, device, dtype, op, *, run_all): + if (torch.device(device).type, dtype, op.name) in CROSS_REF_EXCLUDE_SET or ( + None, + dtype, + op.name, + ) in CROSS_REF_EXCLUDE_SET: + self.skipTest(f"{op.name} in {dtype} not supported") + + test_dtype = dtype + + # We check the correctness of each decomposition right after running it. + # So, when we encounter a decomposition, we run the function normally, and + # then run the decomposition, and ensure they're identical. + called = set() + decomposed = set() + + saved_precision = self.precision + saved_rel_tol = self.rel_tol + + class DecompCrossRefMode(torch.Tensor): + @classmethod + def __torch_dispatch__(cls, func, types, args=(), kwargs=None): + with no_dispatch(): + return cls._torch_dispatch(func, types, args, kwargs) + + @classmethod + def _torch_dispatch(cls, func, types, args=(), kwargs=None): + self.precision = saved_precision + self.rel_tol = saved_rel_tol + + called.add(func) + all_called[func] += 1 + + # Stuff we shouldn't bother testing + # (TODO: remove detach from the decomp table?) + if func not in decomposition_table or func in [ + torch.ops.aten.detach.default + ] or any_unsupported(args, kwargs): + return func(*args, **kwargs) + + decomposed.add(func) + all_decomposed.add(func) + + # We take 2 main strategies for verifying correctness/numerical stability of decompositions + # The first one is simply tolerance checking between decomp_out and pytorch_out + # However, for fp16/bf16 and reductions, this becomes very + # finicky, as there are not many guarantees we can make. + # So, for fp16/bf16, we instead compare the difference of + # {decomp_out, pytorch_out_64} and {pytorch_out, + # pytorch_out_64}. In other words, we compare how far the + # decomposition and pytorch are from the "ground truth" (i.e. + # fp64). 
If the decomposition results in more error, we error + + decomposition = decomposition_table[func] + + do_relative_check = test_dtype in [torch.float16, torch.bfloat16] + real_out_unflat = func(*args, **kwargs) + real_out, _ = tree_flatten(real_out_unflat) + decomp_out, _ = tree_flatten(decomposition(*args, **kwargs)) + assert len(real_out) == len(decomp_out) + + if do_relative_check: + upcast = partial(upcast_tensor, func, dtype=torch.float64) + real_out_double, _ = tree_flatten( + func(*tree_map(upcast, args), **tree_map(upcast, kwargs)) + ) + for orig, decomp, ref in zip(real_out, decomp_out, real_out_double): + if orig is None: + assert decomp is None + continue + op_assert_ref(self, func, orig, decomp, ref, args, kwargs) + else: + for orig, decomp in zip(real_out, decomp_out): + if orig is None: + assert decomp is None + continue + op_assert_equal(self, func, orig, decomp, args, kwargs) + + return real_out_unflat + + requires_grad = ( + op.supports_autograd + and dtype in op.supported_backward_dtypes(torch.device(device).type) + # TODO: OpInfo really ought to error out for this case, but it's + # not exercised in test_ops_gradients atm. The problem is not + # complex32 per-se (which is supported by data movement only ops) + # but that when we do backwards we expect other ops like add to work + and not dtype == torch.complex32 + ) + samples = op.sample_inputs(device, test_dtype, requires_grad=requires_grad) + + def check_decomposed(aten_name): + self.assertTrue( + any(overload_to_aten_name(c) == aten_name for c in decomposed), + msg=f"aten.{aten_name} was not decomposed, saw calls for: " + + ", ".join(map(str, list(called))), + ) + + aten_name = op.decomp_aten_name or op.aten_name + + func = op.get_op() + for sample_input in samples: + if requires_grad: + fn, primals = normalize_op_input_output(func, sample_input) + primals = tree_map( + lambda x: x if isinstance(x, torch.Tensor) else x, primals + ) + + # Once https://github.com/pytorch/pytorch/pull/75965/ I can + # store the called list on the mode object instance and no + # explicit clearing is necessary as I will create a fresh mode + # for each region + decomposed.clear() + with enable_torch_dispatch_mode(DecompCrossRefMode): + decomp_out, decomp_vjp_fn = ref_vjp_no_create(fn, *primals) + if aten_name in decomposition_names: + check_decomposed(aten_name) + + if op.aten_backward_name in decomposition_names or run_all: + cotangents = tree_map(lambda x: torch.randn_like(x), decomp_out) + + decomposed.clear() + with enable_torch_dispatch_mode(DecompCrossRefMode): + decomp_vjp_fn(cotangents) + if not run_all: + check_decomposed(op.aten_backward_name) + + elif aten_name in decomposition_names or run_all: + args = [sample_input.input] + list(sample_input.args) + kwargs = sample_input.kwargs + decomposed.clear() + with enable_torch_dispatch_mode(DecompCrossRefMode): + func(*args, **kwargs) + if not run_all: + check_decomposed(aten_name) + else: + assert op.supports_autograd + self.skipTest( + "only backwards is decomposed, but dtype doesn't support AD" + ) + + +instantiate_device_type_tests(TestDecomp, globals()) + +if __name__ == "__main__": + run_tests() diff --git a/test/test_dispatch.py b/test/test_dispatch.py index 37a6054f9151..bf609cf50b3e 100644 --- a/test/test_dispatch.py +++ b/test/test_dispatch.py @@ -532,8 +532,8 @@ def test_computed_table_with_ambiguous_autogradother(self): lambda m: m.def_("foo(Tensor x) -> Tensor"), # m.impl("foo", torch::kCompositeImplicitAutograd, [](const Tensor & x) { return x }) lambda m: 
m.impl_t_t("foo", "CompositeImplicitAutograd", debug="fn_math"), - # m.impl("foo", torch::kQuantizedCPU, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "QuantizedCPU", debug="fn_quantizedcpu"), + # m.impl("foo", torch::kFPGA, [](const Tensor & x) { return x }) + lambda m: m.impl_t_t("foo", "FPGA", debug="fn_fpga"), ]) state, table = result.state, result.table self.assertExpectedInline(state, '''\ @@ -541,12 +541,12 @@ def test_computed_table_with_ambiguous_autogradother(self): schema: test::foo(Tensor x) -> (Tensor) debug: registered at /dev/null:0 alias analysis kind: FROM_SCHEMA -QuantizedCPU: fn_quantizedcpu :: (Tensor _0) -> (Tensor _0) [ boxed unboxed ] +FPGA: fn_fpga :: (Tensor _0) -> (Tensor _0) [ boxed unboxed ] CompositeImplicitAutograd[alias]: fn_math :: (Tensor _0) -> (Tensor _0) [ boxed unboxed ] ''') # computed dispatch table is too big, so we only check on a few entries we're interested in. - extracted_table = extract_dispatch_table_with_keys(table, dispatch_keys_to_check + ('QuantizedCPU',)) + extracted_table = extract_dispatch_table_with_keys(table, dispatch_keys_to_check + ('FPGA',)) self.assertExpectedInline(extracted_table, '''\ Undefined: fn_math [math kernel] @@ -557,7 +557,7 @@ def test_computed_table_with_ambiguous_autogradother(self): AutogradCPU: fn_math [math kernel] AutogradCUDA: fn_math [math kernel] AutogradXLA: fn_math [math kernel] -QuantizedCPU: fn_quantizedcpu [kernel] +FPGA: fn_fpga [kernel] ''') def test_computed_table_with_cpu_defaultbackend(self): @@ -616,7 +616,7 @@ def test_computed_table_with_cpu_autograd_defaultbackend(self): ''') # computed dispatch table is too big, so we only check on a few entries we're interested in. - extracted_table = extract_dispatch_table_with_keys(table, dispatch_keys_to_check + ('QuantizedCPU',)) + extracted_table = extract_dispatch_table_with_keys(table, dispatch_keys_to_check + ('FPGA',)) self.assertExpectedInline(extracted_table, '''\ Undefined: fn_defaultbackend [default backend kernel] @@ -627,7 +627,7 @@ def test_computed_table_with_cpu_autograd_defaultbackend(self): AutogradCPU: fn_autograd [autograd kernel] AutogradCUDA: fn_autograd [autograd kernel] AutogradXLA: fn_autograd [autograd kernel] -QuantizedCPU: fn_defaultbackend [default backend kernel] +FPGA: fn_defaultbackend [default backend kernel] ''') def test_computed_table_with_cpu_autograd_math_defaultbackend(self): @@ -808,7 +808,7 @@ def test_basic(self): CPU fn_CPU [kernel] XLA fn_XLA [kernel] Lazy fn_Lazy [kernel] -QuantizedCPU fn_CompositeImplicitAutograd [math kernel] +FPGA fn_CompositeImplicitAutograd [math kernel] AutogradOther fn_CompositeImplicitAutograd [math kernel] AutogradCPU fallthrough [backend fallback] AutogradXLA fallthrough [backend fallback] @@ -829,7 +829,7 @@ def test_math_autogradcpu(self): CPU fn_CPU [kernel] XLA fn_XLA [kernel] Lazy fn_Lazy [kernel] -QuantizedCPU fn_CompositeImplicitAutograd [math kernel] +FPGA fn_CompositeImplicitAutograd [math kernel] AutogradOther fn_CompositeImplicitAutograd [math kernel] AutogradCPU fn_AutogradCPU [kernel] AutogradXLA fallthrough [backend fallback] @@ -864,7 +864,7 @@ def test_defaultbackend_autogradcpu(self): CPU fn_CPU [kernel] XLA fn_XLA [kernel] Lazy fn_Lazy [kernel] -QuantizedCPU fn_CompositeExplicitAutograd [default backend kernel] +FPGA fn_CompositeExplicitAutograd [default backend kernel] AutogradOther fallthrough [backend fallback] AutogradCPU fn_AutogradCPU [kernel] AutogradXLA fallthrough [backend fallback] @@ -889,7 +889,7 @@ def 
test_defaultbackend_autogradcpu(self): def test_autogradother(self): dispatcher = PythonDispatcher() - dispatcher.register(["CPU", "QuantizedCPU", "CompositeImplicitAutograd"]) + dispatcher.register(["CPU", "FPGA", "CompositeImplicitAutograd"]) self.assertExpectedInline( dispatcher.dispatchTable(), '''\ @@ -900,7 +900,7 @@ def test_autogradother(self): CPU fn_CPU [kernel] XLA fn_CompositeImplicitAutograd [math kernel] Lazy fn_CompositeImplicitAutograd [math kernel] -QuantizedCPU fn_QuantizedCPU [kernel] +FPGA fn_FPGA [kernel] AutogradOther ambiguous_autogradother [ambiguous autogradother] AutogradCPU fallthrough [backend fallback] AutogradXLA fn_CompositeImplicitAutograd [math kernel] @@ -915,8 +915,8 @@ def test_autogradother(self): Registered Kernels key kernel --------------------------- +FPGA fn_FPGA CPU fn_CPU -QuantizedCPU fn_QuantizedCPU CompositeImplicitAutograd[alias] fn_CompositeImplicitAutograd ''' ) @@ -935,5 +935,20 @@ def test_defaultbackend_math(self): r"Registration to both CompositeImplicitAutograd and CompositeExplicitAutograd is not allowed"): dispatcher.register(["CompositeExplicitAutograd", "CompositeImplicitAutograd"]) + def test_quantized_structured_not_implemented(self): + x = torch.zeros([1, 1, 1]) + y = torch.zeros([1, 1, 1]) + scale, zero_point = 1.0, 0 + dtype = torch.qint8 + qx = torch.quantize_per_tensor(x, scale, zero_point, dtype) + qy = torch.quantize_per_tensor(y, scale, zero_point, dtype) + # If bmm gets quantized support you need to update this to something + # else that is not implemented + self.assertRaisesRegex( + NotImplementedError, + "Could not run 'aten::bmm.out' with arguments from the 'QuantizedCPU' backend.", + lambda: torch.bmm(qx, qy) + ) + if __name__ == '__main__': run_tests() diff --git a/test/test_expanded_weights.py b/test/test_expanded_weights.py new file mode 100644 index 000000000000..a1eb96019cfd --- /dev/null +++ b/test/test_expanded_weights.py @@ -0,0 +1,481 @@ +# Owner(s): ["module: nn"] + +from functools import partial +from itertools import product, chain +import unittest + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import CrossEntropyLoss +from torch.nn.utils._per_sample_grad import call_for_per_sample_grads +from torch.testing._internal.common_cuda import TEST_CUDA +from torch.testing._internal.common_device_type import OpDTypes, instantiate_device_type_tests, ops +from torch.testing._internal.common_nn import TestBase, module_tests, new_module_tests +from torch.testing._internal.common_utils import TestCase, freeze_rng_state, make_tensor, run_tests +from torch.testing._internal.common_methods_invocations import SampleInput, op_db +from torch.nn.utils._expanded_weights import ExpandedWeight +from torch.nn.utils._expanded_weights.expanded_weights_utils import forward_helper, set_grad_sample_if_exists, \ + unpack_expanded_weight_or_tensor, sum_over_all_but_batch_and_last_n, standard_kwargs + +class TestContext: + pass + +class TestExpandedWeightHelperFunction(TestCase): + def test_forward_helper(self, device): + input = torch.randn(3, 4, device=device) + weight = torch.randn(5, 4, device=device) + bias = torch.randn(5, device=device) + for (weight_batched, bias_batched) in product([True, False], [True, False]): + maybe_batched_weight = ExpandedWeight(weight.clone().requires_grad_(), 3) if weight_batched else weight + maybe_batched_bias = ExpandedWeight(bias.clone().requires_grad_(), 3) if bias_batched else bias + args = (input, maybe_batched_weight, maybe_batched_bias) + expanded_args, 
expanded_kwargs = standard_kwargs(('bias',), args) + res = forward_helper(nn.functional.linear, expanded_args, expanded_kwargs) + expected = nn.functional.linear(input, weight, bias) + self.assertEqual(res, expected) + + self.assertEqual(len(expanded_args), 2) + assert expanded_args[0] is args[0] # avoids property checks in assertEquals + assert expanded_args[1] is args[1] # avoids property checks in assertEquals + self.assertEqual(len(expanded_kwargs), 1) + assert expanded_kwargs['bias'] is args[2] # avoids property checks in assertEquals + + def test_forward_helper_failure_args(self, device): + weight = torch.randn(5, 4, device=device) + bias = torch.randn(5, device=device) + with self.assertRaisesRegex(RuntimeError, r"do not support inputs that are also ExpandedWeights."): + input = ExpandedWeight(torch.randn(3, 4, requires_grad=True), 3) + expanded_args, expanded_kwargs = standard_kwargs(('bias',), (input, weight, bias)) + forward_helper(nn.functional.linear, expanded_args, expanded_kwargs) + with self.assertRaisesRegex(RuntimeError, r"requires a Tensor as the first input"): + expanded_args, expanded_kwargs = standard_kwargs(('bias',), (3, weight, bias)) + forward_helper(nn.functional.linear, expanded_args, expanded_kwargs) + with self.assertRaisesRegex(RuntimeError, r"requires a batch dimension but got an input of size 0"): + expanded_args, expanded_kwargs = standard_kwargs(('bias',), (torch.tensor(3), weight, bias)) + forward_helper(nn.functional.linear, expanded_args, expanded_kwargs) + with self.assertRaisesRegex(RuntimeError, r"0 is not a valid batch size for Expanded Weights"): + expanded_args, expanded_kwargs = standard_kwargs(('bias',), (torch.randn(0, 1, 2), weight, bias)) + forward_helper(nn.functional.linear, expanded_args, expanded_kwargs) + input = torch.randn(3, 4) + for (weight_batched, bias_batched) in product([True, False], [True, False]): + if not weight_batched and not bias_batched: + continue + maybe_batched_weight = ExpandedWeight(weight.clone().requires_grad_(), 4) if weight_batched else weight + maybe_batched_bias = ExpandedWeight(bias.clone().requires_grad_(), 4) if bias_batched else bias + with self.assertRaisesRegex(RuntimeError, r"Expected ExpandedWeights to have batch size matching input"): + expanded_args, expanded_kwargs = standard_kwargs(('bias',), (input, maybe_batched_weight, maybe_batched_bias)) + forward_helper(nn.functional.linear, expanded_args, expanded_kwargs) + + def test_set_grad_sample_if_exists(self, device): + def test_fn(_): + return True + + orig_weight = torch.randn(4, device=device, requires_grad=True) + expanded_weight = ExpandedWeight(orig_weight, 3) + set_grad_sample_if_exists(expanded_weight, test_fn) + self.assertTrue(hasattr(orig_weight, 'grad_sample')) + self.assertTrue(orig_weight.grad_sample) + + basic_tensor = torch.randn(4, device=device) + set_grad_sample_if_exists(basic_tensor, test_fn) + self.assertFalse(hasattr(basic_tensor, 'grad_sample')) + + non_tensor = 3 + set_grad_sample_if_exists(non_tensor, test_fn) + self.assertFalse(hasattr(non_tensor, 'grad_sample')) + + def test_set_grad_sample_if_exists_failure(self, device): + def test_fn(_): + return True + + grad_tensor = torch.randn(4, requires_grad=True, device=device) + with self.assertRaisesRegex(RuntimeError, r"does not support a mixture of ExpandedWeight parameters and normal Parameters"): + set_grad_sample_if_exists(grad_tensor, test_fn) + + def test_unpack_expanded_weight_or_tensor(self, device): + input = torch.randn(3, requires_grad=True, device=device) + 
self.assertEqual(input, unpack_expanded_weight_or_tensor(ExpandedWeight(input, 3))) + + input.requires_grad_(False) + self.assertEqual(input, unpack_expanded_weight_or_tensor(input)) + self.assertTrue(unpack_expanded_weight_or_tensor(4) is None) + + def test_unpack_expanded_weight_or_tensor_with_custom_function(self, device): + input = torch.randn(3, requires_grad=True, device=device) + self.assertTrue(unpack_expanded_weight_or_tensor(ExpandedWeight(input, 3), lambda x: x is input)) + + input.requires_grad_(False) + self.assertTrue(unpack_expanded_weight_or_tensor(input, lambda x: x is input)) + self.assertTrue(unpack_expanded_weight_or_tensor(4, lambda x: x is input) is None) + + def test_unpack_expanded_weight_or_tensor_failure(self, device): + input = torch.randn(3, requires_grad=True, device=device) + with self.assertRaisesRegex(RuntimeError, r"does not support a mixture of ExpandedWeight parameters and normal Parameters"): + unpack_expanded_weight_or_tensor(input) + + with self.assertRaisesRegex(RuntimeError, r"does not support a mixture of ExpandedWeight parameters and normal Parameters"): + unpack_expanded_weight_or_tensor(input, lambda x: x is input) + + def test_sum_over_all_but_batch_and_last_n(self, device): + input = torch.randn(1, 2, 3, 4, 5, device=device) + res = sum_over_all_but_batch_and_last_n(input, 2) + expected = input.sum((1, 2)) + self.assertEqual(res, expected) + + res = sum_over_all_but_batch_and_last_n(input, 0) + expected = input.sum((1, 2, 3, 4)) + self.assertEqual(res, expected) + + res = sum_over_all_but_batch_and_last_n(input, 4) + self.assertEqual(res, input) + +class TestExpandedWeightFunctional(TestCase): + @ops(filter(lambda op: op.supports_expanded_weight, op_db), dtypes=OpDTypes.supported, allowed_dtypes=(torch.double,)) + def test_expanded_weight_per_sample_grad(self, device, dtype, op): + sample_inputs = op.sample_inputs(device, dtype, requires_grad=True) + for sample_input in supported_inputs(op, sample_inputs): + if op.name == "nn.functional.embedding": # embedding flips its argument order for autograd tests + sample_input = SampleInput(sample_input.args[0], args=(sample_input.input,), kwargs=sample_input.kwargs) + input = sample_input.input + args = sample_input.args + kwargs = sample_input.kwargs + batch_size = input.shape[0] if len(input.shape) > 1 else 1 + + # get per sample grads with ExpandedWeights objects + (ew_input, ew_args, ew_kwargs) = make_expanded_weight(sample_input, batch_size) + diff_input_list = (ew_input,) + tuple(ew_args) + tuple(ew_kwargs.values()) + diff_input_list = [i for i in diff_input_list if is_diff_tensor(i)] + diff_input_list = [i.orig_weight if isinstance(i, ExpandedWeight) else i for i in diff_input_list] + if not diff_input_list: + continue + result = run_op(op, ew_input, *ew_args, **ew_kwargs) + result.sum().backward() # grad doesn't work with ExpandedWeight because it calls __torch_function__ + expanded_weight_grad = tuple(i.grad_sample if hasattr(i, "grad_sample") else i.grad for i in diff_input_list) + + # get per sample grads with for loop + func = partial(run_op, op) + per_sample_grad = for_loop_per_sample_grad(batch_size, input, func, *args, **kwargs) + + # check equality + self.assertEqual(len(per_sample_grad), len(expanded_weight_grad)) + for (result_grad, expected_grad) in zip(expanded_weight_grad, per_sample_grad): + if result_grad is None: + result_grad = torch.zeros_like(expected_grad) + self.assertEqual(result_grad, expected_grad) + + @ops(filter(lambda op: op.supports_expanded_weight, op_db), 
dtypes=OpDTypes.supported, allowed_dtypes=(torch.double,)) + def test_unsupported_expand_weights(self, device, dtype, op): + sample_inputs = op.sample_inputs(device, dtype, requires_grad=True) + unsupported_inputs = supported_inputs(op, sample_inputs, supported_inputs=False) + for sample_input in unsupported_inputs: + with self.assertRaisesRegex(RuntimeError, r"Expanded Weights"): + if op.name == "nn.functional.embedding": # embedding flips its argument order for autograd tests + sample_input = SampleInput(sample_input.args[0], args=(sample_input.input,), kwargs=sample_input.kwargs) + input = sample_input.input + + batch_size = input.shape[0] if len(input.shape) > 1 else 1 + + # get per sample grads with ExpandedWeights objects + (ew_input, ew_args, ew_kwargs) = make_expanded_weight(sample_input, batch_size) + result = run_op(op, ew_input, *ew_args, **ew_kwargs) + diff_input_list = (ew_input,) + tuple(ew_args) + tuple(ew_kwargs.values()) + diff_input_list = [i for i in diff_input_list if is_diff_tensor(i)] + diff_input_list = [i.orig_weight if isinstance(i, ExpandedWeight) else i for i in diff_input_list] + result.sum().backward() # grad doesn't work with ExpandedWeight because it calls __torch_function__ + + @ops(filter(lambda op: op.supports_expanded_weight, op_db), dtypes=OpDTypes.supported) + def test_expanded_weight_forward(self, device, dtype, op): + sample_inputs = op.sample_inputs(device, dtype) + for sample_input in supported_inputs(op, sample_inputs): + if op.name == "nn.functional.embedding": # embedding flips its argument order for autograd tests + sample_input = SampleInput(sample_input.args[0].clone(), + args=(sample_input.input.clone(),), + kwargs=sample_input.kwargs) + if "cuda" in device and "max_norm" in sample_input.kwargs and "padding_idx" in sample_input.kwargs: + self.skipTest("embedding is non-determinstic in this case, see issue #74679") + batch_size = sample_input.input.shape[0] if len(sample_input.input.shape) > 1 else 1 + (ew_input, ew_args, ew_kwargs) = make_expanded_weight(sample_input, batch_size) + expanded_weight_result = run_op(op, ew_input, *ew_args, **ew_kwargs) + normal_result = run_op(op, sample_input.input, *sample_input.args, **sample_input.kwargs) + self.assertEqual(expanded_weight_result, normal_result) + + def test_expanded_weight_error(self, device): + batch_size = 3 + sample_input = make_tensor((batch_size, 4), dtype=torch.float32, device=device, requires_grad=True) + sample_weight = make_tensor((4), dtype=torch.float32, device=device, requires_grad=True) + with self.assertRaisesRegex(RuntimeError, r"Expanded Weights encountered but cannot handle function"): + torch.add(sample_input, ExpandedWeight(sample_weight, batch_size)) + + def test_small_model(self, device): + def convnet(num_classes): + return nn.Sequential( + nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1), + nn.ReLU(), + nn.AvgPool2d(kernel_size=2, stride=2), + nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1), + nn.ReLU(), + nn.AvgPool2d(kernel_size=2, stride=2), + nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1), + nn.ReLU(), + nn.AvgPool2d(kernel_size=2, stride=2), + nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1), + nn.ReLU(), + nn.AdaptiveAvgPool2d((1, 1)), + nn.Flatten(start_dim=1, end_dim=-1), + nn.Linear(128, num_classes, bias=True), + ) + + batch_size = 32 + model = convnet(10).to(device) + input = torch.randn([batch_size, 3, 28, 28], device=device) + targets = torch.randint(0, 10, (batch_size,), device=device) + criterion = 
CrossEntropyLoss(reduction='sum') # use a loss that doesn't average across the batch to test in a for loop + result = call_for_per_sample_grads(model, batch_size, input) + loss = criterion(result, targets) + loss.backward() + result = [] + for weight in model.parameters(): + result.append(weight.grad_sample) + del weight.grad_sample + + expected = [] + for i in range(batch_size): + loss = criterion(model(input[i].unsqueeze(0)), targets[i].unsqueeze(0)) + expected.append(torch.autograd.grad(loss, model.parameters(), torch.ones_like(loss))) + + expected = [torch.stack(grad) for grad in zip(*expected)] + for (res, exp) in zip(result, expected): + self.assertEqual(res, exp, atol=1e-4, rtol=5e-5) + + def test_group_norm_error(self, device): + # group norm has to call native_group_norm. This checks that it hits the same errors + # that normal group norm would + + N = 3 + C = 5 + inp = torch.randn(N, C) + with self.assertRaisesRegex(RuntimeError, r"Expected number of channels in input to be divisible"): + F.group_norm(inp, 2) # 5 is not divisible by 2 + +class TestExpandedWeightModule(TestCase): + def _do_test(self, module, input): + batch_size = input.shape[0] + diff_input = input.dtype == torch.float or input.dtype == torch.double + if diff_input: + input.requires_grad_() + with freeze_rng_state(): + # get per sample grads with ExpandedWeights context manager + actual_res = call_for_per_sample_grads(module, batch_size, input).sum() + actual_res.backward() + actual_grads = [] + for param in module.parameters(): + actual_grads.append(param.grad_sample) + del param.grad_sample + if diff_input: + actual_grads.append(input.grad.clone()) + input.grad = torch.zeros_like(input.grad) + + # get per sample grads with a for loop + expected_res = torch.tensor(0., device=input.device, dtype=torch.double) + expected_grads = [] + for i in range(batch_size): + input_slice = input[i] + diff_params = module.parameters() + if diff_input: + diff_params = chain(diff_params, (input_slice,)) + res = module(input_slice.unsqueeze(0)).sum() + out_grads = torch.autograd.grad(res, diff_params, torch.ones_like(res), allow_unused=True) + expected_grads.append(out_grads) + expected_res += res + expected_grads = tuple(torch.stack(grad) for grad in zip(*expected_grads)) + self.assertEqual(actual_res, expected_res) + [self.assertEqual(actual, expected) for (actual, expected) in zip(actual_grads, expected_grads)] + + def _do_test_multi_input(self, module, input): + class TestModule(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + + def forward(self, input): + return self.module(input) + self.module(input) + + batch_size = input.shape[0] + diff_input = input.dtype == torch.float or input.dtype == torch.double + if diff_input: + input.requires_grad_() + with freeze_rng_state(): + # get per sample grads with ExpandedWeights context manager, calling .backward() twice + test_module = TestModule(module) + actual_res = call_for_per_sample_grads(test_module, batch_size, input).sum() + actual_res.backward() + actual_grads = [] + for param in module.parameters(): + actual_grads.append(param.grad_sample) + del param.grad_sample + if diff_input: + actual_grads.append(input.grad.clone()) + input.grad = torch.zeros_like(input.grad) + + + # get per sample grads with a for loop, running over the input twice + expected_grads = [] + for i in range(batch_size): + input_slice = input[i] + diff_params = module.parameters() + if diff_input: + diff_params = chain(diff_params, (input_slice,)) + res = 
module(input_slice.unsqueeze(0)).sum() + out_grads = torch.autograd.grad(res, diff_params, torch.ones_like(res), allow_unused=True) + expected_grads.append(out_grads) + expected_grads = tuple(torch.stack(grad) for grad in zip(*expected_grads)) + expected_grads = tuple(expected_grad for expected_grad in expected_grads if expected_grad is not None) + assert [self.assertEqual(actual, 2 * expected) for (actual, expected) in zip(actual_grads, expected_grads)] + + def test_per_sample_api_failing(self): + module = nn.Linear(10, 10) + input = torch.randn(64, 10) + with self.assertRaisesRegex(RuntimeError, r"Module passed must be nn.Module"): + call_for_per_sample_grads("fail", 64, input) + with self.assertRaisesRegex(RuntimeError, r"Batch size passed must be an integer"): + call_for_per_sample_grads(module, 6.4, input) + with self.assertRaisesRegex(RuntimeError, r"Batch size must be positive"): + call_for_per_sample_grads(module, -64, input) + with self.assertRaisesRegex(RuntimeError, r"incorrect for multiple calls"): + loss = call_for_per_sample_grads(module, 64, input).sum() + loss.backward() # populate grad_sample fields + call_for_per_sample_grads(module, 64, input) + +class ContextManagerTests(TestBase): + def __init__(self, *args, **kwargs): + self.test_cpu = kwargs.get('test_cpu', True) + self.test_cuda = kwargs.get('test_cuda', True) + super().__init__(*args, **kwargs) + + @property + def constructor_args(self): + return self._get_arg('constructor_args', False) + + def test_context_manager(self, test_case, device): + kwargs = {'device': device, 'dtype': torch.double} + module = self.constructor(*self.constructor_args).to(**kwargs) + if 'Embedding' in self.get_name(): + kwargs['dtype'] = torch.long + input = self._get_input().to(**kwargs) + if len(input.shape) == 0 or input.shape[0] == 0: + raise unittest.SkipTest("Can't get per sample gradients when no batch dim or batch dim is 0") + if self.constructor == torch.nn.Linear and len(input.shape) == 1: + raise unittest.SkipTest("Can't get per sample gradients for input of rank 1") + test_case._do_test(module, input) + + def test_context_manager_multiple_inputs(self, test_case, device): + module = self.constructor(*self.constructor_args).to(device) + input = self._get_input() + if len(input.shape) == 0 or input.shape[0] == 0: + raise unittest.SkipTest("Can't get per sample gradients when no batch dim or batch dim is 0") + if self.constructor == torch.nn.Linear and len(input.shape) == 1: + raise unittest.SkipTest("Can't get per sample gradients for input of rank 1") + test_case._do_test_multi_input(module, input) + +# TODO: Once all of these use ModuleInfo, replace with ModuleInfo tests +# These currently use the legacy nn tests +supported_modules = ['Linear', 'Conv1d', 'Conv2d', 'Conv3d', 'Embedding', 'LayerNorm', 'GroupNorm', 'InstanceNorm'] +supported_tests = [t for t in module_tests + new_module_tests if 'module_name' in t and t['module_name'] in supported_modules] +for test_param in supported_tests: + if 'constructor' not in test_param: + name = test_param.pop('module_name') + test_param['constructor'] = getattr(nn, name) + decorator = test_param.pop('decorator', None) + test = ContextManagerTests(**test_param) + test_name = test.get_name() + if hasattr(TestExpandedWeightModule, test_name): + raise RuntimeError('Found two tests with the same name: ' + test_name) + test_name_multi_input = test.get_name() + "_multiple_inputs" + if hasattr(TestExpandedWeightModule, test_name_multi_input): + raise RuntimeError('Found two tests with the same 
name: ' + test_name) + if decorator is not None: + fn = decorator(fn) + if test.test_cpu: + setattr(TestExpandedWeightModule, test_name, lambda self, test=test: test.test_context_manager(self, 'cpu')) + setattr(TestExpandedWeightModule, test_name_multi_input, + lambda self, test=test: test.test_context_manager_multiple_inputs(self, 'cpu')) + if TEST_CUDA and test.test_cuda: + # since this checks derivatives, only use double for precision + setattr(TestExpandedWeightModule, test_name + '_cuda_double', + lambda self, test=test: test.test_context_manager(self, 'cuda')) + +# ------------- HELPER FUNCTIONS ----------------- + +def run_op(op, input, *args, **kwargs): + r""" + OpInfo for Embedding switches the input and weight so autograd tests will only check the derivative + of the weight, not the input, which can't be differentiable since its dtype is int. Calls op, + using the special ordering that Embedding's OpInfo expects for that case. + """ + if op.name == "nn.functional.embedding": + return op(args[0], input, **kwargs) + else: + return op(input, *args, **kwargs) + +def make_expanded_weight(sample_input, batch_size): + def expanded_weight_or_clone(arg): + return ExpandedWeight(torch.clone(arg), batch_size) if is_diff_tensor(arg) else clone_if_tensor(arg) + + ew_input = clone_if_tensor(sample_input.input) + ew_args = tuple(expanded_weight_or_clone(arg) for arg in sample_input.args) + ew_kwargs = {name: expanded_weight_or_clone(arg) for (name, arg) in sample_input.kwargs.items()} + return ew_input, ew_args, ew_kwargs + +def supported_inputs(op, sample_inputs, supported_inputs=True): + r""" + ExpandedWeights currently does not support some use cases when there's no batch dimension or + operations that would cause inter-batch operations. Removes all of the cases it cannot deal with + """ + def filter_fn(input): + convolutions = ["nn.functional.conv1d", "nn.functional.conv2d", "nn.functional.conv3d"] + if op.name == "nn.functional.linear": + is_supported_input = len(input.input.shape) > 1 # input of rank 1 means no batch dim + elif op.name == "nn.functional.layer_norm": + normalized_shape = input.args[0] + is_supported_input = input.input.shape != normalized_shape # would cause inter-batch operations + elif op.name in convolutions: + # currently can't deal with padding computation on Python level + is_supported_input = 'padding' not in input.kwargs or not isinstance(input.kwargs['padding'], str) + elif op.name == "nn.functional.embedding": + idx = input.args[0] + is_supported_input = len(idx.shape) > 1 # there's no batch size + else: + is_supported_input = True + is_supported_input = is_supported_input and input.input.shape[0] > 0 # 0 is not a valid batch size + return is_supported_input if supported_inputs else not is_supported_input + return [input for input in sample_inputs if filter_fn(input)] + +def for_loop_per_sample_grad(batch_size, input, func, *args, **kwargs): + # get per sample grads by getting derivative for each input in a for loop + per_sample_grad = [] + for i in range(batch_size): + per_sample_input = input[i] + result = func(per_sample_input.unsqueeze(0), *args, **kwargs) + diff_input_list = (per_sample_input,) + tuple(args) + tuple(kwargs.values()) + diff_input_list = [i for i in diff_input_list if isinstance(i, torch.Tensor) and i.requires_grad] + per_sample_grad.append(torch.autograd.grad(result, diff_input_list, torch.ones_like(result), allow_unused=True)) + if len(per_sample_grad) == batch_size: + per_sample_grad = tuple(torch.stack(grad) for grad in 
zip(*per_sample_grad)) + return per_sample_grad + +def is_diff_tensor(t): + return isinstance(t, ExpandedWeight) or (isinstance(t, torch.Tensor) and t.requires_grad) + +def clone_if_tensor(t): + if isinstance(t, torch.Tensor): + res = torch.clone(t).detach() + res.requires_grad_(t.requires_grad) + return res + else: + return t + +instantiate_device_type_tests(TestExpandedWeightHelperFunction, globals()) +instantiate_device_type_tests(TestExpandedWeightFunctional, globals()) +if __name__ == '__main__': + run_tests() diff --git a/test/test_foreach.py b/test/test_foreach.py index a04ddcebbaae..4da23dc66fc3 100644 --- a/test/test_foreach.py +++ b/test/test_foreach.py @@ -11,12 +11,13 @@ from torch.testing._comparison import default_tolerances from torch.testing._internal.common_utils import TestCase, run_tests, TEST_WITH_ROCM, TEST_WITH_SLOW from torch.testing._internal.common_device_type import \ - (instantiate_device_type_tests, dtypes, onlyCUDA, skipCUDAIfRocm, skipMeta, ops) + (instantiate_device_type_tests, dtypes, onlyCUDA, skipMeta, ops) from torch.testing._internal.common_methods_invocations import ( foreach_unary_op_db, foreach_binary_op_db, foreach_pointwise_op_db, foreach_minmax_op_db, foreach_reduce_op_db) from torch.testing._internal.common_dtype import ( - get_all_dtypes, get_all_int_dtypes, get_all_complex_dtypes, get_all_fp_dtypes, + all_types_and_complex_and, all_types_and, integral_types, complex_types, + floating_types_and, floating_types, integral_types_and, ) # Includes some values such that N * N won't be a multiple of 4, @@ -140,7 +141,7 @@ def _test_binary_op_tensorlists(self, device, dtype, opinfo, N, is_fastpath, dis self._binary_test(dtype, inplace_op, inplace_ref, inputs, is_fastpath, is_inplace=True) if opinfo.supports_alpha_param: alpha = None - if dtype in get_all_int_dtypes(): + if dtype in integral_types(): alpha = 3 elif dtype.is_complex: alpha = complex(3, 3) @@ -165,19 +166,11 @@ def _test_binary_op_tensorlists(self, device, dtype, opinfo, N, is_fastpath, dis self._binary_test( dtype, inplace_op, inplace_ref, inputs, is_fastpath and disable_fastpath, is_inplace=True) - # note(mkozuki): Why ROCm? - # ROCm is supposed to compile slow path as in - # https://github.com/pytorch/pytorch/blob/7e032f18cf1405804c4f787b05ea2de5e08a091e/aten/src/ATen/native/ForeachUtils.h#L148-L164, # noqa: E501 - # Therefore `[torch.add(*args, alpha=alpha) for args in zip(tensors1, tensors2)]` and - # `torch._foreach_add(tensors1, tensors2, alpha=alpha)` - # are expected to return the same outputs, however, the outputs look unstable for torch.bfloat16 and torch.half. 
- # log: https://ci.pytorch.org/jenkins/job/pytorch-builds/job/pytorch-linux-bionic-rocm4.2-py3.6-test1/2741/console - @skipCUDAIfRocm @skipMeta @ops(foreach_binary_op_db) def test_binary_op_tensorlists_fastpath(self, device, dtype, op): for N in N_values: - disable_fastpath = op.ref == torch.div and dtype in get_all_int_dtypes() + [torch.bool] + disable_fastpath = op.ref == torch.div and dtype in integral_types_and(torch.bool) if op.ref == torch.add and dtype == torch.bool: disable_fastpath = True self._test_binary_op_tensorlists(device, dtype, op, N, True, disable_fastpath) @@ -194,22 +187,21 @@ def _test_binary_op_scalar(self, device, dtype, opinfo, N, scalar, is_fastpath, self._binary_test(dtype, op, ref, inputs, is_fastpath, is_inplace=False) self._binary_test(dtype, inplace_op, inplace_ref, inputs, is_fastpath, is_inplace=True) - @skipCUDAIfRocm @skipMeta @ops(foreach_binary_op_db) def test_binary_op_scalar_fastpath(self, device, dtype, op): for N, scalar in itertools.product(N_values, Scalars): - disable_fastpath = op.ref == torch.div and dtype in get_all_int_dtypes() + [torch.bool] + disable_fastpath = op.ref == torch.div and dtype in integral_types_and(torch.bool) if isinstance(scalar, int): disable_fastpath |= dtype == torch.bool if isinstance(scalar, float): - disable_fastpath |= dtype in get_all_int_dtypes() + [torch.bool] + disable_fastpath |= dtype in integral_types_and(torch.bool) if isinstance(scalar, bool): disable_fastpath |= dtype == torch.bool if op.ref in (torch.add, torch.mul): disable_fastpath = False if isinstance(scalar, complex): - disable_fastpath |= dtype not in get_all_complex_dtypes() + disable_fastpath |= dtype not in complex_types() self._test_binary_op_scalar(device, dtype, op, N, scalar, True, disable_fastpath) @ops(foreach_binary_op_db) @@ -233,22 +225,21 @@ def _test_binary_op_scalarlist(self, device, dtype, opinfo, N, scalarlist, is_fa # errors depending on the order of scalarlist. To keep actual unit test impl simple, # separating mixed scalarlist tests. By setting the first element of scalarlist to bool, # they are expected to throw bool sub error even in inplace test. 
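# Illustrative sketch (assumed eager behavior, not asserted by this patch):
# the mixed-scalarlist note above leans on eager PyTorch rejecting
# subtraction with a bool operand, so placing the bool first in the
# scalarlist makes the expected error fire the same way for both the
# out-of-place and inplace variants. All names below are illustrative only.
import torch

t = torch.ones(2)
try:
    torch.sub(t, True)  # bool scalar operand to subtraction
except RuntimeError as e:
    print("sub with a bool operand raised:", e)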
- @skipCUDAIfRocm @skipMeta @ops(foreach_binary_op_db) def test_binary_op_scalarlist_fastpath(self, device, dtype, op): for N in N_values: for type_str, scalarlist in getScalarLists(N): - bool_int_div = op.ref == torch.div and dtype in get_all_int_dtypes() + [torch.bool] + bool_int_div = op.ref == torch.div and dtype in integral_types_and(torch.bool) disable_fastpath = bool_int_div if type_str == "int": disable_fastpath |= dtype == torch.bool if type_str == "float": - disable_fastpath |= dtype in get_all_int_dtypes() + [torch.bool] + disable_fastpath |= dtype in integral_types_and(torch.bool) if type_str == "complex": - disable_fastpath |= dtype not in get_all_complex_dtypes() + disable_fastpath |= dtype not in complex_types() if type_str == "mixed": - disable_fastpath |= True and dtype not in get_all_complex_dtypes() + disable_fastpath |= True and dtype not in complex_types() self._test_binary_op_scalarlist(device, dtype, op, N, scalarlist, True, disable_fastpath) @ops(foreach_binary_op_db) @@ -305,7 +296,7 @@ def _test_pointwise_op(self, device, dtype, opinfo, N, is_fastpath, disable_fast @skipMeta @ops(foreach_pointwise_op_db) def test_pointwise_op_fastpath(self, device, dtype, op): - disable_fastpath = dtype in get_all_int_dtypes() + [torch.bool] + disable_fastpath = dtype in integral_types_and(torch.bool) # for N, scalar in itertools.product(N_values, Scalars): for N in N_values: self._test_pointwise_op(device, dtype, op, N, True, disable_fastpath) @@ -363,7 +354,7 @@ def _test_unary(self, device, dtype, opinfo, N, is_fastpath): op, ref, inplace_op, inplace_ref = self._get_funcs(opinfo, 1) inputs = opinfo.sample_inputs(device, dtype, N, noncontiguous=not is_fastpath), # note(mkozuki): Complex inputs for `_foreach_abs` go through slowpath. - if opinfo.name == "_foreach_abs" and dtype in get_all_complex_dtypes(): + if opinfo.name == "_foreach_abs" and dtype in complex_types(): is_fastpath = False self._regular_unary_test(dtype, op, ref, inputs, is_fastpath) self._inplace_unary_test(dtype, inplace_op, inplace_ref, inputs, is_fastpath) @@ -374,7 +365,7 @@ def test_unary_fastpath(self, device, dtype, op): for N in N_values: self._test_unary(device, dtype, op, N, is_fastpath=True) - @ops(foreach_unary_op_db, dtypes=get_all_dtypes()) + @ops(foreach_unary_op_db, dtypes=all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) def test_unary_slowpath(self, device, dtype, op): for N in N_values: self._test_unary(device, dtype, op, N, is_fastpath=False) @@ -391,7 +382,7 @@ def test_minmax_fastpath(self, device, dtype, op): self._minmax_test(op, inputs, True, N if dtype == torch.bool else 1) @ops(foreach_minmax_op_db, - dtypes=get_all_dtypes(include_half=True, include_bfloat16=True, include_complex=False)) + dtypes=all_types_and(torch.half, torch.bfloat16, torch.bool)) def test_minmax_slowpath(self, device, dtype, op): for N in N_values: inputs = tuple(op.sample_inputs(device, dtype, N, noncontiguous=True) for _ in range(2)) @@ -399,7 +390,7 @@ def test_minmax_slowpath(self, device, dtype, op): # note(mkozuki): ForeachFuncInfo's of both `_foreach_maximum` and `_foreach_minimum` include integer types. # so, manually limit dtypes to fp types for inf&nan tests. 
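# Illustrative sketch of the dtype-helper swap applied throughout this file:
# the composable helpers from torch.testing._internal.common_dtype build a
# fixed base set plus explicit extras, replacing the old get_all_* flag
# arguments. The commented contents reflect the helpers' base sets at the
# time of writing and are listed only for orientation.
import torch
from torch.testing._internal.common_dtype import (
    floating_types_and, integral_types_and, complex_types,
)

fp = floating_types_and(torch.half, torch.bfloat16)   # float32, float64 + half, bfloat16
ints = integral_types_and(torch.bool)                 # uint8, int8, int16, int32, int64 + bool
print(torch.float16 in fp, torch.bool in ints, torch.complex64 in complex_types())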
- @ops(foreach_minmax_op_db, dtypes=get_all_fp_dtypes(include_bfloat16=True, include_half=True)) + @ops(foreach_minmax_op_db, dtypes=floating_types_and(torch.half, torch.bfloat16)) def test_minmax_float_inf_nan(self, device, dtype, op): inputs = ( [ @@ -424,7 +415,7 @@ def _reduce_test(self, opinfo, inputs, ord, is_fastpath, n_expected_cudaLaunchKe @ops(foreach_reduce_op_db) def test_reduce_fastpath(self, device, dtype, op): for N, ord in itertools.product(N_values, (0, 1, 2, -1, -2)): - if ord in (1, 2) and dtype in torch.testing.get_all_fp_dtypes(): + if ord in (1, 2) and dtype in floating_types_and(torch.half, torch.bfloat16): n_expected_cudaLaunchKernels = 3 else: n_expected_cudaLaunchKernels = N @@ -437,7 +428,7 @@ def test_reduce_slowpath(self, device, dtype, op): inputs = op.sample_inputs(device, dtype, N, noncontiguous=True), self._reduce_test(op, inputs, ord, False, 1) - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) def test_add_scalar_with_empty_list_and_empty_tensor(self, device, dtype): # TODO: enable empty list case for tensors in [[torch.randn([0])]]: @@ -447,7 +438,7 @@ def test_add_scalar_with_empty_list_and_empty_tensor(self, device, dtype): torch._foreach_add_(tensors, 1) self.assertEqual(res, tensors) - @ops(foreach_binary_op_db, dtypes=get_all_dtypes()) + @ops(foreach_binary_op_db, dtypes=all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) def test_binary_op_scalar_with_overlapping_tensors(self, device, dtype, op): foreach_op, ref = op.method_variant, op.ref tensors = [torch.ones(1, 1, device=device, dtype=dtype).expand(2, 1, 3)] @@ -479,7 +470,7 @@ def test_binary_op_scalar_with_different_tensor_dtypes(self, device, dtype, op): runtime_error = e self.assertIsNone(runtime_error) - @ops(foreach_binary_op_db, dtypes=get_all_dtypes()) + @ops(foreach_binary_op_db, dtypes=all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) def test_binary_op_list_error_cases(self, device, dtype, op): foreach_op, foreach_op_, ref, ref_ = op.method_variant, op.inplace_variant, op.ref, op.ref_inplace tensors1 = [] @@ -534,7 +525,7 @@ def test_binary_op_list_error_cases(self, device, dtype, op): return with self.assertRaisesRegex(RuntimeError, "Expected all tensors to be on the same device"): foreach_op([tensor1], [tensor2]) - if dtype in get_all_int_dtypes() + [torch.bool] and foreach_op == torch._foreach_div: + if dtype in integral_types_and(torch.bool) and foreach_op == torch._foreach_div: with self.assertRaisesRegex(RuntimeError, "result type"): foreach_op_([tensor1], [tensor2]) else: @@ -543,7 +534,7 @@ def test_binary_op_list_error_cases(self, device, dtype, op): @skipMeta @unittest.skipIf(not torch.cuda.is_available(), "CUDA not found") - @ops(foreach_binary_op_db, dtypes=get_all_dtypes()) + @ops(foreach_binary_op_db, dtypes=all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) def test_binary_op_list_slow_path(self, device, dtype, op): # note(mkozuki): why `n_expected_cudaLaunchKernels=0`? 
# In this test, foreach functions don't go through fast path, @@ -635,7 +626,7 @@ def test_binary_op_tensors_on_different_devices(self, device, dtype, op): self.assertEqual(actual, tensors1) @onlyCUDA - @ops(foreach_pointwise_op_db, allowed_dtypes=get_all_fp_dtypes(include_half=False, include_bfloat16=False)) + @ops(foreach_pointwise_op_db, allowed_dtypes=floating_types()) def test_pointwise_op_tensors_on_different_devices(self, device, dtype, op): # tensors1: ['cuda', 'cpu] # tensors2: ['cuda', 'cpu] @@ -653,6 +644,27 @@ def test_pointwise_op_tensors_on_different_devices(self, device, dtype, op): foreach_op_(tensors1, tensors2, tensors3) self.assertEqual(expected, tensors1) + # note: BFloat16 has the same number of exponent bits as FP32 + # so if squared L2 norm overflows in BF16, then it also overflows in FP32. + @onlyCUDA + @ops(foreach_reduce_op_db, allowed_dtypes=(torch.half, torch.bfloat16)) + def test_foreach_l2_large_value_input(self, device, dtype, op): + ord, N = 2, 10 + max_value = torch.finfo(dtype).max + scaler = torch.tensor([max_value]).sqrt().to(device=device, dtype=dtype) + inputs = [t * scaler for t in op.sample_inputs(device, dtype, N, noncontiguous=False, low=1)], + # make sure that the min. of squared L2 norm value per tensor is greater than the max value of `dtype`. + self.assertTrue(scaler * scaler * N > max_value) + fn, ref_fn, *_ = self._get_funcs(op, 3) + actual = fn(inputs, is_cuda=True, is_fastpath=True, ord=ord) + expect = ref_fn(inputs, ord=ord) + if dtype == torch.float16: + # making sure the reference L2 norm values are in the range of FP16. + self.assertFalse(any(torch.isinf(e) for e in expect)) + else: + self.assertTrue(all(torch.isinf(e) for e in expect)) + self.assertEqual(expect, actual, equal_nan=False) + instantiate_device_type_tests(TestForeach, globals()) diff --git a/test/test_functionalization.py b/test/test_functionalization.py index 28476ff25957..31220b9f2d5a 100644 --- a/test/test_functionalization.py +++ b/test/test_functionalization.py @@ -2,7 +2,10 @@ import torch from torch.testing._internal.common_utils import TestCase, run_tests -from torch.testing._internal.logging_tensor import LoggingTensor, capture_logs, log_input +from torch.testing._internal.logging_tensor import LoggingTensor, LoggingTensorReentrant, capture_logs, log_input +from torch.utils._pytree import tree_map + +import logging def are_aliased(x, y): if x._base is None and y._base is None: @@ -13,23 +16,63 @@ def are_aliased(x, y): return y._base is x return x._base is y._base +# Just for testing: a logging tensor that also transforms out-of-place ops into inplace ops. +# That way even if the outer wrapper is functionalized, the inner wrapper will also need functionalization. 
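# Illustrative sketch, using only hooks these tests already exercise: the
# expected-log updates in this file reflect that functionalization records
# the *_copy view overloads (e.g. aten.view_copy.default instead of
# aten.view) unless the pass is entered with reapply_views=True, as
# test_reapply_views_simple below demonstrates. run_functionalized is just
# an illustrative name, not an API.
import torch

def run_functionalized(fn, inpt, reapply_views=False):
    # Wrap the input, run fn under functionalization, and return its result.
    inpt_functional = torch._to_functional_tensor(inpt.clone())
    torch._enable_functionalization(reapply_views=reapply_views)
    try:
        return fn(inpt_functional)
    finally:
        torch._disable_functionalization()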
+class InplaceLoggingTensor(LoggingTensorReentrant): + @staticmethod + def __new__(cls, e): + r = torch.Tensor._make_wrapper_subclass(cls, e.shape, dtype=e.dtype, requires_grad=False) + r.elem = e + return r + + __torch_function__ = torch._C._disabled_torch_function_impl + + def __str__(self): + return f'InplaceLoggingTensor({self.elem})' + + @classmethod + def __torch_dispatch__(cls, func, types, args=(), kwargs=None): + def unwrap(e): + if isinstance(e, InplaceLoggingTensor): + return e.elem + else: + return e + + def wrap(e): + if isinstance(e, torch.Tensor): + return InplaceLoggingTensor(e) + else: + return e + f = func + # this subclass converts all `add()` ops into `add_()` ops + if f is torch.ops.aten.add.Tensor: + f = torch.ops.aten.add_.Tensor + + with cls.context(): + rs = tree_map(wrap, f(*tree_map(unwrap, args), **tree_map(unwrap, kwargs))) + # after running the (potentially transformed) op, + # log the original op that we saw. + logging.getLogger("LoggingTensor").info(f"{func.__module__}.{func.__name__}", args, kwargs, rs) + return rs + + class TestFunctionalization(TestCase): - def get_logs(self, func, inpt): + def get_logs(self, func, inpt, *, reapply_views=False): input_clone_logging = LoggingTensor(inpt.clone()) input_functional_logging = torch._to_functional_tensor(input_clone_logging) with capture_logs() as logs: log_input("input", input_clone_logging) - torch._enable_functionalization() + torch._enable_functionalization(reapply_views=reapply_views) try: func(input_functional_logging) finally: torch._disable_functionalization() return logs - def assert_functionalization(self, func, inpt): + def assert_functionalization(self, func, inpt, *, reapply_views=False): input_clone = inpt.clone() input_clone2 = inpt.clone() input_functional = torch._to_functional_tensor(input_clone2) @@ -37,7 +80,7 @@ def assert_functionalization(self, func, inpt): # Compare outputs (and mutated inputs), with and without functionalization. out_ref = func(inpt) - torch._enable_functionalization() + torch._enable_functionalization(reapply_views=reapply_views) try: out_functional = func(input_functional) finally: @@ -61,13 +104,57 @@ def f(x): logs = self.get_logs(f, torch.ones(4, 2)) self.assertExpectedInline('\n'.join(logs), """\ $0 = input('input') -$1 = torch._ops.aten.view($0, [4, 2]) -$2 = torch._ops.aten.add($1, tensor([[1., 1.], +$1 = torch._ops.aten.view_copy.default($0, [4, 2]) +$2 = torch._ops.aten.add.Tensor($1, tensor([[1., 1.], + [1., 1.], + [1., 1.], + [1., 1.]])) +$3 = torch._ops.aten.view_copy.default($2, [4, 2]) +$4 = torch._ops.aten.mul.Tensor($3, $3)""") + + def test_simple_out(self): + def f(x): + tmp = torch.ones(4, 2) + y = x.view(4, 2) + # the out= tensor will get resized, since it has size=0 to start. + z = torch.empty(()) + torch.add(y, tmp, out=z) + w = z * z + return w + self.assert_functionalization(f, torch.ones(4, 2)) + logs = self.get_logs(f, torch.ones(4, 2)) + self.assertExpectedInline('\n'.join(logs), """\ +$0 = input('input') +$1 = torch._ops.aten.view_copy.default($0, [4, 2]) +$2 = torch._ops.aten.add.Tensor($1, tensor([[1., 1.], [1., 1.], [1., 1.], [1., 1.]])) -$3 = torch._ops.aten.view($2, [4, 2]) -$4 = torch._ops.aten.mul($3, $3)""") +$3 = torch._ops.aten.mul.Tensor($2, $2)""") + + def test_multi_out(self): + def f(x): + # aminmax.out returns a tuple of tensors. + # functionalization should properly handle the tuple. 
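# For orientation (plain eager, no functionalization): aminmax returns a
# (min, max) pair, and its out= variant writes into a caller-provided tuple
# of two tensors, which is the multi-output case this test covers.
import torch
t = torch.arange(8, dtype=torch.float32)
mn, mx = torch.aminmax(t, dim=0)  # two outputs from a single op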
+ out_min = torch.empty(4) + out_max = torch.empty(4) + torch.aminmax(x, dim=0, out=(out_max, out_min)) + return out_max + self.assert_functionalization(f, torch.arange(8, dtype=torch.float32)) + logs = self.get_logs(f, torch.arange(8, dtype=torch.float32)) + self.assertExpectedInline('\n'.join(logs), """\ +$0 = input('input') +$1, $2 = torch._ops.aten.aminmax.default($0, dim=0)""") + + def test_tensor_ctr(self): + def f(x): + y = torch.tensor((1, 2, 3)) + z = y.view(-1) + z.add_(1) + return y + self.assert_functionalization(f, torch.arange(3, dtype=torch.float32)) + logs = self.get_logs(f, torch.arange(3, dtype=torch.float32)) + self.assertExpectedInline('\n'.join(logs), """$0 = input('input')""") def test_inplace_on_non_view(self): def f(x): @@ -81,8 +168,8 @@ def f(x): logs = self.get_logs(f, torch.ones(4, 2)) self.assertExpectedInline('\n'.join(logs), """\ $0 = input('input') -$1 = torch._ops.aten.view($0, [4, 2]) -$2 = torch._ops.aten.add($0, tensor([[1., 1.], +$1 = torch._ops.aten.view_copy.default($0, [4, 2]) +$2 = torch._ops.aten.add.Tensor($0, tensor([[1., 1.], [1., 1.], [1., 1.], [1., 1.]]))""") @@ -94,17 +181,30 @@ def f(x): return y self.assert_functionalization(f, torch.ones(2, 2)) logs = self.get_logs(f, torch.ones(2, 2)) - # Only seeing copy_() calls in the logs are actually expected: - # - block_diag is a CompositeImplicitAutograd op, implemented in terms of copy_() and a few other ops. - # - copy_() doesn't have an out-of-place variant, so the pass leaves it alone - # - the other ops are all not called on the input tensor, which means that the LoggingTensor doesn't see them - # We can update the output of this test if/when these tests eventually use LoggingTensor with PythonMode self.assertExpectedInline('\n'.join(logs), """\ $0 = input('input') -$1 = torch._ops.aten.copy_(tensor([[1., 1.], - [1., 1.]]), $0) -$2 = torch._ops.aten.copy_(tensor([[1., 1.], - [1., 1.]]), $0)""") +$1 = torch._ops.aten.expand_copy.default($0, [2, 2]) +$2 = torch._ops.aten.slice_scatter.default(tensor([[0., 0., 0., 0.], + [0., 0., 0., 0.]]), $1, 1, 0, 2) +$3 = torch._ops.aten.slice_scatter.default(tensor([[0., 0., 0., 0.], + [0., 0., 0., 0.], + [0., 0., 0., 0.], + [0., 0., 0., 0.]]), $2, 0, 0, 2) +$4 = torch._ops.aten.slice_copy.Tensor($3, 0, 2, 4) +$5 = torch._ops.aten.slice_copy.Tensor($4, 1, 2, 4) +$6 = torch._ops.aten.expand_copy.default($0, [2, 2])""") + + def test_cat(self): + def f(x): + out = torch.empty(0) + torch.cat((x,), out=out) + return out + self.assert_functionalization(f, torch.ones(2, 2)) + logs = self.get_logs(f, torch.ones(2, 2)) + self.assertExpectedInline('\n'.join(logs), """\ +$0 = input('input') +$1 = torch._ops.aten.cat.default([LoggingTensor(tensor([[1., 1.], + [1., 1.]]))])""") def test_diagonal(self): def f(x): @@ -118,10 +218,10 @@ def f(x): logs = self.get_logs(f, torch.ones(2, 2)) self.assertExpectedInline('\n'.join(logs), """\ $0 = input('input') -$1 = torch._ops.aten.diagonal($0) -$2 = torch._ops.aten.add($1, tensor([1., 1.])) -$3 = torch._ops.aten.diagonal_scatter($0, $2) -$4 = torch._ops.aten.mul($3, $3)""") +$1 = torch._ops.aten.diagonal_copy.default($0) +$2 = torch._ops.aten.add.Tensor($1, tensor([1., 1.])) +$3 = torch._ops.aten.diagonal_scatter.default($0, $2) +$4 = torch._ops.aten.mul.Tensor($3, $3)""") def test_diagonal_mutated_input(self): def f(x): @@ -146,13 +246,13 @@ def f(x): logs = self.get_logs(f, torch.ones(4, 2)) self.assertExpectedInline('\n'.join(logs), """\ $0 = input('input') -$1, $2 = torch._ops.aten.split($0, 2) -$3 = 
torch._ops.aten.diagonal($2) -$4 = torch._ops.aten.add($3, tensor([1., 1.])) -$5, $6 = torch._ops.aten.split($0, 2) -$7 = torch._ops.aten.diagonal_scatter($6, $4) -$8 = torch._ops.aten.slice_scatter($0, $7, 0, 2, 4) -$9 = torch._ops.aten.mul($8, $8)""") +$1, $2 = torch._ops.aten.split_copy.Tensor($0, 2) +$3 = torch._ops.aten.diagonal_copy.default($2) +$4 = torch._ops.aten.add.Tensor($3, tensor([1., 1.])) +$5, $6 = torch._ops.aten.split_copy.Tensor($0, 2) +$7 = torch._ops.aten.diagonal_scatter.default($6, $4) +$8 = torch._ops.aten.slice_scatter.default($0, $7, 0, 2, 4) +$9 = torch._ops.aten.mul.Tensor($8, $8)""") def test_view_inplace(self): def f(x): @@ -166,9 +266,25 @@ def f(x): logs = self.get_logs(f, torch.ones(4, 2)) self.assertExpectedInline('\n'.join(logs), """\ $0 = input('input') -$1 = torch._ops.aten.transpose($0, 1, 0) -$2 = torch._ops.aten.select($1, 0, 0) -$3 = torch._ops.aten.add($2, tensor([1., 1., 1., 1.]))""") +$1 = torch._ops.aten.transpose_copy.int($0, 1, 0) +$2 = torch._ops.aten.select_copy.int($1, 0, 0) +$3 = torch._ops.aten.add.Tensor($2, tensor([1., 1., 1., 1.]))""") + + def test_optional_tensor_list(self): + def f(x): + # test: an operator that takes in a List[Optional[Tensor]] argument + # (index_put) + y = x.view(8) + indices = torch.arange(4) + values = torch.arange(4, dtype=y.dtype) + y.index_put_((indices,), values, accumulate=False) + return y + self.assert_functionalization(f, torch.ones(4, 2)) + logs = self.get_logs(f, torch.ones(4, 2)) + self.assertExpectedInline('\n'.join(logs), """\ +$0 = input('input') +$1 = torch._ops.aten.view_copy.default($0, [8]) +$2 = torch._ops.aten.index_put.default($1, [tensor([0, 1, 2, 3])], tensor([0., 1., 2., 3.]))""") def test_scalars(self): def f(x): @@ -183,16 +299,28 @@ def f(x): logs = self.get_logs(f, torch.ones(4, 2)) self.assertExpectedInline('\n'.join(logs), """\ $0 = input('input') -$1 = torch._ops.aten.view($0, [4, 2]) -$2 = torch._ops.aten.add($1, tensor(1)) -$3 = torch._ops.aten.mul($2, tensor(2)) -$4 = torch._ops.aten.div($3, tensor(1))""") +$1 = torch._ops.aten.view_copy.default($0, [4, 2]) +$2 = torch._ops.aten.add.Tensor($1, 1) +$3 = torch._ops.aten.mul.Tensor($2, 2) +$4 = torch._ops.aten.div.Tensor($3, 1)""") + + def test_only_one_view(self): + def f(x): + # This tests that we don't have any unnecessary views in the trace. + # If the input wasn't mutated, we don't need to regenerate it, + # so there should be a total of 1 op in the output trace. 
+ return x.view(4, 2) + logs = self.get_logs(f, torch.ones(4, 2)) + self.assertExpectedInline('\n'.join(logs), """\ +$0 = input('input') +$1 = torch._ops.aten.view_copy.default($0, [4, 2])""") def test_everything(self): def f(x): # test: everything tmp = torch.ones(2, 2) - y = x.view(8) + x2 = x + x + y = x2.view(8) z0 = y.reshape(2, 4) z1 = z0.transpose(1, 0) z1.unsqueeze_(0) @@ -205,41 +333,61 @@ def f(x): logs = self.get_logs(f, torch.ones(4, 2)) self.assertExpectedInline('\n'.join(logs), """\ $0 = input('input') -$1 = torch._ops.aten.view($0, [8]) -$2 = torch._ops.aten._reshape_alias($1, [2, 4], [4, 1]) -$3 = torch._ops.aten.transpose($2, 1, 0) -$4 = torch._ops.aten.view($0, [8]) -$5 = torch._ops.aten._reshape_alias($4, [2, 4], [4, 1]) -$6 = torch._ops.aten.transpose($5, 1, 0) -$7 = torch._ops.aten.unsqueeze($6, 0) -$8 = torch._ops.aten.view($0, [8]) -$9 = torch._ops.aten._reshape_alias($8, [2, 4], [4, 1]) -$10 = torch._ops.aten.transpose($9, 1, 0) -$11 = torch._ops.aten.unsqueeze($10, 0) -$12 = torch._ops.aten.squeeze($11) -$13, $14 = torch._ops.aten.split($12, 2) -$15 = torch._ops.aten.add($13, tensor([[1., 1.], +$1 = torch._ops.aten.add.Tensor($0, $0) +$2 = torch._ops.aten.view_copy.default($1, [8]) +$3 = torch._ops.aten._reshape_alias_copy.default($2, [2, 4], [4, 1]) +$4 = torch._ops.aten.transpose_copy.int($3, 1, 0) +$5 = torch._ops.aten.view_copy.default($1, [8]) +$6 = torch._ops.aten._reshape_alias_copy.default($5, [2, 4], [4, 1]) +$7 = torch._ops.aten.transpose_copy.int($6, 1, 0) +$8 = torch._ops.aten.unsqueeze_copy.default($7, 0) +$9 = torch._ops.aten.view_copy.default($1, [8]) +$10 = torch._ops.aten._reshape_alias_copy.default($9, [2, 4], [4, 1]) +$11 = torch._ops.aten.transpose_copy.int($10, 1, 0) +$12 = torch._ops.aten.unsqueeze_copy.default($11, 0) +$13 = torch._ops.aten.squeeze_copy.default($12) +$14, $15 = torch._ops.aten.split_copy.Tensor($13, 2) +$16 = torch._ops.aten.add.Tensor($14, tensor([[1., 1.], + [1., 1.]])) +$17 = torch._ops.aten.select_copy.int($3, 0, 0) +$18 = torch._ops.aten.clone.default($16, memory_format=torch.contiguous_format) +$19 = torch._ops.aten._unsafe_view.default($18, [4]) +$20 = torch._ops.aten.view_copy.default($1, [8]) +$21 = torch._ops.aten._reshape_alias_copy.default($20, [2, 4], [4, 1]) +$22 = torch._ops.aten.transpose_copy.int($21, 1, 0) +$23 = torch._ops.aten.unsqueeze_copy.default($22, 0) +$24 = torch._ops.aten.squeeze_copy.default($23) +$25 = torch._ops.aten.slice_scatter.default($24, $16, 0, 0, 2) +$26 = torch._ops.aten.unsqueeze_copy.default($25, 0) +$27 = torch._ops.aten.squeeze_copy.dim($26, 0) +$28 = torch._ops.aten.transpose_copy.int($27, 1, 0) +$29 = torch._ops.aten._reshape_alias_copy.default($28, [8], [1]) +$30 = torch._ops.aten.view_copy.default($29, [4, 2]) +$31 = torch._ops.aten.view_copy.default($30, [8]) +$32 = torch._ops.aten._reshape_alias_copy.default($31, [2, 4], [4, 1]) +$33 = torch._ops.aten.select_copy.int($32, 0, 0) +$34 = torch._ops.aten.add.Tensor($33, $19)""") + + def test_reapply_views_simple(self): + def f(x): + tmp = torch.ones(4, 2) + y = x.view(4, 2) + y.add_(tmp) + z = x * x + return y + self.assert_functionalization(f, torch.ones(4, 2), reapply_views=True) + logs = self.get_logs(f, torch.ones(4, 2), reapply_views=True) + self.assertExpectedInline('\n'.join(logs), """\ +$0 = input('input') +$1 = torch._ops.aten.view.default($0, [4, 2]) +$2 = torch._ops.aten.add.Tensor($1, tensor([[1., 1.], + [1., 1.], + [1., 1.], [1., 1.]])) -$16 = torch._ops.aten.select($2, 0, 0) -$17 = torch._ops.aten.clone($15, 
memory_format=0) -$18 = torch._ops.aten._unsafe_view($17, [4]) -$19 = torch._ops.aten.view($0, [8]) -$20 = torch._ops.aten._reshape_alias($19, [2, 4], [4, 1]) -$21 = torch._ops.aten.transpose($20, 1, 0) -$22 = torch._ops.aten.unsqueeze($21, 0) -$23 = torch._ops.aten.squeeze($22) -$24 = torch._ops.aten.slice_scatter($23, $15, 0, 0, 2) -$25 = torch._ops.aten.unsqueeze($24, 0) -$26 = torch._ops.aten.squeeze($25, 0) -$27 = torch._ops.aten.transpose($26, 1, 0) -$28 = torch._ops.aten._reshape_alias($27, [8], [1]) -$29 = torch._ops.aten.view($28, [4, 2]) -$30 = torch._ops.aten.view($29, [8]) -$31 = torch._ops.aten._reshape_alias($30, [2, 4], [4, 1]) -$32 = torch._ops.aten.select($31, 0, 0) -$33 = torch._ops.aten.add($32, $18)""") - - def test_aliases_maintained_after_pass(self): +$3 = torch._ops.aten.view.default($2, [4, 2]) +$4 = torch._ops.aten.mul.Tensor($3, $3)""") + + def test_aliases_maintained_after_pass_when_reapplying_views(self): def f(x): tmp = torch.ones(4, 2) y = x.view(4, 2) @@ -248,7 +396,7 @@ def f(x): return y, z input_functional = torch._to_functional_tensor(torch.ones(4, 2)) - torch._enable_functionalization() + torch._enable_functionalization(reapply_views=True) try: y, z = f(input_functional) torch._sync(y) @@ -279,34 +427,49 @@ def f(x): logs = self.get_logs(f, torch.ones(2)) self.assertExpectedInline('\n'.join(logs), """\ $0 = input('input') -$1 = torch._ops.aten.expand($0, [2]) -$2 = torch._ops.aten.add($1, $0)""") +$1 = torch._ops.aten.expand_copy.default($0, [2]) +$2 = torch._ops.aten.add.Tensor($1, $0)""") # Test 2: copy_() with same dtype, different shape self.assert_functionalization(f, torch.ones(1)) logs = self.get_logs(f, torch.ones(1)) self.assertExpectedInline('\n'.join(logs), """\ $0 = input('input') -$1 = torch._ops.aten.expand($0, [2]) -$2 = torch._ops.aten.add($1, $0)""") +$1 = torch._ops.aten.expand_copy.default($0, [2]) +$2 = torch._ops.aten.add.Tensor($1, $0)""") # Test 3: copy_() with different dtype, same shape self.assert_functionalization(f, torch.ones(2, dtype=torch.long)) logs = self.get_logs(f, torch.ones(2, dtype=torch.long)) self.assertExpectedInline('\n'.join(logs), """\ $0 = input('input') -$1 = torch._ops.aten._to_copy($0, dtype=6, layout=0, device=device(type='cpu'), pin_memory=False) -$2 = torch._ops.aten.expand($1, [2]) -$3 = torch._ops.aten.add($2, $0)""") +$1 = torch._ops.aten._to_copy.default($0, dtype=torch.float32, layout=torch.strided, device=device(type='cpu'), pin_memory=False) +$2 = torch._ops.aten.expand_copy.default($1, [2]) +$3 = torch._ops.aten.add.Tensor($2, $0)""") # Test 4: copy_() with different dtype, different shape self.assert_functionalization(f, torch.ones(1, dtype=torch.long)) logs = self.get_logs(f, torch.ones(1, dtype=torch.long)) self.assertExpectedInline('\n'.join(logs), """\ $0 = input('input') -$1 = torch._ops.aten._to_copy($0, dtype=6, layout=0, device=device(type='cpu'), pin_memory=False) -$2 = torch._ops.aten.expand($1, [2]) -$3 = torch._ops.aten.add($2, $0)""") +$1 = torch._ops.aten._to_copy.default($0, dtype=torch.float32, layout=torch.strided, device=device(type='cpu'), pin_memory=False) +$2 = torch._ops.aten.expand_copy.default($1, [2]) +$3 = torch._ops.aten.add.Tensor($2, $0)""") + + def test_fill_(self): + def f(x): + y = x + x + z = y.diagonal() + z.fill_(0) + return y + + self.assert_functionalization(f, torch.ones(2, 2)) + logs = self.get_logs(f, torch.ones(2, 2)) + self.assertExpectedInline('\n'.join(logs), """\ +$0 = input('input') +$1 = torch._ops.aten.add.Tensor($0, $0) +$2 = 
torch._ops.aten.diagonal_copy.default($1) +$3 = torch._ops.aten.fill.Scalar($2, 0)""") def test_nested_functions_propagate_updates(self): def g(x): @@ -324,5 +487,74 @@ def f(x): self.assert_functionalization(f, torch.ones(2, 2)) + def test_mixed_wrappers_valid(self): + def f(x, y): + z = x + y + z.add_(1) + return z + + x1_not_functional = LoggingTensor(torch.ones(4)) + x2_functional = torch._to_functional_tensor(LoggingTensor(torch.ones(4))) + + with capture_logs() as logs: + y = f(x1_not_functional, x2_functional) + + # Make sure that functionalization ran the "+" kernel + # with a functional + non-functional tensor, and wrapped the output appropriately. + self.assertExpectedInline('\n'.join(logs), """\ +$2 = torch._ops.aten.add.Tensor($0, $1) +$3 = torch._ops.aten.add.Tensor($2, 1)""") + + def test_mixed_wrappers_invalid(self): + x1_not_functional = torch.ones(4) + x2_functional = torch._to_functional_tensor(torch.ones(4)) + + # When dealing with mixed functional + nonfunctional tensors, + # normal_tensor.add_(functional_tensor) is not valid + # because normal_tensor would need to be "promoted" to a functional tensor. + with self.assertRaises(RuntimeError): + x1_not_functional.add_(x2_functional) + + # This tests the behavior of functionalization with multiple layers of wrapped tensor subclasses. + def test_multiple_levels_of_wrapping(self): + def f(x): + # call an inplace op and have it get logged twice (by the outer + inner wrapper) + x.add_(1) + + # Test 1: both the inner and outer wrapper are "functionalized" + x_inner_and_outer_functional = torch._to_functional_tensor( + InplaceLoggingTensor(torch._to_functional_tensor(LoggingTensor(torch.ones(4))))) + + with capture_logs() as logs: + f(x_inner_and_outer_functional) + + # Since both wrappers were functionalized, they both log "add" + self.assertExpectedInline('\n'.join(logs), """\ +$1 = torch._ops.aten.add.Tensor($0, 1) +$3 = torch._ops.aten.add.Tensor($2, 1)""") + + # Test 2: only the inner wrapper is "functionalized" + x_only_inner_functional = InplaceLoggingTensor(torch._to_functional_tensor(LoggingTensor(torch.ones(4)))) + + with capture_logs() as logs: + f(x_only_inner_functional) + + # Since only the inner wrapper is functionalized, then the inner (first) log is functionalized + self.assertExpectedInline('\n'.join(logs), """\ +$1 = torch._ops.aten.add.Tensor($0, 1) +$3 = torch._ops.aten.add_.Tensor($2, 1)""") + + # Test 3: only the outer wrapper is "functionalized" + x_only_outer_functional = torch._to_functional_tensor(InplaceLoggingTensor(LoggingTensor(torch.ones(4)))) + + with capture_logs() as logs: + f(x_only_outer_functional) + + # Only the outer add_ is functionalized + # Since only the outer wrapper is functionalized, then the outer (second) log is functionalized + self.assertExpectedInline('\n'.join(logs), """\ +$1 = torch._ops.aten.add_.Tensor($0, 1) +$3 = torch._ops.aten.add.Tensor($2, 1)""") + if __name__ == '__main__': run_tests() diff --git a/test/test_fx.py b/test/test_fx.py index a9ea626c8053..56b28371456e 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -7,6 +7,7 @@ import inspect import math import numbers +import io import operator import os import pickle @@ -17,6 +18,7 @@ import types import warnings import unittest +import torch.nn.utils._stateless as _stateless from math import sqrt from torch.multiprocessing import Process from torch.testing import FileCheck @@ -24,8 +26,8 @@ from torch.testing._internal.common_device_type import ops, onlyCPU, instantiate_device_type_tests import
torch.utils._pytree as pytree import torch.fx._pytree as fx_pytree -from torch.fx import symbolic_trace, Proxy, Node, GraphModule, Interpreter, Tracer, Transformer, Graph, wrap, PH -from torch.fx.node import Target, Argument +from torch.fx import symbolic_trace, Proxy, Node, GraphModule, Interpreter, Tracer, Transformer, Graph, wrap, PH, CodeGen +from torch.fx.node import Target, Argument, _format_arg from torch.fx.passes import shape_prop from torch.fx.immutable_collections import immutable_dict, immutable_list from torch.fx.experimental.rewriter import RewritingTracer @@ -101,6 +103,11 @@ def a_lifted_leaf2(a, b): wrap('getattr') +def wrapped_named_tup(p1, *, p2): + return p1.x + p2.y + +wrap(wrapped_named_tup) + @wrap def wrapped_via_decorator(a): return a + 1 @@ -125,6 +132,9 @@ class Pair(NamedTuple): x : torch.Tensor y : torch.Tensor + def _custom_fx_repr_fn(self) -> str: + return f"Pair(x={_format_arg(self.x)}, y={_format_arg(self.y)})" + # for testing pytrees class Foo(object): # noqa: B209 def __init__(self, a, b): @@ -133,6 +143,7 @@ def __init__(self, a, b): class TestFX(JitTestCase): def setUp(self): + super().setUp() # Checking for mutable operations whil tracing is feature flagged # Enable it in testing but not by default self.orig_tracer_mutable_flag = torch.fx.proxy.TracerBase.check_mutable_operations @@ -143,6 +154,7 @@ def setUp(self): torch.ops.load_library(str(lib_file_path)) def tearDown(self): + super().tearDown() torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag def checkGraphModule(self, m: torch.nn.Module, args, kwargs=None): @@ -449,6 +461,55 @@ def forward(self, a, b): gm.graph.lint() self.assertEqual(gm(3, 4), 14) + def test_concrete_arg_none_assert(self): + class Foo(torch.nn.Module): + def forward(self, x, val=None): + return x if val is None else x + val + + f = Foo() + traced = torch.fx.symbolic_trace(f, concrete_args={'val' : None}) + with self.assertRaisesRegex(AssertionError, 'val has been specialized to have value None'): + traced(torch.randn(5), torch.randn(5)) + + x = torch.randn(5) + torch.testing.assert_close(traced(x), f(x)) + + def test_trace_multiple_funcs(self): + class Foo(torch.nn.Module): + def forward(self, x, y): + return x + y + + def minus_forward(self, x, y): + return x - y + + def multiply_forward(self, x, y): + return x * y + + f = Foo() + x, y = torch.randn(5), torch.randn(5) + + print(torch.__version__) + + tracer = Tracer() + torch.testing.assert_close(GraphModule(f, tracer.trace(f))(x, y), f(x, y)) + + tracer.traced_func_name = "minus_forward" + torch.testing.assert_close( + GraphModule(f, tracer.trace(f))(x, y), + f.minus_forward(x, y), + ) + + tracer.traced_func_name = "multiply_forward" + torch.testing.assert_close( + GraphModule(f, tracer.trace(f))(x, y), + f.multiply_forward(x, y), + ) + + tracer.traced_func_name = "add_forward" + with self.assertRaisesRegex(AssertionError, "doesn't exist in"): + tracer.trace(f) + + def test_graph_unique_names(self): class M(torch.nn.Module): def forward(self, a, b): @@ -678,6 +739,39 @@ def forward(self, a): for node in m_g.graph.nodes: self.assertTrue(node.name != "getattr") + @unittest.skip("Hotfix for SEV remediation") + def test_trace_buffer_slice(self): + bs, d_hid = 10, 23 + + class ExampleCode(torch.nn.Module): + def __init__(self): + super().__init__() + self.mm_param = torch.nn.Parameter(torch.randn(d_hid, d_hid)) + self.mm_param2 = torch.nn.Parameter(torch.randn(d_hid, d_hid)) + self.lin = torch.nn.Linear(d_hid, d_hid) + self.register_buffer('buffer', 
torch.randn(bs + 100, d_hid)) + + def forward(self, x): + x = torch.mm(x, self.mm_param) + skip_connection = x + x = torch.relu(x) + x = torch.mm(x, self.mm_param) + self.buffer[:x.shape[0]] + x = self.lin(x) + x = torch.relu(x) + x = x + skip_connection + x = torch.mm(x, self.mm_param2) + x = self.lin(x) + return x + + + ec = ExampleCode() + + traced = torch.fx.symbolic_trace(ec) + + x = torch.randn(bs, d_hid) + torch.testing.assert_allclose(ec(x), traced(x)) + + def test_node_tagging(self): class TaggingTracer(Tracer): def create_node(self, kind : str, target : Union[str, Callable], @@ -986,6 +1080,24 @@ def forward(self, x): traced_scripted = torch.jit.script(traced) self.assertEqual(traced_scripted(torch.rand(4)), 2) + def test_tuple_no_subscript(self): + def foo(x : Tuple): + return x[0] + + traced = torch.fx.symbolic_trace(foo) + x = (torch.randn(5, 3),) + torch.testing.assert_allclose(traced(x), x[0]) + + bio = io.BytesIO() + + torch.save(traced, bio) + + bio.seek(0) + + loaded = torch.load(bio) + + torch.testing.assert_allclose(loaded(x), x[0]) + def test_torch_fx_len(self): class FXLenTest(torch.nn.Module): def forward(self, x): @@ -1056,6 +1168,24 @@ def forward(self, a): out = gm(input) self.assertEqual(out, ref_out) + def test_torch_op_overloads(self): + class M(torch.nn.Module): + def forward(self, a): + b = torch.ops.aten.add.Tensor(a, a) + return b + m = M() + input = torch.randn(3) + ref_out = m(input) + gm = symbolic_trace(m) + gm.graph.lint() + out = gm(input) + self.assertEqual(out, ref_out) + + for node in gm.graph.nodes: + if node.op == 'call_function': + assert isinstance(node.target, torch._ops.OpOverload) + assert node.target.__name__ == 'add.Tensor' + def test_pickle_torch_custom_ops(self): class M(torch.nn.Module): def forward(self, a): @@ -1238,6 +1368,18 @@ def test_remove_uses(self): self.assertTrue(neg not in relu.users) + def test_remove_uses_with_custom_filter(self): + g : torch.fx.Graph = Graph() + x : torch.fx.Node = g.placeholder('x') + relu : torch.fx.Node = g.call_function(torch.relu, (x,)) + neg : torch.fx.Node = g.call_function(torch.neg, (relu,)) + g.output(neg) + + neg.replace_all_uses_with(relu, lambda x: x != neg) + + self.assertTrue(neg in relu.users) + + def test_nonetype_annotation(self): eb = torch.nn.EmbeddingBag(3, 4) symbolic_trace(eb) @@ -1925,6 +2067,28 @@ def test_update_kwargs_api(self): new_gm = torch.fx.GraphModule(torch.nn.Module(), graph) self.assertEqual(new_gm(inp_x, inp_y), torch.relu(inp_y)) + def test_immutable_list_pytree_ops(self): + rand_tensor = torch.randn(5, 3) + l = immutable_list([3, [rand_tensor, 42]]) + + flattened, spec = pytree.tree_flatten(l) + assert flattened == [3, rand_tensor, 42] + + unflattened = pytree.tree_unflatten(flattened, spec) + assert unflattened == l + assert isinstance(unflattened, immutable_list) + + def test_immutable_dict_pytree_ops(self): + rand_tensor = torch.randn(5, 3) + d = immutable_dict({'a': 3, 'b': [rand_tensor, 42]}) + + flattened, spec = pytree.tree_flatten(d) + assert flattened == [3, rand_tensor, 42] + + unflattened = pytree.tree_unflatten(flattened, spec) + assert unflattened == d + assert isinstance(unflattened, immutable_dict) + def test_move_before(self): graph : torch.fx.Graph = torch.fx.Graph() x : torch.fx.Node = graph.create_node('placeholder', 'x') @@ -2261,6 +2425,40 @@ def forward(self, x): input = torch.rand(3, 4) self.assertEqual(traced(input), Pair(input, input)) + def test_named_tuple_inlined(self): + class NamedTupMod(torch.nn.Module): + def forward(self, inp): + 
return wrapped_named_tup(Pair(inp, 1.2), p2=Pair(3.4, inp)) + + m = NamedTupMod() + input = torch.rand(3, 4) + ref = m(input) + traced = symbolic_trace(m) + + res = traced(input) + self.assertEqual(ref, res) + + # Check Pair NamedTuple works when inlined into the function call. + ph = call_func = None + for node in traced.graph.nodes: + if node.op == "placeholder": + ph = node + elif node.op == "call_function" and node.target == wrapped_named_tup: + node.update_arg(0, Pair(ph, 1.2)) + node.update_kwarg("p2", Pair(3.4, ph)) + call_func = node + break + self.assertTrue(call_func is not None) + self.assertTrue(isinstance(call_func.args[0], Pair)) + self.assertTrue(isinstance(call_func.kwargs["p2"], Pair)) + self.assertEqual(_format_arg(call_func.args[0]), "Pair(x=%inp, y=1.2)") + self.assertEqual(_format_arg(call_func.kwargs["p2"]), "Pair(x=3.4, y=%inp)") + + traced.graph.eliminate_dead_code() + traced.recompile() + res = traced(input) + self.assertEqual(ref, res) + def test_return_type_exists(self): class ReturnTypeModule(torch.nn.Module): def other(self, x: List[str]) -> List[str]: @@ -2809,6 +3007,35 @@ def is_leaf_module(self, m: torch.nn.Module, module_qualified_name : str) -> boo gm2.delete_all_unused_submodules() torch.testing.assert_allclose(gm2(inputs), model(inputs)) + def test_fx_stateless(self): + class MockModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.l1 = torch.nn.Linear(1, 1) + self.register_buffer('buffer', torch.ones(1)) + + def forward(self, x): + return self.l1(x) + self.buffer + + module = MockModule() + x = torch.rand((1, 1)) + weight = torch.tensor([[1.0]], requires_grad=True) + bias = torch.tensor([0.0], requires_grad=True) + buffer = torch.tensor([0.0]) + parameters = {'l1.weight': weight, + 'l1.bias': bias, + 'buffer': buffer} + fx_module = torch.fx.symbolic_trace(module) + res = _stateless.functional_call(fx_module, parameters, x) + res.backward() + self.assertIsNotNone(weight.grad) + self.assertIsNotNone(bias.grad) + self.assertIsNone(buffer.grad) + # Gradient was not calculated for the module stated and buffers + self.assertIsNone(module.l1.weight.grad) + self.assertIsNone(module.l1.bias.grad) + self.assertIsNone(module.buffer.grad) + def test_tracing_graphmodules_as_leaf_submodules(self): class A(torch.nn.Module): def forward(self, t): @@ -3126,6 +3353,12 @@ def verify_pytree(f, inp): orig_out = f(val) nf = symbolic_trace(f, concrete_args={'x': inp}) self.assertEqual(nf(val), orig_out) + + bare_fx = GraphModule({}, copy.deepcopy(nf.graph)) + bare_fx.graph.set_codegen(CodeGen()) + bare_fx.recompile() + self.assertEqual(nf.graph.process_outputs(bare_fx(*nf.graph.process_inputs(val))), orig_out) + assert num_flat_args == 0 or "tree_flatten_spec" in nf.code assert(sum([i.op == 'placeholder' for i in nf.graph.nodes]) == num_flat_args) @@ -3161,6 +3394,102 @@ def f(b, a): nf = symbolic_trace(nf) self.assertEqual(nf(**val), f(**val)) + def test_custom_codegen(self): + class ListCodeGen(CodeGen): + def gen_fn_def(self, free_vars, maybe_return_annotation): + lst_unpack = f""" +def forward(self, args_list: List[torch.Tensor]){maybe_return_annotation}: + {', '.join(free_vars)} = args_list""" + return lst_unpack + + def additional_globals(self): + return [('List', typing.List)] + + def process_inputs(self, *inputs): + assert(len(inputs) == 1) + return inputs[0] + + def f(a, b): + return a + b + + nf = symbolic_trace(f) + vals = [torch.randn(3), torch.randn(3)] + self.assertEqual(nf(*vals), f(*vals)) + + nf.graph.set_codegen(ListCodeGen()) + 
nf.recompile() + + bare_fx = GraphModule({}, copy.deepcopy(nf.graph)) + bare_fx.graph.set_codegen(CodeGen()) + bare_fx.recompile() + + self.assertEqual(nf(vals), f(*vals)) + self.assertEqual(nf.graph.process_outputs(bare_fx(*nf.graph.process_inputs(vals))), f(*vals)) + + ts_f = torch.jit.script(nf) + self.assertEqual(nf(vals), ts_f(vals)) + + def test_custom_codegen_with_transformer(self): + class ListCodeGen(CodeGen): + def gen_fn_def(self, free_vars, maybe_return_annotation): + lst_unpack = f""" +def forward(self, args_list: List[torch.Tensor]){maybe_return_annotation}: + {', '.join(free_vars)} = args_list""" + return lst_unpack + + def additional_globals(self): + return [('List', typing.List)] + + def process_inputs(self, *inputs): + assert(len(inputs) == 1) + return inputs[0] + + def f(a, b): + return a + b + + nf = symbolic_trace(f) + vals = [torch.randn(3), torch.randn(3)] + self.assertEqual(nf(*vals), f(*vals)) + + nf.graph.set_codegen(ListCodeGen()) + nf.recompile() + self.assertEqual(nf(vals), f(*vals)) + + transformed_gm = Transformer(nf).transform() + self.assertEqual(nf(vals), transformed_gm(vals)) + + def test_interpreter_with_codegen(self): + class ListCodeGen(CodeGen): + def gen_fn_def(self, free_vars, maybe_return_annotation): + lst_unpack = f""" +def forward(self, args_list: List[torch.Tensor]){maybe_return_annotation}: + {', '.join(free_vars)} = args_list""" + return lst_unpack + + def additional_globals(self): + return [('List', typing.List)] + + def process_inputs(self, *inputs): + assert(len(inputs) == 1) + return inputs[0] + + def generate_output(self, output_args): + return f'return list({repr(output_args)})' + + def process_outputs(self, outputs): + return list(outputs) + + def f(a, b): + a = a + b + b = a + b + return a, b + + nf = symbolic_trace(f) + vals = [torch.randn(3), torch.randn(3)] + nf.graph.set_codegen(ListCodeGen()) + nf.recompile() + self.assertEqual(Interpreter(nf).run(vals), nf(vals)) + def test_imul_code_print(self): graph = torch.fx.Graph() a = graph.placeholder("a") @@ -3218,6 +3547,7 @@ def test_get_torch_func_signature_exhaustive(self, device, dtype, op): class TestFXAPIBackwardCompatibility(JitTestCase): def setUp(self): + super().setUp() self.maxDiff = None # Checking for mutable operations whil tracing is feature flagged @@ -3226,6 +3556,7 @@ def setUp(self): torch.fx.proxy.TracerBase.check_mutable_operations = True def tearDown(self): + super().tearDown() torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag @@ -3464,12 +3795,14 @@ def check_symbols_have_bc_designation(m, prefix): class TestFunctionalTracing(JitTestCase): def setUp(self): + super().setUp() # Checking for mutable operations whil tracing is feature flagged # Enable it in testing but not by default self.orig_tracer_mutable_flag = torch.fx.proxy.TracerBase.check_mutable_operations torch.fx.proxy.TracerBase.check_mutable_operations = True def tearDown(self): + super().tearDown() torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag IGNORE_FUNCS = ("has_torch_function", "has_torch_function_unary", @@ -3496,6 +3829,7 @@ def tearDown(self): "bilinear": BUILT_IN_FUNC, "celu_": BUILT_IN_FUNC, "channel_shuffle": BUILT_IN_FUNC, + "native_channel_shuffle": BUILT_IN_FUNC, "conv1d": BUILT_IN_FUNC, "conv2d": BUILT_IN_FUNC, "conv3d": BUILT_IN_FUNC, @@ -3512,6 +3846,7 @@ def tearDown(self): "linear": BUILT_IN_FUNC, "logsigmoid": BUILT_IN_FUNC, "one_hot": BUILT_IN_FUNC, + "pad": BUILT_IN_FUNC, "pairwise_distance": BUILT_IN_FUNC, 
"pdist": BUILT_IN_FUNC, "pixel_shuffle": BUILT_IN_FUNC, @@ -3529,7 +3864,6 @@ def tearDown(self): "adaptive_max_pool2d_with_indices": LEN_ERROR, "adaptive_max_pool3d_with_indices": LEN_ERROR, "instance_norm": CONTROL_FLOW, - "pad": LEN_ERROR, "adaptive_max_pool1d": PROXY_ITERABLE, "adaptive_max_pool2d": PROXY_ITERABLE, @@ -3584,9 +3918,9 @@ def tearDown(self): "leaky_relu": CONTROL_FLOW, "local_response_norm": CONTROL_FLOW, "margin_ranking_loss": CONTROL_FLOW, - "max_pool1d_with_indices": CONTROL_FLOW, - "max_pool2d_with_indices": CONTROL_FLOW, - "max_pool3d_with_indices": CONTROL_FLOW, + "max_pool1d_with_indices": ARG_TYPE_MISMATCH, + "max_pool2d_with_indices": ARG_TYPE_MISMATCH, + "max_pool3d_with_indices": ARG_TYPE_MISMATCH, "mse_loss": CONTROL_FLOW, "multi_head_attention_forward": CONTROL_FLOW, "multi_margin_loss": CONTROL_FLOW, diff --git a/test/test_fx_experimental.py b/test/test_fx_experimental.py index b7533ef34245..062eaed38f50 100644 --- a/test/test_fx_experimental.py +++ b/test/test_fx_experimental.py @@ -3,7 +3,9 @@ import math import numbers import operator +import pickle import sys +import tempfile import unittest from typing import Callable, Dict, Union, List, Optional from types import BuiltinFunctionType @@ -26,6 +28,8 @@ ) from torch.fx.experimental.rewriter import RewritingTracer from torch.fx.experimental.schema_type_annotation import AnnotateTypesWithSchema +import torch.fx.experimental.meta_tracer +from torch.fx.experimental.proxy_tensor import make_fx from torch.fx.graph_module import GraphModule from torch.fx.node import Node from torch.fx.operator_schemas import ( @@ -119,7 +123,7 @@ def forward(self, a, b, c): assert len(serialized_graph1["weights"]) == 4 assert len(serialized_graph1["modules"]) == 0 assert len(serialized_graph2["nodes"]) == 6 - assert len(serialized_graph2["weights"]) == 4 + assert len(serialized_graph2["weights"]) == 1 assert len(serialized_graph2["modules"]) == 1 assert serialized_graph1["weights"]["linear.weight"]["shape"] == "[4, 4]" assert serialized_graph1["weights"]["linear.weight"]["dtype"] == "torch.float32" @@ -667,6 +671,45 @@ def forward(self, a, b): # Confirm that the output is correct self.assertEqual(traced(3, 3), m(3, 3)) + def test_meta_tracer(self): + class MetaTracerTestModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.emb = torch.nn.Embedding(num_embeddings=42, embedding_dim=16) + self.layernorm = torch.nn.LayerNorm(16) + + def forward(self, x): + emb = self.emb(x) + emb = emb + torch.arange(emb.shape[-1], dtype=torch.float, device=emb.device) + lol = self.layernorm(emb) + return torch.relu(lol) if lol.shape[0] < 30 else torch.sigmoid(lol) + + mttm = MetaTracerTestModule() + for BS in [15, 35]: + x = torch.zeros(BS, dtype=torch.long).random_(42) + meta_args = {'x' : x.to(device='meta')} + gm = torch.fx.experimental.meta_tracer.symbolic_trace(mttm, meta_args=meta_args) + torch.testing.assert_close(gm(x), mttm(x)) + + # Test serialization/deserialization + with tempfile.TemporaryDirectory() as tmp_dir: + with open(f'{tmp_dir}/meta_module.pkl', 'wb') as f: + pickle.dump(gm, f) + + with open(f'{tmp_dir}/meta_module.pkl', 'rb') as f: + loaded = pickle.load(f) + + torch.testing.assert_close(loaded(x), mttm(x)) + + def test_proxy_tensor(self): + def f(x): + val = x.cos().cos().sum() + return torch.autograd.grad(val, x) + + traced_graph = make_fx(f)(torch.randn(3, requires_grad=True)) + inp = torch.randn(3, requires_grad=True) + torch.testing.assert_close(traced_graph(inp), f(inp)) + def 
test_call_to_assert_with_msg(self): class M(torch.nn.Module): def forward(self, a, b): @@ -814,6 +857,29 @@ def mod_partition(node: Node): self.assertEqual(orig_out, submodules_out) + def test_split_module_kwargs_expansion(self): + class ModuleWithKwargsExpansion(torch.nn.Module): + def forward(self, x, **kwargs): + return x + kwargs['foo'] + + mod = ModuleWithKwargsExpansion() + traced = torch.fx.symbolic_trace(mod) + + seen_getitem = False + + def split_callback(n): + nonlocal seen_getitem + split_idx = int(seen_getitem) + if n.target == operator.getitem: + seen_getitem = True + return split_idx + + split = split_module(traced, mod, split_callback) + + x = torch.randn(5, 3) + foo = torch.randn(5, 3) + torch.testing.assert_allclose(split(x, foo=foo), traced(x, foo=foo)) + @skipIfNoTorchVision def test_subgraph_trivial_resnet(self): # Smoke test trivially splitting resnet into 1 partition works @@ -1125,6 +1191,47 @@ def split_cb(node: torch.fx.Node): module_with_submodule = split_module(traced, mm, split_cb) self.assertEqual(module_with_submodule(a, b, c, d), traced(a, b, c, d)) + def test_split_qualname_mapping(self): + d_hid = 4 + + class ExampleCode(torch.nn.Module): + def __init__(self): + super().__init__() + self.mm_param = torch.nn.Parameter(torch.randn(d_hid, d_hid)) + self.mm_param2 = torch.nn.Parameter(torch.randn(d_hid, d_hid)) + self.lin = torch.nn.Linear(d_hid, d_hid) + + def forward(self, x): + x = torch.mm(x, self.mm_param) + x = torch.relu(x) + x = torch.mm(x, self.mm_param) + x = self.lin(x) + x = torch.relu(x) + x = torch.mm(x, self.mm_param2) + x = self.lin(x) + return x + + my_module = ExampleCode() + my_module_traced = symbolic_trace(my_module) + + part_idx = 0 + + def split_callback(n : torch.fx.Node): + nonlocal part_idx + if (n.op, n.target) == ('call_module', 'lin'): + part_idx += 1 + return part_idx + + # split module in module with submodules + qualname_map : Dict[str, str] = {} + module_with_submodules = split_module( + my_module_traced, my_module, split_callback, qualname_map + ) + expected_qualname_map = { + 'submod_1.lin': 'lin', 'submod_2.lin': 'lin' + } + self.assertEqual(qualname_map, expected_qualname_map) + def test_traceable_function_with_nonstandard_name(self): def foo(x): return torch.relu(x) @@ -1454,101 +1561,6 @@ class TestNormalizeOperators(JitTestCase): @onlyCPU @ops(op_db, allowed_dtypes=(torch.float,)) def test_normalize_operator_exhaustive(self, device, dtype, op): - # Sorted and one entry on each line to minimize merge conflicts. 
- op_skip = { - # See: https://github.com/pytorch/pytorch/issues/64997 - "as_strided", - "block_diag", - "broadcast_tensors", - "cartesian_prod", - "contiguous", - "einsum", - "expand", - "expand_as", - "fill_", - "T", # Implemented with a lambda - "H", # Implemented with a lambda - "mT", # Implemented with a lambda - "mH", # Implemented with a lambda - "gradient", - "histogramdd", - "igamma", - "igammac", - "index_put", - "nn.functional.conv2d", - "nn.functional.dropout", - "nn.functional.dropout2d", - "nn.functional.embedding", # Implemented with a lambda - "nn.functional.embedding_bag", # Implemented with a lambda - "nn.functional.rrelu", # Implemented with a lambda - "nn.functional.feature_alpha_dropout", # Implemented with a lambda - "nonzero", - "polygamma", - "special.polygamma", - "repeat", - "reshape_as", - "resize_", - "resize_as_", - "special.zeta", - "sum_to_size", - "to_sparse", - "unique", - "unique_consecutive", - "view", - "view_as", - "unfold", - "where", - "zero_", - 'bfloat16', - 'bool', - 'byte', - 'char', - 'double', - 'float', - 'half', - 'int', - 'long', - 'short', - 'empty_like', - 'ones_like', - 'randn_like', - 'zeros_like', - 'full_like', - 'rand_like', - 'randint_like', - 'new_ones', - 'new_empty', - 'new_zeros', - 'new_full', - 'normal', - 'multinomial', - 'bernoulli', - "__getitem__", - "__radd__", - "__rsub__", - "__rmul__", - "__rdiv__", - "__rmod__", - "__rpow__", - '__rand__', - '__ror__', - '__rxor__', - "__rmatmul__", - "atleast_1d", - "atleast_2d", - "atleast_3d", - "svd_lowrank", # implemented with a lambda - "pca_lowrank", # implemented with a lambda - "column_stack", - } - - # Unsupported input types - if op.name in op_skip: - return - - if op.name.startswith('_masked.'): - return - # These ops currently don't trace in FX for various reasons (i.e. 
they take a list of tensors) fx_fail = {"cat", "stack", "hstack", "vstack", "dstack", "linalg.multi_dot"} sample_inputs_itr = op.sample_inputs(device, dtype, requires_grad=False) @@ -1650,6 +1662,40 @@ def forward(self, {', '.join(param_names)}): test_out = traced(*param_values) self.assertEqual(test_out, ref_out) + def test_normalize_quantized_eb(self): + target = torch.ops.quantized.embedding_bag_byte_rowwise_offsets + args = ( + torch.empty((2, 3), dtype=torch.uint8), + torch.empty((2,), dtype=torch.int64), + torch.empty((2,), dtype=torch.int64), + ) + norm_args_and_kwargs = normalize_function( + target, args, normalize_to_only_use_kwargs=True + ) + self.assertTrue(norm_args_and_kwargs is not None) + self.assertEqual( + set(norm_args_and_kwargs.kwargs.keys()), + { + "weight", + "indices", + "offsets", + "scale_grad_by_freq", + "mode", + "pruned_weights", + "per_sample_weights", + "compressed_indices_mapping", + "include_last_offset", + }, + ) + self.assertEqual(norm_args_and_kwargs.args, tuple()) + + def test_normalize_args_op_overload(self): + for target in [torch.ops.aten.resize_as_.default, torch.ops.aten.resize_as_]: + inp1 = torch.rand([1]) + inp2 = torch.rand([4]) + args, kwargs = normalize_function(target, (inp1,), {"the_template": inp2}, normalize_to_only_use_kwargs=True) + self.assertIs(kwargs["input"], inp1) + self.assertIs(kwargs["the_template"], inp2) instantiate_device_type_tests(TestNormalizeOperators, globals()) diff --git a/test/test_hub.py b/test/test_hub.py new file mode 100644 index 000000000000..662a2cf9771e --- /dev/null +++ b/test/test_hub.py @@ -0,0 +1,256 @@ +# Owner(s): ["module: hub"] + +import unittest +from unittest.mock import patch +import os +import tempfile +import warnings + +import torch +import torch.hub as hub +from torch.testing._internal.common_utils import retry, IS_SANDCASTLE, TestCase + + +def sum_of_state_dict(state_dict): + s = 0 + for _, v in state_dict.items(): + s += v.sum() + return s + + +SUM_OF_HUB_EXAMPLE = 431080 +TORCHHUB_EXAMPLE_RELEASE_URL = 'https://github.com/ailzhang/torchhub_example/releases/download/0.1/mnist_init_ones' + + +@unittest.skipIf(IS_SANDCASTLE, 'Sandcastle cannot ping external') +class TestHub(TestCase): + + def setUp(self): + super().setUp() + self.previous_hub_dir = torch.hub.get_dir() + self.tmpdir = tempfile.TemporaryDirectory('hub_dir') + torch.hub.set_dir(self.tmpdir.name) + self.trusted_list_path = os.path.join(torch.hub.get_dir(), "trusted_list") + + def tearDown(self): + super().tearDown() + torch.hub.set_dir(self.previous_hub_dir) # probably not needed, but can't hurt + self.tmpdir.cleanup() + + def _assert_trusted_list_is_empty(self): + with open(self.trusted_list_path) as f: + assert not f.readlines() + + def _assert_in_trusted_list(self, line): + with open(self.trusted_list_path) as f: + assert line in (l.strip() for l in f.readlines()) + + @retry(Exception, tries=3) + def test_load_from_github(self): + hub_model = hub.load('ailzhang/torchhub_example', 'mnist', source='github', pretrained=True, verbose=False) + self.assertEqual(sum_of_state_dict(hub_model.state_dict()), SUM_OF_HUB_EXAMPLE) + + @retry(Exception, tries=3) + def test_load_from_local_dir(self): + local_dir = hub._get_cache_or_reload( + 'ailzhang/torchhub_example', + force_reload=False, + trust_repo=True, + calling_fn=None + ) + hub_model = hub.load(local_dir, 'mnist', source='local', pretrained=True, verbose=False) + self.assertEqual(sum_of_state_dict(hub_model.state_dict()), SUM_OF_HUB_EXAMPLE) + + @retry(Exception, tries=3) + def 
test_load_from_branch(self): + hub_model = hub.load('ailzhang/torchhub_example:ci/test_slash', 'mnist', pretrained=True, verbose=False) + self.assertEqual(sum_of_state_dict(hub_model.state_dict()), SUM_OF_HUB_EXAMPLE) + + @retry(Exception, tries=3) + def test_get_set_dir(self): + previous_hub_dir = torch.hub.get_dir() + with tempfile.TemporaryDirectory('hub_dir') as tmpdir: + torch.hub.set_dir(tmpdir) + self.assertEqual(torch.hub.get_dir(), tmpdir) + self.assertNotEqual(previous_hub_dir, tmpdir) + + hub_model = hub.load('ailzhang/torchhub_example', 'mnist', pretrained=True, verbose=False) + self.assertEqual(sum_of_state_dict(hub_model.state_dict()), SUM_OF_HUB_EXAMPLE) + assert os.path.exists(os.path.join(tmpdir, 'ailzhang_torchhub_example_master')) + + # Test that set_dir properly calls expanduser() + # non-regression test for https://github.com/pytorch/pytorch/issues/69761 + new_dir = os.path.join("~", "hub") + torch.hub.set_dir(new_dir) + self.assertEqual(torch.hub.get_dir(), os.path.expanduser(new_dir)) + + @retry(Exception, tries=3) + def test_list_entrypoints(self): + entry_lists = hub.list('ailzhang/torchhub_example', trust_repo=True) + self.assertObjectIn('mnist', entry_lists) + + @retry(Exception, tries=3) + def test_download_url_to_file(self): + with tempfile.TemporaryDirectory() as tmpdir: + f = os.path.join(tmpdir, 'temp') + hub.download_url_to_file(TORCHHUB_EXAMPLE_RELEASE_URL, f, progress=False) + loaded_state = torch.load(f) + self.assertEqual(sum_of_state_dict(loaded_state), SUM_OF_HUB_EXAMPLE) + + @retry(Exception, tries=3) + def test_load_state_dict_from_url(self): + loaded_state = hub.load_state_dict_from_url(TORCHHUB_EXAMPLE_RELEASE_URL) + self.assertEqual(sum_of_state_dict(loaded_state), SUM_OF_HUB_EXAMPLE) + + # with name + file_name = "the_file_name" + loaded_state = hub.load_state_dict_from_url(TORCHHUB_EXAMPLE_RELEASE_URL, file_name=file_name) + expected_file_path = os.path.join(torch.hub.get_dir(), 'checkpoints', file_name) + self.assertTrue(os.path.exists(expected_file_path)) + self.assertEqual(sum_of_state_dict(loaded_state), SUM_OF_HUB_EXAMPLE) + + @retry(Exception, tries=3) + def test_load_legacy_zip_checkpoint(self): + with warnings.catch_warnings(record=True) as ws: + warnings.simplefilter("always") + hub_model = hub.load('ailzhang/torchhub_example', 'mnist_zip', pretrained=True, verbose=False) + self.assertEqual(sum_of_state_dict(hub_model.state_dict()), SUM_OF_HUB_EXAMPLE) + assert any("will be deprecated in favor of default zipfile" in str(w) for w in ws) + + # Test the default zipfile serialization format produced by >=1.6 release. 
+ @retry(Exception, tries=3) + def test_load_zip_1_6_checkpoint(self): + hub_model = hub.load( + 'ailzhang/torchhub_example', + 'mnist_zip_1_6', + pretrained=True, + verbose=False, + trust_repo=True + ) + self.assertEqual(sum_of_state_dict(hub_model.state_dict()), SUM_OF_HUB_EXAMPLE) + + @retry(Exception, tries=3) + def test_hub_parse_repo_info(self): + # If the branch is specified we just parse the input and return + self.assertEqual( + torch.hub._parse_repo_info('a/b:c'), + ('a', 'b', 'c') + ) + # For torchvision, the default branch is main + self.assertEqual( + torch.hub._parse_repo_info('pytorch/vision'), + ('pytorch', 'vision', 'main') + ) + # For the torchhub_example repo, the default branch is still master + self.assertEqual( + torch.hub._parse_repo_info('ailzhang/torchhub_example'), + ('ailzhang', 'torchhub_example', 'master') + ) + + @retry(Exception, tries=3) + def test_load_commit_from_forked_repo(self): + with self.assertRaisesRegex(ValueError, 'If it\'s a commit from a forked repo'): + torch.hub.load('pytorch/vision:4e2c216', 'resnet18') + + @retry(Exception, tries=3) + @patch('builtins.input', return_value='') + def test_trust_repo_false_emptystring(self, patched_input): + with self.assertRaisesRegex(Exception, 'Untrusted repository.'): + torch.hub.load('ailzhang/torchhub_example', 'mnist_zip_1_6', trust_repo=False) + self._assert_trusted_list_is_empty() + patched_input.assert_called_once() + + patched_input.reset_mock() + with self.assertRaisesRegex(Exception, 'Untrusted repository.'): + torch.hub.load('ailzhang/torchhub_example', 'mnist_zip_1_6', trust_repo=False) + self._assert_trusted_list_is_empty() + patched_input.assert_called_once() + + @retry(Exception, tries=3) + @patch('builtins.input', return_value='no') + def test_trust_repo_false_no(self, patched_input): + with self.assertRaisesRegex(Exception, 'Untrusted repository.'): + torch.hub.load('ailzhang/torchhub_example', 'mnist_zip_1_6', trust_repo=False) + self._assert_trusted_list_is_empty() + patched_input.assert_called_once() + + patched_input.reset_mock() + with self.assertRaisesRegex(Exception, 'Untrusted repository.'): + torch.hub.load('ailzhang/torchhub_example', 'mnist_zip_1_6', trust_repo=False) + self._assert_trusted_list_is_empty() + patched_input.assert_called_once() + + @retry(Exception, tries=3) + @patch('builtins.input', return_value='y') + def test_trusted_repo_false_yes(self, patched_input): + torch.hub.load('ailzhang/torchhub_example', 'mnist_zip_1_6', trust_repo=False) + self._assert_in_trusted_list("ailzhang_torchhub_example") + patched_input.assert_called_once() + + # Loading a second time with "check", we don't ask for user input + patched_input.reset_mock() + torch.hub.load('ailzhang/torchhub_example', 'mnist_zip_1_6', trust_repo="check") + patched_input.assert_not_called() + + # Loading again with False, we still ask for user input + patched_input.reset_mock() + torch.hub.load('ailzhang/torchhub_example', 'mnist_zip_1_6', trust_repo=False) + patched_input.assert_called_once() + + @retry(Exception, tries=3) + @patch('builtins.input', return_value='no') + def test_trust_repo_check_no(self, patched_input): + with self.assertRaisesRegex(Exception, 'Untrusted repository.'): + torch.hub.load('ailzhang/torchhub_example', 'mnist_zip_1_6', trust_repo="check") + self._assert_trusted_list_is_empty() + patched_input.assert_called_once() + + patched_input.reset_mock() + with self.assertRaisesRegex(Exception, 'Untrusted repository.'): + torch.hub.load('ailzhang/torchhub_example', 'mnist_zip_1_6', 
trust_repo="check") + patched_input.assert_called_once() + + @retry(Exception, tries=3) + @patch('builtins.input', return_value='y') + def test_trust_repo_check_yes(self, patched_input): + torch.hub.load('ailzhang/torchhub_example', 'mnist_zip_1_6', trust_repo="check") + self._assert_in_trusted_list("ailzhang_torchhub_example") + patched_input.assert_called_once() + + # Loading a second time with "check", we don't ask for user input + patched_input.reset_mock() + torch.hub.load('ailzhang/torchhub_example', 'mnist_zip_1_6', trust_repo="check") + patched_input.assert_not_called() + + @retry(Exception, tries=3) + def test_trust_repo_true(self): + torch.hub.load('ailzhang/torchhub_example', 'mnist_zip_1_6', trust_repo=True) + self._assert_in_trusted_list("ailzhang_torchhub_example") + + @retry(Exception, tries=3) + def test_trust_repo_builtin_trusted_owners(self): + torch.hub.load('pytorch/vision', 'resnet18', trust_repo="check") + self._assert_trusted_list_is_empty() + + @retry(Exception, tries=3) + def test_trust_repo_none(self): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + torch.hub.load('ailzhang/torchhub_example', 'mnist_zip_1_6', trust_repo=None) + assert len(w) == 1 + assert issubclass(w[-1].category, UserWarning) + assert "You are about to download and run code from an untrusted repository" in str(w[-1].message) + + self._assert_trusted_list_is_empty() + + @retry(Exception, tries=3) + def test_trust_repo_legacy(self): + # We first download a repo and then delete the allowlist file + # Then we check that the repo is indeed trusted without a prompt, + # because it was already downloaded in the past. + torch.hub.load('ailzhang/torchhub_example', 'mnist_zip_1_6', trust_repo=True) + os.remove(self.trusted_list_path) + + torch.hub.load('ailzhang/torchhub_example', 'mnist_zip_1_6', trust_repo="check") + + self._assert_trusted_list_is_empty() diff --git a/test/test_indexing.py b/test/test_indexing.py index 42ffa8ab24e8..9d2d82e9f12a 100644 --- a/test/test_indexing.py +++ b/test/test_indexing.py @@ -692,7 +692,7 @@ def test_bool_indices(self, device): self.assertEqual(v[boolIndices].shape, v[uint8Indices].shape) self.assertEqual(v[boolIndices], v[uint8Indices]) self.assertEqual(v[boolIndices], tensor([True], dtype=torch.bool, device=device)) - self.assertEquals(len(w), 2) + self.assertEqual(len(w), 2) def test_bool_indices_accumulate(self, device): mask = torch.zeros(size=(10, ), dtype=torch.bool, device=device) @@ -713,7 +713,7 @@ def test_byte_mask(self, device): with warnings.catch_warnings(record=True) as w: self.assertEqual(v[mask].shape, (3, 7, 3)) self.assertEqual(v[mask], torch.stack([v[0], v[2], v[3]])) - self.assertEquals(len(w), 2) + self.assertEqual(len(w), 2) v = torch.tensor([1.], device=device) self.assertEqual(v[v == 0], torch.tensor([], device=device)) @@ -725,7 +725,7 @@ def test_byte_mask_accumulate(self, device): warnings.simplefilter("always") y.index_put_((mask, ), y[mask], accumulate=True) self.assertEqual(y, torch.ones(size=(10, 10), device=device)) - self.assertEquals(len(w), 2) + self.assertEqual(len(w), 2) def test_index_put_accumulate_large_tensor(self, device): # This test is for tensors with number of elements >= INT_MAX (2^31 - 1). 
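For reference, a brief, hedged usage sketch of the trust_repo argument that the torch.hub tests above exercise; this is illustrative only, and the repo/entrypoint names are the same ones those tests use.

import torch

# Trust the repository explicitly: no interactive prompt is shown.
model = torch.hub.load('pytorch/vision', 'resnet18', trust_repo=True)

# Prompt only when the repository is not already trusted; repos from built-in
# trusted owners (such as pytorch) are accepted without prompting.
model = torch.hub.load('pytorch/vision', 'resnet18', trust_repo="check")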
@@ -818,6 +818,9 @@ def test_index_put_accumulate_non_contiguous(self, device): value = torch.randn(2, 2) out_cuda = t1.index_put_(indices_dev, value.to(device), accumulate=True) out_cpu = t2.index_put_(indices, value, accumulate=True) + self.assertTrue(not t1.is_contiguous()) + self.assertTrue(not t2.is_contiguous()) + self.assertEqual(out_cuda.cpu(), out_cpu) @onlyCUDA @@ -876,7 +879,7 @@ def test_multiple_byte_mask(self, device): with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") self.assertEqual(v[mask1, :, mask2].shape, (3, 7)) - self.assertEquals(len(w), 2) + self.assertEqual(len(w), 2) def test_byte_mask2d(self, device): v = torch.randn(5, 7, 3, device=device) @@ -1130,7 +1133,7 @@ def test_byte_tensor_assignment(self, device): with warnings.catch_warnings(record=True) as w: x[b] = value - self.assertEquals(len(w), 1) + self.assertEqual(len(w), 1) self.assertEqual(x[0], value) self.assertEqual(x[1], torch.arange(4., 8, device=device)) diff --git a/test/test_jit.py b/test/test_jit.py index 37cd9b5d53c0..e585471a413d 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -17,6 +17,7 @@ from jit.test_data_parallel import TestDataParallel # noqa: F401 from jit.test_models import TestModels # noqa: F401 from jit.test_modules import TestModules # noqa: F401 +from jit.test_autodiff import TestAutodiffJit # noqa: F401 from jit.test_autodiff_subgraph_slicing import TestAutodiffSubgraphSlicing # noqa: F401 from jit.test_custom_operators import TestCustomOperators # noqa: F401 from jit.test_export_modes import TestExportModes # noqa: F401 @@ -25,12 +26,12 @@ from jit.test_builtins import TestBuiltins, TestTensorBuiltins # noqa: F401 from jit.test_ignore_context_manager import TestIgnoreContextManager # noqa: F401 from jit.test_symbolic_shape_analysis import TestSymbolicShapeAnalysis # noqa: F401 -from jit.test_if_hoisting import TestIfHoisting # noqa: F401 +from jit.test_op_decompositions import TestOpDecompositions # noqa: F401 from jit.test_unsupported_ops import TestUnsupportedOps # noqa: F401 from jit.test_freezing import TestFreezing, TestFrozenOptimizations, TestMKLDNNReinplacing # noqa: F401 from jit.test_peephole import TestPeephole # noqa: F401 from jit.test_alias_analysis import TestAliasAnalysis # noqa: F401 -from jit.test_save_load import TestSaveLoad # noqa: F401 +from jit.test_save_load import TestSaveLoad, TestSaveLoadFlatbuffer # noqa: F401 from jit.test_save_load_for_op_version import TestSaveLoadForOpVersion # noqa: F401 from jit.test_module_containers import TestModuleContainers # noqa: F401 from jit.test_python_bindings import TestPythonBindings # noqa: F401 @@ -76,6 +77,7 @@ from jit.test_device_analysis import TestDeviceAnalysis # noqa: F401 from jit.test_dce import TestDCE # noqa: F401 from jit.test_sparse import TestSparse # noqa: F401 +from jit.test_tensor_methods import TestTensorMethods # noqa: F401 # Torch from torch import Tensor @@ -98,18 +100,19 @@ from torch.testing._internal.common_utils import run_tests, IS_WINDOWS, TEST_WITH_UBSAN, \ suppress_warnings, BUILD_WITH_CAFFE2, IS_SANDCASTLE, GRAPH_EXECUTOR, ProfilingMode, TestCase, \ freeze_rng_state, slowTest, TemporaryFileName, skipIfCompiledWithoutNumpy, \ - enable_profiling_mode_for_profiling_tests, TEST_MKL, set_default_dtype, num_profiled_runs + enable_profiling_mode_for_profiling_tests, TEST_MKL, set_default_dtype, num_profiled_runs, \ + skipIfCrossRef, IS_MACOS from torch.testing._internal.jit_utils import JitTestCase, enable_cpu_fuser, disable_autodiff_subgraph_inlining, \ 
_trace, do_input_map, get_execution_plan, make_global, \ execWrapper, _inline_everything, _tmp_donotuse_dont_inline_everything, \ RUN_CUDA -from torch.testing._internal.jit_metaprogramming_utils import create_script_fn, nn_functional_tests, get_script_args, \ - EXCLUDE_SCRIPT, additional_module_tests, EXCLUDE_SCRIPT_MODULES, \ - get_nn_module_name_from_kwargs, get_nn_mod_test_name, script_method_template +from torch.testing._internal.jit_metaprogramming_utils import ( + get_script_args, + create_input, unpack_variables, + additional_module_tests, EXCLUDE_SCRIPT_MODULES, + get_nn_module_name_from_kwargs, get_nn_mod_test_name, script_method_template) from torch.testing._internal.common_nn import module_tests, new_module_tests, criterion_tests -from torch.testing._internal.common_methods_invocations import ( - create_input, unpack_variables) # For testing truediv in python 2 from torch.testing._internal.test_module.future_div import div_int_future, div_float_future @@ -203,11 +206,6 @@ def doAutodiffCheck(testname): # TODO: enable TE in PE when all tests are fixed torch._C._jit_set_texpr_fuser_enabled(GRAPH_EXECUTOR == ProfilingMode.PROFILING) torch._C._jit_set_profiling_executor(GRAPH_EXECUTOR != ProfilingMode.LEGACY) -# even though FULL_PROFILER should be our default -# we haven't tested every single test in this file -# but we enable FULL_PROFILER for a large subset -# of the tests with "with enable_profiling_mode_for_profiling_tests" -torch._C._jit_set_profiling_mode(False) def LSTMCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None): hx, cx = hidden @@ -969,6 +967,56 @@ def forward(self, input): m_dropout.eval() self.assertEqual(dropout(input) + 1, m_dropout(input)) + def test_nn_lp_pool2d(self): + class Mod(torch.nn.Module): + def __init__(self): + super().__init__() + self.l = torch.nn.LPPool2d(2, 3) + self.n = torch.nn.LPPool2d(2, (7, 1)) + + def forward(self, x): + return (self.l(x), + self.n(x), + torch.nn.functional.lp_pool2d(x, float(2), 3), + torch.nn.functional.lp_pool2d(x, 2, 3), + torch.nn.functional.lp_pool2d(x, float(2), (7, 1))) + + self.checkModule(Mod(), (torch.rand(1, 3, 7, 7),)) + + def test_nn_lp_pool1d(self): + class Mod(torch.nn.Module): + def __init__(self): + super().__init__() + self.l = torch.nn.LPPool1d(2, 3) + self.n = torch.nn.LPPool1d(2, 7) + + def forward(self, x): + return (self.l(x), + self.n(x), + torch.nn.functional.lp_pool1d(x, float(2), 3), + torch.nn.functional.lp_pool1d(x, 2, 3), + torch.nn.functional.lp_pool1d(x, float(2), 7)) + + self.checkModule(Mod(), (torch.rand(1, 3, 7),)) + + def test_nn_padding_functional(self): + class Mod(nn.Module): + def __init__(self, *pad): + super().__init__() + self.pad = pad + + def forward(self, x): + return F.pad(x, self.pad, mode='constant', value=3.5) + + inputs = [ + (Mod(1, 2), torch.randn(1, 3, 4)), # 1D + (Mod(1, 2, 3, 4), torch.randn(1, 3, 4)), # 2D + (Mod(1, 2, 3, 4, 5, 6), torch.randn(1, 3, 4)), # 3D + ] + + for m, inp in inputs: + self.checkModule(m, (inp,)) + def test_nn_padding(self): class Mod(nn.Module): def __init__(self, padding): @@ -1640,6 +1688,73 @@ def doit(x, y): for node in g.nodes(): self.assertTrue(g2.findNode(node.kind()) is not None) + def test_permute_inputs_binding(self): + @torch.jit.script + def foo(i, j, k): + pass + + g = foo.graph + + idxs = [] + for i, inp in enumerate(g.inputs()): + inp.setDebugName(f"inp{i}") + idxs.append(i) + + permuted_idxs = list(np.random.permutation(idxs)) + g.permuteInputs(permuted_idxs) + for i, inp in enumerate(g.inputs()): + 
self.assertEqual(f"inp{permuted_idxs[i]}", inp.debugName()) + + @unittest.skipIf(IS_MACOS, "Failing on MacOS only") + def test_python_ir_utils(self): + @torch.jit.script + def foo(inp): + x = inp + 1 + y = x / 2 + z = y * y + return z + + add_node = foo.graph.findNode("aten::add") + div_node = foo.graph.findNode("aten::div") + + with foo.graph.insert_point_guard(add_node): + with foo.graph.insert_point_guard(div_node): + foo.graph.insertConstant("goodbye") + foo.graph.insertConstant("hello") + with foo.graph.insert_point_guard(foo.graph.findNode("aten::mul")): + foo.graph.insertConstant("hello") + FileCheck().check("hello").check("goodbye").check("hello").run(foo.graph) + + self.assertTrue(add_node.matches(add_node.schema())) + self.assertFalse(add_node.matches(div_node.schema())) + + def test_python_ir_utils_graph(self): + @torch.jit.script + def unrolled_mul(x: torch.Tensor, y: int): + out = x + for _ in range(y - 1): + out = out + x + return out + + @torch.jit.script + def foo(x): + return x * 4 + + g = foo.graph + muls = g.findAllNodes("aten::mul") + scalar_muls = filter(lambda x: x.matches("aten::mul(Tensor self, Scalar other) -> Tensor"), muls) + mul_constant_int = filter(lambda x: isinstance(list(x.inputs())[1].toIValue(), int), scalar_muls) + for mul in mul_constant_int: + with g.insert_point_guard(mul): + outputs = g.insertGraph(unrolled_mul.graph, list(mul.inputs())) + assert len(outputs) == len(list(mul.outputs())) + for new_out, old_out in zip(outputs, g.outputs()): + old_out.replaceAllUsesWith(new_out) + mul.destroy() + + FileCheck().check_not("aten::mul").check("aten::add").run(foo.graph) + self.assertEqual(foo(torch.ones([2, 2])), torch.ones([2, 2]) * 4) + @unittest.skipIf(IS_SANDCASTLE, "gtest runs these in sandcastle") @unittest.skipIf(RUN_CUDA, "covered by test_cpp_cuda") @unittest.skipIf(not torch._C._jit_has_cpp_tests(), "Tests were not built, use BUILD_TEST=1") @@ -1868,8 +1983,8 @@ def equation_format_varargs(x, y): def sublist_format(x, y): return torch.einsum(x, [0], y, [1], [0, 1]) - x = make_tensor((5,), 'cpu', torch.float32) - y = make_tensor((10,), 'cpu', torch.float32) + x = make_tensor((5,), dtype=torch.float32, device="cpu") + y = make_tensor((10,), dtype=torch.float32, device="cpu") for fn in [equation_format, equation_format_varargs, sublist_format]: check(fn, torch.jit.script(fn), x, y) @@ -4310,6 +4425,7 @@ def foo(xyz): fc.run(scripted.foo.graph) fc.run(str(scripted.foo.graph)) + @skipIfCrossRef def test_file_line_trace(self): def foobar(xyz): return torch.neg(xyz) @@ -4450,6 +4566,7 @@ def debug_records_from_mod(mod): debug_files = filter(lambda f: f.endswith('.debug_pkl'), files) debug_files = (archive.open(f) for f in debug_files) debug_files = (pickle.load(f) for f in debug_files) + debug_files = (f[2] for f in debug_files) return list(debug_files) debug_files = debug_records_from_mod(ft3) @@ -5666,12 +5783,7 @@ def test_fuser_double_float_codegen(self): 'frac'] def lookup_c_equivalent_fn(aten_fn): - if aten_fn == 'min': - return 'fmin' - elif aten_fn == 'max': - return 'fmax' - else: - return aten_fn + return aten_fn def test_dispatch(op, expects, dtype, binary=False): if dtype == torch.double: @@ -5705,7 +5817,9 @@ def test_dispatch(op, expects, dtype, binary=False): test_dispatch(fn, lookup_c_equivalent_fn(fn) + '(', torch.double) test_dispatch(fn, lookup_c_equivalent_fn(fn) + 'f(', torch.float) - binary_fns = ['min', 'max', 'pow'] + # 'min', 'max' were previously tested but are now replaced with ternary expressions + # instead of fmin() and 
fmax() + binary_fns = ['pow'] for fn in binary_fns: test_dispatch(fn, lookup_c_equivalent_fn(fn) + '(', torch.double, binary=True) test_dispatch(fn, lookup_c_equivalent_fn(fn) + 'f(', torch.float, binary=True) @@ -6617,6 +6731,13 @@ def forward(self) -> torch.Tensor: self.assertEqual(model1.forward(), script_model_1.forward()) self.assertEqual(model2.forward(), script_model_2.forward()) + def test_ternary_right_associative(self): + def plus_123(x: int): + return x + 1 if x == 1 else x + 2 if x == 2 else x + 3 + self.checkScript(plus_123, (1,)) + self.checkScript(plus_123, (2,)) + self.checkScript(plus_123, (3,)) + def test_print(self): def func(x, y): q = (x + y).sigmoid() @@ -7256,7 +7377,7 @@ def test_as_tensor_tensor_input(input): g = test_as_tensor_tensor_input.graph_for(torch.ones(3, 4)) FileCheck().check("Tensor = aten::as_tensor").check("Float(*, *, requires_grad=0, device=cpu) = aten::as_tensor").run(g) - + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.LEGACY, "testing legacy behavior") def test_tensor_requires_grad(self): @torch.jit.script def test(b): @@ -8162,6 +8283,44 @@ def test_irparser(self): """ FileCheck().run(graph_str, parse_ir(graph_str)) + def test_parse_tensor_constants(self): + def foo(): + return torch.zeros([4, 4]) + + foo_s = torch.jit.script(foo) + torch._C._jit_pass_constant_propagation(foo_s.graph) + + g = str(foo_s.graph) + g_parsed = parse_ir(g, parse_tensor_constants=True) + self.assertEqual(str(canonical(g_parsed)), str(canonical(foo_s.graph))) + func = torch._C._create_function_from_graph("forward", g_parsed) + + out_parsed = func() + out_func = foo() + # not checking data, just dtype, size etc + out_parsed[:] = 0 + out_func[:] = 0 + self.assertEqual(out_func, out_parsed) + + with self.assertRaises(RuntimeError): + parse_ir(g, parse_tensor_constants=False) + + def test_parse_nested_names(self): + g_str = """ + graph(%x.1 : Tensor): + %3 : int = prim::Constant[value=1]() + %2 : int = prim::Constant[value=2]() + %hi.submod.value.5 : Tensor = aten::add(%x.1, %2, %3) + return (%hi.submod.value.5) + """ + g = parse_ir(g_str) + round_trip_g = parse_ir(str(g)) + self.assertEqual(canonical(g), canonical(round_trip_g)) + + func1 = torch._C._create_function_from_graph("forward", g) + func2 = torch._C._create_function_from_graph("forward", round_trip_g) + self.assertEqual(func1(torch.ones([2])), func2(torch.ones([2]))) + def test_is_after_use(self): def sorted_input_use(g): uses = list(next(g.inputs()).uses()) @@ -10286,7 +10445,7 @@ def fn(x): self.assertTrue(n.type() == torch._C.TensorType.getInferred()) with self.assertRaisesRegex(RuntimeError, "Inferred \'x\' to be of type \'Tensor"): - fn(1) + fn("1") def test_script_define_order(self): class M(torch.jit.ScriptModule): @@ -10991,6 +11150,26 @@ def randint(): FileCheck().check("Double(*, *, requires_grad=0, device=cpu)") \ .check_not("Float(*, *, requires_grad=0, device=cpu)").run(randint.graph_for()) + @unittest.skipIf(not RUN_CUDA, "no CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "skip if profiling isn't enabled") + def test_autodiff_complex(self): + def foo(x: torch.Tensor, y: torch.Tensor, W: torch.Tensor): + return torch.exp(torch.mm(torch.complex(x, y), W.cfloat())) + + @torch.jit.script + def jitted_foo(x: torch.Tensor, y: torch.Tensor, W: torch.Tensor): + return torch.exp(torch.mm(torch.complex(x, y), W.cfloat())) + + x = torch.randn(128, 16, dtype=torch.float32, device='cuda:0') + y = torch.randn(128, 16, dtype=torch.float32, device='cuda:0') + W = torch.randn(16, 1, 
dtype=torch.float32, device='cuda:0', requires_grad=True) + W.data /= 4 + + with enable_profiling_mode_for_profiling_tests(): + for i in range(4): + self.assertTrue((foo(x, y, W).grad_fn is None) == (jitted_foo(x, y, W).grad_fn is None)) + + def test_linear_grad(self): with enable_profiling_mode_for_profiling_tests(): def t(x: torch.Tensor, w: torch.Tensor, b: Optional[torch.Tensor]): @@ -11090,6 +11269,21 @@ def func(a): self.run_pass("erase_number_types", graph) FileCheck().check_not("int = prim::Constant").run(str(graph)) + def test_refine_tuple_types(self): + # TupleConstruct output type is not correct here. + graph_str = """ + graph(%a : Float(123), %b : Float(4, 5, 6)): + %c : (Tensor, Tensor) = prim::TupleConstruct(%a, %b) + return (%c) + """ + graph = parse_ir(graph_str) + torch._C._jit_pass_refine_tuple_types(graph) + + # After the pass, the output type should've been updated. + self.assertTrue('(Float(123), Float(4, 5, 6))' in str(graph.findNode('prim::TupleConstruct').output())) + + # TODO(henrytu): Add test for RefineTypes for NamedTuple when it's supported by IR parser. + def test_remove_dropout(self): weight_0_shape = (20, 5) weight_1_shape = (20, 20) @@ -13013,153 +13207,6 @@ def func(niter): self.checkScript(dedent(code), (101,)) - def test_pyop_exception_message(self): - class Foo(torch.jit.ScriptModule): - def __init__(self): - super(Foo, self).__init__() - self.conv = nn.Conv2d(1, 10, kernel_size=5) - - @torch.jit.script_method - def forward(self, x): - return self.conv(x) - foo = Foo() - # testing that the correct error message propagates - with self.assertRaisesRegex(RuntimeError, r"Expected 3D \(unbatched\) or 4D \(batched\) input to conv2d"): - foo(torch.ones([123])) # wrong size - - def test_builtin_error_messsage(self): - with self.assertRaisesRegex(RuntimeError, "Arguments for call are not valid"): - @torch.jit.script - def close_match(x): - return x.masked_fill(True) - - with self.assertRaisesRegex(RuntimeError, "This op may not exist or may not be currently " - "supported in TorchScript"): - @torch.jit.script - def unknown_op(x): - torch.set_anomaly_enabled(True) - return x - - def test_exceptions(self): - cu = torch.jit.CompilationUnit(''' - def foo(cond): - if bool(cond): - raise ValueError(3) - return 1 - ''') - - cu.foo(torch.tensor(0)) - with self.assertRaisesRegex(torch.jit.Error, "3"): - cu.foo(torch.tensor(1)) - - def foo(cond): - a = 3 - if bool(cond): - raise ArbitraryError(a, "hi") - if 1 == 2: - raise ArbitraryError - return a - - with self.assertRaisesRegex(RuntimeError, "undefined value ArbitraryError"): - torch.jit.script(foo) - - def exception_as_value(): - a = Exception() - print(a) - - with self.assertRaisesRegex(RuntimeError, "cannot be used as a value"): - torch.jit.script(exception_as_value) - - @torch.jit.script - def foo_no_decl_always_throws(): - raise RuntimeError("Hi") - - # function that has no declared type but always throws set to None - output_type = next(foo_no_decl_always_throws.graph.outputs()).type() - self.assertTrue(str(output_type) == "NoneType") - - @torch.jit.script - def foo_decl_always_throws(): - # type: () -> Tensor - raise Exception("Hi") - - output_type = next(foo_decl_always_throws.graph.outputs()).type() - self.assertTrue(str(output_type) == "Tensor") - - def foo(): - raise 3 + 4 - - with self.assertRaisesRegex(RuntimeError, "must derive from BaseException"): - torch.jit.script(foo) - - # a escapes scope - @torch.jit.script - def foo(): - if 1 == 1: - a = 1 - else: - if 1 == 1: - raise Exception("Hi") - else: - 
raise Exception("Hi") - return a - self.assertEqual(foo(), 1) - - @torch.jit.script - def tuple_fn(): - raise RuntimeError("hello", "goodbye") - - with self.assertRaisesRegex(torch.jit.Error, "hello, goodbye"): - tuple_fn() - - @torch.jit.script - def no_message(): - raise RuntimeError - - with self.assertRaisesRegex(torch.jit.Error, "RuntimeError"): - no_message() - - def test_assertions(self): - cu = torch.jit.CompilationUnit(''' - def foo(cond): - assert bool(cond), "hi" - return 0 - ''') - - cu.foo(torch.tensor(1)) - with self.assertRaisesRegex(torch.jit.Error, "AssertionError: hi"): - cu.foo(torch.tensor(0)) - - @torch.jit.script - def foo(cond): - assert bool(cond), "hi" - - foo(torch.tensor(1)) - # we don't currently validate the name of the exception - with self.assertRaisesRegex(torch.jit.Error, "AssertionError: hi"): - foo(torch.tensor(0)) - - def test_python_op_exception(self): - @torch.jit.ignore - def python_op(x): - raise Exception("bad!") - - @torch.jit.script - def fn(x): - return python_op(x) - - with self.assertRaisesRegex(RuntimeError, "operation failed in the TorchScript interpreter"): - fn(torch.tensor(4)) - - def test_dict_expansion_raises_error(self): - def fn(self): - d = {"foo": 1, "bar": 2, "baz": 3} - return {**d} - - with self.assertRaisesRegex(torch.jit.frontend.NotSupportedError, - "Dict expansion "): - torch.jit.script(fn) - def test_module_parameters_and_buffers(self): weights = torch.randn(10, 10) bias = torch.randn(10) @@ -14911,6 +14958,12 @@ def forward(self, x): with self.assertRaisesRegex(Exception, "Overloads are not useable when a module"): a = torch.jit.script(W2()) + def test_narrow_copy(self): + def foo(a): + return a.narrow_copy(0, 0, 5) + + self.checkScript(foo, [torch.rand(10)]) + def test_select_after_chunk(self): def foo(x): chunked = torch.chunk(x, 1) @@ -15065,6 +15118,22 @@ def jit_multihead_attn_forward(query, # type: Tensor # print(jit_out / py_out - 1) self.assertEqual(jit_out, py_out, atol=5e-4, rtol=1e-4) + def test_torchscript_multi_head_attn_fast_path(self): + src_l = 3 + bsz = 5 + embed_size = 8 + nhead = 2 + multi_head_attn = torch.nn.MultiheadAttention(embed_size, nhead, batch_first=True) + multi_head_attn = multi_head_attn.eval() + + query = key = value = torch.rand((bsz, src_l, embed_size)) + + with torch.no_grad(): + py_out = multi_head_attn(query, key, value) + mha = torch.jit.script(multi_head_attn) + jit_out = mha(query, key, value) + torch.testing.assert_close(jit_out, py_out) + @unittest.skipIf(not RUN_CUDA, "no CUDA") def test_scriptmodule_multi_head_attn_cuda(self): @@ -15494,7 +15563,7 @@ def forward(self, x): self.assertEqual(m.int64_min, imported.int64_min) def test_script_scope(self): - scripted = torch.jit.script(torch.nn.functional.pad) + scripted = torch.jit.script(torch.nn.functional.triplet_margin_loss) @unittest.skipIf(IS_WINDOWS, "NYI: TemporaryFileName on Windows") def test_serialization_sharing(self): @@ -15895,7 +15964,7 @@ def foo(a): with self.assertRaisesRegex(RuntimeError, (r"Expected a value of type \'Tensor \(inferred\)\'" r"[\S\s]*Inferred \'a\' to be of type \'Tensor\'")): - foo(1) + foo("1") def test_type_comments_in_body(self): @torch.jit.script @@ -15918,6 +15987,13 @@ def __init__(self, torch.jit.script(M(2, 3)) + def test_input_keyword_in_schema(self): + def f(x): + return torch.ceil(input=x) + + inp = torch.randn(10) + self.checkScript(f, (inp, )) + def test_module_method_reassignment(self): class Foo(torch.nn.Module): def __init__(self): @@ -16207,59 +16283,6 @@ def 
test_nhwc_autocast_jit_trace_model(model, x): M = 10 S = 5 - -def add_nn_functional_test(name, self_size, args, variant_name='', check_ad=(), skipTestIf=(), - output_process_fn=lambda x: x, kwargs=None): - test_name = 'test_nn_' + name - - if variant_name != '': - test_name = test_name + '_' + variant_name - - no_grad = variant_name == 'inplace' - - @suppress_warnings - def do_test(self, name=name, args=args, test_name=test_name, check_ad=check_ad): - torch.manual_seed(2) - - self_variable = create_input((self_size,))[0][0] - - # need to record this because methods can change the size (e.g. unsqueeze) - args_variable, kwargs_variable = create_input(args, call_kwargs=kwargs) - - self_tensor = deepcopy(self_variable.data) - args_tensor = deepcopy(unpack_variables(args_variable)) - - if not no_grad: - output_variable = getattr(F, name)(self_variable, *args_variable, **kwargs_variable) - - def fn(*inputs, **kwargs): - return getattr(F, name)(*inputs, **kwargs) - - f_args_variable = (self_variable,) + args_variable - f_args_tensor = (self_tensor,) + args_tensor - should_autodiff_node, autodiff_nodes, fusible_nodes = normalize_check_ad(check_ad, name) - - if test_name not in EXCLUDE_SCRIPT: - def run_test(): - # XXX: this test should always run with disable_autodiff_subgraph_inlining(True), - # so that we don't regress on autodiff support. - with disable_autodiff_subgraph_inlining(): - script_fn = create_script_fn(self, name, 'nn_functional') - check_against_reference(self, script_fn, fn, output_process_fn, - f_args_variable, kwargs_variable, no_grad=no_grad) - # For tests we disabled AD subgraph inlining, make sure it's not falling back to autograd - if (doAutodiffCheck(test_name)): - self.assertAutodiffNode(script_fn.last_graph, should_autodiff_node, autodiff_nodes, fusible_nodes) - - if test_name in EXCLUDE_PYTHON_PRINT: - with torch._jit_internal._disable_emit_hooks(): - run_test() - else: - run_test() - - post_add_test(test_name, skipTestIf, do_test, TestJitGeneratedFunctional) - - def add_nn_module_test(*args, **kwargs): no_grad = False if 'no_grad' not in kwargs else kwargs['no_grad'] @@ -16410,10 +16433,6 @@ def test_version(self): # issue gh-32561 self.assertTrue(torch.__version__.startswith(torch.onnx.producer_version)) - -for test in nn_functional_tests: - add_nn_functional_test(*test) - for test in module_tests + new_module_tests + additional_module_tests: add_nn_module_test(**test) diff --git a/test/test_jit_autocast.py b/test/test_jit_autocast.py index cec8acfe7e85..6cb3c5645382 100644 --- a/test/test_jit_autocast.py +++ b/test/test_jit_autocast.py @@ -15,14 +15,15 @@ class TestAutocast(JitTestCase): def setUp(self): # common input tensors - self.a_fp16 = torch.rand((2, 2), dtype=torch.float16, device='cuda') - self.b_fp16 = torch.rand((2, 2), dtype=torch.float16, device='cuda') - self.c_fp16 = torch.rand((2, 2), dtype=torch.float16, device='cuda') - self.d_fp16 = torch.rand((2, 2), dtype=torch.float16, device='cuda') - self.a_fp32 = torch.rand((2, 2), dtype=torch.float32, device='cuda') - self.b_fp32 = torch.rand((2, 2), dtype=torch.float32, device='cuda') - self.c_fp32 = torch.rand((2, 2), dtype=torch.float32, device='cuda') - self.d_fp32 = torch.rand((2, 2), dtype=torch.float32, device='cuda') + if TEST_CUDA: + self.a_fp16 = torch.rand((2, 2), dtype=torch.float16, device='cuda') + self.b_fp16 = torch.rand((2, 2), dtype=torch.float16, device='cuda') + self.c_fp16 = torch.rand((2, 2), dtype=torch.float16, device='cuda') + self.d_fp16 = torch.rand((2, 2), dtype=torch.float16, 
device='cuda') + self.a_fp32 = torch.rand((2, 2), dtype=torch.float32, device='cuda') + self.b_fp32 = torch.rand((2, 2), dtype=torch.float32, device='cuda') + self.c_fp32 = torch.rand((2, 2), dtype=torch.float32, device='cuda') + self.d_fp32 = torch.rand((2, 2), dtype=torch.float32, device='cuda') self.old_value = torch._C._jit_set_autocast_mode(True) super().setUp() @@ -659,6 +660,95 @@ def forward(self, x, y): # isn't enabled self.assertRaises(RuntimeError, lambda: scripted_thing1.forward(x, y)) + @unittest.skipIf(not TEST_CUDA, "No cuda") + def test_jit_freeze_autocast_basic(self): + class TestModule(torch.nn.Module): + def __init__(self): + super(TestModule, self).__init__() + + def forward(self, x, y): + with torch.cuda.amp.autocast(): + return torch.mm(x, y) + + x = torch.rand((3, 4), dtype=torch.float).cuda() + y = torch.rand((4, 5), dtype=torch.float).cuda() + + mod = TestModule().eval() + + # sanity check + self._test_autocast(mod, "aten::_autocast_to_reduced_precision", x, y) + + frozen_mod = torch.jit.freeze(torch.jit.script(mod).eval()) + FileCheck().check_count("aten::_autocast_to_reduced_precision", 2, True).run(frozen_mod.graph) + + # make sure that the runtime pass doesn't duplicate autocast nodes + frozen_mod(x, y) + optimized_graph = frozen_mod.graph_for(x, y) + FileCheck().check_count("aten::_autocast_to_reduced_precision", 2, True).run(optimized_graph) + + @unittest.skipIf(not TEST_CUDA, "No cuda") + def test_jit_freeze_autocast_constants(self): + class TestModule(torch.nn.Module): + def __init__(self): + super(TestModule, self).__init__() + self.x = torch.rand((3, 4), dtype=torch.float).cuda() + + def forward(self, y): + with torch.cuda.amp.autocast(): + return torch.mm(self.x, y) + + y = torch.rand((4, 5), dtype=torch.float).cuda() + mod = TestModule().eval() + + frozen_mod = torch.jit.freeze(torch.jit.script(mod).eval()) + # freezing should pre-cast the constant self.x to remove one autocast call + FileCheck().check_count("aten::_autocast_to_reduced_precision", 1, True).run(frozen_mod.graph) + + # the runtime autocasting pass will re-insert the second autocast call, + # but constant propagation will merge it with the constant that it's casting. 
+ frozen_mod(y) + optimized_graph = frozen_mod.graph_for(y) + FileCheck().check_count("aten::_autocast_to_reduced_precision", 1, True).run(optimized_graph) + + @unittest.skipIf(TEST_CUDA, "CPU-only test") + def test_jit_autocast_softmax_cpu(self): + def fn(x): + with torch.cpu.amp.autocast(): + return torch.nn.functional.softmax(x, dim=0) + + fn_s = torch.jit.script(fn) + x = torch.rand((2, 2), dtype=torch.bfloat16) + fn_s(x) + y = fn_s(x) + + self.assertTrue(y.dtype == torch.bfloat16) + + @unittest.skipIf(not TEST_CUDA, "No cuda") + def test_jit_autocast_softmax_gpu(self): + def fn(x): + with torch.cuda.amp.autocast(): + return torch.nn.functional.softmax(x, dim=0) + + fn_s = torch.jit.script(fn) + x = torch.rand((2, 2), dtype=torch.half).cuda() + fn_s(x) + y = fn_s(x) + + self.assertTrue(y.dtype == torch.float) + + def test_ignore_amp(self): + @torch.jit.script + def foo(x): + return torch.mm(x, x) + + inp = torch.rand([10, 10], dtype=torch.float) + foo._set_ignore_amp(True) + with torch.cpu.amp.autocast(): + foo(inp) + foo(inp) + + g = torch.jit.last_executed_optimized_graph() + FileCheck().check_not("_autocast_to_reduced").run(g) if __name__ == "__main__": run_tests() diff --git a/test/test_jit_cuda_fuser.py b/test/test_jit_cuda_fuser.py index c03ff0b3119a..3926b081d7c9 100644 --- a/test/test_jit_cuda_fuser.py +++ b/test/test_jit_cuda_fuser.py @@ -1,34 +1,48 @@ # Owner(s): ["oncall: jit"] +import contextlib import unittest import os import random +import enum +import copy +from functools import reduce +import operator +import warnings import torch from torch.nn import functional +from torch.profiler import profile, ProfilerActivity -from torch.testing._internal.common_utils import run_tests, ProfilingMode, GRAPH_EXECUTOR # TEST_WITH_ROCM -from torch.testing._internal.common_cuda import TEST_MULTIGPU from torch.testing._internal.codegen.random_topo_test import runDefaultTestWithSeed +from torch.testing._internal.common_cuda import TEST_MULTIGPU +from torch.testing._internal.common_device_type import instantiate_device_type_tests, ops, OpDTypes +from torch.testing._internal.common_jit import JitCommonTestCase +from torch.testing._internal.common_methods_invocations import op_db, SampleInput +from torch.testing._internal.common_utils import run_tests, ProfilingMode, GRAPH_EXECUTOR, TEST_WITH_ROCM, slowTest, \ + is_iterable_of_tensors, freeze_rng_state +from torch.testing._internal.jit_utils import clone_inputs, get_traced_sample_variant_pairs, JitTestCase, RUN_CUDA +from torch.testing._internal.jit_metaprogramming_utils import create_traced_fn from torch.testing import FileCheck -from test_jit import JitTestCase, RUN_CUDA - from jit.test_fuser_common import TestFuserCommon # noqa: F401 import itertools import numpy as np import math +from torch.autograd.gradcheck import gradcheck + from typing import List -CUDA_MAJOR, CUDA_MINOR = (int(x) for x in torch.version.cuda.split('.')) +RUN_NVFUSER = RUN_CUDA and not TEST_WITH_ROCM +CUDA_MAJOR, CUDA_MINOR = 0, 0 + +if RUN_NVFUSER and torch.version.cuda is not None: + CUDA_MAJOR, CUDA_MINOR = (int(x) for x in torch.version.cuda.split('.')[:2]) -os.environ['PYTORCH_NVFUSER_DISABLE_FALLBACK'] = '1' -os.environ['PYTORCH_NVFUSER_DISABLE_FMA'] = '1' -os.environ['PYTORCH_NVFUSER_DISABLE_FASTMATH'] = '1' +os.environ['PYTORCH_NVFUSER_DISABLE'] = 'fallback,fma,unroll_with_rng' os.environ['PYTORCH_NVFUSER_JIT_OPT_LEVEL'] = '0' -os.environ['PYTORCH_NVFUSER_DISABLE_RNG_UNROLL'] = '1' if GRAPH_EXECUTOR == ProfilingMode.PROFILING: 
torch._C._jit_set_texpr_fuser_enabled(False) @@ -37,8 +51,9 @@ FUSION_GROUP = 'prim::CudaFusionGroup' FUSION_GUARD = 'prim::CudaFusionGuard' +# TODO: revert disabled alias ops +ALIAS_TEST_DISABLED = True -import contextlib @contextlib.contextmanager def nvfuser_singleton_fusion(flag): @@ -57,37 +72,39 @@ def nvfuser_horizontal_fusion(flag): torch._C._jit_set_nvfuser_horizontal_mode(old_value) def is_pre_volta(): + if not RUN_NVFUSER: + return False prop = torch.cuda.get_device_properties(torch.cuda.current_device()) return prop.major < 7 -TEST_BF16 = torch.cuda.is_bf16_supported() +TEST_BF16 = RUN_NVFUSER and torch.cuda.is_bf16_supported() -class TestCudaFuser(JitTestCase): +class CudaFuserTestOptions(): + def __init__(self): + self.old_cpu_fuse = torch._C._jit_can_fuse_on_cpu() + self.old_gpu_fuse = torch._C._jit_can_fuse_on_gpu() + torch._C._jit_override_can_fuse_on_cpu(False) + torch._C._jit_override_can_fuse_on_gpu(False) + self.old_guard = torch._C._jit_set_nvfuser_guard_mode(False) + torch._C._debug_set_autodiff_subgraph_inlining(False) + self.old_value = torch._C._jit_set_autocast_mode(True) + + if(RUN_CUDA): + self.old_nvfuser = torch._C._jit_set_nvfuser_enabled(True) + + def restore(self): + if(RUN_CUDA): + torch._C._jit_set_nvfuser_enabled(self.old_nvfuser) + torch._C._jit_override_can_fuse_on_cpu(self.old_cpu_fuse) + torch._C._jit_override_can_fuse_on_gpu(self.old_gpu_fuse) + torch._C._jit_set_nvfuser_guard_mode(self.old_guard) + torch._C._debug_set_autodiff_subgraph_inlining(True) + torch._C._jit_set_autocast_mode(self.old_value) - special_values = torch.tensor( - [float("-inf"), -10, -math.pi, - -1, -0.5, 0, 1, 0.5, - math.pi, 10, float("inf"), - float("nan")], dtype=torch.float, device='cuda') - - int_types = [ - torch.int8, - torch.uint8, - torch.int16, - torch.int32, - torch.int64 - ] - - support_tensor_dtypes = [ - torch.int32, - torch.int64, - torch.float16, - torch.float32, - torch.float64, - torch.bool - ] - if TEST_BF16: - support_tensor_dtypes.append(torch.bfloat16) +class TestCudaFuser(JitTestCase): + def assertEqual(self, *args, **kwargs): + kwargs["exact_layout"] = True + super(JitTestCase, self).assertEqual(*args, **kwargs) def _getSubgraphInFusion(self, graph): num_node = 0 @@ -108,28 +125,60 @@ def count(block, ret): def setUp(self): super(TestCudaFuser, self).setUp() - self.old_cpu_fuse = torch._C._jit_can_fuse_on_cpu() - self.old_gpu_fuse = torch._C._jit_can_fuse_on_gpu() - torch._C._jit_override_can_fuse_on_cpu(False) - torch._C._jit_override_can_fuse_on_gpu(False) - self.old_guard = torch._C._jit_set_nvfuser_guard_mode(False) - torch._C._debug_set_autodiff_subgraph_inlining(False) - self.old_value = torch._C._jit_set_autocast_mode(True) - if(RUN_CUDA): - self.old_nvfuser = torch._C._jit_set_nvfuser_enabled(True) + self.skip_node_list = [] + disabled_ops = ("aten::batch_norm", + "aten::_batch_norm_impl_index", + "aten::_batch_norm_impl_index_backward", + "aten::native_batch_norm_backward") + for op in disabled_ops: + disabled_flag = torch._C._jit_set_nvfuser_skip_node_kind(op, False) + if disabled_flag: + torch._C._jit_set_nvfuser_skip_node_kind(op, True) + self.skip_node_list.append(op) + + # cpu backup to avoid errors in case this is run on a CPU-only machine + dev = 'cuda' if RUN_NVFUSER else 'cpu' + self.special_values = torch.tensor( + [float("-inf"), -10, -math.pi, + -1, -0.5, 0, 1, 0.5, + math.pi, 10, float("inf"), + float("nan")], dtype=torch.float, device=dev) + + self.int_types = [ + torch.int8, + torch.uint8, + torch.int16, + torch.int32, + 
torch.int64 + ] + + self.support_tensor_dtypes = [ + torch.int32, + torch.int64, + torch.float16, + torch.float32, + torch.float64, + torch.bool + ] + if TEST_BF16: + self.support_tensor_dtypes.append(torch.bfloat16) + + if(RUN_NVFUSER): + self.cuda_fuser_options = CudaFuserTestOptions() def tearDown(self): - if(RUN_CUDA): - torch._C._jit_set_nvfuser_enabled(self.old_nvfuser) - torch._C._jit_override_can_fuse_on_cpu(self.old_cpu_fuse) - torch._C._jit_override_can_fuse_on_gpu(self.old_gpu_fuse) - torch._C._jit_set_nvfuser_guard_mode(self.old_guard) - torch._C._debug_set_autodiff_subgraph_inlining(True) - torch._C._jit_set_autocast_mode(self.old_value) + # restoring skip node to the configuration before tests + for op in self.skip_node_list: + disabled_flag = torch._C._jit_set_nvfuser_skip_node_kind(op, False) + if not disabled_flag: + torch._C._jit_set_nvfuser_skip_node_kind(op, True) + + if(RUN_NVFUSER): + self.cuda_fuser_options.restore() super(TestCudaFuser, self).tearDown() - def _run_helper(self, jit_op, op, *args): + def _run_helper(self, jit_op, op, *args, check_stride=False, num_fusion=1): torch.cuda.manual_seed_all(123) jit_o = jit_op(*args) torch.cuda.manual_seed_all(123) @@ -138,7 +187,9 @@ def _run_helper(self, jit_op, op, *args): o = op(*args) self.assertEqual(o.dtype, jit_o.dtype) self.assertEqual(o, jit_o) - self.assertGraphContainsExactly(jit_op.graph_for(*args), FUSION_GUARD, 1, consider_subgraphs=True) + if check_stride: + self.assertEqual(o.stride(), jit_o.stride()) + self.assertGraphContainsExactly(jit_op.graph_for(*args), FUSION_GUARD, num_fusion, consider_subgraphs=True) def _run_training_helper(self, jit_op, op, grads, *args): torch.cuda.manual_seed_all(123) @@ -162,7 +213,7 @@ def _run_training_helper(self, jit_op, op, grads, *args): )[0].graph self.assertGraphContainsExactly(bwd_graph, FUSION_GUARD, 1, consider_subgraphs=True) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_half(self): @@ -187,8 +238,9 @@ def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor, alpha: float): self.assertEqual(oo, jit_oo) self.assertGraphContains(t_jit.graph_for(x, y, z, alpha), FUSION_GUARD) + @unittest.skipIf(not TEST_BF16, "device does not support BFloat16") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_bfloat(self): @@ -213,7 +265,7 @@ def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor, alpha: float): self.assertEqual(oo, jit_oo) self.assertGraphContains(t_jit.graph_for(x, y, z, alpha), FUSION_GUARD) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_const(self): @@ -230,7 +282,7 @@ def t(x, y): self.assertEqual(o, jit_o) self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GUARD) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_chunk(self): @@ -254,14 +306,14 @@ def t(x, y, z, q): self.assertGraphContains(t_jit.graph_for(x, y, z, q), FUSION_GUARD) @unittest.skipIf(is_pre_volta(), 
"reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_reduction_dtypes_axis(self): - for op in [torch.sum, torch.mean, torch.amax]: + for op in [torch.sum, torch.mean, torch.amax, torch.var, torch.std]: for dtype in [torch.float16, torch.float32, torch.double]: - for axis in [-1, 2]: + for axis in [-1, 2, 0]: def make_func(op): def func(x: torch.Tensor): o = torch.mul(x, 2.0) @@ -279,7 +331,34 @@ def func(x: torch.Tensor): self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-4)) self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_variance(self): + + for op in [torch.var, torch.std]: + for dtype in [torch.float16, torch.float32, torch.double]: + for axis in [-2, -1, 2, 1]: + for unbiased in [False, True]: + def make_func(op): + def func(x: torch.Tensor): + o = torch.mul(x, 2.0) + o = op(o, dim=[axis]) + return o + return func + + x = torch.randn(8, 4, 16, dtype=dtype, device="cuda") + t = make_func(op) + t_jit = torch.jit.trace(t, x) + jit_o = t_jit(x) + jit_o = t_jit(x) + o = t(x) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-4)) + self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_scalar_input(self): @@ -297,7 +376,7 @@ def t(x: torch.Tensor, y: torch.Tensor, z: float): self.assertEqual(o, jit_o) self.assertGraphContains(t_jit.graph_for(x, y, 2.0), FUSION_GUARD) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_broadcasting_0(self): @@ -316,7 +395,7 @@ def t(x: torch.Tensor, y: torch.Tensor, z: float): subgraph = self._getSubgraphInFusion(t_jit.graph_for(x, y, 2.0)) self.assertGraphContainsExactly(subgraph, 'aten::add', 2, consider_subgraphs=False) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_broadcasting_1(self): @@ -335,7 +414,7 @@ def t(x: torch.Tensor, y: torch.Tensor, z: float): subgraph = self._getSubgraphInFusion(t_jit.graph_for(x, y, 2.0)) self.assertGraphContainsExactly(subgraph, 'aten::add', 2, consider_subgraphs=False) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_broadcasting_2(self): @@ -354,7 +433,7 @@ def t(x: torch.Tensor, y: torch.Tensor, z: float): subgraph = self._getSubgraphInFusion(t_jit.graph_for(x, y, 2.0)) self.assertGraphContainsExactly(subgraph, 'aten::add', 2, consider_subgraphs=False) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + 
@unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_broadcasting_3(self): @@ -376,7 +455,7 @@ def t(x: torch.Tensor, y: torch.Tensor, z: float): # test_broadcasting_partition_logic_X # Testing partition logic that is capable to avoid creating unsupported # broadcasting semantics in CudaFusionGroup - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_broadcasting_partition_logic_0(self): @@ -398,7 +477,7 @@ def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): subgraph = self._getSubgraphInFusion(t_jit.graph_for(x, y, z)) self.assertGraphContainsExactly(subgraph, 'aten::add', 4, consider_subgraphs=False) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_broadcasting_partition_logic_1(self): @@ -421,7 +500,7 @@ def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): self.assertGraphContainsExactly(subgraph, 'aten::add', 4, consider_subgraphs=False) @unittest.skipIf(True, "Broadcast with different output not supported yet") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_broadcasting_multiple_output_shape(self): @@ -443,7 +522,7 @@ def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD) @unittest.skipIf(True, "broadcast on branches can't be resolved yet") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_broadcasting_multiple_output(self): @@ -465,21 +544,25 @@ def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD) def _unary_test_helper(self, operation, dtype, random_data): - shape = (4, 8, 32, 32) + gradient_check = (dtype == torch.float64) and random_data + shape = self.special_values.shape + torch.cuda.manual_seed_all(211) # need additional def of t for boolean ops def t(x: torch.Tensor, y: torch.Tensor): o = x * y + o = o + 5e-3 o = operation(o) return o - y = torch.tensor([1], device="cuda").to(dtype) + y = torch.rand(shape, dtype=torch.float32, device="cuda", requires_grad=gradient_check) + y = y.to(dtype=dtype) if random_data: - x = torch.randn(shape, dtype=torch.float32, device="cuda") + x = torch.rand(shape, dtype=torch.float32, device="cuda", requires_grad=gradient_check) if dtype in self.int_types: # prefer a larger variance for integer types - x *= 5 + x = x * 5 x = x.to(dtype=dtype) else: x = self.special_values.to(dtype=dtype) @@ -491,16 +574,28 @@ def t(x: torch.Tensor, y: torch.Tensor): t_jit = torch.jit.script(t) jit_o = t_jit(x, y) jit_o = t_jit(x, y) - if dtype in self.support_tensor_dtypes: + jit_o = t_jit(x, y) + if gradient_check: + if jit_o.dtype != torch.bool: + # bool dtype has no `-` + gradcheck(t_jit, [x, y], nondet_tol=1e-5) + elif dtype in self.support_tensor_dtypes: self.assertGraphContains(t_jit.graph_for(x, y), 
FUSION_GUARD) o = t(x, y) self.assertEqual(o.dtype, jit_o.dtype) - self.assertEqual(o, jit_o, msg=f""" - failing case: - {dtype} {operation} {x} - """) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + if dtype == torch.bfloat16: + # compare with the actual ground truth for + # bfloat16 kernels instead of eager mode + # implementation, since mismatch in cast + # adds excessive noise. + o = t(x.to(torch.float64), y.to(torch.float64)).to(torch.bfloat16) + else: + o = t(x, y) + + self.assertTrue(self._compare("failing case {}\n{}\n{}\n{}".format(dtype, operation, x, y), o, jit_o, 1e-2)) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_unary_ops(self): @@ -539,6 +634,12 @@ def test_unary_ops(self): torch.trunc, torch.frac, torch.reciprocal, + torch.isfinite, + torch.isinf, + torch.isnan, + torch.isneginf, + torch.isposinf, + torch.isreal, torch.nn.functional.softplus, torch.nn.functional.gelu, torch.relu, @@ -551,7 +652,7 @@ def test_unary_ops(self): self._unary_test_helper(op, dtype, False) # test special numbers self._unary_test_helper(op, dtype, True) # test random data - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_category_rule(self): @@ -611,7 +712,7 @@ def t(x: torch.Tensor, z: float): z = torch.tensor(3., dtype=torch.double) run_scalar(x, z) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_unary_bitwise(self): @@ -620,8 +721,8 @@ def bit_not(x: torch.Tensor): jitted = torch.jit.script(bit_not) x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda").mul(5).to(torch.long) - jit_o = bit_not(x) - jit_o = bit_not(x) + jit_o = jitted(x) + jit_o = jitted(x) o = bit_not(x) self.assertEqual(o, jit_o) jitted.graph_for(x) # Shows up in second instance, not first @@ -633,48 +734,180 @@ def bool_not(x: torch.Tensor, y: torch.Tensor): jitted = torch.jit.script(bool_not) x = torch.rand(4, 8, 32, 32, dtype=torch.float, device="cuda").round().to(torch.bool) y = torch.rand(4, 8, 32, 32, dtype=torch.float, device="cuda").round().to(torch.bool) - jit_o = bool_not(x, y) - jit_o = bool_not(x, y) + jit_o = jitted(x, y) + jit_o = jitted(x, y) o = bool_not(x, y) self.assertEqual(o, jit_o) jitted.graph_for(x, y) # Shows up in second instance, not first self.assertGraphContains(jitted.graph_for(x, y), FUSION_GUARD) - def _binary_test_helper(self, operation, dtypes, random_data): + def _get_scalar_binary_test_fn(self, category_and_type1, category_and_type2, operation): + category1, dtype_arg1 = category_and_type1 + category2, dtype_arg2 = category_and_type2 + + def t_intx_tensory(x: int, y: torch.Tensor): + o = operation(x, y) + o = 2 + o + return o + + def t_doublex_tensory(x: float, y: torch.Tensor): + o = operation(x, y) + o = 2 + o + return o + # Omit both scalar cases and swap cases + assert category1 == "scalar" and category2 != "scalar" + if dtype_arg1.is_floating_point: + return t_doublex_tensory + if dtype_arg1 == torch.int64 or dtype_arg1 == torch.int32: + return t_intx_tensory + raise NotImplementedError + + def _binary_test_helper(self, operation, dtypes, random_data, categories="ndim"): if isinstance(dtypes, tuple): 
dtype_arg1, dtype_arg2 = dtypes else: dtype_arg1 = dtype_arg2 = dtypes + if isinstance(categories, tuple) and random_data: + category1, category2 = categories + elif not random_data: + category1 = category2 = "ndim" + else: + category1 = category2 = categories + + def is_cpu_category(x): + return x == "0dimcpu" or x == "scalar" + + # skip unsupported cases + if is_cpu_category(category1) and is_cpu_category(category2): + return + + # only test cases with first operand as scalar + if category2 == "scalar": + return + + # skip ops that doesn't support scalar inputs in eager + if operation in [ + torch.atan2, + torch.max, + torch.min, + torch.remainder, # unsupported in nvfuser + ]: + if category1 == "scalar" or category2 == "scalar": + return + + if operation in [ + torch.fmod, + torch.eq, + torch.ne, + torch.ge, + torch.gt, + torch.le, + torch.lt + ]: + if category1 == "scalar": + return + + # operators that does not support bfloat16 + if operation in [torch.fmod]: + if dtype_arg1 == torch.bfloat16 or dtype_arg2 == torch.bfloat16: + return + def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): o = operation(x, y) o = o + z return o shape = (4, 32, 32) + + shapex = shape if category1 == "ndim" else () + shapey = shape if category2 == "ndim" else () + if random_data: - x = (torch.randn(shape, dtype=torch.float, device="cuda") * 5).to(dtype_arg1) - y = (torch.randn(shape, dtype=torch.float, device="cuda") * 5).to(dtype_arg2) + x = (torch.randn(shapex, dtype=torch.float, device="cuda") * 5).to(dtype_arg1) + y = (torch.randn(shapey, dtype=torch.float, device="cuda") * 5).to(dtype_arg2) else: x = self.special_values.to(dtype=dtype_arg1) y = (torch.rand_like(self.special_values) * 5).to(dtype_arg2) + + r""" + Category conversion + """ + has_scalar = False + if category1 == "scalar": + has_scalar = True + x = x.item() + + if category1 == "0dimcpu": + x = x.to(device="cpu") + + if category2 == "scalar": + has_scalar = True + y = y.item() + + if category2 == "0dimcpu": + y = y.to(device="cpu") + z = torch.tensor([2], device="cuda").to(dtype_arg1) + is_dtype_arg1_int = dtype_arg1 == torch.int32 or dtype_arg1 == torch.int64 + is_dtype_arg2_int = dtype_arg2 == torch.int32 or dtype_arg2 == torch.int64 + + if operation in [torch.pow]: + if is_dtype_arg1_int and is_dtype_arg2_int: + if category2 == "scalar": + # RuntimeError: Integers to negative integer powers are not allowed + y = abs(y) + if category2 == "0dimcpu" and y == -1: + # https://github.com/pytorch/pytorch/issues/73196 + y = y - 1 + if category2 == "0dimcpu" and y == -2: + # avoid pow(0, -2), which gives inconsistent results on integer tensor + y = y - 1 # Avoid division by zero for integer tensors div_like = [torch.div, torch.fmod, torch.remainder] if operation in div_like and (dtype_arg2 == torch.int32 or dtype_arg2 == torch.int64): y[y == 0] = 1 - o = t(x, y, z) - t_jit = torch.jit.script(t) - jit_o = t_jit(x, y, z) - jit_o = t_jit(x, y, z) + test_value = True + if dtype_arg1 == torch.half or dtype_arg2 == torch.half: + test_value = False + if dtype_arg1 == torch.bfloat16 or dtype_arg2 == torch.bfloat16: + test_value = False - self.assertEqual(o.dtype, jit_o.dtype) - self.assertEqual(o, jit_o) - self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD) + try: + if not has_scalar: + o = t(x, y, z) + t_jit = torch.jit.script(t) + jit_o = t_jit(x, y, z) + jit_o = t_jit(x, y, z) + jit_o = t_jit(x, y, z) + + self.assertEqual(o.dtype, jit_o.dtype) + if test_value: + self.assertEqual(o, jit_o) + 
self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD) + + elif category2 != "scalar": # only test the case where first is scalar + test_fn = self._get_scalar_binary_test_fn((category1, dtype_arg1), (category2, dtype_arg2), operation) + o = test_fn(x, y) + t_jit = torch.jit.script(test_fn) + jit_o = t_jit(x, y) + jit_o = t_jit(x, y) + jit_o = t_jit(x, y) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + self.assertEqual(o.dtype, jit_o.dtype) + if test_value: + self.assertEqual(o, jit_o) + self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GUARD) + except Exception as e: + print("failing test for op: ", operation.__name__) + print("with input\n\tx: ", x) + print("\ty: ", y) + print("\tz: ", z) + raise e + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_binary_ops(self): @@ -682,14 +915,12 @@ def test_binary_ops(self): data_types = [ torch.int32, torch.int64, - # torch.float16, + torch.float16, torch.float32, torch.float64 ] - ''' if TEST_BF16: data_types.append(torch.bfloat16) - ''' operations = [torch.mul, torch.div, torch.atan2, @@ -704,59 +935,60 @@ def test_binary_ops(self): torch.gt, torch.le, torch.lt] - binary_dtype_combinations = itertools.combinations(data_types, 2) + + category_types = [ + "scalar", + "0dim", + "0dimcpu", + "ndim" + ] + + binary_dtype_combinations = list(itertools.combinations(data_types, 2)) + category_combinations = list(itertools.combinations(category_types, 2)) + + for op, dtypes, categories in itertools.product(operations, binary_dtype_combinations, category_combinations): + self._binary_test_helper(op, dtypes, True, categories) # random data + for op, dtypes in itertools.product(operations, binary_dtype_combinations): - self._binary_test_helper(op, dtypes, True) # random data self._binary_test_helper(op, dtypes, False) # special numbers - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_binary_bitwise(self): - def jit_or(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): - return (x & y) | z - - def jit_xor(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): - return (x & y) ^ z + dtypes = [torch.bool, torch.int32, torch.int64] - def jit_lshift(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): - return (x & y) << z + for dtype1, dtype2, dtype3 in itertools.product(dtypes, repeat=3): + def jit_and(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): + return torch.bitwise_and(x, y) & z - def jit_rshift(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): - return (x & y) >> z + def jit_or(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): + return torch.bitwise_or(x, y) | z - for jit_func in [jit_or, jit_xor, jit_lshift, jit_rshift]: - x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda").mul(5).to(torch.long) - y = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda").mul(5).to(torch.long) - z = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda").mul(2).to(torch.long) - - jitted = torch.jit.script(jit_func) - jit_o = jitted(x, y, z) - jit_o = jitted(x, y, z) - o = jit_func(x, y, z) - self.assertEqual(o, jit_o) - self.assertGraphContains(jitted.graph_for(x, y, z), FUSION_GUARD) + def jit_xor(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): + return torch.bitwise_xor(x, y) ^ z - # We shouldn't need this redefinition of 
the function, but otherwise it won't recompile for a new type - def jit_or(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): - return (x & y) | z + def jit_lshift(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): + return torch.bitwise_left_shift(x, y) << z - def jit_xor(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): - return (x & y) ^ z + def jit_rshift(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): + return torch.bitwise_right_shift(x, y) >> z - for jit_func in [jit_or, jit_xor]: - x = torch.rand(4, 2, dtype=torch.float, device="cuda").round().to(torch.bool) - y = torch.rand(4, 2, dtype=torch.float, device="cuda").round().to(torch.bool) - z = torch.rand(4, 2, dtype=torch.float, device="cuda").round().to(torch.bool) + for jit_func in [jit_and, jit_or, jit_xor, jit_lshift, jit_rshift]: + if torch.bool in {dtype1, dtype2, dtype3} and jit_func in {jit_lshift, jit_rshift}: + continue + x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda").mul(5).to(dtype1) + y = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda").mul(5).to(dtype2) + z = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda").mul(2).to(dtype3) - jitted = torch.jit.script(jit_func) - jit_o = jitted(x, y, z) - jit_o = jitted(x, y, z) - o = jit_func(x, y, z) - self.assertEqual(o, jit_o) - self.assertGraphContains(jitted.graph_for(x, y, z), FUSION_GUARD) + jitted = torch.jit.script(jit_func) + jit_o = jitted(x, y, z) + jit_o = jitted(x, y, z) + o = jit_func(x, y, z) + self.assertEqual(o, jit_o) + self.assertGraphContains(jitted.graph_for(x, y, z), FUSION_GUARD) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_type_as_op(self): @@ -813,7 +1045,7 @@ def threshold(x: torch.Tensor, th: int, val: int): threshold_jit = torch.jit.script(threshold) self._run_helper(threshold_jit, threshold, x, arg2, arg3) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_ternary_ops_integer_compatibility(self): @@ -866,7 +1098,7 @@ def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor, alpha: torch.Tensor): self.assertEqual(o, jit_o) self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_ternary_ops_type_promotion(self): @@ -887,7 +1119,22 @@ def test_ternary_ops_type_promotion(self): self._ternary_test_helper(op, dtypes, True) # random data self._ternary_test_helper(op, dtypes, False) # special numbers - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + # We can't test the scalar version of rsub from python + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") + def test_rsub(self): + x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda") + y = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda") + + def rsub(x: torch.Tensor, y: torch.Tensor): + o = torch.rsub(x, y) + o = o * 2. 
+ return o + + rsub_jit = torch.jit.script(rsub) + self._run_helper(rsub_jit, rsub, x, y) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") # legacy fuser does not work for rand_like, see issue #34361 @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_ternary_ops(self): @@ -939,7 +1186,7 @@ def lerp_scale(x: torch.Tensor, y: torch.Tensor, z: float): lerp_scale_jit = torch.jit.script(lerp_scale) self._run_helper(lerp_scale_jit, lerp_scale, x, y, 0.5) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires profiling node to run cuda fuser") def test_addcmul_ops(self): x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda") @@ -967,7 +1214,7 @@ def addcmul_const_alpha(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): addcmul_const_alpha_jit = torch.jit.script(addcmul_const_alpha) self._run_helper(addcmul_const_alpha_jit, addcmul_const_alpha, x, y, z) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_dynamic_size(self): @@ -1007,7 +1254,9 @@ def t(x: torch.Tensor, y: torch.Tensor, z: float): self.assertGraphContains(t_jit.graph_for(x, y, 2.0), FUSION_GUARD) torch._C._jit_set_nvfuser_guard_mode(old_guard) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") def test_random_topo(self): os.environ["PYTORCH_NVFUSER_DISABLE_FALLBACK"] = "1" self.assertTrue(runDefaultTestWithSeed(28449)) @@ -1015,7 +1264,7 @@ def test_random_topo(self): def _compare(self, desc, inp1, inp2, error): a = inp1.clone() b = inp2.clone() - close = torch.allclose(a, b, rtol=error, atol=error) + close = torch.allclose(a, b, rtol=error, atol=error, equal_nan=True) if not close: print(desc, close) z = a - b @@ -1056,7 +1305,7 @@ def t(x: torch.Tensor, y: torch.Tensor): # we are testing inputs with all combination of permutation order, just to # ensure that integration would be able to generate functionally correct # kernels - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_binary_ops_permutation(self): @@ -1070,7 +1319,7 @@ def test_binary_ops_permutation(self): x = [7, 8, 12] self._permutation_helper(x, b_axis, torch.float32, "cuda", perm0, perm1) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_binary_ops_channels_last_with_bcast(self): @@ -1121,7 +1370,7 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GUARD) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_reduction(self): @@ -1171,7 +1420,7 @@ def _layer_norm_autodiff_helper(self, 
model, grad, shapes, args): FileCheck().check(FUSION_GUARD).run(v2.graph) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_layer_norm_autodiff(self): @@ -1213,7 +1462,7 @@ def t(shapes: List[int], x, eps: float, cudnn: bool): self._layer_norm_autodiff_helper(m, grad, shapes, args) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_layer_norm_parser(self): @@ -1272,9 +1521,8 @@ def forward(self, x: torch.Tensor): self.assertTrue(self._compare("comparing rstd failed", rstd, jit_rstd, error)) self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD) - @unittest.skipIf(True, "codegen failure awaiting fix") @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_native_layer_norm(self): @@ -1287,9 +1535,8 @@ def test_native_layer_norm(self): norm_shape = [input_shape[idx] for idx in range(dims - offset, dims)] self._native_layer_norm_helper(input_shape, norm_shape, torch.float32, "cuda", 1e-4, affine) - @unittest.skipIf(True, "codegen failure awaiting fix") @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_native_layer_norm_half(self): @@ -1302,7 +1549,7 @@ def test_native_layer_norm_half(self): self._native_layer_norm_helper(input_shape, norm_shape, torch.float16, "cuda", 5e-3) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") @unittest.skipIf(not TEST_BF16, "device does not support BFloat16") @@ -1315,7 +1562,15 @@ def test_native_layer_norm_bfloat(self): norm_shape = [input_shape[idx] for idx in range(dims - offset, dims)] self._native_layer_norm_helper(input_shape, norm_shape, torch.bfloat16, "cuda", 1e-1) - def _norm_helper(self, shape, dtype, device, error, is_batch_norm_else_instance_norm, memory_format=torch.contiguous_format): + def _norm_helper(self, + shape, + dtype, + device, + error, + is_batch_norm_else_instance_norm, + memory_format=torch.contiguous_format, + *, + layer_dtype=torch.float32): class MyBatchNorm(torch.nn.Module): def __init__(self): super(MyBatchNorm, self).__init__() @@ -1337,8 +1592,8 @@ def forward(self, x: torch.Tensor, r_mean: torch.Tensor, r_var: torch.Tensor): t = MyBatchNorm() if is_batch_norm_else_instance_norm else MyInstanceNorm() x = torch.randn(shape, dtype=dtype, device=device).to(memory_format=memory_format) - running_mean = torch.zeros(shape[1], dtype=torch.float32, device=device) - running_var = 
torch.ones(shape[1], dtype=torch.float32, device=device) + running_mean = torch.zeros(shape[1], dtype=layer_dtype, device=device) + running_var = torch.ones(shape[1], dtype=layer_dtype, device=device) t_jit = torch.jit.script(t) eager_running_mean = running_mean.clone() @@ -1363,7 +1618,38 @@ def forward(self, x: torch.Tensor, r_mean: torch.Tensor, r_var: torch.Tensor): self.assertGraphContains(t_jit.graph_for(x, running_mean, running_var), FUSION_GUARD) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_layer_norm_trivial_reduce_dim(self): + def t_wb(shapes: List[int], x, w, b, eps: float, cudnn: bool): + o = torch.layer_norm(x, shapes, w, b, eps, cudnn) + o = torch.relu(o) + return o + + batch = [1] + shapes = [2, 7, 3] + + grad = torch.randn(batch + shapes, dtype=torch.float32, device="cuda") + args = [torch.randn(batch + shapes, dtype=torch.float32, device="cuda").requires_grad_()] + args.append(torch.randn(shapes, dtype=torch.float32, device="cuda").requires_grad_()) + args.append(torch.randn(shapes, dtype=torch.float32, device="cuda").requires_grad_()) + self._layer_norm_autodiff_helper(t_wb, grad, shapes, args) + + @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_norm_half_layer(self): + size = [2, 4, 2, 2] + + for is_batch_norm_else_instance_norm in [False, True]: + for mf in [torch.channels_last, torch.contiguous_format]: + self._norm_helper(size, torch.float16, "cuda", 1e-3, is_batch_norm_else_instance_norm, + memory_format=mf, layer_dtype=torch.float16) + + @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_norm_channels_last(self): @@ -1375,7 +1661,7 @@ def test_norm_channels_last(self): self._norm_helper(size, torch.float32, "cuda", 1e-4, is_batch_norm_else_instance_norm, memory_format=mf) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_norm(self): @@ -1392,7 +1678,7 @@ def test_norm(self): self._norm_helper(x, torch.float32, "cuda", 1e-4, is_batch_norm_else_instance_norm) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_norm_large(self): @@ -1408,7 +1694,7 @@ def test_norm_large(self): self._norm_helper(x, torch.float32, "cuda", 1e-4, is_batch_norm_else_instance_norm) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, 
"Requires fusion optimization pass to be effective") def test_norm_half(self): @@ -1425,7 +1711,7 @@ def test_norm_half(self): self._norm_helper(x, torch.float16, "cuda", 5e-3, is_batch_norm_else_instance_norm) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") @unittest.skipIf(not TEST_BF16, "device does not support BFloat16") @@ -1442,7 +1728,7 @@ def test_norm_bfloat(self): x[1] = C self._norm_helper(x, torch.bfloat16, "cuda", 1e-1, is_batch_norm_else_instance_norm) - def _softmax_helper(self, shape, reduction_axis, dtype, device, error): + def _softmax_helper(self, shape, reduction_axis, is_log_softmax, dtype, device, error): class MySoftmax(torch.nn.Module): __constants__ = ['reduction_axis'] @@ -1455,22 +1741,40 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): o = torch.nn.functional.softmax(o, dim=self.reduction_axis) return o - t = MySoftmax() + class MyLogSoftmax(torch.nn.Module): + __constants__ = ['reduction_axis'] - x = torch.randn(shape, dtype=dtype, device=device) - y = torch.randn(shape, dtype=dtype, device=device) + def __init__(self): + super(MyLogSoftmax, self).__init__() + self.reduction_axis = reduction_axis + + def forward(self, x: torch.Tensor, y: torch.Tensor): + o = torch.add(x, y) + o = torch.nn.functional.log_softmax(o, dim=self.reduction_axis) + return o + + gradient_check = (dtype == torch.float64) + t = MyLogSoftmax() if is_log_softmax else MySoftmax() + + x = torch.randn(shape, dtype=dtype, device=device, requires_grad=gradient_check) + y = torch.randn(shape, dtype=dtype, device=device, requires_grad=gradient_check) t_jit = torch.jit.script(t) jit_o = t_jit(x, y) jit_o = t_jit(x, y) - o = t(x, y) - self.assertEqual(o.dtype, jit_o.dtype) - # numerical issues here due to our scheduling. - # can't use `self.assertEqual(o, jit_o)` - self.assertTrue(self._compare("comparing output failed", o, jit_o, error)) - self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GUARD) + jit_o = t_jit(x, y) + + if gradient_check: + gradcheck(t_jit.forward, [x, y], nondet_tol=1e-5) + else: + o = t(x, y) + self.assertEqual(o.dtype, jit_o.dtype) + # numerical issues here due to our scheduling. 
+ # can't use `self.assertEqual(o, jit_o)` + self.assertTrue(self._compare("comparing output failed", o, jit_o, error)) + self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GUARD) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_softmax_dtype(self): @@ -1512,7 +1816,7 @@ def t(x: torch.Tensor, y: torch.Tensor): FileCheck().check(FUSION_GUARD).run(bwd_graph) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test__softmax_function(self): @@ -1536,7 +1840,7 @@ def t(x: torch.Tensor, y: torch.Tensor): self.assertGraphContainsExactly(t_jit.graph_for(x, y), FUSION_GUARD, 1, consider_subgraphs=True) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test__softmax_function_half_to_float(self): @@ -1560,7 +1864,7 @@ def t(x: torch.Tensor, y: torch.Tensor): self.assertGraphContainsExactly(t_jit.graph_for(x, y), FUSION_GUARD, 1, consider_subgraphs=True) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_softmax(self): @@ -1569,14 +1873,21 @@ def test_softmax(self): output_size = int(pow(output_size, 1. 
/ dims)) reduction_sizes = [67, 256, 1024, 4096] + # gradient check + for reduction_dim in range(dims): + for is_log_softmax in [False, True]: + shape = [output_size for idx in range(dims)] + self._softmax_helper(shape, reduction_dim, is_log_softmax, torch.float64, "cuda", 1e-4) + for reduction_dim in range(dims): for reduction_size in reduction_sizes: x = [output_size for idx in range(dims)] x[reduction_dim] = reduction_size - self._softmax_helper(x, reduction_dim, torch.float32, "cuda", 1e-4) + for is_log_softmax in [False, True]: + self._softmax_helper(x, reduction_dim, is_log_softmax, torch.float32, "cuda", 1e-4) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_softmax_half(self): @@ -1589,10 +1900,11 @@ def test_softmax_half(self): for reduction_size in reduction_sizes: x = [output_size for idx in range(dims)] x[reduction_dim] = reduction_size - self._softmax_helper(x, reduction_dim, torch.float16, "cuda", 5e-3) + for is_log_softmax in [False, True]: + self._softmax_helper(x, reduction_dim, is_log_softmax, torch.float16, "cuda", 5e-3) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") @unittest.skipIf(not TEST_BF16, "device does not support BFloat16") @@ -1606,10 +1918,11 @@ def test_softmax_bfloat(self): for reduction_size in reduction_sizes: x = [output_size for idx in range(dims)] x[reduction_dim] = reduction_size - self._softmax_helper(x, reduction_dim, torch.bfloat16, "cuda", 1e-1) + for is_log_softmax in [False, True]: + self._softmax_helper(x, reduction_dim, is_log_softmax, torch.bfloat16, "cuda", 1e-1) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_reduction_permutation(self): @@ -1623,7 +1936,7 @@ def test_reduction_permutation(self): self._reduction_helper(x, axes, torch.float32, "cuda", perm0, perm1) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_reduction_multiple_output(self): @@ -1662,7 +1975,7 @@ def t(x: torch.Tensor, y: torch.Tensor, scale: float, z: torch.Tensor): self.assertGraphContains(t_jit.graph_for(x, y, scale, z), FUSION_GUARD) torch._C._jit_set_nvfuser_guard_mode(old_guard) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_channels_last_with_broadcast(self): @@ -1768,7 +2081,7 @@ def t(x: torch.Tensor, y: torch.Tensor): ''' @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, 
"requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_pw_single_reduction_partition(self): @@ -1793,62 +2106,118 @@ def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_permutation_preservation(self): sizes = [2, 3, 4, 5] dtype = torch.float device = "cuda" - x = torch.randn(sizes, dtype=dtype, device=device).to(memory_format=torch.channels_last) - def t(x: torch.Tensor): - o = torch.relu(x) - o = torch.sum(o, dim=[0]) - return o - t_jit = torch.jit.script(t) - jit_o = t_jit(x) - jit_o = t_jit(x) - o = t(x) - self.assertEqual(o.dtype, jit_o.dtype) - self.assertEqual(o, jit_o) - self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD) - # TODO: we could preserve permutation to inputs - self.assertEqual(o.stride(), jit_o.stride()) + with nvfuser_singleton_fusion(True): - def t(x: torch.Tensor): - o = torch.relu(x) - o = torch.add(o, 1.0) - return o + def t(x: torch.Tensor): + return torch.relu(x) - t_jit = torch.jit.script(t) - jit_o = t_jit(x) - jit_o = t_jit(x) - o = t(x) - self.assertEqual(o.dtype, jit_o.dtype) - self.assertEqual(o, jit_o) - self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD) - self.assertTrue(jit_o.is_contiguous(memory_format=torch.channels_last)) + t_jit = torch.jit.script(t) + x = torch.randn(sizes, dtype=dtype, device=device).to(memory_format=torch.channels_last) + self._run_helper(t_jit, t, x, check_stride=True) - @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + def t(x: torch.Tensor, y: torch.Tensor): + return torch.add(x, y) + + t_jit = torch.jit.script(t) + x = torch.randn(sizes, dtype=dtype, device=device).to(memory_format=torch.channels_last) + y = torch.randn(sizes[1:], dtype=dtype, device=device) + self._run_helper(t_jit, t, x, y, check_stride=True) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") - def test_normalization_partition(self): - sizes = [3, 8, 5] + def test_permutation_preservation_edge_case_0(self): + sizes = [2, 3, 4, 5] dtype = torch.float device = "cuda" - x = torch.randn(sizes, dtype=dtype, device=device) - y = torch.randn(sizes, dtype=dtype, device=device) - z = torch.randn(sizes, dtype=dtype, device=device) - r_m = torch.randn(8, dtype=dtype, device=device) - r_v = torch.randn(8, dtype=dtype, device=device) + x = torch.randn(sizes, dtype=dtype, device=device).to(memory_format=torch.channels_last) + # mismatch rank with *note* different permutation recognized by PE + bias = torch.randn(3, dtype=dtype, device=device).unsqueeze(-1).unsqueeze(-1) - def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor, r_mean: torch.Tensor, r_var: torch.Tensor): - o = torch.add(x, y) - o = torch.nn.functional.softmax(o, dim=0) - o = torch.add(o, z) - o = torch.nn.functional.batch_norm(o, r_mean, r_var, training=True) + def t(x, y): + return x + y + + t_jit = torch.jit.script(t) + with nvfuser_singleton_fusion(True): + self._run_helper(t_jit, t, x, bias, check_stride=True) + + @unittest.skipIf(not 
RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_permutation_preservation_edge_case_1_broken(self): + sizes = [2, 3, 4, 5] + dtype = torch.float + device = "cuda" + x = torch.randn(sizes, dtype=dtype, device=device).to(memory_format=torch.channels_last) + # in-compatible permutation, this will cause format propagation to break + bias = torch.randn(4, 5, dtype=dtype, device=device) + + def t(x, y): + return x + y + + t_jit = torch.jit.script(t) + with nvfuser_singleton_fusion(True): + for _ in range(5): + jit_o = t_jit(x, bias) + + o = t(x, bias) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertEqual(o, jit_o) + try: + # nvfuser does not support in-compatible permutation, this will throw + self.assertEqual(o.stride(), jit_o.stride()) + except Exception as e: + warnings.warn( + "permutation propagatoin is broken, proper support should come after nvfuser permutation scheduler update") + self.assertGraphContains(t_jit.graph_for(x, bias), FUSION_GUARD) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_permutation_preservation_edge_case_2(self): + sizes = [2, 3, 4, 5] + dtype = torch.float + device = "cuda" + x = torch.randn(sizes, dtype=dtype, device=device).to(memory_format=torch.channels_last) + y = torch.randn(sizes, dtype=dtype, device=device).to(memory_format=torch.channels_last) + z = torch.randn(sizes, dtype=dtype, device=device).to(memory_format=torch.channels_last) + + def t(x, y, w): + tmp = torch.lerp(x, y, w) + tmp = torch.clamp(tmp, -1.0, 0.5) + tmp = torch.nn.functional.softplus(tmp) + return torch.threshold(tmp, -2.0, 0.5) + + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x, y, z, check_stride=True) + + @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_normalization_partition(self): + sizes = [3, 8, 5] + dtype = torch.float + device = "cuda" + x = torch.randn(sizes, dtype=dtype, device=device) + y = torch.randn(sizes, dtype=dtype, device=device) + z = torch.randn(sizes, dtype=dtype, device=device) + r_m = torch.randn(8, dtype=dtype, device=device) + r_v = torch.randn(8, dtype=dtype, device=device) + + def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor, r_mean: torch.Tensor, r_var: torch.Tensor): + o = torch.add(x, y) + o = torch.nn.functional.softmax(o, dim=0) + o = torch.add(o, z) + o = torch.nn.functional.batch_norm(o, r_mean, r_var, training=True) return o t_jit = torch.jit.script(t) jit_o = t_jit(x, y, z, r_m, r_v) @@ -1859,7 +2228,7 @@ def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor, r_mean: torch.Tensor, r self.assertGraphContains(t_jit.graph_for(x, y, z, r_m, r_v), FUSION_GUARD) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_sum_to_one(self): @@ -1880,7 +2249,7 @@ def t(x: torch.Tensor): self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not 
RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_single_reduction_broadcast(self): @@ -1904,7 +2273,7 @@ def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_trivial_reduction(self): @@ -1925,7 +2294,7 @@ def t(x: torch.Tensor): self.assertEqual(o, jit_o) self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_profiling_node(self): @@ -1940,8 +2309,28 @@ def repro(x: torch.Tensor, alpha: float): repro_jit = torch.jit.script(repro) self._run_helper(repro_jit, repro, x, 0.6) + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_rand_like(self): + dtype = torch.float + device = "cuda" + + def t(x: torch.Tensor, alpha: float): + o = torch.rand_like(x) + o = torch.add(o, alpha) + return o + + # disabling cache so new inputs would generate new graph + t.__disable_jit_function_caching__ = True + + for m_format in [torch.contiguous_format, torch.channels_last]: + x = torch.randn(4, 5, 6, 7, dtype=dtype, device=device).to(memory_format=m_format) + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x, 0.6, check_stride=True) + @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_reduction_sizes_op(self): @@ -1965,7 +2354,7 @@ def t(x: torch.Tensor, y: torch.Tensor): self.assertGraphContainsExactly(t_jit.graph_for(x, y), FUSION_GUARD, 0) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_profile_ivalue(self): @@ -1988,7 +2377,28 @@ def t(x: torch.Tensor, y: torch.Tensor, dim: List[int], keepdim: bool): self.assertGraphContains(t_jit.graph_for(x, y, (0, 1), False), FUSION_GUARD) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_profile_ivalue_multiple_profiles(self): + dtype = torch.float + device = "cuda" + x = torch.randn([7, 4, 7], dtype=dtype, device=device) + + def t(x, num: int): + for i in range(num): + # varying reduction axes should break profile_ivalue + tmp = x.sum(i, keepdim=True) + # inplace add on input/output, can't be functionalized/fused + x += tmp + return x + 
+ with nvfuser_singleton_fusion(True): + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x, 3, num_fusion=0) + + @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_sum_to_size(self): @@ -2003,12 +2413,7 @@ def t(x: torch.Tensor, y: torch.Tensor, new_size: List[int]): return o t_jit = torch.jit.script(t) - jit_o = t_jit(x, y, (4, 1)) - jit_o = t_jit(x, y, (4, 1)) - o = t(x, y, (4, 1)) - self.assertEqual(o.dtype, jit_o.dtype) - self.assertEqual(o, jit_o) - self.assertGraphContains(t_jit.graph_for(x, y, (4, 1)), FUSION_GUARD) + self._run_helper(t_jit, t, x, y, (4, 1)) # update shape: old kernel should handle dynamic shape well without # recompilation @@ -2016,13 +2421,20 @@ def t(x: torch.Tensor, y: torch.Tensor, new_size: List[int]): y = torch.randn([2, 5, 8], dtype=dtype, device=device) # (TODO) check executed kernel, should extend autograd.profiler to fused # kernels - jit_o = t_jit(x, y, (5, 1)) - o = t(x, y, (5, 1)) - self.assertEqual(o.dtype, jit_o.dtype) - self.assertEqual(o, jit_o) + self._run_helper(t_jit, t, x, y, (5, 1)) + + with nvfuser_singleton_fusion(True): + x = torch.randn([2, 5, 8], dtype=dtype, device=device) + + def t(x: torch.Tensor): + # no-op reduction + return x.sum_to_size((2, 5, 8)) + + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_grad_sum_to_size(self): @@ -2081,7 +2493,7 @@ def t(x: torch.Tensor, y: torch.Tensor): self.assertEqual(x.grad, ref_x.grad) self.assertEqual(y.grad, ref_y.grad) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_dropout_inference_fusion(self): @@ -2098,7 +2510,7 @@ def t(x: torch.Tensor, p: float, train: bool): self._run_helper(t_jit, t, x, 0.15, False) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_dropout_train_nograd_fusion(self): @@ -2115,7 +2527,7 @@ def t(x: torch.Tensor, p: float, train: bool): self._run_helper(t_jit, t, x, 0.0, True) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_dropout_train_nograd_prob_check(self): @@ -2146,55 +2558,60 @@ def t(x: torch.Tensor, p: float, train: bool): self.assertGraphContainsExactly(t_jit.graph_for(x, prob, True), FUSION_GUARD, 1, consider_subgraphs=True) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_dropout_training_fusion(self): dtype = torch.float device = "cuda" - x = torch.randn([10, 4, 8], 
dtype=dtype, device=device, requires_grad=True) - grads = torch.randn([10, 4, 8], dtype=dtype, device=device) + sizes = [2, 3, 4, 5] def t(x: torch.Tensor, p: float, train: bool): o = torch.nn.functional.dropout(x, p, training=train) o = o * 2.0 return o - t_jit = torch.jit.script(t) - - # The drop probability needs to be set to zero given that the order of picking random - # numbers between eager mode and the jit is different - self._run_training_helper(t_jit, t, grads, x, 0.0, True) - def t2(x: torch.Tensor, p: float, train: bool): o = torch.nn.functional.softmax(x, dim=-1) o = torch.nn.functional.dropout(o, p, training=train) return o - t2_jit = torch.jit.script(t2) + # disabling cache so new inputs would generate new graph + t.__disable_jit_function_caching__ = True + t2.__disable_jit_function_caching__ = True - # The drop probability needs to be set to zero given that the order of picking random - # numbers between eager mode and the jit is different - self._run_training_helper(t2_jit, t2, grads, x, 0.0, True) + for fn in [t, t2]: + for m_format in [torch.contiguous_format, torch.channels_last]: + fn_jit = torch.jit.script(fn) + x = torch.randn(sizes, dtype=dtype, device=device, requires_grad=True).to(memory_format=m_format) + grads = torch.randn(sizes, dtype=dtype, device=device).to(memory_format=m_format) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + # The drop probability needs to be set to zero given that the order of picking random + # numbers between eager mode and the jit is different + self._run_training_helper(fn_jit, fn, grads, x, 0.0, True) + + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_gelu(self): + old_guard = torch._C._jit_set_nvfuser_guard_mode(True) dtype = torch.float device = "cuda" x = torch.randn([1024, 1024], dtype=dtype, device=device, requires_grad=True) grads = torch.randn([1024, 1024], dtype=dtype, device=device, requires_grad=False) - def t(x: torch.Tensor): - o = torch.nn.functional.gelu(x) + def t(x: torch.Tensor, mode: str): + o = torch.nn.functional.gelu(x, approximate=mode) o = o * 2.0 return o t_jit = torch.jit.script(t) - self._run_training_helper(t_jit, t, grads, x) + self._run_training_helper(t_jit, t, grads, x, 'none') + self._run_training_helper(t_jit, t, grads, x, 'tanh') + torch._C._jit_set_nvfuser_guard_mode(old_guard) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_dropout_training_prob_check(self): @@ -2227,13 +2644,14 @@ def t(x: torch.Tensor, p: float, train: bool): self.assertTrue((percent_zeros >= (prob - 0.01)) and (percent_zeros <= (prob + 0.01))) self.assertGraphContainsExactly(t_jit.graph_for(x, prob, True), FUSION_GUARD, 1, consider_subgraphs=True) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_linear(self): in_feature = 2 out_feature = 8 - x = torch.randn(4, in_feature, dtype=torch.float32, device='cuda') + # Changing the input dims to be 3-D to avoid eager mode bias fusion + # The bias fusion causes some precision issues with TF-32 weight = torch.randn(out_feature, in_feature, dtype=torch.float32, device='cuda') bias = torch.randn(out_feature, 
dtype=torch.float32, device='cuda') @@ -2242,17 +2660,55 @@ def t(x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor): o = torch.relu(o) return o - # bias set to true. - t_jit = torch.jit.script(t) - jit_o = t_jit(x, weight, bias) - jit_o = t_jit(x, weight, bias) - o = t(x, weight, bias) - self.assertEqual(o, jit_o) - # since the output value is not used at all, the fusion operator should - # have been optimized away - self.assertGraphContainsExactly(t_jit.graph_for(x, weight, bias), FUSION_GUARD, 1) + # disabling cache so new inputs would generate new graph + t.__disable_jit_function_caching__ = True - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + sizes = [in_feature, ] + for i in range(4): + # increase input rank in each iteration + sizes.insert(0, i + 2) + x = torch.randn(*sizes, dtype=torch.float32, device='cuda') + t_jit = torch.jit.script(t) + # fusion only happens for input rank >= 4 + has_fusion = 0 if len(sizes) < 4 else 1 + self._run_helper(t_jit, t, x, weight, bias, check_stride=True, num_fusion=has_fusion) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_linear_symbolic_shapes(self): + def fn(x: int): + y = torch.zeros((3, 4, x, x + 2)).cuda() + for i in range(2): + inp = torch.rand((3, 4, x, x + i)).cuda() + weight = torch.rand((x + 2, x + i)).cuda() + bias = torch.rand((x, x + 2)).cuda() + y += torch.sin(torch.nn.functional.linear(inp, weight, bias)) + return y + + fn_s = torch.jit.script(fn) + fn_s(5) + fn_s(5) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_conv2d_symbolic_shapes(self): + def fn(x: int): + responses = [] + for i in range(2): + inp = torch.rand((3, 3, 32, 32)).cuda() + weight = torch.rand((x + i, 3, 7, 7)).cuda() + bias = torch.rand((x + i)).cuda() + res = torch.nn.functional.conv2d(inp, weight, bias, padding=3) + responses.append(res) + return responses + + fn_s = torch.jit.script(fn) + fn_s(5) + fn_s(5) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_backward_type(self): @@ -2295,7 +2751,7 @@ def test1(x: torch.Tensor, y: torch.Tensor): self.assertEqual(y.grad.dtype, y.dtype) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_autocast_1(self): @@ -2303,7 +2759,7 @@ def t(x: torch.Tensor, y: torch.Tensor): o = x * 2.0 o = torch.softmax(o, dim=-1) o = o * 3.0 - o = torch.matmul(o, y) + o = torch._C._nn.linear(o, y) return o x = torch.randn(8, 4, dtype=torch.half, device='cuda', requires_grad=True) @@ -2314,7 +2770,7 @@ def t(x: torch.Tensor, y: torch.Tensor): for i in range(3): with torch.cuda.amp.autocast(): jit_o = t_jit(x, y) - if i == 2 : + if i == 2: fwd_graph = t_jit.graph_for(x, y) jit_o.backward(grad) @@ -2332,7 +2788,7 @@ def t(x: torch.Tensor, y: torch.Tensor): self.assertEqual(y.grad.dtype, y.dtype) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") 
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_autocast_2(self): @@ -2349,9 +2805,9 @@ def t(x: torch.Tensor): t_jit = torch.jit.script(t) for i in range(3): - with torch.cuda.amp.autocast() : + with torch.cuda.amp.autocast(): jit_o = t_jit(x) - if i == 2 : + if i == 2: fwd_graph = t_jit.graph_for(x) jit_o.backward(grad) @@ -2368,7 +2824,7 @@ def t(x: torch.Tensor): self.assertEqual(x.grad.dtype, x.dtype) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") @unittest.skipIf(not TEST_BF16, "device does not support BFloat16") @@ -2377,7 +2833,7 @@ def t(x: torch.Tensor, y: torch.Tensor): o = x * 2.0 o = torch.softmax(o, dim=-1) o = o * 3.0 - o = torch.matmul(o, y) + o = torch._C._nn.linear(o, y) return o x = torch.randn(8, 4, dtype=torch.bfloat16, device='cuda', requires_grad=True) @@ -2388,7 +2844,7 @@ def t(x: torch.Tensor, y: torch.Tensor): for i in range(3): with torch.cuda.amp.autocast(dtype=torch.bfloat16): jit_o = t_jit(x, y) - if i == 2 : + if i == 2: fwd_graph = t_jit.graph_for(x, y) jit_o.backward(grad) @@ -2406,7 +2862,7 @@ def t(x: torch.Tensor, y: torch.Tensor): self.assertEqual(y.grad.dtype, y.dtype) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") @unittest.skipIf(not TEST_BF16, "device does not support BFloat16") @@ -2424,9 +2880,9 @@ def t(x: torch.Tensor): t_jit = torch.jit.script(t) for i in range(3): - with torch.cuda.amp.autocast(dtype=torch.bfloat16) : + with torch.cuda.amp.autocast(dtype=torch.bfloat16): jit_o = t_jit(x) - if i == 2 : + if i == 2: fwd_graph = t_jit.graph_for(x) jit_o.backward(grad) @@ -2442,7 +2898,7 @@ def t(x: torch.Tensor): self.assertEqual(jit_o.dtype, torch.float) self.assertEqual(x.grad.dtype, x.dtype) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_to_dtype_fp32_to_fp16(self): @@ -2461,7 +2917,7 @@ def t(x: torch.Tensor): self.assertGraphContainsExactly(t_jit.graph_for(x), FUSION_GUARD, 1) self.assertEqual(jit_o.dtype, torch.half) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_to_dtype_fp16_to_fp32(self): @@ -2480,7 +2936,7 @@ def t(x: torch.Tensor): self.assertGraphContainsExactly(t_jit.graph_for(x), FUSION_GUARD, 1) self.assertEqual(jit_o.dtype, torch.float) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_to_dtype_fp16_to_fp16(self): @@ -2499,7 +2955,7 @@ def t(x: torch.Tensor): self.assertGraphContainsExactly(t_jit.graph_for(x), FUSION_GUARD, 1) self.assertEqual(jit_o.dtype, torch.half) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not 
RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") @unittest.skipIf(not TEST_BF16, "device does not support BFloat16") @@ -2519,7 +2975,7 @@ def t(x: torch.Tensor): self.assertGraphContainsExactly(t_jit.graph_for(x), FUSION_GUARD, 1) self.assertEqual(jit_o.dtype, torch.bfloat16) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") @unittest.skipIf(not TEST_BF16, "device does not support BFloat16") @@ -2539,7 +2995,7 @@ def t(x: torch.Tensor): self.assertGraphContainsExactly(t_jit.graph_for(x), FUSION_GUARD, 1) self.assertEqual(jit_o.dtype, torch.float) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") @unittest.skipIf(not TEST_BF16, "device does not support BFloat16") @@ -2559,7 +3015,7 @@ def t(x: torch.Tensor): self.assertGraphContainsExactly(t_jit.graph_for(x), FUSION_GUARD, 1) self.assertEqual(jit_o.dtype, torch.bfloat16) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(not TEST_MULTIGPU, "requires multiple CUDA device") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") @@ -2581,7 +3037,7 @@ def t(x): x = x.to("cuda:1") jit_o = t_jit(x) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_graph_for_with_missing_optimized_engine(self): @@ -2608,7 +3064,7 @@ def t(x: torch.Tensor, flag: bool): # have been optimized away self.assertGraphContainsExactly(t_jit.graph_for(x, True), FUSION_GUARD, 1, True) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_branches(self): @@ -2638,7 +3094,7 @@ def t(x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, flag: bool): # have been optimized away self.assertGraphContainsExactly(t_jit.graph_for(x, weight, bias, True), FUSION_GUARD, 1) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_scalar_tensor(self): @@ -2661,7 +3117,7 @@ def t(x: torch.Tensor): @unittest.skipIf(os.environ.get('PYTORCH_NO_CUDA_MEMORY_CACHING') is not None, "skipping graph_rng when caching allocator is disabled") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(CUDA_MAJOR < 11, "requires CUDA11 or above") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") @@ -2728,8 +3184,8 @@ def __init__(self, num_features=10, affine=True, track_running_stats=True): track_running_stats=track_running_stats).to(dtype=dtype) def forward(self, x): - o = x * 2.0 - o = self.bn(o) + o = self.bn(x) + o = o * 2.0 return o x = torch.randn(batch, c, hw, hw, dtype=torch.float, 
device="cuda").to(dtype=dtype).requires_grad_() @@ -2818,7 +3274,7 @@ def forward(self, x): e0)) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_batch_norm_half(self): @@ -2833,7 +3289,25 @@ def test_batch_norm_half(self): self._test_batch_norm_impl_index_helper(4, 8, 5, affine, track_running_stats, training, torch.half) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_batch_norm_impl_index_inner_bcast(self): + # the repro + self._test_batch_norm_impl_index_helper(2, 1, 1, False, True, True) + + # running the full set + setups = [ + [True, True], + [False, False], + [True, False], + [False, True]] + for training_and_track, affine in itertools.product(setups, [True, False]): + training, track_running_stats = training_and_track + self._test_batch_norm_impl_index_helper(2, 1, 1, affine, track_running_stats, training) + + @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_batch_norm_impl_index_correctness(self): @@ -2857,7 +3331,7 @@ def test_batch_norm_impl_index_correctness(self): training, track_running_stats = training_and_track self._test_batch_norm_impl_index_helper(b, c, hw, affine, track_running_stats, training) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_softplus_fuser(self): @@ -2883,7 +3357,7 @@ def shifted_softplus(x: torch.Tensor, shift: float): assert torch.allclose(jit_grad, aten_grad) self.assertGraphContains(jitted.graph_for(inp, 0.693147), FUSION_GROUP, True) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_inplace_removal(self): @@ -2903,7 +3377,7 @@ def t(x: torch.Tensor): self.assertGraphContains(graph, 'aten::add', True) self.assertGraphContains(graph, 'aten::relu', True) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_conv2d_bias(self): @@ -2936,7 +3410,8 @@ def t_not_fused(x: torch.Tensor, w: torch.Tensor): self.assertGraphContains(graph, 'aten::relu', True) def t_bias(x: torch.Tensor, w: torch.Tensor, bias: torch.Tensor): - return torch.nn.functional.conv2d(x, w, bias) + o = torch.nn.functional.conv2d(x, w, bias) + return o.relu() jitted_bias = torch.jit.script(t_bias) @@ -2944,11 +3419,11 @@ def t_bias(x: torch.Tensor, w: torch.Tensor, bias: torch.Tensor): jit_o = jitted_bias(inp, weight, bias) graph = jitted_bias.graph_for(inp) - self.assertGraphContainsExactly(graph, FUSION_GROUP, 0) + self.assertGraphContains(graph, FUSION_GROUP, 
True) self.assertGraphContains(graph, 'prim::add_optional', True) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_remove_output_used_only_in_dtype(self): @@ -2981,7 +3456,7 @@ def forward(self, x, y): self.assertGraphContains(graph, FUSION_GROUP, True) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_fix_shape_expression_bn(self): @@ -3013,31 +3488,6 @@ def forward(self, x, y): graph = jitted.graph_for(x, y) self.assertGraphContains(graph, FUSION_GROUP, True) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") - @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, - "Requires fusion optimization pass to be effective") - def test_linear_1d_weight_mismatch_bias_dtype(self): - def t(x: torch.Tensor, w: torch.Tensor, b: torch.Tensor): - o = torch.nn.functional.linear(x, w, b) - return o.relu() - - device = "cuda" - jitted = torch.jit.script(t) - x = torch.randn(2, 5, 5, dtype=torch.half, device=device) - w = torch.randn(5, dtype=torch.half, device=device) - b = torch.randn(5, dtype=torch.float32, device=device) - - for i in range(3): - jit_o = jitted(x, w, b) - jit_o = jitted(x, w, b) - o = t(x, w, b) - self.assertEqual(o, jit_o) - self.assertEqual(o.dtype, jit_o.dtype) - self.assertEqual(o.size(), jit_o.size()) - graph = jitted.graph_for(x, w, b) - self.assertGraphContains(graph, FUSION_GROUP, True) - self.assertGraphContains(graph, 'aten::matmul', True) - def _run_fwd_helper(self, func, ops, *args): jitted = torch.jit.script(func) for i in range(3): @@ -3052,7 +3502,8 @@ def _run_fwd_helper(self, func, ops, *args): for op in ops: self.assertGraphContainsExactly(graph, op, 0) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_sibling_fusion(self): @@ -3065,7 +3516,7 @@ def t(x: torch.Tensor): o1 = x + 1.0 o2 = x * 0.5 return o1, o2 - self._run_fwd_helper(t, ['aten::add'], x) + self._run_fwd_helper(t, ['aten::add', 'aten::mul'], x) def t2(x: torch.Tensor, y: torch.Tensor): o1 = x.sum(0) @@ -3073,8 +3524,7 @@ def t2(x: torch.Tensor, y: torch.Tensor): return o1, o2 self._run_fwd_helper(t2, ['aten::sum', 'aten::mul'], x, y) - @unittest.skipIf(True, "Fixed in PR #68804") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_clean_profile_ivalue(self): @@ -3090,13 +3540,13 @@ def t(x: torch.Tensor, flag: bool): return torch.dropout(x, 0.5, flag) jit_t = torch.jit.script(t) - for idx in range(5) : + for idx in range(5): out = jit_t(x, True) graph = jit_t.graph_for(x, True) out = jit_t(x, False) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion 
optimization pass to be effective") def test_sibling_fusion_no_scalar_inputs(self): @@ -3117,79 +3567,1248 @@ def t(x: torch.Tensor, y: torch.Tensor): graph = jitted.graph_for(x, y) self.assertGraphContainsExactly(graph, FUSION_GROUP, 0) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") - @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, - "Requires fusion optimization pass to be effective") - def test_singleton_fusion(self): - x = torch.randn(4, 2, device="cuda") + def _bias_view_relu_helper(self, shape, output_shape, dtype, device, error): + class BiasViewRelu(torch.nn.Module): + def __init__(self): + super(BiasViewRelu, self).__init__() + self.bias = torch.nn.Parameter(torch.randn(shape, dtype=dtype, device=device), requires_grad=False) + with torch.no_grad(): + self.bias.fill_(10) + + def forward(self, inputs: torch.Tensor, view_shape: List[int]): + o = inputs + self.bias + o = o.view(view_shape) + return torch.relu(o) + + t = BiasViewRelu() + x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + t_jit = torch.jit.script(t) - with nvfuser_singleton_fusion(True): - def t(x): - return x.relu() + # profiling + jit_o = t_jit(x, output_shape) + # optimization + jit_o = t_jit(x, output_shape) + # final + jit_o = t_jit(x, output_shape) + # eager - baseline + o = t(x, output_shape) - t_jit = torch.jit.script(t) - self._run_helper(t_jit, t, x) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertTrue(self._compare("comparing output failed", o, jit_o, error)) + graph = t_jit.graph_for(x, output_shape) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + has_inferred_dimension = any([dim == -1 for dim in output_shape]) + if has_inferred_dimension: + # prohibit fusing when view_shape contains an inferred dimension + self.assertGraphContainsExactly(graph, FUSION_GROUP, 0) + self.assertGraphContainsExactly(graph, 'prim::view_copy', 0) + else: + self.assertGraphContains(graph, FUSION_GUARD) + self.assertGraphContains(graph, 'prim::view_copy', True) + + def _alias_bias_view_relu_helper(self, shape, output_shape, dtype, device, error): + class BiasViewRelu(torch.nn.Module): + def __init__(self): + super(BiasViewRelu, self).__init__() + self.bias = torch.nn.Parameter(torch.randn(shape, dtype=dtype, device=device), requires_grad=False) + with torch.no_grad(): + self.bias.fill_(10) + + def forward(self, inputs : torch.Tensor, bias : torch.Tensor, view_shape : List[int]): + o = inputs.view(view_shape) + inputs.add_(bias) + return torch.relu(o) + + t = BiasViewRelu() + x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + bias = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + t_jit = torch.jit.script(t) + + # profiling + jit_o = t_jit(x.clone(), bias, output_shape) + # optimization + jit_o = t_jit(x.clone(), bias, output_shape) + # final + jit_o = t_jit(x.clone(), bias, output_shape) + # eager - baseline + o = t(x.clone(), bias, output_shape) + + self.assertEqual(o.dtype, jit_o.dtype) + self.assertTrue(self._compare("comparing output failed", o, jit_o, error)) + graph = t_jit.graph_for(x, bias, output_shape) + self.assertGraphContainsExactly(graph, FUSION_GUARD, 0) + self.assertGraphContainsExactly(graph, 'prim::view_copy', 0) + + # generate random view given original view + def _random_view(self, original_view, max_len=8, max_views=10000): + class Moves(enum.Enum): + Merge = 0 + Split = 1 + Broadcast = 2 + ImplicitBroadcast = 3 + Keep = 4 + + def valid(old_view, new_view): + old_view_size = reduce(operator.mul, old_view) + 
new_view_size = reduce(operator.mul, new_view) + return old_view_size == new_view_size + + # given a random starting number, find the nearest divisor + def find_nearest_divisor(N): + if 2 >= (N - 1): + return -1 + result = random.randint(2, N - 1) + while (N % result) != 0: + result += 1 + return result + + complete_views = set([tuple(original_view)]) + + to_visit = [] + # empty new view, current original view, start pos=0, move count = 0, last_move + to_visit.append(([], original_view, 0, [], Moves.Keep)) + + # depth-first search of view shapes, starting from the original view + while len(to_visit) > 0 and len(complete_views) < max_views: + new_view, old_view, odx, move_list, last_move = to_visit[-1] + to_visit.pop() + + # iterate over each move type + for idx in range(len(Moves)): + state = Moves(idx) + new_view_clone = copy.deepcopy(new_view) + old_view_clone = copy.deepcopy(old_view) + new_move_list = move_list + [state] + new_odx = odx + + # Update state using Move state + if state == Moves.Keep: + new_size = old_view_clone[odx] + new_view_clone.append(new_size) + new_odx += 1 + + elif state == Moves.Merge: + if odx + 1 < len(old_view_clone): + new_size = old_view_clone[odx] * old_view_clone[odx + 1] + new_view_clone.append(new_size) + new_odx += 2 + else: + continue + + elif state == Moves.Broadcast and last_move != Moves.Broadcast: + new_view_clone.append(1) + + elif state == Moves.Split: + new_size = find_nearest_divisor(old_view_clone[odx]) + if new_size == -1: + continue + new_view_clone.append(new_size) + old_view_clone[odx] = int(old_view[odx] / new_size) + + if old_view_clone[odx] == 1: + new_odx += 1 + + elif state == Moves.ImplicitBroadcast: + old_view_clone.insert(odx + 1, 1) + new_size = old_view[odx] * 1 + new_view_clone.append(new_size) + new_odx += 2 + + if new_odx < len(old_view_clone) and len(new_move_list) < max_len: + to_visit.append((new_view_clone, old_view_clone, new_odx, new_move_list, state)) + elif (valid(original_view, new_view_clone)): + final_new_view = tuple(new_view_clone) + complete_views.add(final_new_view) + return list(complete_views) + + # ndims - number of dimensions + # test_fn - view test function + def _view_test_generator(self, ndims, test_fn): + # create random tensor + # max value for each dimension + max_size = 10e7 + max_value = max(int(pow(max_size, 1. 
/ ndims)), 1) + sizes = [random.randint(1, max_value) for idx in range(ndims)] + x = torch.randn(sizes) + + original_sizes = list(x.size()) + all_views = self._random_view(original_sizes) + random.shuffle(all_views) + + max_samples = 20 + max_views = min(len(all_views), max_samples) + total = 0 + correct = 0 + # test random combinations of compatible views + for idx in range(max_views): + for jdx in range(idx + 1, max_views): + total += 1 + test_fn(all_views[idx], all_views[jdx], torch.float, 'cuda', 1e-6) + + @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since view is disabled now") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") - def test_disable_sibling_fuse(self): - x = torch.randn(4, 2, device="cuda") - y = torch.randn(8, device="cuda") - s = torch.tensor(1.5, device="cuda") + def test_view(self): + torch._C._jit_set_nvfuser_guard_mode(True) + self._bias_view_relu_helper([2, 3, 4, 5], [-1, 4, 5], torch.float, 'cuda', 1e-6) + for ndims in range(1, 5): + self._view_test_generator(ndims, self._bias_view_relu_helper) + self._alias_bias_view_relu_helper([2, 3, 4, 5], [1, 6, 1, 2, 2, 5, 1], torch.float, 'cuda', 1e-6) - with nvfuser_horizontal_fusion(False): - def t(x, y, s): - o1 = x + s - o2 = y + s - return o1, o2 + def _bias_flatten_relu_helper(self, shape, start_dim, end_dim, dtype, device, error): + class BiasFlattenRelu(torch.nn.Module): + def __init__(self): + super(BiasFlattenRelu, self).__init__() + self.bias = torch.nn.Parameter(torch.randn(shape, dtype=dtype, device=device), requires_grad=False) + with torch.no_grad(): + self.bias.fill_(10) + + def forward(self, inputs : torch.Tensor, start_dim : int, end_dim : int): + o = inputs + self.bias + o = o.flatten(start_dim, end_dim) + return torch.relu(o) + + t = BiasFlattenRelu() + x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + t_jit = torch.jit.script(t) - t_jit = torch.jit.script(t) - for i in range(5): - t_jit(x, y, s) + self._run_helper(t_jit, t, x, start_dim, end_dim) + self.assertGraphContains(t_jit.graph_for(x, start_dim, end_dim), 'prim::flatten_copy', True) - # sibling fusion should be disabled with the flag - self.assertGraphContainsExactly(t_jit.graph_for(x, y, s), FUSION_GUARD, 0) + def _alias_bias_flatten_relu_helper(self, shape, start_dim, end_dim, dtype, device, error): + class BiasFlattenRelu(torch.nn.Module): + def __init__(self): + super(BiasFlattenRelu, self).__init__() + self.bias = torch.nn.Parameter(torch.randn(shape, dtype=dtype, device=device), requires_grad=False) + with torch.no_grad(): + self.bias.fill_(10) + + def forward(self, inputs : torch.Tensor, bias : torch.Tensor, start_dim : int, end_dim : int): + o = inputs.flatten(start_dim, end_dim) + inputs.add_(bias) + return torch.relu(o) + + t = BiasFlattenRelu() + x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + bias = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + t_jit = torch.jit.script(t) -class TestPassManagerCudaFuser(JitTestCase): + # profiling + jit_o = t_jit(x.clone(), bias, start_dim, end_dim) + # optimization + jit_o = t_jit(x.clone(), bias, start_dim, end_dim) + # final + jit_o = t_jit(x.clone(), bias, start_dim, end_dim) + # eager - baseline + o = t(x.clone(), bias, start_dim, end_dim) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + self.assertEqual(o.dtype, jit_o.dtype) + self.assertTrue(self._compare("comparing output failed", o, jit_o, 
error)) + graph = t_jit.graph_for(x, bias, start_dim, end_dim) + + self.assertGraphContainsExactly(graph, FUSION_GUARD, 0) + self.assertGraphContainsExactly(graph, 'prim::flatten_copy', 0) + + @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since flatten is disabled now") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") - def test_context_manager_test(self): - x = torch.randn(4, 8, dtype=torch.float, device="cuda") - y = torch.randn(4, 8, dtype=torch.float, device="cuda") - with torch.jit.fuser('fuser2'): - with torch.jit.fuser('fuser2'): + def test_flatten(self): + torch._C._jit_set_nvfuser_guard_mode(True) + self._bias_flatten_relu_helper([2, 3, 4, 5], 0, -1, torch.float, 'cuda', 1e-6) + self._bias_flatten_relu_helper([2, 3, 4, 5], 1, -1, torch.float, 'cuda', 1e-6) + self._bias_flatten_relu_helper([2, 3, 4, 5], 2, -1, torch.float, 'cuda', 1e-6) + self._bias_flatten_relu_helper([2, 3, 4, 5], 0, 3, torch.float, 'cuda', 1e-6) + self._bias_flatten_relu_helper([2, 3, 4, 5], 1, 2, torch.float, 'cuda', 1e-6) + self._bias_flatten_relu_helper([2, 3, 4, 5], 2, 2, torch.float, 'cuda', 1e-6) + self._alias_bias_flatten_relu_helper([2, 3, 4, 5], 0, -1, torch.float, 'cuda', 1e-6) + self._alias_bias_flatten_relu_helper([2, 3, 4, 5], 1, -1, torch.float, 'cuda', 1e-6) + self._alias_bias_flatten_relu_helper([2, 3, 4, 5], 2, -1, torch.float, 'cuda', 1e-6) + self._alias_bias_flatten_relu_helper([2, 3, 4, 5], 0, 3, torch.float, 'cuda', 1e-6) + self._alias_bias_flatten_relu_helper([2, 3, 4, 5], 1, 2, torch.float, 'cuda', 1e-6) + self._alias_bias_flatten_relu_helper([2, 3, 4, 5], 2, 2, torch.float, 'cuda', 1e-6) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_strict_fusion(self): + def success(x): + with torch.jit.strict_fusion(): + return x + x + x + + scripted = self.checkScript(success, (torch.rand([4], device='cuda'),)) + g = torch.jit.last_executed_optimized_graph() + FileCheck().check_not("aten::add").check("prim::CudaFusionGroup").run(g) + + def failure(x): + with torch.jit.strict_fusion(): + return x + torch.mm(x, x) + x + + with self.assertRaises(Exception) as error_out: + foo_s = torch.jit.script(failure) + foo_s(torch.rand([4, 4])) + foo_s(torch.rand([4, 4])) + + fc = FileCheck().check("Found unfused operators") + fc.check("aten::mm").run(str(error_out.exception)) + + def _ltc_helper(self, shape, dtype, device, error, approximate=True): + # modeled after LTC linear layer + class LTC(torch.nn.Module): + def __init__(self): + super(LTC, self).__init__() + self.weight = torch.nn.Parameter(torch.randn([1024, 1024], dtype=dtype, device=device), requires_grad=False) + self.bias = torch.nn.Parameter(torch.randn([1, 1024], dtype=dtype, device=device), requires_grad=False) + + def forward(self, inputs : torch.Tensor): + o = inputs.view([32768, 1024]) + o = torch.mm(o, self.weight) + o = o.view([256, 128, 1024]) + o = o + self.bias + o = o.view([32768, 1024]) + o = o.view([256, 128, 1024]) + return torch.nn.functional.gelu(o) + + t = LTC() + x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + t_jit = torch.jit.script(t) - def t1(x, y): - o = x + y - o = o + 2.0 - return o - t_jit = torch.jit.script(t1) - t_jit(x, y) - t_jit(x, y) - self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GUARD) + # profile/optimization runs + for i 
in range(3): + jit_o = t_jit(x) + o = t(x) - def t2(x, y): - o = x + y - o = o + 3.0 - return o - t_jit_2 = torch.jit.script(t2) - t_jit_2(x, y) - t_jit_2(x, y) - self.assertGraphContains(t_jit_2.graph_for(x, y), FUSION_GUARD) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertTrue(self._compare("comparing output failed", o, jit_o, error)) + graph = t_jit.graph_for(x) + self.assertGraphContains(graph, FUSION_GUARD) + self.assertGraphContains(graph, 'prim::view_copy', True) - def t3(x, y): - o = x + y - o = o + 4.0 - return o - t_jit_3 = torch.jit.script(t3) - t_jit_3(x, y) - t_jit_3(x, y) + @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since view is disabled now") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_nested_view(self): + self._ltc_helper([256, 128, 1024], torch.float, 'cuda', 1e-6) + + def _bias_squeeze_relu_helper(self, shape, dtype, device, error): + class BiasSqueezeRelu(torch.nn.Module): + def __init__(self): + super(BiasSqueezeRelu, self).__init__() + + def forward(self, inputs: torch.Tensor, bias: torch.Tensor): + o = inputs + bias + o = torch.squeeze(o) + return torch.relu(o) + + t = BiasSqueezeRelu() + x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + bias = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + t_jit = torch.jit.script(t) + + jit_o = t_jit(x, bias) + jit_o = t_jit(x, bias) + jit_o = t_jit(x, bias) + o = t(x, bias) + + self.assertEqual(o.dtype, jit_o.dtype) + self.assertTrue(self._compare("comparing output failed", o, jit_o, error)) + graph = t_jit.graph_for(x, bias) + self.assertGraphContains(graph, FUSION_GUARD) + self.assertGraphContains(graph, 'prim::squeeze_copy', True) + + def _alias_bias_squeeze_relu_helper(self, shape, dtype, device, error): + class BiasSqueezeRelu(torch.nn.Module): + def __init__(self): + super(BiasSqueezeRelu, self).__init__() + + def forward(self, inputs: torch.Tensor, bias: torch.Tensor): + o = torch.squeeze(inputs) + inputs.add_(bias) + return torch.relu(o) + + t = BiasSqueezeRelu() + x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + bias = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + t_jit = torch.jit.script(t) + + jit_o = t_jit(x.clone(), bias) + jit_o = t_jit(x.clone(), bias) + jit_o = t_jit(x.clone(), bias) + o = t(x.clone(), bias) + + self.assertEqual(o.dtype, jit_o.dtype) + self.assertTrue(self._compare("comparing output failed", o, jit_o, error)) + graph = t_jit.graph_for(x, bias) + self.assertGraphContainsExactly(graph, FUSION_GUARD, 0) + self.assertGraphContainsExactly(graph, 'prim::squeeze_copy', 0) + + @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since squeeze/unsqueeze is disabled now") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_squeeze(self): + self._bias_squeeze_relu_helper([1, 6, 1, 2, 2, 5, 1], torch.float, 'cuda', 1e-6) + self._alias_bias_squeeze_relu_helper([1, 6, 1, 2, 2, 5, 1], torch.float, 'cuda', 1e-6) + + @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since squeeze/unsqueeze is disabled now") + # remove this after opinfo tests are enabled + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def 
test_squeeze_zero(self): + x = torch.tensor(1.0, dtype=torch.float, device="cuda") + + def squeeze_0(x: torch.Tensor): + o = x + 1. + o = torch.squeeze(o, 0) + o = o * 2. + return o + + def squeeze_1(x: torch.Tensor): + o = x + 1. + o = torch.squeeze(o, -1) + o = o + .5 + return o + + squeeze_0_jit = torch.jit.script(squeeze_0) + self._run_helper(squeeze_0_jit, squeeze_0, x) + squeeze_1_jit = torch.jit.script(squeeze_1) + self._run_helper(squeeze_1_jit, squeeze_1, x) + + def _bias_unsqueeze_relu_helper(self, shape, dtype, device, error): + class BiasUnsqueezeRelu(torch.nn.Module): + def __init__(self): + super(BiasUnsqueezeRelu, self).__init__() + + def forward(self, inputs: torch.Tensor, bias: torch.Tensor): + o = inputs + bias + o = torch.unsqueeze(o, 0) + return torch.relu(o) + + t = BiasUnsqueezeRelu() + x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + bias = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + t_jit = torch.jit.script(t) + + jit_o = t_jit(x, bias) + jit_o = t_jit(x, bias) + jit_o = t_jit(x, bias) + o = t(x, bias) + + self.assertEqual(o.dtype, jit_o.dtype) + self.assertTrue(self._compare("comparing output failed", o, jit_o, error)) + graph = t_jit.graph_for(x, bias) + self.assertGraphContains(graph, FUSION_GUARD) + self.assertGraphContains(graph, 'prim::unsqueeze_copy', True) + + def _alias_bias_unsqueeze_relu_helper(self, shape, dtype, device, error): + class BiasUnsqueezeRelu(torch.nn.Module): + def __init__(self): + super(BiasUnsqueezeRelu, self).__init__() + + def forward(self, inputs : torch.Tensor, bias : torch.Tensor): + o = torch.unsqueeze(inputs, 0) + inputs.add_(bias) + return torch.relu(o) + + t = BiasUnsqueezeRelu() + x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + bias = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + t_jit = torch.jit.script(t) + + jit_o = t_jit(x.clone(), bias) + jit_o = t_jit(x.clone(), bias) + jit_o = t_jit(x.clone(), bias) + o = t(x.clone(), bias) + + self.assertEqual(o.dtype, jit_o.dtype) + self.assertTrue(self._compare("comparing output failed", o, jit_o, error)) + graph = t_jit.graph_for(x, bias) + self.assertGraphContainsExactly(graph, FUSION_GUARD, 0) + self.assertGraphContainsExactly(graph, 'prim::unsqueeze_copy', 0) + + @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since squeeze/unsqueeze is disabled now") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_unsqueeze(self): + self._bias_unsqueeze_relu_helper([2, 3, 4, 5], torch.float, 'cuda', 1e-6) + self._alias_bias_unsqueeze_relu_helper([2, 3, 4, 5], torch.float, 'cuda', 1e-6) + + @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since unsqueeze is disabled now") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_alias_pass_fix(self): + x = torch.randn(4, 24, 2, 2, dtype=torch.float, device="cuda") + w = torch.randn(24, 24, 1, 1, dtype=torch.float, device="cuda") + b = torch.randn(24, dtype=torch.float, device="cuda") + + def t(x, w, b): + b2 = b + 1.0 + o = torch.conv2d(x, w, b2) + return o + + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x, w, b) + + @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since squeeze/unsqueeze is disabled now") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") 
+ @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_squeeze_negative_dim(self): + x = torch.randn(4, 24, 1, 2, dtype=torch.float, device="cuda") + + def t(x): + o = x + 1.0 + o = o.squeeze(-2) + o = o * 2.0 + return o + + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_singleton_fusion(self): + x = torch.randn(4, 2, device="cuda") + + with nvfuser_singleton_fusion(True): + def t(x): + return x.relu() + + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_issue1445_fusion(self): + def f(t0, t1, t2, t3): + masked_input = torch.where(t1, t2, t3) + total = masked_input.sum([0, 1, 2, 3]) + sizes : List[int] = [] + t10 = torch.reshape(t0, sizes) + t7 = total / t10 + t4 = t7.to(dtype=torch.float) + return t4 + + x = torch.randn(1, 1, 1, 1, device='cuda').to(dtype=torch.long) + y = torch.randn(3, 2, 1, 1, device='cuda').to(dtype=torch.bool).expand([3, 2, 1, 2]) + z = torch.randn(3, 2, 1, 2, device='cuda') + w = torch.tensor(1.5, device='cuda') + + f_jit = torch.jit.script(f) + for i in range(5): + out_jit = f_jit(x, y, z, w) + out = f(x, y, z, w) + self.assertEqual(out, out_jit) + self.assertGraphContainsExactly(f_jit.graph_for(x, y, z, w), FUSION_GROUP, 1) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_disable_sibling_fuse(self): + x = torch.randn(4, 2, device="cuda") + y = torch.randn(8, device="cuda") + s = torch.tensor(1.5, device="cuda") + + with nvfuser_horizontal_fusion(False): + def t(x, y, s): + o1 = x + s + o2 = y + s + return o1, o2 + + t_jit = torch.jit.script(t) + for i in range(5): + t_jit(x, y, s) + + # sibling fusion should be disabled with the flag + self.assertGraphContainsExactly(t_jit.graph_for(x, y, s), FUSION_GUARD, 0) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_build_shape_expression_native_dropout(self): + x = torch.randn(4, 2, device="cuda") + + def t(x): + o, mask = torch.native_dropout(x, 0.0, True) + o1 = o.sigmoid() + o2 = mask.float().sigmoid() + return (o1, o2) + + t_jit = torch.jit.script(t) + + jit_o = t_jit(x) + jit_o = t_jit(x) + o = t(x) + for oo, jit_oo in zip(o, jit_o): + self.assertEqual(oo.dtype, jit_oo.dtype) + self.assertEqual(oo, jit_oo) + self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_scalar_tensor_permuted(self): + x = torch.randn(4, 2, 3, device="cuda").permute([1, 2, 0]) + y = torch.tensor(1.0, device="cuda") + + with nvfuser_singleton_fusion(True): + def t(x, y): + return x + y + + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x, y) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def 
test_cpu_scalar(self): + x = torch.randn(4, 2, 3, device="cuda") + y = torch.tensor(1.0, device="cpu") + z = torch.tensor(2.0, device="cpu") + + with nvfuser_singleton_fusion(True): + # testing cpu scalar tensor promotion + def t(x, y): + return x + y + + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x, y) + + # scalar cpu tensor add should NOT be fused + @torch.jit.script + def t1(y, z): + return y * z + for _ in range(5): + t1(y, z) + self.assertGraphContainsExactly(t1.graph_for(y, z), FUSION_GUARD, 0) + + # everything, including scalar cpu tensor add should be fused + @torch.jit.script + def t2(x, y, z): + tmp = y + z + return tmp + x + for _ in range(5): + t2(x, y, z) + self.assertGraphContainsExactly(t2.graph_for(x, y, z), 'aten::add', 0) + self.assertGraphContainsExactly(t2.graph_for(x, y, z), FUSION_GUARD, 1) + + # 'cpu_tmp = y + z' shouldn't be fused. + @torch.jit.script + def t3(x, y, z): + cpu_tmp = y + z + out = x + y + return cpu_tmp, out + for _ in range(5): + t3(x, y, z) + self.assertGraphContainsExactly(t3.graph_for(x, y, z), FUSION_GUARD, 1) + self.assertGraphContainsExactly(t3.graph_for(x, y, z), 'aten::add', 1) + + @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since squeeze/unsqueeze is disabled now") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_shape_expression(self): + x = torch.randn(4, 2, 1, 3, device="cuda") + + def t_unsqueeze(x): + t0 = x.relu() + t1 = t0.unsqueeze(1) + t2 = t1 + 1.0 + t3 = t1.size() + return t2, t3 + + def t_squeeze(x): + t0 = x.relu() + t1 = t0.squeeze() + t2 = t1 + 1.0 + t3 = t1.size() + return t2, t3 + + def t_squeeze_dim(x): + t0 = x.relu() + t1 = t0.squeeze(-2) + t2 = t1 + 1.0 + t3 = t1.size() + return t2, t3 + + # squeezing a non-size 1 dimension should be a no op + def t_squeeze_dim_no_op(x): + t0 = x.relu() + t1 = t0.squeeze(1) + t2 = t1 + 1.0 + t3 = t1.size() + return t2, t3 + + def run(fn): + jit_fn = torch.jit.script(fn) + jit_o = jit_fn(x) + jit_o = jit_fn(x) + jit_o = jit_fn(x) + o = fn(x) + # output 0 is a tensor, so we check dtype and value + self.assertEqual(o[0].dtype, jit_o[0].dtype) + self.assertEqual(o[0], jit_o[0]) + # output 1 is shape + self.assertEqual(o[1], jit_o[1]) + self.assertGraphContainsExactly(jit_fn.graph_for(x), FUSION_GUARD, 1) + + for t in [t_unsqueeze, t_squeeze, t_squeeze_dim, t_squeeze_dim_no_op]: + run(t) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_scalar_cuda_tensor(self): + x = torch.tensor(2.0, device="cuda") + + with nvfuser_singleton_fusion(True): + def t(x): + return x + 1.0 + + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x) + + @torch.jit.script + def t_jitted(x): + return x.sum(0) + + for i in range(5): + t_jitted(x) + self.assertGraphContainsExactly(t_jitted.graph_for(x), FUSION_GUARD, 0) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_overlapped_input(self): + x = torch.randn(8, device="cuda").as_strided((2, 4), (1, 1)) + + with nvfuser_singleton_fusion(True): + def t(x): + return x + 1.0 + + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != 
ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + def test_reduction_empty_axes(self): + x = torch.randn(4, 2, 3, device="cuda").permute([1, 2, 0]) + + with nvfuser_singleton_fusion(True): + def t(x): + sizes : List[int] = [] + return x.sum(sizes) + + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + def test_int_tensor_input(self): + x = torch.randn(4, 2, device="cuda").to(dtype=torch.int) + + with nvfuser_singleton_fusion(True): + def t(x): + return x.amax(dim=0) + + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_to_boolean(self): + x = torch.randn(4, 2, device="cuda") + + with nvfuser_singleton_fusion(True): + def t(x): + return x.to(dtype=torch.bool) + + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x) + + @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since reshape is disabled now") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_view_copy_graph_guard(self): + x = torch.randn(4, 2, 3, device="cuda").permute([1, 2, 0]) + y = [4, 6] + + with nvfuser_singleton_fusion(True): + def t(x, y : List[int]): + t1 = x + 1.0 + t2 = t1 * 1.0 + out = t2.reshape(y) + return out.relu() + + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x, y) + + @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since view is disabled now") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_view_copy_graph_guard_double_fusion(self): + x = torch.randn(2, 2, 5, device="cuda") + w = torch.randn(5, 5, device="cuda") + + with nvfuser_singleton_fusion(True): + def t(x, w): + o = x.view([4, x.size()[-1]]) + o = torch.matmul(o, w) + o = o.view([2, 2, o.size()[1]]) + return o + + t_jit = torch.jit.script(t) + for i in range(3): + jit_o = t_jit(x, w) + o = t(x, w) + self.assertEqual(jit_o, o) + self.assertGraphContainsExactly(t_jit.graph_for(x, w), FUSION_GUARD, 2, consider_subgraphs=True) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_input_output_passthrough(self): + def t(t0, t1, t2): + mask = t1.to(dtype=torch.bool) + masked_input = torch.where(t0, mask, t2) + return masked_input, mask + + t_jit = torch.jit.script(t) + # stick to integers, this avoid the numerical difference due to our + # promotion + x = torch.randn(4, 4, device='cuda').to(dtype=torch.bool) + y = torch.randn(4, 4, device='cuda').to(dtype=torch.bool) + z = torch.tensor(1.0, device='cuda').to(dtype=torch.bool) + jit_o = t_jit(x, y, z) + jit_o = t_jit(x, y, z) + o = t(x, y, z) + for oo, jit_oo in zip(o, jit_o): + self.assertEqual(oo.dtype, jit_oo.dtype) + self.assertEqual(oo, jit_oo) + self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD) + + 
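    # A minimal sketch of the warm-up-then-check pattern the tests in this class share,
    # assuming a CUDA build with NVFuser enabled and FUSION_GUARD being the guard node
    # name used throughout this file; the helper name below is illustrative only. The
    # profiling executor needs a few invocations to record shapes and run the fusion
    # pass before graph_for() returns the optimized graph that the assertions inspect.
    def _example_warmup_then_check(self):
        def fn(x):
            return (x + 1.0).relu()

        fn_jit = torch.jit.script(fn)
        x = torch.randn(4, 2, device="cuda")
        for _ in range(5):
            fn_jit(x)  # warm up so the profiling executor specializes and fuses
        # after warm-up the optimized graph should contain the fusion guard node
        self.assertGraphContains(fn_jit.graph_for(x), FUSION_GUARD)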
@unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_pointwise_reference_tensor(self): + def t(input1, input2, scalar): + _unsafe_view = torch.ops.aten._unsafe_view(input1, [2, 4, 16]) + add_ = torch.ops.aten.add_(_unsafe_view, input2) + gelu_ = torch.ops.aten.gelu(add_) + view_ = torch.ops.aten.view(gelu_, [8, 16]) + mul_ = torch.ops.aten.mul(add_, scalar) + return [view_, mul_] + + x = torch.randn(8, 16, device="cuda") + bias = torch.randn(16, device="cuda") + scalar = torch.ones(torch.Size([]), device="cuda") + + t_jit = torch.jit.script(t) + for i in range(3): + jit_o = t_jit(x, bias, scalar) + o = t(x, bias, scalar) + self.assertEqual(jit_o, o) + self.assertGraphContains(t_jit.graph_for(x, bias, scalar), FUSION_GUARD) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + def test_native_batch_norm_backward(self): + grad_output = torch.randn(4, 2, 3, device="cuda") + input = torch.randn(4, 2, 3, device="cuda") + weight = torch.randn(2, device="cuda") + + r_m = torch.randn(2, device="cuda") + r_v = torch.randn(2, device="cuda").abs() + + save_mean = torch.randn(2, device="cuda") + save_invstd = torch.randn(2, device="cuda").abs() + + with nvfuser_singleton_fusion(True): + def t(grad_out, input, weight, r_m, r_v, save_mean, save_invstd, train: bool, eps: float, mask: List[bool]): + return torch.ops.aten.native_batch_norm_backward(grad_out, input, weight, r_m, r_v, save_mean, + save_invstd, train, eps, mask) + + t_jit = torch.jit.script(t) + for i in range(4): + jit_o = t_jit(grad_output, input, weight, r_m.clone(), r_v.clone(), + save_mean, save_invstd, True, 1e-5, [True, True, True]) + + ref_m = r_m.clone() + ref_v = r_v.clone() + jit_o = t_jit(grad_output, input, weight, r_m, r_v, save_mean, save_invstd, True, 1e-5, [True, True, True]) + o = t(grad_output, input, weight, ref_m, ref_v, save_mean, save_invstd, True, 1e-5, [True, True, True]) + for oo, jit_oo in zip(o, jit_o): + self.assertEqual(oo.dtype, jit_oo.dtype) + self.assertEqual(oo, jit_oo) + self.assertEqual(ref_m.dtype, r_m.dtype) + self.assertEqual(ref_m, r_m) + self.assertEqual(ref_v.dtype, r_v.dtype) + self.assertEqual(ref_v, r_v) + self.assertGraphContains(t_jit.graph_for(grad_output, input, weight, r_m.clone(), r_v.clone, save_mean, + save_invstd, True, 1e-5, [True, True, True]), FUSION_GUARD) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_contiguous_on_broadcasted(self): + x = torch.randn(4, 1, device="cuda") + y = torch.randn(4, 128, device="cuda") + + with nvfuser_singleton_fusion(True): + def t(x, y): + t1 = x.expand([4, 128]) + t2 = t1 * y + return t2 + + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x, y) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_skip_parser(self): + x = torch.randn(4, 12, device="cuda") + + with nvfuser_singleton_fusion(True): + def fn(x): + t1 = x + 1.0 + return t1.relu() + + fn_jit = torch.jit.script(fn) + self._run_helper(fn_jit, fn, x) + + # add node should have been merged into fusion + 
self.assertGraphContains(fn_jit.graph_for(x), FUSION_GUARD) + self.assertGraphContainsExactly(fn_jit.graph_for(x), 'aten::add', 0) + + # flips skip parse for `aten::add`, following fusion should skip the + # add node + self.assertFalse(torch._C._jit_set_nvfuser_skip_node_kind("aten::add", True)) + + def fn_1(x): + t1 = x + 2.0 # change const value so we'll not reuse plan + return t1.relu() + + fn_1_jit = torch.jit.script(fn_1) + self._run_helper(fn_1_jit, fn_1, x) + + # add node should have been merged into fusion + self.assertGraphContains(fn_1_jit.graph_for(x), FUSION_GUARD) + self.assertGraphContainsExactly(fn_1_jit.graph_for(x), 'aten::add', 1) + + # flips skip parse for `aten::add`, next fusion should fuse add node + self.assertTrue(torch._C._jit_set_nvfuser_skip_node_kind("aten::add", True)) + + def fn_2(x): + t1 = x + 2.0 # change const value so we'll not reuse plan + return t1.relu() + + fn_2_jit = torch.jit.script(fn_2) + self._run_helper(fn_2_jit, fn_2, x) + + # add node should have been merged into fusion + self.assertGraphContains(fn_2_jit.graph_for(x), FUSION_GUARD) + self.assertGraphContainsExactly(fn_2_jit.graph_for(x), 'aten::add', 0) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_cuda_fusion_guard(self): + old_guard = torch._C._jit_set_nvfuser_guard_mode(True) + + class ConvModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x.sin().sigmoid() + + mod = ConvModule().to(device="cuda") + + inputs = [torch.randn(20, 16, 50, 100, device="cuda", requires_grad=True)] + + def reduce_scalar(temp): + return temp.sum() + + scripted = torch.jit.script(mod) + with torch.no_grad(): + scripted(*inputs) + res = scripted(*inputs) + reduce_scalar(res).backward() + torch._C._jit_set_nvfuser_guard_mode(old_guard) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_nvfuser_comparison_callbacks_with_fallback(self): + try: + fused_result = None + unfused_result = None + graph_ir = None + + def callback(fused_outputs, unfused_outputs, graph_str): + nonlocal unfused_result + nonlocal fused_result + nonlocal graph_ir + unfused_result = unfused_outputs[-1] + fused_result = fused_outputs[-1] + graph_ir = graph_str + torch._C._jit_nvfuser_set_comparison_callback(True, callback) + + def fn(x, y): + z = torch.add(x, y) + return torch.relu(z) + + x = torch.rand((4, 4)).cuda() - 0.5 + y = torch.rand((4, 4)).cuda() - 0.5 + + fn_s = torch.jit.script(fn) + fn_s(x, y) + fn_s(x, y) + fn_s(x, y) + + expected = fn(x, y) + + self.assertEqual(expected, fused_result) + self.assertEqual(expected, unfused_result) + FileCheck().check("aten::add").run(graph_ir) + finally: + torch._C._jit_nvfuser_clear_comparison_callback() + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_nvfuser_comparison_callbacks_without_fallback(self): + try: + fused_result = None + unfused_result = None + graph_ir = None + + def callback(fused_outputs, unfused_outputs, graph_str): + nonlocal unfused_result + nonlocal fused_result + nonlocal graph_ir + if len(unfused_outputs) > 0: + unfused_result = unfused_outputs[-1] + fused_result = fused_outputs[-1] + graph_ir = graph_str + 
torch._C._jit_nvfuser_set_comparison_callback(False, callback) + + def fn(x, y): + z = torch.add(x, y) + return torch.relu(z) + + x = torch.rand((4, 4)).cuda() - 0.5 + y = torch.rand((4, 4)).cuda() - 0.5 + + fn_s = torch.jit.script(fn) + fn_s(x, y) + fn_s(x, y) + fn_s(x, y) + + expected = fn(x, y) + + self.assertEqual(expected, fused_result) + self.assertEqual(None, unfused_result) + FileCheck().check("aten::add").run(graph_ir) + finally: + torch._C._jit_nvfuser_clear_comparison_callback() + + @unittest.skipIf(not RUN_NVFUSER, "requires NVFuser") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_cuda_fusion_guard_backward(self): + old_guard = torch._C._jit_set_nvfuser_guard_mode(True) + + inp = torch.randn(10, device="cuda", requires_grad=True) + grad = torch.randn(10, device="cuda") + + def f(x): + a = x.cos().cos() + return a + scripted = torch.jit.script(f) + + with profile(activities=[ProfilerActivity.CPU]) as prof: + for _ in range(5): + inp.grad = None + out = scripted(inp) + out.backward(grad) + + # check that we do not have fallback triggered + self.assertEqual(prof.events().table().find("fallback"), -1) + torch._C._jit_set_nvfuser_guard_mode(old_guard) + + # TODO: generalize this + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + def test_inf_quick_patch(self): + inputs = [torch.tensor([-float('inf'), float('inf'), 4.0], device="cuda"), + torch.tensor([1.0, float('inf'), 4.0], device="cuda"), + torch.tensor([-float('inf'), -1.5, 4.0], device="cuda"), + torch.tensor([1.0, -3.0, float('nan')], device="cuda"), + torch.tensor([-float('inf'), -float('inf'), -float('inf')], device="cuda"), + torch.tensor([float('inf'), float('inf'), float('inf')], device="cuda"), + torch.tensor([float('nan'), float('nan'), float('nan')], device="cuda")] + + def fn_amax(x): + return x.amax(dim=0) + + def fn_amin(x): + return x.amin(dim=0) + + def fn_add_nan(x): + return x.relu() + float('nan') + + def fn_add(x): + return x + 1.0 + + with nvfuser_singleton_fusion(True): + for t in [fn_amax, fn_amin, fn_add, fn_add_nan]: + for x in inputs: + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_clamp_reversed_bound(self): + x = torch.tensor([1., -float('inf'), 2., float('inf'), float('nan')], device="cuda") + + def t(x): + return x.clamp(min=1., max=0.5) + + with nvfuser_singleton_fusion(True): + jit_t = torch.jit.script(t) + self._run_helper(jit_t, t, x) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_high_rank_fusion(self): + # currently we want to limit fusion to node with input where rank <= 8 + rank_limit = 8 + shapes = [4 for i in range(rank_limit + 1)] + x = torch.randn(shapes, device="cuda") + + with nvfuser_singleton_fusion(True): + def t(x): + return x.relu() + + jit_t = torch.jit.script(t) + for i in range(5): + jit_t(x) + self.assertGraphContainsExactly(jit_t.graph_for(x), FUSION_GUARD, 0) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != 
ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_clamp(self): + x = torch.tensor([1., float('inf'), 2., float('nan'), float('-inf')], device="cuda") + + def clamp_max(x): + return x.clamp(max=1.5) + + def clamp_min_max(x): + return x.clamp(min=1.5) + + def clamp_min(x): + return x.clamp(min=1., max=3.) + + with nvfuser_singleton_fusion(True): + for t in [clamp_max, clamp_min, clamp_min_max]: + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_device_constant(self): + x = torch.randn(4, 2, device="cuda") + + def t(x): + return torch.rand_like(x, device=torch.device(type='cuda')) + + # cpu tensor shouldn't be fused + def t_cpu(x): + return torch.rand_like(x, device=torch.device(type='cpu')) + + with nvfuser_singleton_fusion(True): + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x) + + t_cpu_jit = torch.jit.script(t_cpu) + for i in range(5): + t_cpu_jit(x) + + self.assertGraphContainsExactly(t_cpu_jit.graph_for(x), FUSION_GUARD, 0) + + +class TestPassManagerCudaFuser(JitTestCase): + def setUp(self): + super().setUp() + if RUN_NVFUSER: + self.is_enabled = torch._C._jit_set_nvfuser_enabled(False) + + def tearDown(self): + if RUN_NVFUSER: + torch._C._jit_set_nvfuser_enabled(self.is_enabled) + super().tearDown() + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_context_manager_test(self): + x = torch.randn(4, 8, dtype=torch.float, device="cuda") + y = torch.randn(4, 8, dtype=torch.float, device="cuda") + with torch.jit.fuser('fuser2'): + with torch.jit.fuser('fuser2'): + + def t1(x, y): + o = x + y + o = o + 2.0 + return o + t_jit = torch.jit.script(t1) + t_jit(x, y) + t_jit(x, y) + self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GUARD) + + def t2(x, y): + o = x + y + o = o + 3.0 + return o + t_jit_2 = torch.jit.script(t2) + t_jit_2(x, y) + t_jit_2(x, y) + self.assertGraphContains(t_jit_2.graph_for(x, y), FUSION_GUARD) + + def t3(x, y): + o = x + y + o = o + 4.0 + return o + t_jit_3 = torch.jit.script(t3) + t_jit_3(x, y) + t_jit_3(x, y) self.assertGraphContainsExactly(t_jit_3.graph_for(x, y), FUSION_GUARD, 0) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") def test_register_fuser(self): self.assertFalse(torch._C._jit_set_nvfuser_enabled(True)) self.assertTrue(torch._C._jit_nvfuser_enabled()) @@ -3198,6 +4817,128 @@ def test_register_fuser(self): self.assertTrue(torch._C._jit_set_nvfuser_enabled(False)) self.assertFalse(torch._C._jit_nvfuser_enabled()) + @unittest.skipIf(RUN_CUDA, "Testing on CPU only") + def test_register_fuser_cpu(self): + with self.assertRaises(RuntimeError): + torch._C._jit_set_nvfuser_enabled(True) + torch._C._jit_set_nvfuser_enabled(False) + + @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not TEST_WITH_ROCM, "ROCM test only") + def test_register_fuser_rocm(self): + with self.assertRaises(RuntimeError): + torch._C._jit_set_nvfuser_enabled(True) + torch._C._jit_set_nvfuser_enabled(False) + +# See TestNNCOpInfoParent +class TestCudaFuserOpInfoParent(JitCommonTestCase): + pass + +class TestCudaFuserOpInfo(TestCudaFuserOpInfoParent): + def setUp(self): + super(TestCudaFuserOpInfoParent, self).setUp() + if RUN_NVFUSER: + 
self.cuda_fuser_options = CudaFuserTestOptions() + # enables guard mode since tracing could change graph to violate guard. + torch._C._jit_set_nvfuser_guard_mode(True) + self.nvfuser_single_node_mode = torch._C._jit_set_nvfuser_single_node_mode(True) + + def tearDown(self): + if RUN_NVFUSER: + self.cuda_fuser_options.restore() + + torch._C._jit_set_nvfuser_single_node_mode(self.nvfuser_single_node_mode) + + super(TestCudaFuserOpInfoParent, self).tearDown() + + @slowTest + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @ops(op_db, dtypes=OpDTypes.supported) + def test_nvfuser_correctness(self, device, dtype, op): + variant_sample_pairs = get_traced_sample_variant_pairs(device, dtype, op) + + for variant, sample in variant_sample_pairs: + trace = create_traced_fn(self, variant, cache_traced_fn=True) + ref = variant(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) + + trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) + + val = trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) + + self.assertEqual(ref, val, exact_layout=True) + + # Note: Clearing CU after NVFuser tests + # https://github.com/pytorch/pytorch/issues/35600 + # each torch.jit.trace adds state to the _python_cu compilation unit + # since this test traces a lot of functions, out-of-memory can occur + # if the CU is not cleared. + torch.jit._state._python_cu.drop_all_functions() + + @slowTest + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + @ops(op_db, allowed_dtypes=(torch.float16, torch.bfloat16, torch.float32, + torch.float64, torch.complex64, torch.complex128)) + def test_nvfuser_extremal_values(self, device, dtype, op): + variant_sample_pairs = get_traced_sample_variant_pairs(device, dtype, op) + + def _get_extremal_tensor(x, val, dtype): + if x.dtype != dtype: + return x + return torch.full_like(x, val) + + def _get_extremal_input(x, val, dtype): + if isinstance(x, torch.Tensor): + return _get_extremal_tensor(x, val, dtype) + elif is_iterable_of_tensors(x): + return [_get_extremal_tensor(y, val, dtype) for y in x] + return x + + def _get_extremal_sample(sample: SampleInput, val, dtype): + extremal_sample = SampleInput( + input=_get_extremal_input(sample.input, val, dtype), + args=[_get_extremal_input(x, val, dtype) for x in sample.args], + kwargs={k: _get_extremal_input(v, val, dtype) for k, v in sample.kwargs.items()}, + ) + return extremal_sample + + def _get_extremal_samples(sample: SampleInput, dtype): + vals = [float('inf'), float('-inf'), float('nan')] + if dtype.is_complex: + complex_vals = itertools.product(vals, vals) + vals = list(map(lambda x: complex(*x), complex_vals)) + for val in vals: + yield _get_extremal_sample(sample, val, dtype) + + variant_sample_pairs = get_traced_sample_variant_pairs(device, dtype, op) + + for variant, sample in variant_sample_pairs: + + trace = create_traced_fn(self, variant, cache_traced_fn=True) + trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) + trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) + + for extremal_sample in _get_extremal_samples(sample, dtype): + try: + with freeze_rng_state(): + ref = variant(*clone_inputs((extremal_sample.input, *extremal_sample.args)), + **extremal_sample.kwargs) + except (torch._C._LinAlgError, RuntimeError, ValueError): + # if eager errors out, then don't expect NVFuser to pass + continue + + with freeze_rng_state(): + val = 
trace(*clone_inputs((extremal_sample.input, *extremal_sample.args)), + **extremal_sample.kwargs) + + self.assertEqual(val, ref, equal_nan=True, exact_device=True) + + # See [Note: Clearing CU after NVFuser tests] + torch.jit._state._python_cu.drop_all_functions() + +instantiate_device_type_tests(TestCudaFuserOpInfo, globals(), only_for=("cuda")) + if __name__ == '__main__': run_tests() diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py index a548a8df4c8c..cb14fe573358 100644 --- a/test/test_jit_fuser_te.py +++ b/test/test_jit_fuser_te.py @@ -18,7 +18,7 @@ # inferred erroneously runs or skips # some tests torch._C._jit_set_profiling_executor(True) -torch._C._jit_set_profiling_mode(True) +torch._C._get_graph_executor_optimize(True) from torch.testing._internal.common_utils import run_tests, ProfilingMode, GRAPH_EXECUTOR, \ enable_profiling_mode_for_profiling_tests, slowTest @@ -82,6 +82,7 @@ def inline_fusion_groups(): class TestTEFuser(JitTestCase): def setUp(self): + super().setUp() self.tensorexpr_options = TensorExprTestOptions() # note: `self.dynamic_shapes` instatiated in specialization of class @@ -109,6 +110,7 @@ def setUp(self): def tearDown(self): self.tensorexpr_options.restore() torch._C._jit_set_fusion_strategy(self.old_fusion_strategy) + super().tearDown() def assertAllFused(self, graph, except_for=None): except_for = except_for if except_for is not None else set() @@ -1321,82 +1323,114 @@ def test_isnan(self): " ".join(["Failed:", str(dtype), 'isnan', device]) ) - def test_unary_ops(self): + def test_gelu(self): def apply(fn): - return lambda x: fn(x) + return lambda x, approximate: fn(x, approximate) unary_ops = [ - torch.lgamma, - torch.sigmoid, - torch.reciprocal, - torch.neg, - torch.relu, - F.relu6, - torch.log, - torch.log10, - torch.log1p, - torch.log2, - torch.exp, - torch.expm1, - torch.erf, - torch.erfc, - torch.cos, - torch.sin, - torch.tan, - torch.acos, - torch.asin, - torch.cosh, - torch.sinh, - torch.atan, - torch.tanh, - F.hardtanh, - F.hardsigmoid, - F.hardswish, - F.softplus, - torch.sqrt, - torch.rsqrt, F.gelu, - torch.abs, - torch.ceil, - torch.floor, - torch.round, - torch.trunc, - torch.frac, - # TODO: broken on ROCm? - # F.hardshrink, - F.leaky_relu, - lambda x: torch.threshold(x, 0, -10), - lambda x: torch.clamp(x, -10, 10), ] - gpu_only = {torch.erf, torch.erfc} sizes = [(1,), (2,), (4, 4)] for dtype, op, device, size in product(self.dtypes, unary_ops, self.devices, sizes): # TODO: Add back when https://github.com/pytorch/pytorch/issues/55905 is closed if dtype in [torch.float16, torch.bfloat16] and device == "cpu": continue - # todo - re-enable. fails with .500 - if dtype == torch.bfloat16 and op == torch.round: - continue - if op in gpu_only and device == "cpu": - continue try: x = self.data_for(dtype, device, size=size) + cond = self.data_for(torch.bool, device) fn = apply(op) - ref = fn(x) + ref = fn(x, cond) except Exception: # If eager mode doesn't support a dtype/op/device combo, # neither does the fuser. Catch everything to avoid needing to # guess what errors might be thrown by eager. 
continue try: - t = torch.jit.trace(fn, (x,)) - torch.testing.assert_close(ref, t(x)) - self.assertAllFused(t.graph_for(x)) + t = torch.jit.trace(fn, (x, cond)) + torch.testing.assert_close(ref, t(x, cond)) + self.assertAllFused(t.graph_for(x, cond)) except Exception as e: raise RuntimeError( " ".join(["Failed:", str(dtype), op.__name__, device, str(size)]) ) + def test_unary_ops(self): + with torch._jit_internal._disable_emit_hooks(): + def apply(fn): + return lambda x: fn(x) + + unary_ops = [ + torch.lgamma, + torch.sigmoid, + torch.reciprocal, + torch.neg, + torch.relu, + F.relu6, + torch.log, + torch.log10, + torch.log1p, + torch.log2, + torch.exp, + torch.expm1, + torch.erf, + torch.erfc, + torch.cos, + torch.sin, + torch.tan, + torch.acos, + torch.asin, + torch.cosh, + torch.sinh, + torch.atan, + torch.tanh, + F.hardtanh, + F.hardsigmoid, + F.hardswish, + F.softplus, + torch.sqrt, + torch.rsqrt, + torch.abs, + torch.ceil, + torch.floor, + torch.round, + torch.trunc, + torch.frac, + # TODO: broken on ROCm? + # F.hardshrink, + F.leaky_relu, + lambda x: torch.threshold(x, 0, -10), + # TODO: broken since type promotion was added + # lambda x: torch.clamp(x, -10, 10), + ] + gpu_only = {torch.erf, torch.erfc} + sizes = [(1,), (2,), (4, 4)] + for dtype, op, device, size in product(self.dtypes, unary_ops, self.devices, sizes): + # TODO: Add back when https://github.com/pytorch/pytorch/issues/55905 is closed + if dtype in [torch.float16, torch.bfloat16] and device == "cpu": + continue + # todo - re-enable. fails with .500 + if dtype == torch.bfloat16 and op == torch.round: + continue + if op in gpu_only and device == "cpu": + continue + try: + x = self.data_for(dtype, device, size=size) + fn = apply(op) + ref = fn(x) + except Exception: + # If eager mode doesn't support a dtype/op/device combo, + # neither does the fuser. Catch everything to avoid needing to + # guess what errors might be thrown by eager. + continue + try: + t = torch.jit.trace(fn, (x,)) + torch.testing.assert_close(ref, t(x)) + self.assertAllFused(t.graph_for(x)) + except Exception as e: + raise RuntimeError( + " ".join(["Failed:", str(dtype), op.__name__, device, str(size)]) + ) + def test_binary_ops(self): def apply(fn): return lambda x, y: fn(x, y) @@ -1562,47 +1596,48 @@ def fn(x, y): ) def test_binary_tensor_scalar_ops(self): - def apply_with_scalar(fn, scalar): - return lambda x: fn(x, scalar) - - # FIXME: Fails in IR Eval: torch.int64 and_ cpu - binary_ops = [ - operator.__and__, - operator.__or__, - operator.__xor__, - torch.add, - torch.sub, - torch.mul, - torch.eq, - torch.ne, - torch.ge, - torch.lt, - torch.gt, - ] - devices = self.devices - # Maybe we should split this into separate tests to speed it up by - # only using scalar values relevant to particular ops - scalars = [1.5, 3, 0, -2.0, -1] - for dtype, op, device, scalar in product(self.dtypes, binary_ops, devices, scalars): - if dtype in [torch.float16, torch.bfloat16] and device == "cpu": - continue - try: - x = self.data_for(dtype, device) - fn = apply_with_scalar(op, scalar) - ref = fn(x) - except Exception: - # If eager mode doesn't support a dtype/op/device combo, - # neither does the fuser. Catch everything to avoid needing to - # guess what errors might be thrown by eager. 
- continue - try: - t = torch.jit.trace(fn, (x)) - self.assertEqual(ref, t(x)) - self.assertAllFused(t.graph_for(x)) - except Exception as e: - raise RuntimeError( - " ".join(["Failed:", str(dtype), op.__name__, device]) - ) + with torch._jit_internal._disable_emit_hooks(): + def apply_with_scalar(fn, scalar): + return lambda x: fn(x, scalar) + + # FIXME: Fails in IR Eval: torch.int64 and_ cpu + binary_ops = [ + operator.__and__, + operator.__or__, + operator.__xor__, + torch.add, + torch.sub, + torch.mul, + torch.eq, + torch.ne, + torch.ge, + torch.lt, + torch.gt, + ] + devices = self.devices + # Maybe we should split this into separate tests to speed it up by + # only using scalar values relevant to particular ops + scalars = [1.5, 3, 0, -2.0, -1] + for dtype, op, device, scalar in product(self.dtypes, binary_ops, devices, scalars): + if dtype in [torch.float16, torch.bfloat16] and device == "cpu": + continue + try: + x = self.data_for(dtype, device) + fn = apply_with_scalar(op, scalar) + ref = fn(x) + except Exception: + # If eager mode doesn't support a dtype/op/device combo, + # neither does the fuser. Catch everything to avoid needing to + # guess what errors might be thrown by eager. + continue + try: + t = torch.jit.trace(fn, (x)) + self.assertEqual(ref, t(x)) + self.assertAllFused(t.graph_for(x)) + except Exception as e: + raise RuntimeError( + " ".join(["Failed:", str(dtype), op.__name__, device]) + ) def test_binary_div_ops(self): def apply_with_scalar(fn, scalar): @@ -2307,6 +2342,59 @@ def f(x): scr(x) self.assertLastGraphAllFused() + def test_with_strict_fusion(self): + + def success(x): + with torch.jit.strict_fusion(): + return x + x + x + + scripted = self.checkScript(success, (torch.rand([4]),)) + g = torch.jit.last_executed_optimized_graph() + FileCheck().check_not("aten::add").check("prim::TensorExprGroup").run(g) + + def foo(x): + with torch.jit.strict_fusion(): + return x + x + torch.rand([4]) + 3 + + with self.assertRaises(Exception) as error_out: + foo_s = torch.jit.script(foo) + foo_s(torch.rand([4])) + foo_s(torch.rand([4])) + print(torch.jit.last_executed_optimized_graph()) + fc = FileCheck().check("Found unfused operators") + fc.check("aten::rand(int[] size") + fc.check("torch.rand([4]").run(str(error_out.exception)) + + with warnings.catch_warnings(record=True) as warns: + foo(torch.rand([4])) + + FileCheck().check("Only works in script mode").run(str(warns[0])) + + def test_autodiff(x): + with torch.jit.strict_fusion(): + return torch.rand([4]) + x + x + x + + foo_s = torch.jit.script(test_autodiff) + inp = torch.rand([4], requires_grad=True) + with self.assertRaises(Exception) as error_out: + for _ in range(3): + foo_s(inp) + f = FileCheck().check("unfused operators").check("aten::rand") + f.run(str(error_out.exception)) + + def test_separate_fusions(x, y): + with torch.jit.strict_fusion(): + return x + x + x, y + y + y + + inp = torch.rand([4], requires_grad=True) + with self.assertRaises(Exception) as error_out: + for _ in range(3): + foo_s = torch.jit.script(test_separate_fusions) + foo_s(inp, inp) + + f = FileCheck().check("Found multiple fusions") + f.run(str(error_out.exception)) + class TestTEFuserStatic(TestTEFuser): dynamic_shapes = False @@ -2367,7 +2455,6 @@ class TestTEFuserDynamic(TestTEFuser): 'mul', 'ne', 'neg', - 'nn.functional.gelu', 'nn.functional.hardshrink', 'nn.functional.hardsigmoid', 'nn.functional.hardswish', @@ -2444,12 +2531,21 @@ def get_name(op): l.append(op.variant_test_name) return '.'.join(l) -class 
TestNNCOpInfo(JitCommonTestCase): +# Purpose of this class is to allow super() calls. +# super() [with no arguments] fails, presumably because of how instantiate_device_type_tests works. +# super(TestNNCOpInfo, self) fails because TestNNCOpInfo gets deleted from global scope. +# super(JitCommonTestCase, self).fn() would skip JitCommonTestCase.fn() implementation +class TestNNCOpInfoParent(JitCommonTestCase): + pass + +class TestNNCOpInfo(TestNNCOpInfoParent): def setUp(self): + super(TestNNCOpInfoParent, self).setUp() self.tensorexpr_options = TensorExprTestOptions() def tearDown(self): self.tensorexpr_options.restore() + super(TestNNCOpInfoParent, self).tearDown() def te_compile(self, device, dtype, op): if op.name in skip_ops: @@ -2531,7 +2627,7 @@ def test_nnc_correctness(self, device, dtype, op): variant_sample_pairs = get_traced_sample_variant_pairs(device, dtype, op) for variant, sample in variant_sample_pairs: - trace = create_traced_fn(self, variant) + trace = create_traced_fn(self, variant, cache_traced_fn=True) ref = variant(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) @@ -2549,9 +2645,13 @@ def test_nnc_correctness(self, device, dtype, op): only_for = ("cpu", "cuda") instantiate_device_type_tests(TestNNCOpInfo, globals(), only_for=only_for) +# Purpose of this class is to allow super() calls. (See TestNNCOpInfoParent) +class TestLoopnestRandomizationParent(JitTestCase): + pass -class TestLoopnestRandomization(JitTestCase): +class TestLoopnestRandomization(TestLoopnestRandomizationParent): def setUp(self): + super(TestLoopnestRandomizationParent, self).setUp() self.old_cpu_fuser_state = torch._C._jit_can_fuse_on_cpu() self.old_must_use_cpu_state = torch._C._jit_get_te_must_use_llvm_cpu() self.old_gpu_fuser_state = torch._C._jit_can_fuse_on_gpu() @@ -2562,7 +2662,7 @@ def setUp(self): torch._C._jit_override_can_fuse_on_gpu(True) self.old_profiling_executor = torch._C._jit_set_profiling_executor(True) - self.old_profiling_mode = torch._C._jit_set_profiling_mode(True) + self.old_profiling_mode = torch._C._get_graph_executor_optimize(True) self.old_fusion_inlining = torch._C._debug_get_fusion_group_inlining() torch._C._debug_set_fusion_group_inlining(False) @@ -2579,7 +2679,7 @@ def setUp(self): def tearDown(self): torch._C._jit_set_profiling_executor(self.old_profiling_executor) - torch._C._jit_set_profiling_mode(self.old_profiling_mode) + torch._C._get_graph_executor_optimize(self.old_profiling_mode) torch._C._jit_override_can_fuse_on_gpu(self.old_gpu_fuser_state) torch._C._jit_override_can_fuse_on_cpu(self.old_cpu_fuser_state) @@ -2591,6 +2691,7 @@ def tearDown(self): # Set it back to 0. 
os.environ["PYTORCH_TENSOREXPR_RANDOM_TRANSFORM_SEED"] = "0" + super(TestLoopnestRandomizationParent, self).tearDown() @onlyCPU @unittest.skipIf(not LLVM_ENABLED, "Compiles with TensorExprKernel") diff --git a/test/test_jit_llga_fuser.py b/test/test_jit_llga_fuser.py new file mode 100644 index 000000000000..1e79b745d2c1 --- /dev/null +++ b/test/test_jit_llga_fuser.py @@ -0,0 +1,519 @@ +# Owner(s): ["module: mkldnn"] +import torch +import unittest +import itertools + +import torch.nn as nn +import torch.nn.functional as F +from torch.testing._internal.jit_utils import JitTestCase +from torch.testing._internal.common_utils import run_tests, TEST_SCIPY, IS_WINDOWS, IS_MACOS + +LLGA_FUSION_GROUP = 'prim::oneDNNFusionGroup' +LLGA_NOT_ENABLED = not torch._C.has_mkldnn or IS_WINDOWS or IS_MACOS + + +def warmup_forward(f, *args, profiling_count=2): + for i in range(profiling_count): + results = f(*args) + + return results + + +class JitLlgaTestCase(JitTestCase): + def setUp(self): + torch.jit.enable_onednn_fusion(True) + + def tearDown(self): + torch.jit.enable_onednn_fusion(False) + + def checkTrace(self, m, x, *args, **kwargs): + if isinstance(m, torch.nn.Module): + m.eval() + with torch.no_grad(), \ + torch._jit_internal._disable_emit_hooks(): + traced = torch.jit.trace(m, x) + if isinstance(m, torch.nn.Module): + traced = torch.jit.freeze(traced) + warmup_forward(traced, *x) + fwd_graph = traced.graph_for(*x) + + ref_o = m(*x) + jit_o = traced(*x) + self.assertEqual(jit_o, ref_o) + return traced, fwd_graph + + def assertFused(self, graph, fused_patterns): + for pat in fused_patterns: + self.assertGraphContainsExactly(graph, pat, 0) + + +try: + import torchvision + HAS_TORCHVISION = True +except ImportError: + HAS_TORCHVISION = False +except RuntimeError: + HAS_TORCHVISION = False +skipIfNoTorchVision = unittest.skipIf(not HAS_TORCHVISION, 'no torchvision') + +def get_eltwise_fn(name): + if hasattr(torch, name): + return getattr(torch, name) + elif hasattr(F, name): + return getattr(F, name) + else: + raise NameError('Eltwise function %s not found' % name) + + +@unittest.skipIf(LLGA_NOT_ENABLED, "MKL-DNN build is disabled") +class TestOp(JitLlgaTestCase): + def test_conv2d(self): + for [spatial, in_channels, out_channels, kernel, padding, stride, dilation, g, bias] in itertools.product( + [7, 8], + [8, 15], + [7, 16], + [3, 4], + [0, 2], + [1, 2], + [1, 2], + [1, 2], + [True, False]): + + m = nn.Conv2d(in_channels=in_channels * g, + out_channels=out_channels * g, + kernel_size=kernel, + padding=padding, + stride=stride, + dilation=dilation, + groups=g, + bias=bias) + + x = torch.rand(1, in_channels * g, spatial, spatial) + _, graph = self.checkTrace(m, [x]) + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1) + + def test_bn2d(self): + m = nn.BatchNorm2d(32).eval() + x = torch.rand(1, 32, 28, 28) + _, graph = self.checkTrace(m, [x]) + # single-op partition shouldn't be created for softmax + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 0) + + def test_eltwise(self): + class M(nn.Module): + def __init__(self, eltwise_fn): + super(M, self).__init__() + self.eltwise = eltwise_fn + + def forward(self, x): + return self.eltwise(x) + + for eltwise in ['relu', 'gelu']: + eltwise_fn = get_eltwise_fn(eltwise) + m = M(eltwise_fn) + x = torch.rand(1, 32, 28, 28) + _, graph = self.checkTrace(m, [x]) + # single-op partition shouldn't be created. 
+ self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 0) + + def test_max_pool2d(self): + for [spatial, kernel, padding, stride, dilation, ceil_mode] in itertools.product( + [15, 16, 17, 18, 19], + [4, 5], + [0, 1, 2], + [1, 2], # [1, 2, 4], TODO: fix issue in pad calculation + [1], # [1, 2], TODO: backend support for dilation + [True, False]): + + m = nn.MaxPool2d(kernel_size=kernel, + stride=stride, + padding=padding, + dilation=dilation, + ceil_mode=ceil_mode) + + x = torch.rand(1, 4, spatial, spatial) + _, graph = self.checkTrace(m, [x]) + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1) + + def test_avg_pool2d(self): + for [spatial, kernel, padding, stride, ceil_mode, count_include_pad] in itertools.product( + [15, 16, 17, 18, 19], + [4, 5], + [0, 1, 2], + [1, 2, 4], + [False], # TODO: oneDNN Graph does not fully support ceil_mode=True + [True, False]): + + m = nn.AvgPool2d(kernel_size=kernel, + stride=stride, + padding=padding, + ceil_mode=ceil_mode, + count_include_pad=count_include_pad) + + x = torch.rand(1, 4, spatial, spatial) + _, graph = self.checkTrace(m, [x]) + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1) + + def test_variable_kernel_avg_pool2d(self): + class M(nn.Module): + def __init__(self): + super(M, self).__init__() + + def forward(self, x): + x = F.avg_pool2d(x, kernel_size=(x.size(2), x.size(3)), padding=0, count_include_pad=False) + return x + + x = torch.randn(1, 1000, 1, 1) + m = M() + _, graph = self.checkTrace(m, [x]) + # kernel_size is not Constant, shouldn't have any LLGA_FUSION_GROUP + # TODO: with shape specialization, should have 1 LLGA_FUSION_GROUP + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 0) + + def test_softmax(self): + for dim in [-4, -3, -2, -1, 0, 1, 2, 3]: + m = nn.Softmax(dim=dim) + x = torch.rand(8, 12, 12, 12) + _, graph = self.checkTrace(m, [x]) + # single-op partition shouldn't be created for softmax + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 0) + + def test_linear(self): + for bias in [True, False]: + x = torch.rand(32, 28) + m = torch.nn.Linear(in_features=28, out_features=64, bias=bias) + _, graph = self.checkTrace(m, [x]) + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1) + self.assertFused(graph, ['aten::linear']) + + def _gen_binary_inputs(self, gen_permute=True): + for xshape, yshape in [ + [[1, 32, 28, 28], [1, 32, 28, 28]], + [[1, 32, 28, 28], [1, 1, 28, 28]], + [[1, 32, 28, 28], [28]], + [[1, 32, 28, 28], [1]], + + ]: + yield torch.rand(xshape), torch.rand(yshape) + if gen_permute and xshape != yshape: + yield torch.rand(yshape), torch.rand(xshape) + + def test_add(self): + def forward_add(x, y): + return torch.add(x, y, alpha=2) + + for x, y in self._gen_binary_inputs(): + _, graph = self.checkTrace(forward_add, [x, y]) + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1) + + def test_add_scalar(self): + def add_scalar(x): + return 42 + x + 3.14 + + x = torch.rand(32, 32) + _, graph = self.checkTrace(add_scalar, [x]) + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1) + + def test_addmm(self): + def addmm(x, y, z): + # alpha and beta are 1, by default + return torch.addmm(z, x, y) + + x = torch.rand(64, 32) + y = torch.rand(32, 32) + z = torch.rand(64, 32) + _, graph = self.checkTrace(addmm, [x, y, z]) + # single-op partition should be created for matmul with bias. 
+ self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1) + + def test_mul(self): + def forward_mul(x, y): + return torch.mul(x, y) * 3 + + for x, y in self._gen_binary_inputs(): + _, graph = self.checkTrace(forward_mul, [x, y]) + # single-op partitions shouldn't be created + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1) + + def test_identity_binary(self): + def forward(x): + return x * 1 + 0.0 + + x = torch.rand(32) + _, graph = self.checkTrace(forward, [x]) + self.assertFused(graph, ['aten::add', 'aten::mul']) + + def test_layer_norm(self): + # TODO: support more normalized_shape + m = torch.nn.LayerNorm(10) + x = torch.randn(2, 5, 10, 10) + _, graph = self.checkTrace(m, [x]) + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1) + + def test_cat(self): + def cat_along_dim(d): + def forward_cat(*inputs): + return torch.cat(inputs, d) + return forward_cat + + for xshape in [ + [8, 8, 8, 8], + [64, 8, 32], + [2048, 64], + ]: + for d in range(len(xshape)): + x = torch.rand(xshape) + _, graph = self.checkTrace(cat_along_dim(d), [x, x, x]) + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1) + + def test_typecheck(self): + x = torch.rand(32, 28) + m = torch.nn.Linear(in_features=28, out_features=64, bias=True) + traced, graph = self.checkTrace(m, [x]) + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1) + self.assertFused(graph, ['aten::linear']) + # change the shape of the input, we should enter fallback graph + x = torch.rand(5, 28) + self.assertEqual(m(x), traced(x)) + + +@unittest.skipIf(LLGA_NOT_ENABLED, "MKL-DNN build is disabled") +class TestFusionPattern(JitLlgaTestCase): + def test_conv2d_eltwise(self): + class M(nn.Module): + def __init__(self, eltwise_fn): + super(M, self).__init__() + self.conv1 = nn.Conv2d(32, 32, 3, padding=1, bias=True) + self.conv2 = nn.Conv2d(32, 32, 3, padding=1, bias=False) + self.eltwise = eltwise_fn + + def forward(self, x): + x = self.conv1(x) + x = self.eltwise(x) + x = self.conv2(x) + x = self.eltwise(x) + return x + + # for eltwise in ['relu', 'sigmoid', 'sqrt', 'abs', 'square', 'hardtanh']: + for eltwise in ['relu']: + for inplace in [True, False]: + eltwise_fn_name = eltwise + '_' if inplace else eltwise + eltwise_fn = get_eltwise_fn(eltwise_fn_name) + + m = M(eltwise_fn) + x = torch.rand(1, 32, 28, 28) + _, graph = self.checkTrace(m, [x]) + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 2) + # test if relu_ is replace with relu by mutation removal pass + self.assertFused(graph, ['aten::' + eltwise_fn_name]) + # test if relu is fused into the fusion group + self.assertFused(graph, ['aten::' + eltwise]) + + def test_conv2d_bn(self): + class M(nn.Module): + def __init__(self): + super(M, self).__init__() + self.conv1 = nn.Conv2d(32, 32, 3, padding=1, bias=True) + self.bn1 = nn.BatchNorm2d(32) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + return x + + m = M().eval() + x = torch.rand(1, 32, 28, 28) + _, graph = self.checkTrace(m, [x]) + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1) + self.assertFused(graph, ['aten::_convolution', 'aten::batch_norm']) + + + def test_conv2d_bn_relu(self): + class M(nn.Module): + def __init__(self): + super(M, self).__init__() + self.conv1 = nn.Conv2d(32, 32, 3, padding=1, bias=True) + self.bn1 = nn.BatchNorm2d(32) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = F.relu(x) + return x + + m = M().eval() + x = torch.rand(1, 32, 28, 28) + _, graph = self.checkTrace(m, [x]) + self.assertGraphContainsExactly(graph, 
LLGA_FUSION_GROUP, 1) + self.assertFused(graph, ['aten::_convolution', 'aten::batch_norm', + 'aten::relu']) + + def test_bn2d_eltwise(self): + class M(nn.Module): + def __init__(self, eltwise_fn): + super(M, self).__init__() + self.eltwise = eltwise_fn + self.bn = nn.BatchNorm2d(32) + + def forward(self, x): + x = self.bn(x) + x = self.eltwise(x) + return x + + for eltwise in ['relu']: + eltwise_fn = get_eltwise_fn(eltwise) + m = M(eltwise_fn).eval() + x = torch.rand(1, 32, 28, 28) + _, graph = self.checkTrace(m, [x]) + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1) + self.assertFused(graph, ['aten::' + eltwise]) + + def test_linear_eltwise(self): + class M(nn.Module): + def __init__(self, eltwise_fn, bias): + super(M, self).__init__() + self.linear = nn.Linear(28, 64, bias) + self.eltwise = eltwise_fn + + def forward(self, x): + x = self.linear(x) + x = self.eltwise(x) + return x + + for [has_bias, eltwise] in itertools.product( + [True, False], + ['relu', 'gelu', 'sigmoid', 'hardtanh', 'relu6', 'elu']): + + eltwise_fn = get_eltwise_fn(eltwise) + m = M(eltwise_fn, has_bias) + x = torch.rand(32, 28, requires_grad=False) + _, graph = self.checkTrace(m, [x]) + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1) + self.assertFused(graph, ['aten::' + eltwise]) + + def test_conv2d_sum(self): + class M(nn.Module): + def __init__(self, bias=False): + super(M, self).__init__() + self.conv1 = nn.Conv2d(32, 32, 3, padding=1, bias=bias) + self.bn1 = nn.BatchNorm2d(32) + self.conv2 = nn.Conv2d(32, 32, 3, padding=1, bias=bias) + self.bn2 = nn.BatchNorm2d(32) + self.relu = nn.ReLU() + self.conv3 = nn.Conv2d(32, 32, 3, padding=1, bias=bias) + self.bn3 = nn.BatchNorm2d(32) + + def forward(self, x, y): + x = self.conv1(x) + x = self.bn1(x) + y = self.conv2(y) + y = self.bn2(y) + z = self.relu(x + y) + z = self.conv3(z) + z = self.bn3(z) + return z + + for bias in [True, False]: + m = M(bias).eval() + x = torch.rand(1, 32, 16, 16, requires_grad=False) + y = torch.rand(1, 32, 16, 16, requires_grad=False) + _, graph = self.checkTrace(m, [x, y]) + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 3) + + def test_wildcard(self): + class M(nn.Module): + def __init__(self): + super(M, self).__init__() + self.conv1 = nn.Conv2d(32, 32, 3, padding=1, bias=True) + self.eltwise = nn.ReLU() + + def forward(self, x): + x = self.conv1(x) + y = self.eltwise(x) + return [x, y] + + # The pattern is as the following: + # conv + # | \ + # eltwise \ + # | \ + # ListConstruct + # + # The output of conv is used by a wildcard op: ListConstruct. + # Thus conv-eltwise cannot be selected into the same Partition. 
+ m = M() + x = torch.rand(1, 32, 28, 28) + _, graph = self.checkTrace(m, [x]) + # conv can exist in a single-op oneDNN Graph partition but not relu + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1) + self.assertFused(graph, ['aten::_convolution']) + + def test_rewrap_tensor_input_to_pytorch(self): + class M(nn.Module): + def __init__(self, eltwise_fn, data_type): + super(M, self).__init__() + self.conv1 = nn.Conv2d(32, 32, 3, padding=1, bias=True, dtype=data_type) + self.conv2 = nn.Conv2d(32, 32, 3, padding=1, bias=True, dtype=data_type) + self.eltwise = eltwise_fn + self.adaptive_avg_pool_2d = nn.AdaptiveAvgPool2d((5, 7)) + + def forward(self, x, y): + x = self.conv1(x) + x = self.eltwise(x) + x = self.conv2(x) + x = self.eltwise(x) + x = torch.add(x, y) + x = self.adaptive_avg_pool_2d(x) + return x + + eltwise_fn_name = 'relu' + eltwise_fn = get_eltwise_fn(eltwise_fn_name) + # Add bfloat16 later + for data_type in [torch.float]: + m = M(eltwise_fn, data_type) + m = m.to(memory_format=torch.channels_last) + x = torch.rand(1, 32, 28, 28, dtype=data_type).to(memory_format=torch.channels_last) + y = torch.rand(1, 32, 28, 28, dtype=data_type).to(memory_format=torch.channels_last) + # Simply test if the output is accurate + # The output of the second partition is input to adaptive_avg_pool2d, which is + # unsupported by LLGA, so it must be handled by PyTorch, which should receive + # correct strides info of the channels-last tensor. + graph, _ = self.checkTrace(m, [x, y]) + + +@unittest.skipIf(LLGA_NOT_ENABLED, "MKL-DNN build is disabled") +class TestModel(JitLlgaTestCase): + @skipIfNoTorchVision + def _test_vision(self, model_name): + m = getattr(torchvision.models, model_name)().eval() + x = torch.rand(1, 3, 224, 224) / 10 + _, graph = self.checkTrace(m, [x]) + self.assertFused(graph, ['aten::_convolution', 'aten::batch_norm', + 'aten::relu', 'aten::linear', + 'aten::avg_pool2d', 'aten::max_pool2d']) + + +for model_name, enabled in [ + ['resnet50', True], + ['resnext50_32x4d', True], + ['resnext101_32x8d', True], + ['densenet121', True], + ['googlenet', TEST_SCIPY], + ['mobilenet_v2', True], + ['mnasnet1_0', True], + ['squeezenet1_0', True], + ['vgg16', True], + ['alexnet', True], + ['shufflenet_v2_x1_0', True], + ['wide_resnet50_2', True], +]: + def wrapper(mname): + @unittest.skipIf(not enabled, 'Disabled') + def test(self): + return self._test_vision(mname) + return test + + setattr(TestModel, 'test_vision_%s' % model_name, wrapper(model_name)) + +if __name__ == '__main__': + run_tests() diff --git a/test/test_jiterator.py b/test/test_jiterator.py new file mode 100644 index 000000000000..a92998672ffb --- /dev/null +++ b/test/test_jiterator.py @@ -0,0 +1,132 @@ +# Owner(s): ["module: cuda"] + +import torch +from torch.cuda.jiterator import _create_jit_fn as create_jit_fn +import sys +from itertools import product +from torch.testing._internal.common_utils import TestCase, parametrize, run_tests, TEST_CUDA +from torch.testing._internal.common_dtype import all_types_and_complex_and +from torch.testing._internal.common_device_type import ( + skipCUDAIfRocm, skipCUDAIf, instantiate_device_type_tests, dtypes, toleranceOverride, tol) +from torch.testing._internal.common_cuda import _get_torch_cuda_version + +if not TEST_CUDA: + print('CUDA not available, skipping tests', file=sys.stderr) + TestCase = object # noqa: F811 + + +code_string = "template T my_fused_kernel(T x, T y, T alpha, T beta) { return alpha * x + beta * y; }" +jitted_fn = create_jit_fn(code_string, alpha=1, beta=1) 
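# A minimal usage sketch mirroring the tests below (the helper name is hypothetical and
# assumes a CUDA device is available): the code string handed to create_jit_fn is a C++
# template (e.g. "template <typename T> T my_fused_kernel(T x, T y, T alpha, T beta) { ... }"),
# and the keyword arguments supplied at creation time (alpha=1, beta=1) become defaults
# that individual calls may override.
def _example_jiterator_usage():
    a = torch.rand(3, device='cuda')
    b = torch.rand(3, device='cuda')
    default_out = jitted_fn(a, b)                       # uses the defaults alpha=1, beta=1
    custom_out = jitted_fn(a, b, alpha=2.0, beta=-1.0)  # per-call override of the extra args
    return default_out, custom_out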
+ +def ref_fn(x, y, alpha=1, beta=1): + return alpha * x + beta * y + +class TestPythonJiterator(TestCase): + @skipCUDAIfRocm + @parametrize("shape_strides", [ + (([3, 3], [3, 1]), ([3, 3], [3, 1])), # contiguous + ]) + @dtypes(*product(all_types_and_complex_and(torch.half, torch.bfloat16), + all_types_and_complex_and(torch.half, torch.bfloat16))) + def test_all_dtype_contiguous(self, device, dtypes, shape_strides): + a_buffer = torch.rand(9, device=device).mul(10).type(dtypes[0]) + b_buffer = torch.rand(9, device=device).mul(10).type(dtypes[1]) + + a = a_buffer.as_strided(*shape_strides[0]) + b = b_buffer.as_strided(*shape_strides[1]) + + expected = ref_fn(a, b) + result = jitted_fn(a, b) + + self.assertEqual(expected, result) + + @skipCUDAIfRocm + # See https://github.com/pytorch/pytorch/pull/76394#issuecomment-1118018287 for details + @skipCUDAIf(_get_torch_cuda_version() < (11, 6), "On cuda 11.3, nvrtcCompileProgram is taking too long to " + "compile jiterator generated kernels for non-contiguous input that requires dynamic-casting.") + @parametrize("shape_strides", [ + (([3, 3], [1, 3]), ([3, 1], [1, 3])), # non-contiguous + ]) + @dtypes(*product(all_types_and_complex_and(torch.half, torch.bfloat16), + all_types_and_complex_and(torch.half, torch.bfloat16))) + def test_all_dtype_noncontiguous(self, device, dtypes, shape_strides): + a_buffer = torch.rand(9, device=device).mul(10).type(dtypes[0]) + b_buffer = torch.rand(9, device=device).mul(10).type(dtypes[1]) + + a = a_buffer.as_strided(*shape_strides[0]) + b = b_buffer.as_strided(*shape_strides[1]) + + expected = ref_fn(a, b) + result = jitted_fn(a, b) + + self.assertEqual(expected, result) + + @skipCUDAIfRocm + @dtypes(torch.float, torch.double, torch.float16, torch.bfloat16) + @parametrize("alpha", [-1, 2.0, None]) + @parametrize("beta", [3, -4.2, None]) + @toleranceOverride({torch.float16 : tol(atol=1e-2, rtol=1e-3)}) + def test_extra_args(self, device, dtype, alpha, beta): + a = torch.rand(3, device=device).mul(10).type(dtype) + b = torch.rand(3, device=device).mul(10).type(dtype) + + extra_args = {} + if alpha is not None: + extra_args["alpha"] = alpha + if beta is not None: + extra_args["beta"] = beta + + expected = ref_fn(a, b, **extra_args) + result = jitted_fn(a, b, **extra_args) + + self.assertEqual(expected, result) + + @skipCUDAIfRocm + def test_bool_extra_args(self, device): + code_string = "template T conditional(T x, T mask, bool is_train) { return is_train ? 
x * mask : x; }" + jitted_fn = create_jit_fn(code_string, is_train=False) + + def ref_fn(x, mask, is_train): + return x * mask if is_train else x + + a = torch.rand(3, device=device) + b = torch.rand(3, device=device) + + expected = ref_fn(a, b, is_train=True) + result = jitted_fn(a, b, is_train=True) + self.assertEqual(expected, result) + + @skipCUDAIfRocm + @parametrize("num_inputs", list(range(1, 9))) + def test_various_num_inputs(self, num_inputs): + inputs = [] + for i in range(num_inputs): + inputs.append(torch.rand(3, device='cuda').mul(10)) + + input_string = ",".join([f"T i{i}" for i in range(num_inputs)]) + function_body = "+".join([f"i{i}" for i in range(num_inputs)]) + code_string = f"template T my_kernel({input_string}) {{ return {function_body}; }}" + jitted_fn = create_jit_fn(code_string) + + def ref_fn(*inputs): + return torch.sum(torch.stack(inputs), dim=0) + + expected = ref_fn(*inputs) + result = jitted_fn(*inputs) + + self.assertEqual(expected, result) + + @skipCUDAIfRocm + @parametrize("code_string", [ + "template T my _kernel(T x) { return x; }", + "template Tmy_kernel(T x) { return x; }", + ]) + def test_invalid_function_name(self, code_string): + with self.assertRaises(Exception): + jitted_fn = create_jit_fn(code_string) + + +instantiate_device_type_tests(TestPythonJiterator, globals(), only_for="cuda") + +if __name__ == '__main__': + run_tests() diff --git a/test/test_linalg.py b/test/test_linalg.py index 6ca35557bbf5..1b0b4d95478a 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -22,13 +22,14 @@ (instantiate_device_type_tests, dtypes, has_cusolver, onlyCPU, skipCUDAIf, skipCUDAIfNoMagma, skipCPUIfNoLapack, precisionOverride, skipCUDAIfNoMagmaAndNoCusolver, skipCUDAIfRocm, onlyNativeDeviceTypes, dtypesIfCUDA, - onlyCUDA, skipCUDAVersionIn, skipMeta, skipCUDAIfNoCusolver) + onlyCUDA, skipCUDAVersionIn, skipMeta, skipCUDAIfNoCusolver, dtypesIfMPS) from torch.testing import make_tensor from torch.testing._internal.common_dtype import ( - all_types, floating_and_complex_types, get_all_dtypes, get_all_int_dtypes, get_all_complex_dtypes, - get_all_fp_dtypes, + all_types, all_types_and_complex_and, floating_and_complex_types, integral_types, + floating_and_complex_types_and, floating_types_and, complex_types, ) -from torch.testing._internal.common_cuda import SM53OrLater, tf32_on_and_off, CUDA11OrLater, CUDA9 +from torch.testing._internal.common_cuda import SM53OrLater, tf32_on_and_off, CUDA11OrLater, CUDA9, _get_magma_version, \ + _get_torch_cuda_version from torch.distributions.binomial import Binomial # Protects against includes accidentally setting the default dtype @@ -101,7 +102,7 @@ def check(a_sizes_, b_sizes_): # Tests torch.outer, and its alias, torch.ger, vs. 
NumPy @precisionOverride({torch.bfloat16: 1e-1}) - @dtypes(*(get_all_dtypes())) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) def test_outer(self, device, dtype): def run_test_case(a, b): if dtype == torch.bfloat16: @@ -138,6 +139,14 @@ def run_test_case(a, b): run_test_case(zero_strided, b) run_test_case(a, zero_strided) + def test_solve_removed_error(self, device): + a = make_tensor(5, 5, device=device, dtype=torch.float32) + b = make_tensor(5, 1, device=device, dtype=torch.float32) + with self.assertRaisesRegex(RuntimeError, "This function was deprecated since version 1.9 and is now removed"): + torch.solve(b, a) + with self.assertRaisesRegex(RuntimeError, "This function was deprecated since version 1.9 and is now removed"): + b.solve(a) + @skipCUDAIfNoMagma @skipCPUIfNoLapack @dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble) @@ -264,7 +273,8 @@ def numpy_ref(a, b): else: # driver == 'gelsy' # QR based algorithm; setting the value too high might lead to non-unique solutions and flaky tests - rcond = 1e-4 + # so we skip this case + continue # specifying rcond value has no effect for gels driver so no need to run the tests again if driver == 'gels' and rcond is not None: @@ -744,7 +754,7 @@ def check(m, a, b, beta, alpha): check(m_scalar, a, b, beta, alpha) # test nans and infs are not propagated to the output when beta == 0 - float_and_complex_dtypes = get_all_fp_dtypes() + get_all_complex_dtypes() + float_and_complex_dtypes = floating_and_complex_types_and(torch.half, torch.bfloat16) if beta == 0 and dtype in float_and_complex_dtypes: m[0][10] = m[10][10] = m[20][20] = float('inf') m[1][10] = m[11][10] = m[21][20] = float('nan') @@ -757,7 +767,7 @@ def test_addr_bool(self, device, dtype): self._test_addr_vs_numpy(device, dtype, beta=False, alpha=False) self._test_addr_vs_numpy(device, dtype, beta=True, alpha=True) - @dtypes(*(get_all_int_dtypes())) + @dtypes(*integral_types()) def test_addr_integral(self, device, dtype): with self.assertRaisesRegex(RuntimeError, 'argument beta must not be a floating point number.'): @@ -778,7 +788,7 @@ def test_addr_integral(self, device, dtype): self._test_addr_vs_numpy(device, dtype, beta=2, alpha=2) @precisionOverride({torch.bfloat16: 1e-1}) - @dtypes(*(get_all_fp_dtypes() + get_all_complex_dtypes())) + @dtypes(*floating_and_complex_types_and(torch.half, torch.bfloat16)) def test_addr_float_and_complex(self, device, dtype): with self.assertRaisesRegex(RuntimeError, 'Boolean beta only supported for Boolean results.'): @@ -791,11 +801,11 @@ def test_addr_float_and_complex(self, device, dtype): self._test_addr_vs_numpy(device, dtype, beta=0., alpha=2) # when beta is not zero self._test_addr_vs_numpy(device, dtype, beta=0.5, alpha=2) - if dtype in get_all_complex_dtypes(): + if dtype in complex_types(): self._test_addr_vs_numpy(device, dtype, beta=(0 + 0.1j), alpha=(0.2 - 0.2j)) - @dtypes(*itertools.product(get_all_dtypes(), - get_all_dtypes())) + @dtypes(*itertools.product(all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool), + all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool))) def test_outer_type_promotion(self, device, dtypes): a = torch.randn(5).to(device=device, dtype=dtypes[0]) b = torch.randn(5).to(device=device, dtype=dtypes[1]) @@ -805,7 +815,7 @@ def test_outer_type_promotion(self, device, dtypes): # don't use @dtypes decorator to avoid generating ~1700 tests per device def test_addr_type_promotion(self, device): - for dtypes0, dtypes1, dtypes2 in 
product(get_all_dtypes(), repeat=3): + for dtypes0, dtypes1, dtypes2 in product(all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool), repeat=3): a = make_tensor((5,), device=device, dtype=dtypes0, low=-2, high=2) b = make_tensor((5,), device=device, dtype=dtypes1, low=-2, high=2) m = make_tensor((5, 5), device=device, dtype=dtypes2, low=-2, high=2) @@ -1100,96 +1110,65 @@ def test_kron_errors_and_warnings(self, device, dtype): # This test confirms that torch.linalg.norm's dtype argument works # as expected, according to the function's documentation - @skipCUDAIfNoMagma - def test_norm_dtype(self, device): - def run_test_case(input_size, ord, keepdim, from_dtype, to_dtype): - # Determine the best dtype to use for comparisons between tensors - # of two different types - def get_compare_dtype(type0, type1): - types_32bit_based = [torch.float, torch.cfloat] - is_complex = type0.is_complex or type1.is_complex - - if type0 in types_32bit_based or type1 in types_32bit_based: - return torch.cfloat if is_complex else torch.float - else: - return torch.cdouble if is_complex else torch.double - - compare_dtype = get_compare_dtype(from_dtype, to_dtype) - - def get_value_type(dtype): - if dtype == torch.cfloat: - return torch.float - elif dtype == torch.cdouble: - return torch.double - elif dtype == torch.complex32: - return torch.float16 - else: - return dtype + @dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble, torch.bfloat16, torch.float16) + def test_norm_dtype(self, device, dtype): + make_arg = partial(make_tensor, dtype=dtype, device=device) + def run_test_case(input_size, ord, keepdim, to_dtype): msg = ( f'input_size={input_size}, ord={ord}, keepdim={keepdim}, ' - f'from_dtype={from_dtype}, to_dtype={to_dtype}') - input = torch.randn(*input_size, dtype=from_dtype, device=device) + f'dtype={dtype}, to_dtype={to_dtype}') + input = make_arg(input_size) result = torch.linalg.norm(input, ord, keepdim=keepdim) - if from_dtype.is_complex: - # By default, norm downgrades a complex input to the corresponding real number type - self.assertEqual(result.dtype, get_value_type(from_dtype), msg=msg) - else: - self.assertEqual(result.dtype, from_dtype, msg=msg) + self.assertEqual(result.dtype, input.real.dtype, msg=msg) - result_out = torch.empty((0), dtype=to_dtype, device=device) + result_out = torch.empty((0), dtype=result.dtype, device=device) torch.linalg.norm(input, ord, keepdim=keepdim, out=result_out) - self.assertEqual(result_out.dtype, to_dtype, msg=msg) - self.assertEqual(result.to(compare_dtype), result_out.to(compare_dtype), msg=msg) + self.assertEqual(result, result_out, msg=msg) + result = torch.linalg.norm(input.to(to_dtype), ord, keepdim=keepdim) result_with_dtype = torch.linalg.norm(input, ord, keepdim=keepdim, dtype=to_dtype) - self.assertEqual(result_with_dtype.dtype, to_dtype, msg=msg) - - if from_dtype.is_complex: - result_convert_first = torch.linalg.norm(input.to(to_dtype), ord, keepdim=keepdim) - self.assertEqual(result_with_dtype.to(compare_dtype), result_convert_first.to(compare_dtype), msg=msg) - else: - self.assertEqual(result.to(compare_dtype), result_with_dtype.to(compare_dtype), msg=msg) + self.assertEqual(result, result_with_dtype, msg=msg) result_out_with_dtype = torch.empty_like(result_with_dtype) torch.linalg.norm(input, ord, keepdim=keepdim, dtype=to_dtype, out=result_out_with_dtype) - self.assertEqual(result_out_with_dtype.dtype, to_dtype, msg=msg) self.assertEqual(result_with_dtype, result_out_with_dtype, msg=msg) - ord_vector = [0, 0.1, -0.1, 1, 
-1, 2, -2, 3, -3, 4.5, -4.5, inf, -inf, None] + ord_vector = [0, 1, -1, 2, -2, 3, -3, 4.5, -4.5, inf, -inf, None] + + # In these orders we are computing the 10-th power and 10-th root of numbers. + # We avoid them for half-precision types as it makes the tests above too badly conditioned + if dtype != torch.float16 and dtype != torch.bfloat16: + ord_vector.extend([0.1, -0.1]) ord_matrix = ['fro', 'nuc', 1, -1, 2, -2, inf, -inf, None] S = 10 - test_cases = [ - ((S, ), ord_vector), - ((S, S), ord_matrix), - ] - for keepdim in [True, False]: - for input_size, ord_settings in test_cases: - for ord in ord_settings: - if self.device_type == 'cpu' and not torch._C.has_lapack and ord in [2, -2, 'nuc']: - continue - dtypes = [torch.float, torch.double, torch.cfloat, torch.cdouble] - for from_dtype, to_dtype in itertools.product(dtypes, dtypes): - if from_dtype.is_complex and not to_dtype.is_complex: - continue - run_test_case(input_size, ord, keepdim, from_dtype, to_dtype) - - # Make sure that setting dtype != out.dtype raises an error - dtype_pairs = [ - (torch.float, torch.double), - (torch.double, torch.float), - (torch.cfloat, torch.cdouble), - (torch.cdouble, torch.cfloat), - ] - for keepdim in [True, False]: - for input_size, ord_settings in test_cases: - for ord in ord_settings: - for dtype, out_dtype in dtype_pairs: - input = torch.rand(*input_size) - result = torch.tensor([]).to(out_dtype) - with self.assertRaisesRegex(RuntimeError, r'provided dtype must match dtype of result'): - torch.linalg.norm(input, ord=ord, keepdim=keepdim, dtype=dtype, out=result) + if dtype == torch.cfloat: + norm_dtypes = (torch.cfloat, torch.cdouble) + elif dtype == torch.cdouble: + norm_dtypes = (torch.cdouble,) + elif dtype in (torch.float16, torch.bfloat16, torch.float): + norm_dtypes = (torch.float, torch.double) + elif dtype == torch.double: + norm_dtypes = (torch.double,) + else: + raise RuntimeError("Unsupported dtype") + + for ord, keepdim, norm_dtype in product(ord_vector, (True, False), norm_dtypes): + run_test_case((S,) , ord, keepdim, norm_dtype) + + for ord, keepdim, norm_dtype in product(ord_matrix, (True, False), norm_dtypes): + if ord in [2, -2, 'nuc']: + # We need torch.svdvals + if dtype == torch.float16 or dtype == torch.bfloat16: + continue + + # We need LAPACK or equivalent + if ((torch.device(device).type == 'cuda' and not torch.cuda.has_magma and not has_cusolver()) or + (torch.device(device).type == 'cpu' and not torch._C.has_lapack)): + continue + run_test_case((S, S) , ord, keepdim, norm_dtype) + @dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble, torch.bfloat16, torch.float16) def test_vector_norm(self, device, dtype): @@ -1218,45 +1197,40 @@ def vector_norm_reference(input, ord, dim=None, keepdim=False, dtype=None): return result def run_test_case(input, ord, dim, keepdim, norm_dtype): - msg = f'input.size()={input.size()}, ord={ord}, dim={dim}, keepdim={keepdim}, dtype={dtype}, norm_dtype={norm_dtype}' - error_msg = None - if input.numel() == 0: - if ord < 0: - error_msg = r'linalg.vector_norm of negative order cannot be performed on an empty tensor' - elif ord == inf and (dim is None or input.size(dim) == 0): - error_msg = ( - r'linalg.vector_norm cannot compute the infinity norm on an empty ' - r'dimension because the operation does not have an identity') - if error_msg is None: + if (input.numel() == 0 and + (ord < 0. or ord == inf) and + (dim is None or input.shape[dim] == 0)): + # The operation does not have an identity. 
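# --- Hedged sketch (not part of the patch): the dtype contract test_norm_dtype now checks.
# A complex input produces a real-valued result (input.real.dtype), and dtype= runs the
# reduction in a wider type, matching a cast-then-norm reference.
import torch

x = torch.randn(10, dtype=torch.cfloat)
assert torch.linalg.norm(x).dtype == torch.float            # == x.real.dtype

y = torch.randn(10)                                          # float32
res_wide = torch.linalg.norm(y, dtype=torch.double)          # accumulate in float64
assert res_wide.dtype == torch.double
assert torch.allclose(res_wide, torch.linalg.norm(y.double()))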
+ error_msg = "linalg.vector_norm cannot compute" + with self.assertRaisesRegex(RuntimeError, error_msg): + torch.linalg.vector_norm(input, ord, dim=dim, keepdim=keepdim) + else: + msg = (f'input.size()={input.size()}, ord={ord}, dim={dim}, ' + f'keepdim={keepdim}, dtype={dtype}, norm_dtype={norm_dtype}') result_dtype_reference = vector_norm_reference(input, ord, dim=dim, keepdim=keepdim, dtype=norm_dtype) result_dtype = torch.linalg.vector_norm(input, ord, dim=dim, keepdim=keepdim, dtype=norm_dtype) + if dtype.is_complex: + result_dtype_reference = result_dtype_reference.real self.assertEqual(result_dtype, result_dtype_reference, msg=msg) if norm_dtype is not None: - result_convert_before = torch.linalg.vector_norm(input.to(norm_dtype), ord, dim=dim, keepdim=keepdim) - if norm_dtype.is_complex: - result_convert_before = result_convert_before.to(norm_dtype) - - result_out = torch.empty((0), dtype=norm_dtype, device=device) - torch.linalg.vector_norm(input, ord, dtype=norm_dtype, dim=dim, keepdim=keepdim, out=result_out) - self.assertEqual(result_convert_before, result_out, msg=msg) - else: - result_out = torch.empty((0), dtype=result_dtype.dtype, device=device) - torch.linalg.vector_norm(input, ord, dim=dim, keepdim=keepdim, out=result_out) - self.assertEqual(result_dtype, result_out, msg=msg) - else: - with self.assertRaises(RuntimeError): - vector_norm_reference(input, ord, dim=dim, keepdim=keepdim) - with self.assertRaisesRegex(RuntimeError, error_msg): - torch.linalg.vector_norm(input, ord, dim=dim, keepdim=keepdim) - - if dtype.is_complex: - norm_dtypes = [None, torch.cfloat, torch.cdouble] + ref = torch.linalg.vector_norm(input.to(norm_dtype), ord, dim=dim, keepdim=keepdim) + actual = torch.linalg.vector_norm(input, ord, dim=dim, keepdim=keepdim, dtype=norm_dtype) + self.assertEqual(ref, actual, msg=msg) + + if dtype == torch.cfloat: + norm_dtypes = (None, torch.cfloat, torch.cdouble) + elif dtype == torch.cdouble: + norm_dtypes = (None, torch.cdouble) + elif dtype in (torch.float16, torch.bfloat16, torch.float): + norm_dtypes = (None, torch.float, torch.double) + elif dtype == torch.double: + norm_dtypes = (None, torch.double) else: - norm_dtypes = [None, torch.float, torch.double, torch.cfloat, torch.cdouble, torch.float16, torch.bfloat16] + raise RuntimeError("Unsupported dtype") for input_size, ord, keepdim, norm_dtype in product(input_sizes, ord_vector, [True, False], norm_dtypes): - input = make_tensor(input_size, device, dtype, low=-9, high=9) + input = make_tensor(input_size, dtype=dtype, device=device, low=-9, high=9) for dim in [None, random.randint(0, len(input_size) - 1)]: run_test_case( input, @@ -1287,40 +1261,6 @@ def test_vector_norm_dim_tuple_arg(self, device): with self.assertRaises(error): torch.linalg.vector_norm(input, dim=dim) - # Test that linalg.vector_norm throws an error if the out tensor's dtype - # does not match the expected output dtype - @dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble, torch.bfloat16, torch.float16) - def test_vector_norm_out_dtype_error(self, device, dtype): - input = torch.randn(10, device=device, dtype=dtype) - dtypes = [None, torch.float, torch.double, torch.cfloat, torch.cdouble, torch.float16, torch.bfloat16] - - for norm_dtype, out_dtype in product(dtypes, dtypes): - if out_dtype is None: - continue - - if norm_dtype is None: - if dtype == torch.cfloat: - expected_dtype = torch.float - elif dtype == torch.cdouble: - expected_dtype = torch.double - else: - expected_dtype = dtype - else: - expected_dtype = norm_dtype 
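# --- Hedged sketch (not part of the patch): the degenerate-input rule the rewritten
# run_test_case encodes. On an empty tensor, orders without an identity (ord < 0 or
# ord == inf) must raise; well-defined orders still return 0.
import torch

empty = torch.empty(0)
assert torch.linalg.vector_norm(empty, ord=2) == 0.0         # identity exists: fine
try:
    torch.linalg.vector_norm(empty, ord=float('inf'))        # no identity: must raise
except RuntimeError as e:
    assert "linalg.vector_norm cannot compute" in str(e)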
- - result = torch.empty((0), device=device, dtype=out_dtype) - msg = f'norm_dtype: {norm_dtype}, out_dtype: {out_dtype}, expected_dtype: {expected_dtype}' - - if dtype.is_complex and norm_dtype is not None and not norm_dtype.is_complex: - with self.assertRaisesRegex(RuntimeError, r"linalg.vector_norm expected complex 'dtype'", msg=msg): - torch.linalg.vector_norm(input, dtype=norm_dtype, out=result) - - elif out_dtype != expected_dtype: - with self.assertRaisesRegex(RuntimeError, r'linalg.vector_norm expected out tensor dtype', msg=msg): - torch.linalg.vector_norm(input, dtype=norm_dtype, out=result) - else: - torch.linalg.vector_norm(input, dtype=norm_dtype, out=result) - # This test compares torch.linalg.norm and numpy.linalg.norm to ensure that # their vector norm results match @dtypes(torch.float, torch.double) @@ -1363,49 +1303,45 @@ def run_test_case(input, p, dim, keepdim): @skipMeta # https://github.com/pytorch/pytorch/issues/54082 @skipCUDAIfNoMagma @dtypes(torch.float, torch.double) - @precisionOverride({torch.float32: 2e-5}) + @precisionOverride({torch.float32: 2e-4}) def test_norm_matrix(self, device, dtype): + make_arg = partial(make_tensor, dtype=dtype, device=device) + def run_test_case(input, ord, dim, keepdim): msg = f'input.size()={input.size()}, ord={ord}, dim={dim}, keepdim={keepdim}, dtype={dtype}' result = torch.linalg.norm(input, ord, dim, keepdim) input_numpy = input.cpu().numpy() result_numpy = np.linalg.norm(input_numpy, ord, dim, keepdim) - def check(op): - result = op(input, ord, dim, keepdim) - self.assertEqual(result, result_numpy, msg=msg) - result_out = torch.empty_like(result) - op(input, ord, dim, keepdim, out=result_out) - self.assertEqual(result, result_out, msg=msg) - - check(torch.linalg.norm) + result = torch.linalg.norm(input, ord, dim, keepdim) + self.assertEqual(result, result_numpy, msg=msg) if ord is not None and dim is not None: - check(torch.linalg.matrix_norm) + result = torch.linalg.matrix_norm(input, ord, dim, keepdim) + self.assertEqual(result, result_numpy, msg=msg) ord_matrix = [1, -1, 2, -2, inf, -inf, 'nuc', 'fro'] S = 10 test_cases = [ - # input size, p settings, dim - ((S, S), ord_matrix, None), - ((S, S), ord_matrix, (0, 1)), - ((S, S), ord_matrix, (1, 0)), - ((S, S, S, S), ord_matrix, (2, 0)), - ((S, S, S, S), ord_matrix, (-1, -2)), - ((S, S, S, S), ord_matrix, (-1, -3)), - ((S, S, S, S), ord_matrix, (-3, 2)), + # input size, dim + ((S, S), None), + ((S, S), (0, 1)), + ((S, S), (1, 0)), + ((S, S, S, S), (2, 0)), + ((S, S, S, S), (-1, -2)), + ((S, S, S, S), (-1, -3)), + ((S, S, S, S), (-3, 2)), ] - L = 1_000 - if dtype == torch.double: - test_cases.append(((L, L), ord_matrix, None)) - - for keepdim in [True, False]: - for input_size, ord_settings, dim in test_cases: - input = torch.randn(*input_size, dtype=dtype, device=device) - for ord in ord_settings: - if self.device_type == 'cpu' and not torch._C.has_lapack and ord in [2, -2, 'nuc']: - continue - run_test_case(input, ord, dim, keepdim) + for (shape, dim), keepdim, ord in product(test_cases, [True, False], ord_matrix): + if ord in [2, -2, 'nuc']: + # We need torch.svdvals + if dtype == torch.float16 or dtype == torch.bfloat16: + continue + # We need LAPACK or equivalent + if ((torch.device(device).type == 'cuda' and not torch.cuda.has_magma and not has_cusolver()) or + (torch.device(device).type == 'cpu' and not torch._C.has_lapack)): + continue + run_test_case(make_arg(shape), ord, dim, keepdim) @onlyCUDA @@ -1573,20 +1509,17 @@ def run_error_test_case(input, ord, dim, keepdim, 
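# --- Hedged sketch (not part of the patch): the NumPy comparison pattern test_norm_matrix
# uses, restricted to orders that do not need an SVD (so no LAPACK/MAGMA requirement).
import torch
import numpy as np

x = torch.randn(10, 10, dtype=torch.float64)
for p in (1, -1, float('inf'), 'fro', None):
    res = torch.linalg.norm(x, p, dim=(0, 1))
    ref = np.linalg.norm(x.numpy(), p, (0, 1))
    np.testing.assert_allclose(res.numpy(), ref, rtol=1e-10)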
error_type, error_regex): S = 10 error_test_cases = [ # input size, p settings, dim, error type, error regex - ((S, ), ['fro'], None, RuntimeError, r'order "fro" can only be used if either len\(dim\) == 2'), - ((S, ), ['nuc'], None, RuntimeError, r'order "nuc" can only be used if either len\(dim\) == 2'), - ((S, S), [3.5], None, RuntimeError, r'Order 3.5 not supported for matrix norm'), - ((S, S), [0], None, RuntimeError, r'Order 0 not supported for matrix norm'), - ((S, S), ['nuc'], 0, RuntimeError, r'order "nuc" can only be used if either len\(dim\) == 2'), - ((S, S), ['fro'], 0, RuntimeError, r'order "fro" can only be used if either len\(dim\) == 2'), - ((S, S), ['nuc'], (0, 0), RuntimeError, r'duplicate or invalid dimensions'), - ((S, S), ['fro', 0], (0, 0), RuntimeError, r'Expected dims to be different'), - ((S, S), ['fro', 'nuc', 0], (0, 4), IndexError, r'Dimension out of range'), + ((S, ), ['fro', 'nuc'], None, RuntimeError, r'input tensor must be a matrix or a batch of matrices'), + ((S, S), [3.5], None, RuntimeError, r'matrix_norm: Order 3.5 not supported'), + ((S, S), [0], None, RuntimeError, r'matrix_norm: Order 0 not supported'), + ((S, S), ['fail'], None, RuntimeError, r'matrix_norm: Order fail not supported'), + ((S, S), ['fro', 'nuc'], 0, RuntimeError, r'matrix_norm: dim must be a 2-tuple of ints'), + ((S, S), ['fro', 'nuc', 2], (0, 0), RuntimeError, r'Expected dims to be different'), + ((S, S), ['fro', 'nuc', 2], (0, 4), IndexError, r'Dimension out of range'), ((S, ), [0], (4, ), IndexError, r'Dimension out of range'), ((S, ), [None], (0, 0), RuntimeError, r'dim 0 appears multiple times'), - ((S, S, S), [1], (0, 1, 2), RuntimeError, r"'dim' must specify 1 or 2 dimensions"), - ((S, S, S), [1], None, RuntimeError, r"'dim' must specify 1 or 2 dimensions"), - ((S, S), ['garbage'], (0, 1), RuntimeError, r'Invalid norm order: garbage'), + ((S, S, S), [1], (0, 1, 2), RuntimeError, r"If dim is specified, it must be of length 1 or 2."), + ((S, S, S), [1], None, RuntimeError, r"If dim is not specified but ord is, the input must be 1D or 2D"), ] for keepdim in [True, False]: for input_size, ord_settings, dim, error_type, error_regex in error_test_cases: @@ -1619,10 +1552,10 @@ def gen_error_message(input_size, ord, keepdim, dim=None): self.assertEqual(res.shape, expected.shape, msg=msg) self.assertEqual(res, expected, msg=msg, exact_dtype=False) - res_out = torch.tensor([]).to(device) + res_out = torch.tensor([], device=device, dtype=res.dtype) torch.linalg.norm(x, ord, keepdim=keepdim, out=res_out) self.assertEqual(res_out.shape, expected.shape, msg=msg) - self.assertEqual(res_out.cpu(), expected, msg=msg, exact_dtype=False) + self.assertEqual(res_out, expected, msg=msg) # matrix norm x = torch.randn(25, 25, device=device, dtype=dtype) @@ -1634,10 +1567,10 @@ def gen_error_message(input_size, ord, keepdim, dim=None): self.assertEqual(res.shape, expected.shape, msg=msg) self.assertEqual(res, expected, msg=msg, exact_dtype=False) - res_out = torch.tensor([]).to(device) + res_out = torch.tensor([], device=device, dtype=res.dtype) torch.linalg.norm(x, ord, keepdim=keepdim, out=res_out) self.assertEqual(res_out.shape, expected.shape, msg=msg) - self.assertEqual(res_out.cpu(), expected, msg=msg, exact_dtype=False) + self.assertEqual(res_out, expected, msg=msg) # Test that linal.vector_norm gives the same result as numpy when inputs # contain extreme values (inf, -inf, nan) @@ -1655,18 +1588,17 @@ def test_vector_norm_extreme_values(self, device): result_n = np.linalg.norm(x_n, ord=ord) 
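# --- Hedged sketch (not part of the patch): the tightened matrix-norm contract behind the
# new error strings above. 'fro'/'nuc' need a matrix (or a 2-tuple dim), and matrix_norm
# only accepts +-1, +-2, +-inf, 'fro' and 'nuc'.
import torch

v = torch.randn(10)
m = torch.randn(10, 10)

try:
    torch.linalg.norm(v, ord='fro')        # 1-D input: "must be a matrix or a batch of matrices"
except RuntimeError:
    pass
try:
    torch.linalg.matrix_norm(m, ord=3.5)   # "Order 3.5 not supported"
except RuntimeError:
    pass

torch.linalg.matrix_norm(m, ord='fro', dim=(0, 1))   # valid: dim is always a 2-tuple
torch.linalg.norm(m, ord=float('inf'))               # valid: 2-D input with a matrix order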
self.assertEqual(result, result_n, msg=msg) - @skipMeta # https://github.com/pytorch/pytorch/issues/54082 - @skipCUDAIfNoMagma + @skipCUDAIfNoMagmaAndNoCusolver @skipCPUIfNoLapack @dtypes(torch.float, torch.double) @precisionOverride({torch.float32: 2e-5}) def test_matrix_norm(self, device, dtype): # Test only inputs for which torch.linalg.matrix_norm diverges from torch.linalg.norm - A = make_tensor((2, 2, 2), device, dtype) + A = make_tensor((2, 2, 2), dtype=dtype, device=device) - with self.assertRaisesRegex(RuntimeError, r'linalg.matrix_norm\(\):.*must be a matrix.*'): - torch.linalg.matrix_norm(make_tensor((2,), device, dtype)) - with self.assertRaisesRegex(RuntimeError, r'linalg.matrix_norm\(\):.*must be a 2-tuple.*'): + with self.assertRaisesRegex(RuntimeError, r'linalg.matrix_norm:.*must be a matrix.*'): + torch.linalg.matrix_norm(make_tensor((2,), dtype=dtype, device=device)) + with self.assertRaisesRegex(RuntimeError, r'linalg.matrix_norm:.*must be a 2-tuple.*'): torch.linalg.matrix_norm(A, dim=(0,)) with self.assertRaisesRegex(RuntimeError, r'.*not supported.*'): torch.linalg.matrix_norm(A, ord=0) @@ -1738,14 +1670,9 @@ def is_broken_matrix_norm_case(ord, x): def test_norm_vector_degenerate_shapes(self, device, dtype): def run_test_case(input, ord, dim, keepdim): msg = f'input.size()={input.size()}, ord={ord}, dim={dim}, keepdim={keepdim}, dtype={dtype}' - should_error = False - if ord is not None and ord < 0: - should_error = True - elif ord == inf: - if dim is None or input.size(dim) == 0: - should_error = True - - if should_error: + if (input.numel() == 0 and + (ord < 0. or ord == inf) and + (dim is None or input.shape[dim] == 0)): with self.assertRaises(RuntimeError): torch.linalg.norm(input, ord, dim, keepdim) else: @@ -1754,7 +1681,7 @@ def run_test_case(input, ord, dim, keepdim): result = torch.linalg.norm(input, ord, dim, keepdim) self.assertEqual(result, result_numpy, msg=msg) - ord_vector = [0, 0.5, 1, 2, 3, inf, -0.5, -1, -2, -3, -inf, None] + ord_vector = [0, 0.5, 1, 2, 3, inf, -0.5, -1, -2, -3, -inf] S = 10 test_cases = [ # input size, dim @@ -2381,7 +2308,7 @@ def test_norm_fro_2_equivalence_old(self, device, dtype): (5, 3, 8, 1, 3, 5)] for input_size in input_sizes: - a = make_tensor(input_size, device, dtype, low=-9, high=9) + a = make_tensor(input_size, dtype=dtype, device=device, low=-9, high=9) # Try full reduction dim_settings = [None] @@ -2866,7 +2793,6 @@ def test_inv_ex_info_device(self, device, dtype): @skipCUDAIfNoMagmaAndNoCusolver @skipCPUIfNoLapack @dtypes(*floating_and_complex_types()) - @skipCUDAIfRocm def test_inv_ex_singular(self, device, dtype): # if the input matrix is not invertible, info with positive integer is returned A = torch.eye(3, 3, dtype=dtype, device=device) @@ -2894,6 +2820,7 @@ def test_inv_ex_singular(self, device, dtype): @slowTest @skipCUDAIfNoMagmaAndNoCusolver @skipCPUIfNoLapack + @skipCUDAIfRocm @dtypes(*floating_and_complex_types()) @precisionOverride({torch.float32: 2e-3, torch.complex64: 2e-3, torch.float64: 1e-5, torch.complex128: 1e-5}) @@ -2936,7 +2863,7 @@ def run_test_singular_input(batch_dim, n): @skipCPUIfNoLapack @onlyNativeDeviceTypes # TODO: XLA doesn't raise exception @skipCUDAIfRocm - @skipCUDAVersionIn([(11, 3), (11, 5)]) # https://github.com/pytorch/pytorch/issues/57482 + @skipCUDAVersionIn([(11, 3), (11, 5), (11, 6)]) # https://github.com/pytorch/pytorch/issues/57482 @dtypes(*floating_and_complex_types()) def test_inverse_errors_large(self, device, dtype): # Test batched inverse of singular matrices reports 
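# --- Hedged sketch (not part of the patch): the inv_ex contract test_inv_ex_singular relies
# on. A singular input does not raise; failure is reported through the info tensor, while
# torch.linalg.inv on the same input would raise a RuntimeError instead.
import torch

A = torch.eye(3, dtype=torch.float64)
A[-1, -1] = 0                                   # make A singular
res = torch.linalg.inv_ex(A)
assert res.info.item() > 0                      # positive info: factorisation hit a zero pivot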
errors without crashing (gh-51930) @@ -3243,89 +3170,23 @@ def run_test_singular_input(batch_dim, n): @skipCUDAIfNoMagma @skipCPUIfNoLapack @dtypes(*floating_and_complex_types()) - def test_old_solve(self, device, dtype): - for (k, n) in zip([2, 3, 5], [3, 5, 7]): - b, A = self.solve_test_helper((n, n), (n, k), device, dtype) - x = torch.solve(b, A)[0] - self.assertEqual(b, np.matmul(A.cpu(), x.cpu())) - - @skipCUDAIfNoMagma - @skipCPUIfNoLapack - @dtypes(*floating_and_complex_types()) - def test_old_solve_batched(self, device, dtype): - def solve_batch_helper(A_dims, b_dims): - b, A = self.solve_test_helper(A_dims, b_dims, device, dtype) - x_exp_list = [] - for i in range(b_dims[0]): - x_exp_list.append(torch.solve(b[i], A[i])[0]) - x_exp = torch.stack(x_exp_list) # Stacked output - x_act = torch.solve(b, A)[0] # Actual output - self.assertEqual(x_exp, x_act) # Equality check - Ax = np.matmul(A.cpu(), x_act.cpu()) - self.assertEqual(b, Ax) - - for batchsize in [1, 3, 4]: - solve_batch_helper((batchsize, 5, 5), (batchsize, 5, 10)) - - @slowTest - @skipCUDAIfNoMagma - @skipCPUIfNoLapack - @dtypes(*floating_and_complex_types()) - def test_old_solve_batched_many_batches(self, device, dtype): - for A_dims, b_dims in zip([(256, 256, 5, 5), (3, 3)], [(5, 1), (512, 512, 3, 1)]): - b, A = self.solve_test_helper(A_dims, b_dims, device, dtype) - x, _ = torch.solve(b, A) - Ax = torch.matmul(A, x) - self.assertEqual(Ax, b.expand_as(x)) - - @skipCUDAIfNoMagma - @skipCPUIfNoLapack - @dtypes(*floating_and_complex_types()) - def test_old_solve_batched_broadcasting(self, device, dtype): + def test_solve_batched_broadcasting(self, device, dtype): from numpy.linalg import solve - def run_test(A_dims, b_dims): + def run_test(A_dims, B_dims): A_matrix_size = A_dims[-1] A_batch_dims = A_dims[:-2] - b, A = self.solve_test_helper(A_batch_dims + (A_matrix_size, A_matrix_size), b_dims, device, dtype) - x, _ = torch.solve(b, A) - x_exp = solve(A.cpu().numpy(), b.cpu().numpy()) - self.assertEqual(x, x_exp) + B, A = self.solve_test_helper(A_batch_dims + (A_matrix_size, A_matrix_size), B_dims, device, dtype) + actual = torch.linalg.solve(A, B) + expected = solve(A.cpu().numpy(), B.cpu().numpy()) + self.assertEqual(actual, expected) # test against numpy.linalg.solve - run_test((2, 1, 3, 4, 4), (2, 1, 3, 4, 6)) # no broadcasting - run_test((2, 1, 3, 4, 4), (4, 6)) # broadcasting b + run_test((5, 5), (2, 0, 5, 3)) # broadcasting with 0 batch dim + run_test((2, 0, 5, 5), (5, 3)) # broadcasting with 0 batch dim + run_test((2, 1, 3, 4, 4), (4, 6)) # broadcasting B run_test((4, 4), (2, 1, 3, 4, 2)) # broadcasting A - run_test((1, 3, 1, 4, 4), (2, 1, 3, 4, 5)) # broadcasting A & b - - @skipCUDAIfNoMagma - @skipCPUIfNoLapack - @dtypes(*floating_and_complex_types()) - def test_old_solve_errors_and_warnings(self, device, dtype): - # dtypes should be safely castable - a = torch.eye(2, dtype=dtype, device=device) - b = torch.randn(2, 1, dtype=dtype, device=device) - out = torch.empty(0, dtype=torch.int, device=device) - lu = torch.empty(0, dtype=dtype, device=device) - with self.assertRaisesRegex(RuntimeError, "but got solution with dtype Int"): - torch.solve(b, a, out=(out, lu)) - - out = torch.empty(0, dtype=dtype, device=device) - lu = torch.empty(0, dtype=torch.int, device=device) - with self.assertRaisesRegex(RuntimeError, "but got lu with dtype Int"): - torch.solve(b, a, out=(out, lu)) - - # device should match - if torch.cuda.is_available(): - wrong_device = 'cpu' if self.device_type != 'cpu' else 'cuda' - out = 
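# --- Hedged sketch (not part of the patch): the broadcasting that test_solve_batched_broadcasting
# now checks through torch.linalg.solve. Note the argument order: linalg.solve(A, B) solves
# A X = B, whereas the removed torch.solve took (B, A).
import torch
import numpy as np

A = torch.randn(4, 4, dtype=torch.float64)              # single matrix
B = torch.randn(2, 1, 3, 4, 2, dtype=torch.float64)     # batch of right-hand-side matrices
X = torch.linalg.solve(A, B)                             # A broadcasts over B's batch dims
assert X.shape == (2, 1, 3, 4, 2)
np.testing.assert_allclose(X.numpy(), np.linalg.solve(A.numpy(), B.numpy()),
                           rtol=1e-7, atol=1e-10)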
torch.empty(0, dtype=dtype, device=wrong_device) - lu = torch.empty_like(a) - with self.assertRaisesRegex(RuntimeError, "tensors to be on the same device"): - torch.solve(b, a, out=(out, lu)) - out = torch.empty(0, dtype=dtype, device=device) - lu = torch.empty_like(a).to(wrong_device) - with self.assertRaisesRegex(RuntimeError, "tensors to be on the same device"): - torch.solve(b, a, out=(out, lu)) + run_test((1, 3, 1, 4, 4), (2, 1, 3, 4, 5)) # broadcasting A & B @skipCUDAIfNoMagma @skipCPUIfNoLapack @@ -3678,6 +3539,9 @@ def test_matrix_rank_atol_rtol(self, device, dtype): result = torch.linalg.matrix_rank(a, atol=tol_value, rtol=tol_value) self.assertEqual(result, 2) # there are 2 singular values above max(0.81, 1.5*0.81) + # CUDA 11.6 issue failure https://github.com/pytorch/pytorch/issues/75391 + @skipCUDAIf(torch.version.cuda is not None + and torch.version.cuda.split(".") == ["11", "6"], "There's a bug in CUDA 11.6") @skipCUDAIfNoMagma @skipCPUIfNoLapack @dtypes(*floating_and_complex_types()) @@ -3791,7 +3655,7 @@ def test_old_matrix_rank(self, device, dtype): # This tests only the cases where torch.chain_matmul differs from torch.linalg.multi_dot which this is an "alias" for. def test_chain_matmul(self, device, dtype): # chain_matmul accepts a single input tensor while multi_dot does not - t = make_tensor((2, 2), device, dtype) + t = make_tensor((2, 2), dtype=dtype, device=device) self.assertEqual(t, torch.chain_matmul(t)) with self.assertRaisesRegex(RuntimeError, r"chain_matmul\(\): Expected one or more matrices"): torch.chain_matmul() @@ -3799,13 +3663,13 @@ def test_chain_matmul(self, device, dtype): # chain_matmul expects all tensors to be 2D whereas multi_dot allows the first and last tensors to # be either 1D or 2D with self.assertRaisesRegex(RuntimeError, r"Tensor dimension is 1, expected 2 instead"): - torch.chain_matmul(make_tensor(1, device, dtype), make_tensor(1, device, dtype)) + torch.chain_matmul(make_tensor(1, dtype=dtype, device=device), make_tensor(1, dtype=dtype, device=device)) @onlyNativeDeviceTypes @dtypes(torch.double, torch.cdouble) def test_multi_dot(self, device, dtype): def check(*shapes): - tensors = [make_tensor(shape, device, dtype) for shape in shapes] + tensors = [make_tensor(shape, dtype=dtype, device=device) for shape in shapes] np_arrays = [tensor.cpu().numpy() for tensor in tensors] res = torch.linalg.multi_dot(tensors).cpu() ref = torch.from_numpy(np.array(np.linalg.multi_dot(np_arrays))) @@ -3843,7 +3707,7 @@ def check(tensors, out, msg): with self.assertRaisesRegex(RuntimeError, msg): torch.linalg.multi_dot(tensors, out=out) - a = make_tensor(2, device, dtype) + a = make_tensor(2, dtype=dtype, device=device) check([], None, "expected at least 2 tensors") check([a], None, "expected at least 2 tensors") @@ -3852,17 +3716,17 @@ def check(tensors, out, msg): check([a, torch.tensor(1, device=device, dtype=dtype)], None, "the last tensor must be 1D or 2D") check([a, a, a], None, "tensor 1 must be 2D") - check([a, make_tensor((2, 2, 2), device, dtype), a], None, "tensor 1 must be 2D") + check([a, make_tensor((2, 2, 2), dtype=dtype, device=device), a], None, "tensor 1 must be 2D") - check([a, make_tensor(2, device, torch.double)], None, "all tensors must have be the same dtype") + check([a, make_tensor(2, dtype=torch.double, device=device)], None, "all tensors must have be the same dtype") check([a, a], torch.empty(0, device=device, dtype=torch.double), "expected out tensor to have dtype") if self.device_type == 'cuda': - check([a, make_tensor(2, 
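# --- Hedged sketch (not part of the patch): the multi_dot behaviour the checks above target.
# multi_dot chooses an efficient multiplication order and, unlike chain_matmul, accepts 1-D
# first/last operands; chain_matmul requires every operand to be 2-D.
import torch

a = torch.randn(3, dtype=torch.float64)        # treated as a 1 x 3 row vector
B = torch.randn(3, 20, dtype=torch.float64)
C = torch.randn(20, 4, dtype=torch.float64)
d = torch.randn(4, dtype=torch.float64)        # treated as a 4 x 1 column vector

out = torch.linalg.multi_dot([a, B, C, d])
assert out.shape == ()                         # 1-D on both ends -> 0-D result
assert torch.allclose(out, a @ B @ C @ d)

M = torch.randn(2, 2)
assert torch.equal(torch.chain_matmul(M), M)   # a single matrix is allowed and returned as-is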
'cpu', dtype)], None, "all tensors must be on the same device") + check([a, make_tensor(2, dtype=dtype, device="cpu")], None, "all tensors must be on the same device") check([a, a], torch.empty(0, dtype=dtype), "expected out tensor to be on device") - check([a, make_tensor(3, device, dtype)], None, "cannot be multiplied") - check([a, make_tensor((3, 2), device, dtype), a], None, "cannot be multiplied") + check([a, make_tensor(3, dtype=dtype, device=device)], None, "cannot be multiplied") + check([a, make_tensor((3, 2), dtype=dtype, device=device), a], None, "cannot be multiplied") @precisionOverride({torch.float32: 5e-6, torch.complex64: 5e-6}) @skipCUDAIfNoMagma @@ -3955,14 +3819,14 @@ def test_linalg_qr_autograd_errors(self, device, dtype): self.assertEqual(q.shape, (0,)) # empty tensor b = torch.sum(r) with self.assertRaisesRegex(RuntimeError, - "The derivative of qr is not implemented when mode='r'"): + "The derivative of linalg.qr depends on Q"): b.backward() # inp = torch.randn((7, 5), device=device, dtype=dtype, requires_grad=True) q, r = torch.linalg.qr(inp, mode='complete') b = torch.sum(r) with self.assertRaisesRegex(RuntimeError, - "The derivative of qr is not implemented when mode='complete' and nrows > ncols"): + "The QR decomposition is not differentiable when mode='complete' and nrows > ncols"): b.backward() @skipCUDAIfNoMagma @@ -4054,17 +3918,17 @@ def _check_einsum(self, *args, np_args=None): @dtypes(torch.double, torch.cdouble) def test_einsum(self, device, dtype): # Test cases from https://gist.github.com/rockt/15ee013889d65342088e9260a377dc8f - x = make_tensor((5,), device, dtype) - y = make_tensor((7,), device, dtype) - A = make_tensor((3, 5), device, dtype) - B = make_tensor((2, 5), device, dtype) - C = make_tensor((2, 3, 5), device, dtype) - D = make_tensor((2, 5, 7), device, dtype) - E = make_tensor((7, 9), device, dtype) - F = make_tensor((2, 3, 3, 5), device, dtype) - G = make_tensor((5, 4, 6), device, dtype) - H = make_tensor((4, 4), device, dtype) - I = make_tensor((2, 3, 2), device, dtype) + x = make_tensor((5,), dtype=dtype, device=device) + y = make_tensor((7,), dtype=dtype, device=device) + A = make_tensor((3, 5), dtype=dtype, device=device) + B = make_tensor((2, 5), dtype=dtype, device=device) + C = make_tensor((2, 3, 5), dtype=dtype, device=device) + D = make_tensor((2, 5, 7), dtype=dtype, device=device) + E = make_tensor((7, 9), dtype=dtype, device=device) + F = make_tensor((2, 3, 3, 5), dtype=dtype, device=device) + G = make_tensor((5, 4, 6), dtype=dtype, device=device) + H = make_tensor((4, 4), dtype=dtype, device=device) + I = make_tensor((2, 3, 2), dtype=dtype, device=device) # Vector operations self._check_einsum('i->', x) # sum @@ -4095,20 +3959,20 @@ def test_einsum(self, device, dtype): self._check_einsum("ii", H) # trace self._check_einsum("ii->i", H) # diagonal self._check_einsum('iji->j', I) # non-contiguous trace - self._check_einsum('ngrg...->nrg...', make_tensor((2, 1, 3, 1, 4), device, dtype)) + self._check_einsum('ngrg...->nrg...', make_tensor((2, 1, 3, 1, 4), dtype=dtype, device=device)) # Test ellipsis self._check_einsum("i...->...", H) self._check_einsum("ki,...k->i...", A.t(), B) self._check_einsum("k...,jk->...", A.t(), B) self._check_einsum('...ik, ...j -> ...ij', C, x) - self._check_einsum('Bik,k...j->i...j', C, make_tensor((5, 3), device, dtype)) - self._check_einsum('i...j, ij... 
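# --- Hedged sketch (not part of the patch): the autograd restriction behind the reworded
# message above. With mode='r' only R is returned, the derivative needs Q, so backward raises.
import torch

inp = torch.randn(5, 7, dtype=torch.float64, requires_grad=True)
q, r = torch.linalg.qr(inp, mode='r')           # q is an empty tensor in this mode
try:
    r.sum().backward()
except RuntimeError as e:
    assert "The derivative of linalg.qr depends on Q" in str(e)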
-> ...ij', C, make_tensor((2, 5, 2, 3), device, dtype)) + self._check_einsum('Bik,k...j->i...j', C, make_tensor((5, 3), dtype=dtype, device=device)) + self._check_einsum('i...j, ij... -> ...ij', C, make_tensor((2, 5, 2, 3), dtype=dtype, device=device)) # torch.bilinear with noncontiguous tensors - l = make_tensor((5, 10), device, dtype, noncontiguous=True) - r = make_tensor((5, 20), device, dtype, noncontiguous=True) - w = make_tensor((15, 10, 20), device, dtype) + l = make_tensor((5, 10), dtype=dtype, device=device, noncontiguous=True) + r = make_tensor((5, 20), dtype=dtype, device=device, noncontiguous=True) + w = make_tensor((15, 10, 20), dtype=dtype, device=device) self._check_einsum("bn,anm,bm->ba", l, w, r) # with strided tensors @@ -4116,11 +3980,11 @@ def test_einsum(self, device, dtype): @dtypes(torch.double, torch.cdouble) def test_einsum_sublist_format(self, device, dtype): - x = make_tensor((5,), device, dtype) - y = make_tensor((7,), device, dtype) - A = make_tensor((3, 5), device, dtype) - B = make_tensor((2, 5), device, dtype) - C = make_tensor((2, 1, 3, 1, 4), device, dtype) + x = make_tensor((5,), dtype=dtype, device=device) + y = make_tensor((7,), dtype=dtype, device=device) + A = make_tensor((3, 5), dtype=dtype, device=device) + B = make_tensor((2, 5), dtype=dtype, device=device) + C = make_tensor((2, 1, 3, 1, 4), dtype=dtype, device=device) self._check_einsum(x, [0]) self._check_einsum(x, [0], []) @@ -4135,9 +3999,9 @@ def test_einsum_sublist_format(self, device, dtype): self._check_einsum(A.t(), [0, Ellipsis], B, [1, 0], [Ellipsis]) # torch.bilinear with noncontiguous tensors - l = make_tensor((5, 10), device, dtype, noncontiguous=True) - r = make_tensor((5, 20), device, dtype, noncontiguous=True) - w = make_tensor((15, 10, 20), device, dtype) + l = make_tensor((5, 10), dtype=dtype, device=device, noncontiguous=True) + r = make_tensor((5, 20), dtype=dtype, device=device, noncontiguous=True) + w = make_tensor((15, 10, 20), dtype=dtype, device=device) self._check_einsum(l, [40, 41], w, [2, 41, 50], r, [40, 50], [40, 2]) @dtypes(torch.double, torch.cdouble) @@ -4214,7 +4078,7 @@ def test(n=10, # how many tests to generate shape[ell_index:ell_index] = ell_shape labels.insert(ell_index, ...) - operands.append(make_tensor(shape, device, dtype)) + operands.append(make_tensor(shape, dtype=dtype, device=device)) sublists.append(labels) # NumPy has a bug with the sublist format so for now we compare PyTorch sublist @@ -4251,7 +4115,7 @@ def test(n=10, # how many tests to generate def test_einsum_corner_cases(self, device): def check(equation, *operands, expected_output): tensors = [torch.tensor(operand, device=device, dtype=torch.float32) if not isinstance(operand, tuple) - else make_tensor(operand, device, torch.float32) for operand in operands] + else make_tensor(operand, dtype=torch.float32, device=device) for operand in operands] output = torch.einsum(equation, tensors) self.assertEqual(output, torch.tensor(expected_output, dtype=torch.float32, device=device)) @@ -4293,8 +4157,8 @@ def check(*args, regex, exception=RuntimeError): with self.assertRaisesRegex(exception, r'einsum\(\):.*' + regex): torch.einsum(*args) - x = make_tensor((2,), device, torch.float32) - y = make_tensor((2, 3), device, torch.float32) + x = make_tensor((2,), dtype=torch.float32, device=device) + y = make_tensor((2, 3), dtype=torch.float32, device=device) check('', [], regex=r'at least one operand', exception=ValueError) check('. 
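# --- Hedged sketch (not part of the patch): the einsum sublist format exercised above.
# Operands are interleaved with lists of integer indices, an optional final list names the
# output subscripts, and Ellipsis plays the role of '...'.
import torch

A = torch.randn(3, 5)
B = torch.randn(2, 5)
eq = torch.einsum('ik,jk->ij', A, B)                   # equation form
sub = torch.einsum(A, [0, 2], B, [1, 2], [0, 1])       # same contraction in sublist form
assert torch.allclose(eq, sub)

C = torch.randn(2, 3, 5)
assert torch.allclose(torch.einsum('k...->...', C),
                      torch.einsum(C, [0, Ellipsis], [Ellipsis]))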
..', [x], regex=r'found \'.\' for operand 0 that is not part of any ellipsis') @@ -4405,7 +4269,7 @@ def test_linalg_solve_triangular(self, device, dtype): @onlyCUDA @skipCUDAIfNoMagma # Magma needed for the PLU decomposition @skipCUDAIfRocm # There is a memory access bug in rocBLAS in the (non-batched) solve_triangular - @skipCUDAVersionIn([(11, 3), (11, 5)]) # Tracked in https://github.com/pytorch/pytorch/issues/70111 + @skipCUDAVersionIn([(11, 3), (11, 5), (11, 6)]) # Tracked in https://github.com/pytorch/pytorch/issues/70111 @dtypes(*floating_and_complex_types()) @precisionOverride({torch.float32: 1e-2, torch.complex64: 1e-2, torch.float64: 1e-8, torch.complex128: 1e-8}) @@ -4643,98 +4507,86 @@ def test_triangular_solve_out_errors_and_warnings(self, device, dtype): self.assertTrue("An output with one or more elements was resized" in str(w[0].message)) self.assertTrue("An output with one or more elements was resized" in str(w[1].message)) - def check_single_matmul(self, x, y, shape): - a = np.array(x, copy=False) - b = np.array(y, copy=False) - expected = np.matmul(a, b) + def check_single_matmul(self, x, y): + + def assertEqual(answer, expected): + if x.dtype.is_floating_point or x.dtype.is_complex: + k = max(x.shape[-1], 1) # Scale the atol with the size of the matrix + self.assertEqual(answer, expected, + msg=f"{x.shape} x {y.shape} = {answer.shape}", + atol=k * 5e-5, + rtol=1e-4) + else: + self.assertEqual(answer, expected, msg=f"{x.shape} x {y.shape} = {answer.shape}") + + # test x @ y + expected = np.matmul(x.cpu(), y.cpu()) ans = torch.matmul(x, y) self.assertTrue(ans.is_contiguous()) - self.assertTrue(np.array_equal(ans, expected)) + assertEqual(ans, expected) - out = torch.zeros(*shape, dtype=torch.int64).to(x.device) + # test out + out = torch.empty_like(ans) ans = torch.matmul(x, y, out=out) self.assertIs(ans, out) self.assertTrue(ans.is_contiguous()) - self.assertTrue(np.array_equal(ans, expected)) + assertEqual(ans, expected) - # TODO: update to run on CUDA, too - @onlyCPU - def test_matmul_small_brute_force_1d_Nd(self, device): - # Issue #20452: range(0, 10) does not work. - n = 1 - for m in range(1, 8): - for p in range(1, 8): - for o in range(1, 5): - # 1d, 3d, inner dimensions C - x = torch.arange(m, device=device) - y = torch.arange(o * m * p, device=device).reshape(o, m, p) - self.check_single_matmul(x, y, (o, n, p)) - - # 1d, 3d, inner dimensions Fortran - x = torch.arange(m, device=device) - y = torch.arange(o * p * m, device=device).reshape(o, p, m).mT - self.check_single_matmul(x, y, (o, n, p)) - - # 1d, 3d, inner dimensions non-contiguous - x = torch.arange(2 * m, device=device)[::2] - y = torch.arange(o * m * 2 * p, device=device).reshape(o, m, 2 * p)[:, :, ::2] - self.check_single_matmul(x, y, (o, n, p)) - - for r in range(1, 5): - # 1d, 4d, inner dimensions C - x = torch.arange(m) - y = torch.arange(r * o * m * p, device=device).reshape(r, o, m, p) - self.check_single_matmul(x, y, (r, o, n, p)) - - # 1d, 4d, inner dimensions Fortran - x = torch.arange(m) - y = torch.arange(r * o * p * m, device=device).reshape(r, o, p, m).mT - self.check_single_matmul(x, y, (r, o, n, p)) - - # 1d, 4d, inner dimensions non-contiguous - x = torch.arange(2 * m, device=device)[::2] - y = torch.arange(r * o * m * 2 * p, device=device).reshape(r, o, m, 2 * p)[:, :, :, ::2] - self.check_single_matmul(x, y, (r, o, n, p)) - - # TODO: update to run on CUDA, too - @onlyCPU - def test_matmul_small_brute_force_2d_Nd(self, device): - # Issue #20452: range(0, 10) does not work. 
- for n in range(1, 5): - for m in range(1, 5): - for p in range(1, 5): - for o in range(1, 3): - # 2d, 3d, inner dimensions C - x = torch.arange(n * m, device=device).reshape(n, m) - y = torch.arange(o * m * p, device=device).reshape(o, m, p) - self.check_single_matmul(x, y, (o, n, p)) - - # 2d, 3d, inner dimensions Fortran - x = torch.arange(m * n, device=device).reshape(m, n).mT - y = torch.arange(o * p * m, device=device).reshape(o, p, m).mT - self.check_single_matmul(x, y, (o, n, p)) - - # 2d, 3d, inner dimensions non-contiguous - x = torch.arange(n * 2 * m, device=device).reshape(n, 2 * m)[:, ::2] - y = torch.arange(o * m * 2 * p, device=device).reshape(o, m, 2 * p)[:, :, ::2] - self.check_single_matmul(x, y, (o, n, p)) - - for r in range(1, 2): - # 2d, 4d, inner dimensions C - x = torch.arange(n * m, device=device).reshape(n, m) - y = torch.arange(r * o * m * p, device=device).reshape(r, o, m, p) - self.check_single_matmul(x, y, (r, o, n, p)) - - # 2d, 4d, inner dimensions Fortran - x = torch.arange(m * n, device=device).reshape(m, n).mT - y = torch.arange(r * o * p * m, device=device).reshape(r, o, p, m).mT - self.check_single_matmul(x, y, (r, o, n, p)) - - # 2d, 4d, inner dimensions non-contiguous - x = torch.arange(n * 2 * m, device=device).reshape(n, 2 * m)[:, ::2] - y = torch.arange(r * o * m * 2 * p, device=device).reshape(r, o, m, 2 * p)[:, :, :, ::2] - self.check_single_matmul(x, y, (r, o, n, p)) + def gen_sizes_matmul(self, x_dim, y_dim=4, matrix_size=4, batch_size=3): + """ + Generates sequences of tuples (x, y) of with size(x) = x_dim and + size(y) <= y_dim that are compatible wrt. matmul + """ + assert x_dim >= 1 + assert y_dim >= 2 + x = x_dim + for y in range(1, y_dim + 1): + for batch, mn in product(product(range(batch_size), repeat=max(x - 2, y - 2, 0)), + product(range(matrix_size), repeat=min(y, 2))): + if x == 1: + size_x = mn[:1] + size_y = batch + mn + yield size_x, size_y + else: + for k in range(matrix_size): + size_x = (k,) + mn[:1] + if x > 2: + size_x = batch[-(x - 2):] + size_x + size_y = mn + if y > 2: + size_y = batch[-(y - 2):] + size_y + yield size_x, size_y + + @dtypesIfCUDA(torch.float, torch.complex64) # Integer matmul just supported on CPU + @dtypes(torch.int64, torch.float, torch.complex64) + def test_matmul_small_brute_force_1d_Nd(self, device, dtype): + make_arg = partial(make_tensor, device=device, dtype=dtype) + + for (size_x, size_y), nctg_x, nctg_y in product(self.gen_sizes_matmul(1), (True, False), (True, False)): + x = make_arg(size_x, noncontiguous=nctg_x) + y = make_arg(size_y, noncontiguous=nctg_y) + self.check_single_matmul(x, y) + + @dtypesIfCUDA(torch.float, torch.complex64) # Integer matmul just supported on CPU + @dtypes(torch.int64, torch.float, torch.complex64) + def test_matmul_small_brute_force_2d_Nd(self, device, dtype): + make_arg = partial(make_tensor, device=device, dtype=dtype) + + for (size_x, size_y), nctg_x, nctg_y in product(self.gen_sizes_matmul(2), (True, False), (True, False)): + x = make_arg(size_x, noncontiguous=nctg_x) + y = make_arg(size_y, noncontiguous=nctg_y) + self.check_single_matmul(x, y) + + @dtypesIfCUDA(torch.float, torch.complex64) # Integer matmul just supported on CPU + @dtypes(torch.int64, torch.float, torch.complex64) + def test_matmul_small_brute_force_3d_Nd(self, device, dtype): + make_arg = partial(make_tensor, device=device, dtype=dtype) + + for (size_x, size_y), nctg_x, nctg_y in product(self.gen_sizes_matmul(3), (True, False), (True, False)): + x = make_arg(size_x, noncontiguous=nctg_x) + y = 
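# --- Hedged sketch (not part of the patch): the matmul shape rules the brute-force
# generators above sweep. 1-D operands get a unit dimension added for the product and
# squeezed away afterwards, and batch dimensions broadcast.
import torch
import numpy as np

x = torch.arange(4, dtype=torch.float64)        # 1-D, shape (4,)
y = torch.randn(3, 4, 5, dtype=torch.float64)   # batched matrix
out = torch.matmul(x, y)                        # x acts as (1, 4); the unit dim is squeezed out
assert out.shape == (3, 5)
np.testing.assert_allclose(out.numpy(), np.matmul(x.numpy(), y.numpy()))

A = torch.randn(2, 1, 3, 4)
B = torch.randn(5, 4, 6)
assert torch.matmul(A, B).shape == (2, 5, 3, 6)   # batch dims (2, 1) and (5,) broadcast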
make_arg(size_y, noncontiguous=nctg_y) + self.check_single_matmul(x, y) def test_linear_algebra_scalar_raises(self, device) -> None: m = torch.randn(5, 5, device=device) @@ -5050,9 +4902,40 @@ def call_torch_fn(*args, **kwargs): A_LU, pivots = fn(torch.lu, (2, 0, 0)) self.assertEqual([(2, 0, 0), (2, 0)], [A_LU.shape, pivots.shape]) - @dtypesIfCUDA(torch.cfloat, torch.cdouble, - *get_all_fp_dtypes(include_half=not CUDA9, include_bfloat16=(CUDA11OrLater and SM53OrLater))) - @dtypes(*(set(get_all_dtypes()) - {torch.half, torch.bool})) + @precisionOverride({torch.double: 1e-8, torch.float: 1e-4, torch.bfloat16: 0.6, + torch.half: 1e-1, torch.cfloat: 1e-4, torch.cdouble: 1e-8}) + @dtypesIfCUDA(*floating_and_complex_types_and( + *[torch.half] if not CUDA9 else [], + *[torch.bfloat16] if CUDA11OrLater and SM53OrLater else [] + )) + @dtypes(*all_types_and_complex_and(torch.bfloat16)) + def test_corner_cases_of_cublasltmatmul(self, device, dtype): + # common case + M = torch.randn(128, device=device).to(dtype) + m1 = torch.randn(2048, 2400, device=device).to(dtype) + m2 = torch.randn(128, 2400, device=device).to(dtype) + torch.nn.functional.linear(m1, m2, M) + # Ntrans_B has ld >> rows + m1 = torch.rand([128, 2400]).to(dtype).to(device).t() + m2 = torch.rand([2048, 25272]).to(dtype).to(device).t()[21940:24340] + M = torch.rand([128]).to(dtype).to(device) + torch.addmm(M, m2.t(), m1) + # trans_A has ld >> rows + m1 = torch.rand([128, 25272]).to(dtype).to(device)[:, 21940:24340].t() + m2 = torch.randn(2048, 2400, device=device).to(dtype) + M = torch.rand([128]).to(dtype).to(device) + torch.addmm(M, m2, m1) + # large tensor dim > 65535 + M = torch.randn(16, device=device).to(dtype) + m1 = torch.randn(32, 131071 , device=device).to(dtype) + m2 = torch.randn(16, 131071, device=device).to(dtype) + torch.nn.functional.linear(m1, m2, M) + + @dtypesIfCUDA(*floating_and_complex_types_and( + *[torch.half] if not CUDA9 else [], + *[torch.bfloat16] if CUDA11OrLater and SM53OrLater else [] + )) + @dtypes(*all_types_and_complex_and(torch.bfloat16)) def test_blas_alpha_beta_empty(self, device, dtype): # This test is disabled on CUDA 9 due to: # See: https://github.com/pytorch/pytorch/issues/31006 @@ -5088,7 +4971,7 @@ def test_blas_alpha_beta_empty(self, device, dtype): self.assertEqual(torch.full((2, 3), beta * value, dtype=dtype, device=device), torch.addmm(input=input, mat1=mat, mat2=mat2, alpha=alpha, beta=beta, out=out)) - @dtypes(*(get_all_complex_dtypes() + get_all_fp_dtypes())) + @dtypes(*floating_and_complex_types_and(torch.half, torch.bfloat16)) def test_blas_nan_out(self, device, dtype): # These functions should work correctly with NaN filled outputs, # but need special handling, see [NOTE: cpu_zero] @@ -5249,8 +5132,11 @@ def test_householder_product_errors_and_warnings(self, device): @skipCUDAIfNoMagmaAndNoCusolver @skipCPUIfNoLapack @dtypes(*floating_and_complex_types()) - def test_linalg_lu_factor_and_lu(self, device, dtype): - # Tests lu, linalg.lu_factor and linalg.lu_factor_ex + def test_linalg_lu_factor_and_lu_and_lu_unpack(self, device, dtype): + # Tests torch.lu + # torch.linalg.lu_factor + # torch.linalg.lu_factor_ex + # torch.lu_unpack from torch.testing._internal.common_utils import random_matrix def run_test(A, pivot, singular, fn): @@ -5273,9 +5159,14 @@ def run_test(A, pivot, singular, fn): if not pivot: self.assertEqual(pivots, torch.arange(1, 1 + k, device=device, dtype=torch.int32).expand(batch + (k, ))) - P, L, U = torch.lu_unpack(LU, pivots) + P, L, U = torch.lu_unpack(LU, pivots, 
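# --- Hedged sketch (not part of the patch): the linear/addmm correspondence the cuBLASLt
# corner cases above go through: linear(x, W, b) = x @ W.T + b = addmm(b, x, W.t()).
import torch

x = torch.randn(8, 16)     # (batch, in_features)
W = torch.randn(4, 16)     # (out_features, in_features)
b = torch.randn(4)

out_linear = torch.nn.functional.linear(x, W, b)
out_addmm = torch.addmm(b, x, W.t())       # the 1-D bias broadcasts against the (8, 4) product
assert out_linear.shape == (8, 4)
assert torch.allclose(out_linear, out_addmm, atol=1e-5)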
unpack_pivots=pivot) - self.assertEqual(P @ L @ U, A) + self.assertEqual(P @ L @ U if pivot else L @ U, A) + + PLU = torch.linalg.lu(A, pivot=pivot) + self.assertEqual(P, PLU.P) + self.assertEqual(L, PLU.L) + self.assertEqual(U, PLU.U) sizes = ((3, 3), (5, 5), (4, 2), (3, 4), (0, 0), (0, 1), (1, 0)) batches = ((0,), (2,), (3,), (1, 0), (3, 5)) @@ -5306,39 +5197,10 @@ def run_test(A, pivot, singular, fn): if self.device_type == 'cpu': # Error checking, no pivoting variant on CPU - with self.assertRaisesRegex(RuntimeError, 'LU without pivoting is not implemented on the CPU'): - torch.lu(torch.empty(1, 2, 2), pivot=False) - - with self.assertRaisesRegex(RuntimeError, 'LU without pivoting is not implemented on the CPU'): - torch.linalg.lu_factor(torch.empty(1, 2, 2), pivot=False) - - @skipCPUIfNoLapack - @skipCUDAIfNoMagma - @dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble) - @skipCUDAIfRocm - @precisionOverride({torch.float: 1e-3}) - def test_lu_unpack(self, device, dtype): - def run_test(pivot): - for shape in ((3, 3), (5, 3, 3), (7, 3, 5, 5), (7, 5, 3, 3, 3)): - a = torch.randn(*shape, dtype=dtype, device=device) - a_lu, p = torch.lu(a, pivot=pivot) - p_ref, l_ref, u_ref = torch.lu_unpack(a_lu, p) - self.assertEqual(p_ref.matmul(l_ref.matmul(u_ref)), a) - for shape in ((3, 3), (5, 3, 3), (7, 3, 5, 5), (7, 5, 3, 3, 3), - (3, 5), (5, 3), (3, 3, 5), (3, 5, 3), - (7, 5, 3, 5, 3), (7, 5, 3, 3, 5), - # empty tensors - (0, 0), (0, 0, 0), (0, 3, 3) - ): - a = make_tensor(shape, dtype=dtype, device=device, low=-0.1, high=+0.1) - a_lu, p = torch.lu(a, pivot=pivot) - p_ref, l_ref, u_ref = torch.lu_unpack(a_lu, p) - self.assertEqual(p_ref.matmul(l_ref.matmul(u_ref)), a) - - run_test(True) - - if self.device_type == 'cuda': - run_test(False) + fns = [torch.lu, torch.linalg.lu_factor, torch.linalg.lu_factor_ex, torch.linalg.lu] + for f in fns: + with self.assertRaisesRegex(RuntimeError, 'LU without pivoting is not implemented on the CPU'): + f(torch.empty(1, 2, 2), pivot=False) @skipCPUIfNoLapack @skipCUDAIfNoMagma @@ -5349,21 +5211,18 @@ def test_lu_unpack_check_input(self, device, dtype): with self.assertRaisesRegex(RuntimeError, "torch.int32 dtype"): torch.lu_unpack(lu_data, lu_pivots.long()) - with self.assertRaisesRegex(RuntimeError, "contiguous tensor"): - torch.lu_unpack(lu_data, lu_pivots.mT) # check that onces flags are unset, Nones are returned p, l, u = torch.lu_unpack(lu_data, lu_pivots, unpack_data=False) - self.assertTrue((l == u) and l is None) + self.assertTrue(l.numel() == 0 and u.numel() == 0) p, l, u = torch.lu_unpack(lu_data, lu_pivots, unpack_pivots=False) - self.assertTrue(p is None) + self.assertTrue(p.numel() == 0) p, l, u = torch.lu_unpack(lu_data, lu_pivots, unpack_data=False, unpack_pivots=False) - self.assertTrue((p == l == u) and p is None) + self.assertTrue(p.numel() == 0 and l.numel() == 0 and u.numel() == 0) @skipCUDAIfNoMagma @skipCPUIfNoLapack @dtypes(torch.double) - @skipCUDAIfRocm def test_lobpcg_basic(self, device, dtype): self._test_lobpcg_method(device, dtype, 'basic') @@ -5674,7 +5533,7 @@ def tracker(worker): ---(input size: {:4}, eigenpairs:{:2}, units: relative error, maxiter={:4})--- '''.format(tol, eq_err, eq_err_general, iters1, eq_err_scipy, eq_err_general_scipy, iters2, m, k, niter)) - def _test_addmm_addmv(self, f, t, m, v, *, alpha=None, beta=None, transpose_out=False): + def _test_addmm_addmv(self, f, t, m, v, *, alpha=None, beta=None, transpose_out=False, activation=None): dtype = t.dtype numpy_dtype = dtype if dtype in {torch.bfloat16}: 
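# --- Hedged sketch (not part of the patch): the factorisation round trip the combined test
# above checks. Assumption: torch.linalg.lu is available (it is new at the time of this
# patch); lu_factor and lu_unpack are the stable pieces.
import torch

A = torch.randn(5, 5, dtype=torch.float64)

LU, pivots = torch.linalg.lu_factor(A)      # packed factors + pivots
P, L, U = torch.lu_unpack(LU, pivots)
assert torch.allclose(P @ L @ U, A)

PLU = torch.linalg.lu(A)                    # named tuple with fields P, L, U
assert torch.allclose(PLU.P, P)
assert torch.allclose(PLU.L, L) and torch.allclose(PLU.U, U)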
@@ -5693,15 +5552,19 @@ def _test_addmm_addmv(self, f, t, m, v, *, alpha=None, beta=None, transpose_out= res3 = alpha * (m.to(numpy_dtype).cpu().numpy() @ v.to(numpy_dtype).cpu().numpy()) if beta != 0: res3 += (beta * t).to(numpy_dtype).cpu().numpy() + if activation == "relu": + res3 = res3 * (res3 > 0) + else: + assert activation is None, f"unsupported activation {activation}" res3 = torch.from_numpy(res3).to(dtype) self.assertEqual(res1, res2) self.assertEqual(res1, res3) @precisionOverride({torch.bfloat16: 1e-0, torch.half: 5e-4, torch.float: 1e-4, torch.double: 1e-8, torch.cfloat: 1e-4, torch.cdouble: 1e-8}) - @dtypesIfCUDA(*get_all_complex_dtypes(), - *get_all_fp_dtypes(include_bfloat16=(TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater)), - include_half=(not TEST_WITH_ROCM))) + @dtypesIfCUDA(*floating_and_complex_types_and( + *[torch.bfloat16] if TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater) else [], + *[torch.half])) @dtypes(torch.bfloat16, torch.float, torch.double, torch.cfloat, torch.cdouble) def test_addmv(self, device, dtype): # have to use torch.randn(...).to(bfloat16) instead of @@ -5736,7 +5599,8 @@ def test_addmv(self, device, dtype): for m, v in itertools.product(ms, vs): self._test_addmm_addmv(torch.addmv, t, m, v, beta=0) - @dtypesIfCUDA(*get_all_fp_dtypes(include_bfloat16=(TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater)))) + @dtypesIfCUDA(*floating_types_and(*[torch.bfloat16] if TEST_WITH_ROCM or (CUDA11OrLater and + SM53OrLater) else [])) @dtypes(torch.float, torch.double) def test_addmv_rowmajor_colmajor_incx_incy_lda(self, device, dtype): # tests (o, s)*(s). o is output size, s is summed size. @@ -5765,29 +5629,23 @@ def _test(row_major, incx, incy, lda_tail): for row_major, incx, incy, lda_tail in itertools.product((False, True), (1, 2), (1, 2), (0, 1)): _test(row_major, incx, incy, lda_tail) - @precisionOverride({torch.double: 1e-8, torch.float: 1e-4, torch.bfloat16: 0.6, - torch.half: 1e-1, torch.cfloat: 1e-4, torch.cdouble: 1e-8}) - @dtypesIfCUDA(*get_all_complex_dtypes(), - *get_all_fp_dtypes(include_bfloat16=(TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater)))) - @dtypes(*get_all_complex_dtypes(), *get_all_fp_dtypes()) - @tf32_on_and_off(0.05) - def test_addmm(self, device, dtype): + def _test_addmm_impl(self, func, activation, device, dtype): M = torch.randn(10, 25, device=device).to(dtype) m1 = torch.randn(10, 50, device=device).to(dtype) m2 = torch.randn(50, 25, device=device).to(dtype) - self._test_addmm_addmv(torch.addmm, M, m1, m2) + self._test_addmm_addmv(func, M, m1, m2, activation=activation) # Test 0-strided M = torch.randn(10, 1, device=device).to(dtype).expand(10, 25) m1 = torch.randn(10, 1, device=device).to(dtype).expand(10, 50) m2 = torch.randn(50, 25, device=device).to(dtype) - self._test_addmm_addmv(torch.addmm, M, m1, m2) + self._test_addmm_addmv(func, M, m1, m2, activation=activation) # Test beta=0, M=nan M = torch.full((10, 25), math.nan, device=device).to(dtype) m1 = torch.randn(10, 50, device=device).to(dtype) m2 = torch.randn(50, 25, device=device).to(dtype) - self._test_addmm_addmv(torch.addmm, M, m1, m2, beta=0) + self._test_addmm_addmv(func, M, m1, m2, beta=0, activation=activation) # Test transpose for t1, t2, t3, t4 in itertools.product([True, False], repeat=4): @@ -5799,10 +5657,29 @@ def maybe_transpose(cond, m): M = maybe_transpose(t1, torch.randn(10, 25, device=device).to(dtype)) m1 = maybe_transpose(t2, torch.randn(10, 50, device=device).to(dtype)) m2 = maybe_transpose(t3, torch.randn(50, 25, device=device).to(dtype)) - 
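# --- Hedged sketch (not part of the patch): what the new activation branch verifies. The
# fused op torch._addmm_activation(M, m1, m2) is compared against relu(M + m1 @ m2).
# Assumption: _addmm_activation is a private operator whose default activation is ReLU here;
# its signature and availability may change between releases.
import torch

M = torch.randn(10, 25)
m1 = torch.randn(10, 50)
m2 = torch.randn(50, 25)

ref = torch.relu(torch.addmm(M, m1, m2))        # beta = alpha = 1
fused = torch._addmm_activation(M, m1, m2)
assert torch.allclose(ref, fused, atol=1e-4)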
self._test_addmm_addmv(torch.addmm, M, m1, m2, transpose_out=t4) + self._test_addmm_addmv(func, M, m1, m2, transpose_out=t4, activation=activation) + + @precisionOverride({torch.double: 1e-8, torch.float: 1e-4, torch.bfloat16: 0.6, + torch.half: 1e-1, torch.cfloat: 1e-4, torch.cdouble: 1e-8}) + @dtypesIfMPS(torch.float32) + @dtypesIfCUDA(*floating_and_complex_types_and( + *[torch.bfloat16] if TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater) else [])) + @dtypes(*floating_and_complex_types_and(torch.bfloat16)) + @tf32_on_and_off(0.05) + def test_addmm(self, device, dtype): + self._test_addmm_impl(torch.addmm, None, device, dtype) + + @precisionOverride({torch.double: 1e-8, torch.float: 1e-4, torch.bfloat16: 0.6, + torch.half: 1e-1, torch.cfloat: 1e-4, torch.cdouble: 1e-8}) + @dtypesIfCUDA(*floating_types_and( + *[torch.bfloat16] if TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater) else [])) + @dtypes(*floating_types_and(torch.bfloat16)) + @tf32_on_and_off(0.05) + def test_addmm_activation(self, device, dtype): + self._test_addmm_impl(torch._addmm_activation, "relu", device, dtype) @dtypes(torch.float, torch.double) - @dtypesIfCUDA(*([torch.float, torch.double] + get_all_complex_dtypes())) + @dtypesIfCUDA(*floating_and_complex_types()) @tf32_on_and_off(0.005) def test_addmm_sizes(self, device, dtype): for m in [0, 1, 25]: @@ -5855,7 +5732,8 @@ def test_matmul_45724(self, device): @slowTest @onlyNativeDeviceTypes - @dtypes(torch.float32, torch.float64, torch.bfloat16, torch.int32, torch.int64, torch.cfloat, torch.cdouble) + # bfloat16 doesn't have sufficient precision to pass this test + @dtypes(torch.float32, torch.float64, torch.int32, torch.int64, torch.cfloat, torch.cdouble) @dtypesIfCUDA(torch.float32, torch.float64, torch.cfloat, torch.cdouble) @tf32_on_and_off(0.01) def test_mm(self, device, dtype): @@ -5998,9 +5876,8 @@ def test_strided_mm_bmm(self, device, dtype): self.compare_with_numpy(torch_fn, np_fn, sx[0]) @precisionOverride({torch.half: 0.05, torch.bfloat16: 0.05}) - @skipCUDAIf(torch.version.cuda == "10.1", "flaky on CUDA 10.1") @onlyNativeDeviceTypes - @dtypes(*get_all_fp_dtypes(), *get_all_complex_dtypes()) + @dtypes(*floating_and_complex_types_and(torch.bfloat16)) @tf32_on_and_off(0.05) def test_bmm(self, device, dtype): if self.device_type == 'cuda' and dtype is torch.bfloat16 and CUDA11OrLater and not SM53OrLater: @@ -6032,8 +5909,8 @@ def invert_perm(p): def generate_inputs(num_batches): # transposed tensors for perm1, perm2 in itertools.product(itertools.permutations((0, 1, 2)), repeat=2): - b1 = make_tensor((num_batches, M, N), device, dtype, low=-0.1, high=0.1) - b2 = make_tensor((num_batches, N, O), device, dtype, low=-0.1, high=0.1) + b1 = make_tensor((num_batches, M, N), dtype=dtype, device=device, low=-0.1, high=0.1) + b2 = make_tensor((num_batches, N, O), dtype=dtype, device=device, low=-0.1, high=0.1) b1 = b1.permute(perm1).contiguous().permute(invert_perm(perm1)) b2 = b2.permute(perm2).contiguous().permute(invert_perm(perm2)) yield b1, b2 @@ -6041,8 +5918,8 @@ def generate_inputs(num_batches): for b1, b2, b3, b4, b5, b6 in itertools.product((True, False), repeat=6): shape1 = (num_batches if b1 else 1, M if b2 else 1, N if b3 else 1) shape2 = (num_batches if b4 else 1, N if b5 else 1, O if b6 else 1) - b1 = make_tensor(shape1, device, dtype, low=-0.1, high=0.1).expand(num_batches, M, N) - b2 = make_tensor(shape2, device, dtype, low=-0.1, high=0.1).expand(num_batches, N, O) + b1 = make_tensor(shape1, dtype=dtype, device=device, low=-0.1, 
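# --- Hedged sketch (not part of the patch): the kind of inputs generate_inputs builds for
# test_bmm. bmm itself does not broadcast, so "broadcast" cases are created by expanding
# size-1 dims first, giving stride-0, non-contiguous operands.
import torch

num_batches, M, N, O = 3, 2, 4, 5
b1 = torch.randn(1, M, 1).expand(num_batches, M, N)    # expanded, stride-0 operand
b2 = torch.randn(num_batches, N, O)
out = torch.bmm(b1, b2)
assert out.shape == (num_batches, M, O)
ref = torch.stack([b1[i] @ b2[i] for i in range(num_batches)])
assert torch.allclose(out, ref, atol=1e-5)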
high=0.1).expand(num_batches, M, N) + b2 = make_tensor(shape2, dtype=dtype, device=device, low=-0.1, high=0.1).expand(num_batches, N, O) yield b1, b2 # zero-sized tensors for z1, z2, z3, z4 in itertools.product((True, False), repeat=4): @@ -6112,7 +5989,7 @@ def _test_addbmm_baddbmm(self, func, b1, b2, ref, out_tensor): @precisionOverride({torch.half: 0.05, torch.bfloat16: 0.05}) @onlyNativeDeviceTypes - @dtypes(*get_all_fp_dtypes(), *get_all_complex_dtypes()) + @dtypes(*floating_and_complex_types_and(torch.bfloat16)) @tf32_on_and_off(0.05) def test_addbmm(self, device, dtype): if self.device_type == 'cuda' and dtype is torch.bfloat16 and CUDA11OrLater and not SM53OrLater: @@ -6132,9 +6009,9 @@ def test_addbmm(self, device, dtype): is_supported = TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater) if not is_supported: - b1 = make_tensor((num_batches, M, N), device, dtype, low=-1, high=1) - b2 = make_tensor((num_batches, N, O), device, dtype, low=-1, high=1) - t = make_tensor((M, O), device, dtype, low=-1, high=1) + b1 = make_tensor((num_batches, M, N), dtype=dtype, device=device, low=-1, high=1) + b2 = make_tensor((num_batches, N, O), dtype=dtype, device=device, low=-1, high=1) + t = make_tensor((M, O), dtype=dtype, device=device, low=-1, high=1) self.assertRaisesRegex(RuntimeError, "type|Type|not implemented|CUBLAS_STATUS_NOT_SUPPORTED", lambda: torch.addbmm(t, b1, b2)) return @@ -6148,8 +6025,8 @@ def generate_tensor(): # transposed tensors for perm1, perm2 in itertools.product(itertools.permutations((0, 1, 2)), repeat=2): for perm3 in itertools.permutations((0, 1)): - b1 = make_tensor((num_batches, M, N), device, dtype, low=-1, high=1) * 0.1 - b2 = make_tensor((num_batches, N, O), device, dtype, low=-1, high=1) * 0.1 + b1 = make_tensor((num_batches, M, N), dtype=dtype, device=device, low=-1, high=1) * 0.1 + b2 = make_tensor((num_batches, N, O), dtype=dtype, device=device, low=-1, high=1) * 0.1 b1 = b1.permute(perm1).contiguous().permute(invert_perm(perm1)) b2 = b2.permute(perm2).contiguous().permute(invert_perm(perm2)) ref = torch.from_numpy( @@ -6161,8 +6038,8 @@ def generate_tensor(): for s1, s2, s3, s4, s5, s6 in itertools.product((True, False), repeat=6): shape1 = (num_batches if s1 else 1, M if s2 else 1, N if s3 else 1) shape2 = (num_batches if s4 else 1, N if s5 else 1, O if s6 else 1) - b1 = make_tensor(shape1, device, dtype, low=-1, high=1).expand(num_batches, M, N) * 0.1 - b2 = make_tensor(shape2, device, dtype, low=-1, high=1).expand(num_batches, N, O) * 0.1 + b1 = make_tensor(shape1, dtype=dtype, device=device, low=-1, high=1).expand(num_batches, M, N) * 0.1 + b2 = make_tensor(shape2, dtype=dtype, device=device, low=-1, high=1).expand(num_batches, N, O) * 0.1 ref = torch.from_numpy( b1.to(numpy_dtype).cpu().numpy() @ b2.to(numpy_dtype).cpu().numpy() ).to(device=device, dtype=dtype).sum(0) @@ -6172,8 +6049,8 @@ def generate_tensor(): for z1, z2, z3, z4 in itertools.product((True, False), repeat=4): shape1 = (num_batches if z1 else 0, M if z2 else 0, N if z3 else 0) shape2 = (num_batches if z1 else 0, N if z3 else 0, O if z4 else 0) - b1 = make_tensor(shape1, device, dtype, low=-1, high=1) * 0.1 - b2 = make_tensor(shape2, device, dtype, low=-1, high=1) * 0.1 + b1 = make_tensor(shape1, dtype=dtype, device=device, low=-1, high=1) * 0.1 + b2 = make_tensor(shape2, dtype=dtype, device=device, low=-1, high=1) * 0.1 ref = torch.from_numpy( b1.to(numpy_dtype).cpu().numpy() @ b2.to(numpy_dtype).cpu().numpy() ).to(device=device, dtype=dtype).sum(0) @@ -6185,7 +6062,7 @@ def 
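# --- Hedged sketch (not part of the patch): the reference that test_addbmm compares against.
# addbmm reduces the whole batch of products into a single matrix:
#   addbmm(t, b1, b2) = beta * t + alpha * sum_i b1[i] @ b2[i]
import torch

num_batches, M, N, O = 3, 4, 5, 6
t = torch.randn(M, O)
b1 = torch.randn(num_batches, M, N)
b2 = torch.randn(num_batches, N, O)

ref = t + (b1 @ b2).sum(dim=0)                 # beta = alpha = 1
assert torch.allclose(torch.addbmm(t, b1, b2), ref, atol=1e-5)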
generate_tensor(): @precisionOverride({torch.half: 0.1, torch.bfloat16: 0.5}) @onlyNativeDeviceTypes - @dtypes(*get_all_fp_dtypes(), *get_all_complex_dtypes()) + @dtypes(*floating_and_complex_types_and(torch.bfloat16)) @tf32_on_and_off(0.05) def test_baddbmm(self, device, dtype): if self.device_type == 'cuda' and dtype is torch.bfloat16 and CUDA11OrLater and not SM53OrLater: @@ -6202,9 +6079,9 @@ def test_baddbmm(self, device, dtype): is_supported = TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater) if not is_supported: - b1 = make_tensor((num_batches, M, N), device, dtype, low=-1, high=1) - b2 = make_tensor((num_batches, N, O), device, dtype, low=-1, high=1) - t = make_tensor((num_batches, M, O), device, dtype, low=-1, high=1) + b1 = make_tensor((num_batches, M, N), dtype=dtype, device=device, low=-1, high=1) + b2 = make_tensor((num_batches, N, O), dtype=dtype, device=device, low=-1, high=1) + t = make_tensor((num_batches, M, O), dtype=dtype, device=device, low=-1, high=1) self.assertRaisesRegex(RuntimeError, "type|Type|not implemented|CUBLAS_STATUS_NOT_SUPPORTED", lambda: torch.baddbmm(t, b1, b2)) return @@ -6217,8 +6094,8 @@ def generate_tensor(): numpy_dtype = dtype if dtype != torch.bfloat16 else torch.float32 # transposed tensors for perm1, perm2, perm3 in itertools.product(itertools.permutations((0, 1, 2)), repeat=3): - b1 = make_tensor((num_batches, M, N), device, dtype, low=-1, high=1) - b2 = make_tensor((num_batches, N, O), device, dtype, low=-1, high=1) + b1 = make_tensor((num_batches, M, N), dtype=dtype, device=device, low=-1, high=1) + b2 = make_tensor((num_batches, N, O), dtype=dtype, device=device, low=-1, high=1) b1 = b1.permute(perm1).contiguous().permute(invert_perm(perm1)) b2 = b2.permute(perm2).contiguous().permute(invert_perm(perm2)) ref = torch.from_numpy( @@ -6230,8 +6107,8 @@ def generate_tensor(): for s1, s2, s3, s4, s5, s6 in itertools.product((True, False), repeat=6): shape1 = (num_batches if s1 else 1, M if s2 else 1, N if s3 else 1) shape2 = (num_batches if s4 else 1, N if s5 else 1, O if s6 else 1) - b1 = make_tensor(shape1, device, dtype, low=-1, high=1).expand(num_batches, M, N) - b2 = make_tensor(shape2, device, dtype, low=-1, high=1).expand(num_batches, N, O) + b1 = make_tensor(shape1, dtype=dtype, device=device, low=-1, high=1).expand(num_batches, M, N) + b2 = make_tensor(shape2, dtype=dtype, device=device, low=-1, high=1).expand(num_batches, N, O) ref = torch.from_numpy( b1.to(numpy_dtype).cpu().numpy() @ b2.to(numpy_dtype).cpu().numpy()).to(device=device, dtype=dtype) out_tensor = torch.zeros_like(ref) @@ -6240,8 +6117,8 @@ def generate_tensor(): for z1, z2, z3, z4 in itertools.product((True, False), repeat=4): shape1 = (num_batches if z1 else 0, M if z2 else 0, N if z3 else 0) shape2 = (num_batches if z1 else 0, N if z3 else 0, O if z4 else 0) - b1 = make_tensor(shape1, device, dtype, low=-2, high=2) - b2 = make_tensor(shape2, device, dtype, low=-2, high=2) + b1 = make_tensor(shape1, dtype=dtype, device=device, low=-2, high=2) + b2 = make_tensor(shape2, dtype=dtype, device=device, low=-2, high=2) ref = torch.from_numpy( b1.to(numpy_dtype).cpu().numpy() @ b2.to(numpy_dtype).cpu().numpy()).to(device=device, dtype=dtype) out_tensor = torch.zeros_like(ref) @@ -6260,12 +6137,10 @@ def test_solve_methods_arg_device(self, device): b = torch.randn(3, 1, device=b_device) A = torch.randn(3, 3, device=A_device) - # solve and cholesky_solve goes through generic backend dispatch and hit kernel specific device check first + # cholesky_solve goes through generic 
backend dispatch and hit kernel specific device check first # triangular_solve goes through specific backend dispatch (CPU/CUDA) and hit auto-generated device check first generic_backend_dispatch_err_str = "Expected b and A to be on the same device" specific_backend_dispatch_err_str = "Expected all tensors to be on the same device" - with self.assertRaisesRegex(RuntimeError, generic_backend_dispatch_err_str): - torch.solve(b, A) with self.assertRaisesRegex(RuntimeError, generic_backend_dispatch_err_str): torch.cholesky_solve(b, A) @@ -6324,7 +6199,7 @@ def run_test(M): @dtypes(torch.double, torch.cdouble) def test_matrix_power_non_negative(self, device, dtype): def check(*size): - t = make_tensor(size, device, dtype) + t = make_tensor(size, dtype=dtype, device=device) for n in range(8): res = torch.linalg.matrix_power(t, n) ref = np.linalg.matrix_power(t.cpu().numpy(), n) @@ -7271,6 +7146,7 @@ def lu_solve_batch_test_helper(A_dims, b_dims, pivot): if self.device_type == 'cuda': sub_test(False) + @skipCUDAIfRocm # ROCm: test was exceptionally slow, even for slow tests. Skip until triage. @slowTest @skipCUDAIfNoMagma @skipCPUIfNoLapack @@ -7764,6 +7640,104 @@ def test_tensordot(self, device): an = torch.from_numpy(np.tensordot(np.zeros((), dtype=np.float32), np.zeros((), dtype=np.float32), 0)) self.assertEqual(a, an) + @skipCUDAIfNoCusolver + @skipCUDAIfNoMagma + @skipCPUIfNoLapack + @skipCUDAIfRocm + @dtypes(*floating_and_complex_types()) + def test_ldl_factor(self, device, dtype): + from torch.testing._internal.common_utils import random_hermitian_pd_matrix + + def run_test(shape, batch, hermitian): + A = random_hermitian_pd_matrix(shape, *batch, dtype=dtype, device=device) + actual_factors, actual_pivots, info = torch.linalg.ldl_factor_ex(A, hermitian=hermitian) + actual_L = torch.tril(actual_factors, diagonal=-1) + actual_L.diagonal(0, -2, -1).fill_(1.0) + + # This test is designed only for inputs with 1x1 block diagonal matrix D. + # That is for positive definite input matrices, the pivots tensor is always > 0. + # If negative pivots are encountered, it means that the input matrix is not positive definite. + # And matrix D is a 2x2 block diagonal matrix. + self.assertTrue((actual_pivots > 0).all()) + + # Construct a 1x1 block diagonal matrix D from factors. 
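                # Editorial sketch (not executed by this test) of the property described
                # above, using the public torch.linalg.ldl_factor_ex API on a small
                # positive definite matrix: every pivot is a 1x1 block, so D is plain
                # diagonal and L @ D @ L^T recovers A.
                #
                #     A2 = torch.tensor([[4., 2.], [2., 3.]])
                #     LD, piv, _ = torch.linalg.ldl_factor_ex(A2)
                #     L2 = torch.tril(LD, -1)
                #     L2.diagonal().fill_(1.0)               # unit lower-triangular factor
                #     D2 = torch.diag_embed(LD.diagonal())   # 1x1 pivots -> diagonal D
                #     assert (piv > 0).all()
                #     torch.testing.assert_close(L2 @ D2 @ L2.T, A2)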
+ actual_D = torch.diag_embed(actual_factors.diagonal(0, -2, -1)) + + def T(x): + return x.mH if hermitian else x.mT + A_reconstructed = actual_L @ actual_D @ T(actual_L) + + def symmetric(A): + return A.tril() + A.tril(-1).mT + + self.assertEqual(symmetric(A) if not hermitian else A, A_reconstructed) + + # Now test against SciPy implementation + if TEST_SCIPY: + from scipy.linalg import ldl as scipy_ldl + A_np = A.cpu().numpy() + np_dtype = A_np.dtype + scipy_ldl_batched = np.vectorize( + lambda x: scipy_ldl(x, hermitian=hermitian, lower=True), + otypes=[np_dtype, np_dtype, np.dtype('int64')], + signature='(m,m)->(m,m),(m,m),(m)') + + expected = scipy_ldl_batched(A_np) + expected_L, expected_D, expected_pivots = expected + + if expected_pivots.ndim > 1: + permuted_expected_L = np.stack( + [expected_L[i][expected_pivots[i], :] for i in range(expected_pivots.shape[0])] + ) + else: + permuted_expected_L = expected_L[expected_pivots, :] + self.assertEqual(actual_L, permuted_expected_L) + self.assertEqual(actual_D, expected_D) + else: + self.assertEqual(actual_factors.shape, A.shape) + self.assertEqual(actual_pivots.shape, A.shape[:-1]) + self.assertEqual(info.shape, A.shape[:-2]) + + # hermitian=True for complex inputs on CUDA is supported only with MAGMA 2.5.4+ + magma_254_available = self.device_type == 'cuda' and _get_magma_version() >= (2, 5, 4) + hermitians = (True, False) if dtype.is_complex and (self.device_type == 'cpu' or magma_254_available) else (False,) + + shapes = (5,) + batches = ((), (4,),) + for shape, batch, hermitian in itertools.product(shapes, batches, hermitians): + run_test(shape, batch, hermitian) + + @skipCUDAIfNoCusolver + @skipCUDAIfNoMagma + @skipCPUIfNoLapack + @skipCUDAIfRocm + @skipCUDAIf(_get_torch_cuda_version() < (11, 4), "not available before CUDA 11.3.1") + @dtypes(*floating_and_complex_types()) + def test_ldl_solve(self, device, dtype): + from torch.testing._internal.common_utils import random_hermitian_pd_matrix + + def run_test(shape, batch, nrhs, hermitian): + A = random_hermitian_pd_matrix(shape, *batch, dtype=dtype, device=device) + B = make_tensor((*A.shape[:-1], nrhs), dtype=dtype, device=device) + factors, pivots, info = torch.linalg.ldl_factor_ex(A, hermitian=hermitian) + X = torch.linalg.ldl_solve(factors, pivots, B, hermitian=hermitian) + + def symmetric(A): + return A.tril() + A.tril(-1).mT + + # verify A @ X == B + expected_B = symmetric(A) @ X if not hermitian else A @ X + self.assertEqual(B, expected_B) + + # hermitian=True is not supported on CUDA yet + hermitians = (True, False) if dtype.is_complex and self.device_type == 'cpu' else (False,) + + shapes = (5,) + batches = ((), (4,), (2, 2)) + nrhss = (1, 7) + for shape, batch, nrhs, hermitian in itertools.product(shapes, batches, nrhss, hermitians): + run_test(shape, batch, nrhs, hermitian) + @onlyCUDA @skipCUDAIfNoMagma @skipCUDAIfNoCusolver diff --git a/test/test_logging.py b/test/test_logging.py index 4bb057fd157a..01fdd3f8edd8 100644 --- a/test/test_logging.py +++ b/test/test_logging.py @@ -12,10 +12,10 @@ def testApiUsage(self): subprocess """ s = TestCase.runWithPytorchAPIUsageStderr("import torch") - self.assertRegexpMatches(s, "PYTORCH_API_USAGE.*import") + self.assertRegex(s, "PYTORCH_API_USAGE.*import") # import the shared library directly - it triggers static init but doesn't call anything s = TestCase.runWithPytorchAPIUsageStderr("from ctypes import CDLL; CDLL('{}')".format(torch._C.__file__)) - self.assertNotRegexpMatches(s, "PYTORCH_API_USAGE") + self.assertNotRegex(s, 
"PYTORCH_API_USAGE") if __name__ == '__main__': diff --git a/test/test_masked.py b/test/test_masked.py index 24593d156fd2..4b8fab87318f 100644 --- a/test/test_masked.py +++ b/test/test_masked.py @@ -6,13 +6,15 @@ import itertools import torch from typing import List, Any +from functools import wraps +import unittest from torch.testing._internal.common_utils import \ - (TestCase, suppress_warnings) + (TestCase, parametrize, suppress_warnings, _TestParametrizer, run_tests) from torch.testing._internal.common_methods_invocations import \ - (op_db,) + (op_db, SampleInput) from torch.testing._internal.common_device_type import \ - (instantiate_device_type_tests, ops, onlyNativeDeviceTypes) + (instantiate_device_type_tests, ops, onlyNativeDeviceTypes, precisionOverride) def apply_masked_reduction_along_dim(op, input, *args, **kwargs): @@ -111,7 +113,10 @@ def apply_masked_reduction_along_dim(op, input, *args, **kwargs): output = input.new_full(shape, float('nan') if dtype.is_floating_point else 0, dtype=dtype) # apply op to all elementary slices: - inpmask = torch._masked._input_mask(input, mask=mask) + if mask is None: + inpmask = input.new_ones([], dtype=torch.bool).expand(input.shape) + else: + inpmask = torch._masked._input_mask(input, mask=mask) for s in itertools.product(*ranges): # data of an elementary slice is 1D sequence and has only # masked-in elements: @@ -140,7 +145,10 @@ def apply_masked_normalization_along_dim(op, input, *args, **kwargs): dim = args[dim_pos] args0 = args[:dim_pos] + (0,) + args[dim_pos + 1:] output = torch.zeros_like(input, dtype=dtype) - inpmask = torch._masked._input_mask(input, mask=mask) + if mask is None: + inpmask = input.new_ones([], dtype=torch.bool).expand(input.shape) + else: + inpmask = torch._masked._input_mask(input, mask=mask) dim_ = dim % input.ndim left_ranges = tuple(map(range, input.shape[:dim_])) right_ranges = tuple(map(range, input.shape[dim_ + 1:])) @@ -153,6 +161,7 @@ def apply_masked_normalization_along_dim(op, input, *args, **kwargs): reference_functions = dict( norm=lambda *args, **kwargs: apply_masked_reduction_along_dim(torch.linalg.vector_norm, *args, **dict(kwargs, dim_position=1)), var=lambda *args, **kwargs: apply_masked_reduction_along_dim(torch.var, *args, **dict(kwargs, dim_position=0)), + std=lambda *args, **kwargs: apply_masked_reduction_along_dim(torch.std, *args, **dict(kwargs, dim_position=0)), softmax=lambda *args, **kwargs: apply_masked_normalization_along_dim(torch.softmax, *args, **kwargs), log_softmax=lambda *args, **kwargs: apply_masked_normalization_along_dim(torch.log_softmax, *args, **kwargs), softmin=lambda *args, **kwargs: apply_masked_normalization_along_dim(torch.nn.functional.softmin, *args, **kwargs), @@ -162,28 +171,262 @@ def apply_masked_normalization_along_dim(op, input, *args, **kwargs): masked_ops = [op for op in op_db if op.name.startswith('_masked.')] masked_ops_with_references = [op for op in masked_ops if op.name.rsplit('.', 1)[-1] in reference_functions] +masked_ops_with_non_strided_support = [op for op in masked_ops if op.supports_sparse or op.supports_sparse_csr] + + +def _tensor_to_strided(obj): + # after gh-59958 is resolved, replace the usage of this function + # with torch.Tensor.to_dense + if torch.is_tensor(obj): + if obj.layout == torch.strided: + return obj + return obj.to_dense() + return obj + + +def to_strided(obj): + """Convert the tensor content of object to strided tensor content. 
+ """ + return torch.utils._pytree.tree_map(_tensor_to_strided, obj) + + +def to_sparse_coo(obj): + """Convert the tensor content of object to sparse coo tensor content. + """ + return torch.utils._pytree.tree_map(torch.Tensor.to_sparse, obj) + + +def to_sparse_csr(obj): + """Convert the tensor content of object to sparse csr tensor content. + """ + return torch.utils._pytree.tree_map(torch.Tensor.to_sparse_csr, obj) + + +class mask_layouts(_TestParametrizer): + """Decorator class for parametrization of test function with an input + layout argument and an extra argument of sample inputs generator. + The sample_inputs generator provides samples with all supported + layouts for the mask argument. + """ + def _parametrize_test(self, test, generic_cls, device_cls): + + @wraps(test) + def wrap(self, layout, device, dtype, op): + layout_name = str(layout).lstrip('torch.') + if layout == torch.strided: + # strided layouts are always supported + sample_inputs_func = op.sample_inputs + elif layout == torch.sparse_coo: + if not op.supports_sparse: + raise unittest.SkipTest(f"{op.name} does not support inputs with {layout_name} layout") + sample_inputs_func = op.sample_inputs_sparse_coo + elif layout == torch.sparse_csr: + if not op.supports_sparse_csr: + raise unittest.SkipTest(f"{op.name} does not support inputs with {layout_name} layout") + sample_inputs_func = op.sample_inputs_sparse_csr + else: + raise NotImplementedError(f'{layout}') + + def sample_inputs_generator(): + for sample_input in sample_inputs_func(device, dtype): + mask = sample_input.kwargs.get('mask') + if mask is None: + yield sample_input + else: + if layout == sample_input.input.layout: + yield sample_input + if layout != torch.strided: + sample_input_kwargs = sample_input.kwargs.copy() + sample_input_kwargs.update(mask=mask.to_dense()) + yield SampleInput(sample_input.input.clone(), + args=sample_input.args, + kwargs=sample_input_kwargs) + if layout != torch.sparse_coo and op.supports_sparse: + sample_input_kwargs = sample_input.kwargs.copy() + sample_input_kwargs.update(mask=mask.to_sparse()) + yield SampleInput(sample_input.input.clone(), + args=sample_input.args, + kwargs=sample_input_kwargs) + if layout != torch.sparse_csr and op.supports_sparse_csr and sample_input.input.ndim == 2: + sample_input_kwargs = sample_input.kwargs.copy() + sample_input_kwargs.update(mask=mask.to_sparse_csr()) + yield SampleInput(sample_input.input.clone(), + args=sample_input.args, + kwargs=sample_input_kwargs) + + test(self, layout, device, dtype, op, sample_inputs_generator()) + + for layout in (torch.strided, torch.sparse_coo, torch.sparse_csr): + yield (wrap, str(layout).lstrip('torch.'), {'layout': layout}) class TestMasked(TestCase): + def assertEqualMasked(self, actual, expected, mask): + strided = to_strided(actual) + if mask is not None: + strided = torch.where(mask, strided, strided.new_zeros([])) + expected = torch.where(mask, expected, expected.new_zeros([])) + self.assertEqual(strided, expected, exact_device=False) + @onlyNativeDeviceTypes @suppress_warnings @ops(masked_ops_with_references) + @precisionOverride({torch.bfloat16: 5e-4, torch.float16: 5e-4}) def test_reference_masked(self, device, dtype, op): op_name = op.name.rsplit('.', 1)[-1] ref_op = reference_functions[op_name] sample_inputs = op.sample_inputs(device, dtype) for sample_input in sample_inputs: t_inp, t_args, t_kwargs = sample_input.input, sample_input.args, sample_input.kwargs - if op_name == 'var' and not (t_inp.dtype.is_floating_point or t_inp.dtype.is_complex): 
- # torch.var does not support integer inputs + if op_name in {'var', 'std'} and not (t_inp.dtype.is_floating_point or t_inp.dtype.is_complex): + # torch.var/torch.std does not support integer inputs continue actual = op.op(t_inp, *t_args, **t_kwargs) expected = ref_op(t_inp, *t_args, **t_kwargs) - outmask = torch._masked._output_mask(op.op, t_inp, *t_args, **t_kwargs) - actual = torch.where(outmask, actual, actual.new_zeros([])) - expected = torch.where(outmask, expected, expected.new_zeros([])) - self.assertEqual(actual, expected, exact_device=False) + if t_kwargs.get('mask') is None: + outmask = None + else: + outmask = torch._masked._output_mask(op.op, t_inp, *t_args, **t_kwargs) + self.assertEqualMasked(actual, expected, outmask) + + @mask_layouts() + @onlyNativeDeviceTypes + @suppress_warnings + @ops(masked_ops_with_non_strided_support) + @precisionOverride({torch.bfloat16: 5e-3, torch.float16: 5e-3}) + def test_mask_layout(self, layout, device, dtype, op, sample_inputs): + for sample in sample_inputs: + t_inp, t_args, t_kwargs = sample.input, sample.args, sample.kwargs + actual = op.op(t_inp, *t_args, **t_kwargs) + + assert actual.layout == layout + + # check masked invariance: + # op(inp, mask).to_dense() == op(inp.to_dense(), mask.to_dense()) at outmask + # + r_inp, r_args, r_kwargs = to_strided((t_inp, t_args, t_kwargs)) + if r_kwargs.get('mask') is None: + outmask = None + else: + outmask = torch._masked._output_mask(op.op, r_inp, *r_args, **r_kwargs) + expected = op.op(r_inp, *r_args, **r_kwargs) + self.assertEqualMasked(actual, expected, outmask) + + @parametrize("sparse_kind,fill_value", [('coo', 0), ('hybrid_coo', 0), + ('coo', 123), ('hybrid_coo', 123), + ('csr', 0), ('csr', 123)], + name_fn=lambda sparse_kind, fill_value: f'{sparse_kind}_fill_value_{fill_value}') + def test_where(self, sparse_kind, fill_value): + + is_hybrid = False + if sparse_kind == 'coo': + + def to_sparse(dense): + return dense.to_sparse(2) + + def set_values(sparse, index, value): + sparse._values()[index] = value + + elif sparse_kind == 'hybrid_coo': + is_hybrid = True + + def to_sparse(dense): + return dense.to_sparse(1) + + def set_values(sparse, index, value): + sparse._values()[index] = value + + elif sparse_kind == 'csr': + + def to_sparse(dense): + return dense.to_sparse_csr() + + def set_values(sparse, index, value): + sparse.values()[index] = value + + else: + assert 0, sparse_kind + + mask = torch.tensor([[1, 0, 1, 0, 0], + [1, 1, 1, 1, 0], + [0, 1, 0, 1, 0], + [0, 0, 0, 0, 0], + [0, 0, 1, 1, 0], + [1, 1, 0, 0, 0]]).to(dtype=bool) + mask = to_sparse(mask) + # make some specified mask elements as explicit masked-out masks: + if is_hybrid: + set_values(mask, (1, 1), False) + set_values(mask, (-2, -2), False) + else: + set_values(mask, 3, False) + set_values(mask, -3, False) + + input = torch.tensor([[1, 0, 0, 0, -1], + [2, 3, 0, 0, -2], + [0, 4, 5, 0, -3], + [0, 0, 6, 7, 0], + [0, 8, 9, 0, -3], + [10, 11, 0, 0, -5]]) + input = to_sparse(input) + # make specified input elements have zero values: + if is_hybrid: + set_values(input, (1, 1), 0) + set_values(input, (-1, 0), 0) + F = fill_value + else: + set_values(input, 3, 0) + set_values(input, -3, 0) + F = 0 + + # expected where result: + Z = 99 + # Z value corresponds to masked-in elements that are not + # specified in the input and it will be replaced with a zero + tmp = torch.tensor([[1, F, Z, F, F], + [2, F, Z, Z, F], + [F, 4, F, Z, F], + [0, 0, 0, 0, 0], + [F, F, 9, F, F], + [Z, 11, F, F, F]]) + tmp = to_sparse(tmp) + + + sparse = 
torch._masked._where(mask, input, + torch.tensor(fill_value, dtype=input.dtype, device=input.device)) + + if tmp.layout == torch.sparse_coo: + expected_sparse = torch.sparse_coo_tensor( + tmp.indices(), + torch.where(tmp.values() != Z, tmp.values(), tmp.values().new_full([], 0)), + input.shape) + outmask = torch.sparse_coo_tensor(sparse.indices(), + sparse.values().new_full(sparse.values().shape, 1).to(dtype=bool), + sparse.shape)._coalesced_(True) + elif tmp.layout == torch.sparse_csr: + expected_sparse = torch.sparse_csr_tensor( + tmp.crow_indices(), + tmp.col_indices(), + torch.where(tmp.values() != Z, tmp.values(), tmp.values().new_full([], 0)), + input.shape) + outmask = torch.sparse_csr_tensor(sparse.crow_indices(), sparse.col_indices(), + sparse.values().new_full(sparse.values().shape, 1).to(dtype=bool), + sparse.shape) + else: + assert 0 + + self.assertEqual(sparse, expected_sparse) + + # check invariance: + # torch.where(mask.to_dense(), input.to_dense(), fill_value) + # == where(mask, input, fill_value).to_dense(fill_value) + expected = torch.where(mask.to_dense(), input.to_dense(), torch.full(input.shape, F)) + dense = torch.where(outmask.to_dense(), sparse.to_dense(), torch.full(sparse.shape, F)) + self.assertEqual(dense, expected) + +instantiate_device_type_tests(TestMasked, globals(), except_for='meta') -instantiate_device_type_tests(TestMasked, globals()) +if __name__ == "__main__": + run_tests() diff --git a/test/test_meta.py b/test/test_meta.py new file mode 100644 index 000000000000..c8b62dbf57e9 --- /dev/null +++ b/test/test_meta.py @@ -0,0 +1,1152 @@ +# Owner(s): ["module: primTorch"] + +import torch +import os +from enum import Enum +from torch.overrides import resolve_name +from torch.utils._pytree import tree_map, tree_flatten +import torch.utils._python_dispatch +from torch._prims.utils import is_complex_dtype, corresponding_real_dtype +from torch.testing._internal.common_utils import ( + TestCase, + skipIfCrossRef, + suppress_warnings, + TEST_WITH_ASAN, + run_tests, +) +from torch.testing._internal.common_device_type import ( + ops, + instantiate_device_type_tests, + onlyCUDA, +) +from torch.testing._internal.logging_tensor import no_dispatch +from torch.testing._internal.common_methods_invocations import op_db +import torch._prims as prims + +import atexit +import re +from collections import defaultdict +import unittest +import warnings + +bf16 = torch.bfloat16 +f64 = torch.float64 +f32 = torch.float32 +f16 = torch.float16 +c32 = torch.complex32 +c64 = torch.complex64 +c128 = torch.complex128 +i8 = torch.int8 +i16 = torch.int16 +i32 = torch.int32 +i64 = torch.int64 +b8 = torch.bool +u8 = torch.uint8 + +dtype_abbrs = { + torch.bfloat16: 'bf16', + torch.float64: 'f64', + torch.float32: 'f32', + torch.float16: 'f16', + torch.complex32: 'c32', + torch.complex64: 'c64', + torch.complex128: 'c128', + torch.int8: 'i8', + torch.int16: 'i16', + torch.int32: 'i32', + torch.int64: 'i64', + torch.bool: 'b8', + torch.uint8: 'u8', +} + +def safe_is_leaf(t): + try: + return t.is_leaf + except RuntimeError: + # inference mode can trigger this + return False + + +# This is a class for converting multiple tensors into meta tensors which +# share the same view/storage structure. The operation model is you allocate +# one of these, and then call it repeatedly on all the tensors you want to +# convert. 
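# A rough usage sketch of that model (editorial; t1 and t2 here are hypothetical
# tensors viewing the same storage, and MetaConverter is defined just below):
#
#     to_meta = MetaConverter()
#     m1 = to_meta(t1)
#     m2 = to_meta(t2)   # same converter instance, so the shared storage maps to one meta storage
#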
It's important to use the same object for tensors you want to +# share storage because this is how we correlate shared storages to the same +# meta storages; similarly, it's important NOT to use the same object for +# unrelated groups of tensors because this class will remember all the +# tensors/storages its seen and therefore leak memory. +class MetaConverter: + def __init__(self): + self.storage_memo = {} + self.tensor_memo = {} + self.hit = 0 + self.miss = 0 + + def successful(self): + return self.hit > 0 and self.miss == 0 + + # NB: doesn't actually return a storage, because meta storage is + # not supported + def meta_storage(self, s): + # NB: TypedStorage is freshly allocated and cannot be used as hash + # key index. + if s._cdata not in self.storage_memo: + self.storage_memo[s._cdata] = torch.empty(s.size(), dtype=s.dtype, device='meta') + return self.storage_memo[s._cdata] + + # This function assumes that it's possible to do the conversion + def meta_tensor(self, t): + if t not in self.tensor_memo: + with torch.inference_mode(t.is_inference()): + if t._is_view(): + # Construct views in two steps: recursively meta-fy their + # base, and then create the view off that. NB: doing it + # directly from storage is WRONG because this won't cause + # version counters to get shared. + assert t._is_view() + base = self.meta_tensor(t._base) + + def is_c_of_r(complex_dtype, real_dtype): + return is_complex_dtype(complex_dtype) and \ + corresponding_real_dtype(complex_dtype) == real_dtype + + if base.dtype == t.dtype: + pass + elif is_c_of_r(base.dtype, t.dtype): + base = torch.view_as_real(base) + elif is_c_of_r(t.dtype, base.dtype): + base = torch.view_as_complex(base) + else: + # This is not guaranteed to succeed. If it fails, it + # means there is another dtype-converting view function + # that hasn't been handled here + base = base.view(t.dtype) + + with torch.enable_grad(): + r = base.as_strided(t.size(), t.stride(), t.storage_offset()) + else: + is_leaf = safe_is_leaf(t) + # Fake up some autograd history. + if t.requires_grad: + r = torch.empty((0,), dtype=t.dtype, device='meta', requires_grad=True) + if not is_leaf: + with torch.enable_grad(): + # The backward function here will be wrong, but + # that's OK; our goal is just to get the metadata + # looking as close as possible; we're not going to + # actually try to backward() on these produced + # metas. TODO: would be safer to install some + # sort of unsupported grad_fn here + r = r.clone() + else: + r = torch.empty((0,), dtype=t.dtype, device='meta') + # As long as meta storage is not supported, need to prevent + # redispatching on set_(Storage, ...) which will choke with + # meta storage + s = self.meta_storage(t.storage()) + with no_dispatch(): + with torch.no_grad(): + r.set_(s, t.storage_offset(), t.size(), t.stride()) + + torch._C._set_conj(r, t.is_conj()) + torch._C._set_neg(r, t.is_neg()) + self.tensor_memo[t] = r + + return self.tensor_memo[t] + + def __call__(self, t): + # TODO: zero tensors? 
We appear to have eliminated them by + # excluding complex for now + if type(t) is torch.Tensor or type(t) is torch.nn.Parameter: + if any([ + t.is_sparse_csr, t.is_sparse, t.is_mkldnn, t.is_quantized, + t.is_nested, torch._is_functional_tensor(t), + # these are supported in meta conversion but the fallbacks + # don't work + t.is_neg(), t.is_conj(), + # conjugate fallback does not support meta tensors + t.dtype in (torch.complex128, torch.complex64), + ]): + # TODO: sparse should support meta + # NB technically to('meta') does work but our logging + # instrumentation will see the meta conversions and the + # tests all break so we just exclude this. In any case + # the to conversion isn't really right anyhow. + self.miss += 1 + return t + elif any([ + t.device.type in ("lazy", "meta"), t.is_complex(), + # We need a way to test if a tensor is batched but there + # is no official APi to do it + # torch._C._is_batched(t), + ]): + # TODO: this stuff should support storage + # (well, maybe not batched) + self.hit += 1 + return t.to("meta") + else: + self.hit += 1 + r = self.meta_tensor(t) + if type(t) is torch.nn.Parameter: + r = torch.nn.Parameter(r, requires_grad=r.requires_grad) + return r + elif torch.overrides.is_tensor_like(t): + # Blindly converting tensor subclasses to meta can cause + # unpredictable problems; e.g., FX tests will trace meta + # tensors into their trace / some subclasses don't correctly + # support meta. Trying to YOLO this is more trouble than it's + # worth. + self.miss += 1 + return t + else: + # non-Tensor types don't count as hit or miss + return t + + +class TestMetaConverter(TestCase): + def assertSameVersionCounter(self, m1, m2): + # Cannot easily test m1 and m2 have same storage due to + # lack of Storage bindings. Use version counter. 
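        # Editorial sketch, not part of the test: for ordinary strided tensors the
        # same proxy works, because views over one storage share a version counter
        # that is bumped together by an in-place write to the base:
        #
        #     x = torch.randn(4, requires_grad=True)
        #     v1, v2 = x[:], x[:]
        #     assert v1._version == v2._version
        #     with torch.no_grad():
        #         x.add_(1)                 # bumps the shared counter
        #     assert v1._version == v2._version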
+ vc = m1._version + self.assertEqual(m2._version, vc) + # Doing it this way ensures that we get VC bump even with leaves + with torch.no_grad(): + m1._base.add_(3) + self.assertNotEqual(m1._version, vc) + self.assertEqual(m2._version, m1._version) + + def test_view_of_non_leaf(self): + x = torch.randn(4, requires_grad=True) + y = x.neg() + z1 = y[:] + z2 = y[:] + to_meta = MetaConverter() + m1 = to_meta(z1) + m2 = to_meta(z2) + self.assertEqual(m1.shape, z1.shape) + self.assertTrue(m1._is_view()) + self.assertFalse(m1._base.is_leaf) + self.assertSameVersionCounter(m1, m2) + + def test_view_of_leaf(self): + x = torch.randn(4, requires_grad=True) + z1 = x[:] + z2 = x[:] + to_meta = MetaConverter() + m1 = to_meta(z1) + m2 = to_meta(z2) + self.assertEqual(m1.shape, z1.shape) + self.assertTrue(m1._is_view()) + self.assertTrue(m1._base.is_leaf) + self.assertSameVersionCounter(m1, m2) + + def test_leaf(self): + x = torch.randn(4, requires_grad=True) + to_meta = MetaConverter() + m = to_meta(x) + self.assertEqual(m.shape, x.shape) + self.assertTrue(m.is_leaf) + self.assertTrue(m.requires_grad) + + def test_non_leaf(self): + x = torch.randn(4, requires_grad=True) + y = x.neg() + to_meta = MetaConverter() + m = to_meta(y) + self.assertEqual(m.shape, y.shape) + self.assertFalse(m.is_leaf) + self.assertTrue(m.requires_grad) + + def test_requires_grad_false(self): + x = torch.randn(4, requires_grad=False) + to_meta = MetaConverter() + m = to_meta(x) + self.assertEqual(m.shape, x.shape) + self.assertFalse(m.requires_grad) + + def test_view_as_real(self): + x = torch.randn(4, dtype=torch.complex64) + y = torch.view_as_real(x) + m = MetaConverter()(y) + self.assertEqual(m.shape, y.shape) + self.assertEqual(m.dtype, y.dtype) + + def test_view_as_complex(self): + x = torch.randn((4, 2), dtype=torch.float32) + y = torch.view_as_complex(x) + m = MetaConverter()(y) + self.assertEqual(m.shape, y.shape) + self.assertEqual(m.dtype, y.dtype) + + def test_view_dtype(self): + x = torch.randn(4, dtype=torch.float32) + y = x.view(dtype=torch.int32) + m = MetaConverter()(y) + self.assertEqual(m.shape, y.shape) + self.assertEqual(m.dtype, y.dtype) + + def test_imag(self): + x = torch.randn(4, dtype=torch.complex64) + y = x.imag + m = MetaConverter()(y) + self.assertEqual(m.shape, y.shape) + self.assertEqual(m.dtype, y.dtype) + self.assertEqual(m.stride(), y.stride()) + self.assertEqual(m.storage_offset(), y.storage_offset()) + + +def assert_ref_meta_equal(test_case, meta_rs, rs, msg_callable): + flat_meta_rs, _ = tree_flatten(meta_rs) + flat_rs, _ = tree_flatten(rs) + test_case.assertEqual(len(flat_meta_rs), len(flat_rs)) + for i, meta_r, r in zip(range(len(flat_rs)), flat_meta_rs, flat_rs): + def test_assert(cond, msg): + if not cond: + raise RuntimeError(f"output {i}: {msg_callable(msg)}") + if not isinstance(r, torch.Tensor): + continue + test_assert(isinstance(meta_r, torch.Tensor), f"but real {i}th result is Tensor") + test_assert(meta_r.dtype == r.dtype, f"but real dtype was {r.dtype}") + test_assert(meta_r.shape == r.shape, f"but real shape was {r.shape}") + # NOTE: this helper is used instead of a direct stride comparison + # because strides of tensors with no elements and dimensions of + # length 1 are not computed consistently + same_strides, _ = prims.utils.check_significant_strides(meta_r, r) + test_assert(same_strides, f"but real stride was {r.stride()}") + test_assert( + meta_r.storage_offset() == r.storage_offset(), + f"but real storage_offset was {r.storage_offset()}") + test_assert(meta_r.requires_grad 
== r.requires_grad, f"but real requires_grad was {r.requires_grad}") + test_assert(meta_r.is_conj() == r.is_conj(), f"but real is_conj was {r.is_conj()}") + test_assert(meta_r.is_neg() == r.is_neg(), f"but real is_neg was {r.is_neg()}") + + +# This environment variable controls whether or not we print expected failure +# lists at the end of a test suite run. The intended usage looks like this: +# +# 1. Run `PYTORCH_COLLECT_EXPECT=1 python test/test_meta.py` on a CUDA build +# of PyTorch that has LAPACK/MAGMA installed. You can filter `-k test_meta` +# or `-k test_dispatch_meta` to only focus on one or another list +# 2. Given the printed skip/xfail list, add them to the corresponding lists; +# torch.* entries go in meta_function and aten.* entries go in meta_dispatch. +# If there are preexisting entries, you need to merge in the entries. +# +# This is somewhat manual but typically you shouldn't need to do this, unless +# you've made a major change (e.g., added a new dtype to PyTorch) and need to +# refresh the lists. If you want to do it from scratch, just clear out the +# preexisting lists before running. +# +# WARNING: Python dict literals will silently ignore duplicate keys +COLLECT_EXPECT = os.getenv('PYTORCH_COLLECT_EXPECT', '0') == '1' + +seen_succeeded = {} +seen_failed = {} +failed_reasons = defaultdict(set) +def print_seen(): + expected_failures = [] + skips = [] + + def fmt_dtypes(dtypes): + r = ', '.join(sorted(dtype_abbrs[d] for d in dtypes)) + return '{' + r + '}' + + for op, failed_dtypes in seen_failed.items(): + ops = resolve_name(op) + succeeded_dtypes = seen_succeeded.get(op, set()) + expected_failures_dtypes = failed_dtypes - succeeded_dtypes + skips_dtypes = failed_dtypes & succeeded_dtypes + reasons = "" + if failed_reasons[op]: + reasons = " # " + ", ".join(sorted(failed_reasons[op])) + if expected_failures_dtypes: + expected_failures.append(f" {ops}: {fmt_dtypes(expected_failures_dtypes)},{reasons}") + if skips_dtypes: + skips.append(f" {ops}: {fmt_dtypes(skips_dtypes)},") + expected_failures.sort() + skips.sort() + nl = '\n' + print(f"""\ +expected_failures = {{ +{nl.join(expected_failures)} +}} + +skips = {{ +{nl.join(skips)} +}} +""") +if COLLECT_EXPECT: + atexit.register(print_seen) + +# Success forces pass; failure forces fail; skip unconditionally skips testing +TestExpect = Enum("TestExpect", ("SUCCESS", "XFAILURE", "SKIP")) + +# unlike print produce strides +def verbose_print(e): + class Lit: + def __init__(self, s): + self.s = s + + def __repr__(self): + return self.s + + def go(t): + if isinstance(t, torch.Tensor): + return Lit(f"{t} stride={t.stride()}") + else: + return t + + return repr(tree_map(go, e)) + +def run_meta_crossref( + test_case, + test_expect, + func, + args, + kwargs, + *, + dtype, + device_type, +): + to_meta = MetaConverter() + do_meta = test_expect is not TestExpect.SKIP + + if do_meta: + try: + meta_args = tree_map(to_meta, args) + meta_kwargs = tree_map(to_meta, kwargs) + except Exception as e: + raise RuntimeError( + f"failed to convert args to meta; " + f"originally (*{args}, **{kwargs})") from e + + rs = func(*args, **kwargs) + + # TODO: also handle cases where func raise an exception + + # For now, only attempt if we managed to convert all tensor types + # (if any of them failed, we're in a mixed device situation and + # this isn't well supported) + if do_meta and to_meta.successful(): + try: + # Suppress warnings, this doesn't matter for test_meta.py + # but it does matter if you want to use this decorator + # for cross-ref 
testing, as some tests may be looking at + # errors + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + meta_rs = func(*meta_args, **meta_kwargs) + except Exception as e: + if test_expect is TestExpect.XFAILURE: + return rs + seen_failed.setdefault(func, set()).add(dtype) + if isinstance(e, NotImplementedError): + m = RE_NOT_IMPLEMENTED_MSG.search(e.args[0]) + if m: + failed_reasons[func].add(m.group(1)) + if COLLECT_EXPECT: + return rs + raise RuntimeError(f"""\ +failed to run: {resolve_name(func)}( +*{verbose_print(meta_args)}, +**{verbose_print(meta_kwargs)} +)""") from e + else: + try: + delim = ',\n ' + assert_ref_meta_equal(test_case, meta_rs, rs, lambda msg: f"""\ +meta disagrees with real impl: +{resolve_name(func)}( + {delim.join(map(verbose_print, meta_args))}, + {delim.join(k + ": " + verbose_print(v) for k, v in meta_kwargs.items())} +) = ( + {verbose_print(meta_rs)} +) +{msg} +""") + except Exception: + if test_expect is TestExpect.XFAILURE: + return rs + seen_failed.setdefault(func, set()).add(dtype) + if COLLECT_EXPECT: + return rs + raise + else: + seen_succeeded.setdefault(func, set()).add(dtype) + if test_expect is TestExpect.XFAILURE and not COLLECT_EXPECT: + raise RuntimeError(f"unexpected success {resolve_name(func)}") + + return rs + + + +RE_NOT_IMPLEMENTED_MSG = re.compile(r"Could not run '([^']+)' with arguments ") + + +meta_function_expected_failures = { + torch.Tensor.item: {b8, bf16, c128, c64, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::_local_scalar_dense + torch.Tensor.to_sparse: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::to_sparse, aten::to_sparse.sparse_dim + torch.addbmm: {bf16, f32, f64, i16, i32, i64, i8, u8}, # aten::addbmm, aten::addbmm.out + torch.allclose: {bf16, f16, f32, f64}, # aten::_local_scalar_dense + torch.angle: {c32, b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::angle, aten::angle.out + torch.argwhere: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::nonzero + torch.bincount: {i16, i32, i64, i8, u8}, # aten::bincount + torch.bucketize: {bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::bucketize.Tensor, aten::bucketize.Tensor_out + torch.combinations: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::masked_select + torch.complex: {f16, f32, f64}, # aten::complex.out + torch.conj_physical: {c32}, # aten::conj_physical.out + torch.corrcoef: {bf16, f32, f64, i16, i32, i64, i8, u8}, # aten::_local_scalar_dense + torch.count_nonzero: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::count_nonzero.dim_IntList + torch.cov: {bf16, f32, f64, i16, i32, i64, i8, u8}, # aten::_local_scalar_dense + torch.diag: {bf16, b8, f32, f64, i16, i32, i64, i8, u8}, # aten::diag.out + torch.diagflat: {bf16, b8, f32, f64, i16, i32, i64, i8, u8}, # aten::diag.out + torch.dot: {bf16, f32, f64, i16, i32, i64, i8, u8}, # aten::dot + torch.fft.fft2: {b8, f32, f64, i16, i32, i64, i8, u8}, # aten::_fft_c2c + torch.fft.fft: {b8, f32, f64, i16, i32, i64, i8, u8}, # aten::_fft_r2c + torch.fft.fftn: {b8, f32, f64, i16, i32, i64, i8, u8}, # aten::_fft_c2c + torch.fft.fftshift: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::roll + torch.fft.hfft2: {b8, f32, f64, i16, i32, i64, i8, u8}, # aten::_fft_c2c + torch.fft.hfft: {b8, f32, f64, i16, i32, i64, i8, u8}, + torch.fft.hfftn: {b8, f32, f64, i16, i32, i64, i8, u8}, # aten::_fft_c2c + torch.fft.ifft2: {b8, f32, f64, i16, i32, i64, i8, u8}, # aten::_fft_c2c + torch.fft.ifft: {b8, f32, f64, i16, i32, i64, i8, u8}, # aten::_fft_r2c + 
torch.fft.ifftn: {b8, f32, f64, i16, i32, i64, i8, u8}, # aten::_fft_c2c + torch.fft.ifftshift: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::roll + torch.fft.ihfft2: {b8, f32, f64, i16, i32, i64, i8, u8}, # aten::_fft_r2c + torch.fft.ihfft: {b8, f32, f64, i16, i32, i64, i8, u8}, # aten::_fft_r2c + torch.fft.ihfftn: {b8, f32, f64, i16, i32, i64, i8, u8}, # aten::_fft_r2c + torch.fft.irfft2: {b8, f32, f64, i16, i32, i64, i8, u8}, # aten::_fft_c2r, aten::_fft_c2r.out + torch.fft.irfft: {b8, f32, f64, i16, i32, i64, i8, u8}, # aten::_fft_c2r, aten::_fft_c2r.out + torch.fft.irfftn: {b8, f32, f64, i16, i32, i64, i8, u8}, # aten::_fft_c2r, aten::_fft_c2r.out + torch.fft.rfft2: {b8, f32, f64, i16, i32, i64, i8, u8}, # aten::_fft_r2c + torch.fft.rfft: {b8, f32, f64, i16, i32, i64, i8, u8}, # aten::_fft_r2c + torch.fft.rfftn: {b8, f32, f64, i16, i32, i64, i8, u8}, # aten::_fft_r2c + torch.floor_divide: {bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::floor_divide, aten::floor_divide.out + torch.frexp: {bf16, f16, f32, f64}, # aten::frexp.Tensor_out + torch.functional.istft: {f32, f64}, # aten::view_as_complex + torch.functional.stft: {f32, f64}, # aten::_fft_r2c + torch.functional.unique: {b8, bf16, f32, f64, i16, i32, i64, i8, u8}, # aten::_unique2, aten::unique_dim + torch.functional.unique_consecutive: {b8, bf16, f32, f64, i16, i32, i64, i8, u8}, # aten::unique_consecutive + torch.histc: {bf16, f32, f64}, # aten::histc, aten::histc.out + torch.histogram: {f32, f64}, # aten::histogram.bin_ct, aten::histogram.bins_tensor + torch.histogramdd: {f32, f64}, # aten::_histogramdd_bin_edges, aten::_histogramdd_from_bin_tensors + torch.kthvalue: {bf16, f32, f64, i16, i32, i64, i8, u8}, # aten::kthvalue.values + torch.linalg.qr: {f32, f64}, # aten::_linalg_qr_helper + torch.logcumsumexp: {bf16, f32, f64}, # aten::_logcumsumexp, aten::_logcumsumexp.out + torch.masked_select: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::masked_select, aten::masked_select.out + torch.matrix_exp: {bf16, f32, f64}, # aten::linalg_matrix_exp + torch.median: {bf16, f32, f64, i16, i32, i64, i8, u8}, # aten::median, aten::median.dim_values + torch.mode: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::mode + torch.multinomial: {bf16, f32, f64}, # aten::multinomial, aten::multinomial.out + torch.mvlgamma: {bf16, f32, f64, i16, i32, i64, i8, u8}, # aten::_local_scalar_dense, aten::mvlgamma.out + torch.nan_to_num: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::nan_to_num.out + torch.nanmean: {bf16, f16, f32, f64}, + torch.nanmedian: {bf16, f32, f64, i16, i32, i64, i8, u8}, # aten::nanmedian, aten::nanmedian.dim_values + torch.nanquantile: {f32, f64}, + torch.nansum: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::nansum, aten::nansum.out + torch.nn.functional.adaptive_avg_pool2d: {bf16, f32, f64}, # aten::_adaptive_avg_pool2d + torch.nn.functional.conv1d: {bf16, f32, f64, i64}, + torch.nn.functional.conv2d: {bf16, f32, f64, i64}, + torch.nn.functional.conv_transpose1d: {f32, f64, i64}, + torch.nn.functional.conv_transpose2d: {f32, f64, i64}, + torch.nn.functional.conv_transpose3d: {f32, f64, i64}, + torch.nn.functional.ctc_loss: {f32, f64}, + torch.nn.functional.embedding_bag: {f16, f32, f64}, # aten::_embedding_bag_forward_only + torch.nn.functional.gaussian_nll_loss: {bf16, f32, f64}, # aten::_local_scalar_dense + torch.nn.functional.grid_sample: {f32, f64}, # aten::grid_sampler_2d, aten::grid_sampler_3d + torch.nn.functional.group_norm: {bf16, f32, f64}, # 
aten::var_mean.correction + torch.nn.functional.instance_norm: {f32, f64}, # aten::var_mean.correction + torch.nn.functional.layer_norm: {bf16, f32, f64}, + torch.nn.functional.max_pool3d: {f32, f64}, # aten::max_pool3d_with_indices + torch.nn.functional.max_pool3d_with_indices: {f32, f64}, # aten::max_pool3d_with_indices + torch.nn.functional.max_unpool1d: {f32, f64}, # aten::max_unpool2d + torch.nn.functional.max_unpool2d: {f32, f64}, # aten::max_unpool2d + torch.nn.functional.max_unpool3d: {f32, f64}, # aten::max_unpool3d + torch.nn.functional.multi_margin_loss: {f32, f64}, # aten::multi_margin_loss + torch.nn.functional.multilabel_margin_loss: {f32, f64}, # aten::multilabel_margin_loss_forward + torch.nn.functional.one_hot: {i64}, # aten::_local_scalar_dense + torch.nn.functional.pdist: {f32, f64}, # aten::_pdist_forward + torch.nn.functional.prelu: {bf16, f32, f64}, # aten::prelu + torch.nn.functional.relu: {bf16, f32, f64, i16, i32, i64, i8, u8}, # aten::relu + torch.nn.functional.rrelu: {bf16, f32, f64}, # aten::rrelu_with_noise + torch.nn.functional.unfold: {bf16, f16, f32, f64}, # aten::im2col + torch.nonzero: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::nonzero, aten::nonzero.out + torch.polar: {f32, f64}, # aten::polar.out + torch.repeat_interleave: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::repeat_interleave.Tensor + torch.roll: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::roll + torch.searchsorted: {bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::searchsorted.Tensor, aten::searchsorted.Tensor_out + torch.symeig: {f32, f64}, + torch.std_mean: {bf16, f16, f32, f64}, # aten::std_mean.correction + torch.take: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::take, aten::take.out + torch.trace: {f32, f64, i16, i32, i64, i8, u8}, # aten::trace + torch.var_mean: {bf16, f16, f32, f64}, # aten::var_mean.correction + torch.vdot: {bf16, f32, f64, i16, i32, i64, i8, u8}, # aten::vdot + torch.qr: {f32, f64}, + torch.ormqr: {f32, f64}, + torch.lu_solve: {f32, f64}, + torch.cholesky: {f32, f64}, # aten::cholesky, aten::cholesky.out + torch.cholesky_inverse: {f32, f64}, # aten::cholesky_inverse, aten::cholesky_inverse.out + torch.cholesky_solve: {f32, f64}, # aten::_cholesky_solve_helper + torch.eig: {f32, f64}, # aten::_local_scalar_dense + torch.geqrf: {f32, f64}, # aten::geqrf + torch.linalg.cholesky: {f32, f64}, # aten::linalg_cholesky_ex, aten::linalg_cholesky_ex.L + torch.linalg.cholesky_ex: {f32, f64}, # aten::linalg_cholesky_ex + torch.linalg.det: {f32, f64}, # aten::_det_lu_based_helper + torch.linalg.eig: {f32, f64}, # aten::linalg_eig + torch.linalg.eigh: {f32, f64}, + torch.linalg.eigvals: {f32, f64}, + torch.linalg.eigvalsh: {f32, f64}, # aten::linalg_eigvalsh.out + torch.linalg.householder_product: {f32, f64}, # aten::linalg_householder_product + torch.linalg.inv: {f32, f64}, # aten::_local_scalar_dense + torch.linalg.ldl_factor: {f32, f64}, # aten::_local_scalar_dense + torch.linalg.lstsq: {f32, f64}, # aten::linalg_lstsq.out + torch.linalg.lu_factor: {f32, f64}, # aten::_local_scalar_dense + torch.linalg.slogdet: {f32, f64}, # aten::linalg_slogdet + torch.linalg.solve: {f32, f64}, # aten::linalg_solve, aten::linalg_solve.out + torch.linalg.solve_triangular: {f32, f64}, # aten::linalg_solve_triangular + torch.linalg.tensorinv: {f32, f64}, # aten::_local_scalar_dense + torch.linalg.tensorsolve: {f32, f64}, # aten::linalg_solve + torch.logdet: {f32, f64}, # aten::_local_scalar_dense, aten::nonzero +} + +""" +# This is some 
sample code for how we could dump these dicts into YAML +# file for easier reading/writing +import yaml +print(yaml.dump( + {resolve_name(k): [dtype_abbrs[d] for d in v] + for k, v in meta_function_expected_failures.items()}, default_flow_style=None)) +import sys +sys.exit() +""" + +meta_function_skips = { + torch.Tensor.__getitem__: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8, c32}, + torch.Tensor.__rmatmul__: {bf16, f32, f64, i16, i32, i64, i8, u8}, + torch.index_reduce: {bf16, f16, f32, f64}, + torch.addr: {b8}, + torch.aminmax: {b8, f32, f64, i16, i32, i64, i8, u8}, + torch.bernoulli: {bf16, f32, f64}, + torch.conj_physical: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, + torch.cummax: {b8, bf16, f32, f64, i16, i32, i64, i8, u8}, + torch.cummin: {b8, bf16, f32, f64, i16, i32, i64, i8, u8}, + torch.diff: {b8}, + torch.functional.cdist: {f32, f64}, + torch.functional.tensordot: {bf16, f32, f64, i16, i32, i64, i8, u8}, + torch.index_add: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, + torch.inner: {bf16, f32, f64, i16, i32, i64, i8, u8}, + torch.logical_not: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, + torch.logical_xor: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, + torch.logit: {b8, bf16, f32, f64, i16, i32, i64, i8, u8}, + torch.matmul: {bf16, f32, f64, i16, i32, i64, i8, u8}, + torch.nn.functional.adaptive_avg_pool1d: {bf16, f32, f64}, + torch.nn.functional.adaptive_avg_pool3d: {f16, f32, f64}, + torch.nn.functional.batch_norm: {f32, f64}, + torch.nn.functional.cross_entropy: {bf16, f32, f64}, + torch.nn.functional.interpolate: {bf16, f32, f64, u8}, + torch.nn.functional.nll_loss: {bf16, f32, f64}, + torch.nn.functional.pad: {f32, f64}, + torch.normal: {bf16, f16, f32, f64}, + torch.prod: {b8, f32, f64, i16, i32, i64, i8, u8}, + torch.tensor_split: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, + torch.nn.functional.logsigmoid: {bf16, f16, f32, f64}, # logsigmoid.output + torch.inverse: {f32, f64}, + torch.linalg.matrix_power: {f32, f64}, + torch.linalg.matrix_rank: {f32, f64}, + torch.linalg.pinv: {f32, f64}, + torch.empty: {b8, bf16, c128, c64, c32, f16, f32, f64, i16, i32, i64, i8, u8}, +} + +meta_function_device_expected_failures = defaultdict(dict) +meta_function_device_skips = defaultdict(dict) + +meta_function_device_expected_failures['cpu'] = { +} + +meta_function_device_expected_failures['cuda'] = { + torch.addbmm: {f16}, # aten::addbmm, aten::addbmm.out + torch.corrcoef: {bf16, f16}, # aten::_local_scalar_dense + torch.cov: {f16}, # aten::_local_scalar_dense + torch.diag: {bf16, f16}, # aten::diag.out + torch.diagflat: {bf16, f16}, # aten::diag.out + torch.dot: {f16}, # aten::dot + torch.fft.fft2: {c32, f16}, # aten::_fft_c2c, aten::_fft_c2c.out + torch.fft.fft: {c32, f16}, # aten::_fft_c2c, aten::_fft_c2c.out + torch.fft.fftn: {c32, f16}, # aten::_fft_c2c, aten::_fft_c2c.out + torch.fft.hfft2: {c32, f16}, # aten::_fft_c2c + torch.fft.hfft: {c32, f16}, + torch.fft.hfftn: {c32, f16}, # aten::_fft_c2c + torch.fft.ifft2: {c32, f16}, # aten::_fft_c2c, aten::_fft_c2c.out + torch.fft.ifft: {c32, f16}, # aten::_fft_c2c, aten::_fft_c2c.out + torch.fft.ifftn: {c32, f16}, # aten::_fft_c2c, aten::_fft_c2c.out + torch.fft.ihfft2: {f16}, + torch.fft.ihfft: {f16}, + torch.fft.ihfftn: {f16}, + torch.fft.irfft2: {c32, f16}, # aten::_fft_c2r, aten::_fft_c2r.out + torch.fft.irfft: {c32, f16}, # aten::_fft_c2r, aten::_fft_c2r.out + torch.fft.irfftn: {c32, f16}, # aten::_fft_c2r, aten::_fft_c2r.out + torch.fft.rfft2: {f16}, + torch.fft.rfft: {f16}, + torch.fft.rfftn: 
{f16}, + torch.functional.unique: {f16}, # aten::_unique2, aten::unique_dim + torch.functional.unique_consecutive: {f16}, # aten::unique_consecutive + torch.geqrf: {f32, f64}, # aten::geqrf + torch.histc: {i16, i32, i64, i8}, # aten::histc, aten::histc.out + torch.kthvalue: {f16}, # aten::kthvalue.values + torch.linalg.cholesky: {f32, f64}, # aten::linalg_cholesky_ex, aten::linalg_cholesky_ex.L + torch.linalg.cholesky_ex: {f32, f64}, # aten::linalg_cholesky_ex + torch.linalg.householder_product: {f32, f64}, # aten::linalg_householder_product, aten::linalg_householder_product.out + torch.linalg.inv: {f32, f64}, # aten::_local_scalar_dense + torch.linalg.ldl_factor: {f32, f64}, # aten::_local_scalar_dense + torch.linalg.lu_factor: {f32, f64}, # aten::_local_scalar_dense + torch.linalg.solve_triangular: {f32, f64}, # aten::linalg_solve_triangular, aten::linalg_solve_triangular.out + torch.linalg.tensorinv: {f32, f64}, # aten::_local_scalar_dense + torch.logcumsumexp: {bf16, f16}, # aten::_logcumsumexp, aten::_logcumsumexp.out + torch.matrix_exp: {f16}, # aten::linalg_matrix_exp + torch.median: {f16}, # aten::median, aten::median.dim_values + torch.multinomial: {f16}, # aten::multinomial, aten::multinomial.out + torch.mvlgamma: {f16}, # aten::_local_scalar_dense, aten::mvlgamma.out + torch.nanmedian: {f16}, # aten::nanmedian, aten::nanmedian.dim_values + torch.nn.functional.adaptive_avg_pool2d: {f16}, # aten::_adaptive_avg_pool2d + torch.nn.functional.conv1d: {f16}, + torch.nn.functional.conv2d: {f16}, + torch.nn.functional.conv_transpose1d: {bf16, f16}, + torch.nn.functional.conv_transpose2d: {bf16, f16}, + torch.nn.functional.conv_transpose3d: {bf16, f16}, + torch.nn.functional.embedding_bag: {bf16}, # aten::_embedding_bag_forward_only + torch.nn.functional.gaussian_nll_loss: {f16}, # aten::_local_scalar_dense + torch.nn.functional.grid_sample: {f16}, # aten::grid_sampler_2d, aten::grid_sampler_3d + torch.nn.functional.group_norm: {bf16, f16}, # aten::var_mean.correction + torch.nn.functional.instance_norm: {bf16, f16}, # aten::var_mean.correction + torch.nn.functional.layer_norm: {f16}, + torch.nn.functional.max_pool3d: {bf16, f16}, # aten::max_pool3d_with_indices + torch.nn.functional.max_pool3d_with_indices: {bf16, f16}, # aten::max_pool3d_with_indices + torch.nn.functional.max_unpool1d: {f16}, # aten::max_unpool2d + torch.nn.functional.max_unpool2d: {f16}, # aten::max_unpool2d + torch.nn.functional.max_unpool3d: {f16}, # aten::max_unpool3d + torch.nn.functional.multi_margin_loss: {bf16, f16}, # aten::multi_margin_loss + torch.nn.functional.multilabel_margin_loss: {bf16, f16}, # aten::multilabel_margin_loss_forward + torch.nn.functional.prelu: {f16}, # aten::prelu + torch.nn.functional.relu: {f16}, # aten::relu + torch.nn.functional.rrelu: {f16}, # aten::rrelu_with_noise + torch.ormqr: {f32, f64}, # aten::ormqr, aten::ormqr.out + torch.qr: {f32, f64}, # aten::_linalg_qr_helper + torch.trace: {b8, bf16, f16}, # aten::diag.out + torch.vdot: {f16}, # aten::vdot +} + +meta_function_device_skips['cuda'] = { + torch.Tensor.__getitem__: {c32}, + torch.Tensor.__rmatmul__: {f16}, + torch.bernoulli: {f16}, + torch.cummax: {f16}, + torch.cummin: {f16}, + torch.functional.tensordot: {f16}, + torch.inner: {f16}, + torch.inverse: {f32, f64}, + torch.linalg.matrix_power: {f32, f64}, + torch.linalg.matrix_rank: {f32, f64}, + torch.linalg.svd: {f32, f64}, + torch.logit: {f16}, + torch.matmul: {f16}, + torch.nn.functional.adaptive_avg_pool1d: {f16}, + torch.nn.functional.adaptive_avg_pool3d: {bf16}, + 
torch.nn.functional.batch_norm: {bf16, f16}, + torch.nn.functional.cross_entropy: {f16}, + torch.nn.functional.interpolate: {f16}, + torch.nn.functional.nll_loss: {f16}, + torch.nn.functional.pad: {f16}, + torch.prod: {bf16, c32, f16}, + torch.svd: {f32, f64}, +} + +# This is a __torch_function__ mode that, when enabled, interposes every +# Torch API call and runs the operator as normal, and then reruns it +# with meta inputs, and then checks that everything about the output agrees. +# Most of the logic deals with faithfully replicating the original tensor +# as a meta tensor, which is nontrivial because there are a lot of subsystems +# that may potentially be exercised. +# +# That being said, this class is a little overkill for what it is doing in +# this test file (since I could have just inlined __torch_function__ on the +# OpInfo call, and OpInfos generally have very regular inputs), but it will be +# useful for more comprehensive testing e.g., as seen in +# https://github.com/pytorch/pytorch/pull/75994 The big benefit is it is +# A LOT more efficient that torch dispatch mode (at the cost of less coverage) +class MetaCrossRefFunctionMode(torch.overrides.TorchFunctionMode): + test_case: TestCase + device_type: str + dtype: torch.dtype + + def __init__(self, test_case, *, device, dtype): + self.test_case = test_case + self.device_type = torch.device(device).type + self.dtype = dtype + + def __torch_function__(self, func, types, args=(), kwargs=None): + kwargs = kwargs or {} + + if torch.jit.is_tracing() or isinstance(func, torch.ScriptMethod): + return func(*args, **kwargs) + + if self.dtype in meta_function_skips.get(func, set()): + test_expect = TestExpect.SKIP + elif self.dtype in meta_function_device_skips[self.device_type].get(func, set()): + test_expect = TestExpect.SKIP + elif self.dtype in meta_function_expected_failures.get(func, set()): + test_expect = TestExpect.XFAILURE + elif self.dtype in meta_function_device_expected_failures[self.device_type].get(func, set()): + test_expect = TestExpect.XFAILURE + else: + test_expect = TestExpect.SUCCESS + + return run_meta_crossref( + self.test_case, test_expect, func, args, + kwargs, dtype=self.dtype, device_type=self.device_type + ) + +aten = torch.ops.aten + +# these always fail +meta_dispatch_expected_failures = { + aten._adaptive_avg_pool2d.default: {bf16, f64, f32}, + aten._adaptive_avg_pool3d.default: {f16, f64, f32}, + aten._cdist_forward.default: {f64, f32}, + aten._conj_physical.default: {c32}, + aten._convolution.default: {c64, i64, f64, c128, bf16, f32}, + aten._ctc_loss.default: {f64, f32}, + aten._embedding_bag_forward_only.default: {f16, f64, f32}, + aten._fft_r2c.default: {i64, u8, b8, f32, i8, f64, i16, i32}, + aten._histogramdd_bin_edges.default: {f64, f32}, + aten._histogramdd_from_bin_cts.default: {f64, f32}, + aten._histogramdd_from_bin_tensors.default: {f64, f32}, + aten._local_scalar_dense.default: {c64, i64, c128, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten._pdist_forward.default: {f64, f32}, + aten._unique2.default: {i64, bf16, u8, b8, f32, i8, f64, i16, i32}, + aten.addbmm.default: {i64, bf16, u8, f32, i8, f64, i16, i32}, + aten.addbmm.out: {i64, bf16, u8, f32, i8, f64, i16, i32}, + aten.angle.default: {c32, i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.angle.out: {c32, i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.bernoulli.out: {bf16, f64, f32}, + aten.bincount.default: {i8, i64, i16, u8, i32}, + aten.bucketize.Tensor: {i64, bf16, f16, u8, f32, i8, f64, i16, i32}, + 
aten.bucketize.Tensor_out: {i64, bf16, f16, u8, f32, i8, f64, i16, i32}, + aten.col2im.default: {c64, f32, f64, c128}, + aten.complex.default: {c64, f64, c128, f16, f32}, + aten.complex.out: {f16}, + aten.conj_physical.out: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, c32, i32}, + aten.convolution.default: {c64, i64, f64, c128, bf16, f32}, + aten.count_nonzero.default: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.count_nonzero.dim_IntList: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.diag.default: {i64, u8, b8, f32, i8, f64, i16, i32, bf16}, + aten.diag.out: {bf16, i64, u8, b8, f32, i8, f64, i16, i32}, + aten.dot.default: {i64, bf16, u8, f32, i8, f64, i16, i32}, + aten.dot.out: {i64, bf16, u8, f32, i8, f64, i16, i32}, + aten.floor_divide.default: {i64, bf16, f16, u8, f32, i8, f64, i16, i32}, + aten.floor_divide.out: {i64, bf16, f16, u8, f32, i8, f64, i16, i32}, + aten.frexp.Tensor: {bf16, f16, f64, f32}, + aten.grid_sampler_2d.default: {f64, f32}, + aten.grid_sampler_3d.default: {f64, f32}, + aten.histc.default: {bf16, f64, f32}, + aten.histc.out: {bf16, f64, f32}, + aten.histogram.bin_ct: {f64, f32}, + aten.histogram.bins_tensor: {f64, f32}, + aten.im2col.default: {bf16, f16, f64, f32}, + aten.index.Tensor: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32, c32}, + aten.kthvalue.default: {i64, bf16, u8, f32, i8, f64, i16, i32}, + aten.linalg_matrix_exp.default: {bf16, f64, f32}, + aten.log_sigmoid_forward.output: {bf16, f64, f32}, + aten.logcumsumexp.default: {bf16, f64, f32}, + aten.logcumsumexp.out: {bf16, f64, f32}, + aten.logical_not.out: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.logical_not_.default: {bf16, f16, f64, f32}, + aten.logical_xor.out: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.logit.out: {i64, bf16, u8, b8, f32, i8, f64, i16, i32}, + aten.masked_select.default: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.masked_select.out: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.max_pool3d_with_indices.default: {f64, f32}, + aten.max_unpool2d.default: {f64, f32}, + aten.max_unpool3d.default: {f64, f32}, + aten.median.default: {i64, bf16, u8, f32, i8, f64, i16, i32}, + aten.median.dim: {i64, bf16, u8, f32, i8, f64, i16, i32}, + aten.mode.default: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.multi_margin_loss.default: {f64, f32}, + aten.multilabel_margin_loss_forward.default: {f64, f32}, + aten.multinomial.default: {bf16, f64, f32}, + aten.multinomial.out: {bf16, f64, f32}, + aten.mvlgamma.default: {i64, bf16, u8, f32, i8, f64, i16, i32}, + aten.mvlgamma.out: {i64, bf16, u8, f32, i8, f64, i16, i32}, + aten.nan_to_num.default: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.nan_to_num.out: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.nanmedian.default: {i64, bf16, u8, f32, i8, f64, i16, i32}, + aten.nanmedian.dim: {i64, bf16, u8, f32, i8, f64, i16, i32}, + aten.nansum.default: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.nansum.out: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.native_group_norm.default: {bf16, f64, f32}, + aten.nll_loss2d_forward.default: {bf16, f64, f32}, + aten.nonzero.default: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.nonzero.out: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.normal.Tensor_Tensor: {bf16, f16, f64, f32}, + aten.normal.Tensor_Tensor_out: {bf16, f16, f64, f32}, + aten.normal.float_Tensor: {bf16, f16, f64, f32}, + aten.normal.float_Tensor_out: {bf16, f16, f64, f32}, + aten.polar.default: {f64, f32}, 
+ aten.prelu.default: {bf16, f64, f32}, + aten.prod.default: {i64, u8, b8, f32, i8, f64, i16, i32}, + aten.reflection_pad2d.default: {f64, f32}, + aten.relu.default: {i64, bf16, u8, f32, i8, f64, i16, i32}, + aten.repeat_interleave.Tensor: {c64, i64, c128, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.roll.default: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.rrelu_with_noise.default: {bf16, f64, f32}, + aten.searchsorted.Tensor: {i64, bf16, f16, u8, f32, i8, f64, i16, i32}, + aten.searchsorted.Tensor_out: {i64, bf16, f16, u8, f32, i8, f64, i16, i32}, + aten.std_mean.correction: {bf16, f16, f64, f32}, + aten.take.default: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.take.out: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.tensordot.out: {i64, bf16, u8, f32, i8, f64, i16, i32}, + aten.to_sparse.default: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.to_sparse.sparse_dim: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.trace.default: {i8, i64, f64, i16, u8, i32, f32}, + aten.unique_consecutive.default: {i64, bf16, u8, b8, f32, i8, f64, i16, i32}, + aten.unique_dim.default: {i64, bf16, u8, b8, f32, i8, f64, i16, i32}, + aten.upsample_nearest3d.vec: {bf16, u8, f64, f32}, + aten.var_mean.correction: {bf16, f16, f64, f32}, + aten.vdot.default: {i64, bf16, u8, f32, i8, f64, i16, i32}, + aten.vdot.out: {i64, bf16, u8, f32, i8, f64, i16, i32}, + aten._det_lu_based_helper.default: {f32, f64}, # aten::_det_lu_based_helper + aten._linalg_check_errors.default: {c128, c64, f32, f64}, # aten::_local_scalar_dense + aten.cholesky.default: {f32, f64}, # aten::cholesky + aten.cholesky.out: {f32, f64}, # aten::cholesky.out + aten.cholesky_inverse.default: {f32, f64}, # aten::cholesky_inverse + aten.cholesky_inverse.out: {f32, f64}, # aten::cholesky_inverse.out + aten.cholesky_solve.default: {f32, f64}, # aten::_cholesky_solve_helper + aten.cholesky_solve.out: {f32, f64}, # aten::_cholesky_solve_helper + aten.eig.default: {f32, f64}, # aten::_local_scalar_dense + aten.geqrf.default: {f32, f64}, # aten::geqrf + aten.inverse.out: {f32, f64}, # aten::_local_scalar_dense + aten.linalg_cholesky_ex.L: {f32, f64}, # aten::linalg_cholesky_ex.L + aten.linalg_cholesky_ex.default: {f32, f64}, # aten::linalg_cholesky_ex + aten.linalg_eig.default: {f32, f64}, # aten::linalg_eig + aten.linalg_eigh.default: {f32, f64}, + aten.linalg_eigvalsh.out: {f32, f64}, # aten::linalg_eigvalsh.out + aten.linalg_householder_product.default: {f32, f64}, # aten::linalg_householder_product + aten.linalg_householder_product.out: {f32, f64}, # aten::linalg_householder_product.out + aten.linalg_lstsq.default: {f32, f64}, # aten::linalg_lstsq.out + aten.linalg_qr.default: {f32, f64}, # aten::_linalg_qr_helper + aten.linalg_slogdet.default: {f32, f64}, # aten::linalg_slogdet + aten.linalg_solve.default: {f32, f64}, # aten::linalg_solve + aten.linalg_solve.out: {f32, f64}, # aten::linalg_solve.out + aten.linalg_solve_triangular.default: {f32, f64}, # aten::linalg_solve_triangular + aten.linalg_solve_triangular.out: {f32, f64}, # aten::linalg_solve_triangular.out + aten.logdet.default: {f32, f64}, # aten::_local_scalar_dense, aten::nonzero + aten.lu_solve.default: {f32, f64}, # aten::lu_solve + aten.lu_solve.out: {f32, f64}, # aten::lu_solve.out + aten.ormqr.default: {f32, f64}, # aten::ormqr + aten.ormqr.out: {f32, f64}, # aten::ormqr.out + aten.symeig.default: {f32, f64}, # aten::_symeig_helper +} + +# these sometimes pass and sometimes fail +meta_dispatch_skips = { + 
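# Ops listed here are skipped outright because the meta cross-ref result is flaky (it sometimes matches and sometimes does not); ops that mismatch consistently are tracked in meta_dispatch_expected_failures instead. +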
aten.index_reduce.default: {bf16, f16, f64, f32}, + aten.index_reduce.out: {bf16, f16, f64, f32}, + aten._to_copy.default: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.addr.default: {b8}, + aten.addr.out: {b8}, + aten.aminmax.default: {i64, u8, b8, f32, i8, f64, i16, i32}, + aten.copy_.default: {c32}, + aten.cummax.default: {i64, bf16, u8, b8, f32, i8, f64, i16, i32}, + aten.cummin.default: {i64, bf16, u8, b8, f32, i8, f64, i16, i32}, + aten.index_add.default: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.index_add.out: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.isnan.default: {f64, f32}, + aten.mul.Scalar: {i64, bf16, f16, f32, i8, f64, i16, i32}, + aten.native_batch_norm.default: {f64, f32}, + aten.native_layer_norm.default: {bf16, f64, f32}, + aten.slice.Tensor: {c32}, + aten.inverse.default: {f32, f64}, + aten.linalg_pinv.atol_rtol_tensor: {f32, f64}, + aten.linalg_pinv.atol_rtol_tensor_out: {f32, f64}, + aten.empty.memory_format: {b8, bf16, c128, c64, c32, f16, f32, f64, i16, i32, i64, i8, u8}, +} + +meta_dispatch_device_expected_failures = defaultdict(dict) +meta_dispatch_device_skips = defaultdict(dict) + +meta_dispatch_device_expected_failures['cuda'] = { + aten._adaptive_avg_pool2d.default: {f16}, # aten::_adaptive_avg_pool2d + aten._adaptive_avg_pool3d.default: {bf16}, # aten::_adaptive_avg_pool3d + aten._conj_physical.default: {f16}, # aten::conj_physical.out + aten._convolution.default: {f16}, + aten._embedding_bag_forward_only.default: {bf16}, # aten::_embedding_bag_forward_only + aten._fft_c2c.default: {c32, f16}, # aten::_fft_c2c + aten._fft_c2c.out: {c32, f16}, # aten::_fft_c2c.out + aten._fft_c2r.default: {c32, f16}, # aten::_fft_c2r + aten._fft_c2r.out: {c32, f16}, # aten::_fft_c2r.out + aten._fft_r2c.default: {f16}, # aten::_fft_r2c + aten._fft_r2c.out: {f16}, # aten::_fft_r2c.out + aten._linalg_check_errors.default: {c128, c64, f32, f64}, # aten::_local_scalar_dense + aten._unique2.default: {f16}, # aten::_unique2 + aten._use_cudnn_ctc_loss.default: {f32, f64}, # aten::_use_cudnn_ctc_loss + aten.addbmm.default: {f16}, # aten::addbmm + aten.addbmm.out: {f16}, # aten::addbmm.out + aten.bernoulli.out: {f16}, # aten::bernoulli.out + aten.convolution.default: {f16}, + aten.cudnn_grid_sampler.default: {f16, f32, f64}, # aten::cudnn_grid_sampler + aten.diag.default: {f16}, # aten::diag.out + aten.diag.out: {bf16, f16}, # aten::diag.out + aten.dot.default: {f16}, # aten::dot + aten.dot.out: {f16}, # aten::dot + aten.geqrf.default: {f32, f64}, # aten::geqrf + aten.grid_sampler_2d.default: {f16}, # aten::grid_sampler_2d + aten.grid_sampler_3d.default: {f16}, # aten::grid_sampler_3d + aten.histc.default: {i16, i32, i64, i8}, # aten::histc + aten.histc.out: {i16, i32, i64, i8}, # aten::histc.out + aten.index.Tensor: {c32}, # aten::index.Tensor + aten.inverse.out: {f32, f64}, # aten::_local_scalar_dense + aten.kthvalue.default: {f16}, # aten::kthvalue.values + aten.linalg_cholesky_ex.L: {f32, f64}, # aten::linalg_cholesky_ex.L + aten.linalg_cholesky_ex.default: {f32, f64}, # aten::linalg_cholesky_ex + aten.linalg_eigvalsh.out: {f32, f64}, # aten::linalg_eigvalsh.out + aten.linalg_householder_product.default: {f32, f64}, # aten::linalg_householder_product + aten.linalg_householder_product.out: {f32, f64}, # aten::linalg_householder_product.out + aten.linalg_matrix_exp.default: {f16}, # aten::linalg_matrix_exp + aten.linalg_qr.default: {f32, f64}, # aten::_linalg_qr_helper + aten.linalg_solve_triangular.default: {f32, f64}, # 
aten::linalg_solve_triangular + aten.linalg_solve_triangular.out: {f32, f64}, # aten::linalg_solve_triangular.out + aten.log_sigmoid_forward.default: {bf16, f16, f64, f32}, + aten.log_sigmoid_forward.output: {f16}, # aten::log_sigmoid_forward.output + aten.logcumsumexp.default: {bf16, f16}, # aten::_logcumsumexp + aten.logcumsumexp.out: {bf16, f16}, # aten::_logcumsumexp.out + aten.logit.out: {f16}, + aten.max_pool3d_with_indices.default: {bf16, f16}, # aten::max_pool3d_with_indices + aten.max_unpool2d.default: {f16}, # aten::max_unpool2d + aten.max_unpool3d.default: {f16}, # aten::max_unpool3d + aten.median.default: {f16}, # aten::median + aten.median.dim: {f16}, # aten::median.dim_values + aten.multi_margin_loss.default: {bf16, f16}, # aten::multi_margin_loss + aten.multilabel_margin_loss_forward.default: {bf16, f16}, # aten::multilabel_margin_loss_forward + aten.multinomial.default: {f16}, # aten::multinomial + aten.multinomial.out: {f16}, # aten::multinomial.out + aten.mvlgamma.default: {f16}, # aten::_local_scalar_dense + aten.mvlgamma.out: {f16}, # aten::mvlgamma.out + aten.nanmedian.default: {f16}, # aten::nanmedian + aten.nanmedian.dim: {f16}, # aten::nanmedian.dim_values + aten.native_batch_norm.default: {bf16, f16}, # aten::var_mean.correction + aten.native_dropout.default: {bf16, f16, f32, f64}, + aten.native_group_norm.default: {bf16, f16}, # aten::var_mean.correction + aten.native_layer_norm.default: {f16}, # aten::var_mean.correction + aten.nll_loss2d_forward.default: {f16}, # aten::nll_loss2d_forward + aten.ormqr.default: {f32, f64}, # aten::ormqr + aten.ormqr.out: {f32, f64}, # aten::ormqr.out + aten.prelu.default: {f16}, # aten::prelu + aten.prod.default: {bf16, c32, f16}, # aten::prod + aten.reflection_pad2d.default: {f16}, # aten::reflection_pad2d + aten.relu.default: {f16}, # aten::relu + aten.rrelu_with_noise.default: {f16}, # aten::rrelu_with_noise + aten.tensordot.out: {f16}, # aten::tensordot.out + aten.trace.default: {b8, bf16, f16}, # aten::diag.out + aten.unique_consecutive.default: {f16}, # aten::unique_consecutive + aten.unique_dim.default: {f16}, # aten::unique_dim + aten.upsample_nearest3d.vec: {f16}, # aten::upsample_nearest3d.vec + aten.vdot.default: {f16}, # aten::vdot + aten.vdot.out: {f16}, # aten::vdot +} + +meta_dispatch_device_skips['cuda'] = { + aten._conj.default: {c32, f16}, + aten._linalg_svd.default: {f32, f64}, + aten.cudnn_batch_norm.default: {f32, f64}, + aten.cummax.default: {f16}, + aten.cummin.default: {f16}, + aten.inverse.default: {f32, f64}, + aten.slice.Tensor: {f16}, + # ROCm stuff; technically this should be expected failure but it's + # not worth it; these should get unified anyway + aten.miopen_batch_norm.default: {f32}, +} + +class MetaCrossRefDispatchMode(torch.utils._python_dispatch.TorchDispatchMode): + test_case: TestCase + device: torch.device + dtype: torch.dtype + + def __init__(self, test_case, *, device, dtype): + self.test_case = test_case + # save TLS + self.precision = test_case.precision + self.rel_tol = test_case.rel_tol + self.device_type = torch.device(device).type + self.dtype = dtype + + def __torch_dispatch__(self, func, types, args=(), kwargs=None): + kwargs = kwargs or {} + + self.test_case.precision = self.precision + self.test_case.rel_tol = self.rel_tol + + if self.dtype in meta_dispatch_skips.get(func, set()): + test_expect = TestExpect.SKIP + elif self.dtype in meta_dispatch_device_skips[self.device_type].get(func, set()): + test_expect = TestExpect.SKIP + elif self.dtype in 
meta_dispatch_expected_failures.get(func, set()): + test_expect = TestExpect.XFAILURE + elif self.dtype in meta_dispatch_device_expected_failures[self.device_type].get(func, set()): + test_expect = TestExpect.XFAILURE + else: + test_expect = TestExpect.SUCCESS + + return run_meta_crossref( + self.test_case, + test_expect, + func, + args, + kwargs, + dtype=self.dtype, + device_type=self.device_type, + ) + + +# NB: we're running these tests only on CUDA because there are some +# inconsistencies between CUDA and CPU, and running on CUDA makes it easier +# to ignore the CPU case when inconsistencies arise. Ideally we deal +# with the inconsistencies but this takes time. +class TestMeta(TestCase): + @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") + @onlyCUDA + @skipIfCrossRef + @suppress_warnings + @ops(op_db) + def test_meta(self, device, dtype, op): + # run the OpInfo sample inputs, cross-referencing them with the + # meta implementation and check the results are the same. All + # the heavy lifting happens in MetaCrossRefFunctionMode + func = op.get_op() + samples = op.sample_inputs(device, dtype, requires_grad=False) + for sample_input in samples: + args = [sample_input.input] + list(sample_input.args) + kwargs = sample_input.kwargs + with MetaCrossRefFunctionMode.push(self, dtype=dtype, device=device): + expected = func(*args, **kwargs) + if isinstance(expected, torch.Tensor) and op.supports_out: + func(*args, **kwargs, out=expected) + + @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") + @onlyCUDA + @skipIfCrossRef + @suppress_warnings + @ops(op_db) + def test_dispatch_meta(self, device, dtype, op): + func = op.get_op() + samples = op.sample_inputs(device, dtype, requires_grad=False) + for sample_input in samples: + args = [sample_input.input] + list(sample_input.args) + kwargs = sample_input.kwargs + with MetaCrossRefDispatchMode.push(self, dtype=dtype, device=device): + expected = func(*args, **kwargs) + if isinstance(expected, torch.Tensor) and op.supports_out: + func(*args, **kwargs, out=expected) + +instantiate_device_type_tests(TestMeta, globals()) + +if __name__ == "__main__": + run_tests() diff --git a/test/test_mkldnn.py b/test/test_mkldnn.py index bfaca50e2090..cb9eb4828cac 100644 --- a/test/test_mkldnn.py +++ b/test/test_mkldnn.py @@ -283,6 +283,56 @@ def test_conv2d_bf16(self): def test_conv3d_bf16(self): self._test_conv_bf16_base(dim=3) + def _test_conv2d_nhwc_base(self, dtype): + conv_module = torch.nn.Conv2d + input_shapes = (224, 224) + options = itertools.product([True, False], [True, False], [1, 2], [1, 4]) + for train, bias, dilation, groups in options: + N = torch.randint(3, 10, (1,)).item() + M = torch.randint(1, 3, (1,)).item() * groups + C = torch.randint(1, 3, (1,)).item() * groups + x_shape = (N, C) + input_shapes + x = torch.randn(x_shape, dtype=dtype) + # conv1: mkldnn conv2d in contiguous memory format (nchw) + # conv2: mkldnn conv2d in channels last memory format (nhwc) + conv1 = conv_module(in_channels=C, + out_channels=M, + kernel_size=3, + stride=2, + padding=1, + dilation=dilation, + bias=bias, + groups=groups).to(dtype=dtype) + conv2 = copy.deepcopy(conv1).to(memory_format=torch.channels_last) + x1 = x.clone() + x2 = x.clone().to(memory_format=torch.channels_last) + if train: + x1.requires_grad_() + x2.requires_grad_() + y1 = conv1(x1) + y2 = conv2(x2) + self.assertEqual(y1, y2) + if train: + y1.sum().backward() + y2.sum().backward() + self.assertTrue(x2.grad.is_contiguous(memory_format=torch.channels_last)) + 
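# The contiguous (nchw) and channels-last (nhwc) backward paths are not necessarily bitwise identical, so the weight-gradient check below uses a relaxed tolerance rather than exact equality. +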
self.assertEqual(conv1.weight.grad, + conv2.weight.grad, + atol=1e-3, + rtol=1e-3) + if bias: + self.assertEqual(conv1.bias.grad, conv2.bias.grad) + self.assertEqual(x1.grad, x2.grad) + + def test_conv2d_nhwc(self): + self._test_conv2d_nhwc_base(dtype=torch.float32) + + @unittest.skipIf(IS_WINDOWS, "Limit support for bf16 path") + def test_conv2d_nhwc_bf16(self): + # when has_bf16_support() returns false, bf16 CPU conv will fall back to thnn impl + if has_bf16_support(): + self._test_conv2d_nhwc_base(dtype=torch.bfloat16) + def test_conv2d_legacy_jit_model(self): """ MKLDNN integration used to serialize models with 5d weight for grouped @@ -400,6 +450,74 @@ def test_gelu_bf16(self): msg, lambda: m(x2)) + def _test_prelu_base(self, size, num_channels): + x = torch.randn(size, dtype=torch.float32) + x1 = x.clone().requires_grad_() + x2 = x.clone().to_mkldnn().requires_grad_() + x3 = x.clone().to_mkldnn().requires_grad_() + m1 = torch.nn.PReLU(num_channels) + m2 = mkldnn_utils.to_mkldnn(copy.deepcopy(m1)) + m3 = copy.deepcopy(m1) + y1 = m1(x1) + y2 = m2(x2).to_dense() + y3 = m3(x3).to_dense() # Only convert data to mkldnn, weight is Aten tensor + loss1 = y1.sum() + loss1.backward() + loss2 = y2.sum() + loss2.backward() + loss3 = y3.sum() + loss3.backward() + self.assertEqual(y1, y2) + self.assertEqual(y1, y3) + self.assertEqual(x1.grad, x2.grad.to_dense()) + self.assertEqual(x1.grad, x3.grad.to_dense()) + + def test_prelu(self): + self._test_prelu_base(torch.Size([16]), 1) + self._test_prelu_base(torch.Size([16, 64]), 1) + self._test_prelu_base(torch.Size([16, 64]), 64) + self._test_prelu_base(torch.Size([16, 64, 112]), 1) + self._test_prelu_base(torch.Size([16, 64, 112]), 64) + self._test_prelu_base(torch.Size([16, 64, 112, 112]), 1) + self._test_prelu_base(torch.Size([16, 64, 112, 112]), 64) + self._test_prelu_base(torch.Size([16, 64, 112, 112, 1]), 1) + self._test_prelu_base(torch.Size([16, 64, 112, 112, 1]), 64) + + @unittest.skipIf(IS_WINDOWS, "Limit support for bf16 path") + def _test_prelu_bf16_base(self, size, num_channels): + if has_bf16_support(): + x = torch.randn(size, dtype=torch.float32) + x_fp32 = x.clone().to_mkldnn().requires_grad_() + x_bf16 = x.clone().to_mkldnn(torch.bfloat16).requires_grad_() + m = mkldnn_utils.to_mkldnn(torch.nn.PReLU()) + m_bf16 = mkldnn_utils.to_mkldnn(torch.nn.PReLU(), torch.bfloat16) + + y = m(x_fp32).to_dense() + y_bf16 = m_bf16(x_bf16).to_dense() + self.assertEqual(y, y_bf16.to(torch.float32), atol=1e-1, rtol=1e-3) + + loss = y.sum() + loss.backward() + loss_bf16 = y_bf16.sum() + loss_bf16.backward() + self.assertEqual(x_fp32.grad.to_dense(), x_bf16.grad.to_dense(torch.float32)) + else: + x_bf16 = torch.randn(size, dtype=torch.bfloat16).requires_grad_() + m_bf16 = mkldnn_utils.to_mkldnn(torch.nn.PReLU(), torch.bfloat16) + msg = r"bf16 path needs the cpu support avx512bw, avx512vl and avx512dq" + self.assertRaisesRegex(RuntimeError, + msg, + lambda: m_bf16(x_bf16)) + + def test_prelu_bf16(self): + self._test_prelu_bf16_base(torch.Size([16]), 1) + self._test_prelu_bf16_base(torch.Size([16, 64]), 1) + self._test_prelu_bf16_base(torch.Size([16, 64]), 64) + self._test_prelu_bf16_base(torch.Size([16, 64, 112]), 1) + self._test_prelu_bf16_base(torch.Size([16, 64, 112]), 64) + self._test_prelu_bf16_base(torch.Size([16, 64, 112, 112, 1]), 1) + self._test_prelu_bf16_base(torch.Size([16, 64, 112, 112, 1]), 64) + def _test_max_pool_base(self, dim, input): pool_module = {2: torch.nn.MaxPool2d, 3: torch.nn.MaxPool3d} for stride in [1, 2, 3]: diff --git 
a/test/test_mobile_optimizer.py b/test/test_mobile_optimizer.py index bb42702f536b..c7fc823a9364 100644 --- a/test/test_mobile_optimizer.py +++ b/test/test_mobile_optimizer.py @@ -3,9 +3,8 @@ import unittest import torch import torch.nn as nn -import torch.backends.xnnpack import torch.utils.bundled_inputs -from torch.testing._internal.common_utils import TestCase, run_tests +from torch.testing._internal.common_utils import TestCase, run_tests, skipIfNoXNNPACK from torch.testing._internal.jit_utils import get_forward, get_forward_graph from torch.utils.mobile_optimizer import (LintCode, generate_mobile_module_lints, @@ -24,9 +23,7 @@ class TestOptimizer(TestCase): - @unittest.skipUnless(torch.backends.xnnpack.enabled, - " XNNPACK must be enabled for these tests." - " Please build with USE_XNNPACK=1.") + @skipIfNoXNNPACK def test_optimize_for_mobile(self): batch_size = 2 input_channels_per_group = 6 @@ -151,7 +148,7 @@ def forward(self, x): bn_scripted_module = torch.jit.script(bn_test_module) bn_scripted_module.eval() - self.assertEqual(len(torch.jit.export_opnames(bn_scripted_module)), 14) + self.assertEqual(len(torch.jit.export_opnames(bn_scripted_module)), 11) FileCheck().check_count("prim::CallMethod[name=\"forward\"]", 2, exactly=True) \ .run(str(get_forward(bn_scripted_module._c).graph)) @@ -252,7 +249,7 @@ def foo(self, x): bn_no_forward_scripted_module = torch.jit.script(bn_test_no_forward_module) bn_no_forward_scripted_module.eval() - self.assertEqual(len(torch.jit.export_opnames(bn_no_forward_scripted_module)), 14) + self.assertEqual(len(torch.jit.export_opnames(bn_no_forward_scripted_module)), 11) FileCheck().check_count("prim::CallMethod[name=\"forward\"]", 2, exactly=True) \ .run(bn_no_forward_scripted_module.foo.graph) @@ -265,9 +262,7 @@ def foo(self, x): rtol=1e-2, atol=1e-3) - @unittest.skipUnless(torch.backends.xnnpack.enabled, - " XNNPACK must be enabled for these tests." - " Please build with USE_XNNPACK=1.") + @skipIfNoXNNPACK def test_quantized_conv_no_asan_failures(self): # There were ASAN failures when fold_conv_bn was run on # already quantized conv modules. Verifying that this does @@ -361,6 +356,7 @@ def get_lint_count_by_type(lint_type, module_lint_List): bi_module_lint_list = generate_mobile_module_lints(bi_module) self.assertEqual(len(bi_module_lint_list), 0) + @skipIfNoXNNPACK def test_preserve_bundled_inputs_methods(self): class MyBundledInputModule(torch.nn.Module): def __init__(self): @@ -415,9 +411,7 @@ def get_all_bundled_inputs(self): incomplete_bi_module_optim = optimize_for_mobile(incomplete_bi_module, preserved_methods=['get_all_bundled_inputs']) self.assertTrue(hasattr(incomplete_bi_module_optim, 'get_all_bundled_inputs')) - @unittest.skipUnless(torch.backends.xnnpack.enabled, - " XNNPACK must be enabled for these tests." 
- " Please build with USE_XNNPACK=1.") + @skipIfNoXNNPACK def test_hoist_conv_packed_params(self): if 'qnnpack' not in torch.backends.quantized.supported_engines: @@ -511,6 +505,7 @@ def _quant_script_and_optimize(model): m_optim_res = m_optim(data) torch.testing.assert_close(m_res, m_optim_res, rtol=1e-2, atol=1e-3) + @skipIfNoXNNPACK @unittest.skipUnless(HAS_TORCHVISION, "Needs torchvision") def test_mobilenet_optimize_for_mobile(self): m = torchvision.models.mobilenet_v3_small() diff --git a/test/test_model_dump.py b/test/test_model_dump.py index 10f3fe39b373..a8add0e2cd92 100644 --- a/test/test_model_dump.py +++ b/test/test_model_dump.py @@ -10,9 +10,10 @@ import unittest import torch +import torch.backends.xnnpack import torch.utils.model_dump import torch.utils.mobile_optimizer -from torch.testing._internal.common_utils import TestCase, run_tests, IS_WINDOWS +from torch.testing._internal.common_utils import TestCase, run_tests, IS_WINDOWS, skipIfNoXNNPACK from torch.testing._internal.common_quantized import supported_qengines @@ -170,6 +171,7 @@ def test_quantized_model(self): qmodel = self.get_quant_model() self.do_dump_model(torch.jit.script(qmodel)) + @skipIfNoXNNPACK @unittest.skipUnless("qnnpack" in supported_qengines, "QNNPACK not available") def test_optimized_quantized_model(self): qmodel = self.get_quant_model() diff --git a/test/test_module_init.py b/test/test_module_init.py index 589db4b71622..b568f210e550 100644 --- a/test/test_module_init.py +++ b/test/test_module_init.py @@ -166,6 +166,9 @@ def build_constructor_arg_db(): torch.nn.UpsamplingBilinear2d: ((), {}), torch.nn.UpsamplingNearest2d: ((), {}), torch.nn.ZeroPad2d: ((0,), {}), + torch.nn.qat.Conv1d: ((3, 3, 3), { + 'qconfig': torch.ao.quantization.default_qconfig, + }), torch.nn.qat.Conv2d: ((3, 3, 3), { 'qconfig': torch.ao.quantization.default_qconfig, }), @@ -206,7 +209,7 @@ def build_constructor_arg_db(): torch.nn.quantized.EmbeddingBag: ((10, 3), { 'factory_kwargs': {}, }), - torch.nn.quantized.GroupNorm: ((2, 3, torch.nn.Parameter(torch.tensor(2.)), + torch.nn.quantized.GroupNorm: ((2, 4, torch.nn.Parameter(torch.tensor(2.)), torch.nn.Parameter(torch.tensor(2.)), 0.1, 0), {}), torch.nn.quantized.Hardswish: ((0.1, 0,), {}), torch.nn.quantized.InstanceNorm1d: ((2, torch.nn.Parameter(torch.tensor(2.)), @@ -228,6 +231,7 @@ def build_constructor_arg_db(): }), torch.nn.quantized.ReLU6: ((), {}), torch.nn.quantized.Sigmoid: ((0.1, 0), {}), + torch.nn.quantized.Softmax: ((), {}), torch.nn.quantized.FloatFunctional: ((), {}), torch.nn.quantized.FXFloatFunctional: ((), {}), torch.nn.quantized.QFunctional: ((), {}), diff --git a/test/test_modules.py b/test/test_modules.py index 448f8f5fa751..292382e83e9c 100644 --- a/test/test_modules.py +++ b/test/test_modules.py @@ -8,10 +8,10 @@ import torch from torch.testing._internal.common_device_type import ( - instantiate_device_type_tests, onlyCUDA, toleranceOverride, tol) + instantiate_device_type_tests, onlyCUDA, toleranceOverride, tol, skipMeta) from torch.testing._internal.common_modules import module_db, modules from torch.testing._internal.common_utils import ( - TestCase, run_tests, freeze_rng_state, mock_wrapper, get_tensors_from, gradcheck, gradgradcheck) + TestCase, run_tests, freeze_rng_state, mock_wrapper, get_tensors_from, gradcheck, gradgradcheck, skipIfMps) from unittest.mock import patch, call @@ -40,6 +40,7 @@ def _check_module(items, name, device=device, dtype=dtype): _check_module(module.named_parameters(), "Parameter") _check_module(module.named_buffers(), 
"Buffer") + @skipIfMps # the test doesn't work on MPS as double types are not supported @modules(module_db) def test_forward(self, device, dtype, module_info): module_cls = module_info.module_cls @@ -201,6 +202,7 @@ def test_repr(self, device, dtype, module_info): m.__repr__() str(m) + @skipIfMps @modules(module_db) def test_pickle(self, device, dtype, module_info): # Test that module can be pickled and unpickled. @@ -233,6 +235,7 @@ def test_pickle(self, device, dtype, module_info): @modules([module_info for module_info in module_db if 'inplace' in signature(module_info.module_cls).parameters]) + @skipMeta def test_check_inplace(self, device, dtype, module_info): # Check if the inplace variant of the module gives the same result as the out of place # variant. @@ -310,6 +313,7 @@ def inner_zero_grad(obj): obj.grad = None self._traverse_obj(obj, inner_zero_grad) + @skipIfMps @modules(module_db) def test_non_contiguous_tensors(self, device, dtype, module_info): # Check modules work with non-contiguous tensors @@ -543,7 +547,7 @@ def check_backward(cpu_output, gpu_output): for cpu_output, gpu_output in zip(flatten_cpu_outputs, flatten_gpu_outputs): check_backward(cpu_output, gpu_output) - + @skipIfMps @modules(module_db) def test_memory_format(self, device, dtype, module_info): module_cls = module_info.module_cls diff --git a/test/test_mps.py b/test/test_mps.py new file mode 100644 index 000000000000..04804261505f --- /dev/null +++ b/test/test_mps.py @@ -0,0 +1,4006 @@ +# -*- coding: utf-8 -*- +# Owner(s): ["module: mps"] + +import sys +import math +import random +import unittest +import warnings +import torch +import torch.nn as nn +import torch.nn.functional as F +import itertools +from torch.nn import Parameter +from torch.testing._internal.common_utils import run_tests, TestCase, download_file, TEST_WITH_UBSAN +import torch.backends.mps +from torch.distributions import (Uniform) + +from torch.testing._internal.common_nn import NNTestCase +import numpy as np +import torch + +# Same logic as test_cuda.py +if not torch.backends.mps.is_available(): + print('MPS not available, skipping tests', file=sys.stderr) + TestCase = object # noqa: F811 + NNTestCase = object # noqa: F811 + + +class MPSReluTest(TestCase): + def _npRelu(self, np_features): + return np.maximum(np_features, np.zeros(np_features.shape)).astype(np_features.dtype) + + def testNpRelu(self): + torch.testing.assert_allclose( + np.array([[0., 0.7, 0.0, 0.3, 0.0], [0.1, 0.0, 0.5, 0.0, 0.9]]), + self._npRelu( + np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7, + 0.9]]))) + + def _testRelu(self, np_features, device): + np_relu = self._npRelu(np_features) + # Convert the numpy array to a PyTorch Tensor, + # and move the Tensor to the CPU/GPU based on the "device" parameter + py_tensor = torch.from_numpy(np_features).to(device) + py_relu = torch.nn.ReLU(inplace=False)(py_tensor) + py_relu_cpu = py_relu.to("cpu") + + torch.testing.assert_allclose(np_relu, py_relu_cpu) + + def _testReluInPlace(self, np_features, device): + np_relu = self._npRelu(np_features) + # Convert the numpy array to a PyTorch Tensor, + # and move the Tensor to the CPU/GPU based on the "device" parameter + py_tensor = torch.from_numpy(np_features).to(device) + py_relu = torch.nn.ReLU(inplace=True)(py_tensor) + py_relu_cpu = py_relu.to("cpu") + + torch.testing.assert_allclose(np_relu, py_relu_cpu) + # Inplace Relu modifies the initial input and it should match the output of Relu + torch.testing.assert_allclose(np_relu, py_tensor.to("cpu")) + + def 
testNumbersCPU(self): + for t in [np.int32]: + # Force execution on CPU even if a GPU kernel is available for the type. + self._testRelu( + np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t), + device="cpu") + self._testReluInPlace( + np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t), + device="cpu") + + def testNumbersGPU(self): + for t in [np.float16, np.float32]: + self._testRelu( + np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t), + device="mps") + self._testReluInPlace( + np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t), + device="mps") + +class MatmulTest(TestCase): + def _helper(self, shape_tensor_1, shape_tensor_2, expand_tensor_1_shape=None, expand_tensor_2_shape=None): + if expand_tensor_1_shape: + tensor1_mps = torch.randn(shape_tensor_1, device="mps").expand(expand_tensor_1_shape) + else: + tensor1_mps = torch.randn(shape_tensor_1, device="mps") + + if expand_tensor_2_shape: + tensor2_mps = torch.randn(shape_tensor_2, device="mps").expand(expand_tensor_2_shape) + else: + tensor2_mps = torch.randn(shape_tensor_2, device="mps") + + tensor1_cpu = tensor1_mps.to("cpu") + tensor2_cpu = tensor2_mps.to("cpu") + + matmul_cpu = torch.matmul(tensor1_cpu, tensor2_cpu) + matmul_mps = torch.matmul(tensor1_mps, tensor2_mps) + + self.assertEqual(matmul_cpu, matmul_mps.to("cpu")) + + def test_vector_x_vector(self): + # uses `dot` + self._helper(3, 3) + + def test_matrix_x_vector(self): + # uses `addmv` + self._helper((3, 4), 4) + + def test_batched_matrix_x_broadcasted_vector(self): + self._helper((10, 3, 4), 4) + + def test_batched_matrix_x_batched_matrix(self): + # uses `bmm.out` + self._helper((10, 3, 4), (10, 4, 5)) + + def test_batched_matrix_x_broadcasted_matrix(self): + self._helper((10, 3, 4), (4, 5)) + + +class MPSLeakyReluTest(TestCase): + def _npLeakyRelu(self, np_features, negative_slope=0.1): + return np.maximum(np_features, negative_slope * np_features).astype(np_features.dtype) + + def testNpLeakyRelu(self): + torch.testing.assert_allclose( + np.array([[-0.09, 0.7, -0.05, 0.3, -0.01], + [0.1, -0.03, 0.5, -0.07, 0.9]]), + self._npLeakyRelu( + np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7, + 0.9]]), + negative_slope=0.1)) + + def _testLeakyRelu(self, np_features, negative_slope, device): + cpu_x = torch.from_numpy(np_features).requires_grad_() + mps_x = torch.from_numpy(np_features).to('mps').requires_grad_() + relu_op = torch.nn.LeakyReLU(negative_slope) + + cpu_leaky_relu = relu_op(cpu_x) + mps_leaky_relu = relu_op(mps_x) + torch.testing.assert_allclose(cpu_leaky_relu, mps_leaky_relu.to('cpu')) + + # test backward pass + cpu_grad = torch.ones_like(cpu_leaky_relu) + mps_grad = cpu_grad.to('mps') + cpu_leaky_relu.backward(gradient=cpu_grad) + mps_leaky_relu.backward(gradient=mps_grad) + torch.testing.assert_allclose(cpu_x.grad, mps_x.grad.to('cpu')) + + def testNumbersCPU(self): + for t in [np.float32]: + self._testLeakyRelu( + np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t), + negative_slope=0.2, + device="cpu") + + +class TestAvgPool(TestCase): + def _sum_pool2d(self, x, kernel_size): + windows = torch.nn.functional.unfold(x, kernel_size=kernel_size, stride=kernel_size) + return torch.sum(windows, dim=1) + + def _sum_pool3d(self, x, kernel_size): + # Because unfold does not support 3D sliding window we will split tensor to multiple tensors and calculate sum + h = kernel_size[0] + splited_x = [t.sum(0) for t in x.split(h) if t.size(0) == h] + # sum_pool2d assumes tensor in (1, 1, n, m) view, so unsqueeze two 
times + splited_x = [self._sum_pool2d(t.unsqueeze(0).unsqueeze(0), kernel_size[1:]) for t in splited_x] + joined_x = torch.cat(splited_x) + return joined_x.view(1, joined_x.numel()) + + def _avg_pool2d(self, x, kernel_size): + size = reduce((lambda x, y: x * y), kernel_size) + return self._sum_pool2d(x, kernel_size) / size + + def _avg_pool3d(self, x, kernel_size): + size = reduce((lambda x, y: x * y), kernel_size) + return self._sum_pool3d(x, kernel_size) / size + + def test_avg_pool2d_with_zero_divisor(self): + self.assertRaisesRegex(RuntimeError, "divisor must be not zero", + lambda: F.avg_pool2d(torch.zeros(3, 3, 3), (2, 2), divisor_override=0)) + + def test_doubletensor_avg_pool2d_with_divisor(self): + n, m = 3, 3 + input = torch.rand(1, 1, n, m) + for i in range(1, n + 1): + for j in range(1, m + 1): + for divisor in [1, 7, i * j]: + actual = F.avg_pool2d(input[0], (i, j), divisor_override=divisor) + actual = actual.view(1, actual.numel()) + expected = self._sum_pool2d(input, (i, j)) / divisor + self.assertEqual(actual, expected, rtol=0, atol=1e-5) + + def test_avg_pool2d_ceil_mode(self): + # Regression test for gh-36977 + x = 10 * torch.randn((1, 16, 4, 4)) + y = torch.nn.functional.avg_pool2d( + x, ceil_mode=True, count_include_pad=True, kernel_size=(1, 2), + padding=(0, 1), stride=2) + self.assertTrue(not torch.isnan(y).any()) + y = torch.nn.functional.avg_pool2d( + x.to('mps'), ceil_mode=True, count_include_pad=True, kernel_size=(1, 2), + padding=(0, 1), stride=2) + self.assertTrue(not torch.isnan(y).any()) + + +class TestMPS(TestCase): + # @dtypes(*product([torch.float32, torch.int32], (torch.uint8, torch.bool))) + def test_masked_fill(self): + device = "mps" + dtype = torch.float32 + mask_dtype = torch.bool + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + num_dest = 10 + dst = torch.zeros(num_dest, dtype=dtype, device=device) + mask = torch.randint(2, (num_dest,), dtype=mask_dtype, device=device) + val = random.random() + dst2 = torch.zeros(num_dest, dtype=dtype) + mask_cpu = mask.to("cpu") + + dst.masked_fill_(mask, val) + for i in range(num_dest): + if mask_cpu[i]: + dst2[i] = val + self.assertEqual(dst.to("cpu"), dst2, atol=0, rtol=0) + + # test non-contiguous case + dst = ((torch.randn(num_dest, num_dest, num_dest) * 10).to(dtype)).permute((2, 0, 1)) + dst2 = dst.contiguous() + if dtype.is_complex: + mask = dst.abs() > 0 + else: + mask = dst > 0 + self.assertTrue(not dst.is_contiguous()) + self.assertTrue(dst2.is_contiguous()) + dst.masked_fill_(mask.to(mask_dtype), val) + dst2.masked_fill_(mask.to(mask_dtype), val) + self.assertEqual(dst, dst2, atol=0, rtol=0) + + if mask_dtype == torch.uint8: + self.assertEqual(len(w), 3) + + warn = 'masked_fill_ received a mask with dtype torch.uint8,' + for wi in w: + self.assertEqual(str(wi.message)[0:52], str(warn)) + else: + self.assertEqual(len(w), 0) + + def test_exp(self, device="mps", dtype=torch.float): + for v in (2, -2) + ((1j, 1 + 1j) if dtype.is_complex else ()): + b = torch.arange(18, device="cpu") / 3 * math.pi + a = torch.tensor(v, dtype=dtype, device="cpu") * b + a = a.to(dtype).to("mps") + self.compare_with_numpy(torch.exp, np.exp, a) + + def test_exp1(self, device="mps", dtype=torch.float): + input = torch.tensor([-0.1, 3.0, -0.9]).to('mps') + output = torch.exp(input).to('cpu') + print(output) + + def _testLeakyRelu(self, np_features, negative_slope, device): + cpu_x = torch.from_numpy(np_features).requires_grad_() + mps_x = 
torch.from_numpy(np_features).to('mps').requires_grad_() + relu_op = torch.nn.LeakyReLU(negative_slope) + + cpu_leaky_relu = relu_op(cpu_x) + mps_leaky_relu = relu_op(mps_x) + torch.testing.assert_allclose(cpu_leaky_relu, mps_leaky_relu.to('cpu')) + + # test backward pass + cpu_grad = torch.ones_like(cpu_leaky_relu) + mps_grad = cpu_grad.to('mps') + cpu_leaky_relu.backward(gradient=cpu_grad) + mps_leaky_relu.backward(gradient=mps_grad) + torch.testing.assert_allclose(cpu_x.grad, mps_x.grad.to('cpu')) + + def testNumbersGPU(self): + for t in [np.float32]: + self._testLeakyRelu( + np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t), + negative_slope=0.1, + device="mps") + + def test_fill(self): + + def helper(val, shape): + tensor = torch.zeros(shape, device='mps') + tensor_mps = tensor.fill_(val) + tensor_mps = torch.tanh(tensor_mps) + + tensor_0 = torch.zeros(shape, device='cpu') + tensor_cpu = tensor_0.fill_(val) + tensor_cpu = torch.tanh(tensor_cpu) + + self.assertEqual(tensor_mps, tensor_cpu) + + helper(0, [1024]) + helper(0.2, [2, 3]) + + def test_mm(self): + B = torch.ones(5, 6).to("mps") + C = torch.ones(6, 5).to("mps") + D = torch.mm(B, C).cpu() + torch.testing.assert_allclose(D, torch.full((5, 5), 6.0)) + + def test_addmm(self): + A = torch.ones(5, 5).to("mps") + B = torch.ones(5, 6).to("mps") + C = torch.ones(6, 5).to("mps") + D = torch.addmm(A, B, C).to("cpu") + torch.testing.assert_allclose(D, torch.full((5, 5), 7.0)) + + def test_bmm(self): + batch1_cpu = torch.randn(10, 3, 4) + batch2_cpu = torch.randn(10, 4, 5) + + batch1_mps = batch1_cpu.detach().clone().to("mps") + batch2_mps = batch2_cpu.detach().clone().to("mps") + + output_cpu = torch.bmm(batch1_cpu, batch2_cpu) + output_mps = torch.bmm(batch1_mps, batch2_mps) + + self.assertEqual(output_cpu, output_mps) + self.assertEqual(output_cpu.size(), output_mps.size()) + + def test_addbmm(self): + M_cpu = torch.randn(3, 5) + batch1_cpu = torch.randn(10, 3, 4) + batch2_cpu = torch.randn(10, 4, 5) + + M_mps = M_cpu.detach().clone().to("mps") + batch1_mps = batch1_cpu.detach().clone().to("mps") + batch2_mps = batch2_cpu.detach().clone().to("mps") + + output_cpu = torch.addbmm(M_cpu, batch1_cpu, batch2_cpu) + output_mps = torch.addbmm(M_mps, batch1_mps, batch2_mps) + + self.assertEqual(output_cpu, output_mps) + self.assertEqual(output_cpu.size(), output_mps.size()) + + def test_baddbmm(self): + M_cpu = torch.randn(3, 5) + batch1_cpu = torch.randn(10, 3, 4) + batch2_cpu = torch.randn(10, 4, 5) + alpha = 1.2 + beta = 0.8 + + M_mps = M_cpu.detach().clone().to("mps") + batch1_mps = batch1_cpu.detach().clone().to("mps") + batch2_mps = batch2_cpu.detach().clone().to("mps") + + output_cpu = torch.baddbmm(M_cpu, batch1_cpu, batch2_cpu, beta=beta, alpha=alpha) + output_mps = torch.baddbmm(M_mps, batch1_mps, batch2_mps, beta=beta, alpha=alpha) + + self.assertEqual(output_cpu, output_mps) + self.assertEqual(output_cpu.size(), output_mps.size()) + + def test_local_scalar_dense_mps(self): + x_cpu = torch.randn(1) + y_mps = x_cpu.to("mps") + torch.testing.assert_allclose(x_cpu.item(), y_mps.item()) + + def _linear_helper(self, in_features, out_features, shape, bias=True, backward_pass=False): + cpu_linear = torch.nn.Linear(in_features=in_features, out_features=out_features, device="cpu", bias=bias) + mps_linear = torch.nn.Linear(in_features=in_features, out_features=out_features, device="mps", bias=bias) + + # Use the same weights and bias as the ones from the cpu + mps_linear.weight.data = cpu_linear.weight.data.detach().clone().to("mps") 
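+ # Assigning through .data swaps in the detached copy without re-registering the Parameter or recording the copy in autograd.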
+ + if bias: + mps_linear.bias.data = cpu_linear.bias.data.detach().clone().to("mps") + + linear_mps_input = torch.randn(shape).to('mps') + linear_cpu_input = linear_mps_input.detach().clone().to('cpu') + + if backward_pass: + linear_mps_input = linear_mps_input.requires_grad_() + linear_cpu_input = linear_cpu_input.requires_grad_() + + linear_cpu_output = cpu_linear(linear_cpu_input) + linear_mps_output = mps_linear(linear_mps_input) + + self.assertEqual(linear_cpu_output, linear_mps_output.to('cpu')) + self.assertEqual(linear_cpu_output.size(), linear_mps_output.size()) + + if backward_pass: + cpu_grad = torch.ones_like(linear_cpu_output) + grad = cpu_grad.to('mps') + + linear_cpu_output.backward(gradient=cpu_grad) + linear_mps_output.backward(gradient=grad) + + self.assertEqual(linear_cpu_input.grad.size(), linear_mps_input.grad.size()) + self.assertEqual(linear_cpu_input.grad, linear_mps_input.grad.to("cpu"), atol=8e-04, rtol=10.4e-05) + + self.assertEqual(cpu_linear.weight.grad.size(), mps_linear.weight.grad.size()) + self.assertEqual(cpu_linear.weight.grad, mps_linear.weight.grad.to("cpu"), atol=8e-04, rtol=10.4e-05) + if bias: + self.assertEqual(cpu_linear.bias.grad.size(), mps_linear.bias.grad.size()) + self.assertEqual(cpu_linear.bias.grad, mps_linear.bias.grad.to("cpu"), atol=8e-04, rtol=10.4e-05) + + def test_linear2D(self): + self._linear_helper(in_features=2, out_features=3, shape=((4, 2)), bias=True, backward_pass=False) + + def test_linear2D_backward(self): + self._linear_helper(in_features=2, out_features=3, shape=((4, 2)), bias=True, backward_pass=True) + + def test_linear2D_no_bias(self): + self._linear_helper(in_features=2, out_features=3, shape=((4, 2)), bias=False, backward_pass=False) + + def test_linear2D_no_bias_backward(self): + self._linear_helper(in_features=2, out_features=3, shape=((4, 2)), bias=False, backward_pass=True) + + def test_linear3D(self): + self._linear_helper(in_features=200, out_features=33278, shape=((35, 20, 200)), bias=True, backward_pass=False) + + def test_linear3D_backward(self): + self._linear_helper(in_features=200, out_features=33278, shape=((35, 20, 200)), bias=True, backward_pass=True) + + def test_linear3D_no_bias(self): + self._linear_helper(in_features=200, out_features=33278, shape=((35, 20, 200)), bias=False, backward_pass=False) + + def test_linear3D_no_bias_backward(self): + self._linear_helper(in_features=200, out_features=33278, shape=((35, 20, 200)), bias=False, backward_pass=True) + + def test_uniform(self): + low = torch.zeros(5, 5, requires_grad=True) + high = (torch.ones(5, 5) * 3).requires_grad_() + low_1d = torch.zeros(1, requires_grad=True) + high_1d = (torch.ones(1) * 3).requires_grad_() + self.assertEqual(Uniform(low, high).sample().size(), (5, 5)) + self.assertEqual(Uniform(low, high).sample((7,)).size(), (7, 5, 5)) + # self.assertEqual(Uniform(low_1d, high_1d).sample().size(), (1,)) + # self.assertEqual(Uniform(low_1d, high_1d).sample((1,)).size(), (1, 1)) + # self.assertEqual(Uniform(0.0, 1.0).sample((1,)).size(), (1,)) + + # # Check log_prob computation when value outside range + # uniform = Uniform(low_1d, high_1d, validate_args=False) + # above_high = torch.tensor([4.0]) + # below_low = torch.tensor([-1.0]) + # self.assertEqual(uniform.log_prob(above_high).item(), -inf) + # self.assertEqual(uniform.log_prob(below_low).item(), -inf) + + # # check cdf computation when value outside range + # self.assertEqual(uniform.cdf(below_low).item(), 0) + # self.assertEqual(uniform.cdf(above_high).item(), 1) + + #
set_rng_seed(1) + # self._gradcheck_log_prob(Uniform, (low, high)) + # self._gradcheck_log_prob(Uniform, (low, 1.0)) + # self._gradcheck_log_prob(Uniform, (0.0, high)) + + # state = torch.get_rng_state() + # rand = low.new(low.size()).uniform_() + # torch.set_rng_state(state) + # u = Uniform(low, high).rsample() + # u.backward(torch.ones_like(u)) + # self.assertEqual(low.grad, 1 - rand) + # self.assertEqual(high.grad, rand) + # low.grad.zero_() + # high.grad.zero_() + + # Test forward maxpool2d + def test_max_pool2d(self): + def helper(shape, ks, padding=0, dilation=1, ceil_mode=False, return_indices=False, test_ties=False): + + cpu_x = None + if(test_ties): + cpu_x = torch.ones(shape, device='cpu', dtype=torch.float, requires_grad=True) + else: + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + pool = torch.nn.MaxPool2d(kernel_size=ks, padding=padding, dilation=dilation, + ceil_mode=ceil_mode, return_indices=return_indices) + + if(return_indices is False): + y = pool(x) + ref_y = pool(cpu_x) + + cpu_grad = torch.ones_like(ref_y) + grad = cpu_grad.to('mps') + + y.backward(gradient=grad) + ref_y.backward(gradient=cpu_grad) + + self.assertEqual(y, ref_y) + self.assertEqual(x.grad, cpu_x.grad) + else: + y, idx = pool(x) + ref_y, ref_idx = pool(cpu_x) + + cpu_grad = torch.ones_like(ref_y) + grad = cpu_grad.to('mps') + + y.backward(gradient=grad) + ref_y.backward(gradient=cpu_grad) + + self.assertEqual(y, ref_y) + self.assertEqual(idx, ref_idx) + self.assertEqual(x.grad, cpu_x.grad) + + # Test with no batch dimension + helper((8, 4, 4), ks=2) + helper((2, 8, 4, 4), ks=2) + helper((1, 100000, 32, 32), ks=4) + helper((1, 100000, 1, 4), ks=(1, 4)) # test for max_pool1d + # Test padding + helper((1, 100000, 32, 32), ks=4, padding=1) + helper((1, 100000, 1, 4), ks=(1, 4), padding=(0, 1)) # test for max_pool1d + # Test dilation + helper((1, 100000, 32, 32), ks=4, dilation=2) + helper((1, 100000, 1, 4), ks=(1, 4), padding=(0, 2)) # test for max_pool1d + # Test ceil mode + helper((1, 100000, 32, 32), ks=4, ceil_mode=True) + helper((1, 100000, 1, 4), ks=(1, 4), ceil_mode=True) # test for max_pool1d + + # Test return indices + for test_ties in [False, True]: + # Test with no batch dimension + helper((8, 4, 4), ks=2, return_indices=True, test_ties=test_ties) + helper((2, 8, 4, 4), ks=2, return_indices=True, test_ties=test_ties) + helper((1, 100000, 32, 32), ks=4, return_indices=True, test_ties=test_ties) + helper((1, 100000, 1, 4), ks=(1, 4), return_indices=True, test_ties=test_ties) # test for max_pool1d + # Test padding + helper((1, 100000, 32, 32), ks=4, padding=1, return_indices=True, test_ties=test_ties) + helper((1, 100000, 1, 4), ks=(1, 4), padding=(0, 1), + return_indices=True, test_ties=test_ties) # test for max_pool1d + # Test dilation + helper((1, 100000, 32, 32), ks=4, dilation=2, return_indices=True, test_ties=test_ties) + helper((1, 100000, 1, 4), ks=(1, 4), padding=(0, 2), + return_indices=True, test_ties=test_ties) # test for max_pool1d + # Test ceil mode + helper((1, 100000, 32, 32), ks=4, ceil_mode=True, return_indices=True, test_ties=test_ties) + helper((1, 100000, 1, 4), ks=(1, 4), ceil_mode=True, + return_indices=True, test_ties=test_ties) # test for max_pool1d + + def test_adaptive_avg_pool2d_output_size_one(self): + def helper(size, memory_format): + x = torch.randint(1, 10, size, dtype=torch.float, device='mps', requires_grad=True) + x = x.to(memory_format=memory_format) + + net = 
torch.nn.AdaptiveAvgPool2d((1, 1)) + out = net(x) + ref_out = x.contiguous().mean((-1, -2)).view((x.size(0), x.size(1), 1, 1)) + + out.sum().backward() # make sure it doesn't crash + + self.assertEqual(out, ref_out) + if memory_format == torch.channels_last: + self.assertTrue(out.is_contiguous(memory_format=torch.channels_last)) + c = out.size(1) + self.assertEqual(out.stride(), [c, 1, c, c]) + else: + self.assertTrue(out.is_contiguous()) + c = out.size(1) + self.assertEqual(out.stride(), [c, 1, 1, 1]) + + helper((2, 3, 6, 6), torch.contiguous_format) + + # Test forward batch norm + def test_batch_norm(self): + def helper(shape, eps=1, momentum=0.1, wts=False, training=False, channels_last=False, + track_running_stats=True, test_module=False): + + import numpy as np + np.random.seed(332) + arr = (256 - 128) * np.random.random_sample(size=shape) + 128 + cpu_x = torch.tensor(arr, device='cpu', dtype=torch.float, requires_grad=True) + if(channels_last): + cpu_x = cpu_x.to(memory_format=torch.channels_last) + cpu_x.retain_grad() + x = cpu_x.detach().clone().to('mps').requires_grad_() + + mean_shape = [shape[1]] + cpu_running_mean = None + cpu_running_var = None + running_mean = None + running_var = None + if(track_running_stats): + mean_arr = (240 - 140) * np.random.random_sample(size=mean_shape) + 140 + cpu_running_mean = torch.tensor(mean_arr, device='cpu', dtype=torch.float) + var_arr = 32 * np.random.random_sample(size=mean_shape) + cpu_running_var = torch.tensor(var_arr, device='cpu', dtype=torch.float) + running_mean = cpu_running_mean.detach().clone().to('mps') + running_var = cpu_running_var.detach().clone().to('mps') + + weight = None + cpu_weight = None + bias = None + cpu_bias = None + if(wts): + cpu_weight = torch.randn(mean_shape, device='cpu', dtype=torch.float, requires_grad=True) + weight = cpu_weight.detach().clone().to('mps').requires_grad_() + cpu_bias = torch.randn(mean_shape, device='cpu', dtype=torch.float, requires_grad=True) + bias = cpu_bias.detach().clone().to('mps').requires_grad_() + + y = None + ref_y = None + + if(not test_module): + y = torch.nn.functional.batch_norm(x, running_mean, running_var, + weight=weight, + bias=bias, + training=training, + momentum=momentum, eps=eps) + ref_y = torch.nn.functional.batch_norm(cpu_x, cpu_running_mean, cpu_running_var, + weight=cpu_weight, + bias=cpu_bias, + training=training, + momentum=momentum, eps=eps) + + else: + + batchnorm_op = None + mps_batchnorm_op = None + + if(len(shape) == 3): + batchnorm_op = torch.nn.BatchNorm1d(shape[1], + eps=eps, + momentum=momentum, + affine=wts, + track_running_stats=track_running_stats, + device='cpu') + mps_batchnorm_op = torch.nn.BatchNorm1d(shape[1], + eps=eps, + momentum=momentum, + affine=wts, + track_running_stats=track_running_stats, + device='mps') + elif(len(shape) == 4): + batchnorm_op = torch.nn.BatchNorm2d(shape[1], + eps=eps, + momentum=momentum, + affine=wts, + track_running_stats=track_running_stats, + device='cpu') + mps_batchnorm_op = torch.nn.BatchNorm2d(shape[1], + eps=eps, + momentum=momentum, + affine=wts, + track_running_stats=track_running_stats, + device='mps') + elif(len(shape) == 5): + batchnorm_op = torch.nn.BatchNorm3d(shape[1], + eps=eps, + momentum=momentum, + affine=wts, + track_running_stats=track_running_stats, + device='cpu') + mps_batchnorm_op = torch.nn.BatchNorm3d(shape[1], + eps=eps, + momentum=momentum, + affine=wts, + track_running_stats=track_running_stats, + device='mps') + + if(track_running_stats): + batchnorm_op.running_mean = cpu_running_mean 
+ batchnorm_op.running_var = cpu_running_var + mps_batchnorm_op.running_mean = running_mean + mps_batchnorm_op.running_var = running_var + if(wts): + batchnorm_op.weight = torch.nn.Parameter(cpu_weight) + batchnorm_op.bias = torch.nn.Parameter(cpu_bias) + mps_batchnorm_op.weight = torch.nn.Parameter(weight) + mps_batchnorm_op.bias = torch.nn.Parameter(bias) + + ref_y = batchnorm_op(cpu_x) + y = mps_batchnorm_op(x) + + self.assertEqual(y, ref_y) + if(not test_module): + self.assertEqual(running_mean, cpu_running_mean) + self.assertEqual(running_var, cpu_running_var) + else: + self.assertEqual(mps_batchnorm_op.running_mean, batchnorm_op.running_mean) + self.assertEqual(mps_batchnorm_op.running_var, batchnorm_op.running_var) + + cpu_grad = torch.randn(ref_y.shape) + grad = cpu_grad.to('mps') + ref_y.backward(gradient=cpu_grad) + y.backward(gradient=grad) + + self.assertEqual(x.grad, cpu_x.grad) + if(wts): + if(not test_module): + self.assertEqual(weight.grad, cpu_weight.grad) + self.assertEqual(bias.grad, cpu_bias.grad) + else: + self.assertEqual(mps_batchnorm_op.weight.grad, batchnorm_op.weight.grad) + self.assertEqual(mps_batchnorm_op.bias.grad, batchnorm_op.bias.grad) + + for shape in [(2, 3, 2, 2), (2, 3, 2, 2, 2), (2, 3, 2)]: + for test_module in [False, True]: + for track_running_stats in [True, False]: + for channels_last in [False, True]: + if(channels_last and len(shape) != 4): + continue + # Running stats must be tracked in eval mode + if(track_running_stats): + helper(shape, eps=0, momentum=1, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, eps=1e-05, momentum=0.1, wts=False, training=False, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, eps=0, momentum=1.0, wts=False, training=False, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, eps=1, momentum=1, wts=True, training=False, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, eps=3, momentum=0.67, wts=True, training=False, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, eps=1e-05, momentum=0.1, wts=False, training=True, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, eps=0, momentum=1.0, wts=False, training=True, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, eps=1, momentum=1, wts=True, training=True, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, eps=3, momentum=0.67, wts=True, training=True, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + + # Test forward instance norm + def test_instance_norm(self): + def helper(shape, eps=1, momentum=0.1, wts=False, channels_last=False, track_running_stats=True, test_module=False): + + import numpy as np + np.random.seed(332) + arr = (256 - 128) * np.random.random_sample(size=shape) + 128 + cpu_x = torch.tensor(arr, device='cpu', dtype=torch.float, requires_grad=True) + if(channels_last): + cpu_x = cpu_x.to(memory_format=torch.channels_last) + cpu_x.retain_grad() + x = cpu_x.detach().clone().to('mps').requires_grad_() + + 
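# Normalization statistics and affine parameters are per-channel, i.e. sized by shape[1]. +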
mean_shape = [shape[1]] + cpu_running_mean = None + cpu_running_var = None + running_mean = None + running_var = None + if(track_running_stats): + mean_arr = (240 - 140) * np.random.random_sample(size=mean_shape) + 140 + cpu_running_mean = torch.tensor(mean_arr, device='cpu', dtype=torch.float) + var_arr = 32 * np.random.random_sample(size=mean_shape) + cpu_running_var = torch.tensor(var_arr, device='cpu', dtype=torch.float) + running_mean = cpu_running_mean.detach().clone().to('mps') + running_var = cpu_running_var.detach().clone().to('mps') + + weight = None + cpu_weight = None + bias = None + cpu_bias = None + if(wts): + cpu_weight = torch.randn(mean_shape, device='cpu', dtype=torch.float, requires_grad=True) + weight = cpu_weight.detach().clone().to('mps').requires_grad_() + cpu_bias = torch.randn(mean_shape, device='cpu', dtype=torch.float, requires_grad=True) + bias = cpu_bias.detach().clone().to('mps').requires_grad_() + + y = None + ref_y = None + + if(not test_module): + ref_y = torch.nn.functional.instance_norm(cpu_x, cpu_running_mean, cpu_running_var, + weight=cpu_weight, + bias=cpu_bias, + momentum=momentum, eps=eps) + y = torch.nn.functional.instance_norm(x, running_mean, running_var, + weight=weight, + bias=bias, + momentum=momentum, eps=eps) + + else: + + instancenorm_op = None + mps_instancenorm_op = None + + if(len(shape) == 3): + instancenorm_op = torch.nn.InstanceNorm1d(shape[1], + eps=eps, + momentum=momentum, + affine=wts, + track_running_stats=track_running_stats, + device='cpu') + mps_instancenorm_op = torch.nn.InstanceNorm1d(shape[1], + eps=eps, + momentum=momentum, + affine=wts, + track_running_stats=track_running_stats, + device='mps') + elif(len(shape) == 4): + instancenorm_op = torch.nn.InstanceNorm2d(shape[1], + eps=eps, + momentum=momentum, + affine=wts, + track_running_stats=track_running_stats, + device='cpu') + mps_instancenorm_op = torch.nn.InstanceNorm2d(shape[1], + eps=eps, + momentum=momentum, + affine=wts, + track_running_stats=track_running_stats, + device='mps') + elif(len(shape) == 5): + instancenorm_op = torch.nn.InstanceNorm3d(shape[1], + eps=eps, + momentum=momentum, + affine=wts, + track_running_stats=track_running_stats, + device='cpu') + mps_instancenorm_op = torch.nn.InstanceNorm3d(shape[1], + eps=eps, + momentum=momentum, + affine=wts, + track_running_stats=track_running_stats, + device='mps') + + if(track_running_stats): + instancenorm_op.running_mean = cpu_running_mean + instancenorm_op.running_var = cpu_running_var + mps_instancenorm_op.running_mean = running_mean + mps_instancenorm_op.running_var = running_var + if(wts): + instancenorm_op.weight = torch.nn.Parameter(cpu_weight) + instancenorm_op.bias = torch.nn.Parameter(cpu_bias) + mps_instancenorm_op.weight = torch.nn.Parameter(weight) + mps_instancenorm_op.bias = torch.nn.Parameter(bias) + + ref_y = instancenorm_op(cpu_x) + y = mps_instancenorm_op(x) + + self.assertEqual(y, ref_y) + if(not test_module): + self.assertEqual(running_mean, cpu_running_mean) + self.assertEqual(running_var, cpu_running_var) + else: + self.assertEqual(mps_instancenorm_op.running_mean, instancenorm_op.running_mean) + self.assertEqual(mps_instancenorm_op.running_var, instancenorm_op.running_var) + + cpu_grad = torch.randn(ref_y.shape) + grad = cpu_grad.to('mps') + ref_y.backward(gradient=cpu_grad) + y.backward(gradient=grad) + + self.assertEqual(x.grad, cpu_x.grad) + if(wts): + if(not test_module): + self.assertEqual(weight.grad, cpu_weight.grad) + self.assertEqual(bias.grad, cpu_bias.grad) + else: + 
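# module path: the gradients accumulate on the nn.InstanceNorm*d parameters themselves, so compare those directly. +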
self.assertEqual(mps_instancenorm_op.weight.grad, instancenorm_op.weight.grad) + self.assertEqual(mps_instancenorm_op.bias.grad, instancenorm_op.bias.grad) + + for shape in [(2, 3, 2, 2), (2, 3, 2, 2, 2), (2, 3, 2)]: + for test_module in [False, True]: + for track_running_stats in [True, False]: + for channels_last in [False]: + if(channels_last and len(shape) != 4): + continue + # Running stats must be tracked in eval mode + if(track_running_stats): + helper(shape, eps=0, momentum=1, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, eps=1e-05, momentum=0.1, wts=False, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, eps=0, momentum=1.0, wts=False, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, eps=1, momentum=1, wts=True, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, eps=3, momentum=0.67, wts=True, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, eps=1e-05, momentum=0.1, wts=False, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, eps=0, momentum=1.0, wts=False, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, eps=1, momentum=1, wts=True, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, eps=3, momentum=0.67, wts=True, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + + # Test conv2d + def test_conv2d_unit(self): + def helper(input_shape, wt_shape, + stride=1, padding=0, + dilation=1, groups=1, + bias_shape=None): + + cpu_x = torch.randn(input_shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + cpu_wt = torch.randn(wt_shape, device='cpu', dtype=torch.float, requires_grad=True) + wt = cpu_wt.detach().clone().to('mps').requires_grad_() + + cpu_bias = None + bias = None + + if(bias_shape is not None): + cpu_bias = torch.randn(bias_shape, device='cpu', dtype=torch.float, requires_grad=True) + bias = cpu_bias.detach().clone().to('mps').requires_grad_() + + y = torch.nn.functional.conv2d(x, wt, bias=bias, stride=stride, + padding=padding, dilation=dilation, groups=groups) + ref_y = torch.nn.functional.conv2d(cpu_x, cpu_wt, bias=cpu_bias, stride=stride, + padding=padding, dilation=dilation, groups=groups) + + cpu_grad = torch.ones_like(ref_y) + grad = cpu_grad.to('mps') + + y.backward(gradient=grad) + ref_y.backward(gradient=cpu_grad) + + self.assertEqual(y, ref_y, rtol=2.6e-05, atol=2e-04) + self.assertEqual(x.grad, cpu_x.grad, rtol=2.6e-06, atol=2e-05) + self.assertEqual(wt.grad, cpu_wt.grad, atol=8e-04, rtol=10.4e-05) + # if(bias_shape is not None): + # print(cpu_bias.grad) + # print(bias.grad.to('cpu')) + # self.assertEqual(bias.grad, cpu_bias.grad, atol=8e-04, rtol=10.4e-05) + + N = 1 + C_in = 3 + C_out = 64 + H = 64 + W = 64 + kH = 4 + kW = 4 + stride = 2 + padding = 1 + + helper((N, C_in, H, W), (C_out, C_in, kH, kW), stride=stride, padding=padding) + + N = 4 + C_in = 16 + H = 32 + W = 32 + + C_out = 8 + kH = 3 + kW = 3 + + for groups in [1, 2, 4]: + 
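# Grouped convolution: the weight shape is (C_out, C_in // groups, kH, kW); C_in=16 and C_out=8 are divisible by every group count tested here. +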
helper((N, C_in, H, W), (C_out, C_in // groups, kH, kW), groups=groups) + helper((N, C_in, H, W), (C_out, C_in // groups, kH, kW), groups=groups) + + helper((N, C_in, H, W), (C_out, C_in // groups, kH, kW), bias_shape=(C_out), groups=groups) + helper((N, C_in, H, W), (C_out, C_in // groups, kH, kW), bias_shape=(C_out), groups=groups) + + helper((N, C_in * 2, H * 2, W * 2), (C_out * 2, (C_in * 2) // groups, kH + 2, kW + 2), groups=groups) + helper((N, C_in * 2, H * 2, W * 2), (C_out * 2, (C_in * 2) // groups, kH + 2, kW + 2), groups=groups) + + helper((N, C_in * 2, H * 2, W * 2), (C_out * 2, (C_in * 2) // groups, + kH + 2, kW + 2), bias_shape=(C_out * 2), groups=groups) + helper((N, C_in * 2, H * 2, W * 2), (C_out * 2, (C_in * 2) // groups, + kH + 2, kW + 2), bias_shape=(C_out * 2), groups=groups) + + # Test conv transpose 2d + def test_conv_transpose2d(self): + def helper(input_shape, wt_shape, + stride=1, padding=0, + output_padding=0, + dilation=1, groups=1, + bias_shape=None): + + cpu_x = torch.randn(input_shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + cpu_wt = torch.randn(wt_shape, device='cpu', dtype=torch.float, requires_grad=True) + wt = cpu_wt.detach().clone().to('mps').requires_grad_() + + cpu_bias = None + bias = None + + if(bias_shape is not None): + cpu_bias = torch.randn(bias_shape, device='cpu', dtype=torch.float, requires_grad=True) + bias = cpu_bias.detach().clone().to('mps').requires_grad_() + + y = torch.nn.functional.conv_transpose2d( + x, wt, bias=bias, stride=stride, padding=padding, output_padding=output_padding, groups=groups, dilation=dilation) + ref_y = torch.nn.functional.conv_transpose2d( + cpu_x, cpu_wt, bias=cpu_bias, stride=stride, padding=padding, + output_padding=output_padding, groups=groups, dilation=dilation) + + cpu_grad = torch.randn(ref_y.shape) + grad = cpu_grad.to('mps') + + y.backward(gradient=grad) + ref_y.backward(gradient=cpu_grad) + + self.assertEqual(y, ref_y, rtol=2.6e-05, atol=2e-04) + self.assertEqual(x.grad, cpu_x.grad, rtol=2.6e-06, atol=2e-05) + self.assertEqual(wt.grad, cpu_wt.grad, atol=8e-04, rtol=10.4e-05) + + # if(bias_shape is not None): + # print(cpu_bias.grad) + # print(bias.grad.to('cpu')) + # self.assertEqual(bias.grad, cpu_bias.grad) + + N = 4 + C_in = 16 + H = 32 + W = 32 + + C_out = 8 + groups = 1 + kH = 3 + kW = 3 + + for stride in [1, 2, 3]: + for padding in [0, 1, 2]: + for output_padding in [0, 1, 2]: + for dilation in [1, 2]: + if(output_padding >= stride or output_padding >= dilation): + continue + helper((N, C_out, H, W), (C_out, C_in, kH, kW), stride=stride, + padding=padding, output_padding=output_padding, dilation=dilation) + helper((N, C_out, H, W), (C_out, C_in, kH, kW), stride=stride, + padding=padding, output_padding=output_padding, dilation=dilation) + + helper((N, C_out, H, W), (C_out, C_in, kH, kW), bias_shape=(C_in), stride=stride, + padding=padding, output_padding=output_padding, dilation=dilation) + helper((N, C_out, H, W), (C_out, C_in, kH, kW), bias_shape=(C_in), stride=stride, + padding=padding, output_padding=output_padding, dilation=dilation) + + # Test sigmoid + def test_sigmoid(self): + def helper(shape): + + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + sigmoid_op = torch.nn.Sigmoid() + + y = sigmoid_op(x) + ref_y = sigmoid_op(cpu_x) + + cpu_grad = torch.ones_like(ref_y) + grad = cpu_grad.to('mps') + + y.backward(gradient=grad) + 
ref_y.backward(gradient=cpu_grad) + + self.assertEqual(y, ref_y) + self.assertEqual(x.grad, cpu_x.grad) + + helper((2, 3, 4, 5)) + helper((2, 3, 4)) + helper((2, 8, 4, 5)) + + # Test tanh + def test_tanh(self): + def helper(shape): + + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + tanh_op = torch.nn.Tanh() + + y = tanh_op(x) + ref_y = tanh_op(cpu_x) + + cpu_grad = torch.ones_like(ref_y) + grad = cpu_grad.to('mps') + + y.backward(gradient=grad) + ref_y.backward(gradient=cpu_grad) + + self.assertEqual(y, ref_y) + self.assertEqual(x.grad, cpu_x.grad) + + helper((2, 3, 4, 5)) + helper((2, 3, 4)) + helper((2, 8, 4, 5)) + + def test_threshold(self): + def helper(threshold, value, num_elems, inplace=False, requires_grad=True): + m = nn.Threshold(threshold=threshold, value=value, inplace=inplace) + + input_cpu = torch.randn(num_elems, requires_grad=requires_grad, dtype=torch.float) + input_mps = input_cpu.detach().clone().to('mps').requires_grad_(requires_grad) + + output_cpu = m(input_cpu) + output_mps = m(input_mps) + + cpu_grad = torch.ones_like(output_cpu) + mps_grad = cpu_grad.to('mps') + + self.assertEqual(output_cpu, output_mps) + + if requires_grad: + output_cpu.backward(gradient=cpu_grad) + output_mps.backward(gradient=mps_grad) + + self.assertEqual(input_cpu.grad, input_mps.grad) + + helper(threshold=0.1, value=20, num_elems=2) + helper(threshold=-0.1, value=10, num_elems=10) + helper(threshold=0.5, value=-15, num_elems=100) + helper(threshold=1, value=10, num_elems=100, inplace=True, requires_grad=False) + + # Test pow + def test_pow(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + cpu_y = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + y = cpu_y.detach().clone().to('mps') + z = torch.pow(x, y) + ref_z = torch.pow(cpu_x, cpu_y) + + self.assertEqual(z, ref_z) + + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + exp = random.random() + z = torch.pow(x, exp) + ref_z = torch.pow(cpu_x, exp) + + self.assertEqual(z, ref_z) + + helper((2, 8, 4, 5)) + + # Test addcmul + def test_addcmul(self): + def helper(shape, value): + + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + cpu_y = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + y = cpu_y.detach().clone().to('mps') + + cpu_z = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + z = cpu_z.detach().clone().to('mps') + + y = torch.addcmul(x, y, z, value=value) + ref_y = torch.addcmul(cpu_x, cpu_y, cpu_z, value=value) + + self.assertEqual(y, ref_y) + + helper((2, 3, 4, 5), 0.1) + helper((2, 8, 4, 5), 0.1) + helper((2, 3, 4, 5), 0.2) + helper((2, 8, 4, 5), 0.2) + + # Test addcdiv + def test_addcdiv(self): + def helper(shape, value): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + cpu_y = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + # clamp to avoid division by 0 + cpu_z = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False).clamp_min_(0.1) + cpu_out = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + + mps_x = cpu_x.detach().clone().to('mps') + mps_y = cpu_y.detach().clone().to('mps') + mps_z = cpu_z.detach().clone().to('mps') 
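+            # torch.addcdiv computes input + value * (tensor1 / tensor2); cpu_z was
+            # clamped away from zero above, so the elementwise division is well defined
+            # on both devices.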
+ mps_out = cpu_out.detach().clone().to('mps') + + result_div_mps = torch.addcdiv(mps_x, mps_y, mps_z, value=value) + result_div_cpu = torch.addcdiv(cpu_x, cpu_y, cpu_z, value=value) + self.assertEqual(result_div_mps, result_div_cpu) + # test .out variant + self.assertEqual(torch.addcdiv(mps_x, mps_y, mps_z, out=mps_out, value=value), result_div_cpu) + + helper((2, 3, 4, 5), 0.1) + helper((2, 8, 4, 5), 0.2) + helper((2, 3, 4, 5), 1.0) # value of 1 should be ignored internally + + def test_transpose_inplace(self): + values = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]] + cpu_x = torch.tensor(values, device='cpu') + mps_x = torch.tensor(values, device='mps') + + cpu_x.transpose_(0, 1) + mps_x.transpose_(0, 1) + self.assertEqual(cpu_x, mps_x.to('cpu')) + + def test_slice(self): + values = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]] + cpu_x = torch.tensor(values, device='cpu') + mps_x = (torch.tensor(values, device='mps', dtype=torch.float)) + + cpu_slice1 = cpu_x[:2, :] + mps_slice1 = mps_x[:2, :] + print(mps_slice1) + self.assertEqual(cpu_slice1, mps_slice1) + + cpu_slice2 = cpu_x[:, :1] + mps_slice2 = mps_x[:, :1] + print(cpu_slice2) + print(mps_slice2.to('cpu')) + self.assertEqual(cpu_slice2, mps_slice2) + + cpu_slice3 = cpu_x[1:2, :] + mps_slice3 = mps_x[1:2, :] + self.assertEqual(cpu_slice3, mps_slice3.to('cpu')) + + cpu_slice4 = cpu_x[1, :] + mps_slice4 = mps_x[1, :].to('cpu') + self.assertEqual(cpu_slice4, mps_slice4) + + def test_flatten(self): + values = [[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], [[7.0, 8.0, 9.0], [10.0, 11.0, 12.0]]] + cpu_x = torch.tensor(values, device='cpu') + mps_x = torch.tensor(values, device='mps') + + cpu_flatten1 = cpu_x.flatten() + mps_flatten1 = mps_x.flatten().to('cpu') + self.assertEqual(cpu_flatten1, mps_flatten1) + + cpu_flatten2 = cpu_x.flatten(start_dim=1) + mps_flatten2 = mps_x.flatten(start_dim=1).to('cpu') + self.assertEqual(cpu_flatten2, mps_flatten2) + + cpu_flatten3 = cpu_x.flatten(end_dim=1) + mps_flatten3 = mps_x.flatten(end_dim=1).to('cpu') + self.assertEqual(cpu_flatten3, mps_flatten3) + + # Test repeat + def test_repeat(self): + def helper(shape, repeats): + + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + y = x.repeat(repeats) + ref_y = cpu_x.repeat(repeats) + + cpu_grad = torch.randn(ref_y.shape) + grad = cpu_grad.to('mps') + + y.backward(gradient=grad) + ref_y.backward(gradient=cpu_grad) + + self.assertEqual(y, ref_y) + self.assertEqual(x.grad, cpu_x.grad) + + helper((2, 3, 4, 5), (2, 3, 4, 5)) + helper((2, 3, 4), (4, 3, 2, 5, 7, 2)) + helper((3, 4, 5), (2, 3, 4, 5)) + helper((3, 4, 5), (2, 2, 2)) + + def _test_module_empty_input(self, module, inp, check_size=True): + inp.requires_grad_(True) + out = module(inp) + gO = torch.rand_like(out) + out.backward(gO) + if check_size: + self.assertEqual(out.size(), inp.size()) + for p in module.parameters(): + if p.requires_grad: + self.assertEqual(p.grad, torch.zeros_like(p.grad)) + self.assertEqual(inp.grad, torch.zeros_like(inp)) + + +class TestSmoothL1Loss(TestCase): + + def _smooth_l1_loss_helper(self, reduction="mean", requires_grad=False): + # CPU + input_cpu = torch.randn(4, 7, requires_grad=requires_grad) + target_cpu = torch.randn(4, 7) + + # MPS + input_mps = input_cpu.detach().clone().to('mps').requires_grad_() + target_mps = target_cpu.detach().clone().to('mps') + + smooth_l1_loss_cpu = F.smooth_l1_loss(input_cpu, target_cpu, beta=1.0, reduction=reduction) + smooth_l1_loss_mps = 
F.smooth_l1_loss(input_mps, target_mps, beta=1.0, reduction=reduction) + + self.assertEqual(smooth_l1_loss_cpu, smooth_l1_loss_mps) + + if requires_grad: + smooth_l1_loss_cpu.backward() + smooth_l1_loss_mps.backward() + self.assertEqual(input_cpu.grad, input_mps.grad.to("cpu")) + + return smooth_l1_loss_cpu, smooth_l1_loss_mps + + def test_smooth_l1_loss_reduction_none(self): + self._smooth_l1_loss_helper(reduction="none") + + def test_smooth_l1_loss_reduction_mean(self): + self._smooth_l1_loss_helper(reduction="mean") + + def test_smooth_l1_loss_reduction_sum(self): + self._smooth_l1_loss_helper(reduction="sum") + + def test_smooth_l1_loss_reduction_mean_backward(self): + self._smooth_l1_loss_helper(reduction="mean", requires_grad=True) + + def test_smooth_l1_loss_reduction_mean_sum_backward(self): + self._smooth_l1_loss_helper(reduction="sum", requires_grad=True) + + +class TestNLLLoss(TestCase): + + def test_nll_loss_mismatched_batch(self, device='mps'): + x = torch.randn((10, 3), requires_grad=True, device=device) + # t should have size (10,) + t = torch.zeros((3,), dtype=torch.int64, device=device) + with self.assertRaisesRegex(ValueError, 'Expected.*batch_size'): + F.nll_loss(x, t) + + def test_nll_loss_out_of_bounds_ignore_index(self): + + def _test_nll_loss_out_of_bounds_ignore_index(device): + output = [] + x = torch.tensor([[0.3, 0.5, 0.2], [0.1, 0.7, 0.2], [0.4, 0.5, 0.1], [ + 0.3, 0.5, 0.2], [0.1, 0.7, 0.2], [0.4, 0.5, 0.1]], device=device) + t = torch.tensor([0, 1, 255, 0, 1, 2], dtype=torch.int64, device=device) + for reduction in ['mean', 'none']: + output.append(F.nll_loss(x, t, ignore_index=255, reduction=reduction)) + return output + + output_cpu = _test_nll_loss_out_of_bounds_ignore_index(device='cpu') + output_mps = _test_nll_loss_out_of_bounds_ignore_index(device='mps') + + for cpu, mps in zip(output_cpu, output_mps): + self.assertEqual(cpu, mps.to('cpu')) + + def test_nll_loss_invalid_target_dim(self): + + def _test_nll_loss_invalid_target_dim(device): + output = [] + x = torch.tensor([[0.3, 0.5, 0.2], [0.1, 0.7, 0.2], [0.4, 0.5, 0.1], [ + 0.3, 0.5, 0.2], [0.1, 0.7, 0.2], [0.4, 0.5, 0.1]], device=device) + t = torch.zeros((6, 2), dtype=torch.int64, device=device) + with self.assertRaisesRegex(RuntimeError, "1D target tensor expected"): + F.nll_loss(x, t) + + _test_nll_loss_invalid_target_dim(device='cpu') + _test_nll_loss_invalid_target_dim(device='mps') + + def test_nll_loss_invalid_weights(self): + + def _test_nll_loss_invalid_weights(device): + x = torch.tensor([[0.3, 0.5, 0.2], [0.1, 0.7, 0.2], [0.4, 0.5, 0.1], [ + 0.3, 0.5, 0.2], [0.1, 0.7, 0.2], [0.4, 0.5, 0.1]], device=device) + t = torch.tensor([0, 1, 2, 1, 1, 2], dtype=torch.int64, device=device) + invalid_weights = [ + torch.zeros(4, device=device), + torch.zeros((1, 3), device=device), + ] + msg = "weight tensor should be defined either for all 3 classes or no classes" + for weight in invalid_weights: + with self.assertRaisesRegex(RuntimeError, msg): + F.nll_loss(x, t, weight=weight) + + _test_nll_loss_invalid_weights(device='cpu') + _test_nll_loss_invalid_weights(device='mps') + + def _nll_loss_helper(self, input_size, reduction, expected): + + # CPU + input = torch.rand(input_size, requires_grad=True, device='cpu') + num_channels = input_size[1] + target_size = (input_size[0], ) + tuple(input_size[2:]) + target = torch.randint(num_channels, target_size, device='cpu') + + # MPS + input_mps = input.detach().clone().to('mps').requires_grad_() + target_mps = target.detach().clone().to('mps') + + output_cpu = 
F.nll_loss(input, target, reduction=reduction) + output_mps = F.nll_loss(input_mps, target_mps, reduction=reduction) + # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 + self.assertEqualIgnoreType(output_cpu, output_mps.to('cpu')) + + output_cpu.sum().backward() + output_mps.sum().backward() + self.assertEqual(input.grad, input_mps.grad.to('cpu')) + + def test_as_strided(self): + def helper(n, c): + values = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]] + values_1 = [[1.0, 1.0], [1.0, 1.0]] + cpu_x = torch.tensor(values, device='cpu') + ones1 = torch.tensor(values_1, device='mps') + x = cpu_x.detach().clone().to('mps').requires_grad_() + strided_cpu = torch.as_strided(cpu_x, (2, 2), (2, 2)) + strided_mps = torch.as_strided(x, (2, 2), (2, 2)) + + print("Strided MPS {}".format(strided_mps.to('cpu'))) + print("Strided cpu {}".format(strided_cpu)) + + self.assertEqual(strided_mps, strided_cpu) + + helper(3, 3) + + def test_nll_loss_empty_tensor_reduction_none(self, device='cpu'): + self._nll_loss_helper([1, 3], "none", torch.empty([0], device=device)) + self._nll_loss_helper([3, 5, 7], "none", torch.empty([5, 7], device=device)) + self._nll_loss_helper([2, 3, 1, 7], "none", torch.empty([2, 1, 7], device=device)) + self._nll_loss_helper([2, 3, 5, 1], "none", torch.empty([2, 5, 1], device=device)) + self._nll_loss_helper([2, 3, 5, 7, 1], "none", torch.empty([2, 5, 7, 1], device=device)) + + @unittest.skipIf(TEST_WITH_UBSAN, "division-by-zero error with UBSAN") + def test_nll_loss_empty_tensor_reduction_mean(self, device='cpu'): + nan = torch.tensor(float('nan'), device=device) + self._nll_loss_helper([1, 3], "mean", nan) + self._nll_loss_helper([1, 3, 5, 7], "mean", nan) + self._nll_loss_helper([2, 3, 1, 7], "mean", nan) + self._nll_loss_helper([2, 3, 5, 1], "mean", nan) + self._nll_loss_helper([2, 3, 5, 7, 1], "mean", nan) + + def test_nll_loss_empty_tensor_reduction_sum(self, device='cpu'): + zero = torch.tensor(0, device=device) + self._nll_loss_helper([1, 3], "sum", zero) + self._nll_loss_helper([1, 3, 5, 7], "sum", zero) + self._nll_loss_helper([2, 3, 1, 7], "sum", zero) + self._nll_loss_helper([2, 3, 5, 1], "sum", zero) + self._nll_loss_helper([2, 3, 5, 7, 1], "sum", zero) + + def test_nll_loss_byte_target_matches_long(self, device='cpu'): + N, C = 10, 4 + input = torch.randn(N, C, device=device, requires_grad=True) + target = torch.empty(N, dtype=torch.long, device=device).random_(0, C) + + def compute_result_and_gradient(reduction, target_dtype): + result, grad = {}, {} + for dev in ['cpu', 'mps']: + input_dev = input.to(dev) + input_ = input_dev.detach() + input_.requires_grad_() + + target_dev = target.to(dev) + + prob = F.log_softmax(input_, dim=-1) + loss = nn.NLLLoss(reduction=reduction) + result[dev] = loss(prob, target_dev.to(target_dtype)) + result[dev].sum().backward() + grad[dev] = input_.grad + + return result, grad + + for reduction in ["none", "mean", "sum"]: + result_long, grad_long = compute_result_and_gradient(reduction, torch.long) + result_byte, grad_byte = compute_result_and_gradient(reduction, torch.uint8) + + self.assertEqual(result_long['mps'].to('cpu'), result_long['cpu']) + self.assertEqual(grad_long['mps'].to('cpu'), grad_long['cpu']) + + # Mean Squared Error + def test_mse_loss(self): + def helper(shape, reduction): + # create the criterion + loss = torch.nn.MSELoss(reduction=reduction) + + inputCPU = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + targetCPU = torch.randn(shape, device='cpu', dtype=torch.float, 
requires_grad=False) + inputMPS = inputCPU.detach().clone().to('mps').requires_grad_() + targetMPS = targetCPU.detach().clone().to('mps') + + # forward pass + outputCPU = loss(inputCPU, targetCPU) + outputMPS = loss(inputMPS, targetMPS) + self.assertEqual(outputCPU, outputMPS) + + # backward pass + if reduction != 'none': + # chose 2 just to make the grad_output > 1 in backward pass + outputCPU.backward(gradient=torch.full_like(outputCPU, 2)) + outputMPS.backward(gradient=torch.full_like(outputMPS, 2)) + self.assertEqual(inputCPU.grad, inputMPS.grad) + + helper([8, 5, 4], 'none') + helper([7, 5, 2, 4], 'sum') + # verify if changes in shape would cause cached graph lookup problems + helper([7, 5, 2, 4, 6], 'sum') + helper([8, 4, 5, 7, 6], 'mean') + + # Binary Cross Enropy + def test_bce_loss(self): + def helper(shape, reduction): + # create the criterion + loss = torch.nn.BCELoss(reduction=reduction) + + # input and target must be within [0..1] + input_t = np.random.random_sample(size=shape).astype(np.float32) + target_t = np.random.random_sample(size=shape).astype(np.float32) + inputCPU = torch.tensor(input_t, device='cpu', dtype=torch.float, requires_grad=True) + targetCPU = torch.tensor(target_t, device='cpu', dtype=torch.float, requires_grad=False) + inputMPS = inputCPU.detach().clone().to('mps').requires_grad_() + targetMPS = targetCPU.detach().clone().to('mps') + + # forward pass + outputCPU = loss(inputCPU, targetCPU) + outputMPS = loss(inputMPS, targetMPS) + self.assertEqual(outputCPU, outputMPS) + + # backward pass + if reduction != 'none': + # chose 0.6 just to have the grad_output != 1 + outputCPU.backward(gradient=torch.full_like(outputCPU, 0.6)) + outputMPS.backward(gradient=torch.full_like(outputMPS, 0.6)) + self.assertEqual(inputCPU.grad, inputMPS.grad) + + helper([8, 5, 4], 'none') + helper([7, 5, 2, 4], 'sum') + # verify if changes in shape would cause cached graph lookup problems + helper([7, 5, 2, 4, 6], 'sum') + helper([8, 4, 5, 7, 6], 'mean') + + def test_log_softmax(self): + values = [[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], [[7.0, 8.0, 9.0], [10.0, 11.0, 12.0]]] + cpu_x = torch.tensor(values, device='cpu', requires_grad=True) + mps_x = torch.tensor(values, device='mps', requires_grad=True) + + cpu_log_softmax = F.log_softmax(cpu_x, dim=0) + mps_log_softmax = F.log_softmax(mps_x, dim=0) + self.assertEqual(cpu_log_softmax, mps_log_softmax.to('cpu')) + + cpu_grad = torch.ones_like(cpu_log_softmax) + mps_grad = torch.ones_like(cpu_log_softmax).to('mps') + + cpu_log_softmax.backward(gradient=cpu_grad) + mps_log_softmax.backward(gradient=mps_grad) + + self.assertEqual(cpu_x.grad, mps_x.grad.to('cpu')) + + def test_eq(self): + values1 = [[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], [[7.0, 8.0, 9.0], [10.0, 11.0, 12.0]]] + values2 = [[[1.0, 2.0, 15.0], [4.0, 5.0, 6.0]], [[7.0, 8.0, 9.0], [0.0, 11.0, 12.0]]] + mps_x = torch.tensor(values1, device='mps') + mps_y = torch.tensor(values2, device='mps') + cpu_x = torch.tensor(values1, device='cpu') + cpu_y = torch.tensor(values2, device='cpu') + result_mps = torch.eq(mps_x, mps_y) + result_cpu = torch.eq(cpu_x, cpu_y) + + self.assertEqual(result_cpu, result_mps.to('cpu')) + + def test_eq_int64(self): + values1 = [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]] + values2 = [[[1, 2, 15], [4, 5, 6]], [[7, 8, 9], [0, 11, 12]]] + mps_x = torch.tensor(values1, device='mps') + mps_y = torch.tensor(values2, device='mps') + cpu_x = torch.tensor(values1, device='cpu') + cpu_y = torch.tensor(values2, device='cpu') + result_mps = torch.eq(mps_x, mps_y) + 
result_cpu = torch.eq(cpu_x, cpu_y) + + self.assertEqual(result_cpu, result_mps.to('cpu')) + + def test_ne(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float) + cpu_y = torch.randn(shape, device='cpu', dtype=torch.float) + mps_x = cpu_x.detach().clone().to('mps') + mps_y = cpu_y.detach().clone().to('mps') + result_mps = torch.ne(mps_x, mps_y) + result_cpu = torch.ne(cpu_x, cpu_y) + + self.assertEqual(result_cpu, result_mps.to('cpu')) + + helper((2, 3, 4, 5)) + + def test_ne_scalar(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float) + mps_x = cpu_x.detach().clone().to('mps') + result_mps = torch.ne(mps_x, 0.0) + result_cpu = torch.ne(cpu_x, 0.0) + + self.assertEqual(result_cpu, result_mps.to('cpu')) + + helper((2, 3, 4, 5)) + + def test_lt(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float) + cpu_y = torch.randn(shape, device='cpu', dtype=torch.float) + mps_x = cpu_x.detach().clone().to('mps') + mps_y = cpu_y.detach().clone().to('mps') + result_mps = torch.lt(mps_x, mps_y) + result_cpu = torch.lt(cpu_x, cpu_y) + + self.assertEqual(result_cpu, result_mps.to('cpu')) + + helper((2, 3, 4, 5)) + + def test_lt_scalar(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float) + mps_x = cpu_x.detach().clone().to('mps') + result_mps = torch.lt(mps_x, 0.0) + result_cpu = torch.lt(cpu_x, 0.0) + + self.assertEqual(result_cpu, result_mps.to('cpu')) + + helper((2, 3, 4, 5)) + + def test_le(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float) + cpu_y = torch.randn(shape, device='cpu', dtype=torch.float) + mps_x = cpu_x.detach().clone().to('mps') + mps_y = cpu_y.detach().clone().to('mps') + result_mps = torch.le(mps_x, mps_y) + result_cpu = torch.le(cpu_x, cpu_y) + + self.assertEqual(result_cpu, result_mps.to('cpu')) + + helper((2, 3, 4, 5)) + + def test_le_scalar(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float) + mps_x = cpu_x.detach().clone().to('mps') + result_mps = torch.le(mps_x, 0.0) + result_cpu = torch.le(cpu_x, 0.0) + + self.assertEqual(result_cpu, result_mps.to('cpu')) + + helper((2, 3, 4, 5)) + + def test_ge(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float) + cpu_y = torch.randn(shape, device='cpu', dtype=torch.float) + mps_x = cpu_x.detach().clone().to('mps') + mps_y = cpu_y.detach().clone().to('mps') + result_mps = torch.ge(mps_x, mps_y) + result_cpu = torch.ge(cpu_x, cpu_y) + + self.assertEqual(result_cpu, result_mps.to('cpu')) + + helper((2, 3, 4, 5)) + + def test_ge_scalar(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float) + mps_x = cpu_x.detach().clone().to('mps') + result_mps = torch.ge(mps_x, 0.0) + result_cpu = torch.ge(cpu_x, 0.0) + + self.assertEqual(result_cpu, result_mps.to('cpu')) + + helper((2, 3, 4, 5)) + + def test_gt(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float) + cpu_y = torch.randn(shape, device='cpu', dtype=torch.float) + mps_x = cpu_x.detach().clone().to('mps') + mps_y = cpu_y.detach().clone().to('mps') + result_mps = torch.gt(mps_x, mps_y) + result_cpu = torch.gt(cpu_x, cpu_y) + + self.assertEqual(result_cpu, result_mps.to('cpu')) + + helper((2, 3, 4, 5)) + + def test_gt_scalar(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float) + mps_x = cpu_x.detach().clone().to('mps') + result_mps = 
torch.gt(mps_x, 0.0) + result_cpu = torch.gt(cpu_x, 0.0) + + self.assertEqual(result_cpu, result_mps.to('cpu')) + + helper((2, 3, 4, 5)) + + # Test forward argmax + def test_argmax(self): + def helper(n, c, h, w, dtype=torch.float32): + cpu_x = None + x = None + if(dtype not in [torch.float32, torch.bool]): + cpu_x = torch.randint(50, (n, c, h, w), device='cpu', dtype=dtype, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + elif (dtype == torch.bool): + cpu_x = torch.randint(2, (n, c, h, w), device='cpu', dtype=dtype, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + else: + cpu_x = torch.randn(n, c, h, w, device='cpu', dtype=dtype, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + y = torch.argmax(x) + ref_y = torch.argmax(cpu_x) + self.assertEqual(y, ref_y) + + y_0 = torch.argmax(x, dim=0) + refy_0 = torch.argmax(cpu_x, dim=0) + self.assertEqual(y_0, refy_0) + + y_0dim = torch.argmax(x, dim=0, keepdim=True) + refy_0dim = torch.argmax(cpu_x, dim=0, keepdim=True) + self.assertEqual(y_0dim, refy_0dim) + + y_1 = torch.argmax(x, dim=1) + refy_1 = torch.argmax(cpu_x, dim=1) + self.assertEqual(y_1, refy_1) + + y_1dim = torch.argmax(x, dim=1, keepdim=True) + refy_1dim = torch.argmax(cpu_x, dim=1, keepdim=True) + self.assertEqual(y_1dim, refy_1dim) + + y_2 = torch.argmax(x, dim=2) + refy_2 = torch.argmax(cpu_x, dim=2) + self.assertEqual(y_2, refy_2) + + y_2dim = torch.argmax(x, dim=2, keepdim=True) + refy_2dim = torch.argmax(cpu_x, dim=2, keepdim=True) + self.assertEqual(y_2dim, refy_2dim) + + y_3 = torch.argmax(x, dim=3) + refy_3 = torch.argmax(cpu_x, dim=3) + self.assertEqual(y_3, refy_3) + + y_3dim = torch.argmax(x, dim=3, keepdim=True) + refy_3dim = torch.argmax(cpu_x, dim=3, keepdim=True) + self.assertEqual(y_3dim, refy_3dim) + + helper(2, 8, 4, 4, torch.float32) + helper(2, 8, 4, 4, torch.int32) + helper(2, 8, 4, 4, torch.float16) + helper(2, 8, 4, 4, torch.int64) + + # Test forward max + # Note - don't test grad now + def test_max_el(self): + def helper(n, c, h, w, dtype=torch.float32): + + if(dtype not in [torch.float32, torch.bool]): + cpu_x = torch.randint(50, (n, c, h, w), device='cpu', dtype=dtype, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + elif (dtype == torch.bool): + cpu_x = torch.randint(2, (n, c, h, w), device='cpu', dtype=dtype, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + else: + cpu_x = torch.randn(n, c, h, w, device='cpu', dtype=dtype, requires_grad=True) + x = cpu_x.detach().clone().to('mps') + + ref_y = torch.max(cpu_x) + y = torch.max(x) + self.assertEqual(y, ref_y) + + for dim in [0, 1, 2, 3]: + for keepdim in [True, False]: + y, idx = torch.max(x, dim=dim, keepdim=keepdim) + refy, refidx = torch.max(cpu_x, dim=dim, keepdim=keepdim) + self.assertEqual(y, refy) + self.assertEqual(idx, refidx) + + y_0 = torch.ones(c, h, w, device='mps', dtype=dtype) + idx_0 = torch.ones(c, h, w, device='mps', dtype=torch.int64) + torch.max(x, dim=0, out=(y_0, idx_0)) + refy_0, refidx_0 = torch.max(cpu_x, dim=0) + self.assertEqual(y_0, refy_0) + self.assertEqual(idx_0, refidx_0) + + y_0dim = torch.ones(1, c, h, w, device='mps', dtype=dtype) + idx_0dim = torch.ones(1, c, h, w, device='mps', dtype=torch.int64) + torch.max(x, dim=0, keepdim=True, out=(y_0dim, idx_0dim)) + refy_0dim, refidx_0dim = torch.max(cpu_x, dim=0, keepdim=True) + self.assertEqual(y_0dim, refy_0dim) + self.assertEqual(idx_0dim, refidx_0dim) + + y_1 = torch.ones(n, h, w, device='mps', dtype=dtype) + idx_1 = torch.ones(n, h, w, 
device='mps', dtype=torch.int64) + torch.max(x, dim=1, out=(y_1, idx_1)) + refy_1, refidx_1 = torch.max(cpu_x, dim=1) + self.assertEqual(y_1, refy_1) + self.assertEqual(idx_1, refidx_1) + + y_1dim = torch.ones(n, 1, h, w, device='mps', dtype=dtype) + idx_1dim = torch.ones(n, 1, h, w, device='mps', dtype=torch.int64) + torch.max(x, dim=1, keepdim=True, out=(y_1dim, idx_1dim)) + refy_1dim, refidx_1dim = torch.max(cpu_x, keepdim=True, dim=1) + self.assertEqual(y_1dim, refy_1dim) + self.assertEqual(idx_1dim, refidx_1dim) + + y_2 = torch.ones(n, c, w, device='mps', dtype=dtype) + idx_2 = torch.ones(n, c, w, device='mps', dtype=torch.int64) + torch.max(x, dim=2, out=(y_2, idx_2)) + refy_2, refidx_2 = torch.max(cpu_x, dim=2) + self.assertEqual(y_2, refy_2) + self.assertEqual(idx_2, refidx_2) + + y_2dim = torch.ones(n, c, 1, w, device='mps', dtype=dtype) + idx_2dim = torch.ones(n, c, 1, w, device='mps', dtype=torch.int64) + torch.max(x, dim=2, keepdim=True, out=(y_2dim, idx_2dim)) + refy_2dim, refidx_2dim = torch.max(cpu_x, dim=2, keepdim=True,) + self.assertEqual(y_2dim, refy_2dim) + self.assertEqual(idx_2dim, refidx_2dim) + + y_3 = torch.ones(n, c, h, device='mps', dtype=dtype) + idx_3 = torch.ones(n, c, h, device='mps', dtype=torch.int64) + torch.max(x, dim=3, out=(y_3, idx_3)) + refy_3, refidx_3 = torch.max(cpu_x, dim=3) + self.assertEqual(y_3, refy_3) + self.assertEqual(idx_3, refidx_3) + + y_3dim = torch.ones(n, c, h, 1, device='mps', dtype=dtype) + idx_3dim = torch.ones(n, c, h, 1, device='mps', dtype=torch.int64) + torch.max(x, dim=3, keepdim=True, out=(y_3dim, idx_3dim)) + refy_3dim, refidx_3dim = torch.max(cpu_x, dim=3, keepdim=True,) + self.assertEqual(y_3dim, refy_3dim) + self.assertEqual(idx_3dim, refidx_3dim) + + helper(2, 8, 4, 5, torch.float32) + helper(2, 8, 4, 5, torch.int32) + # helper(2, 8, 4, 5, torch.int64) + + def test_any(self): + def helper(shape): + input_xs = [] + prod = 1 + + for i in range(len(shape)): + prod *= shape[i] + input_xs.append(torch.randn(prod, dtype=torch.float).reshape(shape)) + input_xs.append(torch.arange(0, prod, dtype=torch.float).reshape(shape)) + input_xs.append(torch.ones(prod, dtype=torch.float).reshape(shape)) + input_xs.append(torch.zeros(prod, dtype=torch.float).reshape(shape)) + input_xs.append(torch.arange(0, prod, dtype=torch.int).reshape(shape)) + input_xs.append(torch.ones(prod, dtype=torch.int).reshape(shape)) + input_xs.append(torch.zeros(prod, dtype=torch.int).reshape(shape)) + input_xs.append(torch.arange(0, prod, dtype=torch.int).reshape(shape).bool()) + input_xs.append(torch.ones(prod, dtype=torch.int).reshape(shape).bool()) + input_xs.append(torch.zeros(prod, dtype=torch.int).reshape(shape).bool()) + + for i, cpu_x in enumerate(input_xs): + x = cpu_x.detach().clone().to('mps') + y = torch.any(x) + ref_y = torch.any(cpu_x) + self.assertEqual(y, ref_y) + + y_0 = torch.any(x, dim=0) + refy_0 = torch.any(cpu_x, dim=0) + self.assertEqual(y_0, refy_0) + + y_0dim = torch.any(x, dim=0, keepdim=True) + refy_0dim = torch.any(cpu_x, dim=0, keepdim=True) + self.assertEqual(y_0dim, refy_0dim) + + y_0dim = torch.any(x, dim=0, keepdim=True) + refy_0dim = torch.any(cpu_x, dim=0, keepdim=True) + self.assertEqual(y_0dim, refy_0dim) + + y_1 = torch.any(x, dim=1) + refy_1 = torch.any(cpu_x, dim=1) + self.assertEqual(y_1, refy_1) + + y_1dim = torch.any(x, dim=1, keepdim=True) + refy_1dim = torch.any(cpu_x, dim=1, keepdim=True) + self.assertEqual(y_1dim, refy_1dim) + + if (len(shape) > 2): + y_2 = torch.any(x, dim=2) + refy_2 = torch.any(cpu_x, dim=2) + 
self.assertEqual(y_2, refy_2) + + y_2dim = torch.any(x, dim=2, keepdim=True) + refy_2dim = torch.any(cpu_x, dim=2, keepdim=True) + self.assertEqual(y_2dim, refy_2dim) + + y_3 = torch.any(x, dim=3) + refy_3 = torch.any(cpu_x, dim=3) + self.assertEqual(y_3, refy_3) + + y_3dim = torch.any(x, dim=3, keepdim=True) + refy_3dim = torch.any(cpu_x, dim=3, keepdim=True) + self.assertEqual(y_3dim, refy_3dim) + helper((1, 1, 1, 1)) + helper((1, 1, 3, 3)) + helper((7, 13)) + helper((2, 8, 4, 5)) + + def test_all(self): + def helper(shape): + input_xs = [] + prod = 1 + + for i in range(len(shape)): + prod *= shape[i] + input_xs.append(torch.randn(prod, dtype=torch.float).reshape(shape)) + input_xs.append(torch.arange(0, prod, dtype=torch.float).reshape(shape)) + input_xs.append(torch.ones(prod, dtype=torch.float).reshape(shape)) + input_xs.append(torch.zeros(prod, dtype=torch.float).reshape(shape)) + input_xs.append(torch.arange(0, prod, dtype=torch.int).reshape(shape)) + input_xs.append(torch.ones(prod, dtype=torch.int).reshape(shape)) + input_xs.append(torch.zeros(prod, dtype=torch.int).reshape(shape)) + input_xs.append(torch.arange(0, prod, dtype=torch.int).reshape(shape).bool()) + input_xs.append(torch.ones(prod, dtype=torch.int).reshape(shape).bool()) + input_xs.append(torch.zeros(prod, dtype=torch.int).reshape(shape).bool()) + + for i, cpu_x in enumerate(input_xs): + x = cpu_x.detach().clone().to('mps') + y = torch.all(x) + ref_y = torch.all(cpu_x) + self.assertEqual(y, ref_y) + + y_0 = torch.all(x, dim=0) + refy_0 = torch.all(cpu_x, dim=0) + self.assertEqual(y_0, refy_0) + + y_0dim = torch.all(x, dim=0, keepdim=True) + refy_0dim = torch.all(cpu_x, dim=0, keepdim=True) + self.assertEqual(y_0dim, refy_0dim) + + y_0dim = torch.all(x, dim=0, keepdim=True) + refy_0dim = torch.all(cpu_x, dim=0, keepdim=True) + self.assertEqual(y_0dim, refy_0dim) + + y_1 = torch.all(x, dim=1) + refy_1 = torch.all(cpu_x, dim=1) + self.assertEqual(y_1, refy_1) + + y_1dim = torch.all(x, dim=1, keepdim=True) + refy_1dim = torch.all(cpu_x, dim=1, keepdim=True) + self.assertEqual(y_1dim, refy_1dim) + if (len(shape) > 2): + y_2 = torch.all(x, dim=2) + refy_2 = torch.all(cpu_x, dim=2) + self.assertEqual(y_2, refy_2) + + y_2dim = torch.all(x, dim=2, keepdim=True) + refy_2dim = torch.all(cpu_x, dim=2, keepdim=True) + self.assertEqual(y_2dim, refy_2dim) + + y_3 = torch.all(x, dim=3) + refy_3 = torch.all(cpu_x, dim=3) + self.assertEqual(y_3, refy_3) + + y_3dim = torch.all(x, dim=3, keepdim=True) + refy_3dim = torch.all(cpu_x, dim=3, keepdim=True) + self.assertEqual(y_3dim, refy_3dim) + + helper((1, 1, 1, 1)) + helper((1, 1, 3, 3)) + helper((7, 13)) + helper((2, 8, 4, 5)) + + # Test forward min + def test_min_el(self): + def helper(n, c, h, w): + cpu_x = torch.randn(n, c, h, w, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + y = torch.min(x) + ref_y = torch.min(cpu_x) + self.assertEqual(y, ref_y) + + y_0, idx_0 = torch.min(x, dim=0) + refy_0, refidx_0 = torch.min(cpu_x, dim=0) + self.assertEqual(y_0, refy_0) + self.assertEqual(idx_0, refidx_0) + + y_0 = torch.ones(c, h, w, device='mps', dtype=torch.float) + idx_0 = torch.ones(c, h, w, device='mps', dtype=torch.int64) + torch.min(x, dim=0, out=(y_0, idx_0)) + refy_0, refidx_0 = torch.min(cpu_x, dim=0) + self.assertEqual(y_0, refy_0) + self.assertEqual(idx_0, refidx_0) + + y_0dim, idx_0dim = torch.min(x, dim=0, keepdim=True) + refy_0dim, refidx_0dim = torch.min(cpu_x, dim=0, keepdim=True) + self.assertEqual(y_0dim, refy_0dim) + 
self.assertEqual(idx_0dim, refidx_0dim) + + y_0dim = torch.ones(1, c, h, w, device='mps', dtype=torch.float) + idx_0dim = torch.ones(1, c, h, w, device='mps', dtype=torch.int64) + torch.min(x, dim=0, keepdim=True, out=(y_0dim, idx_0dim)) + refy_0dim, refidx_0dim = torch.min(cpu_x, dim=0, keepdim=True) + self.assertEqual(y_0dim, refy_0dim) + self.assertEqual(idx_0dim, refidx_0dim) + + y_1, idx_1 = torch.min(x, dim=1) + refy_1, refidx_1 = torch.min(cpu_x, dim=1) + self.assertEqual(y_1, refy_1) + self.assertEqual(idx_1, refidx_1) + + y_1 = torch.ones(n, h, w, device='mps', dtype=torch.float) + idx_1 = torch.ones(n, h, w, device='mps', dtype=torch.int64) + torch.min(x, dim=1, out=(y_1, idx_1)) + refy_1, refidx_1 = torch.min(cpu_x, dim=1) + self.assertEqual(y_1, refy_1) + self.assertEqual(idx_1, refidx_1) + + y_1dim, idx_1dim = torch.min(x, dim=1, keepdim=True) + refy_1dim, refidx_1dim = torch.min(cpu_x, dim=1, keepdim=True) + self.assertEqual(y_1dim, refy_1dim) + self.assertEqual(idx_1dim, refidx_1dim) + + y_1dim = torch.ones(n, 1, h, w, device='mps', dtype=torch.float) + idx_1dim = torch.ones(n, 1, h, w, device='mps', dtype=torch.int64) + torch.min(x, dim=1, keepdim=True, out=(y_1dim, idx_1dim)) + refy_1dim, refidx_1dim = torch.min(cpu_x, keepdim=True, dim=1) + self.assertEqual(y_1dim, refy_1dim) + self.assertEqual(idx_1dim, refidx_1dim) + + y_2, idx_2 = torch.min(x, dim=2) + refy_2, refidx_2 = torch.min(cpu_x, dim=2) + self.assertEqual(y_2, refy_2) + self.assertEqual(idx_2, refidx_2) + + y_2 = torch.ones(n, c, w, device='mps', dtype=torch.float) + idx_2 = torch.ones(n, c, w, device='mps', dtype=torch.int64) + torch.min(x, dim=2, out=(y_2, idx_2)) + refy_2, refidx_2 = torch.min(cpu_x, dim=2) + self.assertEqual(y_2, refy_2) + self.assertEqual(idx_2, refidx_2) + + y_2dim, idx_2dim = torch.min(x, dim=2, keepdim=True) + refy_2dim, refidx_2dim = torch.min(cpu_x, dim=2, keepdim=True) + self.assertEqual(y_2dim, refy_2dim) + self.assertEqual(idx_2dim, refidx_2dim) + + y_2dim = torch.ones(n, c, 1, w, device='mps', dtype=torch.float) + idx_2dim = torch.ones(n, c, 1, w, device='mps', dtype=torch.int64) + torch.min(x, dim=2, keepdim=True, out=(y_2dim, idx_2dim)) + refy_2dim, refidx_2dim = torch.min(cpu_x, dim=2, keepdim=True,) + self.assertEqual(y_2dim, refy_2dim) + self.assertEqual(idx_2dim, refidx_2dim) + + y_3, idx_3 = torch.min(x, dim=3) + refy_3, refidx_3 = torch.min(cpu_x, dim=3) + self.assertEqual(y_3, refy_3) + self.assertEqual(idx_3, refidx_3) + + y_3 = torch.ones(n, c, h, device='mps', dtype=torch.float) + idx_3 = torch.ones(n, c, h, device='mps', dtype=torch.int64) + torch.min(x, dim=3, out=(y_3, idx_3)) + refy_3, refidx_3 = torch.min(cpu_x, dim=3) + self.assertEqual(y_3, refy_3) + self.assertEqual(idx_3, refidx_3) + + y_3dim, idx_3dim = torch.min(x, dim=3, keepdim=True) + refy_3dim, refidx_3dim = torch.min(cpu_x, dim=3, keepdim=True) + self.assertEqual(y_3dim, refy_3dim) + self.assertEqual(idx_3dim, refidx_3dim) + + y_3dim = torch.ones(n, c, h, 1, device='mps', dtype=torch.float) + idx_3dim = torch.ones(n, c, h, 1, device='mps', dtype=torch.int64) + torch.min(x, dim=3, keepdim=True, out=(y_3dim, idx_3dim)) + refy_3dim, refidx_3dim = torch.min(cpu_x, dim=3, keepdim=True,) + self.assertEqual(y_3dim, refy_3dim) + self.assertEqual(idx_3dim, refidx_3dim) + + helper(2, 8, 4, 5) + + # Test forward sum + def test_sum(self): + def helper(n, c, h, w, dtype=torch.float32): + cpu_x = None + x = None + if(dtype not in [torch.float32, torch.bool]): + cpu_x = torch.randint(50, (n, c, h, w), device='cpu', 
dtype=dtype, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + elif (dtype == torch.bool): + cpu_x = torch.randint(2, (n, c, h, w), device='cpu', dtype=dtype, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + else: + cpu_x = torch.randn(n, c, h, w, device='cpu', dtype=dtype, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + all_sum = torch.sum(x) + all_sum_cpu = torch.sum(cpu_x) + + self.assertEqual(all_sum, all_sum_cpu) + + nil_dim_sum = torch.sum(x, dim=[]) + nil_dim_sum_cpu = torch.sum(cpu_x, dim=[]) + + self.assertEqual(nil_dim_sum, nil_dim_sum_cpu) + + nil_dim_sum_keepdim = torch.sum(x, dim=[], keepdim=True) + nil_dim_sum_cpu_keepdim = torch.sum(cpu_x, dim=[], keepdim=True) + + self.assertEqual(nil_dim_sum_keepdim, nil_dim_sum_cpu_keepdim) + + zero_dim_sum = torch.sum(x, dim=[0]) + zero_dim_sum_cpu = torch.sum(cpu_x, dim=[0]) + + self.assertEqual(zero_dim_sum, zero_dim_sum_cpu) + + zero_dim_sum_keepdim = torch.sum(x, dim=[0], keepdim=True) + zero_dim_sum_cpu_keepdim = torch.sum(cpu_x, dim=[0], keepdim=True) + + self.assertEqual(zero_dim_sum_keepdim, zero_dim_sum_cpu_keepdim) + + zero_one_dim_sum = torch.sum(x, dim=[0, 1]) + zero_one_dim_sum_cpu = torch.sum(cpu_x, dim=[0, 1]) + + self.assertEqual(zero_one_dim_sum, zero_one_dim_sum_cpu) + + zero_one_dim_sum_keepdim = torch.sum(x, dim=[0, 1], keepdim=True) + zero_one_dim_sum_cpu_keepdim = torch.sum(cpu_x, dim=[0, 1], keepdim=True) + + self.assertEqual(zero_one_dim_sum_keepdim, zero_one_dim_sum_cpu_keepdim) + + two_three_dim_sum = torch.sum(x, dim=[2, 3]) + two_three_dim_sum_cpu = torch.sum(cpu_x, dim=[2, 3]) + + self.assertEqual(two_three_dim_sum, two_three_dim_sum_cpu) + + two_three_keepdim_sum = torch.sum(x, dim=[2, 3], keepdim=True) + two_three_dim_keepsum_cpu = torch.sum(cpu_x, dim=[2, 3], keepdim=True) + + self.assertEqual(two_three_keepdim_sum, two_three_dim_keepsum_cpu) + + helper(2, 8, 4, 5) + helper(2, 8, 4, 5, dtype=torch.int32) + helper(2, 8, 4, 5, dtype=torch.int64) + helper(2, 8, 4, 5, dtype=torch.bool) + + # Test forward prod + def test_prod(self): + def helper(shape, dtype=torch.float32): + cpu_x = None + x = None + if(dtype not in [torch.float32, torch.bool]): + cpu_x = torch.randint(1, 6, shape, device='cpu', dtype=dtype, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + elif (dtype == torch.bool): + cpu_x = torch.randint(2, shape, device='cpu', dtype=dtype, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + else: + cpu_x = torch.randn(shape, device='cpu', dtype=dtype, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + all_prod = torch.prod(x) + all_prod_cpu = torch.prod(cpu_x) + + self.assertEqual(all_prod, all_prod_cpu) + + for dim in range(len(shape)): + dim_prod = torch.prod(x, dim=dim) + dim_prod_cpu = torch.prod(cpu_x, dim=dim) + + self.assertEqual(dim_prod, dim_prod_cpu) + + dim_prod_keepdim = torch.prod(x, dim=dim, keepdim=True) + dim_prod_cpu_keepdim = torch.prod(cpu_x, dim=dim, keepdim=True) + + self.assertEqual(dim_prod_keepdim, dim_prod_cpu_keepdim) + + for dtype in [torch.float32, torch.int32, torch.int64, torch.bool]: + helper((2, 3), dtype) + + # Test forward mean + def test_mean(self): + def helper(n, c, h, w): + cpu_x = torch.randn(n, c, h, w, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + all_mean = torch.mean(x) + all_mean_cpu = torch.mean(cpu_x) + + self.assertEqual(all_mean, all_mean_cpu) + + nil_dim_mean = torch.mean(x, dim=[]) + 
nil_dim_mean_cpu = torch.mean(cpu_x, dim=[]) + + self.assertEqual(nil_dim_mean, nil_dim_mean_cpu) + + nil_dim_mean_keepdim = torch.mean(x, dim=[], keepdim=True) + nil_dim_mean_cpu_keepdim = torch.mean(cpu_x, dim=[], keepdim=True) + + self.assertEqual(nil_dim_mean_keepdim, nil_dim_mean_cpu_keepdim) + + zero_dim_mean = torch.mean(x, dim=[0]) + zero_dim_mean_cpu = torch.mean(cpu_x, dim=[0]) + + self.assertEqual(zero_dim_mean, zero_dim_mean_cpu) + + zero_dim_mean_keepdim = torch.mean(x, dim=[0], keepdim=True) + zero_dim_mean_cpu_keepdim = torch.mean(cpu_x, dim=[0], keepdim=True) + + self.assertEqual(zero_dim_mean_keepdim, zero_dim_mean_cpu_keepdim) + + zero_one_dim_mean = torch.mean(x, dim=[0, 1]) + zero_one_dim_mean_cpu = torch.mean(cpu_x, dim=[0, 1]) + + self.assertEqual(zero_one_dim_mean, zero_one_dim_mean_cpu) + + zero_one_dim_mean_keepdim = torch.mean(x, dim=[0, 1], keepdim=True) + zero_one_dim_mean_cpu_keepdim = torch.mean(cpu_x, dim=[0, 1], keepdim=True) + + self.assertEqual(zero_one_dim_mean_keepdim, zero_one_dim_mean_cpu_keepdim) + + two_three_dim_mean = torch.mean(x, dim=[2, 3]) + two_three_dim_mean_cpu = torch.mean(cpu_x, dim=[2, 3]) + + self.assertEqual(two_three_dim_mean, two_three_dim_mean_cpu) + + two_three_keepdim_mean = torch.mean(x, dim=[2, 3], keepdim=True) + two_three_dim_keepmean_cpu = torch.mean(cpu_x, dim=[2, 3], keepdim=True) + + self.assertEqual(two_three_keepdim_mean, two_three_dim_keepmean_cpu) + + helper(2, 8, 4, 5) + + # Test std + def test_std(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + all_std = torch.std(x, unbiased=False) + all_std_cpu = torch.std(cpu_x, unbiased=False) + + self.assertEqual(all_std, all_std_cpu) + + nil_dim_std = torch.std(x, dim=[], unbiased=False) + nil_dim_std_cpu = torch.std(cpu_x, dim=[], unbiased=False) + + self.assertEqual(nil_dim_std, nil_dim_std_cpu) + + nil_dim_std_keepdim = torch.std(x, dim=[], keepdim=True, unbiased=False) + nil_dim_std_cpu_keepdim = torch.std(cpu_x, dim=[], keepdim=True, unbiased=False) + + self.assertEqual(nil_dim_std_keepdim, nil_dim_std_cpu_keepdim) + + zero_dim_std = torch.std(x, dim=[0], unbiased=False) + zero_dim_std_cpu = torch.std(cpu_x, dim=[0], unbiased=False) + + self.assertEqual(zero_dim_std, zero_dim_std_cpu) + + zero_dim_std_keepdim = torch.std(x, dim=[0], keepdim=True, unbiased=False) + zero_dim_std_cpu_keepdim = torch.std(cpu_x, dim=[0], keepdim=True, unbiased=False) + + self.assertEqual(zero_dim_std_keepdim, zero_dim_std_cpu_keepdim) + + zero_one_dim_std = torch.std(x, dim=[0, 1], unbiased=False) + zero_one_dim_std_cpu = torch.std(cpu_x, dim=[0, 1], unbiased=False) + + self.assertEqual(zero_one_dim_std, zero_one_dim_std_cpu) + + zero_one_dim_std_keepdim = torch.std(x, dim=[0, 1], keepdim=True, unbiased=False) + zero_one_dim_std_cpu_keepdim = torch.std(cpu_x, dim=[0, 1], keepdim=True, unbiased=False) + + self.assertEqual(zero_one_dim_std_keepdim, zero_one_dim_std_cpu_keepdim) + + two_three_dim_std = torch.std(x, dim=[2, 3], unbiased=False) + two_three_dim_std_cpu = torch.std(cpu_x, dim=[2, 3], unbiased=False) + + self.assertEqual(two_three_dim_std, two_three_dim_std_cpu) + + two_three_keepdim_std = torch.std(x, dim=[2, 3], keepdim=True, unbiased=False) + two_three_dim_keepstd_cpu = torch.std(cpu_x, dim=[2, 3], keepdim=True, unbiased=False) + + self.assertEqual(two_three_keepdim_std, two_three_dim_keepstd_cpu) + + all_std = torch.std(x, unbiased=True) + all_std_cpu = torch.std(cpu_x, 
unbiased=True) + + self.assertEqual(all_std, all_std_cpu) + + nil_dim_std = torch.std(x, dim=[], unbiased=True) + nil_dim_std_cpu = torch.std(cpu_x, dim=[], unbiased=True) + + self.assertEqual(nil_dim_std, nil_dim_std_cpu) + + nil_dim_std_keepdim = torch.std(x, dim=[], keepdim=True, unbiased=True) + nil_dim_std_cpu_keepdim = torch.std(cpu_x, dim=[], keepdim=True, unbiased=True) + + self.assertEqual(nil_dim_std_keepdim, nil_dim_std_cpu_keepdim) + + zero_dim_std = torch.std(x, dim=[0], unbiased=True) + zero_dim_std_cpu = torch.std(cpu_x, dim=[0], unbiased=True) + + self.assertEqual(zero_dim_std, zero_dim_std_cpu) + + zero_dim_std_keepdim = torch.std(x, dim=[0], keepdim=True, unbiased=True) + zero_dim_std_cpu_keepdim = torch.std(cpu_x, dim=[0], keepdim=True, unbiased=True) + + self.assertEqual(zero_dim_std_keepdim, zero_dim_std_cpu_keepdim) + + zero_one_dim_std = torch.std(x, dim=[0, 1], unbiased=True) + zero_one_dim_std_cpu = torch.std(cpu_x, dim=[0, 1], unbiased=True) + + self.assertEqual(zero_one_dim_std, zero_one_dim_std_cpu) + + zero_one_dim_std_keepdim = torch.std(x, dim=[0, 1], keepdim=True, unbiased=True) + zero_one_dim_std_cpu_keepdim = torch.std(cpu_x, dim=[0, 1], keepdim=True, unbiased=True) + + self.assertEqual(zero_one_dim_std_keepdim, zero_one_dim_std_cpu_keepdim) + + two_three_dim_std = torch.std(x, dim=[2, 3], unbiased=True) + two_three_dim_std_cpu = torch.std(cpu_x, dim=[2, 3], unbiased=True) + + self.assertEqual(two_three_dim_std, two_three_dim_std_cpu) + + two_three_keepdim_std = torch.std(x, dim=[2, 3], keepdim=True, unbiased=True) + two_three_dim_keepstd_cpu = torch.std(cpu_x, dim=[2, 3], keepdim=True, unbiased=True) + + self.assertEqual(two_three_keepdim_std, two_three_dim_keepstd_cpu) + + helper((4, 5, 6, 7)) + + # Test var + def test_var(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + all_var = torch.var(x, unbiased=False) + all_var_cpu = torch.var(cpu_x, unbiased=False) + + self.assertEqual(all_var, all_var_cpu) + + nil_dim_var = torch.var(x, dim=[], unbiased=False) + nil_dim_var_cpu = torch.var(cpu_x, dim=[], unbiased=False) + + self.assertEqual(nil_dim_var, nil_dim_var_cpu) + + nil_dim_var_keepdim = torch.var(x, dim=[], keepdim=True, unbiased=False) + nil_dim_var_cpu_keepdim = torch.var(cpu_x, dim=[], keepdim=True, unbiased=False) + + self.assertEqual(nil_dim_var_keepdim, nil_dim_var_cpu_keepdim) + + zero_dim_var = torch.var(x, dim=[0], unbiased=False) + zero_dim_var_cpu = torch.var(cpu_x, dim=[0], unbiased=False) + + self.assertEqual(zero_dim_var, zero_dim_var_cpu) + + zero_dim_var_keepdim = torch.var(x, dim=[0], keepdim=True, unbiased=False) + zero_dim_var_cpu_keepdim = torch.var(cpu_x, dim=[0], keepdim=True, unbiased=False) + + self.assertEqual(zero_dim_var_keepdim, zero_dim_var_cpu_keepdim) + + zero_one_dim_var = torch.var(x, dim=[0, 1], unbiased=False) + zero_one_dim_var_cpu = torch.var(cpu_x, dim=[0, 1], unbiased=False) + + self.assertEqual(zero_one_dim_var, zero_one_dim_var_cpu) + + zero_one_dim_var_keepdim = torch.var(x, dim=[0, 1], keepdim=True, unbiased=False) + zero_one_dim_var_cpu_keepdim = torch.var(cpu_x, dim=[0, 1], keepdim=True, unbiased=False) + + self.assertEqual(zero_one_dim_var_keepdim, zero_one_dim_var_cpu_keepdim) + + two_three_dim_var = torch.var(x, dim=[2, 3], unbiased=False) + two_three_dim_var_cpu = torch.var(cpu_x, dim=[2, 3], unbiased=False) + + self.assertEqual(two_three_dim_var, two_three_dim_var_cpu) + + two_three_keepdim_var = 
torch.var(x, dim=[2, 3], keepdim=True, unbiased=False) + two_three_dim_keepvar_cpu = torch.var(cpu_x, dim=[2, 3], keepdim=True, unbiased=False) + + self.assertEqual(two_three_keepdim_var, two_three_dim_keepvar_cpu) + + all_var = torch.var(x, unbiased=True) + all_var_cpu = torch.var(cpu_x, unbiased=True) + + self.assertEqual(all_var, all_var_cpu) + + nil_dim_var = torch.var(x, dim=[], unbiased=True) + nil_dim_var_cpu = torch.var(cpu_x, dim=[], unbiased=True) + + self.assertEqual(nil_dim_var, nil_dim_var_cpu) + + nil_dim_var_keepdim = torch.var(x, dim=[], keepdim=True, unbiased=True) + nil_dim_var_cpu_keepdim = torch.var(cpu_x, dim=[], keepdim=True, unbiased=True) + + self.assertEqual(nil_dim_var_keepdim, nil_dim_var_cpu_keepdim) + + zero_dim_var = torch.var(x, dim=[0], unbiased=True) + zero_dim_var_cpu = torch.var(cpu_x, dim=[0], unbiased=True) + + self.assertEqual(zero_dim_var, zero_dim_var_cpu) + + zero_dim_var_keepdim = torch.var(x, dim=[0], keepdim=True, unbiased=True) + zero_dim_var_cpu_keepdim = torch.var(cpu_x, dim=[0], keepdim=True, unbiased=True) + + self.assertEqual(zero_dim_var_keepdim, zero_dim_var_cpu_keepdim) + + zero_one_dim_var = torch.var(x, dim=[0, 1], unbiased=True) + zero_one_dim_var_cpu = torch.var(cpu_x, dim=[0, 1], unbiased=True) + + self.assertEqual(zero_one_dim_var, zero_one_dim_var_cpu) + + zero_one_dim_var_keepdim = torch.var(x, dim=[0, 1], keepdim=True, unbiased=True) + zero_one_dim_var_cpu_keepdim = torch.var(cpu_x, dim=[0, 1], keepdim=True, unbiased=True) + + self.assertEqual(zero_one_dim_var_keepdim, zero_one_dim_var_cpu_keepdim) + + two_three_dim_var = torch.var(x, dim=[2, 3], unbiased=True) + two_three_dim_var_cpu = torch.var(cpu_x, dim=[2, 3], unbiased=True) + + self.assertEqual(two_three_dim_var, two_three_dim_var_cpu) + + two_three_keepdim_var = torch.var(x, dim=[2, 3], keepdim=True, unbiased=True) + two_three_dim_keepvar_cpu = torch.var(cpu_x, dim=[2, 3], keepdim=True, unbiased=True) + + self.assertEqual(two_three_keepdim_var, two_three_dim_keepvar_cpu) + + helper((4, 5, 6, 7)) + + # test norm_out + # CRASH in Fallback for svd_linalg op. 
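+    # Assuming the crash is limited to the SVD-backed 'nuc' (nuclear norm) path, a
+    # reduced parity check that sticks to vector norms could still run -- an
+    # illustrative sketch only, not part of the original test and untested on MPS:
+    #
+    #   cpu_x = torch.randn(4, 5, 6, 7)
+    #   x = cpu_x.detach().clone().to('mps')
+    #   for p_val in [1, 2, float('inf')]:
+    #       self.assertEqual(torch.norm(x, p=p_val), torch.norm(cpu_x, p=p_val))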
+ # def test_norm(self): + # def helper(shape): + # cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + # x = cpu_x.detach().clone().to('mps') + # p_vals = [ ] + # for i in range(-5, 6): + # p_vals.append(i) + # p_vals.append(random.uniform(5.1, 10.1)) + + # p_vals.append(float('inf')) + # p_vals.append(float('-inf')) + # p_vals.append('fro') + # p_vals.append('nuc') + # # ints + # for p_val in p_vals: + # if (p_val != 'nuc'): + # all_norm = torch.norm(x, p=p_val) + # all_norm_cpu = torch.norm(cpu_x, p=p_val) + + # self.assertEqual(all_norm, all_norm_cpu) + + # nil_dim_norm = torch.norm(x, dim=[], p=p_val) + # nil_dim_norm_cpu = torch.norm(cpu_x, dim=[], p=p_val) + + # self.assertEqual(nil_dim_norm, nil_dim_norm_cpu) + + # nil_dim_norm_keepdim = torch.norm(x, dim=[], keepdim=True, p=p_val) + # nil_dim_norm_cpu_keepdim = torch.norm(cpu_x, dim=[], keepdim=True, p=p_val) + + # self.assertEqual(nil_dim_norm_keepdim, nil_dim_norm_cpu_keepdim) + + # zero_dim_norm = torch.norm(x, dim=[0], p=p_val) + # zero_dim_norm_cpu = torch.norm(cpu_x, dim=[0], p=p_val) + + # self.assertEqual(zero_dim_norm, zero_dim_norm_cpu) + + # zero_dim_norm_keepdim = torch.norm(x, dim=[0], keepdim=True, p=p_val) + # zero_dim_norm_cpu_keepdim = torch.norm(cpu_x, dim=[0], keepdim=True, p=p_val) + + # self.assertEqual(zero_dim_norm_keepdim, zero_dim_norm_cpu_keepdim) + + # if (len(shape) > 1): + # zero_one_dim_norm = torch.norm(x, dim=[0, 1],p=p_val) + # zero_one_dim_norm_cpu = torch.norm(cpu_x, dim=[0, 1],p=p_val) + + # self.assertEqual(zero_one_dim_norm, zero_one_dim_norm_cpu) + + # zero_one_dim_norm_keepdim = torch.norm(x, dim=[0, 1], keepdim=True, p=p_val) + # zero_one_dim_norm_cpu_keepdim = torch.norm(cpu_x, dim=[0, 1], keepdim=True, p=p_val) + + # self.assertEqual(zero_one_dim_norm_keepdim, zero_one_dim_norm_cpu_keepdim) + + # zero_one_dim_norm = torch.norm(x, dim=[0, 1],p='fro') + + # if (len(shape) > 3): + # two_three_dim_norm = torch.norm(x, dim=[2,3], p=p_val) + # two_three_dim_norm_cpu = torch.norm(cpu_x, dim=[2,3], p=p_val) + + # self.assertEqual(two_three_dim_norm, two_three_dim_norm_cpu) + + # two_three_keepdim_norm = torch.norm(x, dim=[2,3], keepdim=True, p=p_val) + # two_three_dim_keepnorm_cpu = torch.norm(cpu_x, dim=[2, 3], keepdim=True,p=p_val) + + # self.assertEqual(two_three_keepdim_norm, two_three_dim_keepnorm_cpu) + + # helper((5, 1)) + # helper((5, 7)) + # helper((4, 5, 6, 7)) + + # Test minimum and maximum + def test_minimum_maximum(self): + def helper(n, c, h, w): + cpu_x = torch.randn(n, c, h, w, device='cpu', dtype=torch.float, requires_grad=False) + cpu_y = torch.randn(n, c, h, w, device='cpu', dtype=torch.float, requires_grad=False) + mps_x = cpu_x.detach().clone().to('mps') + mps_y = cpu_y.detach().clone().to('mps') + + minimum_result_cpu = torch.minimum(cpu_x, cpu_y) + minimum_result_mps = torch.minimum(mps_x, mps_y) + self.assertEqual(minimum_result_cpu, minimum_result_mps) + + maximum_result_cpu = torch.maximum(cpu_x, cpu_y) + maximum_result_mps = torch.maximum(mps_x, mps_y) + self.assertEqual(maximum_result_cpu, maximum_result_mps) + + helper(1, 1, 4, 5) + + # Test clamp_min + def test_clamp_min(self): + def helper(n, c, h, w): + cpu_x = torch.randn(n, c, h, w, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + cpu_min_t = torch.randn(n, c, h, w, device='cpu', dtype=torch.float, requires_grad=False) + min_t = cpu_min_t.detach().clone().to('mps') + + clamp_min_result = torch.clamp_min(x, min=5.0) + 
clamp_min_result_cpu = torch.clamp_min(cpu_x, min=5.0) + + self.assertEqual(clamp_min_result, clamp_min_result_cpu) + + clamp_min_t_result = torch.clamp_min(x, min=min_t) + clamp_min_t_result_cpu = torch.clamp_min(cpu_x, min=cpu_min_t) + + self.assertEqual(clamp_min_t_result, clamp_min_t_result_cpu) + + helper(2, 8, 4, 5) + + # Test clamp_max + + def test_clamp_max(self): + def helper(n, c, h, w): + cpu_x = torch.randn(n, c, h, w, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + cpu_max_t = torch.randn(n, c, h, w, device='cpu', dtype=torch.float, requires_grad=False) + max_t = cpu_max_t.detach().clone().to('mps') + + clamp_max_result = torch.clamp_max(x, max=100.0) + clamp_max_result_cpu = torch.clamp_max(cpu_x, max=100.0) + + self.assertEqual(clamp_max_result, clamp_max_result_cpu) + + clamp_max_t_result = torch.clamp_max(x, max=max_t) + clamp_max_t_result_cpu = torch.clamp_max(cpu_x, max=cpu_max_t) + + self.assertEqual(clamp_max_t_result, clamp_max_t_result_cpu) + + helper(2, 8, 4, 5) + + # Test clamp + def test_clamp(self): + def helper(n, c, h, w): + import numpy as np + upper_bound = 1000 + half_upper_bound = upper_bound / 2 + + # x=[0..1000) + x_arr = upper_bound * np.random.random_sample(size=(n, c, h, w)).astype(np.float32) + cpu_x = torch.tensor(x_arr, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + # x=[0..500) + min_arr = half_upper_bound * np.random.random_sample(size=(n, c, h, w)).astype(np.float32) + cpu_min_t = torch.tensor(min_arr, device='cpu', dtype=torch.float, requires_grad=False) + min_t = cpu_min_t.detach().clone().to('mps') + + # x=[500..1000), to ensure max's are greater than mins + max_arr = (half_upper_bound * np.random.random_sample(size=(n, c, h, w)).astype(np.float32)) + half_upper_bound + cpu_max_t = torch.tensor(max_arr, device='cpu', dtype=torch.float, requires_grad=False) + max_t = cpu_max_t.detach().clone().to('mps') + + # [200..600]: just an arbitrary range between [0..1000] + clamp_result = torch.clamp(x, min=200.0, max=600.0) + clamp_result_cpu = torch.clamp(cpu_x, min=200.0, max=600.0) + self.assertEqual(clamp_result, clamp_result_cpu) + + # test optional scalar refs and cached graph keys by passing only max + clamp_opt_result = torch.clamp(x, max=600.0) + clamp_opt_result_cpu = torch.clamp(cpu_x, max=600.0) + self.assertEqual(clamp_opt_result, clamp_opt_result_cpu) + + clamp_t_result = torch.clamp(x, min=min_t, max=max_t) + clamp_t_result_cpu = torch.clamp(cpu_x, min=cpu_min_t, max=cpu_max_t) + self.assertEqual(clamp_t_result, clamp_t_result_cpu) + + # test optional tensor refs and cached graph keys by passing only max + clamp_topt_result = torch.clamp(x, max=max_t) + clamp_topt_result_cpu = torch.clamp(cpu_x, max=cpu_max_t) + self.assertEqual(clamp_topt_result, clamp_topt_result_cpu) + + # test inplace clamping + x.clamp_(min=200.0, max=600.0) + cpu_x.clamp_(min=200.0, max=600.0) + self.assertEqual(cpu_x, x) + + helper(2, 8, 4, 5) + + def test_divmode(self): + def helper(shape, rounding_mode): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + mps_x = cpu_x.detach().clone().to('mps') + # clamp to avoid division by 0 + cpu_y = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False).clamp_min_(0.1) + mps_y = cpu_y.detach().clone().to('mps') + + result_div_cpu = torch.div(cpu_x, cpu_y, rounding_mode=rounding_mode) + result_div_mps = torch.div(mps_x, mps_y, rounding_mode=rounding_mode) + 
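+            # rounding_mode='floor' rounds the quotient toward negative infinity
+            # (Python-style floor division), while 'trunc' rounds toward zero; the
+            # MPS result should match the CPU reference elementwise in both cases.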
self.assertEqual(result_div_mps, result_div_cpu) + + helper((2, 8, 4, 5), "floor") + helper((2, 8, 4, 5), "trunc") + + def test_rounding(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + mps_x = cpu_x.detach().clone().to('mps') + + result_floor_cpu = torch.floor(cpu_x) + result_floor_mps = torch.floor(mps_x) + self.assertEqual(result_floor_mps, result_floor_cpu) + + result_ceil_cpu = torch.ceil(cpu_x) + result_ceil_mps = torch.ceil(mps_x) + self.assertEqual(result_ceil_mps, result_ceil_cpu) + + result_trunc_cpu = torch.trunc(cpu_x) + result_trunc_mps = torch.trunc(mps_x) + self.assertEqual(result_trunc_mps, result_trunc_cpu) + + result_round_cpu = torch.round(cpu_x) + result_round_mps = torch.round(mps_x) + self.assertEqual(result_round_mps, result_round_cpu) + + helper((2, 6, 3, 5)) + helper((2, 8, 4, 5)) + + def test_expand(self): + def helper(n, c): + values = [[1.0], [4.0], [7.0]] + cpu_x = torch.tensor(values, device='cpu') + x = cpu_x.detach().clone().to('mps') + + strided_cpu = torch.as_strided(cpu_x, (3, 4), (1, 0)) + strided_mps = torch.as_strided(x, (3, 4), (1, 0)) + + print(cpu_x) + print(strided_cpu) + + print(x.to('cpu')) + print(strided_mps.to('cpu')) + + print(strided_mps.size()) + print(strided_mps.stride()) + + self.assertEqual(strided_mps, strided_cpu) + + helper(3, 1) + + def test_select(self): + def helper(n, c): + cpu_x = torch.randn(n, c, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + strided_cpu = torch.as_strided(cpu_x, (3, 1), (3, 1)) + strided_mps = torch.as_strided(x, (3, 1), (3, 1)) + self.assertEqual(strided_mps, strided_cpu) + + strided_cpu = torch.as_strided(cpu_x, (1, 3), (3, 1)) + strided_mps = torch.as_strided(x, (1, 3), (3, 1)) + self.assertEqual(strided_mps, strided_cpu) + + strided_cpu = torch.as_strided(cpu_x, (3, 1), (3, 1), storage_offset=1) + strided_mps = torch.as_strided(x, (3, 1), (3, 1), storage_offset=1) + print(cpu_x) + print(strided_cpu) + + print(x.to('cpu')) + print(strided_mps.to('cpu')) + + self.assertEqual(strided_mps, strided_cpu) + + helper(3, 3) + + def test_topk(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + for largest_val in [True, False]: + if (type(shape) == tuple): + for curr_dim in range(0, len(shape)): + dim_size = shape[curr_dim] + for k in range(1, dim_size + 1): + topk_values, topk_indices = torch.topk(x, k, dim=curr_dim, largest=largest_val) + topk_values_cpu, topk_indices_cpu = torch.topk(cpu_x, k, dim=curr_dim, largest=largest_val) + self.assertEqual(topk_values, topk_values_cpu) + self.assertEqual(topk_indices, topk_indices_cpu) + else: + for k in range(1, shape): + topk_values, topk_indices = torch.topk(x, k, dim=0, largest=largest_val) + topk_values_cpu, topk_indices_cpu = torch.topk(cpu_x, k, dim=0, largest=largest_val) + self.assertEqual(topk_values, topk_values_cpu) + self.assertEqual(topk_indices, topk_indices_cpu) + + helper(2) + helper((5, 1)) + helper((1, 5)) + helper((5, 9, 7, 4)) + + def test_upsample_nearest_exact2d(self): + def helper(N, C, H, W): + inputCPU = torch.arange(N * C * H * W, device='cpu', dtype=torch.float, + requires_grad=True).reshape(N, C, H, W) + inputCPU.retain_grad() + inputMPS = inputCPU.detach().clone().to('mps').requires_grad_() + + outputCPU = torch.nn.functional.interpolate(inputCPU, size=(5, 5), mode='nearest-exact') + outputMPS = 
torch.nn.functional.interpolate(inputMPS, size=(5, 5), mode='nearest-exact') + + self.assertEqual(outputCPU, outputMPS) + + outputCPU.backward(gradient=torch.full_like(outputCPU, 0.3)) + outputMPS.backward(gradient=torch.full_like(outputMPS, 0.3)) + + self.assertEqual(inputCPU.grad, inputMPS.grad) + + helper(1, 1, 4, 4) + helper(7, 5, 3, 2) + + def test_upsample_nearest2d(self): + def helper(N, C, H, W): + inputCPU = torch.arange(N * C * H * W, device='cpu', dtype=torch.float, + requires_grad=True).reshape(N, C, H, W) + inputCPU.retain_grad() + inputMPS = inputCPU.detach().clone().to('mps').requires_grad_() + + x_max = 40 + y_max = 40 + + for i in range(1, x_max): + for j in range(1, y_max): + upsample_nearest2d = nn.UpsamplingNearest2d(scale_factor=(i, j)) + + outputCPU = upsample_nearest2d(inputCPU) + outputMPS = upsample_nearest2d(inputMPS) + + self.assertEqual(outputCPU, outputMPS) + upsample_nearest2d = nn.UpsamplingNearest2d((i * H, j * W)) + + outputCPU = upsample_nearest2d(inputCPU) + outputMPS = upsample_nearest2d(inputMPS) + + self.assertEqual(outputCPU, outputMPS) + + outputCPU.backward(gradient=torch.full_like(outputCPU, 0.3)) + outputMPS.backward(gradient=torch.full_like(outputMPS, 0.3)) + + self.assertEqual(inputCPU.grad, inputMPS.grad) + + helper(1, 1, 4, 4) + helper(7, 5, 3, 2) + + def test_upsample_bilinear2d(self): + def helper(N, C, H, W): + inputCPU = torch.arange(N * C * H * W, device='cpu', dtype=torch.float, + requires_grad=True).reshape(N, C, H, W) + inputCPU.retain_grad() + inputMPS = inputCPU.detach().clone().to('mps').requires_grad_() + + x_max = 40 + y_max = 40 + + for i in range(1, x_max): + for j in range(1, y_max): + upsample_bilinear2d = nn.UpsamplingBilinear2d(scale_factor=(i, j)) + + outputCPU = upsample_bilinear2d(inputCPU) + outputMPS = upsample_bilinear2d(inputMPS) + + self.assertEqual(outputCPU, outputMPS) + + upsample_bilinear2d = nn.UpsamplingBilinear2d((i * H, j * W)) + + outputCPU = upsample_bilinear2d(inputCPU) + outputMPS = upsample_bilinear2d(inputMPS) + + self.assertEqual(outputCPU, outputMPS) + + outputCPU.backward(gradient=torch.full_like(outputCPU, 0.3)) + outputMPS.backward(gradient=torch.full_like(outputMPS, 0.3)) + + self.assertEqual(inputCPU.grad, inputMPS.grad) + + helper(1, 1, 4, 4) + helper(7, 5, 3, 2) + + # Test concat forward + def test_cat1(self): + def helper(shape_x, shape_y, shape_z): + cpu_x = torch.randn(shape_x, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + cpu_y = torch.randn(shape_y, device='cpu', dtype=torch.float, requires_grad=False) + y = cpu_y.detach().clone().to('mps') + + cpu_z = torch.randn(shape_z, device='cpu', dtype=torch.float, requires_grad=False) + z = cpu_z.detach().clone().to('mps') + + cat = torch.cat([x, y, z], dim=1) + cat_cpu = torch.cat([cpu_x, cpu_y, cpu_z], dim=1) + + self.assertEqual(cat, cat_cpu) + + helper([2, 2, 4, 5], [2, 3, 4, 5], [2, 5, 4, 5]) + # Empty test - Currently failing! Empty tensor not handled! 
+ # helper([0, 2, 4, 5], [2, 0, 4, 5], [2, 5, 0, 5]) + + def test_pad(self): + def helper(shape, padding, op): + inputCPU = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + inputCPU.retain_grad() + inputMPS = inputCPU.detach().clone().to('mps').requires_grad_() + + padCriteria = op(padding) + outputCPU = padCriteria(inputCPU) + outputMPS = padCriteria(inputMPS) + self.assertEqual(outputCPU, outputMPS) + + # backward pass (chose 0.6 just to have the grad_output != 1) + outputCPU.backward(gradient=torch.full_like(outputCPU, 0.6)) + outputMPS.backward(gradient=torch.full_like(outputMPS, 0.6)) + self.assertEqual(inputCPU.grad, inputMPS.grad) + + # 1D Padding + helper((2, 4, 3), 2, nn.ReflectionPad1d) + # verify if a change in shape of input would cause problems with graph caching + helper((2, 4, 4), (1, 3), nn.ReflectionPad1d) + # Replication 1D + helper((2, 1, 6), 3, nn.ReplicationPad1d) + + # 2D Padding + helper((1, 2, 3, 4), (1, 1, 2, 0), nn.ReflectionPad2d) + # verify if a change in shape of input would cause problems with graph caching + helper((2, 4, 3, 4), (1, 1, 2, 0), nn.ReflectionPad2d) + # this should make the padding (2, 2, 2, 2) + helper((2, 1, 6, 8), 2, nn.ReplicationPad2d) + # verify if a change in shape of padding would cause problems with graph caching + helper((2, 1, 6, 8), (2, 4, 3, 5), nn.ReplicationPad2d) + + # 3D Padding + helper((2, 4, 6, 8, 4), (1, 3, 3, 5, 3, 4), nn.ReflectionPad3d) + # verify if a change in shape of padding would cause problems with graph caching + helper((2, 4, 6, 8, 4), (1, 3, 3, 5, 3, 4), nn.ReplicationPad3d) + + # Test stack forward + def test_stack(self): + # All shapes must be same + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + cpu_y = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + y = cpu_y.detach().clone().to('mps') + + cpu_z = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + z = cpu_z.detach().clone().to('mps') + + stack = torch.stack([x, y, z], dim=1) + stack_cpu = torch.stack([cpu_x, cpu_y, cpu_z], dim=1) + + self.assertEqual(stack, stack_cpu) + + helper([2, 8, 4, 5]) + # Empty test - Currently failing! Empty tensor not handled! 
+ # helper([0, 2, 4, 5]) + + # Test abs + def test_abs(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + abs_result = torch.abs(x) + abs_result_cpu = torch.abs(cpu_x) + + self.assertEqual(abs_result, abs_result_cpu) + + helper((2, 8, 4, 5)) + + def test_log(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + log_result = torch.log(x) + log_result_cpu = torch.log(cpu_x) + + self.assertEqual(log_result, log_result_cpu) + + helper((2, 8, 4, 5)) + + def test_log_ten(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + log_ten_result = torch.log10(x) + log_ten_result_cpu = torch.log10(cpu_x) + + self.assertEqual(log_ten_result, log_ten_result_cpu) + + helper((2, 8, 4, 5)) + + def test_log_two(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + log_two_result = torch.log2(x) + log_two_result_cpu = torch.log2(cpu_x) + + self.assertEqual(log_two_result, log_two_result_cpu) + + helper((2, 8, 4, 5)) + + def test_log1p(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + log_result = torch.log1p(x) + log_result_cpu = torch.log1p(cpu_x) + + self.assertEqual(log_result, log_result_cpu) + + helper((2, 8, 4, 5)) + + def test_logaddexp(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + cpu_y = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + y = cpu_y.detach().clone().to('mps') + + log_result = torch.logaddexp(x, y) + log_result_cpu = torch.logaddexp(cpu_x, cpu_y) + + self.assertEqual(log_result, log_result_cpu) + + helper((2, 8, 4, 5)) + + def test_logaddexp2(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + cpu_y = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + y = cpu_y.detach().clone().to('mps') + + log_result = torch.logaddexp2(x, y) + log_result_cpu = torch.logaddexp2(cpu_x, cpu_y) + + self.assertEqual(log_result, log_result_cpu) + + helper((2, 8, 4, 5)) + + # Test concat forward + def test_cat2(self): + + def helper1(shape_x, shape_y, shape_z, shape_w): + cpu_x = torch.randn(shape_x, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + cpu_y = torch.randn(shape_y, device='cpu', dtype=torch.float, requires_grad=False) + y = cpu_y.detach().clone().to('mps') + + cpu_z = torch.randn(shape_z, device='cpu', dtype=torch.float, requires_grad=False) + z = cpu_z.detach().clone().to('mps') + + cpu_w = torch.randn(shape_w, device='cpu', dtype=torch.float, requires_grad=False) + w = cpu_w.detach().clone().to('mps') + + cat = torch.cat([x, y, z, w], dim=1) + cat_cpu = torch.cat([cpu_x, cpu_y, cpu_z, cpu_w], dim=1) + + self.assertEqual(cat, cat_cpu) + + def helper(shape_x, shape_y, shape_z): + cpu_x = torch.randn(shape_x, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + cpu_y = torch.randn(shape_y, device='cpu', dtype=torch.float, requires_grad=False) + y = 
cpu_y.detach().clone().to('mps') + + cpu_z = torch.randn(shape_z, device='cpu', dtype=torch.float, requires_grad=False) + z = cpu_z.detach().clone().to('mps') + + cat = torch.cat([x, y, z], dim=1) + cat_cpu = torch.cat([cpu_x, cpu_y, cpu_z], dim=1) + + self.assertEqual(cat, cat_cpu) + + helper([2, 8, 4, 5], [2, 10, 4, 5], [2, 6, 4, 5]) + helper([2, 2, 4, 5], [2, 3, 4, 5], [2, 5, 4, 5]) + # Empty test - Currently failing! Empty tensor not handled! + # helper([0, 2, 4, 5], [2, 0, 4, 5], [2, 5, 0, 5]) + + # Test isnan + def test_isnan(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + nan_index = [random.randrange(0, shape[0])] + # make a selected row inf + cpu_x.index_put_(indices=[torch.tensor(nan_index)], values=torch.tensor(float('nan'))) + x = cpu_x.detach().clone().to('mps') + + isnan_result = torch.isnan(x) + isnan_result_cpu = torch.isnan(cpu_x) + + self.assertEqual(isnan_result, isnan_result_cpu) + + helper((8, 2, 4, 5)) + + # Test reciprocal + def test_reciprocal(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + reciprocal_result = torch.reciprocal(x) + reciprocal_result_cpu = torch.reciprocal(cpu_x) + + cpu_grad = torch.ones_like(reciprocal_result_cpu) + grad = cpu_grad.to('mps') + + reciprocal_result.backward(gradient=grad) + reciprocal_result_cpu.backward(gradient=cpu_grad) + + self.assertEqual(reciprocal_result, reciprocal_result_cpu) + self.assertEqual(x.grad, cpu_x.grad) + + helper((2, 8, 4, 5)) + + # Test sqrt + def test_sqrt(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + sqrt_result = torch.sqrt(x) + sqrt_result_cpu = torch.sqrt(cpu_x) + + cpu_grad = torch.ones_like(sqrt_result_cpu) + grad = cpu_grad.to('mps') + + sqrt_result.backward(gradient=grad) + sqrt_result_cpu.backward(gradient=cpu_grad) + + self.assertEqual(sqrt_result, sqrt_result_cpu) + self.assertEqual(x.grad, cpu_x.grad) + + helper((2, 8, 4, 5)) + + # Test selu, elu, celu + def test_elu(self): + def helper(shape, alpha=1.0): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + for activation_func in [torch.nn.ELU(alpha=alpha), torch.nn.CELU(alpha=alpha), torch.nn.SELU()]: + elu_result = activation_func(x) + elu_result_cpu = activation_func(cpu_x) + + cpu_grad = torch.randn(elu_result_cpu.shape) + grad = cpu_grad.to('mps') + + elu_result.backward(gradient=grad) + elu_result_cpu.backward(gradient=cpu_grad) + + self.assertEqual(elu_result, elu_result_cpu) + self.assertEqual(x.grad, cpu_x.grad) + + # Test empty shape too + for shape in [[], (2, 3), (2, 8, 4, 5)]: + for alpha in [0.000001, 1.0, 2.3, 0.34, 23]: + helper(shape, alpha) + # Test silu + + def test_silu(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + silu_result = torch.nn.SiLU()(x) + silu_result_cpu = torch.nn.SiLU()(cpu_x) + + cpu_grad = torch.randn(silu_result_cpu.shape) + grad = cpu_grad.to('mps') + + silu_result.backward(gradient=grad) + silu_result_cpu.backward(gradient=cpu_grad) + + self.assertEqual(silu_result, silu_result_cpu) + self.assertEqual(x.grad, cpu_x.grad) + + # Test empty shape too + for shape in [[], (2, 3), (2, 8, 4, 5)]: + 
helper(shape) + + # Test adaptive avg pool2d - when the input size is a multiple of output size + # Not testing for channels last right now + def test_adaptive_avg_pool2d_simple(self): + def helper(input_shape, out_shape, channels_last): + cpu_x = torch.randn(input_shape, device='cpu', dtype=torch.float, requires_grad=True) + if(channels_last): + cpu_x = cpu_x.to(memory_format=torch.channels_last) + cpu_x.retain_grad() + x = cpu_x.detach().clone().to('mps').requires_grad_() + + avg_result = torch.nn.AdaptiveAvgPool2d(out_shape)(x) + avg_result_cpu = torch.nn.AdaptiveAvgPool2d(out_shape)(cpu_x) + + cpu_grad = torch.randn(avg_result_cpu.shape) + grad = cpu_grad.to('mps') + + avg_result.backward(gradient=grad) + avg_result_cpu.backward(gradient=cpu_grad) + + self.assertEqual(avg_result, avg_result_cpu) + self.assertEqual(x.grad, cpu_x.grad) + + helper((2, 2, 4, 4), (2, 2), False) + helper((2, 2, 9, 9), (3, 3), False) + helper((2, 2, 9, 9), (9, 9), False) + helper((2, 2, 16, 16), (2, 2), False) + helper((2, 2, 16, 16), (2, 16), False) + + helper((2, 16, 16), (4, 4), False) + + def test_gelu_simple(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + gelu_result = torch.nn.GELU()(x) + gelu_result_cpu = torch.nn.GELU()(cpu_x) + + cpu_grad = torch.ones_like(gelu_result_cpu) + grad = cpu_grad.to('mps') + + gelu_result.backward(gradient=grad) + gelu_result_cpu.backward(gradient=cpu_grad) + + self.assertEqual(gelu_result, gelu_result_cpu) + self.assertEqual(x.grad, cpu_x.grad) + + # Test empty shape too + for shape in [(0, 3), [], (2, 3), (2, 8, 4, 5)]: + helper(shape) + + # Test hardtanh + def test_hardtanh(self): + def helper(shape, min_val, max_val, inplace=False): + cpu_x = None + x = None + + if(not inplace): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + else: + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + hardtanh_result = torch.nn.Hardtanh(min_val=min_val, max_val=max_val, inplace=inplace)(x) + hardtanh_result_cpu = torch.nn.Hardtanh(min_val=min_val, max_val=max_val, inplace=inplace)(cpu_x) + + self.assertEqual(hardtanh_result, hardtanh_result_cpu) + + if(not inplace): + cpu_grad = torch.randn(hardtanh_result_cpu.shape) + grad = cpu_grad.to('mps') + hardtanh_result.backward(gradient=grad) + hardtanh_result_cpu.backward(gradient=cpu_grad) + self.assertEqual(x.grad, cpu_x.grad) + + # Test empty shape too + for shape in [(0, 3), [], (2, 3), (2, 8, 4, 5)]: + for min_val, max_val in zip([-1, -2, 3], [1, -1, 4]): + helper(shape, min_val, max_val) + helper(shape, min_val, max_val, inplace=True) + + # Test sign + def test_sign(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + sign_result = torch.sign(x) + sign_result_cpu = torch.sign(cpu_x) + + cpu_grad = torch.ones_like(sign_result_cpu) + grad = cpu_grad.to('mps') + + sign_result.backward(gradient=grad) + sign_result_cpu.backward(gradient=cpu_grad) + + self.assertEqual(sign_result, sign_result_cpu) + + helper((2, 8, 4, 5)) + + # Test neg + def test_neg(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + neg_result = torch.neg(x) + 
neg_result_cpu = torch.neg(cpu_x) + + cpu_grad = torch.ones_like(neg_result_cpu) + grad = cpu_grad.to('mps') + + neg_result.backward(gradient=grad) + neg_result_cpu.backward(gradient=cpu_grad) + + self.assertEqual(neg_result, neg_result_cpu) + + helper((2, 8, 4, 5)) + + # Test index select + def test_index_select(self): + def helper(shape, dim, index, idx_dtype=torch.int32): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + cpu_idx = torch.tensor(index, device='cpu', dtype=idx_dtype) + idx = cpu_idx.detach().clone().to('mps') + + print(cpu_idx.shape) + + idx_result = torch.index_select(x, dim=dim, index=idx) + idx_result_cpu = torch.index_select(cpu_x, dim=dim, index=cpu_idx) + + self.assertEqual(idx_result, idx_result_cpu) + + helper((2, 8, 4, 5), 0, [1]) + helper((8, 8, 4, 5), 0, [0, 3, 2, 7, 6]) + helper((2, 8, 4, 5), 1, [0, 3, 2, 7, 6]) + helper((2, 8, 4, 5), 2, [3, 0, 1]) + helper((2, 8, 4, 5), 3, [2, 3, 0]) + helper((2, 3, 3), -1, [1, 2]) + + def test_embedding_dense_backward(self): + def helper(n, d, m): + embeddingMPS = nn.Embedding(n, d, max_norm=True, device='mps') + W_MPS = torch.randn((m, d), requires_grad=True, device='mps') + idx_MPS = torch.tensor([0, 1, 2]).to('mps') + a_MPS = embeddingMPS.weight.clone() @ W_MPS.t() # weight must be cloned for this to be differentiable + a_MPS.retain_grad() + b_MPS = embeddingMPS(idx_MPS) @ W_MPS.t() # modifies weight in-place + b_MPS.retain_grad() + out_MPS = (a_MPS.unsqueeze(0) + b_MPS.unsqueeze(1)) + loss_MPS = out_MPS.sigmoid().prod() + loss_MPS.backward() + + embeddingCPU = nn.Embedding(n, d, max_norm=True, scale_grad_by_freq=True) + W_CPU = W_MPS.to('cpu') + idx_CPU = torch.tensor([0, 1, 2]) + a_CPU = embeddingCPU.weight.clone() @ W_CPU.t() # weight must be cloned for this to be differentiable + a_CPU.retain_grad() + b_CPU = embeddingCPU(idx_CPU) @ W_CPU.t() # modifies weight in-place + b_CPU.retain_grad() + out_CPU = (a_CPU.unsqueeze(0) + b_CPU.unsqueeze(1)) + loss_CPU = out_CPU.sigmoid().prod() + loss_CPU.backward() + + self.assertEqual(b_CPU.grad, b_MPS.grad) + self.assertEqual(a_CPU.grad, a_MPS.grad) + + helper(3, 5, 7) + + # Test pytorch gather + def test_gather(self): + def helper(shape, dim, idx_shape, idx_dtype=torch.int64): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + # Indices should be taken from range of axis along which gathering is done + idx_np = np.random.randint(0, shape[dim], idx_shape) + + cpu_idx = torch.tensor(idx_np, device='cpu', dtype=idx_dtype) + idx = cpu_idx.detach().clone().to('mps') + + gather_result = torch.gather(x, dim=dim, index=idx) + gather_result_cpu = torch.gather(cpu_x, dim=dim, index=cpu_idx) + + cpu_grad = torch.randn(idx_shape, device='cpu', dtype=torch.float) + grad = cpu_grad.to('mps') + gather_result.backward(gradient=grad) + gather_result_cpu.backward(gradient=cpu_grad) + + self.assertEqual(gather_result, gather_result_cpu) + self.assertEqual(cpu_x.grad, x.grad) + + helper((6, 3, 3), 0, (3, 3, 3)) + helper((2, 3, 3, 3), 0, (10, 3, 3, 3)) + helper((2, 8, 4, 5), 0, (10, 8, 4, 5)) + helper((2, 8, 4, 5), 0, (10, 6, 3, 2)) + helper((8, 8, 4, 5), 0, (6, 8, 4, 5)) + helper((8, 8, 4, 5), 0, (6, 7, 2, 3)) + helper((2, 8, 4, 5), 1, (2, 5, 3, 4)) + helper((2, 8, 4, 5), 2, (1, 8, 10, 3)) + helper((2, 8, 4, 5), 3, (2, 5, 3, 12)) + + # Test pytorch scatter_add and scatter + def test_scatter_add(self): + def helper(shape, dim, 
idx_shape, src_shape, idx_dtype=torch.int64, do_add=True): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + cpu_src = torch.randn(src_shape, device='cpu', dtype=torch.float, requires_grad=True) + src = cpu_src.detach().clone().to('mps').requires_grad_() + + # Indices should be taken from range of axis along which gathering is done + idx_np = None + if(do_add): + idx_np = np.random.randint(0, shape[dim], idx_shape) + else: + idx_np = np.array([[0, 1, 2], + [1, 2, 3], + [2, 3, 4], + [3, 4, 5], + [4, 5, 6]]) + + cpu_idx = torch.tensor(idx_np, device='cpu', dtype=idx_dtype) + idx = cpu_idx.detach().clone().to('mps') + + scatter_result = None + scatter_result_cpu = None + + if(do_add): + scatter_result = torch.scatter_add(x, dim=dim, index=idx, src=src) + scatter_result_cpu = torch.scatter_add(cpu_x, dim=dim, index=cpu_idx, src=cpu_src) + else: + scatter_result = torch.scatter(x, dim=dim, index=idx, src=src) + scatter_result_cpu = torch.scatter(cpu_x, dim=dim, index=cpu_idx, src=cpu_src) + + cpu_grad = None + grad = None + + if(idx_shape == src_shape): + cpu_grad = torch.randn(shape, device='cpu', dtype=torch.float) + grad = cpu_grad.to('mps') + scatter_result.backward(gradient=grad) + scatter_result_cpu.backward(gradient=cpu_grad) + + self.assertEqual(scatter_result, scatter_result_cpu) + if(idx_shape == src_shape): + self.assertEqual(cpu_x.grad, x.grad) + self.assertEqual(cpu_src.grad, src.grad) + + helper((2, 3), 0, (5, 3), (5, 3)) + helper((2, 8, 4, 5), 0, (10, 8, 4, 5), (10, 8, 4, 5)) + helper((8, 8, 4, 5), 0, (10, 8, 4, 5), (10, 8, 4, 5)) + helper((8, 8, 4, 5), 0, (4, 7, 3, 2), (4, 7, 3, 2)) + helper((8, 8, 4, 5), 0, (4, 6, 3, 2), (4, 7, 3, 2)) + helper((8, 8, 4, 5), 0, (4, 6, 3, 2), (8, 8, 4, 5)) + + helper((2, 8, 4, 5), 1, (2, 20, 4, 5), (2, 20, 4, 5)) + helper((2, 8, 4, 5), 1, (2, 13, 3, 2), (2, 13, 3, 2)) + helper((8, 8, 4, 5), 1, (6, 5, 2, 3), (6, 5, 2, 3)) + helper((8, 8, 4, 5), 1, (3, 4, 2, 2), (6, 5, 2, 3)) + + helper((4, 5, 9, 8), 2, (4, 5, 13, 8), (4, 5, 13, 8)) + helper((4, 5, 9, 8), 2, (3, 4, 10, 6), (3, 4, 10, 6)) + helper((4, 5, 9, 8), 2, (3, 3, 7, 5), (3, 4, 10, 6)) + + # Test scatter src + helper((8, 3), 0, (5, 3), (5, 3), do_add=False) + helper((10, 3), 0, (5, 3), (5, 8), do_add=False) + + # Test pytorch scatter_reduce + def test_scatter_reduce(self): + def helper(shape, dim, idx_shape, src_shape, idx_dtype=torch.int64, reduce_str="sum"): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + cpu_src = torch.randn(src_shape, device='cpu', dtype=torch.float, requires_grad=True) + src = cpu_src.detach().clone().to('mps').requires_grad_() + + # Indices should be taken from range of axis along which gathering is done + idx_np = np.random.randint(0, shape[dim], idx_shape) + + cpu_idx = torch.tensor(idx_np, device='cpu', dtype=idx_dtype) + idx = cpu_idx.detach().clone().to('mps') + + scatter_result = torch.scatter(x, dim=dim, index=idx, src=src, reduce=reduce_str) + scatter_result_cpu = torch.scatter(cpu_x, dim=dim, index=cpu_idx, src=cpu_src, reduce=reduce_str) + + self.assertEqual(scatter_result, scatter_result_cpu) + + # for reduce in ["sum", "prod", "amax", "amin"]: + for reduce in ["add", "multiply"]: + helper((2, 3), 0, (5, 3), (5, 3), reduce_str=reduce) + helper((2, 8, 4, 5), 0, (10, 8, 4, 5), (10, 8, 4, 5), reduce_str=reduce) + helper((8, 8, 4, 5), 0, (10, 8, 4, 5), (10, 8, 4, 5), 
reduce_str=reduce) + helper((8, 8, 4, 5), 0, (4, 7, 3, 2), (4, 7, 3, 2), reduce_str=reduce) + helper((8, 8, 4, 5), 0, (4, 6, 3, 2), (4, 7, 3, 2), reduce_str=reduce) + helper((8, 8, 4, 5), 0, (4, 6, 3, 2), (8, 8, 4, 5), reduce_str=reduce) + + helper((2, 8, 4, 5), 1, (2, 20, 4, 5), (2, 20, 4, 5), reduce_str=reduce) + helper((2, 8, 4, 5), 1, (2, 13, 3, 2), (2, 13, 3, 2), reduce_str=reduce) + helper((8, 8, 4, 5), 1, (6, 5, 2, 3), (6, 5, 2, 3), reduce_str=reduce) + helper((8, 8, 4, 5), 1, (3, 4, 2, 2), (6, 5, 2, 3), reduce_str=reduce) + + helper((4, 5, 9, 8), 2, (4, 5, 13, 8), (4, 5, 13, 8), reduce_str=reduce) + helper((4, 5, 9, 8), 2, (3, 4, 10, 6), (3, 4, 10, 6), reduce_str=reduce) + helper((4, 5, 9, 8), 2, (3, 3, 7, 5), (3, 4, 10, 6), reduce_str=reduce) + + def test_is_nonzero(self): + self.assertFalse(torch.is_nonzero(torch.tensor([0.]).to('mps'))) + self.assertTrue(torch.is_nonzero(torch.tensor([1.5]).to('mps'))) + self.assertFalse(torch.is_nonzero(torch.tensor([False]).to('mps'))) + self.assertTrue(torch.is_nonzero(torch.tensor([3]).to('mps'))) + + # Test triu + def test_triu(self): + def helper(shape, diag=0): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + triu_result = torch.triu(x, diag) + triu_result_cpu = torch.triu(cpu_x, diag) + + cpu_grad = torch.randn(triu_result_cpu.shape) + grad = cpu_grad.to('mps') + + triu_result.backward(gradient=grad) + triu_result_cpu.backward(gradient=cpu_grad) + + self.assertEqual(triu_result, triu_result_cpu) + self.assertEqual(x.grad, cpu_x.grad) + + helper((2, 8, 4, 5)) + helper((2, 8, 4, 5), diag=1) + helper((2, 8, 4, 5), diag=2) + helper((2, 8, 4, 5), diag=3) + helper((2, 8, 4, 5), diag=-1) + helper((2, 8, 4, 5), diag=-2) + helper((2, 8, 4, 5), diag=-3) + + # Test tril + def test_tril(self): + def helper(shape, diag=0): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + tril_result = torch.tril(x, diag) + tril_result_cpu = torch.tril(cpu_x, diag) + + cpu_grad = torch.randn(tril_result_cpu.shape) + grad = cpu_grad.to('mps') + + tril_result.backward(gradient=grad) + tril_result_cpu.backward(gradient=cpu_grad) + + self.assertEqual(tril_result, tril_result_cpu) + self.assertEqual(x.grad, cpu_x.grad) + + helper((2, 8, 4, 5)) + helper((2, 8, 4, 5), diag=1) + helper((2, 8, 4, 5), diag=2) + helper((2, 8, 4, 5), diag=3) + helper((2, 8, 4, 5), diag=-1) + helper((2, 8, 4, 5), diag=-2) + helper((2, 8, 4, 5), diag=-3) + + # Test diag + def test_diag(self): + def helper(shape, diag=0): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + diag_result = torch.diag(x, diag) + diag_result_cpu = torch.diag(cpu_x, diag) + + # cpu_grad = torch.randn(diag_result_cpu.shape) + # grad = cpu_grad.to('mps') + + # diag_result.backward(gradient=grad) + # diag_result_cpu.backward(gradient=cpu_grad) + + self.assertEqual(diag_result, diag_result_cpu) + # self.assertEqual(x.grad, cpu_x.grad) + + for shape in [(5, 5), (5, 6), (6, 5), (5,), (6,)]: + for diag in [0, 1, 2, 3, 4, -1, -2, -3, -4]: + helper(shape, diag=diag) + + # Test softmax + def test_softmax(self): + def helper(shape, dim, channels_last=False): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + if(channels_last): + cpu_x = cpu_x.to(memory_format=torch.channels_last) + cpu_x.retain_grad() + x = 
cpu_x.detach().clone().to('mps').requires_grad_() + + softmax_result = torch.nn.functional.softmax(x, dim=dim) + softmax_result_cpu = torch.nn.functional.softmax(cpu_x, dim=dim) + + # Currently NOT testing backward for channels last backward + cpu_grad = None + grad = None + + if(not channels_last): + cpu_grad = torch.randn(shape, device='cpu', dtype=torch.float) + grad = cpu_grad.to('mps') + + softmax_result.backward(gradient=grad) + softmax_result_cpu.backward(gradient=cpu_grad) + + self.assertEqual(softmax_result, softmax_result_cpu) + if(not channels_last): + self.assertEqual(x.grad, cpu_x.grad) + + def helper2(dim): + cpu_x = torch.tensor(1.23, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + softmax_result = torch.nn.functional.softmax(x, dim=dim) + softmax_result_cpu = torch.nn.functional.softmax(cpu_x, dim=dim) + + cpu_grad = torch.tensor(2.34, device='cpu', dtype=torch.float) + grad = cpu_grad.to('mps') + + softmax_result.backward(gradient=grad) + softmax_result_cpu.backward(gradient=cpu_grad) + + self.assertEqual(softmax_result, softmax_result_cpu) + self.assertEqual(x.grad, cpu_x.grad) + + helper2(0) + + for channels_last in [False, True]: + for shape in [(2, 4, 8, 5), (3, 4, 6, 7, 2)]: + if(len(shape) != 4 and channels_last): + continue + for dim in [0, 1, 2, 3, -1, -2, -3]: + helper(shape, dim, channels_last) + + # Test sub + def test_sub(self): + def helper(shape, alpha): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + cpu_y = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + y = cpu_y.detach().clone().to('mps') + + cpu_out = torch.sub(cpu_x, cpu_y, alpha=alpha) + out = torch.sub(x, y, alpha=alpha) + + self.assertEqual(out, cpu_out) + + helper((2, 8, 4, 5), 0.1) + helper((2, 8, 3, 5), 0.1) + helper((2, 8, 3, 5), 0.2) + + # Test where + def test_where(self): + def helper(shape, x_shape, y_shape, cond_dtype=torch.bool, x_dtype=torch.float): + + cpu_cond = torch.randint(2, shape, device='cpu', dtype=cond_dtype, requires_grad=False) + cond = cpu_cond.detach().clone().to('mps') + + cpu_x = torch.randn(x_shape, device='cpu', dtype=x_dtype, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + cpu_y = torch.randn(y_shape, device='cpu', dtype=x_dtype, requires_grad=True) + y = cpu_y.detach().clone().to('mps').requires_grad_() + + cpu_out = torch.where(cpu_cond, cpu_x, cpu_y) + out = torch.where(cond, x, y) + + cpu_grad = torch.randn(cpu_out.shape) + grad = cpu_grad.to('mps') + + cpu_out.backward(gradient=cpu_grad) + out.backward(gradient=grad) + + self.assertEqual(out, cpu_out) + self.assertEqual(x.grad, cpu_x.grad) + self.assertEqual(y.grad, cpu_y.grad) + + for shape in ([(0, 3), [], (2, 3), (9,)]): + helper(shape, shape, shape) + + helper((2, 3, 1), (2, 3, 4), (2, 1, 4)) + helper((2, 1, 1), (2, 3, 4), (1, 3, 4)) + helper((1, 1, 1), (1, 1, 4), (2, 3, 1)) + helper([], (1, 1, 4), (2, 3, 1)) + helper([], (2, 3, 4), []) + + # Test normal + def test_normal(self): + def helper(shape, mean=0.0, std=1.0): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + mps_out = torch.normal(mean, std, shape, device='mps') + + # print(mps_out.to('cpu')) + print(mps_out.to('cpu').mean()) + print(mps_out.to('cpu').std()) + + mean_array = np.ones(shape) + mean_array *= mean + cpu_mean_tensor = torch.tensor(mean_array, device='cpu', 
dtype=torch.float, requires_grad=False) + mean_tensor = cpu_mean_tensor.detach().clone().to('mps') + + std_array = np.ones(shape) + std_array *= std + cpu_std_tensor = torch.tensor(std_array, device='cpu', dtype=torch.float, requires_grad=False) + std_tensor = cpu_std_tensor.detach().clone().to('mps') + + mps_out = torch.zeros(shape, device='mps') + torch.normal(mean_tensor, std, out=mps_out) + print(mps_out.to('cpu').mean()) + print(mps_out.to('cpu').std()) + + mps_out = torch.zeros(shape, device='mps') + torch.normal(mean, std_tensor, out=mps_out) + print(mps_out.to('cpu').mean()) + print(mps_out.to('cpu').std()) + + mps_out = torch.zeros(shape, device='mps') + torch.normal(mean_tensor, std_tensor, out=mps_out) + print(mps_out.to('cpu').mean()) + print(mps_out.to('cpu').std()) + + helper((2, 3, 4, 5, 6)) + helper((100, 100), 2.5, 1.2) + + def test_bernoulli(self): + def helper(shape, prob=0.5): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + prob_array = np.ones(shape) + prob_array *= prob + cpu_prob_tensor = torch.tensor(prob_array, device='cpu', dtype=torch.float, requires_grad=False) + prob_tensor = cpu_prob_tensor.detach().clone().to('mps') + + mps_out = torch.bernoulli(prob_tensor) + # Compare "real" with theoretical values + print(mps_out.to('cpu').mean(), prob) + print(mps_out.to('cpu').std() ** 2, prob * (1 - prob)) + + mps_out = torch.zeros(shape, device='mps') + mps_out = torch.bernoulli(mps_out, prob) + + print(mps_out.to('cpu').mean(), prob) + print(mps_out.to('cpu').std() ** 2, prob * (1 - prob)) + + helper((100, 100), 0.50) + helper((100, 100), 0.76) + helper((100, 100), 0.23) + + # Test random_.to and random_.from + def test_random(self): + def helper(shape, low, high, dtype=torch.int32): + + print(low, high) + mps_out = torch.randint(low, high, shape, dtype=dtype, device='mps') + + print(mps_out.to('cpu').float().mean(), (low + (high - 1)) / 2.) + print(mps_out.to('cpu').float().std() ** 2, ((high - low)**2 - 1) / 12.) 
+ + helper([100, 100], 0, 10) + helper([100, 100], 23, 89) + helper([100, 100], 23, 89, dtype=torch.float32) + helper([100, 100], 23, 89, dtype=torch.int64) + helper([100, 100], 0, 2, dtype=torch.bool) + + # Test add + def test_add_binary_op(self): + def helper(shape, alpha): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + cpu_y = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + y = cpu_y.detach().clone().to('mps') + + cpu_out = torch.add(cpu_x, cpu_y, alpha=alpha) + out = torch.add(x, y, alpha=alpha) + + self.assertEqual(out, cpu_out) + + helper((2, 8, 4, 5), 0.1) + helper((2, 8, 3, 5), 0.1) + helper((2, 8, 3, 5), 0.2) + + # Test add + def test_add_scalars(self): + def helper(alpha=1.0): + cpu_x = torch.tensor(2.3, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + cpu_y = torch.tensor(3.4, device='cpu', dtype=torch.float, requires_grad=False) + y = cpu_y.detach().clone().to('mps') + + cpu_out = torch.add(cpu_x, cpu_y, alpha=alpha) + out = torch.add(x, y, alpha=alpha) + + print(out.to('cpu')) + + self.assertEqual(out, cpu_out) + + helper() + helper(0.1) + helper(0.2) + + def test_atan2(self): + def helper(shape): + input_cpu = torch.randn(shape) + input_mps = input_cpu.detach().clone().to("mps") + + other_cpu = torch.randn(shape) + other_mps = other_cpu.detach().clone().to("mps") + + atan2_cpu = torch.atan2(input_cpu, other_cpu) + atan2_mps = torch.atan2(input_mps, other_mps) + + self.assertEqual(atan2_cpu, atan2_mps.to("cpu")) + + helper(4) + helper(10000) + helper((10000, 40)) + + +class TestNNMPS(NNTestCase): + + def _create_basic_net(self): + class Layer(nn.Module): + def __init__(self): + super(Layer, self).__init__() + self.layer_dummy_param = Parameter(torch.empty(3, 5)) + self.register_buffer('layer_dummy_buf', torch.zeros(1, 3, 3, 7)) + + class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.l1 = Layer() + self.dummy_param = Parameter(torch.empty(3, 5)) + self.register_buffer('dummy_buf', torch.zeros(7, 3, 3, 1)) + + l = Layer() + n = Net() + s = nn.Sequential(n, n) + + return l, n, s + + def test_requires_grad_(self): + m = self._create_basic_net()[-1] + assert len(list(m.buffers())) > 0, 'invalid test' + assert all(not b.requires_grad for b in m.buffers()) > 0, 'invalid test' + assert len(list(m.parameters())) > 0, 'invalid test' + assert all(p.requires_grad for p in m.parameters()) > 0, 'invalid test' + for requires_grad in (False, True): + self.assertIs(m.requires_grad_(requires_grad), m) + for p in m.parameters(): + self.assertEqual(p.requires_grad, requires_grad) + for b in m.buffers(): + self.assertFalse(b.requires_grad) + + def test_module_backcompat(self): + from torch.serialization import SourceChangeWarning + path = download_file('https://download.pytorch.org/test_data/linear.pt') + with warnings.catch_warnings(): + warnings.simplefilter('ignore', SourceChangeWarning) + m = torch.load(path) + input = torch.randn(2, 3, dtype=torch.float) + self.assertEqual(m(input).size(), (2, 5)) + + def test_conv_backcompat(self): + from torch.serialization import SourceChangeWarning + # This file was generated by running on PyTorch 1.0.1 on Python 2: + # + # import torch + # from torch import nn + # m = nn.Conv2d(1, 1, 1) + # torch.save(m, 'legacy_conv2d.pt') + # + # NB: This Pickle also contains some Unicode data! 
+ path = download_file('https://download.pytorch.org/test_data/legacy_conv2d.pt') + with warnings.catch_warnings(): + warnings.simplefilter('ignore', SourceChangeWarning) + m = torch.load(path, encoding='utf-8') + input = torch.randn((1, 1, 1, 1), dtype=torch.float) + self.assertEqual(m(input).size(), (1, 1, 1, 1)) + + def test_zero_grad(self): + i = torch.randn(2, 5, requires_grad=True) + module = nn.Linear(5, 5) + for p in module.parameters(): + p.requires_grad = False + module.zero_grad() + + module.weight.requires_grad = True + module.zero_grad() + self.assertIsNone(module.weight.grad) # uninitialized grad + + module(i).sum().backward() + self.assertIsNotNone(module.weight.grad) + self.assertGreater(module.weight.grad.data.abs().sum(), 0) + module.zero_grad() + self.assertEqual(module.weight.grad.data, module.weight.data.clone().zero_()) + + module.bias.requires_grad = True + module.zero_grad() + self.assertIsNotNone(module.weight.grad) + self.assertIsNone(module.bias.grad) + module(i).sum().backward() + self.assertIsNotNone(module.weight.grad) + self.assertIsNotNone(module.bias.grad) + self.assertGreater(module.weight.grad.data.abs().sum(), 0) + self.assertGreater(module.bias.grad.data.abs().sum(), 0) + module.zero_grad() + self.assertEqual(module.weight.grad.data, module.weight.data.clone().zero_()) + self.assertEqual(module.bias.grad.data, module.bias.data.clone().zero_()) + + # Force set to None. + module.zero_grad(set_to_none=True) + self.assertIsNone(module.weight.grad) + + def test_no_grad(self): + for dtype in [torch.bfloat16, torch.float, torch.double]: + module = nn.Conv2d(2, 5, kernel_size=3, padding=1).to(dtype) + input = torch.randn(1, 2, 10, 10).to(dtype) + x = input + y = input.clone() + + output = module(x) + self.assertTrue(output.requires_grad) + output.backward(torch.ones(1, 5, 10, 10)) + + with torch.no_grad(): + output2 = module(y) + self.assertFalse(output2.requires_grad) + self.assertRaises(RuntimeError, lambda: output2.backward(torch.ones(1, 5, 10, 10))) + + def test_invalid_conv1d(self): + for dtype in [torch.bfloat16, torch.float, torch.double]: + module = nn.Conv1d(in_channels=3, out_channels=33, kernel_size=10, stride=1, bias=True).to(dtype) + input = torch.randn(1, 3, 4).to(dtype) + with self.assertRaisesRegex(RuntimeError, + r'Calculated padded input size per channel: \(4\). ' + + r'Kernel size: \(10\). Kernel size can\'t be greater than actual input size'): + module(input) + + # Negative stride check + module = nn.Conv1d(in_channels=3, out_channels=6, kernel_size=3, stride=-1, bias=True).to(dtype) + input = torch.randn(1, 3, 4).to(dtype) + with self.assertRaisesRegex(RuntimeError, 'non-positive stride is not supported'): + module(input) + + def test_conv2d_discontiguous_weight(self): + # Test for https://github.com/pytorch/pytorch/issues/55781 + x = torch.ones(64, 16, 16, 16) + weight = torch.arange(0, 1.0, 1 / 2.0 ** 10).reshape(32, 16, 1, 2)[:, :, :, ::2] + self.assertFalse(weight.is_contiguous()) + y = torch.nn.functional.conv2d(x, weight, None) + if torch.backends.mkldnn.is_available(): + # Disable MKLDNN explicitly, so that either NNPACK or THCNN will be used + with torch.backends.mkldnn.flags(enabled=False): + y_ = torch.nn.functional.conv2d(x, weight, None) + self.assertEqual(y, y_) + self.assertEqual(y.sum(), 4186112.) 
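For reference, the contiguity behaviour exercised by test_conv2d_discontiguous_weight above can be reproduced with a few lines of plain PyTorch. The sketch below is illustrative only (it is not part of the diff and needs no MPS device): step slicing yields a non-contiguous view, and .contiguous() materializes a dense copy.

import torch

w = torch.arange(16.0).reshape(2, 2, 1, 4)
view = w[:, :, :, ::2]           # step slicing reuses the original storage, so the view is not contiguous
print(view.is_contiguous())      # False
dense = view.contiguous()        # copies the data into a fresh, densely laid out buffer
print(dense.is_contiguous())     # True
print(torch.equal(view, dense))  # True: same values, different memory layout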
+ + def test_invalid_conv2d(self): + for dtype in [torch.bfloat16, torch.float, torch.double]: + module = torch.nn.Conv2d(1, 1, kernel_size=3, dilation=2, stride=2).to(dtype) + input = torch.empty(1, 1, 4, 4).to(dtype) + self.assertRaises(RuntimeError, lambda: module(input)) + + module = nn.Conv2d(in_channels=3, out_channels=33, kernel_size=10, stride=1, bias=True) + input = torch.randn(1, 3, 1, 1) + with self.assertRaisesRegex(RuntimeError, + r'Calculated padded input size per channel: \(1 x 1\). ' + + r'Kernel size: \(10 x 10\). Kernel size can\'t be greater than actual input size'): + module(input) + + # Negative stride check + module = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=4, stride=-1, bias=True).to(dtype) + input = torch.randn(1, 3, 4, 4).to(dtype) + with self.assertRaisesRegex(RuntimeError, 'non-positive stride is not supported'): + module(input) + + # Zero stride check + module = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=4, stride=0, bias=True).to(dtype) + input = torch.randn(1, 3, 4, 4).to(dtype) + with self.assertRaisesRegex(RuntimeError, 'non-positive stride is not supported'): + module(input) + + def test_conv2d_valid_padding(self, device='mps'): + # Test F.conv2d padding='valid' is the same as no padding + x = torch.rand(1, 1, 1, 10, device=device).to(torch.float) + y = torch.rand(1, 1, 1, 4, device=device).to(torch.float) + + expect = F.conv2d(x, y) + actual = F.conv2d(x, y, padding='valid') + self.assertEqual(expect.to('cpu'), actual.to('cpu')) + + # def test_conv2d_same_padding(self, device='mps'): + # x = torch.rand(1, 1, 10, 11, device=device) + # y = torch.rand(1, 1, 4, 5, device=device) + # expect = F.conv2d(x, y, padding=(2, 2))[..., 1:, :] + # actual = F.conv2d(x, y, padding='same') + # self.assertEqual(expect.to('cpu'), actual.to('cpu')) + + # # With dilation + # y = torch.rand(1, 1, 3, 4, device=device) + # expect = F.conv2d(x, y, padding=(2, 3), dilation=2) + # actual = F.conv2d(x, y, padding='same', dilation=2) + # self.assertEqual(expect, actual) + + # # Dilation with asymmetric padding + # y = torch.rand(1, 1, 4, 4, device=device) + # expect = F.conv2d(x, y, padding=5, dilation=3)[..., 1:, 1:] + # actual = F.conv2d(x, y, padding='same', dilation=3) + # self.assertEqual(expect, actual) + + +class TestConstantPadNd(TestCase): + def test_preserves_memory_format(self): + nchw_tensor = torch.rand((1, 2, 5, 3)) + nchw_padded = torch.constant_pad_nd(nchw_tensor, [1, 2], 0.5) + self.assertTrue(nchw_padded.is_contiguous(memory_format=torch.contiguous_format)) + + nhwc_tensor = nchw_tensor.contiguous(memory_format=torch.channels_last) + nhwc_padded = torch.constant_pad_nd(nhwc_tensor, [1, 2], 0.5) + self.assertTrue(nhwc_padded.is_contiguous(memory_format=torch.channels_last)) + + +class TestLinalgMPS(TestCase): + def _test_addmm_addmv(self, f, t, m, v, *, alpha=None, beta=None, transpose_out=False): + dtype = t.dtype + numpy_dtype = dtype + alpha = 1.2 if alpha is None else alpha + beta = 0.8 if beta is None else beta + res1 = f(t, m, v, alpha=alpha, beta=beta) + res2 = torch.full_like(res1, math.nan) + if transpose_out: + res2 = res2.t().clone(memory_format=torch.contiguous_format).t() + f(t, m, v, alpha=alpha, beta=beta, out=res2) + res3 = alpha * (m.to(numpy_dtype).cpu().numpy() @ v.to(numpy_dtype).cpu().numpy()) + if beta != 0: + res3 += (torch.mul(t, beta)).to(numpy_dtype).cpu().numpy() + res3 = torch.from_numpy(res3).to(dtype) + self.assertEqual(res1, res2) + self.assertEqual(res1, res3) + + def test_addmm(self, device="mps", 
dtype=torch.float32): + M = torch.randn(10, 25, device=device).to(dtype) + m1 = torch.randn(10, 50, device=device).to(dtype) + m2 = torch.randn(50, 25, device=device).to(dtype) + self._test_addmm_addmv(torch.addmm, M, m1, m2) + + # Test beta=0, M=nan + M = torch.full((10, 25), math.nan, device=device).to(dtype) + m1 = torch.randn(10, 50, device=device).to(dtype) + m2 = torch.randn(50, 25, device=device).to(dtype) + self._test_addmm_addmv(torch.addmm, M, m1, m2, beta=0) + + # Test transpose + for t1, t2, t3, t4 in itertools.product([True, False], repeat=4): + def maybe_transpose(cond, m): + if not cond: + return m + return m.t().clone(memory_format=torch.contiguous_format).t() + + M = maybe_transpose(t1, torch.randn(10, 25, device=device).to(dtype)) + m1 = maybe_transpose(t2, torch.randn(10, 50, device=device).to(dtype)) + m2 = maybe_transpose(t3, torch.randn(50, 25, device=device).to(dtype)) + self._test_addmm_addmv(torch.addmm, M, m1, m2, transpose_out=t4) + + +class TestRNNMPS(TestCase): + def test_lstm_1(self, device="mps", dtype=torch.float32): + + rnn = nn.LSTM(1, 4, 2, device="cpu") + input = torch.randn(2, 3, 1, device="cpu") + hx = torch.zeros(2, 3, 4, device="cpu") + cx = torch.zeros(2, 3, 4, device="cpu") + outputs = [] + for device in [torch.device("cpu"), torch.device("mps")]: + rnn = rnn.to(device) + input = input.to(device) + hx = hx.to(device) + cx = cx.to(device) + weight_list = [] + output, _ = rnn(input, (hx, cx)) + print(output.to('cpu')) + + def test_lstm_2(self, device="mps", dtype=torch.float32): + rnn = nn.LSTM(1, 4, 1, device="cpu") + input = torch.randn(2, 3, 1, device="cpu", requires_grad=True) + hx = torch.zeros(1, 3, 4, device="cpu") + cx = torch.zeros(1, 3, 4, device="cpu") + outputs = [] + for device in [torch.device("cpu"), torch.device("mps")]: + rnn = rnn.to(device) + input = input.to(device) + input.retain_grad() + hx = hx.to(device) + cx = cx.to(device) + + output, _ = rnn(input, (hx, cx)) + # Test by passing ones as the gradient from the loss. + output.backward(torch.ones_like(output)) + + print(rnn.weight_ih_l0.grad) + # Gradient on GPU is 2x the CPU gradient??? + + +if __name__ == "__main__": + run_tests() diff --git a/test/test_multiprocessing.py b/test/test_multiprocessing.py index fc5fadcc20b8..5b939afd998f 100644 --- a/test/test_multiprocessing.py +++ b/test/test_multiprocessing.py @@ -15,7 +15,7 @@ import torch.utils.hooks from torch.nn import Parameter from torch.testing._internal.common_utils import (TestCase, run_tests, IS_WINDOWS, NO_MULTIPROCESSING_SPAWN, TEST_WITH_ASAN, - load_tests, slowTest, TEST_WITH_TSAN) + load_tests, slowTest, TEST_WITH_TSAN, TEST_WITH_ROCM) # load_tests from common_utils is used to automatically filter tests for # sharding on sandcastle. 
This line silences flake warnings @@ -258,7 +258,7 @@ def test_fill(): self.assertTrue(e.is_set()) self.assertTrue(data[0].eq(4).all()) self.assertTrue(data[1].eq(4).all()) - p.join(1) + p.join(100) self.assertFalse(p.is_alive()) def test_receive(): @@ -280,7 +280,7 @@ def test_receive(): # collect them properly del t1, t2 e.set() - p.join(1) + p.join(100) self.assertFalse(p.is_alive()) with leak_checker(self) as lc: @@ -383,7 +383,12 @@ def test_inherit_tensor(self): def test_autograd_errors(self): ctx = mp.get_context('fork') simple_autograd_function() - with self.assertRaisesRegex(RuntimeError, r'Unable to handle autograd'): + # Autograd only uses thread when GPUs are involved + if torch.cuda.is_available() or torch.backends.mps.is_available(): + with self.assertRaisesRegex(RuntimeError, r'Unable to handle autograd'): + with ctx.Pool(3) as pool: + pool.map(simple_autograd_function, [1, 2, 3]) + else: with ctx.Pool(3) as pool: pool.map(simple_autograd_function, [1, 2, 3]) @@ -585,6 +590,7 @@ def _test_event_multiprocess_child(event, p2c, c2p): @unittest.skipIf(NO_MULTIPROCESSING_SPAWN, "Disabled for environments that \ don't support multiprocessing with spawn start method") @unittest.skipIf(not TEST_CUDA_IPC, 'CUDA IPC not available') + @unittest.skipIf(TEST_WITH_ROCM, 'Skip the test for ROCm') def test_event_multiprocess(self): event = torch.cuda.Event(enable_timing=False, interprocess=True) self.assertTrue(event.query()) @@ -643,6 +649,7 @@ def _test_event_handle_importer_consumer(handle, p2c, c2p): @unittest.skipIf(NO_MULTIPROCESSING_SPAWN, "Disabled for environments that \ don't support multiprocessing with spawn start method") @unittest.skipIf(not TEST_CUDA_IPC, 'CUDA IPC not available') + @unittest.skipIf(TEST_WITH_ROCM, 'Skip the test for ROCm') def test_event_handle_importer(self): e0 = torch.cuda.Event(enable_timing=False, interprocess=True) self.assertTrue(e0.query()) @@ -682,6 +689,7 @@ def _test_event_handle_exporter_consumer(handle, p2c, c2p): @unittest.skipIf(NO_MULTIPROCESSING_SPAWN, "Disabled for environments that \ don't support multiprocessing with spawn start method") @unittest.skipIf(not TEST_CUDA_IPC, 'CUDA IPC not available') + @unittest.skipIf(TEST_WITH_ROCM, 'Skip the test for ROCm') def test_event_handle_exporter(self): e0 = torch.cuda.Event(enable_timing=False, interprocess=True) @@ -748,7 +756,7 @@ def hook(*unused): self.assertEqual(var.data, torch.ones(5, 5, device=device)) self.assertEqual(var.grad.data, torch.ones(5, 5, device=device) * 4) - p.join(1) + p.join(100) self.assertFalse(p.is_alive()) # Check sharing a cudaMalloc allocation with different types of storage. 
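The conditional added to test_autograd_errors above hinges on whether an accelerator backend is present: per the comment in the hunk, autograd only uses worker threads when a GPU (CUDA or MPS) is involved, so the expected error is raised only in that case. As a small illustrative sketch (not part of the diff), the availability check it relies on can be written as:

import torch

# Hedged sketch: mirrors the availability condition used in the modified
# test_autograd_errors. The hasattr guard keeps it working on builds that
# predate the MPS backend.
has_accelerator = torch.cuda.is_available() or (
    hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
)
print("accelerator backend present:", has_accelerator)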
diff --git a/test/test_namedtuple_return_api.py b/test/test_namedtuple_return_api.py index ddc23e45f276..7bb529e8bbcc 100644 --- a/test/test_namedtuple_return_api.py +++ b/test/test_namedtuple_return_api.py @@ -14,14 +14,14 @@ aten_native_yaml = os.path.join(path, '../aten/src/ATen/native/native_functions.yaml') all_operators_with_namedtuple_return = { 'max', 'min', 'aminmax', 'median', 'nanmedian', 'mode', 'kthvalue', 'svd', 'symeig', 'eig', - 'qr', 'geqrf', 'solve', 'slogdet', 'sort', 'topk', 'lstsq', 'linalg_inv_ex', + 'qr', 'geqrf', 'slogdet', 'sort', 'topk', 'lstsq', 'linalg_inv_ex', 'triangular_solve', 'cummax', 'cummin', 'linalg_eigh', "_unpack_dual", 'linalg_qr', 'linalg_svd', '_linalg_svd', 'linalg_slogdet', 'fake_quantize_per_tensor_affine_cachemask', 'fake_quantize_per_channel_affine_cachemask', 'linalg_lstsq', 'linalg_eig', 'linalg_cholesky_ex', - 'frexp', 'lu_unpack', 'histogram', '_fake_quantize_per_tensor_affine_cachemask_tensor_qparams', - '_fused_moving_avg_obs_fq_helper', 'linalg_lu_factor', 'linalg_lu_factor_ex', - '_det_lu_based_helper', - '_lu_with_info', + 'frexp', 'lu_unpack', 'histogram', 'histogramdd', + '_fake_quantize_per_tensor_affine_cachemask_tensor_qparams', + '_fused_moving_avg_obs_fq_helper', 'linalg_lu_factor', 'linalg_lu_factor_ex', 'linalg_lu', + '_det_lu_based_helper', '_lu_with_info', 'linalg_ldl_factor_ex', 'linalg_ldl_factor', } @@ -76,7 +76,6 @@ def test_namedtuple_return(self): op(operators=['_linalg_svd'], input=(), names=('U', 'S', 'Vh'), hasout=True), op(operators=['slogdet'], input=(), names=('sign', 'logabsdet'), hasout=False), op(operators=['qr', 'linalg_qr'], input=(), names=('Q', 'R'), hasout=True), - op(operators=['solve'], input=(a,), names=('solution', 'LU'), hasout=True), op(operators=['geqrf'], input=(), names=('a', 'tau'), hasout=True), op(operators=['symeig', 'eig'], input=(True,), names=('eigenvalues', 'eigenvectors'), hasout=True), op(operators=['triangular_solve'], input=(a,), names=('solution', 'cloned_coefficient'), hasout=True), @@ -88,6 +87,9 @@ def test_namedtuple_return(self): op(operators=['linalg_inv_ex'], input=(), names=('inverse', 'info'), hasout=True), op(operators=['linalg_lu_factor'], input=(), names=('LU', 'pivots'), hasout=True), op(operators=['linalg_lu_factor_ex'], input=(), names=('LU', 'pivots', 'info'), hasout=True), + op(operators=['linalg_ldl_factor'], input=(), names=('LD', 'pivots'), hasout=True), + op(operators=['linalg_ldl_factor_ex'], input=(), names=('LD', 'pivots', 'info'), hasout=True), + op(operators=['linalg_lu'], input=(), names=('P', 'L', 'U'), hasout=True), op(operators=['fake_quantize_per_tensor_affine_cachemask'], input=(0.1, 0, 0, 255), names=('output', 'mask',), hasout=False), op(operators=['fake_quantize_per_channel_affine_cachemask'], @@ -100,6 +102,7 @@ def test_namedtuple_return(self): input=(torch.tensor([3, 2, 1, 4, 5], dtype=torch.int32), True, True), names=('P', 'L', 'U'), hasout=True), op(operators=['histogram'], input=(1,), names=('hist', 'bin_edges'), hasout=True), + op(operators=['histogramdd'], input=(1,), names=('hist', 'bin_edges'), hasout=False), op(operators=['_fake_quantize_per_tensor_affine_cachemask_tensor_qparams'], input=(torch.tensor([1.0]), torch.tensor([0], dtype=torch.int), torch.tensor([1]), 0, 255), names=('output', 'mask',), hasout=False), diff --git a/test/test_native_mha.py b/test/test_native_mha.py new file mode 100644 index 000000000000..1689789f9cea --- /dev/null +++ b/test/test_native_mha.py @@ -0,0 +1,306 @@ +# Owner(s): ["module: nn"] +import math + 
+import torch +from torch.testing._internal.common_device_type import ( + dtypes, + dtypesIfCUDA, + instantiate_device_type_tests, + onlyCUDA, + skipMeta, +) +from torch.testing._internal.common_utils import run_tests, TestCase + +class TestMHADeviceType(TestCase): + @torch.no_grad() + def _test_transform_bias_rescale_qkv_impl( + self, device, dtype, use_nt, use_padding=False + ): + tests = [ + (64, 4, 16, 8), + # dim_per_head = 12 does not divide evenly by CPU vectorization length of 8 + (24, 2, 4, 2), + # Make sure CUDA can handle small input sizes + (2, 2, 2, 2), + # dim_per_head = 6 does not divide evenly by CUDA vectorization length of 4, + # causes alignment issues + (24, 4, 4, 2), + (48, 4, 16, 8), + ] + for (embed_dim, num_heads, bs, sl) in tests: + with self.subTest(embed_dim=embed_dim, num_heads=num_heads, bs=bs, sl=sl): + torch.manual_seed(9343) + dense_x = x = ( + torch.randn(bs, sl, 3 * embed_dim, device=device, dtype=dtype) * 10 + ) + if use_padding: + x[0][-1] = torch.full(x[0][-1].shape, float("-Inf")) + if use_nt: + xs = list(torch.unbind(x)) + if use_padding: + xs[0] = xs[0][:-1] + x = torch.nested_tensor(xs, device=device, dtype=dtype) + qkv = torch.nn.Linear(embed_dim, 3 * embed_dim, device=device, dtype=dtype) + + # We have to use inference_mode here because q/k/v are + # all views of the same Tensor, which autograd doesn't + # like. This is fine because this function is only + # exposed to Python for purposes of writing this test. + with torch.inference_mode(): + (q, k, v) = torch._transform_bias_rescale_qkv( + x, qkv.bias, num_heads=num_heads + ) + + def simple_transform_bias_rescale_qkv(qkv, bias): + (q, k, v) = torch.split(qkv, embed_dim, dim=-1) + (q_bias, k_bias, v_bias) = torch.split(bias, embed_dim, dim=-1) + return tuple( + x.reshape( + (bs, sl, num_heads, embed_dim // num_heads) + ).transpose(2, 1) + for x in ( + (q + q_bias) / math.sqrt(embed_dim // num_heads), + (k + k_bias), + (v + v_bias), + ) + ) + + correct_q, correct_k, correct_v = simple_transform_bias_rescale_qkv( + dense_x, qkv.bias + ) + if use_nt and use_padding: + for t in (correct_q, correct_k, correct_v): + t[t == float("-Inf")] = 0 + + self.assertEqual(q.size(), correct_q.size()) + torch.testing.assert_close(q, correct_q) + torch.testing.assert_close(k, correct_k) + torch.testing.assert_close(v, correct_v) + + @dtypesIfCUDA(torch.float) + @dtypes(torch.float) + @skipMeta + def test_transform_bias_rescale_qkv(self, device, dtype): + for use_padding in (False, True): + with self.subTest(use_padding=use_padding): + self._test_transform_bias_rescale_qkv_impl( + device, dtype, use_nt=False, use_padding=use_padding + ) + + @dtypesIfCUDA(torch.float) + @dtypes(torch.float) + @skipMeta + @onlyCUDA + def test_transform_bias_rescale_qkv_nested(self, device, dtype): + for use_padding in (False, True): + with self.subTest(use_padding=use_padding): + self._test_transform_bias_rescale_qkv_impl( + device, dtype, use_nt=True, use_padding=use_padding + ) + + def _test_multihead_attention_impl( + self, device, dtype, mode, use_nt, need_weights, average_attn_weights, use_padding=False, pad_all=False + ): + embed_dim = 64 + num_heads = 4 + bs = 16 + sl = 8 + + q = torch.randn(bs, sl, embed_dim, device=device, dtype=dtype) * 10 + if use_padding: + if pad_all: + for q_i in q: + q_i[-1] = torch.zeros_like(q[0][-1], device=device, dtype=dtype) + mask = torch.zeros(q.shape[:-1], device=device, dtype=torch.bool) + for mask_i in mask: + mask_i[-1] = True + else: + q[0][-1] = torch.zeros_like(q[0][-1], device=device, 
dtype=dtype) + mask = torch.zeros(q.shape[:-1], device=device, dtype=torch.bool) + mask[0][-1] = True + if mode == "self": + k = q + v = q + elif mode == "encdec": + k = torch.randn(bs, sl, embed_dim, device=device, dtype=dtype) * 10 + v = k + elif mode == "generic": + k = torch.randn(bs, sl, embed_dim, device=device, dtype=dtype) * 10 + v = torch.randn(bs, sl, embed_dim, device=device, dtype=dtype) * 10 + else: + self.fail(f"invalid mode `{mode}`!") + + qkv = torch.nn.Linear(embed_dim, 3 * embed_dim, device=device, dtype=dtype) + proj = torch.nn.Linear(embed_dim, embed_dim, device=device, dtype=dtype) + + pt = torch.nn.MultiheadAttention( + embed_dim, num_heads, batch_first=True, device=device, dtype=dtype + ) + pt.in_proj_weight = qkv.weight + pt.in_proj_bias = qkv.bias + pt.out_proj.weight = proj.weight + pt.out_proj.bias = proj.bias + + class NativeMHA(torch.nn.Module): + def __init__(self, embed_dim, num_heads, qkv, proj): + super().__init__() + self.qkv = qkv + self.proj = proj + self.embed_dim = embed_dim + self.num_heads = num_heads + + def forward(self, q, k, v, key_padding_mask): + return torch._native_multi_head_attention( + q, + k, + v, + self.embed_dim, + self.num_heads, + self.qkv.weight, + self.qkv.bias, + self.proj.weight, + self.proj.bias, + key_padding_mask, + need_weights=need_weights, + average_attn_weights=average_attn_weights, + ) + + npt = NativeMHA( + embed_dim=embed_dim, num_heads=num_heads, qkv=qkv, proj=proj + ).to(dtype) + + if device == "cuda": + pt = pt.cuda() + npt = npt.cuda() + + ypt, weight_pt = pt( + q, + k, + v, + need_weights=need_weights, + average_attn_weights=average_attn_weights, + key_padding_mask=mask if use_padding else None, + ) + if use_nt: + qs = list(torch.unbind(q)) + if use_padding: + if pad_all: + qs = [x[:-1] for x in qs] + else: + qs[0] = qs[0][:-1] + q = torch.nested_tensor(qs, device=device, dtype=dtype) + if mode == "self": + k = v = q + elif mode == "encdec": + k = torch.nested_tensor(torch.unbind(k), device=device, dtype=dtype) + v = k + else: + k = torch.nested_tensor(torch.unbind(k), device=device, dtype=dtype) + v = torch.nested_tensor(torch.unbind(v), device=device, dtype=dtype) + + ynpt, weight_npt = npt( + q, k, v, key_padding_mask=mask if use_padding and not use_nt else None + ) + if use_nt: + ynpt = ynpt.to_padded_tensor(0) + if pad_all: + ynpt_final = torch.zeros_like(ypt) + ynpt_final[:, :ynpt.shape[1], :] = ynpt + ynpt = ynpt_final + + def do_pad_all(tensors): + for t in tensors: + for t_i in t: + t_i[-1] = torch.zeros_like(t_i[-1], device=device, dtype=dtype) + + # PyTorch implementation returns non-zero junk in the padding + # locations; overwrite it so that the comparison works out. 
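[Editor's aside] On the conversion pattern used throughout this test: a padded dense batch is turned into a nested tensor by unbinding it, trimming the padded rows, and re-wrapping the pieces, and to_padded_tensor goes back the other way. The sketch below is a minimal, hedged illustration of that round trip using the same torch.nested_tensor constructor the test relies on (this API was still prototype-level at the time of this diff).

import torch

bs, sl, d = 3, 5, 4
dense = torch.randn(bs, sl, d)

# Pretend the last row of sequence 0 is padding: drop it before nesting.
pieces = list(torch.unbind(dense))
pieces[0] = pieces[0][:-1]
nt = torch.nested_tensor(pieces)

# Padding back out with 0 restores the dense layout, with zeros where data was dropped.
repadded = nt.to_padded_tensor(0.0)
print(repadded.shape)   # (3, 5, 4)
print(repadded[0, -1])  # all zeros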
+ if use_padding: + ypt[0][-1] = torch.zeros_like(ypt[0][-1], device=device, dtype=dtype) + ynpt[0][-1] = torch.zeros_like(ynpt[0][-1], device=device, dtype=dtype) + if pad_all: + do_pad_all((ypt, ynpt)) + # Zero the last row of each TxT weight matrix + if need_weights: + if average_attn_weights: + weight_pt[0][-1] = torch.zeros_like(weight_pt[0][-1], device=device, dtype=dtype) + weight_npt[0][-1] = torch.zeros_like(weight_npt[0][-1], device=device, dtype=dtype) + if pad_all: + do_pad_all((weight_pt, weight_npt)) + else: + for nh in range(num_heads): + weight_pt[0][nh][-1] = torch.zeros_like(weight_pt[0][nh][-1], device=device, dtype=dtype) + weight_npt[0][nh][-1] = torch.zeros_like(weight_npt[0][nh][-1], device=device, dtype=dtype) + + if dtype == torch.half: + torch.testing.assert_close(ypt, ynpt, atol=1e-3, rtol=1e-3) + else: + # High rtol seems necessary for + # test_native_multihead_attention_cpu_float32 on Windows, + # otherwise 2e-4 would likely be fine. + torch.testing.assert_close(ypt, ynpt, atol=2e-5, rtol=2e-3) + + if need_weights: + torch.testing.assert_close(weight_pt, weight_npt) + else: + self.assertEqual(weight_pt, weight_npt) + + @dtypesIfCUDA(torch.float, torch.half) + @dtypes(torch.float) + @skipMeta + @torch.no_grad() + def test_native_multihead_self_attention(self, device, dtype): + for (use_padding, pad_all) in ((False, False), (True, False), (True, True)): + for use_nt in (False, True): + # Figuring out exactly which elements of the weights are garbage in this + # case eludes me, and it's not particularly enlightening to test anyway + # because padding doesn't especially affect the intermediate weights. + for need_weights in (False, not pad_all): + for average_attn_weights in (False, True): + with self.subTest(use_padding=use_padding, pad_all=pad_all, + use_nt=use_nt, need_weights=need_weights, + average_attn_weights=average_attn_weights): + self._test_multihead_attention_impl( + device, + dtype, + "self", + use_nt=use_nt, + use_padding=use_padding, + pad_all=pad_all, + need_weights=need_weights, + average_attn_weights=average_attn_weights, + ) + + @dtypesIfCUDA(torch.float, torch.half) + @dtypes(torch.float) + @skipMeta + @torch.no_grad() + def test_native_multihead_encoder_decoder_attention(self, device, dtype): + self._test_multihead_attention_impl( + device, + dtype, + "encdec", + use_nt=False, + need_weights=False, + average_attn_weights=False, + ) + + @dtypesIfCUDA(torch.float, torch.half) + @dtypes(torch.float) + @skipMeta + @torch.no_grad() + def test_native_multihead_attention(self, device, dtype): + self._test_multihead_attention_impl( + device, + dtype, + "generic", + use_nt=False, + need_weights=False, + average_attn_weights=False, + ) + + +instantiate_device_type_tests(TestMHADeviceType, globals()) + +if __name__ == "__main__": + run_tests() diff --git a/test/test_nestedtensor.py b/test/test_nestedtensor.py new file mode 100644 index 000000000000..b01ecbc4675a --- /dev/null +++ b/test/test_nestedtensor.py @@ -0,0 +1,420 @@ +# Owner(s): ["module: nestedtensor"] + +import torch +import torch.nn +import unittest +from torch.testing._internal.common_device_type import ( + dtypes, + dtypesIfCUDA, + instantiate_device_type_tests, + skipMeta, +) +from torch.testing._internal.common_utils import TestCase, IS_FBCODE, run_tests +from torch import nested_tensor + +# Tests are ported from pytorch/nestedtensor. +# This makes porting as_nested_tensor easier in the future. 
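[Editor's aside] The self-attention sweep above iterates over need_weights and average_attn_weights; on the public nn.MultiheadAttention side the observable difference is the shape (or absence) of the returned attention weights. A small, hedged sketch of that behaviour, per the documented API of the version this diff targets:

import torch

mha = torch.nn.MultiheadAttention(embed_dim=8, num_heads=2, batch_first=True).eval()
x = torch.randn(4, 5, 8)  # (batch, seq, embed)

with torch.no_grad():
    _, w_avg = mha(x, x, x, need_weights=True, average_attn_weights=True)
    _, w_per_head = mha(x, x, x, need_weights=True, average_attn_weights=False)
    out, w_none = mha(x, x, x, need_weights=False)

print(w_avg.shape)       # (4, 5, 5): averaged over heads
print(w_per_head.shape)  # (4, 2, 5, 5): one map per head
print(w_none)            # None when weights are not requested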
+def _iter_constructors(): + # yield as_nested_tensor + yield nested_tensor + + +class TestNestedTensor(TestCase): + @torch.inference_mode() + def _test_unbind_case(self, a, b): + nt = nested_tensor([a, b]) + a1, b1 = nt.unbind() + self.assertTrue(a is not a1) + self.assertTrue(b is not b1) + + nt = nested_tensor([a, b], dtype=a.dtype) + a1, b1 = nt.unbind(0) + self.assertEqual(a, a1) + self.assertEqual(b, b1) + + a = torch.randn((2, 3)).add_(1) + nt = nested_tensor([a]) + self.assertEqual(a, nt.unbind(0)[0]) + + @torch.inference_mode() + def test_unbind_0(self): + self._test_unbind_case( + torch.tensor([1, 2]), torch.tensor([7, 8]), + ) + + @torch.inference_mode() + def test_unbind_1(self): + self._test_unbind_case( + torch.tensor([1]), torch.tensor([7]), + ) + + # @torch.inference_mode() + # def test_unbind_2(self): + # self._test_unbind_case( + # torch.tensor(1), torch.tensor(7), + # ) + + @torch.inference_mode() + def test_unbind_3(self): + self._test_unbind_case( + torch.tensor([1.0]), torch.tensor([]), + ) + + @torch.inference_mode() + def test_unbind_4(self): + self._test_unbind_case( + torch.tensor([]), torch.tensor([]), + ) + + @torch.inference_mode() + def test_unbind_dim(self): + def _test_fn(unbind_fn): + a = torch.rand(3, 2) + b = torch.rand(2, 3) + nt = nested_tensor([a, b]) + self.assertRaises(RuntimeError, lambda: unbind_fn(nt, 1)) + + # Both of these tests are necessary, because we're using + # torch_function. + _test_fn(lambda x, dim: x.unbind(dim)) + # TODO: Re-enable this once using torch_dispatch + # _test_fn(lambda x, dim: torch.unbind(x, dim)) + + @torch.inference_mode() + def test_nested_tensor(self): + self.assertRaises(TypeError, lambda: nested_tensor([3.0])) + self.assertRaises(TypeError, lambda: nested_tensor(torch.tensor([3.0]))) + self.assertRaises(TypeError, lambda: nested_tensor(4.0)) + + @torch.inference_mode() + def test_nested_tensor_matching_dim(self): + self.assertRaisesRegex( + RuntimeError, + "Found dimension 1 for Tensor at index 1 and dimension 0 for Tensor at index 0.", + lambda: nested_tensor([torch.tensor(1.0), torch.tensor([])]), + ) + self.assertRaisesRegex( + RuntimeError, + "Found dimension 1 for Tensor at index 2 and dimension 0 for Tensor at index 1.", + lambda: nested_tensor( + [torch.tensor(1.0), torch.tensor(2.0), torch.tensor([])] + ), + ) + + @torch.inference_mode() + def test_default_nested_tensor(self): + self.assertRaises(TypeError, lambda: nested_tensor()) + default_nested_tensor = nested_tensor([]) + default_tensor = torch.tensor([]) + # self.assertEqual(default_nested_tensor.nested_dim(), 1) + # self.assertEqual(default_nested_tensor.nested_size(), ()) + self.assertEqual(default_nested_tensor.dim(), default_tensor.dim()) + self.assertEqual(default_nested_tensor.layout, default_tensor.layout) + self.assertEqual(default_nested_tensor.device, default_tensor.device) + self.assertEqual(default_nested_tensor.dtype, default_tensor.dtype) + self.assertEqual( + default_nested_tensor.requires_grad, default_tensor.requires_grad + ) + self.assertIsNone(default_tensor.grad) + # TODO: Re-enable once we have a performance driven + # use case and implementation. 
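[Editor's aside] To make the constructor contract above concrete, here is a hedged sketch of the behaviours these cases pin down: the constructor takes a list of tensors whose dimensionalities must match, rejects bare scalars and plain tensors, and unbind hands back equal-valued but distinct constituents. This mirrors the assertions above rather than adding new guarantees.

import torch

a = torch.randn(2, 3)
b = torch.randn(4, 3)
nt = torch.nested_tensor([a, b])

a1, b1 = nt.unbind()
assert torch.equal(a, a1) and a is not a1   # equal values, different objects

# Dimensionality of every constituent must match.
try:
    torch.nested_tensor([torch.tensor(1.0), torch.tensor([2.0])])
except RuntimeError as e:
    print(e)

# Plain Python numbers and bare tensors are rejected outright.
try:
    torch.nested_tensor(4.0)
except TypeError as e:
    print(e)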
+ # self.assertEqual(default_nested_tensor.is_pinned(), + # default_tensor.is_pinned()) + + @torch.inference_mode() + def test_dim(self): + for constructor in _iter_constructors(): + a1 = constructor([]) + self.assertEqual(a1.dim(), 1) + a1 = constructor([torch.tensor(3.0)]) + self.assertEqual(a1.dim(), 1) + a1 = constructor([torch.tensor([1, 2, 3, 4])]) + self.assertEqual(a1.dim(), 2) + + @unittest.skipIf(IS_FBCODE, "numel is not virtual in fbcode.") + @torch.inference_mode() + def test_numel(self): + for constructor in _iter_constructors(): + a1 = constructor([]) + self.assertRaisesRegex( + RuntimeError, "numel is disabled", lambda: a1.numel(), + ) + + @torch.inference_mode() + def test_size(self): + for constructor in _iter_constructors(): + a1 = constructor([]) + self.assertRaisesRegex( + RuntimeError, + "Tensors of type NestedTensorImpl do not have sizes" + if IS_FBCODE + else "NestedTensorImpl doesn't support sizes", + lambda: a1.size(), + ) + + @unittest.skipIf(IS_FBCODE, "stride is not virtual in fbcode.") + @torch.inference_mode() + def test_stride(self): + for constructor in _iter_constructors(): + a1 = constructor([]) + self.assertRaisesRegex( + RuntimeError, + "NestedTensorImpl doesn't support strides", + lambda: a1.stride(), + ) + + @unittest.skipIf(IS_FBCODE, "is_contiguous is not virtual in fbcode.") + @torch.inference_mode() + def test_is_contiguous(self): + for constructor in _iter_constructors(): + a1 = constructor([]) + self.assertRaisesRegex( + RuntimeError, "is_contiguous is disabled", lambda: a1.is_contiguous() + ) + + @torch.inference_mode() + def test_repr_string(self): + a = nested_tensor([]) + expected = "nested_tensor([" "\n\n])" + self.assertEqual(str(a), expected) + self.assertEqual(repr(a), expected) + + a = nested_tensor([torch.tensor(1.0)]) + expected = "nested_tensor([" "\n tensor(1.)" "\n])" + self.assertEqual(str(a), expected) + self.assertEqual(repr(a), expected) + + a = nested_tensor([torch.tensor([[1, 2]]), torch.tensor([[4, 5]])]) + expected = ( + "nested_tensor([" "\n tensor([[1, 2]])" "," "\n tensor([[4, 5]])" "\n])" + ) + self.assertEqual(str(a), expected) + self.assertEqual(repr(a), expected) + + @torch.inference_mode() + def test_activations(self): + for func in (torch.nn.functional.relu, torch.nn.functional.relu_, torch.nn.functional.gelu, torch._C._nn.gelu_): + t = torch.tensor([-1, 0, 1], dtype=torch.float) + nt = nested_tensor([t]) + nested_result = func(nt) + self.assertTrue(nested_result.is_nested) + self.assertEqual(func(t), nested_result.unbind()[0]) + + def test_to_padded_tensor_on_empty_tensor(self): + nt = torch.nested_tensor([]) + empty = nt.to_padded_tensor(4) + self.assertEqual(empty, torch.tensor([])) + +class TestNestedTensorDeviceType(TestCase): + @dtypes(torch.float) + @skipMeta + def test_to_then_from_padded_tensor_no_transform0213(self, device, dtype): + t = torch.randn(4, 4, 4, device=device, dtype=dtype) + ts = list(torch.unbind(t)) + ts[0] = ts[0][:-1] + nt = torch.nested_tensor(ts, device=device, dtype=dtype) + padded = nt.to_padded_tensor(0) + + nt_to = torch._nested_from_padded_and_nested_example(padded, nt) + + for (t1, t2) in zip(nt.unbind(), nt_to.unbind()): + self.assertEqual(t1, t2) + self.assertEqual(nt.device, nt_to.device) + + @dtypes(torch.float) + @dtypesIfCUDA(torch.float, torch.half) + @skipMeta + @torch.inference_mode() + def test_layer_norm(self, device, dtype): + def _test(size): + t0 = torch.randn(2, size, device=device, dtype=dtype, requires_grad=False) + t1 = torch.randn(2, size, device=device, 
dtype=dtype, requires_grad=False) + ts = [t0, t1, t0, t1] + nt = torch.nested_tensor(ts, device=device, dtype=dtype) + layer_norm = torch.nn.LayerNorm(size, device=device, dtype=dtype) + nt_result = nt._nested_tensor_layer_norm( + layer_norm.weight, layer_norm.bias, 1e-5 + ) + for (nt_subresult, t) in zip(nt_result.unbind(), ts): + t_result = layer_norm(t.reshape(1, -1, size).squeeze(0)) + self.assertEqual(nt_subresult, t_result) + + for size in (1024, 1023, 513, 512, 256, 128, 2, 4, 32): + _test(size) + + @skipMeta + @torch.inference_mode() + def test_embedding(self, device): + inputs = [ + torch.randint(100, (L,), device=device, dtype=torch.int64) + for L in torch.randint(5, 50, (8,)) + ] + x = torch.nested_tensor(inputs, device=device, dtype=torch.int64) + emb = torch.nn.Embedding(100, 8, device=device) + y = emb(x) + ys = y.unbind() + for i, inp in enumerate(inputs): + self.assertEqual(emb(inp), ys[i]) + + @dtypes(torch.float, torch.float16) + def test_to_padded_tensor_simple(self, device, dtype): + t = torch.randn(4, 4, 4, device=device, dtype=dtype) + ts = list(torch.unbind(t)) + ts[0] = ts[0][:-1] + nt = torch.nested_tensor(ts, device=device, dtype=dtype) + for padding_value in (0, 1): + padded = nt.to_padded_tensor(padding_value) + + correct_output = t.clone() + if padding_value == 0: + correct_output[0][-1] = torch.zeros_like(correct_output[0][-1]) + else: + correct_output[0][-1] = torch.ones_like(correct_output[0][-1]) + + self.assertEqual(padded, correct_output) + self.assertEqual(padded.device, torch.device(device)) + self.assertEqual(padded.dtype, dtype) + + @dtypes(torch.float, torch.float16) + def test_to_padded_tensor_output_size(self, device, dtype): + t = torch.randn(4, 4, 4, device=device, dtype=dtype) + output_size = (4, 6, 5) + ts = list(torch.unbind(t)) + ts[0] = ts[0][:-1] + nt = torch.nested_tensor(ts, device=device, dtype=dtype) + for padding_value in (0, 1): + padded = nt.to_padded_tensor(padding_value, output_size=output_size) + correct_output = torch.ones(output_size, device=device, dtype=dtype) * padding_value + correct_output[:4:, :4, :4] = t.clone() + if padding_value == 0: + correct_output[0][3] = torch.zeros_like(correct_output[0][3]) + else: + correct_output[0][3] = torch.ones_like(correct_output[0][3]) + + self.assertEqual(padded, correct_output) + self.assertEqual(padded.device, torch.device(device)) + self.assertEqual(padded.dtype, dtype) + + @dtypes(torch.float, torch.float16, torch.double) + def test_to_padded_tensor_dim2(self, device, dtype): + ts = [ + torch.randn(160, device=device, dtype=dtype), + torch.randn(1240, device=device, dtype=dtype), + torch.randn(2400, device=device, dtype=dtype), + ] + nt = torch.nested_tensor(ts, device=device, dtype=dtype) + pad = 42 + correct_output = [] + for t in ts: + next_output = torch.ones_like(ts[2]) * pad + correct_output.append(next_output) + next_output[:t.size(0)].copy_(t) + correct_output = torch.stack(correct_output) + padded = nt.to_padded_tensor(pad) + self.assertEqual(padded, correct_output) + + @dtypes(torch.float, torch.float16, torch.double) + def test_to_padded_tensor_dim3(self, device, dtype): + ts = [ + torch.randn(16, 21, device=device, dtype=dtype), + torch.randn(24, 32, device=device, dtype=dtype), + torch.randn(40, 53, device=device, dtype=dtype), + ] + nt = torch.nested_tensor(ts, device=device, dtype=dtype) + pad = 42 + correct_output = [] + for t in ts: + next_output = torch.ones_like(ts[2]) * pad + correct_output.append(next_output) + next_output[:t.size(0), :t.size(1)].copy_(t) + 
correct_output = torch.stack(correct_output) + padded = nt.to_padded_tensor(pad) + self.assertEqual(padded, correct_output) + + @dtypes(torch.float, torch.float16, torch.double) + def test_to_padded_tensor_dim4(self, device, dtype): + ts = [ + torch.randn(16, 21, 13, device=device, dtype=dtype), + torch.randn(24, 32, 14, device=device, dtype=dtype), + torch.randn(40, 53, 16, device=device, dtype=dtype), + ] + nt = torch.nested_tensor(ts, device=device, dtype=dtype) + pad = 42 + correct_output = [] + for t in ts: + next_output = torch.ones_like(ts[2]) * pad + correct_output.append(next_output) + next_output[:t.size(0), :t.size(1), :t.size(2)].copy_(t) + correct_output = torch.stack(correct_output) + padded = nt.to_padded_tensor(pad) + self.assertEqual(padded, correct_output) + + @skipMeta + def test_device_checks(self, device): + nt = torch.nested_tensor([], device=device) + is_cuda = 'cuda' in str(device) + self.assertEqual(nt.is_cuda, is_cuda) + + # Helper functions for testing elementwise ops + def random_nt_pair(self, device, dtype, num_tensors, max_dims): + ts1 = [] + ts2 = [] + for _ in range(num_tensors): + tensor_dims = tuple([torch.randint(low=0, high=max_dim, size=(1,)).item() for max_dim in max_dims]) + t1 = torch.randn(tensor_dims, device=device, dtype=dtype) + t2 = torch.randn(tensor_dims, device=device, dtype=dtype) + ts1.append(t1) + ts2.append(t2) + return (torch.nested_tensor(ts1, device=device, dtype=dtype), + torch.nested_tensor(ts2, device=device, dtype=dtype)) + + def nt_equal(self, nt1, nt2): + self.assertEqual(nt1.dtype, nt2.dtype) + self.assertEqual(nt1.device, nt2.device) + ub1 = nt1.unbind() + ub2 = nt2.unbind() + self.assertEqual(len(ub1), len(ub2)) + n = len(ub1) + for i in range(n): + self.assertEqual(ub1[i], ub2[i]) + + @dtypes(torch.float, torch.float16) + @skipMeta + @torch.inference_mode() + def test_nested_tensor_add(self, device, dtype): + (nt1, nt2) = self.random_nt_pair(device, dtype, 4, (4, 4)) + ref = torch.nested_tensor([t1 + t2 for (t1, t2) in zip(nt1.unbind(), nt2.unbind())]) + out = nt1 + nt2 + self.nt_equal(ref, out) + + @dtypes(torch.float, torch.float16) + @skipMeta + @torch.inference_mode() + def test_nested_tensor_mul(self, device, dtype): + (nt1, nt2) = self.random_nt_pair(device, dtype, 4, (4, 4)) + ref = torch.nested_tensor([t1 * t2 for (t1, t2) in zip(nt1.unbind(), nt2.unbind())]) + out = nt1 * nt2 + self.nt_equal(ref, out) + + @dtypes(torch.float, torch.float16) + @skipMeta + @torch.inference_mode() + def test_nested_tensor_add_in_place(self, device, dtype): + (nt1, nt2) = self.random_nt_pair(device, dtype, 4, (4, 4)) + ref = torch.nested_tensor([t1 + t2 for (t1, t2) in zip(nt1.unbind(), nt2.unbind())]) + nt1 += nt2 + self.nt_equal(ref, nt1) + + @dtypes(torch.float, torch.float16) + @skipMeta + @torch.inference_mode() + def test_nested_tensor_mul_in_place(self, device, dtype): + (nt1, nt2) = self.random_nt_pair(device, dtype, 4, (4, 4)) + ref = torch.nested_tensor([t1 * t2 for (t1, t2) in zip(nt1.unbind(), nt2.unbind())]) + nt1 *= nt2 + self.nt_equal(ref, nt1) + +instantiate_device_type_tests(TestNestedTensorDeviceType, globals()) + +if __name__ == '__main__': + run_tests() diff --git a/test/test_nn.py b/test/test_nn.py index 28f44c94f405..ddb7a47cd813 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -1,5 +1,6 @@ # Owner(s): ["module: nn"] +import contextlib import math import random import string @@ -14,6 +15,7 @@ from functools import reduce, partial from operator import mul from collections import OrderedDict +from tempfile import 
NamedTemporaryFile import torch @@ -22,6 +24,7 @@ torch.set_default_dtype(torch.double) from torch._six import inf, nan +import torch.autograd.forward_ad as fwAD import torch.backends.cudnn as cudnn import torch.nn as nn import torch.nn.functional as F @@ -34,12 +37,13 @@ from torch.nn import Parameter from torch.nn.parameter import UninitializedParameter, UninitializedBuffer from torch.nn.parallel._functions import Broadcast -from torch.testing._internal.common_dtype import integral_types, get_all_fp_dtypes, get_all_math_dtypes +from torch.testing._internal.common_dtype import integral_types, floating_types_and, get_all_math_dtypes, \ + floating_and_complex_types_and from torch.testing._internal.common_utils import freeze_rng_state, run_tests, TestCase, skipIfNoLapack, skipIfRocm, \ - skipIfRocmVersionLessThan, skipIfNotMiopenSuggestNHWC, TEST_NUMPY, TEST_SCIPY, TEST_WITH_ROCM, download_file, \ - get_function_arglist, load_tests, \ + skipIfRocmVersionLessThan, skipIfNotMiopenSuggestNHWC, TEST_NUMPY, TEST_SCIPY, TEST_WITH_CROSSREF, TEST_WITH_ROCM, \ + download_file, get_function_arglist, load_tests, skipIfMps,\ suppress_warnings, TemporaryFileName, TEST_WITH_UBSAN, IS_PPC, \ - parametrize as parametrize_test, subtest, instantiate_parametrized_tests + parametrize as parametrize_test, subtest, instantiate_parametrized_tests, set_default_dtype, IS_WINDOWS from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU, TEST_CUDNN, TEST_CUDNN_VERSION from torch.testing._internal.common_nn import NNTestCase, NewModuleTest, CriterionTest, \ module_tests, criterion_tests, loss_reference_fns, \ @@ -52,6 +56,7 @@ from torch.nn import MultiheadAttention from hypothesis import given +from torch.testing import make_tensor import torch.testing._internal.hypothesis_utils as hu from torch.testing._internal.common_utils import _assertGradAndGradgradChecks, gradcheck, gradgradcheck, \ GRADCHECK_NONDET_TOL @@ -68,6 +73,7 @@ if TEST_SCIPY: from scipy import stats + import scipy.signal import scipy.ndimage if TEST_NUMPY: @@ -138,6 +144,21 @@ def test_wrong_order(self): RuntimeError, lambda: rnn_utils.pack_padded_sequence(b_a, [22, 25], enforce_sorted=True)) + def test_pad_sequence_with_tensor_sequences(self): + seq_tuple_input = torch.nn.utils.rnn.pad_sequence( + (torch.tensor([[7, 6]]), torch.tensor([[-7, -1]])) + ) + seq_tensor_input = torch.nn.utils.rnn.pad_sequence( + torch.tensor([[[7, 6]], [[-7, -1]]]) + ) + self.assertEqual(seq_tuple_input, seq_tensor_input) + self.assertEqual(seq_tuple_input.shape, torch.Size([1, 2, 2])) + + def test_pad_sequence_with_non_iterable_sequences(self): + msg = r"Expected iterable for input sequences, but got arg of type" + with self.assertRaisesRegex(RuntimeError, msg): + torch.nn.utils.rnn.pad_sequence(5) + def test_total_length(self): padded, lengths = self._padded_sequence(torch.FloatTensor) max_length = max(lengths) @@ -395,6 +416,13 @@ def __init__(self): return l, n, s + def test_parse_to(self): + # Test for buggy use of THPMemoryFormat_New + self.assertEqual( + repr(torch._C._nn._parse_to(memory_format=torch.contiguous_format)[3]), + "torch.contiguous_format" + ) + def test_requires_grad_(self): m = self._create_basic_net()[-1] assert len(list(m.buffers())) > 0, 'invalid test' @@ -876,7 +904,7 @@ def test_no_grad(self): self.assertRaises(RuntimeError, lambda: output2.backward(torch.ones(1, 5, 10, 10))) def test_invalid_conv1d(self): - for dtype in [torch.bfloat16, torch.float, torch.double]: + for dtype in [torch.bfloat16, torch.float, torch.double, 
torch.cfloat, torch.cdouble]: module = nn.Conv1d(in_channels=3, out_channels=33, kernel_size=10, stride=1, bias=True).to(dtype) input = torch.randn(1, 3, 4).to(dtype) with self.assertRaisesRegex(RuntimeError, @@ -891,30 +919,32 @@ def test_invalid_conv1d(self): module(input) def test_mismatch_shape_conv2d(self): - x = torch.randn(1, 10, 1, 28, 28) - w = torch.randn(6, 1, 5, 5) + for dtype in (torch.float, torch.cfloat): + x = torch.randn(1, 10, 1, 28, 28, dtype=dtype) + w = torch.randn(6, 1, 5, 5, dtype=dtype) - with self.assertRaisesRegex(RuntimeError, - r'Expected 3D \(unbatched\) or 4D \(batched\) input to conv2d, but got ' + - r'input of size: \[1, 10, 1, 28, 28\]'): + with self.assertRaisesRegex(RuntimeError, + r'Expected 3D \(unbatched\) or 4D \(batched\) input to conv2d, but got ' + + r'input of size: \[1, 10, 1, 28, 28\]'): - F.conv2d(x, w) + F.conv2d(x, w) def test_conv2d_discontiguous_weight(self): - # Test for https://github.com/pytorch/pytorch/issues/55781 - x = torch.ones(64, 16, 16, 16) - weight = torch.arange(0, 1.0, 1 / 2.0 ** 10).reshape(32, 16, 1, 2)[:, :, :, ::2] - self.assertFalse(weight.is_contiguous()) - y = torch.nn.functional.conv2d(x, weight, None) - if torch.backends.mkldnn.is_available(): - # Disable MKLDNN explicitly, so that either NNPACK or THCNN will be used - with torch.backends.mkldnn.flags(enabled=False): - y_ = torch.nn.functional.conv2d(x, weight, None) - self.assertEqual(y, y_) - self.assertEqual(y.sum(), 4186112.) + for dtype in (torch.float, torch.cfloat): + # Test for https://github.com/pytorch/pytorch/issues/55781 + x = torch.ones(64, 16, 16, 16, dtype=dtype) + weight = torch.arange(0, 1.0, 1 / 2.0 ** 10).reshape(32, 16, 1, 2).to(dtype)[:, :, :, ::2] + self.assertFalse(weight.is_contiguous()) + y = torch.nn.functional.conv2d(x, weight, None) + if torch.backends.mkldnn.is_available(): + # Disable MKLDNN explicitly, so that either NNPACK or THCNN will be used + with torch.backends.mkldnn.flags(enabled=False): + y_ = torch.nn.functional.conv2d(x, weight, None) + self.assertEqual(y, y_) + self.assertEqual(y.sum(), 4186112.) 
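[Editor's aside] The discontiguous-weight test above exists because step-slicing a weight yields a non-contiguous view, and conv2d must produce the same result it would for a contiguous copy regardless of which backend handles it. A minimal, hedged sketch of that property (numerics assumed to agree within default tolerances):

import torch
import torch.nn.functional as F

x = torch.ones(1, 16, 8, 8)
# Step-slicing the last dim produces a non-contiguous (32, 16, 1, 1) weight view.
w = torch.arange(0., 1024.).reshape(32, 16, 1, 2)[:, :, :, ::2]
assert not w.is_contiguous()

y = F.conv2d(x, w)
y_ref = F.conv2d(x, w.contiguous())
torch.testing.assert_close(y, y_ref)
print(y.shape)  # (1, 32, 8, 8)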
def test_invalid_conv2d(self): - for dtype in [torch.bfloat16, torch.float, torch.double]: + for dtype in [torch.bfloat16, torch.float, torch.double, torch.cfloat, torch.cdouble]: module = torch.nn.Conv2d(1, 1, kernel_size=3, dilation=2, stride=2).to(dtype) input = torch.empty(1, 1, 4, 4).to(dtype) self.assertRaises(RuntimeError, lambda: module(input)) @@ -939,7 +969,7 @@ def test_invalid_conv2d(self): module(input) def test_invalid_conv3d(self): - for dtype in [torch.bfloat16, torch.float, torch.double]: + for dtype in [torch.bfloat16, torch.float, torch.double, torch.cfloat, torch.cdouble]: module = torch.nn.Conv3d(1, 1, kernel_size=3, dilation=2, stride=2).to(dtype) input = torch.empty(1, 1, 4, 4, 4).to(dtype) self.assertRaises(RuntimeError, lambda: module(input)) @@ -1391,6 +1421,16 @@ def test_load_state_dict_invalid(self): "expected torch.Tensor or Tensor-like object from checkpoint but received"): m.load_state_dict(state_dict) + def test_load_state_dict_type(self): + m = nn.Module() + + with self.assertRaisesRegex(TypeError, + "Expected state_dict to be dict-like, got"): + m.load_state_dict("") + with self.assertRaisesRegex(TypeError, + "Expected state_dict to be dict-like, got"): + m.load_state_dict(2) + def test_buffer_not_persistent_load(self): m = nn.Module() m.register_buffer('buf', torch.rand(5), persistent=False) @@ -3153,6 +3193,40 @@ def forward(self, X): Y = model.weight self.assertEqual(id(X), id(Y)) + # FIXME: Rewrite this test using functions not depending on LAPACK + # and remove the `@skipIfNoLapack` (see #70995) + @skipIfNoLapack + def test_caching_parametrization_with_transfer_parametrizations_and_params(self): + r"""Test that transferring parametrizations doesn't cause issues with caching""" + class Skew(nn.Module): + def forward(self, X): + X = X.tril(-1) + return X - X.T + + class Orthogonal(nn.Module): + def forward(self, X): + Id = torch.eye(X.size(0), device=X.device) + return torch.linalg.solve(Id + X, Id - X) + + model = nn.Linear(5, 5) + parametrize.register_parametrization(model, "weight", Skew()) + parametrize.register_parametrization(model, "weight", Orthogonal()) + + to_model = nn.Linear(5, 5) + parametrize.transfer_parametrizations_and_params(model, to_model) + + with parametrize.cached(): + X = model.weight + Y = model.weight + self.assertEqual(id(X), id(Y)) + + A = to_model.weight + B = to_model.weight + self.assertEqual(id(A), id(B)) + + # test that the results are distinct objects for each module + self.assertNotEqual(id(A), id(X)) + def test_parametrization_same_training_mode(self): r"""Test training mode updated on parametrization registration""" class Identity(nn.Module): @@ -3168,6 +3242,220 @@ def forward(self, X): self.assertTrue(module.parametrizations.weight[0].training) self.assertTrue(module.parametrizations.weight[1].training) + def test_type_before_parametrizations(self): + r"""Test that type_before_parametrizations always retrieves original type""" + + class Identity(nn.Module): + def forward(self, X): + return X + + model = nn.Linear(5, 5) + original_type = type(model) + self.assertTrue( + parametrize.type_before_parametrizations(model) == original_type + ) + parametrize.register_parametrization(model, "weight", Identity()) + self.assertTrue( + parametrize.type_before_parametrizations(model) == original_type + ) + + def test_transfer_parametrizations_and_params(self): + r"""Test that all parametrizations and their associated parameters are transferred.""" + + class AddOne(nn.Module): + def forward(self, x): + return x + 1.0 + + class 
Double(nn.Module): + def forward(self, x): + return 2.0 * x + + def right_inverse(self, x): + return 0.5 * x + + class MinusOne(nn.Module): + def forward(self, x): + return x - 1.0 + + model = nn.Linear(5, 5) + parametrize.register_parametrization(model, "weight", AddOne()) + parametrize.register_parametrization(model, "weight", Double()) + parametrize.register_parametrization(model, "weight", MinusOne()) + hold_weight = model.weight + + to_model = nn.qat.Linear( + 5, 5, qconfig=torch.ao.quantization.get_default_qconfig() + ) + parametrize.transfer_parametrizations_and_params(model, to_model) + + # checks that final and original value are correct and the to_model is parametrized + self.assertTrue(torch.nn.utils.parametrize.is_parametrized(to_model, "weight")) + self.assertEqual(model.weight, to_model.weight) + self.assertEqual( + model.parametrizations.weight.original, + to_model.parametrizations.weight.original, + ) + + # check that the transfer didn't affect the original value + self.assertEqual(hold_weight, model.weight) + + # testing that changes to one set of parametrizations do not affect the other + parametrize.remove_parametrizations(to_model, "weight") + self.assertFalse(torch.nn.utils.parametrize.is_parametrized(to_model, "weight")) + self.assertTrue(torch.nn.utils.parametrize.is_parametrized(model, "weight")) + + # also test that parameters that don't exist in to_model get transferred + model.test_param = Parameter(torch.randn(5, 5)) + + self.assertTrue(not hasattr(to_model, "test_param")) + parametrize.register_parametrization(model, "test_param", Double()) + hold_test_param = model.test_param + parametrize.transfer_parametrizations_and_params(model, to_model, "test_param") + + # check that previously missing params got transferred correctly + self.assertEqual(model.test_param, to_model.test_param) + self.assertEqual( + model.parametrizations.test_param.original, + to_model.parametrizations.test_param.original, + ) + + # check that the new transfer didn't change the value for the from_module + self.assertEqual(hold_test_param, model.test_param) + + def test_transfer_parametrizations_and_params_right_inverse(self): + r"""Test that all parametrizations and their associated parameters are transferred.""" + + class Double(nn.Module): + def forward(self, x): + return 2.0 * x + + def right_inverse(self, x): + return 0.5 * x + + model = nn.Linear(5, 5) + parametrize.register_parametrization(model, "weight", Double()) + hold_weight = model.weight + + to_model = nn.qat.Linear( + 5, 5, qconfig=torch.ao.quantization.get_default_qconfig() + ) + parametrize.transfer_parametrizations_and_params(model, to_model) + + # check that transfer occurs successfully + self.assertEqual(model.weight, to_model.weight) + self.assertEqual( + model.parametrizations.weight.original, + to_model.parametrizations.weight.original, + ) + + # check that transfer doesn't affect the from_model weight + self.assertEqual(hold_weight, model.weight) + + def test_transfer_parametrizations_and_params_single_param(self): + r"""Test that all parametrizations and their associated parameters are transferred.""" + + class AddOne(nn.Module): + def forward(self, x): + return x + 1.0 + + class Double(nn.Module): + def forward(self, x): + return 2.0 * x + + class MinusOne(nn.Module): + def forward(self, x): + return x - 1.0 + + model = nn.Linear(5, 5, bias=True) + parametrize.register_parametrization(model, "weight", AddOne()) + parametrize.register_parametrization(model, "weight", Double()) + 
parametrize.register_parametrization(model, "weight", MinusOne()) + parametrize.register_parametrization(model, "bias", AddOne()) + parametrize.register_parametrization(model, "bias", Double()) + parametrize.register_parametrization(model, "bias", MinusOne()) + + to_model = nn.qat.Linear( + 5, 5, bias=True, qconfig=torch.ao.quantization.get_default_qconfig() + ) + parametrize.transfer_parametrizations_and_params(model, to_model, "weight") + + # check that weight and only weight was transferred + self.assertEqual(model.weight, to_model.weight) + self.assertEqual( + model.parametrizations.weight.original, + to_model.parametrizations.weight.original, + ) + self.assertTrue("bias" not in to_model.parametrizations) + + # FIXME: Rewrite this test using functions not depending on LAPACK + # and remove the `@skipIfNoLapack` (see #70995) + @skipIfNoLapack + def test_transfer_parametrizations_and_params_many_to_one(self): + # A parametrization with several outputs + class RankOne(nn.Module): + def forward(self, x, y): + # Form a rank-1 matrix from a pair of vectors + return x.unsqueeze(-1) @ y.unsqueeze(-2) + + def right_inverse(self, Y): + # We project the given matrix onto the rank 1 matrices + U, S, Vh = torch.linalg.svd(Y, full_matrices=False) + # S is ordered in a decreasing way. + s0_sqrt = S[0].sqrt().unsqueeze(-1) + return U[..., :, 0] * s0_sqrt, Vh[..., 0, :] * s0_sqrt + + class Double(nn.Module): + def forward(self, x): + return 2.0 * x + + model = nn.Linear(3, 3) + parametrize.register_parametrization(model, "weight", RankOne()) + parametrize.register_parametrization(model, "weight", Double()) + hold_weight = model.weight + + to_model = nn.qat.Linear( + 3, 3, qconfig=torch.ao.quantization.get_default_qconfig() + ) + + parametrize.transfer_parametrizations_and_params(model, to_model) + + # checks that final and original value are correct and the to_model is parametrized + self.assertTrue(torch.nn.utils.parametrize.is_parametrized(to_model, "weight")) + self.assertEqual(model.weight, to_model.weight) + self.assertEqual( + model.parametrizations.weight.original0, + to_model.parametrizations.weight.original0, + ) + self.assertEqual( + model.parametrizations.weight.original1, + to_model.parametrizations.weight.original1, + ) + + # check that the transfer didn't affect the original value + self.assertEqual(hold_weight, model.weight) + + # testing that changes to one set of parametrizations do not affect the other + model.test_param = Parameter(torch.randn(3, 3)) + + self.assertTrue(not hasattr(to_model, "test_param")) + parametrize.register_parametrization(model, "test_param", RankOne()) + hold_test_param = model.test_param + parametrize.transfer_parametrizations_and_params(model, to_model, "test_param") + + # also check that previously missing params got transferred correctly + self.assertEqual(model.test_param, to_model.test_param) + self.assertEqual( + model.parametrizations.test_param.original0, + to_model.parametrizations.test_param.original0, + ) + self.assertEqual( + model.parametrizations.test_param.original1, + to_model.parametrizations.test_param.original1, + ) + + # check that the new transfer didn't change the value for the from_module + self.assertEqual(hold_test_param, model.test_param) + # torch/nn/utils/prune.py @unittest.skipIf(not TEST_NUMPY, "numpy not found") def test_validate_pruning_amount_init(self): @@ -4131,37 +4419,38 @@ def check_weight_norm(l, name, num_params): def test_weight_norm(self): - input = torch.randn(3, 5) - m = nn.Linear(5, 7) - expected_output = m(input) 
- - # add weight normalization - m = torch.nn.utils.weight_norm(m) - self.assertEqual(m.weight_v.size(), m.weight.size()) - self.assertEqual(m.weight_g.size(), (7, 1)) - self.assertEqual(m(input), expected_output) - - # remove weight norm - m = torch.nn.utils.remove_weight_norm(m) - self.assertFalse(hasattr(m, 'weight_g')) - self.assertFalse(hasattr(m, 'weight_v')) - self.assertEqual(m(input), expected_output) - - # test with dim=1 - m = torch.nn.utils.weight_norm(m, dim=1) - self.assertEqual(m.weight_v.size(), m.weight.size()) - self.assertEqual(m.weight_g.size(), (1, 5)) - self.assertEqual(m(input), expected_output) - - # test with dim=None - m = nn.Linear(5, 7) - expected_output = m(input) - m = torch.nn.utils.weight_norm(m, dim=None) - self.assertEqual(m(input), expected_output) + for dtype in [torch.float, torch.bfloat16]: + input = torch.randn(3, 40, dtype=dtype) + m = nn.Linear(40, 50).to(dtype=dtype) + expected_output = m(input) - with self.assertRaisesRegex(RuntimeError, 'register two weight_norm hooks'): - m = torch.nn.utils.weight_norm(m) + # add weight normalization m = torch.nn.utils.weight_norm(m) + self.assertEqual(m.weight_v.size(), m.weight.size()) + self.assertEqual(m.weight_g.size(), (50, 1)) + self.assertEqual(m(input), expected_output, atol=dtype2prec_DONTUSE[dtype], rtol=0) + + # remove weight norm + m = torch.nn.utils.remove_weight_norm(m) + self.assertFalse(hasattr(m, 'weight_g')) + self.assertFalse(hasattr(m, 'weight_v')) + self.assertEqual(m(input), expected_output, atol=dtype2prec_DONTUSE[dtype], rtol=0) + + # test with dim=1 + m = torch.nn.utils.weight_norm(m, dim=1) + self.assertEqual(m.weight_v.size(), m.weight.size()) + self.assertEqual(m.weight_g.size(), (1, 40)) + self.assertEqual(m(input), expected_output, atol=dtype2prec_DONTUSE[dtype], rtol=0) + + # test with dim=None + m = nn.Linear(40, 50).to(dtype=dtype) + expected_output = m(input) + m = torch.nn.utils.weight_norm(m, dim=None) + self.assertEqual(m(input), expected_output) + + with self.assertRaisesRegex(RuntimeError, 'register two weight_norm hooks'): + m = torch.nn.utils.weight_norm(m) + m = torch.nn.utils.weight_norm(m) def test_parameterlistdict_setting_attributes(self): with warnings.catch_warnings(record=True) as w: @@ -4807,7 +5096,7 @@ def assert_weight_allclose_Q(weight, W): (torch.float32, torch.complex64), (True, False)): # Conv2d does not support complex yet - if not use_linear and dtype.is_complex: + if not use_linear: continue if use_linear: @@ -5161,8 +5450,52 @@ def test_FeatureAlphaDropout(self): def test_pad_scalar_error(self): inputs = torch.tensor(0., requires_grad=True) - self.assertRaises(AssertionError, lambda: F.pad(inputs, (1, 1))) - self.assertRaises(AssertionError, lambda: F.pad(inputs, (1,))) + self.assertRaises(RuntimeError, lambda: F.pad(inputs, (1, 1))) + self.assertRaises(RuntimeError, lambda: F.pad(inputs, (1,))) + + def test_nested_tensor_from_mask(self): + N, L, D = 10, 12, 14 + + input = torch.rand(N, L, D) + mask = torch.ones(N, L, dtype=torch.bool) + # Leave first row be all True to maintain the nt's size unchanged + for i in range(1, N): + end = torch.randint(1, L, size=()).item() + mask[i, end:] = False + + nt = torch._nested_tensor_from_mask(input, mask) + input_convert = nt.to_padded_tensor(0.) + input.masked_fill_(mask.reshape(N, L, 1).logical_not(), 0.) 
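[Editor's aside] For context on the reparameterisation the rewritten weight-norm test exercises: weight_norm replaces weight with a magnitude tensor weight_g and a direction tensor weight_v, and recomputes weight = g * v / ||v|| along the chosen dim. The sketch below checks that identity for the default dim=0 on a Linear layer; it illustrates the standard decomposition, not anything version-specific.

import torch

m = torch.nn.utils.weight_norm(torch.nn.Linear(5, 7))  # default dim=0
# weight_g: (7, 1) per-row magnitude, weight_v: (7, 5) direction
recomputed = m.weight_g * m.weight_v / m.weight_v.norm(dim=1, keepdim=True)
torch.testing.assert_close(recomputed, m.weight)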
+ + self.assertEqual(input, input_convert) + + def test_nested_tensor_from_mask_error(self): + N, L, D = 10, 12, 14 + + input = torch.rand(N, L, D) + # Mask is not bool + mask = torch.zeros(N, L, dtype=torch.float) + self.assertRaises(RuntimeError, lambda: torch._nested_tensor_from_mask(input, mask)) + + # Mask size is not 2 + mask = torch.zeros(N, L, D, dtype=torch.bool) + self.assertRaises(RuntimeError, lambda: torch._nested_tensor_from_mask(input, mask)) + + # Input size is not 3 + mask = torch.zeros(N, L, dtype=torch.bool) + input = torch.rand(N, L) + self.assertRaises(RuntimeError, lambda: torch._nested_tensor_from_mask(input, mask)) + + # Mask size does not match input + mask = torch.zeros(N + 1, L + 1, dtype=torch.bool) + input = torch.rand(N, L, D) + self.assertRaises(RuntimeError, lambda: torch._nested_tensor_from_mask(input, mask)) + + # Mask is not padding format + mask = torch.ones(N, L, dtype=torch.bool) + mask[0, 0] = False + mask[0, 2] = False + self.assertRaises(RuntimeError, lambda: torch._nested_tensor_from_mask(input, mask)) @unittest.skipIf(not TEST_NUMPY, "numpy not found") @parametrize_test("average_attn_weights", [True, False]) @@ -5487,6 +5820,32 @@ def test_multihead_attn_3d_attn_mask(self): # output_2d in shape of [T, 1, D] self.assertEqual(output_3d[i].unsqueeze(0).transpose(0, 1), output_2d) + @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + def test_self_attn_TxT_attn_mask(self): + embed_dim = 16 + num_heads = 4 + batch_size = 10 + tgt_len = 16 + + query = torch.rand(batch_size, tgt_len, embed_dim, device="cuda") # [N, T, D] + attn_mask = torch.randint(0, 2, (tgt_len, tgt_len)).cuda().float() # [T, T] + attn_mask = attn_mask.masked_fill(attn_mask == 0, float('-inf')).masked_fill(attn_mask == 1, float(0.0)) + + attn_mask_4d = attn_mask.expand(batch_size, num_heads, tgt_len, tgt_len) + + mta_model = torch.nn.MultiheadAttention(embed_dim, num_heads, batch_first=True).cuda() + mta_model.eval() + + # Generate 3D results + with torch.inference_mode(): + output_mask_4d = mta_model(query, query, query, attn_mask=attn_mask_4d)[0] + output_mask_4d = output_mask_4d.transpose(0, 1) # [N, T, D] + + output_mask_TxT = mta_model(query, query, query, attn_mask=attn_mask)[0] + output_mask_TxT = output_mask_TxT.transpose(0, 1) # [N, T, D] + + self.assertEqual(output_mask_4d, output_mask_TxT) + def test_multihead_attn_no_bias(self): embed_dim = 8 num_heads = 4 @@ -5496,9 +5855,7 @@ def test_multihead_attn_no_bias(self): self.assertIsNone(mha.in_proj_bias) self.assertIsNone(mha.out_proj.bias) - def test_multihead_attn_invalid_shape(self): - mha = torch.nn.MultiheadAttention(3, 3) - + def _test_multihead_attn_invalid_shape_impl(self, mha): # Batched (3D) query cases query = torch.randn(3, 3, 3) key = torch.randn(3, 3, 3) @@ -5554,6 +5911,113 @@ def test_multihead_attn_invalid_shape(self): with self.assertRaisesRegex(AssertionError, msg): mha(query, key, value, attn_mask=torch.randn(4, 3, 3).bernoulli_().to(torch.bool)) + def test_multihead_attn_invalid_shape(self): + mha = torch.nn.MultiheadAttention(3, 3) + self._test_multihead_attn_invalid_shape_impl(mha) + # Give the test a chance to hit the fast path. (Right now, it + # won't, but gating may be less restricted in the future.) 
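[Editor's aside] A side note on the TxT-mask test above: it converts a 0/1 mask into the additive float form that attention consumes, with 0 where attention is allowed and -inf where it is blocked, and then relies on a [T, T] mask applying identically to every batch element and head. A tiny, hedged sketch of just that conversion (tensor names are illustrative):

import torch

T = 4
keep = torch.randint(0, 2, (T, T), dtype=torch.bool)  # True = attend, False = block
additive = torch.zeros(T, T).masked_fill(~keep, float('-inf'))
print(additive)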
+ with torch.no_grad(): + self._test_multihead_attn_invalid_shape_impl(mha.eval()) + + @torch.no_grad() + def test_multihead_attn_fast_path_invalid_shape(self): + mha = torch.nn.MultiheadAttention(3, 3, batch_first=True).eval() + + # Batched (3D) query cases + query = torch.randn(3, 3, 3) + key = torch.randn(3, 3, 3) + value = torch.randn(3, 3, 3) + + # Currently, this case will just go to the slow path and get + # the usual message because it fails the requirement to be + # batched. + msg = "expected `key` and `value` to be 3-D but found 2-D and 3-D tensors respectively" + # 3D query, 2D key and 3D value + with self.assertRaisesRegex(AssertionError, msg): + mha(query, torch.randn(3, 3), value, need_weights=False) + + # Currently, this case will just go to the slow path and get + # the usual message because it fails the requirement to be + # batched. + msg = "expected `key` and `value` to be 3-D but found 3-D and 2-D tensors respectively" + # 3D query, 3D key and 2D value + with self.assertRaisesRegex(AssertionError, msg): + mha(query, key, torch.randn(3, 3), need_weights=False) + + msg = "expected `key_padding_mask` to be `None` or 2-D but found 1-D tensor instead" + # 3D query, 3D key, 3D value and 1D key_padding_mask + with self.assertRaisesRegex(AssertionError, msg): + mha(query, key, value, key_padding_mask=torch.tensor([False, True, True], dtype=torch.bool), need_weights=False) + + msg = "expected `attn_mask` to be `None`, 2-D or 3-D but found 1-D tensor instead" + # 3D query, 3D key, 3D value and 1D attn_mask + with self.assertRaisesRegex(AssertionError, msg): + mha(query, key, value, attn_mask=torch.tensor([False, True, True], dtype=torch.bool), need_weights=False) + + # Unbatched (2D) query cases + # NOTE: error messages are the same as regular path because the fast path doesn't support 2D. + query = torch.randn(3, 3) + key = torch.randn(3, 3) + value = torch.randn(3, 3) + + msg = "expected `key` and `value` to be 2-D but found 3-D and 2-D tensors respectively" + # 2D query, 3D key and 2D value + with self.assertRaisesRegex(AssertionError, msg): + mha(query, torch.randn(3, 3, 3), value) + + msg = "expected `key` and `value` to be 2-D but found 2-D and 3-D tensors respectively" + # 2D query, 3D key and 2D value + with self.assertRaisesRegex(AssertionError, msg): + mha(query, key, torch.randn(3, 3, 3)) + + msg = "expected `key_padding_mask` to be `None` or 1-D but found 2-D tensor instead" + # 2D query, 2D key, 2D value and 1D key_padding_mask + with self.assertRaisesRegex(AssertionError, msg): + mha(query, key, value, key_padding_mask=torch.tensor([[False, True, True] * 2], dtype=torch.bool)) + + msg = "expected `attn_mask` to be `None`, 2-D or 3-D but found 1-D tensor instead" + # 2D query, 2D key, 2D value and 1D attn_mask + with self.assertRaisesRegex(AssertionError, msg): + mha(query, key, value, attn_mask=torch.tensor([False, True, True], dtype=torch.bool)) + + msg = r"Expected `attn_mask` shape to be \(3, 3, 3\)" + # 2D query, 2D key, 2D value and 3D incorrect attn_mask + with self.assertRaisesRegex(AssertionError, msg): + mha(query, key, value, attn_mask=torch.randn(4, 3, 3).bernoulli_().to(torch.bool)) + + def test_multihead_attn_nested_tensor_outside_fast_path(self): + mha = torch.nn.MultiheadAttention(3, 3, batch_first=True).eval() + nt = torch.nested_tensor([torch.randn(3, 3)]) + # One tested platform (linux-bionic-py3.7-clang) has a torch_function for one + # or more of these. Take advantage of that to test the torch_function bailout. 
+ has_torch_func = torch.overrides.has_torch_function( + (nt, mha.in_proj_weight, mha.in_proj_bias, mha.out_proj.weight, mha.out_proj.bias)) + if has_torch_func: + msg = "MultiheadAttention does not support NestedTensor.*argument has_torch_function" + else: + msg = ("MultiheadAttention does not support NestedTensor outside of its fast path.*grad is " + + "enabled and.*or biases requires_grad") + with self.assertRaisesRegex(AssertionError, msg): + mha(nt, nt, nt) + + if has_torch_func: + # Just give up, they're all going to fail with the same message. + return + + with torch.no_grad(): + mha(nt, nt, nt) + with torch.inference_mode(): + mha(nt, nt, nt) + nt = torch.nested_tensor([torch.randn(3, 3, requires_grad=False)]) + nt.requires_grad = False + with self.assertRaisesRegex(AssertionError, msg): + mha(nt, nt, nt) + mha.in_proj_weight.requires_grad = False + mha.in_proj_bias.requires_grad = False + mha.out_proj.weight.requires_grad = False + mha.out_proj.bias.requires_grad = False + mha(nt, nt, nt) + def test_normalize(self): inputs = torch.randn(1, 3, 4, 4, requires_grad=True) self.assertTrue(gradcheck(lambda x: F.normalize(x, p=1, dim=-1), (inputs,))) @@ -5795,6 +6259,9 @@ def test_state_dict(self): self.assertEqual(state_dict['weight'].data_ptr(), l.weight.data_ptr()) self.assertEqual(state_dict['bias'].data_ptr(), l.bias.data_ptr()) + # Reference https://github.com/pytorch/pytorch/pull/75507#issuecomment-1110291545 + self.assertNotWarn(lambda: l.state_dict(destination=dict()), "Should not warn kwarg destination w/o _metadata") + def test_load_state_dict(self): l = nn.Linear(5, 5) block = nn.Module() @@ -6289,7 +6756,7 @@ def test(should_raise, module, input_size, dtype): # just run it to ensure no exception raised. module(input) - for dtype in [torch.bfloat16, torch.float, torch.double]: + for dtype in [torch.bfloat16, torch.float, torch.double, torch.cfloat, torch.cdouble]: # Conv1d test(True, nn.Conv1d(1, 1, 3).to(dtype), (1, 2), dtype) test(True, nn.Conv1d(1, 1, 3, stride=2).to(dtype), (1, 2), dtype) @@ -6365,8 +6832,6 @@ def test_ConvTranspose2d_half_cublas_gemm(self): output = deconv(inputs) output.mean().backward() - - @skipIfRocm # For https://github.com/pytorch/pytorch/pull/1273 # Almost identical to the above `test_Conv2d_naive_groups` def test_Conv2d_groups_nobias(self): @@ -6406,7 +6871,6 @@ def test_Conv2d_groups_nobias(self): # Covering special case when group > 1, input-channel / group < 16 and output-channel is multiple of 16 # See also https://github.com/pytorch/pytorch/pull/18463#issuecomment-476563686 # and https://github.com/pytorch/pytorch/pull/18463#issuecomment-477001024 - @skipIfRocm def test_Conv2d_groups_nobias_v2(self): torch.manual_seed(123) dev_dtypes = [("cpu", torch.float)] @@ -7194,187 +7658,6 @@ def test_Transformer_cell(self): memory_key_padding_mask=memory_key_padding_mask) output.sum().backward() - def test_transformerencoderlayer(self): - # this is a deterministic test for TransformerEncoderLayer - d_model = 4 - nhead = 2 - dim_feedforward = 16 - dropout = 0.0 - bsz = 2 - - for batch_first in (False, True): - def perm_fn(x): - return x.transpose(1, 0) if batch_first else x - - model = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, - batch_first=batch_first) - - # set constant weights of the model - for idx, p in enumerate(model.parameters()): - x = p.data - sz = x.view(-1).size(0) - shape = x.shape - x = torch.cos(torch.arange(0, sz).float().view(shape)) - p.data.copy_(x) - - # deterministic input - encoder_input = 
torch.tensor([[[20., 30., 40., 50.]]]) - result = model(encoder_input) - ref_output = torch.tensor([[[2.258703, 0.127985, -0.697881, 0.170862]]]) - result = result.detach().numpy() - ref_output = ref_output.detach().numpy() - self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - np.testing.assert_allclose(result, ref_output, atol=1e-5) - # 0 values are NOT masked. This shouldn't mask anything. - mask = torch.tensor([[0]]) == 1 - result = model(encoder_input, src_key_padding_mask=mask) - result = result.detach().numpy() - self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - np.testing.assert_allclose(result, ref_output, atol=1e-5) - # 1 values are masked. Since there is only 1 input embedding this - # will result in nan. - mask = torch.tensor([[1]]) == 1 - result = model(encoder_input, src_key_padding_mask=mask) - result = result.detach().numpy() - self.assertTrue(np.isnan(result).all()) - - # deterministic input - encoder_input = perm_fn(torch.tensor([[[1., 2., 3., 4.]], - [[5., 6., 7., 8.]]])) - result = model(encoder_input) - ref_output = perm_fn(torch.tensor([[[2.272644, 0.119035, -0.691669, 0.153486]], - [[2.272644, 0.119035, -0.691669, 0.153486]]])) - result = result.detach().numpy() - ref_output = ref_output.detach().numpy() - self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - np.testing.assert_allclose(result, ref_output, atol=1e-5) - # all 0 which is no masking - mask = torch.tensor([[0, 0]]) == 1 - result = model(encoder_input, src_key_padding_mask=mask) - result = result.detach().numpy() - self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - np.testing.assert_allclose(result, ref_output, atol=1e-5) - mask = torch.tensor([[1, 0]]) == 1 - result = model(encoder_input, src_key_padding_mask=mask) - ref_output = perm_fn(torch.tensor([[[2.301516, 0.092249, -0.679101, 0.103088]], - [[2.301516, 0.092249, -0.679101, 0.103088]]])) - result = result.detach().numpy() - ref_output = ref_output.detach().numpy() - self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - np.testing.assert_allclose(result, ref_output, atol=1e-5) - - # deterministic input - encoder_input = perm_fn(torch.tensor([[[0.7462, 0.6653, 0.5679, 0.4891], - [0.5387, 0.1655, 0.3565, 0.0471]], - [[0.8335, 0.2799, 0.5031, 0.2947], - [0.1402, 0.0318, 0.7636, 0.1346]], - [[0.6333, 0.9344, 0.1376, 0.9938], - [0.8924, 0.2872, 0.6692, 0.2944]], - [[0.9897, 0.6915, 0.3154, 0.1733], - [0.8645, 0.3513, 0.3064, 0.0767]], - [[0.8117, 0.2366, 0.4838, 0.7881], - [0.3718, 0.4945, 0.9511, 0.0864]]])) - result = model(encoder_input) - ref_output = perm_fn(torch.tensor([[[2.428589, 0.020835, -0.602055, -0.085249], - [2.427987, 0.021213, -0.602496, -0.084103]], - [[2.424689, 0.019155, -0.604793, -0.085672], - [2.413863, 0.022211, -0.612486, -0.072490]], - [[2.433774, 0.021598, -0.598343, -0.087548], - [2.425104, 0.019748, -0.604515, -0.084839]], - [[2.436185, 0.022682, -0.596625, -0.087261], - [2.433556, 0.021891, -0.598509, -0.086832]], - [[2.416246, 0.017512, -0.610712, -0.082961], - [2.422901, 0.024187, -0.606178, -0.074929]]])) - result = result.detach().numpy() - ref_output = ref_output.detach().numpy() - self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - np.testing.assert_allclose(result, ref_output, atol=1e-5) - # all 0 - mask = torch.zeros([2, 5]) == 1 - result = model(encoder_input, src_key_padding_mask=mask) - result = result.detach().numpy() - self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - np.testing.assert_allclose(result, ref_output, 
atol=1e-5) - mask[0, 1] = 1 - mask[1, 3] = 1 - mask[1, 4] = 1 - result = model(encoder_input, src_key_padding_mask=mask) - ref_output = perm_fn(torch.tensor([[[2.429026, 0.020793, -0.601741, -0.085642], - [2.428811, 0.021445, -0.601912, -0.084252]], - [[2.425009, 0.019155, -0.604566, -0.085899], - [2.415408, 0.02249 , -0.611415, -0.073]], - [[2.434199, 0.021682, -0.598039, -0.087699], - [2.42598, 0.019941, -0.603896, -0.085091]], - [[2.436457, 0.022736, -0.59643 , -0.08736], - [2.434021, 0.022093, -0.598179, -0.08679]], - [[2.416531, 0.017498, -0.610513, -0.083181], - [2.4242, 0.024653, -0.605266, -0.074959]]])) - result = result.detach().numpy() - ref_output = ref_output.detach().numpy() - self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - np.testing.assert_allclose(result, ref_output, atol=1e-5) - - def test_transformerencoderlayer_gelu(self): - # this is a deterministic test for TransformerEncoderLayer with gelu activation - d_model = 4 - nhead = 2 - dim_feedforward = 16 - dropout = 0.0 - bsz = 2 - - for activation, batch_first in product(('gelu', F.gelu, nn.GELU()), (True, False)): - def perm_fn(x): - return x.transpose(1, 0) if batch_first else x - - model = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, - activation, batch_first=batch_first) - - # set constant weights of the model - for idx, p in enumerate(model.parameters()): - x = p.data - sz = x.view(-1).size(0) - shape = x.shape - x = torch.cos(torch.arange(0, sz).float().view(shape)) - p.data.copy_(x) - - # deterministic input - encoder_input = torch.tensor([[[20., 30., 40., 50.]]]) - result = model(encoder_input) - ref_output = torch.tensor([[[2.249815, 0.131006, -0.702199, 0.177868]]]) - torch.testing.assert_close(result, ref_output, rtol=1e-5, atol=0) - - # deterministic input - encoder_input = perm_fn(torch.tensor([[[1., 2., 3., 4.]], - [[5., 6., 7., 8.]]])) - result = model(encoder_input) - ref_output = perm_fn(torch.tensor([[[2.264103, 0.121417, -0.696012, 0.159724]], - [[2.264103, 0.121417, -0.696012, 0.159724]]])) - torch.testing.assert_close(result, ref_output, rtol=1e-5, atol=0) - - # deterministic input - encoder_input = perm_fn(torch.tensor([[[0.7462, 0.6653, 0.5679, 0.4891], - [0.5387, 0.1655, 0.3565, 0.0471]], - [[0.8335, 0.2799, 0.5031, 0.2947], - [0.1402, 0.0318, 0.7636, 0.1346]], - [[0.6333, 0.9344, 0.1376, 0.9938], - [0.8924, 0.2872, 0.6692, 0.2944]], - [[0.9897, 0.6915, 0.3154, 0.1733], - [0.8645, 0.3513, 0.3064, 0.0767]], - [[0.8117, 0.2366, 0.4838, 0.7881], - [0.3718, 0.4945, 0.9511, 0.0864]]])) - result = model(encoder_input) - ref_output = perm_fn(torch.tensor([[[2.42163188, 0.03227153, -0.60714219, -0.05908082], - [2.42151276, 0.03302179, -0.60722523, -0.05762651]], - [[2.41926761, 0.02974034, -0.60879519, -0.0621269], - [2.41626395, 0.03539356, -0.61087842, -0.04978623]], - [[2.42382808, 0.03218872, -0.6055963, -0.06073591], - [2.41983477, 0.03085259, -0.60840145, -0.06046414]], - [[2.42500749, 0.03328855, -0.60476388, -0.0595334], - [2.4237977, 0.03290575, -0.60561789, -0.05940082]], - [[2.41383916, 0.02686345, -0.61256377, -0.06380707], - [2.42000277, 0.03800944, -0.60824798, -0.04754947]]])) - torch.testing.assert_close(result, ref_output, rtol=1e-5, atol=0) - def test_transformerdecoderlayer(self): # this is a deterministic test for TransformerDecoderLayer d_model = 4 @@ -7633,7 +7916,7 @@ def get_a_test_layer(use_cuda, activation, batch_first=False): use_cuda = torch.cuda.is_available() device = torch.device("cuda" if use_cuda else "cpu") - for batch_first in 
(True, False): + def _test(batch_first, training): def perm_fn(x): return x.transpose(1, 0) if batch_first else x @@ -7641,6 +7924,8 @@ def perm_fn(x): batch_first=batch_first) model = nn.TransformerEncoder(encoder_layer, 1).to(device) + if not training: + model = model.eval() # deterministic input encoder_input = perm_fn(torch.tensor([[[0.7462, 0.6653, 0.5679, 0.4891], @@ -7694,6 +7979,8 @@ def perm_fn(x): # test case 2, multiple layers no norm model = nn.TransformerEncoder(encoder_layer, 2).to(device) + if not training: + model = model.eval() result = model(encoder_input, src_key_padding_mask=mask) ref_output = perm_fn(torch.tensor([[[2.419051, 0.017446, -0.608738, -0.085003], [2.419102, 0.017452, -0.608703, -0.085026]], @@ -7710,6 +7997,8 @@ def perm_fn(x): torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5) model = nn.TransformerEncoder(encoder_layer, 6).to(device) + if not training: + model = model.eval() result = model(encoder_input, src_key_padding_mask=mask) ref_output = perm_fn(torch.tensor([[[2.419101, 0.017453, -0.608703, -0.085025], [2.419101, 0.017453, -0.608704, -0.085025]], @@ -7729,6 +8018,8 @@ def perm_fn(x): # d_model = 4 norm = nn.LayerNorm(4) model = nn.TransformerEncoder(encoder_layer, 2, norm=norm).to(device) + if not training: + model = model.eval() result = model(encoder_input, src_key_padding_mask=mask) ref_output = perm_fn(torch.tensor([[[1.695949, -0.357635, -0.893077, -0.445238], [1.695955, -0.357639, -0.893050, -0.445266]], @@ -7745,6 +8036,8 @@ def perm_fn(x): torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5) model = nn.TransformerEncoder(encoder_layer, 6, norm=norm).to(device) + if not training: + model = model.eval() result = model(encoder_input, src_key_padding_mask=mask) ref_output = perm_fn(torch.tensor([[[1.695955, -0.357639, -0.893051, -0.445265], [1.695955, -0.357639, -0.893051, -0.445265]], @@ -7759,7 +8052,15 @@ def perm_fn(x): )).to(device) self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5) - + for batch_first in (True, False): + for training in (True, False): + # Fast path requires inference mode. 
+ if training: + cm = contextlib.nullcontext() + else: + cm = torch.no_grad() + with cm: + _test(batch_first, training) def test_transformerdecoder(self): def get_a_test_layer(use_cuda, activation, batch_first=False): @@ -9142,6 +9443,28 @@ def test_pixel_shuffle_unshuffle_5D(): test_pixel_shuffle_unshuffle_4D() test_pixel_shuffle_unshuffle_5D() + def test_pixel_shuffle_nhwc_cpu(self): + input = torch.randn(3, 18, 4, 4, device='cpu') + input = input.contiguous(memory_format=torch.channels_last).requires_grad_() + grad = torch.randn(3, 18, 4, 4, device='cpu') + ps = torch.nn.PixelShuffle(3) + pus = torch.nn.PixelUnshuffle(3) + + ref_input = input.detach().clone().contiguous().requires_grad_(True) + ref_grad = grad.detach().clone().contiguous() + ref_ps = torch.nn.PixelShuffle(3) + ref_pus = torch.nn.PixelUnshuffle(3) + + out = pus(ps(input)) + out.backward(grad) + ref_out = ref_pus(ref_ps(ref_input)) + ref_out.backward(ref_grad) + + self.assertTrue(out.is_contiguous(memory_format=torch.channels_last)) + self.assertTrue(ref_out.is_contiguous()) + self.assertEqual(out, ref_out) + self.assertEqual(input.grad, ref_input.grad) + # These tests should be OpInfo'd def test_elu_inplace_on_view(self): v = torch.tensor([1.0, -1.0, 1.0, -1.0], requires_grad=True) @@ -9179,55 +9502,15 @@ def func(root): gradcheck(func, [v]) gradgradcheck(func, [v]) - @unittest.skipIf(not TEST_CUDA, 'CUDA not available') def test_PReLU_backward_requires_grad_false(self): - m = nn.PReLU().to('cuda') - x = torch.randn(2, 3, 4, 5, requires_grad=False, device='cuda') - y = m(x) - y.mean().backward() - self.assertEqual(x.grad, None) - - @unittest.skipIf( - not TEST_NUMPY or not TEST_SCIPY, "Numpy or Scipy not found") - def test_gelu(self): - def _test_gelu(n, m, dtype, contiguous, atol=None, rtol=None): - numpy_dtype = { - torch.bfloat16: torch.float, torch.float: torch.float, torch.double: torch.double - }[dtype] - devices = ['cpu'] - devices += ['cuda'] if TEST_CUDA else [] - - def _gelu_ref(X): - return X * stats.norm.cdf(X) - - for d in devices: - if contiguous: - X = torch.rand(n, m, dtype=dtype, requires_grad=True, device=d) - else: - X = torch.rand(n, m, dtype=dtype, requires_grad=True, device=d)[:, ::2] - res = F.gelu(X) - ref = _gelu_ref(X.to(numpy_dtype).cpu().detach().numpy()) - self.assertEqual(res, ref, rtol=rtol, atol=atol, exact_dtype=False) - if dtype == torch.float64: - gradcheck(F.gelu, [X], eps=1e-4) - - for n in range(1, 10): - for m in range(1, 10): - _test_gelu(n, m, torch.bfloat16, True, 1e-2, 0) - _test_gelu(n, m, torch.bfloat16, False, 1e-2, 0) - _test_gelu(n, m, torch.float32, True) - _test_gelu(n, m, torch.float32, False) - _test_gelu(n, m, torch.float64, True) - _test_gelu(n, m, torch.float64, False) - - # Test multi threaded - num_threads = torch.get_num_threads() - torch.set_num_threads(4) - try: - _test_gelu(32, 32, torch.float32, False) - finally: - torch.set_num_threads(num_threads) - + devices = ['cpu'] + devices += ['cuda'] if TEST_CUDA else [] + for d in devices: + m = nn.PReLU().to(d) + x = torch.randn(2, 3, 4, 5, device=d, requires_grad=False) + y = m(x) + y.mean().backward() + self.assertEqual(x.grad, None) def test_bce_loss_always_nonnegative(self): target = torch.ones(5) @@ -9445,22 +9728,26 @@ def test_hardtanh_backward(self): self.assertEqual(x.grad, x_grad_ref) def test_batchnorm_nhwc_cpu(self): - def helper(self, size): + def helper(self, size, dtype, mixed_dtype=False): channels = size[1] - input = torch.randn(size, dtype=torch.float32, device='cpu', requires_grad=True) - input = 
input.contiguous(memory_format=torch.channels_last) + input = torch.randn(size, dtype=dtype, device='cpu', requires_grad=True) + input = input.contiguous(memory_format=torch.channels_last).to(dtype) input.retain_grad() - grad = torch.randn(size, dtype=torch.float32, device='cpu') + grad = torch.randn(size, dtype=dtype, device='cpu') grad = grad.contiguous(memory_format=torch.channels_last) - bn = nn.BatchNorm2d(channels).cpu().float() + bn = nn.BatchNorm2d(channels).cpu().to(dtype) bn.weight.data.uniform_() bn.bias.data.uniform_() ref_input = input.detach().clone().contiguous().requires_grad_(True) ref_grad = grad.detach().clone().contiguous() - ref_bn = nn.BatchNorm2d(channels).cpu().float() + ref_bn = nn.BatchNorm2d(channels).cpu().to(dtype) ref_bn.load_state_dict(bn.state_dict()) + if mixed_dtype: + bn.float() + ref_bn.float() + out = bn(input) out.backward(grad) ref_out = ref_bn(ref_input) @@ -9473,9 +9760,11 @@ def helper(self, size): self.assertEqual(bn.bias.grad, ref_bn.bias.grad) self.assertEqual(input.grad, ref_input.grad) - helper(self, (4, 8, 10, 10)) - helper(self, (4, 1, 9, 9)) - helper(self, (4, 9, 1, 1)) + # test NC11 and N1HW; test mixed dtype + for shape in [(4, 8, 10, 10), (4, 1, 9, 9), (4, 9, 1, 1)]: + helper(self, shape, torch.float, False) + helper(self, shape, torch.bfloat16, False) + helper(self, shape, torch.bfloat16, True) def test_batchnorm_non_contig_cpu(self): input = torch.arange(6, dtype=torch.float).reshape(1, 3, 2, 1).cpu() @@ -9603,6 +9892,29 @@ def test_batchnorm_raises_error_if_bias_is_not_same_size_as_input(self): with self.assertRaises(RuntimeError): F.batch_norm(input, running_mean, running_var, bias=Parameter(torch.rand(size))) + def test_batchnorm_raises_error_if_running_var_or_running_mean_have_forward_grad(self): + args = ( + torch.randn(3, 2, 5), # input + torch.randn(2), # running_mean + torch.randn(2), # running_var + ) + kwargs = {'training': False, 'momentum': -1.2} + fn = partial(F.batch_norm, **kwargs) + + for dual_indices in ((0,), (1,), (1, 2), (0, 1), (0, 1, 2),): + tangents = tuple(torch.rand_like(x) for x in args) + + with fwAD.dual_level(): + duals = [fwAD.make_dual(primal, tangent) if i in dual_indices else primal + for i, (primal, tangent) in enumerate(zip(args, tangents))] + msg = "batch_norm is not differentiable wrt running_mean and running_var" + # 0 needs to have forward grad because otherwise we won't even run batch_norm_jvp + if (1 in dual_indices or 2 in dual_indices) and 0 in dual_indices: + with self.assertRaisesRegex(RuntimeError, msg): + fn(*duals) + else: + fn(*duals) + def test_batchnorm_buffer_update_when_stats_are_not_tracked(self): input_size = (32, 4) # Instantiate BN with buffers that are not None @@ -9689,22 +10001,6 @@ def func(x): # just run a single backward, as gradcheck/gradgradcheck is expensive here output.sum().backward() - def test_binary_cross_entropy_grads(self): - import torch.nn.functional as F - for device in device_(): - input = torch.rand(3, 3, dtype=torch.double, device=device, requires_grad=True) - target = torch.rand(3, 3, dtype=torch.double, device=device) - - gradcheck(F.binary_cross_entropy, [input, target]) - gradgradcheck(F.binary_cross_entropy, [input, target]) - - # now with diffentiable target - target.requires_grad_(True) - gradcheck(F.binary_cross_entropy, [input, target], check_batched_grad=False) - # no double backward for target yet - with self.assertRaisesRegex(RuntimeError, "not implemented"): - gradgradcheck(F.binary_cross_entropy, [input, target], check_batched_grad=False) - def 
test_cosine_embedding_loss_with_diff_type(self): for device in device_(): input1 = torch.tensor([[2, 3, 4], [6, 2, 4]], dtype=torch.double, device=device) @@ -9957,11 +10253,20 @@ def test_cosine_similarity(self): self.assertLessEqual(out, 1.0) # Check dividing by 0. + # previous behavior: <x,y> / max(eps, ||x|| * ||y||) + # current: <x,y> / (max(eps, ||x||) * max(eps, ||y||)) + # if f(x,y) is the cosine similarity, then + # df/dx = y/(||x|| * ||y||) - (x * <x,y> * ||y||/||x||)/(||x|| * ||y||)^2 + # the tests below check division by zero in the backward formula when + # x := input2 = 0, y := input1 != 0. + # For these inputs the gradient wrt x simplifies to g(x,y) := y/(||x|| * ||y||) + # Previous test checks g(x,y) == y/eps, + # Current test checks g(x,y) == (y/||y||)/eps. input1 = torch.randn(10).requires_grad_() input2 = torch.zeros_like(input1).requires_grad_() torch.cosine_similarity(input1, input2, 0).sum().backward() self.assertEqual(input1.grad, torch.zeros_like(input1)) - self.assertEqual(input2.grad, input1 * 1e8) + self.assertEqual(input2.grad, input1 / input1.norm() * 1e8) # Check type promotion, issue #61454 input = torch.tensor(12.) @@ -9981,10 +10286,10 @@ def test_grid_sample_error_checking(self): with self.assertRaisesRegex(ValueError, "but got: 'garbage'"): F.grid_sample(input, grid, padding_mode='garbage', align_corners=False) - with self.assertRaisesRegex(RuntimeError, "expected 4D or 5D input"): + with self.assertRaisesRegex(RuntimeError, "expected grid to have size 1 in last dimension"): F.grid_sample(input[0], grid, align_corners=False) - with self.assertRaisesRegex(RuntimeError, "grid with same number of dimensions"): + with self.assertRaisesRegex(RuntimeError, "expected grid to have size 2 in last dimension"): F.grid_sample(input, torch.empty(1, 1, 1, 1, 3), align_corners=False) with self.assertRaisesRegex(RuntimeError, "expected grid and input to have same batch size"): @@ -10000,7 +10305,7 @@ def test_grid_sample_error_checking(self): F.grid_sample(torch.empty(1, 1, 2, 2, 2), torch.empty(1, 1, 1, 1, 3), mode='bicubic') if TEST_CUDA: - with self.assertRaisesRegex(RuntimeError, "expected input and grid to be on same device"): + with self.assertRaisesRegex(RuntimeError, "Expected all tensors to be on the same device"): F.grid_sample(input.cuda(), grid, align_corners=False) def test_affine_grid_error_checking(self): @@ -10077,7 +10382,6 @@ def test_affine_grid_error_checking(self): with self.assertRaisesRegex(NotImplementedError, "affine_grid only supports 4D and 5D sizes"): F.affine_grid(theta, torch.Size([1, 1, 2, 2, 2, 2]), align_corners=False) - @skipIfRocm def test_grid_sample(self): # Backward pass of native C++ and CUDA kernels branch depending on whether input requires gradient, # so we test both cases.
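The comment block added in the hunk above derives df/dx for cosine similarity. As a minimal standalone sketch (not part of the patch; tensor names are illustrative, and norms are assumed to exceed eps so the clamping never triggers), the analytic formula can be checked against autograd:

import torch

x = torch.randn(10, dtype=torch.double, requires_grad=True)
y = torch.randn(10, dtype=torch.double)

# Scalar cosine similarity along dim 0; backward() populates x.grad.
torch.cosine_similarity(x, y, dim=0).backward()

# Analytic gradient from the comment: y/(||x||*||y||) - x*<x,y>/(||x||^3*||y||).
with torch.no_grad():
    analytic = y / (x.norm() * y.norm()) - x * torch.dot(x, y) / (x.norm() ** 3 * y.norm())

torch.testing.assert_close(x.grad, analytic)

At x = 0 the eps-clamped denominator takes over, which is why the updated assertion above expects input2.grad to equal input1 / input1.norm() * 1e8, i.e. (y/||y||)/eps with the default eps of 1e-8.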
@@ -10426,26 +10730,24 @@ def get_grid(device='cpu', data=None): W = random.randint(2, 8) input = torch.randn(N, C, H, W, requires_grad=True) grid = torch.randn(N, H, W, 2, requires_grad=True) - self.assertTrue(gradcheck( - lambda inp, grid: F.grid_sample(inp, grid, mode=mode, padding_mode=padding_mode, - align_corners=align_corners), - (input, grid))) - input = input.requires_grad_(False) - self.assertTrue(gradcheck( - lambda grid: F.grid_sample(input, grid, mode=mode, padding_mode=padding_mode, - align_corners=align_corners), - (grid,))) for input_requires_grad in [False, True]: + input.requires_grad_(input_requires_grad) + self.assertTrue(gradcheck( + lambda inp, grd: F.grid_sample(inp, grd, mode=mode, padding_mode=padding_mode, + align_corners=align_corners), + (input, grid))) test(N, C, H, W, mode, padding_mode, align_corners, input_requires_grad) if TEST_CUDNN: with cudnn.flags(enabled=False): test(N, C, H, W, mode, padding_mode, align_corners, input_requires_grad) def test_grid_sample_3d(self): - def test(N, C, D, H, W, mode, padding_mode, align_corners): + # Backward pass of native C++ and CUDA kernels branch depending on whether input requires gradient, + # so we test both cases. + def test(N, C, D, H, W, mode, padding_mode, align_corners, input_requires_grad): def test_shape(N, C, ID, IH, IW, D, H, W, mode, padding_mode, align_corners): - input_cpu = torch.randn(C, N, ID, IH, IW).transpose(0, 1).requires_grad_() + input_cpu = torch.randn(C, N, ID, IH, IW).transpose(0, 1).requires_grad_(input_requires_grad) grid_cpu = torch.randn(D, N, H, W, 3).transpose(0, 1).requires_grad_() out_cpu = F.grid_sample(input_cpu, grid_cpu, mode=mode, padding_mode=padding_mode, align_corners=align_corners) @@ -10455,24 +10757,25 @@ def test_shape(N, C, ID, IH, IW, D, H, W, mode, padding_mode, align_corners): out_cpu.backward(gradients) if TEST_CUDA: - input_cuda = input_cpu.detach().transpose(0, 1).cuda().transpose(0, 1).requires_grad_() + input_cuda = input_cpu.detach().transpose(0, 1).cuda().transpose(0, 1).requires_grad_(input_requires_grad) grid_cuda = grid_cpu.detach().transpose(0, 1).cuda().transpose(0, 1).requires_grad_() out_cuda = F.grid_sample(input_cuda, grid_cuda, mode=mode, padding_mode=padding_mode, align_corners=align_corners) self.assertEqual(out_cpu, out_cuda) out_cuda.backward(gradients.cuda()) - self.assertEqual(input_cpu.grad, input_cuda.grad) + if input_requires_grad: + self.assertEqual(input_cpu.grad, input_cuda.grad) self.assertEqual(grid_cpu.grad, grid_cuda.grad, atol=5e-5, rtol=0) # check that zero-dimensional input strides don't error out base_input = torch.randn(N, C, 1, IH, IW) - input_cpu = base_input.expand_as(input_cuda).requires_grad_() + input_cpu = base_input.expand_as(input_cuda).requires_grad_(input_requires_grad) grid_cpu = torch.randn(N, D, H, W, 3, requires_grad=True) out_cpu = F.grid_sample(input_cpu, grid_cpu, mode=mode, padding_mode=padding_mode, align_corners=align_corners) - input_cuda = base_input.cuda().expand_as(input_cuda).requires_grad_() + input_cuda = base_input.cuda().expand_as(input_cuda).requires_grad_(input_requires_grad) grid_cuda = grid_cpu.detach().cuda().requires_grad_() out_cuda = F.grid_sample(input_cuda, grid_cuda, mode=mode, padding_mode=padding_mode, align_corners=align_corners) @@ -10558,8 +10861,14 @@ def test_shape(N, C, ID, IH, IW, D, H, W, mode, padding_mode, align_corners): lambda inp, grid: F.grid_sample(inp, grid, mode=mode, padding_mode=padding_mode, align_corners=align_corners), (input, grid))) + input = 
input.requires_grad_(False) + self.assertTrue(gradcheck( + lambda grid: F.grid_sample(input, grid, mode=mode, padding_mode=padding_mode, + align_corners=align_corners), + (grid,))) - test(N, C, D, H, W, mode, padding_mode, align_corners) + for input_requires_grad in [False, True]: + test(N, C, D, H, W, mode, padding_mode, align_corners, input_requires_grad) def test_affine_grid(self): # test known input on CPU @@ -10878,6 +11187,30 @@ def test_upsampling_small_scale(self): expected_out_t = torch.tensor([[[[2.5]]]]) self.assertEqual(expected_out_t, out_t) + def test_upsampling_bfloat16(self, dtype=torch.bfloat16): + def helper(size, scale_factor, mode, device): + inputf = torch.randn(size, device=device, dtype=torch.float, requires_grad=True) + input = inputf.to(dtype).detach().requires_grad_(True) + m = nn.Upsample(scale_factor=scale_factor, mode=mode) + + outf = m(inputf) + out = m(input) + self.assertEqual(out.dtype, dtype) + self.assertEqualIgnoreType(out, outf, atol=0.1, rtol=0.0) + + out.sum().backward() + outf.sum().backward() + self.assertEqual(input.grad.dtype, dtype) + self.assertEqual(input.grad, inputf.grad.to(dtype), atol=0.1, rtol=0) + + for device in ['cpu']: + helper([3, 20, 30], 2, 'nearest', device) + helper([3, 20, 11, 7], 2, 'nearest', device) + helper([3, 20, 11, 7, 3], 2, 'nearest', device) + helper([3, 20, 30], 2, 'linear', device) + helper([3, 20, 11, 7], 2, 'bilinear', device) + helper([3, 20, 11, 7, 3], 2, 'trilinear', device) + @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") def test_interpolate_illegal_memory_access(self): in_s = 45 @@ -11388,6 +11721,12 @@ def test_cross_entropy_loss_precision(self): outd = loss_cpu(inputd, target) self.assertEqual(outf, outd, exact_dtype=False) + def test_cross_entropy_loss_zero_div(self): + # Test for issue #73165 + input_1 = torch.rand([5, 0], dtype=torch.float32) + input_2 = torch.rand([5, 0], dtype=torch.float32) + torch.nn.CrossEntropyLoss()(input_1, input_2) + @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available") def test_convert_sync_batchnorm(self): module = torch.nn.Sequential( @@ -12771,6 +13110,17 @@ def _test_LayerNorm_cuda_half(self, device): output.sum().backward() self.assertEqualTypeString(output, input) + def _test_LayerNorm_cpu_mixed_dtype(self, device): + for elementwise_affine in [True, False]: + # layer norm input shape is normalized to m x n, cpu vectorized on n, + # so make sure n exceeds vector length + input = torch.empty(2, 3, 11, 3, device=device, dtype=torch.bfloat16).random_(1, 10) + m = nn.LayerNorm([11, 3], elementwise_affine=elementwise_affine).to(device, torch.bfloat16) + m2 = deepcopy(m).to(device, torch.float) + out = m(input) + out2 = m2(input) + self.assertEqual(out, out2) + def _test_GroupNorm_general(self, device, dtype=torch.float): good_shape_g = { (1, 2, 3, 4): 2, @@ -12824,9 +13174,8 @@ def _test_GroupNorm_general(self, device, dtype=torch.float): (2, 6, 4, 2, 2): 4, } for shape, g in bad_shape_g.items(): - gn = nn.GroupNorm(g, shape[1]) - input = torch.empty(*shape, device=device, dtype=dtype).uniform_(0, 10) - self.assertRaises(RuntimeError, lambda: gn(input)) + with self.assertRaises(ValueError): + gn = nn.GroupNorm(g, shape[1]) def _test_GroupNorm_cuda_half(self): input = torch.zeros(2, 4, 3, 2, requires_grad=True).cuda().half().random_(1, 10) @@ -12835,17 +13184,20 @@ def _test_GroupNorm_cuda_half(self): output.sum().backward() self.assertEqualTypeString(output, input) - def _test_module_empty_input(self, module, inp, check_size=True): - 
inp.requires_grad_(True) + def _test_module_empty_input(self, module, inp, check_size=True, inference=False): + if not inference: + inp.requires_grad_(True) out = module(inp) - gO = torch.rand_like(out) - out.backward(gO) + if not inference: + gO = torch.rand_like(out) + out.backward(gO) if check_size: self.assertEqual(out.size(), inp.size()) - for p in module.parameters(): - if p.requires_grad: - self.assertEqual(p.grad, torch.zeros_like(p.grad)) - self.assertEqual(inp.grad, torch.zeros_like(inp)) + if not inference: + for p in module.parameters(): + if p.requires_grad: + self.assertEqual(p.grad, torch.zeros_like(p.grad)) + self.assertEqual(inp.grad, torch.zeros_like(inp)) def _test_module_empty_inputs(self, module, inputs): for _inp in inputs: @@ -13097,7 +13449,7 @@ def test_affine_3d_rotateRandom(self, device): @onlyCUDA @skipCUDAIfNoCudnn - @dtypes(*get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM)) + @dtypes(*floating_and_complex_types_and(torch.half, *[torch.bfloat16] if AMPERE_OR_ROCM else [])) def test_Conv2d_deterministic_cudnn(self, device, dtype): inputs = torch.randn(2, 3, 5, 5, device=device, dtype=dtype, requires_grad=True) with cudnn.flags(enabled=True, benchmark=True, deterministic=True): @@ -13116,7 +13468,7 @@ def test_Conv2d_deterministic_cudnn(self, device, dtype): @onlyCUDA - @dtypes(*get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM)) + @dtypes(*floating_types_and(torch.half, *[torch.bfloat16] if AMPERE_OR_ROCM else [])) def test_Conv2d_large_workspace(self, device, dtype): # These sizes require huge cuDNN workspaces. Make sure we choose a # reasonable algorithm that does not run out of memory @@ -13241,7 +13593,7 @@ def test_Conv3d_depthwise_naive_groups(self, device, dtype): @onlyCUDA - @dtypes(*get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM)) + @dtypes(*floating_types_and(torch.half, *[torch.bfloat16] if AMPERE_OR_ROCM else [])) def test_noncontig_conv_grad(self, device, dtype): # FIXME: remove after adding non-contiguous grad tests for all modules module = nn.Conv2d(3, 5, kernel_size=3, padding=1).to(device, dtype) @@ -13357,8 +13709,8 @@ def test_conv_double_backward_stride(self): batch_size, inp_size, dilation, no_weight) - - def test_conv1d_same_padding(self, device): + @dtypes(torch.float, torch.cfloat) + def test_conv1d_same_padding(self, device, dtype): # Test padding='same' outputs the correct shape test_args = [ # in_size @@ -13371,22 +13723,22 @@ def test_conv1d_same_padding(self, device): [1], ] for in_size, k_size, dilation, stride in itertools.product(*test_args): - x = torch.rand(1, 1, in_size, device=device) - y = torch.rand(1, 1, k_size, device=device) + x = torch.rand(1, 1, in_size, device=device, dtype=dtype) + y = torch.rand(1, 1, k_size, device=device, dtype=dtype) z = F.conv1d(x, y, padding='same', dilation=dilation, stride=stride) self.assertEqual(z.size(2), int(math.ceil(in_size / stride))) # Compare F.conv1d padding='same' output against manual padding # Without strides/dilation - x = torch.rand(1, 1, 12, device=device) - y = torch.rand(1, 1, 3, device=device) + x = torch.rand(1, 1, 12, device=device, dtype=dtype) + y = torch.rand(1, 1, 3, device=device, dtype=dtype) expect = F.conv1d(x, y, padding=1) actual = F.conv1d(x, y, padding='same') self.assertEqual(expect, actual) # With dilation - x = torch.rand(1, 1, 12, device=device) - y = torch.rand(1, 1, 4, device=device) + x = torch.rand(1, 1, 12, device=device, dtype=dtype) + y = torch.rand(1, 1, 4, device=device, dtype=dtype) expect = F.conv1d(x, y, padding=3, dilation=2) actual = 
F.conv1d(x, y, padding='same', dilation=2) self.assertEqual(expect, actual) @@ -13396,76 +13748,89 @@ def test_conv1d_same_padding(self, device): actual = F.conv1d(x, y, padding='same', dilation=3) self.assertEqual(expect, actual) - - def test_conv2d_same_padding(self, device): + @dtypes(torch.float, torch.cfloat) + def test_conv2d_same_padding(self, device, dtype): + if dtype is torch.cfloat: + rtol, atol = 2e-6, 2e-6 + else: + rtol, atol = None, None # Compare F.conv2d padding='same' output against manual padding # Without strides/dilation - x = torch.rand(1, 1, 10, 11, device=device) - y = torch.rand(1, 1, 4, 5, device=device) + x = torch.rand(1, 1, 10, 11, device=device, dtype=dtype) + y = torch.rand(1, 1, 4, 5, device=device, dtype=dtype) expect = F.conv2d(x, y, padding=(2, 2))[..., 1:, :] actual = F.conv2d(x, y, padding='same') - self.assertEqual(expect, actual) + self.assertEqual(expect, actual, rtol=rtol, atol=atol) # With dilation - y = torch.rand(1, 1, 3, 4, device=device) + y = torch.rand(1, 1, 3, 4, device=device, dtype=dtype) expect = F.conv2d(x, y, padding=(2, 3), dilation=2) actual = F.conv2d(x, y, padding='same', dilation=2) - self.assertEqual(expect, actual) + self.assertEqual(expect, actual, rtol=rtol, atol=atol) # Dilation with asymmetric padding - y = torch.rand(1, 1, 4, 4, device=device) + y = torch.rand(1, 1, 4, 4, device=device, dtype=dtype) expect = F.conv2d(x, y, padding=5, dilation=3)[..., 1:, 1:] actual = F.conv2d(x, y, padding='same', dilation=3) - self.assertEqual(expect, actual) + self.assertEqual(expect, actual, rtol=rtol, atol=atol) - def test_conv3d_same_padding(self, device): + @dtypes(torch.float, torch.cfloat) + def test_conv3d_same_padding(self, device, dtype): + if dtype is torch.cfloat: + rtol, atol = 2e-6, 2e-6 + else: + rtol, atol = None, None # Compare F.conv3d padding='same' output against manual padding # Without strides/dilation - x = torch.rand(1, 1, 10, 11, 12, device=device) - y = torch.rand(1, 1, 1, 2, 5, device=device) + x = torch.rand(1, 1, 10, 11, 12, device=device, dtype=dtype) + y = torch.rand(1, 1, 1, 2, 5, device=device, dtype=dtype) expect = F.conv3d(x, y, padding=(0, 1, 2))[..., :, 1:, :] actual = F.conv3d(x, y, padding='same') - self.assertEqual(expect, actual) + self.assertEqual(expect, actual, rtol=rtol, atol=atol) # With dilation expect = F.conv3d(x, y, padding=(0, 1, 4), dilation=2) actual = F.conv3d(x, y, padding='same', dilation=2) - self.assertEqual(expect, actual) + self.assertEqual(expect, actual, rtol=rtol, atol=atol) # Dilation with asymmetric padding - y = torch.rand(1, 1, 4, 4, 4, device=device) + y = torch.rand(1, 1, 4, 4, 4, device=device, dtype=dtype) expect = F.conv3d(x, y, padding=5, dilation=3)[..., 1:, 1:, 1:] actual = F.conv3d(x, y, padding='same', dilation=3) - self.assertEqual(expect, actual) + self.assertEqual(expect, actual, rtol=rtol, atol=atol) - def test_conv1d_valid_padding(self, device): + @dtypes(torch.float, torch.cfloat) + def test_conv1d_valid_padding(self, device, dtype): # Test F.conv1d padding='valid' is the same as no padding - x = torch.rand(1, 1, 10, device=device) - y = torch.rand(1, 1, 4, device=device) + x = torch.rand(1, 1, 10, device=device, dtype=dtype) + y = torch.rand(1, 1, 4, device=device, dtype=dtype) expect = F.conv1d(x, y) actual = F.conv1d(x, y, padding='valid') self.assertEqual(expect, actual) - def test_conv2d_valid_padding(self, device): + @dtypes(torch.float, torch.cfloat) + def test_conv2d_valid_padding(self, device, dtype): # Test F.conv2d padding='valid' is the same as no 
padding - x = torch.rand(1, 1, 1, 10, device=device) - y = torch.rand(1, 1, 1, 4, device=device) + x = torch.rand(1, 1, 1, 10, device=device, dtype=dtype) + y = torch.rand(1, 1, 1, 4, device=device, dtype=dtype) expect = F.conv2d(x, y) actual = F.conv2d(x, y, padding='valid') self.assertEqual(expect, actual) - def test_conv3d_valid_padding(self, device): + @dtypes(torch.float, torch.cfloat) + def test_conv3d_valid_padding(self, device, dtype): # Test F.conv3d padding='valid' is the same as no padding - x = torch.rand(1, 1, 1, 1, 10, device=device) - y = torch.rand(1, 1, 1, 1, 4, device=device) + x = torch.rand(1, 1, 1, 1, 10, dtype=dtype, device=device) + y = torch.rand(1, 1, 1, 1, 4, dtype=dtype, device=device) expect = F.conv3d(x, y) actual = F.conv3d(x, y, padding='valid') self.assertEqual(expect, actual) - def test_conv1d_same_padding_backward(self, device): + @dtypes(torch.float, torch.cfloat) + def test_conv1d_same_padding_backward(self, device, dtype): # Test F.conv1d gradients work with padding='same' - x = torch.rand(1, 1, 12, device=device, requires_grad=True) - y = torch.rand(1, 1, 4, device=device, requires_grad=True) + x = torch.rand(1, 1, 12, dtype=dtype, device=device, requires_grad=True) + y = torch.rand(1, 1, 4, dtype=dtype, device=device, requires_grad=True) # Symmetric padding z = F.conv1d(x, y, padding=3, dilation=2) @@ -13490,10 +13855,11 @@ def test_conv1d_same_padding_backward(self, device): self.assertEqual(gx_expect, x.grad) self.assertEqual(gy_expect, y.grad) - def test_conv2d_same_padding_backward(self, device): + @dtypes(torch.float, torch.cfloat) + def test_conv2d_same_padding_backward(self, device, dtype): # Test F.conv2d gradients work with padding='same' - x = torch.rand(1, 1, 10, 11, device=device, requires_grad=True) - y = torch.rand(1, 1, 4, 5, device=device, requires_grad=True) + x = torch.rand(1, 1, 10, 11, device=device, dtype=dtype, requires_grad=True) + y = torch.rand(1, 1, 4, 5, device=device, dtype=dtype, requires_grad=True) # Symmetric padding z = F.conv2d(x, y, padding=(3, 4), dilation=2) @@ -13508,7 +13874,7 @@ def test_conv2d_same_padding_backward(self, device): x.grad, y.grad = None, None # Asymmetric padding - y = torch.rand(1, 1, 4, 4, device=device, requires_grad=True) + y = torch.rand(1, 1, 4, 4, device=device, dtype=dtype, requires_grad=True) z = F.conv2d(x, y, padding=2)[..., 1:, 1:] z.sum().backward() gx_expect, gy_expect = x.grad, y.grad @@ -13519,12 +13885,13 @@ def test_conv2d_same_padding_backward(self, device): self.assertEqual(gx_expect, x.grad) self.assertEqual(gy_expect, y.grad) - def test_conv3d_same_padding_backward(self, device): + @dtypes(torch.double, torch.cdouble) + def test_conv3d_same_padding_backward(self, device, dtype): check_forward_ad = torch.device(device).type != 'xla' # Test F.conv3d gradients work with padding='same' - x = torch.rand(1, 1, 1, 11, 12, device=device, requires_grad=True) - y = torch.rand(1, 1, 1, 2, 5, device=device, requires_grad=True) + x = torch.rand(1, 1, 1, 11, 12, dtype=dtype, device=device, requires_grad=True) + y = torch.rand(1, 1, 1, 2, 5, dtype=dtype, device=device, requires_grad=True) # Symmetric padding z = F.conv3d(x, y, padding=(0, 1, 4), dilation=2) @@ -13546,7 +13913,7 @@ def test_conv3d_same_padding_backward(self, device): check_fwd_over_rev=True) # Asymmetric padding - y = torch.rand(1, 1, 1, 4, 4, device=device, requires_grad=True) + y = torch.rand(1, 1, 1, 4, 4, dtype=dtype, device=device, requires_grad=True) z = F.conv3d(x, y, padding=2)[..., 1:, 1:] z.sum().backward() 
gx_expect, gy_expect = x.grad, y.grad @@ -13564,10 +13931,11 @@ def test_conv3d_same_padding_backward(self, device): gradgradcheck(lambda x, y: F.conv3d(x, y, padding='same'), (x, y), check_fwd_over_rev=True) - def test_conv1d_valid_padding_backward(self, device): + @dtypes(torch.float, torch.cfloat) + def test_conv1d_valid_padding_backward(self, device, dtype): # Test F.conv1d gradients work with padding='valid' - x = torch.rand(1, 1, 10, device=device, requires_grad=True) - y = torch.rand(1, 1, 4, device=device, requires_grad=True) + x = torch.rand(1, 1, 10, dtype=dtype, device=device, requires_grad=True) + y = torch.rand(1, 1, 4, dtype=dtype, device=device, requires_grad=True) F.conv1d(x, y, padding=0).sum().backward() gx_expect, gy_expect = x.grad, y.grad x.grad, y.grad = None, None @@ -13577,10 +13945,132 @@ def test_conv1d_valid_padding_backward(self, device): self.assertEqual(gx_expect, gx_actual) self.assertEqual(gy_expect, gy_actual) - def test_conv2d_valid_padding_backward(self, device): + @unittest.skipIf(not TEST_SCIPY, "Scipy required for the test.") + @dtypes(torch.float, torch.cfloat) + @parametrize_test("mode", ('valid', 'same')) + def test_conv1d_vs_scipy(self, device, dtype, mode): + t = make_tensor((1, 10), device=device, dtype=dtype) + feat_dim = t.shape[1] + weight_even = make_tensor((1, 1, 4), device=device, dtype=dtype) + weight_odd = make_tensor((1, 1, 5), device=device, dtype=dtype) + + def _test(t, weight, mode): + # SciPy expects two 1-D inputs. + t_a = t.view(-1).cpu().numpy() + w_a = weight.view(-1).cpu().numpy() + expected = scipy.signal.convolve(t_a, w_a, mode=mode) + + kwargs = {'padding': mode} + if mode == 'same': + # `same` padding in PyTorch conv1d is different + # from SciPy + p = weight.shape[2] // 2 + t = torch.nn.functional.pad(t, (p, p)) + # We have already taken care of padding + kwargs.pop("padding") + + # second input is flipped in SciPy's convolve + weight_flipped = torch.flip(weight, (2,)) + actual = torch.nn.functional.conv1d(t, weight_flipped, **kwargs).squeeze(0) + if mode == 'same': + actual = actual[:feat_dim] + + self.assertEqual(actual, expected) + + # Global dtype for this test suite is torch.double + # This leads to change in type-promotion + # and conv1d outputs `complex128` for `complex64` input. + with set_default_dtype(torch.float): + _test(t, weight_even, mode) + _test(t, weight_odd, mode) + + @unittest.skipIf(not TEST_SCIPY, "Scipy required for the test.") + @dtypes(torch.float, torch.cfloat) + @parametrize_test("mode", ('valid', 'same')) + def test_conv2d_vs_scipy(self, device, dtype, mode): + t = make_tensor((1, 5, 10), device=device, dtype=dtype) + weight_even = make_tensor((1, 1, 2, 4), device=device, dtype=dtype) + weight_odd = make_tensor((1, 1, 3, 5), device=device, dtype=dtype) + + def _test(t, weight, mode): + # SciPy expects two 2-D inputs. 
+ t_a = t.squeeze(0).cpu().numpy() + w_a = weight.squeeze(0).squeeze(0).cpu().numpy() + expected = scipy.signal.convolve2d(t_a, w_a, mode=mode) + + kwargs = {'padding': mode} + if mode == 'same': + # `same` padding in PyTorch conv2d is different + # from SciPy + left_right_pad = weight.shape[3] // 2 + top_bottom_pad = weight.shape[2] // 2 + p = (left_right_pad, left_right_pad, top_bottom_pad, top_bottom_pad) + t = torch.nn.functional.pad(t, p) + # We have already taken care of padding + kwargs.pop("padding") + + # second input is flipped in SciPy's convolve2d + weight_flipped = torch.flip(weight, (2, 3)) + actual = torch.nn.functional.conv2d(t, weight_flipped, **kwargs).squeeze(0) + if mode == 'same': + actual = actual[:5, :10] + + self.assertEqual(actual, expected, rtol=2e-5, atol=5e-6) + + # Global dtype for this test suite is torch.double + # This leads to change in type-promotion + # and conv1d outputs `complex128` for `complex64` input. + with set_default_dtype(torch.float): + _test(t, weight_even, mode) + _test(t, weight_odd, mode) + + @unittest.skipIf(not TEST_SCIPY, "Scipy required for the test.") + @dtypes(torch.float, torch.cfloat) + @parametrize_test("mode", ('valid', 'same')) + def test_conv3d_vs_scipy(self, device, dtype, mode): + t = make_tensor((1, 5, 5, 10), device=device, dtype=dtype) + weight_even = make_tensor((1, 1, 2, 2, 4), device=device, dtype=dtype) + weight_odd = make_tensor((1, 1, 2, 3, 5), device=device, dtype=dtype) + + def _test(t, weight, mode): + # SciPy expects two 3-D inputs. + t_a = t.squeeze(0).cpu().numpy() + w_a = weight.squeeze(0).squeeze(0).cpu().numpy() + expected = scipy.signal.convolve(t_a, w_a, mode=mode) + + kwargs = {'padding': mode} + if mode == 'same': + # `same` padding in PyTorch conv3d is different + # from SciPy + left_right_pad = weight.shape[4] // 2 + top_bottom_pad = weight.shape[3] // 2 + front_back_pad = weight.shape[2] // 2 + p = (left_right_pad, left_right_pad, top_bottom_pad, top_bottom_pad, + front_back_pad, front_back_pad) + t = torch.nn.functional.pad(t, p) + # We have already taken care of padding + kwargs.pop("padding") + + # second input is flipped in SciPy's convolve + weight_flipped = torch.flip(weight, (2, 3, 4)) + actual = torch.nn.functional.conv3d(t, weight_flipped, **kwargs).squeeze(0) + if mode == 'same': + actual = actual[:5, :5, :10] + + self.assertEqual(actual, expected, rtol=2e-5, atol=5e-6) + + # Global dtype for this test suite is torch.double + # This leads to change in type-promotion + # and conv1d outputs `complex128` for `complex64` input. 
+ with set_default_dtype(torch.float): + _test(t, weight_even, mode) + _test(t, weight_odd, mode) + + @dtypes(torch.float, torch.complex64) + def test_conv2d_valid_padding_backward(self, device, dtype): # Test F.conv2d gradients work with padding='valid' - x = torch.rand(1, 1, 1, 10, device=device, requires_grad=True) - y = torch.rand(1, 1, 1, 4, device=device, requires_grad=True) + x = torch.rand(1, 1, 1, 10, device=device, dtype=dtype, requires_grad=True) + y = torch.rand(1, 1, 1, 4, device=device, dtype=dtype, requires_grad=True) F.conv2d(x, y, padding=0).sum().backward() gx_expect, gy_expect = x.grad, y.grad x.grad, y.grad = None, None @@ -13590,12 +14080,13 @@ def test_conv2d_valid_padding_backward(self, device): self.assertEqual(gx_expect, gx_actual) self.assertEqual(gy_expect, gy_actual) - def test_conv3d_valid_padding_backward(self, device): + @dtypes(torch.double, torch.cdouble) + def test_conv3d_valid_padding_backward(self, device, dtype): check_forward_ad = torch.device(device).type != 'xla' # Test F.conv3d gradients work with padding='valid' - x = torch.rand(1, 1, 1, 1, 10, device=device, requires_grad=True) - y = torch.rand(1, 1, 1, 1, 4, device=device, requires_grad=True) + x = torch.rand(1, 1, 1, 1, 10, dtype=dtype, device=device, requires_grad=True) + y = torch.rand(1, 1, 1, 1, 4, dtype=dtype, device=device, requires_grad=True) F.conv3d(x, y, padding=0).sum().backward() gx_expect, gy_expect = x.grad, y.grad x.grad, y.grad = None, None @@ -13608,6 +14099,17 @@ def test_conv3d_valid_padding_backward(self, device): gradcheck(lambda x, y: F.conv3d(x, y, padding='valid'), (x, y), check_forward_ad=check_forward_ad) gradgradcheck(lambda x, y: F.conv3d(x, y, padding='valid'), (x, y), check_fwd_over_rev=check_forward_ad) + @parametrize_test("N", range(2, 4), name_fn=lambda N: 'ConvTranspose{}d'.format(N)) + def test_conv_transpose_with_output_size_and_no_batch_dim(self, device, N): + # For inputs with no batch dim, verify output is the correct shape when output_size is set. + # See https://github.com/pytorch/pytorch/issues/75889 + inp = torch.randn((1, 15, 13) if N == 2 else (1, 15, 13, 13), device=device) + output_size = (1, 240, 200) if N == 2 else (1, 240, 200, 200) + ConvTransposeNd = getattr(nn, 'ConvTranspose{}d'.format(N)) + m = ConvTransposeNd(1, 1, kernel_size=16, stride=16, padding=7, bias=False, device=device) + output = m(inp, output_size=output_size) + self.assertEqual(output.shape, output_size) + @skipMeta @parametrize_test("input_shape,transposed,dilated,groups,layout,backend_expected", [ # === slow === @@ -13798,6 +14300,20 @@ def _make_noncontiguous(inp): if layout is torch._mkldnn: return + if backend_actual != torch._C._ConvBackend.Empty: # FIXME: forward AD fails + # Forward AD and forward-over-reverse AD smoke test in float32 + # TODO: remove this if we introduce per-op gradient tests for float32 + with fwAD.dual_level(): + dual_inputs = [(fwAD.make_dual(i, torch.rand_like(i)) if isinstance(i, torch.Tensor) else i) for i in inputs] + # Forward AD + output = convolution(*dual_inputs) + # Forward over reverse AD + grad_output_d = fwAD.make_dual(torch.rand_like(output), torch.rand_like(output)) + if has_bias: + torch.autograd.grad(output, [x, weight, bias], grad_output_d) + else: + torch.autograd.grad(output, [x, weight], grad_output_d) + # Convert to float64 for gradcheck. 
x = x.to(torch.float64).detach().requires_grad_(True) weight = weight.to(torch.float64).detach().requires_grad_(True) @@ -13979,6 +14495,9 @@ def test_LayerNorm_general(self, device): if self.device_type == 'cuda': self._test_LayerNorm_cuda_half(device) + if self.device_type == 'cpu': + self._test_LayerNorm_cpu_mixed_dtype(device) + @onlyNativeDeviceTypes def test_LayerNorm_numeric(self, device): def layer_norm_ref(X, gamma, beta, normalized_shape, eps): @@ -14005,9 +14524,31 @@ def layer_norm_ref(X, gamma, beta, normalized_shape, eps): Y_cpu = layer_norm(X.cpu()) self.assertEqual(Y_cpu, Y, rtol=0, atol=1e-5) - @onlyNativeDeviceTypes - def test_GroupNorm_general(self, device): - self._test_GroupNorm_general(device) + @onlyCPU + def test_glu_bfloat16(self, device): + def test_dtype(fn, input, dtype): + input = input.detach().clone().to(dtype=dtype).requires_grad_(True) + input2 = input.detach().clone().float().requires_grad_(True) + out = fn(input) + out.sum().backward() + out2 = fn(input2) + out2.sum().backward() + self.assertEqual(out.dtype, dtype) + self.assertEqual(input.grad.dtype, dtype) + self.assertEqual(out, out2, exact_dtype=False) + self.assertEqual(input.grad, input2.grad, atol=1e-2, rtol=0, exact_dtype=False) + + def func(device): + return torch.nn.GLU(dim=-1).to(device) + + shapes = [[1, 3, 1, 6], [1, 3, 1, 128], [1, 3, 256, 256]] + for shape in shapes: + x = torch.randn(shape, device=device) + test_dtype(func(device), x, torch.bfloat16) + + @onlyNativeDeviceTypes + def test_GroupNorm_general(self, device): + self._test_GroupNorm_general(device) if self.device_type == 'cuda': self._test_GroupNorm_cuda_half() @@ -14028,13 +14569,13 @@ def test_GroupNorm_empty(self, device): @onlyCPU @dtypes(torch.float, torch.double) def test_groupnorm_nhwc(self, device, dtype): - def helper(self, size, groups): + def helper(self, size, groups, memory_format): channels = size[1] input = torch.randn(size, dtype=dtype, device=device, requires_grad=True) - input = input.contiguous(memory_format=torch.channels_last) + input = input.contiguous(memory_format=memory_format) input.retain_grad() grad = torch.randn(size, dtype=dtype, device=device) - grad = grad.contiguous(memory_format=torch.channels_last) + grad = grad.contiguous(memory_format=memory_format) gn = nn.GroupNorm(groups, channels).to(device).to(dtype) gn.weight.data.uniform_() gn.bias.data.uniform_() @@ -14049,15 +14590,16 @@ def helper(self, size, groups): ref_out = ref_gn(ref_input) ref_out.backward(ref_grad) - self.assertTrue(out.is_contiguous(memory_format=torch.channels_last)) + self.assertTrue(out.is_contiguous(memory_format=memory_format)) self.assertTrue(ref_out.is_contiguous()) self.assertEqual(out, ref_out) self.assertEqual(gn.weight.grad, ref_gn.weight.grad) self.assertEqual(gn.bias.grad, ref_gn.bias.grad) self.assertEqual(input.grad, ref_input.grad) - helper(self, (4, 8, 10, 10), 4) - helper(self, (2, 30, 9, 9), 3) + helper(self, (4, 8, 10, 10), 4, torch.channels_last) + helper(self, (2, 30, 9, 9), 3, torch.channels_last) + helper(self, (2, 9, 7, 11, 15), 3, torch.channels_last_3d) @onlyNativeDeviceTypes def test_GroupNorm_numeric(self, device): @@ -14095,10 +14637,10 @@ def test_pad(self, device, dtype): # Assert assertion errors are raised for invalid circular padding values inputs = torch.randn(1, 1, 4, device=device, dtype=dtype, requires_grad=True) # Should raise error when trying to wrap around more than once - self.assertRaises(AssertionError, lambda: F.pad(inputs, (5, 4), mode='circular')) - 
self.assertRaises(AssertionError, lambda: F.pad(inputs, (3, 6), mode='circular')) + self.assertRaises(RuntimeError, lambda: F.pad(inputs, (5, 4), mode='circular')) + self.assertRaises(RuntimeError, lambda: F.pad(inputs, (3, 6), mode='circular')) # Should raise error when negative padding results in negative output shape - self.assertRaises(AssertionError, lambda: F.pad(inputs, (-3, -2), mode='circular')) + self.assertRaises(RuntimeError, lambda: F.pad(inputs, (-3, -2), mode='circular')) # assert that relfection padding errors when pad >= input size expected_err_msg = r"Padding size should be less than the corresponding input dimension" @@ -14250,11 +14792,29 @@ def test_Bilinear_empty(self, device): @expectedFailureMeta # RuntimeError: cannot reshape tensor of 0 elements into shape [1, 0, -1] @onlyNativeDeviceTypes def test_TransformerEncoderLayer_empty(self, device): - for batch_first, input_shape in [(True, (0, 10, 512)), - (False, (10, 0, 512))]: - input = torch.rand(*input_shape, device=device) - encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=batch_first).to(device) - self._test_module_empty_input(encoder_layer, input, check_size=False) + for training in (True, False): + for batch_first, input_shape in [(True, (0, 10, 512)), + (False, (10, 0, 512))]: + input = torch.rand(*input_shape, device=device) + encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=batch_first).to(device) + if not training: + encoder_layer = encoder_layer.eval() + with torch.no_grad(): + self._test_module_empty_input(encoder_layer, input, check_size=False, inference=True) + if batch_first and not TEST_WITH_CROSSREF: + with torch.no_grad(): + # A NestedTensor with no tensors inside it doesn't have dim 3 (or dim + # 2, for that matter) so it can't hit the fast path, nor can we give a + # result. 
+ with self.assertRaisesRegex( + AssertionError, 'MultiheadAttention does not support NestedTensor outside'): + nt = torch.nested_tensor([], device=device) + self._test_module_empty_input(encoder_layer, nt, check_size=False, inference=True) + + nt = torch.nested_tensor([torch.rand(0, 512, device=device)], device=device) + self._test_module_empty_input(encoder_layer, nt, check_size=False, inference=True) + else: + self._test_module_empty_input(encoder_layer, input, check_size=False) @expectedFailureMeta # RuntimeError: cannot reshape tensor of 0 elements into shape [1, 0, -1] @onlyNativeDeviceTypes @@ -14430,6 +14990,20 @@ def test_FractionalMaxPool3d_zero_batch(self, device): inp = torch.randn(1, 0, 50, 32, 32, device=device) mod(inp) + @onlyNativeDeviceTypes + def test_FractionalMaxPool2d_zero_out_size(self, device): + mod = nn.FractionalMaxPool2d([2, 2], output_size=[0, 1]) + inp = torch.rand([16, 50, 32, 32], device=device) + out = mod(inp) + self.assertEqual(out, torch.empty((16, 50, 0, 1), device=device)) + + @onlyNativeDeviceTypes + def test_FractionalMaxPool3d_zero_out_size(self, device): + mod = nn.FractionalMaxPool3d([3, 2, 2], output_size=[0, 1, 1]) + inp = torch.rand([16, 50, 32, 32], device=device) + out = mod(inp) + self.assertEqual(out, torch.empty((16, 0, 1, 1), device=device)) + @onlyNativeDeviceTypes def test_Unfold_empty(self, device): inp = torch.randn(0, 3, 3, 4, device=device) @@ -14607,26 +15181,27 @@ def test_BatchNorm_empty(self, device): self.assertEqual(mod.weight.grad, torch.tensor([0., 0, 0], device=device)) self.assertEqual(mod.bias.grad, torch.tensor([0., 0, 0], device=device)) - def test_conv_empty_channel(self, device): + @dtypes(torch.float, torch.cfloat) + def test_conv_empty_channel(self, device, dtype): in_channels = 0 - mod = torch.nn.Conv1d(in_channels, 8, 2, stride=2).to(device) - inp = torch.randn(2, 0, 15, device=device) + mod = torch.nn.Conv1d(in_channels, 8, 2, stride=2, dtype=dtype).to(device) + inp = torch.randn(2, 0, 15, device=device, dtype=dtype) self._test_module_empty_input(mod, inp, check_size=False) with self.assertRaisesRegex(RuntimeError, "Given groups=1, weight"): inp = torch.randn(2, 1, 0, device=device) mod(inp) - mod = torch.nn.Conv2d(in_channels, 33, 3, stride=2).to(device) - inp = torch.randn(2, 0, 50, 100, device=device) + mod = torch.nn.Conv2d(in_channels, 33, 3, stride=2, dtype=dtype).to(device) + inp = torch.randn(2, 0, 50, 100, device=device, dtype=dtype) self._test_module_empty_input(mod, inp, check_size=False) with self.assertRaisesRegex(RuntimeError, "Given groups=1, weight"): inp = torch.randn(2, 1, 40, 0, device=device) mod(inp) - mod = torch.nn.Conv3d(in_channels, 33, 3, stride=2).to(device) - inp = torch.randn(2, 0, 50, 20, 40, device=device) + mod = torch.nn.Conv3d(in_channels, 33, 3, stride=2, dtype=dtype).to(device) + inp = torch.randn(2, 0, 50, 20, 40, device=device, dtype=dtype) self._test_module_empty_input(mod, inp, check_size=False) with self.assertRaisesRegex(RuntimeError, "Given groups=1, weight"): @@ -14734,6 +15309,21 @@ def test_one_hot(self, device): with self.assertRaises(RuntimeError): torch.nn.functional.one_hot(torch.tensor([3, 4, 1, 0], device=device), -2) + def test_nn_empty(self, device): + # One off tests to ensure scalars from nn.yaml are properly applied + def verify_scalars(input, output): + self.assertEqual(input.shape, output.shape) + self.assertEqual(0, output.numel()) + + for input_shape in [(0), (0, 2)]: + for module in [torch.nn.ELU, torch.nn.Hardtanh, torch.nn.LeakyReLU, 
torch.nn.LogSigmoid, + torch.nn.RReLU, torch.nn.Softshrink, torch.nn.Softplus, torch.nn.Sigmoid, + torch.nn.Tanh]: + input = torch.randn(input_shape, device=device, requires_grad=True) + m = module() + output = m(input) + verify_scalars(input, output) + def test_nn_scalars(self, device): # One off tests to ensure scalars from nn.yaml are properly applied def verify_scalars(input, output): @@ -14887,6 +15477,31 @@ def test_unequal_when_beta_is_greater_than_one(): test_unequal_when_beta_is_less_than_one() test_unequal_when_beta_is_greater_than_one() + @onlyCPU + def test_smooth_l1_loss_bfloat16(self, device): + def test_dtype(fn, input, target, dtype): + input = input.detach().clone().to(dtype=dtype).requires_grad_(True) + input2 = input.detach().clone().float().requires_grad_(True) + target = target.detach().clone().to(dtype=dtype) + target2 = target.detach().clone().float() + out = fn(input, target) + out.sum().backward() + out2 = fn(input2, target2) + out2.sum().backward() + self.assertEqual(out.dtype, dtype) + self.assertEqual(input.grad.dtype, dtype) + self.assertEqual(out, out2, exact_dtype=False) + self.assertEqual(input.grad, input2.grad, exact_dtype=False) + + def func(device): + return nn.SmoothL1Loss().to(device=device) + + shapes = [[1, 3, 1, 6], [1, 3, 1, 128], [1, 3, 128, 128]] + for shape in shapes: + x = torch.randn(shape, device=device, requires_grad=True) + t = torch.randn(shape, device=device) + test_dtype(func(device), x, t, torch.bfloat16) + # We don't want to make propagating NaN a hard requirement on ops, but for # these easy ones, we should make them do so. def test_nonlinearity_propagate_nan(self, device): @@ -15662,9 +16277,7 @@ def test_upsamplingBicubic2d(self, device, antialias, align_corners): # for scale_factor in [0.5, 1, 1.5, 2]: for scale_factor in [2, ]: in_t = torch.ones(2, 3, 8, 8, device=device) - print("dtype: ", in_t.dtype) out_t = F.interpolate(in_t, scale_factor=scale_factor, **kwargs) - print(out_t) out_size = int(math.floor(in_t.shape[-1] * scale_factor)) expected_out = torch.ones(2, 3, out_size, out_size, device=device) self.assertEqual(expected_out, out_t, atol=1e-5, rtol=0) @@ -15745,6 +16358,48 @@ def helper(n, c, h, w, output_height, output_width, contig): helper(4, 8, 9, 14, 5, 8, contig) helper(4, 8, 11, 11, 1, 1, contig) + @dtypes(torch.float, torch.double) + def test_pooling_max_nhwc(self, device, dtype): + def helper(n, c, h, w, kernel_size, stride, padding, dilation, contig, device): + output_height = math.floor((h + 2 * padding[0] - dilation[0] * (kernel_size[0] - 1) - 1) / stride[0] + 1) + output_width = math.floor((w + 2 * padding[1] - dilation[1] * (kernel_size[1] - 1) - 1) / stride[1] + 1) + + input = torch.randint(1, 10, (n, c, h, w), device=device, dtype=dtype) + input = input.contiguous(memory_format=torch.channels_last) + grad = torch.randint(1, 10, (n, c, output_height, output_width), device=device, dtype=dtype) + grad = grad.contiguous(memory_format=torch.channels_last) + if not contig: + input = input[:, ::2, :, :] + grad = grad[:, ::2, :, :] + input.requires_grad_(True) + pool = torch.nn.MaxPool2d( + kernel_size, stride, padding, dilation, return_indices=True, ceil_mode=False + ) + + ref_input = input.detach().clone().contiguous().requires_grad_(True) + ref_grad = grad.detach().clone().contiguous() + ref_pool = torch.nn.MaxPool2d( + kernel_size, stride, padding, dilation, return_indices=True, ceil_mode=False + ).to(device) + + out, ind = pool(input) + out.backward(grad) + ref_out, ref_ind = ref_pool(ref_input) + 
ref_out.backward(ref_grad) + + self.assertTrue(out.is_contiguous(memory_format=torch.channels_last)) + self.assertTrue(ref_out.is_contiguous()) + self.assertTrue(ind.is_contiguous(memory_format=torch.channels_last)) + self.assertTrue(ref_ind.is_contiguous()) + self.assertEqual(out, ref_out) + self.assertEqual(ind, ref_ind) + self.assertEqual(input.grad, ref_input.grad) + + for contig in [True, False]: + helper(4, 8, 10, 10, (2, 2), (1, 1), (1, 1), (2, 2), contig, device) + helper(4, 8, 9, 14, (2, 2), (1, 1), (1, 1), (2, 2), contig, device) + helper(4, 8, 11, 11, (4, 4), (2, 2), (2, 2), (2, 2), contig, device) + def test_embedding_dense_grad(self, device): embd = nn.Embedding(20, 20).to(device) weight = embd.weight @@ -16112,25 +16767,93 @@ def embedding_bag_check(indices, weights, mode, sparse, padding_idx): rtol = None self.assertEqual(grad, grad_check, msg=msg, atol=atol, rtol=rtol) + def _slow_masked_softmax(self, input, mask): + exp = torch.exp(input) + exp = exp * mask + s = exp.sum(dim=3, keepdim=True).expand(exp.size()) + return exp / s + def test_masked_softmax(self, device): sizes = [(1, 1, 32), (3, 16, 310), (12, 4, 1024), (4, 2, 1200)] for (B, num_heads, L) in sizes: - input = torch.randn((B, num_heads, L, L)) - mask = torch.randint(0, 2, (B, L)) + for dim in [0, 3]: + input = torch.randn((B, num_heads, L, L)) + mask = torch.randint(0, 2, (B, L)) + mask = mask.reshape(B, 1, 1, L).expand(B, num_heads, L, L).bool() + if (self.device_type == "cuda"): + input = input.cuda() + mask = mask.cuda() + native_res = torch._masked_softmax(input, mask, dim) + mask = ~mask + + def slow_masked_softmax(input, mask): + exp = torch.exp(input) + exp = exp * mask + s = exp.sum(dim=dim, keepdim=True).expand(exp.size()) + return exp / s + + pt_res = slow_masked_softmax(input, mask) + pt_res = torch.nan_to_num(pt_res) + + mask_not = mask.logical_not() + # In result, should only fill the entirely masked out rows since those are non-deterministic (*may* be 0) + # Converts rows with all True's to False + mask_out = mask_not.all(dim, keepdim=True).expand(mask_not.shape) + self.assertEqual( + pt_res.masked_fill(mask_out, 0), + native_res.masked_fill(mask_out, 0), + exact_dtype=True + ) + + def _test_masked_softmax_helper(self, input, dim, mask): + input_ref = input.detach().clone().requires_grad_() + result = torch._masked_softmax(input, mask, dim) + + expected = torch._softmax(input_ref.masked_fill(mask, float('-inf')), dim, False) + grad = torch.randn_like(expected).to(dtype=expected.dtype) + + result.backward(grad) + expected.backward(grad) + + # Make sure the optional argument works as well + if dim == input.dim() - 1: + input_ref_default = input.detach().clone().requires_grad_() + result_default = torch._masked_softmax(input_ref_default, mask) + result_default.backward(grad) + self.assertEqual(result, result_default) + self.assertEqual(input.grad, input_ref_default.grad) + + # In result, should only fill the entirely masked out rows since those are non-deterministic (*may* be 0) + # Converts rows with all True's to False + mask_out = mask.all(dim, keepdim=True).expand(mask.shape) + self.assertEqual(result.masked_fill(mask_out, 0), expected.masked_fill(mask_out, 0)) + + self.assertEqual(input.grad, torch.nan_to_num(input_ref.grad)) + self.assertEqual(input.grad, input.grad.masked_fill(mask, 0.0)) + + def test_masked_softmax_grad(self, device): + shapes = [(1, 1, 32), (3, 16, 310), (12, 4, 1024), (4, 2, 1200)] + for shape in shapes: + dims = [0, len(shape) - 1] if len(shape) > 0 else [0] + for dim in 
dims: + input = torch.randn(shape, requires_grad=True) + mask = torch.randint(0, 2, shape).bool() + if (self.device_type == "cuda"): + input = input.cuda().detach().requires_grad_() + mask = mask.cuda() + self._test_masked_softmax_helper(input, dim, mask) + + # In this test, the forward pass is expected to produce nan's because when dim=0, we only have unspecified values + def test_masked_softmax_forward_with_nans(self, device): + dim = 0 + shapes = [(4, 5), (50, 100), (1500, 1200)] + for (x, y) in shapes: + input = torch.randn((x, y), requires_grad=True) + mask = torch.tensor([i % 2 for i in range(y)]).expand((x, y)).bool() if (self.device_type == "cuda"): - input = input.cuda() + input = input.cuda().detach().requires_grad_() mask = mask.cuda() - mask = mask.reshape(B, 1, 1, L).expand(B, num_heads, L, L).bool() - native_res = torch._masked_softmax(input, mask) - mask = mask.float() - - def slow_masked_softmax(input, mask): - exp = torch.exp(input) - exp = exp * mask - s = exp.sum(dim=3, keepdim=True).expand(exp.size()) - return exp / s - pt_res = slow_masked_softmax(input, mask) - self.assertEqual(pt_res, native_res, exact_dtype=True) + self._test_masked_softmax_helper(input, dim, mask) @onlyCUDA def test_masked_softmax_transformer_layout(self, device): @@ -16138,25 +16861,40 @@ def test_masked_softmax_transformer_layout(self, device): num_heads = 16 L = 42 input = torch.randn((B, num_heads, L, L)) + dim = input.dim() - 1 mask = torch.randint(0, 2, (B, L)) if (self.device_type == "cuda"): input = input.cuda() mask = mask.cuda() mask = mask.bool() - native_res = torch._masked_softmax(input, mask) + native_res = torch._masked_softmax(input, mask, dim) mask = mask.reshape(B, 1, 1, L).expand(B, num_heads, L, L) + mask = ~mask mask = mask.float() - def slow_masked_softmax(input, mask): - exp = torch.exp(input) - exp = exp * mask - s = exp.sum(dim=3, keepdim=True).expand(exp.size()) - return exp / s - pt_res = slow_masked_softmax(input, mask) + pt_res = self._slow_masked_softmax(input, mask) + self.assertEqual(pt_res, native_res, exact_dtype=True) + + @onlyCUDA + def test_masked_softmax_TxT_layout(self, device): + B = 211 + num_heads = 16 + L = 42 + input = torch.randn((B, num_heads, L, L)) + dim = input.dim() - 1 + mask = torch.randint(0, 2, (L, L)) + if (self.device_type == "cuda"): + input = input.cuda() + mask = mask.cuda() + mask = mask.bool() + native_res = torch._masked_softmax(input, mask, dim) + mask = mask.expand(B, num_heads, L, L) + mask = ~mask + mask = mask.float() + + pt_res = self._slow_masked_softmax(input, mask) self.assertEqual(pt_res, native_res, exact_dtype=True) - # Test fails on Vg20 - @skipCUDAIfRocm @dtypesIfCUDA(torch.half, torch.float) @dtypes(torch.float) def test_softmax_results(self, device, dtype): @@ -16454,7 +17192,6 @@ def test_conv_transposed_large(self, device): def test_conv_large(self, device): dtype = torch.half if self.device_type == 'cuda' else torch.float conv = nn.Conv2d(2, 2, 8, 8, bias=False).to(device).to(dtype) - conv.weight = torch.nn.Parameter(torch.randn(2, 2, 8, 8, device=device, dtype=dtype) / 64) input_large = torch.randn(4097, 2, 512, 512, dtype=dtype, device=device) # forward ret = conv(input_large) @@ -16538,6 +17275,7 @@ def _test_gumbel_softmax_grad(self, device, dtype): tol = 2 * torch.finfo(dtype).eps self.assertEqual(logits_soft.grad, logits_hard.grad, atol=tol, rtol=0) + @skipIfMps @dtypesIfCUDA(torch.half, torch.float, torch.double) @dtypes(torch.float, torch.double) def test_gumbel_softmax(self, device, dtype): @@ -16941,8 
+17679,6 @@ def test_embedding_max_norm_device(self, device, dtype): self.assertEqual(output[1], output[2]) self.assertTrue(output.data.norm(p=2, dim=1).le(1).all()) - # Test fails on Vg20 - @skipCUDAIfRocm @onlyCUDA @dtypes(torch.half, torch.float) def test_softmax(self, device, dtype): @@ -17029,7 +17765,7 @@ def test_embedding_bag_empty_input(self, device, dtypes): output = Embed(input=x, offsets=torch.tensor([0, 0], device=device, dtype=dtypes[1])) self.assertEqual(output, torch.zeros_like(output)) - @skipCUDAIf(True, "cuda assert is not recovarable.") + @skipCUDAIf(True, "no out-of-bounds check on CUDA for perf.") @dtypes(*itertools.product((torch.float, torch.double), (torch.int, torch.long))) @parametrize_test("padding_idx", [None, 0]) @parametrize_test("mode", ["sum", "mean", "max"]) @@ -17148,15 +17884,15 @@ def _embedding_bag_reference_impl(self, input, weight, offsets=None, mode='sum', bags.append(embeddings.narrow(0, offset, length).max(0)[0]) return torch.stack(bags) - @dtypesIfCUDA(*itertools.product((torch.int, torch.long), (torch.int, torch.long), (torch.float, torch.double, torch.half))) - @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long), (torch.float, torch.double))) + @skipMeta + @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long), (torch.half, torch.float, torch.double))) def test_EmbeddingBag_empty_per_sample_weights_and_offsets(self, device, dtypes): # Test empty input and per sample weight, and backward pass. There was a CUDA # invalid configuration bug (more context in #46572) def test_per_sample_weights(mode, trainable_scale): es = nn.EmbeddingBag(5, 2, mode=mode).to(dtype=dtypes[2], device=device) es.weight.data.copy_( - torch.arange(1, 11, device=device, dtype=dtypes[2]).view_as(es.weight)) + torch.arange(1, 11, device=device).view_as(es.weight).to(dtypes[2])) input = torch.tensor([], device=device, dtype=dtypes[0]) offsets = torch.tensor([0, 0, 0, 0, 0], device=device, dtype=dtypes[1]) per_sample_weights = torch.randn_like(input, dtype=dtypes[2]) \ @@ -17187,13 +17923,13 @@ def test_per_sample_weights(mode, trainable_scale): for mode, trainable in itertools.product(modes, trainable_scale): test_per_sample_weights(mode, trainable) - @dtypesIfCUDA(*itertools.product((torch.int, torch.long), (torch.int, torch.long), (torch.float, torch.double, torch.half))) - @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long), (torch.float, torch.double))) + @skipMeta + @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long), (torch.float, torch.double, torch.half))) def test_EmbeddingBag_per_sample_weights_and_offsets(self, device, dtypes): def test_per_sample_weights(mode, trainable_scale): es = nn.EmbeddingBag(5, 2, mode=mode).to(dtype=dtypes[2], device=device) es.weight.data.copy_( - torch.arange(1, 11, device=device, dtype=dtypes[2]).view_as(es.weight)) + torch.arange(1, 11, device=device).view_as(es.weight).to(dtypes[2])) input = torch.tensor([3, 1, 1, 1, 4, 0], device=device, dtype=dtypes[0]) offsets = torch.tensor([0, 0, 3, 3, 6], device=device, dtype=dtypes[1]) per_sample_weights = torch.randn_like(input, dtype=dtypes[2]) \ @@ -17221,13 +17957,13 @@ def test_per_sample_weights(mode, trainable_scale): for mode, trainable in itertools.product(modes, trainable_scale): test_per_sample_weights(mode, trainable) - @dtypesIfCUDA(*itertools.product((torch.int, torch.long), (torch.int, torch.long), (torch.float, torch.double, torch.half))) - @dtypes(*itertools.product((torch.int, 
torch.long), (torch.int, torch.long), (torch.float, torch.double))) + @skipMeta + @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long), (torch.float, torch.double, torch.half))) def test_EmbeddingBag_per_sample_weights_and_new_offsets(self, device, dtypes): def test_per_sample_weights_new_offsets(mode, trainable_scale, include_last_offset, has_weight=True): es = nn.EmbeddingBag(5, 2, mode=mode, include_last_offset=include_last_offset).to(dtype=dtypes[2], device=device) es.weight.data.copy_( - torch.arange(1, 11, device=device, dtype=dtypes[2]).view_as(es.weight)) + torch.arange(1, 11, device=device).view_as(es.weight).to(dtypes[2])) input = torch.tensor([3, 1, 1, 1, 4, 0], device=device, dtype=dtypes[0]) offsets = torch.tensor([0, 0, 3, 3, 6], device=device, dtype=dtypes[1]) @@ -17383,7 +18119,7 @@ def _test_EmbeddingBag( ): # check a known test example es = nn.EmbeddingBag(5, 2, mode=mode, sparse=sparse).to(device, wdtype) - es.weight.data.copy_(torch.arange(1, 11, device=device, dtype=wdtype).view_as(es.weight)) + es.weight.data.copy_(torch.arange(1, 11, device=device).view_as(es.weight).to(wdtype)) input = torch.tensor([3, 1, 1, 1, 4, 0], device=device, dtype=dtype) offsets = torch.tensor([0, 0, 3, 3, 6], device=device, dtype=odtype) @@ -17486,8 +18222,8 @@ def _test_EmbeddingBag( offset[-1] = 100 self.assertRaises(RuntimeError, lambda: es(input.view(-1), offset)) - @dtypesIfCUDA(*itertools.product((torch.int, torch.long), (torch.int, torch.long), (torch.float, torch.double, torch.half))) - @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long), (torch.float, torch.double))) + @skipMeta + @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long), (torch.float, torch.double, torch.half))) def test_embedding_bag_device(self, device, dtypes): self._test_EmbeddingBag(device, 'sum', False, wdtype=dtypes[2], dtype=dtypes[0], odtype=dtypes[1]) self._test_EmbeddingBag(device, 'mean', False, wdtype=dtypes[2], dtype=dtypes[0], odtype=dtypes[1]) @@ -17500,7 +18236,7 @@ def test_embedding_bag_device(self, device, dtypes): elif self.device_type == 'cpu': # TODO: figure out why precision on sparse embeddings isn't the # same as for dense. 
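The EmbeddingBag checks in the hunks above all reduce to comparing nn.EmbeddingBag against a plain Python loop over the offsets. A minimal sketch of that reduction for mode='sum' (not taken from the diff; names and shapes are illustrative only, and the harness's _embedding_bag_reference_impl does the same for 'mean' and 'max' with rows.mean(0) / rows.max(0)[0]):

import torch
import torch.nn.functional as F

def naive_embedding_bag_sum(weight, input, offsets, per_sample_weights=None):
    # bag i gathers weight rows for input[offsets[i]:offsets[i+1]] and sums them;
    # an empty slice yields a zero bag, matching F.embedding_bag's behaviour.
    starts = offsets.tolist()
    ends = starts[1:] + [input.numel()]
    bags = []
    for start, end in zip(starts, ends):
        rows = weight[input[start:end]]
        if per_sample_weights is not None:
            rows = rows * per_sample_weights[start:end].unsqueeze(1)
        bags.append(rows.sum(0))
    return torch.stack(bags)

weight = torch.arange(1., 11.).view(5, 2)
input = torch.tensor([3, 1, 1, 1, 4, 0])
offsets = torch.tensor([0, 0, 3, 3, 6])
assert torch.allclose(
    naive_embedding_bag_sum(weight, input, offsets),
    F.embedding_bag(input, weight, offsets, mode='sum'))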
- test_backward = dtypes[2] is not torch.float + test_backward = dtypes[2] is not torch.float and dtypes[2] is not torch.float16 self._test_EmbeddingBag( device, @@ -17521,8 +18257,8 @@ def test_embedding_bag_device(self, device, dtypes): test_backward=test_backward, ) - @dtypesIfCUDA(*itertools.product((torch.int, torch.long), (torch.int, torch.long), (torch.float, torch.double, torch.half))) - @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long), (torch.float, torch.double))) + @skipMeta + @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long), (torch.float, torch.double, torch.half))) def test_embedding_bag_non_contiguous_weight(self, device, dtypes): weight_tensor = torch.randn(3, 4, dtype=dtypes[2], device=device) @@ -17546,13 +18282,16 @@ def test_embedding_bag_non_contiguous_weight(self, device, dtypes): ) self.assertEqual(output_non_contig, output_contig) - @onlyCUDA @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long))) def test_embedding_bag_bfloat16(self, device, dtypes): self._test_EmbeddingBag(device, 'sum', True, wdtype=torch.bfloat16, dtype=dtypes[0], odtype=dtypes[1], test_backward=True) self._test_EmbeddingBag(device, 'mean', True, wdtype=torch.bfloat16, dtype=dtypes[0], odtype=dtypes[1], test_backward=True) + @onlyNativeDeviceTypes # currently fails on XLA + @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long))) + def test_embedding_bag_half(self, device, dtypes): + self._test_EmbeddingBag(device, 'sum', True, wdtype=torch.float16, dtype=dtypes[0], odtype=dtypes[1], test_backward=True) @onlyCUDA @dtypes(torch.half, torch.float, torch.double) @@ -17569,7 +18308,33 @@ def test_multihead_attention_dtype(self, device, dtype): self.assertEqual(q.size(), out[0].size()) self.assertEqual(dtype, out[0].dtype) - @dtypesIfCUDA(*get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM)) + @onlyCUDA + @dtypes(torch.half, torch.float, torch.double) + def test_multihead_attention_dtype_batch_first(self, device, dtype): + embed_dim = 128 + num_heads = 8 + sl = 10 + bs = 8 + # With batch_first=True, we have the possibility of hitting + # the native fast path if we call .eval() and enable inference + # mode. Test both paths. 
+ for training in (True, False): + model = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True).cuda().to(dtype) + if not training: + model = model.eval() + cm = torch.no_grad() + else: + cm = contextlib.nullcontext() + with cm: + q = torch.randn(bs, sl, embed_dim, device=device, dtype=dtype) + k = torch.randn(bs, sl, embed_dim, device=device, dtype=dtype) + v = torch.randn(bs, sl, embed_dim, device=device, dtype=dtype) + # fast path currently doesn't support weights + out = model(q, k, v, need_weights=False) + self.assertEqual(q.size(), out[0].size()) + self.assertEqual(dtype, out[0].dtype) + + @dtypesIfCUDA(*floating_types_and(torch.half, *[torch.bfloat16] if AMPERE_OR_ROCM else [])) @dtypes(torch.float) def test_Conv2d_naive_groups(self, device, dtype): # Check that grouped convolutions matches two half convolutions @@ -17604,7 +18369,7 @@ def test_Conv2d_naive_groups(self, device, dtype): torch.cat([m1.weight.grad.data, m2.weight.grad.data], 0), atol=dtype2prec_DONTUSE[dtype], rtol=0) - @dtypes(torch.double) + @dtypes(torch.double, torch.cdouble) def test_Conv2d_backward_depthwise(self, device, dtype): x = torch.randn(2, 2, 4, 20, device=device, dtype=dtype, requires_grad=True) weight = torch.randn(2, 1, 3, 5, device=device, dtype=dtype, requires_grad=True) @@ -17937,37 +18702,42 @@ def expected_output(dim): self.assertEqual(output[0, 0, 0, 0], float("-inf")) self.assertEqual(indices[0, 0, 0, 0], 0) - @dtypesIfCUDA(*get_all_fp_dtypes()) + @dtypesIfCUDA(*floating_types_and(torch.half, torch.bfloat16)) @dtypes(torch.float) def test_MaxPool1d_indices(self, device, dtype): self._test_maxpool_indices(1, device=device, dtype=dtype) - @dtypesIfCUDA(*get_all_fp_dtypes()) + @dtypesIfCUDA(*floating_types_and(torch.half, torch.bfloat16)) @dtypes(torch.float) def test_MaxPool2d_indices(self, device, dtype): self._test_maxpool_indices(2, device=device, dtype=dtype) - @dtypesIfCUDA(*get_all_fp_dtypes()) + @skipIfMps + @dtypesIfCUDA(*floating_types_and(torch.half, torch.bfloat16)) @dtypes(torch.float) def test_MaxPool3d_indices(self, device, dtype): self._test_maxpool_indices(3, device=device, dtype=dtype) - @dtypesIfCUDA(*get_all_fp_dtypes()) + @skipIfMps + @dtypesIfCUDA(*floating_types_and(torch.half, torch.bfloat16)) @dtypes(torch.float) def test_AdaptiveMaxPool1d_indices(self, device, dtype): self._test_maxpool_indices(1, adaptive=True, device=device, dtype=dtype) - @dtypesIfCUDA(*get_all_fp_dtypes()) + @dtypesIfCUDA(*floating_types_and(torch.half, torch.bfloat16)) + @skipIfMps @dtypes(torch.float) def test_AdaptiveMaxPool2d_indices(self, device, dtype): self._test_maxpool_indices(2, adaptive=True, device=device, dtype=dtype) - @dtypesIfCUDA(*get_all_fp_dtypes()) + @dtypesIfCUDA(*floating_types_and(torch.half, torch.bfloat16)) + @skipIfMps @dtypes(torch.float) def test_AdaptiveMaxPool3d_indices(self, device, dtype): self._test_maxpool_indices(3, adaptive=True, device=device, dtype=dtype) - @dtypesIfCUDA(*get_all_fp_dtypes()) + @dtypesIfCUDA(*floating_types_and(torch.half, torch.bfloat16)) + @skipIfMps @dtypes(torch.float) def test_maxpool_indices_no_batch_dim(self, device, dtype): """Check that indices with no batch dim is consistent with a single batch.""" @@ -18132,7 +18902,8 @@ def test_pooling_zero_stride(self, device): self.assertRaisesRegex(RuntimeError, r"stride should not be zero|stride must be greater than zero", lambda: fn_module(x)) - @dtypesIfCUDA(*get_all_fp_dtypes()) + @dtypesIfCUDA(*floating_types_and(torch.half, torch.bfloat16)) + @skipIfMps @dtypes(torch.float) def 
test_pool_large_size(self, device, dtype): for op in ('max', 'avg'): @@ -18146,7 +18917,8 @@ def test_pool_large_size(self, device, dtype): # check if the output shape was still computed correctly self.assertEqual(x.shape[2], res.shape[2]) - @dtypesIfCUDA(*get_all_fp_dtypes()) + @dtypesIfCUDA(*floating_types_and(torch.half, torch.bfloat16)) + @skipIfMps @dtypes(torch.float) def test_pool_invalid_size(self, device, dtype): for op in ('max', 'avg'): @@ -18390,6 +19162,35 @@ def test_multi_margin_loss_errors(self, device): lambda: nn.functional.multi_margin_loss(torch.randn(5, device=device), torch.zeros(3, device=device))) + @onlyCPU + def test_activations_bfloat16_cpu(self, device): + def test_bfloat16(fn, device, inp_dims, prec): + # bfloat16 compute + input = torch.randn(inp_dims, dtype=torch.bfloat16, device=device, requires_grad=True) + out = fn(input) + grad_input = torch.randn_like(out, dtype=torch.bfloat16, device=device) + out.backward(grad_input) + + # fp32 compute + input2 = input.detach().clone().float().requires_grad_(True) + out2 = fn(input2) + grad_input2 = grad_input.detach().clone().float() + out2.backward(grad_input2) + + self.assertEqual(out.dtype, torch.bfloat16) + self.assertEqual(input.grad.dtype, torch.bfloat16) + self.assertEqual(out, out2, atol=prec, rtol=0, exact_dtype=False) + self.assertEqual(input.grad.data, input2.grad.data, atol=prec, rtol=0, exact_dtype=False) + + shapes = [[1, 3, 1, 6], [1, 3, 1, 128], [1, 3, 256, 256]] + for shape in shapes: + test_bfloat16(torch.nn.LogSigmoid(), device, shape, prec=2e-2) + test_bfloat16(torch.nn.Hardsigmoid(), device, shape, prec=1e-2) + test_bfloat16(torch.nn.Hardshrink(), device, shape, prec=1e-2) + test_bfloat16(torch.nn.Softshrink(), device, shape, prec=1e-2) + test_bfloat16(torch.nn.Hardswish(), device, shape, prec=2e-2) + test_bfloat16(torch.nn.Softplus(), device, shape, prec=1e-2) + def _test_bfloat16_ops(self, op, device, inp_dims=(), prec=1e-2, scale_factor=None): # fp32 compute input1 = torch.randn(inp_dims, dtype=torch.float32, device=device, requires_grad=True) @@ -18435,11 +19236,57 @@ def test_softmax_bfloat16(self, device): # test softmax with large input value which casues exp() to overflow self._test_bfloat16_ops(torch.nn.Softmax(dim=dim), device, inp_dims=(16, 33, 15, 16), prec=0.05, scale_factor=1000.0) + @onlyCPU + @dtypes(torch.float, torch.double) + def test_conv_thnn_nhwc(self, device, dtype): + def helper(n, c, h, w, out_channels, kernel_size, dilation, groups, weight_memory_format): + input = torch.randint(-3, 3, (n, c, h, w), dtype=dtype, device=device)\ + .to(memory_format=torch.channels_last) + input.requires_grad_() + conv = nn.Conv2d(c, out_channels, kernel_size, dilation=dilation, groups=groups)\ + .to(device='cpu', dtype=dtype, memory_format=weight_memory_format) + for p in conv.parameters(): + p.data = torch.randint_like(p, -3, 3) + + ref_input = input.detach().clone().contiguous().requires_grad_() + ref_conv = nn.Conv2d(c, out_channels, kernel_size, dilation=dilation, groups=groups) + # load_state_dict will restore the stride & memory_layout on ref_conv.weight. 
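The NHWC convolution tests above all follow the same recipe: run a channels_last convolution, rebuild a contiguous reference module from the same state_dict, and compare outputs and memory formats. A stripped-down sketch of that recipe (illustrative shapes only; the real tests also compare gradients and disable MKLDNN to exercise the thnn kernels):

import torch
import torch.nn as nn

conv = nn.Conv2d(8, 4, kernel_size=3).to(memory_format=torch.channels_last)
ref_conv = nn.Conv2d(8, 4, kernel_size=3)
ref_conv.load_state_dict(conv.state_dict())                  # same parameters
ref_conv = ref_conv.to(memory_format=torch.contiguous_format)

x = torch.randn(2, 8, 16, 16).to(memory_format=torch.channels_last)
out = conv(x)
ref_out = ref_conv(x.contiguous())

assert out.is_contiguous(memory_format=torch.channels_last)  # layout propagates
assert ref_out.is_contiguous()
torch.testing.assert_close(out, ref_out)                     # same values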
+ ref_conv.load_state_dict(conv.state_dict()) + ref_conv = ref_conv.to(device='cpu', dtype=dtype, memory_format=torch.contiguous_format) + + out = conv(input) + ref_out = ref_conv(ref_input) + + grad = torch.randint_like(out, -3, 3) + ref_grad = grad.detach().clone().contiguous() + + out.backward(grad) + ref_out.backward(ref_grad) + + self.assertTrue(out.is_contiguous(memory_format=torch.channels_last)) + self.assertTrue(ref_out.is_contiguous()) + self.assertEqual(out, ref_out, exact_dtype=False) + self.assertEqual(conv.weight.grad, ref_conv.weight.grad, exact_dtype=False) + self.assertEqual(conv.bias.grad, ref_conv.bias.grad, exact_dtype=False) + self.assertEqual(input.grad, ref_input.grad, exact_dtype=False) + + with torch.backends.mkldnn.flags(enabled=False): + for mf in [torch.contiguous_format, torch.channels_last]: + # non-dilated conv: thnn_conv2d normal path (with im2col) + helper(2, 8, 4, 4, out_channels=4, kernel_size=3, dilation=1, groups=1, weight_memory_format=mf) + helper(2, 8, 4, 4, out_channels=8, kernel_size=3, dilation=1, groups=8, weight_memory_format=mf) + # non-dilated conv: thnn_conv2d fast path (skip im2col) + helper(1, 16, 56, 56, out_channels=16, kernel_size=1, dilation=1, groups=1, weight_memory_format=mf) + helper(1, 16, 56, 56, out_channels=16, kernel_size=1, dilation=1, groups=16, weight_memory_format=mf) + # dilated conv: slow_conv_dilated2d + helper(2, 8, 11, 13, out_channels=16, kernel_size=3, dilation=2, groups=1, weight_memory_format=mf) + helper(2, 16, 11, 13, out_channels=32, kernel_size=3, dilation=2, groups=16, weight_memory_format=mf) + @onlyCUDA @skipCUDAIfRocmVersionLessThan((4, 3)) @skipCUDAIfNotMiopenSuggestNHWC @skipCUDAIfCudnnVersionLessThan(7603) - @dtypes(torch.half, torch.float) + @dtypes(torch.half, torch.float, torch.cfloat) def test_conv_cudnn_nhwc(self, device, dtype): def helper(n, c, h, w, out_channels, kernel_size, groups): input = torch.randint(-3, 3, (n, c, h, w), dtype=dtype, device=device)\ @@ -18886,6 +19733,20 @@ def test_cross_entropy_loss_prob_target_unit_weights(self, device): output_unit = m_unit(input, target) self.assertEqual(output, output_unit) + @parametrize_test('reduction', ['none', 'mean', 'sum']) + @parametrize_test('weighted', [False, True]) + def test_cross_entropy_loss_prob_target_no_batch_dim(self, device, reduction, weighted): + C = 5 + input = torch.randn(C, device=device).log_softmax(dim=-1) + target = torch.randn(C, device=device).softmax(dim=-1) + weight = torch.randn(C, device=device) if weighted else None + m = nn.CrossEntropyLoss(reduction=reduction, weight=weight) + loss_no_batch = m(input, target) + loss_batch = m(input.unsqueeze(0), target.unsqueeze(0)) + if reduction == 'none': + loss_batch = loss_batch.squeeze(0) + self.assertEqual(loss_no_batch, loss_batch) + def test_cross_entropy_loss_index_target_unit_weights(self, device): # Test with k-dimensional loss. 
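For the probability-target case added just above, "no batch dim" support simply means an unbatched (C,) input/target pair must agree with the same data carrying a leading batch dimension of 1. A minimal, self-contained check of that property (mean reduction, unweighted; illustrative, not part of the test suite):

import torch
import torch.nn as nn

C = 5
input = torch.randn(C).log_softmax(dim=-1)
target = torch.randn(C).softmax(dim=-1)
loss = nn.CrossEntropyLoss()
assert torch.allclose(loss(input, target),
                      loss(input.unsqueeze(0), target.unsqueeze(0)))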
for k in range(5): @@ -19235,7 +20096,7 @@ def __init__(self): self.assertEqual(p.grad.to(devices[0]), pe.grad) def test_elu_inplace_overlap(self, device): - x = torch.randn((1, 6), device=device).expand((6, 6)) + x = torch.randn((1, 6), dtype=torch.bfloat16, device=device).expand((6, 6)) with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): F.elu(x, inplace=True) with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): @@ -19322,6 +20183,32 @@ def test_leaky_relu_inplace_with_zero_slope(self, device): expected_bf16 = torch.tensor([0., 0., 1.], device=device, dtype=torch.bfloat16) self.assertEqual(a_bf16.grad, expected_bf16) + @onlyCPU + def test_softshrink(self, device): + x = torch.tensor([[1.21, 0.56, 0.5001, 0.4999, 1.2357, -0.4999, -0.5001, -1.154, + 0.254, -0.24, -0.225, 0.104, 0.002, -0.001, 0.0574, 1.2344, + 0.1748, -0.1797, -0.8125, 0.2051, -1.1328, 1.2344, -0.1562, 2.3554, + -0.1953, 0.0304, -0.3613, -1.3047, 1.0312, 0.1436, -0.6953, 0.5664, + -0.5820, -0.3301, 0.8203, 0.6133, 0.5938], + [-0.8203, -1.2344, -0.5234, 2.5312, -0.4551, -0.6875, -1.5547, -0.2217, + -0.3027, 2.6406, 1.3047, 0.2344, -1.6719, 0.2773, -1.3516, 3.4575, + 0.4414, 0.2656, 2.1094, -1.5156, 1.2344, -0.4336, 0.6797, -3.5486, + 0.9766, -0.4062, 1.4844, 0.7500, -1.7578, 0.7461, 1.6094, 8.5458, + 0.3730, -0.3477, -1.0625, 0.3848, 0.0557]], device=device) + expected = torch.tensor([[0.71, 0.06, 0.0001, 0., 0.7357, 0., -0.0001, -0.654, + 0., 0., 0., 0., 0., 0., 0., 0.7344, + 0., 0., -0.3125, 0., -0.6328, 0.7344, 0., 1.8554, + 0., 0., 0., -0.8047, 0.5312, 0., -0.1953, 0.0664, + -0.0820, 0.0, 0.3203, 0.1133, 0.0938], + [-0.3203, -0.7344, -0.0234, 2.0312, 0.0, -0.1875, -1.0547, 0., + 0.0, 2.1406, 0.8047, 0., -1.1719, 0., -0.8516, 2.9575, + 0., 0., 1.6094, -1.0156, 0.7344, 0., 0.1797, -3.0486, + 0.4766, 0., 0.9844, 0.2500, -1.2578, 0.2461, 1.1094, 8.0458, + 0., 0., -0.5625, 0., 0.]]) + softshrink = torch.nn.Softshrink() + out = softshrink(x) + self.assertEqual(out, expected, atol=1e-2, rtol=0) + def test_threshold_inplace_overlap(self, device): # Inplace threshold is okay, because it is idempotent x = torch.randn((1, 6), device=device).expand((6, 6)) @@ -19476,6 +20363,293 @@ def test_adaptive_pool_invalid(self, device): t, output_size = inp m(output_size)(t) + @dtypes(torch.float) + @dtypesIfCUDA(torch.double, torch.float, torch.half) + def test_transformerencoderlayer(self, device, dtype): + # this is a deterministic test for TransformerEncoderLayer + d_model = 4 + nhead = 2 + dim_feedforward = 16 + dropout = 0.0 + bsz = 2 + + atol = 1e-5 + rtol = 1e-7 + if "cuda" in device: + atol = 1e-3 + rtol = 1e-2 + + def _test(training, batch_first, atol, rtol): + def perm_fn(x): + return x.transpose(1, 0) if batch_first else x + + model = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, + batch_first=batch_first, device=device, dtype=dtype) + + if not training: + assert dropout == 0 + model = model.eval() + + # set constant weights of the model + for idx, p in enumerate(model.parameters()): + x = p.data + sz = x.view(-1).size(0) + shape = x.shape + x = torch.cos(torch.arange(0, sz).float().view(shape)) + p.data.copy_(x) + + # deterministic input + encoder_input = torch.tensor([[[20., 30., 40., 50.]]], device=device, dtype=dtype) + result = model(encoder_input) + ref_output = torch.tensor([[[2.258703, 0.127985, -0.697881, 0.170862]]], device=device, dtype=dtype) + self.assertEqual(result.shape, ref_output.shape) + torch.testing.assert_close(result, ref_output, atol=atol, rtol=rtol) + 
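The TransformerEncoderLayer tests that begin here rely on a small trick to make hard-coded reference outputs possible: every parameter of the layer is overwritten with cos(0), cos(1), ... reshaped to the parameter's own shape, so the module becomes fully deterministic. A sketch of just that step, outside the test harness:

import torch
import torch.nn as nn

layer = nn.TransformerEncoderLayer(d_model=4, nhead=2, dim_feedforward=16, dropout=0.0)
with torch.no_grad():
    for p in layer.parameters():
        # deterministic, shape-preserving fill: cos(0 .. numel-1)
        p.copy_(torch.cos(torch.arange(p.numel(), dtype=torch.float)).view_as(p))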
# 0 values are NOT masked. This shouldn't mask anything. + mask = torch.tensor([[0]], device=device) == 1 + # TODO: enable fast path for calls with a mask! + result = model(encoder_input, src_key_padding_mask=mask) + self.assertEqual(result.shape, ref_output.shape) + torch.testing.assert_close(result, ref_output, atol=atol, rtol=rtol) + # 1 values are masked. Since there is only 1 input embedding this + # will result in nan. + mask = torch.tensor([[1]], device=device) == 1 + result = model(encoder_input, src_key_padding_mask=mask) + result = result.cpu().detach().numpy() + self.assertTrue(np.isnan(result).all()) + + # deterministic input + encoder_input = perm_fn(torch.tensor([[[1., 2., 3., 4.]], + [[5., 6., 7., 8.]]], device=device, dtype=dtype)) + result = model(encoder_input) + ref_output = perm_fn(torch.tensor([[[2.272644, 0.119035, -0.691669, 0.153486]], + [[2.272644, 0.119035, -0.691669, 0.153486]]], device=device, dtype=dtype)) + self.assertEqual(result.shape, ref_output.shape) + torch.testing.assert_close(result, ref_output, atol=atol, rtol=rtol) + # all 0 which is no masking + mask = torch.tensor([[0, 0]], device=device) == 1 + result = model(encoder_input, src_key_padding_mask=mask) + self.assertEqual(result.shape, ref_output.shape) + torch.testing.assert_close(result, ref_output, atol=atol, rtol=rtol) + mask = torch.tensor([[1, 0]], device=device) == 1 + result = model(encoder_input, src_key_padding_mask=mask) + ref_output = perm_fn(torch.tensor([[[2.301516, 0.092249, -0.679101, 0.103088]], + [[2.301516, 0.092249, -0.679101, 0.103088]]], device=device, dtype=dtype)) + self.assertEqual(result.shape, ref_output.shape) + torch.testing.assert_close(result, ref_output, atol=atol, rtol=rtol) + + # deterministic input + encoder_input = perm_fn(torch.tensor([[[0.7462, 0.6653, 0.5679, 0.4891], + [0.5387, 0.1655, 0.3565, 0.0471]], + [[0.8335, 0.2799, 0.5031, 0.2947], + [0.1402, 0.0318, 0.7636, 0.1346]], + [[0.6333, 0.9344, 0.1376, 0.9938], + [0.8924, 0.2872, 0.6692, 0.2944]], + [[0.9897, 0.6915, 0.3154, 0.1733], + [0.8645, 0.3513, 0.3064, 0.0767]], + [[0.8117, 0.2366, 0.4838, 0.7881], + [0.3718, 0.4945, 0.9511, 0.0864]]], device=device, dtype=dtype)) + result = model(encoder_input) + ref_output = perm_fn(torch.tensor([[[2.428589, 0.020835, -0.602055, -0.085249], + [2.427987, 0.021213, -0.602496, -0.084103]], + [[2.424689, 0.019155, -0.604793, -0.085672], + [2.413863, 0.022211, -0.612486, -0.072490]], + [[2.433774, 0.021598, -0.598343, -0.087548], + [2.425104, 0.019748, -0.604515, -0.084839]], + [[2.436185, 0.022682, -0.596625, -0.087261], + [2.433556, 0.021891, -0.598509, -0.086832]], + [[2.416246, 0.017512, -0.610712, -0.082961], + [2.422901, 0.024187, -0.606178, -0.074929]]], device=device, dtype=dtype)) + self.assertEqual(result.shape, ref_output.shape) + torch.testing.assert_close(result, ref_output, atol=atol, rtol=rtol) + + # all 0 + mask = torch.zeros([2, 5], device=device) == 1 + result = model(encoder_input, src_key_padding_mask=mask) + self.assertEqual(result.shape, ref_output.shape) + torch.testing.assert_close(result, ref_output, atol=atol, rtol=rtol) + mask[0, 1] = 1 + mask[1, 3] = 1 + mask[1, 4] = 1 + result = model(encoder_input, src_key_padding_mask=mask) + ref_output = perm_fn(torch.tensor([[[2.429026, 0.020793, -0.601741, -0.085642], + [2.428811, 0.021445, -0.601912, -0.084252]], + [[2.425009, 0.019155, -0.604566, -0.085899], + [2.415408, 0.02249 , -0.611415, -0.073]], + [[2.434199, 0.021682, -0.598039, -0.087699], + [2.42598, 0.019941, -0.603896, -0.085091]], + 
[[2.436457, 0.022736, -0.59643 , -0.08736], + [2.434021, 0.022093, -0.598179, -0.08679]], + [[2.416531, 0.017498, -0.610513, -0.083181], + [2.4242, 0.024653, -0.605266, -0.074959]]], device=device, dtype=dtype)) + self.assertEqual(result.shape, ref_output.shape) + torch.testing.assert_close(result, ref_output, atol=atol, rtol=rtol) + + # NestedTensor is only supported for the fast path + # currently, which won't be used if training. + if (batch_first and not training and + ('cuda' in str(device) or 'cpu' in str(device)) and not TEST_WITH_CROSSREF): + encoder_input[0][-1] = torch.zeros_like(encoder_input[0][1]) + mask = torch.zeros(encoder_input.shape[:-1], device=device, dtype=torch.bool) + mask[0][-1] = True + + nt = torch.nested_tensor([encoder_input[0][:-1], encoder_input[1]], device=device) + result = model(nt) + ref_output = torch.tensor( + [ + [ + [2.4268184, 0.02042419, -0.603311, -0.08476824], + [2.423306, 0.01889652, -0.6057701, -0.08519465], + [2.431538, 0.02078694, -0.5999354, -0.08746159], + [2.4348664, 0.02212971, -0.5975677, -0.08733892], + [2.423133, 0.02097577, -0.60594773, -0.08113337], + ], + [ + [2.4279876, 0.02121329, -0.60249615, -0.08410317], + [2.4138637, 0.02221113, -0.6124869, -0.07249016], + [2.4251041, 0.01974815, -0.6045152, -0.08483928], + [2.4335563, 0.0218913, -0.59850943, -0.08683228], + [2.4229012, 0.02418739, -0.6061784, -0.07492948], + ], + ], + device=device, dtype=dtype + ) + result = result.to_padded_tensor(0) + ref_output[0][-1] = torch.zeros_like( + ref_output[0][-1], device=device, dtype=dtype + ) + result[0][-1] = torch.zeros_like( + result[0][-1], device=device, dtype=dtype + ) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + if 'cuda' in device: + if dtype == torch.float: + atol = 2e-4 + rtol = 4e-3 + else: + atol = 7e-4 + rtol = 2e-2 + torch.testing.assert_close(result, ref_output, atol=atol, rtol=rtol) + else: + torch.testing.assert_close(result, ref_output) + + + for batch_first in (True, False): + for training in (True, False): + if training: + cm = contextlib.nullcontext() + else: + # Fast path requires inference mode. 
+ cm = torch.no_grad() + with cm: + _test(batch_first=batch_first, training=training, atol=atol, rtol=rtol) + + @dtypes(torch.double) + @torch.no_grad() + def test_multihead_attn_fast_path_query_and_bias_have_different_dtypes(self, device, dtype): + mha = torch.nn.MultiheadAttention(3, 3, batch_first=True, dtype=dtype, device=device).eval() + mha.in_proj_bias = torch.nn.Parameter(mha.in_proj_bias.to(torch.half).to(device)) + query = torch.randn(3, 3, 3, dtype=dtype, device=device) + mha(query, query, query) + + @dtypes(torch.double) + @torch.no_grad() + def test_multihead_attn_in_proj_bias_none(self, device, dtype): + mha = torch.nn.MultiheadAttention(1, 1, bias=False, dtype=dtype, device=device) + query = torch.rand(3, 2, 1, dtype=dtype, device=device) + mha(query, query, query) + + @dtypes(torch.double) + @torch.no_grad() + def test_multihead_attn_in_proj_weight_none(self, device, dtype): + # Setting kdim == vdim == 2 means that vdim != embed_dim + # will cause the logic to use per-input project weights, thereby + # forcing self.in_proj_weight = None + mha = torch.nn.MultiheadAttention(4, 4, vdim=2, kdim=2, dtype=dtype, device=device) + query = torch.rand(4, 4, 4, dtype=dtype, device=device) + key = torch.rand(4, 4, 2, dtype=dtype, device=device) + mha(query, key, key) + + @dtypes(torch.float) + @dtypesIfCUDA(torch.half, torch.float) + def test_transformerencoderlayer_gelu(self, device, dtype): + # this is a deterministic test for TransformerEncoderLayer with gelu activation + d_model = 4 + nhead = 2 + dim_feedforward = 16 + dropout = 0.0 + bsz = 2 + + atol = 0 + rtol = 1e-5 + if "cuda" in device: + atol = 1e-3 + rtol = 1e-2 + + def _test(activation, batch_first, training): + def perm_fn(x): + return x.transpose(1, 0) if batch_first else x + + model = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, + activation, batch_first=batch_first, device=device, dtype=dtype) + if not training: + assert dropout == 0 + model = model.eval() + + # set constant weights of the model + for idx, p in enumerate(model.parameters()): + x = p.data + sz = x.view(-1).size(0) + shape = x.shape + x = torch.cos(torch.arange(0, sz).float().view(shape)) + p.data.copy_(x) + + # deterministic input + encoder_input = torch.tensor([[[20., 30., 40., 50.]]], device=device, dtype=dtype) + result = model(encoder_input) + ref_output = torch.tensor([[[2.249815, 0.131006, -0.702199, 0.177868]]], device=device, dtype=dtype) + torch.testing.assert_close(result, ref_output, rtol=rtol, atol=atol) + + # deterministic input + encoder_input = perm_fn(torch.tensor([[[1., 2., 3., 4.]], + [[5., 6., 7., 8.]]], device=device, dtype=dtype)) + result = model(encoder_input) + ref_output = perm_fn(torch.tensor([[[2.264103, 0.121417, -0.696012, 0.159724]], + [[2.264103, 0.121417, -0.696012, 0.159724]]], device=device, dtype=dtype)) + torch.testing.assert_close(result, ref_output, rtol=rtol, atol=atol) + + # deterministic input + encoder_input = perm_fn(torch.tensor([[[0.7462, 0.6653, 0.5679, 0.4891], + [0.5387, 0.1655, 0.3565, 0.0471]], + [[0.8335, 0.2799, 0.5031, 0.2947], + [0.1402, 0.0318, 0.7636, 0.1346]], + [[0.6333, 0.9344, 0.1376, 0.9938], + [0.8924, 0.2872, 0.6692, 0.2944]], + [[0.9897, 0.6915, 0.3154, 0.1733], + [0.8645, 0.3513, 0.3064, 0.0767]], + [[0.8117, 0.2366, 0.4838, 0.7881], + [0.3718, 0.4945, 0.9511, 0.0864]]], device=device, dtype=dtype)) + result = model(encoder_input) + ref_output = perm_fn(torch.tensor([[[2.42163188, 0.03227153, -0.60714219, -0.05908082], + [2.42151276, 0.03302179, -0.60722523, 
-0.05762651]], + [[2.41926761, 0.02974034, -0.60879519, -0.0621269], + [2.41626395, 0.03539356, -0.61087842, -0.04978623]], + [[2.42382808, 0.03218872, -0.6055963, -0.06073591], + [2.41983477, 0.03085259, -0.60840145, -0.06046414]], + [[2.42500749, 0.03328855, -0.60476388, -0.0595334], + [2.4237977, 0.03290575, -0.60561789, -0.05940082]], + [[2.41383916, 0.02686345, -0.61256377, -0.06380707], + [2.42000277, 0.03800944, -0.60824798, -0.04754947]]], device=device, dtype=dtype)) + torch.testing.assert_close(result, ref_output, rtol=rtol, atol=atol) + for activation, batch_first, training in product(('gelu', F.gelu, nn.GELU()), (True, False), (True, False)): + # Fast path requires inference mode. + if training: + cm = contextlib.nullcontext() + else: + cm = torch.no_grad() + with cm: + _test(activation=activation, batch_first=batch_first, training=training) + + class TestModuleGlobalHooks(TestCase): def tearDown(self): @@ -20443,18 +21617,120 @@ def my_pre_load_hook_with_module( nonlocal hook_called hook_called += 1 - m = MyModule() - state_dict = m.state_dict() + # Test that hooks registered on a submodule are also called + # appropriately, i.e. with the submodule as module argument in + # my_pre_load_hook_with_module. + class MyModuleContainer(nn.Module): + def __init__(self, mod): + super().__init__() + self.mod = mod - hook_called = 0 - m._register_load_state_dict_pre_hook(m.my_pre_load_hook) - m.load_state_dict(state_dict) - self.assertEqual(1, hook_called) + for ctor in [MyModuleContainer, lambda x: x]: + m = ctor(MyModule()) + state_dict = m.state_dict() + if isinstance(m, MyModuleContainer): + mod = m.mod + else: + mod = m + + hook_called = 0 + mod._register_load_state_dict_pre_hook( + mod.my_pre_load_hook + ) + m.load_state_dict(state_dict) + self.assertEqual(1, hook_called) + hook_called = 0 + mod._register_load_state_dict_pre_hook( + mod.my_pre_load_hook_with_module, True + ) + m.load_state_dict(state_dict) + self.assertEqual(2, hook_called) + + def test_load_state_dict_post_hook(self): hook_called = 0 - m._register_load_state_dict_pre_hook(m.my_pre_load_hook_with_module, True) - m.load_state_dict(state_dict) - self.assertEqual(2, hook_called) + + class MyModule(nn.Module): + def __init__(self): + super(MyModule, self).__init__() + self.foo = torch.nn.Parameter(torch.rand(10)) + + def my_post_load_hook(self, module, incompatible_keys): + assert module is self + nonlocal hook_called + incompatible_keys.missing_keys.append("foo") + incompatible_keys.unexpected_keys.append("bar") + hook_called += 1 + + nested = MyModule() + wrapped = nn.ModuleList([nested]) + handle = nested.register_load_state_dict_post_hook( + nested.my_post_load_hook, + ) + # Hook must be called even if it is wrapped + ret = wrapped.load_state_dict(wrapped.state_dict(), strict=False) + self.assertEqual(hook_called, 1) + # Ensure that the hook modified missing_keys and unexpected_keys + missing = ret.missing_keys + unexpected = ret.unexpected_keys + self.assertEqual(missing, ["foo"]) + self.assertEqual(unexpected, ["bar"]) + # When called with strict=True, the error raised should mention the + # missing and unexpected keys the hook added. + with self.assertRaisesRegex(RuntimeError, "foo.*\n.*bar"): + wrapped.load_state_dict(wrapped.state_dict(), strict=True) + self.assertEqual(hook_called, 2) + # Removing the hook via handle.remove() should cause it not to + # fire anymore. 
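The post-hook API exercised in this test can be summarized in a few lines: the hook receives (module, incompatible_keys) and may edit the missing/unexpected key lists that load_state_dict is about to return. A minimal sketch with illustrative names, separate from the test itself:

import torch
import torch.nn as nn

lin = nn.Linear(2, 2)

def clear_incompatible(module, incompatible_keys):
    incompatible_keys.missing_keys.clear()
    incompatible_keys.unexpected_keys.clear()

handle = lin.register_load_state_dict_post_hook(clear_incompatible)
sd = lin.state_dict()
sd["extra"] = torch.ones(1)               # would normally be reported as unexpected
ret = lin.load_state_dict(sd, strict=False)
assert ret.unexpected_keys == []          # cleared by the hook
handle.remove()                           # hook no longer fires after removal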
+ handle.remove() + # Hook did not run so it should not have added any keys + ret = wrapped.load_state_dict(wrapped.state_dict(), strict=False) + self.assertEqual(ret.missing_keys, []) + self.assertEqual(ret.unexpected_keys, []) + # hook_called should not have been incremented + self.assertEqual(hook_called, 2) + + def load_hook_clear_incompatible(module, incompatible_keys): + incompatible_keys.missing_keys.clear() + incompatible_keys.unexpected_keys.clear() + + nested.register_load_state_dict_post_hook(load_hook_clear_incompatible) + state_dict = wrapped.state_dict() + state_dict["extra"] = torch.ones(1) + # load state_dict with strict=True should not throw. + ret = wrapped.load_state_dict(state_dict, strict=True) + # explicitly ensure that the post hook clearned out incompatible_keys + self.assertEqual([], ret.missing_keys) + self.assertEqual([], ret.unexpected_keys) + + @unittest.skipIf(IS_WINDOWS, "Tempfile permission issue on windows") + def test_load_state_dict_post_hook_backward_compatibility(self): + def my_post_load_hook(mod, _): + nonlocal called + called = True + + for m in [nn.Softmin(10), nn.Softmax(10), nn.LogSoftmax(10)]: + called = False + sd = deepcopy(m.state_dict()) + self.assertTrue(hasattr(m, '_load_state_dict_post_hooks')) + # Simulate an older model that did not have this attr + delattr(m, '_load_state_dict_post_hooks') + # Save and load, and ensure that load_state_dict works (without proper + # BC we would run into errors because this attribute would be expected). + # In particular, Softmax runs into the issue described here: + # https://github.com/pytorch/pytorch/issues/77280 + with NamedTemporaryFile() as f: + # Note that torch.save / torch.load is not recommended to save/load + # modules. + torch.save(m, f.name) + m = torch.load(f.name) + m.load_state_dict(sd) + self.assertFalse(called) + + # Ensure hooks can be registered and called. + m.register_load_state_dict_post_hook(my_post_load_hook) + m.load_state_dict(sd) + self.assertTrue(called) instantiate_device_type_tests(TestNNDeviceType, globals()) diff --git a/test/test_numpy_interop.py b/test/test_numpy_interop.py index 656abdc57bda..96c1016c2dbb 100644 --- a/test/test_numpy_interop.py +++ b/test/test_numpy_interop.py @@ -8,8 +8,8 @@ from torch.testing._internal.common_utils import \ (TestCase, run_tests) from torch.testing._internal.common_device_type import \ - (instantiate_device_type_tests, onlyCPU, dtypes) -from torch.testing._internal.common_dtype import get_all_dtypes + (instantiate_device_type_tests, onlyCPU, dtypes, skipMeta) +from torch.testing._internal.common_dtype import all_types_and_complex_and # For testing handling NumPy objects and sending tensors to / accepting # arrays from NumPy. 
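Before the NumPy interop changes below, a quick reminder of the semantics those tests rely on: torch.from_numpy shares memory with the source array, while torch.tensor always copies (and, as tested below, warns when handed a list of ndarrays). A small illustration:

import numpy as np
import torch

a = np.arange(4.0)
shared = torch.from_numpy(a)      # view of the same buffer
copied = torch.tensor(a)          # independent copy
a[0] = 10.0
assert shared[0].item() == 10.0
assert copied[0].item() == 0.0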
@@ -228,11 +228,34 @@ def test_from_numpy(self, device) -> None: x.strides = (3,) self.assertRaises(ValueError, lambda: torch.from_numpy(x)) + @skipMeta def test_from_list_of_ndarray_warning(self, device): warning_msg = r"Creating a tensor from a list of numpy.ndarrays is extremely slow" with self.assertWarnsOnceRegex(UserWarning, warning_msg): torch.tensor([np.array([0]), np.array([1])], device=device) + def test_ctor_with_invalid_numpy_array_sequence(self, device): + # Invalid list of numpy array + with self.assertRaisesRegex(ValueError, "expected sequence of length"): + torch.tensor([np.random.random(size=(3, 3)), np.random.random(size=(3, 0))], device=device) + + # Invalid list of list of numpy array + with self.assertRaisesRegex(ValueError, "expected sequence of length"): + torch.tensor([[np.random.random(size=(3, 3)), np.random.random(size=(3, 2))]], device=device) + + with self.assertRaisesRegex(ValueError, "expected sequence of length"): + torch.tensor([[np.random.random(size=(3, 3)), np.random.random(size=(3, 3))], + [np.random.random(size=(3, 3)), np.random.random(size=(3, 2))]], device=device) + + # expected shape is `[1, 2, 3]`, hence we try to iterate over 0-D array + # leading to type error : not a sequence. + with self.assertRaisesRegex(TypeError, "not a sequence"): + torch.tensor([[np.random.random(size=(3)), np.random.random()]], device=device) + + # list of list or numpy array. + with self.assertRaisesRegex(ValueError, "expected sequence of length"): + torch.tensor([[1, 2, 3], np.random.random(size=(2,)), ], device=device) + @onlyCPU def test_ctor_with_numpy_scalar_ctor(self, device) -> None: dtypes = [ @@ -396,7 +419,7 @@ def test_has_storage_numpy(self, device): self.assertIsNotNone(torch.tensor(arr, device=device, dtype=torch.long).storage()) self.assertIsNotNone(torch.tensor(arr, device=device, dtype=torch.uint8).storage()) - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) def test_numpy_scalar_cmp(self, device, dtype): if dtype.is_complex: tensors = (torch.tensor(complex(1, 3), dtype=dtype, device=device), diff --git a/test/test_ops.py b/test/test_ops.py index 4d41e60b4aaf..2d737f3d6d39 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -1,45 +1,80 @@ -# Owner(s): ["high priority"] +# Owner(s): ["module: unknown"] from collections.abc import Sequence -from functools import partial, wraps +from functools import partial import warnings import unittest import itertools - import torch -from torch.testing import FileCheck, make_tensor -from torch.testing._internal.common_dtype import floating_and_complex_types_and, get_all_dtypes -from torch.testing._internal.common_utils import \ - (TestCase, is_iterable_of_tensors, run_tests, IS_SANDCASTLE, clone_input_helper, - gradcheck, gradgradcheck, IS_IN_CI, suppress_warnings, noncontiguous_like, - TEST_WITH_ASAN, IS_WINDOWS, IS_FBCODE, first_sample) -from torch.testing._internal.common_methods_invocations import \ - (op_db, _NOTHING, UnaryUfuncInfo, ReductionOpInfo, SpectralFuncInfo) -from torch.testing._internal.common_device_type import \ - (deviceCountAtLeast, instantiate_device_type_tests, ops, onlyCPU, - onlyCUDA, onlyNativeDeviceTypes, skipCUDAIfRocm, OpDTypes, skipMeta) -from torch.testing._internal.common_jit import JitCommonTestCase, check_against_reference -from torch.testing._internal.jit_metaprogramming_utils import create_script_fn, create_traced_fn, \ - check_alias_annotation -from torch.testing._internal.jit_utils import 
disable_autodiff_subgraph_inlining, is_lambda +from torch.testing import make_tensor +from torch.testing._internal.common_dtype import ( + floating_and_complex_types_and, + all_types_and_complex_and, +) +from torch.testing._internal.common_utils import ( + TestCase, + is_iterable_of_tensors, + run_tests, + IS_SANDCASTLE, + clone_input_helper, + IS_IN_CI, + suppress_warnings, + noncontiguous_like, + TEST_WITH_ASAN, + IS_WINDOWS, + IS_FBCODE, + first_sample, +) +from torch.testing._internal.common_methods_invocations import ( + op_db, + _NOTHING, + UnaryUfuncInfo, + ReductionOpInfo, + SpectralFuncInfo, + ops_and_refs, + python_ref_db, + BinaryUfuncInfo, +) +from torch.testing._internal.common_device_type import ( + deviceCountAtLeast, + instantiate_device_type_tests, + ops, + onlyCUDA, + onlyNativeDeviceTypes, + OpDTypes, + skipMeta, +) +import torch._prims as prims + import torch.testing._internal.opinfo_helper as opinfo_helper -from torch.testing._internal.composite_compliance import _check_composite_compliance +from torch.testing._internal import composite_compliance # TODO: fixme https://github.com/pytorch/pytorch/issues/68972 torch.set_default_dtype(torch.float32) # variant testing is only done with torch.float and torch.cfloat to avoid # excessive test times and maximize signal to noise ratio -_variant_ops = partial(ops, dtypes=OpDTypes.supported, - allowed_dtypes=(torch.float, torch.cfloat)) +_variant_ops = partial( + ops, dtypes=OpDTypes.supported, allowed_dtypes=(torch.float, torch.cfloat) +) # Get names of all the operators which have ref in their entry in OpInfo (testing infra) -# except for Unary Ufuncs (separately implemented in test/test_unary_ufuncs.py) +# except for elementwise unary operators (separately implemented in test/test_unary_ufuncs.py), +# elementwise binary operators (separately implemented in test_binary_ufuncs.py), +# reduction operations (separately impelemented in test_reductions.py), # and Spectral Functions (separately implemented for only 1D as of now, in test/test_spectral_ops.py) -_ref_test_ops = list(filter(lambda op: not isinstance(op, (UnaryUfuncInfo, ReductionOpInfo, - SpectralFuncInfo)) and op.ref is not None and op.ref is not _NOTHING, op_db)) - +_ref_test_ops = tuple( + filter( + lambda op: not isinstance( + op, (UnaryUfuncInfo, ReductionOpInfo, SpectralFuncInfo, BinaryUfuncInfo) + ) + and op.ref is not None + and op.ref is not _NOTHING, + op_db, + ) +) +_ops_and_refs = op_db + python_ref_db # Tests that apply to all operators and aren't related to any particular # system @@ -52,8 +87,10 @@ def tearDownClass(cls): super().tearDownClass() if IS_IN_CI: - err_msg = ("The operator(s) below is(are) using dynamic_dtypes in the OpInfo entries." - "This is OK for testing, but be sure to set the dtypes manually before landing your PR!") + err_msg = ( + "The operator(s) below is(are) using dynamic_dtypes in the OpInfo entries." + "This is OK for testing, but be sure to set the dtypes manually before landing your PR!" 
+ ) # Assure no opinfo entry has dynamic_dtypes filtered_ops = list(filter(opinfo_helper.is_dynamic_dtype_set, op_db)) for op in filtered_ops: @@ -64,65 +101,90 @@ def tearDownClass(cls): # Validates that each OpInfo specifies its forward and backward dtypes # correctly for CPU and CUDA devices + @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") @skipMeta - @skipCUDAIfRocm @onlyNativeDeviceTypes - @ops(op_db, dtypes=OpDTypes.none) + @ops(ops_and_refs, dtypes=OpDTypes.none) def test_dtypes(self, device, op): + # Check complex32 support only if the op claims. + # TODO: Once the complex32 support is better, we should add check for complex32 unconditionally. + device_type = torch.device(device).type + include_complex32 = ( + (torch.complex32,) + if op.supports_dtype(torch.complex32, device_type) + else () + ) + # dtypes to try to backward in - allowed_backward_dtypes = floating_and_complex_types_and(torch.bfloat16, torch.float16) + allowed_backward_dtypes = floating_and_complex_types_and( + *((torch.half, torch.bfloat16) + include_complex32) + ) # lists for (un)supported dtypes - supported_dtypes = [] - unsupported_dtypes = [] - supported_backward_dtypes = [] - unsupported_backward_dtypes = [] + supported_dtypes = set() + unsupported_dtypes = set() + supported_backward_dtypes = set() + unsupported_backward_dtypes = set() def unsupported(dtype): - unsupported_dtypes.append(dtype) + unsupported_dtypes.add(dtype) if dtype in allowed_backward_dtypes: - unsupported_backward_dtypes.append(dtype) + unsupported_backward_dtypes.add(dtype) - for dtype in get_all_dtypes(): + for dtype in all_types_and_complex_and( + *((torch.half, torch.bfloat16, torch.bool) + include_complex32) + ): # tries to acquire samples - failure indicates lack of support - requires_grad = (dtype in allowed_backward_dtypes and op.supports_autograd) + requires_grad = dtype in allowed_backward_dtypes try: - samples = list(op.sample_inputs(device, dtype, requires_grad=requires_grad)) + samples = tuple( + op.sample_inputs(device, dtype, requires_grad=requires_grad) + ) except Exception as e: unsupported(dtype) continue - # Counts number of successful backward attempts - # NOTE: This exists as a kludge because this only understands how to - # request a gradient if the output is a tensor or a sequence with - # a tensor as its first element. - num_backward_successes = 0 for sample in samples: # tries to call operator with the sample - failure indicates # lack of support try: result = op(sample.input, *sample.args, **sample.kwargs) + supported_dtypes.add(dtype) except Exception as e: # NOTE: some ops will fail in forward if their inputs # require grad but they don't support computing the gradient # in that type! This is a bug in the op! 
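The dtype bookkeeping that this test builds up over the next hunks reduces to a few set operations: a dtype that works for some samples but fails for others is only "partially supported" (a warning), while any disagreement between the observed sets and the dtypes the OpInfo claims is an error. A toy illustration with made-up sets:

import torch

worked = {torch.float32, torch.float64}
failed = {torch.float16, torch.float64}
claimed = {torch.float32, torch.float16}

supported = worked - failed                      # worked on every sample
partially_supported = worked & failed            # warning only
unsupported = failed - worked                    # never worked

supported_but_unclaimed = supported - claimed    # should be added to the OpInfo
claimed_but_unsupported = claimed & unsupported  # should be removed from the OpInfo

assert partially_supported == {torch.float64}
assert claimed_but_unsupported == {torch.float16}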
unsupported(dtype) + continue - # Short-circuits testing this dtype -- it doesn't work - if dtype in unsupported_dtypes: - break - - # Short-circuits if the dtype isn't a backward dtype or - # it's already identified as not supported - if dtype not in allowed_backward_dtypes or dtype in unsupported_backward_dtypes: + # Checks for backward support in the same dtype, if the input has + # one or more tensors requiring grad + def _tensor_requires_grad(x): + if isinstance(x, dict): + for k, v in x.items(): + if _tensor_requires_grad(v): + return True + if isinstance(x, (list, tuple)): + for a in x: + if _tensor_requires_grad(a): + return True + if isinstance(x, torch.Tensor) and x.requires_grad: + return True + + return False + + requires_grad = _tensor_requires_grad(sample.input) \ + or _tensor_requires_grad(sample.args) or _tensor_requires_grad(sample.kwargs) + if not requires_grad: continue - # Checks for backward support in the same dtype try: result = sample.output_process_fn_grad(result) if isinstance(result, torch.Tensor): backward_tensor = result - elif isinstance(result, Sequence) and isinstance(result[0], torch.Tensor): + elif isinstance(result, Sequence) and isinstance( + result[0], torch.Tensor + ): backward_tensor = result[0] else: continue @@ -135,55 +197,105 @@ def unsupported(dtype): # supporting grad in the input dtype. grad = torch.randn_like(backward_tensor) backward_tensor.backward(grad) - num_backward_successes += 1 + supported_backward_dtypes.add(dtype) except Exception as e: - unsupported_backward_dtypes.append(dtype) - - if dtype not in unsupported_dtypes: - supported_dtypes.append(dtype) - if num_backward_successes > 0 and dtype not in unsupported_backward_dtypes: - supported_backward_dtypes.append(dtype) + unsupported_backward_dtypes.add(dtype) # Checks that dtypes are listed correctly and generates an informative # error message - device_type = torch.device(device).type - claimed_supported = set(op.supported_dtypes(device_type)) - supported_dtypes = set(supported_dtypes) - - supported_but_unclaimed = supported_dtypes - claimed_supported - claimed_but_unsupported = claimed_supported - supported_dtypes - msg = """The supported dtypes for {0} on {1} according to its OpInfo are - {2}, but the detected supported dtypes are {3}. - """.format(op.name, device_type, claimed_supported, supported_dtypes) - - if len(supported_but_unclaimed) > 0: - msg += "The following dtypes should be added to the OpInfo: {0}. 
".format(supported_but_unclaimed) - if len(claimed_but_unsupported) > 0: - msg += "The following dtypes should be removed from the OpInfo: {0}.".format(claimed_but_unsupported) - - self.assertEqual(supported_dtypes, claimed_supported, msg=msg) - # Checks that backward dtypes are listed correctly and generates an - # informative error message - # NOTE: this code is nearly identical to the check + msg generation - claimed_backward_supported = set(op.supported_backward_dtypes(device_type)) - supported_backward_dtypes = set(supported_backward_dtypes) + supported_forward = supported_dtypes - unsupported_dtypes + partially_supported_forward = supported_dtypes & unsupported_dtypes + unsupported_forward = unsupported_dtypes - supported_dtypes + supported_backward = supported_backward_dtypes - unsupported_backward_dtypes + partially_supported_backward = ( + supported_backward_dtypes & unsupported_backward_dtypes + ) + unsupported_backward = unsupported_backward_dtypes - supported_backward_dtypes - supported_but_unclaimed = supported_backward_dtypes - claimed_backward_supported - claimed_but_unsupported = claimed_backward_supported - supported_backward_dtypes - msg = """The supported backward dtypes for {0} on {1} according to its OpInfo are - {2}, but the detected supported backward dtypes are {3}. - """.format(op.name, device_type, claimed_backward_supported, supported_backward_dtypes) + device_type = torch.device(device).type - if len(supported_but_unclaimed) > 0: - msg += "The following backward dtypes should be added to the OpInfo: {0}. ".format(supported_but_unclaimed) - if len(claimed_but_unsupported) > 0: - msg += "The following backward dtypes should be removed from the OpInfo: {0}.".format(claimed_but_unsupported) + claimed_forward = set(op.supported_dtypes(device_type)) + supported_but_unclaimed_forward = supported_forward - claimed_forward + claimed_but_unsupported_forward = claimed_forward & unsupported_forward + + claimed_backward = set(op.supported_backward_dtypes(device_type)) + supported_but_unclaimed_backward = supported_backward - claimed_backward + claimed_but_unsupported_backward = claimed_backward & unsupported_backward + + # Partially supporting a dtype is not an error, but we print a warning + if (len(partially_supported_forward) + len(partially_supported_backward)) > 0: + msg = "Some dtypes for {0} on device type {1} are only partially supported!\n".format( + op.name, device_type + ) + if len(partially_supported_forward) > 0: + msg = ( + msg + + "The following dtypes only worked on some samples during forward: {0}.\n".format( + partially_supported_forward + ) + ) + if len(partially_supported_backward) > 0: + msg = ( + msg + + "The following dtypes only worked on some samples during backward: {0}.\n".format( + partially_supported_backward + ) + ) + print(msg) + + if ( + len(supported_but_unclaimed_forward) + + len(claimed_but_unsupported_forward) + + len(supported_but_unclaimed_backward) + + len(claimed_but_unsupported_backward) + ) == 0: + return - self.assertEqual(supported_backward_dtypes, claimed_backward_supported, msg=msg) + # Reference operators often support additional dtypes, and that's OK + if op in python_ref_db: + if ( + len(claimed_but_unsupported_forward) + + len(claimed_but_unsupported_backward) + ) == 0: + return + + # Generates error msg + msg = "The supported dtypes for {0} on device type {1} are incorrect!\n".format( + op.name, device_type + ) + if len(supported_but_unclaimed_forward) > 0: + msg = ( + msg + + "The following dtypes worked in forward but are 
not listed by the OpInfo: {0}.\n".format( + supported_but_unclaimed_forward + ) + ) + if len(supported_but_unclaimed_backward) > 0: + msg = ( + msg + + "The following dtypes worked in backward but are not listed by the OpInfo: {0}.\n".format( + supported_but_unclaimed_backward + ) + ) + if len(claimed_but_unsupported_forward) > 0: + msg = ( + msg + + "The following dtypes did not work in forward but are listed by the OpInfo: {0}.\n".format( + claimed_but_unsupported_forward + ) + ) + if len(claimed_but_unsupported_backward) > 0: + msg = ( + msg + + "The following dtypes did not work in backward but are listed by the OpInfo: {0}.\n".format( + claimed_but_unsupported_backward + ) + ) + + self.fail(msg) # Validates that each OpInfo works correctly on different CUDA devices - @skipCUDAIfRocm @onlyCUDA @deviceCountAtLeast(2) @ops(op_db, allowed_dtypes=(torch.float32, torch.long)) @@ -200,13 +312,16 @@ def test_multiple_devices(self, devices, dtype, op): elif is_iterable_of_tensors(result): self.assertTrue(all(map(lambda t: t.device == cuda_device, result))) else: - self.skipTest("Skipped! Only supports single tensor or iterable of tensor outputs.") + self.skipTest( + "Skipped! Only supports single tensor or iterable of tensor outputs." + ) # Tests that the function and its (ndarray-accepting) reference produce the same # values on the tensors from sample_inputs func for the corresponding op. # This test runs in double and complex double precision because # NumPy does computation internally using double precision for many functions # resulting in possible equality check failures. + @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") @onlyNativeDeviceTypes @suppress_warnings @ops(_ref_test_ops, allowed_dtypes=(torch.float64, torch.long, torch.complex128)) @@ -215,12 +330,73 @@ def test_reference_testing(self, device, dtype, op): # Sets the default dtype to NumPy's default dtype of double cur_default = torch.get_default_dtype() torch.set_default_dtype(torch.double) - sample_inputs = op.sample_inputs(device, dtype) - for sample_input in sample_inputs: - self.compare_with_reference(op, op.ref, sample_input, exact_dtype=(dtype is not torch.long)) + for sample_input in op.reference_inputs(device, dtype): + self.compare_with_reference( + op, op.ref, sample_input, exact_dtype=(dtype is not torch.long) + ) finally: torch.set_default_dtype(cur_default) + # Tests that experimental Python References can propagate shape, dtype, + # and device metadata properly. + # TODO: include stride propagation. + @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") + @onlyNativeDeviceTypes + @ops(python_ref_db) + def test_python_reference_meta_functions(self, device, dtype, op): + def _to_tensormeta(x): + if isinstance(x, torch.Tensor): + return prims.utils.TensorMeta(x) + return x + + # TODO: iterate over requires_grad true/false + inps = tuple(op.reference_inputs(device, dtype, requires_grad=False)) + for sample in op.reference_inputs(device, dtype, requires_grad=False): + + result = op(sample.input, *sample.args, **sample.kwargs) + + meta_sample = sample.transform(_to_tensormeta) + meta_result = op(meta_sample.input, *meta_sample.args, **meta_sample.kwargs) + + if isinstance(result, torch.Tensor): + prims.utils.compare_tensor_meta(result, meta_result) + elif isinstance(result, Sequence): + for a, b in zip(result, meta_result): + prims.utils.compare_tensor_meta(a, b) + + # Tests that experimental Python References perform the same computation + # as the operators they reference. 
+ @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") + @onlyNativeDeviceTypes + @ops(python_ref_db) + def test_python_reference_consistency(self, device, dtype, op): + for sample in op.reference_inputs(device, dtype, requires_grad=False): + + actual = op(sample.input, *sample.args, **sample.kwargs) + expected = op.torch_opinfo(sample.input, *sample.args, **sample.kwargs) + + self.assertEqual( + actual, + expected, + exact_stride=False, + exact_device=True, + exact_layout=True, + exact_is_coalesced=True, + ) + + if isinstance(actual, torch.Tensor): + assert isinstance(expected, torch.Tensor) + prims.utils.compare_tensor_meta(actual, expected) + if getattr(op, 'validate_view_consistency', True): + self.assertEqual(actual._is_view(), expected._is_view()) + if isinstance(actual, Sequence): + assert isinstance(expected, Sequence) + for a, b in zip(actual, expected): + prims.utils.compare_tensor_meta(a, b) + if getattr(op, 'validate_view_consistency', True): + self.assertEqual(a._is_view(), b._is_view()) + + @skipMeta @onlyNativeDeviceTypes @ops([op for op in op_db if op.error_inputs_func is not None], dtypes=OpDTypes.none) @@ -231,6 +407,23 @@ def test_errors(self, device, op): with self.assertRaisesRegex(ei.error_type, ei.error_regex): op(si.input, *si.args, **si.kwargs) + @skipMeta + @onlyNativeDeviceTypes + @ops([op for op in python_ref_db if op.error_inputs_func is not None], dtypes=OpDTypes.none) + def test_python_reference_errors(self, device, op): + def _to_tensormeta(x): + if isinstance(x, torch.Tensor): + return prims.utils.TensorMeta(x) + return x + + error_inputs = op.error_inputs(device) + for ei in error_inputs: + si = ei.sample_input + meta_sample = si.transform(_to_tensormeta) + # TODO: match strings + with self.assertRaisesRegex(ei.error_type, ""): + op(meta_sample.input, *meta_sample.args, **meta_sample.kwargs) + # Tests that the function produces the same result when called with # noncontiguous tensors. 
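A "noncontiguous sample" here is simply a tensor with the same values as the original but a non-contiguous memory layout; the test then checks that the op is layout-agnostic. One cheap way to build such a tensor (the harness's noncontiguous_like helper is more general):

import torch

x = torch.randn(4, 5)
x_noncontig = x.t().clone().t()          # same values, strides (1, 4): not contiguous
assert not x_noncontig.is_contiguous()
assert torch.equal(x, x_noncontig)
torch.testing.assert_close(torch.sin(x), torch.sin(x_noncontig))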
# TODO: get working with Windows by addressing failing operators @@ -244,8 +437,17 @@ def test_noncontiguous_samples(self, device, dtype, op): test_grad = dtype in op.supported_backward_dtypes(torch.device(device).type) sample_inputs = op.sample_inputs(device, dtype, requires_grad=test_grad) for sample_input in sample_inputs: - t_inp, t_args, t_kwargs = sample_input.input, sample_input.args, sample_input.kwargs - n_inp, n_args, n_kwargs = sample_input.noncontiguous() + t_inp, t_args, t_kwargs = ( + sample_input.input, + sample_input.args, + sample_input.kwargs, + ) + noncontig_sample = sample_input.noncontiguous() + n_inp, n_args, n_kwargs = ( + noncontig_sample.input, + noncontig_sample.args, + noncontig_sample.kwargs, + ) # Verifies sample input tensors should have no grad or history sample_tensor = t_inp if isinstance(t_inp, torch.Tensor) else t_inp[0] @@ -271,10 +473,14 @@ def test_noncontiguous_samples(self, device, dtype, op): grad_for_actual = noncontiguous_like(grad_for_expected) elif isinstance(expected, Sequence): # Filter output elements that do not require grad - expected = [t for t in expected - if isinstance(t, torch.Tensor) and t.requires_grad] - actual = [n for n in actual - if isinstance(n, torch.Tensor) and n.requires_grad] + expected = [ + t + for t in expected + if isinstance(t, torch.Tensor) and t.requires_grad + ] + actual = [ + n for n in actual if isinstance(n, torch.Tensor) and n.requires_grad + ] grad_for_expected = [torch.randn_like(t) for t in expected] grad_for_actual = [noncontiguous_like(n) for n in grad_for_expected] else: @@ -282,19 +488,35 @@ def test_noncontiguous_samples(self, device, dtype, op): continue # Concatenate inputs into a tuple - t_inputs = (t_inp,) + t_args if isinstance(t_inp, torch.Tensor) else tuple(t_inp) + t_args - n_inputs = (n_inp,) + n_args if isinstance(n_inp, torch.Tensor) else tuple(n_inp) + n_args + t_inputs = ( + (t_inp,) + t_args + if isinstance(t_inp, torch.Tensor) + else tuple(t_inp) + t_args + ) + n_inputs = ( + (n_inp,) + n_args + if isinstance(n_inp, torch.Tensor) + else tuple(n_inp) + n_args + ) # Filter the elemnts that are tensors that require grad - t_input_tensors = [t for t in t_inputs if isinstance(t, torch.Tensor) and t.requires_grad] - n_input_tensors = [n for n in n_inputs if isinstance(n, torch.Tensor) and n.requires_grad] + t_input_tensors = [ + t for t in t_inputs if isinstance(t, torch.Tensor) and t.requires_grad + ] + n_input_tensors = [ + n for n in n_inputs if isinstance(n, torch.Tensor) and n.requires_grad + ] self.assertEqual(len(t_input_tensors), len(n_input_tensors)) # Some functions may not use all the inputs to generate gradients. One of the # few examples of this "odd" behaviour is F.hinge_embedding_loss - t_grads = torch.autograd.grad(expected, t_input_tensors, grad_for_expected, allow_unused=True) - n_grads = torch.autograd.grad(actual, n_input_tensors, grad_for_actual, allow_unused=True) + t_grads = torch.autograd.grad( + expected, t_input_tensors, grad_for_expected, allow_unused=True + ) + n_grads = torch.autograd.grad( + actual, n_input_tensors, grad_for_actual, allow_unused=True + ) msg = "Got different gradients for contiguous / non-contiguous inputs wrt input {}." 
for i, (t, n) in enumerate(zip(t_grads, n_grads)): @@ -304,97 +526,119 @@ def test_noncontiguous_samples(self, device, dtype, op): # incorrectly sized out parameter warning properly yet # Cases test here: # - out= with the correct dtype and device, but the wrong shape - @ops(op_db, dtypes=OpDTypes.none) + @ops(_ops_and_refs, dtypes=OpDTypes.none) def test_out_warning(self, device, op): - # TODO: verify the op doesn't support the out= kwarg - if not op.supports_out: - self.skipTest("Skipped! Op doesn't support out= kwarg.") - # Prefers running in float32 but has a fallback for the first listed supported dtype supported_dtypes = op.supported_dtypes(self.device_type) if len(supported_dtypes) == 0: self.skipTest("Skipped! Op has not supported dtypes on this device.") - dtype = torch.float32 if torch.float32 in supported_dtypes else list(supported_dtypes)[0] + dtype = ( + torch.float32 + if torch.float32 in supported_dtypes + else list(supported_dtypes)[0] + ) - # NOTE: only tests on first sample samples = op.sample_inputs(device, dtype) - sample = first_sample(self, samples) - - # calls it normally to get the expected result - expected = op(sample.input, *sample.args, **sample.kwargs) - op_out = partial(op, sample.input, *sample.args, **sample.kwargs) - - # Short-circuits if output is not a single tensor or an - # iterable of tensors - - if not isinstance(expected, torch.Tensor) and not is_iterable_of_tensors(expected, include_empty=True): - self.skipTest("Skipped! Only supports single tensor or iterable of tensor outputs.") - - # A wrapper around map that works with single tensors and always - # instantiates the map. Used below to apply transforms to - # single tensor and iterable tensor outputs. - def _apply_out_transform(fn, out): - if isinstance(out, torch.Tensor): - return fn(out) - - # assumes (see above) that out is an iterable of tensors - return tuple(map(fn, out)) - - # Extracts strides from a tensor or iterable of tensors into a tuple - def _extract_strides(out): - if isinstance(out, torch.Tensor): - return (out.stride(),) - - # assumes (see above) that out is an iterable of tensors - return tuple(map(lambda t: t.stride(), out)) - - # Extracts data pointers from a tensor or iterable of tensors into a tuple - # NOTE: only extracts on the CPU and CUDA device types since some - # device types don't have storage - def _extract_data_ptrs(out): - if self.device_type != 'cpu' and self.device_type != 'cuda': - return () - - if isinstance(out, torch.Tensor): - return (out.data_ptr(),) + for sample in samples: + # calls it normally to get the expected result + expected = op(sample.input, *sample.args, **sample.kwargs) + op_out = partial(op, sample.input, *sample.args, **sample.kwargs) + + # Short-circuits if output is not a single tensor or an + # iterable of tensors + if not isinstance(expected, torch.Tensor) and not is_iterable_of_tensors( + expected, include_empty=True + ): + self.skipTest( + "Skipped! Only supports single tensor or iterable of tensor outputs." + ) + + # Validates the op doesn't support out if it claims not to + if not op.supports_out: + with self.assertRaises(Exception): + assert op_out(out=expected) != NotImplemented + return + + # A wrapper around map that works with single tensors and always + # instantiates the map. Used below to apply transforms to + # single tensor and iterable tensor outputs. 
+ def _apply_out_transform(fn, out): + if isinstance(out, torch.Tensor): + return fn(out) + + # assumes (see above) that out is an iterable of tensors + return tuple(map(fn, out)) + + # Extracts strides from a tensor or iterable of tensors into a tuple + def _extract_strides(out): + if isinstance(out, torch.Tensor): + return (out.stride(),) + + # assumes (see above) that out is an iterable of tensors + return tuple(map(lambda t: t.stride(), out)) + + # Extracts data pointers from a tensor or iterable of tensors into a tuple + # NOTE: only extracts on the CPU and CUDA device types since some + # device types don't have storage + def _extract_data_ptrs(out): + if self.device_type != "cpu" and self.device_type != "cuda": + return () + + if isinstance(out, torch.Tensor): + return (out.data_ptr(),) + + # assumes (see above) that out is an iterable of tensors + return tuple(map(lambda t: t.data_ptr(), out)) + + @suppress_warnings + def _compare_out(transform, *, compare_strides_and_data_ptrs=True): + out = _apply_out_transform(transform, expected) + original_strides = _extract_strides(out) + original_ptrs = _extract_data_ptrs(out) - # assumes (see above) that out is an iterable of tensors - return tuple(map(lambda t: t.data_ptr(), out)) + op_out(out=out) + final_strides = _extract_strides(out) + final_ptrs = _extract_data_ptrs(out) - def _compare_out(transform, *, compare_strides_and_data_ptrs=True): - out = _apply_out_transform(transform, expected) - original_strides = _extract_strides(out) - original_ptrs = _extract_data_ptrs(out) + self.assertEqual(expected, out) - op_out(out=out) - final_strides = _extract_strides(out) - final_ptrs = _extract_data_ptrs(out) + if compare_strides_and_data_ptrs: + stride_msg = "Strides are not the same! Original strides were {0} and strides are now {1}".format( + original_strides, final_strides + ) + self.assertEqual(original_strides, final_strides, msg=stride_msg) + self.assertEqual(original_ptrs, final_ptrs) - self.assertEqual(expected, out) + # Case Zero: out= with the correct dtype and device, but the wrong shape + # Expected behavior: if nonempty, resize with a warning. + def _case_zero_transform(t): + wrong_shape = list(t.shape) - if compare_strides_and_data_ptrs: - self.assertEqual(original_strides, final_strides) - self.assertEqual(original_ptrs, final_ptrs) + if len(wrong_shape) == 0: + # Handles scalar tensor case (empty list) + wrong_shape = [2] + else: + wrong_shape[-1] = wrong_shape[-1] + 1 + return make_tensor(wrong_shape, dtype=t.dtype, device=t.device) - # Case: out= with the correct dtype and device, but the wrong shape - # Expected behavior: resize with a warning. - def _case_two_transform(t): - wrong_shape = list(t.shape) + # Verifies the out values are correct + _compare_out(_case_zero_transform, compare_strides_and_data_ptrs=False) - if len(wrong_shape) == 0: - # Handles scalar tensor case (empty list) - wrong_shape = [2] - else: - wrong_shape[-1] = wrong_shape[-1] + 1 - return make_tensor(wrong_shape, dtype=t.dtype, device=t.device) + # Additionally validates that the appropriate warning is thrown if a nonempty + # tensor is resized. + def _any_nonempty(out): + if isinstance(out, torch.Tensor): + return out.numel() > 0 - _compare_out(_case_two_transform, compare_strides_and_data_ptrs=False) + return any(x.numel() > 0 for x in out) - # Additional validates that the appropriate warning is thrown - out = _apply_out_transform(_case_two_transform, expected) - msg_fail = "Resized a non-empty tensor but did not warn about it." 
- with self.assertWarnsRegex(UserWarning, "An output with one or more elements", msg=msg_fail): - op_out(out=out) + out = _apply_out_transform(_case_zero_transform, expected) + msg_fail = "Resized a non-empty tensor but did not warn about it." + if _any_nonempty(out): + with self.assertWarnsRegex( + UserWarning, "An output with one or more elements", msg=msg_fail + ): + op_out(out=out) # Validates ops implement the correct out= behavior # See https://github.com/pytorch/pytorch/wiki/Developer-FAQ#how-does-out-work-in-pytorch @@ -406,173 +650,191 @@ def _case_two_transform(t): # - Case 3: out has the correct shape and dtype, but is on a different device type # - Case 4: out has the with correct shape and device, but a dtype that cannot # "safely" cast to - @ops(op_db, dtypes=OpDTypes.none) - def test_out(self, device, op): - # TODO: verify the op doesn't support the out= kwarg - if not op.supports_out: - self.skipTest("Skipped! Op doesn't support out= kwarg.") - + @ops(_ops_and_refs, dtypes=OpDTypes.any_one) + def test_out(self, device, dtype, op): # Prefers running in float32 but has a fallback for the first listed supported dtype - supported_dtypes = op.supported_dtypes(self.device_type) - if len(supported_dtypes) == 0: - self.skipTest("Skipped! Op has not supported dtypes on this device.") - dtype = torch.float32 if torch.float32 in supported_dtypes else list(supported_dtypes)[0] - - # NOTE: only tests on first sample samples = op.sample_inputs(device, dtype) - sample = first_sample(self, samples) - - # calls it normally to get the expected result - expected = op(sample.input, *sample.args, **sample.kwargs) - op_out = partial(op, sample.input, *sample.args, **sample.kwargs) - - # Short-circuits if output is not a single tensor or an - # iterable of tensors - - if not isinstance(expected, torch.Tensor) and not is_iterable_of_tensors(expected, include_empty=True): - self.skipTest("Skipped! Only supports single tensor or iterable of tensor outputs.") - - # A wrapper around map that works with single tensors and always - # instantiates the map. Used below to apply transforms to - # single tensor and iterable tensor outputs. - def _apply_out_transform(fn, out): - if isinstance(out, torch.Tensor): - return fn(out) - - # assumes (see above) that out is an iterable of tensors - return tuple(map(fn, out)) - - # Extracts strides from a tensor or iterable of tensors into a tuple - def _extract_strides(out): - if isinstance(out, torch.Tensor): - return (out.stride(),) + for sample in samples: + # calls it normally to get the expected result + expected = op(sample.input, *sample.args, **sample.kwargs) + op_out = partial(op, sample.input, *sample.args, **sample.kwargs) + + # Short-circuits if output is not a single tensor or an + # iterable of tensors + if not isinstance(expected, torch.Tensor) and not is_iterable_of_tensors( + expected, include_empty=True + ): + self.skipTest( + "Skipped! Only supports single tensor or iterable of tensor outputs." + ) + + # Validates the op doesn't support out if it claims not to + if not op.supports_out: + with self.assertRaises(Exception): + assert op_out(out=expected) != NotImplemented + return + + # A wrapper around map that works with single tensors and always + # instantiates the map. Used below to apply transforms to + # single tensor and iterable tensor outputs. 
+ def _apply_out_transform(fn, out): + if isinstance(out, torch.Tensor): + return fn(out) + + # assumes (see above) that out is an iterable of tensors + return tuple(map(fn, out)) + + # Extracts strides from a tensor or iterable of tensors into a tuple + def _extract_strides(out): + if isinstance(out, torch.Tensor): + return (out.stride(),) + + # assumes (see above) that out is an iterable of tensors + return tuple(map(lambda t: t.stride(), out)) + + # Extracts data pointers from a tensor or iterable of tensors into a tuple + # NOTE: only extracts on the CPU and CUDA device types since some + # device types don't have storage + def _extract_data_ptrs(out): + if self.device_type != "cpu" and self.device_type != "cuda": + return () + + if isinstance(out, torch.Tensor): + return (out.data_ptr(),) + + # assumes (see above) that out is an iterable of tensors + return tuple(map(lambda t: t.data_ptr(), out)) + + def _compare_out(transform, *, compare_strides_and_data_ptrs=True): + out = _apply_out_transform(transform, expected) + original_strides = _extract_strides(out) + original_ptrs = _extract_data_ptrs(out) - # assumes (see above) that out is an iterable of tensors - return tuple(map(lambda t: t.stride(), out)) + op_out(out=out) + final_strides = _extract_strides(out) + final_ptrs = _extract_data_ptrs(out) + self.assertEqual(expected, out) - # Extracts data pointers from a tensor or iterable of tensors into a tuple - # NOTE: only extracts on the CPU and CUDA device types since some - # device types don't have storage - def _extract_data_ptrs(out): - if self.device_type != 'cpu' and self.device_type != 'cuda': - return () + if compare_strides_and_data_ptrs: + stride_msg = "Strides are not the same! Original strides were {0} and strides are now {1}".format( + original_strides, final_strides + ) + self.assertEqual(original_strides, final_strides, msg=stride_msg) + self.assertEqual(original_ptrs, final_ptrs) + + # Case 0: out= with the correct shape, dtype, and device + # but NaN values for floating point and complex tensors, and + # maximum values for integer tensors. + # Expected behavior: out= values have no effect on the computation. + def _case_zero_transform(t): + try: + info = torch.iinfo(t.dtype) + return torch.full_like(t, info.max) + except TypeError as te: + # for non-integer types fills with NaN + return torch.full_like(t, float("nan")) - if isinstance(out, torch.Tensor): - return (out.data_ptr(),) - # assumes (see above) that out is an iterable of tensors - return tuple(map(lambda t: t.data_ptr(), out)) + _compare_out(_case_zero_transform) - def _compare_out(transform, *, compare_strides_and_data_ptrs=True): - out = _apply_out_transform(transform, expected) - original_strides = _extract_strides(out) - original_ptrs = _extract_data_ptrs(out) + # Case 1: out= with the correct shape, dtype, and device, + # but noncontiguous. + # Expected behavior: strides are respected and `out` storage is not changed. + def _case_one_transform(t): + return make_tensor( + t.shape, dtype=t.dtype, device=t.device, noncontiguous=True + ) - op_out(out=out) - final_strides = _extract_strides(out) - final_ptrs = _extract_data_ptrs(out) + _compare_out(_case_one_transform) - self.assertEqual(expected, out) + # Case 2: out= with the correct dtype and device, but has no elements. + # Expected behavior: resize without warning. 
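# [Editor's illustration, not part of the patch] The resize semantics exercised by Case 2
# here and by test_out_warning above can be reproduced with any binary op that supports
# out=: an out tensor with no elements is resized silently, while a nonempty out tensor
# of the wrong shape is resized with a UserWarning. A minimal standalone sketch:
import warnings
import torch

a, b = torch.randn(3), torch.randn(3)
torch.add(a, b, out=torch.empty(0))  # no elements: expected to resize without a warning
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    torch.add(a, b, out=torch.empty(2))  # nonempty wrong shape: expected to warn on resize
assert any("An output with one or more elements" in str(w.message) for w in caught)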
+ def _case_two_transform(t): + return make_tensor((0,), dtype=t.dtype, device=t.device) - if compare_strides_and_data_ptrs: - self.assertEqual(original_strides, final_strides) - self.assertEqual(original_ptrs, final_ptrs) + _compare_out(_case_two_transform, compare_strides_and_data_ptrs=False) - # Case 0: out= with the correct shape, dtype, and device - # but NaN values for floating point and complex tensors, and - # maximum values for integer tensors. - # Expected behavior: out= values have no effect on the computation. - def _case_zero_transform(t): - try: - info = torch.iinfo(t.dtype) - return torch.full_like(t, info.max) - except TypeError as te: - # for non-integer types fills with NaN - return torch.full_like(t, float('nan')) - - _compare_out(_case_zero_transform) - - # Case 1: out= with the correct shape, dtype, and device, - # but noncontiguous. - # Expected behavior: strides are respected and `out` storage is not changed. - def _case_one_transform(t): - return make_tensor(t.shape, - dtype=t.dtype, - device=t.device, - noncontiguous=True) - - _compare_out(_case_one_transform) - - # Case 2: out= with the correct dtype and device, but has no elements. - # Expected behavior: resize without warning. - def _case_two_transform(t): - return make_tensor((0,), - dtype=t.dtype, - device=t.device) - - _compare_out(_case_two_transform, compare_strides_and_data_ptrs=False) - - # Also validates that no warning is thrown when this out is resized - out = _apply_out_transform(_case_two_transform, expected) - with warnings.catch_warnings(record=True) as caught: - warnings.simplefilter("always") - op_out(out=out) - - # Verifies no warning is a resize warning - for w in caught: - if "An output with one or more elements" in str(w.message): - self.fail("Resizing an out= argument with no elements threw a resize warning!") - - # Case 3: out= with correct shape and dtype, but wrong device. - wrong_device = None - if torch.device(device).type != 'cpu': - wrong_device = 'cpu' - elif torch.cuda.is_available(): - wrong_device = 'cuda' - - if wrong_device is not None: - def _case_three_transform(t): - return make_tensor(t.shape, dtype=t.dtype, device=wrong_device) - - out = _apply_out_transform(_case_three_transform, expected) - msg_fail = f"Expected RuntimeError when calling with input.device={device} and out.device={wrong_device}" - with self.assertRaises(RuntimeError, msg=msg_fail): + # Also validates that no warning is thrown when this out is resized + out = _apply_out_transform(_case_two_transform, expected) + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") op_out(out=out) - # Case 4: out= with correct shape and device, but a dtype - # that output cannot be "safely" cast to (long). - # Expected behavior: error. - # NOTE: this case is filtered by dtype since some ops produce - # bool tensors, for example, which can be safely cast to any - # dtype. It is applied when single tensors are floating point or complex - # dtypes, or if an op returns multiple tensors when at least one such - # tensor is a floating point or complex dtype. 
- _dtypes = floating_and_complex_types_and(torch.float16, torch.bfloat16) - if (isinstance(expected, torch.Tensor) and expected.dtype in _dtypes or - (not isinstance(expected, torch.Tensor) and any(t.dtype in _dtypes for t in expected))): - def _case_four_transform(t): - return make_tensor(t.shape, dtype=torch.long, device=t.device) - - out = _apply_out_transform(_case_four_transform, expected) - msg_fail = "" if not isinstance(expected, torch.Tensor) else \ - ("Expected RuntimeError when doing an unsafe cast from a result of dtype " - f"{expected.dtype} into an out= with dtype torch.long") - with self.assertRaises(RuntimeError, msg=msg_fail): - op_out(out=out) + # Verifies no warning is a resize warning + for w in caught: + if "An output with one or more elements" in str(w.message): + self.fail( + "Resizing an out= argument with no elements threw a resize warning!" + ) + + # Case 3: out= with correct shape and dtype, but wrong device. + wrong_device = None + if torch.device(device).type != "cpu": + wrong_device = "cpu" + elif torch.cuda.is_available(): + wrong_device = "cuda" + + if wrong_device is not None: + + def _case_three_transform(t): + return make_tensor(t.shape, dtype=t.dtype, device=wrong_device) + + out = _apply_out_transform(_case_three_transform, expected) + msg_fail = f"Expected RuntimeError when calling with input.device={device} and out.device={wrong_device}" + with self.assertRaises(RuntimeError, msg=msg_fail): + op_out(out=out) + + # Case 4: out= with correct shape and device, but a dtype + # that output cannot be "safely" cast to (long). + # Expected behavior: error. + # NOTE: this case is filtered by dtype since some ops produce + # bool tensors, for example, which can be safely cast to any + # dtype. It is applied when single tensors are floating point or complex + # dtypes, or if an op returns multiple tensors when at least one such + # tensor is a floating point or complex dtype. + _dtypes = floating_and_complex_types_and(torch.float16, torch.bfloat16) + if ( + isinstance(expected, torch.Tensor) + and expected.dtype in _dtypes + or ( + not isinstance(expected, torch.Tensor) + and any(t.dtype in _dtypes for t in expected) + ) + ): + + def _case_four_transform(t): + return make_tensor(t.shape, dtype=torch.long, device=t.device) + + out = _apply_out_transform(_case_four_transform, expected) + msg_fail = "Expected RuntimeError when doing an unsafe cast!" 
+ msg_fail = ( + msg_fail + if not isinstance(expected, torch.Tensor) + else ( + "Expected RuntimeError when doing an unsafe cast from a result of dtype " + f"{expected.dtype} into an out= with dtype torch.long" + ) + ) + with self.assertRaises(RuntimeError, msg=msg_fail): + op_out(out=out) # Tests that the forward and backward passes of operations produce the # same values for the cross-product of op variants (method, inplace) # against eager's gold standard op function variant @_variant_ops(op_db) def test_variant_consistency_eager(self, device, dtype, op): - # Acquires variants (method variant, inplace variant, aliases) + # Acquires variants (method variant, inplace variant, operator variant, inplace_operator variant, aliases) method = op.method_variant inplace = op.inplace_variant + operator = op.operator_variant + inplace_operator = op.inplace_operator_variant + # list of all inplace ops: inplace variant + alias inplace variants if exist - inplace_ops = [inplace, ] - variants = [method, inplace] + inplace_ops = [inplace, inplace_operator] + variants = [method, inplace, operator, inplace_operator] + operators = [operator, inplace_operator] for a_op in op.aliases: variants.append(a_op.op) @@ -582,32 +844,48 @@ def test_variant_consistency_eager(self, device, dtype, op): inplace_variants = tuple(filter(None, inplace_ops)) variants = tuple(filter(None, variants)) + operators = tuple(filter(None, operators)) - _requires_grad = (op.supports_autograd and - (dtype.is_floating_point or op.supports_complex_autograd(torch.device(device).type))) + _requires_grad = dtype in op.supported_backward_dtypes( + torch.device(device).type + ) include_conjugated_inputs = op.test_conjugated_samples and dtype.is_complex - samples = op.sample_inputs(device, dtype, requires_grad=_requires_grad, include_conjugated_inputs=include_conjugated_inputs) + samples = op.sample_inputs( + device, + dtype, + requires_grad=_requires_grad, + include_conjugated_inputs=include_conjugated_inputs, + ) samples = list(samples) def _test_consistency_helper(samples, variants): for sample in samples: # TODO: Check grad for all Tensors requiring grad if sample.input is TensorList - tensor = sample.input if isinstance(sample.input, torch.Tensor) else sample.input[0] + tensor = ( + sample.input + if isinstance(sample.input, torch.Tensor) + else sample.input[0] + ) # Computes function forward and backward values tensor.grad = None expected_forward = op(sample.input, *sample.args, **sample.kwargs) expected_grad = None - output_process_fn_grad = sample.output_process_fn_grad if sample.output_process_fn_grad \ + output_process_fn_grad = ( + sample.output_process_fn_grad + if sample.output_process_fn_grad else lambda x: x + ) # Skips inplace variants if the output dtype is not the same as # the input dtype skip_inplace = False - if (isinstance(expected_forward, torch.Tensor) and - expected_forward.dtype is not tensor.dtype): + if ( + isinstance(expected_forward, torch.Tensor) + and expected_forward.dtype is not tensor.dtype + ): skip_inplace = True # TODO: backward consistency only supported for single tensor outputs @@ -615,8 +893,9 @@ def _test_consistency_helper(samples, variants): # tensor inputs # TODO: update to handle checking grads of all tensor inputs as # derived from each tensor output - if (op.supports_autograd and isinstance(expected_forward, torch.Tensor) - and (dtype.is_floating_point or op.supports_complex_autograd(torch.device(device).type))): + if isinstance( + expected_forward, torch.Tensor + ) and dtype in 
op.supported_backward_dtypes(torch.device(device).type): output_process_fn_grad(expected_forward).sum().backward() expected_grad = tensor.grad @@ -629,26 +908,39 @@ def _test_consistency_helper(samples, variants): # Compares variant's forward # Note: copies the to-be-modified input when testing the inplace variant tensor.grad = None - cloned = clone_input_helper(sample.input) if variant in inplace_ops else sample.input + cloned = ( + clone_input_helper(sample.input) + if variant in inplace_ops + else sample.input + ) if variant in inplace_ops and sample.broadcasts_input: - with self.assertRaises(RuntimeError, - msg=('inplace variant either incorrectly allowed ' - 'resizing or you have marked the sample {}' - ' incorrectly with `broadcasts_self=True'.format(sample.summary()))): - variant_forward = variant(cloned, - *sample.args, - **sample.kwargs) + with self.assertRaises( + RuntimeError, + msg=( + "inplace variant either incorrectly allowed " + "resizing or you have marked the sample {}" + " incorrectly with `broadcasts_self=True".format( + sample.summary() + ) + ), + ): + variant_forward = variant( + cloned, *sample.args, **sample.kwargs + ) + continue + + if variant in operators and sample.kwargs: + # skip samples with kwargs for operator variants continue - variant_forward = variant(cloned, - *sample.args, - **sample.kwargs) + variant_forward = variant(cloned, *sample.args, **sample.kwargs) self.assertEqual(expected_forward, variant_forward) # Compares variant's backward - if expected_grad is not None and \ - (variant not in inplace_ops or op.supports_inplace_autograd): + if expected_grad is not None and ( + variant not in inplace_ops or op.supports_inplace_autograd + ): output_process_fn_grad(variant_forward).sum().backward() self.assertEqual(expected_grad, tensor.grad) @@ -659,531 +951,119 @@ def _test_inplace_preserve_storage(samples, variants): # Skips inplace variants if the output dtype is not the same as # the input dtype expected_forward = op(sample.input, *sample.args, **sample.kwargs) - tensor = sample.input if isinstance(sample.input, torch.Tensor) else sample.input[0] + tensor = ( + sample.input + if isinstance(sample.input, torch.Tensor) + else sample.input[0] + ) skip_inplace = False - if (isinstance(expected_forward, torch.Tensor) and - expected_forward.dtype is not tensor.dtype): + if ( + isinstance(expected_forward, torch.Tensor) + and expected_forward.dtype is not tensor.dtype + ): skip_inplace = True if skip_inplace: return for variant in variants: - cloned = clone_input_helper(sample.input) if variant in inplace_ops else sample.input - inp_tensor = cloned if isinstance(cloned, torch.Tensor) else cloned[0] + cloned = ( + clone_input_helper(sample.input) + if variant in inplace_ops + else sample.input + ) + inp_tensor = ( + cloned if isinstance(cloned, torch.Tensor) else cloned[0] + ) data_ptr = inp_tensor.data_ptr() - variant_forward = variant(cloned, - *sample.args, - **sample.kwargs) + if variant in operators and sample.kwargs: + # skip samples with kwargs for operator variants + continue + + variant_forward = variant(cloned, *sample.args, **sample.kwargs) # TODO Support non-tensor outputs if they exist for inplace ops - if (isinstance(variant_forward, torch.Tensor)): - self.assertEqual(data_ptr, variant_forward.data_ptr(), atol=0, rtol=0) + if isinstance(variant_forward, torch.Tensor): + self.assertEqual( + data_ptr, variant_forward.data_ptr(), atol=0, rtol=0 + ) else: - self.assertTrue(False, "Non-tensor outputs for inplace ops are not supported") + 
self.assertTrue( + False, + "Non-tensor outputs for inplace ops are not supported", + ) if len(inplace_ops) > 0: - inplace_samples = list(filter(lambda sample: not sample.broadcasts_input, samples)) + inplace_samples = list( + filter(lambda sample: not sample.broadcasts_input, samples) + ) _test_inplace_preserve_storage(inplace_samples, inplace_variants) + # Reference testing for operations in complex32 against complex64. + # NOTE: We test against complex64 as NumPy doesn't have a complex32 equivalent dtype. + @ops(op_db, allowed_dtypes=(torch.complex32,)) + def test_complex_half_reference_testing(self, device, dtype, op): + if not op.supports_dtype(torch.complex32, device): + unittest.skip("Does not support complex32") + + for sample in op.sample_inputs(device, dtype): + actual = op(sample.input, *sample.args, **sample.kwargs) + # sample.transform applies the lambda to torch.Tensor and torch.dtype. + # However, we only want to apply it to Tensors with dtype `torch.complex32`.. + transformed_sample = sample.transform(lambda x: x.to(torch.complex64) if isinstance( + x, torch.Tensor) and x.dtype is torch.complex32 else x) + expected = op( + transformed_sample.input, + *transformed_sample.args, + **transformed_sample.kwargs, + ) + self.assertEqual(actual, expected, exact_dtype=False) + + +class TestCompositeCompliance(TestCase): # Checks if the operator (if it is composite) is written to support most # backends and Tensor subclasses. See "CompositeImplicitAutograd Compliance" # in aten/src/ATen/native/README.md for more details - # - # NB: onlyCPU because CompositeImplicitAutograd ops go through the same - # codepath on all devices. Ideally we'd use a meta device here but coverage - # for that is not good yet. - @unittest.skipIf(IS_FBCODE or IS_SANDCASTLE, '__torch_dispatch__ does not work in fbcode') - @onlyCPU + @unittest.skipIf( + IS_FBCODE or IS_SANDCASTLE, "__torch_dispatch__ does not work in fbcode" + ) @ops(op_db, allowed_dtypes=(torch.float,)) - def test_composite_compliance(self, device, dtype, op): + def test_operator(self, device, dtype, op): samples = op.sample_inputs(device, dtype, requires_grad=False) for sample in samples: args = [sample.input] + list(sample.args) kwargs = sample.kwargs - _check_composite_compliance(op, args, kwargs) - - @onlyCPU - @ops(op_db, allowed_dtypes=(torch.float,)) - def test_floating_inputs_are_differentiable(self, device, dtype, op): - # Nothing to check if the operation it's not differentiable - if not op.supports_autograd: - return - - floating_dtypes = list(floating_and_complex_types_and(torch.bfloat16, torch.float16)) - - def check_tensor_floating_is_differentiable(t): - if isinstance(t, torch.Tensor) and t.dtype in floating_dtypes: - msg = (f"Found a sampled tensor of floating-point dtype {t.dtype} sampled with " - "requires_grad=False. If this is intended, please skip/xfail this test. 
" - "Remember that sampling operations are executed under a torch.no_grad contextmanager.") - self.assertTrue(t.requires_grad, msg) - + composite_compliance.check_with_mode(op, args, kwargs) + composite_compliance.check_all_permutations(op, args, kwargs) + + @unittest.skipIf( + IS_FBCODE or IS_SANDCASTLE, "__torch_dispatch__ does not work in fbcode" + ) + @ops([op for op in op_db if op.supports_autograd], allowed_dtypes=(torch.float,)) + def test_backward(self, device, dtype, op): samples = op.sample_inputs(device, dtype, requires_grad=True) - for sample in samples: - check_tensor_floating_is_differentiable(sample.input) - for arg in sample.args: - check_tensor_floating_is_differentiable(arg) - for arg in sample.kwargs.values(): - check_tensor_floating_is_differentiable(arg) - - -# gradcheck requires double precision -_gradcheck_ops = partial(ops, dtypes=OpDTypes.supported, - allowed_dtypes=[torch.double, torch.cdouble]) - - -class TestGradients(TestCase): - exact_dtype = True - - # Copies inputs to inplace operations to avoid inplace modifications - # to leaves requiring gradient - def _get_safe_inplace(self, inplace_variant): - @wraps(inplace_variant) - def _fn(t, *args, **kwargs): - return inplace_variant(t.clone(), *args, **kwargs) - - return _fn - - def _check_helper(self, device, dtype, op, variant, check, *, check_forward_ad=False, check_backward_ad=True, - check_batched_grad=None, check_batched_forward_grad=False): - assert check in ('gradcheck', 'bwgrad_bwgrad', 'fwgrad_bwgrad') - # NB: check_backward_ad does not affect gradgradcheck (always True) - if variant is None: - self.skipTest("Skipped! Variant not implemented.") - if not op.supports_dtype(dtype, torch.device(device).type): - self.skipTest(f"Skipped! {op.name} does not support dtype {str(dtype)}") - - def is_inplace(variant): - if hasattr(variant, "__wrapped__"): - return variant.__wrapped__ is op.get_inplace() - return variant is op.get_inplace() - - include_conjugated_inputs = op.test_conjugated_samples and dtype.is_complex - samples = op.sample_inputs(device, dtype, requires_grad=True, include_conjugated_inputs=include_conjugated_inputs) for sample in samples: - if sample.broadcasts_input and is_inplace(variant): - continue - - # Note on TensorList inputs - # - # gradcheck does not support TensorList inputs so here we pass TensorList - # inputs of size n as n single Tensor inputs to gradcheck and wrap the op - # in a function that puts the n Tensor inputs back into a TensorList - def fn(*inputs): - # Put tensors back into TensorList since we splat them when passing to gradcheck - if is_iterable_of_tensors(sample.input): - n = len(sample.input) - inputs = (inputs[:n], *inputs[n:]) - output = op.gradcheck_wrapper(variant, *inputs, **sample.kwargs) - if sample.output_process_fn_grad is not None: - return sample.output_process_fn_grad(output) - return output - - # Splat TensorList inputs into single Tensor inputs - gradcheck_args = (sample.input,) if isinstance(sample.input, torch.Tensor) else tuple(sample.input) - gradcheck_args += sample.args - - if check == 'gradcheck': - if check_batched_grad is None: - check_batched_grad = op.check_batched_grad - self.assertTrue(gradcheck(fn, gradcheck_args, - check_batched_grad=check_batched_grad, - check_grad_dtypes=True, - nondet_tol=op.gradcheck_nondet_tol, - fast_mode=op.gradcheck_fast_mode, - check_forward_ad=check_forward_ad, - check_backward_ad=check_backward_ad, - check_undefined_grad=True, - check_batched_forward_grad=check_batched_forward_grad)) - elif check in 
('bwgrad_bwgrad', 'fwgrad_bwgrad'): # gradgrad check - self.assertFalse(check_forward_ad, msg="Cannot run forward AD check for gradgradcheck") - for gen_non_contig_grad_outputs in (False, True): - kwargs = { - "gen_non_contig_grad_outputs": gen_non_contig_grad_outputs, - "check_batched_grad": op.check_batched_gradgrad, - "check_grad_dtypes": True, - "nondet_tol": op.gradcheck_nondet_tol, - "fast_mode": op.gradcheck_fast_mode - } - if check == "fwgrad_bwgrad": - kwargs["check_fwd_over_rev"] = True - kwargs["check_rev_over_rev"] = False - kwargs["check_batched_grad"] = False - kwargs["check_undefined_grad"] = False - - self.assertTrue(gradgradcheck(fn, gradcheck_args, **kwargs)) - else: - self.assertTrue(False, msg="Unknown check requested!") - - def _grad_test_helper(self, device, dtype, op, variant, *, check_forward_ad=False, check_backward_ad=True, - check_batched_grad=None, check_batched_forward_grad=False): - return self._check_helper(device, dtype, op, variant, 'gradcheck', check_forward_ad=check_forward_ad, - check_backward_ad=check_backward_ad, check_batched_grad=check_batched_grad, - check_batched_forward_grad=check_batched_forward_grad) - - def _skip_helper(self, op, device, dtype): - if not op.supports_autograd and not op.supports_forward_ad: - self.skipTest("Skipped! autograd not supported.") - if not op.supports_complex_autograd(torch.device(device).type) and dtype.is_complex: - self.skipTest("Skipped! Complex autograd not supported.") - - # Tests that gradients are computed correctly - @_gradcheck_ops(op_db) - def test_fn_grad(self, device, dtype, op): - self._skip_helper(op, device, dtype) - self._grad_test_helper(device, dtype, op, op.get_op()) - - # Method grad (and gradgrad, see below) tests are disabled since they're - # costly and redundant with function grad (and gradgad) tests - # @_gradcheck_ops(op_db) - # def test_method_grad(self, device, dtype, op): - # self._skip_helper(op, device, dtype) - # self._grad_test_helper(device, dtype, op, op.get_method()) - - @_gradcheck_ops(op_db) - def test_inplace_grad(self, device, dtype, op): - self._skip_helper(op, device, dtype) - if not op.inplace_variant or not op.supports_inplace_autograd: - self.skipTest("Skipped! Operation does not support inplace autograd.") - self._grad_test_helper(device, dtype, op, self._get_safe_inplace(op.get_inplace())) - - # Test that gradients of gradients are computed correctly - @_gradcheck_ops(op_db) - def test_fn_gradgrad(self, device, dtype, op): - self._skip_helper(op, device, dtype) - if not op.supports_gradgrad: - self.skipTest("Skipped! Operation does not support gradgrad") - self._check_helper(device, dtype, op, op.get_op(), 'bwgrad_bwgrad') - - # Test that forward-over-reverse gradgrad is computed correctly - @_gradcheck_ops(op_db) - def test_fn_fwgrad_bwgrad(self, device, dtype, op): - self._skip_helper(op, device, dtype) - - if op.supports_fwgrad_bwgrad: - self._check_helper(device, dtype, op, op.get_op(), "fwgrad_bwgrad") - else: - err_msg = r"Trying to use forward AD with .* that does not support it\." - hint_msg = ("Running forward-over-backward gradgrad for an OP that has does not support it did not " - "raise any error. 
If your op supports forward AD, you should set supports_fwgrad_bwgrad=True.") - with self.assertRaisesRegex(NotImplementedError, err_msg, msg=hint_msg): - self._check_helper(device, dtype, op, op.get_op(), "fwgrad_bwgrad") - - # Test that gradients of gradients are properly raising - @_gradcheck_ops(op_db) - def test_fn_fail_gradgrad(self, device, dtype, op): - self._skip_helper(op, device, dtype) - if op.supports_gradgrad: - self.skipTest("Skipped! Operation does support gradgrad") - - err_msg = r"derivative for .* is not implemented" - with self.assertRaisesRegex(RuntimeError, err_msg): - self._check_helper(device, dtype, op, op.get_op(), 'bwgrad_bwgrad') - - # Method gradgrad (and grad, see above) tests are disabled since they're - # costly and redundant with function gradgrad (and grad) tests - # @_gradcheck_ops(op_db) - # def test_method_gradgrad(self, device, dtype, op): - # self._skip_helper(op, device, dtype) - # self._gradgrad_test_helper(device, dtype, op, op.get_method()) - - @_gradcheck_ops(op_db) - def test_inplace_gradgrad(self, device, dtype, op): - self._skip_helper(op, device, dtype) - if not op.inplace_variant or not op.supports_inplace_autograd: - self.skipTest("Skipped! Operation does not support inplace autograd.") - self._check_helper(device, dtype, op, self._get_safe_inplace(op.get_inplace()), "bwgrad_bwgrad") - - def _forward_grad_helper(self, device, dtype, op, variant, is_inplace): - # TODO: clean up how attributes are passed to gradcheck from OpInfos - def call_grad_test_helper(): - check_batched_forward_grad = ((op.check_batched_forward_grad and not is_inplace) or - (op.check_inplace_batched_forward_grad and is_inplace)) - self._grad_test_helper(device, dtype, op, variant, check_forward_ad=True, check_backward_ad=False, - check_batched_grad=False, check_batched_forward_grad=check_batched_forward_grad) - if op.supports_forward_ad: - call_grad_test_helper() - else: - err_msg = r"Trying to use forward AD with .* that does not support it\." - hint_msg = ("Running forward AD for an OP that has does not support it did not " - "raise any error. If your op supports forward AD, you should set supports_forward_ad=True") - with self.assertRaisesRegex(NotImplementedError, err_msg, msg=hint_msg): - call_grad_test_helper() - - @_gradcheck_ops(op_db) - def test_forward_mode_AD(self, device, dtype, op): - self._skip_helper(op, device, dtype) - - self._forward_grad_helper(device, dtype, op, op.get_op(), is_inplace=False) - - @_gradcheck_ops(op_db) - def test_inplace_forward_mode_AD(self, device, dtype, op): - self._skip_helper(op, device, dtype) - - if not op.inplace_variant or not op.supports_inplace_autograd: - self.skipTest("Skipped! 
Operation does not support inplace autograd.") - - self._forward_grad_helper(device, dtype, op, self._get_safe_inplace(op.get_inplace()), is_inplace=True) - - # Functions that do not support autograd should not fail in forward mode - # Inplace functions (such as "resize_") are expected to fail in forward mode and should be skipped - # Test only when supports_autograd=False and for double dtype - @ops(filter(lambda op: not op.supports_autograd, op_db), dtypes=OpDTypes.supported, allowed_dtypes=(torch.double,)) - def test_nondifferentiable(self, device, dtype, op): - # Expecting no errors - samples = op.sample_inputs(device, dtype, requires_grad=True) - sample = first_sample(self, samples) - result = op(sample.input, *sample.args, **sample.kwargs) - - -# Tests operators for consistency between JIT and eager, also checks -# correctness of JIT specific alias schemas and intended -# autodifferentiation behavior. -# Inherits from JitCommonTestCase instead of TestCase directly to share -# functionality with original test_jit.py method operator tests -class TestJit(JitCommonTestCase): - exact_dtype = True - - # Tests that the forward and backward passes of operations produce the - # same values for the cross-product of op variants (function, method, inplace) - # and runtimes (eager, traced, scripted). - # TODO WARNING: inplace x {traced, scripted} not currently tested - @_variant_ops(op_db) - def test_variant_consistency_jit(self, device, dtype, op): - _requires_grad = op.supports_autograd and (dtype.is_floating_point or - op.supports_complex_autograd(torch.device(device).type)) - - include_conjugated_inputs = op.test_conjugated_samples and dtype.is_complex - samples = op.sample_inputs(device, dtype, requires_grad=_requires_grad, include_conjugated_inputs=include_conjugated_inputs) - - # Acquires variants to test - func = op.get_op() - method = op.get_method() - variants = { - # TODO: inplace tests currently fail, fix and add inplace variant - 'function': func, 'method': method, - } - - # TODO: find better way to standardize on op registration itself.. - has_fake_function = op.name in ["resize_", 'resize_as_'] - - if has_fake_function: - variants = {'method': getattr(torch.Tensor, op.name)} - samples = op.sample_inputs(device, dtype, requires_grad=False) - - support_script = op.supports_scripting + args = [sample.input] + list(sample.args) + kwargs = sample.kwargs + composite_compliance.check_backward_formula(op, args, kwargs) - tested = False - for sample in samples: - # Test traced and scripted consistency - for func_type, variant in variants.items(): - if variant is None: - continue + @unittest.skipIf( + IS_FBCODE or IS_SANDCASTLE, "__torch_dispatch__ does not work in fbcode" + ) + @ops(op_db, allowed_dtypes=(torch.float,)) + def test_forward_ad(self, device, dtype, op): + if torch.float not in op.supported_backward_dtypes(device): + raise unittest.SkipTest("Does not support autograd") - # scripting and check_alias_analysis do not work with lambdas - # lambdas are typically used as a way to simulate methods without - # functional variants, so rely on the other variant for testing - # for now - if is_lambda(variant): - continue + if not op.supports_forward_ad: + raise unittest.SkipTest("Does not support forward_ad") - tested = True - - # Create accessor for script function variant - name = op.name + '_' if func_type == 'inplace' else op.name - - # run with disable_autodiff_subgraph_inlining(True) to test - # autodiff support. 
Context manager forces the graph to contain - # DifferentiableGraph nodes if they are present - with disable_autodiff_subgraph_inlining(): - # Check scripted forward, grad, and grad grad - if support_script: - script_fn = create_script_fn(self, name, func_type) - - def out_fn(output): - # Processes the output for autograd - if sample.output_process_fn_grad is not None: - return sample.output_process_fn_grad(output) - return output - - def get_sample(): - return clone_input_helper(sample.input) if op.name[-1] == '_' else sample.input - - if support_script: - check_against_reference(self, - script_fn, - func, - out_fn, - (get_sample(),) + sample.args, - sample.kwargs, - no_grad=not _requires_grad, no_gradgrad=not op.supports_gradgrad) - - # Check traced forward, grad, and grad grad - # TODO: fix tracing here - supports_tracing = not has_fake_function - if op.assert_jit_shape_analysis: - self.assertTrue(supports_tracing) - - if supports_tracing: - traced_fn = create_traced_fn(self, variant) - check_against_reference(self, - traced_fn, - func, - out_fn, - (get_sample(),) + sample.args, - sample.kwargs, - no_grad=not _requires_grad, no_gradgrad=not op.supports_gradgrad) - - # Check alias annotation schema for correctness (make - # sure inputs that aren't supposed to be modified aren't) - # Note: only runs in float32 because schema isn't affected by dtype, - # so running it on all dtypes is would be excessive - if dtype == torch.float32: - # TODO: no reason why we cant run this with tracing graph - if support_script and op.name != "rsub": - check_alias_annotation(name, (get_sample(),) + sample.args, sample.kwargs, - func_type=func_type, aten_name=op.aten_name) - - # TODO: use script graph as well - checked_shape_analysis = False - if supports_tracing: - out = variant(get_sample(), *sample.args, **sample.kwargs) - - # right now, tuple of outputs and tensor output supported - # TODO: list of tensor outputs - tuple_of_tensors = isinstance(out, tuple) and all([isinstance(elem, torch.Tensor) for elem in out]) - - if isinstance(out, torch.Tensor) or tuple_of_tensors: - if tuple_of_tensors: - sizes = [elem.size() for elem in out] - else: - sizes = out.size() - self.checkShapeAnalysis(sizes, traced_fn.graph, op.assert_jit_shape_analysis) - checked_shape_analysis = True - if op.assert_jit_shape_analysis: - self.assertTrue(checked_shape_analysis) - - # Check autodifferentiation of nodes for traced and scripted graphs, only need to check once per sample - if dtype is torch.float32: - # Sandcastle doesn't fuse nodes - if IS_SANDCASTLE: - # fusible nodes are expected to be found in FusionGroups in the DifferentiableGraphs - nonfusible_nodes = op.autodiff_nonfusible_nodes + op.autodiff_fusible_nodes - fusible_nodes = [] - else: - nonfusible_nodes = op.autodiff_nonfusible_nodes - fusible_nodes = op.autodiff_fusible_nodes - - if supports_tracing: - self.assertAutodiffNode(traced_fn.last_graph, op.assert_autodiffed, nonfusible_nodes, fusible_nodes) - if support_script: - self.assertAutodiffNode(script_fn.last_graph, op.assert_autodiffed, nonfusible_nodes, fusible_nodes) - assert tested, "JIT Test does not execute any logic" - - # alias testing is only done with torch.float for the same reason - _alias_ops = partial(ops, dtypes=OpDTypes.supported, - allowed_dtypes=(torch.float,)) - - @_alias_ops((op for op in op_db if op.aliases)) - def test_jit_alias_remapping(self, device, dtype, op): - # Required to avoid undefined value: tensor error in JIT compilation of the function template - tensor = torch.tensor - - # 
NOTE: only tests on first sample samples = op.sample_inputs(device, dtype, requires_grad=True) - sample = first_sample(self, samples) - - # [Scripting Data Preparation] - # Prepare data for test scripting - # Below we prepare strings of args/kwargs with and without type annotations. - # These strings are inserted into function template strings which is then torch scripted. - # - args string is ["t0"] corresponding to the "input" tensor required by the op - # - args_kw is the value of args and strings of kwargs used to call the op (without type annotations), for example, - # ["to", "1.0", "(1,)", "True", "tensor(1.0)"] -> def fn(t0): return variant(t0, 1.0, (1,), True, tensor(1.0)) - args = ["t0"] - - def quote_strs(v): - if isinstance(v, str): - return f"'{v}'" - - return str(v) - - args_kw = args + \ - [f"{v}" for v in sample.args] + \ - [f"{k}={quote_strs(v)}" for k, v in sample.kwargs.items()] - - # Prepare data for test tracing - sample_args_kwargs = () - if len(sample.args) > 0: - sample_args_kwargs += (sample.args, ) - if len(sample.kwargs) > 0: - sample_args_kwargs += (sample.kwargs, ) - - original_name = op.aten_name - original_name_inplace = original_name + "_" - expected_dtype = op(sample.input, *sample.args, **sample.kwargs).dtype - for a_op in op.aliases: - inplace = a_op.inplace_variant - method_or_inplace = [a_op.inplace_variant, a_op.method_variant] - variants = (v for v in (a_op.op, a_op.method_variant, a_op.inplace_variant) if v is not None) - - # Test scripting: - for variant in variants: - variant_name = variant.__name__ - op_name = original_name_inplace if variant is inplace else original_name - - if variant in method_or_inplace: - fn_template = ''' - def _fn(t0{c}): - return t0.{alias_name}({args_kw}) - ''' - # remove the first input tensor - script = fn_template.format( - c=", " if len(args_kw[1:]) > 1 else "", - args_kw=", ".join(args_kw[1:]), - alias_name=variant_name, - ) - else: - fn_template = ''' - def _fn({args}): - return variant({args_kw}) - ''' - script = fn_template.format( - args=", ".join(args), - args_kw=", ".join(args_kw), - ) - scripted = torch.jit.CompilationUnit(script)._fn + for sample in samples: + args = [sample.input] + list(sample.args) + kwargs = sample.kwargs + composite_compliance.check_forward_ad_formula(op, args, kwargs) - if (variant is inplace and not torch.can_cast(expected_dtype, dtype)): - try: - inp = clone_input_helper(sample.input) - scripted(inp) - except Exception as e: - continue - self.fail("Inplace operation on integer tensor that should be promoted to float didn't fail!") - - inp = clone_input_helper(sample.input) - scripted(inp) - inp = clone_input_helper(sample.input) - graph = scripted.graph_for(inp) - FileCheck().check(op.aten_name).check_not(variant_name).run(graph) - - # Test tracing: - for variant in variants: - variant_name = variant.__name__ - op_name = original_name_inplace if variant is inplace else original_name - - def _fn(*sample_args, **sample_kwargs): - return variant(*sample_args, **sample_kwargs) - - inp = (clone_input_helper(sample.input),) + sample_args_kwargs - traced = torch.jit.trace(_fn, *inp) - inp = (clone_input_helper(sample.input),) + sample_args_kwargs - traced(*inp) - inp = (clone_input_helper(sample.input),) + sample_args_kwargs - graph = traced.graph_for(*inp) - FileCheck().check(op_name).check_not(variant_name).run(graph) class TestMathBits(TestCase): # Tests that @@ -1196,7 +1076,17 @@ class TestMathBits(TestCase): # This test only runs for C -> R and C -> C functions # TODO: add tests for 
`R->C` functions # Note: This test runs for functions that take both tensors and tensorlists as input. - def _test_math_view(self, device, dtype, op, samples, math_op_physical, math_op_view, is_bit_set, out_type): + def _test_math_view( + self, + device, + dtype, + op, + samples, + math_op_physical, + math_op_view, + is_bit_set, + out_type, + ): inplace_variant = op.inplace_variant # helper function to clone and conjugate/negate the input if its a tensor @@ -1205,7 +1095,7 @@ def _test_math_view(self, device, dtype, op, samples, math_op_physical, math_op_ # have its requires_grad set to that value. def clone_and_perform_view(input, **kwargs): if isinstance(input, torch.Tensor): - requires_grad = kwargs.get('requires_grad', input.requires_grad) + requires_grad = kwargs.get("requires_grad", input.requires_grad) with torch.no_grad(): # Ensure view represents the original sample input input = math_op_physical(input) @@ -1222,7 +1112,11 @@ def clone_and_perform_view(input, **kwargs): return tuple(out) for sample in samples: - tensor = sample.input if isinstance(sample.input, torch.Tensor) else sample.input[0] + tensor = ( + sample.input + if isinstance(sample.input, torch.Tensor) + else sample.input[0] + ) cloned1 = clone_and_perform_view(sample.input) # Computes function forward value with a physically conjugated/negated tensor and @@ -1236,9 +1130,13 @@ def clone_and_perform_view(input, **kwargs): # input produces correct output, and the output tensor has the conj/neg bit set to True if inplace_variant is not None and not sample.broadcasts_input: cloned2 = clone_and_perform_view(tensor, requires_grad=False) - if (isinstance(expected_forward, torch.Tensor) and - expected_forward.dtype is tensor.dtype): - inplace_forward = inplace_variant(cloned2, *sample.args, **sample.kwargs) + if ( + isinstance(expected_forward, torch.Tensor) + and expected_forward.dtype is tensor.dtype + ): + inplace_forward = inplace_variant( + cloned2, *sample.args, **sample.kwargs + ) self.assertTrue(is_bit_set(inplace_forward)) self.assertEqual(inplace_forward, expected_forward) @@ -1247,40 +1145,62 @@ def clone_and_perform_view(input, **kwargs): # tensor inputs # TODO: update to handle checking grads of all tensor inputs as # derived from each tensor output - if isinstance(expected_forward, torch.Tensor) and expected_forward.requires_grad: + if ( + isinstance(expected_forward, torch.Tensor) + and expected_forward.requires_grad + ): output_process_fn_grad = sample.output_process_fn_grad or (lambda x: x) expected_forward = output_process_fn_grad(expected_forward) forward_with_mathview = output_process_fn_grad(forward_with_mathview) - tensor = sample.input if isinstance(sample.input, torch.Tensor) else sample.input[0] + tensor = ( + sample.input + if isinstance(sample.input, torch.Tensor) + else sample.input[0] + ) expected_forward.sum().backward(retain_graph=True) forward_with_mathview.sum().backward(retain_graph=True) if tensor.grad is not None: - cloned1_tensor = cloned1 if isinstance(cloned1, torch.Tensor) else cloned1[0] + cloned1_tensor = ( + cloned1 if isinstance(cloned1, torch.Tensor) else cloned1[0] + ) self.assertEqual(tensor.grad, cloned1_tensor.grad) tensor.grad, cloned1_tensor.grad = None, None # a repeat of the above test if output is not complex valued - if (out_type(expected_forward)): + if out_type(expected_forward): grad = torch.randn_like(expected_forward) expected_forward.backward(grad) - forward_with_mathview.backward(math_op_view(math_op_physical(grad))) + forward_with_mathview.backward( + 
math_op_view(math_op_physical(grad)) + ) self.assertEqual(tensor.grad, cloned1_tensor.grad) - @ops(op_db, allowed_dtypes=(torch.cfloat,)) + @ops(ops_and_refs, allowed_dtypes=(torch.cfloat,)) def test_conj_view(self, device, dtype, op): if not op.test_conjugated_samples: self.skipTest("Operation doesn't support conjugated inputs.") math_op_physical = torch.conj_physical math_op_view = torch.conj - _requires_grad = (op.supports_autograd and op.supports_complex_autograd(torch.device(device).type)) + _requires_grad = torch.cfloat in op.supported_backward_dtypes( + torch.device(device).type + ) is_bit_set = torch.is_conj samples = op.sample_inputs(device, dtype, requires_grad=_requires_grad) - self._test_math_view(device, dtype, op, samples, math_op_physical, math_op_view, is_bit_set, torch.is_complex) - - @ops(op_db, allowed_dtypes=(torch.double,)) + self._test_math_view( + device, + dtype, + op, + samples, + math_op_physical, + math_op_view, + is_bit_set, + torch.is_complex, + ) + + @ops(ops_and_refs, allowed_dtypes=(torch.double,)) def test_neg_view(self, device, dtype, op): if not op.test_neg_view: self.skipTest("Operation not tested with tensors with negative bit.") @@ -1288,10 +1208,18 @@ def test_neg_view(self, device, dtype, op): math_op_view = torch._neg_view is_bit_set = torch.is_neg samples = op.sample_inputs(device, dtype, requires_grad=op.supports_autograd) - self._test_math_view(device, dtype, op, samples, math_op_physical, math_op_view, is_bit_set, - lambda x: True) - - @ops(op_db, allowed_dtypes=(torch.cdouble,)) + self._test_math_view( + device, + dtype, + op, + samples, + math_op_physical, + math_op_view, + is_bit_set, + lambda x: True, + ) + + @ops(ops_and_refs, allowed_dtypes=(torch.cdouble,)) def test_neg_conj_view(self, device, dtype, op): if not op.test_neg_view: self.skipTest("Operation not tested with tensors with negative bit.") @@ -1307,18 +1235,27 @@ def math_op_view(x): def is_bit_set(x): return torch.is_neg(x) and torch.is_conj(x) - _requires_grad = (op.supports_autograd and op.supports_complex_autograd(torch.device(device).type)) + _requires_grad = dtype in op.supported_backward_dtypes( + torch.device(device).type + ) samples = op.sample_inputs(device, dtype, requires_grad=_requires_grad) # Only test one sample samples = itertools.islice(samples, 1) - self._test_math_view(device, dtype, op, samples, math_op_physical, math_op_view, is_bit_set, - torch.is_complex) + self._test_math_view( + device, + dtype, + op, + samples, + math_op_physical, + math_op_view, + is_bit_set, + torch.is_complex, + ) instantiate_device_type_tests(TestCommon, globals()) -instantiate_device_type_tests(TestGradients, globals()) -instantiate_device_type_tests(TestJit, globals()) +instantiate_device_type_tests(TestCompositeCompliance, globals()) instantiate_device_type_tests(TestMathBits, globals()) -if __name__ == '__main__': +if __name__ == "__main__": run_tests() diff --git a/test/test_ops_gradients.py b/test/test_ops_gradients.py new file mode 100644 index 000000000000..64cd71fdee6d --- /dev/null +++ b/test/test_ops_gradients.py @@ -0,0 +1,264 @@ +# Owner(s): ["module: unknown"] + +from functools import partial, wraps +from itertools import chain +import torch + +from torch.testing._internal.common_utils import \ + (TestCase, is_iterable_of_tensors, run_tests, gradcheck, gradgradcheck) +from torch.testing._internal.common_methods_invocations import op_db +from torch.testing._internal.common_device_type import \ + (instantiate_device_type_tests, ops, OpDTypes) + +# TODO: fixme 
https://github.com/pytorch/pytorch/issues/68972 +torch.set_default_dtype(torch.float32) + +# gradcheck requires double precision +_gradcheck_ops = partial(ops, dtypes=OpDTypes.supported, + allowed_dtypes=[torch.double, torch.cdouble]) + +class TestGradients(TestCase): + exact_dtype = True + + # Copies inputs to inplace operations to avoid inplace modifications + # to leaves requiring gradient + def _get_safe_inplace(self, inplace_variant): + @wraps(inplace_variant) + def _fn(t, *args, **kwargs): + return inplace_variant(t.clone(), *args, **kwargs) + + return _fn + + def _check_helper(self, device, dtype, op, variant, check, *, check_forward_ad=False, check_backward_ad=True, + check_batched_grad=None, check_batched_forward_grad=False): + assert check in ('gradcheck', 'bwgrad_bwgrad', 'fwgrad_bwgrad') + # NB: check_backward_ad does not affect gradgradcheck (always True) + if variant is None: + self.skipTest("Skipped! Variant not implemented.") + if not op.supports_dtype(dtype, torch.device(device).type): + self.skipTest(f"Skipped! {op.name} does not support dtype {str(dtype)}") + + def is_inplace(variant): + if hasattr(variant, "__wrapped__"): + return variant.__wrapped__ is op.get_inplace() + return variant is op.get_inplace() + + include_conjugated_inputs = op.test_conjugated_samples and dtype.is_complex + samples = op.sample_inputs(device, dtype, requires_grad=True, include_conjugated_inputs=include_conjugated_inputs) + + for sample in samples: + if sample.broadcasts_input and is_inplace(variant): + continue + + # Gradcheck expects tensors as its input, but autograd actually supports tensorlists + # and tensors passed as kwargs. The following creates a function that accepts just + # the tensors that require grad as varargs, and then recomposes them back into the + # original input. 
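# Illustrative sketch (hypothetical toy function, not one of the op_db samples): a minimal
# standalone gradcheck call, showing why the helpers above flatten each sample into a tuple
# of double-precision tensors before handing it to gradcheck, which compares analytical
# gradients against numerically estimated ones.
import torch
from torch.autograd import gradcheck

def _toy_fn(x, y):
    # Any differentiable composition of ops works here.
    return (x * y).sin().sum()

_x = torch.randn(3, dtype=torch.double, requires_grad=True)
_y = torch.randn(3, dtype=torch.double, requires_grad=True)
assert gradcheck(_toy_fn, (_x, _y))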
+ + # Creates gradcheck inputs by identifying tensors requiring grad + all_args = None + if is_iterable_of_tensors(sample.input): + all_args = chain(sample.input, sample.args, sample.kwargs.values()) + else: + all_args = tuple(chain((sample.input,), sample.args, sample.kwargs.values())) + gradcheck_args = tuple(x for x in all_args if (isinstance(x, torch.Tensor) and x.requires_grad)) + + def _input_recomposition_helper(inputs, inp, input_idx): + if is_iterable_of_tensors(inp): + tensor_list = [] + for x in inp: + if isinstance(x, torch.Tensor) and x.requires_grad: + tensor_list.append(inputs[input_idx]) + input_idx = input_idx + 1 + else: + tensor_list.append(x) + return tensor_list, input_idx + elif isinstance(inp, torch.Tensor) and inp.requires_grad: + return inputs[input_idx], input_idx + 1 + else: + return inp, input_idx + + def fn(*inputs): + # Puts inputs back into sample properly + positional_args = [] + input_idx = 0 + inp, input_idx = _input_recomposition_helper(inputs, sample.input, input_idx) + positional_args.append(inp) + + for x in sample.args: + inp, input_idx = _input_recomposition_helper(inputs, x, input_idx) + positional_args.append(inp) + + # Recreates kwargs + kwargs = {} + for k, v in sample.kwargs.items(): + inp, input_idx = _input_recomposition_helper(inputs, v, input_idx) + kwargs[k] = inp + + output = op.gradcheck_wrapper(variant, *positional_args, **kwargs) + if sample.output_process_fn_grad is not None: + return sample.output_process_fn_grad(output) + return output + + if check == 'gradcheck': + if check_batched_grad is None: + check_batched_grad = op.check_batched_grad + self.assertTrue(gradcheck(fn, gradcheck_args, + check_batched_grad=check_batched_grad, + check_grad_dtypes=True, + nondet_tol=op.gradcheck_nondet_tol, + fast_mode=op.gradcheck_fast_mode, + check_forward_ad=check_forward_ad, + check_backward_ad=check_backward_ad, + check_undefined_grad=True, + check_batched_forward_grad=check_batched_forward_grad)) + elif check in ('bwgrad_bwgrad', 'fwgrad_bwgrad'): # gradgrad check + self.assertFalse(check_forward_ad, msg="Cannot run forward AD check for gradgradcheck") + for gen_non_contig_grad_outputs in (False, True): + kwargs = { + "gen_non_contig_grad_outputs": gen_non_contig_grad_outputs, + "check_batched_grad": op.check_batched_gradgrad, + "check_grad_dtypes": True, + "nondet_tol": op.gradcheck_nondet_tol, + "fast_mode": op.gradcheck_fast_mode + } + if check == "fwgrad_bwgrad": + kwargs["check_fwd_over_rev"] = True + kwargs["check_rev_over_rev"] = False + kwargs["check_batched_grad"] = False + kwargs["check_undefined_grad"] = False + + self.assertTrue(gradgradcheck(fn, gradcheck_args, **kwargs)) + else: + self.assertTrue(False, msg="Unknown check requested!") + + def _grad_test_helper(self, device, dtype, op, variant, *, check_forward_ad=False, check_backward_ad=True, + check_batched_grad=None, check_batched_forward_grad=False): + return self._check_helper(device, dtype, op, variant, 'gradcheck', check_forward_ad=check_forward_ad, + check_backward_ad=check_backward_ad, check_batched_grad=check_batched_grad, + check_batched_forward_grad=check_batched_forward_grad) + + def _skip_helper(self, op, device, dtype): + if dtype not in op.supported_backward_dtypes(torch.device(device).type): + self.skipTest("Skipped! Op doesn't support autograd for this dtype.") + if not op.supports_autograd and not op.supports_forward_ad: + self.skipTest("Skipped! 
autograd not supported.") + + # Tests that gradients are computed correctly + @_gradcheck_ops(op_db) + def test_fn_grad(self, device, dtype, op): + # This is verified by test_dtypes in test_ops.py + if dtype not in op.supported_backward_dtypes(torch.device(device).type): + self.skipTest("Skipped! Dtype is not in supported backward dtypes!") + else: + self._grad_test_helper(device, dtype, op, op.get_op()) + + # Method grad (and gradgrad, see below) tests are disabled since they're + # costly and redundant with function grad (and gradgad) tests + # @_gradcheck_ops(op_db) + # def test_method_grad(self, device, dtype, op): + # self._skip_helper(op, device, dtype) + # self._grad_test_helper(device, dtype, op, op.get_method()) + + @_gradcheck_ops(op_db) + def test_inplace_grad(self, device, dtype, op): + self._skip_helper(op, device, dtype) + if not op.inplace_variant: + self.skipTest("Op has no inplace variant!") + + # Verifies an operation doesn't support inplace autograd if it claims not to + if not op.supports_inplace_autograd: + inplace = self._get_safe_inplace(op.get_inplace()) + for sample in op.sample_inputs(device, dtype, requires_grad=True): + if sample.broadcasts_input: + continue + with self.assertRaises(Exception): + result = inplace(sample) + result.sum().backward() + else: + self._grad_test_helper(device, dtype, op, self._get_safe_inplace(op.get_inplace())) + + # Test that gradients of gradients are computed correctly + @_gradcheck_ops(op_db) + def test_fn_gradgrad(self, device, dtype, op): + self._skip_helper(op, device, dtype) + if not op.supports_gradgrad: + self.skipTest("Op claims it doesn't support gradgrad. This is not verified.") + else: + self._check_helper(device, dtype, op, op.get_op(), 'bwgrad_bwgrad') + + # Test that forward-over-reverse gradgrad is computed correctly + @_gradcheck_ops(op_db) + def test_fn_fwgrad_bwgrad(self, device, dtype, op): + self._skip_helper(op, device, dtype) + + if op.supports_fwgrad_bwgrad: + self._check_helper(device, dtype, op, op.get_op(), "fwgrad_bwgrad") + else: + err_msg = r"Trying to use forward AD with .* that does not support it" + hint_msg = ("Running forward-over-backward gradgrad for an OP that has does not support it did not " + "raise any error. If your op supports forward AD, you should set supports_fwgrad_bwgrad=True.") + with self.assertRaisesRegex(NotImplementedError, err_msg, msg=hint_msg): + self._check_helper(device, dtype, op, op.get_op(), "fwgrad_bwgrad") + + # Test that gradients of gradients are properly raising + @_gradcheck_ops(op_db) + def test_fn_fail_gradgrad(self, device, dtype, op): + self._skip_helper(op, device, dtype) + if op.supports_gradgrad: + self.skipTest("Skipped! Operation does support gradgrad") + + err_msg = r"derivative for .* is not implemented" + with self.assertRaisesRegex(RuntimeError, err_msg): + self._check_helper(device, dtype, op, op.get_op(), 'bwgrad_bwgrad') + + # Method gradgrad (and grad, see above) tests are disabled since they're + # costly and redundant with function gradgrad (and grad) tests + # @_gradcheck_ops(op_db) + # def test_method_gradgrad(self, device, dtype, op): + # self._skip_helper(op, device, dtype) + # self._gradgrad_test_helper(device, dtype, op, op.get_method()) + + @_gradcheck_ops(op_db) + def test_inplace_gradgrad(self, device, dtype, op): + self._skip_helper(op, device, dtype) + if not op.inplace_variant or not op.supports_inplace_autograd: + self.skipTest("Skipped! 
Operation does not support inplace autograd.") + self._check_helper(device, dtype, op, self._get_safe_inplace(op.get_inplace()), "bwgrad_bwgrad") + + def _forward_grad_helper(self, device, dtype, op, variant, is_inplace): + # TODO: clean up how attributes are passed to gradcheck from OpInfos + def call_grad_test_helper(): + check_batched_forward_grad = ((op.check_batched_forward_grad and not is_inplace) or + (op.check_inplace_batched_forward_grad and is_inplace)) + self._grad_test_helper(device, dtype, op, variant, check_forward_ad=True, check_backward_ad=False, + check_batched_grad=False, check_batched_forward_grad=check_batched_forward_grad) + if op.supports_forward_ad: + call_grad_test_helper() + else: + err_msg = r"Trying to use forward AD with .* that does not support it" + hint_msg = ("Running forward AD for an OP that has does not support it did not " + "raise any error. If your op supports forward AD, you should set supports_forward_ad=True") + with self.assertRaisesRegex(NotImplementedError, err_msg, msg=hint_msg): + call_grad_test_helper() + + @_gradcheck_ops(op_db) + def test_forward_mode_AD(self, device, dtype, op): + self._skip_helper(op, device, dtype) + + self._forward_grad_helper(device, dtype, op, op.get_op(), is_inplace=False) + + @_gradcheck_ops(op_db) + def test_inplace_forward_mode_AD(self, device, dtype, op): + self._skip_helper(op, device, dtype) + + if not op.inplace_variant or not op.supports_inplace_autograd: + self.skipTest("Skipped! Operation does not support inplace autograd.") + + self._forward_grad_helper(device, dtype, op, self._get_safe_inplace(op.get_inplace()), is_inplace=True) + + +instantiate_device_type_tests(TestGradients, globals()) + +if __name__ == '__main__': + run_tests() diff --git a/test/test_ops_jit.py b/test/test_ops_jit.py new file mode 100644 index 000000000000..e8b914f8072f --- /dev/null +++ b/test/test_ops_jit.py @@ -0,0 +1,279 @@ +# Owner(s): ["module: unknown"] + +from functools import partial + +import torch + +from torch.testing import FileCheck +from torch.testing._internal.common_utils import \ + (run_tests, IS_SANDCASTLE, clone_input_helper, first_sample) +from torch.testing._internal.common_methods_invocations import op_db +from torch.testing._internal.common_device_type import instantiate_device_type_tests, ops, OpDTypes +from torch.testing._internal.common_jit import JitCommonTestCase, check_against_reference +from torch.testing._internal.jit_metaprogramming_utils import create_script_fn, create_traced_fn, check_alias_annotation +from torch.testing._internal.jit_utils import disable_autodiff_subgraph_inlining, is_lambda + + +# TODO: fixme https://github.com/pytorch/pytorch/issues/68972 +torch.set_default_dtype(torch.float32) + +# variant testing is only done with torch.float and torch.cfloat to avoid +# excessive test times and maximize signal to noise ratio +_variant_ops = partial(ops, dtypes=OpDTypes.supported, + allowed_dtypes=(torch.float, torch.cfloat)) + + + +# Tests operators for consistency between JIT and eager, also checks +# correctness of JIT specific alias schemas and intended +# autodifferentiation behavior. +# Inherits from JitCommonTestCase instead of TestCase directly to share +# functionality with original test_jit.py method operator tests +class TestJit(JitCommonTestCase): + exact_dtype = True + + # Tests that the forward and backward passes of operations produce the + # same values for the cross-product of op variants (function, method, inplace) + # and runtimes (eager, traced, scripted). 
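# Illustrative sketch of the eager/scripted/traced consistency being checked
# (a hypothetical toy function stands in for the op_db entries iterated below):
import torch

def _toy(x):
    return torch.relu(x) + 1

_inp = torch.randn(4)
_scripted = torch.jit.script(_toy)
_traced = torch.jit.trace(_toy, (_inp,))
assert torch.equal(_toy(_inp), _scripted(_inp))
assert torch.equal(_toy(_inp), _traced(_inp))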
+ # TODO WARNING: inplace x {traced, scripted} not currently tested + @_variant_ops(op_db) + def test_variant_consistency_jit(self, device, dtype, op): + _requires_grad = (dtype in op.supported_backward_dtypes(torch.device(device).type)) + + include_conjugated_inputs = op.test_conjugated_samples and dtype.is_complex + samples = op.sample_inputs(device, dtype, requires_grad=_requires_grad, include_conjugated_inputs=include_conjugated_inputs) + + # Acquires variants to test + func = op.get_op() + method = op.get_method() + variants = { + # TODO: inplace tests currently fail, fix and add inplace variant + 'function': func, 'method': method, + } + + # TODO: find better way to standardize on op registration itself.. + has_fake_function = op.name in ["resize_", 'resize_as_'] + + if has_fake_function: + variants = {'method': getattr(torch.Tensor, op.name)} + samples = op.sample_inputs(device, dtype, requires_grad=False) + + support_script = op.supports_scripting + + tested = False + for sample in samples: + # Test traced and scripted consistency + for func_type, variant in variants.items(): + if variant is None: + continue + + # scripting and check_alias_analysis do not work with lambdas + # lambdas are typically used as a way to simulate methods without + # functional variants, so rely on the other variant for testing + # for now + if is_lambda(variant): + continue + + tested = True + + # Create accessor for script function variant + name = op.name + '_' if func_type == 'inplace' else op.name + + # run with disable_autodiff_subgraph_inlining(True) to test + # autodiff support. Context manager forces the graph to contain + # DifferentiableGraph nodes if they are present + with disable_autodiff_subgraph_inlining(): + # Check scripted forward, grad, and grad grad + if support_script: + script_fn = create_script_fn(self, name, func_type) + + def out_fn(output): + # Processes the output for autograd + if sample.output_process_fn_grad is not None: + return sample.output_process_fn_grad(output) + return output + + def get_sample(): + return clone_input_helper(sample.input) if op.name[-1] == '_' else sample.input + + if support_script: + check_against_reference(self, + script_fn, + func, + out_fn, + (get_sample(),) + sample.args, + sample.kwargs, + no_grad=not _requires_grad, no_gradgrad=not op.supports_gradgrad) + + # Check traced forward, grad, and grad grad + # TODO: fix tracing here + supports_tracing = not has_fake_function + if op.assert_jit_shape_analysis: + self.assertTrue(supports_tracing) + + if supports_tracing: + traced_fn = create_traced_fn(self, variant) + check_against_reference(self, + traced_fn, + func, + out_fn, + (get_sample(),) + sample.args, + sample.kwargs, + no_grad=not _requires_grad, no_gradgrad=not op.supports_gradgrad) + + # Check alias annotation schema for correctness (make + # sure inputs that aren't supposed to be modified aren't) + # Note: only runs in float32 because schema isn't affected by dtype, + # so running it on all dtypes is would be excessive + if dtype == torch.float32: + # TODO: no reason why we cant run this with tracing graph + if support_script and op.name != "rsub": + check_alias_annotation(name, (get_sample(),) + sample.args, sample.kwargs, + func_type=func_type, aten_name=op.aten_name) + + # TODO: use script graph as well + checked_shape_analysis = False + if supports_tracing: + out = variant(get_sample(), *sample.args, **sample.kwargs) + + # right now, tuple of outputs and tensor output supported + # TODO: list of tensor outputs + tuple_of_tensors = 
isinstance(out, tuple) and all([isinstance(elem, torch.Tensor) for elem in out]) + + if isinstance(out, torch.Tensor) or tuple_of_tensors: + if tuple_of_tensors: + sizes = [elem.size() for elem in out] + else: + sizes = out.size() + self.checkShapeAnalysis(sizes, traced_fn.graph, op.assert_jit_shape_analysis) + checked_shape_analysis = True + if op.assert_jit_shape_analysis: + self.assertTrue(checked_shape_analysis) + + # Check autodifferentiation of nodes for traced and scripted graphs, only need to check once per sample + if dtype is torch.float32: + # Sandcastle doesn't fuse nodes + if IS_SANDCASTLE: + # fusible nodes are expected to be found in FusionGroups in the DifferentiableGraphs + nonfusible_nodes = op.autodiff_nonfusible_nodes + op.autodiff_fusible_nodes + fusible_nodes = [] + else: + nonfusible_nodes = op.autodiff_nonfusible_nodes + fusible_nodes = op.autodiff_fusible_nodes + + if supports_tracing: + self.assertAutodiffNode(traced_fn.last_graph, op.assert_autodiffed, nonfusible_nodes, fusible_nodes) + if support_script: + self.assertAutodiffNode(script_fn.last_graph, op.assert_autodiffed, nonfusible_nodes, fusible_nodes) + assert tested, "JIT Test does not execute any logic" + + # alias testing is only done with torch.float for the same reason + _alias_ops = partial(ops, dtypes=OpDTypes.supported, + allowed_dtypes=(torch.float,)) + + @_alias_ops((op for op in op_db if op.aliases)) + def test_jit_alias_remapping(self, device, dtype, op): + # Required to avoid undefined value: tensor error in JIT compilation of the function template + tensor = torch.tensor + + # NOTE: only tests on first sample + samples = op.sample_inputs(device, dtype, requires_grad=True) + sample = first_sample(self, samples) + + # [Scripting Data Preparation] + # Prepare data for test scripting + # Below we prepare strings of args/kwargs with and without type annotations. + # These strings are inserted into function template strings which is then torch scripted. 
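# Illustrative sketch of the template-string scripting pattern described above
# (Tensor.absolute, an alias of abs, is used purely for illustration):
import torch

_fn_template = '''
def _fn(t0):
    return t0.{alias_name}()
'''
_scripted = torch.jit.CompilationUnit(_fn_template.format(alias_name="absolute"))._fn
assert torch.equal(_scripted(torch.tensor([-1.0])), torch.tensor([1.0]))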
+ # - args string is ["t0"] corresponding to the "input" tensor required by the op + # - args_kw is the value of args and strings of kwargs used to call the op (without type annotations), for example, + # ["to", "1.0", "(1,)", "True", "tensor(1.0)"] -> def fn(t0): return variant(t0, 1.0, (1,), True, tensor(1.0)) + args = ["t0"] + + def quote_strs(v): + if isinstance(v, str): + return f"'{v}'" + + return str(v) + + args_kw = args + \ + [f"{v}" for v in sample.args] + \ + [f"{k}={quote_strs(v)}" for k, v in sample.kwargs.items()] + + # Prepare data for test tracing + sample_args_kwargs = () + if len(sample.args) > 0: + sample_args_kwargs += (sample.args, ) + if len(sample.kwargs) > 0: + sample_args_kwargs += (sample.kwargs, ) + + original_name = op.aten_name + original_name_inplace = original_name + "_" + expected_dtype = op(sample.input, *sample.args, **sample.kwargs).dtype + + for a_op in op.aliases: + inplace = a_op.inplace_variant + method_or_inplace = [a_op.inplace_variant, a_op.method_variant] + variants = (v for v in (a_op.op, a_op.method_variant, a_op.inplace_variant) if v is not None) + + # Test scripting: + for variant in variants: + variant_name = variant.__name__ + op_name = original_name_inplace if variant is inplace else original_name + + if variant in method_or_inplace: + fn_template = ''' + def _fn(t0{c}): + return t0.{alias_name}({args_kw}) + ''' + # remove the first input tensor + script = fn_template.format( + c=", " if len(args_kw[1:]) > 1 else "", + args_kw=", ".join(args_kw[1:]), + alias_name=variant_name, + ) + else: + fn_template = ''' + def _fn({args}): + return variant({args_kw}) + ''' + script = fn_template.format( + args=", ".join(args), + args_kw=", ".join(args_kw), + ) + scripted = torch.jit.CompilationUnit(script)._fn + + if (variant is inplace and not torch.can_cast(expected_dtype, dtype)): + try: + inp = clone_input_helper(sample.input) + scripted(inp) + except Exception as e: + continue + self.fail("Inplace operation on integer tensor that should be promoted to float didn't fail!") + + inp = clone_input_helper(sample.input) + scripted(inp) + inp = clone_input_helper(sample.input) + graph = scripted.graph_for(inp) + FileCheck().check(op.aten_name).check_not(variant_name).run(graph) + + # Test tracing: + for variant in variants: + variant_name = variant.__name__ + op_name = original_name_inplace if variant is inplace else original_name + + def _fn(*sample_args, **sample_kwargs): + return variant(*sample_args, **sample_kwargs) + + inp = (clone_input_helper(sample.input),) + sample_args_kwargs + traced = torch.jit.trace(_fn, *inp) + inp = (clone_input_helper(sample.input),) + sample_args_kwargs + traced(*inp) + inp = (clone_input_helper(sample.input),) + sample_args_kwargs + graph = traced.graph_for(*inp) + FileCheck().check(op_name).check_not(variant_name).run(graph) + + +instantiate_device_type_tests(TestJit, globals()) + +if __name__ == '__main__': + run_tests() diff --git a/test/test_optim.py b/test/test_optim.py index 061f8a44765c..6d587b4b352d 100644 --- a/test/test_optim.py +++ b/test/test_optim.py @@ -20,8 +20,7 @@ _LRScheduler, CyclicLR, CosineAnnealingWarmRestarts, OneCycleLR, ChainedScheduler, \ EPOCH_DEPRECATION_WARNING from torch.optim.swa_utils import AveragedModel, SWALR, update_bn -from torch.testing._internal.common_utils import TestCase, run_tests, TEST_WITH_UBSAN, load_tests, \ - skipIfRocm +from torch.testing._internal.common_utils import TestCase, run_tests, TEST_WITH_UBSAN, load_tests # load_tests from common_utils is used to automatically 
filter tests for # sharding on sandcastle. This line silences flake warnings load_tests = load_tests @@ -228,6 +227,12 @@ def fn_base(optimizer, weight, bias): # Make sure state dict wasn't modified self.assertEqual(state_dict, state_dict_c) + # Make sure that device of state['step'] is still CPU + new_state_dict = optimizer_cuda.state_dict() + if 'step' in state_dict['state'][0] and torch.is_tensor(state_dict['state'][0]['step']): + for state in new_state_dict['state'].values(): + self.assertEqual(state['step'].device.type, 'cpu') + for _i in range(20): optimizer.step(fn) optimizer_cuda.step(fn_cuda) @@ -481,9 +486,8 @@ def test_multi_tensor_optimizers(self): loss.backward() # Test that step behaves as expected (a no-op) when grads are set to None - # TODO: uncomment after optim foreach cleanup is landed - # if iter == 0: - # optimizer.zero_grad(set_to_none=True) + if iter == 0: + optimizer.zero_grad(set_to_none=True) optimizer.step() @@ -615,26 +619,29 @@ def test_sparse_adam(self): optim.SparseAdam([{"params": [torch.zeros(3, layout=torch.sparse_coo)]}]) # ROCm precision is too low to pass this test - @skipIfRocm def test_adadelta(self): # Handles https://github.com/pytorch/pytorch/issues/69698 self.rel_tol = 4e-3 for optimizer in [optim.Adadelta, optim_mt.Adadelta]: self._test_basic_cases( - lambda weight, bias: optimizer([weight, bias]) + lambda weight, bias, maximize: optimizer([weight, bias], maximize=maximize), + constructor_accepts_maximize=True ) self._test_basic_cases( - lambda weight, bias: optimizer( - self._build_params_dict(weight, bias, rho=0.95)) + lambda weight, bias, maximize: optimizer( + self._build_params_dict(weight, bias, rho=0.95), maximize=maximize), + constructor_accepts_maximize=True ) self._test_basic_cases( - lambda weight, bias: optimizer( - self._build_params_dict(weight, bias, rho=0.95)), + lambda weight, bias, maximize: optimizer( + self._build_params_dict(weight, bias, rho=0.95), maximize=maximize), [lambda opt: StepLR(opt, gamma=0.9, step_size=10), - lambda opt: ReduceLROnPlateau(opt)] + lambda opt: ReduceLROnPlateau(opt)], + constructor_accepts_maximize=True ) self._test_basic_cases( - lambda weight, bias: optimizer([weight, bias], weight_decay=1) + lambda weight, bias, maximize: optimizer([weight, bias], weight_decay=1, maximize=maximize), + constructor_accepts_maximize=True ) with self.assertRaisesRegex(ValueError, "Invalid rho value: 1.1"): optimizer(None, lr=1e-2, rho=1.1) @@ -678,30 +685,38 @@ def test_nadam(self): def test_adagrad(self): for optimizer in [optim.Adagrad, optim_mt.Adagrad]: self._test_basic_cases( - lambda weight, bias: optimizer([weight, bias], lr=1e-1) + lambda weight, bias, maximize: optimizer([weight, bias], lr=1e-1, maximize=maximize), + constructor_accepts_maximize=True ) self._test_basic_cases( - lambda weight, bias: optimizer( - [weight, bias], lr=1e-1, initial_accumulator_value=0.1 - ) + lambda weight, bias, maximize: optimizer( + [weight, bias], lr=1e-1, initial_accumulator_value=0.1, maximize=maximize, + ), + constructor_accepts_maximize=True ) self._test_basic_cases( - lambda weight, bias: optimizer( + lambda weight, bias, maximize: optimizer( self._build_params_dict(weight, bias, lr=1e-2), - lr=1e-1) + lr=1e-1, + maximize=maximize), + constructor_accepts_maximize=True ) self._test_basic_cases( - lambda weight, bias: optimizer( + lambda weight, bias, maximize: optimizer( self._build_params_dict(weight, bias, lr=1e-2), - lr=1e-1), - [lambda opt: ReduceLROnPlateau(opt)] + lr=1e-1, + maximize=maximize), + [lambda opt: 
ReduceLROnPlateau(opt)], + constructor_accepts_maximize=True ) self._test_basic_cases( - lambda weight, bias: optimizer( + lambda weight, bias, maximize: optimizer( self._build_params_dict(weight, bias, lr=1e-2), - lr=1e-1), + lr=1e-1, + maximize=maximize), [lambda opt: ReduceLROnPlateau(opt), - lambda opt: ExponentialLR(opt, gamma=0.99)] + lambda opt: ExponentialLR(opt, gamma=0.99)], + constructor_accepts_maximize=True ) with self.assertRaisesRegex(ValueError, "Invalid lr_decay value: -0.5"): optimizer(None, lr=1e-2, lr_decay=-0.5) @@ -731,15 +746,20 @@ def test_adagrad_complex(self): def test_adamax(self): for optimizer in [optim.Adamax, optim_mt.Adamax]: self._test_basic_cases( - lambda weight, bias: optimizer([weight, bias], lr=1e-1) + lambda weight, bias, maximize: optimizer( + [weight, bias], lr=1e-1, maximize=maximize), + constructor_accepts_maximize=True ) self._test_basic_cases( - lambda weight, bias: optimizer( + lambda weight, bias, maximize: optimizer( self._build_params_dict(weight, bias, lr=1e-2), - lr=1e-1) + lr=1e-1, maximize=maximize), + constructor_accepts_maximize=True ) self._test_basic_cases( - lambda weight, bias: optimizer([weight, bias], lr=1e-1, weight_decay=1) + lambda weight, bias, maximize: optimizer( + [weight, bias], lr=1e-1, weight_decay=1, maximize=maximize), + constructor_accepts_maximize=True ) with self.assertRaisesRegex(ValueError, "Invalid beta parameter at index 1: 1.0"): optimizer(None, lr=1e-2, betas=(0.0, 1.0)) @@ -1323,6 +1343,18 @@ def test_closed_form_cos_anneal_lr(self): closed_form_scheduler = CosineAnnealingLR(self.opt, T_max=T_max, eta_min=eta_min) self._test_against_closed_form(scheduler, closed_form_scheduler, epochs) + def test_cos_anneal_lr_continue(self): + eta_min = 0.1 + T_max = 5 + scheduler = CosineAnnealingLR(self.opt, T_max=T_max, eta_min=eta_min) + self.opt.step() + scheduler.step() + original_lrs = scheduler._last_lr + new_scheduler = CosineAnnealingLR( + self.opt, T_max=T_max, eta_min=eta_min, last_epoch=0) + new_lrs = new_scheduler._last_lr + torch.testing.assert_allclose(original_lrs, new_lrs, rtol=1e-4, atol=1e-5) + def test_reduce_lr_on_plateau1(self): epochs = 10 for param_group in self.opt.param_groups: diff --git a/test/test_overrides.py b/test/test_overrides.py index da013d33a53d..d208a9201729 100644 --- a/test/test_overrides.py +++ b/test/test_overrides.py @@ -1,4 +1,4 @@ -# Owner(s): ["high priority"] +# Owner(s): ["module: __torch_function__"] import torch import numpy as np @@ -8,14 +8,16 @@ import pickle import collections -from torch.testing._internal.common_utils import TestCase, run_tests +from torch.testing._internal.common_utils import TestCase, run_tests, skipIfCrossRef from torch.overrides import ( handle_torch_function, has_torch_function, get_overridable_functions, get_testing_overrides, - is_tensor_method_or_property + is_tensor_method_or_property, + TorchFunctionMode ) +from functools import partial Tensor = torch.Tensor @@ -28,7 +30,7 @@ def foo(a, b, c=None): """A function multiple arguments and an optional argument""" - if any(type(t) is not Tensor for t in (a, b, c)) and has_torch_function((a, b, c)): + if has_torch_function((a, b, c)): return handle_torch_function(foo, (a, b, c), a, b, c=c) if c: return a + b + c @@ -36,19 +38,19 @@ def foo(a, b, c=None): def bar(a): """A function with one argument""" - if type(a) is not Tensor and has_torch_function((a,)): + if has_torch_function((a,)): return handle_torch_function(bar, (a,), a) return a def baz(a, b): """A function with multiple arguments""" - if 
type(a) is not Tensor or type(b) is not Tensor and has_torch_function((a, b)): + if has_torch_function((a, b)): return handle_torch_function(baz, (a, b), a, b) return a + b def quux(a): """Used to test that errors raised in user implementations get propagated""" - if type(a) is not Tensor and has_torch_function((a,)): + if has_torch_function((a,)): return handle_torch_function(quux, (a,), a) return a @@ -556,6 +558,42 @@ class DummyTensor(torch.Tensor): self.assertTrue(c._is_view()) self.assertTrue(c._base is a) + def test_grad(self): + # Previously, Tensor-like objects that did not subclass from Tensor + # did not get wrapped into unary tuples before being passed into + # handle_torch_function, in contradiction with how Tensor-likes + # were handled + # + # NB: this asserts that the arguments get normalized into a tuple + # before entering the torch function handler; it could go the + # other way but beware https://github.com/pytorch/pytorch/issues/76037 + + class Dummy: + @classmethod + def __torch_function__(cls, func, types, args=(), kwargs=None): + inputs, outputs = args + self.assertEqual(inputs, (x,)) + self.assertEqual(outputs, (x,)) + return -1 + + x = Dummy() + self.assertEqual(torch.autograd.grad(x, x), -1) + + def test_pow_rpow(self): + class NothingImplemented(torch.Tensor): + @classmethod + def __torch_function__(cls, func, types, args=(), kwargs=None): + return NotImplemented + + class RPowOnly(torch.Tensor): + @classmethod + def __torch_function__(cls, func, types, args=(), kwargs=None): + if func is torch.Tensor.__rpow__: + return -1 + return NotImplemented + + self.assertEqual(NothingImplemented() ** RPowOnly(), -1) + def generate_tensor_like_override_tests(cls): from torch.testing._internal.generated.annotated_fn_args import annotated_args @@ -599,7 +637,7 @@ def instance_gen(): func_args.append([instance_gen(), instance_gen()]) elif t == 'c10::List>': func_args.append([instance_gen(), instance_gen()]) - elif t == 'IntArrayRef': + elif t == 'IntArrayRef' or t == 'SymIntArrayRef': size = arg.get('size', 2) if size == 1: func_args.append(1) @@ -621,6 +659,9 @@ def instance_gen(): func_args.append(torch.float32) elif t == 'c10::string_view': func_args.append('') + elif t == 'SymInt': + # TODO: generate actual SymbolicInt + func_args.append(1) else: raise RuntimeError(f"Unsupported argument type {t} for {arg['name']} of function {func}") else: @@ -690,7 +731,7 @@ def test(self): test_method.__name__ = name setattr(cls, name, test_method) -# generate_tensor_like_override_tests(TestTorchFunctionOverride) +generate_tensor_like_override_tests(TestTorchFunctionOverride) class Wrapper: "Basic data container that knows how to unwrap itself" @@ -739,10 +780,11 @@ def __torch_function__(cls, func, types, args=(), kwargs=None): for a in args: if isinstance(a, cls): args_of_this_cls.append(a) - elif isinstance(a, collections.Sequence): + elif isinstance(a, collections.abc.Sequence): args_of_this_cls.extend(el for el in a if isinstance(el, cls)) assert len(args_of_this_cls) > 0 - args_of_this_cls[0].used_calls.add(func) + for a in args_of_this_cls: + a.used_calls.add(func) args = unwrap(tuple(args)) kwargs = {k: unwrap(v) for k, v in kwargs.items()} @@ -847,6 +889,7 @@ def run_test(fast_mode): 'dtype', 'is_floating_point', 'is_sparse', + 'is_sparse_csr', 'layout', 'new_zeros', 'numel', @@ -1044,6 +1087,16 @@ def __torch_function__(cls, func, types, args=(), kwargs=None): self.assertEqual(torch.nn.functional.linear(inp, t1, t2), "called") self.assertEqual(torch.nn.functional.linear(inp, 
t2, t1), "called") +class TestResolveName(TestCase): + def test_resolve_name(self): + for cs in get_overridable_functions().values(): + for c in cs: + self.assertEqual( + eval(torch.overrides.resolve_name(c)), + c, + msg=f"{c}, {torch.overrides.resolve_name(c)}" + ) + class TestTorchFunctionWarning(TestCase): def test_warn_on_invalid_torch_function(self): class Bad1(): @@ -1055,14 +1108,249 @@ def __torch_function__(self, *args, **kwargs): pass a = Bad1() - with self.assertWarnsRegex(DeprecationWarning, "as a plain method is deprecated"): - # This needs to be a function that handle torch_function on the python side - torch.split(a, (2)) - - a = Bad2() - with self.assertWarnsRegex(DeprecationWarning, "as a plain method is deprecated"): - # This needs to be a function that handle torch_function on the python side - torch.split(a, (2)) + for a in (Bad1(), Bad2()): + with self.assertWarnsRegex(DeprecationWarning, "as a plain method is deprecated"): + # Function that handles torch_function on the python side + torch.nn.functional.dropout(a) + + with self.assertWarnsRegex(UserWarning, "as a plain method is deprecated"): + # Function that handles torch_function in C++ + torch.abs(a) + +@skipIfCrossRef +class TestTorchFunctionMode(TestCase): + def test_basic(self): + class A(TorchFunctionMode): + def __torch_function__(self, *args, **kwargs): + return -1 + # NB: factory functions get overridden too! + x = torch.randn(1) + with torch.overrides.push_torch_function_mode(A): + self.assertEqual(torch.randn(3), -1) + self.assertEqual(torch.add(x, x), -1) + self.assertEqual(torch.split(None, [2]), -1) # python side + self.assertEqual(bar(x), -1) + + def test_factory_override(self): + class A(TorchFunctionMode): + def __torch_function__(self, *args, **kwargs): + return -1 + + with torch.overrides.push_torch_function_mode(A): + self.assertEqual(torch.tensor([1]), -1) + self.assertEqual(torch.sparse_coo_tensor(1, 1, 1), -1) + self.assertEqual(torch.sparse_csr_tensor(1, 1, 1), -1) + self.assertEqual(torch._sparse_coo_tensor_unsafe(1, 1, (1, 1)), -1) + self.assertEqual(torch._sparse_csr_tensor_unsafe(1, 1, 1, (1, 1)), -1) + self.assertEqual(torch.as_tensor([1]), -1) + + def test_enable_torch_function_mode_with_tensor_subclass(self): + x = torch.randn(1) + with torch.overrides.enable_torch_function_mode(SubTensor): + self.assertEqual(torch.mm(x, x), -1) + + def test_modes_handle_first(self): + class A(TorchFunctionMode): + def __torch_function__(self, *args, **kwargs): + return -40 + + x = SubTensor() + with torch.overrides.push_torch_function_mode(A): + self.assertEqual(torch.neg(x), -40) + self.assertEqual(torch.mean(x), -40) + self.assertEqual(torch.mm(x, x), -40) + self.assertEqual(bar(x), -40) + + def test_modes_return_notimplemented(self): + class MyMode(TorchFunctionMode): + def __torch_function__(self, *args, **kwargs): + return NotImplemented + + x = SubTensor() + with torch.overrides.push_torch_function_mode(MyMode): + self.assertEqual(torch.mean(x), 0) + self.assertEqual(torch.mm(x, x), -1) + self.assertEqual(bar(x), 1) + self.assertRaisesRegex( + TypeError, r'SubTensor.+MyMode', + lambda: self.assertEqual(torch.max(x, x))) + + def test_mode_stack(self): + logs = [] + + class Logger(TorchFunctionMode): + def __init__(self, name): + self.name = name + + def __torch_function__(self, func, types, args=(), kwargs=None): + if kwargs is None: + kwargs = {} + logs.append(self.name) + return func(*args, **kwargs) + + x = torch.randn(1) + with torch.overrides.push_torch_function_mode(partial(Logger, "A")): + 
with torch.overrides.push_torch_function_mode(partial(Logger, "B")): + torch.mean(x) + + self.assertEqual(logs, ["B", "A"]) + + def test_push_mode_instance_errors(self): + class A(TorchFunctionMode): + pass + with self.assertRaisesRegex(ValueError, 'instance of TorchFunctionMode'): + with torch.overrides.push_torch_function_mode(A(inner=None)): + pass + + def test_push_mode_returns_unrelated(self): + with self.assertRaisesRegex(ValueError, 'return a TorchFunctionMode'): + with torch.overrides.push_torch_function_mode(lambda *, inner: None): + pass + + def test_missing_inner_mode_ctor(self): + self.assertRaisesRegex(TypeError, 'push_torch_function_mode', lambda: TorchFunctionMode()) + + def test_enable_torch_function_mode_trivial(self): + class A(TorchFunctionMode): + def __torch_function__(self, *args, **kwargs): + return -40 + a = A(inner=None) + with torch.overrides.enable_torch_function_mode(a): + with torch.overrides.enable_torch_function_mode(a): + self.assertEqual(bar(None), -40) + + def test_enable_torch_function_mode_replace(self): + class A(TorchFunctionMode): + def __init__(self, val): + self.val = val + + def __torch_function__(self, *args, **kwargs): + return self.val + a1 = A(-40, inner=None) + a2 = A(-41, inner=None) + with torch.overrides.enable_torch_function_mode(a1): + with torch.overrides.enable_torch_function_mode(a2, replace=a1): + self.assertEqual(bar(None), -41) + + def test_enable_torch_function_mode_ignore_preexisting(self): + class A(TorchFunctionMode): + def __init__(self, val): + self.val = val + + def __torch_function__(self, *args, **kwargs): + return self.val + a1 = A(-40, inner=None) + a2 = A(-41, inner=None) + with torch.overrides.enable_torch_function_mode(a1): + with torch.overrides.enable_torch_function_mode(a2, ignore_preexisting=True): + self.assertEqual(bar(None), -41) + + def test_reentrant_mode_idiom(self): + log = [] + + class A(TorchFunctionMode): + def __torch_function__(self, func, types, args=(), kwargs=None): + if kwargs is None: + kwargs = {} + log.append(func) + if func is torch.sub: + with torch.overrides.enable_torch_function_mode(self, replace=self.inner): + input, other = args + assert not kwargs + return torch.add(input, other, alpha=-1) + return func(*args, **kwargs) + + x = torch.randn(1) + y = torch.randn(1) + with torch.overrides.push_torch_function_mode(A): + torch.sub(x, y) + # add hits the torch function again! 
+ self.assertEqual(log, [torch.sub, torch.add]) + + def test_nn_parse_to(self): + # This failed because the parser thinks the function is called to() + # but it's actually called _parse_to() + + called = False + + class A(TorchFunctionMode): + def __torch_function__(self, func, types, args=(), kwargs=None): + nonlocal called + if kwargs is None: + kwargs = {} + called = True + return func(*args, **kwargs) + + with torch.overrides.push_torch_function_mode(A): + torch._C._nn._parse_to('cpu') + + self.assertTrue(called) + + def test_distributions_bernoulli(self): + # This failed because improper use of has_torch_function when + # is_tensor_like should have been used instead, inside the + # broadcasting logic called by distributions (Bernoulli doesn't + # matter per se) + + called = False + + class A(TorchFunctionMode): + def __torch_function__(self, func, types, args=(), kwargs=None): + nonlocal called + if kwargs is None: + kwargs = {} + called = True + return func(*args, **kwargs) + + with torch.overrides.push_torch_function_mode(A): + torch.distributions.Bernoulli(0.3) + + self.assertTrue(called) + + def test_mode_notimplemented_loop(self): + # Default tensor subclass implementation disables torch function; + # when we redispatch to mode we must not treat the objects as + # eligible + + called = 0 + + class A(TorchFunctionMode): + def __torch_function__(self, func, types, args=(), kwargs=None): + nonlocal called + if kwargs is None: + kwargs = {} + called += 1 + # The first time we call, the mode sees an active type that + # it doesn't know how to deal with. The second time, we're + # instructed to treat it "as if it were a tensor", and so + # we keep going. I'm not entirely clear if the subclasses + # disappearing from types is the correct way to do it. + if any(t is not torch.Tensor for t in types): + return NotImplemented + else: + return func(*args, **kwargs) + + class B(torch.Tensor): + pass + + b = B() + + with torch.overrides.push_torch_function_mode(A): + r = torch.neg(b) + + self.assertIs(type(r), B) + self.assertEqual(called, 2) + + called = 0 + + with torch.overrides.push_torch_function_mode(A): + r = bar(b) + + self.assertIs(type(r), B) + self.assertEqual(called, 2) + + + if __name__ == '__main__': run_tests() diff --git a/test/test_per_overload_api.py b/test/test_per_overload_api.py index c97ee2d62766..cdb2b7983512 100644 --- a/test/test_per_overload_api.py +++ b/test/test_per_overload_api.py @@ -1,67 +1,64 @@ # Owner(s): ["module: unknown"] -# import torch -# import copy +import torch +import copy from torch.testing._internal.common_utils import TestCase, run_tests class TestPerOverloadAPI(TestCase): - # def test_basics_opoverloadpacket(self): - # # add is ony used as an example here. It is ok to update the test - # # if the semantics of add are modified in the future. - # add_packet = torch.ops.aten.add + def test_basics_opoverloadpacket(self): + # add is ony used as an example here. It is ok to update the test + # if the semantics of add are modified in the future. 
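# Illustrative sketch of the overload-packet / overload objects exercised below:
import torch

_packet = torch.ops.aten.add            # OpOverloadPacket: dispatches to an overload per call
_overload = torch.ops.aten.add.Tensor   # OpOverload: one fixed schema
assert _overload(torch.tensor(2), torch.tensor(3)).item() == 5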
+ add_packet = torch.ops.aten.add - # # class attributes - # self.assertEqual(add_packet.op_name, 'add') - # self.assertEqual(add_packet.qualified_op_name, 'aten.add') + # class attributes + self.assertEqual(add_packet.__name__, 'add') + self.assertEqual(str(add_packet), 'aten.add') - # # callable - # self.assertEqual(add_packet(torch.tensor(2), torch.tensor(3)), torch.tensor(5)) + # callable + self.assertEqual(add_packet(torch.tensor(2), torch.tensor(3)), torch.tensor(5)) - # # correct module - # self.assertEqual(add_packet.__module__, add_packet.op.__module__) + # correct module + self.assertEqual(add_packet.__module__, add_packet.op.__module__) - # # caching - # another_add_packet = torch.ops.aten.add - # self.assertEqual(id(add_packet), id(another_add_packet)) + # caching + another_add_packet = torch.ops.aten.add + self.assertEqual(id(add_packet), id(another_add_packet)) - # # deepcopy is a no-op - # self.assertEqual(id(add_packet), id(copy.deepcopy(add_packet))) + # deepcopy is a no-op + self.assertEqual(id(add_packet), id(copy.deepcopy(add_packet))) - # # pretty print - # self.assertEqual(str(add_packet), "OpOverloadPacket(op='aten.add')") + # pretty print + self.assertEqual(repr(add_packet), "") - # self.assertRaises(AttributeError, lambda: add_packet.foo) + self.assertRaises(AttributeError, lambda: add_packet.foo) - # def test_basics_opoverload(self): - # add_packet = torch.ops.aten.add - # add_tensoroverload = add_packet.Tensor + def test_basics_opoverload(self): + add_packet = torch.ops.aten.add + add_tensoroverload = add_packet.Tensor - # # class attributes - # self.assertEqual(add_tensoroverload.name, 'aten.add') - # self.assertEqual(add_tensoroverload.overload_name, 'Tensor') - # self.assertEqual(add_tensoroverload.overload_packet, add_packet) + # class attributes + self.assertEqual(str(add_tensoroverload), 'aten.add.Tensor') + self.assertEqual(add_tensoroverload.__name__, 'add.Tensor') + self.assertEqual(add_tensoroverload.overloadpacket, add_packet) - # # deepcopy is a no-op - # self.assertEqual(id(add_tensoroverload), id(copy.deepcopy(add_tensoroverload))) + # deepcopy is a no-op + self.assertEqual(id(add_tensoroverload), id(copy.deepcopy(add_tensoroverload))) - # # caching - # another_add_tensoroverload = torch.ops.aten.add.Tensor - # self.assertEqual(id(add_tensoroverload), id(another_add_tensoroverload)) + # caching + another_add_tensoroverload = torch.ops.aten.add.Tensor + self.assertEqual(id(add_tensoroverload), id(another_add_tensoroverload)) - # # pretty print - # self.assertEqual(str(add_tensoroverload), "OpOverload(op='aten.add', overload='Tensor')") + # pretty print + self.assertEqual(repr(add_tensoroverload), "") - # # callable - # self.assertEqual(add_tensoroverload(torch.tensor(2), torch.tensor(3)), torch.tensor(5)) + # callable + self.assertEqual(add_tensoroverload(torch.tensor(2), torch.tensor(3)), torch.tensor(5)) - # a = torch.tensor(2) - # b = torch.tensor(0) - # torch.ops.aten.add.out(a, a, out=b) - # self.assertEqual(b, torch.tensor(4)) + a = torch.tensor(2) + b = torch.tensor(0) + torch.ops.aten.add.out(a, a, out=b) + self.assertEqual(b, torch.tensor(4)) - # self.assertRaises(RuntimeError, lambda: add_tensoroverload(a, a, out=b)) - - def do_nothing(self): - return + self.assertRaises(RuntimeError, lambda: add_tensoroverload(a, a, out=b)) if __name__ == '__main__': run_tests() diff --git a/test/test_prims.py b/test/test_prims.py new file mode 100644 index 000000000000..fab525cd8b73 --- /dev/null +++ b/test/test_prims.py @@ -0,0 +1,108 @@ +# Owner(s): 
["module: primTorch"] + +from functools import partial + +import torch +from torch.testing import make_tensor +from torch.testing._internal.common_utils import run_tests, TestCase +from torch.testing._internal.common_device_type import ( + instantiate_device_type_tests, + onlyCUDA, + skipCUDAIfRocm, + dtypes, +) +from torch.testing._internal.logging_tensor import LoggingTensor, capture_logs, log_input +import torch._prims as prims +from torch._prims.executor import make_traced + + +class TestPrims(TestCase): + @onlyCUDA + @skipCUDAIfRocm + @dtypes(torch.float32) + def test_broadcast_in_dim(self, device, dtype): + # nvfuser is not currently capable of realizing a broadcasted tensor + # when the broadcast is the only operation. Another op is needed. + def _wrapper(a, b, broadcast_dimensions): + a_bc = prims.broadcast_in_dim(a, b.shape, broadcast_dimensions) + return prims.add(a_bc, b) + + traced = make_traced(_wrapper) + make_arg = partial(make_tensor, device=device, dtype=dtype) + + for executor in ('aten', 'nvfuser'): + fn = partial(traced, executor=executor) + # Same shape + shape = (5, 5) + a = make_arg(shape) + b = make_arg(shape, low=0.0, high=0.0) + result = fn(a, b, (0, 1)) + + self.assertEqual(result.shape, a.shape) + self.assertTrue(result.is_contiguous) + self.assertEqual(a, result) + + # Error input: reordering dims + with self.assertRaises(Exception): + result = fn(a, b, (1, 0)) + + # Adding outermost dimensions + a = make_arg((5, 5)) + b = make_arg((3, 3, 5, 5), low=0.0, high=0.0) + result = fn(a, b, (2, 3)) + + self.assertEqual(result.shape, b.shape) + self.assertEqual(a.broadcast_to(b.shape), result) + + # Expands + a = make_arg((1, 5, 1)) + b = make_arg((3, 5, 7), low=0.0, high=0.0) + result = fn(a, b, (0, 1, 2)) + + self.assertEqual(result.shape, b.shape) + self.assertEqual(a.expand_as(result), result) + + # Unsqueezes + a = make_arg((1, 2, 3)) + b = make_arg((1, 2, 1, 3), low=0.0, high=0.0) + result = fn(a, b, (0, 1, 3)) + + self.assertEqual(result.shape, b.shape) + self.assertEqual(a.unsqueeze(2), result) + + # FIXME: This test exposes an issue in nvfuser + # Adds outermost, expands, and unsqueezes + """ + a = make_arg((1, 2, 3)) + b = make_arg((4, 1, 7, 2, 3, 3), low=0.0, high=0.0) + result = fn(a, b, (1, 3, 4)) + + self.assertEqual(result.shape, b.shape) + a.unsqueeze_(3) + a.unsqueeze_(1) + a.unsqueeze_(0) + self.assertEqual(a.expand_as(result), result) + """ + + +class TestPrimsBasic(TestCase): + def test_torch_ops(self): + r = make_tensor((2,), device='cpu', dtype=torch.float) + self.assertEqual(torch.ops.prims.sin(r), torch.sin(r)) + + r = LoggingTensor(r) + with capture_logs() as logs: + log_input("input", r) + prims.sin(r) + self.assertExpectedInline('\n'.join(logs), """\ +$0 = input('input') +$1 = torch._ops.prims.sin.default($0)""") + + def test_mul_complex(self): + prims.mul(torch.randn(2), 1 + 1j) + + +instantiate_device_type_tests(TestPrims, globals()) + +if __name__ == "__main__": + run_tests() diff --git a/test/test_profiler.py b/test/test_profiler.py index e5fa27248f89..adb3f1920d48 100644 --- a/test/test_profiler.py +++ b/test/test_profiler.py @@ -18,6 +18,7 @@ TemporaryFileName, TemporaryDirectoryName) from torch.autograd import (_record_function_with_args_enter, _record_function_with_args_exit) from torch.autograd.profiler import profile as _profile +from torch.autograd.profiler_legacy import profile as _profile_legacy from torch.profiler import ( kineto_available, profile, record_function, supported_activities, DeviceType, ProfilerAction, 
ProfilerActivity @@ -64,6 +65,31 @@ def test_mem_leak(self): self.assertTrue(not (is_increasing and max_diff > 100 * 1024), msg='memory usage is increasing, {}'.format(str(last_rss))) + def test_custom_module_input_op_ids(self): + class MyFunc(torch.autograd.Function): + @staticmethod + def forward(ctx, x): + ctx.save_for_backward(x) + return x + + @staticmethod + def backward(ctx, gO): + x, = ctx.saved_tensors + return x + + def custom_layer(input_ten): + return MyFunc.apply(input_ten) + + # Only testing that emit_nvtx runs when + # record_shapes option is enabled. + with torch.autograd.profiler.emit_nvtx(record_shapes=True) as prof: + x = torch.randn(10, 10, requires_grad=True) + y = torch.randn(10, 10, requires_grad=True) + z = x + y + s = custom_layer(z) + q = s.sum() + q.backward() + class TestRecordFunction(TestCase): def _record_function_with_param(self): u = torch.randn(3, 4, 5, requires_grad=True) @@ -108,6 +134,43 @@ def test_datapipe_with_record_function(self): self.assertTrue(has_iter) self.assertTrue(has_mux) + def test_datapipe_delegation_with_profiler(self): + class IDPIterator(torch.utils.data.IterDataPipe): + def __init__(self): + self.data = list(range(10)) + self._idx = 0 + + def __iter__(self): + return self + + def __next__(self): + if self._idx >= 10: + self._idx = 0 + raise StopIteration + self._idx += 1 + return self.data[self._idx - 1] + + def get_value(self, idx): + return self.data[idx] + + dp1 = IDPIterator() # The object itself is an iterator + self.assertEqual(5, dp1.get_value(5)) + it_dp1 = iter(dp1) # This creates the 1st iterator + self.assertEqual(5, it_dp1.get_value(5)) # type: ignore[attr-defined] + self.assertEqual(list(range(10)), list(it_dp1)) + + class IDPDelegator(torch.utils.data.IterDataPipe): + def __init__(self, datapipe): + self.datapipe = datapipe + + def __iter__(self): + return iter(self.datapipe) + + dp2 = IDPDelegator(dp1) + it_dp2 = iter(dp2) + self.assertEqual(5, it_dp2.get_value(5)) + self.assertEqual(list(range(10)), list(it_dp2)) + def test_datapipe_with_record_function_fork(self): with _profile(with_stack=True, use_kineto=kineto_available(), record_shapes=True) as prof: input_dp = dp.iter.IterableWrapper(range(10)) @@ -782,6 +845,7 @@ def test_profiler_tracing(self): if kineto_available(): self._test_profiler_tracing(True) + @unittest.skip("Disable forward->backward link to workaround profiler crash") def test_profiler_fwd_bwd_link(self): with _profile(use_kineto=True) as prof: t1, t2 = torch.ones(1, requires_grad=True), torch.ones(1, requires_grad=True) @@ -819,5 +883,19 @@ def test_profiler_fwd_bwd_link(self): self.assertTrue(ts_to_name[s_ts_1] == "aten::binary_cross_entropy_with_logits") self.assertTrue(ts_to_name[s_ts_2] == "aten::add") + def test_profiler_type(self): + profiler_type = torch._C._autograd._profiler_type + ActiveProfilerType = torch._C._autograd.ActiveProfilerType + self.assertEqual(profiler_type(), ActiveProfilerType.NONE) + + # Autograd profiler + with _profile_legacy(): + self.assertEqual(profiler_type(), ActiveProfilerType.LEGACY) + + # Kineto profiler + with profile(): + self.assertEqual(profiler_type(), ActiveProfilerType.KINETO) + + if __name__ == '__main__': run_tests() diff --git a/test/test_public_bindings.py b/test/test_public_bindings.py index 769e23159747..3c407be258cd 100644 --- a/test/test_public_bindings.py +++ b/test/test_public_bindings.py @@ -1,8 +1,15 @@ +# -*- coding: utf-8 -*- # Owner(s): ["module: autograd"] -from torch.testing._internal.common_utils import TestCase, run_tests - +from 
torch.testing._internal.common_utils import TestCase, run_tests, IS_WINDOWS +import pkgutil import torch +import sys +from typing import Callable +import inspect +import json +import os +import unittest class TestPublicBindings(TestCase): def test_no_new_bindings(self): @@ -125,7 +132,7 @@ def test_no_new_bindings(self): "has_lapack", "has_mkl", "has_mkldnn", - "has_mlc", + "has_mps", "has_openmp", "has_spectral", "HOIST_CONV_PACKED_PARAMS", @@ -138,6 +145,7 @@ def test_no_new_bindings(self): "InterfaceType", "IntStorageBase", "IntType", + "SymIntType", "IODescriptor", "is_anomaly_enabled", "is_autocast_cache_enabled", @@ -234,7 +242,7 @@ def test_no_new_bindings(self): "has_lapack", "has_mkl", "has_mkldnn", - "has_mlc", + "has_mps", "has_openmp", "iinfo", "import_ir_module", @@ -272,6 +280,118 @@ def test_no_new_bindings(self): msg = f"torch._C had bindings that are not present in the allowlist:\n{difference}" self.assertTrue(torch_C_bindings.issubset(torch_C_allowlist_superset), msg) + # AttributeError: module 'torch.distributed' has no attribute '_shard' + @unittest.skipIf(IS_WINDOWS, "Distributed Attribute Error") + def test_correct_module_names(self): + ''' + An API is considered public, if its `__module__` starts with `torch.` + and there is no name in `__module__` or the object itself that starts with “_”. + Each public package should either: + - (preferred) Define `__all__` and all callables and classes in there must have their + `__module__` start with the current submodule's path. Things not in `__all__` should + NOT have their `__module__` start with the current submodule. + - (for simple python-only modules) Not define `__all__` and all the elements in `dir(submod)` must have their + `__module__` that start with the current submodule. + ''' + failure_list = [] + with open(os.path.join(os.path.dirname(__file__), 'allowlist_for_publicAPI.json')) as json_file: + # no new entries should be added to this allow_dict. + # New APIs must follow the public API guidelines. 
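# Illustrative sketch of the "looks public" rule stated above, applied to one attribute
# (torch.nn.Linear is chosen purely as an example of a public class):
import torch.nn

_obj = torch.nn.Linear
_looks_public = (not _obj.__name__.startswith("_")
                 and _obj.__module__.startswith("torch.nn")
                 and "._" not in _obj.__module__)
assert _looks_public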
+ allow_dict = json.load(json_file) + + def test_module(modname): + split_strs = modname.split('.') + mod = sys.modules.get(modname) + for elem in split_strs: + if elem.startswith("_"): + return + + # verifies that each public API has the correct module name and naming semantics + def check_one_element(elem, modname, mod, *, is_public, is_all): + obj = getattr(mod, elem) + if not (isinstance(obj, Callable) or inspect.isclass(obj)): + return + elem_module = getattr(obj, '__module__', None) + # Only used for nice error message below + why_not_looks_public = "" + if elem_module is None: + why_not_looks_public = "because it does not have a `__module__` attribute" + elem_modname_starts_with_mod = elem_module is not None and \ + elem_module.startswith(modname) and '._' not in elem_module + if not why_not_looks_public and not elem_modname_starts_with_mod: + why_not_looks_public = f"because its `__module__` attribute (`{elem_module}`) is not within the " \ + f"torch library or does not start with the submodule where it is defined (`{modname}`)" + # elem's name must NOT begin with an `_` and it's module name + # SHOULD start with it's current module since it's a public API + looks_public = not elem.startswith('_') and elem_modname_starts_with_mod + if not why_not_looks_public and not looks_public: + why_not_looks_public = f"because it starts with `_` (`{elem}`)" + + if is_public != looks_public: + if modname in allow_dict and elem in allow_dict[modname]: + return + + if is_public: + why_is_public = f"it is inside the module's (`{modname}`) `__all__`" if is_all else \ + "it is an attribute that does not start with `_` on a module that " \ + "does not have `__all__` defined" + fix_is_public = f"remove it from the modules's (`{modname}`) `__all__`" if is_all else \ + f"either define a `__all__` for `{modname}` or add a `_` at the beginning of the name" + else: + assert is_all + why_is_public = f"it is not inside the module's (`{modname}`) `__all__`" + fix_is_public = f"add it from the modules's (`{modname}`) `__all__`" + + if looks_public: + why_looks_public = "it does look public because it follows the rules from the doc above " \ + "(does not start with `_` and has a proper `__module__`)." 
+ fix_looks_public = "make its name start with `_`" + else: + why_looks_public = why_not_looks_public + if not elem_modname_starts_with_mod: + fix_looks_public = "make sure the `__module__` is properly set and points to a submodule "\ + f"of `{modname}`" + else: + fix_looks_public = "remove the `_` at the beginning of the name" + + failure_list.append(f"# {modname}.{elem}:") + is_public_str = "" if is_public else " NOT" + failure_list.append(f" - Is{is_public_str} public: {why_is_public}") + looks_public_str = "" if looks_public else " NOT" + failure_list.append(f" - Does{looks_public_str} look public: {why_looks_public}") + # Swap the str below to avoid having to create the NOT again + failure_list.append(" - You can do either of these two things to fix this problem:") + failure_list.append(f" - To make it{looks_public_str} public: {fix_is_public}") + failure_list.append(f" - To make it{is_public_str} look public: {fix_looks_public}") + + + if hasattr(mod, '__all__'): + public_api = mod.__all__ + all_api = dir(mod) + for elem in all_api: + check_one_element(elem, modname, mod, is_public=elem in public_api, is_all=True) + + else: + all_api = dir(mod) + for elem in all_api: + if not elem.startswith('_'): + check_one_element(elem, modname, mod, is_public=True, is_all=False) + + for _, modname, ispkg in pkgutil.walk_packages(path=torch.__path__, prefix=torch.__name__ + '.'): + test_module(modname) + + test_module('torch') + + msg = "All the APIs below do not meet our guidelines for public API from " \ + "https://github.com/pytorch/pytorch/wiki/Public-API-definition-and-documentation.\n" + msg += "Make sure that everything that is public is expected (in particular that the module " \ + "has a properly populated `__all__` attribute) and that everything that is supposed to be public " \ + "does look public (it does not start with `_` and has a `__module__` that is properly populated)." 
+ msg += "\n\nFull list:\n" + msg += "\n".join(map(str, failure_list)) + + # empty lists are considered false in python + self.assertTrue(not failure_list, msg) if __name__ == '__main__': run_tests() diff --git a/test/test_python_dispatch.py b/test/test_python_dispatch.py index 3cf4a18bd1ea..7aedd935c697 100644 --- a/test/test_python_dispatch.py +++ b/test/test_python_dispatch.py @@ -1,17 +1,256 @@ -# Owner(s): ["high priority"] +# Owner(s): ["module: __torch_dispatch__"] +import tempfile import torch -from torch.testing._internal.common_utils import TestCase, run_tests -from torch.testing._internal.logging_tensor import LoggingTensor, log_input, capture_logs, no_dispatch +from copy import deepcopy +from torch.library import Library +from torch.cuda.jiterator import _create_jit_fn +from torch.testing._internal.common_utils import TestCase, run_tests, TEST_WITH_ROCM +from torch.testing._internal.logging_tensor import LoggingTensor, LoggingTensorReentrant, LoggingTensorMode, \ + log_input, capture_logs, no_dispatch from torch.utils._pytree import tree_map -from torch.utils._python_dispatch import enable_python_mode +from torch.utils._python_dispatch import enable_torch_dispatch_mode, push_torch_dispatch_mode, TorchDispatchMode import logging +from functools import partial + +class TestPythonRegistration(TestCase): + def test_override_aten_ops_with_multiple_libraries(self) -> None: + x = torch.tensor([1, 2]) + my_lib1 = Library("aten", "IMPL") + my_lib2 = Library("aten", "IMPL") + + # Example 1 + def my_neg(*args, **kwargs): + return args[0]._neg_view() + + # Now we are secretly making the operator a view op so autograd needs to know how + # to handle it + my_lib1.impl('neg', my_neg, "AutogradCPU") + + self.assertTrue(torch.neg(x).is_neg()) + + # RuntimeError: impl("aten::neg", ...): + # Explicitly provided namespace (aten) in operator name does not match ... 
+ with self.assertRaisesRegex(RuntimeError, "operator name does not match namespace"): + my_lib3 = Library("foo", "IMPL") + my_lib3.impl(torch.ops.aten.neg.default, my_neg, "AutogradCPU") + del my_lib3 + + # Example 2 + def my_mul(*args, **kwargs): + return torch.zeros_like(args[0]) + + # torch.ops.aten.mul.Tensor + my_lib2.impl("aten::mul.Tensor", my_mul, "ZeroTensor") + + y = torch._efficientzerotensor(2) + self.assertFalse(torch.mul(x, y)._is_zerotensor()) + + # Assert that a user can't override the behavior of a (ns, op, dispatch_key) + # combination if someone overrided the behavior for the same before them + with self.assertRaisesRegex(RuntimeError, 'already a kernel registered from python'): + my_lib2.impl(torch.ops.aten.mul.Tensor, my_mul, "ZeroTensor") + + del my_lib1 + + # Validate that lib2 is not affected by removing lib1 + self.assertFalse(torch.mul(x, y)._is_zerotensor()) + + del my_lib2 + + # Validate that the old behavior is restored for neg and mul + self.assertFalse(torch.neg(x).is_neg()) + self.assertTrue(torch.mul(x, y)._is_zerotensor()) + + def test_override_cpu_sum(self) -> None: + # Example 1 + run = [False] + + def my_sum(*args, **kwargs): + run[0] = True + return args[0] + + my_lib1 = Library("aten", "IMPL") + my_lib1.impl('aten::sum', my_sum, "CPU") + x = torch.tensor([1, 2]) + self.assertEqual(torch.sum(x), x) + self.assertTrue(run[0]) + del my_lib1 + # Validate that the old behavior is restored for sum + self.assertEqual(torch.sum(x), torch.tensor(3)) + + def test_override_cuda_with_jiterator(self) -> None: + def override_where_cuda() -> None: + # Example 1: Invert the behavior of where's condition input + not_where_code_string = ''' + template T inverted_where(bool cond, T a, T b){ + return !cond ? a : b; + } + ''' + jitted_where = _create_jit_fn(not_where_code_string) + + CALLED = [False] + + def inverted_where(*args, **kwargs): + CALLED[0] = True + return jitted_where(*args, **kwargs) + + # overriding where's cuda kernel with Jiterator generated kernel + my_lib = Library("aten", "IMPL") + my_lib.impl('aten::where.self', inverted_where, "CUDA") + + device = 'cuda' + cond = torch.tensor([True, True, False], device=device, dtype=torch.bool) + x = torch.tensor([1, 2, 3], device=device) + y = torch.tensor([-1, -2, -3], device=device) + + self.assertEqual(torch.where(cond, x, y), torch.tensor([-1, -2, 3])) + self.assertTrue(CALLED[0]) + del my_lib + + # behavior restored after deregistration + self.assertEqual(torch.where(cond, x, y), torch.tensor([1, 2, -3])) + + def override_gelu_cuda() -> None: + # Example 2: Use relu to approximate gelu for faster compute + fastest_gelu_code_string = ''' + template T fast_gelu(T a){ + return a > 0 ? a : 0; + } + ''' + jitted_gelu = _create_jit_fn(fastest_gelu_code_string) + + CALLED = [False] + + def fast_gelu(*args, **kwargs): + CALLED[0] = True + return jitted_gelu(*args, **kwargs) + + # overriding gelu's cuda kernel with Jiterator generated relu kernel + my_lib = Library("aten", "IMPL") + my_lib.impl('aten::gelu', fast_gelu, "CUDA") + + x = torch.rand([3, 3], device='cuda', dtype=torch.float) + self.assertEqual(torch.nn.functional.gelu(x), torch.nn.functional.relu(x)) + self.assertTrue(CALLED[0]) + del my_lib + + # behavior restored after deregistration + self.assertNotEqual(torch.nn.functional.gelu(x), torch.nn.functional.relu(x)) + + def override_exp_cuda() -> None: + # Example 3: Preventing exp from exploding for float16 + clipped_exp_code_string = ''' + template T clipped_exp(T a){ + return a > T(10.0) ? 
T(22026.4657948) : exp(a); + } + ''' + jitted_exp = _create_jit_fn(clipped_exp_code_string) + + CALLED = [False] + + def clipped_exp(*args, **kwargs): + CALLED[0] = True + return jitted_exp(*args, **kwargs) + + # overriding exp's cuda kernel with clipped_exp kernel + my_lib = Library("aten", "IMPL") + my_lib.impl('aten::exp', clipped_exp, "CUDA") + + x = torch.tensor([0.0, 100.0], device='cuda', dtype=torch.float16) + self.assertEqual(torch.exp(x), torch.tensor([1.0, 22026.4657948], dtype=torch.float16)) + self.assertTrue(CALLED[0]) + del my_lib + + # behavior restored after deregistration + self.assertEqual(torch.exp(x), torch.tensor([1.0, torch.inf], dtype=torch.float16)) + + def override_add_cuda() -> None: + # Example 4: simulate a hardware bug, where the adder is always off by 1 + buggy_add_code_string = ''' + template T buggy_add(T a, T b){ + return a + b + T(1); + } + ''' + jitted_add = _create_jit_fn(buggy_add_code_string) + + CALLED = [False] + + def buggy_add(*args, **kwargs): + CALLED[0] = True + return jitted_add(*args, **kwargs) + + my_lib = Library("aten", "IMPL") + my_lib.impl('aten::add.Tensor', buggy_add, "CUDA") + + x_cpu = torch.rand([3, 3], device='cpu') + y_cpu = torch.rand([3], device='cpu') + + x_cuda = x_cpu.cuda() + y_cuda = y_cpu.cuda() + + self.assertEqual(x_cuda + y_cuda, x_cpu + y_cpu + 1) + self.assertTrue(CALLED[0]) + del my_lib + + # behavior restored after deregistration + self.assertEqual(x_cuda + y_cuda, x_cpu + y_cpu) + + if torch.cuda.is_available() and not TEST_WITH_ROCM: + override_where_cuda() + override_gelu_cuda() + override_exp_cuda() + override_add_cuda() + + def test_extend_library_with_dispatch_key_arg(self): + def my_sum(*args, **kwargs): + return args[0] + my_lib1 = Library("aten", "IMPL", dispatch_key="CPU") + + # RuntimeError: Explicitly provided dispatch key (Conjugate) is + # inconsistent with the dispatch key of the enclosing TORCH_LIBRARY_IMPL block + with self.assertRaisesRegex(RuntimeError, "inconsistent with the dispatch key"): + my_lib1.impl('sum', my_sum, "Conjugate") + my_lib1.impl('aten::sum', my_sum) + x = torch.tensor([1, 2]) + self.assertEqual(torch.sum(x), x) + del my_lib1 + + def test_create_new_library(self) -> None: + my_lib1 = Library("foo", "DEF") + + my_lib1.define("sum(Tensor self) -> Tensor") + + # Example 1 + @torch.library.impl(my_lib1, "sum", "CPU") + def my_sum(*args, **kwargs): + return args[0] + + x = torch.tensor([1, 2]) + self.assertEqual(torch.ops.foo.sum(x), x) + + my_lib2 = Library("foo", "IMPL") + + # Example 2 + @torch.library.impl(my_lib2, torch.ops.foo.sum.default, "ZeroTensor") + def my_sum_zt(*args, **kwargs): + if args[0]._is_zerotensor(): + return torch._efficientzerotensor(args[0].shape) + else: + return args[0] + + y = torch._efficientzerotensor(3) + self.assertTrue(torch.ops.foo.sum(y)._is_zerotensor()) + self.assertEqual(torch.ops.foo.sum(x), x) + + del my_lib2 + del my_lib1 class TestPythonDispatch(TestCase): def test_basic(self) -> None: with capture_logs() as logs: - x = LoggingTensor(torch.tensor([3.0], requires_grad=True)) + x = LoggingTensor(torch.tensor([3.0]), requires_grad=True) log_input("x", x) y = x * x saved_x = y.grad_fn._saved_self @@ -29,11 +268,11 @@ def test_basic(self) -> None: # self.assertEqual(saved_x._version, x._version) self.assertExpectedInline('\n'.join(logs), '''\ $0 = input('x') -$1 = torch._ops.aten.mul($0, $0) +$1 = torch._ops.aten.mul.Tensor($0, $0) $2 = input('grad_y') -$3 = torch._ops.aten.mul($2, $0) -$4 = torch._ops.aten.mul($2, $0) -$5 = 
torch._ops.aten.add($4, $3)''') +$3 = torch._ops.aten.mul.Tensor($2, $0) +$4 = torch._ops.aten.mul.Tensor($2, $0) +$5 = torch._ops.aten.add.Tensor($4, $3)''') def test_out(self) -> None: with capture_logs() as logs: @@ -49,7 +288,7 @@ def test_out(self) -> None: self.assertExpectedInline('\n'.join(logs), '''\ $0 = input('x') $1 = input('y') -$2 = torch._ops.aten.abs($0, out=$1)''') +$2 = torch._ops.aten.abs.out($0, out=$1)''') def test_kwarg_only(self) -> None: @@ -72,11 +311,11 @@ def test_kwarg_only(self) -> None: $0 = input('x') $1 = input('y') $2 = input('z') -$3 = torch._ops.aten.addmv($0, $1, $2) -$4 = torch._ops.aten.addmv($0, $1, $2) -$5 = torch._ops.aten.addmv($0, $1, $2, beta=2) -$6 = torch._ops.aten.addmv($0, $1, $2, alpha=2) -$7 = torch._ops.aten.addmv($0, $1, $2, beta=2, alpha=2)''') +$3 = torch._ops.aten.addmv.default($0, $1, $2) +$4 = torch._ops.aten.addmv.default($0, $1, $2) +$5 = torch._ops.aten.addmv.default($0, $1, $2, beta=2) +$6 = torch._ops.aten.addmv.default($0, $1, $2, alpha=2) +$7 = torch._ops.aten.addmv.default($0, $1, $2, beta=2, alpha=2)''') def test_kwarg_only_and_positional_default(self) -> None: with capture_logs() as logs: @@ -94,10 +333,28 @@ def test_kwarg_only_and_positional_default(self) -> None: self.assertExpectedInline('\n'.join(logs), '''\ $0 = input('x') $1 = input('y') -$2 = torch._ops.aten.kl_div($0, $1) -$3 = torch._ops.aten.kl_div($0, $1, 2) -$4 = torch._ops.aten.kl_div($0, $1, log_target=True) -$5 = torch._ops.aten.kl_div($0, $1, 2, log_target=True)''') +$2 = torch._ops.aten.kl_div.default($0, $1) +$3 = torch._ops.aten.kl_div.default($0, $1, 2) +$4 = torch._ops.aten.kl_div.default($0, $1, log_target=True) +$5 = torch._ops.aten.kl_div.default($0, $1, 2, log_target=True)''') + + def test_produce_real_type(self) -> None: + with capture_logs() as logs: + x = LoggingTensor(torch.ones(2, 2)) + log_input("x", x) + x.to(dtype=torch.double) # non-optional dtype + torch.cumprod(x, 0, dtype=torch.double) # optional dtype + x[:, 1].contiguous(memory_format=torch.contiguous_format) # optional memory format + # There doesn't appear to be any layout signatures which are + # triggerable using tensor subclasses (need to use a mode) + + self.assertExpectedInline('\n'.join(logs), '''\ +$0 = input('x') +$1 = torch._ops.aten._to_copy.default($0, dtype=torch.float64) +$2 = torch._ops.aten.cumprod.default($0, 0, dtype=torch.float64) +$3 = torch._ops.aten.slice.Tensor($0, 0, 0, 9223372036854775807) +$4 = torch._ops.aten.select.int($3, 1, 1) +$5 = torch._ops.aten.clone.default($4, memory_format=torch.contiguous_format)''') def test_list_ret(self) -> None: # test all sequence types are permissible returns @@ -109,7 +366,7 @@ def __new__(cls, elem): @classmethod def __torch_dispatch__(cls, func, types, args=(), kwargs=None): - if func == torch.ops.aten.split: + if func.overloadpacket == torch.ops.aten.split: with no_dispatch(): return list_type(torch.split(*args)) else: @@ -132,7 +389,7 @@ def __torch_dispatch__(cls, func, types, args=(), kwargs=None): return "arf" # Wobbles depending on NDEBUG mode of pybind11 - self.assertRaisesRegexp( + self.assertRaisesRegex( RuntimeError, "Unable to cast", lambda: A(torch.zeros(1)).neg(), ) self.assertRaisesRegexp( @@ -141,7 +398,7 @@ def __torch_dispatch__(cls, func, types, args=(), kwargs=None): def test_detach_appears_twice_when_called_once(self) -> None: with capture_logs() as logs: - x = LoggingTensor(torch.tensor([3.0], requires_grad=True)) + x = LoggingTensor(torch.tensor([3.0]), requires_grad=True) log_input("x", x) 
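The updated expected strings above (`aten.mul.Tensor`, `aten.addmv.default`, ...) reflect that `__torch_dispatch__` now receives a fully qualified OpOverload rather than an overload packet, which is also why comparisons in the tests switch to `func.overloadpacket`. A minimal sketch of inspecting both, not part of the patch (`PrintingTensor` is an illustrative name):

import torch

class PrintingTensor(torch.Tensor):
    @staticmethod
    def __new__(cls, elem):
        return torch.Tensor._make_subclass(cls, elem)

    __torch_function__ = torch._C._disabled_torch_function_impl

    @classmethod
    def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
        if kwargs is None:
            kwargs = {}
        # `func` is an OpOverload (e.g. aten.mul.Tensor); its packet groups all
        # overloads of the op and exposes the plain name via __name__.
        print(func, func.overloadpacket.__name__)
        return super().__torch_dispatch__(func, types, args, kwargs)

x = PrintingTensor(torch.ones(2))
x * x  # expected to print something like: aten.mul.Tensor mul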
x.detach() # FIXME: We actually want this to emit a single detach. However, @@ -150,8 +407,8 @@ def test_detach_appears_twice_when_called_once(self) -> None: # would be bad if calling .detach() once emits 3+ detaches). self.assertExpectedInline('\n'.join(logs), '''\ $0 = input('x') -$1 = torch._ops.aten.detach($0) -$2 = torch._ops.aten.detach($1)''') +$1 = torch._ops.aten.detach.default($0) +$2 = torch._ops.aten.detach.default($1)''') def test_metadata_change_not_allowed(self) -> None: x = LoggingTensor(torch.ones(1)) @@ -240,7 +497,7 @@ def backward(ctx, grad_output): return grad_output * 2 * x with capture_logs() as logs: - x = LoggingTensor(torch.ones(1, requires_grad=True)) + x = LoggingTensor(torch.ones(1), requires_grad=True) log_input("x", x) x.grad = LoggingTensor(torch.zeros(1)) log_input("x.grad", x.grad) @@ -262,11 +519,11 @@ def backward(ctx, grad_output): self.assertExpectedInline('\n'.join(logs), '''\ $0 = input('x') $1 = input('x.grad') -$2 = torch._ops.aten.pow($0, 2) +$2 = torch._ops.aten.pow.Tensor_Scalar($0, 2) $3 = input('grad_output') -$4 = torch._ops.aten.mul($3, tensor(2)) -$5 = torch._ops.aten.mul($4, $0) -$6 = torch._ops.aten.add_($1, $5)''') +$4 = torch._ops.aten.mul.Tensor($3, 2) +$5 = torch._ops.aten.mul.Tensor($4, $0) +$6 = torch._ops.aten.add_.Tensor($1, $5)''') def test_subclass_creation(self): # Make sure these statements runs without error @@ -338,6 +595,83 @@ def __torch_dispatch__(cls, func, types, args=(), kwargs=None): self.assertEqual(y.stride(), x.stride()) self.assertEqual(y.storage_offset(), x.storage_offset()) + def test_wrapper_subclass_serializes(self) -> None: + with tempfile.TemporaryFile() as f: + x = LoggingTensor(torch.randn(3)) + torch.save(x, f) + f.seek(0) + x_loaded = torch.load(f) + self.assertTrue(type(x_loaded) is type(x)) + self.assertEqual(x.elem, x_loaded.elem) + self.assertFalse(x is x_loaded) + + def test_deepcopy_wrapper_subclass(self) -> None: + x = LoggingTensor(torch.randn(3)) + x_copy = deepcopy(x) + self.assertTrue(type(x_copy) is type(x)) + self.assertEqual(x.elem, x_copy.elem) + self.assertFalse(x is x_copy) + + def test_deepcopy_wrapper_subclass_with_clone_returning_different_type(self) -> None: + + class MyWrapperTensor(torch.Tensor): + elem: torch.Tensor + + __slots__ = ['elem'] + + @staticmethod + def __new__(cls, elem, *args, **kwargs): + r = torch.Tensor._make_wrapper_subclass( # type: ignore[attr-defined] + cls, elem.size(), + dtype=elem.dtype, layout=elem.layout, + device=elem.device, requires_grad=elem.requires_grad, + strides=elem.stride(), storage_offset=elem.storage_offset()) + r.elem = elem + return r + + @classmethod + def __torch_dispatch__(cls, func, types, args=(), kwargs=None): + if func.overloadpacket.__name__ == "clone": + # Return a plain tensor from clone(). + return args[0].elem.clone() + raise RuntimeError("NYI") + + # NB: The default Tensor.__torch_function__ implementation called for deepcopy + # disables __torch_function__ by the time we get to clone(), so there is no need to + # explicitly disable __torch_function__ for this subclass. + + x = MyWrapperTensor(torch.randn(3)) + with self.assertRaisesRegex(RuntimeError, + "for which cloning returns another instance of the same subclass"): + x_copy = deepcopy(x) + + def test_deepcopy_non_wrapper_subclass(self) -> None: + + # Ensure correct error is thrown for common error cases. + class SubTensorError1(torch.Tensor): + # Default implementation of new_empty() returns a plain tensor. 
+ pass + + class SubTensorError2(torch.Tensor): + # new_empty() incorrectly returns a different type (i.e. a plain tensor). + def new_empty(self, shape): + return torch.Tensor(shape) + + for error_cls in [SubTensorError1, SubTensorError2]: + x = error_cls(3) + with self.assertRaisesRegex(RuntimeError, + "for which that function returns another instance of the same subclass"): + x_copy = deepcopy(x) + + # Ensure a correctly implemented new_empty() causes deepcopy() to work. + class SubTensorSuccess(torch.Tensor): + def new_empty(self, shape): + return type(self)(shape) + + x = SubTensorSuccess(3) + x_copy = deepcopy(x) + self.assertIs(type(x_copy), type(x)) + def test_index_put_where_only_index_is_subclass(self) -> None: called_funcs = [] @@ -365,30 +699,27 @@ def __torch_dispatch__(cls, func, types, args=(), kwargs=None): idxs = (MyTensor(torch.tensor(0)),) v = torch.randn(1) res = x.index_put_(idxs, v) - self.assertEqual(called_funcs, [torch.ops.aten.index_put_]) + self.assertEqual(called_funcs, [torch.ops.aten.index_put_.default]) - def test_enable_python_mode_error(self) -> None: - with self.assertRaisesRegex(ValueError, "__torch_dispatch__"): - with enable_python_mode(torch.Tensor): - pass + def test_enable_torch_dispatch_mode_error(self) -> None: z = LoggingTensor(torch.empty([])) - with self.assertRaisesRegex(ValueError, "must be the type"): - with enable_python_mode(z): + with self.assertRaisesRegex(ValueError, "expected to get TorchDispatchMode, Tensor-like class, or None"): + with enable_torch_dispatch_mode(z): pass - def test_enable_python_mode_basic(self) -> None: - with enable_python_mode(LoggingTensor): + def test_enable_torch_dispatch_mode_basic(self) -> None: + with enable_torch_dispatch_mode(LoggingTensorMode): z = torch.empty([]) - self.assertTrue(isinstance(z, LoggingTensor)) + self.assertTrue(isinstance(z, LoggingTensorMode)) - def test_enable_python_mode_unrelated_tensors(self) -> None: + def test_enable_torch_dispatch_mode_unrelated_tensors(self) -> None: x = torch.randn([]) y = torch.randn([]) - with enable_python_mode(LoggingTensor): + with enable_torch_dispatch_mode(LoggingTensorMode): z = x + y - self.assertTrue(isinstance(z, LoggingTensor)) + self.assertTrue(isinstance(z, LoggingTensorMode)) - def test_enable_python_mode_subclass_priority(self) -> None: + def test_enable_torch_dispatch_mode_subclass_priority(self) -> None: class ErrorA(RuntimeError): pass @@ -417,33 +748,173 @@ def __torch_dispatch__(cls, func, types, args=(), kwargs=None): b = B(torch.empty(1)) with self.assertRaises(ErrorA): a + a - - # B has precedence over A due to the subclass relationship with self.assertRaises(ErrorB): - with enable_python_mode(A): + a + b + + # B has precedence over A due to the subclass relationship yet + # modes take precedence over arguments + with self.assertRaises(ErrorA): + with enable_torch_dispatch_mode(A): b + b with self.assertRaises(ErrorB): - with enable_python_mode(B): + with enable_torch_dispatch_mode(B): a + a with self.assertRaises(ErrorB): - with enable_python_mode(B): + with enable_torch_dispatch_mode(B): a + b - def test_enable_python_mode_respects_no_dispatch(self) -> None: - with enable_python_mode(LoggingTensor): + def test_enable_torch_dispatch_mode_respects_no_dispatch(self) -> None: + with enable_torch_dispatch_mode(LoggingTensorMode): z = torch.ones([2, 3]) - self.assertTrue(isinstance(z, LoggingTensor)) + self.assertTrue(isinstance(z, LoggingTensorMode)) with no_dispatch(): expected = torch.ones([2, 3]) self.assertEqual(z.elem, expected) - def 
test_nested_enable_python_mode(self) -> None: - with self.assertRaisesRegex(RuntimeError, "has already been set"): - with enable_python_mode(LoggingTensor): - with enable_python_mode(LoggingTensor): + def test_enable_torch_dispatch_mode_instance(self) -> None: + class TestMode(TorchDispatchMode): + def __torch_dispatch__(self, func, types, args=(), kwargs=None): + if kwargs is None: + kwargs = {} + return func(*args, **kwargs) + + x = TestMode(inner=None) + y = torch.tensor([2.]) + with enable_torch_dispatch_mode(x): + y + y + + def test_nested_enable_torch_dispatch_mode(self) -> None: + class A(LoggingTensorMode): + pass + + with self.assertRaisesRegex(ValueError, "there is already an active mode"): + with enable_torch_dispatch_mode(LoggingTensorMode): + with enable_torch_dispatch_mode(A): pass - def test_tolist_numpy_with_python_mode(self) -> None: + def test_nesting_with_same_enable_torch_dispatch_mode(self) -> None: + # "nested" enable_torch_dispatch_modes are allowed if they're the same mode. It's the equivalent of + # a noop, so it will only write once to the log + with capture_logs() as logs: + x = LoggingTensor(torch.tensor([3.])) + log_input("x", x) + with enable_torch_dispatch_mode(LoggingTensor): + with enable_torch_dispatch_mode(LoggingTensor): + x + x + + self.assertExpectedInline('\n'.join(logs), '''\ +$0 = input('x') +$1 = torch._ops.aten.add.Tensor($0, $0)''') + + def test_enable_torch_dispatch_mode_ignore_preexisting(self): + class A(torch.Tensor): + @staticmethod + def __new__(cls, elem): + return torch.Tensor._make_subclass(cls, elem, elem.requires_grad) + + @classmethod + def __torch_dispatch__(cls, func, types, args=(), kwargs=None): + return cls(torch.zeros(())) + + class B(A): + pass + + with enable_torch_dispatch_mode(A): + with enable_torch_dispatch_mode(B, ignore_preexisting=True): + self.assertTrue(isinstance(torch.zeros(()), B)) + + def test_enable_torch_dispatch_mode_replace(self): + class A(torch.Tensor): + @staticmethod + def __new__(cls, elem): + return torch.Tensor._make_subclass(cls, elem, elem.requires_grad) + + @classmethod + def __torch_dispatch__(cls, func, types, args=(), kwargs=None): + return cls(torch.zeros(())) + + class B(A): + pass + + with enable_torch_dispatch_mode(A): + with enable_torch_dispatch_mode(B, replace=A): + self.assertTrue(isinstance(torch.zeros(()), B)) + + def test_exception_handling(self): + class A(torch.Tensor): + @staticmethod + def __new__(cls, elem): + return torch.Tensor._make_subclass(cls, elem, elem.requires_grad) + + @classmethod + def __torch_dispatch__(cls, func, types, args=(), kwargs=None): + if func.__name__ == 'randn.default': + raise RuntimeError() + return cls(torch.zeros(())) + + with enable_torch_dispatch_mode(A): + try: + torch.randn(()) + except RuntimeError: + pass + self.assertTrue(isinstance(torch.zeros(()), A)) + + def test_push_torch_dispatch_mode(self) -> None: + class ErrorA(RuntimeError): + def __init__(self, msg=None): + return super().__init__(msg) + + class A(TorchDispatchMode): + def __init__(self, msg=None): + self.msg = msg + + def __torch_dispatch__(self, func, types, args=(), kwargs=None): + raise ErrorA(self.msg) + + x = torch.randn(3) + with self.assertRaises(ErrorA): + with push_torch_dispatch_mode(A): + torch.add(x, x) + + with self.assertRaisesRegex(ErrorA, r"partial constructor"): + with push_torch_dispatch_mode(partial(A, "partial constructor")): + x + x + + def test_torch_dispatch_mode_stack(self) -> None: + logs = [] + + class Logger(TorchDispatchMode): + def __init__(self, name): + 
self.name = name + + def __torch_dispatch__(self, func, types, args=(), kwargs=None): + if kwargs is None: + kwargs = {} + logs.append(self.name) + return func(*args, **kwargs) + + x = torch.randn(1) + with Logger.push("A"): + with Logger.push("B"): + x + x + self.assertEqual(logs, ["B", "A"]) + + def test_push_mode_instance_errors(self): + class A(TorchDispatchMode): + pass + with self.assertRaisesRegex(ValueError, 'instance of TorchDispatchMode'): + with push_torch_dispatch_mode(A(inner=None)): + pass + + def test_push_mode_returns_unrelated(self): + with self.assertRaisesRegex(ValueError, 'return a TorchDispatchMode'): + with push_torch_dispatch_mode(lambda *, inner: None): + pass + + def test_missing_inner_mode_ctor(self): + self.assertRaisesRegex(TypeError, 'push_torch_dispatch_mode', lambda: TorchDispatchMode()) + + def test_tolist_numpy_with_torch_dispatch_mode(self) -> None: x = LoggingTensor(torch.tensor([2.0, 3.0])) with self.assertRaisesRegex(RuntimeError, "is not supported for tensor subclasses."): x.tolist() @@ -452,7 +923,7 @@ def test_tolist_numpy_with_python_mode(self) -> None: with self.assertRaises(AssertionError): self.assertEqual(x, None) - def test_enable_python_mode_subclass_autograd_device_check(self) -> None: + def test_enable_torch_dispatch_mode_subclass_autograd_device_check(self) -> None: class NonWrapperSubclass(torch.Tensor): elem: torch.Tensor @@ -474,10 +945,7 @@ def unwrap(e): def wrap(e): return NonWrapperSubclass(e) if isinstance(e, torch.Tensor) else e - # no_dispatch is only needed if you use enable_python_mode. - # It prevents infinite recursion. - with no_dispatch(): - rs = tree_map(wrap, func(*tree_map(unwrap, args), **tree_map(unwrap, kwargs))) + rs = tree_map(wrap, func(*tree_map(unwrap, args), **tree_map(unwrap, kwargs))) logging.getLogger("NonWrapperSubclass").info(f"{func.__module__}.{func.__name__}", args, kwargs, rs) return rs @@ -511,11 +979,8 @@ def unwrap(e): def wrap(e): return SubclassWithNone(e) if isinstance(e, torch.Tensor) else e - # no_dispatch is only needed if you use enable_python_mode. - # It prevents infinite recursion. - with no_dispatch(): - rs = tree_map(wrap, func(*tree_map(unwrap, args), **tree_map(unwrap, kwargs))) - if func.__name__ == "add": + rs = tree_map(wrap, func(*tree_map(unwrap, args), **tree_map(unwrap, kwargs))) + if func.overloadpacket.__name__ == "add": return None else: return rs @@ -536,11 +1001,235 @@ def wrap(e): out.backward() def test_storage_can_be_converted_to_python_object(self): - with enable_python_mode(LoggingTensor): + with enable_torch_dispatch_mode(LoggingTensorMode): s = torch.Storage() - z = LoggingTensor(torch.empty([])) + z = LoggingTensorMode(torch.empty([])) z.set_(s) + def test_autograd_in_attr(self): + # We want the wrapped Tensor to require gradients! 
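Unlike the tensor-subclass tests, the mode tests above (`test_enable_torch_dispatch_mode_instance`, `test_torch_dispatch_mode_stack`, ...) intercept every op issued in the region, including factory functions that take no tensor arguments. A minimal sketch using the same API the tests exercise; illustrative only, `RecordingMode` and `calls` are made-up names:

import torch
from torch.utils._python_dispatch import TorchDispatchMode, enable_torch_dispatch_mode

calls = []

class RecordingMode(TorchDispatchMode):
    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
        if kwargs is None:
            kwargs = {}
        # Record the op name, then run the op normally.
        calls.append(func.overloadpacket.__name__)
        return func(*args, **kwargs)

with enable_torch_dispatch_mode(RecordingMode(inner=None)):
    x = torch.ones(2)   # factory functions are seen by the mode too
    y = x + x

print(calls)  # expected to contain something like ['ones', 'add']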
+ true_t = torch.rand(2, requires_grad=True) + t = LoggingTensorReentrant(true_t) + + out = t + 2 + + self.assertFalse(out.requires_grad) + self.assertIsNone(out.grad_fn) + + self.assertTrue(out.elem.requires_grad) + self.assertIsNotNone(out.elem.grad_fn) + + with self.assertRaisesRegex(RuntimeError, "does not require grad"): + out.sum().backward() + + out.elem.sum().backward() + + self.assertIsNone(t.grad) + self.assertIsNotNone(t.elem.grad) + + def test_dispatch_super_call(self): + called = [] + + class SubTensor(torch.Tensor): + @staticmethod + def __new__(cls, elem): + return torch.Tensor._make_subclass(cls, elem) + + __torch_function__ = torch._C._disabled_torch_function_impl + + @classmethod + def __torch_dispatch__(cls, func, types, args=(), kwargs=None): + called.append(func) + return super().__torch_dispatch__(func, types, args, kwargs) + + x = torch.randn(2) + y = torch.randn(2) + self.assertEqual(SubTensor(x) + SubTensor(y), x + y) + self.assertEqual(called, [torch.ops.aten.add.Tensor]) + + def test_dispatch_super_call_list_arg(self): + called = [] + + class SubTensorWithListArg(torch.Tensor): + @staticmethod + def __new__(cls, elem): + return torch.Tensor._make_subclass(cls, elem) + + __torch_function__ = torch._C._disabled_torch_function_impl + + @classmethod + def __torch_dispatch__(cls, func, types, args=(), kwargs=None): + called.append(func) + return super().__torch_dispatch__(func, types, list(args), kwargs) + + x = torch.randn(2) + self.assertEqual(SubTensorWithListArg(x).neg(), x.neg()) + self.assertEqual(called, [torch.ops.aten.neg.default]) + + def test_dispatch_super_dont_autograd(self): + called = [] + + class SubTensor(torch.Tensor): + @staticmethod + def __new__(cls, elem): + return torch.Tensor._make_subclass(cls, elem, elem.requires_grad) + + __torch_function__ = torch._C._disabled_torch_function_impl + + @classmethod + def __torch_dispatch__(cls, func, types, args=(), kwargs=None): + called.append(func) + # This argument still requires grad because it was passed + # through directly... + self.assertTrue(args[0].requires_grad) + r = super().__torch_dispatch__(func, types, args, kwargs) + # But the output better not require grad, because that means + # you did autograd again in torch dispatch (oops) + self.assertFalse(r.requires_grad) + return r + + x = SubTensor(torch.randn(2, requires_grad=True)) + x.neg() + self.assertEqual(called, [torch.ops.aten.neg.default]) + + def test_set_data(self): + called = 0 + + class SubTensor(torch.Tensor): + __torch_function__ = torch._C._disabled_torch_function_impl + + @classmethod + def __torch_dispatch__(cls, func, types, args=(), kwargs=None): + nonlocal called + called += 1 + return super().__torch_dispatch__(func, types, args, kwargs) + + x = SubTensor(torch.empty(2)) + x.data + self.assertEqual(called, 1) + x.data = torch.empty(2) + self.assertEqual(called, 1) + x.data + self.assertEqual(called, 2) + self.assertIs(type(x), SubTensor) + x.set_(torch.empty(2)) + self.assertEqual(called, 3) + x.data + self.assertEqual(called, 4) + self.assertIs(type(x), SubTensor) + + def test_construct_int_tensor(self): + class SubTensor(torch.Tensor): + pass + # should not fail + SubTensor(torch.zeros(2, dtype=torch.int)) + + def test_multiple_ops_subclass(self): + # This is a Direct Subclass, don't do that! 
+ class MySubclass(torch.Tensor): + @staticmethod + def __new__(cls, elem): + r = torch.Tensor._make_subclass(cls, elem) + return r + + __torch_function__ = torch._C._disabled_torch_function_impl + + @classmethod + def __torch_dispatch__(cls, func, types, args=(), kwargs=None): + with no_dispatch(): + return func(*args, **kwargs) + + x = MySubclass(torch.rand(2, 2, dtype=torch.complex64)) + y = x.conj() + # Details of the bug that this tests for: + # Here, y dispatch keys are: {PythonTLSSnapshot, AutogradCPU, Conjugate, Python, CPU} + # There are a few calls to the dispatcher that are going to happen here: + # - call_exp: User calling exp on y + # - PythonTLSSnapshot: records the TLS on entry and redispatch + # - AutogradCPU: no input requires grad, so does nothing and redispatch + # - Conjugate: no special implementation for exp: use the fallback that + # first clone the Tensor (to materialize the conj) then redispatch + # - call_clone: conjugate fallback calling clone on y + # - PythonTLSSnapshot: records the TLS on entry and redispatch + # - (AutogradCPU: skipped as autograd added itself to the exclude set above) + # - Conjugate: special implementation for clone: just skip this key + # - Python: Reset the TLS based on the snapshot above and call the user implementation (this + # actually calls into the dispatcher again but since we disable both our keys + # before, not detailed here) + # - exit Python: restore the TLS and exit + # - exit Conjugate: nothing was inplace so just exit + # - exit PythonTLSSnapshot: done with this call, reset the saved TLS to empty + # - Python: Reset the TLS again based on the snapshot. <- this used to fail + # - More steps.... + y.exp() + + def test_is_contiguous_slow_path(self): + data = torch.randn(3, 3) + contiguous_data = data.clone() + not_contiguous_data = torch.as_strided(data.clone(), (2, 2), (1, 2)) + + def subclass_helper(cls, data, use_wrapper_subclass): + if use_wrapper_subclass: + kwargs = {} + kwargs["device"] = data.device + kwargs["dtype"] = data.dtype + kwargs["layout"] = data.layout + kwargs["requires_grad"] = True + kwargs['dispatch_strides'] = True + return torch.Tensor._make_wrapper_subclass(cls, data.size(), **kwargs) # type: ignore[attr-defined] + else: + return torch.Tensor._make_subclass(cls, data, True, dispatch_strides=True) + + for use_wrapper_subclass in [True, False]: + class ExampleTensor1(torch.Tensor): + @staticmethod + def __new__(cls, data, wrapper): + return subclass_helper(cls, data, wrapper) + + @classmethod + def __torch_dispatch__(cls, func, types, args, kwargs): + return NotImplemented + + class ExampleTensor2(torch.Tensor): + @staticmethod + def __new__(cls, data, wrapper): + return subclass_helper(cls, data, wrapper) + + @classmethod + def __torch_dispatch__(cls, func, types, args, kwargs): + if func.overloadpacket == torch.ops.aten.is_contiguous: + return contiguous_data.is_contiguous() + return NotImplemented + + class ExampleTensor3(torch.Tensor): + @staticmethod + def __new__(cls, data, wrapper): + return subclass_helper(cls, data, wrapper) + + @classmethod + def __torch_dispatch__(cls, func, types, args, kwargs): + if func.overloadpacket == torch.ops.aten.is_contiguous: + return not_contiguous_data.is_contiguous() + return NotImplemented + + + err_msg = "no implementation found for 'torch.ops.aten.is_contiguous'" + e = ExampleTensor1(torch.randn(3, 3), use_wrapper_subclass) + with self.assertRaisesRegex(TypeError, err_msg): + e.is_contiguous() + with self.assertRaisesRegex(TypeError, err_msg): + e.contiguous() 
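The dispatch-key walkthrough in `test_multiple_ops_subclass` above describes how the Conjugate key keeps `conj()` lazy and only materializes it (via a clone) when an op such as `exp` has no conj-aware kernel. That part can be observed on plain tensors as well; a small sketch, not part of the patch:

import torch

x = torch.rand(2, 2, dtype=torch.complex64)
y = x.conj()
print(y.is_conj())  # True: the conjugation is only recorded as a bit on the tensor
z = y.exp()         # the conjugate fallback materializes y (clone) before running exp
print(torch.allclose(z, x.conj_physical().exp()))  # True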
+ + e = ExampleTensor2(torch.randn(3, 3), use_wrapper_subclass) + self.assertEqual(e.is_contiguous(), True) + e.contiguous() # this will just return the original TensorImpl since is_contiguous = True + + err_msg = "no implementation found for" + e = ExampleTensor3(torch.randn(3, 3), use_wrapper_subclass) + self.assertEqual(e.is_contiguous(), False) + with self.assertRaisesRegex(TypeError, err_msg): + e.contiguous() + if __name__ == '__main__': run_tests() diff --git a/test/test_pytree.py b/test/test_pytree.py index 81631c45c3fd..6a1c750d49b6 100644 --- a/test/test_pytree.py +++ b/test/test_pytree.py @@ -1,10 +1,11 @@ -# Owner(s): ["high priority"] +# Owner(s): ["module: pytree"] import torch from torch.testing._internal.common_utils import TestCase, run_tests from torch.utils._pytree import tree_flatten, tree_map, tree_unflatten, TreeSpec, LeafSpec from torch.utils._pytree import _broadcast_to_and_flatten from collections import namedtuple +from torch.testing._internal.common_utils import parametrize, subtest, instantiate_parametrized_tests class TestPytree(TestCase): def test_treespec_equality(self): @@ -79,11 +80,18 @@ def run_test(tup): run_test(Point(1., 2)) run_test(Point(torch.tensor(1.), 2)) - def test_flatten_unflatten_torch_namedtuple_return_type(self): + @parametrize("op", [ + subtest(torch.max, name='max'), + subtest(torch.min, name='min'), + ]) + def test_flatten_unflatten_return_type(self, op): x = torch.randn(3, 3) - expected = torch.max(x, dim=0) + expected = op(x, dim=0) values, spec = tree_flatten(expected) + # Check that values is actually List[Tensor] and not (ReturnType(...),) + for value in values: + self.assertTrue(isinstance(value, torch.Tensor)) result = tree_unflatten(values, spec) self.assertEqual(type(result), type(expected)) @@ -204,5 +212,7 @@ def test_broadcast_to_and_flatten(self): self.assertEqual(result, expected, msg=str([pytree, to_spec, expected])) +instantiate_parametrized_tests(TestPytree) + if __name__ == '__main__': run_tests() diff --git a/test/test_quantization.py b/test/test_quantization.py index e646750a623e..83d80deece2c 100644 --- a/test/test_quantization.py +++ b/test/test_quantization.py @@ -31,6 +31,7 @@ from quantization.core.test_workflow_module import TestObserver # noqa: F401 from quantization.core.test_quantized_module import TestStaticQuantizedModule # noqa: F401 from quantization.core.test_quantized_module import TestDynamicQuantizedModule # noqa: F401 +from quantization.core.test_quantized_module import TestReferenceQuantizedModule # noqa: F401 from quantization.core.test_workflow_module import TestRecordHistogramObserver # noqa: F401 from quantization.core.test_workflow_module import TestHistogramObserver # noqa: F401 from quantization.core.test_workflow_module import TestDistributed # noqa: F401 diff --git a/test/test_reductions.py b/test/test_reductions.py index 03d70db31ae0..6e422c9b6c3e 100644 --- a/test/test_reductions.py +++ b/test/test_reductions.py @@ -13,8 +13,8 @@ from torch._six import inf, nan from torch.testing import make_tensor from torch.testing._internal.common_dtype import ( - get_all_dtypes, get_all_math_dtypes, get_all_int_dtypes, get_all_complex_dtypes, get_all_fp_dtypes, - integral_types_and, floating_and_complex_types_and + all_types_and_complex_and, get_all_math_dtypes, integral_types, complex_types, floating_types_and, + integral_types_and, floating_and_complex_types_and, all_types_and, ) from torch.testing._internal.common_utils import ( TestCase, run_tests, skipIfNoSciPy, slowTest, 
torch_to_numpy_dtype_dict, @@ -99,7 +99,7 @@ class TestReductions(TestCase): def _test_dim_keepdim(self, op: ReductionOpInfo, device, *, ndim, **dim_keepdim): """Tests output shape for input with ndim and dim and keepdim kwargs""" shape = torch.randint(2, 5, (ndim,)).tolist() - t = make_tensor(shape, device, torch.float) + t = make_tensor(shape, dtype=torch.float, device=device) args, kwargs = next(op.generate_args_kwargs(t, **dim_keepdim)) result = op(t, *args, **dim_keepdim, **kwargs) expected_shape = _reduced_shape(shape, **dim_keepdim) @@ -207,14 +207,14 @@ def test_dim_offbounds(self, device, op: ReductionOpInfo): def test_dim_ndim_limit(self, device, op: ReductionOpInfo): """Tests that an exception is raised when reducing a tensor with more than 64 dims along some specific dimensions. dim=None is ok""" - t = make_tensor([1] * 65, device, torch.float) + t = make_tensor([1] * 65, dtype=torch.float, device=device) with self.assertRaisesRegex(RuntimeError, "only tensors with up to 64 dims are supported"): op(t, dim=0) @ops(filter(lambda op: op.identity is not None, reduction_ops), dtypes=OpDTypes.supported) def test_identity(self, device, dtype, op: ReductionOpInfo): """Tests that the identity value is an identity for the operator""" - t = make_tensor((10,), device, dtype) + t = make_tensor((10,), dtype=dtype, device=device) t[1::2] = op.identity args, kwargs = next(op.generate_args_kwargs(t)) result = op(t[::2], *args, **kwargs) @@ -230,7 +230,7 @@ def test_identity(self, device, dtype, op: ReductionOpInfo): allowed_dtypes=floating_and_complex_types_and(torch.bfloat16, torch.float16)) def test_nan_policy_propagate(self, device, dtype, op: ReductionOpInfo): """Tests that nan is propagated to the output by default""" - t = make_tensor((5,), device, dtype) + t = make_tensor((5,), dtype=dtype, device=device) t[2] = torch.nan args, kwargs = next(op.generate_args_kwargs(t)) result = op(t, *args, **kwargs) @@ -240,7 +240,7 @@ def test_nan_policy_propagate(self, device, dtype, op: ReductionOpInfo): allowed_dtypes=floating_and_complex_types_and(torch.bfloat16, torch.float16)) def test_nan_policy_omit(self, device, dtype, op: ReductionOpInfo): """Tests that NaN values do not affect the result.""" - t = make_tensor((10,), device, dtype) + t = make_tensor((10,), dtype=dtype, device=device) t[1::2] = torch.nan args, kwargs = next(op.generate_args_kwargs(t)) result = op(t[::2], *args, **kwargs) @@ -250,7 +250,7 @@ def test_nan_policy_omit(self, device, dtype, op: ReductionOpInfo): @ops(reduction_ops, dtypes=OpDTypes.supported) def test_result_dtype(self, device, dtype, op: ReductionOpInfo): """Tests that the result has the correct dtype""" - t = make_tensor((5,), device, dtype) + t = make_tensor((5,), dtype=dtype, device=device) args, kwargs = next(op.generate_args_kwargs(t)) result: torch.Tensor = op(t, *args, **kwargs) is_integral = dtype in integral_types_and(torch.bool) @@ -275,7 +275,7 @@ def test_empty_tensor_empty_slice(self, device, op: ReductionOpInfo): See discussion here https://github.com/pytorch/pytorch/issues/61901 """ - t = make_tensor((0, 2, 3), device, torch.float) + t = make_tensor((0, 2, 3), dtype=torch.float, device=device) for dim in [0] + [[0, 2]] if op.supports_multiple_dims else []: args, kwargs = next(op.generate_args_kwargs(t, dim=dim)) if op.identity is not None: @@ -295,7 +295,7 @@ def test_empty_tensor_empty_slice(self, device, op: ReductionOpInfo): def test_empty_tensor_nonempty_slice(self, device, op: ReductionOpInfo): """Tests that reducing a nonempty slice of an empty 
tensor returns an empty tensor with the dimensions reduced.""" - t = make_tensor((0, 2, 3), device, torch.float) + t = make_tensor((0, 2, 3), dtype=torch.float, device=device) for dim in [1] + [[1, 2]] if op.supports_multiple_dims else []: args, kwargs = next(op.generate_args_kwargs(t, dim=dim)) result = op(t, *args, dim=dim, **kwargs) @@ -315,31 +315,31 @@ def _test_noncontiguous(self, op: ReductionOpInfo, t: torch.Tensor, **reduction_ @ops(reduction_ops) def test_noncontiguous_innermost(self, device, dtype, op: ReductionOpInfo): """Tests reducing along noncontiguous innermost dimension.""" - t = make_tensor((10, 10), device, dtype, low=-1, high=1) + t = make_tensor((10, 10), dtype=dtype, device=device, low=-1, high=1) self._test_noncontiguous(op, t[:, ::2], dim=1) @ops(reduction_ops) def test_noncontiguous_outermost(self, device, dtype, op: ReductionOpInfo): """Tests reducing along noncontiguous outermost dimension.""" - t = make_tensor((10, 10), device, dtype, low=-1, high=1) + t = make_tensor((10, 10), dtype=dtype, device=device, low=-1, high=1) self._test_noncontiguous(op, t[::2, :], dim=0) @ops(reduction_ops) def test_noncontiguous_all(self, device, dtype, op: ReductionOpInfo): """Tests reducing all dimensions of a noncontiguous tensor.""" - t = make_tensor((5, 5, 5), device, dtype, low=-1, high=1) + t = make_tensor((5, 5, 5), dtype=dtype, device=device, low=-1, high=1) self._test_noncontiguous(op, t[::2, ::3, 1:-1:2]) @ops(reduction_ops) def test_noncontiguous_transposed(self, device, dtype, op: ReductionOpInfo): """Tests reducing a transposed tensor.""" - t = make_tensor((5, 5), device, dtype, low=-1, high=1) + t = make_tensor((5, 5), dtype=dtype, device=device, low=-1, high=1) self._test_noncontiguous(op, t.T) @ops(reduction_ops) def test_noncontiguous_expanded(self, device, dtype, op: ReductionOpInfo): """Tests reducing a tensor with expanded singleton dimensions.""" - t = make_tensor((2, 3), device, dtype, low=-1, high=1) + t = make_tensor((2, 3), dtype=dtype, device=device, low=-1, high=1) self._test_noncontiguous(op, t.unsqueeze(1).expand(-1, 5, -1)) # NumPy does not support BFloat16 so we don't test that against reference @@ -357,16 +357,16 @@ def _test_ref(self, op: ReductionOpInfo, t: torch.Tensor, **reduction_kwargs): self.assertEqual(result, expected, exact_dtype=False) @ops(filter(lambda op: op.ref is not None, reduction_ops), - allowed_dtypes=get_all_dtypes(include_bfloat16=False)) + allowed_dtypes=all_types_and_complex_and(torch.half, torch.bool)) def test_ref_scalar_input(self, device, dtype, op: ReductionOpInfo): """Compares op against reference for scalar input tensors""" - self._test_ref(op, make_tensor([], device, dtype)) + self._test_ref(op, make_tensor([], dtype=dtype, device=device)) @ops(filter(lambda op: op.ref is not None, reduction_ops), - allowed_dtypes=get_all_dtypes(include_bfloat16=False)) + allowed_dtypes=all_types_and_complex_and(torch.half, torch.bool)) def test_ref_small_input(self, device, dtype, op: ReductionOpInfo): """Compares op against reference for small input tensors""" - t = make_tensor((5, 3, 4, 2), device, dtype, low=-2, high=2, exclude_zero=True) + t = make_tensor((5, 3, 4, 2), dtype=dtype, device=device, low=-2, high=2, exclude_zero=True) self._test_ref(op, t) for dim in [0, 1, 3] + ([[0, 2], [1, 3]] if op.supports_multiple_dims else []): self._test_ref(op, t, dim=dim) @@ -375,26 +375,27 @@ def test_ref_small_input(self, device, dtype, op: ReductionOpInfo): allowed_dtypes=[torch.float64]) def test_ref_large_input_1D(self, device, dtype, 
op: ReductionOpInfo): """Compares op against reference for a large 1D input tensor to check stability""" - self._test_ref(op, make_tensor((2 ** 20,), device, dtype, low=-1, high=1, exclude_zero=True)) + self._test_ref(op, make_tensor((2 ** 20,), dtype=dtype, device=device, low=-1, high=1, exclude_zero=True)) @ops(filter(lambda op: op.ref is not None, reduction_ops), allowed_dtypes=[torch.float64]) def test_ref_large_input_2D(self, device, dtype, op: ReductionOpInfo): """Compares op against reference for a large 2D input tensor to test parallelism""" - t = make_tensor((32, 2 ** 16), device, dtype, low=-1, high=1, exclude_zero=True) + t = make_tensor((32, 2 ** 16), dtype=dtype, device=device, low=-1, high=1, exclude_zero=True) self._test_ref(op, t, dim=1) + @largeTensorTest("8gb") @ops(filter(lambda op: op.ref is not None, reduction_ops), allowed_dtypes=[torch.float64]) def test_ref_large_input_64bit_indexing(self, device, dtype, op: ReductionOpInfo): """Compares op against reference for a very large input tensor that requires 64 bit indexing""" - self._test_ref(op, make_tensor((275000000,), device, dtype, low=-1, high=1, exclude_zero=True)) + self._test_ref(op, make_tensor((275000000,), dtype=dtype, device=device, low=-1, high=1, exclude_zero=True)) @ops(filter(lambda op: op.ref is not None, reduction_ops), - allowed_dtypes=get_all_dtypes(include_bfloat16=False)) + allowed_dtypes=all_types_and_complex_and(torch.half, torch.bool)) def test_ref_duplicate_values(self, device, dtype, op: ReductionOpInfo): """Compares op against reference for input tensors with duplicate values""" - t = make_tensor((4, 4), device, dtype, low=-2, high=2, exclude_zero=True) + t = make_tensor((4, 4), dtype=dtype, device=device, low=-2, high=2, exclude_zero=True) t[::2, ::2] = t[1::2, 1::2] self._test_ref(op, t) self._test_ref(op, t, dim=0) @@ -404,7 +405,7 @@ def test_ref_duplicate_values(self, device, dtype, op: ReductionOpInfo): allowed_dtypes=[torch.float32, torch.complex64]) def test_ref_extremal_values(self, device, dtype, op: ReductionOpInfo): """Compares op against reference for input tensors with extremal values""" - t = make_tensor((5,), device, dtype, exclude_zero=True) + t = make_tensor((5,), dtype=dtype, device=device, exclude_zero=True) extremals = [0, 1, nan, inf, -inf] for extremal in extremals: t[2] = extremal @@ -452,7 +453,7 @@ def test_dim_reduction_less_than_64(self, device): sizes = [1] * 65 x = torch.randn(sizes, device=device) ops = [torch.mean, torch.sum, torch.nansum, torch.std, torch.logsumexp, torch.std, torch.var, - torch.amin, torch.amax, torch.norm] + torch.norm] for op in ops: with self.assertRaisesRegex(RuntimeError, "only tensors with up to 64 dims are supported"): op(x, 64) @@ -743,6 +744,15 @@ def test_logsumexp_dim(self, device): lambda n, d: logsumexp(n, d), use_integral=False) + @onlyCPU + def test_mean_int_with_optdtype(self, device): + a = make_tensor((3, 4, 5), dtype=torch.int64, device=device) + + # If the optional desired output type is given, the input + # is internally cast. 
+ a_float = a.to(torch.float32) + self.assertEqual(a_float.mean(), a.mean(dtype=torch.float32)) + # TODO: update this and tests that use it to handle device properly def _test_reduce_integer_upcast(self, fn, has_out=True, test_complex=True): shape = (3, 4, 5) @@ -1101,6 +1111,10 @@ def test_bincount(self, device): self.assertEqual( torch.tensor([1, 1, 1, 2], dtype=torch.int64, device=device), long_counts) + # test avoiding overflow for uint8 (#76979) + count_uint8 = torch.tensor([0, 1, 2, 3, 255], dtype=torch.uint8, device=device).bincount() + count_int16 = torch.tensor([0, 1, 2, 3, 255], dtype=torch.int16, device=device).bincount() + self.assertEqual(count_uint8, count_int16) # test minlength functionality int_counts = torch.bincount( torch.tensor([1, 1, 1, 1], device=device), minlength=5) @@ -1415,7 +1429,7 @@ def test_dtype_bfloat16(values_bf16=False, boundaries_bf16=False): test_dtype_bfloat16(False, True) test_dtype_bfloat16(True, True) - @dtypes(*get_all_dtypes(include_bool=False, include_complex=False)) + @dtypes(*all_types_and(torch.half, torch.bfloat16)) def test_nansum(self, device, dtype): args = product( (True, False), # noncontiguous @@ -1468,15 +1482,14 @@ def _test_reduction_function_with_numpy(self, torch_func, np_func, device, dtype self.compare_with_numpy(torch_func_partial, np_func_partial, x, device=None, dtype=None, atol=atol, rtol=rtol, exact_dtype=exact_dtype) - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False) + - get_all_complex_dtypes())) + @dtypes(*all_types_and_complex_and(torch.half)) def test_count_nonzero(self, device, dtype): self._test_reduction_function_with_numpy(torch.count_nonzero, np.count_nonzero, device, dtype) self._test_reduction_function_with_numpy(torch.count_nonzero, np.count_nonzero, device, dtype, True) def _test_sum_reduction_vs_numpy(self, torch_fn, np_fn, device, dtype, with_keepdim=False, with_extremal=False): def is_integral(dtype): - return dtype in get_all_int_dtypes() + return dtype in integral_types() # On Windows CI, the current version of `numpy` promotes all lower integers # dtypes to int32 while `torch` promotes them to int64. 
Hence we skip on checking @@ -1505,28 +1518,30 @@ def is_integral(dtype): with_keepdim=with_keepdim, with_extremal=with_extremal) @onlyNativeDeviceTypes - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False))) + @dtypes(*all_types_and(torch.half)) def test_sum_vs_numpy(self, device, dtype): self._test_sum_reduction_vs_numpy(torch.sum, np.sum, device, dtype) self._test_sum_reduction_vs_numpy(torch.sum, np.sum, device, dtype, with_extremal=True) self._test_sum_reduction_vs_numpy(torch.sum, np.sum, device, dtype, with_keepdim=True) @onlyNativeDeviceTypes - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False))) + @dtypes(*all_types_and(torch.half)) def test_nansum_vs_numpy(self, device, dtype): self._test_sum_reduction_vs_numpy(torch.nansum, np.nansum, device, dtype) self._test_sum_reduction_vs_numpy(torch.nansum, np.nansum, device, dtype, with_extremal=True) self._test_sum_reduction_vs_numpy(torch.nansum, np.nansum, device, dtype, with_keepdim=True) - @dtypes(*(get_all_complex_dtypes())) + @dtypes(*complex_types()) def test_nansum_complex(self, device, dtype): x = torch.randn((3, 3, 3), device=device, dtype=dtype) with self.assertRaisesRegex(RuntimeError, "nansum does not support complex inputs"): torch.nansum(x) - def test_nansum_out_dtype(self, device): - dtypes = list(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False)) - for inp_dtype, out_dtype in combinations(dtypes, 2): + @dtypes(*all_types_and(torch.half)) + def test_nansum_out_dtype(self, device, dtype): + out_dtype = dtype + inp_dtypes = all_types_and(torch.half) if out_dtype.is_floating_point else integral_types() + for inp_dtype in inp_dtypes: shape = _rand_shape(random.randint(2, 5), min_size=5, max_size=10) x = _generate_input(shape, inp_dtype, device, with_extremal=False) torch_fn = partial(torch.nansum, dtype=out_dtype) @@ -1534,7 +1549,7 @@ def test_nansum_out_dtype(self, device): np_fn = partial(np.nansum, dtype=np_out_dtype) self.compare_with_numpy(torch_fn, np_fn, x, device=None, dtype=None) - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False))) + @dtypes(*all_types_and(torch.half)) def test_argminmax_multiple(self, device, dtype): # Case: All Ones t = torch.ones(3, 3, device=device, dtype=dtype) @@ -1542,7 +1557,7 @@ def test_argminmax_multiple(self, device, dtype): self.compare_with_numpy(torch.argmin, np.argmin, t) # Case: With single `nan` present. - if dtype in get_all_fp_dtypes(): + if dtype in floating_types_and(torch.half, torch.bfloat16): t[2, 2] = float('nan') self.compare_with_numpy(torch.argmax, np.argmax, t) self.compare_with_numpy(torch.argmin, np.argmin, t) @@ -1619,8 +1634,7 @@ def verify_against_numpy(t): [0, 0]], device=device, dtype=dtype) verify_against_numpy(t) - @dtypes(*(get_all_dtypes(include_half=True, include_bfloat16=False, - include_bool=True, include_complex=True))) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool)) def test_all_any_vs_numpy(self, device, dtype): # Note [all, any uint8 compatibility]: However for compatibility reason, # for `uint8`, they return Tensor of same dtype `uint8`. 
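As the compatibility note above says, `all`/`any` keep the `uint8` dtype for `uint8` inputs instead of returning `bool`. A small sketch of the behaviour being tested; illustrative only:

import torch

b = torch.tensor([1, 0, 1], dtype=torch.bool)
u = torch.tensor([1, 0, 1], dtype=torch.uint8)

print(b.all().dtype)  # torch.bool
print(u.all().dtype)  # torch.uint8 (legacy behaviour kept for backward compatibility)
print(u.any().dtype)  # torch.uint8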
@@ -1735,7 +1749,7 @@ def _test_output_dtype(x): @onlyNativeDeviceTypes def test_repeated_dim(self, device): ops = [torch.mean, torch.sum, torch.nansum, torch.std, torch.logsumexp, torch.std, torch.var, - torch.amin, torch.amax, torch.norm] + torch.norm] x = torch.randn(3, 3, 3, 3, device=device) error_msg = r'appears multiple times in the list of dims' @@ -1835,10 +1849,6 @@ def test_minmax_illegal_dtype(self, device): torch.max(x, dim=0, out=(illegal_values, valid_indices)) with self.assertRaisesRegex(RuntimeError, rmsg): torch.min(x, dim=0, out=(illegal_values, valid_indices)) - with self.assertRaisesRegex(RuntimeError, rmsg): - torch.amax(x, dim=0, out=illegal_values) - with self.assertRaisesRegex(RuntimeError, rmsg): - torch.amin(x, dim=0, out=illegal_values) with self.assertRaisesRegex(RuntimeError, rmsg): torch.max(x, dim=0, out=(valid_values, illegal_indices)) with self.assertRaisesRegex(RuntimeError, rmsg): @@ -1848,7 +1858,7 @@ def test_minmax_illegal_dtype(self, device): with self.assertRaisesRegex(RuntimeError, rmsg): torch.min(x, dim=0, out=(illegal_values, illegal_indices)) - @dtypes(*get_all_dtypes(include_bool=False, include_complex=False)) + @dtypes(*all_types_and(torch.half, torch.bfloat16)) def test_dim_arg_reduction_scalar(self, device, dtype): example = 4.0 @@ -1866,7 +1876,7 @@ def test_dim_arg_reduction_scalar(self, device, dtype): @precisionOverride({torch.float16: 1e-2, torch.bfloat16: 1e-2}) - @dtypes(*(set(get_all_dtypes(include_bool=False, include_complex=False)) - {torch.uint8})) + @dtypes(*set(all_types_and(torch.half, torch.bfloat16)) - {torch.uint8}) def test_dim_reduction(self, device, dtype): example = [[-1, 2, 1], [5, 3, 6]] @@ -2792,6 +2802,15 @@ def test_against_np(tensor, bins=100, min=0, max=0): expanded = torch.randn(1, 5, 1, 2, device=device).expand(3, 5, 7, 2) test_against_np(expanded) + @onlyCPU + def test_histc_bfloat16(self, device): + actual = torch.histc( + torch.tensor([1, 2, 1], dtype=torch.bfloat16, device=device), bins=4, min=0, max=3) + self.assertEqual( + torch.tensor([0, 2, 1, 0], dtype=torch.bfloat16, device=device), + actual) + self.assertEqual(actual.dtype, torch.bfloat16) + """ Runs torch.histogram and numpy.histogram on the specified input parameters and asserts that their output is equal. 
@@ -2862,8 +2881,8 @@ def test_histogram(self, device, dtype): for contig, bins_contig, bin_ct, weighted, density, shape in \ product([True, False], [True, False], range(1, 10), [True, False], [True, False], shapes): - values = make_tensor(shape, device, dtype, low=-9, high=9, noncontiguous=not contig) - weights = make_tensor(shape, device, dtype, low=0, high=9, noncontiguous=not contig) if weighted else None + values = make_tensor(shape, dtype=dtype, device=device, low=-9, high=9, noncontiguous=not contig) + weights = make_tensor(shape, dtype=dtype, device=device, low=0, high=9, noncontiguous=not contig) if weighted else None # Tests passing just the bin_ct self._test_histogram_numpy(values, bin_ct, None, weights, density) @@ -2877,10 +2896,10 @@ def test_histogram(self, device, dtype): self._test_histogram_numpy(values, bin_ct, bin_range, weights, density) # Tests with caller-specified bin edges - bin_edges = make_tensor(bin_ct + 1, device, dtype, low=-9, high=9).msort() + bin_edges = make_tensor(bin_ct + 1, dtype=dtype, device=device, low=-9, high=9).msort() if not bins_contig: # Necessary because msort always produces contiguous output - bin_edges_noncontig = make_tensor(bin_ct + 1, device, dtype, noncontiguous=not bins_contig) + bin_edges_noncontig = make_tensor(bin_ct + 1, dtype=dtype, device=device, noncontiguous=not bins_contig) bin_edges_noncontig.copy_(bin_edges) bin_edges = bin_edges_noncontig self.assertEqual(bin_edges.is_contiguous(), bins_contig) @@ -2888,17 +2907,21 @@ def test_histogram(self, device, dtype): # Tests with input tensor in which all elements are equal elt = random.uniform(-9, 9) - values = make_tensor(shape, device, dtype, low=elt, high=elt, noncontiguous=not contig) + values = make_tensor(shape, dtype=dtype, device=device, low=elt, high=elt, noncontiguous=not contig) self._test_histogram_numpy(values, bin_ct, bin_range, weights, density) self._test_histogram_numpy(values, bin_edges, None, weights, density) # Tests with input equal to bin_edges - weights = make_tensor(bin_ct + 1, device, dtype, low=0, high=9, noncontiguous=not contig) if weighted else None + weights = ( + make_tensor(bin_ct + 1, dtype=dtype, device=device, low=0, high=9, noncontiguous=not contig) + if weighted + else None + ) self._test_histogram_numpy(bin_edges, bin_edges, None, weights, density) # Tests values of default args for bin_ct, shape in product(range(1, 10), shapes): - values = make_tensor(shape, device, dtype, low=-9, high=9) + values = make_tensor(shape, dtype=dtype, device=device, low=-9, high=9) (actual_hist, actual_bin_edges) = torch.histogram(values, bin_ct) (expected_hist, expected_bin_edges) = torch.histogram( values, bin_ct, range=None, weight=None, density=False) @@ -2982,8 +3005,12 @@ def test_histogramdd(self, device, dtype): product([True, False], [True, False], [True, False], [True, False], shapes): D = shape[-1] - values = make_tensor(shape, device, dtype, low=-9, high=9, noncontiguous=not contig) - weights = make_tensor(shape[:-1], device, dtype, low=0, high=9, noncontiguous=not contig) if weighted else None + values = make_tensor(shape, dtype=dtype, device=device, low=-9, high=9, noncontiguous=not contig) + weights = ( + make_tensor(shape[:-1], dtype=dtype, device=device, low=0, high=9, noncontiguous=not contig) + if weighted + else None + ) # Tests passing a single bin count bin_ct = random.randint(1, 5) @@ -3004,10 +3031,13 @@ def test_histogramdd(self, device, dtype): self._test_histogramdd_numpy(values, bin_ct, bin_range, weights, density) # Tests with 
caller-specified bin edges - bin_edges = [make_tensor(ct + 1, device, dtype, low=-9, high=9).msort() for ct in bin_ct] + bin_edges = [make_tensor(ct + 1, dtype=dtype, device=device, low=-9, high=9).msort() for ct in bin_ct] if not bins_contig: # Necessary because msort always produces contiguous output - bin_edges_noncontig = [make_tensor(ct + 1, device, dtype, noncontiguous=not bins_contig) for ct in bin_ct] + bin_edges_noncontig = [ + make_tensor(ct + 1, dtype=dtype, device=device, noncontiguous=not bins_contig) + for ct in bin_ct + ] for dim in range(D): bin_edges_noncontig[dim].copy_(bin_edges[dim]) bin_edges = bin_edges_noncontig @@ -3019,58 +3049,58 @@ def test_histogramdd(self, device, dtype): @dtypes(torch.float32) def test_histogram_error_handling(self, device, dtype): with self.assertRaisesRegex(RuntimeError, 'not implemented for'): - values = make_tensor((), device, dtype=torch.int32) + values = make_tensor((), dtype=torch.int32, device=device) torch.histogram(values, 1) inconsistent_dtype = torch.float32 if dtype != torch.float32 else torch.float64 with self.assertRaisesRegex(RuntimeError, 'input tensor and bins tensors should have the same dtype'): - values = make_tensor((), device, dtype=dtype) - bins = make_tensor((), device, dtype=inconsistent_dtype) + values = make_tensor((), dtype=dtype, device=device) + bins = make_tensor((), dtype=inconsistent_dtype, device=device) torch.histogram(values, bins) with self.assertRaisesRegex(RuntimeError, 'input tensor and weight tensor should have the same dtype'): - values = make_tensor((), device, dtype=dtype) - weight = make_tensor((), device, dtype=inconsistent_dtype) + values = make_tensor((), dtype=dtype, device=device) + weight = make_tensor((), dtype=inconsistent_dtype, device=device) torch.histogram(values, 1, weight=weight) with self.assertRaisesRegex(RuntimeError, 'input tensor and hist tensor should have the same dtype'): - values = make_tensor((), device, dtype=dtype) - hist = make_tensor((), device, dtype=inconsistent_dtype) - bin_edges = make_tensor((), device, dtype=dtype) + values = make_tensor((), dtype=dtype, device=device) + hist = make_tensor((), dtype=inconsistent_dtype, device=device) + bin_edges = make_tensor((), dtype=dtype, device=device) torch.histogram(values, 1, out=(hist, bin_edges)) with self.assertRaisesRegex(RuntimeError, 'input tensor and bin_edges tensor should have the same dtype'): - values = make_tensor((), device, dtype=dtype) - hist = make_tensor((), device, dtype=dtype) - bin_edges = make_tensor((), device, dtype=inconsistent_dtype) + values = make_tensor((), dtype=dtype, device=device) + hist = make_tensor((), dtype=dtype, device=device) + bin_edges = make_tensor((), dtype=inconsistent_dtype, device=device) torch.histogram(values, 1, out=(hist, bin_edges)) with self.assertRaisesRegex(RuntimeError, 'bins tensor should have one dimension'): - t = make_tensor((2, 2), device, dtype=dtype) + t = make_tensor((2, 2), dtype=dtype, device=device) torch.histogram(t, t) with self.assertRaisesRegex(RuntimeError, 'bins tensor should have at least 1 element'): - t = make_tensor((0), device, dtype=dtype) + t = make_tensor((0), dtype=dtype, device=device) torch.histogram(t, t) with self.assertRaisesRegex(RuntimeError, 'bins must be > 0'): - values = make_tensor((), device, dtype=dtype) + values = make_tensor((), dtype=dtype, device=device) torch.histogram(values, -1) with self.assertRaisesRegex(RuntimeError, 'if weight tensor is provided it should have the same shape \ as the input tensor excluding its innermost 
dimension'): - values = make_tensor((2, 2), device, dtype=dtype) - weight = make_tensor((1), device, dtype=dtype) + values = make_tensor((2, 2), dtype=dtype, device=device) + weight = make_tensor((1), dtype=dtype, device=device) torch.histogram(values, 1, weight=weight) with self.assertRaisesRegex(TypeError, 'received an invalid combination of arguments'): - values = make_tensor((), device, dtype=dtype) - bin_edges = make_tensor((), device, dtype=dtype) + values = make_tensor((), dtype=dtype, device=device) + bin_edges = make_tensor((), dtype=dtype, device=device) torch.histogram(values, bin_edges, range=(0, 1)) with self.assertRaisesRegex(RuntimeError, 'min should not exceed max'): - values = make_tensor((), device, dtype=dtype) + values = make_tensor((), dtype=dtype, device=device) torch.histogram(values, 2, range=(1, 0)) with self.assertRaisesRegex(RuntimeError, r'range \[nan, nan\] is not finite'): @@ -3230,8 +3260,7 @@ def test_reduction_empty_any_all(self, device): shape = (2, 0, 4) x = torch.randn(shape, device=device) - for dtype in get_all_dtypes(include_half=True, include_bfloat16=False, - include_bool=True, include_complex=True): + for dtype in all_types_and_complex_and(torch.half, torch.bool): # Refer: [all, any uint8 compatibility] if dtype == torch.uint8: out_dtype = torch.uint8 diff --git a/test/test_scatter_gather_ops.py b/test/test_scatter_gather_ops.py index cd944da73667..d82cdca5534e 100644 --- a/test/test_scatter_gather_ops.py +++ b/test/test_scatter_gather_ops.py @@ -10,7 +10,9 @@ (run_tests, TestCase,) from torch.testing._internal.common_device_type import \ (instantiate_device_type_tests, dtypes, dtypesIfCUDA, - toleranceOverride, tol) + toleranceOverride, tol,) +from torch.testing._internal.common_dtype import \ + (get_all_dtypes, get_all_fp_dtypes,) # Protects against includes accidentally setting the default dtype assert torch.get_default_dtype() is torch.float32 @@ -22,13 +24,16 @@ class TestScatterGather(TestCase): # Fills an index tensor with valid indices - def _fill_indices(self, idx, dim, dim_size, elems_per_row, m, n, o): + def _fill_indices(self, idx, dim, dim_size, elems_per_row, m, n, o, unique_indices=True): for i in range(1 if dim == 0 else m): for j in range(1 if dim == 1 else n): for k in range(1 if dim == 2 else o): ii = [i, j, k] ii[dim] = slice(0, idx.size(dim) + 1) - idx[tuple(ii)] = torch.randperm(dim_size)[0:elems_per_row] + if unique_indices: + idx[tuple(ii)] = torch.randperm(dim_size)[0:elems_per_row] + else: + idx[tuple(ii)] = torch.randint(dim_size, (elems_per_row,)) @dtypes(torch.float32, torch.complex64) def test_gather(self, device, dtype): @@ -67,7 +72,8 @@ def test_gather_bool(self, device, dtype): expected = torch.tensor(((False, False), (True, True)), device=device, dtype=dtype) self.assertEqual(actual, expected, atol=0, rtol=0) - def _test_scatter_base(self, fn, *, device, dtype, is_scalar, reduction): + def _test_scatter_base(self, fn, *, device, dtype, is_scalar, reduction, + unique_indices=True, include_self=True): m, n, o = random.randint(10, 20), random.randint(10, 20), random.randint(10, 20) elems_per_row = random.randint(1, 10) dim = random.randrange(3) @@ -75,7 +81,7 @@ def _test_scatter_base(self, fn, *, device, dtype, is_scalar, reduction): idx_size = [m, n, o] idx_size[dim] = elems_per_row idx = torch.empty(tuple(idx_size), device=device, dtype=torch.long) - self._fill_indices(idx, dim, ([m, n, o])[dim], elems_per_row, m, n, o) + self._fill_indices(idx, dim, ([m, n, o])[dim], elems_per_row, m, n, o, unique_indices) if 
is_scalar: src = random.random() @@ -85,11 +91,15 @@ def _test_scatter_base(self, fn, *, device, dtype, is_scalar, reduction): base = make_tensor((m, n, o), device=device, dtype=dtype) if reduction is not None: - actual = fn(base.clone(), dim, idx, src, reduce=reduction) + if fn is torch.Tensor.scatter_reduce_: + actual = fn(base.clone(), dim, idx, src, reduce=reduction, include_self=include_self) + else: + actual = fn(base.clone(), dim, idx, src, reduce=reduction) else: actual = fn(base.clone(), dim, idx, src) expected = base.clone() + counts = torch.zeros(base.shape, dtype=torch.long, device=device) + include_self for i in range(idx_size[0]): for j in range(idx_size[1]): for k in range(idx_size[2]): @@ -98,16 +108,35 @@ def _test_scatter_base(self, fn, *, device, dtype, is_scalar, reduction): if fn is torch.Tensor.scatter_add_: expected[tuple(ii)] += src[i, j, k] else: - # method may be 'scatter_' or 'scatter' - # both might have a reduction argument + # method may be 'scatter_', 'scatter', 'scatter_reduce' + # or 'scatter_reduce_', the former two might have a reduction argument + # while the latter two always do value = src if is_scalar else src[i, j, k] - if reduction == "add": - expected[tuple(ii)] += value - elif reduction == "multiply": - expected[tuple(ii)] *= value - else: + if ((not include_self) and counts[tuple(ii)] == 0): expected[tuple(ii)] = value + else: + if reduction == "add" or reduction == "sum": + expected[tuple(ii)] += value + elif reduction == "multiply" or reduction == "prod": + expected[tuple(ii)] *= value + elif reduction == "amax": + expected[tuple(ii)] = max(expected[tuple(ii)], value) + elif reduction == "amin": + expected[tuple(ii)] = min(expected[tuple(ii)], value) + elif reduction == "mean": + expected[tuple(ii)] += value + else: + expected[tuple(ii)] = value + + counts[tuple(ii)] += 1 + + if (reduction == "mean"): + counts.masked_fill_(counts == 0, 1) + if (dtype.is_floating_point or dtype.is_complex): + expected /= counts + else: + expected.div_(counts, rounding_mode="floor") self.assertEqual(actual, expected, atol=0, rtol=0) @@ -158,6 +187,67 @@ def test_scatter_add_mult_index_base(self, device, dtype): self.assertEqual(res0[0, :], m * torch.ones(n, device=device, dtype=dtype), atol=0, rtol=0) self.assertEqual(res1[:, 0], n * torch.ones(m, device=device, dtype=dtype), atol=0, rtol=0) + # FIXME: discrepancy between bool ReduceAdd on CUDA and CPU (a + b on CPU and buggy a && b on CUDA) + @dtypes(*get_all_dtypes(include_half=True, include_bfloat16=True, include_bool=False)) + def test_scatter_reduce_sum(self, device, dtype): + for include_self in (True, False): + self._test_scatter_base(torch.Tensor.scatter_reduce_, device=device, dtype=dtype, + is_scalar=False, reduction='sum', unique_indices=False, + include_self=include_self) + + @dtypes(*get_all_dtypes(include_half=True, include_bfloat16=True)) + @dtypesIfCUDA(*get_all_fp_dtypes(include_half=True, include_bfloat16=True)) + def test_scatter_reduce_prod(self, device, dtype): + for include_self in (True, False): + self._test_scatter_base(torch.Tensor.scatter_reduce_, device=device, dtype=dtype, + is_scalar=False, reduction='prod', unique_indices=False, + include_self=include_self) + + @dtypes(*get_all_dtypes(include_half=True, include_bfloat16=True, include_bool=False)) + @dtypesIfCUDA(*get_all_fp_dtypes(include_half=True, include_bfloat16=True)) + def test_scatter_reduce_mean(self, device, dtype): + for include_self in (True, False): + self._test_scatter_base(torch.Tensor.scatter_reduce_, device=device, 
dtype=dtype, + is_scalar=False, reduction='mean', unique_indices=False, + include_self=include_self) + + @dtypes(*get_all_dtypes(include_half=True, include_bfloat16=True, include_complex=False)) + @dtypesIfCUDA(*get_all_fp_dtypes(include_half=True, include_bfloat16=True)) + def test_scatter_reduce_amax(self, device, dtype): + for include_self in (True, False): + self._test_scatter_base(torch.Tensor.scatter_reduce_, device=device, dtype=dtype, + is_scalar=False, reduction='amax', unique_indices=False, + include_self=include_self) + # simple test for nan/inf propagation + if (dtype.is_floating_point): + input = torch.zeros(3, device=device, dtype=dtype) + src = torch.tensor([1, float('nan'), -float('inf'), -float('inf'), 2, float('inf')], device=device, dtype=dtype) + idx = torch.tensor([0, 0, 1, 1, 2, 2], device=device) + input.scatter_reduce_(0, idx, src, 'amax', include_self=include_self) + expected_result = torch.tensor([float('nan'), -float('inf'), float('inf')], device=device, dtype=dtype) + if (include_self): + expected_result[1] = 0 + self.assertEqual(input, expected_result) + + + @dtypes(*get_all_dtypes(include_half=True, include_bfloat16=True, include_complex=False)) + @dtypesIfCUDA(*get_all_fp_dtypes(include_half=True, include_bfloat16=True)) + def test_scatter_reduce_amin(self, device, dtype): + for include_self in (True, False): + self._test_scatter_base(torch.Tensor.scatter_reduce_, device=device, dtype=dtype, + is_scalar=False, reduction='amin', unique_indices=False, + include_self=include_self) + # simple test for nan/inf propagation + if (dtype.is_floating_point): + input = torch.zeros(3, device=device, dtype=dtype) + src = torch.tensor([1, float('nan'), -2, -float('inf'), float('inf'), float('inf')], device=device, dtype=dtype) + idx = torch.tensor([0, 0, 1, 1, 2, 2], device=device) + input.scatter_reduce_(0, idx, src, 'amin', include_self=include_self) + expected_result = torch.tensor([float('nan'), -float('inf'), float('inf')], device=device, dtype=dtype) + if (include_self): + expected_result[2] = 0 + self.assertEqual(input, expected_result) + # Generic Device Test Framework instantation, see # https://github.com/pytorch/pytorch/wiki/Running-and-writing-tests diff --git a/test/test_serialization.py b/test/test_serialization.py index a4fa6e8c9ba5..2643b4bcad5c 100644 --- a/test/test_serialization.py +++ b/test/test_serialization.py @@ -20,10 +20,10 @@ from torch._utils import _rebuild_tensor from torch.serialization import check_module_version_greater_or_equal -from torch.testing._internal.common_utils import TestCase, IS_WINDOWS, \ - TEST_DILL, run_tests, download_file, BytesIOContext, TemporaryFileName +from torch.testing._internal.common_utils import TestCase, IS_WINDOWS, TEST_DILL, \ + run_tests, download_file, BytesIOContext, TemporaryFileName, parametrize, instantiate_parametrized_tests from torch.testing._internal.common_device_type import instantiate_device_type_tests -from torch.testing._internal.common_dtype import get_all_dtypes +from torch.testing._internal.common_dtype import all_types_and_complex_and # These tests were all copied from `test/test_torch.py` at some point, so see # the actual blame, see this revision @@ -97,7 +97,7 @@ def _test_serialization_assert(self, b, c): self.assertTrue(isinstance(c[1], torch.FloatTensor)) self.assertTrue(isinstance(c[2], torch.FloatTensor)) self.assertTrue(isinstance(c[3], torch.FloatTensor)) - self.assertTrue(isinstance(c[4], torch.storage.TypedStorage)) + self.assertTrue(isinstance(c[4], torch.storage._TypedStorage)) 
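The scatter_reduce_ coverage added above exercises both the reduce= and include_self= keywords; a small sketch of the semantics being tested, assuming a build where Tensor.scatter_reduce_ accepts both arguments:

import torch

base = torch.zeros(3)
src = torch.tensor([1., 2., 3., 4.])
idx = torch.tensor([0, 0, 1, 2])

# With include_self=False the existing zeros in `base` are ignored,
# so each output slot reduces only over its scattered sources.
out = base.clone().scatter_reduce_(0, idx, src, reduce='amax', include_self=False)
# out == tensor([2., 3., 4.])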
self.assertEqual(c[4].dtype, torch.float) c[0].fill_(10) self.assertEqual(c[0], c[2], atol=0, rtol=0) @@ -370,7 +370,7 @@ def test_serialization_backwards_compat(self): self.assertTrue(isinstance(c[1], torch.FloatTensor)) self.assertTrue(isinstance(c[2], torch.FloatTensor)) self.assertTrue(isinstance(c[3], torch.FloatTensor)) - self.assertTrue(isinstance(c[4], torch.storage.TypedStorage)) + self.assertTrue(isinstance(c[4], torch.storage._TypedStorage)) self.assertEqual(c[4].dtype, torch.float32) c[0].fill_(10) self.assertEqual(c[0], c[2], atol=0, rtol=0) @@ -414,7 +414,7 @@ def test_serialization_save_warnings(self): with warnings.catch_warnings(record=True) as warns: with tempfile.NamedTemporaryFile() as checkpoint: x = torch.save(torch.nn.Linear(2, 3), checkpoint) - self.assertEquals(len(warns), 0) + self.assertEqual(len(warns), 0) def test_serialization_map_location(self): test_file_path = download_file('https://download.pytorch.org/test_data/gpu_tensors.pt') @@ -616,11 +616,12 @@ def save_load_check(a, b): self.assertEqual(a, a_loaded) self.assertEqual(b, b_loaded) - for device, dtype in product(devices, get_all_dtypes()): + for device, dtype in product(devices, all_types_and_complex_and(torch.half, + torch.bfloat16, torch.bool)): a = torch.tensor([], dtype=dtype, device=device) - for other_dtype in get_all_dtypes(): - s = torch.TypedStorage( + for other_dtype in all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool): + s = torch._TypedStorage( wrap_storage=a.storage()._untyped(), dtype=other_dtype) save_load_check(a, s) @@ -652,7 +653,7 @@ def test_save_different_dtype_error(self): torch.save([a.storage(), a.imag.storage()], f) a = torch.randn(10, device=device) - s_bytes = torch.TypedStorage( + s_bytes = torch._TypedStorage( wrap_storage=a.storage()._untyped(), dtype=torch.uint8) @@ -726,7 +727,7 @@ def import_module(name, filename): loaded = torch.load(checkpoint) self.assertTrue(isinstance(loaded, module.Net)) if can_retrieve_source: - self.assertEquals(len(w), 0) + self.assertEqual(len(w), 0) # Replace the module with different source fname = get_file_path_2(os.path.dirname(os.path.dirname(torch.__file__)), 'torch', 'testing', @@ -737,7 +738,7 @@ def import_module(name, filename): loaded = torch.load(checkpoint) self.assertTrue(isinstance(loaded, module.Net)) if can_retrieve_source: - self.assertEquals(len(w), 1) + self.assertEqual(len(w), 1) self.assertTrue(w[0].category, 'SourceChangeWarning') def test_serialization_container(self): @@ -869,6 +870,9 @@ def __new__(cls, elem, *args, **kwargs): r.elem = elem return r + def clone(self): + return type(self)(self.elem.clone()) + class TestGetStateSubclass(torch.Tensor): elem: torch.Tensor @@ -944,8 +948,18 @@ def test_tensor_subclass_deepcopy(self): self.assertEqual(new_tensor.elem, my_tensor.elem) self.assertEqual(new_tensor.foo, foo_val) + @parametrize('requires_grad', (True, False)) + def test_cloned_deepcopy(self, requires_grad): + my_tensor = torch.rand(2, requires_grad=requires_grad, device='meta') + + new_tensor = deepcopy(my_tensor) + + self.assertEqual(new_tensor.requires_grad, my_tensor.requires_grad) + + instantiate_device_type_tests(TestBothSerialization, globals()) +instantiate_parametrized_tests(TestSubclassSerialization) if __name__ == '__main__': run_tests() diff --git a/test/test_shape_ops.py b/test/test_shape_ops.py index 0267852ceb6a..b6557eed0d25 100644 --- a/test/test_shape_ops.py +++ b/test/test_shape_ops.py @@ -15,7 +15,7 @@ from torch.testing._internal.common_device_type import ( 
instantiate_device_type_tests, onlyCPU, onlyCUDA, dtypes, onlyNativeDeviceTypes, dtypesIfCUDA, largeTensorTest) -from torch.testing._internal.common_dtype import get_all_dtypes +from torch.testing._internal.common_dtype import all_types_and_complex_and, all_types, all_types_and # TODO: replace with make_tensor def _generate_input(shape, dtype, device, with_extremal): @@ -227,12 +227,11 @@ def test_diagonal_multidim(self, device, dtype): self.assertEqual(expected, result) @onlyNativeDeviceTypes - @dtypes(*get_all_dtypes(include_complex=False, include_bool=False, include_half=False, - include_bfloat16=False)) - @dtypesIfCUDA(*get_all_dtypes(include_complex=False, include_bool=False, include_bfloat16=False)) + @dtypes(*all_types()) + @dtypesIfCUDA(*all_types_and(torch.half)) def test_trace(self, device, dtype): def test(shape): - tensor = make_tensor(shape, device, dtype, low=-9, high=9) + tensor = make_tensor(shape, dtype=dtype, device=device, low=-9, high=9) expected_dtype = tensor.sum().dtype expected_dtype = torch_to_numpy_dtype_dict[expected_dtype] @@ -341,7 +340,7 @@ def test_clamp_raises_arg_errors(self, device): with self.assertRaisesRegex(RuntimeError, error_msg): torch.clamp(X) - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_flip(self, device, dtype): make_from_data = partial(torch.tensor, device=device, dtype=dtype) make_from_size = partial(make_tensor, device=device, dtype=dtype) @@ -440,7 +439,7 @@ def gen_data(): for dims in test_dims: self.assertEqual(size, list(data.flip(dims).size())) - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_flip_errors(self, device, dtype): make_arg = partial(make_tensor, dtype=dtype, device=device) data = make_arg((2, 2, 2)) @@ -458,7 +457,7 @@ def test_flip_errors(self, device, dtype): def _rand_shape(self, dim, min_size, max_size): return tuple(torch.randint(min_size, max_size + 1, (dim,))) - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_flip_numpy(self, device, dtype): make_arg = partial(make_tensor, dtype=dtype, device=device) @@ -567,7 +566,7 @@ def test_nonzero_no_warning(self, device): t.nonzero() self.assertEqual(len(w), 0) - @dtypes(*get_all_dtypes(include_complex=False)) + @dtypes(*all_types_and(torch.half, torch.bool, torch.bfloat16)) def test_nonzero(self, device, dtype): shapes = [ diff --git a/test/test_sort_and_select.py b/test/test_sort_and_select.py index b44b09ffa1dc..19394c0809c8 100644 --- a/test/test_sort_and_select.py +++ b/test/test_sort_and_select.py @@ -8,14 +8,12 @@ from itertools import permutations, product from torch.testing import make_tensor -from torch.testing._internal.common_dtype import ( - all_types, all_types_and, floating_types_and, get_all_dtypes, get_all_int_dtypes, get_all_fp_dtypes, -) +from torch.testing._internal.common_dtype import all_types, all_types_and, floating_types_and from torch.testing._internal.common_utils import \ - (TEST_WITH_ROCM, TestCase, run_tests, slowTest) + (TestCase, run_tests, slowTest) from torch.testing._internal.common_device_type import \ (instantiate_device_type_tests, dtypes, onlyNativeDeviceTypes, - skipCUDAIfRocm, onlyCUDA, dtypesIfCUDA, dtypesIfCPU, onlyCPU, largeTensorTest) + onlyCUDA, dtypesIfCUDA, dtypesIfCPU, onlyCPU, largeTensorTest) # TODO: remove this SIZE = 100 @@ -132,11 +130,25 @@ def test_sort(self, device): self.assertIsOrdered('descending', x, res2val, res2ind, 
'random with NaNs') + @onlyCUDA + def test_sort_large_slice(self, device): + # tests direct cub path + x = torch.randn(4, 1024000, device=device) + res1val, res1ind = torch.sort(x, stable=True) + torch.cuda.synchronize() + # assertIsOrdered is too slow, so just compare to cpu + res1val_cpu, res1ind_cpu = torch.sort(x.cpu(), stable=True) + self.assertEqual(res1val, res1val_cpu.cuda()) + self.assertEqual(res1ind, res1ind_cpu.cuda()) + res1val, res1ind = torch.sort(x, descending=True, stable=True) + torch.cuda.synchronize() + res1val_cpu, res1ind_cpu = torch.sort(x.cpu(), descending=True, stable=True) + self.assertEqual(res1val, res1val_cpu.cuda()) + self.assertEqual(res1ind, res1ind_cpu.cuda()) + # FIXME: remove torch.bool from unsupported types once support is added for cub sort - @dtypes(*set(get_all_dtypes()) - {torch.bool, torch.complex64, torch.complex128}) + @dtypes(*all_types_and(torch.half, torch.bfloat16)) def test_stable_sort(self, device, dtype): - if TEST_WITH_ROCM and dtype == torch.bfloat16: - return sizes = (100, 1000, 10000) for ncopies in sizes: x = torch.tensor([0, 1] * ncopies, dtype=dtype, device=device) @@ -167,6 +179,23 @@ def test_sort_large(self, device, dtype): self.assertEqual(vm, torch.arange(255, dtype=dtype, device=device)) self.assertEqual(im, t0.sort().indices) + + @dtypes(torch.float32) + def test_sort_restride(self, device, dtype): + # Input: non-contiguous (stride: 5) 3-element array + tensor = torch.randn((3, 5), dtype=dtype, device=device)[:, 0] + # Outputs: 0-dim tensors + # They will need to be resized, which means they will also be + # restrided with the input tensor's strides as base. + values = torch.tensor(0, dtype=dtype, device=device) + indices = torch.tensor(0, dtype=torch.long, device=device) + torch.sort(tensor, out=(values, indices)) + # Check: outputs were restrided to dense strides + self.assertEqual(values.stride(), (1,)) + self.assertEqual(indices.stride(), (1,)) + # Check: 'tensor' indexed by 'indices' is equal to 'values' + self.assertEqual(tensor[indices], values) + def _test_sort_discontiguous(self, device, dtype): # on CUDA 2048 vs >2048 have different code path for the dim being sorted sizes = (5, 7, 2049) @@ -228,10 +257,8 @@ def test_topk_1d_output_discontiguous(self, device, dtype): self.assertEqual(values, values_cont) # FIXME: remove torch.bool from unsupported types once support is added for cub sort - @dtypes(*set(get_all_dtypes()) - {torch.bool, torch.complex64, torch.complex128}) + @dtypes(*all_types_and(torch.half, torch.bfloat16)) def test_stable_sort_against_numpy(self, device, dtype): - if TEST_WITH_ROCM and dtype == torch.bfloat16: - return if dtype in floating_types_and(torch.float16, torch.bfloat16): inf = float('inf') neg_inf = -float('inf') @@ -293,13 +320,10 @@ def repeated_index_fill(t, dim, idxs, vals): idx_numpy = np.argsort(sample_numpy, axis=dim, kind='stable') self.assertEqual(idx_torch, idx_numpy) - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) + @dtypes(*all_types_and(torch.half, torch.bfloat16)) def test_msort(self, device, dtype): - if TEST_WITH_ROCM and dtype == torch.bfloat16: - return - def test(shape): - tensor = make_tensor(shape, device, dtype, low=-9, high=9) + tensor = make_tensor(shape, dtype=dtype, device=device, low=-9, high=9) if tensor.size() != torch.Size([]): if dtype is torch.bfloat16: expected = torch.from_numpy(np.msort(tensor.float().cpu().numpy())).bfloat16() @@ -385,7 +409,6 @@ def test_topk_arguments(self, device): # Make sure True isn't mistakenly taken as the 2nd dimension 
(interpreted as 1) self.assertRaises(TypeError, lambda: q.topk(4, True)) - @skipCUDAIfRocm def test_unique_dim(self, device): self.assertFalse(hasattr(torch, 'unique_dim')) @@ -441,7 +464,7 @@ def run_test(device, dtype): device=device) expected_inverse_dim2 = torch.tensor([0, 1]) expected_counts_dim2 = torch.tensor([1, 1]) - expected_unique_empty = torch.tensor([], dtype=dtype, device=device) + expected_unique_empty = torch.empty(5, 0, dtype=dtype, device=device) expected_inverse_empty = torch.tensor([], dtype=torch.long, device=device) expected_counts_empty = torch.tensor([], dtype=torch.long, device=device) if dtype in floating_types_and(torch.float16, torch.bfloat16): @@ -685,7 +708,6 @@ def test_topk_integral(self, device, dtype): @onlyCUDA @dtypes(torch.bfloat16) - @skipCUDAIfRocm def test_topk_bfloat16(self, device, dtype): small = 10 @@ -694,12 +716,9 @@ def test_topk_bfloat16(self, device, dtype): for curr_size in (small, large, verylarge): self._test_topk_dtype(device, dtype, False, curr_size) - @dtypesIfCUDA(*get_all_fp_dtypes()) + @dtypesIfCUDA(*floating_types_and(torch.half, torch.bfloat16)) @dtypes(torch.float, torch.double, torch.bfloat16) def test_topk_nonfinite(self, device, dtype): - if TEST_WITH_ROCM and dtype == torch.bfloat16: - return - x = torch.tensor([float('nan'), float('inf'), 1e4, 0, -1e4, -float('inf')], device=device, dtype=dtype) val, idx = x.topk(4) expect = torch.tensor([float('nan'), float('inf'), 1e4, 0], device=device, dtype=dtype) @@ -728,15 +747,9 @@ def test_topk_4d(self, device): self.assertEqual(ind, expected_ind, atol=0, rtol=0) @onlyNativeDeviceTypes - @dtypesIfCUDA(*(get_all_dtypes(include_complex=False, - include_bool=False, - include_half=False, - include_bfloat16=True))) - @dtypes(*(get_all_dtypes(include_complex=False, include_bool=False, include_half=False, include_bfloat16=False))) + @dtypesIfCUDA(*all_types_and(torch.bfloat16)) + @dtypes(*all_types()) def test_topk_zero(self, device, dtype): - if TEST_WITH_ROCM and dtype == torch.bfloat16: - return - # https://github.com/pytorch/pytorch/issues/49205 t = torch.rand(2, 2, device=device).to(dtype=dtype) val, idx = torch.topk(t, k=0, largest=False) @@ -789,12 +802,9 @@ def ensure_tuple(x): self.assertEqual(expected_inverse.view(additional_shape), y_inverse) self.assertEqual(expected_counts, y_counts) - @dtypesIfCPU(*set(get_all_dtypes()) - {torch.complex64, torch.complex128}) - @dtypes(*set(get_all_dtypes()) - {torch.bfloat16, torch.complex64, torch.complex128}) + @dtypesIfCPU(*all_types_and(torch.bool, torch.bfloat16)) + @dtypes(*all_types_and(torch.half, torch.bool)) def test_unique(self, device, dtype): - if dtype is torch.half and self.device_type == 'cpu': - return # CPU does not have half support - def ensure_tuple(x): if isinstance(x, torch.Tensor): return (x,) @@ -849,12 +859,9 @@ def ensure_tuple(x): count += 1 self.assertEqual(j, count) - @dtypesIfCPU(*set(get_all_dtypes()) - {torch.complex64, torch.complex128}) - @dtypes(*set(get_all_dtypes()) - {torch.bfloat16, torch.complex64, torch.complex128}) + @dtypesIfCPU(*all_types_and(torch.bool, torch.bfloat16)) + @dtypes(*all_types_and(torch.half, torch.bool)) def test_unique_consecutive(self, device, dtype): - if dtype is torch.half and self.device_type == 'cpu': - return # CPU does not have half support - if dtype is torch.bool: x = torch.tensor([True, False, False, False, True, True, False, False, False], dtype=torch.bool, device=device) expected_unique = torch.tensor([True, False, True, False], dtype=torch.bool, device=device) diff 
--git a/test/test_sparse.py b/test/test_sparse.py index cbc98f572bd8..07a8fd2a03de 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -7,9 +7,6 @@ import random import unittest from torch.testing import make_tensor -from torch.testing._internal.common_dtype import ( - all_types_and_complex, -) from torch.testing._internal.common_utils import TestCase, run_tests, skipIfRocm, do_test_dtypes, \ do_test_empty_full, load_tests, TEST_NUMPY, IS_WINDOWS, gradcheck, coalescedonoff, \ DeterministicGuard, first_sample @@ -17,16 +14,16 @@ from numbers import Number from typing import Dict, Any from distutils.version import LooseVersion -from torch.testing import get_all_complex_dtypes, get_all_fp_dtypes from torch.testing._internal.common_cuda import \ (SM53OrLater, SM80OrLater, CUDA11OrLater) from torch.testing._internal.common_device_type import \ (instantiate_device_type_tests, ops, dtypes, dtypesIfCUDA, onlyCPU, onlyCUDA, precisionOverride, deviceCountAtLeast, OpDTypes) from torch.testing._internal.common_methods_invocations import \ - (sparse_unary_ufuncs) + (sparse_unary_ufuncs, sparse_masked_reduction_ops) from torch.testing._internal.common_dtype import ( - floating_and_complex_types, floating_and_complex_types_and, get_all_dtypes, get_all_int_dtypes, + all_types, all_types_and_complex, all_types_and_complex_and, floating_and_complex_types, + floating_and_complex_types_and, integral_types, floating_types_and, ) # load_tests from torch.testing._internal.common_utils is used to automatically filter tests for @@ -43,6 +40,8 @@ class TestSparse(TestCase): def setUp(self): + TestCase.setUp(self) + self.index_tensor = lambda *args, **kwargs: torch.tensor(*args, **kwargs, dtype=torch.int64) def sparse_empty_factory(*args, **kwargs): @@ -158,7 +157,7 @@ def test_shape(sparse_dims, nnz, with_size): self.assertEqual(i, x._indices()) self.assertEqual(v, x._values()) self.assertEqual(x.ndimension(), len(with_size)) - self.assertEqual(x.coalesce()._nnz(), nnz) + self.assertEqual(x.coalesce()._nnz(), nnz if x.is_coalesced() else nnz // 2) self.assertEqual(list(x.size()), with_size) # Test .indices() and .values() @@ -188,7 +187,8 @@ def test_shape(sparse_dims, nnz, with_size): self.assertEqual(x._values().numel(), 0) @coalescedonoff - @dtypes(torch.double, torch.cdouble) + @dtypes(torch.double, torch.cdouble, torch.bfloat16) + @precisionOverride({torch.bfloat16: 1e-2}) def test_coalesce(self, device, dtype, coalesced): def _test_coalesce(t): @@ -299,24 +299,22 @@ def test_ctor_size_checks(self, device, dtype): RuntimeError, lambda: self.sparse_tensor(indices, values, torch.Size([2, 4, 2, 1]))) - @dtypes(*floating_and_complex_types_and(torch.float16)) + @dtypes(*floating_and_complex_types_and(torch.float16, torch.bfloat16)) def test_to_dense(self, device, dtype): def test_tensor(x, res): x.to_dense() # Tests triple to_dense for memory corruption x.to_dense() x.to_dense() - # We dont have to_dense for half types, so we don't request - # exact_dtype if res.type is torch.float16. 
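The test_to_dense changes above extend coverage to float16 and bfloat16 and drop the old exact_dtype workaround; a minimal sketch of the dense/sparse round trip being exercised, assuming CPU float32:

import torch

i = torch.tensor([[0, 1, 1],
                  [2, 0, 2]])
v = torch.tensor([3., 4., 5.])
s = torch.sparse_coo_tensor(i, v, (2, 3))

d = s.to_dense()                                   # dense view of the COO tensor
assert torch.equal(d.to_sparse().to_dense(), d)    # round trip is lossless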
dense_x = x.to_dense() safe_dense_x = self.safeToDense(x) - if (res.dtype == torch.float16): - exact_dtype = False - else: - exact_dtype = True - dense_x = dense_x.to(res.dtype) - safe_dense_x = safe_dense_x.to(res.dtype) - self.assertEqual(res, dense_x, exact_dtype=exact_dtype) - self.assertEqual(res, safe_dense_x, exact_dtype=exact_dtype) + dense_x = dense_x.to(res.dtype) + safe_dense_x = safe_dense_x.to(res.dtype) + self.assertEqual(res, dense_x) + self.assertEqual(res, safe_dense_x) + + # Only run autograd test for float64 + if x.dtype != torch.float64: + return def fn(x): return x.to_dense() @@ -349,6 +347,7 @@ def fn(x): ], dtype=dtype, device=device) test_tensor(x, res) + test_tensor(res, res) i = self.index_tensor([ [0, 1, 2, 2], @@ -360,16 +359,8 @@ def fn(x): res = torch.empty((3, 4, 5, 0), dtype=dtype, device=device) test_tensor(x, res) - # half tensors on cpu don't implement to_dense, so need to convert to float - def _to_dense_half_safe(self, tensor): - if(tensor.dtype == torch.half and tensor.device.type == 'cpu'): - return tensor.to(torch.float).to_dense().to(torch.half) - else: - return tensor.to_dense() - @coalescedonoff - @skipIfRocm - @dtypes(torch.float16, torch.float64, torch.int, torch.cfloat, torch.cdouble) + @dtypes(torch.float16, torch.bfloat16, torch.float64, torch.int, torch.cfloat, torch.cdouble) def test_to_sparse(self, device, dtype, coalesced): shape = [5, 2, 10, 4] max_nnz = 1 @@ -382,9 +373,9 @@ def test_to_sparse(self, device, dtype, coalesced): coalesced=coalesced) expected = expected.to(dtype) - d = self._to_dense_half_safe(expected) + d = expected.to_dense() result = d.to_sparse(dim) - self.assertEqual(d, self._to_dense_half_safe(result)) # == not implemented for sparse tensors yet + self.assertEqual(d, result.to_dense()) self.assertEqual(expected.size(), result.size()) self.assertEqual(dim, result.sparse_dim()) @@ -416,7 +407,7 @@ def test_scalar(self, device, dtype): a_coalesced = a.coalesce() self.assertTrue(a_coalesced.is_coalesced()) self.assertEqual(torch.tensor(12.3 * 2, dtype=dtype, device=device), a.to_dense()) - self.assertEqual(a, a.to_dense().to_sparse()) + self.assertEqual(a.coalesce(), a.coalesce().to_dense().to_sparse()) # tensor without value a = self.sparse_empty((), dtype=dtype, device=device) @@ -675,7 +666,8 @@ def test_shape(sparse_dims, nnz, with_size): test_shape(3, 0, [0, 0, 100, 5, 5, 5, 0]) @coalescedonoff - @dtypes(torch.double, torch.cdouble) + @dtypes(torch.double, torch.cdouble, torch.bfloat16) + @precisionOverride({torch.bfloat16: 2e-2}) def test_Sparse_to_Sparse_copy_(self, device, dtype, coalesced): # This is for testing torch.copy_(SparseTensor, SparseTensor) sparse_dims = 3 @@ -1007,6 +999,105 @@ def test_shape(sparse_dims, nnz, sizes, select_dim, select_index, fail_message=N test_shape(len(sizes) // 2, 10, sizes, d, index) test_shape(len(sizes), 10, sizes, d, index) + def _test_index_select_exhaustive_index(self, sizes, dims, device, dtype, coalesced): + t = make_tensor(sizes, dtype=dtype, device=device) + t_sparse = t.to_sparse().coalesce() if coalesced else t.to_sparse() + t_small_sparse, _, _ = self._gen_sparse(len(sizes), 2, sizes, dtype, device, coalesced) + t_small = t_small_sparse.to_dense() + for d in dims: + # NOTE: indices are negative + idx_dim_d_range = list(range(-sizes[d], 0)) + for idx_len in range(sizes[d], sizes[d] + 1): + # creates all possible valid indices into dim d of lenght idx_len + for idx in itertools.product(*itertools.repeat(idx_dim_d_range, idx_len)): + t_idx = torch.tensor(idx, 
dtype=torch.long, device=device) + + # NOTE: index_select for dense does not support negative indices, + # hence + sizes[d]. See https://github.com/pytorch/pytorch/issues/76347 + + # tests the nnz > sizes[d] branch + dense_result = t.index_select(d, t_idx + sizes[d]) + sparse_result = t_sparse.index_select(d, t_idx) + self.assertEqual(dense_result, sparse_result) + + # tests the nnz <= sizes[d] branch + small_dense_result = t_small.index_select(d, t_idx + sizes[d]) + small_sparse_result = t_small_sparse.index_select(d, t_idx) + self.assertEqual(small_dense_result, small_sparse_result) + + @coalescedonoff + @dtypes(torch.double, torch.cdouble) + def test_index_select_exhaustive_index_small(self, device, dtype, coalesced): + # will trigger brute-force algo + self._test_index_select_exhaustive_index((3, 3, 4), range(3), device, dtype, coalesced) + + @coalescedonoff + @dtypes(torch.double, torch.cdouble) + def test_index_select_exhaustive_index_large(self, device, dtype, coalesced): + # will trigger more sophisticated algos + self._test_index_select_exhaustive_index((100, 50, 3, 3), (2, 3), device, dtype, coalesced) + + @coalescedonoff + @dtypes(torch.double, torch.cdouble) + def test_index_select_empty_and_non_contiguous_index(self, device, dtype, coalesced): + # empty index + idx_empty = torch.tensor([], dtype=torch.long, device=device) + t = make_tensor((5, 5), dtype=dtype, device=device) + res_dense = t.index_select(0, idx_empty) + res_sparse = t.to_sparse().index_select(0, idx_empty) + self.assertEqual(res_dense, res_sparse) + + # non-contigous index + idx = torch.randint(low=0, high=5, size=(10, 2), device=device)[:, 0] + + def run_test(sizes): + # case nnz > size[d] + t = make_tensor(sizes, dtype=dtype, device=device) + res_dense = t.index_select(0, idx) + res_sparse = t.to_sparse().index_select(0, idx) + self.assertEqual(res_dense, res_sparse) + + # case nnz <= size[d] + t_small_sparse, _, _ = self._gen_sparse(len(sizes), 2, sizes, dtype, device, coalesced) + res_sparse = t_small_sparse.index_select(0, idx) + res_dense = t_small_sparse.to_dense().index_select(0, idx) + self.assertEqual(res_dense, res_sparse) + + # brute-force + run_test((10, 10)) + # more sophisticated algos + run_test((10, 100, 100)) + + @coalescedonoff + @dtypes(torch.double, torch.cdouble) + def test_index_select_parallelization(self, device, dtype, coalesced): + """ + Test with sizes that will trigger parallelization (i.e. with sizes + that are >= at::internal::GRAIN_SIZE) + """ + def run_test(nnz, size): + t_sparse, _, _ = self._gen_sparse(1, nnz, (size,), dtype, device, coalesced) + t_dense = t_sparse.to_dense() + + # idx_small to (sort) and (binary) search into t_sparse + idx_small = torch.randint(size, (nnz // 2,), device=device) + # idx_large to (sort) and (binary) search into idx_large + # NOTE: when coalesced=True, the (binary) search will be + # done over t_sparse anyway, as it is already sorted. 
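The new index_select tests above validate the sparse path against the dense one (offsetting indices because dense index_select rejects negative indices); a short sketch of that comparison, assuming a small 2-D float input:

import torch

t = torch.randn(4, 5)
idx = torch.tensor([0, 3, 3, 1])      # duplicate indices are allowed

dense_result = t.index_select(0, idx)
sparse_result = t.to_sparse().index_select(0, idx)
assert torch.equal(dense_result, sparse_result.to_dense())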
+ idx_large = torch.randint(size, (nnz * 2,), device=device) + for idx in (idx_small, idx_large): + res_dense = t_dense.index_select(0, idx) + res_sparse = t_sparse.index_select(0, idx) + self.assertEqual(res_dense, res_sparse) + + # NOTE: GRAIN_SIZE = 32768 + # case nnz <= size[d] + tlen = 70000 # > 2 * GRAIN_SIZE + run_test(tlen, tlen) + + # case nnz > size[d] + run_test(tlen, tlen // 2) + @onlyCPU @coalescedonoff @dtypes(torch.double, torch.cdouble) @@ -1252,7 +1343,8 @@ def test_shape(di, dj, dk, nnz): self.assertEqual(self.safeToDense(res), self.safeToDense(true_result)) @coalescedonoff - @dtypes(torch.double, torch.cdouble) + @unittest.skip("See https://github.com/pytorch/pytorch/issues/73145") + @dtypes(torch.double, torch.cdouble, torch.bfloat16) def test_sparse_addmm(self, device, dtype, coalesced): def test_shape(m, n, p, nnz, broadcast, alpha_beta=None): if alpha_beta is None: @@ -1598,7 +1690,6 @@ def _test_basic_ops_shape(self, nnz_x1, nnz_x2, shape_i, shape_v, dtype, device, z = x1.coalesce() self.assertEqual(x1.is_coalesced(), coalesced) self.assertTrue(y.is_coalesced()) - self.assertEqual(x1, y) y._values().add_(1) if not x1.is_coalesced(): # check that coalesce is out of place if the original tensor is not @@ -1698,7 +1789,7 @@ def _test_sparse_mask_fixed(): exp_v = torch.tensor([7, 14, 3, 20], dtype=dtype, device=device) res = dense.sparse_mask(x) expected = self.sparse_tensor(i, exp_v, torch.Size([5, 4]), dtype=dtype, device=device) - self.assertEqual(res, expected) + self.assertEqual(res.coalesce(), expected.coalesce()) i = self.index_tensor([ [1, 3, 0, 4], @@ -1710,7 +1801,7 @@ def _test_sparse_mask_fixed(): exp_v = torch.empty([4, 0], dtype=dtype, device=device) res = dense.sparse_mask(x) expected = self.sparse_tensor(i, exp_v, torch.Size([5, 4, 0]), dtype=dtype, device=device) - self.assertEqual(res, expected) + self.assertEqual(res.coalesce(), expected.coalesce()) _test_sparse_mask_fixed() @@ -1746,7 +1837,7 @@ def _test_sparse_mask_hybrid_fixed(): res = dense.sparse_mask(x) exp_v = torch.tensor([[7, 9], [14, 1], [3, 3], [20, 1]]) expected = self.sparse_tensor(i, exp_v, torch.Size([5, 4, 2])) - self.assertEqual(res, expected) + self.assertEqual(res.coalesce(), expected.coalesce()) i = self.index_tensor([ [1, 3, 0, 4], @@ -1758,7 +1849,7 @@ def _test_sparse_mask_hybrid_fixed(): res = dense.sparse_mask(x) exp_v = torch.empty(4, 2, 0) expected = self.sparse_tensor(i, exp_v, torch.Size([5, 4, 2, 0])) - self.assertEqual(res, expected) + self.assertEqual(res.coalesce(), expected.coalesce()) _test_sparse_mask_hybrid_fixed() @@ -1960,7 +2051,7 @@ def test_narrow(self, device, dtype, coalesced): def _test_log1p_tensor(self, sparse_tensor, coalesced): def is_integral(dtype): - return dtype in get_all_int_dtypes() + return dtype in integral_types() dense_tensor = sparse_tensor.to_dense() expected_output = dense_tensor.log1p() @@ -1991,8 +2082,7 @@ def is_integral(dtype): sparse_tensor.requires_grad_() @coalescedonoff - @dtypes(*get_all_dtypes(include_bool=False, include_half=False, - include_bfloat16=False, include_complex=False)) + @dtypes(*all_types()) def test_log1p(self, device, dtype, coalesced): if coalesced: input_coalesced = torch.sparse_coo_tensor( @@ -2100,7 +2190,7 @@ def test_neg_negative(self, device, dtype, coalesced): def _test_asin_arcsin(self, sparse_tensor, coalesced): def is_integral(dtype): - return dtype in get_all_int_dtypes() + return dtype in integral_types() is_integral_dtype = is_integral(sparse_tensor.dtype) dense_tensor = sparse_tensor.to_dense() @@ 
-2135,8 +2225,7 @@ def is_integral(dtype): op(sparse_tensor) @coalescedonoff - @dtypes(*get_all_dtypes(include_bool=False, include_half=False, - include_bfloat16=False, include_complex=False)) + @dtypes(*all_types()) def test_asin_arcsin(self, device, dtype, coalesced): if coalesced: input_coalesced = torch.sparse_coo_tensor( @@ -2623,14 +2712,14 @@ def test_legacy_new(self, device): @onlyCPU # not really, but we only really want to run this once def test_dtypes(self, device): - all_sparse_dtypes = get_all_dtypes(include_complex=True) + all_sparse_dtypes = all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16) do_test_dtypes(self, all_sparse_dtypes, torch.sparse_coo, torch.device('cpu')) if torch.cuda.is_available(): do_test_dtypes(self, all_sparse_dtypes, torch.sparse_coo, torch.device('cuda:0')) @onlyCPU # not really, but we only really want to run this once def test_empty_full(self, device): - all_sparse_dtypes = get_all_dtypes(include_complex=True) + all_sparse_dtypes = all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16) do_test_empty_full(self, all_sparse_dtypes, torch.sparse_coo, torch.device('cpu')) if torch.cuda.device_count() > 0: do_test_empty_full(self, all_sparse_dtypes, torch.sparse_coo, None) @@ -2887,11 +2976,11 @@ def test_any(self, device): self.assertEqual(torch.any(t), t_any) def test_isnan(self, device): - t = torch.sparse_coo_tensor(torch.tensor(([0, 0], [2, 0])), torch.tensor([1, 4]), device=device) - t_nan = torch.sparse_coo_tensor(torch.tensor(([0, 0], [2, 0])), torch.tensor([False, False]), device=device) + t = torch.sparse_coo_tensor(torch.tensor(([0, 0], [0, 2])), torch.tensor([1, 4]), device=device) + t_nan = torch.sparse_coo_tensor(torch.tensor(([0, 0], [0, 2])), torch.tensor([False, False]), device=device) self.assertEqual(torch.isnan(t).int(), t_nan.int()) - t = torch.sparse_coo_tensor(torch.tensor(([0, 0], [2, 0])), torch.tensor([1, float("nan")]), device=device) - t_nan = torch.sparse_coo_tensor(torch.tensor(([0, 0], [2, 0])), torch.tensor([False, True]), device=device) + t = torch.sparse_coo_tensor(torch.tensor(([0, 0], [0, 2])), torch.tensor([1, float("nan")]), device=device) + t_nan = torch.sparse_coo_tensor(torch.tensor(([0, 0], [0, 2])), torch.tensor([False, True]), device=device) self.assertEqual(torch.isnan(t).int(), t_nan.int()) @coalescedonoff @@ -3227,13 +3316,11 @@ def sparse_log(x): # TODO: Check after why ROCm's cusparseXcsrgemm2Nnz function doesn't return the same nnz value as CUDA @skipIfRocm @coalescedonoff - @dtypes(*get_all_complex_dtypes(), - *get_all_fp_dtypes(include_half=False, include_bfloat16=False)) - @dtypesIfCUDA(*((torch.complex64,) if CUDA11OrLater else ()), - *((torch.complex128,) if CUSPARSE_SPMM_COMPLEX128_SUPPORTED else ()), - *get_all_fp_dtypes( - include_half=(CUDA11OrLater and SM53OrLater), - include_bfloat16=(CUDA11OrLater and SM80OrLater))) + @dtypes(*floating_and_complex_types()) + @dtypesIfCUDA(*floating_types_and(*[torch.half] if CUDA11OrLater and SM53OrLater else [], + *[torch.bfloat16] if CUDA11OrLater and SM80OrLater else [], + *[torch.complex64] if CUDA11OrLater else [], + *[torch.complex128] if CUSPARSE_SPMM_COMPLEX128_SUPPORTED else [])) @precisionOverride({torch.bfloat16: 1e-2, torch.float16: 1e-2, torch.complex64: 1e-2, torch.float32: 1e-2}) def test_sparse_matmul(self, device, dtype, coalesced): """ @@ -3372,7 +3459,7 @@ def can_broadcast(s0, s1): (), (1,), (2,), (1, 1), (3, 1), (3, 2), (4, 1, 1), (4, 3, 2) ) for s0, s1 in itertools.combinations(sizes, r=2): - t = make_tensor(s0, 
device, dtype, low=-9, high=9) + t = make_tensor(s0, dtype=dtype, device=device, low=-9, high=9) for sparse_dims in range(1, len(s0) + 1): s = t.to_sparse(sparse_dims) if can_broadcast(s0, s1): @@ -3410,21 +3497,21 @@ class TestSparseOneOff(TestCase): def test_cuda_from_cpu(self): with self.assertRaisesRegex( RuntimeError, - "backend of indices \\(CUDA\\) must match backend of values \\(CPU\\)"): + "Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!"): torch.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), torch.randn(4, 4, 4), [3, 4, 4]) with self.assertRaisesRegex( RuntimeError, - "backend of indices \\(CUDA\\) must match backend of values \\(CPU\\)"): + "Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!"): torch.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), torch.randn(4, 4, 4, 0), [3, 4, 4, 0]) with self.assertRaisesRegex( RuntimeError, - "backend of indices \\(CUDA\\) must match backend of values \\(CPU\\)"): + "Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!"): torch.sparse.FloatTensor(torch.LongTensor(1, 0).cuda(), torch.randn(0, 4, 4, 0), [0, 4, 4, 0]) @@ -3555,9 +3642,55 @@ def fn(x): fast_mode=op.gradcheck_fast_mode)) +class TestSparseMaskedReductions(TestCase): + exact_dtype = True + + @ops(sparse_masked_reduction_ops) + def test_future_empty_dim(self, device, dtype, op): + """Currently, `dim=()` in reductions operations means "reduce over + all dimensions" while in future, it will read "no reduce". See + https://github.com/pytorch/pytorch/issues/29137 + + For sparse masked reductions, we'll implement the current behavior. + + For testing, we'll use samples with `dim=0` and map it to + `dim=()` until + torch.testing._internal.common_methods_invocations._generate_reduction_kwargs + is made to generate samples with `dim=()` for non-scalar + inputs. With this and after gh-29137 is resolved, this test + can be deleted. See also `torch._masked._canonical_dim` + implementation about changing the `dim=()` behavior. + """ + + samples = op.sample_inputs_func(op, device, dtype, requires_grad=False) + op_name = op.name.replace('_masked.', '') + for sample_input in samples: + if sample_input.kwargs.get('dim') != 0: + continue + sample_input_kwargs = dict(sample_input.kwargs) + sample_input_kwargs['dim'] = () # reduce over all dimensions + + t = sample_input.input + mask = sample_input_kwargs.get('mask') + if mask is None and op_name in {'prod', 'amax', 'amin'}: + # FIXME: for now reductions with non-zero reduction identity and + # unspecified mask are not supported for sparse COO + # tensors, see torch._masked.prod implementation + # for details. 
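Throughout this file, sparse results are checked against their dense counterparts; for instance, a sketch of the sparse-times-sparse matmul path whose dtype list is adjusted above, assuming CPU float32:

import torch

a = torch.randn(3, 4).relu().to_sparse()   # relu keeps the operands genuinely sparse
b = torch.randn(4, 2).relu().to_sparse()

c = torch.sparse.mm(a, b)                  # sparse x sparse -> sparse
assert torch.allclose(c.to_dense(), a.to_dense() @ b.to_dense())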
+ continue + sparse_op_kwargs = dict(sample_input_kwargs) + actual = op(t.to_sparse(), *sample_input.args, **sample_input_kwargs) + self.assertEqual(actual.layout, torch.sparse_coo) + + expected = op(t, *sample_input.args, **sample_input_kwargs).to_sparse() + self.assertEqual(actual, expected) + + # e.g., TestSparseUnaryUfuncsCPU and TestSparseUnaryUfuncsCUDA instantiate_device_type_tests(TestSparseUnaryUfuncs, globals(), except_for='meta') +instantiate_device_type_tests(TestSparseMaskedReductions, globals(), except_for='meta') + # e.g., TestSparseCPU and TestSparseCUDA instantiate_device_type_tests(TestSparse, globals(), except_for='meta') diff --git a/test/test_sparse_csr.py b/test/test_sparse_csr.py index 6c9961a6d1fe..cccc2bbc3b47 100644 --- a/test/test_sparse_csr.py +++ b/test/test_sparse_csr.py @@ -4,17 +4,21 @@ import random import itertools import unittest -from torch.testing import get_all_complex_dtypes, get_all_fp_dtypes, floating_and_complex_types, make_tensor +from torch.testing import make_tensor from torch.testing._internal.common_cuda import SM53OrLater, SM80OrLater, TEST_CUSPARSE_GENERIC from torch.testing._internal.common_utils import \ - (TEST_WITH_ROCM, TEST_SCIPY, TestCase, run_tests, load_tests, coalescedonoff) + (TEST_WITH_ROCM, TEST_SCIPY, TEST_MKL, IS_WINDOWS, TestCase, run_tests, load_tests, coalescedonoff, parametrize, + subtest) from torch.testing._internal.common_device_type import \ - (ops, instantiate_device_type_tests, dtypes, dtypesIfCUDA, onlyCPU, onlyCUDA, skipCUDAIfNoCusparseGeneric, + (ops, instantiate_device_type_tests, dtypes, OpDTypes, dtypesIfCUDA, onlyCPU, onlyCUDA, skipCUDAIfNoCusparseGeneric, precisionOverride, skipMeta, skipCUDAIf, skipCUDAIfRocm, skipCPUIfNoMklSparse) from torch.testing._internal.common_methods_invocations import \ - (op_db, sparse_csr_unary_ufuncs, ) + (op_db, sparse_csr_unary_ufuncs, ReductionOpInfo) from torch.testing._internal.common_cuda import _get_torch_cuda_version, CUDA11OrLater -from torch.testing._internal.common_dtype import floating_types, get_all_dtypes +from torch.testing._internal.common_dtype import ( + floating_types, all_types_and_complex_and, floating_and_complex_types, floating_types_and, + all_types_and_complex, floating_and_complex_types_and +) from test_sparse import CUSPARSE_SPMM_COMPLEX128_SUPPORTED if TEST_SCIPY: @@ -24,6 +28,8 @@ # sharding on sandcastle. 
This line silences flake warnings load_tests = load_tests +no_mkl_sparse = IS_WINDOWS or not TEST_MKL + def _check_cusparse_triangular_solve_available(): version = _get_torch_cuda_version() # cusparseSpSM was added in 11.3.1 but we don't have access to patch version @@ -43,10 +49,30 @@ def _check_cusparse_sddmm_available(): return version >= min_supported_version _sparse_csr_ops = list(filter(lambda op: op.supports_sparse_csr, op_db)) +binary_functions_with_dense_output = ['mm', 'mv', ] +binary_ops_with_dense_output = list(filter(lambda op: op.name in binary_functions_with_dense_output, op_db)) + +UNARY_EWISE_CSR_ALLOW_AUTOGRAD = [ + 'abs', + 'conj_physical', + 'neg', +] # This should be just an import from test_linalg instead of code duplication # but https://github.com/pytorch/pytorch/pull/63511#discussion_r733989701 -def _test_addmm_addmv(test_case, f, t, m, v, *, alpha=None, beta=None, transpose_out=False, layout=torch.strided, all_sparse=False): +def _test_addmm_addmv( + test_case, + f, + t, + m, + v, + *, + alpha=None, + beta=None, + transpose_out=False, + layout=torch.strided, + mode=None +): """ Unified test for checking `f(t, m, v, alpha=alpha, beta=beta)` computation, where f is `torch.addmv` or `torch.addmm`. @@ -72,9 +98,11 @@ def convert_layout(mat): assert mat.layout == layout return mat - if all_sparse: + if mode == "all_sparse": res1 = f(*map(convert_layout, (t, m, v)), alpha=alpha, beta=beta) res1 = res1.to_dense() + elif mode == "dense_result": + res1 = f(t, convert_layout(m), convert_layout(v), alpha=alpha, beta=beta) else: res1 = f(t, convert_layout(m), v, alpha=alpha, beta=beta) res2 = torch.full_like(res1, float('nan')) @@ -110,186 +138,398 @@ def test_make_crow_indices(self): self.assertLessEqual(counts.max(), n_cols) -class TestSparseCSR(TestCase): +def all_sparse_compressed_layouts(test_name='layout'): + return parametrize(test_name, [ + subtest(torch.sparse_csr, name='SparseCSR'), + subtest(torch.sparse_csc, name='SparseCSC'), + subtest(torch.sparse_bsr, name='SparseBSR'), + subtest(torch.sparse_bsc, name='SparseBSC')]) - @onlyCPU - def test_csr_layout(self): - self.assertEqual(str(torch.sparse_csr), 'torch.sparse_csr') - self.assertEqual(type(torch.sparse_csr), torch.layout) - @dtypes(*get_all_dtypes()) - def test_sparse_csr_constructor_shape_inference(self, device, dtype): - crow_indices = [0, 2, 4] - col_indices = [0, 1, 0, 1] - values = [1, 2, 3, 4] - sparse = torch.sparse_csr_tensor(torch.tensor(crow_indices, dtype=torch.int64), - torch.tensor(col_indices, dtype=torch.int64), - torch.tensor(values), dtype=dtype, device=device) - self.assertEqual(torch.tensor(crow_indices, dtype=torch.int64), sparse.crow_indices()) - self.assertEqual((len(crow_indices) - 1, max(col_indices) + 1), sparse.shape) - self.assertEqual(dtype, sparse.dtype) - self.assertEqual(torch.device(device), sparse.device) - - @dtypes(*get_all_dtypes()) - def test_sparse_csr_constructor(self, device, dtype): - crow_indices = [0, 2, 4] - col_indices = [0, 1, 0, 1] - values = [1, 2, 3, 4] - for index_dtype in [torch.int32, torch.int64]: - sparse = torch.sparse_csr_tensor(torch.tensor(crow_indices, dtype=index_dtype), - torch.tensor(col_indices, dtype=index_dtype), - torch.tensor(values), - size=(2, 10), - dtype=dtype, - device=device) - self.assertEqual((2, 10), sparse.shape) - self.assertEqual(torch.tensor(crow_indices, dtype=index_dtype), sparse.crow_indices()) - self.assertEqual(torch.tensor(col_indices, dtype=index_dtype), sparse.col_indices()) - self.assertEqual(torch.tensor(values, 
dtype=dtype), sparse.values()) - - @dtypes(*get_all_dtypes()) - def test_sparse_csr_constructor_from_lists(self, device, dtype): - # without size - sparse = torch.sparse_csr_tensor([0, 2, 4], - [0, 1, 0, 1], - [1, 2, 3, 4], - dtype=dtype, - device=device) +def sparse_compressed_nonblock_layouts(test_name='layout'): + return parametrize(test_name, [ + subtest(torch.sparse_csr, name='SparseCSR'), + subtest(torch.sparse_csc, name='SparseCSC')]) + +sparse_compressed_indices_methods = { + torch.sparse_csr: (torch.Tensor.crow_indices, torch.Tensor.col_indices), + torch.sparse_csc: (torch.Tensor.ccol_indices, torch.Tensor.row_indices), + torch.sparse_bsr: (torch.Tensor.crow_indices, torch.Tensor.col_indices), + torch.sparse_bsc: (torch.Tensor.ccol_indices, torch.Tensor.row_indices), +} - self.assertEqual((2, 2), sparse.shape) - self.assertEqual(4, sparse.numel()) - self.assertEqual(torch.tensor([0, 2, 4], dtype=torch.int64, device=device), sparse.crow_indices()) - self.assertEqual(torch.tensor([0, 1, 0, 1], dtype=torch.int64, device=device), sparse.col_indices()) - self.assertEqual(torch.tensor([1, 2, 3, 4], dtype=dtype, device=device), sparse.values()) - - # with size - for sparse_csr_tensor in [torch.sparse_csr_tensor, torch._sparse_csr_tensor_unsafe]: - sparse = sparse_csr_tensor([0, 2, 4], - [0, 1, 0, 1], - [1, 2, 3, 4], - size=(2, 10), - dtype=dtype, - device=device) - - self.assertEqual((2, 10), sparse.shape) - self.assertEqual(torch.tensor([0, 2, 4], dtype=torch.int64, device=device), sparse.crow_indices()) - self.assertEqual(torch.tensor([0, 1, 0, 1], dtype=torch.int64, device=device), sparse.col_indices()) - self.assertEqual(torch.tensor([1, 2, 3, 4], dtype=dtype, device=device), sparse.values()) + +class TestSparseCompressed(TestCase): + """Testing sparse compressed (CSR, CSC, BSR, BSC) tensor generic features. + """ + + def genTensor(self, size, nnz, *, layout, device=None, dtype=torch.float, index_dtype=torch.int64): + if device is None: + device = self.device_type + return self.genSparseCompressedTensor(size, nnz, device=device, dtype=dtype, index_dtype=index_dtype, layout=layout) + + def _generate_small_inputs(self, layout, device, dtype, index_dtype): + """Generator of inputs to sparse compressed tensor factory functions. 
+ + The input is defined as a 4-tuple: + compressed_indices, plain_indices, values, expected_size_from_shape_inference + """ + from operator import mul + from functools import reduce + if layout in {torch.sparse_csr, torch.sparse_csc}: + yield (torch.tensor([0, 2, 4], device=device, dtype=index_dtype), + torch.tensor([0, 1, 0, 1], device=device, dtype=index_dtype), + torch.tensor([1, 2, 3, 4], device=device, dtype=dtype), + (2, 2)) + yield (torch.tensor([0, ], device=device, dtype=index_dtype), + torch.tensor([], device=device, dtype=index_dtype), + torch.tensor([], device=device, dtype=dtype), + (0, 0)) + for batch_shape in [(2,), (2, 3)]: + prod = reduce(mul, batch_shape, 1) + yield (torch.tensor([0, 2, 4], device=device, dtype=index_dtype).repeat(prod, 1).reshape(*batch_shape, -1), + torch.tensor([0, 1, 0, 1], device=device, dtype=index_dtype).repeat(prod, 1).reshape(*batch_shape, -1), + torch.tensor([1, 2, 3, 4], device=device, dtype=dtype).repeat(prod, 1).reshape(*batch_shape, -1), + (*batch_shape, 2, 2)) + else: + assert layout in {torch.sparse_bsr, torch.sparse_bsc} + yield (torch.tensor([0, 2, 4], device=device, dtype=index_dtype), + torch.tensor([0, 1, 0, 1], device=device, dtype=index_dtype), + torch.tensor([[[1, 11]], [[2, 22]], [[3, 33]], [[4, 44]]], device=device, dtype=dtype), + (2, 2)) + yield (torch.tensor([0, ], device=device, dtype=index_dtype), + torch.tensor([], device=device, dtype=index_dtype), + torch.tensor([], device=device, dtype=dtype).reshape(1, 0, 0), + (0, 0)) + for batch_shape in [(2,), (2, 3)]: + prod = reduce(mul, batch_shape, 1) + yield (torch.tensor([0, 2, 4], device=device, dtype=index_dtype).repeat(prod, 1).reshape(*batch_shape, -1), + torch.tensor([0, 1, 0, 1], device=device, dtype=index_dtype).repeat(prod, 1).reshape(*batch_shape, -1), + torch.tensor([[[1, 11]], [[2, 22]], [[3, 33]], [[4, 44]]], + device=device, dtype=dtype).repeat(prod, 1, 1).reshape(*batch_shape, 4, 1, 2), + (*batch_shape, 2, 2)) + + @all_sparse_compressed_layouts() + @onlyCPU + def test_layout(self, layout): + self.assertIn(str(layout), {'torch.sparse_csr', 'torch.sparse_csc', 'torch.sparse_bsr', 'torch.sparse_bsc'}) + self.assertEqual(type(layout), torch.layout) + + @parametrize('shape_and_device_inference', [subtest(False, name='_'), subtest(False, name='shape_and_device_inference')]) + @parametrize('use_factory_function', [subtest(False, name='_'), subtest(True, name='factory')]) + @parametrize('input_kind', [subtest('tensor', name='from_tensor'), subtest('list', name='from_list')]) + @all_sparse_compressed_layouts() + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) + def test_sparse_compressed_constructor(self, layout, device, dtype, + use_factory_function, shape_and_device_inference, input_kind): + factory_function = { + torch.sparse_csr: torch.sparse_csr_tensor, + torch.sparse_csc: torch.sparse_csc_tensor, + torch.sparse_bsr: torch.sparse_bsr_tensor, + torch.sparse_bsc: torch.sparse_bsc_tensor, + }[layout] + compressed_indices_mth, plain_indices_mth = sparse_compressed_indices_methods[layout] + for index_dtype in [torch.int32, torch.int64]: + for compressed_indices, plain_indices, values, size in self._generate_small_inputs(layout, device, dtype, index_dtype): + if input_kind == 'list': + if size == (0, 0): + # for this degenerate case, plain_indices must + # remain a tensor because + # tensor(plain_indices) results a float dtype + # when plain_indices is an empty list + if index_dtype == torch.int32: + # skip testing int32 case because + # 
tensor(compressed_indices) results a + # int64 dtype when compressed_indices is + # [0] (a list of single int zero). + continue + else: + plain_indices = plain_indices.tolist() + compressed_indices = compressed_indices.tolist() + values = values.tolist() + if size == (0, 0) and layout in {torch.sparse_bsr, torch.sparse_bsc}: + # in the block sparse case, values of type list needs to represent a 3-D tensor + values = [[[]]] + if use_factory_function: + if shape_and_device_inference: + sparse = factory_function(compressed_indices, plain_indices, values) + else: + sparse = factory_function(compressed_indices, plain_indices, values, size, + dtype=dtype, device=device) + else: + if shape_and_device_inference: + sparse = torch.sparse_compressed_tensor(compressed_indices, plain_indices, values, layout=layout) + else: + sparse = torch.sparse_compressed_tensor(compressed_indices, plain_indices, values, size, + dtype=dtype, layout=layout, device=device) + self.assertEqual(layout, sparse.layout) + self.assertEqual(size, sparse.shape) + self.assertEqual(compressed_indices, compressed_indices_mth(sparse)) + self.assertEqual(plain_indices, plain_indices_mth(sparse)) + self.assertEqual(values, sparse.values()) @skipMeta - @dtypes(*get_all_dtypes()) - def test_empty(self, device, dtype): + @sparse_compressed_nonblock_layouts() + @dtypes(*all_types_and_complex_and(torch.bool, torch.bfloat16, torch.half)) + def test_empty(self, layout, device, dtype): ns = [5, 2, 0] - for shape in itertools.product(ns, ns): - result = torch.empty(shape, dtype=dtype, device=device, layout=torch.sparse_csr) + batch_shapes = [(), (2,), (2, 3)] + compressed_dim = { + torch.sparse_csr: -2, + torch.sparse_csc: -1, + }[layout] + compressed_indices_mth, plain_indices_mth = sparse_compressed_indices_methods[layout] + for m, n, b in itertools.product(ns, ns, batch_shapes): + shape = (*b, m, n) + result = torch.empty(shape, dtype=dtype, device=device, layout=layout) self.assertEqual(result.shape, shape) self.assertEqual(result.dtype, dtype) self.assertEqual(result.device, torch.device(device)) - self.assertEqual(result.layout, torch.sparse_csr) - self.assertEqual(result.crow_indices().shape, (shape[0] + 1,)) - self.assertEqual(result.col_indices().shape, (0,)) - self.assertEqual(result.values().shape, (0,)) + self.assertEqual(result.layout, layout) + self.assertEqual(compressed_indices_mth(result).shape, (*b, shape[compressed_dim] + 1,)) + self.assertEqual(plain_indices_mth(result).shape, (*b, 0,)) + self.assertEqual(result.values().shape, (*b, 0,)) self.assertEqual(result._nnz(), 0) - self.assertEqual(result.crow_indices().device, torch.device(device)) - self.assertEqual(result.col_indices().device, torch.device(device)) + self.assertEqual(compressed_indices_mth(result).device, torch.device(device)) + self.assertEqual(plain_indices_mth(result).device, torch.device(device)) self.assertEqual(result.values().device, torch.device(device)) - self.assertEqual(result.crow_indices().dtype, torch.int64) - self.assertEqual(result.col_indices().dtype, torch.int64) + self.assertEqual(compressed_indices_mth(result).dtype, torch.int64) + self.assertEqual(plain_indices_mth(result).dtype, torch.int64) self.assertEqual(result.values().dtype, dtype) @skipMeta - @dtypes(*get_all_dtypes()) - def test_empty_errors(self, device, dtype): - with self.assertRaisesRegex(RuntimeError, "torch.empty: Only 2D sparse CSR tensors are supported."): - torch.empty((5,), dtype=dtype, device=device, layout=torch.sparse_csr) - - with self.assertRaisesRegex(RuntimeError, 
"torch.empty: Only 2D sparse CSR tensors are supported."): - torch.empty((2, 3, 4), dtype=dtype, device=device, layout=torch.sparse_csr) + @sparse_compressed_nonblock_layouts() + @dtypes(*all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16)) + def test_empty_errors(self, layout, device, dtype): + with self.assertRaisesRegex(RuntimeError, + "torch.empty: Only batched sparse compressed \\(non-block\\) tensors are supported" + ", but got size"): + torch.empty((5,), dtype=dtype, device=device, layout=layout) @skipMeta - @dtypes(*get_all_dtypes()) - def test_clone(self, device, dtype): - x = torch.sparse_csr_tensor([0, 2, 4], - [0, 1, 0, 1], - [1, 2, 3, 4], - dtype=dtype, - device=device) - y = x.clone() - - self.assertEqual(x.shape, y.shape) - self.assertEqual(x.crow_indices(), y.crow_indices()) - self.assertEqual(x.col_indices(), y.col_indices()) - self.assertEqual(x.values(), y.values()) + @all_sparse_compressed_layouts() + @dtypes(*all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16)) + def test_clone(self, layout, device, dtype): + for compressed_indices, plain_indices, values, size in self._generate_small_inputs( + layout, device, dtype, index_dtype=torch.int32): + sparse = torch.sparse_compressed_tensor(compressed_indices, plain_indices, values, size, + dtype=dtype, layout=layout, device=device) + cloned_sparse = sparse.clone() + self.assertEqual(sparse, cloned_sparse) + + @all_sparse_compressed_layouts() + def test_print(self, layout, device): + compressed_indices_mth, plain_indices_mth = sparse_compressed_indices_methods[layout] + printed = [] + for index_dtype in [torch.int32, torch.int64]: + for dtype in [torch.float32, torch.float64]: + for compressed_indices, plain_indices, values, size in self._generate_small_inputs( + layout, device, dtype, index_dtype): + batch_shape = tuple(size[:-2]) + block_shape = tuple(values.shape[-2:]) if layout in {torch.sparse_bsr, torch.sparse_bsc} else () + if size not in [(2, 2), (0, 0), (2, 3, 2, 2), (2, 2, 2)]: + # Skip inputs that are not in the list of + # expected sizes to ensure the stability of + # test_print in the case + # _generate_small_inputs is extended with new + # inputs + continue + if block_shape not in [(), (0, 0), (1, 2)]: + # Skip inputs that are not in the list of + # expected block sizes to ensure test_print + # stability. 
+ continue + printed.append("########## {}/{}/batch_shape={}/block_shape={} ##########".format( + dtype, index_dtype, batch_shape, block_shape)) + x = torch.sparse_compressed_tensor(compressed_indices, + plain_indices, + values, dtype=dtype, layout=layout, device=device) + printed.append("# sparse tensor") + printed.append(str(x)) + printed.append(f"# _{compressed_indices_mth.__name__}") + printed.append(str(compressed_indices_mth(x))) + printed.append(f"# _{plain_indices_mth.__name__}") + printed.append(str(plain_indices_mth(x))) + printed.append("# _values") + printed.append(str(x.values())) + printed.append('') + printed.append('') + orig_maxDiff = self.maxDiff + self.maxDiff = None + try: + self.assertExpected('\n'.join(printed)) + self.maxDiff = orig_maxDiff + except Exception: + self.maxDiff = orig_maxDiff + raise @skipMeta - @dtypes(*get_all_dtypes()) - def test_copy(self, device, dtype): + @all_sparse_compressed_layouts() + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) + def test_copy(self, layout, device, dtype): def run_test(shape, nnz, index_type): - a = self.genSparseCSRTensor(shape, nnz, dtype=dtype, device=device, index_dtype=index_dtype) - b = self.genSparseCSRTensor(shape, nnz, dtype=dtype, device=device, index_dtype=index_dtype) + block_size = (2, 3) if layout in {torch.sparse_bsr, torch.sparse_bsc} else () + a = self.genSparseCompressedTensor(shape, nnz, dtype=dtype, layout=layout, device=device, + index_dtype=index_dtype, block_size=block_size) + b = self.genSparseCompressedTensor(shape, nnz, dtype=dtype, layout=layout, device=device, + index_dtype=index_dtype, block_size=block_size) a.copy_(b) - self.assertEqual(a.crow_indices(), b.crow_indices()) - self.assertEqual(a.col_indices(), b.col_indices()) - self.assertEqual(a.values(), b.values()) + self.assertEqual(a, b) ns = [5, 2, 0] - for shape, index_dtype in zip(itertools.product(ns, ns), [torch.int32, torch.int64]): - run_test(shape, 0, index_dtype) - run_test(shape, shape[0] * shape[1], index_dtype) + batch_shapes = [(), (2,), (2, 3)] + for (m, n, b), index_dtype in zip(itertools.product(ns, ns, batch_shapes), [torch.int32, torch.int64]): + run_test((*b, m, n), 0, index_dtype) + run_test((*b, m, n), m * n, index_dtype) @skipMeta - @dtypes(*get_all_dtypes()) - def test_copy_errors(self, device, dtype): + @all_sparse_compressed_layouts() + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) + def test_copy_errors(self, layout, device, dtype): + block_size = (2, 3) if layout in {torch.sparse_bsr, torch.sparse_bsc} else () for index_dtype in [torch.int32, torch.int64]: shape1 = (2, 3) - shape2 = (3, 2) - a = self.genSparseCSRTensor(shape1, 0, dtype=dtype, device=device, index_dtype=index_dtype) - b = self.genSparseCSRTensor(shape2, 0, dtype=dtype, device=device, index_dtype=index_dtype) + a = self.genSparseCompressedTensor(shape1, 0, dtype=dtype, layout=layout, device=device, + index_dtype=index_dtype, block_size=block_size) - with self.assertRaisesRegex(RuntimeError, "only same size tensors are supported."): - a.copy_(b) - - with self.assertRaisesRegex(RuntimeError, "copy between different layouts is not supported."): + with self.assertRaisesRegex(RuntimeError, + "copy of sparse compressed tensors having different layouts is not supported."): a.copy_(torch.empty(a.shape, dtype=dtype, device=device)) - b = self.genSparseCSRTensor(shape1, 1, dtype=dtype, device=device, index_dtype=index_dtype) - with self.assertRaisesRegex(RuntimeError, "only tensors with the same 
number of specified elements are supported."): + b = self.genSparseCompressedTensor(shape1, 1, dtype=dtype, layout=layout, device=device, + index_dtype=index_dtype, block_size=block_size) + with self.assertRaisesRegex(RuntimeError, + "only sparse compressed tensors with the same number of specified elements are supported."): a.copy_(b) + shape2 = tuple(reversed(shape1)) + c = self.genSparseCompressedTensor(shape2, 1, dtype=dtype, layout=layout, device=device, + index_dtype=index_dtype, block_size=block_size) + with self.assertRaisesRegex( + RuntimeError, + "expected shapes of self and src to match along dimension"): + b.copy_(c) + + if block_size: + block_size1 = tuple(reversed(block_size)) + d = self.genSparseCompressedTensor(shape1, 1, dtype=dtype, layout=layout, device=device, + index_dtype=index_dtype, block_size=block_size1) + with self.assertRaisesRegex(RuntimeError, + "copy of sparse compressed tensors having different block sizes is not supported"): + b.copy_(d) + + +class TestSparseCSR(TestCase): + + def test_csr_stride(self): + a = self.genSparseCSRTensor((3, 3), 3, dtype=torch.float, device=self.device_type, index_dtype=torch.int64) + + with self.assertRaisesRegex(RuntimeError, "Sparse CSR tensors do not have strides"): + a.stride() + + with self.assertRaisesRegex(RuntimeError, "Sparse CSR tensors do not have strides"): + a.stride(-1) + + def test_csr_storage(self): + a = self.genSparseCSRTensor((3, 3), 3, dtype=torch.float, device=self.device_type, index_dtype=torch.int64) + + with self.assertRaisesRegex(RuntimeError, "Cannot access storage of SparseCsrTensorImpl"): + a.storage() + + def test_csr_is_contiguous(self): + a = self.genSparseCSRTensor((3, 3), 3, dtype=torch.float, device=self.device_type, index_dtype=torch.int64) + + with self.assertRaisesRegex(RuntimeError, "Tensors of type SparseCsrTensorImpl do not have is_contiguous"): + a.is_contiguous() + + def test_csr_double_to_sparse_csr(self): + a = self.genSparseCSRTensor((3, 3), 3, dtype=torch.float, device=self.device_type, index_dtype=torch.int64) + a.to_sparse_csr().to_sparse_csr() + + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) + def test_sparse_csr_select(self, device, dtype): + batch_shape = (2, 3) + crow_indices = torch.tensor([0, 2, 4], device=device).repeat(6, 1).reshape(*batch_shape, -1) + col_indices = torch.tensor([0, 1, 0, 1], device=device).repeat(6, 1).reshape(*batch_shape, -1) + values = torch.tensor([1, 2, 3, 4], device=device, dtype=dtype).repeat(6, 1).reshape(*batch_shape, -1) + sparse = torch.sparse_csr_tensor(crow_indices, + col_indices, + values, + size=(*batch_shape, 2, 10), + dtype=dtype, + device=device) + + # select from batch dimensions + sparse_selected12 = sparse.select(1, 2) + expected_sparse_selected12 = torch.sparse_csr_tensor(crow_indices.select(1, 2).contiguous(), + col_indices.select(1, 2).contiguous(), + values.select(1, 2).contiguous(), + size=(2, 2, 10), + dtype=dtype, + device=device) + self.assertEqual(expected_sparse_selected12, sparse_selected12) + + # select from rows or columns + sparse_non_batched = sparse[0, 0] + for selects_args in [(0, 0), (1, 1)]: + sparse_selected = sparse_non_batched.select(*selects_args) + dense_selected = sparse_non_batched.to_dense().select(*selects_args) + self.assertEqual(dense_selected, sparse_selected) + + # index a single element + self.assertEqual(sparse[0, 0, 0, 0], sparse.to_dense()[0, 0, 0, 0]) + + # selecting from rows or columns for batched CSR is not yet implemented + with 
self.assertRaisesRegex(RuntimeError, "selecting rows or columns is not implemented for batched"): + sparse.select(-2, 0) + + with self.assertRaisesRegex(RuntimeError, "selecting rows or columns is not implemented for batched"): + sparse.select(-1, 0) + + # assigning to sparse trhough indexing is disabled + with self.assertRaisesRegex(TypeError, "Cannot assign to a sparse tensor"): + sparse[0, 0, 0, 0] = 99.0 + @skipMeta - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_resize(self, device, dtype): - for index_dtype in [torch.int32, torch.int64]: - shape = (2, 3) + batch_shapes = [(), (2,), (2, 3)] + for index_dtype, b in zip([torch.int32, torch.int64], batch_shapes): + shape = (*b, 2, 3) nnz = 6 a = self.genSparseCSRTensor(shape, nnz, dtype=dtype, device=device, index_dtype=index_dtype) - new_shape = (4, 5) + new_shape = (*b, 4, 5) a.resize_(new_shape) self.assertEqual(a.shape, new_shape) # resize to larger shape doesn't add specified elements self.assertEqual(a._nnz(), nnz) - new_shape = (1, 5) + new_shape = (*b, 1, 5) a.resize_(new_shape) self.assertEqual(a.shape, new_shape) # resize to smaller shape trims specified elements self.assertEqual(a._nnz(), 5) + # trim batched dimensions + a.resize_(new_shape[-2], new_shape[-1]) + self.assertEqual(a.shape, (new_shape[-2], new_shape[-1])) + self.assertEqual(a._nnz(), 5) + @skipMeta - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_resize_errors(self, device, dtype): for index_dtype in [torch.int32, torch.int64]: shape = (2, 3) nnz = 6 a = self.genSparseCSRTensor(shape, nnz, dtype=dtype, device=device, index_dtype=index_dtype) - with self.assertRaisesRegex(RuntimeError, "torch.resize_: Only 2D sparse CSR tensors are supported."): + with self.assertRaisesRegex(RuntimeError, "torch.resize_: Only batched sparse CSR matrices are supported"): new_shape = (4,) a.resize_(new_shape) @@ -308,7 +548,7 @@ def test_factory_type_invariants_check(self, device): torch.tensor([1, 2, 3, 4]), device=device) - with self.assertRaisesRegex(RuntimeError, r"\"csr_construct_check\" not implemented for 'Short'"): + with self.assertRaisesRegex(RuntimeError, r"\"validate_sparse_compressed_tensor_args\" not implemented for 'Short'"): torch.sparse_csr_tensor(torch.tensor([0, 2, 4], dtype=torch.int16), torch.tensor([0, 1, 0, 1], dtype=torch.int16), torch.tensor([1, 2, 3, 4]), @@ -334,49 +574,63 @@ def test_factory_layout_invariants_check(self, device): torch.tensor([1, 2, 3, 4])) def test_factory_shape_invariants_check(self, device): - crow_indices = [0, 2, 4] - col_indices = [0, 1, 0, 1] - values = [1, 2, 3, 4] + crow_indices = torch.tensor([0, 2, 4], device=device) + col_indices = torch.tensor([0, 1, 0, 1], device=device) + values = torch.tensor([1, 2, 3, 4], device=device) size = (2, 10) - torch.sparse_csr_tensor(torch.tensor(crow_indices), torch.tensor(col_indices), torch.tensor(values), size, - device=device) + torch.sparse_csr_tensor(crow_indices, col_indices, values, size, device=device) - with self.assertRaisesRegex(RuntimeError, r"size of a CSR tensor must be of length 2, but got: 3"): - torch.sparse_csr_tensor(torch.tensor(crow_indices), torch.tensor(col_indices), torch.tensor(values), - size=(2, 10, 2), + with self.assertRaisesRegex(RuntimeError, r"size of a batched CSR tensor must have length >= 2, but got: 1"): + torch.sparse_csr_tensor(crow_indices, col_indices, values, + size=(2,), device=device) - with 
self.assertRaisesRegex(RuntimeError, r"crow_indices must have dim\=1 but got crow_indices\.dim\(\)\=2"): - torch.sparse_csr_tensor(torch.tensor(crow_indices).repeat(2, 1), - torch.tensor(col_indices), - torch.tensor(values), + with self.assertRaisesRegex(RuntimeError, r"crow_indices must have dim >= 1 but got crow_indices\.dim\(\)\ = 0"): + torch.sparse_csr_tensor(torch.zeros((), device=device, dtype=torch.int64), + col_indices, + values, size, device=device) - with self.assertRaisesRegex(RuntimeError, r"col_indices must have dim\=1 but got col_indices\.dim\(\)\=2"): - torch.sparse_csr_tensor(torch.tensor(crow_indices), - torch.tensor(col_indices).repeat(2, 1), - torch.tensor(values), + with self.assertRaisesRegex(RuntimeError, r"col_indices must have dim >= 1 but got col_indices\.dim\(\)\ = 0"): + torch.sparse_csr_tensor(crow_indices, + torch.zeros((), device=device, dtype=torch.int64), + values, size, device=device) - with self.assertRaisesRegex(RuntimeError, r"values must have dim\=1 but got values\.dim\(\)\=2"): - torch.sparse_csr_tensor(torch.tensor(crow_indices), - torch.tensor(col_indices), - torch.tensor(values).repeat(2, 1), + with self.assertRaisesRegex(RuntimeError, r"values must have dim >= 1 but got values\.dim\(\)\ = 0"): + torch.sparse_csr_tensor(crow_indices, + col_indices, + torch.zeros((), device=device, dtype=torch.int64), size, device=device) with self.assertRaisesRegex(RuntimeError, - r"crow_indices\.numel\(\) must be size\(0\) \+ 1, but got: 3"): - torch.sparse_csr_tensor(torch.tensor(crow_indices), torch.tensor(col_indices), torch.tensor(values), (1, 1), + r"crow_indices\.size\(-1\) must be equal to size\[-2\] \+ 1 \(that is 2\), but got: 3"): + torch.sparse_csr_tensor(crow_indices, col_indices, values, (1, 1), + device=device) + + + with self.assertRaisesRegex(RuntimeError, + r"number of dimensions of crow_indices and col_indices must be the same"): + torch.sparse_csr_tensor(crow_indices, col_indices.repeat(2, 1), values, size, + device=device) + + with self.assertRaisesRegex(RuntimeError, + r"number of dimensions of indices and values must be the same"): + torch.sparse_csr_tensor(crow_indices, col_indices, values.repeat(2, 1), size, device=device) + with self.assertRaisesRegex(RuntimeError, + r"number of dimensions of indices must be one less"): + torch.sparse_csr_tensor(crow_indices.repeat(2, 1), col_indices.repeat(2, 1), values.repeat(2, 1), size, + device=device) with self.assertRaisesRegex(RuntimeError, - r"col_indices and values must have equal sizes, " + - r"but got col_indices\.numel\(\): 3, values\.numel\(\): 4"): - torch.sparse_csr_tensor(torch.tensor(crow_indices), torch.tensor([0, 1, 0]), torch.tensor(values), size, + r"all batch dimensions of the provided size \(\[2\]\), indices \(\[2\], \[3\]\)," + r" and values \(\[4\]\) must be the same"): + torch.sparse_csr_tensor(crow_indices.repeat(2, 1), col_indices.repeat(3, 1), values.repeat(4, 1), (2, 2, 10), device=device) def test_factory_indices_invariants_check(self, device): @@ -395,7 +649,7 @@ def test_factory_indices_invariants_check(self, device): with self.assertRaisesRegex(RuntimeError, r"at position i \= 2," + - r" this condition crow_indices\[i - 1\] <\= crow_indices\[i\] fails"): + r" the condition crow_indices\[i - 1\] <\= crow_indices\[i\] fails"): torch.sparse_csr_tensor(torch.tensor([0, 5, 4]), torch.tensor(col_indices), torch.tensor(values), size, device=device) @@ -403,12 +657,12 @@ def test_factory_indices_invariants_check(self, device): torch.sparse_csr_tensor(torch.tensor(crow_indices), 
torch.tensor([0, -1, 0, 1]), torch.tensor(values), size, device=device) - with self.assertRaisesRegex(RuntimeError, r"size\(1\) should be greater than col_indices\.max\(\)"): + with self.assertRaisesRegex(RuntimeError, r"size\[-1\] should be greater than col_indices\.max\(\)"): torch.sparse_csr_tensor(torch.tensor(crow_indices), torch.tensor([0, 11, 0, 1]), torch.tensor(values), size, device=device) @onlyCUDA - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_factory_device_type_inference(self, device, dtype): cpu_cuda = ('cpu', 'cuda') cpu_cuda_none = cpu_cuda + (None,) @@ -442,44 +696,7 @@ def test_factory_device_type_inference(self, device, dtype): t.crow_indices().device == t.values().device t.col_indices().device == t.values().device - def test_sparse_csr_print(self, device): - orig_maxDiff = self.maxDiff - self.maxDiff = None - shape_nnz = [ - ((10, 10), 10), - ((100, 10), 10), - ((1000, 10), 10) - ] - printed = [] - for shape, nnz in shape_nnz: - values_shape = torch.Size((nnz,)) - col_indices_shape = torch.Size((nnz,)) - crow_indices_shape = torch.Size((shape[0] + 1,)) - printed.append("# shape: {}".format(torch.Size(shape))) - printed.append("# nnz: {}".format(nnz)) - printed.append("# crow_indices shape: {}".format(crow_indices_shape)) - printed.append("# col_indices shape: {}".format(col_indices_shape)) - printed.append("# values_shape: {}".format(values_shape)) - for index_dtype in [torch.int32, torch.int64]: - for dtype in floating_types(): - printed.append("########## {}/{} ##########".format(dtype, index_dtype)) - x = torch.sparse_csr_tensor(torch.tensor([0, 2, 4], dtype=index_dtype), - torch.tensor([0, 1, 0, 1], dtype=index_dtype), - torch.tensor([1, 2, 3, 4]), dtype=dtype, device=device) - printed.append("# sparse tensor") - printed.append(str(x)) - printed.append("# _crow_indices") - printed.append(str(x.crow_indices())) - printed.append("# _col_indices") - printed.append(str(x.col_indices())) - printed.append("# _values") - printed.append(str(x.values())) - printed.append('') - printed.append('') - self.assertExpected('\n'.join(printed)) - self.maxDiff = orig_maxDiff - - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_sparse_csr_from_dense(self, device, dtype): dense = torch.tensor([[4, 5, 0], [0, 0, 0], [1, 0, 0]], dtype=dtype, device=device) sparse = dense.to_sparse_csr() @@ -499,7 +716,7 @@ def test_sparse_csr_from_dense(self, device, dtype): self.assertEqual(torch.tensor([0, 1, 2] * 3, dtype=torch.int64), sparse.col_indices()) self.assertEqual(torch.tensor([2] * 9, dtype=dtype), sparse.values()) - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_sparse_csr_to_dense(self, device, dtype): mn = [5, 2, 0] for (m, n) in itertools.product(mn, mn): @@ -508,14 +725,15 @@ def test_sparse_csr_to_dense(self, device, dtype): sparse = dense.to_sparse_csr() self.assertEqual(sparse.to_dense(), dense) - crow_indices = torch.tensor([0, 3, 5]) - col_indices = torch.tensor([0, 1, 2, 0, 1]) - values = torch.tensor([1, 2, 1, 3, 4], dtype=dtype) - csr = torch.sparse_csr_tensor(crow_indices, col_indices, - values, dtype=dtype, device=device) - dense = torch.tensor([[1, 2, 1], [3, 4, 0]], dtype=dtype, device=device) + batch_shape = (2, 3) + crow_indices = torch.tensor([0, 3, 5], device=device).repeat(6, 1).reshape(*batch_shape, -1) + col_indices = torch.tensor([0, 1, 2, 0, 1], 
device=device).repeat(6, 1).reshape(*batch_shape, -1) + values = torch.tensor([1, 2, 1, 3, 4], device=device, dtype=dtype).repeat(6, 1).reshape(*batch_shape, -1) + csr = torch.sparse_csr_tensor(crow_indices, col_indices, values, dtype=dtype, device=device) + dense = torch.tensor([[1, 2, 1], [3, 4, 0]], dtype=dtype, device=device).repeat(6, 1).reshape(csr.shape) self.assertEqual(csr.to_dense(), dense) + @skipMeta @skipCPUIfNoMklSparse @coalescedonoff @dtypes(torch.double) @@ -559,7 +777,40 @@ def test_coo_to_csr_convert(self, device, dtype, coalesced): values = torch.tensor([2, 1, 6, 4, 10, 3, 5, 9, 8, 7], dtype=dtype, device=device) self.assertEqual(csr.values(), values) - @dtypes(*get_all_dtypes()) + @parametrize("blocksize", [2, 4]) + @dtypes((torch.double, torch.int32), (torch.double, torch.int64)) + @unittest.skipIf(not TEST_SCIPY, "SciPy not found") + @skipMeta + def test_csr_to_block_csr(self, device, dtypes, blocksize): + for shape in [(24, 24), (12, 24)]: + dtype, index_dtype = dtypes + m, k = shape + nnz = random.randint(0, m * k) + t = self.genSparseCSRTensor((m * blocksize, k * blocksize), nnz, dtype=dtype, + device=device, index_dtype=index_dtype) + st = sp.csr_matrix((t.values().cpu(), t.col_indices().cpu(), t.crow_indices().cpu()), shape=tuple(t.size())) + block_t = t.to_sparse_bsr((blocksize, blocksize)) + self.assertEqual(block_t.values().dim(), 3) + self.assertTrue(block_t.layout == torch.sparse_bsr) + block_st = st.tobsr(blocksize=(blocksize, blocksize)) + self.assertEqual(block_t.values().cpu(), block_st.data) + self.assertEqual(block_t.col_indices().cpu(), torch.tensor(block_st.indices).to(index_dtype)) + self.assertEqual(block_t.crow_indices().cpu(), torch.tensor(block_st.indptr).to(index_dtype)) + + @dtypes(torch.double) + @unittest.skipIf(not TEST_SCIPY, "SciPy not found") + def test_csr_to_block_csr_errors(self, device, dtype): + for index_dtype in [torch.int32, torch.int64]: + nnz = 15 + t = self.genSparseCSRTensor((16, 16), nnz, dtype=dtype, + device=device, index_dtype=index_dtype) + with self.assertRaisesRegex(RuntimeError, "must be square."): + block_t = t.to_sparse_bsr((2, 3)) + + with self.assertRaisesRegex(RuntimeError, r"size \(16, 16\) with block size \(5, 5\)"): + block_t = t.to_sparse_bsr((5, 5)) + + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_sparse_csr_from_dense_convert_error(self, device, dtype): size = (4, 2, 4) dense = make_tensor(size, dtype=dtype, device=device) @@ -585,8 +836,9 @@ def test_matmul_device_mismatch(self, device, dtype): @skipCPUIfNoMklSparse @skipCUDAIfNoCusparseGeneric @dtypes(*floating_and_complex_types()) - @dtypesIfCUDA(*get_all_complex_dtypes(), - *get_all_fp_dtypes(include_half=SM53OrLater, include_bfloat16=SM80OrLater)) + @dtypesIfCUDA(*floating_and_complex_types_and( + *[torch.half] if SM53OrLater else [], + *[torch.bfloat16] if SM80OrLater else [])) def test_csr_matvec(self, device, dtype): side = 100 for index_dtype in [torch.int32, torch.int64]: @@ -624,7 +876,7 @@ def run_test(c, a, a_batched, b, op_b=False, op_out=False, *, dtype=None, device self.assertEqual(actual, expected) for index_dtype in [torch.int32, torch.int64]: - for (m, n, k), batch_size, noncontiguous in zip(itertools.product([1, 5], repeat=3), [1, 3], [True, False]): + for (m, n, k), batch_size, noncontiguous in zip(itertools.product([2, 5], repeat=3), [1, 3], [True, False]): nnz = random.randint(0, m * k) a = self.genSparseCSRTensor((m, k), nnz, dtype=dtype, device=device, index_dtype=index_dtype) @@ -657,7 
+909,7 @@ def run_test(a, a_batched, b, op_b=False, op_out=False, *, dtype=None, device=No self.assertEqual(actual, expected) for index_dtype in [torch.int32, torch.int64]: - for (m, n, k), batch_size, noncontiguous in zip(itertools.product([1, 5], repeat=3), [1, 3], [True, False]): + for (m, n, k), batch_size, noncontiguous in zip(itertools.product([2, 5], repeat=3), [1, 3], [True, False]): nnz = random.randint(0, m * k) a = self.genSparseCSRTensor((m, k), nnz, dtype=dtype, device=device, index_dtype=index_dtype) @@ -691,46 +943,75 @@ def run_test_block_addmm_addmv(self, addmv_addmm, c, a, b, op_b=False, op_out=Fa self.assertEqual(actual, out) self.assertEqual(actual, expected) + # TODO: block_size 1 is broken + @parametrize("block_size", [2, 3]) + @parametrize("index_dtype", [torch.int32, torch.int64]) + @parametrize("noncontiguous", [True, False]) @skipCPUIfNoMklSparse @unittest.skipIf(not TEST_SCIPY, "SciPy not found") @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) - def test_block_addmm(self, device, dtype): - for index_dtype in [torch.int32, torch.int64]: - for (m, n, k), block_size, noncontiguous in zip(itertools.product([1, 5], repeat=3), [1, 2, 3], [True, False]): - nnz = random.randint(0, m * k) + @precisionOverride({torch.float32: 1e-3, torch.complex64: 1e-3, + torch.float64: 1e-5, torch.complex128: 1e-5}) + def test_block_addmm(self, device, dtype, index_dtype, block_size, noncontiguous): + for (m, n, k) in itertools.product([2, 5], repeat=3): + nnz = random.randint(0, m * k) + if not noncontiguous: + a = self.genSparseCSRTensor((m * block_size, k * block_size), nnz, + dtype=dtype, device=device, index_dtype=index_dtype) + a = a.to_sparse_bsr((block_size, block_size)) + else: a = self.genSparseCSRTensor((m, k), nnz, dtype=dtype, device=device, index_dtype=index_dtype) a_data = make_tensor((nnz, block_size, block_size), dtype=dtype, device=device) a_data = a_data.mT if noncontiguous else a_data # Test column-major blocks - a = torch._sparse_csr_tensor_unsafe(a.crow_indices(), a.col_indices(), a_data, (m * block_size, k * block_size)) - b = make_tensor((k * block_size, n * block_size), dtype=dtype, device=device, noncontiguous=noncontiguous) - c = make_tensor((m * block_size, n * block_size), dtype=dtype, device=device, noncontiguous=noncontiguous) - for op_b, op_out in itertools.product([True, False], repeat=2): - self.run_test_block_addmm_addmv(torch.addmm, c, a, b, op_b, op_out, dtype=dtype, device=device) - + a = torch._sparse_bsr_tensor_unsafe(a.crow_indices(), a.col_indices(), + a_data, (m * block_size, k * block_size)) + b = make_tensor((k * block_size, n * block_size), dtype=dtype, device=device, noncontiguous=noncontiguous) + c = make_tensor((m * block_size, n * block_size), dtype=dtype, device=device, noncontiguous=noncontiguous) + for op_b, op_out in itertools.product([True, False], repeat=2): + self.run_test_block_addmm_addmv(torch.addmm, c, a, b, op_b, op_out, dtype=dtype, device=device) + + @parametrize("block_size", [2, 3]) + @parametrize("index_dtype", [torch.int32, torch.int64]) + @parametrize("noncontiguous", [True, False]) @skipCPUIfNoMklSparse @unittest.skipIf(not TEST_SCIPY, "SciPy not found") @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) - def test_block_addmv(self, device, dtype): - for index_dtype in [torch.int32, torch.int64]: - block_sizes = [1, 2, 3] - if TEST_WITH_ROCM or not TEST_CUSPARSE_GENERIC: - block_sizes = [2, 3] - for (m, k), block_size, noncontiguous in zip(itertools.product([1, 5], 
repeat=2), block_sizes, [True, False]): - nnz = random.randint(0, m * k) + def test_block_addmv(self, device, dtype, index_dtype, block_size, noncontiguous): + # TODO: Explicitly disable block size 1 support + # if (TEST_WITH_ROCM or not TEST_CUSPARSE_GENERIC) and block_size == 1: + # return + for (m, k) in itertools.product([2, 5], repeat=2): + nnz = random.randint(0, m * k) + if not noncontiguous: + a = self.genSparseCSRTensor((m * block_size, k * block_size), nnz, + dtype=dtype, device=device, index_dtype=index_dtype) + a = a.to_sparse_bsr((block_size, block_size)) + else: a = self.genSparseCSRTensor((m, k), nnz, dtype=dtype, device=device, index_dtype=index_dtype) a_data = make_tensor((nnz, block_size, block_size), dtype=dtype, device=device) - a_data = a_data.mT if noncontiguous else a_data # Test column-major blocks - a = torch._sparse_csr_tensor_unsafe(a.crow_indices(), a.col_indices(), a_data, (m * block_size, k * block_size)) - b = make_tensor((k * block_size,), dtype=dtype, device=device, noncontiguous=noncontiguous) - c = make_tensor((m * block_size,), dtype=dtype, device=device, noncontiguous=noncontiguous) - self.run_test_block_addmm_addmv(torch.addmv, c, a, b, dtype=dtype, device=device) - + a_data = a_data.mT if noncontiguous else a_data # Test column-major blocks + a = torch._sparse_bsr_tensor_unsafe(a.crow_indices(), a.col_indices(), + a_data, (m * block_size, k * block_size)) + b = make_tensor((k * block_size,), dtype=dtype, device=device, noncontiguous=noncontiguous) + c = make_tensor((m * block_size,), dtype=dtype, device=device, noncontiguous=noncontiguous) + self.run_test_block_addmm_addmv(torch.addmv, c, a, b, dtype=dtype, device=device) + + @parametrize("block_size", [2, 3]) + @parametrize("index_dtype", [torch.int32, torch.int64]) + @parametrize("noncontiguous", [True, False]) @skipCPUIfNoMklSparse - @skipCUDAIfRocm @unittest.skipIf(not TEST_SCIPY, "SciPy not found") @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) - def test_block_triangular_solve(self, device, dtype): + def test_block_triangular_solve(self, device, dtype, index_dtype, block_size, noncontiguous): def run_test(a, b, upper, transpose, unitriangular, op_out): + if unitriangular and self.device_type == 'cpu': + # TODO: When unitriangular=True results are not correct on CPU + return + + if not upper and self.device_type == 'cpu': + # TODO: When upper=False some generated inputs might crash on CPU + return + actual = torch.triangular_solve(b, a, upper=upper, unitriangular=unitriangular, transpose=transpose) actual_X = actual.solution actual_A_clone = actual.cloned_coefficient @@ -754,6 +1035,14 @@ def run_test(a, b, upper, transpose, unitriangular, op_out): transpose=transpose, upper=upper, unitriangular=unitriangular) + + if expected_X.isnan().any(): + # TODO: zeros on the diagonal are not handled for CPU path + # there's no way to query this info from MKL + if self.device_type == 'cuda' and not TEST_WITH_ROCM: + self.assertTrue(actual_X.isnan().any() or actual_X.isinf().any()) + return + self.assertEqual(actual_X, expected_X) out = torch.empty_like(b.mH if op_out and a.shape == b.shape else b) @@ -764,53 +1053,70 @@ def run_test(a, b, upper, transpose, unitriangular, op_out): self.assertEqual(out, actual_X) self.assertEqual(out, expected_X) - for index_dtype in [torch.int32, torch.int64]: - for (m, k), block_size, noncontiguous in zip(itertools.product([1, 5], repeat=2), [2, 3], [True, False]): - nnz = random.randint(0, m * m) + for (m, k) in itertools.product([2, 3], [1, 3]): 
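The block tests above construct BSR operands either by converting a CSR tensor with to_sparse_bsr or by assembling the 3-D block values directly through torch._sparse_bsr_tensor_unsafe. An illustrative sketch of the blocked layout itself, outside the patch and using a fully dense 4x4 matrix so that every block is stored:

    import torch

    dense = torch.arange(1, 17, dtype=torch.float64).reshape(4, 4)
    bsr = dense.to_sparse_csr().to_sparse_bsr((2, 2))  # 2x2 grid of (2, 2) blocks
    assert bsr.layout == torch.sparse_bsr
    assert bsr.values().shape == (4, 2, 2)             # values are 3-D: (num_blocks, 2, 2)
    assert bsr.crow_indices().tolist() == [0, 2, 4]    # compressed indices address block rows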
+ nnz = random.randint(0, m * m) + if not noncontiguous: + a = self.genSparseCSRTensor((m * block_size, m * block_size), nnz, + dtype=dtype, device=device, index_dtype=index_dtype) + a = a.to_sparse_bsr((block_size, block_size)) + else: a = self.genSparseCSRTensor((m, m), nnz, dtype=dtype, device=device, index_dtype=index_dtype) a_data = make_tensor((nnz, block_size, block_size), dtype=dtype, device=device) a_data = a_data.mT if noncontiguous else a_data # Test column-major blocks - a = torch._sparse_csr_tensor_unsafe(a.crow_indices(), a.col_indices(), a_data, (m * block_size, m * block_size)) - b = make_tensor((m * block_size, k), dtype=dtype, device=device, noncontiguous=noncontiguous) + a = torch._sparse_bsr_tensor_unsafe(a.crow_indices(), a.col_indices(), + a_data, (m * block_size, m * block_size)) + b = make_tensor((m * block_size, k), dtype=dtype, device=device, noncontiguous=noncontiguous) - for (upper, unitriangular, transpose, op_out) in itertools.product([True, False], repeat=4): - run_test(a, b, upper, unitriangular, transpose, op_out) + for (upper, unitriangular, transpose, op_out) in itertools.product([True, False], repeat=4): + run_test(a, b, upper, unitriangular, transpose, op_out) @skipCPUIfNoMklSparse @dtypes(torch.double) def test_mm(self, device, dtype): - def test_shape(di, dj, dk, nnz): + def test_shape(di, dj, dk, nnz0=None, nnz1=None): for index_dtype in [torch.int32, torch.int64]: - x = self.genSparseCSRTensor((di, dj), nnz, device=device, dtype=dtype, index_dtype=index_dtype) - t = torch.randn(di, dk, dtype=dtype, device=device) - y = torch.randn(dj, dk, dtype=dtype, device=device) alpha = random.random() beta = random.random() - # res = beta * t + alpha * (x @ y) - res = torch.addmm(t, x, y, beta=beta, alpha=alpha) - expected = torch.addmm(t, x.to_dense(), y, beta=beta, alpha=alpha) - self.assertEqual(res, expected) - - res = torch.addmm(t, x, y) - expected = torch.addmm(t, x.to_dense(), y) - self.assertEqual(res, expected) - - res = torch.mm(x, y) - expected = torch.mm(x.to_dense(), y) - self.assertEqual(res, expected) + def _test(t, x, y): + # res = beta * t + alpha * (x @ y) + res = torch.addmm(t, x, y, beta=beta, alpha=alpha) + expected = torch.addmm(t, x.to_dense(), y.to_dense(), beta=beta, alpha=alpha) + self.assertEqual(res, expected) + + res = torch.addmm(t, x, y) + expected = torch.addmm(t, x.to_dense(), y.to_dense()) + self.assertEqual(res, expected) + + res = torch.mm(x, y) + expected = torch.mm(x.to_dense(), y.to_dense()) + self.assertEqual(res, expected) + + if nnz0 is None: + nnz0 = random.randint(di * dk // 2, di * dk) + t = torch.randn(di, dj, dtype=dtype, device=device) + x = self.genSparseCSRTensor((di, dk), nnz0, device=device, dtype=dtype, index_dtype=index_dtype) + y = torch.randn(dk, dj, dtype=dtype, device=device) + _test(t, x, y) + + if nnz1 is None: + nnz1 = random.randint(dk * dj // 2, dk * dj) + t = torch.randn(di, dj, dtype=dtype, device=device) + x = torch.randn(di, dk, dtype=dtype, device=device) + y = self.genSparseCSRTensor((dk, dj), nnz1, device=device, dtype=dtype, index_dtype=index_dtype) + _test(t, x, y) for i in range(2, 5): for j in range(2, 8): for k in range(2, 8): - test_shape(i, j, k, i * j // 2) - test_shape(4, 4, 4, 0) + test_shape(i, j, k) + test_shape(4, 4, 4, 0, 0) @skipCPUIfNoMklSparse @dtypes(*floating_and_complex_types()) - @dtypesIfCUDA(*get_all_complex_dtypes(), - *get_all_fp_dtypes(include_half=SM53OrLater and TEST_CUSPARSE_GENERIC, - include_bfloat16=SM80OrLater and TEST_CUSPARSE_GENERIC)) + 
@dtypesIfCUDA(*floating_and_complex_types_and( + *[torch.half] if SM53OrLater and TEST_CUSPARSE_GENERIC else [], + *[torch.bfloat16] if SM80OrLater and TEST_CUSPARSE_GENERIC else [])) @precisionOverride({torch.bfloat16: 1e-2, torch.float16: 1e-2}) def test_sparse_mm(self, device, dtype): def test_shape(d1, d2, d3, nnz, transposed, index_dtype): @@ -827,9 +1133,9 @@ def test_shape(d1, d2, d3, nnz, transposed, index_dtype): test_shape(7, 8, 9, 20, True, index_dtype) @dtypes(*floating_and_complex_types()) - @dtypesIfCUDA(*get_all_complex_dtypes(), - *get_all_fp_dtypes(include_half=SM53OrLater and TEST_CUSPARSE_GENERIC, - include_bfloat16=SM80OrLater and TEST_CUSPARSE_GENERIC)) + @dtypesIfCUDA(*floating_and_complex_types_and( + *[torch.half] if SM53OrLater and TEST_CUSPARSE_GENERIC else [], + *[torch.bfloat16] if SM80OrLater and TEST_CUSPARSE_GENERIC else [])) @precisionOverride({torch.bfloat16: 1e-2, torch.float16: 1e-2}) def test_sparse_addmm(self, device, dtype): def test_shape(m, n, p, nnz, broadcast, index_dtype, alpha_beta=None): @@ -861,10 +1167,10 @@ def test_shape(m, n, p, nnz, broadcast, index_dtype, alpha_beta=None): @dtypes(*floating_and_complex_types()) @precisionOverride({torch.double: 1e-8, torch.float: 1e-4, torch.bfloat16: 0.6, torch.half: 1e-1, torch.cfloat: 1e-4, torch.cdouble: 1e-8}) - @dtypesIfCUDA(torch.complex64, - *((torch.complex128,) if CUSPARSE_SPMM_COMPLEX128_SUPPORTED else ()), - *torch.testing.get_all_fp_dtypes(include_bfloat16=SM80OrLater, - include_half=SM53OrLater)) + @dtypesIfCUDA(*floating_types_and(torch.complex64, + *[torch.bfloat16] if SM80OrLater else [], + *[torch.half] if SM53OrLater else [], + *[torch.complex128] if CUSPARSE_SPMM_COMPLEX128_SUPPORTED else [])) @skipCUDAIf( not _check_cusparse_spgemm_available(), "cuSparse Generic API SpGEMM is not available" @@ -873,19 +1179,52 @@ def test_addmm_all_sparse_csr(self, device, dtype): M = torch.randn(10, 25, device=device).to(dtype) m1 = torch.randn(10, 50, device=device).to(dtype) m2 = torch.randn(50, 25, device=device).to(dtype) - _test_addmm_addmv(self, torch.addmm, M, m1, m2, layout=torch.sparse_csr, all_sparse=True) + _test_addmm_addmv(self, torch.addmm, M, m1, m2, layout=torch.sparse_csr, mode="all_sparse") + + # Test 0-strided + M = torch.randn(10, 1, device=device).to(dtype).expand(10, 25) + m1 = torch.randn(10, 1, device=device).to(dtype).expand(10, 50) + m2 = torch.randn(50, 25, device=device).to(dtype) + _test_addmm_addmv(self, torch.addmm, M, m1, m2, layout=torch.sparse_csr, mode="all_sparse") + + # Test beta=0, M=nan + M = torch.full((10, 25), float('nan'), device=device).to(dtype) + m1 = torch.randn(10, 50, device=device).to(dtype) + m2 = torch.randn(50, 25, device=device).to(dtype) + _test_addmm_addmv(self, torch.addmm, M, m1, m2, beta=0, layout=torch.sparse_csr, mode="all_sparse") + + # Test transpose + for t1, t2, t3, t4 in itertools.product([True, False], repeat=4): + def maybe_transpose(cond, m): + if not cond: + return m + return m.t().clone(memory_format=torch.contiguous_format).t() + + M = maybe_transpose(t1, torch.randn(10, 25, device=device).to(dtype)) + m1 = maybe_transpose(t2, torch.randn(10, 50, device=device).to(dtype)) + m2 = maybe_transpose(t3, torch.randn(50, 25, device=device).to(dtype)) + _test_addmm_addmv(self, torch.addmm, M, m1, m2, transpose_out=t4, layout=torch.sparse_csr, mode="all_sparse") + + @onlyCPU + @skipCPUIfNoMklSparse + @dtypes(*floating_and_complex_types()) + def test_addmm_dense_result(self, device, dtype): + M = torch.randn(10, 25, 
device=device).to(dtype) + m1 = torch.randn(10, 50, device=device).to(dtype) + m2 = torch.randn(50, 25, device=device).to(dtype) + _test_addmm_addmv(self, torch.addmm, M, m1, m2, layout=torch.sparse_csr, mode="dense_result") # Test 0-strided M = torch.randn(10, 1, device=device).to(dtype).expand(10, 25) m1 = torch.randn(10, 1, device=device).to(dtype).expand(10, 50) m2 = torch.randn(50, 25, device=device).to(dtype) - _test_addmm_addmv(self, torch.addmm, M, m1, m2, layout=torch.sparse_csr, all_sparse=True) + _test_addmm_addmv(self, torch.addmm, M, m1, m2, layout=torch.sparse_csr, mode="dense_result") # Test beta=0, M=nan M = torch.full((10, 25), float('nan'), device=device).to(dtype) m1 = torch.randn(10, 50, device=device).to(dtype) m2 = torch.randn(50, 25, device=device).to(dtype) - _test_addmm_addmv(self, torch.addmm, M, m1, m2, beta=0, layout=torch.sparse_csr, all_sparse=True) + _test_addmm_addmv(self, torch.addmm, M, m1, m2, beta=0, layout=torch.sparse_csr, mode="dense_result") # Test transpose for t1, t2, t3, t4 in itertools.product([True, False], repeat=4): @@ -897,34 +1236,34 @@ def maybe_transpose(cond, m): M = maybe_transpose(t1, torch.randn(10, 25, device=device).to(dtype)) m1 = maybe_transpose(t2, torch.randn(10, 50, device=device).to(dtype)) m2 = maybe_transpose(t3, torch.randn(50, 25, device=device).to(dtype)) - _test_addmm_addmv(self, torch.addmm, M, m1, m2, transpose_out=t4, layout=torch.sparse_csr, all_sparse=True) + _test_addmm_addmv(self, torch.addmm, M, m1, m2, transpose_out=t4, layout=torch.sparse_csr, mode="dense_result") + @parametrize("k", [0, 1, 8]) + @parametrize("n", [0, 1, 10]) + @parametrize("m", [0, 1, 25]) @skipCPUIfNoMklSparse @dtypes(*floating_and_complex_types()) - @dtypesIfCUDA(torch.complex64, - *((torch.complex128,) if CUSPARSE_SPMM_COMPLEX128_SUPPORTED else ()), - *torch.testing.get_all_fp_dtypes(include_bfloat16=SM80OrLater, - include_half=SM53OrLater)) + @dtypesIfCUDA(*floating_types_and(torch.complex64, + *[torch.bfloat16] if SM80OrLater else [], + *[torch.half] if SM53OrLater else [], + *[torch.complex128] if CUSPARSE_SPMM_COMPLEX128_SUPPORTED else [])) @skipCUDAIf( not _check_cusparse_spgemm_available(), "cuSparse Generic API SpGEMM is not available" ) @precisionOverride({torch.double: 1e-8, torch.float: 1e-4, torch.bfloat16: 0.6, torch.half: 1e-1, torch.cfloat: 1e-4, torch.cdouble: 1e-8}) - def test_addmm_sizes_all_sparse_csr(self, device, dtype): - for m in [0, 1, 25]: - for n in [0, 1, 10]: - for k in [0, 1, 8]: - M = torch.randn(n, m, device=device).to(dtype) - m1 = torch.randn(n, k, device=device).to(dtype) - m2 = torch.randn(k, m, device=device).to(dtype) - _test_addmm_addmv(self, torch.addmm, M, m1, m2, layout=torch.sparse_csr, all_sparse=True) - - M = torch.randn(n, m, device=device).to(dtype).to_sparse_csr() - m1 = torch.randn(n, k + 1, device=device).to(dtype).to_sparse_csr() - m2 = torch.randn(k, m, device=device).to(dtype).to_sparse_csr() - self.assertRaisesRegex(RuntimeError, f"{n}x{k + 1}.*{k}x{m}", lambda: torch.addmm(M, m1, m2)) - self.assertRaisesRegex(RuntimeError, f"{n}x{k + 1}.*{k}x{m}", lambda: torch.mm(m1, m2)) + def test_addmm_sizes_all_sparse_csr(self, device, dtype, m, n, k): + M = torch.randn(n, m, device=device).to(dtype) + m1 = torch.randn(n, k, device=device).to(dtype) + m2 = torch.randn(k, m, device=device).to(dtype) + _test_addmm_addmv(self, torch.addmm, M, m1, m2, layout=torch.sparse_csr, mode="all_sparse") + + M = torch.randn(n, m, device=device).to(dtype).to_sparse_csr() + m1 = torch.randn(n, k + 1, 
device=device).to(dtype).to_sparse_csr() + m2 = torch.randn(k, m, device=device).to(dtype).to_sparse_csr() + self.assertRaisesRegex(RuntimeError, f"{n}x{k + 1}.*{k}x{m}", lambda: torch.addmm(M, m1, m2)) + self.assertRaisesRegex(RuntimeError, f"{n}x{k + 1}.*{k}x{m}", lambda: torch.mm(m1, m2)) @skipCPUIfNoMklSparse @dtypes(torch.float) @@ -1000,6 +1339,9 @@ def test2(*, is_sparse): @dtypes(torch.float, torch.double) def test_add(self, device, dtype): def _test_spadd_shape(nnz, shape): + # sparse.to_dense() uses torch.add internally so if torch.add is wrong, + # the dense tensor will be wrong but this test would still pass + # there's a separate test that checks for the correctness of the .to_dense() call x = self.genSparseCSRTensor(shape, nnz, dtype=dtype, device=device, index_dtype=torch.int32) y = torch.randn(*shape, dtype=dtype, device=device) r = random.random() @@ -1021,10 +1363,74 @@ def _test_spadd_shape(nnz, shape): self.assertEqual(res, expected) - _test_spadd_shape(10, [100, 100]) - _test_spadd_shape(0, [100, 100]) - _test_spadd_shape(10, [100, 1]) - _test_spadd_shape(10, [1, 100]) + ns = [2, 5] + batch_shapes = [(), (2,), (2, 3)] + for b, m, n in itertools.product(batch_shapes, ns, ns): + _test_spadd_shape(0, (*b, m, n)) + _test_spadd_shape(m * n // 2, (*b, m, n)) + _test_spadd_shape(m * n, (*b, m, n)) + + @dtypes(torch.float, torch.double) + def test_mul(self, device, dtype): + # TODO: This whole test should be migrated to OpInfos + def _test_spadd_shape(fn, nnz, shape): + x = self.genSparseCSRTensor(shape, nnz, dtype=dtype, device=device, index_dtype=torch.int32) + y = self.genSparseCSRTensor(shape, nnz, dtype=dtype, device=device, index_dtype=torch.int32) + + # Forward comparison + res_sparse_sparse = fn(y, x) + res_dense_sparse = fn(y.to_dense(), x) + res_sparse_dense = fn(y, x.to_dense()) + expected = fn(y.to_dense(), x.to_dense()).to_sparse_csr() + self.assertEqual(res_sparse_sparse, expected) + # TODO: While result of mul(dense, csr) is csr, it is not fully compressed. + # That means it may contain materialized zeros, since the dense argument + # is converted according to the sparsity pattern of csr. In the future + # we might require the result to be fully compressed. 
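To make the note about materialized zeros concrete, here is an illustrative sketch outside the patch of a CSR tensor that stores an explicit zero and is therefore valid but not fully compressed:

    import torch

    a = torch.sparse_csr_tensor(torch.tensor([0, 1, 2]),
                                torch.tensor([0, 1]),
                                torch.tensor([5., 0.]),  # the 0.0 is kept as a stored value
                                size=(2, 2))
    b = a.to_dense().to_sparse_csr()  # the dense round trip drops the stored zero
    assert a._nnz() == 2 and b._nnz() == 1
    assert torch.equal(a.to_dense(), b.to_dense())  # same matrix, different compression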
+ self.assertEqual(res_dense_sparse.to_dense(), expected.to_dense()) + self.assertEqual(res_sparse_dense.to_dense(), expected.to_dense()) + + # Grad comparison + x = self.genSparseCSRTensor(shape, nnz, dtype=dtype, device=device, index_dtype=torch.int32) + y = self.genSparseCSRTensor(shape, nnz, dtype=dtype, device=device, index_dtype=torch.int32) + z = self.genSparseCSRTensor(shape, nnz, dtype=dtype, device=device, index_dtype=torch.int32) + + # csr * csr -> csr with csr, csr gradients + x_a = x.clone().requires_grad_() + y_a = y.clone().requires_grad_() + + fn(y_a, x_a).backward(z) + + x_dense_a = x.to_dense().requires_grad_() + y_dense_a = y.to_dense().requires_grad_() + + fn(y_dense_a, x_dense_a).backward(z.to_dense()) + + self.assertEqual(x_a.grad.layout, torch.sparse_csr) + self.assertEqual(y_a.grad.layout, torch.sparse_csr) + + self.assertEqual(x_a.grad.to_dense(), x_dense_a.grad) + self.assertEqual(y_a.grad.to_dense(), y_dense_a.grad) + + # TODO: Currently strided Tensors cannot have csr gradients + # dense * csr -> csr with csr, dense gradients + x_a = x.clone().requires_grad_() + y_a = y.to_dense().clone().requires_grad_() + err_msg = "Function MulBackward0 returned an invalid gradient at index 0 - expected layout Strided but got SparseCsr" + with self.assertRaisesRegex(RuntimeError, err_msg): + fn(y_a, x_a).backward(z) + + # csr * dense -> csr with dense, csr gradients + x_a = x.to_dense().clone().requires_grad_() + y_a = y.clone().requires_grad_() + err_msg = "Function MulBackward0 returned an invalid gradient at index 1 - expected layout Strided but got SparseCsr" + with self.assertRaisesRegex(RuntimeError, err_msg): + fn(y_a, x_a).backward(z) + + _test_spadd_shape(torch.mul, 100, [100, 100]) + _test_spadd_shape(torch.mul, 0, [100, 100]) + _test_spadd_shape(torch.mul, 100, [100, 1]) + _test_spadd_shape(torch.mul, 100, [1, 100]) @skipCPUIfNoMklSparse @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) @@ -1135,7 +1541,6 @@ def run_test(n, k, upper, unitriangular, transpose, zero): run_test(n, k, upper, unitriangular, transpose, zero) @skipCUDAIfRocm - @onlyCUDA @skipCUDAIf( not _check_cusparse_sddmm_available(), "cuSparse Generic API SDDMM is not available" @@ -1162,7 +1567,7 @@ def run_test(c, a, b, op_a, op_b, *, alpha=None, beta=None): out = torch.sparse_csr_tensor( *map(torch.clone, (actual.crow_indices(), actual.col_indices())), torch.empty_like(actual.values()), - size=c.shape + size=actual.shape ) torch.sparse.sampled_addmm(c, a, b, alpha=alpha, beta=beta, out=out) @@ -1171,15 +1576,52 @@ def run_test(c, a, b, op_a, op_b, *, alpha=None, beta=None): self.assertEqual(actual.to_dense(), out.to_dense()) self.assertEqual(actual.to_dense(), expected) + mnk = itertools.product([2, 5], repeat=3) + batch_shapes = [(), (2,), (2, 3)] if self.device_type == 'cuda' else [(), ] + tf = [True, False] for index_dtype in [torch.int32, torch.int64]: - for (m, n, k), noncontiguous in zip(itertools.product([1, 5], repeat=3), [True, False]): + for (m, n, k), b, noncontiguous, bcast_c in itertools.product(mnk, batch_shapes, tf, tf): + if bcast_c and len(b) == 0: + continue nnz = random.randint(0, m * n) - c = self.genSparseCSRTensor((m, n), nnz, dtype=dtype, device=device, index_dtype=index_dtype) - a = make_tensor((m, k), dtype=dtype, device=device, noncontiguous=noncontiguous) - b = make_tensor((k, n), dtype=dtype, device=device, noncontiguous=noncontiguous) + c_batch = () if bcast_c else b + c = self.genSparseCSRTensor((*c_batch, m, n), nnz, dtype=dtype, device=device, 
index_dtype=index_dtype) + a = make_tensor((*b, m, k), dtype=dtype, device=device, noncontiguous=noncontiguous) + b = make_tensor((*b, k, n), dtype=dtype, device=device, noncontiguous=noncontiguous) for op_a, op_b in itertools.product([True, False], repeat=2): run_test(c, a, b, op_a, op_b) + @skipCUDAIfRocm + @skipCUDAIf( + not _check_cusparse_sddmm_available(), + "cuSparse Generic API SDDMM is not available" + ) + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) + def test_sampled_addmm_autograd(self, device, dtype): + from torch.testing._internal.common_methods_invocations import sample_inputs_sparse_sampled_addmm + + samples = list(sample_inputs_sparse_sampled_addmm(None, device, dtype, requires_grad=True)) + + for sample, dense_covector in zip(samples, [True, False]): + c = sample.input + a = sample.args[0] + b = sample.args[1] + + # Compute sparse result + output = torch.sparse.sampled_addmm(c, a, b, **sample.kwargs) + covector = torch.randn_like(output).to_dense() if dense_covector else torch.randn_like(output) + output.backward(covector) + + # Compute dense result and compare with sparse result + c1, a1, b1 = map(lambda x: x.detach().to_dense().requires_grad_(True), [c, a, b]) + dense_output = sample.kwargs['alpha'] * (a1 @ b1) * torch.ones_like(c).to_dense() + sample.kwargs['beta'] * c1 + self.assertEqual(output, dense_output) + dense_covector = covector.to_dense() + dense_output.backward(dense_covector) + self.assertEqual(c.grad, c1.grad) + self.assertEqual(a.grad, a1.grad) + self.assertEqual(b.grad, b1.grad) + @skipCUDAIfRocm @onlyCUDA @skipCUDAIf(True, "Causes CUDA memory exception, see https://github.com/pytorch/pytorch/issues/72177") @@ -1220,21 +1662,21 @@ def test_sampled_addmm_errors(self, device, dtype): # mat1 must be a matrix with self.assertRaisesRegex(RuntimeError, r"Expected mat1 to be a matrix"): - torch.sparse.sampled_addmm(a_sparse, a.unsqueeze(0), a) + torch.sparse.sampled_addmm(a_sparse, a[..., 0, :], a) # mat2 must be a matrix with self.assertRaisesRegex(RuntimeError, r"Expected mat2 to be a matrix"): - torch.sparse.sampled_addmm(a_sparse, a, a.unsqueeze(0)) + torch.sparse.sampled_addmm(a_sparse, a, a[..., 0, :]) a = make_tensor((2, 2), dtype=dtype, device=device) b = make_tensor((3, 3), dtype=dtype, device=device) b_sparse = b.to_sparse_csr() - with self.assertRaisesRegex(RuntimeError, r"self dim 0 must match mat1 dim 0"): + with self.assertRaisesRegex(RuntimeError, r"self.shape\[-2\] must match mat1.shape\[-2\]"): torch.sparse.sampled_addmm(b_sparse, a, a) b = make_tensor((2, 3), dtype=dtype, device=device) b_sparse = b.to_sparse_csr() - with self.assertRaisesRegex(RuntimeError, r"self dim 1 must match mat2 dim 1"): + with self.assertRaisesRegex(RuntimeError, r"self.shape\[-1\] must match mat2.shape\[-1\]"): torch.sparse.sampled_addmm(b_sparse, a, a) a = make_tensor((2, 2), dtype=dtype, device=device) @@ -1245,7 +1687,8 @@ def test_sampled_addmm_errors(self, device, dtype): with self.assertRaisesRegex(RuntimeError, r"Expected mat2 to have strided layout"): torch.sparse.sampled_addmm(a_sparse, a, a_sparse) - @dtypes(*get_all_dtypes()) + @skipMeta + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_coo_csr_conversion(self, device, dtype): for m, n in itertools.product([5, 2, 0], [5, 2, 0]): size = (m, n) @@ -1255,9 +1698,20 @@ def test_coo_csr_conversion(self, device, dtype): self.assertEqual(csr_sparse.to_dense(), dense) + @skipMeta + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, 
torch.bfloat16)) + def test_csr_coo_conversion(self, device, dtype): + for m, n in itertools.product([5, 2, 0], [5, 2, 0]): + size = (m, n) + dense = make_tensor(size, dtype=dtype, device=device) + csr_sparse = dense.to_sparse_csr() + coo_sparse = csr_sparse.to_sparse() + + self.assertEqual(coo_sparse.to_dense(), dense) + @ops(_sparse_csr_ops) def test_sparse_csr_consistency(self, device, dtype, op): - samples = op.sample_inputs(device, dtype) + samples = list(op.sample_inputs(device, dtype)) # Fail early to prevent silent success with this test ndims_equals_2d = (s.input.ndim == 2 for s in samples) @@ -1269,7 +1723,9 @@ def test_sparse_csr_consistency(self, device, dtype, op): # Sparse CSR only supports 2D tensors as inputs if sample.input.ndim != 2: continue - + # Reductions on sparse CSR require keepdim=True + if isinstance(op, ReductionOpInfo): + continue expected = op(sample.input) assert torch.is_tensor(expected) output = op(sample.input.to_sparse_csr()) @@ -1326,10 +1782,7 @@ def test_sparse_csr_unary_out(self, device, dtype, op): index_dtype=sample.input.crow_indices().dtype) op(sample.input, *sample.args, **sample.kwargs, out=out) - self.assertEqual(out.values(), expect.values()) - self.assertEqual(out.crow_indices(), expect.crow_indices()) - self.assertEqual(out.col_indices(), expect.col_indices()) - self.assertEqual(out._nnz(), expect._nnz()) + self.assertEqual(out, expect) @ops(sparse_csr_unary_ufuncs) def test_sparse_csr_unary_inplace(self, device, dtype, op): @@ -1361,12 +1814,178 @@ def test_sparse_csr_unary_inplace(self, device, dtype, op): actual = op.inplace_variant(sample.input, *sample.args, **sample.kwargs) self.assertIs(actual, sample.input) - self.assertEqual(actual.values(), expect.values()) - self.assertEqual(actual.crow_indices(), expect.crow_indices()) - self.assertEqual(actual.col_indices(), expect.col_indices()) - self.assertEqual(actual._nnz(), expect._nnz()) + self.assertEqual(actual, expect) + + @ops(sparse_csr_unary_ufuncs, dtypes=OpDTypes.supported, allowed_dtypes=[torch.double, torch.cdouble]) + def test_autograd_sparse_csr_unary(self, device, dtype, op): + if op.name not in UNARY_EWISE_CSR_ALLOW_AUTOGRAD: + self.skipTest(f"Skipped! 
Unary op {op.name} not supported with CSR input and autograd") + + samples = list(op.sample_inputs(device, dtype)) - @dtypes(*get_all_dtypes(include_bool=False, include_half=False, include_bfloat16=False)) + # Fail early to prevent silent success with this test + ndims_equals_2d = (s.input.ndim == 2 for s in samples) + if not any(ndims_equals_2d): + raise ValueError("Expected at least one 2D tensor in samples.") + + for sample in samples: + sparse_input = sample.input.to_sparse_csr().requires_grad_(True) + + def fn(input): + output = op.gradcheck_wrapper(op.get_op(), input, *sample.args, **sample.kwargs) + if sample.output_process_fn_grad is not None: + return sample.output_process_fn_grad(output) + return output + + # Compute sparse result + output = fn(sparse_input) + covector = torch.randn_like(output) + output.backward(covector) + self.assertTrue(torch.is_tensor(sparse_input.grad)) + self.assertTrue(sparse_input.grad.is_sparse_csr) + + # Compute dense result and compare with sparse result + dense_input = sparse_input.detach().to_dense().requires_grad_(True) + dense_output = fn(dense_input) + dense_covector = covector.to_dense() + dense_output.backward(dense_covector) + self.assertEqual(sparse_input.grad, dense_input.grad) + + @skipCUDAIfRocm + @skipCUDAIf( + not _check_cusparse_sddmm_available(), + "cuSparse Generic API SDDMM is not available" + ) + @dtypes(torch.float64) + def test_autograd_dense_output_addmm(self, device, dtype): + from torch.testing._internal.common_methods_invocations import sample_inputs_addmm + + samples = list(sample_inputs_addmm(None, device, dtype, requires_grad=True)) + + # Fail early to prevent silent success with this test + ndims_equals_2d = (s.args[0].ndim == 2 for s in samples) + if not any(ndims_equals_2d): + raise ValueError("Expected at least one 2D tensor in samples to convert to sparse.") + + for sample in samples: + a = sample.args[0].relu().to_sparse_csr() + + # This path tests the autograd path wrt dense inputs + for addmm in [torch.addmm, torch.sparse.addmm]: + + def fn(c, b): + output = addmm(c, a, b, **sample.kwargs) + if sample.output_process_fn_grad is not None: + return sample.output_process_fn_grad(output) + return output + + self.assertTrue(torch.autograd.gradcheck(fn, [sample.input, sample.args[1]], fast_mode=True)) + + # noncontiguous + c = make_tensor(sample.input.shape, device=device, dtype=dtype, noncontiguous=True, requires_grad=True) + b = make_tensor(sample.args[1].shape, device=device, dtype=dtype, noncontiguous=True, requires_grad=True) + self.assertTrue(torch.autograd.gradcheck(fn, [c, b], fast_mode=True)) + + # Now test the autograd path wrt sparse inputs + for reverse in [True, False]: + c, b = sample.input, sample.args[1] + if reverse and a.shape != b.shape: + continue + + def fn(a): + inputs = (c, b, a) if reverse else (c, a, b) + output = addmm(*inputs, **sample.kwargs) + if sample.output_process_fn_grad is not None: + return sample.output_process_fn_grad(output) + return output + + # gradcheck doesn't work for sparse CSR yet, compare against dense path + # Compute sparse result + a = a.detach().requires_grad_(True) + output = fn(a) + covector = torch.randn_like(output) + output.backward(covector) + self.assertTrue(torch.is_tensor(a.grad)) + if addmm == torch.sparse.addmm: + self.assertTrue(a.grad.is_sparse_csr) + else: + self.assertTrue(a.grad.layout == torch.strided) + + # Compute dense result and compare with sparse result + dense_a = a.detach().to_dense().requires_grad_(True) + dense_output = fn(dense_a) + 
self.assertEqual(output, dense_output) + dense_covector = covector.to_dense() + dense_output.backward(dense_covector) + + if addmm == torch.sparse.addmm: + self.assertEqual(a.grad, dense_a.grad.sparse_mask(a)) + else: + self.assertEqual(a.grad, dense_a.grad) + + @skipCUDAIfRocm + @skipCPUIfNoMklSparse + @dtypes(torch.float64) + def test_autograd_dense_output_addmv(self, device, dtype): + from torch.testing._internal.common_methods_invocations import sample_inputs_addmv + + samples = list(sample_inputs_addmv(None, device, dtype, requires_grad=True)) + + # Fail early to prevent silent success with this test + ndims_equals_2d = (s.args[0].ndim == 2 for s in samples) + if not any(ndims_equals_2d): + raise ValueError("Expected at least one 2D tensor in samples to convert to sparse.") + + for sample in samples: + # TODO: Remove detach once we have autograd support for CSR input + a = sample.args[0].to_sparse_csr().detach() + + def fn(c, b): + output = torch.addmv(c, a, b, **sample.kwargs) + if sample.output_process_fn_grad is not None: + return sample.output_process_fn_grad(output) + return output + + self.assertTrue(torch.autograd.gradcheck(fn, [sample.input, sample.args[1]], fast_mode=True)) + + # noncontiguous + c = make_tensor(sample.input.shape, device=device, dtype=dtype, noncontiguous=True, requires_grad=True) + b = make_tensor(sample.args[1].shape, device=device, dtype=dtype, noncontiguous=True, requires_grad=True) + self.assertTrue(torch.autograd.gradcheck(fn, [c, b], fast_mode=True)) + + @ops(binary_ops_with_dense_output, dtypes=OpDTypes.supported, allowed_dtypes=[torch.double, ]) + def test_autograd_dense_output(self, device, dtype, op): + if op.name == "mv" and no_mkl_sparse and self.device_type == 'cpu': + self.skipTest("MKL Sparse is not available") + if op.name == "mv" and TEST_WITH_ROCM and self.device_type == 'cuda': + # mv currently work only on CUDA + self.skipTest("ROCm is not supported") + + samples = list(op.sample_inputs(device, dtype, requires_grad=True)) + + # Fail early to prevent silent success with this test + ndims_equals_2d = (s.input.ndim == 2 for s in samples) + if not any(ndims_equals_2d): + raise ValueError("Expected at least one 2D tensor in samples.") + + # Here we assume that the signature is op(sparse_input, dense_input) -> dense_output + for sample in samples: + # TODO: Remove detach once we have autograd support for CSR input + sparse_input = sample.input.to_sparse_csr().detach() + + def fn(*args): + output = op.gradcheck_wrapper(op.get_op(), sparse_input, *args, **sample.kwargs) + if sample.output_process_fn_grad is not None: + return sample.output_process_fn_grad(output) + return output + + self.assertTrue(torch.autograd.gradcheck(fn, sample.args, fast_mode=True)) + + # noncontiguous + args = [make_tensor(a.shape, device=device, dtype=dtype, noncontiguous=True, requires_grad=True) for a in sample.args] + self.assertTrue(torch.autograd.gradcheck(fn, args, fast_mode=True)) + + @dtypes(*all_types_and_complex()) def test_direct_coo_csr_conversion(self, device, dtype): for m, n in itertools.product([5, 2, 0], [5, 2, 0]): size = (m, n) @@ -1376,7 +1995,25 @@ def test_direct_coo_csr_conversion(self, device, dtype): self.assertEqual(coo_sparse.to_sparse_csr().to_sparse_coo(), coo_sparse) @skipMeta - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) + def test_sum(self, device, dtype): + def run_test(shape, nnz, index_type): + a = self.genSparseCSRTensor(shape, nnz, dtype=dtype, device=device, 
index_dtype=index_dtype) + self.assertEqual(a.sum(), a.values().sum()) + if dtype in floating_types(): + a.requires_grad_(True) + a.sum().backward() + self.assertEqual(a.grad, torch.ones(shape, dtype=dtype, device=device)) + for shape, index_dtype in itertools.product( + [(10, 5), (10, 10)], + [torch.int32, torch.int64]): + run_test(shape, 0, index_dtype) + run_test(shape, max(shape), index_dtype) + run_test(shape, shape[0] * shape[1], index_dtype) + + + @skipMeta + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_transpose(self, device, dtype): def run_test(shape, nnz, index_type, dim0, dim1): @@ -1397,21 +2034,173 @@ def run_test(shape, nnz, index_type, dim0, dim1): # TODO: This is a stopgap for a rigorous extension of our autograd tests # to test the functionality of detach @skipMeta - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_exercise_detach(self, device, dtype): shape = (3, 3) nnz = 4 for index_dtype in [torch.int32, torch.int64]: inp = self.genSparseCSRTensor(shape, nnz, dtype=dtype, device=device, index_dtype=index_dtype) detached_inp = inp.detach() - self.assertEqual(inp.values(), detached_inp.values()) - self.assertEqual(inp.crow_indices(), detached_inp.crow_indices()) - self.assertEqual(inp.col_indices(), detached_inp.col_indices()) + self.assertEqual(inp, detached_inp) + + def _convert_to_layout(self, a, target_layout): + """ + Helper function to call the correct layout conversion + with reasonable defaults for the block size. Clearly there + is a need for a to.layout overload. + """ + if target_layout is torch.sparse_csr: + return a.to_sparse_csr() + if target_layout is torch.sparse_csc: + return a.to_sparse_csc() + if target_layout is torch.sparse_bsr: + return a.to_sparse_bsr((2, 2)) + if target_layout is torch.sparse_bsc: + return a.to_sparse_bsc((2, 2)) + raise NotImplementedError(repr(a)) + + def _construct_sp_matrix(self, tensor, layout): + if tensor.layout in [torch.sparse_coo, torch.sparse_csr, torch.sparse_csc, torch.strided]: + tensor = tensor.to_dense() + else: + raise NotImplementedError(repr(tensor)) + if layout is torch.sparse_csr: + return sp.csr_matrix(tensor.cpu().numpy()) + if layout is torch.sparse_csc: + return sp.csc_matrix(tensor.cpu().numpy()) + if layout is torch.sparse_bsr: + return sp.bsr_matrix(tensor.cpu().numpy()) + # No native scipy BSC support? + raise NotImplementedError(repr(tensor)) + @skipMeta + @all_sparse_compressed_layouts('to_layout') + @all_sparse_compressed_layouts('from_layout') + def test_compressed_layout_conversions_coverage(self, device, from_layout, to_layout): + """ + This test performs a smoke test for covered conversion and verifies + that an exception is thrown for unsupported conversions. 
+ """ + + def _to_from_layout(layout_a, layout_b): + a = make_tensor((6, 10), dtype=torch.float, device=device) + expect_error = (layout_a in [torch.sparse_csc, torch.sparse_bsc] + or layout_b in [torch.sparse_csc, torch.sparse_bsc]) + expect_error = expect_error or (layout_a, layout_b) == (torch.sparse_bsr, torch.sparse_bsr) + expect_error = expect_error or (layout_a, layout_b) == (torch.sparse_bsr, torch.sparse_csr) + # CSC to CSR conversion is supported + if layout_a is torch.sparse_csc and layout_b is torch.sparse_csr: + expect_error = False + # CSC to CSC conversion is supported + if layout_a is torch.sparse_csc and layout_b is torch.sparse_csc: + expect_error = False + if expect_error: + with self.assertRaises(RuntimeError): + b = self._convert_to_layout(a, layout_a) + self._convert_to_layout(b, layout_b) + else: + b = self._convert_to_layout(a, layout_a) + c = self._convert_to_layout(b, layout_b) + if (layout_a is not torch.sparse_bsr and layout_b is not torch.sparse_bsr): + self.assertEqual(a.to_dense(), c.to_dense()) + + _to_from_layout(from_layout, to_layout) + + @skipMeta + @all_sparse_compressed_layouts() + def test_dense_to_from_sparse_compressed(self, device, layout): + """ + This test tests conversion from dense to/from CSR and CSC + by comparing to SciPy's implementation. + + TODO: Eventually this is meant to be merged into test_compressed_layout_conversions_coverage + """ + if layout is torch.sparse_bsc: + # TODO: Remove this once support has been enabled + return + if layout is torch.sparse_bsr: + # TODO: Remove this once support has been enabled + return + + for shape in [(0, 10), (6, 0), (6, 10), (0, 0)]: + dense = make_tensor(shape, dtype=torch.float, device=device) + dense = dense.relu() # Introduce some sparsity + sp_matrix = self._construct_sp_matrix(dense, layout) + pt_matrix = self._convert_to_layout(dense, layout) + + compressed_indices_mth = { + torch.sparse_csr: torch.Tensor.crow_indices, + torch.sparse_csc: torch.Tensor.ccol_indices, + }[layout] + + plain_indices_mth = { + torch.sparse_csr: torch.Tensor.col_indices, + torch.sparse_csc: torch.Tensor.row_indices, + }[layout] + + self.assertEqual(layout, pt_matrix.layout) + self.assertEqual(sp_matrix.shape, pt_matrix.shape) + self.assertEqual(torch.tensor(sp_matrix.indptr, dtype=torch.int64), compressed_indices_mth(pt_matrix)) + self.assertEqual(torch.tensor(sp_matrix.indices, dtype=torch.int64), plain_indices_mth(pt_matrix)) + self.assertEqual(torch.tensor(sp_matrix.data), pt_matrix.values()) + + self.assertEqual(dense, pt_matrix.to_dense()) + + @skipMeta + @all_sparse_compressed_layouts() + @coalescedonoff + @dtypes(torch.double) + def test_sparse_to_sparse_compressed(self, device, dtype, coalesced, layout): + """ + This test tests conversion from COO to CSR and CSC and CSC to CSR and CSC + by comparing to SciPy's implementation. 
+ + TODO: Eventually this is meant to be merged into test_compressed_layout_conversions_coverage + """ + if layout is torch.sparse_bsc: + # TODO: Remove this once support has been enabled + return + if layout is torch.sparse_bsr: + # TODO: Remove this once support has been enabled + return + + for shape in [(0, 10), (6, 0), (6, 10), (0, 0)]: + sparse_dim = 2 + nnz = shape[0] * shape[1] // 2 + sparse, _, _ = self.genSparseTensor(shape, sparse_dim, nnz, coalesced, device, dtype) + sp_matrix = self._construct_sp_matrix(sparse, layout) + pt_matrix = self._convert_to_layout(sparse, layout) + + compressed_indices_mth = { + torch.sparse_csr: torch.Tensor.crow_indices, + torch.sparse_csc: torch.Tensor.ccol_indices, + }[layout] + + plain_indices_mth = { + torch.sparse_csr: torch.Tensor.col_indices, + torch.sparse_csc: torch.Tensor.row_indices, + }[layout] + + self.assertEqual(layout, pt_matrix.layout) + self.assertEqual(sp_matrix.shape, pt_matrix.shape) + self.assertEqual(torch.tensor(sp_matrix.indptr, dtype=torch.int64), compressed_indices_mth(pt_matrix)) + self.assertEqual(torch.tensor(sp_matrix.indices, dtype=torch.int64), plain_indices_mth(pt_matrix)) + self.assertEqual(torch.tensor(sp_matrix.data), pt_matrix.values()) + + sparse_csc = sparse.to_sparse_csc() + sp_matrix = self._construct_sp_matrix(sparse_csc, layout) + pt_matrix = self._convert_to_layout(sparse_csc, layout) + + self.assertEqual(layout, pt_matrix.layout) + self.assertEqual(sp_matrix.shape, pt_matrix.shape) + self.assertEqual(torch.tensor(sp_matrix.indptr, dtype=torch.int64), compressed_indices_mth(pt_matrix)) + self.assertEqual(torch.tensor(sp_matrix.indices, dtype=torch.int64), plain_indices_mth(pt_matrix)) + self.assertEqual(torch.tensor(sp_matrix.data), pt_matrix.values()) # e.g., TestSparseCSRCPU and TestSparseCSRCUDA instantiate_device_type_tests(TestSparseCSR, globals()) +instantiate_device_type_tests(TestSparseCompressed, globals()) if __name__ == '__main__': run_tests() diff --git a/test/test_spectral_ops.py b/test/test_spectral_ops.py index fecb4735976e..b4f37cc1558e 100644 --- a/test/test_spectral_ops.py +++ b/test/test_spectral_ops.py @@ -10,12 +10,14 @@ import inspect from torch.testing._internal.common_utils import \ - (TestCase, run_tests, TEST_NUMPY, TEST_LIBROSA, TEST_MKL) + (TestCase, run_tests, TEST_NUMPY, TEST_LIBROSA, TEST_MKL, first_sample, TEST_WITH_ROCM, + make_tensor) from torch.testing._internal.common_device_type import \ (instantiate_device_type_tests, ops, dtypes, onlyNativeDeviceTypes, - skipCPUIfNoFFT, deviceCountAtLeast, onlyCUDA, OpDTypes, skipIf) + skipCPUIfNoFFT, deviceCountAtLeast, onlyCUDA, OpDTypes, skipIf, toleranceOverride, tol) from torch.testing._internal.common_methods_invocations import ( - spectral_funcs, SpectralFuncInfo, SpectralFuncType) + spectral_funcs, SpectralFuncType) +from torch.testing._internal.common_cuda import SM53OrLater from setuptools import distutils from typing import Optional, List @@ -110,102 +112,27 @@ def _stft_reference(x, hop_length, window): X[:, m] = torch.fft.fft(slc * window) return X -# Tests of functions related to Fourier analysis in the torch.fft namespace -class TestFFT(TestCase): - exact_dtype = True - # rocFFT requires/assumes that the input to hipfftExecC2R or hipfftExecZ2D - # is of the form that is a valid output from a real to complex transform - # (i.e. 
it cannot be a set of random numbers) - # So for ROCm, call np.fft.rfftn and use its output as the input - # for testing ops that call hipfftExecC2R - def _generate_valid_rocfft_input(self, input, op, s, dim, norm): - def get_op_name(op): - if type(op) == SpectralFuncInfo: - return op.name - else: - return op.__name__ - - op_name = get_op_name(op) - - # pick ops that call hipfftExecC2R or hipfftExecZ2D - if op_name in ("fft.irfft", "fft.hfft"): - n = s - # figure out fft_size - if dim is None and n is None: - dim = tuple(range(-(input.dim()), 0)) - s = [input.size(d) for d in dim] - elif dim is None and n is not None: - dim = -1 - s = [n] - elif dim is not None and n is None: - s = [input.size(d) for d in [dim]] - else: - s = [n] - fft_size = s[-1] +def skip_helper_for_fft(device, dtype): + device_type = torch.device(device).type + if dtype not in (torch.half, torch.complex32): + return - # make fft_size even to match rocfft behavior to cuda and numpy - if (fft_size % 2) != 0: - n = fft_size + 1 + if device_type == 'cpu': + raise unittest.SkipTest("half and complex32 are not supported on CPU") + if TEST_WITH_ROCM: + raise unittest.SkipTest("half and complex32 are not supported on ROCM") + if not SM53OrLater: + raise unittest.SkipTest("half and complex32 are only supported on CUDA device with SM>53") - # generate Hermitian symmetric input - if torch.is_complex(input): - valid_input = torch.fft.rfft(input.real, n=n, dim=dim, norm=norm) - else: - valid_input = torch.fft.rfft(input, n=n, dim=dim, norm=norm) - - return (valid_input, n, dim, norm) - elif op_name in ("fft.irfftn", "fft.hfftn"): - # figure out fft_size - if dim is None and s is None: - dim = tuple(range(-(input.dim()), 0)) - s = [input.size(d) for d in dim] - elif dim is None and s is not None: - dim = tuple(range(-(len(s)), 0)) - elif dim is not None and s is None: - s = [input.size(d) for d in dim] - - fft_size = s[-1] - - # make fft_size even to match rocfft behavior to cuda and numpy - if (fft_size % 2) != 0: - if type(s) is tuple: - s = list(s) - s[-1] = fft_size + 1 - - # generate Hermitian symmetric input - if torch.is_complex(input): - valid_input = torch.fft.rfftn(input.real, s=s, dim=dim, norm=norm) - else: - valid_input = torch.fft.rfftn(input, s=s, dim=dim, norm=norm) - return (valid_input, s, dim, norm) - elif op_name in ("fft_irfft2", "fft_hfft2"): - # figure out fft_size - if dim is None and s is None: - dim = tuple(range(-(2), 0)) - s = [input.size(d) for d in dim] - elif dim is None and s is not None: - dim = tuple(range(-(len(s)), 0)) - elif dim is not None and s is None: - s = [input.size(d) for d in dim] - fft_size = s[-1] - - # make fft_size even to match rocfft behavior to cuda and numpy - if (fft_size % 2) != 0: - if type(s) is tuple: - s = list(s) - s[-1] = fft_size + 1 - # generate Hermitian symmetric input - if torch.is_complex(input): - valid_input = torch.fft.rfft2(input.real, s=s, dim=dim, norm=norm) - else: - valid_input = torch.fft.rfft2(input, s=s, dim=dim, norm=norm) - return (valid_input, s, dim, norm) - else: - return (input, s, dim, norm) + +# Tests of functions related to Fourier analysis in the torch.fft namespace +class TestFFT(TestCase): + exact_dtype = True @onlyNativeDeviceTypes - @ops([op for op in spectral_funcs if op.ndimensional == SpectralFuncType.OneD]) + @ops([op for op in spectral_funcs if op.ndimensional == SpectralFuncType.OneD], + allowed_dtypes=(torch.float, torch.cfloat)) def test_reference_1d(self, device, dtype, op): if op.ref is None: raise unittest.SkipTest("No reference 
implementation") @@ -239,10 +166,6 @@ def test_reference_1d(self, device, dtype, op): input = args[0] args = args[1:] - if torch.version.hip is not None and input.device.type == 'cuda': - input, args[0], args[1], args[2] = self._generate_valid_rocfft_input( - input, op, args[0], args[1], args[2]) - expected = op.ref(input.cpu().numpy(), *args) exact_dtype = dtype in (torch.double, torch.complex128) actual = op(input, *args) @@ -250,20 +173,39 @@ def test_reference_1d(self, device, dtype, op): @skipCPUIfNoFFT @onlyNativeDeviceTypes - @dtypes(torch.float, torch.double, torch.complex64, torch.complex128) + @toleranceOverride({ + torch.half : tol(1e-2, 1e-2), + torch.chalf : tol(1e-2, 1e-2), + }) + @dtypes(torch.half, torch.float, torch.double, torch.complex32, torch.complex64, torch.complex128) def test_fft_round_trip(self, device, dtype): + skip_helper_for_fft(device, dtype) # Test that round trip through ifft(fft(x)) is the identity - test_args = list(product( - # input - (torch.randn(67, device=device, dtype=dtype), - torch.randn(80, device=device, dtype=dtype), - torch.randn(12, 14, device=device, dtype=dtype), - torch.randn(9, 6, 3, device=device, dtype=dtype)), - # dim - (-1, 0), - # norm - (None, "forward", "backward", "ortho") - )) + if dtype not in (torch.half, torch.complex32): + test_args = list(product( + # input + (torch.randn(67, device=device, dtype=dtype), + torch.randn(80, device=device, dtype=dtype), + torch.randn(12, 14, device=device, dtype=dtype), + torch.randn(9, 6, 3, device=device, dtype=dtype)), + # dim + (-1, 0), + # norm + (None, "forward", "backward", "ortho") + )) + else: + # cuFFT supports powers of 2 for half and complex half precision + test_args = list(product( + # input + (torch.randn(64, device=device, dtype=dtype), + torch.randn(128, device=device, dtype=dtype), + torch.randn(4, 16, device=device, dtype=dtype), + torch.randn(8, 6, 2, device=device, dtype=dtype)), + # dim + (-1, 0), + # norm + (None, "forward", "backward", "ortho") + )) fft_functions = [(torch.fft.fft, torch.fft.ifft)] # Real-only functions @@ -282,13 +224,17 @@ def test_fft_round_trip(self, device, dtype): } y = backward(forward(x, **kwargs), **kwargs) + if x.dtype is torch.half and y.dtype is torch.complex32: + # Since type promotion currently doesn't work with complex32 + # manually promote `x` to complex32 + x = x.to(torch.complex32) # For real input, ifft(fft(x)) will convert to complex self.assertEqual(x, y, exact_dtype=( forward != torch.fft.fft or x.is_complex())) # Note: NumPy will throw a ValueError for an empty input @onlyNativeDeviceTypes - @ops(spectral_funcs) + @ops(spectral_funcs, allowed_dtypes=(torch.half, torch.float, torch.complex32, torch.cfloat)) def test_empty_fft(self, device, dtype, op): t = torch.empty(1, 0, device=device, dtype=dtype) match = r"Invalid number of data points \([-\d]*\) specified" @@ -296,6 +242,16 @@ def test_empty_fft(self, device, dtype, op): with self.assertRaisesRegex(RuntimeError, match): op(t) + @onlyNativeDeviceTypes + def test_empty_ifft(self, device): + t = torch.empty(2, 1, device=device, dtype=torch.complex64) + match = r"Invalid number of data points \([-\d]*\) specified" + + for f in [torch.fft.irfft, torch.fft.irfft2, torch.fft.irfftn, + torch.fft.hfft, torch.fft.hfft2, torch.fft.hfftn]: + with self.assertRaisesRegex(RuntimeError, match): + f(t) + @onlyNativeDeviceTypes def test_fft_invalid_dtypes(self, device): t = torch.randn(64, device=device, dtype=torch.complex128) @@ -311,8 +267,11 @@ def test_fft_invalid_dtypes(self, device): 
@skipCPUIfNoFFT @onlyNativeDeviceTypes - @dtypes(torch.int8, torch.float, torch.double, torch.complex64, torch.complex128) + @dtypes(torch.int8, torch.half, torch.float, torch.double, + torch.complex32, torch.complex64, torch.complex128) def test_fft_type_promotion(self, device, dtype): + skip_helper_for_fft(device, dtype) + if dtype.is_complex or dtype.is_floating_point: t = torch.randn(64, device=device, dtype=dtype) else: @@ -320,8 +279,10 @@ def test_fft_type_promotion(self, device, dtype): PROMOTION_MAP = { torch.int8: torch.complex64, + torch.half: torch.complex32, torch.float: torch.complex64, torch.double: torch.complex128, + torch.complex32: torch.complex32, torch.complex64: torch.complex64, torch.complex128: torch.complex128, } @@ -330,17 +291,27 @@ def test_fft_type_promotion(self, device, dtype): PROMOTION_MAP_C2R = { torch.int8: torch.float, + torch.half: torch.half, torch.float: torch.float, torch.double: torch.double, + torch.complex32: torch.half, torch.complex64: torch.float, torch.complex128: torch.double, } - R = torch.fft.hfft(t) + if dtype in (torch.half, torch.complex32): + # cuFFT supports powers of 2 for half and complex half precision + # NOTE: With hfft and default args where output_size n=2*(input_size - 1), + # we make sure that logical fft size is a power of two. + x = torch.randn(65, device=device, dtype=dtype) + R = torch.fft.hfft(x) + else: + R = torch.fft.hfft(t) self.assertEqual(R.dtype, PROMOTION_MAP_C2R[dtype]) if not dtype.is_complex: PROMOTION_MAP_R2C = { torch.int8: torch.complex64, + torch.half: torch.complex32, torch.float: torch.complex64, torch.double: torch.complex128, } @@ -352,14 +323,38 @@ def test_fft_type_promotion(self, device, dtype): allowed_dtypes=[torch.half, torch.bfloat16]) def test_fft_half_and_bfloat16_errors(self, device, dtype, op): # TODO: Remove torch.half error when complex32 is fully implemented - x = torch.randn(8, 8, device=device).to(dtype) - with self.assertRaisesRegex(RuntimeError, "Unsupported dtype "): - op(x) + sample = first_sample(self, op.sample_inputs(device, dtype)) + device_type = torch.device(device).type + if dtype is torch.half and device_type == 'cuda' and TEST_WITH_ROCM: + err_msg = "Unsupported dtype " + elif dtype is torch.half and device_type == 'cuda' and not SM53OrLater: + err_msg = "cuFFT doesn't support signals of half type with compute capability less than SM_53" + else: + err_msg = "Unsupported dtype " + with self.assertRaisesRegex(RuntimeError, err_msg): + op(sample.input, *sample.args, **sample.kwargs) + + @onlyNativeDeviceTypes + @ops(spectral_funcs, allowed_dtypes=(torch.half, torch.chalf)) + def test_fft_half_and_chalf_not_power_of_two_error(self, device, dtype, op): + t = make_tensor(13, 13, device=device, dtype=dtype) + err_msg = "cuFFT only supports dimensions whose sizes are powers of two" + with self.assertRaisesRegex(RuntimeError, err_msg): + op(t) + + if op.ndimensional in (SpectralFuncType.ND, SpectralFuncType.TwoD): + kwargs = {'s': (12, 12)} + else: + kwargs = {'n': 12} + + with self.assertRaisesRegex(RuntimeError, err_msg): + op(t, **kwargs) # nd-fft tests @onlyNativeDeviceTypes @unittest.skipIf(not TEST_NUMPY, 'NumPy not found') - @ops([op for op in spectral_funcs if op.ndimensional == SpectralFuncType.ND]) + @ops([op for op in spectral_funcs if op.ndimensional == SpectralFuncType.ND], + allowed_dtypes=(torch.cfloat, torch.cdouble)) def test_reference_nd(self, device, dtype, op): if op.ref is None: raise unittest.SkipTest("No reference implementation") @@ -383,9 +378,6 @@ def 
test_reference_nd(self, device, dtype, op): input = torch.randn(*shape, device=device, dtype=dtype) for norm in norm_modes: - if torch.version.hip is not None: - input, s, dim, norm = self._generate_valid_rocfft_input( - input, op, s, dim, norm) expected = op.ref(input.cpu().numpy(), s, dim, norm) exact_dtype = dtype in (torch.double, torch.complex128) actual = op(input, s, dim, norm) @@ -393,8 +385,15 @@ def test_reference_nd(self, device, dtype, op): @skipCPUIfNoFFT @onlyNativeDeviceTypes - @dtypes(torch.float, torch.double, torch.complex64, torch.complex128) + @toleranceOverride({ + torch.half : tol(1e-2, 1e-2), + torch.chalf : tol(1e-2, 1e-2), + }) + @dtypes(torch.half, torch.float, torch.double, + torch.complex32, torch.complex64, torch.complex128) def test_fftn_round_trip(self, device, dtype): + skip_helper_for_fft(device, dtype) + norm_modes = (None, "forward", "backward", "ortho") # input_ndim, dim @@ -416,7 +415,11 @@ def test_fftn_round_trip(self, device, dtype): (torch.fft.ihfftn, torch.fft.hfftn)] for input_ndim, dim in transform_desc: - shape = itertools.islice(itertools.cycle(range(4, 9)), input_ndim) + if dtype in (torch.half, torch.complex32): + # cuFFT supports powers of 2 for half and complex half precision + shape = itertools.islice(itertools.cycle((2, 4, 8)), input_ndim) + else: + shape = itertools.islice(itertools.cycle(range(4, 9)), input_ndim) x = torch.randn(*shape, device=device, dtype=dtype) for (forward, backward), norm in product(fft_functions, norm_modes): @@ -428,8 +431,13 @@ def test_fftn_round_trip(self, device, dtype): kwargs = {'s': s, 'dim': dim, 'norm': norm} y = backward(forward(x, **kwargs), **kwargs) # For real input, ifftn(fftn(x)) will convert to complex - self.assertEqual(x, y, exact_dtype=( - forward != torch.fft.fftn or x.is_complex())) + if x.dtype is torch.half and y.dtype is torch.chalf: + # Since type promotion currently doesn't work with complex32 + # manually promote `x` to complex32 + self.assertEqual(x.to(torch.chalf), y) + else: + self.assertEqual(x, y, exact_dtype=( + forward != torch.fft.fftn or x.is_complex())) @onlyNativeDeviceTypes @ops([op for op in spectral_funcs if op.ndimensional == SpectralFuncType.ND], @@ -454,8 +462,13 @@ def test_fftn_invalid(self, device, dtype, op): @skipCPUIfNoFFT @onlyNativeDeviceTypes - @dtypes(torch.float, torch.double) + @toleranceOverride({ + torch.half : tol(1e-2, 1e-2), + }) + @dtypes(torch.half, torch.float, torch.double) def test_hfftn(self, device, dtype): + skip_helper_for_fft(device, dtype) + # input_ndim, dim transform_desc = [ *product(range(2, 5), (None, (0,), (0, -1))), @@ -468,8 +481,10 @@ def test_hfftn(self, device, dtype): for input_ndim, dim in transform_desc: actual_dims = list(range(input_ndim)) if dim is None else dim - - shape = tuple(itertools.islice(itertools.cycle(range(4, 9)), input_ndim)) + if dtype is torch.half: + shape = tuple(itertools.islice(itertools.cycle((2, 4, 8)), input_ndim)) + else: + shape = tuple(itertools.islice(itertools.cycle(range(4, 9)), input_ndim)) expect = torch.randn(*shape, device=device, dtype=dtype) input = torch.fft.ifftn(expect, dim=dim, norm="ortho") @@ -486,8 +501,13 @@ def test_hfftn(self, device, dtype): @skipCPUIfNoFFT @onlyNativeDeviceTypes - @dtypes(torch.float, torch.double) + @toleranceOverride({ + torch.half : tol(1e-2, 1e-2), + }) + @dtypes(torch.half, torch.float, torch.double) def test_ihfftn(self, device, dtype): + skip_helper_for_fft(device, dtype) + # input_ndim, dim transform_desc = [ *product(range(2, 5), (None, (0,), (0, -1))), 
@@ -499,7 +519,11 @@ def test_ihfftn(self, device, dtype): ] for input_ndim, dim in transform_desc: - shape = tuple(itertools.islice(itertools.cycle(range(4, 9)), input_ndim)) + if dtype is torch.half: + shape = tuple(itertools.islice(itertools.cycle((2, 4, 8)), input_ndim)) + else: + shape = tuple(itertools.islice(itertools.cycle(range(4, 9)), input_ndim)) + input = torch.randn(*shape, device=device, dtype=dtype) expect = torch.fft.ifftn(input, dim=dim, norm="ortho") @@ -552,31 +576,18 @@ def fn(t: torch.Tensor, s: Optional[List[int]], dim: List[int] = (-2, -1), norm: torch_fns = (torch_fn, torch.jit.script(fn)) - if torch.version.hip is not None: - valid_input_default, s, _, norm = self._generate_valid_rocfft_input( - input, torch_fn, s, None, norm) - else: - valid_input_default = input - # Once with dim defaulted - input_np = valid_input_default.cpu().numpy() + input_np = input.cpu().numpy() expected = numpy_fn(input_np, s, norm=norm) for fn in torch_fns: - actual = fn(valid_input_default, s, norm=norm) + actual = fn(input, s, norm=norm) self.assertEqual(actual, expected) # Once with explicit dims dim = (1, 0) - if torch.version.hip is not None: - valid_input_explicit, s, dim, norm = self._generate_valid_rocfft_input( - input, torch_fn, s, dim, norm) - input_np = valid_input_explicit.cpu().numpy() - else: - valid_input_explicit = input - expected = numpy_fn(input_np, s, dim, norm) for fn in torch_fns: - actual = fn(valid_input_explicit, s, dim, norm) + actual = fn(input, s, dim, norm) self.assertEqual(actual, expected) @skipCPUIfNoFFT @@ -879,9 +890,16 @@ def librosa_stft(x, n_fft, hop_length, win_length, window, center): input_1d = x.dim() == 1 if input_1d: x = x.view(1, -1) + + # NOTE: librosa 0.9 changed default pad_mode to 'constant' (zero padding) + # however, we use the pre-0.9 default ('reflect') + pad_mode = 'reflect' + result = [] for xi in x: - ri = librosa.stft(xi.cpu().numpy(), n_fft, hop_length, win_length, window, center=center) + ri = librosa.stft(xi.cpu().numpy(), n_fft=n_fft, hop_length=hop_length, + win_length=win_length, window=window, center=center, + pad_mode=pad_mode) result.append(torch.from_numpy(np.stack([ri.real, ri.imag], -1))) result = torch.stack(result, 0) if input_1d: diff --git a/test/test_stateless.py b/test/test_stateless.py index f092f36b2e65..e3e3f03277d8 100644 --- a/test/test_stateless.py +++ b/test/test_stateless.py @@ -1,10 +1,13 @@ # Owner(s): ["module: nn"] import unittest +import sys +import os +import subprocess import torch -import torch.nn.utils._stateless as _stateless +import torch.nn.utils.stateless as stateless from torch.testing._internal.common_cuda import TEST_MULTIGPU from torch.testing._internal.common_utils import run_tests, TestCase @@ -41,7 +44,7 @@ def _run_call_with_mock_module(self, module, device='cpu', prefix=''): # the parameters represent an identity function contrary to the # existing params in module. So here we expect the result to be the # same as the input if the weight swapping went well. 
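# Reviewer aside -- illustrative sketch, not part of this patch. A minimal, standalone
# version of the check described in the comment above, using the public
# torch.nn.utils.stateless API that this file now imports: functional_call runs the
# module with substitute parameters and leaves the module's own state untouched.
import torch
from torch import nn
from torch.nn.utils import stateless

mod = nn.Linear(1, 1)
orig_weight = mod.weight.clone()
x = torch.rand(1, 1)
identity_params = {'weight': torch.ones(1, 1), 'bias': torch.zeros(1)}
out = stateless.functional_call(mod, identity_params, x)
assert torch.allclose(out, x)                # substituted weights act as an identity map
assert torch.equal(mod.weight, orig_weight)  # the module's registered weight is unchanged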
- res = _stateless.functional_call(module, parameters, x) + res = stateless.functional_call(module, parameters, x) self.assertEqual(x, res) # check that the weight remain unmodified cur_weight = to_check.l1.weight @@ -70,6 +73,7 @@ def test_functional_call_with_jit(self): self._run_call_with_mock_module(traced_module) @unittest.skipIf(not TEST_MULTIGPU, 'multi-GPU not supported') + @unittest.skip("This doesn't work right now") def test_functional_call_with_data_parallel(self): module = MockModule() module.cuda() @@ -85,7 +89,7 @@ def test_functional_call_with_gradient(self): parameters = {'l1.weight': weight, 'l1.bias': bias, 'buffer': buffer} - res = _stateless.functional_call(module, parameters, x) + res = stateless.functional_call(module, parameters, x) # Check that a backward step calculates the gradient of the supplied parameters res.backward() self.assertIsNotNone(weight.grad) @@ -104,13 +108,13 @@ def test_functional_batch_norm(self): rm = torch.zeros(10) parameters = {'running_mean': rm} prev_rm = module.running_mean.clone() - res = _stateless.functional_call(module, parameters, x) + res = stateless.functional_call(module, parameters, x) cur_rm = module.running_mean self.assertEqual(cur_rm, prev_rm) self.assertEqual(rm, torch.full((10,), 12.8)) # Now run functional without reparametrization and check that the module has # been updated - res = _stateless.functional_call(module, {}, x) + res = stateless.functional_call(module, {}, x) self.assertEqual(module.running_mean, torch.full((10,), 12.8)) def test_circular_references(self): @@ -126,7 +130,7 @@ def test_circular_references(self): 'l1.m.buffer': buffer} prev_weight = module.l1.weight.clone() prev_buffer = module.buffer.clone() - res = _stateless.functional_call(module, parameters, x) + res = stateless.functional_call(module, parameters, x) self.assertEqual(x, res) # check that the weights remain unmodified and were correctly accesed cur_weight = module.l1.weight @@ -146,11 +150,66 @@ def test_reparametrized_module_change_parametrization_original(self): parameters = {'l1.parametrizations.weight.original': torch.nn.Parameter(torch.tensor([[1.0]])), 'l1.bias': torch.tensor([0.0]), 'buffer': torch.tensor([0.0])} - res = torch.nn.utils._stateless.functional_call(module, parameters, x) + res = stateless.functional_call(module, parameters, x) self.assertEqual(x, res) # verify that the spectral normalization is still applied self.assertTrue('l1.parametrizations.weight.original' in dict(module.named_parameters())) self.assertEqual(orig_sn_weight, module.l1.weight) + def test_setattr(self): + class Foo(torch.nn.Module): + def __init__(self): + super().__init__() + self.register_buffer('foo', torch.zeros(())) + + def forward(self, x): + self.foo = self.foo + 1 + return x + self.foo + + a = {'foo': torch.zeros(())} + mod = Foo() + stateless.functional_call(mod, a, torch.ones(())) + self.assertEqual(mod.foo, torch.zeros(())) + self.assertEqual(a['foo'], torch.ones(())) + + +class TestStatelessDeprecation(TestCase): + def test_private_stateless_warns(self): + script = """ +import torch +import warnings + +with warnings.catch_warnings(record=True) as w: + from torch.nn.utils import _stateless + +exit(len(w)) +""" + try: + subprocess.check_output( + [sys.executable, '-W', 'all', '-c', script], + stderr=subprocess.STDOUT, + # On Windows, opening the subprocess with the default CWD makes `import torch` + # fail, so just set CWD to this script's directory + cwd=os.path.dirname(os.path.realpath(__file__)),) + except subprocess.CalledProcessError as 
e: + self.assertEqual(e.returncode, 1) + else: + self.assertTrue(False, "No warning was raised.") + +class TestPythonOptimizeMode(TestCase): + def test_runs_with_optimize_flag(self): + script = """ +import torch +""" + try: + subprocess.check_output( + [sys.executable, '-OO', '-c', script], + stderr=subprocess.STDOUT, + # On Windows, opening the subprocess with the default CWD makes `import torch` + # fail, so just set CWD to this script's directory + cwd=os.path.dirname(os.path.realpath(__file__)),) + except subprocess.CalledProcessError as e: + self.assertFalse(e.returncode, "Import failed while running python in optimized mode") + if __name__ == '__main__': run_tests() diff --git a/test/test_subclass.py b/test/test_subclass.py new file mode 100644 index 000000000000..2eb45c361ed9 --- /dev/null +++ b/test/test_subclass.py @@ -0,0 +1,245 @@ +# Owner(s): ["module: nn"] + +import tempfile +import torch +from copy import deepcopy +from functools import partial +from torch import nn +from torch.nn.utils.parametrize import register_parametrization, remove_parametrizations +from torch.nn.modules.lazy import LazyModuleMixin +from torch.testing._internal.common_utils import ( + TestCase, run_tests, parametrize, subtest, instantiate_parametrized_tests) +from torch.testing._internal.common_subclass import subclass_db, DiagTensorBelow +from torch.testing._internal.logging_tensor import LoggingTensor +from torch.utils._pytree import tree_map +from unittest import expectedFailure + +# The current test methodology in this file is to test a variety of real use cases +# with a set of fully-fledged tensor subclasses. In the future, this may change +# to more narrowly specify toy subclasses for each of the specific invariants under +# test, avoiding the need to maintain the set of fully-fledged tensor subclasses. + + +# Decorator for parametrizing tests across the various tensor classes. +parametrize_tensor_cls = parametrize("tensor_cls", [ + subtest(tensor_cls, name=info.name) for tensor_cls, info in subclass_db.items()]) + + +class TestSubclass(TestCase): + def _create_tensor(self, tensor_cls): + return subclass_db[tensor_cls].create_fn(3) + + @parametrize_tensor_cls + @parametrize("tensor_requires_grad", [False, True]) + def test_param_invariants(self, tensor_cls, tensor_requires_grad): + x = self._create_tensor(tensor_cls).requires_grad_(tensor_requires_grad) + param = nn.Parameter(x, requires_grad=(not tensor_requires_grad)) + + self.assertIsInstance(param, nn.Parameter) + # Ensure requires_grad passed to Parameter's constructor takes precedence. + self.assertEqual(param.requires_grad, not tensor_requires_grad) + + # Ensure original tensor is not mutated by Parameter construction. + self.assertNotIsInstance(x, nn.Parameter) + self.assertEqual(x.requires_grad, tensor_requires_grad) + + @parametrize_tensor_cls + @parametrize("as_param", [False, True]) + def test_deepcopy(self, tensor_cls, as_param): + x = self._create_tensor(tensor_cls) + if as_param: + x = nn.Parameter(x) + x_copy = deepcopy(x) + self.assertEqual(x, x_copy) + self.assertEqual(x.__class__, x_copy.__class__) + self.assertIsNot(x, x_copy) + self.assertIsInstance(x_copy, tensor_cls) + if as_param: + # Deepcopy should preserve both custom type and "parameter-ness". 
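# Reviewer aside -- illustrative sketch, not part of this patch. The same invariant the
# comment above describes, shown for a plain tensor: deepcopy of an nn.Parameter yields a
# distinct object that is still a Parameter and compares equal to the original.
import torch
from copy import deepcopy
from torch import nn

p = nn.Parameter(torch.randn(3))
p_copy = deepcopy(p)
assert p_copy is not p
assert isinstance(p_copy, nn.Parameter)
assert torch.equal(p, p_copy)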
+ self.assertIsInstance(x_copy, nn.Parameter) + + @parametrize_tensor_cls + @parametrize("as_param", [False, True]) + def test_serialization(self, tensor_cls, as_param): + with tempfile.TemporaryFile() as f: + x = self._create_tensor(tensor_cls) + if as_param: + x = nn.Parameter(x) + torch.save(x, f) + f.seek(0) + x_loaded = torch.load(f) + + self.assertEqual(x, x_loaded) + self.assertIsNot(x, x_loaded) + self.assertIsInstance(x_loaded, tensor_cls) + if as_param: + # Serialization should preserve both custom type and "parameter-ness". + self.assertIsInstance(x_loaded, nn.Parameter) + + @parametrize_tensor_cls + @parametrize("as_param", [False, True]) + def test_repr(self, tensor_cls, as_param): + x = self._create_tensor(tensor_cls) + if as_param: + x = nn.Parameter(x) + str_repr = x.__repr__() + if tensor_cls is not torch.Tensor: + self.assertEqual(str_repr.count(f"{tensor_cls.__name__}("), 1) + self.assertEqual(str_repr.count("Parameter"), 1 if as_param else 0) + + @parametrize_tensor_cls + @parametrize("as_param", [False, True]) + def test_type_propagation(self, tensor_cls, as_param): + x = self._create_tensor(tensor_cls) + if as_param: + x = nn.Parameter(x) + + # Call the add operator to produce an output tensor. + output = x + self._create_tensor(torch.Tensor) + + # Custom type should be propagated across operations if closed under the op, but + # "parameter-ness" should not be. + if subclass_db[tensor_cls].closed_under_ops: + self.assertIsInstance(output, tensor_cls) + else: + self.assertIsInstance(output, torch.Tensor) + self.assertNotIsInstance(output, nn.Parameter) + + @parametrize_tensor_cls + def test_module_optimization(self, tensor_cls): + create_fn = partial(self._create_tensor, tensor_cls) + + class MyModule(nn.Module): + def __init__(self): + super().__init__() + self.p1 = nn.Parameter(create_fn()) + + self.p_list = nn.ParameterList([create_fn() for _ in range(3)]) + self.p_list.append(create_fn()) + + self.p_dict = nn.ParameterDict({ + 'foo': create_fn(), + 'bar': create_fn(), + }) + self.p_dict['baz'] = create_fn() + + with torch.no_grad(): + nn.init.normal_(self.p1) + for p in self.p_list: + nn.init.uniform_(p) + for _, p in self.p_dict.items(): + nn.init.uniform_(p) + + def forward(self, x): + out = self.p1 + x + for p in self.p_list: + out = p + out + + for _, v in self.p_dict.items(): + out = v + out + + return out + + m = MyModule() + self.assertEqual(len(m.state_dict()), 8) + + optimizer = torch.optim.SGD(m.parameters(), lr=0.1) + m(create_fn()).sum().backward(torch.tensor(1)) + optimizer.step() + + @parametrize_tensor_cls + @parametrize("leave_parametrized", [False, True]) + def test_parametrization(self, tensor_cls, leave_parametrized): + # TODO: Either implement set_() properly for these tensor subclasses or apply a + # more general fix to avoid the need for special set_() handling. For now, skip + # testing these as they're expected to fail. 
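# Reviewer aside -- illustrative sketch, not part of this patch. With a plain nn.Linear,
# the parametrization machinery exercised below behaves like this: the registered
# parametrization recomputes `weight` from the stashed original, and
# remove_parametrizations with leave_parametrized=True bakes the computed value back
# into an ordinary Parameter.
import torch
from torch import nn
from torch.nn.utils.parametrize import register_parametrization, remove_parametrizations

class Negate(nn.Module):
    def forward(self, X):
        return -X

lin = nn.Linear(2, 2)
register_parametrization(lin, 'weight', Negate())
assert torch.equal(lin.weight, -lin.parametrizations.weight.original)
remove_parametrizations(lin, 'weight', leave_parametrized=True)
assert isinstance(lin.weight, nn.Parameter)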
+ if tensor_cls in [LoggingTensor, DiagTensorBelow]: + return + + create_fn = partial(self._create_tensor, tensor_cls) + + class MyModule(nn.Module): + def __init__(self): + super().__init__() + self.weight = nn.Parameter(create_fn()) + + def forward(self, x): + return self.weight + x + + class MyParametrization(nn.Module): + def forward(self, X): + return -X + + m = MyModule() + self.assertEqual(len(m.state_dict()), 1) + register_parametrization(m, 'weight', MyParametrization()) + self.assertIsInstance(m.weight, tensor_cls) + output = m(self._create_tensor(torch.Tensor)) + self.assertIsInstance(output, tensor_cls) + remove_parametrizations(m, 'weight', leave_parametrized=leave_parametrized) + + # Lazy modules with custom tensors are not supported yet. + @expectedFailure + @parametrize_tensor_cls + def test_lazy_module(self, tensor_cls): + if tensor_cls is torch.Tensor: + self.fail('dummy fail for base tensor until the test passes for subclasses') + + class MyLazyModule(LazyModuleMixin, nn.Module): + def __init__(self): + super().__init__() + self.param = nn.UninitializedParameter() + + def initialize_parameters(self, input) -> None: # type: ignore[override] + if self.has_uninitialized_params(): + with torch.no_grad(): + self.param.materialize(input.shape) + nn.init.uniform_(self.param) + + def forward(self, x): + return self.param + x + + m = MyLazyModule() + self.assertTrue(m.has_uninitialized_params()) + output = m(self._create_tensor(tensor_cls)) + self.assertFalse(m.has_uninitialized_params()) + self.assertIsInstance(m.param, tensor_cls) + + def test_non_rewrapping_torch_dispatch_subclass_as_parameter_throws_for_detach(self): + + # Define a subclass that does not rewrap for any function in its __torch_dispatch__ impl. + class NonRewrappingTensor(torch.Tensor): + @staticmethod + def __new__( + cls, t: torch.Tensor + ): + r = super(NonRewrappingTensor, cls)._make_wrapper_subclass( + cls, t.shape, dtype=t.dtype, requires_grad=t.requires_grad, device=t.device) + return r + + def __init__(self, t) -> None: + self.tensor: torch.Tensor = t + + __torch_function__ = torch._C._disabled_torch_function_impl + + @classmethod + def __torch_dispatch__(cls, func, types, args=(), kwargs=None): + + def unwrap(e) -> torch.Tensor: + if isinstance(e, NonRewrappingTensor): + t = e.tensor + return t + else: + return e + + r = func(*tree_map(unwrap, args), **tree_map(unwrap, kwargs)) + # Return an unwrapped tensor no longer of original subclass type. 
+ return r + + with self.assertRaisesRegex(RuntimeError, r"requires that detach\(\) returns an instance of the same type"): + param = nn.Parameter(NonRewrappingTensor(torch.randn(3))) + +instantiate_parametrized_tests(TestSubclass) + +if __name__ == '__main__': + run_tests() diff --git a/test/test_tensor_creation_ops.py b/test/test_tensor_creation_ops.py index 62d595373b3a..c341ef36dae1 100644 --- a/test/test_tensor_creation_ops.py +++ b/test/test_tensor_creation_ops.py @@ -13,15 +13,17 @@ from torch.testing import make_tensor from torch.testing._internal.common_utils import ( TestCase, run_tests, do_test_empty_full, TEST_WITH_ROCM, suppress_warnings, - torch_to_numpy_dtype_dict, slowTest, - TEST_SCIPY, IS_MACOS, IS_PPC, IS_WINDOWS) + torch_to_numpy_dtype_dict, numpy_to_torch_dtype_dict, slowTest, + TEST_SCIPY, IS_MACOS, IS_PPC, IS_WINDOWS, parametrize) from torch.testing._internal.common_device_type import ( expectedFailureMeta, instantiate_device_type_tests, deviceCountAtLeast, onlyNativeDeviceTypes, onlyCPU, largeTensorTest, precisionOverride, dtypes, onlyCUDA, skipCPUIf, dtypesIfCUDA, skipMeta, get_all_device_types) from torch.testing._internal.common_dtype import ( - get_all_dtypes, get_all_math_dtypes, get_all_int_dtypes, get_all_fp_dtypes, get_all_complex_dtypes + all_types_and_complex_and, all_types_and, floating_and_complex_types, + floating_types, floating_and_complex_types_and, integral_types_and, get_all_dtypes ) +from torch.testing._creation import float_to_corresponding_complex_type_map from torch.utils.dlpack import to_dlpack @@ -147,7 +149,7 @@ def test_vander_types(self, device, dtype): exact_dtype=False) def test_cat_all_dtypes_and_devices(self, device): - for dt in get_all_dtypes(): + for dt in all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16, torch.chalf): x = torch.tensor([[1, 2], [3, 4]], dtype=dt, device=device) expected1 = torch.tensor([[1, 2], [3, 4], [1, 2], [3, 4]], dtype=dt, device=device) @@ -157,7 +159,7 @@ def test_cat_all_dtypes_and_devices(self, device): self.assertEqual(torch.cat((x, x), 1), expected2) def test_fill_all_dtypes_and_devices(self, device): - for dt in get_all_dtypes(): + for dt in all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16, torch.chalf): for x in [torch.tensor((10, 10), dtype=dt, device=device), torch.empty(10000, dtype=dt, device=device)]: # large tensor numel = x.numel() @@ -311,7 +313,7 @@ def run_test(shape, device, diagonal, dtype): (3, 1), (5, 3, 1), (7, 5, 3, 1), # very fat matrices (1, 3), (5, 1, 3), (7, 5, 1, 3), # very thin matrices (1, 3, 3, 3), (3, 1, 3, 3, 3)] # unsqueezed batch dimensions - dtypes = [dtype for dtype in get_all_dtypes() if dtype != torch.bfloat16] + dtypes = all_types_and_complex_and(torch.half, torch.bool) for s, d, dtype in product(shapes, diagonals, dtypes): run_test(s, device, d, dtype) @@ -508,12 +510,12 @@ def test_block_diag_scipy(self, device): self.assertEqual(torch_result, scipy_result) @onlyNativeDeviceTypes - @dtypes(torch.float32, torch.float64) + @dtypes(torch.half, torch.float32, torch.float64) def test_torch_complex(self, device, dtype): real = torch.tensor([1, 2], device=device, dtype=dtype) imag = torch.tensor([3, 4], device=device, dtype=dtype) z = torch.complex(real, imag) - complex_dtype = torch.complex64 if dtype == torch.float32 else torch.complex128 + complex_dtype = float_to_corresponding_complex_type_map[dtype] self.assertEqual(torch.tensor([1.0 + 3.0j, 2.0 + 4.0j], dtype=complex_dtype), z) @onlyNativeDeviceTypes @@ -531,12 +533,12 @@ def 
test_torch_polar(self, device, dtype): @onlyNativeDeviceTypes @dtypes(torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64, - torch.float16, torch.complex64, torch.complex128, torch.bool) + torch.complex64, torch.complex128, torch.bool) def test_torch_complex_floating_dtype_error(self, device, dtype): for op in (torch.complex, torch.polar): a = torch.tensor([1, 2], device=device, dtype=dtype) b = torch.tensor([3, 4], device=device, dtype=dtype) - error = r"Expected both inputs to be Float or Double tensors but " \ + error = r"Expected both inputs to be Half, Float or Double tensors but " \ r"got [A-Za-z]+ and [A-Za-z]+" with self.assertRaisesRegex(RuntimeError, error): op(a, b) @@ -626,16 +628,22 @@ def test_cat_out(self, device): y = torch.randn((4, 6), device=device) with self.assertRaisesRegex( - RuntimeError, r"unsupported operation:.* input tensor 0"): + RuntimeError, + r"unsupported operation: some elements of the input tensor and " + r"the written-to tensor refer to a single memory location."): torch.cat([x, y], dim=0, out=x) with self.assertRaisesRegex( - RuntimeError, r"unsupported operation:.* input tensor 1"): + RuntimeError, + r"unsupported operation: some elements of the input tensor and " + r"the written-to tensor refer to a single memory location."): torch.cat([x, y], dim=0, out=y) z = torch.zeros((4, 6), device=device) with self.assertRaisesRegex( - RuntimeError, r"unsupported operation:.* input tensor 1"): + RuntimeError, + r"unsupported operation: some elements of the input tensor and " + r"the written-to tensor refer to a single memory location."): torch.cat([y, z], out=z[:2, :]) w = y.view(-1).clone() @@ -739,8 +747,7 @@ def test_cat_out_memory_format(self, device): self.assertTrue(res1_cpu.is_contiguous(memory_format=torch.contiguous_format)) # Case 2: if out= is not the correct shape then the output it is resized internally - # - For the CPU variant the memory format is that of the first tensor - # - For the CUDA variant it only propagates memory format if all the tensors have + # - For both CPU and CUDA variants, it only propagates memory format if all the tensors have # the same memory format, otherwise it just uses contiguous_format as a default out_cuda = torch.empty((0), device=device).contiguous(memory_format=torch.contiguous_format) @@ -751,7 +758,7 @@ def test_cat_out_memory_format(self, device): res2_cpu = torch.cat((a_cpu, b_cpu), out=out_cpu) self.assertTrue(res2_cuda.is_contiguous(memory_format=torch.contiguous_format)) - self.assertTrue(res2_cpu.is_contiguous(memory_format=torch.channels_last)) + self.assertTrue(res2_cpu.is_contiguous(memory_format=torch.contiguous_format)) out_cuda = torch.empty((0), device=device).contiguous(memory_format=torch.contiguous_format) # a_cuda and c_cuda have same memory_format @@ -890,7 +897,7 @@ def _hvd_split_helper(self, torch_fn, np_fn, op_name, inputs, device, dtype, dim bound = dim + 2 * (dim == 0) + (dim == 2) error_expected = len(shape) < bound or (not isinstance(arg, list) and shape[direction] % arg != 0) - t = make_tensor(shape, device, dtype) + t = make_tensor(shape, dtype=dtype, device=device) t_np = t.cpu().numpy() if not error_expected: @@ -1009,8 +1016,7 @@ def _test_special_stacks(self, dim, at_least_dim, torch_fn, np_fn, device, dtype np_fn(np_input) @onlyNativeDeviceTypes - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False) + - get_all_complex_dtypes())) + @dtypes(*all_types_and_complex_and(torch.half)) def test_hstack_column_stack(self, device, dtype): ops = 
((torch.hstack, np.hstack), (torch.column_stack, np.column_stack)) for torch_op, np_op in ops: @@ -1029,8 +1035,7 @@ def test_hstack_column_stack(self, device, dtype): torch_result) @onlyNativeDeviceTypes - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False) + - get_all_complex_dtypes())) + @dtypes(*all_types_and_complex_and(torch.half)) def test_vstack_row_stack(self, device, dtype): ops = ((torch.vstack, np.vstack), (torch.row_stack, np.row_stack)) for torch_op, np_op in ops: @@ -1047,8 +1052,7 @@ def test_vstack_row_stack(self, device, dtype): self.assertEqual(actual, expected) @onlyNativeDeviceTypes - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False) + - get_all_complex_dtypes())) + @dtypes(*all_types_and_complex_and(torch.half)) def test_dstack(self, device, dtype): self._test_special_stacks(2, 3, torch.dstack, np.dstack, device, dtype) for i in range(5): @@ -1600,6 +1604,10 @@ def test_cartesian_prod(self, device): def test_combinations(self, device): a = torch.tensor([1, 2, 3], device=device) + c = torch.combinations(a, r=0) + expected = torch.empty(0, dtype=a.dtype, device=device) + self.assertEqual(c, expected) + c = torch.combinations(a, r=1) expected = torch.tensor(list(combinations(a, r=1)), device=device) self.assertEqual(c, expected) @@ -1752,7 +1760,7 @@ def test_random_from_to_bool(self, device): lambda: t.random_(from_, to_) ) - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) + @dtypes(*all_types_and(torch.bfloat16, torch.half)) def test_random_full_range(self, device, dtype): size = 2000 alpha = 0.1 @@ -1786,7 +1794,7 @@ def test_random_full_range(self, device, dtype): self.assertTrue(from_ <= t.to(torch.double).min() < (from_ + delta)) self.assertTrue((to_inc_ - delta) < t.to(torch.double).max() <= to_inc_) - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) + @dtypes(*all_types_and(torch.bfloat16, torch.half)) def test_random_from_to(self, device, dtype): size = 2000 alpha = 0.1 @@ -1875,7 +1883,7 @@ def test_random_from_to(self, device, dtype): lambda: t.random_(from_, to_) ) - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) + @dtypes(*all_types_and(torch.bfloat16, torch.half)) def test_random_to(self, device, dtype): size = 2000 alpha = 0.1 @@ -1933,7 +1941,7 @@ def test_random_to(self, device, dtype): lambda: t.random_(from_, to_) ) - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) + @dtypes(*all_types_and(torch.bfloat16, torch.half)) def test_random_default(self, device, dtype): size = 2000 alpha = 0.1 @@ -1960,11 +1968,12 @@ def test_empty_full(self, device): torch_device = torch.device(device) device_type = torch_device.type + dtypes = get_all_dtypes(include_half=False, include_bfloat16=False, include_complex32=True) if device_type == 'cpu': - do_test_empty_full(self, get_all_math_dtypes('cpu'), torch.strided, torch_device) + do_test_empty_full(self, dtypes, torch.strided, torch_device) if device_type == 'cuda': - do_test_empty_full(self, get_all_math_dtypes('cpu'), torch.strided, None) - do_test_empty_full(self, get_all_math_dtypes('cpu'), torch.strided, torch_device) + do_test_empty_full(self, dtypes, torch.strided, None) + do_test_empty_full(self, dtypes, torch.strided, torch_device) # TODO: this test should be updated @suppress_warnings @@ -2053,6 +2062,10 @@ def test_zeros(self, device): expected = torch.tensor([[0., 0.], [0., 0.]], device=device, dtype=torch.complex64) self.assertEqual(complexTensor, expected) + complexHalfTensor = torch.zeros(2, 2, device=device, 
dtype=torch.complex32) + expected = torch.tensor([[0., 0.], [0., 0.]], device=device, dtype=torch.complex32) + self.assertEqual(complexHalfTensor, expected) + # TODO: this test should be updated def test_zeros_out(self, device): shape = (3, 4) @@ -2085,6 +2098,10 @@ def test_ones(self, device): expected = torch.tensor([[True, True]], device=device, dtype=torch.bool) self.assertEqual(res1, expected) + # test chalf + self.assertEqual(torch.ones(100, 100, device=device, dtype=torch.chalf), + torch.ones(100, 100, device=device, dtype=torch.cfloat), exact_dtype=False) + # TODO: this test should be updated @onlyCPU def test_constructor_dtypes(self, device): @@ -2099,6 +2116,9 @@ def test_constructor_dtypes(self, device): self.assertIs(torch.float32, torch.get_default_dtype()) self.assertIs(torch.FloatStorage, torch.Storage) + # only floating-point types are supported as the default type + self.assertRaises(TypeError, lambda: torch.set_default_tensor_type('torch.IntTensor')) + torch.set_default_dtype(torch.float64) self.assertIs(torch.float64, torch.get_default_dtype()) self.assertIs(torch.DoubleStorage, torch.Storage) @@ -2117,13 +2137,21 @@ def test_constructor_dtypes(self, device): self.assertIs(torch.float64, torch.get_default_dtype()) self.assertIs(torch.cuda.DoubleStorage, torch.Storage) - # don't support integral or sparse default types. - self.assertRaises(TypeError, lambda: torch.set_default_tensor_type('torch.IntTensor')) - self.assertRaises(TypeError, lambda: torch.set_default_dtype(torch.int64)) - # don't allow passing dtype to set_default_tensor_type self.assertRaises(TypeError, lambda: torch.set_default_tensor_type(torch.float32)) + # don't allow passing dtype to set_default_dtype + for t in all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.qint8): + # only floating-point types are supported as the default type + if t in ( + torch.half, + torch.float, + torch.double, + torch.bfloat16): + torch.set_default_dtype(t) + else: + self.assertRaises(TypeError, lambda: torch.set_default_dtype(t)) + torch.set_default_tensor_type(default_type) # TODO: this test should be updated @@ -2651,8 +2679,17 @@ def test_empty_tensor_props(self, device): y = torch.empty(tuple(size_ones_instead_of_zeros), device=device) self.assertEqual(x.stride(), y.stride()) + @onlyNativeDeviceTypes + def test_empty_overflow(self, device): + with self.assertRaisesRegex(RuntimeError, 'Storage size calculation overflowed'): + torch.empty([2, 4, 2**29, 2**29], dtype=torch.float64) + with self.assertRaisesRegex(RuntimeError, 'Storage size calculation overflowed'): + torch.empty([8, 8, 2**29, 2**29], dtype=torch.float64) + with self.assertRaisesRegex(RuntimeError, 'Storage size calculation overflowed'): + torch.empty_strided([8, 8], [2**61, 1], dtype=torch.float64) + def test_eye(self, device): - for dtype in get_all_dtypes(): + for dtype in all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16): if dtype == torch.bfloat16: continue # Test the RuntimeError is raised when either m or n is a negative number @@ -2685,8 +2722,7 @@ def test_eye(self, device): self.assertEqual(res1, res2) @precisionOverride({torch.float: 1e-8, torch.double: 1e-10}) - @dtypes(*(get_all_fp_dtypes(include_half=False, include_bfloat16=False) + - get_all_complex_dtypes())) + @dtypes(*floating_and_complex_types()) def test_linspace_vs_numpy(self, device, dtype): start = -0.0316082797944545745849609375 + (0.8888888888j if dtype.is_complex else 0) end = .0315315723419189453125 + (0.444444444444j if dtype.is_complex else 0) 
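# Reviewer aside -- illustrative sketch, not part of this patch. The linspace test above
# compares torch.linspace against NumPy elementwise; a minimal standalone version of that
# check (real-valued float64 only) looks like this.
import numpy as np
import torch

start, end, steps = -0.0316082797944545745849609375, 0.0315315723419189453125, 50
t = torch.linspace(start, end, steps, dtype=torch.float64)
a = np.linspace(start, end, steps, dtype=np.float64)
assert np.allclose(t.numpy(), a)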
@@ -2723,7 +2759,7 @@ def test_logspace_vs_numpy_complex(self, device, dtype): device, dtype) @precisionOverride({torch.float: 1e-6, torch.double: 1e-10}) - @dtypes(*get_all_fp_dtypes(include_half=False, include_bfloat16=False)) + @dtypes(*floating_types()) def test_logspace_vs_numpy(self, device, dtype): start = -0.0316082797944545745849609375 end = .0315315723419189453125 @@ -2786,43 +2822,49 @@ def test_tensor_ctor_device_inference(self, device): sparse_size, dtype=torch.float64) self.assertEqual(sparse_with_dtype.device, torch.device('cpu')) + def _test_signal_window_functions(self, name, dtype, device, **kwargs): + import scipy.signal as signal + + torch_method = getattr(torch, name + '_window') + if not dtype.is_floating_point: + with self.assertRaisesRegex(RuntimeError, r'floating point'): + torch_method(3, dtype=dtype) + return + for size in [0, 1, 2, 5, 10, 50, 100, 1024, 2048]: + for periodic in [True, False]: + res = torch_method(size, periodic=periodic, **kwargs, device=device, dtype=dtype) + # NB: scipy always returns a float64 result + ref = torch.from_numpy(signal.get_window((name, *(kwargs.values())), size, fftbins=periodic)) + self.assertEqual(res, ref, exact_dtype=False) + with self.assertRaisesRegex(RuntimeError, r'not implemented for sparse types'): + torch_method(3, layout=torch.sparse_coo) + self.assertTrue(torch_method(3, requires_grad=True).requires_grad) + self.assertFalse(torch_method(3).requires_grad) + @onlyNativeDeviceTypes @precisionOverride({torch.bfloat16: 5e-2, torch.half: 1e-3}) @unittest.skipIf(not TEST_SCIPY, "Scipy not found") @dtypesIfCUDA(torch.float, torch.double, torch.bfloat16, torch.half, torch.long) @dtypes(torch.float, torch.double, torch.long) - def test_signal_window_functions(self, device, dtype): - import scipy.signal as signal - - def test(name, kwargs): - torch_method = getattr(torch, name + '_window') - if not dtype.is_floating_point: - with self.assertRaisesRegex(RuntimeError, r'floating point'): - torch_method(3, dtype=dtype) - return - for size in [0, 1, 2, 5, 10, 50, 100, 1024, 2048]: - for periodic in [True, False]: - res = torch_method(size, periodic=periodic, **kwargs, device=device, dtype=dtype) - # NB: scipy always returns a float64 result - ref = torch.from_numpy(signal.get_window((name, *(kwargs.values())), size, fftbins=periodic)) - self.assertEqual(res, ref, exact_dtype=False) - with self.assertRaisesRegex(RuntimeError, r'not implemented for sparse types'): - torch_method(3, layout=torch.sparse_coo) - self.assertTrue(torch_method(3, requires_grad=True).requires_grad) - self.assertFalse(torch_method(3).requires_grad) - - for window in ['hann', 'hamming', 'bartlett', 'blackman']: - test(window, kwargs={}) + @parametrize("window", ['hann', 'hamming', 'bartlett', 'blackman']) + def test_signal_window_functions(self, device, dtype, window): + self._test_signal_window_functions(window, dtype, device) + @onlyNativeDeviceTypes + @precisionOverride({torch.bfloat16: 5e-2, torch.half: 1e-3}) + @unittest.skipIf(not TEST_SCIPY, "Scipy not found") + @dtypesIfCUDA(torch.float, torch.double, torch.bfloat16, torch.half, torch.long) + @dtypes(torch.float, torch.double, torch.long) + def test_kaiser_window(self, device, dtype): for num_test in range(50): - test('kaiser', kwargs={'beta': random.random() * 30}) + self._test_signal_window_functions('kaiser', dtype, device, beta=random.random() * 30) def test_tensor_factories_empty(self, device): # ensure we can create empty tensors from each factory function shapes = [(5, 0, 1), (0,), (0, 0, 1, 0, 
2, 0, 0)] for shape in shapes: - for dt in get_all_dtypes(): + for dt in all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16, torch.chalf): self.assertEqual(shape, torch.zeros(shape, device=device, dtype=dt).shape) self.assertEqual(shape, torch.zeros_like(torch.zeros(shape, device=device, dtype=dt)).shape) @@ -2843,7 +2885,8 @@ def test_tensor_factories_empty(self, device): self.assertEqual(shape, torch.randint(6, shape, device=device, dtype=dt).shape) self.assertEqual(shape, torch.randint_like(torch.zeros(shape, device=device, dtype=dt), 6).shape) - if dt not in {torch.double, torch.float, torch.half, torch.bfloat16, torch.complex64, torch.complex128}: + if dt not in {torch.double, torch.float, torch.half, torch.bfloat16, + torch.complex32, torch.complex64, torch.complex128}: self.assertRaises(RuntimeError, lambda: torch.rand(shape, device=device, dtype=dt).shape) if dt == torch.double or dt == torch.float or dt.is_complex: @@ -2908,8 +2951,8 @@ def test_arange_bfloat16(self, device): bfloat16_tensor = torch.arange(0, 6, step=2, dtype=torch.bfloat16, device=device) self.assertEqual(ref_tensor, bfloat16_tensor) - @dtypes(*get_all_dtypes(include_bool=False, include_half=False)) - @dtypesIfCUDA(*get_all_dtypes(include_bool=False, include_half=True)) + @dtypes(*all_types_and_complex_and(torch.bfloat16)) + @dtypesIfCUDA(*all_types_and_complex_and(torch.bfloat16)) def test_linspace(self, device, dtype): _from = random.random() to = _from + random.random() @@ -3026,12 +3069,12 @@ def _test_linspace(self, device, dtype, steps): # See NOTE [Linspace+Logspace precision override] @skipCPUIf(True, "compares with CPU") @precisionOverride({torch.half: 0.0039 + LINSPACE_LOGSPACE_EXTRA_EPS}) - @dtypes(*(get_all_fp_dtypes() + get_all_complex_dtypes())) + @dtypes(*floating_and_complex_types_and(torch.half, torch.bfloat16)) def test_linspace_device_vs_cpu(self, device, dtype): self._test_linspace(device, dtype, steps=10) @skipCPUIf(True, "compares with CPU") - @dtypes(*(get_all_fp_dtypes() + get_all_complex_dtypes())) + @dtypes(*floating_and_complex_types_and(torch.half, torch.bfloat16)) def test_linspace_special_steps(self, device, dtype): for steps in self.LINSPACE_LOGSPACE_SPECIAL_STEPS: self._test_linspace(device, dtype, steps=steps) @@ -3072,10 +3115,9 @@ def test_logspace_special_steps(self, device, dtype): self._test_logspace(device, dtype, steps=steps) self._test_logspace_base2(device, dtype, steps=steps) - @dtypes(*get_all_dtypes(include_bool=False, include_half=False, include_complex=False)) - @dtypesIfCUDA(*((get_all_int_dtypes() + [torch.float32, torch.float16, torch.bfloat16]) - if TEST_WITH_ROCM - else get_all_dtypes(include_bool=False, include_half=True, include_complex=False))) + @dtypes(*all_types_and(torch.bfloat16)) + @dtypesIfCUDA(*integral_types_and(torch.half, torch.bfloat16, torch.float32, torch.float64) if TEST_WITH_ROCM else + all_types_and(torch.half, torch.bfloat16)) def test_logspace(self, device, dtype): _from = random.random() to = _from + random.random() @@ -3335,7 +3377,7 @@ def test_normal_std_error(self, device): std = torch.tensor(-1, dtype=torch.float32, device=device) for input in [0, a]: - with self.assertRaisesRegex(RuntimeError, r'normal_ expects std >= 0.0'): + with self.assertRaisesRegex(RuntimeError, r'normal expects std >= 0.0, but found std'): torch.normal(input, -1, (10,)) with self.assertRaisesRegex(RuntimeError, r'normal expects all elements of std >= 0.0'): @@ -3453,7 +3495,7 @@ def seed(generator): self.assertTrue((res1 >= 0).all().item()) 
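# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch only, not part of the patch above or below.
# The recurring @dtypes(...) edits in this diff swap the deprecated
# get_all_*_dtypes() helpers for the composable ones in
# torch.testing._internal.common_dtype (the module the updated tests import
# from). A rough equivalence of what the new decorators expand to:
import torch
from torch.testing._internal.common_dtype import (
    floating_types, floating_types_and, all_types_and_complex_and)

fp_only = floating_types()                                  # torch.float32, torch.float64
fp_plus = floating_types_and(torch.half, torch.bfloat16)    # plus the listed extras
full_set = all_types_and_complex_and(                       # every integral, floating and
    torch.half, torch.bool, torch.bfloat16)                 # complex dtype, plus the extras
# ---------------------------------------------------------------------------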
@dtypes(torch.half, torch.float, torch.bfloat16, torch.double, - torch.complex64, torch.complex128) + torch.complex32, torch.complex64, torch.complex128) def test_randn(self, device, dtype): SIZE = 100 for size in [0, SIZE]: @@ -3464,7 +3506,7 @@ def test_randn(self, device, dtype): torch.randn(size, size, out=res2) self.assertEqual(res1, res2) - @dtypes(torch.float, torch.double, torch.complex64, torch.complex128) + @dtypes(torch.float, torch.double, torch.complex32, torch.complex64, torch.complex128) def test_rand(self, device, dtype): SIZE = 100 for size in [0, SIZE]: @@ -3488,9 +3530,13 @@ def test_randperm(self, device): for n in (5, 100, 50000, 100000): # Ensure both integer and floating-point numbers are tested. Half follows an execution path that is # different from others on CUDA. - for dtype in (torch.long, torch.half, torch.float): + for dtype in (torch.long, torch.half, torch.float, torch.bfloat16): if n > 2049 and dtype == torch.half: # Large n for torch.half will raise an exception, do not test here. continue + if dtype == torch.bfloat16 and device != 'cpu': + continue + if n > 256 and dtype == torch.bfloat16: + continue with torch.random.fork_rng(devices=rng_device): res1 = torch.randperm(n, dtype=dtype, device=device) res2 = torch.empty(0, dtype=dtype, device=device) @@ -3640,7 +3686,7 @@ def _run_test(self, shape, dtype, count=-1, first=0, offset=None, **kwargs): if offset is None: offset = first * get_dtype_size(dtype) - numpy_original = make_tensor(shape, torch.device("cpu"), dtype).numpy() + numpy_original = make_tensor(shape, dtype=dtype, device="cpu").numpy() original = memoryview(numpy_original) # First call PyTorch's version in case of errors. # If this call exits successfully, the NumPy version must also do so. @@ -3651,13 +3697,13 @@ def _run_test(self, shape, dtype, count=-1, first=0, offset=None, **kwargs): self.assertEqual(numpy_frombuffer.__array_interface__["data"][0], torch_frombuffer.data_ptr()) return (numpy_original, torch_frombuffer) - @dtypes(*torch_to_numpy_dtype_dict.keys()) + @dtypes(*set(numpy_to_torch_dtype_dict.values())) def test_same_type(self, device, dtype): self._run_test((), dtype) self._run_test((4,), dtype) self._run_test((10, 10), dtype) - @dtypes(*torch_to_numpy_dtype_dict.keys()) + @dtypes(*set(numpy_to_torch_dtype_dict.values())) def test_requires_grad(self, device, dtype): def _run_test_and_check_grad(requires_grad, *args, **kwargs): kwargs["requires_grad"] = requires_grad @@ -3672,14 +3718,14 @@ def _run_test_and_check_grad(requires_grad, *args, **kwargs): _run_test_and_check_grad(False, (4,), dtype) _run_test_and_check_grad(False, (10, 10), dtype) - @dtypes(*torch_to_numpy_dtype_dict.keys()) + @dtypes(*set(numpy_to_torch_dtype_dict.values())) def test_with_offset(self, device, dtype): # Offset should be valid whenever there is, at least, # one remaining element for i in range(SIZE): self._run_test(SHAPE, dtype, first=i) - @dtypes(*torch_to_numpy_dtype_dict.keys()) + @dtypes(*set(numpy_to_torch_dtype_dict.values())) def test_with_count(self, device, dtype): # Count should be valid for any valid in the interval # [-1, len(input)], except for 0 @@ -3687,7 +3733,7 @@ def test_with_count(self, device, dtype): if i != 0: self._run_test(SHAPE, dtype, count=i) - @dtypes(*torch_to_numpy_dtype_dict.keys()) + @dtypes(*set(numpy_to_torch_dtype_dict.values())) def test_with_count_and_offset(self, device, dtype): # Explicit default count [-1, 1, 2, ..., len] for i in range(-1, SIZE + 1): @@ -3703,7 +3749,7 @@ def test_with_count_and_offset(self, 
device, dtype): for j in range(SIZE - i + 1): self._run_test(SHAPE, dtype, count=i, first=j) - @dtypes(*torch_to_numpy_dtype_dict.keys()) + @dtypes(*set(numpy_to_torch_dtype_dict.values())) def test_invalid_positional_args(self, device, dtype): bytes = get_dtype_size(dtype) in_bytes = SIZE * bytes @@ -3740,9 +3786,9 @@ def test_invalid_positional_args(self, device, dtype): rf"buffer length \({in_bytes} bytes\)"): self._run_test(SHAPE, dtype, count=count, first=first) - @dtypes(*torch_to_numpy_dtype_dict.keys()) + @dtypes(*set(numpy_to_torch_dtype_dict.values())) def test_shared_buffer(self, device, dtype): - x = make_tensor((1,), device, dtype) + x = make_tensor((1,), dtype=dtype, device=device) # Modify the whole tensor arr, tensor = self._run_test(SHAPE, dtype) tensor[:] = x @@ -3767,15 +3813,15 @@ def test_shared_buffer(self, device, dtype): arr[first] = x.item() - 1 self.assertEqual(arr[first:last], tensor) - @dtypes(*torch_to_numpy_dtype_dict.keys()) + @dtypes(*set(numpy_to_torch_dtype_dict.values())) def test_not_a_buffer(self, device, dtype): with self.assertRaisesRegex(ValueError, r"object does not implement Python buffer protocol."): torch.frombuffer([1, 2, 3, 4], dtype=dtype) - @dtypes(*torch_to_numpy_dtype_dict.keys()) + @dtypes(*set(numpy_to_torch_dtype_dict.values())) def test_non_writable_buffer(self, device, dtype): - numpy_arr = make_tensor((1,), device, dtype).numpy() + numpy_arr = make_tensor((1,), dtype=dtype, device=device).numpy() byte_arr = numpy_arr.tobytes() with self.assertWarnsOnceRegex(UserWarning, r"The given buffer is not writable."): @@ -3852,7 +3898,7 @@ def _check(self, original, cvt=lambda t: t, is_alias=True, same_dtype=True, same self.assertEqual(result.requires_grad, kwargs.get("requires_grad", False)) def _test_alias_with_cvt(self, cvt, device, dtype, shape=(5, 5), only_with_dtype=False): - original = make_tensor(shape, device, dtype) + original = make_tensor(shape, dtype=dtype, device=device) def check(**kwargs): self._check(original, cvt=cvt, **kwargs) @@ -3873,28 +3919,28 @@ def check(**kwargs): # data pointer (which is basically the point here), since they all # return 0. @skipMeta - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_alias_from_tensor(self, device, dtype): self._test_alias_with_cvt(identity, device, dtype) @onlyCPU - @dtypes(*torch_to_numpy_dtype_dict.keys()) + @dtypes(*set(numpy_to_torch_dtype_dict.values())) def test_alias_from_numpy(self, device, dtype): self._test_alias_with_cvt(to_numpy, device, dtype) # Skipping 'meta', since 'to_dlpack' does not work for them. 
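# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch only, not part of the patch above or below.
# Another recurring change is that make_tensor call sites stop passing device
# and dtype positionally and pass them as keywords instead, matching the
# documented keyword-only dtype/device parameters of torch.testing.make_tensor.
# A hypothetical example of the new call form:
import torch
from torch.testing import make_tensor

t = make_tensor((5, 5), dtype=torch.float32, device="cpu",
                low=-9, high=9, noncontiguous=True)
# The pre-patch spelling was make_tensor((5, 5), device, dtype, ...); the
# keyword form above is what this diff switches to throughout.
# ---------------------------------------------------------------------------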
@skipMeta - @dtypes(*get_all_dtypes(include_bool=False)) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16)) def test_alias_from_dlpack(self, device, dtype): self._test_alias_with_cvt(to_dlpack, device, dtype) @onlyCPU - @dtypes(*torch_to_numpy_dtype_dict.keys()) + @dtypes(*set(numpy_to_torch_dtype_dict.values())) def test_alias_from_buffer(self, device, dtype): self._test_alias_with_cvt(to_memview, device, dtype, shape=(5,), only_with_dtype=True) def _test_copy_with_cvt(self, cvt, device, dtype, shape=(5, 5), only_with_dtype=False): - original = make_tensor(shape, device, dtype) + original = make_tensor(shape, dtype=dtype, device=device) def check(**kwargs): self._check(original, cvt=cvt, is_alias=False, **kwargs) @@ -3916,35 +3962,35 @@ def check(**kwargs): # Copy is forced because of different dtype if not only_with_dtype: - for other in get_all_dtypes(): + for other in all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16): if dtype != other: check(same_dtype=False, dtype=other) check(same_dtype=False, dtype=other, copy=True) @skipMeta - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_copy_tensor(self, device, dtype): self._test_copy_with_cvt(identity, device, dtype) @onlyCPU - @dtypes(*torch_to_numpy_dtype_dict.keys()) + @dtypes(*set(numpy_to_torch_dtype_dict.values())) def test_copy_from_numpy(self, device, dtype): self._test_copy_with_cvt(to_numpy, device, dtype) @skipMeta - @dtypes(*get_all_dtypes(include_bool=False)) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16)) def test_copy_from_dlpack(self, device, dtype): self._test_copy_with_cvt(to_dlpack, device, dtype) @onlyCPU - @dtypes(*torch_to_numpy_dtype_dict.keys()) + @dtypes(*set(numpy_to_torch_dtype_dict.values())) def test_copy_from_buffer(self, device, dtype): self._test_copy_with_cvt(to_memview, device, dtype, shape=(5,), only_with_dtype=True) def _test_copy_mult_devices(self, devices, dtype, cvt): cuda1 = devices[0] cuda2 = devices[1] - original = make_tensor((5, 5), cuda1, dtype) + original = make_tensor((5, 5), dtype=dtype, device=cuda1) def check(**kwargs): self._check(original, cvt, is_alias=False, same_device=False, device=cuda2, **kwargs) @@ -3955,19 +4001,19 @@ def check(**kwargs): @onlyCUDA @deviceCountAtLeast(2) - @dtypes(*get_all_dtypes(include_bool=False)) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16)) def test_copy_from_tensor_mult_devices(self, devices, dtype): self._test_copy_mult_devices(devices, dtype, identity) @onlyCUDA @deviceCountAtLeast(2) - @dtypes(*get_all_dtypes(include_bool=False)) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16)) def test_copy_from_dlpack_mult_devices(self, devices, dtype): self._test_copy_mult_devices(devices, dtype, to_dlpack) - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_copy_list(self, device, dtype): - original = make_tensor((5, 5), torch.device("cpu"), dtype) + original = make_tensor((5, 5), dtype=dtype, device=torch.device("cpu")) def check(**kwargs): self._check(original, torch.Tensor.tolist, is_alias=False, **kwargs) @@ -3980,7 +4026,7 @@ def check(**kwargs): @dtypes(torch.float32) def test_unsupported_alias(self, device, dtype): - original = make_tensor((5, 5), device, dtype) + original = make_tensor((5, 5), dtype=dtype, device=device) if torch.cuda.is_available(): other_device = get_another_device(device) @@ -4001,14 +4047,14 @@ def test_unsupported_alias(self, 
device, dtype): @dtypes(torch.float32) def test_unsupported_alias_mult_devices(self, devices, dtype): dev1, dev2 = devices[:2] - original = make_tensor((5, 5), dev1, dtype) + original = make_tensor((5, 5), dtype=dtype, device=dev1) with self.assertRaisesRegex(ValueError, f"from device '{dev1}' to '{dev2}'"): torch.asarray(original, device=dev2, copy=False) @dtypes(torch.float32, torch.complex64) def test_retain_autograd_history(self, device, dtype): - original = make_tensor((5, 5), device, dtype, requires_grad=True) + original = make_tensor((5, 5), dtype=dtype, device=device, requires_grad=True) # 'cloned' has 'grad_fn=' cloned = original.clone() @@ -4046,6 +4092,8 @@ def test_astensor_consistency(self, device): [0.0, True, False, 42], # With Complex [0.0, True, False, 42, 5j], + # With Range + range(5), ] for e in examples: diff --git a/test/test_tensorboard.py b/test/test_tensorboard.py index 4300e9a71006..45bcef536e4f 100644 --- a/test/test_tensorboard.py +++ b/test/test_tensorboard.py @@ -42,7 +42,7 @@ skipIfNoMatplotlib = unittest.skipIf(not TEST_MATPLOTLIB, "no matplotlib") import torch -from torch.testing._internal.common_utils import TestCase, run_tests, TEST_WITH_ASAN +from torch.testing._internal.common_utils import TestCase, run_tests, TEST_WITH_ASAN, TEST_WITH_CROSSREF def tensor_N(shape, dtype=float): numel = np.prod(shape) @@ -54,6 +54,8 @@ class BaseTestCase(TestCase): def setUp(self): if not TEST_TENSORBOARD: return self.skipTest("Skip the test since TensorBoard is not installed") + if TEST_WITH_CROSSREF: + return self.skipTest("Don't run TensorBoard tests with crossref") self.temp_dirs = [] def createSummaryWriter(self): @@ -562,15 +564,15 @@ def forward(self, x): expected_proto = GraphDef() text_format.Parse(expected_str, expected_proto) - self.assertEquals(len(expected_proto.node), len(actual_proto.node)) + self.assertEqual(len(expected_proto.node), len(actual_proto.node)) for i in range(len(expected_proto.node)): expected_node = expected_proto.node[i] actual_node = actual_proto.node[i] - self.assertEquals(expected_node.name, actual_node.name) - self.assertEquals(expected_node.op, actual_node.op) - self.assertEquals(expected_node.input, actual_node.input) - self.assertEquals(expected_node.device, actual_node.device) - self.assertEquals( + self.assertEqual(expected_node.name, actual_node.name) + self.assertEqual(expected_node.op, actual_node.op) + self.assertEqual(expected_node.input, actual_node.input) + self.assertEqual(expected_node.device, actual_node.device) + self.assertEqual( sorted(expected_node.attr.keys()), sorted(actual_node.attr.keys())) def test_nested_nn_squential(self): diff --git a/test/test_tensorexpr.py b/test/test_tensorexpr.py index 42ca49dc3475..8a5e918eda4b 100644 --- a/test/test_tensorexpr.py +++ b/test/test_tensorexpr.py @@ -13,11 +13,13 @@ class BaseTestClass(JitTestCase): def setUp(self): + super(BaseTestClass, self).setUp() self.tensorexpr_options = TensorExprTestOptions() self.devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] def tearDown(self): self.tensorexpr_options.restore() + super(BaseTestClass, self).tearDown() def assertLastGraphAllFused(self): self.assertAllFused(torch.jit.last_executed_optimized_graph()) diff --git a/test/test_tensorexpr_pybind.py b/test/test_tensorexpr_pybind.py index 00f97399edd7..486858d310a3 100644 --- a/test/test_tensorexpr_pybind.py +++ b/test/test_tensorexpr_pybind.py @@ -348,20 +348,14 @@ def f(a): """ graph = torch._C.parse_ir(graph_str) - def my_custom_lowering(inputs, out_shape, 
out_type, device): - def get_dim_args(dims): - dim_args = [] - for dim in dims: - dim_args.append(te.DimArg(dim, "i" + str(len(dim_args)))) - return dim_args - + def my_custom_lowering(inputs, out_shape, out_stride, out_type, device): def compute(idxs): load = inputs[0].as_buf().load(idxs) return te.ifThenElse( te.ExprHandle.isnan(load), te.ExprHandle.float(0.0), load ) - return te.Compute2("custom_nan_to_num", get_dim_args(out_shape), compute) + return te.Compute2("custom_nan_to_num", out_shape, compute) kernel = te.TensorExprKernel(graph, {"aten::nan_to_num": my_custom_lowering}) res1 = kernel.run((x,)) diff --git a/test/test_testing.py b/test/test_testing.py index 3cfef8cee395..25f53e5e91ae 100644 --- a/test/test_testing.py +++ b/test/test_testing.py @@ -15,21 +15,20 @@ from torch.testing import make_tensor from torch.testing._internal.common_utils import \ (IS_FBCODE, IS_SANDCASTLE, IS_WINDOWS, TestCase, run_tests, skipIfRocm, slowTest, - parametrize, subtest, instantiate_parametrized_tests, dtype_name) + parametrize, subtest, instantiate_parametrized_tests, dtype_name, TEST_WITH_ROCM) from torch.testing._internal.common_device_type import \ (PYTORCH_TESTING_DEVICE_EXCEPT_FOR_KEY, PYTORCH_TESTING_DEVICE_ONLY_FOR_KEY, dtypes, get_device_type_test_bases, instantiate_device_type_tests, onlyCUDA, onlyNativeDeviceTypes, deviceCountAtLeast, ops, expectedFailureMeta) from torch.testing._internal.common_methods_invocations import op_db import torch.testing._internal.opinfo_helper as opinfo_helper -from torch.testing._internal.common_dtype import get_all_dtypes +from torch.testing._internal.common_dtype import all_types_and_complex_and from torch.testing._internal.common_modules import modules, module_db # For testing TestCase methods and torch.testing functions class TestTesting(TestCase): # Ensure that assertEqual handles numpy arrays properly - @dtypes(*(get_all_dtypes(include_half=True, include_bfloat16=False, - include_bool=True, include_complex=True))) + @dtypes(*all_types_and_complex_and(torch.bool, torch.half)) def test_assertEqual_numpy(self, device, dtype): S = 10 test_sizes = [ @@ -40,7 +39,7 @@ def test_assertEqual_numpy(self, device, dtype): (0, S), (S, 0)] for test_size in test_sizes: - a = make_tensor(test_size, device, dtype, low=-5, high=5) + a = make_tensor(test_size, dtype=dtype, device=device, low=-5, high=5) a_n = a.cpu().numpy() msg = f'size: {test_size}' self.assertEqual(a_n, a, rtol=0, atol=0, msg=msg) @@ -255,7 +254,7 @@ def test_make_tensor(self, device, dtype): def check(size, low, high, requires_grad, noncontiguous): if dtype not in [torch.float, torch.cfloat]: requires_grad = False - t = make_tensor(size, device, dtype, low=low, high=high, + t = make_tensor(size, dtype=dtype, device=device, low=low, high=high, requires_grad=requires_grad, noncontiguous=noncontiguous) self.assertEqual(t.shape, size) @@ -279,10 +278,16 @@ def check(size, low, high, requires_grad, noncontiguous): check(size, None, None, False, False) check(size, 2, 4, True, True) + def test_make_tensor_complex32(self, device): + # verify that we can generate torch.complex32 tensor + t = make_tensor((1, 2, 3), dtype=torch.complex32, device=device) + self.assertEqual(t.dtype, torch.complex32) + # The following tests (test_cuda_assert_*) are added to ensure test suite terminates early # when CUDA assert was thrown. Because all subsequent test will fail if that happens. # These tests are slow because it spawn another process to run test suite. 
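# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch only, not part of the patch above or below.
# Several suites in this diff (the signal window functions, test_broadcast)
# move from looping over cases inside a single test to the parametrize
# decorator from torch.testing._internal.common_utils, which generates one
# test per parameter value. A hypothetical, minimal standalone example:
from torch.testing._internal.common_utils import (
    TestCase, parametrize, instantiate_parametrized_tests, run_tests)

class ExampleWindowNames(TestCase):
    @parametrize("window", ["hann", "hamming", "bartlett", "blackman"])
    def test_window_name_is_lowercase(self, window):
        # parametrize generates a separate test for each window value
        self.assertEqual(window, window.lower())

instantiate_parametrized_tests(ExampleWindowNames)

if __name__ == "__main__":
    run_tests()
# ---------------------------------------------------------------------------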
# See: https://github.com/pytorch/pytorch/issues/49019 + @unittest.skipIf(TEST_WITH_ROCM, "ROCm doesn't support device side asserts") @onlyCUDA @slowTest def test_cuda_assert_should_stop_common_utils_test_suite(self, device): @@ -316,6 +321,7 @@ def test_trivial_passing_test_case_on_cpu_cuda(self): self.assertIn('errors=1', stderr) + @unittest.skipIf(TEST_WITH_ROCM, "ROCm doesn't support device side asserts") @onlyCUDA @slowTest def test_cuda_assert_should_stop_common_device_type_test_suite(self, device): @@ -356,6 +362,7 @@ def test_trivial_passing_test_case_on_cpu_cuda(self, device): self.assertIn('errors=1', stderr) + @unittest.skipIf(TEST_WITH_ROCM, "ROCm doesn't support device side asserts") @onlyCUDA @slowTest def test_cuda_assert_should_not_stop_common_distributed_test_suite(self, device): @@ -403,10 +410,10 @@ def test_get_supported_dtypes(self, device): ops_to_test = list(filter(lambda op: op.name in ['atan2', 'topk', 'xlogy'], op_db)) for op in ops_to_test: - dynamic_dtypes = opinfo_helper.get_supported_dtypes(op.op, op.sample_inputs_func, self.device_type) + dynamic_dtypes = opinfo_helper.get_supported_dtypes(op, op.sample_inputs_func, self.device_type) dynamic_dispatch = opinfo_helper.dtypes_dispatch_hint(dynamic_dtypes) if self.device_type == 'cpu': - dtypes = op.dtypesIfCPU + dtypes = op.dtypes else: # device_type ='cuda' dtypes = op.dtypesIfCUDA @@ -574,11 +581,10 @@ def test_unknown_layout(self): def test_meta(self): actual = torch.empty((2, 2), device="meta") - expected = actual.clone() + expected = torch.empty((2, 2), device="meta") for fn in assert_close_with_inputs(actual, expected): - with self.assertRaisesRegex(NotImplementedError, "meta"): - fn() + fn() def test_mismatching_layout(self): strided = torch.empty((2, 2)) @@ -1085,10 +1091,7 @@ def test_matching(self): col_indices = (1, 0) values = (1, 2) actual = torch.sparse_csr_tensor(crow_indices, col_indices, values, size=(2, 2)) - # TODO: replace this by actual.clone() after https://github.com/pytorch/pytorch/issues/59285 is fixed - expected = torch.sparse_csr_tensor( - actual.crow_indices(), actual.col_indices(), actual.values(), size=actual.size(), device=actual.device - ) + expected = actual.clone() for fn in assert_close_with_inputs(actual, expected): fn() @@ -1139,6 +1142,180 @@ def test_mismatching_values_msg(self): fn() +@unittest.skipIf(IS_FBCODE or IS_SANDCASTLE, "Not all sandcastle jobs support CSC testing") +class TestAssertCloseSparseCSC(TestCase): + def test_matching(self): + ccol_indices = (0, 1, 2) + row_indices = (1, 0) + values = (1, 2) + actual = torch.sparse_csc_tensor(ccol_indices, row_indices, values, size=(2, 2)) + expected = actual.clone() + + for fn in assert_close_with_inputs(actual, expected): + fn() + + def test_mismatching_ccol_indices_msg(self): + actual_ccol_indices = (0, 1, 2) + actual_row_indices = (1, 0) + actual_values = (1, 2) + actual = torch.sparse_csc_tensor(actual_ccol_indices, actual_row_indices, actual_values, size=(2, 2)) + + expected_ccol_indices = (0, 2, 2) + expected_row_indices = actual_row_indices + expected_values = actual_values + expected = torch.sparse_csc_tensor(expected_ccol_indices, expected_row_indices, expected_values, size=(2, 2)) + + for fn in assert_close_with_inputs(actual, expected): + with self.assertRaisesRegex(AssertionError, re.escape("Sparse CSC ccol_indices")): + fn() + + def test_mismatching_row_indices_msg(self): + actual_ccol_indices = (0, 1, 2) + actual_row_indices = (1, 0) + actual_values = (1, 2) + actual = 
torch.sparse_csc_tensor(actual_ccol_indices, actual_row_indices, actual_values, size=(2, 2)) + + expected_ccol_indices = actual_ccol_indices + expected_row_indices = (1, 1) + expected_values = actual_values + expected = torch.sparse_csc_tensor(expected_ccol_indices, expected_row_indices, expected_values, size=(2, 2)) + + for fn in assert_close_with_inputs(actual, expected): + with self.assertRaisesRegex(AssertionError, re.escape("Sparse CSC row_indices")): + fn() + + def test_mismatching_values_msg(self): + actual_ccol_indices = (0, 1, 2) + actual_row_indices = (1, 0) + actual_values = (1, 2) + actual = torch.sparse_csc_tensor(actual_ccol_indices, actual_row_indices, actual_values, size=(2, 2)) + + expected_ccol_indices = actual_ccol_indices + expected_row_indices = actual_row_indices + expected_values = (1, 3) + expected = torch.sparse_csc_tensor(expected_ccol_indices, expected_row_indices, expected_values, size=(2, 2)) + + for fn in assert_close_with_inputs(actual, expected): + with self.assertRaisesRegex(AssertionError, re.escape("Sparse CSC values")): + fn() + + +@unittest.skipIf(IS_FBCODE or IS_SANDCASTLE, "Not all sandcastle jobs support BSR testing") +class TestAssertCloseSparseBSR(TestCase): + def test_matching(self): + crow_indices = (0, 1, 2) + col_indices = (1, 0) + values = ([[1]], [[2]]) + actual = torch.sparse_bsr_tensor(crow_indices, col_indices, values, size=(2, 2)) + expected = actual.clone() + + for fn in assert_close_with_inputs(actual, expected): + fn() + + def test_mismatching_crow_indices_msg(self): + actual_crow_indices = (0, 1, 2) + actual_col_indices = (1, 0) + actual_values = ([[1]], [[2]]) + actual = torch.sparse_bsr_tensor(actual_crow_indices, actual_col_indices, actual_values, size=(2, 2)) + + expected_crow_indices = (0, 2, 2) + expected_col_indices = actual_col_indices + expected_values = actual_values + expected = torch.sparse_bsr_tensor(expected_crow_indices, expected_col_indices, expected_values, size=(2, 2)) + + for fn in assert_close_with_inputs(actual, expected): + with self.assertRaisesRegex(AssertionError, re.escape("Sparse BSR crow_indices")): + fn() + + def test_mismatching_col_indices_msg(self): + actual_crow_indices = (0, 1, 2) + actual_col_indices = (1, 0) + actual_values = ([[1]], [[2]]) + actual = torch.sparse_bsr_tensor(actual_crow_indices, actual_col_indices, actual_values, size=(2, 2)) + + expected_crow_indices = actual_crow_indices + expected_col_indices = (1, 1) + expected_values = actual_values + expected = torch.sparse_bsr_tensor(expected_crow_indices, expected_col_indices, expected_values, size=(2, 2)) + + for fn in assert_close_with_inputs(actual, expected): + with self.assertRaisesRegex(AssertionError, re.escape("Sparse BSR col_indices")): + fn() + + def test_mismatching_values_msg(self): + actual_crow_indices = (0, 1, 2) + actual_col_indices = (1, 0) + actual_values = ([[1]], [[2]]) + actual = torch.sparse_bsr_tensor(actual_crow_indices, actual_col_indices, actual_values, size=(2, 2)) + + expected_crow_indices = actual_crow_indices + expected_col_indices = actual_col_indices + expected_values = ([[1]], [[3]]) + expected = torch.sparse_bsr_tensor(expected_crow_indices, expected_col_indices, expected_values, size=(2, 2)) + + for fn in assert_close_with_inputs(actual, expected): + with self.assertRaisesRegex(AssertionError, re.escape("Sparse BSR values")): + fn() + + +@unittest.skipIf(IS_FBCODE or IS_SANDCASTLE, "Not all sandcastle jobs support BSC testing") +class TestAssertCloseSparseBSC(TestCase): + def test_matching(self): + 
ccol_indices = (0, 1, 2) + row_indices = (1, 0) + values = ([[1]], [[2]]) + actual = torch.sparse_bsc_tensor(ccol_indices, row_indices, values, size=(2, 2)) + expected = actual.clone() + + for fn in assert_close_with_inputs(actual, expected): + fn() + + def test_mismatching_ccol_indices_msg(self): + actual_ccol_indices = (0, 1, 2) + actual_row_indices = (1, 0) + actual_values = ([[1]], [[2]]) + actual = torch.sparse_bsc_tensor(actual_ccol_indices, actual_row_indices, actual_values, size=(2, 2)) + + expected_ccol_indices = (0, 2, 2) + expected_row_indices = actual_row_indices + expected_values = actual_values + expected = torch.sparse_bsc_tensor(expected_ccol_indices, expected_row_indices, expected_values, size=(2, 2)) + + for fn in assert_close_with_inputs(actual, expected): + with self.assertRaisesRegex(AssertionError, re.escape("Sparse BSC ccol_indices")): + fn() + + def test_mismatching_row_indices_msg(self): + actual_ccol_indices = (0, 1, 2) + actual_row_indices = (1, 0) + actual_values = ([[1]], [[2]]) + actual = torch.sparse_bsc_tensor(actual_ccol_indices, actual_row_indices, actual_values, size=(2, 2)) + + expected_ccol_indices = actual_ccol_indices + expected_row_indices = (1, 1) + expected_values = actual_values + expected = torch.sparse_bsc_tensor(expected_ccol_indices, expected_row_indices, expected_values, size=(2, 2)) + + for fn in assert_close_with_inputs(actual, expected): + with self.assertRaisesRegex(AssertionError, re.escape("Sparse BSC row_indices")): + fn() + + def test_mismatching_values_msg(self): + actual_ccol_indices = (0, 1, 2) + actual_row_indices = (1, 0) + actual_values = ([[1]], [[2]]) + actual = torch.sparse_bsc_tensor(actual_ccol_indices, actual_row_indices, actual_values, size=(2, 2)) + + expected_ccol_indices = actual_ccol_indices + expected_row_indices = actual_row_indices + expected_values = ([[1]], [[3]]) + expected = torch.sparse_bsc_tensor(expected_ccol_indices, expected_row_indices, expected_values, size=(2, 2)) + + for fn in assert_close_with_inputs(actual, expected): + with self.assertRaisesRegex(AssertionError, re.escape("Sparse BSC values")): + fn() + + class TestAssertCloseQuantized(TestCase): def test_mismatching_is_quantized(self): actual = torch.tensor(1.0) @@ -1463,7 +1640,7 @@ def test_op_parametrized(self, device, dtype, op, flag): device_cls = locals()['TestParametrized{}'.format(device.upper())] expected_test_names = [] for op in op_db: - for dtype in op.default_test_dtypes(device): + for dtype in op.supported_dtypes(torch.device(device).type): for flag_part in ('flag_disabled', 'flag_enabled'): expected_name = '{}.test_op_parametrized_{}_{}_{}_{}'.format( device_cls.__name__, op.formatted_name, flag_part, device, dtype_name(dtype)) diff --git a/test/test_torch.py b/test/test_torch.py index 164e6585f164..3e2bc5b03a3b 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -34,16 +34,16 @@ TestCase, TEST_WITH_ROCM, run_tests, IS_WINDOWS, IS_FILESYSTEM_UTF8_ENCODING, NO_MULTIPROCESSING_SPAWN, IS_SANDCASTLE, IS_FBCODE, IS_REMOTE_GPU, load_tests, slowTest, - skipCUDAMemoryLeakCheckIf, BytesIOContext, noarchTest, + TEST_WITH_CROSSREF, + skipCUDAMemoryLeakCheckIf, BytesIOContext, skipIfRocm, skipIfNoSciPy, TemporaryFileName, TemporaryDirectoryName, wrapDeterministicFlagAPITest, DeterministicGuard, CudaSyncGuard, - skipIfNotRegistered, bytes_to_scalar) + skipIfNotRegistered, bytes_to_scalar, parametrize, skipIfMps) from multiprocessing.reduction import ForkingPickler from torch.testing._internal.common_device_type import ( 
expectedFailureMeta, expectedFailureXLA, instantiate_device_type_tests, - skipCUDAVersionIn, onlyCUDA, onlyCPU, dtypes, dtypesIfCUDA, dtypesIfCPU, deviceCountAtLeast, skipMeta, @@ -52,9 +52,11 @@ from typing import Tuple import torch.backends.quantized import torch.testing._internal.data -from torch.testing._internal.common_cuda import tf32_on_and_off, tf32_is_not_fp32 +from torch.testing._internal.common_cuda import ( + tf32_on_and_off, tf32_is_not_fp32, TEST_CUDNN) from torch.testing._internal.common_dtype import ( - get_all_fp_dtypes, get_all_int_dtypes, get_all_math_dtypes, get_all_dtypes, get_all_complex_dtypes + floating_types_and, get_all_math_dtypes, all_types_and_complex_and, complex_types, + all_types_and, floating_types, floating_and_complex_types, integral_types, ) # Protects against includes accidentally setting the default dtype @@ -116,19 +118,6 @@ def test_cuda_vitals_gpu_only(self, device): class TestTorchDeviceType(TestCase): exact_dtype = True - # FIXME: Port this to ErrorInputs on where - @onlyCUDA - @dtypes(torch.float32) - def test_where_invalid_device(self, device, dtype): - for devices in [('cpu', device, device), (device, 'cpu', 'cpu'), - (device, 'cpu', device), ('cpu', device, 'cpu')]: - condition = make_tensor(16, device=devices[0], dtype=torch.float32) - x = make_tensor(16, device=devices[1], dtype=torch.float32) - y = make_tensor(16, device=devices[2], dtype=torch.float32) - with self.assertRaisesRegex(RuntimeError, - "Expected condition, x and y to be on the same device"): - torch.where(condition, x, y) - # TODO: move all tensor creation to common ops def _rand_shape(self, dim, min_size, max_size): shape = [] @@ -174,7 +163,7 @@ def rand_byte(): torch.bool, torch.float32, torch.complex64, torch.float64, torch.complex128) def test_storage(self, device, dtype): - v = make_tensor((3, 5), device, dtype, low=-9, high=9) + v = make_tensor((3, 5), dtype=dtype, device=device, low=-9, high=9) self.assertEqual(v.storage()[0], v[0][0]) self.assertEqual(v.storage()[14], v[2][4]) v_s = v.storage() @@ -233,16 +222,26 @@ def test_storage_setitem(self, device, dtype): self.assertEqual(s, storage_type(l)) @onlyNativeDeviceTypes - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) + def test_tensor_storage_type(self, device, dtype): + a = make_tensor((10,), dtype=dtype, device=device, low=-9, high=9) + + module = torch.cuda if (torch.device(device).type == 'cuda') else torch + expected_storage_type = getattr(module, torch.storage._dtype_to_storage_type_map()[dtype]) + + self.assertEqual(a.storage_type(), expected_storage_type) + + @onlyNativeDeviceTypes + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_tensor_from_storage(self, device, dtype): - a = make_tensor((4, 5, 3), device, dtype, low=-9, high=9) + a = make_tensor((4, 5, 3), dtype=dtype, device=device, low=-9, high=9) a_s = a.storage() b = torch.tensor(a_s, device=device, dtype=dtype).reshape(a.size()) self.assertEqual(a, b) c = torch.tensor(a_s._untyped(), device=device, dtype=dtype).reshape(a.size()) self.assertEqual(a, c) - for error_dtype in get_all_dtypes(): + for error_dtype in all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16): if error_dtype == dtype: continue with self.assertRaisesRegex(RuntimeError, r'Expected a Storage of type'): @@ -250,16 +249,16 @@ def test_tensor_from_storage(self, device, dtype): torch.tensor(error_storage, device=device, dtype=dtype) @onlyNativeDeviceTypes - 
@dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_set_storage(self, device, dtype): - a = make_tensor((4, 5, 3), device, dtype, low=-9, high=9) + a = make_tensor((4, 5, 3), dtype=dtype, device=device, low=-9, high=9) a_s = a.storage() b = torch.tensor([], device=device, dtype=dtype).set_(a_s).reshape(a.size()) self.assertEqual(a, b) c = torch.tensor([], device=device, dtype=dtype).set_(a_s._untyped()).reshape(a.size()) self.assertEqual(a, c) - for error_dtype in get_all_dtypes(): + for error_dtype in all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16): if error_dtype == dtype: continue with self.assertRaisesRegex(RuntimeError, r'Expected a Storage of type'): @@ -460,26 +459,12 @@ def test_scalar_check(self, device): self.assertEqual((), torch.cummax(zero_d, 0)[0].shape) self.assertEqual((), torch.cummin(zero_d, 0)[0].shape) - # renorm - self.assertRaises(RuntimeError, lambda: torch.renorm(zero_d, 0.5, 0, 1.0)) - # sort, topk self.assertEqual([(), ()], [x.shape for x in torch.sort(zero_d, 0, False)]) self.assertEqual([(), ()], [x.shape for x in torch.sort(zero_d, 0, True)]) self.assertEqual([(), ()], [x.shape for x in torch.topk(zero_d, 1, 0, False)]) self.assertEqual([(), ()], [x.shape for x in torch.topk(zero_d, 1, 0, True)]) - # lstsq (gels) - self.assertRaises(RuntimeError, lambda: torch.lstsq(zero_d, zero_d)) - - # eig - self.assertRaises(RuntimeError, lambda: torch.eig(zero_d, False)) - self.assertRaises(RuntimeError, lambda: torch.eig(zero_d, True)) - - # this is only implemented on cpu - if (torch.device(device).type == 'cpu'): - self.assertRaises(RuntimeError, lambda: torch.ormqr(zero_d, zero_d, zero_d)) - # max, min self.assertEqual((), torch.max(zero_d, zero_d).shape) self.assertEqual((1,), torch.max(one_d, zero_d).shape) @@ -488,9 +473,6 @@ def test_scalar_check(self, device): self.assertEqual((1,), torch.min(one_d, zero_d).shape) self.assertEqual((1,), torch.min(zero_d, one_d).shape) - # diag - self.assertRaises(RuntimeError, lambda: torch.diag(zero_d)) - zero_d_int = torch.tensor(1, device=device) one_d_int = torch.tensor([1], device=device) @@ -647,6 +629,7 @@ def test_scalar_check(self, device): self.assertEqual((), torch.nn.functional.multi_margin_loss(input, target, reduction='sum').shape) # Uses mismatched arange out size to trigger a warning + @unittest.skipIf(TEST_WITH_CROSSREF, "crossref perturbs line numbering") def test_cpp_warnings_have_python_context(self, device): # Creates long string in advance to avoid a too-long Python line s = ".+Triggered internally at.+RangeFactories.+" @@ -793,158 +776,159 @@ def test_is_set_to(self, device): self.assertFalse(t1.is_set_to(t2)) self.assertFalse(t2.is_set_to(t1)) - def test_broadcast(self, device): - - # all functions - fns = { - "dist", "atan2", "pow", "lerp", "add", - "sub", "mul", "div", "fmod", "remainder", - "eq", "ge", "gt", "le", "lt", "max", "min", "ne", - "addcdiv", "addcmul", "masked_scatter", "masked_select", "masked_fill", - "map", "map2", "copy" - } + # See https://github.com/pytorch/pytorch/issues/72650 + @skipIfMps + @skipMeta + @parametrize( + "fn", + [ + "dist", "atan2", "pow", "lerp", "add", "sub", "mul", "div", "fmod", "remainder", "eq", "ge", "gt", "le", + "lt", "max", "min", "ne", "addcdiv", "addcmul", "masked_scatter", "masked_select", "masked_fill", "map", + "map2", "copy", + ], + ) + def test_broadcast(self, fn, device): # functions with three tensor arguments fns_3_args = {"map2"} fns_value_kwarg = {"addcdiv", "addcmul"} - 
for fn in fns: - (dims_small, dims_large, dims_full) = self._select_broadcastable_dims() - full1d = torch.randn(*dims_full, device=device).flatten().float() - small = torch.randn(*dims_small, device=device).float() - large = torch.randn(*dims_large, device=device).float() - small_expanded = small.expand(*dims_full) - large_expanded = large.expand(*dims_full) - small2 = None - small2_expanded = None - if fn in fns_3_args or fn in fns_value_kwarg: - # create another smaller tensor - (dims_small2, _, _) = self._select_broadcastable_dims(dims_full) - small2 = torch.randn(*dims_small2, device=device).float() - small2_expanded = small2.expand(*dims_full) - - if small.is_cuda and fn in ['map', 'map2']: - # map and map2 are not implementd on CUDA tensors - continue - - if hasattr(large_expanded, fn): - # run through tensor versions of functions - # and verify fully expanded inputs give same results - expanded = {large: large_expanded, small: small_expanded, small2: small2_expanded} - - def tensorfn(myfn, t1, t2): - if fn == "lerp": - return myfn(t1, 0.5) - elif fn == "masked_select": - return myfn(t1 < 0) - elif fn == "masked_scatter": - return myfn(t1 < 0.5, full1d) - elif fn == "masked_fill": - return myfn(t1 < 0.5, 1.0) - elif fn in fns_3_args: - return myfn(1, t1, t2) - elif fn in fns_value_kwarg: - return myfn(t1, t2, value=1) - else: - return myfn(t1) - - # test various orders - for first, second, third in [(large, small, small2), (small, large, small2), - (small2, small, large), (small2, large, small)]: - if first is None: - break # ignore last iter when small2 is None - method_expanded = getattr(expanded[first], fn) - method = getattr(first, fn) - r1 = tensorfn(method_expanded, expanded[second], expanded[third]) - r2 = tensorfn(method, second, third) - self.assertEqual(r1, r2) - - # now for torch. 
versions of functions - if hasattr(torch, fn): - fntorch = getattr(torch, fn) - expanded = {large: large_expanded, small: small_expanded, small2: small2_expanded} - - def torchfn(t1, t2, t3): - if fn == "lerp": - return fntorch(t1, t2, 0.5) - elif fn == "masked_select": - return fntorch(t1, t2 < 0) - elif fn == "masked_scatter": - return fntorch(t1, t2 < 0.5, full1d) - elif fn == "masked_fill": - return fntorch(t1, t2 < 0.5, 1.0) - elif fn in fns_3_args: - return fntorch(t1, 1.0, t2, t3) - elif fn in fns_value_kwarg: - return fntorch(t1, t2, t3, value=1.0) - else: - return fntorch(t1, t2) - - # test various orders - for first, second, third in [(large, small, small2), (small, large, small2), - (small2, small, large), (small2, large, small)]: - if first is None: - break # ignore last iter when small2 is None - r1 = torchfn(expanded[first], expanded[second], expanded[third]) - r2 = torchfn(first, second, third) - self.assertEqual(r1, r2) - - # now for in place functions - # in-place tensor is not broadcastable; test only guaranteed - # to work by broadcasting other argument(s) - if not hasattr(large_expanded, fn + "_"): - continue + (dims_small, dims_large, dims_full) = self._select_broadcastable_dims() + full1d = torch.randn(*dims_full, device=device).flatten().float() + small = torch.randn(*dims_small, device=device).float() + large = torch.randn(*dims_large, device=device).float() + small_expanded = small.expand(*dims_full) + large_expanded = large.expand(*dims_full) + small2 = None + small2_expanded = None + if fn in fns_3_args or fn in fns_value_kwarg: + # create another smaller tensor + (dims_small2, _, _) = self._select_broadcastable_dims(dims_full) + small2 = torch.randn(*dims_small2, device=device).float() + small2_expanded = small2.expand(*dims_full) + + if small.is_cuda and fn in ['map', 'map2']: + # map and map2 are not implementd on CUDA tensors + return - # need to clone largeExpanded so we can reuse, since functions are in-place - large_expanded_clone = large_expanded.clone() + if hasattr(large_expanded, fn): + # run through tensor versions of functions + # and verify fully expanded inputs give same results + expanded = {large: large_expanded, small: small_expanded, small2: small2_expanded} - def tensorfn_inplace(t0, t1, t2=None): - t0_fn = getattr(t0, fn + "_") + def tensorfn(myfn, t1, t2): if fn == "lerp": - return t0_fn(t1, 0.5) + return myfn(t1, 0.5) + elif fn == "masked_select": + return myfn(t1 < 0) elif fn == "masked_scatter": - return t0_fn(t1 < 0.5, full1d) + return myfn(t1 < 0.5, full1d) elif fn == "masked_fill": - return t0_fn(t1 < 0.5, 1.0) - elif fn == "map": - return t0_fn(t1, lambda x, y: x + y) - elif fn == "map2": - return t0_fn(t1, t2, lambda x, y, z: x + y + z) + return myfn(t1 < 0.5, 1.0) elif fn in fns_3_args: - return t0_fn(1.0, t1, t2) + return myfn(1, t1, t2) elif fn in fns_value_kwarg: - return t0_fn(t1, t2, value=1.0) + return myfn(t1, t2, value=1) else: - return t0_fn(t1) - # in-place pointwise operations don't actually work if the in-place - # tensor is 0-strided (numpy has the same issue) - if (0 not in large_expanded.stride() and 0 not in large_expanded_clone.stride()): - r1 = tensorfn_inplace(large_expanded, small_expanded, small2_expanded) - r2 = tensorfn_inplace(large_expanded_clone, small, small2) + return myfn(t1) + + # test various orders + for first, second, third in [(large, small, small2), (small, large, small2), + (small2, small, large), (small2, large, small)]: + if first is None: + break # ignore last iter when small2 is None + 
method_expanded = getattr(expanded[first], fn) + method = getattr(first, fn) + r1 = tensorfn(method_expanded, expanded[second], expanded[third]) + r2 = tensorfn(method, second, third) self.assertEqual(r1, r2) - def broadcastable(t0, t1, t2=None): - try: - t1.expand_as(t0) - if t2 is not None: - t2.expand_as(t0) - except RuntimeError: - return False - return True - - def _test_in_place_broadcastable(t0, t1, t2=None): - if not broadcastable(t0, t1, t2): - same_size = t0.numel() == t1.numel() and (t0.numel() == t2.numel() if t2 is not None else True) - if not same_size: - self.assertRaises(RuntimeError, lambda: tensorfn_inplace(t0, t1, t2)) + # now for torch. versions of functions + if hasattr(torch, fn): + fntorch = getattr(torch, fn) + expanded = {large: large_expanded, small: small_expanded, small2: small2_expanded} + + def torchfn(t1, t2, t3): + if fn == "lerp": + return fntorch(t1, t2, 0.5) + elif fn == "masked_select": + return fntorch(t1, t2 < 0) + elif fn == "masked_scatter": + return fntorch(t1, t2 < 0.5, full1d) + elif fn == "masked_fill": + return fntorch(t1, t2 < 0.5, 1.0) + elif fn in fns_3_args: + return fntorch(t1, 1.0, t2, t3) + elif fn in fns_value_kwarg: + return fntorch(t1, t2, t3, value=1.0) else: - tensorfn_inplace(t0, t1, t2) + return fntorch(t1, t2) + + # test various orders + for first, second, third in [(large, small, small2), (small, large, small2), + (small2, small, large), (small2, large, small)]: + if first is None: + break # ignore last iter when small2 is None + r1 = torchfn(expanded[first], expanded[second], expanded[third]) + r2 = torchfn(first, second, third) + self.assertEqual(r1, r2) + + # now for in place functions + # in-place tensor is not broadcastable; test only guaranteed + # to work by broadcasting other argument(s) + if not hasattr(large_expanded, fn + "_"): + return + + # need to clone largeExpanded so we can reuse, since functions are in-place + large_expanded_clone = large_expanded.clone() + + def tensorfn_inplace(t0, t1, t2=None): + t0_fn = getattr(t0, fn + "_") + if fn == "lerp": + return t0_fn(t1, 0.5) + elif fn == "masked_scatter": + return t0_fn(t1 < 0.5, full1d) + elif fn == "masked_fill": + return t0_fn(t1 < 0.5, 1.0) + elif fn == "map": + return t0_fn(t1, lambda x, y: x + y) + elif fn == "map2": + return t0_fn(t1, t2, lambda x, y, z: x + y + z) + elif fn in fns_3_args: + return t0_fn(1.0, t1, t2) + elif fn in fns_value_kwarg: + return t0_fn(t1, t2, value=1.0) + else: + return t0_fn(t1) + # in-place pointwise operations don't actually work if the in-place + # tensor is 0-strided (numpy has the same issue) + if (0 not in large_expanded.stride() and 0 not in large_expanded_clone.stride()): + r1 = tensorfn_inplace(large_expanded, small_expanded, small2_expanded) + r2 = tensorfn_inplace(large_expanded_clone, small, small2) + self.assertEqual(r1, r2) + + def broadcastable(t0, t1, t2=None): + try: + t1.expand_as(t0) + if t2 is not None: + t2.expand_as(t0) + except RuntimeError: + return False + return True - if fn not in fns_3_args and fn not in fns_value_kwarg: - _test_in_place_broadcastable(small, large_expanded) - _test_in_place_broadcastable(small, large) + def _test_in_place_broadcastable(t0, t1, t2=None): + if not broadcastable(t0, t1, t2): + same_size = t0.numel() == t1.numel() and (t0.numel() == t2.numel() if t2 is not None else True) + if not same_size: + self.assertRaises(RuntimeError, lambda: tensorfn_inplace(t0, t1, t2)) else: - _test_in_place_broadcastable(small2, small_expanded, large_expanded) - 
_test_in_place_broadcastable(small2, small, large) + tensorfn_inplace(t0, t1, t2) + + if fn not in fns_3_args and fn not in fns_value_kwarg: + _test_in_place_broadcastable(small, large_expanded) + _test_in_place_broadcastable(small, large) + else: + _test_in_place_broadcastable(small2, small_expanded, large_expanded) + _test_in_place_broadcastable(small2, small, large) @unittest.skipIf(IS_FBCODE and IS_REMOTE_GPU, "cublas runtime error") @onlyCUDA @@ -1019,6 +1003,7 @@ def test_case_info(fn_name, config): # FIXME: update OpInfos to support "nondeterministic samples" and port these tests # to that architecture + @skipIfMps def test_nondeterministic_alert_AvgPool3d(self, device): module = torch.nn.AvgPool3d(3) input = torch.randn(2, 3, 3, 3, requires_grad=True, device=device) @@ -1031,6 +1016,7 @@ def backward_func(slf, device): backward_func(self, device) + @skipIfMps def test_nondeterministic_alert_AdaptiveAvgPool2d(self, device): module = torch.nn.AdaptiveAvgPool2d(3) input = torch.randn(2, 3, 3, requires_grad=True, device=device) @@ -1043,6 +1029,7 @@ def backward_func(slf, device): backward_func(self, device) + @skipIfMps def test_nondeterministic_alert_AdaptiveAvgPool3d(self, device): module = torch.nn.AdaptiveAvgPool3d(3) input = torch.randn(2, 3, 3, 3, requires_grad=True, device=device) @@ -1055,6 +1042,7 @@ def backward_func(slf, device): backward_func(self, device) + @skipIfMps def test_nondeterministic_alert_MaxPool3d(self, device): module = torch.nn.MaxPool3d(3) input = torch.randn(2, 3, 3, 3, requires_grad=True, device=device) @@ -1067,6 +1055,7 @@ def backward_func(slf, device): backward_func(self, device) + @skipIfMps def test_nondeterministic_alert_AdaptiveMaxPool2d(self, device): module = torch.nn.AdaptiveMaxPool2d(3) input = torch.randn(2, 3, 3, requires_grad=True, device=device) @@ -1079,6 +1068,7 @@ def backward_func(slf, device): backward_func(self, device) + @skipIfMps def test_nondeterministic_alert_FractionalMaxPool2d(self, device): module = torch.nn.FractionalMaxPool2d(2, output_ratio=0.5) input = torch.randn(2, 3, 3, 3, requires_grad=True, device=device) @@ -1091,6 +1081,7 @@ def backward_func(slf, device): backward_func(self, device) + @skipIfMps def test_nondeterministic_alert_FractionalMaxPool3d(self, device): module = torch.nn.FractionalMaxPool3d(2, output_ratio=0.5) input = torch.randn(2, 3, 3, 3, 3, requires_grad=True, device=device) @@ -1103,6 +1094,7 @@ def backward_func(slf, device): backward_func(self, device) + @skipIfMps def test_nondeterministic_alert_interpolate_linear(self, device): input = torch.randn(1, 2, 4, device=device, requires_grad=True) res = torch.nn.functional.interpolate( @@ -1133,6 +1125,7 @@ def backward_func(slf, device): backward_func(self, device) + @skipIfMps def test_nondeterministic_alert_interpolate_bicubic(self, device): input = torch.randn(1, 2, 4, 4, device=device, requires_grad=True) res = torch.nn.functional.interpolate( @@ -1148,6 +1141,7 @@ def backward_func(slf, device): backward_func(self, device) + @skipIfMps def test_nondeterministic_alert_interpolate_trilinear(self, device): input = torch.randn(1, 2, 4, 4, 4, device=device, requires_grad=True) res = torch.nn.functional.interpolate( @@ -1163,6 +1157,7 @@ def backward_func(slf, device): backward_func(self, device) + @skipIfMps def test_nondeterministic_alert_ReflectionPad1d(self, device): module = torch.nn.ReflectionPad1d((1, 2)) input = torch.randn(2, 3, 8, device=device, requires_grad=True) @@ -1187,6 +1182,7 @@ def backward_func(slf, device): backward_func(self, 
device) + @skipIfMps def test_nondeterministic_alert_ReflectionPad3d(self, device): module = torch.nn.ReflectionPad3d((1, 2, 3, 4, 5, 6)) input = torch.randn(2, 3, 8, 8, 8, device=device, requires_grad=True) @@ -1199,6 +1195,7 @@ def backward_func(slf, device): backward_func(self, device) + @skipIfMps def test_nondeterministic_alert_ReplicationPad1d(self, device): module = torch.nn.ReplicationPad1d((1, 2)) input = torch.randn(2, 3, 4, device=device, requires_grad=True) @@ -1223,6 +1220,7 @@ def backward_func(slf, device): backward_func(self, device) + @skipIfMps def test_nondeterministic_alert_ReplicationPad3d(self, device): module = torch.nn.ReplicationPad3d((1, 2, 3, 4, 5, 6)) input = torch.randn(2, 3, 4, 4, 4, device=device, requires_grad=True) @@ -1324,6 +1322,7 @@ def forward_func(slf, device): test_func(torch.Tensor.put) test_func(torch.Tensor.put_) + @skipIfMps def test_nondeterministic_alert_histc(self, device): def test_func(op_call): a = torch.tensor([], device=device) @@ -1337,6 +1336,7 @@ def forward_func(slf, device): test_func(torch.histc) test_func(torch.Tensor.histc) + @skipIfMps def test_nondeterministic_alert_bincount(self, device): def test_func(op_call): a = torch.tensor([], device=device, dtype=torch.long) @@ -1391,6 +1391,7 @@ def backward_func(slf, device): test_func(torch.gather) test_func(torch.Tensor.gather) + @skipIfMps def test_nondeterministic_alert_grid_sample_2d(self, device): input = torch.empty(1, 1, 2, 2, device=device, requires_grad=True) grid = torch.empty(1, 1, 1, 2, device=device) @@ -1403,6 +1404,7 @@ def backward_func(slf, device): backward_func(self, device) + @skipIfMps def test_nondeterministic_alert_grid_sample_3d(self, device): input = torch.empty(1, 1, 2, 2, 2, device=device, requires_grad=True) grid = torch.empty(1, 1, 1, 2, 3, device=device) @@ -1415,15 +1417,55 @@ def backward_func(slf, device): backward_func(self, device) - def test_embedding_scalar_weight_error(self, device): - indices = torch.rand(2, 2, device=device).long() - weights = [ - torch.tensor(1.0, device=device), - torch.tensor(1.0, device=device).reshape(1, 1, 1), - ] - for weight in weights: - with self.assertRaisesRegex(RuntimeError, "'weight' must be 2-D"): - torch.embedding(weight, indices) + def test_invalid_shapes_grid_sampler(self, device): + make_arg = partial( + make_tensor, device=device, dtype=torch.float64, requires_grad=True) + + inputs = ( + # input, grid + ((5, 5, 5, 5, 5,), (1, 1, 1, 4, 4,)), # 3d + ((5, 5, 5, 5,), (1, 1, 4, 4,)), # 2d + ) + + interpolation_mode = 0 + padding_mode = 0 + align_corners = True + + err = "expected grid and input to have same batch size" + + for input, grid in inputs: + input = make_arg(input) + grid = make_arg(grid, low=-1, high=1) + + # Wrapper for the 2d, 3d, and cuDNN functions listed below. + with self.assertRaisesRegex(RuntimeError, err): + torch.grid_sampler( + input, grid, interpolation_mode, padding_mode, + align_corners) + + # Expects 2d input. + with self.assertRaisesRegex(RuntimeError, err): + torch.grid_sampler_2d( + input, grid, interpolation_mode, padding_mode, + align_corners) + + # Expects 3d input. + with self.assertRaisesRegex(RuntimeError, err): + torch.grid_sampler_3d( + input, grid, interpolation_mode, padding_mode, + align_corners) + + # Expects 2d input. + with self.assertRaisesRegex(RuntimeError, err): + torch._grid_sampler_2d_cpu_fallback( + input, grid, interpolation_mode, padding_mode, + align_corners) + + # Expects 2d input, on CUDA. + # Doesn't work on CPU and ROCm. 
+ if device != 'cpu' and TEST_CUDNN and not TEST_WITH_ROCM: + with self.assertRaisesRegex(RuntimeError, err): + torch.cudnn_grid_sampler(input, grid) def test_dist(self, device): def run_test(x, y): @@ -1592,18 +1634,21 @@ def _cond_fn(x): _sync_raises_helper(f, level) - @dtypes(*get_all_fp_dtypes()) + @dtypes(*floating_types_and(torch.half, torch.bfloat16)) + @skipIfMps def test_log_normal(self, device, dtype): a = torch.tensor([10], dtype=dtype, device=device).log_normal_() self.assertEqual(a.dtype, dtype) self.assertEqual(a.size(), torch.Size([1])) - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) + @dtypes(*all_types_and(torch.half, torch.bfloat16)) + @skipIfMps def test_geometric(self, device, dtype): a = torch.tensor([10], dtype=dtype, device=device).geometric_(0.5) self.assertEqual(a.dtype, dtype) self.assertEqual(a.size(), torch.Size([1])) + @skipIfMps def test_repeat_interleave(self, device): y = torch.tensor([[1, 2], [3, 4]], device=device) # exercise single argument function signature @@ -1630,9 +1675,9 @@ def test_repeat_interleave(self, device): self.assertEqual(a_with_output.dtype, y.dtype) self.assertEqual(a_with_output.size(), torch.Size([3, 2])) - @dtypes(*get_all_fp_dtypes(include_half=False, include_bfloat16=False)) - @dtypesIfCPU(*(get_all_fp_dtypes(include_half=False, include_bfloat16=True))) - @dtypesIfCUDA(*(get_all_fp_dtypes(include_bfloat16=False))) + @dtypes(*floating_types()) + @dtypesIfCPU(*floating_types_and(torch.bfloat16)) + @dtypesIfCUDA(*floating_types_and(torch.half)) def test_bernoulli_p(self, device, dtype): for trivial_p in ([0, 1], [1, 0, 1, 1, 0, 1]): x = torch.tensor(trivial_p, dtype=dtype, device=device) @@ -1652,9 +1697,9 @@ def isBinary(t): self.assertTrue(isBinary(p)) # RngUniform not implemented for Integral type in XLA test - @dtypes(*(get_all_fp_dtypes(include_half=False, include_bfloat16=False))) - @dtypesIfCPU(*(get_all_dtypes(include_half=False, include_bfloat16=False, include_complex=False))) - @dtypesIfCUDA(*(get_all_dtypes(include_bfloat16=False, include_complex=False))) + @dtypes(*floating_types()) + @dtypesIfCPU(*all_types_and(torch.bool)) + @dtypesIfCUDA(*all_types_and(torch.bool, torch.half)) def test_bernoulli_self(self, device, dtype): def isBinary(t): @@ -1666,7 +1711,7 @@ def isBinary(t): t.bernoulli_(0.5) self.assertTrue(isBinary(t)) - for p_dtype in get_all_fp_dtypes(include_half=device.startswith('cuda'), include_bfloat16=False): + for p_dtype in floating_types_and(*[torch.half] if device.startswith('cuda') else []): p = torch.rand(10, dtype=p_dtype, device=device).expand(10, 10) t.fill_(2) t.bernoulli_(p) @@ -1681,8 +1726,8 @@ def isBinary(t): self.assertTrue(isBinary(t)) @slowTest - @dtypes(*(get_all_fp_dtypes(include_half=False, include_bfloat16=False))) - @dtypesIfCUDA(*(get_all_fp_dtypes(include_bfloat16=False))) + @dtypes(*floating_types()) + @dtypesIfCUDA(*floating_types_and(torch.half)) def test_bernoulli_edge_cases(self, device, dtype): # Need to draw a lot of samples to cover every random floating point number. 
a = torch.zeros(10000, 10000, dtype=dtype, device=device) # probability of drawing "1" is 0 @@ -1693,7 +1738,8 @@ def test_bernoulli_edge_cases(self, device, dtype): num_zeros = (torch.bernoulli(b) == 0).sum() self.assertEqual(num_zeros, 0) - @dtypes(*get_all_fp_dtypes()) + @dtypes(*floating_types_and(torch.half, torch.bfloat16)) + @skipIfMps def test_exponential(self, device, dtype): a = torch.tensor([10], dtype=dtype, device=device).exponential_(0.5) self.assertEqual(a.dtype, dtype) @@ -1720,15 +1766,15 @@ def test_exponential_no_zero(self, device, dtype): self.assertTrue(x.min() > 0) def _generate_correlation_tensors(self, device, dtype): - yield make_tensor((0, 0), device, dtype) - yield make_tensor((1, 0), device, dtype) - yield make_tensor((0, 1), device, dtype) - yield make_tensor((2,), device, dtype) - yield make_tensor((2, 1), device, dtype) - yield make_tensor((2, 2), device, dtype) - yield make_tensor((2, 3), device, dtype) - yield make_tensor((5, 10), device, dtype) - yield make_tensor((5, 10), device, dtype, noncontiguous=True) + yield make_tensor((0, 0), dtype=dtype, device=device) + yield make_tensor((1, 0), dtype=dtype, device=device) + yield make_tensor((0, 1), dtype=dtype, device=device) + yield make_tensor((2,), dtype=dtype, device=device) + yield make_tensor((2, 1), dtype=dtype, device=device) + yield make_tensor((2, 2), dtype=dtype, device=device) + yield make_tensor((2, 3), dtype=dtype, device=device) + yield make_tensor((5, 10), dtype=dtype, device=device) + yield make_tensor((5, 10), dtype=dtype, device=device, noncontiguous=True) if dtype != torch.int: yield torch.tensor([0, -2, nan, 10.2, inf], dtype=dtype, device=device) @@ -1755,29 +1801,12 @@ def check(t, correction=1, fweights=None, aweights=None): num_observations = x.numel() if x.ndim < 2 else x.size(1) if num_observations > 0: fweights = torch.randint(1, 10, (num_observations,), device=device) - aweights = make_tensor((num_observations,), device, torch.float, low=1) + aweights = make_tensor((num_observations,), dtype=torch.float, device=device, low=1) for correction, fw, aw in product([0, 1, 2], [None, fweights], [None, aweights]): check(x, correction, fweights, aweights) - # FIXME: port to ErrorInputs - def test_cov_error(self, device): - def check(msg, *args, **kwargs): - with self.assertRaisesRegex(RuntimeError, r'cov\(\):.*' + msg + r'.*'): - torch.cov(*args, **kwargs) - - a = torch.rand(2) - check(r'expected input to have two or fewer dimensions', torch.rand(2, 2, 2)) - check(r'expected fweights to have one or fewer dimensions', a, fweights=torch.rand(2, 2)) - check(r'expected aweights to have one or fewer dimensions', a, aweights=torch.rand(2, 2)) - check(r'expected fweights to have integral dtype', a, fweights=torch.rand(2)) - check(r'expected aweights to have floating point dtype', a, aweights=torch.tensor([1, 1])) - check(r'expected fweights to have the same numel', a, fweights=torch.tensor([1])) - check(r'expected aweights to have the same numel', a, aweights=torch.rand(1)) - check(r'fweights cannot be negative', a, fweights=torch.tensor([-1, -2])) - check(r'aweights cannot be negative', a, aweights=torch.tensor([-1., -2.])) - @skipIfNoSciPy - @dtypes(*get_all_fp_dtypes()) + @dtypes(*floating_types_and(torch.half, torch.bfloat16)) def test_uniform_kstest(self, device, dtype): from scipy import stats size = 1000 @@ -1789,8 +1818,8 @@ def test_uniform_kstest(self, device, dtype): self.assertTrue(res.statistic < 0.1) @skipIfNoSciPy - @dtypes(*get_all_fp_dtypes(include_bfloat16=False)) - 
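For orientation, the decorator rewrites in this patch follow a rough mapping from the old get_all_* helpers to the typed helpers in torch.testing._internal.common_dtype (exact membership may differ slightly by release):

    # get_all_fp_dtypes()                         ~ floating_types_and(torch.half, torch.bfloat16)
    # get_all_int_dtypes() + get_all_fp_dtypes()  ~ all_types_and(torch.half, torch.bfloat16)
    # get_all_dtypes()                            ~ all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)

Likewise, throughout this patch make_tensor is called with dtype= and device= as keyword arguments, for example make_tensor((5,), dtype=torch.float32, device=device, low=-1, high=1), instead of the old positional make_tensor((5,), device, dtype, ...) form.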
@dtypesIfCUDA(*get_all_fp_dtypes()) + @dtypes(*floating_types_and(torch.half)) + @dtypesIfCUDA(*floating_types_and(torch.half, torch.bfloat16)) def test_normal_kstest(self, device, dtype): from scipy import stats size = 1000 @@ -1800,8 +1829,9 @@ def test_normal_kstest(self, device, dtype): res = stats.kstest(t.cpu().to(torch.double), 'norm', args=(mean, std)) self.assertTrue(res.statistic < 0.1) + @skipIfMps @skipIfNoSciPy - @dtypes(*get_all_fp_dtypes()) + @dtypes(*floating_types_and(torch.half, torch.bfloat16)) def test_lognormal_kstest(self, device, dtype): from scipy import stats size = 1000 @@ -1814,8 +1844,9 @@ def test_lognormal_kstest(self, device, dtype): else: self.assertTrue(res.statistic < 0.1) + @skipIfMps @skipIfNoSciPy - @dtypes(*get_all_fp_dtypes()) + @dtypes(*floating_types_and(torch.half, torch.bfloat16)) def test_exponential_kstest(self, device, dtype): from scipy import stats size = 1000 @@ -1824,8 +1855,9 @@ def test_exponential_kstest(self, device, dtype): res = stats.kstest(t.cpu().to(torch.double), 'expon', args=(0, 1 / lambd,)) self.assertTrue(res.statistic < 0.1) + @skipIfMps @skipIfNoSciPy - @dtypes(*get_all_fp_dtypes()) + @dtypes(*floating_types_and(torch.half, torch.bfloat16)) def test_cauchy_kstest(self, device, dtype): from scipy import stats size = 1000 @@ -1845,8 +1877,9 @@ def test_cauchy_no_inf(self, device, dtype): x.cauchy_() self.assertFalse(x.isinf().sum()) + @skipIfMps @skipIfNoSciPy - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) + @dtypes(*all_types_and(torch.half, torch.bfloat16)) def test_geometric_kstest(self, device, dtype): from scipy import stats size = 1000 @@ -1909,6 +1942,7 @@ def _brute_cdist(self, x, y, p=2): return torch.empty(r1, r2, device=x.device) return torch.norm(x[..., None, :] - y[..., None, :, :], p=p, dim=-1) + @skipIfMps def test_cdist_norm(self, device): for r1 in [3, 4, 5, 6]: for m in [2, 3, 4, 10]: @@ -1926,6 +1960,7 @@ def test_cdist_norm(self, device): expected = self._brute_cdist(x, y, p=p) self.assertEqual(expected, actual) + @skipIfMps def test_cdist_norm_batch(self, device): for r1 in [3, 4, 5, 6]: for m in [2, 3, 4, 10]: @@ -2060,6 +2095,7 @@ def _test_euclidean_large_cdist(sizex, sizey=None): _test_euclidean_large_cdist((2000, 5)) # Ensure that cdist backward with p<1 does not produce NaNs + @skipIfMps def test_cdist_grad_p_lt_1_no_nan(self, device): for p in [0.99, 0.7, 0.5, 0.1, 0.01]: x = torch.randn(1, 2, device=device) @@ -2087,37 +2123,7 @@ def test_cdist_same_inputs(self, device): # values such as nan or inf assert torch.isfinite(x.grad).all() - def test_multinomial_constraints(self, device): - x = torch.empty(1, 2, 3, dtype=torch.double, device=device) - self.assertRaisesRegex( - RuntimeError, "prob_dist must be 1 or 2 dim", - lambda: torch.multinomial(x, 2)) - x = torch.empty(1, 2, dtype=torch.long, device=device) - self.assertRaisesRegex( - RuntimeError, "multinomial only supports floating-point dtypes for input", - lambda: torch.multinomial(x, 2)) - x = torch.empty(1, 2, dtype=torch.double, device=device) - y = torch.empty(1, 2, dtype=torch.double, device=device) - self.assertRaisesRegex( - RuntimeError, "multinomial expects Long tensor out", - lambda: torch.multinomial(x, 2, out=y)) - x = torch.empty(2, dtype=torch.double, device=device) - self.assertRaisesRegex( - RuntimeError, "cannot sample n_sample <= 0 samples", - lambda: torch.multinomial(x, 0)) - x = torch.empty(2, dtype=torch.double, device=device) - self.assertRaisesRegex( - RuntimeError, "cannot sample n_sample <= 0 samples", - 
lambda: torch.multinomial(x, -1)) - x = torch.empty(2, dtype=torch.double, device=device) - self.assertRaisesRegex( - RuntimeError, "cannot sample n_sample > prob_dist", - lambda: torch.multinomial(x, 3, False)) - x = torch.empty(16777217, dtype=torch.double, device=device) - self.assertRaisesRegex( - RuntimeError, "number of categories cannot exceed", - lambda: torch.multinomial(x, 3)) - + @skipIfMps def test_cumsum(self, device): x = torch.rand(100, 100, device=device) res1 = torch.cumsum(x, 1) @@ -2168,6 +2174,7 @@ def test_cumsum(self, device): # Check that output maintained correct shape self.assertEqual(raw_tensor.shape, raw_tensor.grad.shape) + @skipIfMps def test_cumprod(self, device): x = torch.rand(100, 100, device=device) res1 = torch.cumprod(x, 1) @@ -2218,6 +2225,7 @@ def test_cumprod(self, device): # Check that output maintained correct shape self.assertEqual(raw_tensor.shape, raw_tensor.grad.shape) + @skipIfMps def test_cummax_cummin(self, device): def test_ops(op, string_of_function_name, expected_output1, expected_output2): x = torch.rand(100, 100, device=device) @@ -2284,6 +2292,7 @@ def test_ops(op, string_of_function_name, expected_output1, expected_output2): [0, 0, 0], [0, 0, 0]]), expected_out) + @skipIfMps def test_logcumsumexp(self, device): def logcumsumexp(a, axis): return torch.cumsum(a.exp(), axis=axis).log_() @@ -2357,7 +2366,7 @@ def to_np(t): # All tensors appear contiguous on XLA @onlyNativeDeviceTypes - @dtypes(*get_all_dtypes(include_bfloat16=False)) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool)) def test_diff_noncontig(self, device, dtype): shapes = ( (1,), @@ -2367,7 +2376,7 @@ def test_diff_noncontig(self, device, dtype): (2, 3, 5)) for shape in shapes: - contig = make_tensor(shape, device, dtype, low=-9, high=9) + contig = make_tensor(shape, dtype=dtype, device=device, low=-9, high=9) non_contig = torch.empty(shape + (2, 2), device=device, dtype=dtype)[..., 0] non_contig = non_contig.select(-1, -1) @@ -2377,9 +2386,9 @@ def test_diff_noncontig(self, device, dtype): self._test_diff_numpy(non_contig) # RngNormal not implemented for type f16 for XLA - @dtypes(*get_all_dtypes(include_half=False, include_bfloat16=False)) - @dtypesIfCPU(*get_all_dtypes(include_bfloat16=False)) - @dtypesIfCUDA(*get_all_dtypes(include_bfloat16=False)) + @dtypes(*all_types_and_complex_and(torch.bool)) + @dtypesIfCPU(*all_types_and_complex_and(torch.half, torch.bool)) + @dtypesIfCUDA(*all_types_and_complex_and(torch.half, torch.bool)) def test_diff(self, device, dtype): shapes = ( (1,), @@ -2389,7 +2398,7 @@ def test_diff(self, device, dtype): (2, 3, 5)) for shape in shapes: - contig = make_tensor(shape, device, dtype, low=-9, high=9) + contig = make_tensor(shape, dtype=dtype, device=device, low=-9, high=9) self._test_diff_numpy(contig) t = torch.ones(2, 3) @@ -2494,7 +2503,7 @@ def test_gradient_extreme_cases(self, device, dtype): # Test behaviour in very big tensors large_size = 100000 - t = make_tensor((large_size,), device, dtype) + t = make_tensor((large_size,), dtype=dtype, device=device) t_np = t.cpu().numpy() coordinates_np = list(np.random.randn(large_size)) coordinates = [torch.tensor(coordinates_np, device=device)] @@ -2551,38 +2560,6 @@ def test_gradient_type_promotion(self, device): actual, expected = self._inf_nan_preprocess(list(actual), expected) self.assertEqual(actual, expected, equal_nan=True, exact_dtype=False) - # FIXME: port this to ErrorInputs - @onlyNativeDeviceTypes - @dtypes(torch.long, torch.float32, torch.complex64) - def 
test_error_gradient(self, device, dtype): - t = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], device=device, dtype=dtype) - with self.assertRaisesRegex(RuntimeError, 'torch.gradient expected spacing to be unspecified, a scalar '): - dim = (1, 0) - spacing = [0.1] - torch.gradient(t, spacing=spacing, dim=dim, edge_order=1) - - with self.assertRaisesRegex(RuntimeError, 'torch.gradient only supports edge_order=1 and edge_order=2.'): - torch.gradient(t, edge_order=3) - - with self.assertRaisesRegex(RuntimeError, 'dim 1 appears multiple times in the list of dims'): - dim = (1, 1) - spacing = 0.1 - torch.gradient(t, spacing=spacing, dim=dim, edge_order=1) - - with self.assertRaisesRegex(RuntimeError, 'torch.gradient expected each tensor to be on the same device,'): - dim = (0, 1) - coordinates = [torch.tensor([1, 2, 4], device='cpu'), torch.tensor([1, 2, 4], device='meta')] - torch.gradient(t, spacing=coordinates, dim=dim, edge_order=1) - - with self.assertRaises(IndexError): - torch.gradient(t, dim=3) - - with self.assertRaisesRegex(RuntimeError, 'torch.gradient expected each dimension size to be at least'): - torch.gradient(torch.tensor([[1], [2], [3]]), edge_order=1) - - with self.assertRaisesRegex(RuntimeError, 'torch.gradient expected each dimension size to be at least'): - torch.gradient(torch.tensor([[1, 2], [3, 4]]), edge_order=2) - def _test_large_cum_fn_helper(self, x, fn): x_cpu = x.cpu().float() expected = fn(x_cpu) @@ -2610,6 +2587,7 @@ def test_large_cumprod(self, device, dtype): x[2::3] = .5 self._test_large_cum_fn_helper(x, lambda x: torch.cumprod(x, 0)) + @skipIfMps def test_discontiguous_out_cumsum(self, device): x = torch.randn(4, 8, device=device) y = torch.empty(4, 16, device=device)[:, ::2] @@ -2630,12 +2608,14 @@ def _test_cumminmax_helper(self, x, fn, expected_val, expected_ind): self.assertEqual(out_val, expected_val, atol=0, rtol=0) self.assertEqual(out_ind, expected_ind, atol=0, rtol=0) + @skipIfMps def test_cummax_discontiguous(self, device): x = torch.tensor([[0, 1, 2, 3, 2, 1], [4, 5, 6, 5, 6, 7]], device=device, dtype=torch.float).t().contiguous().t() expected_val = torch.tensor([[0, 1, 2, 3, 3, 3], [4, 5, 6, 6, 6, 7]], device=device, dtype=torch.float) expected_ind = torch.tensor([[0, 1, 2, 3, 3, 3], [0, 1, 2, 2, 4, 5]], device=device, dtype=torch.long) self._test_cumminmax_helper(x, torch.cummax, expected_val, expected_ind) + @skipIfMps def test_cummin_discontiguous(self, device): x = torch.tensor([[3, 2, 1, 0, 1, 2], [7, 6, 5, 4, 5, 2]], device=device, dtype=torch.float).t().contiguous().t() expected_val = torch.tensor([[3, 2, 1, 0, 0, 0], [7, 6, 5, 4, 4, 2]], device=device, dtype=torch.float) @@ -2650,7 +2630,7 @@ def test_bool_tensor_value_change(self, device): # FIXME: move to shape ops test suite def test_unfold_all_devices_and_dtypes(self, device): - for dt in get_all_dtypes(): + for dt in all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16): if dt == torch.bool: x = torch.empty((0, 1, 3, 0), dtype=dt, device=device) @@ -2672,7 +2652,7 @@ def test_unfold_scalars(self, device): # FIXME: move to data movement test suite def test_copy_all_dtypes_and_devices(self, device): from copy import copy - for dt in get_all_dtypes(): + for dt in all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16): x = torch.tensor([1, 2, 3, 4], dtype=dt, device=device) x_clone = x.clone() y = copy(x) @@ -2741,7 +2721,7 @@ def test_copy_transpose_math_view(self, device, dtype): self.assertEqual(dst, src.conj_physical()) def 
test_clone_all_dtypes_and_devices(self, device): - for dt in get_all_dtypes(): + for dt in all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16): x = torch.tensor((1, 1), dtype=dt, device=device) y = x.clone() self.assertEqual(x, y) @@ -2811,8 +2791,58 @@ def test_narrow_empty(self, device): sz[d] = 0 self.assertEqual(sz, y.size()) + # FIXME: move to indexing test suite + @parametrize("reduce", ['prod', 'amin', 'amax', 'mean']) + @dtypes(*floating_types_and(torch.half, torch.bfloat16)) + def test_index_reduce(self, device, dtype, reduce): + size = (3, 4, 5) + index_dtypes = [torch.int, torch.long] + include_selfs = [True, False] + reduction_init = {'prod': 1, 'mean': 0, 'amin': float('inf'), 'amax': -float('inf')} + + for dest_contig, src_contig, index_contig in product([True, False], repeat=3): + for idx_dtype, include_self in product(index_dtypes, include_selfs): + for dim in range(len(size)): + num_src = np.random.randint(10) + num_dest = size[dim] + dest = torch.randn(size, dtype=dtype, device=device) + if not dest_contig: + dest = make_tensor(size, device=device, dtype=dtype, noncontiguous=True) + src = torch.randn(*size[:dim], num_src, *size[dim + 1:], dtype=dtype, device=device) + if not src_contig: + # noncontiguous_like fails with RuntimeError: XLA tensors do not have storage + src = torch.testing.make_non_contiguous(src) + idx = torch.randint(num_dest, (num_src,), dtype=idx_dtype, device=device) + if not index_contig: + # noncontiguous_like fails with RuntimeError: XLA tensors do not have storage + idx = torch.testing.make_non_contiguous(idx) + expected = dest.clone() + dest.index_reduce_(dim, idx, src, reduce, include_self=include_self) + # fill rows in idx with reduction inits if include_self=False + if (not include_self): + expected.index_fill_(dim, idx.long(), reduction_init[reduce]) + expected = expected.transpose(0, dim) + src = src.transpose(0, dim) + for i in range(num_src): + if reduce == 'prod': + expected[idx[i]] *= src[i] + elif reduce == 'amin': + torch.minimum(expected[idx[i]], src[i], out=expected[idx[i]]) + elif reduce == 'amax': + torch.maximum(expected[idx[i]], src[i], out=expected[idx[i]]) + else: + expected[idx[i]] += src[i] + if reduce == 'mean': + counts = torch.ones_like(expected) if include_self else torch.zeros_like(expected) + counts.index_add_(0, idx, torch.ones_like(src)) + counts.masked_fill_(counts == 0, 1) + expected /= counts + expected = expected.transpose(0, dim) + + self.assertEqual(dest, expected) + # FIXME: move to test indexing - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_index_copy(self, device, dtype): # We just test for num_copy <= num_dest, as otherwise there are repeated indices # and the behavior is undefined @@ -2820,7 +2850,7 @@ def test_index_copy(self, device, dtype): def make_arg(batch_sizes, n, dim, contig): size_arg = batch_sizes[:dim] + (n,) + batch_sizes[dim:] - return make_tensor(size_arg, device, dtype, low=None, high=None, noncontiguous=not contig) + return make_tensor(size_arg, dtype=dtype, device=device, low=None, high=None, noncontiguous=not contig) def ref_index_copy(tgt, dim, idx, src): for i in range(idx.size(0)): @@ -2847,7 +2877,7 @@ def ref_index_copy(tgt, dim, idx, src): # onlyNativeDeviceTypes due to an XLA error: # https://github.com/pytorch/pytorch/issues/53256 @onlyNativeDeviceTypes - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_index_copy_scalars(self, device, 
dtype): # Create the 8 possible combinations of scalar sizes for target / index / source scalars = ((make_tensor(size_t, dtype=dtype, device=device, low=None, high=None), @@ -2957,13 +2987,14 @@ def test_index_put_non_accumulate_deterministic(self, device) -> None: self.assertEqual(output, input_list) # FIXME: move to test indexing - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) + @skipIfMps def test_index_fill(self, device, dtype): x = torch.tensor([[1, 2], [4, 5]], dtype=dtype, device=device) index = torch.tensor([0], device=device) x.index_fill_(1, index, 0) self.assertEqual(x, torch.tensor([[0, 2], [0, 5]], dtype=dtype, device=device)) - if not x.is_complex(): + if not x.is_complex() and not device == "meta": with self.assertRaisesRegex(RuntimeError, r"Scalar"): x.index_fill_(1, index, 1 + 1j) # Make sure that the result stays 0-dim while applied to @@ -2975,13 +3006,13 @@ def test_index_fill(self, device, dtype): # FIXME: move to test indexing # The test fails for zero-dimensional tensors on XLA @onlyNativeDeviceTypes - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_index_select(self, device, dtype): num_src, num_out = 3, 5 def make_arg(batch_sizes, n, dim, contig): size_arg = batch_sizes[:dim] + (n,) + batch_sizes[dim:] - return make_tensor(size_arg, device, dtype, low=None, high=None, noncontiguous=not contig) + return make_tensor(size_arg, dtype=dtype, device=device, low=None, high=None, noncontiguous=not contig) def ref_index_select(src, dim, idx): # bfloat16 is just used on GPU, so it's not supported on numpy @@ -2996,7 +3027,9 @@ def ref_index_select(src, dim, idx): for other_sizes in ((), (4, 5)): for dim in range(len(other_sizes)): src = make_arg(other_sizes, num_src, dim, src_contig) - idx = make_tensor((num_out,), device, dtype=torch.int64, low=0, high=num_src, noncontiguous=not idx_contig) + idx = make_tensor( + (num_out,), dtype=torch.int64, device=device, low=0, high=num_src, noncontiguous=not idx_contig + ) out = torch.index_select(src, dim, idx) out2 = ref_index_select(src, dim, idx) self.assertEqual(out, out2) @@ -3005,13 +3038,13 @@ def ref_index_select(src, dim, idx): other_sizes = (3, 2) dim = 1 src = make_arg(other_sizes, num_src, dim, True) - idx = make_tensor((num_out,), device, dtype=idx_type, low=0, high=num_src, noncontiguous=False) + idx = make_tensor((num_out,), dtype=idx_type, device=device, low=0, high=num_src, noncontiguous=False) out = torch.index_select(src, dim, idx) out2 = ref_index_select(src, dim, idx) self.assertEqual(out, out2) # Create the 4 possible combinations of scalar sizes for index / source - scalars = ((make_tensor(size_s, device, dtype), + scalars = ((make_tensor(size_s, dtype=dtype, device=device), torch.zeros(size_i, dtype=torch.int64, device=device)) for size_s, size_i in product([(), (1,)], repeat=2)) for source, idx in scalars: @@ -3019,7 +3052,7 @@ def ref_index_select(src, dim, idx): self.assertEqual(out.item(), source.item()) # FIXME: find a test suite for the take operator - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_take(self, device, dtype): idx_size = (4,) @@ -3054,7 +3087,7 @@ def ref_take(src, idx): # FIXME: find a test suite for the put operator # The bool instance does not work on GPU. 
See # https://github.com/pytorch/pytorch/issues/54317 - @dtypes(*get_all_dtypes(include_bool=False)) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16)) def test_put(self, device, dtype): src_size = (4,) @@ -3125,7 +3158,7 @@ def ref_put(dst, idx, src, accumulate): # FIXME: find a test suite for the put operator # The bool instance does not work on GPU. See # https://github.com/pytorch/pytorch/issues/54317 - @dtypes(*get_all_dtypes(include_bool=False)) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16)) def test_put_accumulate(self, device, dtype): # Test for parallel adds with accumulate == True low_precision = dtype == torch.half or dtype == torch.bfloat16 @@ -3147,6 +3180,7 @@ def test_put_accumulate(self, device, dtype): self.assertEqual(out, orig + source.sum(), rtol=rtol, atol=atol) # FIXME: find a test suite for the take operator + @skipIfMps def test_take_empty(self, device): for input_shape in [(0,), (0, 1, 2, 0), (1, 2, 3)]: for indices_shape in [(0,), (0, 1, 2, 0)]: @@ -3169,13 +3203,9 @@ def scatter_allow_reduce(self, device, dtype, reduceop): device_type = torch.device(device).type return device_type != 'cuda' or (reduceop == 'multiply' and dtype.is_floating_point) - # FIXME: port to test_scatter_gather_ops.py - # torch.{zeros, ones} do not support ComplexHalf (torch.complex32) - # So, we are skipping it here. - @dtypes(*(get_all_fp_dtypes(include_bfloat16=False, include_half=False) + - get_all_complex_dtypes())) - @dtypesIfCPU(*get_all_dtypes()) - @dtypesIfCUDA(*get_all_dtypes()) + @dtypes(*floating_and_complex_types()) + @dtypesIfCPU(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) + @dtypesIfCUDA(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_scatter_reduce_operations_to_large_input(self, device, dtype): index = torch.tensor([[1], [2]], device=device, dtype=torch.long) test_data = [ @@ -3200,13 +3230,9 @@ def test_scatter_reduce_operations_to_large_input(self, device, dtype): input.scatter_(0, index, src, reduce=operation) self.assertEqual(input, result) - # FIXME: port to test_scatter_gather_ops.py - # torch.{zeros, ones} do not support ComplexHalf (torch.complex32) - # So, we are skipping it here. - @dtypes(*(get_all_fp_dtypes(include_bfloat16=False, include_half=False) + - get_all_complex_dtypes())) - @dtypesIfCPU(*get_all_dtypes()) - @dtypesIfCUDA(*get_all_dtypes()) + @dtypes(*floating_and_complex_types()) + @dtypesIfCPU(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) + @dtypesIfCUDA(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_scatter_reduce_scalar(self, device, dtype): index = torch.tensor([[1], [2]], device=device, dtype=torch.long) test_data = [ @@ -3243,13 +3269,9 @@ def test_scatter_add_non_unique_index(self, device): torch.tensor([[3], [1]], device=device, dtype=torch.float32).repeat(1, width)) - # FIXME: port to test_scatter_gather_ops.py - # torch.{zeros, ones} do not support ComplexHalf (torch.complex32) - # So, we are skipping it here. 
- @dtypes(*(get_all_fp_dtypes(include_bfloat16=False, include_half=False) + - get_all_complex_dtypes())) - @dtypesIfCPU(*get_all_dtypes()) - @dtypesIfCUDA(*get_all_dtypes()) + @dtypes(*floating_and_complex_types()) + @dtypesIfCPU(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) + @dtypesIfCUDA(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_scatter_reduce_non_unique_index(self, device, dtype): height = 2 width = 2 @@ -3270,12 +3292,8 @@ def test_scatter_reduce_non_unique_index(self, device, dtype): input.scatter_(0, index, src, reduce=operation) self.assertEqual(input, result, msg=f"result: {result} input: {input} method: {str(operation)}") - # FIXME: port to test_scatter_gather_ops.py - # torch.{zeros, ones} do not support ComplexHalf (torch.complex32) - # So, we are skipping it here. @onlyCUDA - @dtypes(*(get_all_complex_dtypes() + - get_all_int_dtypes())) + @dtypes(*integral_types(), *complex_types()) def test_scatter_reduce_multiply_unsupported_dtypes(self, device, dtype): height = 2 width = 2 @@ -3327,7 +3345,7 @@ def test_scatter_add_bool(self, device): # FIXME: find a test suite for the masked scatter operator @onlyNativeDeviceTypes - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_masked_scatter(self, device, dtype): dt = dtype with warnings.catch_warnings(record=True) as w: @@ -3390,6 +3408,7 @@ def test_masked_scatter(self, device, dtype): self.assertEqual(str(wi.message)[0:55], str(warn)) # FIXME: find a test suite for the masked scatter operator + @skipIfMps def test_masked_scatter_bool_tensor(self, device): src = torch.tensor([True, True, True], device=device) dst = torch.tensor([False, False, False], device=device) @@ -3404,8 +3423,6 @@ def test_masked_scatter_bool_tensor(self, device): # FIXME: find a test suite for the masked scatter operator # test_scatter_gather_ops or test_masked_ops? 
- # refer https://github.com/pytorch/pytorch/issues/60190 - @skipIfRocm @onlyCUDA @largeTensorTest('30GB') def test_masked_scatter_large_tensor(self, device): @@ -3416,7 +3433,7 @@ def test_masked_scatter_large_tensor(self, device): self.assertEqual(result, result_cpu) # FIXME: find a test suite for the masked select operator - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_masked_select(self, device, dtype): if device == 'cpu': warn = 'masked_select received a mask with dtype torch.uint8,' @@ -3484,7 +3501,7 @@ def test_masked_select_discontiguous(self, device): self.assertEqual(out_dc, expected, atol=0, rtol=0) # FIXME: find a test suite for the masked fill operator - @dtypes(*product(get_all_dtypes(), (torch.uint8, torch.bool))) + @dtypes(*product(all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16), (torch.uint8, torch.bool))) def test_masked_fill(self, device, dtypes): dtype = dtypes[0] mask_dtype = dtypes[1] @@ -3789,15 +3806,18 @@ def test_pdist_norm_backward(self, device): # FIXME: find a test suite for the pdist operator @unittest.skipIf(IS_FBCODE and IS_REMOTE_GPU, "sandcastle OOM with current tpx gpu/re configuration") @skipIfRocm + @onlyCUDA + @largeTensorTest('10GB', device='cpu') + @largeTensorTest('5GB', device='cuda') def test_pdist_norm_large(self, device): # use dim0>=46342 for forward, see: # https://github.com/pytorch/pytorch/issues/30583 # Compare output using GPU with the CPU implementation, as brute_pdist uses too much memory - if 'cuda' in device: - x = torch.randn(50000, 1, dtype=torch.float32) - expected_cpu = torch.pdist(x, p=2) - actual_gpu = torch.pdist(x.to(device), p=2) - self.assertEqual(expected_cpu, actual_gpu.cpu()) + x = torch.randn(50000, 1, dtype=torch.float32) # 50k * 4 bytes = 200 KB + # Will require 1249975000 float32s + expected_cpu = torch.pdist(x, p=2) # ~1250M * 4 bytes = 5 GB on CPU + actual_gpu = torch.pdist(x.to(device), p=2) # 5 GB on GPU + self.assertEqual(expected_cpu, actual_gpu.cpu()) # Another 5 GB on CPU # FIXME: move to elementwise ternary test suite @onlyNativeDeviceTypes @@ -4031,19 +4051,6 @@ def test_masked_fill_mem_overlap(self, device): with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): mask[1:].masked_fill_(mask[:-1], False) - # FIXME: convert to ErrorInputs - @onlyNativeDeviceTypes - def test_masked_select_mem_overlap(self, device): - x = torch.rand((1,), device=device).expand((3,)) - y = torch.rand((6,), device=device) - mask = torch.tensor([True, False, True, True, False, False], device=device) - with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): - torch.masked_select(y, mask, out=x) - with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): - torch.masked_select(y, mask, out=y) - with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): - torch.masked_select(mask.clone(), mask, out=mask) - # FIXME: convert to ErrorInputs @expectedFailureMeta # RuntimeError not raised @onlyNativeDeviceTypes @@ -4055,15 +4062,6 @@ def test_masked_scatter_mem_overlap(self, device): with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): x.masked_scatter_(mask, src) - # FIXME: convert to ErrorInputs - @onlyNativeDeviceTypes - def test_index_select_mem_overlap(self, device): - x = torch.rand((1, 6), device=device).expand((2, 6)) - y = torch.rand((3, 6), device=device) - ind = torch.tensor([0, 1], dtype=torch.int64, device=device) - with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): - 
torch.index_select(y, 1, ind, out=x) - # FIXME: convert to ErrorInputs @onlyNativeDeviceTypes def test_scatter_mem_overlap(self, device): @@ -4078,32 +4076,6 @@ def test_scatter_mem_overlap(self, device): with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): ind.scatter_(0, ind, ind.clone()) - # FIXME: convert to ErrorInputs - @onlyNativeDeviceTypes - def test_gather_mem_overlap(self, device): - x = torch.rand((1,), device=device).expand((3,)) - src = torch.rand((6,), device=device) - ind = torch.tensor([2, 1, 0], device=device, dtype=torch.int64) - with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): - torch.gather(src, 0, ind, out=x) - with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): - torch.gather(src, 0, ind, out=src) - with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): - torch.gather(ind.clone(), 0, ind[1:], out=ind[:1]) - - # FIXME: convert to ErrorInputs - @onlyNativeDeviceTypes - def test_take_mem_overlap(self, device): - x = torch.rand((1,), device=device).expand((3,)) - src = torch.rand((6,), device=device) - ind = torch.tensor([2, 1, 0], device=device, dtype=torch.int64) - with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): - torch.take(src, ind, out=x) - with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): - torch.take(src, ind, out=src) - with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): - torch.take(ind.clone(), ind[1:], out=ind[:-1]) - # FIXME: move to test distributions @onlyCUDA def test_multinomial_device_constrain(self, device): @@ -4308,6 +4280,7 @@ def _test_propagation_rules(self, contiguous, cl, ambiguous, bias): result = ambiguous * 5 self.assertEqual(ambiguous.stride(), result.stride()) + @skipIfMps def test_memory_format_empty_like(self, device): def test_helper(x, memory_format): xc = x.contiguous(memory_format=memory_format) @@ -4562,38 +4535,38 @@ def compare_strides(s1, s2, div): # FIXME: move dlpack tests to their own test class/suite @skipMeta @onlyNativeDeviceTypes - @dtypes(*get_all_dtypes(include_bool=False)) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16)) def test_dlpack_capsule_conversion(self, device, dtype): # DLpack does not explicitly support bool (xref dmlc/dlpack#75) - x = make_tensor((5,), device, dtype) + x = make_tensor((5,), dtype=dtype, device=device) z = from_dlpack(to_dlpack(x)) self.assertEqual(z, x) @skipMeta @onlyNativeDeviceTypes - @dtypes(*get_all_dtypes(include_bool=False)) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16)) def test_dlpack_protocol_conversion(self, device, dtype): - x = make_tensor((5,), device, dtype) + x = make_tensor((5,), dtype=dtype, device=device) z = from_dlpack(x) self.assertEqual(z, x) @skipMeta @onlyNativeDeviceTypes def test_dlpack_shared_storage(self, device): - x = make_tensor((5,), device, torch.float64) + x = make_tensor((5,), dtype=torch.float64, device=device) z = from_dlpack(to_dlpack(x)) z[0] = z[0] + 20.0 self.assertEqual(z, x) @skipMeta @onlyCUDA - @dtypes(*get_all_dtypes(include_bool=False)) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16)) def test_dlpack_conversion_with_streams(self, device, dtype): # Create a stream where the tensor will reside stream = torch.cuda.Stream() with torch.cuda.stream(stream): # Do an operation in the actual stream - x = make_tensor((5,), device, dtype) + 1 + x = make_tensor((5,), dtype=dtype, device=device) + 1 # DLPack protocol helps establish a correct stream order # (hence data dependency) at the 
exchange boundary. # DLPack manages this synchronization for us, so we don't need to @@ -4604,11 +4577,44 @@ def test_dlpack_conversion_with_streams(self, device, dtype): stream.synchronize() self.assertEqual(z, x) + @skipMeta + @onlyNativeDeviceTypes + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16)) + def test_from_dlpack(self, device, dtype): + x = make_tensor((5,), dtype=dtype, device=device) + y = torch.from_dlpack(x) + self.assertEqual(x, y) + + @skipMeta + @onlyNativeDeviceTypes + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16)) + def test_from_dlpack_noncontinguous(self, device, dtype): + x = make_tensor((25,), dtype=dtype, device=device).reshape(5, 5) + + y1 = x[0] + y1_dl = torch.from_dlpack(y1) + self.assertEqual(y1, y1_dl) + + y2 = x[:, 0] + y2_dl = torch.from_dlpack(y2) + self.assertEqual(y2, y2_dl) + + y3 = x[1, :] + y3_dl = torch.from_dlpack(y3) + self.assertEqual(y3, y3_dl) + + y4 = x[1] + y4_dl = torch.from_dlpack(y4) + self.assertEqual(y4, y4_dl) + + y5 = x.t() + y5_dl = torch.from_dlpack(y5) + self.assertEqual(y5, y5_dl) + @skipMeta @onlyCUDA - @dtypes(*get_all_dtypes(include_bool=False)) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16)) def test_dlpack_conversion_with_diff_streams(self, device, dtype): - from torch._C import _from_dlpack stream_a = torch.cuda.Stream() stream_b = torch.cuda.Stream() # DLPack protocol helps establish a correct stream order @@ -4616,12 +4622,20 @@ def test_dlpack_conversion_with_diff_streams(self, device, dtype): # the `tensor.__dlpack__` method will insert a synchronization event # in the current stream to make sure that it was correctly populated. with torch.cuda.stream(stream_a): - x = make_tensor((5,), device, dtype) + 1 - z = _from_dlpack(x.__dlpack__(stream_b.cuda_stream)) + x = make_tensor((5,), dtype=dtype, device=device) + 1 + z = torch.from_dlpack(x.__dlpack__(stream_b.cuda_stream)) stream_a.synchronize() stream_b.synchronize() self.assertEqual(z, x) + @skipMeta + @onlyNativeDeviceTypes + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16)) + def test_from_dlpack_dtype(self, device, dtype): + x = make_tensor((5,), dtype=dtype, device=device) + y = torch.from_dlpack(x) + assert x.dtype == y.dtype + @skipMeta @onlyCUDA def test_dlpack_default_stream(self, device): @@ -4643,15 +4657,15 @@ def __dlpack__(self, stream=None): # CUDA-based tests runs on non-default streams with torch.cuda.stream(torch.cuda.default_stream()): - x = DLPackTensor(make_tensor((5,), device, torch.float32)) + x = DLPackTensor(make_tensor((5,), dtype=torch.float32, device=device)) from_dlpack(x) @skipMeta @onlyNativeDeviceTypes - @dtypes(*get_all_dtypes(include_bool=False)) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16)) def test_dlpack_tensor_invalid_stream(self, device, dtype): with self.assertRaises(TypeError): - x = make_tensor((5,), device, dtype) + x = make_tensor((5,), dtype=dtype, device=device) x.__dlpack__(stream=object()) @skipMeta @@ -4723,6 +4737,7 @@ def test_storage_all_devices(self, devices): self.assertEqual(t.dtype, t.storage().dtype) # FIXME: move to test distributions + @skipIfMps @dtypesIfCUDA(torch.float, torch.double, torch.half) @dtypes(torch.float, torch.double) def test_multinomial(self, device, dtype): @@ -5070,6 +5085,7 @@ def test_pickle_gradscaler(self, device): self.assertEqual(b.scale(torch.tensor([4.0], dtype=torch.float32, device=device)), 12.0) # FIXME: convert to ErrorInputs + @skipIfMps def test_multinomial_invalid(self, device): def test(probs): 
with self.assertRaisesRegex(RuntimeError, @@ -5083,6 +5099,7 @@ def test(probs): test(torch.tensor([1., 1., nan])) # FIXME: convert to ErrorInputs + @skipIfMps def test_multinomial_invalid_distribution(self, device): def test(probs, replacement): with self.assertRaisesRegex(RuntimeError, @@ -5123,106 +5140,72 @@ def test_multinomial_empty_wo_replacement(self, device): self._test_multinomial_empty(device, False, 1) self._test_multinomial_empty(device, False, 2) - # FIXME: move to elementwise ternary test suite - def _test_where_scalar_template(self, device, dtype, exec_fn): - for ndims in range(0, 4): - shape = self._rand_shape(ndims, min_size=5, max_size=10) - for n in range(ndims + 1): - for c in combinations(list(range(ndims)), n): - for scalar_type in [int, float, complex]: - if dtype.is_complex: - condition = make_tensor(shape, dtype=dtype, device=device).abs() > 0.5 - else: - condition = make_tensor(shape, dtype=dtype, device=device) > 0.5 - - x = make_tensor(shape, dtype=dtype, device=device) - - if not dtype.is_complex and scalar_type == complex: - continue - - scalar_1 = scalar_type(random.random()) - - exec_fn(scalar_type, dtype, condition, x, scalar_1) - - # FIXME: move to elementwise ternary test suite - # For current implementation, - # below are the valid `TensorDtype` and `ScalarType` combinations. - def _where_valid_scalar_tensor_combination(self, scalar_type, dtype): - if (scalar_type == int and dtype == torch.long): - return True - elif (scalar_type == float and dtype == torch.double): - return True - elif (scalar_type == complex and dtype == torch.complex128): - return True - return False + @dtypesIfCUDA(torch.float, torch.double, torch.half) + @dtypesIfCPU(torch.float, torch.double, torch.bfloat16) + @dtypes(torch.float, torch.double) + def test_multinomial_cpu(self, device, dtype): + def make_prob_dist(shape, is_contiguous): + if is_contiguous: + if dtype == torch.half or dtype == torch.bfloat16: + return torch.zeros(shape, device=device).uniform_().to(dtype=dtype) + return torch.zeros(shape, device=device, dtype=dtype).uniform_() + elif len(shape) == 1: + if dtype == torch.half or dtype == torch.bfloat16: + return torch.zeros((shape + [5]), device=device).uniform_().to(dtype=dtype)[:, 2] + return torch.zeros((shape + [5]), device=device, dtype=dtype).uniform_()[:, 2] + else: + # num dim = 2 + new_shape = [2, shape[1], 7, 1, shape[0], 1, 10] + if dtype == torch.half or dtype == torch.bfloat16: + prob_dist = torch.zeros(new_shape, device=device).uniform_().to(dtype=dtype) + else: + prob_dist = torch.zeros(new_shape, device=device, dtype=dtype).uniform_() + prob_dist = prob_dist.transpose(1, 4) + prob_dist = prob_dist[1, :, 5, 0, :, 0, 4] + assert not prob_dist.is_contiguous() # sanity check + return prob_dist # FIXME: move to elementwise ternary test suite + # As the test fails with Runtime Error not raised on XLA @onlyNativeDeviceTypes - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes() + - get_all_complex_dtypes())) - def test_where_scalar_invalid_combination_raises(self, device, dtype): - - def checkRaises(scalar_type, dtype, condition, x, scalar_1): - if not self._where_valid_scalar_tensor_combination(scalar_type, dtype): - # Note: This should fail once `where` supports type promotion. 
- with self.assertRaisesRegex(RuntimeError, "expected scalar type"): - torch.where(condition, x, scalar_1) - - self._test_where_scalar_template(device, dtype, checkRaises) + def test_where_scalar_handcrafted_values(self, device): + # Tests ScalarxScalar, ScalarxTensor and TensorxScalar + # variant of `where` against NumPy version with + # handcrafted values. + condition_shape = (5, 5) + dtypes = ( + torch.bool, torch.uint8, torch.int8, torch.int16, torch.int64, + torch.float16, torch.float32, torch.float64, + torch.complex64, torch.complex128, + ) + shapes = ((), (5,), (1, 5),) - # FIXME: move to elementwise ternary test suite - @skipCUDAVersionIn([(11, 2)]) # test fails for 11.2, see https://github.com/pytorch/pytorch/issues/51980 - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes() + - get_all_complex_dtypes())) - def test_where_scalar_valid_combination(self, device, dtype): + with torch.no_grad(): + tensors = (torch.empty(shape, dtype=dtype, device=device).fill_(17) + for shape, dtype in product(shapes, dtypes)) - def checkResult(scalar_type, dtype, condition, x, scalar_1): - if self._where_valid_scalar_tensor_combination(scalar_type, dtype): - def x_like(scalar, without_dtype=False): - return torch.tensor(scalar, dtype=dtype, device=device).expand_as(x) + # Use different values for `x` and `y` + # as they are the output values which are compared. + x_vals = (True, 3, 7.0, 1 + 0.5j) + y_vals = itertools.chain((False, 4, 8.0, 2 + 0.5j), tensors) + for x in x_vals: + for y in y_vals: + condition = torch.empty(*condition_shape, dtype=torch.bool, device=device).bernoulli_() + common_dtype = torch.result_type(x, y) - # X = Tensor, Y = Scalar - scalar_out = torch.where(condition, x, scalar_1) - tensor_out = torch.where(condition, x, x_like(scalar_1)) - self.assertEqual(scalar_out, tensor_out) + def check_equal(condition, x, y): + condition_np = condition.cpu().numpy() + x_np = x.cpu().numpy() if isinstance(x, torch.Tensor) else x + y_np = y.cpu().numpy() if isinstance(y, torch.Tensor) else y - # X = Scalar, Y = Tensor - scalar_out = torch.where(condition, scalar_1, x) - tensor_out = torch.where(condition, x_like(scalar_1), x) - self.assertEqual(scalar_out, tensor_out) + # NumPy aggressively promotes to double, hence cast to output to correct dtype + expected = torch.from_numpy(np.where(condition_np, x_np, y_np)).to(common_dtype) + result = torch.where(condition, x, y) + self.assertEqual(expected, result) - self._test_where_scalar_template(device, dtype, checkResult) + check_equal(condition, x, y) + check_equal(condition, y, x) - # FIXME: move to elementwise ternary test suite - # As the test fails with Runtime Error not raised on XLA - @onlyNativeDeviceTypes - def test_where_scalar_scalar(self, device): - # Scalar-Scalar Version - height = 5 - width = 5 - default_dtype = torch.get_default_dtype() - for test_default_dtype in [torch.float, torch.double]: - torch.set_default_dtype(test_default_dtype) - for scalar_type_1 in [int, float, complex]: - for scalar_type_2 in [int, float, complex]: - x1 = scalar_type_1(random.random() * random.randint(10, 20)) - x2 = scalar_type_2(random.random() * random.randint(20, 30)) - condition = torch.randn(height, width, device=device) > 0.5 - if scalar_type_1 != scalar_type_2: - self.assertRaisesRegex(RuntimeError, "expected scalar type", lambda: torch.where(condition, x1, x2)) - else: - def get_dtype(scalar_type): - complex_dtype = torch.complex64 if torch.float == torch.get_default_dtype() else torch.complex128 - type_map = {int: torch.long, float: 
torch.get_default_dtype(), complex: complex_dtype} - return type_map[scalar_type] - expected = torch.zeros((height, width), dtype=get_dtype(scalar_type_1)) - expected[condition] = x1 - expected[~condition] = x2 - result = torch.where(condition, x1, x2) - self.assertEqual(expected, result) - - # Reset the original dtype - torch.set_default_dtype(default_dtype) def test_hook_remove(self, device): # Reference: https://github.com/pytorch/pytorch/issues/58354 @@ -5286,6 +5269,48 @@ def test_assertRaisesRegex_ignore_msg_non_native_device(self, device): with self.assertRaisesRegex(RuntimeError, msg): torch.nn.functional.nll_loss(x, t, weight=invalid_weight) + @dtypes(*all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.complex32)) + def test_copy_(self, device, dtype): + def can_cast(src_dtype, dst_dtype): + # torch.can_cast(torch.int16, torch.uint8) returns True + # which isn't actually safe-cast. + # This function returns False in this case. + def is_unsigned_int(dtype): + return dtype is torch.uint8 + + if is_unsigned_int(dst_dtype): + return is_unsigned_int(src_dtype) + return torch.can_cast(src_dtype, dst_dtype) + + def make_tensor_wrapper(shape, dtype): + if dtype is not torch.complex32: + # Make tensor does not support generating + # complex32 tensor + return make_tensor(shape, device=device, dtype=dtype) + return torch.randn(shape, device=device, dtype=dtype) + + t = make_tensor_wrapper((50,), dtype) + src_dtypes = all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.complex32) + for src_dtype in src_dtypes: + src = make_tensor_wrapper((50,), dtype=src_dtype) + t.copy_(src) + dst = make_tensor_wrapper((50, ), dtype=src_dtype) + if can_cast(src_dtype, dtype): + rtol = None + atol = None + if dtype in (torch.half, torch.complex32): + rtol = 1e-3 + atol = 1e-3 + if dtype in (torch.bfloat16,): + rtol = 1e-2 + atol = 1e-2 + self.assertEqual(src, dst.copy_(t), rtol=rtol, atol=atol) + + @dtypes(*all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.complex32)) + def test_item(self, device, dtype): + t = torch.ones((), device=device, dtype=dtype) + self.assertEqual(1, t.item()) + # Tests that compare a device's computation with the (gold-standard) CPU's. 
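The basic pattern in the class below is to run an op on the target device and compare against the CPU result as the reference; a minimal self-contained sketch (the device choice and the use of torch.sin here are illustrative, not taken from the tests):

    import torch
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    x_dev = torch.randn(16, device=device)
    x_cpu = x_dev.cpu()
    # the device result is expected to match the CPU "gold-standard" result
    assert torch.allclose(torch.sin(x_dev).cpu(), torch.sin(x_cpu), rtol=1e-6, atol=1e-6)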
class TestDevicePrecision(TestCase): @@ -5714,69 +5739,6 @@ def test_unflatten(self): r"the unspecified dimension size -1 can be any value and is ambiguous"): torch.randn(2, 0).unflatten(1, (2, -1, 0)) - # FIXME: move to test_scatter_gather_ops.py - def test_scatter_reduce(self): - dtype = device = None - output_size = 10 - shape = [5, 10, 20] - reduces = ["sum", "prod", "mean", "amax", "amin"] - fills = {"sum": 0, "prod": 1, "mean": 0, "amax": -(2 ** 31), "amin": 2 ** 31 - 1} - fns = {"sum": lambda t, v: t.add_(v), - "prod": lambda t, v: t.mul_(v), - "mean": lambda t, v, n: t.mul_(n).add_(v).div_(n + 1), - "amax": lambda t, v: torch.max(t, v, out=t), - "amin": lambda t, v: torch.min(t, v, out=t)} - - index = torch.randint(0, output_size, shape, dtype=torch.long, device=device) - input = torch.randn(shape, dtype=dtype, device=device) - - for reduce in reduces: - for dim in range(len(shape)): - output = input.scatter_reduce(dim, index, reduce, output_size=output_size) - - # Check that output is of the correct size - output_shape = copy.copy(shape) - output_shape[dim] = output_size - self.assertEqual(output.shape, output_shape) - - expected = torch.zeros(output_shape, dtype=dtype, device=device) - expected.fill_(fills[reduce]) - counts = torch.zeros(output_shape, dtype=dtype, device=device) - for i, j, k in itertools.product(range(shape[0]), range(shape[1]), range(shape[2])): - v = input[i, j, k] - m = index[i, j, k] - - if dim == 0: - i = m - elif dim == 1: - j = m - else: - k = m - - op = fns[reduce] - if (reduce == "mean"): - op(expected[i, j, k], v, counts[i, j, k]) - else: - op(expected[i, j, k], v) - counts[i, j, k] += 1 - - if (reduce == "amin" or reduce == "amax"): - expected.masked_fill_(counts == 0, 0) - - self.assertTrue(torch.allclose(output, expected)) - - with self.assertRaisesRegex(RuntimeError, "Expected `dim` to be in range -3 to 2"): - torch.scatter_reduce(input, 4, index, "sum") - - with self.assertRaisesRegex(RuntimeError, "Shape mismatch"): - index2 = torch.randint(0, output_size, (10, ), dtype=torch.long, device=device) - torch.scatter_reduce(input, 0, index2, "sum") - - with self.assertRaisesRegex(RuntimeError, "Expected `index` values to be in range 0 to 2"): - input2 = torch.randn(10, dtype=dtype, device=device) - index2 = torch.tensor([0, 1, 0, 1, 2, 3, 3, 4, 4, 3]) - torch.scatter_reduce(input2, 0, index2, "sum", output_size=2) - def test_structseq_repr(self): a = torch.arange(250).reshape(5, 5, 10) expected = """ @@ -6296,6 +6258,7 @@ def test_from_buffer(self): self.assertEqual(bools.size(), 8) self.assertEqual(bools.tolist(), [False, True, True, True, True, True, True, True]) self.assertEqual(bools.type(), 'torch.BoolStorage') + self.assertTrue(isinstance(bools, torch.BoolStorage)) f = bytearray(b'\x80\x02\x8a\nl\xfc\x9cF\xf9 j\xa8P\x19.\x80\x02M\xe9') bools = torch.BoolStorage.from_buffer(f, 'big') @@ -6308,6 +6271,122 @@ def test_from_buffer(self): bytes = torch.ByteStorage.from_buffer(a) self.assertEqual(bytes.nbytes(), 4) self.assertEqual(bytes.tolist(), [1, 2, 3, 4]) + self.assertTrue(isinstance(bytes, torch.ByteStorage)) + + def test_storage_error(self): + quantized_storages = [ + torch.QInt32Storage, + torch.QInt8Storage, + torch.QUInt2x4Storage, + torch.QUInt4x2Storage, + torch.QUInt8Storage, + ] + + with self.assertRaisesRegex(RuntimeError, r"Only child classes of _LegacyStorage can be instantiated"): + torch.storage._LegacyStorage() + + for storage_class in torch._storage_classes: + if storage_class in [torch._UntypedStorage, 
torch.cuda._UntypedStorage, torch._TypedStorage]: + continue + + device = 'cuda' if storage_class.__module__ == 'torch.cuda' else 'cpu' + dtype = storage_class.dtype + + if device == 'cuda' and not torch.cuda.is_available(): + continue + + # Legacy Storage constructor errors + with self.assertRaisesRegex(RuntimeError, r"'device' cannot be specified"): + storage_class(device='cpu') + + with self.assertRaisesRegex(RuntimeError, r"'dtype' cannot be specified"): + storage_class(dtype=torch.float) + + with self.assertRaisesRegex(TypeError, r"got an unexpected keyword"): + storage_class(sdlkjf=torch.float) + + with self.assertRaisesRegex(RuntimeError, r"Too many positional arguments"): + storage_class(0, 0) + + with self.assertRaisesRegex(TypeError, r"invalid data type"): + storage_class('string') + + with self.assertRaisesRegex(TypeError, r"Argument type not recognized"): + storage_class(torch.tensor([])) + + s = storage_class() + + with self.assertRaisesRegex(RuntimeError, r"No positional arguments"): + storage_class(0, wrap_storage=s._untyped()) + + with self.assertRaisesRegex(TypeError, r"must be _UntypedStorage"): + storage_class(wrap_storage=s) + + if torch.cuda.is_available(): + if storage_class in quantized_storages: + with self.assertRaisesRegex(RuntimeError, r"Cannot create CUDA storage with quantized dtype"): + s.cuda() + + else: + + if s.is_cuda: + s_other_device = s.cpu() + else: + s_other_device = s.cuda() + + with self.assertRaisesRegex(RuntimeError, r"Device of 'wrap_storage' must be"): + storage_class(wrap_storage=s_other_device._untyped()) + + # _TypedStorage constructor errors + with self.assertRaisesRegex(RuntimeError, r"No positional arguments"): + torch._TypedStorage(0, wrap_storage=s._untyped(), dtype=dtype) + + with self.assertRaisesRegex(RuntimeError, r"Argument 'dtype' must be specified"): + torch._TypedStorage(wrap_storage=s._untyped()) + + with self.assertRaisesRegex(TypeError, r"Argument 'dtype' must be torch.dtype"): + torch._TypedStorage(wrap_storage=s._untyped(), dtype=0) + + with self.assertRaisesRegex(RuntimeError, r"Argument 'device' should not be specified"): + torch._TypedStorage(wrap_storage=s._untyped(), dtype=dtype, device=device) + + with self.assertRaisesRegex(TypeError, r"Argument 'wrap_storage' must be _UntypedStorage"): + torch._TypedStorage(wrap_storage=s, dtype=dtype) + + with self.assertRaisesRegex(RuntimeError, r"Storage device not recognized"): + torch._TypedStorage(dtype=dtype, device='xla') + + if torch.cuda.is_available(): + if storage_class in quantized_storages: + with self.assertRaisesRegex(RuntimeError, r"Cannot create CUDA storage with quantized dtype"): + torch._TypedStorage(dtype=dtype, device='cuda') + + with self.assertRaisesRegex(TypeError, r"Argument type not recognized"): + torch._TypedStorage(torch.tensor([]), dtype=dtype, device=device) + + with self.assertRaisesRegex(RuntimeError, r"Too many positional arguments"): + torch._TypedStorage(0, 0, dtype=dtype, device=device) + + def test_storage_error_no_attribute(self): + storage_classes = [ + torch.cuda.ByteStorage, + torch.cuda.FloatStorage, + torch.cuda._UntypedStorage, + ] + for storage_class in storage_classes: + with self.assertRaisesRegex(RuntimeError, r'Not available for CUDA storage'): + storage_class.from_buffer() + + if storage_class == torch.cuda._UntypedStorage: + with self.assertRaisesRegex(RuntimeError, r'Not available for CUDA storage'): + storage_class._new_with_weak_ptr() + + else: + with self.assertRaisesRegex(AttributeError, r'has no attribute'): + 
storage_class._new_with_weak_ptr() + + with self.assertRaisesRegex(RuntimeError, r'Not available for CUDA storage'): + storage_class._new_shared_filename(0, 0, 0) def test_storage_casts(self): storage = torch.IntStorage([-1, 0, 1, 2, 3, 4]) @@ -6489,6 +6568,11 @@ def test_print(self): self.assertEqual(x.__repr__(), str(x)) self.assertExpectedInline(str(x), '''tensor([2.3000+4.j, 7.0000+6.j])''') + # test complex half tensor + x = torch.tensor([1.25 + 4j, -7. + 6j], dtype=torch.chalf) + self.assertEqual(x.__repr__(), str(x)) + self.assertExpectedInline(str(x), '''tensor([ 1.2500+4.j, -7.0000+6.j], dtype=torch.complex32)''') + # test scientific notation for complex tensors x = torch.tensor([1e28 + 2j , -1e-28j]) self.assertEqual(x.__repr__(), str(x)) @@ -7066,6 +7150,14 @@ def test_fill_diagonal(self): e1.fill_diagonal_(v, wrap=True) self.assertEqual(e1, e2) + def test_setting_real_imag_to_a_number(self): + x = torch.randn(4, dtype=torch.cfloat) + x.real = 0 + x.imag = 0 + zeros = torch.zeros(4) + self.assertEqual(x.real, zeros) + self.assertEqual(x.imag, zeros) + def test_batch_norm_cpu_inference(self): # input nchw in (2,1,1,1), (2,2,2,2) inputs = [ @@ -7114,7 +7206,6 @@ def test_batch_norm_cpu_inference(self): # FIXME: move these meta tests to their own test suite/class or # distribute them among the appropriate test suites for their ops - @noarchTest def test_empty_meta(self): x = torch.empty(2 ** 20, 2 ** 20, device='meta') y = torch.empty(2 ** 20, device='meta') @@ -7122,7 +7213,10 @@ def test_empty_meta(self): self.assertEqual(z.size(), (2 ** 20, 2 ** 20)) self.assertRaises(RuntimeError, lambda: z[0][0].item()) - @noarchTest + def test_format_scalar_meta(self): + x = torch.empty((), device='meta') + self.assertEqual(format(x), repr(x)) + def test_upsample_nearest1d_meta(self): # TODO: this test should be triggered by test_nn.py but right # now meta is not enabled (and even if it was, we are probably @@ -7146,7 +7240,6 @@ def test_upsample_nearest1d_meta(self): self.assertEqual(z.size(), (2 * 10 ** 8, 3, 4 * 10 ** 8)) self.assertRaises(RuntimeError, lambda: z[0][0][0].item()) - @noarchTest def test_upsample_nearest2d_meta(self): # TODO: the out tests cannot be triggered by test_nn.py because # we don't actually do out= arguments for nn functions, so there @@ -7187,13 +7280,11 @@ def test_upsample_nearest2d_meta(self): """Expected out tensor to have device meta, but got cpu instead""" ) - @noarchTest def test_detach_meta(self): x = torch.empty(2, device='meta') # This used to segfault self.assertRaises(RuntimeError, lambda: x.detach().storage()) - @noarchTest def test_add_meta_scalar(self): # From https://github.com/pytorch/pytorch/issues/53815 x = torch.empty(2, device='meta') @@ -7228,28 +7319,39 @@ def test_normal_shape(self): self.assertEqual(torch.normal(tensor2145, tensor2345).size(), (2, 3, 4, 5)) # inputs are non-expandable tensors, but they have same number of elements - # TORCH_WARN_ONCE is used in torch.normal, only 1st assertEqual will show warn msg - if not warned: - self.assertWarnsRegex(UserWarning, "deprecated and the support will be removed", - lambda: self.assertEqual(torch.normal(tensor120, tensor2345).size(), (120,))) - warned = True - else: + with self.assertRaisesRegex( + RuntimeError, + r"The size of tensor a \(120\) must match the size of " + r"tensor b \(5\) at non-singleton dimension 3"): self.assertEqual(torch.normal(tensor120, tensor2345).size(), (120,)) - self.assertEqual(torch.normal(tensor2345, tensor120).size(), (2, 3, 4, 5)) + with 
self.assertRaisesRegex( + RuntimeError, + r"The size of tensor a \(5\) must match the size of " + r"tensor b \(120\) at non-singleton dimension 3"): + self.assertEqual(torch.normal(tensor2345, tensor120).size(), (2, 3, 4, 5)) # inputs are non-expandable tensors and they don't have same number of elements - with self.assertRaisesRegex(RuntimeError, "inconsistent tensor"): + with self.assertRaisesRegex( + RuntimeError, + r"The size of tensor a \(5\) must match the size of " + r"tensor b \(4\) at non-singleton dimension 3"): torch.normal(tensor2345, tensor4) # output and inputs are size compatible self.assertEqual(torch.normal(tensor2345, tensor2345, out=output2345).size(), (2, 3, 4, 5)) # output and inputs are not size compatible - with self.assertRaisesRegex(RuntimeError, "inconsistent tensor"): - # inputs are expandable but have different broadcasted size than output - torch.normal(tensor2345, tensor2145, out=output345) - with self.assertRaisesRegex(RuntimeError, "inconsistent tensor"): - # inputs are not expandable but reshapeable, output size is not the same as mean + with self.assertWarnsRegex( + UserWarning, + "This behavior is deprecated, and in a future PyTorch " + "release outputs will not be resized unless they have " + "zero elements"): + self.assertEqual(torch.normal(tensor2345, tensor2145, out=output345).size(), (2, 3, 4, 5)) + with self.assertRaisesRegex( + RuntimeError, + r"The size of tensor a \(5\) must match the size of " + r"tensor b \(120\) at non-singleton dimension 3"): + # inputs are not expandable, output size is not the same as mean torch.normal(tensor2345, tensor120, out=output345) def test_tensoriterator_output_setup(self): @@ -7354,12 +7456,12 @@ def test_numel(self): # Verifies that (deep)copies of dtypes are the same objects def test_copy_dtypes(self): - for dtype in get_all_dtypes(): + for dtype in all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool): copied_dtype = copy.deepcopy(dtype) self.assertIs(dtype, copied_dtype) def test_dtype_is_signed(self): - for dtype in get_all_dtypes(): + for dtype in all_types_and_complex_and(torch.half, torch.bfloat16, torch.half): self.assertEqual(dtype.is_signed, torch.is_signed(torch.tensor(0, dtype=dtype))) self.assertRaisesRegex(RuntimeError, 'not supported for quantized', lambda: torch.quint8.is_signed) @@ -7474,6 +7576,12 @@ def test_copy_transpose(self): self.assertEqual(y[:, 0], range(100)) self.assertEqual(y[:, 40], range(4000, 4100)) + x = torch.arange(100 * 100).reshape(100, 100).to(dtype=torch.complex32).t() + y = torch.empty(100, 100, dtype=torch.complex32) + y.copy_(x) + self.assertEqual(y[:, 0], range(100)) + self.assertEqual(y[:, 40], range(4000, 4100)) + # FIXME: Port to a more appropriate test suite def test_copy_broadcast(self): torch.zeros(5, 6).copy_(torch.zeros(6)) @@ -7486,7 +7594,7 @@ def test_copy_many_to_one(self): self.assertRaises(RuntimeError, lambda: torch.zeros(1, 6).expand(5, 6).copy_(torch.zeros(5, 6))) # FIXME: Port to a more appropriate test suite - def test_to(self): + def _test_to_with_layout(self, layout): def test_copy_behavior(t, non_blocking=False): self.assertIs(t, t.to(t, non_blocking=non_blocking)) self.assertIs(t, t.to(t.dtype, non_blocking=non_blocking)) @@ -7508,16 +7616,33 @@ def test_copy_behavior(t, non_blocking=False): self.assertIsNot(t, t.to(device, t.dtype, non_blocking=non_blocking, copy=True)) a = torch.tensor(5) + if layout == torch.sparse_csr: + a = torch.tensor([[0, 1, 2], [2, 0, 3]]).to_sparse_csr() test_copy_behavior(a) self.assertEqual(a.device, 
a.to('cpu').device) self.assertEqual(a.device, a.to('cpu', dtype=torch.float32).device) self.assertIs(torch.float32, a.to('cpu', dtype=torch.float32).dtype) self.assertEqual(a.device, a.to(torch.float32).device) self.assertIs(torch.float32, a.to(dtype=torch.float32).dtype) - self.assertEqual(a.data_ptr(), a.to('cpu').data_ptr()) - self.assertEqual(a.data_ptr(), a.to(dtype=a.dtype, device=a.device, copy=False).data_ptr()) - self.assertEqual(a.data_ptr(), a.to('cpu', copy=False).data_ptr()) - self.assertNotEqual(a.data_ptr(), a.to('cpu', copy=True).data_ptr()) + + def test_data_ptr(getter): + self.assertEqual(getter(a), getter(a.to('cpu'))) + self.assertEqual(getter(a), getter(a.to(dtype=a.dtype, device=a.device, copy=False))) + self.assertEqual(getter(a), getter(a.to('cpu', copy=False))) + self.assertNotEqual(getter(a), getter(a.to('cpu', copy=True))) + if layout == torch.sparse_csr: + # TODO: compressed sparse tensors currently don't support data_ptr. + # Exercising failure will allow us to widen coverage of this test once it does. + with self.assertRaisesRegex(RuntimeError, "Cannot access data pointer of Tensor that doesn't have storage"): + a.data_ptr() + # While compressed sparse tensors don't have a concept of data_ptr + # the underlying tensors do. The implementation of to appropriately forwards + # the call to the components, which is what we're test here. + test_data_ptr(lambda a: a.values().data_ptr()) + test_data_ptr(lambda a: a.crow_indices().data_ptr()) + test_data_ptr(lambda a: a.col_indices().data_ptr()) + else: + test_data_ptr(lambda a: a.data_ptr()) if torch.cuda.is_available(): for non_blocking in [True, False]: @@ -7532,6 +7657,10 @@ def test_copy_behavior(t, non_blocking=False): self.assertIs(torch.int32, b.to(dtype=torch.int32).dtype) self.assertEqual(b.device, b.to(dtype=torch.int32).device) + def test_to(self): + self._test_to_with_layout(torch.strided) + self._test_to_with_layout(torch.sparse_csr) + # FIXME: describe this test def test_as_subclass(self): class SubTensor(torch.Tensor): @@ -7802,6 +7931,22 @@ def test_type_conversion_via_dtype_name(self): self.assertEqual(cdouble.dtype, torch.complex128) self.assertEqual(cdouble.real, x.double()) self.assertEqual(cdouble.imag, torch.zeros_like(cdouble.imag)) + chalf = x.chalf() + self.assertEqual(chalf.dtype, torch.complex32) + self.assertEqual(chalf.real, x.half()) + self.assertEqual(chalf.imag, torch.zeros_like(chalf.imag)) + + def test_type_alias(self): + type_alias_map = {torch.float64: torch.double, + torch.float32: torch.float, + torch.int32: torch.int, + torch.int64: torch.long, + torch.int16: torch.short, + torch.float16: torch.half, + torch.complex32: torch.chalf, + torch.complex64: torch.cfloat} + for dtype, alias in type_alias_map.items(): + self.assertIs(alias, dtype) # FIXME: Describe this test def test_doc_template(self) -> None: @@ -8153,8 +8298,8 @@ def invert_perm(p): def generate_inputs(num_batches): # transposed tensors for perm1, perm2 in itertools.product(itertools.permutations((0, 1, 2)), repeat=2): - b1 = make_tensor((num_batches, M, N), device, dtype, low=-1, high=1) - b2 = make_tensor((num_batches, N, O), device, dtype, low=-1, high=1) + b1 = make_tensor((num_batches, M, N), dtype=dtype, device=device, low=-1, high=1) + b2 = make_tensor((num_batches, N, O), dtype=dtype, device=device, low=-1, high=1) b1 = b1.permute(perm1).contiguous().permute(invert_perm(perm1)) b2 = b2.permute(perm2).contiguous().permute(invert_perm(perm2)) yield b1, b2 @@ -8162,8 +8307,8 @@ def generate_inputs(num_batches): 
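# Editorial sketch (not part of the patch) of the dtype aliases and the chalf()
# conversion exercised a little above: aliases are the very same dtype objects,
# and chalf() produces a torch.complex32 tensor whose real part matches half().
import torch

assert torch.chalf is torch.complex32
assert torch.cfloat is torch.complex64
assert torch.half is torch.float16

x = torch.randn(4)
c = x.chalf()
assert c.dtype is torch.complex32
torch.testing.assert_close(c.real, x.half())
torch.testing.assert_close(c.imag, torch.zeros_like(c.imag))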
for b1, b2, b3, b4, b5, b6 in itertools.product((True, False), repeat=6): shape1 = (num_batches if b1 else 1, M if b2 else 1, N if b3 else 1) shape2 = (num_batches if b4 else 1, N if b5 else 1, O if b6 else 1) - b1 = make_tensor(shape1, device, dtype, low=-1, high=1).expand(num_batches, M, N) - b2 = make_tensor(shape2, device, dtype, low=-1, high=1).expand(num_batches, N, O) + b1 = make_tensor(shape1, dtype=dtype, device=device, low=-1, high=1).expand(num_batches, M, N) + b2 = make_tensor(shape2, dtype=dtype, device=device, low=-1, high=1).expand(num_batches, N, O) yield b1, b2 # zero-sized tensors for z1, z2, z3, z4 in itertools.product((True, False), repeat=4): diff --git a/test/test_type_promotion.py b/test/test_type_promotion.py index 01e96a3fe112..8c82b43ecba6 100644 --- a/test/test_type_promotion.py +++ b/test/test_type_promotion.py @@ -7,15 +7,14 @@ import torch from torch.testing._internal.common_utils import (TestCase, run_tests, load_tests, - TEST_NUMPY, torch_to_numpy_dtype_dict) + TEST_NUMPY, torch_to_numpy_dtype_dict, numpy_to_torch_dtype_dict) from torch.testing._internal.common_device_type import (instantiate_device_type_tests, onlyNativeDeviceTypes, - dtypes, dtypesIfCUDA, onlyCPU, expectedFailureMeta) + dtypes, onlyCPU, expectedFailureMeta, skipMeta) from torch.testing._internal.common_dtype import ( - get_all_dtypes, get_all_math_dtypes, get_all_int_dtypes, get_all_fp_dtypes + all_types_and_complex_and, get_all_math_dtypes, floating_types, get_all_dtypes ) -if TEST_NUMPY: - import numpy as np +import numpy as np # load_tests from torch.testing._internal.common_utils is used to automatically filter tests for # sharding on sandcastle. This line silences flake warnings @@ -184,12 +183,14 @@ def test_bfloat16(self, device): self.assertEqual(bf + scalar, scalar + bf) # with tensor - for dtype in get_all_dtypes(): + for dtype in all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool): t = torch.tensor(1, dtype=dtype, device=device) self.assertEqual(bf + t, t + bf) if dtype in (torch.float16, torch.float32, torch.float64, torch.cfloat, torch.cdouble): # Handles bfloat16 x float16 -> float32 promotion expected_dtype = dtype if dtype != torch.half else torch.float32 + elif dtype is torch.chalf: + expected_dtype = torch.cfloat elif dtype in (torch.bool, torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64, torch.bfloat16): expected_dtype = torch.bfloat16 @@ -200,6 +201,39 @@ def test_bfloat16(self, device): self.assertEqual(torch.promote_types(torch.bfloat16, dtype), expected_dtype) self.assertEqual((bf + t).dtype, expected_dtype) + @onlyNativeDeviceTypes + def test_complex_half(self, device): + # with scalar + chalf = torch.tensor(5.5, dtype=torch.chalf, device=device) + for scalar in (2.2, 5, 100000): # chalf + 100000 is inf + self.assertEqual((chalf * scalar).dtype, torch.chalf) + self.assertEqual(scalar * chalf, chalf * scalar) + + for scalar in (complex(1, 1), complex(-2, 0), complex(0, -3)): + self.assertEqual((chalf * scalar).dtype, torch.chalf) + self.assertEqual(chalf * scalar, scalar * chalf) + + # with tensor + dtypes = all_types_and_complex_and(torch.chalf, torch.half, torch.bfloat16, torch.bool) + for dtype in dtypes: + t = torch.tensor(1, dtype=dtype, device=device) + self.assertEqual(chalf * t, t * chalf) + if dtype in (torch.float16, torch.chalf): + expected_dtype = torch.chalf + elif dtype in (torch.float, torch.double, torch.bfloat16): + expected_dtype = torch.cdouble if dtype is torch.double else torch.cfloat + elif dtype in (torch.cfloat, 
torch.cdouble): + expected_dtype = dtype + elif dtype in (torch.bool, torch.uint8, + torch.int8, torch.int16, torch.int32, torch.int64): + expected_dtype = torch.chalf + else: + raise AssertionError(f'Missing dtype {dtype} not tested.') + + self.assertEqual(torch.promote_types(dtype, torch.chalf), expected_dtype) + self.assertEqual(torch.promote_types(torch.chalf, dtype), expected_dtype) + self.assertEqual((chalf * t).dtype, expected_dtype) + @float_double_default_dtype def test_alternate_result(self, device): f = torch.tensor([1, 1, 1, 1], dtype=torch.float, device=device) @@ -340,7 +374,8 @@ def test_create_bool_tensors(self, device): # this seems like odd behavior but ints also create float tensors, numpy doesn't have this function. self.assertEqual(torch.scalar_tensor(False, device=device), torch.tensor(0., device=device)) - @dtypes(*itertools.product(get_all_dtypes(), get_all_dtypes())) + @dtypes(*itertools.product(all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool), + all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool))) def test_result_type(self, device, dtypes): "Test result_type for tensor vs tensor and scalar vs scalar." @@ -520,12 +555,16 @@ def test_complex_assertraises(self, device): dict(name="ne", compare_op=lambda x, y: x != y, ), ] for op in comparison_ops: - for dt1 in get_all_math_dtypes(device): - for dt2 in get_all_math_dtypes(device): - if (dt1.is_complex or dt2.is_complex) and not (op["name"] == "eq" or op["name"] == "ne"): - u = torch.tensor([1], dtype=dt1, device=device) - v = torch.tensor([2], dtype=dt2, device=device) - self.assertRaises(RuntimeError, lambda: torch.tensor([op["compare_op"](u, v)], dtype=torch.bool)) + is_cuda = torch.device(device).type == 'cuda' + dtypes = get_all_dtypes(include_half=is_cuda, + include_bfloat16=False, include_bool=False, + include_complex32=True) + + for dt1, dt2 in itertools.product(dtypes, dtypes): + if (dt1.is_complex or dt2.is_complex) and not (op["name"] == "eq" or op["name"] == "ne"): + u = torch.tensor([1], dtype=dt1, device=device) + v = torch.tensor([2], dtype=dt2, device=device) + self.assertRaises(RuntimeError, lambda: torch.tensor([op["compare_op"](u, v)], dtype=torch.bool)) @float_double_default_dtype def test_lt_with_type_promotion(self, device): @@ -562,7 +601,7 @@ def test_promote_types(self, device): @float_double_default_dtype def test_promote_self(self, device): - for dtype in get_all_dtypes(): + for dtype in all_types_and_complex_and(torch.half, torch.bfloat16, torch.chalf, torch.bool): self.assertEqual(torch.promote_types(dtype, dtype), dtype) @expectedFailureMeta @@ -811,8 +850,8 @@ def test_integer_addcdiv_deprecated(self, device, dtype): @unittest.skipIf(not TEST_NUMPY, "NumPy not found") @float_double_default_dtype @onlyCPU - @dtypes(*list(itertools.product(torch_to_numpy_dtype_dict.keys(), - torch_to_numpy_dtype_dict.keys()))) + @dtypes(*list(itertools.product(set(numpy_to_torch_dtype_dict.values()), + set(numpy_to_torch_dtype_dict.values())))) def test_numpy_array_binary_ufunc_promotion(self, device, dtypes): import operator np_type = torch_to_numpy_dtype_dict[dtypes[0]] @@ -880,7 +919,7 @@ def test_numpy_array_binary_ufunc_promotion(self, device, dtypes): @onlyNativeDeviceTypes def test_cat_different_dtypes(self, device): - dtypes = get_all_dtypes(include_bfloat16=False) + dtypes = all_types_and_complex_and(torch.half, torch.bool) for x_dtype, y_dtype in itertools.product(dtypes, dtypes): x_vals, y_vals = [1, 2, 3], [4, 5, 6] @@ -899,7 +938,7 @@ def 
test_cat_different_dtypes(self, device): @onlyNativeDeviceTypes def test_cat_out_different_dtypes(self, device): - dtypes = get_all_dtypes(include_bfloat16=False, include_bool=False) + dtypes = all_types_and_complex_and(torch.half) for x_dtype, y_dtype, out_dtype in itertools.product(dtypes, dtypes, dtypes): out = torch.zeros(6, device=device, dtype=out_dtype) x = torch.tensor([1, 2, 3], device=device, dtype=x_dtype) @@ -937,7 +976,11 @@ def test_unary_op_out_casting(self, device, dtypes): elif op in real_only_ops and dtypes[0].is_complex: with self.assertRaises(RuntimeError): op(t, out=out) - elif op in float_only_ops and (not dtypes[0].is_floating_point and not dtypes[0].is_complex): + elif ( + op in float_only_ops + and (not dtypes[0].is_floating_point and not dtypes[0].is_complex) + and device != "meta" + ): with self.assertRaises(RuntimeError): op(t, out=out) else: @@ -947,6 +990,7 @@ def test_unary_op_out_casting(self, device, dtypes): # Verifies that the out= argument doesn't affect the computation, that # is, out = op(...) and op(..., out=out) produce the same result. @onlyNativeDeviceTypes + @skipMeta def test_computation_ignores_out(self, device): t = torch.tensor(33000, dtype=torch.float16, device=device) out = torch.empty(0, dtype=torch.float64, device=device) @@ -966,37 +1010,70 @@ def test_computation_ignores_out(self, device): self.assertEqual(result, a - b, exact_dtype=False) self.assertNotEqual(result, a.double() - b, exact_dtype=False) - @dtypesIfCUDA(*itertools.product(get_all_dtypes(include_bfloat16=False, include_complex=False), - get_all_dtypes(include_bfloat16=False, include_complex=False))) - @dtypes(*itertools.product(get_all_dtypes(include_half=False, include_bfloat16=False, - include_complex=False), - get_all_dtypes(include_half=False, include_bfloat16=False, - include_complex=False))) - def test_atan2_type_promotion(self, device, dtypes): - dtype1, dtype2 = dtypes - default_float = torch.get_default_dtype() - - def is_int(dtype): - return dtype in get_all_int_dtypes() + [torch.bool] - - def is_float(dtype): - return dtype in get_all_fp_dtypes(include_half=True, include_bfloat16=False) - - def get_binary_float_result_type(x, y): - dtype1 = x.dtype - dtype2 = y.dtype - if is_float(dtype1) and is_float(dtype2): - return torch.result_type(x, y) - elif is_float(dtype1) and is_int(dtype2): - return dtype1 - elif is_int(dtype1) and is_float(dtype2): - return dtype2 - elif is_int(dtype1) and is_int(dtype2): - return default_float - - x = torch.tensor(1, dtype=dtype1, device=device) - y = torch.tensor(2, dtype=dtype2, device=device) - self.assertEqual(get_binary_float_result_type(x, y), torch.atan2(x, y).dtype) + @onlyNativeDeviceTypes + @dtypes(*itertools.product((torch.bool, torch.int, torch.float, torch.double), repeat=3)) + def test_clamp_type_promotion(self, device, dtypes): + dtype0, dtype1, dtype2 = dtypes + S = 4 + + def make_tensor(size, dtype): + if dtype == torch.bool: + return torch.randint(2, size, dtype=dtype, device=device) + elif dtype == torch.int: + return torch.randint(10, size, dtype=dtype, device=device) + else: + return torch.randn(size, dtype=dtype, device=device) + min_t = make_tensor((S,), dtype1) + max_t = make_tensor((S,), dtype2) + mins = (min_t, min_t[0], min_t[0].item()) + maxs = (max_t, max_t[0], max_t[0].item()) + inp = make_tensor((S,), dtype0) + for min_v, max_v in itertools.product(mins, maxs): + if type(max_v) != type(min_v): + continue + if isinstance(min_v, torch.Tensor) and min_v.ndim == 0 and max_v.ndim == 0: + continue # 0d 
tensors go to scalar overload, and it's tested separately + + def expected_type(inp, max, min): + arg1, arg2 = max, min + if isinstance(max, torch.Tensor) and max.ndim == 0: + # first do a maybe dimensional boundary + arg1, arg2 = min, max + exp_type = torch.result_type(inp, arg1) + inp_new = torch.empty_like(inp, dtype=exp_type) + return torch.result_type(inp_new, arg2) + exp_type = expected_type(inp, min_v, max_v) + if exp_type != torch.bool: + actual = torch.clamp(inp, min_v, max_v) + inps = list(map(lambda x: x.to(exp_type) if isinstance(x, torch.Tensor) else x, + (inp, min_v, max_v))) + expected = torch.clamp(inps[0], inps[1], inps[2]) + self.assertEqual(actual, expected) + if inp.dtype in floating_types() or exp_type == inp.dtype: + actual = torch.clamp_(inp, min_v, max_v) + self.assertEqual(actual, expected, exact_dtype=False) + for val in mins: + def expected_type(inp, val): + return torch.result_type(inp, val) + exp_type = expected_type(inp, val) + if exp_type != torch.bool: + actual = torch.clamp_min(inp, val) + inps = list(map(lambda x: x.to(exp_type) if isinstance(x, torch.Tensor) else x, + (inp, val))) + expected = torch.clamp_min(inps[0], inps[1]) + self.assertEqual(actual.dtype, exp_type) + self.assertEqual(actual, expected) + if inp.dtype == exp_type: + actual = torch.clamp_min_(inp, val) + self.assertEqual(actual, expected) + actual = torch.clamp_max(inp, val) + expected = torch.clamp_max(inps[0], inps[1]) + self.assertEqual(actual, expected) + if inp.dtype in floating_types() or exp_type == inp.dtype: + actual = torch.clamp_max_(inp, val) + self.assertEqual(actual, expected, exact_dtype=False) + + instantiate_device_type_tests(TestTypePromotion, globals()) diff --git a/test/test_unary_ufuncs.py b/test/test_unary_ufuncs.py index 2a113799fff6..3c443dd5bc52 100644 --- a/test/test_unary_ufuncs.py +++ b/test/test_unary_ufuncs.py @@ -11,18 +11,47 @@ from torch._six import inf, nan from torch.testing._internal.common_utils import ( - TestCase, run_tests, torch_to_numpy_dtype_dict, numpy_to_torch_dtype_dict, - suppress_warnings, TEST_SCIPY, slowTest, skipIfNoSciPy, IS_WINDOWS, gradcheck) + TestCase, + run_tests, + torch_to_numpy_dtype_dict, + numpy_to_torch_dtype_dict, + suppress_warnings, + TEST_SCIPY, + slowTest, + skipIfNoSciPy, + IS_WINDOWS, + gradcheck, + TEST_WITH_ASAN, +) from torch.testing._internal.common_methods_invocations import ( - unary_ufuncs, _NOTHING) + unary_ufuncs, + generate_elementwise_unary_tensors, + _NOTHING, + generate_elementwise_unary_small_value_tensors, + generate_elementwise_unary_large_value_tensors, + generate_elementwise_unary_extremal_value_tensors, +) from torch.testing._internal.common_device_type import ( - instantiate_device_type_tests, ops, dtypes, onlyCPU, onlyNativeDeviceTypes, - onlyCUDA, dtypesIfCUDA, precisionOverride, skipCUDAIfRocm, dtypesIfCPU, - OpDTypes) + instantiate_device_type_tests, + ops, + dtypes, + onlyCPU, + onlyNativeDeviceTypes, + onlyCUDA, + dtypesIfCUDA, + precisionOverride, + dtypesIfCPU, +) + from torch.testing import make_tensor from torch.testing._internal.common_dtype import ( - floating_types_and, all_types_and_complex_and, floating_and_complex_types_and, get_all_dtypes, get_all_math_dtypes, - get_all_int_dtypes, get_all_fp_dtypes, get_all_complex_dtypes + floating_types_and, + all_types_and_complex_and, + integral_types_and, + get_all_math_dtypes, + complex_types, + all_types_and, + floating_and_complex_types_and, ) if TEST_SCIPY: @@ -45,140 +74,8 @@ # (https://numpy.org/doc/1.18/reference/ufuncs.html) for more 
details # about the concept of ufuncs. -# Functions tested here: -# - -# Interesting values and extremal values for different dtypes -_unsigned_int_vals = (0, 1, 55, 127) -_int_vals = (0, -1, 1, -55, 55, -127, 127, -128, 128) -_large_int_vals = (-1113, 1113, -10701, 10701) -_float_vals = (0., - -.001, .001, - -.25, .25, - -1., 1., - -math.pi / 2, math.pi / 2, - -math.pi + .00001, math.pi - .00001, - -math.pi, math.pi, - -math.pi - .00001, math.pi + .00001) -_large_float16_vals = (-501, 501, - -1001.2, 1001.2, - -13437.7, 13437.7) -_large_float_vals = _large_float16_vals + (-4988429.2, 4988429.2, -1e20, 1e20) -_float_extremals = (float('inf'), float('-inf'), float('nan')) -_medium_length = 812 -_large_size = (1029, 917) - - -# Replace values satisfying condition with a safe value. This is used to block -# out values the could cause singularity like tan(pi/2) -def replace_values_in_tensor(tensor, condition, safe_value): - mask = condition(tensor) - tensor.masked_fill_(mask, safe_value) - - -# Returns generator of tensors of different sizes filled with values in domain -# and with intested region filled with `vals`. This will help test different code -# paths for the given vals -# `filter_` can be either None or a tuple of (condition, safe_value). When not None -# values satisfying `condition`` will be replaced with `safe_value` in the generated -# tensor. This is useful to avoid singularities when generating inputs for tests, such -# as tan(pi/2) -def generate_tensors_from_vals(vals, device, dtype, domain, filter_): - offset = 63 - - assert _large_size[1] > (_medium_length + offset) # large tensor should be large enough - assert len(vals) < _medium_length # medium tensor should contain all vals - assert _medium_length % 4 == 0 # ensure vectorized code coverage - - if not dtype.is_complex: - # Filter values based on Operators domain. - # Note: Complex numbers don't belong to ordered field, - # so we don't filter for them. - if domain[0] is not None: - vals = list(filter(lambda x: x >= domain[0], vals)) - if domain[1] is not None: - vals = list(filter(lambda x: x < domain[1], vals)) - - if filter_ is not None: - condition, safe_value = filter_ - - # Constructs the large tensor containing vals - large_tensor = make_tensor(_large_size, device=device, dtype=dtype, low=domain[0], high=domain[1]) - - # Inserts the vals at an odd place - large_tensor[57][offset:offset + len(vals)] = torch.tensor(vals, device=device, dtype=dtype) - - if filter_ is not None: - replace_values_in_tensor(large_tensor, condition, safe_value) - - # Takes a medium sized copy of the large tensor containing vals - medium_tensor = large_tensor[57][offset:offset + _medium_length] - - if filter_ is not None: - replace_values_in_tensor(medium_tensor, condition, safe_value) - - # Constructs scalar tensors - scalar_tensors = (t.squeeze() for t in torch.split(medium_tensor, 1)) - - # Tensors with no elements - empty_sizes = ((0,), (0, 3, 3), (1, 0, 5), (6, 0, 0, 0), (3, 0, 1, 0)) - empty_tensors = (torch.empty(size, device=device, dtype=dtype) for size in empty_sizes) - - return chain(empty_tensors, scalar_tensors, (medium_tensor,), (large_tensor,)) - - -# [Note generate_numeric_tensors, generate_numeric_tensors_hard, -# and generate_numeric_tensors_extremal] -# -# Returns an iterable of contiguous tensors with the same storage on the requested -# device and with the requested dtype. 
-# -# This function is intended to test the non-vectorized and vectorized code -# paths of unary functions, as well as their handling of odd tensor -# sizes (like zero-dim tensors and tensors with zero elements). -# -# The iterable will include an empty tensor, tensors with no elements, -# zero dim (scalar) tensors, small 1D tensors, a medium 1D tensor, and -# a large 2D tensor. -# -# These tensors will include interesting values. The generate_numeric_tensors_hard -# tests larger values (>500) and generate_numeric_tensors_extremal tests extremal -# values like -inf, inf, and nan. -# -# The randomly generated values can be restricted by the domain -# argument. -def generate_numeric_tensors(device, dtype, *, - domain=(None, None), - filter_=None): - # Special-cases bool - if dtype is torch.bool: - tensors = (torch.empty(0, device=device, dtype=torch.bool), - torch.tensor(True, device=device), - torch.tensor(False, device=device), - torch.tensor((True, False), device=device), - make_tensor((_medium_length,), device=device, dtype=dtype, low=None, high=None), - make_tensor(_large_size, device=device, dtype=dtype, low=None, high=None)) - return tensors - - # Acquires dtype-specific vals - if dtype.is_floating_point or dtype.is_complex: - vals = _float_vals - - # Converts float -> complex vals if dtype is complex - if dtype.is_complex: - vals = tuple(complex(x, y) for x, y in product(vals, vals)) - elif dtype is torch.uint8: - vals = _unsigned_int_vals - else: # dtypes is a signed integer type - assert dtype in (torch.int8, torch.int16, torch.int32, torch.int64) - vals = _int_vals - - return generate_tensors_from_vals(vals, device, dtype, domain, filter_) - -def generate_numeric_tensors_hard(device, dtype, *, - domain=(None, None), - filter_=None): +def generate_numeric_tensors_hard(device, dtype, *, domain=(None, None), filter_=None): is_signed_integral = dtype in (torch.int8, torch.int16, torch.int32, torch.int64) if not (dtype.is_floating_point or dtype.is_complex or is_signed_integral): return () @@ -190,18 +87,23 @@ def generate_numeric_tensors_hard(device, dtype, *, else: vals = _large_float_vals elif dtype.is_complex: - vals = tuple(complex(x, y) for x, y in chain(product(_large_float_vals, _large_float_vals), - product(_float_vals, _large_float_vals), - product(_large_float_vals, _float_vals))) + vals = tuple( + complex(x, y) + for x, y in chain( + product(_large_float_vals, _large_float_vals), + product(_float_vals, _large_float_vals), + product(_large_float_vals, _float_vals), + ) + ) else: vals = _large_int_vals return generate_tensors_from_vals(vals, device, dtype, domain, filter_) -def generate_numeric_tensors_extremal(device, dtype, *, - domain=(None, None), - filter_=None): +def generate_numeric_tensors_extremal( + device, dtype, *, domain=(None, None), filter_=None +): if not (dtype.is_floating_point or dtype.is_complex): return () @@ -209,9 +111,14 @@ def generate_numeric_tensors_extremal(device, dtype, *, if dtype.is_floating_point: vals = _float_extremals elif dtype.is_complex: - vals = tuple(complex(x, y) for x, y in chain(product(_float_extremals, _float_extremals), - product(_float_vals, _float_extremals), - product(_float_extremals, _float_vals))) + vals = tuple( + complex(x, y) + for x, y in chain( + product(_float_extremals, _float_extremals), + product(_float_vals, _float_extremals), + product(_float_extremals, _float_vals), + ) + ) return generate_tensors_from_vals(vals, device, dtype, domain, filter_) @@ -221,8 +128,10 @@ def generate_numeric_tensors_extremal(device, 
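# Editorial sketch (with assumed stand-in tuples) of the chain/product pattern
# used just below to build complex test values: pairing small and large
# magnitudes ensures the real part, the imaginary part, or both are stressed
# with large inputs.
from itertools import chain, product

small = (0.0, -0.25, 0.25, 1.0)          # stand-in for _float_vals
large = (-1001.2, 1001.2, -1e20, 1e20)   # stand-in for _large_float_vals

complex_vals = tuple(
    complex(x, y)
    for x, y in chain(
        product(large, large),   # both components large
        product(small, large),   # only the imaginary component large
        product(large, small),   # only the real component large
    )
)
assert all(abs(c.real) > 500 or abs(c.imag) > 500 for c in complex_vals)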
dtype, *, class TestUnaryUfuncs(TestCase): exact_dtype = True - @ops([_fn for _fn in unary_ufuncs if _fn.domain != (None, None)], - allowed_dtypes=floating_types_and(torch.bfloat16, torch.half)) + @ops( + [_fn for _fn in unary_ufuncs if _fn.domain != (None, None)], + allowed_dtypes=floating_types_and(torch.bfloat16, torch.half), + ) def test_float_domains(self, device, dtype, op): eps = (1e-5, 1e-3, 1e-1, 1, 2, 10, 20, 50, 100) @@ -240,11 +149,14 @@ def test_float_domains(self, device, dtype, op): continue result = op(lower_tensor) - self.assertEqual(result.item(), float('nan'), - msg=("input of {0} outside lower domain boundary" - " {1} produced {2}, not nan!").format(lower_tensor.item(), - low, - result.item())) + self.assertEqual( + result.item(), + float("nan"), + msg=( + "input of {0} outside lower domain boundary" + " {1} produced {2}, not nan!" + ).format(lower_tensor.item(), low, result.item()), + ) if high is not None: high_tensor = torch.tensor(high, device=device, dtype=dtype) @@ -256,15 +168,20 @@ def test_float_domains(self, device, dtype, op): continue result = op(higher_tensor) - self.assertEqual(result.item(), float('nan'), - msg=("input of {0} outside upper domain boundary" - " {1} produced {2}, not nan!").format(higher_tensor.item(), - high, - result.item())) + self.assertEqual( + result.item(), + float("nan"), + msg=( + "input of {0} outside upper domain boundary" + " {1} produced {2}, not nan!" + ).format(higher_tensor.item(), high, result.item()), + ) # Helper for comparing torch tensors and numpy arrays # TODO: should this or assertEqual also validate that strides are equal? - def assertEqualHelper(self, actual, expected, msg, *, dtype, exact_dtype=True, **kwargs): + def assertEqualHelper( + self, actual, expected, msg, *, dtype, exact_dtype=True, **kwargs + ): assert isinstance(actual, torch.Tensor) # Some NumPy functions return scalars, not arrays @@ -273,46 +190,96 @@ def assertEqualHelper(self, actual, expected, msg, *, dtype, exact_dtype=True, * elif isinstance(expected, np.ndarray): # Handles exact dtype comparisons between arrays and tensors if exact_dtype: - # Allows array dtype to be float32 when comparing with bfloat16 tensors - # since NumPy doesn't support the bfloat16 dtype - # Also ops like scipy.special.erf, scipy.special.erfc, etc, promote float16 - # to float32 - if expected.dtype == np.float32: - assert actual.dtype in (torch.float16, torch.bfloat16, torch.float32) - else: - assert expected.dtype == torch_to_numpy_dtype_dict[actual.dtype] - - self.assertEqual(actual, - torch.from_numpy(expected).to(actual.dtype), - msg, - exact_device=False, - **kwargs) + if ( + actual.dtype is torch.bfloat16 + or expected.dtype != torch_to_numpy_dtype_dict[actual.dtype] + ): + # Allows array dtype to be float32 when comparing with bfloat16 tensors + # since NumPy doesn't support the bfloat16 dtype + # Also ops like scipy.special.erf, scipy.special.erfc, etc, promote float16 + # to float32 + if expected.dtype == np.float32: + assert actual.dtype in ( + torch.float16, + torch.bfloat16, + torch.float32, + ) + elif expected.dtype == np.float64: + assert actual.dtype in ( + torch.float16, + torch.bfloat16, + torch.float32, + torch.float64, + ) + else: + self.fail( + "Expected dtype {0} but got {1}!".format( + expected.dtype, actual.dtype + ) + ) + + self.assertEqual( + actual, + torch.from_numpy(expected).to(actual.dtype), + msg, + exact_device=False, + **kwargs + ) else: self.assertEqual(actual, expected, msg, exact_device=False, **kwargs) # Tests that the function and 
its (array-accepting) reference produce the same # values on given tensors def _test_reference_numerics(self, dtype, op, tensors, equal_nan=True): - def _helper_reference_numerics(expected, actual, msg, exact_dtype, equal_nan=True): - if not torch.can_cast(numpy_to_torch_dtype_dict[expected.dtype.type], dtype): + def _helper_reference_numerics( + expected, actual, msg, exact_dtype, equal_nan=True + ): + if not torch.can_cast( + numpy_to_torch_dtype_dict[expected.dtype.type], dtype + ): exact_dtype = False if dtype in [torch.uint8, torch.int8, torch.bool]: # NOTE: For these dtypes, PyTorch computes in the default scalar type (float) # while NumPy computes in float16 - self.assertEqualHelper(actual, expected, msg, dtype=dtype, - exact_dtype=exact_dtype, rtol=1e-3, atol=1e-2) + self.assertEqualHelper( + actual, + expected, + msg, + dtype=dtype, + exact_dtype=exact_dtype, + rtol=1e-3, + atol=1e-2, + ) elif dtype is torch.bfloat16: # Ref: https://github.com/pytorch/pytorch/blob/master/torch/testing/_internal/common_utils.py#L1149 - self.assertEqualHelper(actual, expected, msg, dtype=dtype, - exact_dtype=exact_dtype, rtol=16e-3, atol=1e-5) + self.assertEqualHelper( + actual, + expected, + msg, + dtype=dtype, + exact_dtype=exact_dtype, + rtol=16e-3, + atol=1e-5, + ) + else: - self.assertEqualHelper(actual, expected, msg, dtype=dtype, equal_nan=equal_nan, exact_dtype=exact_dtype) + self.assertEqualHelper( + actual, + expected, + msg, + dtype=dtype, + equal_nan=equal_nan, + exact_dtype=exact_dtype, + ) for t in tensors: + t = t.input torch_kwargs, numpy_kwargs = op.sample_kwargs(t.device, dtype, t) if dtype is torch.bfloat16: a = t.cpu().to(torch.float32).numpy() + elif dtype is torch.complex32: + a = t.cpu().to(torch.complex64).numpy() else: a = t.cpu().numpy() @@ -321,15 +288,19 @@ def _helper_reference_numerics(expected, actual, msg, exact_dtype, equal_nan=Tru # Crafts a custom error message for smaller, printable tensors if t.numel() < 10: - msg = ("Failed to produce expected results! Input tensor was" - " {0}, torch result is {1}, and reference result is" - " {2}.").format(t, actual, expected) + msg = ( + "Failed to produce expected results! Input tensor was" + " {0}, torch result is {1}, and reference result is" + " {2}." + ).format(t, actual, expected) else: msg = None exact_dtype = True if isinstance(actual, torch.Tensor): - _helper_reference_numerics(expected, actual, msg, exact_dtype, equal_nan) + _helper_reference_numerics( + expected, actual, msg, exact_dtype, equal_nan + ) else: for x, y in zip(expected, actual): # testing multi-outputs results @@ -339,58 +310,72 @@ def _helper_reference_numerics(expected, actual, msg, exact_dtype, equal_nan=Tru # values on a range of tensors, including empty tensors, scalar tensors, # 1D tensors and a large 2D tensor with interesting and extremal values # and noncontiguities. 
+ @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") @suppress_warnings @ops(reference_filtered_ops) def test_reference_numerics_normal(self, device, dtype, op): - tensors = generate_numeric_tensors(device, dtype, - domain=op.domain, - filter_=op.reference_numerics_filter) + tensors = generate_elementwise_unary_tensors( + op, device=device, dtype=dtype, requires_grad=False + ) self._test_reference_numerics(dtype, op, tensors) + @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") @suppress_warnings - @ops(reference_filtered_ops, allowed_dtypes=floating_and_complex_types_and( - torch.bfloat16, torch.half, torch.int8, torch.int16, torch.int32, torch.int64 - )) - def test_reference_numerics_hard(self, device, dtype, op): - if not op.handles_large_floats: - raise self.skipTest("This op does not handle large values") - - tensors = generate_numeric_tensors_hard(device, dtype, - domain=op.domain) + @ops(reference_filtered_ops) + def test_reference_numerics_small(self, device, dtype, op): + if dtype in (torch.bool,): + raise self.skipTest("bool has no small values") + + tensors = generate_elementwise_unary_small_value_tensors( + op, device=device, dtype=dtype, requires_grad=False + ) self._test_reference_numerics(dtype, op, tensors) + @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") @suppress_warnings - @ops(reference_filtered_ops, - allowed_dtypes=floating_and_complex_types_and(torch.bfloat16, torch.half)) - def test_reference_numerics_extremal(self, device, dtype, op): - handles_extremals = (op.handles_complex_extremals if - dtype in (torch.cfloat, torch.cdouble) else op.handles_extremals) - if not handles_extremals: - raise self.skipTest("This op does not handle extremal values") + @ops(reference_filtered_ops) + def test_reference_numerics_large(self, device, dtype, op): + if dtype in (torch.bool, torch.uint8, torch.int8): + raise self.skipTest("bool, uint8, and int8 dtypes have no large values") - tensors = generate_numeric_tensors_extremal(device, dtype, - domain=op.domain) + tensors = generate_elementwise_unary_large_value_tensors( + op, device=device, dtype=dtype, requires_grad=False + ) + self._test_reference_numerics(dtype, op, tensors) + @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") + @suppress_warnings + @ops( + reference_filtered_ops, + allowed_dtypes=floating_and_complex_types_and(torch.bfloat16, torch.half), + ) + def test_reference_numerics_extremal(self, device, dtype, op): + tensors = generate_elementwise_unary_extremal_value_tensors( + op, device=device, dtype=dtype, requires_grad=False + ) self._test_reference_numerics(dtype, op, tensors) # Tests for testing (non)contiguity consistency - @ops(unary_ufuncs) def test_contig_vs_every_other(self, device, dtype, op): - contig = make_tensor((1026,), device=device, dtype=dtype, - low=op.domain[0], high=op.domain[1]) + contig = make_tensor( + (1026,), device=device, dtype=dtype, low=op.domain[0], high=op.domain[1] + ) non_contig = contig[::2] self.assertTrue(contig.is_contiguous()) self.assertFalse(non_contig.is_contiguous()) torch_kwargs, _ = op.sample_kwargs(device, dtype, non_contig) - self.assertEqual(op(contig, **torch_kwargs)[::2], op(non_contig, **torch_kwargs)) + self.assertEqual( + op(contig, **torch_kwargs)[::2], op(non_contig, **torch_kwargs) + ) @ops(unary_ufuncs) def test_contig_vs_transposed(self, device, dtype, op): - contig = make_tensor((789, 357), device=device, dtype=dtype, - low=op.domain[0], high=op.domain[1]) + contig = make_tensor( + (789, 357), device=device, dtype=dtype, low=op.domain[0], 
high=op.domain[1] + ) non_contig = contig.T self.assertTrue(contig.is_contiguous()) @@ -403,8 +388,9 @@ def test_contig_vs_transposed(self, device, dtype, op): def test_non_contig(self, device, dtype, op): shapes = [(5, 7), (1024,)] for shape in shapes: - contig = make_tensor(shape, device, dtype, - low=op.domain[0], high=op.domain[1]) + contig = make_tensor( + shape, dtype=dtype, device=device, low=op.domain[0], high=op.domain[1] + ) non_contig = torch.empty(shape + (2,), device=device, dtype=dtype)[..., 0] non_contig.copy_(contig) @@ -416,8 +402,13 @@ def test_non_contig(self, device, dtype, op): @ops(unary_ufuncs) def test_non_contig_index(self, device, dtype, op): - contig = make_tensor((2, 2, 1, 2), device, dtype, - low=op.domain[0], high=op.domain[1]) + contig = make_tensor( + (2, 2, 1, 2), + dtype=dtype, + device=device, + low=op.domain[0], + high=op.domain[1], + ) non_contig = contig[:, 1, ...] contig = non_contig.contiguous() @@ -431,8 +422,9 @@ def test_non_contig_index(self, device, dtype, op): def test_non_contig_expand(self, device, dtype, op): shapes = [(1, 3), (1, 7), (5, 7)] for shape in shapes: - contig = make_tensor(shape, device, dtype, - low=op.domain[0], high=op.domain[1]) + contig = make_tensor( + shape, dtype=dtype, device=device, low=op.domain[0], high=op.domain[1] + ) non_contig = contig.clone().expand(3, -1, -1) self.assertTrue(contig.is_contiguous()) @@ -442,13 +434,15 @@ def test_non_contig_expand(self, device, dtype, op): contig = op(contig, **torch_kwargs) non_contig = op(non_contig, **torch_kwargs) for i in range(3): - self.assertEqual(contig, non_contig[i], - msg='non-contiguous expand[' + str(i) + ']') + self.assertEqual( + contig, non_contig[i], msg="non-contiguous expand[" + str(i) + "]" + ) @ops(unary_ufuncs) def test_contig_size1(self, device, dtype, op): - contig = make_tensor((5, 100), device, dtype, - low=op.domain[0], high=op.domain[1]) + contig = make_tensor( + (5, 100), dtype=dtype, device=device, low=op.domain[0], high=op.domain[1] + ) contig = contig[:1, :50] contig2 = torch.empty(contig.size(), device=device, dtype=dtype) contig2.copy_(contig) @@ -461,8 +455,13 @@ def test_contig_size1(self, device, dtype, op): @ops(unary_ufuncs) def test_contig_size1_large_dim(self, device, dtype, op): - contig = make_tensor((5, 2, 3, 1, 4, 5, 3, 2, 1, 2, 3, 4), device, dtype, - low=op.domain[0], high=op.domain[1]) + contig = make_tensor( + (5, 2, 3, 1, 4, 5, 3, 2, 1, 2, 3, 4), + dtype=dtype, + device=device, + low=op.domain[0], + high=op.domain[1], + ) contig = contig[:1, :, :, :, :, :, :, :, :, :, :, :] contig2 = torch.empty(contig.size(), device=device, dtype=dtype) contig2.copy_(contig) @@ -477,8 +476,9 @@ def test_contig_size1_large_dim(self, device, dtype, op): # per-batch computation. 
@ops(unary_ufuncs) def test_batch_vs_slicing(self, device, dtype, op): - input = make_tensor((1024, 512), dtype=dtype, device=device, - low=op.domain[0], high=op.domain[1]) + input = make_tensor( + (1024, 512), dtype=dtype, device=device, low=op.domain[0], high=op.domain[1] + ) torch_kwargs, _ = op.sample_kwargs(device, dtype, input) actual = op(input, **torch_kwargs) @@ -486,43 +486,14 @@ def test_batch_vs_slicing(self, device, dtype, op): self.assertEqual(actual, expected) - def _test_out_arg(self, op, input, output, expected, **kwargs): - if op.safe_casts_outputs: - expect_fail = not torch.can_cast(expected.dtype, output.dtype) - else: - expect_fail = output.dtype != expected.dtype - - if expect_fail: - with self.assertRaises(RuntimeError): - op(input, out=output, **kwargs) - else: - res = op(input, out=output, **kwargs) - self.assertTrue(res is output) - self.assertEqual(output, expected.to(output.dtype)) - - @ops(unary_ufuncs, dtypes=OpDTypes.supported) - def test_out_arg_all_dtypes(self, device, dtype, op): - if not op.supports_out: - self.skipTest("Skipped! Op doesn't support out= kwarg.") - - input = make_tensor((64, 64), dtype=dtype, device=device, - low=op.domain[0], high=op.domain[1]) - torch_kwargs, _ = op.sample_kwargs(device, dtype, input) - expected = op(input, **torch_kwargs) - - for out_dtype in all_types_and_complex_and(torch.bool, torch.half): - out = torch.empty_like(input, dtype=out_dtype) - self._test_out_arg(op, input, out, expected, **torch_kwargs) - - @dtypes(*(get_all_int_dtypes() + [torch.bool] + - get_all_fp_dtypes(include_bfloat16=False))) + @dtypes(*all_types_and(torch.bool, torch.half)) def test_nan_to_num(self, device, dtype): for contiguous in [False, True]: - x = make_tensor((64, 64), low=0., high=100., dtype=dtype, device=device) + x = make_tensor((64, 64), low=0.0, high=100.0, dtype=dtype, device=device) if dtype.is_floating_point: # Add extremal values. 
- extremals = [float('nan'), float('inf'), -float('inf')] + extremals = [float("nan"), float("inf"), -float("inf")] for idx, extremal in zip(torch.randint(0, 63, (3,)), extremals): x[idx, :] = extremal @@ -534,12 +505,16 @@ def test_nan_to_num(self, device, dtype): posinf = random.random() * 5 neginf = random.random() * 10 - self.compare_with_numpy(lambda x: x.nan_to_num(nan=nan, posinf=posinf), - lambda x: np.nan_to_num(x, nan=nan, posinf=posinf), - x) - self.compare_with_numpy(lambda x: x.nan_to_num(posinf=posinf, neginf=neginf), - lambda x: np.nan_to_num(x, posinf=posinf, neginf=neginf), - x) + self.compare_with_numpy( + lambda x: x.nan_to_num(nan=nan, posinf=posinf), + lambda x: np.nan_to_num(x, nan=nan, posinf=posinf), + x, + ) + self.compare_with_numpy( + lambda x: x.nan_to_num(posinf=posinf, neginf=neginf), + lambda x: np.nan_to_num(x, posinf=posinf, neginf=neginf), + x, + ) # Out Variant out = torch.empty_like(x) @@ -551,10 +526,35 @@ def test_nan_to_num(self, device, dtype): torch.nan_to_num(x, out=out, nan=nan, posinf=posinf, neginf=neginf) self.assertEqual(result, out) + @onlyCPU + def test_nan_to_num_bfloat16(self, device): + def test_dtype(fn, input, dtype): + input = input.detach().clone().to(dtype=dtype).requires_grad_(True) + input2 = input.detach().clone().float().requires_grad_(True) + out = fn(input) + out.sum().backward() + out2 = fn(input2) + out2.sum().backward() + self.assertEqual(out.dtype, dtype) + self.assertEqual(input.grad.dtype, dtype) + self.assertEqual(out, out2, exact_dtype=False) + self.assertEqual(input.grad, input2.grad, exact_dtype=False) + + def func(): + return torch.nan_to_num + + shapes = [[1, 3, 6, 6], [1, 3, 6, 128], [1, 3, 256, 256]] + for shape in shapes: + x = torch.randn(shape, device=device) + extremals = [float('nan'), float('inf'), -float('inf')] + for id1, id2, extremal in zip(torch.randint(0, 2, (3,)), torch.randint(0, 5, (3,)), extremals): + x[0, id1, id2, :] = extremal + test_dtype(func(), x, torch.bfloat16) + @dtypes(torch.cdouble) def test_complex_edge_values(self, device, dtype): # sqrt Test Reference: https://github.com/pytorch/pytorch/pull/47424 - x = torch.tensor(0. 
- 1.0e+20j, dtype=dtype, device=device) + x = torch.tensor(0.0 - 1.0e20j, dtype=dtype, device=device) self.compare_with_numpy(torch.sqrt, np.sqrt, x) # acos test reference: https://github.com/pytorch/pytorch/issue/42952 # Skip on Windows, as CUDA acos returns conjugate value @@ -562,7 +562,11 @@ def test_complex_edge_values(self, device, dtype): if not (IS_WINDOWS and dtype == torch.cdouble and "cuda" in device): self.compare_with_numpy(torch.acos, np.arccos, x) - x = torch.tensor((-1.0e+60 if dtype == torch.cdouble else -1.0e+20) - 4988429.2j, dtype=dtype, device=device) + x = torch.tensor( + (-1.0e60 if dtype == torch.cdouble else -1.0e20) - 4988429.2j, + dtype=dtype, + device=device, + ) self.compare_with_numpy(torch.sqrt, np.sqrt, x) @unittest.skipIf(not TEST_SCIPY, "Requires SciPy") @@ -572,14 +576,28 @@ def test_digamma_special(self, device, dtype): # Reference: # https://github.com/scipy/scipy/blob/3a8a3a1d4657254a6611e77e9c28feafa26e6645/scipy/special/tests/test_digamma.py#L22 euler = 0.57721566490153286 - dataset = [(0., -0.), - (1, -euler), - (0.5, -2 * math.log(2) - euler), - (1 / 3, -math.pi / (2 * math.sqrt(3)) - 3 * math.log(3) / 2 - euler), - (1 / 4, -math.pi / 2 - 3 * math.log(2) - euler), - (1 / 6, -math.pi * math.sqrt(3) / 2 - 2 * math.log(2) - 3 * math.log(3) / 2 - euler), - (1 / 8, -math.pi / 2 - 4 * math.log(2) - - (math.pi + math.log(2 + math.sqrt(2)) - math.log(2 - math.sqrt(2))) / math.sqrt(2) - euler)] + dataset = [ + (0.0, -0.0), + (1, -euler), + (0.5, -2 * math.log(2) - euler), + (1 / 3, -math.pi / (2 * math.sqrt(3)) - 3 * math.log(3) / 2 - euler), + (1 / 4, -math.pi / 2 - 3 * math.log(2) - euler), + ( + 1 / 6, + -math.pi * math.sqrt(3) / 2 + - 2 * math.log(2) + - 3 * math.log(3) / 2 + - euler, + ), + ( + 1 / 8, + -math.pi / 2 + - 4 * math.log(2) + - (math.pi + math.log(2 + math.sqrt(2)) - math.log(2 - math.sqrt(2))) + / math.sqrt(2) + - euler, + ), + ] x = torch.tensor(dataset, device=device, dtype=dtype) self.compare_with_numpy(torch.digamma, scipy.special.digamma, x) @@ -587,15 +605,29 @@ def test_digamma_special(self, device, dtype): @dtypes(torch.float, torch.double) def test_digamma(self, device, dtype): # Tests pole behavior - tensor = torch.tensor([-0.999999994, -1.999999994, -2.0000000111, - -100.99999994, 0.000000111, -1931.99999994, - -0.000000111, 0, -0, -1, -2, -931], dtype=dtype, device=device) + tensor = torch.tensor( + [ + -0.999999994, + -1.999999994, + -2.0000000111, + -100.99999994, + 0.000000111, + -1931.99999994, + -0.000000111, + 0, + -0, + -1, + -2, + -931, + ], + dtype=dtype, + device=device, + ) self.compare_with_numpy(torch.digamma, scipy.special.digamma, tensor) - @skipCUDAIfRocm - @dtypes(*get_all_fp_dtypes(include_half=True, include_bfloat16=False)) + @dtypes(*floating_types_and(torch.half)) def test_frexp(self, device, dtype): - input = make_tensor((50, 50), device, dtype) + input = make_tensor((50, 50), dtype=dtype, device=device) mantissa, exponent = torch.frexp(input) np_mantissa, np_exponent = np.frexp(input.cpu().numpy()) @@ -606,26 +638,29 @@ def test_frexp(self, device, dtype): self.assertTrue(exponent.dtype == torch.int32) self.assertTrue(torch_to_numpy_dtype_dict[exponent.dtype] == np_exponent.dtype) - @skipCUDAIfRocm def test_frexp_assert_raises(self, device): - invalid_input_dtypes = get_all_int_dtypes() + \ - get_all_complex_dtypes() + \ - [torch.bool] + invalid_input_dtypes = integral_types_and(torch.bool) + complex_types() for dtype in invalid_input_dtypes: - input = make_tensor((50, 50), device, dtype) - with 
self.assertRaisesRegex(RuntimeError, r"torch\.frexp\(\) only supports floating-point dtypes"): + input = make_tensor((50, 50), dtype=dtype, device=device) + with self.assertRaisesRegex( + RuntimeError, r"torch\.frexp\(\) only supports floating-point dtypes" + ): torch.frexp(input) - for dtype in get_all_fp_dtypes(include_half=True, include_bfloat16=False): - input = make_tensor((50, 50), device, dtype) + for dtype in floating_types_and(torch.half): + input = make_tensor((50, 50), dtype=dtype, device=device) - dtypes = list(all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16)) + dtypes = list( + all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16) + ) dtypes.remove(dtype) for mantissa_dtype in dtypes: mantissa = torch.empty_like(input, dtype=mantissa_dtype) exponent = torch.empty_like(input, dtype=torch.int) - with self.assertRaisesRegex(RuntimeError, - r"torch\.frexp\(\) expects mantissa to have dtype .+ but got .+"): + with self.assertRaisesRegex( + RuntimeError, + r"torch\.frexp\(\) expects mantissa to have dtype .+ but got .+", + ): torch.frexp(input, out=(mantissa, exponent)) dtypes.append(dtype) @@ -633,8 +668,10 @@ def test_frexp_assert_raises(self, device): for exponent_dtype in dtypes: mantissa = torch.empty_like(input) exponent = torch.empty_like(input, dtype=exponent_dtype) - with self.assertRaisesRegex(RuntimeError, - r"torch\.frexp\(\) expects exponent to have int dtype but got .+"): + with self.assertRaisesRegex( + RuntimeError, + r"torch\.frexp\(\) expects exponent to have int dtype but got .+", + ): torch.frexp(input, out=(mantissa, exponent)) def test_mvlgamma_argcheck(self, device): @@ -642,17 +679,21 @@ def run_test(d): input = torch.linspace((d - 2) / 2, 10, 10, device=device) torch.mvlgamma(input, d) - with self.assertRaisesRegex(RuntimeError, r"All elements must be greater than \(p-1\)/2"): + with self.assertRaisesRegex( + RuntimeError, r"All elements must be greater than \(p-1\)/2" + ): run_test(3) def test_polygamma_neg(self, device): - with self.assertRaisesRegex(RuntimeError, r'polygamma\(n, x\) does not support negative n\.'): + with self.assertRaisesRegex( + RuntimeError, r"polygamma\(n, x\) does not support negative n\." 
+ ): torch.polygamma(-1, torch.tensor([1.0, 2.0], device=device)) # TODO resolve with opinfos @onlyCPU def test_op_invert(self, device): - res = 0xffff - torch.arange(127, dtype=torch.int8) + res = 0xFFFF - torch.arange(127, dtype=torch.int8) for dtype in (torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64): a = torch.arange(127, dtype=dtype) self.assertEqual(res.to(dtype), ~a) @@ -669,16 +710,19 @@ def test_op_invert(self, device): def test_abs_angle_complex_to_float(self, device, dtype): # Constructs random complex values from random import random + random_vals = [] for multiplier in (-1, 1, -10, 10, -100, 100): for _ in range(10): - random_vals.append(complex(random() * multiplier, random() * multiplier)) + random_vals.append( + complex(random() * multiplier, random() * multiplier) + ) for vals in (random_vals, []): a = np.array(vals, dtype=torch_to_numpy_dtype_dict[dtype]) t = torch.tensor(vals, device=device, dtype=dtype) - for fn_name in ('abs', 'angle'): + for fn_name in ("abs", "angle"): torch_fn = getattr(torch, fn_name) np_fn = getattr(np, fn_name) @@ -688,12 +732,16 @@ def test_abs_angle_complex_to_float(self, device, dtype): self.assertEqual(np_result, torch_result, exact_dtype=True) # Tests float out - float_dtype = torch.float32 if dtype is torch.complex64 else torch.float64 + float_dtype = ( + torch.float32 if dtype is torch.complex64 else torch.float64 + ) np_float_out = np_fn(a).astype(torch_to_numpy_dtype_dict[float_dtype]) float_out = torch.empty_like(t).float() torch_fn(t, out=float_out) # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 - self.assertEqualIgnoreType(torch.from_numpy(np_float_out), float_out.cpu()) + self.assertEqualIgnoreType( + torch.from_numpy(np_float_out), float_out.cpu() + ) # Tests float out (resized out) float_out = torch.empty(1, device=device, dtype=float_dtype) @@ -705,13 +753,17 @@ def test_abs_angle_complex_to_float(self, device, dtype): complex_out = torch.empty_like(t) torch_fn(t, out=complex_out) # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 - self.assertEqualIgnoreType(torch.from_numpy(np_complex_out), complex_out.cpu()) + self.assertEqualIgnoreType( + torch.from_numpy(np_complex_out), complex_out.cpu() + ) # Tests complex out (resized out) complex_out = torch.empty(0, device=device, dtype=dtype) torch_fn(t, out=complex_out) # TODO(#38095): Replace assertEqualIgnoreType. 
See issue #38095 - self.assertEqualIgnoreType(torch.from_numpy(np_complex_out), complex_out.cpu()) + self.assertEqualIgnoreType( + torch.from_numpy(np_complex_out), complex_out.cpu() + ) # Tests long out behavior (expected failure) long_out = torch.empty(0, device=device, dtype=torch.long) @@ -719,40 +771,42 @@ def test_abs_angle_complex_to_float(self, device, dtype): torch_fn(t, out=long_out) # Tests inplace - if fn_name == 'abs': + if fn_name == "abs": torch_inplace_method = getattr(torch.Tensor, fn_name + "_") np_fn(a, out=a) if dtype.is_complex: - with self.assertRaisesRegex(RuntimeError, "In-place abs is not supported for complex tensors."): + with self.assertRaisesRegex( + RuntimeError, + "In-place abs is not supported for complex tensors.", + ): torch_inplace_method(t) return torch_inplace_method(t) self.assertEqual(torch.from_numpy(a), t.cpu()) # Note: angle does not have an in-place variant - if fn_name == 'angle': + if fn_name == "angle": with self.assertRaises(AttributeError): torch_inplace_method = getattr(torch.Tensor, fn_name + "_") - def check_internal_mem_overlap(self, inplace_op, num_inputs, - dtype, device, - expected_failure=False): + def check_internal_mem_overlap( + self, inplace_op, num_inputs, dtype, device, expected_failure=False + ): if isinstance(inplace_op, str): inplace_op = getattr(torch.Tensor, inplace_op) input = torch.randn(1, dtype=dtype, device=device).expand(3, 3) - inputs = [input] + [torch.randn_like(input) - for i in range(num_inputs - 1)] + inputs = [input] + [torch.randn_like(input) for i in range(num_inputs - 1)] if not expected_failure: - with self.assertRaisesRegex(RuntimeError, 'single memory location'): + with self.assertRaisesRegex(RuntimeError, "single memory location"): inplace_op(*inputs) else: with self.assertRaises(AssertionError): - with self.assertRaisesRegex(RuntimeError, 'single memory location'): + with self.assertRaisesRegex(RuntimeError, "single memory location"): inplace_op(*inputs) - def unary_check_input_output_mem_overlap(self, data, sz, op, - expected_failure=False): - + def unary_check_input_output_mem_overlap( + self, data, sz, op, expected_failure=False + ): def _test(op, output, input): output_exp = torch.empty_like(output) op(input, out=output_exp) @@ -761,15 +815,15 @@ def _test(op, output, input): # output is identical to input: _test(op, output=data[0:sz], input=data[0:sz]) # output and input are independent: - _test(op, output=data[0:sz], input=data[sz:2 * sz]) + _test(op, output=data[0:sz], input=data[sz : 2 * sz]) # output partially overlaps with input: if not expected_failure: - with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): - _test(op, data[0:sz], data[1:sz + 1]) + with self.assertRaisesRegex(RuntimeError, "unsupported operation"): + _test(op, data[0:sz], data[1 : sz + 1]) else: with self.assertRaises(AssertionError): - with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): - _test(op, data[0:sz], data[1:sz + 1]) + with self.assertRaisesRegex(RuntimeError, "unsupported operation"): + _test(op, data[0:sz], data[1 : sz + 1]) # TODO: run on non-native device types @dtypes(torch.double) @@ -779,170 +833,201 @@ def test_unary_out_op_mem_overlap(self, device, dtype): positives = torch.randint(1, 100, (2 * sz,), device=device).double() ints = torch.randint(-100, 100, (2 * sz,), device=device) unary_mem_overlap_cases = [ - ("abs", doubles, True, True, 'cpu'), - ("abs", doubles, True, True, 'cuda'), - ("acos", doubles, True, True, 'cpu'), - ("acos", doubles, True, True, 'cuda'), - ("asin", 
doubles, True, True, 'cpu'), - ("asin", doubles, True, True, 'cuda'), - ("atan", doubles, True, True, 'cpu'), - ("atan", doubles, True, True, 'cuda'), - ("acosh", doubles, True, True, 'cpu'), - ("acosh", doubles, True, True, 'cuda'), - ("asinh", doubles, True, True, 'cpu'), - ("asinh", doubles, True, True, 'cuda'), - ("atanh", doubles, True, True, 'cpu'), - ("atanh", doubles, True, True, 'cuda'), - ("bitwise_not", ints, True, True, 'cpu'), - ("bitwise_not", ints, True, True, 'cuda'), - ("ceil", doubles, True, True, 'cpu'), - ("ceil", doubles, True, True, 'cuda'), - ("cos", doubles, True, True, 'cpu'), - ("cos", doubles, True, True, 'cuda'), - ("cosh", doubles, True, True, 'cpu'), - ("cosh", doubles, True, True, 'cuda'), - ("digamma", doubles, True, True, 'cpu'), - ("erf", doubles, True, True, 'cpu'), - ("erf", doubles, True, True, 'cuda'), - ("erfc", doubles, True, True, 'cpu'), - ("erfc", doubles, True, True, 'cuda'), - ("erfinv", doubles, True, True, 'cpu'), - ("erfinv", doubles, True, True, 'cuda'), - ("exp", doubles, True, True, 'cpu'), - ("exp", doubles, True, True, 'cuda'), - ("exp2", doubles, True, True, 'cpu'), - ("exp2", doubles, True, True, 'cuda'), - ("expm1", doubles, True, True, 'cpu'), - ("expm1", doubles, True, True, 'cuda'), - ("floor", doubles, True, True, 'cpu'), - ("floor", doubles, True, True, 'cuda'), - ("frac", doubles, True, True, 'cpu'), - ("frac", doubles, True, True, 'cuda'), - ("i0", doubles, True, True, 'cpu'), - ("i0", doubles, True, True, 'cuda'), - ("log", positives, True, True, 'cpu'), - ("log", positives, True, True, 'cuda'), - ("log10", positives, True, True, 'cpu'), - ("log10", positives, True, True, 'cuda'), - ("log1p", positives, True, True, 'cpu'), - ("log1p", positives, True, True, 'cuda'), - ("log2", positives, True, True, 'cpu'), - ("log2", positives, True, True, 'cuda'), - ("neg", doubles, True, True, 'cpu'), - ("neg", doubles, True, True, 'cuda'), - ("reciprocal", doubles, True, True, 'cpu'), - ("reciprocal", doubles, True, True, 'cuda'), - ("round", doubles, True, True, 'cpu'), - ("round", doubles, True, True, 'cuda'), - ("rsqrt", positives, True, True, 'cpu'), - ("rsqrt", positives, True, True, 'cuda'), - ("sin", doubles, True, True, 'cpu'), - ("sin", doubles, True, True, 'cuda'), - ("sinh", doubles, True, True, 'cpu'), - ("sinh", doubles, False, True, 'cuda'), - ("sigmoid", doubles, True, True, 'cpu'), - ("sigmoid", doubles, True, True, 'cuda'), - ("logit", doubles, True, True, 'cpu'), - ("logit", doubles, True, True, 'cuda'), - ("sqrt", doubles, True, True, 'cpu'), - ("sqrt", doubles, False, True, 'cuda'), - ("tan", doubles, True, True, 'cpu'), - ("tan", doubles, True, True, 'cuda'), - ("tanh", doubles, True, True, 'cpu'), - ("tanh", doubles, True, True, 'cuda'), - ("trunc", doubles, True, True, 'cpu'), - ("trunc", doubles, True, True, 'cuda') + ("abs", doubles, True, True, "cpu"), + ("abs", doubles, True, True, "cuda"), + ("acos", doubles, True, True, "cpu"), + ("acos", doubles, True, True, "cuda"), + ("asin", doubles, True, True, "cpu"), + ("asin", doubles, True, True, "cuda"), + ("atan", doubles, True, True, "cpu"), + ("atan", doubles, True, True, "cuda"), + ("acosh", doubles, True, True, "cpu"), + ("acosh", doubles, True, True, "cuda"), + ("asinh", doubles, True, True, "cpu"), + ("asinh", doubles, True, True, "cuda"), + ("atanh", doubles, True, True, "cpu"), + ("atanh", doubles, True, True, "cuda"), + ("bitwise_not", ints, True, True, "cpu"), + ("bitwise_not", ints, True, True, "cuda"), + ("ceil", doubles, True, True, "cpu"), + ("ceil", 
doubles, True, True, "cuda"), + ("cos", doubles, True, True, "cpu"), + ("cos", doubles, True, True, "cuda"), + ("cosh", doubles, True, True, "cpu"), + ("cosh", doubles, True, True, "cuda"), + ("digamma", doubles, True, True, "cpu"), + ("erf", doubles, True, True, "cpu"), + ("erf", doubles, True, True, "cuda"), + ("erfc", doubles, True, True, "cpu"), + ("erfc", doubles, True, True, "cuda"), + ("erfinv", doubles, True, True, "cpu"), + ("erfinv", doubles, True, True, "cuda"), + ("exp", doubles, True, True, "cpu"), + ("exp", doubles, True, True, "cuda"), + ("exp2", doubles, True, True, "cpu"), + ("exp2", doubles, True, True, "cuda"), + ("expm1", doubles, True, True, "cpu"), + ("expm1", doubles, True, True, "cuda"), + ("floor", doubles, True, True, "cpu"), + ("floor", doubles, True, True, "cuda"), + ("frac", doubles, True, True, "cpu"), + ("frac", doubles, True, True, "cuda"), + ("i0", doubles, True, True, "cpu"), + ("i0", doubles, True, True, "cuda"), + ("log", positives, True, True, "cpu"), + ("log", positives, True, True, "cuda"), + ("log10", positives, True, True, "cpu"), + ("log10", positives, True, True, "cuda"), + ("log1p", positives, True, True, "cpu"), + ("log1p", positives, True, True, "cuda"), + ("log2", positives, True, True, "cpu"), + ("log2", positives, True, True, "cuda"), + ("neg", doubles, True, True, "cpu"), + ("neg", doubles, True, True, "cuda"), + ("reciprocal", doubles, True, True, "cpu"), + ("reciprocal", doubles, True, True, "cuda"), + ("round", doubles, True, True, "cpu"), + ("round", doubles, True, True, "cuda"), + ("rsqrt", positives, True, True, "cpu"), + ("rsqrt", positives, True, True, "cuda"), + ("sin", doubles, True, True, "cpu"), + ("sin", doubles, True, True, "cuda"), + ("sinh", doubles, True, True, "cpu"), + ("sinh", doubles, False, True, "cuda"), + ("sigmoid", doubles, True, True, "cpu"), + ("sigmoid", doubles, True, True, "cuda"), + ("logit", doubles, True, True, "cpu"), + ("logit", doubles, True, True, "cuda"), + ("sqrt", doubles, True, True, "cpu"), + ("sqrt", doubles, False, True, "cuda"), + ("tan", doubles, True, True, "cpu"), + ("tan", doubles, True, True, "cuda"), + ("tanh", doubles, True, True, "cpu"), + ("tanh", doubles, True, True, "cuda"), + ("trunc", doubles, True, True, "cpu"), + ("trunc", doubles, True, True, "cuda"), ] - for (fn, inputs, has_input_output_mem_overlap_check, - has_internal_mem_overlap_check, dev) in unary_mem_overlap_cases: + for ( + fn, + inputs, + has_input_output_mem_overlap_check, + has_internal_mem_overlap_check, + dev, + ) in unary_mem_overlap_cases: if dev != device: continue out_fn = getattr(torch, fn) - in_fn = getattr(torch.Tensor, fn + '_') - - self.unary_check_input_output_mem_overlap(inputs, sz, out_fn, - expected_failure=not has_input_output_mem_overlap_check) - - self.check_internal_mem_overlap(in_fn, 1, dtype, dev, - expected_failure=not has_internal_mem_overlap_check) + in_fn = getattr(torch.Tensor, fn + "_") + + self.unary_check_input_output_mem_overlap( + inputs, + sz, + out_fn, + expected_failure=not has_input_output_mem_overlap_check, + ) + + self.check_internal_mem_overlap( + in_fn, + 1, + dtype, + dev, + expected_failure=not has_internal_mem_overlap_check, + ) # TODO: opinfo hardshrink @onlyCPU - @dtypes(torch.float, torch.double) + @dtypes(torch.float, torch.double, torch.bfloat16) def test_hardshrink(self, device, dtype): data = torch.tensor([1, 0.5, 0.3, 0.6], dtype=dtype, device=device).view(2, 2) - self.assertEqual(torch.tensor([1, 0.5, 0, 0.6], dtype=dtype, device=device).view(2, 2), - 
data.hardshrink(0.3)) - self.assertEqual(torch.tensor([1, 0, 0, 0.6], dtype=dtype, device=device).view(2, 2), - data.hardshrink(0.5)) + self.assertEqual( + torch.tensor([1, 0.5, 0, 0.6], dtype=dtype, device=device).view(2, 2), + data.hardshrink(0.3), + ) + self.assertEqual( + torch.tensor([1, 0, 0, 0.6], dtype=dtype, device=device).view(2, 2), + data.hardshrink(0.5), + ) # test default lambd=0.5 self.assertEqual(data.hardshrink(), data.hardshrink(0.5)) # test non-contiguous case - self.assertEqual(torch.tensor([1, 0, 0.5, 0.6], dtype=dtype, device=device).view(2, 2), - data.t().hardshrink(0.3)) + self.assertEqual( + torch.tensor([1, 0, 0.5, 0.6], dtype=dtype, device=device).view(2, 2), + data.t().hardshrink(0.3), + ) @onlyCPU - @dtypes(torch.float, torch.double) + @dtypes(torch.float, torch.double, torch.bfloat16) def test_hardshrink_edge_cases(self, device, dtype) -> None: def h(values, l_expected): for l, expected in l_expected.items(): - values_tensor = torch.tensor([float(v) for v in values], - dtype=dtype, device=device) - expected_tensor = torch.tensor([float(v) for v in expected], - dtype=dtype, device=device) - self.assertEqual(expected_tensor == values_tensor.hardshrink(l), - torch.ones_like(values_tensor, dtype=torch.bool)) + values_tensor = torch.tensor( + [float(v) for v in values], dtype=dtype, device=device + ) + expected_tensor = torch.tensor( + [float(v) for v in expected], dtype=dtype, device=device + ) + self.assertEqual( + expected_tensor == values_tensor.hardshrink(l), + torch.ones_like(values_tensor, dtype=torch.bool), + ) def test_helper(min, max): - h([0.0, min, -min, 0.1, -0.1, 1.0, -1.0, max, -max, inf, -inf], - {0.0: [0.0, min, -min, 0.1, -0.1, 1.0, -1.0, max, -max, inf, -inf], - min: [0.0, 0.0, 0.0, 0.1, -0.1, 1.0, -1.0, max, -max, inf, -inf], - 0.1: [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, -1.0, max, -max, inf, -inf], - 1.0: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, max, -max, inf, -inf], - max: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, inf, -inf], - inf: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}) + h( + [0.0, min, -min, 0.1, -0.1, 1.0, -1.0, max, -max, inf, -inf], + { + 0.0: [0.0, min, -min, 0.1, -0.1, 1.0, -1.0, max, -max, inf, -inf], + min: [0.0, 0.0, 0.0, 0.1, -0.1, 1.0, -1.0, max, -max, inf, -inf], + 0.1: [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, -1.0, max, -max, inf, -inf], + 1.0: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, max, -max, inf, -inf], + max: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, inf, -inf], + inf: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + }, + ) test_helper(torch.finfo(dtype).tiny, torch.finfo(dtype).max) @onlyCPU @slowTest @dtypes(torch.float) + @unittest.skipIf(True, "Insufficient memory on linux.(2|4)xlarge") def test_exp_slow(self, device, dtype): # Test for https://github.com/pytorch/pytorch/issues/17271 # This is pretty slow on my Macbook but it only takes a few # seconds on a beefy Xeon server - a = torch.exp(torch.ones(2 ** 31, dtype=dtype, device=device)) + a = torch.exp(torch.ones(2**31, dtype=dtype, device=device)) b = torch.exp(torch.ones(1, dtype=dtype, device=device)) - self.assertEqual(a, b.expand(2 ** 31)) + self.assertEqual(a, b.expand(2**31)) - @precisionOverride({torch.bfloat16: 1e-2, torch.float: 0.0002, torch.double: 0.0002}) - @dtypesIfCUDA(torch.float, torch.double, torch.bfloat16) - @dtypes(torch.float, torch.double) + @precisionOverride( + {torch.bfloat16: 1e-2, torch.float: 0.0002, torch.double: 0.0002} + ) + @dtypes(torch.float, torch.double, torch.bfloat16) def test_hardswish(self, device, dtype): 
inputValues = [-1000, -4, -3, -2, 0, 2, 3, 4, 1000] expectedOutput = np.multiply( - inputValues, - np.minimum(np.maximum((np.add(inputValues, 3)), 0), 6) / 6.0) + inputValues, np.minimum(np.maximum((np.add(inputValues, 3)), 0), 6) / 6.0 + ) inputTensor = torch.tensor(inputValues, dtype=dtype, device=device) - expectedOutputTensor = \ - torch.tensor(expectedOutput, dtype=dtype, device=device) + expectedOutputTensor = torch.tensor(expectedOutput, dtype=dtype, device=device) # normal - self.assertEqual(torch.nn.functional.hardswish(inputTensor), - expectedOutputTensor) + self.assertEqual( + torch.nn.functional.hardswish(inputTensor), expectedOutputTensor + ) # inplace inputTensorCpy = inputTensor.clone().detach() torch.nn.functional.hardswish(inputTensorCpy, inplace=True) self.assertEqual(inputTensorCpy, expectedOutputTensor) - @precisionOverride({torch.bfloat16: 1e-2, torch.float: 0.0002, torch.double: 0.0002}) - @dtypesIfCUDA(torch.float, torch.double, torch.bfloat16) - @dtypes(torch.float, torch.double) + @precisionOverride( + {torch.bfloat16: 1e-2, torch.float: 0.0002, torch.double: 0.0002} + ) + @dtypes(torch.float, torch.double, torch.bfloat16) def test_hardsigmoid(self, device, dtype): inputValues = [-1000, -4, -3, -2, 0, 2, 3, 4, 1000] expectedOutput = np.minimum(np.maximum((np.add(inputValues, 3)), 0), 6) / 6.0 @@ -950,21 +1035,28 @@ def test_hardsigmoid(self, device, dtype): inputTensor = torch.tensor(inputValues, dtype=dtype, device=device) # normal - self.assertEqual(torch.nn.functional.hardsigmoid(inputTensor), - torch.tensor(expectedOutput, dtype=dtype, device=device)) + self.assertEqual( + torch.nn.functional.hardsigmoid(inputTensor), + torch.tensor(expectedOutput, dtype=dtype, device=device), + ) # inplace inputTensorCpy = inputTensor.clone().detach() - self.assertEqual(torch.nn.functional.hardsigmoid(inputTensorCpy, inplace=True), - torch.tensor(expectedOutput, dtype=dtype, device=device)) - - @precisionOverride({torch.bfloat16: 1e-2, torch.float: 0.0002, torch.double: 0.0002}) - @dtypesIfCUDA(torch.float, torch.double, torch.bfloat16) - @dtypes(torch.float, torch.double) + self.assertEqual( + torch.nn.functional.hardsigmoid(inputTensorCpy, inplace=True), + torch.tensor(expectedOutput, dtype=dtype, device=device), + ) + + @precisionOverride( + {torch.bfloat16: 1e-2, torch.float: 0.0002, torch.double: 0.0002} + ) + @dtypes(torch.float, torch.double, torch.bfloat16) def test_hardsigmoid_backward(self, device, dtype): inputValues = [-3.0, 3.0, -2.0, 2.0, -6.0, 6.0] expectedValues = [0.0, 0.0, 1.0 / 6.0, 1.0 / 6.0, 0.0, 0.0] - inputTensor = torch.tensor(inputValues, dtype=dtype, device=device).requires_grad_() + inputTensor = torch.tensor( + inputValues, dtype=dtype, device=device + ).requires_grad_() expetedTensor = torch.tensor(expectedValues, dtype=dtype, device=device) out = torch.nn.functional.hardsigmoid(inputTensor) out.backward(torch.ones_like(inputTensor)) @@ -976,7 +1068,8 @@ def test_silu(self, device, dtype): input_np = np.random.randn(5, 8) special_input = [[-1000, -1, -0.1, 0, 0.5, 1, 2, 1000]] input_np = np.concatenate((input_np, special_input), axis=0).astype( - torch_to_numpy_dtype_dict[dtype]) + torch_to_numpy_dtype_dict[dtype] + ) expected_output_np = input_np * scipy.special.expit(input_np) expected_output = torch.from_numpy(expected_output_np).to(device) @@ -986,18 +1079,30 @@ def test_silu(self, device, dtype): rtol = 1e-6 input = torch.from_numpy(input_np).clone().contiguous().to(device) - self.assertEqual(torch.nn.functional.silu(input), expected_output, - 
atol=atol, rtol=rtol) - self.assertEqual(torch.nn.functional.silu(input, inplace=True), - expected_output, atol=atol, rtol=rtol) + self.assertEqual( + torch.nn.functional.silu(input), expected_output, atol=atol, rtol=rtol + ) + self.assertEqual( + torch.nn.functional.silu(input, inplace=True), + expected_output, + atol=atol, + rtol=rtol, + ) input = torch.from_numpy(input_np).clone().to(device) input_noncontig = input.transpose(0, 1) - self.assertEqual(torch.nn.functional.silu(input_noncontig), - expected_output_noncontig, atol=atol, rtol=rtol) - self.assertEqual(torch.nn.functional.silu( - input_noncontig, inplace=True), expected_output_noncontig, - atol=atol, rtol=rtol) + self.assertEqual( + torch.nn.functional.silu(input_noncontig), + expected_output_noncontig, + atol=atol, + rtol=rtol, + ) + self.assertEqual( + torch.nn.functional.silu(input_noncontig, inplace=True), + expected_output_noncontig, + atol=atol, + rtol=rtol, + ) # It is not obvious how to merge this into OpInfo because these inputs # succeed for gradcheck but are expected to fail for gradgradcheck @@ -1008,10 +1113,12 @@ def test_sinc(self, device, dtype): # We also need to be careful when we are very close to 0, as the # derivative's denominator is squared, and there are some floats # that are positive and whose squares are zero. - a = torch.tensor([0.0, torch.finfo(torch.double).tiny, 1.0], - dtype=dtype, - requires_grad=True, - device=device) + a = torch.tensor( + [0.0, torch.finfo(torch.double).tiny, 1.0], + dtype=dtype, + requires_grad=True, + device=device, + ) gradcheck(torch.sinc, a) @skipIfNoSciPy @@ -1020,7 +1127,8 @@ def test_mish(self, device, dtype): input_np = np.random.randn(5, 8) special_input = [[-1000, -1, -0.1, 0, 0.5, 1, 2, 1000]] input_np = np.concatenate((input_np, special_input), axis=0).astype( - torch_to_numpy_dtype_dict[dtype]) + torch_to_numpy_dtype_dict[dtype] + ) expected_output_np = input_np * np.tanh(np.log1p(np.exp(input_np))) expected_output = torch.from_numpy(expected_output_np).to(device) @@ -1030,34 +1138,50 @@ def test_mish(self, device, dtype): rtol = 1e-6 input = torch.from_numpy(input_np).clone().contiguous().to(device) - self.assertEqual(torch.nn.functional.mish(input), expected_output, - atol=atol, rtol=rtol) - self.assertEqual(torch.nn.functional.mish(input, inplace=True), - expected_output, atol=atol, rtol=rtol) + self.assertEqual( + torch.nn.functional.mish(input), expected_output, atol=atol, rtol=rtol + ) + self.assertEqual( + torch.nn.functional.mish(input, inplace=True), + expected_output, + atol=atol, + rtol=rtol, + ) input = torch.from_numpy(input_np).clone().to(device) input_noncontig = input.transpose(0, 1) - self.assertEqual(torch.nn.functional.mish(input_noncontig), - expected_output_noncontig, atol=atol, rtol=rtol) - self.assertEqual(torch.nn.functional.mish( - input_noncontig, inplace=True), expected_output_noncontig, - atol=atol, rtol=rtol) + self.assertEqual( + torch.nn.functional.mish(input_noncontig), + expected_output_noncontig, + atol=atol, + rtol=rtol, + ) + self.assertEqual( + torch.nn.functional.mish(input_noncontig, inplace=True), + expected_output_noncontig, + atol=atol, + rtol=rtol, + ) # do ops like threshold need a test_unary(_nonufunc) test suite?
@onlyCPU - @dtypes(*get_all_math_dtypes('cpu')) + @dtypes(*get_all_math_dtypes("cpu")) def test_threshold(self, device, dtype): if dtype != torch.uint8 and dtype != torch.float16 and not dtype.is_complex: # 100 is wide enough to use AVX2 instructions for all types - x = torch.randn(100, dtype=torch.float, device=device).sign().to(dtype=dtype) + x = ( + torch.randn(100, dtype=torch.float, device=device) + .sign() + .to(dtype=dtype) + ) y = torch.threshold(x, 0, 0) self.assertTrue(y.le(0).any()) - def _helper_test_igamma(self, loglo, loghi, device, dtype, - torch_fcn, scipy_fcn): + def _helper_test_igamma(self, loglo, loghi, device, dtype, torch_fcn, scipy_fcn): exp1 = 2.71828182846 - vec1 = torch.logspace(loglo, loghi, steps=500, base=exp1, - dtype=torch.float64, device=device).unsqueeze(-1) + vec1 = torch.logspace( + loglo, loghi, steps=500, base=exp1, dtype=torch.float64, device=device + ).unsqueeze(-1) vec1 = vec1.to(dtype) inputs = [ (vec1, vec1.transpose(0, 1)), @@ -1065,8 +1189,8 @@ def _helper_test_igamma(self, loglo, loghi, device, dtype, (vec1, 0.5 * vec1), # test for considerable ratio (vec1, 2.0 * vec1), (vec1[::2, :], vec1[::2, :]), # contiguous/noncontiguous tests - (vec1[::2, :], vec1[:vec1.shape[0] // 2, :]), - (vec1[:vec1.shape[0] // 2, :], vec1[::2, :]), + (vec1[::2, :], vec1[: vec1.shape[0] // 2, :]), + (vec1[: vec1.shape[0] // 2, :], vec1[::2, :]), ] half_prec = dtype in [torch.bfloat16, torch.float16] for input0, input1 in inputs: @@ -1078,7 +1202,6 @@ def _helper_test_igamma(self, loglo, loghi, device, dtype, expected = torch.from_numpy(expected).to(dtype) self.assertEqual(actual, expected) - @skipCUDAIfRocm # see issue https://github.com/pytorch/pytorch/issues/46531 @dtypesIfCPU(torch.float16, torch.bfloat16, torch.float32, torch.float64) @dtypes(torch.float32, torch.float64) @unittest.skipIf(not TEST_SCIPY, "SciPy not found") @@ -1087,8 +1210,9 @@ def test_igamma_common(self, device, dtype): # test igamma for reasonable range of values loglo = -4 # approx 0.018 loghi = 4 # approx 54.6 - self._helper_test_igamma(loglo, loghi, device, dtype, - torch.igamma, scipy.special.gammainc) + self._helper_test_igamma( + loglo, loghi, device, dtype, torch.igamma, scipy.special.gammainc + ) @dtypesIfCPU(torch.float16, torch.bfloat16, torch.float32, torch.float64) @dtypes(torch.float32, torch.float64) @@ -1098,8 +1222,9 @@ def test_igammac_common(self, device, dtype): # test igammac for reasonable range of values loglo = -4 # approx 0.018 loghi = 4 # approx 54.6 - self._helper_test_igamma(loglo, loghi, device, dtype, - torch.igammac, scipy.special.gammaincc) + self._helper_test_igamma( + loglo, loghi, device, dtype, torch.igammac, scipy.special.gammaincc + ) @dtypesIfCPU(torch.float16, torch.bfloat16, torch.float32, torch.float64) @dtypes(torch.float32, torch.float64) @@ -1109,8 +1234,8 @@ def test_igamma_edge_cases(self, device, dtype): infs = torch.zeros((3,), **tkwargs) + float("inf") zeros = torch.zeros((3,), **tkwargs) ones = torch.ones((3,), **tkwargs) - zero_to_large = torch.tensor([0., 1., 1e3], **tkwargs) - small_to_inf = torch.tensor([1e-3, 1., float("inf")], **tkwargs) + zero_to_large = torch.tensor([0.0, 1.0, 1e3], **tkwargs) + small_to_inf = torch.tensor([1e-3, 1.0, float("inf")], **tkwargs) nans = torch.zeros((3,), **tkwargs) + float("nan") inpouts = [ # (a , x), out @@ -1138,8 +1263,8 @@ def test_igammac_edge_cases(self, device, dtype): infs = torch.zeros((3,), **tkwargs) + float("inf") zeros = torch.zeros((3,), **tkwargs) ones = torch.ones((3,), **tkwargs) - 
zero_to_large = torch.tensor([0., 1., 1e3], **tkwargs) - small_to_inf = torch.tensor([1e-3, 1., float("inf")], **tkwargs) + zero_to_large = torch.tensor([0.0, 1.0, 1e3], **tkwargs) + small_to_inf = torch.tensor([1e-3, 1.0, float("inf")], **tkwargs) nans = torch.zeros((3,), **tkwargs) + float("nan") inpouts = [ # (a , x), out @@ -1179,7 +1304,7 @@ def _i0_range_helper(self, range, device, dtype): t = torch.rand(1000, device=device).to(dtype) * r self._i0_helper(t) - @dtypesIfCUDA(*get_all_fp_dtypes()) + @dtypesIfCUDA(*floating_types_and(torch.half, torch.bfloat16)) @dtypes(torch.bfloat16, torch.float32, torch.float64) @unittest.skipIf(not TEST_SCIPY, "SciPy not found") def test_i0_range1(self, device, dtype): @@ -1187,7 +1312,7 @@ def test_i0_range1(self, device, dtype): # The domain is (-13.25, 13.25) self._i0_range_helper(13.25, device, dtype) - @dtypesIfCUDA(*get_all_fp_dtypes()) + @dtypesIfCUDA(*floating_types_and(torch.half, torch.bfloat16)) @dtypes(torch.bfloat16, torch.float32, torch.float64) @unittest.skipIf(not TEST_SCIPY, "SciPy not found") def test_i0_range2(self, device, dtype): @@ -1202,7 +1327,7 @@ def test_i0_range3(self, device, dtype): # The domain is (-709.75, 709.75) self._i0_range_helper(709.75, device, dtype) - @dtypesIfCUDA(*get_all_fp_dtypes()) + @dtypesIfCUDA(*floating_types_and(torch.half, torch.bfloat16)) @dtypes(torch.bfloat16, torch.float32, torch.float64) @unittest.skipIf(not TEST_SCIPY, "SciPy not found") def test_i0_special(self, device, dtype): @@ -1212,7 +1337,7 @@ def test_i0_special(self, device, dtype): t = torch.tensor([inf, -inf, nan], device=device, dtype=dtype) self.assertTrue(torch.i0(t).isnan().all()) - @dtypesIfCUDA(*get_all_fp_dtypes()) + @dtypesIfCUDA(*floating_types_and(torch.half, torch.bfloat16)) @dtypes(torch.bfloat16, torch.float32, torch.float64) @unittest.skipIf(not TEST_SCIPY, "SciPy not found") def test_special_i0_i1_vs_scipy(self, device, dtype): @@ -1266,11 +1391,25 @@ def check_equal(t): self.assertEqual(actual, expected) range = (-10, 10) + t = torch.linspace(*range, 1, device=device, dtype=dtype) + check_equal(t) - t = torch.linspace(*range, int(1e4), device=device, dtype=dtype) + # Skip testing NaN, inf, -inf since they are tested in reference_numerics tests. + info = torch.finfo(dtype) + min, max, eps, tiny = info.min, info.max, info.eps, info.tiny + t = torch.tensor([min, max, eps, tiny], dtype=dtype, device=device) check_equal(t) - # NaN, inf, -inf are tested in reference_numerics tests. + @dtypes(torch.float32, torch.float64) + @unittest.skipIf(not TEST_SCIPY, "SciPy not found") + def test_special_log_ndtr_vs_scipy(self, device, dtype): + def check_equal(t): + # Test by comparing with scipy + actual = torch.special.log_ndtr(t) + expected = scipy.special.log_ndtr(t.cpu().numpy()) + self.assertEqual(actual, expected) + + # Skip testing NaN, inf, -inf since they are tested in reference_numerics tests. 
info = torch.finfo(dtype) min, max, eps, tiny = info.min, info.max, info.eps, info.tiny t = torch.tensor([min, max, eps, tiny], dtype=dtype, device=device) @@ -1279,7 +1418,7 @@ def check_equal(t): # TODO: allow large opinfo values to be opted-into via metadata @dtypes(torch.long) def test_abs_big_number(self, device, dtype): - bignumber = 2 ** 31 + 1 + bignumber = 2**31 + 1 res = torch.tensor([bignumber], device=device, dtype=dtype) self.assertGreater(res.abs()[0], 0) @@ -1304,15 +1443,17 @@ def test_abs_zero(self, device, dtype): for num in abs_zeros: self.assertGreater(math.copysign(1.0, num), 0.0) - @dtypes(*(get_all_dtypes(include_bool=False))) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16)) def test_isposinf_isneginf_non_boolean_output(self, device, dtype): # test non-boolean tensors as the `out=` parameters # boolean outputs are tested in the above testcases - vals = (float('inf'), -float('inf'), 1.2) + vals = (float("inf"), -float("inf"), 1.2) t = torch.tensor(vals, device=device) for torch_op in (torch.isposinf, torch.isneginf): out = torch.empty_like(t, dtype=dtype) - with self.assertRaisesRegex(RuntimeError, 'does not support non-boolean outputs'): + with self.assertRaisesRegex( + RuntimeError, "does not support non-boolean outputs" + ): torch_op(t, out=out) def test_nonzero_empty(self, device): @@ -1346,13 +1487,16 @@ def assert_tuple_empty(tup, dim): self.assertEqual(torch.empty(0, dtype=torch.long), z[0]) # TODO: rationalize with exp OpInfo - @dtypes(*(get_all_fp_dtypes(include_half=False) + - get_all_complex_dtypes())) - @dtypesIfCUDA(*(get_all_fp_dtypes(include_half=True) + - get_all_complex_dtypes())) + @dtypes(*floating_and_complex_types_and(torch.bfloat16)) + @dtypesIfCUDA(*floating_and_complex_types_and(torch.half, torch.bfloat16)) def test_exp(self, device, dtype): for v in (2, -2) + ((1j, 1 + 1j) if dtype.is_complex else ()): - a = torch.tensor(v, dtype=dtype, device=device) * torch.arange(18, device=device) / 3 * math.pi + a = ( + torch.tensor(v, dtype=dtype, device=device) + * torch.arange(18, device=device) + / 3 + * math.pi + ) a = a.to(dtype) # bfloat16 overflows if dtype == torch.bfloat16: @@ -1360,10 +1504,12 @@ def test_exp(self, device, dtype): self.compare_with_numpy(torch.exp, np.exp, a) if dtype.is_complex: - inf_real_zero_imag_in = torch.tensor(complex(float('inf'), 0), device=device, dtype=dtype) + inf_real_zero_imag_in = torch.tensor( + complex(float("inf"), 0), device=device, dtype=dtype + ) inf_real_zero_imag_out = torch.exp(inf_real_zero_imag_in).item() self.assertTrue(math.isinf(inf_real_zero_imag_out.real)) - if self.device_type == 'cpu': + if self.device_type == "cpu": pass # These are commented out because it cannot be consistently reproduced. # This is incorrect. It should be zero. Need fix! 
@@ -1377,16 +1523,20 @@ def test_exp(self, device, dtype): self.assertEqual(inf_real_zero_imag_out.imag, 0, atol=0, rtol=0) self.compare_with_numpy(torch.exp, np.exp, inf_real_zero_imag_in) - zero_real_inf_imag_in = torch.tensor(complex(0, float('inf')), device=device, dtype=dtype) + zero_real_inf_imag_in = torch.tensor( + complex(0, float("inf")), device=device, dtype=dtype + ) zero_real_inf_imag_out = torch.exp(zero_real_inf_imag_in).item() self.assertTrue(math.isnan(zero_real_inf_imag_out.real)) self.assertTrue(math.isnan(zero_real_inf_imag_out.imag)) # Ensure we are notified when NumPy changes its behavior self.compare_with_numpy(torch.exp, np.exp, zero_real_inf_imag_in) - inf_real_imag_in = torch.tensor(complex(float('inf'), float('inf')), device=device, dtype=dtype) + inf_real_imag_in = torch.tensor( + complex(float("inf"), float("inf")), device=device, dtype=dtype + ) inf_real_imag_out = torch.exp(inf_real_imag_in).item() - if self.device_type == 'cpu': + if self.device_type == "cpu": pass # This is incorrect. Need fix! https://github.com/pytorch/pytorch/issues/40590 # This is commented out because it cannot be consistently reproduced. @@ -1397,9 +1547,11 @@ def test_exp(self, device, dtype): self.assertTrue(math.isnan(inf_real_imag_out.imag)) self.compare_with_numpy(torch.exp, np.exp, inf_real_imag_in) - inf_real_nan_imag_in = torch.tensor(complex(float('inf'), float('nan')), device=device, dtype=dtype) + inf_real_nan_imag_in = torch.tensor( + complex(float("inf"), float("nan")), device=device, dtype=dtype + ) inf_real_nan_imag_out = torch.exp(inf_real_nan_imag_in).item() - if self.device_type == 'cpu': + if self.device_type == "cpu": pass # This is incorrect. It should be inf. Need fix! https://github.com/pytorch/pytorch/issues/40590 # This is commented out because it cannot be consistently reproduced. @@ -1410,7 +1562,9 @@ def test_exp(self, device, dtype): self.assertTrue(math.isnan(inf_real_nan_imag_out.imag)) self.compare_with_numpy(torch.exp, np.exp, inf_real_nan_imag_in) - nan_real_inf_imag_in = torch.tensor(complex(float('nan'), float('inf')), device=device, dtype=dtype) + nan_real_inf_imag_in = torch.tensor( + complex(float("nan"), float("inf")), device=device, dtype=dtype + ) nan_real_inf_imag_out = torch.exp(nan_real_inf_imag_in).item() self.assertTrue(math.isnan(nan_real_inf_imag_out.real)) self.assertTrue(math.isnan(nan_real_inf_imag_out.imag)) @@ -1420,5 +1574,5 @@ def test_exp(self, device, dtype): instantiate_device_type_tests(TestUnaryUfuncs, globals()) -if __name__ == '__main__': +if __name__ == "__main__": run_tests() diff --git a/test/test_utils.py b/test/test_utils.py index c8f4e3aa9453..65583bcbaf63 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1,9 +1,7 @@ -# Owner(s): ["high priority"] +# Owner(s): ["module: unknown"] import sys import os -import contextlib -import io import re import shutil import random @@ -18,10 +16,9 @@ import torch.cuda from torch.utils.checkpoint import checkpoint, checkpoint_sequential import torch.utils.cpp_extension -import torch.hub as hub from torch.autograd._functions.utils import check_onnx_broadcast from torch.onnx.symbolic_opset9 import _prepare_onnx_paddings -from torch.testing._internal.common_utils import has_breakpad, load_tests, retry, IS_SANDCASTLE, IS_WINDOWS, TEST_WITH_ASAN +from torch.testing._internal.common_utils import load_tests, IS_SANDCASTLE, IS_WINDOWS # load_tests from torch.testing._internal.common_utils is used to automatically filter tests for # sharding on sandcastle. 
This line silences flake warnings @@ -262,6 +259,19 @@ def run_fn(input): self.assertEqual(grad_with_checkpointing, grad_no_checkpointing) + @unittest.skipIf(not HAS_CUDA, 'No CUDA') + def test_checkpoint_not_preserve_rng_state_and_without_reentrant(self): + inp = torch.randn(2, device='cuda').requires_grad_() + layer = torch.nn.Dropout() + + def run_fn(input): + return layer(input) + + out = checkpoint(run_fn, inp, use_reentrant=False, preserve_rng_state=False) + out.sum().backward() + # This should run without error + + def test_checkpoint_non_tensor(self): def run_fn(tensor1, tensor2): @@ -411,12 +421,6 @@ def test_multi_drop(self): test_dir = os.path.abspath(os.path.dirname(str(__file__))) -class TestFFI(TestCase): - def test_deprecated(self): - with self.assertRaisesRegex(ImportError, "torch.utils.ffi is deprecated. Please use cpp extensions instead."): - from torch.utils.ffi import create_extension # type: ignore[attr-defined] # noqa: F401 - - @unittest.skipIf('SKIP_TEST_BOTTLENECK' in os.environ.keys(), 'SKIP_TEST_BOTTLENECK is set') class TestBottleneck(TestCase): def _run(self, command, timeout=30): @@ -584,146 +588,6 @@ def try_check_onnx_broadcast(dims1, dims2, expect_broadcast, expect_fail): try_check_onnx_broadcast(dims1, dims2, True, False) -def sum_of_state_dict(state_dict): - s = 0 - for _, v in state_dict.items(): - s += v.sum() - return s - -SUM_OF_HUB_EXAMPLE = 431080 -TORCHHUB_EXAMPLE_RELEASE_URL = 'https://github.com/ailzhang/torchhub_example/releases/download/0.1/mnist_init_ones' - -@unittest.skipIf(IS_SANDCASTLE, 'Sandcastle cannot ping external') -class TestHub(TestCase): - @retry(Exception, tries=3) - def test_load_from_github(self): - hub_model = hub.load( - 'ailzhang/torchhub_example', - 'mnist', - source='github', - pretrained=True, - verbose=False) - self.assertEqual(sum_of_state_dict(hub_model.state_dict()), - SUM_OF_HUB_EXAMPLE) - - @retry(Exception, tries=3) - def test_load_from_local_dir(self): - local_dir = hub._get_cache_or_reload( - 'ailzhang/torchhub_example', force_reload=False) - hub_model = hub.load( - local_dir, - 'mnist', - source='local', - pretrained=True, - verbose=False) - self.assertEqual(sum_of_state_dict(hub_model.state_dict()), - SUM_OF_HUB_EXAMPLE) - - @retry(Exception, tries=3) - def test_load_from_branch(self): - hub_model = hub.load( - 'ailzhang/torchhub_example:ci/test_slash', - 'mnist', - pretrained=True, - verbose=False) - self.assertEqual(sum_of_state_dict(hub_model.state_dict()), - SUM_OF_HUB_EXAMPLE) - - @retry(Exception, tries=3) - def test_set_dir(self): - temp_dir = tempfile.gettempdir() - hub.set_dir(temp_dir) - hub_model = hub.load( - 'ailzhang/torchhub_example', - 'mnist', - pretrained=True, - verbose=False) - self.assertEqual(sum_of_state_dict(hub_model.state_dict()), - SUM_OF_HUB_EXAMPLE) - assert os.path.exists(temp_dir + '/ailzhang_torchhub_example_master') - shutil.rmtree(temp_dir + '/ailzhang_torchhub_example_master') - - @retry(Exception, tries=3) - def test_list_entrypoints(self): - entry_lists = hub.list('ailzhang/torchhub_example', force_reload=True) - self.assertObjectIn('mnist', entry_lists) - - @retry(Exception, tries=3) - def test_download_url_to_file(self): - temp_file = os.path.join(tempfile.gettempdir(), 'temp') - hub.download_url_to_file(TORCHHUB_EXAMPLE_RELEASE_URL, temp_file, progress=False) - loaded_state = torch.load(temp_file) - self.assertEqual(sum_of_state_dict(loaded_state), - SUM_OF_HUB_EXAMPLE) - - @retry(Exception, tries=3) - def test_load_state_dict_from_url(self): - loaded_state = 
hub.load_state_dict_from_url(TORCHHUB_EXAMPLE_RELEASE_URL) - self.assertEqual(sum_of_state_dict(loaded_state), - SUM_OF_HUB_EXAMPLE) - - @retry(Exception, tries=3) - def test_load_zip_checkpoint(self): - hub_model = hub.load( - 'ailzhang/torchhub_example', - 'mnist_zip', - pretrained=True, - verbose=False) - self.assertEqual(sum_of_state_dict(hub_model.state_dict()), - SUM_OF_HUB_EXAMPLE) - - # Test the default zipfile serialization format produced by >=1.6 release. - @retry(Exception, tries=3) - def test_load_zip_1_6_checkpoint(self): - hub_model = hub.load( - 'ailzhang/torchhub_example', - 'mnist_zip_1_6', - pretrained=True, - verbose=False) - self.assertEqual(sum_of_state_dict(hub_model.state_dict()), - SUM_OF_HUB_EXAMPLE) - - - def test_hub_dir(self): - with tempfile.TemporaryDirectory('hub_dir') as dirname: - torch.hub.set_dir(dirname) - self.assertEqual(torch.hub.get_dir(), dirname) - - @retry(Exception, tries=3) - def test_hub_parse_repo_info(self): - # If the branch is specified we just parse the input and return - self.assertEqual( - torch.hub._parse_repo_info('a/b:c'), - ('a', 'b', 'c') - ) - # For torchvision, the default branch is main - self.assertEqual( - torch.hub._parse_repo_info('pytorch/vision'), - ('pytorch', 'vision', 'main') - ) - # For the torchhub_example repo, the default branch is still master - self.assertEqual( - torch.hub._parse_repo_info('ailzhang/torchhub_example'), - ('ailzhang', 'torchhub_example', 'master') - ) - - @retry(Exception, tries=3) - def test_load_state_dict_from_url_with_name(self): - with tempfile.TemporaryDirectory('hub_dir') as dirname: - torch.hub.set_dir(dirname) - file_name = 'test_file' - loaded_state = hub.load_state_dict_from_url(TORCHHUB_EXAMPLE_RELEASE_URL, file_name=file_name) - self.assertTrue(os.path.exists(os.path.join(dirname, 'checkpoints', file_name))) - self.assertEqual(sum_of_state_dict(loaded_state), - SUM_OF_HUB_EXAMPLE) - - @retry(Exception, tries=3) - def test_load_commit_from_forked_repo(self): - with self.assertRaisesRegex( - ValueError, - 'If it\'s a commit from a forked repo'): - model = torch.hub.load('pytorch/vision:4e2c216', 'resnet18', force_reload=True) - class TestHipify(TestCase): def test_import_hipify(self): from torch.utils.hipify import hipify_python # noqa: F401 @@ -757,32 +621,6 @@ def forward(self, x): ms(torch.tensor([False], dtype=torch.bool)) -class TestCrashHandler(TestCase): - @unittest.skipIf(TEST_WITH_ASAN, "ASAN disables the crash handler's signal handler") - @unittest.skipIf(not has_breakpad(), "Built without breakpad") - def test_python_exception_writing(self): - with tempfile.TemporaryDirectory() as temp_dir: - torch.utils._crash_handler.enable_minidumps(temp_dir) - torch.utils._crash_handler.enable_minidumps_on_exceptions() - - files = os.listdir(temp_dir) - self.assertEqual(len(files), 0) - - f = io.StringIO() - with contextlib.redirect_stderr(f): - try: - @torch.jit.script - def x(i: int): - return i + "2" # type: ignore[operator] - except RuntimeError as e: - pass - - files = os.listdir(temp_dir) - self.assertEqual(len(files), 1) - self.assertTrue(files[0].endswith(".dmp")) - torch.utils._crash_handler.disable_minidumps() - - @unittest.skipIf(IS_SANDCASTLE, "cpp_extension is OSS only") class TestStandaloneCPPJIT(TestCase): def test_load_standalone(self): diff --git a/test/test_view_ops.py b/test/test_view_ops.py index 2678db1d74d5..424a31e61d24 100644 --- a/test/test_view_ops.py +++ b/test/test_view_ops.py @@ -11,12 +11,12 @@ from torch.testing import make_tensor from 
torch.testing._internal.common_utils import ( TestCase, run_tests, suppress_warnings, gradcheck, gradgradcheck, - torch_to_numpy_dtype_dict, + numpy_to_torch_dtype_dict, ) from torch.testing._internal.common_device_type import \ - (instantiate_device_type_tests, onlyCPU, dtypes, onlyNativeDeviceTypes) + (instantiate_device_type_tests, onlyCPU, dtypes, onlyNativeDeviceTypes, skipMeta) from torch.testing._internal.common_dtype import ( - get_all_dtypes, get_all_int_dtypes, get_all_fp_dtypes, get_all_complex_dtypes + all_types_and_complex_and, complex_types, all_types_and, floating_and_complex_types_and, ) # TODO: replace this with make_tensor() in common_utils.py @@ -121,26 +121,26 @@ def _do_transpose(self, x, contiguous=False, dim0=0, dim1=1): else: return x.transpose(dim0, dim1) - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) + @dtypes(*all_types_and(torch.half, torch.bfloat16)) def test_conj_self(self, device, dtype): t = torch.ones(5, 5, device=device) s = t.conj() self.assertTrue(s is t) @onlyNativeDeviceTypes - @dtypes(*get_all_dtypes(include_bfloat16=False)) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool)) def test_view_dtype_new(self, device, dtype): - dtypes = torch_to_numpy_dtype_dict.copy() + dtypes = {value : key for (key, value) in numpy_to_torch_dtype_dict.items()} del dtypes[torch.bool] def generate_inputs(): - yield make_tensor((4, 4, 64), device, dtype, low=-5, high=5) - yield make_tensor((4, 4, 64), device, dtype, low=-5, high=5).permute(1, 0, 2) - yield make_tensor((4, 64, 4), device, dtype, low=-5, high=5).permute(2, 0, 1) - yield make_tensor((1, 5, 1), device, dtype, low=-5, high=5).expand(5, 5, 64) - yield make_tensor((2, 5, 256), device, dtype, low=-5, high=5)[1::2, 1:, ::2] - yield make_tensor((0, 5, 64), device, dtype, low=-5, high=5) - yield make_tensor((), device, dtype, low=-5, high=5) + yield make_tensor((4, 4, 64), dtype=dtype, device=device, low=-5, high=5) + yield make_tensor((4, 4, 64), dtype=dtype, device=device, low=-5, high=5).permute(1, 0, 2) + yield make_tensor((4, 64, 4), dtype=dtype, device=device, low=-5, high=5).permute(2, 0, 1) + yield make_tensor((1, 5, 1), dtype=dtype, device=device, low=-5, high=5).expand(5, 5, 64) + yield make_tensor((2, 5, 256), dtype=dtype, device=device, low=-5, high=5)[1::2, 1:, ::2] + yield make_tensor((0, 5, 64), dtype=dtype, device=device, low=-5, high=5) + yield make_tensor((), dtype=dtype, device=device, low=-5, high=5) def calc_expected_size_and_stride(a, view_dtype): dtype_size = torch._utils._element_size(a.dtype) @@ -210,24 +210,24 @@ def calc_expected_size_and_stride(a, view_dtype): # because view(dtype) does not support backward yet # TODO: Remove this when autograd support is added if dtype.is_floating_point or dtype.is_complex: - for view_dtype in [*get_all_fp_dtypes(), *get_all_complex_dtypes()]: - t = make_tensor((5, 5, 64), device, dtype, low=-5, high=5, requires_grad=True) + for view_dtype in floating_and_complex_types_and(torch.half, torch.bfloat16): + t = make_tensor((5, 5, 64), dtype=dtype, device=device, low=-5, high=5, requires_grad=True) self.assertFalse(t.view(view_dtype).requires_grad) # Test the extra error checks that happen when the view dtype # has a greater element size than the original dtype @onlyNativeDeviceTypes - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) def test_view_dtype_upsize_errors(self, device, dtype): dtype_size = torch._utils._element_size(dtype) - for view_dtype in get_all_dtypes(): + for 
view_dtype in all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool): view_dtype_size = torch._utils._element_size(view_dtype) if view_dtype_size <= dtype_size: continue size_ratio = view_dtype_size // dtype_size - a = make_tensor((4, 4, size_ratio + 1), device, dtype, low=-5, high=5) + a = make_tensor((4, 4, size_ratio + 1), dtype=dtype, device=device, low=-5, high=5) with self.assertRaisesRegex( RuntimeError, rf"self.size\(-1\) must be divisible by {size_ratio}"): @@ -238,7 +238,7 @@ def test_view_dtype_upsize_errors(self, device, dtype): rf"self.storage_offset\(\) must be divisible by {size_ratio}"): a[:, :, 1:].view(view_dtype) - a = make_tensor((4, 4, size_ratio), device, dtype, low=-5, high=5) + a = make_tensor((4, 4, size_ratio), dtype=dtype, device=device, low=-5, high=5) a = a.as_strided((4, 4, size_ratio), (size_ratio, 1, 1)) with self.assertRaisesRegex( RuntimeError, @@ -302,7 +302,7 @@ def fn(contiguous_input=True, dim0=0, dim1=1): self.assertEqual(res.shape, torch.Size([0])) @onlyNativeDeviceTypes - @dtypes(*get_all_complex_dtypes(include_complex32=True)) + @dtypes(*complex_types(), torch.complex32) def test_view_as_real(self, device, dtype): def fn(contiguous_input=True): t = torch.randn(3, 4, dtype=dtype, device=device) @@ -310,11 +310,7 @@ def fn(contiguous_input=True): res = torch.view_as_real(input) self.assertEqual(res[:, :, 0], input.real) self.assertEqual(res[:, :, 1], input.imag) - # TODO: Add torch.ComplexHalfStorage - if dtype != torch.complex32: - self.assertTrue(self.is_view_of(t, res)) - else: - self.assertRaises(RuntimeError, lambda: self.is_view_of(t, res)) + self.assertTrue(self.is_view_of(t, res)) fn() fn(contiguous_input=False) @@ -322,27 +318,19 @@ def fn(contiguous_input=True): # tensor with zero elements x = torch.tensor([], dtype=dtype, device=device) res = torch.view_as_real(x) - # TODO: Add torch.ComplexHalfStorage - if dtype != torch.complex32: - self.assertTrue(self.is_view_of(x, res)) - else: - self.assertRaises(RuntimeError, lambda: self.is_view_of(x, res)) + self.assertTrue(self.is_view_of(x, res)) self.assertEqual(res.shape, torch.Size([0, 2])) # tensor with zero dim x = torch.tensor(2 + 3j, dtype=dtype, device=device) res = torch.view_as_real(x) - # TODO: Add torch.ComplexHalfStorage - if dtype != torch.complex32: - self.assertTrue(self.is_view_of(x, res)) - else: - self.assertRaises(RuntimeError, lambda: self.is_view_of(x, res)) + self.assertTrue(self.is_view_of(x, res)) self.assertEqual(res.shape, torch.Size([2])) @onlyNativeDeviceTypes - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) def test_view_tensor_split(self, device, dtype): - a = make_tensor((40, 30), device, dtype, low=-9, high=9) + a = make_tensor((40, 30), dtype=dtype, device=device, low=-9, high=9) a_split_dim0 = a.tensor_split(7, 0) for a_split_dim0_tensor in a_split_dim0: self.assertTrue(self.is_view_of(a, a_split_dim0_tensor)) @@ -351,9 +339,9 @@ def test_view_tensor_split(self, device, dtype): self.assertTrue(self.is_view_of(a, a_split_dim1_tensor)) @onlyNativeDeviceTypes - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) def test_view_tensor_hsplit(self, device, dtype): - t = make_tensor((4, 4, 4), device, dtype, low=-9, high=9) + t = make_tensor((4, 4, 4), dtype=dtype, device=device, low=-9, high=9) t_hsplit = torch.hsplit(t, 2) for t_hsplit_tensor in t_hsplit: self.assertTrue(self.is_view_of(t, t_hsplit_tensor)) @@ -361,9 +349,9 @@ def 
test_view_tensor_hsplit(self, device, dtype): self.assertEqual(t_hsplit[1][2, 0, 2], t[2, 2, 2]) @onlyNativeDeviceTypes - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) def test_view_tensor_vsplit(self, device, dtype): - t = make_tensor((4, 4, 4), device, dtype, low=-9, high=9) + t = make_tensor((4, 4, 4), dtype=dtype, device=device, low=-9, high=9) t_vsplit = torch.vsplit(t, 2) for t_vsplit_tensor in t_vsplit: self.assertTrue(self.is_view_of(t, t_vsplit_tensor)) @@ -371,9 +359,9 @@ def test_view_tensor_vsplit(self, device, dtype): self.assertEqual(t_vsplit[1][0, 2, 2], t[2, 2, 2]) @onlyNativeDeviceTypes - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) def test_view_tensor_dsplit(self, device, dtype): - t = make_tensor((4, 4, 4), device, dtype, low=-9, high=9) + t = make_tensor((4, 4, 4), dtype=dtype, device=device, low=-9, high=9) t_dsplit = torch.dsplit(t, 2) for t_dsplit_tensor in t_dsplit: self.assertTrue(self.is_view_of(t, t_dsplit_tensor)) @@ -381,7 +369,7 @@ def test_view_tensor_dsplit(self, device, dtype): self.assertEqual(t_dsplit[1][2, 2, 0], t[2, 2, 2]) @onlyNativeDeviceTypes - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) + @dtypes(*all_types_and(torch.half, torch.bfloat16)) def test_imag_noncomplex(self, device, dtype): t = torch.ones((5, 5), dtype=dtype, device=device) @@ -389,7 +377,7 @@ def test_imag_noncomplex(self, device, dtype): torch.imag(t) @onlyNativeDeviceTypes - @dtypes(*get_all_complex_dtypes()) + @dtypes(*complex_types()) def test_real_imag_view(self, device, dtype): def compare_with_numpy(contiguous_input=True): t = torch.randn(3, 3, dtype=dtype, device=device) @@ -420,7 +408,7 @@ def compare_with_numpy(contiguous_input=True): self.assertEqual(a[5:].imag, a.imag[5:]) @onlyNativeDeviceTypes - @dtypes(*get_all_complex_dtypes()) + @dtypes(*complex_types()) def test_conj_imag_view(self, device, dtype) -> None: t = _make_tensor((4, 5,), dtype, device) t_numpy_conj = torch.from_numpy(t.cpu().numpy().conj()).to(device=device) @@ -445,7 +433,7 @@ def test_conj_view_with_shared_memory(self, device) -> None: self.assertEqual(torch.add(b, c), b.add_(c)) @onlyNativeDeviceTypes - @dtypes(*product(get_all_complex_dtypes(), get_all_dtypes())) + @dtypes(*product(complex_types(), all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool))) @suppress_warnings def test_set_real_imag(self, device, dtypes): x = torch.randn(10, dtype=dtypes[0], device=device) @@ -729,6 +717,7 @@ def test_contiguous_self(self, device): s = t.contiguous() self.assertTrue(s is t) + @skipMeta def test_contiguous_nonview(self, device): t = torch.ones(5, 5, device=device) nv = t.t().contiguous() @@ -754,6 +743,7 @@ def test_reshape_as_view(self, device): v[6] = 0 self.assertEqual(t[1, 1], v[6]) + @skipMeta def test_reshape_nonview(self, device): t = torch.ones(5, 5, device=device) nv = torch.reshape(t.t(), (25,)) @@ -806,7 +796,8 @@ def assert_is_nonview(t, nv): idx_nv = (0,) * nv.ndim self.assertTrue(not nv._is_view()) nv[idx_nv] = 0 - self.assertNotEqual(t[idx_t], nv[idx_nv]) + if device != "meta": + self.assertNotEqual(t[idx_t], nv[idx_nv]) t = torch.ones(2, 3, 2, 3, device=device).transpose(2, 3) nv = t.flatten(1, 3) assert_is_nonview(t, nv) @@ -905,6 +896,43 @@ def run_test(device, op): op = partial(fn, source=0, destination=1) run_test(device, op) + # Testing that the generated view_copy kernel and its derivative are implemented correctly + def test_view_copy(self, 
device): + a = torch.randn(4, device=device, requires_grad=True) + a_ref = a.clone().detach().requires_grad_() + a_view = a_ref.view(2, 2) + a_view_copy = torch.view_copy(a, (2, 2)) + + # view_copy ops don't preserve view relationship + self.assertTrue(self.is_view_of(a_ref, a_view)) + self.assertFalse(self.is_view_of(a, a_view_copy)) + + a_view_copy.sum().backward() + a_view.sum().backward() + + # forward and backward give the same shape + result + self.assertEqual(a_view_copy, a_view) + self.assertEqual(a.grad, a_ref.grad) + + def test_view_copy_out(self, device): + a = torch.randn(2, 2, device=device) + out = torch.empty(2, device=device) + + torch.diagonal_copy(a, out=out) + expected = torch.diagonal_copy(a) + + self.assertEqual(expected, out) + + a = torch.randn(4, device=device) + out1 = torch.empty(2, device=device) + out2 = torch.empty(2, device=device) + + torch.split_copy(a, 2, out=(out1, out2)) + expected1, expected2 = torch.split_copy(a, 2) + + self.assertEqual(expected1, out1) + self.assertEqual(expected2, out2) + class TestOldViewOps(TestCase): def test_ravel(self, device): @@ -914,29 +942,38 @@ def _test_ravel(tensors, size, nc=False): flat = src.ravel() self.assertEqual(flat.shape, torch.Size([size])) self.assertEqual(src.view(-1), flat) - self.assertEqual(flat._base, src) + self.assertIs(flat._base, src) + self.assertTrue(flat.is_contiguous()) # Non-contiguous Tensor -> Copy if nc: nc_src = src.t() nc_flat = nc_src.ravel() self.assertEqual(nc_flat.shape, torch.Size([size])) - self.assertEqual(nc_src.reshape(-1), nc_flat) - self.assertTrue(nc_flat._base != nc_src) + self.assertEqual(nc_src.contiguous().view(-1), nc_flat) + self.assertIsNot(nc_flat._base, src) + self.assertTrue(nc_flat.is_contiguous()) # Test that flatten returns 1-dim tensor when given a 0-dim tensor zero_dim_tensor = torch.tensor(123, device=device) flat0 = zero_dim_tensor.ravel() one_dim_tensor = torch.tensor([123], device=device) flat1 = zero_dim_tensor.ravel() + nc_ones_tensor = torch.ones(10, device=device)[::2] + flat2 = nc_ones_tensor.ravel() self.assertEqual(zero_dim_tensor.shape, torch.Size([])) self.assertEqual(flat0.shape, torch.Size([1])) self.assertEqual(one_dim_tensor.shape, torch.Size([1])) self.assertEqual(flat1.shape, torch.Size([1])) + self.assertEqual(nc_ones_tensor.shape, torch.Size([5])) + self.assertEqual(flat2.shape, torch.Size([5])) self.assertEqual(flat0, one_dim_tensor) self.assertEqual(flat0, flat1) self.assertEqual(flat0.shape, flat1.shape) + self.assertTrue(flat0.is_contiguous()) + self.assertTrue(flat1.is_contiguous()) + self.assertTrue(flat2.is_contiguous()) # Test both float tensor and quantized tensor tensors = [torch.randn(5, 5, 5, 5, device=device), @@ -1027,7 +1064,9 @@ def test_reshape(self, device): self.assertRaises(RuntimeError, lambda: x.reshape(-1, -1)) y = torch.randn(4, 4, 4, device=device)[:, 0, :] - self.assertNotEqual(y.data_ptr(), y.reshape(-1).data_ptr()) + # .data_ptr() on meta tensors is always 0 so they are equal regardless of the reshape + if device != "meta": + self.assertNotEqual(y.data_ptr(), y.reshape(-1).data_ptr()) self.assertEqual(y.contiguous().view(-1), y.reshape(-1)) self.assertEqual(y.reshape(2, 2, 4).data_ptr(), y.data_ptr()) @@ -1250,7 +1289,7 @@ def test_T(self, device): scalar = torch.tensor(5, device=device) self.assertEqual(scalar, scalar.T) - @dtypes(*(torch.testing.get_all_dtypes())) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) def test_transposes(self, device, dtype): for op in ("T", "H", "mT", "mH",
"adjoint"): shapes = ((), (2, 3), (2, 3, 4)) if op[0] == "m" or op == "adjoint" else ((), (2, 3),) @@ -1266,7 +1305,7 @@ def test_transposes(self, device, dtype): t2 = t2.conj() self.assertEqual(t2, t1) - @dtypes(*(torch.testing.get_all_dtypes())) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) def test_transposes_errors(self, device, dtype): for op in ("H", "mT", "mH", "adjoint"): shapes = ((2,), (2, 3, 4)) if op == "H" else ((2,),) @@ -1392,8 +1431,7 @@ def _test_atleast_dim(self, torch_fn, np_fn, device, dtype): self.assertEqual(np_res, torch_res) # TODO: are these view ops? - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False) + - get_all_complex_dtypes())) + @dtypes(*all_types_and_complex_and(torch.half)) def test_atleast(self, device, dtype): self._test_atleast_dim(torch.atleast_1d, np.atleast_1d, device, dtype) self._test_atleast_dim(torch.atleast_2d, np.atleast_2d, device, dtype) @@ -1457,8 +1495,80 @@ def test_broadcast_shapes(self, device): actual = torch.broadcast_shapes(s0, s1) self.assertEqual(expected, actual) + inputs_list = [[1, 4], [4, 1], [1, 1, 3]] + for integral_inputs in inputs_list: + res1 = torch.broadcast_shapes(*integral_inputs) + res2 = torch.broadcast_tensors(*map(torch.empty, integral_inputs))[0].shape + self.assertEqual(res1, res2) + + inputs_with_neg_vals = [[1, 1, -12], [-1, 1], [-11, ]] + for integral_inputs_with_neg_vals in inputs_with_neg_vals: + with self.assertRaisesRegex(RuntimeError, "Trying to create tensor with negative dimension"): + torch.broadcast_shapes(*integral_inputs_with_neg_vals) + + integral_inputs_error_case = [(3, 5), (2, 4, 1)] + for error_input in integral_inputs_error_case: + with self.assertRaisesRegex(RuntimeError, "Shape mismatch: objects cannot be broadcast to a single shape"): + torch.broadcast_shapes(*error_input) + + negative_inputs = [(-1,), (1, -12), (4, -11), (-4, 1), (1, 1, -2)] + for s0 in negative_inputs: + with self.assertRaisesRegex(RuntimeError, "Trying to create tensor with negative dimension"): + torch.broadcast_shapes(s0) + + for s1 in negative_inputs: + with self.assertRaisesRegex(RuntimeError, "Trying to create tensor with negative dimension"): + torch.broadcast_shapes(s0, s1) + + float_inputs_error_case = [(1.1, 2.0), (1.1, 1.0)] + for error_case in float_inputs_error_case: + for float_input in error_case: + with self.assertRaisesRegex(RuntimeError, "Input shapes " + "should be of type ints, a tuple of ints, or a list of ints"): + torch.broadcast_shapes(float_input) + + diff_input_types = [(1, (5,)), (3, (1,)), (1, (3, 4))] + for s0 in diff_input_types: + res1 = torch.broadcast_shapes(*s0) + res2 = torch.broadcast_tensors(*map(torch.empty, s0))[0].shape + self.assertEqual(res1, res2) + + @unittest.skipIf(np.__version__ < '1.20', + "NumPy does not support broadcast_shapes before the 1.20 version") + @onlyCPU + def test_broadcast_shapes_numpy_ref(self, device): + examples = [(), (1,), (2,), (1, 1), (3, 1), (3, 2), (4, 1, 1), (4, 3, 2)] + for s0 in examples: + x0 = torch.randn(s0) + actual = torch.broadcast_shapes(s0) + numpy_expected = np.broadcast_shapes(s0) + self.assertEqual(actual, numpy_expected) + + for s1 in examples: + x1 = torch.randn(s1) + actual = torch.broadcast_shapes(s0, s1) + numpy_expected = np.broadcast_shapes(s0, s1) + self.assertEqual(actual, numpy_expected) + + inputs_list = [[1, 4], [4, 1], [1, 1, 3]] + for integral_inputs in inputs_list: + res1 = torch.broadcast_shapes(*integral_inputs) + res2_numpy = np.broadcast_shapes(*integral_inputs) + 
self.assertEqual(res1, res2_numpy) + + for list_inputs in inputs_list: + res1 = torch.broadcast_shapes(list_inputs) + res2 = np.broadcast_shapes(list_inputs) + self.assertEqual(res1, res2) + + diff_input_types = [(1, (5,)), (3, (1,)), (1, (3, 4))] + for s0 in diff_input_types: + res1 = torch.broadcast_shapes(*s0) + res2_numpy = np.broadcast_shapes(*s0) + self.assertEqual(res1, res2_numpy) + # Skip BFloat16 since numpy does not support it - @dtypes(*get_all_dtypes(include_bfloat16=False)) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool)) def test_broadcast_to(self, device, dtype): def can_broadcast(s0, s1): # s0.dim() <= s1.dim(), reverse s0 and s1 to compare trailing dimension @@ -1473,7 +1583,7 @@ def can_broadcast(s0, s1): (), (1,), (2,), (1, 1), (3, 1), (3, 2), (4, 1, 1), (4, 3, 2) ) for s0, s1 in combinations(sizes, r=2): - t = make_tensor(s0, device, dtype, low=-9, high=9) + t = make_tensor(s0, dtype=dtype, device=device, low=-9, high=9) t_np = t.cpu().numpy() if can_broadcast(s0, s1): @@ -1561,9 +1671,9 @@ def test_view(self, device): self.assertEqual(tensor.view(6, 2, 1), contig_tensor.view(6, 2, 1)) self.assertEqual(tensor.view(1, 6, 2, 1), contig_tensor.view(1, 6, 2, 1)) - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) def test_reshape_view_semantics(self, device, dtype): - tensor = make_tensor((15, 4), device, dtype) + tensor = make_tensor((15, 4), dtype=dtype, device=device) target = (20, 3) # Cases where the tensor can be returned as a view. @@ -1588,7 +1698,7 @@ def test_contiguous(self, device): @onlyNativeDeviceTypes # Skip BFloat16 since numpy does not support it - @dtypes(*get_all_dtypes(include_bfloat16=False)) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool)) def test_tensor_split_sections(self, device, dtype): input_sizes = [ (0,), @@ -1599,7 +1709,7 @@ def test_tensor_split_sections(self, device, dtype): (12, 3), ] for input_size in input_sizes: - a_base = make_tensor(input_size, device, dtype, low=-9, high=9) + a_base = make_tensor(input_size, dtype=dtype, device=device, low=-9, high=9) # Run tests on transposed input if it has at least 2 dims for a in [a_base, a_base.t()] if a_base.dim() > 2 else [a_base]: a_n = a.cpu().numpy() @@ -1619,7 +1729,7 @@ def test_tensor_split_sections(self, device, dtype): @onlyNativeDeviceTypes # Skip BFloat16 since numpy does not support it - @dtypes(*get_all_dtypes(include_bfloat16=False)) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool)) def test_tensor_split_indices(self, device, dtype): input_sizes = [ (0,), @@ -1642,7 +1752,7 @@ def test_tensor_split_indices(self, device, dtype): (1, 5, 2, 8), ] for input_size in input_sizes: - a_base = make_tensor(input_size, device, dtype, low=-9, high=9) + a_base = make_tensor(input_size, dtype=dtype, device=device, low=-9, high=9) # Run tests on transposed input if it has at least 2 dims for a in [a_base, a_base.t()] if a_base.dim() > 2 else [a_base]: a_n = a.cpu().numpy() @@ -1698,20 +1808,28 @@ def test_tensor_split_errors(self, device): def test_resize_all_dtypes_and_devices(self, device): shape = (2, 2) - for dt in get_all_dtypes(): + for dt in all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool): x = torch.tensor([[1, 2], [3, 4], [5, 6]], dtype=dt, device=device) x.resize_(shape) self.assertEqual(shape, x.shape) def test_resize_as_all_dtypes_and_devices(self, device): - for dt in get_all_dtypes(): + for dt in all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool): x = 
torch.tensor([[1, 2], [3, 4], [5, 6]], dtype=dt, device=device) y = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=dt, device=device) x.resize_as_(y) self.assertEqual(y.shape, x.shape) + @onlyNativeDeviceTypes + def test_resize_overflow(self, device): + x = torch.empty((), dtype=torch.float64) + with self.assertRaisesRegex(RuntimeError, 'Storage size calculation overflowed'): + x.resize_([2, 4, 2**29, 2**29]) + with self.assertRaisesRegex(RuntimeError, 'overflow'): + x.resize_([8, 8, 2**29, 2**29]) + def test_view_all_dtypes_and_devices(self, device): - for dt in get_all_dtypes(): + for dt in all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool): x = torch.tensor([[1, 2], [3, 4], [5, 6]], dtype=dt, device=device) self.assertEqual(x.view(6).shape, [6]) diff --git a/test/test_xnnpack_integration.py b/test/test_xnnpack_integration.py index 38ee3bbfdfa4..9e510d1715b1 100644 --- a/test/test_xnnpack_integration.py +++ b/test/test_xnnpack_integration.py @@ -21,6 +21,7 @@ " Please build with USE_XNNPACK=1.") @unittest.skipIf(TEST_WITH_TSAN, "TSAN fails with XNNPACK. Does not seem to have a good reason for failures.") class TestXNNPACKOps(TestCase): + @unittest.skip("Fails on some platforms, see https://github.com/pytorch/pytorch/issues/73488") @given(batch_size=st.integers(0, 3), data_shape=hu.array_shapes(1, 3, 2, 64), weight_output_dim=st.integers(2, 64), @@ -53,7 +54,6 @@ def test_linear_1d_input(self, input_size, weight_output_dim, use_bias): output_linearprepacked = torch.ops.prepacked.linear_clamp_run(input_data, packed_weight_bias) torch.testing.assert_close(ref_result, output_linearprepacked, rtol=1e-2, atol=1e-3) - @given(batch_size=st.integers(0, 3), input_channels_per_group=st.integers(1, 32), height=st.integers(5, 64), @@ -183,6 +183,7 @@ def test_conv2d_transpose(self, " Please build with USE_XNNPACK=1.") @unittest.skipIf(TEST_WITH_TSAN, "TSAN fails with XNNPACK. 
Does not seem to have a good reason for failures.") class TestXNNPACKSerDes(TestCase): + @unittest.skip("Fails on some platforms, see https://github.com/pytorch/pytorch/issues/73488") @given(batch_size=st.integers(0, 3), data_shape=hu.array_shapes(1, 3, 2, 64), weight_output_dim=st.integers(2, 64), @@ -437,6 +438,7 @@ def forward(self, x): xnnpack_result = deserialized_conv2d_clamp_prepacked(input_data) torch.testing.assert_close(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3) + @unittest.skip("Fails on some platforms, see https://github.com/pytorch/pytorch/issues/73488") @given(batch_size=st.integers(0, 3), input_channels_per_group=st.integers(1, 32), height=st.integers(5, 64), diff --git a/test/typing/reveal/namedtuple.py b/test/typing/reveal/namedtuple.py index 8a0508b325c5..2e130338f0b9 100644 --- a/test/typing/reveal/namedtuple.py +++ b/test/typing/reveal/namedtuple.py @@ -7,9 +7,9 @@ t_sort[0][0, 0] == 1.5 # noqa: B015 t_sort.indices[0, 0] == 1 # noqa: B015 t_sort.values[0, 0] == 1.5 # noqa: B015 -reveal_type(t_sort) # E: Tuple[{Tensor}, {Tensor}, fallback=torch._C.namedtuple_values_indices] +reveal_type(t_sort) # E: Tuple[{Tensor}, {Tensor}, fallback=torch.return_types.sort] t_qr = torch.linalg.qr(t) t_qr[0].shape == [2, 2] # noqa: B015 t_qr.Q.shape == [2, 2] # noqa: B015 -reveal_type(t_qr) # E: Tuple[{Tensor}, {Tensor}, fallback=torch._C._VariableFunctions.namedtuple_Q_R] +reveal_type(t_qr) # E: Tuple[{Tensor}, {Tensor}, fallback=torch.return_types.qr] diff --git a/third_party/BUILD.buck b/third_party/BUILD.buck new file mode 100644 index 000000000000..cb28c744b54a --- /dev/null +++ b/third_party/BUILD.buck @@ -0,0 +1,221 @@ +load("//third_party:glog.buck.bzl", "define_glog") +load("//third_party:xnnpack.buck.bzl", "define_xnnpack") + +define_glog() + +define_xnnpack() + +cxx_library( + name = "fmt", + srcs = ['fmt/src/format.cc'], + deps = [], + compiler_flags = ['-w', '-Wno-error=format-zero-length', '-Wno-error=vla', '-Wno-incompatible-pointer-types-discards-qualifiers', '-Wno-unused-label', '-Wno-deprecated-declarations', '-Wno-implicit-function-declaration', '-Wno-error', '-Wno-non-pod-varargs', '-Wno-format-security', '-Wno-c++11-narrowing', '-Wno-ignored-attributes', '-Wno-return-std-move', '-Wno-shadow', '-Wno-sign-compare', '-Wno-switch', '-Wno-undef', '-Wno-uninitialized', '-Wno-unknown-pragmas', '-Wno-unknown-warning-option', '-Wno-unused-function', '-Wno-unused-local-typedef', '-Wno-unused-value', '-Wno-unused-variable', '-Wno-register', '-Wno-format', '-Wno-unused-lambda-capture', '-Wno-missing-braces', '-Wno-unused-parameter', '-Wno-unreachable-code', '-Wno-inconsistent-missing-destructor-override', '-Wno-implicit-fallthrough', '-Wno-ignored-qualifiers', '-Wno-pedantic', '-Wno-deprecated-copy', '-Wno-non-virtual-dtor', '-Wno-null-pointer-arithmetic', '-Wno-implicit-const-int-float-conversion', '-Wno-tautological-unsigned-enum-zero-compare', '-Wno-embedded-directive', '-Wno-int-conversion', '-Wno-nonnull', '-Wno-variadic-macros', '-Wno-zero-length-array', '-Wno-missing-prototypes', '-fno-exceptions', '-fno-rtti', '-Wno-braced-scalar-init', '-fvisibility-inlines-hidden'], + preferred_linkage = "static", + exported_preprocessor_flags = ['-DFMT_EXCEPTIONS=0'], + header_namespace = "third_party/fmt", + public_system_include_directories = ['fmt/include'], + raw_headers = glob(["fmt/include/fmt/*.h"]), + soname = "libthird-party_fmt_fmt.$(ext)", + visibility = ['PUBLIC'], +) + +cxx_library( + name = "pthreadpool", + srcs = ['pthreadpool/src/legacy-api.c', 
'pthreadpool/src/memory.c', 'pthreadpool/src/portable-api.c', 'pthreadpool/src/pthreads.c'], + deps = [ + ":FXdiv", + ":pthreadpool_header", + ], + compiler_flags = [ + "-w", + "-Os", + "-fstack-protector-strong", + "-fno-delete-null-pointer-checks" + ], + headers = { + 'threadpool-atomics.h': 'pthreadpool/src/threadpool-atomics.h', + 'threadpool-common.h': 'pthreadpool/src/threadpool-common.h', + 'threadpool-object.h': 'pthreadpool/src/threadpool-object.h', + 'threadpool-utils.h': 'pthreadpool/src/threadpool-utils.h', + }, + header_namespace = "", + preferred_linkage = "static", + link_whole = False, + platform_preprocessor_flags = [['windows', ['-D_WINDOWS', '-D_WIN32', '-DWIN32', '-DNOMINMAX', '-D_CRT_SECURE_NO_WARNINGS', '-D_USE_MATH_DEFINES']], ['windows.*64$', ['-D_WIN64']]], + preprocessor_flags = ['-DPTHREADPOOL_USE_FUTEX=0', '-DPTHREADPOOL_USE_GCD=0'], + reexport_all_header_dependencies = True, + visibility = ['PUBLIC'], +) + +cxx_library( + name = "pthreadpool_header", + header_namespace = "", + exported_headers = { + "pthreadpool.h": "pthreadpool/include/pthreadpool.h", + }, + reexport_all_header_dependencies = True, + visibility = ["PUBLIC"], +) + +cxx_library( + name = "FXdiv", + header_namespace = "", + exported_headers = { + "fxdiv.h": "FXdiv/include/fxdiv.h", + }, + reexport_all_header_dependencies = True, + visibility = ["PUBLIC"], +) + +cxx_library( + name = "psimd", + header_namespace = "", + exported_headers = { + "psimd.h": "psimd/include/psimd.h", + }, + preferred_linkage = "static", + visibility = ["PUBLIC"], +) + +cxx_library( + name = "cpuinfo", + srcs = [ + "cpuinfo/wrappers/api.c", + "cpuinfo/wrappers/arm/android/properties.c", + "cpuinfo/wrappers/arm/cache.c", + "cpuinfo/wrappers/arm/linux/aarch32-isa.c", + "cpuinfo/wrappers/arm/linux/aarch64-isa.c", + "cpuinfo/wrappers/arm/linux/chipset.c", + "cpuinfo/wrappers/arm/linux/clusters.c", + "cpuinfo/wrappers/arm/linux/cpuinfo.c", + "cpuinfo/wrappers/arm/linux/hwcap.c", + "cpuinfo/wrappers/arm/linux/init.c", + "cpuinfo/wrappers/arm/linux/midr.c", + "cpuinfo/wrappers/arm/mach/init.c", + "cpuinfo/wrappers/arm/uarch.c", + "cpuinfo/wrappers/cache.c", + "cpuinfo/wrappers/init.c", + "cpuinfo/wrappers/linux/cpulist.c", + "cpuinfo/wrappers/linux/multiline.c", + "cpuinfo/wrappers/linux/processors.c", + "cpuinfo/wrappers/linux/smallfile.c", + "cpuinfo/wrappers/mach/topology.c", + "cpuinfo/wrappers/x86/cache/descriptor.c", + "cpuinfo/wrappers/x86/cache/deterministic.c", + "cpuinfo/wrappers/x86/cache/init.c", + "cpuinfo/wrappers/x86/info.c", + "cpuinfo/wrappers/x86/init.c", + "cpuinfo/wrappers/x86/isa.c", + "cpuinfo/wrappers/x86/linux/cpuinfo.c", + "cpuinfo/wrappers/x86/linux/init.c", + "cpuinfo/wrappers/x86/mach/init.c", + "cpuinfo/wrappers/x86/name.c", + "cpuinfo/wrappers/x86/topology.c", + "cpuinfo/wrappers/x86/uarch.c", + "cpuinfo/wrappers/x86/vendor.c", + "cpuinfo/wrappers/x86/windows/init.c", + ], + include_directories = ["cpuinfo/src"], + public_include_directories = ["cpuinfo/include"], + raw_headers = glob([ + "cpuinfo/src/**/*.h", + "cpuinfo/src/**/*.c", + ]), + preferred_linkage = "static", + preprocessor_flags = [ + "-DCPUINFO_LOG_LEVEL=2", + "-D_GNU_SOURCE=1", + ], + visibility = ["PUBLIC"], + deps = [ + ":clog", + ], +) + +cxx_library( + name = "clog", + srcs = [ + "cpuinfo/deps/clog/src/clog.c", + ], + raw_headers = glob([ + "cpuinfo/deps/clog/include/*.h", + ]), + public_include_directories = [ + "cpuinfo/deps/clog/include/", + ], + force_static = True, + visibility = ["PUBLIC"], +) + +cxx_library( + name = 
"FP16", + raw_headers = glob([ + "FP16/include/*.h", + ]), + public_include_directories = [ + "FP16/include/", + ], + force_static = True, + visibility = ["PUBLIC"], +) + + +cxx_library( + name = "miniz", + srcs = ["miniz-2.0.8/miniz.c"], + header_namespace = "", + exported_headers = {"miniz.h": "miniz-2.0.8/miniz.h"}, + exported_preprocessor_flags = [ + "-DMINIZ_DISABLE_ZIP_READER_CRC32_CHECKS", + ], + visibility = ["PUBLIC"], +) + +remote_file( + name = "typing-extensions-download", + url = "https://files.pythonhosted.org/packages/75/e1/932e06004039dd670c9d5e1df0cd606bf46e29a28e65d5bb28e894ea29c9/typing_extensions-4.2.0-py3-none-any.whl", + sha1 = "ff0849420e94f425818bff5d0f25e3cdfaba8601", + out = "typing_extensions-4.2.0-py3-none-any.whl", +) + +prebuilt_python_library( + name = "typing-extensions", + binary_src = ":typing-extensions-download", + visibility = ["PUBLIC"], + deps = [":typing-extensions-download"], +) + +remote_file( + name = "pyyaml-download", + url = "https://files.pythonhosted.org/packages/12/fc/a4d5a7554e0067677823f7265cb3ae22aed8a238560b5133b58cda252dad/PyYAML-6.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", + sha1 = "11aa9c5fe2d890b6a73212beadc7c8a4265ebc39", + out = "pyyaml.whl", +) + +prebuilt_python_library( + name = "pyyaml", + binary_src = ":pyyaml-download", + visibility = ["PUBLIC"], + deps = [":pyyaml-download"], +) + +cxx_library( + name = "ruy_lib", + srcs = glob( + ["ruy/**/*.cc"], + exclude = [ + "ruy/ruy/test_*.cc", + "ruy/ruy/*_test.cc", + "ruy/example/*.cc", + "ruy/ruy/profiler/test.cc", + "ruy/ruy/benchmark.cc", + ], + ), + compiler_flags = ["-Os"], + preferred_linkage = "static", + public_include_directories = ["ruy"], + raw_headers = glob(["ruy/**/*.h"]), + visibility = [ + "PUBLIC", + ], +) diff --git a/third_party/LICENSES_BUNDLED.txt b/third_party/LICENSES_BUNDLED.txt index c1c9a1783964..9b61374c0aa7 100644 --- a/third_party/LICENSES_BUNDLED.txt +++ b/third_party/LICENSES_BUNDLED.txt @@ -6,11 +6,21 @@ License: MIT Files: third_party/FP16 For details, see third_party/FP16/LICENSE +Name: FP16-source +License: MIT +Files: third_party/XNNPACK/build/FP16-source + For details, see third_party/XNNPACK/build/FP16-source/LICENSE + Name: FXdiv License: MIT Files: third_party/FXdiv For details, see third_party/FXdiv/LICENSE +Name: FXdiv-source +License: MIT +Files: third_party/XNNPACK/build/FXdiv-source + For details, see third_party/XNNPACK/build/FXdiv-source/LICENSE + Name: NNPACK License: BSD-2-Clause Files: third_party/NNPACK @@ -29,22 +39,36 @@ Files: third_party/XNNPACK Name: benchmark License: Apache-2.0 Files: third_party/benchmark, - third_party/protobuf/third_party/benchmark, + third_party/onnx/third_party/benchmark, third_party/onnx-tensorrt/third_party/onnx/third_party/benchmark, - third_party/onnx/third_party/benchmark + third_party/protobuf/third_party/benchmark For details, see third_party/benchmark/LICENSE, - third_party/protobuf/third_party/benchmark/LICENSE, + third_party/onnx/third_party/benchmark/LICENSE, third_party/onnx-tensorrt/third_party/onnx/third_party/benchmark/LICENSE, - third_party/onnx/third_party/benchmark/LICENSE + third_party/protobuf/third_party/benchmark/LICENSE + +Name: breakpad +License: BSD-3-Clause +Files: third_party/breakpad + For details, see third_party/breakpad/LICENSE Name: clog License: BSD-2-Clause -Files: third_party/cpuinfo/deps/clog, - third_party/fbgemm/third_party/cpuinfo/deps/clog, - third_party/QNNPACK/deps/clog - For details, see 
third_party/cpuinfo/deps/clog/LICENSE, - third_party/fbgemm/third_party/cpuinfo/deps/clog/LICENSE, - third_party/QNNPACK/deps/clog/LICENSE +Files: third_party/QNNPACK/deps/clog, + third_party/XNNPACK/build/clog-source/deps/clog, + third_party/XNNPACK/build/cpuinfo-source/deps/clog, + third_party/cpuinfo/deps/clog, + third_party/fbgemm/third_party/cpuinfo/deps/clog + For details, see third_party/QNNPACK/deps/clog/LICENSE, + third_party/XNNPACK/build/clog-source/deps/clog/LICENSE, + third_party/XNNPACK/build/cpuinfo-source/deps/clog/LICENSE, + third_party/cpuinfo/deps/clog/LICENSE, + third_party/fbgemm/third_party/cpuinfo/deps/clog/LICENSE + +Name: clog-source +License: BSD-2-Clause +Files: third_party/XNNPACK/build/clog-source + For details, see third_party/XNNPACK/build/clog-source/LICENSE Name: cpuinfo License: BSD-2-Clause @@ -53,6 +77,21 @@ Files: third_party/cpuinfo, For details, see third_party/cpuinfo/LICENSE, third_party/fbgemm/third_party/cpuinfo/LICENSE +Name: cpuinfo-source +License: BSD-2-Clause +Files: third_party/XNNPACK/build/cpuinfo-source + For details, see third_party/XNNPACK/build/cpuinfo-source/LICENSE + +Name: cudnn_frontend +License: MIT +Files: third_party/cudnn_frontend + For details, see third_party/cudnn_frontend/LICENSE.txt + +Name: dart +License: Apache-2.0 +Files: third_party/flatbuffers/dart + For details, see third_party/flatbuffers/dart/LICENSE + Name: eigen License: BSD-3-Clause Files: third_party/eigen @@ -68,12 +107,17 @@ License: BSD-3-Clause Files: third_party/fbgemm For details, see third_party/fbgemm/LICENSE +Name: flatbuffers +License: Apache-2.0 +Files: third_party/flatbuffers + For details, see third_party/flatbuffers/LICENSE.txt + Name: fmt License: MIT with exception -Files: third_party/kineto/libkineto/third_party/fmt, - third_party/fmt - For details, see third_party/kineto/libkineto/third_party/fmt/LICENSE.rst, - third_party/fmt/LICENSE.rst +Files: third_party/fmt, + third_party/kineto/libkineto/third_party/fmt + For details, see third_party/fmt/LICENSE.rst, + third_party/kineto/libkineto/third_party/fmt/LICENSE.rst Name: foxi License: MIT @@ -87,14 +131,18 @@ Files: third_party/gemmlowp/gemmlowp Name: generator License: Apache-2.0 -Files: third_party/kineto/libkineto/third_party/googletest/googlemock/scripts/generator, - third_party/googletest/googlemock/scripts/generator, +Files: third_party/XNNPACK/build/googletest-source/googlemock/scripts/generator, + third_party/benchmark/build/third_party/googletest/src/googlemock/scripts/generator, third_party/fbgemm/third_party/googletest/googlemock/scripts/generator, + third_party/googletest/googlemock/scripts/generator, + third_party/kineto/libkineto/third_party/googletest/googlemock/scripts/generator, third_party/protobuf/third_party/googletest/googlemock/scripts/generator, third_party/tensorpipe/third_party/googletest/googlemock/scripts/generator - For details, see third_party/kineto/libkineto/third_party/googletest/googlemock/scripts/generator/LICENSE, - third_party/googletest/googlemock/scripts/generator/LICENSE, + For details, see third_party/XNNPACK/build/googletest-source/googlemock/scripts/generator/LICENSE, + third_party/benchmark/build/third_party/googletest/src/googlemock/scripts/generator/LICENSE, third_party/fbgemm/third_party/googletest/googlemock/scripts/generator/LICENSE, + third_party/googletest/googlemock/scripts/generator/LICENSE, + third_party/kineto/libkineto/third_party/googletest/googlemock/scripts/generator/LICENSE, 
third_party/protobuf/third_party/googletest/googlemock/scripts/generator/LICENSE, third_party/tensorpipe/third_party/googletest/googlemock/scripts/generator/LICENSE @@ -103,46 +151,58 @@ License: BSD-3-Clause Files: third_party/gloo For details, see third_party/gloo/LICENSE +Name: googlebenchmark-source +License: Apache-2.0 +Files: third_party/XNNPACK/build/googlebenchmark-source + For details, see third_party/XNNPACK/build/googlebenchmark-source/LICENSE + Name: googlemock License: BSD-3-Clause -Files: third_party/kineto/libkineto/third_party/googletest/googlemock, - third_party/googletest/googlemock, +Files: third_party/XNNPACK/build/googletest-source/googlemock, third_party/fbgemm/third_party/googletest/googlemock, + third_party/kineto/libkineto/third_party/googletest/googlemock, third_party/protobuf/third_party/googletest/googlemock, third_party/tensorpipe/third_party/googletest/googlemock - For details, see third_party/kineto/libkineto/third_party/googletest/googlemock/LICENSE, - third_party/googletest/googlemock/LICENSE, + For details, see third_party/XNNPACK/build/googletest-source/googlemock/LICENSE, third_party/fbgemm/third_party/googletest/googlemock/LICENSE, + third_party/kineto/libkineto/third_party/googletest/googlemock/LICENSE, third_party/protobuf/third_party/googletest/googlemock/LICENSE, third_party/tensorpipe/third_party/googletest/googlemock/LICENSE Name: googletest License: BSD-3-Clause -Files: third_party/kineto/libkineto/third_party/googletest, - third_party/kineto/libkineto/third_party/googletest/googletest, - third_party/googletest, - third_party/googletest/googletest, +Files: third_party/XNNPACK/build/googletest-source/googletest, third_party/fbgemm/third_party/googletest, third_party/fbgemm/third_party/googletest/googletest, + third_party/googletest, + third_party/kineto/libkineto/third_party/googletest, + third_party/kineto/libkineto/third_party/googletest/googletest, third_party/protobuf/third_party/googletest, third_party/protobuf/third_party/googletest/googletest, third_party/tensorpipe/third_party/googletest, third_party/tensorpipe/third_party/googletest/googletest - For details, see third_party/kineto/libkineto/third_party/googletest/LICENSE, - third_party/kineto/libkineto/third_party/googletest/googletest/LICENSE, - third_party/googletest/LICENSE, - third_party/googletest/googletest/LICENSE, + For details, see third_party/XNNPACK/build/googletest-source/googletest/LICENSE, third_party/fbgemm/third_party/googletest/LICENSE, third_party/fbgemm/third_party/googletest/googletest/LICENSE, + third_party/googletest/LICENSE, + third_party/kineto/libkineto/third_party/googletest/LICENSE, + third_party/kineto/libkineto/third_party/googletest/googletest/LICENSE, third_party/protobuf/third_party/googletest/LICENSE, third_party/protobuf/third_party/googletest/googletest/LICENSE, third_party/tensorpipe/third_party/googletest/LICENSE, third_party/tensorpipe/third_party/googletest/googletest/LICENSE +Name: googletest-source +License: BSD-3-Clause +Files: third_party/XNNPACK/build/googletest-source + For details, see third_party/XNNPACK/build/googletest-source/LICENSE + Name: gtest License: BSD-3-Clause -Files: third_party/ideep/mkl-dnn/tests/gtests/gtest - For details, see third_party/ideep/mkl-dnn/tests/gtests/gtest/LICENSE +Files: third_party/ideep/mkl-dnn/tests/gtest, + third_party/ideep/mkl-dnn/third_party/oneDNN/tests/gtests/gtest + For details, see third_party/ideep/mkl-dnn/tests/gtest/LICENSE, + third_party/ideep/mkl-dnn/third_party/oneDNN/tests/gtests/gtest/LICENSE 
Name: ideep License: MIT @@ -154,11 +214,21 @@ License: BSD-3-Clause Files: third_party/ios-cmake For details, see third_party/ios-cmake/LICENSE +Name: json +License: MIT +Files: third_party/cudnn_frontend/include/contrib/nlohmann/json + For details, see third_party/cudnn_frontend/include/contrib/nlohmann/json/LICENSE.txt + Name: kineto License: BSD-3-Clause Files: third_party/kineto For details, see third_party/kineto/LICENSE +Name: libdisasm +License: Clarified Artistic License +Files: third_party/breakpad/src/third_party/libdisasm + For details, see third_party/breakpad/src/third_party/libdisasm/LICENSE + Name: libnop License: Apache-2.0 Files: third_party/tensorpipe/third_party/libnop @@ -169,6 +239,11 @@ License: MIT Files: third_party/tensorpipe/third_party/libuv For details, see third_party/tensorpipe/third_party/libuv/LICENSE +Name: lss +License: BSD-3-Clause +Files: third_party/breakpad/src/third_party/lss + For details, see third_party/breakpad/src/third_party/lss/LICENSE + Name: miniz-2.0.8 License: MIT Files: third_party/miniz-2.0.8 @@ -189,12 +264,20 @@ License: BSD-Source-Code Files: third_party/neon2sse For details, see third_party/neon2sse/LICENSE +Name: oneDNN +License: Apache-2.0 +Files: third_party/ideep/mkl-dnn/third_party/oneDNN + For details, see third_party/ideep/mkl-dnn/third_party/oneDNN/LICENSE + +Name: onnx +License: Apache-2.0 +Files: third_party/onnx + For details, see third_party/onnx/LICENSE + Name: onnx License: MIT -Files: third_party/onnx-tensorrt/third_party/onnx, - third_party/onnx - For details, see third_party/onnx-tensorrt/third_party/onnx/LICENSE, - third_party/onnx/LICENSE +Files: third_party/onnx-tensorrt/third_party/onnx + For details, see third_party/onnx-tensorrt/third_party/onnx/LICENSE Name: onnx-tensorrt License: MIT @@ -208,23 +291,30 @@ Files: third_party/protobuf Name: psimd License: MIT -Files: third_party/psimd - For details, see third_party/psimd/LICENSE +Files: third_party/XNNPACK/deps/psimd, + third_party/psimd + For details, see third_party/XNNPACK/deps/psimd/LICENSE, + third_party/psimd/LICENSE Name: pthreadpool License: BSD-2-Clause Files: third_party/pthreadpool For details, see third_party/pthreadpool/LICENSE +Name: pthreadpool-source +License: BSD-2-Clause +Files: third_party/XNNPACK/build/pthreadpool-source + For details, see third_party/XNNPACK/build/pthreadpool-source/LICENSE + Name: pybind11 License: BSD-3-Clause -Files: third_party/pybind11, +Files: third_party/onnx/third_party/pybind11, third_party/onnx-tensorrt/third_party/onnx/third_party/pybind11, - third_party/onnx/third_party/pybind11, + third_party/pybind11, third_party/tensorpipe/third_party/pybind11 - For details, see third_party/pybind11/LICENSE, + For details, see third_party/onnx/third_party/pybind11/LICENSE, third_party/onnx-tensorrt/third_party/onnx/third_party/pybind11/LICENSE, - third_party/onnx/third_party/pybind11/LICENSE, + third_party/pybind11/LICENSE, third_party/tensorpipe/third_party/pybind11/LICENSE Name: python-peachpy @@ -242,6 +332,21 @@ License: BSL-1.0 Files: third_party/sleef For details, see third_party/sleef/LICENSE.txt +Name: src +License: BSD-3-Clause +Files: third_party/benchmark/build/third_party/googletest/src + For details, see third_party/benchmark/build/third_party/googletest/src/LICENSE + +Name: swift +License: Apache-2.0 +Files: third_party/flatbuffers/swift + For details, see third_party/flatbuffers/swift/LICENSE + +Name: tb_plugin +License: BSD-3-Clause +Files: third_party/kineto/tb_plugin + For details, see 
third_party/kineto/tb_plugin/LICENSE + Name: tbb License: Apache-2.0 Files: third_party/tbb diff --git a/third_party/XNNPACK b/third_party/XNNPACK index 79cd5f9e18ad..ae108ef49aa5 160000 --- a/third_party/XNNPACK +++ b/third_party/XNNPACK @@ -1 +1 @@ -Subproject commit 79cd5f9e18ad0925ac9a050b00ea5a36230072db +Subproject commit ae108ef49aa5623b896fc93d4298c49d1750d9ba diff --git a/third_party/breakpad b/third_party/breakpad deleted file mode 160000 index 7d188f679d4a..000000000000 --- a/third_party/breakpad +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 7d188f679d4ae0a5bd06408a3047d69ef8eef848 diff --git a/third_party/build_bundled.py b/third_party/build_bundled.py index 0e1da44565ed..c05e1c3642fe 100644 --- a/third_party/build_bundled.py +++ b/third_party/build_bundled.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 +import argparse import os @@ -52,7 +53,7 @@ def create_bundled(d, outstream): outstream.write(f"Files: {files}\n") outstream.write(' For details, see ') outstream.write(license_file) - outstream.write('\n\n') + outstream.write('\n\n') def identify_license(f, exception=''): @@ -89,6 +90,8 @@ def squeeze(t): elif 'BoostSoftwareLicense-Version1.0' in txt: # Hmm, do we need to check the text? return 'BSL-1.0' + elif squeeze("Clarified Artistic License") in txt: + return 'Clarified Artistic License' elif all([squeeze(m) in txt.lower() for m in bsd3_txt]): return 'BSD-3-Clause' elif all([squeeze(m) in txt.lower() for m in bsd3_v1_txt]): @@ -97,30 +100,30 @@ def squeeze(t): return 'BSD-2-Clause' elif all([squeeze(m) in txt.lower() for m in bsd3_src_txt]): return 'BSD-Source-Code' - elif all([squeeze(m) in txt.lower() for m in mit_txt]): + elif any([squeeze(m) in txt.lower() for m in mit_txt]): return 'MIT' else: raise ValueError('unknown license') -mit_txt = ['permission is hereby granted, free of charge, to any person ' - 'obtaining a copy of this software and associated documentation ' - 'files (the "software"), to deal in the software without ' - 'restriction, including without limitation the rights to use, copy, ' - 'modify, merge, publish, distribute, sublicense, and/or sell copies ' - 'of the software, and to permit persons to whom the software is ' +mit_txt = ['permission is hereby granted, free of charge, to any person ', + 'obtaining a copy of this software and associated documentation ', + 'files (the "software"), to deal in the software without ', + 'restriction, including without limitation the rights to use, copy, ', + 'modify, merge, publish, distribute, sublicense, and/or sell copies ', + 'of the software, and to permit persons to whom the software is ', 'furnished to do so, subject to the following conditions:', - 'the above copyright notice and this permission notice shall be ' + 'the above copyright notice and this permission notice shall be ', 'included in all copies or substantial portions of the software.', - 'the software is provided "as is", without warranty of any kind, ' - 'express or implied, including but not limited to the warranties of ' - 'merchantability, fitness for a particular purpose and ' - 'noninfringement. in no event shall the authors or copyright holders ' - 'be liable for any claim, damages or other liability, whether in an ' - 'action of contract, tort or otherwise, arising from, out of or in ' - 'connection with the software or the use or other dealings in the ' - 'software.' 
+ 'the software is provided "as is", without warranty of any kind, ', + 'express or implied, including but not limited to the warranties of ', + 'merchantability, fitness for a particular purpose and ', + 'noninfringement. in no event shall the authors or copyright holders ', + 'be liable for any claim, damages or other liability, whether in an ', + 'action of contract, tort or otherwise, arising from, out of or in ', + 'connection with the software or the use or other dealings in the ', + 'software.', ] bsd3_txt = ['redistribution and use in source and binary forms, with or without ' @@ -154,6 +157,21 @@ def squeeze(t): if __name__ == '__main__': third_party = os.path.join(mydir) - fname = os.path.join(third_party, 'LICENSES_BUNDLED.txt') + parser = argparse.ArgumentParser( + description="Generate bundled licenses file", + ) + parser.add_argument( + "--out-file", + type=str, + default=os.environ.get( + "PYTORCH_THIRD_PARTY_BUNDLED_LICENSE_FILE", + str(os.path.join(third_party, 'LICENSES_BUNDLED.txt')) + ), + help="location to output new bundled licenses file", + ) + + args = parser.parse_args() + fname = args.out_file + print(f"+ Writing bundled licenses to {args.out_file}") with open(fname, 'w') as fid: create_bundled(third_party, fid) diff --git a/third_party/cudnn_frontend b/third_party/cudnn_frontend index 51e60d891b68..43709ab96c47 160000 --- a/third_party/cudnn_frontend +++ b/third_party/cudnn_frontend @@ -1 +1 @@ -Subproject commit 51e60d891b689d618e7a623509a779c422a420f7 +Subproject commit 43709ab96c47e26eebcdac72f93f946d44ceffa8 diff --git a/third_party/eigen b/third_party/eigen index d41dc4dd74ac..3147391d946b 160000 --- a/third_party/eigen +++ b/third_party/eigen @@ -1 +1 @@ -Subproject commit d41dc4dd74acce21fb210e7625d5d135751fa9e5 +Subproject commit 3147391d946bb4b6c68edd901f2add6ac1f31f8c diff --git a/third_party/fbgemm b/third_party/fbgemm index e385d0267a9c..2e9be6581010 160000 --- a/third_party/fbgemm +++ b/third_party/fbgemm @@ -1 +1 @@ -Subproject commit e385d0267a9cc6235ee19d4689930e32fe693b89 +Subproject commit 2e9be65810107a9595da717f95d21924b73be833 diff --git a/third_party/generate-cpuinfo-wrappers.py b/third_party/generate-cpuinfo-wrappers.py new file mode 100644 index 000000000000..825a6bd228a2 --- /dev/null +++ b/third_party/generate-cpuinfo-wrappers.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 + +from __future__ import print_function +import os + + +CPUINFO_SOURCES = { + None: [ + "init.c", + "api.c", + "cache.c", + ], + "defined(__linux__)": [ + "linux/multiline.c", + "linux/cpulist.c", + "linux/mockfile.c", + "linux/smallfile.c", + "linux/processors.c", + ], + "defined(__MACH__) && defined(__APPLE__)": [ + "mach/topology.c", + ], + "defined(__i386__) || defined(__i686__) || defined(__x86_64__) || defined(_WIN32)": [ + "x86/cache/init.c", + "x86/cache/deterministic.c", + "x86/cache/descriptor.c", + "x86/info.c", + "x86/mockcpuid.c", + "x86/isa.c", + "x86/topology.c", + "x86/name.c", + "x86/init.c", + "x86/uarch.c", + "x86/vendor.c", + ], + "(defined(__i386__) || defined(__i686__) || defined(__x86_64__)) && defined(__linux__)": [ + "x86/linux/init.c", + "x86/linux/cpuinfo.c", + ], + "(defined(__i386__) || defined(__i686__) || defined(__x86_64__)) && defined(__MACH__) && defined(__APPLE__)": [ + "x86/mach/init.c", + ], + "defined(_WIN32)": [ + "x86/windows/init.c", + ], + "(defined(__arm__) || defined(__aarch64__)) && defined(__linux__)": [ + "arm/linux/cpuinfo.c", + "arm/linux/hwcap.c", + "arm/linux/init.c", + "arm/linux/clusters.c", + "arm/linux/midr.c", + 
"arm/linux/chipset.c", + "arm/tlb.c", + "arm/uarch.c", + "arm/cache.c", + ], + "defined(__arm__) && defined(__linux__)": [ + "arm/linux/aarch32-isa.c", + ], + "defined(__aarch64__) && defined(__linux__)": [ + "arm/linux/aarch64-isa.c", + ], + "(defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)": [ + "arm/android/properties.c", + ], + "(defined(__arm__) || defined(__aarch64__)) && defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE": [ + "arm/mach/init.c", + ], +} + + +if __name__ == "__main__": + for condition, filenames in CPUINFO_SOURCES.items(): + for filename in filenames: + filepath = os.path.join("cpuinfo/wrappers", filename) + if not os.path.exists(os.path.dirname(filepath)): + print(filepath) + os.makedirs(os.path.dirname(filepath)) + with open(filepath, "w") as wrapper: + print("/* Auto-generated by generate-wrappers.py script. Do not modify */", file=wrapper) + print(file=wrapper) + print("#ifdef __APPLE__", file=wrapper) + print("\t#include ", file=wrapper) + print("#endif /* __APPLE__ */", file=wrapper) + print(file=wrapper) + + if not condition: + print("#include <%s>" % filename, file=wrapper) + else: + # Include source file only if condition is satisfied + print("#if %s" % condition, file=wrapper) + print("#include <%s>" % filename, file=wrapper) + print("#endif /* %s */" % condition, file=wrapper) diff --git a/third_party/generate-xnnpack-wrappers.py b/third_party/generate-xnnpack-wrappers.py new file mode 100644 index 000000000000..23992645672a --- /dev/null +++ b/third_party/generate-xnnpack-wrappers.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 + +from __future__ import print_function +import collections +import os + +BANNER = "Auto-generated by generate-wrappers.py script. Do not modify" +WRAPPER_SRC_NAMES = { + "PROD_SCALAR_PORTABLE_MICROKERNEL_SRCS": None, + "PROD_SCALAR_AARCH32_MICROKERNEL_SRCS" : "defined(__arm__)", + "PROD_NEON_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)", + "PROD_NEONFP16_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)", + "PROD_NEONFMA_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)", + "PROD_AARCH64_NEON_MICROKERNEL_SRCS": "defined(__aarch64__)", + "PROD_NEONV8_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)", + "PROD_AARCH64_NEONFP16ARITH_MICROKERNEL_SRCS": "defined(__aarch64__)", + "PROD_NEONDOT_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)", + "PROD_SSE_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", + "PROD_SSE2_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", + "PROD_SSSE3_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", + "PROD_SSE41_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", + "PROD_AVX_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", + "PROD_F16C_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", + "PROD_XOP_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", + "PROD_FMA3_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", + "PROD_AVX2_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", + "PROD_AVX512F_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", + "PROD_AVX512SKX_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", + "AARCH32_ASM_MICROKERNEL_SRCS": "defined(__arm__)", + 
"AARCH64_ASM_MICROKERNEL_SRCS": "defined(__aarch64__)", +} + +SRC_NAMES = [ + "OPERATOR_SRCS", + "SUBGRAPH_SRCS", + "LOGGING_SRCS", + "HOT_SRCS", + "TABLE_SRCS", + "JIT_SRCS", + "JIT_AARCH32_SRCS", + "JIT_AARCH64_SRCS", + "PROD_SCALAR_PORTABLE_MICROKERNEL_SRCS", + "PROD_SSE_MICROKERNEL_SRCS", + "PROD_SSE2_MICROKERNEL_SRCS", + "PROD_SSSE3_MICROKERNEL_SRCS", + "PROD_SSE41_MICROKERNEL_SRCS", + "PROD_AVX_MICROKERNEL_SRCS", + "PROD_F16C_MICROKERNEL_SRCS", + "PROD_XOP_MICROKERNEL_SRCS", + "PROD_FMA3_MICROKERNEL_SRCS", + "PROD_AVX2_MICROKERNEL_SRCS", + "PROD_AVX512F_MICROKERNEL_SRCS", + "PROD_AVX512SKX_MICROKERNEL_SRCS", +] + +def update_sources(): + sources = collections.defaultdict(list) + with open("./XNNPACK/CMakeLists.txt") as cmake: + lines = cmake.readlines() + i = 0 + while i < len(lines): + line = lines[i] + if line.startswith("SET") and line.split('(')[1].strip(' \t\n\r') in set(WRAPPER_SRC_NAMES.keys()) | set(SRC_NAMES): + name = line.split('(')[1].strip(' \t\n\r') + i += 1 + while i < len(lines) and len(lines[i]) > 0 and ')' not in lines[i]: + # remove "src/" at the beginning, remove whitespaces and newline + value = lines[i].strip(' \t\n\r') + sources[name].append(value[4:]) + i += 1 + if i < len(lines) and len(lines[i]) > 4: + # remove "src/" at the beginning, possibly ')' at the end + value = lines[i].strip(' \t\n\r)') + sources[name].append(value[4:]) + else: + i += 1 + print(sources) + return sources + +if __name__ == "__main__": + xnnpack_sources = collections.defaultdict(list) + sources = update_sources() + for name in WRAPPER_SRC_NAMES: + xnnpack_sources[WRAPPER_SRC_NAMES[name]].extend(sources[name]) + for condition, filenames in xnnpack_sources.items(): + for filename in filenames: + filepath = os.path.join("XNNPACK/wrappers", filename) + if not os.path.isdir(os.path.dirname(filepath)): + os.makedirs(os.path.dirname(filepath)) + with open(filepath, "w") as wrapper: + print("/* {} */".format(BANNER), file=wrapper) + print(file=wrapper) + + # Architecture- or platform-dependent preprocessor flags can be + # defined here. Note: platform_preprocessor_flags can't be used + # because they are ignored by arc focus & buck project. 
+ + if condition is None: + print("#include <%s>" % filename, file=wrapper) + else: + # Include source file only if condition is satisfied + print("#if %s" % condition, file=wrapper) + print("#include <%s>" % filename, file=wrapper) + print("#endif /* %s */" % condition, file=wrapper) diff --git a/third_party/glog.buck.bzl b/third_party/glog.buck.bzl new file mode 100644 index 000000000000..88e32ae02cd3 --- /dev/null +++ b/third_party/glog.buck.bzl @@ -0,0 +1,97 @@ +GLOG_CONFIG_HEADERS = [ + "vlog_is_on.h", + "stl_logging.h", + "raw_logging.h", + "logging.h", +] + +GLOG_SED_COMMAND = " ".join([ + "sed", + "-e 's/@ac_cv_cxx_using_operator@/1/g'", + "-e 's/@ac_cv_have_unistd_h@/1/g'", + "-e 's/@ac_cv_have_stdint_h@/1/g'", + "-e 's/@ac_cv_have_systypes_h@/1/g'", + "-e 's/@ac_cv_have_libgflags@/0/g'", + "-e 's/@ac_cv_have_uint16_t@/1/g'", + "-e 's/@ac_cv_have___builtin_expect@/1/g'", + "-e 's/@ac_cv_have_.*@/0/g'", + "-e 's/@ac_google_start_namespace@/namespace google {/g'", + "-e 's/@ac_google_end_namespace@/}/g'", + "-e 's/@ac_google_namespace@/google/g'", + "-e 's/@ac_cv___attribute___noinline@/__attribute__((noinline))/g'", + "-e 's/@ac_cv___attribute___noreturn@/__attribute__((noreturn))/g'", + "-e 's/@ac_cv___attribute___printf_4_5@/__attribute__((__format__ (__printf__, 4, 5)))/g'", +]) + +def define_glog(): + cxx_library( + name = "glog", + srcs = [ + "glog/src/demangle.cc", + "glog/src/vlog_is_on.cc", + "glog/src/symbolize.cc", + "glog/src/raw_logging.cc", + "glog/src/logging.cc", + "glog/src/signalhandler.cc", + "glog/src/utilities.cc", + ], + exported_headers = [":glog_{}".format(header) for header in GLOG_CONFIG_HEADERS], + header_namespace = "glog", + compiler_flags = [ + "-Wno-sign-compare", + "-Wno-unused-function", + "-Wno-unused-local-typedefs", + "-Wno-unused-variable", + "-Wno-deprecated-declarations", + ], + preferred_linkage = "static", + exported_linker_flags = [], + exported_preprocessor_flags = [ + "-DGLOG_NO_ABBREVIATED_SEVERITIES", + "-DGLOG_STL_LOGGING_FOR_UNORDERED", + "-DGOOGLE_GLOG_DLL_DECL=", + "-DGOOGLE_NAMESPACE=google", + # this is required for buck build + "-DGLOG_BAZEL_BUILD", + "-DHAVE_PTHREAD", + # Allows src/logging.cc to determine the host name. + "-DHAVE_SYS_UTSNAME_H", + # For src/utilities.cc. + "-DHAVE_SYS_SYSCALL_H", + "-DHAVE_SYS_TIME_H", + "-DHAVE_STDINT_H", + "-DHAVE_STRING_H", + # Enable dumping stacktrace upon sigaction. + "-DHAVE_SIGACTION", + # For logging.cc. 
+ "-DHAVE_PREAD", + "-DHAVE___ATTRIBUTE__", + ], + deps = [":glog_config"], + soname = "libglog.$(ext)", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "glog_config", + header_namespace = "", + exported_headers = { + "config.h": ":glog_config.h", + "glog/log_severity.h": "glog/src/glog/log_severity.h", + }, + ) + + genrule( + name = "glog_config.h", + srcs = ["glog/src/config.h.cmake.in"], + out = "config.h", + cmd = "awk '{ gsub(/^#cmakedefine/, \"//cmakedefine\"); print; }' $SRCS > $OUT", + ) + + for header in GLOG_CONFIG_HEADERS: + genrule( + name = "glog_{}".format(header), + out = header, + srcs = ["glog/src/glog/{}.in".format(header)], + cmd = "{} $SRCS > $OUT".format(GLOG_SED_COMMAND), + ) diff --git a/third_party/ideep b/third_party/ideep index 4a56ab2c3f61..02b17c5748c9 160000 --- a/third_party/ideep +++ b/third_party/ideep @@ -1 +1 @@ -Subproject commit 4a56ab2c3f61c44e0f8ea241beeb732b7d70dc5b +Subproject commit 02b17c5748c9349dcc586c359af800c684d9b1ab diff --git a/third_party/kineto b/third_party/kineto index b5bb62d25be7..b2b48c00c6e5 160000 --- a/third_party/kineto +++ b/third_party/kineto @@ -1 +1 @@ -Subproject commit b5bb62d25be75c381dbbd975276602f021982ef2 +Subproject commit b2b48c00c6e5bd8e807e2231adb229db6a1d1c22 diff --git a/third_party/mkl-dnn.BUILD b/third_party/mkl-dnn.BUILD index 4ffe7a578fbf..1d40b1c5feda 100644 --- a/third_party/mkl-dnn.BUILD +++ b/third_party/mkl-dnn.BUILD @@ -10,6 +10,7 @@ _DNNL_RUNTIME_OMP = { "#cmakedefine DNNL_WITH_LEVEL_ZERO": "/* #undef DNNL_WITH_LEVEL_ZERO */", "#cmakedefine DNNL_SYCL_CUDA": "/* #undef DNNL_SYCL_CUDA */", "#cmakedefine DNNL_ENABLE_STACK_CHECKER": "#undef DNNL_ENABLE_STACK_CHECKER", + "#cmakedefine DNNL_EXPERIMENTAL": "#undef DNNL_EXPERIMENTAL", "#cmakedefine01 BUILD_TRAINING": "#define BUILD_TRAINING 1", "#cmakedefine01 BUILD_INFERENCE": "#define BUILD_INFERENCE 0", "#cmakedefine01 BUILD_PRIMITIVE_ALL": "#define BUILD_PRIMITIVE_ALL 1", @@ -37,6 +38,13 @@ _DNNL_RUNTIME_OMP = { "#cmakedefine01 BUILD_AVX2": "#define BUILD_AVX2 0", "#cmakedefine01 BUILD_AVX512": "#define BUILD_AVX512 0", "#cmakedefine01 BUILD_AMX": "#define BUILD_AMX 0", + "#cmakedefine01 BUILD_PRIMITIVE_GPU_ISA_ALL": "#define BUILD_PRIMITIVE_GPU_ISA_ALL 1", + "#cmakedefine01 BUILD_GEN9": "#define BUILD_GEN9 0", + "#cmakedefine01 BUILD_GEN11": "#define BUILD_GEN11 0", + "#cmakedefine01 BUILD_XELP": "#define BUILD_XELP 0", + "#cmakedefine01 BUILD_XEHPG": "#define BUILD_XEHPG 0", + "#cmakedefine01 BUILD_XEHPC": "#define BUILD_XEHPC 0", + "#cmakedefine01 BUILD_XEHP": "#define BUILD_XEHP 0", } template_rule( @@ -45,9 +53,9 @@ template_rule( out = "third_party/oneDNN/include/oneapi/dnnl/dnnl_version.h", substitutions = { "@DNNL_VERSION_MAJOR@": "2", - "@DNNL_VERSION_MINOR@": "5", - "@DNNL_VERSION_PATCH@": "2", - "@DNNL_VERSION_HASH@": "a9302535553c73243c632ad3c4c80beec3d19a1e", + "@DNNL_VERSION_MINOR@": "6", + "@DNNL_VERSION_PATCH@": "0", + "@DNNL_VERSION_HASH@": "52b5f107dd9cf10910aaa19cb47f3abf9b349815", }, ) diff --git a/third_party/onnx b/third_party/onnx index 85546f8c44e6..96046b8ccfb8 160000 --- a/third_party/onnx +++ b/third_party/onnx @@ -1 +1 @@ -Subproject commit 85546f8c44e627f8ff1181725d03cc49f675e44f +Subproject commit 96046b8ccfb8e6fa82f6b2b34b3d56add2e8849c diff --git a/third_party/xnnpack.buck.bzl b/third_party/xnnpack.buck.bzl new file mode 100644 index 000000000000..549c70e03953 --- /dev/null +++ b/third_party/xnnpack.buck.bzl @@ -0,0 +1,586 @@ +load("//tools/build_defs:glob_defs.bzl", "subdir_glob") + +def define_xnnpack(): + 
cxx_library( + name = "XNNPACK", + srcs = ["XNNPACK/src/allocator.c", "XNNPACK/src/init.c", "XNNPACK/src/memory-planner.c", "XNNPACK/src/operator-delete.c", "XNNPACK/src/runtime.c", "XNNPACK/src/subgraph.c", "XNNPACK/src/tensor.c", "XNNPACK/src/datatype-strings.c", "XNNPACK/src/operator-strings.c", "XNNPACK/src/subgraph-strings.c"], + deps = [":operators", ":subgraph", ":tables", ":ukernels_scalar", "//third_party:cpuinfo", "//third_party:pthreadpool", "//third_party:pthreadpool_header", ":arm_lib", ":x86_and_x86_64_lib"], + exported_deps = [], + compiler_flags = ["-w"], + preferred_linkage = "static", + exported_headers = {"xnnpack.h": "XNNPACK/include/xnnpack.h"}, + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0", "-DXNN_NO_Q8_OPERATORS", "-DXNN_NO_F16_OPERATORS", "-DXNN_NO_NCHW_OPERATORS", "-DXNN_NO_QU8_OPERATORS", "-DXNN_NO_S8_OPERATORS", "-DXNN_NO_U8_OPERATORS", "-DXNN_NO_VCVT_OPERATORS", "-DXNN_NO_X32_OPERATORS", "-DXNN_NO_X8_OPERATORS", "-DXNN_NO_XX_OPERATORS"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_scalar", + srcs = ["XNNPACK/wrappers/params-init.c", "XNNPACK/wrappers/u8-lut32norm/scalar.c", "XNNPACK/wrappers/xx-copy/memcpy.c", "XNNPACK/wrappers/x8-lut/gen/lut-scalar-x4.c", "XNNPACK/wrappers/x32-depthtospace2d-chw2hwc/scalar.c"], + deps = [":interface", "//third_party:FP16", "//third_party:FXdiv"], + exported_deps = [], + compiler_flags = ["-w", "-O2"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "operators", + srcs = ["XNNPACK/src/operators/argmax-pooling-nhwc.c", "XNNPACK/src/operators/average-pooling-nhwc.c", "XNNPACK/src/operators/binary-elementwise-nd.c", "XNNPACK/src/operators/channel-shuffle-nc.c", "XNNPACK/src/operators/constant-pad-nd.c", "XNNPACK/src/operators/convolution-nchw.c", "XNNPACK/src/operators/convolution-nhwc.c", "XNNPACK/src/operators/deconvolution-nhwc.c", "XNNPACK/src/operators/depth-to-space-nchw2nhwc.c", "XNNPACK/src/operators/depth-to-space-nhwc.c", "XNNPACK/src/operators/fully-connected-nc.c", "XNNPACK/src/operators/global-average-pooling-ncw.c", "XNNPACK/src/operators/global-average-pooling-nwc.c", "XNNPACK/src/operators/lut-elementwise-nc.c", "XNNPACK/src/operators/max-pooling-nhwc.c", "XNNPACK/src/operators/prelu-nc.c", "XNNPACK/src/operators/resize-bilinear-nchw.c", "XNNPACK/src/operators/resize-bilinear-nhwc.c", "XNNPACK/src/operators/softmax-nc.c", "XNNPACK/src/operators/unary-elementwise-nc.c", "XNNPACK/src/operators/unpooling-nhwc.c", "XNNPACK/src/indirection.c", 
"XNNPACK/src/operator-run.c", "XNNPACK/src/packing.c"], + deps = [":interface", "//third_party:cpuinfo", "//third_party:FP16", "//third_party:FXdiv", "//third_party:clog"], + exported_deps = [], + compiler_flags = ["-w", "-Os"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "arm_lib", + srcs = [], + deps = [":jit_memory", ":ukernels_asm_aarch32", ":ukernels_asm_aarch64", ":ukernels_neon", ":ukernels_neon_aarch64", ":ukernels_neon_dot", ":ukernels_neon_fma", ":ukernels_neon_fp16", ":ukernels_neon_fp16arith_aarch64", ":ukernels_neon_v8", ":ukernels_scalar_aarch32"], + exported_deps = [], + compiler_flags = ["-w"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "third-party/XNNPACK", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = [], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "x86_and_x86_64_lib", + srcs = [], + deps = [":ukernels_avx", ":ukernels_avx2", ":ukernels_avx512", ":ukernels_avx512skx", ":ukernels_f16c", ":ukernels_fma3", ":ukernels_sse", ":ukernels_sse2", ":ukernels_sse41", ":ukernels_ssse3", ":ukernels_xop"], + exported_deps = [], + compiler_flags = ["-w"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "third-party/XNNPACK", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = [], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "tables", + srcs = ["XNNPACK/src/tables/exp2-k-over-64.c", "XNNPACK/src/tables/exp2-k-over-2048.c", "XNNPACK/src/tables/exp2minus-k-over-4.c", "XNNPACK/src/tables/exp2minus-k-over-8.c", "XNNPACK/src/tables/exp2minus-k-over-16.c", "XNNPACK/src/tables/exp2minus-k-over-64.c", "XNNPACK/src/tables/exp2minus-k-over-2048.c"], + deps = [":interface", "//third_party:FP16", "//third_party:FXdiv", "//third_party:clog"], + exported_deps = [], + compiler_flags = ["-w", "-O2"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", 
["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "subgraph", + srcs = ["XNNPACK/src/subgraph/abs.c", "XNNPACK/src/subgraph/add2.c", "XNNPACK/src/subgraph/argmax-pooling-2d.c", "XNNPACK/src/subgraph/average-pooling-2d.c", "XNNPACK/src/subgraph/bankers-rounding.c", "XNNPACK/src/subgraph/ceiling.c", "XNNPACK/src/subgraph/clamp.c", "XNNPACK/src/subgraph/convert.c", "XNNPACK/src/subgraph/convolution-2d.c", "XNNPACK/src/subgraph/deconvolution-2d.c", "XNNPACK/src/subgraph/depth-to-space.c", "XNNPACK/src/subgraph/depthwise-convolution-2d.c", "XNNPACK/src/subgraph/divide.c", "XNNPACK/src/subgraph/elu.c", "XNNPACK/src/subgraph/floor.c", "XNNPACK/src/subgraph/fully-connected.c", "XNNPACK/src/subgraph/global-average-pooling-2d.c", "XNNPACK/src/subgraph/hardswish.c", "XNNPACK/src/subgraph/leaky-relu.c", "XNNPACK/src/subgraph/max-pooling-2d.c", "XNNPACK/src/subgraph/maximum2.c", "XNNPACK/src/subgraph/minimum2.c", "XNNPACK/src/subgraph/multiply2.c", "XNNPACK/src/subgraph/negate.c", "XNNPACK/src/subgraph/prelu.c", "XNNPACK/src/subgraph/sigmoid.c", "XNNPACK/src/subgraph/softmax.c", "XNNPACK/src/subgraph/square-root.c", "XNNPACK/src/subgraph/square.c", "XNNPACK/src/subgraph/squared-difference.c", "XNNPACK/src/subgraph/static-constant-pad.c", "XNNPACK/src/subgraph/static-reshape.c", "XNNPACK/src/subgraph/static-resize-bilinear-2d.c", "XNNPACK/src/subgraph/subtract.c", "XNNPACK/src/subgraph/unpooling-2d.c"], + deps = [":interface", "//third_party:FP16", "//third_party:FXdiv", "//third_party:clog"], + exported_deps = [], + compiler_flags = ["-w", "-O2"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_avx512", + srcs = [], + deps = [":interface"], + exported_deps = [], + compiler_flags = ["-w", "-O2", "-mavx512f"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["x86", ["-mavx512f"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + platform_srcs = [["x86|x86_64|platform009", ["XNNPACK/wrappers/f32-dwconv/gen/up16x3-minmax-avx512f.c", "XNNPACK/wrappers/f32-dwconv/gen/up16x4-minmax-avx512f.c", "XNNPACK/wrappers/f32-dwconv/gen/up16x9-minmax-avx512f.c", "XNNPACK/wrappers/f32-dwconv/gen/up16x25-minmax-avx512f.c", "XNNPACK/wrappers/f32-gemm/gen/1x16-minmax-avx512f-broadcast.c", "XNNPACK/wrappers/f32-gemm/gen/7x16-minmax-avx512f-broadcast.c", "XNNPACK/wrappers/f32-igemm/gen/1x16-minmax-avx512f-broadcast.c", 
"XNNPACK/wrappers/f32-igemm/gen/7x16-minmax-avx512f-broadcast.c", "XNNPACK/wrappers/f32-prelu/gen/avx512f-2x16.c", "XNNPACK/wrappers/f32-vbinary/gen/vadd-minmax-avx512f-x32.c", "XNNPACK/wrappers/f32-vbinary/gen/vaddc-minmax-avx512f-x32.c", "XNNPACK/wrappers/f32-vbinary/gen/vdiv-minmax-avx512f-x32.c", "XNNPACK/wrappers/f32-vbinary/gen/vdivc-minmax-avx512f-x32.c", "XNNPACK/wrappers/f32-vbinary/gen/vmax-avx512f-x32.c", "XNNPACK/wrappers/f32-vbinary/gen/vmaxc-avx512f-x32.c", "XNNPACK/wrappers/f32-vbinary/gen/vmin-avx512f-x32.c", "XNNPACK/wrappers/f32-vbinary/gen/vminc-avx512f-x32.c", "XNNPACK/wrappers/f32-vbinary/gen/vmul-minmax-avx512f-x32.c", "XNNPACK/wrappers/f32-vbinary/gen/vmulc-minmax-avx512f-x32.c", "XNNPACK/wrappers/f32-vbinary/gen/vrdivc-minmax-avx512f-x32.c", "XNNPACK/wrappers/f32-vbinary/gen/vrsubc-minmax-avx512f-x32.c", "XNNPACK/wrappers/f32-vbinary/gen/vsqrdiff-avx512f-x32.c", "XNNPACK/wrappers/f32-vbinary/gen/vsqrdiffc-avx512f-x32.c", "XNNPACK/wrappers/f32-vbinary/gen/vsub-minmax-avx512f-x32.c", "XNNPACK/wrappers/f32-vbinary/gen/vsubc-minmax-avx512f-x32.c", "XNNPACK/wrappers/f32-vclamp/gen/vclamp-avx512f-x16.c", "XNNPACK/wrappers/f32-velu/gen/velu-avx512f-rr1-lut16-p3-perm-x64.c", "XNNPACK/wrappers/f32-vhswish/gen/vhswish-avx512f-x16.c", "XNNPACK/wrappers/f32-vlrelu/gen/vlrelu-avx512f-x16.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndd-avx512f-x16.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndne-avx512f-x16.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndu-avx512f-x16.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndz-avx512f-x16.c", "XNNPACK/wrappers/f32-vsigmoid/gen/vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-div-x64.c", "XNNPACK/wrappers/f32-vunary/gen/vabs-avx512f-x16.c", "XNNPACK/wrappers/f32-vunary/gen/vneg-avx512f-x16.c", "XNNPACK/wrappers/f32-vunary/gen/vsqr-avx512f-x16.c"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_neon_fp16arith_aarch64", + srcs = ["XNNPACK/wrappers/f16-dwconv/gen/up8x25-minmax-neonfp16arith-acc2.c", "XNNPACK/wrappers/f16-dwconv/gen/up16x3-minmax-neonfp16arith.c", "XNNPACK/wrappers/f16-dwconv/gen/up16x4-minmax-neonfp16arith.c", "XNNPACK/wrappers/f16-dwconv/gen/up16x9-minmax-neonfp16arith.c", "XNNPACK/wrappers/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c8.c", "XNNPACK/wrappers/f16-gavgpool/gen/7x-minmax-neonfp16arith-c8.c", "XNNPACK/wrappers/f16-gemm/gen/1x16-minmax-neonfp16arith-ld64.c", "XNNPACK/wrappers/f16-gemm/gen/6x16-minmax-neonfp16arith-ld64.c", "XNNPACK/wrappers/f16-ibilinear/gen/neonfp16arith-c8.c", "XNNPACK/wrappers/f16-igemm/gen/1x16-minmax-neonfp16arith-ld64.c", "XNNPACK/wrappers/f16-igemm/gen/6x16-minmax-neonfp16arith-ld64.c", "XNNPACK/wrappers/f16-maxpool/9p8x-minmax-neonfp16arith-c8.c", "XNNPACK/wrappers/f16-prelu/gen/neonfp16arith-2x16.c", "XNNPACK/wrappers/f16-vbinary/gen/vadd-minmax-neonfp16arith-x16.c", "XNNPACK/wrappers/f16-vbinary/gen/vaddc-minmax-neonfp16arith-x16.c", "XNNPACK/wrappers/f16-vbinary/gen/vmul-minmax-neonfp16arith-x16.c", "XNNPACK/wrappers/f16-vbinary/gen/vmulc-minmax-neonfp16arith-x16.c", "XNNPACK/wrappers/f16-vclamp/gen/vclamp-neonfp16arith-x16.c", "XNNPACK/wrappers/f16-vhswish/gen/vhswish-neonfp16arith-x16.c", "XNNPACK/wrappers/f16-vlrelu/gen/vlrelu-neonfp16arith-x16.c", "XNNPACK/wrappers/f16-vmulcaddc/gen/c8-minmax-neonfp16arith-2x.c"], + deps = [":interface", "//third_party:FP16"], + exported_deps = [], + compiler_flags = ["-w", "-O2"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = 
subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["(aarch64|arm64)", ["-march=armv8.2-a+fp16"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_avx", + srcs = [], + deps = [":interface"], + exported_deps = [], + compiler_flags = ["-w", "-O2", "-mavx"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["x86", ["-mavx"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + platform_srcs = [["x86|x86_64|platform009", ["XNNPACK/wrappers/f16-f32-vcvt/gen/vcvt-avx-int16-x16.c", "XNNPACK/wrappers/f32-dwconv/gen/up8x25-minmax-avx.c", "XNNPACK/wrappers/f32-dwconv/gen/up16x3-minmax-avx.c", "XNNPACK/wrappers/f32-dwconv/gen/up16x4-minmax-avx.c", "XNNPACK/wrappers/f32-dwconv/gen/up16x9-minmax-avx.c", "XNNPACK/wrappers/f32-f16-vcvt/gen/vcvt-avx-x24.c", "XNNPACK/wrappers/f32-gemm/gen/1x16-minmax-avx-broadcast.c", "XNNPACK/wrappers/f32-gemm/gen/5x16-minmax-avx-broadcast.c", "XNNPACK/wrappers/f32-igemm/gen/1x16-minmax-avx-broadcast.c", "XNNPACK/wrappers/f32-igemm/gen/5x16-minmax-avx-broadcast.c", "XNNPACK/wrappers/f32-prelu/gen/avx-2x16.c", "XNNPACK/wrappers/f32-qs8-vcvt/gen/vcvt-avx-x32.c", "XNNPACK/wrappers/f32-qu8-vcvt/gen/vcvt-avx-x32.c", "XNNPACK/wrappers/f32-vbinary/gen/vadd-minmax-avx-x16.c", "XNNPACK/wrappers/f32-vbinary/gen/vaddc-minmax-avx-x16.c", "XNNPACK/wrappers/f32-vbinary/gen/vdiv-minmax-avx-x16.c", "XNNPACK/wrappers/f32-vbinary/gen/vdivc-minmax-avx-x16.c", "XNNPACK/wrappers/f32-vbinary/gen/vmax-avx-x16.c", "XNNPACK/wrappers/f32-vbinary/gen/vmaxc-avx-x16.c", "XNNPACK/wrappers/f32-vbinary/gen/vmin-avx-x16.c", "XNNPACK/wrappers/f32-vbinary/gen/vminc-avx-x16.c", "XNNPACK/wrappers/f32-vbinary/gen/vmul-minmax-avx-x16.c", "XNNPACK/wrappers/f32-vbinary/gen/vmulc-minmax-avx-x16.c", "XNNPACK/wrappers/f32-vbinary/gen/vrdivc-minmax-avx-x16.c", "XNNPACK/wrappers/f32-vbinary/gen/vrsubc-minmax-avx-x16.c", "XNNPACK/wrappers/f32-vbinary/gen/vsqrdiff-avx-x16.c", "XNNPACK/wrappers/f32-vbinary/gen/vsqrdiffc-avx-x16.c", "XNNPACK/wrappers/f32-vbinary/gen/vsub-minmax-avx-x16.c", "XNNPACK/wrappers/f32-vbinary/gen/vsubc-minmax-avx-x16.c", "XNNPACK/wrappers/f32-vclamp/gen/vclamp-avx-x16.c", "XNNPACK/wrappers/f32-velu/gen/velu-avx-rr2-lut4-p4-perm-x32.c", "XNNPACK/wrappers/f32-vhswish/gen/vhswish-avx-x16.c", "XNNPACK/wrappers/f32-vlrelu/gen/vlrelu-avx-x16.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndd-avx-x16.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndne-avx-x16.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndu-avx-x16.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndz-avx-x16.c", "XNNPACK/wrappers/f32-vsigmoid/gen/vsigmoid-avx-rr2-p5-nr2-x40.c", "XNNPACK/wrappers/f32-vsqrt/gen/avx-sqrt-x8.c", "XNNPACK/wrappers/f32-vunary/gen/vabs-avx-x16.c", "XNNPACK/wrappers/f32-vunary/gen/vneg-avx-x16.c", 
"XNNPACK/wrappers/f32-vunary/gen/vsqr-avx-x16.c", "XNNPACK/wrappers/qc8-dwconv/gen/up16x9-minmax-fp32-avx-mul16-add16.c", "XNNPACK/wrappers/qc8-dwconv/gen/up16x25-minmax-fp32-avx-mul16-add16.c", "XNNPACK/wrappers/qc8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c", "XNNPACK/wrappers/qc8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c", "XNNPACK/wrappers/qc8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c", "XNNPACK/wrappers/qc8-igemm/gen/2x4c8-minmax-fp32-avx-ld128.c", "XNNPACK/wrappers/qs8-dwconv/gen/up16x9-minmax-fp32-avx-mul16-add16.c", "XNNPACK/wrappers/qs8-dwconv/gen/up16x25-minmax-fp32-avx-mul16-add16.c", "XNNPACK/wrappers/qs8-f32-vcvt/gen/vcvt-avx-x32.c", "XNNPACK/wrappers/qs8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c", "XNNPACK/wrappers/qs8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c", "XNNPACK/wrappers/qs8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c", "XNNPACK/wrappers/qs8-igemm/gen/2x4c8-minmax-fp32-avx-ld128.c", "XNNPACK/wrappers/qs8-vadd/gen/minmax-avx-mul32-ld32-x8.c", "XNNPACK/wrappers/qs8-vaddc/gen/minmax-avx-mul32-ld32-x8.c", "XNNPACK/wrappers/qs8-vmul/gen/minmax-fp32-avx-mul16-ld64-x16.c", "XNNPACK/wrappers/qs8-vmulc/gen/minmax-fp32-avx-mul16-ld64-x16.c", "XNNPACK/wrappers/qu8-dwconv/gen/up16x9-minmax-fp32-avx-mul16.c", "XNNPACK/wrappers/qu8-dwconv/gen/up16x25-minmax-fp32-avx-mul16.c", "XNNPACK/wrappers/qu8-f32-vcvt/gen/vcvt-avx-x32.c", "XNNPACK/wrappers/qu8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c", "XNNPACK/wrappers/qu8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c", "XNNPACK/wrappers/qu8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c", "XNNPACK/wrappers/qu8-igemm/gen/2x4c8-minmax-fp32-avx-ld128.c", "XNNPACK/wrappers/qu8-vadd/gen/minmax-avx-mul32-ld32-x8.c", "XNNPACK/wrappers/qu8-vaddc/gen/minmax-avx-mul32-ld32-x8.c", "XNNPACK/wrappers/qu8-vmul/gen/minmax-fp32-avx-mul16-ld64-x16.c", "XNNPACK/wrappers/qu8-vmulc/gen/minmax-fp32-avx-mul16-ld64-x16.c", "XNNPACK/wrappers/x8-lut/gen/lut-avx-x64.c"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_sse41", + srcs = [], + deps = [":interface", "//third_party:FP16"], + exported_deps = [], + compiler_flags = ["-w", "-O2"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["x86", ["-msse4.1"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + platform_srcs = [["x86|x86_64|platform009", ["XNNPACK/wrappers/f16-f32-vcvt/gen/vcvt-sse41-int16-x16.c", "XNNPACK/wrappers/f32-f16-vcvt/gen/vcvt-sse41-x8.c", "XNNPACK/wrappers/f32-prelu/gen/sse41-2x8.c", "XNNPACK/wrappers/f32-qs8-vcvt/gen/vcvt-sse41-x32.c", "XNNPACK/wrappers/f32-vlrelu/gen/vlrelu-sse41-x8.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndd-sse41-x8.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndne-sse41-x8.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndu-sse41-x8.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndz-sse41-x8.c", "XNNPACK/wrappers/f32-vsigmoid/gen/vsigmoid-sse41-rr2-lut64-p2-div-x8.c", "XNNPACK/wrappers/qc8-dwconv/gen/up8x9-minmax-fp32-sse41-mul16.c", "XNNPACK/wrappers/qc8-dwconv/gen/up8x25-minmax-fp32-sse41-mul16.c", "XNNPACK/wrappers/qc8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c", "XNNPACK/wrappers/qc8-gemm/gen/3x4c8-minmax-fp32-sse41-ld64.c", 
"XNNPACK/wrappers/qc8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c", "XNNPACK/wrappers/qc8-igemm/gen/3x4c8-minmax-fp32-sse41-ld64.c", "XNNPACK/wrappers/qs8-dwconv/gen/up8x9-minmax-fp32-sse41-mul16-add16.c", "XNNPACK/wrappers/qs8-dwconv/gen/up8x25-minmax-fp32-sse41-mul16-add16.c", "XNNPACK/wrappers/qs8-f32-vcvt/gen/vcvt-sse41-x16.c", "XNNPACK/wrappers/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8.c", "XNNPACK/wrappers/qs8-gavgpool/gen/7x-minmax-fp32-sse41-c8.c", "XNNPACK/wrappers/qs8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c", "XNNPACK/wrappers/qs8-gemm/gen/3x4c8-minmax-fp32-sse41-ld64.c", "XNNPACK/wrappers/qs8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c", "XNNPACK/wrappers/qs8-igemm/gen/3x4c8-minmax-fp32-sse41-ld64.c", "XNNPACK/wrappers/qs8-vadd/gen/minmax-sse41-mul16-ld64-x8.c", "XNNPACK/wrappers/qs8-vaddc/gen/minmax-sse41-mul16-ld64-x8.c", "XNNPACK/wrappers/qs8-vmul/gen/minmax-fp32-sse41-mul16-ld64-x16.c", "XNNPACK/wrappers/qs8-vmulc/gen/minmax-fp32-sse41-mul16-ld64-x16.c", "XNNPACK/wrappers/qu8-dwconv/gen/up8x9-minmax-fp32-sse41-mul16.c", "XNNPACK/wrappers/qu8-dwconv/gen/up8x25-minmax-fp32-sse41-mul16.c", "XNNPACK/wrappers/qu8-f32-vcvt/gen/vcvt-sse41-x16.c", "XNNPACK/wrappers/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8.c", "XNNPACK/wrappers/qu8-gavgpool/gen/7x-minmax-fp32-sse41-c8.c", "XNNPACK/wrappers/qu8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c", "XNNPACK/wrappers/qu8-gemm/gen/3x4c8-minmax-fp32-sse41-ld64.c", "XNNPACK/wrappers/qu8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c", "XNNPACK/wrappers/qu8-igemm/gen/3x4c8-minmax-fp32-sse41-ld64.c", "XNNPACK/wrappers/qu8-vadd/gen/minmax-sse41-mul16-ld64-x8.c", "XNNPACK/wrappers/qu8-vaddc/gen/minmax-sse41-mul16-ld64-x8.c", "XNNPACK/wrappers/qu8-vmul/gen/minmax-fp32-sse41-mul16-ld64-x16.c", "XNNPACK/wrappers/qu8-vmulc/gen/minmax-fp32-sse41-mul16-ld64-x16.c", "XNNPACK/wrappers/s8-ibilinear/gen/sse41-c16.c", "XNNPACK/wrappers/s8-maxpool/9p8x-minmax-sse41-c16.c", "XNNPACK/wrappers/s8-vclamp/sse41-x64.c", "XNNPACK/wrappers/u8-ibilinear/gen/sse41-c16.c"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_neon", + srcs = ["XNNPACK/wrappers/f16-f32-vcvt/gen/vcvt-neon-int16-x16.c", "XNNPACK/wrappers/f32-argmaxpool/4x-neon-c4.c", "XNNPACK/wrappers/f32-argmaxpool/9p8x-neon-c4.c", "XNNPACK/wrappers/f32-argmaxpool/9x-neon-c4.c", "XNNPACK/wrappers/f32-avgpool/9p8x-minmax-neon-c4.c", "XNNPACK/wrappers/f32-avgpool/9x-minmax-neon-c4.c", "XNNPACK/wrappers/f32-conv-hwc2chw/3x3s2p1c3x4-neon-2x2.c", "XNNPACK/wrappers/f32-dwconv/gen/up8x3-minmax-neon.c", "XNNPACK/wrappers/f32-dwconv/gen/up8x4-minmax-neon.c", "XNNPACK/wrappers/f32-dwconv/gen/up8x9-minmax-neon.c", "XNNPACK/wrappers/f32-dwconv/gen/up8x25-minmax-neon-acc2.c", "XNNPACK/wrappers/f32-dwconv2d-chw/gen/3x3p1-minmax-neon-2x4.c", "XNNPACK/wrappers/f32-dwconv2d-chw/gen/3x3s2p1-minmax-neon-1x4.c", "XNNPACK/wrappers/f32-dwconv2d-chw/gen/5x5p2-minmax-neon-1x4.c", "XNNPACK/wrappers/f32-dwconv2d-chw/gen/5x5s2p2-minmax-neon-1x4.c", "XNNPACK/wrappers/f32-f16-vcvt/gen/vcvt-neon-x8.c", "XNNPACK/wrappers/f32-gavgpool-cw/neon-x4.c", "XNNPACK/wrappers/f32-gavgpool/7p7x-minmax-neon-c4.c", "XNNPACK/wrappers/f32-gavgpool/7x-minmax-neon-c4.c", "XNNPACK/wrappers/f32-gemm/gen/1x8-minmax-neon-lane-ld64.c", "XNNPACK/wrappers/f32-gemm/gen/4x2-minmax-neon-lane-ld64.c", "XNNPACK/wrappers/f32-gemm/gen/4x8-minmax-neon-lane-ld64.c", "XNNPACK/wrappers/f32-gemm/gen/4x8-minmax-neon-lane-ld128.c", "XNNPACK/wrappers/f32-ibilinear-chw/gen/neon-p8.c", 
"XNNPACK/wrappers/f32-ibilinear/gen/neon-c8.c", "XNNPACK/wrappers/f32-igemm/gen/1x8-minmax-neon-lane-ld64.c", "XNNPACK/wrappers/f32-igemm/gen/4x2-minmax-neon-lane-ld64.c", "XNNPACK/wrappers/f32-igemm/gen/4x8-minmax-neon-lane-ld64.c", "XNNPACK/wrappers/f32-igemm/gen/4x8-minmax-neon-lane-ld128.c", "XNNPACK/wrappers/f32-maxpool/9p8x-minmax-neon-c4.c", "XNNPACK/wrappers/f32-pavgpool/9p8x-minmax-neon-c4.c", "XNNPACK/wrappers/f32-pavgpool/9x-minmax-neon-c4.c", "XNNPACK/wrappers/f32-prelu/gen/neon-2x8.c", "XNNPACK/wrappers/f32-qs8-vcvt/gen/vcvt-neon-x32.c", "XNNPACK/wrappers/f32-qu8-vcvt/gen/vcvt-neon-x32.c", "XNNPACK/wrappers/f32-raddstoreexpminusmax/gen/neon-rr2-lut64-p2-x8.c", "XNNPACK/wrappers/f32-rmax/neon.c", "XNNPACK/wrappers/f32-spmm/gen/32x1-minmax-neon.c", "XNNPACK/wrappers/f32-vbinary/gen/vadd-minmax-neon-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vaddc-minmax-neon-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vmax-neon-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vmaxc-neon-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vmin-neon-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vminc-neon-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vmul-minmax-neon-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vmulc-minmax-neon-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vrsubc-minmax-neon-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vsqrdiff-neon-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vsqrdiffc-neon-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vsub-minmax-neon-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vsubc-minmax-neon-x8.c", "XNNPACK/wrappers/f32-vclamp/gen/vclamp-neon-x8.c", "XNNPACK/wrappers/f32-velu/gen/velu-neon-rr2-lut16-p3-x8.c", "XNNPACK/wrappers/f32-vhswish/gen/vhswish-neon-x16.c", "XNNPACK/wrappers/f32-vlrelu/gen/vlrelu-neon-x8.c", "XNNPACK/wrappers/f32-vmulcaddc/gen/c4-minmax-neon-2x.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndd-neon-x8.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndne-neon-x8.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndu-neon-x8.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndz-neon-x8.c", "XNNPACK/wrappers/f32-vsigmoid/gen/vsigmoid-neon-rr2-lut64-p2-nr2recps-x8.c", "XNNPACK/wrappers/f32-vunary/gen/vabs-neon-x8.c", "XNNPACK/wrappers/f32-vunary/gen/vneg-neon-x8.c", "XNNPACK/wrappers/f32-vunary/gen/vsqr-neon-x8.c", "XNNPACK/wrappers/qc8-dwconv/gen/up8x25-minmax-fp32-neon-mla8-ld64.c", "XNNPACK/wrappers/qc8-dwconv/gen/up16x9-minmax-fp32-neon-mla8-ld64.c", "XNNPACK/wrappers/qc8-dwconv/gen/up16x25-minmax-fp32-neon-mla8-ld64.c", "XNNPACK/wrappers/qc8-gemm/gen/1x8c2s4-minmax-fp32-neon-mlal.c", "XNNPACK/wrappers/qc8-gemm/gen/2x8c2s4-minmax-fp32-neon-mlal.c", "XNNPACK/wrappers/qc8-igemm/gen/1x8c2s4-minmax-fp32-neon-mlal.c", "XNNPACK/wrappers/qc8-igemm/gen/2x8c2s4-minmax-fp32-neon-mlal.c", "XNNPACK/wrappers/qs8-dwconv/gen/up8x25-minmax-rndnu-neon-mla8-ld64.c", "XNNPACK/wrappers/qs8-dwconv/gen/up16x9-minmax-rndnu-neon-mla8-ld64.c", "XNNPACK/wrappers/qs8-dwconv/gen/up16x25-minmax-rndnu-neon-mla8-ld64.c", "XNNPACK/wrappers/qs8-f32-vcvt/gen/vcvt-neon-x32.c", "XNNPACK/wrappers/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c8.c", "XNNPACK/wrappers/qs8-gavgpool/gen/7x-minmax-rndnu-neon-c8.c", "XNNPACK/wrappers/qs8-gemm/gen/1x8-minmax-rndnu-neon-mlal-lane.c", "XNNPACK/wrappers/qs8-gemm/gen/1x8c2s4-minmax-rndnu-neon-mlal.c", "XNNPACK/wrappers/qs8-gemm/gen/1x16-minmax-rndnu-neon-mlal-lane.c", "XNNPACK/wrappers/qs8-gemm/gen/2x8c2s4-minmax-rndnu-neon-mlal.c", "XNNPACK/wrappers/qs8-igemm/gen/1x8-minmax-rndnu-neon-mlal-lane.c", "XNNPACK/wrappers/qs8-igemm/gen/1x8c2s4-minmax-rndnu-neon-mlal.c", "XNNPACK/wrappers/qs8-igemm/gen/1x16-minmax-rndnu-neon-mlal-lane.c", 
"XNNPACK/wrappers/qs8-igemm/gen/2x8c2s4-minmax-rndnu-neon-mlal.c", "XNNPACK/wrappers/qs8-vadd/gen/minmax-neon-ld64-x16.c", "XNNPACK/wrappers/qs8-vadd/gen/minmax-neon-ld64-x32.c", "XNNPACK/wrappers/qs8-vaddc/gen/minmax-neon-ld64-x16.c", "XNNPACK/wrappers/qs8-vaddc/gen/minmax-neon-ld64-x32.c", "XNNPACK/wrappers/qs8-vmul/gen/minmax-rndnu-neon-ld64-x16.c", "XNNPACK/wrappers/qs8-vmulc/gen/minmax-rndnu-neon-ld64-x16.c", "XNNPACK/wrappers/qu8-avgpool/9p8x-minmax-neon-c8.c", "XNNPACK/wrappers/qu8-avgpool/9x-minmax-neon-c8.c", "XNNPACK/wrappers/qu8-dwconv/gen/up8x25-minmax-rndnu-neon-mul8.c", "XNNPACK/wrappers/qu8-dwconv/gen/up16x9-minmax-rndnu-neon-mul8.c", "XNNPACK/wrappers/qu8-f32-vcvt/gen/vcvt-neon-x32.c", "XNNPACK/wrappers/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c8.c", "XNNPACK/wrappers/qu8-gavgpool/gen/7x-minmax-rndnu-neon-c8.c", "XNNPACK/wrappers/qu8-gemm/gen/1x8-minmax-rndnu-neon-mlal-lane.c", "XNNPACK/wrappers/qu8-gemm/gen/1x16-minmax-rndnu-neon-mlal-lane.c", "XNNPACK/wrappers/qu8-gemm/gen/3x8-minmax-rndnu-neon-mlal-lane.c", "XNNPACK/wrappers/qu8-gemm/gen/4x16-minmax-rndnu-neon-mlal-lane.c", "XNNPACK/wrappers/qu8-igemm/gen/1x8-minmax-rndnu-neon-mlal-lane.c", "XNNPACK/wrappers/qu8-igemm/gen/1x16-minmax-rndnu-neon-mlal-lane.c", "XNNPACK/wrappers/qu8-igemm/gen/3x8-minmax-rndnu-neon-mlal-lane.c", "XNNPACK/wrappers/qu8-igemm/gen/4x16-minmax-rndnu-neon-mlal-lane.c", "XNNPACK/wrappers/qu8-vadd/gen/minmax-neon-ld64-x16.c", "XNNPACK/wrappers/qu8-vadd/gen/minmax-neon-ld64-x32.c", "XNNPACK/wrappers/qu8-vaddc/gen/minmax-neon-ld64-x16.c", "XNNPACK/wrappers/qu8-vaddc/gen/minmax-neon-ld64-x32.c", "XNNPACK/wrappers/qu8-vmul/gen/minmax-rndnu-neon-ld64-x16.c", "XNNPACK/wrappers/qu8-vmulc/gen/minmax-rndnu-neon-ld64-x16.c", "XNNPACK/wrappers/s8-ibilinear/gen/neon-c8.c", "XNNPACK/wrappers/s8-ibilinear/gen/neon-c16.c", "XNNPACK/wrappers/s8-maxpool/9p8x-minmax-neon-c16.c", "XNNPACK/wrappers/s8-vclamp/neon-x64.c", "XNNPACK/wrappers/u8-ibilinear/gen/neon-c8.c", "XNNPACK/wrappers/u8-ibilinear/gen/neon-c16.c", "XNNPACK/wrappers/u8-maxpool/9p8x-minmax-neon-c16.c", "XNNPACK/wrappers/u8-rmax/neon.c", "XNNPACK/wrappers/u8-vclamp/neon-x64.c", "XNNPACK/wrappers/xx-fill/neon-x64.c", "XNNPACK/wrappers/xx-pad/neon.c", "XNNPACK/wrappers/x8-zip/xm-neon.c", "XNNPACK/wrappers/x8-zip/x2-neon.c", "XNNPACK/wrappers/x8-zip/x3-neon.c", "XNNPACK/wrappers/x8-zip/x4-neon.c", "XNNPACK/wrappers/x32-packx/x4-neon-st4.c", "XNNPACK/wrappers/x32-unpool/neon.c", "XNNPACK/wrappers/x32-zip/xm-neon.c", "XNNPACK/wrappers/x32-zip/x2-neon.c", "XNNPACK/wrappers/x32-zip/x3-neon.c", "XNNPACK/wrappers/x32-zip/x4-neon.c"], + deps = [":interface", "//third_party:FP16"], + exported_deps = [], + compiler_flags = ["-w", "-O2"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["^(android-armv7|iphoneos-armv7)$", ["-march=armv7-a", "-mfpu=neon", "-mfloat-abi=softfp"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_neon_dot", + srcs = [], + deps = [":interface", "//third_party:FP16"], + exported_deps = [], + compiler_flags = ["-w", "-O2"], 
+ preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["(aarch64|arm64)", ["-march=armv8.2-a+dotprod"]], ["^android-armv7$", ["-march=armv8.2-a+dotprod", "-mfpu=neon-fp-armv8", "-mfloat-abi=softfp"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + platform_srcs = [["^((?!iphoneos-armv7).)*$", ["XNNPACK/wrappers/qc8-gemm/gen/1x8c4-minmax-fp32-neondot.c", "XNNPACK/wrappers/qc8-gemm/gen/1x16c4-minmax-fp32-neondot.c", "XNNPACK/wrappers/qc8-gemm/gen/4x8c4-minmax-fp32-neondot.c", "XNNPACK/wrappers/qc8-gemm/gen/4x16c4-minmax-fp32-neondot.c", "XNNPACK/wrappers/qc8-igemm/gen/1x8c4-minmax-fp32-neondot.c", "XNNPACK/wrappers/qc8-igemm/gen/1x16c4-minmax-fp32-neondot.c", "XNNPACK/wrappers/qc8-igemm/gen/4x8c4-minmax-fp32-neondot.c", "XNNPACK/wrappers/qc8-igemm/gen/4x16c4-minmax-fp32-neondot.c", "XNNPACK/wrappers/qs8-gemm/gen/1x8c4-minmax-rndnu-neondot.c", "XNNPACK/wrappers/qs8-gemm/gen/1x16c4-minmax-rndnu-neondot.c", "XNNPACK/wrappers/qs8-gemm/gen/4x8c4-minmax-rndnu-neondot.c", "XNNPACK/wrappers/qs8-gemm/gen/4x16c4-minmax-rndnu-neondot.c", "XNNPACK/wrappers/qs8-igemm/gen/1x8c4-minmax-rndnu-neondot.c", "XNNPACK/wrappers/qs8-igemm/gen/1x16c4-minmax-rndnu-neondot.c", "XNNPACK/wrappers/qs8-igemm/gen/4x8c4-minmax-rndnu-neondot.c", "XNNPACK/wrappers/qs8-igemm/gen/4x16c4-minmax-rndnu-neondot.c", "XNNPACK/wrappers/qu8-gemm/gen/1x8c4-minmax-rndnu-neondot.c", "XNNPACK/wrappers/qu8-gemm/gen/1x16c4-minmax-rndnu-neondot.c", "XNNPACK/wrappers/qu8-gemm/gen/4x8c4-minmax-rndnu-neondot.c", "XNNPACK/wrappers/qu8-gemm/gen/4x16c4-minmax-rndnu-neondot.c", "XNNPACK/wrappers/qu8-igemm/gen/1x8c4-minmax-rndnu-neondot.c", "XNNPACK/wrappers/qu8-igemm/gen/1x16c4-minmax-rndnu-neondot.c", "XNNPACK/wrappers/qu8-igemm/gen/4x8c4-minmax-rndnu-neondot.c", "XNNPACK/wrappers/qu8-igemm/gen/4x16c4-minmax-rndnu-neondot.c"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_neon_aarch64", + srcs = ["XNNPACK/wrappers/f32-conv-hwc2chw/3x3s2p1c3x4-neonfma-2x2.c", "XNNPACK/wrappers/f32-dwconv2d-chw/gen/3x3p1-minmax-neonfma-3x4.c", "XNNPACK/wrappers/f32-dwconv2d-chw/gen/3x3s2p1-minmax-neonfma-2x4-acc2.c", "XNNPACK/wrappers/f32-dwconv2d-chw/gen/5x5p2-minmax-neonfma-4x4.c", "XNNPACK/wrappers/f32-dwconv2d-chw/gen/5x5s2p2-minmax-neonfma-1x4-acc2.c", "XNNPACK/wrappers/f32-gemm/gen/1x8-minmax-neonfma-lane-ld64.c", "XNNPACK/wrappers/f32-gemm/gen/4x2-minmax-neonfma-lane-ld64.c", "XNNPACK/wrappers/f32-gemm/gen/6x8-minmax-neonfma-lane-ld64.c", "XNNPACK/wrappers/f32-igemm/gen/1x8-minmax-neonfma-lane-ld64.c", "XNNPACK/wrappers/f32-igemm/gen/4x2-minmax-neonfma-lane-ld64.c", "XNNPACK/wrappers/f32-igemm/gen/6x8-minmax-neonfma-lane-ld64.c", "XNNPACK/wrappers/f32-spmm/gen/32x2-minmax-neonfma.c", "XNNPACK/wrappers/f32-spmm/gen/32x4-minmax-neonfma.c", "XNNPACK/wrappers/f32-vbinary/gen/vdiv-minmax-neon-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vdivc-minmax-neon-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vrdivc-minmax-neon-x8.c", "XNNPACK/wrappers/f32-vsqrt/gen/neon-sqrt-x4.c", "XNNPACK/wrappers/x8-lut/gen/lut-neon-tbx128x4-x64.c"], + deps = [":interface", "//third_party:FP16"], + exported_deps = [], + 
compiler_flags = ["-w", "-O2"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["(aarch64|arm64)", ["-mfpu=neon-vfpv4"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_neon_v8", + srcs = ["XNNPACK/wrappers/f32-qs8-vcvt/gen/vcvt-neonv8-x32.c", "XNNPACK/wrappers/f32-qu8-vcvt/gen/vcvt-neonv8-x32.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndd-neonv8-x8.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndne-neonv8-x8.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndu-neonv8-x8.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndz-neonv8-x8.c", "XNNPACK/wrappers/qc8-dwconv/gen/up8x25-minmax-fp32-neonv8-mla8-ld64.c", "XNNPACK/wrappers/qc8-dwconv/gen/up16x9-minmax-fp32-neonv8-mla8-ld64.c", "XNNPACK/wrappers/qc8-dwconv/gen/up16x25-minmax-fp32-neonv8-mla8-ld64.c", "XNNPACK/wrappers/qc8-gemm/gen/1x8-minmax-fp32-neonv8-mlal-lane-prfm.c", "XNNPACK/wrappers/qc8-gemm/gen/1x8-minmax-fp32-neonv8-mlal-lane.c", "XNNPACK/wrappers/qc8-gemm/gen/1x8c2s4-minmax-fp32-neonv8-mlal.c", "XNNPACK/wrappers/qc8-gemm/gen/1x8c8-minmax-fp32-neonv8-mlal.c", "XNNPACK/wrappers/qc8-gemm/gen/1x16-minmax-fp32-neonv8-mlal-lane.c", "XNNPACK/wrappers/qc8-gemm/gen/2x8c2s4-minmax-fp32-neonv8-mlal.c", "XNNPACK/wrappers/qc8-gemm/gen/2x8c8-minmax-fp32-neonv8-mlal.c", "XNNPACK/wrappers/qc8-gemm/gen/4x16-minmax-fp32-neonv8-mlal-lane.c", "XNNPACK/wrappers/qc8-igemm/gen/1x8-minmax-fp32-neonv8-mlal-lane-prfm.c", "XNNPACK/wrappers/qc8-igemm/gen/1x8-minmax-fp32-neonv8-mlal-lane.c", "XNNPACK/wrappers/qc8-igemm/gen/1x8c2s4-minmax-fp32-neonv8-mlal.c", "XNNPACK/wrappers/qc8-igemm/gen/1x8c8-minmax-fp32-neonv8-mlal.c", "XNNPACK/wrappers/qc8-igemm/gen/1x16-minmax-fp32-neonv8-mlal-lane.c", "XNNPACK/wrappers/qc8-igemm/gen/2x8c2s4-minmax-fp32-neonv8-mlal.c", "XNNPACK/wrappers/qc8-igemm/gen/2x8c8-minmax-fp32-neonv8-mlal.c", "XNNPACK/wrappers/qc8-igemm/gen/4x16-minmax-fp32-neonv8-mlal-lane.c"], + deps = [":interface", "//third_party:FP16"], + exported_deps = [], + compiler_flags = ["-w", "-O2"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["(aarch64|arm64)", ["-march=armv8-a", "-mfpu=neon-fp-armv8"]], ["^android-armv7$", ["-march=armv8-a", "-mfpu=neon-fp-armv8", "-mfloat-abi=softfp"]], ["^iphoneos-armv7$", ["-mcpu=cyclone", "-mtune=generic"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_avx512skx", + srcs = [], + deps = [":interface"], + exported_deps = [], + compiler_flags = ["-w", "-O2", "-mavx512f", "-mavx512cd", "-mavx512bw", "-mavx512dq", "-mavx512vl"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", 
+ headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["^(i[3-6]86|x86|x86_64|AMD64)$", ["-mavx512f", "-mavx512cd", "-mavx512bw", "-mavx512dq", "-mavx512vl"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + platform_srcs = [["x86|x86_64|platform009", ["XNNPACK/wrappers/f16-f32-vcvt/gen/vcvt-avx512skx-x16.c", "XNNPACK/wrappers/f32-f16-vcvt/gen/vcvt-avx512skx-x16.c", "XNNPACK/wrappers/f32-qs8-vcvt/gen/vcvt-avx512skx-x128.c", "XNNPACK/wrappers/f32-qu8-vcvt/gen/vcvt-avx512skx-x128.c", "XNNPACK/wrappers/qc8-dwconv/gen/up32x9-minmax-fp32-avx512skx-mul32.c", "XNNPACK/wrappers/qc8-dwconv/gen/up32x25-minmax-fp32-avx512skx-mul32.c", "XNNPACK/wrappers/qc8-gemm/gen/1x16c8-minmax-fp32-avx512skx.c", "XNNPACK/wrappers/qc8-gemm/gen/4x16c8-minmax-fp32-avx512skx.c", "XNNPACK/wrappers/qc8-igemm/gen/1x16c8-minmax-fp32-avx512skx.c", "XNNPACK/wrappers/qc8-igemm/gen/4x16c8-minmax-fp32-avx512skx.c", "XNNPACK/wrappers/qs8-dwconv/gen/up32x9-minmax-fp32-avx512skx-mul32.c", "XNNPACK/wrappers/qs8-dwconv/gen/up32x25-minmax-fp32-avx512skx-mul32.c", "XNNPACK/wrappers/qs8-f32-vcvt/gen/vcvt-avx512skx-x32.c", "XNNPACK/wrappers/qs8-gemm/gen/1x16c8-minmax-fp32-avx512skx.c", "XNNPACK/wrappers/qs8-gemm/gen/4x16c8-minmax-fp32-avx512skx.c", "XNNPACK/wrappers/qs8-igemm/gen/1x16c8-minmax-fp32-avx512skx.c", "XNNPACK/wrappers/qs8-igemm/gen/4x16c8-minmax-fp32-avx512skx.c", "XNNPACK/wrappers/qs8-vadd/gen/minmax-avx512skx-mul32-ld128-x16.c", "XNNPACK/wrappers/qs8-vaddc/gen/minmax-avx512skx-mul32-ld128-x16.c", "XNNPACK/wrappers/qu8-dwconv/gen/up32x9-minmax-fp32-avx512skx-mul32.c", "XNNPACK/wrappers/qu8-dwconv/gen/up32x25-minmax-fp32-avx512skx-mul32.c", "XNNPACK/wrappers/qu8-f32-vcvt/gen/vcvt-avx512skx-x32.c", "XNNPACK/wrappers/qu8-gemm/gen/1x16c8-minmax-fp32-avx512skx.c", "XNNPACK/wrappers/qu8-gemm/gen/4x16c8-minmax-fp32-avx512skx.c", "XNNPACK/wrappers/qu8-igemm/gen/1x16c8-minmax-fp32-avx512skx.c", "XNNPACK/wrappers/qu8-igemm/gen/4x16c8-minmax-fp32-avx512skx.c", "XNNPACK/wrappers/qu8-vadd/gen/minmax-avx512skx-mul32-ld128-x16.c", "XNNPACK/wrappers/qu8-vaddc/gen/minmax-avx512skx-mul32-ld128-x16.c", "XNNPACK/wrappers/x8-lut/gen/lut-avx512skx-vpshufb-x64.c"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_neon_fp16", + srcs = ["XNNPACK/wrappers/f16-f32-vcvt/gen/vcvt-neonfp16-x16.c", "XNNPACK/wrappers/f32-f16-vcvt/gen/vcvt-neonfp16-x16.c"], + deps = [":interface"], + exported_deps = [], + compiler_flags = ["-w", "-O2"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["arm", ["-mfpu=neon-fp16"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "interface", + srcs = [], + deps = [], + exported_deps = ["//third_party:pthreadpool_header"], + compiler_flags 
= ["-w"], + preferred_linkage = "static", + exported_headers = {"xnnpack.h": "XNNPACK/include/xnnpack.h"}, + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_fma3", + srcs = [], + deps = [":interface"], + exported_deps = [], + compiler_flags = ["-w", "-O2", "-mfma", "-mf16c"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["^(i[3-6]86|x86|x86_64|AMD64)$", ["-mfma", "-mf16c"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + platform_srcs = [["x86|x86_64|platform009", ["XNNPACK/wrappers/f16-dwconv/gen/up8x25-minmax-fma3-acc2.c", "XNNPACK/wrappers/f16-dwconv/gen/up16x3-minmax-fma3.c", "XNNPACK/wrappers/f16-dwconv/gen/up16x4-minmax-fma3.c", "XNNPACK/wrappers/f16-dwconv/gen/up16x9-minmax-fma3.c", "XNNPACK/wrappers/f16-ibilinear/gen/fma3-c8.c", "XNNPACK/wrappers/f16-vmulcaddc/gen/c8-minmax-fma3-2x.c", "XNNPACK/wrappers/f32-dwconv/gen/up8x25-minmax-fma3.c", "XNNPACK/wrappers/f32-dwconv/gen/up16x3-minmax-fma3.c", "XNNPACK/wrappers/f32-dwconv/gen/up16x4-minmax-fma3.c", "XNNPACK/wrappers/f32-dwconv/gen/up16x9-minmax-fma3.c", "XNNPACK/wrappers/f32-gemm/gen/1x16-minmax-fma3-broadcast.c", "XNNPACK/wrappers/f32-gemm/gen/1x16s4-minmax-fma3-broadcast.c", "XNNPACK/wrappers/f32-gemm/gen/4x16s4-minmax-fma3-broadcast.c", "XNNPACK/wrappers/f32-gemm/gen/5x16-minmax-fma3-broadcast.c", "XNNPACK/wrappers/f32-igemm/gen/1x16-minmax-fma3-broadcast.c", "XNNPACK/wrappers/f32-igemm/gen/1x16s4-minmax-fma3-broadcast.c", "XNNPACK/wrappers/f32-igemm/gen/4x16s4-minmax-fma3-broadcast.c", "XNNPACK/wrappers/f32-igemm/gen/5x16-minmax-fma3-broadcast.c", "XNNPACK/wrappers/f32-vhswish/gen/vhswish-fma3-x16.c"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "jit_memory", + srcs = ["XNNPACK/src/jit/aarch32-assembler.cc", "XNNPACK/src/jit/aarch64-assembler.cc", "XNNPACK/src/jit/assembler.cc", "XNNPACK/src/jit/memory.c"], + deps = [":interface", "//third_party:clog"], + exported_deps = [], + compiler_flags = ["-w", "-Os"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = 
"ukernels_sse2", + srcs = [], + deps = [":interface", "//third_party:FP16"], + exported_deps = [], + compiler_flags = ["-w", "-O2"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["x86", ["-msse2"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + platform_srcs = [["x86|x86_64|platform009", ["XNNPACK/wrappers/f16-f32-vcvt/gen/vcvt-sse2-int16-x32.c", "XNNPACK/wrappers/f32-argmaxpool/4x-sse2-c4.c", "XNNPACK/wrappers/f32-argmaxpool/9p8x-sse2-c4.c", "XNNPACK/wrappers/f32-argmaxpool/9x-sse2-c4.c", "XNNPACK/wrappers/f32-f16-vcvt/gen/vcvt-sse2-x16.c", "XNNPACK/wrappers/f32-prelu/gen/sse2-2x8.c", "XNNPACK/wrappers/f32-qs8-vcvt/gen/vcvt-sse2-x32.c", "XNNPACK/wrappers/f32-qu8-vcvt/gen/vcvt-sse2-x32.c", "XNNPACK/wrappers/f32-raddstoreexpminusmax/gen/sse2-rr2-p5-x20-acc2.c", "XNNPACK/wrappers/f32-velu/gen/velu-sse2-rr2-lut16-p3-x12.c", "XNNPACK/wrappers/f32-vlrelu/gen/vlrelu-sse2-x8.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndd-sse2-x8.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndne-sse2-x8.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndu-sse2-x8.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndz-sse2-x8.c", "XNNPACK/wrappers/f32-vsigmoid/gen/vsigmoid-sse2-rr2-lut64-p2-div-x8.c", "XNNPACK/wrappers/qc8-dwconv/gen/up8x9-minmax-fp32-sse2-mul16.c", "XNNPACK/wrappers/qc8-dwconv/gen/up8x25-minmax-fp32-sse2-mul16.c", "XNNPACK/wrappers/qc8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c", "XNNPACK/wrappers/qc8-gemm/gen/3x4c8-minmax-fp32-sse2-ld64.c", "XNNPACK/wrappers/qc8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c", "XNNPACK/wrappers/qc8-igemm/gen/3x4c8-minmax-fp32-sse2-ld64.c", "XNNPACK/wrappers/qs8-dwconv/gen/up8x9-minmax-fp32-sse2-mul16-add16.c", "XNNPACK/wrappers/qs8-dwconv/gen/up8x25-minmax-fp32-sse2-mul16-add16.c", "XNNPACK/wrappers/qs8-f32-vcvt/gen/vcvt-sse2-x32.c", "XNNPACK/wrappers/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8.c", "XNNPACK/wrappers/qs8-gavgpool/gen/7x-minmax-fp32-sse2-c8.c", "XNNPACK/wrappers/qs8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c", "XNNPACK/wrappers/qs8-gemm/gen/3x4c8-minmax-fp32-sse2-ld64.c", "XNNPACK/wrappers/qs8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c", "XNNPACK/wrappers/qs8-igemm/gen/3x4c8-minmax-fp32-sse2-ld64.c", "XNNPACK/wrappers/qs8-vadd/gen/minmax-sse2-mul16-ld64-x8.c", "XNNPACK/wrappers/qs8-vaddc/gen/minmax-sse2-mul16-ld64-x8.c", "XNNPACK/wrappers/qs8-vmul/gen/minmax-fp32-sse2-mul16-ld64-x8.c", "XNNPACK/wrappers/qs8-vmulc/gen/minmax-fp32-sse2-mul16-ld64-x8.c", "XNNPACK/wrappers/qu8-avgpool/9p8x-minmax-sse2-c8.c", "XNNPACK/wrappers/qu8-avgpool/9x-minmax-sse2-c8.c", "XNNPACK/wrappers/qu8-dwconv/gen/up8x9-minmax-fp32-sse2-mul16.c", "XNNPACK/wrappers/qu8-dwconv/gen/up8x25-minmax-fp32-sse2-mul16.c", "XNNPACK/wrappers/qu8-f32-vcvt/gen/vcvt-sse2-x32.c", "XNNPACK/wrappers/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8.c", "XNNPACK/wrappers/qu8-gavgpool/gen/7x-minmax-fp32-sse2-c8.c", "XNNPACK/wrappers/qu8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c", "XNNPACK/wrappers/qu8-gemm/gen/3x4c8-minmax-fp32-sse2-ld64.c", "XNNPACK/wrappers/qu8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c", "XNNPACK/wrappers/qu8-igemm/gen/3x4c8-minmax-fp32-sse2-ld64.c", "XNNPACK/wrappers/qu8-vadd/gen/minmax-sse2-mul16-ld64-x8.c", 
"XNNPACK/wrappers/qu8-vaddc/gen/minmax-sse2-mul16-ld64-x8.c", "XNNPACK/wrappers/qu8-vmul/gen/minmax-fp32-sse2-mul16-ld64-x8.c", "XNNPACK/wrappers/qu8-vmulc/gen/minmax-fp32-sse2-mul16-ld64-x8.c", "XNNPACK/wrappers/s8-ibilinear/gen/sse2-c8.c", "XNNPACK/wrappers/s8-maxpool/9p8x-minmax-sse2-c16.c", "XNNPACK/wrappers/s8-vclamp/sse2-x64.c", "XNNPACK/wrappers/u8-ibilinear/gen/sse2-c8.c", "XNNPACK/wrappers/u8-maxpool/9p8x-minmax-sse2-c16.c", "XNNPACK/wrappers/u8-rmax/sse2.c", "XNNPACK/wrappers/u8-vclamp/sse2-x64.c", "XNNPACK/wrappers/xx-fill/sse2-x64.c", "XNNPACK/wrappers/xx-pad/sse2.c", "XNNPACK/wrappers/x8-zip/xm-sse2.c", "XNNPACK/wrappers/x8-zip/x2-sse2.c", "XNNPACK/wrappers/x8-zip/x3-sse2.c", "XNNPACK/wrappers/x8-zip/x4-sse2.c", "XNNPACK/wrappers/x32-unpool/sse2.c", "XNNPACK/wrappers/x32-zip/xm-sse2.c", "XNNPACK/wrappers/x32-zip/x2-sse2.c", "XNNPACK/wrappers/x32-zip/x3-sse2.c", "XNNPACK/wrappers/x32-zip/x4-sse2.c"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_sse", + srcs = [], + deps = [":interface"], + exported_deps = [], + compiler_flags = ["-w", "-O2"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["x86", ["-msse"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + platform_srcs = [["x86|x86_64|platform009", ["XNNPACK/wrappers/f32-avgpool/9p8x-minmax-sse-c4.c", "XNNPACK/wrappers/f32-avgpool/9x-minmax-sse-c4.c", "XNNPACK/wrappers/f32-conv-hwc2chw/3x3s2p1c3x4-sse-2x2.c", "XNNPACK/wrappers/f32-dwconv/gen/up8x3-minmax-sse.c", "XNNPACK/wrappers/f32-dwconv/gen/up8x4-minmax-sse.c", "XNNPACK/wrappers/f32-dwconv/gen/up8x9-minmax-sse.c", "XNNPACK/wrappers/f32-dwconv/gen/up8x25-minmax-sse.c", "XNNPACK/wrappers/f32-dwconv2d-chw/gen/3x3p1-minmax-sse-2x4-acc2.c", "XNNPACK/wrappers/f32-dwconv2d-chw/gen/3x3s2p1-minmax-sse-1x4-acc3.c", "XNNPACK/wrappers/f32-dwconv2d-chw/gen/5x5p2-minmax-sse-4x4.c", "XNNPACK/wrappers/f32-dwconv2d-chw/gen/5x5s2p2-minmax-sse-2x4.c", "XNNPACK/wrappers/f32-gavgpool-cw/sse-x4.c", "XNNPACK/wrappers/f32-gavgpool/7p7x-minmax-sse-c4.c", "XNNPACK/wrappers/f32-gavgpool/7x-minmax-sse-c4.c", "XNNPACK/wrappers/f32-gemm/gen/1x8-minmax-sse-load1.c", "XNNPACK/wrappers/f32-gemm/gen/4x2c4-minmax-sse.c", "XNNPACK/wrappers/f32-gemm/gen/4x8-minmax-sse-load1.c", "XNNPACK/wrappers/f32-ibilinear-chw/gen/sse-p8.c", "XNNPACK/wrappers/f32-ibilinear/gen/sse-c8.c", "XNNPACK/wrappers/f32-igemm/gen/1x8-minmax-sse-load1.c", "XNNPACK/wrappers/f32-igemm/gen/4x2c4-minmax-sse.c", "XNNPACK/wrappers/f32-igemm/gen/4x8-minmax-sse-load1.c", "XNNPACK/wrappers/f32-maxpool/9p8x-minmax-sse-c4.c", "XNNPACK/wrappers/f32-pavgpool/9p8x-minmax-sse-c4.c", "XNNPACK/wrappers/f32-pavgpool/9x-minmax-sse-c4.c", "XNNPACK/wrappers/f32-rmax/sse.c", "XNNPACK/wrappers/f32-spmm/gen/32x1-minmax-sse.c", "XNNPACK/wrappers/f32-vbinary/gen/vadd-minmax-sse-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vaddc-minmax-sse-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vdiv-minmax-sse-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vdivc-minmax-sse-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vmax-sse-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vmaxc-sse-x8.c", 
"XNNPACK/wrappers/f32-vbinary/gen/vmin-sse-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vminc-sse-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vmul-minmax-sse-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vmulc-minmax-sse-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vrdivc-minmax-sse-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vrsubc-minmax-sse-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vsqrdiff-sse-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vsqrdiffc-sse-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vsub-minmax-sse-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vsubc-minmax-sse-x8.c", "XNNPACK/wrappers/f32-vclamp/gen/vclamp-sse-x8.c", "XNNPACK/wrappers/f32-vhswish/gen/vhswish-sse-x8.c", "XNNPACK/wrappers/f32-vlrelu/gen/vlrelu-sse-x8.c", "XNNPACK/wrappers/f32-vmulcaddc/gen/c4-minmax-sse-2x.c", "XNNPACK/wrappers/f32-vsqrt/gen/sse-sqrt-x4.c", "XNNPACK/wrappers/f32-vunary/gen/vabs-sse-x8.c", "XNNPACK/wrappers/f32-vunary/gen/vneg-sse-x8.c", "XNNPACK/wrappers/f32-vunary/gen/vsqr-sse-x8.c", "XNNPACK/wrappers/x32-packx/x4-sse.c"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_asm_aarch32", + srcs = ["XNNPACK/wrappers/f32-gemm/gen/4x8-minmax-aarch32-neon-cortex-a7.S", "XNNPACK/wrappers/f32-gemm/gen/4x8-minmax-aarch32-neon-cortex-a53.S", "XNNPACK/wrappers/f32-gemm/gen/4x8-minmax-aarch32-neon-cortex-a75.S", "XNNPACK/wrappers/f32-gemm/gen/4x8-minmax-aarch32-neon-ld64.S", "XNNPACK/wrappers/f32-gemm/gen/4x8-minmax-aarch32-neon-prfm-cortex-a53.S", "XNNPACK/wrappers/f32-gemm/gen/4x8-minmax-aarch32-neon-prfm-cortex-a75.S", "XNNPACK/wrappers/f32-gemm/4x4-aarch32-vfp-ld64.S", "XNNPACK/wrappers/f32-gemm/4x4-minmax-aarch32-vfp-ld64.S", "XNNPACK/wrappers/f32-gemm/4x8-minmax-aarch32-neon-cortex-a55.S", "XNNPACK/wrappers/f32-igemm/gen/4x8-minmax-aarch32-neon-cortex-a7.S", "XNNPACK/wrappers/f32-igemm/gen/4x8-minmax-aarch32-neon-cortex-a53.S", "XNNPACK/wrappers/f32-igemm/gen/4x8-minmax-aarch32-neon-cortex-a75.S", "XNNPACK/wrappers/f32-igemm/gen/4x8-minmax-aarch32-neon-ld64.S", "XNNPACK/wrappers/f32-igemm/gen/4x8-minmax-aarch32-neon-prfm-cortex-a53.S", "XNNPACK/wrappers/f32-igemm/gen/4x8-minmax-aarch32-neon-prfm-cortex-a75.S", "XNNPACK/wrappers/f32-igemm/4x8-minmax-aarch32-neon-cortex-a55.S", "XNNPACK/wrappers/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-cortex-a7.S", "XNNPACK/wrappers/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-cortex-a53.S", "XNNPACK/wrappers/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-ld64.S", "XNNPACK/wrappers/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-cortex-a7.S", "XNNPACK/wrappers/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-cortex-a53.S", "XNNPACK/wrappers/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-ld64.S", "XNNPACK/wrappers/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-cortex-a53.S", "XNNPACK/wrappers/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-ld64.S", "XNNPACK/wrappers/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-prfm-cortex-a53.S", "XNNPACK/wrappers/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-prfm-ld64.S", "XNNPACK/wrappers/qc8-gemm/gen/4x8c4-minmax-fp32-aarch32-neondot-cortex-a55.S", "XNNPACK/wrappers/qc8-gemm/gen/4x8c4-minmax-fp32-aarch32-neondot-ld64.S", "XNNPACK/wrappers/qc8-igemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-cortex-a53.S", "XNNPACK/wrappers/qc8-igemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-ld64.S", "XNNPACK/wrappers/qc8-igemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-cortex-a53.S", 
"XNNPACK/wrappers/qc8-igemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-ld64.S", "XNNPACK/wrappers/qc8-igemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-cortex-a53.S", "XNNPACK/wrappers/qc8-igemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-ld64.S", "XNNPACK/wrappers/qc8-igemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-prfm-cortex-a53.S", "XNNPACK/wrappers/qc8-igemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-prfm-ld64.S", "XNNPACK/wrappers/qc8-igemm/gen/4x8c4-minmax-fp32-aarch32-neondot-cortex-a55.S", "XNNPACK/wrappers/qc8-igemm/gen/4x8c4-minmax-fp32-aarch32-neondot-ld64.S", "XNNPACK/wrappers/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a7.S", "XNNPACK/wrappers/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S", "XNNPACK/wrappers/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-ld64.S", "XNNPACK/wrappers/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S", "XNNPACK/wrappers/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S", "XNNPACK/wrappers/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S", "XNNPACK/wrappers/qs8-gemm/gen/4x8c4-minmax-rndnu-aarch32-neondot-cortex-a55.S", "XNNPACK/wrappers/qs8-gemm/gen/4x8c4-minmax-rndnu-aarch32-neondot-ld64.S", "XNNPACK/wrappers/qs8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S", "XNNPACK/wrappers/qs8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-ld64.S", "XNNPACK/wrappers/qs8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S", "XNNPACK/wrappers/qs8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S", "XNNPACK/wrappers/qs8-igemm/gen/4x8c4-minmax-rndnu-aarch32-neondot-cortex-a55.S", "XNNPACK/wrappers/qs8-igemm/gen/4x8c4-minmax-rndnu-aarch32-neondot-ld64.S", "XNNPACK/wrappers/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a7.S", "XNNPACK/wrappers/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S", "XNNPACK/wrappers/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-ld64.S", "XNNPACK/wrappers/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S", "XNNPACK/wrappers/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S", "XNNPACK/wrappers/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S", "XNNPACK/wrappers/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S", "XNNPACK/wrappers/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-ld64.S", "XNNPACK/wrappers/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S", "XNNPACK/wrappers/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S"], + deps = [":interface", ":jit_memory", "//third_party:FP16"], + exported_deps = [], + compiler_flags = ["-w", "-O2"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["^android-armv7$", ["-march=armv8.2-a+dotprod", "-mfpu=neon-fp-armv8", "-mfloat-abi=softfp"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_ssse3", + srcs = [], + deps = [":interface", "//third_party:FP16"], + exported_deps 
= [], + compiler_flags = ["-w", "-O2"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["x86", ["-mssse3"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + platform_srcs = [["x86|x86_64|platform009", ["XNNPACK/wrappers/f32-dwconv2d-chw/gen/3x3p1-minmax-ssse3-2x4-acc2.c"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_f16c", + srcs = [], + deps = [":interface"], + exported_deps = [], + compiler_flags = ["-w", "-O2", "-mf16c"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["x86", ["-mf16c"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + platform_srcs = [["x86|x86_64|platform009", ["XNNPACK/wrappers/f16-f32-vcvt/gen/vcvt-f16c-x16.c", "XNNPACK/wrappers/f16-gavgpool/gen/7p7x-minmax-f16c-c8.c", "XNNPACK/wrappers/f16-gavgpool/gen/7x-minmax-f16c-c8.c", "XNNPACK/wrappers/f16-maxpool/9p8x-minmax-f16c-c8.c", "XNNPACK/wrappers/f16-prelu/gen/f16c-2x16.c", "XNNPACK/wrappers/f16-vbinary/gen/vadd-minmax-f16c-x16.c", "XNNPACK/wrappers/f16-vbinary/gen/vaddc-minmax-f16c-x16.c", "XNNPACK/wrappers/f16-vbinary/gen/vmul-minmax-f16c-x16.c", "XNNPACK/wrappers/f16-vbinary/gen/vmulc-minmax-f16c-x16.c", "XNNPACK/wrappers/f16-vclamp/gen/vclamp-f16c-x16.c", "XNNPACK/wrappers/f16-vhswish/gen/vhswish-f16c-x16.c", "XNNPACK/wrappers/f16-vlrelu/gen/vlrelu-f16c-x16.c", "XNNPACK/wrappers/f32-f16-vcvt/gen/vcvt-f16c-x16.c"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_xop", + srcs = [], + deps = [":interface"], + exported_deps = [], + compiler_flags = ["-w", "-O2", "-mxop"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows-x86_64", ["-Drestrict="]], ["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + platform_srcs = [["x86|x86_64|platform009", ["XNNPACK/wrappers/qc8-dwconv/gen/up16x9-minmax-fp32-xop-mul16-add16.c", "XNNPACK/wrappers/qc8-dwconv/gen/up16x25-minmax-fp32-xop-mul16-add16.c", "XNNPACK/wrappers/qc8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c", "XNNPACK/wrappers/qc8-gemm/gen/2x4c8-minmax-fp32-xop-ld64.c", "XNNPACK/wrappers/qc8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c", "XNNPACK/wrappers/qc8-igemm/gen/2x4c8-minmax-fp32-xop-ld64.c", "XNNPACK/wrappers/qs8-dwconv/gen/up16x9-minmax-fp32-xop-mul16-add16.c", 
"XNNPACK/wrappers/qs8-dwconv/gen/up16x25-minmax-fp32-xop-mul16-add16.c", "XNNPACK/wrappers/qs8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c", "XNNPACK/wrappers/qs8-gemm/gen/2x4c8-minmax-fp32-xop-ld64.c", "XNNPACK/wrappers/qs8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c", "XNNPACK/wrappers/qs8-igemm/gen/2x4c8-minmax-fp32-xop-ld64.c", "XNNPACK/wrappers/qs8-vadd/gen/minmax-xop-mul32-ld32-x8.c", "XNNPACK/wrappers/qs8-vaddc/gen/minmax-xop-mul32-ld32-x8.c", "XNNPACK/wrappers/qu8-dwconv/gen/up16x9-minmax-fp32-xop-mul32.c", "XNNPACK/wrappers/qu8-dwconv/gen/up16x25-minmax-fp32-xop-mul32.c", "XNNPACK/wrappers/qu8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c", "XNNPACK/wrappers/qu8-gemm/gen/2x4c8-minmax-fp32-xop-ld64.c", "XNNPACK/wrappers/qu8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c", "XNNPACK/wrappers/qu8-igemm/gen/2x4c8-minmax-fp32-xop-ld64.c", "XNNPACK/wrappers/qu8-vadd/gen/minmax-xop-mul32-ld32-x8.c", "XNNPACK/wrappers/qu8-vaddc/gen/minmax-xop-mul32-ld32-x8.c"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_scalar_aarch32", + srcs = ["XNNPACK/wrappers/f16-f32-vcvt/gen/vcvt-scalar-x4.c", "XNNPACK/wrappers/f32-argmaxpool/4x-scalar-c1.c", "XNNPACK/wrappers/f32-argmaxpool/9p8x-scalar-c1.c", "XNNPACK/wrappers/f32-argmaxpool/9x-scalar-c1.c", "XNNPACK/wrappers/f32-avgpool/9p8x-minmax-scalar-c1.c", "XNNPACK/wrappers/f32-avgpool/9x-minmax-scalar-c1.c", "XNNPACK/wrappers/f32-conv-hwc/3x3s2p0p1c3x4-scalar-1x1.c", "XNNPACK/wrappers/f32-conv-hwc/3x3s2p1c3x4-scalar-1x1.c", "XNNPACK/wrappers/f32-conv-hwc2chw/3x3s2p1c3x4-scalar-1x1.c", "XNNPACK/wrappers/f32-dwconv/gen/up1x3-minmax-scalar-acc2.c", "XNNPACK/wrappers/f32-dwconv/gen/up1x3-scalar-acc2.c", "XNNPACK/wrappers/f32-dwconv/gen/up1x4-minmax-scalar-acc2.c", "XNNPACK/wrappers/f32-dwconv/gen/up1x4-scalar-acc2.c", "XNNPACK/wrappers/f32-dwconv/gen/up1x9-minmax-scalar-acc2.c", "XNNPACK/wrappers/f32-dwconv/gen/up1x9-scalar-acc2.c", "XNNPACK/wrappers/f32-dwconv/gen/up1x25-minmax-scalar-acc2.c", "XNNPACK/wrappers/f32-dwconv/gen/up1x25-scalar-acc2.c", "XNNPACK/wrappers/f32-dwconv2d-chw/gen/3x3p1-minmax-scalar-4x1.c", "XNNPACK/wrappers/f32-dwconv2d-chw/gen/3x3s2p1-minmax-scalar-2x1-acc2.c", "XNNPACK/wrappers/f32-dwconv2d-chw/gen/5x5p2-minmax-scalar-2x1-acc2.c", "XNNPACK/wrappers/f32-dwconv2d-chw/gen/5x5s2p2-minmax-scalar-2x1-acc2.c", "XNNPACK/wrappers/f32-f16-vcvt/gen/vcvt-scalar-fabsf-x2.c", "XNNPACK/wrappers/f32-gavgpool-cw/scalar-x1.c", "XNNPACK/wrappers/f32-gavgpool/7p7x-minmax-scalar-c1.c", "XNNPACK/wrappers/f32-gavgpool/7x-minmax-scalar-c1.c", "XNNPACK/wrappers/f32-gemm/gen/1x4-minmax-scalar.c", "XNNPACK/wrappers/f32-gemm/gen/1x4-relu-scalar.c", "XNNPACK/wrappers/f32-gemm/gen/1x4-scalar.c", "XNNPACK/wrappers/f32-gemm/gen/4x2-minmax-scalar.c", "XNNPACK/wrappers/f32-gemm/gen/4x2-scalar.c", "XNNPACK/wrappers/f32-gemm/gen/4x4-minmax-scalar.c", "XNNPACK/wrappers/f32-gemm/gen/4x4-relu-scalar.c", "XNNPACK/wrappers/f32-gemm/gen/4x4-scalar.c", "XNNPACK/wrappers/f32-ibilinear-chw/gen/scalar-p4.c", "XNNPACK/wrappers/f32-ibilinear/gen/scalar-c2.c", "XNNPACK/wrappers/f32-igemm/gen/1x4-minmax-scalar.c", "XNNPACK/wrappers/f32-igemm/gen/1x4-relu-scalar.c", "XNNPACK/wrappers/f32-igemm/gen/1x4-scalar.c", "XNNPACK/wrappers/f32-igemm/gen/4x2-minmax-scalar.c", "XNNPACK/wrappers/f32-igemm/gen/4x2-scalar.c", "XNNPACK/wrappers/f32-igemm/gen/4x4-minmax-scalar.c", "XNNPACK/wrappers/f32-igemm/gen/4x4-relu-scalar.c", "XNNPACK/wrappers/f32-igemm/gen/4x4-scalar.c", "XNNPACK/wrappers/f32-maxpool/9p8x-minmax-scalar-c1.c", 
"XNNPACK/wrappers/f32-pavgpool/9p8x-minmax-scalar-c1.c", "XNNPACK/wrappers/f32-pavgpool/9x-minmax-scalar-c1.c", "XNNPACK/wrappers/f32-prelu/gen/scalar-2x4.c", "XNNPACK/wrappers/f32-qs8-vcvt/gen/vcvt-scalar-imagic-x4.c", "XNNPACK/wrappers/f32-qu8-vcvt/gen/vcvt-scalar-imagic-x4.c", "XNNPACK/wrappers/f32-raddstoreexpminusmax/gen/scalar-rr2-p5-x4-acc2.c", "XNNPACK/wrappers/f32-rmax/scalar.c", "XNNPACK/wrappers/f32-spmm/gen/8x1-minmax-scalar.c", "XNNPACK/wrappers/f32-spmm/gen/8x2-minmax-scalar.c", "XNNPACK/wrappers/f32-spmm/gen/8x4-minmax-scalar.c", "XNNPACK/wrappers/f32-vbinary/gen/vadd-minmax-scalar-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vaddc-minmax-scalar-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vdiv-minmax-scalar-x2.c", "XNNPACK/wrappers/f32-vbinary/gen/vdivc-minmax-scalar-x2.c", "XNNPACK/wrappers/f32-vbinary/gen/vmax-scalar-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vmaxc-scalar-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vmin-scalar-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vminc-scalar-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vmul-minmax-scalar-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vmulc-minmax-scalar-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vrdivc-minmax-scalar-x2.c", "XNNPACK/wrappers/f32-vbinary/gen/vrsubc-minmax-scalar-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vsqrdiff-scalar-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vsqrdiffc-scalar-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vsub-minmax-scalar-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vsubc-minmax-scalar-x8.c", "XNNPACK/wrappers/f32-vclamp/gen/vclamp-scalar-x4.c", "XNNPACK/wrappers/f32-velu/gen/velu-scalar-rr2-lut16-p3-x4.c", "XNNPACK/wrappers/f32-vhswish/gen/vhswish-scalar-x4.c", "XNNPACK/wrappers/f32-vlrelu/gen/vlrelu-scalar-x4.c", "XNNPACK/wrappers/f32-vmulcaddc/gen/c1-minmax-scalar-2x.c", "XNNPACK/wrappers/f32-vrelu/gen/vrelu-scalar-x8.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndd-scalar-libm-x1.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndne-scalar-libm-x1.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndu-scalar-libm-x1.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndz-scalar-libm-x1.c", "XNNPACK/wrappers/f32-vsigmoid/gen/vsigmoid-scalar-rr2-lut64-p2-div-x2.c", "XNNPACK/wrappers/f32-vsqrt/gen/scalar-sqrt-x1.c", "XNNPACK/wrappers/f32-vunary/gen/vabs-scalar-x4.c", "XNNPACK/wrappers/f32-vunary/gen/vneg-scalar-x4.c", "XNNPACK/wrappers/f32-vunary/gen/vsqr-scalar-x4.c", "XNNPACK/wrappers/qc8-dwconv/gen/up2x9-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qc8-dwconv/gen/up2x25-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qc8-gemm/gen/1x2-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qc8-gemm/gen/1x8-minmax-fp32-neon-mlal-lane.c", "XNNPACK/wrappers/qc8-gemm/gen/2x2-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qc8-igemm/gen/1x2-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qc8-igemm/gen/1x8-minmax-fp32-neon-mlal-lane.c", "XNNPACK/wrappers/qc8-igemm/gen/2x2-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qs8-dwconv/gen/up1x9-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qs8-dwconv/gen/up1x25-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qs8-f32-vcvt/gen/vcvt-scalar-x4.c", "XNNPACK/wrappers/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c1.c", "XNNPACK/wrappers/qs8-gavgpool/gen/7x-minmax-fp32-scalar-imagic-c1.c", "XNNPACK/wrappers/qs8-gemm/gen/1x2-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qs8-gemm/gen/2x2-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qs8-igemm/gen/1x2-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qs8-igemm/gen/2x2-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qs8-vadd/gen/minmax-scalar-x1.c", 
"XNNPACK/wrappers/qs8-vaddc/gen/minmax-scalar-x1.c", "XNNPACK/wrappers/qs8-vmul/gen/minmax-fp32-scalar-x4.c", "XNNPACK/wrappers/qs8-vmulc/gen/minmax-fp32-scalar-x4.c", "XNNPACK/wrappers/qu8-avgpool/9p8x-minmax-scalar-c1.c", "XNNPACK/wrappers/qu8-avgpool/9x-minmax-scalar-c1.c", "XNNPACK/wrappers/qu8-dwconv/gen/up1x9-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qu8-dwconv/gen/up1x25-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qu8-f32-vcvt/gen/vcvt-scalar-x4.c", "XNNPACK/wrappers/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c1.c", "XNNPACK/wrappers/qu8-gavgpool/gen/7x-minmax-fp32-scalar-imagic-c1.c", "XNNPACK/wrappers/qu8-gemm/gen/1x2-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qu8-gemm/gen/2x2-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qu8-igemm/gen/1x2-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qu8-igemm/gen/2x2-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qu8-vadd/gen/minmax-scalar-x1.c", "XNNPACK/wrappers/qu8-vaddc/gen/minmax-scalar-x1.c", "XNNPACK/wrappers/qu8-vmul/gen/minmax-fp32-scalar-x4.c", "XNNPACK/wrappers/qu8-vmulc/gen/minmax-fp32-scalar-x4.c", "XNNPACK/wrappers/s8-ibilinear/gen/scalar-c1.c", "XNNPACK/wrappers/s8-maxpool/9p8x-minmax-scalar-c1.c", "XNNPACK/wrappers/s8-vclamp/scalar-x4.c", "XNNPACK/wrappers/u8-ibilinear/gen/scalar-c1.c", "XNNPACK/wrappers/u8-maxpool/9p8x-minmax-scalar-c1.c", "XNNPACK/wrappers/u8-rmax/scalar.c", "XNNPACK/wrappers/u8-vclamp/scalar-x4.c", "XNNPACK/wrappers/xx-fill/scalar-x16.c", "XNNPACK/wrappers/xx-pad/scalar.c", "XNNPACK/wrappers/x8-zip/xm-scalar.c", "XNNPACK/wrappers/x8-zip/x2-scalar.c", "XNNPACK/wrappers/x8-zip/x3-scalar.c", "XNNPACK/wrappers/x8-zip/x4-scalar.c", "XNNPACK/wrappers/x32-packx/x2-scalar.c", "XNNPACK/wrappers/x32-packx/x3-scalar.c", "XNNPACK/wrappers/x32-packx/x4-scalar.c", "XNNPACK/wrappers/x32-unpool/scalar.c", "XNNPACK/wrappers/x32-zip/xm-scalar.c", "XNNPACK/wrappers/x32-zip/x2-scalar.c", "XNNPACK/wrappers/x32-zip/x3-scalar.c", "XNNPACK/wrappers/x32-zip/x4-scalar.c"], + deps = [":interface", "//third_party:FP16"], + exported_deps = [], + compiler_flags = ["-w", "-O2"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["^(android-armv7|iphoneos-armv7)$", ["-march=armv7-a", "-mfpu=neon", "-mfloat-abi=softfp"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_neon_fma", + srcs = ["XNNPACK/wrappers/f32-dwconv/gen/up8x3-minmax-neonfma.c", "XNNPACK/wrappers/f32-dwconv/gen/up8x4-minmax-neonfma.c", "XNNPACK/wrappers/f32-dwconv/gen/up8x9-minmax-neonfma.c", "XNNPACK/wrappers/f32-dwconv/gen/up8x25-minmax-neonfma-acc2.c", "XNNPACK/wrappers/f32-gemm/gen/1x8s4-minmax-neonfma.c", "XNNPACK/wrappers/f32-gemm/gen/6x8s4-minmax-neonfma.c", "XNNPACK/wrappers/f32-ibilinear-chw/gen/neonfma-p8.c", "XNNPACK/wrappers/f32-ibilinear/gen/neonfma-c8.c", "XNNPACK/wrappers/f32-igemm/gen/1x8s4-minmax-neonfma.c", "XNNPACK/wrappers/f32-igemm/gen/6x8s4-minmax-neonfma.c", "XNNPACK/wrappers/f32-raddstoreexpminusmax/gen/neonfma-rr1-lut64-p2-x16.c", "XNNPACK/wrappers/f32-spmm/gen/32x1-minmax-neonfma-pipelined.c", 
"XNNPACK/wrappers/f32-velu/gen/velu-neonfma-rr1-lut16-p3-x16.c", "XNNPACK/wrappers/f32-velu/gen/velu-neonfma-rr1-p6-x8.c", "XNNPACK/wrappers/f32-vmulcaddc/gen/c4-minmax-neonfma-2x.c", "XNNPACK/wrappers/f32-vsigmoid/gen/vsigmoid-neonfma-rr1-lut64-p2-nr2recps-x16.c"], + deps = [":interface", "//third_party:FP16"], + exported_deps = [], + compiler_flags = ["-w", "-O2"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["arm", ["-mfpu=neon-vfpv4"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_avx2", + srcs = [], + deps = [":interface"], + exported_deps = [], + compiler_flags = ["-w", "-O2", "-mavx2", "-mfma", "-mf16c"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["x86", ["-mavx2", "-mfma", "-mf16c"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + platform_srcs = [["x86|x86_64|platform009", ["XNNPACK/wrappers/f16-gemm/gen/1x16-minmax-avx2-broadcast.c", "XNNPACK/wrappers/f16-gemm/gen/4x16-minmax-avx2-broadcast.c", "XNNPACK/wrappers/f16-igemm/gen/1x16-minmax-avx2-broadcast.c", "XNNPACK/wrappers/f16-igemm/gen/4x16-minmax-avx2-broadcast.c", "XNNPACK/wrappers/f32-qs8-vcvt/gen/vcvt-avx2-x64.c", "XNNPACK/wrappers/f32-qu8-vcvt/gen/vcvt-avx2-x64.c", "XNNPACK/wrappers/f32-velu/gen/velu-avx2-rr1-lut4-p4-perm-x56.c", "XNNPACK/wrappers/f32-vsigmoid/gen/vsigmoid-avx2-rr1-p5-div-x40.c", "XNNPACK/wrappers/qc8-dwconv/gen/up16x9-minmax-fp32-avx2-mul32.c", "XNNPACK/wrappers/qc8-dwconv/gen/up16x25-minmax-fp32-avx2-mul32.c", "XNNPACK/wrappers/qc8-gemm/gen/1x8c8-minmax-fp32-avx2.c", "XNNPACK/wrappers/qc8-gemm/gen/3x8c8-minmax-fp32-avx2.c", "XNNPACK/wrappers/qc8-igemm/gen/1x8c8-minmax-fp32-avx2.c", "XNNPACK/wrappers/qc8-igemm/gen/3x8c8-minmax-fp32-avx2.c", "XNNPACK/wrappers/qs8-dwconv/gen/up16x9-minmax-fp32-avx2-mul32.c", "XNNPACK/wrappers/qs8-dwconv/gen/up16x25-minmax-fp32-avx2-mul32.c", "XNNPACK/wrappers/qs8-f32-vcvt/gen/vcvt-avx2-x16.c", "XNNPACK/wrappers/qs8-gemm/gen/1x8c8-minmax-fp32-avx2.c", "XNNPACK/wrappers/qs8-gemm/gen/3x8c8-minmax-fp32-avx2.c", "XNNPACK/wrappers/qs8-igemm/gen/1x8c8-minmax-fp32-avx2.c", "XNNPACK/wrappers/qs8-igemm/gen/3x8c8-minmax-fp32-avx2.c", "XNNPACK/wrappers/qs8-vadd/gen/minmax-avx2-mul32-ld64-x16.c", "XNNPACK/wrappers/qs8-vaddc/gen/minmax-avx2-mul32-ld64-x16.c", "XNNPACK/wrappers/qu8-dwconv/gen/up16x9-minmax-fp32-avx2-mul32.c", "XNNPACK/wrappers/qu8-dwconv/gen/up16x25-minmax-fp32-avx2-mul32.c", "XNNPACK/wrappers/qu8-f32-vcvt/gen/vcvt-avx2-x16.c", "XNNPACK/wrappers/qu8-gemm/gen/1x8c8-minmax-fp32-avx2.c", "XNNPACK/wrappers/qu8-gemm/gen/3x8c8-minmax-fp32-avx2.c", "XNNPACK/wrappers/qu8-igemm/gen/1x8c8-minmax-fp32-avx2.c", 
"XNNPACK/wrappers/qu8-igemm/gen/3x8c8-minmax-fp32-avx2.c", "XNNPACK/wrappers/qu8-vadd/gen/minmax-avx2-mul32-ld64-x16.c", "XNNPACK/wrappers/qu8-vaddc/gen/minmax-avx2-mul32-ld64-x16.c", "XNNPACK/wrappers/x8-lut/gen/lut-avx2-x128.c"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_asm_aarch64", + srcs = ["XNNPACK/wrappers/f16-gemm/gen-inc/1x8inc-minmax-aarch64-neonfp16arith-ld64.S", "XNNPACK/wrappers/f16-gemm/gen-inc/1x16inc-minmax-aarch64-neonfp16arith-ld32.S", "XNNPACK/wrappers/f16-gemm/gen-inc/4x8inc-minmax-aarch64-neonfp16arith-ld64.S", "XNNPACK/wrappers/f16-gemm/gen-inc/4x16inc-minmax-aarch64-neonfp16arith-ld32.S", "XNNPACK/wrappers/f16-gemm/gen-inc/6x8inc-minmax-aarch64-neonfp16arith-ld64.S", "XNNPACK/wrappers/f16-gemm/gen-inc/6x16inc-minmax-aarch64-neonfp16arith-cortex-a55.S", "XNNPACK/wrappers/f16-gemm/gen-inc/6x16inc-minmax-aarch64-neonfp16arith-cortex-a75.S", "XNNPACK/wrappers/f16-gemm/gen-inc/6x16inc-minmax-aarch64-neonfp16arith-ld32.S", "XNNPACK/wrappers/f16-gemm/gen-inc/8x8inc-minmax-aarch64-neonfp16arith-ld64.S", "XNNPACK/wrappers/f16-gemm/gen/1x8-minmax-aarch64-neonfp16arith-ld64.S", "XNNPACK/wrappers/f16-gemm/gen/1x16-minmax-aarch64-neonfp16arith-ld32.S", "XNNPACK/wrappers/f16-gemm/gen/4x8-minmax-aarch64-neonfp16arith-ld64.S", "XNNPACK/wrappers/f16-gemm/gen/4x16-minmax-aarch64-neonfp16arith-ld32.S", "XNNPACK/wrappers/f16-gemm/gen/6x8-minmax-aarch64-neonfp16arith-ld64.S", "XNNPACK/wrappers/f16-gemm/gen/6x16-minmax-aarch64-neonfp16arith-cortex-a55.S", "XNNPACK/wrappers/f16-gemm/gen/6x16-minmax-aarch64-neonfp16arith-cortex-a75.S", "XNNPACK/wrappers/f16-gemm/gen/6x16-minmax-aarch64-neonfp16arith-ld32.S", "XNNPACK/wrappers/f16-gemm/gen/8x8-minmax-aarch64-neonfp16arith-ld64.S", "XNNPACK/wrappers/f16-igemm/4x16-minmax-aarch64-neonfp16arith-ld32.S", "XNNPACK/wrappers/f32-dwconv/up4x9-minmax-aarch64-neonfma-cortex-a55.S", "XNNPACK/wrappers/f32-dwconv/up4x9-minmax-aarch64-neonfma.S", "XNNPACK/wrappers/f32-gemm/gen-inc/1x8inc-minmax-aarch64-neonfma-cortex-a53.S", "XNNPACK/wrappers/f32-gemm/gen-inc/1x8inc-minmax-aarch64-neonfma-cortex-a75.S", "XNNPACK/wrappers/f32-gemm/gen-inc/1x8inc-minmax-aarch64-neonfma-ld64.S", "XNNPACK/wrappers/f32-gemm/gen-inc/1x8inc-minmax-aarch64-neonfma-prfm-cortex-a75.S", "XNNPACK/wrappers/f32-gemm/gen-inc/1x12inc-minmax-aarch64-neonfma-cortex-a53.S", "XNNPACK/wrappers/f32-gemm/gen-inc/4x8inc-minmax-aarch64-neonfma-cortex-a53.S", "XNNPACK/wrappers/f32-gemm/gen-inc/4x8inc-minmax-aarch64-neonfma-cortex-a55.S", "XNNPACK/wrappers/f32-gemm/gen-inc/4x8inc-minmax-aarch64-neonfma-cortex-a75.S", "XNNPACK/wrappers/f32-gemm/gen-inc/4x8inc-minmax-aarch64-neonfma-ld64.S", "XNNPACK/wrappers/f32-gemm/gen-inc/4x8inc-minmax-aarch64-neonfma-ld128.S", "XNNPACK/wrappers/f32-gemm/gen-inc/4x8inc-minmax-aarch64-neonfma-prfm-cortex-a75.S", "XNNPACK/wrappers/f32-gemm/gen-inc/4x12inc-minmax-aarch64-neonfma-cortex-a53.S", "XNNPACK/wrappers/f32-gemm/gen-inc/5x8inc-minmax-aarch64-neonfma-cortex-a75.S", "XNNPACK/wrappers/f32-gemm/gen-inc/5x8inc-minmax-aarch64-neonfma-prfm-cortex-a75.S", "XNNPACK/wrappers/f32-gemm/gen-inc/6x8inc-minmax-aarch64-neonfma-cortex-a53.S", "XNNPACK/wrappers/f32-gemm/gen-inc/6x8inc-minmax-aarch64-neonfma-cortex-a55.S", "XNNPACK/wrappers/f32-gemm/gen-inc/6x8inc-minmax-aarch64-neonfma-cortex-a73.S", "XNNPACK/wrappers/f32-gemm/gen-inc/6x8inc-minmax-aarch64-neonfma-cortex-a75.S", "XNNPACK/wrappers/f32-gemm/gen-inc/6x8inc-minmax-aarch64-neonfma-ld64.S", 
"XNNPACK/wrappers/f32-gemm/gen-inc/6x8inc-minmax-aarch64-neonfma-ld128.S", "XNNPACK/wrappers/f32-gemm/gen-inc/6x8inc-minmax-aarch64-neonfma-prfm-cortex-a75.S", "XNNPACK/wrappers/f32-gemm/gen/1x8-minmax-aarch64-neonfma-cortex-a53.S", "XNNPACK/wrappers/f32-gemm/gen/1x8-minmax-aarch64-neonfma-cortex-a75.S", "XNNPACK/wrappers/f32-gemm/gen/1x8-minmax-aarch64-neonfma-ld64.S", "XNNPACK/wrappers/f32-gemm/gen/1x8-minmax-aarch64-neonfma-prfm-cortex-a75.S", "XNNPACK/wrappers/f32-gemm/gen/1x12-minmax-aarch64-neonfma-cortex-a53.S", "XNNPACK/wrappers/f32-gemm/gen/4x8-minmax-aarch64-neonfma-cortex-a53.S", "XNNPACK/wrappers/f32-gemm/gen/4x8-minmax-aarch64-neonfma-cortex-a55.S", "XNNPACK/wrappers/f32-gemm/gen/4x8-minmax-aarch64-neonfma-cortex-a75.S", "XNNPACK/wrappers/f32-gemm/gen/4x8-minmax-aarch64-neonfma-ld64.S", "XNNPACK/wrappers/f32-gemm/gen/4x8-minmax-aarch64-neonfma-ld128.S", "XNNPACK/wrappers/f32-gemm/gen/4x8-minmax-aarch64-neonfma-prfm-cortex-a53.S", "XNNPACK/wrappers/f32-gemm/gen/4x8-minmax-aarch64-neonfma-prfm-cortex-a75.S", "XNNPACK/wrappers/f32-gemm/gen/4x12-minmax-aarch64-neonfma-cortex-a53.S", "XNNPACK/wrappers/f32-gemm/gen/5x8-minmax-aarch64-neonfma-cortex-a75.S", "XNNPACK/wrappers/f32-gemm/gen/5x8-minmax-aarch64-neonfma-prfm-cortex-a75.S", "XNNPACK/wrappers/f32-gemm/gen/6x8-minmax-aarch64-neonfma-cortex-a53.S", "XNNPACK/wrappers/f32-gemm/gen/6x8-minmax-aarch64-neonfma-cortex-a55.S", "XNNPACK/wrappers/f32-gemm/gen/6x8-minmax-aarch64-neonfma-cortex-a73.S", "XNNPACK/wrappers/f32-gemm/gen/6x8-minmax-aarch64-neonfma-cortex-a75.S", "XNNPACK/wrappers/f32-gemm/gen/6x8-minmax-aarch64-neonfma-ld64.S", "XNNPACK/wrappers/f32-gemm/gen/6x8-minmax-aarch64-neonfma-ld128.S", "XNNPACK/wrappers/f32-gemm/gen/6x8-minmax-aarch64-neonfma-prfm-cortex-a53.S", "XNNPACK/wrappers/f32-gemm/gen/6x8-minmax-aarch64-neonfma-prfm-cortex-a75.S", "XNNPACK/wrappers/f32-igemm/gen/1x8-minmax-aarch64-neonfma-cortex-a75.S", "XNNPACK/wrappers/f32-igemm/gen/1x8-minmax-aarch64-neonfma-prfm-cortex-a75.S", "XNNPACK/wrappers/f32-igemm/gen/4x8-minmax-aarch64-neonfma-cortex-a53.S", "XNNPACK/wrappers/f32-igemm/gen/4x8-minmax-aarch64-neonfma-cortex-a75.S", "XNNPACK/wrappers/f32-igemm/gen/4x8-minmax-aarch64-neonfma-ld64.S", "XNNPACK/wrappers/f32-igemm/gen/4x8-minmax-aarch64-neonfma-ld128.S", "XNNPACK/wrappers/f32-igemm/gen/4x8-minmax-aarch64-neonfma-prfm-cortex-a53.S", "XNNPACK/wrappers/f32-igemm/gen/4x8-minmax-aarch64-neonfma-prfm-cortex-a75.S", "XNNPACK/wrappers/f32-igemm/gen/5x8-minmax-aarch64-neonfma-cortex-a75.S", "XNNPACK/wrappers/f32-igemm/gen/5x8-minmax-aarch64-neonfma-prfm-cortex-a75.S", "XNNPACK/wrappers/f32-igemm/gen/6x8-minmax-aarch64-neonfma-cortex-a53.S", "XNNPACK/wrappers/f32-igemm/gen/6x8-minmax-aarch64-neonfma-cortex-a75.S", "XNNPACK/wrappers/f32-igemm/gen/6x8-minmax-aarch64-neonfma-ld64.S", "XNNPACK/wrappers/f32-igemm/gen/6x8-minmax-aarch64-neonfma-ld128.S", "XNNPACK/wrappers/f32-igemm/gen/6x8-minmax-aarch64-neonfma-prfm-cortex-a53.S", "XNNPACK/wrappers/f32-igemm/gen/6x8-minmax-aarch64-neonfma-prfm-cortex-a75.S", "XNNPACK/wrappers/f32-igemm/1x8-minmax-aarch64-neonfma-cortex-a53.S", "XNNPACK/wrappers/f32-igemm/1x12-minmax-aarch64-neonfma-cortex-a53.S", "XNNPACK/wrappers/f32-igemm/4x8-minmax-aarch64-neonfma-cortex-a55.S", "XNNPACK/wrappers/f32-igemm/4x12-minmax-aarch64-neonfma-cortex-a53.S", "XNNPACK/wrappers/f32-igemm/6x8-minmax-aarch64-neonfma-cortex-a55.S", "XNNPACK/wrappers/f32-igemm/6x8-minmax-aarch64-neonfma-cortex-a73.S", "XNNPACK/wrappers/qc8-gemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-cortex-a53.S", 
"XNNPACK/wrappers/qc8-gemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-prfm-cortex-a53.S", "XNNPACK/wrappers/qc8-gemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-prfm.S", "XNNPACK/wrappers/qc8-gemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal.S", "XNNPACK/wrappers/qc8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld32.S", "XNNPACK/wrappers/qc8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld64.S", "XNNPACK/wrappers/qc8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-cortex-a53.S", "XNNPACK/wrappers/qc8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-prfm-cortex-a53.S", "XNNPACK/wrappers/qc8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-prfm.S", "XNNPACK/wrappers/qc8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal.S", "XNNPACK/wrappers/qc8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mull.S", "XNNPACK/wrappers/qc8-gemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal.S", "XNNPACK/wrappers/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S", "XNNPACK/wrappers/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S", "XNNPACK/wrappers/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S", "XNNPACK/wrappers/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S", "XNNPACK/wrappers/qc8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S", "XNNPACK/wrappers/qc8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld32.S", "XNNPACK/wrappers/qc8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S", "XNNPACK/wrappers/qc8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S", "XNNPACK/wrappers/qc8-igemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-cortex-a53.S", "XNNPACK/wrappers/qc8-igemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-prfm-cortex-a53.S", "XNNPACK/wrappers/qc8-igemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-prfm.S", "XNNPACK/wrappers/qc8-igemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal.S", "XNNPACK/wrappers/qc8-igemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-cortex-a53.S", "XNNPACK/wrappers/qc8-igemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-prfm-cortex-a53.S", "XNNPACK/wrappers/qc8-igemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-prfm.S", "XNNPACK/wrappers/qc8-igemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal.S", "XNNPACK/wrappers/qc8-igemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal.S", "XNNPACK/wrappers/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S", "XNNPACK/wrappers/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S", "XNNPACK/wrappers/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S", "XNNPACK/wrappers/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S", "XNNPACK/wrappers/qc8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S", "XNNPACK/wrappers/qc8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S", "XNNPACK/wrappers/qc8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S", "XNNPACK/wrappers/qs8-gemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-cortex-a53.S", "XNNPACK/wrappers/qs8-gemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-prfm-cortex-a53.S", "XNNPACK/wrappers/qs8-gemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-prfm.S", "XNNPACK/wrappers/qs8-gemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal.S", "XNNPACK/wrappers/qs8-gemm/gen/1x8c8-minmax-rndnu-aarch64-neon-mlal-cortex-a53.S", "XNNPACK/wrappers/qs8-gemm/gen/1x8c8-minmax-rndnu-aarch64-neon-mlal-prfm-cortex-a53.S", "XNNPACK/wrappers/qs8-gemm/gen/1x8c8-minmax-rndnu-aarch64-neon-mlal-prfm.S", "XNNPACK/wrappers/qs8-gemm/gen/1x8c8-minmax-rndnu-aarch64-neon-mlal.S", "XNNPACK/wrappers/qs8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld32.S", 
"XNNPACK/wrappers/qs8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld64.S", "XNNPACK/wrappers/qs8-gemm/gen/1x16c4-minmax-rndnu-aarch64-neondot-ld32.S", "XNNPACK/wrappers/qs8-gemm/gen/1x16c4-minmax-rndnu-aarch64-neondot-ld64.S", "XNNPACK/wrappers/qs8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-cortex-a53.S", "XNNPACK/wrappers/qs8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-prfm-cortex-a53.S", "XNNPACK/wrappers/qs8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-prfm.S", "XNNPACK/wrappers/qs8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal.S", "XNNPACK/wrappers/qs8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mull.S", "XNNPACK/wrappers/qs8-gemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mlal-cortex-a53.S", "XNNPACK/wrappers/qs8-gemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mlal-prfm-cortex-a53.S", "XNNPACK/wrappers/qs8-gemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mlal-prfm.S", "XNNPACK/wrappers/qs8-gemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mlal.S", "XNNPACK/wrappers/qs8-gemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mull.S", "XNNPACK/wrappers/qs8-gemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal.S", "XNNPACK/wrappers/qs8-gemm/gen/2x8c16-minmax-rndnu-aarch64-neon-mlal.S", "XNNPACK/wrappers/qs8-gemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S", "XNNPACK/wrappers/qs8-gemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S", "XNNPACK/wrappers/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S", "XNNPACK/wrappers/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S", "XNNPACK/wrappers/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S", "XNNPACK/wrappers/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S", "XNNPACK/wrappers/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a53.S", "XNNPACK/wrappers/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S", "XNNPACK/wrappers/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a53.S", "XNNPACK/wrappers/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S", "XNNPACK/wrappers/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S", "XNNPACK/wrappers/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld32.S", "XNNPACK/wrappers/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S", "XNNPACK/wrappers/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S", "XNNPACK/wrappers/qs8-gemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-cortex-a55.S", "XNNPACK/wrappers/qs8-gemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld32.S", "XNNPACK/wrappers/qs8-gemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld64.S", "XNNPACK/wrappers/qs8-gemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld128.S", "XNNPACK/wrappers/qs8-igemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-cortex-a53.S", "XNNPACK/wrappers/qs8-igemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-prfm-cortex-a53.S", "XNNPACK/wrappers/qs8-igemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-prfm.S", "XNNPACK/wrappers/qs8-igemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal.S", "XNNPACK/wrappers/qs8-igemm/gen/1x8c8-minmax-rndnu-aarch64-neon-mlal-cortex-a53.S", "XNNPACK/wrappers/qs8-igemm/gen/1x8c8-minmax-rndnu-aarch64-neon-mlal-prfm-cortex-a53.S", "XNNPACK/wrappers/qs8-igemm/gen/1x8c8-minmax-rndnu-aarch64-neon-mlal-prfm.S", "XNNPACK/wrappers/qs8-igemm/gen/1x8c8-minmax-rndnu-aarch64-neon-mlal.S", "XNNPACK/wrappers/qs8-igemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-cortex-a53.S", "XNNPACK/wrappers/qs8-igemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-prfm-cortex-a53.S", "XNNPACK/wrappers/qs8-igemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-prfm.S", 
"XNNPACK/wrappers/qs8-igemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal.S", "XNNPACK/wrappers/qs8-igemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mlal-cortex-a53.S", "XNNPACK/wrappers/qs8-igemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mlal-prfm-cortex-a53.S", "XNNPACK/wrappers/qs8-igemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mlal-prfm.S", "XNNPACK/wrappers/qs8-igemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mlal.S", "XNNPACK/wrappers/qs8-igemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal.S", "XNNPACK/wrappers/qs8-igemm/gen/2x8c16-minmax-rndnu-aarch64-neon-mlal.S", "XNNPACK/wrappers/qs8-igemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S", "XNNPACK/wrappers/qs8-igemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S", "XNNPACK/wrappers/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S", "XNNPACK/wrappers/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S", "XNNPACK/wrappers/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S", "XNNPACK/wrappers/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S", "XNNPACK/wrappers/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a53.S", "XNNPACK/wrappers/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S", "XNNPACK/wrappers/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a53.S", "XNNPACK/wrappers/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S", "XNNPACK/wrappers/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S", "XNNPACK/wrappers/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S", "XNNPACK/wrappers/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S", "XNNPACK/wrappers/qs8-igemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-cortex-a55.S", "XNNPACK/wrappers/qs8-igemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld64.S", "XNNPACK/wrappers/qs8-igemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld128.S", "XNNPACK/wrappers/qu8-gemm/gen/4x8c4-minmax-rndnu-aarch64-neondot-cortex-a55.S", "XNNPACK/wrappers/qu8-gemm/gen/4x8c4-minmax-rndnu-aarch64-neondot-ld128.S", "XNNPACK/wrappers/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a53.S", "XNNPACK/wrappers/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a75.S", "XNNPACK/wrappers/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S", "XNNPACK/wrappers/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a53.S", "XNNPACK/wrappers/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a75.S", "XNNPACK/wrappers/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S", "XNNPACK/wrappers/qu8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S", "XNNPACK/wrappers/qu8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S", "XNNPACK/wrappers/qu8-gemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-cortex-a55.S", "XNNPACK/wrappers/qu8-gemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld128.S", "XNNPACK/wrappers/qu8-igemm/gen/4x8c4-minmax-rndnu-aarch64-neondot-cortex-a55.S", "XNNPACK/wrappers/qu8-igemm/gen/4x8c4-minmax-rndnu-aarch64-neondot-ld128.S", "XNNPACK/wrappers/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a53.S", "XNNPACK/wrappers/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a75.S", "XNNPACK/wrappers/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S", "XNNPACK/wrappers/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a53.S", "XNNPACK/wrappers/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a75.S", 
"XNNPACK/wrappers/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S", "XNNPACK/wrappers/qu8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S", "XNNPACK/wrappers/qu8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S", "XNNPACK/wrappers/qu8-igemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-cortex-a55.S", "XNNPACK/wrappers/qu8-igemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld128.S"], + deps = [":interface", ":jit_memory", "//third_party:FP16"], + exported_deps = [], + compiler_flags = ["-w", "-O2"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["(aarch64|arm64)", ["-march=armv8.2-a+fp16+dotprod"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) diff --git a/tools/README.md b/tools/README.md index e4aba38afd85..94b61eecce82 100644 --- a/tools/README.md +++ b/tools/README.md @@ -33,41 +33,14 @@ Build system pieces: Developer tools which you might find useful: -* [linter/clang_tidy](linter/clang_tidy/__main__.py) - Script for running clang-tidy - on lines of your script which you changed. -* [extract_scripts.py](extract_scripts.py) - Extract scripts from - `.github/workflows/*.yml` into a specified dir, on which linters such as - [linter/run_shellcheck.sh](linter/run_shellcheck.sh) can be run. Assumes that every `run` - script has `shell: bash` unless a different shell is explicitly listed on that - specific step (so `defaults` doesn't currently work), but also has some rules - for other situations such as [actions/github-script][]. Exits with nonzero - status if any of the extracted scripts contain [GitHub Actions expressions][]: - `${{ }}` * [git_add_generated_dirs.sh](git_add_generated_dirs.sh) and [git_reset_generated_dirs.sh](git_reset_generated_dirs.sh) - Use this to force add generated files to your Git index, so that you can conveniently run diffs on them when working on code-generation. (See also [generated_dirs.txt](generated_dirs.txt) which specifies the list of directories with generated files.) -* [linter/mypy_wrapper.py](linter/mypy_wrapper.py) - Run `mypy` on a single file using the - appropriate subset of our `mypy*.ini` configs. -* [linter/run_shellcheck.sh](linter/run_shellcheck.sh) - Find `*.sh` files (recursively) in - the directories specified as arguments, and run [ShellCheck][] on all of them. * [stats/test_history.py](stats/test_history.py) - Query S3 to display history of a single test across multiple jobs over time. -* [linter/trailing_newlines.py](linter/trailing_newlines.py) - Take names of UTF-8 files from - stdin, print names of nonempty files whose contents don't end in exactly one - trailing newline, exit with status 1 if no output printed or 0 if some - filenames were printed. -* [linter/translate_annotations.py](linter/translate_annotations.py) - Read [Flake8][] or - [clang-tidy][] warnings (according to a `--regex`) from a `--file`, convert to - the JSON format accepted by [pytorch/add-annotations-github-action], and - translate line numbers from `HEAD` back in time to the given `--commit` by - running `git diff-index --unified=0` appropriately. 
-* [vscode_settings.py](vscode_settings.py) - Merge - `.vscode/settings_recommended.json` into your workspace-local - `.vscode/settings.json`, preferring the former in case of conflicts but - otherwise preserving the latter as much as possible. Important if you want to run on AMD GPU: diff --git a/tools/actions_local_runner.py b/tools/actions_local_runner.py deleted file mode 100755 index 050905934133..000000000000 --- a/tools/actions_local_runner.py +++ /dev/null @@ -1,440 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -import subprocess -import sys -import os -import argparse -import yaml -import asyncio -import shutil -import re -import fnmatch -import shlex -import configparser - -from typing import List, Dict, Any, Optional, Union, NamedTuple, Set - -REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - - -class col: - HEADER = "\033[95m" - BLUE = "\033[94m" - GREEN = "\033[92m" - YELLOW = "\033[93m" - RED = "\033[91m" - RESET = "\033[0m" - BOLD = "\033[1m" - UNDERLINE = "\033[4m" - - -def should_color() -> bool: - return hasattr(sys.stdout, "isatty") and sys.stdout.isatty() - - -def color(the_color: str, text: str) -> str: - if should_color(): - return col.BOLD + the_color + str(text) + col.RESET - else: - return text - - -def cprint(the_color: str, text: str) -> None: - if should_color(): - print(color(the_color, text)) - else: - print(text) - - -def git(args: List[str]) -> List[str]: - p = subprocess.run( - ["git"] + args, - cwd=REPO_ROOT, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - check=True, - ) - lines = p.stdout.decode().strip().split("\n") - return [line.strip() for line in lines] - - -def find_changed_files() -> List[str]: - untracked = [] - - for line in git(["status", "--porcelain"]): - # Untracked files start with ??, so grab all of those - if line.startswith("?? "): - untracked.append(line.replace("?? 
", "")) - - # Modified, unstaged - modified = git(["diff", "--name-only"]) - - # Modified, staged - cached = git(["diff", "--cached", "--name-only"]) - - # Committed - merge_base = git(["merge-base", "origin/master", "HEAD"])[0] - diff_with_origin = git(["diff", "--name-only", merge_base, "HEAD"]) - - # De-duplicate - all_files = set() - for x in untracked + cached + modified + diff_with_origin: - stripped = x.strip() - if stripped != "" and os.path.exists(stripped): - all_files.add(stripped) - return list(all_files) - - -def print_results(job_name: str, passed: bool, streams: List[str]) -> None: - icon = color(col.GREEN, "✓") if passed else color(col.RED, "x") - print(f"{icon} {color(col.BLUE, job_name)}") - - for stream in streams: - stream = stream.strip() - if stream != "": - print(stream) - - -class CommandResult(NamedTuple): - passed: bool - stdout: str - stderr: str - - -async def shell_cmd( - cmd: Union[str, List[str]], - env: Optional[Dict[str, Any]] = None, - redirect: bool = True, -) -> CommandResult: - if isinstance(cmd, list): - cmd_str = " ".join(shlex.quote(arg) for arg in cmd) - else: - cmd_str = cmd - - proc = await asyncio.create_subprocess_shell( - cmd_str, - shell=True, - cwd=REPO_ROOT, - env=env, - stdout=subprocess.PIPE if redirect else None, - stderr=subprocess.PIPE if redirect else None, - executable=shutil.which("bash"), - ) - stdout, stderr = await proc.communicate() - - passed = proc.returncode == 0 - if not redirect: - return CommandResult(passed, "", "") - - return CommandResult(passed, stdout.decode().strip(), stderr.decode().strip()) - - -class Check: - name: str - - def __init__(self, files: Optional[List[str]], quiet: bool): - self.quiet = quiet - self.files = files - - async def run(self) -> bool: - result = await self.run_helper() - if result is None: - return True - - streams = [] - if not result.passed: - streams = [ - result.stderr, - result.stdout, - ] - print_results(self.name, result.passed, streams) - return result.passed - - async def run_helper(self) -> Optional[CommandResult]: - if self.files is not None: - relevant_files = self.filter_files(self.files) - if len(relevant_files) == 0: - # No files, do nothing - return CommandResult(passed=True, stdout="", stderr="") - - return await self.quick(relevant_files) - - return await self.full() - - def filter_ext(self, files: List[str], extensions: Set[str]) -> List[str]: - def passes(filename: str) -> bool: - return os.path.splitext(filename)[1] in extensions - - return [f for f in files if passes(f)] - - def filter_files(self, files: List[str]) -> List[str]: - return files - - async def quick(self, files: List[str]) -> CommandResult: - raise NotImplementedError - - async def full(self) -> Optional[CommandResult]: - raise NotImplementedError - - -class Flake8(Check): - name = "flake8" - - def filter_files(self, files: List[str]) -> List[str]: - config = configparser.ConfigParser() - config.read(os.path.join(REPO_ROOT, ".flake8")) - - excludes = re.split(r",\s*", config["flake8"]["exclude"].strip()) - excludes = [e.strip() for e in excludes if e.strip() != ""] - - def should_include(name: str) -> bool: - for exclude in excludes: - if fnmatch.fnmatch(name, pat=exclude): - return False - if name.startswith(exclude) or f"./{name}".startswith(exclude): - return False - return True - - files = self.filter_ext(files, {".py"}) - return [f for f in files if should_include(f)] - - async def quick(self, files: List[str]) -> CommandResult: - return await shell_cmd(["flake8"] + files) - - async def full(self) -> 
CommandResult: - return await shell_cmd(["flake8"]) - - -class Mypy(Check): - name = "mypy (skipped typestub generation)" - - def filter_files(self, files: List[str]) -> List[str]: - return self.filter_ext(files, {".py", ".pyi"}) - - def env(self) -> Dict[str, Any]: - env = os.environ.copy() - if should_color(): - # Secret env variable: https://github.com/python/mypy/issues/7771 - env["MYPY_FORCE_COLOR"] = "1" - return env - - async def quick(self, files: List[str]) -> CommandResult: - return await shell_cmd( - [sys.executable, "tools/linter/mypy_wrapper.py"] - + [os.path.join(REPO_ROOT, f) for f in files], - env=self.env(), - ) - - async def full(self) -> None: - env = self.env() - # hackily change the name - self.name = "mypy" - - await shell_cmd( - [ - sys.executable, - "tools/actions_local_runner.py", - "--job", - "mypy", - "--file", - ".github/workflows/lint.yml", - "--step", - "Run autogen", - ], - redirect=False, - env=env, - ) - - await shell_cmd( - [ - sys.executable, - "tools/actions_local_runner.py", - "--job", - "mypy", - "--file", - ".github/workflows/lint.yml", - "--step", - "Run mypy", - ], - redirect=False, - env=env, - ) - - -class ShellCheck(Check): - name = "shellcheck: Run ShellCheck" - - def filter_files(self, files: List[str]) -> List[str]: - return self.filter_ext(files, {".sh"}) - - async def quick(self, files: List[str]) -> CommandResult: - return await shell_cmd( - ["tools/linter/run_shellcheck.sh"] - + [os.path.join(REPO_ROOT, f) for f in files], - ) - - async def full(self) -> None: - await shell_cmd( - [ - sys.executable, - "tools/actions_local_runner.py", - "--job", - "shellcheck", - "--file", - ".github/workflows/lint.yml", - "--step", - "Run ShellCheck", - ], - redirect=False, - ) - - -class ClangTidy(Check): - name = "clang-tidy: Run clang-tidy" - common_options = [ - "--clang-tidy-exe", - ".clang-tidy-bin/clang-tidy", - ] - - def filter_files(self, files: List[str]) -> List[str]: - return self.filter_ext(files, {".c", ".cc", ".cpp"}) - - async def quick(self, files: List[str]) -> CommandResult: - return await shell_cmd( - [sys.executable, "-m", "tools.linter.clang_tidy", "--paths"] - + [os.path.join(REPO_ROOT, f) for f in files] - + self.common_options, - ) - - async def full(self) -> None: - await shell_cmd( - [sys.executable, "-m", "tools.linter.clang_tidy"] + self.common_options, - redirect=False, - ) - - -class YamlStep(Check): - def __init__(self, step: Dict[str, Any], job_name: str, quiet: bool): - super().__init__(files=None, quiet=quiet) - self.step = step - self.name = f'{job_name}: {self.step["name"]}' - - async def full(self) -> CommandResult: - env = os.environ.copy() - env["GITHUB_WORKSPACE"] = "/tmp" - script = self.step["run"] - - if self.quiet: - # TODO: Either lint that GHA scripts only use 'set -eux' or make this more - # resilient - script = script.replace("set -eux", "set -eu") - script = re.sub(r"^time ", "", script, flags=re.MULTILINE) - - return await shell_cmd(script, env=env) - - -def changed_files() -> Optional[List[str]]: - changed_files: Optional[List[str]] = None - try: - changed_files = sorted(find_changed_files()) - except Exception: - # If the git commands failed for some reason, bail out and use the whole list - print( - "Could not query git for changed files, falling back to testing all files instead", - file=sys.stderr, - ) - return None - - return changed_files - - -def grab_specific_steps( - steps_to_grab: List[str], job: Dict[str, Any] -) -> List[Dict[str, Any]]: - relevant_steps = [] - for step in steps_to_grab: - for 
actual_step in job["steps"]: - if actual_step["name"].lower().strip() == step.lower().strip(): - relevant_steps.append(actual_step) - break - - if len(relevant_steps) != len(steps_to_grab): - raise RuntimeError(f"Missing steps:\n{relevant_steps}\n{steps_to_grab}") - - return relevant_steps - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Pull shell scripts out of GitHub actions and run them" - ) - parser.add_argument("--file", help="YAML file with actions") - parser.add_argument( - "--changed-only", - help="only run on changed files", - action="store_true", - default=False, - ) - parser.add_argument("--job", help="job name", required=True) - parser.add_argument( - "--no-quiet", help="output commands", action="store_true", default=False - ) - parser.add_argument("--step", action="append", help="steps to run (in order)") - args = parser.parse_args() - - quiet = not args.no_quiet - - if args.file is None: - # If there is no .yml file provided, fall back to the list of known - # jobs. We use this for flake8 and mypy since they run different - # locally than in CI due to 'make quicklint' - if args.job not in ad_hoc_steps: - raise RuntimeError( - f"Job {args.job} not found and no .yml file was provided" - ) - - files = None - if args.changed_only: - files = changed_files() - - checks = [ad_hoc_steps[args.job](files, quiet)] - else: - if args.step is None: - raise RuntimeError("1+ --steps must be provided") - - action = yaml.safe_load(open(args.file, "r")) - if "jobs" not in action: - raise RuntimeError(f"top level key 'jobs' not found in {args.file}") - jobs = action["jobs"] - - if args.job not in jobs: - raise RuntimeError(f"job '{args.job}' not found in {args.file}") - - job = jobs[args.job] - - # Pull the relevant sections out of the provided .yml file and run them - relevant_steps = grab_specific_steps(args.step, job) - checks = [ - YamlStep(step=step, job_name=args.job, quiet=quiet) - for step in relevant_steps - ] - - loop = asyncio.get_event_loop() - loop.run_until_complete(asyncio.gather(*[check.run() for check in checks])) - - -# These are run differently locally in order to enable quicklint, so dispatch -# out to special handlers instead of using lint.yml -ad_hoc_steps = { - "mypy": Mypy, - "flake8-py3": Flake8, - "shellcheck": ShellCheck, - "clang-tidy": ClangTidy, -} - -if __name__ == "__main__": - try: - main() - except KeyboardInterrupt: - pass diff --git a/tools/amd_build/build_amd.py b/tools/amd_build/build_amd.py index 38698631c03c..ca41b17a43e7 100755 --- a/tools/amd_build/build_amd.py +++ b/tools/amd_build/build_amd.py @@ -4,43 +4,50 @@ import os import argparse import sys -sys.path.append(os.path.realpath(os.path.join( - __file__, - os.path.pardir, - os.path.pardir, - os.path.pardir, - 'torch', - 'utils'))) + +sys.path.append( + os.path.realpath( + os.path.join( + __file__, os.path.pardir, os.path.pardir, os.path.pardir, "torch", "utils" + ) + ) +) from hipify import hipify_python # type: ignore[import] -parser = argparse.ArgumentParser(description='Top-level script for HIPifying, filling in most common parameters') +parser = argparse.ArgumentParser( + description="Top-level script for HIPifying, filling in most common parameters" +) parser.add_argument( - '--out-of-place-only', - action='store_true', - help="Whether to only run hipify out-of-place on source files") + "--out-of-place-only", + action="store_true", + help="Whether to only run hipify out-of-place on source files", +) parser.add_argument( - '--project-directory', + 
"--project-directory", type=str, - default='', + default="", help="The root of the project.", - required=False) + required=False, +) parser.add_argument( - '--output-directory', + "--output-directory", type=str, - default='', + default="", help="The directory to store the hipified project", - required=False) + required=False, +) parser.add_argument( - '--extra-include-dir', + "--extra-include-dir", type=str, default=[], - nargs='+', + nargs="+", help="The list of extra directories in caffe2 to hipify", - required=False) + required=False, +) args = parser.parse_args() @@ -78,8 +85,11 @@ "aten/src/ATen/cuda/*", "aten/src/ATen/native/cuda/*", "aten/src/ATen/native/cudnn/*", + "aten/src/ATen/native/quantized/cudnn/*", + "aten/src/ATen/native/nested/cuda/*", "aten/src/ATen/native/sparse/cuda/*", "aten/src/ATen/native/quantized/cuda/*", + "aten/src/ATen/native/transformers/cuda/*", "aten/src/THC/*", "aten/src/ATen/test/*", # CMakeLists.txt isn't processed by default, but there are a few @@ -89,16 +99,18 @@ "tools/autograd/templates/python_variable_methods.cpp", ] +includes = [os.path.join(proj_dir, include) for include in includes] + for new_dir in args.extra_include_dir: abs_new_dir = os.path.join(proj_dir, new_dir) if os.path.exists(abs_new_dir): - new_dir = os.path.join(new_dir, '**/*') - includes.append(new_dir) + abs_new_dir = os.path.join(abs_new_dir, "**/*") + includes.append(abs_new_dir) ignores = [ "caffe2/operators/depthwise_3x3_conv_op_cudnn.cu", "caffe2/operators/pool_op_cudnn.cu", - '*/hip/*', + "*/hip/*", # These files are compatible with both cuda and hip "aten/src/ATen/core/*", "torch/csrc/jit/codegen/cuda/codegen.cpp", @@ -112,15 +124,18 @@ "torch/include/*", ] +ignores = [os.path.join(proj_dir, ignore) for ignore in ignores] + # Check if the compiler is hip-clang. def is_hip_clang() -> bool: try: - hip_path = os.getenv('HIP_PATH', '/opt/rocm/hip') - with open(hip_path + '/lib/.hipInfo') as f: - return 'HIP_COMPILER=clang' in f.read() + hip_path = os.getenv("HIP_PATH", "/opt/rocm/hip") + with open(hip_path + "/lib/.hipInfo") as f: + return "HIP_COMPILER=clang" in f.read() except IOError: return False + # TODO Remove once gloo submodule is recent enough to contain upstream fix. 
if is_hip_clang(): gloo_cmake_file = "third_party/gloo/cmake/Hip.cmake" @@ -128,7 +143,7 @@ def is_hip_clang() -> bool: if os.path.exists(gloo_cmake_file): with open(gloo_cmake_file, "r") as sources: lines = sources.readlines() - newlines = [line.replace(' hip_hcc ', ' amdhip64 ') for line in lines] + newlines = [line.replace(" hip_hcc ", " amdhip64 ") for line in lines] if lines == newlines: print("%s skipped" % gloo_cmake_file) else: @@ -142,7 +157,7 @@ def is_hip_clang() -> bool: do_write = False with open(gloo_cmake_file, "r") as sources: lines = sources.readlines() - newlines = [line.replace('RCCL_LIBRARY', 'RCCL_LIBRARY_PATH') for line in lines] + newlines = [line.replace("RCCL_LIBRARY", "RCCL_LIBRARY_PATH") for line in lines] if lines == newlines: print("%s skipped" % gloo_cmake_file) else: @@ -158,7 +173,7 @@ def is_hip_clang() -> bool: if os.path.exists(gloo_cmake_file): with open(gloo_cmake_file, "r") as sources: lines = sources.readlines() - newlines = [line.replace('HIP_HCC_FLAGS', 'HIP_CLANG_FLAGS') for line in lines] + newlines = [line.replace("HIP_HCC_FLAGS", "HIP_CLANG_FLAGS") for line in lines] if lines == newlines: print("%s skipped" % gloo_cmake_file) else: @@ -173,4 +188,5 @@ def is_hip_clang() -> bool: includes=includes, ignores=ignores, out_of_place_only=args.out_of_place_only, - hip_clang_launch=is_hip_clang()) + hip_clang_launch=is_hip_clang(), +) diff --git a/tools/autograd/BUILD.bazel b/tools/autograd/BUILD.bazel new file mode 100644 index 000000000000..d1a0db360d23 --- /dev/null +++ b/tools/autograd/BUILD.bazel @@ -0,0 +1,4 @@ +load("//:tools/bazel.bzl", "rules") +load(":build.bzl", "define_targets") + +define_targets(rules = rules) diff --git a/tools/autograd/BUILD.buck b/tools/autograd/BUILD.buck new file mode 100644 index 000000000000..aedc8fa342b4 --- /dev/null +++ b/tools/autograd/BUILD.buck @@ -0,0 +1,34 @@ +python_library( + name = "autograd", + srcs = glob( + ["*.py"], + ), + base_module = "tools.autograd", + resources = [ + "deprecated.yaml", + "derivatives.yaml", + "templates/ADInplaceOrViewType.cpp", + "templates/Functions.cpp", + "templates/Functions.h", + "templates/TraceType.cpp", + "templates/VariableType.cpp", + "templates/VariableType.h", + "templates/annotated_fn_args.py.in", + "templates/python_fft_functions.cpp", + "templates/python_functions.cpp", + "templates/python_functions.h", + "templates/python_linalg_functions.cpp", + "templates/python_nn_functions.cpp", + "templates/python_return_types.cpp", + "templates/python_sparse_functions.cpp", + "templates/python_special_functions.cpp", + "templates/python_torch_functions.cpp", + "templates/python_variable_methods.cpp", + "templates/variable_factories.h", + ], + visibility = ["PUBLIC"], + deps = [ + "//third_party:pyyaml", + "//torchgen:torchgen", + ], +) diff --git a/tools/autograd/build.bzl b/tools/autograd/build.bzl new file mode 100644 index 000000000000..a21ca870708c --- /dev/null +++ b/tools/autograd/build.bzl @@ -0,0 +1,14 @@ +def define_targets(rules): + rules.py_library( + name = "autograd", + srcs = rules.glob(["*.py"]), + data = rules.glob([ + "*.yaml", + "templates/*", + ]), + visibility = ["//:__subpackages__"], + deps = [ + rules.requirement("PyYAML"), + "//torchgen:torchgen", + ], + ) diff --git a/tools/autograd/context.py b/tools/autograd/context.py index 66f4f81aa0fb..af1a6025ed8d 100644 --- a/tools/autograd/context.py +++ b/tools/autograd/context.py @@ -1,15 +1,18 @@ -from tools.codegen.api.autograd import NativeFunctionWithDifferentiabilityInfo as NFWDI -from 
tools.codegen.context import native_function_manager -from tools.codegen.utils import T +from torchgen.api.autograd import NativeFunctionWithDifferentiabilityInfo as NFWDI +from torchgen.context import native_function_manager +from torchgen.utils import T import functools from typing import Callable # Like tools.api.context.with_native_function, but for # NativeFunctionWithDifferentiabilityInfo. -def with_native_function_with_differentiability_info(func: Callable[[NFWDI], T]) -> Callable[[NFWDI], T]: +def with_native_function_with_differentiability_info( + func: Callable[[NFWDI], T] +) -> Callable[[NFWDI], T]: @functools.wraps(func) def wrapper(f: NFWDI) -> T: with native_function_manager(f.func): return func(f) + return wrapper diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 582ba69c3623..75aec440808e 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -230,14 +230,14 @@ - name: addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor self: maybe_multiply(grad, beta.conj()) - mat1: mm_mat1_backward(grad, mat2, mat1.sizes(), mat1.strides(), alpha) - mat2: mm_mat2_backward(grad, mat1, mat2.sizes(), mat2.strides(), alpha) + mat1: mm_mat1_backward(grad, mat2, mat1.sizes(), mat1.strides(), mat1.layout(), alpha) + mat2: mm_mat2_backward(grad, mat1, mat2.sizes(), mat2.strides(), mat2.layout(), alpha) result: maybe_multiply(self_t, beta) + maybe_multiply(mat1_t.mm(mat2_p), alpha) + maybe_multiply(mat1_p.mm(mat2_t), alpha) -- name: _sparse_addmm(Tensor self, Tensor sparse, Tensor dense, *, Scalar beta=1, Scalar alpha=1) -> Tensor +- name: _sparse_addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor self: maybe_multiply(grad, beta) - sparse: _sparse_addmm_sparse_backward(grad, sparse, dense, alpha) - dense: mm_mat2_backward(grad, sparse, dense.sizes(), dense.strides(), alpha) + mat1: mm_mat1_sparse_backward(grad, mat1, mat2, alpha) + mat2: mm_mat2_backward(grad, mat1, mat2.sizes(), mat2.strides(), mat2.layout(), alpha) - name: addmv(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor self: maybe_multiply(grad, beta.conj()) @@ -260,7 +260,7 @@ - name: angle(Tensor self) -> Tensor self: angle_backward(grad, self) - result: handle_r_to_c(result.scalar_type(), angle_backward(self_t, self_p)) + result: handle_r_to_c(result.scalar_type(), angle_backward(self_t.conj(), self_p).conj()) # The four items below are necessary because TensorIterator doesn't work on # Variables (codegen does not unwrap the input Tensor for all() and any() ). @@ -315,6 +315,7 @@ - name: atan2(Tensor self, Tensor other) -> Tensor self, other: atan2_backward(grad, self, other, grad_input_mask) + result: (-self_p * other_t + other_p * self_t) / (self_p.pow(2) + other_p.pow(2)) - name: baddbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor self: maybe_multiply(grad, beta.conj()) @@ -365,12 +366,14 @@ - name: cholesky_inverse(Tensor self, bool upper=False) -> Tensor self: cholesky_inverse_backward(grad, self, upper, result) + result: cholesky_inverse_jvp(self_p, self_t, result, upper) # For clamp, gradient is not defined at the boundaries. But empirically it's helpful # to be able to get gradient on min and max, so we return the subgradient 1 for these cases. - name: clamp.Tensor(Tensor self, Tensor? min=None, Tensor? 
max=None) -> Tensor self: clamp_backward(grad, self, min, max) min, max: clamp_backward_min_max(grad, self, min, max, grad_input_mask) + result: clamp_jvp(self_p, self_t, min_p, min_t, max_p, max_t) - name: clamp(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor self: clamp_backward(grad, self, min, max) @@ -383,7 +386,7 @@ - name: clamp_min.Tensor(Tensor self, Tensor min) -> Tensor self: where(self >= min, grad, at::scalar_tensor(0., grad.options())) min: where(self < min, grad, at::scalar_tensor(0., grad.options())) - result: where(self_p >= min_p, self_t, at::scalar_tensor(0., self_p.options())) + where(self_p < min_p, min_t, at::scalar_tensor(0., self_p.options())) + result: where(self_p >= min_p, self_t, min_t) - name: clamp_max(Tensor self, Scalar max) -> Tensor self: where(self <= max, grad, at::scalar_tensor(0., grad.options())) @@ -392,14 +395,14 @@ - name: clamp_max.Tensor(Tensor self, Tensor max) -> Tensor self: where(self <= max, grad, at::scalar_tensor(0., grad.options())) max: where(self > max, grad, at::scalar_tensor(0., grad.options())) - result: where(self_p <= max_p, self_t, at::scalar_tensor(0., self_p.options())) + where(self_p > max_p, max_t, at::scalar_tensor(0., self_p.options())) + result: where(self_p <= max_p, self_t, max_t) - name: clone(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor self: grad result: auto_linear - name: _to_copy(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, MemoryFormat? memory_format=None) -> Tensor - self: grad.to(self.options(), /*non_blocking*/false, /*copy*/false) + self: _to_copy_backward(grad, self.options()) result: _to_copy(self_t, dtype, layout, device, pin_memory, non_blocking, memory_format) # The condition is: if dtype is not nullopt, then isDifferentiableType(*dtype) # (If dtype IS nullopt, we rely on the regular check that any input requires grad). @@ -415,6 +418,7 @@ - name: polar(Tensor abs, Tensor angle) -> Tensor abs, angle: polar_backward(grad, result) + result: at::complex(abs_t*angle_p.cos() - angle_t*abs_p*angle_p.sin(), abs_t*angle_p.sin() + angle_t*abs_p*angle_p.cos()) - name: _conj(Tensor(a) self) -> Tensor(a) self: grad.conj() @@ -512,6 +516,7 @@ - name: dist(Tensor self, Tensor other, Scalar p=2) -> Tensor self: norm_backward(grad, self - other, p, result) other: -norm_backward(grad, self - other, p, result) + result: norm_jvp(self_p - other_p, self_t - other_t, p, result, {}, false) # The backward formula is done in this order to improve numerical stability # of the higher order derivatives, see https://github.com/pytorch/pytorch/issues/43414 @@ -549,6 +554,7 @@ - name: native_dropout(Tensor input, float p, bool? train) -> (Tensor, Tensor) input: "GradMode::is_enabled() ? infinitely_differentiable_native_dropout_backward(grad, result1, (!train.has_value() || !train.value() ? 1 : (p == 1 ? 0.0 : 1.0 / (1.0 - p)))) : native_dropout_backward(grad, result1, (!train.has_value() || !train.value() ? 1 : (p == 1 ? 0.0 : 1.0 / (1.0 - p))))" + result0: "(!train.has_value() || train.value()) ? (p == 1 ? 
0.0 : 1.0 / (1.0 - p)) * input_t * result1 : input_t" - name: native_dropout_backward(Tensor grad_output, Tensor mask, float scale) -> Tensor grad_output: "native_dropout_double_backward(grad, grad_output, mask, scale)" @@ -598,6 +604,10 @@ self: at::sum_to(grad, self.sizes()) result: auto_linear +- name: expand.SymInt(Tensor(a) self, SymInt[] size, *, bool implicit=False) -> Tensor(a) + self: at::sum_to(grad, c10::expectIntArrayRef(self.sym_sizes())) + result: auto_linear + - name: exponential_(Tensor(a!) self, float lambd=1, *, Generator? generator=None) -> Tensor(a!) self: zeros_like(grad) result: self_t.zero_() @@ -620,6 +630,15 @@ - name: _fused_moving_avg_obs_fq_helper(Tensor self, Tensor observer_on, Tensor fake_quant_on, Tensor(a!) running_min, Tensor(b!) running_max, Tensor(c!) scale, Tensor(d!) zero_point, float averaging_const, int quant_min, int quant_max, int ch_axis, bool per_row_fake_quant=False, bool symmetric_quant=False) -> (Tensor output, Tensor mask) self: fake_quantize_per_tensor_affine_cachemask_backward(grad, mask) +- name: fill.Scalar(Tensor self, Scalar value) -> Tensor + self: zeros_like(grad) + result: at::fill(self_t, 0) + +- name: fill.Tensor(Tensor self, Tensor value) -> Tensor + self: zeros_like(grad) + value: grad.sum() + result: at::fill(self_t, value_t) + - name: fill_.Scalar(Tensor(a!) self, Scalar value) -> Tensor(a!) self: zeros_like(grad) result: self_t.fill_(0) @@ -681,7 +700,7 @@ input, grid: "grad.defined() ? grid_sampler_2d_backward(grad, input, grid, interpolation_mode, padding_mode, align_corners, grad_input_mask) : std::tuple()" - name: grid_sampler_3d(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor - input, grid: "grad.defined() ? grid_sampler_3d_backward(grad, input, grid, interpolation_mode, padding_mode, align_corners) : std::tuple()" + input, grid: "grad.defined() ? 
grid_sampler_3d_backward(grad, input, grid, interpolation_mode, padding_mode, align_corners, grad_input_mask) : std::tuple()" # See NOTE [ grid_sample CPU fallback ] - name: _grid_sampler_2d_cpu_fallback(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor @@ -749,8 +768,12 @@ index: non_differentiable result: at::index_add(self_t, dim, index, maybe_multiply(source_t, alpha)) +- name: index_reduce(Tensor self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True) -> Tensor + self, source: index_reduce_backward(grad, self, dim, index, source, reduce, include_self, result) + index: non_differentiable + - name: index_copy(Tensor self, int dim, Tensor index, Tensor source) -> Tensor - self: grad.clone().index_fill_(dim, index, 0) + self: grad.index_fill(dim, index, 0) # The case source.dim() == 0 is necessary to support scalar tensors of the form # source.dim() == 0 and index.dim() == 1 and index.size() == (1,), # This is because source is not broadcastable to index, as source.dim() < index.dim() @@ -910,6 +933,7 @@ - name: logsumexp(Tensor self, int[1] dim, bool keepdim=False) -> Tensor self: logsumexp_backward(grad, self, result, dim, keepdim) + result: logsumexp_jvp(self_p, self_t, dim, keepdim) - name: lstsq(Tensor self, Tensor A) -> (Tensor solution, Tensor QR) self: not_implemented("lstsq") @@ -930,17 +954,26 @@ result: self_t.zero_() - name: linalg_lu_factor_ex(Tensor A, *, bool pivot=True, bool check_errors=False) -> (Tensor LU, Tensor pivots, Tensor info) - A: lu_factor_ex_backward(grad, A, LU, pivots) - LU: lu_factor_ex_jvp(A_t, LU, pivots) + A: lu_factor_ex_backward(grad, LU, pivots, pivot) + LU: lu_factor_ex_jvp(A_t, LU, pivots, pivot) output_differentiability: [True, False, False] +- name: linalg_lu(Tensor A, *, bool pivot=True) -> (Tensor P, Tensor L, Tensor U) + A: linalg_lu_backward(grad_L, grad_U, P, L, U, pivot) + L: std::get<0>(linalg_lu_jvp(A_t, P, L, U, pivot)) + U: std::get<1>(linalg_lu_jvp(A_t, P, L, U, pivot)) + output_differentiability: [False, True, True] + - name: lu_solve(Tensor self, Tensor LU_data, Tensor LU_pivots) -> Tensor self, LU_data: lu_solve_backward(grad, result, LU_data, LU_pivots, grad_input_mask) result: lu_solve_jvp(result, LU_data_p, LU_data_t, self_t, LU_pivots) - name: lu_unpack(Tensor LU_data, Tensor LU_pivots, bool unpack_data=True, bool unpack_pivots=True) -> (Tensor P, Tensor L, Tensor U) - LU_data: lu_unpack_backward(grads, LU_data, unpack_data) + LU_data: lu_unpack_backward(grad_L, grad_U, LU_data.size(-2), LU_data.size(-1)) LU_pivots: non_differentiable + L: "LU_data_t.size(-2) >= LU_data_t.size(-1) ? LU_data_t.tril(-1) : LU_data_t.narrow(-1, 0, LU_data_t.size(-2)).tril(-1)" + U: "LU_data_t.size(-1) >= LU_data_t.size(-2) ? 
LU_data_t.triu() : LU_data_t.narrow(-2, 0, LU_data_t.size(-1)).triu()" + output_differentiability: [False, True, True] - name: masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor self: grad.masked_fill(mask, 0) @@ -979,7 +1012,7 @@ - name: maximum(Tensor self, Tensor other) -> Tensor self: at::where(self == other, grad / 2, grad).masked_fill_(self < other, 0) other: at::where(self == other, grad / 2, grad).masked_fill_(self > other, 0) - result: other_t + at::where(self_p == other_p, 0.5, (self_p > other_p).to(result.scalar_type())) * (self_t - other_t) + result: other_t + at::where(self_p == other_p, at::scalar_tensor(0.5, result.options()), (self_p > other_p).to(result.scalar_type())) * (self_t - other_t) - name: fmax(Tensor self, Tensor other) -> Tensor self: grad.masked_fill((self >= other).logical_or_(other.isnan()).logical_not_(), 0) @@ -1035,7 +1068,7 @@ - name: minimum(Tensor self, Tensor other) -> Tensor self: at::where(self == other, grad / 2, grad).masked_fill_(self > other, 0) other: at::where(self == other, grad / 2, grad).masked_fill_(self < other, 0) - result: other_t + at::where(self_p == other_p, 0.5, (self_p < other_p).to(result.scalar_type())) * (self_t - other_t) + result: other_t + at::where(self_p == other_p, at::scalar_tensor(0.5, result.options()), (self_p < other_p).to(result.scalar_type())) * (self_t - other_t) - name: fmin(Tensor self, Tensor other) -> Tensor self: grad.masked_fill((self <= other).logical_or_(other.isnan()).logical_not_(), 0) @@ -1049,8 +1082,8 @@ self: scale_grad_by_count(restore_reduced_dims(grad, dim, keepdim), restore_reduced_dims(result, dim, keepdim) == self, dim) - name: mm(Tensor self, Tensor mat2) -> Tensor - self: mm_mat1_backward(grad, mat2, self.sizes(), self.strides(), 1) - mat2: mm_mat2_backward(grad, self, mat2.sizes(), mat2.strides(), 1) + self: mm_mat1_backward(grad, mat2, self.sizes(), self.strides(), self.layout(), 1) + mat2: mm_mat2_backward(grad, self, mat2.sizes(), mat2.strides(), mat2.layout(), 1) result: at::mm(self_t, mat2_p) + at::mm(self_p, mat2_t) - name: mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices) @@ -1123,18 +1156,23 @@ - name: norm.Scalar(Tensor self, Scalar p=2) -> Tensor self: norm_backward(grad, self, p, result) + result: norm_jvp(self_p, self_t, p, result) - name: norm.ScalarOpt_dim(Tensor self, Scalar? p, int[1] dim, bool keepdim=False) -> Tensor self: norm_backward(grad, self, p, result, dim, keepdim) + result: norm_jvp(self_p, self_t, p, result, dim, keepdim) - name: norm.ScalarOpt_dtype(Tensor self, Scalar? p, *, ScalarType dtype) -> Tensor self: norm_backward(grad, self.to(grad.scalar_type()), p, result) + result: norm_jvp(self_p, self_t, p, result) - name: norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor self: norm_backward(grad, self.to(grad.scalar_type()), p, result, dim, keepdim) + result: norm_jvp(self_p, self_t, p, result, dim, keepdim) - name: linalg_vector_norm(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor self: linalg_vector_norm_backward(grad, self, ord, result, dim, keepdim) + result: linalg_vector_norm_jvp(self_p, self_t, ord, result, dim, keepdim) - name: _pdist_forward(Tensor self, float p=2) -> Tensor self: _pdist_backward(grad, self, p, result) @@ -1216,11 +1254,11 @@ self: "accumulate ? 
grad : grad.put(index, zeros_like(source), false)" index: non_differentiable source: grad.take(index).reshape_as(source) + result: auto_linear # It is affine, but sure -- name: linalg_qr(Tensor self, str mode='reduced') -> (Tensor Q, Tensor R) - self: linalg_qr_backward(grads, self, mode, Q, R) - Q: linalg_qr_jvp_Q(self_t, Q, R) - R: linalg_qr_jvp_R(self_t, Q, R) +- name: linalg_qr(Tensor A, str mode='reduced') -> (Tensor Q, Tensor R) + A: linalg_qr_backward(grad_Q, grad_R, Q, R, mode) + Q, R: linalg_qr_jvp(A_t, Q, R, mode) - name: rad2deg(Tensor self) -> Tensor self: rad2deg_backward(grad) @@ -1266,6 +1304,15 @@ self: grad * std::sqrt(2 * M_PI) * (result.square() / 2).exp() result: auto_element_wise +- name: special_log_ndtr(Tensor self) -> Tensor + self: grad / std::sqrt(2 * M_PI) * (result + self.pow(2) / 2).neg().exp() + result: auto_element_wise + +# [Note: Sometimes view derivatives] +# The following situation applies to other operations as well. +# TODO: This note is only referenced once by to_dense. Make this +# more generic if it's been referenced more than once. +# # DO NOT define a backward for reshape! # reshape is special in that it sometimes returns a view, and sometimes not. # Defining a backward will make codegen spit out the forward call as @@ -1373,10 +1420,6 @@ self: slogdet_backward(grad, self, sign, logabsdet) output_differentiability: [false, true] -- name: solve(Tensor self, Tensor A) -> (Tensor solution, Tensor LU) - self: solve_backward_self(grad, self, A) - A: solve_backward_A(grad, self, A, solution) - - name: linalg_solve(Tensor input, Tensor other) -> Tensor input: solve_backward_A(grad, other, input, result) other: solve_backward_self(grad, other, input) @@ -1447,9 +1490,11 @@ - name: rsub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor self: handle_r_to_c(self.scalar_type(), maybe_multiply(-grad, alpha.conj())) other: handle_r_to_c(other.scalar_type(), grad) + result: -maybe_multiply(self_t, alpha) + other_t - name: rsub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor self: handle_r_to_c(self.scalar_type(), maybe_multiply(-grad, alpha.conj())) + result: auto_element_wise - name: sum(Tensor self, *, ScalarType? dtype=None) -> Tensor self: grad.expand(self.sizes()) @@ -1459,10 +1504,7 @@ self: sum_backward(grad, self.sizes(), dim, keepdim) result: auto_linear -- name: nansum(Tensor self, *, ScalarType? dtype=None) -> Tensor - self: grad.expand(self.sizes()).to(self.scalar_type()) * self.isnan().logical_not() - -- name: nansum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor +- name: nansum(Tensor self, int[1] dim=[], bool keepdim=False, *, ScalarType? dtype=None) -> Tensor self: nansum_backward(grad.to(self.scalar_type()), self, dim, keepdim) # We never call _linalg_svd with compute_uv=False in an autograd context, so we don't even consider it here @@ -1473,22 +1515,18 @@ full_matrices ? U.narrow(-1, 0, S.size(-1)) : U, S, full_matrices ? 
Vh.narrow(-2, 0, S.size(-1)) : Vh)" - U: std::get<0>(linalg_svd_jvp(A_t, U, S, Vh, full_matrices)) - S: std::get<1>(linalg_svd_jvp(A_t, U, S, Vh, full_matrices)) - Vh: std::get<2>(linalg_svd_jvp(A_t, U, S, Vh, full_matrices)) + U, S, Vh: linalg_svd_jvp(A_t, U, S, Vh, full_matrices) - name: symeig(Tensor self, bool eigenvectors=False, bool upper=True) -> (Tensor eigenvalues, Tensor eigenvectors) self: linalg_eig_backward(grads[0], grads[1], eigenvalues, eigenvectors_return, /*is_hermitian=*/true, /*symeig_eigenvector=*/eigenvectors) - name: linalg_eigh(Tensor self, str UPLO="L") -> (Tensor eigenvalues, Tensor eigenvectors) self: linalg_eig_backward(grads[0], grads[1], eigenvalues, eigenvectors, /*is_hermitian=*/true) - eigenvalues: std::get<0>(linalg_eig_jvp(self_t, eigenvalues, eigenvectors, /*is_hermitian=*/true)) - eigenvectors: std::get<1>(linalg_eig_jvp(self_t, eigenvalues, eigenvectors, /*is_hermitian=*/true)) + eigenvalues, eigenvectors: linalg_eig_jvp(self_t, eigenvalues, eigenvectors, /*is_hermitian=*/true) - name: linalg_eig(Tensor self) -> (Tensor eigenvalues, Tensor eigenvectors) self: handle_r_to_c(self.scalar_type(), linalg_eig_backward(grads[0], grads[1], eigenvalues, eigenvectors, /*is_hermitian=*/false)) - eigenvalues: std::get<0>(linalg_eig_jvp(self_t, eigenvalues, eigenvectors, /*is_hermitian=*/false)) - eigenvectors: std::get<1>(linalg_eig_jvp(self_t, eigenvalues, eigenvectors, /*is_hermitian=*/false)) + eigenvalues, eigenvectors: linalg_eig_jvp(self_t, eigenvalues, eigenvectors, /*is_hermitian=*/false) - name: t(Tensor(a) self) -> Tensor(a) self: grad.t() @@ -1564,7 +1602,11 @@ self: zeros_like(grad) result: auto_element_wise -- name: to_dense(Tensor self, ScalarType? dtype=None) -> Tensor +# DO NOT define a backward for to_dense +# See [Note: Sometimes view derivatives] +# - name: to_dense(Tensor self, ScalarType? dtype=None) -> Tensor +# +- name: _to_dense(Tensor self, ScalarType? dtype=None) -> Tensor self: to_dense_backward(grad, self) - name: to_sparse(Tensor self) -> Tensor @@ -1612,6 +1654,9 @@ self: grad.reshape(self.sizes()) result: auto_linear +- name: lift(Tensor self) -> Tensor + self: not_implemented("lift") + - name: unsqueeze(Tensor(a) self, int dim) -> Tensor(a) self: grad.squeeze(dim) result: auto_linear @@ -1642,7 +1687,7 @@ self: at::view_as_real(grad.contiguous().resolve_conj()) # [gx, gy] result: at::view_as_complex(self_t) -- name: _s_where(Tensor condition, Tensor self, Tensor other) -> Tensor +- name: where.self(Tensor condition, Tensor self, Tensor other) -> Tensor condition: non_differentiable self: where(condition, grad, zeros_like(grad)) other: where(condition, zeros_like(grad), grad) @@ -1651,8 +1696,8 @@ # weight_norm_cuda_interface_backward does not have an explicitly defined derivative, so if we do happen # to be running backward with create_graph=True, fall back to a backward function that uses # differentiable ops. -- name: _weight_norm_cuda_interface(Tensor v, Tensor g, int dim=0) -> (Tensor, Tensor) - v, g: "grad.defined() ? (GradMode::is_enabled() ? _weight_norm_differentiable_backward(grad.contiguous(), v, g, result1, dim) : _weight_norm_cuda_interface_backward(grad.contiguous(), v, g, result1, dim)) : std::tuple()" +- name: _weight_norm_interface(Tensor v, Tensor g, int dim=0) -> (Tensor, Tensor) + v, g: "grad.defined() ? (GradMode::is_enabled() ? 
_weight_norm_differentiable_backward(grad.contiguous(), v, g, result1, dim) : _weight_norm_interface_backward(grad.contiguous(), v, g, result1, dim)) : std::tuple()" - name: zero_(Tensor(a!) self) -> Tensor(a!) self: zeros_like(grad) @@ -1685,6 +1730,9 @@ # NN - name: _trilinear(Tensor i1, Tensor i2, Tensor i3, int[] expand1, int[] expand2, int[] expand3, int[] sumdim, int unroll_dim=1) -> Tensor i1, i2, i3: _trilinear_backward(grad, i1, i2, i3, expand1, expand2, expand3, sumdim, grad_input_mask) + result: "_trilinear(i1_t, i2_p, i3_p, expand1, expand2, expand3, sumdim, unroll_dim) + + _trilinear(i1_p, i2_t, i3_p, expand1, expand2, expand3, sumdim, unroll_dim) + + _trilinear(i1_p, i2_p, i3_t, expand1, expand2, expand3, sumdim, unroll_dim)" - name: constant_pad_nd(Tensor self, int[] pad, Scalar value=0) -> Tensor self: constant_pad_nd_backward(grad, pad) @@ -1696,7 +1744,7 @@ - name: binary_cross_entropy_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean) -> Tensor self: binary_cross_entropy_double_backward(grad_output, grad, self, target, weight, reduction) - target: not_implemented("binary_cross_entropy_backward wrt `target`") + target: binary_cross_entropy_double_backward_target(grad, grad_output, self, target, weight, reduction) grad_output: binary_cross_entropy_double_backward_grad_output(grad, self, target, weight, reduction) - name: binary_cross_entropy_with_logits(Tensor self, Tensor target, Tensor? weight=None, Tensor? pos_weight=None, int reduction=Mean) -> Tensor @@ -1707,6 +1755,7 @@ - name: embedding(Tensor weight, Tensor indices, int padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor indices: non_differentiable weight: embedding_backward(grad, indices, weight.size(0), padding_idx, scale_grad_by_freq, sparse) + result: auto_linear - name: embedding_dense_backward(Tensor grad_output, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq) -> Tensor grad_output: embedding_dense_double_backward(grad, indices, padding_idx) @@ -1754,10 +1803,12 @@ - name: nll_loss_forward(Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index) -> (Tensor output, Tensor total_weight) self: nll_loss_backward(grad, self, target, weight, reduction, ignore_index, total_weight) target: non_differentiable + output: std::get<0>(nll_loss_forward(self_t, target, weight, reduction, ignore_index)) - name: nll_loss2d_forward(Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index) -> (Tensor output, Tensor total_weight) self: nll_loss2d_backward(grad, self, target, weight, reduction, ignore_index, total_weight) target: non_differentiable + output: std::get<0>(nll_loss2d_forward(self_t, target, weight, reduction, ignore_index)) - name: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean, float beta=1.0) -> Tensor self: smooth_l1_loss_backward(grad, self, target, reduction, beta) @@ -1799,12 +1850,20 @@ - name: celu_(Tensor(a!) self, Scalar alpha=1.0) -> Tensor(a!) self: elu_backward(grad, alpha, 1, 1.0/alpha.toFloat(), /* is_result */ true, result) -- name: gelu(Tensor self) -> Tensor - self: "GradMode::is_enabled() ? 
infinitely_differentiable_gelu_backward(grad, self) : gelu_backward(grad, self)" +- name: gelu(Tensor self, *, str approximate='none') -> Tensor + self: gelu_backward(grad, self, approximate) result: auto_element_wise +- name: gelu_backward(Tensor grad_output, Tensor self, *, str approximate='none') -> Tensor + grad_output: gelu_backward(grad, self, approximate) + self: gelu_double_backward(grad, grad_output, self, approximate) + result: gelu_backward(grad_output_t, self_p, approximate) + gelu_double_backward(self_t, grad_output_p, self_p, approximate) + - name: glu(Tensor self, int dim=-1) -> Tensor + # TODO: glu_backward can benefit from forward result, + # and forward ad/forward over reverse ad for that matter self: glu_backward(grad, self, dim) + result: glu_jvp(result, self_p, self_t, dim) - name: hardshrink(Tensor self, Scalar lambd=0.5) -> Tensor self: hardshrink_backward(grad, self, lambd) @@ -1829,27 +1888,37 @@ - name: log_sigmoid_forward(Tensor self) -> (Tensor output, Tensor buffer) self: log_sigmoid_backward(grad, self, buffer) + output: auto_element_wise - name: _log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor self: _log_softmax_backward_data(grad, result, dim, self.scalar_type()) + result: self_t - logsumexp_jvp(self_p, self_t, {dim}, true) - name: _sparse_log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor self: _sparse_log_softmax_backward_data(grad, result, dim, self) +- name: _masked_softmax(Tensor self, Tensor mask, int? dim=None) -> Tensor + self: _masked_softmax_backward(grad, result, mask, dim) + mask: non_differentiable + - name: prelu(Tensor self, Tensor weight) -> Tensor self, weight: "grad.defined() ? prelu_backward(grad, self, weight) : std::tuple()" + result: prelu_jvp(self_p, self_t, weight_p, weight_t) - name: prelu_backward(Tensor grad_output, Tensor self, Tensor weight) -> (Tensor, Tensor) grad_output, self, weight: prelu_double_backward(grads[0], grads[1], grad_output, self, weight) - name: rrelu_with_noise(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor self: rrelu_with_noise_backward(grad, self, noise, lower, upper, training, false) + result: auto_element_wise + - name: rrelu_with_noise_(Tensor(a!) self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor(a!) self: rrelu_with_noise_backward(grad, result, noise, lower, upper, training, true) - name: _softmax(Tensor self, int dim, bool half_to_float) -> Tensor self: _softmax_backward_data(grad, result, dim, self.scalar_type()) + result: result * (self_t - logsumexp_jvp(self_p, self_t, {dim}, true)) - name: _sparse_softmax(Tensor self, int dim, bool half_to_float) -> Tensor self: _sparse_softmax_backward_data(grad, result, dim, self) @@ -1898,43 +1967,52 @@ self: replication_pad3d_backward(grad, self, padding) result: auto_linear - # NOTE: Not implementing forward AD formulas for non-vec upsample overloads because they are - # only kept for backward compatability - name: upsample_linear1d(Tensor self, int[1] output_size, bool align_corners, float? scales=None) -> Tensor self: upsample_linear1d_backward(grad, output_size, self.sizes(), align_corners, scales) + result: auto_linear - name: upsample_bilinear2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? 
scales_w=None) -> Tensor self: upsample_bilinear2d_backward(grad, output_size, self.sizes(), align_corners, scales_h, scales_w) + result: auto_linear - name: _upsample_bilinear2d_aa(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor self: _upsample_bilinear2d_aa_backward(grad, output_size, self.sizes(), align_corners, scales_h, scales_w) + result: auto_linear - name: upsample_bicubic2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor self: upsample_bicubic2d_backward(grad, output_size, self.sizes(), align_corners, scales_h, scales_w) + result: auto_linear - name: _upsample_bicubic2d_aa(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor self: _upsample_bicubic2d_aa_backward(grad, output_size, self.sizes(), align_corners, scales_h, scales_w) - name: upsample_trilinear3d(Tensor self, int[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor self: upsample_trilinear3d_backward(grad, output_size, self.sizes(), align_corners, scales_d, scales_h, scales_w) + result: auto_linear - name: upsample_nearest1d(Tensor self, int[1] output_size, float? scales=None) -> Tensor self: upsample_nearest1d_backward(grad, output_size, self.sizes(), scales) + result: auto_linear - name: _upsample_nearest_exact1d(Tensor self, int[1] output_size, float? scales=None) -> Tensor self: _upsample_nearest_exact1d_backward(grad, output_size, self.sizes(), scales) + result: auto_linear - name: upsample_nearest2d(Tensor self, int[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor self: upsample_nearest2d_backward(grad, output_size, self.sizes(), scales_h, scales_w) + result: auto_linear - name: _upsample_nearest_exact2d(Tensor self, int[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor self: _upsample_nearest_exact2d_backward(grad, output_size, self.sizes(), scales_h, scales_w) + result: auto_linear - name: upsample_nearest3d(Tensor self, int[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor self: upsample_nearest3d_backward(grad, output_size, self.sizes(), scales_d, scales_h, scales_w) + result: auto_linear - name: _upsample_nearest_exact3d(Tensor self, int[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor self: _upsample_nearest_exact3d_backward(grad, output_size, self.sizes(), scales_d, scales_h, scales_w) + result: auto_linear - name: upsample_linear1d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? 
scale_factors) -> Tensor input: upsample_linear1d_backward(grad, output_size, input.sizes(), align_corners, scale_factors) @@ -1983,6 +2061,14 @@ input: _upsample_nearest_exact3d_backward(grad, output_size, input.sizes(), scale_factors) result: auto_linear +- name: pixel_shuffle(Tensor self, int upscale_factor) -> Tensor + self: pixel_unshuffle(grad, upscale_factor) + result: auto_linear + +- name: pixel_unshuffle(Tensor self, int downscale_factor) -> Tensor + self: pixel_shuffle(grad, downscale_factor) + result: auto_linear + - name: _adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor self: _adaptive_avg_pool2d_backward(grad, self) result: auto_linear @@ -2019,6 +2105,19 @@ result0: gather(self_t.flatten(-3), -1, result1.flatten(-3)).view_as(result1) output_differentiability: [True, False] +#mps +- name: _mps_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor + self: mps_max_pool2d_backward(grad, self, kernel_size, stride, padding, dilation, ceil_mode) + +- name: _mps_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor + self, weight, bias: "grad.defined() ? mps_convolution_backward(self, grad, weight, padding, stride, dilation, groups, grad_input_mask) : std::tuple()" + +- name: mps_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) + grad_output, self, weight: _convolution_double_backward(grads[0], grads[1], grads[2], grad_output, weight, self, stride, padding, dilation, false, std::vector(padding.size(), 0), groups, grad_input_mask) + +- name: _mps_linear(Tensor self, Tensor weight, Tensor? 
bias=None) -> Tensor + self, weight, bias: mps_linear_backward(self, grad, weight, grad_input_mask) + - name: max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) self: max_pool2d_with_indices_backward(grad, self, kernel_size, stride, padding, dilation, ceil_mode, result1) result0: gather(self_t.flatten(-2), -1, result1.flatten(-2)).view_as(result1) @@ -2030,12 +2129,12 @@ output_differentiability: [True, False] - name: max_unpool2d(Tensor self, Tensor indices, int[2] output_size) -> Tensor - self: max_unpool2d_backward(grad, self, indices, output_size) + self: max_pool_double_backward(grad, indices, 2) indices: non_differentiable result: auto_linear - name: max_unpool3d(Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding) -> Tensor - self: max_unpool3d_backward(grad, self, indices, output_size, stride, padding) + self: max_pool_double_backward(grad, indices, 3) indices: non_differentiable result: auto_linear @@ -2139,6 +2238,7 @@ - name: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, bool is_result, Tensor self_or_result) -> Tensor grad_output: elu_backward(grad, alpha, scale, input_scale, is_result, self_or_result) self_or_result: elu_double_backward(grad, grad_output, alpha, scale, input_scale, is_result, self_or_result) + result: elu_backward(grad_output_t, alpha, scale, input_scale, is_result, self_or_result_p) + elu_double_backward(self_or_result_t, grad_output_p, alpha, scale, input_scale, is_result, self_or_result_p) - name: fractional_max_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] output_size, Tensor indices) -> Tensor grad_output: max_pool_double_backward(grad, indices, 2) @@ -2153,6 +2253,7 @@ - name: glu_backward(Tensor grad_output, Tensor self, int dim) -> Tensor grad_output: glu_double_backward_grad_output(grad, self, dim) self: glu_double_backward(grad, grad_output, self, dim) + result: glu_backward_jvp(result, grad_output_p, self_p, grad_output_t, self_t, dim) - name: hardtanh_backward(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val) -> Tensor grad_output: hardtanh_backward(grad, self, min_val, max_val) @@ -2181,6 +2282,24 @@ # self_is_result is always false here since double backward call is an out-of-place call, self is input itself grad_output: leaky_relu_backward(grad, self, negative_slope, false) self: zeros_like(grad) + # leaky_relu_backward(grad_output, self, negative_slope, false) + # computes grad_output * at::where(self_p > 0, 1, negative_slope) + # so the jvp formula is the following: + # grad_output_t * at::where(self_p > 0, self_p.new_ones([]), negative_slope); + # + # leaky_relu_backward(grad_output, result, negative_slope, true) + # computes grad_output * at::where(result > 0, 1, negative_slope) + # under the assumption that `negative_slope` is positive (otherwise, + # it is not possible to compute the gradient). + # + # so the jvp formula is the following: + # grad_output_t * at::where(result_p > 0, result_p.new_ones([]), negative_slope); + # with the assumption that negative_slope is positive. 
+ # + # Combined together that results in the following optimized kernel which + # also checks the assumption that negative_slope is positive when self_is_result + # is True: + result: leaky_relu_backward(grad_output_t, self_p, negative_slope, self_is_result) - name: max_pool2d_with_indices_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices) -> Tensor grad_output: max_pool_double_backward(grad, indices, 2) @@ -2194,11 +2313,6 @@ indices: non_differentiable result: auto_linear -- name: max_unpool2d_backward(Tensor grad_output, Tensor self, Tensor indices, int[2] output_size) -> Tensor - grad_output: max_unpool2d(grad, indices, output_size) - self: zeros_like(self) - indices: non_differentiable - - name: mse_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor grad_output: mse_loss_double_backward_grad_output(grad, grad_output, self, target, reduction) self: mse_loss_double_backward(grad * grad_output, self, reduction) @@ -2249,6 +2363,11 @@ self: zeros_like(self) result: replication_pad3d_backward(grad_output_t, self_p, padding) +- name: sparse_sampled_addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + self: maybe_multiply(grad, beta.conj()) + mat1: maybe_multiply(grad.sparse_mask(self).mm(mat2.mH()), alpha.conj()) + mat2: maybe_multiply(mat1.mH().mm(grad.sparse_mask(self)), alpha.conj()) + - name: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta) -> Tensor grad_output: smooth_l1_loss_double_backward_grad_output(grad, grad_output, self, target, reduction, beta) self: smooth_l1_loss_double_backward(grad * grad_output, self, target, reduction, beta) @@ -2281,43 +2400,52 @@ self: zeros_like(grad) result: zeros_like(self_t) + threshold_backward(grad_output_t, self_p, threshold) - # NOTE: Not implementing forward AD formulas for backwards of non-vec upsample overloads - # because they are only kept for backward compatability - name: upsample_linear1d_backward(Tensor grad_output, int[1] output_size, int[3] input_size, bool align_corners, float? scales=None) -> Tensor grad_output: upsample_linear1d(grad, output_size, align_corners, scales) + result: auto_linear - name: upsample_bilinear2d_backward(Tensor grad_output, int[2] output_size, int[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor grad_output: upsample_bilinear2d(grad, output_size, align_corners, scales_h, scales_w) + result: auto_linear - name: _upsample_bilinear2d_aa_backward(Tensor grad_output, int[2] output_size, int[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor grad_output: _upsample_bilinear2d_aa(grad, output_size, align_corners, scales_h, scales_w) + result: auto_linear - name: upsample_bicubic2d_backward(Tensor grad_output, int[2] output_size, int[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor grad_output: upsample_bicubic2d(grad, output_size, align_corners, scales_h, scales_w) + result: auto_linear - name: _upsample_bicubic2d_aa_backward(Tensor grad_output, int[2] output_size, int[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor grad_output: _upsample_bicubic2d_aa(grad, output_size, align_corners, scales_h, scales_w) - name: upsample_trilinear3d_backward(Tensor grad_output, int[3] output_size, int[5] input_size, bool align_corners, float? scales_d=None, float? 
scales_h=None, float? scales_w=None) -> Tensor grad_output: upsample_trilinear3d(grad, output_size, align_corners, scales_d, scales_h, scales_w) + result: auto_linear - name: upsample_nearest1d_backward(Tensor grad_output, int[1] output_size, int[3] input_size, float? scales=None) -> Tensor grad_output: upsample_nearest1d(grad, output_size, scales) + result: auto_linear - name: _upsample_nearest_exact1d_backward(Tensor grad_output, int[1] output_size, int[3] input_size, float? scales=None) -> Tensor grad_output: _upsample_nearest_exact1d(grad, output_size, scales) + result: auto_linear - name: upsample_nearest2d_backward(Tensor grad_output, int[2] output_size, int[4] input_size, float? scales_h=None, float? scales_w=None) -> Tensor grad_output: upsample_nearest2d(grad, output_size, scales_h, scales_w) + result: auto_linear - name: _upsample_nearest_exact2d_backward(Tensor grad_output, int[2] output_size, int[4] input_size, float? scales_h=None, float? scales_w=None) -> Tensor grad_output: _upsample_nearest_exact2d(grad, output_size, scales_h, scales_w) + result: auto_linear - name: upsample_nearest3d_backward(Tensor grad_output, int[3] output_size, int[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor grad_output: upsample_nearest3d(grad, output_size, scales_d, scales_h, scales_w) + result: auto_linear - name: _upsample_nearest_exact3d_backward(Tensor grad_output, int[3] output_size, int[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor grad_output: _upsample_nearest_exact3d(grad, output_size, scales_d, scales_h, scales_w) + result: auto_linear - name: upsample_linear1d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, bool align_corners, float[]? scale_factors) -> Tensor grad_output: upsample_linear1d(grad, output_size, align_corners, scale_factors) @@ -2383,6 +2511,9 @@ - name: cudnn_convolution_transpose(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor self, weight: "_cudnn_convolution_backward(self, grad, weight, padding, output_padding, stride, dilation, true, groups, {grad_input_mask[0], grad_input_mask[1]})" +- name: _mps_convolution_transpose(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups) -> Tensor + self, weight: "grad.defined() ? mps_convolution_transpose_backward(self, grad, weight, padding, output_padding, stride, dilation, groups, grad_input_mask) : std::tuple()" + - name: cudnn_convolution(Tensor self, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor self, weight: "_cudnn_convolution_backward(self, grad, weight, padding, std::vector(padding.size(), 0), stride, dilation, false, groups, {grad_input_mask[0], grad_input_mask[1]})" @@ -2418,6 +2549,15 @@ # NNPACK does not support strided convolutions in the backwards path, which is the reason why we are using the closest available function that does here. input, weight, bias: "grad.defined() ? 
convolution_backward(grad, input, weight, bias->sizes(), stride, padding, std::vector(padding.size(), 1), false, std::vector(padding.size(), 0), 1, grad_input_mask) : std::tuple()" +#LSTM MPS +- name: _lstm_mps(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + output_differentiability: [True, True, True, False, False] + input, hx, params: "lstm_mps_backward(grads[0], grads[1], grads[2], result3, result4, input, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first)" + +- name: lstm_mps_backward(Tensor grad_y, Tensor? grad_hy, Tensor? grad_cy, Tensor z_state, Tensor cell_state_fwd, Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor[], Tensor[]) + + + # Only frst three of _cudnn_rnn outputs can have gradients. # _cudnn_rnn outputs: (output, hy, cy, reserve, weight_buf) - name: _cudnn_rnn(Tensor input, Tensor[] weight, int weight_stride0, Tensor? weight_buf, Tensor hx, Tensor? cx, int mode, int hidden_size, int proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor) @@ -2485,12 +2625,15 @@ # fft - name: _fft_r2c(Tensor self, int[] dim, int normalization, bool onesided) -> Tensor self: fft_r2c_backward(grad, dim, normalization, onesided, self.size(dim.back())) + result: auto_linear - name: _fft_c2r(Tensor self, int[] dim, int normalization, int last_dim_size) -> Tensor self: fft_c2r_backward(grad, dim, normalization) + result: auto_linear - name: _fft_c2c(Tensor self, int[] dim, int normalization, bool forward) -> Tensor self: _fft_c2c(grad, dim, normalization, !forward) + result: auto_linear - name: unbind.int(Tensor(a -> *) self, int dim=0) -> Tensor(a)[] self: unbind_backward(grads, dim) @@ -2590,6 +2733,6 @@ - name: _efficientzerotensor(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor output_differentiability: [False] -- name: scatter_reduce.two(Tensor self, int dim, Tensor index, str reduce, *, int? 
output_size=None) -> Tensor - self: scatter_reduce_backward(grad, self, dim, index, reduce, result) +- name: scatter_reduce.two(Tensor self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True) -> Tensor + self, src: scatter_reduce_backward(grad, self, dim, index, src, reduce, include_self, result) index: non_differentiable diff --git a/tools/autograd/gen_annotated_fn_args.py b/tools/autograd/gen_annotated_fn_args.py index 2d1dbd5c71a5..89269e8e0e0f 100644 --- a/tools/autograd/gen_annotated_fn_args.py +++ b/tools/autograd/gen_annotated_fn_args.py @@ -5,6 +5,7 @@ python -m tools.autograd.gen_annotated_fn_args \ aten/src/ATen/native/native_functions.yaml \ + aten/src/ATen/native/tags.yaml \ $OUTPUT_DIR \ tools/autograd @@ -20,24 +21,35 @@ from typing import Dict, List, Any -from tools.codegen.gen import parse_native_yaml -from tools.codegen.utils import FileManager -from tools.codegen.context import with_native_function -from tools.codegen.model import BaseOperatorName, NativeFunction -import tools.codegen.api.python as python -from .gen_python_functions import should_generate_py_binding, is_py_torch_function, \ - is_py_nn_function, is_py_linalg_function, is_py_variable_method, is_py_special_function, \ - is_py_fft_function +from torchgen.gen import parse_native_yaml +from torchgen.utils import FileManager +from torchgen.context import with_native_function +from torchgen.model import BaseOperatorName, NativeFunction +import torchgen.api.python as python +from .gen_python_functions import ( + should_generate_py_binding, + is_py_torch_function, + is_py_nn_function, + is_py_linalg_function, + is_py_variable_method, + is_py_special_function, + is_py_fft_function, +) -def gen_annotated(native_yaml_path: str, out: str, autograd_dir: str) -> None: - native_functions = parse_native_yaml(native_yaml_path).native_functions + +def gen_annotated( + native_yaml_path: str, tags_yaml_path: str, out: str, autograd_dir: str +) -> None: + native_functions = parse_native_yaml( + native_yaml_path, tags_yaml_path + ).native_functions mappings = ( - (is_py_torch_function, 'torch._C._VariableFunctions'), - (is_py_nn_function, 'torch._C._nn'), - (is_py_linalg_function, 'torch._C._linalg'), - (is_py_special_function, 'torch._C._special'), - (is_py_fft_function, 'torch._C._fft'), - (is_py_variable_method, 'torch.Tensor'), + (is_py_torch_function, "torch._C._VariableFunctions"), + (is_py_nn_function, "torch._C._nn"), + (is_py_linalg_function, "torch._C._linalg"), + (is_py_special_function, "torch._C._special"), + (is_py_fft_function, "torch._C._fft"), + (is_py_variable_method, "torch.Tensor"), ) annotated_args: List[str] = [] for pred, namespace in mappings: @@ -48,13 +60,18 @@ def gen_annotated(native_yaml_path: str, out: str, autograd_dir: str) -> None: groups[f.func.name.name].append(f) for group in groups.values(): for f in group: - annotated_args.append(f'{namespace}.{gen_annotated_args(f)}') + annotated_args.append(f"{namespace}.{gen_annotated_args(f)}") - template_path = os.path.join(autograd_dir, 'templates') + template_path = os.path.join(autograd_dir, "templates") fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False) - fm.write_with_template('annotated_fn_args.py', 'annotated_fn_args.py.in', lambda: { - 'annotated_args': textwrap.indent('\n'.join(annotated_args), ' '), - }) + fm.write_with_template( + "annotated_fn_args.py", + "annotated_fn_args.py.in", + lambda: { + "annotated_args": textwrap.indent("\n".join(annotated_args), " "), + }, + ) + @with_native_function 
def gen_annotated_args(f: NativeFunction) -> str: @@ -63,26 +80,29 @@ def gen_annotated_args(f: NativeFunction) -> str: if arg.default is not None: continue out_arg: Dict[str, Any] = {} - out_arg['name'] = arg.name - out_arg['simple_type'] = python.argument_type_str(arg.type, simple_type=True) + out_arg["name"] = arg.name + out_arg["simple_type"] = python.argument_type_str(arg.type, simple_type=True) size = python.argument_type_size(arg.type) if size: - out_arg['size'] = size + out_arg["size"] = size out_args.append(out_arg) - return f'{f.func.name.name}: {repr(out_args)},' + return f"{f.func.name.name}: {repr(out_args)}," + def main() -> None: - parser = argparse.ArgumentParser( - description='Generate annotated_fn_args script') - parser.add_argument('native_functions', metavar='NATIVE', - help='path to native_functions.yaml') - parser.add_argument('out', metavar='OUT', - help='path to output directory') - parser.add_argument('autograd', metavar='AUTOGRAD', - help='path to template directory') + parser = argparse.ArgumentParser(description="Generate annotated_fn_args script") + parser.add_argument( + "native_functions", metavar="NATIVE", help="path to native_functions.yaml" + ) + parser.add_argument("tags", metavar="TAGS", help="path to tags.yaml") + parser.add_argument("out", metavar="OUT", help="path to output directory") + parser.add_argument( + "autograd", metavar="AUTOGRAD", help="path to template directory" + ) args = parser.parse_args() - gen_annotated(args.native_functions, args.out, args.autograd) + gen_annotated(args.native_functions, args.tags, args.out, args.autograd) + -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/tools/autograd/gen_autograd.py b/tools/autograd/gen_autograd.py index 26ab682c5d05..25a04fb14acc 100644 --- a/tools/autograd/gen_autograd.py +++ b/tools/autograd/gen_autograd.py @@ -5,6 +5,7 @@ python -m tools.autograd.gen_autograd \ build/aten/src/ATen/Declarations.yaml \ aten/src/ATen/native/native_functions.yaml \ + aten/src/ATen/native/tags.yaml \ $OUTPUT_DIR \ tools/autograd @@ -24,23 +25,29 @@ import argparse import os -from tools.codegen.api import cpp -from tools.codegen.api.autograd import ( - match_differentiability_info, NativeFunctionWithDifferentiabilityInfo, +from torchgen.api import cpp +from torchgen.api.autograd import ( + match_differentiability_info, + NativeFunctionWithDifferentiabilityInfo, ) -from tools.codegen.gen import parse_native_yaml -from tools.codegen.selective_build.selector import SelectiveBuilder +from torchgen.gen import parse_native_yaml +from torchgen.selective_build.selector import SelectiveBuilder from typing import List from . 
import gen_python_functions -from .gen_autograd_functions import gen_autograd_functions_lib, gen_autograd_functions_python +from .gen_autograd_functions import ( + gen_autograd_functions_lib, + gen_autograd_functions_python, +) from .gen_trace_type import gen_trace_type from .gen_variable_type import gen_variable_type from .gen_inplace_or_view_type import gen_inplace_or_view_type from .gen_variable_factories import gen_variable_factories from .load_derivatives import load_derivatives + def gen_autograd( native_functions_path: str, + tags_path: str, out: str, autograd_dir: str, operator_selector: SelectiveBuilder, @@ -48,66 +55,84 @@ def gen_autograd( ) -> None: # Parse and load derivatives.yaml differentiability_infos = load_derivatives( - os.path.join(autograd_dir, 'derivatives.yaml'), native_functions_path) - - template_path = os.path.join(autograd_dir, 'templates') - - native_funcs = parse_native_yaml(native_functions_path).native_functions - fns = list(sorted(filter( - operator_selector.is_native_function_selected_for_training, - native_funcs), key=lambda f: cpp.name(f.func))) - fns_with_diff_infos: List[NativeFunctionWithDifferentiabilityInfo] = match_differentiability_info(fns, differentiability_infos) + os.path.join(autograd_dir, "derivatives.yaml"), native_functions_path, tags_path + ) + + template_path = os.path.join(autograd_dir, "templates") + + native_funcs = parse_native_yaml(native_functions_path, tags_path).native_functions + fns = list( + sorted( + filter( + operator_selector.is_native_function_selected_for_training, native_funcs + ), + key=lambda f: cpp.name(f.func), + ) + ) + fns_with_diff_infos: List[ + NativeFunctionWithDifferentiabilityInfo + ] = match_differentiability_info(fns, differentiability_infos) # Generate VariableType.h/cpp if not disable_autograd: - gen_variable_type(out, native_functions_path, fns_with_diff_infos, template_path) + gen_variable_type( + out, native_functions_path, tags_path, fns_with_diff_infos, template_path + ) - gen_inplace_or_view_type(out, native_functions_path, fns_with_diff_infos, template_path) + gen_inplace_or_view_type( + out, native_functions_path, tags_path, fns_with_diff_infos, template_path + ) # operator filter not applied as tracing sources are excluded in selective build gen_trace_type(out, native_funcs, template_path) # Generate Functions.h/cpp - gen_autograd_functions_lib( - out, differentiability_infos, template_path) + gen_autograd_functions_lib(out, differentiability_infos, template_path) # Generate variable_factories.h - gen_variable_factories(out, native_functions_path, template_path) + gen_variable_factories(out, native_functions_path, tags_path, template_path) def gen_autograd_python( native_functions_path: str, + tags_path: str, out: str, autograd_dir: str, ) -> None: differentiability_infos = load_derivatives( - os.path.join(autograd_dir, 'derivatives.yaml'), native_functions_path) + os.path.join(autograd_dir, "derivatives.yaml"), native_functions_path, tags_path + ) - template_path = os.path.join(autograd_dir, 'templates') + template_path = os.path.join(autograd_dir, "templates") # Generate Functions.h/cpp - gen_autograd_functions_python( - out, differentiability_infos, template_path) + gen_autograd_functions_python(out, differentiability_infos, template_path) # Generate Python bindings - deprecated_path = os.path.join(autograd_dir, 'deprecated.yaml') + deprecated_path = os.path.join(autograd_dir, "deprecated.yaml") gen_python_functions.gen( - out, native_functions_path, deprecated_path, template_path) + out, 
native_functions_path, tags_path, deprecated_path, template_path + ) def main() -> None: - parser = argparse.ArgumentParser( - description='Generate autograd C++ files script') - parser.add_argument('native_functions', metavar='NATIVE', - help='path to native_functions.yaml') - parser.add_argument('out', metavar='OUT', - help='path to output directory') - parser.add_argument('autograd', metavar='AUTOGRAD', - help='path to autograd directory') + parser = argparse.ArgumentParser(description="Generate autograd C++ files script") + parser.add_argument( + "native_functions", metavar="NATIVE", help="path to native_functions.yaml" + ) + parser.add_argument("tags", metavar="NATIVE", help="path to tags.yaml") + parser.add_argument("out", metavar="OUT", help="path to output directory") + parser.add_argument( + "autograd", metavar="AUTOGRAD", help="path to autograd directory" + ) args = parser.parse_args() - gen_autograd(args.native_functions, - args.out, args.autograd, - SelectiveBuilder.get_nop_selector()) + gen_autograd( + args.native_functions, + args.tags, + args.out, + args.autograd, + SelectiveBuilder.get_nop_selector(), + ) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/tools/autograd/gen_autograd_functions.py b/tools/autograd/gen_autograd_functions.py index be7c7212db8d..3e1e55b82b2f 100644 --- a/tools/autograd/gen_autograd_functions.py +++ b/tools/autograd/gen_autograd_functions.py @@ -8,17 +8,36 @@ from typing import List, Sequence, Tuple -from tools.codegen.api.autograd import (Derivative, DifferentiabilityInfo, - SavedAttribute, uses_retain_variables, - uses_single_grad) -from tools.codegen.api.types import (Binding, BaseCType, OptionalCType, tensorT, longT, - doubleT, scalarT, stringT, boolT, intArrayRefT, - tensorListT, MutRefCType, ListCType, ArrayRefCType) -from tools.codegen.code_template import CodeTemplate -from tools.codegen.utils import FileManager -from tools.codegen.model import Argument - -FUNCTION_DECLARATION = CodeTemplate("""\ +from torchgen.api.autograd import ( + Derivative, + DifferentiabilityInfo, + SavedAttribute, + uses_retain_variables, + uses_single_grad, +) +from torchgen.api.types import ( + Binding, + BaseCType, + OptionalCType, + tensorT, + longT, + doubleT, + scalarT, + stringT, + boolT, + intArrayRefT, + tensorListT, + MutRefCType, + ListCType, + ArrayRefCType, + optionalIntArrayRefT, +) +from torchgen.code_template import CodeTemplate +from torchgen.utils import FileManager +from torchgen.model import Argument + +FUNCTION_DECLARATION = CodeTemplate( + """\ struct TORCH_API ${op} : public ${superclass} { using ${superclass}::${superclass}; variable_list apply(variable_list&& grads) override; @@ -31,16 +50,20 @@ ${saved_variables} ${saved_list_sizes} }; -""") +""" +) -WILL_RELEASE_VARIABLES = CodeTemplate("""\ +WILL_RELEASE_VARIABLES = CodeTemplate( + """\ bool retain_variables = true; void will_release_variables() override { retain_variables = false; } -""") +""" +) -FUNCTION_DEFINITION = CodeTemplate("""\ +FUNCTION_DEFINITION = CodeTemplate( + """\ variable_list ${op}::apply(variable_list&& grads) { ${thread_lock} ${asserts} @@ -50,34 +73,43 @@ ${body} return grad_inputs; } -""") +""" +) -GRAD_INPUT_MASK = CodeTemplate("""\ +GRAD_INPUT_MASK = CodeTemplate( + """\ auto grad_input_mask = std::array{ ${masks} };\ -""") +""" +) -DERIVATIVE_SINGLE = CodeTemplate("""\ +DERIVATIVE_SINGLE = CodeTemplate( + """\ if (should_compute_output({ ${name}_ix })) { auto grad_result = ${derivative}; copy_range(grad_inputs, ${name}_ix, grad_result); 
} -""") +""" +) -DERIVATIVE_MULTI_COPY_RANGE = CodeTemplate("""\ +DERIVATIVE_MULTI_COPY_RANGE = CodeTemplate( + """\ if (should_compute_output({ ${name}_ix })) { copy_range(grad_inputs, ${name}_ix, std::get<${i}>(grad_result)); } -""") +""" +) -DERIVATIVE_MULTI = CodeTemplate("""\ +DERIVATIVE_MULTI = CodeTemplate( + """\ if (should_compute_output({ ${idx_ranges} })) { ${grad_input_mask} auto grad_result = ${derivative}; ${copy_ranges} } -""") +""" +) # Generates python bindings # @@ -88,12 +120,15 @@ # Each PyGetSetDef has a function ptr to a getter, also defined here (3). # (3) Getters for each of grad_fn's saved inputs and outputs. # -PY_FUNCTION_DEFINITION = CodeTemplate("""\ +PY_FUNCTION_DEFINITION = CodeTemplate( + """\ static PyTypeObject ${op}Class; addClass<${op}>(${op}Class, "${op}", ${op}_properties); -""") +""" +) -PY_FUNCTION_PROPS_AND_GETTERS = CodeTemplate("""\ +PY_FUNCTION_PROPS_AND_GETTERS = CodeTemplate( + """\ ${all_getter_definitions} static struct PyGetSetDef ${op}_properties[] = { @@ -102,43 +137,55 @@ {nullptr} /* sentinel */ }; -""") +""" +) -PY_GETSETDEF_STRUCT = CodeTemplate("""\ -{(char*)"_saved_${name}", (getter)THP${op}_${name}_getter, nullptr, nullptr, nullptr}""") +PY_GETSETDEF_STRUCT = CodeTemplate( + """\ +{(char*)"_saved_${name}", (getter)THP${op}_${name}_getter, nullptr, nullptr, nullptr}""" +) -PY_RAW_GETSETDEF_STRUCT = CodeTemplate("""\ -{(char*)"_raw_saved_${name}", (getter)THP${op}_${name}_raw_getter, nullptr, nullptr, nullptr}""") +PY_RAW_GETSETDEF_STRUCT = CodeTemplate( + """\ +{(char*)"_raw_saved_${name}", (getter)THP${op}_${name}_raw_getter, nullptr, nullptr, nullptr}""" +) # Getter templates -GETTER_DEFINITION = CodeTemplate("""\ +GETTER_DEFINITION = CodeTemplate( + """\ PyObject* THP${op}_${name}_getter(THPCppFunction *self, void *_unused) { HANDLE_TH_ERRORS auto prop = static_cast<${op}*>(self->cdata.get())->${name}; ${body} END_HANDLE_TH_ERRORS } -""") +""" +) -GETTER_DEFINITION_SAVEDVAR = CodeTemplate("""\ +GETTER_DEFINITION_SAVEDVAR = CodeTemplate( + """\ PyObject* THP${op}_${name}_getter(THPCppFunction *self, void *_unused) { HANDLE_TH_ERRORS const auto& prop = static_cast<${op}*>(self->cdata.get())->${name}_; ${body} END_HANDLE_TH_ERRORS } -""") +""" +) -GETTER_DEFINITION_RAW_SAVEDVAR = CodeTemplate("""\ +GETTER_DEFINITION_RAW_SAVEDVAR = CodeTemplate( + """\ PyObject* THP${op}_${name}_raw_getter(THPCppFunction *self, void *_unused) { HANDLE_TH_ERRORS const auto& prop = static_cast<${op}*>(self->cdata.get())->${name}_; ${body} END_HANDLE_TH_ERRORS } -""") +""" +) -GETTER_DEFINITION_VEC_SAVEDVAR = CodeTemplate("""\ +GETTER_DEFINITION_VEC_SAVEDVAR = CodeTemplate( + """\ PyObject* THP${op}_${name}_getter(THPCppFunction *self, void *_unused) { HANDLE_TH_ERRORS const auto *node = static_cast<${op}*>(self->cdata.get()); @@ -150,9 +197,11 @@ ${body} END_HANDLE_TH_ERRORS } -""") +""" +) -GETTER_DEFINITION_RAW_VEC_SAVEDVAR = CodeTemplate("""\ +GETTER_DEFINITION_RAW_VEC_SAVEDVAR = CodeTemplate( + """\ PyObject* THP${op}_${name}_raw_getter(THPCppFunction *self, void *_unused) { HANDLE_TH_ERRORS const auto *node = static_cast<${op}*>(self->cdata.get()); @@ -164,9 +213,11 @@ ${body} END_HANDLE_TH_ERRORS } -""") +""" +) -GETTER_DEFINITION_OPT = CodeTemplate("""\ +GETTER_DEFINITION_OPT = CodeTemplate( + """\ PyObject* THP${op}_${name}_getter(THPCppFunction *self, void *_unused) { HANDLE_TH_ERRORS auto opt_prop = static_cast<${op}*>(self->cdata.get())->${name}; @@ -177,9 +228,11 @@ ${body} END_HANDLE_TH_ERRORS } -""") +""" +) -GETTER_DEFINITION_OPT_ARRAYREF 
= CodeTemplate("""\ +GETTER_DEFINITION_OPT_ARRAYREF = CodeTemplate( + """\ PyObject* THP${op}_${name}_getter(THPCppFunction *self, void *_unused) { HANDLE_TH_ERRORS auto opt_prop = static_cast<${op}*>(self->cdata.get())->${name}; @@ -190,7 +243,8 @@ ${body} END_HANDLE_TH_ERRORS } -""") +""" +) # Getter body GETTER_BODY_SAVEDVAR = """\ @@ -204,7 +258,7 @@ GETTER_BODY_VEC_SAVEDVAR = """\ PyObject* tup = PyTuple_New((Py_ssize_t) prop.size()); -for (int i = 0; i < prop.size(); i++) { +for (auto i: c10::irange(prop.size())) { PyTuple_SetItem(tup, (Py_ssize_t) i, THPVariable_Wrap(prop[i].unpack(self->cdata))); } return tup; @@ -212,7 +266,7 @@ GETTER_BODY_RAW_VEC_SAVEDVAR = """\ PyObject* tup = PyTuple_New((Py_ssize_t) prop.size()); -for (int i = 0; i < prop.size(); i++) { +for (auto i : c10::irange(prop.size())) { pybind11::object obj = pybind11::cast(prop[i], pybind11::return_value_policy::reference); PyTuple_SetItem(tup, (Py_ssize_t) i, obj.release().ptr()); } @@ -221,7 +275,7 @@ GETTER_BODY_ARRAYREF_LONG = """\ PyObject* tup = PyTuple_New((Py_ssize_t) prop.size()); -for (int i = 0; i < prop.size(); i++) { +for (auto i : c10::irange(prop.size())) { PyTuple_SetItem(tup, (Py_ssize_t) i, PyLong_FromUnsignedLong((uint64_t) prop[i])); } return tup; @@ -229,7 +283,7 @@ GETTER_BODY_ARRAYREF_DOUBLE = """\ PyObject* tup = PyTuple_New((Py_ssize_t) prop.size()); -for (int i = 0; i < prop.size(); i++) { +for (auto i : c10::irange(prop.size())) { PyTuple_SetItem(tup, (Py_ssize_t) i, PyFloat_FromDouble((double) prop[i])); } return tup; @@ -292,6 +346,7 @@ # TODO: This is probably not exhaustive, but it's a start UNTRACEABLE_FUNCTIONS = VIEW_FUNCTIONS + def gen_autograd_functions_lib( out: str, differentiability_infos: Sequence[DifferentiabilityInfo], @@ -304,19 +359,26 @@ def gen_autograd_functions_lib( """ # only create an autograd function if we are actually going to calculate a derivative - infos = list(filter(lambda info: info.args_with_derivatives, differentiability_infos)) + infos = list( + filter(lambda info: info.args_with_derivatives, differentiability_infos) + ) declarations = list(map(lambda f: process_function(f, FUNCTION_DECLARATION), infos)) definitions = list(map(lambda f: process_function(f, FUNCTION_DEFINITION), infos)) - file_basename = 'Functions' + file_basename = "Functions" fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False) - for suffix in ['.h', '.cpp']: + for suffix in [".h", ".cpp"]: fname = file_basename + suffix - fm.write_with_template(fname, fname, lambda: { - 'generated_comment': '@' + f'generated from {fm.template_dir}/' + fname, - 'autograd_function_declarations': declarations, - 'autograd_function_definitions': definitions, - }) + fm.write_with_template( + fname, + fname, + lambda: { + "generated_comment": "@" + f"generated from {fm.template_dir}/" + fname, + "autograd_function_declarations": declarations, + "autograd_function_definitions": definitions, + }, + ) + def gen_autograd_functions_python( out: str, @@ -326,34 +388,43 @@ def gen_autograd_functions_python( fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False) num_shards = 5 - fm.write('python_functions.h', lambda: { - 'generated_comment': f'@generated from {fm.template_dir}/python_functions.h', - 'shard_forward_declare': [ - f"void initialize_autogenerated_functions_{i}();" - for i in range(num_shards) - ], - 'shard_call': [ - f"initialize_autogenerated_functions_{i}();" - for i in range(num_shards) - ] - }) - - infos = list(filter(lambda info: 
info.args_with_derivatives, differentiability_infos)) + fm.write( + "python_functions.h", + lambda: { + "generated_comment": f"@generated from {fm.template_dir}/python_functions.h", + "shard_forward_declare": [ + f"void initialize_autogenerated_functions_{i}();" + for i in range(num_shards) + ], + "shard_call": [ + f"initialize_autogenerated_functions_{i}();" for i in range(num_shards) + ], + }, + ) + + infos = list( + filter(lambda info: info.args_with_derivatives, differentiability_infos) + ) fm.write_sharded( - 'python_functions.cpp', + "python_functions.cpp", infos, key_fn=lambda info: info.name, base_env={ - 'generated_comment': f'@generated from {fm.template_dir}/python_functions.cpp', + "generated_comment": f"@generated from {fm.template_dir}/python_functions.cpp", }, env_callable=lambda info: { - 'py_function_initializers': [process_function(info, PY_FUNCTION_DEFINITION)], - 'py_function_props_and_getters': [process_function(info, PY_FUNCTION_PROPS_AND_GETTERS)], + "py_function_initializers": [ + process_function(info, PY_FUNCTION_DEFINITION) + ], + "py_function_props_and_getters": [ + process_function(info, PY_FUNCTION_PROPS_AND_GETTERS) + ], }, num_shards=num_shards, - sharded_keys={'py_function_initializers', 'py_function_props_and_getters'} + sharded_keys={"py_function_initializers", "py_function_props_and_getters"}, ) + def process_function(info: DifferentiabilityInfo, template: CodeTemplate) -> str: saved_variables: List[str] = [] release_variables: List[str] = [] @@ -365,12 +436,15 @@ def process_function(info: DifferentiabilityInfo, template: CodeTemplate) -> str py_getsetdef_structs: List[str] = [] for arg in info.args_with_derivatives: - if arg.type == 'at::TensorList' or arg.type == 'const c10::List> &': - size = f'{arg.name}_size_' - saved_list_sizes.append(f'size_t {arg.name}_size_;') + if ( + arg.type == "at::TensorList" + or arg.type == "const c10::List> &" + ): + size = f"{arg.name}_size_" + saved_list_sizes.append(f"size_t {arg.name}_size_;") else: - size = '1' - compute_index_ranges.append(f'auto {arg.name}_ix = gen.range({size});') + size = "1" + compute_index_ranges.append(f"auto {arg.name}_ix = gen.range({size});") def save_var(var: SavedAttribute, is_output: bool) -> None: name = var.nctype.name @@ -378,76 +452,124 @@ def save_var(var: SavedAttribute, is_output: bool) -> None: should_append_getsetdef = True should_append_raw_getsetdef = False - if type == BaseCType(tensorT) or type == OptionalCType(BaseCType(tensorT)) or \ - type == MutRefCType(OptionalCType(BaseCType(tensorT))) or \ - (type == BaseCType(scalarT) and is_output): - saved_variables.append(f'SavedVariable {name}_;') - release_variables.append(f'{name}_.reset_data();') - ptr = 'shared_from_this()' if is_output else '' - unpack.append(f'auto {name} = {name}_.unpack({ptr});') - getter_definitions.append(GETTER_DEFINITION_SAVEDVAR.substitute( - op=info.op, name=name, body=GETTER_BODY_SAVEDVAR)) - getter_definitions.append(GETTER_DEFINITION_RAW_SAVEDVAR.substitute( - op=info.op, name=name, body=GETTER_BODY_RAW_SAVEDVAR)) + if ( + type == BaseCType(tensorT) + or type == OptionalCType(BaseCType(tensorT)) + or type == MutRefCType(OptionalCType(BaseCType(tensorT))) + or (type == BaseCType(scalarT) and is_output) + ): + saved_variables.append(f"SavedVariable {name}_;") + release_variables.append(f"{name}_.reset_data();") + ptr = "shared_from_this()" if is_output else "" + unpack.append(f"auto {name} = {name}_.unpack({ptr});") + getter_definitions.append( + GETTER_DEFINITION_SAVEDVAR.substitute( + 
op=info.op, name=name, body=GETTER_BODY_SAVEDVAR + ) + ) + getter_definitions.append( + GETTER_DEFINITION_RAW_SAVEDVAR.substitute( + op=info.op, name=name, body=GETTER_BODY_RAW_SAVEDVAR + ) + ) should_append_raw_getsetdef = True elif type == BaseCType(tensorListT): - saved_variables.append(f'std::vector {name}_;') - saved_variables.append(f'bool {name}_released_ = false;') + saved_variables.append(f"std::vector {name}_;") + saved_variables.append(f"bool {name}_released_ = false;") # Just clear() is sufficient, we don't need to loop and clear each variable. # Because the SavedVariable owns a tensor and a grad_fn, removing the SavedVariable makes them go away as well. - release_variables.append(f'{name}_.clear();') - release_variables.append(f'{name}_released_ = true;') - unpack.append(f'auto {name} = unpack_list({name}_);') - asserts.append(f'TORCH_CHECK(!{name}_released_, ERR_BACKWARD_TWICE);') - getter_definitions.append(GETTER_DEFINITION_VEC_SAVEDVAR.substitute( - op=info.op, name=name, body=GETTER_BODY_VEC_SAVEDVAR)) - getter_definitions.append(GETTER_DEFINITION_RAW_VEC_SAVEDVAR.substitute( - op=info.op, name=name, body=GETTER_BODY_RAW_VEC_SAVEDVAR)) + release_variables.append(f"{name}_.clear();") + release_variables.append(f"{name}_released_ = true;") + unpack.append(f"auto {name} = unpack_list({name}_);") + asserts.append(f"TORCH_CHECK(!{name}_released_, ERR_BACKWARD_TWICE);") + getter_definitions.append( + GETTER_DEFINITION_VEC_SAVEDVAR.substitute( + op=info.op, name=name, body=GETTER_BODY_VEC_SAVEDVAR + ) + ) + getter_definitions.append( + GETTER_DEFINITION_RAW_VEC_SAVEDVAR.substitute( + op=info.op, name=name, body=GETTER_BODY_RAW_VEC_SAVEDVAR + ) + ) should_append_raw_getsetdef = True elif type == ListCType(OptionalCType(BaseCType(tensorT))): - saved_variables.append(f'std::vector {name}_;') - saved_variables.append(f'bool {name}_released_ = false;') + saved_variables.append(f"std::vector {name}_;") + saved_variables.append(f"bool {name}_released_ = false;") # Just clear() is sufficient, we don't need to loop and clear each variable. # Because the SavedVariable owns a tensor and a grad_fn, removing the SavedVariable makes them go away as well. 
- release_variables.append(f'{name}_.clear();') - release_variables.append(f'{name}_released_ = true;') - unpack.append(f'auto {name} = unpack_opt_list({name}_);') - asserts.append(f'TORCH_CHECK(!{name}_released_, ERR_BACKWARD_TWICE);') - getter_definitions.append(GETTER_DEFINITION_VEC_SAVEDVAR.substitute( - op=info.op, name=name, body=GETTER_BODY_VEC_SAVEDVAR)) - getter_definitions.append(GETTER_DEFINITION_RAW_VEC_SAVEDVAR.substitute( - op=info.op, name=name, body=GETTER_BODY_RAW_VEC_SAVEDVAR)) + release_variables.append(f"{name}_.clear();") + release_variables.append(f"{name}_released_ = true;") + unpack.append(f"auto {name} = unpack_opt_list({name}_);") + asserts.append(f"TORCH_CHECK(!{name}_released_, ERR_BACKWARD_TWICE);") + getter_definitions.append( + GETTER_DEFINITION_VEC_SAVEDVAR.substitute( + op=info.op, name=name, body=GETTER_BODY_VEC_SAVEDVAR + ) + ) + getter_definitions.append( + GETTER_DEFINITION_RAW_VEC_SAVEDVAR.substitute( + op=info.op, name=name, body=GETTER_BODY_RAW_VEC_SAVEDVAR + ) + ) should_append_raw_getsetdef = True elif type == BaseCType(intArrayRefT): - saved_variables.append(f'std::vector {name};') - getter_definitions.append(GETTER_DEFINITION.substitute( - op=info.op, name=name, body=GETTER_BODY_ARRAYREF_LONG)) + saved_variables.append(f"std::vector {name};") + getter_definitions.append( + GETTER_DEFINITION.substitute( + op=info.op, name=name, body=GETTER_BODY_ARRAYREF_LONG + ) + ) + elif type == BaseCType(optionalIntArrayRefT): + saved_variables.append(f"c10::OptionalArray {name};") + getter_definitions.append( + GETTER_DEFINITION_OPT_ARRAYREF.substitute( + op=info.op, name=name, body=GETTER_BODY_ARRAYREF_LONG + ) + ) elif type == OptionalCType(BaseCType(intArrayRefT)): - saved_variables.append(f'c10::OptionalArray {name};') - getter_definitions.append(GETTER_DEFINITION_OPT_ARRAYREF.substitute( - op=info.op, name=name, body=GETTER_BODY_ARRAYREF_LONG)) + saved_variables.append(f"c10::OptionalArray {name};") + getter_definitions.append( + GETTER_DEFINITION_OPT_ARRAYREF.substitute( + op=info.op, name=name, body=GETTER_BODY_ARRAYREF_LONG + ) + ) elif type == OptionalCType(ArrayRefCType(BaseCType(doubleT))): - saved_variables.append(f'c10::OptionalArray {name};') - getter_definitions.append(GETTER_DEFINITION_OPT_ARRAYREF.substitute( - op=info.op, name=name, body=GETTER_BODY_ARRAYREF_DOUBLE)) + saved_variables.append(f"c10::OptionalArray {name};") + getter_definitions.append( + GETTER_DEFINITION_OPT_ARRAYREF.substitute( + op=info.op, name=name, body=GETTER_BODY_ARRAYREF_DOUBLE + ) + ) elif type == BaseCType(longT): - saved_variables.append(f'{type.cpp_type()} {name} = 0;') - getter_definitions.append(GETTER_DEFINITION.substitute( - op=info.op, name=name, body=GETTER_BODY_INT64_T)) + saved_variables.append(f"{type.cpp_type()} {name} = 0;") + getter_definitions.append( + GETTER_DEFINITION.substitute( + op=info.op, name=name, body=GETTER_BODY_INT64_T + ) + ) elif type == BaseCType(stringT): - saved_variables.append(f'std::string {name};') - getter_definitions.append(GETTER_DEFINITION.substitute( - op=info.op, name=name, body=GETTER_BODY_STRING)) + saved_variables.append(f"std::string {name};") + getter_definitions.append( + GETTER_DEFINITION.substitute( + op=info.op, name=name, body=GETTER_BODY_STRING + ) + ) elif type == OptionalCType(BaseCType(stringT)): - saved_variables.append(f'c10::optional {name};') - getter_definitions.append(GETTER_DEFINITION_OPT.substitute( - op=info.op, name=name, body=GETTER_BODY_STRING)) + saved_variables.append(f"c10::optional {name};") + 
getter_definitions.append( + GETTER_DEFINITION_OPT.substitute( + op=info.op, name=name, body=GETTER_BODY_STRING + ) + ) else: - saved_variables.append(f'{type.cpp_type()} {name};') + saved_variables.append(f"{type.cpp_type()} {name};") if type in MISC_GETTER_DEFS: getter_def, body = MISC_GETTER_DEFS[type] - getter_definitions.append(getter_def.substitute(op=info.op, name=name, body=body)) + getter_definitions.append( + getter_def.substitute(op=info.op, name=name, body=body) + ) else: # Types we don't expose python bindings to yet: # TypeAndSize, at::ScalarType, TensorOptions, TensorGeometry, @@ -455,9 +577,13 @@ def save_var(var: SavedAttribute, is_output: bool) -> None: should_append_getsetdef = False if should_append_getsetdef: - py_getsetdef_structs.append(PY_GETSETDEF_STRUCT.substitute(op=info.op, name=name)) + py_getsetdef_structs.append( + PY_GETSETDEF_STRUCT.substitute(op=info.op, name=name) + ) if should_append_raw_getsetdef: - py_getsetdef_structs.append(PY_RAW_GETSETDEF_STRUCT.substitute(op=info.op, name=name)) + py_getsetdef_structs.append( + PY_RAW_GETSETDEF_STRUCT.substitute(op=info.op, name=name) + ) for var in info.all_saved_inputs: save_var(var, is_output=False) @@ -467,24 +593,25 @@ def save_var(var: SavedAttribute, is_output: bool) -> None: # lock the mutex when we release variables and in Node::apply to protect thread safety # see Note [Thread Safety on Autograd Node] if len(release_variables) > 0: - thread_lock = 'std::lock_guard lock(mutex_);' + thread_lock = "std::lock_guard lock(mutex_);" else: - thread_lock = '' + thread_lock = "" if uses_retain_variables(info): will_release_variables = WILL_RELEASE_VARIABLES.substitute() else: - will_release_variables = '' + will_release_variables = "" body: List[str] = [] if uses_single_grad(info): - body.append('const auto& grad = grads[0];') + body.append("const auto& grad = grads[0];") else: # Generate aliases for gradients named for returned values. body.extend( - f'const auto& {name} = grads[{info.available_named_gradients.index(name)}];' - for name in info.used_named_gradients) + f"const auto& {name} = grads[{info.available_named_gradients.index(name)}];" + for name in info.used_named_gradients + ) def emit_derivative( derivative: Derivative, @@ -494,51 +621,65 @@ def emit_derivative( var_names = derivative.var_names if len(var_names) == 1: checks_any_grad_defined = False - if 'not_implemented' not in formula: + if "not_implemented" not in formula: matching_args = [ - arg for arg in args_with_derivatives - if arg.name == var_names[0]] + arg for arg in args_with_derivatives if arg.name == var_names[0] + ] if len(matching_args) == 1: # We can add undefined grad support if the input variable is a Tensor arg = matching_args[0] - if isinstance(arg.argument, Argument) and str(arg.argument.type) in ('Tensor', 'Tensor?'): - formula = 'any_grad_defined ? (' + formula + ') : Tensor()' + if isinstance(arg.argument, Argument) and str( + arg.argument.type + ) in ("Tensor", "Tensor?"): + formula = "any_grad_defined ? 
(" + formula + ") : Tensor()" checks_any_grad_defined = True - return (checks_any_grad_defined, - DERIVATIVE_SINGLE.substitute(name=var_names[0], derivative=formula)) + return ( + checks_any_grad_defined, + DERIVATIVE_SINGLE.substitute(name=var_names[0], derivative=formula), + ) else: - if 'grad_input_mask' in formula: - masks = [f'should_compute_output({{ {n}_ix }}),' for n in var_names] - grad_input_mask = GRAD_INPUT_MASK.substitute(masks=masks, n=len(var_names)) + if "grad_input_mask" in formula: + masks = [f"should_compute_output({{ {n}_ix }})," for n in var_names] + grad_input_mask = GRAD_INPUT_MASK.substitute( + masks=masks, n=len(var_names) + ) else: - grad_input_mask = '' - idx_ranges = ', '.join(f'{n}_ix' for n in var_names) + grad_input_mask = "" + idx_ranges = ", ".join(f"{n}_ix" for n in var_names) copy_ranges: List[str] = [] for i, n in enumerate(var_names): copy_ranges.append(DERIVATIVE_MULTI_COPY_RANGE.substitute(name=n, i=i)) return False, DERIVATIVE_MULTI.substitute( - idx_ranges=idx_ranges, copy_ranges=copy_ranges, + idx_ranges=idx_ranges, + copy_ranges=copy_ranges, derivative=formula, - grad_input_mask=grad_input_mask) + grad_input_mask=grad_input_mask, + ) body.extend(unpack) need_any_grad_defined_var = False for derivative in info.derivatives: - checks_any_grad_defined, derivative_text = emit_derivative(derivative, info.args_with_derivatives) + checks_any_grad_defined, derivative_text = emit_derivative( + derivative, info.args_with_derivatives + ) body.append(derivative_text) need_any_grad_defined_var |= checks_any_grad_defined # Since single-output derivative formulas need to check if grads are # defined, only perform the check once, before all the formulas if need_any_grad_defined_var: - body.insert(-len(info.derivatives), - 'bool any_grad_defined = any_variable_defined(grads);') + body.insert( + -len(info.derivatives), + "bool any_grad_defined = any_variable_defined(grads);", + ) if info.name in UNTRACEABLE_FUNCTIONS: - superclass = 'Node' + superclass = "Node" else: - superclass = 'TraceableFunction' + superclass = "TraceableFunction" - all_getsetdef_structs = ",\n".join(py_getsetdef_structs) + "," if len(py_getsetdef_structs) != 0 else "" + all_getsetdef_structs = ( + ",\n".join(py_getsetdef_structs) + "," if len(py_getsetdef_structs) != 0 else "" + ) all_getter_definitions = "\n".join(getter_definitions) return template.substitute( @@ -553,5 +694,5 @@ def emit_derivative( body=body, superclass=superclass, all_getter_definitions=all_getter_definitions, - all_getsetdef_structs=all_getsetdef_structs + all_getsetdef_structs=all_getsetdef_structs, ) diff --git a/tools/autograd/gen_inplace_or_view_type.py b/tools/autograd/gen_inplace_or_view_type.py index dfb1a1e9892b..541ef2b5312b 100644 --- a/tools/autograd/gen_inplace_or_view_type.py +++ b/tools/autograd/gen_inplace_or_view_type.py @@ -4,24 +4,40 @@ # if updates are needed in torch/csrc/autograd/autograd_not_implemented_fallback.cpp # The fallback is expected to mimick this codegen, so we should keep the two in sync. 
-from tools.codegen.api import cpp -from tools.codegen.api.autograd import ( - NativeFunctionWithDifferentiabilityInfo, gen_differentiable_outputs, +from torchgen.api import cpp +from torchgen.api.autograd import ( + NativeFunctionWithDifferentiabilityInfo, + gen_differentiable_outputs, dispatch_strategy, ) -from tools.codegen.api.types import (Binding, DispatcherSignature, CType, BaseCType, - OptionalCType, longT, boolT, intArrayRefT) -from tools.codegen.code_template import CodeTemplate -from tools.codegen.context import with_native_function -from tools.codegen.model import ( - Type, NativeFunction, SelfArgument, TensorOptionsArguments, SchemaKind, - is_foreach_op, +from torchgen.api.types import ( + Binding, + DispatcherSignature, + CType, + BaseCType, + OptionalCType, + longT, + boolT, + intArrayRefT, + symIntArrayRefT, +) +from torchgen.code_template import CodeTemplate +from torchgen.context import with_native_function +from torchgen.model import ( + Type, + NativeFunction, + SelfArgument, + TensorOptionsArguments, + SchemaKind, ) from typing import List, Optional, Sequence, Tuple, Dict -from tools.codegen.utils import FileManager +from torchgen.utils import FileManager from .context import with_native_function_with_differentiability_info from .gen_trace_type import ( - MANUAL_AUTOGRAD, type_wrapper_name, tie_return_values, get_return_value + MANUAL_AUTOGRAD, + type_wrapper_name, + tie_return_values, + get_return_value, ) # See NOTE [ Autograd View Variables ] in variable.h for details. @@ -33,58 +49,77 @@ # A map: function name => name of the argument that all outputs are view of VIEW_FUNCTIONS_WITH_METADATA_CHANGE = [ - 'view_as_complex', - 'view_as_real', - '_conj', - '_neg_view' + "view_as_complex", + "view_as_real", + "_conj", + "_neg_view", ] VIEW_FUNCTIONS = { - 'numpy_T': 'self', - 'alias': 'self', - 'as_strided': 'self', - 'diagonal': 'self', - 'expand': 'self', - 'permute': 'self', - 'select': 'self', - 'slice': 'self', - 'split': 'self', - 'split_with_sizes': 'self', - 'squeeze': 'self', - 't': 'self', - 'transpose': 'self', - 'unfold': 'self', - 'unsqueeze': 'self', - 'flatten': 'self', - 'view': 'self', - 'unbind': 'self', - '_indices': 'self', - '_values': 'self', - 'indices': 'self', - 'values': 'self', - 'crow_indices': 'self', - 'col_indices': 'self', + "numpy_T": "self", + "alias": "self", + "as_strided": "self", + "diagonal": "self", + "expand": "self", + "permute": "self", + "select": "self", + "slice": "self", + "split": "self", + "split_with_sizes": "self", + "squeeze": "self", + "t": "self", + "transpose": "self", + "unfold": "self", + "unsqueeze": "self", + "flatten": "self", + "view": "self", + "unbind": "self", + "_indices": "self", + "_values": "self", + "indices": "self", + "values": "self", + "crow_indices": "self", + "col_indices": "self", + "ccol_indices": "self", + "row_indices": "self", # sparse_coo ctor output should really be views of both indices and values, # but we only supports making as view of a single variable, and indices is # discrete anyways. # FIXME: clone indices on construction. 
- 'sparse_coo_tensor_with_dims_and_tensors': 'values', - '_reshape_alias': 'self', + "sparse_coo_tensor_with_dims_and_tensors": "values", + "_reshape_alias": "self", } for key in VIEW_FUNCTIONS_WITH_METADATA_CHANGE: - VIEW_FUNCTIONS[key] = 'self' + VIEW_FUNCTIONS[key] = "self" # note: some VIEW_FUNCTIONS are just compositions of the view functions above # this list contains both the root view functions and any that are purely composed # of viewing functions, and is used by the JIT to determine when an operator # may return a view of its inputs; however they may sometimes return a copy. # (e.g. `contiguous`) -RETURNS_VIEWS_OF_INPUT = set(VIEW_FUNCTIONS.keys()).union({ - 'chunk', 'detach', 'contiguous', 'reshape', 'reshape_as', - 'expand_as', 'view_as', 'real', 'imag', 'narrow', 'movedim', - 'tensor_split', 'swapdims', 'swapaxes', 'mT', 'mH', 'adjoint', 'matrix_H' -}) +RETURNS_VIEWS_OF_INPUT = set(VIEW_FUNCTIONS.keys()).union( + { + "chunk", + "detach", + "contiguous", + "reshape", + "reshape_as", + "expand_as", + "view_as", + "real", + "imag", + "narrow", + "movedim", + "tensor_split", + "swapdims", + "swapaxes", + "mT", + "mH", + "adjoint", + "matrix_H", + } +) # These are the functions we consider views for the purposes of validating # StorageImpl and TensorImpl in gen_variable_type. @@ -93,68 +128,90 @@ # See NOTE [Unsafe View] for more info. ALL_VIEW_FUNCTIONS = { **VIEW_FUNCTIONS, - '_unsafe_view': 'self', + "_unsafe_view": "self", } -ARRAYREF_TO_VEC = CodeTemplate("""\ +ARRAYREF_TO_VEC = CodeTemplate( + """\ auto ${vec} = ${arg}.vec(); -""") +""" +) -OPTIONAL_TO_VAL = CodeTemplate("""\ +OPTIONAL_TO_VAL = CodeTemplate( + """\ auto ${val} = ${arg}.value_or(${default}); -""") +""" +) -CALL_DISPATCH = CodeTemplate("""\ -at::_ops::${unambiguous_name}::call(${unpacked_args})""") +CALL_DISPATCH = CodeTemplate( + """\ +at::_ops::${unambiguous_name}::call(${unpacked_args})""" +) -SETUP_REPLAY_VIEW_IF_NOT_SUPPORT_AS_STRIDED_OR_VIEW_WITH_METADATA_CHANGE = CodeTemplate("""\ +SETUP_REPLAY_VIEW_IF_NOT_SUPPORT_AS_STRIDED_OR_VIEW_WITH_METADATA_CHANGE = CodeTemplate( + """\ std::function func=nullptr; if (${is_view_with_metadata_change} || !self.unsafeGetTensorImpl()->support_as_strided()) { ${replay_view_func} } -""") +""" +) -REPLAY_VIEW_LAMBDA_FUNC = CodeTemplate("""\ +REPLAY_VIEW_LAMBDA_FUNC = CodeTemplate( + """\ func = [=](const at::Tensor& ${input_base}) { return ${replay_view_call}; }; -""") +""" +) -METHOD_DEFINITION = CodeTemplate("""\ +METHOD_DEFINITION = CodeTemplate( + """\ ${return_type} ${type_wrapper_name}(${formals}) { ${type_definition_body} } -""") +""" +) -WRAPPER_REGISTRATION = CodeTemplate("""\ +WRAPPER_REGISTRATION = CodeTemplate( + """\ m.impl("${unqual_operator_name_with_overload}", TORCH_FN(${class_type}::${type_wrapper_name}) ); -""") +""" +) -AUTOGRAD_NOT_IMPLEMENTED_REGISTRATION = CodeTemplate("""\ +AUTOGRAD_NOT_IMPLEMENTED_REGISTRATION = CodeTemplate( + """\ m.impl("${unqual_operator_name_with_overload}", torch::autograd::autogradNotImplementedFallback()); -""") +""" +) -INPLACE_REDISPATCH = CodeTemplate("""\ +INPLACE_REDISPATCH = CodeTemplate( + """\ { at::AutoDispatchBelowADInplaceOrView guard; at::_ops::${unambiguous_name}::redispatch(${unpacked_args}); } -""") +""" +) -ASSIGN_RETURN_VALUE = CodeTemplate("""\ +ASSIGN_RETURN_VALUE = CodeTemplate( + """\ ${return_values} = ${rhs_value}; -""") +""" +) -VIEW_REDISPATCH = CodeTemplate("""\ +VIEW_REDISPATCH = CodeTemplate( + """\ ${assign_return_values} ([&]() { at::AutoDispatchBelowADInplaceOrView guard; return 
at::_ops::${unambiguous_name}::redispatch(${unpacked_args}); })(); -""") +""" +) -TMP_VAR = '_tmp' +TMP_VAR = "_tmp" # FIXME: Ideally these functions should be methods on Type class, but we have a # comment in codegen/model.py there saying these concepts are not well defined. @@ -163,27 +220,38 @@ def is_tensor_type(t: Type) -> bool: # TODO: Should handle optional here? return t.is_tensor_like() and t.is_list_like() is None + def is_tensor_list_type(t: Type) -> bool: # TODO: Should handle optional here? return t.is_tensor_like() and t.is_list_like() is not None -UNPACK_TENSOR = CodeTemplate("""\ -auto${ref} ${arg_name}_ = unpack${suffix}(${arg_name}, "${arg_name}", ${arg_pos});""") + +UNPACK_TENSOR = CodeTemplate( + """\ +auto${ref} ${arg_name}_ = unpack${suffix}(${arg_name}, "${arg_name}", ${arg_pos});""" +) + def unpacked_name(arg_name: str) -> str: - return arg_name + '_' + return arg_name + "_" + @with_native_function def unpack_args(f: NativeFunction) -> Tuple[List[str], List[Binding]]: body: List[str] = [] unpacked_bindings: List[Binding] = [] - bindings = [r for a in f.func.schema_order_arguments() - for r in cpp.argument(a, - method=False, - cpp_no_default_args=set(), - faithful=False, - has_tensor_options=False)] + bindings = [ + r + for a in f.func.schema_order_arguments() + for r in cpp.argument( + a, + method=False, + cpp_no_default_args=set(), + faithful=False, + has_tensor_options=False, + ) + ] for i, binding in enumerate(bindings): assert not isinstance(binding.argument, SelfArgument) @@ -197,25 +265,31 @@ def unpack_args(f: NativeFunction) -> Tuple[List[str], List[Binding]]: is_tensor_list = is_tensor_list_type(binding.argument.type) ref = (not is_nullable) and not is_tensor_list - suffix = '_opt' if is_nullable and not is_tensor_list else '' - body.append(UNPACK_TENSOR.substitute( - arg_name=binding.name, - arg_pos=i, - suffix=suffix, - ref='&' if ref else '', - )) - unpacked_bindings.append(Binding( - name=unpacked_name(binding.name), - nctype=binding.nctype, - argument=binding.argument, - default=binding.default, - )) + suffix = "_opt" if is_nullable and not is_tensor_list else "" + body.append( + UNPACK_TENSOR.substitute( + arg_name=binding.name, + arg_pos=i, + suffix=suffix, + ref="&" if ref else "", + ) + ) + unpacked_bindings.append( + Binding( + name=unpacked_name(binding.name), + nctype=binding.nctype, + argument=binding.argument, + default=binding.default, + ) + ) return body, unpacked_bindings + def get_base_name(f: NativeFunction) -> str: return f.func.name.name.base # TODO: should be str(f.func.name.name)? + def get_view_info(f: NativeFunction) -> Optional[str]: base_name = get_base_name(f) view_info = VIEW_FUNCTIONS.get(base_name, None) @@ -223,114 +297,148 @@ def get_view_info(f: NativeFunction) -> Optional[str]: view_info = "self" return view_info + # For view replay calls, we generate an ordinary Dispatcher::call() instead, because: # - We want to replay the entire call into the op, including any previously-set dispatch keys (including autograd!). # - The view replay call also is not part of the hot path. -def emit_view_call(f: NativeFunction, input_base: str, unpacked_args: Sequence[str]) -> str: +def emit_view_call( + f: NativeFunction, input_base: str, unpacked_args: Sequence[str] +) -> str: # View replay functions use the standard Dispatcher::call API. 
return CALL_DISPATCH.substitute( - unambiguous_name=f.func.name.unambiguous_name(), - unpacked_args=unpacked_args) + unambiguous_name=f.func.name.unambiguous_name(), unpacked_args=unpacked_args + ) + def emit_view_lambda(f: NativeFunction, unpacked_bindings: List[Binding]) -> str: - """ Generate an additional lambda function to recover views in backward when as_strided is not supported. + """Generate an additional lambda function to recover views in backward when as_strided is not supported. See Note [View + Inplace update for base tensor] and [View + Inplace update for view tensor] for more details.""" - input_base = 'input_base' - replay_view_func = '' + input_base = "input_base" + replay_view_func = "" updated_unpacked_args: List[str] = [] known_view_arg_simple_types: List[CType] = [ BaseCType(longT), OptionalCType(BaseCType(longT)), BaseCType(boolT), - BaseCType(intArrayRefT)] + BaseCType(intArrayRefT), + BaseCType(symIntArrayRefT), + ] for unpacked_binding in unpacked_bindings: arg, arg_type = unpacked_binding.name, unpacked_binding.nctype.type - if arg == 'self_': + if arg == "self_": updated_unpacked_args.append(input_base) continue if arg_type not in known_view_arg_simple_types: - known_types_str = ', '.join([str(t) for t in known_view_arg_simple_types]) - raise TypeError(f'You are adding an {arg_type} {arg} argument to op {cpp.name(f.func)} in addition to known types: ' - f'{known_types_str}. Please update the list or materialize it so that it can be closed ' - 'over by value, also add a test in pytorch/xla/test/test_operations.py where this code ' - 'is exercised.') - - if arg_type == BaseCType(intArrayRefT): + known_types_str = ", ".join([str(t) for t in known_view_arg_simple_types]) + raise TypeError( + f"You are adding an {arg_type} {arg} argument to op {cpp.name(f.func)} in addition to known types: " + f"{known_types_str}. Please update the list or materialize it so that it can be closed " + "over by value, also add a test in pytorch/xla/test/test_operations.py where this code " + "is exercised." + ) + + if arg_type == BaseCType(intArrayRefT) or arg_type == BaseCType( + symIntArrayRefT + ): # It's not safe to close over IntArrayRef by value, since this is a # reference type, so materialize a vector to close over by value - arg_vec = arg + '_vec' + arg_vec = arg + "_vec" replay_view_func += ARRAYREF_TO_VEC.substitute(arg=arg, vec=arg_vec) updated_unpacked_args.append(arg_vec) elif arg_type == OptionalCType(BaseCType(longT)): # Materialize int64_t? 
to int64_t - arg_value = arg + '_val' - replay_view_func += OPTIONAL_TO_VAL.substitute(arg=arg, val=arg_value, default='0') + arg_value = arg + "_val" + replay_view_func += OPTIONAL_TO_VAL.substitute( + arg=arg, val=arg_value, default="0" + ) updated_unpacked_args.append(arg_value) else: updated_unpacked_args.append(arg) replay_view_call = emit_view_call(f, input_base, updated_unpacked_args) replay_view_func += REPLAY_VIEW_LAMBDA_FUNC.substitute( - input_base=input_base, - replay_view_call=replay_view_call) + input_base=input_base, replay_view_call=replay_view_call + ) - is_view_with_metadata_change = 'true' if cpp.name(f.func) in VIEW_FUNCTIONS_WITH_METADATA_CHANGE else 'false' + is_view_with_metadata_change = ( + "true" if cpp.name(f.func) in VIEW_FUNCTIONS_WITH_METADATA_CHANGE else "false" + ) return SETUP_REPLAY_VIEW_IF_NOT_SUPPORT_AS_STRIDED_OR_VIEW_WITH_METADATA_CHANGE.substitute( is_view_with_metadata_change=is_view_with_metadata_change, - replay_view_func=replay_view_func) + replay_view_func=replay_view_func, + ) -def emit_view_body(fn: NativeFunctionWithDifferentiabilityInfo, var: str) -> Tuple[str, str]: + +def emit_view_body( + fn: NativeFunctionWithDifferentiabilityInfo, var: str +) -> Tuple[str, str]: # See NOTE [ Autograd View Variables ] in variable.h for details. f = fn.func base_name = get_base_name(f) view_info = get_view_info(f) - call = '' + call = "" differentiable_outputs = gen_differentiable_outputs(fn) differentiable_output_vars = {r.name for r in differentiable_outputs} if not isinstance(view_info, str): - raise TypeError(f'The view info should be a string for {base_name}, but it is: {view_info}') + raise TypeError( + f"The view info should be a string for {base_name}, but it is: {view_info}" + ) if len(differentiable_output_vars) == 0: # no output is differentiable (.indices() for SparseTensors for example) - rhs_value = (f'as_view({view_info}, {var}, ' - f'/* is_bw_differentiable */ false, /* is_fw_differentiable */ false)') + rhs_value = ( + f"as_view({view_info}, {var}, " + f"/* is_bw_differentiable */ false, /* is_fw_differentiable */ false)" + ) elif len(differentiable_output_vars) == 1: # Single differentiable output (Tensor or Tensor[]) return_info = differentiable_outputs[0] # We only support simple Tensor or a TensorList for functions that return views - if not is_tensor_type(return_info.type) and not is_tensor_list_type(return_info.type): - raise RuntimeError(f'{base_name} that return differentiable views can only return Tensor or Tensor[]') + if not is_tensor_type(return_info.type) and not is_tensor_list_type( + return_info.type + ): + raise RuntimeError( + f"{base_name} that return differentiable views can only return Tensor or Tensor[]" + ) # See Note [ View + Inplace detection] def get_creation_meta_in_mode(original: str) -> str: - creation_meta_with_grad_mode = f'(at::GradMode::is_enabled() ? {original} : CreationMeta::NO_GRAD_MODE)' - return f'InferenceMode::is_enabled() ? CreationMeta::INFERENCE_MODE : {creation_meta_with_grad_mode}' + creation_meta_with_grad_mode = f"(at::GradMode::is_enabled() ? {original} : CreationMeta::NO_GRAD_MODE)" + return f"InferenceMode::is_enabled() ? 
CreationMeta::INFERENCE_MODE : {creation_meta_with_grad_mode}" # Only allow rebasing of the history if we return a single Tensor # If we are in a no grad block, raise a warning # See NOTE [ View + Inplace detection ] for more details about this logic if is_tensor_list_type(return_info.type): - creation_meta = get_creation_meta_in_mode('CreationMeta::MULTI_OUTPUT_NODE') - call += (f'as_view(/* base */ {view_info}, /* output */ {var}, /* is_bw_differentiable */ true, ' - '/* is_fw_differentiable */ true, ' - f'/* creation_meta */ {creation_meta});') - rhs_value = f'std::move({var})' + creation_meta = get_creation_meta_in_mode("CreationMeta::MULTI_OUTPUT_NODE") + call += ( + f"as_view(/* base */ {view_info}, /* output */ {var}, /* is_bw_differentiable */ true, " + "/* is_fw_differentiable */ true, " + f"/* creation_meta */ {creation_meta});" + ) + rhs_value = f"std::move({var})" else: _, unpacked_bindings = unpack_args(f) call += emit_view_lambda(f, unpacked_bindings) - creation_meta = get_creation_meta_in_mode('CreationMeta::DEFAULT') - rhs_value = (f'as_view(/* base */ {view_info}, /* output */ {var}, /* is_bw_differentiable */ true, ' - '/* is_fw_differentiable */ true, ' - f'/* view_func */ func, /* creation_meta */ {creation_meta})') + creation_meta = get_creation_meta_in_mode("CreationMeta::DEFAULT") + rhs_value = ( + f"as_view(/* base */ {view_info}, /* output */ {var}, /* is_bw_differentiable */ true, " + "/* is_fw_differentiable */ true, " + f"/* view_func */ func, /* creation_meta */ {creation_meta})" + ) else: # This could be supported but we don't need it at the moment, so keeping things simple. - raise RuntimeError('Function that return multiple differentiable output ' - 'when at least one of them is view is not supported.') + raise RuntimeError( + "Function that return multiple differentiable output " + "when at least one of them is view is not supported." + ) return call, rhs_value + def modifies_arguments(f: NativeFunction) -> bool: return f.func.kind() in [SchemaKind.inplace, SchemaKind.out] + @with_native_function_with_differentiability_info def emit_inplace_or_view_body(fn: NativeFunctionWithDifferentiabilityInfo) -> List[str]: f = fn.func @@ -341,48 +449,67 @@ def emit_inplace_or_view_body(fn: NativeFunctionWithDifferentiabilityInfo) -> Li # code-generated ADInplaceOrView kernels plumb and recompute dispatch keys directly through the kernel for performance. # See Note [Plumbing Keys Through The Dispatcher] for details. - dispatch_key_set = 'ks & c10::after_ADInplaceOrView_keyset' - redispatch_args = ', '.join([dispatch_key_set] + [a.expr for a in dispatcher_exprs]) + dispatch_key_set = "ks & c10::after_ADInplaceOrView_keyset" + redispatch_args = ", ".join([dispatch_key_set] + [a.expr for a in dispatcher_exprs]) # Note that this calls the slow, dispatching variants of manual_cpp_binding ops. # We could probably work harder to ensure that the fast variants are called instead, but the perf benefit would be minimal. 
if modifies_arguments(f): # inplace op - inplace_view_body.append(INPLACE_REDISPATCH.substitute( - unambiguous_name=f.func.name.unambiguous_name(), - unpacked_args=redispatch_args, - )) + inplace_view_body.append( + INPLACE_REDISPATCH.substitute( + unambiguous_name=f.func.name.unambiguous_name(), + unpacked_args=redispatch_args, + ) + ) for r in cpp.return_names(f): - inplace_view_body.append(f'increment_version({r});') + inplace_view_body.append(f"increment_version({r});") else: - assert(get_view_info(f) is not None) - inplace_view_body.append(VIEW_REDISPATCH.substitute( - assign_return_values='auto ' + TMP_VAR + ' = ', - unambiguous_name=f.func.name.unambiguous_name(), - unpacked_args=redispatch_args, - )) + assert get_view_info(f) is not None + inplace_view_body.append( + VIEW_REDISPATCH.substitute( + assign_return_values="auto " + TMP_VAR + " = ", + unambiguous_name=f.func.name.unambiguous_name(), + unpacked_args=redispatch_args, + ) + ) call, rhs_value = emit_view_body(fn, TMP_VAR) inplace_view_body.append(call) assert rhs_value is not None inplace_view_body.append( - ASSIGN_RETURN_VALUE.substitute(return_values=tie_return_values(f), rhs_value=rhs_value)) + ASSIGN_RETURN_VALUE.substitute( + return_values=tie_return_values(f), rhs_value=rhs_value + ) + ) if f.func.returns: - inplace_view_body.append(f'return {get_return_value(f)};') + inplace_view_body.append(f"return {get_return_value(f)};") return inplace_view_body + @with_native_function def gen_formals(f: NativeFunction) -> str: - return ', '.join( + return ", ".join( # code-generated autograd kernels plumb and recompute dispatch keys directly through the kernel for performance. # See Note [Plumbing Keys Through The Dispatcher] for details. - ['c10::DispatchKeySet ks'] + - [f'{cpp.argument_type(a, binds="__placeholder__").cpp_type()} {a.name}' - for a in f.func.schema_order_arguments()] + ["c10::DispatchKeySet ks"] + + [ + f'{cpp.argument_type(a, binds="__placeholder__").cpp_type()} {a.name}' + for a in f.func.schema_order_arguments() + ] ) + @with_native_function_with_differentiability_info -def inplace_or_view_method_definition(fn: NativeFunctionWithDifferentiabilityInfo) -> Optional[str]: +def inplace_or_view_method_definition( + fn: NativeFunctionWithDifferentiabilityInfo, +) -> Optional[str]: f = fn.func - if get_view_info(f) is None and (not modifies_arguments(f) or is_foreach_op(str(f.func.name))): + if get_view_info(f) is None and ( + # For functions that modify their inputs but don't return them, + # we can't give them autograd support. 
+ # See https://github.com/pytorch/pytorch/issues/53796 + not modifies_arguments(f) + or len(f.func.returns) == 0 + ): return None return METHOD_DEFINITION.substitute( return_type=cpp.returns_type(f.func.returns).cpp_type(), @@ -391,38 +518,56 @@ def inplace_or_view_method_definition(fn: NativeFunctionWithDifferentiabilityInf type_definition_body=emit_inplace_or_view_body(fn), ) + @with_native_function_with_differentiability_info -def inplace_or_view_method_registration(fn: NativeFunctionWithDifferentiabilityInfo) -> Optional[str]: +def inplace_or_view_method_registration( + fn: NativeFunctionWithDifferentiabilityInfo, +) -> Optional[str]: f = fn.func - if get_view_info(f) is None and (not modifies_arguments(f) or is_foreach_op(str(f.func.name))): + if get_view_info(f) is None and ( + not modifies_arguments(f) or len(f.func.returns) == 0 + ): return None return WRAPPER_REGISTRATION.substitute( unqual_operator_name_with_overload=f.func.name, type_wrapper_name=type_wrapper_name(f), - class_type='ADInplaceOrView', + class_type="ADInplaceOrView", ) + def use_derived(fn: NativeFunctionWithDifferentiabilityInfo) -> bool: f = fn.func name = cpp.name(f.func) - return name not in MANUAL_AUTOGRAD and dispatch_strategy(fn) == 'use_derived' + return name not in MANUAL_AUTOGRAD and dispatch_strategy(fn) == "use_derived" + -def gen_inplace_or_view_type_env(fn: NativeFunctionWithDifferentiabilityInfo) -> Dict[str, List[str]]: +def gen_inplace_or_view_type_env( + fn: NativeFunctionWithDifferentiabilityInfo, +) -> Dict[str, List[str]]: definition = inplace_or_view_method_definition(fn) registration = inplace_or_view_method_registration(fn) return { - 'ops_headers': ([f'#include '] - if definition is not None else []), - 'inplace_or_view_method_definitions': [definition] if definition is not None else [], - 'inplace_or_view_wrapper_registrations': [registration] if registration is not None else [], + "ops_headers": ( + [f"#include "] + if definition is not None + else [] + ), + "inplace_or_view_method_definitions": [definition] + if definition is not None + else [], + "inplace_or_view_wrapper_registrations": [registration] + if registration is not None + else [], } + def gen_inplace_or_view_type( out: str, native_yaml_path: str, + tags_yaml_path: str, fns_with_infos: List[NativeFunctionWithDifferentiabilityInfo], - template_path: str + template_path: str, ) -> None: # NOTE: see Note [Sharded File] at the top of the VariableType.cpp # template regarding sharding of the generated files. 
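The write_sharded call in the next hunk splits the generated ADInplaceOrViewType.cpp across two shards keyed by each function's root name, so the resulting C++ can be compiled in parallel and regenerates deterministically. A minimal sketch of the bucketing idea only — this is not FileManager.write_sharded itself, which also handles templating and merging of the per-shard environments:

# Sketch of sharding work items by a stable key so the same op always lands in
# the same output shard across regenerations.
from collections import defaultdict
from typing import Callable, Dict, Iterable, List, TypeVar

T = TypeVar("T")

def bucket_by_key(
    items: Iterable[T], key_fn: Callable[[T], str], num_shards: int
) -> Dict[int, List[T]]:
    shards: Dict[int, List[T]] = defaultdict(list)
    for item in items:
        # Use a deterministic digest of the key; the built-in hash() is
        # randomized per interpreter run and would reshuffle shards.
        digest = sum(ord(c) for c in key_fn(item))
        shards[digest % num_shards].append(item)
    return shards

# e.g. bucket_by_key(["add", "mul", "view"], key_fn=str, num_shards=2)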
@@ -430,15 +575,17 @@ def gen_inplace_or_view_type( fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False) fm.write_sharded( - 'ADInplaceOrViewType.cpp', + "ADInplaceOrViewType.cpp", [fn for fn in fns_with_infos if use_derived(fn)], key_fn=lambda fn: fn.func.root_name, base_env={ - 'generated_comment': - f'@generated from {template_path}/ADInplaceOrViewType.cpp', + "generated_comment": f"@generated from {template_path}/ADInplaceOrViewType.cpp", }, env_callable=gen_inplace_or_view_type_env, num_shards=2, - sharded_keys={'ops_headers', 'inplace_or_view_method_definitions', - 'inplace_or_view_wrapper_registrations'} + sharded_keys={ + "ops_headers", + "inplace_or_view_method_definitions", + "inplace_or_view_wrapper_registrations", + }, ) diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index 2b9b133cea7a..ab592764e5bd 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -37,26 +37,36 @@ from .gen_trace_type import should_trace -from tools.codegen.code_template import CodeTemplate -from tools.codegen.api import cpp -from tools.codegen.api.types import CppSignatureGroup -from tools.codegen.api.python import (PythonArgument, PythonSignature, - PythonSignatureDeprecated, - PythonSignatureGroup, - PythonSignatureNativeFunctionPair, - arg_parser_output_exprs, - argument_type_str, cpp_dispatch_exprs, - cpp_dispatch_target, - dispatch_lambda_args, - dispatch_lambda_exprs, - dispatch_lambda_return_str, - has_tensor_options, - namedtuple_fieldnames, signature) -from tools.codegen.gen import cpp_string, parse_native_yaml -from tools.codegen.context import with_native_function -from tools.codegen.model import (Argument, BaseOperatorName, NativeFunction, - Type, Variant) -from tools.codegen.utils import split_name_params, YamlLoader, FileManager +from torchgen.code_template import CodeTemplate +from torchgen.api import cpp +from torchgen.api.types import CppSignatureGroup +from torchgen.api.python import ( + PythonArgument, + PythonSignature, + PythonSignatureDeprecated, + PythonSignatureGroup, + PythonSignatureNativeFunctionPair, + arg_parser_output_exprs, + argument_type_str, + cpp_dispatch_exprs, + cpp_dispatch_target, + dispatch_lambda_args, + dispatch_lambda_exprs, + dispatch_lambda_return_str, + has_tensor_options, + namedtuple_fieldnames, + signature, +) +from torchgen.gen import cpp_string, parse_native_yaml +from torchgen.context import with_native_function +from torchgen.model import ( + Argument, + BaseOperatorName, + NativeFunction, + Type, + Variant, +) +from torchgen.utils import split_name_params, YamlLoader, FileManager from typing import Dict, Optional, List, Tuple, Set, Sequence, Callable @@ -70,49 +80,101 @@ # These functions require manual Python bindings or are not exposed to Python _SKIP_PYTHON_BINDINGS = [ - 'alias', 'contiguous', 'is_cuda', 'is_sparse', 'is_sparse_csr', 'size', 'stride', - '.*_backward', '.*_backward_(out|input|weight|bias)', '.*_forward', - '.*_forward_out', '_unsafe_view', 'tensor', '_?sparse_coo_tensor.*', - '_?sparse_csr_tensor.*', - '_arange.*', '_range.*', 'linspace.*', 'logspace.*', - '_sparse_add_out', '_sparse_div.*', '_sparse_mul.*', '_sparse_sub.*', '_sparse_dense_add_out', - 'index', 'unique_dim_consecutive', - '_cumsum.*', '_cumprod.*', '_sum.*', '_prod.*', - '_th_.*', '_thnn_.*', - 'arange.*', 'range.*', '_solve.*', '_inverse.*', - 'full(_out)?', - '_cholesky.*', '_triangular_solve.*', '_qr.*', '_symeig.*', '_svd.*', - 'slice', 
'randint(_out)?', - 'item', '_local_scalar_dense', 'to', - '_to_copy', - 'copy_sparse_to_sparse_', 'copy_', - 'numpy_T', 'matrix_H', 'mT', 'mH', # these need to be an attributes in Python, not functions - 'nonzero(_(out|numpy))?', - 'set_data', - '.*_overrideable', # overrideable functions for backend extension - 'data', 'is_leaf', 'output_nr', '_version', 'requires_grad_', 'retains_grad', 'set_', - '_fw_primal', 'fake_quantize_per_tensor_affine_cachemask', - 'fake_quantize_per_channel_affine_cachemask', - '_new_zeros_with_same_feature_meta', '_has_same_storage_numel', # used for forward AD internals - '_reshape_alias', - 'replace_', # only used by the functionalization pass, doesn't need to be exposed to python + "alias", + "contiguous", + "is_cuda", + "is_sparse", + "is_sparse_csr", + "size", + "stride", + ".*_backward", + ".*_backward_(out|input|weight|bias)", + ".*_forward", + ".*_forward_out", + ".*_jvp", + "_unsafe_view", + "tensor", + "_?sparse_(coo|compressed|csr|csc|bsr|bsc)_tensor.*", + "_arange.*", + "_range.*", + "linspace.*", + "logspace.*", + "_sparse_add_out", + "_sparse_div.*", + "_sparse_mul.*", + "_sparse_sub.*", + "_sparse_dense_add_out", + "index", + "unique_dim_consecutive", + "_cumsum.*", + "_cumprod.*", + "_sum.*", + "_prod.*", + "_th_.*", + "_thnn_.*", + "arange.*", + "range.*", + "_solve.*", + "_inverse.*", + "full(_out)?", + "_cholesky.*", + "_triangular_solve.*", + "_qr.*", + "_symeig.*", + "_svd.*", + "slice", + "randint(_out)?", + "item", + "_local_scalar_dense", + "to", + "_to_copy", + "copy_sparse_to_sparse_", + "copy_", + "numpy_T", + "matrix_H", + "mT", + "mH", # these need to be an attributes in Python, not functions + "nonzero(_(out|numpy))?", + "set_data", + ".*_overrideable", # overrideable functions for backend extension + "data", + "is_leaf", + "output_nr", + "_version", + "requires_grad_", + "retains_grad", + "set_", + "_fw_primal", + "fake_quantize_per_tensor_affine_cachemask", + "fake_quantize_per_channel_affine_cachemask", + "_new_zeros_with_same_feature_meta", + "_has_same_storage_numel", # used for forward AD internals + "_reshape_alias", + "replace_", # only used by the functionalization pass, doesn't need to be exposed to python + "copy", # only used by the functionalization pass + "fill.Tensor", # only used by the functionalization pass + "fill.Scalar", # only used by the functionalization pass + "lift", ] -SKIP_PYTHON_BINDINGS = list(map(lambda pattern: re.compile(rf'^{pattern}$'), _SKIP_PYTHON_BINDINGS)) +SKIP_PYTHON_BINDINGS = list( + map(lambda pattern: re.compile(rf"^{pattern}$"), _SKIP_PYTHON_BINDINGS) +) # These function signatures are not exposed to Python. Note that this signature # list does not support regex. SKIP_PYTHON_BINDINGS_SIGNATURES = [ - 'add.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor', - 'add_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)', - 'sub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor', - 'sub_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)', - 'mul.Scalar(Tensor self, Scalar other) -> Tensor', - 'mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)', - 'div.Scalar(Tensor self, Scalar other) -> Tensor', - 'div_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)', + "add.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor", + "add_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)", + "sub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor", + "sub_.Scalar(Tensor(a!) 
self, Scalar other, Scalar alpha=1) -> Tensor(a!)", + "mul.Scalar(Tensor self, Scalar other) -> Tensor", + "mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", + "div.Scalar(Tensor self, Scalar other) -> Tensor", + "div_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", ] + @with_native_function def should_generate_py_binding(f: NativeFunction) -> bool: name = cpp.name(f.func) @@ -127,32 +189,42 @@ def should_generate_py_binding(f: NativeFunction) -> bool: return True + def get_pycname(name: BaseOperatorName) -> str: - return f'THPVariable_{name}' + return f"THPVariable_{name}" + def is_noarg(overloads: Sequence[PythonSignatureNativeFunctionPair]) -> bool: return len(overloads) == 1 and overloads[0].signature.arguments_count() == 0 + def is_py_variable_method(f: NativeFunction) -> bool: return f.python_module is None and Variant.method in f.variants + def is_py_torch_function(f: NativeFunction) -> bool: return f.python_module is None and Variant.function in f.variants + def is_py_nn_function(f: NativeFunction) -> bool: - return f.python_module == 'nn' + return f.python_module == "nn" + def is_py_fft_function(f: NativeFunction) -> bool: - return f.python_module == 'fft' + return f.python_module == "fft" + def is_py_linalg_function(f: NativeFunction) -> bool: - return f.python_module == 'linalg' + return f.python_module == "linalg" + def is_py_sparse_function(f: NativeFunction) -> bool: - return f.python_module == 'sparse' + return f.python_module == "sparse" + def is_py_special_function(f: NativeFunction) -> bool: - return f.python_module == 'special' + return f.python_module == "special" + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # @@ -160,54 +232,110 @@ def is_py_special_function(f: NativeFunction) -> bool: # # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -def gen(out: str, native_yaml_path: str, deprecated_yaml_path: str, template_path: str) -> None: + +def gen( + out: str, + native_yaml_path: str, + tags_yaml_path: str, + deprecated_yaml_path: str, + template_path: str, +) -> None: fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False) - native_functions = parse_native_yaml(native_yaml_path).native_functions + native_functions = parse_native_yaml( + native_yaml_path, tags_yaml_path + ).native_functions native_functions = list(filter(should_generate_py_binding, native_functions)) methods = load_signatures(native_functions, deprecated_yaml_path, method=True) create_python_bindings( - fm, methods, is_py_variable_method, None, 'python_variable_methods.cpp', method=True) + fm, + methods, + is_py_variable_method, + None, + "python_variable_methods.cpp", + method=True, + ) # NOTE: num_shards here must be synced with gatherTorchFunctions in # torch/csrc/autograd/python_torch_functions_manual.cpp functions = load_signatures(native_functions, deprecated_yaml_path, method=False) create_python_bindings_sharded( - fm, functions, is_py_torch_function, 'torch', 'python_torch_functions.cpp', - method=False, num_shards=3) + fm, + functions, + is_py_torch_function, + "torch", + "python_torch_functions.cpp", + method=False, + num_shards=3, + ) create_python_bindings( - fm, functions, is_py_nn_function, 'torch.nn', 'python_nn_functions.cpp', method=False) + fm, + functions, + is_py_nn_function, + "torch.nn", + "python_nn_functions.cpp", + method=False, + ) create_python_bindings( - fm, functions, is_py_fft_function, 'torch.fft', 'python_fft_functions.cpp', method=False) + fm, + functions, + is_py_fft_function, + 
"torch.fft", + "python_fft_functions.cpp", + method=False, + ) create_python_bindings( - fm, functions, is_py_linalg_function, 'torch.linalg', 'python_linalg_functions.cpp', method=False) + fm, + functions, + is_py_linalg_function, + "torch.linalg", + "python_linalg_functions.cpp", + method=False, + ) create_python_bindings( - fm, functions, is_py_sparse_function, 'torch.sparse', 'python_sparse_functions.cpp', method=False) + fm, + functions, + is_py_sparse_function, + "torch.sparse", + "python_sparse_functions.cpp", + method=False, + ) create_python_bindings( - fm, functions, is_py_special_function, 'torch.special', 'python_special_functions.cpp', method=False) + fm, + functions, + is_py_special_function, + "torch.special", + "python_special_functions.cpp", + method=False, + ) # Currently, we only use `functions` to generate `return_types` bindings. # All methods which return namedtuple have function variant at this point. # If any method only operator with namedtuple is added in the future, # we will have to address that. create_python_return_type_bindings( - fm, functions, lambda fn: True, 'python_return_types.cpp') + fm, functions, lambda fn: True, "python_return_types.cpp" + ) + def group_filter_overloads( pairs: Sequence[PythonSignatureNativeFunctionPair], - pred: Callable[[NativeFunction], bool] + pred: Callable[[NativeFunction], bool], ) -> Dict[BaseOperatorName, List[PythonSignatureNativeFunctionPair]]: - grouped: Dict[BaseOperatorName, List[PythonSignatureNativeFunctionPair]] = defaultdict(list) + grouped: Dict[ + BaseOperatorName, List[PythonSignatureNativeFunctionPair] + ] = defaultdict(list) for pair in pairs: if pred(pair.function): grouped[pair.function.func.name.name].append(pair) return grouped + def create_python_bindings( fm: FileManager, pairs: Sequence[PythonSignatureNativeFunctionPair], @@ -230,15 +358,20 @@ def create_python_bindings( py_methods.append(method_impl(name, module, overloads, method=method)) py_method_defs.append(method_def(name, module, overloads, method=method)) py_forwards.extend(forward_decls(name, overloads, method=method)) - ops_headers.append(f'#include ') + ops_headers.append(f"#include ") + + fm.write_with_template( + filename, + filename, + lambda: { + "generated_comment": "@" + f"generated from {fm.template_dir}/{filename}", + "ops_headers": ops_headers, + "py_forwards": py_forwards, + "py_methods": py_methods, + "py_method_defs": py_method_defs, + }, + ) - fm.write_with_template(filename, filename, lambda: { - 'generated_comment': '@' + f'generated from {fm.template_dir}/{filename}', - 'ops_headers': ops_headers, - 'py_forwards': py_forwards, - 'py_methods': py_methods, - 'py_method_defs': py_method_defs, - }) def create_python_return_type_bindings( fm: FileManager, @@ -257,15 +390,24 @@ def create_python_return_type_bindings( for name in sorted(grouped.keys(), key=lambda x: str(x)): overloads = grouped[name] - definitions, map_entries = generate_return_type_definition_and_map_entry(overloads) - py_return_types_definition.append("" if not definitions else "\n".join(definitions)) + definitions, map_entries = generate_return_type_definition_and_map_entry( + overloads + ) + py_return_types_definition.append( + "" if not definitions else "\n".join(definitions) + ) py_return_types_map.append("" if not map_entries else "\n".join(map_entries)) - fm.write_with_template(filename, filename, lambda: { - 'generated_comment': '@' + f'generated from {fm.template_dir}/{filename}', - 'py_return_types': py_return_types_definition, - 'py_return_types_map' : 
py_return_types_map, - }) + fm.write_with_template( + filename, + filename, + lambda: { + "generated_comment": "@" + f"generated from {fm.template_dir}/{filename}", + "py_return_types": py_return_types_definition, + "py_return_types_map": py_return_types_map, + }, + ) + def create_python_bindings_sharded( fm: FileManager, @@ -275,12 +417,14 @@ def create_python_bindings_sharded( filename: str, *, method: bool, - num_shards: int + num_shards: int, ) -> None: """Generates Python bindings to ATen functions""" grouped = group_filter_overloads(pairs, pred) - def key_func(kv: Tuple[BaseOperatorName, List[PythonSignatureNativeFunctionPair]]) -> str: + def key_func( + kv: Tuple[BaseOperatorName, List[PythonSignatureNativeFunctionPair]] + ) -> str: return kv[0].base def env_func( @@ -288,25 +432,25 @@ def env_func( ) -> Dict[str, List[str]]: name, fn_pairs = kv return { - 'ops_headers': [f'#include '], - 'py_forwards': list(forward_decls(name, fn_pairs, method=method)), - 'py_methods': [method_impl(name, module, fn_pairs, method=method)], - 'py_method_defs': [method_def(name, module, fn_pairs, method=method)], + "ops_headers": [f"#include "], + "py_forwards": list(forward_decls(name, fn_pairs, method=method)), + "py_methods": [method_impl(name, module, fn_pairs, method=method)], + "py_method_defs": [method_def(name, module, fn_pairs, method=method)], } fm.write_sharded( filename, grouped.items(), base_env={ - 'generated_comment': - '@' + f'generated from {fm.template_dir}/{filename}', + "generated_comment": "@" + f"generated from {fm.template_dir}/{filename}", }, key_fn=key_func, env_callable=env_func, num_shards=num_shards, - sharded_keys={'ops_headers', 'py_forwards', 'py_methods', 'py_method_defs'} + sharded_keys={"ops_headers", "py_forwards", "py_methods", "py_method_defs"}, ) + def load_signatures( native_functions: List[NativeFunction], deprecated_yaml_path: str, @@ -315,7 +459,6 @@ def load_signatures( skip_deprecated: bool = False, pyi: bool = False, ) -> Sequence[PythonSignatureNativeFunctionPair]: - @with_native_function def gen_signature_pairs(f: NativeFunction) -> PythonSignatureNativeFunctionPair: return PythonSignatureNativeFunctionPair( @@ -324,9 +467,12 @@ def gen_signature_pairs(f: NativeFunction) -> PythonSignatureNativeFunctionPair: ) pairs = list(map(gen_signature_pairs, native_functions)) - deprecated = load_deprecated_signatures(pairs, deprecated_yaml_path, method=method, pyi=pyi) + deprecated = load_deprecated_signatures( + pairs, deprecated_yaml_path, method=method, pyi=pyi + ) return pairs if skip_deprecated else pairs + deprecated + def load_deprecated_signatures( pairs: Sequence[PythonSignatureNativeFunctionPair], deprecated_yaml_path: str, @@ -345,28 +491,35 @@ def signature_original(f: NativeFunction) -> str: # remove inplace suffix but keep outplace suffix opname = str(f.func.name.name.base) if f.func.is_out_fn(): - opname += '_out' + opname += "_out" if f.func.name.name.inplace and pyi: - opname += '_' - args = CppSignatureGroup.from_native_function(f, method=False).signature.arguments() + opname += "_" + args = CppSignatureGroup.from_native_function( + f, method=False + ).signature.arguments() # Simply ignore TensorOptionsArguments as it does not exist in deprecated.yaml. 
- types = ', '.join(argument_type_str(a.argument.type) - for a in args if isinstance(a.argument, Argument)) - return f'{opname}({types})' + types = ", ".join( + argument_type_str(a.argument.type) + for a in args + if isinstance(a.argument, Argument) + ) + return f"{opname}({types})" # deprecated -> type-only native signature (according to the call order) - def signature_deprecated(opname: str, params: List[str], call_args: List[str]) -> str: + def signature_deprecated( + opname: str, params: List[str], call_args: List[str] + ) -> str: # create a mapping of parameter name to parameter type types: Dict[str, str] = {} for param in params: - if param == '*': + if param == "*": continue - type, name = param.split(' ') + type, name = param.split(" ") types[name] = type # if the name in the call is not in the parameter list, assume it's # a literal Scalar - rearranged_types = ', '.join(types.get(arg, 'Scalar') for arg in call_args) - return f'{opname}({rearranged_types})' + rearranged_types = ", ".join(types.get(arg, "Scalar") for arg in call_args) + return f"{opname}({rearranged_types})" # group the original ATen signatures by type-only signature grouped: Dict[str, List[PythonSignatureNativeFunctionPair]] = defaultdict(list) @@ -376,12 +529,12 @@ def signature_deprecated(opname: str, params: List[str], call_args: List[str]) - # find matching original signatures for each deprecated signature results: List[PythonSignatureNativeFunctionPair] = [] - with open(deprecated_yaml_path, 'r') as f: + with open(deprecated_yaml_path, "r") as f: deprecated_defs = yaml.load(f, Loader=YamlLoader) for deprecated in deprecated_defs: - _, params = split_name_params(deprecated['name']) - aten_name, call_args = split_name_params(deprecated['aten']) + _, params = split_name_params(deprecated["name"]) + aten_name, call_args = split_name_params(deprecated["aten"]) for pair in grouped[signature_deprecated(aten_name, params, call_args)]: # It uses the types from the original ATen declaration, but the @@ -392,12 +545,15 @@ def signature_deprecated(opname: str, params: List[str], call_args: List[str]) - # but never changes output_args nor TensorOptions (if any?), # so here we only look into these two types of args. 
python_sig = pair.signature - src_args: Dict[str, PythonArgument] = {a.name: PythonArgument( - name=a.name, - type=a.type, - default=None, - default_init=None, - ) for a in itertools.chain(python_sig.input_args, python_sig.input_kwargs)} + src_args: Dict[str, PythonArgument] = { + a.name: PythonArgument( + name=a.name, + type=a.type, + default=None, + default_init=None, + ) + for a in itertools.chain(python_sig.input_args, python_sig.input_kwargs) + } args: List[str] = [] input_args: List[PythonArgument] = [] @@ -405,10 +561,10 @@ def signature_deprecated(opname: str, params: List[str], call_args: List[str]) - kwarg_only = False for param in params: - if param == '*': + if param == "*": kwarg_only = True continue - _, param_name = param.split(' ') + _, param_name = param.split(" ") args.append(param_name) if param_name not in src_args: @@ -416,49 +572,56 @@ def signature_deprecated(opname: str, params: List[str], call_args: List[str]) - continue if not kwarg_only: - if not method or param_name != 'self': + if not method or param_name != "self": input_args.append(src_args[param_name]) else: input_kwargs.append(src_args[param_name]) - results.append(PythonSignatureNativeFunctionPair( - signature=PythonSignatureDeprecated( - name=python_sig.name, - input_args=tuple(input_args), - input_kwargs=tuple(input_kwargs), - output_args=python_sig.output_args, - tensor_options_args=python_sig.tensor_options_args, - method=python_sig.method, - deprecated_args_names=tuple(args), - deprecated_args_exprs=tuple(call_args), - returns=python_sig.returns, - ), - function=pair.function, - )) + results.append( + PythonSignatureNativeFunctionPair( + signature=PythonSignatureDeprecated( + name=python_sig.name, + input_args=tuple(input_args), + input_kwargs=tuple(input_kwargs), + output_args=python_sig.output_args, + tensor_options_args=python_sig.tensor_options_args, + method=python_sig.method, + deprecated_args_names=tuple(args), + deprecated_args_exprs=tuple(call_args), + returns=python_sig.returns, + ), + function=pair.function, + ) + ) return results + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # # Named Tuple Codegen # # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + @with_native_function def gen_namedtuple_typename_key(f: NativeFunction) -> str: name = cpp.name(f.func) fieldnames = namedtuple_fieldnames(f.func.returns) - return '_'.join([name] + fieldnames) + return "_".join([name] + fieldnames) + def emit_namedtuple_call( - overloads: Sequence[PythonSignatureNativeFunctionPair] + overloads: Sequence[PythonSignatureNativeFunctionPair], ) -> Tuple[List[str], Dict[str, str]]: """ Generate block of named tuple type def inits, and add typeref snippets to declarations that use them """ - typenames: Dict[str, str] = {} # map from unique name + field name lists to typedef name - typedefs: List[str] = [] # typedef declarations and init code + typenames: Dict[ + str, str + ] = {} # map from unique name + field name lists to typedef name + typedefs: List[str] = [] # typedef declarations and init code for overload in overloads: fieldnames = namedtuple_fieldnames(overload.function.func.returns) @@ -471,8 +634,10 @@ def emit_namedtuple_call( if typename is None: typename = f'NamedTuple{"" if not typedefs else len(typedefs)}' typenames[tn_key] = typename - typedefs.append(f"""\ -static PyTypeObject* {typename} = get_namedtuple("{name}");""") + typedefs.append( + f"""\ +static PyTypeObject* {typename} = get_namedtuple("{name}");""" + ) return typedefs, typenames @@ 
-485,16 +650,20 @@ def generate_return_type_definition_and_map_entry( and return named tuple for a native function which returns named tuple and relevant entry for the map in same file. """ - typenames: Dict[str, str] = {} # map from unique name + field name lists to typedef name + typenames: Dict[ + str, str + ] = {} # map from unique name + field name lists to typedef name definitions: List[str] = [] # function defintion to register the typedef - map_entries: List[str] = [] # C++ map entry of + map_entries: List[ + str + ] = [] # C++ map entry of for overload in overloads: fieldnames = namedtuple_fieldnames(overload.function.func.returns) if not fieldnames: continue - fields = ', '.join(f'{{"{fn}", ""}}' for fn in fieldnames) + fields = ", ".join(f'{{"{fn}", ""}}' for fn in fieldnames) name = cpp.name(overload.function.func) # use @with_native_function? tn_key = gen_namedtuple_typename_key(overload.function) @@ -503,7 +672,8 @@ def generate_return_type_definition_and_map_entry( if typename is None: typename = f'{name}NamedTuple{"" if not definitions else len(definitions)}' typenames[tn_key] = typename - definitions.append(f"""\ + definitions.append( + f"""\ PyTypeObject* get_{name}_namedtuple() {{ static PyStructSequence_Field NamedTuple_fields[] = {{ {fields}, {{nullptr}} }}; static PyTypeObject {typename}; @@ -516,11 +686,13 @@ def generate_return_type_definition_and_map_entry( }} return &{typename}; }} -""") +""" + ) map_entries.append(f'{{"{name}", get_{name}_namedtuple()}}, ') return definitions, map_entries + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # # Method Impl Codegen @@ -528,7 +700,8 @@ def generate_return_type_definition_and_map_entry( # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # python binding for all overloads of a particular function/method -PY_VARIABLE_METHOD_VARARGS = CodeTemplate(r"""\ +PY_VARIABLE_METHOD_VARARGS = CodeTemplate( + r"""\ // ${name} static PyObject * ${pycname}(PyObject* self_, PyObject* args, PyObject* kwargs) { @@ -546,19 +719,23 @@ def generate_return_type_definition_and_map_entry( ${method_footer} } -""") +""" +) # handler for a single parsed signature - may be a single overload or # a pair of overloads that whose signatures only differ in output params # (plugged into PY_VARIABLE_METHOD_VARARGS as an item in ${dispatch}) -PY_VARIABLE_CASE = CodeTemplate("""\ +PY_VARIABLE_CASE = CodeTemplate( + """\ case ${overload_index}: { ${body} } -""") +""" +) # python binding for single-overload function/method -PY_VARIABLE_METHOD_VARARGS_SINGLETON = CodeTemplate("""\ +PY_VARIABLE_METHOD_VARARGS_SINGLETON = CodeTemplate( + """\ // ${name} static PyObject * ${pycname}(PyObject* self_, PyObject* args, PyObject* kwargs) { @@ -574,10 +751,12 @@ def generate_return_type_definition_and_map_entry( ${method_footer} } -""") +""" +) # python binding for a method with no args, shortcuts parsing -PY_VARIABLE_METHOD_NOARGS = CodeTemplate("""\ +PY_VARIABLE_METHOD_NOARGS = CodeTemplate( + """\ // ${name} static PyObject * ${pycname}(PyObject* self_, PyObject* args) { @@ -587,14 +766,16 @@ def generate_return_type_definition_and_map_entry( ${method_footer} } -""") +""" +) + def method_impl( name: BaseOperatorName, module: Optional[str], overloads: Sequence[PythonSignatureNativeFunctionPair], *, - method: bool + method: bool, ) -> str: """ Generate a python binding for all overloads of an op. 
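For context on what the named-tuple codegen in the hunks above ultimately produces: operators whose schema names their returns are exposed to Python as struct-sequence objects with named fields. A short usage sketch, assuming an installed PyTorch build (the exact printed type name depends on the build):

# The PyStructSequence types registered by the generated get_*_namedtuple()
# functions above are what back these named-field results.
import torch

x = torch.arange(6.0).reshape(2, 3)
out = torch.max(x, dim=1)       # binding generated from the max.dim schema
print(type(out))                # e.g. <class 'torch.return_types.max'>
print(out.values, out.indices)  # fields named after the schema's return names
print(out[0])                   # indexing still works like a plain tuple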
@@ -603,15 +784,15 @@ def method_impl( noarg = is_noarg(overloads) namedtuple_inits, namedtuple_typenames = emit_namedtuple_call(overloads) - method_header = ['HANDLE_TH_ERRORS'] + method_header = ["HANDLE_TH_ERRORS"] method_header += namedtuple_inits - method_header += [ - "const Tensor& self = THPVariable_Unpack(self_);" - ] if method else [] + method_header += ( + ["const Tensor& self = THPVariable_Unpack(self_);"] if method else [] + ) - method_footer = ([] if noarg else ['Py_RETURN_NONE;']) + ['END_HANDLE_TH_ERRORS'] + method_footer = ([] if noarg else ["Py_RETURN_NONE;"]) + ["END_HANDLE_TH_ERRORS"] - traceable = 'true' if all(should_trace(o.function) for o in overloads) else 'false' + traceable = "true" if all(should_trace(o.function) for o in overloads) else "false" grouped_overloads: Sequence[PythonSignatureGroup] = group_overloads(overloads) is_singleton = len(grouped_overloads) == 1 @@ -619,11 +800,15 @@ def method_impl( dispatch: List[str] = [] for overload_index, overload in enumerate(grouped_overloads): signature = overload.signature.signature_str() - signatures.append(f'{cpp_string(str(signature))},') + signatures.append(f"{cpp_string(str(signature))},") dispatch_body = emit_dispatch_case(overload, namedtuple_typenames) dispatch.append( - PY_VARIABLE_CASE.substitute(overload_index=overload_index, body=dispatch_body) - if not is_singleton else dispatch_body) + PY_VARIABLE_CASE.substitute( + overload_index=overload_index, body=dispatch_body + ) + if not is_singleton + else dispatch_body + ) if noarg: template = PY_VARIABLE_METHOD_NOARGS @@ -650,6 +835,7 @@ def method_impl( self_="self_" if method else "nullptr", ) + def gen_has_torch_function_check( name: BaseOperatorName, module: Optional[str], *, noarg: bool, method: bool ) -> str: @@ -661,17 +847,21 @@ def gen_has_torch_function_check( }} """ else: - return '' + return "" self_ = "self_" if method else "nullptr" - namespace = { - "torch": "THPVariableFunctionsModule", - "torch.nn": "THPNNVariableFunctionsModule", - "torch.fft": "THPFFTVariableFunctionsModule", - "torch.linalg": "THPLinalgVariableFunctionsModule", - "torch.sparse": "THPSparseVariableFunctionsModule", - "torch.special": "THPSpecialVariableFunctionsModule", - }[module] if module else "THPVariableClass" + namespace = ( + { + "torch": "THPVariableFunctionsModule", + "torch.nn": "THPNNVariableFunctionsModule", + "torch.fft": "THPFFTVariableFunctionsModule", + "torch.linalg": "THPLinalgVariableFunctionsModule", + "torch.sparse": "THPSparseVariableFunctionsModule", + "torch.special": "THPSpecialVariableFunctionsModule", + }[module] + if module + else "THPVariableClass" + ) return f"""\ if(_r.has_torch_function()) {{ @@ -679,14 +869,18 @@ def gen_has_torch_function_check( }} """ + # handler for output/no-output overload pair -PY_VARIABLE_OUT = CodeTemplate("""\ +PY_VARIABLE_OUT = CodeTemplate( + """\ if (_r.isNone(${out_idx})) { ${call_dispatch} } else { ${call_dispatch_out} } -""") +""" +) + def emit_dispatch_case( overload: PythonSignatureGroup, @@ -703,14 +897,18 @@ def emit_dispatch_case( return PY_VARIABLE_OUT.substitute( out_idx=overload.signature.output_idx(), call_dispatch=emit_single_dispatch( - overload.signature, overload.base, namedtuple_typenames), + overload.signature, overload.base, namedtuple_typenames + ), call_dispatch_out=emit_single_dispatch( - overload.signature, overload.outplace, namedtuple_typenames), + overload.signature, overload.outplace, namedtuple_typenames + ), ) else: # no-output version only return emit_single_dispatch( - 
overload.signature, overload.base, namedtuple_typenames) + overload.signature, overload.base, namedtuple_typenames + ) + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # @@ -718,24 +916,30 @@ def emit_dispatch_case( # # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + def forward_decls( name: BaseOperatorName, overloads: Sequence[PythonSignatureNativeFunctionPair], *, - method: bool + method: bool, ) -> Tuple[str, ...]: if method: return () pycname = get_pycname(name) if is_noarg(overloads): - return (f"""\ + return ( + f"""\ static PyObject * {pycname}(PyObject* self_, PyObject* args); -""",) +""", + ) else: - return (f"""\ + return ( + f"""\ static PyObject * {pycname}(PyObject* self_, PyObject* args, PyObject* kwargs); -""",) +""", + ) + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # @@ -743,12 +947,13 @@ def forward_decls( # # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + def method_def( name: BaseOperatorName, module: Optional[str], overloads: Sequence[PythonSignatureNativeFunctionPair], *, - method: bool + method: bool, ) -> str: """ Generate method def entry. @@ -756,14 +961,14 @@ def method_def( pycname = get_pycname(name) if is_noarg(overloads): - pyfunc_cast = '' - flags = 'METH_NOARGS' if method else 'METH_VARARGS | METH_KEYWORDS' + pyfunc_cast = "" + flags = "METH_NOARGS" if method else "METH_VARARGS | METH_KEYWORDS" else: - pyfunc_cast = 'castPyCFunctionWithKeywords' - flags = 'METH_VARARGS | METH_KEYWORDS' + pyfunc_cast = "castPyCFunctionWithKeywords" + flags = "METH_VARARGS | METH_KEYWORDS" if module == "torch": - flags += ' | METH_STATIC' + flags += " | METH_STATIC" if name.dunder_method: # PyMethodDef entry for binary op, throws not implemented error @@ -774,12 +979,14 @@ def method_def( return f"""\ {{"{name}", {pyfunc_cast}({pycname}), {flags}, NULL}},""" + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # # Overload Sorting and Grouping # # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + def group_overloads( overloads: Sequence[PythonSignatureNativeFunctionPair], ) -> Sequence[PythonSignatureGroup]: @@ -792,15 +999,15 @@ def group_overloads( if overload.function.func.is_out_fn(): if sig in outplaces: raise RuntimeError( - f'Found duplicated function definition:\n- {overload.function.func}.\n' - f'Existing definition:\n- {outplaces[sig].function.func}.' + f"Found duplicated function definition:\n- {overload.function.func}.\n" + f"Existing definition:\n- {outplaces[sig].function.func}." ) outplaces[sig] = overload else: if sig in bases: raise RuntimeError( - f'Found duplicated function definition:\n- {overload.function.func}.\n' - f'Existing definition:\n- {bases[sig].function.func}.' + f"Found duplicated function definition:\n- {overload.function.func}.\n" + f"Existing definition:\n- {bases[sig].function.func}." 
) bases[sig] = overload @@ -808,30 +1015,41 @@ def group_overloads( if sig not in bases: candidates: List[str] = [] for overload in overloads: - if str(overload.function.func.name.name) == str(out.function.func.name.name) \ - and not overload.function.func.is_out_fn() \ - and not overload.signature.deprecated: - candidates.append(overload.signature.signature_str(skip_outputs=True)) + if ( + str(overload.function.func.name.name) + == str(out.function.func.name.name) + and not overload.function.func.is_out_fn() + and not overload.signature.deprecated + ): + candidates.append( + overload.signature.signature_str(skip_outputs=True) + ) out_sig = out.signature.signature_str() raise RuntimeError( - f'While identifying overloads, we found an out schema {out_sig} without a corresponding non-out variant. ' - f'We expected the non-out variant to have schema: \n- {sig}\nPlease check that you spelled the schema ' - 'correctly in native_functions.yaml. We discovered the following candidate(s): \n' - + '\n'.join(f'- {candidate}' for candidate in candidates)) + f"While identifying overloads, we found an out schema {out_sig} without a corresponding non-out variant. " + f"We expected the non-out variant to have schema: \n- {sig}\nPlease check that you spelled the schema " + "correctly in native_functions.yaml. We discovered the following candidate(s): \n" + + "\n".join(f"- {candidate}" for candidate in candidates) + ) grouped: List[PythonSignatureGroup] = [] for sig, base in bases.items(): outplace = outplaces.get(sig) - grouped.append(PythonSignatureGroup( - # prefer the signature with optional out=... arguments because it's the - # superset that can be used to parse input for both base and outplace. - signature=outplace.signature if outplace is not None else base.signature, - base=base.function, - outplace=outplace.function if outplace is not None else None, - )) + grouped.append( + PythonSignatureGroup( + # prefer the signature with optional out=... arguments because it's the + # superset that can be used to parse input for both base and outplace. + signature=outplace.signature + if outplace is not None + else base.signature, + base=base.function, + outplace=outplace.function if outplace is not None else None, + ) + ) return sort_overloads(grouped) + # This function declares a partial order on declarations, and sorts them according # to its linear extension. This is necessary, because there's some ambiguity in the # choice of overload, and we want a different order. @@ -876,20 +1094,29 @@ def group_overloads( # foo(Tensor other, *, Scalar alpha=1, Scalar beta=1) # + def sort_overloads( - grouped_overloads: Sequence[PythonSignatureGroup] + grouped_overloads: Sequence[PythonSignatureGroup], ) -> Sequence[PythonSignatureGroup]: - def is_arg_smaller(t1: Type, t2: Type) -> bool: - return (str(t1) == 'Scalar' and str(t2) == 'Tensor' or - 'Dimname' in str(t1) and 'Dimname' not in str(t2) or - # In the discussion https://github.com/pytorch/pytorch/issues/54555 it has been - # discussed why it is important to prioritize int/int? over int[] - str(t1) == 'int[]' and (str(t2) == 'int' or str(t2) == 'int?') or - # TensorList currently throws an error during argument parsing, that's why it needs to be - # last in signature ordering. See discussion: https://github.com/pytorch/pytorch/issues/58087 - str(t1) == 'Tensor[]' and str(t2).find("[]") != -1) - + return ( + str(t1) == "Scalar" + and str(t2) == "Tensor" + or str(t1) == "Scalar?" + and str(t2) == "Tensor?" 
+ or "Dimname" in str(t1) + and "Dimname" not in str(t2) + or + # In the discussion https://github.com/pytorch/pytorch/issues/54555 it has been + # discussed why it is important to prioritize int/int? over int[] + str(t1) == "int[]" + and (str(t2) == "int" or str(t2) == "int?") + or + # TensorList currently throws an error during argument parsing, that's why it needs to be + # last in signature ordering. See discussion: https://github.com/pytorch/pytorch/issues/58087 + str(t1) == "Tensor[]" + and str(t2).find("[]") != -1 + ) def is_smaller(s1: PythonSignature, s2: PythonSignature) -> bool: """Returns True if s1 < s2 in the partial order.""" @@ -900,13 +1127,16 @@ def is_smaller(s1: PythonSignature, s2: PythonSignature) -> bool: # above. The old codegen used the deprecated 'dynamic_type(arg.type)', which # ignores the optional annotation, i.e. 'Scalar' and 'Scalar?'. equal = all(arg1.type == arg2.type for arg1, arg2 in zip(args1, args2)) - smaller_or_equal = all(str(arg1.type) == str(arg2.type) - or is_arg_smaller(arg1.type, arg2.type) - for arg1, arg2 in zip(args1, args2)) + smaller_or_equal = all( + str(arg1.type) == str(arg2.type) or is_arg_smaller(arg1.type, arg2.type) + for arg1, arg2 in zip(args1, args2) + ) return smaller_or_equal and not equal # First sort by signature - grouped_overloads = sorted(grouped_overloads, key=lambda x: x.signature.signature_str()) + grouped_overloads = sorted( + grouped_overloads, key=lambda x: x.signature.signature_str() + ) # Construct the relation graph larger_than: Dict[int, Set[int]] = defaultdict(set) @@ -934,39 +1164,43 @@ def is_smaller(s1: PythonSignature, s2: PythonSignature) -> bool: return list(map(lambda x: grouped_overloads[x], sorted_ids)) + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # # Codegen API Integration # # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + def emit_single_dispatch( ps: PythonSignature, f: NativeFunction, namedtuple_typenames: Dict[str, str] ) -> str: """ Emit dispatch code for a single native function. """ + @with_native_function def go(f: NativeFunction) -> str: # header comments - deprecated = '[deprecated] ' if ps.deprecated else '' - schema_comment = f'// {deprecated}aten::{f.func}' + deprecated = "[deprecated] " if ps.deprecated else "" + schema_comment = f"// {deprecated}aten::{f.func}" # dispatch lambda signature name = cpp.name(f.func) - lambda_formals = ', '.join(map(lambda a: f"{a.type_str} {a.name}", - dispatch_lambda_args(ps, f))) + lambda_formals = ", ".join( + map(lambda a: f"{a.type_str} {a.name}", dispatch_lambda_args(ps, f)) + ) lambda_return = dispatch_lambda_return_str(f) # dispatch lambda body dispatch_callee = cpp_dispatch_target(f) - dispatch_args = ', '.join(cpp_dispatch_exprs(f, python_signature=ps)) + dispatch_args = ", ".join(cpp_dispatch_exprs(f, python_signature=ps)) # from arg parser outputs to dispatch lambda arguments parser_outputs = arg_parser_output_exprs(ps, f) lambda_arg_exprs = dispatch_lambda_exprs(ps, f) - inits = '\n'.join(lambda_arg_exprs.inits) - lambda_args = ', '.join(lambda_arg_exprs.exprs) + inits = "\n".join(lambda_arg_exprs.inits) + lambda_args = ", ".join(lambda_arg_exprs.exprs) # scatter fields # TODO: Checking `ps.method and ('requires_grad' in parser_outputs)` is a hacky @@ -974,12 +1208,17 @@ def go(f: NativeFunction) -> str: # new_full, new_empty, and new_zeros. 
A much better but more difficult to # implement solution involves refactoring according to Ed's description here: # https://github.com/pytorch/pytorch/issues/36455#issuecomment-614767589 - need_set_requires_grad = ps.tensor_options_args and (not has_tensor_options(f) or ( - ps.method and ('requires_grad' in parser_outputs))) - set_requires_grad = f'.set_requires_grad({parser_outputs["requires_grad"].expr})' \ - if need_set_requires_grad else '' + need_set_requires_grad = ps.tensor_options_args and ( + not has_tensor_options(f) + or (ps.method and ("requires_grad" in parser_outputs)) + ) + set_requires_grad = ( + f'.set_requires_grad({parser_outputs["requires_grad"].expr})' + if need_set_requires_grad + else "" + ) - if lambda_return == 'void': + if lambda_return == "void": return f"""\ {schema_comment} {inits} @@ -992,7 +1231,7 @@ def go(f: NativeFunction) -> str: """ else: typename = namedtuple_typenames.get(gen_namedtuple_typename_key(f)) - namedtuple_typeref = f'{typename}, ' if typename is not None else '' + namedtuple_typeref = f"{typename}, " if typename is not None else "" return f"""\ {schema_comment} {inits} diff --git a/tools/autograd/gen_trace_type.py b/tools/autograd/gen_trace_type.py index 1b9cc7eec294..8072c6cad2d9 100644 --- a/tools/autograd/gen_trace_type.py +++ b/tools/autograd/gen_trace_type.py @@ -1,13 +1,17 @@ import itertools from typing import List, Sequence, Union, Dict -from tools.codegen.api.types import DispatcherSignature -from tools.codegen.api import cpp -from tools.codegen.code_template import CodeTemplate -from tools.codegen.context import with_native_function -from tools.codegen.utils import FileManager -from tools.codegen.model import (Argument, NativeFunction, SchemaKind, - TensorOptionsArguments) +from torchgen.api.types import DispatcherSignature +from torchgen.api import cpp +from torchgen.code_template import CodeTemplate +from torchgen.context import with_native_function +from torchgen.utils import FileManager +from torchgen.model import ( + Argument, + NativeFunction, + SchemaKind, + TensorOptionsArguments, +) # Note [Manual Backend kernels] # For these ops, we want to manually register to dispatch key Backend and @@ -19,16 +23,33 @@ # - all ops below are part of MANUAL_TRACER to skip codegen Tracer kernel registration # Note: we still register to dispatch key Profiler for these ops, keeping it untouched for now. # You can find the manual registration in torch/csrc/autograd/VariableTypeManual.cpp -MANUAL_BACKEND = set([ - 'options', 'data', 'set_data', 'is_leaf', 'output_nr', '_version', 'retain_grad', - '_backward', 'requires_grad_', -]) +MANUAL_BACKEND = set( + [ + "options", + "data", + "set_data", + "is_leaf", + "output_nr", + "_version", + "retain_grad", + "_backward", + "requires_grad_", + ] +) # For these ops we want to skip the codegen-ed registration to both Autograd and Tracer keys. # You can find the manual registration in torch/csrc/autograd/VariableTypeManual.cpp -MANUAL_AUTOGRAD_AND_TRACER = set([ - 'resize_', 'resize_as_', 'detach', 'detach_', 'copy_', '_fw_primal', '_make_dual', -]) +MANUAL_AUTOGRAD_AND_TRACER = set( + [ + "resize_", + "resize_as_", + "detach", + "detach_", + "copy_", + "_fw_primal", + "_make_dual", + ] +) # Currently MANUAL_AUTOGRAD and MANUAL_TRACER share the same set of ops: # union(MANUAL_BACKEND, MANUAL_AUTOGRAD_AND_TRACER) @@ -41,45 +62,65 @@ # on demand. Only concrete ATen methods can be disabled this way; it will have # NO EFFECT otherwise. 
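# Illustrative sketch (not part of this diff; simplified stand-in types instead
# of the real NativeFunction model): the three checks the tracer codegen applies
# before emitting a tracing kernel, mirroring should_trace() below.
DONT_RECORD_TRACE_EXAMPLE = {"convolution", "lstm_cell", "_coalesced"}

def should_trace_sketch(base_name, arg_types, return_types):
    if any(t in {"Storage", "Type", "ConstQuantizerPtr"} for t in arg_types):
        return False                      # Storage/Type arguments are not traceable
    if not any(t.startswith("Tensor") for t in return_types):
        return False                      # nothing tensor-like to record as an output
    return base_name not in DONT_RECORD_TRACE_EXAMPLE

assert should_trace_sketch("add", ["Tensor", "Tensor", "Scalar"], ["Tensor"])
assert not should_trace_sketch("convolution", ["Tensor", "Tensor"], ["Tensor"])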
DONT_RECORD_TRACE = { - 'convolution', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d', - 'conv_transpose2d', 'conv_transpose3d', 'lstm_cell', 'gru_cell', - 'rnn_tanh_cell', 'rnn_relu_cell', + "convolution", + "conv1d", + "conv2d", + "conv3d", + "conv_transpose1d", + "conv_transpose2d", + "conv_transpose3d", + "lstm_cell", + "gru_cell", + "rnn_tanh_cell", + "rnn_relu_cell", # FIXME: figure out a better way when we support sparse tensors in jit - '_coalesced', + "_coalesced", } + def should_trace(f: NativeFunction) -> bool: # Operations involving Storage or Type are not traceable at the moment - if any(str(arg.type) in {'Storage', 'Type', 'ConstQuantizerPtr'} - for arg in f.func.schema_order_arguments()): + if any( + str(arg.type) in {"Storage", "Type", "ConstQuantizerPtr"} + for arg in f.func.schema_order_arguments() + ): return False # We can't trace functions which don't have any Tensor or TensorList returns if not any(r.type.is_tensor_like() for r in f.func.returns): return False return f.func.name.name.base not in DONT_RECORD_TRACE -SELECT = CodeTemplate("""\ + +SELECT = CodeTemplate( + """\ if (${cond}) { ${true} } else { ${false} } -""") +""" +) -OP_NAME = CodeTemplate("""\ +OP_NAME = CodeTemplate( + """\ op_name = c10::Symbol::fromQualString("aten::${trace_name}"); -""") +""" +) # These functions have their names recorded under trace renamed, RENAME_TRACE = { - 'zero': 'zeros_like', # replacing aten::zero_ with aten::zeros_like - 'fill': 'full_like', # replacing aten::fill_ with aten::full_like + "zero": "zeros_like", # replacing aten::zero_ with aten::zeros_like + "fill": "full_like", # replacing aten::fill_ with aten::full_like } + def format_trace_op_name(f: NativeFunction) -> str: # TODO: byte-for-byte compatible with old codegen behavior - should clean up - if f.func.kind() in (SchemaKind.functional, SchemaKind.out) or f.func.name.name.dunder_method: + if ( + f.func.kind() in (SchemaKind.functional, SchemaKind.out) + or f.func.name.name.dunder_method + ): # special case for *_out functions: the in-place and out-of-place ops # are overloaded with the same name in the JIT trace_name = str(f.func.name.name) @@ -94,32 +135,39 @@ def format_trace_op_name(f: NativeFunction) -> str: inplace_trace_name = RENAME_TRACE.get(inplace_trace_name, inplace_trace_name) return SELECT.substitute( - cond='tracer_state->force_outplace', + cond="tracer_state->force_outplace", true=OP_NAME.substitute(trace_name=outplace_trace_name), false=OP_NAME.substitute(trace_name=inplace_trace_name), ) + ADD_TRACE_INPUT = CodeTemplate("""jit::tracer::addInputs(node, "${name}", ${input});""") -def format_trace_inputs(f: NativeFunction) -> str: - def dispatch_trace_input(arg: Union[Argument, TensorOptionsArguments]) -> Sequence[str]: +def format_trace_inputs(f: NativeFunction) -> str: + def dispatch_trace_input( + arg: Union[Argument, TensorOptionsArguments] + ) -> Sequence[str]: if isinstance(arg, TensorOptionsArguments): - name = 'options' + name = "options" return [ - ADD_TRACE_INPUT.substitute(name=name, input='optTypeMetaToScalarType(options.dtype_opt())'), - ADD_TRACE_INPUT.substitute(name=name, input='options.layout()'), - ADD_TRACE_INPUT.substitute(name=name, input='options.device()'), - ADD_TRACE_INPUT.substitute(name=name, input='options.pinned_memory()'), + ADD_TRACE_INPUT.substitute( + name=name, input="optTypeMetaToScalarType(options.dtype_opt())" + ), + ADD_TRACE_INPUT.substitute(name=name, input="options.layout()"), + ADD_TRACE_INPUT.substitute(name=name, input="options.device()"), + 
ADD_TRACE_INPUT.substitute(name=name, input="options.pinned_memory()"), ] else: name = arg.name - if str(arg.type) == 'Tensor?[]': + if str(arg.type) == "Tensor?[]": return [f'jit::tracer::addInputs(node, "{name}", {name});'] else: return [ADD_TRACE_INPUT.substitute(name=name, input=name)] - args: List[Union[Argument, TensorOptionsArguments]] = list(f.func.schema_order_arguments()) + args: List[Union[Argument, TensorOptionsArguments]] = list( + f.func.schema_order_arguments() + ) if f.func.is_out_fn(): # *_out functions take the result as a separate argument, but we don't want to @@ -129,7 +177,9 @@ def dispatch_trace_input(arg: Union[Argument, TensorOptionsArguments]) -> Sequen # there is only one output argument. args = args[:-1] - trace_inputs = itertools.chain.from_iterable(dispatch_trace_input(arg) for arg in args) + trace_inputs = itertools.chain.from_iterable( + dispatch_trace_input(arg) for arg in args + ) if f.func.is_out_fn(): # for *_out functions, handle the result argument differently for inplace/outplace. @@ -141,32 +191,49 @@ def dispatch_trace_input(arg: Union[Argument, TensorOptionsArguments]) -> Sequen # Factories are a bit special because their out-of-place overloads # take an extra TensorOptions argument, which is missing in the _out function has_tensor_return = any(r.type.is_tensor_like() for r in f.func.returns) - has_tensor_input_arg = any(a.type.is_tensor_like() for a in f.func.arguments.flat_non_out) - is_factory_method = f.category_override == 'factory' or (has_tensor_return and not has_tensor_input_arg) + has_tensor_input_arg = any( + a.type.is_tensor_like() for a in f.func.arguments.flat_non_out + ) + is_factory_method = f.category_override == "factory" or ( + has_tensor_return and not has_tensor_input_arg + ) # HACK: preserve old codegen behavior - the old codegen set the `is_factory_method` # flag for the whole family of ops with the same basename if any of them is a # factory method. For most cases the whole family of ops are indeed all factory # method - 'normal' is the only exception. So we handle it specially here to avoid # cloning the old logic. 
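# Illustrative sketch (hypothetical helper with simplified inputs; not part of
# this diff): the factory detection described above -- an op is treated as a
# factory when explicitly overridden as one, or when it returns a Tensor without
# taking any Tensor inputs, with 'normal' special-cased to preserve old codegen behavior.
def is_factory_method_sketch(base_name, category_override,
                             has_tensor_return, has_tensor_input_arg):
    if base_name == "normal":   # legacy special case kept from the old codegen
        return True
    return category_override == "factory" or (
        has_tensor_return and not has_tensor_input_arg
    )

# zeros(int[] size, ...) -> Tensor: tensor return, no tensor inputs -> factory
assert is_factory_method_sketch("zeros", None, True, False)
# add(Tensor self, Tensor other) -> Tensor: takes tensors -> not a factory
assert not is_factory_method_sketch("add", None, True, True)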
- if f.func.name.name.base == 'normal': + if f.func.name.name.base == "normal": is_factory_method = True if is_factory_method: outplace = [ - ADD_TRACE_INPUT.substitute(name='out', input='optTypeMetaToScalarType(out.options().dtype_opt())'), - ADD_TRACE_INPUT.substitute(name='out', input='out.options().layout()'), - ADD_TRACE_INPUT.substitute(name='out', input='out.options().device()'), - ADD_TRACE_INPUT.substitute(name='out', input='out.options().pinned_memory()'), + ADD_TRACE_INPUT.substitute( + name="out", + input="optTypeMetaToScalarType(out.options().dtype_opt())", + ), + ADD_TRACE_INPUT.substitute(name="out", input="out.options().layout()"), + ADD_TRACE_INPUT.substitute(name="out", input="out.options().device()"), + ADD_TRACE_INPUT.substitute( + name="out", input="out.options().pinned_memory()" + ), ] else: outplace = [] trace_inputs = itertools.chain( trace_inputs, - [SELECT.substitute(cond='tracer_state->force_outplace', true='\n'.join(outplace), false=inplace)]) + [ + SELECT.substitute( + cond="tracer_state->force_outplace", + true="\n".join(outplace), + false=inplace, + ) + ], + ) + + return "\n".join(trace_inputs) - return '\n'.join(trace_inputs) # `torch.jit.trace` have undocumented keyword argument `_force_outplace`, # which force jit to replace functions with outplace variants (for @@ -191,29 +258,32 @@ def dispatch_trace_input(arg: Union[Argument, TensorOptionsArguments]) -> Sequen # - Or keep `aten::zeros_like` arguments aligned with `aten::zero_` # arguments (inside of the `native_functions.yaml`) RENAME_TRACE_ADD_ARGS = { - 'fill': '''\ + "fill": """\ jit::tracer::addInputs(node, "options", c10::optional()); jit::tracer::addInputs(node, "options", layout_or_default(c10::nullopt)); jit::tracer::addInputs(node, "options", device_or_default(c10::nullopt)); jit::tracer::addInputs(node, "options", pinned_memory_or_default(c10::nullopt)); c10::optional memory_format = c10::MemoryFormat::Preserve; jit::tracer::addInputs(node, "memory_format", memory_format); -''', - 'zero': '''\ +""", + "zero": """\ jit::tracer::addInputs(node, "options", c10::optional()); jit::tracer::addInputs(node, "options", layout_or_default(c10::nullopt)); jit::tracer::addInputs(node, "options", device_or_default(c10::nullopt)); jit::tracer::addInputs(node, "options", pinned_memory_or_default(c10::nullopt)); c10::optional memory_format = c10::MemoryFormat::Preserve; jit::tracer::addInputs(node, "memory_format", memory_format); -''', +""", } -INPLACE_GUARD = CodeTemplate("""\ +INPLACE_GUARD = CodeTemplate( + """\ jit::tracer::ensureUniqueIfOutOfPlaced("${name}", ${mutable_input}); -""") +""" +) -PRE_RECORD_TRACE = CodeTemplate("""\ +PRE_RECORD_TRACE = CodeTemplate( + """\ torch::jit::Node* node = nullptr; std::shared_ptr tracer_state; if (jit::tracer::isTracing()) { @@ -227,40 +297,59 @@ def dispatch_trace_input(arg: Union[Argument, TensorOptionsArguments]) -> Sequen ${inplace_guard} jit::tracer::setTracingState(nullptr); } -""") +""" +) + def format_prerecord_trace(f: NativeFunction) -> str: if not should_trace(f): - return '' + return "" # TODO: clean up old codegen behavior - is_inplace = f.func.kind() in (SchemaKind.inplace, SchemaKind.out) and not f.func.name.name.dunder_method - add_args = RENAME_TRACE_ADD_ARGS.get(f.func.name.name.base, '') if is_inplace else '' - additional_inputs = SELECT.substitute( - cond='tracer_state->force_outplace', - true=add_args, - false='', - ) if add_args else '' + is_inplace = ( + f.func.kind() in (SchemaKind.inplace, SchemaKind.out) + and not 
f.func.name.name.dunder_method + ) + add_args = ( + RENAME_TRACE_ADD_ARGS.get(f.func.name.name.base, "") if is_inplace else "" + ) + additional_inputs = ( + SELECT.substitute( + cond="tracer_state->force_outplace", + true=add_args, + false="", + ) + if add_args + else "" + ) return PRE_RECORD_TRACE.substitute( set_op_name=format_trace_op_name(f), add_trace_inputs=format_trace_inputs(f) + additional_inputs, inplace_guard=INPLACE_GUARD.substitute( name=cpp.name(f.func), - mutable_input=f.func.arguments.out[0].name if f.func.arguments.out else 'self', - ) if is_inplace else '', + mutable_input=f.func.arguments.out[0].name + if f.func.arguments.out + else "self", + ) + if is_inplace + else "", ) -POST_RECORD_TRACE = CodeTemplate("""\ + +POST_RECORD_TRACE = CodeTemplate( + """\ if (tracer_state) { jit::tracer::setTracingState(std::move(tracer_state)); ${add_trace_outputs} } -""") +""" +) + def format_postrecord_trace(f: NativeFunction) -> str: if not should_trace(f): - return '' + return "" # For outplacing ops, *_out overloads require special handling to move the # output *argument* to a return value @@ -271,29 +360,37 @@ def format_postrecord_trace(f: NativeFunction) -> str: # Code size optimization: the common case is that the return value is # the same for both variants if output_names_outplace == output_names_inplace: - outputs = [f'jit::tracer::addOutput(node, {n});' for n in output_names_outplace] + outputs = [ + f"jit::tracer::addOutput(node, {n});" for n in output_names_outplace + ] return POST_RECORD_TRACE.substitute(add_trace_outputs=outputs) selection = SELECT.substitute( - cond='force_outplace', - true='\n'.join(f'jit::tracer::addOutput(node, {n});' for n in output_names_outplace), - false='\n'.join(f'jit::tracer::addOutput(node, {n});' for n in output_names_inplace), + cond="force_outplace", + true="\n".join( + f"jit::tracer::addOutput(node, {n});" for n in output_names_outplace + ), + false="\n".join( + f"jit::tracer::addOutput(node, {n});" for n in output_names_inplace + ), ) return POST_RECORD_TRACE.substitute(add_trace_outputs=selection) else: output_names = cpp.return_names(f) - outputs = [f'jit::tracer::addOutput(node, {n});' for n in output_names] + outputs = [f"jit::tracer::addOutput(node, {n});" for n in output_names] return POST_RECORD_TRACE.substitute(add_trace_outputs=outputs) + def declare_returned_variables(f: NativeFunction) -> str: modifies_arguments = f.func.kind() in (SchemaKind.inplace, SchemaKind.out) if modifies_arguments: - return '' + return "" if len(f.func.returns) == 1: - return '' + return "" types = map(cpp.return_type, f.func.returns) names = cpp.return_names(f) - return '\n'.join(f'{type.cpp_type()} {name};' for type, name in zip(types, names)) + return "\n".join(f"{type.cpp_type()} {name};" for type, name in zip(types, names)) + def tie_return_values(f: NativeFunction) -> str: if len(f.func.returns) == 1: @@ -301,6 +398,7 @@ def tie_return_values(f: NativeFunction) -> str: names = cpp.return_names(f) return f'std::tie({", ".join(names)})' + def get_return_value(f: NativeFunction) -> str: names = cpp.return_names(f) if len(f.func.returns) == 1: @@ -308,11 +406,15 @@ def get_return_value(f: NativeFunction) -> str: if f.func.kind() == SchemaKind.out: return f'std::forward_as_tuple({", ".join(names)})' else: - moved = ", ".join(f'std::move({name})' for name in names) - return f'std::make_tuple({moved})' + moved = ", ".join(f"std::move({name})" for name in names) + return f"std::make_tuple({moved})" + + +TRACE_DISPATCH = CodeTemplate( + """\ 
+${assign_return_values}at::_ops::${unambiguous_name}::redispatch(${unpacked_args});""" +) -TRACE_DISPATCH = CodeTemplate("""\ -${assign_return_values}at::_ops::${unambiguous_name}::redispatch(${unpacked_args});""") def emit_trace_body(f: NativeFunction) -> List[str]: trace_body: List[str] = [] @@ -325,47 +427,59 @@ def emit_trace_body(f: NativeFunction) -> List[str]: # code-generated tracing kernels plumb and recompute dispatch keys directly through the kernel for performance. # See Note [Plumbing Keys Through The Dispatcher] for details. - dispatch_key_set = 'ks & c10::DispatchKeySet(c10::DispatchKeySet::FULL_AFTER, c10::DispatchKey::Tracer)' - redispatch_args = ', '.join([dispatch_key_set] + [a.expr for a in dispatcher_exprs]) + dispatch_key_set = "ks & c10::DispatchKeySet(c10::DispatchKeySet::FULL_AFTER, c10::DispatchKey::Tracer)" + redispatch_args = ", ".join([dispatch_key_set] + [a.expr for a in dispatcher_exprs]) - assign_return_values = f'{tie_return_values(f)} = ' \ - if f.func.kind() == SchemaKind.functional and f.func.returns else '' + assign_return_values = ( + f"{tie_return_values(f)} = " + if f.func.kind() == SchemaKind.functional and f.func.returns + else "" + ) # Note that this calls the slow, dispatching variants of manual_cpp_binding ops. # We could probably work harder to ensure that the fast variants are called instead, but the perf benefit would be minimal. - trace_body.append(TRACE_DISPATCH.substitute( - assign_return_values=assign_return_values, - unambiguous_name=f.func.name.unambiguous_name(), - unpacked_args=redispatch_args, - )) + trace_body.append( + TRACE_DISPATCH.substitute( + assign_return_values=assign_return_values, + unambiguous_name=f.func.name.unambiguous_name(), + unpacked_args=redispatch_args, + ) + ) trace_body.append(format_postrecord_trace(f)) if f.func.returns: - trace_body.append(f'return {get_return_value(f)};') + trace_body.append(f"return {get_return_value(f)};") return trace_body -METHOD_DEFINITION = CodeTemplate("""\ + +METHOD_DEFINITION = CodeTemplate( + """\ ${return_type} ${type_wrapper_name}(${formals}) { ${type_definition_body} } -""") +""" +) + def type_wrapper_name(f: NativeFunction) -> str: if f.func.name.overload_name: - return f'{cpp.name(f.func)}_{f.func.name.overload_name}' + return f"{cpp.name(f.func)}_{f.func.name.overload_name}" else: return cpp.name(f.func) + @with_native_function def method_definition(f: NativeFunction) -> str: assert cpp.name(f.func) not in MANUAL_TRACER - formals = ', '.join( + formals = ", ".join( # code-generated tracing kernels plumb and recompute dispatch keys directly through the kernel for performance. # See Note [Plumbing Keys Through The Dispatcher] for details. 
- ['c10::DispatchKeySet ks'] + - [f'{cpp.argument_type(a, binds="__placeholder__").cpp_type()} {a.name}' - for a in f.func.schema_order_arguments()] + ["c10::DispatchKeySet ks"] + + [ + f'{cpp.argument_type(a, binds="__placeholder__").cpp_type()} {a.name}' + for a in f.func.schema_order_arguments() + ] ) return METHOD_DEFINITION.substitute( @@ -375,11 +489,15 @@ def method_definition(f: NativeFunction) -> str: type_definition_body=emit_trace_body(f), ) -WRAPPER_REGISTRATION = CodeTemplate("""\ + +WRAPPER_REGISTRATION = CodeTemplate( + """\ m.impl("${name}", TORCH_FN(${class_type}::${type_wrapper_name}) ); -""") +""" +) + @with_native_function def method_registration(f: NativeFunction) -> str: @@ -388,31 +506,36 @@ def method_registration(f: NativeFunction) -> str: return WRAPPER_REGISTRATION.substitute( name=f.func.name, type_wrapper_name=type_wrapper_name(f), - class_type='TraceType', + class_type="TraceType", ) -def gen_trace_type_func( - fn: NativeFunction -) -> Dict[str, List[str]]: + +def gen_trace_type_func(fn: NativeFunction) -> Dict[str, List[str]]: return { - 'ops_headers': [f'#include '], - 'trace_method_definitions': [method_definition(fn)], - 'trace_wrapper_registrations': [method_registration(fn)], + "ops_headers": [f"#include "], + "trace_method_definitions": [method_definition(fn)], + "trace_wrapper_registrations": [method_registration(fn)], } -def gen_trace_type(out: str, native_functions: List[NativeFunction], template_path: str) -> None: + +def gen_trace_type( + out: str, native_functions: List[NativeFunction], template_path: str +) -> None: # NOTE: see Note [Sharded File] at the top of the VariableType.cpp # template regarding sharding of the generated files. fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False) fm.write_sharded( - 'TraceType.cpp', + "TraceType.cpp", [fn for fn in native_functions if cpp.name(fn.func) not in MANUAL_TRACER], key_fn=lambda fn: fn.root_name, base_env={ - 'generated_comment': - f'@generated from {template_path}/TraceType.cpp', + "generated_comment": f"@generated from {template_path}/TraceType.cpp", }, env_callable=gen_trace_type_func, num_shards=5, - sharded_keys={'ops_headers', 'trace_method_definitions', 'trace_wrapper_registrations'} + sharded_keys={ + "ops_headers", + "trace_method_definitions", + "trace_wrapper_registrations", + }, ) diff --git a/tools/autograd/gen_variable_factories.py b/tools/autograd/gen_variable_factories.py index 1a09902e86ec..26eb2d91595d 100644 --- a/tools/autograd/gen_variable_factories.py +++ b/tools/autograd/gen_variable_factories.py @@ -5,13 +5,13 @@ import re from typing import Optional, List -from tools.codegen.api.types import CppSignatureGroup -from tools.codegen.api import cpp -import tools.codegen.api.python as python -from tools.codegen.gen import parse_native_yaml -from tools.codegen.context import with_native_function -from tools.codegen.utils import mapMaybe, FileManager -from tools.codegen.model import NativeFunction, TensorOptionsArguments, Variant +from torchgen.api.types import CppSignatureGroup +from torchgen.api import cpp +import torchgen.api.python as python +from torchgen.gen import parse_native_yaml +from torchgen.context import with_native_function +from torchgen.utils import mapMaybe, FileManager +from torchgen.model import NativeFunction, TensorOptionsArguments, Variant OPTIONAL_TYPE_PATTERN = re.compile(r"c10::optional<(.+)>") TYPE_PATTERN = re.compile(r"(?:const\s+)?([A-Z]\w+)") @@ -20,28 +20,41 @@ # TODO: maybe update the cpp argument API to take optional 
namespace argument? def fully_qualified_type(argument_type: str) -> str: def maybe_optional_type(type: str, is_opt: bool) -> str: - return f'c10::optional<{type}>' if is_opt else type + return f"c10::optional<{type}>" if is_opt else type opt_match = OPTIONAL_TYPE_PATTERN.match(argument_type) is_opt = opt_match is not None if opt_match: - argument_type = argument_type[opt_match.start(1):opt_match.end(1)] + argument_type = argument_type[opt_match.start(1) : opt_match.end(1)] match = TYPE_PATTERN.match(argument_type) if match is None: return maybe_optional_type(argument_type, is_opt) index = match.start(1) - qualified_type = f'{argument_type[:index]}at::{argument_type[index:]}' + qualified_type = f"{argument_type[:index]}at::{argument_type[index:]}" return maybe_optional_type(qualified_type, is_opt) -def gen_variable_factories(out: str, native_yaml_path: str, template_path: str) -> None: - native_functions = parse_native_yaml(native_yaml_path).native_functions + +def gen_variable_factories( + out: str, native_yaml_path: str, tags_yaml_path: str, template_path: str +) -> None: + native_functions = parse_native_yaml( + native_yaml_path, tags_yaml_path + ).native_functions factory_functions = [fn for fn in native_functions if is_factory_function(fn)] fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False) - fm.write_with_template('variable_factories.h', 'variable_factories.h', lambda: { - 'generated_comment': '@' + f'generated from {fm.template_dir}/variable_factories.h', - 'ops_headers': [f'#include ' for fn in factory_functions], - 'function_definitions': list(mapMaybe(process_function, factory_functions)), - }) + fm.write_with_template( + "variable_factories.h", + "variable_factories.h", + lambda: { + "generated_comment": "@" + + f"generated from {fm.template_dir}/variable_factories.h", + "ops_headers": [ + f"#include " for fn in factory_functions + ], + "function_definitions": list(mapMaybe(process_function, factory_functions)), + }, + ) + @with_native_function def is_factory_function(f: NativeFunction) -> bool: @@ -52,6 +65,7 @@ def is_factory_function(f: NativeFunction) -> bool: has_tensor_options = python.has_tensor_options(f) return has_tensor_options or name.endswith("_like") + @with_native_function def process_function(f: NativeFunction) -> Optional[str]: name = cpp.name(f.func) @@ -64,22 +78,22 @@ def process_function(f: NativeFunction) -> Optional[str]: sig = CppSignatureGroup.from_native_function(f, method=False).signature formals: List[str] = [] exprs: List[str] = [] - requires_grad = 'false' + requires_grad = "false" for arg in sig.arguments(): qualified_type = fully_qualified_type(arg.type) if arg.default: - formals.append(f'{qualified_type} {arg.name} = {arg.default}') + formals.append(f"{qualified_type} {arg.name} = {arg.default}") else: - formals.append(f'{qualified_type} {arg.name}') + formals.append(f"{qualified_type} {arg.name}") if isinstance(arg.argument, TensorOptionsArguments): # note: we remove the requires_grad setting from the TensorOptions because # it is ignored anyways (and we actually have an assertion that it isn't set # which would fail otherwise). We handle requires_grad explicitly here # instead of passing it through to the kernel. - exprs.append(f'at::TensorOptions({arg.name}).requires_grad(c10::nullopt)') + exprs.append(f"at::TensorOptions({arg.name}).requires_grad(c10::nullopt)") # Manually set the requires_grad bit on the result tensor. 
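# Illustrative sketch (plain Python with a stand-in tensor type; not part of
# this diff): the behavior the generated factory wrapper implements -- the
# requires_grad flag never reaches the kernel through TensorOptions and is
# instead set on the returned tensor afterwards.
class _FakeTensor:
    def __init__(self):
        self.requires_grad = False

def _kernel(size):
    return _FakeTensor()          # the dispatched kernel never sees requires_grad

def variable_factory_sketch(size, requires_grad=False):
    out = _kernel(size)
    out.requires_grad = requires_grad   # the wrapper applies the flag to the result
    return out

assert variable_factory_sketch(3, requires_grad=True).requires_grad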
- requires_grad = f'{arg.name}.requires_grad()' + requires_grad = f"{arg.name}.requires_grad()" else: exprs.append(arg.name) diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 4b634146dfed..78e8e4edce13 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -27,103 +27,329 @@ # from .context import with_native_function_with_differentiability_info from .gen_trace_type import ( - MANUAL_BACKEND, MANUAL_AUTOGRAD_AND_TRACER, declare_returned_variables, - tie_return_values, get_return_value, type_wrapper_name, + MANUAL_BACKEND, + MANUAL_AUTOGRAD_AND_TRACER, + declare_returned_variables, + tie_return_values, + get_return_value, + type_wrapper_name, ) from .gen_inplace_or_view_type import ( - get_view_info, is_tensor_type, is_tensor_list_type, unpack_args, get_base_name, - use_derived, modifies_arguments, WRAPPER_REGISTRATION, TMP_VAR, METHOD_DEFINITION, - ASSIGN_RETURN_VALUE, gen_formals, ALL_VIEW_FUNCTIONS, unpacked_name, - AUTOGRAD_NOT_IMPLEMENTED_REGISTRATION -) - -from tools.codegen.api.types import (Binding, DispatcherSignature, BaseCType, intArrayRefT, - tensorT, tensorListT, MutRefCType, OptionalCType, - ListCType, SpecialArgName, scalarT, stringT, - VectorCType) -from tools.codegen.api.autograd import ( - DifferentiableInput, NativeFunctionWithDifferentiabilityInfo, - SavedAttribute, dispatch_strategy, gen_differentiable_outputs, - is_differentiable) -from tools.codegen.api import cpp -from tools.codegen.code_template import CodeTemplate -from tools.codegen.context import native_function_manager, with_native_function -from tools.codegen.utils import mapMaybe, FileManager -from tools.codegen.model import (Argument, NativeFunction, SchemaKind, - SelfArgument, TensorOptionsArguments, - BaseType, ListType) -from typing import Callable, List, Optional, Sequence, Union, Dict + get_view_info, + is_tensor_type, + is_tensor_list_type, + unpack_args, + get_base_name, + use_derived, + modifies_arguments, + WRAPPER_REGISTRATION, + TMP_VAR, + METHOD_DEFINITION, + ASSIGN_RETURN_VALUE, + gen_formals, + ALL_VIEW_FUNCTIONS, + unpacked_name, + AUTOGRAD_NOT_IMPLEMENTED_REGISTRATION, +) + +from torchgen.api.types import ( + Binding, + DispatcherSignature, + BaseCType, + intArrayRefT, + tensorT, + tensorListT, + MutRefCType, + OptionalCType, + ListCType, + SpecialArgName, + scalarT, + stringT, + TupleCType, + VectorCType, +) +from torchgen.api.autograd import ( + DifferentiableInput, + NativeFunctionWithDifferentiabilityInfo, + SavedAttribute, + dispatch_strategy, + gen_differentiable_outputs, + is_differentiable, +) +from torchgen.api import cpp +from torchgen.code_template import CodeTemplate +from torchgen.context import native_function_manager, with_native_function +from torchgen.utils import mapMaybe, FileManager +from torchgen.model import ( + Argument, + NativeFunction, + SchemaKind, + SelfArgument, + TensorOptionsArguments, + BaseType, + ListType, +) +from typing import Callable, List, Optional, Sequence, Tuple, Union, Dict # We don't set or modify grad_fn on these methods. Generally, they return # tensors that have requires_grad=False. In-place functions listed here will # not examine or modify requires_grad or grad_fn. 
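# Illustrative sketch (simplified inputs; not part of this diff): how this skip
# list feeds into the decision made later in emit_body() -- an op only gets a
# grad_fn when it is not listed here and has both differentiable inputs and
# differentiable outputs.
DONT_REQUIRE_DERIVATIVE_EXAMPLE = {"ones_like", "argmax", "isnan"}

def requires_derivative_sketch(name, n_diff_inputs, n_diff_outputs):
    undifferentiable = name in DONT_REQUIRE_DERIVATIVE_EXAMPLE
    return (not undifferentiable) and n_diff_inputs > 0 and n_diff_outputs > 0

assert requires_derivative_sketch("mul", 2, 1)
assert not requires_derivative_sketch("argmax", 1, 1)   # integer output, skipped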
DONT_REQUIRE_DERIVATIVE = { # These only depend on the input Tensor's shape and device, not the data - 'ones_like', 'zeros_like', 'rand_like', 'randn_like', + "ones_like", + "zeros_like", + "rand_like", + "randn_like", # These are only implemented on integral types - '__and__', '__iand__', '__ilshift__', '__ior__', '__irshift__', '__ixor__', - '__lshift__', '__or__', '__rshift__', '__xor__', + "__and__", + "__iand__", + "__ilshift__", + "__ior__", + "__irshift__", + "__ixor__", + "__lshift__", + "__or__", + "__rshift__", + "__xor__", # These work on integral data types, and hence don't require derivative - '_sobol_engine_draw', '_sobol_engine_ff', '_sobol_engine_scramble_', - '_sobol_engine_initialize_state_', + "_sobol_engine_draw", + "_sobol_engine_ff", + "_sobol_engine_scramble_", + "_sobol_engine_initialize_state_", # This is an unsafe method that is meant to be out of reach of autograd. - '_coalesced_', + "_coalesced_", # Quantize functions should not record gradients - 'quantize_per_tensor', 'quantize_per_channel', + "quantize_per_tensor", + "quantize_per_channel", # Functions that return integers should not have output that require gradients - 'argmax', 'argmin', 'argsort', 'searchsorted', - 'bucketize', + "argmax", + "argmin", + "argsort", + "searchsorted", + "bucketize", # Functions that return booleans are not differentiable - 'isnan', 'isposinf', 'isneginf', 'isinf', 'signbit', 'isin', + "isnan", + "isposinf", + "isneginf", + "isinf", + "signbit", + "isin", # Functions return none are not differentiable - 'record_stream', + "record_stream", # These functions are not differentiable - 'logical_and', 'logical_xor', 'logical_not', 'logical_or', + "logical_and", + "logical_xor", + "logical_not", + "logical_or", } # The C -> R functions at the time of adding this are still being audited and tested # but will not error out. 
# C -> C, R -> C functions for which backward is correctly implemented and tested GRADIENT_IMPLEMENTED_FOR_COMPLEX = { - 't', 'view', 'reshape', 'reshape_as', 'view_as', 'roll', 'clone', 'diag_embed', - 'repeat', 'expand', 'flip', 'fliplr', 'flipud', 'rot90', 'transpose', - 'permute', 'squeeze', 'unsqueeze', 'resize', 'resize_as', 'tril', - 'triu', 'chunk', 'zero_', 'eq_', 'ne_', 'add', '__radd__', 'sum', - '_conj', 'sin', 'cos', 'mul', 'sinc', 'sinh', 'cosh', '__rmul__', - 'sgn', 'asin', 'acos', 'sub', 'div', 'cat', 'view_as_complex', 'index_put', - 'neg', 'complex', 'select', '_s_where', 'as_strided', 'slice', 'constant_pad_nd', - 'unbind', 'split', 'split_with_sizes', 'unsafe_split', 'split_with_sizes_backward', - 'dot', 'vdot', 'cholesky', 'triangular_solve', 'mm', '_unsafe_view', 'mv', 'outer', - 'bmm', 'diagonal', 'alias', 'atan', 'log', 'log10', 'log1p', 'log2', 'reciprocal', - 'tan', 'pow', 'rsqrt', 'tanh', 'tanh_backward', 'asinh', 'acosh', 'atanh', 'take', 'fill_', - 'exp', 'nonzero', 'mean', 'inverse', 'solve', 'linalg_cholesky', 'addcmul', 'addcdiv', - 'matrix_exp', 'linalg_matrix_exp', 'linalg_eigh', 'cholesky_solve', 'linalg_qr', '_linalg_svd', '_fft_c2c', '_fft_r2c', - 'linalg_solve', 'sqrt', 'stack', 'gather', 'index_select', 'index_add_', 'linalg_inv', 'linalg_inv_ex', - 'l1_loss_backward', 'baddbmm', 'addbmm', 'addmm', 'addmv', 'addr', 'linalg_householder_product', - 'constant_pad_nd', 'reflection_pad1d', 'reflection_pad2d', 'reflection_pad3d', 'linalg_cholesky_ex', 'linalg_eig', - 'select_backward', 'diagonal_backward', 'slice_backward', - 'reflection_pad1d_backward', 'reflection_pad2d_backward', 'reflection_pad3d_backward', 'symeig', '_sparse_sparse_matmul', - 'replication_pad1d', 'replication_pad2d', 'replication_pad3d', 'take', 'put_', '_to_copy', - 'replication_pad1d_backward', 'replication_pad2d_backward', 'replication_pad3d_backward', - 'diag', 'masked_scatter', 'masked_select', 'index_add', 'index_fill', 'trace', 'polar', 'cumsum', 'rsub', - 'eig', 'lerp', 'linalg_vector_norm', 'cumprod', 'prod', 'index_copy', 'lu', 'unfold', 'unfold_backward', - 'index', 'masked_fill', 'linalg_cross', 'lu_unpack', 'renorm', '_conj_physical', 'linalg_lu_factor_ex', - 'scatter', 'scatter_add', 'sigmoid', 'sigmoid_backward', 'trapezoid', 'cumulative_trapezoid', - 'conj_physical_', '_neg_view', '_reshape_alias', '_det_lu_based_helper', 'lu_solve', - 'linalg_solve_triangular', 'linalg_pinv', 'linalg_lstsq', 'col2im', 'col2im_backward', 'im2col', 'im2col_backward', + "t", + "view", + "reshape", + "reshape_as", + "view_as", + "roll", + "clone", + "diag_embed", + "repeat", + "expand", + "flip", + "fliplr", + "flipud", + "rot90", + "transpose", + "permute", + "squeeze", + "unsqueeze", + "resize", + "resize_as", + "tril", + "triu", + "chunk", + "zero_", + "eq_", + "ne_", + "add", + "__radd__", + "sum", + "_conj", + "sin", + "cos", + "mul", + "sinc", + "sinh", + "cosh", + "__rmul__", + "sgn", + "asin", + "acos", + "sub", + "div", + "cat", + "view_as_complex", + "index_put", + "neg", + "complex", + "select", + "where", + "as_strided", + "slice", + "constant_pad_nd", + "unbind", + "split", + "split_with_sizes", + "unsafe_split", + "split_with_sizes_backward", + "dot", + "vdot", + "cholesky", + "triangular_solve", + "mm", + "_unsafe_view", + "mv", + "outer", + "bmm", + "diagonal", + "alias", + "atan", + "log", + "log10", + "log1p", + "log2", + "reciprocal", + "tan", + "pow", + "rsqrt", + "tanh", + "tanh_backward", + "asinh", + "acosh", + "atanh", + "take", + "fill_", + "exp", + "nonzero", + 
"mean", + "inverse", + "solve", + "linalg_cholesky", + "addcmul", + "addcdiv", + "matrix_exp", + "linalg_matrix_exp", + "linalg_eigh", + "cholesky_solve", + "linalg_qr", + "_linalg_svd", + "_fft_c2c", + "_fft_r2c", + "linalg_solve", + "sqrt", + "stack", + "gather", + "index_select", + "index_add_", + "linalg_inv", + "linalg_inv_ex", + "l1_loss_backward", + "baddbmm", + "addbmm", + "addmm", + "addmv", + "addr", + "linalg_householder_product", + "constant_pad_nd", + "reflection_pad1d", + "reflection_pad2d", + "reflection_pad3d", + "linalg_cholesky_ex", + "linalg_eig", + "select_backward", + "diagonal_backward", + "slice_backward", + "reflection_pad1d_backward", + "reflection_pad2d_backward", + "reflection_pad3d_backward", + "symeig", + "_sparse_sparse_matmul", + "replication_pad1d", + "replication_pad2d", + "replication_pad3d", + "take", + "put_", + "_to_copy", + "replication_pad1d_backward", + "replication_pad2d_backward", + "replication_pad3d_backward", + "diag", + "masked_scatter", + "masked_select", + "index_add", + "index_fill", + "trace", + "polar", + "cumsum", + "rsub", + "eig", + "lerp", + "linalg_vector_norm", + "cumprod", + "prod", + "index_copy", + "lu", + "unfold", + "unfold_backward", + "index", + "masked_fill", + "linalg_cross", + "lu_unpack", + "renorm", + "_conj_physical", + "linalg_lu_factor_ex", + "scatter", + "scatter_add", + "sigmoid", + "sigmoid_backward", + "trapezoid", + "cumulative_trapezoid", + "conj_physical_", + "_neg_view", + "_reshape_alias", + "_det_lu_based_helper", + "lu_solve", + "linalg_solve_triangular", + "linalg_pinv", + "linalg_lstsq", + "col2im", + "col2im_backward", + "im2col", + "im2col_backward", + "cholesky_inverse", + "to_sparse", + "sparse_sampled_addmm", + "linalg_lu", + "pixel_shuffle", + "pixel_unshuffle", } GRADIENT_IMPLEMENTED_FOR_SPARSE_COMPLEX = { - 'to_dense', '_coalesce', 'coalesce', 'values', '_sparse_coo_tensor_with_dims_and_tensors', - 'sparse_mask_helper_cuda', '_sparse_addmm', + "_to_dense", + "_coalesce", + "coalesce", + "values", + "_sparse_coo_tensor_with_dims_and_tensors", + "sparse_mask_helper_cuda", + "_sparse_addmm", } GRADIENT_IMPLEMENTED_FOR_COMPLEX.update(GRADIENT_IMPLEMENTED_FOR_SPARSE_COMPLEX) # Some operators invalidate the grad_accumulator. Let's reset it. -RESET_GRAD_ACCUMULATOR = { - 'set', 'resize' -} +RESET_GRAD_ACCUMULATOR = {"set", "resize"} # NOTE [ TensorImpl and Storage Pointer Sanity Checks ] # @@ -138,206 +364,282 @@ # the input it is aliased with. Otherwise, its StorageImpl has use_count of 1 # # The following code templates implement the checks for this invariant: -SAVE_TENSOR_STORAGE = CodeTemplate("""\ +SAVE_TENSOR_STORAGE = CodeTemplate( + """\ c10::optional ${tensor_name}_storage_saved = ${tensor_name}.has_storage() ? c10::optional(${tensor_name}.storage()) : c10::nullopt; -""") +""" +) # If tensor_name == out_tensor_name, used to enforce (1), otherwise used for (2) -ENFORCE_SAME_TENSOR_STORAGE = CodeTemplate("""\ +ENFORCE_SAME_TENSOR_STORAGE = CodeTemplate( + """\ if (${tensor_name}_storage_saved.has_value()) AT_ASSERT(${tensor_name}_storage_saved.value().is_alias_of(${out_tensor_name}.storage())); -""") +""" +) -SAVE_TENSORLIST_STORAGE = CodeTemplate("""\ +SAVE_TENSORLIST_STORAGE = CodeTemplate( + """\ std::vector> ${tensorlist_name}_storage_saved(${tensorlist_name}.size()); for (const Tensor& tensor : ${tensorlist_name}) ${tensorlist_name}_storage_saved.push_back( tensor.has_storage() ? 
c10::optional(tensor.storage()) : c10::nullopt); -""") +""" +) -ENFORCE_SAME_TENSORLIST_STORAGE = CodeTemplate("""\ +ENFORCE_SAME_TENSORLIST_STORAGE = CodeTemplate( + """\ for (size_t i=0; i<${tensorlist_name}.size(); i++) { if (${tensorlist_name}_storage_saved[i].has_value()) AT_ASSERT(${tensorlist_name}_storage_saved[i].value().is_alias_of(${tensorlist_name}[i].storage())); } -""") +""" +) -SAVE_OPTIONALTENSORLIST_STORAGE = CodeTemplate("""\ +SAVE_OPTIONALTENSORLIST_STORAGE = CodeTemplate( + """\ std::vector> ${tensorlist_name}_storage_saved(${tensorlist_name}.size()); for (const c10::optional& tensor : ${tensorlist_name}) ${tensorlist_name}_storage_saved.push_back( tensor.has_value() && tensor->has_storage() ? c10::optional(tensor->storage()) : c10::nullopt); -""") +""" +) -ENFORCE_SAME_OPTIONALTENSORLIST_STORAGE = CodeTemplate("""\ +ENFORCE_SAME_OPTIONALTENSORLIST_STORAGE = CodeTemplate( + """\ for (size_t i=0; i<${tensorlist_name}.size(); i++) { if (${tensorlist_name}_storage_saved[i].has_value()) AT_ASSERT(${tensorlist_name}_storage_saved[i].value().is_alias_of( static_cast>(${tensorlist_name}[i])->storage())); } -""") +""" +) -SAVE_TENSOR_IMPL = CodeTemplate("""\ +SAVE_TENSOR_IMPL = CodeTemplate( + """\ c10::intrusive_ptr ${tensor_name}_impl_saved; if (${tensor_name}.defined()) ${tensor_name}_impl_saved = ${tensor_name}.getIntrusivePtr(); -""") +""" +) -ENFORCE_SAME_TENSOR_IMPL = CodeTemplate("""\ +ENFORCE_SAME_TENSOR_IMPL = CodeTemplate( + """\ if (${tensor_name}_impl_saved) AT_ASSERT(${tensor_name}_impl_saved == ${tensor_name}.getIntrusivePtr()); -""") +""" +) -ENFORCE_TENSOR_IMPL_USE_COUNT_LT_OR_EQ_ONE = CodeTemplate("""\ +ENFORCE_TENSOR_IMPL_USE_COUNT_LT_OR_EQ_ONE = CodeTemplate( + """\ AT_ASSERT(${tensor_name}.use_count() <= 1, "function: ${fn_name}"); -""") +""" +) -ENFORCE_TENSOR_STORAGE_USE_COUNT_EQUALS_ONE = CodeTemplate("""\ +ENFORCE_TENSOR_STORAGE_USE_COUNT_EQUALS_ONE = CodeTemplate( + """\ if (${tensor_name}.has_storage()) AT_ASSERT(${tensor_name}.storage().use_count() == 1, "function: ${fn_name}"); -""") +""" +) -SAVE_TENSORLIST_IMPL = CodeTemplate("""\ +SAVE_TENSORLIST_IMPL = CodeTemplate( + """\ std::vector> ${tensorlist_name}_impl_saved(${tensorlist_name}.size()); for (size_t i=0; i<${tensorlist_name}.size(); i++) if (${tensorlist_name}[i].defined()) ${tensorlist_name}_impl_saved[i] = ${tensorlist_name}[i].getIntrusivePtr(); -""") +""" +) -ENFORCE_SAME_TENSORLIST_IMPL = CodeTemplate("""\ +ENFORCE_SAME_TENSORLIST_IMPL = CodeTemplate( + """\ for (size_t i=0; i<${tensorlist_name}.size(); i++) { if (${tensorlist_name}_impl_saved[i]) AT_ASSERT(${tensorlist_name}_impl_saved[i] == ${tensorlist_name}[i].getIntrusivePtr()); } -""") +""" +) -SAVE_OPTIONALTENSORLIST_IMPL = CodeTemplate("""\ +SAVE_OPTIONALTENSORLIST_IMPL = CodeTemplate( + """\ std::vector> ${tensorlist_name}_impl_saved(${tensorlist_name}.size()); for (size_t i=0; i<${tensorlist_name}.size(); i++) { c10::optional t = ${tensorlist_name}[i]; if (t.has_value() && t->defined()) ${tensorlist_name}_impl_saved[i] = t->getIntrusivePtr(); } -""") +""" +) -ENFORCE_SAME_OPTIONALTENSORLIST_IMPL = CodeTemplate("""\ +ENFORCE_SAME_OPTIONALTENSORLIST_IMPL = CodeTemplate( + """\ for (size_t i=0; i<${tensorlist_name}.size(); i++) { if (${tensorlist_name}_impl_saved[i]) AT_ASSERT(${tensorlist_name}_impl_saved[i] == static_cast>(${tensorlist_name}[i])->getIntrusivePtr()); } -""") +""" +) # The following list contains functions that we don't enforce the invariant on. 
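# Illustrative sketch (plain-Python analogy; not part of this diff): the
# save/enforce pattern the templates above generate in debug builds -- capture
# identifying state of each input before the redispatch, then assert the kernel
# did not silently swap it out.
def checked_call_sketch(fn, tensors):
    saved = [id(t) for t in tensors]          # ~ SAVE_TENSOR_IMPL
    result = fn(*tensors)
    for t, s in zip(tensors, saved):          # ~ ENFORCE_SAME_TENSOR_IMPL
        assert id(t) == s, "kernel must not replace an input's TensorImpl"
    return result

assert checked_call_sketch(lambda a, b: a + b, [1.0, 2.0]) == 3.0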
DONT_ENFORCE_SAME_TENSOR_IMPL_OR_STORAGE = { # These functions are expected to change impl or storage of input tensors - 'set_', '_cudnn_rnn_flatten_weight', + "set_", + "_cudnn_rnn_flatten_weight", } DONT_ENFORCE_TENSOR_IMPL_USE_COUNT = { # These non-inplace, non-out functions return tensors with use_count > 1 # Therefore, they MAY (but not necessarily) return one of its inputs as-is # See https://github.com/pytorch/pytorch/issues/60426 for more information - '_embedding_bag', '_embedding_bag_forward_only', - 'q_per_channel_scales', 'q_per_channel_zero_points', - 'lu_unpack', '_cudnn_rnn_backward', - + "_embedding_bag", + "_embedding_bag_forward_only", + "q_per_channel_scales", + "q_per_channel_zero_points", + "lu_unpack", + "_cudnn_rnn_backward", # The below failed StorageImpl use_count check but we skip tensor_impl check # just in case - '_cudnn_rnn', 'dequantize_self', + "_cudnn_rnn", + "dequantize_self", + # lift() should never actually be called with a requires_grad=True tensor, + "lift", } DONT_ENFORCE_STORAGE_IMPL_USE_COUNT = { # These non-view functions return tensors with storage use_count != 1 - '_slow_conv2d_forward', 'slow_conv3d_forward', 'channel_shuffle', - + "_slow_conv2d_forward", + "slow_conv3d_forward", + "channel_shuffle", + # lift() should never actually be called with a requires_grad=True tensor, + "lift", # If an input is returned as-is in output, we cannot guarantee its storage_impl # use count to be 1 either. *DONT_ENFORCE_TENSOR_IMPL_USE_COUNT, } # END CHECKS FOR [ TensorImpl and Storage Pointer Sanity Checks ] -DECLARE_GRAD_FN = CodeTemplate("""\ +DECLARE_GRAD_FN = CodeTemplate( + """\ std::shared_ptr<${op}> grad_fn; -""") +""" +) -SETUP_ANY_REQUIRES_GRAD = CodeTemplate("""\ +SETUP_ANY_REQUIRES_GRAD = CodeTemplate( + """\ auto _any_requires_grad = compute_requires_grad( ${args_with_derivatives} ); ${extra_differentiability_conditions} (void)_any_requires_grad; -""") +""" +) -SETUP_DERIVATIVE = CodeTemplate("""\ +SETUP_DERIVATIVE = CodeTemplate( + """\ if (_any_requires_grad) { ${setup} } -""") +""" +) -SETUP_NONE_REQUIRES_GRAD = CodeTemplate("""\ +SETUP_NONE_REQUIRES_GRAD = CodeTemplate( + """\ if (compute_requires_grad( ${args_to_check} )) { throw_error_out_requires_grad("${base_name}"); } -""") +""" +) -ASSIGN_GRAD_FN = CodeTemplate("""\ +ASSIGN_GRAD_FN = CodeTemplate( + """\ grad_fn = std::shared_ptr<${op}>(new ${op}(${op_ctor}), deleteNode); grad_fn->set_next_edges(collect_next_edges( ${args_with_derivatives} )); -""") +""" +) -CALL_REDISPATCH = CodeTemplate("""\ -at::redispatch::${api_name}(${unpacked_args})""") +CALL_REDISPATCH = CodeTemplate( + """\ +at::redispatch::${api_name}(${unpacked_args})""" +) # If the non-variable operation has return values, we use the `tmp` variable to hold the # values temporarily and pass the values to the return variables outside of the # `at::AutoDispatchBelowAutograd` guard block. 
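# Illustrative sketch using string.Template as a stand-in for CodeTemplate (not
# part of this diff; substitution values are made up): filling the tmp-variable
# pattern described above yields a block that runs the base call under the
# guard and binds its result outside of it.
from string import Template

_tmpl = Template(
    "auto ${tmp_var} = ([&]() {\n"
    "  ${guard}\n"
    "  return ${base_type_call};\n"
    "})();\n"
)
print(_tmpl.substitute(
    tmp_var="_tmp",
    guard="at::AutoDispatchBelowAutograd guard;",
    base_type_call="at::redispatch::add(ks, self, other)",
))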
-DISPATCH_TO_NON_VAR_TYPE_WITH_TMP_RETURN_VALUES = CodeTemplate("""\ +DISPATCH_TO_NON_VAR_TYPE_WITH_TMP_RETURN_VALUES = CodeTemplate( + """\ auto ${tmp_var} = ([&]() { ${guard} return ${base_type_call}; })(); -""") +""" +) -DISPATCH_TO_NON_VAR_TYPE_WITHOUT_RETURN_VALUES = CodeTemplate("""\ +DISPATCH_TO_NON_VAR_TYPE_WITHOUT_RETURN_VALUES = CodeTemplate( + """\ { ${guard} ${base_type_call}; } -""") +""" +) -SET_HISTORY = CodeTemplate("""\ +SET_HISTORY = CodeTemplate( + """\ if (grad_fn) { ${fn}_history(${differentiable_outputs}, grad_fn); } -""") +""" +) -CONDITIONAL = CodeTemplate("""\ +CONDITIONAL = CodeTemplate( + """\ if (${cond}) { ${statements} } -""") +""" +) -RUN_ONLY_IN_DEBUG_MODE = CodeTemplate("""\ +RUN_ONLY_IN_DEBUG_MODE = CodeTemplate( + """\ #ifndef NDEBUG ${statements} #endif -""") +""" +) -FW_DERIVATIVE_CHECK_TEMPLATE = CodeTemplate("""\ +FW_DERIVATIVE_CHECK_TEMPLATE = CodeTemplate( + """\ isFwGradDefined(${req_inp})\ -""") +""" +) -FW_DERIVATIVE_DEFINED_GRAD_TEMPLATE = CodeTemplate("""\ +FW_DERIVATIVE_DEFINED_GRAD_TEMPLATE = CodeTemplate( + """\ auto ${inp}_t_raw = toNonOptFwGrad(${inp}); auto ${inp}_tensor = toNonOptTensor(${inp}); auto ${inp}_t = (${inp}_t_raw.defined() || !${inp}_tensor.defined()) ? ${inp}_t_raw : at::${zeros_fn}(${inp}_tensor.sizes(), ${inp}_tensor.options()); -""") +""" +) -FW_DERIVATIVE_DEFINED_PRIMAL_TEMPLATE = CodeTemplate("""\ +FW_DERIVATIVE_DEFINED_PRIMAL_TEMPLATE = CodeTemplate( + """\ auto ${inp}_p = toNonOptPrimal(${inp}); -""") +""" +) -FW_DERIVATIVE_SETTER_TENSOR = CodeTemplate("""\ +FW_DERIVATIVE_SETTER_TENSOR = CodeTemplate( + """\ if (${out_arg}_new_fw_grad_opt.has_value() && ${out_arg}_new_fw_grad_opt.value().defined()) { // The hardcoded 0 here will need to be updated once we support multiple levels. 
${out_arg}._set_fw_grad(${out_arg}_new_fw_grad_opt.value(), /* level */ 0, /* is_inplace_op */ ${is_inplace}); } -""") +""" +) + +FW_DERIVATIVE_SETTER_MULTI_OUTPUT = CodeTemplate( + """\ +if (${all_res}_new_fw_grad_opt.has_value() && std::get<${idx}>(${all_res}_new_fw_grad_opt.value()).defined()) { + ${out_arg}._set_fw_grad(std::get<${idx}>(${all_res}_new_fw_grad_opt.value()), /* level */ 0, /* is_inplace_op */ false); +} +""" +) -FW_DERIVATIVE_SETTER_TENSOR_LIST = CodeTemplate("""\ +FW_DERIVATIVE_SETTER_TENSOR_LIST = CodeTemplate( + """\ if (${out_arg}_new_fw_grad_opt.has_value()) { auto ${out_arg}_new_fw_grad = ${out_arg}_new_fw_grad_opt.value(); TORCH_INTERNAL_ASSERT(${out_arg}.size() == ${out_arg}_new_fw_grad.size()); @@ -348,29 +650,38 @@ } } } -""") +""" +) -FW_DERIVATIVE_TEMPLATE = CodeTemplate("""\ +FW_DERIVATIVE_TEMPLATE = CodeTemplate( + """\ ${fw_grad_opt_definition} if (${requires_fw_grad}) { ${unpacked_arguments} ${out_arg}_new_fw_grad_opt = ${formula}; } -""") +""" +) -FW_DERIVATIVE_FORBID_TEMPLATE = CodeTemplate("""\ -TORCH_CHECK_NOT_IMPLEMENTED(!(${cond}), "Trying to use forward AD with ${msg} that does not support it."); -""") +FW_DERIVATIVE_FORBID_TEMPLATE = CodeTemplate( + """\ +TORCH_CHECK_NOT_IMPLEMENTED(!(${cond}), "Trying to use forward AD with ${name} that does not support it ${msg}"); +""" +) -FW_DERIVATIVE_FORBID_LIST_TEMPLATE = CodeTemplate("""\ +FW_DERIVATIVE_FORBID_LIST_TEMPLATE = CodeTemplate( + """\ for (const auto& _t: ${arg}) { - TORCH_CHECK_NOT_IMPLEMENTED(!(${cond}), "Trying to use forward AD with ${msg} that does not support it."); + TORCH_CHECK_NOT_IMPLEMENTED(!(${cond}), "Trying to use forward AD with ${name} that does not support it ${msg}"); } -""") +""" +) + def gen_variable_type( out: str, native_yaml_path: str, + tags_yaml_path: str, fns_with_diff_infos: List[NativeFunctionWithDifferentiabilityInfo], template_path: str, ) -> None: @@ -382,47 +693,54 @@ def gen_variable_type( compute the output. The grad_fn is attached to differentiable functions. """ fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False) - fm.write('VariableType.h', lambda: { - 'generated_comment': "@" f'generated from {template_path}/VariableType.h' - }) + fm.write( + "VariableType.h", + lambda: { + "generated_comment": "@" f"generated from {template_path}/VariableType.h" + }, + ) # NOTE: see Note [Sharded File] at the top of the VariableType.cpp # template regarding sharding of the generated files. 
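# Illustrative sketch (not torchgen's exact hashing; not part of this diff):
# how sharded writing splits the generated definitions across a fixed number of
# output files, keyed by a stable per-operator string so each op always lands
# in the same shard.
from collections import defaultdict
import zlib

def shard_by_key_sketch(items, key_fn, num_shards=5):
    shards = defaultdict(list)
    for item in items:
        shard_id = zlib.crc32(key_fn(item).encode("utf-8")) % num_shards
        shards[shard_id].append(item)
    return shards                     # e.g. shard 3 -> VariableType_3.cpp

ops = ["add", "mul", "conv2d", "linalg_svd", "index_select"]
for shard_id, names in sorted(shard_by_key_sketch(ops, key_fn=str).items()):
    print(f"VariableType_{shard_id}.cpp:", names)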
fm.write_sharded( - 'VariableType.cpp', + "VariableType.cpp", [fn for fn in fns_with_diff_infos if use_derived(fn)], key_fn=lambda fn: cpp.name(fn.func.func), base_env={ - 'generated_comment': - "@" f'generated from {template_path}/VariableType.cpp', + "generated_comment": "@" f"generated from {template_path}/VariableType.cpp", }, env_callable=gen_variable_type_func, num_shards=5, - sharded_keys={'type_derived_method_definitions', 'wrapper_registrations'} + sharded_keys={"type_derived_method_definitions", "wrapper_registrations"}, ) + @with_native_function def gen_wrapper_registration(f: NativeFunction) -> str: return WRAPPER_REGISTRATION.substitute( unqual_operator_name_with_overload=f.func.name, type_wrapper_name=type_wrapper_name(f), - class_type='VariableType', + class_type="VariableType", ) + def gen_variable_type_func( - fn: NativeFunctionWithDifferentiabilityInfo + fn: NativeFunctionWithDifferentiabilityInfo, ) -> Dict[str, List[str]]: f = fn.func with native_function_manager(f): name = cpp.name(f.func) formals = gen_formals(f) - if fn.info is None and not get_base_name(f) in RESET_GRAD_ACCUMULATOR \ - and not get_base_name(f) in DONT_REQUIRE_DERIVATIVE \ - and len(gen_differentiable_outputs(fn)) > 0 \ - and not cpp.name(f.func) in DONT_ENFORCE_SAME_TENSOR_IMPL_OR_STORAGE \ - and not type_wrapper_name(f) in DONT_ENFORCE_STORAGE_IMPL_USE_COUNT \ - and not type_wrapper_name(f) in DONT_ENFORCE_TENSOR_IMPL_USE_COUNT: + if ( + fn.info is None + and not get_base_name(f) in RESET_GRAD_ACCUMULATOR + and not get_base_name(f) in DONT_REQUIRE_DERIVATIVE + and len(gen_differentiable_outputs(fn)) > 0 + and not cpp.name(f.func) in DONT_ENFORCE_SAME_TENSOR_IMPL_OR_STORAGE + and not type_wrapper_name(f) in DONT_ENFORCE_STORAGE_IMPL_USE_COUNT + and not type_wrapper_name(f) in DONT_ENFORCE_TENSOR_IMPL_USE_COUNT + ): # NOTE: [ Registering AutogradNotImplemented boxed kernel ] # # When there is no derivatives.yaml entry, we register a generic boxed @@ -441,7 +759,8 @@ def gen_variable_type_func( # to (1). type_definition = "" wrapper_registration = AUTOGRAD_NOT_IMPLEMENTED_REGISTRATION.substitute( - unqual_operator_name_with_overload=f.func.name) + unqual_operator_name_with_overload=f.func.name + ) else: type_definition = METHOD_DEFINITION.substitute( return_type=cpp.returns_type(f.func.returns).cpp_type(), @@ -456,21 +775,24 @@ def gen_variable_type_func( # If you want to register a kernel to Autograd, you must make the op abstract. # In other words, this op must have dispatch section in native_functions.yaml. if name in MANUAL_AUTOGRAD_AND_TRACER or (fn.info and fn.info.has_derivatives): - msg = (f'There\'s a formula for {name}(or its functional variant) in derivatives.yaml. ' - f'It\'s required to add a dispatch section for it with explicit supported backends e.g CPU/CUDA ' - f'or CompositeExplicitAutograd in native_functions.yaml. Please see ' - f'https://github.com/pytorch/pytorch/tree/master/aten/src/ATen/native#choosing-the-right-dispatch-keyword ' - f'for instructions to choose the right dispatch keyword.') + msg = ( + f"There's a formula for {name}(or its functional variant) in derivatives.yaml. " + f"It's required to add a dispatch section for it with explicit supported backends e.g CPU/CUDA " + f"or CompositeExplicitAutograd in native_functions.yaml. Please see " + f"https://github.com/pytorch/pytorch/tree/master/aten/src/ATen/native#choosing-the-right-dispatch-keyword " + f"for instructions to choose the right dispatch keyword." 
+ ) assert f.is_abstract, msg return { - 'type_derived_method_definitions': [type_definition], - 'wrapper_registrations': [wrapper_registration], + "type_derived_method_definitions": [type_definition], + "wrapper_registrations": [wrapper_registration], } + @with_native_function_with_differentiability_info def emit_body(fn: NativeFunctionWithDifferentiabilityInfo) -> List[str]: - assert dispatch_strategy(fn) == 'use_derived' + assert dispatch_strategy(fn) == "use_derived" f = fn.func info = fn.info fw_derivatives = fn.fw_derivatives @@ -506,7 +828,9 @@ def gen_differentiable_input( def gen_differentiable_inputs(f: NativeFunction) -> List[DifferentiableInput]: return list(mapMaybe(gen_differentiable_input, f.func.arguments.non_out)) - def find_args_with_derivatives(differentiable_inputs: List[DifferentiableInput]) -> List[DifferentiableInput]: + def find_args_with_derivatives( + differentiable_inputs: List[DifferentiableInput], + ) -> List[DifferentiableInput]: """Find arguments that have derivative definitions""" if info is None or not info.has_derivatives: return differentiable_inputs @@ -514,26 +838,38 @@ def find_args_with_derivatives(differentiable_inputs: List[DifferentiableInput]) differentiable = [arg for arg in differentiable_inputs if arg.name in names] if len(differentiable) != len(names): missing = names - set(arg.name for arg in differentiable) - raise RuntimeError(f'Missing arguments for derivatives: {missing} in {info.name}') + raise RuntimeError( + f"Missing arguments for derivatives: {missing} in {info.name}" + ) return differentiable differentiable_inputs = gen_differentiable_inputs(f) args_with_derivatives = find_args_with_derivatives(differentiable_inputs) differentiable_outputs = gen_differentiable_outputs(fn) - undifferentiable = (base_name in DONT_REQUIRE_DERIVATIVE) or (name in DONT_REQUIRE_DERIVATIVE) + undifferentiable = (base_name in DONT_REQUIRE_DERIVATIVE) or ( + name in DONT_REQUIRE_DERIVATIVE + ) - requires_derivative = (not undifferentiable) and (len(differentiable_inputs) > 0) and (len(differentiable_outputs) > 0) + requires_derivative = ( + (not undifferentiable) + and (len(differentiable_inputs) > 0) + and (len(differentiable_outputs) > 0) + ) if info is not None and info.has_derivatives and not requires_derivative: - raise RuntimeError(f'ERROR: derivative ignored for {name} -- specified an autograd function without derivative') + raise RuntimeError( + f"ERROR: derivative ignored for {name} -- specified an autograd function without derivative" + ) def emit_save_inputs() -> List[str]: setup: List[str] = [] if info is None or not info.has_derivatives: return setup - has_tensorlist_arg = any(is_tensor_list_type(arg.type) for arg in args_with_derivatives) + has_tensorlist_arg = any( + is_tensor_list_type(arg.type) for arg in args_with_derivatives + ) # We don't want to save tensors if we know that they will never be used # when computing the derivative, so we add guards to those statements @@ -550,7 +886,7 @@ def guard_for(arg: SavedAttribute) -> Optional[str]: # require_grad if the backward function even gets executed. I don't # have any good ideas for detecting those cases, so I simply disabled the # checks. 
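# Illustrative sketch (simplified; not part of this diff): the guarding idea
# behind emit_save_inputs()/guard_for() -- an input is only saved into the
# autograd Node when the single derivative that uses it may actually run.
def emit_guarded_save_sketch(saved_name, used_by_output_idx=None):
    save_stmt = f"grad_fn->{saved_name}_ = SavedVariable({saved_name}, false);"
    if used_by_output_idx is None:
        return save_stmt              # used by several derivatives: always save
    guard = f"grad_fn->should_compute_output({used_by_output_idx})"
    return f"if ({guard}) {{\n  {save_stmt}\n}}"

print(emit_guarded_save_sketch("other", used_by_output_idx=1))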
- if 'backward' in info.name: + if "backward" in info.name: return None # If there's a single derivative we could compute, we already have @@ -580,12 +916,12 @@ def guard_for(arg: SavedAttribute) -> Optional[str]: else: raise AssertionError() - return f'grad_fn->should_compute_output({edge_off})' + return f"grad_fn->should_compute_output({edge_off})" setup.extend(save_variables(info.all_saved_inputs, False, guard_for)) for arg in args_with_derivatives: if is_tensor_list_type(arg.type): - setup.append(f'grad_fn->{arg.name}_size_ = {arg.name}.size();') + setup.append(f"grad_fn->{arg.name}_size_ = {arg.name}.size();") return setup @@ -593,25 +929,37 @@ def setup_derivative(differentiable_inputs: List[DifferentiableInput]) -> List[s body: List[str] = [] if is_out_fn: # For out functions, ensure that no input or output requires grad - body.append(DECLARE_GRAD_FN.substitute(op='Node')) - body.append(SETUP_NONE_REQUIRES_GRAD.substitute( - base_name=base_name, - args_to_check=[arg.name for arg in differentiable_inputs])) - body.append(SETUP_NONE_REQUIRES_GRAD.substitute( - base_name=base_name, - args_to_check=[arg.name for arg in differentiable_outputs])) + body.append(DECLARE_GRAD_FN.substitute(op="Node")) + body.append( + SETUP_NONE_REQUIRES_GRAD.substitute( + base_name=base_name, + args_to_check=[arg.name for arg in differentiable_inputs], + ) + ) + body.append( + SETUP_NONE_REQUIRES_GRAD.substitute( + base_name=base_name, + args_to_check=[arg.name for arg in differentiable_outputs], + ) + ) return body - op = info.op if info is not None and info.has_derivatives else 'NotImplemented' + op = info.op if info is not None and info.has_derivatives else "NotImplemented" setup = [] - setup.extend(ASSIGN_GRAD_FN.substitute( - op=op, - op_ctor='' if info is not None and info.has_derivatives else f'"{cpp.name(f.func)}"', - args_with_derivatives=[arg.name for arg in args_with_derivatives], - ).split('\n')) + setup.extend( + ASSIGN_GRAD_FN.substitute( + op=op, + op_ctor="" + if info is not None and info.has_derivatives + else f'"{cpp.name(f.func)}"', + args_with_derivatives=[arg.name for arg in args_with_derivatives], + ).split("\n") + ) setup.extend(emit_save_inputs()) - body.extend(emit_check_no_requires_grad(differentiable_inputs, args_with_derivatives)) + body.extend( + emit_check_no_requires_grad(differentiable_inputs, args_with_derivatives) + ) body.append(DECLARE_GRAD_FN.substitute(op=op)) body.append(SETUP_DERIVATIVE.substitute(setup=setup)) return body @@ -623,7 +971,11 @@ def emit_check_if_in_complex_autograd_allowlist() -> List[str]: for arg in differentiable_outputs: name = arg.name # TODO: should be `arg.type.is_tensor_like()`? - if arg.cpp_type in ['at::Tensor', 'at::TensorList', 'const c10::List> &']: + if arg.cpp_type in [ + "at::Tensor", + "at::TensorList", + "const c10::List> &", + ]: body.append(f'throw_error_for_complex_autograd({name}, "{base_name}");') return body @@ -639,7 +991,7 @@ def emit_check_no_requires_grad( arg_name = arg.name if info and arg_name in info.non_differentiable_arg_names: continue - if arg_name == 'output': + if arg_name == "output": # Double-backwards definitions sometimes take in 'input' and # 'output', but only define the derivative for input. 
continue @@ -649,17 +1001,19 @@ def emit_check_no_requires_grad( def emit_original_self_definition() -> List[str]: body: List[str] = [] if inplace: - body.append('c10::optional original_self;') + body.append("c10::optional original_self;") all_forward_grad_cond = [] for derivative in fw_derivatives: if derivative.required_original_self_value: - all_forward_grad_cond.append(get_any_has_forward_grad_name(derivative.var_name)) + all_forward_grad_cond.append( + get_any_has_forward_grad_name(derivative.var_names) + ) if all_forward_grad_cond: body.append(f'if ({" || ".join(all_forward_grad_cond)}) {{') - body.append(' original_self = self.clone();') - body.append('}') + body.append(" original_self = self.clone();") + body.append("}") return body @@ -671,80 +1025,100 @@ def save_variables( # assign the saved variables to the generated grad_fn stmts: List[str] = [] for arg in saved_variables: - name = arg.nctype.name.name if isinstance(arg.nctype.name, SpecialArgName) else arg.nctype.name + name = ( + arg.nctype.name.name + if isinstance(arg.nctype.name, SpecialArgName) + else arg.nctype.name + ) type = arg.nctype.type expr = arg.expr stmts_prepend = None - if type == BaseCType(tensorT) or type == OptionalCType(BaseCType(tensorT)) or \ - type == MutRefCType(OptionalCType(BaseCType(tensorT))) or (is_output and type == BaseCType(scalarT)): + if ( + type == BaseCType(tensorT) + or type == OptionalCType(BaseCType(tensorT)) + or type == MutRefCType(OptionalCType(BaseCType(tensorT))) + or (is_output and type == BaseCType(scalarT)) + ): var = name - name += '_' - if var == 'self' and inplace: - stmts_prepend = 'if (!original_self.has_value()) original_self = self.clone()' - var = 'original_self.value()' + name += "_" + if var == "self" and inplace: + stmts_prepend = ( + "if (!original_self.has_value()) original_self = self.clone()" + ) + var = "original_self.value()" assert not is_output if inplace and is_output: - var = 'self' - is_inplace_view = f'{var}.is_view()' - expr = f'SavedVariable({var}, {str(is_output).lower()}, {is_inplace_view})' + var = "self" + is_inplace_view = f"{var}.is_view()" + expr = f"SavedVariable({var}, {str(is_output).lower()}, {is_inplace_view})" else: - expr = f'SavedVariable({var}, {str(is_output).lower()})' - elif type == BaseCType(tensorListT) or type == ListCType(OptionalCType(BaseCType(tensorT))): - expr = f'make_saved_variable_list({name})' - name += '_' + expr = f"SavedVariable({var}, {str(is_output).lower()})" + elif type == BaseCType(tensorListT) or type == ListCType( + OptionalCType(BaseCType(tensorT)) + ): + expr = f"make_saved_variable_list({name})" + name += "_" elif type == BaseCType(intArrayRefT): expr = expr + ".vec()" elif type == BaseCType(stringT): - expr = f'std::string({expr})' + expr = f"std::string({expr})" elif type == OptionalCType(BaseCType(stringT)): - expr = f'{expr}.has_value() ? c10::optional(std::string({expr}.value())) : c10::nullopt' + expr = f"{expr}.has_value() ? 
c10::optional(std::string({expr}.value())) : c10::nullopt" guard = guard_for(arg) if guard is None: if stmts_prepend: - stmts.append(f'{stmts_prepend};') - stmts.append(f'grad_fn->{name} = {expr};') + stmts.append(f"{stmts_prepend};") + stmts.append(f"grad_fn->{name} = {expr};") else: - stmts.append(f'if ({guard}) {{') + stmts.append(f"if ({guard}) {{") if stmts_prepend: - stmts.append(f' {stmts_prepend};') - stmts.append(f' grad_fn->{name} = {expr};') - stmts.append('}') + stmts.append(f" {stmts_prepend};") + stmts.append(f" grad_fn->{name} = {expr};") + stmts.append("}") return stmts # Generates a Dispatcher::redispatch() call into the dispatcher. We do this mainly for performance reasons: # - Pre-compute the full DispatchKeySet. This saves the dispatcher from having to read from TLS. # - redispatch() avoids a redundant call to RecordFunction, which was already called right before # we entered this autograd kernel. - def emit_dispatch_call(f: NativeFunction, input_base: str, unpacked_args: Sequence[str]) -> str: - """ Dispatch call via function in a namespace or method on Tensor.""" + def emit_dispatch_call( + f: NativeFunction, input_base: str, unpacked_args: Sequence[str] + ) -> str: + """Dispatch call via function in a namespace or method on Tensor.""" dispatcher_sig = DispatcherSignature.from_schema(f.func) dispatcher_exprs = dispatcher_sig.exprs() # code-generated autograd kernels plumb and recompute dispatch keys directly through the kernel for performance. # Ops also always have a function variant of the redispatch API. # See Note [Plumbing Keys Through The Dispatcher] for details. - dispatch_key_set = 'ks & c10::after_autograd_keyset' + dispatch_key_set = "ks & c10::after_autograd_keyset" call = CALL_REDISPATCH.substitute( api_name=cpp.name( f.func, faithful_name_for_out_overloads=True, ), - unpacked_args=[dispatch_key_set] + list(unpacked_args)) + unpacked_args=[dispatch_key_set] + list(unpacked_args), + ) return call - def wrap_output(f: NativeFunction, unpacked_bindings: List[Binding], var: str) -> str: - call = '' + def wrap_output( + f: NativeFunction, unpacked_bindings: List[Binding], var: str + ) -> str: + call = "" rhs_value: Optional[str] = None if not any(r.type.is_tensor_like() for r in f.func.returns): rhs_value = var else: - rhs_value = f'std::move({var})' + rhs_value = f"std::move({var})" assert rhs_value is not None - call += ASSIGN_RETURN_VALUE.substitute(return_values=tie_return_values(f), - rhs_value=rhs_value) + call += ASSIGN_RETURN_VALUE.substitute( + return_values=tie_return_values(f), rhs_value=rhs_value + ) return call - def check_tensorimpl_and_storage(call: str, unpacked_bindings: List[Binding]) -> str: + def check_tensorimpl_and_storage( + call: str, unpacked_bindings: List[Binding] + ) -> str: # See NOTE [ TensorImpl and Storage Pointer Sanity Checks ] stmts_before_call: List[str] = [] stmts_after_call: List[str] = [] @@ -757,22 +1131,42 @@ def check_tensorimpl_and_storage(call: str, unpacked_bindings: List[Binding]) -> arg = unpacked_binding.name noref_cpp_type = unpacked_binding.nctype.type.remove_const_ref() if noref_cpp_type == BaseCType(tensorListT): - stmts_before_call += [SAVE_TENSORLIST_STORAGE.substitute(tensorlist_name=arg), - SAVE_TENSORLIST_IMPL.substitute(tensorlist_name=arg)] - stmts_after_call += [ENFORCE_SAME_TENSORLIST_STORAGE.substitute(tensorlist_name=arg), - ENFORCE_SAME_TENSORLIST_IMPL.substitute(tensorlist_name=arg)] + stmts_before_call += [ + SAVE_TENSORLIST_STORAGE.substitute(tensorlist_name=arg), + 
SAVE_TENSORLIST_IMPL.substitute(tensorlist_name=arg), + ] + stmts_after_call += [ + ENFORCE_SAME_TENSORLIST_STORAGE.substitute(tensorlist_name=arg), + ENFORCE_SAME_TENSORLIST_IMPL.substitute(tensorlist_name=arg), + ] elif noref_cpp_type == ListCType(OptionalCType(BaseCType(tensorT))): - stmts_before_call += [SAVE_OPTIONALTENSORLIST_STORAGE.substitute(tensorlist_name=arg), - SAVE_OPTIONALTENSORLIST_IMPL.substitute(tensorlist_name=arg)] - stmts_after_call += [ENFORCE_SAME_OPTIONALTENSORLIST_STORAGE.substitute(tensorlist_name=arg), - ENFORCE_SAME_OPTIONALTENSORLIST_IMPL.substitute(tensorlist_name=arg)] + stmts_before_call += [ + SAVE_OPTIONALTENSORLIST_STORAGE.substitute(tensorlist_name=arg), + SAVE_OPTIONALTENSORLIST_IMPL.substitute(tensorlist_name=arg), + ] + stmts_after_call += [ + ENFORCE_SAME_OPTIONALTENSORLIST_STORAGE.substitute( + tensorlist_name=arg + ), + ENFORCE_SAME_OPTIONALTENSORLIST_IMPL.substitute( + tensorlist_name=arg + ), + ] elif noref_cpp_type == BaseCType(tensorT): - stmts_before_call += [SAVE_TENSOR_STORAGE.substitute(tensor_name=arg), - SAVE_TENSOR_IMPL.substitute(tensor_name=arg)] - stmts_after_call += [ENFORCE_SAME_TENSOR_STORAGE.substitute(tensor_name=arg, out_tensor_name=arg), - ENFORCE_SAME_TENSOR_IMPL.substitute(tensor_name=arg)] - - assert (stmts_before_call and stmts_after_call) or (not stmts_before_call and not stmts_after_call) + stmts_before_call += [ + SAVE_TENSOR_STORAGE.substitute(tensor_name=arg), + SAVE_TENSOR_IMPL.substitute(tensor_name=arg), + ] + stmts_after_call += [ + ENFORCE_SAME_TENSOR_STORAGE.substitute( + tensor_name=arg, out_tensor_name=arg + ), + ENFORCE_SAME_TENSOR_IMPL.substitute(tensor_name=arg), + ] + + assert (stmts_before_call and stmts_after_call) or ( + not stmts_before_call and not stmts_after_call + ) # Check properties of outputs (enforce (2), (3)) if not f.func.kind() in (SchemaKind.inplace, SchemaKind.out): @@ -780,33 +1174,55 @@ def check_tensorimpl_and_storage(call: str, unpacked_bindings: List[Binding]) -> aliased_arg_name = ALL_VIEW_FUNCTIONS.get(base_name, None) if aliased_arg_name is not None: aliased_arg_name = unpacked_name(aliased_arg_name) - for i, (ret, ret_name) in enumerate(zip(f.func.returns, cpp.return_names(f))): + for i, (ret, ret_name) in enumerate( + zip(f.func.returns, cpp.return_names(f)) + ): noref_cpp_type = cpp.return_type(ret).remove_const_ref() if noref_cpp_type == BaseCType(tensorT): if aliased_arg_name is not None: - assert i == 0, "Expect non-CompositeImplicitAutograd view function {base} to return single output" - stmts_after_call += [ENFORCE_SAME_TENSOR_STORAGE.substitute(tensor_name=aliased_arg_name, - out_tensor_name=ret_name)] + assert ( + i == 0 + ), "Expect non-CompositeImplicitAutograd view function {base} to return single output" + stmts_after_call += [ + ENFORCE_SAME_TENSOR_STORAGE.substitute( + tensor_name=aliased_arg_name, out_tensor_name=ret_name + ) + ] else: - if type_wrapper_name(f) not in DONT_ENFORCE_STORAGE_IMPL_USE_COUNT: - stmts_after_call += [ENFORCE_TENSOR_STORAGE_USE_COUNT_EQUALS_ONE.substitute( - tensor_name=ret_name, fn_name=type_wrapper_name(f))] + if ( + type_wrapper_name(f) + not in DONT_ENFORCE_STORAGE_IMPL_USE_COUNT + ): + stmts_after_call += [ + ENFORCE_TENSOR_STORAGE_USE_COUNT_EQUALS_ONE.substitute( + tensor_name=ret_name, fn_name=type_wrapper_name(f) + ) + ] if type_wrapper_name(f) not in DONT_ENFORCE_TENSOR_IMPL_USE_COUNT: - stmts_after_call += [ENFORCE_TENSOR_IMPL_USE_COUNT_LT_OR_EQ_ONE.substitute( - tensor_name=ret_name, fn_name=type_wrapper_name(f))] + 
stmts_after_call += [ + ENFORCE_TENSOR_IMPL_USE_COUNT_LT_OR_EQ_ONE.substitute( + tensor_name=ret_name, fn_name=type_wrapper_name(f) + ) + ] # Currently we don't have any functions that return the following types, but # we should update the checks once we do elif noref_cpp_type == ListCType(OptionalCType(BaseCType(tensorT))): - raise AssertionError(f"Please add use_count checks for {noref_cpp_type}") + raise AssertionError( + f"Please add use_count checks for {noref_cpp_type}" + ) elif noref_cpp_type == BaseCType(tensorListT): - raise AssertionError(f"Please add use_count checks for {noref_cpp_type}") + raise AssertionError( + f"Please add use_count checks for {noref_cpp_type}" + ) if stmts_before_call and stmts_after_call: - call = RUN_ONLY_IN_DEBUG_MODE.substitute(statements=stmts_before_call) + \ - call + \ - RUN_ONLY_IN_DEBUG_MODE.substitute(statements=stmts_after_call) + call = ( + RUN_ONLY_IN_DEBUG_MODE.substitute(statements=stmts_before_call) + + call + + RUN_ONLY_IN_DEBUG_MODE.substitute(statements=stmts_after_call) + ) return call def emit_call(f: NativeFunction, unpacked_bindings: List[Binding]) -> str: @@ -816,161 +1232,259 @@ def emit_call(f: NativeFunction, unpacked_bindings: List[Binding]) -> str: # in are now Variables. # See NOTE [ Treating Variables as non-Variables in type dispatch ] for details. unpacked_args = [b.name for b in unpacked_bindings] - base_type_call = emit_dispatch_call(f, 'self_', unpacked_args) + base_type_call = emit_dispatch_call(f, "self_", unpacked_args) if get_view_info(f) is not None or modifies_arguments(f): - guard = 'at::AutoDispatchBelowAutograd guard;' + guard = "at::AutoDispatchBelowAutograd guard;" else: - guard = 'at::AutoDispatchBelowADInplaceOrView guard;' + guard = "at::AutoDispatchBelowADInplaceOrView guard;" if not modifies_arguments(f) and not returns_void: call = DISPATCH_TO_NON_VAR_TYPE_WITH_TMP_RETURN_VALUES.substitute( - base_type_call=base_type_call, tmp_var=TMP_VAR, guard=guard) + base_type_call=base_type_call, tmp_var=TMP_VAR, guard=guard + ) call += wrap_output(f, unpacked_bindings, TMP_VAR) else: call = DISPATCH_TO_NON_VAR_TYPE_WITHOUT_RETURN_VALUES.substitute( - base_type_call=base_type_call, guard=guard) + base_type_call=base_type_call, guard=guard + ) call = check_tensorimpl_and_storage(call, unpacked_bindings) return call def emit_history() -> str: - fn = 'rebase' if modifies_arguments(f) and view_info is None else 'set' + fn = "rebase" if modifies_arguments(f) and view_info is None else "set" output_names = [r.name for r in differentiable_outputs] # TODO: flatten allocates a std::vector, which could be expensive - outs = CodeTemplate("flatten_tensor_args( ${outs} )").substitute(outs=output_names) + outs = CodeTemplate("flatten_tensor_args( ${outs} )").substitute( + outs=output_names + ) return SET_HISTORY.substitute(fn=fn, differentiable_outputs=outs) def emit_save_outputs() -> str: if is_out_fn: # out functions don't currently support differentiation - return '' + return "" if info is not None and info.has_derivatives: stmts = save_variables(info.all_saved_outputs, True) if len(stmts) == 0: - return '' - return CONDITIONAL.substitute(cond='grad_fn', statements=stmts) - return '' + return "" + return CONDITIONAL.substitute(cond="grad_fn", statements=stmts) + return "" def emit_any_requires_grad() -> List[str]: - extra_condition = '' + extra_condition = "" if fn.info and fn.info.output_differentiability_conditions: assert len(fn.info.output_differentiability_conditions) == 1 - extra_condition = \ - f'_any_requires_grad &= 
({fn.info.output_differentiability_conditions[0]});' - return [SETUP_ANY_REQUIRES_GRAD.substitute( - args_with_derivatives=[arg.name for arg in args_with_derivatives], - extra_differentiability_conditions=extra_condition)] + extra_condition = f"_any_requires_grad &= ({fn.info.output_differentiability_conditions[0]});" + return [ + SETUP_ANY_REQUIRES_GRAD.substitute( + args_with_derivatives=[arg.name for arg in args_with_derivatives], + extra_differentiability_conditions=extra_condition, + ) + ] - def get_any_has_forward_grad_name(var_name: str) -> str: - return f'_any_has_forward_grad_{var_name}' + def get_any_has_forward_grad_name(var_names: Tuple[str, ...]) -> str: + if len(var_names) == 1: + return f"_any_has_forward_grad_{var_names[0]}" + else: + return f'_any_has_forward_grad_{"_".join(var_names)}' def emit_any_has_forward_grad() -> List[str]: content: List[str] = [] for derivative in fw_derivatives: assert derivative.required_inputs_fw_grad is not None - requires_fw_grad = " || ".join([FW_DERIVATIVE_CHECK_TEMPLATE.substitute(req_inp=inp.name) - for inp in differentiable_inputs if inp.name in derivative.required_inputs_fw_grad]) + requires_fw_grad = " || ".join( + [ + FW_DERIVATIVE_CHECK_TEMPLATE.substitute(req_inp=inp.name) + for inp in differentiable_inputs + if inp.name in derivative.required_inputs_fw_grad + ] + ) if not requires_fw_grad: # Handle functions like stack # For these, we don't unpack anything and always call the user function - if not (len(differentiable_inputs) == 1 and is_tensor_list_type(differentiable_inputs[0].type)): - raise RuntimeError(f'No differentiable input to "{name}" is a differentiable Tensor (as the provided' - 'forward AD formula does not use any input tangent) even though a forward gradient ' - 'formula has been defined for it. This case should only happen for function that ' - 'take a single TensorList as input. All other cases are not supported right now.') + if not ( + len(differentiable_inputs) == 1 + and is_tensor_list_type(differentiable_inputs[0].type) + ): + raise RuntimeError( + f'No differentiable input to "{name}" is a differentiable Tensor (as the provided ' + "forward AD formula does not use any input tangent) even though a forward gradient " + "formula has been defined for it. This case should only happen for function that " + "take a single TensorList as input. All other cases are not supported right now." 
+ ) requires_fw_grad = "true" if fn.info and fn.info.output_differentiability_conditions: assert len(fn.info.output_differentiability_conditions) == 1 - requires_fw_grad = \ - f'({fn.info.output_differentiability_conditions[0]}) && ({requires_fw_grad})' + requires_fw_grad = f"({fn.info.output_differentiability_conditions[0]}) && ({requires_fw_grad})" - content.append(f"auto {get_any_has_forward_grad_name(derivative.var_name)} = {requires_fw_grad};\n" - f"(void){get_any_has_forward_grad_name(derivative.var_name)};") + content.append( + f"auto {get_any_has_forward_grad_name(derivative.var_names)} = {requires_fw_grad};\n" + f"(void){get_any_has_forward_grad_name(derivative.var_names)};" + ) return content def emit_check_inplace() -> List[str]: if not inplace: return [] - return [f'check_inplace({arg.name}, _any_requires_grad);' for arg in differentiable_outputs] + return [ + f"check_inplace({arg.name}, _any_requires_grad);" + for arg in differentiable_outputs + ] def emit_fw_derivatives() -> List[str]: content: List[str] = [] fw_grad_setters: List[str] = [] for derivative in fw_derivatives: - res = derivative.var_name + res = derivative.var_names if f.func.name.name.inplace: + assert ( + len(res) == 1 + ), "Expected number of outputs to be 1 if function is inplace" # TODO update this when inplace namings are unified - res = "self" + res = ("self",) assert derivative.required_inputs_fw_grad is not None unpacked_arguments = "" for inp in differentiable_inputs: - zeros_fn = "zeros" if inplace and inp.name == "self" else "_efficientzerotensor" + zeros_fn = ( + "zeros" + if inplace and inp.name == "self" + else "_efficientzerotensor" + ) if inp.name in derivative.required_inputs_fw_grad: - unpacked_arguments += FW_DERIVATIVE_DEFINED_GRAD_TEMPLATE.substitute(inp=inp.name, zeros_fn=zeros_fn) + unpacked_arguments += ( + FW_DERIVATIVE_DEFINED_GRAD_TEMPLATE.substitute( + inp=inp.name, zeros_fn=zeros_fn + ) + ) if inp.name in (derivative.required_inputs_primal or []): - unpacked_arguments += FW_DERIVATIVE_DEFINED_PRIMAL_TEMPLATE.substitute(inp=inp.name) + unpacked_arguments += ( + FW_DERIVATIVE_DEFINED_PRIMAL_TEMPLATE.substitute(inp=inp.name) + ) if derivative.required_original_self_value: - unpacked_arguments += FW_DERIVATIVE_DEFINED_GRAD_TEMPLATE.substitute(inp="original_self", zeros_fn=zeros_fn) - unpacked_arguments += FW_DERIVATIVE_DEFINED_PRIMAL_TEMPLATE.substitute(inp="original_self") + unpacked_arguments += FW_DERIVATIVE_DEFINED_GRAD_TEMPLATE.substitute( + inp="original_self", zeros_fn=zeros_fn + ) + unpacked_arguments += FW_DERIVATIVE_DEFINED_PRIMAL_TEMPLATE.substitute( + inp="original_self" + ) elif inplace and derivative.is_reusing_outplace_formula: # The gradient wasn't already cloned, do it if grad mode is enabled - unpacked_arguments += "self_t = GradMode::is_enabled() ? self_t.clone() : self_t;" + unpacked_arguments += ( + "self_t = GradMode::is_enabled() ? 
self_t.clone() : self_t;" + ) if inplace: is_inplace_str = "true" else: is_inplace_str = "false" - if isinstance(derivative.var_type, BaseType) and derivative.var_type.is_tensor_like(): + if all( + (isinstance(var_type, BaseType) and var_type.is_tensor_like()) + for var_type in derivative.var_types + ): # Is there a way to get from BaseType to BaseCType - opt_res_grad_type = OptionalCType(BaseCType(tensorT)).cpp_type() - fw_grad_setter = FW_DERIVATIVE_SETTER_TENSOR.substitute(out_arg=res, is_inplace=is_inplace_str) - elif isinstance(derivative.var_type, ListType) and derivative.var_type.is_tensor_like(): - opt_res_grad_type = OptionalCType(VectorCType(BaseCType(tensorT))).cpp_type() - fw_grad_setter = FW_DERIVATIVE_SETTER_TENSOR_LIST.substitute(out_arg=res, is_inplace=is_inplace_str) + if len(derivative.var_types) == 1: + opt_res_grad_type = OptionalCType(BaseCType(tensorT)).cpp_type() + fw_grad_setters.append( + FW_DERIVATIVE_SETTER_TENSOR.substitute( + out_arg=res[0], is_inplace=is_inplace_str + ) + ) + else: + tuple_type = TupleCType( + [BaseCType(tensorT)] * len(derivative.var_types) + ) + opt_res_grad_type = OptionalCType(tuple_type).cpp_type() + for idx, single_res in enumerate(res): + fw_grad_setters.append( + FW_DERIVATIVE_SETTER_MULTI_OUTPUT.substitute( + idx=idx, all_res="_".join(res), out_arg=single_res + ) + ) + elif ( + isinstance(derivative.var_types[0], ListType) + and derivative.var_types[0].is_tensor_like() + ): + assert ( + len(derivative.var_types) == 1 + ), "Expected number of outputs to be 1 if function returns ListType" + opt_res_grad_type = OptionalCType( + VectorCType(BaseCType(tensorT)) + ).cpp_type() + fw_grad_setters.append( + FW_DERIVATIVE_SETTER_TENSOR_LIST.substitute( + out_arg=res[0], is_inplace=is_inplace_str + ) + ) else: raise RuntimeError("Unsupported output type for forward derivative") - fw_grad_opt_definition = f"{opt_res_grad_type} {res}_new_fw_grad_opt = c10::nullopt;" + fw_grad_opt_definition = ( + f"{opt_res_grad_type} {'_'.join(res)}_new_fw_grad_opt = c10::nullopt;" + ) # View ops create fw_grad that already is a view of the base's fw_grad so just use that - content.append(FW_DERIVATIVE_TEMPLATE.substitute( - fw_grad_opt_definition=fw_grad_opt_definition, - requires_fw_grad=get_any_has_forward_grad_name(derivative.var_name), formula=derivative.formula, out_arg=res, - unpacked_arguments=unpacked_arguments)) - fw_grad_setters.append(fw_grad_setter) + content.append( + FW_DERIVATIVE_TEMPLATE.substitute( + fw_grad_opt_definition=fw_grad_opt_definition, + requires_fw_grad=get_any_has_forward_grad_name( + derivative.var_names + ), + formula=derivative.formula, + out_arg="_".join(res), + unpacked_arguments=unpacked_arguments, + ) + ) # Set all the grads at the end to avoid: https://github.com/pytorch/pytorch/issues/67367 - content.append('\n'.join(fw_grad_setters)) + content.append("\n".join(fw_grad_setters)) return content def emit_forbid_fw_derivatives(is_out_fn: bool = False) -> str: def get_msg() -> str: if is_out_fn: - msg = name + " (because it is an out= function)" + msg = "because it is an out= function" else: - msg = name + msg = ( + "because it has not been implemented yet.\\nPlease file an issue " + "to PyTorch at https://github.com/pytorch/pytorch/issues/new?template=feature-request.yml " + "so that we can prioritize its implementation." 
+ ) return msg + res = "" to_check: List[str] = [] - for inp in list(mapMaybe(gen_differentiable_input, - f.func.arguments.non_out + list(f.func.arguments.out))): # type: ignore[operator] + for inp in list( + mapMaybe( + gen_differentiable_input, + f.func.arguments.non_out + list(f.func.arguments.out), # type: ignore[operator] + ) + ): if is_tensor_type(inp.type): - to_check.append(FW_DERIVATIVE_CHECK_TEMPLATE.substitute(req_inp=inp.name)) + to_check.append( + FW_DERIVATIVE_CHECK_TEMPLATE.substitute(req_inp=inp.name) + ) elif is_tensor_list_type(inp.type): cond = FW_DERIVATIVE_CHECK_TEMPLATE.substitute(req_inp="_t") - res += FW_DERIVATIVE_FORBID_LIST_TEMPLATE.substitute(arg=inp.name, cond=cond, msg=get_msg()) + res += FW_DERIVATIVE_FORBID_LIST_TEMPLATE.substitute( + arg=inp.name, cond=cond, name=name, msg=get_msg() + ) else: - raise RuntimeError(f'Unsupported input type for "{name}" when forbidding forward AD usage.') + raise RuntimeError( + f'Unsupported input type for "{name}" when forbidding forward AD usage.' + ) if len(to_check) > 0: cond = " || ".join(to_check) - res += FW_DERIVATIVE_FORBID_TEMPLATE.substitute(cond=cond, msg=get_msg()) + res += FW_DERIVATIVE_FORBID_TEMPLATE.substitute( + cond=cond, name=name, msg=get_msg() + ) return res body: List[str] = [] @@ -1000,12 +1514,15 @@ def get_msg() -> str: if len(fw_derivatives) == 0: body.append(emit_forbid_fw_derivatives()) else: - assert len(fw_derivatives) == len(differentiable_outputs), ( + assert sum( + len(derivative.var_names) for derivative in fw_derivatives + ) == len(differentiable_outputs), ( "Expected the number of forward derivatives implemented to match the " "number of differentiable outputs. NB: This only applies when at least " "one forward derivative is implemented. Not implementing any forward " "derivatives is also okay, and we would require inputs to the op to " - "not have associated tangents in that case.") + "not have associated tangents in that case." + ) if requires_derivative: # Save only after the forward AD has been set up @@ -1017,7 +1534,7 @@ def get_msg() -> str: # `reset_grad_accumulator` in an operator that's not `inplace`, you can # remove this assert but the code generation will get more elaborate assert inplace - body.append('reset_grad_accumulator(self);') + body.append("reset_grad_accumulator(self);") if not returns_void: - body.append(f'return {get_return_value(f)};') + body.append(f"return {get_return_value(f)};") return body diff --git a/tools/autograd/load_derivatives.py b/tools/autograd/load_derivatives.py index e62ab95c66d0..185a4cdcef49 100644 --- a/tools/autograd/load_derivatives.py +++ b/tools/autograd/load_derivatives.py @@ -1,44 +1,113 @@ # Parses derivatives.yaml into autograd functions # # Each autograd function is represented by `DifferentiabilityInfo` containing -# a list of `Derivative`. See `tools.codegen.api.autograd` for the data models. +# a list of `Derivative`. See `torchgen.api.autograd` for the data models. 
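The header comment above describes the input this file consumes: derivatives.yaml maps an operator schema to per-argument backward formulas. Below is a minimal hand-written entry, loaded with yaml for illustration only (the real file is a YAML list, the formulas here are examples, and each entry flows through create_differentiability_info further down).

import yaml

entry = yaml.safe_load("""
name: mul.Tensor(Tensor self, Tensor other) -> Tensor
self: grad * other.conj()
other: grad * self.conj()
""")
specification = entry.pop("name")  # the full schema string, popped off first
formulas = entry                   # remaining keys map input names to backward formulas
print(specification)
print(formulas)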
from collections import defaultdict import re from typing import Counter, Sequence, Any, Tuple, List, Set, Dict, Match, Optional import yaml -from tools.codegen.api.autograd import (Derivative, DifferentiabilityInfo, - SavedAttribute, ForwardDerivative) -from tools.codegen.api.types import (Binding, CppSignatureGroup, NamedCType, BaseCType, VectorCType, - intArrayRefT, tensorOptionsT, typeAndSizeT, longT, boolT, - tensorGeometryT, scalarTypeT, SpecialArgName, - OptionalCType, stringT) -from tools.codegen.api import cpp -from tools.codegen.gen import parse_native_yaml -from tools.codegen.context import with_native_function -from tools.codegen.model import FunctionSchema, NativeFunction, Variant, Type -from tools.codegen.utils import IDENT_REGEX, split_name_params, YamlLoader +from torchgen.api.autograd import ( + Derivative, + DifferentiabilityInfo, + SavedAttribute, + ForwardDerivative, +) +from torchgen.api.types import ( + Binding, + CppSignatureGroup, + NamedCType, + BaseCType, + VectorCType, + intArrayRefT, + tensorOptionsT, + typeAndSizeT, + longT, + boolT, + layoutT, + tensorGeometryT, + scalarTypeT, + SpecialArgName, + OptionalCType, + stringT, +) +from torchgen.api import cpp +from torchgen.gen import parse_native_yaml, get_grouped_by_view_native_functions +from torchgen.context import with_native_function +from torchgen.model import ( + FunctionSchema, + NativeFunction, + Variant, + Type, + NativeFunctionsViewGroup, + OperatorName, +) +from torchgen.utils import IDENT_REGEX, split_name_params, YamlLoader, concatMap _GLOBAL_LOAD_DERIVATIVE_CACHE = {} -def load_derivatives(derivatives_yaml_path: str, native_yaml_path: str) -> Sequence[DifferentiabilityInfo]: +# This function directly adds derivative entries for {view}_copy variants of each view op. +# Since every {view} and {view}_copy op shares the same derivative formula, +# we generate them here instead of duplicating them in the yaml. +# See Note [Codegen'd {view}_copy Operators] +def add_view_copy_derivatives( + infos: List[DifferentiabilityInfo], view_groups: List[NativeFunctionsViewGroup] +) -> List[DifferentiabilityInfo]: + # Get the map from each view op's name to its corresponding view group + view_name_to_group: Dict[OperatorName, NativeFunctionsViewGroup] = { + g.view.func.name: g for g in view_groups + } + + view_copy_differentiability_infos = [] + for info in infos: + maybe_view_group = view_name_to_group.get(info.func.func.name, None) + if maybe_view_group is not None and maybe_view_group.view_copy is not None: + view_copy_info = info.create_view_copy_from_view_derivative( + maybe_view_group + ) + if view_copy_info is not None: + view_copy_differentiability_infos.append(view_copy_info) + + return view_copy_differentiability_infos + + +def load_derivatives( + derivatives_yaml_path: str, native_yaml_path: str, tags_yaml_path: str +) -> Sequence[DifferentiabilityInfo]: # Do some caching as this is a deterministic function global _GLOBAL_LOAD_DERIVATIVE_CACHE key = (derivatives_yaml_path, native_yaml_path) if key not in _GLOBAL_LOAD_DERIVATIVE_CACHE: - with open(derivatives_yaml_path, 'r') as f: + with open(derivatives_yaml_path, "r") as f: definitions = yaml.load(f, Loader=YamlLoader) - functions = parse_native_yaml(native_yaml_path).native_functions + funcs = parse_native_yaml(native_yaml_path, tags_yaml_path).native_functions + # From the parsed native functions, separate out the (generated) view_copy functions, + # so we can generate derivatives for them separately. 
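The mapping built above, from a view op's name to its view group, is what lets every generated {view}_copy operator reuse the view op's derivative formula. A rough sketch of that idea with plain dicts and strings standing in for the real torchgen types (NativeFunctionsViewGroup, DifferentiabilityInfo); the names and formula below are illustrative only.

def add_view_copy_infos(view_infos, view_ops_with_copy_variant):
    # view_infos: {view op name: its derivative info}
    # view_ops_with_copy_variant: view ops for which a {view}_copy op is generated
    out = {}
    for name, info in view_infos.items():
        if name in view_ops_with_copy_variant:
            out[name + "_copy"] = info  # same formula, registered under the _copy name
    return out

print(add_view_copy_infos(
    {"expand": "self: at::sum_to(grad, self.sizes())"},
    {"expand", "squeeze"},
))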
+ native_functions_with_view_groups = get_grouped_by_view_native_functions(funcs) + native_functions_without_view_copies = concatMap( + # We need to pull out the view_inplace ops too, since they might have their own derivative entries. + lambda g: [g] + if isinstance(g, NativeFunction) + else list(g.functions(include_copy=False)), + native_functions_with_view_groups, + ) + view_groups = [ + g + for g in native_functions_with_view_groups + if isinstance(g, NativeFunctionsViewGroup) + ] # What's the difference between function schema v.s. signature? # function schema is the complete declaration including mutability annotation / default value and etc. # signature is the canonical schema for a group of functions (in-place/out/functional variants) # that are semantically related. - functions_by_signature: Dict[FunctionSchema, List[NativeFunction]] = defaultdict(list) + functions_by_signature: Dict[ + FunctionSchema, List[NativeFunction] + ] = defaultdict(list) functions_by_schema: Dict[str, NativeFunction] = dict() - for function in functions: + for function in native_functions_without_view_copies: functions_by_signature[function.func.signature()].append(function) assert str(function.func) not in functions_by_schema functions_by_schema[str(function.func)] = function @@ -48,38 +117,56 @@ def load_derivatives(derivatives_yaml_path: str, native_yaml_path: str) -> Seque op_counter = Counter[str]() infos = [ - create_differentiability_info(defn, functions_by_signature, functions_by_schema, op_counter) - for defn in definitions] + create_differentiability_info( + defn, functions_by_signature, functions_by_schema, op_counter + ) + for defn in definitions + ] + infos += add_view_copy_derivatives(infos, view_groups) _GLOBAL_LOAD_DERIVATIVE_CACHE[key] = infos return _GLOBAL_LOAD_DERIVATIVE_CACHE[key] + @with_native_function def cpp_arguments(f: NativeFunction) -> Sequence[Binding]: return CppSignatureGroup.from_native_function(f, method=False).signature.arguments() -def create_derivative(f: NativeFunction, formula: str, var_names: Tuple[str, ...], - available_named_gradients: Sequence[str]) -> Derivative: + +def create_derivative( + f: NativeFunction, + formula: str, + var_names: Tuple[str, ...], + available_named_gradients: Sequence[str], +) -> Derivative: original_formula = formula - arguments: List[NamedCType] = [a.nctype.remove_const_ref() for a in cpp_arguments(f)] + arguments: List[NamedCType] = [ + a.nctype.remove_const_ref() for a in cpp_arguments(f) + ] - return_names = tuple(n if n != 'self' else 'result' for n in cpp.return_names(f)) + return_names = tuple(n if n != "self" else "result" for n in cpp.return_names(f)) return_types = tuple(cpp.return_type(r).remove_const_ref() for r in f.func.returns) - named_returns = [NamedCType(name, type) for name, type in zip(return_names, return_types)] + named_returns = [ + NamedCType(name, type) for name, type in zip(return_names, return_types) + ] formula, saved_inputs = saved_variables(formula, arguments, var_names) formula, saved_outputs = saved_variables(formula, named_returns, var_names) - used_named_gradients = {name for name in available_named_gradients if re.search(IDENT_REGEX.format(name), formula)} + used_named_gradients = { + name + for name in available_named_gradients + if re.search(IDENT_REGEX.format(name), formula) + } # Check that the referenced derivatives in the formula are in bounds for i in used_gradient_indices(formula): if i >= len(f.func.returns): raise RuntimeError( - f'Out of bounds grads access: derivative formula for 
{cpp.name(f.func)} ' - f'used grads[{i}], but the forward only returns {len(f.func.returns)} outputs.' + f"Out of bounds grads access: derivative formula for {cpp.name(f.func)} " + f"used grads[{i}], but the forward only returns {len(f.func.returns)} outputs." ) return Derivative( @@ -91,34 +178,43 @@ def create_derivative(f: NativeFunction, formula: str, var_names: Tuple[str, ... named_gradients=used_named_gradients, ) -def create_forward_derivative(f: NativeFunction, formula: str, names: Tuple[str, ...]) -> ForwardDerivative: - assert len(names) == 1, "Forward derivatives can define gradients for only one output at a time" - var_name = names[0] - var_type: Optional[Type] = None + +def create_forward_derivative( + f: NativeFunction, formula: str, names: Tuple[str, ...] +) -> ForwardDerivative: + var_names = names + var_types: Optional[Tuple[Type, ...]] = None for r in f.func.returns: - if r.name == var_name: - var_type = r.type - break + if r.name in var_names: + if var_types is None: + var_types = tuple() + var_types = var_types + (r.type,) + # Handle default return names - if var_type is None: - if var_name == "result": + if var_types is None: + if var_names == ("result",): assert len(f.func.returns) == 1 - var_type = f.func.returns[0].type + var_types = (f.func.returns[0].type,) else: - res = re.findall(r"^result(\d+)$", var_name) - if len(res) == 1: - arg_idx = int(res[0]) - var_type = f.func.returns[arg_idx].type - - assert var_type is not None, "No matching output for forward derivative definition" + for var_name in var_names: + res = re.findall(r"^result(\d+)$", var_name) + if len(res) == 1: + if var_types is None: + var_types = tuple() + arg_idx = int(res[0]) + var_types = var_types + (f.func.returns[arg_idx].type,) + + assert var_types is not None, "No matching output for forward derivative definition" return ForwardDerivative( formula=formula, - var_name=var_name, - var_type=var_type, + var_names=var_names, + var_types=var_types, required_inputs_fw_grad=None, required_inputs_primal=None, required_original_self_value=False, - is_reusing_outplace_formula=False) + is_reusing_outplace_formula=False, + ) + def postprocess_forward_derivatives( f: NativeFunction, @@ -126,22 +222,23 @@ def postprocess_forward_derivatives( all_arg_names: List[str], derivatives: List[Derivative], forward_derivatives: List[ForwardDerivative], - args_with_derivatives: Sequence[Binding] + args_with_derivatives: Sequence[Binding], ) -> List[ForwardDerivative]: - def find_required_inputs(formula: str, postfix: str) -> Tuple[str, ...]: required_inputs = set() for arg in args_with_derivatives: - if arg.type == 'at::TensorList': + if arg.type == "at::TensorList": # The functions taking TensorList handle everything internally continue arg_name = arg.name found = re.search(IDENT_REGEX.format(arg_name), formula) if found: - raise RuntimeError(f"The forward formula for {defn_name} is using the base name of the {arg_name} " - f"argument which is ambiguous. You should use {arg_name}_p to access the primal " - f"value and {arg_name}_t to access the tangent.") + raise RuntimeError( + f"The forward formula for {defn_name} is using the base name of the {arg_name} " + f"argument which is ambiguous. You should use {arg_name}_p to access the primal " + f"value and {arg_name}_t to access the tangent." 
+ ) found = re.search(IDENT_REGEX.format(arg_name + postfix), formula) if found: @@ -155,15 +252,23 @@ def find_required_inputs(formula: str, postfix: str) -> Tuple[str, ...]: formula = defn.formula required_inputs_tangent = find_required_inputs(formula, "_t") if formula == "auto_element_wise": - if (not len(args_with_derivatives) == 1) or len(forward_derivatives) > 1: - raise RuntimeError(f"Derivative definition of {defn_name} in derivatives.yaml defines the " - "forward definition of gradient as element_wise but this only " - "works for functions with a single differentiable input and a " - "single differentiable output.") + if ( + (not len(args_with_derivatives) == 1) + or len(forward_derivatives) > 1 + or len(forward_derivatives[0].var_names) > 1 + ): + raise RuntimeError( + f"Derivative definition of {defn_name} in derivatives.yaml defines the " + "forward definition of gradient as element_wise but this only " + "works for functions with a single differentiable input and a " + "single differentiable output." + ) if not len(derivatives) == 1: - raise RuntimeError(f"Derivative definition of {defn_name} in derivatives.yaml defines the " - "forward definition of gradient as element_wise but it does not " - "defines the gradient formula for its argument which is required.") + raise RuntimeError( + f"Derivative definition of {defn_name} in derivatives.yaml defines the " + "forward definition of gradient as element_wise but it does not " + "defines the gradient formula for its argument which is required." + ) # This transformation is based on the observation that for element-wise functions, the Jacobian # matrix is diagonal and thus doing J * v is the same as (v^T J)^T (in practice, we ignore the transpositions) # For the complex case, we use hermitian transpose and get (v.conj() J).conj() @@ -182,6 +287,7 @@ def find_required_inputs(formula: str, postfix: str) -> Tuple[str, ...]: # Do replacement 1) of the grad def repl(m: Any) -> str: return f"{m.group(1)}{input_name}_t.conj(){m.group(2)}" + fw_formula = re.sub(IDENT_REGEX.format("grad"), repl, backward_formula) # Do replacement 2) of the input variables @@ -190,6 +296,7 @@ def repl(m: Any) -> str: def repl(m: Any) -> str: return f"{m.group(1)}{arg_name}_p{m.group(2)}" + fw_formula = re.sub(IDENT_REGEX.format(arg_name), repl, fw_formula) # Do the final conjugate 3) @@ -200,10 +307,15 @@ def repl(m: Any) -> str: required_inputs_tangent = tuple(all_arg_names) formula = fw_formula elif formula == "auto_linear": - if len(forward_derivatives) > 1: - raise RuntimeError(f"Derivative definition of {defn_name} in derivatives.yaml defines the " - "forward definition of gradient as linear but this only works " - "for functions with a single differentiable output.") + if ( + len(forward_derivatives) > 1 + or len(forward_derivatives[0].var_names) > 1 + ): + raise RuntimeError( + f"Derivative definition of {defn_name} in derivatives.yaml defines the " + "forward definition of gradient as linear but this only works " + "for functions with a single differentiable output." + ) # This transformation is based on the observation that linear functions can be written as: # y = f(x) = A * x # For some matrix A and the Jacobian of the function f is also A. 
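The auto_element_wise branch above spells out the trick: for an element-wise op the Jacobian is diagonal, so the forward-mode formula can be recovered from the backward formula by 1) substituting the (conjugated) input tangent for grad, 2) renaming the input to its primal, and 3) conjugating the whole expression. A self-contained sketch of those three rewrites; IDENT is a simplified stand-in for IDENT_REGEX and the backward formula is just an example.

import re

IDENT = r"(^|\W){}($|\W)"  # simplified stand-in for torchgen's IDENT_REGEX

def auto_element_wise(backward_formula, input_name):
    # 1) grad -> <input>_t.conj()
    fw = re.sub(IDENT.format("grad"),
                lambda m: f"{m.group(1)}{input_name}_t.conj(){m.group(2)}",
                backward_formula)
    # 2) <input> -> <input>_p
    fw = re.sub(IDENT.format(input_name),
                lambda m: f"{m.group(1)}{input_name}_p{m.group(2)}",
                fw)
    # 3) conjugate the result
    return f"({fw}).conj()"

print(auto_element_wise("grad * self.sgn()", "self"))
# -> (self_t.conj() * self_p.sgn()).conj()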
@@ -229,7 +341,9 @@ def repl(m: Any) -> str: fw_formula = "at::{}({})".format(defn_name, ", ".join(new_args)) else: assert Variant.method in f.variants - fw_formula = "{}.{}({})".format(new_args[0], defn_name, ", ".join(new_args[1:])) + fw_formula = "{}.{}({})".format( + new_args[0], defn_name, ", ".join(new_args[1:]) + ) # All of the input tangents are always used so all of them are required here. required_inputs_tangent = tuple(diff_arg_names) @@ -241,26 +355,31 @@ def repl(m: Any) -> str: # This call inspects the formula to find for which input's primal are used. required_inputs_primal = find_required_inputs(formula, "_p") - updated_derivatives.append(ForwardDerivative( - formula=formula, - var_name=defn.var_name, - var_type=defn.var_type, - required_inputs_fw_grad=required_inputs_tangent, - required_inputs_primal=required_inputs_primal, - required_original_self_value=False, - is_reusing_outplace_formula=False)) + updated_derivatives.append( + ForwardDerivative( + formula=formula, + var_names=defn.var_names, + var_types=defn.var_types, + required_inputs_fw_grad=required_inputs_tangent, + required_inputs_primal=required_inputs_primal, + required_original_self_value=False, + is_reusing_outplace_formula=False, + ) + ) return updated_derivatives -def is_forward_derivative_definition(all_arg_names: List[str], names: Tuple[str, ...]) -> bool: - if len(names) > 1: - # Forward definition are always for a single output at a time - return False - name = names[0] - if name not in all_arg_names: - return True - else: - return False + +def is_forward_derivative_definition( + all_arg_names: List[str], names: Tuple[str, ...] +) -> bool: + for name in names: + if name not in all_arg_names: + return True + else: + return False + raise RuntimeError("Expected `names` to be non-empty") + def create_differentiability_info( defn: Dict[Any, Any], @@ -270,17 +389,19 @@ def create_differentiability_info( ) -> DifferentiabilityInfo: """Processes a single entry `defn` in derivatives.yaml""" - def canonical_function(functions: Sequence[NativeFunction], name: str) -> NativeFunction: + def canonical_function( + functions: Sequence[NativeFunction], name: str + ) -> NativeFunction: for f in functions: if cpp.name(f.func) == name: return f # some functions only have in-place variants - assert name + '_' == cpp.name(functions[0].func) + assert name + "_" == cpp.name(functions[0].func) return functions[0] def split_names(raw_names: str) -> Tuple[str, ...]: """Given "foo, bar", return ["foo", "bar"].""" - return tuple(x.strip() for x in raw_names.split(',')) + return tuple(x.strip() for x in raw_names.split(",")) def check_grad_usage(defn_name: str, derivatives: Sequence[Derivative]) -> None: """ @@ -289,14 +410,16 @@ def check_grad_usage(defn_name: str, derivatives: Sequence[Derivative]) -> None: used with double backwards. 
""" - uses_grad = False # true if any derivative uses "grad" - num_grads_uses = 0 # count of uses of "grads" or "grads[INDEX]" - uses_named_grads = False # true if any derivative uses "grad_{name}" + uses_grad = False # true if any derivative uses "grad" + num_grads_uses = 0 # count of uses of "grads" or "grads[INDEX]" + uses_named_grads = False # true if any derivative uses "grad_{name}" used_grads_indices: List[int] = [] # which indices of grads are used for d in derivatives: formula = d.formula - uses_grad = uses_grad or bool(re.findall(IDENT_REGEX.format('grad'), formula)) - num_grads_uses += len(re.findall(IDENT_REGEX.format('grads'), formula)) + uses_grad = uses_grad or bool( + re.findall(IDENT_REGEX.format("grad"), formula) + ) + num_grads_uses += len(re.findall(IDENT_REGEX.format("grads"), formula)) uses_named_grads = uses_named_grads or bool(d.named_gradients) used_grads_indices.extend(used_gradient_indices(formula)) # This is a basic sanity check: the number of places we see @@ -309,26 +432,32 @@ def check_grad_usage(defn_name: str, derivatives: Sequence[Derivative]) -> None: only_used_grads_indices = num_grads_uses == len(used_grads_indices) if uses_grad and num_grads_uses > 0: - raise RuntimeError(f"Derivative definition of {defn_name} in derivatives.yaml illegally " - "mixes use of 'grad' and 'grads'. Consider replacing " - "occurrences of 'grad' with 'grads[0]'") + raise RuntimeError( + f"Derivative definition of {defn_name} in derivatives.yaml illegally " + "mixes use of 'grad' and 'grads'. Consider replacing " + "occurrences of 'grad' with 'grads[0]'" + ) if only_used_grads_indices and set(used_grads_indices) == {0}: - raise RuntimeError(f"Derivative definition of {defn_name} in derivatives.yaml solely " - "refers to 'grads[0]'. If the first output is indeed the " - "only differentiable output, replace 'grads[0]' with 'grad'; " - "otherwise, there is a likely error in your derivatives " - "declaration.") + raise RuntimeError( + f"Derivative definition of {defn_name} in derivatives.yaml solely " + "refers to 'grads[0]'. If the first output is indeed the " + "only differentiable output, replace 'grads[0]' with 'grad'; " + "otherwise, there is a likely error in your derivatives " + "declaration." + ) if uses_named_grads and (uses_grad or num_grads_uses > 0): raise RuntimeError( - f'Derivative definition of {defn_name} in derivatives.yaml illegally ' + f"Derivative definition of {defn_name} in derivatives.yaml illegally " 'mixes use of "grad_RETURN_NAME" and "grad" or "grads[x]". Use ' - 'only one method for identifying gradients.') - + "only one method for identifying gradients." + ) @with_native_function - def set_up_derivatives(f: NativeFunction) -> Tuple[ + def set_up_derivatives( + f: NativeFunction, + ) -> Tuple[ Sequence[Derivative], Sequence[ForwardDerivative], Sequence[Binding], @@ -342,7 +471,9 @@ def set_up_derivatives(f: NativeFunction) -> Tuple[ args_with_derivatives_set: Set[str] = set() all_arg_names = [a.name for a in cpp_arguments(f)] - + all_ret_names = [ + r.name for r in f.func.returns + ] # only used for the assert below # output_differentiability is captured from the enclosed # scope. Don't modify it. # @@ -355,72 +486,104 @@ def set_up_derivatives(f: NativeFunction) -> Tuple[ differentiability = output_differentiability or [True] * len(f.func.returns) # A return is available as a named gradient ... 
available_named_gradients = [ - f'grad_{ret.name}' for ret, differentiable in zip(f.func.returns, differentiability) + f"grad_{ret.name}" + for ret, differentiable in zip(f.func.returns, differentiability) # if it has not been explicitly made undifferentiable if differentiable # and if it has a name and ret.name is not None # and if its type is differentiable - and ret.type.is_tensor_like()] + and ret.type.is_tensor_like() + ] for raw_names in sorted(defn.keys()): formula = defn[raw_names] names = split_names(raw_names) + for name in names: + assert not (name in all_arg_names and name in all_ret_names), ( + f"While processing the derivative formula for '{f.func.name}' wrt '{name}', " + f"expected '{name}' to not be both an input arg and named return. " + ) + if is_forward_derivative_definition(all_arg_names, names): forward_derivatives.append(create_forward_derivative(f, formula, names)) else: - if formula.lower().strip() == 'non_differentiable': + if formula.lower().strip() == "non_differentiable": non_differentiable_arg_names += names else: - derivative = create_derivative(f, formula, names, - available_named_gradients) + derivative = create_derivative( + f, formula, names, available_named_gradients + ) derivatives.append(derivative) args_with_derivatives_set |= set(names) overlap = args_with_derivatives_set.intersection(non_differentiable_arg_names) if overlap: - raise RuntimeError(f'derivatives definition for {defn} have overlapped non_differentiable ' - f'and differentiable variables: {overlap}') + raise RuntimeError( + f"derivatives definition for {defn} have overlapped non_differentiable " + f"and differentiable variables: {overlap}" + ) # Next, let us determine the list of inputs in order. # TODO: do we need eagerly calculate and save it here? Can it be derived # from NativeFunction and `derivatives` on callsites instead? - args_with_derivatives = [a for a in cpp_arguments(f) if a.name in args_with_derivatives_set] + args_with_derivatives = [ + a for a in cpp_arguments(f) if a.name in args_with_derivatives_set + ] # Postprocess forward derivatives definitions now that we know the differentiable arguments - forward_derivatives = postprocess_forward_derivatives(f, defn_name, all_arg_names, derivatives, - forward_derivatives, args_with_derivatives) + forward_derivatives = postprocess_forward_derivatives( + f, + defn_name, + all_arg_names, + derivatives, + forward_derivatives, + args_with_derivatives, + ) # Test to see if the use of 'grads' makes sense. check_grad_usage(defn_name, derivatives) - return (derivatives, forward_derivatives, args_with_derivatives, - non_differentiable_arg_names, available_named_gradients) + return ( + derivatives, + forward_derivatives, + args_with_derivatives, + non_differentiable_arg_names, + available_named_gradients, + ) # NB: Removes 'name' from defn dictionary - specification = defn.pop('name') + specification = defn.pop("name") defn_name, _ = split_name_params(specification) # NB: Removes 'output_differentiability' from defn dictionary # `None` means all differentiable. 
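As a concrete illustration of the shapes handled above: each entry carries a schema under name, and every other key is a comma-separated list of the variables a formula covers. split_names below is copied from the helper above; split_defn_name is a hypothetical simplification of split_name_params, shown only to make the example self-contained.

def split_names(raw_names):
    # "foo, bar" -> ("foo", "bar"), as in split_names above
    return tuple(x.strip() for x in raw_names.split(","))

def split_defn_name(specification):
    # Hypothetical: keep the operator name in front of the parameter list
    return specification.split("(", 1)[0]

print(split_names("self, other"))                     # ('self', 'other')
print(split_defn_name("abs(Tensor self) -> Tensor"))  # abs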
- output_differentiability = defn.pop('output_differentiability', None) + output_differentiability = defn.pop("output_differentiability", None) output_differentiability_conditions = None - if output_differentiability and any([isinstance(diff, str) for diff in output_differentiability]): + if output_differentiability and any( + [isinstance(diff, str) for diff in output_differentiability] + ): if len(output_differentiability) != 1: - raise RuntimeError(f'Not supported: for {specification},' - f'output_differentiability must either be ' - f'List[bool] or a List[str] where each str is a ' - f'condition. In the case where it is a condition, ' - f'we only support single-output functions. ' - f'Please file us an issue. ') + raise RuntimeError( + f"Not supported: for {specification}," + f"output_differentiability must either be " + f"List[bool] or a List[str] where each str is a " + f"condition. In the case where it is a condition, " + f"we only support single-output functions. " + f"Please file us an issue. " + ) output_differentiability_conditions = output_differentiability output_differentiability = [True] schema_function = functions_by_schema.get(specification) if not schema_function: - avail = '\n'.join(k for k, v in functions_by_schema.items() if cpp.name(v.func) == defn_name) - raise RuntimeError(f'could not find ATen function for schema: {specification} ' - f'. Available signatures:\n{avail}') + avail = "\n".join( + k for k, v in functions_by_schema.items() if cpp.name(v.func) == defn_name + ) + raise RuntimeError( + f"could not find ATen function for schema: {specification} " + f". Available signatures:\n{avail}" + ) # now map this to the legacy schema; this isn't technically necessary, but we'd need some logic here # to map in-place schemas to the out-of-place variants. @@ -428,24 +591,39 @@ def set_up_derivatives(f: NativeFunction) -> Tuple[ signature = schema_function.func.signature() functions = functions_by_signature[signature] if len(functions) == 0: - avail = '\n'.join(str(k) for k, v in functions_by_signature.items() if cpp.name(k) == defn_name) - raise RuntimeError(f'could not find ATen function for legacy signature: {signature} ' - f'corresponding to schema {specification}. Please report a bug to PyTorch. ' - f'Available signatures:\n{avail}') + avail = "\n".join( + str(k) + for k, v in functions_by_signature.items() + if cpp.name(k) == defn_name + ) + raise RuntimeError( + f"could not find ATen function for legacy signature: {signature} " + f"corresponding to schema {specification}. Please report a bug to PyTorch. " + f"Available signatures:\n{avail}" + ) canonical = canonical_function(functions, defn_name) - if 'grad_input_mask' in (a.name for a in cpp_arguments(canonical)): - raise RuntimeError(f"Schema for {defn_name} has an argument named grad_input_mask, " - "but this name would be shadowed by our codegen. " - "Please use a different name in native_functions.yaml.") - - if 'result' in (a.name for a in cpp_arguments(canonical)): - raise RuntimeError(f"Schema for {defn_name} has an argument named result, " - "but this is only allowed for outputs." - "Please use a different name in native_functions.yaml.") - - (derivatives, forward_derivatives, args_with_derivatives, - non_differentiable_arg_names, available_named_gradients) = set_up_derivatives(canonical) + if "grad_input_mask" in (a.name for a in cpp_arguments(canonical)): + raise RuntimeError( + f"Schema for {defn_name} has an argument named grad_input_mask, " + "but this name would be shadowed by our codegen. 
" + "Please use a different name in native_functions.yaml." + ) + + if "result" in (a.name for a in cpp_arguments(canonical)): + raise RuntimeError( + f"Schema for {defn_name} has an argument named result, " + "but this is only allowed for outputs." + "Please use a different name in native_functions.yaml." + ) + + ( + derivatives, + forward_derivatives, + args_with_derivatives, + non_differentiable_arg_names, + available_named_gradients, + ) = set_up_derivatives(canonical) used_named_gradients: Set[str] = set() for d in derivatives: @@ -455,7 +633,7 @@ def set_up_derivatives(f: NativeFunction) -> Tuple[ op = None if args_with_derivatives: op_prefix = _create_op_prefix(defn_name) - op = f'{op_prefix}{op_counter[op_prefix]}' + op = f"{op_prefix}{op_counter[op_prefix]}" op_counter[op_prefix] += 1 return DifferentiabilityInfo( @@ -474,7 +652,9 @@ def set_up_derivatives(f: NativeFunction) -> Tuple[ output_differentiability_conditions=output_differentiability_conditions, ) -GRAD_INDEX_REGEX = r'(?:^|\W)grads\[(\d+)\]' + +GRAD_INDEX_REGEX = r"(?:^|\W)grads\[(\d+)\]" + def used_gradient_indices(formula: str) -> List[int]: """Determine a list of gradient indices (the i in grads[i]) that @@ -485,106 +665,167 @@ def used_gradient_indices(formula: str) -> List[int]: """ return [int(i) for i in re.findall(GRAD_INDEX_REGEX, formula)] + def saved_variables( formula: str, nctypes: List[NamedCType], var_names: Tuple[str, ...], ) -> Tuple[str, Tuple[SavedAttribute, ...]]: - def stride_expr(name: str) -> str: assert var_names == (name,), ( 'Replacement for ".strides()" is currently only supported for single derivatives of the same tensor ' - 'that ".strides()" is being called on.') + 'that ".strides()" is being called on.' + ) return f'strides_or_error({name}, "{name}")' REPLACEMENTS: List[Tuple[str, Dict[str, Any]]] = [ # replace self.sizes() with self_sizes - (r'{}.sizes\(\)', { - 'suffix': '_sizes', - 'nctype': lambda name: NamedCType(name, BaseCType(intArrayRefT)), - }), + ( + r"{}.sizes\(\)", + { + "suffix": "_sizes", + "nctype": lambda name: NamedCType(name, BaseCType(intArrayRefT)), + }, + ), # replace self->sizes() with self_sizes_opt - (r'{}->sizes\(\)', { - 'suffix': '_sizes_opt', - 'nctype': lambda name: NamedCType(name, OptionalCType(BaseCType(intArrayRefT))), - 'expr': lambda name: f'{name}.has_value() ? c10::optional({name}->sizes()) : c10::nullopt', - }), + ( + r"{}->sizes\(\)", + { + "suffix": "_sizes_opt", + "nctype": lambda name: NamedCType( + name, OptionalCType(BaseCType(intArrayRefT)) + ), + "expr": lambda name: f"{name}.has_value() ? 
c10::optional({name}->sizes()) : c10::nullopt", + }, + ), # replace self.options() with self_options - (r'{}.options\(\)', { - 'suffix': '_options', - 'nctype': lambda name: NamedCType(name, BaseCType(tensorOptionsT)), - }), + ( + r"{}.options\(\)", + { + "suffix": "_options", + "nctype": lambda name: NamedCType(name, BaseCType(tensorOptionsT)), + }, + ), # replace zeros_like(self) with self_info - (r'zeros_like\({}\)', { - 'suffix': '_info', - 'nctype': lambda name: NamedCType(name, BaseCType(typeAndSizeT)), - 'expr': lambda name: name, # at save-time - 'res': lambda name: name + '_info.zeros()', # at eval-time - }), + ( + r"zeros_like\({}\)", + { + "suffix": "_info", + "nctype": lambda name: NamedCType(name, BaseCType(typeAndSizeT)), + "expr": lambda name: name, # at save-time + "res": lambda name: name + "_info.zeros()", # at eval-time + }, + ), # replace self.size(2) with self_size_2 - (r'{}.size\((\w+)\)', { - 'suffix': lambda m: '_argsize_{}'.format(*m.groups()), - 'nctype': lambda name: NamedCType(name, BaseCType(longT)), - }), + ( + r"{}.size\((\w+)\)", + { + "suffix": lambda m: "_argsize_{}".format(*m.groups()), + "nctype": lambda name: NamedCType(name, BaseCType(longT)), + }, + ), # replace self.numel() with self_numel - (r'{}.numel\(\)', { - 'suffix': '_numel', - 'nctype': lambda name: NamedCType(name, BaseCType(longT)), - }), + ( + r"{}.numel\(\)", + { + "suffix": "_numel", + "nctype": lambda name: NamedCType(name, BaseCType(longT)), + }, + ), # replace to_args_sizes(self) with self_args_sizes - (r'to_args_sizes\({}\)', { - 'suffix': '_args_sizes', - 'nctype': lambda name: NamedCType(name, VectorCType(VectorCType(BaseCType(longT)))), - }), + ( + r"to_args_sizes\({}\)", + { + "suffix": "_args_sizes", + "nctype": lambda name: NamedCType( + name, VectorCType(VectorCType(BaseCType(longT))) + ), + }, + ), # replace to_args_scalartypes(self) with self_args_scalartypes - (r'to_args_scalartypes\({}\)', { - 'suffix': '_args_scalartypes', - 'nctype': lambda name: NamedCType(name, VectorCType(BaseCType(scalarTypeT))), - }), + ( + r"to_args_scalartypes\({}\)", + { + "suffix": "_args_scalartypes", + "nctype": lambda name: NamedCType( + name, VectorCType(BaseCType(scalarTypeT)) + ), + }, + ), # replace TensorGeometry(self) with self_geometry - (r'TensorGeometry\({}\)', { - 'suffix': '_geometry', - 'nctype': lambda name: NamedCType(name, BaseCType(tensorGeometryT)), - }), - (r'{}.scalar_type\(\)', { - 'suffix': '_scalar_type', - 'nctype': lambda name: NamedCType(name, BaseCType(scalarTypeT)), - }), + ( + r"TensorGeometry\({}\)", + { + "suffix": "_geometry", + "nctype": lambda name: NamedCType(name, BaseCType(tensorGeometryT)), + }, + ), + ( + r"{}.scalar_type\(\)", + { + "suffix": "_scalar_type", + "nctype": lambda name: NamedCType(name, BaseCType(scalarTypeT)), + }, + ), # replace self.dim() with self_dim - (r'{}.dim\(\)', { - 'suffix': '_dim', - 'nctype': lambda name: NamedCType(name, BaseCType(longT)), - }), + ( + r"{}.dim\(\)", + { + "suffix": "_dim", + "nctype": lambda name: NamedCType(name, BaseCType(longT)), + }, + ), # replace self.strides() with self_strides - (r'{}.strides\(\)', { - 'suffix': '_strides', - 'nctype': lambda name: NamedCType(name, BaseCType(intArrayRefT)), - 'expr': stride_expr, - }), + ( + r"{}.strides\(\)", + { + "suffix": "_strides", + "nctype": lambda name: NamedCType(name, BaseCType(intArrayRefT)), + "expr": stride_expr, + }, + ), + # replace self.layout() with self_layout + ( + r"{}.layout\(\)", + { + "suffix": "_layout", + "nctype": lambda name: 
NamedCType(name, BaseCType(layoutT)), + }, + ), # replace self.is_conj() with self_conjugate - (r'{}.is_conj\(\)', { - 'suffix': '_conjugate', - 'nctype': lambda name: NamedCType(name, BaseCType(boolT)), - }) + ( + r"{}.is_conj\(\)", + { + "suffix": "_conjugate", + "nctype": lambda name: NamedCType(name, BaseCType(boolT)), + }, + ), ] # find which arguments need to be saved saved: List[SavedAttribute] = [] for nctype in nctypes: - name = nctype.name.name if isinstance(nctype.name, SpecialArgName) else nctype.name + name = ( + nctype.name.name if isinstance(nctype.name, SpecialArgName) else nctype.name + ) # First search the formula for expressions which can be evaluated # when the autograd Function is created to avoid saving variables for regex, info in REPLACEMENTS: + def repl(m: Match[str]) -> str: - suffix: str = info['suffix'](m) if callable(info['suffix']) else info['suffix'] - expr: str = info['expr'](name) if 'expr' in info else m.group(0) - saved.append(SavedAttribute( - nctype=info['nctype'](name + suffix), - expr=expr, - )) - if 'res' in info: - replacement: str = info['res'](name) + suffix: str = ( + info["suffix"](m) if callable(info["suffix"]) else info["suffix"] + ) + expr: str = info["expr"](name) if "expr" in info else m.group(0) + saved.append( + SavedAttribute( + nctype=info["nctype"](name + suffix), + expr=expr, + ) + ) + if "res" in info: + replacement: str = info["res"](name) return replacement return name + suffix @@ -595,19 +836,23 @@ def repl(m: Match[str]) -> str: # the backward function if nctype.type == OptionalCType(BaseCType(stringT)): formula = re.sub( - rf'\b{name}\b', - f'{name}.has_value() ? c10::optional({name}.value()) : c10::nullopt', - formula) + rf"\b{name}\b", + f"{name}.has_value() ? c10::optional({name}.value()) : c10::nullopt", + formula, + ) # Find any variables which remain in the formula and save them if re.search(IDENT_REGEX.format(name), formula): - saved.append(SavedAttribute( - nctype=nctype, - expr=name, - )) + saved.append( + SavedAttribute( + nctype=nctype, + expr=name, + ) + ) return formula, tuple(saved) + def _create_op_prefix(name: str) -> str: """Takes a native function name converts to a op prefix name. 
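For intuition on the REPLACEMENTS table reformatted above: each entry pairs a regex (instantiated with an argument name) with the saved attribute that replaces the matched expression in a derivative formula. Below is a minimal, self-contained sketch of just the string-rewriting step, with made-up patterns and names; the real code additionally records SavedAttribute entries with their C++ NamedCType.

import re

# Simplified stand-in for REPLACEMENTS: regex template -> suffix of the saved name.
SIMPLE_REPLACEMENTS = [
    (r"{}\.sizes\(\)", "_sizes"),  # self.sizes() -> self_sizes
    (r"{}\.numel\(\)", "_numel"),  # self.numel() -> self_numel
    (r"{}\.dim\(\)", "_dim"),      # self.dim()   -> self_dim
]

def rewrite_formula(formula: str, arg_name: str) -> str:
    # Replace save-time expressions with the names of their saved values.
    for pattern, suffix in SIMPLE_REPLACEMENTS:
        formula = re.sub(pattern.format(arg_name), arg_name + suffix, formula)
    return formula

print(rewrite_formula("grad * self.sizes()[0] / self.numel()", "self"))
# grad * self_sizes[0] / self_numel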
@@ -621,15 +866,19 @@ def _create_op_prefix(name: str) -> str: >>> _create_op_prefix('add') 'AddBackward' """ - camel_case = ''.join([p.title() for p in name.split('_')]) - return (camel_case + 'Backward').replace('ForwardBackward', 'Backward') + camel_case = "".join([p.title() for p in name.split("_")]) + return (camel_case + "Backward").replace("ForwardBackward", "Backward") def dedup_vars(vars: Sequence[SavedAttribute]) -> Sequence[SavedAttribute]: seen: Set[str] = set() saved: List[SavedAttribute] = [] for var in vars: - name = var.nctype.name.name if isinstance(var.nctype.name, SpecialArgName) else var.nctype.name + name = ( + var.nctype.name.name + if isinstance(var.nctype.name, SpecialArgName) + else var.nctype.name + ) if name in seen: continue seen.add(name) diff --git a/tools/autograd/templates/python_nn_functions.cpp b/tools/autograd/templates/python_nn_functions.cpp index 5465e6214387..13b3d47cf448 100644 --- a/tools/autograd/templates/python_nn_functions.cpp +++ b/tools/autograd/templates/python_nn_functions.cpp @@ -12,6 +12,7 @@ #include "torch/csrc/utils/pycfunction_helpers.h" #include "torch/csrc/utils/python_arg_parser.h" #include "torch/csrc/utils/structseq.h" +#include "torch/csrc/utils/tensor_memoryformats.h" #ifndef AT_PER_OPERATOR_HEADERS #include @@ -43,7 +44,7 @@ static PyObject * THPVariable__parse_to(PyObject* module, PyObject* args, PyObje ParsedArgs<5> parsed_args; auto r = parser.parse(args, kwargs, parsed_args); if (r.has_torch_function()) { - return handle_torch_function(r, args, kwargs, THPNNVariableFunctionsModule, "torch.nn"); + return handle_torch_function(r, args, kwargs, THPNNVariableFunctionsModule, "torch.nn", "_parse_to"); } auto parsed = parse_to_conversion(r, /*allow_copy*/ false); // we don't want copy for nn.Module.to auto& device = std::get<0>(parsed); @@ -66,7 +67,7 @@ static PyObject * THPVariable__parse_to(PyObject* module, PyObject* args, PyObje } PyTuple_SET_ITEM(tuple.get(), 2, torch::autograd::utils::wrap(non_blocking)); if (opt_memory_format.has_value()) { - PyTuple_SET_ITEM(tuple.get(), 3, THPMemoryFormat_New(opt_memory_format.value(), "unused_name")); + PyTuple_SET_ITEM(tuple.get(), 3, torch::utils::getTHPMemoryFormat(opt_memory_format.value()).release().ptr()); } else { Py_INCREF(Py_None); PyTuple_SET_ITEM(tuple.get(), 3, Py_None); diff --git a/tools/autograd/templates/python_variable_methods.cpp b/tools/autograd/templates/python_variable_methods.cpp index b3d6ae705c51..ad14d2c7c20c 100644 --- a/tools/autograd/templates/python_variable_methods.cpp +++ b/tools/autograd/templates/python_variable_methods.cpp @@ -231,7 +231,11 @@ static PyObject * THPVariable_numel(PyObject* self, PyObject* args) return handle_torch_function(self, "numel", args); } auto& self_ = THPVariable_Unpack(self); - return THPUtils_packInt64(self_.numel()); + if (jit::tracer::isTracing()) { + return wrap(jit::tracer::getNumelOf(self_)); + } else { + return THPUtils_packInt64(self_.numel()); + } END_HANDLE_TH_ERRORS } @@ -541,6 +545,28 @@ static PyObject * THPVariable_xpu(PyObject* self, PyObject* args, PyObject* kwar END_HANDLE_TH_ERRORS } +static PyObject * THPVariable_ipu(PyObject* self, PyObject* args, PyObject* kwargs) +{ + HANDLE_TH_ERRORS + static PythonArgParser parser({ + "ipu(Device? device=None, bool non_blocking=False, *, MemoryFormat? memory_format=None)", + "ipu(Device? device=None, bool async=False, *, MemoryFormat? 
memory_format=None)|deprecated" + }); + auto& self_ = THPVariable_Unpack(self); + ParsedArgs<3> parsed_args; + auto r = parser.parse(self, args, kwargs, parsed_args); + + if (r.has_torch_function()) { + return handle_torch_function(r, self, args, kwargs, THPVariableClass, "torch.Tensor"); + } + + auto device = r.isNone(0) ? at::Device(at::DeviceType::IPU) : r.device(0); + auto opt_memory_format = r.memoryformatOptional(2); + TORCH_CHECK(device.is_ipu(), "Invalid device, must be ipu device"); + return THPVariable_Wrap(dispatch_to(self_, device, r.toBool(1), false, opt_memory_format)); + END_HANDLE_TH_ERRORS +} + static PyObject * THPVariable_to_type(PyObject* self, ScalarType scalarType, c10::optional optional_memory_format) { HANDLE_TH_ERRORS auto& self_ = THPVariable_Unpack(self); @@ -1091,6 +1117,7 @@ static PyObject* THPVariable_set_( "set_(Storage source)", "set_(Storage source, int64_t storage_offset, IntArrayRef size, IntArrayRef stride=None)", "set_(Tensor source)", + "set_(Tensor source, int64_t storage_offset, IntArrayRef size, IntArrayRef stride=None)", }, /*traceable=*/false); @@ -1114,7 +1141,7 @@ static PyObject* THPVariable_set_( at::Storage storage = _r.storage(0, storage_scalar_type, is_typed_storage); TORCH_CHECK(storage_scalar_type == self.dtype() || !is_typed_storage, "Expected a Storage of type ", self.dtype(), - " or an UntypedStorage, but got type ", storage_scalar_type, + " or an _UntypedStorage, but got type ", storage_scalar_type, " for argument 1 'storage'"); auto dispatch_set_ = [](const Tensor& self, Storage source) -> Tensor { pybind11::gil_scoped_release no_gil; @@ -1130,7 +1157,7 @@ static PyObject* THPVariable_set_( at::Storage storage = _r.storage(0, storage_scalar_type, is_typed_storage); TORCH_CHECK(storage_scalar_type == self.dtype() || !is_typed_storage, "Expected a Storage of type ", self.dtype(), - " or an UntypedStorage, but got type ", storage_scalar_type, + " or an _UntypedStorage, but got type ", storage_scalar_type, " for argument 1 'storage'"); auto dispatch_set_ = [](const Tensor& self, Storage source, @@ -1152,6 +1179,21 @@ static PyObject* THPVariable_set_( }; return wrap(dispatch_set_(self, _r.tensor(0))); } + case 4: { + // aten::set_.source_Tensor_storage_offset(Tensor(a!) self, Tensor + // source, int storage_offset, int[] size, int[] stride=[]) -> Tensor(a!) 
+ at::Tensor storage = _r.tensor(0); + auto dispatch_set_ = [](const Tensor& self, + const Tensor& source, + int64_t storage_offset, + IntArrayRef size, + IntArrayRef stride) -> Tensor { + pybind11::gil_scoped_release no_gil; + return self.set_(source, storage_offset, size, stride); + }; + return wrap(dispatch_set_( + self, storage, _r.toInt64(1), _r.intlist(2), _r.intlist(3))); + } } Py_RETURN_NONE; END_HANDLE_TH_ERRORS @@ -1205,6 +1247,7 @@ PyMethodDef variable_methods[] = { {"cpu", castPyCFunctionWithKeywords(THPVariable_cpu), METH_VARARGS | METH_KEYWORDS, NULL}, {"cuda", castPyCFunctionWithKeywords(THPVariable_cuda), METH_VARARGS | METH_KEYWORDS, NULL}, {"xpu", castPyCFunctionWithKeywords(THPVariable_xpu), METH_VARARGS | METH_KEYWORDS, NULL}, + {"ipu", castPyCFunctionWithKeywords(THPVariable_ipu), METH_VARARGS | METH_KEYWORDS, NULL}, {"data_ptr", THPVariable_data_ptr, METH_NOARGS, NULL}, {"dim", THPVariable_dim, METH_NOARGS, NULL}, {"has_names", THPVariable_has_names, METH_NOARGS, NULL}, diff --git a/tools/bazel.bzl b/tools/bazel.bzl index b932b812c322..75216430b2e4 100644 --- a/tools/bazel.bzl +++ b/tools/bazel.bzl @@ -1,16 +1,39 @@ -load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test") +load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_test") load("@rules_cuda//cuda:defs.bzl", "requires_cuda_enabled") load("//c10/macros:cmake_configure_file.bzl", "cmake_configure_file") +load("//tools/config:defs.bzl", "if_cuda") + +def _genrule(**kwds): + if _enabled(**kwds): + native.genrule(**kwds) + +def _py_library(name, **kwds): + deps = [dep for dep in kwds.pop("deps", []) if dep != None] + native.py_library(name = name, deps = deps, **kwds) + +def _requirement(_pypi_project): + return None # Rules implementation for the Bazel build system. Since the common # build structure aims to replicate Bazel as much as possible, most of # the rules simply forward to the Bazel definitions. rules = struct( + cc_binary = cc_binary, cc_library = cc_library, cc_test = cc_test, cmake_configure_file = cmake_configure_file, filegroup = native.filegroup, + genrule = _genrule, glob = native.glob, + if_cuda = if_cuda, + py_binary = native.py_binary, + py_library = _py_library, + requirement = _requirement, requires_cuda_enabled = requires_cuda_enabled, select = select, + test_suite = native.test_suite, ) + +def _enabled(tags = [], **_kwds): + """Determines if the target is enabled.""" + return "-bazel" not in tags diff --git a/tools/build_defs/fb_xplat_genrule.bzl b/tools/build_defs/fb_xplat_genrule.bzl new file mode 100644 index 000000000000..ddc19b2373e9 --- /dev/null +++ b/tools/build_defs/fb_xplat_genrule.bzl @@ -0,0 +1,5 @@ +def fb_xplat_genrule(default_outs = ["."], **kwargs): + genrule( + # default_outs=default_outs, # only needed for internal BUCK + **kwargs + ) diff --git a/tools/build_defs/glob_defs.bzl b/tools/build_defs/glob_defs.bzl new file mode 100644 index 000000000000..a0eea247e839 --- /dev/null +++ b/tools/build_defs/glob_defs.bzl @@ -0,0 +1,89 @@ +"""Provides utility macros for working with globs.""" + +load("@bazel_skylib//lib:paths.bzl", "paths") + +def subdir_glob(glob_specs, exclude = None, prefix = ""): + """Returns a dict of sub-directory relative paths to full paths. + + The subdir_glob() function is useful for defining header maps for C/C++ + libraries which should be relative the given sub-directory. + Given a list of tuples, the form of (relative-sub-directory, glob-pattern), + it returns a dict of sub-directory relative paths to full paths. 
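The subdir_glob docstring continues below; as a concrete illustration of the mapping it describes, here is a pure-Python sketch of the key computation in _single_subdir_glob. native.glob() is Bazel-only, so the matched file list and names here are made up.

from os import path

matched = ["include/foo/a.h", "include/foo/b.h"]  # pretend native.glob() result
dirpath, prefix = "include", "mylib"

mapping = {}
for f in matched:
    key = f[len(dirpath) + 1:] if dirpath else f  # strip the "include/" prefix
    if prefix:
        key = path.join(prefix, key)              # prepend "mylib/"
    mapping[key] = f

print(mapping)
# {'mylib/foo/a.h': 'include/foo/a.h', 'mylib/foo/b.h': 'include/foo/b.h'}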
+ + Please refer to native.glob() for explanations and examples of the pattern. + + Args: + glob_specs: The array of tuples in form of + (relative-sub-directory, glob-pattern inside relative-sub-directory). + type: List[Tuple[str, str]] + exclude: A list of patterns to identify files that should be removed + from the set specified by the first argument. Defaults to []. + type: Optional[List[str]] + prefix: If is not None, prepends it to each key in the dictionary. + Defaults to None. + type: Optional[str] + + Returns: + A dict of sub-directory relative paths to full paths. + """ + if exclude == None: + exclude = [] + + results = [] + + for dirpath, glob_pattern in glob_specs: + results.append( + _single_subdir_glob(dirpath, glob_pattern, exclude, prefix), + ) + + return _merge_maps(*results) + +def _merge_maps(*file_maps): + result = {} + for file_map in file_maps: + for key in file_map: + if key in result and result[key] != file_map[key]: + fail( + "Conflicting files in file search paths. " + + "\"%s\" maps to both \"%s\" and \"%s\"." % + (key, result[key], file_map[key]), + ) + + result[key] = file_map[key] + + return result + +def _single_subdir_glob(dirpath, glob_pattern, exclude = None, prefix = None): + if exclude == None: + exclude = [] + results = {} + files = native.glob([paths.join(dirpath, glob_pattern)], exclude = exclude) + for f in files: + if dirpath: + key = f[len(dirpath) + 1:] + else: + key = f + if prefix: + key = paths.join(prefix, key) + results[key] = f + + return results + +# Using a flat list will trigger build errors on Android. +# cxx_library will generate an apple_library on iOS, a cxx_library on Android. +# Those rules have different behaviors. Using a map will make the behavior consistent. +# +def glob_private_headers(glob_patterns, exclude = []): + result = {} + headers = native.glob(glob_patterns, exclude = exclude) + for header in headers: + result[paths.basename(header)] = header + return result + +def glob(include, exclude = (), **kwargs): + buildfile = native.read_config("buildfile", "name", "BUCK") + subpkgs = [ + target[:-len(buildfile)] + "**/*" + for target in native.glob(["*/**/" + buildfile]) + ] + return native.glob(include, exclude = list(exclude) + subpkgs, **kwargs) diff --git a/tools/build_defs/type_defs.bzl b/tools/build_defs/type_defs.bzl new file mode 100644 index 000000000000..afc02702e8ad --- /dev/null +++ b/tools/build_defs/type_defs.bzl @@ -0,0 +1,128 @@ +"""Provides macros for queries type information.""" + +_SELECT_TYPE = type(select({"DEFAULT": []})) + +def is_select(thing): + return type(thing) == _SELECT_TYPE + +def is_unicode(arg): + """Checks if provided instance has a unicode type. + + Args: + arg: An instance to check. type: Any + + Returns: + True for unicode instances, False otherwise. rtype: bool + """ + return hasattr(arg, "encode") + +_STRING_TYPE = type("") + +def is_string(arg): + """Checks if provided instance has a string type. + + Args: + arg: An instance to check. type: Any + + Returns: + True for string instances, False otherwise. rtype: bool + """ + return type(arg) == _STRING_TYPE + +_LIST_TYPE = type([]) + +def is_list(arg): + """Checks if provided instance has a list type. + + Args: + arg: An instance to check. type: Any + + Returns: + True for list instances, False otherwise. rtype: bool + """ + return type(arg) == _LIST_TYPE + +_DICT_TYPE = type({}) + +def is_dict(arg): + """Checks if provided instance has a dict type. + + Args: + arg: An instance to check. 
type: Any + + Returns: + True for dict instances, False otherwise. rtype: bool + """ + return type(arg) == _DICT_TYPE + +_TUPLE_TYPE = type(()) + +def is_tuple(arg): + """Checks if provided instance has a tuple type. + + Args: + arg: An instance to check. type: Any + + Returns: + True for tuple instances, False otherwise. rtype: bool + """ + return type(arg) == _TUPLE_TYPE + +def is_collection(arg): + """Checks if provided instance is a collection subtype. + + This will either be a dict, list, or tuple. + """ + return is_dict(arg) or is_list(arg) or is_tuple(arg) + +_BOOL_TYPE = type(True) + +def is_bool(arg): + """Checks if provided instance is a boolean value. + + Args: + arg: An instance ot check. type: Any + + Returns: + True for boolean values, False otherwise. rtype: bool + """ + return type(arg) == _BOOL_TYPE + +_NUMBER_TYPE = type(1) + +def is_number(arg): + """Checks if provided instance is a number value. + + Args: + arg: An instance ot check. type: Any + + Returns: + True for number values, False otherwise. rtype: bool + """ + return type(arg) == _NUMBER_TYPE + +_STRUCT_TYPE = type(struct()) # Starlark returns the same type for all structs + +def is_struct(arg): + """Checks if provided instance is a struct value. + + Args: + arg: An instance ot check. type: Any + + Returns: + True for struct values, False otherwise. rtype: bool + """ + return type(arg) == _STRUCT_TYPE + +type_utils = struct( + is_bool = is_bool, + is_number = is_number, + is_string = is_string, + is_unicode = is_unicode, + is_list = is_list, + is_dict = is_dict, + is_tuple = is_tuple, + is_collection = is_collection, + is_select = is_select, + is_struct = is_struct, +) diff --git a/tools/build_libtorch.py b/tools/build_libtorch.py index c263e5084f78..c5508773f643 100644 --- a/tools/build_libtorch.py +++ b/tools/build_libtorch.py @@ -11,13 +11,22 @@ from tools.build_pytorch_libs import build_caffe2 from tools.setup_helpers.cmake import CMake -if __name__ == '__main__': +if __name__ == "__main__": # Placeholder for future interface. For now just gives a nice -h. - parser = argparse.ArgumentParser(description='Build libtorch') - parser.add_argument('--rerun-cmake', action="store_true", help='rerun cmake') - parser.add_argument('--cmake-only', action="store_true", - help='Stop once cmake terminates. Leave users a chance to adjust build options') + parser = argparse.ArgumentParser(description="Build libtorch") + parser.add_argument("--rerun-cmake", action="store_true", help="rerun cmake") + parser.add_argument( + "--cmake-only", + action="store_true", + help="Stop once cmake terminates. 
Leave users a chance to adjust build options", + ) options = parser.parse_args() - build_caffe2(version=None, cmake_python_library=None, build_python=False, - rerun_cmake=options.rerun_cmake, cmake_only=options.cmake_only, cmake=CMake()) + build_caffe2( + version=None, + cmake_python_library=None, + build_python=False, + rerun_cmake=options.rerun_cmake, + cmake_only=options.cmake_only, + cmake=CMake(), + ) diff --git a/tools/build_pytorch_libs.py b/tools/build_pytorch_libs.py index d795770c8844..eba8ea1dcf66 100644 --- a/tools/build_pytorch_libs.py +++ b/tools/build_pytorch_libs.py @@ -1,4 +1,5 @@ import os +import platform from glob import glob import shutil from typing import Dict, Optional @@ -8,8 +9,30 @@ from setuptools import distutils # type: ignore[import] + def _overlay_windows_vcvars(env: Dict[str, str]) -> Dict[str, str]: - vc_arch = 'x64' if IS_64BIT else 'x86' + vc_arch = "x64" if IS_64BIT else "x86" + + if platform.machine() == "ARM64": + vc_arch = "x64_arm64" + + # First Win11 Windows on Arm build version that supports x64 emulation + # is 10.0.22000. + win11_1st_version = (10, 0, 22000) + current_win_version = tuple( + int(version_part) for version_part in platform.version().split(".") + ) + if current_win_version < win11_1st_version: + vc_arch = "x86_arm64" + print( + "Warning: 32-bit toolchain will be used, but 64-bit linker " + "is recommended to avoid out-of-memory linker error!" + ) + print( + "Warning: Please consider upgrading to Win11, where x64 " + "emulation is enabled!" + ) + vc_env: Dict[str, str] = distutils._msvccompiler._get_vc_env(vc_arch) # Keys in `_get_vc_env` are always lowercase. # We turn them into uppercase before overlaying vcvars @@ -29,19 +52,21 @@ def _create_build_env() -> Dict[str, str]: # you should NEVER add something to this list. It is bad practice to # have cmake read the environment my_env = os.environ.copy() - if 'CUDA_HOME' in my_env: # Keep CUDA_HOME. This env variable is still used in other part. - my_env['CUDA_BIN_PATH'] = my_env['CUDA_HOME'] + if ( + "CUDA_HOME" in my_env + ): # Keep CUDA_HOME. This env variable is still used in other part. + my_env["CUDA_BIN_PATH"] = my_env["CUDA_HOME"] elif IS_WINDOWS: # we should eventually make this as part of FindCUDA. - cuda_win = glob('C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v*.*') + cuda_win = glob("C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v*.*") if len(cuda_win) > 0: - my_env['CUDA_BIN_PATH'] = cuda_win[0] + my_env["CUDA_BIN_PATH"] = cuda_win[0] if IS_WINDOWS and USE_NINJA: # When using Ninja under Windows, the gcc toolchain will be chosen as # default. But it should be set to MSVC as the user's first choice. 
my_env = _overlay_windows_vcvars(my_env) - my_env.setdefault('CC', 'cl') - my_env.setdefault('CXX', 'cl') + my_env.setdefault("CC", "cl") + my_env.setdefault("CXX", "cl") return my_env @@ -54,18 +79,15 @@ def build_caffe2( cmake: CMake, ) -> None: my_env = _create_build_env() - build_test = not check_negative_env_flag('BUILD_TEST') - cmake.generate(version, - cmake_python_library, - build_python, - build_test, - my_env, - rerun_cmake) + build_test = not check_negative_env_flag("BUILD_TEST") + cmake.generate( + version, cmake_python_library, build_python, build_test, my_env, rerun_cmake + ) if cmake_only: return cmake.build(my_env) if build_python: - caffe2_proto_dir = os.path.join(cmake.build_dir, 'caffe2', 'proto') - for proto_file in glob(os.path.join(caffe2_proto_dir, '*.py')): - if proto_file != os.path.join(caffe2_proto_dir, '__init__.py'): - shutil.copy(proto_file, os.path.join('caffe2', 'proto')) + caffe2_proto_dir = os.path.join(cmake.build_dir, "caffe2", "proto") + for proto_file in glob(os.path.join(caffe2_proto_dir, "*.py")): + if proto_file != os.path.join(caffe2_proto_dir, "__init__.py"): + shutil.copy(proto_file, os.path.join("caffe2", "proto")) diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 21cef9716924..52e8dd25f5a6 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -1,51 +1,41 @@ -# In both open-source and fbcode builds, these are generated into -# torch/csrc/{autgrad,jit}/generated.i -GENERATED_CPP = [ - "autograd/generated/Functions.cpp", - "autograd/generated/VariableType_0.cpp", - "autograd/generated/VariableType_1.cpp", - "autograd/generated/VariableType_2.cpp", - "autograd/generated/VariableType_3.cpp", - "autograd/generated/VariableType_4.cpp", - "autograd/generated/TraceType_0.cpp", - "autograd/generated/TraceType_1.cpp", - "autograd/generated/TraceType_2.cpp", - "autograd/generated/TraceType_3.cpp", - "autograd/generated/TraceType_4.cpp", - "autograd/generated/ADInplaceOrViewType_0.cpp", - "autograd/generated/ADInplaceOrViewType_1.cpp", - "autograd/generated/python_functions_0.cpp", - "autograd/generated/python_functions_1.cpp", - "autograd/generated/python_functions_2.cpp", - "autograd/generated/python_functions_3.cpp", - "autograd/generated/python_functions_4.cpp", - "autograd/generated/python_nn_functions.cpp", - "autograd/generated/python_fft_functions.cpp", - "autograd/generated/python_linalg_functions.cpp", - "autograd/generated/python_return_types.cpp", - "autograd/generated/python_sparse_functions.cpp", - "autograd/generated/python_special_functions.cpp", - "autograd/generated/python_torch_functions_0.cpp", - "autograd/generated/python_torch_functions_1.cpp", - "autograd/generated/python_torch_functions_2.cpp", - "autograd/generated/python_variable_methods.cpp", +# WARNING: the contents of this file must BOTH be valid Starlark (for Buck and + +# Bazel) as well as valid Python (for our cmake build). This means that +# load() directives are not allowed (as they are not recognized by Python). +# If you want to fix this, figure out how run this file from cmake with a proper +# Starlark interpreter as part of the default OSS build process. 
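The warning comment above (continued below) is the key constraint on build_variables.bzl: it must parse both as Starlark and as plain Python. One way the Python/CMake side can then consume it is a bare exec(); this is only a sketch of why the constraint matters, not necessarily the exact mechanism PyTorch's build uses, and it assumes the working directory is the repository root.

# Load the source lists without a Starlark interpreter. This only works
# because the file avoids load() and other Bazel-only top-level constructs.
scope = {}
with open("tools/build_variables.bzl") as f:
    exec(f.read(), scope)
print(len(scope["libtorch_core_sources"]))  # number of core libtorch sources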
If you need +# some nontrivial Starlark features, make a separate bzl file (remember that + +# bzl files are not exported via ShipIt by default, so you may also need to +# update PyTorch's ShipIt config) + +# This is duplicated in caffe2/CMakeLists.txt for now and not yet used in buck +GENERATED_LAZY_TS_CPP = [ + "lazy/generated/LazyNativeFunctions.cpp", + "lazy/generated/RegisterAutogradLazy.cpp", + "lazy/generated/RegisterLazy.cpp", ] # NVFuser runtime library libtorch_nvfuser_runtime_sources = [ + "torch/csrc/jit/codegen/cuda/runtime/array.cu", "torch/csrc/jit/codegen/cuda/runtime/bf16_support.cu", "torch/csrc/jit/codegen/cuda/runtime/block_reduction.cu", "torch/csrc/jit/codegen/cuda/runtime/block_sync_atomic.cu", "torch/csrc/jit/codegen/cuda/runtime/block_sync_default.cu", "torch/csrc/jit/codegen/cuda/runtime/broadcast.cu", "torch/csrc/jit/codegen/cuda/runtime/fp16_support.cu", + "torch/csrc/jit/codegen/cuda/runtime/fused_reduction.cu", "torch/csrc/jit/codegen/cuda/runtime/grid_broadcast.cu", "torch/csrc/jit/codegen/cuda/runtime/grid_reduction.cu", "torch/csrc/jit/codegen/cuda/runtime/grid_sync.cu", "torch/csrc/jit/codegen/cuda/runtime/helpers.cu", "torch/csrc/jit/codegen/cuda/runtime/index_utils.cu", + "torch/csrc/jit/codegen/cuda/runtime/tensorcore.cu", "torch/csrc/jit/codegen/cuda/runtime/random_numbers.cu", "torch/csrc/jit/codegen/cuda/runtime/tensor.cu", + "torch/csrc/jit/codegen/cuda/runtime/tuple.cu", + "torch/csrc/jit/codegen/cuda/runtime/type_traits.cu", "torch/csrc/jit/codegen/cuda/runtime/welford.cu", "torch/csrc/jit/codegen/cuda/runtime/warp.cu", "aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh", @@ -56,19 +46,19 @@ libtorch_nvfuser_generated_headers = ["{}.h".format(name.split("/")[-1].split(". def libtorch_generated_sources(gencode_pattern): return [gencode_pattern.format(name) for name in [ - "autograd/generated/Functions.cpp", - "autograd/generated/VariableType_0.cpp", - "autograd/generated/VariableType_1.cpp", - "autograd/generated/VariableType_2.cpp", - "autograd/generated/VariableType_3.cpp", - "autograd/generated/VariableType_4.cpp", - "autograd/generated/TraceType_0.cpp", - "autograd/generated/TraceType_1.cpp", - "autograd/generated/TraceType_2.cpp", - "autograd/generated/TraceType_3.cpp", - "autograd/generated/TraceType_4.cpp", - "autograd/generated/ADInplaceOrViewType_0.cpp", - "autograd/generated/ADInplaceOrViewType_1.cpp", + "torch/csrc/autograd/generated/Functions.cpp", + "torch/csrc/autograd/generated/VariableType_0.cpp", + "torch/csrc/autograd/generated/VariableType_1.cpp", + "torch/csrc/autograd/generated/VariableType_2.cpp", + "torch/csrc/autograd/generated/VariableType_3.cpp", + "torch/csrc/autograd/generated/VariableType_4.cpp", + "torch/csrc/autograd/generated/TraceType_0.cpp", + "torch/csrc/autograd/generated/TraceType_1.cpp", + "torch/csrc/autograd/generated/TraceType_2.cpp", + "torch/csrc/autograd/generated/TraceType_3.cpp", + "torch/csrc/autograd/generated/TraceType_4.cpp", + "torch/csrc/autograd/generated/ADInplaceOrViewType_0.cpp", + "torch/csrc/autograd/generated/ADInplaceOrViewType_1.cpp", ]] # copied from https://github.com/pytorch/pytorch/blob/f99a693cd9ff7a9b5fdc71357dac66b8192786d3/aten/src/ATen/core/CMakeLists.txt @@ -137,8 +127,10 @@ libtorch_profiler_sources = [ "torch/csrc/autograd/profiler_legacy.cpp", "torch/csrc/autograd/profiler_kineto.cpp", "torch/csrc/profiler/api.cpp", + "torch/csrc/profiler/collection.cpp", "torch/csrc/profiler/kineto_shim.cpp", "torch/csrc/profiler/nvtx_observer.cpp", + 
"torch/csrc/profiler/kineto_client_interface.cpp", "torch/csrc/monitor/counters.cpp", "torch/csrc/monitor/events.cpp", ] @@ -213,20 +205,22 @@ core_sources_full_mobile_no_backend_interface = [ "torch/csrc/jit/operator_upgraders/utils.cpp", "torch/csrc/jit/operator_upgraders/upgraders.cpp", "torch/csrc/jit/operator_upgraders/upgraders_entry.cpp", + "torch/csrc/jit/passes/add_if_then_else.cpp", "torch/csrc/jit/passes/annotate_warns.cpp", "torch/csrc/jit/passes/bailout_graph.cpp", + "torch/csrc/jit/passes/check_strict_fusion.cpp", "torch/csrc/jit/passes/batch_mm.cpp", "torch/csrc/jit/passes/canonicalize.cpp", "torch/csrc/jit/passes/canonicalize_graph_fuser_ops.cpp", "torch/csrc/jit/passes/clear_profiling.cpp", "torch/csrc/jit/passes/clear_undefinedness.cpp", "torch/csrc/jit/passes/common_subexpression_elimination.cpp", - "torch/csrc/jit/passes/common_expression_hoisting.cpp", "torch/csrc/jit/passes/concat_opt.cpp", "torch/csrc/jit/passes/constant_pooling.cpp", "torch/csrc/jit/passes/constant_propagation.cpp", "torch/csrc/jit/passes/restore_mutation.cpp", "torch/csrc/jit/passes/create_autodiff_subgraphs.cpp", + "torch/csrc/jit/passes/cuda_graph_fuser.cpp", "torch/csrc/jit/passes/dead_code_elimination.cpp", "torch/csrc/jit/passes/eliminate_no_ops.cpp", "torch/csrc/jit/passes/remove_redundant_profiles.cpp", @@ -263,9 +257,11 @@ core_sources_full_mobile_no_backend_interface = [ "torch/csrc/jit/passes/peephole.cpp", "torch/csrc/jit/passes/peephole_non_tensor.cpp", "torch/csrc/jit/passes/create_functional_graphs.cpp", + "torch/csrc/jit/passes/refine_tuple_types.cpp", "torch/csrc/jit/passes/remove_mutation.cpp", "torch/csrc/jit/passes/prepack_folding.cpp", "torch/csrc/jit/passes/fold_conv_bn.cpp", + "torch/csrc/jit/passes/dbr_quantization/remove_redundant_aliases.cpp", "torch/csrc/jit/passes/frozen_concat_linear.cpp", "torch/csrc/jit/passes/frozen_conv_add_relu_fusion.cpp", "torch/csrc/jit/passes/frozen_conv_folding.cpp", @@ -279,6 +275,7 @@ core_sources_full_mobile_no_backend_interface = [ "torch/csrc/jit/passes/integer_value_refinement.cpp", "torch/csrc/jit/passes/replacement_of_old_operators.cpp", "torch/csrc/jit/passes/symbolic_shape_analysis.cpp", + "torch/csrc/jit/passes/symbolic_shape_cache.cpp", "torch/csrc/jit/passes/symbolic_shape_runtime_fusion.cpp", "torch/csrc/jit/passes/specialize_autogradzero.cpp", "torch/csrc/jit/passes/update_differentiable_graph_requires_grad.cpp", @@ -307,11 +304,15 @@ core_sources_full_mobile_no_backend_interface = [ "torch/csrc/jit/runtime/interpreter/preprocess_graph.cpp", "torch/csrc/jit/runtime/interpreter.cpp", "torch/csrc/jit/runtime/logging.cpp", + "torch/csrc/jit/runtime/simple_graph_executor_impl.cpp", "torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp", "torch/csrc/jit/runtime/profiling_record.cpp", "torch/csrc/jit/runtime/script_profile.cpp", "torch/csrc/jit/runtime/symbolic_script.cpp", "torch/csrc/jit/runtime/symbolic_shape_registry.cpp", + "torch/csrc/jit/runtime/decomposition_registry.cpp", + "torch/csrc/jit/runtime/decomposition_registry_util.cpp", + "torch/csrc/jit/runtime/serialized_shape_function_registry.cpp", "torch/csrc/jit/runtime/symbolic_shape_registry_util.cpp", "torch/csrc/jit/runtime/jit_trace.cpp", "torch/csrc/jit/serialization/callstack_debug_info_serialization.cpp", @@ -328,6 +329,7 @@ core_sources_full_mobile_no_backend_interface = [ "torch/csrc/jit/tensorexpr/cpp_codegen.cpp", "torch/csrc/jit/tensorexpr/eval.cpp", "torch/csrc/jit/tensorexpr/expr.cpp", + "torch/csrc/jit/tensorexpr/external_functions_core.cpp", 
"torch/csrc/jit/tensorexpr/external_functions_registry.cpp", "torch/csrc/jit/tensorexpr/graph_opt.cpp", "torch/csrc/jit/tensorexpr/hash_provider.cpp", @@ -361,6 +363,7 @@ core_sources_full_mobile_no_backend_interface = [ "torch/csrc/jit/tensorexpr/unique_name_manager.cpp", "torch/csrc/jit/testing/file_check.cpp", "torch/csrc/jit/testing/hooks_for_testing.cpp", + "torch/csrc/utils/cpp_stacktraces.cpp", "torch/csrc/utils/tensor_flatten.cpp", "torch/csrc/utils/variadic.cpp", ] @@ -372,6 +375,7 @@ core_sources_full_mobile = core_sources_full_mobile_no_backend_interface + [ core_sources_full = core_sources_full_mobile + [ "torch/csrc/jit/runtime/static/fusion.cpp", + "torch/csrc/jit/runtime/static/generated_ops.cpp", "torch/csrc/jit/runtime/static/impl.cpp", "torch/csrc/jit/runtime/static/memory_planner.cpp", "torch/csrc/jit/runtime/static/native_ops.cpp", @@ -387,6 +391,7 @@ lazy_tensor_core_sources = [ "torch/csrc/lazy/backend/backend_interface.cpp", "torch/csrc/lazy/backend/lowering_context.cpp", "torch/csrc/lazy/core/config.cpp", + "torch/csrc/lazy/core/debug_util.cpp", "torch/csrc/lazy/core/hash.cpp", "torch/csrc/lazy/core/helpers.cpp", "torch/csrc/lazy/core/ir.cpp", @@ -397,33 +402,56 @@ lazy_tensor_core_sources = [ "torch/csrc/lazy/core/lazy_view.cpp", "torch/csrc/lazy/core/metrics.cpp", "torch/csrc/lazy/core/multi_wait.cpp", + "torch/csrc/lazy/core/ops/arithmetic_ir_ops.cpp", + "torch/csrc/lazy/core/ops/utils.cpp", "torch/csrc/lazy/core/permutation_util.cpp", "torch/csrc/lazy/core/shape.cpp", + "torch/csrc/lazy/core/shape_inference.cpp", "torch/csrc/lazy/core/tensor.cpp", "torch/csrc/lazy/core/tensor_impl.cpp", "torch/csrc/lazy/core/tensor_util.cpp", "torch/csrc/lazy/core/thread_pool.cpp", - "torch/csrc/lazy/core/view_ops/as_strided.cpp", - "torch/csrc/lazy/core/view_ops/as_strided_view_update.cpp", - "torch/csrc/lazy/core/view_ops/diagonal.cpp", - "torch/csrc/lazy/core/view_ops/diagonal_view_update.cpp", - "torch/csrc/lazy/core/view_ops/narrow.cpp", - "torch/csrc/lazy/core/view_ops/narrow_view_update.cpp", - "torch/csrc/lazy/core/view_ops/permute.cpp", - "torch/csrc/lazy/core/view_ops/resize.cpp", - "torch/csrc/lazy/core/view_ops/select.cpp", - "torch/csrc/lazy/core/view_ops/squeeze.cpp", - "torch/csrc/lazy/core/view_ops/unsqueeze.cpp", - "torch/csrc/lazy/core/view_ops/select_view_update.cpp", - "torch/csrc/lazy/core/view_ops/view.cpp", + "torch/csrc/lazy/core/trie.cpp", +] + +# We can't build all of the ts backend under certain build configurations, e.g. 
mobile, +# since it depends on things like autograd, meta functions, which may be disabled +lazy_tensor_ts_sources = [ "torch/csrc/lazy/ts_backend/config.cpp", - "torch/csrc/lazy/ts_backend/ops/arithmetic_ir_ops.cpp", + "torch/csrc/lazy/ts_backend/dynamic_ir.cpp", + "torch/csrc/lazy/ts_backend/ops/batch_norm_ops.cpp", + "torch/csrc/lazy/ts_backend/ops/random_ops.cpp", "torch/csrc/lazy/ts_backend/ops/cast.cpp", "torch/csrc/lazy/ts_backend/ops/device_data.cpp", "torch/csrc/lazy/ts_backend/ops/expand.cpp", "torch/csrc/lazy/ts_backend/ops/generic.cpp", "torch/csrc/lazy/ts_backend/ops/scalar.cpp", + "torch/csrc/lazy/ts_backend/view_ops/as_strided.cpp", + "torch/csrc/lazy/ts_backend/view_ops/as_strided_view_update.cpp", + "torch/csrc/lazy/ts_backend/view_ops/diagonal.cpp", + "torch/csrc/lazy/ts_backend/view_ops/diagonal_view_update.cpp", + "torch/csrc/lazy/ts_backend/view_ops/narrow.cpp", + "torch/csrc/lazy/ts_backend/view_ops/narrow_view_update.cpp", + "torch/csrc/lazy/ts_backend/view_ops/permute.cpp", + "torch/csrc/lazy/ts_backend/view_ops/resize.cpp", + "torch/csrc/lazy/ts_backend/view_ops/select.cpp", + "torch/csrc/lazy/ts_backend/view_ops/squeeze.cpp", + "torch/csrc/lazy/ts_backend/view_ops/unsqueeze.cpp", + "torch/csrc/lazy/ts_backend/view_ops/select_view_update.cpp", + "torch/csrc/lazy/ts_backend/view_ops/view.cpp", "torch/csrc/lazy/ts_backend/ts_node.cpp", + "torch/csrc/lazy/ts_backend/tensor_aten_ops.cpp", + "torch/csrc/lazy/ts_backend/ts_autograd_functions.cpp", + "torch/csrc/lazy/ts_backend/ts_backend_impl.cpp", + "torch/csrc/lazy/ts_backend/ts_eager_fallback.cpp", + "torch/csrc/lazy/ts_backend/ts_lowering_context.cpp", + "torch/csrc/lazy/ts_backend/ts_native_functions.cpp", + "torch/csrc/lazy/ts_backend/ts_node_lowering.cpp", +] + +lazy_tensor_core_python_sources = [ + "torch/csrc/lazy/python/init.cpp", + "torch/csrc/lazy/python/python_util.cpp", ] libtorch_core_sources = sorted( @@ -449,9 +477,12 @@ libtorch_distributed_base_sources = [ "torch/csrc/distributed/c10d/TCPStore.cpp", "torch/csrc/distributed/c10d/Utils.cpp", "torch/csrc/distributed/c10d/comm.cpp", + "torch/csrc/distributed/c10d/debug.cpp", "torch/csrc/distributed/c10d/default_comm_hooks.cpp", "torch/csrc/distributed/c10d/exception.cpp", "torch/csrc/distributed/c10d/logger.cpp", + "torch/csrc/distributed/c10d/logging.cpp", + "torch/csrc/distributed/c10d/quantization/quantization.cpp", "torch/csrc/distributed/c10d/reducer.cpp", "torch/csrc/distributed/c10d/sequence_num.cpp", "torch/csrc/distributed/c10d/socket.cpp", @@ -604,7 +635,18 @@ libtorch_extra_sources = libtorch_core_jit_sources + [ ] def libtorch_sources(gencode_pattern = ":generate-code[{}]"): - return libtorch_generated_sources(gencode_pattern) + libtorch_core_sources + libtorch_distributed_sources + libtorch_extra_sources + enable_flatbuffer = bool(native.read_config("fbcode", "caffe2_enable_flatbuffer", None)) + flatbuffer_serializer_sources = [ + "torch/csrc/jit/serialization/flatbuffer_serializer.cpp", + "torch/csrc/jit/serialization/flatbuffer_serializer_jit.cpp", + ] + if enable_flatbuffer: + return ( + libtorch_generated_sources(gencode_pattern) + libtorch_core_sources + libtorch_distributed_sources + libtorch_extra_sources + + flatbuffer_serializer_sources + ) + else: + return libtorch_generated_sources(gencode_pattern) + libtorch_core_sources + libtorch_distributed_sources + libtorch_extra_sources libtorch_cuda_core_sources = [ "torch/csrc/CudaIPCTypes.cpp", @@ -616,6 +658,7 @@ libtorch_cuda_core_sources = [ "torch/csrc/jit/codegen/cuda/compute_at.cpp", 
"torch/csrc/jit/codegen/cuda/compute_at_map.cpp", "torch/csrc/jit/codegen/cuda/codegen.cpp", + "torch/csrc/jit/codegen/cuda/contiguity.cpp", "torch/csrc/jit/codegen/cuda/dispatch.cpp", "torch/csrc/jit/codegen/cuda/expr_evaluator.cpp", "torch/csrc/jit/codegen/cuda/executor.cpp", @@ -625,11 +668,14 @@ libtorch_cuda_core_sources = [ "torch/csrc/jit/codegen/cuda/executor_utils.cpp", "torch/csrc/jit/codegen/cuda/fusion.cpp", "torch/csrc/jit/codegen/cuda/graph_fuser.cpp", + "torch/csrc/jit/codegen/cuda/grouped_reduction.cpp", "torch/csrc/jit/codegen/cuda/index_compute.cpp", "torch/csrc/jit/codegen/cuda/index_reference_replay.cpp", "torch/csrc/jit/codegen/cuda/instrumentation.cpp", "torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp", + "torch/csrc/jit/codegen/cuda/ir_builder.cpp", "torch/csrc/jit/codegen/cuda/ir_cloner.cpp", + "torch/csrc/jit/codegen/cuda/ir_container.cpp", "torch/csrc/jit/codegen/cuda/ir_graphviz.cpp", "torch/csrc/jit/codegen/cuda/ir_nodes.cpp", "torch/csrc/jit/codegen/cuda/ir_iostream.cpp", @@ -639,28 +685,36 @@ libtorch_cuda_core_sources = [ "torch/csrc/jit/codegen/cuda/kernel_cache.cpp", "torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.cpp", "torch/csrc/jit/codegen/cuda/kernel_ir.cpp", - "torch/csrc/jit/codegen/cuda/kernel_ir_builder.cpp", - "torch/csrc/jit/codegen/cuda/kernel_ir_printer.cpp", + "torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.cpp", "torch/csrc/jit/codegen/cuda/lower_alias_memory.cpp", - "torch/csrc/jit/codegen/cuda/lower_warp_reduce.cpp", "torch/csrc/jit/codegen/cuda/lower_allocation.cpp", + "torch/csrc/jit/codegen/cuda/lower_double_buffer.cpp", "torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp", + "torch/csrc/jit/codegen/cuda/lower_fused_reduction.cpp", + "torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.cpp", "torch/csrc/jit/codegen/cuda/lower_index.cpp", + "torch/csrc/jit/codegen/cuda/lower_index_hoist.cpp", "torch/csrc/jit/codegen/cuda/lower_insert_syncs.cpp", "torch/csrc/jit/codegen/cuda/lower_loops.cpp", "torch/csrc/jit/codegen/cuda/lower_magic_zero.cpp", "torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.cpp", "torch/csrc/jit/codegen/cuda/lower_predicate.cpp", + "torch/csrc/jit/codegen/cuda/lower_predicate_elimination.cpp", + "torch/csrc/jit/codegen/cuda/lower_replace_size.cpp", "torch/csrc/jit/codegen/cuda/lower_shift.cpp", + "torch/csrc/jit/codegen/cuda/lower_sync_information.cpp", "torch/csrc/jit/codegen/cuda/lower_thread_predicate.cpp", + "torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.cpp", "torch/csrc/jit/codegen/cuda/lower_trivial_reductions.cpp", "torch/csrc/jit/codegen/cuda/lower_unroll.cpp", "torch/csrc/jit/codegen/cuda/lower_utils.cpp", "torch/csrc/jit/codegen/cuda/lower_validation.cpp", + "torch/csrc/jit/codegen/cuda/lower_warp_reduce.cpp", "torch/csrc/jit/codegen/cuda/lower2device.cpp", "torch/csrc/jit/codegen/cuda/manager.cpp", "torch/csrc/jit/codegen/cuda/mutator.cpp", "torch/csrc/jit/codegen/cuda/non_divisible_split.cpp", + "torch/csrc/jit/codegen/cuda/ops/alias.cpp", "torch/csrc/jit/codegen/cuda/ops/composite.cpp", "torch/csrc/jit/codegen/cuda/ops/normalization.cpp", "torch/csrc/jit/codegen/cuda/parallel_dimension_map.cpp", @@ -687,6 +741,8 @@ libtorch_cuda_core_sources = [ "torch/csrc/jit/codegen/cuda/transform_view.cpp", "torch/csrc/jit/codegen/cuda/type.cpp", "torch/csrc/jit/codegen/cuda/utils.cpp", + "torch/csrc/jit/codegen/cuda/mma_type.cpp", + "torch/csrc/jit/codegen/cuda/scheduler/mma_utils.cpp", "torch/csrc/jit/passes/frozen_conv_add_relu_fusion_cuda.cpp", "torch/csrc/jit/tensorexpr/cuda_codegen.cpp", 
"torch/csrc/jit/runtime/register_cuda_ops.cpp", @@ -770,7 +826,6 @@ torch_cpp_srcs = [ "torch/csrc/api/src/optim/schedulers/step_lr.cpp", "torch/csrc/api/src/serialize/input-archive.cpp", "torch/csrc/api/src/serialize/output-archive.cpp", - "torch/csrc/utils/crash_handler.cpp", ] libtorch_python_cuda_core_sources = [ @@ -814,7 +869,6 @@ libtorch_python_core_sources = [ "torch/csrc/autograd/profiler_python.cpp", "torch/csrc/autograd/python_anomaly_mode.cpp", "torch/csrc/autograd/python_saved_variable_hooks.cpp", - "torch/csrc/autograd/python_mode.cpp", "torch/csrc/autograd/python_cpp_function.cpp", "torch/csrc/autograd/python_engine.cpp", "torch/csrc/autograd/python_function.cpp", @@ -824,9 +878,11 @@ libtorch_python_core_sources = [ "torch/csrc/autograd/python_variable.cpp", "torch/csrc/autograd/python_variable_indexing.cpp", "torch/csrc/jit/backends/backend_init.cpp", + "torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp", "torch/csrc/jit/python/init.cpp", "torch/csrc/jit/passes/onnx.cpp", "torch/csrc/jit/passes/onnx/cast_all_constant_to_floating.cpp", + "torch/csrc/jit/passes/onnx/deduplicate_initializers.cpp", "torch/csrc/jit/passes/onnx/eval_peephole.cpp", "torch/csrc/jit/passes/onnx/constant_fold.cpp", "torch/csrc/jit/passes/onnx/constant_map.cpp", @@ -843,6 +899,7 @@ libtorch_python_core_sources = [ "torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp", "torch/csrc/jit/passes/onnx/shape_type_inference.cpp", "torch/csrc/jit/passes/onnx/function_extraction.cpp", + "torch/csrc/jit/passes/onnx/onnx_log.cpp", "torch/csrc/jit/python/pybind_utils.cpp", "torch/csrc/jit/passes/onnx/pattern_conversion/common.cpp", "torch/csrc/jit/passes/onnx/pattern_conversion/pattern_encapsulation.cpp", @@ -885,12 +942,11 @@ libtorch_python_core_sources = [ "torch/csrc/utils/tensor_numpy.cpp", "torch/csrc/utils/tensor_types.cpp", "torch/csrc/utils/disable_torch_function.cpp", -] +] + lazy_tensor_core_python_sources libtorch_python_distributed_core_sources = [ "torch/csrc/distributed/c10d/init.cpp", "torch/csrc/distributed/c10d/python_comm_hook.cpp", - "torch/csrc/distributed/c10d/quantization/quantization.cpp", ] libtorch_python_distributed_sources = libtorch_python_distributed_core_sources + [ @@ -908,21 +964,21 @@ libtorch_python_distributed_sources = libtorch_python_distributed_core_sources + def glob_libtorch_python_sources(gencode_pattern = ":generate-code[{}]"): _libtorch_python_sources = [gencode_pattern.format(name) for name in [ - "autograd/generated/python_functions_0.cpp", - "autograd/generated/python_functions_1.cpp", - "autograd/generated/python_functions_2.cpp", - "autograd/generated/python_functions_3.cpp", - "autograd/generated/python_functions_4.cpp", - "autograd/generated/python_nn_functions.cpp", - "autograd/generated/python_fft_functions.cpp", - "autograd/generated/python_linalg_functions.cpp", - "autograd/generated/python_return_types.cpp", - "autograd/generated/python_sparse_functions.cpp", - "autograd/generated/python_special_functions.cpp", - "autograd/generated/python_torch_functions_0.cpp", - "autograd/generated/python_torch_functions_1.cpp", - "autograd/generated/python_torch_functions_2.cpp", - "autograd/generated/python_variable_methods.cpp", + "torch/csrc/autograd/generated/python_functions_0.cpp", + "torch/csrc/autograd/generated/python_functions_1.cpp", + "torch/csrc/autograd/generated/python_functions_2.cpp", + "torch/csrc/autograd/generated/python_functions_3.cpp", + "torch/csrc/autograd/generated/python_functions_4.cpp", + 
"torch/csrc/autograd/generated/python_nn_functions.cpp", + "torch/csrc/autograd/generated/python_fft_functions.cpp", + "torch/csrc/autograd/generated/python_linalg_functions.cpp", + "torch/csrc/autograd/generated/python_return_types.cpp", + "torch/csrc/autograd/generated/python_sparse_functions.cpp", + "torch/csrc/autograd/generated/python_special_functions.cpp", + "torch/csrc/autograd/generated/python_torch_functions_0.cpp", + "torch/csrc/autograd/generated/python_torch_functions_1.cpp", + "torch/csrc/autograd/generated/python_torch_functions_2.cpp", + "torch/csrc/autograd/generated/python_variable_methods.cpp", ]] _libtorch_python_sources.extend(libtorch_python_core_sources) @@ -945,11 +1001,13 @@ aten_cpu_source_non_codegen_list = [ "aten/src/ATen/MemoryOverlap.cpp", "aten/src/ATen/MapAllocator.cpp", "aten/src/ATen/NamedTensorUtils.cpp", + "aten/src/ATen/NestedTensorImpl.cpp", "aten/src/ATen/ParallelCommon.cpp", "aten/src/ATen/ParallelNative.cpp", "aten/src/ATen/ParallelNativeTBB.cpp", "aten/src/ATen/ParallelOpenMP.cpp", "aten/src/ATen/ParallelThreadPoolNative.cpp", + "aten/src/ATen/PythonTorchFunctionTLS.cpp", "aten/src/ATen/ScalarOps.cpp", "aten/src/ATen/SequenceNumber.cpp", "aten/src/ATen/SparseTensorImpl.cpp", @@ -991,7 +1049,7 @@ aten_cpu_source_non_codegen_list = [ "aten/src/ATen/core/op_registration/infer_schema.cpp", "aten/src/ATen/core/op_registration/op_registration.cpp", "aten/src/ATen/core/operator_name.cpp", - "aten/src/ATen/core/PythonModeTLS.cpp", + "aten/src/ATen/core/TorchDispatchModeTLS.cpp", "aten/src/ATen/core/register_symbols.cpp", "aten/src/ATen/core/class_type.cpp", "aten/src/ATen/core/type.cpp", @@ -1006,7 +1064,6 @@ aten_cpu_source_non_codegen_list = [ "aten/src/ATen/detail/ORTHooksInterface.cpp", "aten/src/ATen/metal/Context.cpp", "aten/src/ATen/native/AutogradComposite.cpp", - "aten/src/ATen/native/BatchLinearAlgebraKernel.cpp", "aten/src/ATen/native/DispatchStub.cpp", "aten/src/ATen/native/UpSample.cpp", "aten/src/ATen/native/mkl/LinearAlgebra.cpp", @@ -1024,6 +1081,7 @@ aten_cpu_source_non_codegen_list = [ "aten/src/ATen/native/mkldnn/MkldnnTensorMath.cpp", "aten/src/ATen/native/mkldnn/Normalization.cpp", "aten/src/ATen/native/mkldnn/Pooling.cpp", + "aten/src/ATen/native/mkldnn/Prelu.cpp", "aten/src/ATen/native/mkldnn/Relu.cpp", "aten/src/ATen/native/mkldnn/SoftMax.cpp", "aten/src/ATen/native/mkldnn/TensorFactories.cpp", @@ -1047,6 +1105,10 @@ aten_cpu_source_codegen_list = [ "aten/src/ATen/native/cpu/AdaptiveMaxPoolKernel.cpp", ] +aten_ufunc_headers = [ + "aten/src/ATen/native/ufunc/add.h", +] + # When building lite interpreter in OSS, "aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp" will go through # codegen process. The codegen version of this file, like Activation.cpp.DEFAULT.cpp, will be included # in ${cpu_kernel_cpp} in aten/src/ATen/CMakeLists.txt. 
As a result, in aten/src/ATen/CMakeLists.txt, @@ -1079,6 +1141,7 @@ aten_native_source_codegen_list = [ "aten/src/ATen/native/cpu/MaxPoolKernel.cpp", "aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp", "aten/src/ATen/native/cpu/MultinomialKernel.cpp", + "aten/src/ATen/native/cpu/PixelShuffleKernel.cpp", "aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp", "aten/src/ATen/native/cpu/PowKernel.cpp", "aten/src/ATen/native/cpu/RangeFactoriesKernel.cpp", @@ -1099,6 +1162,7 @@ aten_native_source_codegen_list = [ "aten/src/ATen/native/cpu/batch_norm_kernel.cpp", "aten/src/ATen/native/cpu/group_norm_kernel.cpp", "aten/src/ATen/native/cpu/layer_norm_kernel.cpp", + "aten/src/ATen/native/cpu/WeightNormKernel.cpp", "aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp", ] @@ -1124,7 +1188,7 @@ aten_native_source_non_codegen_list = [ "aten/src/ATen/native/quantized/cpu/qconcat.cpp", "aten/src/ATen/native/quantized/cpu/qconv.cpp", "aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp", - "aten/src/ATen/native/quantized/cpu/qconv_unpack.cpp", + "aten/src/ATen/native/quantized/cpu/qconv_unpack_impl.cpp", "aten/src/ATen/native/quantized/cpu/qelu.cpp", "aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp", "aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp", @@ -1136,7 +1200,7 @@ aten_native_source_non_codegen_list = [ "aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp", "aten/src/ATen/native/quantized/cpu/qconv_dynamic.cpp", "aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp", - "aten/src/ATen/native/quantized/cpu/qlinear_unpack.cpp", + "aten/src/ATen/native/quantized/cpu/qlinear_unpack_impl.cpp", "aten/src/ATen/native/quantized/cpu/qmatmul.cpp", "aten/src/ATen/native/quantized/cpu/qmul.cpp", "aten/src/ATen/native/quantized/cpu/qnormalization.cpp", @@ -1144,6 +1208,7 @@ aten_native_source_non_codegen_list = [ "aten/src/ATen/native/quantized/cpu/qreduction.cpp", "aten/src/ATen/native/quantized/cpu/qrelu.cpp", "aten/src/ATen/native/quantized/cpu/qsigmoid.cpp", + "aten/src/ATen/native/quantized/cpu/qsoftmax.cpp", "aten/src/ATen/native/quantized/cpu/qsort.cpp", "aten/src/ATen/native/quantized/cpu/qtanh.cpp", "aten/src/ATen/native/quantized/cpu/qthreshold.cpp", @@ -1160,9 +1225,11 @@ aten_native_source_non_codegen_list = [ "aten/src/ATen/native/quantized/fake_quant_per_channel_affine.cpp", "aten/src/ATen/native/quantized/fake_quant_per_tensor_affine.cpp", "aten/src/ATen/native/quantized/library.cpp", + "aten/src/ATen/native/quantized/cpu/ruy_utils.cpp", + "aten/src/ATen/native/quantized/cpu/xnnpack_utils.cpp", + "aten/src/ATen/native/quantized/qlinear_unpack.cpp", "aten/src/ATen/quantized/QTensorImpl.cpp", "aten/src/ATen/quantized/Quantizer.cpp", - "aten/src/ATen/native/attention.cpp", "aten/src/ATen/native/Activation.cpp", "aten/src/ATen/native/AdaptiveAveragePooling.cpp", "aten/src/ATen/native/AdaptiveAveragePooling3d.cpp", @@ -1172,6 +1239,7 @@ aten_native_source_non_codegen_list = [ "aten/src/ATen/native/AveragePool2d.cpp", "aten/src/ATen/native/AveragePool3d.cpp", "aten/src/ATen/native/BatchLinearAlgebra.cpp", + "aten/src/ATen/native/BatchLinearAlgebraKernel.cpp", "aten/src/ATen/native/Batching.cpp", "aten/src/ATen/native/BinaryOps.cpp", "aten/src/ATen/native/Blas.cpp", @@ -1180,7 +1248,7 @@ aten_native_source_non_codegen_list = [ "aten/src/ATen/native/CPUBlas.cpp", "aten/src/ATen/native/ChanelShuffle.cpp", "aten/src/ATen/native/Col2Im.cpp", - "aten/src/ATen/native/ConstantPadNd.cpp", + "aten/src/ATen/native/PadNd.cpp", "aten/src/ATen/native/Convolution.cpp", 
"aten/src/ATen/native/ConvolutionMM2d.cpp", "aten/src/ATen/native/ConvolutionMM3d.cpp", @@ -1284,6 +1352,8 @@ aten_native_source_non_codegen_list = [ "aten/src/ATen/native/WeightNorm.cpp", "aten/src/ATen/native/group_norm.cpp", "aten/src/ATen/native/layer_norm.cpp", + "aten/src/ATen/native/nested/NestedTensorMath.cpp", + "aten/src/ATen/native/nested/NestedTensorTransformerFunctions.cpp", "aten/src/ATen/native/sparse/ParamUtils.cpp", "aten/src/ATen/native/sparse/SoftMax.cpp", "aten/src/ATen/native/sparse/SparseBlas.cpp", @@ -1294,6 +1364,8 @@ aten_native_source_non_codegen_list = [ "aten/src/ATen/native/sparse/SparseTensorMath.cpp", "aten/src/ATen/native/sparse/SparseUnaryOps.cpp", "aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp", + "aten/src/ATen/native/transformers/attention.cpp", + "aten/src/ATen/native/transformers/transformer.cpp", "aten/src/ATen/native/utils/Factory.cpp", "aten/src/ATen/native/xnnpack/Activation.cpp", "aten/src/ATen/native/xnnpack/ChannelShuffle.cpp", @@ -1323,8 +1395,11 @@ aten_cuda_cu_source_list = [ "aten/src/ATen/cuda/CUDASparseBlas.cpp", "aten/src/ATen/cuda/CublasHandlePool.cpp", "aten/src/ATen/native/cuda/Activation.cpp", + "aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp", "aten/src/ATen/native/cuda/Blas.cpp", + "aten/src/ATen/native/cuda/Distributions.cpp", "aten/src/ATen/native/cuda/Equal.cpp", + "aten/src/ATen/native/cuda/GridSampler.cpp", "aten/src/ATen/native/cuda/IndexKernel.cpp", "aten/src/ATen/native/cuda/ReduceOps.cpp", "aten/src/ATen/native/cuda/ScanKernels.cpp", @@ -1334,6 +1409,7 @@ aten_cuda_cu_source_list = [ "aten/src/ATen/native/cuda/TensorShapeCUDA.cpp", "aten/src/ATen/native/cuda/TensorTopK.cpp", "aten/src/ATen/native/cuda/jit_utils.cpp", + "aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp", "aten/src/ATen/native/sparse/cuda/SparseBlas.cpp", "aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp", "aten/src/ATen/native/sparse/cuda/SparseBlasLegacy.cpp", diff --git a/tools/clang_format_hash/linux64/clang-format-linux64 b/tools/clang_format_hash/linux64/clang-format-linux64 deleted file mode 100644 index 40a85640a2aa..000000000000 --- a/tools/clang_format_hash/linux64/clang-format-linux64 +++ /dev/null @@ -1 +0,0 @@ -21ca53c291a88b53dac85751b7a0203ca610ac94b7adaff3c092cf30df4168f2 \ No newline at end of file diff --git a/tools/clang_format_hash/mac/clang-format-mojave b/tools/clang_format_hash/mac/clang-format-mojave deleted file mode 100644 index fe4f8f6bdd69..000000000000 --- a/tools/clang_format_hash/mac/clang-format-mojave +++ /dev/null @@ -1 +0,0 @@ -5fde7bccf65032da297dfb1f18e4a95e96e278fa397e9dcaf364dfe23ec46353 \ No newline at end of file diff --git a/tools/code_analyzer/gen_op_registration_allowlist.py b/tools/code_analyzer/gen_op_registration_allowlist.py index 00f880d6e9c7..65e56856a789 100644 --- a/tools/code_analyzer/gen_op_registration_allowlist.py +++ b/tools/code_analyzer/gen_op_registration_allowlist.py @@ -16,24 +16,26 @@ DepGraph = Dict[str, Set[str]] + def canonical_name(opname: str) -> str: # Skip the overload name part as it's not supported by code analyzer yet. 
- return opname.split('.', 1)[0] + return opname.split(".", 1)[0] + def load_op_dep_graph(fname: str) -> DepGraph: - with open(fname, 'r') as stream: + with open(fname, "r") as stream: result = defaultdict(set) for op in yaml.safe_load(stream): - op_name = canonical_name(op['name']) - for dep in op.get('depends', []): - dep_name = canonical_name(dep['name']) + op_name = canonical_name(op["name"]) + for dep in op.get("depends", []): + dep_name = canonical_name(dep["name"]) result[op_name].add(dep_name) return dict(result) def load_root_ops(fname: str) -> List[str]: result = [] - with open(fname, 'r') as stream: + with open(fname, "r") as stream: for op in yaml.safe_load(stream): result.append(canonical_name(op)) return result @@ -49,7 +51,7 @@ def gen_transitive_closure( # The dependency graph might contain a special entry with key = `__BASE__` # and value = (set of `base` ops to always include in custom build). - queue.append('__BASE__') + queue.append("__BASE__") # The dependency graph might contain a special entry with key = `__ROOT__` # and value = (set of ops reachable from C++ functions). Insert the special @@ -58,7 +60,7 @@ def gen_transitive_closure( # '__ROOT__' is only needed for full-jit. Keep it only for training. # TODO: when FL is migrated from full-jit to lite trainer, remove '__ROOT__' if train: - queue.append('__ROOT__') + queue.append("__ROOT__") while queue: cur = queue.pop() @@ -69,21 +71,25 @@ def gen_transitive_closure( return sorted(result) + def gen_transitive_closure_str(dep_graph: DepGraph, root_ops: List[str]) -> str: - return ' '.join(gen_transitive_closure(dep_graph, root_ops)) + return " ".join(gen_transitive_closure(dep_graph, root_ops)) if __name__ == "__main__": parser = argparse.ArgumentParser( - description='Util to produce transitive dependencies for custom build') + description="Util to produce transitive dependencies for custom build" + ) parser.add_argument( - '--op-dependency', - help='input yaml file of op dependency graph ' - '- can be omitted for custom build with static dispatch') + "--op-dependency", + help="input yaml file of op dependency graph " + "- can be omitted for custom build with static dispatch", + ) parser.add_argument( - '--root-ops', + "--root-ops", required=True, - help='input yaml file of root (directly used) operators') + help="input yaml file of root (directly used) operators", + ) args = parser.parse_args() deps = load_op_dep_graph(args.op_dependency) if args.op_dependency else {} diff --git a/tools/code_analyzer/gen_operators_yaml.py b/tools/code_analyzer/gen_operators_yaml.py new file mode 100644 index 000000000000..0daa27f0480e --- /dev/null +++ b/tools/code_analyzer/gen_operators_yaml.py @@ -0,0 +1,591 @@ +#!/usr/bin/env python3 +import argparse +import json +import sys +from typing import List, Optional, Dict, Any + +import yaml +from gen_op_registration_allowlist import ( + canonical_name, + gen_transitive_closure, + load_op_dep_graph, +) +from torchgen.selective_build.operator import ( + SelectiveBuildOperator, + merge_operator_dicts, +) +from torchgen.selective_build.selector import merge_kernel_metadata + +# Generate YAML file containing the operators used for a specific PyTorch model. +# ------------------------------------------------------------------------------ +# +# This binary is responsible for generating the model_operators.yaml file for +# each model from a pt_operator_library() BUCK macro invocation. 
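Before the gen_operators_yaml.py header comment continues below, a quick illustration of the gen_transitive_closure helper reformatted above, using a made-up dependency graph and root-op list of the same shape the script loads from its YAML inputs ('__BASE__' marks ops that are always included).

dep_graph = {
    "__BASE__": {"aten::empty"},
    "aten::add": {"aten::add_", "aten::to"},
    "aten::to": {"aten::empty"},
}
root_ops = ["aten::add"]

result = set(root_ops)
queue = list(root_ops) + ["__BASE__"]
while queue:
    cur = queue.pop()
    for dep in dep_graph.get(cur, ()):
        if dep not in result:
            result.add(dep)
            queue.append(dep)

print(sorted(result))
# ['aten::add', 'aten::add_', 'aten::empty', 'aten::to']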
+# +# Output YAML file format: +# ------------------------ +# +# +# include_all_non_op_selectives: False +# include_all_operators: False +# debug_info: +# - model1@v100 +# - model2@v50 +# operators: +# aten::add: +# is_root_operator: Yes +# is_used_for_training: Yes +# include_all_overloads: No +# debug_info: +# - model1@v100 +# - model2@v50 +# aten::add.int: +# is_root_operator: No +# is_used_for_training: No +# include_all_overloads: Yes +# kernel_metadata: +# add_kernel: +# - Int8 +# - UInt32 +# sub_kernel: +# - Int16 +# - Float +# +# +# There are a few main inputs to this application +# ----------------------------------------------- +# +# 1. Inference Root Operators (--root_ops): Root operators (called directly +# from TorchScript) used by inference use-cases. +# +# 2. Training Root Operators (--training_root_ops): Root operators used +# by training use-cases. Currently, this list is the list of all operators +# used by training, and not just the root operators. All Training ops are +# also considered for inference, so these are merged into inference ops. +# +# 3. Operator Depencency Graph (--dep_graph_yaml_path): A path to the +# operator dependency graph used to determine which operators depend on +# which other operators for correct functioning. This is used for +# generating the transitive closure of all the operators used by the +# model based on the root operators when static selective build is used. +# For tracing based selective build, we don't need to perform this +# transitive cloure. +# +# 4. Model Metadata (--model_name, --model_versions, --model_assets, +# --model_backends): Self-descriptive. These are used to tell this +# script which model operator lists to fetch from the Unified Model +# Build Metadata YAML file. +# +# 5. Unified Model YAML file (--models_yaml_path): A path to the Unified +# model YAML operator list file. This yaml file contains (for each +# model/version/asset/backend) the set of used root and traced +# operators. This is used to extract the actual set of operators +# needed to be included in the build. +# + + +def canonical_opnames(opnames: List[str]) -> List[str]: + return [canonical_name(opname) for opname in opnames] + + +def make_filter_from_options( + model_name: str, + model_versions: List[str], + model_assets: Optional[List[str]], + model_backends: Optional[List[str]], +): + def is_model_included(model_info): + model = model_info["model"] + if model["name"] != model_name: + return False + if str(model["version"]) not in model_versions: + return False + if model_assets is not None and model["asset"] not in model_assets: + return False + # TODO: Handle backend later + return True + + return is_model_included + + +# Returns if a the specified rule is a new or old style pt_operator_library +def is_new_style_rule(model_name: str, model_versions: Optional[List[str]]): + return model_name is not None and model_versions is not None + + +# Verifies that specified model_name, and all specified versions and assets +# appear in at least one model yaml. 
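The filtering above keys off `model.name`, `model.version`, and `model.asset` in the unified models YAML. To make that input concrete, here is a hedged, hypothetical single-entry example assembled only from the keys this script reads (`model`, `train`, `root_operators`, `traced_operators`, `kernel_metadata`, `custom_classes`, `build_features`); real entries may carry additional fields, and every name, version, asset, and hash below is made up.

```python
import yaml  # PyYAML, which these scripts already import

example_models_yaml = yaml.safe_load(
    """
- model:
    name: my_model
    version: 100
    asset: my_asset
    md5_hash: 0123456789abcdef
  train: false
  root_operators:
    - aten::add.Tensor
  traced_operators:
    - aten::add.Tensor
    - aten::mul.Tensor
  kernel_metadata:
    add_kernel:
      - Int8
  custom_classes: []
  build_features: []
"""
)
print(example_models_yaml[0]["model"]["name"])  # my_model
```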
Throws if verification is failed, +# returns None on success +def verify_all_specified_present( + model_assets: Optional[List[str]], + model_versions: List[str], + selected_models_yaml: List[Dict[str, Any]], + rule_name: str, + model_name: str, + new_style_rule: bool, +): + def find_missing_items(model_items, key, selected_models_yaml): + missing_items = [] + if not new_style_rule or not model_items: + return missing_items + for item in model_items: + found = False + for model in selected_models_yaml: + if str(model["model"][key]) == item: + found = True + if not found: + missing_items.append(item) + return missing_items + + missing_assets = find_missing_items(model_assets, "asset", selected_models_yaml) + missing_versions = find_missing_items( + model_versions, "version", selected_models_yaml + ) + + if len(missing_versions) > 0 or len(missing_assets) > 0: # at least one is missing + name_warning = "" + if len(selected_models_yaml) == 0: + name_warning = ( + "WARNING: 0 yaml's were found for target rule. This could be because the " + + "provided model name: {name} is incorrect. Please check that field as well as " + + "the assets and versions." + ).format(name=model_name) + raise RuntimeError( + ( + "Error: From the pt_operator_library rule for Rule: {name}, at least one entry for the " + + "following fields was expected -- Model: {model_name} Expected Assets: {expected_assets}, Expected Versions: " + + "{expected_versions}. {name_warning} In all_mobile_models.yaml either no assets were on one of the " + + "specified versions, one of the specified assets was not present on any of the specified " + + "versions, or both. Assets not found: {missing_assets}, Versions not found: {missing_versions} " + + "For questions please ask in https://fb.workplace.com/groups/2148543255442743/" + ).format( + name=rule_name, + model_name=model_name, + expected_versions=model_versions, + expected_assets=model_assets + if model_assets + else "", + name_warning=name_warning, + missing_versions=missing_versions + if len(missing_versions) > 0 + else "", + missing_assets=missing_assets + if len(missing_assets) > 0 + else "", + ) + ) + + +# Uses the selected models configs and then combines them into one dictionary, +# formats them as a string, and places the string into output as a top level debug_info +def create_debug_info_from_selected_models( + output: Dict[str, object], + selected_models: List[dict], + new_style_rule: bool, +): + + model_dict = { + "asset_info": {}, # maps asset name -> dict of asset metadata like hashes + "is_new_style_rule": new_style_rule, + } + + for model in selected_models: + model_info = model["model"] + asset = model_info["asset"] + hash = model_info["md5_hash"] + + asset_info = model_dict["asset_info"].setdefault(asset, {}) + + asset_info.setdefault("md5_hash", []).append(hash) + + # Will later be used in gen_oplist to generate the model/version/asset checking + output["debug_info"] = [json.dumps(model_dict)] + + +def fill_output(output: Dict[str, object], options: object): + """Populate the output dict with the information required to serialize + the YAML file used for selective build. 
+ """ + dept_graph = load_op_dep_graph(options.dep_graph_yaml_path) + + model_versions = ( + options.model_versions.split(",") if options.model_versions is not None else [] + ) + model_assets = ( + options.model_assets.split(",") if options.model_assets is not None else None + ) + + with open(options.models_yaml_path, "rb") as models_yaml_file: + all_models_yaml = yaml.safe_load(models_yaml_file) or [] + + model_filter_func = make_filter_from_options( + options.model_name, model_versions, model_assets, options.model_backends + ) + + selected_models_yaml = list(filter(model_filter_func, all_models_yaml)) + + verify_all_specified_present( + model_assets=model_assets, + model_versions=model_versions, + selected_models_yaml=selected_models_yaml, + rule_name=options.rule_name, + model_name=options.model_name, + new_style_rule=is_new_style_rule(options.model_name, options.model_versions), + ) + + create_debug_info_from_selected_models( + output, + selected_models_yaml, + is_new_style_rule(options.model_name, options.model_versions), + ) + + # initialize variables for static build from the pt_operator_library rule + if options.root_ops is not None: + static_root_ops = set(filter(lambda x: len(x) > 0, options.root_ops.split(","))) + else: + static_root_ops = set() + + static_training_root_ops = set( + filter( + lambda x: len(x) > 0, + (options.training_root_ops or "").split(","), + ) + ) + if len(static_training_root_ops) > 0: + static_root_ops = static_root_ops | static_training_root_ops + # end if + + root_ops_unexpand = set() + traced_ops = set() + training_root_ops_unexpand = set() + traced_training_ops = set() + all_kernel_metadata = [] + all_custom_classes = set() + all_build_features = set() + + # Go through each yaml file and retrieve operator information. + for model_info in selected_models_yaml: + if "traced_operators" not in model_info: + # If this YAML file doesn't specify any traced operators, then it is using + # the static analysis selective build approach of finding transitively + # used operators, and we should update root_ops with the set of root + # operators, all of whose overloads must be included. In addition, these + # root_ops will be further expanded using the transitive closure of + # operator dependencies. + static_root_ops = static_root_ops | set(model_info["root_operators"]) + else: + # If this YAML file specifies traced operators, then it is using + # the tracing based selective build approach of finding used + # operators, and we should update root_ops_unexpand with the set of root + # operators whose overloads don't need to be included. In addition, these + # root_ops_unexpand will NOT be further expanded. 
If the train flag is + # set then the ops will be used for training, so we put them in a separate + # set + if model_info["train"]: + training_root_ops_unexpand = training_root_ops_unexpand | set( + model_info["root_operators"] + ) + traced_training_ops = traced_training_ops | set( + model_info["traced_operators"] + ) + else: + root_ops_unexpand = root_ops_unexpand | set( + model_info["root_operators"] + ) + traced_ops = traced_ops | set(model_info["traced_operators"]) + + if "kernel_metadata" in model_info: + all_kernel_metadata.append(model_info["kernel_metadata"]) + + if "custom_classes" in model_info: + all_custom_classes = all_custom_classes | set(model_info["custom_classes"]) + + if "build_features" in model_info: + all_build_features = all_build_features | set(model_info["build_features"]) + + # This following section on transitive closure is relevant to static build only + canonical_root_ops = canonical_opnames(static_root_ops) + # If no canonical_root_ops exist, don't compute the transitive closure + # otherwise, we will include __BASE__ and __ROOT__ ops and mark them as required + # for inference. + if len(canonical_root_ops) > 0: + closure_op_list = gen_transitive_closure(dept_graph, canonical_root_ops) + else: + closure_op_list = set() + + canonical_training_root_ops = canonical_opnames(static_training_root_ops) + # If no canonical_training_root_ops exist, don't compute the transitive closure + # otherwise, we will include __BASE__ and __ROOT__ ops and mark them as required + # for training. + if len(canonical_training_root_ops) > 0: + closure_training_op_list = gen_transitive_closure( + dept_graph, canonical_training_root_ops, train=True + ) + else: + closure_training_op_list = set() + + # bucketed_ops holds sets of operators that correspond to specific semantic buckets. For + # example: + # + # 1. Root Operators not used for training w/o full overload inclusion + # 2. Root Operators not used for training w/ full overload inclusion + # 3. Root Operators used for training w/o full overload inclusion + # 4. Root Operators used for training w/ full overload inclusion + # 5. Non-root Operators not used for training w/o full overload inclusion + # etc... + # + # Basically for each of the 3 boolean conditional, there are 2 + # options (True/False). 
+ # + bucketed_ops = [] + + # START STATIC BUILD OPS + static_root_ops_bucket = {} + for op_name in static_root_ops: + op = SelectiveBuildOperator.from_yaml_dict( + op_name, + { + "is_root_operator": True, + "is_used_for_training": False, + "include_all_overloads": True, + "debug_info": [options.model_name], + }, + ) + static_root_ops_bucket[op_name] = op + bucketed_ops.append(static_root_ops_bucket) + + closure_ops_bucket = {} + for op_name in closure_op_list: + op = SelectiveBuildOperator.from_yaml_dict( + op_name, + { + "is_root_operator": False, + "is_used_for_training": False, + "include_all_overloads": True, + "debug_info": [options.model_name], + }, + ) + closure_ops_bucket[op_name] = op + bucketed_ops.append(closure_ops_bucket) + + static_training_root_ops_bucket = {} + for op_name in static_training_root_ops: + op = SelectiveBuildOperator.from_yaml_dict( + op_name, + { + "is_root_operator": True, + "is_used_for_training": True, + "include_all_overloads": True, + "debug_info": [options.model_name], + }, + ) + static_training_root_ops_bucket[op_name] = op + bucketed_ops.append(static_training_root_ops_bucket) + + closure_training_ops_bucket = {} + for op_name in closure_training_op_list: + op = SelectiveBuildOperator.from_yaml_dict( + op_name, + { + "is_root_operator": False, + "is_used_for_training": True, + "include_all_overloads": True, + "debug_info": [options.model_name], + }, + ) + closure_training_ops_bucket[op_name] = op + bucketed_ops.append(closure_training_ops_bucket) + # END STATIC BUILD OPS + + # START TRACING BASED BUILD OPS + root_ops_unexpand_bucket = {} + for op_name in root_ops_unexpand: + op = SelectiveBuildOperator.from_yaml_dict( + op_name, + { + "is_root_operator": True, + "is_used_for_training": False, + "include_all_overloads": False, + "debug_info": [options.model_name], + }, + ) + root_ops_unexpand_bucket[op_name] = op + bucketed_ops.append(root_ops_unexpand_bucket) + + traced_ops_bucket = {} + for op_name in traced_ops: + op = SelectiveBuildOperator.from_yaml_dict( + op_name, + { + "is_root_operator": False, + "is_used_for_training": False, + "include_all_overloads": False, + "debug_info": [options.model_name], + }, + ) + traced_ops_bucket[op_name] = op + bucketed_ops.append(traced_ops_bucket) + + training_root_ops_unexpand_bucket = {} + for op_name in training_root_ops_unexpand: + op = SelectiveBuildOperator.from_yaml_dict( + op_name, + { + "is_root_operator": True, + "is_used_for_training": True, + "include_all_overloads": False, + "debug_info": [options.model_name], + }, + ) + training_root_ops_unexpand_bucket[op_name] = op + bucketed_ops.append(training_root_ops_unexpand_bucket) + + traced_training_ops_bucket = {} + for op_name in traced_training_ops: + op = SelectiveBuildOperator.from_yaml_dict( + op_name, + { + "is_root_operator": False, + "is_used_for_training": True, + "include_all_overloads": False, + "debug_info": [options.model_name], + }, + ) + traced_training_ops_bucket[op_name] = op + bucketed_ops.append(traced_training_ops_bucket) + # END TRACING BASED BUILD OPS + + # Merge dictionaries together to remove op duplication + operators: Dict[str, SelectiveBuildOperator] = {} + for ops_dict in bucketed_ops: + operators = merge_operator_dicts(operators, ops_dict) + + # Loop over all operators, and if any of the them specifies that + # all overloads need to be included, then set include_all_non_op_selectives + # to True, since it indicates that this operator list came from something + # other than a traced operator list. 
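The buckets built above all funnel into `merge_operator_dicts` to de-duplicate operators that appear in more than one bucket. The real merge lives in `torchgen.selective_build.operator`; the sketch below only illustrates the assumed semantics (boolean flags are OR-ed so the most permissive setting wins, debug_info lists are concatenated) using plain dicts and hypothetical operator names.

```python
from typing import Dict

def merge_flags_sketch(a: Dict[str, Dict], b: Dict[str, Dict]) -> Dict[str, Dict]:
    # Assumed merge semantics: for an operator present in both inputs, OR the
    # boolean flags and concatenate debug_info; otherwise copy the entry over.
    merged = dict(a)
    for op_name, info in b.items():
        if op_name not in merged:
            merged[op_name] = info
            continue
        cur = merged[op_name]
        merged[op_name] = {
            "is_root_operator": cur["is_root_operator"] or info["is_root_operator"],
            "is_used_for_training": cur["is_used_for_training"] or info["is_used_for_training"],
            "include_all_overloads": cur["include_all_overloads"] or info["include_all_overloads"],
            "debug_info": cur["debug_info"] + info["debug_info"],
        }
    return merged

static_bucket = {  # hypothetical buckets for a single operator
    "aten::add": {"is_root_operator": True, "is_used_for_training": False,
                  "include_all_overloads": True, "debug_info": ["model_a"]},
}
traced_bucket = {
    "aten::add": {"is_root_operator": True, "is_used_for_training": True,
                  "include_all_overloads": False, "debug_info": ["model_b"]},
}
print(merge_flags_sketch(static_bucket, traced_bucket)["aten::add"]["include_all_overloads"])  # True
```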
+ include_all_non_op_selectives = False + for (op_name, op_info) in operators.items(): + include_all_non_op_selectives = ( + include_all_non_op_selectives or op_info.include_all_overloads + ) + + operators_as_dict = {} + for (k, v) in operators.items(): + operators_as_dict[k] = v.to_dict() + + output["operators"] = operators_as_dict + + output["custom_classes"] = all_custom_classes + + output["build_features"] = all_build_features + + output["include_all_non_op_selectives"] = include_all_non_op_selectives + if len(all_kernel_metadata) > 0: + kernel_metadata = {} + for kt in all_kernel_metadata: + kernel_metadata = merge_kernel_metadata(kernel_metadata, kt) + output["kernel_metadata"] = kernel_metadata + + +def get_parser_options(parser: argparse.ArgumentParser) -> argparse.Namespace: + parser.add_argument( + "--root_ops", + help="A comma separated list of root operators used by the model", + required=False, + ) + parser.add_argument( + "--training_root_ops", + help="A comma separated list of root operators used for training", + required=False, + ) + parser.add_argument( + "--output_path", + help="The location of the output yaml file.", + required=True, + ) + parser.add_argument( + "--dep_graph_yaml_path", + type=str, + help="A path to the Operator Dependency Graph YAML file.", + required=True, + ) + parser.add_argument( + "--model_name", + type=str, + help="The name of the model that uses the specified root operators.", + required=True, + ) + parser.add_argument( + "--model_versions", + type=str, + help="A comma separated list of model versions.", + required=False, + ) + parser.add_argument( + "--model_assets", + type=str, + help="A comma separate list of model asset names (if absent, defaults to all assets for this model).", + required=False, + ) + parser.add_argument( + "--model_backends", + type=str, + default="CPU", + help="A comma separated list of model backends.", + required=False, + ) + parser.add_argument( + "--models_yaml_path", + type=str, + help="The path to where the unified Mobile Model Config YAML resides.", + required=True, + ) + parser.add_argument( + "--include_all_operators", + action="store_true", + default=False, + help="Set this flag to request inclusion of all opeators (i.e. 
build is not selective).", + required=False, + ) + parser.add_argument( + "--rule_name", + type=str, + help="The name of pt_operator_library rule resulting in this generation", + required=True, + ) + options = parser.parse_args() + return options + + +def main(argv) -> None: + parser = argparse.ArgumentParser(description="Generate used operators YAML") + options = get_parser_options(parser) + + model_dict = { + "model_name": options.model_name, + "asset_info": {}, + "is_new_style_rule": False, + } + output = { + "debug_info": [json.dumps(model_dict)], + } + + if options.include_all_operators: + output["include_all_operators"] = True + output["operators"] = {} + output["kernel_metadata"] = {} + else: + fill_output(output, options) + + with open(options.output_path, "wb") as out_file: + out_file.write( + yaml.safe_dump( + output, + default_flow_style=False, + ).encode("utf-8") + ) + + +if __name__ == "__main__": + sys.exit(main(sys.argv)) diff --git a/tools/code_analyzer/gen_oplist.py b/tools/code_analyzer/gen_oplist.py index 010b420d8c9b..b5d31b922167 100644 --- a/tools/code_analyzer/gen_oplist.py +++ b/tools/code_analyzer/gen_oplist.py @@ -7,11 +7,15 @@ from typing import Set, List, Any import yaml -from tools.codegen.selective_build.selector import combine_selective_builders, SelectiveBuilder +from torchgen.selective_build.selector import ( + combine_selective_builders, + SelectiveBuilder, +) from tools.lite_interpreter.gen_selected_mobile_ops_header import ( write_selected_mobile_ops, ) + def extract_all_operators(selective_builder: SelectiveBuilder) -> Set[str]: ops = [] for (op_name, op) in selective_builder.operators.items(): @@ -125,7 +129,7 @@ def main(argv: List[Any]) -> None: ) options = parser.parse_args() - if (os.path.isfile(options.model_file_list_path)): + if os.path.isfile(options.model_file_list_path): print("Processing model file: ", options.model_file_list_path) model_dicts = [] model_dict = yaml.safe_load(open(options.model_file_list_path)) @@ -180,5 +184,6 @@ def main(argv: List[Any]) -> None: selective_builder, ) + if __name__ == "__main__": main(sys.argv) diff --git a/tools/code_coverage/README.md b/tools/code_coverage/README.md index 6e83dc593ed1..67adb445d053 100644 --- a/tools/code_coverage/README.md +++ b/tools/code_coverage/README.md @@ -3,7 +3,7 @@ ## Overview This tool is designed for calculating code coverage for Pytorch project. -It’s an integrated tool. You can use this tool to run and generate both file-level and line-level report for C++ and Python tests. It will also be the tool we use in *CircleCI* to generate report for each master commit. +It’s an integrated tool. You can use this tool to run and generate both file-level and line-level report for C++ and Python tests. It will also be the tool we use in *CircleCI* to generate report for each main commit. ### Simple * *Simple command to run:* @@ -30,11 +30,11 @@ This part will introduce about the arguments you can use when run this tool. The We have two different compilers, `gcc` and `clang`, and this tool supports both. But it is recommended to use `gcc` because it's much faster and use less disk place. The examples will also be divided to two parts, for `gcc` and `clang`. ## Preparation -The first step is to [build *Pytorch* from source](https://github.com/pytorch/pytorch#from-source) with `CODE_COVERAGE` option `ON`. You may also want to set `BUILD_TEST` option `ON` to get the test binaries. 
Besides, if you are under `gcc` compiler, to get accurate result, it is recommended to also select `CMAKE_BUILD_CONFIG=Debug`. +The first step is to [build *Pytorch* from source](https://github.com/pytorch/pytorch#from-source) with `USE_CPP_CODE_COVERAGE` option `ON`. You may also want to set `BUILD_TEST` option `ON` to get the test binaries. Besides, if you are under `gcc` compiler, to get accurate result, it is recommended to also select `CMAKE_BUILD_TYPE=Debug`. See: [how to adjust build options](https://github.com/pytorch/pytorch#adjust-build-options-optional) for reference. Following is one way to adjust build option: ``` # in build/ folder (all build artifacts must in `build/` folder) -cmake .. -DCODE_COVERAGE=ON -DBUILD_TEST=ON -DCMAKE_BUILD_CONFIG=Debug +cmake .. -DUSE_CPP_CODE_COVERAGE=ON -DBUILD_TEST=ON -DCMAKE_BUILD_TYPE=Debug ``` @@ -53,7 +53,7 @@ python oss_coverage.py --run-only=atest ``` This command will run `atest` binary in `build/bin/` folder and generate reoports over the entire *Pytorch* folder. You can find the reports in `profile/summary`. But you may only be interested in the `aten` folder, in this case, try: ``` -python oss_coverage.py --run-only=atest --interested-only=aten +python oss_coverage.py --run-only=atest --interest-only=aten ``` In *Pytorch*, `c++` tests located in `build/bin/` and `python` tests located in `test/`. If you want to run `python` test, try: ``` @@ -62,7 +62,7 @@ python oss_coverage.py --run-only=test_complex.py You may also want to specify more than one test or interested folder, in this case, try: ``` -python oss_coverage.py --run-only=atest c10_logging_test --interested-only aten/src/Aten c10/core +python oss_coverage.py --run-only=atest c10_logging_test --interest-only aten/src/Aten c10/core ``` That it is! With these two simple options, you can customize many different functionality according to your need. By default, the tool will run all tests in `build/bin` folder (by running all executable binaries in it) and `test/` folder (by running `run_test.py`), and then collect coverage over the entire *Pytorch* folder. If this is what you want, try: @@ -84,9 +84,9 @@ By default all steps will be run, but you can specify only run one of them. Foll `—summary` is useful when you have different interested folder. For example, ```bash # after run this command -python oss_coverage.py --run-only=atest --interested-folder=aten +python oss_coverage.py --run-only=atest --interest-only=aten # you may then want to learn atest's coverage over c10, instead of running the test again, you can: -python oss_coverage.py --run-only=atest --interested-folder=c10 --summary +python oss_coverage.py --run-only=atest --interest-only=c10 --summary ``` diff --git a/tools/codegen/api/autograd.py b/tools/codegen/api/autograd.py deleted file mode 100644 index 64b7547e78f0..000000000000 --- a/tools/codegen/api/autograd.py +++ /dev/null @@ -1,388 +0,0 @@ -from dataclasses import dataclass -import re -from typing import Optional, Sequence, Set, List, Tuple, Match - -from tools.codegen.api import cpp -from tools.codegen.api.types import Binding, NamedCType -from tools.codegen.model import NativeFunction, Type, SchemaKind -from tools.codegen.utils import IDENT_REGEX - -# Represents a saved attribute involved in backward calculation. -# Note that it can be a derived property of an input argument, e.g.: -# we could save `other.scalar_type()` instead of the entire `other` tensor. 
-@dataclass(frozen=True) -class SavedAttribute: - # The NamedCType holds the updated name and cpp type of the attribute - # for the name, Suffix is appended if it's derived property, e.g.: `other_scalar_type` - nctype: NamedCType - - # The expression to read the derived property at save time, e.g.: - # `other.scalar_type()`. - expr: str - -# Represents a backward formula that calculates derivatives for one -# or more tensors. -@dataclass(frozen=True) -class Derivative: - # The formula string (legit C++ expression). - # Note that expressions against input arguments have been replaced with the - # corresponding saved attributes. - # E.g.: - # raw formula: `mul_tensor_backward(grad, self, other.scalar_type())` - # here: `mul_tensor_backward(grad, self, other_scalar_type)` - formula: str - - # The formula string before input argument replacement - original_formula: str - - # Names of the arguments for which this formula calculates derivatives. - var_names: Tuple[str, ...] - - # Saved inputs that are referenced by the formula. - saved_inputs: Tuple[SavedAttribute, ...] - - # Saved outputs that are referenced by the formula. - saved_outputs: Tuple[SavedAttribute, ...] - - # Gradients that are referenced by name in the formula. - named_gradients: Set[str] - -# Represents a forward formula that calculates forward derivatives -# for one tensor. -@dataclass(frozen=True) -class ForwardDerivative: - # The formula string (legit C++ expression). - # Note that special keywords such as "linear" or "element_wise" have been - # replaced by the automatically generated formula. - formula: str - - # Name of the output argument for which this formula calculates forward - # derivatives - var_name: str - - # Type of the output argument for which this formula calculates forward - # derivatives - var_type: Type - - # Inputs for which the forward derivatives are required for this formula - required_inputs_fw_grad: Optional[Tuple[str, ...]] - - # Inputs for which the primal is required for this formula - required_inputs_primal: Optional[Tuple[str, ...]] - - # Flag to specify if this formula requires the original value of self - # This is only used by inplace operations - required_original_self_value: bool - - # If this formula is specified in derivatives.yaml or if we are re-using the - # out of place formula for inplace - is_reusing_outplace_formula: bool - -# Represents differentiability info for a NativeFunction. -@dataclass(frozen=True) -class DifferentiabilityInfo: - # The base name read from derivatives.yaml. - name: str - - # The matching native function. - # - # There can be multiple NativeFunction having the same base name: - # - different overloads with different types of input arguments; - # - in-place/out/functional variants of the same function; - # - # We first use the schema string (under the 'name' key) in derivatives.yaml - # to find the NativeFunction having the same schema string. - # Then we find the in-place/out/functional variants of the matching function. - # Among these variants, we choose the one having the same name as the - # derivatives.yaml entry. If there is no exact match, then we choose the - # in-place variant. - # TODO: maybe the logic to search for all variants is no longer necessary? - func: NativeFunction - - # The name of the generated autograd function. - # It's set only if we will calculate a derivative, i.e. - # 'args_with_derivatives' is not empty. - op: Optional[str] - - # The derivatives formulae for this function. 
- # Note that the length of this sequence is the number of differentiable inputs - derivatives: Sequence[Derivative] - - # The forward derivatives formulae for this function. - # Note that the length of this sequence is the number of differentiable outputs - forward_derivatives: Sequence[ForwardDerivative] - - # The union of 'saved_inputs' of all 'derivatives'. - all_saved_inputs: Sequence[SavedAttribute] - - # The union of 'saved_outputs' of all 'derivatives'. - all_saved_outputs: Sequence[SavedAttribute] - - # All named gradients that are available for use, in the same - # order as in the grads vector. - available_named_gradients: Sequence[str] - - # The named gradients that are used in any of the derivatives. - # Invariant: all(name in available_named_gradients for name in used_named_gradients) - used_named_gradients: Set[str] - - # The function's input arguments for which it calculates derivatives. - # It's the union of 'var_names' of all 'derivatives', sorted by the - # argument order in the function schema. - args_with_derivatives: Sequence[Binding] - - # Names of arguments whose derivative formula is 'non_differentiable'. - non_differentiable_arg_names: Sequence[str] - - # Raw data read from derivatives.yaml. - output_differentiability: Optional[List[bool]] - - # output_differentiability in derivatives.yaml can be a list of - # conditions that express if the output is differentiable. In this case, - # the number of conditions must match the number of outputs - # (NB: we only support one condition right now). - # output_differentiability gets populated with True for each condition, - # while output_differentiability_conditions gets populated with the conditions - output_differentiability_conditions: Optional[List[str]] - - @property - def has_derivatives(self) -> bool: - return len(self.args_with_derivatives) > 0 - -def uses_ident(info: Optional[DifferentiabilityInfo], ident: str) -> bool: - if info is None: - return False - for derivative in info.derivatives: - formula = derivative.formula - if re.search(IDENT_REGEX.format(ident), formula): - return True - return False - -def uses_retain_variables(info: Optional[DifferentiabilityInfo]) -> bool: - return uses_ident(info, 'retain_variables') - -def uses_single_grad(info: Optional[DifferentiabilityInfo]) -> bool: - return uses_ident(info, 'grad') - -# Represents a differentiable `Argument`. -# How is it different from the `Argument` type? -# - It's processed Arguments which are differentiable and only used in the -# context of the autograd codegen; -# - It can represent SelfArgument or regular Argument but not TensorOptionsArgument; -@dataclass(frozen=True) -class DifferentiableInput: - name: str - type: Type - - # TODO: only to keep it byte-for-byte compatible with the old codegen, should remove. - cpp_type: str - -# Represents a differentiable `Return`. -# How it it different from the `Return` type? -# - The name in `Return` is optional. Here it is always populated using the same -# `cpp.return_names()` method. -# TODO: some cpp naming logic (e.g. resolving name conflict) might be irrelevant? -# - It's processed Returns which are differentiable, in compliance with the -# `output_differentiability` field defined in derivatives.yaml (if specified), -# and are only used in the context of the autograd codegen; -@dataclass(frozen=True) -class DifferentiableOutput: - name: str - type: Type - - # TODO: only to keep it byte-for-byte compatible with the old codegen, should remove. 
- cpp_type: str - -@dataclass(frozen=True) -class NativeFunctionWithDifferentiabilityInfo: - func: NativeFunction - info: Optional[DifferentiabilityInfo] - fw_derivatives: Sequence[ForwardDerivative] - -# TODO: Update comment below since it is out of date. -def dispatch_strategy(fn: NativeFunctionWithDifferentiabilityInfo) -> str: - """How are we going to call the underlying implementation of a - declaration? There are two strategies: - - use_derived: we want to call the implementation on CPUDoubleType - (or a similar, derived Type instance). Because these derived - instances deal in Tensors, not Variables (it's a completely different - object, so it doesn't dispatch back to VariableType), code on - this dispatch path needs to wrap/unwrap tensors. If the - derived implementation takes and returns tensors, the - implementation is usually differentiable (although we also use - the derived dispatch path for non-differentiable functions - that we still want to dispatch on the derived Type instance; - e.g., size()) - - use_type: we want to call the implementation on Type, because - it is implemented concretely, and the functions it invokes will - get dispatched back to VariableType (which will ensure that they - are differentiable.) - """ - if fn.func.is_abstract or (fn.info is not None and fn.info.has_derivatives): - # If the function is abstract (not implemented on at::Type), we must - # call the implementation on the derived type with unpacked tensors. - - # If the function has a derivative specified and is concrete, we could - # call either implementation. We prefer the calling the derived - # type's implementation with unpacked tensors because it is more - # performant in some cases: any internal calls to other ATen functions - # won't have the history tracked. - - # If the function has a type dispatched argument (i.e. is a factory), - # we prefer calling the derived type's implementation both because it is - # more performant and to ensure factory functions return tensors with _version - # of 0 (probably not strictly necessary, but nice to have to keeps versions simple - # to understand. - - return 'use_derived' - else: - # If the function is concrete (we don't have to override it) and we - # didn't declare it in derivatives.yaml, we'll assume that it is - # actually implemented out of differentiable functions. (This - # assumption might not hold, but then you'll see gradcheck fail.) - return 'use_type' - -def match_differentiability_info( - native_functions: List[NativeFunction], - differentiability_infos: Sequence[DifferentiabilityInfo], -) -> List[NativeFunctionWithDifferentiabilityInfo]: - """Sets the "derivative" key on declarations to matching autograd function - In-place functions will use the out-of-place derivative definition if there - is no in-place specific derivative. - """ - - info_by_schema = {info.func.func: info for info in differentiability_infos} - functional_info_by_signature = { - info.func.func.signature(strip_default=True): info - for info in differentiability_infos - if info.func.func.kind() == SchemaKind.functional} - - def find_info(f: NativeFunction) -> Tuple[Optional[DifferentiabilityInfo], bool]: - if f.func in info_by_schema: - return info_by_schema[f.func], True - - # if there is no exact match look for the out-of-place signature. 
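A little further down, the in-place forward-derivative handling rewrites formulas by swapping whole identifiers (for example `result` -> `self_p`, or `self_p` -> `original_self_p`) through `IDENT_REGEX`. The sketch below uses an assumed equivalent pattern, `(^|\W)ident($|\W)`, to show why the substitution touches the standalone identifier but not identifiers that merely contain it; the formula string is made up.

```python
import re
from typing import Match

# Assumed shape of IDENT_REGEX: match an identifier only when it is not
# embedded inside a longer identifier (non-word boundaries on both sides).
IDENT_REGEX_SKETCH = r"(^|\W){}($|\W)"

def replace_ident(formula: str, old: str, new: str) -> str:
    def repl(m: Match[str]) -> str:
        return f"{m.group(1)}{new}{m.group(2)}"
    return re.sub(IDENT_REGEX_SKETCH.format(old), repl, formula)

formula = "grad * result / result_scale"  # hypothetical formula
print(replace_ident(formula, "result", "self_p"))
# grad * self_p / result_scale   (only the standalone identifier changes)
```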
- # i.e mul() for mul_() or mul_out() - return functional_info_by_signature.get(f.func.signature(strip_default=True)), False - - result: List[NativeFunctionWithDifferentiabilityInfo] = [] - for f in native_functions: - info, is_exact_match = find_info(f) - - # Currently, the '.strides()' to 'strides_or_error' replacement does not support - # 'self' derivatives of an inplace function, so we must check for this case. - if f.func.kind() == SchemaKind.inplace and (info is not None): - for derivative in info.derivatives: - if 'self' in derivative.var_names: - for saved_input in derivative.saved_inputs: - assert 'strides_or_error' not in saved_input.expr, ( - "Calling '.strides()' in the 'self' derivative formula of an " - f"in-place function is not supported: {f.func}") - - # For functions that have a single def for out-of-place and inplace (like abs()) - if info and info.forward_derivatives: - forward_derivatives = info.forward_derivatives - - if f.func.kind() == SchemaKind.inplace: - # For inplace functions there is a little bit of work to do: - # 1) Validate the formula and make sure the input that is modified in not used: - # - If there is a formula for the inplace variant of the function (is_exact_match == True) then - # we make sure that the original value of the input that is being modified inplace (self_p) is - # not used in the formula. Note that the formula can use "original_self_p" here and that would - # trigger a clone of the original input. - # - If we are re-using the out of place formula (is_exact_match == False) then we replace every - # occurrence of self_p and self_t by original_self_p and original_self_t. These will be - # populated by cloned version of the original input (either the clone done by the backward AD - # logic if self is also used in a backward formula or a special clone that we add). - # 2) At this point, there cannot be a self_p in the formula. - # 3) Change "result" into "self_p" as by design, in the inplace function codegen, the result is - # simply called self (as it is modified inplace). - # 4) Update the required primals data in case it used to contain "result" but should now contain - # "self" - # 5) If it is not an exact match, the user formula is not modifying the existing forward grad - # inplace as it should. So add some code that makes sure that we do so if the forward grad - # already exists. - - assert len(info.forward_derivatives) == 1 # Only single output inplace should exist - fw_info = info.forward_derivatives[0] - formula = fw_info.formula - - def replace_self_with_original_self(formula: str, postfix: str) -> str: - def repl(m: Match[str]) -> str: - return f'{m.group(1)}original_self{postfix}{m.group(2)}' - return re.sub(IDENT_REGEX.format(f'self{postfix}'), repl, formula) - - if re.search(IDENT_REGEX.format("self_p"), formula): - if is_exact_match: - # For manually defined formulas, don't allow the original value to be used - raise RuntimeError(f'The formula for "{f.func.name}" is using the original value of self ' - 'that is being modified inplace. This would lead to wrong forward gradients. 
' - 'Please use "result" in the formula only.') - else: - # When the original formula is out of place, we save a clone of the primal - # value to be able to access this value if needed - # replace "self_p"/"self_t" from the formula by "original_self_p"/"original_self_t" - formula = replace_self_with_original_self(formula, "_p") - formula = replace_self_with_original_self(formula, "_t") - - # replace "result" from the formula by "self_p" - def repl(m: Match[str]) -> str: - return f'{m.group(1)}self_p{m.group(2)}' - formula = re.sub(IDENT_REGEX.format("result"), repl, formula) - - required_primals = fw_info.required_inputs_primal - if re.search(IDENT_REGEX.format("self_p"), formula): - required_primals = required_primals + ("self",) if required_primals else ("self",) - - if not is_exact_match: - # Make sure that the forward grad is modified inplace when the original formula - # is out of place - formula = f"self_t_raw.defined() ? self_t_raw.copy_({formula}) : {formula}" - - required_original_self_value = bool(re.search(IDENT_REGEX.format("original_self_p"), formula)) - - forward_derivatives = [ForwardDerivative( - formula=formula, - var_name="self", - var_type=fw_info.var_type, - required_inputs_fw_grad=fw_info.required_inputs_fw_grad, - required_inputs_primal=required_primals, - required_original_self_value=required_original_self_value, - is_reusing_outplace_formula=not is_exact_match), ] - else: - forward_derivatives = [] - - result.append(NativeFunctionWithDifferentiabilityInfo( - func=f, - info=info, - fw_derivatives=forward_derivatives - )) - - return result - -def is_differentiable(name: str, type: Type, info: Optional[DifferentiabilityInfo]) -> bool: - return type.is_tensor_like() and (info is None or name not in info.non_differentiable_arg_names) - -def gen_differentiable_outputs(fn: NativeFunctionWithDifferentiabilityInfo) -> List[DifferentiableOutput]: - f = fn.func - info = fn.info - outputs: List[DifferentiableOutput] = [ - DifferentiableOutput(name=name, type=ret.type, cpp_type=cpp.return_type(ret).cpp_type()) - for name, ret in zip(cpp.return_names(f), f.func.returns)] - output_differentiability = info.output_differentiability if info else None - if output_differentiability is not None: - if len(output_differentiability) != len(outputs): - raise RuntimeError(f"The length of output_differentiability ({len(output_differentiability)}), " - f"does not match the number of outputs ({len(outputs)}).") - differentiable_outputs: List[DifferentiableOutput] = [] - if False in output_differentiability and f.func.kind() == SchemaKind.inplace: - raise RuntimeError("output_differentiability=False for inplace operation (version_counter won't get updated)") - for differentiable, output in zip(output_differentiability, outputs): - if differentiable: - differentiable_outputs.append(output) - return differentiable_outputs - candidate_differentiable_outputs = list(filter(lambda r: is_differentiable(r.name, r.type, info), outputs)) - if uses_single_grad(info): - return candidate_differentiable_outputs[:1] - else: - return candidate_differentiable_outputs diff --git a/tools/codegen/api/cpp.py b/tools/codegen/api/cpp.py deleted file mode 100644 index a485fc17acf6..000000000000 --- a/tools/codegen/api/cpp.py +++ /dev/null @@ -1,317 +0,0 @@ -from tools.codegen.model import (Argument, Arguments, BaseTy, BaseType, - FunctionSchema, ListType, NativeFunction, - OptionalType, Return, SelfArgument, - TensorOptionsArguments, Type) -from tools.codegen.api.types import (ArgName, BaseCType, Binding, 
ConstRefCType, NamedCType, CType, - MutRefCType, ArrayCType, ListCType, VectorCType, ArrayRefCType, - OptionalCType, TupleCType, SpecialArgName, boolT, scalarT, - tensorListT, dimnameListT, tensorT, voidT, longT, - BaseTypeToCppMapping, intArrayRefT, tensorOptionsT) -from tools.codegen import local -from tools.codegen.utils import assert_never -from typing import Optional, Sequence, Union, List, Set - -# This file describes the translation of JIT schema to the public C++ -# API, which is what people use when they call functions like at::add. -# -# Prominent characteristics of the C++ API: -# -# - dtype, layout, device and pin_memory are collected into -# a single C++ type TensorOptions (the native functions API -# also has this, but tensor options is really most relevant -# for the C++ API; it makes calling kwarg factory functions -# pleasant) -# -# - defaulting lives here (in fact, the dispatcher is completely -# oblivious of defaults!) -# -# BTW: policy on name collisions: we try not to have types with -# collisions, but functions are fair game to collide - -def name(func: FunctionSchema, *, faithful_name_for_out_overloads: bool = False) -> str: - name = str(func.name.name) - if func.is_out_fn(): - if faithful_name_for_out_overloads: - name += '_outf' - else: - name += '_out' - - return name - -# Translation of "value types" in JIT schema to C++ API type. Value -# types look the same no matter if they are argument types or return -# types. Returns None if the type in question is not a value type. -def valuetype_type(t: Type, *, binds: ArgName, remove_non_owning_ref_types: bool = False) -> Optional[NamedCType]: - if isinstance(t, BaseType): - if t.name == BaseTy.Tensor or t.name == BaseTy.Scalar: - return None - if remove_non_owning_ref_types: - if t.name == BaseTy.str: - raise AssertionError("string ref->value conversion: not implemented yet") - # All other BaseType currently map directly to BaseCppTypes. - return NamedCType(binds, BaseCType(BaseTypeToCppMapping[t.name])) - elif isinstance(t, OptionalType): - elem = valuetype_type(t.elem, binds=binds) - if elem is None: - return None - return NamedCType(binds, OptionalCType(elem.type)) - elif isinstance(t, ListType): - if str(t.elem) == 'bool': - assert t.size is not None - return NamedCType(binds, ArrayCType(BaseCType(boolT), t.size)) - else: - return None - else: - raise AssertionError(f"unrecognized type {repr(t)}") - -# Translation of types occuring in JIT arguments to a C++ argument type. -# If remove_non_owning_ref_types is set, we'll guarantee that the outputed CType is not a non-owning reference type. -# For example, we'll return std::vector instead of IntArrayRef. 
-# See Note [translation from C++ reference to value types] -def argumenttype_type(t: Type, *, mutable: bool, binds: ArgName, remove_non_owning_ref_types: bool = False) -> NamedCType: - # If it's a value type, do the value type translation - r = valuetype_type(t, binds=binds, remove_non_owning_ref_types=remove_non_owning_ref_types) - if r is not None: - return r - - if isinstance(t, BaseType): - if t.name == BaseTy.Tensor: - if mutable and not local.use_const_ref_for_mutable_tensors(): - return NamedCType(binds, MutRefCType(BaseCType(tensorT))) - else: - return NamedCType(binds, ConstRefCType(BaseCType(tensorT))) - elif t.name == BaseTy.Scalar: - return NamedCType(binds, ConstRefCType(BaseCType(scalarT))) - else: - raise AssertionError(f"base type should have been value type {t}") - elif isinstance(t, OptionalType): - if str(t.elem) == 'Tensor': - if mutable and not local.use_const_ref_for_mutable_tensors(): - return NamedCType(binds, MutRefCType(BaseCType(tensorT))) # TODO: fix this discrepancy - else: - return NamedCType(binds, ConstRefCType(OptionalCType(BaseCType(tensorT)))) - elif str(t.elem) == 'Scalar': - return NamedCType(binds, ConstRefCType(OptionalCType(BaseCType(scalarT)))) - elem = argumenttype_type(t.elem, mutable=mutable, binds=binds) - return NamedCType(binds, OptionalCType(elem.type)) - elif isinstance(t, ListType): - # TODO: remove these special cases, ArrayRef fallthrough works fine - if str(t.elem) == 'int': - if remove_non_owning_ref_types: - return NamedCType(binds, VectorCType(BaseCType(longT))) - else: - return NamedCType(binds, BaseCType(intArrayRefT)) - elif str(t.elem) == 'Tensor': - return NamedCType(binds, BaseCType(tensorListT)) - elif str(t.elem) == 'Scalar': - return NamedCType(binds, ArrayRefCType(BaseCType(scalarT))) - elif str(t.elem) == 'Dimname': - return NamedCType(binds, BaseCType(dimnameListT)) - elif str(t.elem) == 'Tensor?': - return NamedCType(binds, ConstRefCType(ListCType(OptionalCType(BaseCType(tensorT))))) - elem = argumenttype_type(t.elem, mutable=mutable, binds=binds) - return NamedCType(binds, ArrayRefCType(elem.type)) - else: - raise AssertionError(f"unrecognized type {repr(t)}") - -# Translate a JIT argument into its C++ type -def argument_type(a: Argument, *, binds: ArgName) -> NamedCType: - return argumenttype_type(a.type, mutable=a.is_write, binds=binds) - -# Translation of a (non-multi) return type from JIT to C++ -# N.B: returntype_type returns a CType, not a NamedCType. -# This is mostly because of the mismatch between return types and return names. -# e.g. a function with a return type of 'void' has 0 return names, -# and a function with a return type of 'std::tuple' has >1 return name. -def returntype_type(t: Type, *, mutable: bool) -> CType: - # placeholder is ignored - r = valuetype_type(t, binds="__placeholder__") - if r is not None: - return r.type - - if isinstance(t, BaseType): - if t.name == BaseTy.Tensor: - if mutable: - if local.use_const_ref_for_mutable_tensors(): - return ConstRefCType(BaseCType(tensorT)) - else: - return MutRefCType(BaseCType(tensorT)) - else: - # Note [Tensor Copy Returns] - # Currently, we use "Argument.is_write" to determine - # whether or not Tensor return types should be copies or references. - # If that ever changes, take a look at other locations of this note! 
- return BaseCType(tensorT) - elif t.name == BaseTy.Scalar: - return BaseCType(scalarT) - elif isinstance(t, ListType): - elem = returntype_type(t.elem, mutable=mutable) - assert t.size is None, f"fixed size list returns not supported: {t}" - return VectorCType(elem) - - raise AssertionError(f"unrecognized return type {t}") - -# Translation of a single return to its C++ type -def return_type(r: Return) -> CType: - return returntype_type(r.type, mutable=r.is_write) - -# Translation of a full (possibly multi) return from JIT to its C++ type -def returns_type(rs: Sequence[Return]) -> CType: - if len(rs) == 0: - return BaseCType(voidT) - elif len(rs) == 1: - return return_type(rs[0]) - else: - return TupleCType([return_type(r) for r in rs]) - -def return_names(f: NativeFunction, *, fallback_name: str = 'result') -> Sequence[str]: - returns: List[str] = [] - for i, r in enumerate(f.func.returns): - # If we have an inplace function, the return argument is - # implicitly named self. - # TODO: Consider incorporating this into the data model - if f.func.name.name.inplace: - assert i == 0, "illegal inplace function with multiple returns" - name = 'self' - # If we are out function, the name is the name of the - # corresponding output function (r.name will get recorded - # in field_name later.) - elif f.func.is_out_fn(): - name = f.func.arguments.out[i].name - # If the return argument is explicitly named... - elif r.name: - name_conflict = any(r.name == a.name for a in f.func.schema_order_arguments()) - if name_conflict and not f.func.is_out_fn(): - name = f'{r.name}_return' - else: - name = r.name - # If there is no explicit name and no fallback name was passed in, we just name the output result, - # unless it's a multi-return, in which case it's result0, - # result1, etc (zero-indexed) - else: - name = fallback_name if len(f.func.returns) == 1 else f'{fallback_name}{i}' - returns.append(name) - return returns - -JIT_TO_CPP_DEFAULT = { - 'False': 'false', - 'True': 'true', - 'None': 'c10::nullopt', # UGH this one is type directed - 'Mean': 'at::Reduction::Mean', - '[]': '{}', - 'contiguous_format': 'MemoryFormat::Contiguous', - 'long': 'at::kLong', -} - -# Convert a JIT default into C++ expression representing the default -def default_expr(d: str, t: Type) -> str: - if d == 'None' and str(t) == 'Tensor?': - return '{}' - if isinstance(t, BaseType) and t.name is BaseTy.str: - # Schema allows single quotes but C++ needs double - if len(d) >= 2 and d[0] == "'" and d[-1] == "'": - s = '' - i = 1 - while i + 1 < len(d): - if d[i] != '\\': - if d[i] == '"': - s += '\\"' - else: - s += d[i] - i += 1 - else: - if d[i + 1] == "'": - s += "'" - else: - s += d[i:i + 2] - i += 2 - - return f'"{s}"' - - if isinstance(t, OptionalType): - if d == 'None': - return 'c10::nullopt' - - return default_expr(d, t.elem) - - if isinstance(t, ListType): - if (d.startswith('[') and d.endswith(']')): - return '{' + d[1:-1] + '}' - elif t.size is None: - # NOTE: Sized lists can have scalar defaults - raise ValueError(f"Expected a list default '[...]' but found: '{d}'") - - return JIT_TO_CPP_DEFAULT.get(d, d) - -# Convert an argument into its C++ API form - -def argument( - a: Union[Argument, TensorOptionsArguments, SelfArgument], - *, cpp_no_default_args: Set[str], method: bool, faithful: bool, - has_tensor_options: bool -) -> List[Binding]: - def sub_argument(a: Union[Argument, TensorOptionsArguments, SelfArgument]) -> List[Binding]: - return argument( - a, cpp_no_default_args=cpp_no_default_args, method=method, 
faithful=faithful, - has_tensor_options=has_tensor_options) - - if isinstance(a, Argument): - binds: ArgName - if a.name == "memory_format" and has_tensor_options: - binds = SpecialArgName.possibly_redundant_memory_format - else: - binds = a.name - default: Optional[str] = None - if a.name not in cpp_no_default_args and a.default is not None: - default = default_expr(a.default, a.type) - return [Binding( - nctype=argument_type(a, binds=binds), - name=a.name, - default=default, - argument=a, - )] - elif isinstance(a, TensorOptionsArguments): - if faithful: - return sub_argument(a.dtype) + sub_argument(a.layout) + \ - sub_argument(a.device) + sub_argument(a.pin_memory) - else: - default = None - # Enforced by NativeFunction.__post_init__ - assert 'options' not in cpp_no_default_args - if all(x.default == "None" for x in a.all()): - default = '{}' - elif a.dtype.default == "long": - default = 'at::kLong' # TODO: this is wrong - return [Binding( - nctype=NamedCType('options', BaseCType(tensorOptionsT)), - name='options', - default=default, - argument=a, - )] - elif isinstance(a, SelfArgument): - if method: - # Caller is responsible for installing implicit this in context! - return [] - else: - return sub_argument(a.argument) - else: - assert_never(a) - -def arguments( - arguments: Arguments, - *, faithful: bool, method: bool, cpp_no_default_args: Set[str] -) -> List[Binding]: - args: List[Union[Argument, TensorOptionsArguments, SelfArgument]] = [] - if faithful: - args.extend(arguments.non_out) - args.extend(arguments.out) - else: - args.extend(arguments.out) - args.extend(arguments.non_out) - return [ - r.no_default() if faithful else r for a in args - for r in argument( - a, faithful=faithful, method=method, - has_tensor_options=arguments.tensor_options is not None, - cpp_no_default_args=cpp_no_default_args) - ] diff --git a/tools/codegen/api/dispatcher.py b/tools/codegen/api/dispatcher.py deleted file mode 100644 index 6738fbef5b49..000000000000 --- a/tools/codegen/api/dispatcher.py +++ /dev/null @@ -1,66 +0,0 @@ -from tools.codegen.model import (Argument, FunctionSchema, Return, - SelfArgument, TensorOptionsArguments, Type) - -from tools.codegen.api.types import ArgName, Binding, NamedCType, CType -from tools.codegen.api import cpp -from tools.codegen.utils import concatMap, assert_never - -import itertools -from typing import Sequence, List, Union - -# This file describes the translation of JIT schema to the dispatcher -# API, the *unboxed* calling convention by which invocations through -# the dispatcher are made. Historically, the dispatcher API matched -# the C++ API, but with the establishment of the boxed API, we've -# made changes to the dispatcher API to so that the unboxed API -# better aligns with the boxed API. The dispatcher API hooks heavily -# into our template based boxing/unboxing machinery, so changes -# to this convention will usually need template updates too. -# -# Prominent characteristics of the dispatcher API: -# -# - dtype, layout, device and pin_memory are represented as separate -# arguments. -# - -def name(func: FunctionSchema) -> str: - return cpp.name(func) - -def argumenttype_type(t: Type, *, mutable: bool, binds: ArgName, remove_non_owning_ref_types: bool = False) -> NamedCType: - # This is a faux amis. If it makes sense in the future to add - # more special cases here, or invert things so cpp.argument_type - # calls this, or just completely inline the function, please do - # it. 
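Stepping back to the cpp.py hunk above: `default_expr` maps JIT schema defaults to C++ literal expressions via `JIT_TO_CPP_DEFAULT` plus a few type-directed cases. The reduced sketch below keeps only the table-driven part (it skips the string re-quoting and list expansion the full function performs); the sample defaults fed to it are hypothetical.

```python
# Reduced sketch of the table-driven part of cpp.default_expr; the real
# function also re-quotes string defaults and expands list defaults.
JIT_TO_CPP_DEFAULT_SKETCH = {
    "False": "false",
    "True": "true",
    "None": "c10::nullopt",
    "Mean": "at::Reduction::Mean",
    "[]": "{}",
    "contiguous_format": "MemoryFormat::Contiguous",
    "long": "at::kLong",
}

def default_expr_sketch(jit_default: str) -> str:
    # Fall through to the literal itself (e.g. "1", "1e-5") when untabled.
    return JIT_TO_CPP_DEFAULT_SKETCH.get(jit_default, jit_default)

for d in ("None", "[]", "1e-5"):  # hypothetical schema defaults
    print(d, "->", default_expr_sketch(d))
# None -> c10::nullopt
# [] -> {}
# 1e-5 -> 1e-5
```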
- return cpp.argumenttype_type(t, mutable=mutable, binds=binds, remove_non_owning_ref_types=remove_non_owning_ref_types) - -def argument_type(a: Argument, *, binds: ArgName, remove_non_owning_ref_types: bool = False) -> NamedCType: - return argumenttype_type(a.type, mutable=a.is_write, binds=binds, remove_non_owning_ref_types=remove_non_owning_ref_types) - -def returns_type(rs: Sequence[Return]) -> CType: - # At present, there is no difference. But there could be! - return cpp.returns_type(rs) - -def jit_arguments(func: FunctionSchema) -> List[Argument]: - def to_argument(a: Union[Argument, TensorOptionsArguments, SelfArgument]) -> List[Argument]: - if isinstance(a, Argument): - return [a] - elif isinstance(a, SelfArgument): - return [a.argument] - elif isinstance(a, TensorOptionsArguments): - return [a.dtype, a.layout, a.device, a.pin_memory] - else: - assert_never(a) - return list(concatMap(to_argument, itertools.chain( - func.arguments.positional, - func.arguments.kwarg_only, - func.arguments.out))) - -def argument(a: Argument, *, remove_non_owning_ref_types: bool = False) -> Binding: - return Binding( - nctype=argument_type(a, binds=a.name, remove_non_owning_ref_types=remove_non_owning_ref_types), - name=a.name, - argument=a - ) - -def arguments(func: FunctionSchema) -> List[Binding]: - return [argument(a) for a in jit_arguments(func)] diff --git a/tools/codegen/api/functionalization.py b/tools/codegen/api/functionalization.py deleted file mode 100644 index ebd30ab94c9d..000000000000 --- a/tools/codegen/api/functionalization.py +++ /dev/null @@ -1,114 +0,0 @@ -from tools.codegen.model import ( - FunctionSchema, BaseTy, BaseType, NativeFunction, Argument, Tag, -) -from tools.codegen.api.types import ( - Binding, NamedCType, ConstRefCType, BaseCType, CType, tensorT, longT -) -from tools.codegen.api import dispatcher -from typing import List, Optional - - -# This file describes the translation of JIT schema to API's used -# when creating view lambdas that are used by the functionalization pass. -# There are two types of lambdas: forward lambdas and reverse lambdas. -# These API's mostly follow the dispatcher API, with a few quirks: -# - The lambda capture has to convert reference types to value types -# - While the forward lambda just directly calls into the at::_ops API -# (following the dispatcher convention), the logic here for the reverse lambda -# is responsible for generating both the call-site, and the declarations -# (which are implemented manually in the at::functionalization::impl namespace). - -# The lambdas generated for each view op in the functionalization pass are of the form -# [capture_arguments](outer_arguments) -> returns_type { -# return name(inner_arguments); -# } - -# Define some specific lambda input arguments. -base_binding = Binding( - name='base', - nctype=NamedCType(name='base', type=ConstRefCType(BaseCType(tensorT))), - argument=Argument(name='base', type=BaseType(BaseTy.Tensor), default=None, annotation=None), - default=None) -mutated_view_binding = Binding( - name='mutated_view', - nctype=NamedCType(name='mutated_view', type=ConstRefCType(BaseCType(tensorT))), - argument=Argument(name='base', type=BaseType(BaseTy.Tensor), default=None, annotation=None), - default=None) -mutated_view_idx_binding = Binding( - name='mutated_view_idx', - nctype=NamedCType(name='mutated_view_idx', type=BaseCType(longT)), - argument=Argument(name='base', type=BaseType(BaseTy.Tensor), default=None, annotation=None), - default=None) - -# The lambda capture itself doesn't have a name. 
-# The name returned here corresponds to the name of the inner function called by the lambda. -def name(f: NativeFunction, *, functional_op: NativeFunction, is_reverse: bool, include_namespace: bool) -> str: - # For inplace_view ops, the lambda calls out to the corresponding functional view op - fn = functional_op if f.tag is Tag.inplace_view else f - name = fn.func.name.unambiguous_name() - if is_reverse: - # in the reverse case, we codegen both the call-sites (which need the full namespace) and the declarations (which don't) - if include_namespace: - return f'at::functionalization::FunctionalInverses::{name}_inverse' - else: - return f'{name}_inverse' - # in the forward case, we just diretly call into the at::_ops API (so we always need the namespace) - assert include_namespace - return f'at::_ops::{name}::call' - - -def capture_arguments(func: FunctionSchema, *, is_reverse: bool) -> List[Binding]: - # capture arguments include all arguments except `self`. - # Importantly, they don't include any C++ reference types (or else we'll get a dangling reference in the capture), - # So any reference types (IntArrayRef) need to be converted to value types (vector) - args = func.arguments.flat_all - assert args[0].type == BaseType(BaseTy.Tensor) - non_self_args = args[1:] - non_self_value_bindings = [dispatcher.argument(a, remove_non_owning_ref_types=True) for a in non_self_args] - return non_self_value_bindings - - -def returns_type(func: FunctionSchema) -> CType: - # Assertion: all view ops return tensor-like outputs - assert len(func.returns) >= 1 - for ret in func.returns: - assert ret.type.is_tensor_like() - # However, the return type of the lambda is always an individual tensor. - # For multi-tensor outputs, each tensor needs to be tracked individually. - return BaseCType(tensorT) - - -def outer_arguments(*, is_reverse: bool) -> List[Binding]: - if is_reverse: - return [base_binding, mutated_view_binding, mutated_view_idx_binding] - else: - return [base_binding, mutated_view_idx_binding] - - -def inner_call_index(func: FunctionSchema) -> Optional[Binding]: - # For view ops that return multiple tensors (like `split`), we generate a separate lambda for each output. - # When we replay a view op that returns multiple tensors, we need to index into the output appropriately - if len(func.returns) > 1 or (len(func.returns) == 1 and func.returns[0].type.is_list_like()): - return mutated_view_idx_binding - return None - - -def inner_arguments(func: FunctionSchema, is_reverse: bool) -> List[Binding]: - args = func.arguments.flat_all - assert args[0].type == BaseType(BaseTy.Tensor) - non_self_args = args[1:] - # The forward lambda calls the at::_ops API, while the reverse lambda calls the view inverse API. - # Both of these follow the dispatcher API. - non_self_bindings = [dispatcher.argument(a) for a in non_self_args] - if not is_reverse: - # the forward lambda swaps out the original tensor argument with the lambd arg "base" - return [base_binding] + non_self_bindings - else: - # the reverse lambda does the same, but with an additional "mutated_view" arg - # additionally, we have a calling convention: for view ops that return multiple tensor outputs - # their corresponding view_inverse function takes in an additional index argument. 
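A minimal, self-contained sketch of the argument-ordering convention the functionalization view lambdas described above follow. It does not use the real Binding/FunctionSchema classes; plain strings stand in for bindings, and the helper names (assemble_lambda_args, returns_multiple_tensors) are hypothetical, chosen only for illustration.

from typing import List

def assemble_lambda_args(non_self_args: List[str],
                         is_reverse: bool,
                         returns_multiple_tensors: bool) -> List[str]:
    if not is_reverse:
        # Forward lambda: the original `self` tensor is swapped for `base`.
        return ['base'] + non_self_args
    args = ['base', 'mutated_view']
    if returns_multiple_tensors:
        # Multi-output view ops (e.g. split-like ops) also take an index arg.
        args.append('mutated_view_idx')
    return args + non_self_args

# e.g. a hypothetical split-like op whose non-self args are (split_size, dim):
print(assemble_lambda_args(['split_size', 'dim'], is_reverse=False, returns_multiple_tensors=True))
# ['base', 'split_size', 'dim']
print(assemble_lambda_args(['split_size', 'dim'], is_reverse=True, returns_multiple_tensors=True))
# ['base', 'mutated_view', 'mutated_view_idx', 'split_size', 'dim']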
- index_binding = inner_call_index(func) - if index_binding is not None: - return [base_binding, mutated_view_binding, index_binding] + non_self_bindings - else: - return [base_binding, mutated_view_binding] + non_self_bindings diff --git a/tools/codegen/api/lazy.py b/tools/codegen/api/lazy.py deleted file mode 100644 index 3fe83936eef8..000000000000 --- a/tools/codegen/api/lazy.py +++ /dev/null @@ -1,172 +0,0 @@ -from typing import List, Union, Tuple -from tools.codegen.model import (Type, BaseTy, BaseType, OptionalType, - ListType, OperatorName, FunctionSchema, - Return) -from tools.codegen.api.types import (BaseCppType, BaseCType, OptionalCType, - ConstRefCType, NamedCType, - MutRefCType, - VectorCType, boolT, longT, doubleT, ListCType, stringT, - scalarT, scalarTypeT, ArrayRefCType, ArrayCType, TupleCType) - -valueT = BaseCppType('torch::lazy', 'Value') - - -def process_ir_type(typ: Type) -> Union[BaseCType, VectorCType, OptionalCType, ListCType]: - """ - This function takes a type from NativeFunctions and converts it for use with - lazy tensor codegen. Currently its output is used in several places, and so far - it has been possible for them to all use the same conversions, but that may not be - optimal or possible in the finished system. - - Type conversion for lazy currently consists of - (1) changing Tensor-like things into Value-like things - (2) wrapping everything in a BaseCType - (3) making reference types into values (e.g. vector instead of IntArrayRef) - - (1) converts Tensors to Values since Values are how Lazy IR represents tensors. There - is special handling for Optional[Tensor] or List[Tensor], etc- hence 'tensor-like' - - This is incomplete- there are assertions in places that it's expected to need to add - more types as the codegen is used with more operators. - """ - if isinstance(typ, BaseType): - if typ.name == BaseTy.Tensor: - return BaseCType(valueT) - elif typ.name == BaseTy.Scalar: - return BaseCType(scalarT) - elif typ.name == BaseTy.ScalarType: - return BaseCType(scalarTypeT) - elif typ.name == BaseTy.int: - return BaseCType(longT) - elif typ.name == BaseTy.bool: - return BaseCType(boolT) - elif typ.name == BaseTy.float: - return BaseCType(doubleT) - elif typ.name == BaseTy.str: - return BaseCType(stringT) - else: - raise AssertionError(f"TODO add support for type {repr(typ)}") - elif isinstance(typ, OptionalType): - return OptionalCType(process_ir_type(typ.elem)) - elif isinstance(typ, ListType): - if str(typ.elem) == 'Tensor?': - # TODO(whc) is this actually correct? or should it use a Vector like above - return ListCType(OptionalCType(BaseCType(valueT))) - else: - return VectorCType(process_ir_type(typ.elem)) - else: - raise AssertionError(f"unrecognized type {repr(typ)}") - - -def isValueType(typ: Union[Type, BaseCType, OptionalCType, ConstRefCType, MutRefCType, - ListCType, ArrayRefCType, ArrayCType, VectorCType, TupleCType]) -> bool: - """ - Given a type, determine if it is a Value-like type. This is equivalent to - being Tensor-like, but assumes the type has already been transformed. - """ - if isinstance(typ, BaseCType): - return typ.type == valueT - elif isinstance(typ, (OptionalCType, ListCType, VectorCType)): - return isValueType(typ.elem) - else: - return False - -# Inspired by a FunctionSchema object, a LazyIrSchema holds the schema of a Lazy IR node. 
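A rough stand-alone sketch of the type translation that process_ir_type performs for lazy tensor codegen: tensor-like types become torch::lazy::Value, and reference/list types become owning values. It works on schema type strings rather than the real Type/CType objects, glosses over special cases such as Tensor?[], and the helper name lazy_type_str is an invented illustration, not part of the codegen API.

BASE_MAP = {
    'Tensor': 'torch::lazy::Value',   # Values are how Lazy IR represents tensors
    'Scalar': 'at::Scalar',
    'ScalarType': 'at::ScalarType',
    'int': 'int64_t',
    'bool': 'bool',
    'float': 'double',
    'str': 'c10::string_view',
}

def lazy_type_str(schema_type: str) -> str:
    if schema_type.endswith('?'):
        # Optionals wrap the translated element type.
        return f'c10::optional<{lazy_type_str(schema_type[:-1])}>'
    if schema_type.endswith('[]'):
        # Reference list types (e.g. IntArrayRef) become owning vectors.
        return f'std::vector<{lazy_type_str(schema_type[:-2])}>'
    return BASE_MAP[schema_type]

print(lazy_type_str('Tensor'))   # torch::lazy::Value
print(lazy_type_str('int[]'))    # std::vector<int64_t>
print(lazy_type_str('Scalar?'))  # c10::optional<at::Scalar>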
-# Unlike a FunctionSchema, it has no round-trippable string form (relating to the YAML), -# but carries type information from a native FunctionSchema modified for use with IR nodes, -# and preserving original argument names. - - -class LazyIrSchema: - # The name of the operator this function schema describes. - name: 'OperatorName' - - positional_arg_types: Tuple[NamedCType, ...] - keyword_arg_types: Tuple[NamedCType, ...] - - # TODO: Need to handle collisions with argument names at some point - returns: Tuple['Return', ...] - - def __init__(self, func: FunctionSchema): - - positional_arg_types = [] - for arg_field in ["pre_self_positional", - "self_arg", - "post_self_positional"]: - if arg_field == "self_arg" and func.arguments.self_arg is not None: - arg = getattr(func.arguments, "self_arg").argument - positional_arg_types.append(NamedCType(arg.name, process_ir_type(arg.type))) - elif getattr(func.arguments, arg_field) is not None: - positional_arg_types.extend([ - NamedCType( - arg.name, - process_ir_type(arg.type)) for arg in getattr(func.arguments, arg_field)]) - self.positional_arg_types = tuple(positional_arg_types) - - keyword_arg_types = [] - for arg_field in ["pre_tensor_options_kwarg_only", - "tensor_options", - "post_tensor_options_kwarg_only", - "out"]: - if getattr(func.arguments, arg_field) is not None: - keyword_arg_types.extend([ - NamedCType( - arg.name, - process_ir_type(arg.type)) for arg in getattr(func.arguments, arg_field)]) - self.keyword_arg_types = tuple(keyword_arg_types) - self.name = func.name - self.returns = func.returns - - @property - def node_name(self) -> str: - """ - Return camel-case version of op in node. - - Note: This function also appends any `overload_name` in the operation. - For example, if the op is `bitwise_and.Tensor`, the returned name - will be `BitwiseAndTensor`. 
- """ - op_name = f"{self.name.name}_{self.name.overload_name}".lower() - return "".join(word.capitalize() or "" for word in op_name.split("_")) - - @property - def aten_name(self) -> str: - return f"{self.name.name}" - - @property - def base_name(self) -> str: - return f"{self.name.name.base}" - - def filtered_types(self, positional: bool = True, keyword: bool = True, - values: bool = True, scalars: bool = True) -> List[NamedCType]: - types: List[NamedCType] = [] - if positional: - types.extend(self.positional_arg_types) - if keyword: - types.extend(self.keyword_arg_types) - - if values and scalars: - return types - - if values: - return [t for t in types if isValueType(t.type)] - elif scalars: - return [t for t in types if not isValueType(t.type)] - - return [] - - @property - def positional_values(self) -> List[NamedCType]: - return self.filtered_types(positional=True, keyword=False, values=True, scalars=False) - - @property - def positional_scalars(self) -> List[NamedCType]: - return self.filtered_types(positional=True, keyword=False, values=False, scalars=True) - - @property - def keyword_values(self) -> List[NamedCType]: - return self.filtered_types(positional=False, keyword=True, values=True, scalars=False) - - @property - def keyword_scalars(self) -> List[NamedCType]: - return self.filtered_types(positional=False, keyword=True, values=False, scalars=True) diff --git a/tools/codegen/api/native.py b/tools/codegen/api/native.py deleted file mode 100644 index d072f20d4270..000000000000 --- a/tools/codegen/api/native.py +++ /dev/null @@ -1,111 +0,0 @@ -from tools.codegen.model import (Argument, FunctionSchema, Return, - SelfArgument, TensorOptionsArguments, Type) - -from tools.codegen.api.types import (ArgName, BaseCType, Binding, - ConstRefCType, NamedCType, CType, MutRefCType, ListCType, - OptionalCType, tensorT, scalarT, layoutT, - deviceT, boolT, scalarTypeT) -from tools.codegen.api import cpp -from tools.codegen import local -from tools.codegen.utils import assert_never - -from typing import Union, Sequence, List, Optional - -# This file describes the translation of JIT schema to the native functions API. -# This looks a lot like the C++ API (which makes historical sense, because the -# idea was you wrote native functions to implement functions in the C++ API), -# but over time we have evolved the C++ API without actually changing our -# native:: kernels. The intention is to make native API and dispatcher API -# line up as closely as possible, since this results in the least overhead -# (no translation is needed from dispatcher API to native API). - -def name(func: FunctionSchema) -> str: - name = str(func.name.name) - # TODO: delete this! 
- if func.is_out_fn(): - name += '_out' - if func.name.overload_name: - name += f'_{func.name.overload_name}' - return name - -def argumenttype_type(t: Type, *, mutable: bool, binds: ArgName) -> NamedCType: - if str(t) == 'Tensor?': - tensor_type: OptionalCType = OptionalCType(BaseCType(tensorT)) - if mutable and not local.use_const_ref_for_mutable_tensors(): - return NamedCType(binds, MutRefCType(tensor_type)) - else: - return NamedCType(binds, ConstRefCType(tensor_type)) - elif str(t) == 'Tensor?[]': - return NamedCType(binds, ConstRefCType(ListCType(OptionalCType(BaseCType(tensorT))))) - elif str(t) == 'Scalar': - return NamedCType(binds, ConstRefCType(BaseCType(scalarT))) - elif str(t) == 'Scalar?': - return NamedCType(binds, ConstRefCType(OptionalCType(BaseCType(scalarT)))) - return cpp.argumenttype_type(t, mutable=mutable, binds=binds) - -def returns_type(rs: Sequence[Return]) -> CType: - return cpp.returns_type(rs) - -def argument_type(a: Argument, *, binds: ArgName) -> NamedCType: - return argumenttype_type(a.type, mutable=a.is_write, binds=binds) - -def argument(a: Union[Argument, SelfArgument, TensorOptionsArguments], *, is_out: bool) -> List[Binding]: - # Ideally, we NEVER default native functions. However, there are a number - # of functions that call native:: directly and rely on the defaulting - # existing. So for BC, we generate defaults for non-out variants (but not - # for out variants, where it is impossible to generate an appropriate - # default) - should_default = not is_out - if isinstance(a, Argument): - default: Optional[str] = None - if should_default and a.default is not None: - default = cpp.default_expr(a.default, a.type) - return [Binding( - nctype=argument_type(a, binds=a.name), - name=a.name, - default=default, - argument=a, - )] - elif isinstance(a, SelfArgument): - # Erase SelfArgument from the distinction - return argument(a.argument, is_out=is_out) - elif isinstance(a, TensorOptionsArguments): - default = None - if should_default: - default = '{}' - # TODO: Not sure why the arguments assigned here are for - # TensorOptionsArguments and not the constituent pieces. 
It seems - # to matter - return [ - Binding( - nctype=NamedCType('dtype', OptionalCType(BaseCType(scalarTypeT))), - name='dtype', - default=default, - argument=a, - ), - Binding( - nctype=NamedCType('layout', OptionalCType(BaseCType(layoutT))), - name='layout', - default=default, - argument=a, - ), - Binding( - nctype=NamedCType('device', OptionalCType(BaseCType(deviceT))), - name='device', - default=default, - argument=a, - ), - Binding( - nctype=NamedCType('pin_memory', OptionalCType(BaseCType(boolT))), - name='pin_memory', - default=default, - argument=a, - )] - else: - assert_never(a) - -def arguments(func: FunctionSchema) -> List[Binding]: - args: List[Union[Argument, TensorOptionsArguments, SelfArgument]] = [] - args.extend(func.arguments.non_out) - args.extend(func.arguments.out) - return [r for arg in args for r in argument(arg, is_out=func.is_out_fn())] diff --git a/tools/codegen/api/python.py b/tools/codegen/api/python.py deleted file mode 100644 index 6c362cb87387..000000000000 --- a/tools/codegen/api/python.py +++ /dev/null @@ -1,1205 +0,0 @@ -from dataclasses import dataclass -from typing import Optional, Union, Sequence, Set, List, Dict, Tuple - -from tools.codegen.api.types import Binding, CppSignature, CppSignatureGroup -from tools.codegen.api import cpp -from tools.codegen.gen import pythonify_default -from tools.codegen.model import (Argument, BaseTy, BaseType, ListType, - NativeFunction, OptionalType, Return, Type, - Variant) - -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# -# Data Models -# -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# -# [Notes] python binding codegen -# -# The Python binding codegen produces code that takes the input list of -# PyObjects, finds the matching ATen C++ function using PythonArgParser, -# converts the PyObjects into C++ types and calls the ATen C++ function: -# -# +--------+ parsing +------------------------+ binding +-----------------------+ -# | PyObjs | ---------> | PythonArgParser Output | ---------> | Cpp Function Dispatch | -# +--------+ +------------------------+ +-----------------------+ -# -# The following examples demonstrate the data models the Python binding -# codegen needs to deal with and the tasks it needs to accomplish. It -# helps understand the purpose of the new data types we introduced below. -# -# - Function Schema (source of truth) -# -# aten::empty.names(int[] size, *, Dimname[]? names, -# ScalarType? dtype=None, Layout? layout=None, -# Device? device=None, bool? pin_memory=None, -# MemoryFormat? memory_format=None) -> Tensor -# -# - Python Signature -# -# It's used to generate input schema string for PythonArgParser. -# Note: TensorOptions fields are reordered and the additional -# 'requires_grad' field is added: -# -# empty(IntArrayRef size, *, DimnameList? names, -# MemoryFormat? memory_format=None, ScalarType dtype=None, -# Layout layout=torch.strided, Device device=None, -# bool pin_memory=False, bool requires_grad=False) -# -# - C++ Signature -# -# It's used to generate C++ lambda formals & dispatch call. -# Note: the scattered TensorOptions fields are packed into 'options'. 
-# -# auto dispatch_empty = -# [](IntArrayRef size, c10::optional names, -# const TensorOptions & options, -# c10::optional memory_format) -> Tensor { -# pybind11::gil_scoped_release no_gil; -# return torch::empty(size, names, options, memory_format); -# }; -# -# - Binding between Python Arguments and C++ Arguments -# -# Given a set of Python Arguments in scope, we need produce the -# binding expressions that translate the Python API into C++ API: -# -# Python Args Cpp Args Binding Exprs -# ----------------------------------------------------------------- -# 0: size size '_r.intlist(0)' -# 1: names names 'names' [special init] -# 2: memory_format -------+ -# 3: dtype -----+-|--> options 'options' [special packing] -# 4: layout / | -# 5: device / +--> memory_format '_r.memoryformatOptional(2)' -# 6: pin_memory / -# 7: requires_grad -+ -# -# So the full dispatch expression would look like: -# -# dispatch_empty(_r.intlist(0), names, options, -# _r.memoryformatOptional(2)) -# -# Where does 'names' come from? It involves special local init: -# -# auto __names = _r.toDimnameListOptional(1); -# c10::optional names = -# __names ? c10::make_optional(DimnameList(__names.value())) -# : c10::nullopt; -# -# Where does 'options' come from? It involves special local init -# for TensorOptions. Note that Python side has the additional -# 'requires_grad' field: -# -# const auto options = TensorOptions() -# .dtype(_r.scalartype(3)) -# .device(_r.device(5)) -# .layout(_r.layoutOptional(4)) -# .requires_grad(_r.toBool(7)) -# .pinned_memory(_r.toBool(6)); -# -# In some other cases one Python Argument can map to multiple C++ -# Arguments. For example: -# -# aten::max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -# -> (Tensor values, Tensor indices) -# -# Python Args Cpp Args Binding Exprs -# --------------------------------------------------------------------- -# +----> max 'out[0]' -# /-----> max_values 'out[1] -# 0: input / self '_r.tensor(0)' -# 1: dim / dim '_r.dimname(1)' -# 2: keepdim / keepdim '_r.toBool(2)' -# 3: out -----+ [local init] out '_r.tensorlist_n<2>(3)' -# -# As demonstrated above, the binding can involve reordering, -# packing, unpacking and special local inits. -# -# -# Let's look at a concrete example: -# -# static PythonArgParser parser({ -# "abs(Tensor input, *, Tensor out=None)", -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# ^ -# +--- Python Schema, represented by PythonSignature and PythonArgument -# -# }, /*traceable=*/true); -# -# ParsedArgs<2> parsed_args; -# auto _r = parser.parse(nullptr, args, kwargs, parsed_args); -# -# ... 
-# -# if (_r.isNone(1)) { -# ~~~~~~~~~~~~ <--- Scattered PythonArgParser output (arg name = 'out') -# represented by PythonArgParserOutputExpr -# -# // aten::abs(Tensor self) -> Tensor -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# ^ -# +--- NativeFunction schema, base version -# -# auto dispatch_abs = [](const Tensor & self) -> Tensor { -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# ^ -# +--- dispatch_lambda_args / dispatch_lambda_return_str -# generated from NativeFunction / CppSignature -# (deprecated PythonSignature is special) -# arguments are represented by DispatchLambdaArgument -# -# pybind11::gil_scoped_release no_gil; -# return self.abs(); -# ~~~~~~~~~~~ <--- cpp_dispatch_target / cpp_dispatch_exprs -# generated from NativeFunction / CppSignature -# }; -# return wrap(dispatch_abs(_r.tensor(0))); -# ~~~~~~~~~~~~~ -# ^ -# +--- dispatch_lambda_exprs -# binding PythonArgParserOutputExpr (python args) -# and DispatchLambdaArgument (c++ args) -# -# } else { -# // aten::abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# ^ -# +--- NativeFunction schema, out-variant -# -# auto dispatch_abs_out = [](Tensor out, const Tensor & self) -> Tensor { -# pybind11::gil_scoped_release no_gil; -# return at::abs_out(out, self); -# }; -# return wrap(dispatch_abs_out(_r.tensor(1), _r.tensor(0))); -# } -# -# -# [Notes] python interface codegen -# The python dataclasses below are used used to generate both python binding code -# and pyi type hint signatures. -# In theory these two should look very similar, but there are number of differences -# in how pyi signatures vs. python_arg_parser signatures are generated. -# These differences have been encapsulated in signature_str() vs. signature_str_pyi() -# to display the full signatures, and argument_str() vs argument_str_pyi() to display arguments. -# For examples, only pyi signatures include return types. - -@dataclass(frozen=True) -class PythonReturns: - returns: Tuple[Return, ...] - - def named_tuple_pyi(self) -> Optional[Tuple[str, str]]: - python_returns = [argument_type_str_pyi(r.type) for r in self.returns] - field_names = namedtuple_fieldnames(self.returns) - if field_names: - namedtuple_name = '_'.join(['namedtuple'] + field_names) - tuple_args = [f'("{name}", {typ})' for name, typ in zip(field_names, python_returns)] - namedtuple_def = f'NamedTuple("{namedtuple_name}", [{", ".join(tuple_args)}])' - return namedtuple_name, namedtuple_def - return None - - def returns_str_pyi(self) -> str: - named_tuple = self.named_tuple_pyi() - if named_tuple is not None: - namedtuple_name, _ = named_tuple - return namedtuple_name - - python_returns = [argument_type_str_pyi(r.type) for r in self.returns] - if len(python_returns) > 1: - return 'Tuple[' + ', '.join(python_returns) + ']' - if len(python_returns) == 1: - return python_returns[0] - return 'None' - - -@dataclass(frozen=True) -class PythonArgument: - name: str - type: Type - default: Optional[str] - - # Used to generate the default init expr for some PythonArgParser outputs, e.g.: - # - # _r.layoutWithDefault(3, layout_from_backend(self.options().backend()))) - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # ^ - # +--- default_init str - default_init: Optional[str] - - # Compute argument formal for python argument parsing. - # Needs to be consistent with torch/csrc/utils/python_arg_parser.h. 
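A hedged sketch of the pyi return-type rendering that PythonReturns performs: multi-output ops with fully named returns become a "namedtuple_<fields>" type, otherwise a Tuple[...] or a single type is emitted. It uses plain (name, type) pairs instead of the real Return objects, and pyi_return_str is an illustrative stand-in, not the actual method.

from typing import List, Optional, Tuple

def pyi_return_str(returns: List[Tuple[Optional[str], str]]) -> str:
    """returns is a list of (field_name_or_None, pyi_type_string) pairs."""
    names = [n for n, _ in returns if n is not None]
    if len(returns) > 1 and len(names) == len(returns):
        # All fields named: render as a named tuple (e.g. torch.max(dim=...)).
        return '_'.join(['namedtuple'] + names)
    types = [t for _, t in returns]
    if len(types) > 1:
        return 'Tuple[' + ', '.join(types) + ']'
    return types[0] if types else 'None'

print(pyi_return_str([('values', 'Tensor'), ('indices', 'Tensor')]))  # namedtuple_values_indices
print(pyi_return_str([(None, 'Tensor')]))                             # Tensor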
- def argument_str(self, *, method: bool = False) -> str: - type_str = argument_type_str(self.type).replace('const ', '').replace(' &', '') - - name = self.name - # s/self/input/ outside method bindings - # [old codegen] TODO: remove this? doesn't rename in codegen, it's just - # for the parse string - if name == 'self' and type_str == 'Tensor' and not method: - name = 'input' - - # add default - if self.default is not None: - default = { - 'nullptr': 'None', - 'c10::nullopt': 'None', - '{}': 'None', - }.get(self.default, self.default) - return f'{type_str} {name}={default}' - else: - return f'{type_str} {name}' - - def argument_str_pyi(self, *, method: bool = False, deprecated: bool = False) -> str: - type_str = argument_type_str_pyi(self.type) - - name = self.name - # s/self/input/ outside method bindings - # [old codegen] TODO: remove this? doesn't rename in codegen, it's just - # for the parse string - if name == 'self' and type_str == 'Tensor' and not method and not deprecated: - name = 'input' - - if name == 'from': # from is a Python keyword... - name += '_' - - # pyi merges the _out and functional variants into the same signature, with an optional out arg - if name == 'out' and type_str == 'Tensor' and not deprecated: - type_str = 'Optional[' + type_str + ']' - - # pyi deprecated signatures don't get defaults for their out arg - treat_as_no_default = deprecated and isinstance(self, PythonOutArgument) and self.default == 'None' - - # add default - if self.default is not None and not treat_as_no_default: - if isinstance(self.type, ListType) and self.type.elem == BaseType(BaseTy.int) and \ - self.default.startswith('{') and self.default.endswith('}'): - default = '(' + self.default[1:-1] + ')' - else: - default = { - 'nullptr': 'None', - 'c10::nullopt': 'None', - '{}': 'None', - 'MemoryFormat::Contiguous': 'contiguous_format', - 'QScheme::PER_TENSOR_AFFINE': 'per_tensor_affine', - }.get(self.default, self.default) - return f'{name}: {type_str}={default}' - else: - return f'{name}: {type_str}' - -@dataclass(frozen=True) -class PythonOutArgument(PythonArgument): - # In Python signature multiple output fields are packed into one 'out' argument. - # When binding to C++, it's first binded to a local 'out' variable: - # 'auto out = _r.tensorlist_n<2>(2);', - # then binded to scattered C++ output arguments as 'out[0]', 'out[1]', and etc. - # TODO: maybe don't need keep scattered out fields for python signature? - outputs: Tuple[PythonArgument, ...] - - @staticmethod - def from_outputs(outputs: Tuple[PythonArgument, ...]) -> Optional['PythonOutArgument']: - if not outputs: - return None - - size = len(outputs) - if size == 1: - return PythonOutArgument( - name=outputs[0].name, - type=outputs[0].type, - default='None', - default_init=None, - outputs=outputs, - ) - elif size > 1: - if any(map(lambda a: not a.type.is_tensor_like(), outputs)): - raise RuntimeError(f'Unsupported output type: {outputs}') - return PythonOutArgument( - name='out', - # TODO: shouldn't this be OptionalType[ListType[...]], since it defaults to None? - type=ListType(BaseType(BaseTy.Tensor), size), - default='None', - default_init=None, - outputs=outputs, - ) - raise AssertionError(r'Unexpected PythonOutArgument size') - -@dataclass(frozen=True) -class PythonSignature: - # Base operator name, without inplace/outplace suffix. - name: str - - # Positional arguments. - # TODO: create a dedicated SelfArgument type for 'self'? - input_args: Tuple[PythonArgument, ...] 
- - # Keyword arguments excluding the 'out' argument and scattered kwargs belonging - # to TensorOptions (dtype, layout, device, pin_memory, requires_grad, etc). - input_kwargs: Tuple[PythonArgument, ...] - - output_args: Optional[PythonOutArgument] - - # Return types, which are only used by pyi - returns: PythonReturns - - # These are scattered kwargs arguments belonging to TensorOptions. - # When binding to C++, they are packed into a TensorOptions object 'options'. - # It's possible that the C++ signature doesn't take TensorOptions object (e.g. - # for out variant), in which case they will be used as scattered fields without - # being packed into 'options'. - # TODO: maybe create a PythonTensorOptionsArgument? - tensor_options_args: Tuple[PythonArgument, ...] - - # method or function signature? - method: bool - - @property - def deprecated(self) -> bool: - return False - - def arguments( - self, *, skip_outputs: bool = False, skip_tensor_options: bool = False - ) -> Tuple[Union[PythonArgument, PythonOutArgument], ...]: - result: List[Union[PythonArgument, PythonOutArgument]] = [] - result.extend(self.input_args) - result.extend(self.input_kwargs) - if self.output_args is not None and not skip_outputs: - result.append(self.output_args) - if not skip_tensor_options: - result.extend(self.tensor_options_args) - return tuple(result) - - def arguments_count(self) -> int: - return len(self.arguments()) - - def output_idx(self) -> int: - return len(self.input_args) + len(self.input_kwargs) - - # [old codegen] Compute the Python function signature for argument parsing, - # as specified in torch/csrc/utils/python_arg_parser.h. WARNING: - # this is NOT the same type signature as specified by PEP 484 - # as understood by mypy; our format was independently developed - # and has some quirks to make it more suitable specifically - # for error parsing. - # - # For a translation to mypy-valid type signatures, see - # signature_str_pyi(). - def signature_str(self, *, skip_outputs: bool = False) -> str: - args = self.arguments(skip_outputs=skip_outputs) - schema_formals: List[str] = list(map(lambda a: a.argument_str(method=self.method), args)) - positional_argc = len(self.input_args) - if len(schema_formals) > positional_argc: - schema_formals.insert(positional_argc, '*') - - return f'{self.name}({", ".join(schema_formals)})' - - def signature_str_pyi(self, *, skip_outputs: bool = False) -> str: - args = self.arguments(skip_outputs=skip_outputs) - schema_formals: List[str] = list(map(lambda a: a.argument_str_pyi(method=self.method), args)) - positional_argc = len(self.input_args) - if len(schema_formals) > positional_argc: - schema_formals.insert(positional_argc, '*') - - # only pyi signatures include returns - returns_str = self.returns.returns_str_pyi() - # pyi also includes self (with no typing/defaults) for methods - if self.method: - schema_formals.insert(0, "self") - return f'def {self.name}({", ".join(schema_formals)}) -> {returns_str}: ...' - - def signature_str_pyi_vararg(self, *, skip_outputs: bool = False) -> Optional[str]: - # only pyi uses vararg signatures - args = self.arguments(skip_outputs=skip_outputs) - schema_formals: List[str] = list(map(lambda a: a.argument_str_pyi(method=self.method), args)) - # vararg only applies to pyi signatures. 
vararg variants are not generated for all signatures - num_args = self.arguments_count() - num_positionalargs = len(self.input_args) - - have_vararg_version = False - if num_args > 0: - vararg_type = args[0].type - if isinstance(vararg_type, ListType) and str(vararg_type.elem) == 'int' and num_positionalargs == 1: - have_vararg_version = True - - if not have_vararg_version: - return None - # Below are the major changes in vararg vs. regular pyi signatures - # vararg signatures also omit the asterix - schema_formals[0] = '*' + args[0].name + ': _int' - - returns_str = self.returns.returns_str_pyi() - # pyi also includes self (with no typing/defaults) for methods - if self.method: - schema_formals.insert(0, "self") - return f'def {self.name}({", ".join(schema_formals)}) -> {returns_str}: ...' - -# The deprecated python signature involves some special logic, so create a -# dedicated data model to store these extra properties. -@dataclass(frozen=True) -class PythonSignatureDeprecated(PythonSignature): - # We need keep the order of arguments in deprecated signature. - # Particularly, method signature might have 'self' not at the beginning, e.g.: - # addmm(Scalar beta, Tensor self, Tensor mat1, Tensor mat2) - # When generating lambda function signature we need follow the exact order (even for method=True): - # [](Scalar beta, const Tensor & self, const Tensor & mat1, const Tensor & mat2) -> Tensor - deprecated_args_names: Tuple[str, ...] - - # The deprecated signature might miss some arguments that the corresponding - # C++ signature expects. We need store the constant default values to pass in. - # For example: - # [deprecate signature]: addmm(Scalar beta, Tensor self, Tensor mat1, Tensor mat2) - # [func schema]: aten::addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor - # [func call]: self.addmm(mat1, mat2, beta, 1) - # We store ['self', 'mat1', 'mat2', 'beta', '1'] in this case. - deprecated_args_exprs: Tuple[str, ...] - - @property - def deprecated(self) -> bool: - return True - - def signature_str(self, *, skip_outputs: bool = False) -> str: - return PythonSignature.signature_str(self, skip_outputs=skip_outputs) + '|deprecated' - - def signature_str_pyi(self, *, skip_outputs: bool = False) -> str: - args = self.arguments(skip_outputs=skip_outputs) - schema_formals: List[str] = list(map(lambda a: a.argument_str_pyi(method=self.method, deprecated=True), args)) - positional_argc = len(self.input_args) - if len(schema_formals) > positional_argc: - schema_formals.insert(positional_argc, '*') - - returns_str = self.returns.returns_str_pyi() - return f'def {self.name}({", ".join(schema_formals)}) -> {returns_str}: ...' - - def signature_str_pyi_vararg(self, *, skip_outputs: bool = False) -> Optional[str]: - # the codegen doesn't include vararg variants for deprecated signatures - return None - -# This struct is used to hold the PythonSignature and its corresponding -# NativeFunction BEFORE grouping base and out-variant functions. -# Why not store NativeFunction in PythonSignature or construct PythonSignature -# from NativeFunction? Because they are not 1-1 mapped. -# One native function could have both deprecated and non-deprecated python -# signatures - NativeFunction doesn't contain information to construct the -# deprecated python signature. -# One python signature is used to handle both the base and the out-variant -# function - see 'PythonSignatureGroup'. 
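A small sketch of how the python_arg_parser signature string is assembled by signature_str(): positional formals come first, and keyword-only formals are introduced by a bare '*'. The helper name render_signature is illustrative only; the formals are assumed to already be rendered strings.

from typing import List

def render_signature(name: str, positional: List[str], kwarg_only: List[str]) -> str:
    formals = positional + kwarg_only
    if kwarg_only:
        # Keyword-only arguments are separated from positionals by a bare '*'.
        formals = positional + ['*'] + kwarg_only
    return f'{name}({", ".join(formals)})'

print(render_signature('abs', ['Tensor input'], ['Tensor out=None']))
# abs(Tensor input, *, Tensor out=None)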
-@dataclass(frozen=True) -class PythonSignatureNativeFunctionPair: - signature: PythonSignature - function: NativeFunction - -# We merge pairs of functions with signatures that are equivalent mod -# output arguments, and use a single entry in the python_arg_parser sig -# list for both (output arguments become optional). -@dataclass(frozen=True) -class PythonSignatureGroup: - # The signature used for Python argument parsing. The outplace signature - # is preferred if exists, because it can be used to parse inputs for both - # the out-place variant and the base version (with output omitted). - signature: PythonSignature - - # The regular ATen declaration (e.g. conv2d) - base: NativeFunction - - # The out variant (e.g. conv2d_out) - outplace: Optional[NativeFunction] - -# C++ function dispatch is wrapped in a lambda function. The lambda function -# has almost the same signature as the C++ function, only with some small -# variants - see details below. -# This data model is used to represent arguments of the lambda function -# signature. -@dataclass(frozen=True) -class DispatchLambdaArgument: - name: str - type_str: str - is_out_arg: bool - -# To pass PyObjects arguments to C++ function (via the lambda wrapper), -# we need first convert PyObjects into simple C++ objects. This work -# is done by PythonArgParser. -# This data model is used to represent the output of PythonArgParser. -# It has 1-1 mapping with PythonArgument in PythonSignature. -@dataclass(frozen=True) -class PythonArgParserOutputExpr: - # argument name - name: str - - # RHS expression to reference PythonArgParser output. - expr: str - - # In some special cases we need create different expr, e.g.: - # '_r.isNone(1)' instead of '_r.tensor(1)'. - index: int - - # The python argument it maps to. - argument: PythonArgument - - @property - def is_none_expr(self) -> str: - return f'_r.isNone({self.index})' - -# To pass PythonArgParser output to the lambda wrapper, we need bind -# PythonArgParserOutputExpr to DispatchLambdaArgument. -# They are not always 1-1 mapped, e.g. scattered TensorOptions fields -# need be packed into a TensorOptions object, which is the argument -# that the lambda function wrapper takes. -@dataclass(frozen=True) -class DispatchLambdaArgumentExprs: - # The exprs that provide the binding for lambda arguments, e.g.: - # - # 'self' -> '_r.tensor(0)' - # 'min' -> 'out[0]' / 'min_indices' -> 'out[1]' - # 'options' -> 'options' - # - # It has 1-1 mapping with DispatchLambdaArgument. - exprs: Sequence[str] - - # Special local inits, which might introduce new variables that - # the 'exprs' above reference, e.g.: - # - # 'auto out = _r.tensorlist_n<2>(2);' - # - inits: Sequence[str] - -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# -# Helper Functions -# -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # - -def _cpp_signature(f: NativeFunction, *, method: bool = False) -> CppSignature: - return CppSignatureGroup.from_native_function(f, method=method).signature - -def has_tensor_options(f: NativeFunction) -> bool: - return f.func.arguments.tensor_options is not None - -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# -# Python Signature -# -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # - -# 'simple_type' was introduced by the old codegen, which is slightly -# different from the python schema type, e.g.: doesn't have '?' suffix -# for optional Tensor/TensorList; doesn't have '[size]' suffix for list type. 
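To make the binding data flow concrete, here is an illustrative picture of what a DispatchLambdaArgumentExprs-style result holds for the empty(IntArrayRef size, *, DimnameList? names, ...) walkthrough in the notes above. The literal expression strings come from that walkthrough; the plain dict/list layout is a simplification of the real dataclasses, and the '<DimnameList>' template argument in the init is a best-guess reconstruction.

lambda_arg_exprs = {
    'size': '_r.intlist(0)',
    'names': 'names',               # provided by a special local init
    'options': 'options',           # packed TensorOptions, also via an init
    'memory_format': '_r.memoryformatOptional(2)',
}

inits = [
    'auto __names = _r.toDimnameListOptional(1);',
    'c10::optional<DimnameList> names = __names ? '
    'c10::make_optional(DimnameList(__names.value())) : c10::nullopt;',
]

# The dispatch call is then rendered from the expression values, e.g.:
print(f"dispatch_empty({', '.join(lambda_arg_exprs.values())})")
# dispatch_empty(_r.intlist(0), names, options, _r.memoryformatOptional(2))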
-def argument_type_str(t: Type, *, simple_type: bool = False) -> str: - if isinstance(t, BaseType): - if t.name == BaseTy.Tensor: - return 'Tensor' - elif t.name == BaseTy.int: - return 'int64_t' - elif t.name == BaseTy.float: - return 'double' - elif t.name == BaseTy.str: - return 'c10::string_view' - elif t.name in [BaseTy.bool, BaseTy.QScheme, BaseTy.Scalar, - BaseTy.ScalarType, BaseTy.Generator, BaseTy.Storage, - BaseTy.Layout, BaseTy.Device, BaseTy.MemoryFormat, - BaseTy.Dimname, BaseTy.Stream, BaseTy.ConstQuantizerPtr]: - # These python schema type names line up with their function schema names - return t.name.name - - elif isinstance(t, OptionalType): - if str(t.elem) == 'Tensor': - # Is it desired to keep '?' for simple_type with new style dispatcher? - return 'Tensor?' - elem = argument_type_str(t.elem, simple_type=simple_type) - if elem == 'Layout': - # TODO: fix this special case in PythonArgParser? - return 'Layout' - else: - return f'{elem}?' - - elif isinstance(t, ListType): - size = t.size if not simple_type else None - if str(t.elem) == 'bool': - assert t.size is not None - return f'::std::array' - elif str(t.elem) == 'int': - return f'IntArrayRef[{size}]' if size is not None else 'IntArrayRef' - elif str(t.elem) == 'Tensor': - return f'TensorList[{size}]' if size is not None else 'TensorList' - elif str(t.elem) == 'Scalar': - return f'ScalarList[{size}]' if size is not None else 'ScalarList' - elif str(t.elem) == 'Tensor?': - if simple_type: - return 'c10::List>' - else: - return 'const c10::List> &' - elif str(t.elem) == 'Dimname': - return f'DimnameList[{size}]' if size is not None else 'DimnameList' - elem = argument_type_str(t.elem, simple_type=simple_type) - return f'ArrayRef<{elem}>' - - raise RuntimeError(f'unrecognized type {repr(t)}') - -def argument_type_size(t: Type) -> Optional[int]: - l = t.is_list_like() - if l is not None and str(l.elem) != 'bool': - return l.size - else: - return None - -def argument(a: Argument) -> PythonArgument: - return PythonArgument( - name=a.name, - type=a.type, - # TODO: directly translate a.default to python default - default=str(pythonify_default(cpp.default_expr(a.default, a.type))) - if a.default is not None else None, - default_init=None, - ) - -# Generates a PythonSignature that can be used for either .pyi or PythonArgParser codegen -def signature(f: NativeFunction, *, method: bool = False, pyi: bool = False) -> PythonSignature: - args: List[Argument] = [] - args.extend(f.func.arguments.pre_self_positional) - # Skip SelfArgument if this is method. - if not method and f.func.arguments.self_arg is not None: - args.append(f.func.arguments.self_arg.argument) - args.extend(f.func.arguments.post_self_positional) - args.extend(f.func.arguments.pre_tensor_options_kwarg_only) - # Skip TensorOptionsArguments. Python side TensorOptions - # arguments are created based on different rules - see below. - args.extend(f.func.arguments.post_tensor_options_kwarg_only) - args.extend(f.func.arguments.out) - - input_arg_set = set(a.name for a in f.func.arguments.flat_positional) - kwarg_only_set = set(a.name for a in f.func.arguments.flat_kwarg_only) - out_arg_set = set(a.name for a in f.func.arguments.out) - - input_args = tuple(map(argument, filter(lambda a: a.name in input_arg_set, args))) - input_kwargs = tuple(map(argument, filter(lambda a: a.name in kwarg_only_set, args))) - outputs = tuple(map(argument, filter(lambda a: a.name in out_arg_set, args))) - - # Reintroduce the scattered fields of TensorOptions for Python. 
- # Compared to the cpp counterpart, the python arguments have new property - # (default_init) and a new argument 'requires_grad', which require some - # special handlings. - # [old codegen] TODO: because these aren't guaranteed to be 100% faithful - # to the original versions in the yaml, this recreation is a potential - # source of drift between eager and JIT. Pull this logic out to a shared place. - - has_tensor_input_arg = any(a.type.is_tensor_like() for a in f.func.arguments.flat_non_out) - if any(a.name == 'requires_grad' for a in f.func.schema_order_arguments()): - raise ValueError('argument named requires_grad is reserved, should not explicitly add it in the schema') - - # [old codegen] this probably won't work if one of the returns is not a tensor, - # but it will produce a compile-time error that is obvious. - has_tensor_return = any(r.type.is_tensor_like() for r in f.func.returns) - - name: str = cpp.name(f.func) - is_factory_function = f.category_override == 'factory' or (has_tensor_return and not has_tensor_input_arg) - is_like_or_new_function = f.category_override in ('new', 'like') or name.startswith('new_') or name.endswith('_like') - - tensor_options_args: List[PythonArgument] = [] - if is_factory_function or is_like_or_new_function: - tensor_options_args.append(PythonArgument( - name='dtype', - type=BaseType(BaseTy.ScalarType), - default='None' if pyi else _dtype_default_type_hack(name), - default_init='self.scalar_type()' if is_like_or_new_function else None, - )) - tensor_options_args.append(PythonArgument( - name='layout', - type=OptionalType(BaseType(BaseTy.Layout)), - default='strided' if pyi else 'torch.strided', - default_init='self.layout()' if is_like_or_new_function else None, - )) - tensor_options_args.append(PythonArgument( - name='device', - type=BaseType(BaseTy.Device), - default='None', - default_init='self.device()' if is_like_or_new_function else None, - )) - tensor_options_args.append(PythonArgument( - name='pin_memory', - type=BaseType(BaseTy.bool), - default='False', - default_init=None, - )) - tensor_options_args.append(PythonArgument( - name='requires_grad', - type=BaseType(BaseTy.bool), - default='False', - default_init=None, - )) - - returns = PythonReturns(returns=f.func.returns) - - return PythonSignature( - name=str(f.func.name.name), - input_args=input_args, - input_kwargs=input_kwargs, - output_args=PythonOutArgument.from_outputs(outputs), - tensor_options_args=tuple(tensor_options_args), - returns=returns, - method=method, - ) - -# TODO blowtorch -# note: removing this will be BC-breaking. 
A quick test shows that -# randperm will otherwise default its dtype to torch.float64 -def _dtype_default_type_hack(name: str) -> str: - if name.startswith('randperm') or name == 'tril_indices' or name == 'triu_indices': - return 'torch.int64' - else: - return 'None' -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# -# Python Interface -# -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # - -def namedtuple_fieldnames(returns: Tuple[Return, ...]) -> List[str]: - if len(returns) <= 1 or all(map(lambda r: r.name is None, returns)): - return [] - else: - if any(map(lambda r: r.name is None, returns)): - # When building on Windows, `PyStructSequence_UnnamedField` could not be - # resolved by the linker for some reason, which cause error in building: - # - # python_nn_functions.cpp.obj : error LNK2001: unresolved external symbol - # PyStructSequence_UnnamedField - # - # Thus, at this point in time, we do not support unnamed - # fields in namedtuple; you must either name all fields, - # or none of them. - raise ValueError("Unnamed field is not supported by codegen") - - return list(map(lambda r: str(r.name), returns)) - -def argument_type_str_pyi(t: Type) -> str: - add_optional = False - if isinstance(t, OptionalType): - t = t.elem - add_optional = True - - if isinstance(t, BaseType): - if t.name == BaseTy.int: - ret = '_int' - elif t.name == BaseTy.float: - ret = '_float' - elif t.name == BaseTy.str: - ret = 'str' - elif t.name == BaseTy.Scalar: - ret = 'Number' - elif t.name == BaseTy.ScalarType: - ret = '_dtype' - elif t.name == BaseTy.bool: - ret = '_bool' - elif t.name == BaseTy.QScheme: - ret = '_qscheme' - elif t.name == BaseTy.Layout: - ret = '_layout' - elif t.name == BaseTy.Device: - ret = 'Union[_device, str, None]' - elif t.name == BaseTy.MemoryFormat: - ret = 'memory_format' - elif t.name == BaseTy.Dimname: - ret = 'Union[str, ellipsis, None]' - elif t.name in [BaseTy.Tensor, BaseTy.Generator, - BaseTy.Storage, BaseTy.Stream]: - # These python schema type names line up with their function schema names - ret = t.name.name - - elif isinstance(t, ListType): - if str(t.elem) == 'int': - ret = 'Union[_int, _size]' if t.size is not None else '_size' - elif t.is_tensor_like(): - # TODO: this doesn't seem right... - # Tensor?[] currently translates to Optional[Union[Tuple[Tensor, ...], List[Tensor]]] - # It should probably translate to Union[Tuple[Optional[Tensor], ...], List[Optional[Tensor]]] - if isinstance(t.elem, OptionalType): - add_optional = True - ret = 'Union[Tensor, Tuple[Tensor, ...], List[Tensor]]' if t.size is not None else \ - 'Union[Tuple[Tensor, ...], List[Tensor]]' - elif str(t.elem) == 'float': - ret = 'Sequence[_float]' - else: - elem = argument_type_str_pyi(t.elem) - ret = f'Sequence[{elem}]' - - if add_optional: - ret = 'Optional[' + ret + ']' - return ret - - raise RuntimeError(f'unrecognized type {repr(t)}') - - -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# -# C++ Function Dispatch -# -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# This section provides APIs to generate the code that does C++ function -# dispatch. The C++ function call is wrapped by a lambda function. -# For example: -# -# // aten::selu_(Tensor(a!) self) -> Tensor(a!) 
-# auto dispatch_selu_ = [](Tensor self) -> Tensor { -# pybind11::gil_scoped_release no_gil; -# return at::selu_(self); -# }; -# -# The lambda function's signature follows the C++ signature in common -# cases, e.g.: -# -# // aten::add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor -# [](const Tensor & self, const Tensor & other, Scalar alpha) -> Tensor -# -# For out variant the 'out' argument's type is changed from 'Tensor &' -# to 'Tensor'. It's because when calling the lambda it passes in the -# PythonArgParser output '_r.tensor(3)', which is stack allocated object -# and needs to pass by value. Also see comments in 'dispatch_lambda_return_str()'. -# -# // aten::add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) -# [](Tensor out, const Tensor & self, const Tensor & other, Scalar alpha) -> Tensor -# -# For multi-output case it can keep using reference type because the -# PythonArgParser output has been unpacked to local variables, e.g.: -# -# // aten::max.names_dim_max(Tensor self, Dimname dim, bool keepdim=False, *, -# // Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices) -# [](Tensor & max, Tensor & max_values, const Tensor & self, Dimname dim, bool keepdim) -> std::tuple -# -# For deprecated python signature, it should follow deprecated python arg order. -# TODO: This is to keep same byte-for-byte result as the old codegen - maybe unnecessary? - -def dispatch_lambda_args(ps: PythonSignature, f: NativeFunction) -> Tuple[DispatchLambdaArgument, ...]: - # Start with cpp arguments - dispatch lambda signature always include 'self' - cpp_args: Sequence[Binding] = _cpp_signature(f, method=False).arguments() - - # Special reorder logic for deprecated python signature - if isinstance(ps, PythonSignatureDeprecated): - m: Dict[str, Binding] = dict((a.name, a) for a in cpp_args) - # reorder according to the deprecated signature - # ignore 'out' argument when binding to non-output function. - ordered_args = filter(lambda n: n != 'out' or f.func.is_out_fn(), - ps.deprecated_args_names) - cpp_args = list(map(lambda n: m[n], ordered_args)) - - out_args: Set[str] = set(a.name for a in f.func.arguments.out) - - # Convert from cpp argument to lambda argument - def dispatch_lambda_arg(cpp_arg: Binding) -> DispatchLambdaArgument: - type_str = cpp_arg.type - is_out_arg = cpp_arg.name in out_args - if ps.method and cpp_arg.name == 'self': - # For method's 'self', we can use 'const Tensor &' and simply ignore mutability! - type_str = 'const at::Tensor &' - else: - # For other cases we need prevent dangling refs to temps (unless it's - # unpacked scattered output) - # The reason is explained in the comments above and in 'dispatch_lambda_return_str()'. - # TODO: avoid this special handling? - ensure_temp_safe = len(out_args) <= 1 or not is_out_arg - if ensure_temp_safe: - type_str = { - 'at::Tensor &': 'at::Tensor', - }.get(type_str, type_str) - return DispatchLambdaArgument( - name=cpp_arg.name, - type_str=type_str, - is_out_arg=is_out_arg, - ) - - return tuple(map(dispatch_lambda_arg, cpp_args)) - -# [old codegen] XXX: if you got here because of an assertion failure, it doesn't mean -# it's enough to just extend the list here. Before you do this, make sure -# to add an appropriate wrap() overload in torch/csrc/autograd/utils/wrap_outputs.h. 
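A stand-alone sketch of the argument-type adjustment dispatch_lambda_args applies: a method's self stays const Tensor&, a single out argument (and other Tensor& formals) is taken by value to avoid dangling references to PythonArgParser temporaries, and multi-output out args keep their references because they were already unpacked to locals. The function name lambda_arg_type is hypothetical.

def lambda_arg_type(cpp_type: str, name: str, *, is_method_self: bool,
                    is_out_arg: bool, num_out_args: int) -> str:
    if is_method_self and name == 'self':
        # Methods can always take self as const Tensor& and ignore mutability.
        return 'const at::Tensor &'
    if num_out_args <= 1 or not is_out_arg:
        # Pass-by-value protects against refs to stack-allocated temporaries.
        return {'at::Tensor &': 'at::Tensor'}.get(cpp_type, cpp_type)
    # Multi-output out args were unpacked to local variables; keep the ref.
    return cpp_type

print(lambda_arg_type('at::Tensor &', 'out', is_method_self=False,
                      is_out_arg=True, num_out_args=1))   # at::Tensor
print(lambda_arg_type('at::Tensor &', 'max', is_method_self=False,
                      is_out_arg=True, num_out_args=2))   # at::Tensor &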
-SUPPORTED_RETURN_TYPES = { - 'at::Tensor', - '::std::tuple', - '::std::tuple', - '::std::tuple', - '::std::tuple', - '::std::tuple', - '::std::tuple', - '::std::tuple', - '::std::tuple', - '::std::tuple', - '::std::vector', - 'at::Scalar', 'bool', 'int64_t', 'void*', 'void', - 'at::QScheme', 'double', - 'at::IntArrayRef', - 'at::ScalarType' -} - -def dispatch_lambda_return_str(f: NativeFunction) -> str: - # [old codegen] Remove type annotation (e.g. 'Tensor' rather than 'Tensor &') - # because the dispatch lambdas take mutable arguments *by value*, not - # by reference. If you then return a reference to such an argument, you - # will now have a pointer to a dangling stack entry. Not good. - # - # You want: - # - # auto dispatch_selu_ = [](Tensor self) -> Tensor { ...; return at::selu_(self); }; - # ^^^^^^ - # - # *not* - # - # auto dispatch_selu_ = [](Tensor self) -> Tensor& { ...; return at::selu_(self); }; - # ^^^^^^^ - # - # (NB: We can't make dispatch_selu_ take Tensor&, because the enclosing - # codegen looks like dispatch_selu_(_r.tensor(0)), and you can't take a - # mutable reference to temporary. Maybe we could assign it to a - # variable itself.) - returns_without_annotation = tuple(map(lambda r: Return(r.name, r.type, None), f.func.returns)) - return_str = cpp.returns_type(returns_without_annotation).cpp_type() - if return_str not in SUPPORTED_RETURN_TYPES: - raise RuntimeError(f'{f.func.name} returns unsupported type {return_str}') - return return_str - -def cpp_dispatch_target(f: NativeFunction) -> str: - name = cpp.name(f.func) - if Variant.method in f.variants: - return f'self.{name}' - if Variant.function in f.variants: - if has_tensor_options(f) or f.func.name.name.base.endswith('_like'): - namespace = 'torch' - else: - namespace = 'at' - return f'{namespace}::{name}' - raise RuntimeError(f'could not dispatch, neither function nor method: {f.func}') - -def cpp_dispatch_exprs(f: NativeFunction, *, - python_signature: Optional[PythonSignature] = None, - ) -> Tuple[str, ...]: - cpp_args: Sequence[Binding] = _cpp_signature(f, method=False).arguments() - - exprs: Tuple[str, ...] = tuple() - if not isinstance(python_signature, PythonSignatureDeprecated): - # By default the exprs are consistent with the C++ signature. - exprs = tuple(map(lambda a: a.name, cpp_args)) - else: - # For deprecated python signature we may need fill in some constants. - exprs = tuple(filter(lambda n: n != 'out' or f.func.is_out_fn(), - python_signature.deprecated_args_exprs)) - - if Variant.method in f.variants: - exprs = tuple(filter('self'.__ne__, exprs)) - - return exprs - -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# -# Python / C++ Args Binding -# -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # - -# We explicitly enumerate the PythonArgParser unpacking methods for all -# supported types. This might be more verbose than necessary, partially -# because of the irregularity of unpacking method naming, partially -# because we want to mimic the old codegen behavior - to reject -# unexpected and/or unsupported cases which the old codegen rejects. -# For certain cases it is intentionally more restrictive than necessary, -# e.g.: it doesn't accepts doublelist with definite size. 
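A hedged sketch of the call-target selection cpp_dispatch_target makes, reduced to plain booleans and strings: method variants dispatch via self, factory-style functions (TensorOptions or *_like) via torch::, everything else via at::. The helper name dispatch_target and its parameters are invented for illustration.

def dispatch_target(name: str, *, has_method_variant: bool,
                    has_function_variant: bool, has_tensor_options: bool,
                    is_like_fn: bool) -> str:
    if has_method_variant:
        return f'self.{name}'
    if has_function_variant:
        ns = 'torch' if has_tensor_options or is_like_fn else 'at'
        return f'{ns}::{name}'
    raise RuntimeError('could not dispatch, neither function nor method')

print(dispatch_target('abs', has_method_variant=True, has_function_variant=True,
                      has_tensor_options=False, is_like_fn=False))    # self.abs
print(dispatch_target('empty', has_method_variant=False, has_function_variant=True,
                      has_tensor_options=True, is_like_fn=False))     # torch::empty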
-def arg_parser_unpack_method(t: Type, has_default: bool) -> str: - if has_default and str(t) not in ('ScalarType', 'Device', 'Layout?'): - raise RuntimeError(f'type \'{t}\' does not supported unpacking with default') - - if isinstance(t, BaseType): - if t.name in [BaseTy.Tensor, BaseTy.Stream, BaseTy.Storage, - BaseTy.Scalar, BaseTy.Dimname]: - # These unpack methods line up with their schema names - return t.name.name.lower() - elif t.name == BaseTy.ScalarType: - return 'scalartypeWithDefault' if has_default else 'scalartype' - elif t.name == BaseTy.Device: - return 'deviceWithDefault' if has_default else 'device' - elif t.name == BaseTy.int: - return 'toInt64' - elif t.name == BaseTy.bool: - return 'toBool' - elif t.name == BaseTy.float: - return 'toDouble' - elif t.name == BaseTy.str: - return 'stringView' - - elif isinstance(t, OptionalType): - if str(t.elem) == 'Tensor': - return 'optionalTensor' - - elif isinstance(t.elem, BaseType): - if t.elem.name in [BaseTy.ScalarType, BaseTy.Scalar, - BaseTy.int, BaseTy.bool, - BaseTy.float, BaseTy.str]: - # Regular cases: append 'Optional' to elem's unpacking method - return arg_parser_unpack_method(t.elem, False) + 'Optional' - elif t.elem.name == BaseTy.MemoryFormat: - return 'memoryformatOptional' - elif t.elem.name == BaseTy.Generator: - return 'generator' - elif t.elem.name == BaseTy.Layout: - return 'layoutWithDefault' if has_default else 'layoutOptional' - elif t.elem.name == BaseTy.Device: - return 'deviceWithDefault' if has_default else 'deviceOptional' - - elif isinstance(t.elem, ListType): - if str(t.elem.elem) == 'int': - # accept definite size - return 'intlistOptional' - elif str(t.elem) == 'float[]': - return 'doublelistOptional' - elif str(t.elem) == 'Dimname[]': - return 'toDimnameListOptional' - - elif isinstance(t, ListType): - if str(t.elem) == 'Tensor': - # accept and use definite size - if t.size is not None: - return f'tensorlist_n<{t.size}>' - else: - return 'tensorlist' - elif str(t.elem) == 'Tensor?': - return 'list_of_optional_tensors' - elif str(t.elem) == 'Dimname': - # accept definite size - return 'dimnamelist' - elif str(t.elem) == 'int': - # accept definite size - return 'intlist' - elif str(t) == 'float[]': - return 'doublelist' - elif str(t) == 'Scalar[]': - return 'scalarlist' - raise RuntimeError(f'type \'{t}\' is not supported by PythonArgParser') - -# Return RHS expression for python argument using PythonArgParser output. -# e.g. for arg name 'foo', arg type 'bool', arg_index = 2, returns '_r.toBool(2)' -def arg_parser_output_expr( - arg_index: int, a: PythonArgument -) -> PythonArgParserOutputExpr: - has_default = a.default_init is not None - unpack_method = arg_parser_unpack_method(a.type, has_default) - default = f', {a.default_init}' if has_default else '' - expr = f'_r.{unpack_method}({arg_index}{default})' - - return PythonArgParserOutputExpr( - name=a.name, - expr=expr, - index=arg_index, - argument=a, - ) - -# Returns a map with key = arg_name and value = PythonArgParserOutputExpr. -def arg_parser_output_exprs( - ps: PythonSignature, f: NativeFunction -) -> Dict[str, PythonArgParserOutputExpr]: - return {e.name: e for i, a in enumerate(ps.arguments()) - for e in (arg_parser_output_expr(i, a), )} - -# argument name to type for scattered tensor options fields -TENSOR_OPTIONS_FIELDS = { - 'dtype': 'ScalarType', - 'device': 'Device', - 'layout': 'Layout?', - 'pin_memory': 'bool', - 'requires_grad': 'bool', -} - -# bind arg parser outputs (python args) with dispatch lambda arguments (c++ args). 
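A minimal sketch of how a PythonArgParser access expression is formed from an argument index, its unpack method, and an optional default-init expression, as in arg_parser_output_expr above. The helper name output_expr is hypothetical; the example inputs mirror the '_r.toBool(2)' and layoutWithDefault examples from the notes.

from typing import Optional

def output_expr(index: int, unpack_method: str,
                default_init: Optional[str] = None) -> str:
    default = f', {default_init}' if default_init is not None else ''
    return f'_r.{unpack_method}({index}{default})'

print(output_expr(2, 'toBool'))  # _r.toBool(2)
print(output_expr(3, 'layoutWithDefault',
                  'layout_from_backend(self.options().backend())'))
# _r.layoutWithDefault(3, layout_from_backend(self.options().backend()))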
-def dispatch_lambda_exprs( - ps: PythonSignature, f: NativeFunction -) -> DispatchLambdaArgumentExprs: - # This method is to bind 'arg_parser_outputs' and 'lambda_args' by producing - # 'inits' and 'lambda_args_exprs' for each lambda argument using arg parser - # outputs. - arg_parser_outputs = arg_parser_output_exprs(ps, f) - lambda_args = dispatch_lambda_args(ps, f) - inits: List[str] = [] - lambda_args_exprs: Dict[str, str] = dict() - - has_toptions = has_tensor_options(f) - - # 1. special inits/unpacking to provide binding exprs for lambda arguments. - for a in ps.arguments(skip_tensor_options=True): - name = a.name - arg_parser_expr = arg_parser_outputs[a.name].expr - - if has_toptions and name == 'self': - # TODO: why this needs to be special case? - inits.extend([ - f'auto self = {arg_parser_expr};', - ]) - lambda_args_exprs[name] = name - elif isinstance(a, PythonOutArgument) and len(a.outputs) > 1 and f.func.is_out_fn(): - inits.extend([ - f'auto out = {arg_parser_expr};', - ]) - for i, out_arg in enumerate(a.outputs): - lambda_args_exprs[out_arg.name] = f'out[{i}]' - elif str(a.type) == 'Dimname[]?': - # [old codegen] - # TODO: make this part of something more general, or get rid of it. - # optional> are special. The PythonArgParser returns an - # optional>, which cannot be implicitly converted to - # optional>. One needs to unwrap the optional and rewrap. - inits.extend([ - f'auto __{name} = {arg_parser_expr};', - f'c10::optional {name} = __{name} ? c10::make_optional(DimnameList(__{name}.value())) : c10::nullopt;', - ]) - lambda_args_exprs[name] = name - else: - # default case - directly using PythonArgParser output expr - lambda_args_exprs[name] = arg_parser_expr - - # method's self is passed directly to python binding, rather than parsed - if ps.method: - lambda_args_exprs['self'] = 'self' - - # 2. special packing/checking for TensorOptions. - tensor_options_args_names = list(map(lambda a: a.name, ps.tensor_options_args)) - if has_toptions: - if f.func.is_out_fn(): - raise RuntimeError(f'{f.func}: tensor options with output arg') - for a in ps.tensor_options_args: - if a.name not in TENSOR_OPTIONS_FIELDS: - raise RuntimeError( - f'{f.func}: unrecognized tensor options field \'{a.name}\' in python binding arguments') - if str(a.type) != TENSOR_OPTIONS_FIELDS.get(a.name): - raise RuntimeError( - f'{f.func}: unrecognized type \'{str(a.type)}\' for tensor options field \'{a.name}\'') - if not all(map(lambda a: a in tensor_options_args_names, TENSOR_OPTIONS_FIELDS.keys())): - raise RuntimeError( - f'{f.func}: incomplete tensor options args: {tensor_options_args_names}') - - inits.append(f'''\ -const auto options = TensorOptions() - .dtype({arg_parser_outputs['dtype'].expr}) - .device({arg_parser_outputs['device'].expr}) - .layout({arg_parser_outputs['layout'].expr}) - .requires_grad({arg_parser_outputs['requires_grad'].expr}) - .pinned_memory({arg_parser_outputs['pin_memory'].expr}); -torch::utils::maybe_initialize_cuda(options); -''') - lambda_args_exprs['options'] = 'options' - - # 3. special case - access scattered TensorOptions fields without packing - # TODO: maybe move to the generator side as it's not related to binding. 
- if not has_toptions and tensor_options_args_names: - if 'dtype' in tensor_options_args_names: - # we're an output-arg variant, check these args against output tensor - if not f.func.is_out_fn(): - raise RuntimeError( - f'{f.func}: dtype in tensor_options_args without output arg') - if not all(map(lambda a: a in tensor_options_args_names, ('layout', 'device'))): - raise RuntimeError( - f'{f.func}: incomplete tensor options for output check') - - inits.append(f"""\ -check_out_type_matches({arg_parser_outputs['out'].expr}, {arg_parser_outputs['dtype'].expr}, - {arg_parser_outputs['dtype'].is_none_expr}, {arg_parser_outputs['layout'].expr}, - {arg_parser_outputs['device'].expr}, {arg_parser_outputs['device'].is_none_expr}); -""") - # we'll set requires_grad on outgoing tensor - if 'requires_grad' not in tensor_options_args_names: - raise RuntimeError( - f'{f.func}: expected "requires_grad" in tensor_options_args absent, but found [{tensor_options_args_names}]') - - return DispatchLambdaArgumentExprs( - exprs=tuple(map(lambda a: lambda_args_exprs[a.name], lambda_args)), - inits=inits, - ) diff --git a/tools/codegen/api/translate.py b/tools/codegen/api/translate.py deleted file mode 100644 index 591b8d75e3b1..000000000000 --- a/tools/codegen/api/translate.py +++ /dev/null @@ -1,240 +0,0 @@ -from typing import Dict, Sequence, List, NoReturn, Union -from tools.codegen.api.types import (BaseCType, Binding, ConstRefCType, - Expr, MutRefCType, OptionalCType, - NamedCType, SpecialArgName, tensorT, - memoryFormatT, tensorOptionsT, scalarTypeT, - boolT, deviceT, layoutT, optionalTensorRefT, - scalarT, optionalScalarRefT, - VectorCType, longT, intArrayRefT) - -# This file implements a small program synthesis engine that implements -# conversions between one API to another. -# -# The key data type in this file in NamedCType, short for Named C++ semantic type. A NamedCType -# represents a C++ type, plus semantic information about what it represents. -# For example, consider the argument "bool pin_memory"; its normal C++ type is -# "bool", but its C++ semantic type also keeps track that this represents a -# "pin_memory"; you can't just use a random other boolean in a context where you -# need a "pin_memory"! -# -# The translator takes a list of needed NamedCTypes, and then figures out how -# to construct expressions with these NamedCTypes from the given bindings. Many -# of these expressions are trivial (I need a Tensor other; there's a Tensor -# other scope); others are more nontrivial and may require packing/unpacking. -# Some examples of non-trivial action: -# -# - Need the "dtype" binding? Well, maybe "dtype" isn't available -# in the context, instead, "options" is, and you need to extract -# it from there. (Gather) -# -# - Need the "context" binding? Well, maybe "context" isn't available -# in the context, and you need to construct it from "dtype", "device", -# etc. (Scatter) -# -# - Need the "memory_format" binding? Well, actually, it's available -# from both "memory_format" and "options", so you had better make sure -# they are consistent. 
(Join) - -options_ctype = NamedCType("options", ConstRefCType(BaseCType(tensorOptionsT))) - -longVec_ctype = VectorCType(BaseCType(longT)) -optionalScalar_ctype = OptionalCType(BaseCType(scalarT)) -optionalTensor_ctype = OptionalCType(BaseCType(tensorT)) - -class UnsatError(RuntimeError): - pass - -# Given a set of in-scope bindings and a set of target bindings, synthesize -# a list of expressions that uses only the in-scope bindings (bindings) that -# have all of the types of goals. You may want to use this function if -# you're generating code for a function like: -# -# void f({args}) { -# g({exprs}); // g is a different API -# } -# -# and you need to generate "exprs". -# -# Typically, a list of Bindings is convenient to get (you usually call something -# like arguments() to get them); but technically you only need less information: -# for 'bindings' an (un-ordered) list of Exprs is sufficient; similarly, for -# 'goals', an (ordered) list of NamedCType goals is sufficient. If you are doing -# something more complicated, e.g., tracking the set of bindings in a context, -# you may find using these smaller types more convenient. -def translate( - bindings: Sequence[Union[Expr, Binding]], - goals: Sequence[Union[NamedCType, Binding]], - *, method: bool = False, - allow_expensive_conversions: bool = False -) -> List[Expr]: - - binding_exprs: List[Expr] = [] - for b in bindings: - if isinstance(b, Binding): - binding_exprs.append(Expr( - expr=b.name, - type=b.nctype, - )) - else: - binding_exprs.append(b) - - goal_ctypes: List[NamedCType] = [] - for g in goals: - if isinstance(g, Binding): - goal_ctypes.append(g.nctype) - else: - goal_ctypes.append(g) - - # Add all the bindings to the context - ctx: Dict[NamedCType, str] = {} - for b in binding_exprs: - ctx[b.type] = b.expr - - # While we're at it, do some simple forward inference, looking through - # constructors. - # TODO: My kingdom for a pattern matcher - # https://www.python.org/dev/peps/pep-0634/ - # TODO: This could get us in recomputation trouble if b.expr is nontrivial - t = b.type - if isinstance(t, ConstRefCType) and isinstance(t.elem, OptionalCType) and \ - isinstance(t.elem.elem, BaseCType) and str(t.elem.elem.type) == 'at::Tensor': - ctx[NamedCType(t.elem.elem.name, ConstRefCType(BaseCType(tensorT)))] = \ - f'({b.expr}.has_value() ? *{b.expr} : at::Tensor())' - - if t.type == ConstRefCType(OptionalCType(BaseCType(tensorT))): - ctx[NamedCType(t.name, BaseCType(optionalTensorRefT))] = \ - f'(({b.expr}.has_value() && (*{b.expr}).defined()) ? at::OptionalTensorRef(*{b.expr}) : at::OptionalTensorRef())' - - if t.type == ConstRefCType(OptionalCType(BaseCType(scalarT))): - ctx[NamedCType(t.name, BaseCType(optionalScalarRefT))] = \ - f'({b.expr}.has_value() ? at::OptionalScalarRef(&({b.expr}.value())) : at::OptionalScalarRef())' - - # Add implicit bindings if the generated code is inside a Tensor method - if method: - ctx[NamedCType("self", MutRefCType(BaseCType(tensorT)))] = "const_cast(*this)" - ctx[NamedCType("self", ConstRefCType(BaseCType(tensorT)))] = "const_cast(*this)" - # This is better! Byte-for-byte compat - # ctx[NamedCType("self", ConstRefCType(BaseCType(tensorT)))] = "*this" - - def unsat(goal: NamedCType) -> NoReturn: - ctx_desc = '\n'.join(f" {t.cpp_type()} {t.name}; // {e}" for t, e in ctx.items()) - raise UnsatError(f''' -Failed to synthesize the expression "{goal.cpp_type()} {goal.name}". 
-When I failed, the following bindings were available in the context: - -{ctx_desc} - -This probably means there is a missing rule in the rules of tools.codegen.api.translate. -Check this module for more information. -''') - - # A shitty backtracking search implementation. It's shitty because it - # doesn't actually do backtracing or search. In particular, if - # direct=True, we won't try to do any fancy synthesis, just trivial - # conversions (e.g., "T a" is OK for "const T& a"). So all of the - # existing rules in this function simply try to solve immediately, - # and bail if things don't work out. - def solve(goal: NamedCType, *, direct: bool) -> str: - def direct_solve(goal: NamedCType) -> str: - return solve(goal, direct=True) - - if goal in ctx: - # Trivial - return ctx[goal] - - # const & is satisfied with mutable & - if isinstance(goal.type, ConstRefCType): - try: - # WARNING: not strictly decreasing; be careful not - # to add a direct conversion that goes satisfies - # mutable& with const& - return solve(NamedCType(goal.name, MutRefCType(goal.type.elem)), direct=direct) - except UnsatError: - pass - - # mutable & is satisfied with value - if isinstance(goal.type, MutRefCType): - try: - return solve(NamedCType(goal.name, goal.type.elem), direct=direct) - except UnsatError: - pass - - if direct: - unsat(goal) - - # For now, all of these rules are mutually exclusive. - if goal == NamedCType("memory_format", OptionalCType(BaseCType(memoryFormatT))): - memory_format = direct_solve( - NamedCType(SpecialArgName.possibly_redundant_memory_format, OptionalCType(BaseCType(memoryFormatT))) - ) - # No need to join "memory_format" and "options" if the target API takes "options" directly. - # Otherwise it will cause the redundant memory_format error. - if options_ctype in goal_ctypes: - return memory_format - try: - options = direct_solve(options_ctype) - return f"c10::impl::check_tensor_options_and_extract_memory_format({options}, {memory_format})" - except UnsatError: - return memory_format - - elif goal == NamedCType("options", BaseCType(tensorOptionsT)): - dtype = direct_solve(NamedCType("dtype", OptionalCType(BaseCType(scalarTypeT)))) - pin_memory = direct_solve(NamedCType("pin_memory", OptionalCType(BaseCType(boolT)))) - device = direct_solve(NamedCType("device", OptionalCType(BaseCType(deviceT)))) - layout = direct_solve(NamedCType("layout", OptionalCType(BaseCType(layoutT)))) - return f'TensorOptions().dtype({dtype}).layout({layout}).device({device}).pinned_memory({pin_memory})' - - elif goal == NamedCType("dtype", OptionalCType(BaseCType(scalarTypeT))): - options = direct_solve(options_ctype) - return f'optTypeMetaToScalarType({options}.dtype_opt())' - - elif goal == NamedCType("layout", OptionalCType(BaseCType(layoutT))): - options = direct_solve(options_ctype) - return f'{options}.layout_opt()' - - elif goal == NamedCType("device", OptionalCType(BaseCType(deviceT))): - options = direct_solve(options_ctype) - return f'{options}.device_opt()' - - elif goal == NamedCType("pin_memory", OptionalCType(BaseCType(boolT))): - options = direct_solve(options_ctype) - return f'{options}.pinned_memory_opt()' - - # We can always do translations from value types to reference types, like vector -> IntArrayRef - elif goal.type == BaseCType(intArrayRefT): - return direct_solve(NamedCType(goal.name, longVec_ctype)) - elif goal.type == BaseCType(optionalScalarRefT): - return direct_solve(NamedCType(goal.name, optionalScalar_ctype)) - elif goal.type == BaseCType(optionalTensorRefT): - return 
direct_solve(NamedCType(goal.name, optionalTensor_ctype)) - - - # Note [translation from C++ reference to value types] - # The below cases are all for when we have an argument with a reference type, - # and a corresponding goal with a value type. - # These are needed when we populate the inputs to a lambda capture and we need - # to guarantee the lifetime of each captured argument. - # We guard it with an explicit kwarg because converting to a value type is expensive - # (O(n)) to convert from IntArrayRef to vector), - # so the caller of translate() should be explicit that they need it. - if allow_expensive_conversions: - if goal.type == VectorCType(BaseCType(longT)): - intArrayRef_ctype = NamedCType(goal.name, BaseCType(intArrayRefT)) - argname = direct_solve(intArrayRef_ctype) - return f'{argname}.vec()' - elif goal.type == OptionalCType(BaseCType(scalarT)): - optionalScalarRef_ctype = NamedCType(goal.name, BaseCType(optionalScalarRefT)) - argname = direct_solve(optionalScalarRef_ctype) - return f'{argname}.has_value() ? c10::make_optional({argname}) : c10::nullopt' - elif goal.type == OptionalCType(BaseCType(scalarT)): - optionalTensorRef_ctype = NamedCType(goal.name, BaseCType(optionalTensorRefT)) - argname = direct_solve(optionalTensorRef_ctype) - return f'{argname}.has_value() ? c10::make_optional({argname}) : c10::nullopt' - # Technically, we also need to handle cases of C++ containers holding reference types. - # But there currently aren't any ops that require lambda capture codegen - # With arguments like std::vector. - # If that changes, we'll have to add the translation here. - - unsat(goal) - - return [Expr(solve(g, direct=False), g) for g in goal_ctypes] diff --git a/tools/codegen/api/types.py b/tools/codegen/api/types.py deleted file mode 100644 index d269f2c7a3ff..000000000000 --- a/tools/codegen/api/types.py +++ /dev/null @@ -1,618 +0,0 @@ -from tools.codegen.model import (Argument, FunctionSchema, NativeFunction, - BackendIndex, - SelfArgument, TensorOptionsArguments, BaseTy) -from dataclasses import dataclass -from typing import Optional, Union, Sequence, TypeVar, List, Set, Dict -from enum import Enum - -_T = TypeVar('_T') - -# An ArgName is just the str name of the argument in schema; -# but in some special circumstances, we may add a little extra -# context. The Enum SpecialArgName covers all of these cases; -# grep for their construction sites to see when they can occr. - -SpecialArgName = Enum('SpecialArgName', ( - 'possibly_redundant_memory_format', -)) -ArgName = Union[str, SpecialArgName] - -# This class shouldn't be created directly; instead, use/create one of the singletons below. -@dataclass(frozen=True) -class BaseCppType: - ns: Optional[str] - name: str - - def __str__(self) -> str: - if self.ns is None or self.ns == '': - return self.name - return f"{self.ns}::{self.name}" - -# The set of all non-templated, valid, fully-qualified names of C++ types that are used in the codegen. -# Templated types get their own dataclass, mainly to make namespace parsing easier. 
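# [Editorial sketch, not part of the deleted file] A quick illustration of how the
# singletons defined below render once constructed: BaseCppType (defined above)
# pairs an optional namespace with a type name, and str() yields the fully
# qualified C++ spelling, dropping an empty namespace.
assert str(BaseCppType('at', 'Tensor')) == 'at::Tensor'
assert str(BaseCppType('', 'int64_t')) == 'int64_t'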
-byteT = BaseCppType('', 'uint8_t') -charT = BaseCppType('', 'int8_t') -shortT = BaseCppType('', 'int16_t') -# It would be more symmetric for this to be called intT, but it easy to mix -# this up with JIT int (which is int64_t in C++), so we intentionally don't -# define intT to make it obvious when you've stuffed it up -int32T = BaseCppType('', 'int32_t') -longT = BaseCppType('', 'int64_t') -halfT = BaseCppType('at', 'Half') -doubleT = BaseCppType('', 'double') -floatT = BaseCppType('', 'float') -complexHalfT = BaseCppType('c10', 'complex') # stuffing template param here is an abuse -complexFloatT = BaseCppType('c10', 'complex') -complexDoubleT = BaseCppType('c10', 'complex') -boolT = BaseCppType('', 'bool') -bfloat16T = BaseCppType('at', 'BFloat16') -voidT = BaseCppType('', 'void') -stringT = BaseCppType('c10', 'string_view') -generatorT = BaseCppType('at', 'Generator') -scalarTypeT = BaseCppType('at', 'ScalarType') -tensorT = BaseCppType('at', 'Tensor') -optionalTensorRefT = BaseCppType('at', 'OptionalTensorRef') -tensorListT = BaseCppType('at', 'TensorList') -dimnameT = BaseCppType('at', 'Dimname') -dimnameListT = BaseCppType('at', 'DimnameList') -layoutT = BaseCppType('at', 'Layout') -deviceT = BaseCppType('at', 'Device') -scalarT = BaseCppType('at', 'Scalar') -optionalScalarRefT = BaseCppType('at', 'OptionalScalarRef') -memoryFormatT = BaseCppType('at', 'MemoryFormat') -qschemeT = BaseCppType('at', 'QScheme') -storageT = BaseCppType('at', 'Storage') -streamT = BaseCppType('at', 'Stream') -intArrayRefT = BaseCppType('at', 'IntArrayRef') -tensorOptionsT = BaseCppType('at', 'TensorOptions') -typeAndSizeT = BaseCppType('torch::autograd::generated', 'TypeAndSize') -tensorGeometryT = BaseCppType('at', 'TensorGeometry') - -BaseTypeToCppMapping: Dict[BaseTy, BaseCppType] = { - BaseTy.int: longT, - BaseTy.float: doubleT, - BaseTy.bool: boolT, - BaseTy.str: stringT, - BaseTy.Generator: generatorT, - BaseTy.ScalarType: scalarTypeT, - BaseTy.Tensor: tensorT, - BaseTy.Dimname: dimnameT, - BaseTy.Layout: layoutT, - BaseTy.Device: deviceT, - BaseTy.Scalar: scalarT, - BaseTy.MemoryFormat: memoryFormatT, - BaseTy.QScheme: qschemeT, - BaseTy.Storage: storageT, - BaseTy.Stream: streamT, -} - -# CTypes encode C++ type structure as needed for translation. - -@dataclass(frozen=True) -class BaseCType: - type: BaseCppType - - def cpp_type(self, *, strip_ref: bool = False) -> str: - return str(self.type) - - # For BC reasons, we don't want to introduce at:: namespaces to RegistrationDeclarations.yaml - # TODO: Kill this when we eventually remove it! 
- def cpp_type_registration_declarations(self) -> str: - return str(self.type).replace('at::', '') - - def remove_const_ref(self) -> 'CType': - return self - -@dataclass(frozen=True) -class ConstRefCType: - elem: 'CType' - - def cpp_type(self, *, strip_ref: bool = False) -> str: - if strip_ref: - return self.elem.cpp_type(strip_ref=strip_ref) - return f'const {self.elem.cpp_type()} &' - - def cpp_type_registration_declarations(self) -> str: - return f'const {self.elem.cpp_type_registration_declarations()} &' - - def remove_const_ref(self) -> 'CType': - return self.elem.remove_const_ref() - -@dataclass(frozen=True) -class MutRefCType: - elem: 'CType' - - def cpp_type(self, *, strip_ref: bool = False) -> str: - if strip_ref: - return self.elem.cpp_type(strip_ref=strip_ref) - return f'{self.elem.cpp_type()} &' - - def cpp_type_registration_declarations(self) -> str: - return f'{self.elem.cpp_type_registration_declarations()} &' - - def remove_const_ref(self) -> 'CType': - return self.elem.remove_const_ref() - -@dataclass(frozen=True) -class OptionalCType: - elem: 'CType' - - def cpp_type(self, *, strip_ref: bool = False) -> str: - # Do not pass `strip_ref` recursively. - return f'c10::optional<{self.elem.cpp_type()}>' - - def cpp_type_registration_declarations(self) -> str: - return f'c10::optional<{self.elem.cpp_type_registration_declarations()}>' - - def remove_const_ref(self) -> 'CType': - return OptionalCType(self.elem.remove_const_ref()) - -@dataclass(frozen=True) -class ListCType: - elem: 'CType' - - def cpp_type(self, *, strip_ref: bool = False) -> str: - # Do not pass `strip_ref` recursively. - return f'c10::List<{self.elem.cpp_type()}>' - - def cpp_type_registration_declarations(self) -> str: - return f'c10::List<{self.elem.cpp_type_registration_declarations()}>' - - def remove_const_ref(self) -> 'CType': - return ListCType(self.elem.remove_const_ref()) - -@dataclass(frozen=True) -class ArrayRefCType: - elem: 'CType' - - def cpp_type(self, *, strip_ref: bool = False) -> str: - # Do not pass `strip_ref` recursively. - return f'at::ArrayRef<{self.elem.cpp_type()}>' - - def cpp_type_registration_declarations(self) -> str: - return f'ArrayRef<{self.elem.cpp_type_registration_declarations()}>' - - def remove_const_ref(self) -> 'CType': - return ArrayRefCType(self.elem.remove_const_ref()) - -@dataclass(frozen=True) -class VectorCType: - elem: 'CType' - - def cpp_type(self, *, strip_ref: bool = False) -> str: - # Do not pass `strip_ref` recursively. - return f'::std::vector<{self.elem.cpp_type()}>' - - def cpp_type_registration_declarations(self) -> str: - return f'::std::vector<{self.elem.cpp_type_registration_declarations()}>' - - def remove_const_ref(self) -> 'CType': - return VectorCType(self.elem.remove_const_ref()) - -@dataclass(frozen=True) -class ArrayCType: - elem: 'CType' - size: int - - def cpp_type(self, *, strip_ref: bool = False) -> str: - # Do not pass `strip_ref` recursively. - return f'::std::array<{self.elem.cpp_type()},{self.size}>' - - def cpp_type_registration_declarations(self) -> str: - return f'::std::array<{self.elem.cpp_type_registration_declarations()},{self.size}>' - - def remove_const_ref(self) -> 'CType': - return ArrayCType(self.elem.remove_const_ref(), self.size) - -@dataclass(frozen=True) -class TupleCType: - elems: List['CType'] - - def cpp_type(self, *, strip_ref: bool = False) -> str: - # Do not pass `strip_ref` recursively. 
- return f'::std::tuple<{",".join([e.cpp_type() for e in self.elems])}>' - - def cpp_type_registration_declarations(self) -> str: - return f'::std::tuple<{",".join([e.cpp_type_registration_declarations() for e in self.elems])}>' - - def remove_const_ref(self) -> 'CType': - return TupleCType([e.remove_const_ref() for e in self.elems]) - -CType = Union[ - BaseCType, - OptionalCType, - ConstRefCType, - MutRefCType, - ListCType, - ArrayRefCType, - ArrayCType, - VectorCType, - TupleCType -] - -# A NamedCType is short for Named C++ semantic type. A NamedCType represents a C++ type, plus -# semantic information about what it represents. For example, consider the -# argument "bool pin_memory"; its normal C++ type is "bool", but its C++ -# semantic type also keeps track that this represents a "pin_memory"; you can't -# just use a random other boolean in a context where you need a "pin_memory"! -# - -@dataclass(frozen=True) -class NamedCType: - name: ArgName - type: CType - - def cpp_type(self, *, strip_ref: bool = False) -> str: - return self.type.cpp_type(strip_ref=strip_ref) - - # For BC reasons, we don't want to introduce at:: namespaces to RegistrationDeclarations.yaml - # TODO: Kill this when we eventually remove it! - def cpp_type_registration_declarations(self) -> str: - return self.type.cpp_type_registration_declarations() - - def remove_const_ref(self) -> 'NamedCType': - return NamedCType(self.name, self.type.remove_const_ref()) - - def with_name(self, name: str) -> 'NamedCType': - return NamedCType(name, self.type) - -# A binding represents any C++ binding site for a formal parameter. -# We don't distinguish between binding sites for different APIs; -# instead, all of the important distinctions are encoded in CType, -# which you can use to figure out if a given Binding is appropriate -# for use in another context. (See tools.codegen.api.translate) - -@dataclass(frozen=True) -class Binding: - name: str - nctype: NamedCType - argument: Union[Argument, TensorOptionsArguments, SelfArgument] - # TODO: maybe don't represent default here - default: Optional[str] = None - - @property - def type(self) -> str: - return self.nctype.cpp_type() - - def no_default(self) -> 'Binding': - return Binding( - name=self.name, - nctype=self.nctype, - default=None, - argument=self.argument, - ) - - def decl(self, *, func_ptr_cast: bool = False) -> str: - mb_default = "" - if self.default is not None: - mb_default = f"={self.default}" - - # casting only needs to know the type - if func_ptr_cast: - return f"{self.type}" - else: - return f"{self.type} {self.name}{mb_default}" - - # For BC reasons, we don't want to introduce at:: namespaces to RegistrationDeclarations.yaml - # TODO: Kill this when we eventually remove it! - def decl_registration_declarations(self) -> str: - type_s = self.nctype.cpp_type_registration_declarations() - mb_default = "" - if self.default is not None: - mb_default = f"={self.default}" - return f"{type_s} {self.name}{mb_default}" - - def defn(self) -> str: - return f"{self.type} {self.name}" - - def with_name(self, name: str) -> 'Binding': - return Binding( - name=name, - nctype=self.nctype, - argument=self.argument, - default=self.default - ) - -# An Expr is a C++ expression. It has a C++ string representing its syntax, -# as well as a CType saying what it provides. - -@dataclass(frozen=True) -class Expr: - expr: str - type: NamedCType - -# A CppSignature represents a single overload in the C++ API. 
For -# any given function schema, there may be multiple CppSignatures -# corresponding to it, based on how we desugar to C++. See also -# CppSignatureGroup. -@dataclass(frozen=True) -class CppSignature: - # The schema this signature is derived from - func: FunctionSchema - - # Is this a C++ signature for a method, i.e. Tensor::my_op(...)? - method: bool - - # Is this a faithful C++ signature (i.e. following the JIT schema) or a convenience API - # (i.e. with a potential TensorOptions argument and out arguments in the front) - faithful: bool - - # The set of C++ arguments which should not have defaults applied to them - cpp_no_default_args: Set[str] - - # Is this a fallback C++ binding? Fallback bindings are enabled by - # manual_cpp_binding: True and are alternate, non-public API that - # lets manual C++ binding implementors access the binding that would - # have been automatically generated - fallback_binding: bool = False - - # Return the unpacked argument structure of this signature, - # discarding information about which arguments are semantically - # related to each other. - def arguments(self) -> Sequence[Binding]: - return cpp.arguments( - self.func.arguments, faithful=self.faithful, - method=self.method, cpp_no_default_args=self.cpp_no_default_args) - - def name(self) -> str: - n = cpp.name(self.func, faithful_name_for_out_overloads=self.faithful) - if self.fallback_binding: - n = f"__dispatch_{n}" - return n - - # Render the C++ declaration for this signature - def decl(self, *, name: Optional[str] = None, prefix: str = "", is_redispatching_fn: bool = False) -> str: - returns_type = cpp.returns_type(self.func.returns).cpp_type() - cpp_args = [a.decl() for a in self.arguments()] - if is_redispatching_fn: - cpp_args = ['c10::DispatchKeySet dispatchKeySet'] + cpp_args - cpp_args_str = ', '.join(cpp_args) - if name is None: - name = prefix + self.name() - return f"{returns_type} {name}({cpp_args_str})" - - # Render the C++ definition for this signature, not including - # the body (with curly braces) - def defn(self, *, name: Optional[str] = None, prefix: str = "", is_redispatching_fn: bool = False) -> str: - returns_type = cpp.returns_type(self.func.returns).cpp_type() - cpp_args = [a.defn() for a in self.arguments()] - if is_redispatching_fn: - cpp_args = ['c10::DispatchKeySet dispatchKeySet'] + cpp_args - cpp_args_str = ', '.join(cpp_args) - if name is None: - name = prefix + self.name() - return f"{returns_type} {name}({cpp_args_str})" - - def ptr_type(self) -> str: - args_types_str = ', '.join(a.type for a in self.arguments()) - return f'{cpp.returns_type(self.func.returns).cpp_type()} (*)({args_types_str})' - - # Return the C++ function type, e.g., something like int(bool) - def type(self) -> str: - args_types_str = ', '.join(a.type for a in self.arguments()) - return f'{cpp.returns_type(self.func.returns).cpp_type()} ({args_types_str})' - - -# Represents group of all CppSignatures associated with a -# FunctionSchema. Right now, that's the regular, user-visible -# signature, as well as a "faithful" signature which doesn't -# have grouping. 
-@dataclass(frozen=True) -class CppSignatureGroup: - func: FunctionSchema - signature: CppSignature - faithful_signature: Optional[CppSignature] - - def most_faithful_signature(self) -> CppSignature: - if self.faithful_signature: - return self.faithful_signature - else: - return self.signature - - @staticmethod - def from_native_function(f: NativeFunction, *, method: bool, fallback_binding: bool = False) -> 'CppSignatureGroup': - func = f.func - faithful_signature: Optional[CppSignature] - if func.arguments.tensor_options is not None or len(func.arguments.out) > 0: - faithful_signature = CppSignature( - func=func, - faithful=True, - method=method, - fallback_binding=fallback_binding, - cpp_no_default_args=f.cpp_no_default_args - ) - else: - faithful_signature = None - signature = CppSignature( - func=func, - faithful=False, - method=method, - fallback_binding=fallback_binding, - cpp_no_default_args=f.cpp_no_default_args - ) - return CppSignatureGroup( - func=func, - signature=signature, - faithful_signature=faithful_signature, - ) - -@dataclass(frozen=True) -class DispatcherSignature: - # The schema this signature is derived from - func: FunctionSchema - - # Allows you to prepend an arbitrary prefix to the signature name. - # This is useful for parts of the codegen that generate wrappers around kernels, - # and need to avoid naming collisions. - prefix: str = "" - - def arguments(self) -> List[Binding]: - return dispatcher.arguments(self.func) - - def name(self) -> str: - return self.prefix + dispatcher.name(self.func) - - def decl(self, name: Optional[str] = None) -> str: - args_str = ', '.join(a.decl() for a in self.arguments()) - if name is None: - name = self.name() - return f"{self.returns_type().cpp_type()} {name}({args_str})" - - def defn(self, name: Optional[str] = None, *, is_redispatching_fn: bool = False) -> str: - args = [a.defn() for a in self.arguments()] - if is_redispatching_fn: - args = ['c10::DispatchKeySet dispatchKeySet'] + args - args_str = ', '.join(args) - if name is None: - name = self.name() - return f"{self.returns_type().cpp_type()} {name}({args_str})" - - def exprs(self) -> List[Expr]: - return [Expr(a.name, a.nctype) for a in self.arguments()] - - def returns_type(self) -> CType: - return dispatcher.returns_type(self.func.returns) - - def ptr_type(self) -> str: - dispatcher_args_types_str = ', '.join(a.type for a in self.arguments()) - return f'{self.returns_type().cpp_type()} (*)({dispatcher_args_types_str})' - - # Return the C++ function type, e.g., something like int(bool) - def type(self) -> str: - dispatcher_args_types_str = ', '.join(a.type for a in self.arguments()) - return f'{self.returns_type().cpp_type()} ({dispatcher_args_types_str})' - - @staticmethod - def from_schema(func: FunctionSchema, *, prefix: str = '') -> 'DispatcherSignature': - return DispatcherSignature(func, prefix) - -@dataclass(frozen=True) -class NativeSignature: - # The schema this signature is derived from - func: FunctionSchema - - prefix: str = "" - - def name(self) -> str: - return self.prefix + native.name(self.func) - - def decl(self, name: Optional[str] = None) -> str: - args_str = ', '.join(a.decl() for a in self.arguments()) - if name is None: - name = self.name() - return f"{native.returns_type(self.func.returns).cpp_type()} {name}({args_str})" - - def defn(self, name: Optional[str] = None) -> str: - args_str = ', '.join(a.defn() for a in self.arguments()) - if name is None: - name = self.name() - return f"{native.returns_type(self.func.returns).cpp_type()} 
{name}({args_str})" - - def ptr_type(self) -> str: - # don't include defaults in type signature! - args_str = ', '.join(a.defn() for a in self.arguments()) - return f'{native.returns_type(self.func.returns).cpp_type()} (*)({args_str})' - - def arguments(self) -> List[Binding]: - return native.arguments(self.func) - - def returns_type(self) -> CType: - return native.returns_type(self.func.returns) - - def dispatcher_exprs(self) -> List[Expr]: - return translate.translate(self.arguments(), dispatcher.arguments(self.func), method=False) - -@dataclass(frozen=True) -class ViewInverseSignature: - # The NativeFunction this signature is derived from - f: NativeFunction - - def name(self) -> str: - return functionalization.name(self.f, functional_op=self.f, is_reverse=True, include_namespace=False) - - def decl(self) -> str: - return_type = functionalization.returns_type(self.f.func) - decls = [a.decl() for a in functionalization.inner_arguments(self.f.func, is_reverse=True)] - return f"static {return_type.cpp_type()} {self.name()}({', '.join(decls)});" - - @staticmethod - def from_func(f: NativeFunction) -> 'ViewInverseSignature': - # Some assertions: lambdas are only used for view ops - assert f.is_view_op - assert not f.func.name.name.inplace # only functional view ops need an inverse (e.g. not transpose_()) - return ViewInverseSignature(f) - -@dataclass(frozen=True) -class FunctionalizationLambda: - # The NativeFunction this signature is derived from - f: NativeFunction - - # The corresponding out-of-place variant of the above NativeFunction - # This only really matters for inplace-view ops. - # e.g. transpose_() -> transpose(). - functional_op: NativeFunction - - # are we generating the forward lambda or the reverse lambda? - is_reverse: bool - - def captures(self) -> List[Expr]: - # The lambda lives inside of a kernel following the dispatcher API, so its outer context is the dispatcher arguments - outer_ctx = dispatcher.arguments(self.f.func) - capture_bindings = functionalization.capture_arguments(self.f.func, is_reverse=self.is_reverse) - # allow_expensive_conversions is set because we want to convert - # some reference types (IntArrayRef) to value types (vector). 
- capture_exprs = translate.translate(outer_ctx, capture_bindings, method=False, allow_expensive_conversions=True) - return capture_exprs - - def decl(self) -> str: - return_type = functionalization.returns_type(self.f.func) - capture_str = ', '.join(f'{val.type.name} = {val.expr}' for val in self.captures()) - decls = [a.decl() for a in functionalization.outer_arguments(is_reverse=self.is_reverse)] - return f"[{capture_str}]({', '.join(decls)}) -> {return_type.cpp_type()}" - - def inner_call(self) -> str: - inner_call_name = functionalization.name( - self.f, functional_op=self.functional_op, is_reverse=self.is_reverse, include_namespace=True) - - arg_ctx = functionalization.outer_arguments(is_reverse=self.is_reverse) - capture_ctx = functionalization.capture_arguments(self.f.func, is_reverse=self.is_reverse) - full_ctx = arg_ctx + capture_ctx - - call_bindings = functionalization.inner_arguments(self.f.func, is_reverse=self.is_reverse) - maybe_index = functionalization.inner_call_index(self.f.func) - call_exprs = [e.expr for e in translate.translate(full_ctx, call_bindings, method=False)] - if not self.is_reverse and maybe_index is not None: - return f'{inner_call_name}({", ".join(call_exprs)})[{maybe_index.name}];' - else: - return f'{inner_call_name}({", ".join(call_exprs)});' - - @staticmethod - def from_func(f: NativeFunction, *, functional_op: NativeFunction, is_reverse: bool) -> 'FunctionalizationLambda': - # Some assertions: lambdas are only used for view ops - assert f.is_view_op - assert functional_op.is_view_op - # functional_op corresponds to the functional-variant of f, and is only actually used if f itself is an inplace_view op. - assert f.func.signature() == functional_op.func.signature() - return FunctionalizationLambda(f, functional_op, is_reverse) - - -# Helper functions - -def kernel_signature( - f: NativeFunction, backend_index: BackendIndex, *, prefix: str = '') -> Union['NativeSignature', 'DispatcherSignature']: - # Note [External Backends Follow Dispatcher API] - # Kernel signatures for in-tree backends follow the "native" API, - # while kernels for out-of-tree backends follow the dispatcher API. - # See the comments in `native.py` for details, but historically there have been - # some small differences in schema convention between them and the Dispatcher API. - # Any differences that require translating between the two will results in a runtime cost, - # so we'd like to keep the differences as small as possible. - # With external backends, we'd like to enforce that they write their kernels with schemas - # that match the Dispatcher API directly, if they can. 
- if backend_index.external: - return DispatcherSignature.from_schema(f.func, prefix=prefix) - else: - return NativeSignature(f.func, prefix) - -# Functions only, no types -from tools.codegen.api import cpp, dispatcher, native, translate, functionalization diff --git a/tools/codegen/code_template.py b/tools/codegen/code_template.py deleted file mode 100644 index 3b0b188834ef..000000000000 --- a/tools/codegen/code_template.py +++ /dev/null @@ -1,91 +0,0 @@ -import re -from typing import Match, Optional, Sequence, Mapping - -# match $identifier or ${identifier} and replace with value in env -# If this identifier is at the beginning of whitespace on a line -# and its value is a list then it is treated as -# block substitution by indenting to that depth and putting each element -# of the list on its own line -# if the identifier is on a line starting with non-whitespace and a list -# then it is comma separated ${,foo} will insert a comma before the list -# if this list is not empty and ${foo,} will insert one after. - - -class CodeTemplate: - # Python 2.7.5 has a bug where the leading (^[^\n\S]*)? does not work, - # workaround via appending another [^\n\S]? inside - - substitution_str = r'(^[^\n\S]*[^\n\S]?)?\$([^\d\W]\w*|\{,?[^\d\W]\w*\,?})' - - # older versions of Python have a bug where \w* does not work, - # so we need to replace with the non-shortened version [a-zA-Z0-9_]* - # https://bugs.python.org/issue18647 - - substitution_str = substitution_str.replace(r'\w', r'[a-zA-Z0-9_]') - - substitution = re.compile(substitution_str, re.MULTILINE) - - pattern: str - filename: str - - @staticmethod - def from_file(filename: str) -> 'CodeTemplate': - with open(filename, 'r') as f: - return CodeTemplate(f.read(), filename) - - def __init__(self, pattern: str, filename: str = "") -> None: - self.pattern = pattern - self.filename = filename - - def substitute(self, env: Optional[Mapping[str, object]] = None, **kwargs: object) -> str: - if env is None: - env = {} - - def lookup(v: str) -> object: - assert env is not None - return kwargs[v] if v in kwargs else env[v] - - def indent_lines(indent: str, v: Sequence[object]) -> str: - return "".join([indent + l + "\n" for e in v for l in str(e).splitlines()]).rstrip() - - def replace(match: Match[str]) -> str: - indent = match.group(1) - key = match.group(2) - comma_before = '' - comma_after = '' - if key[0] == "{": - key = key[1:-1] - if key[0] == ",": - comma_before = ', ' - key = key[1:] - if key[-1] == ',': - comma_after = ', ' - key = key[:-1] - v = lookup(key) - if indent is not None: - if not isinstance(v, list): - v = [v] - return indent_lines(indent, v) - elif isinstance(v, list): - middle = ', '.join([str(x) for x in v]) - if len(v) == 0: - return middle - return comma_before + middle + comma_after - else: - return str(v) - return self.substitution.sub(replace, self.pattern) - - -if __name__ == "__main__": - c = CodeTemplate("""\ - int foo($args) { - - $bar - $bar - $a+$b - } - int commatest(int a${,stuff}) - int notest(int a${,empty,}) - """) - print(c.substitute(args=["hi", 8], bar=["what", 7], - a=3, b=4, stuff=["things...", "others"], empty=[])) diff --git a/tools/codegen/context.py b/tools/codegen/context.py deleted file mode 100644 index ba21c86c7934..000000000000 --- a/tools/codegen/context.py +++ /dev/null @@ -1,67 +0,0 @@ -from tools.codegen.utils import S, T, context -from tools.codegen.model import (NativeFunction, NativeFunctionsGroup, BackendIndex, DispatchKey) -import tools.codegen.local as local - -import functools -from typing 
import TypeVar, Union, Iterator, Callable, Dict -import contextlib - -# Helper functions for defining generators on things in the model - -F = TypeVar( - 'F', - NativeFunction, - NativeFunctionsGroup, - Union[NativeFunction, NativeFunctionsGroup], -) - -@contextlib.contextmanager -def native_function_manager(g: Union[NativeFunctionsGroup, NativeFunction]) -> Iterator[None]: - if isinstance(g, NativeFunctionsGroup): - # By default, we associate all errors with structured native functions - # with the out variant. In some cases, it might be better to have - # a more specific place to hang things; if so, use - # native_function_manager again on the inside - f = g.out - else: - f = g - with context(lambda: f'in native_functions.yaml line {f.loc}:\n {f.func}'): - with local.parametrize(use_const_ref_for_mutable_tensors=f.use_const_ref_for_mutable_tensors): - yield - -# Given a function that operates on NativeFunction, wrap it into a new function -# that sets some appropriate context managers for that native function. -# YOU MUST WRAP FUNCTIONS IN THIS for calls to api modules to be sound -# (you will get an error if we try to access the local variables without having -# set them). -def with_native_function(func: Callable[[F], T]) -> Callable[[F], T]: - @functools.wraps(func) - def wrapper(f: F) -> T: - with native_function_manager(f): - return func(f) - return wrapper - -def method_with_native_function(func: Callable[[S, F], T]) -> Callable[[S, F], T]: - @functools.wraps(func) - def wrapper(slf: S, f: F) -> T: - with native_function_manager(f): - return func(slf, f) - return wrapper - -# Convenience decorator for functions that explicitly take in a BackendIndex, -# instead of indirectly taking one in as a closure -def with_native_function_and_index(func: Callable[[F, BackendIndex], T]) -> Callable[[F, BackendIndex], T]: - @functools.wraps(func) - def wrapper(f: F, backend_index: BackendIndex) -> T: - with native_function_manager(f): - return func(f, backend_index) - return wrapper - -def with_native_function_and_indices( - func: Callable[[F, Dict[DispatchKey, BackendIndex]], T] -) -> Callable[[F, Dict[DispatchKey, BackendIndex]], T]: - @functools.wraps(func) - def wrapper(f: F, backend_indices: Dict[DispatchKey, BackendIndex]) -> T: - with native_function_manager(f): - return func(f, backend_indices) - return wrapper diff --git a/tools/codegen/dest/__init__.py b/tools/codegen/dest/__init__.py deleted file mode 100644 index ce9265adf969..000000000000 --- a/tools/codegen/dest/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from .lazy_ir import LazyIR as LazyIR -from .lazy_ir import GenLazyShapeInferenceDefinition as GenLazyShapeInferenceDefinition -from .lazy_ir import GenLazyNativeFuncDefinition as GenLazyNativeFuncDefinition -from .register_dispatch_key import ( - RegisterDispatchKey as RegisterDispatchKey, - gen_registration_helpers as gen_registration_helpers, - gen_registration_headers as gen_registration_headers, -) -from .native_functions import compute_native_function_declaration as compute_native_function_declaration diff --git a/tools/codegen/dest/lazy_ir.py b/tools/codegen/dest/lazy_ir.py deleted file mode 100644 index d41b4edcd8ac..000000000000 --- a/tools/codegen/dest/lazy_ir.py +++ /dev/null @@ -1,264 +0,0 @@ -from typing import List, Union -from dataclasses import dataclass -from tools.codegen.context import method_with_native_function -from tools.codegen.model import (BackendIndex, NativeFunction, - NativeFunctionsGroup) -from tools.codegen.api.types import (BaseCType, OptionalCType, 
NamedCType, - VectorCType, kernel_signature) -import tools.codegen.api.dispatcher as dispatcher -from tools.codegen.api.lazy import LazyIrSchema, isValueType -from tools.codegen.dest.lazy_ts_lowering import ts_lowering_body - - -def node_ctor_arg_rvalue_string(arg: NamedCType) -> str: - """ - Given a NamedCType from a lazy IR schema, - generate a c++ string for materializing an rvalue of that arg for passing into - a lazy Node constructor. - """ - if isValueType(arg.type): - if isinstance(arg.type, BaseCType): - return f"lazy_{arg.name}.GetIrValue()" - elif isinstance(arg.type, OptionalCType): - return f"lazy_{arg.name} ? " \ - f"c10::make_optional(lazy_{arg.name}.GetIrValue()) : " \ - "c10::nullopt" - else: - raise AssertionError("TODO not sure if there are other valid types to handle here") - else: - if isinstance(arg.type, VectorCType) and isinstance(arg.type.elem, BaseCType): - return f"std::vector<{arg.type.elem.type}>({arg.name}.begin(), {arg.name}.end())" - elif (isinstance(arg.type, OptionalCType) and - isinstance(arg.type.elem, VectorCType) and - isinstance(arg.type.elem.elem, BaseCType)): - return f"torch::lazy::ToOptionalVector<{arg.type.elem.elem.type}>({arg.name})" - else: - return f"{arg.name}" - -def node_ctor_inputs(func: LazyIrSchema) -> str: - """ - Produce a formatted string with the arguments as passed into the constructor of a node class. - """ - node_ctor_values = [node_ctor_arg_rvalue_string(arg) for arg in func.filtered_types()] - return ",\n ".join(node_ctor_values) - - -@dataclass(frozen=True) -class LazyIR: - backend_index: BackendIndex - node_base: str - - @method_with_native_function - def __call__(self, f: Union[NativeFunctionsGroup, NativeFunction]) -> List[str]: - func = f.functional.func if isinstance(f, NativeFunctionsGroup) else f.func - return self.gen(f) - - def gen(self, f: Union[NativeFunctionsGroup, NativeFunction]) -> List[str]: - # for now, we just want one IR class decl and soon after also the method defs - # and we use the functional version not out/inplace. 
- func = f.functional.func if isinstance(f, NativeFunctionsGroup) else f.func - schema = LazyIrSchema(func) - all_types = schema.filtered_types() - value_types = schema.filtered_types(values=True, scalars=False) - scalar_types = schema.filtered_types(values=False, scalars=True) - - node_ctor_args = ", ".join([f"const {i.cpp_type()}& {i.name}" for i in all_types]) - scalar_initializers = ",\n ".join([f"{t.name}_({t.name})" for t in scalar_types]) - comma_if_scalar_initializers = ",\n" if len(scalar_initializers) else "" - scalar_decls = "\n ".join([f"{t.cpp_type()} {t.name}_;" for t in scalar_types]) - scalar_hashes = ", ".join([f"{f.name}" for f in scalar_types]) - base_ctor_value_args_list = [] - optional_values = [] - for t in value_types: - if isinstance(t.type, BaseCType): - base_ctor_value_args_list.append(f"{t.name}") - elif isinstance(t.type, OptionalCType): - base_ctor_value_args_list.append(f"{t.name}.value_or(kNullValue)") - optional_values.append(t.name) - else: - raise AssertionError("TODO not sure if there are other valid types to handle here") - base_ctor_value_args = ", ".join(base_ctor_value_args_list) - has_optional_decls = "\n ".join([f"bool has_{value}: 1;" for value in optional_values]) - has_optional_defs = "\n ".join([f"has_{value} = !!{value};" for value in optional_values]) - members_to_string = [] - for t in scalar_types: - if isinstance(t.type, OptionalCType): - members_to_string.append(f"""if ({t.name}_.has_value()) {{ - ss << ", {t.name}=" << {t.name}_.value(); -}} else {{ - ss << ", {t.name}=null"; -}}""") - else: - members_to_string.append(f'ss << ", {t.name}=" << {t.name}_;') - members_to_string_str = "\n ".join(members_to_string) - - return [f"""\ -// TODO(alanwaketan): Public members don't need to have _ suffix. -class {schema.node_name} : public {self.node_base} {{ - public: - {schema.node_name}({node_ctor_args}, std::vector&& shapes) - : {self.node_base}(torch::lazy::OpKind(at::aten::{schema.aten_name}), - {{{base_ctor_value_args}}}, std::move(shapes), - /* num_outputs */ {len(func.returns)}, - torch::lazy::MHash({scalar_hashes})){comma_if_scalar_initializers} - {scalar_initializers} - - {{ - {has_optional_defs} - }} - - std::string ToString() const override {{ - std::stringstream ss; - ss << TsNode::ToString(); - {members_to_string_str} - return ss.str(); - }} - - torch::lazy::TSOpVector Lower(std::shared_ptr function, - torch::lazy::TSLoweringContext* loctx) const override {{ - {ts_lowering_body(f)} - }} - - {scalar_decls} - {has_optional_decls} - -}}; - -""", ] - - -def lazy_tensor_decls(value_types: List[NamedCType], tensor_class: str) -> str: - lazy_tensor_decls: List[str] = [] - for t in value_types: - if isinstance(t.type, BaseCType): - lazy_tensor_decls.append( - f"{tensor_class} lazy_{t.name} = " - f"GetLtcTensorOrCreateForWrappedNumber({t.name}, *device);") - elif isinstance(t.type, OptionalCType): - # TODO(alanwaketan): Maybe we want to apply GetLtcTensorOrCreateForWrappedNumber here, but hold it - # until we encounter a real world example. 
- lazy_tensor_decls.append( - f" {tensor_class} lazy_{t.name} = TryGetLtcTensor({t.name}.value_or(at::Tensor()));") - else: - raise AssertionError("TODO not sure if there are other valid types to handle here") - return "\n ".join(lazy_tensor_decls) - -@dataclass(frozen=True) -class GenLazyNativeFuncDefinition: - class_method_name: str - backend_index: BackendIndex - tensor_class: str - - @method_with_native_function - def __call__(self, func: NativeFunction) -> List[str]: - sig = kernel_signature(func, self.backend_index) - - # Lazy IR stuff - schema = LazyIrSchema(func.func) - all_types = schema.filtered_types() - value_types = schema.filtered_types(values=True, scalars=False) - scalar_types = schema.filtered_types(values=False, scalars=True) - returns_length = len(schema.returns) - - value_types_names = ", ".join([f"{t.name}" for t in value_types]) - get_device_str = f"""auto device = bridge::GetBackendDevice({value_types_names});""" - lazy_tensor_decls_str = lazy_tensor_decls(value_types, self.tensor_class) - node_ctor_input_str = node_ctor_inputs(schema) - - # call the meta kernel if it exists, to compute output shape/dtype for our IR - if func.structured or func.structured_delegate is not None: - meta_out = """std::vector shapes{Shape(out_meta.scalar_type(), out_meta.sizes().vec())};""" - if returns_length > 1: - def this_shape(i: int) -> str: - return f"Shape(std::get<{i}>(out_meta).scalar_type(), std::get<{i}>(out_meta).sizes().vec())" - shapes_str = ','.join([this_shape(i) for i in range(returns_length)]) - meta_out = "std::vector shapes{" + shapes_str + "};" - - meta_str = f"""auto out_meta = at::meta::{schema.aten_name}({', '.join(str(t.name) for t in all_types)}); - {meta_out}""" - else: - shape_sig = ComputeShapeSignature(func) - meta_str = f""" - auto shapes = {shape_sig.shape_call};""" - meta_str += f""" - TORCH_INTERNAL_ASSERT(shapes.size() == {returns_length});""" - - node_str = f"""auto node = torch::lazy::MakeNode({node_ctor_input_str}, - std::move(shapes));""" - - assert len(value_types) > 0, f"Only supporting tensor ops so far, none found in {sig}" - first_tensor = value_types[0] - bridge_str = f"""auto result = CreateAtenFromLtcTensor(lazy_{first_tensor.name}.CreateFrom(node));""" - if returns_length > 1: - bridge_str = f"""std::vector<{self.tensor_class}> lazy_tensors; - for (int i = 0; i < {returns_length}; i++) {{ - lazy_tensors.push_back(lazy_{first_tensor.name}.CreateFrom(torch::lazy::Value(node, i))); - }} - auto result = TupleAtenFromLtcTensors<{returns_length}>(lazy_tensors);""" - if schema.name.name.inplace: - assert returns_length == 1, "We assumed there was no such case where an op is an in-place variant " \ - "and has tuple outputs." - bridge_str = f"""lazy_{first_tensor.name}.SetInPlaceIrValue(node); - auto& result = {first_tensor.name};""" - - - return [f"""\ - // TODO(alanwaketan): Quite a lot inefficient copy-by-value there. Let's optimize it. - {sig.decl(name=f"{self.class_method_name}::{schema.aten_name}")} {{ - TORCH_LAZY_FN_COUNTER("lazy::"); - {get_device_str} - {lazy_tensor_decls_str} - {meta_str} - {node_str} - {bridge_str} - return result; - }};\n - """] - -class ComputeShapeSignature: - """ - Here we use the base name as the suffix of the signature to avoid generating for in-place variants. 
- """ - @method_with_native_function - def __init__(self, f: NativeFunction): - self.__schema = LazyIrSchema(f.func) - self.__dispatch_args = ', '.join([a.decl() for a in dispatcher.arguments(f.func)]) - self.__call_args = ", ".join([f"{t.name}" for t in self.__schema.filtered_types()]) - - def __decl_suffix(self) -> str: - return f"{self.__schema.base_name}({self.__dispatch_args})" - - def __call_suffix(self) -> str: - return f"{self.__schema.base_name}({self.__call_args})" - - @property - def shape_decl(self) -> str: - return f"std::vector compute_shape_{self.__decl_suffix()}" - - @property - def shape_call(self) -> str: - return f"torch_lazy_tensors::ir::ops::compute_shape_{self.__call_suffix()}" - - -@dataclass(frozen=True) -class GenLazyShapeInferenceDefinition: - backend_index: BackendIndex - tensor_class: str - - @method_with_native_function - def __call__(self, f: NativeFunction) -> List[str]: - sig = kernel_signature(f, self.backend_index) - - # Lazy IR stuff - schema = LazyIrSchema(f.func) - value_types = schema.filtered_types(values=True, scalars=False) - lazy_tensor_decls_str = lazy_tensor_decls(value_types, self.tensor_class) - node_ctor_input_str = node_ctor_inputs(schema) - - # Only generate shape/dtype fn for non-structured kernels, - # since we just use the meta function for structured kernels - if not f.structured and f.structured_delegate is None: - shape_sig = ComputeShapeSignature(f) - return ["\n".join([f"{shape_sig.shape_decl};"])] - else: - return [] diff --git a/tools/codegen/dest/lazy_ts_lowering.py b/tools/codegen/dest/lazy_ts_lowering.py deleted file mode 100644 index 32d505cda7bf..000000000000 --- a/tools/codegen/dest/lazy_ts_lowering.py +++ /dev/null @@ -1,43 +0,0 @@ -from typing import Union -from tools.codegen.model import (NativeFunction, NativeFunctionsGroup) -from tools.codegen.api.lazy import LazyIrSchema, isValueType -from tools.codegen.api.types import OptionalCType - - -def ts_lowering_body(f: Union[NativeFunctionsGroup, NativeFunction]) -> str: - # for now, we just want one IR class decl and soon after also the method defs - # and we use the functional version not out/inplace. - func = f.functional.func if isinstance(f, NativeFunctionsGroup) else f.func - schema = LazyIrSchema(func) - - emplace_arguments = [] - for value in schema.positional_arg_types: - if isValueType(value.type): - if isinstance(value.type, OptionalCType): - emplace_arguments.append(f"has_{value.name} ? 
loctx->GetOutputOp(operand(i++)) : nullptr") - continue - emplace_arguments.append('loctx->GetOutputOp(operand(i++))') - continue - emplace_arguments.append(f'"{value.name}", {value.name}_') - - emplace_arguments_str = "\n ".join( - [f"arguments.emplace_back({a});" for a in emplace_arguments]) - emplace_kwarg_values = [f'loctx->GetOutputOp(operand({i}))' for i in range(len(schema.keyword_values))] - emplace_kwarg_scalars = [f'"{t.name}", {t.name}_' for t in schema.keyword_scalars] - assert len(schema.keyword_values) == 0, "TODO the logic for operand(i) is broken if there are kw values" - emplace_kwarguments = "\n ".join( - [f"kwarguments.emplace_back({a});" for a in emplace_kwarg_values + emplace_kwarg_scalars]) - return f"""\ - std::vector arguments; - std::vector kwarguments; - arguments.reserve({len(emplace_arguments)}); - kwarguments.reserve({len(emplace_kwarg_values + emplace_kwarg_scalars)}); - size_t i = 0; - {emplace_arguments_str} - {emplace_kwarguments} - torch::lazy::TSOpVector {schema.aten_name}_out = torch::lazy::LowerTSBuiltin(function, op().op, arguments, kwarguments); - CHECK_EQ({schema.aten_name}_out.size(), {len(func.returns)}); - - // TODO: need to call GenerateClone sometimes? Or else return LowerBuiltIn() directly - return {schema.aten_name}_out; -""" diff --git a/tools/codegen/dest/native_functions.py b/tools/codegen/dest/native_functions.py deleted file mode 100644 index 5fbb297f9c50..000000000000 --- a/tools/codegen/dest/native_functions.py +++ /dev/null @@ -1,55 +0,0 @@ -from typing import List, Union, Optional - -from tools.codegen.context import with_native_function_and_index -from tools.codegen.utils import mapMaybe -from tools.codegen.model import NativeFunction, NativeFunctionsGroup, BackendIndex -from tools.codegen.api.types import kernel_signature -import tools.codegen.api.meta as meta -import tools.codegen.api.structured as structured - -@with_native_function_and_index -def gen_unstructured(f: NativeFunction, backend_index: BackendIndex) -> Optional[str]: - sig = kernel_signature(f, backend_index) - metadata = backend_index.get_kernel(f) - if metadata is None: - return None - if "legacy::" in metadata.kernel: - return None - else: - prefix = 'static' if backend_index.external else 'TORCH_API' - return f"{prefix} {sig.decl(name=metadata.kernel)};" - -@with_native_function_and_index -def gen_structured(g: NativeFunctionsGroup, backend_index: BackendIndex) -> List[str]: - meta_name = meta.name(g) - out_args = structured.impl_arguments(g) - metadata = backend_index.get_kernel(g) - if metadata is None: - return [] - prefix = '' if backend_index.external else 'TORCH_API ' - return [f"""\ -struct {prefix}structured_{metadata.kernel} : public at::meta::structured_{meta_name} {{ -void impl({', '.join(a.decl() for a in out_args)}); -}}; -"""] - -# Generates NativeFunctions.h, a list of forward declarations of all -# actual kernel definitions we keep in aten/src/ATen/native/ -@with_native_function_and_index -def compute_native_function_declaration( - g: Union[NativeFunctionsGroup, NativeFunction], - backend_index: BackendIndex -) -> List[str]: - metadata = backend_index.get_kernel(g) - if isinstance(g, NativeFunctionsGroup): - if metadata is not None and metadata.structured: - if backend_index.external: - # Structured hasn't been tested with external backends yet. 
- raise AssertionError("Structured external backend functions are not implemented yet.") - else: - return gen_structured(g, backend_index) - else: - return list(mapMaybe(lambda f: gen_unstructured(f, backend_index), g.functions())) - else: - x = gen_unstructured(g, backend_index) - return [] if x is None else [x] diff --git a/tools/codegen/dest/register_dispatch_key.py b/tools/codegen/dest/register_dispatch_key.py deleted file mode 100644 index c555768d08ce..000000000000 --- a/tools/codegen/dest/register_dispatch_key.py +++ /dev/null @@ -1,757 +0,0 @@ -from typing import List, Optional, Union -import itertools -from typing_extensions import Literal -from dataclasses import dataclass -import textwrap - -from tools.codegen.context import method_with_native_function, native_function_manager -from tools.codegen.utils import Target, mapMaybe, assert_never -from tools.codegen.model import (DispatchKey, NativeFunction, - NativeFunctionsGroup, SchemaKind, - TensorOptionsArguments, - DeviceCheckType, Argument, - is_cuda_dispatch_key, BackendIndex, - gets_generated_out_inplace_wrapper) -from tools.codegen.api.types import (BaseCType, Binding, ConstRefCType, - CppSignature, CppSignatureGroup, - Expr, MutRefCType, kernel_signature, - NativeSignature, tensorT, NamedCType, - DispatcherSignature) -import tools.codegen.api.meta as meta -import tools.codegen.api.cpp as cpp -import tools.codegen.api.structured as structured -from tools.codegen.api.translate import translate -from tools.codegen.selective_build.selector import SelectiveBuilder - -def gen_registration_headers( - backend_index: BackendIndex, - per_operator_headers: bool, - rocm: bool, -) -> List[str]: - if per_operator_headers: - headers = ["#include "] - else: - headers = ["#include "] - - if backend_index.dispatch_key in (DispatchKey.CPU, DispatchKey.Meta): - headers.append("#include ") - elif backend_index.dispatch_key == DispatchKey.CUDA: - if rocm: - headers.append("#include ") - else: - headers.append("#include ") - elif per_operator_headers: - headers += [ - "#include ", - "#include "] - else: - headers.append("#include ") - - return headers - -def gen_create_out_helper(backend_index: BackendIndex) -> List[str]: - if backend_index.dispatch_key == DispatchKey.Meta: - empty_options = "options.device(at::kMeta)" - else: - empty_options = "options" - - if backend_index.dispatch_key in ( - DispatchKey.Meta, DispatchKey.CPU, DispatchKey.CUDA): - dispatch = str(backend_index.dispatch_key).lower() - empty_impl = f"at::detail::empty_{dispatch}" - empty_strided_impl = f"at::detail::empty_strided_{dispatch}" - runtime_empty_supported_check = "" - elif backend_index.dispatch_key == DispatchKey.CompositeExplicitAutograd: - empty_impl = "at::empty" - empty_strided_impl = "at::empty_strided" - runtime_empty_supported_check = """\ - if (!c10::detail::backend_supports_empty_operator(options)) {{ - // The main purpose of this CompositeExplicitAutograd kernel is to provide - // a "free" implementation of out-of-place operators. - // If a backend hasn't implemented an out-of-place op but has implemented - // the out= variant, then this kernel will call their out= variant. - // It does that by using at::empty() to create the tensor to pass to the out= variant though, - // so this "default" kernel doesn't actually handle backends that don't support at::empty - // (e.g. quantized backends). - // Returning an undefined tensor here allows us to reach the out= kernel and give a better error. 
- // Longer term, this could be better fixed by https://github.com/pytorch/pytorch/issues/52680 - return at::Tensor(); - }} -""" - else: - return [] - - return [f""" -Tensor create_out(IntArrayRef sizes, IntArrayRef strides, const TensorOptions &options) {{ - {runtime_empty_supported_check} - if (strides.empty()) {{ - return {empty_impl}(sizes, {empty_options}); - }} else {{ - return {empty_strided_impl}(sizes, strides, {empty_options}); - }} -}} -"""] - - -def gen_resize_out_helper(backend_index: BackendIndex) -> List[str]: - return [""" -void resize_out(const Tensor &out, IntArrayRef sizes, IntArrayRef strides, const TensorOptions &options) { - TORCH_CHECK(options.dtype() == out.dtype(), - "Expected out tensor to have dtype ", options.dtype(), ", but got ", out.dtype(), " instead"); - TORCH_CHECK(options.device() == out.device(), - "Expected out tensor to have device ", options.device(), ", but got ", out.device(), " instead"); - const bool resized = at::native::resize_output(out, sizes); - // Only restride if a resize occurred; otherwise we ignore the (advisory) - // strides from the meta function and directly use the output tensor's - // preexisting strides - if (resized) { - if (!strides.empty()) { - TORCH_INTERNAL_ASSERT(!options.memory_format_opt().has_value()); - at::native::as_strided_(out, sizes, strides); - } else if (options.memory_format_opt().has_value()) { - out.unsafeGetTensorImpl()->empty_tensor_restride(*options.memory_format_opt()); - } - } -} -"""] - -def gen_check_inplace_helper(backend_index: BackendIndex) -> List[str]: - return [""" -void check_inplace(const Tensor &self, IntArrayRef sizes, const TensorOptions &options) { - // These checks are needed on those operators that: - // 1) don't use 'TensorIterator' (e.g. 'addmm' and 'baddbmm') - // 2) have particular typing rules (e.g. 'cumsum' and 'cumprod') - // For other operators (e.g. 'add'), 'TensorIterator' already checks - // these things separately. - TORCH_CHECK(options.dtype() == self.dtype(), - "Bad in-place call: ", - "input tensor dtype ", self.dtype(), " and output tensor dtype ", options.dtype(), " should match"); - TORCH_CHECK(options.device() == self.device(), - "Bad in-place call: ", - "input tensor device ", self.device(), " and output tensor device ", options.device(), " should match"); - TORCH_CHECK(sizes == self.sizes(), - "Bad in-place call: ", - "input tensor size ", self.sizes(), " and output tensor size ", sizes, " should match"); -} -"""] - - -def gen_registration_helpers(backend_index: BackendIndex) -> List[str]: - return [ - *gen_create_out_helper(backend_index), - *gen_resize_out_helper(backend_index), - *gen_check_inplace_helper(backend_index) - ] - - -# Generates Register{dispatch}.cpp (e.g., RegisterCPU.cpp). -# -# - The primary function of this file is to register all of the -# implementations for the given dispatch key to the dispatcher, -# so they are available for use in PyTorch. If dispatch is -# None, we generate schema (def) registrations and catchall -# registrations. -# - The secondary function of this file is to generate a wrapper -# around functions. In CPUType these wrappers do nothing -# (and should be removed), but in other cases they handle -# DeviceGuard. A small extra benefit of wrappers is they -# are not overloaded, so they can be used in the registration -# API without having to disambiguate which overload you want -# (as would be the case if you directly registered native:: -# functions). 
-# - The tertiary function of this file is to generate *static* -# cpp API bindings which can be used to bypass dispatcher -# directly to kernels, but with user-friendly cpp-style API -@dataclass(frozen=True) -class RegisterDispatchKey: - backend_index: BackendIndex - - target: Union[ - Literal[Target.ANONYMOUS_DEFINITION], - Literal[Target.NAMESPACED_DEFINITION], - Literal[Target.NAMESPACED_DECLARATION], - Literal[Target.REGISTRATION] - ] - - # Selector object to determine which operators to generate - # registration code for. - selector: SelectiveBuilder - - # Whether or not we are actually code-genning for ROCm - rocm: bool - - # The namespace that the kernels are written in. This is just `at::native` for in-tree kernels. - cpp_namespace: str - - # The class that all unstructured native functions live under. This is used to improve - # compiler error messages when a kernel writer adds a native function with the wrong signature. - # This is only used in unstructured kernels, since structured kernels already live in a class. - # Finally, this field is currently Optional because it is only used by external backends. - # It would be nice if we can add the same logic to in-tree kernels too, but that requires updating - # all of the existing kernel signatures scattered across aten/src/ATen/native. - class_method_name: Optional[str] - - @staticmethod - def gen_device_check(type: DeviceCheckType, args: List[Argument], method_name: str) -> str: - if type == DeviceCheckType.NoCheck: - return ' // No device check\n' - - device_check = 'c10::optional common_device = nullopt;\n' - device_check += '(void)common_device; // Suppress unused variable warning\n' - for arg in args: - # Only tensor like arguments are eligible - if arg.type.is_tensor_like(): - device_check += f""" - c10::impl::check_and_update_common_device(common_device, {arg.name}, "{method_name}", "{arg.name}");""" - return device_check - - @method_with_native_function - def __call__(self, f: Union[NativeFunctionsGroup, NativeFunction]) -> List[str]: - if isinstance(f, NativeFunctionsGroup): - g: NativeFunctionsGroup = f - # Note: We call gen_structured() if the operator is marked structured, regardless of the backend. - # gen_structured() has special logic to handle auto-generated kernels. - if g.structured: - return self.gen_structured(g) - else: - return list(mapMaybe(lambda f: self.gen_unstructured(f, g), g.functions())) - elif isinstance(f, NativeFunction): - r = self.gen_unstructured(f) - return [] if r is None else [r] - else: - assert_never(f) - - def wrapper_kernel_sig(self, f: NativeFunction) -> Union[NativeSignature, DispatcherSignature]: - # The prefix is just to ensure uniqueness. The Dispatcher API doesn't guarantee unique kernel names. 
- return kernel_signature(f, self.backend_index, prefix=f'wrapper_{f.func.name.overload_name}_') - - def gen_out_inplace_wrapper(self, f: NativeFunction, g: Optional[NativeFunctionsGroup]) -> Optional[str]: - if g is None: - return None - k = f.func.kind() - if k is SchemaKind.inplace: - copy_op = 'at::_copy_from' - elif k is SchemaKind.out: - copy_op = 'at::_copy_from_and_resize' - else: - raise AssertionError("gen_out_inplace_wrapper called on a functional op") - - sig = self.wrapper_kernel_sig(f) - name = sig.name() - - func_res = f'{name}_tmp' - return_names = cpp.return_names(f) - if len(return_names) > 1: - updates = '\n '.join( - f'{copy_op}(std::get<{i}>({func_res}), {ret_name});' - for i, ret_name in enumerate(return_names)) - returns = f'{sig.returns_type().cpp_type()}({", ".join(return_names)})' - else: - ret_name = return_names[0] - updates = f'{copy_op}({func_res}, {ret_name});' - returns = ret_name - - functional_sig = self.wrapper_kernel_sig(g.functional) - wrapper_name = sig.name() - - return f"""\ -{sig.defn(name=wrapper_name)} {{ - auto {func_res} = {functional_sig.name()}({", ".join(e.expr for e in translate(sig.arguments(), functional_sig.arguments()))}); - {updates} - return {returns}; -}} -""" - - def gen_structured(self, g: NativeFunctionsGroup) -> List[str]: - metadata = self.backend_index.get_kernel(g) - if self.backend_index.dispatch_key == DispatchKey.Meta: - assert not self.backend_index.has_kernel(g.out), \ - "Do not explicitly specify Meta dispatch key on structured " \ - "functions, they will be automatically generated for you" - elif self.backend_index.dispatch_key == DispatchKey.CompositeExplicitAutograd: - assert not self.backend_index.has_kernel(g.out), \ - "Do not explicitly specify CompositeExplicitAutograd dispatch key on structured " \ - "functions, they will be automatically generated for you" - elif metadata is None or not metadata.structured: - return list(mapMaybe(lambda f: self.gen_unstructured(f, g), g.functions())) - - structured_gen = StructuredRegisterDispatchKey( - self.backend_index, - self.target, - self.selector, - self.rocm, - self.cpp_namespace, - self.class_method_name, - g - ) - return list(mapMaybe(structured_gen.gen_one, g.functions())) - - def gen_unstructured(self, f: NativeFunction, g: Optional[NativeFunctionsGroup] = None) -> Optional[str]: - with native_function_manager(f): - inplace_meta = False - gets_out_inplace_wrapper = False - if not self.backend_index.has_kernel(f): - if (self.backend_index.dispatch_key == DispatchKey.Meta and - f.func.kind() is SchemaKind.inplace and - # Defer to composites for meta implementation - not f.has_composite_kernel and - # Inplace list operations are not supported - len(f.func.returns) == 1): - inplace_meta = True - elif (not self.backend_index.use_out_as_primary and - g is not None - and gets_generated_out_inplace_wrapper(f, g, self.backend_index)): - # We want to generate inplace/out wrappers, that don't have a kernel for the backend. 
- gets_out_inplace_wrapper = True - else: - return None - if f.manual_kernel_registration: - return None - - if self.target is Target.REGISTRATION and not self.selector.is_native_function_selected(f): - return None - - sig = self.wrapper_kernel_sig(f) - - name = sig.name() - returns_type = sig.returns_type().cpp_type() - args = sig.arguments() - args_str = ', '.join(a.defn() for a in args) - - # See Note [Direct dispatch bindings] - cpp_sig_group = CppSignatureGroup.from_native_function(f, method=False, fallback_binding=False) - - if self.target is Target.NAMESPACED_DECLARATION: - result = f"TORCH_API {cpp_sig_group.signature.decl()};\n" - if cpp_sig_group.faithful_signature is not None: - result += f"TORCH_API {cpp_sig_group.faithful_signature.decl()};\n" - return result - elif self.target is Target.NAMESPACED_DEFINITION: - def generate_defn(cpp_sig: CppSignature) -> str: - return f""" -{cpp_sig.defn()} {{ -return {sig.name()}({', '.join(e.expr for e in translate(cpp_sig.arguments(), sig.arguments()))}); -}} -""" - result = generate_defn(cpp_sig_group.signature) - if cpp_sig_group.faithful_signature is not None: - result += generate_defn(cpp_sig_group.faithful_signature) - return result - elif self.target is Target.ANONYMOUS_DEFINITION: - # short circuit for inplace_meta - if inplace_meta: - assert f.func.arguments.self_arg is not None - self_arg_name = f.func.arguments.self_arg.argument.name - # TODO: handle in place on tensor list - return f""" -{returns_type} {name}({args_str}) {{ - TORCH_CHECK_NOT_IMPLEMENTED({self_arg_name}.is_meta(), - "Cannot inplace into non-meta tensor with meta tensor argument"); - return {self_arg_name}; -}} -""" - - # short circuit for generated inplace/out wrappers - if gets_out_inplace_wrapper: - return self.gen_out_inplace_wrapper(f, g) - - metadata = self.backend_index.get_kernel(f) - if metadata is None: - return None - if self.class_method_name is None: - impl_name = f"{self.cpp_namespace}::{metadata.kernel}" - else: - impl_name = f"{self.cpp_namespace}::{self.class_method_name}::{metadata.kernel}" - - args_exprs_str = ', '.join(a.name for a in args) - - device_check = ' // No device check\n' - # Backends that require device guards presumably also require device checks. - if self.backend_index.device_guard: - device_check_args = itertools.chain( - f.func.arguments.out, - f.func.arguments.flat_positional - ) - device_check = RegisterDispatchKey.gen_device_check(f.device_check, list(device_check_args), name) - - device_guard = "// DeviceGuard omitted" # default - if f.device_guard and self.backend_index.device_guard: - has_tensor_options = any(isinstance(a.argument, TensorOptionsArguments) for a in args) - if has_tensor_options: - # kernel is creating a tensor - device_guard = """ - const DeviceGuard device_guard(device_or_default(device));""" - - # CUDA requires special handling - if is_cuda_dispatch_key(self.backend_index.dispatch_key): - device_guard = f"globalContext().lazyInitCUDA();\n{device_guard}" - else: - # kernel is operating on existing tensors - - # There is precedence for which argument we use to do - # device guard. This describes the precedence order. 
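To make the device-guard precedence concrete before the selection code that follows, here is a small standalone sketch of the same ordering (self argument first, then out arguments, then the remaining positionals); the argument names and is_tensor_like flags below are invented purely for illustration.

    import itertools

    # (name, is_tensor_like) pairs standing in for the real Argument objects.
    self_arg = [("self", True)]
    out_args = [("out", True)]
    flat_positional = [("alpha", False), ("other", True)]

    # Same shape as the generator's logic: the first tensor-like candidate wins.
    candidate_args = itertools.chain(self_arg, out_args, flat_positional)
    device_of = next((name for name, is_tensor in candidate_args if is_tensor), None)
    print(device_of)  # 'self' -- so the OptionalDeviceGuard is built from self's device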
- self_arg = [f.func.arguments.self_arg.argument] if f.func.arguments.self_arg is not None else [] - candidate_args = itertools.chain( - self_arg, - f.func.arguments.out, - f.func.arguments.flat_positional - ) - - # Only tensor like arguments are eligible - device_of = next((f'{a.name}' for a in candidate_args if a.type.is_tensor_like()), None) - if device_of is not None: - device_guard = f"const OptionalDeviceGuard device_guard(device_of({device_of}));" - - return f"""\ -namespace {{ - -{returns_type} {name}({args_str}) {{ - {device_check} - - {device_guard} - return {impl_name}({args_exprs_str}); -}} - -}} // anonymous namespace -""" - - elif self.target is Target.REGISTRATION: - if f.manual_kernel_registration: - return None - else: - payload = f"TORCH_FN({name})" - return f'm.impl("{f.func.name}",\n{payload});\n' - else: - assert_never(self.target) - - -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# -# STRUCTURED -# -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # - -@dataclass(frozen=True) -class StructuredRegisterDispatchKey(RegisterDispatchKey): - g: NativeFunctionsGroup - - def gen_class_set_output(self, k: SchemaKind, parent_class: str, generate_super: bool) -> str: - if generate_super: - set_output_super = f"{parent_class}::set_output(output_idx, sizes, strides, options, names);" - else: - set_output_super = "" - maybe_star = "*" if k is SchemaKind.functional else "" - return f""" -void set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, - TensorOptions options, DimnameList names) override {{ -{textwrap.indent(self.gen_class_set_output_body(k), " ")} - if (!names.empty()) {{ - namedinference::propagate_names({maybe_star}outputs_[output_idx], names); - }} - // super must happen after, so that downstream can use maybe_get_output - // to retrieve the output -{textwrap.indent(set_output_super, " ")} -}} -""" - - def gen_class_set_output_body(self, k: SchemaKind) -> str: - if self.backend_index.dispatch_key in [DispatchKey.CUDA, DispatchKey.CompositeExplicitAutograd]: - maybe_set_guard = """ -auto current_device = guard_.current_device(); -if (C10_UNLIKELY(current_device.has_value())) { - TORCH_INTERNAL_ASSERT(*current_device == options.device(), - "structured kernels don't support multi-device outputs"); -} else { - guard_.reset_device(options.device()); -} -""" - maybe_set_guard_line = maybe_set_guard + "\n" - else: - maybe_set_guard_line = maybe_set_guard = '' - - if k is SchemaKind.functional: - assert self.backend_index.dispatch_key in ( - DispatchKey.Meta, DispatchKey.CPU, DispatchKey.CUDA, - DispatchKey.CompositeExplicitAutograd) - return f"""{maybe_set_guard_line} -outputs_[output_idx] = create_out(sizes, strides, options);""" - elif k is SchemaKind.inplace: - return f"""{maybe_set_guard_line} -const auto& out = outputs_[output_idx].get(); -check_inplace(out, sizes, options);""" - elif k is SchemaKind.out: - return f"""{maybe_set_guard_line} -const auto& out = outputs_[output_idx].get(); -resize_out(out, sizes, strides, options);""" - else: - assert_never(k) - - # returns the definition of a ctor, as well as how to construct - # this class to a variable named op - def gen_class_ctor(self, k: SchemaKind, class_name: str, returns: int) -> str: - if k is SchemaKind.functional: - return "" - elif k is SchemaKind.inplace: - # TODO: Make sure out argument is guaranteed to be self - return f"{class_name}(Tensor& self) : outputs_{{std::ref(self)}} {{}}" - elif k is SchemaKind.out: - out_args = ', '.join(f"Tensor& out{i}" 
for i in range(returns)) - out_refs = ', '.join(f"std::ref(out{i})" for i in range(returns)) - return f"{class_name}({out_args}) : outputs_{{ {out_refs} }} {{}}" - else: - assert_never(k) - - def gen_class( - self, f: NativeFunction, k: SchemaKind, *, class_name: str, parent_class: str, generate_super: bool - ) -> str: - maybe_star = '' - if k is SchemaKind.functional: - output_type = "c10::ExclusivelyOwned" - maybe_star = '*' - elif k is SchemaKind.inplace: - output_type = "std::reference_wrapper" - elif k is SchemaKind.out: - output_type = "std::reference_wrapper" - - if self.backend_index.dispatch_key == DispatchKey.CUDA: - if self.rocm: - guard_field = 'c10::hip::OptionalHIPGuardMasqueradingAsCUDA guard_;' - else: - guard_field = 'c10::cuda::OptionalCUDAGuard guard_;' - elif self.backend_index.dispatch_key == DispatchKey.CompositeExplicitAutograd: - guard_field = 'c10::OptionalDeviceGuard guard_;' - else: - guard_field = '' - - indent = " " * 4 - class_ctor_str = self.gen_class_ctor(k, class_name, len(f.func.returns)) - lines = ( - f"struct {class_name} final : public {parent_class} {{", - f"{textwrap.indent(class_ctor_str, indent)}", - f"{textwrap.indent(self.gen_class_set_output(k, parent_class, generate_super), indent)}", - " const Tensor& maybe_get_output(int64_t output_idx) override {", - f" return {maybe_star}outputs_[output_idx];", - " }", - f" std::array<{output_type}, {len(f.func.returns)}> outputs_;", - f"{textwrap.indent(guard_field, indent)}", - "};" - ) - return '\n'.join(line for line in lines if line) - - @method_with_native_function - def gen_one(self, f: NativeFunction) -> Optional[str]: - assert not f.manual_kernel_registration - - if self.target is Target.REGISTRATION and not self.selector.is_native_function_selected(f): - return None - - # TODO: Now, there is something interesting going on here. In the code below, - # we generate CompositeExplicitAutograd implementations of functional and inplace - # based on the out implementation. But in fact, out is definable by - # functional too (just not very efficiently), and this is honestly the - # MORE likely situation for a backend implementor. How do we pick? - # Well, taking a page from Haskell type classes and default methods, - # we could conceivably register a circular definition (out in terms - # of functional, and functional in terms of out) and just require - # someone to implement one or the other. We'd have to do a little bit - # of work to not register one of these "weak" definitions unless there - # is a strong definition somewhere in the DAG! So it's not implemented yet. - if self.backend_index.dispatch_key == DispatchKey.CompositeExplicitAutograd and f.func.kind() is SchemaKind.out: - # Never generate a default implementation for out, that's what you - # have to define as a backend implementor - return None - - # Note [Direct dispatch bindings] - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # Signature of the non-dispatched function we'll expose in a header - # (e.g., at::cpu::add). We don't generate methods (TODO: do this - # when CPUTensor class is a thing); nor do we generate fallback - # bindings for manual_cpp_binding functions. 
- cpp_sig_group = CppSignatureGroup.from_native_function(f, method=False, fallback_binding=False) - - # Signature of the wrapper function we'll register to the dispatcher - sig = NativeSignature(f.func, prefix="wrapper_") - - if self.target is Target.NAMESPACED_DECLARATION: - result = f"TORCH_API {cpp_sig_group.signature.decl()};\n" - if cpp_sig_group.faithful_signature is not None: - result += f"TORCH_API {cpp_sig_group.faithful_signature.decl()};\n" - return result - - elif self.target is Target.NAMESPACED_DEFINITION: - def generate_defn(cpp_sig: CppSignature) -> str: - return f""" -{cpp_sig.defn()} {{ -return {sig.name()}({', '.join(e.expr for e in translate(cpp_sig.arguments(), sig.arguments()))}); -}} -""" - result = generate_defn(cpp_sig_group.signature) - if cpp_sig_group.faithful_signature is not None: - result += generate_defn(cpp_sig_group.faithful_signature) - return result - - elif self.target is Target.ANONYMOUS_DEFINITION: - - k = f.func.kind() - - # Construct the body of the wrapper function with signature sig - sig_body = [] - # We'll use context to keep track of any variables we've brought - # into scope while generating code - context: List[Union[Binding, Expr]] = list(sig.arguments()) - - # Initialize the class corresponding to this structured - # operator; feeding it the output argument(s) if it is known - if self.backend_index.dispatch_key is DispatchKey.Meta: - class_name = f"structured_{meta.name(self.g)}_meta_{k.name}" - parent_class = f"at::meta::structured_{meta.name(self.g)}" - elif self.backend_index.dispatch_key is DispatchKey.CompositeExplicitAutograd: - # TODO: dedup this branch - class_name = f"structured_{meta.name(self.g)}_default_backend_{k.name}" - parent_class = f"at::meta::structured_{meta.name(self.g)}" - else: - metadata = self.backend_index.get_kernel(self.g) - assert metadata is not None - class_name = f"structured_{metadata.kernel}_{k.name}" - parent_class = f"{self.cpp_namespace}::structured_{metadata.kernel}" - - if self.backend_index.device_guard: - device_check_args = itertools.chain( - f.func.arguments.out, - f.func.arguments.flat_positional - ) - sig_body.append(RegisterDispatchKey.gen_device_check(f.device_check, list(device_check_args), sig.name())) - - if k is SchemaKind.functional: - sig_body.append(f"{class_name} op;") - elif k is SchemaKind.inplace: - sig_body.append(f"{class_name} op(self);") - elif k is SchemaKind.out: - out_args_str = ', '.join(a.name for a in f.func.arguments.out) - sig_body.append(f"{class_name} op({out_args_str});") - - # Translate the input native arguments into structured - # arguments for the meta call - meta_exprs = ', '.join( - e.expr for e in translate( - context, - structured.meta_arguments(self.g), - method=False - ) - ) - - if self.g.out.precomputed: - # If this function group has precomputed elements, the meta function - # returns a struct containing them which must be saved so that it - # can be unpacked when generating code to call the impl. - sig_body.append(f"auto precompute = op.meta({meta_exprs});") - - # Put all of the contents of the precompute struct into the context - # so that translate will be able to return the correct args for the - # call to the impl. 
- precomputed_values = [*self.g.out.precomputed.replace.values(), self.g.out.precomputed.add] - for precomputed_elems in precomputed_values: - for arg in precomputed_elems: - context.append(Expr( - expr=f"precompute.{arg.name}", - type=structured.argument_type(arg, binds=arg.name), - )) - - # Add a use of the precompute struct so FB internal compilers don't - # complain that there is an unused variable. - sig_body.append("(void)precompute;") - else: - sig_body.append(f"op.meta({meta_exprs});") - - - # After running meta, op.outputs_ is guaranteed to be valid; - # add it to the context - out_args = structured.out_arguments(self.g) - maybe_star = '*' if k is SchemaKind.functional else '' - for i, out_arg in enumerate(out_args): - assert ConstRefCType(BaseCType(tensorT)) == out_arg.nctype.type - context.append(Expr( - expr=f"{maybe_star}op.outputs_[{i}]", - # TODO: Stop hardcoding that the output type is a Tensor. Note - # that for the codegen here this is fine because outputs_ is - # hardcoded to be tensor already - type=NamedCType(out_arg.nctype.name, MutRefCType(BaseCType(tensorT))) - )) - - # With the expanded context, do the impl call (if not a meta - # function) - if self.backend_index.dispatch_key == DispatchKey.CompositeExplicitAutograd: - # TODO: https://github.com/pytorch/pytorch/issues/53023 - out_sig_group = CppSignatureGroup.from_native_function( - self.g.out, method=False, fallback_binding=f.manual_cpp_binding) - out_sig = out_sig_group.most_faithful_signature() - api_name = out_sig.name() - out_exprs = ', '.join( - e.expr for e in translate( - context, - out_sig.arguments(), - method=False - ) - ) - # TODO: I think this means structured won't work with method - # only functions (but maybe you're saved by faithful? iunno.) - # NB: Originally I wrote this as an at::redispatch call, but - # I got in trouble because that meant I needed a DispatchKeySet - # in the wrapper function, which meant I needed a DispatchKeySet - # in the DispatchKeyFunctions declarations, but the defined API - # there does NOT permit a dispatch key set. 
I think you can - # probably unwind this by calling some function to do the TLS - # fetch and get the DispatchKeySet when you don't have it, but - # I didn't do it for this version - sig_body.append(f"at::{api_name}({out_exprs});") - elif self.backend_index.dispatch_key != DispatchKey.Meta: - impl_exprs = ', '.join( - e.expr for e in translate( - context, - structured.impl_arguments(self.g), - method=False - ) - ) - sig_body.append(f"op.impl({impl_exprs});") - - # Destructively return the final tensors - # TODO: Do this in translate instead - if k is SchemaKind.functional: - if len(f.func.returns) == 1: - ret_expr = "std::move(op.outputs_[0]).take()" # small optimization - else: - moved = ', '.join(f"std::move(op.outputs_[{i}]).take()" for i in range(len(f.func.returns))) - ret_expr = f"std::make_tuple({moved})" - elif k is SchemaKind.inplace: - ret_expr = "self" - elif k is SchemaKind.out: - if len(f.func.returns) == 1: - ret_expr = f.func.arguments.out[0].name - else: - refs = ', '.join(a.name for a in f.func.arguments.out) - ret_expr = f"std::forward_as_tuple({refs})" - sig_body.append(f"return {ret_expr};") - - sig_body_str = "\n".join(sig_body) - - # For an overview of what this template code looks like, see - # https://github.com/pytorch/rfcs/pull/9 - return f"""\ -{self.gen_class( -f, k, -class_name=class_name, -parent_class=parent_class, -generate_super=self.g.out.structured_inherits is not None -)} - -{sig.defn()} {{ -{sig_body_str} -}} -""" - - elif self.target is Target.REGISTRATION: - return f'm.impl("{f.func.name}", TORCH_FN({sig.name()}));' - else: - assert_never(self.target) - # Silence mypy's "Missing return statement" error - return None diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py deleted file mode 100644 index 1c2c83029d0a..000000000000 --- a/tools/codegen/gen.py +++ /dev/null @@ -1,1704 +0,0 @@ -import os -from typing import List, Dict, Optional, Tuple, Set, Any, Union, Sequence, TypeVar -from typing_extensions import Literal -import yaml -from collections import OrderedDict, defaultdict, namedtuple -import argparse -import pathlib -import json -from dataclasses import dataclass - -from tools.codegen.model import (Argument, DispatchKey, FunctionSchema, - Location, NativeFunction, - NativeFunctionsGroup, OperatorName, - BackendIndex, BackendMetadata, - OptionalType, SchemaKind, SelfArgument, - TensorOptionsArguments, Type, Variant, - is_cuda_dispatch_key, - is_generic_dispatch_key, - Tag, BaseOperatorName) -from tools.codegen.api.types import (Binding, CppSignature, CppSignatureGroup, - DispatcherSignature, NativeSignature) -from tools.codegen.api import cpp -import tools.codegen.api.dispatcher as dispatcher -import tools.codegen.api.native as native -import tools.codegen.api.meta as meta -import tools.codegen.api.structured as structured -from tools.codegen.api.translate import translate -from tools.codegen.selective_build.selector import SelectiveBuilder -from tools.codegen.utils import ( - Target, concatMap, context, mapMaybe, YamlDumper, YamlLoader, FileManager, assert_never -) -from tools.codegen.context import (method_with_native_function, - native_function_manager, - with_native_function_and_indices, - with_native_function) -import tools.codegen.dest as dest -from tools.codegen.gen_functionalization_type import ( - needs_functionalization, - gen_functionalization_definition, - gen_functionalization_registration, - gen_functionalization_view_inverse_declaration -) - -T = TypeVar('T') - -# Welcome to the ATen code generator v2! 
The ATen code generator is -# responsible for parsing native_functions.yaml and then generating -# various generated files (e.g., TypeDefault.cpp) based on the operators -# defined in this file. This means that the code generator knows how to -# parse function schema, and then translate this into various C++ types -# and boilerplate code. -# -# Some things to know about this file when you modify it: -# -# - This file has STRICT mypy typechecking. Typecheck it with -# `mypy --config mypy-strict.ini` in the root source directory -# -# - Most of the heavy lifting lives in external modules: -# - 'model' has the data model for native_functions.yaml. The classes -# in those file represent what you see when you look at -# a native_functions.yaml -# - 'api' has conversions for how to translate JIT schema into -# the various C++ APIs that the codegen interacts with. There -# are in fact THREE different C++ APIs: the public C++ API, -# the dispatcher API, and the legacy disaptcher API. See each -# of these respective files for more information - -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# -# HELPER FUNCTIONS -# -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # - -# A custom loader for YAML to let us also keep track of line numbers -# of each entry in the YAML file -class LineLoader(YamlLoader): - def construct_mapping(self, node, deep=False): # type: ignore[no-untyped-def] - mapping = super().construct_mapping(node, deep=deep) # type: ignore[no-untyped-call] - # Add 1 so line numbering starts at 1 - mapping['__line__'] = node.start_mark.line + 1 - return mapping - -_GLOBAL_PARSE_NATIVE_YAML_CACHE = {} - -# Parse native_functions.yaml into a sequence of NativeFunctions and Backend Indices. -ParsedYaml = namedtuple('ParsedYaml', ['native_functions', 'backend_indices']) -def parse_native_yaml(path: str) -> ParsedYaml: - global _GLOBAL_PARSE_NATIVE_YAML_CACHE - if path not in _GLOBAL_PARSE_NATIVE_YAML_CACHE: - with open(path, 'r') as f: - es = yaml.load(f, Loader=LineLoader) - assert isinstance(es, list) - rs: List[NativeFunction] = [] - bs: Dict[DispatchKey, Dict[OperatorName, BackendMetadata]] = defaultdict(dict) - for e in es: - assert isinstance(e.get('__line__'), int), e - loc = Location(path, e['__line__']) - funcs = e.get('func') - with context(lambda: f'in {loc}:\n {funcs}'): - func, m = NativeFunction.from_yaml(e, loc) - rs.append(func) - BackendIndex.grow_index(bs, m) - error_check_native_functions(rs) - # Default dict is to prevent the codegen from barfing when we have a dispatch key that has no kernels yet. - indices: Dict[DispatchKey, BackendIndex] = defaultdict(lambda: BackendIndex( - dispatch_key=DispatchKey.Undefined, - use_out_as_primary=True, - external=False, - device_guard=False, - index={})) - for k, v in bs.items(): - # All structured in-tree operators are implemented in terms of their out operator. - indices[k] = BackendIndex( - dispatch_key=k, - use_out_as_primary=True, - external=False, - # Only cuda-like devices in tree require device guards - device_guard=is_cuda_dispatch_key(k), - index=v) - _GLOBAL_PARSE_NATIVE_YAML_CACHE[path] = ParsedYaml(rs, indices) - - return _GLOBAL_PARSE_NATIVE_YAML_CACHE[path] - -# Some assertions are already performed during parsing, but those are only within a single NativeFunction. -# Assertions here are meant to be performed across NativeFunctions. 
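As an aside on the LineLoader above: the same line-tracking trick works with a plain yaml.SafeLoader, which makes it easy to try outside the repo. The snippet below is a standalone sketch (the two sample entries are invented); PyYAML records a 0-based start_mark.line per node, hence the +1.

    import yaml

    class LineTrackingLoader(yaml.SafeLoader):
        def construct_mapping(self, node, deep=False):
            mapping = super().construct_mapping(node, deep=deep)
            mapping['__line__'] = node.start_mark.line + 1  # 1-based, like an editor
            return mapping

    doc = ("- func: add.Tensor(Tensor self, Tensor other) -> Tensor\n"
           "- func: mul.Tensor(Tensor self, Tensor other) -> Tensor\n")
    for entry in yaml.load(doc, Loader=LineTrackingLoader):
        print(entry['__line__'], entry['func'])
    # 1 add.Tensor(Tensor self, Tensor other) -> Tensor
    # 2 mul.Tensor(Tensor self, Tensor other) -> Tensor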
-def error_check_native_functions(funcs: Sequence[NativeFunction]) -> None: - func_map: Dict[OperatorName, NativeFunction] = {} - base_func_map: Dict[BaseOperatorName, List[NativeFunction]] = defaultdict(list) - for f in funcs: - func_map[f.func.name] = f - base_func_map[f.func.name.name].append(f) - for f in funcs: - if f.structured_delegate is not None: - delegate_func = func_map[f.structured_delegate] - assert delegate_func.structured, \ - f"{f.func.name} is marked as a structured_delegate pointing to " \ - f"{f.structured_delegate}, but {f.structured_delegate} is not marked as structured. " \ - f"Consider adding 'structured=True' to the delegated operator" - if f.tag is not None and f.tag is Tag.inplace_view: - base_name = f.func.name.name - overload_name = f.func.name.overload_name - assert base_name.inplace, \ - f"{f.func.name} is marked with tag: inplace_view, but it doesn't follow the naming " \ - "convention for inplace ops - the codegen expects the base name to have a trailing underscore. " - out_of_place_base_name = BaseOperatorName(base_name.base, False, base_name.dunder_method) - assert len(base_func_map[out_of_place_base_name]) > 0, \ - f"{f.func.name} is marked with tag: inplace_view. The codegen expects there to be a corresponding " \ - f"out-of-place view op with the name '{base_name}' and matching schema, but it didn't find one. " - - -def cpp_string(s: str) -> str: - """Convert a python string into a c++ string literal """ - s = s.replace('\\', '\\\\') - s = s.replace('"', '\\"') - s = s.replace('\a', '\\a') - s = s.replace('\b', '\\b') - s = s.replace('\f', '\\f') - s = s.replace('\n', '\\n') - s = s.replace('\v', '\\v') - s = s.replace('\t', '\\t') - return f'"{s}"' - -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# -# C++ CODE GENERATION -# -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # - -# Most functions in this section are curried: they consist of a function -# that takes some parameters (e.g., what is to be generated) which itself -# returns a function that actually maps NativeFunction to the code -# to be generated. This pattern makes it convenient to use map, concatMap -# and similar functional combinators. - -def static_dispatch_keys(backend: Optional[BackendIndex]) -> List[DispatchKey]: - if backend is None: - return [] - else: - return [ - backend.dispatch_key, - DispatchKey.CompositeImplicitAutograd, - DispatchKey.CompositeExplicitAutograd - ] - -def get_static_dispatch_backend(f: NativeFunction, backend_index: BackendIndex) -> Optional[DispatchKey]: - if (f.structured_delegate is not None or backend_index.has_kernel(f)): - # TODO: for ops with structured_delegate it should check the dispatch table of - # the out variant instead. For now, these structured ops all have CPU/CUDA kernels - # so we always dispatch to the `backend`, but this could be wrong when we - # migrate math/default_backend ops to use structured delegate. 
- return backend_index.dispatch_key - elif f.has_composite_explicit_autograd_kernel: - return DispatchKey.CompositeExplicitAutograd - elif f.has_composite_implicit_autograd_kernel: - return DispatchKey.CompositeImplicitAutograd - return None - - -def static_dispatch_ops_header( - f: NativeFunction, - backend_index: Optional[BackendIndex]) -> Optional[str]: - if backend_index is None or f.manual_kernel_registration: - return None - - dispatch_key = get_static_dispatch_backend(f, backend_index) - return (f'#include ' - if dispatch_key is not None else None) - - -def static_dispatch_extra_headers(backend: Optional[BackendIndex], skip_tensor_include: bool = False) -> List[str]: - if skip_tensor_include: - # See Note [Avoiding Include Cycles In Static Dispatch] - maybe_inl = '_inl' - else: - maybe_inl = '' - return [f'#include ' - for dispatch_key in static_dispatch_keys(backend)] - - -def static_dispatch( - f: NativeFunction, cpp_sig: CppSignature, - *, method: bool, backend_index: Optional[BackendIndex] -) -> Optional[str]: - if backend_index is None or f.manual_kernel_registration: - return None - target_sig = CppSignatureGroup.from_native_function(f, method=False, fallback_binding=False).signature - name = target_sig.name() - exprs = translate(cpp_sig.arguments(), target_sig.arguments(), method=method) - exprs_str = ', '.join(a.expr for a in exprs) - - dispatch_key = get_static_dispatch_backend(f, backend_index) - if dispatch_key is not None: - return f'return at::{dispatch_key.lower()}::{name}({exprs_str});' - - return f'TORCH_CHECK(false, "Static dispatch does not support {name} for {backend_index.dispatch_key}.");' - -# Generates RegisterSchema.cpp. Depending on the selector, either -# all schemas are registered, or only some are (in the case of -# selective build) -@dataclass(frozen=True) -class RegisterSchema: - selector: SelectiveBuilder - - @method_with_native_function - def __call__(self, f: NativeFunction) -> Optional[str]: - if not self.selector.is_native_function_selected(f): - return None - return f'm.def({cpp_string(str(f.func))});\n' - -# Generates Operators.h and Operators.cpp. -# These provide macros that, given an operator and overload name, allow users -# to access an "un-overloaded" function version of the operator. This -# is useful for extension writers who want to (1) want to decltype the operator -# and (2) don't want to worry about method-only operators. -@dataclass(frozen=True) -class ComputeOperators: - target: Union[ - Literal[Target.DECLARATION], - Literal[Target.DEFINITION] - ] - - @method_with_native_function - def __call__(self, f: NativeFunction) -> str: - sig = DispatcherSignature.from_schema(f.func) - name = f.func.name.unambiguous_name() - call_method_name = 'call' - redispatch_method_name = 'redispatch' - - if self.target is Target.DECLARATION: - # Note [The ATen Operators API] - # The ATen Operators API lives in the at::_ops namespace, and contains compile-time - # metadata about each operator + entry points into the Dispatcher. - # The C++ function, method, and redispatch API's are all implemented as wrappers - # into various bits of the structs defined here. - # - # Important characteristics about the Operators API: - # (1) It follows the Dispatcher API. - # This is kind of necessary to avoid overhead. - # For example: if it followed the C++ API, then all of the faithful C++ factory functions - # would need to wrap their arguments into TensorOptions only to unwrap them again. - # (2) Overload names are disambiguated. 
- # This is helpful for pytorch extenders who would like to decltype() an aten operator, - # that has overloads, e.g. decltype(at::_ops::mul_Tensor::call) - # (3) No argument defaulting is allowed. - # This is more of an implementation detail to avoid #include cycles, - # since TensorBody.h (which defines the Tensor class) needs to include this file. - # (4) manual_cpp_bindings and faithful names are not included in the API. - # This applies to stuff like __dispatch__is_complex(), and add_outf(). - # These aren't "real aten ops", they're just additional functions provided by the C++ API. - # They're implemented as wrappers in Functions.h that call into the actual operators - # defined here, i.e. at::_ops::is_complex::call() and at::_ops::add_out::call(). - # This means that ATEN_OP(is_complex) will not fastpath, and will go through the dispatcher. - return f""" -struct TORCH_API {name} {{ - using schema = {sig.type()}; - using ptr_schema = schema*; - // See Note [static constexpr char* members for windows NVCC] - STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::{f.func.name.name}") - STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "{f.func.name.overload_name}") - STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, {cpp_string(str(f.func))}) - static {sig.defn(name=call_method_name, is_redispatching_fn=False)}; - static {sig.defn(name=redispatch_method_name, is_redispatching_fn=True)}; -}};""" - elif self.target is Target.DEFINITION: - defns = f""" -STATIC_CONST_STR_OUT_OF_LINE_FOR_WIN_CUDA({name}, name, "aten::{f.func.name.name}") -STATIC_CONST_STR_OUT_OF_LINE_FOR_WIN_CUDA({name}, overload_name, "{f.func.name.overload_name}") -STATIC_CONST_STR_OUT_OF_LINE_FOR_WIN_CUDA({name}, schema_str, {cpp_string(str(f.func))}) - -// aten::{f.func} -static C10_NOINLINE c10::TypedOperatorHandle<{name}::schema> create_{name}_typed_handle() {{ - return c10::Dispatcher::singleton() - .findSchemaOrThrow({name}::name, {name}::overload_name) - .typed<{name}::schema>(); -}} -""" - - for is_redispatching_fn in [False, True]: - if is_redispatching_fn: - dispatcher_exprs_str = ', '.join(['dispatchKeySet'] + [a.name for a in sig.arguments()]) - dispatcher_call = 'redispatch' - method_name = f'{name}::{redispatch_method_name}' - else: - dispatcher_exprs_str = ', '.join([a.name for a in sig.arguments()]) - dispatcher_call = 'call' - method_name = f'{name}::{call_method_name}' - - defns += f""" -// aten::{f.func} -{sig.defn(name=method_name, is_redispatching_fn=is_redispatching_fn)} {{ - static auto op = create_{name}_typed_handle(); - return op.{dispatcher_call}({dispatcher_exprs_str}); -}} -""" - return defns - else: - assert_never(self.target) - - -# Generates Function.h, which provides the functional public C++ API, -# and the scaffolding to call into the dispatcher from these functions. 
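For readers skimming the Note [The ATen Operators API] above: the unambiguous struct names are built from the base name plus the overload name, which is what makes decltype(at::_ops::mul_Tensor::call) spellable. The real rule lives in tools.codegen.model; the one-liner below is only a sketch of the naming shape.

    def unambiguous_name(base: str, overload: str) -> str:
        # e.g. ('mul', 'Tensor') -> 'mul_Tensor'; no overload name -> just the base name
        return f"{base}_{overload}" if overload else base

    print(unambiguous_name("mul", "Tensor"))  # mul_Tensor -> at::_ops::mul_Tensor::call
    print(unambiguous_name("add", "out"))     # add_out    -> at::_ops::add_out::call
    print(unambiguous_name("relu", ""))       # relu       -> at::_ops::relu::call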
-@dataclass(frozen=True) -class ComputeFunction: - static_dispatch_backend_index: Optional[BackendIndex] - - @method_with_native_function - def __call__(self, f: NativeFunction) -> Optional[str]: - if Variant.function not in f.variants: - return None - - sig_group = CppSignatureGroup.from_native_function(f, method=False, fallback_binding=f.manual_cpp_binding) - - def generate_defn(faithful: bool) -> str: - if faithful: - sig = sig_group.faithful_signature - assert sig is not None - else: - sig = sig_group.signature - - # See Note [The ATen Operators API] - target_sig = DispatcherSignature.from_schema(f.func) - exprs = translate(sig.arguments(), target_sig.arguments()) - exprs_str = ', '.join([e.expr for e in exprs]) - - static_dispatch_block = static_dispatch(f, sig, method=False, backend_index=self.static_dispatch_backend_index) - if static_dispatch_block is None: - return f""" -// aten::{f.func} -TORCH_API inline {sig.decl()} {{ - return at::_ops::{f.func.name.unambiguous_name()}::call({exprs_str}); -}} -""" - else: - return f""" -// aten::{f.func} -TORCH_API inline {sig.decl()} {{ - {static_dispatch_block} -}} -""" - result = generate_defn(False) - if sig_group.faithful_signature is not None: - result += generate_defn(True) - - return result - -# Generates TensorBody.h. This file provides the object-oriented (method-based) -# public C++ API, and the scaffolding to call into the dispatcher from these functions. -@dataclass(frozen=True) -class ComputeTensorMethod: - target: Union[ - Literal[Target.DECLARATION], - Literal[Target.DEFINITION] - ] - static_dispatch_backend_index: Optional[BackendIndex] - - @method_with_native_function - def __call__(self, f: NativeFunction) -> Optional[str]: - if Variant.method not in f.variants: - return None - - assert not f.func.is_out_fn() - assert f.func.arguments.self_arg is not None - - sig_group = CppSignatureGroup.from_native_function(f, method=True, fallback_binding=f.manual_cpp_binding) - - if self.target is Target.DECLARATION: - result = f"{sig_group.signature.decl()} const;\n" - if sig_group.faithful_signature is not None: - result += f"{sig_group.faithful_signature.decl()} const;\n" - return result - - if self.target is not Target.DEFINITION: - assert_never(self.target) - - def generate_defn(faithful: bool) -> str: - if faithful: - sig = sig_group.faithful_signature - assert sig is not None - else: - sig = sig_group.signature - - target_sig = DispatcherSignature.from_schema(f.func) - exprs = translate(sig.arguments(), target_sig.arguments(), method=True) - exprs_str = ', '.join([e.expr for e in exprs]) - - static_dispatch_block = static_dispatch(f, sig, method=True, backend_index=self.static_dispatch_backend_index) - if static_dispatch_block is None: - return f""" -// aten::{f.func} -inline {sig.defn(prefix="Tensor::")} const {{ - return at::_ops::{f.func.name.unambiguous_name()}::call({exprs_str}); -}} -""" - else: - return f""" -// aten::{f.func} -inline {sig.defn(prefix="Tensor::")} const {{ - {static_dispatch_block} -}} -""" - - result = generate_defn(faithful=False) - if sig_group.faithful_signature is not None: - result += generate_defn(faithful=True) - - return result - -# Generates RedispatchFunctions.h. -# This is similar to the C++ API defined in Functions.h, but provides access -# to the dispatcher's redispatch API. -@dataclass(frozen=True) -class ComputeRedispatchFunction: - - @method_with_native_function - def __call__(self, f: NativeFunction) -> Optional[str]: - # We unconditionally generate function variants of the redispatch API. 
- # This is mainly because we can namespace functions separately, but not methods, - sig_group = CppSignatureGroup.from_native_function(f, method=False, fallback_binding=f.manual_cpp_binding) - - def generate_defn(faithful: bool) -> str: - if faithful: - sig = sig_group.faithful_signature - assert sig is not None - else: - sig = sig_group.signature - - target_sig = DispatcherSignature.from_schema(f.func) - exprs = translate(sig.arguments(), target_sig.arguments()) - exprs_str = ', '.join(['dispatchKeySet'] + [a.expr for a in exprs]) - - return f""" -// aten::{f.func} -TORCH_API inline {sig.decl(is_redispatching_fn=True)} {{ - return at::_ops::{f.func.name.unambiguous_name()}::redispatch({exprs_str}); -}} -""" - result = generate_defn(False) - if sig_group.faithful_signature is not None: - result += generate_defn(True) - - return result - - -# Generates ATenOpList.cpp, a runtime accessible list of all aten -# operators. -# TODO: This was historically used to help some JIT interop code -# figure out whether or not to treat aten namespace'd operators -# one way or another, we should reevaluate if this is actually needed. -@with_native_function -def compute_aten_op(f: NativeFunction) -> str: - return f'{{"aten::{f.func.name.name}", "{f.func.name.overload_name}"}},' - -# Generates MetaFunctions.h -def compute_meta_function_declaration(g: NativeFunctionsGroup) -> Optional[str]: - if not g.structured: - return None - with native_function_manager(g.out): - name = meta.name(g) - args = structured.meta_arguments(g) - args_str = ', '.join(a.decl() for a in args) - parent_class = g.out.structured_inherits - if parent_class is None: - parent_class = "at::impl::MetaBase" - meta_return = "void" - precomputed = g.out.precomputed if g.structured else None - - if precomputed: - # Generate the template declaration with one bool parameter for each - # precomputed element. Each parameter is true if the corresponding (in - # terms of position) precomputed element has been set. - precomputed_values = [*precomputed.replace.values(), precomputed.add] - precomputed_elements = [elem for replace_list in precomputed_values for elem in replace_list] - precomputed_template_parameters = [elem.name.upper() for elem in precomputed_elements] - precomputed_template_params_str = ", ".join(f"bool {param} = false" for param in precomputed_template_parameters) - precompute_template_decl = f"template <{precomputed_template_params_str}>" - - # Generate a string containing declarations of all precomputed elements. - precomputed_elements_with_cpp_types = [ - structured.argument_type(elem, binds=elem.name) - for elem in precomputed_elements - ] - - precomputed_elements_decl = ";\n".join( - f"{elem.cpp_type(strip_ref=True)} {elem.name}" for elem in precomputed_elements_with_cpp_types - ) - - # Generate "setter" methods for each precomputed element. Each method will return - # a new instance of precompute_out with the template parameter that corresponds to - # the member set by the method to true (to indicate that it has been set). - setter_methods = [] - for i, elem in enumerate(precomputed_elements): - # Generate the signature. The return type will be the same - # as the type of `this` but with the template parameter - # corresponding to the element set by this method set to true. - # The assert generated below will ensure that this template - # parameter is false on the type of `this`. 
- return_ty_templates = ", ".join( - precomputed_template_parameters[:i] + ["true"] + precomputed_template_parameters[i + 1:] - ) - return_ty = f"precompute_out<{return_ty_templates}>" - elem_cpp_ty = precomputed_elements_with_cpp_types[i].cpp_type(strip_ref=True) - signature = f"{return_ty} set_{elem.name}({elem_cpp_ty} value)" - - # Generate an assert which checks that the - # template parameter corresponding to the precomputed - # element that is set by this method is false on the - # class corresponding to the object that `this` points to. - # This ensures that each element can be set only once. - assert_msg = f"\"{precomputed_elements[i].name} already set\"" - assert_stmt = f"static_assert({precomputed_template_parameters[i]} == false, {assert_msg});" - - # Generate the new object construction block. All state - # except the element that this method sets is copied from the - # object that `this` points to. The value for the element that - # the method sets is taken from a method parameter. - construction_stmts = [] - construction_stmts.append(f"{return_ty} ret;") - - for j, elem in enumerate(precomputed_elements): - if i == j: - construction_stmts.append(f"ret.{elem.name} = value;") - else: - construction_stmts.append(f"ret.{elem.name} = this->{elem.name};") - - construction_stmts.append("return ret;") - construction_block = "\n".join(construction_stmts) - - setter_methods.append(f""" - {signature} {{ - {assert_stmt} - {construction_block} - }} - """) - setter_methods_decl = "\n".join(setter_methods) - - # Meta should return an instance of the struct containing the precomputed elements. - meta_return_template_params = ", ".join(["true"] * len(precomputed_template_parameters)) - # This typedef (actually a using statement) is needed so that TORCH_META_FUNC can reuse the return - # type (which has a variable number of template parameters). - meta_return_typedef = f"using meta_return_ty = precompute_out <{meta_return_template_params}>;" - meta_return = "meta_return_ty" - precomputed_decl = f""" - {precompute_template_decl} - struct TORCH_API precompute_out {{ - {setter_methods_decl} - {precomputed_elements_decl}; - }};""" - else: - meta_return_typedef = "" - precomputed_decl = "" - - return f"""\ -struct TORCH_API structured_{name} : public {parent_class} {{ - {precomputed_decl} - {meta_return_typedef} - {meta_return} meta({args_str}); -}}; -""" - - -def needs_backend_select(f: NativeFunction, selector: SelectiveBuilder) -> bool: - name = str(f.func.name.name) - if name.endswith('_like') or name.startswith('new_'): - return False - if f.func.arguments.tensor_options is None: - return False - return selector.is_native_function_selected(f) - - -# Generates RegisterBackendSelect.cpp, a series of kernels which provide -# specialized computation of dispatch key for operator signatures which cannot -# be easily done automatically using templating. -@dataclass(frozen=True) -class ComputeBackendSelect: - target: Union[ - Literal[Target.DEFINITION], - Literal[Target.REGISTRATION] - ] - - # Selector object to determine which operators to generate - # registration code for. 
- selector: SelectiveBuilder - - @method_with_native_function - def __call__(self, f: NativeFunction) -> Optional[str]: - if not needs_backend_select(f, self.selector): - return None - - name = native.name(f.func) - native_sig = NativeSignature(f.func) - - native_tensor_args = [ - a for a in native_sig.arguments() - if isinstance(a.argument, Argument) and a.argument.type.is_tensor_like() - ] - - dispatcher_sig = DispatcherSignature.from_schema(f.func) - - sig: Union[NativeSignature, DispatcherSignature] - sig = dispatcher_sig - dispatcher_exprs = dispatcher_sig.exprs() - dispatch_key = "c10::computeDispatchKey(dtype, layout, device)" - - if self.target is Target.DEFINITION: - # I don't think there's actually a good reason to generate - # these two cases differently - # The first case could probably be improved though- it calls computeDispatchKeySet(), - # which looks at TLS dispatch keys- there should not be any by the time we reach backend select. - if native_tensor_args: - tensor_args = ', '.join(a.name for a in native_tensor_args) - compute_dk = f"""\ -DispatchKeySet _dk_set = c10::DispatchKeySet({dispatch_key}) | c10::detail::multi_dispatch_key_set({tensor_args}); - DispatchKeySet _dk_mask = c10::DispatchKeySet(DispatchKeySet::FULL_AFTER, DispatchKey::BackendSelect); - DispatchKeySet _dk = c10::impl::computeDispatchKeySet(_dk_set, _dk_mask);""" - else: - compute_dk = f"DispatchKeySet _dk = c10::DispatchKeySet({dispatch_key});" - return f"""\ -// aten::{f.func} -C10_ALWAYS_INLINE -{sig.defn(name)} {{ - {compute_dk} - return at::_ops::{f.func.name.unambiguous_name()}::redispatch( - _dk, {', '.join(a.expr for a in dispatcher_exprs)}); -}} -""" - elif self.target is Target.REGISTRATION: - return f"""m.impl("aten::{f.func.name}", TORCH_FN({name}));""" - else: - assert_never(self.target) - -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# -# YAML CODE GENERATION -# -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # - -def format_yaml(data: object) -> str: - # Ignore alias in Dumper - YamlDumper.ignore_aliases = lambda self, data: True # type: ignore[assignment] - - # Support serializing OrderedDict - def dict_representer(dumper: Any, data: Any) -> Any: - return dumper.represent_dict(data.items()) - YamlDumper.add_representer(OrderedDict, dict_representer) # type: ignore[no-untyped-call] - # Some yaml parsers (e.g. Haskell's) don't understand line breaks. - # width=1e9 turns off optional line breaks and improves - # the portability of the outputted yaml. - return yaml.dump(data, default_flow_style=False, Dumper=YamlDumper, width=1e9) # type: ignore[no-any-return] - -# For some reason, some defaults we write to YAML are written as native -# YAML objects, rather than doing them uniformly as strings. This -# function detects those cases and converts them into native Python -# objects. -def pythonify_default(s: str) -> object: - if s == 'true': - return True - elif s == 'false': - return False - - try: - return int(s) - except ValueError: - try: - return float(s) - except ValueError: - return s - -# What is a dynamic type? Over time, the semantic meaning of -# dynamic type has degraded to meaninglessness (in the old days, -# it captured dtype-ness of types, but that has gone away with -# the removal of TH). These days, it's mostly the same thing as -# the C++ API argument type, except that Tensor and Tensor? -# arguments simply present as Tensor. 
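A quick behaviour check of pythonify_default defined above, restated standalone so it runs in isolation (the sample default strings are invented):

    def pythonify_default(s: str):
        if s == 'true':
            return True
        if s == 'false':
            return False
        try:
            return int(s)
        except ValueError:
            try:
                return float(s)
            except ValueError:
                return s

    for raw in ('true', 'false', '1', '1e-05', 'at::kLong'):
        print(repr(raw), '->', repr(pythonify_default(raw)))
    # 'true' -> True, 'false' -> False, '1' -> 1, '1e-05' -> 1e-05, 'at::kLong' -> 'at::kLong'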
-# -# TODO: Get rid of dynamic_type, after getting tools/autograd -# to use the new codegen framework -def dynamic_type(t: Type) -> str: - if isinstance(t, OptionalType): - return dynamic_type(t.elem) - # Note we don't use t.is_tensor_like() here because it would - # also include Tensor[] - if str(t) == 'Tensor': - return 'at::Tensor' - return cpp.argumenttype_type(t, mutable=False, binds='__placeholder__').cpp_type() - -def compute_method_of_yaml(variants: Set[Variant]) -> List[str]: - # This is written out explicitly to ensure that Tensor and - # namespace are put into the list in the right order - method_of = ['Type'] - if Variant.method in variants: - method_of.append('Tensor') - if Variant.function in variants: - method_of.append('namespace') - return method_of - -def compute_returns_yaml(f: NativeFunction) -> Tuple[List[Dict[str, str]], Dict[str, str]]: - # Note [name and field_name] - # ~~~~~~~~~~~~~~~~~~~~~~~~~~ - # To understand name_to_field_name, we must first talk about this - # schema: - # - # lstsq.X(Tensor self, Tensor A, *, Tensor(a!) X, Tensor(b!) qr) -> (Tensor(a!) solution, Tensor(b!) QR) - # - # There is something very odd about this schema: it is an out - # variant of the function (that is to say, it will convert into - # at::lstsq_out() in the C++ API), but the names of the output - # return arguments don't match the keyword argument names of - # the inputs. It TURNS OUT that in this situation, the historical - # Declarations.yaml we want to output is this (abbreviated to - # only show relevant fields): - # - # arguments: - # ... - # - field_name: solution - # name: X - # - field_name: QR - # name: qr - # ... - # - # returns: - # - field_name: solution - # name: X - # - field_name: QR - # name: qr - # - # The name of the return fields is stored in 'field_name', and the - # name of the arguments is stored in 'name'. So when we process - # arguments, we need a way to get at the corresponding return. At - # the moment, this is most conveniently done by constructing a - # mapping from name (the argument concept) to field_name (the - # return concept) while processing return arguments, since we don't - # directly maintain this correspondence in the modeling of function - # schema itself. 
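A concrete rendering of the mapping described in Note [name and field_name], using the lstsq.X schema from the comment (argument and field names are taken from the Note; the dict itself is just an illustration of what compute_returns_yaml builds):

    # out arguments of lstsq.X paired positionally with its named returns
    out_arg_names = ['X', 'qr']
    return_field_names = ['solution', 'QR']

    name_to_field_name = dict(zip(out_arg_names, return_field_names))
    print(name_to_field_name)   # {'X': 'solution', 'qr': 'QR'}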
- # - # See also https://github.com/pytorch/pytorch/issues/43114 - name_to_field_name: Dict[str, str] = {} - - # Compute the returns field of the YAML entry - names = cpp.return_names(f) - returns = [] - for i, (r, name) in enumerate(zip(f.func.returns, names)): - ret = { - 'dynamic_type': dynamic_type(r.type), - 'name': name, - 'type': cpp.return_type(r).cpp_type(), - } - - if r.name: - # See Note [name and field_name] - ret['field_name'] = r.name - if f.func.is_out_fn(): - name_to_field_name[f.func.arguments.out[i].name] = r.name - - returns.append(ret) - - return returns, name_to_field_name - -# arguments in yaml roughly corresponds to the public C++ API -def compute_cpp_argument_yaml(cpp_a: Binding, *, schema_order: bool, kwarg_only_set: Set[str], - out_arg_set: Set[str], name_to_field_name: Dict[str, str]) -> object: - if isinstance(cpp_a.argument, TensorOptionsArguments): - arg: Dict[str, object] = { - 'annotation': None, - 'dynamic_type': 'at::TensorOptions', - 'is_nullable': False, - 'name': cpp_a.name, - 'type': cpp_a.type, - 'kwarg_only': True, - } - if cpp_a.default is not None: - arg['default'] = cpp_a.default - return arg - elif isinstance(cpp_a.argument, SelfArgument): - raise AssertionError() - elif isinstance(cpp_a.argument, Argument): - return compute_argument_yaml( - cpp_a.argument, schema_order=schema_order, - kwarg_only_set=kwarg_only_set, out_arg_set=out_arg_set, name_to_field_name=name_to_field_name) - -def compute_argument_yaml(a: Argument, *, schema_order: bool, kwarg_only_set: Set[str], - out_arg_set: Set[str], name_to_field_name: Dict[str, str]) -> object: - arg: Dict[str, object] = { - 'annotation': str(a.annotation) if a.annotation else None, - 'dynamic_type': dynamic_type(a.type), - 'is_nullable': a.type.is_nullable(), - 'name': a.name, - 'type': cpp.argument_type(a, binds="__placeholder__").cpp_type(), - } - if a.default is not None: - arg['default'] = pythonify_default(cpp.default_expr(a.default, a.type)) - if a.name in kwarg_only_set: - arg['kwarg_only'] = True - if a.name in out_arg_set: - arg['output'] = True - arg['allocate'] = True - # See Note [name and field_name] - if a.name in name_to_field_name: - arg['field_name'] = name_to_field_name[a.name] - # Historically, booleans don't get their size recorded, because it - # is already built into the cpp type (e.g., std::array) - l = a.type.is_list_like() - if l is not None and l.size is not None and str(l.elem) != 'bool': - arg['size'] = l.size - return arg - -@with_native_function -def compute_declaration_yaml(f: NativeFunction) -> object: - returns, name_to_field_name = compute_returns_yaml(f) - - # These sets are used to conveniently test if an argument is a - # kwarg-only or out argument - kwarg_only_set = set(a.name for a in f.func.arguments.flat_kwarg_only) - out_arg_set = set(a.name for a in f.func.arguments.out) - - sig_group = CppSignatureGroup.from_native_function(f, method=False, fallback_binding=False) - cpp_args = sig_group.signature.arguments() - arguments = [ - compute_cpp_argument_yaml( - cpp_a, schema_order=False, - kwarg_only_set=kwarg_only_set, out_arg_set=out_arg_set, name_to_field_name=name_to_field_name) - for cpp_a in cpp_args - ] - - schema_order_jit_arguments = list(f.func.schema_order_arguments()) - - schema_order_arguments = [ - compute_argument_yaml( - a, schema_order=True, - kwarg_only_set=kwarg_only_set, out_arg_set=out_arg_set, name_to_field_name=name_to_field_name) - for a in schema_order_jit_arguments - ] - - cpp_schema_order_types = [ - # NB: method here doesn't matter - 
r.type for a in schema_order_jit_arguments - for r in cpp.argument( - a, method=False, cpp_no_default_args=set(), faithful=False, has_tensor_options=False) - ] - - cpp_returns = cpp.returns_type(f.func.returns).cpp_type() - schema_order_cpp_signature = f"{cpp_returns} ({', '.join(cpp_schema_order_types)})" - - is_factory_method = any(isinstance(a.argument, TensorOptionsArguments) for a in cpp_args) \ - and Variant.method not in f.variants - - return OrderedDict([ - ('name', cpp.name(f.func)), - ('operator_name', str(f.func.name.name)), - ('overload_name', str(f.func.name.overload_name)), - ('manual_kernel_registration', f.manual_kernel_registration), - ('category_override', f.category_override if f.category_override is not None else ''), - ('schema_string', f'aten::{f.func}'), - ('arguments', arguments), - ('schema_order_cpp_signature', schema_order_cpp_signature), - ('schema_order_arguments', schema_order_arguments), - ('method_of', compute_method_of_yaml(f.variants)), - ('mode', 'native'), - ('python_module', '' if f.python_module is None else f.python_module), - ('returns', returns), - ('inplace', f.func.name.name.inplace), - ('is_factory_method', is_factory_method), - ('abstract', f.is_abstract), - ('device_guard', f.device_guard), - ('with_gil', False), - ('deprecated', False), - ('has_math_kernel', f.has_composite_implicit_autograd_kernel), - ]) - -# See Note [Auto generated composite kernels] -def has_autogenerated_composite_kernel(f: NativeFunction) -> bool: - return (f.structured or f.structured_delegate is not None) and \ - (f.func.kind() == SchemaKind.functional or f.func.kind() == SchemaKind.inplace) - -@with_native_function_and_indices -def compute_registration_declarations(f: NativeFunction, backend_indices: Dict[DispatchKey, BackendIndex]) -> str: - name = dispatcher.name(f.func) - returns_type = dispatcher.returns_type(f.func.returns).cpp_type_registration_declarations() - args = dispatcher.arguments(f.func) - args_str = ', '.join(a.no_default().decl_registration_declarations() for a in args) - comment_data : Dict[str, str] = { - 'schema': f'aten::{f.func}', - # TODO: What exactly is the semantics of the 'dispatch' field? 
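compute_declaration_yaml above pins down both the keys and the key order of each Declarations.yaml entry. Purely to illustrate that shape, here is a hypothetical and heavily abbreviated entry written as a Python OrderedDict; the operator and all values are made up, not real generated output:

from collections import OrderedDict

example_entry = OrderedDict([
    ('name', 'my_op_out'),                      # hypothetical operator
    ('operator_name', 'my_op'),
    ('overload_name', 'out'),
    ('manual_kernel_registration', False),
    ('category_override', ''),
    ('schema_string', 'aten::my_op.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)'),
    ('arguments', ['...']),                     # per-argument dicts, elided
    ('schema_order_cpp_signature', 'at::Tensor & (const at::Tensor &, at::Tensor &)'),
    ('schema_order_arguments', ['...']),        # elided
    ('method_of', ['Type', 'namespace']),
    ('mode', 'native'),
    ('python_module', ''),
    ('returns', ['...']),                       # elided
    ('inplace', False),
    ('is_factory_method', False),
    ('abstract', True),
    ('device_guard', True),
    ('with_gil', False),
    ('deprecated', False),
    ('has_math_kernel', False),
])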
- 'dispatch': str({k for k, v in backend_indices.items() if v.has_kernel(f)} != {DispatchKey.CompositeImplicitAutograd}), - 'default': str(f.has_composite_kernel or has_autogenerated_composite_kernel(f)) - } - return f"""{returns_type} {name}({args_str}); // {json.dumps(comment_data)} -""" - -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# -# RUN IT ALL -# -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # - -def get_custom_build_selector( - provided_op_registration_allowlist: Optional[List[str]], - op_selection_yaml_path: Optional[str]) -> SelectiveBuilder: - assert not ( - provided_op_registration_allowlist is not None and - op_selection_yaml_path is not None), ( - "Both provided_op_registration_allowlist and " + - "op_selection_yaml_path can NOT be provided at the " + - "same time.") - - op_registration_allowlist: Optional[Set[str]] = None - if provided_op_registration_allowlist is not None: - op_registration_allowlist = set(provided_op_registration_allowlist) - - if op_registration_allowlist is not None: - selector = SelectiveBuilder.from_legacy_op_registration_allow_list( - op_registration_allowlist, - True, - False, - ) - elif op_selection_yaml_path is not None: - selector = SelectiveBuilder.from_yaml_path(op_selection_yaml_path) - else: - selector = SelectiveBuilder.get_nop_selector() - - return selector - -def pre_group_native_functions( - native_functions: Sequence[NativeFunction]) -> Dict[FunctionSchema, Dict[SchemaKind, NativeFunction]]: - pre_grouped_native_functions: Dict[FunctionSchema, Dict[SchemaKind, NativeFunction]] = defaultdict(dict) - for f in native_functions: - d = pre_grouped_native_functions[f.func.signature()] - assert f.func.kind() not in d - d[f.func.kind()] = f - return pre_grouped_native_functions - -def get_grouped_native_functions( - native_functions: Sequence[NativeFunction]) -> Sequence[Union[NativeFunction, NativeFunctionsGroup]]: - def flatten_pre_group(d: Dict[SchemaKind, NativeFunction]) -> Sequence[Union[NativeFunction, NativeFunctionsGroup]]: - r = NativeFunctionsGroup.from_dict(d) - if r is None: - return list(d.values()) - else: - return [r] - - # TODO: how come ValuesView isn't a Sequence lol - pre_grouped_native_functions = pre_group_native_functions(native_functions) - return list(concatMap(flatten_pre_group, list(pre_grouped_native_functions.values()))) - -def gen_aggregated_headers( - *, - native_functions: Sequence[NativeFunction], - grouped_native_functions: Sequence[Union[NativeFunction, NativeFunctionsGroup]], - static_dispatch_idx: Optional[BackendIndex], - selector: SelectiveBuilder, - backend_indices: Dict[DispatchKey, BackendIndex], - cpu_fm: FileManager, - cuda_fm: FileManager, - functions_keys: Set[DispatchKey], - dispatch_keys: Sequence[DispatchKey], - rocm: bool, -) -> None: - # Buck doesn't support dynamic output files, so we aggregate all operator - # headers into a single file - structured_native_functions = [g for g in grouped_native_functions - if isinstance(g, NativeFunctionsGroup)] - cpu_fm.write('NativeMetaFunctions.h', lambda: { - 'NativeMetaFunctions_includes': [], - 'NativeMetaFunctions_declarations': list( - mapMaybe(compute_meta_function_declaration, structured_native_functions)), - }) - method_native_functions = [fn for fn in native_functions - if Variant.method in fn.variants] - non_method_native_functions = [fn for fn in native_functions - if fn not in method_native_functions] - cpu_fm.write('MethodOperators.h', lambda: { - 'MethodOperators_includes': [], - 
'MethodOperators_declarations': list(mapMaybe(ComputeOperators( - Target.DECLARATION), method_native_functions)), - }) - cpu_fm.write('Operators.h', lambda: { - 'Operators_includes': ['#include '], - 'Operators_declarations': list(mapMaybe(ComputeOperators( - Target.DECLARATION), non_method_native_functions)), - }) - cpu_fm.write('Functions.h', lambda: { - 'static_dispatch_extra_headers': static_dispatch_extra_headers(static_dispatch_idx), - 'Functions_includes': ['#include '], - 'Functions_declarations': list(mapMaybe(ComputeFunction( - static_dispatch_backend_index=static_dispatch_idx), native_functions)), - }) - cpu_fm.write('NativeFunctions.h', lambda: { - 'NativeFunctions_includes': ['#include '], - 'NativeFunctions_declarations': list(concatMap( - # Convert to a set first to remove duplicate kernel names. - # Backends are allowed to repeat kernel names; only generate the declaration once! - lambda f: list(OrderedDict.fromkeys(concatMap( - lambda backend_idx: - dest.compute_native_function_declaration(f, backend_idx), - backend_indices.values()))), - grouped_native_functions)), - }) - - for dispatch_key in dispatch_keys: - fm = cuda_fm if is_cuda_dispatch_key(dispatch_key) else cpu_fm - if dispatch_key in functions_keys: - if dispatch_key in static_dispatch_keys(static_dispatch_idx): - # See Note [Avoiding Include Cycles In Static Dispatch] - inl_headers = '' - else: - inl_headers = f'#include ' - - fm.write_with_template(f'{dispatch_key}Functions.h', 'DispatchKeyFunctions.h', lambda: { - 'dispatch_key': str(dispatch_key), - 'inline_headers_for_nonstatic_build': inl_headers, - }) - fm.write_with_template(f'{dispatch_key}Functions_inl.h', 'DispatchKeyFunctions_inl.h', lambda: { - 'DispatchKeyFunctions_inl_includes': [], - 'dispatch_namespace': dispatch_key.lower(), - 'dispatch_namespaced_declarations': list(concatMap( - dest.RegisterDispatchKey( - backend_indices[dispatch_key], - Target.NAMESPACED_DECLARATION, - selector, - rocm=rocm, - cpp_namespace='at::native', - class_method_name=None), - grouped_native_functions - )), - }) - - del fm - -def gen_per_operator_headers( - *, - native_functions: Sequence[NativeFunction], - grouped_native_functions: Sequence[Union[NativeFunction, NativeFunctionsGroup]], - static_dispatch_idx: Optional[BackendIndex], - selector: SelectiveBuilder, - backend_indices: Dict[DispatchKey, BackendIndex], - cpu_fm: FileManager, - cuda_fm: FileManager, - ops_fm: FileManager, - functions_keys: Set[DispatchKey], - dispatch_keys: Sequence[DispatchKey], - rocm: bool, -) -> None: - # For CMake builds, split operator declarations into separate headers in - # the ATen/ops folder to split up header dependencies - functions_by_root_name: Dict[str, List[NativeFunction]] = defaultdict(lambda: []) - for fn in native_functions: - functions_by_root_name[fn.root_name].append(fn) - - grouped_functions_by_root_name: Dict[str, List[Union[NativeFunction, NativeFunctionsGroup]]] = defaultdict(lambda: []) - for group in grouped_native_functions: - name = group.root_name - grouped_functions_by_root_name[name].append(group) - - for name, functions in functions_by_root_name.items(): - ops_fm.write_with_template( - f'{name}_ops.h', 'Operator.h', lambda: { - 'declarations': list(mapMaybe(ComputeOperators( - Target.DECLARATION), functions)), - }) - - ops_fm.write_with_template( - f'{name}.h', 'Function.h', lambda: { - 'static_dispatch_ops_headers': list(mapMaybe( - lambda fn: static_dispatch_ops_header(fn, backend_index=static_dispatch_idx), - functions)), - 'operator_includes': f'#include 
', - 'function_definitions': list(mapMaybe(ComputeFunction( - static_dispatch_backend_index=static_dispatch_idx), functions)), - }) - - grouped_functions = grouped_functions_by_root_name.get(name, []) - structured_functions = [fn for fn in grouped_functions - if isinstance(fn, NativeFunctionsGroup) and fn.structured] - is_structured = len(structured_functions) > 0 - - - if is_structured: - ops_fm.write_with_template( - f'{name}_meta.h', 'NativeMetaFunction.h', lambda: { - 'meta_function_declarations': list(mapMaybe( - compute_meta_function_declaration, structured_functions)), - }) - - - ops_fm.write_with_template( - f'{name}_native.h', 'NativeFunction.h', lambda: { - 'extra_includes': (f'#include ' - if is_structured else []), - 'native_function_declarations': list(concatMap( - # Convert to a set first to remove duplicate kernel names. - # Backends are allowed to repeat kernel names; only generate the declaration once! - lambda f: list(OrderedDict.fromkeys(concatMap( - lambda backend_idx: - dest.compute_native_function_declaration(f, backend_idx), - backend_indices.values()))), - grouped_functions)), - }) - - for category, suffix in [ - ('Functions', ''), - ('Operators', '_ops'), - ('NativeMetaFunctions', '_meta'), - ('NativeFunctions', '_native'), - ]: - cpu_fm.write(f'{category}.h', lambda: { - 'static_dispatch_extra_headers': [], - f'{category}_includes': [ - f'#include ' - for name in sorted(functions_by_root_name.keys()) - ], - f'{category}_declarations': [], - }) - - for dispatch_key in dispatch_keys: - if dispatch_key not in functions_keys: - continue - - dispatch_namespace = dispatch_key.lower() - dispatch_names = [] - - for name, functions in functions_by_root_name.items(): - grouped_functions = grouped_functions_by_root_name.get(name, []) - declarations = list(concatMap( - dest.RegisterDispatchKey( - backend_indices[dispatch_key], - Target.NAMESPACED_DECLARATION, - selector, - rocm=rocm, - cpp_namespace='at::native', - class_method_name=None), - grouped_functions - )) - - if len(declarations) == 0: - continue - - dispatch_names.append(name) - ops_fm.write_with_template( - f'{name}_{dispatch_namespace}_dispatch.h', - 'DispatchKeyFunction.h', lambda: { - 'dispatch_namespace': dispatch_namespace, - 'dispatch_namespaced_declarations': declarations, - }) - - fm = cuda_fm if is_cuda_dispatch_key(dispatch_key) else cpu_fm - if dispatch_key in static_dispatch_keys(static_dispatch_idx): - # See Note [Avoiding Include Cycles In Static Dispatch] - inl_headers = '' - else: - inl_headers = f'#include ' - - fm.write_with_template(f'{dispatch_key}Functions.h', 'DispatchKeyFunctions.h', lambda: { - 'dispatch_key': str(dispatch_key), - 'inline_headers_for_nonstatic_build': inl_headers, - }) - fm.write_with_template(f'{dispatch_key}Functions_inl.h', 'DispatchKeyFunctions_inl.h', lambda: { - 'dispatch_namespace': dispatch_namespace, - 'DispatchKeyFunctions_inl_includes': [ - f'#include ' - for name in sorted(dispatch_names) - ], - 'dispatch_namespaced_declarations': [], - }) - del fm - - cpu_fm.write('MethodOperators.h', lambda: { - 'MethodOperators_includes': sorted( - f'#include ' - for name, functions in functions_by_root_name.items() - if any(Variant.method in fn.variants for fn in functions) - ), - 'MethodOperators_declarations': [], - }) - -def gen_headers( - *, - native_functions: Sequence[NativeFunction], - grouped_native_functions: Sequence[Union[NativeFunction, NativeFunctionsGroup]], - static_dispatch_idx: Optional[BackendIndex], - selector: SelectiveBuilder, - backend_indices: 
Dict[DispatchKey, BackendIndex], - core_fm: FileManager, - cpu_fm: FileManager, - cuda_fm: FileManager, - ops_fm: FileManager, - dispatch_keys: Sequence[DispatchKey], - functions_keys: Set[DispatchKey], - rocm: bool, - per_operator_headers: bool, -) -> None: - if per_operator_headers: - gen_per_operator_headers( - native_functions=native_functions, - grouped_native_functions=grouped_native_functions, - static_dispatch_idx=static_dispatch_idx, - selector=selector, - backend_indices=backend_indices, - cpu_fm=cpu_fm, - cuda_fm=cuda_fm, - ops_fm=ops_fm, - dispatch_keys=dispatch_keys, - functions_keys=functions_keys, - rocm=rocm, - ) - else: - gen_aggregated_headers( - native_functions=native_functions, - grouped_native_functions=grouped_native_functions, - static_dispatch_idx=static_dispatch_idx, - selector=selector, - backend_indices=backend_indices, - cpu_fm=cpu_fm, - cuda_fm=cuda_fm, - dispatch_keys=dispatch_keys, - functions_keys=functions_keys, - rocm=rocm, - ) - - def static_dispatch_method_headers() -> List[str]: - return list(mapMaybe( - lambda fn: static_dispatch_ops_header(fn, backend_index=static_dispatch_idx), - [fn for fn in native_functions if Variant.method in fn.variants])) - - - core_fm.write('TensorBody.h', lambda: { - 'static_dispatch_ops_headers': ( - static_dispatch_method_headers() if per_operator_headers - else static_dispatch_extra_headers(static_dispatch_idx, skip_tensor_include=True)), - 'tensor_method_declarations': list(mapMaybe(ComputeTensorMethod( - target=Target.DECLARATION, static_dispatch_backend_index=static_dispatch_idx), native_functions)), - 'tensor_method_definitions': list(mapMaybe(ComputeTensorMethod( - target=Target.DEFINITION, static_dispatch_backend_index=static_dispatch_idx), native_functions)), - }) - - cpu_fm.write('RedispatchFunctions.h', lambda: { - 'function_redispatch_definitions': list(mapMaybe(ComputeRedispatchFunction(), native_functions)), - }) - - cpu_fm.write('RegistrationDeclarations.h', lambda: { - 'registration_declarations': [compute_registration_declarations(f, backend_indices) for f in native_functions], - }) - - cpu_fm.write('FunctionalInverses.h', lambda: { - 'view_inverse_declarations': list(mapMaybe(gen_functionalization_view_inverse_declaration, native_functions)) - }) - - - def gen_aten_interned_strings() -> Dict[str, str]: - attrs = set() # All function argument names - names = set() # All ATen function names - for func in native_functions: - names.add(str(func.func.name.name)) - # Some operators don't have a functional variant but we still create a - # symbol without the underscore - names.add(func.func.name.name.base) - - for arg in func.func.schema_order_arguments(): - attrs.add(arg.name) - - # These are keywords in C++, so aren't valid symbol names - # https://en.cppreference.com/w/cpp/language/operator_alternative - names -= set(['and', 'and_eq', 'bitand', 'bitor', 'compl', 'not', - 'not_eq', 'or', 'or_eq', 'xor', 'xor_eq']) - - return { - 'aten_symbols': ' \\\n'.join([ - f"_(aten, {name})" for name in sorted(names) - ]), - 'attr_symbols': ' \\\n'.join([ - f"_(attr, {name})" for name in sorted(attrs) - ]), - } - - core_fm.write('aten_interned_strings.h', gen_aten_interned_strings) - -def gen_source_files( - *, - native_functions: Sequence[NativeFunction], - grouped_native_functions: Sequence[Union[NativeFunction, NativeFunctionsGroup]], - static_dispatch_idx: Optional[BackendIndex], - selector: SelectiveBuilder, - backend_indices: Dict[DispatchKey, BackendIndex], - core_fm: FileManager, - cpu_fm: FileManager, - cuda_fm: 
FileManager, - dispatch_keys: Sequence[DispatchKey], - functions_keys: Set[DispatchKey], - rocm: bool, - force_schema_registration: bool, - per_operator_headers: bool, -) -> None: - extra_cuda_headers = '''\ -#include -#include -#include -#include ''' - if rocm: - extra_cuda_headers = '''\ -#include -#include -#include -#include ''' - - for dispatch_key in dispatch_keys: - fm = cuda_fm if is_cuda_dispatch_key(dispatch_key) else cpu_fm - - if per_operator_headers: - def operator_headers() -> List[str]: - headers = [] - for fn in native_functions: - is_registered = backend_index.has_kernel(fn) or ( - fn.structured and dispatch_key in - (DispatchKey.Meta, DispatchKey.CompositeExplicitAutograd)) - if not is_registered: - continue - - headers.append(f"#include ") - if dispatch_key == DispatchKey.CompositeExplicitAutograd: - headers.append(f"#include ") - if dispatch_key in functions_keys: - headers.append( - f"#include ") - - return sorted(set(headers)) - else: - def operator_headers() -> List[str]: - headers = ["#include "] - if dispatch_key == DispatchKey.CompositeExplicitAutograd: - headers.append("#include ") - if dispatch_key in functions_keys: - headers.append(f"#include ") - return headers - - backend_index = backend_indices[dispatch_key] - dispatch_namespace = str(dispatch_key).lower() - fm.write_with_template(f'Register{dispatch_key}.cpp', 'RegisterDispatchKey.cpp', lambda: { - 'extra_cuda_headers': extra_cuda_headers if is_cuda_dispatch_key(dispatch_key) else '', - 'external_backend_headers': '', - 'dispatch_headers': dest.gen_registration_headers(backend_index, per_operator_headers, rocm), - 'ops_headers': operator_headers(), - 'DispatchKey': dispatch_key, - 'dispatch_namespace': dispatch_key.lower(), - 'dispatch_helpers': dest.gen_registration_helpers(backend_index), - 'dispatch_namespaced_definitions': list(concatMap( - dest.RegisterDispatchKey( - backend_index, - Target.NAMESPACED_DEFINITION, - selector, - rocm=rocm, - cpp_namespace='at::native', - class_method_name=None), - grouped_native_functions - )), - 'dispatch_anonymous_definitions': list(concatMap( - dest.RegisterDispatchKey( - backend_index, - Target.ANONYMOUS_DEFINITION, - selector, - rocm=rocm, - cpp_namespace='at::native', - class_method_name=None), - grouped_native_functions - )), - 'dispatch_registrations': list(concatMap( - dest.RegisterDispatchKey( - backend_index, - Target.REGISTRATION, - selector, - rocm=rocm, - cpp_namespace='at::native', - class_method_name=None), - grouped_native_functions - )), - }) - - # BackendSelect is generated specially - def gen_backend_select() -> Dict[str, List[str]]: - relevant_fns = [fn for fn in native_functions if needs_backend_select(fn, selector)] - return { - 'ops_headers': [f'#include ' for fn in relevant_fns], - 'backend_select_method_definitions': - list(mapMaybe(ComputeBackendSelect(Target.DEFINITION, selector), relevant_fns)), - 'backend_select_function_registrations': - list(mapMaybe(ComputeBackendSelect(Target.REGISTRATION, selector), relevant_fns)), - } - cpu_fm.write('RegisterBackendSelect.cpp', gen_backend_select) - - schema_selector = selector - if force_schema_registration: - schema_selector = SelectiveBuilder.get_nop_selector() - cpu_fm.write('RegisterSchema.cpp', lambda: { - 'schema_registrations': list(mapMaybe(RegisterSchema(schema_selector), native_functions)), - }) - - def key_func(fn: Union[NativeFunction, NativeFunctionsGroup]) -> str: - return fn.root_name - - cpu_fm.write_sharded( - 'Operators.cpp', - native_functions, - key_fn=key_func, - 
env_callable=lambda fn: { - 'operator_headers': [f'#include '], - 'definitions': [ComputeOperators(Target.DEFINITION)(fn)]}, - num_shards=5, - sharded_keys={'operator_headers', 'definitions'} - ) - - cpu_fm.write('Functions.cpp', lambda: {}) - - core_fm.write('TensorMethods.cpp', lambda: {}) - - core_fm.write('ATenOpList.cpp', lambda: { - 'aten_ops': list(mapMaybe(compute_aten_op, native_functions)), - }) - - # We need to easily map from [inplace_op_name] -> [functional_op] for the functionalization pass, - # so here I generate a mapping from every operator name to its corresponding functional NativeFunction (if it exist). - pre_grouped_d: Dict[FunctionSchema, Dict[SchemaKind, NativeFunction]] = pre_group_native_functions(native_functions) - to_functional_op: Dict[OperatorName, Optional[NativeFunction]] = { - k: v for d in [ - {f.func.name: pre_grouped_d[func][SchemaKind.functional] - if SchemaKind.functional in pre_grouped_d[func].keys() else None - for f in pre_grouped_d[func].values()} - for func in pre_grouped_d.keys()] - for k, v in d.items() - } - - - def functionalization_env_callable( - g: Union[NativeFunction, NativeFunctionsGroup] - ) -> Dict[str, List[str]]: - functions = [g] if isinstance(g, NativeFunction) else list(g.functions()) - functions_needing_functionalization = [ - fn for fn in functions if needs_functionalization(selector, fn)] - return { - 'ops_headers': ([ - f"#include ", - f"#include ", - ] if functions_needing_functionalization else []), - 'func_definitions': list(mapMaybe( - lambda f: gen_functionalization_definition(selector, f, to_functional_op[f.func.name]), - functions_needing_functionalization)), - 'func_registrations': list(mapMaybe( - lambda f: gen_functionalization_registration( - selector, f, backend_indices[DispatchKey.CompositeImplicitAutograd]), - functions_needing_functionalization)), - } - - - cpu_fm.write_sharded( - 'RegisterFunctionalization.cpp', - grouped_native_functions, - key_fn=key_func, - env_callable=functionalization_env_callable, - num_shards=4, - sharded_keys={'ops_headers', 'func_definitions', 'func_registrations'} - ) - - -def gen_declarations_yaml( - cpu_fm: FileManager, - native_functions: Sequence[NativeFunction]) -> None: - cpu_fm.write('Declarations.yaml', lambda: - format_yaml([compute_declaration_yaml(f) for f in native_functions])) - -def main() -> None: - parser = argparse.ArgumentParser(description='Generate ATen source files') - parser.add_argument( - '-s', - '--source-path', - help='path to source directory for ATen', - default='aten/src/ATen') - parser.add_argument( - '-o', - '--output-dependencies', - help='output a list of dependencies into the given file and exit') - parser.add_argument( - '--dry-run', action='store_true', - help='run without writing any files (still updates outputs)') - parser.add_argument( - '--per-operator-headers', action='store_true', - help='generate separate headers per operator in ATen/ops') - parser.add_argument( - '-d', '--install_dir', help='output directory', - default='build/aten/src/ATen') - parser.add_argument( - '--rocm', - action='store_true', - help='reinterpret CUDA as ROCm/HIP and adjust filepaths accordingly') - # TODO: --op_registration_whitelist will be removed when all call-sites - # for gen.py are moved over to using the operator YAML file for mobile - # custom build. 
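The nested comprehension that builds to_functional_op above is dense. The same idea written as explicit loops over toy data (strings stand in for NativeFunction and OperatorName objects, and a minimal enum stands in for SchemaKind):

from enum import Enum
from typing import Dict, Optional

class SchemaKind(Enum):          # stand-in for tools.codegen.model.SchemaKind
    functional = 1
    inplace = 2
    out = 3

# Toy pre-grouping, keyed the way pre_group_native_functions keys its result
# (signature -> {SchemaKind: function}).
pre_grouped = {
    'foo(Tensor self) -> Tensor': {
        SchemaKind.functional: 'foo',
        SchemaKind.inplace: 'foo_',
        SchemaKind.out: 'foo.out',
    },
}

# Same idea as the dict comprehension above: every variant in a signature
# group maps to that group's functional variant, or None if there is none.
to_functional_op: Dict[str, Optional[str]] = {}
for kind_to_fn in pre_grouped.values():
    functional = kind_to_fn.get(SchemaKind.functional)
    for fn in kind_to_fn.values():
        to_functional_op[fn] = functional

print(to_functional_op)  # {'foo': 'foo', 'foo_': 'foo', 'foo.out': 'foo'}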
- parser.add_argument( - '--op_registration_whitelist', - nargs='*', - help='filter op registrations by the whitelist (if set); ' - 'each item is `namespace`::`operator name` without overload name; ' - 'e.g.: aten::empty aten::conv2d ...') - parser.add_argument( - '--op_selection_yaml_path', - help='Provide a path to the operator selection (for custom build) YAML ' - 'that contains the information about the set of selected operators ' - 'and their categories (training, ...). Each operator is either a ' - 'full operator name with overload or just a bare operator name. ' - 'The operator names also contain the namespace prefix (e.g. aten::)') - parser.add_argument( - '--backend_whitelist', - nargs='*', - help='filter dispatch backend by the whitelist (if set), ' - 'e.g.: CPU CUDA QuantizedCPU ...') - parser.add_argument( - '--static_dispatch_backend', - help='generate static dispatch code for the specific backend (if set)') - parser.add_argument( - '--force_schema_registration', - action='store_true', - help='force it to generate schema-only registrations for all ops, including' - 'those that are not listed on --op_registration_whitelist') - parser.add_argument( - '--generate', - type=str, - nargs='*', - choices=['headers', 'sources', 'declarations_yaml'], - default=['headers', 'sources', 'declarations_yaml'], - help='Generate only a subset of files') - options = parser.parse_args() - - selector = get_custom_build_selector( - options.op_registration_whitelist, - options.op_selection_yaml_path, - ) - - native_yaml_path = os.path.join(options.source_path, 'native/native_functions.yaml') - parsed_yaml = parse_native_yaml(native_yaml_path) - native_functions, backend_indices = parsed_yaml.native_functions, parsed_yaml.backend_indices - grouped_native_functions = get_grouped_native_functions(native_functions) - - template_dir = os.path.join(options.source_path, "templates") - - # NB: It is mandatory to NOT use os.path.join here, as the install directory - # will eventually be ingested by cmake, which does not respect Windows style - # path slashes. If you switch this to use os.path.join, you'll get an error - # like: - # - # Syntax error in cmake code when parsing string - # - # C:/Jenkins/workspace/pytorch-builds/pytorch-win-ws2016-cuda9-cudnn7-py3-build/build/aten/src/ATen\core/TensorMethods.h - # - # Invalid character escape '\c'. 
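The warning above about os.path.join is easy to reproduce: on Windows it inserts a backslash, which CMake later misreads as a character escape. ntpath (the Windows flavour of os.path) shows the difference from any platform:

import ntpath  # Windows implementation of os.path, importable everywhere

install_dir = 'build/aten/src/ATen'

# What os.path.join would produce on Windows: a mixed-separator path whose
# backslash CMake later treats as an invalid character escape.
print(ntpath.join(install_dir, 'core'))   # build/aten/src/ATen\core

# The plain f-string used by the deleted code keeps forward slashes.
print(f'{install_dir}/core')              # build/aten/src/ATen/core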
- core_install_dir = f'{options.install_dir}/core' - pathlib.Path(core_install_dir).mkdir(parents=True, exist_ok=True) - ops_install_dir = f'{options.install_dir}/ops' - pathlib.Path(ops_install_dir).mkdir(parents=True, exist_ok=True) - - def make_file_manager(install_dir: str) -> FileManager: - return FileManager(install_dir=install_dir, template_dir=template_dir, dry_run=options.dry_run) - - core_fm = make_file_manager(core_install_dir) - cpu_fm = make_file_manager(options.install_dir) - cuda_fm = make_file_manager(options.install_dir) - ops_fm = make_file_manager(ops_install_dir) - - extra_cuda_headers = '''\ -#include -#include -#include -#include ''' - if options.rocm: - extra_cuda_headers = '''\ -#include -#include -#include -#include ''' - - dispatch_keys = [ - DispatchKey.CPU, - DispatchKey.SparseCPU, - DispatchKey.SparseCsrCPU, - DispatchKey.MkldnnCPU, - DispatchKey.CUDA, - DispatchKey.SparseCUDA, - DispatchKey.SparseCsrCUDA, - DispatchKey.QuantizedCPU, - DispatchKey.QuantizedCUDA, - DispatchKey.CompositeImplicitAutograd, - DispatchKey.CompositeExplicitAutograd, - # Meta is a magic key: it is automatically generated for structured - # kernels - DispatchKey.Meta, - DispatchKey.ZeroTensor, - ] - # Only a limited set of dispatch keys get CPUFunctions.h headers generated - # for them; this is the set - functions_keys = { - DispatchKey.CPU, - DispatchKey.CUDA, - DispatchKey.CompositeImplicitAutograd, - DispatchKey.CompositeExplicitAutograd, - DispatchKey.Meta, - } - if options.backend_whitelist: - dispatch_keys = [k for k in dispatch_keys if is_generic_dispatch_key(k) or str(k) in options.backend_whitelist] - - static_dispatch_idx: Optional[BackendIndex] = None - if options.static_dispatch_backend: - static_dispatch_idx = backend_indices[DispatchKey.parse(options.static_dispatch_backend)] - - if 'sources' in options.generate: - gen_source_files( - native_functions=native_functions, - grouped_native_functions=grouped_native_functions, - static_dispatch_idx=static_dispatch_idx, - selector=selector, - backend_indices=backend_indices, - core_fm=core_fm, - cpu_fm=cpu_fm, - cuda_fm=cuda_fm, - dispatch_keys=dispatch_keys, - functions_keys=functions_keys, - rocm=options.rocm, - force_schema_registration=options.force_schema_registration, - per_operator_headers=options.per_operator_headers, - ) - - if 'headers' in options.generate: - gen_headers( - native_functions=native_functions, - grouped_native_functions=grouped_native_functions, - static_dispatch_idx=static_dispatch_idx, - selector=selector, - backend_indices=backend_indices, - core_fm=core_fm, - cpu_fm=cpu_fm, - cuda_fm=cuda_fm, - ops_fm=ops_fm, - dispatch_keys=dispatch_keys, - functions_keys=functions_keys, - rocm=options.rocm, - per_operator_headers=options.per_operator_headers, - ) - - if 'declarations_yaml' in options.generate: - gen_declarations_yaml( - native_functions=native_functions, - cpu_fm=cpu_fm) - - if options.output_dependencies: - depfile_path = pathlib.Path(options.output_dependencies).resolve() - depfile_name = depfile_path.name - depfile_stem = depfile_path.stem - - for fm, prefix in [ - (cpu_fm, ""), - (core_fm, "core_"), - (cuda_fm, "cuda_"), - (ops_fm, "ops_"), - ]: - varname = prefix + depfile_stem - path = depfile_path.parent / (prefix + depfile_name) - fm.write_outputs(varname, str(path)) - - -if __name__ == '__main__': - main() diff --git a/tools/codegen/gen_backend_stubs.py b/tools/codegen/gen_backend_stubs.py deleted file mode 100644 index 7837a41cab6e..000000000000 --- a/tools/codegen/gen_backend_stubs.py +++ 
/dev/null @@ -1,325 +0,0 @@ -import pathlib -import argparse -import os -import yaml -import re -from collections import namedtuple, Counter, defaultdict -from typing import List, Dict, Union, Sequence, Optional -from tools.codegen.gen import get_grouped_native_functions, parse_native_yaml -from tools.codegen.model import (BackendIndex, BackendMetadata, DispatchKey, - NativeFunction, NativeFunctionsGroup, OperatorName) -from tools.codegen.selective_build.selector import SelectiveBuilder -from tools.codegen.utils import Target, concatMap, context, YamlLoader, FileManager -from tools.codegen.context import native_function_manager -import tools.codegen.dest as dest -import tools.codegen.api.dispatcher as dispatcher -from tools.codegen.api.types import DispatcherSignature - - -# Parses the external backend's yaml, and adds a new BackendIndex for the backend's dispatch key. -# Returns a Tuple of (backend_key, autograd_key, cpp_namespace, updated BackendIndex mapping) -ParsedExternalYaml = namedtuple('ParsedExternalYaml', [ - 'backend_key', 'autograd_key', 'cpp_namespace', 'backend_indices']) -def parse_backend_yaml( - backend_yaml_path: str, - grouped_native_functions: Sequence[Union[NativeFunction, NativeFunctionsGroup]], - backend_indices: Dict[DispatchKey, BackendIndex] -) -> ParsedExternalYaml: - - native_functions_map: Dict[OperatorName, NativeFunction] = { - f.func.name: f - for f in concatMap(lambda f: [f] if isinstance(f, NativeFunction) else list(f.functions()), grouped_native_functions) - } - - with open(backend_yaml_path, 'r') as f: - yaml_values = yaml.load(f, Loader=YamlLoader) - assert isinstance(yaml_values, dict) - - valid_keys = ['backend', 'cpp_namespace', 'extra_headers', 'supported', 'autograd', 'full_codegen'] - - backend = yaml_values.pop('backend', None) - assert backend is not None, 'You must provide a value for "backend"' - - cpp_namespace = yaml_values.pop('cpp_namespace', None) - assert cpp_namespace is not None, 'You must provide a value for "cpp_namespace"' - - # Mostly just defaulting to false to stick with LazyTensor convention. - use_out_as_primary = yaml_values.pop('use_out_as_primary', False) - assert isinstance(use_out_as_primary, bool), \ - f'You must provide either True or False for use_out_as_primary. Provided: {use_out_as_primary}' - - use_device_guard = yaml_values.pop('device_guard', False) - assert isinstance(use_device_guard, bool), \ - f'You must provide either True or False for device_guard. Provided: {use_device_guard}' - - supported = yaml_values.pop('supported', []) - if supported is None: - supported = [] # Allow an empty list of supported ops - assert isinstance(supported, list), f'expected "supported" to be a list, but got: {supported} (of type {type(supported)})' - - supported_autograd = yaml_values.pop('autograd', []) - assert isinstance(supported_autograd, list), f'expected "autograd" to be a list, but got: {supported_autograd}' - - # full_codegen is ignored by parse_backend_yaml, and re-parsed in gen_lazy_tensor.py - full_codegen = yaml_values.pop('full_codegen', []) - supported.extend(full_codegen) - - assert len(yaml_values.keys()) == 0, \ - f'{backend_yaml_path} contains unexpected keys: {", ".join(yaml_values.keys())}. 
\ -Only the following keys are supported: {", ".join(valid_keys)}' - - def create_backend_index( - backend_ops: List[str], - dispatch_key: DispatchKey, - *, - use_out_as_primary: bool, - use_device_guard: bool - ) -> BackendIndex: - metadata: Dict[OperatorName, BackendMetadata] = {} - for op in backend_ops: - op_name = OperatorName.parse(op) - assert op_name in native_functions_map, f"Found an invalid operator name: {op_name}" - # See Note [External Backends Follow Dispatcher API] - kernel_name = dispatcher.name(native_functions_map[op_name].func) - # TODO: allow structured external backends later. - m = BackendMetadata(kernel=kernel_name, structured=False) - metadata[op_name] = m - return BackendIndex( - dispatch_key=dispatch_key, - use_out_as_primary=use_out_as_primary, - external=True, - device_guard=use_device_guard, - index=metadata) - - backend_key: Optional[DispatchKey] = None - if len(supported) > 0: - with context(lambda: f'The provided value for "backend" must be a valid DispatchKey, but got {backend}.'): - backend_key = DispatchKey.parse(backend) - - backend_idx = create_backend_index( - supported, backend_key, use_out_as_primary=use_out_as_primary, use_device_guard=use_device_guard) - assert backend_key not in backend_indices - backend_indices[backend_key] = backend_idx - - autograd_key: Optional[DispatchKey] = None - if len(supported_autograd) > 0: - with context(lambda: f'The "autograd" key was specified, which indicates that you would like to override \ -the behavior of autograd for some operators on your backend. However "Autograd{backend}" is not a valid DispatchKey.'): - autograd_key = DispatchKey.parse(f'Autograd{backend}') - - autograd_idx = create_backend_index( - supported_autograd, autograd_key, use_out_as_primary=use_out_as_primary, use_device_guard=use_device_guard) - assert autograd_key not in backend_indices - backend_indices[autograd_key] = autograd_idx - - for g in grouped_native_functions: - if isinstance(g, NativeFunction): - forward_kernels = [] if backend_key is None else \ - [m for m in [backend_indices[backend_key].get_kernel(g)] if m is not None] - backward_kernels = [] if autograd_key is None else \ - [m for m in [backend_indices[autograd_key].get_kernel(g)] if m is not None] - else: - forward_kernels = [] if backend_key is None else [m for m in [ - backend_indices[backend_key].get_kernel(f) for f in g.functions()] - if m is not None] - backward_kernels = [] if autograd_key is None else [m for m in [ - backend_indices[autograd_key].get_kernel(f) for f in g.functions()] - if m is not None] - - forward_kernels = [f for f in forward_kernels if f is not None] - backward_kernels = [f for f in backward_kernels if f is not None] - assert len(forward_kernels) == 0 or len(backward_kernels) == 0, \ - f'Currently, all variants of an op must either be registered to a backend key, or to a backend\'s \ -autograd key. They cannot be mix and matched. If this is something you need, feel free to create an issue! \ -{forward_kernels[0].kernel} is listed under "supported", but {backward_kernels[0].kernel} is listed under "autograd".' 
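For orientation, parse_backend_yaml above only accepts the keys in valid_keys ('backend', 'cpp_namespace', 'extra_headers', 'supported', 'autograd', 'full_codegen'). A hypothetical minimal input, embedded here as a string and parsed with PyYAML rather than read from any real backend's file:

import yaml

# Hypothetical external-backend YAML; key names match valid_keys, the
# operator names are just examples.
example_backend_yaml = """
backend: XLA
cpp_namespace: torch_xla
supported:
  - abs
  - add.Tensor
autograd:
  - max_pool2d
"""

parsed = yaml.safe_load(example_backend_yaml)
print(parsed['backend'])    # XLA
print(parsed['supported'])  # ['abs', 'add.Tensor']
print(parsed['autograd'])   # ['max_pool2d']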
- - return ParsedExternalYaml(backend_key, autograd_key, cpp_namespace, backend_indices) - -def error_on_missing_kernels( - native_functions: Sequence[NativeFunction], - backend_indices: Dict[DispatchKey, BackendIndex], - backend_key: DispatchKey, - autograd_key: Optional[DispatchKey], - kernel_defn_file_path: str, - full_codegen: Optional[List[OperatorName]] = None, -) -> None: - try: - with open(kernel_defn_file_path, 'r') as f: - backend_defns = f.read() - except IOError: - raise AssertionError(f'Unable to read from the specified impl_path file: {kernel_defn_file_path}') - - if full_codegen is None: - full_codegen = [] - - class_name: Optional[str] = backend_indices[backend_key].native_function_class_name() - assert class_name is not None - - expected_backend_op_names: List[OperatorName] = \ - list(backend_indices[backend_key].index.keys()) + \ - [] if autograd_key is None else list(backend_indices[autograd_key].index.keys()) - expected_backend_native_funcs: List[NativeFunction] = [ - f for f in native_functions if f.func.name in expected_backend_op_names and f.func.name not in full_codegen] - expected_backend_kernel_name_counts: Dict[str, List[NativeFunction]] = defaultdict(list) - for native_f in expected_backend_native_funcs: - expected_backend_kernel_name_counts[dispatcher.name(native_f.func)].append(native_f) - - kernel_defn_regex = rf'{class_name}::([\w\d]*)\([^\)]*\)\s*{{' - actual_backend_kernel_name_counts = Counter(re.findall(kernel_defn_regex, backend_defns)) - - missing_kernels_err_msg = "" - for expected_name, funcs in expected_backend_kernel_name_counts.items(): - expected_overload_count = len(funcs) - actual_overload_count = actual_backend_kernel_name_counts[expected_name] - if expected_overload_count != actual_overload_count: - def create_decl(f: NativeFunction) -> str: - with native_function_manager(f): - return DispatcherSignature.from_schema(f.func).decl() - expected_schemas_str = '\n'.join([create_decl(f) for f in funcs]) - missing_kernels_err_msg += f""" -{class_name} is missing a kernel definition for {expected_name}. We found {actual_overload_count} kernel(s) with that name, -but expected {expected_overload_count} kernel(s). The expected function schemas for the missing operator are: -{expected_schemas_str} - -""" - assert missing_kernels_err_msg == "", missing_kernels_err_msg - -def main() -> None: - parser = argparse.ArgumentParser(description='Generate backend stub files') - parser.add_argument( - '-s', - '--source_yaml', - help='path to source yaml file containing operator external definitions') - parser.add_argument( - '-o', '--output_dir', help='output directory') - parser.add_argument( - '--dry_run', type=bool, default=False, help='output directory') - parser.add_argument( - '--impl_path', type=str, default=None, help='path to the source C++ file containing kernel definitions') - options = parser.parse_args() - - run(options.source_yaml, options.output_dir, options.dry_run, options.impl_path) - - -def gen_dispatchkey_nativefunc_headers( - fm: FileManager, - class_name: str, - cpp_namespace: str, - backend_indices: Dict[DispatchKey, BackendIndex], - grouped_native_functions: Sequence[Union[NativeFunction, NativeFunctionsGroup]], - backend_dispatch_key: DispatchKey, - autograd_dispatch_key: Optional[DispatchKey]) -> None: - assert class_name is not None - generated_comment = 'Autogenerated file by gen_backend_stubs.py. Do not edit directly!' - - # Convert to a set first to remove duplicate kernel names. 
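error_on_missing_kernels above counts kernel definitions by running a regex over the backend's C++ implementation file. A standalone run of that same pattern against a toy source string (the class and kernel names here are made up):

import re
from collections import Counter

class_name = 'XLANativeFunctions'   # hypothetical backend class
backend_defns = """
at::Tensor XLANativeFunctions::abs(const at::Tensor& self) {
  // ...
}
at::Tensor XLANativeFunctions::add(const at::Tensor& self,
                                   const at::Tensor& other) {
  // ...
}
"""

# Same pattern as the deleted code: ClassName::kernel_name(args) {
kernel_defn_regex = rf'{class_name}::([\w\d]*)\([^\)]*\)\s*{{'
print(Counter(re.findall(kernel_defn_regex, backend_defns)))
# Counter({'abs': 1, 'add': 1})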
- # Backends are allowed to repeat kernel names; only generate the declaration once! - # Sort for deterministic output. - backend_declarations = list(sorted(set(concatMap( - lambda f: dest.compute_native_function_declaration(f, backend_indices[backend_dispatch_key]), - grouped_native_functions)))) - autograd_declarations = list(sorted(set(concatMap( - lambda f: [] if autograd_dispatch_key is None else - dest.compute_native_function_declaration(f, backend_indices[autograd_dispatch_key]), - grouped_native_functions)))) - - fm.write_with_template(f'{backend_dispatch_key}NativeFunctions.h', 'DispatchKeyNativeFunctions.h', lambda: { - 'generated_comment': generated_comment, - 'cpp_namespace': cpp_namespace, - 'class_name': class_name, - 'dispatch_declarations': backend_declarations + autograd_declarations, - }) - - -def gen_dispatcher_registrations( - fm: FileManager, - output_dir: str, - cpp_namespace: str, - backend_indices: Dict[DispatchKey, BackendIndex], - grouped_native_functions: Sequence[Union[NativeFunction, NativeFunctionsGroup]], - backend_dispatch_key: DispatchKey, - dispatch_key: DispatchKey, - selector: 'SelectiveBuilder') -> None: - backend_index = backend_indices[dispatch_key] - fm.write_with_template(f'Register{dispatch_key}.cpp', 'RegisterDispatchKey.cpp', lambda: { - 'extra_cuda_headers': '', - 'external_backend_headers': f'#include "{output_dir}/{backend_dispatch_key}NativeFunctions.h"', - 'ops_headers': '#include ', - 'DispatchKey': dispatch_key, - 'dispatch_namespace': dispatch_key.lower(), - 'dispatch_headers': dest.gen_registration_headers(backend_index, per_operator_headers=False, rocm=False), - 'dispatch_helpers': dest.gen_registration_helpers(backend_index), - 'dispatch_namespaced_definitions': list(concatMap( - dest.RegisterDispatchKey( - backend_index, - Target.NAMESPACED_DEFINITION, - selector, - rocm=False, - cpp_namespace=cpp_namespace, - class_method_name=f'{backend_dispatch_key}NativeFunctions'), - grouped_native_functions - )), - 'dispatch_anonymous_definitions': list(concatMap( - dest.RegisterDispatchKey( - backend_index, - Target.ANONYMOUS_DEFINITION, - selector, - rocm=False, - cpp_namespace=cpp_namespace, - class_method_name=f'{backend_dispatch_key}NativeFunctions'), - grouped_native_functions - )), - 'dispatch_registrations': list(concatMap( - dest.RegisterDispatchKey( - backend_index, - Target.REGISTRATION, - selector, - rocm=False, - cpp_namespace=cpp_namespace, - class_method_name=f'{dispatch_key}NativeFunctions'), - grouped_native_functions - )), - }) - -def run(source_yaml: str, output_dir: str, dry_run: bool, impl_path: Optional[str] = None) -> None: - - # Assumes that this file lives at PYTORCH_ROOT/tools/codegen/gen_backend_stubs.py - pytorch_root = pathlib.Path(__file__).parent.parent.parent.absolute() - template_dir = os.path.join(pytorch_root, "aten/src/ATen/templates") - - def make_file_manager(install_dir: str) -> FileManager: - return FileManager(install_dir=install_dir, template_dir=template_dir, dry_run=dry_run) - - fm = make_file_manager(output_dir) - - native_yaml_path = os.path.join(pytorch_root, 'aten/src/ATen/native/native_functions.yaml') - parsed_yaml = parse_native_yaml(native_yaml_path) - native_functions, backend_indices = parsed_yaml.native_functions, parsed_yaml.backend_indices - grouped_native_functions = get_grouped_native_functions(native_functions) - parsed_backend_yaml = parse_backend_yaml(source_yaml, grouped_native_functions, backend_indices) - backend_key = parsed_backend_yaml.backend_key - autograd_key = 
parsed_backend_yaml.autograd_key - cpp_namespace = parsed_backend_yaml.cpp_namespace - backend_indices = parsed_backend_yaml.backend_indices - - selector = SelectiveBuilder.get_nop_selector() - - - if backend_key is None: - # This could be useful if a backend wants to quickly set up a noop yaml file but doesn't have any kernels ready yet. - return - - class_name = backend_indices[backend_key].native_function_class_name() - - if impl_path is not None: - error_on_missing_kernels(native_functions, backend_indices, backend_key, autograd_key, impl_path) - - - gen_dispatchkey_nativefunc_headers(fm, class_name, cpp_namespace, backend_indices, - grouped_native_functions, backend_key, autograd_key) - - for dispatch_key in [backend_key] if autograd_key is None else [backend_key, autograd_key]: - gen_dispatcher_registrations(fm, output_dir, cpp_namespace, backend_indices, grouped_native_functions, - backend_key, dispatch_key, selector) -if __name__ == '__main__': - main() diff --git a/tools/codegen/gen_functionalization_type.py b/tools/codegen/gen_functionalization_type.py deleted file mode 100644 index 6666a493be74..000000000000 --- a/tools/codegen/gen_functionalization_type.py +++ /dev/null @@ -1,365 +0,0 @@ -from tools.codegen.api import cpp -from tools.codegen.api.types import ( - DispatcherSignature, Binding, FunctionalizationLambda, ViewInverseSignature -) -from tools.codegen.api.translate import translate -from tools.codegen.context import with_native_function -from tools.codegen.model import ( - Argument, NativeFunction, SchemaKind, BackendIndex, - Tag, FunctionSchema, SelfArgument, TensorOptionsArguments, BaseType, BaseTy -) -from tools.codegen.selective_build.selector import SelectiveBuilder -from typing import List, Optional, Union, Tuple -from tools.codegen.utils import mapMaybe - -def modifies_arguments(f: NativeFunction) -> bool: - return f.func.kind() in [SchemaKind.inplace, SchemaKind.out] - -# This function constructs the return statement for the kernels that contain mutations -# It mostly just needs to special case multi-output returns to wrap the result in a tuple -def return_str(f: NativeFunction) -> str: - if len(f.func.arguments.out) != 0: - if len(f.func.arguments.out) > 1: - return_names = ', '.join(a.name for a in f.func.arguments.out) - return f'return {DispatcherSignature.from_schema(f.func).returns_type().cpp_type()}({return_names});' - else: - return f'return {f.func.arguments.out[0].name}' - if f.func.arguments.self_arg is not None: - return f'return {f.func.arguments.self_arg.argument.name}' - return '' - -def wrapper_name(func: FunctionSchema) -> str: - if func.name.overload_name: - return f'{cpp.name(func)}_{func.name.overload_name}' - else: - return cpp.name(func) - -def is_tensor_like(a: Union[Argument, TensorOptionsArguments, SelfArgument]) -> bool: - return isinstance(a, SelfArgument) or (isinstance(a, Argument) and a.type.is_tensor_like()) - -# unwraps all tensor-like arguments, returning: -# (1) a string containing all of the logic that does the unwrapping -# (2) a context, to be used by translate(), with all of the relevant bindings. -def unwrap_tensor_args(sig: DispatcherSignature) -> Tuple[str, List[Binding]]: - context: List[Binding] = [] - unwrapped_tensor_args: List[str] = [] - for arg in sig.arguments(): - if is_tensor_like(arg.argument): - # for tensor inputs, we want to unwrap them before passing them into the redispatch calls. 
- unwrapped_name = f'{arg.name}_' - unwrapped_tensor_args.append( - f'auto {unwrapped_name} = at::functionalization::impl::from_functional_tensor({arg.name});') - context.append(arg.with_name(unwrapped_name)) - else: - # for non-tensor inputs, we want to pass them directly into the redispatch calls. - context.append(arg) - unwrap_tensor_args_str = '\n '.join(unwrapped_tensor_args) - return unwrap_tensor_args_str, context - -# converts all tensor-like arguments to meta tensors, which are used to compute stride info. Returns: -# (1) a string containing all of the logic that does the conversions. -# (2) a context, to be used by translate(), with all of the relevant bindings. -def convert_to_meta_tensors(sig: DispatcherSignature) -> Tuple[str, List[Binding]]: - context: List[Binding] = [] - unwrapped_tensor_args: List[str] = [] - for arg in sig.arguments(): - if is_tensor_like(arg.argument): - # for tensor inputs, we want to unwrap them before passing them into the redispatch calls. - # for tensor inputs, we want to unwrap them before passing them into the redispatch calls. - a_ = arg.name - unwrapped_name = f'{arg.name}_meta' - unwrapped_tensor_args.append( - f"auto {unwrapped_name} = at::native::empty_strided_meta({a_}.sizes(), {a_}.strides(), \ -/*dtype=*/c10::make_optional({a_}.scalar_type()), /*layout=*/c10::make_optional({a_}.layout()), \ -/*device=*/c10::make_optional(c10::Device(kMeta)), /*pin_memory=*/c10::nullopt);" - ) - context.append(arg.with_name(unwrapped_name)) - else: - # for non-tensor inputs, we want to pass them directly into the redispatch calls. - context.append(arg) - unwrap_tensor_args_str = '\n '.join(unwrapped_tensor_args) - return unwrap_tensor_args_str, context - -# The functionalization codegen currently expects view op schemas to have this form: -# foo(Tensor(a), ...) -> Tensor(a) (e.g. transpose) -# foo(Tensor(a!), ...) -> Tensor(a!) (e.g. transpose_) -def assert_view_op_properties(func: FunctionSchema) -> None: - def is_alias(a: Argument) -> bool: - return a.annotation is not None - - args = func.arguments.flat_non_out - # The first argument is a tensor with an alias semantics (annotations) - assert len(args) > 0 and args[0].type == BaseType(BaseTy.Tensor), \ - f"""In the functionalization codegen, we expect the first argument of every view operator to be a tensor, -but found an argument of type {str(args[0].type)} for operator: {str(func.name)}.""" - # No other arguments have aliasing semantics - assert is_alias(args[0]) and not any(is_alias(a) for a in args[1:]), \ - """In the functionalization codegen, we expect the first argument of every view operator to alias the output. -View operators with multiple aliasing inputs aren't supported yet. Found an operator that doesn't satisfy this constraint""" - -# Generates the Functionalization kernel for: -# - ops that create aliases (e.g. transpose()) -# - ops that are views AND mutations (e.g. transpose_()) -def emit_view_functionalization_body( - f: NativeFunction, - functional_op: NativeFunction -) -> str: - # view op case - assert f.is_view_op - - if f.tag is Tag.inplace_view: - # This op is both an inplace op AND a view op. - # See Note [Functionalization Pass - Inplace View Ops] for details. - # I currently have the view meta call into the out-of-place variant of the view, to avoid - # having to define an extra ~20 inplace {view}_inverse_ functions. - # Most view ops don't have NativeFunctionGroup's both, because we don't define out= variants for view ops. 
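assert_view_op_properties above encodes the shape every view-op schema must have: the first argument is the aliased Tensor, and no other argument carries an alias annotation. A toy version of that check over simplified (type, has_annotation) pairs, using transpose's schema as the example:

from typing import List, Tuple

# Simplified stand-in for the real Argument objects:
# transpose(Tensor(a) self, int dim0, int dim1) -> Tensor(a)
transpose_args: List[Tuple[str, bool]] = [
    ('Tensor', True),   # Tensor(a) self -- aliased
    ('int', False),     # dim0
    ('int', False),     # dim1
]

def check_view_op_properties(args: List[Tuple[str, bool]]) -> None:
    # First argument must be a Tensor with an alias annotation ...
    assert args and args[0][0] == 'Tensor' and args[0][1]
    # ... and no other argument may alias anything.
    assert not any(is_alias for _, is_alias in args[1:])

check_view_op_properties(transpose_args)   # passes for transpose-style schemas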
- # I'm assuming that every inplace-view op has a corresponding out-of-place view op, - # with the same name but the trailing underscore removed. - # This is currently asserted at parse time in gen.py (see error_check_native_functions). - assert f.func.kind() is SchemaKind.inplace - # Requirement: Every inplace_view op needs to have a corresponding functional view op, which we paired together beforehand. - assert functional_op is not None - api_name = functional_op.func.name.unambiguous_name() - call_sig = DispatcherSignature.from_schema(functional_op.func) - else: - api_name = f.func.name.unambiguous_name() - call_sig = DispatcherSignature.from_schema(f.func) - - dispatcher_sig = DispatcherSignature.from_schema(f.func) - assert_view_op_properties(f.func) - view_tensor_name = dispatcher_sig.arguments()[0].name - - keyset = 'dispatchKeySet & c10::after_func_keyset' - return_type = dispatcher_sig.returns_type().remove_const_ref().cpp_type() - - unwrap_tensor_args_str, unwrapped_args_ctx = unwrap_tensor_args(dispatcher_sig) - view_redispatch_args = [keyset] + [e.expr for e in translate(unwrapped_args_ctx, call_sig.arguments(), method=False)] - - forward_lambda = FunctionalizationLambda.from_func(f, functional_op=functional_op, is_reverse=False) - reverse_lambda = FunctionalizationLambda.from_func(f, functional_op=functional_op, is_reverse=True) - - # The meta API call should use the same arguments, but convert all tensors to meta tensors first. - meta_conversion_str, meta_call_ctx = convert_to_meta_tensors(dispatcher_sig) - meta_call_args = [e.expr for e in translate(meta_call_ctx, call_sig.arguments(), method=False)] - - if f.tag is Tag.inplace_view: - # See Note [Functionalization Pass - Inplace View Ops] for more details - return f""" - at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta( - {forward_lambda.decl()} {{ - return {forward_lambda.inner_call()} - }}, - {reverse_lambda.decl()} {{ - return {reverse_lambda.inner_call()} - }} - ); - at::functionalization::impl::mutate_view_meta({view_tensor_name}, view_meta); - {unwrap_tensor_args_str} - {return_type} reference_tensor_output; - {{ - at::AutoDispatchSkipFunctionalize guard; - {meta_conversion_str} - reference_tensor_output = at::_ops::{api_name}::call({', '.join(meta_call_args)}); - }} - // See Note [Propagating strides in the functionalization pass] - at::functionalization::impl::set_sizes_strides_offset({view_tensor_name}, reference_tensor_output); - return {view_tensor_name}; -""" - - else: - return f""" - {unwrap_tensor_args_str} - {return_type} tmp_output; - {return_type} reference_tensor_output; - {{ - at::AutoDispatchSkipFunctionalize guard; - {meta_conversion_str} - reference_tensor_output = at::_ops::{api_name}::call({', '.join(meta_call_args)}); - tmp_output = at::_ops::{api_name}::redispatch({', '.join(view_redispatch_args)}); - // I'm fusing the [alias removal], [mutation removal], [add views back] passes together. - // Later, we'll want to turn them into separate passes (since e.g. vulkan only cares about alias removal). 
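The reference_tensor_output machinery above works because meta tensors carry sizes and strides but no storage, so the meta kernel can report what strides the real view would have produced. A quick eager-mode illustration (this assumes a torch build with meta-device support and is not part of the generated kernel):

import torch

# A meta tensor has shape/stride/dtype metadata but no data to read.
x = torch.empty_strided((2, 3), (3, 1), device='meta')
y = x.transpose(0, 1)        # runs the meta kernel only

print(y.shape)     # torch.Size([3, 2])
print(y.stride())  # (1, 3)
# y holds no values; the functionalization pass only copies its sizes and
# strides back onto the real output via set_sizes_strides_offset above.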
- }} - at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta( - {forward_lambda.decl()} {{ - return {forward_lambda.inner_call()} - }}, - {reverse_lambda.decl()} {{ - return {reverse_lambda.inner_call()} - }} - ); - auto out = at::functionalization::impl::create_functional_tensor_with_view_meta(tmp_output, {view_tensor_name}, view_meta); - // See Note [Propagating strides in the functionalization pass] - at::functionalization::impl::set_sizes_strides_offset(out, reference_tensor_output); - return out; -""" - -# Generates the Functionalization kernel for inplace ops -def emit_inplace_functionalization_body( - f: NativeFunction, - functional_op: Optional[NativeFunction] -) -> str: - # mutation case - assert(modifies_arguments(f)) - - dispatcher_sig = DispatcherSignature.from_schema(f.func) - - keyset = 'dispatchKeySet & c10::after_func_keyset' - return_type = dispatcher_sig.returns_type().remove_const_ref().cpp_type() - - unwrap_tensor_args_str, unwrapped_args_ctx = unwrap_tensor_args(dispatcher_sig) - - maybe_return = '' if len(f.func.returns) == 0 else 'return ' - sync_tensor_args = '\n '.join(mapMaybe( - lambda arg: f'at::functionalization::impl::sync({arg.name});' - if arg.type.is_tensor_like() else None, - f.func.arguments.flat_all)) - - # Note [functionalizating copy_() and not preserving strides] - # copy_() can't be functionalized, since there doesn't exist an out-of-place variant. - # We could add one, but that would be sub-optimal for functorch: copy() would need to allocate a fresh tensor. - # This may seem like a large hack for one optimization, but copy_() is one of the most common inplace operators. - # Instead, we can replace `self.copy_(src)` with `src.to(self).expand_as(self)`. - # This maintains the exact same semantics, EXCEPT that we don't preserve the strides from `self`. - # This seems like a reasonable tradeoff, for a few reasons: - # - mutation removal is only used by functorch, and not by Vulkan or XLA. Functorch already doesn't preserve strides. - # - There are actually a few other places where the functionalization pass currently doesn't support strides: - # calls to slice/diagonal_scatter don't currently preserve the strides of their inputs (but maybe we should fix this). - if str(f.func.name) == 'copy_': - exprs = [keyset] + [a.name for a in unwrapped_args_ctx] - functional_call_str = f"""\ - auto tmp_intermediate = at::_ops::to_other::redispatch({keyset}, src_, self_, non_blocking, false, c10::nullopt); - tmp_output = at::_ops::expand_as::redispatch({keyset}, tmp_intermediate, self_);""" - elif functional_op is None: - # We can't functionalize this inplace op, since we don't know what the corresponding functional op is. - inplace_exprs = [keyset] + [e.expr for e in translate(unwrapped_args_ctx, dispatcher_sig.arguments(), method=False)] - warn_str = "Note: the functionalization pass encountered an operator ({}) that it could not functionalize, \ -because it couldn't find an out-of-place equivalent of the operator to call. \ -Instead, it's calling the inplace/view operator directly. \ -If this causes problems in your program, consider upstreaming the out-of-place op to PyTorch.".format(str(f.func.name)) - - return f""" - if (c10::impl::tls_local_dispatch_key_set().included_.has(c10::DispatchKey::Functionalize)) {{ - TORCH_WARN("{warn_str}"); - }} - {sync_tensor_args} - {unwrap_tensor_args_str} - at::AutoDispatchSkipFunctionalize guard; - // Redispatch as normally otherwise, since XLA has its own lowerings for special inplace ops. 
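The copy_() note above claims that self.copy_(src) and src.to(self).expand_as(self) produce the same values while giving up self's strides. A small eager-mode check of that claim, written against public torch APIs as an illustration rather than the generated code path:

import torch

self_t = torch.zeros(2, 3, dtype=torch.float64)
src = torch.arange(3, dtype=torch.float32)   # different dtype, broadcastable shape

# Reference: the in-place copy broadcasts src and converts its dtype.
expected = self_t.clone()
expected.copy_(src)

# Functionalized form from the note: to() matches dtype/device, expand_as broadcasts.
functional = src.to(self_t).expand_as(self_t)

print(torch.equal(expected, functional))        # True -- values match
print(expected.stride(), functional.stride())   # (3, 1) (0, 1) -- strides differ,
# which is exactly the trade-off the note accepts.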
- {maybe_return}at::_ops::{f.func.name.unambiguous_name()}::redispatch({', '.join(inplace_exprs)}); -""" - else: - # call the out-of-place variant of the op - functional_sig = DispatcherSignature.from_schema(functional_op.func) - functional_exprs = [keyset] + [e.expr for e in translate(unwrapped_args_ctx, functional_sig.arguments(), method=False)] - functional_call_str = \ - f"tmp_output = at::_ops::{functional_op.func.name.unambiguous_name()}::redispatch({', '.join(functional_exprs)});" - - mutable_input_post_processing = '\n'.join([ - f""" - auto {a.name}_functional = at::functionalization::impl::unsafeGetFunctionalWrapper({a.name}); - {a.name}_functional->replace_(tmp_output); - {a.name}_functional->commit_update();""" - for a in f.func.arguments.flat_non_out - if a.annotation and a.annotation.is_write and a.type.is_tensor_like()]) - - return f""" - {sync_tensor_args} - {unwrap_tensor_args_str} - {return_type} tmp_output; - {{ - at::AutoDispatchSkipFunctionalize guard; - // The functionalization pass explicitly doesn't pass out= parameters to the redispatch - {functional_call_str} - }} - {mutable_input_post_processing} - {return_str(f)};""" - - -def emit_declaration_for_noncomposite_views(f: NativeFunction) -> str: - # For every view op, we need a corresponding "inverse view" function. - # This generates the declarations so we get a good compiler error when someone adds a new view. - view_inverse_sig = ViewInverseSignature(f) - return view_inverse_sig.decl() - - -# The below functions generate RegisterFunctionalization.cpp -# These files provide the kernels that run the functionalization pass, which can be opted into -# per backend (e.g. XLA or Vulkan), or as a composable transform (functionalize() in functorch). - -def needs_functionalization( - selector: SelectiveBuilder, - f: NativeFunction, -) -> bool: - return (selector.include_all_operators and - (f.is_view_op or modifies_arguments(f))) - - -def gen_functionalization_registration( - selector: SelectiveBuilder, - f: NativeFunction, - composite_implicit_autograd_index: BackendIndex -) -> Optional[str]: - @with_native_function - def emit_registration_helper(f: NativeFunction) -> Optional[str]: - # Note: for now, this logic is meant to avoid registering functionalization kernels for mobile. - # At some point, Vulkan we'll want to use functionalization and we'll need to change this. - if not needs_functionalization(selector, f): - return None - if f.is_view_op and f.has_composite_implicit_autograd_kernel: - metadata = composite_implicit_autograd_index.get_kernel(f) - assert metadata is not None - native_api_name = metadata.kernel - sig = DispatcherSignature.from_schema(f.func) - # Note [Composite view ops in the functionalization pass] - # We don't need to worry about implemententing functionalization kernels for views with - # CompositeImplicitAutograd kernels, because we can just decompose them into their base operators. - # We can't just opt the entire Functionalization dispatch key into the composite keyset though, - # because we don't want to decompose non-view ops that are composite, like `at::ones`. 
- registration_str = f'static_cast<{sig.ptr_type()}>(at::native::{native_api_name})' - else: - registration_str = f'TORCH_FN(functionalization::{wrapper_name(f.func)})' - - return f'm.impl("{f.func.name}", {registration_str});' - - return emit_registration_helper(f) - -def gen_functionalization_definition( - selector: SelectiveBuilder, - f: NativeFunction, - functional_op: Optional[NativeFunction] -) -> Optional[str]: - @with_native_function - def emit_definition_helper(f: NativeFunction) -> Optional[str]: - if not needs_functionalization(selector, f): - return None - if f.is_view_op and f.has_composite_implicit_autograd_kernel: - # See Note [Composite view ops in the functionalization pass] - return None - # order is important here, ops that are both views and mutations should hit the view path. - if f.is_view_op: - # Every view op is expected to have a functional counterpart (e.g. transpose_() -> transpose()) - assert functional_op is not None - body_str = emit_view_functionalization_body(f, functional_op) - else: - # inplace op - assert modifies_arguments(f) - body_str = emit_inplace_functionalization_body(f, functional_op) - sig = DispatcherSignature.from_schema(f.func) - return f""" - {sig.defn(name=wrapper_name(f.func), is_redispatching_fn=True)} {{ - {body_str} - }} - """ - - return emit_definition_helper(f) - -# See Note [Functionalization Pass: View Inverses]. -@with_native_function -def gen_functionalization_view_inverse_declaration(f: NativeFunction) -> Optional[str]: - # We only need to generate view_inverse declarations for view ops that: - # - aren't composite (since they'll decompose and we'll get them for free). - # - aren't inplace (since they should have a corresponding functional version, which we call instead). - if f.is_view_op and not f.has_composite_implicit_autograd_kernel and not modifies_arguments(f): - output = emit_declaration_for_noncomposite_views(f) - return output - return None diff --git a/tools/codegen/gen_lazy_tensor.py b/tools/codegen/gen_lazy_tensor.py deleted file mode 100644 index b2515d3d083c..000000000000 --- a/tools/codegen/gen_lazy_tensor.py +++ /dev/null @@ -1,227 +0,0 @@ -import pathlib -import argparse -import os -import yaml -from collections import namedtuple -from typing import List, Dict, Union, Sequence, Optional, Callable, Iterable, Iterator, Tuple -from tools.codegen.gen import get_grouped_native_functions, parse_native_yaml -from tools.codegen.model import (FunctionSchema, - NativeFunction, NativeFunctionsGroup, OperatorName) -from tools.codegen.selective_build.selector import SelectiveBuilder -from tools.codegen.utils import concatMap, YamlLoader, FileManager -import tools.codegen.dest as dest -from .gen_backend_stubs import (parse_backend_yaml, error_on_missing_kernels, - gen_dispatchkey_nativefunc_headers, - gen_dispatcher_registrations) - -# Parses the external backend's yaml, and adds a new BackendIndex for the backend's dispatch key. 
-# Returns a Tuple of (backend_key, autograd_key, cpp_namespace, updated BackendIndex mapping, full_codegen) -ParsedExternalYaml = namedtuple('ParsedExternalYaml', [ - 'backend_key', 'autograd_key', 'cpp_namespace', 'backend_indices', 'full_codegen']) - - -def parse_full_codegen_ops( - backend_yaml_path: str, - grouped_native_functions: Sequence[Union[NativeFunction, NativeFunctionsGroup]], -) -> List[OperatorName]: - - native_functions_map: Dict[OperatorName, NativeFunction] = { - f.func.name: f - for f in concatMap(lambda f: [f] if isinstance(f, NativeFunction) else list(f.functions()), grouped_native_functions) - } - - with open(backend_yaml_path, 'r') as f: - yaml_values = yaml.load(f, Loader=YamlLoader) - assert isinstance(yaml_values, dict) - - full_codegen = yaml_values.pop('full_codegen', []) - assert isinstance(full_codegen, list), f'expected "full_codegen" to be a list, but got: {full_codegen}' - full_codegen = [OperatorName.parse(name) for name in full_codegen] - - return full_codegen - - -def main() -> None: - parser = argparse.ArgumentParser(description='Generate Lazy Tensor backend files') - parser.add_argument( - '-s', - '--source_yaml', - help='path to source yaml file containing operator external definitions') - parser.add_argument( - '-o', '--output_dir', help='output directory') - parser.add_argument( - '--dry_run', type=bool, default=False, help='output directory') - parser.add_argument( - '--impl_path', type=str, default=None, help='path to the source C++ file containing kernel definitions') - parser.add_argument( - '--gen_ts_lowerings', action="store_true", help='Generate TorchScript lowerings in addition to Lazy IR and NativeFunctions') - parser.add_argument( - '--node_base', type=str, default="Node", help='Name of backend specific custom Lazy IR Node base class') - parser.add_argument( - '--node_base_hdr', type=str, default=None, help='Path to header file defining custom Lazy IR Node base class') - parser.add_argument( - '--tensor_class', type=str, default="LazyTensor", help='Name of backend specific custom Lazy Tensor class') - parser.add_argument( - '--tensor_class_hdr', type=str, default="lazy_tensor_core/csrc/tensor.h", - help='Path to header file defining custom Lazy Tensor class') - options = parser.parse_args() - - run(options.source_yaml, options.output_dir, options.dry_run, options.impl_path, - options.gen_ts_lowerings, options.node_base, options.node_base_hdr, - options.tensor_class, options.tensor_class_hdr) - - -def run(source_yaml: str, output_dir: str, dry_run: bool, impl_path: Optional[str], - gen_ts_lowerings: bool, node_base: str, node_base_hdr: Optional[str], - tensor_class: str, tensor_class_hdr: str) -> None: - - # Assumes that this file lives at PYTORCH_ROOT/tools/codegen/gen_backend_stubs.py - pytorch_root = pathlib.Path(__file__).parent.parent.parent.absolute() - template_dir = os.path.join(pytorch_root, "aten/src/ATen/templates") - - def make_file_manager(install_dir: str) -> FileManager: - return FileManager(install_dir=install_dir, template_dir=template_dir, dry_run=dry_run) - - fm = make_file_manager(output_dir) - - native_yaml_path = os.path.join(pytorch_root, 'aten/src/ATen/native/native_functions.yaml') - parsed_yaml = parse_native_yaml(native_yaml_path) - native_functions, backend_indices = parsed_yaml.native_functions, parsed_yaml.backend_indices - grouped_native_functions = get_grouped_native_functions(native_functions) - - def sort_native_function(f: Union[NativeFunctionsGroup, NativeFunction]) -> str: - """ - We sort the native 
function because of the note in concat_map_codegen. - TODO(alanwaketan): Remove this sorting hack once all ops are grouped properly. - """ - func = f.functional.func if isinstance(f, NativeFunctionsGroup) else f.func - return str(func.name.name) - - grouped_native_functions = sorted(grouped_native_functions, key=sort_native_function) - parsed_backend_yaml = parse_backend_yaml(source_yaml, grouped_native_functions, backend_indices) - backend_key = parsed_backend_yaml.backend_key - autograd_key = parsed_backend_yaml.autograd_key - cpp_namespace = parsed_backend_yaml.cpp_namespace - backend_indices = parsed_backend_yaml.backend_indices - full_codegen = parse_full_codegen_ops(source_yaml, grouped_native_functions) - - def concat_map_codegen(func: Callable[[NativeFunction], Sequence[str]], - xs: Iterable[Union[NativeFunctionsGroup, NativeFunction]], - *, codegenInplaceVariant: bool = False) -> Iterator[str]: - """ - We code-gen for the functional variant, which is all we need for IR classes/lowerings/shape inferences, but we - only code-gen additional entries for the inplace variant for the native functions. - Note: If xs is not sorted, there may be an edge case when generating IR classes. Considering relu and relu_, if - we encounter relu_ before relu. we will then generate an IR class with op = at::aten::relu_ for both relu and - relu_ which will cause problems for relu. - TODO(alanwaketan): Once all ops are grouped properly, we should no longer need this hack. - """ - generated = set() - - def gen_key(func: FunctionSchema) -> Tuple[str, str]: - # we want to generate unique entries for overloads of functional variants, - # but not for inplace variants unless explicitly told `codegenInplaceVariant` - return (func.name.name.base, func.name.overload_name) - - for x in xs: - f = x.functional if isinstance(x, NativeFunctionsGroup) else x - # For the 'or'd terms: - # 1. codegenInplaceVariant means we can generate the in-place variant corresponding items. - # 2. not f.func.name.name.inplace means the op is not a in-place variant, so we can generate the item. - # 3. f.func.name.name.base not in generated means even for in-place ops we still need to generate the item - # as if they were the functional variants for one time. 
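As a rough sketch of the dedup rule spelled out in the comment above (the op names and the base-name heuristic here are simplifications, not the real gen_key logic): sorting puts the functional variant first, and a seen-set keyed on the base name keeps an inplace variant such as relu_ from shadowing relu.

from typing import Iterable, Iterator

def unique_functional_first(op_names: Iterable[str]) -> Iterator[str]:
    """Yield one representative per base name, preferring the functional variant."""
    seen = set()
    for name in sorted(op_names):        # 'relu' sorts before 'relu_'
        base = name.rstrip('_')          # crude stand-in for func.name.name.base
        if base not in seen:
            seen.add(base)
            yield name

print(list(unique_functional_first(['relu_', 'tanh', 'relu', 'tanh_'])))
# ['relu', 'tanh']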
- if f.func.name in full_codegen and \ - (codegenInplaceVariant or not f.func.name.name.inplace or gen_key(f.func) not in generated): - generated.add(gen_key(f.func)) - for r in func(f): - yield r - - selector = SelectiveBuilder.get_nop_selector() - - assert backend_key is not None - class_name = backend_indices[backend_key].native_function_class_name() - - if impl_path is not None: - error_on_missing_kernels(native_functions, backend_indices, backend_key, - autograd_key, impl_path, full_codegen) - - assert class_name is not None - - # Generate nativefunction declarations - gen_dispatchkey_nativefunc_headers(fm, class_name, cpp_namespace, backend_indices, - grouped_native_functions, backend_key, autograd_key) - - # Generate Dispatcher registrations which hook up the nativefunctions - for dispatch_key in [backend_key] if autograd_key is None else [backend_key, autograd_key]: - gen_dispatcher_registrations(fm, output_dir, cpp_namespace, backend_indices, grouped_native_functions, - backend_key, dispatch_key, selector) - - # Generate native function impls that build IR nodes - fm.write_with_template(f'{backend_key}NativeFunctions.cpp', 'DispatchKeyNativeFunctions.cpp', lambda: { - 'includes': [f'#include <{path}>' for path in [ - tensor_class_hdr, - "ATen/MetaFunctions.h", - "torch/csrc/lazy/core/metrics.h", - "torch/csrc/lazy/core/shape.h", - "lazy_tensor_core/csrc/aten_ltc_bridge.h", - "lazy_tensor_core/csrc/lazy_graph_executor.h", - f"{output_dir}/{backend_key}NativeFunctions.h", - f"{output_dir}/{backend_key}LazyIr.h", - f"{output_dir}/{backend_key}ShapeInference.h", - ]], - 'native_functions_include': '', - 'backend_namespace': 'torch_lazy_tensors', # this is wrong - 'native_function_definitions': - list(concat_map_codegen( - dest.GenLazyNativeFuncDefinition(f'{backend_key}NativeFunctions', - backend_indices[backend_key], - tensor_class), - grouped_native_functions, - codegenInplaceVariant=True - )), - }) - # Generate headers for shape/dtype funcs for non-meta kernels - fm.write_with_template(f'{backend_key}ShapeInference.h', 'ShapeInference.h', lambda: { - 'lazy_ir_sysinc': [f'#include <{path}>' for path in [ - "ATen/Tensor.h", - "c10/core/ScalarType.h", - "c10/util/Optional.h", - "torch/csrc/lazy/core/ir.h", - "torch/csrc/lazy/core/shape.h", - "vector", - ]], - 'lazy_ir_inc': [], - 'DispatchKey': backend_key, - 'dispatch_namespace': backend_key.lower(), - 'func_declarations': list(concat_map_codegen( - dest.GenLazyShapeInferenceDefinition(backend_indices[backend_key], - tensor_class), - grouped_native_functions - )), - }) - # Generate IR node classes - fm.write_with_template(f'{backend_key}LazyIr.h', 'LazyIr.h', lambda: { - 'lazy_ir_sysinc': [f'#include <{path}>' for path in [ - "ATen/core/Formatting.h", - "c10/core/ScalarType.h", - "c10/util/Optional.h", - "torch/csrc/lazy/core/hash.h", - "torch/csrc/lazy/core/ir.h", - "vector", - ]], - 'lazy_ir_inc': [f'#include "{path}"' for path in [ - node_base_hdr if node_base_hdr is not None else None - ] if path is not None], - 'external_backend_headers': f'#include "{output_dir}/{backend_key}NativeFunctions.h"', - 'namespaced_headers': '', - 'DispatchKey': backend_key, - 'dispatch_namespace': backend_key.lower(), - 'ir_declarations': list(concat_map_codegen( - dest.LazyIR(backend_indices[backend_key], node_base), - grouped_native_functions - )), - }) - - -if __name__ == '__main__': - main() diff --git a/tools/codegen/model.py b/tools/codegen/model.py deleted file mode 100644 index 6bc0d7df1002..000000000000 --- a/tools/codegen/model.py +++ 
/dev/null @@ -1,1639 +0,0 @@ -import re - -from tools.codegen.utils import assert_never - -from dataclasses import dataclass -from typing import List, Dict, Optional, Iterator, Tuple, Set, Sequence, Callable, Union -from enum import Enum, auto -import itertools - -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# -# DATA MODEL -# -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# -# Some general principles for our data model. -# -# - Stop using C++ data types as the internal data representation -# format. Instead, the internal data structures are centered -# around JIT schema representation. This avoid a big problem -# with the old codegen where we read in all the types from -# native_functions.yaml and then immediately had to retranslate -# them into C++ types. -# -# - More semantic data representation. Instead of representing -# everything as dicts and strings, we define dataclasses for -# every interesting entity the code generation has to deal with. -# These dataclasses have strong semantic invariants: for example, -# we generally require them to roundtrip losslessly into the -# form they were parsed from. These structures are immutable -# and you're expected to populate information once during -# construction. - -# Represent a source location; used for better error reporting -@dataclass(frozen=True) -class Location: - file: str - line: int - - def __str__(self) -> str: - return "{}:{}".format(self.file, self.line) - -# Valid values of the 'variants' field in native_functions.yaml -Variant = Enum('Variant', ('function', 'method')) - -# NOTE: Keep the list in sync with `DispatchKey` in c10/core/DispatchKey.h -class DispatchKey(Enum): - Undefined = 0 - CatchAll = Undefined - - CPU = auto() - CUDA = auto() - HIP = auto() - FPGA = auto() - ORT = auto() - XLA = auto() - Lazy = auto() - Vulkan = auto() - Metal = auto() - XPU = auto() - MKLDNN = auto() - OpenGL = auto() - OpenCL = auto() - IDEEP = auto() - QuantizedCPU = auto() - QuantizedCUDA = auto() - QuantizedXPU = auto() - CustomRNGKeyId = auto() - MkldnnCPU = auto() - SparseCPU = auto() - SparseCUDA = auto() - SparseCsrCPU = auto() - SparseCsrCUDA = auto() - SparseHIP = auto() - SparseXPU = auto() - NestedTensor = auto() - PrivateUse1 = auto() - PrivateUse2 = auto() - PrivateUse3 = auto() - EndOfBackendKeys = PrivateUse3 - - ZeroTensor = auto() - Meta = auto() - BackendSelect = auto() - Named = auto() - AutogradOther = auto() - AutogradCPU = auto() - AutogradCUDA = auto() - AutogradXLA = auto() - AutogradLazy = auto() - AutogradNestedTensor = auto() - AutogradXPU = auto() - AutogradPrivateUse1 = auto() - AutogradPrivateUse2 = auto() - AutogradPrivateUse3 = auto() - Tracer = auto() - Autocast = auto() - Batched = auto() - VmapMode = auto() - TESTING_ONLY_GenericWrapper = auto() - TESTING_ONLY_GenericMode = auto() - NumDispatchKeys = auto() - Autograd = auto() - CompositeImplicitAutograd = auto() - CompositeExplicitAutograd = auto() - EndOfAliasKeys = CompositeExplicitAutograd - - CPUTensorId = CPU - CUDATensorId = CUDA - PrivateUse1_PreAutograd = AutogradPrivateUse1 - PrivateUse2_PreAutograd = AutogradPrivateUse2 - PrivateUse3_PreAutograd = AutogradPrivateUse3 - - def __str__(self) -> str: - return self.name - - def lower(self) -> str: - return str(self).lower() - - @staticmethod - def parse(value: str) -> 'DispatchKey': - for k, v in DispatchKey.__members__.items(): - if k == value: - return v - raise AssertionError(f'unknown dispatch key {value}') - -STRUCTURED_DISPATCH_KEYS = 
{DispatchKey.CUDA, DispatchKey.CPU} - -# Dispatch keys that "support all backends". These codegen slightly differently -# then backend specific keys. -def is_generic_dispatch_key(dk: DispatchKey) -> bool: - return dk in {DispatchKey.CompositeExplicitAutograd, DispatchKey.CompositeImplicitAutograd} - -# CUDA specific dispatch keys -def is_cuda_dispatch_key(dk: DispatchKey) -> bool: - return dk in { - DispatchKey.CUDA, - DispatchKey.QuantizedCUDA, - DispatchKey.SparseCUDA, - DispatchKey.SparseCsrCUDA, - DispatchKey.AutogradCUDA, - DispatchKey.CUDATensorId, - } - -# Structured kernel generation is only supported for certain key types; -# otherwise use old-style -def is_structured_dispatch_key(dk: DispatchKey) -> bool: - return dk in STRUCTURED_DISPATCH_KEYS - -class DeviceCheckType(Enum): - NoCheck = 0 - ExactSame = 1 - -class Tag(Enum): - inplace_view = 0 - - def __str__(self) -> str: - return self.name - - @staticmethod - def parse(value: str) -> 'Tag': - for k, v in Tag.__members__.items(): - if k == value: - return v - raise AssertionError(f'unknown tag {value}') - -# The basic input to the code generation is native_functions.yaml. -# The name "native", BTW, comes from the distinction between native -# functions and legacy TH functions. The legacy TH functions are gone, -# but the "native" descriptor has stuck. -# -# NativeFunction models a single entry in native_functions.yaml. Its -# fields roughly correspond to what you would see in the YAML itself, -# but after canonicalization and parsing has occurred. -# -# You can see some of the overall design patterns for how we setup -# dataclasses in this class, but we will defer a complete discussion -# of this at FunctionSchema. -@dataclass(frozen=True) -class NativeFunction: - # The function schema of the operator in question. This schema - # has been parsed; see FunctionSchema for more about its structure. - # (This type is quoted as we are forward referencing a type - # defined later in the file. I opted for this ordering of the - # classes for expository clarity.) - func: 'FunctionSchema' - - # Whether or not to generate mutable tensor arguments like regular - # ones - use_const_ref_for_mutable_tensors: bool - - # Whether or not to omit automatic generation of a DeviceGuard - device_guard: bool - - # How to emit automatic generation of device check - device_check: DeviceCheckType - - # What python module to put the function in - python_module: Optional[str] - - # TODO: figure out what this does - category_override: Optional[str] - - # If no variants are specified in native_functions.yaml, this is - # assumed to be {'function'}. - variants: Set[Variant] - - # Whether or not we should skip generating registrations for - # this kernel. This is a bit of a double-edged sword, as manual - # registrations don't participate in codegen-based selective build! - manual_kernel_registration: bool - - # Whether or not to skip generating TensorMethod/Functions bindings - # for this kernel. Technically, this doesn't actually skip generating - # the binding; instead, the binding gets generated to __dispatch_{funcname} - # so you can make use of the normal binding if you need it. - manual_cpp_binding: bool - - # The location in the YAML file were this native function entry was - # defined. This is for conveniently reporting error messages! - loc: 'Location' - - # Whether or not this out functions is a "structured kernel". 
Structured - # kernels are defined a little differently from normal kernels; in - # particular, their shape checking logic is defined separately from - # the kernel. Only out functions can be structured; other functions - # delegate to the out function using the structured_delegate keyword. - # Every structured kernel must have at least an out and a functional - # variant. - structured: bool - - # Whether or not this non-out function is a structured kernel, defined - # in terms of the out kernel referenced by the string here. - structured_delegate: Optional['OperatorName'] - - # Only valid for structured kernels. Specifies alternative of what - # to inherit from when defining the meta class for the structured - # operator. This will usually be TensorIteratorBase. This also - # changes the semantics of set_output to call the parent class. - structured_inherits: Optional[str] - - # Structured kernels can declare elements as "precomputed". These elements - # are returned by the meta function in one struct and passed to the impl - # function in lieu of certain kernel arguments that these precomputed - # elements supersede. Information about the names and types of these - # precomputed elements and how they correspond to kernel arguments is stored - # in this member, if applicable. - precomputed: Optional['Precompute'] - - # Argument names whose default should be excluded from the C++ interface. - # Intended for resolving overload ambiguities between signatures. - cpp_no_default_args: Set[str] - - # Note [Abstract ATen methods] - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # An abstract ATen method is one whose dispatch differs between - # types. These are implemented in derived types (with a - # standard (throwing) definition in Type). A concrete ATen - # method is one which has the same dispatch for all types; - # we just implement it in the base Type. This is exposed - # in Declarations.yaml via a field named 'abstract'. - is_abstract: bool - - # Whether or not the NativeFunction contains a backend-agnostic kernel - has_composite_implicit_autograd_kernel: bool - has_composite_explicit_autograd_kernel: bool - - # Tags are used to describe semantic information about (groups of) operators, - # That aren't easily inferrable directly from the operator's schema. - # For now operators have at most one tag. - tag: Optional['Tag'] - - # NB: The benefit of defining a dataclass is that we automatically get - # a constructor defined for all the fields we specify. No need - # to explicitly write it out. - - # We parse both the NativeFunction + backend-specific information about it, which it stored in a corresponding BackendIndex. 
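For orientation, a hedged sketch of the kind of native_functions.yaml entry that from_yaml (below) consumes; the kernel names are hypothetical. The backend-agnostic fields populate the NativeFunction, while the dispatch table becomes the per-backend metadata that is later folded into a BackendIndex.

import yaml

entry = yaml.safe_load("""
func: my_op(Tensor self, Tensor other) -> Tensor
variants: function, method
dispatch:
  CPU: my_op_cpu
  CUDA: my_op_cuda
""")

# from_yaml(entry, loc) would roughly return:
#   - a NativeFunction carrying the parsed FunctionSchema, variants, flags, ...
#   - {DispatchKey.CPU:  {my_op: BackendMetadata(kernel='my_op_cpu',  ...)},
#      DispatchKey.CUDA: {my_op: BackendMetadata(kernel='my_op_cuda', ...)}}
print(entry['func'], entry['dispatch'])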
- @staticmethod - def from_yaml( - ei: Dict[str, object], - loc: 'Location' - ) -> Tuple['NativeFunction', Dict[DispatchKey, Dict['OperatorName', 'BackendMetadata']]]: - """ - Parse a NativeFunction from a dictionary as directly parsed - from native_functions.yaml - """ - e = ei.copy() - - funcs = e.pop('func') - assert isinstance(funcs, str), f'not a str: {funcs}' - func = FunctionSchema.parse(funcs) - - cpp_no_default_args_list = e.pop('cpp_no_default_args', []) - assert isinstance(cpp_no_default_args_list, list) - cpp_no_default_args = set(cpp_no_default_args_list) - - use_const_ref_for_mutable_tensors = e.pop('use_const_ref_for_mutable_tensors', False) - assert isinstance(use_const_ref_for_mutable_tensors, bool) - - variants_s = e.pop('variants', 'function') - assert isinstance(variants_s, str) - variants: Set[Variant] = set() - for v in variants_s.split(', '): - if v == 'function': - variants.add(Variant.function) - elif v == 'method': - variants.add(Variant.method) - else: - raise AssertionError(f'illegal variant {v}') - - manual_kernel_registration = e.pop('manual_kernel_registration', False) - assert isinstance(manual_kernel_registration, bool), f'not a bool: {manual_kernel_registration}' - - manual_cpp_binding = e.pop('manual_cpp_binding', False) - assert isinstance(manual_cpp_binding, bool), f'not a bool: {manual_cpp_binding}' - - device_guard = e.pop('device_guard', True) - assert isinstance(device_guard, bool), f'not a bool: {device_guard}' - - device_check_s = e.pop('device_check', None) - assert device_check_s is None or isinstance(device_check_s, str), f'not a str: {device_check_s}' - device_check: DeviceCheckType - if device_check_s is None: - device_check = DeviceCheckType.ExactSame - else: - device_check = DeviceCheckType[device_check_s] - - structured = e.pop('structured', False) - assert isinstance(structured, bool), f'not a bool: {structured}' - - structured_delegate_s = e.pop('structured_delegate', None) - assert structured_delegate_s is None or isinstance(structured_delegate_s, str), f'not a str: {structured_delegate}' - structured_delegate: Optional[OperatorName] = None - if structured_delegate_s is not None: - structured_delegate = OperatorName.parse(structured_delegate_s) - - structured_inherits = e.pop('structured_inherits', None) - assert structured_inherits is None or isinstance(structured_inherits, str), f'not a str: {structured_inherits}' - - python_module = e.pop('python_module', None) - assert python_module is None or isinstance(python_module, str), f'not a str: {python_module}' - - category_override = e.pop('category_override', None) - assert category_override is None or isinstance(category_override, str), f'not a str: {category_override}' - - precomputed_dict = e.pop('precomputed', None) - assert precomputed_dict is None or structured is True - precomputed = Precompute.parse(precomputed_dict) if precomputed_dict else None - - tag_str = e.pop('tags', None) - assert tag_str is None or isinstance(tag_str, str), f'not a str: {tag_str}' - tag = Tag.parse(tag_str) if tag_str else None - - from tools.codegen.api import cpp - - raw_dispatch = e.pop('dispatch', None) - assert raw_dispatch is None or isinstance(raw_dispatch, dict), e - dispatch: Dict[DispatchKey, str] = {} - if raw_dispatch is not None: - assert not manual_kernel_registration, \ - "cannot specify both manual_kernel_registration and dispatch; with " \ - "manual registration, dispatch has no effect!" 
- for ks, v in raw_dispatch.items(): - if ks == '__line__': - continue # not worth tracking line numbers for dispatch entries - assert isinstance(ks, str), e - assert isinstance(v, str), e - for k in ks.split(","): - dispatch_key = DispatchKey.parse(k.strip()) - dispatch[dispatch_key] = v - assert dispatch != {DispatchKey.CompositeImplicitAutograd: cpp.name(func)}, \ - "unnecessary dispatch table for this function; just delete the dispatch " \ - "key entirely" - # if a function is a structured delegate, deleting the dispatch - # table is NOT semantics preserving - assert structured_delegate or dispatch.keys() != {DispatchKey.CompositeImplicitAutograd}, \ - f"unexpected name for singleton CompositeImplicitAutograd dispatch entry: expected {cpp.name(func)} " \ - f"but got {dispatch[DispatchKey.CompositeImplicitAutograd]}. Rename your implementation to the expected " \ - "name, then delete the dispatch table" - elif not structured and structured_delegate is None: - dispatch[DispatchKey.CompositeImplicitAutograd] = cpp.name(func) - - assert not (DispatchKey.CompositeExplicitAutograd in dispatch and DispatchKey.CompositeImplicitAutograd in dispatch), \ - "cannot specify both CompositeExplicitAutograd and CompositeImplicitAutograd on a single kernel; each " \ - "strictly subsumes the other. If you wanted to provide an explicit autograd " \ - "implementation, specify CompositeExplicitAutograd; otherwise specify CompositeImplicitAutograd only" - - if structured_delegate: - # Structured functions MUST have a dispatch table - is_abstract = True - else: - is_abstract = dispatch.keys() != {DispatchKey.CompositeImplicitAutograd} - - has_composite_implicit_autograd_kernel = DispatchKey.CompositeImplicitAutograd in dispatch.keys() - has_composite_explicit_autograd_kernel = DispatchKey.CompositeExplicitAutograd in dispatch.keys() - - # BackendMetadata is used to store any information about a NativeFunction that is backend dependent. - # The most obvious information is the kernel name, which usually contains the name of the backend in it for cpu/cuda. - # Why is 'structured' included? External backends (e.g. 
XLA) opt into which ops are structured - # independently of which in-tree ops are structured - backend_metadata = {k: {func.name: BackendMetadata( - kernel=v, structured=structured and is_structured_dispatch_key(k))} for k, v in dispatch.items()} - - # don't care if it exists or not; make it easier to use this function - # with other yaml parsers that aren't setting __line__ in the dict - e.pop('__line__', None) - assert not e, f"leftover entries: {e}" - - # Asserts that we can't do in post_init, because they rely on backend-specific info - if structured_delegate is not None: - for key in STRUCTURED_DISPATCH_KEYS: - assert key not in dispatch, \ - f"if structured_delegate, then must not have {key} in dispatch dictionary " \ - "(it is delegated!)" - - return NativeFunction( - func=func, - use_const_ref_for_mutable_tensors=use_const_ref_for_mutable_tensors, - variants=variants, - structured=structured, - structured_delegate=structured_delegate, - structured_inherits=structured_inherits, - precomputed=precomputed, - manual_kernel_registration=manual_kernel_registration, - manual_cpp_binding=manual_cpp_binding, - python_module=python_module, - category_override=category_override, - device_guard=device_guard, - device_check=device_check, - loc=loc, - cpp_no_default_args=cpp_no_default_args, - is_abstract=is_abstract, - has_composite_implicit_autograd_kernel=has_composite_implicit_autograd_kernel, - has_composite_explicit_autograd_kernel=has_composite_explicit_autograd_kernel, - tag=tag, - ), backend_metadata - - - def validate_unstructured(self) -> None: - # TODO: probably better to accumulate these errors and report them all - # at once - assert not self.structured, "This function is structured, but there was " \ - "no valid functional variant of it." - assert self.structured_delegate, "This function delegates to another structured out function, " \ - "but no valid function was found (the delegate may not exist, or it has the wrong type)" - - # __post_init__ functions in dataclasses can be used to do extra - # validation after construction. - # - # Notice that we don't do any type validation here. In fact, we - # rely exclusively on mypy to check if you've done types correctly! - # Validation is for nontrivial invariants that cannot be (conveniently) - # encoded in the type system. - def __post_init__(self) -> None: - if self.func.arguments.out: - assert self.variants == {Variant.function}, "Native functions with out arguments MUST " \ - "be declared with only function variant; e.g., variants: function; " \ - "otherwise you will tickle a Python argument binding bug " \ - "(which usually manifests itself as the result variable being undefined.)" - if self.structured: - assert self.func.kind() == SchemaKind.out, "Put structured field on the out= " \ - "variant of a function; did you mean structured_delegate?" - assert self.device_guard, "device_guard: False is not respected by structured kernels" - if self.structured_delegate: - assert self.func.kind() != SchemaKind.out, "structured_delegate field not allowed " \ - "on out= functions; did you mean structured?" 
- assert self.device_guard, "device_guard: False is not respected by structured kernels" - # Technically, with the asserts above, this assert is impossible to - # happen - assert not (self.structured and self.structured_delegate), \ - "Cannot have both structured and structured_delegate on function" - defaulted_arguments = {a.name for a in self.func.schema_order_arguments() - if a.default is not None} - invalid_args = set.difference(self.cpp_no_default_args, defaulted_arguments) - assert len(invalid_args) == 0, f'Invalid cpp_no_default_args: {invalid_args}' - if self.structured_inherits is not None: - assert self.structured, "structured_inherits must also imply structured: True" - if str(self.func.name).startswith('_foreach'): - assert self.device_check == DeviceCheckType.NoCheck, \ - "foreach kernels fall back to slow path when tensor are on different devices, " \ - "device_check not allowed to be enabled" - - @property - def has_composite_kernel(self) -> bool: - return self.has_composite_implicit_autograd_kernel or self.has_composite_explicit_autograd_kernel - - @property - def is_view_op(self) -> bool: - rets = self.func.returns - is_non_mutating_view = len(rets) > 0 and any(r.annotation is not None and not r.annotation.is_write for r in rets) - is_inplace_view = self.tag is not None and self.tag is Tag.inplace_view - is_wildcard_view = any(inp.annotation is not None and - inp.annotation.alias_set_after != "" for inp in self.func.schema_order_arguments()) - return is_non_mutating_view or is_inplace_view or is_wildcard_view - - @property - def root_name(self) -> str: - return self.func.name.name.base - -SchemaKind = Enum('SchemaKind', ('functional', 'inplace', 'out')) - -# A structured kernel is guaranteed to have a functional and out variant, and -# optionally an inplace variant. -# -# NB: we create NativeFunctionsGroup *even if* the function is not -# actually annotated structured. Test the structured boolean to see if it -# actually is structured or not. -@dataclass(frozen=True) -class NativeFunctionsGroup: - functional: NativeFunction - inplace: Optional[NativeFunction] - out: NativeFunction - - @property - def structured(self) -> bool: - # Whether or not the operator has a meta() function. This information is backend-agnostic. 
- return self.out.structured - - def __post_init__(self) -> None: - test_sig: FunctionSchema = self.functional.func.signature() - for f in self.functions(): - if test_sig != f.func.signature(): - raise AssertionError( - "NativeFunctionsGroup constructed from two NativeFunctions " - f"that don't have matching signatures: {test_sig} != {f.func.signature()}" - ) - assert self.functional.func.kind() == SchemaKind.functional - assert self.out.func.kind() == SchemaKind.out - if self.inplace is not None: - assert self.inplace.func.kind() == SchemaKind.inplace - - if self.structured: - # For now, structured composite kernels are not supported (need some - # design work to figure out how to make the composite case work) - assert not self.out.has_composite_implicit_autograd_kernel - - assert self.functional.structured_delegate == self.out.func.name, \ - f"{self.functional.func.name} delegates to {self.functional.structured_delegate} " \ - f"but its actual delegate is {self.out.func.name}" - if self.inplace is not None: - assert self.inplace.structured_delegate == self.out.func.name - - def signature(self) -> 'FunctionSchema': - return self.out.func.signature() - - def functions(self) -> Iterator[NativeFunction]: - yield self.functional - yield self.out - if self.inplace is not None: - yield self.inplace - - @property - def root_name(self) -> str: - return self.functional.root_name - - @staticmethod - def from_dict(d: Dict[SchemaKind, NativeFunction]) -> Optional['NativeFunctionsGroup']: - assert d - if len(d) == 1: - return None - d = dict(d) # non-destructive updates please - functional = d.pop(SchemaKind.functional, None) - inplace = d.pop(SchemaKind.inplace, None) - out = d.pop(SchemaKind.out, None) - assert not d - assert functional is not None - # There are a few operators which only have functional/inplace variants; - # these don't count as structured for our purposes here - if out is None: - return None - - return NativeFunctionsGroup( - functional=functional, - inplace=inplace, - out=out, - ) - -def is_foreach_op(name: str) -> bool: - return str(name) in set([ - '_amp_foreach_non_finite_check_and_unscale_', - '_foreach_add_.ScalarList', - '_foreach_sub_.ScalarList', - '_foreach_mul_.ScalarList', - '_foreach_div_.ScalarList', - '_foreach_add_.Scalar', - '_foreach_sub_.Scalar', - '_foreach_mul_.Scalar', - '_foreach_div_.Scalar', - '_foreach_add_.List', - '_foreach_sub_.List', - '_foreach_mul_.List', - '_foreach_div_.List', - '_foreach_exp_', - '_foreach_sqrt_', - '_foreach_abs_', - '_foreach_acos_', - '_foreach_asin_', - '_foreach_atan_', - '_foreach_ceil_', - '_foreach_cos_', - '_foreach_cosh_', - '_foreach_erf_', - '_foreach_erfc_', - '_foreach_expm1_', - '_foreach_floor_', - '_foreach_log_', - '_foreach_log10_', - '_foreach_log1p_', - '_foreach_log2_', - '_foreach_neg_', - '_foreach_tan_', - '_foreach_tanh_', - '_foreach_sin_', - '_foreach_sinh_', - '_foreach_round_', - '_foreach_lgamma_', - '_foreach_frac_', - '_foreach_reciprocal_', - '_foreach_sigmoid_', - '_foreach_trunc_', - '_foreach_addcmul_.Scalar', - '_foreach_addcdiv_.Scalar', - '_foreach_addcmul_.ScalarList', - '_foreach_addcdiv_.ScalarList', - '_foreach_zero_']) - -@dataclass(frozen=True) -class BackendMetadata: - # The name of the backend kernel, for a given operator - # for in-tree backends. These names come directly from the 'dispatch" field - # in native_functions.yaml. 
The dispatch entry is optional; in that - # case, that is equivalent to having written: - # - # dispatch: - # CompositeImplicitAutograd: $operator_name - kernel: str - # Whether or not the operator has a structured kernel implemented, for this particular backend. - # For in-tree backends, they all have the same value for structured- this is listed - # in native_functions.yaml. - # However, external backends like XLA can indendently toggle which ops are structured. - structured: bool - # - - -# BackendIndex represents a backend. -# The BackendIndex encodes per-operator information that is potentially different -# for each backend. The most obvious example is the name of the kernel -# (the 'dispatch' entry in native_functions.yaml). -# However, there can be other examples of different backends having different information. -# External backends can choose to opt their kernels to be structured independently from in-tree backends, -# which means that this information isn't inherentely tied to a NativeFunction- it's different per backend. -@dataclass(frozen=True) -class BackendIndex: - dispatch_key: DispatchKey - # Mainly important for structured kernels, this determines which variant in the operator group is used to implement the others. - # All in-tree ops use out kernels, while XLA uses functional kernels. - use_out_as_primary: bool - # Whether the backend requires a device guard, and device checks. - # For in-tree backends, this is currently just CUDA/HIP - # For out-of-tree backends, this is currently just Intel XPU - device_guard: bool - # Whether the backend is in-tree (CPU/CUDA) or out-of-tree (XLA) - external: bool - # Other backend-specific information that is on a per-operator basis - index: Dict['OperatorName', BackendMetadata] - - @staticmethod - def grow_index( - parent_index: Dict[DispatchKey, Dict['OperatorName', BackendMetadata]], - child_index: Dict[DispatchKey, Dict['OperatorName', BackendMetadata]] - ) -> None: - for k, v in child_index.items(): - for op_name, metadata in v.items(): - assert op_name not in parent_index[k], f'duplicate operator {op_name} for dispatch key {k}' - parent_index[k][op_name] = metadata - - def primary(self, g: NativeFunctionsGroup) -> NativeFunction: - if self.use_out_as_primary: - return g.out - else: - return g.functional - - def has_kernel(self, g: Union[NativeFunction, NativeFunctionsGroup]) -> bool: - m = self.get_kernel(g) - return m is not None - - - def get_kernel(self, g: Union[NativeFunction, NativeFunctionsGroup]) -> Optional[BackendMetadata]: - if isinstance(g, NativeFunction): - f = g - elif isinstance(g, NativeFunctionsGroup): - f = self.primary(g) - else: - assert_never(f) - if f.func.name not in self.index: - return None - return self.index[f.func.name] - - def native_function_class_name(self) -> Optional[str]: - if self.external: - return f'{str(self.dispatch_key)}NativeFunctions' - else: - # TODO: This discrepancy isn't required; we could also generated - # a class for in-tree kernels. It'll just require carefully - # updating every kernel definition + callsite of every in-tree aten kernel. - return None - - -# The function schema is undoubtedly the most important data structure -# in all of the codegen, as it defines the type signature for operators, -# and most of the code generation we do is type directed (e.g., look at -# the types, decide what to do. Think about how we code generate -# C++ function stubs!) -# -# We will also see in this class the general structure for how we model -# data in this code generation. 
A few notable properties to point out -# ahead of time: -# -# - These dataclasses are a *lossless* representation of the strings -# they are parsed from. In fact, we assert that given the -# information stored in the dataclass, we can exactly reconstruct -# the string we parsed from (and assert this inside the parse -# definition). There are a few reasons for this: -# -# - If you find that it is difficult to reconstruct the string -# given a dataclass, that is a clue that you are data -# representation is wrong. -# -# - It helps ensure that all relevant information is present -# in the dataclass, so that downstream users aren't tempted -# to reparse the original string to get some information -# that was omitted. -# -# - It forces you to represent the data in-memory in the same way -# it is recorded textually, which makes the dataclasses easier -# to understand for someone who is familiar with the -# textual format. (As a tradeoff, it means you have to model -# the syntax, even when it is inconvenient. But maybe that means -# the syntax is bad!) If you don't understand the internal -# representation, go look at the printing code to see how -# it maps onto the surface syntax! -# -# - It makes it easy to test the parsing code, as parsing code -# that is inconsistent with the string code will fail early -# and loudly. (As a tradeoff, it makes the parsing code a bit -# brittle (in particular, with trivial whitespace changes you -# are likely to trigger an assert error). -# -# In general, try to make the __str__ code as simple as possible -# (even at the cost of more complex parsing logic.) Additionally, -# try to minimize redundancy in data representation. (Precomputed -# fields are OK though: they are defined as a simple function on -# the canonical representation in question.) -# -# - These dataclasses are all frozen; once constructed their -# values never change. This makes it easy to tell where any -# given data came from: just look to the constructor. As a -# tradeoff, you can't easily "decorate" a schema with extra -# information from a post-facto analysis. We impose this -# restriction to make these structures more understandable. -# -@dataclass(frozen=True) -class FunctionSchema: - # The name of the operator this function schema describes. - name: 'OperatorName' - - arguments: 'Arguments' - - # TODO: Need to handle collisions with argument names at some point - returns: Tuple['Return', ...] 
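To make the "lossless representation" point above concrete, here is a minimal, self-contained toy (not the real FunctionSchema, which models names, arguments, and returns as structured dataclasses) that parses a schema string and asserts the same round-trip invariant that FunctionSchema.parse enforces below.

from dataclasses import dataclass
from typing import Tuple

@dataclass(frozen=True)
class MiniSchema:
    name: str
    args: Tuple[str, ...]
    ret: str

    @staticmethod
    def parse(s: str) -> 'MiniSchema':
        decl, ret = s.rsplit(' -> ', 1)
        name, args = decl.split('(', 1)
        assert args.endswith(')'), 'expecting closing )'
        r = MiniSchema(name, tuple(a for a in args[:-1].split(', ') if a), ret)
        assert str(r) == s, f'{r} != {s}'   # the round-trip invariant
        return r

    def __str__(self) -> str:
        return f"{self.name}({', '.join(self.args)}) -> {self.ret}"

MiniSchema.parse("add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor")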
- - def schema_order_arguments(self) -> Iterator['Argument']: - return itertools.chain( - self.arguments.flat_positional, - self.arguments.flat_kwarg_only, - self.arguments.out - ) - - @staticmethod - def parse(func: str) -> 'FunctionSchema': - # We should probably get a proper parser here - assert ' -> ' in func, "function schema missing return type (spaces are mandatory)" - last_index = func.rfind(" -> ") - func_decl = func[:last_index] - return_decl = func[last_index + len(" -> "):] - ops, args = func_decl.split('(', 1) - assert args[-1] == ")", "Expecting closing )" - args = args[:-1] - name = OperatorName.parse(ops) - arguments = Arguments.parse(args) - returns = parse_returns(return_decl) - r = FunctionSchema( - name=name, - arguments=arguments, - returns=returns - ) - assert str(r) == func, f'{str(r)} != {func}' - return r - - def __post_init__(self) -> None: - for arg, ret in zip(self.arguments.out, self.returns): - assert arg.annotation == ret.annotation, \ - "Out arguments must have matching return Tensor; furthermore, " \ - "the ith-argument needs to correspond to the ith return" - # Invariant: we expect out arguments to appear as keyword arguments in the schema. - # This means that all mutable returns should be aliased to a keyword argument - # (except for "self", which we explicitly don't treat as an out argument because of its use in methods) - # See Note [is_out_fn] - out_and_self = list(self.arguments.out) + [arg for arg in self.arguments.flat_positional if arg.name == "self"] - mutable_returns = [ret for ret in self.returns if ret.annotation is not None and ret.annotation.is_write] - for ret in mutable_returns: - assert any([ret.annotation == arg.annotation for arg in out_and_self]), \ - "All mutable returns must be aliased either to a keyword argument, or to \"self\". " \ - "Did you forget to mark an out argument as keyword-only?" - if self.arguments.out: - assert len(self.arguments.out) == len(self.returns), \ - "Must return as many arguments as there are out arguments" - if self.name.name.inplace: - # TODO: fixme - if not is_foreach_op(str(self.name)): - assert len(self.returns) == 1 - - def is_out_fn(self) -> bool: - # Note [is_out_fn] - # - # out functions are the variants which take an explicit out= argument - # to populate into. We need to know if a schema corresponds to an - # out function for several reasons: - # - # - They codegen differently in C++ API - # - codegen to at::add_out rather than at::add - # - out argument is moved to front of C++ argument list - # - # out functions are DEFINED to be any function with a keyword-only - # argument that is mutable. In principle, this could lead to a - # false positive if you define a function that mutates a - # kwarg only argument, but this isn't the "true" output of this - # function. A more robust definition that would work in this - # case would also look at: - # - # - The output types. Out functions take in the arguments - # they mutate and then return them again; this is sort - # of "definitionally" what makes something an out function. - # Historically, we DO check this for consistency. - # - Correspondence with pure variant. An out function - # should have a signature equivalent to its pure variant, - # but just with extra kwargs for the output elements. This - # is difficult to actually check for and historically - # we only do this check in tools/ - return bool(self.arguments.out) - - def kind(self) -> SchemaKind: - """ - What kind of schema is this? 
A functional schema is one - that returns a newly allocated output; an inplace schema - modifies the self argument inplace; an out schema writes - the result into an explicitly provided out argument. - """ - is_inplace = self.name.name.inplace - is_out = bool(self.arguments.out) - assert not (is_inplace and is_out) - if is_inplace: - return SchemaKind.inplace - elif is_out: - return SchemaKind.out - else: - return SchemaKind.functional - - def signature(self, *, strip_default: bool = False) -> 'FunctionSchema': - """ - Certain schemas are 'related', in that they are simply - inplace/out/functional versions of the same function. This method - factors these schemas into the "core" functional signature which - is equal across all versions. - - Here is what normalization happens to the schema to convert - it to a signature: - - The overload name is stripped (name is retained, since - it expresses semantic content about what the function does) - - Inplace is set False - - Out arguments are stripped - - Mutability annotations are stripped (this is sound - because you cannot overload on mutability annotation) - - Return names are stripped since they are not overloadable and - some variants have return names but some not - """ - - def strip_ret_annotation(r: Return) -> Return: - return Return( - name=None, - type=r.type, - annotation=None, - ) - - return FunctionSchema( - name=OperatorName( - name=BaseOperatorName( - base=self.name.name.base, - inplace=False, - dunder_method=self.name.name.dunder_method, - ), - overload_name="", # stripped - ), - arguments=self.arguments.signature(strip_default=strip_default), - returns=tuple(map(strip_ret_annotation, self.returns)), - ) - - def __str__(self) -> str: - all_arguments_str = str(self.arguments) - if len(self.returns) == 1: - returns = str(self.returns[0]) # omit parentheses - else: - returns = '(' + ', '.join(map(str, self.returns)) + ')' - return f'{self.name}({all_arguments_str}) -> {returns}' - -# Here is the rest of the data model, described more briefly. - -# Simplified version for what actually shows up in built-ins. -# Look at alias_info.h for expanded syntax. If you need the structure, -# you also need to make this structure recursive so it can be lined -# up with the type components too. For primitives this isn't really -# necessary -@dataclass(frozen=True) -class Annotation: - # Typically only has one element. Not actually a set so - # we can conveniently assume it is canonically ordered - alias_set: Tuple[str, ...] - is_write: bool - alias_set_after: str - - @staticmethod - def parse(ann: str) -> 'Annotation': - # Only handling afterSet == Wildcard for now - becomes_wildcard_index = ann.find(" -> *") - if becomes_wildcard_index != -1: - after_set = "*" - # TODO: im not good enough with regexes to ignore -> * - m = re.match(r'^([a-z])(!?)(!?)$', ann[:becomes_wildcard_index] + ann[becomes_wildcard_index + len(" -> *"):]) - else: - after_set = "" - m = re.match(r'^([a-z])(!?)(!?)$', ann) - assert m is not None, f'unrecognized alias annotation {ann}' - alias_set = (m.group(1),) - is_write = m.group(2) == '!' - r = Annotation(alias_set=alias_set, is_write=is_write, alias_set_after=after_set) - assert str(r) == ann, f'{r} != {ann}' - return r - - def __str__(self) -> str: - alias_set = '|'.join(self.alias_set) - if self.alias_set_after: - alias_set = f'{alias_set}{" -> "}{self.alias_set_after}' - is_write = '!' if self.is_write else '' - return f'{alias_set}{is_write}' - -# The base class for the type system. 
This is also loosely modeled -# off of jit_type.h, but we've simplified the hierarchy to focus -# in on the aspects of the type system that matter for code generation -# (for example, there's no SingleElementType subclass anymore). -# You never actually construct a Type; usually it's going to be one -# of the subclasses. If Python had ADTs this would be one! -@dataclass(frozen=True) -class Type: - @staticmethod - def parse(t: str) -> 'Type': - r = Type._parse(t) - assert str(r) == t, f'{r} != {t}' - return r - - @staticmethod - def _parse(t: str) -> 'Type': - m = re.match(r'^(.+)\?$', t) - if m is not None: - return OptionalType(Type.parse(m.group(1))) - m = re.match(r'^(.+)\[([0-9]+)?\]$', t) - if m is not None: - size = int(m.group(2)) if m.group(2) is not None else None - return ListType(elem=Type.parse(m.group(1)), size=size) - try: - return BaseType(BaseTy[t]) - except KeyError: - raise RuntimeError(f"unrecognized type {t}") - - def __str__(self) -> str: - raise NotImplementedError - - # WARNING: These concepts are not very well-defined. For example, - # is "int?" nullable? How about "int?[]". They are defined - # so we can conveniently generate legacy Declarations.yaml but - # really we should probably just remove these at some point - - def is_tensor_like(self) -> bool: - raise NotImplementedError - - def is_nullable(self) -> bool: - raise NotImplementedError - - def is_list_like(self) -> Optional['ListType']: - raise NotImplementedError - -# Base types are simple, atomic types with no further structure -BaseTy = Enum('BaseTy', ( - 'Generator', - 'ScalarType', - 'Tensor', - 'int', - 'Dimname', - 'float', - 'str', - 'bool', - 'Layout', - 'Device', - 'Scalar', - 'MemoryFormat', - 'QScheme', - 'Storage', - 'Stream', - 'ConstQuantizerPtr', # TODO: rename -)) - -@dataclass(frozen=True) -class BaseType(Type): - name: BaseTy - - def __str__(self) -> str: - return f'{self.name.name}' - - def is_tensor_like(self) -> bool: - return self.name == BaseTy.Tensor - - def is_nullable(self) -> bool: - return False - - def is_list_like(self) -> Optional['ListType']: - return None - -# Optional types may be specified, or may also be validly given None -@dataclass(frozen=True) -class OptionalType(Type): - elem: Type - - def __str__(self) -> str: - return f'{self.elem}?' - - def is_tensor_like(self) -> bool: - return self.elem.is_tensor_like() - - def is_nullable(self) -> bool: - return True - - def is_list_like(self) -> Optional['ListType']: - return self.elem.is_list_like() - -# List types specify that we may have multiples of an element. We -# also support explicit sizes on list types, but these have -# some nontrivial semantics! (However, for C++ API purposes, explicit -# sizes are mostly erased from the type system.) -# -# DANGER WILL ROBINSON: C++ elaboration depends on elem type; e.g., -# int[] elaborates differently than bool[3]! -@dataclass(frozen=True) -class ListType(Type): - elem: Type - size: Optional[int] - - def __str__(self) -> str: - size = f'{self.size}' if self.size else '' - return f'{self.elem}[{size}]' - - def is_tensor_like(self) -> bool: - return self.elem.is_tensor_like() - - def is_nullable(self) -> bool: - return self.elem.is_nullable() - - def is_list_like(self) -> Optional['ListType']: - return self - -@dataclass(frozen=True) -class Argument: - # NB: I didn't put kwarg_only as a boolean field here, unlike - # c10::Argument, so that printing works correctly - - name: str - type: Type - default: Optional[str] - - # The semantics of the annotation field are a little strange. 
- # - # Alias annotations parametrize Tensors (since Tensors are the only things - # that can alias.) This motivates why I write Tensor(a!)? (and not, for - # example, Tensor?(a!)), because the (a!) describes aliasing on the tensor, - # which may be optional (i.e., the alias annotation should bind first to - # Tensor, before the optional postfix annotation). - # - # However, despite being a property of Tensor, we (and c10::Argument) - # store the annotation at the top level of the Argument, rather than - # inside the embedded Tensor type. In the C++ version of this - # class, we then go through great lengths to mimic the type - # structure in the annotation structure so we can correlate - # annotations with types. - # - # Now, it turns out, in all applications in code generation, the - # structure of annotated types is very simple. So we just hard - # code it here. But if we ever do get anything more complex, this - # model will have to change! - annotation: Optional[Annotation] - - @staticmethod - def parse(arg: str) -> 'Argument': - name: str - default: Optional[str] - type_and_annot, name_and_default = arg.rsplit(' ', 1) - if '=' in name_and_default: - name, default = name_and_default.split('=') - else: - name = name_and_default - default = None - # TODO: deduplicate annotation matching with Return - match = re.match(r'Tensor\((.+)\)(.*)', type_and_annot) - annotation: Optional[Annotation] - if match: - # If you update this, make sure the __str__ still works too - assert match.group(2) in ['', '?', '[]'], 'unrecognized alias analysis form with Tensor' - type_s = 'Tensor' + match.group(2) - annotation = Annotation.parse(match.group(1)) - else: - type_s = type_and_annot - annotation = None - type = Type.parse(type_s) - r = Argument( - name=name, - type=type, - default=default, - annotation=annotation, - ) - assert str(r) == arg, f'{str(r)} != {arg}' - return r - - @property - def is_write(self) -> bool: - return self.annotation is not None and self.annotation.is_write - - def __str__(self) -> str: - type = f'{self.type}' - if self.annotation: - assert type in ['Tensor', 'Tensor?', 'Tensor[]'] - type = type.replace('Tensor', f'Tensor({self.annotation})') - if self.name is None: - return type - else: - mb_default = '' - if self.default: - mb_default = f'={self.default}' - return f"{type} {self.name}{mb_default}" - - -@dataclass(frozen=True) -class Return: - name: Optional[str] - type: Type - annotation: Optional[Annotation] - - @staticmethod - def parse(arg: str) -> 'Return': - name: Optional[str] - if ' ' in arg: - type_and_annot, name = arg.rsplit(' ', 1) - else: - type_and_annot = arg - name = None - match = re.match(r'Tensor\((.+)\)(.*)', type_and_annot) - annotation: Optional[Annotation] - if match: - # If you update this, make sure the __str__ still works too - assert match.group(2) in ['', '?', '[]'], 'unrecognized alias analysis form with Tensor' - type_s = 'Tensor' + match.group(2) - annotation = Annotation.parse(match.group(1)) - else: - type_s = type_and_annot - annotation = None - type = Type.parse(type_s) - r = Return( - name=name, - type=type, - annotation=annotation, - ) - assert str(r) == arg, f'{str(r)} != {arg}' - return r - - @property - def is_write(self) -> bool: - return self.annotation is not None and self.annotation.is_write - - def __str__(self) -> str: - type = f'{self.type}' - if self.annotation: - assert type in ['Tensor', 'Tensor?', 'Tensor[]'] - type = type.replace('Tensor', f'Tensor({self.annotation})') - if self.name is None: - return type - else: - return 
f"{type} {self.name}" - - -# Represents the self argument for functions that may be methods -@dataclass(frozen=True) -class SelfArgument: - argument: Argument - -# Bundle of arguments that represent a TensorOptions. This is mostly -# relevant for the public C++ API but we bake it into the core data -# model because other APIs often have to interact with it -@dataclass(frozen=True) -class TensorOptionsArguments: - dtype: Argument - layout: Argument - device: Argument - pin_memory: Argument - - def all(self) -> Sequence[Argument]: - return [self.dtype, self.layout, self.device, self.pin_memory] - -@dataclass(frozen=True) -class Arguments: - # pre_self_positional is usually empty, but is notably non-empty - # for where.self, where the condition argument comes before the - # self argument - pre_self_positional: Tuple[Argument, ...] - self_arg: Optional[SelfArgument] - post_self_positional: Tuple[Argument, ...] - - pre_tensor_options_kwarg_only: Tuple[Argument, ...] - tensor_options: Optional[TensorOptionsArguments] - # post_tensor_options is typically memory format, which should be - # part of tensor options but isn't right now, and is usually - # placed after the tensor options arguments - post_tensor_options_kwarg_only: Tuple[Argument, ...] - - # Unlike in the previous codegen, we have factored out 'out' arguments - # in the canonical representation, removing them from kwarg - # arguments. This choice is justified by numerous downstream - # transformations which treat out arguments specially; additionally, - # you can see that canonicity is not violated! - out: Tuple[Argument, ...] # these are also kwarg-only - - @property - def flat_non_out(self) -> Sequence[Argument]: - ret: List[Argument] = [] - ret.extend(self.flat_positional) - ret.extend(self.flat_kwarg_only) - return ret - - @property - def flat_positional(self) -> Sequence[Argument]: - ret: List[Argument] = [] - ret.extend(self.pre_self_positional) - if self.self_arg is not None: - ret.append(self.self_arg.argument) - ret.extend(self.post_self_positional) - return ret - - # NB: doesn't contain out arguments - @property - def flat_kwarg_only(self) -> Sequence[Argument]: - ret: List[Argument] = [] - ret.extend(self.pre_tensor_options_kwarg_only) - if self.tensor_options is not None: - ret.extend(self.tensor_options.all()) - ret.extend(self.post_tensor_options_kwarg_only) - return ret - - @property - def flat_all(self) -> Sequence[Argument]: - ret: List[Argument] = [] - ret.extend(self.flat_positional) - ret.extend(self.flat_kwarg_only) - ret.extend(self.out) - return ret - - @property - def non_out(self) -> Sequence[Union[Argument, SelfArgument, TensorOptionsArguments]]: - ret: List[Union[Argument, SelfArgument, TensorOptionsArguments]] = [] - ret.extend(self.positional) - ret.extend(self.kwarg_only) - return ret - - @property - def positional(self) -> Sequence[Union[Argument, SelfArgument]]: - ret: List[Union[Argument, SelfArgument]] = [] - ret.extend(self.pre_self_positional) - if self.self_arg is not None: - ret.append(self.self_arg) - ret.extend(self.post_self_positional) - return ret - - @property - def kwarg_only(self) -> Sequence[Union[Argument, TensorOptionsArguments]]: - ret: List[Union[Argument, TensorOptionsArguments]] = [] - ret.extend(self.pre_tensor_options_kwarg_only) - if self.tensor_options is not None: - ret.append(self.tensor_options) - ret.extend(self.post_tensor_options_kwarg_only) - return ret - - @property - def all(self) -> Sequence[Union[Argument, SelfArgument, TensorOptionsArguments]]: - ret: 
List[Union[Argument, SelfArgument, TensorOptionsArguments]] = [] - ret.extend(self.positional) - ret.extend(self.kwarg_only) - ret.extend(self.out) - return ret - - def signature(self, *, strip_default: bool = False) -> 'Arguments': - # dataclasses.replace could be used here, but it is less - # type safe so for now I've opted to type everything out - def strip_arg_annotation(a: Argument) -> Argument: - return Argument( - name=a.name, - type=a.type, - default=a.default if not strip_default else None, - annotation=None, - ) - - return Arguments( - pre_self_positional=tuple(map(strip_arg_annotation, self.pre_self_positional)), - self_arg=SelfArgument( - strip_arg_annotation(self.self_arg.argument) - ) if self.self_arg is not None else None, - post_self_positional=tuple(map(strip_arg_annotation, self.post_self_positional)), - pre_tensor_options_kwarg_only=tuple(map(strip_arg_annotation, self.pre_tensor_options_kwarg_only)), - # NB: tensor_options guaranteed to not have any alias annotations - tensor_options=self.tensor_options, - post_tensor_options_kwarg_only=tuple(map(strip_arg_annotation, self.post_tensor_options_kwarg_only)), - # out arguments are dropped in signature - out=(), - ) - - - @staticmethod - def _preparse(args: str) -> Tuple[List[Argument], List[Argument], List[Argument]]: - positional: List[Argument] = [] - kwarg_only: List[Argument] = [] - out: List[Argument] = [] - arguments_acc = positional - - # TODO: Use a real parser here; this will get bamboozled - # by signatures that contain things like std::array (note the space) - for arg in args.split(', '): - if not arg: - continue - if arg == '*': - assert arguments_acc is positional, "invalid syntax: kwarg-only specifier * can only occur once" - arguments_acc = kwarg_only - continue - parg = Argument.parse(arg) - # Currently, we rely directly on the invariant that there are NO - # kwarg-only mutating arguments. If you want to relax this, - # we will need a more semantic way of matching that takes - # into account return arguments. In that case, you will have - # to manage out computation a level up, in FunctionSchema. See Note - # [is_out_fn] - if parg.annotation is not None and parg.annotation.is_write: - if arguments_acc is positional: - pass # do nothing - elif arguments_acc is kwarg_only: - arguments_acc = out - else: - assert arguments_acc is not out - arguments_acc.append(parg) - - return positional, kwarg_only, out - - @staticmethod - def parse(args: str) -> 'Arguments': - """ - Input: 'int x, int y, int z' - """ - - # We do this in two phases. First we parse into three - # main categories: positional, kwarg_only, out. - # Then, we reparse positional and kwarg_only to separate - # out the self argument and tensor options arguments. 
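As a concrete illustration of the two-phase split described in the comment above (an editor's sketch, not part of the diff; it assumes these classes remain importable as torchgen.model.Arguments, the import path the new tools/jit/gen_unboxing.py later in this diff relies on), the canonical add.out argument list parses as follows:

    from torchgen.model import Arguments

    args = Arguments.parse("Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out")
    # Phase 1 buckets: positional / kwarg-only / out; phase 2 then splits out `self`
    # (no dtype/layout/device/pin_memory run here, so no TensorOptions group is formed).
    assert [a.name for a in args.flat_positional] == ["self", "other"]
    assert [a.name for a in args.flat_kwarg_only] == ["alpha"]
    assert [a.name for a in args.out] == ["out"]
    # The (a!) alias annotation is what routes `out` into the out bucket.
    assert args.out[0].is_write
    assert args.self_arg is not None and args.self_arg.argument.name == "self"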
- - positional, kwarg_only, out = Arguments._preparse(args) - - # Split self argument - self_ix = None - for i, a in enumerate(positional): - if a.name == "self": - self_ix = i - break - pre_self_positional: List[Argument] - self_arg: Optional[SelfArgument] - post_self_positional: List[Argument] - if self_ix is not None: - pre_self_positional = positional[:self_ix] - self_arg = SelfArgument(positional[self_ix]) - post_self_positional = positional[self_ix + 1:] - else: - pre_self_positional = [] - self_arg = None - post_self_positional = positional - - # Group tensor options arguments - pre_tensor_options_kwarg_only: List[Argument] = [] - tensor_options: Optional[TensorOptionsArguments] = None - post_tensor_options_kwarg_only: List[Argument] = [] - kwarg_only_acc = pre_tensor_options_kwarg_only - - def pred(name: str, ty: Type) -> Callable[[Argument], bool]: - return lambda a: a.name == name and a.type in [ty, OptionalType(ty)] - predicates = [ # order matters - pred('dtype', Type.parse('ScalarType')), - pred('layout', Type.parse('Layout')), - pred('device', Type.parse('Device')), - pred('pin_memory', Type.parse('bool')), - ] - - i = 0 - while i < len(kwarg_only): - # If there is enough space... - if i <= len(kwarg_only) - len(predicates): - # And the next len(predicates) arguments look like TensorOptions arguments - if all(p(a) for p, a in zip(predicates, kwarg_only[i : i + len(predicates)])): - assert kwarg_only_acc is pre_tensor_options_kwarg_only - # Group them together as one argument - tensor_options = TensorOptionsArguments( - dtype=kwarg_only[i], - layout=kwarg_only[i + 1], - device=kwarg_only[i + 2], - pin_memory=kwarg_only[i + 3], - ) - i += len(predicates) - kwarg_only_acc = post_tensor_options_kwarg_only - continue - kwarg_only_acc.append(kwarg_only[i]) - i += 1 - - return Arguments( - pre_self_positional=tuple(pre_self_positional), - self_arg=self_arg, - post_self_positional=tuple(post_self_positional), - pre_tensor_options_kwarg_only=tuple(pre_tensor_options_kwarg_only), - tensor_options=tensor_options, - post_tensor_options_kwarg_only=tuple(post_tensor_options_kwarg_only), - out=tuple(out), - ) - - - def __str__(self) -> str: - all_arguments: List[str] = [] - all_arguments.extend(map(str, self.flat_positional)) - if self.flat_kwarg_only or self.out: - all_arguments.append('*') - all_arguments.extend(map(str, self.flat_kwarg_only)) - all_arguments.extend(map(str, self.out)) - return ', '.join(all_arguments) - - def __post_init__(self) -> None: - # TODO: These invariants are weirdly asymmetric? - # TODO: Fancier types? - if self.self_arg is None: - assert not self.pre_self_positional - if self.tensor_options is None: - assert not self.post_tensor_options_kwarg_only - - -# Names that validly are __iXXX__ indicating inplace operations. -# Taken from https://www.python.org/dev/peps/pep-0203/#new-methods -# NB: PyTorch hasn't actually implemented all of these -AUGMENTED_ASSIGNMENT_NAMES = ['add', 'sub', 'mul', 'div', 'mod', 'pow', 'lshift', 'rshift', 'and', 'xor', 'or'] - -# A BaseOperatorName is what we think of the operator name, without -# the overload name. 
Unusually, we don't represent this as just a -# string; instead, we directly represent a few important semantic -# bits of information we derive from the string: namely whether -# or not it's inplace (add_) and whether or not it's a double-underscore -# method (__add__) -@dataclass(frozen=True) -class BaseOperatorName: - base: str - inplace: bool - dunder_method: bool - - @staticmethod - def parse(op: str) -> 'BaseOperatorName': - assert op != '' - assert not op.endswith('_out'), \ - "_out suffix is reserved and not permitted for operator names; " \ - "did you mean to specify an out overload name instead?" - m = re.match(r'^__([^_]+)__$', op) - if m is not None: - dunder_method = True - base = m.group(1) - if any(base == f'i{n}' for n in AUGMENTED_ASSIGNMENT_NAMES): - inplace = True - base = base[1:] - else: - inplace = False - # temporary, this is not intrinsically true but - # has been historically true for dunder methods - # we support (but, if we ever got, say, __int__, this would - # be wrong!) - assert base[0] != 'i' - else: - dunder_method = False - base = op - if base[-1] == '_': - inplace = True - base = base[:-1] - else: - inplace = False - r = BaseOperatorName(base=base, inplace=inplace, dunder_method=dunder_method) - assert str(r) == op, f'{str(r)} != {op}' - return r - - def __str__(self) -> str: - if self.dunder_method: - i = 'i' if self.inplace else '' - return f'__{i}{self.base}__' - else: - i = '_' if self.inplace else '' - return f'{self.base}{i}' - -# Operator name is the base operator name along with the (typically not -# user visible) overload string. -@dataclass(frozen=True) -class OperatorName: - name: BaseOperatorName - overload_name: str - - @staticmethod - def parse(op_name: str) -> 'OperatorName': - if '.' in op_name: - name, overload_name = op_name.split('.', 1) - else: - name = op_name - overload_name = '' - r = OperatorName( - name=BaseOperatorName.parse(name), - overload_name=overload_name - ) - assert str(r) == op_name, f'{str(r)} != {op_name}' - return r - - def __str__(self) -> str: - if self.overload_name: - return f"{self.name}.{self.overload_name}" - else: - return f"{self.name}" - - # NB: This must be synchronized with the naming scheme in - # aten/src/ATen/templates/Operators.h - # Given a function schema "aten::op.overload(...)", - # If there is no overload name, this returns f"{op}" - # If there is an overload name, this returns f"{op}_{overload}" - def unambiguous_name(self) -> str: - if self.overload_name: - return f"{self.name}_{self.overload_name}" - else: - return f"{self.name}" - - def remove_inplace(self) -> 'OperatorName': - return OperatorName( - name=BaseOperatorName(base=self.name.base, inplace=False, dunder_method=self.name.dunder_method), - overload_name=self.overload_name - ) - - -def gets_generated_out_inplace_wrapper(f: NativeFunction, g: NativeFunctionsGroup, b: BackendIndex) -> bool: - return f.func.kind() is not SchemaKind.functional and \ - not b.has_kernel(f) and \ - b.has_kernel(g.functional) - -# Helper functions for parsing argument lists (both inputs and returns) - -def parse_returns(return_decl: str) -> Tuple[Return, ...]: - """ - Input: '()' - Output: [] - """ - if return_decl == '()': - return () - if return_decl[0] == '(' and return_decl[-1] == ')': - return_decl = return_decl[1:-1] - return tuple(Return.parse(arg) for arg in return_decl.split(', ')) - - -# A Precompute instance consists of a map from kernel argument name -# to the list of Argument instances that should replace that -# kernel argument in the impl 
function. -@dataclass(frozen=True) -class Precompute: - # A map from kernel argument name -> a list of precomputed - # elements that replaces/supersedes it. - replace: Dict[str, List[Argument]] - # List of precomputed args added without replacement - add: List[Argument] - - @staticmethod - def parse(src: object) -> 'Precompute': - assert isinstance(src, list) - - # src is a list of strings of the format: - # {kernel param name} -> {replacement decl}[, {replacement decl}, ...] - # [{add decl}[, {add decl}, ...]] - # The last line is optional and contains the precomputed parameters that are - # added without replacement. - # The other lines are parsed to get the names of which precomputed elements - # should replace which kernel arguments. - add_args = [] - if ' -> ' not in src[-1]: - add_list = src[-1].split(',') - add_args = [Argument.parse(name.strip()) for name in add_list] - src = src[:-1] - - replace = {} - for raw_replace_item in src: - assert isinstance(raw_replace_item, str) - assert ' -> ' in raw_replace_item, 'precomputed parameters without replacement' \ - ' are allowed only in the last line' - - arg, with_list_raw = raw_replace_item.split(' -> ') - with_list = with_list_raw.split(',') - with_list_args = [Argument.parse(name.strip()) for name in with_list] - replace[arg] = with_list_args - - r = Precompute(replace=replace, add=add_args) - assert r.to_list() == src, 'r.to_list() != src' - return r - - def to_list(self) -> List[str]: - replace_list = [] - for kernel_param, replacement_params in self.replace.items(): - replacements = ', '.join(str(param) for param in replacement_params) - replace_list.append(f'{kernel_param} -> {replacements}') - - return replace_list diff --git a/tools/codegen/operator_versions/gen_mobile_upgraders_constant.py b/tools/codegen/operator_versions/gen_mobile_upgraders_constant.py deleted file mode 100644 index 2adf6e793eeb..000000000000 --- a/tools/codegen/operator_versions/gen_mobile_upgraders_constant.py +++ /dev/null @@ -1,7 +0,0 @@ -MOBILE_UPGRADERS_HEADER_DESCRIPTION = """/** - * @generated - * This is an auto-generated file. Please do not modify it by hand. - * To re-generate, please run: - * cd ~/pytorch && python torch/csrc/jit/mobile/upgrader_mobile.cpp - */ -""" diff --git a/tools/codegen/selective_build/selector.py b/tools/codegen/selective_build/selector.py deleted file mode 100644 index b92a57958675..000000000000 --- a/tools/codegen/selective_build/selector.py +++ /dev/null @@ -1,270 +0,0 @@ -from typing import Dict, Set, Optional, Tuple, List -import yaml - -from dataclasses import dataclass - -from tools.codegen.model import NativeFunction -from tools.codegen.selective_build.operator import ( - SelectiveBuildOperator, merge_debug_info, merge_operator_dicts, - strip_operator_overload_name) - -# A SelectiveBuilder holds information extracted from the selective build -# YAML specification. -# -# It includes information about the build's selectivity, the debug_info -# associated with this selective build (opaque string), and the set of -# operators that should be included in the build. -# -@dataclass(frozen=True) -class SelectiveBuilder: - - # If true, then the build is not selective, and includes all - # operators. - include_all_operators: bool - - # Debug Information at the selective/custom build level. - _debug_info: Optional[Tuple[str, ...]] - - # A dictionary of operator -> operator metadata. - operators: Dict[str, SelectiveBuildOperator] - - # A dictionary of selected kernel tags and dtypes. 
Typically a - # PyTorch Operator Kernel (function) may have many code paths - # that are specialized for many many Tensor dtypes, so it's not - # one per kernel function, but there could be many per kernel - # function. The tag isn't a kernel function name, but some fragment - # of the kernel function implementation itself. - kernel_metadata: Dict[str, List[str]] - - # A set of all the custom torch bind classes used by the selected models - # Stored as a set internally to remove duplicates proactively, but written - # as a list to yamls - custom_classes: Set[str] - - # A set of all the build features used by the selected models - # Stored as a set internally to remove duplicates proactively, but written - # as a list to yamls - build_features: Set[str] - - # If true, then fragments for all dtypes for all kernel functions - # are included as well as all custom classes. This is typically set when any one of the - # operator lists is generated from a mechanism other than - # tracing based selective build. - include_all_non_op_selectives: bool - - @staticmethod - def get_nop_selector() -> 'SelectiveBuilder': - return SelectiveBuilder.from_yaml_dict({'include_all_operators': True}) - - @staticmethod - def from_yaml_dict(data: Dict[str, object]) -> 'SelectiveBuilder': - valid_top_level_keys = { - 'include_all_non_op_selectives', - 'include_all_operators', - 'debug_info', - 'operators', - 'kernel_metadata', - 'custom_classes', - 'build_features', - } - top_level_keys = set(data.keys()) - if len(top_level_keys - valid_top_level_keys) > 0: - raise Exception("Got unexpected top level keys: {}".format( - ",".join(top_level_keys - valid_top_level_keys), - )) - include_all_operators = data.get('include_all_operators', False) - assert isinstance(include_all_operators, bool) - - debug_info = None - if 'debug_info' in data: - di_list = data['debug_info'] - assert isinstance(di_list, list) - - debug_info = tuple(map(lambda x: str(x), di_list)) - - operators = {} - operators_dict = data.get('operators', {}) - assert isinstance(operators_dict, dict) - - for (k, v) in operators_dict.items(): - operators[k] = SelectiveBuildOperator.from_yaml_dict(k, v) - - kernel_metadata = {} - kernel_metadata_dict = data.get('kernel_metadata', {}) - assert isinstance(kernel_metadata_dict, dict) - - for (k, v) in kernel_metadata_dict.items(): - kernel_metadata[str(k)] = list(map(lambda dtype: str(dtype), v)) - - custom_classes = data.get('custom_classes', []) - custom_classes = set(custom_classes) # type: ignore[arg-type] - - build_features = data.get('build_features', []) - build_features = set(build_features) # type: ignore[arg-type] - - include_all_non_op_selectives = data.get('include_all_non_op_selectives', False) - assert isinstance(include_all_non_op_selectives, bool) - - return SelectiveBuilder( - include_all_operators, - debug_info, - operators, - kernel_metadata, - custom_classes, # type: ignore[arg-type] - build_features, # type: ignore[arg-type] - include_all_non_op_selectives, - ) - - @staticmethod - def from_yaml_str(config_contents: str) -> 'SelectiveBuilder': - contents = yaml.safe_load(config_contents) - return SelectiveBuilder.from_yaml_dict(contents) - - @staticmethod - def from_yaml_path(config_path: str) -> 'SelectiveBuilder': - with open(config_path, 'r') as f: - contents = yaml.safe_load(f) - return SelectiveBuilder.from_yaml_dict(contents) - - @staticmethod - def from_legacy_op_registration_allow_list( - allow_list: Set[str], - is_root_operator: bool, - is_used_for_training: bool) -> 
'SelectiveBuilder': - operators = {} - for op in allow_list: - operators[op] = { - 'name': op, - 'is_root_operator': is_root_operator, - 'is_used_for_training': is_used_for_training, - 'include_all_overloads': True, - } - return SelectiveBuilder.from_yaml_dict({ - 'operators': operators, - 'include_all_non_op_selectives': True, - }) - - def is_operator_selected(self, name: str) -> bool: - if self.include_all_operators: - return True - - if name in self.operators: - return True - name = strip_operator_overload_name(name) - return name in self.operators and self.operators[name].include_all_overloads - - def is_native_function_selected(self, func: NativeFunction) -> bool: - op_name = op_name_from_native_function(func) - return self.is_operator_selected(op_name) - - def is_operator_selected_for_training(self, name: str) -> bool: - if not self.is_operator_selected(name): - return False - if self.include_all_operators: - return True - - not_training_op = SelectiveBuildOperator( - name='', - is_root_operator=False, - is_used_for_training=False, - include_all_overloads=False, - _debug_info=None, - ) - op = not_training_op - if name in self.operators: - op = self.operators[name] - - name = strip_operator_overload_name(name) - base_op = not_training_op - if name in self.operators: - base_op = self.operators[name] - - return ( - op.is_used_for_training or - (base_op.include_all_overloads and base_op.is_used_for_training) - ) - - def is_native_function_selected_for_training(self, func: NativeFunction) -> bool: - op_name = op_name_from_native_function(func) - return self.is_operator_selected_for_training(op_name) - - def is_root_operator(self, name: str) -> bool: - if not self.is_operator_selected(name): - return False - if self.include_all_operators: - return True - - if name in self.operators: - op: SelectiveBuildOperator = self.operators[name] - return op.is_root_operator - name = strip_operator_overload_name(name) - if name not in self.operators: - return False - base_op: SelectiveBuildOperator = self.operators[name] - return base_op.include_all_overloads and base_op.is_root_operator - - def is_kernel_dtype_selected(self, kernel_tag: str, dtype: str) -> bool: - if self.include_all_operators or self.include_all_non_op_selectives: - return True - - return kernel_tag in self.kernel_metadata and dtype in self.kernel_metadata[kernel_tag] - - def to_dict(self) -> Dict[str, object]: - ret: Dict[str, object] = { - 'include_all_non_op_selectives': self.include_all_non_op_selectives, - 'include_all_operators': self.include_all_operators, - } - operators = {} - for (op_name, op) in self.operators.items(): - operators[op_name] = op.to_dict() - ret['operators'] = operators - - if self._debug_info is not None: - ret['debug_info'] = sorted(self._debug_info) - - ret['kernel_metadata'] = {k: sorted(list(v)) for (k, v) in self.kernel_metadata.items()} - - ret['custom_classes'] = sorted(self.custom_classes) - - ret['build_features'] = sorted(self.build_features) - - return ret - - -def merge_kernel_metadata( - lhs: Dict[str, List[str]], - rhs: Dict[str, List[str]], -) -> Dict[str, List[str]]: - kernel_metadata: Dict[str, List[str]] = {} - for (tag_name, dtypes) in list(lhs.items()) + list(rhs.items()): - dtypes_copy = set(dtypes) - if tag_name in kernel_metadata: - dtypes_copy |= set(kernel_metadata[tag_name]) - - kernel_metadata[tag_name] = list(dtypes_copy) - - return kernel_metadata - -def combine_selective_builders(lhs: SelectiveBuilder, rhs: SelectiveBuilder) -> SelectiveBuilder: - include_all_operators = 
lhs.include_all_operators or rhs.include_all_operators - debug_info = merge_debug_info(lhs._debug_info, rhs._debug_info) - operators = merge_operator_dicts(lhs.operators, rhs.operators) - kernel_metadata = merge_kernel_metadata(lhs.kernel_metadata, rhs.kernel_metadata) - include_all_non_op_selectives = lhs.include_all_non_op_selectives or rhs.include_all_non_op_selectives - custom_classes = lhs.custom_classes.union(rhs.custom_classes) - build_features = lhs.build_features.union(rhs.build_features) - return SelectiveBuilder( - include_all_operators, - debug_info, - operators, - kernel_metadata, - custom_classes, - build_features, - include_all_non_op_selectives, - ) - - -def op_name_from_native_function(f: NativeFunction) -> str: - # This was originally read from the 'operator_name_with_overload' field in the - # declaration dict, which was the part before the first '(' in 'schema_string'. - return f'aten::{f.func.name}' diff --git a/tools/codegen/utils.py b/tools/codegen/utils.py deleted file mode 100644 index 48373a0db03c..000000000000 --- a/tools/codegen/utils.py +++ /dev/null @@ -1,237 +0,0 @@ -import contextlib -import functools -import hashlib -import os -import re -import textwrap -from typing import Tuple, List, Iterable, Iterator, Callable, Sequence, TypeVar, Optional, Dict, Any, Union, Set, NoReturn -from enum import Enum - -from tools.codegen.code_template import CodeTemplate - -# Safely load fast C Yaml loader/dumper if they are available -try: - from yaml import CSafeLoader as Loader -except ImportError: - from yaml import SafeLoader as Loader # type: ignore[misc] - -try: - from yaml import CSafeDumper as Dumper -except ImportError: - from yaml import SafeDumper as Dumper # type: ignore[misc] -YamlDumper = Dumper - -# A custom loader for YAML that errors on duplicate keys. -# This doesn't happen by default: see https://github.com/yaml/pyyaml/issues/165 -class YamlLoader(Loader): - def construct_mapping(self, node, deep=False): # type: ignore[no-untyped-def] - mapping = [] - for key_node, value_node in node.value: - key = self.construct_object(key_node, deep=deep) # type: ignore[no-untyped-call] - assert key not in mapping, f"Found a duplicate key in the yaml. key={key}, line={node.start_mark.line}" - mapping.append(key) - mapping = super().construct_mapping(node, deep=deep) # type: ignore[no-untyped-call] - return mapping - -# Many of these functions share logic for defining both the definition -# and declaration (for example, the function signature is the same), so -# we organize them into one function that takes a Target to say which -# code we want. -# -# This is an OPEN enum (we may add more cases to it in the future), so be sure -# to explicitly specify with Union[Literal[Target.XXX]] what targets are valid -# for your use. -Target = Enum('Target', ( - # top level namespace (not including at) - 'DEFINITION', - 'DECLARATION', - # TORCH_LIBRARY(...) { ... } - 'REGISTRATION', - # namespace { ... } - 'ANONYMOUS_DEFINITION', - # namespace cpu { ... } - 'NAMESPACED_DEFINITION', - 'NAMESPACED_DECLARATION', -)) - -# Matches "foo" in "foo, bar" but not "foobar". 
Used to search for the -# occurrence of a parameter in the derivative formula -IDENT_REGEX = r'(^|\W){}($|\W)' - -# TODO: Use a real parser here; this will get bamboozled -def split_name_params(schema: str) -> Tuple[str, List[str]]: - m = re.match(r'(\w+)(\.\w+)?\((.*)\)', schema) - if m is None: - raise RuntimeError(f'Unsupported function schema: {schema}') - name, _, params = m.groups() - return name, params.split(', ') - -T = TypeVar('T') -S = TypeVar('S') - -# These two functions purposely return generators in analogy to map() -# so that you don't mix up when you need to list() them - -# Map over function that may return None; omit Nones from output sequence -def mapMaybe(func: Callable[[T], Optional[S]], xs: Iterable[T]) -> Iterator[S]: - for x in xs: - r = func(x) - if r is not None: - yield r - -# Map over function that returns sequences and cat them all together -def concatMap(func: Callable[[T], Sequence[S]], xs: Iterable[T]) -> Iterator[S]: - for x in xs: - for r in func(x): - yield r - -# Conveniently add error context to exceptions raised. Lets us -# easily say that an error occurred while processing a specific -# context. -@contextlib.contextmanager -def context(msg_fn: Callable[[], str]) -> Iterator[None]: - try: - yield - except Exception as e: - # TODO: this does the wrong thing with KeyError - msg = msg_fn() - msg = textwrap.indent(msg, ' ') - msg = f'{e.args[0]}\n{msg}' if e.args else msg - e.args = (msg,) + e.args[1:] - raise - -# A little trick from https://github.com/python/mypy/issues/6366 -# for getting mypy to do exhaustiveness checking -# TODO: put this somewhere else, maybe -def assert_never(x: NoReturn) -> NoReturn: - raise AssertionError("Unhandled type: {}".format(type(x).__name__)) - -@functools.lru_cache(maxsize=None) -def _read_template(template_fn: str) -> CodeTemplate: - return CodeTemplate.from_file(template_fn) - - -# String hash that's stable across different executions, unlike builtin hash -def string_stable_hash(s: str) -> int: - sha1 = hashlib.sha1(s.encode('latin1')).digest() - return int.from_bytes(sha1, byteorder='little') - -# A small abstraction for writing out generated files and keeping track -# of what files have been written (so you can write out a list of output -# files) -class FileManager: - install_dir: str - template_dir: str - dry_run: bool - filenames: Set[str] - - def __init__(self, install_dir: str, template_dir: str, dry_run: bool) -> None: - self.install_dir = install_dir - self.template_dir = template_dir - self.filenames = set() - self.dry_run = dry_run - - def _write_if_changed(self, filename: str, contents: str) -> None: - old_contents: Optional[str] - try: - with open(filename, 'r') as f: - old_contents = f.read() - except IOError: - old_contents = None - if contents != old_contents: - # Create output directory if it doesn't exist - os.makedirs(os.path.dirname(filename), exist_ok=True) - with open(filename, 'w') as f: - f.write(contents) - - def write_with_template(self, filename: str, template_fn: str, - env_callable: Callable[[], Union[str, Dict[str, Any]]]) -> None: - filename = '{}/{}'.format(self.install_dir, filename) - assert filename not in self.filenames, "duplicate file write {filename}" - self.filenames.add(filename) - if not self.dry_run: - env = env_callable() - if isinstance(env, dict): - # TODO: Update the comment reference to the correct location - if 'generated_comment' not in env: - comment = "@" + "generated by tools/codegen/gen.py" - comment += " from {}".format(os.path.basename(template_fn)) - 
env['generated_comment'] = comment - template = _read_template(os.path.join(self.template_dir, template_fn)) - self._write_if_changed(filename, template.substitute(env)) - elif isinstance(env, str): - self._write_if_changed(filename, env) - else: - assert_never(env) - - - def write(self, filename: str, env_callable: Callable[[], Union[str, Union[str, Dict[str, Any]]]]) -> None: - self.write_with_template(filename, filename, env_callable) - - def write_sharded( - self, - filename: str, - items: Iterable[T], - *, - key_fn: Callable[[T], str], - env_callable: Callable[[T], Dict[str, List[str]]], - num_shards: int, - base_env: Optional[Dict[str, Any]] = None, - sharded_keys: Set[str] - ) -> None: - - everything: Dict[str, Any] = {'shard_id': 'Everything'} - shards: List[Dict[str, Any]] = [{'shard_id': f'_{i}'} for i in range(num_shards)] - all_shards = [everything] + shards - - if base_env is not None: - for shard in all_shards: - shard.update(base_env) - - for key in sharded_keys: - for shard in all_shards: - if key in shard: - assert isinstance(shard[key], list), "sharded keys in base_env must be a list" - shard[key] = shard[key].copy() - else: - shard[key] = [] - - def merge_env(into: Dict[str, List[str]], from_: Dict[str, List[str]]) -> None: - for k, v in from_.items(): - assert k in sharded_keys, f"undeclared sharded key {k}" - into[k] += v - - if self.dry_run: - # Dry runs don't write any templates, so incomplete environments are fine - items = () - - for item in items: - key = key_fn(item) - sid = string_stable_hash(key) % num_shards - env = env_callable(item) - - merge_env(shards[sid], env) - merge_env(everything, env) - - dot_pos = filename.rfind('.') - if dot_pos == -1: - dot_pos = len(filename) - base_filename = filename[:dot_pos] - extension = filename[dot_pos:] - - for shard in all_shards: - shard_id = shard['shard_id'] - self.write_with_template(f"{base_filename}{shard_id}{extension}", - filename, - lambda: shard) - - # filenames is used to track compiled files, but FooEverything.cpp isn't meant to be compiled - self.filenames.discard( - f"{self.install_dir}/{base_filename}Everything{extension}") - - def write_outputs(self, variable_name: str, filename: str) -> None: - """Write a file containing the list of all outputs which are - generated by this script.""" - content = 'set({}\n {})'.format( - variable_name, '\n '.join('"' + name + '"' for name in sorted(self.filenames))) - self._write_if_changed(filename, content) diff --git a/tools/coverage_plugins_package/setup.py b/tools/coverage_plugins_package/setup.py index c93f6129258d..012506945504 100644 --- a/tools/coverage_plugins_package/setup.py +++ b/tools/coverage_plugins_package/setup.py @@ -6,8 +6,8 @@ setuptools.setup( name="coverage-plugins", version="0.0.1", - author='PyTorch Team', - author_email='packages@pytorch.org', + author="PyTorch Team", + author_email="packages@pytorch.org", description="plug-in to coverage for PyTorch JIT", long_description=long_description, long_description_content_type="text/markdown", diff --git a/tools/coverage_plugins_package/src/coverage_plugins/jit_plugin.py b/tools/coverage_plugins_package/src/coverage_plugins/jit_plugin.py index 8dcd31397d2a..a64670b6ada3 100644 --- a/tools/coverage_plugins_package/src/coverage_plugins/jit_plugin.py +++ b/tools/coverage_plugins_package/src/coverage_plugins/jit_plugin.py @@ -1,4 +1,4 @@ -''' +""" This coverage plug-in attempts to cover JIT'd functions and methods that were previously missed in code coverage. 
Any function and method that was passed through/decorated with torch.jit.script or torch.jit.script_method should now be marked covered when coverage is run with this plug-in. @@ -6,39 +6,54 @@ DISCLAIMER: note that this will mark the entire JIT'd function/method as covered without seeking proof that the compiled code has been executed. This means that even if the code chunk is merely compiled and not run, it will get marked as covered. -''' +""" from coverage import CoveragePlugin, CoverageData # type: ignore[import] -from inspect import ismodule, isclass, ismethod, isfunction, iscode, getsourcefile, getsourcelines +from inspect import ( + ismodule, + isclass, + ismethod, + isfunction, + iscode, + getsourcefile, + getsourcelines, +) from time import time from typing import Any # All coverage stats resulting from this plug-in will be in a separate .coverage file that should be merged later with # `coverage combine`. The convention seems to be .coverage.dotted.suffix based on the following link: # https://coverage.readthedocs.io/en/coverage-5.5/cmd.html#combining-data-files-coverage-combine -cov_data = CoverageData(basename=f'.coverage.jit.{time()}') +cov_data = CoverageData(basename=f".coverage.jit.{time()}") def is_not_builtin_class(obj: Any) -> bool: - return isclass(obj) and not type(obj).__module__ == 'builtins' + return isclass(obj) and not type(obj).__module__ == "builtins" class JitPlugin(CoveragePlugin): # type: ignore[misc, no-any-unimported] - ''' + """ dynamic_context is an overridden function that gives us access to every frame run during the coverage process. We look for when the function being run is `should_drop`, as all functions that get passed into `should_drop` will be compiled and thus should be marked as covered. - ''' + """ + def dynamic_context(self, frame: Any) -> None: - if frame.f_code.co_name == 'should_drop': - obj = frame.f_locals['fn'] + if frame.f_code.co_name == "should_drop": + obj = frame.f_locals["fn"] # The many conditions in the if statement below are based on the accepted arguments to getsourcefile. Based # on its documentation (https://docs.python.org/3/library/inspect.html#inspect.getsourcefile), the argument # must be a module, class, method, function, traceback, frame, or code object AND it cannot be a built-in # module, class, or function. # Currently, we DO NOT include tracebacks or frames as they should not be JIT'd, and we have not checked for # built-in modules or functions as those do not seem to be JIT'd either. 
- if is_not_builtin_class(obj) or ismodule(obj) or ismethod(obj) or isfunction(obj) or iscode(obj): + if ( + is_not_builtin_class(obj) + or ismodule(obj) + or ismethod(obj) + or isfunction(obj) + or iscode(obj) + ): filename = getsourcefile(obj) # We don't want to report for filename = None if filename: @@ -51,9 +66,14 @@ def dynamic_context(self, frame: Any) -> None: except OSError: pass else: - line_data = {filename: range(starting_lineno, starting_lineno + len(sourcelines))} + line_data = { + filename: range( + starting_lineno, starting_lineno + len(sourcelines) + ) + } cov_data.add_lines(line_data) super().dynamic_context(frame) + def coverage_init(reg: Any, options: Any) -> None: reg.add_dynamic_context(JitPlugin()) diff --git a/tools/download_mnist.py b/tools/download_mnist.py index dfb0f95171ee..80894ad2bdbb 100644 --- a/tools/download_mnist.py +++ b/tools/download_mnist.py @@ -6,15 +6,15 @@ import sys MIRRORS = [ - 'http://yann.lecun.com/exdb/mnist/', - 'https://ossci-datasets.s3.amazonaws.com/mnist/', + "http://yann.lecun.com/exdb/mnist/", + "https://ossci-datasets.s3.amazonaws.com/mnist/", ] RESOURCES = [ - 'train-images-idx3-ubyte.gz', - 'train-labels-idx1-ubyte.gz', - 't10k-images-idx3-ubyte.gz', - 't10k-labels-idx1-ubyte.gz', + "train-images-idx3-ubyte.gz", + "train-labels-idx1-ubyte.gz", + "t10k-images-idx3-ubyte.gz", + "t10k-labels-idx1-ubyte.gz", ] @@ -25,23 +25,23 @@ def report_download_progress( ) -> None: if file_size != -1: percent = min(1, (chunk_number * chunk_size) / file_size) - bar = '#' * int(64 * percent) - sys.stdout.write('\r0% |{:<64}| {}%'.format(bar, int(percent * 100))) + bar = "#" * int(64 * percent) + sys.stdout.write("\r0% |{:<64}| {}%".format(bar, int(percent * 100))) def download(destination_path: str, resource: str, quiet: bool) -> None: if os.path.exists(destination_path): if not quiet: - print('{} already exists, skipping ...'.format(destination_path)) + print("{} already exists, skipping ...".format(destination_path)) else: for mirror in MIRRORS: url = mirror + resource - print('Downloading {} ...'.format(url)) + print("Downloading {} ...".format(url)) try: hook = None if quiet else report_download_progress urlretrieve(url, destination_path, reporthook=hook) except (URLError, ConnectionError) as e: - print('Failed to download (trying next):\n{}'.format(e)) + print("Failed to download (trying next):\n{}".format(e)) continue finally: if not quiet: @@ -49,32 +49,32 @@ def download(destination_path: str, resource: str, quiet: bool) -> None: print() break else: - raise RuntimeError('Error downloading resource!') + raise RuntimeError("Error downloading resource!") def unzip(zipped_path: str, quiet: bool) -> None: unzipped_path = os.path.splitext(zipped_path)[0] if os.path.exists(unzipped_path): if not quiet: - print('{} already exists, skipping ... '.format(unzipped_path)) + print("{} already exists, skipping ... 
".format(unzipped_path)) return - with gzip.open(zipped_path, 'rb') as zipped_file: - with open(unzipped_path, 'wb') as unzipped_file: + with gzip.open(zipped_path, "rb") as zipped_file: + with open(unzipped_path, "wb") as unzipped_file: unzipped_file.write(zipped_file.read()) if not quiet: - print('Unzipped {} ...'.format(zipped_path)) + print("Unzipped {} ...".format(zipped_path)) def main() -> None: parser = argparse.ArgumentParser( - description='Download the MNIST dataset from the internet') + description="Download the MNIST dataset from the internet" + ) parser.add_argument( - '-d', '--destination', default='.', help='Destination directory') + "-d", "--destination", default=".", help="Destination directory" + ) parser.add_argument( - '-q', - '--quiet', - action='store_true', - help="Don't report about progress") + "-q", "--quiet", action="store_true", help="Don't report about progress" + ) options = parser.parse_args() if not os.path.exists(options.destination): @@ -86,8 +86,8 @@ def main() -> None: download(path, resource, options.quiet) unzip(path, options.quiet) except KeyboardInterrupt: - print('Interrupted') + print("Interrupted") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/tools/extract_scripts.py b/tools/extract_scripts.py index fd90b1b9f0e5..7a9a29decc5a 100755 --- a/tools/extract_scripts.py +++ b/tools/extract_scripts.py @@ -18,82 +18,85 @@ class Script(TypedDict): def extract(step: Step) -> Optional[Script]: - run = step.get('run') + run = step.get("run") # https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#using-a-specific-shell - shell = step.get('shell', 'bash') + shell = step.get("shell", "bash") extension = { - 'bash': '.sh', - 'pwsh': '.ps1', - 'python': '.py', - 'sh': '.sh', - 'cmd': '.cmd', - 'powershell': '.ps1', + "bash": ".sh", + "pwsh": ".ps1", + "python": ".py", + "sh": ".sh", + "cmd": ".cmd", + "powershell": ".ps1", }.get(shell) - is_gh_script = step.get('uses', '').startswith('actions/github-script@') - gh_script = step.get('with', {}).get('script') + is_gh_script = step.get("uses", "").startswith("actions/github-script@") + gh_script = step.get("with", {}).get("script") if run is not None and extension is not None: script = { - 'bash': f'#!/usr/bin/env bash\nset -eo pipefail\n{run}', - 'sh': f'#!/usr/bin/env sh\nset -e\n{run}', + "bash": f"#!/usr/bin/env bash\nset -eo pipefail\n{run}", + "sh": f"#!/usr/bin/env sh\nset -e\n{run}", }.get(shell, run) - return {'extension': extension, 'script': script} + return {"extension": extension, "script": script} elif is_gh_script and gh_script is not None: - return {'extension': '.js', 'script': gh_script} + return {"extension": ".js", "script": gh_script} else: return None def main() -> None: parser = argparse.ArgumentParser() - parser.add_argument('--out', required=True) + parser.add_argument("--out", required=True) args = parser.parse_args() out = Path(args.out) if out.exists(): - sys.exit(f'{out} already exists; aborting to avoid overwriting') + sys.exit(f"{out} already exists; aborting to avoid overwriting") gha_expressions_found = False - for p in Path('.github/workflows').iterdir(): - with open(p) as f: + for p in Path(".github/workflows").iterdir(): + with open(p, "rb") as f: workflow = yaml.safe_load(f) - for job_name, job in workflow['jobs'].items(): + for job_name, job in workflow["jobs"].items(): job_dir = out / p / job_name - steps = job['steps'] + if "steps" not in job: + continue + steps = job["steps"] index_chars = len(str(len(steps) - 1)) for 
i, step in enumerate(steps, start=1): extracted = extract(step) if extracted: - script = extracted['script'] - step_name = step.get('name', '') - if '${{' in script: + script = extracted["script"] + step_name = step.get("name", "") + if "${{" in script: gha_expressions_found = True print( - f'{p} job `{job_name}` step {i}: {step_name}', - file=sys.stderr + f"{p} job `{job_name}` step {i}: {step_name}", + file=sys.stderr, ) job_dir.mkdir(parents=True, exist_ok=True) sanitized = re.sub( - '[^a-zA-Z_]+', '_', - f'_{step_name}', - ).rstrip('_') - extension = extracted['extension'] - filename = f'{i:0{index_chars}}{sanitized}{extension}' + "[^a-zA-Z_]+", + "_", + f"_{step_name}", + ).rstrip("_") + extension = extracted["extension"] + filename = f"{i:0{index_chars}}{sanitized}{extension}" (job_dir / filename).write_text(script) if gha_expressions_found: sys.exit( - 'Each of the above scripts contains a GitHub Actions ' - '${{ }} which must be replaced with an `env` variable' - ' for security reasons.' + "Each of the above scripts contains a GitHub Actions " + "${{ }} which must be replaced with an `env` variable" + " for security reasons." ) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/tools/fast_nvcc/fast_nvcc.py b/tools/fast_nvcc/fast_nvcc.py index f1bb4fa6c9e6..0a1ae07c2342 100755 --- a/tools/fast_nvcc/fast_nvcc.py +++ b/tools/fast_nvcc/fast_nvcc.py @@ -14,12 +14,11 @@ import subprocess import sys import time -from typing import (Awaitable, DefaultDict, Dict, List, Match, Optional, Set, - cast) +from typing import Awaitable, DefaultDict, Dict, List, Match, Optional, Set, cast from typing_extensions import TypedDict -help_msg = '''fast_nvcc [OPTION]... -- [NVCC_ARG]... +help_msg = """fast_nvcc [OPTION]... -- [NVCC_ARG]... Run the commands given by nvcc --dryrun, in parallel. @@ -31,61 +30,61 @@ instance passing --help (after "--") doesn't work since the --help execution path doesn't compile anything, so adding --dryrun there gives nothing in stderr. 
-''' +""" parser = argparse.ArgumentParser(help_msg) parser.add_argument( - '--faithful', - action='store_true', + "--faithful", + action="store_true", help="don't modify the commands given by nvcc (slower)", ) parser.add_argument( - '--graph', - metavar='FILE.gv', - help='write Graphviz DOT file with execution graph', + "--graph", + metavar="FILE.gv", + help="write Graphviz DOT file with execution graph", ) parser.add_argument( - '--nvcc', - metavar='PATH', - default='nvcc', + "--nvcc", + metavar="PATH", + default="nvcc", help='path to nvcc (default is just "nvcc")', ) parser.add_argument( - '--save', - metavar='DIR', - help='copy intermediate files from each command into DIR', + "--save", + metavar="DIR", + help="copy intermediate files from each command into DIR", ) parser.add_argument( - '--sequential', - action='store_true', - help='sequence commands instead of using the graph (slower)', + "--sequential", + action="store_true", + help="sequence commands instead of using the graph (slower)", ) parser.add_argument( - '--table', - metavar='FILE.csv', - help='write CSV with times and intermediate file sizes', + "--table", + metavar="FILE.csv", + help="write CSV with times and intermediate file sizes", ) parser.add_argument( - '--verbose', - metavar='FILE.txt', - help='like nvcc --verbose, but expanded and into a file', + "--verbose", + metavar="FILE.txt", + help="like nvcc --verbose, but expanded and into a file", ) default_config = parser.parse_args([]) # docs about temporary directories used by NVCC -url_base = 'https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html' -url_vars = f'{url_base}#keeping-intermediate-phase-files' +url_base = "https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html" +url_vars = f"{url_base}#keeping-intermediate-phase-files" # regex for temporary file names -re_tmp = r'(? None: """ Warn the user about something regarding fast_nvcc. """ - print(f'warning (fast_nvcc): {warning}', file=sys.stderr) + print(f"warning (fast_nvcc): {warning}", file=sys.stderr) def warn_if_windows() -> None: @@ -95,7 +94,7 @@ def warn_if_windows() -> None: # use os.name instead of platform.system() because there is a # platform.py file in this directory, making it very difficult to # import the platform module from the Python standard library - if os.name == 'nt': + if os.name == "nt": fast_nvcc_warn("untested on Windows, might not work; see this URL:") fast_nvcc_warn(url_vars) @@ -104,24 +103,24 @@ def warn_if_tmpdir_flag(args: List[str]) -> None: """ Warn the user that using fast_nvcc with some flags might not work. 
""" - file_path_specs = 'file-and-path-specifications' - guiding_driver = 'options-for-guiding-compiler-driver' + file_path_specs = "file-and-path-specifications" + guiding_driver = "options-for-guiding-compiler-driver" scary_flags = { - '--objdir-as-tempdir': file_path_specs, - '-objtemp': file_path_specs, - '--keep': guiding_driver, - '-keep': guiding_driver, - '--keep-dir': guiding_driver, - '-keep-dir': guiding_driver, - '--save-temps': guiding_driver, - '-save-temps': guiding_driver, + "--objdir-as-tempdir": file_path_specs, + "-objtemp": file_path_specs, + "--keep": guiding_driver, + "-keep": guiding_driver, + "--keep-dir": guiding_driver, + "-keep-dir": guiding_driver, + "--save-temps": guiding_driver, + "-save-temps": guiding_driver, } for arg in args: for flag, frag in scary_flags.items(): - if re.match(fr'^{re.escape(flag)}(?:=.*)?$', arg): - fast_nvcc_warn(f'{flag} not supported since it interacts with') - fast_nvcc_warn('TMPDIR, so fast_nvcc may break; see this URL:') - fast_nvcc_warn(f'{url_base}#{frag}') + if re.match(rf"^{re.escape(flag)}(?:=.*)?$", arg): + fast_nvcc_warn(f"{flag} not supported since it interacts with") + fast_nvcc_warn("TMPDIR, so fast_nvcc may break; see this URL:") + fast_nvcc_warn(f"{url_base}#{frag}") class DryunData(TypedDict): @@ -135,18 +134,18 @@ def nvcc_dryrun_data(binary: str, args: List[str]) -> DryunData: Return parsed environment variables and commands from nvcc --dryrun. """ result = subprocess.run( # type: ignore[call-overload] - [binary, '--dryrun'] + args, + [binary, "--dryrun"] + args, capture_output=True, - encoding='ascii', # this is just a guess + encoding="ascii", # this is just a guess ) - print(result.stdout, end='') + print(result.stdout, end="") env = {} commands = [] for line in result.stderr.splitlines(): - match = re.match(r'^#\$ (.*)$', line) + match = re.match(r"^#\$ (.*)$", line) if match: - stripped, = match.groups() - mapping = re.match(r'^(\w+)=(.*)$', stripped) + (stripped,) = match.groups() + mapping = re.match(r"^(\w+)=(.*)$", stripped) if mapping: name, val = mapping.groups() env[name] = val @@ -154,14 +153,14 @@ def nvcc_dryrun_data(binary: str, args: List[str]) -> DryunData: commands.append(stripped) else: print(line, file=sys.stderr) - return {'env': env, 'commands': commands, 'exit_code': result.returncode} + return {"env": env, "commands": commands, "exit_code": result.returncode} def warn_if_tmpdir_set(env: Dict[str, str]) -> None: """ Warn the user that setting TMPDIR with fast_nvcc might not work. """ - if os.getenv('TMPDIR') or 'TMPDIR' in env: + if os.getenv("TMPDIR") or "TMPDIR" in env: fast_nvcc_warn("TMPDIR is set, might not work; see this URL:") fast_nvcc_warn(url_vars) @@ -183,17 +182,17 @@ def module_id_contents(command: List[str]) -> str: """ Guess the contents of the .module_id file contained within command. 
""" - if command[0] == 'cicc': + if command[0] == "cicc": path = command[-3] - elif command[0] == 'cudafe++': + elif command[0] == "cudafe++": path = command[-1] - middle = pathlib.PurePath(path).name.replace('-', '_').replace('.', '_') + middle = pathlib.PurePath(path).name.replace("-", "_").replace(".", "_") # this suffix is very wrong (the real one is far less likely to be # unique), but it seems difficult to find a rule that reproduces the # real suffixes, so here's one that, while inaccurate, is at least # hopefully as straightforward as possible suffix = hashlib.md5(str.encode(middle)).hexdigest()[:8] - return f'_{len(middle)}_{middle}_{suffix}' + return f"_{len(middle)}_{middle}_{suffix}" def unique_module_id_files(commands: List[str]) -> List[str]: @@ -206,14 +205,14 @@ def unique_module_id_files(commands: List[str]) -> List[str]: arr = [] def uniqueify(s: Match[str]) -> str: - filename = re.sub(r'\-(\d+)', r'-\1-' + str(i), s.group(0)) + filename = re.sub(r"\-(\d+)", r"-\1-" + str(i), s.group(0)) arr.append(filename) return filename - line = re.sub(re_tmp + r'.module_id', uniqueify, line) - line = re.sub(r'\s*\-\-gen\_module\_id\_file\s*', ' ', line) + line = re.sub(re_tmp + r".module_id", uniqueify, line) + line = re.sub(r"\s*\-\-gen\_module\_id\_file\s*", " ", line) if arr: - filename, = arr + (filename,) = arr if not module_id: module_id = module_id_contents(shlex.split(line)) uniqueified.append(f"echo -n '{module_id}' > '{filename}'") @@ -225,7 +224,7 @@ def make_rm_force(commands: List[str]) -> List[str]: """ Add --force to all rm commands. """ - return [f'{c} --force' if c.startswith('rm ') else c for c in commands] + return [f"{c} --force" if c.startswith("rm ") else c for c in commands] def print_verbose_output( @@ -238,12 +237,12 @@ def print_verbose_output( Human-readably write nvcc --dryrun data to stderr. """ padding = len(str(len(commands) - 1)) - with open(filename, 'w') as f: + with open(filename, "w") as f: for name, val in env.items(): print(f'#{" "*padding}$ {name}={val}', file=f) for i, command in enumerate(commands): - prefix = f'{str(i).rjust(padding)}$ ' - print(f'#{prefix}{command[0]}', file=f) + prefix = f"{str(i).rjust(padding)}$ " + print(f"#{prefix}{command[0]}", file=f) for part in command[1:]: print(f'#{" "*len(prefix)}{part}', file=f) @@ -262,7 +261,7 @@ def files_mentioned(command: str) -> List[str]: """ Return fully-qualified names of all tmp files referenced by command. """ - return [f'/tmp/{match.group(1)}' for match in re.finditer(re_tmp, command)] + return [f"/tmp/{match.group(1)}" for match in re.finditer(re_tmp, command)] def nvcc_data_dependencies(commands: List[str]) -> Graph: @@ -291,11 +290,11 @@ def nvcc_data_dependencies(commands: List[str]) -> Graph: for filename in fatbins[dep]: if filename in tmp_files: deps.add(tmp_files[filename]) - if tmp.endswith('.fatbin.c') and not line.startswith('fatbinary'): + if tmp.endswith(".fatbin.c") and not line.startswith("fatbinary"): fatbins[i].add(tmp) else: tmp_files[tmp] = i - if line.startswith('rm ') and not deps: + if line.startswith("rm ") and not deps: deps.add(i - 1) graph.append(deps) return graph @@ -329,7 +328,7 @@ def warn_if_not_weakly_connected(graph: Graph) -> None: Warn the user if the execution graph is not weakly connected. 
""" if not is_weakly_connected(graph): - fast_nvcc_warn('execution graph is not (weakly) connected') + fast_nvcc_warn("execution graph is not (weakly) connected") def print_dot_graph( @@ -341,18 +340,19 @@ def print_dot_graph( """ Print a DOT file displaying short versions of the commands in graph. """ + def name(k: int) -> str: return f'"{k} {os.path.basename(commands[k][0])}"' - with open(filename, 'w') as f: - print('digraph {', file=f) + + with open(filename, "w") as f: + print("digraph {", file=f) # print all nodes, in case it's disconnected for i in range(len(graph)): - print(f' {name(i)};', file=f) + print(f" {name(i)};", file=f) for i, deps in enumerate(graph): for j in deps: - print(f' {name(j)} -> {name(i)};', file=f) - print('}', file=f) - + print(f" {name(j)} -> {name(i)};", file=f) + print("}", file=f) class Result(TypedDict, total=False): @@ -378,7 +378,7 @@ async def run_command( for task in deps: dep_result = await task # abort if a previous step failed - if 'exit_code' not in dep_result or dep_result['exit_code'] != 0: + if "exit_code" not in dep_result or dep_result["exit_code"] != 0: return {} if gather_data: t1 = time.monotonic() @@ -390,17 +390,17 @@ async def run_command( ) stdout, stderr = await proc.communicate() code = cast(int, proc.returncode) - results: Result = {'exit_code': code, 'stdout': stdout, 'stderr': stderr} + results: Result = {"exit_code": code, "stdout": stdout, "stderr": stderr} if gather_data: t2 = time.monotonic() - results['time'] = t2 - t1 + results["time"] = t2 - t1 sizes = {} for tmp_file in files_mentioned(command): if os.path.exists(tmp_file): sizes[tmp_file] = os.path.getsize(tmp_file) else: sizes[tmp_file] = 0 - results['files'] = sizes + results["files"] = sizes if save: dest = pathlib.Path(save) / str(i) dest.mkdir() @@ -424,14 +424,18 @@ async def run_graph( tasks: List[Awaitable[Result]] = [] for i, (command, indices) in enumerate(zip(commands, graph)): deps = {tasks[j] for j in indices} - tasks.append(asyncio.create_task(run_command( # type: ignore[attr-defined] - command, - env=env, - deps=deps, - gather_data=gather_data, - i=i, - save=save, - ))) + tasks.append( + asyncio.create_task( + run_command( # type: ignore[attr-defined] + command, + env=env, + deps=deps, + gather_data=gather_data, + i=i, + save=save, + ) + ) + ) return [await task for task in tasks] @@ -440,8 +444,8 @@ def print_command_outputs(command_results: List[Result]) -> None: Print captured stdout and stderr from commands. 
""" for result in command_results: - sys.stdout.write(result.get('stdout', b'').decode('ascii')) - sys.stderr.write(result.get('stderr', b'').decode('ascii')) + sys.stdout.write(result.get("stdout", b"").decode("ascii")) + sys.stderr.write(result.get("stderr", b"").decode("ascii")) def write_log_csv( @@ -455,15 +459,15 @@ def write_log_csv( """ tmp_files: List[str] = [] for result in command_results: - tmp_files.extend(result.get('files', {}).keys()) - with open(filename, 'w', newline='') as csvfile: - fieldnames = ['command', 'seconds'] + list(dict.fromkeys(tmp_files)) + tmp_files.extend(result.get("files", {}).keys()) + with open(filename, "w", newline="") as csvfile: + fieldnames = ["command", "seconds"] + list(dict.fromkeys(tmp_files)) writer = csv.DictWriter(csvfile, fieldnames=fieldnames) writer.writeheader() for i, result in enumerate(command_results): - command = f'{i} {os.path.basename(command_parts[i][0])}' - row = {'command': command, 'seconds': result.get('time', 0)} - writer.writerow({**row, **result.get('files', {})}) + command = f"{i} {os.path.basename(command_parts[i][0])}" + row = {"command": command, "seconds": result.get("time", 0)} + writer.writerow({**row, **result.get("files", {})}) def exit_code(results: List[Result]) -> int: @@ -471,7 +475,7 @@ def exit_code(results: List[Result]) -> int: Aggregate individual exit codes into a single code. """ for result in results: - code = result.get('exit_code', 0) + code = result.get("exit_code", 0) if code != 0: return code return 0 @@ -497,9 +501,9 @@ def fast_nvcc( warn_if_windows() warn_if_tmpdir_flag(args) dryrun_data = nvcc_dryrun_data(config.nvcc, args) - env = dryrun_data['env'] + env = dryrun_data["env"] warn_if_tmpdir_set(env) - commands = dryrun_data['commands'] + commands = dryrun_data["commands"] if not config.faithful: commands = make_rm_force(unique_module_id_files(commands)) @@ -523,13 +527,15 @@ def fast_nvcc( ) if config.sequential: graph = straight_line_dependencies(commands) - results = asyncio.run(run_graph( # type: ignore[attr-defined] - env=env, - commands=commands, - graph=graph, - gather_data=bool(config.table), - save=config.save, - )) + results = asyncio.run( + run_graph( # type: ignore[attr-defined] + env=env, + commands=commands, + graph=graph, + gather_data=bool(config.table), + save=config.save, + ) + ) print_command_outputs(results) if config.table: write_log_csv(command_parts, results, filename=config.table) @@ -537,10 +543,10 @@ def fast_nvcc( def our_arg(arg: str) -> bool: - return arg != '--' + return arg != "--" -if __name__ == '__main__': +if __name__ == "__main__": argv = sys.argv[1:] us = list(itertools.takewhile(our_arg, argv)) them = list(itertools.dropwhile(our_arg, argv)) diff --git a/tools/gdb/pytorch-gdb.py b/tools/gdb/pytorch-gdb.py index 46cdcdec2de2..0ed516078f76 100644 --- a/tools/gdb/pytorch-gdb.py +++ b/tools/gdb/pytorch-gdb.py @@ -2,6 +2,7 @@ import textwrap from typing import Any + class DisableBreakpoints: """ Context-manager to temporarily disable all gdb breakpoints, useful if @@ -20,6 +21,7 @@ def __exit__(self, etype: Any, evalue: Any, tb: Any) -> None: for b in self.disabled_breakpoints: b.enabled = True + class TensorRepr(gdb.Command): # type: ignore[misc, no-any-unimported] """ Print a human readable representation of the given at::Tensor. @@ -30,23 +32,26 @@ class TensorRepr(gdb.Command): # type: ignore[misc, no-any-unimported] internally creates a Python wrapper for the given tensor and call repr() on it. 
""" + __doc__ = textwrap.dedent(__doc__).strip() def __init__(self) -> None: - gdb.Command.__init__(self, 'torch-tensor-repr', - gdb.COMMAND_USER, gdb.COMPLETE_EXPRESSION) + gdb.Command.__init__( + self, "torch-tensor-repr", gdb.COMMAND_USER, gdb.COMPLETE_EXPRESSION + ) def invoke(self, args: str, from_tty: bool) -> None: args = gdb.string_to_argv(args) if len(args) != 1: - print('Usage: torch-tensor-repr EXP') + print("Usage: torch-tensor-repr EXP") return name = args[0] with DisableBreakpoints(): - res = gdb.parse_and_eval('torch::gdb::tensor_repr(%s)' % name) - print('Python-level repr of %s:' % name) + res = gdb.parse_and_eval("torch::gdb::tensor_repr(%s)" % name) + print("Python-level repr of %s:" % name) print(res.string()) # torch::gdb::tensor_repr returns a malloc()ed buffer, let's free it - gdb.parse_and_eval('(void)free(%s)' % int(res)) + gdb.parse_and_eval("(void)free(%s)" % int(res)) + TensorRepr() diff --git a/tools/generate_torch_version.py b/tools/generate_torch_version.py index 2ee17b76e52f..e47c61f55eb3 100644 --- a/tools/generate_torch_version.py +++ b/tools/generate_torch_version.py @@ -5,46 +5,59 @@ from setuptools import distutils # type: ignore[import] from typing import Optional, Union + def get_sha(pytorch_root: Union[str, Path]) -> str: try: - return subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=pytorch_root).decode('ascii').strip() + return ( + subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=pytorch_root) + .decode("ascii") + .strip() + ) except Exception: - return 'Unknown' + return "Unknown" + def get_torch_version(sha: Optional[str] = None) -> str: pytorch_root = Path(__file__).parent.parent - version = open(pytorch_root / 'version.txt', 'r').read().strip() + version = open(pytorch_root / "version.txt", "r").read().strip() - if os.getenv('PYTORCH_BUILD_VERSION'): - assert os.getenv('PYTORCH_BUILD_NUMBER') is not None - build_number = int(os.getenv('PYTORCH_BUILD_NUMBER', "")) - version = os.getenv('PYTORCH_BUILD_VERSION', "") + if os.getenv("PYTORCH_BUILD_VERSION"): + assert os.getenv("PYTORCH_BUILD_NUMBER") is not None + build_number = int(os.getenv("PYTORCH_BUILD_NUMBER", "")) + version = os.getenv("PYTORCH_BUILD_VERSION", "") if build_number > 1: - version += '.post' + str(build_number) - elif sha != 'Unknown': + version += ".post" + str(build_number) + elif sha != "Unknown": if sha is None: sha = get_sha(pytorch_root) - version += '+git' + sha[:7] + version += "+git" + sha[:7] return version + if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Generate torch/version.py from build and environment metadata.") - parser.add_argument("--is_debug", type=distutils.util.strtobool, help="Whether this build is debug mode or not.") + parser = argparse.ArgumentParser( + description="Generate torch/version.py from build and environment metadata." 
+ ) + parser.add_argument( + "--is_debug", + type=distutils.util.strtobool, + help="Whether this build is debug mode or not.", + ) parser.add_argument("--cuda_version", type=str) parser.add_argument("--hip_version", type=str) args = parser.parse_args() assert args.is_debug is not None - args.cuda_version = None if args.cuda_version == '' else args.cuda_version - args.hip_version = None if args.hip_version == '' else args.hip_version + args.cuda_version = None if args.cuda_version == "" else args.cuda_version + args.hip_version = None if args.hip_version == "" else args.hip_version pytorch_root = Path(__file__).parent.parent version_path = pytorch_root / "torch" / "version.py" sha = get_sha(pytorch_root) version = get_torch_version(sha) - with open(version_path, 'w') as f: + with open(version_path, "w") as f: f.write("__version__ = '{}'\n".format(version)) # NB: This is not 100% accurate, because you could have built the # library code with DEBUG, but csrc without DEBUG (in which case diff --git a/tools/git-pre-commit b/tools/git-pre-commit deleted file mode 100755 index 1c4340c6b434..000000000000 --- a/tools/git-pre-commit +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -set -e - -echo "Running pre-commit flake8" -python3 tools/linter/flake8_hook.py - -echo "Running pre-commit clang-tidy" -git diff HEAD > pr.diff -python3 -m tools.linter.clang_tidy --diff-file "pr.diff" -rm pr.diff - -echo "Running pre-commit clang-format" -tools/linter/git-clang-format HEAD~ --force diff --git a/tools/iwyu/fixup.py b/tools/iwyu/fixup.py index b4d6294cbae0..4ce80bb0f52b 100644 --- a/tools/iwyu/fixup.py +++ b/tools/iwyu/fixup.py @@ -2,7 +2,7 @@ import re QUOTE_INCLUDE_RE = re.compile(r'^#include "(.*)"') -ANGLE_INCLUDE_RE = re.compile(r'^#include <(.*)>') +ANGLE_INCLUDE_RE = re.compile(r"^#include <(.*)>") # By default iwyu will pick the C include, but we prefer the C++ headers STD_C_HEADER_MAP = { @@ -34,25 +34,27 @@ "": "", } + def main() -> None: for line in sys.stdin: # Convert all quoted includes to angle brackets match = QUOTE_INCLUDE_RE.match(line) if match is not None: - print(f"#include <{match.group(1)}>{line[match.end(0):]}", end='') + print(f"#include <{match.group(1)}>{line[match.end(0):]}", end="") continue match = ANGLE_INCLUDE_RE.match(line) if match is not None: path = f"<{match.group(1)}>" new_path = STD_C_HEADER_MAP.get(path, path) - tail = line[match.end(0):] + tail = line[match.end(0) :] if len(tail) > 1: - tail = ' ' + tail - print(f"#include {new_path}{tail}", end='') + tail = " " + tail + print(f"#include {new_path}{tail}", end="") continue - print(line, end='') + print(line, end="") + if __name__ == "__main__": main() diff --git a/tools/jit/BUILD.buck b/tools/jit/BUILD.buck new file mode 100644 index 000000000000..d79aece1ed24 --- /dev/null +++ b/tools/jit/BUILD.buck @@ -0,0 +1,13 @@ +python_library( + name = "jit", + srcs = glob([ + "*.py", + "templates/*", + ]), + base_module = "tools.jit", + visibility = ["PUBLIC"], + deps = [ + "//:aten_code_template", + "//torchgen:torchgen", + ], +) diff --git a/tools/jit/gen_unboxing.py b/tools/jit/gen_unboxing.py new file mode 100644 index 000000000000..154b4f527b7e --- /dev/null +++ b/tools/jit/gen_unboxing.py @@ -0,0 +1,247 @@ +# Generates RegisterCodegenUnboxedKernels.cpp, UnboxingFunctions.h and UnboxingFunctions.cpp. 
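# (Illustrative aside, not part of the diff.) A minimal sketch of the
# version-string logic in get_torch_version() above, assuming version.txt
# reads "1.12.0a0" and a hypothetical commit sha; the helper name is made up.
def sketch_version(base: str, sha: str, build_version: str = "", build_number: int = 0) -> str:
    # PYTORCH_BUILD_VERSION wins; ".postN" is appended only for build numbers > 1
    if build_version:
        return build_version + (".post" + str(build_number) if build_number > 1 else "")
    # otherwise local/nightly builds get a "+git" plus short-sha suffix
    if sha != "Unknown":
        return base + "+git" + sha[:7]
    return base

assert sketch_version("1.12.0a0", "abc1234def0") == "1.12.0a0+gitabc1234"
assert sketch_version("1.12.0a0", "abc1234def0", "1.12.0", 2) == "1.12.0.post2"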
+import argparse +import os +import pathlib +from dataclasses import dataclass +from torchgen.api import unboxing +from torchgen.api.translate import translate +from torchgen.api.types import CppSignatureGroup +from torchgen.api.unboxing import convert_arguments +from torchgen.context import method_with_native_function +from torchgen.gen import parse_native_yaml, cpp_string, get_custom_build_selector +from torchgen.model import NativeFunction, NativeFunctionsGroup, Variant +from torchgen.selective_build.selector import SelectiveBuilder +from torchgen.utils import Target, FileManager, mapMaybe, make_file_manager +from typing import Union, Sequence +from typing_extensions import Literal + + +# Generates UnboxingFunctions.h & UnboxingFunctions.cpp. +@dataclass(frozen=True) +class ComputeUnboxingFunctions: + target: Union[Literal[Target.DECLARATION], Literal[Target.DEFINITION]] + selector: SelectiveBuilder + + @method_with_native_function + def __call__(self, f: NativeFunction) -> str: + if not self.selector.is_root_operator(f"aten::{f.func.name}"): + return "" + + if self.target is Target.DECLARATION: + # Note [The ATen Codegen Unboxing API] + # Similar to the ATen Operators API, ATen Codegen Unboxing API lives in the at::unboxing namespace, and + # will be used by codegen unboxing wrappers (CodegenUnboxingWrappers.cpp). + # The Wrappers will be registered into torch::jit::OperatorRegistry using RegisterOperators API. + # + # Important characteristics about the Codegen Unboxing API: + # (1) It follows the OperatorRegistry API. + # This is kind of necessary to avoid overhead. + # For example: if it followed the C++ API, then all of the faithful C++ factory functions + # would need to wrap their arguments into TensorOptions only to unwrap them again. + # (2) Under the hood it calls C++ API. + return f""" +// aten::{f.func} +TORCH_API void {f.func.name.unambiguous_name()}(Stack & stack); +""" + else: + sig_group = CppSignatureGroup.from_native_function( + f, method=(Variant.method in f.variants) + ) + sig = sig_group.most_faithful_signature() + # parse arguments into C++ code + binding_list, code_list = convert_arguments(f) + + # for each C++ argument, generate the conversion code + code_connector = "\n\t" + arg_connector = ", " + # function call and push back to stack + prefix = "self_base." if sig.method else "at::" + translated_args = translate( + binding_list, sig.arguments(), method=sig.method + ) + args_str = f"{arg_connector.join(e.expr for e in translated_args)}" + if len(f.func.returns) == 0: + ret_str = "" + push_str = "" + else: + ret_str = "auto result_ = " + push_str = """ + pack(stack, std::move(result_)); + """ + return f""" +// aten::{f.func} +TORCH_API void {f.func.name.unambiguous_name()}(Stack & stack) {{ + {code_connector.join(code_list)} + + drop(stack, {len(binding_list)}); + + {ret_str}{prefix}{sig.name()}({args_str}); + {push_str} +}} +""" + + +# Generates RegisterCodegenUnboxedKernels.cpp. 
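# (Illustrative aside, not part of the diff.) Roughly the shape of the C++
# that ComputeUnboxingFunctions above emits for a DEFINITION target, using a
# hypothetical op aten::foo(Tensor self) -> Tensor. The argument-popping line
# really comes from torchgen's convert_arguments(); it is shown schematically.
EXAMPLE_GENERATED_WRAPPER = """
// aten::foo(Tensor self) -> Tensor
TORCH_API void foo(Stack & stack) {
    auto self = std::move(peek(stack, 0, 1)).toTensor();

    drop(stack, 1);

    auto result_ = at::foo(self);
    pack(stack, std::move(result_));
}
"""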
+@dataclass(frozen=True) +class ComputeCodegenUnboxedKernels: + selector: SelectiveBuilder + + @method_with_native_function + def __call__(self, f: NativeFunction) -> str: + if not self.selector.is_root_operator(f"aten::{f.func.name}"): + return "" + # We unconditionally generate function wrappers, + sig_group = CppSignatureGroup.from_native_function(f, method=False) + + sig = sig_group.most_faithful_signature() + + # escape double quote in schema, get rid of extra double quotes + schema = cpp_string(str(sig.func))[1:-1] + + # arguments + args = sig.arguments() + connector = ",\n\t\t" + args_code = [] + for arg in args: + if not arg.default: + arg_cpp = "c10::IValue(c10::nullopt)" + elif arg.default.startswith("{"): + arg_cpp = f"c10::IntArrayRef({arg.default})" + else: + arg_cpp = f"c10::IValue({arg.default})" + args_code.append( + f"""c10::Argument("{arg.name}", nullptr, c10::nullopt, {arg_cpp})""" + ) + + returns = f.func.returns + returns_code = [] + for ret in returns: + returns_code.append(f"""c10::Argument("{ret.name if ret.name else ""}")""") + return f""" +// aten::{schema} +OperatorGenerator( + "aten::{f.func.name.name}", + "{f.func.name.overload_name}", + {{ + {connector.join(args_code)} + }}, + {{ + {connector.join(returns_code)} + }}, + [](Stack & stack) {{ + RECORD_FUNCTION("{sig.name()}", std::vector()); + at::unboxing::{unboxing.name(f)}(stack); + }}, + aliasAnalysisFromSchema() +), +""" + + +def gen_unboxing( + *, + native_functions: Sequence[NativeFunction], + cpu_fm: FileManager, + selector: SelectiveBuilder, +) -> None: + def key_func(fn: Union[NativeFunction, NativeFunctionsGroup]) -> str: + return fn.root_name + + cpu_fm.write_sharded( + "UnboxingFunctions.cpp", + native_functions, + key_fn=key_func, + env_callable=lambda fn: { + "definitions": [ComputeUnboxingFunctions(Target.DEFINITION, selector)(fn)] + }, + num_shards=5, + sharded_keys={"definitions"}, + ) + cpu_fm.write( + "UnboxingFunctions.h", + lambda: { + "declarations": list( + mapMaybe( + ComputeUnboxingFunctions(Target.DECLARATION, selector), + native_functions, + ) + ), + }, + ) + cpu_fm.write_sharded( + "RegisterCodegenUnboxedKernels.cpp", + native_functions, + key_fn=key_func, + env_callable=lambda fn: { + "unboxed_ops": [ComputeCodegenUnboxedKernels(selector)(fn)] + }, + num_shards=10, + sharded_keys={"unboxed_ops"}, + ) + + +def main() -> None: + parser = argparse.ArgumentParser(description="Generate unboxing source files") + parser.add_argument( + "-s", + "--source-path", + help="path to source directory for ATen", + default="aten/src/ATen", + ) + parser.add_argument( + "-d", "--install_dir", help="output directory", default="build/aten/src/ATen" + ) + parser.add_argument( + "-o", + "--output-dependencies", + help="output a list of dependencies into the given file and exit", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="run without writing any files (still updates outputs)", + ) + parser.add_argument( + "--op_selection_yaml_path", + help="Provide a path to the operator selection (for custom build) YAML " + "that contains the information about the set of selected operators " + "and their categories (training, ...). Each operator is either a " + "full operator name with overload or just a bare operator name. " + "The operator names also contain the namespace prefix (e.g. 
aten::)", + ) + parser.add_argument( + "--op_registration_allowlist", + nargs="*", + help="filter op registrations by the allowlist (if set); " + "each item is `namespace`::`operator name` without overload name; " + "e.g.: aten::empty aten::conv2d ...", + ) + + options = parser.parse_args() + + selector = get_custom_build_selector( + options.op_registration_allowlist, + options.op_selection_yaml_path, + ) + + native_yaml_path = os.path.join(options.source_path, "native/native_functions.yaml") + tags_yaml_path = os.path.join(options.source_path, "native/tags.yaml") + parsed_yaml = parse_native_yaml(native_yaml_path, tags_yaml_path) + native_functions, backend_indices = ( + parsed_yaml.native_functions, + parsed_yaml.backend_indices, + ) + + cpu_fm = make_file_manager(options=options) + gen_unboxing(native_functions=native_functions, cpu_fm=cpu_fm, selector=selector) + + if options.output_dependencies: + depfile_path = pathlib.Path(options.output_dependencies).resolve() + depfile_name = depfile_path.name + depfile_stem = depfile_path.stem + + path = depfile_path.parent / depfile_name + cpu_fm.write_outputs(depfile_stem, str(path)) + + +if __name__ == "__main__": + main() diff --git a/tools/linter/adapters/actionlint_linter.py b/tools/linter/adapters/actionlint_linter.py new file mode 100644 index 000000000000..bbc93954eda4 --- /dev/null +++ b/tools/linter/adapters/actionlint_linter.py @@ -0,0 +1,138 @@ +import argparse +import os +import re +import json +import logging +import subprocess +import time +from enum import Enum +from typing import List, NamedTuple, Optional, Pattern + + +LINTER_CODE = "ACTIONLINT" + + +class LintSeverity(str, Enum): + ERROR = "error" + WARNING = "warning" + ADVICE = "advice" + DISABLED = "disabled" + + +class LintMessage(NamedTuple): + path: Optional[str] + line: Optional[int] + char: Optional[int] + code: str + severity: LintSeverity + name: str + original: Optional[str] + replacement: Optional[str] + description: Optional[str] + + +RESULTS_RE: Pattern[str] = re.compile( + r"""(?mx) + ^ + (?P.*?): + (?P\d+): + (?P\d+): + \s(?P.*) + \s(?P\[.*\]) + $ + """ +) + + +def run_command( + args: List[str], +) -> "subprocess.CompletedProcess[bytes]": + logging.debug("$ %s", " ".join(args)) + start_time = time.monotonic() + try: + return subprocess.run( + args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + finally: + end_time = time.monotonic() + logging.debug("took %dms", (end_time - start_time) * 1000) + + +def check_files( + binary: str, + files: List[str], +) -> List[LintMessage]: + try: + proc = run_command([binary] + files) + except OSError as err: + return [ + LintMessage( + path=None, + line=None, + char=None, + code=LINTER_CODE, + severity=LintSeverity.ERROR, + name="command-failed", + original=None, + replacement=None, + description=(f"Failed due to {err.__class__.__name__}:\n{err}"), + ) + ] + stdout = str(proc.stdout, "utf-8").strip() + return [ + LintMessage( + path=match["file"], + name=match["code"], + description=match["message"], + line=int(match["line"]), + char=int(match["char"]), + code=LINTER_CODE, + severity=LintSeverity.ERROR, + original=None, + replacement=None, + ) + for match in RESULTS_RE.finditer(stdout) + ] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="actionlint runner", + fromfile_prefix_chars="@", + ) + parser.add_argument( + "--binary", + required=True, + help="actionlint binary path", + ) + parser.add_argument( + "filenames", + nargs="+", + help="paths to lint", + ) + + args = 
parser.parse_args() + + if not os.path.exists(args.binary): + err_msg = LintMessage( + path="", + line=None, + char=None, + code=LINTER_CODE, + severity=LintSeverity.ERROR, + name="command-failed", + original=None, + replacement=None, + description=( + f"Could not find actionlint binary at {args.binary}," + " you may need to run `lintrunner init`." + ), + ) + print(json.dumps(err_msg._asdict()), flush=True) + exit(0) + + lint_messages = check_files(args.binary, args.filenames) + for lint_message in lint_messages: + print(json.dumps(lint_message._asdict()), flush=True) diff --git a/tools/linter/adapters/black_linter.py b/tools/linter/adapters/black_linter.py new file mode 100644 index 000000000000..9d259fe096b8 --- /dev/null +++ b/tools/linter/adapters/black_linter.py @@ -0,0 +1,228 @@ +import argparse +import concurrent.futures +import json +import logging +import os +import subprocess +import sys +import time +from enum import Enum +from typing import Any, List, NamedTuple, Optional, BinaryIO + + +IS_WINDOWS: bool = os.name == "nt" + + +def eprint(*args: Any, **kwargs: Any) -> None: + print(*args, file=sys.stderr, flush=True, **kwargs) + + +class LintSeverity(str, Enum): + ERROR = "error" + WARNING = "warning" + ADVICE = "advice" + DISABLED = "disabled" + + +class LintMessage(NamedTuple): + path: Optional[str] + line: Optional[int] + char: Optional[int] + code: str + severity: LintSeverity + name: str + original: Optional[str] + replacement: Optional[str] + description: Optional[str] + + +def as_posix(name: str) -> str: + return name.replace("\\", "/") if IS_WINDOWS else name + + +def _run_command( + args: List[str], + *, + stdin: BinaryIO, + timeout: int, +) -> "subprocess.CompletedProcess[bytes]": + logging.debug("$ %s", " ".join(args)) + start_time = time.monotonic() + try: + return subprocess.run( + args, + stdin=stdin, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=IS_WINDOWS, # So batch scripts are found. + timeout=timeout, + check=True, + ) + finally: + end_time = time.monotonic() + logging.debug("took %dms", (end_time - start_time) * 1000) + + +def run_command( + args: List[str], + *, + stdin: BinaryIO, + retries: int, + timeout: int, +) -> "subprocess.CompletedProcess[bytes]": + remaining_retries = retries + while True: + try: + return _run_command(args, stdin=stdin, timeout=timeout) + except subprocess.TimeoutExpired as err: + if remaining_retries == 0: + raise err + remaining_retries -= 1 + logging.warning( + "(%s/%s) Retrying because command failed with: %r", + retries - remaining_retries, + retries, + err, + ) + time.sleep(1) + + +def check_file( + filename: str, + retries: int, + timeout: int, +) -> List[LintMessage]: + try: + with open(filename, "rb") as f: + original = f.read() + with open(filename, "rb") as f: + proc = run_command( + [sys.executable, "-mblack", "--stdin-filename", filename, "-"], + stdin=f, + retries=retries, + timeout=timeout, + ) + except subprocess.TimeoutExpired: + return [ + LintMessage( + path=filename, + line=None, + char=None, + code="BLACK", + severity=LintSeverity.ERROR, + name="timeout", + original=None, + replacement=None, + description=( + "black timed out while trying to process a file. 
" + "Please report an issue in pytorch/pytorch with the " + "label 'module: lint'" + ), + ) + ] + except (OSError, subprocess.CalledProcessError) as err: + return [ + LintMessage( + path=filename, + line=None, + char=None, + code="BLACK", + severity=LintSeverity.ADVICE, + name="command-failed", + original=None, + replacement=None, + description=( + f"Failed due to {err.__class__.__name__}:\n{err}" + if not isinstance(err, subprocess.CalledProcessError) + else ( + "COMMAND (exit code {returncode})\n" + "{command}\n\n" + "STDERR\n{stderr}\n\n" + "STDOUT\n{stdout}" + ).format( + returncode=err.returncode, + command=" ".join(as_posix(x) for x in err.cmd), + stderr=err.stderr.decode("utf-8").strip() or "(empty)", + stdout=err.stdout.decode("utf-8").strip() or "(empty)", + ) + ), + ) + ] + + replacement = proc.stdout + if original == replacement: + return [] + + return [ + LintMessage( + path=filename, + line=None, + char=None, + code="BLACK", + severity=LintSeverity.WARNING, + name="format", + original=original.decode("utf-8"), + replacement=replacement.decode("utf-8"), + description="Run `lintrunner -a` to apply this patch.", + ) + ] + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Format files with black.", + fromfile_prefix_chars="@", + ) + parser.add_argument( + "--retries", + default=3, + type=int, + help="times to retry timed out black", + ) + parser.add_argument( + "--timeout", + default=90, + type=int, + help="seconds to wait for black", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="verbose logging", + ) + parser.add_argument( + "filenames", + nargs="+", + help="paths to lint", + ) + args = parser.parse_args() + + logging.basicConfig( + format="<%(threadName)s:%(levelname)s> %(message)s", + level=logging.NOTSET + if args.verbose + else logging.DEBUG + if len(args.filenames) < 1000 + else logging.INFO, + stream=sys.stderr, + ) + + with concurrent.futures.ThreadPoolExecutor( + max_workers=os.cpu_count(), + thread_name_prefix="Thread", + ) as executor: + futures = { + executor.submit(check_file, x, args.retries, args.timeout): x + for x in args.filenames + } + for future in concurrent.futures.as_completed(futures): + try: + for lint_message in future.result(): + print(json.dumps(lint_message._asdict()), flush=True) + except Exception: + logging.critical('Failed at "%s".', futures[future]) + raise + + +if __name__ == "__main__": + main() diff --git a/tools/linter/adapters/circleci_linter.py b/tools/linter/adapters/circleci_linter.py index 4eb13228845c..8a76ed396f9f 100644 --- a/tools/linter/adapters/circleci_linter.py +++ b/tools/linter/adapters/circleci_linter.py @@ -51,7 +51,11 @@ def run_command(args: List[str], cwd: str) -> "subprocess.CompletedProcess[bytes start_time = time.monotonic() try: return subprocess.run( - args, cwd=cwd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True, + args, + cwd=cwd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=True, ) finally: end_time = time.monotonic() @@ -100,8 +104,8 @@ def run_check( return [ LintMessage( path=config_file, - line=1, - char=1, + line=None, + char=None, code="CIRCLECI", severity=LintSeverity.ERROR, name="config inconsistency", @@ -117,10 +121,13 @@ def run_check( if __name__ == "__main__": parser = argparse.ArgumentParser( - description="circleci consistency linter", fromfile_prefix_chars="@", + description="circleci consistency linter", + fromfile_prefix_chars="@", ) parser.add_argument( - "--config-yml", required=True, help="location of config.yml", + 
"--config-yml", + required=True, + help="location of config.yml", ) parser.add_argument( "--regen-script-working-dir", @@ -133,7 +140,9 @@ def run_check( help="location of the config generation script, relative to --regen-script-working-dir", ) parser.add_argument( - "--verbose", action="store_true", help="verbose logging", + "--verbose", + action="store_true", + help="verbose logging", ) args = parser.parse_args() diff --git a/tools/linter/adapters/clangformat_linter.py b/tools/linter/adapters/clangformat_linter.py index b4641306daf9..3445dee4e540 100644 --- a/tools/linter/adapters/clangformat_linter.py +++ b/tools/linter/adapters/clangformat_linter.py @@ -153,8 +153,8 @@ def check_file( return [ LintMessage( path=filename, - line=1, - char=1, + line=None, + char=None, code="CLANGFORMAT", severity=LintSeverity.WARNING, name="format", diff --git a/tools/linter/adapters/clangtidy_linter.py b/tools/linter/adapters/clangtidy_linter.py index a3a3bdd0143d..d7e19452df03 100644 --- a/tools/linter/adapters/clangtidy_linter.py +++ b/tools/linter/adapters/clangtidy_linter.py @@ -10,11 +10,22 @@ import time from enum import Enum from pathlib import Path +from sysconfig import get_paths as gp from typing import Any, List, NamedTuple, Optional, Pattern - +# PyTorch directory root +result = subprocess.run( + ["git", "rev-parse", "--show-toplevel"], + stdout=subprocess.PIPE, + check=True, +) +PYTORCH_ROOT = result.stdout.decode("utf-8").strip() IS_WINDOWS: bool = os.name == "nt" +# Returns '/usr/local/include/python' +def get_python_include_dir() -> str: + return gp()["include"] + def eprint(*args: Any, **kwargs: Any) -> None: print(*args, file=sys.stderr, flush=True, **kwargs) @@ -75,12 +86,14 @@ def run_command( logging.debug("took %dms", (end_time - start_time) * 1000) -# Severity is either "error" or "note": https://git.io/JiLOP +# Severity is either "error" or "note": +# https://github.com/python/mypy/blob/8b47a032e1317fb8e3f9a818005a6b63e9bf0311/mypy/errors.py#L46-L47 severities = { "error": LintSeverity.ERROR, "warning": LintSeverity.WARNING, } + def clang_search_dirs() -> List[str]: # Compilers are ordered based on fallback preference # We pick the first one that is available on the system @@ -116,8 +129,13 @@ def clang_search_dirs() -> List[str]: return search_paths + include_args = [] -include_dir = ["/usr/lib/llvm-11/include/openmp"] + clang_search_dirs() +include_dir = [ + "/usr/lib/llvm-11/include/openmp", + get_python_include_dir(), + os.path.join(PYTORCH_ROOT, "third_party/pybind11/include"), +] + clang_search_dirs() for dir in include_dir: include_args += ["--extra-arg", f"-I{dir}"] @@ -142,9 +160,7 @@ def check_file( name="command-failed", original=None, replacement=None, - description=( - f"Failed due to {err.__class__.__name__}:\n{err}" - ), + description=(f"Failed due to {err.__class__.__name__}:\n{err}"), ) ] lint_messages = [] @@ -190,8 +206,10 @@ def main() -> None: parser.add_argument( "--build_dir", required=True, - help=("Where the compile_commands.json file is located. " - "Gets passed to clang-tidy -p"), + help=( + "Where the compile_commands.json file is located. 
" + "Gets passed to clang-tidy -p" + ), ) parser.add_argument( "--verbose", diff --git a/tools/linter/adapters/exec_linter.py b/tools/linter/adapters/exec_linter.py index f263d11d5456..f00dc60afbb2 100644 --- a/tools/linter/adapters/exec_linter.py +++ b/tools/linter/adapters/exec_linter.py @@ -51,13 +51,17 @@ def check_file(filename: str) -> Optional[LintMessage]: if __name__ == "__main__": parser = argparse.ArgumentParser( - description="native functions linter", fromfile_prefix_chars="@", + description="exec linter", + fromfile_prefix_chars="@", ) parser.add_argument( - "--verbose", action="store_true", help="location of native_functions.yaml", + "--verbose", + action="store_true", ) parser.add_argument( - "filenames", nargs="+", help="paths to lint", + "filenames", + nargs="+", + help="paths to lint", ) args = parser.parse_args() diff --git a/tools/linter/adapters/flake8_linter.py b/tools/linter/adapters/flake8_linter.py index 50b257f41ff5..20274432566c 100644 --- a/tools/linter/adapters/flake8_linter.py +++ b/tools/linter/adapters/flake8_linter.py @@ -1,5 +1,4 @@ import argparse -import concurrent.futures import json import logging import os @@ -244,16 +243,15 @@ def get_issue_documentation_url(code: str) -> str: return "" -def check_file( - filename: str, - binary: str, +def check_files( + filenames: List[str], flake8_plugins_path: Optional[str], severities: Dict[str, LintSeverity], retries: int, ) -> List[LintMessage]: try: proc = run_command( - [binary, "--exit-zero", filename], + [sys.executable, "-mflake8", "--exit-zero"] + filenames, extra_env={"FLAKE8_PLUGINS_PATH": flake8_plugins_path} if flake8_plugins_path else None, @@ -262,7 +260,7 @@ def check_file( except (OSError, subprocess.CalledProcessError) as err: return [ LintMessage( - path=filename, + path=None, line=None, char=None, code="FLAKE8", @@ -314,11 +312,6 @@ def main() -> None: description="Flake8 wrapper linter.", fromfile_prefix_chars="@", ) - parser.add_argument( - "--binary", - required=True, - help="flake8 binary path", - ) parser.add_argument( "--flake8-plugins-path", help="FLAKE8_PLUGINS_PATH env value", @@ -369,28 +362,11 @@ def main() -> None: assert len(parts) == 2, f"invalid severity `{severity}`" severities[parts[0]] = LintSeverity(parts[1]) - with concurrent.futures.ThreadPoolExecutor( - max_workers=os.cpu_count(), - thread_name_prefix="Thread", - ) as executor: - futures = { - executor.submit( - check_file, - filename, - args.binary, - flake8_plugins_path, - severities, - args.retries, - ): filename - for filename in args.filenames - } - for future in concurrent.futures.as_completed(futures): - try: - for lint_message in future.result(): - print(json.dumps(lint_message._asdict()), flush=True) - except Exception: - logging.critical('Failed at "%s".', futures[future]) - raise + lint_messages = check_files( + args.filenames, flake8_plugins_path, severities, args.retries + ) + for lint_message in lint_messages: + print(json.dumps(lint_message._asdict()), flush=True) if __name__ == "__main__": diff --git a/tools/linter/adapters/grep_linter.py b/tools/linter/adapters/grep_linter.py index d160c4d5dc21..61a81ad12dc3 100644 --- a/tools/linter/adapters/grep_linter.py +++ b/tools/linter/adapters/grep_linter.py @@ -43,11 +43,17 @@ def as_posix(name: str) -> str: return name.replace("\\", "/") if IS_WINDOWS else name -def run_command(args: List[str],) -> "subprocess.CompletedProcess[bytes]": +def run_command( + args: List[str], +) -> "subprocess.CompletedProcess[bytes]": logging.debug("$ %s", " ".join(args)) 
start_time = time.monotonic() try: - return subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE,) + return subprocess.run( + args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) finally: end_time = time.monotonic() logging.debug("took %dms", (end_time - start_time) * 1000) @@ -116,13 +122,18 @@ def lint_file( def main() -> None: parser = argparse.ArgumentParser( - description="grep wrapper linter.", fromfile_prefix_chars="@", + description="grep wrapper linter.", + fromfile_prefix_chars="@", ) parser.add_argument( - "--pattern", required=True, help="pattern to grep for", + "--pattern", + required=True, + help="pattern to grep for", ) parser.add_argument( - "--linter-name", required=True, help="name of the linter", + "--linter-name", + required=True, + help="name of the linter", ) parser.add_argument( "--error-name", @@ -142,10 +153,14 @@ def main() -> None: ), ) parser.add_argument( - "--verbose", action="store_true", help="verbose logging", + "--verbose", + action="store_true", + help="verbose logging", ) parser.add_argument( - "filenames", nargs="+", help="paths to lint", + "filenames", + nargs="+", + help="paths to lint", ) args = parser.parse_args() @@ -160,7 +175,7 @@ def main() -> None: ) try: - proc = run_command(["grep", "-nPH", args.pattern, *args.filenames]) + proc = run_command(["grep", "-nEHI", args.pattern, *args.filenames]) except Exception as err: err_msg = LintMessage( path=None, diff --git a/tools/linter/adapters/mypy_linter.py b/tools/linter/adapters/mypy_linter.py index 687f8bf68066..65ee8850e667 100644 --- a/tools/linter/adapters/mypy_linter.py +++ b/tools/linter/adapters/mypy_linter.py @@ -1,5 +1,4 @@ import argparse -import concurrent.futures import json import logging import os @@ -8,6 +7,7 @@ import sys import time from enum import Enum +from pathlib import Path from typing import Any, Dict, List, NamedTuple, Optional, Pattern @@ -56,7 +56,6 @@ def as_posix(name: str) -> str: ) - def run_command( args: List[str], *, @@ -76,21 +75,22 @@ def run_command( logging.debug("took %dms", (end_time - start_time) * 1000) -# Severity is either "error" or "note": https://git.io/JiLOP +# Severity is either "error" or "note": +# https://github.com/python/mypy/blob/8b47a032e1317fb8e3f9a818005a6b63e9bf0311/mypy/errors.py#L46-L47 severities = { "error": LintSeverity.ERROR, "note": LintSeverity.ADVICE, } -def check_file( - filename: str, + +def check_files( + filenames: List[str], config: str, - binary: str, retries: int, ) -> List[LintMessage]: try: proc = run_command( - [binary, f"--config={config}", filename], + [sys.executable, "-mmypy", f"--config={config}"] + filenames, extra_env={}, retries=retries, ) @@ -105,9 +105,7 @@ def check_file( name="command-failed", original=None, replacement=None, - description=( - f"Failed due to {err.__class__.__name__}:\n{err}" - ), + description=(f"Failed due to {err.__class__.__name__}:\n{err}"), ) ] stdout = str(proc.stdout, "utf-8").strip() @@ -134,11 +132,6 @@ def main() -> None: description="mypy wrapper linter.", fromfile_prefix_chars="@", ) - parser.add_argument( - "--binary", - required=True, - help="mypy binary path", - ) parser.add_argument( "--retries", default=3, @@ -172,27 +165,26 @@ def main() -> None: stream=sys.stderr, ) - with concurrent.futures.ThreadPoolExecutor( - max_workers=os.cpu_count(), - thread_name_prefix="Thread", - ) as executor: - futures = { - executor.submit( - check_file, - filename, - args.config, - args.binary, - args.retries, - ): filename - for filename in args.filenames - } - for 
future in concurrent.futures.as_completed(futures): - try: - for lint_message in future.result(): - print(json.dumps(lint_message._asdict()), flush=True) - except Exception: - logging.critical('Failed at "%s".', futures[future]) - raise + # Use a dictionary here to preserve order. mypy cares about order, + # tragically, e.g. https://github.com/python/mypy/issues/2015 + filenames: Dict[str, bool] = {} + + # If a stub file exists, have mypy check it instead of the original file, in + # accordance with PEP-484 (see https://www.python.org/dev/peps/pep-0484/#stub-files) + for filename in args.filenames: + if filename.endswith(".pyi"): + filenames[filename] = True + continue + + stub_filename = filename.replace(".py", ".pyi") + if Path(stub_filename).exists(): + filenames[stub_filename] = True + else: + filenames[filename] = True + + lint_messages = check_files(list(filenames), args.config, args.retries) + for lint_message in lint_messages: + print(json.dumps(lint_message._asdict()), flush=True) if __name__ == "__main__": diff --git a/tools/linter/adapters/nativefunctions_linter.py b/tools/linter/adapters/nativefunctions_linter.py index dd6e3b03aab1..28065f2b7af4 100644 --- a/tools/linter/adapters/nativefunctions_linter.py +++ b/tools/linter/adapters/nativefunctions_linter.py @@ -44,7 +44,8 @@ class LintMessage(NamedTuple): if __name__ == "__main__": parser = argparse.ArgumentParser( - description="native functions linter", fromfile_prefix_chars="@", + description="native functions linter", + fromfile_prefix_chars="@", ) parser.add_argument( "--native-functions-yml", @@ -89,8 +90,8 @@ class LintMessage(NamedTuple): if contents != new_contents: msg = LintMessage( path=args.native_functions_yml, - line=1, - char=1, + line=None, + char=None, code="NATIVEFUNCTIONS", severity=LintSeverity.ERROR, name="roundtrip inconsistency", diff --git a/tools/linter/adapters/newlines_linter.py b/tools/linter/adapters/newlines_linter.py index 5ce5edca670a..f51254ad496a 100644 --- a/tools/linter/adapters/newlines_linter.py +++ b/tools/linter/adapters/newlines_linter.py @@ -67,7 +67,7 @@ def check_file(filename: str) -> Optional[LintMessage]: name="testestTrailing newline", original=None, replacement=None, - description="Trailing newline found. Run `lintunner --take NEWLINE -a` to apply changes.", + description="Trailing newline found. Run `lintrunner --take NEWLINE -a` to apply changes.", ) else: @@ -103,19 +103,24 @@ def check_file(filename: str) -> Optional[LintMessage]: name="Trailing newline", original=original, replacement=original.rstrip("\n") + "\n", - description="Trailing newline found. Run `lintunner --take NEWLINE -a` to apply changes.", + description="Trailing newline found. 
Run `lintrunner --take NEWLINE -a` to apply changes.", ) if __name__ == "__main__": parser = argparse.ArgumentParser( - description="native functions linter", fromfile_prefix_chars="@", + description="native functions linter", + fromfile_prefix_chars="@", ) parser.add_argument( - "--verbose", action="store_true", help="location of native_functions.yaml", + "--verbose", + action="store_true", + help="location of native_functions.yaml", ) parser.add_argument( - "filenames", nargs="+", help="paths to lint", + "filenames", + nargs="+", + help="paths to lint", ) args = parser.parse_args() diff --git a/tools/linter/adapters/pip_init.py b/tools/linter/adapters/pip_init.py index b4451beac644..db1f69d26b22 100644 --- a/tools/linter/adapters/pip_init.py +++ b/tools/linter/adapters/pip_init.py @@ -1,6 +1,7 @@ """ Initializer script that installs stuff to pip. """ +import os import argparse import logging import subprocess @@ -23,12 +24,18 @@ def run_command(args: List[str]) -> "subprocess.CompletedProcess[bytes]": if __name__ == "__main__": parser = argparse.ArgumentParser(description="pip initializer") parser.add_argument( - "packages", nargs="+", help="pip packages to install", + "packages", + nargs="+", + help="pip packages to install", ) parser.add_argument( - "--verbose", action="store_true", help="verbose logging", + "--verbose", + action="store_true", + help="verbose logging", + ) + parser.add_argument( + "--dry-run", help="do not install anything, just print what would be done." ) - parser.add_argument("--dry-run", help="do not install anything, just print what would be done.") args = parser.parse_args() @@ -45,7 +52,19 @@ def run_command(args: List[str]) -> "subprocess.CompletedProcess[bytes]": "Package {package_name} did not have a version specified. " "Please specify a version to product a consistent linting experience." ) - pip_args = ["pip3", "install", "--user"] + pip_args = ["pip3", "install"] + + # If we are in a global install, use `--user` to install so that you do not + # need root access in order to initialize linters. + # + # However, `pip install --user` interacts poorly with virtualenvs (see: + # https://bit.ly/3vD4kvl) and conda (see: https://bit.ly/3KG7ZfU). So in + # these cases perform a regular installation. 
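# (Illustrative aside, not part of the diff.) The same "are we in an isolated
# environment?" decision could also be made without environment variables,
# since venv/virtualenv leave sys.prefix different from sys.base_prefix; the
# CONDA_PREFIX/VIRTUAL_ENV check below is what the diff actually uses.
import os
import sys

def in_isolated_env() -> bool:
    in_conda = os.environ.get("CONDA_PREFIX") is not None
    in_venv = sys.prefix != sys.base_prefix
    return in_conda or in_venv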
+ in_conda = os.environ.get("CONDA_PREFIX") is not None + in_virtualenv = os.environ.get("VIRTUAL_ENV") is not None + if not in_conda and not in_virtualenv: + pip_args.append("--user") + pip_args.extend(args.packages) dry_run = args.dry_run == "1" diff --git a/tools/linter/adapters/s3_init.py b/tools/linter/adapters/s3_init.py index f2bc9339776d..65fcef4bc291 100644 --- a/tools/linter/adapters/s3_init.py +++ b/tools/linter/adapters/s3_init.py @@ -16,12 +16,19 @@ HOST_PLATFORM = platform.system() # PyTorch directory root -result = subprocess.run( - ["git", "rev-parse", "--show-toplevel"], - stdout=subprocess.PIPE, - check=True, -) -PYTORCH_ROOT = result.stdout.decode("utf-8").strip() +try: + result = subprocess.run( + ["git", "rev-parse", "--show-toplevel"], + stdout=subprocess.PIPE, + check=True, + ) + PYTORCH_ROOT = result.stdout.decode("utf-8").strip() +except subprocess.CalledProcessError: + # If git is not installed, compute repo root as 3 folders up from this file + path_ = os.path.abspath(__file__) + for _ in range(4): + path_ = os.path.dirname(path_) + PYTORCH_ROOT = path_ DRY_RUN = False diff --git a/tools/linter/adapters/s3_init_config.json b/tools/linter/adapters/s3_init_config.json index 0f3619ad0fff..736ab6addb84 100644 --- a/tools/linter/adapters/s3_init_config.json +++ b/tools/linter/adapters/s3_init_config.json @@ -2,29 +2,31 @@ "clang-format": { "Darwin": { "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/mac/clang-format-mojave", - "hash": "1485a242a96c737ba7cdd9f259114f2201accdb46d87ac7a8650b1a814cd4d4d", - "object_name": "mac/clang-format-mojave", - "s3_bucket": "oss-clang-format" + "hash": "1485a242a96c737ba7cdd9f259114f2201accdb46d87ac7a8650b1a814cd4d4d" }, "Linux": { "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/linux64/clang-format-linux64", - "hash": "e1c8b97b919541a99e0a355df5c3f9e8abebc64259dbee6f8c68e1ef90582856", - "object_name": "linux64/clang-format-linux64", - "s3_bucket": "oss-clang-format" + "hash": "e1c8b97b919541a99e0a355df5c3f9e8abebc64259dbee6f8c68e1ef90582856" } }, "clang-tidy": { "Darwin": { "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/macos/clang-tidy", - "hash": "541797a7b8fa795e2f3c1adcd8236cc336a40aa927028dc5bc79172e1d9eca36", - "object_name": "macos/clang-tidy", - "s3_bucket": "oss-clang-format" + "hash": "541797a7b8fa795e2f3c1adcd8236cc336a40aa927028dc5bc79172e1d9eca36" }, "Linux": { "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/linux64/clang-tidy", - "hash": "49343a448fcb75cd1e0fb9d6b1f6c2ef4b008b6f91d6ff899d4ac6060f5e52a5", - "object_name": "linx64/clang-tidy", - "s3_bucket": "oss-clang-format" + "hash": "49343a448fcb75cd1e0fb9d6b1f6c2ef4b008b6f91d6ff899d4ac6060f5e52a5" + } + }, + "actionlint": { + "Darwin": { + "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/macos/actionlint", + "hash": "3ce2c94280c540e20b270acae60bdd9e72ad17d6cb35b688951b1ec1eb8cbdd6" + }, + "Linux": { + "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/linux64/actionlint", + "hash": "693f464106474760f0edf4a1778215095eacc4bd5f79aab5dc950892f120828b" } } } diff --git a/tools/linter/adapters/shellcheck_linter.py b/tools/linter/adapters/shellcheck_linter.py new file mode 100644 index 000000000000..d94c5a1ce047 --- /dev/null +++ b/tools/linter/adapters/shellcheck_linter.py @@ -0,0 +1,118 @@ +import argparse +import json +import logging +import subprocess +import time +import shutil +from enum import Enum +from typing import List, NamedTuple, Optional 
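# (Illustrative aside, not part of the diff.) The --format=json1 payload that
# check_files() below parses is one JSON object with a "comments" list,
# roughly of this shape; the concrete values here are made up.
EXAMPLE_JSON1_PAYLOAD = {
    "comments": [
        {
            "file": "tools/example.sh",
            "line": 3,
            "column": 8,
            "level": "warning",
            "code": 2086,
            "message": "Double quote to prevent globbing and word splitting.",
        }
    ]
}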
+ + +LINTER_CODE = "SHELLCHECK" + + +class LintSeverity(str, Enum): + ERROR = "error" + WARNING = "warning" + ADVICE = "advice" + DISABLED = "disabled" + + +class LintMessage(NamedTuple): + path: Optional[str] + line: Optional[int] + char: Optional[int] + code: str + severity: LintSeverity + name: str + original: Optional[str] + replacement: Optional[str] + description: Optional[str] + + +def run_command( + args: List[str], +) -> "subprocess.CompletedProcess[bytes]": + logging.debug("$ %s", " ".join(args)) + start_time = time.monotonic() + try: + return subprocess.run( + args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + finally: + end_time = time.monotonic() + logging.debug("took %dms", (end_time - start_time) * 1000) + + +def check_files( + files: List[str], +) -> List[LintMessage]: + try: + proc = run_command( + ["shellcheck", "--external-sources", "--format=json1"] + files + ) + except OSError as err: + return [ + LintMessage( + path=None, + line=None, + char=None, + code=LINTER_CODE, + severity=LintSeverity.ERROR, + name="command-failed", + original=None, + replacement=None, + description=(f"Failed due to {err.__class__.__name__}:\n{err}"), + ) + ] + stdout = str(proc.stdout, "utf-8").strip() + results = json.loads(stdout)["comments"] + return [ + LintMessage( + path=result["file"], + name=f"SC{result['code']}", + description=result["message"], + line=result["line"], + char=result["column"], + code=LINTER_CODE, + severity=LintSeverity.ERROR, + original=None, + replacement=None, + ) + for result in results + ] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="shellcheck runner", + fromfile_prefix_chars="@", + ) + parser.add_argument( + "filenames", + nargs="+", + help="paths to lint", + ) + + if shutil.which("shellcheck") is None: + err_msg = LintMessage( + path="", + line=None, + char=None, + code=LINTER_CODE, + severity=LintSeverity.ERROR, + name="command-failed", + original=None, + replacement=None, + description="shellcheck is not installed, did you forget to run `lintrunner init`?", + ) + print(json.dumps(err_msg._asdict()), flush=True) + exit(0) + + args = parser.parse_args() + + lint_messages = check_files(args.filenames) + for lint_message in lint_messages: + print(json.dumps(lint_message._asdict()), flush=True) diff --git a/tools/linter/adapters/testowners_linter.py b/tools/linter/adapters/testowners_linter.py new file mode 100755 index 000000000000..b65cfde4d79d --- /dev/null +++ b/tools/linter/adapters/testowners_linter.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python3 +""" +Test ownership was introduced in https://github.com/pytorch/pytorch/issues/66232. + +This lint verifies that every Python test file (file that matches test_*.py or *_test.py in the test folder) +has valid ownership information in a comment header. 
Valid means: + - The format of the header follows the pattern "# Owner(s): ["list", "of owner", "labels"] + - Each owner label actually exists in PyTorch + - Each owner label starts with "module: " or "oncall: " or is in ACCEPTABLE_OWNER_LABELS +""" +import json +import argparse +from enum import Enum +from typing import List, Any, Optional, NamedTuple +from urllib.request import urlopen + + +LINTER_CODE = "TESTOWNERS" + + +class LintSeverity(str, Enum): + ERROR = "error" + WARNING = "warning" + ADVICE = "advice" + DISABLED = "disabled" + + +class LintMessage(NamedTuple): + path: Optional[str] + line: Optional[int] + char: Optional[int] + code: str + severity: LintSeverity + name: str + original: Optional[str] + replacement: Optional[str] + description: Optional[str] + + +# Team/owner labels usually start with "module: " or "oncall: ", but the following are acceptable exceptions +ACCEPTABLE_OWNER_LABELS = ["NNC", "high priority"] +OWNERS_PREFIX = "# Owner(s): " + + +def get_pytorch_labels() -> Any: + labels = ( + urlopen("https://ossci-metrics.s3.amazonaws.com/pytorch_labels.json") + .read() + .decode("utf-8") + ) + return json.loads(labels) + + +PYTORCH_LABELS = get_pytorch_labels() +# Team/owner labels usually start with "module: " or "oncall: ", but the following are acceptable exceptions +ACCEPTABLE_OWNER_LABELS = ["NNC", "high priority"] +GLOB_EXCEPTIONS = ["**/test/run_test.py"] + + +def check_labels( + labels: List[str], filename: str, line_number: int +) -> List[LintMessage]: + lint_messages = [] + for label in labels: + if label not in PYTORCH_LABELS: + lint_messages.append( + LintMessage( + path=filename, + line=line_number, + char=None, + code=LINTER_CODE, + severity=LintSeverity.ERROR, + name="[invalid-label]", + original=None, + replacement=None, + description=( + f"{label} is not a PyTorch label " + "(please choose from https://github.com/pytorch/pytorch/labels)" + ), + ) + ) + + if ( + label.startswith("module:") + or label.startswith("oncall:") + or label in ACCEPTABLE_OWNER_LABELS + ): + continue + + lint_messages.append( + LintMessage( + path=filename, + line=line_number, + char=None, + code=LINTER_CODE, + severity=LintSeverity.ERROR, + name="[invalid-owner]", + original=None, + replacement=None, + description=( + f"{label} is not an acceptable owner " + "(please update to another label or edit ACCEPTABLE_OWNERS_LABELS " + "in tools/linters/adapters/testowners_linter.py" + ), + ) + ) + + return lint_messages + + +def check_file(filename: str) -> List[LintMessage]: + lint_messages = [] + has_ownership_info = False + + with open(filename) as f: + for idx, line in enumerate(f): + if not line.startswith(OWNERS_PREFIX): + continue + + has_ownership_info = True + labels = json.loads(line[len(OWNERS_PREFIX) :]) + lint_messages.extend(check_labels(labels, filename, idx + 1)) + + if has_ownership_info is False: + lint_messages.append( + LintMessage( + path=filename, + line=None, + char=None, + code=LINTER_CODE, + severity=LintSeverity.ERROR, + name="[no-owner-info]", + original=None, + replacement=None, + description="Missing a comment header with ownership information.", + ) + ) + + return lint_messages + + +def main() -> None: + parser = argparse.ArgumentParser( + description="test ownership linter", + fromfile_prefix_chars="@", + ) + parser.add_argument( + "filenames", + nargs="+", + help="paths to lint", + ) + + args = parser.parse_args() + lint_messages = [] + + for filename in args.filenames: + lint_messages.extend(check_file(filename)) + + for lint_message in lint_messages: 
+ print(json.dumps(lint_message._asdict()), flush=True) + + +if __name__ == "__main__": + main() diff --git a/tools/linter/clang_format_all.py b/tools/linter/clang_format_all.py deleted file mode 100755 index 7792f15a77d1..000000000000 --- a/tools/linter/clang_format_all.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python3 -""" -A script that runs clang-format on all C/C++ files in CLANG_FORMAT_ALLOWLIST. There is -also a diff mode which simply checks if clang-format would make any changes, which is useful for -CI purposes. - -If clang-format is not available, the script also downloads a platform-appropriate binary from -and S3 bucket and verifies it against a precommited set of blessed binary hashes. -""" -import argparse -import asyncio -import re -import os -import sys -from typing import List, Set - -from .clang_format_utils import get_and_check_clang_format, CLANG_FORMAT_PATH - -# Allowlist of directories to check. All files that in that directory -# (recursively) will be checked. -# If you edit this, please edit the allowlist in clang_format_ci.sh as well. -CLANG_FORMAT_ALLOWLIST = [ - "c10/", - "torch/csrc/jit/", - "test/cpp/jit/", - "test/cpp/tensorexpr/" -] - -# Only files with names matching this regex will be formatted. -CPP_FILE_REGEX = re.compile(".*\\.(h|cpp|cc|c|hpp)$") - - -def get_allowlisted_files() -> Set[str]: - """ - Parse CLANG_FORMAT_ALLOWLIST and resolve all directories. - Returns the set of allowlist cpp source files. - """ - matches = [] - for dir in CLANG_FORMAT_ALLOWLIST: - for root, dirnames, filenames in os.walk(dir): - for filename in filenames: - if CPP_FILE_REGEX.match(filename): - matches.append(os.path.join(root, filename)) - return set(matches) - - -async def run_clang_format_on_file( - filename: str, - semaphore: asyncio.Semaphore, - verbose: bool = False, -) -> None: - """ - Run clang-format on the provided file. - """ - # -style=file picks up the closest .clang-format, -i formats the files inplace. - cmd = "{} -style=file -i {}".format(CLANG_FORMAT_PATH, filename) - async with semaphore: - proc = await asyncio.create_subprocess_shell(cmd) - _ = await proc.wait() - if verbose: - print("Formatted {}".format(filename)) - - -async def file_clang_formatted_correctly( - filename: str, - semaphore: asyncio.Semaphore, - verbose: bool = False, -) -> bool: - """ - Checks if a file is formatted correctly and returns True if so. - """ - ok = True - # -style=file picks up the closest .clang-format - cmd = "{} -style=file {}".format(CLANG_FORMAT_PATH, filename) - - async with semaphore: - proc = await asyncio.create_subprocess_shell(cmd, stdout=asyncio.subprocess.PIPE) - # Read back the formatted file. - stdout, _ = await proc.communicate() - - formatted_contents = stdout.decode() - # Compare the formatted file to the original file. - with open(filename) as orig: - orig_contents = orig.read() - if formatted_contents != orig_contents: - ok = False - if verbose: - print("{} is not formatted correctly".format(filename)) - - return ok - - -async def run_clang_format( - max_processes: int, - diff: bool = False, - verbose: bool = False, -) -> bool: - """ - Run clang-format to all files in CLANG_FORMAT_ALLOWLIST that match CPP_FILE_REGEX. - """ - # Check to make sure the clang-format binary exists. - if not os.path.exists(CLANG_FORMAT_PATH): - print("clang-format binary not found") - return False - - # Gather command-line options for clang-format. 
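# (Illustrative aside, not part of the diff.) The header that the testowners
# linter added above looks for is a single comment line whose suffix is JSON;
# the labels used here are hypothetical. A minimal sketch of the parsing step:
import json

OWNERS_PREFIX = "# Owner(s): "
header = '# Owner(s): ["module: nn", "oncall: jit"]'
if header.startswith(OWNERS_PREFIX):
    labels = json.loads(header[len(OWNERS_PREFIX):])
    assert labels == ["module: nn", "oncall: jit"]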
- args = [CLANG_FORMAT_PATH, "-style=file"] - - if not diff: - args.append("-i") - - ok = True - - # Semaphore to bound the number of subprocesses that can be created at once to format files. - semaphore = asyncio.Semaphore(max_processes) - - # Format files in parallel. - if diff: - for f in asyncio.as_completed([file_clang_formatted_correctly(f, semaphore, verbose) for f in get_allowlisted_files()]): - ok &= await f - - if ok: - print("All files formatted correctly") - else: - print("Some files not formatted correctly") - else: - await asyncio.gather(*[run_clang_format_on_file(f, semaphore, verbose) for f in get_allowlisted_files()]) - - return ok - -def parse_args(args: List[str]) -> argparse.Namespace: - """ - Parse and return command-line arguments. - """ - parser = argparse.ArgumentParser( - description="Execute clang-format on your working copy changes." - ) - parser.add_argument( - "-d", - "--diff", - action="store_true", - default=False, - help="Determine whether running clang-format would produce changes", - ) - parser.add_argument("--verbose", "-v", action="store_true", default=False) - parser.add_argument("--max-processes", type=int, default=50, - help="Maximum number of subprocesses to create to format files in parallel") - return parser.parse_args(args) - - -def main(args: List[str]) -> bool: - # Parse arguments. - options = parse_args(args) - # Get clang-format and make sure it is the right binary and it is in the right place. - ok = get_and_check_clang_format(options.verbose) - # Invoke clang-format on all files in the directories in the allowlist. - if ok: - loop = asyncio.get_event_loop() - ok = loop.run_until_complete(run_clang_format(options.max_processes, options.diff, options.verbose)) - - # We have to invert because False -> 0, which is the code to be returned if everything is okay. - return not ok - - -if __name__ == "__main__": - sys.exit(main(sys.argv[1:])) diff --git a/tools/linter/clang_format_ci.sh b/tools/linter/clang_format_ci.sh deleted file mode 100755 index 6f5220e516d1..000000000000 --- a/tools/linter/clang_format_ci.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/sh -set -eux - -# Runs clang-format on allowlisted files. -# Requires a single argument, which is the argument to git-clang-format - -# If you edit this allowlist, please edit the one in clang_format_all.py as well -find . -type f \ - -path './c10/*' -or \ - -path './torch/csrc/jit/*' -or \ - -path './test/cpp/jit/*' -or \ - -path './test/cpp/tensorexpr/*' \ - | xargs tools/linter/git-clang-format --verbose "$1" -- diff --git a/tools/linter/clang_format_utils.py b/tools/linter/clang_format_utils.py deleted file mode 100644 index 021ba9162cca..000000000000 --- a/tools/linter/clang_format_utils.py +++ /dev/null @@ -1,20 +0,0 @@ -import os -from install.download_bin import download, PYTORCH_ROOT # type: ignore[import] - -# This dictionary maps each platform to the S3 object URL for its clang-format binary. -PLATFORM_TO_CF_URL = { - "Darwin": "https://oss-clang-format.s3.us-east-2.amazonaws.com/mac/clang-format-mojave", - "Linux": "https://oss-clang-format.s3.us-east-2.amazonaws.com/linux64/clang-format-linux64", -} - -# This dictionary maps each platform to a relative path to a file containing its reference hash. 
-PLATFORM_TO_HASH = { - "Darwin": os.path.join("tools", "clang_format_hash", "mac", "clang-format-mojave"), - "Linux": os.path.join("tools", "clang_format_hash", "linux64", "clang-format-linux64"), -} - -CLANG_FORMAT_DIR = os.path.join(PYTORCH_ROOT, ".clang-format-bin") -CLANG_FORMAT_PATH = os.path.join(CLANG_FORMAT_DIR, "clang-format") - -def get_and_check_clang_format(verbose: bool = False) -> bool: - return bool(download("clang-format", CLANG_FORMAT_DIR, PLATFORM_TO_CF_URL, PLATFORM_TO_HASH)) diff --git a/tools/linter/clang_tidy/__main__.py b/tools/linter/clang_tidy/__main__.py deleted file mode 100644 index fa6403a64bb6..000000000000 --- a/tools/linter/clang_tidy/__main__.py +++ /dev/null @@ -1,210 +0,0 @@ -import argparse -import pathlib -import os -import shutil -import subprocess -import re -import sys -from typing import List - - -from tools.linter.clang_tidy.run import run -from tools.linter.clang_tidy.generate_build_files import generate_build_files -from tools.linter.install.clang_tidy import INSTALLATION_PATH -from tools.linter.install.download_bin import PYTORCH_ROOT - - -def clang_search_dirs() -> List[str]: - # Compilers are ordered based on fallback preference - # We pick the first one that is available on the system - compilers = ["clang", "gcc", "cpp", "cc"] - compilers = [c for c in compilers if shutil.which(c) is not None] - if len(compilers) == 0: - raise RuntimeError(f"None of {compilers} were found") - compiler = compilers[0] - - result = subprocess.run( - [compiler, "-E", "-x", "c++", "-", "-v"], - stdin=subprocess.DEVNULL, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - check=True, - ) - stderr = result.stderr.decode().strip().split("\n") - search_start = r"#include.*search starts here:" - search_end = r"End of search list." - - append_path = False - search_paths = [] - for line in stderr: - if re.match(search_start, line): - if append_path: - continue - else: - append_path = True - elif re.match(search_end, line): - break - elif append_path: - search_paths.append(line.strip()) - - # There are source files include , etc. - # under torch/csrc/api/include folder. Since torch/csrc/api/include is not - # a search path for clang-tidy, there will be clang-disagnostic errors - # complaing those header files not found. Change the source code to include - # full path like torch/csrc/api/include/torch/torch.h does not work well - # since torch/torch.h includes torch/all.h which inturn includes more. - # We would need recursively change mutliple files. - # Adding the include path to the lint script should be a better solution. - search_paths.append( - os.path.join(PYTORCH_ROOT, "torch/csrc/api/include"), - ) - return search_paths - - -DEFAULTS = { - "glob": [ - # The negative filters below are to exclude files that include onnx_pb.h or - # caffe2_pb.h, otherwise we'd have to build protos as part of this CI job. - # FunctionsManual.cpp is excluded to keep this diff clean. It will be fixed - # in a follow up PR. - # /torch/csrc/generic/*.cpp is excluded because those files aren't actually built. 
- # deploy/interpreter files are excluded due to using macros and other techniquies - # that are not easily converted to accepted c++ - "-torch/csrc/jit/passes/onnx/helper.cpp", - "-torch/csrc/jit/passes/onnx/shape_type_inference.cpp", - "-torch/csrc/jit/serialization/onnx.cpp", - "-torch/csrc/jit/serialization/export.cpp", - "-torch/csrc/jit/serialization/import.cpp", - "-torch/csrc/jit/serialization/import_legacy.cpp", - "-torch/csrc/onnx/init.cpp", - "-torch/csrc/cuda/nccl.*", - "-torch/csrc/cuda/python_nccl.cpp", - "-torch/csrc/autograd/FunctionsManual.cpp", - "-torch/csrc/generic/*.cpp", - "-torch/csrc/jit/codegen/cuda/runtime/*", - "-torch/csrc/deploy/interactive_embedded_interpreter.cpp", - "-torch/csrc/deploy/interpreter/interpreter.cpp", - "-torch/csrc/deploy/interpreter/interpreter.h", - "-torch/csrc/deploy/interpreter/interpreter_impl.h", - "-torch/csrc/deploy/interpreter/test_main.cpp", - "-torch/csrc/deploy/test_deploy_python_ext.cpp", - ], - "paths": ["torch/csrc/"], - "include-dir": ["/usr/lib/llvm-11/include/openmp"] + clang_search_dirs(), - "clang-tidy-exe": INSTALLATION_PATH, - "compile-commands-dir": "build", - "config-file": ".clang-tidy", - "disable-progress-bar": False, -} - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="clang-tidy wrapper script") - parser.add_argument( - "-e", - "--clang-tidy-exe", - default=DEFAULTS["clang-tidy-exe"], - help="Path to clang-tidy executable", - ) - parser.add_argument( - "-g", - "--glob", - action="append", - default=DEFAULTS["glob"], - help="Only lint files that match these glob patterns " - "(see documentation for `fnmatch` for supported syntax)." - "If a pattern starts with a - the search is negated for that pattern.", - ) - parser.add_argument( - "-x", - "--regex", - action="append", - default=[], - help="Only lint files that match these regular expressions (from the start of the filename). " - "If a pattern starts with a - the search is negated for that pattern.", - ) - parser.add_argument( - "-c", - "--compile-commands-dir", - default=DEFAULTS["compile-commands-dir"], - help="Path to the folder containing compile_commands.json", - ) - parser.add_argument( - "--diff-file", - help="File containing diff to use for determining files to lint and line filters", - ) - parser.add_argument( - "-p", - "--paths", - nargs="+", - default=DEFAULTS["paths"], - help="Lint only the given paths (recursively)", - ) - parser.add_argument( - "-n", - "--dry-run", - action="store_true", - help="Only show the command to be executed, without running it", - ) - parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output") - parser.add_argument("-q", "--quiet", action="store_true", help="Don't print output") - parser.add_argument( - "--config-file", - default=DEFAULTS["config-file"], - help="Path to a clang-tidy config file. 
Defaults to '.clang-tidy'.", - ) - parser.add_argument( - "--print-include-paths", - action="store_true", - help="Print the search paths used for include directives", - ) - parser.add_argument( - "-I", - "--include-dir", - action="append", - default=DEFAULTS["include-dir"], - help="Add the specified directory to the search path for include files", - ) - parser.add_argument( - "-s", - "--suppress-diagnostics", - action="store_true", - help="Add NOLINT to suppress clang-tidy violations", - ) - parser.add_argument( - "--disable-progress-bar", - action="store_true", - default=DEFAULTS["disable-progress-bar"], - help="Disable the progress bar", - ) - parser.add_argument( - "extra_args", nargs="*", help="Extra arguments to forward to clang-tidy" - ) - return parser.parse_args() - - -def main() -> None: - options = parse_args() - - if not pathlib.Path("build").exists(): - generate_build_files() - - # Check if clang-tidy executable exists - exists = os.access(options.clang_tidy_exe, os.X_OK) - - if not exists: - msg = ( - f"Could not find '{options.clang_tidy_exe}'\n" - + "We provide a custom build of clang-tidy that has additional checks.\n" - + "You can install it by running:\n" - + "$ python3 -m tools.linter.install.clang_tidy \n" - + "from the pytorch folder" - ) - raise RuntimeError(msg) - - result, _ = run(options) - sys.exit(result.returncode) - - -if __name__ == "__main__": - main() diff --git a/tools/linter/clang_tidy/generate_build_files.py b/tools/linter/clang_tidy/generate_build_files.py index 9e3db664ab0d..fff8bf492e0f 100644 --- a/tools/linter/clang_tidy/generate_build_files.py +++ b/tools/linter/clang_tidy/generate_build_files.py @@ -6,8 +6,15 @@ def run_cmd(cmd: List[str]) -> None: print(f"Running: {cmd}") - result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,) - stdout, stderr = result.stdout.decode("utf-8").strip(), result.stderr.decode("utf-8").strip() + result = subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + stdout, stderr = ( + result.stdout.decode("utf-8").strip(), + result.stderr.decode("utf-8").strip(), + ) print(stdout) print(stderr) if result.returncode != 0: @@ -36,7 +43,7 @@ def run_autogen() -> None: [ sys.executable, "-m", - "tools.codegen.gen", + "torchgen.gen", "-s", "aten/src/ATen", "-d", @@ -51,8 +58,9 @@ def run_autogen() -> None: "tools/setup_helpers/generate_code.py", "--native-functions-path", "aten/src/ATen/native/native_functions.yaml", - "--nn-path", - "aten/src", + "--tags-path", + "aten/src/ATen/native/tags.yaml", + "--gen_lazy_ts_backend", ] ) diff --git a/tools/linter/clang_tidy/max_tokens_pragma.py b/tools/linter/clang_tidy/max_tokens_pragma.py deleted file mode 100644 index 4f7b152659f7..000000000000 --- a/tools/linter/clang_tidy/max_tokens_pragma.py +++ /dev/null @@ -1,111 +0,0 @@ -import argparse -import re -from typing import List - - -# > Why is DEFAULT_MAX_TOKEN_COUNT set to 1? -# -# clang-tidy doesn't have a direct way to query for token counts in the -# codebase. The workaround is to set the max token count to 1. This will cause -# clang-tidy to output a warning with the actual token count of the file. -# -# A non-destructive way to set the max token count to 1 would be to pass it -# through the -fmax-tokens option. However, this flag will be overridden if here -# exists a #pragma max_tokens_total statement in the file. This necessitates a -# destructive way to set the max token count to 1. 
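To make the transformation concrete, here is a minimal illustrative sketch of how the helpers defined below behave, assuming the module is still importable as tools.linter.clang_tidy.max_tokens_pragma and that the C++ source string is purely hypothetical:

from tools.linter.clang_tidy.max_tokens_pragma import (
    add_max_tokens_pragma,
    strip_max_tokens_pragmas,
)

source = "#include <vector>\nint main() { return 0; }"

# Prepend "#pragma clang max_tokens_total 1" so clang-tidy's max-tokens check
# reports the actual token count of the file as a warning.
tagged = add_max_tokens_pragma(source, 1)
assert tagged.splitlines()[0] == "#pragma clang max_tokens_total 1"

# Remove the pragma again once the token counts have been collected.
assert strip_max_tokens_pragmas(tagged) == source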
-DEFAULT_MAX_TOKEN_COUNT = 1 -MAX_TOKENS_CHECK_DIAG_NAME = "misc-max-tokens" -MAX_TOKENS_PRAGMA_PATTERN = r"^#pragma\s+clang\s+max_tokens_total\s+(\d+)$" - - -def add_max_tokens_pragma(code: str, num_max_tokens: int) -> str: - lines = code.splitlines() - - found_pragma = False - pragma = f"#pragma clang max_tokens_total {num_max_tokens}" - - for idx, line in enumerate(lines): - match = re.match(MAX_TOKENS_PRAGMA_PATTERN, line.strip()) - if match: - found_pragma = True - token_count = match.group(1) - if int(token_count) != num_max_tokens: - lines[idx] = pragma - - if not found_pragma: - lines = [pragma] + lines - - return "\n".join(lines) - - -def strip_max_tokens_pragmas(code: str) -> str: - lines = code.splitlines() - lines = [ - line - for line in lines - if re.match(MAX_TOKENS_PRAGMA_PATTERN, line.strip()) is None - ] - return "\n".join(lines) - - -def add_max_tokens_pragma_to_files(files: List[str], num_max_tokens: int) -> None: - for filename in files: - with open(filename, "r+") as f: - data = f.read() - data = add_max_tokens_pragma(data, num_max_tokens) - - f.seek(0) - f.write(data) - f.truncate() - - -def strip_max_tokens_pragma_from_files(files: List[str]) -> None: - for filename in files: - with open(filename, "r+") as f: - data = f.read() - data = strip_max_tokens_pragmas(data) - - f.seek(0) - f.write(data) - f.truncate() - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser( - description="Add max_tokens_total pragmas to C/C++ source files" - ) - parser.add_argument( - "-n", - "--num-max-tokens", - default=DEFAULT_MAX_TOKEN_COUNT, - help="Set the token count to this value", - type=int, - ) - parser.add_argument( - "files", nargs="+", help="Add max_tokens_total pragmas to the specified files" - ) - parser.add_argument( - "-i", "--ignore", nargs="+", default=[], help="Ignore the specified files" - ) - parser.add_argument( - "-s", - "--strip", - action="store_true", - help="Remove max_tokens_total pragmas from the input files", - ) - return parser.parse_args() - - -def main() -> None: - options = parse_args() - - ignored = set(options.ignore) - files = [filename for filename in options.files if filename not in ignored] - if options.strip: - strip_max_tokens_pragma_from_files(files) - else: - add_max_tokens_pragma_to_files(files, options.num_max_tokens) - - -if __name__ == "__main__": - main() diff --git a/tools/linter/clang_tidy/requirements.txt b/tools/linter/clang_tidy/requirements.txt deleted file mode 100644 index faea93fd550a..000000000000 --- a/tools/linter/clang_tidy/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -unidiff==0.6.0 diff --git a/tools/linter/clang_tidy/run.py b/tools/linter/clang_tidy/run.py deleted file mode 100644 index 9e71333475fc..000000000000 --- a/tools/linter/clang_tidy/run.py +++ /dev/null @@ -1,516 +0,0 @@ -#!/usr/bin/env python3 -""" -A driver script to run clang-tidy on changes detected via git. - -By default, clang-tidy runs on all files you point it at. This means that even -if you changed only parts of that file, you will get warnings for the whole -file. This script has the ability to ask git for the exact lines that have -changed since a particular git revision, and makes clang-tidy only lint those. -This makes it much less overhead to integrate in CI and much more relevant to -developers. This git-enabled mode is optional, and full scans of a directory -tree are also possible. In both cases, the script allows filtering files via -glob or regular expressions. 
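A typical invocation of the wrapper around this driver (hypothetical paths shown, assuming the package entry point tools.linter.clang_tidy) might look like:

    python3 -m tools.linter.clang_tidy \
        --paths torch/csrc \
        --diff-file pr.diff \
        --compile-commands-dir build

which lints only the changed lines recorded in pr.diff that fall under torch/csrc, using the compilation database in the build directory.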
-""" - - -import collections -import fnmatch -import json -import os -import os.path -import re -import shutil -import sys -import asyncio -import shlex -import multiprocessing - -from typing import Any, Dict, Iterable, List, Set, Tuple - -Patterns = collections.namedtuple("Patterns", "positive, negative") - - -# NOTE: Clang-tidy cannot lint headers directly, because headers are not -# compiled -- translation units are, of which there is one per implementation -# (c/cc/cpp) file. -DEFAULT_FILE_PATTERN = re.compile(r"^.*\.c(c|pp)?$") -CLANG_WARNING_PATTERN = re.compile( - r"([^:]+):(\d+):\d+:\s+(warning|error):.*\[([^\]]+)\]" -) -# Set from command line arguments in main(). -VERBOSE = False -QUIET = False - - -def log(*args: Any, **kwargs: Any) -> None: - if not QUIET: - print(*args, **kwargs) - - -class CommandResult: - def __init__(self, returncode: int, stdout: str, stderr: str): - self.returncode = returncode - self.stdout = stdout.strip() - self.stderr = stderr.strip() - - def failed(self) -> bool: - return self.returncode != 0 - - def __add__(self, other: "CommandResult") -> "CommandResult": - return CommandResult( - self.returncode + other.returncode, - f"{self.stdout}\n{other.stdout}", - f"{self.stderr}\n{other.stderr}", - ) - - def __str__(self) -> str: - return f"{self.stdout}" - - def __repr__(self) -> str: - return ( - f"returncode: {self.returncode}\n" - + f"stdout: {self.stdout}\n" - + f"stderr: {self.stderr}" - ) - - -class ProgressMeter: - def __init__( - self, num_items: int, start_msg: str = "", disable_progress_bar: bool = False - ) -> None: - self.num_items = num_items - self.num_processed = 0 - self.width = 80 - self.disable_progress_bar = disable_progress_bar - - # helper escape sequences - self._clear_to_end = "\x1b[2K" - self._move_to_previous_line = "\x1b[F" - self._move_to_start_of_line = "\r" - self._move_to_next_line = "\n" - - if self.disable_progress_bar: - log(start_msg) - else: - self._write( - start_msg - + self._move_to_next_line - + "[>" - + (self.width * " ") - + "]" - + self._move_to_start_of_line - ) - self._flush() - - def _write(self, s: str) -> None: - sys.stderr.write(s) - - def _flush(self) -> None: - sys.stderr.flush() - - def update(self, msg: str) -> None: - if self.disable_progress_bar: - return - - # Once we've processed all items, clear the progress bar - if self.num_processed == self.num_items - 1: - self._write(self._clear_to_end) - return - - # NOP if we've already processed all items - if self.num_processed > self.num_items: - return - - self.num_processed += 1 - - self._write( - self._move_to_previous_line - + self._clear_to_end - + msg - + self._move_to_next_line - ) - - progress = int((self.num_processed / self.num_items) * self.width) - padding = self.width - progress - self._write( - self._move_to_start_of_line - + self._clear_to_end - + f"({self.num_processed} of {self.num_items}) " - + f"[{progress*'='}>{padding*' '}]" - + self._move_to_start_of_line - ) - self._flush() - - def print(self, msg: str) -> None: - if QUIET: - return - elif self.disable_progress_bar: - print(msg) - else: - self._write( - self._clear_to_end - + self._move_to_previous_line - + self._clear_to_end - + msg - + self._move_to_next_line - + self._move_to_next_line - ) - self._flush() - - -class ClangTidyWarning: - def __init__(self, name: str, occurrences: List[Tuple[str, int]]): - self.name = name - self.occurrences = occurrences - - def __str__(self) -> str: - base = f"[{self.name}] occurred {len(self.occurrences)} times\n" - for occ in self.occurrences: - 
base += f" {occ[0]}:{occ[1]}\n" - return base - - -async def run_shell_command( - cmd: List[str], on_completed: Any = None, *args: Any -) -> CommandResult: - """Executes a shell command and runs an optional callback when complete""" - if VERBOSE: - log("Running: ", " ".join(cmd)) - - proc = await asyncio.create_subprocess_shell( - " ".join(shlex.quote(x) for x in cmd), # type: ignore[attr-defined] - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - output = await proc.communicate() - result = CommandResult( - returncode=proc.returncode if proc.returncode is not None else -1, - stdout=output[0].decode("utf-8").strip(), - stderr=output[1].decode("utf-8").strip(), - ) - - if on_completed: - on_completed(result, *args) - - return result - - -async def _run_clang_tidy_in_parallel( - commands: List[Tuple[List[str], str]], disable_progress_bar: bool -) -> CommandResult: - progress_meter = ProgressMeter( - len(commands), - f"Processing {len(commands)} clang-tidy jobs", - disable_progress_bar=disable_progress_bar, - ) - - async def gather_with_concurrency(n: int, tasks: List[Any]) -> Any: - semaphore = asyncio.Semaphore(n) - - async def sem_task(task: Any) -> Any: - async with semaphore: - return await task - - return await asyncio.gather( - *(sem_task(task) for task in tasks), return_exceptions=True - ) - - async def helper() -> Any: - def on_completed(result: CommandResult, filename: str) -> None: - if result.failed(): - msg = str(result) if not VERBOSE else repr(result) - progress_meter.print(msg) - progress_meter.update(f"Processed {filename}") - - coros = [ - run_shell_command(cmd, on_completed, filename) - for (cmd, filename) in commands - ] - return await gather_with_concurrency(multiprocessing.cpu_count(), coros) - - results = await helper() - return sum(results, CommandResult(0, "", "")) - - -async def _run_clang_tidy( - options: Any, line_filters: List[Dict[str, Any]], files: Iterable[str] -) -> CommandResult: - """Executes the actual clang-tidy command in the shell.""" - - base = [options.clang_tidy_exe] - - # Apply common options - base += ["-p", options.compile_commands_dir] - if not options.config_file and os.path.exists(".clang-tidy"): - options.config_file = ".clang-tidy" - if options.config_file: - import yaml - - with open(options.config_file) as config: - # Here we convert the YAML config file to a JSON blob. - base += [ - "-config", - json.dumps(yaml.load(config, Loader=yaml.SafeLoader)), - ] - if options.print_include_paths: - base += ["--extra-arg", "-v"] - if options.include_dir: - for dir in options.include_dir: - base += ["--extra-arg", f"-I{dir}"] - base += options.extra_args - if line_filters: - base += ["-line-filter", json.dumps(line_filters)] - - # Apply per-file options - commands = [] - for f in files: - command = list(base) + [map_filename(options.compile_commands_dir, f)] - commands.append((command, f)) - - if options.dry_run: - return CommandResult(0, str([c for c, _ in commands]), "") - - return await _run_clang_tidy_in_parallel(commands, options.disable_progress_bar) - - -def extract_warnings( - output: str, base_dir: str = "." 
-) -> Tuple[Dict[str, Dict[int, Set[str]]], List[ClangTidyWarning]]: - warn2occ: Dict[str, List[Tuple[str, int]]] = {} - fixes: Dict[str, Dict[int, Set[str]]] = {} - for line in output.splitlines(): - p = CLANG_WARNING_PATTERN.match(line) - if p is None: - continue - if os.path.isabs(p.group(1)): - path = os.path.abspath(p.group(1)) - else: - path = os.path.abspath(os.path.join(base_dir, p.group(1))) - line_no = int(p.group(2)) - - # Filter out any options (which start with '-') - warning_names = set([w for w in p.group(4).split(",") if not w.startswith("-")]) - - for name in warning_names: - if name not in warn2occ: - warn2occ[name] = [] - warn2occ[name].append((path, line_no)) - - if path not in fixes: - fixes[path] = {} - if line_no not in fixes[path]: - fixes[path][line_no] = set() - fixes[path][line_no].update(warning_names) - - warnings = [ClangTidyWarning(name, sorted(occ)) for name, occ in warn2occ.items()] - - return fixes, warnings - - -def apply_nolint(fname: str, warnings: Dict[int, Set[str]]) -> None: - with open(fname, encoding="utf-8") as f: - lines = f.readlines() - - line_offset = -1 # As in .cpp files lines are numbered starting from 1 - for line_no in sorted(warnings.keys()): - nolint_diagnostics = ",".join(warnings[line_no]) - line_no += line_offset - indent = " " * (len(lines[line_no]) - len(lines[line_no].lstrip(" "))) - lines.insert(line_no, f"{indent}// NOLINTNEXTLINE({nolint_diagnostics})\n") - line_offset += 1 - - with open(fname, mode="w") as f: - f.write("".join(lines)) - - -# Functions for correct handling of "ATen/native/cpu" mapping -# Sources in that folder are not built in place but first copied into build folder with `.[CPUARCH].cpp` suffixes -def map_filename(build_folder: str, fname: str) -> str: - fname = os.path.relpath(fname) - native_cpu_prefix = "aten/src/ATen/native/cpu/" - build_cpu_prefix = os.path.join(build_folder, native_cpu_prefix, "") - default_arch_suffix = ".DEFAULT.cpp" - if fname.startswith(native_cpu_prefix) and fname.endswith(".cpp"): - return ( - f"{build_cpu_prefix}{fname[len(native_cpu_prefix):]}{default_arch_suffix}" - ) - if fname.startswith(build_cpu_prefix) and fname.endswith(default_arch_suffix): - return f"{native_cpu_prefix}{fname[len(build_cpu_prefix):-len(default_arch_suffix)]}" - return fname - - -def map_filenames(build_folder: str, fnames: Iterable[str]) -> List[str]: - return [map_filename(build_folder, fname) for fname in fnames] - - -def split_negative_from_positive_patterns(patterns: Iterable[str]) -> Patterns: - """Separates negative patterns (that start with a dash) from positive patterns""" - positive, negative = [], [] - for pattern in patterns: - if pattern.startswith("-"): - negative.append(pattern[1:]) - else: - positive.append(pattern) - - return Patterns(positive, negative) - - -def get_file_patterns(globs: Iterable[str], regexes: Iterable[str]) -> Patterns: - """Returns a list of compiled regex objects from globs and regex pattern strings.""" - # fnmatch.translate converts a glob into a regular expression. 
- # https://docs.python.org/2/library/fnmatch.html#fnmatch.translate - glob = split_negative_from_positive_patterns(globs) - regexes_ = split_negative_from_positive_patterns(regexes) - - positive_regexes = regexes_.positive + [fnmatch.translate(g) for g in glob.positive] - negative_regexes = regexes_.negative + [fnmatch.translate(g) for g in glob.negative] - - positive_patterns = [re.compile(regex) for regex in positive_regexes] or [ - DEFAULT_FILE_PATTERN - ] - negative_patterns = [re.compile(regex) for regex in negative_regexes] - - return Patterns(positive_patterns, negative_patterns) - - -def filter_files(files: Iterable[str], file_patterns: Patterns) -> Iterable[str]: - """Returns all files that match any of the patterns.""" - if VERBOSE: - log("Filtering with these file patterns: {}".format(file_patterns)) - for file in files: - if not any(n.match(file) for n in file_patterns.negative): - if any(p.match(file) for p in file_patterns.positive): - yield file - continue - if VERBOSE: - log(f"{file} omitted due to file filters") - - -async def get_all_files(paths: List[str]) -> List[str]: - """Returns all files that are tracked by git in the given paths.""" - output = await run_shell_command(["git", "ls-files"] + paths) - return str(output).strip().splitlines() - - -def find_changed_lines(diff: str) -> Dict[str, List[Tuple[int, int]]]: - # Delay import since this isn't required unless using the --diff-file - # argument, which for local runs people don't care about - try: - import unidiff # type: ignore[import] - except ImportError as e: - e.msg += ", run 'pip install unidiff'" # type: ignore[attr-defined] - raise e - - files: Any = collections.defaultdict(list) - - for file in unidiff.PatchSet(diff): - for hunk in file: - added_line_nos = [line.target_line_no for line in hunk if line.is_added] - - if len(added_line_nos) == 0: - continue - - # Convert list of line numbers to ranges - # Eg: [1, 2, 3, 12, 13, 14, 15] becomes [[1,3], [12, 15]] - i = 1 - ranges = [[added_line_nos[0], added_line_nos[0]]] - while i < len(added_line_nos): - if added_line_nos[i] != added_line_nos[i - 1] + 1: - ranges[-1][1] = added_line_nos[i - 1] - ranges.append([added_line_nos[i], added_line_nos[i]]) - i += 1 - ranges[-1][1] = added_line_nos[-1] - - files[file.path] += ranges - - return dict(files) - - -def filter_from_diff( - paths: List[str], diffs: List[str] -) -> Tuple[List[str], List[Dict[Any, Any]]]: - files = [] - line_filters = [] - - for diff in diffs: - changed_files = find_changed_lines(diff) - changed_files = { - filename: v - for filename, v in changed_files.items() - if any(filename.startswith(path) for path in paths) - } - line_filters += [ - {"name": name, "lines": lines} for name, lines, in changed_files.items() - ] - files += list(changed_files.keys()) - - return files, line_filters - - -def filter_from_diff_file( - paths: List[str], filename: str -) -> Tuple[List[str], List[Dict[Any, Any]]]: - with open(filename, "r") as f: - diff = f.read() - return filter_from_diff(paths, [diff]) - - -async def filter_default(paths: List[str]) -> Tuple[List[str], List[Dict[Any, Any]]]: - return await get_all_files(paths), [] - - -async def _run(options: Any) -> Tuple[CommandResult, List[ClangTidyWarning]]: - # These flags are pervasive enough to set it globally. It makes the code - # cleaner compared to threading it through every single function. 
- global VERBOSE - global QUIET - VERBOSE = options.verbose - QUIET = options.quiet - - # Normalize the paths first - paths = [path.rstrip("/") for path in options.paths] - - # Filter files - if options.diff_file: - files, line_filters = filter_from_diff_file(options.paths, options.diff_file) - else: - files, line_filters = await filter_default(options.paths) - - file_patterns = get_file_patterns(options.glob, options.regex) - files = list(filter_files(files, file_patterns)) - - # clang-tidy errors when it does not get input files. - if not files: - log("No files detected") - return CommandResult(0, "", ""), [] - - result = await _run_clang_tidy(options, line_filters, files) - fixes, warnings = extract_warnings( - result.stdout, base_dir=options.compile_commands_dir - ) - - if options.suppress_diagnostics: - for fname in fixes.keys(): - mapped_fname = map_filename(options.compile_commands_dir, fname) - log(f"Applying fixes to {mapped_fname}") - apply_nolint(fname, fixes[fname]) - if os.path.relpath(fname) != mapped_fname: - shutil.copyfile(fname, mapped_fname) - - if options.dry_run: - log(result) - elif result.failed(): - # If you change this message, update the error checking logic in - # .github/workflows/lint.yml - msg = "Warnings detected!" - log(msg) - log("Summary:") - for w in warnings: - log(str(w)) - - return result, warnings - - -def run(options: Any) -> Tuple[CommandResult, List[ClangTidyWarning]]: - loop = asyncio.get_event_loop() - return loop.run_until_complete(_run(options)) diff --git a/tools/linter/flake8_hook.py b/tools/linter/flake8_hook.py deleted file mode 100755 index b9ebd5b47931..000000000000 --- a/tools/linter/flake8_hook.py +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 - -import sys - -from flake8.main import git # type: ignore[import] - -if __name__ == '__main__': - sys.exit( - git.hook( - strict=True, - lazy=git.config_for('lazy'), - ) - ) diff --git a/tools/linter/git-clang-format b/tools/linter/git-clang-format deleted file mode 100755 index 13073b6ecbfa..000000000000 --- a/tools/linter/git-clang-format +++ /dev/null @@ -1,655 +0,0 @@ -#!/usr/bin/env python3 -# -# ===- git-clang-format - ClangFormat Git Integration ---------*- python -*--===# -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# ===------------------------------------------------------------------------===# - -r""" -clang-format git integration -============================ - -This file provides a clang-format integration for git. Put it somewhere in your -path and ensure that it is executable. Then, "git clang-format" will invoke -clang-format on the changes in current files or a specific commit. - -For further details, run: -git clang-format -h - -Requires Python 2.7 or Python 3 -""" - -from __future__ import absolute_import, division, print_function -import argparse -import collections -import contextlib -import errno -import os -import re -import subprocess -import sys -from clang_format_utils import get_and_check_clang_format, CLANG_FORMAT_PATH - -usage = "git clang-format [OPTIONS] [] [] [--] [...]" - -desc = """ -If zero or one commits are given, run clang-format on all lines that differ -between the working directory and , which defaults to HEAD. Changes are -only applied to the working directory. - -If two commits are given (requires --diff), run clang-format on all lines in the -second that differ from the first . 
- -If --binary is unspecified, we will try to fetch the correct clang-format -binary for PyTorch - -The following git-config settings set the default of the corresponding option: - clangFormat.binary - clangFormat.commit - clangFormat.extension - clangFormat.style -""" - -# Name of the temporary index file in which save the output of clang-format. -# This file is created within the .git directory. -temp_index_basename = "clang-format-index" - - -Range = collections.namedtuple("Range", "start, count") - - -def main(): - config = load_git_config() - - # In order to keep '--' yet allow options after positionals, we need to - # check for '--' ourselves. (Setting nargs='*' throws away the '--', while - # nargs=argparse.REMAINDER disallows options after positionals.) - argv = sys.argv[1:] - try: - idx = argv.index("--") - except ValueError: - dash_dash = [] - else: - dash_dash = argv[idx:] - argv = argv[:idx] - - default_extensions = ",".join( - [ - # From clang/lib/Frontend/FrontendOptions.cpp, all lower case - "c", - "h", # C - "m", # ObjC - "mm", # ObjC++ - "cc", - "cp", - "cpp", - "c++", - "cxx", - "hh", - "hpp", - "hxx", # C++ - "cu", # CUDA - # Other languages that clang-format supports - "proto", - "protodevel", # Protocol Buffers - "java", # Java - "js", # JavaScript - "ts", # TypeScript - "cs", # C Sharp - ] - ) - - p = argparse.ArgumentParser( - usage=usage, - formatter_class=argparse.RawDescriptionHelpFormatter, - description=desc, - ) - p.add_argument("--binary", default=None, help="path to clang-format"), - p.add_argument( - "--commit", - default=config.get("clangformat.commit", "HEAD"), - help="default commit to use if none is specified", - ), - p.add_argument( - "--diff", - action="store_true", - help="print a diff instead of applying the changes", - ) - p.add_argument( - "--extensions", - default=config.get("clangformat.extensions", default_extensions), - help=( - "comma-separated list of file extensions to format, " - "excluding the period and case-insensitive" - ), - ), - p.add_argument( - "-f", "--force", action="store_true", help="allow changes to unstaged files" - ) - p.add_argument( - "-p", "--patch", action="store_true", help="select hunks interactively" - ) - p.add_argument( - "-q", "--quiet", action="count", default=0, help="print less information" - ) - p.add_argument( - "--style", - default=config.get("clangformat.style", None), - help="passed to clang-format", - ), - p.add_argument( - "-v", "--verbose", action="count", default=0, help="print extra information" - ) - # We gather all the remaining positional arguments into 'args' since we need - # to use some heuristics to determine whether or not was present. - # However, to print pretty messages, we make use of metavar and help. - p.add_argument( - "args", - nargs="*", - metavar="", - help="revision from which to compute the diff", - ) - p.add_argument( - "ignored", - nargs="*", - metavar="...", - help="if specified, only consider differences in these files", - ) - opts = p.parse_args(argv) - - opts.verbose -= opts.quiet - del opts.quiet - - ok = get_and_check_clang_format(opts.verbose) - if not ok: - # We have to invert because False -> 0, which is the code to be returned if everything is okay. 
- return not ok - - if opts.binary is None: - opts.binary = CLANG_FORMAT_PATH - - commits, files = interpret_args(opts.args, dash_dash, opts.commit) - if len(commits) > 1: - if not opts.diff: - die("--diff is required when two commits are given") - else: - if len(commits) > 2: - die("at most two commits allowed; %d given" % len(commits)) - changed_lines = compute_diff_and_extract_lines(commits, files) - if opts.verbose >= 1: - ignored_files = set(changed_lines) - filter_by_extension(changed_lines, opts.extensions.lower().split(",")) - if opts.verbose >= 1: - ignored_files.difference_update(changed_lines) - if ignored_files: - print("Ignoring changes in the following files (wrong extension):") - for filename in ignored_files: - print(" %s" % filename) - if changed_lines: - print("Running clang-format on the following files:") - for filename in changed_lines: - print(" %s" % filename) - if not changed_lines: - print("no modified files to format") - return - # The computed diff outputs absolute paths, so we must cd before accessing - # those files. - cd_to_toplevel() - if len(commits) > 1: - old_tree = commits[1] - new_tree = run_clang_format_and_save_to_tree( - changed_lines, revision=commits[1], binary=opts.binary, style=opts.style - ) - else: - old_tree = create_tree_from_workdir(changed_lines) - new_tree = run_clang_format_and_save_to_tree( - changed_lines, binary=opts.binary, style=opts.style - ) - if opts.verbose >= 1: - print("old tree: %s" % old_tree) - print("new tree: %s" % new_tree) - if old_tree == new_tree: - if opts.verbose >= 0: - print("clang-format did not modify any files") - elif opts.diff: - print_diff(old_tree, new_tree) - else: - changed_files = apply_changes( - old_tree, new_tree, force=opts.force, patch_mode=opts.patch - ) - if (opts.verbose >= 0 and not opts.patch) or opts.verbose >= 1: - print("changed files:") - for filename in changed_files: - print(" %s" % filename) - - -def load_git_config(non_string_options=None): - """Return the git configuration as a dictionary. - - All options are assumed to be strings unless in `non_string_options`, in which - is a dictionary mapping option name (in lower case) to either "--bool" or - "--int".""" - if non_string_options is None: - non_string_options = {} - out = {} - for entry in run("git", "config", "--list", "--null").split("\0"): - if entry: - name, value = entry.split("\n", 1) - if name in non_string_options: - value = run("git", "config", non_string_options[name], name) - out[name] = value - return out - - -def interpret_args(args, dash_dash, default_commit): - """Interpret `args` as "[commits] [--] [files]" and return (commits, files). - - It is assumed that "--" and everything that follows has been removed from - args and placed in `dash_dash`. - - If "--" is present (i.e., `dash_dash` is non-empty), the arguments to its - left (if present) are taken as commits. Otherwise, the arguments are checked - from left to right if they are commits or files. 
If commits are not given, - a list with `default_commit` is used.""" - if dash_dash: - if len(args) == 0: - commits = [default_commit] - else: - commits = args - for commit in commits: - object_type = get_object_type(commit) - if object_type not in ("commit", "tag"): - if object_type is None: - die("'%s' is not a commit" % commit) - else: - die( - "'%s' is a %s, but a commit was expected" - % (commit, object_type) - ) - files = dash_dash[1:] - elif args: - commits = [] - while args: - if not disambiguate_revision(args[0]): - break - commits.append(args.pop(0)) - if not commits: - commits = [default_commit] - files = args - else: - commits = [default_commit] - files = [] - return commits, files - - -def disambiguate_revision(value): - """Returns True if `value` is a revision, False if it is a file, or dies.""" - # If `value` is ambiguous (neither a commit nor a file), the following - # command will die with an appropriate error message. - run("git", "rev-parse", value, verbose=False) - object_type = get_object_type(value) - if object_type is None: - return False - if object_type in ("commit", "tag"): - return True - die("`%s` is a %s, but a commit or filename was expected" % (value, object_type)) - - -def get_object_type(value): - """Returns a string description of an object's type, or None if it is not - a valid git object.""" - cmd = ["git", "cat-file", "-t", value] - p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout, stderr = p.communicate() - if p.returncode != 0: - return None - return convert_string(stdout.strip()) - - -def compute_diff_and_extract_lines(commits, files): - """Calls compute_diff() followed by extract_lines().""" - diff_process = compute_diff(commits, files) - changed_lines = extract_lines(diff_process.stdout) - diff_process.stdout.close() - diff_process.wait() - if diff_process.returncode != 0: - # Assume error was already printed to stderr. - sys.exit(2) - return changed_lines - - -def compute_diff(commits, files): - """Return a subprocess object producing the diff from `commits`. - - The return value's `stdin` file object will produce a patch with the - differences between the working directory and the first commit if a single - one was specified, or the difference between both specified commits, filtered - on `files` (if non-empty). Zero context lines are used in the patch.""" - git_tool = "diff-index" - if len(commits) > 1: - git_tool = "diff-tree" - cmd = ["git", git_tool, "-p", "-U0"] + commits + ["--"] - cmd.extend(files) - p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) - p.stdin.close() - return p - - -def extract_lines(patch_file): - """Extract the changed lines in `patch_file`. - - The return value is a dictionary mapping filename to a list of (start_line, - line_count) pairs. - - The input must have been produced with ``-U0``, meaning unidiff format with - zero lines of context. 
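For example (hypothetical input), a patch fragment such as

    +++ b/foo.cpp
    @@ -10,0 +11,3 @@

would be recorded as {'foo.cpp': [Range(start=11, count=3)]}.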
The return value is a dict mapping filename to a - list of line `Range`s.""" - matches = {} - for line in patch_file: - line = convert_string(line) - match = re.search(r"^\+\+\+\ [^/]+/(.*)", line) - if match: - filename = match.group(1).rstrip("\r\n") - match = re.search(r"^@@ -[0-9,]+ \+(\d+)(,(\d+))?", line) - if match: - start_line = int(match.group(1)) - line_count = 1 - if match.group(3): - line_count = int(match.group(3)) - if line_count > 0: - matches.setdefault(filename, []).append(Range(start_line, line_count)) - return matches - - -def filter_by_extension(dictionary, allowed_extensions): - """Delete every key in `dictionary` that doesn't have an allowed extension. - - `allowed_extensions` must be a collection of lowercase file extensions, - excluding the period.""" - allowed_extensions = frozenset(allowed_extensions) - for filename in list(dictionary.keys()): - base_ext = filename.rsplit(".", 1) - if len(base_ext) == 1 and "" in allowed_extensions: - continue - if len(base_ext) == 1 or base_ext[1].lower() not in allowed_extensions: - del dictionary[filename] - - -def cd_to_toplevel(): - """Change to the top level of the git repository.""" - toplevel = run("git", "rev-parse", "--show-toplevel") - os.chdir(toplevel) - - -def create_tree_from_workdir(filenames): - """Create a new git tree with the given files from the working directory. - - Returns the object ID (SHA-1) of the created tree.""" - return create_tree(filenames, "--stdin") - - -def run_clang_format_and_save_to_tree( - changed_lines, revision=None, binary="clang-format", style=None -): - """Run clang-format on each file and save the result to a git tree. - - Returns the object ID (SHA-1) of the created tree.""" - - def iteritems(container): - try: - return container.iteritems() # Python 2 - except AttributeError: - return container.items() # Python 3 - - def index_info_generator(): - for filename, line_ranges in iteritems(changed_lines): - if revision: - git_metadata_cmd = [ - "git", - "ls-tree", - "%s:%s" % (revision, os.path.dirname(filename)), - os.path.basename(filename), - ] - git_metadata = subprocess.Popen( - git_metadata_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE - ) - stdout = git_metadata.communicate()[0] - mode = oct(int(stdout.split()[0], 8)) - else: - mode = oct(os.stat(filename).st_mode) - # Adjust python3 octal format so that it matches what git expects - if mode.startswith("0o"): - mode = "0" + mode[2:] - blob_id = clang_format_to_blob( - filename, line_ranges, revision=revision, binary=binary, style=style - ) - yield "%s %s\t%s" % (mode, blob_id, filename) - - return create_tree(index_info_generator(), "--index-info") - - -def create_tree(input_lines, mode): - """Create a tree object from the given input. - - If mode is '--stdin', it must be a list of filenames. If mode is - '--index-info' is must be a list of values suitable for "git update-index - --index-info", such as " ". Any other mode - is invalid.""" - assert mode in ("--stdin", "--index-info") - cmd = ["git", "update-index", "--add", "-z", mode] - with temporary_index_file(): - p = subprocess.Popen(cmd, stdin=subprocess.PIPE) - for line in input_lines: - p.stdin.write(to_bytes("%s\0" % line)) - p.stdin.close() - if p.wait() != 0: - die("`%s` failed" % " ".join(cmd)) - tree_id = run("git", "write-tree") - return tree_id - - -def clang_format_to_blob( - filename, line_ranges, revision=None, binary="clang-format", style=None -): - """Run clang-format on the given file and save the result to a git blob. 
- - Runs on the file in `revision` if not None, or on the file in the working - directory if `revision` is None. - - Returns the object ID (SHA-1) of the created blob.""" - clang_format_cmd = [binary] - if style: - clang_format_cmd.extend(["-style=" + style]) - clang_format_cmd.extend( - [ - "-lines=%s:%s" % (start_line, start_line + line_count - 1) - for start_line, line_count in line_ranges - ] - ) - if revision: - clang_format_cmd.extend(["-assume-filename=" + filename]) - git_show_cmd = ["git", "cat-file", "blob", "%s:%s" % (revision, filename)] - git_show = subprocess.Popen( - git_show_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE - ) - git_show.stdin.close() - clang_format_stdin = git_show.stdout - else: - clang_format_cmd.extend([filename]) - git_show = None - clang_format_stdin = subprocess.PIPE - try: - clang_format = subprocess.Popen( - clang_format_cmd, stdin=clang_format_stdin, stdout=subprocess.PIPE - ) - if clang_format_stdin == subprocess.PIPE: - clang_format_stdin = clang_format.stdin - except OSError as e: - if e.errno == errno.ENOENT: - die('cannot find executable "%s"' % binary) - else: - raise - clang_format_stdin.close() - hash_object_cmd = ["git", "hash-object", "-w", "--path=" + filename, "--stdin"] - hash_object = subprocess.Popen( - hash_object_cmd, stdin=clang_format.stdout, stdout=subprocess.PIPE - ) - clang_format.stdout.close() - stdout = hash_object.communicate()[0] - if hash_object.returncode != 0: - die("`%s` failed" % " ".join(hash_object_cmd)) - if clang_format.wait() != 0: - die("`%s` failed" % " ".join(clang_format_cmd)) - if git_show and git_show.wait() != 0: - die("`%s` failed" % " ".join(git_show_cmd)) - return convert_string(stdout).rstrip("\r\n") - - -@contextlib.contextmanager -def temporary_index_file(tree=None): - """Context manager for setting GIT_INDEX_FILE to a temporary file and deleting - the file afterward.""" - index_path = create_temporary_index(tree) - old_index_path = os.environ.get("GIT_INDEX_FILE") - os.environ["GIT_INDEX_FILE"] = index_path - try: - yield - finally: - if old_index_path is None: - del os.environ["GIT_INDEX_FILE"] - else: - os.environ["GIT_INDEX_FILE"] = old_index_path - os.remove(index_path) - - -def create_temporary_index(tree=None): - """Create a temporary index file and return the created file's path. - - If `tree` is not None, use that as the tree to read in. Otherwise, an - empty index is created.""" - gitdir = run("git", "rev-parse", "--git-dir") - path = os.path.join(gitdir, temp_index_basename) - if tree is None: - tree = "--empty" - run("git", "read-tree", "--index-output=" + path, tree) - return path - - -def print_diff(old_tree, new_tree): - """Print the diff between the two trees to stdout.""" - # We use the porcelain 'diff' and not plumbing 'diff-tree' because the output - # is expected to be viewed by the user, and only the former does nice things - # like color and pagination. - # - # We also only print modified files since `new_tree` only contains the files - # that were modified, so unmodified files would show as deleted without the - # filter. - subprocess.check_call(["git", "diff", "--diff-filter=M", old_tree, new_tree, "--"]) - - -def apply_changes(old_tree, new_tree, force=False, patch_mode=False): - """Apply the changes in `new_tree` to the working directory. - - Bails if there are local changes in those files and not `force`. 
If - `patch_mode`, runs `git checkout --patch` to select hunks interactively.""" - changed_files = ( - run( - "git", - "diff-tree", - "--diff-filter=M", - "-r", - "-z", - "--name-only", - old_tree, - new_tree, - ) - .rstrip("\0") - .split("\0") - ) - if not force: - unstaged_files = run("git", "diff-files", "--name-status", *changed_files) - if unstaged_files: - print( - "The following files would be modified but " "have unstaged changes:", - file=sys.stderr, - ) - print(unstaged_files, file=sys.stderr) - print("Please commit, stage, or stash them first.", file=sys.stderr) - sys.exit(2) - if patch_mode: - # In patch mode, we could just as well create an index from the new tree - # and checkout from that, but then the user will be presented with a - # message saying "Discard ... from worktree". Instead, we use the old - # tree as the index and checkout from new_tree, which gives the slightly - # better message, "Apply ... to index and worktree". This is not quite - # right, since it won't be applied to the user's index, but oh well. - with temporary_index_file(old_tree): - subprocess.check_call(["git", "checkout", "--patch", new_tree]) - index_tree = old_tree - else: - with temporary_index_file(new_tree): - run("git", "checkout-index", "-a", "-f") - return changed_files - - -def run(*args, **kwargs): - stdin = kwargs.pop("stdin", "") - verbose = kwargs.pop("verbose", True) - strip = kwargs.pop("strip", True) - for name in kwargs: - raise TypeError("run() got an unexpected keyword argument '%s'" % name) - p = subprocess.Popen( - args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE - ) - stdout, stderr = p.communicate(input=stdin) - - stdout = convert_string(stdout) - stderr = convert_string(stderr) - - if p.returncode == 0: - if stderr: - if verbose: - print("`%s` printed to stderr:" % " ".join(args), file=sys.stderr) - print(stderr.rstrip(), file=sys.stderr) - if strip: - stdout = stdout.rstrip("\r\n") - return stdout - if verbose: - print("`%s` returned %s" % (" ".join(args), p.returncode), file=sys.stderr) - if stderr: - print(stderr.rstrip(), file=sys.stderr) - sys.exit(2) - - -def die(message): - print("error:", message, file=sys.stderr) - sys.exit(2) - - -def to_bytes(str_input): - # Encode to UTF-8 to get binary data. - if isinstance(str_input, bytes): - return str_input - return str_input.encode("utf-8") - - -def to_string(bytes_input): - if isinstance(bytes_input, str): - return bytes_input - return bytes_input.encode("utf-8") - - -def convert_string(bytes_input): - try: - return to_string(bytes_input.decode("utf-8")) - except AttributeError: # 'str' object has no attribute 'decode'. 
- return str(bytes_input) - except UnicodeError: - return str(bytes_input) - - -if __name__ == "__main__": - main() diff --git a/tools/linter/install/clang_tidy.py b/tools/linter/install/clang_tidy.py deleted file mode 100644 index 28b15edfd9bf..000000000000 --- a/tools/linter/install/clang_tidy.py +++ /dev/null @@ -1,21 +0,0 @@ -import os -from tools.linter.install.download_bin import download, PYTORCH_ROOT, HASH_PATH - -PLATFORM_TO_URL = { - "Linux": "https://oss-clang-format.s3.us-east-2.amazonaws.com/linux64/clang-tidy", - "Darwin": "https://oss-clang-format.s3.us-east-2.amazonaws.com/macos/clang-tidy", -} - -PLATFORM_TO_HASH = { - "Linux": os.path.join(HASH_PATH, "clang-tidy-linux64"), - "Darwin": os.path.join(HASH_PATH, "clang-tidy-macos"), -} - -OUTPUT_DIR = os.path.join(PYTORCH_ROOT, ".clang-tidy-bin") -INSTALLATION_PATH = os.path.join(OUTPUT_DIR, "clang-tidy") - -if __name__ == "__main__": - ok = download("clang-tidy", OUTPUT_DIR, PLATFORM_TO_URL, PLATFORM_TO_HASH) - if not ok: - print("Installation failed!") - exit(1) diff --git a/tools/linter/install/download_bin.py b/tools/linter/install/download_bin.py deleted file mode 100644 index 3bb65baac118..000000000000 --- a/tools/linter/install/download_bin.py +++ /dev/null @@ -1,164 +0,0 @@ -import platform -import sys -import stat -import hashlib -import subprocess -import os -import urllib.request -import urllib.error - -from typing import Dict - -# String representing the host platform (e.g. Linux, Darwin). -HOST_PLATFORM = platform.system() - -# PyTorch directory root -result = subprocess.run( - ["git", "rev-parse", "--show-toplevel"], stdout=subprocess.PIPE, check=True, -) -PYTORCH_ROOT = result.stdout.decode("utf-8").strip() - -HASH_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), "hashes") - - -def compute_file_sha256(path: str) -> str: - """Compute the SHA256 hash of a file and return it as a hex string.""" - # If the file doesn't exist, return an empty string. - if not os.path.exists(path): - return "" - - hash = hashlib.sha256() - - # Open the file in binary mode and hash it. - with open(path, "rb") as f: - for b in f: - hash.update(b) - - # Return the hash as a hexadecimal string. - return hash.hexdigest() - - -def report_download_progress( - chunk_number: int, chunk_size: int, file_size: int -) -> None: - """ - Pretty printer for file download progress. - """ - if file_size != -1: - percent = min(1, (chunk_number * chunk_size) / file_size) - bar = "#" * int(64 * percent) - sys.stdout.write("\r0% |{:<64}| {}%".format(bar, int(percent * 100))) - - -def download_bin(name: str, output_dir: str, platform_to_url: Dict[str, str]) -> bool: - """ - Downloads the binary appropriate for the host platform and stores it in the given output directory. - """ - if HOST_PLATFORM not in platform_to_url: - print(f"Unsupported platform: {HOST_PLATFORM}", file=sys.stderr) - return False - - url = platform_to_url[HOST_PLATFORM] - filename = os.path.join(output_dir, name) - - # Try to download binary. 
- print(f"Downloading {name} to {output_dir}", file=sys.stderr) - try: - urllib.request.urlretrieve( - url, - filename, - reporthook=report_download_progress if sys.stdout.isatty() else None, - ) - except urllib.error.URLError as e: - print(f"Error downloading {filename}: {e}", file=sys.stderr) - return False - finally: - print(file=sys.stderr) - - return True - - -def download( - name: str, - output_dir: str, - platform_to_url: Dict[str, str], - platform_to_hash: Dict[str, str], - verbose: bool = False, -) -> bool: - """ - Download a platform-appropriate binary if one doesn't already exist at the expected location and verifies - that it is the right binary by checking its SHA256 hash against the expected hash. - """ - - output_path = os.path.join(output_dir, name) - if not os.path.exists(output_dir): - # If the directory doesn't exist, try to create it. - try: - os.mkdir(output_dir) - except OSError as e: - print(f"Unable to create directory for {name} binary: {output_dir}", file=sys.stderr) - return False - finally: - if verbose: - print(f"Created directory {output_dir} for {name} binary", file=sys.stderr) - - # If the directory didn't exist, neither did the binary, so download it. - ok = download_bin(name, output_dir, platform_to_url) - - if not ok: - return False - else: - # If the directory exists but the binary doesn't, download it. - if not os.path.exists(output_path): - ok = download_bin(name, output_dir, platform_to_url) - - if not ok: - return False - else: - if verbose: - print(f"Found pre-existing {name} binary, skipping download", file=sys.stderr) - - # Now that the binary is where it should be, hash it. - actual_bin_hash = compute_file_sha256(output_path) - - # If the host platform is not in platform_to_hash, it is unsupported. - if HOST_PLATFORM not in platform_to_hash: - print(f"Unsupported platform: {HOST_PLATFORM}", file=sys.stderr) - return False - - # This is the path to the file containing the reference hash. - hashpath = os.path.join(PYTORCH_ROOT, platform_to_hash[HOST_PLATFORM]) - - if not os.path.exists(hashpath): - print("Unable to find reference binary hash", file=sys.stderr) - return False - - # Load the reference hash and compare the actual hash to it. - with open(hashpath, "r") as f: - reference_bin_hash = f.readline().strip() - - if verbose: - print(f"Reference Hash: {reference_bin_hash}", file=sys.stderr) - print(f"Actual Hash: {repr(actual_bin_hash)}", file=sys.stderr) - - if reference_bin_hash != actual_bin_hash: - print("The downloaded binary is not what was expected!", file=sys.stderr) - print(f"Downloaded hash: {repr(actual_bin_hash)} vs expected {reference_bin_hash}", file=sys.stderr) - - # Err on the side of caution and try to delete the downloaded binary. - try: - os.unlink(output_path) - print("The binary has been deleted just to be safe", file=sys.stderr) - except OSError as e: - print(f"Failed to delete binary: {e}", file=sys.stderr) - print("Delete this binary as soon as possible and do not execute it!", file=sys.stderr) - - return False - else: - # Make sure the binary is executable. 
- mode = os.stat(output_path).st_mode - mode |= stat.S_IXUSR - os.chmod(output_path, mode) - print(f"Using {name} located at {output_path}", file=sys.stderr) - - return True diff --git a/tools/linter/install/hashes/clang-tidy-linux64 b/tools/linter/install/hashes/clang-tidy-linux64 deleted file mode 100644 index 111d45175928..000000000000 --- a/tools/linter/install/hashes/clang-tidy-linux64 +++ /dev/null @@ -1 +0,0 @@ -49343a448fcb75cd1e0fb9d6b1f6c2ef4b008b6f91d6ff899d4ac6060f5e52a5 diff --git a/tools/linter/install/hashes/clang-tidy-macos b/tools/linter/install/hashes/clang-tidy-macos deleted file mode 100644 index 8b688a106156..000000000000 --- a/tools/linter/install/hashes/clang-tidy-macos +++ /dev/null @@ -1 +0,0 @@ -541797a7b8fa795e2f3c1adcd8236cc336a40aa927028dc5bc79172e1d9eca36 diff --git a/tools/linter/mypy_wrapper.py b/tools/linter/mypy_wrapper.py deleted file mode 100755 index fb1dbcbc65dd..000000000000 --- a/tools/linter/mypy_wrapper.py +++ /dev/null @@ -1,220 +0,0 @@ -#!/usr/bin/env python3 - -""" -This module is meant to be run as a script (see the docstring of main -below) and passed the filename of any Python file in this repo, to -typecheck that file using only the subset of our mypy configs that apply -to it. - -Since editors (e.g. VS Code) can be configured to use this wrapper -script in lieu of mypy itself, the idea is that this can be used to get -inline mypy results while developing, and have at least some degree of -assurance that those inline results match up with what you would get -from running the mypy lint from the .github/workflows/lint.yml file. - -See also these wiki pages: - -- https://github.com/pytorch/pytorch/wiki/Guide-for-adding-type-annotations-to-PyTorch -- https://github.com/pytorch/pytorch/wiki/Lint-as-you-type -""" - -import sys -from collections import defaultdict -from configparser import ConfigParser -from pathlib import Path, PurePath, PurePosixPath -from typing import Any, Dict, List, Optional, Set, Tuple - -import mypy.api -# not part of the public API, but this is the easiest way to ensure that -# we agree with what mypy actually does -import mypy.config_parser - - -def read_config(config_path: Path) -> Set[str]: - """ - Return the set of `files` in the `mypy` ini file at config_path. - """ - config = ConfigParser() - config.read(config_path) - # hopefully on Windows this gives posix paths - return set(mypy.config_parser.split_and_match_files( - config['mypy']['files'], - )) - - -# see tools/test/test_mypy_wrapper.py for examples of many of the -# following functions - - -def config_files() -> Dict[str, Set[str]]: - """ - Return a dict from all our `mypy` ini filenames to their `files`. - """ - return {str(ini): read_config(ini) for ini in Path().glob('mypy*.ini')} - - -def split_path(path: str) -> List[str]: - """ - Split a relative (not absolute) POSIX path into its segments. - """ - pure = PurePosixPath(path) - return [str(p.name) for p in list(reversed(pure.parents))[1:] + [pure]] - - -# mypy doesn't support recursive types yet -# https://github.com/python/mypy/issues/731 - -# but if it did, the `Any` here would be `Union[Set[str], 'Trie']`, -# although that is not completely accurate: specifically, every `None` -# key must map to a `Set[str]`, and every `str` key must map to a `Trie` -Trie = Dict[Optional[str], Any] - - -def make_trie(configs: Dict[str, Set[str]]) -> Trie: - """ - Return a trie from path prefixes to their `mypy` configs. 
- - Specifically, each layer of the trie represents a segment of a POSIX - path relative to the root of this repo. If you follow a path down - the trie and reach a `None` key, that `None` maps to the (nonempty) - set of keys in `configs` which explicitly include that path. - """ - trie: Trie = {} - for ini, files in configs.items(): - for f in files: - inner = trie - for segment in split_path(f): - inner = inner.setdefault(segment, {}) - inner.setdefault(None, set()).add(ini) - return trie - - -def lookup(trie: Trie, filename: str) -> Set[str]: - """ - Return the configs in `trie` that include a prefix of `filename`. - - A path is included by a config if any of its ancestors are included - by the wildcard-expanded version of that config's `files`. Thus, - this function follows `filename`'s path down the `trie` and - accumulates all the configs it finds along the way. - """ - configs = set() - inner = trie - for segment in split_path(filename): - inner = inner.get(segment, {}) - configs |= inner.get(None, set()) - return configs - - -def make_plan( - *, - configs: Dict[str, Set[str]], - files: List[str] -) -> Dict[str, List[str]]: - """ - Return a dict from config names to the files to run them with. - - The keys of the returned dict are a subset of the keys of `configs`. - The list of files in each value of returned dict should contain a - nonempty subset of the given `files`, in the same order as `files`. - """ - trie = make_trie(configs) - plan = defaultdict(list) - for filename in files: - for config in lookup(trie, filename): - plan[config].append(filename) - return plan - - -def run( - *, - args: List[str], - files: List[str], -) -> Tuple[int, List[str], List[str]]: - """ - Return the exit code and list of output lines from running `mypy`. - - The given `args` are passed verbatim to `mypy`. The `files` (each of - which must be an absolute path) are converted to relative paths - (that is, relative to the root of this repo) and then classified - according to which ones need to be run with each `mypy` config. - Thus, `mypy` may be run zero, one, or multiple times, but it will be - run at most once for each `mypy` config used by this repo. - """ - repo_root = Path.cwd() - plan = make_plan(configs=config_files(), files=[ - PurePath(f).relative_to(repo_root).as_posix() for f in files - ]) - mypy_results = [ - mypy.api.run( - # insert custom flags after args to avoid being overridden - # by existing flags in args - args + [ - # don't special-case the last line - '--no-error-summary', - f'--config-file={config}', - ] + filtered - ) - # by construction, filtered must be nonempty - for config, filtered in plan.items() - ] - return ( - # assume all mypy exit codes are nonnegative - # https://github.com/python/mypy/issues/6003 - max( - [exit_code for _, _, exit_code in mypy_results], - default=0, - ), - list(dict.fromkeys( # remove duplicates, retain order - item - for stdout, _, _ in mypy_results - for item in stdout.splitlines() - )), - [stderr for _, stderr, _ in mypy_results], - ) - - -def main(args: List[str]) -> None: - """ - Run mypy on one Python file using the correct config file(s). 
- - This function assumes the following preconditions hold: - - - the cwd is set to the root of this cloned repo - - args is a valid list of CLI arguments that could be passed to mypy - - some of args are absolute paths to files to typecheck - - all the other args are config flags for mypy, rather than files - - These assumptions hold, for instance, when mypy is run automatically - by VS Code's Python extension, so in your clone of this repository, - you could modify your .vscode/settings.json to look something like - this (assuming you use a conda environment named "pytorch"): - - { - "python.linting.enabled": true, - "python.linting.mypyEnabled": true, - "python.linting.mypyPath": - "${env:HOME}/miniconda3/envs/pytorch/bin/python", - "python.linting.mypyArgs": [ - "${workspaceFolder}/tools/linter/mypy_wrapper.py" - ] - } - - More generally, this should work for any editor sets the cwd to the - repo root, runs mypy on individual files via their absolute paths, - and allows you to set the path to the mypy executable. - """ - repo_root = str(Path.cwd()) - exit_code, mypy_issues, stderrs = run( - args=[arg for arg in args if not arg.startswith(repo_root)], - files=[arg for arg in args if arg.startswith(repo_root)], - ) - for issue in mypy_issues: - print(issue) - for stderr in stderrs: - print(stderr, end='', file=sys.stderr) - sys.exit(exit_code) - - -if __name__ == '__main__': - main(sys.argv[1:]) diff --git a/tools/linter/run_shellcheck.sh b/tools/linter/run_shellcheck.sh deleted file mode 100755 index e9d2dd40e8fd..000000000000 --- a/tools/linter/run_shellcheck.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/usr/bin/env bash -find "$@" -name '*.sh' -print0 | xargs -0 -n1 shellcheck --external-sources diff --git a/tools/linter/trailing_newlines.py b/tools/linter/trailing_newlines.py deleted file mode 100755 index ee743a4785f8..000000000000 --- a/tools/linter/trailing_newlines.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 - -import fileinput -import os -import sys - -NEWLINE, = b'\n' - - -def correct_trailing_newlines(filename: str) -> bool: - with open(filename, 'rb') as f: - a = len(f.read(2)) - if a == 0: - return True - elif a == 1: - # file is wrong whether or not the only byte is a newline - return False - else: - f.seek(-2, os.SEEK_END) - b, c = f.read(2) - # no ASCII byte is part of any non-ASCII character in UTF-8 - return b != NEWLINE and c == NEWLINE - - -def main() -> int: - # mimic git grep exit code behavior - exit_code = 1 - for line in fileinput.input(): - stripped = line.rstrip() - if not correct_trailing_newlines(stripped): - exit_code = 0 - print(stripped) - return exit_code - - -if __name__ == '__main__': - sys.exit(main()) diff --git a/tools/linter/translate_annotations.py b/tools/linter/translate_annotations.py deleted file mode 100755 index ed0147e4a62a..000000000000 --- a/tools/linter/translate_annotations.py +++ /dev/null @@ -1,180 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import json -import re -import subprocess -from bisect import bisect_right -from collections import defaultdict -from typing import (Callable, DefaultDict, Generic, List, Optional, Pattern, - Sequence, TypeVar, cast) - -from typing_extensions import TypedDict - - -class Hunk(TypedDict): - old_start: int - old_count: int - new_start: int - new_count: int - - -class Diff(TypedDict): - old_filename: Optional[str] - hunks: List[Hunk] - - -# @@ -start,count +start,count @@ -hunk_pattern = r'^@@\s+-(\d+)(?:,(\d+))?\s+\+(\d+)(?:,(\d+))?\s+@@' - - -def parse_diff(diff: str) -> Diff: - name = 
None - name_found = False - hunks: List[Hunk] = [] - for line in diff.splitlines(): - hunk_match = re.match(hunk_pattern, line) - if name_found: - if hunk_match: - old_start, old_count, new_start, new_count = hunk_match.groups() - hunks.append({ - 'old_start': int(old_start), - 'old_count': int(old_count or '1'), - 'new_start': int(new_start), - 'new_count': int(new_count or '1'), - }) - else: - assert not hunk_match - name_match = re.match(r'^--- (?:(?:/dev/null)|(?:a/(.*)))$', line) - if name_match: - name_found = True - name, = name_match.groups() - return { - 'old_filename': name, - 'hunks': hunks, - } - - -T = TypeVar('T') -U = TypeVar('U') - - -# we want to use bisect.bisect_right to find the closest hunk to a given -# line number, but the bisect module won't have a key function until -# Python 3.10 https://github.com/python/cpython/pull/20556 so we make an -# O(1) wrapper around the list of hunks that makes it pretend to just be -# a list of line numbers -# https://gist.github.com/ericremoreynolds/2d80300dabc70eebc790 -class KeyifyList(Generic[T, U]): - def __init__(self, inner: List[T], key: Callable[[T], U]) -> None: - self.inner = inner - self.key = key - - def __len__(self) -> int: - return len(self.inner) - - def __getitem__(self, k: int) -> U: - return self.key(self.inner[k]) - - -def translate(diff: Diff, line_number: int) -> Optional[int]: - if line_number < 1: - return None - - hunks = diff['hunks'] - if not hunks: - return line_number - - keyified = KeyifyList( - hunks, - lambda hunk: hunk['new_start'] + (0 if hunk['new_count'] > 0 else 1) - ) - i = bisect_right(cast(Sequence[int], keyified), line_number) - if i < 1: - return line_number - - hunk = hunks[i - 1] - d = line_number - (hunk['new_start'] + (hunk['new_count'] or 1)) - return None if d < 0 else hunk['old_start'] + (hunk['old_count'] or 1) + d - - -# we use camelCase here because this will be output as JSON and so the -# field names need to match the group names from here: -# https://github.com/pytorch/add-annotations-github-action/blob/3ab7d7345209f5299d53303f7aaca7d3bc09e250/action.yml#L23 -class Annotation(TypedDict): - filename: str - lineNumber: int - columnNumber: int - errorCode: str - errorDesc: str - - -def parse_annotation(regex: Pattern[str], line: str) -> Optional[Annotation]: - m = re.match(regex, line) - if m: - try: - line_number = int(m.group('lineNumber')) - column_number = int(m.group('columnNumber')) - except ValueError: - return None - return { - 'filename': m.group('filename'), - 'lineNumber': line_number, - 'columnNumber': column_number, - 'errorCode': m.group('errorCode'), - 'errorDesc': m.group('errorDesc'), - } - else: - return None - - -def translate_all( - *, - lines: List[str], - regex: Pattern[str], - commit: str -) -> List[Annotation]: - ann_dict: DefaultDict[str, List[Annotation]] = defaultdict(list) - for line in lines: - annotation = parse_annotation(regex, line) - if annotation is not None: - ann_dict[annotation['filename']].append(annotation) - ann_list = [] - for filename, annotations in ann_dict.items(): - raw_diff = subprocess.check_output( - ['git', 'diff-index', '--unified=0', commit, filename], - encoding='utf-8', - ) - diff = parse_diff(raw_diff) if raw_diff.strip() else None - # if there is a diff but it doesn't list an old filename, that - # means the file is absent in the commit we're targeting, so we - # skip it - if not (diff and not diff['old_filename']): - for annotation in annotations: - line_number: Optional[int] = annotation['lineNumber'] - if diff: - 
annotation['filename'] = cast(str, diff['old_filename']) - line_number = translate(diff, cast(int, line_number)) - if line_number: - annotation['lineNumber'] = line_number - ann_list.append(annotation) - return ann_list - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument('--file') - parser.add_argument('--regex') - parser.add_argument('--commit') - args = parser.parse_args() - with open(args.file, 'r') as f: - lines = f.readlines() - print(json.dumps(translate_all( - lines=lines, - regex=args.regex, - commit=args.commit - ))) - - -if __name__ == '__main__': - main() diff --git a/tools/lite_interpreter/BUILD.buck b/tools/lite_interpreter/BUILD.buck new file mode 100644 index 000000000000..10415c26aee7 --- /dev/null +++ b/tools/lite_interpreter/BUILD.buck @@ -0,0 +1,6 @@ +python_library( + name = "gen_selected_mobile_ops_header", + srcs = ["gen_selected_mobile_ops_header.py"], + base_module = "tools.lite_interpreter", + visibility = ["PUBLIC"], +) diff --git a/tools/lite_interpreter/gen_selected_mobile_ops_header.py b/tools/lite_interpreter/gen_selected_mobile_ops_header.py index e34b7bbfa5c7..37cd9e6903bf 100644 --- a/tools/lite_interpreter/gen_selected_mobile_ops_header.py +++ b/tools/lite_interpreter/gen_selected_mobile_ops_header.py @@ -2,8 +2,8 @@ import argparse import os from typing import Set -from tools.codegen.selective_build.selector import SelectiveBuilder -from tools.codegen.code_template import CodeTemplate +from torchgen.selective_build.selector import SelectiveBuilder +from torchgen.code_template import CodeTemplate import yaml @@ -44,6 +44,7 @@ """ + def extract_root_operators(selective_builder: SelectiveBuilder) -> Set[str]: ops = [] for (op_name, op) in selective_builder.operators.items(): @@ -51,18 +52,24 @@ def extract_root_operators(selective_builder: SelectiveBuilder) -> Set[str]: ops.append(op_name) return set(ops) + def get_selected_kernel_dtypes_code( - selective_builder: SelectiveBuilder, + selective_builder: SelectiveBuilder, ) -> str: # See https://www.internalfb.com/intern/paste/P153411698/ for an example of the # generated code in case all kernel dtypes are selected and in case some kernel # dtypes are selected (i.e. both cases). # body = "return true;" - if selective_builder.include_all_operators is False and selective_builder.include_all_non_op_selectives is False: + if ( + selective_builder.include_all_operators is False + and selective_builder.include_all_non_op_selectives is False + ): body_parts = [] for kernel_tag, dtypes in selective_builder.kernel_metadata.items(): - conditions = list(map(lambda x: 'scalar_type == at::ScalarType::' + x, dtypes)) + conditions = list( + map(lambda x: "scalar_type == at::ScalarType::" + x, dtypes) + ) body_parts.append( if_condition_template.substitute( kernel_tag_name=kernel_tag, @@ -79,8 +86,8 @@ def get_selected_kernel_dtypes_code( # 1. The selected root operators # 2. The selected kernel dtypes def write_selected_mobile_ops( - output_file_path: str, - selective_builder: SelectiveBuilder, + output_file_path: str, + selective_builder: SelectiveBuilder, ) -> None: root_ops = extract_root_operators(selective_builder) custom_classes = selective_builder.custom_classes @@ -90,16 +97,29 @@ def write_selected_mobile_ops( # This condition checks if we are in selective build. 
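A rough sketch of the header text the operator-whitelist branch below emits, using hypothetical operator names (the real set is the model's selected root operators):

    example_root_ops = {"aten::relu", "aten::add.Tensor"}  # hypothetical selection
    line = "#define TORCH_OPERATOR_WHITELIST " + ";".join(sorted(example_root_ops)) + ";\n\n"
    # line == '#define TORCH_OPERATOR_WHITELIST aten::add.Tensor;aten::relu;\n\n'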
# if these lists are not defined the corresponding selective build macros trivially return the item in question was selected if not selective_builder.include_all_operators: - body_parts.append("#define TORCH_OPERATOR_WHITELIST " + (";".join(sorted(root_ops))) + ";\n\n") + body_parts.append( + "#define TORCH_OPERATOR_WHITELIST " + + (";".join(sorted(root_ops))) + + ";\n\n" + ) # This condition checks if we are in tracing based selective build if selective_builder.include_all_non_op_selectives is False: - body_parts.append("#define TORCH_CUSTOM_CLASS_ALLOWLIST " + (";".join(sorted(custom_classes))) + ";\n\n") - body_parts.append("#define TORCH_BUILD_FEATURE_ALLOWLIST " + (";".join(sorted(build_features))) + ";\n\n") + body_parts.append( + "#define TORCH_CUSTOM_CLASS_ALLOWLIST " + + (";".join(sorted(custom_classes))) + + ";\n\n" + ) + body_parts.append( + "#define TORCH_BUILD_FEATURE_ALLOWLIST " + + (";".join(sorted(build_features))) + + ";\n\n" + ) body_parts.append(get_selected_kernel_dtypes_code(selective_builder)) header_contents = "".join(body_parts) out_file.write(header_contents.encode("utf-8")) + # root_ops: a set of selected root operators for selective build # Write the file selected_mobile_ops.h with optionally: # 1. The selected root operators from root_ops @@ -110,7 +130,9 @@ def write_selected_mobile_ops_with_all_dtypes( ) -> None: with open(output_file_path, "wb") as out_file: body_parts = [selected_mobile_ops_preamble] - body_parts.append("#define TORCH_OPERATOR_WHITELIST " + (";".join(sorted(root_ops))) + ";\n\n") + body_parts.append( + "#define TORCH_OPERATOR_WHITELIST " + (";".join(sorted(root_ops))) + ";\n\n" + ) selective_builder = SelectiveBuilder.get_nop_selector() body_parts.append(get_selected_kernel_dtypes_code(selective_builder)) @@ -118,17 +140,25 @@ def write_selected_mobile_ops_with_all_dtypes( header_contents = "".join(body_parts) out_file.write(header_contents.encode("utf-8")) + def main() -> None: parser = argparse.ArgumentParser( description="Generate selected_mobile_ops.h for selective build." ) parser.add_argument( - "-p", "--yaml_file_path", type=str, required=True, help="Path to the yaml" - " file with a list of operators used by the model." + "-p", + "--yaml_file_path", + type=str, + required=True, + help="Path to the yaml" " file with a list of operators used by the model.", ) parser.add_argument( - "-o", "--output_file_path", type=str, required=True, help="Path to destination" - "folder where selected_mobile_ops.h will be written." 
+ "-o", + "--output_file_path", + type=str, + required=True, + help="Path to destination" + "folder where selected_mobile_ops.h will be written.", ) parsed_args = parser.parse_args() model_file_name = parsed_args.yaml_file_path @@ -138,12 +168,13 @@ def main() -> None: with open(model_file_name, "rb") as model_file: loaded_model = yaml.load(model_file, Loader=Loader) - root_operators_set = set(loaded_model) print("Writing header file selected_mobile_ops.h: ", parsed_args.output_file_path) write_selected_mobile_ops_with_all_dtypes( os.path.join(parsed_args.output_file_path, "selected_mobile_ops.h"), - root_operators_set) + root_operators_set, + ) + if __name__ == "__main__": main() diff --git a/tools/lldb/deploy_debugger.py b/tools/lldb/deploy_debugger.py index deaf65d7edb9..5a1395898b5c 100644 --- a/tools/lldb/deploy_debugger.py +++ b/tools/lldb/deploy_debugger.py @@ -1,10 +1,12 @@ import lldb # type: ignore[import] + # load into lldb instance with: # command script import tools/lldb/deploy_debugger.py target = lldb.debugger.GetSelectedTarget() bp = target.BreakpointCreateByRegex("__deploy_register_code") -bp.SetScriptCallbackBody("""\ +bp.SetScriptCallbackBody( + """\ process = frame.thread.GetProcess() target = process.target symbol_addr = frame.module.FindSymbol("__deploy_module_info").GetStartAddress() @@ -31,4 +33,5 @@ lldb.debugger.HandleCommand(cmd2) return False -""") +""" +) diff --git a/tools/nightly.py b/tools/nightly.py index 7a46a011d232..32733c5d9477 100755 --- a/tools/nightly.py +++ b/tools/nightly.py @@ -40,8 +40,21 @@ import subprocess from ast import literal_eval from argparse import ArgumentParser -from typing import (Any, Callable, Dict, Generator, Iterable, Iterator, List, - Optional, Sequence, Set, Tuple, TypeVar, cast) +from typing import ( + Any, + Callable, + Dict, + Generator, + Iterable, + Iterator, + List, + Optional, + Sequence, + Set, + Tuple, + TypeVar, + cast, +) LOGGER: Optional[logging.Logger] = None URL_FORMAT = "{base_url}/{platform}/{dist_name}.tar.bz2" @@ -199,7 +212,13 @@ def check_branch(subcommand: str, branch: Optional[str]) -> Optional[str]: return "Branch name to checkout must be supplied with '-b' option" # next check that the local repo is clean cmd = ["git", "status", "--untracked-files=no", "--porcelain"] - p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True, universal_newlines=True) + p = subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=True, + universal_newlines=True, + ) if p.stdout.strip(): return "Need to have clean working tree to checkout!\n\n" + p.stdout # next check that the branch name doesn't already exist @@ -218,7 +237,7 @@ def timer(logger: logging.Logger, prefix: str) -> Iterator[None]: logger.info(f"{prefix} took {time.time() - start_time:.3f} [s]") -F = TypeVar('F', bound=Callable[..., Any]) +F = TypeVar("F", bound=Callable[..., Any]) def timed(prefix: str) -> Callable[[F], F]: @@ -325,7 +344,7 @@ def deps_install(deps: List[str], existing_env: bool, env_opts: List[str]) -> No @timed("Installing pytorch nightly binaries") def pytorch_install(url: str) -> "tempfile.TemporaryDirectory[str]": - """"Install pytorch into a temporary directory""" + """ "Install pytorch into a temporary directory""" pytdir = tempfile.TemporaryDirectory() cmd = ["conda", "create", "--yes", "--no-deps", "--prefix", pytdir.name, url] p = subprocess.run(cmd, check=True) @@ -369,7 +388,13 @@ def _nightly_version(spdir: str) -> str: # now cross reference with nightly version 
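The calls reformatted in this hunk all follow the same text-mode subprocess pattern; a minimal sketch, mirroring the clean-working-tree check from `check_branch` above:

    import subprocess

    # capture stdout/stderr as text and raise on a nonzero exit code
    p = subprocess.run(
        ["git", "status", "--untracked-files=no", "--porcelain"],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=True,
        universal_newlines=True,
    )
    if p.stdout.strip():
        print("Need to have clean working tree to checkout!")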
_ensure_commit(git_version) cmd = ["git", "show", "--no-patch", "--format=%s", git_version] - p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True, universal_newlines=True) + p = subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=True, + universal_newlines=True, + ) m = SHA1_RE.search(p.stdout) if m is None: raise RuntimeError( @@ -516,7 +541,13 @@ def move_nightly_files(spdir: str, platform: str) -> None: def _available_envs() -> Dict[str, str]: cmd = ["conda", "env", "list"] - p = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) + p = subprocess.run( + cmd, + check=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True, + ) lines = p.stdout.splitlines() envs = {} for line in map(str.strip, lines): diff --git a/tools/onnx/update_default_opset_version.py b/tools/onnx/update_default_opset_version.py new file mode 100755 index 000000000000..dfdbf1f23c87 --- /dev/null +++ b/tools/onnx/update_default_opset_version.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 + +"""Updates the default value of opset_version. + +The current policy is that the default should be set to the +latest released version as of 18 months ago. + +Usage: +Run with no arguments. +""" + +import datetime +import os +import pathlib +import re +import subprocess +import sys +from subprocess import DEVNULL + +pytorch_dir = pathlib.Path(__file__).parent.parent.parent.resolve() +onnx_dir = pytorch_dir / "third_party" / "onnx" +os.chdir(onnx_dir) + +date = datetime.datetime.now() - datetime.timedelta(days=18 * 30) +onnx_commit = subprocess.check_output( + ("git", "log", f"--until={date}", "--max-count=1", "--format=%H"), encoding="utf-8" +).strip() +onnx_tags = subprocess.check_output( + ("git", "tag", "--list", f"--contains={onnx_commit}"), encoding="utf-8" +) +tag_tups = [] +semver_pat = re.compile(r"v(\d+)\.(\d+)\.(\d+)") +for tag in onnx_tags.splitlines(): + match = semver_pat.match(tag) + if match: + tag_tups.append(tuple(int(x) for x in match.groups())) + +version_str = "{}.{}.{}".format(*min(tag_tups)) + +print("Using ONNX release", version_str) + +head_commit = subprocess.check_output( + ("git", "log", "--max-count=1", "--format=%H", "HEAD"), encoding="utf-8" +).strip() + +new_default = None + +subprocess.check_call( + ("git", "checkout", f"v{version_str}"), stdout=DEVNULL, stderr=DEVNULL +) +try: + from onnx import helper # type: ignore[import] + + for version in helper.VERSION_TABLE: + if version[0] == version_str: + new_default = version[2] + print("found new default opset_version", new_default) + break + if not new_default: + sys.exit( + f"failed to find version {version_str} in onnx.helper.VERSION_TABLE at commit {onnx_commit}" + ) +finally: + subprocess.check_call( + ("git", "checkout", head_commit), stdout=DEVNULL, stderr=DEVNULL + ) + +os.chdir(pytorch_dir) + + +def read_sub_write(path: str, prefix_pat: str) -> None: + with open(path, encoding="utf-8") as f: + content_str = f.read() + content_str = re.sub(prefix_pat, r"\g<1>{}".format(new_default), content_str) + with open(path, "w", encoding="utf-8") as f: + f.write(content_str) + print("modified", path) + + +read_sub_write( + os.path.join("torch", "onnx", "_constants.py"), + r"(onnx_default_opset = )\d+", +) +read_sub_write( + os.path.join("torch", "onnx", "__init__.py"), r"(opset_version \(int, default )\d+" +) + +print("Updating operator .expect files") +subprocess.check_call(("python", "setup.py", "develop"), 
stdout=DEVNULL, stderr=DEVNULL) +subprocess.check_call( + ("python", os.path.join("test", "onnx", "test_operators.py"), "--accept"), + stdout=DEVNULL, + stderr=DEVNULL, +) diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py index faf1fdf06d36..94c89a906714 100644 --- a/tools/pyi/gen_pyi.py +++ b/tools/pyi/gen_pyi.py @@ -2,14 +2,21 @@ import collections from pprint import pformat -from tools.codegen.model import Variant -from tools.codegen.api.python import (PythonSignatureGroup, - PythonSignatureNativeFunctionPair) -from tools.codegen.gen import parse_native_yaml -from tools.codegen.utils import FileManager +from torchgen.model import Variant +from torchgen.api.python import ( + PythonSignatureGroup, + PythonSignatureNativeFunctionPair, + returns_named_tuple_pyi, +) +from torchgen.gen import parse_native_yaml +from torchgen.utils import FileManager from typing import Sequence, List, Dict -from tools.autograd.gen_python_functions import should_generate_py_binding, load_signatures, group_overloads +from tools.autograd.gen_python_functions import ( + should_generate_py_binding, + load_signatures, + group_overloads, +) """ This module implements generation of type stubs for PyTorch, @@ -35,23 +42,29 @@ read gen_pyi for the gory details. """ + def get_py_torch_functions( - python_funcs: Sequence[PythonSignatureNativeFunctionPair], - method: bool = False, + python_funcs: Sequence[PythonSignatureNativeFunctionPair], + method: bool = False, ) -> Sequence[PythonSignatureGroup]: """ Get declarations (grouped by name) which should be generated as either functions in the "torch" module or methods on Tensor. """ + def should_bind_function(python_func: PythonSignatureNativeFunctionPair) -> bool: - return (should_generate_py_binding(python_func.function) and - not python_func.function.python_module and - Variant.function in python_func.function.variants) + return ( + should_generate_py_binding(python_func.function) + and not python_func.function.python_module + and Variant.function in python_func.function.variants + ) def should_bind_method(python_func: PythonSignatureNativeFunctionPair) -> bool: - return (should_generate_py_binding(python_func.function) and - not python_func.function.python_module and - Variant.method in python_func.function.variants) + return ( + should_generate_py_binding(python_func.function) + and not python_func.function.python_module + and Variant.method in python_func.function.variants + ) should_bind = should_bind_method if method else should_bind_function return group_overloads([f for f in python_funcs if should_bind(f)]) @@ -61,76 +74,111 @@ def should_bind_method(python_func: PythonSignatureNativeFunctionPair) -> bool: # the stubs to read on the human eye. DEVICE_PARAM = "device: Union[_device, str, None]=None" -FACTORY_PARAMS = f"dtype: Optional[_dtype]=None, {DEVICE_PARAM}, requires_grad: _bool=False" +FACTORY_PARAMS = ( + f"dtype: Optional[_dtype]=None, {DEVICE_PARAM}, requires_grad: _bool=False" +) # this could be more precise w.r.t list contents etc. How to do Ellipsis? 
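For reference, a quick sketch of how these parameter strings are spliced into the hand-written stub entries further down (the 'tensor' hint is one such entry):

    DEVICE_PARAM = "device: Union[_device, str, None]=None"
    FACTORY_PARAMS = (
        f"dtype: Optional[_dtype]=None, {DEVICE_PARAM}, requires_grad: _bool=False"
    )
    hint = "def tensor(data: Any, {}) -> Tensor: ...".format(FACTORY_PARAMS)
    # hint == ('def tensor(data: Any, dtype: Optional[_dtype]=None, '
    #          'device: Union[_device, str, None]=None, requires_grad: _bool=False) -> Tensor: ...')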
INDICES = "indices: Union[None, _int, slice, Tensor, List, Tuple]" blocklist = [ - '__init_subclass__', - '__new__', - '__subclasshook__', - 'cdist', - 'device', - 'grad', - 'requires_grad', - 'range', + "__init_subclass__", + "__new__", + "__subclasshook__", + "cdist", + "device", + "grad", + "requires_grad", + "range", # defined in functional - 'einsum', + "einsum", # reduction argument; these bindings don't make sense - 'binary_cross_entropy_with_logits', - 'ctc_loss', - 'cosine_embedding_loss', - 'hinge_embedding_loss', - 'kl_div', - 'margin_ranking_loss', - 'triplet_margin_loss', + "binary_cross_entropy_with_logits", + "ctc_loss", + "cosine_embedding_loss", + "hinge_embedding_loss", + "kl_div", + "margin_ranking_loss", + "triplet_margin_loss", # Somehow, these are defined in both _C and in functional. Ick! - 'broadcast_tensors', + "broadcast_tensors", # Manually define named tensor type stubs in __init__.pyi.in - 'align_tensors', - 'meshgrid', - 'cartesian_prod', - 'block_diag', - 'norm', - 'chain_matmul', - 'stft', - 'tensordot', - 'split', - 'unique_consecutive', - 'atleast_1d', - 'atleast_2d', - 'atleast_3d', + "align_tensors", + "meshgrid", + "cartesian_prod", + "block_diag", + "norm", + "chain_matmul", + "stft", + "tensordot", + "split", + "unique_consecutive", + "atleast_1d", + "atleast_2d", + "atleast_3d", # These are handled specially by python_arg_parser.cpp - 'add', - 'add_', - 'add_out', - 'sub', - 'sub_', - 'sub_out', - 'mul', - 'mul_', - 'mul_out', - 'div', - 'div_', - 'div_out', - 'true_divide', 'true_divide_', 'true_divide_out', - 'floor_divide', 'floor_divide_', 'floor_divide_out', + "add", + "add_", + "add_out", + "sub", + "sub_", + "sub_out", + "mul", + "mul_", + "mul_out", + "div", + "div_", + "div_out", + "true_divide", + "true_divide_", + "true_divide_out", + "floor_divide", + "floor_divide_", + "floor_divide_out", ] -binary_ops = ('add', 'sub', 'mul', 'div', 'pow', 'lshift', 'rshift', 'mod', 'truediv', - 'matmul', 'floordiv', - 'radd', 'rsub', 'rmul', 'rtruediv', 'rfloordiv', 'rpow', # reverse arithmetic - 'and', 'or', 'xor', 'rand', 'ror', 'rxor', # logic - 'iadd', 'iand', 'idiv', 'ilshift', 'imul', - 'ior', 'irshift', 'isub', 'ixor', 'ifloordiv', 'imod', # inplace ops - ) -symmetric_comparison_ops = ('eq', 'ne') -asymmetric_comparison_ops = ('ge', 'gt', 'lt', 'le') +binary_ops = ( + "add", + "sub", + "mul", + "div", + "pow", + "lshift", + "rshift", + "mod", + "truediv", + "matmul", + "floordiv", + "radd", + "rsub", + "rmul", + "rtruediv", + "rfloordiv", + "rpow", # reverse arithmetic + "and", + "or", + "xor", + "rand", + "ror", + "rxor", # logic + "iadd", + "iand", + "idiv", + "ilshift", + "imul", + "ior", + "irshift", + "isub", + "ixor", + "ifloordiv", + "imod", # inplace ops +) +symmetric_comparison_ops = ("eq", "ne") +asymmetric_comparison_ops = ("ge", "gt", "lt", "le") comparison_ops = symmetric_comparison_ops + asymmetric_comparison_ops -unary_ops = ('neg', 'abs', 'invert') -to_py_type_ops = ('bool', 'float', 'complex', 'long', 'index', 'int', 'nonzero') +unary_ops = ("neg", "abs", "invert") +to_py_type_ops = ("bool", "float", "complex", "long", "index", "int", "nonzero") all_ops = binary_ops + comparison_ops + unary_ops + to_py_type_ops @@ -141,32 +189,35 @@ def sig_for_ops(opname: str) -> List[str]: # we have to do this by hand, because they are hand-bound in Python - assert opname.endswith('__') and opname.startswith('__'), "Unexpected op {}".format(opname) + assert opname.endswith("__") and opname.startswith("__"), "Unexpected op {}".format( + opname + 
) name = opname[2:-2] if name in binary_ops: - return ['def {}(self, other: Any) -> Tensor: ...'.format(opname)] + return ["def {}(self, other: Any) -> Tensor: ...".format(opname)] elif name in comparison_ops: - sig = 'def {}(self, other: Any) -> Tensor: ...'.format(opname) + sig = "def {}(self, other: Any) -> Tensor: ...".format(opname) if name in symmetric_comparison_ops: # unsafe override https://github.com/python/mypy/issues/5704 - sig += ' # type: ignore[override]' + sig += " # type: ignore[override]" return [sig] elif name in unary_ops: - return ['def {}(self) -> Tensor: ...'.format(opname)] + return ["def {}(self) -> Tensor: ...".format(opname)] elif name in to_py_type_ops: - if name in {'bool', 'float', 'complex'}: + if name in {"bool", "float", "complex"}: tname = name - elif name == 'nonzero': - tname = 'bool' + elif name == "nonzero": + tname = "bool" else: - tname = 'int' - if tname in {'float', 'int', 'bool', 'complex'}: - tname = 'builtins.' + tname - return ['def {}(self) -> {}: ...'.format(opname, tname)] + tname = "int" + if tname in {"float", "int", "bool", "complex"}: + tname = "builtins." + tname + return ["def {}(self) -> {}: ...".format(opname, tname)] else: raise Exception("unknown op", opname) + def generate_type_hints(sig_group: PythonSignatureGroup) -> List[str]: type_hints: List[str] = [] @@ -184,81 +235,98 @@ def generate_type_hints(sig_group: PythonSignatureGroup) -> List[str]: # PythonSignatureGroups that have both a functional + out variant get a single signature, with an optional out argument # Generates the out variant if one exists. Otherwise, generate the functional variant type_hint = sig_group.signature.signature_str_pyi( - skip_outputs=sig_group.outplace is None) + skip_outputs=sig_group.outplace is None + ) type_hints.append(type_hint) # Some operators also additionally have a vararg variant of their signature type_hint_vararg = sig_group.signature.signature_str_pyi_vararg( - skip_outputs=sig_group.outplace is None) + skip_outputs=sig_group.outplace is None + ) if type_hint_vararg: type_hints.append(type_hint_vararg) return type_hints + def gen_nn_functional(fm: FileManager) -> None: # Functions imported into `torch.nn.functional` from `torch`, perhaps being filtered # through an `_add_docstr` call imports = [ - 'conv1d', - 'conv2d', - 'conv3d', - 'conv_transpose1d', - 'conv_transpose2d', - 'conv_transpose3d', - 'conv_tbc', - 'avg_pool1d', - 'relu_', - 'selu_', - 'celu_', - 'rrelu_', - 'pixel_shuffle', - 'pixel_unshuffle', - 'channel_shuffle', - 'pdist', - 'cosine_similarity', + "conv1d", + "conv2d", + "conv3d", + "conv_transpose1d", + "conv_transpose2d", + "conv_transpose3d", + "conv_tbc", + "avg_pool1d", + "relu_", + "selu_", + "celu_", + "rrelu_", + "pixel_shuffle", + "pixel_unshuffle", + "channel_shuffle", + "native_channel_shuffle", + "pdist", + "cosine_similarity", ] # Functions generated by `torch._jit_internal.boolean_dispatch` dispatches = [ - 'fractional_max_pool2d', - 'fractional_max_pool3d', - 'max_pool1d', - 'max_pool2d', - 'max_pool3d', - 'adaptive_max_pool1d', - 'adaptive_max_pool2d', - 'adaptive_max_pool3d', + "fractional_max_pool2d", + "fractional_max_pool3d", + "max_pool1d", + "max_pool2d", + "max_pool3d", + "adaptive_max_pool1d", + "adaptive_max_pool2d", + "adaptive_max_pool3d", ] # Functions directly imported from `torch._C` from_c = [ - 'avg_pool2d', - 'avg_pool3d', - 'hardtanh_', - 'elu_', - 'leaky_relu_', - 'logsigmoid', - 'softplus', - 'softshrink', - 'one_hot', + "avg_pool2d", + "avg_pool3d", + "hardtanh_", + "elu_", + 
"leaky_relu_", + "logsigmoid", + "softplus", + "softshrink", + "one_hot", ] import_code = ["from .. import {0} as {0}".format(_) for _ in imports] # TODO make these types more precise dispatch_code = ["{}: Callable".format(_) for _ in (dispatches + from_c)] - fm.write_with_template('torch/nn/functional.pyi', 'torch/nn/functional.pyi.in', lambda: { - 'imported_hints': import_code, - 'dispatched_hints': dispatch_code, - }) + fm.write_with_template( + "torch/nn/functional.pyi", + "torch/nn/functional.pyi.in", + lambda: { + "imported_hints": import_code, + "dispatched_hints": dispatch_code, + }, + ) # functional.pyi already contains the definitions for those functions # so, we don't export then to it - from_c.extend(['hardtanh', 'leaky_relu', 'hardsigmoid']) + from_c.extend(["hardtanh", "leaky_relu", "hardsigmoid"]) dispatch_code = ["{}: Callable".format(_) for _ in (dispatches + from_c)] - fm.write_with_template('torch/_C/_nn.pyi', 'torch/_C/_nn.pyi.in', lambda: { - 'imported_hints': import_code, - 'dispatched_hints': dispatch_code, - }) - - -def gen_pyi(native_yaml_path: str, deprecated_yaml_path: str, fm: FileManager) -> None: + fm.write_with_template( + "torch/_C/_nn.pyi", + "torch/_C/_nn.pyi.in", + lambda: { + "imported_hints": import_code, + "dispatched_hints": dispatch_code, + }, + ) + + +def gen_pyi( + native_yaml_path: str, + tags_yaml_path: str, + deprecated_yaml_path: str, + fm: FileManager, +) -> None: """gen_pyi() This function generates a pyi file for torch. @@ -278,125 +346,218 @@ def gen_pyi(native_yaml_path: str, deprecated_yaml_path: str, fm: FileManager) - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ unsorted_function_hints: Dict[str, List[str]] = collections.defaultdict(list) - unsorted_function_hints.update({ - 'set_flush_denormal': ['def set_flush_denormal(mode: _bool) -> _bool: ...'], - 'get_default_dtype': ['def get_default_dtype() -> _dtype: ...'], - 'asarray': ['def asarray(obj: Any, *, dtype: Optional[_dtype]=None, ' - 'device: Union[_device, str, None]=None, copy: Optional[_bool]=None, ' - 'requires_grad: _bool=False) -> Tensor: ...'], - 'from_numpy': ['def from_numpy(ndarray) -> Tensor: ...'], - 'frombuffer': ['def frombuffer(buffer: Any, *, dtype: _dtype, count: int=-1, ' - 'offset: int=0, device: Union[_device, str, None]=None, ' - 'requires_grad: _bool=False) -> Tensor: ...'], - 'numel': ['def numel(self: Tensor) -> _int: ...'], - 'as_tensor': ["def as_tensor(data: Any, dtype: _dtype=None, device: Optional[_device]=None) -> Tensor: ..."], - 'get_num_threads': ['def get_num_threads() -> _int: ...'], - 'set_num_threads': ['def set_num_threads(num: _int) -> None: ...'], - 'init_num_threads': ['def init_num_threads() -> None: ...'], - 'get_num_interop_threads': ['def get_num_interop_threads() -> _int: ...'], - 'set_num_interop_threads': ['def set_num_interop_threads(num: _int) -> None: ...'], - # These functions are explicitly disabled by - # SKIP_PYTHON_BINDINGS because they are hand bound. - # Correspondingly, we must hand-write their signatures. 
- 'tensor': ["def tensor(data: Any, {}) -> Tensor: ...".format(FACTORY_PARAMS)], - 'sparse_coo_tensor': ['def sparse_coo_tensor(indices: Tensor, values: Union[Tensor,List],' - ' size: Optional[_size]=None, *, dtype: Optional[_dtype]=None,' - ' device: Union[_device, str, None]=None, requires_grad:_bool=False) -> Tensor: ...'], - 'sparse_csr_tensor' : ['def sparse_csr_tensor(crow_indices: Union[Tensor, List],' - 'col_indices: Union[Tensor, List],' - ' values: Union[Tensor, List], size: Optional[_size]=None,' - ' *, dtype: Optional[_dtype]=None,' - ' device: Union[_device, str, None]=None, requires_grad:_bool=False) -> Tensor: ...'], - '_sparse_coo_tensor_unsafe': ['def _sparse_coo_tensor_unsafe(indices: Tensor, values: Tensor, size: List[int],' - ' dtype: Optional[_dtype] = None, device: Optional[_device] = None,' - ' requires_grad: bool = False) -> Tensor: ...'], - '_sparse_csr_tensor_unsafe': ['def _sparse_csr_tensor_unsafe(crow_indices: Union[Tensor, List],' - 'col_indices: Union[Tensor, List],' - ' values: Union[Tensor, List], size: List[int],' - ' dtype: Optional[_dtype] = None, device: Optional[_device] = None,' - ' requires_grad: bool = False) -> Tensor: ...'], - 'range': ['def range(start: Number, end: Number,' - ' step: Number=1, *, out: Optional[Tensor]=None, {}) -> Tensor: ...' - .format(FACTORY_PARAMS)], - 'arange': ['def arange(start: Number, end: Number, step: Number, *,' - ' out: Optional[Tensor]=None, {}) -> Tensor: ...' - .format(FACTORY_PARAMS), - 'def arange(start: Number, end: Number, *, out: Optional[Tensor]=None, {}) -> Tensor: ...' - .format(FACTORY_PARAMS), - 'def arange(end: Number, *, out: Optional[Tensor]=None, {}) -> Tensor: ...' - .format(FACTORY_PARAMS)], - 'linspace': ['def linspace(start: Number, end: Number, steps: Optional[_int]=None, *,' - ' out: Optional[Tensor]=None, {}) -> Tensor: ...'.format(FACTORY_PARAMS)], - 'logspace': ['def logspace(start: Number, end: Number, steps: Optional[_int]=None, base: _float=10.0, *,' - ' out: Optional[Tensor]=None, {}) -> Tensor: ...'.format(FACTORY_PARAMS)], - 'randint': ['def randint(low: _int, high: _int, size: _size, *,' - ' generator: Optional[Generator]=None, {}) -> Tensor: ...' - .format(FACTORY_PARAMS), - 'def randint(high: _int, size: _size, *,' - ' generator: Optional[Generator]=None, {}) -> Tensor: ...' - .format(FACTORY_PARAMS)], - 'full': ['def full(size: _size, fill_value: Number, *,' - ' out: Optional[Tensor]=None,' - ' layout: _layout=strided, {}) -> Tensor: ...' - .format(FACTORY_PARAMS), - 'def full(size: _size, fill_value: Number, *,' - ' names: List[Union[str, None]],' - ' layout: _layout=strided, {}) -> Tensor: ...' 
- .format(FACTORY_PARAMS)], - 'is_grad_enabled': ['def is_grad_enabled() -> _bool: ...'], - 'is_inference_mode_enabled': ['def is_inference_mode_enabled() -> _bool: ...'], - 'nonzero': ['def nonzero(input: Tensor, *, as_tuple: Literal[False]=False, out: Optional[Tensor]=None) -> Tensor: ...', - 'def nonzero(input: Tensor, *, as_tuple: Literal[True]) -> Tuple[Tensor, ...]: ...'], - 'binary_cross_entropy_with_logits': ['def binary_cross_entropy_with_logits(input: Tensor, target: Tensor, ' - 'weight: Optional[Tensor] = None, size_average: Optional[bool] = None, ' - 'reduce: Optional[bool] = None, reduction: str = ..., ' - 'pos_weight: Optional[Tensor] = None) -> Tensor: ...'], - 'cosine_embedding_loss': ['def cosine_embedding_loss(input1: Tensor, input2: Tensor, ' - 'target: Tensor, margin: float = ..., size_average: Optional[bool] = ..., ' - 'reduce: Optional[bool] = ..., reduction: str = ...) -> Tensor: ...'], - 'ctc_loss': ['def ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: Tensor, target_lengths: Tensor,' - ' blank: int = ..., reduction: str = ..., zero_infinity: bool = ...) -> Tensor: ...'], - 'hinge_embedding_loss': ['def hinge_embedding_loss(input: Tensor, target: Tensor, margin: float = ...,' - ' size_average: Optional[bool] = ..., reduce: Optional[bool] = ..., ' - 'reduction: str = ...) -> Tensor: ...'], - 'kl_div': ['def kl_div(input: Tensor, target: Tensor, size_average: Optional[bool] = ..., ' - 'reduce: Optional[bool] = ..., reduction: str = ..., log_target: bool = ...) -> Tensor: ...'], - 'margin_ranking_loss': ['def margin_ranking_loss(input1: Tensor, input2: Tensor, target: Tensor,' - ' margin: float = ..., size_average: Optional[bool] = ..., ' - ' reduce: Optional[bool] = ..., reduction: str = ...) -> Tensor: ...'], - 'triplet_margin_loss': ['def triplet_margin_loss(anchor: Tensor, positive: Tensor, negative: Tensor, ' - 'margin: float = ..., p: float = ..., eps: float = ..., swap: bool = ..., ' - 'size_average: Optional[bool] = ..., ' - 'reduce: Optional[bool] = ..., reduction: str = ...) -> Tensor: ...'], - 'dsmm': ['def dsmm(input: Tensor, mat2: Tensor) -> Tensor: ...'], - 'hsmm': ['def hsmm(input: Tensor, mat2: Tensor) -> Tensor: ...'], - 'saddmm': ['def saddmm(input: Tensor, mat1: Tensor, mat2: Tensor, *, beta: Number=1, ' - 'alpha: Number=1, out: Optional[Tensor]=None) -> Tensor: ...'], - 'spmm': ['def spmm(input: Tensor, mat2: Tensor) -> Tensor: ...'], - 'div': ['def div(input: Union[Tensor, Number], other: Union[Tensor, Number], *, ' - 'rounding_mode: Optional[str] = None, out: Optional[Tensor]=None) -> Tensor: ...'], - }) - for binop in ['mul', 'true_divide', 'floor_divide']: + + for n, n1, n2 in [ + ("csr", "crow", "col"), + ("csc", "ccol", "row"), + ("bsr", "crow", "col"), + ("bsc", "ccol", "row"), + ]: + unsorted_function_hints.update( + { + f"sparse_{n}_tensor": [ + f"def sparse_{n}_tensor({n1}_indices: Union[Tensor, List]," + f"{n2}_indices: Union[Tensor, List]," + " values: Union[Tensor, List], size: Optional[_size]=None," + " *, dtype: Optional[_dtype]=None," + " device: Union[_device, str, None]=None, requires_grad:_bool=False) -> Tensor: ..." + ], + f"_sparse_{n}_tensor_unsafe": [ + f"def _sparse_{n}_tensor_unsafe({n1}_indices: Union[Tensor, List]," + f"{n2}_indices: Union[Tensor, List]," + " values: Union[Tensor, List], size: List[int]," + " dtype: Optional[_dtype] = None, device: Optional[_device] = None," + " requires_grad: bool = False) -> Tensor: ..." 
+ ], + } + ) + + unsorted_function_hints.update( + { + "set_flush_denormal": ["def set_flush_denormal(mode: _bool) -> _bool: ..."], + "get_default_dtype": ["def get_default_dtype() -> _dtype: ..."], + "asarray": [ + "def asarray(obj: Any, *, dtype: Optional[_dtype]=None, " + "device: Union[_device, str, None]=None, copy: Optional[_bool]=None, " + "requires_grad: _bool=False) -> Tensor: ..." + ], + "from_numpy": ["def from_numpy(ndarray) -> Tensor: ..."], + "frombuffer": [ + "def frombuffer(buffer: Any, *, dtype: _dtype, count: int=-1, " + "offset: int=0, device: Union[_device, str, None]=None, " + "requires_grad: _bool=False) -> Tensor: ..." + ], + "numel": ["def numel(self: Tensor) -> _int: ..."], + "as_tensor": [ + "def as_tensor(data: Any, dtype: _dtype=None, device: Optional[_device]=None) -> Tensor: ..." + ], + "get_num_threads": ["def get_num_threads() -> _int: ..."], + "set_num_threads": ["def set_num_threads(num: _int) -> None: ..."], + "init_num_threads": ["def init_num_threads() -> None: ..."], + "get_num_interop_threads": ["def get_num_interop_threads() -> _int: ..."], + "set_num_interop_threads": [ + "def set_num_interop_threads(num: _int) -> None: ..." + ], + # These functions are explicitly disabled by + # SKIP_PYTHON_BINDINGS because they are hand bound. + # Correspondingly, we must hand-write their signatures. + "tensor": [ + "def tensor(data: Any, {}) -> Tensor: ...".format(FACTORY_PARAMS) + ], + "sparse_coo_tensor": [ + "def sparse_coo_tensor(indices: Tensor, values: Union[Tensor,List]," + " size: Optional[_size]=None, *, dtype: Optional[_dtype]=None," + " device: Union[_device, str, None]=None, requires_grad:_bool=False) -> Tensor: ..." + ], + "_sparse_coo_tensor_unsafe": [ + "def _sparse_coo_tensor_unsafe(indices: Tensor, values: Tensor, size: List[int]," + " dtype: Optional[_dtype] = None, device: Optional[_device] = None," + " requires_grad: bool = False) -> Tensor: ..." + ], + "sparse_compressed_tensor": [ + "def sparse_compressed_tensor(compressed_indices: Union[Tensor, List]," + "plain_indices: Union[Tensor, List]," + " values: Union[Tensor, List], size: Optional[_size]=None," + " *, dtype: Optional[_dtype]=None, layout: Optional[_layout] = None," + " device: Union[_device, str, None]=None, requires_grad:_bool=False) -> Tensor: ..." + ], + "_sparse_compressed_tensor_unsafe": [ + "def _sparse_compressed_tensor_unsafe(comp_indices: Union[Tensor, List]," + "plain_indices: Union[Tensor, List]," + " values: Union[Tensor, List], size: List[int]," + " dtype: Optional[_dtype] = None, layout: Optional[_layout] = None," + " device: Optional[_device] = None," + " requires_grad: bool = False) -> Tensor: ..." 
+ ], + "range": [ + "def range(start: Number, end: Number," + " step: Number=1, *, out: Optional[Tensor]=None, {}) -> Tensor: ...".format( + FACTORY_PARAMS + ) + ], + "arange": [ + "def arange(start: Number, end: Number, step: Number, *," + " out: Optional[Tensor]=None, {}) -> Tensor: ...".format( + FACTORY_PARAMS + ), + "def arange(start: Number, end: Number, *, out: Optional[Tensor]=None, {}) -> Tensor: ...".format( + FACTORY_PARAMS + ), + "def arange(end: Number, *, out: Optional[Tensor]=None, {}) -> Tensor: ...".format( + FACTORY_PARAMS + ), + ], + "linspace": [ + "def linspace(start: Number, end: Number, steps: Optional[_int]=None, *," + " out: Optional[Tensor]=None, {}) -> Tensor: ...".format(FACTORY_PARAMS) + ], + "logspace": [ + "def logspace(start: Number, end: Number, steps: Optional[_int]=None, base: _float=10.0, *," + " out: Optional[Tensor]=None, {}) -> Tensor: ...".format(FACTORY_PARAMS) + ], + "randint": [ + "def randint(low: _int, high: _int, size: _size, *," + " generator: Optional[Generator]=None, {}) -> Tensor: ...".format( + FACTORY_PARAMS + ), + "def randint(high: _int, size: _size, *," + " generator: Optional[Generator]=None, {}) -> Tensor: ...".format( + FACTORY_PARAMS + ), + ], + "full": [ + "def full(size: _size, fill_value: Number, *," + " out: Optional[Tensor]=None," + " layout: _layout=strided, {}) -> Tensor: ...".format(FACTORY_PARAMS), + "def full(size: _size, fill_value: Number, *," + " names: List[Union[str, None]]," + " layout: _layout=strided, {}) -> Tensor: ...".format(FACTORY_PARAMS), + ], + "is_grad_enabled": ["def is_grad_enabled() -> _bool: ..."], + "is_inference_mode_enabled": [ + "def is_inference_mode_enabled() -> _bool: ..." + ], + "nonzero": [ + "def nonzero(input: Tensor, *, as_tuple: Literal[False]=False, out: Optional[Tensor]=None) -> Tensor: ...", + "def nonzero(input: Tensor, *, as_tuple: Literal[True]) -> Tuple[Tensor, ...]: ...", + ], + "binary_cross_entropy_with_logits": [ + "def binary_cross_entropy_with_logits(input: Tensor, target: Tensor, " + "weight: Optional[Tensor] = None, size_average: Optional[bool] = None, " + "reduce: Optional[bool] = None, reduction: str = ..., " + "pos_weight: Optional[Tensor] = None) -> Tensor: ..." + ], + "cosine_embedding_loss": [ + "def cosine_embedding_loss(input1: Tensor, input2: Tensor, " + "target: Tensor, margin: float = ..., size_average: Optional[bool] = ..., " + "reduce: Optional[bool] = ..., reduction: str = ...) -> Tensor: ..." + ], + "ctc_loss": [ + "def ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: Tensor, target_lengths: Tensor," + " blank: int = ..., reduction: str = ..., zero_infinity: bool = ...) -> Tensor: ..." + ], + "hinge_embedding_loss": [ + "def hinge_embedding_loss(input: Tensor, target: Tensor, margin: float = ...," + " size_average: Optional[bool] = ..., reduce: Optional[bool] = ..., " + "reduction: str = ...) -> Tensor: ..." + ], + "kl_div": [ + "def kl_div(input: Tensor, target: Tensor, size_average: Optional[bool] = ..., " + "reduce: Optional[bool] = ..., reduction: str = ..., log_target: bool = ...) -> Tensor: ..." + ], + "margin_ranking_loss": [ + "def margin_ranking_loss(input1: Tensor, input2: Tensor, target: Tensor," + " margin: float = ..., size_average: Optional[bool] = ..., " + " reduce: Optional[bool] = ..., reduction: str = ...) -> Tensor: ..." 
+ ], + "triplet_margin_loss": [ + "def triplet_margin_loss(anchor: Tensor, positive: Tensor, negative: Tensor, " + "margin: float = ..., p: float = ..., eps: float = ..., swap: bool = ..., " + "size_average: Optional[bool] = ..., " + "reduce: Optional[bool] = ..., reduction: str = ...) -> Tensor: ..." + ], + "dsmm": ["def dsmm(input: Tensor, mat2: Tensor) -> Tensor: ..."], + "hsmm": ["def hsmm(input: Tensor, mat2: Tensor) -> Tensor: ..."], + "saddmm": [ + "def saddmm(input: Tensor, mat1: Tensor, mat2: Tensor, *, beta: Number=1, " + "alpha: Number=1, out: Optional[Tensor]=None) -> Tensor: ..." + ], + "spmm": ["def spmm(input: Tensor, mat2: Tensor) -> Tensor: ..."], + "div": [ + "def div(input: Union[Tensor, Number], other: Union[Tensor, Number], *, " + "rounding_mode: Optional[str] = None, out: Optional[Tensor]=None) -> Tensor: ..." + ], + } + ) + for binop in ["mul", "true_divide", "floor_divide"]: unsorted_function_hints[binop].append( - 'def {}(input: Union[Tensor, Number],' - ' other: Union[Tensor, Number],' - ' *, out: Optional[Tensor]=None) -> Tensor: ...'.format(binop)) - for binop in ['add', 'sub']: + "def {}(input: Union[Tensor, Number]," + " other: Union[Tensor, Number]," + " *, out: Optional[Tensor]=None) -> Tensor: ...".format(binop) + ) + for binop in ["add", "sub"]: unsorted_function_hints[binop].append( - 'def {}(input: Union[Tensor, Number],' - ' other: Union[Tensor, Number],' - ' *, alpha: Optional[Number]=1, out: Optional[Tensor]=None) -> Tensor: ...'.format(binop)) - - native_functions = parse_native_yaml(native_yaml_path).native_functions + "def {}(input: Union[Tensor, Number]," + " other: Union[Tensor, Number]," + " *, alpha: Optional[Number]=1, out: Optional[Tensor]=None) -> Tensor: ...".format( + binop + ) + ) + + native_functions = parse_native_yaml( + native_yaml_path, tags_yaml_path + ).native_functions native_functions = list(filter(should_generate_py_binding, native_functions)) - function_signatures = load_signatures(native_functions, deprecated_yaml_path, method=False, pyi=True) + function_signatures = load_signatures( + native_functions, deprecated_yaml_path, method=False, pyi=True + ) sig_groups = get_py_torch_functions(function_signatures) for group in sorted(sig_groups, key=lambda g: g.signature.name): name = group.signature.name unsorted_function_hints[name] += generate_type_hints(group) - named_tuple = group.signature.returns.named_tuple_pyi() + named_tuple = returns_named_tuple_pyi(group.signature) if named_tuple is not None and not group.signature.deprecated: # deprecated namedtuples are currently not included for torch functions tuple_name, tuple_def = named_tuple @@ -408,122 +569,193 @@ def gen_pyi(native_yaml_path: str, deprecated_yaml_path: str, fm: FileManager) - function_hints = [] for name, hints in sorted(unsorted_function_hints.items()): if len(hints) > 1: - hints = ['@overload\n' + h for h in hints] + hints = ["@overload\n" + h for h in hints] function_hints += hints # Generate type signatures for Tensor methods # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ unsorted_tensor_method_hints: Dict[str, List[str]] = collections.defaultdict(list) - unsorted_tensor_method_hints.update({ - 'size': ['def size(self) -> Size: ...', - 'def size(self, dim: _int) -> _int: ...'], - 'stride': ['def stride(self) -> Tuple[_int]: ...', - 'def stride(self, _int) -> _int: ...'], - 'new_ones': ['def new_ones(self, size: _size, {}) -> Tensor: ...'. 
- format(FACTORY_PARAMS)], - 'new_tensor': ["def new_tensor(self, data: Any, {}) -> Tensor: ...".format(FACTORY_PARAMS)], - # new and __init__ have the same signatures differ only in return type - # Adapted from legacy_tensor_ctor and legacy_tensor_new - 'new': ['def new(self, *args: Any, {}) ->Tensor: ...'.format(DEVICE_PARAM), - 'def new(self, storage: Storage) -> Tensor: ...', - 'def new(self, other: Tensor) -> Tensor: ...', - 'def new(self, size: _size, *, {}) -> Tensor: ...'.format(DEVICE_PARAM), - ], - '__init__': ['def __init__(self, *args: Any, {}) -> None: ...'.format(DEVICE_PARAM), - 'def __init__(self, storage: Storage) -> None: ...', - 'def __init__(self, other: Tensor) -> None: ...', - 'def __init__(self, size: _size, *, {}) -> None: ...'.format(DEVICE_PARAM), - ], - 'as_subclass': ["def as_subclass(self, cls: Tensor) -> Tensor: ..."], - '_make_subclass': ["def _make_subclass(cls, data: Tensor, require_grad: _bool = False) -> Tensor: ..."], - '__getitem__': ["def __getitem__(self, {}) -> Tensor: ...".format(INDICES)], - '__setitem__': ["def __setitem__(self, {}, val: Union[Tensor, Number])" - " -> None: ...".format(INDICES)], - 'tolist': ['def tolist(self) -> List: ...'], - 'requires_grad_': ['def requires_grad_(self, mode: _bool=True) -> Tensor: ...'], - 'element_size': ['def element_size(self) -> _int: ...'], - 'data_ptr': ['def data_ptr(self) -> _int: ...'], - 'dim': ['def dim(self) -> _int: ...'], - 'nonzero': ['def nonzero(self, *, as_tuple: Literal[False]=False) -> Tensor: ...', - 'def nonzero(self, *, as_tuple: Literal[True]) -> Tuple[Tensor, ...]: ...'], - 'numel': ['def numel(self) -> _int: ...'], - 'ndimension': ['def ndimension(self) -> _int: ...'], - 'nelement': ['def nelement(self) -> _int: ...'], - 'cuda': ['def cuda(self, device: Optional[Union[_device, _int, str]]=None, non_blocking: _bool=False) -> Tensor: ...'], - 'numpy': ['def numpy(self) -> Any: ...'], - 'apply_': ['def apply_(self, callable: Callable) -> Tensor: ...'], - 'map_': ['def map_(self, tensor: Tensor, callable: Callable) -> Tensor: ...'], - 'map2_': ['def map2_(self, x: Tensor, y: Tensor, callable: Callable) -> Tensor: ...'], - 'storage': ['def _storage(self) -> Storage: ...'], - 'storage_type': ['def storage_type(self) -> Storage: ...'], - 'type': ['def type(self, dtype: None=None, non_blocking: _bool=False) -> str: ...', - 'def type(self, dtype: Union[str, _dtype], non_blocking: _bool=False) -> Tensor: ...', - ], - 'get_device': ['def get_device(self) -> _int: ...'], - 'contiguous': ['def contiguous(self, memory_format=torch.contiguous_format) -> Tensor: ...'], - 'has_names': ['def has_names(self) -> _bool: ...'], - 'is_contiguous': ['def is_contiguous(self, memory_format=torch.contiguous_format) -> _bool: ...'], - '_is_view': ['def _is_view(self) -> _bool: ...'], - 'is_cuda': ['is_cuda: _bool'], - 'is_leaf': ['is_leaf: _bool'], - 'is_sparse': ['is_sparse: _bool'], - 'is_sparse_csr' : ['is_sparse_csr: _bool'], - 'is_quantized': ['is_quantized: _bool'], - 'is_meta': ['is_meta: _bool'], - 'is_ort': ['is_ort: _bool'], - 'is_mkldnn': ['is_mkldnn: _bool'], - 'is_vulkan': ['is_vulkan: _bool'], - 'storage_offset': ['def storage_offset(self) -> _int: ...'], - 'to': ['def to(self, dtype: _dtype, non_blocking: _bool=False, copy: _bool=False) -> Tensor: ...', - 'def to(self, device: Optional[Union[_device, str]]=None, dtype: Optional[_dtype]=None, ' - 'non_blocking: _bool=False, copy: _bool=False) -> Tensor: ...', - 'def to(self, other: Tensor, non_blocking: _bool=False, copy: _bool=False) -> Tensor: 
...', - ], - 'item': ["def item(self) -> Number: ..."], - 'copy_': ["def copy_(self, src: Tensor, non_blocking: _bool=False) -> Tensor: ..."], - 'set_': ['def set_(self, storage: Union[Storage, TypedStorage], offset: _int, size: _size, stride: _size) -> Tensor: ...', - 'def set_(self, storage: Union[Storage, TypedStorage]) -> Tensor: ...'], - 'split': ['def split(self, split_size: _int, dim: _int=0) -> Sequence[Tensor]: ...', - 'def split(self, split_size: Tuple[_int, ...], dim: _int=0) -> Sequence[Tensor]: ...'], - 'div': ['def div(self, other: Union[Tensor, Number], *, rounding_mode: Optional[str] = None) -> Tensor: ...'], - 'div_': ['def div_(self, other: Union[Tensor, Number], *, rounding_mode: Optional[str] = None) -> Tensor: ...'], - }) - for binop in ['mul', 'true_divide', 'floor_divide']: + unsorted_tensor_method_hints.update( + { + "size": [ + "def size(self) -> Size: ...", + "def size(self, dim: _int) -> _int: ...", + ], + "stride": [ + "def stride(self) -> Tuple[_int]: ...", + "def stride(self, _int) -> _int: ...", + ], + "new_ones": [ + "def new_ones(self, size: _size, {}) -> Tensor: ...".format( + FACTORY_PARAMS + ) + ], + "new_tensor": [ + "def new_tensor(self, data: Any, {}) -> Tensor: ...".format( + FACTORY_PARAMS + ) + ], + # new and __init__ have the same signatures differ only in return type + # Adapted from legacy_tensor_ctor and legacy_tensor_new + "new": [ + "def new(self, *args: Any, {}) ->Tensor: ...".format(DEVICE_PARAM), + "def new(self, storage: Storage) -> Tensor: ...", + "def new(self, other: Tensor) -> Tensor: ...", + "def new(self, size: _size, *, {}) -> Tensor: ...".format(DEVICE_PARAM), + ], + "__init__": [ + "def __init__(self, *args: Any, {}) -> None: ...".format(DEVICE_PARAM), + "def __init__(self, storage: Storage) -> None: ...", + "def __init__(self, other: Tensor) -> None: ...", + "def __init__(self, size: _size, *, {}) -> None: ...".format( + DEVICE_PARAM + ), + ], + "as_subclass": ["def as_subclass(self, cls: Tensor) -> Tensor: ..."], + "_make_subclass": [ + "def _make_subclass(cls, data: Tensor, require_grad: _bool = False) -> Tensor: ..." + ], + "__getitem__": ["def __getitem__(self, {}) -> Tensor: ...".format(INDICES)], + "__setitem__": [ + "def __setitem__(self, {}, val: Union[Tensor, Number])" + " -> None: ...".format(INDICES) + ], + "tolist": ["def tolist(self) -> List: ..."], + "requires_grad_": [ + "def requires_grad_(self, mode: _bool=True) -> Tensor: ..." + ], + "element_size": ["def element_size(self) -> _int: ..."], + "data_ptr": ["def data_ptr(self) -> _int: ..."], + "dim": ["def dim(self) -> _int: ..."], + "nonzero": [ + "def nonzero(self, *, as_tuple: Literal[False]=False) -> Tensor: ...", + "def nonzero(self, *, as_tuple: Literal[True]) -> Tuple[Tensor, ...]: ...", + ], + "numel": ["def numel(self) -> _int: ..."], + "ndimension": ["def ndimension(self) -> _int: ..."], + "nelement": ["def nelement(self) -> _int: ..."], + "cuda": [ + "def cuda(self, device: Optional[Union[_device, _int, str]]=None, non_blocking: _bool=False) -> Tensor: ..." + ], + "numpy": ["def numpy(self) -> Any: ..."], + "apply_": ["def apply_(self, callable: Callable) -> Tensor: ..."], + "map_": [ + "def map_(self, tensor: Tensor, callable: Callable) -> Tensor: ..." + ], + "map2_": [ + "def map2_(self, x: Tensor, y: Tensor, callable: Callable) -> Tensor: ..." 
+ ], + "storage": ["def _storage(self) -> Storage: ..."], + "storage_type": ["def storage_type(self) -> Storage: ..."], + "type": [ + "def type(self, dtype: None=None, non_blocking: _bool=False) -> str: ...", + "def type(self, dtype: Union[str, _dtype], non_blocking: _bool=False) -> Tensor: ...", + ], + "get_device": ["def get_device(self) -> _int: ..."], + "contiguous": [ + "def contiguous(self, memory_format=torch.contiguous_format) -> Tensor: ..." + ], + "has_names": ["def has_names(self) -> _bool: ..."], + "is_contiguous": [ + "def is_contiguous(self, memory_format=torch.contiguous_format) -> _bool: ..." + ], + "_is_view": ["def _is_view(self) -> _bool: ..."], + "is_cuda": ["is_cuda: _bool"], + "is_leaf": ["is_leaf: _bool"], + "is_nested": ["is_nested: _bool"], + "is_sparse": ["is_sparse: _bool"], + "is_sparse_csr": ["is_sparse_csr: _bool"], + "is_quantized": ["is_quantized: _bool"], + "is_meta": ["is_meta: _bool"], + "is_ort": ["is_ort: _bool"], + "is_mkldnn": ["is_mkldnn: _bool"], + "is_vulkan": ["is_vulkan: _bool"], + "is_ipu": ["is_ipu: _bool"], + "storage_offset": ["def storage_offset(self) -> _int: ..."], + "to": [ + "def to(self, dtype: _dtype, non_blocking: _bool=False, copy: _bool=False) -> Tensor: ...", + "def to(self, device: Optional[Union[_device, str]]=None, dtype: Optional[_dtype]=None, " + "non_blocking: _bool=False, copy: _bool=False) -> Tensor: ...", + "def to(self, other: Tensor, non_blocking: _bool=False, copy: _bool=False) -> Tensor: ...", + ], + "item": ["def item(self) -> Number: ..."], + "copy_": [ + "def copy_(self, src: Tensor, non_blocking: _bool=False) -> Tensor: ..." + ], + "set_": [ + "def set_(self, storage: Union[Storage, _TypedStorage], offset: _int, size: _size, stride: _size) -> Tensor: ...", + "def set_(self, storage: Union[Storage, _TypedStorage]) -> Tensor: ...", + ], + "split": [ + "def split(self, split_size: _int, dim: _int=0) -> Sequence[Tensor]: ...", + "def split(self, split_size: Tuple[_int, ...], dim: _int=0) -> Sequence[Tensor]: ...", + ], + "div": [ + "def div(self, other: Union[Tensor, Number], *, rounding_mode: Optional[str] = None) -> Tensor: ..." + ], + "div_": [ + "def div_(self, other: Union[Tensor, Number], *, rounding_mode: Optional[str] = None) -> Tensor: ..." 
+ ], + } + ) + for binop in ["mul", "true_divide", "floor_divide"]: for inplace in [False, True]: - out_suffix = ', *, out: Optional[Tensor]=None' + out_suffix = ", *, out: Optional[Tensor]=None" if inplace: - binop += '_' - out_suffix = '' + binop += "_" + out_suffix = "" unsorted_tensor_method_hints[binop].append( - 'def {}(self, other: Union[Tensor, Number]{})' - ' -> Tensor: ...'.format(binop, out_suffix)) - for binop in ['add', 'sub']: + "def {}(self, other: Union[Tensor, Number]{})" + " -> Tensor: ...".format(binop, out_suffix) + ) + for binop in ["add", "sub"]: for inplace in [False, True]: - out_suffix = ', out: Optional[Tensor]=None' + out_suffix = ", out: Optional[Tensor]=None" if inplace: - binop += '_' - out_suffix = '' + binop += "_" + out_suffix = "" unsorted_tensor_method_hints[binop].append( - 'def {}(self, other: Union[Tensor, Number], ' - '*, alpha: Optional[Number]=1{})' - ' -> Tensor: ...'.format(binop, out_suffix)) - simple_conversions = ['byte', 'char', 'cpu', 'double', 'float', - 'half', 'int', 'long', 'short', 'bool', - 'bfloat16'] + "def {}(self, other: Union[Tensor, Number], " + "*, alpha: Optional[Number]=1{})" + " -> Tensor: ...".format(binop, out_suffix) + ) + simple_conversions = [ + "byte", + "char", + "cpu", + "double", + "float", + "half", + "int", + "long", + "short", + "bool", + "bfloat16", + ] for name in simple_conversions: - unsorted_tensor_method_hints[name].append('def {}(self) -> Tensor: ...'.format(name)) + unsorted_tensor_method_hints[name].append( + "def {}(self) -> Tensor: ...".format(name) + ) # pyi tensor methods don't currently include deprecated signatures for some reason # TODO: we should probably add them in - tensor_method_signatures = load_signatures(native_functions, deprecated_yaml_path, method=True, skip_deprecated=True, pyi=True) - tensor_method_sig_groups = get_py_torch_functions(tensor_method_signatures, method=True) + tensor_method_signatures = load_signatures( + native_functions, + deprecated_yaml_path, + method=True, + skip_deprecated=True, + pyi=True, + ) + tensor_method_sig_groups = get_py_torch_functions( + tensor_method_signatures, method=True + ) for group in sorted(tensor_method_sig_groups, key=lambda g: g.signature.name): name = group.signature.name unsorted_tensor_method_hints[name] += generate_type_hints(group) - named_tuple = group.signature.returns.named_tuple_pyi() + named_tuple = returns_named_tuple_pyi(group.signature) if named_tuple is not None and not group.signature.deprecated: # deprecated namedtuples are currently not included for torch functions tuple_name, tuple_def = named_tuple @@ -533,13 +765,13 @@ def gen_pyi(native_yaml_path: str, deprecated_yaml_path: str, fm: FileManager) - namedtuples[tuple_name] = tuple_def for op in all_ops: - name = '__{}__'.format(op) + name = "__{}__".format(op) unsorted_tensor_method_hints[name] += sig_for_ops(name) tensor_method_hints = [] for name, hints in sorted(unsorted_tensor_method_hints.items()): if len(hints) > 1: - hints = ['@overload\n' + h for h in hints] + hints = ["@overload\n" + h for h in hints] tensor_method_hints += hints # TODO: Missing type hints for nn @@ -547,92 +779,182 @@ def gen_pyi(native_yaml_path: str, deprecated_yaml_path: str, fm: FileManager) - # Generate namedtuple definitions # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - namedtuple_defs = ['{} = {}'.format(name, defn) for name, defn in namedtuples.items()] + namedtuple_defs = [ + "{} = {}".format(name, defn) for name, defn in namedtuples.items() + ] # Generate type signatures for legacy classes 
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # TODO: These are deprecated, maybe we shouldn't type hint them legacy_storage_base_hints = [] - dt = ('Double', 'Float', 'Long', 'Int', - 'Short', 'Char', 'Byte', 'Bool', - 'Half', 'BFloat16', 'ComplexDouble', - 'ComplexFloat', 'QUInt8', 'QInt8', 'QInt32', 'QUInt4x2', 'QUInt2x4') + dt = ( + "Double", + "Float", + "Long", + "Int", + "Short", + "Char", + "Byte", + "Bool", + "Half", + "BFloat16", + "ComplexDouble", + "ComplexFloat", + "QUInt8", + "QInt8", + "QInt32", + "QUInt4x2", + "QUInt2x4", + ) for c in dt: - legacy_storage_base_hints.append('class {}StorageBase(object): ...'.format(c)) + legacy_storage_base_hints.append("class {}StorageBase(object): ...".format(c)) for c in dt: - legacy_storage_base_hints.append('class Cuda{}StorageBase(object): ...'.format(c)) + legacy_storage_base_hints.append( + "class Cuda{}StorageBase(object): ...".format(c) + ) legacy_class_hints = [] - for c in ('DoubleTensor', 'FloatTensor', 'LongTensor', 'IntTensor', - 'ShortTensor', 'HalfTensor', 'CharTensor', 'ByteTensor', 'BoolTensor'): - legacy_class_hints.append('class {}(Tensor): ...'.format(c)) + for c in ( + "DoubleTensor", + "FloatTensor", + "LongTensor", + "IntTensor", + "ShortTensor", + "HalfTensor", + "CharTensor", + "ByteTensor", + "BoolTensor", + ): + legacy_class_hints.append("class {}(Tensor): ...".format(c)) # Generate type signatures for dtype classes # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # TODO: don't explicitly list dtypes here; get it from canonical # source - dtype_class_hints = ['{}: dtype = ...'.format(n) - for n in - ['float32', 'float', 'float64', 'double', 'float16', 'bfloat16', 'half', - 'uint8', 'int8', 'int16', 'short', 'int32', 'int', 'int64', 'long', - 'complex32', 'complex64', 'cfloat', 'complex128', 'cdouble', - 'quint8', 'qint8', 'qint32', 'bool', 'quint4x2', 'quint2x4']] + dtype_class_hints = [ + "{}: dtype = ...".format(n) + for n in [ + "float32", + "float", + "float64", + "double", + "float16", + "bfloat16", + "half", + "uint8", + "int8", + "int16", + "short", + "int32", + "int", + "int64", + "long", + "complex32", + "complex64", + "cfloat", + "complex128", + "cdouble", + "quint8", + "qint8", + "qint32", + "bool", + "quint4x2", + "quint2x4", + ] + ] # Generate __all__ directive # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Include only the functions that contain hints, to prevent undefined # symbols to be included in the `__all__` directive. 
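# Illustration only (not part of this diff): a minimal sketch of the pformat-based
# __all__ emission handled in the surrounding hunk, using a placeholder symbol list
# in place of the real hinted function and namedtuple names.
from pprint import pformat

all_symbols = sorted(["abs", "add", "arange", "zeros", "zeros_like"])  # placeholders
all_directive = pformat(all_symbols, width=100, compact=True).split("\n")
all_directive[0] = "__all__ = {}".format(all_directive[0])
# all_directive[0] is now: __all__ = ['abs', 'add', 'arange', 'zeros', 'zeros_like']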
- hinted_function_names = [name for name, hint in unsorted_function_hints.items() if hint] + hinted_function_names = [ + name for name, hint in unsorted_function_hints.items() if hint + ] all_symbols = sorted(list(namedtuples.keys()) + hinted_function_names) - all_directive = pformat(all_symbols, width=100, compact=True).split('\n') - all_directive[0] = '__all__ = {}'.format(all_directive[0]) + all_directive = pformat(all_symbols, width=100, compact=True).split("\n") + all_directive[0] = "__all__ = {}".format(all_directive[0]) # Write out the stub # ~~~~~~~~~~~~~~~~~~ env = { - 'namedtuple_defs': namedtuple_defs, - 'function_hints': function_hints, - 'tensor_method_hints': tensor_method_hints, - 'legacy_class_hints': legacy_class_hints, - 'legacy_storage_base_hints': legacy_storage_base_hints, - 'dtype_class_hints': dtype_class_hints, - 'all_directive': all_directive + "namedtuple_defs": namedtuple_defs, + "function_hints": function_hints, + "tensor_method_hints": tensor_method_hints, + "legacy_class_hints": legacy_class_hints, + "legacy_storage_base_hints": legacy_storage_base_hints, + "dtype_class_hints": dtype_class_hints, + "all_directive": all_directive, } - fm.write_with_template('torch/_C/__init__.pyi', 'torch/_C/__init__.pyi.in', lambda: { - 'generated_comment': '@' + 'generated from torch/_C/__init__.pyi.in', - **env, - }) - fm.write_with_template('torch/_C/_VariableFunctions.pyi', 'torch/_C/_VariableFunctions.pyi.in', lambda: { - 'generated_comment': '@' + 'generated from torch/_C/_VariableFunctions.pyi.in', - **env, - }) - fm.write_with_template('torch/_VF.pyi', 'torch/_C/_VariableFunctions.pyi.in', lambda: { - 'generated_comment': '@' + 'generated from torch/_C/_VariableFunctions.pyi.in', - **env, - }) + fm.write_with_template( + "torch/_C/__init__.pyi", + "torch/_C/__init__.pyi.in", + lambda: { + "generated_comment": "@" + "generated from torch/_C/__init__.pyi.in", + **env, + }, + ) + fm.write_with_template( + "torch/_C/_VariableFunctions.pyi", + "torch/_C/_VariableFunctions.pyi.in", + lambda: { + "generated_comment": "@" + + "generated from torch/_C/_VariableFunctions.pyi.in", + **env, + }, + ) + fm.write_with_template( + "torch/_VF.pyi", + "torch/_C/_VariableFunctions.pyi.in", + lambda: { + "generated_comment": "@" + + "generated from torch/_C/_VariableFunctions.pyi.in", + **env, + }, + ) + fm.write_with_template( + "torch/return_types.pyi", + "torch/_C/return_types.pyi.in", + lambda: { + "generated_comment": "@" + "generated from torch/_C/return_types.pyi", + **env, + }, + ) gen_nn_functional(fm) def main() -> None: - parser = argparse.ArgumentParser( - description='Generate type stubs for PyTorch') - parser.add_argument('--native-functions-path', metavar='NATIVE', - default='aten/src/ATen/native/native_functions.yaml', - help='path to native_functions.yaml') - parser.add_argument('--deprecated-functions-path', metavar='DEPRECATED', - default='tools/autograd/deprecated.yaml', - help='path to deprecated.yaml') - parser.add_argument('--out', metavar='OUT', - default='.', - help='path to output directory') + parser = argparse.ArgumentParser(description="Generate type stubs for PyTorch") + parser.add_argument( + "--native-functions-path", + metavar="NATIVE", + default="aten/src/ATen/native/native_functions.yaml", + help="path to native_functions.yaml", + ) + parser.add_argument( + "--tags-path", + metavar="TAGS", + default="aten/src/ATen/native/tags.yaml", + help="path to tags.yaml", + ) + parser.add_argument( + "--deprecated-functions-path", + metavar="DEPRECATED", + 
default="tools/autograd/deprecated.yaml", + help="path to deprecated.yaml", + ) + parser.add_argument( + "--out", metavar="OUT", default=".", help="path to output directory" + ) args = parser.parse_args() - fm = FileManager(install_dir=args.out, template_dir='.', dry_run=False) - gen_pyi(args.native_functions_path, args.deprecated_functions_path, fm) + fm = FileManager(install_dir=args.out, template_dir=".", dry_run=False) + gen_pyi( + args.native_functions_path, args.tags_path, args.deprecated_functions_path, fm + ) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/tools/render_junit.py b/tools/render_junit.py index 28e617af0e8f..68adadde0449 100644 --- a/tools/render_junit.py +++ b/tools/render_junit.py @@ -16,12 +16,15 @@ except ImportError: print("rich not found, for color output use 'pip install rich'") + def parse_junit_reports(path_to_reports: str) -> List[TestCase]: # type: ignore[no-any-unimported] def parse_file(path: str) -> List[TestCase]: # type: ignore[no-any-unimported] try: return convert_junit_to_testcases(JUnitXml.fromfile(path)) except Exception as err: - rich.print(f":Warning: [yellow]Warning[/yellow]: Failed to read {path}: {err}") + rich.print( + f":Warning: [yellow]Warning[/yellow]: Failed to read {path}: {err}" + ) return [] if not os.path.exists(path_to_reports): @@ -46,6 +49,7 @@ def convert_junit_to_testcases(xml: Union[JUnitXml, TestSuite]) -> List[TestCase testcases.append(item) return testcases + def render_tests(testcases: List[TestCase]) -> None: # type: ignore[no-any-unimported] num_passed = 0 num_skipped = 0 @@ -64,14 +68,15 @@ def render_tests(testcases: List[TestCase]) -> None: # type: ignore[no-any-unim else: num_skipped += 1 continue - rich.print(f"{icon} [bold red]{testcase.classname}.{testcase.name}[/bold red]") + rich.print( + f"{icon} [bold red]{testcase.classname}.{testcase.name}[/bold red]" + ) print(f"{result.text}") rich.print(f":white_check_mark: {num_passed} [green]Passed[green]") rich.print(f":dash: {num_skipped} [grey]Skipped[grey]") rich.print(f":rotating_light: {num_failed} [grey]Failed[grey]") - def parse_args() -> Any: parser = argparse.ArgumentParser( description="Render xunit output for failed tests", diff --git a/tools/setup_helpers/BUILD.bazel b/tools/setup_helpers/BUILD.bazel new file mode 100644 index 000000000000..28dcd1b5b47c --- /dev/null +++ b/tools/setup_helpers/BUILD.bazel @@ -0,0 +1,15 @@ +py_binary( + name = "generate_code", + srcs = ["generate_code.py"], + deps = [ + "//tools/autograd", + "//torchgen", + ], + visibility = ["//:__pkg__"], +) + +py_binary( + name = "gen_version_header", + srcs = ["gen_version_header.py"], + visibility = ["//:__pkg__"], +) diff --git a/tools/setup_helpers/BUILD.buck b/tools/setup_helpers/BUILD.buck new file mode 100644 index 000000000000..afcd31fb3a03 --- /dev/null +++ b/tools/setup_helpers/BUILD.buck @@ -0,0 +1,41 @@ +python_library( + name = "generate_code", + srcs = [ + "generate_code.py", + ], + base_module = "tools.setup_helpers", + deps = [ + "//tools/autograd:autograd", + "//tools/jit:jit", + "//torchgen:torchgen", + ], +) + +python_binary( + name = "generate_code_bin", + main_module = "tools.setup_helpers.generate_code", + visibility = ["PUBLIC"], + # package_style = "inplace", + zip_safe = False, + deps = [ + ":generate_code", + ], +) + +python_library( + name = "gen-version-header-lib", + srcs = [ + "gen_version_header.py", + ], + base_module = "tools.setup_helpers", + deps = [], +) + +python_binary( + name = "gen-version-header", + main_module = 
"tools.setup_helpers.gen_version_header", + visibility = ["PUBLIC"], + deps = [ + ":gen-version-header-lib", + ], +) diff --git a/tools/setup_helpers/__init__.py b/tools/setup_helpers/__init__.py index fa892dfb6e6f..4bf1747e80c6 100644 --- a/tools/setup_helpers/__init__.py +++ b/tools/setup_helpers/__init__.py @@ -8,8 +8,8 @@ def which(thefile: str) -> Optional[str]: for d in path: fname = os.path.join(d, thefile) fnames = [fname] - if sys.platform == 'win32': - exts = os.environ.get('PATHEXT', '').split(os.pathsep) + if sys.platform == "win32": + exts = os.environ.get("PATHEXT", "").split(os.pathsep) fnames += [fname + ext for ext in exts] for name in fnames: if os.access(name, os.F_OK | os.X_OK) and not os.path.isdir(name): diff --git a/tools/setup_helpers/build.bzl b/tools/setup_helpers/build.bzl new file mode 100644 index 000000000000..c5be13e4603b --- /dev/null +++ b/tools/setup_helpers/build.bzl @@ -0,0 +1,17 @@ +def define_targets(rules): + rules.py_binary( + name = "generate_code", + srcs = ["generate_code.py"], + visibility = ["//:__pkg__"], + deps = [ + rules.requirement("PyYAML"), + "//tools/autograd", + "//torchgen", + ], + ) + + rules.py_binary( + name = "gen_version_header", + srcs = ["gen_version_header.py"], + visibility = ["//:__pkg__"], + ) diff --git a/tools/setup_helpers/cmake.py b/tools/setup_helpers/cmake.py index ff175771fd18..2c48e2807cdf 100644 --- a/tools/setup_helpers/cmake.py +++ b/tools/setup_helpers/cmake.py @@ -1,9 +1,9 @@ "Manages CMake." - import multiprocessing import os +import platform import re from subprocess import check_call, check_output, CalledProcessError import sys @@ -12,7 +12,7 @@ from typing import IO, Any, Dict, List, Optional, Union, cast from . import which -from .env import (BUILD_DIR, IS_64BIT, IS_DARWIN, IS_WINDOWS, check_negative_env_flag) +from .env import BUILD_DIR, IS_64BIT, IS_DARWIN, IS_WINDOWS, check_negative_env_flag from .numpy_ import USE_NUMPY, NUMPY_INCLUDE_DIR @@ -20,20 +20,23 @@ def _mkdir_p(d: str) -> None: try: os.makedirs(d, exist_ok=True) except OSError as e: - raise RuntimeError(f"Failed to create folder {os.path.abspath(d)}: {e.strerror}") from e + raise RuntimeError( + f"Failed to create folder {os.path.abspath(d)}: {e.strerror}" + ) from e # Ninja # Use ninja if it is on the PATH. Previous version of PyTorch required the # ninja python package, but we no longer use it, so we do not have to import it -USE_NINJA = (not check_negative_env_flag('USE_NINJA') and - which('ninja') is not None) +USE_NINJA = not check_negative_env_flag("USE_NINJA") and which("ninja") is not None CMakeValue = Optional[Union[bool, str]] -def convert_cmake_value_to_python_value(cmake_value: str, cmake_type: str) -> CMakeValue: +def convert_cmake_value_to_python_value( + cmake_value: str, cmake_type: str +) -> CMakeValue: r"""Convert a CMake value in a string form to a Python value. 
Args: @@ -46,18 +49,24 @@ def convert_cmake_value_to_python_value(cmake_value: str, cmake_type: str) -> CM cmake_type = cmake_type.upper() up_val = cmake_value.upper() - if cmake_type == 'BOOL': + if cmake_type == "BOOL": # https://gitlab.kitware.com/cmake/community/wikis/doc/cmake/VariablesListsStrings#boolean-values-in-cmake - return not (up_val in ('FALSE', 'OFF', 'N', 'NO', '0', '', 'NOTFOUND') or up_val.endswith('-NOTFOUND')) - elif cmake_type == 'FILEPATH': - if up_val.endswith('-NOTFOUND'): + return not ( + up_val in ("FALSE", "OFF", "N", "NO", "0", "", "NOTFOUND") + or up_val.endswith("-NOTFOUND") + ) + elif cmake_type == "FILEPATH": + if up_val.endswith("-NOTFOUND"): return None else: return cmake_value else: # Directly return the cmake_value. return cmake_value -def get_cmake_cache_variables_from_file(cmake_cache_file: IO[str]) -> Dict[str, CMakeValue]: + +def get_cmake_cache_variables_from_file( + cmake_cache_file: IO[str], +) -> Dict[str, CMakeValue]: r"""Gets values in CMakeCache.txt into a dictionary. Args: @@ -69,7 +78,7 @@ def get_cmake_cache_variables_from_file(cmake_cache_file: IO[str]) -> Dict[str, results = dict() for i, line in enumerate(cmake_cache_file, 1): line = line.strip() - if not line or line.startswith(('#', '//')): + if not line or line.startswith(("#", "//")): # Blank or comment line, skip continue @@ -82,19 +91,24 @@ def get_cmake_cache_variables_from_file(cmake_cache_file: IO[str]) -> Dict[str, # USE_CUDA:=ON # Intel(R) MKL-DNN_SOURCE_DIR:STATIC=/path/to/pytorch/third_party/ideep/mkl-dnn # "OpenMP_COMPILE_RESULT_CXX_openmp:experimental":INTERNAL=FALSE - matched = re.match(r'("?)(.+?)\1(?::\s*([a-zA-Z_-][a-zA-Z0-9_-]*)?)?\s*=\s*(.*)', line) + matched = re.match( + r'("?)(.+?)\1(?::\s*([a-zA-Z_-][a-zA-Z0-9_-]*)?)?\s*=\s*(.*)', line + ) if matched is None: # Illegal line - raise ValueError('Unexpected line {} in {}: {}'.format(i, repr(cmake_cache_file), line)) + raise ValueError( + "Unexpected line {} in {}: {}".format(i, repr(cmake_cache_file), line) + ) _, variable, type_, value = matched.groups() if type_ is None: - type_ = '' - if type_.upper() in ('INTERNAL', 'STATIC'): + type_ = "" + if type_.upper() in ("INTERNAL", "STATIC"): # CMake internal variable, do not touch continue results[variable] = convert_cmake_value_to_python_value(value, type_) return results + class CMake: "Manages cmake." @@ -109,31 +123,36 @@ def _cmake_cache_file(self) -> str: Returns: string: The path to CMakeCache.txt. """ - return os.path.join(self.build_dir, 'CMakeCache.txt') + return os.path.join(self.build_dir, "CMakeCache.txt") @staticmethod def _get_cmake_command() -> str: "Returns cmake command." 
- cmake_command = 'cmake' + cmake_command = "cmake" if IS_WINDOWS: return cmake_command - cmake3_version = CMake._get_version(which('cmake3')) - cmake_version = CMake._get_version(which('cmake')) + cmake3_version = CMake._get_version(which("cmake3")) + cmake_version = CMake._get_version(which("cmake")) _cmake_min_version = LooseVersion("3.10.0") - if all((ver is None or ver < _cmake_min_version for ver in [cmake_version, cmake3_version])): - raise RuntimeError('no cmake or cmake3 with version >= 3.10.0 found') + if all( + ( + ver is None or ver < _cmake_min_version + for ver in [cmake_version, cmake3_version] + ) + ): + raise RuntimeError("no cmake or cmake3 with version >= 3.10.0 found") if cmake3_version is None: - cmake_command = 'cmake' + cmake_command = "cmake" elif cmake_version is None: - cmake_command = 'cmake3' + cmake_command = "cmake3" else: if cmake3_version >= cmake_version: - cmake_command = 'cmake3' + cmake_command = "cmake3" else: - cmake_command = 'cmake' + cmake_command = "cmake" return cmake_command @staticmethod @@ -142,16 +161,16 @@ def _get_version(cmd: Optional[str]) -> Any: if cmd is None: return None - for line in check_output([cmd, '--version']).decode('utf-8').split('\n'): - if 'version' in line: - return LooseVersion(line.strip().split(' ')[2]) - raise RuntimeError('no version found') + for line in check_output([cmd, "--version"]).decode("utf-8").split("\n"): + if "version" in line: + return LooseVersion(line.strip().split(" ")[2]) + raise RuntimeError("no version found") def run(self, args: List[str], env: Dict[str, str]) -> None: "Executes cmake with arguments and an environment." command = [self._cmake_command] + args - print(' '.join(command)) + print(" ".join(command)) try: check_call(command, cwd=self.build_dir, env=env) except (CalledProcessError, KeyboardInterrupt) as e: @@ -165,7 +184,7 @@ def defines(args: List[str], **kwargs: CMakeValue) -> None: "Adds definitions to a cmake argument list." for key, value in sorted(kwargs.items()): if value is not None: - args.append('-D{}={}'.format(key, value)) + args.append("-D{}={}".format(key, value)) def get_cmake_cache_variables(self) -> Dict[str, CMakeValue]: r"""Gets values in CMakeCache.txt into a dictionary. @@ -189,45 +208,54 @@ def generate( if rerun and os.path.isfile(self._cmake_cache_file): os.remove(self._cmake_cache_file) - ninja_build_file = os.path.join(self.build_dir, 'build.ninja') + ninja_build_file = os.path.join(self.build_dir, "build.ninja") if os.path.exists(self._cmake_cache_file) and not ( - USE_NINJA and not os.path.exists(ninja_build_file)): + USE_NINJA and not os.path.exists(ninja_build_file) + ): # Everything's in place. Do not rerun. 
return args = [] if USE_NINJA: # Avoid conflicts in '-G' and the `CMAKE_GENERATOR` - os.environ['CMAKE_GENERATOR'] = 'Ninja' - args.append('-GNinja') + os.environ["CMAKE_GENERATOR"] = "Ninja" + args.append("-GNinja") elif IS_WINDOWS: - generator = os.getenv('CMAKE_GENERATOR', 'Visual Studio 15 2017') - supported = ['Visual Studio 15 2017', 'Visual Studio 16 2019'] + generator = os.getenv("CMAKE_GENERATOR", "Visual Studio 15 2017") + supported = ["Visual Studio 15 2017", "Visual Studio 16 2019"] if generator not in supported: - print('Unsupported `CMAKE_GENERATOR`: ' + generator) - print('Please set it to one of the following values: ') - print('\n'.join(supported)) + print("Unsupported `CMAKE_GENERATOR`: " + generator) + print("Please set it to one of the following values: ") + print("\n".join(supported)) sys.exit(1) - args.append('-G' + generator) + args.append("-G" + generator) toolset_dict = {} - toolset_version = os.getenv('CMAKE_GENERATOR_TOOLSET_VERSION') + toolset_version = os.getenv("CMAKE_GENERATOR_TOOLSET_VERSION") if toolset_version is not None: - toolset_dict['version'] = toolset_version - curr_toolset = os.getenv('VCToolsVersion') + toolset_dict["version"] = toolset_version + curr_toolset = os.getenv("VCToolsVersion") if curr_toolset is None: - print('When you specify `CMAKE_GENERATOR_TOOLSET_VERSION`, you must also ' - 'activate the vs environment of this version. Please read the notes ' - 'in the build steps carefully.') + print( + "When you specify `CMAKE_GENERATOR_TOOLSET_VERSION`, you must also " + "activate the vs environment of this version. Please read the notes " + "in the build steps carefully." + ) sys.exit(1) if IS_64BIT: - args.append('-Ax64') - toolset_dict['host'] = 'x64' + if platform.machine() == "ARM64": + args.append("-A ARM64") + else: + args.append("-Ax64") + toolset_dict["host"] = "x64" if toolset_dict: - toolset_expr = ','.join(["{}={}".format(k, v) for k, v in toolset_dict.items()]) - args.append('-T' + toolset_expr) - - base_dir = os.path.dirname(os.path.dirname(os.path.dirname( - os.path.abspath(__file__)))) + toolset_expr = ",".join( + ["{}={}".format(k, v) for k, v in toolset_dict.items()] + ) + args.append("-T" + toolset_expr) + + base_dir = os.path.dirname( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + ) install_dir = os.path.join(base_dir, "torch") _mkdir_p(install_dir) @@ -242,50 +270,54 @@ def generate( # Key: environment variable name. Value: Corresponding variable name to be passed to CMake. If you are # adding a new build option to this block: Consider making these two names identical and adding this option # in the block below. - '_GLIBCXX_USE_CXX11_ABI': 'GLIBCXX_USE_CXX11_ABI', - 'CUDNN_LIB_DIR': 'CUDNN_LIBRARY', - 'USE_CUDA_STATIC_LINK': 'CAFFE2_STATIC_LINK_CUDA', + "_GLIBCXX_USE_CXX11_ABI": "GLIBCXX_USE_CXX11_ABI", + "CUDNN_LIB_DIR": "CUDNN_LIBRARY", + "USE_CUDA_STATIC_LINK": "CAFFE2_STATIC_LINK_CUDA", } - additional_options.update({ - # Build options that have the same environment variable name and CMake variable name and that do not start - # with "BUILD_", "USE_", or "CMAKE_". If you are adding a new build option, also make sure you add it to - # CMakeLists.txt. 
- var: var for var in - ('BLAS', - 'BUILDING_WITH_TORCH_LIBS', - 'CUDA_HOST_COMILER', - 'CUDA_NVCC_EXECUTABLE', - 'CUDA_SEPARABLE_COMPILATION', - 'CUDNN_LIBRARY', - 'CUDNN_INCLUDE_DIR', - 'CUDNN_ROOT', - 'EXPERIMENTAL_SINGLE_THREAD_POOL', - 'INSTALL_TEST', - 'JAVA_HOME', - 'INTEL_MKL_DIR', - 'INTEL_OMP_DIR', - 'MKL_THREADING', - 'MKLDNN_CPU_RUNTIME', - 'MSVC_Z7_OVERRIDE', - 'CAFFE2_USE_MSVC_STATIC_RUNTIME', - 'Numa_INCLUDE_DIR', - 'Numa_LIBRARIES', - 'ONNX_ML', - 'ONNX_NAMESPACE', - 'ATEN_THREADING', - 'WERROR', - 'OPENSSL_ROOT_DIR') - }) + additional_options.update( + { + # Build options that have the same environment variable name and CMake variable name and that do not start + # with "BUILD_", "USE_", or "CMAKE_". If you are adding a new build option, also make sure you add it to + # CMakeLists.txt. + var: var + for var in ( + "BLAS", + "BUILDING_WITH_TORCH_LIBS", + "CUDA_HOST_COMILER", + "CUDA_NVCC_EXECUTABLE", + "CUDA_SEPARABLE_COMPILATION", + "CUDNN_LIBRARY", + "CUDNN_INCLUDE_DIR", + "CUDNN_ROOT", + "EXPERIMENTAL_SINGLE_THREAD_POOL", + "INSTALL_TEST", + "JAVA_HOME", + "INTEL_MKL_DIR", + "INTEL_OMP_DIR", + "MKL_THREADING", + "MKLDNN_CPU_RUNTIME", + "MSVC_Z7_OVERRIDE", + "CAFFE2_USE_MSVC_STATIC_RUNTIME", + "Numa_INCLUDE_DIR", + "Numa_LIBRARIES", + "ONNX_ML", + "ONNX_NAMESPACE", + "ATEN_THREADING", + "WERROR", + "OPENSSL_ROOT_DIR", + "STATIC_DISPATCH_BACKEND", + ) + } + ) # Aliases which are lower priority than their canonical option low_priority_aliases = { - 'CUDA_HOST_COMPILER': 'CMAKE_CUDA_HOST_COMPILER', - 'CUDAHOSTCXX': 'CUDA_HOST_COMPILER', - 'CMAKE_CUDA_HOST_COMPILER': 'CUDA_HOST_COMPILER', - 'CMAKE_CUDA_COMPILER': 'CUDA_NVCC_EXECUTABLE', - 'CUDACXX': 'CUDA_NVCC_EXECUTABLE' + "CUDA_HOST_COMPILER": "CMAKE_CUDA_HOST_COMPILER", + "CUDAHOSTCXX": "CUDA_HOST_COMPILER", + "CMAKE_CUDA_HOST_COMPILER": "CUDA_HOST_COMPILER", + "CMAKE_CUDA_COMPILER": "CUDA_NVCC_EXECUTABLE", + "CUDACXX": "CUDA_NVCC_EXECUTABLE", } - for var, val in my_env.items(): # We currently pass over all environment variables that start with "BUILD_", "USE_", and "CMAKE_". This is # because we currently have no reliable way to get the list of all build options we have specified in @@ -295,7 +327,9 @@ def generate( true_var = additional_options.get(var) if true_var is not None: build_options[true_var] = val - elif var.startswith(('BUILD_', 'USE_', 'CMAKE_')) or var.endswith(('EXITCODE', 'EXITCODE__TRYRUN_OUTPUT')): + elif var.startswith(("BUILD_", "USE_", "CMAKE_")) or var.endswith( + ("EXITCODE", "EXITCODE__TRYRUN_OUTPUT") + ): build_options[var] = val if var in low_priority_aliases: @@ -304,68 +338,81 @@ def generate( build_options[key] = val # The default value cannot be easily obtained in CMakeLists.txt. We set it here. - py_lib_path = sysconfig.get_path('purelib') - cmake_prefix_path = build_options.get('CMAKE_PREFIX_PATH', None) + py_lib_path = sysconfig.get_path("purelib") + cmake_prefix_path = build_options.get("CMAKE_PREFIX_PATH", None) if cmake_prefix_path: build_options["CMAKE_PREFIX_PATH"] = ( - cast(str, py_lib_path) + ";" + cast(str, cmake_prefix_path) + py_lib_path + ";" + cast(str, cmake_prefix_path) ) else: - build_options['CMAKE_PREFIX_PATH'] = py_lib_path + build_options["CMAKE_PREFIX_PATH"] = py_lib_path # Some options must be post-processed. Ideally, this list will be shrunk to only one or two options in the # future, as CMake can detect many of these libraries pretty comfortably. We have them here for now before CMake # integration is completed. 
They appear here not in the CMake.defines call below because they start with either # "BUILD_" or "USE_" and must be overwritten here. - build_options.update({ - # Note: Do not add new build options to this dict if it is directly read from environment variable -- you - # only need to add one in `CMakeLists.txt`. All build options that start with "BUILD_", "USE_", or "CMAKE_" - # are automatically passed to CMake; For other options you can add to additional_options above. - 'BUILD_PYTHON': build_python, - 'BUILD_TEST': build_test, - # Most library detection should go to CMake script, except this one, which Python can do a much better job - # due to NumPy's inherent Pythonic nature. - 'USE_NUMPY': USE_NUMPY, - }) + build_options.update( + { + # Note: Do not add new build options to this dict if it is directly read from environment variable -- you + # only need to add one in `CMakeLists.txt`. All build options that start with "BUILD_", "USE_", or "CMAKE_" + # are automatically passed to CMake; For other options you can add to additional_options above. + "BUILD_PYTHON": build_python, + "BUILD_TEST": build_test, + # Most library detection should go to CMake script, except this one, which Python can do a much better job + # due to NumPy's inherent Pythonic nature. + "USE_NUMPY": USE_NUMPY, + } + ) # Options starting with CMAKE_ cmake__options = { - 'CMAKE_INSTALL_PREFIX': install_dir, + "CMAKE_INSTALL_PREFIX": install_dir, } # We set some CMAKE_* options in our Python build code instead of relying on the user's direct settings. Emit an # error if the user also attempts to set these CMAKE options directly. specified_cmake__options = set(build_options).intersection(cmake__options) if len(specified_cmake__options) > 0: - print(', '.join(specified_cmake__options) + - ' should not be specified in the environment variable. They are directly set by PyTorch build script.') + print( + ", ".join(specified_cmake__options) + + " should not be specified in the environment variable. They are directly set by PyTorch build script." 
+ ) sys.exit(1) build_options.update(cmake__options) - CMake.defines(args, - PYTHON_EXECUTABLE=sys.executable, - PYTHON_LIBRARY=cmake_python_library, - PYTHON_INCLUDE_DIR=sysconfig.get_path('include'), - TORCH_BUILD_VERSION=version, - NUMPY_INCLUDE_DIR=NUMPY_INCLUDE_DIR, - **build_options) - - expected_wrapper = '/usr/local/opt/ccache/libexec' + CMake.defines( + args, + PYTHON_EXECUTABLE=sys.executable, + PYTHON_LIBRARY=cmake_python_library, + PYTHON_INCLUDE_DIR=sysconfig.get_path("include"), + TORCH_BUILD_VERSION=version, + NUMPY_INCLUDE_DIR=NUMPY_INCLUDE_DIR, + **build_options, + ) + + expected_wrapper = "/usr/local/opt/ccache/libexec" if IS_DARWIN and os.path.exists(expected_wrapper): - if 'CMAKE_C_COMPILER' not in build_options and 'CC' not in os.environ: + if "CMAKE_C_COMPILER" not in build_options and "CC" not in os.environ: CMake.defines(args, CMAKE_C_COMPILER="{}/gcc".format(expected_wrapper)) - if 'CMAKE_CXX_COMPILER' not in build_options and 'CXX' not in os.environ: - CMake.defines(args, CMAKE_CXX_COMPILER="{}/g++".format(expected_wrapper)) + if "CMAKE_CXX_COMPILER" not in build_options and "CXX" not in os.environ: + CMake.defines( + args, CMAKE_CXX_COMPILER="{}/g++".format(expected_wrapper) + ) for env_var_name in my_env: - if env_var_name.startswith('gh'): + if env_var_name.startswith("gh"): # github env vars use utf-8, on windows, non-ascii code may # cause problem, so encode first try: my_env[env_var_name] = str(my_env[env_var_name].encode("utf-8")) except UnicodeDecodeError as e: - shex = ':'.join('{:02x}'.format(ord(c)) for c in my_env[env_var_name]) - print('Invalid ENV[{}] = {}'.format(env_var_name, shex), file=sys.stderr) + shex = ":".join( + "{:02x}".format(ord(c)) for c in my_env[env_var_name] + ) + print( + "Invalid ENV[{}] = {}".format(env_var_name, shex), + file=sys.stderr, + ) print(e, file=sys.stderr) # According to the CMake manual, we should pass the arguments first, # and put the directory as the last element. Otherwise, these flags @@ -381,7 +428,14 @@ def build(self, my_env: Dict[str, str]) -> None: from .env import build_type - build_args = ['--build', '.', '--target', 'install', '--config', build_type.build_type_string] + build_args = [ + "--build", + ".", + "--target", + "install", + "--config", + build_type.build_type_string, + ] # Determine the parallelism according to the following # priorities: @@ -391,7 +445,7 @@ def build(self, my_env: Dict[str, str]) -> None: # Allow the user to set parallelism explicitly. If unset, # we'll try to figure it out. - max_jobs = os.getenv('MAX_JOBS') + max_jobs = os.getenv("MAX_JOBS") if max_jobs is not None or not USE_NINJA: # Ninja is capable of figuring out the parallelism on its @@ -410,10 +464,10 @@ def build(self, my_env: Dict[str, str]) -> None: # build_args += ['-j', max_jobs] would be sufficient by # then. Until then, we use "--" to pass parameters to the # underlying build system. 
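# Illustration only (not part of this diff): a minimal sketch, assuming MAX_JOBS=8 and a
# Release configuration, of the invocation that build() assembles here before handing the
# parallelism flag past "--" to the underlying build tool (the msbuild variant appears in
# the lines that follow).
cmake_command = "cmake"
build_args = ["--build", ".", "--target", "install", "--config", "Release"]
build_args += ["--", "-j", "8"]  # msbuild without ninja would instead get ["--", "/p:CL_MPCount=8"]
print(" ".join([cmake_command] + build_args))
# -> cmake --build . --target install --config Release -- -j 8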
- build_args += ['--'] + build_args += ["--"] if IS_WINDOWS and not USE_NINJA: # We are likely using msbuild here - build_args += ['/p:CL_MPCount={}'.format(max_jobs)] + build_args += ["/p:CL_MPCount={}".format(max_jobs)] else: - build_args += ['-j', max_jobs] + build_args += ["-j", max_jobs] self.run(build_args, my_env) diff --git a/tools/setup_helpers/env.py b/tools/setup_helpers/env.py index d658acdb8d52..bf693cacc381 100644 --- a/tools/setup_helpers/env.py +++ b/tools/setup_helpers/env.py @@ -6,37 +6,41 @@ from typing import Iterable, List, Optional, cast -IS_WINDOWS = (platform.system() == 'Windows') -IS_DARWIN = (platform.system() == 'Darwin') -IS_LINUX = (platform.system() == 'Linux') +IS_WINDOWS = platform.system() == "Windows" +IS_DARWIN = platform.system() == "Darwin" +IS_LINUX = platform.system() == "Linux" -IS_CONDA = 'conda' in sys.version or 'Continuum' in sys.version or any([x.startswith('CONDA') for x in os.environ]) -CONDA_DIR = os.path.join(os.path.dirname(sys.executable), '..') +IS_CONDA = ( + "conda" in sys.version + or "Continuum" in sys.version + or any([x.startswith("CONDA") for x in os.environ]) +) +CONDA_DIR = os.path.join(os.path.dirname(sys.executable), "..") -IS_64BIT = (struct.calcsize("P") == 8) +IS_64BIT = struct.calcsize("P") == 8 -BUILD_DIR = 'build' +BUILD_DIR = "build" -def check_env_flag(name: str, default: str = '') -> bool: - return os.getenv(name, default).upper() in ['ON', '1', 'YES', 'TRUE', 'Y'] +def check_env_flag(name: str, default: str = "") -> bool: + return os.getenv(name, default).upper() in ["ON", "1", "YES", "TRUE", "Y"] -def check_negative_env_flag(name: str, default: str = '') -> bool: - return os.getenv(name, default).upper() in ['OFF', '0', 'NO', 'FALSE', 'N'] +def check_negative_env_flag(name: str, default: str = "") -> bool: + return os.getenv(name, default).upper() in ["OFF", "0", "NO", "FALSE", "N"] def gather_paths(env_vars: Iterable[str]) -> List[str]: - return list(chain(*(os.getenv(v, '').split(os.pathsep) for v in env_vars))) + return list(chain(*(os.getenv(v, "").split(os.pathsep) for v in env_vars))) def lib_paths_from_base(base_path: str) -> List[str]: - return [os.path.join(base_path, s) for s in ['lib/x64', 'lib', 'lib64']] + return [os.path.join(base_path, s) for s in ["lib/x64", "lib", "lib64"]] # We promised that CXXFLAGS should also be affected by CFLAGS -if 'CFLAGS' in os.environ and 'CXXFLAGS' not in os.environ: - os.environ['CXXFLAGS'] = os.environ['CFLAGS'] +if "CFLAGS" in os.environ and "CXXFLAGS" not in os.environ: + os.environ["CXXFLAGS"] = os.environ["CFLAGS"] class BuildType(object): @@ -55,39 +59,40 @@ def __init__(self, cmake_build_type_env: Optional[str] = None) -> None: self.build_type_string = cmake_build_type_env return - cmake_cache_txt = os.path.join(BUILD_DIR, 'CMakeCache.txt') + cmake_cache_txt = os.path.join(BUILD_DIR, "CMakeCache.txt") if os.path.isfile(cmake_cache_txt): # Found CMakeCache.txt. Use the build type specified in it. from .cmake import get_cmake_cache_variables_from_file + with open(cmake_cache_txt) as f: cmake_cache_vars = get_cmake_cache_variables_from_file(f) # Normally it is anti-pattern to determine build type from CMAKE_BUILD_TYPE because it is not used for # multi-configuration build tools, such as Visual Studio and XCode. But since we always communicate with # CMake using CMAKE_BUILD_TYPE from our Python scripts, this is OK here. 
- self.build_type_string = cast(str, cmake_cache_vars['CMAKE_BUILD_TYPE']) + self.build_type_string = cast(str, cmake_cache_vars["CMAKE_BUILD_TYPE"]) else: - self.build_type_string = os.environ.get('CMAKE_BUILD_TYPE', 'Release') + self.build_type_string = os.environ.get("CMAKE_BUILD_TYPE", "Release") def is_debug(self) -> bool: "Checks Debug build." - return self.build_type_string == 'Debug' + return self.build_type_string == "Debug" def is_rel_with_deb_info(self) -> bool: "Checks RelWithDebInfo build." - return self.build_type_string == 'RelWithDebInfo' + return self.build_type_string == "RelWithDebInfo" def is_release(self) -> bool: "Checks Release build." - return self.build_type_string == 'Release' + return self.build_type_string == "Release" # hotpatch environment variable 'CMAKE_BUILD_TYPE'. 'CMAKE_BUILD_TYPE' always prevails over DEBUG or REL_WITH_DEB_INFO. -if 'CMAKE_BUILD_TYPE' not in os.environ: - if check_env_flag('DEBUG'): - os.environ['CMAKE_BUILD_TYPE'] = 'Debug' - elif check_env_flag('REL_WITH_DEB_INFO'): - os.environ['CMAKE_BUILD_TYPE'] = 'RelWithDebInfo' +if "CMAKE_BUILD_TYPE" not in os.environ: + if check_env_flag("DEBUG"): + os.environ["CMAKE_BUILD_TYPE"] = "Debug" + elif check_env_flag("REL_WITH_DEB_INFO"): + os.environ["CMAKE_BUILD_TYPE"] = "RelWithDebInfo" else: - os.environ['CMAKE_BUILD_TYPE'] = 'Release' + os.environ["CMAKE_BUILD_TYPE"] = "Release" build_type = BuildType() diff --git a/tools/setup_helpers/gen.py b/tools/setup_helpers/gen.py index bdb52ee44efb..3ca9a8787906 100644 --- a/tools/setup_helpers/gen.py +++ b/tools/setup_helpers/gen.py @@ -6,6 +6,6 @@ root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.path.insert(0, root) -import tools.codegen.gen +import torchgen.gen -tools.codegen.gen.main() +torchgen.gen.main() diff --git a/tools/setup_helpers/gen_unboxing.py b/tools/setup_helpers/gen_unboxing.py new file mode 100644 index 000000000000..d2883f6d1e48 --- /dev/null +++ b/tools/setup_helpers/gen_unboxing.py @@ -0,0 +1,11 @@ +# Little stub file to get BUILD.bazel to play along + +import os.path +import sys + +root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +sys.path.insert(0, root) + +import tools.jit.gen_unboxing + +tools.jit.gen_unboxing.main() diff --git a/tools/setup_helpers/gen_version_header.py b/tools/setup_helpers/gen_version_header.py index 963db1dad1f1..bd576af6f111 100644 --- a/tools/setup_helpers/gen_version_header.py +++ b/tools/setup_helpers/gen_version_header.py @@ -76,7 +76,9 @@ def main(args: argparse.Namespace) -> None: help="Path to the template (i.e. 
version.h.in)", ) parser.add_argument( - "--version-path", required=True, help="Path to the file specifying the version", + "--version-path", + required=True, + help="Path to the file specifying the version", ) parser.add_argument( "--output-path", diff --git a/tools/setup_helpers/generate_code.py b/tools/setup_helpers/generate_code.py index ef90acc3935a..4440e6c2e0a2 100644 --- a/tools/setup_helpers/generate_code.py +++ b/tools/setup_helpers/generate_code.py @@ -1,8 +1,9 @@ import argparse import os +import pathlib import sys import yaml -from typing import Any, List, Optional, cast +from typing import Any, Optional, cast try: # use faster C loader if available @@ -10,56 +11,42 @@ except ImportError: from yaml import SafeLoader as YamlLoader # type: ignore[misc] -source_files = {'.py', '.cpp', '.h'} - -NATIVE_FUNCTIONS_PATH = 'aten/src/ATen/native/native_functions.yaml' - -# TODO: This is a little inaccurate, because it will also pick -# up setup_helper scripts which don't affect code generation -def all_generator_source() -> List[str]: - r = [] - for directory, _, filenames in os.walk('tools'): - for f in filenames: - if os.path.splitext(f)[1] in source_files: - full = os.path.join(directory, f) - r.append(full) - return sorted(r) - - -def generate_code(ninja_global: Optional[str] = None, - nn_path: Optional[str] = None, - native_functions_path: Optional[str] = None, - install_dir: Optional[str] = None, - subset: Optional[str] = None, - disable_autograd: bool = False, - force_schema_registration: bool = False, - operator_selector: Any = None) -> None: +NATIVE_FUNCTIONS_PATH = "aten/src/ATen/native/native_functions.yaml" +TAGS_PATH = "aten/src/ATen/native/tags.yaml" + + +def generate_code( + gen_dir: pathlib.Path, + native_functions_path: Optional[str] = None, + tags_path: Optional[str] = None, + install_dir: Optional[str] = None, + subset: Optional[str] = None, + disable_autograd: bool = False, + force_schema_registration: bool = False, + operator_selector: Any = None, +) -> None: from tools.autograd.gen_autograd import gen_autograd, gen_autograd_python from tools.autograd.gen_annotated_fn_args import gen_annotated - from tools.codegen.selective_build.selector import SelectiveBuilder - + from torchgen.selective_build.selector import SelectiveBuilder # Build ATen based Variable classes if install_dir is None: - install_dir = 'torch/csrc' - python_install_dir = 'torch/testing/_internal/generated' + install_dir = os.fspath(gen_dir / "torch/csrc") + python_install_dir = os.fspath(gen_dir / "torch/testing/_internal/generated") else: python_install_dir = install_dir - autograd_gen_dir = os.path.join(install_dir, 'autograd', 'generated') - jit_gen_dir = os.path.join(install_dir, 'jit', 'generated') - for d in (autograd_gen_dir, jit_gen_dir, python_install_dir): - if not os.path.exists(d): - os.makedirs(d) - runfiles_dir = os.environ.get("RUNFILES_DIR", None) - data_dir = os.path.join(runfiles_dir, 'pytorch') if runfiles_dir else '' - autograd_dir = os.path.join(data_dir, 'tools', 'autograd') - tools_jit_templates = os.path.join(data_dir, 'tools', 'jit', 'templates') + autograd_gen_dir = os.path.join(install_dir, "autograd", "generated") + for d in (autograd_gen_dir, python_install_dir): + os.makedirs(d, exist_ok=True) + autograd_dir = os.fspath(pathlib.Path(__file__).parent.parent / "autograd") if subset == "pybindings" or not subset: gen_autograd_python( native_functions_path or NATIVE_FUNCTIONS_PATH, + tags_path or TAGS_PATH, autograd_gen_dir, - autograd_dir) + autograd_dir, + ) if 
operator_selector is None: operator_selector = SelectiveBuilder.get_nop_selector() @@ -68,6 +55,7 @@ def generate_code(ninja_global: Optional[str] = None, gen_autograd( native_functions_path or NATIVE_FUNCTIONS_PATH, + tags_path or TAGS_PATH, autograd_gen_dir, autograd_dir, disable_autograd=disable_autograd, @@ -77,18 +65,20 @@ def generate_code(ninja_global: Optional[str] = None, if subset == "python" or not subset: gen_annotated( native_functions_path or NATIVE_FUNCTIONS_PATH, + tags_path or TAGS_PATH, python_install_dir, - autograd_dir) + autograd_dir, + ) def get_selector_from_legacy_operator_selection_list( - selected_op_list_path: str, + selected_op_list_path: str, ) -> Any: - with open(selected_op_list_path, 'r') as f: + with open(selected_op_list_path, "r") as f: # strip out the overload part # It's only for legacy config - do NOT copy this code! selected_op_list = { - opname.split('.', 1)[0] for opname in yaml.load(f, Loader=YamlLoader) + opname.split(".", 1)[0] for opname in yaml.load(f, Loader=YamlLoader) } # Internal build doesn't use this flag any more. Only used by OSS @@ -100,7 +90,8 @@ def get_selector_from_legacy_operator_selection_list( is_root_operator = True is_used_for_training = True - from tools.codegen.selective_build.selector import SelectiveBuilder + from torchgen.selective_build.selector import SelectiveBuilder + selector = SelectiveBuilder.from_legacy_op_registration_allow_list( selected_op_list, is_root_operator, @@ -117,12 +108,14 @@ def get_selector( # cwrap depends on pyyaml, so we can't import it earlier root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.path.insert(0, root) - from tools.codegen.selective_build.selector import SelectiveBuilder + from torchgen.selective_build.selector import SelectiveBuilder - assert not (selected_op_list_path is not None and - operators_yaml_path is not None), \ - ("Expected at most one of selected_op_list_path and " + - "operators_yaml_path to be set.") + assert not ( + selected_op_list_path is not None and operators_yaml_path is not None + ), ( + "Expected at most one of selected_op_list_path and " + + "operators_yaml_path to be set." + ) if selected_op_list_path is None and operators_yaml_path is None: return SelectiveBuilder.get_nop_selector() @@ -133,49 +126,105 @@ def get_selector( def main() -> None: - parser = argparse.ArgumentParser(description='Autogenerate code') - parser.add_argument('--native-functions-path') - parser.add_argument('--nn-path') - parser.add_argument('--ninja-global') - parser.add_argument('--install_dir') + parser = argparse.ArgumentParser(description="Autogenerate code") + parser.add_argument("--native-functions-path") + parser.add_argument("--tags-path") parser.add_argument( - '--subset', - help='Subset of source files to generate. Can be "libtorch" or "pybindings". Generates both when omitted.' + "--gen-dir", + type=pathlib.Path, + default=pathlib.Path("."), + help="Root directory where to install files. Defaults to the current working directory.", ) parser.add_argument( - '--disable-autograd', + "--install_dir", + help=( + "Deprecated. Use --gen-dir instead. The semantics are different, do not change " + "blindly." + ), + ) + parser.add_argument( + "--subset", + help='Subset of source files to generate. Can be "libtorch" or "pybindings". 
Generates both when omitted.', + ) + parser.add_argument( + "--disable-autograd", default=False, - action='store_true', - help='It can skip generating autograd related code when the flag is set', + action="store_true", + help="It can skip generating autograd related code when the flag is set", + ) + parser.add_argument( + "--selected-op-list-path", + help="Path to the YAML file that contains the list of operators to include for custom build.", ) parser.add_argument( - '--selected-op-list-path', - help='Path to the YAML file that contains the list of operators to include for custom build.', + "--operators_yaml_path", + help="Path to the model YAML file that contains the list of operators to include for custom build.", ) parser.add_argument( - '--operators_yaml_path', - help='Path to the model YAML file that contains the list of operators to include for custom build.', + "--force_schema_registration", + action="store_true", + help="force it to generate schema-only registrations for ops that are not" + "listed on --selected-op-list", ) parser.add_argument( - '--force_schema_registration', - action='store_true', - help='force it to generate schema-only registrations for ops that are not' - 'listed on --selected-op-list' + "--gen_lazy_ts_backend", + action="store_true", + help="Enable generation of the torch::lazy TorchScript backend", + ) + parser.add_argument( + "--per_operator_headers", + action="store_true", + help="Build lazy tensor ts backend with per-operator ATen headers, must match how ATen was built", ) options = parser.parse_args() generate_code( - options.ninja_global, - options.nn_path, + options.gen_dir, options.native_functions_path, + options.tags_path, options.install_dir, options.subset, options.disable_autograd, options.force_schema_registration, # options.selected_op_list - operator_selector=get_selector(options.selected_op_list_path, options.operators_yaml_path), + operator_selector=get_selector( + options.selected_op_list_path, options.operators_yaml_path + ), ) + if options.gen_lazy_ts_backend: + aten_path = os.path.dirname(os.path.dirname(options.native_functions_path)) + ts_backend_yaml = os.path.join(aten_path, "native/ts_native_functions.yaml") + ts_native_functions = "torch/csrc/lazy/ts_backend/ts_native_functions.cpp" + ts_node_base = "torch/csrc/lazy/ts_backend/ts_node.h" + install_dir = options.install_dir or os.fspath(options.gen_dir / "torch/csrc") + lazy_install_dir = os.path.join(install_dir, "lazy/generated") + os.makedirs(lazy_install_dir, exist_ok=True) + + assert os.path.isfile( + ts_backend_yaml + ), f"Unable to access ts_backend_yaml: {ts_backend_yaml}" + assert os.path.isfile( + ts_native_functions + ), f"Unable to access {ts_native_functions}" + from torchgen.gen_lazy_tensor import run_gen_lazy_tensor + from torchgen.dest.lazy_ir import GenTSLazyIR + + run_gen_lazy_tensor( + aten_path=aten_path, + source_yaml=ts_backend_yaml, + backend_name="TorchScript", + output_dir=lazy_install_dir, + dry_run=False, + impl_path=ts_native_functions, + node_base="TsNode", + node_base_hdr=ts_node_base, + build_in_tree=True, + lazy_ir_generator=GenTSLazyIR, + per_operator_headers=options.per_operator_headers, + gen_forced_fallback_code=True, + ) + if __name__ == "__main__": main() diff --git a/tools/setup_helpers/numpy_.py b/tools/setup_helpers/numpy_.py index 882de4be6e93..e93fcfd24707 100644 --- a/tools/setup_helpers/numpy_.py +++ b/tools/setup_helpers/numpy_.py @@ -10,7 +10,7 @@ # Set USE_NUMPY to what the user wants, because even if we fail here, cmake # will check 
for the presence of NumPy again (`cmake/Dependencies.cmake`). -USE_NUMPY = not check_negative_env_flag('USE_NUMPY') +USE_NUMPY = not check_negative_env_flag("USE_NUMPY") NUMPY_INCLUDE_DIR = None if USE_NUMPY: diff --git a/tools/shared/cwrap_common.py b/tools/shared/cwrap_common.py index 01ff97aabd9b..42548b9afa11 100644 --- a/tools/shared/cwrap_common.py +++ b/tools/shared/cwrap_common.py @@ -6,17 +6,18 @@ Arg = Dict[str, Any] + def parse_arguments(args: List[Union[str, Arg]]) -> List[Arg]: new_args = [] for arg in args: # Simple arg declaration of form " " if isinstance(arg, str): - t, _, name = arg.partition(' ') - new_args.append({'type': t, 'name': name}) + t, _, name = arg.partition(" ") + new_args.append({"type": t, "name": name}) elif isinstance(arg, dict): - if 'arg' in arg: - arg['type'], _, arg['name'] = arg['arg'].partition(' ') - del arg['arg'] + if "arg" in arg: + arg["type"], _, arg["name"] = arg["arg"].partition(" ") + del arg["arg"] new_args.append(arg) else: raise AssertionError() @@ -27,52 +28,66 @@ def parse_arguments(args: List[Union[str, Arg]]) -> List[Arg]: def set_declaration_defaults(declaration: Declaration) -> None: - if 'schema_string' not in declaration: + if "schema_string" not in declaration: # This happens for legacy TH bindings like # _thnn_conv_depthwise2d_backward - declaration['schema_string'] = '' - declaration.setdefault('arguments', []) - declaration.setdefault('return', 'void') - if 'cname' not in declaration: - declaration['cname'] = declaration['name'] - if 'backends' not in declaration: - declaration['backends'] = ['CPU', 'CUDA'] - assert 'api_name' not in declaration - declaration['api_name'] = declaration['name'] + declaration["schema_string"] = "" + declaration.setdefault("arguments", []) + declaration.setdefault("return", "void") + if "cname" not in declaration: + declaration["cname"] = declaration["name"] + if "backends" not in declaration: + declaration["backends"] = ["CPU", "CUDA"] + assert "api_name" not in declaration + declaration["api_name"] = declaration["name"] # NB: keep this in sync with gen_autograd.py - if declaration.get('overload_name'): - declaration['type_wrapper_name'] = "{}_{}".format( - declaration['name'], declaration['overload_name']) + if declaration.get("overload_name"): + declaration["type_wrapper_name"] = "{}_{}".format( + declaration["name"], declaration["overload_name"] + ) else: - declaration['type_wrapper_name'] = declaration['name'] + declaration["type_wrapper_name"] = declaration["name"] # TODO: Uggggh, parsing the schema string here, really??? 
- declaration['operator_name_with_overload'] = declaration['schema_string'].split('(')[0] - if declaration['schema_string']: - declaration['unqual_schema_string'] = declaration['schema_string'].split('::')[1] - declaration['unqual_operator_name_with_overload'] = declaration['operator_name_with_overload'].split('::')[1] + declaration["operator_name_with_overload"] = declaration["schema_string"].split( + "(" + )[0] + if declaration["schema_string"]: + declaration["unqual_schema_string"] = declaration["schema_string"].split("::")[ + 1 + ] + declaration["unqual_operator_name_with_overload"] = declaration[ + "operator_name_with_overload" + ].split("::")[1] else: - declaration['unqual_schema_string'] = '' - declaration['unqual_operator_name_with_overload'] = '' + declaration["unqual_schema_string"] = "" + declaration["unqual_operator_name_with_overload"] = "" # Simulate multiple dispatch, even if it's not necessary - if 'options' not in declaration: - declaration['options'] = [{ - 'arguments': copy.deepcopy(declaration['arguments']), - 'schema_order_arguments': copy.deepcopy(declaration['schema_order_arguments']), - }] - del declaration['arguments'] - del declaration['schema_order_arguments'] + if "options" not in declaration: + declaration["options"] = [ + { + "arguments": copy.deepcopy(declaration["arguments"]), + "schema_order_arguments": copy.deepcopy( + declaration["schema_order_arguments"] + ), + } + ] + del declaration["arguments"] + del declaration["schema_order_arguments"] # Parse arguments (some of them can be strings) - for option in declaration['options']: - option['arguments'] = parse_arguments(option['arguments']) - option['schema_order_arguments'] = parse_arguments(option['schema_order_arguments']) + for option in declaration["options"]: + option["arguments"] = parse_arguments(option["arguments"]) + option["schema_order_arguments"] = parse_arguments( + option["schema_order_arguments"] + ) # Propagate defaults from declaration to options - for option in declaration['options']: + for option in declaration["options"]: for k, v in declaration.items(): # TODO(zach): why does cwrap not propagate 'name'? I need it # propagaged for ATen - if k != 'options': + if k != "options": option.setdefault(k, v) + # TODO(zach): added option to remove keyword handling for C++ which cannot # support it. 
@@ -86,38 +101,41 @@ def filter_unique_options( remove_self: bool, ) -> List[Option]: def exclude_arg(arg: Arg) -> bool: - return arg['type'] == 'CONSTANT' # type: ignore[no-any-return] + return arg["type"] == "CONSTANT" # type: ignore[no-any-return] def exclude_arg_with_self_check(arg: Arg) -> bool: - return exclude_arg(arg) or (remove_self and arg['name'] == 'self') + return exclude_arg(arg) or (remove_self and arg["name"] == "self") def signature(option: Option, num_kwarg_only: int) -> str: if num_kwarg_only == 0: kwarg_only_count = None else: kwarg_only_count = -num_kwarg_only - arg_signature = '#'.join( - type_to_signature.get(arg['type'], arg['type']) - for arg in option['arguments'][:kwarg_only_count] - if not exclude_arg_with_self_check(arg)) + arg_signature = "#".join( + type_to_signature.get(arg["type"], arg["type"]) + for arg in option["arguments"][:kwarg_only_count] + if not exclude_arg_with_self_check(arg) + ) if kwarg_only_count is None: return arg_signature - kwarg_only_signature = '#'.join( - arg['name'] + '#' + arg['type'] - for arg in option['arguments'][kwarg_only_count:] - if not exclude_arg(arg)) + kwarg_only_signature = "#".join( + arg["name"] + "#" + arg["type"] + for arg in option["arguments"][kwarg_only_count:] + if not exclude_arg(arg) + ) return arg_signature + "#-#" + kwarg_only_signature + seen_signatures = set() unique = [] for option in options: # if only check num_kwarg_only == 0 if allow_kwarg == False - limit = len(option['arguments']) if allow_kwarg else 0 + limit = len(option["arguments"]) if allow_kwarg else 0 for num_kwarg_only in range(0, limit + 1): sig = signature(option, num_kwarg_only) if sig not in seen_signatures: if num_kwarg_only > 0: - for arg in option['arguments'][-num_kwarg_only:]: - arg['kwarg_only'] = True + for arg in option["arguments"][-num_kwarg_only:]: + arg["kwarg_only"] = True unique.append(option) seen_signatures.add(sig) break @@ -126,49 +144,48 @@ def signature(option: Option, num_kwarg_only: int) -> str: def sort_by_number_of_args(declaration: Declaration, reverse: bool = True) -> None: def num_args(option: Option) -> int: - return len(option['arguments']) - declaration['options'].sort(key=num_args, reverse=reverse) + return len(option["arguments"]) + declaration["options"].sort(key=num_args, reverse=reverse) -class Function(object): +class Function(object): def __init__(self, name: str) -> None: self.name = name - self.arguments: List['Argument'] = [] + self.arguments: List["Argument"] = [] - def add_argument(self, arg: 'Argument') -> None: + def add_argument(self, arg: "Argument") -> None: assert isinstance(arg, Argument) self.arguments.append(arg) def __repr__(self) -> str: - return self.name + '(' + ', '.join(a.__repr__() for a in self.arguments) + ')' + return self.name + "(" + ", ".join(a.__repr__() for a in self.arguments) + ")" class Argument(object): - def __init__(self, _type: str, name: str, is_optional: bool): self.type = _type self.name = name self.is_optional = is_optional def __repr__(self) -> str: - return self.type + ' ' + self.name + return self.type + " " + self.name def parse_header(path: str) -> List[Function]: - with open(path, 'r') as f: - lines: Iterable[Any] = f.read().split('\n') + with open(path, "r") as f: + lines: Iterable[Any] = f.read().split("\n") # Remove empty lines and prebackend directives - lines = filter(lambda l: l and not l.startswith('#'), lines) + lines = filter(lambda l: l and not l.startswith("#"), lines) # Remove line comments - lines = (l.partition('//') for l in lines) + lines = 
(l.partition("//") for l in lines) # Select line and comment part lines = ((l[0].strip(), l[2].strip()) for l in lines) # Remove trailing special signs - lines = ((l[0].rstrip(');').rstrip(','), l[1]) for l in lines) + lines = ((l[0].rstrip(");").rstrip(","), l[1]) for l in lines) # Split arguments - lines = ((l[0].split(','), l[1]) for l in lines) + lines = ((l[0].split(","), l[1]) for l in lines) # Flatten lines new_lines = [] for l, c in lines: @@ -182,32 +199,31 @@ def parse_header(path: str) -> List[Function]: lines = filter(lambda l: l[0], lines) generic_functions = [] for l, c in lines: - if l.startswith('TH_API void THNN_'): - fn_name = l[len('TH_API void THNN_'):] - if fn_name[0] == '(' and fn_name[-2] == ')': + if l.startswith("TH_API void THNN_"): + fn_name = l[len("TH_API void THNN_") :] + if fn_name[0] == "(" and fn_name[-2] == ")": fn_name = fn_name[1:-2] else: fn_name = fn_name[:-1] generic_functions.append(Function(fn_name)) - elif l.startswith('TORCH_CUDA_CPP_API void THNN_'): - fn_name = l[len('TORCH_CUDA_CPP_API void THNN_'):] - if fn_name[0] == '(' and fn_name[-2] == ')': + elif l.startswith("TORCH_CUDA_CPP_API void THNN_"): + fn_name = l[len("TORCH_CUDA_CPP_API void THNN_") :] + if fn_name[0] == "(" and fn_name[-2] == ")": fn_name = fn_name[1:-2] else: fn_name = fn_name[:-1] generic_functions.append(Function(fn_name)) - elif l.startswith('TORCH_CUDA_CU_API void THNN_'): - fn_name = l[len('TORCH_CUDA_CU_API void THNN_'):] - if fn_name[0] == '(' and fn_name[-2] == ')': + elif l.startswith("TORCH_CUDA_CU_API void THNN_"): + fn_name = l[len("TORCH_CUDA_CU_API void THNN_") :] + if fn_name[0] == "(" and fn_name[-2] == ")": fn_name = fn_name[1:-2] else: fn_name = fn_name[:-1] generic_functions.append(Function(fn_name)) elif l: t, name = l.split() - if '*' in name: - t = t + '*' + if "*" in name: + t = t + "*" name = name[1:] - generic_functions[-1].add_argument( - Argument(t, name, '[OPTIONAL]' in c)) + generic_functions[-1].add_argument(Argument(t, name, "[OPTIONAL]" in c)) return generic_functions diff --git a/tools/shared/module_loader.py b/tools/shared/module_loader.py index 7482047d4e8d..5e22fb4be4e0 100644 --- a/tools/shared/module_loader.py +++ b/tools/shared/module_loader.py @@ -5,7 +5,9 @@ def import_module(name: str, path: str) -> ModuleType: import importlib.util + spec = importlib.util.spec_from_file_location(name, path) + assert spec is not None module = importlib.util.module_from_spec(spec) cast(Loader, spec.loader).exec_module(module) return module diff --git a/tools/stats/export_slow_tests.py b/tools/stats/export_slow_tests.py index b9d71cfb6cb7..13afbf984a23 100644 --- a/tools/stats/export_slow_tests.py +++ b/tools/stats/export_slow_tests.py @@ -5,53 +5,74 @@ import os import statistics from collections import defaultdict -from tools.stats.s3_stat_parser import get_previous_reports_for_branch, Report, Version2Report +from tools.stats.s3_stat_parser import ( + get_previous_reports_for_branch, + Report, + Version2Report, +) from typing import cast, DefaultDict, Dict, List, Any from urllib.request import urlopen -SLOW_TESTS_FILE = '.pytorch-slow-tests.json' +SLOW_TESTS_FILE = ".pytorch-slow-tests.json" SLOW_TEST_CASE_THRESHOLD_SEC = 60.0 RELATIVE_DIFFERENCE_THRESHOLD = 0.1 +IGNORED_JOBS = ["asan", "periodic"] + def get_test_case_times() -> Dict[str, float]: - reports: List[Report] = get_previous_reports_for_branch('origin/viable/strict', "") + reports: List[Report] = get_previous_reports_for_branch("origin/viable/strict", "") # an entry will be like 
("test_doc_examples (__main__.TestTypeHints)" -> [values])) test_names_to_times: DefaultDict[str, List[float]] = defaultdict(list) for report in reports: - if report.get('format_version', 1) != 2: # type: ignore[misc] + if report.get("format_version", 1) != 2: # type: ignore[misc] raise RuntimeError("S3 format currently handled is version 2 only") v2report = cast(Version2Report, report) - for test_file in v2report['files'].values(): - for suitename, test_suite in test_file['suites'].items(): - for casename, test_case in test_suite['cases'].items(): + + if any(job_name in str(report["build_job"]) for job_name in IGNORED_JOBS): + continue + + for test_file in v2report["files"].values(): + for suitename, test_suite in test_file["suites"].items(): + for casename, test_case in test_suite["cases"].items(): # The below attaches a __main__ as that matches the format of test.__class__ in # common_utils.py (where this data will be used), and also matches what the output # of a running test would look like. - name = f'{casename} (__main__.{suitename})' - succeeded: bool = test_case['status'] is None + name = f"{casename} (__main__.{suitename})" + succeeded: bool = test_case["status"] is None if succeeded: - test_names_to_times[name].append(test_case['seconds']) - return {test_case: statistics.mean(times) for test_case, times in test_names_to_times.items()} + test_names_to_times[name].append(test_case["seconds"]) + return { + test_case: statistics.mean(times) + for test_case, times in test_names_to_times.items() + } def filter_slow_tests(test_cases_dict: Dict[str, float]) -> Dict[str, float]: - return {test_case: time for test_case, time in test_cases_dict.items() if time >= SLOW_TEST_CASE_THRESHOLD_SEC} + return { + test_case: time + for test_case, time in test_cases_dict.items() + if time >= SLOW_TEST_CASE_THRESHOLD_SEC + } def get_test_infra_slow_tests() -> Dict[str, float]: url = "https://raw.githubusercontent.com/pytorch/test-infra/generated-stats/stats/slow-tests.json" - contents = urlopen(url, timeout=1).read().decode('utf-8') + contents = urlopen(url, timeout=1).read().decode("utf-8") return cast(Dict[str, float], json.loads(contents)) -def too_similar(calculated_times: Dict[str, float], other_times: Dict[str, float], threshold: float) -> bool: +def too_similar( + calculated_times: Dict[str, float], other_times: Dict[str, float], threshold: float +) -> bool: # check that their keys are the same if calculated_times.keys() != other_times.keys(): return False for test_case, test_time in calculated_times.items(): other_test_time = other_times[test_case] - relative_difference = abs((other_test_time - test_time) / max(other_test_time, test_time)) + relative_difference = abs( + (other_test_time - test_time) / max(other_test_time, test_time) + ) if relative_difference > threshold: return False return True @@ -60,38 +81,43 @@ def too_similar(calculated_times: Dict[str, float], other_times: Dict[str, float def export_slow_tests(options: Any) -> None: filename = options.filename if os.path.exists(filename): - print(f'Overwriting existent file: {filename}') - with open(filename, 'w+') as file: + print(f"Overwriting existent file: {filename}") + with open(filename, "w+") as file: slow_test_times: Dict[str, float] = filter_slow_tests(get_test_case_times()) if options.ignore_small_diffs: test_infra_slow_tests_dict = get_test_infra_slow_tests() - if too_similar(slow_test_times, test_infra_slow_tests_dict, options.ignore_small_diffs): + if too_similar( + slow_test_times, test_infra_slow_tests_dict, 
options.ignore_small_diffs + ): slow_test_times = test_infra_slow_tests_dict - json.dump(slow_test_times, file, indent=' ', separators=(',', ': '), sort_keys=True) - file.write('\n') + json.dump( + slow_test_times, file, indent=" ", separators=(",", ": "), sort_keys=True + ) + file.write("\n") def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( - description='Export a JSON of slow test cases in PyTorch unit test suite') + description="Export a JSON of slow test cases in PyTorch unit test suite" + ) parser.add_argument( - '-f', - '--filename', - nargs='?', + "-f", + "--filename", + nargs="?", type=str, default=SLOW_TESTS_FILE, const=SLOW_TESTS_FILE, - help='Specify a file path to dump slow test times from previous S3 stats. Default file path: .pytorch-slow-tests.json', + help="Specify a file path to dump slow test times from previous S3 stats. Default file path: .pytorch-slow-tests.json", ) parser.add_argument( - '--ignore-small-diffs', - nargs='?', + "--ignore-small-diffs", + nargs="?", type=float, const=RELATIVE_DIFFERENCE_THRESHOLD, - help='Compares generated results with stats/slow-tests.json in pytorch/test-infra. If the relative differences ' - 'between test times for each test are smaller than the threshold and the set of test cases have not ' - 'changed, we will export the stats already in stats/slow-tests.json. Else, we will export the calculated ' - 'results. The default threshold is 10%.', + help="Compares generated results with stats/slow-tests.json in pytorch/test-infra. If the relative differences " + "between test times for each test are smaller than the threshold and the set of test cases have not " + "changed, we will export the stats already in stats/slow-tests.json. Else, we will export the calculated " + "results. The default threshold is 10%.", ) return parser.parse_args() @@ -101,5 +127,5 @@ def main() -> None: export_slow_tests(options) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/tools/stats/import_test_stats.py b/tools/stats/import_test_stats.py index f6250a182bef..7249c5fccb65 100644 --- a/tools/stats/import_test_stats.py +++ b/tools/stats/import_test_stats.py @@ -8,30 +8,34 @@ from typing import Any, Callable, Dict, List, Optional, cast from urllib.request import urlopen -# PYTORCH_IGNORE_DISABLED_ISSUES should only be set during CI (along with IN_CI) as a -# comma-separated list of issue numbers. The intent is to re-enable any disabled tests -# associated with the issues in this list. -# -# There is normally no reason to use this locally as the disabled tests list should not -# affect your local development and every test should be enabled. If for whatever reason -# you would like to use this during local development, please note the following caveat: -# -# Whenever you set OR reset PYTORCH_IGNORE_DISABLED_ISSUES, you should delete the existing -# .pytorch-disabled-tests.json and redownload/parse the file for your change to apply, as -# PYTORCH_IGNORE_DISABLED_ISSUES is used during the parsing stage. To download the files, -# run test/run_test.py with IN_CI=1. 
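
For reference, a minimal sketch of the relative-difference check that too_similar() above performs when --ignore-small-diffs is used; the test names and timing values below are invented for illustration, and the 0.1 threshold mirrors RELATIVE_DIFFERENCE_THRESHOLD.

def relative_difference(a: float, b: float) -> float:
    # mirrors the abs((other - this) / max(other, this)) expression in too_similar()
    return abs((a - b) / max(a, b))

# Hypothetical freshly calculated times vs. hypothetical published slow-tests.json contents.
calculated = {"test_x (__main__.TestFoo)": 61.0, "test_y (__main__.TestBar)": 75.0}
published = {"test_x (__main__.TestFoo)": 60.0, "test_y (__main__.TestBar)": 90.0}

same_keys = calculated.keys() == published.keys()
all_close = same_keys and all(
    relative_difference(calculated[k], published[k]) <= 0.1 for k in calculated
)
# test_y differs by roughly 16.7% (above the 10% threshold), so the freshly
# calculated times would be exported instead of the already-published ones.
print(all_close)  # False
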
-IGNORE_DISABLED_ISSUES: List[str] = os.getenv('PYTORCH_IGNORE_DISABLED_ISSUES', '').split(',') - -SLOW_TESTS_FILE = '.pytorch-slow-tests.json' -DISABLED_TESTS_FILE = '.pytorch-disabled-tests.json' + +def get_disabled_issues() -> List[str]: + pr_body = os.getenv("PR_BODY", "") + commit_messages = os.getenv("COMMIT_MESSAGES", "") + # The below regex is meant to match all *case-insensitive* keywords that + # GitHub has delineated would link PRs to issues, more details here: + # https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue. + # E.g., "Close #62851", "fixES #62851" and "RESOLVED #62851" would all match, but not + # "closes #62851" --> extra space, "fixing #62851" --> not a keyword, nor "fix 62851" --> no # + regex = "(?i)(Close(d|s)?|Resolve(d|s)?|Fix(ed|es)?) (#|https://github.com/pytorch/pytorch/issues/)([0-9]+)" + issue_numbers = [x[5] for x in re.findall(regex, pr_body + commit_messages)] + print("Ignoring disabled issues: ", issue_numbers) + return issue_numbers + + +IGNORE_DISABLED_ISSUES: List[str] = get_disabled_issues() + +SLOW_TESTS_FILE = ".pytorch-slow-tests.json" +DISABLED_TESTS_FILE = ".pytorch-disabled-tests.json" FILE_CACHE_LIFESPAN_SECONDS = datetime.timedelta(hours=3).seconds + def fetch_and_cache( dirpath: str, name: str, url: str, - process_fn: Callable[[Dict[str, Any]], Dict[str, Any]] + process_fn: Callable[[Dict[str, Any]], Dict[str, Any]], ) -> Dict[str, Any]: """ This fetch and cache utils allows sharing between different process. @@ -54,18 +58,20 @@ def is_cached_file_valid() -> bool: for _ in range(3): try: - contents = urlopen(url, timeout=5).read().decode('utf-8') + contents = urlopen(url, timeout=5).read().decode("utf-8") processed_contents = process_fn(json.loads(contents)) with open(path, "w") as f: f.write(json.dumps(processed_contents)) return processed_contents except Exception as e: - print(f'Could not download {url} because: {e}.') - print(f'All retries exhausted, downloading {url} failed.') + print(f"Could not download {url} because: {e}.") + print(f"All retries exhausted, downloading {url} failed.") return {} -def get_slow_tests(dirpath: str, filename: str = SLOW_TESTS_FILE) -> Optional[Dict[str, float]]: +def get_slow_tests( + dirpath: str, filename: str = SLOW_TESTS_FILE +) -> Optional[Dict[str, float]]: url = "https://raw.githubusercontent.com/pytorch/test-infra/generated-stats/stats/slow-tests.json" try: return fetch_and_cache(dirpath, filename, url, lambda x: x) @@ -74,28 +80,38 @@ def get_slow_tests(dirpath: str, filename: str = SLOW_TESTS_FILE) -> Optional[Di return {} -def get_disabled_tests(dirpath: str, filename: str = DISABLED_TESTS_FILE) -> Optional[Dict[str, Any]]: +def get_disabled_tests( + dirpath: str, filename: str = DISABLED_TESTS_FILE +) -> Optional[Dict[str, Any]]: def process_disabled_test(the_response: Dict[str, Any]) -> Dict[str, Any]: disabled_test_from_issues = dict() - for item in the_response['items']: - title = item['title'] - key = 'DISABLED ' - issue_url = item['html_url'] - issue_number = issue_url.split('/')[-1] + for item in the_response["items"]: + title = item["title"] + key = "DISABLED " + issue_url = item["html_url"] + issue_number = issue_url.split("/")[-1] if title.startswith(key) and issue_number not in IGNORE_DISABLED_ISSUES: - test_name = title[len(key):].strip() - body = item['body'] + test_name = title[len(key) :].strip() + body = item["body"] platforms_to_skip = [] - key = 'platforms:' - for line in body.splitlines(): - line = line.lower() - if 
line.startswith(key): - pattern = re.compile(r"^\s+|\s*,\s*|\s+$") - platforms_to_skip.extend([x for x in pattern.split(line[len(key):]) if x]) - disabled_test_from_issues[test_name] = (item['html_url'], platforms_to_skip) + key = "platforms:" + # When the issue has no body, it is assumed that all platforms should skip the test + if body is not None: + for line in body.splitlines(): + line = line.lower() + if line.startswith(key): + pattern = re.compile(r"^\s+|\s*,\s*|\s+$") + platforms_to_skip.extend( + [x for x in pattern.split(line[len(key) :]) if x] + ) + disabled_test_from_issues[test_name] = ( + item["html_url"], + platforms_to_skip, + ) return disabled_test_from_issues + try: - url = 'https://raw.githubusercontent.com/pytorch/test-infra/generated-stats/stats/disabled-tests.json' + url = "https://raw.githubusercontent.com/pytorch/test-infra/generated-stats/stats/disabled-tests.json" return fetch_and_cache(dirpath, filename, url, process_disabled_test) except Exception: print("Couldn't download test skip set, leaving all tests enabled...") diff --git a/tools/stats/print_test_stats.py b/tools/stats/print_test_stats.py index 0555945e4786..44cd4e43dbb4 100755 --- a/tools/stats/print_test_stats.py +++ b/tools/stats/print_test_stats.py @@ -12,15 +12,36 @@ import time from collections import defaultdict from pathlib import Path -from typing import (Any, DefaultDict, Dict, Iterable, Iterator, List, Optional, - Set, Tuple, cast) +from typing import ( + Any, + DefaultDict, + Dict, + Iterable, + Iterator, + List, + Optional, + Set, + Tuple, + cast, +) from xml.dom import minidom from typing_extensions import TypedDict -from tools.stats.s3_stat_parser import (newify_case, get_S3_object_from_bucket, get_test_stats_summaries_for_job, - Report, Status, Commit, HAVE_BOTO3, Version2Case, VersionedReport, - Version1Report, Version2Report, ReportMetaMeta) -from tools.stats.scribe import send_to_scribe, rds_write, register_rds_schema, schema_from_sample +from tools.stats.s3_stat_parser import ( + newify_case, + get_S3_object_from_bucket, + get_test_stats_summaries_for_job, + Report, + Status, + Commit, + HAVE_BOTO3, + Version2Case, + VersionedReport, + Version1Report, + Version2Report, + ReportMetaMeta, +) +from tools.stats.scribe import send_to_scribe SimplerSuite = Dict[str, Version2Case] @@ -61,12 +82,12 @@ class SuiteDiff(TypedDict): # share a name (for version 2 reports) or using a list of cases rather # than a dict. def simplify(report: Report) -> SimplerReport: - if 'format_version' not in report: # version 1 implicitly + if "format_version" not in report: # version 1 implicitly v1report = cast(Version1Report, report) return { # we just don't have test filename information sadly, so we # just make one fake filename that is the empty string - '': { + "": { suite_name: { # This clobbers some cases that have duplicate names # because in version 1, we would merge together all @@ -80,35 +101,41 @@ def simplify(report: Report) -> SimplerReport: # we're only uploading in the new format (where # everything is also keyed by filename) going # forward, it shouldn't matter too much. 
- case['name']: newify_case(case) - for case in suite['cases'] + case["name"]: newify_case(case) + for case in suite["cases"] } - for suite_name, suite in v1report['suites'].items() + for suite_name, suite in v1report["suites"].items() } } else: v_report = cast(VersionedReport, report) - version = v_report['format_version'] + version = v_report["format_version"] if version == 2: v2report = cast(Version2Report, v_report) return { filename: { - suite_name: suite['cases'] - for suite_name, suite in file_data['suites'].items() + suite_name: suite["cases"] + for suite_name, suite in file_data["suites"].items() } - for filename, file_data in v2report['files'].items() + for filename, file_data in v2report["files"].items() } else: - raise RuntimeError(f'Unknown format version: {version}') + raise RuntimeError(f"Unknown format version: {version}") def plural(n: int) -> str: - return '' if n == 1 else 's' + return "" if n == 1 else "s" def get_base_commit(sha1: str) -> str: + default_branch = os.environ.get("GIT_DEFAULT_BRANCH") + # capture None and "" cases + if not default_branch: + default_branch = "master" + + default_remote = f"origin/{default_branch}" return subprocess.check_output( - ["git", "merge-base", sha1, "origin/master"], + ["git", "merge-base", sha1, default_remote], encoding="ascii", ).strip() @@ -118,28 +145,28 @@ def display_stat( format: Tuple[Tuple[int, int], Tuple[int, int]], ) -> str: spread_len = format[1][0] + 1 + format[1][1] - spread = x['spread'] + spread = x["spread"] if spread is not None: - spread_str = f' ± {spread:{spread_len}.{format[1][1]}f}s' + spread_str = f" ± {spread:{spread_len}.{format[1][1]}f}s" else: - spread_str = ' ' * (3 + spread_len + 1) + spread_str = " " * (3 + spread_len + 1) mean_len = format[0][0] + 1 + format[0][1] return f'{x["center"]:{mean_len}.{format[0][1]}f}s{spread_str}' def list_stat(l: List[float]) -> Stat: return { - 'center': statistics.mean(l), - 'spread': statistics.stdev(l) if len(l) > 1 else None + "center": statistics.mean(l), + "spread": statistics.stdev(l) if len(l) > 1 else None, } def zero_stat() -> Stat: - return {'center': 0, 'spread': None} + return {"center": 0, "spread": None} def recenter(was: Stat, now: float) -> Stat: - return {'center': now - was['center'], 'spread': was['spread']} + return {"center": now - was["center"], "spread": was["spread"]} def sum_normals(stats: Iterable[Stat]) -> Stat: @@ -151,29 +178,29 @@ def sum_normals(stats: Iterable[Stat]) -> Stat: """ l = list(stats) spread: Optional[float] - if any(stat['spread'] is not None for stat in l): - spread = math.sqrt(sum((stat['spread'] or 0)**2 for stat in l)) + if any(stat["spread"] is not None for stat in l): + spread = math.sqrt(sum((stat["spread"] or 0) ** 2 for stat in l)) else: spread = None return { - 'center': sum(stat['center'] for stat in l), - 'spread': spread, + "center": sum(stat["center"] for stat in l), + "spread": spread, } def format_seconds(seconds: List[float]) -> str: if len(seconds) > 0: x = list_stat(seconds) - return f'total time {display_stat(x, ((5, 2), (4, 2)))}'.strip() - return '' + return f"total time {display_stat(x, ((5, 2), (4, 2)))}".strip() + return "" def show_ancestors(num_commits: int) -> str: - return f' | : ({num_commits} commit{plural(num_commits)})' + return f" | : ({num_commits} commit{plural(num_commits)})" def unlines(lines: List[str]) -> str: - return ''.join(f'{line}\n' for line in lines) + return "".join(f"{line}\n" for line in lines) def matching_test_times( @@ -193,8 +220,8 @@ def matching_test_times( if suite: 
case = suite.get(case_name) if case: - t = case['seconds'] - s = case['status'] + t = case["seconds"] + s = case["status"] if s == status: times.append(t) return times @@ -206,7 +233,7 @@ def analyze( base_reports: Dict[Commit, List[SimplerReport]], ) -> List[SuiteDiff]: nonempty_shas = [sha for sha, reports in base_reports.items() if reports] - # most recent master ancestor with at least one S3 report, + # most recent main ancestor with at least one S3 report, # or empty list if there are none (will show all tests as added) base_report = base_reports[nonempty_shas[0]] if nonempty_shas else [] @@ -226,37 +253,49 @@ def analyze( for filename, suite_name in sorted(all_suites): case_diffs: List[CaseDiff] = [] head_suite = head_report.get(filename, {}).get(suite_name) - base_cases: Dict[str, Status] = dict(sorted(set.intersection(*[ - { - (n, case['status']) - for n, case - in report.get(filename, {}).get(suite_name, {}).items() - } - for report in base_report - ] or [set()]))) + base_cases: Dict[str, Status] = dict( + sorted( + set.intersection( + *[ + { + (n, case["status"]) + for n, case in report.get(filename, {}) + .get(suite_name, {}) + .items() + } + for report in base_report + ] + or [set()] + ) + ) + ) case_stats: Dict[str, Stat] = {} if head_suite: - now = sum(case['seconds'] for case in head_suite.values()) + now = sum(case["seconds"] for case in head_suite.values()) if any( filename in report and suite_name in report[filename] for report in base_report ): removed_cases: List[CaseDiff] = [] for case_name, case_status in base_cases.items(): - case_stats[case_name] = list_stat(matching_test_times( - base_reports=base_reports, - filename=filename, - suite_name=suite_name, - case_name=case_name, - status=case_status, - )) + case_stats[case_name] = list_stat( + matching_test_times( + base_reports=base_reports, + filename=filename, + suite_name=suite_name, + case_name=case_name, + status=case_status, + ) + ) if case_name not in head_suite: - removed_cases.append({ - 'margin': '-', - 'name': case_name, - 'was': (case_stats[case_name], case_status), - 'now': None, - }) + removed_cases.append( + { + "margin": "-", + "name": case_name, + "was": (case_stats[case_name], case_status), + "now": None, + } + ) modified_cases: List[CaseDiff] = [] added_cases: List[CaseDiff] = [] for head_case_name in sorted(head_suite): @@ -264,70 +303,86 @@ def analyze( if head_case_name in base_cases: stat = case_stats[head_case_name] base_status = base_cases[head_case_name] - if head_case['status'] != base_status: - modified_cases.append({ - 'margin': '!', - 'name': head_case_name, - 'was': (stat, base_status), - 'now': head_case, - }) + if head_case["status"] != base_status: + modified_cases.append( + { + "margin": "!", + "name": head_case_name, + "was": (stat, base_status), + "now": head_case, + } + ) else: - added_cases.append({ - 'margin': '+', - 'name': head_case_name, - 'was': None, - 'now': head_case, - }) + added_cases.append( + { + "margin": "+", + "name": head_case_name, + "was": None, + "now": head_case, + } + ) # there might be a bug calculating this stdev, not sure was = sum_normals(case_stats.values()) case_diffs = removed_cases + modified_cases + added_cases if case_diffs: - modified_suites.append({ - 'margin': ' ', - 'name': suite_name, - 'was': was, - 'now': now, - 'cases': case_diffs, - }) + modified_suites.append( + { + "margin": " ", + "name": suite_name, + "was": was, + "now": now, + "cases": case_diffs, + } + ) else: for head_case_name in sorted(head_suite): head_case = 
head_suite[head_case_name] - case_diffs.append({ - 'margin': ' ', - 'name': head_case_name, - 'was': None, - 'now': head_case, - }) - added_suites.append({ - 'margin': '+', - 'name': suite_name, - 'was': None, - 'now': now, - 'cases': case_diffs, - }) + case_diffs.append( + { + "margin": " ", + "name": head_case_name, + "was": None, + "now": head_case, + } + ) + added_suites.append( + { + "margin": "+", + "name": suite_name, + "was": None, + "now": now, + "cases": case_diffs, + } + ) else: for case_name, case_status in base_cases.items(): - case_stats[case_name] = list_stat(matching_test_times( - base_reports=base_reports, - filename=filename, - suite_name=suite_name, - case_name=case_name, - status=case_status, - )) - case_diffs.append({ - 'margin': ' ', - 'name': case_name, - 'was': (case_stats[case_name], case_status), - 'now': None, - }) - removed_suites.append({ - 'margin': '-', - 'name': suite_name, - # there might be a bug calculating this stdev, not sure - 'was': sum_normals(case_stats.values()), - 'now': None, - 'cases': case_diffs, - }) + case_stats[case_name] = list_stat( + matching_test_times( + base_reports=base_reports, + filename=filename, + suite_name=suite_name, + case_name=case_name, + status=case_status, + ) + ) + case_diffs.append( + { + "margin": " ", + "name": case_name, + "was": (case_stats[case_name], case_status), + "now": None, + } + ) + removed_suites.append( + { + "margin": "-", + "name": suite_name, + # there might be a bug calculating this stdev, not sure + "was": sum_normals(case_stats.values()), + "now": None, + "cases": case_diffs, + } + ) return removed_suites + modified_suites + added_suites @@ -337,24 +392,24 @@ def case_diff_lines(diff: CaseDiff) -> List[str]: case_fmt = ((3, 3), (2, 3)) - was = diff['was'] + was = diff["was"] if was: - was_line = f' # was {display_stat(was[0], case_fmt)}' + was_line = f" # was {display_stat(was[0], case_fmt)}" was_status = was[1] if was_status: - was_line += f' ({was_status})' + was_line += f" ({was_status})" lines.append(was_line) - now = diff['now'] + now = diff["now"] if now: - now_stat: Stat = {'center': now['seconds'], 'spread': None} - now_line = f' # now {display_stat(now_stat, case_fmt)}' - now_status = now['status'] + now_stat: Stat = {"center": now["seconds"], "spread": None} + now_line = f" # now {display_stat(now_stat, case_fmt)}" + now_status = now["status"] if now_status: - now_line += f' ({now_status})' + now_line += f" ({now_status})" lines.append(now_line) - return [''] + [f'{diff["margin"]} {l}' for l in lines] + return [""] + [f'{diff["margin"]} {l}' for l in lines] def display_suite_diff(diff: SuiteDiff) -> str: @@ -362,23 +417,23 @@ def display_suite_diff(diff: SuiteDiff) -> str: suite_fmt = ((4, 2), (3, 2)) - was = diff['was'] + was = diff["was"] if was: - lines.append(f' # was {display_stat(was, suite_fmt)}') + lines.append(f" # was {display_stat(was, suite_fmt)}") - now = diff['now'] + now = diff["now"] if now is not None: - now_stat: Stat = {'center': now, 'spread': None} - lines.append(f' # now {display_stat(now_stat, suite_fmt)}') + now_stat: Stat = {"center": now, "spread": None} + lines.append(f" # now {display_stat(now_stat, suite_fmt)}") - for case_diff in diff['cases']: - lines.extend([f' {l}' for l in case_diff_lines(case_diff)]) + for case_diff in diff["cases"]: + lines.extend([f" {l}" for l in case_diff_lines(case_diff)]) - return unlines([''] + [f'{diff["margin"]} {l}'.rstrip() for l in lines] + ['']) + return unlines([""] + [f'{diff["margin"]} {l}'.rstrip() for l in lines] + [""]) 
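
As a small illustration of the Stat helpers above, using invented per-run timings: list_stat() reduces a list of durations to a center/spread pair, and sum_normals() adds the centers while combining the spreads in quadrature, which is how the suite-level "was" totals in analyze() are produced.

import math
import statistics
from typing import List

# Invented per-run durations (seconds) for two hypothetical test cases.
runs_a: List[float] = [1.0, 1.2, 1.4]
runs_b: List[float] = [2.0, 2.1]

def to_stat(times: List[float]) -> dict:
    # same shape as list_stat(): mean as center, sample stdev as spread
    return {
        "center": statistics.mean(times),
        "spread": statistics.stdev(times) if len(times) > 1 else None,
    }

stat_a, stat_b = to_stat(runs_a), to_stat(runs_b)

# Combining as sum_normals() does: centers add, spreads add in quadrature.
combined = {
    "center": stat_a["center"] + stat_b["center"],
    "spread": math.sqrt((stat_a["spread"] or 0) ** 2 + (stat_b["spread"] or 0) ** 2),
}
print(combined)  # roughly {'center': 3.25, 'spread': 0.21}
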
def anomalies(diffs: List[SuiteDiff]) -> str: - return ''.join(map(display_suite_diff, diffs)) + return "".join(map(display_suite_diff, diffs)) def graph( @@ -391,89 +446,91 @@ def graph( other_ancestors: int = 0, ) -> str: lines = [ - 'Commit graph (base is most recent master ancestor with at least one S3 report):', - '', - ' : (master)', - ' |', + "Commit graph (base is most recent master ancestor with at least one S3 report):", + "", + " : (master)", + " |", ] - head_time_str = f' {format_seconds([head_seconds])}' + head_time_str = f" {format_seconds([head_seconds])}" if on_master: - lines.append(f' * {head_sha[:10]} (HEAD) {head_time_str}') + lines.append(f" * {head_sha[:10]} (HEAD) {head_time_str}") else: - lines.append(f' | * {head_sha[:10]} (HEAD) {head_time_str}') + lines.append(f" | * {head_sha[:10]} (HEAD) {head_time_str}") if ancestry_path > 0: lines += [ - ' | |', + " | |", show_ancestors(ancestry_path), ] if other_ancestors > 0: lines += [ - ' |/|', + " |/|", show_ancestors(other_ancestors), - ' |', + " |", ] else: - lines.append(' |/') + lines.append(" |/") is_first = True for sha, seconds in base_seconds.items(): num_runs = len(seconds) prefix = str(num_runs).rjust(3) - base = '(base)' if is_first and num_runs > 0 else ' ' + base = "(base)" if is_first and num_runs > 0 else " " if num_runs > 0: is_first = False t = format_seconds(seconds) p = plural(num_runs) if t: - p = f'{p}, '.ljust(3) - lines.append(f' * {sha[:10]} {base} {prefix} report{p}{t}') + p = f"{p}, ".ljust(3) + lines.append(f" * {sha[:10]} {base} {prefix} report{p}{t}") - lines.extend([' |', ' :']) + lines.extend([" |", " :"]) return unlines(lines) def case_delta(case: CaseDiff) -> Stat: - was = case['was'] - now = case['now'] + was = case["was"] + now = case["now"] return recenter( was[0] if was else zero_stat(), - now['seconds'] if now else 0, + now["seconds"] if now else 0, ) def display_final_stat(stat: Stat) -> str: - center = stat['center'] - spread = stat['spread'] + center = stat["center"] + spread = stat["spread"] displayed = display_stat( - {'center': abs(center), 'spread': spread}, + {"center": abs(center), "spread": spread}, ((4, 2), (3, 2)), ) if center < 0: - sign = '-' + sign = "-" elif center > 0: - sign = '+' + sign = "+" else: - sign = ' ' - return f'{sign}{displayed}'.rstrip() + sign = " " + return f"{sign}{displayed}".rstrip() def summary_line(message: str, d: DefaultDict[str, List[CaseDiff]]) -> str: all_cases = [c for cs in d.values() for c in cs] tests = len(all_cases) suites = len(d) - sp = f'{plural(suites)})'.ljust(2) - tp = f'{plural(tests)},'.ljust(2) + sp = f"{plural(suites)})".ljust(2) + tp = f"{plural(tests)},".ljust(2) # there might be a bug calculating this stdev, not sure stat = sum_normals(case_delta(c) for c in all_cases) - return ''.join([ - f'{message} (across {suites:>4} suite{sp}', - f'{tests:>6} test{tp}', - f' totaling {display_final_stat(stat)}', - ]) + return "".join( + [ + f"{message} (across {suites:>4} suite{sp}", + f"{tests:>6} test{tp}", + f" totaling {display_final_stat(stat)}", + ] + ) def summary(analysis: List[SuiteDiff]) -> str: @@ -483,17 +540,17 @@ def summary(analysis: List[SuiteDiff]) -> str: for diff in analysis: # the use of 'margin' here is not the most elegant - name = diff['name'] - margin = diff['margin'] - cases = diff['cases'] - if margin == '-': + name = diff["name"] + margin = diff["margin"] + cases = diff["cases"] + if margin == "-": removed_tests[name] += cases - elif margin == '+': + elif margin == "+": added_tests[name] += cases else: - 
removed = list(filter(lambda c: c['margin'] == '-', cases)) - added = list(filter(lambda c: c['margin'] == '+', cases)) - modified = list(filter(lambda c: c['margin'] == '!', cases)) + removed = list(filter(lambda c: c["margin"] == "-", cases)) + added = list(filter(lambda c: c["margin"] == "+", cases)) + modified = list(filter(lambda c: c["margin"] == "!", cases)) if removed: removed_tests[name] += removed if added: @@ -501,11 +558,13 @@ def summary(analysis: List[SuiteDiff]) -> str: if modified: modified_tests[name] += modified - return unlines([ - summary_line('Removed ', removed_tests), - summary_line('Modified', modified_tests), - summary_line('Added ', added_tests), - ]) + return unlines( + [ + summary_line("Removed ", removed_tests), + summary_line("Modified", modified_tests), + summary_line("Added ", added_tests), + ] + ) def regression_info( @@ -525,7 +584,7 @@ def regression_info( and its test times. Since Python dicts maintain insertion order (guaranteed as part of the language spec since 3.7), the base_reports argument must list the head's several most recent - master commits, from newest to oldest (so the merge-base is + main commits, from newest to oldest (so the merge-base is list(base_reports)[0]). """ simpler_head = simplify(head_report) @@ -537,40 +596,49 @@ def regression_info( base_reports=simpler_base, ) - return '\n'.join([ - unlines([ - '----- Historic stats comparison result ------', - '', - f' job: {job_name}', - f' commit: {head_sha}', - ]), - - # don't print anomalies, because sometimes due to sharding, the - # output from this would be very long and obscure better signal - - # anomalies(analysis), - - graph( - head_sha=head_sha, - head_seconds=head_report['total_seconds'], - base_seconds={ - c: [r['total_seconds'] for r in rs] - for c, rs in base_reports.items() - }, - on_master=on_master, - ancestry_path=ancestry_path, - other_ancestors=other_ancestors, - ), - summary(analysis), - ]) + return "\n".join( + [ + unlines( + [ + "----- Historic stats comparison result ------", + "", + f" job: {job_name}", + f" commit: {head_sha}", + ] + ), + # don't print anomalies, because sometimes due to sharding, the + # output from this would be very long and obscure better signal + # anomalies(analysis), + graph( + head_sha=head_sha, + head_seconds=head_report["total_seconds"], + base_seconds={ + c: [r["total_seconds"] for r in rs] + for c, rs in base_reports.items() + }, + on_master=on_master, + ancestry_path=ancestry_path, + other_ancestors=other_ancestors, + ), + summary(analysis), + ] + ) class TestCase: def __init__(self, dom: Any) -> None: - self.class_name = str(dom.attributes['classname'].value) - self.name = str(dom.attributes['name'].value) - self.time = float(dom.attributes['time'].value) - error_elements = dom.getElementsByTagName('error') + self.class_name = str(dom.attributes["classname"].value) + self.name = str(dom.attributes["name"].value) + self.time = float(dom.attributes["time"].value) + # The following attribute is currently ONLY used in process_intentional_test_runs for validation + # reasons. The test filename that populates TestFile is calculated and passed down through the test report path. 
+ # The reason we don't just use this attribute is because it doesn't exist for cpp tests, e.g., in test_libtorch + self.file = ( + str(dom.attributes["file"].value) + if dom.hasAttribute("file") + else "N/A - probably a cpp test" + ) + error_elements = dom.getElementsByTagName("error") # DISCLAIMER: unexpected successes and expected failures are currently not reported in assemble_s3_object self.expected_failure = False self.skipped = False @@ -579,25 +647,32 @@ def __init__(self, dom: Any) -> None: if len(error_elements) > 0: # We are only expecting 1 element here error_element = error_elements[0] - self.unexpected_success = (error_element.hasAttribute('type') and - error_element.attributes['type'].value == 'UnexpectedSuccess') + self.unexpected_success = ( + error_element.hasAttribute("type") + and error_element.attributes["type"].value == "UnexpectedSuccess" + ) self.errored = not self.unexpected_success - skipped_elements = dom.getElementsByTagName('skipped') + skipped_elements = dom.getElementsByTagName("skipped") if len(skipped_elements) > 0: # We are only expecting 1 element here skipped_element = skipped_elements[0] - self.expected_failure = (skipped_element.hasAttribute('type') and - skipped_element.attributes['type'].value == 'XFAIL') + self.expected_failure = ( + skipped_element.hasAttribute("type") + and skipped_element.attributes["type"].value == "XFAIL" + ) self.skipped = not self.expected_failure - self.failed = len(dom.getElementsByTagName('failure')) > 0 + self.failed = len(dom.getElementsByTagName("failure")) > 0 def __repr__(self) -> str: return self.__str__() def __str__(self) -> str: - return f'[TestCase name: {self.name} | class_name: {self.class_name} | time: {self.time} | ' \ - f'expected_failure: {self.expected_failure} | skipped: {self.skipped} | errored: {self.errored} | ' \ - f'unexpected_success: {self.unexpected_success} | failed: {self.failed}]' + return ( + f"[TestCase name: {self.name} | class_name: {self.class_name} | file: {self.file} | time: {self.time} | " + f"expected_failure: {self.expected_failure} | skipped: {self.skipped} | errored: {self.errored} | " + f"unexpected_success: {self.unexpected_success} | failed: {self.failed}]\n" + ) + class TestSuite: def __init__(self, name: str) -> None: @@ -612,10 +687,12 @@ def __init__(self, name: str) -> None: self.expected_failure_count = 0 def __repr__(self) -> str: - rc = f'{self.name} run_time: {self.total_time:.2f} tests: {len(self.test_cases)}' + rc = ( + f"{self.name} run_time: {self.total_time:.2f} tests: {len(self.test_cases)}" + ) if self.skipped_count > 0: - rc += f' skipped: {self.skipped_count}' - return f'TestSuite({rc})' + rc += f" skipped: {self.skipped_count}" + return f"TestSuite({rc})" def append(self, test_case: TestCase) -> None: self.test_cases[test_case.name] = test_case @@ -628,7 +705,9 @@ def append(self, test_case: TestCase) -> None: def update(self, test_case: TestCase) -> None: name = test_case.name - assert name in self.test_cases, f'Error: attempting to replace nonexistent test case {name}' + assert ( + name in self.test_cases + ), f"Error: attempting to replace nonexistent test case {name}" # Note that time for unexpected successes and expected failures are reported as 0s self.test_cases[name].time += test_case.time self.test_cases[name].failed |= test_case.failed @@ -637,54 +716,46 @@ def update(self, test_case: TestCase) -> None: self.test_cases[name].unexpected_success |= test_case.unexpected_success self.test_cases[name].expected_failure |= test_case.expected_failure - def 
print_report(self, num_longest: int = 3) -> None: - sorted_tests = sorted(self.test_cases.values(), key=lambda x: x.time) - test_count = len(sorted_tests) - print(f"class {self.name}:") - print( - f" tests: {test_count} failed: {self.failed_count} skipped: {self.skipped_count} " - f"errored: {self.errored_count} unexpected_success: {self.unexpected_success_count} " - f"expected_failure: {self.expected_failure_count}") - print(f" run_time: {self.total_time:.2f} seconds") - print(f" avg_time: {self.total_time/test_count:.2f} seconds") - if test_count >= 2: - print(f" median_time: {statistics.median(x.time for x in sorted_tests):.2f} seconds") - sorted_tests = sorted_tests[-num_longest:] - print(f" {len(sorted_tests)} longest tests:") - for test in reversed(sorted_tests): - print(f" {test.name} time: {test.time:.2f} seconds") - print("") + +# Tests that spawn duplicates (usually only twice) intentionally +MULTITESTS = [ + "test_cpp_extensions_aot", + "distributed/test_distributed_spawn", + "distributed\\test_distributed_spawn", # for windows + "distributed/test_c10d_gloo", + "distributed\\test_c10d_gloo", # for windows + "cpp", # The caffe2 cpp tests spawn duplicate test cases as well. +] + DuplicatedDict = Dict[str, Dict[str, List[TestCase]]] + class TestFile: def __init__(self, name: str) -> None: self.name = name self.total_time = 0.0 self.test_suites: Dict[str, TestSuite] = dict() - def append(self, test_case: TestCase, test_type: str, duplicated_tests_dict: DuplicatedDict) -> None: - is_multi_test = self.name == 'test_cpp_extensions_aot' or \ - self.name == 'distributed/test_distributed_spawn' or \ - self.name == 'distributed/test_c10d_gloo' or \ - self.name == 'cpp' # The caffe2 cpp tests spawn duplicate test cases as well. - if is_multi_test: - suite_name = test_case.class_name + '__' + test_type - else: - suite_name = test_case.class_name + def append( + self, test_case: TestCase, test_type: str, duplicated_tests_dict: DuplicatedDict + ) -> None: + suite_name = test_case.class_name if suite_name not in self.test_suites: self.test_suites[suite_name] = TestSuite(suite_name) if test_case.name in self.test_suites[suite_name].test_cases: - if is_multi_test: + if self.name in MULTITESTS: self.test_suites[suite_name].update(test_case) self.total_time += test_case.time - else: - # Gather up duplicated test cases - if suite_name not in duplicated_tests_dict: - duplicated_tests_dict[suite_name] = dict() - if test_case.name not in duplicated_tests_dict[suite_name]: - duplicated_tests_dict[suite_name][test_case.name] = [self.test_suites[suite_name].test_cases[test_case.name]] - duplicated_tests_dict[suite_name][test_case.name].append(test_case) + + # Gather up duplicated test cases to parse for flaky reruns + if suite_name not in duplicated_tests_dict: + duplicated_tests_dict[suite_name] = dict() + if test_case.name not in duplicated_tests_dict[suite_name]: + duplicated_tests_dict[suite_name][test_case.name] = [ + self.test_suites[suite_name].test_cases[test_case.name] + ] + duplicated_tests_dict[suite_name][test_case.name].append(test_case) else: self.test_suites[suite_name].append(test_case) self.total_time += test_case.time @@ -696,7 +767,7 @@ def parse_report(path: str) -> Iterator[TestCase]: except Exception as e: print(f"Error occurred when parsing {path}: {e}") return - for test_case in dom.getElementsByTagName('testcase'): + for test_case in dom.getElementsByTagName("testcase"): yield TestCase(test_case) @@ -716,11 +787,11 @@ def get_recursive_files(folder: str, extension: str) -> 
Iterable[str]: def parse_reports(folder: str) -> Tuple[Dict[str, TestFile], Dict[str, DuplicatedDict]]: tests_by_file = dict() - duplicated_tests_by_file : Dict[str, DuplicatedDict] = dict() + duplicated_tests_by_file: Dict[str, DuplicatedDict] = dict() for report in get_recursive_files(folder, ".xml"): report_path = Path(report) # basename of the directory of test-report is the test filename - test_filename = re.sub(r'\.', '/', report_path.parent.name) + test_filename = re.sub(r"\.", "/", report_path.parent.name) # test type is the parent directory (only applies to dist-*) # See: CUSTOM_HANDLERS in test/run_test.py test_type = report_path.parent.parent.name @@ -729,7 +800,9 @@ def parse_reports(folder: str) -> Tuple[Dict[str, TestFile], Dict[str, Duplicate if test_filename not in tests_by_file: tests_by_file[test_filename] = TestFile(test_filename) for test_case in parse_report(report): - tests_by_file[test_filename].append(test_case, test_type, duplicated_tests_by_file[test_filename]) + tests_by_file[test_filename].append( + test_case, test_type, duplicated_tests_by_file[test_filename] + ) return tests_by_file, duplicated_tests_by_file @@ -754,59 +827,74 @@ def process_intentional_test_runs(runs: List[TestCase]) -> Tuple[int, int]: else: num_pass += 1 - REPEAT_TEST_FOR_TYPES_TESTS = [ - "test_data_parallel_module", - "test_data_parallel_module_kwargs_only", - "test_data_parallel_module_kwargs_only_empty_list", - "test_data_parallel_module_kwargs_only_empty_dict", - "test_data_parallel_module_kwargs_only_empty_tuple" - ] - - # Do not run checks for tests that use repeat_test_for_types decorator as they do not go well with our retry - # functionality. Once issue https://github.com/pytorch/pytorch/issues/69865 is fixed, we should remove the exception - if not any([x in test_run.name for x in REPEAT_TEST_FOR_TYPES_TESTS]): - err_msg = f'Warning: unintentional test case duplicates found for {test_run.name} in suite {test_run.class_name}.' - report_only = os.getenv('PYTORCH_OVERRIDE_FLAKY_SIGNAL') != '1' - if report_only and num_fail + num_errored + num_unexpected_success < 1 or not report_only and num_expected_fail < 1: - raise RuntimeWarning(f'{err_msg} Intentional reruns are only triggered when the first run fails or errors, but' - ' we found no failures nor errors.') + # Do not run duplication checks for test files that spawn duplicate tests intentionally + # and are not necessarily flaky test reruns. + if not any(x in test_run.file for x in MULTITESTS): + err_msg = f"Warning: unintentional test case duplicates found for {test_run.name} in suite {test_run.class_name}." + report_only = os.getenv("PYTORCH_OVERRIDE_FLAKY_SIGNAL") != "1" + if ( + report_only + and num_fail + num_errored + num_unexpected_success < 1 + or not report_only + and num_expected_fail < 1 + ): + raise RuntimeWarning( + f"{err_msg} Intentional reruns are only triggered when the first run fails or errors, but" + " we found no failures nor errors." + ) if num_unexpected_success + num_expected_fail < 1: - raise RuntimeWarning(f'{err_msg} Intentional reruns should raise at least one unexpected success or expected ' - 'failure, but none have been found.') + raise RuntimeWarning( + f"{err_msg} Intentional reruns should raise at least one unexpected success or expected " + "failure, but none have been found." + ) if report_only and num_pass != num_unexpected_success: - raise RuntimeWarning(f'{err_msg} Every success in an intentional rerun is shadowed by one unexpected success.' 
- f'However, successes = {num_pass} and unexpected successes = {num_unexpected_success}') + raise RuntimeWarning( + f"{err_msg} Every success in an intentional rerun is shadowed by one unexpected success." + f"However, successes = {num_pass} and unexpected successes = {num_unexpected_success}" + ) if not report_only and num_pass > 1: - raise RuntimeWarning(f'{err_msg} There should be at most 1 successful run in an intentional rerun that stops' - f' at first success. The number of successful runs = {num_pass}') + raise RuntimeWarning( + f"{err_msg} There should be at most 1 successful run in an intentional rerun that stops" + f" at first success. The number of successful runs = {num_pass}" + ) if num_skipped > 0: - raise RuntimeWarning(f'{err_msg} No skips should occur in intentional reruns, but skips = {num_skipped}') - return max(num_unexpected_success, num_pass), num_fail + num_expected_fail + num_errored + raise RuntimeWarning( + f"{err_msg} No skips should occur in intentional reruns, but skips = {num_skipped}" + ) + return ( + max(num_unexpected_success, num_pass), + num_fail + num_expected_fail + num_errored, + ) -def assemble_flaky_test_stats(duplicated_tests_by_file: Dict[str, DuplicatedDict]) -> Any: +def write_flaky_test_stats_to_rockset( + duplicated_tests_by_file: Dict[str, DuplicatedDict] +) -> Any: flaky_tests = [] - workflow_id = os.environ.get("GITHUB_RUN_ID", os.environ.get("CIRCLE_WORKFLOW_ID", None)) + workflow_id = os.environ.get( + "GITHUB_RUN_ID", os.environ.get("CIRCLE_WORKFLOW_ID", None) + ) for file_name, suite_to_dict in duplicated_tests_by_file.items(): for suite_name, testcase_to_runs in suite_to_dict.items(): for testcase_name, list_of_runs in testcase_to_runs.items(): num_green, num_red = process_intentional_test_runs(list_of_runs) - if num_green > 0: # Otherwise, it's likely just a failing test - flaky_tests.append({ - "name": testcase_name, - "suite": suite_name, - "file": file_name, - "num_green": num_green, - "num_red": num_red, - }) + if ( + num_green > 0 and num_red > 0 + ): # Flaky tests show different results in consecutive reruns + flaky_tests.append( + { + "name": testcase_name, + "suite": suite_name, + "file": file_name, + "num_green": num_green, + "num_red": num_red, + } + ) if len(flaky_tests) > 0: - # write to RDS - register_rds_schema("flaky_tests", schema_from_sample(flaky_tests[0])) - rds_write("flaky_tests", flaky_tests, only_on_master=False) - - # write to S3 to go to Rockset as well import uuid + for flaky_test in flaky_tests: + flaky_test["job_id"] = os.environ["GHA_WORKFLOW_JOB_ID"] flaky_test["workflow_id"] = workflow_id key = f"flaky_tests/{workflow_id}/{uuid.uuid4()}.json" obj = get_S3_object_from_bucket("ossci-raw-job-status", key) @@ -818,11 +906,17 @@ def build_info() -> ReportMetaMeta: "build_pr": os.environ.get("PR_NUMBER", os.environ.get("CIRCLE_PR_NUMBER", "")), "build_tag": os.environ.get("TAG", os.environ.get("CIRCLE_TAG", "")), "build_sha1": os.environ.get("SHA1", os.environ.get("CIRCLE_SHA1", "")), - "build_base_commit": get_base_commit(os.environ.get("SHA1", os.environ.get("CIRCLE_SHA1", "HEAD"))), + "build_base_commit": get_base_commit( + os.environ.get("SHA1", os.environ.get("CIRCLE_SHA1", "HEAD")) + ), "build_branch": os.environ.get("BRANCH", os.environ.get("CIRCLE_BRANCH", "")), "build_job": os.environ.get("JOB_BASE_NAME", os.environ.get("CIRCLE_JOB", "")), - "build_workflow_id": os.environ.get("WORKFLOW_ID", os.environ.get("CIRCLE_WORKFLOW_ID", "")), - "build_start_time_epoch": 
str(int(os.path.getmtime(os.path.realpath(__file__)))), + "build_workflow_id": os.environ.get( + "WORKFLOW_ID", os.environ.get("CIRCLE_WORKFLOW_ID", "") + ), + "build_start_time_epoch": str( + int(os.path.getmtime(os.path.realpath(__file__))) + ), } @@ -830,7 +924,7 @@ def build_message( test_file: TestFile, test_suite: TestSuite, test_case: TestCase, - meta_info: ReportMetaMeta + meta_info: ReportMetaMeta, ) -> Dict[str, Dict[str, Any]]: return { "normal": { @@ -856,7 +950,9 @@ def send_report_to_scribe(reports: Dict[str, TestFile]) -> None: [ { "category": "perfpipe_pytorch_test_times", - "message": json.dumps(build_message(test_file, test_suite, test_case, meta_info)), + "message": json.dumps( + build_message(test_file, test_suite, test_case, meta_info) + ), "line_escape": False, } for test_file in reports.values() @@ -875,50 +971,50 @@ def assemble_s3_object( ) -> Version2Report: return { **build_info(), # type: ignore[misc] - 'total_seconds': total_seconds, - 'format_version': 2, - 'files': { + "total_seconds": total_seconds, + "format_version": 2, + "files": { name: { - 'total_seconds': test_file.total_time, - 'suites': { + "total_seconds": test_file.total_time, + "suites": { name: { - 'total_seconds': suite.total_time, - 'cases': { + "total_seconds": suite.total_time, + "cases": { name: { - 'seconds': case.time, - 'status': 'errored' if case.errored else - 'failed' if case.failed else - 'skipped' if case.skipped else None + "seconds": case.time, + "status": "errored" + if case.errored + else "failed" + if case.failed + else "skipped" + if case.skipped + else None, } for name, case in suite.test_cases.items() }, } for name, suite in test_file.test_suites.items() - } + }, } for name, test_file in reports.items() - } + }, } def send_report_to_s3(head_report: Version2Report) -> None: - job = os.getenv('JOB_BASE_NAME', os.environ.get('CIRCLE_JOB')) - sha1 = os.environ.get('SHA1', os.environ.get('CIRCLE_SHA1', '')) - branch = os.environ.get('BRANCH', os.environ.get('CIRCLE_BRANCH', '')) + job = os.getenv("JOB_BASE_NAME", os.environ.get("CIRCLE_JOB")) + sha1 = os.environ.get("SHA1", os.environ.get("CIRCLE_SHA1", "")) now = datetime.datetime.utcnow().isoformat() # SHARD_NUMBER and TEST_CONFIG are specific to GHA, as these details would be included in CIRCLE_JOB already - shard = os.environ.get('SHARD_NUMBER', '') - test_config = os.environ.get('TEST_CONFIG') + shard = os.environ.get("SHARD_NUMBER", "") + test_config = os.environ.get("TEST_CONFIG") - job_report_dirname = f'{job}{f"-{test_config}" if test_config is not None else ""}{shard}' - - if branch not in ['master', 'nightly'] and not branch.startswith("release/"): - pr = os.environ.get('PR_NUMBER', os.environ.get('CIRCLE_PR_NUMBER', 'unknown')) - key = f'pr_test_time/{pr}/{sha1}/{job_report_dirname}/{now}Z.json.bz2' # Z meaning UTC - else: - key = f'test_time/{sha1}/{job_report_dirname}/{now}Z.json.bz2' # Z meaning UTC - obj = get_S3_object_from_bucket('ossci-metrics', key) + job_report_dirname = ( + f'{job}{f"-{test_config}" if test_config is not None else ""}{shard}' + ) + key = f"test_time/{sha1}/{job_report_dirname}/{now}Z.json.bz2" # Z meaning UTC + obj = get_S3_object_from_bucket("ossci-metrics", key) # use bz2 because the results are smaller than gzip, and the # compression time penalty we pay is only about half a second for # input files of a few megabytes in size like these JSON files, and @@ -927,46 +1023,25 @@ def send_report_to_s3(head_report: Version2Report) -> None: 
obj.put(Body=bz2.compress(json.dumps(head_report).encode())) -def upload_failures_to_rds(reports: Dict[str, TestFile]) -> None: - """ - We have 40k+ tests, so saving every test for every commit is not very - feasible for PyTorch. Most of these are things we don't care about anyways, - so this code filters out failures and saves only those to the DB. - """ - # Gather all failures across the entire report - failures = [] - for file in reports.values(): - for suite in file.test_suites.values(): - for case in suite.test_cases.values(): - if case.errored or case.failed: - failures.append({ - "name": case.name, - "suite": suite.name, - "file": file.name, - "status": "failure" if case.failed else "error" - }) - - if len(failures) > 0: - register_rds_schema("test_failures", schema_from_sample(failures[0])) - rds_write("test_failures", failures, only_on_master=False) - - def print_regressions(head_report: Report, *, num_prev_commits: int) -> None: sha1 = os.environ.get("SHA1", os.environ.get("CIRCLE_SHA1", "HEAD")) base = get_base_commit(sha1) count_spec = f"{base}..{sha1}" - intermediate_commits = int(subprocess.check_output( - ["git", "rev-list", "--count", count_spec], - encoding="ascii" - )) - ancestry_path = int(subprocess.check_output( - ["git", "rev-list", "--ancestry-path", "--count", count_spec], - encoding="ascii", - )) + intermediate_commits = int( + subprocess.check_output( + ["git", "rev-list", "--count", count_spec], encoding="ascii" + ) + ) + ancestry_path = int( + subprocess.check_output( + ["git", "rev-list", "--ancestry-path", "--count", count_spec], + encoding="ascii", + ) + ) - # if current commit is already on master, we need to exclude it from + # if current commit is already on main, we need to exclude it from # this history; otherwise we include the merge-base commits = subprocess.check_output( ["git", "rev-list", f"--max-count={num_prev_commits+1}", base], @@ -989,15 +1064,18 @@ def print_regressions(head_report: Report, *, num_prev_commits: int) -> None: objects[commit].extend(summary) print() - print(regression_info( - head_sha=sha1, - head_report=head_report, - base_reports=objects, - job_name=job, - on_master=on_master, - ancestry_path=ancestry_path - 1, - other_ancestors=intermediate_commits - ancestry_path, - ), end="") + print( + regression_info( + head_sha=sha1, + head_report=head_report, + base_reports=objects, + job_name=job, + on_master=on_master, + ancestry_path=ancestry_path - 1, + other_ancestors=intermediate_commits - ancestry_path, + ), + end="", + ) def positive_integer(value: str) -> float: @@ -1022,9 +1100,10 @@ def reports_has_no_tests(reports: Dict[str, TestFile]) -> bool: return True -if __name__ == '__main__': +if __name__ == "__main__": import argparse import sys + parser = argparse.ArgumentParser( "Print statistics from test XML output.", formatter_class=argparse.ArgumentDefaultsHelpFormatter, @@ -1080,9 +1159,8 @@ def reports_has_no_tests(reports: Dict[str, TestFile]) -> bool: args = parser.parse_args() reports_by_file, duplicated_tests_by_file = parse_reports(args.folder) - assemble_flaky_test_stats(duplicated_tests_by_file) + write_flaky_test_stats_to_rockset(duplicated_tests_by_file) - upload_failures_to_rds(reports_by_file) if reports_has_no_tests(reports_by_file): print(f"No tests in reports found in {args.folder}") sys.exit(0) @@ -1092,16 +1170,10 @@ def reports_has_no_tests(reports: Dict[str, TestFile]) -> bool: except Exception as e: print(f"ERROR ENCOUNTERED WHEN UPLOADING TO SCRIBE: {e}") - # longest_tests can contain duplicates as the 
same tests can be spawned from different files - longest_tests: List[TestCase] = [] total_time = 0.0 for filename, test_filename in reports_by_file.items(): for suite_name, test_suite in test_filename.test_suites.items(): total_time += test_suite.total_time - if test_suite.total_time >= args.class_print_threshold: - test_suite.print_report(args.longest_of_class) - longest_tests.extend(test_suite.test_cases.values()) - longest_tests = sorted(longest_tests, key=lambda x: x.time)[-args.longest_of_run:] obj = assemble_s3_object(reports_by_file, total_seconds=total_time) @@ -1111,14 +1183,6 @@ def reports_has_no_tests(reports: Dict[str, TestFile]) -> bool: except Exception as e: print(f"ERROR ENCOUNTERED WHEN UPLOADING TO S3: {e}") - print(f"Total runtime is {datetime.timedelta(seconds=total_time)}") - print( - f"{len(longest_tests)} longest tests of entire run" - f" (ignoring suites totaling less than {args.class_print_threshold} seconds):" - ) - for test_case in reversed(longest_tests): - print(f" {test_case.class_name}.{test_case.name} time: {test_case.time:.2f} seconds") - if args.compare_with_s3: head_json = obj if args.use_json: diff --git a/tools/stats/s3_stat_parser.py b/tools/stats/s3_stat_parser.py index 71474bf487cd..666b9f6b4547 100644 --- a/tools/stats/s3_stat_parser.py +++ b/tools/stats/s3_stat_parser.py @@ -10,6 +10,7 @@ try: import boto3 # type: ignore[import] import botocore # type: ignore[import] + HAVE_BOTO3 = True except ImportError: HAVE_BOTO3 = False @@ -18,10 +19,10 @@ logger = logging.getLogger(__name__) -OSSCI_METRICS_BUCKET = 'ossci-metrics' +OSSCI_METRICS_BUCKET = "ossci-metrics" Commit = str # 40-digit SHA-1 hex string -Status = Optional[Literal['errored', 'failed', 'skipped']] +Status = Optional[Literal["errored", "failed", "skipped"]] class CaseMeta(TypedDict): @@ -85,8 +86,10 @@ class Version2Report(VersionedReport): Report = Union[Version1Report, VersionedReport] if HAVE_BOTO3: - S3_RESOURCE_READ_ONLY = boto3.resource("s3", config=botocore.config.Config(signature_version=botocore.UNSIGNED)) - S3_RESOURCE = boto3.resource('s3') + S3_RESOURCE_READ_ONLY = boto3.resource( + "s3", config=botocore.config.Config(signature_version=botocore.UNSIGNED) + ) + S3_RESOURCE = boto3.resource("s3") def get_S3_bucket_readonly(bucket_name: str) -> Any: @@ -98,16 +101,16 @@ def get_S3_object_from_bucket(bucket_name: str, object: str) -> Any: def case_status(case: Version1Case) -> Status: - for k in {'errored', 'failed', 'skipped'}: - if case[k]: # type: ignore[misc] + for k in {"errored", "failed", "skipped"}: + if case[k]: # type: ignore[literal-required] return cast(Status, k) return None def newify_case(case: Version1Case) -> Version2Case: return { - 'seconds': case['seconds'], - 'status': case_status(case), + "seconds": case["seconds"], + "status": case_status(case), } @@ -119,28 +122,28 @@ def get_cases( test_name: Optional[str], ) -> List[Version2Case]: cases: List[Version2Case] = [] - if 'format_version' not in data: # version 1 implicitly + if "format_version" not in data: # version 1 implicitly v1report = cast(Version1Report, data) - suites = v1report['suites'] + suites = v1report["suites"] for sname, v1suite in suites.items(): if not suite_name or sname == suite_name: - for v1case in v1suite['cases']: - if not test_name or v1case['name'] == test_name: + for v1case in v1suite["cases"]: + if not test_name or v1case["name"] == test_name: cases.append(newify_case(v1case)) else: v_report = cast(VersionedReport, data) - version = v_report['format_version'] + version = 
v_report["format_version"] if version == 2: v2report = cast(Version2Report, v_report) - for fname, v2file in v2report['files'].items(): + for fname, v2file in v2report["files"].items(): if fname == filename or not filename: - for sname, v2suite in v2file['suites'].items(): + for sname, v2suite in v2file["suites"].items(): if sname == suite_name or not suite_name: - for cname, v2case in v2suite['cases'].items(): + for cname, v2case in v2suite["cases"].items(): if not test_name or cname == test_name: cases.append(v2case) else: - raise RuntimeError(f'Unknown format version: {version}') + raise RuntimeError(f"Unknown format version: {version}") return cases @@ -148,19 +151,22 @@ def _parse_master_summaries(summaries: Any, jobs: List[str]) -> Dict[str, List[R summary_dict = defaultdict(list) for summary in summaries: # master summary format: "test_time/{sha}/{job}/file" - summary_job = summary.key.split('/')[2] + summary_job = summary.key.split("/")[2] if summary_job in jobs or len(jobs) == 0: binary = summary.get()["Body"].read() string = bz2.decompress(binary).decode("utf-8") summary_dict[summary_job].append(json.loads(string)) return summary_dict -def _parse_pr_summaries(summaries: Any, job_prefix: str) -> Dict[str, List[Tuple[Report, str]]]: + +def _parse_pr_summaries( + summaries: Any, job_prefix: str +) -> Dict[str, List[Tuple[Report, str]]]: summary_dict = defaultdict(list) for summary in summaries: # PR summary format: "pr_test_time/{pr}/{sha}/{job}/file" - summary_job = summary.key.split('/')[3] - summary_timestamp = summary.key.split('/')[4][:len("YYYY-MM-ddTHH:mm:ss")] + summary_job = summary.key.split("/")[3] + summary_timestamp = summary.key.split("/")[4][: len("YYYY-MM-ddTHH:mm:ss")] if not job_prefix or len(job_prefix) == 0 or summary_job.startswith(job_prefix): binary = summary.get()["Body"].read() string = bz2.decompress(binary).decode("utf-8") @@ -171,18 +177,25 @@ def _parse_pr_summaries(summaries: Any, job_prefix: str) -> Dict[str, List[Tuple # Collect and decompress S3 test stats summaries into JSON. # data stored on S3 buckets are pathed by {sha}/{job} so we also allow # optional jobs filter -def get_test_stats_summaries(*, sha: str, jobs: Optional[List[str]] = None) -> Dict[str, List[Report]]: +def get_test_stats_summaries( + *, sha: str, jobs: Optional[List[str]] = None +) -> Dict[str, List[Report]]: bucket = get_S3_bucket_readonly(OSSCI_METRICS_BUCKET) summaries = bucket.objects.filter(Prefix=f"test_time/{sha}") return _parse_master_summaries(summaries, jobs=list(jobs or [])) -def get_test_stats_summaries_for_job(*, sha: str, job_prefix: str) -> Dict[str, List[Report]]: +def get_test_stats_summaries_for_job( + *, sha: str, job_prefix: str +) -> Dict[str, List[Report]]: bucket = get_S3_bucket_readonly(OSSCI_METRICS_BUCKET) summaries = bucket.objects.filter(Prefix=f"test_time/{sha}/{job_prefix}") return _parse_master_summaries(summaries, jobs=list()) -def get_test_stats_summaries_for_pr(*, pr: str, job_prefix: str) -> Dict[str, List[Tuple[Report, str]]]: + +def get_test_stats_summaries_for_pr( + *, pr: str, job_prefix: str +) -> Dict[str, List[Tuple[Report, str]]]: bucket = get_S3_bucket_readonly(OSSCI_METRICS_BUCKET) summaries = bucket.objects.filter(Prefix=f"pr_test_time/{pr}/") return _parse_pr_summaries(summaries, job_prefix=job_prefix) @@ -191,35 +204,50 @@ def get_test_stats_summaries_for_pr(*, pr: str, job_prefix: str) -> Dict[str, Li # This function returns a list of S3 test time reports. 
This function can run into errors if HAVE_BOTO3 = False # or the S3 bucket is somehow unavailable. Even though this function goes through ten commits' reports to find a # non-empty report, it is still conceivable (though highly unlikely) for this function to return no reports. -def get_previous_reports_for_branch(branch: str, ci_job_prefix: str = "") -> List[Report]: +def get_previous_reports_for_branch( + branch: str, ci_job_prefix: str = "" +) -> List[Report]: commit_date_ts = subprocess.check_output( - ['git', 'show', '-s', '--format=%ct', 'HEAD'], - encoding="ascii").strip() + ["git", "show", "-s", "--format=%ct", "HEAD"], encoding="ascii" + ).strip() commit_date = datetime.fromtimestamp(int(commit_date_ts)) # We go a day before this current commit to avoiding pulling incomplete reports - day_before_commit = str(commit_date - timedelta(days=1)).split(' ')[0] + day_before_commit = str(commit_date - timedelta(days=1)).split(" ")[0] # something like git rev-list --before="2021-03-04" --max-count=10 --remotes="*origin/nightly" commits = subprocess.check_output( - ["git", "rev-list", f"--before={day_before_commit}", "--max-count=10", f"--remotes=*{branch}"], - encoding="ascii").splitlines() + [ + "git", + "rev-list", + f"--before={day_before_commit}", + "--max-count=10", + f"--remotes=*{branch}", + ], + encoding="ascii", + ).splitlines() reports: List[Report] = [] commit_index = 0 while len(reports) == 0 and commit_index < len(commits): commit = commits[commit_index] - logger.info(f'Grabbing reports from commit: {commit}') - summaries = get_test_stats_summaries_for_job(sha=commit, job_prefix=ci_job_prefix) + logger.info(f"Grabbing reports from commit: {commit}") + summaries = get_test_stats_summaries_for_job( + sha=commit, job_prefix=ci_job_prefix + ) for job_name, summary in summaries.items(): reports.append(summary[0]) if len(summary) > 1: - logger.warning(f'WARNING: Multiple summary objects found for {commit}/{job_name}') + logger.warning( + f"WARNING: Multiple summary objects found for {commit}/{job_name}" + ) commit_index += 1 return reports -def get_previous_reports_for_pr(pr: str, ci_job_prefix: str = "") -> List[Tuple[Report, str]]: +def get_previous_reports_for_pr( + pr: str, ci_job_prefix: str = "" +) -> List[Tuple[Report, str]]: reports: List[Tuple[Report, str]] = [] - logger.info(f'Grabbing reports from PR: {[pr]}') + logger.info(f"Grabbing reports from PR: {[pr]}") summaries = get_test_stats_summaries_for_pr(pr=pr, job_prefix=ci_job_prefix) for _, summary in summaries.items(): reports.extend(summary) diff --git a/tools/stats/test_history.py b/tools/stats/test_history.py index 24678aabba93..83751441bb7d 100755 --- a/tools/stats/test_history.py +++ b/tools/stats/test_history.py @@ -7,17 +7,12 @@ from signal import SIG_DFL, SIGPIPE, signal from typing import Dict, Iterator, List, Optional, Set, Tuple -from tools.stats.s3_stat_parser import (Report, get_cases, - get_test_stats_summaries) +from tools.stats.s3_stat_parser import Report, get_cases, get_test_stats_summaries -def get_git_commit_history( - *, - path: str, - ref: str -) -> List[Tuple[str, datetime]]: +def get_git_commit_history(*, path: str, ref: str) -> List[Tuple[str, datetime]]: rc = subprocess.check_output( - ['git', '-C', path, 'log', '--pretty=format:%H %ct', ref], + ["git", "-C", path, "log", "--pretty=format:%H %ct", ref], ).decode("latin-1") return [ (x[0], datetime.fromtimestamp(int(x[1]), tz=timezone.utc)) @@ -37,23 +32,20 @@ def make_column( num_length = digits + 1 + decimals if data: cases = get_cases( - 
data=data, - filename=filename, - suite_name=suite_name, - test_name=test_name + data=data, filename=filename, suite_name=suite_name, test_name=test_name ) if cases: case = cases[0] - status = case['status'] + status = case["status"] omitted = len(cases) - 1 if status: - return f'{status.rjust(num_length)} ', omitted + return f"{status.rjust(num_length)} ", omitted else: return f'{case["seconds"]:{num_length}.{decimals}f}s', omitted else: return f'{"absent".rjust(num_length)} ', 0 else: - return ' ' * (num_length + 1), 0 + return " " * (num_length + 1), 0 def make_columns( @@ -83,10 +75,10 @@ def make_columns( if job in omitted: total_omitted += omitted[job] if total_omitted > 0: - columns.append(f'({total_omitted} job re-runs omitted)') + columns.append(f"({total_omitted} job re-runs omitted)") if total_suites > 0: - columns.append(f'({total_suites} matching suites omitted)') - return ' '.join(columns) + columns.append(f"({total_suites} matching suites omitted)") + return " ".join(columns) def make_lines( @@ -108,17 +100,17 @@ def make_lines( ) if cases: case = cases[0] - status = case['status'] + status = case["status"] line = f'{job} {case["seconds"]}s{f" {status}" if status else ""}' if len(cases) > 1: - line += f' ({len(cases) - 1} matching suites omitted)' + line += f" ({len(cases) - 1} matching suites omitted)" lines.append(line) elif job in jobs: - lines.append(f'{job} (test not found)') + lines.append(f"{job} (test not found)") if lines: return lines else: - return ['(no reports in S3)'] + return ["(no reports in S3)"] def history_lines( @@ -142,26 +134,24 @@ def history_lines( summaries = get_test_stats_summaries(sha=sha) else: summaries = get_test_stats_summaries(sha=sha, jobs=jobs) - if mode == 'columns': + if mode == "columns": assert jobs is not None # we assume that get_test_stats_summaries here doesn't # return empty lists - omitted = { - job: len(l) - 1 - for job, l in summaries.items() - if len(l) > 1 - } - lines = [make_columns( - jobs=jobs, - jsons={job: l[0] for job, l in summaries.items()}, - omitted=omitted, - filename=filename, - suite_name=suite_name, - test_name=test_name, - digits=digits, - )] + omitted = {job: len(l) - 1 for job, l in summaries.items() if len(l) > 1} + lines = [ + make_columns( + jobs=jobs, + jsons={job: l[0] for job, l in summaries.items()}, + omitted=omitted, + filename=filename, + suite_name=suite_name, + test_name=test_name, + digits=digits, + ) + ] else: - assert mode == 'multiline' + assert mode == "multiline" lines = make_lines( jobs=set(jobs or []), jsons=summaries, @@ -181,7 +171,7 @@ class HelpFormatter( def description() -> str: - return r''' + return r""" Display the history of a test. Each line of (non-error) output starts with the timestamp and SHA1 hash @@ -193,55 +183,50 @@ def description() -> str: followed by the time of the specified test in that job at that commit. 
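As an aside on how the cells in these examples are rendered: make_column above packs each value into a fixed-width slot of digits + 1 + decimals characters. A rough sketch, assuming the default --digits of 4 and three decimal places (the decimals value is an assumption, chosen to match the sample output below):

    digits, decimals = 4, 3                   # decimals assumed to be 3 here
    num_length = digits + 1 + decimals        # 8 characters for the numeric part
    f"{0.573:{num_length}.{decimals}f}s"      # -> '   0.573s'
    f'{"errored".rjust(num_length)} '         # -> ' errored '
    f'{"absent".rjust(num_length)} '          # -> '  absent '
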
Example: - $ tools/stats/test_history.py --mode=multiline --ref=594a66 --sha-length=8 --test=test_set_dir \ - --job pytorch_linux_xenial_py3_6_gcc5_4_test --job pytorch_linux_xenial_py3_6_gcc7_test - 2021-02-10 11:13:34Z 594a66d7 pytorch_linux_xenial_py3_6_gcc5_4_test 0.36s - 2021-02-10 11:13:34Z 594a66d7 pytorch_linux_xenial_py3_6_gcc7_test 0.573s errored - 2021-02-10 10:13:25Z 9c0caf03 pytorch_linux_xenial_py3_6_gcc5_4_test 0.819s - 2021-02-10 10:13:25Z 9c0caf03 pytorch_linux_xenial_py3_6_gcc7_test 0.449s - 2021-02-10 10:09:14Z 602434bc pytorch_linux_xenial_py3_6_gcc5_4_test 0.361s - 2021-02-10 10:09:14Z 602434bc pytorch_linux_xenial_py3_6_gcc7_test 0.454s - 2021-02-10 10:09:10Z 2e35fe95 (no reports in S3) - 2021-02-10 10:09:07Z ff73be7e (no reports in S3) - 2021-02-10 10:05:39Z 74082f0d (no reports in S3) - 2021-02-10 07:42:29Z 0620c96f pytorch_linux_xenial_py3_6_gcc5_4_test 0.414s - 2021-02-10 07:42:29Z 0620c96f pytorch_linux_xenial_py3_6_gcc5_4_test 0.476s - 2021-02-10 07:42:29Z 0620c96f pytorch_linux_xenial_py3_6_gcc7_test 0.377s - 2021-02-10 07:42:29Z 0620c96f pytorch_linux_xenial_py3_6_gcc7_test 0.326s + $ tools/stats/test_history.py --mode=multiline --ref=86a961af879 --sha-length=8 \ + --test=test_composite_compliance_dot_cpu_float32 \ + --job linux-xenial-py3.7-gcc5.4-test-default1 --job linux-xenial-py3.7-gcc7-test-default1 + 2022-02-18 15:47:37Z 86a961af linux-xenial-py3.7-gcc5.4-test-default1 0.001s + 2022-02-18 15:47:37Z 86a961af linux-xenial-py3.7-gcc7-test-default1 0.001s + 2022-02-18 15:12:34Z f5e201e4 linux-xenial-py3.7-gcc5.4-test-default1 0.001s + 2022-02-18 15:12:34Z f5e201e4 linux-xenial-py3.7-gcc7-test-default1 0.001s + 2022-02-18 13:14:56Z 1c0df265 linux-xenial-py3.7-gcc5.4-test-default1 0.001s + 2022-02-18 13:14:56Z 1c0df265 linux-xenial-py3.7-gcc7-test-default1 0.001s + 2022-02-18 13:14:56Z e73eaffd (no reports in S3) + 2022-02-18 06:29:12Z 710f12f5 linux-xenial-py3.7-gcc5.4-test-default1 0.001s Another multiline example, this time with the --all flag: - $ tools/stats/test_history.py --mode=multiline --all --ref=321b9 --delta=12 --sha-length=8 \ - --test=test_qr_square_many_batched_complex_cuda - 2021-01-07 10:04:56Z 321b9883 pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_test2 424.284s - 2021-01-07 10:04:56Z 321b9883 pytorch_linux_xenial_cuda10_2_cudnn7_py3_slow_test 0.006s skipped - 2021-01-07 10:04:56Z 321b9883 pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_test 402.572s - 2021-01-07 10:04:56Z 321b9883 pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_test 287.164s - 2021-01-06 20:58:28Z fcb69d2e pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_test2 436.732s - 2021-01-06 20:58:28Z fcb69d2e pytorch_linux_xenial_cuda10_2_cudnn7_py3_slow_test 0.006s skipped - 2021-01-06 20:58:28Z fcb69d2e pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_test 407.616s - 2021-01-06 20:58:28Z fcb69d2e pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_test 287.044s + $ tools/stats/test_history.py --mode=multiline --all --ref=86a961af879 --delta=12 --sha-length=8 \ + --test=test_composite_compliance_dot_cuda_float32 + 2022-02-18 03:49:46Z 69389fb5 linux-bionic-cuda10.2-py3.9-gcc7-test-default1 0.001s skipped + 2022-02-18 03:49:46Z 69389fb5 linux-bionic-cuda10.2-py3.9-gcc7-test-slow1 0.001s skipped + 2022-02-18 03:49:46Z 69389fb5 linux-xenial-cuda11.3-py3.7-gcc7-test-default1 0.001s skipped + 2022-02-18 03:49:46Z 69389fb5 periodic-linux-bionic-cuda11.5-py3.7-gcc7-test-default1 0.001s skipped + 2022-02-18 03:49:46Z 69389fb5 periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck-test-default1 0.001s 
skipped + 2022-02-18 03:49:46Z 69389fb5 periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug-test-default1 0.001s skipped In columns mode, the name of the job isn't printed, but the order of the columns is guaranteed to match the order of the jobs passed on the command line. Example: - $ tools/stats/test_history.py --mode=columns --ref=3cf783 --sha-length=8 --test=test_set_dir \ - --job pytorch_linux_xenial_py3_6_gcc5_4_test --job pytorch_linux_xenial_py3_6_gcc7_test - 2021-02-10 12:18:50Z 3cf78395 0.644s 0.312s - 2021-02-10 11:13:34Z 594a66d7 0.360s errored - 2021-02-10 10:13:25Z 9c0caf03 0.819s 0.449s - 2021-02-10 10:09:14Z 602434bc 0.361s 0.454s - 2021-02-10 10:09:10Z 2e35fe95 - 2021-02-10 10:09:07Z ff73be7e - 2021-02-10 10:05:39Z 74082f0d - 2021-02-10 07:42:29Z 0620c96f 0.414s 0.377s (2 job re-runs omitted) - 2021-02-10 07:27:53Z 33afb5f1 0.381s 0.294s + $ tools/stats/test_history.py --mode=columns --ref=86a961af879 --sha-length=8 \ + --test=test_composite_compliance_dot_cpu_float32 \ + --job linux-xenial-py3.7-gcc5.4-test-default1 --job linux-xenial-py3.7-gcc7-test-default1 + 2022-02-18 15:47:37Z 86a961af 0.001s 0.001s + 2022-02-18 15:12:34Z f5e201e4 0.001s 0.001s + 2022-02-18 13:14:56Z 1c0df265 0.001s 0.001s + 2022-02-18 13:14:56Z e73eaffd + 2022-02-18 06:29:12Z 710f12f5 0.001s 0.001s + 2022-02-18 05:20:30Z 51b04f27 0.001s 0.001s + 2022-02-18 03:49:46Z 69389fb5 0.001s 0.001s + 2022-02-18 00:19:12Z 056b6260 0.001s 0.001s + 2022-02-17 23:58:32Z 39fb7714 0.001s 0.001s Minor note: in columns mode, a blank cell means that no report was found in S3, while the word "absent" means that a report was found but the indicated test was not found in that report. -''' +""" def parse_args(raw: List[str]) -> argparse.Namespace: @@ -251,61 +236,57 @@ def parse_args(raw: List[str]) -> argparse.Namespace: formatter_class=HelpFormatter, ) parser.add_argument( - '--mode', - choices=['columns', 'multiline'], - help='output format', - default='columns', + "--mode", + choices=["columns", "multiline"], + help="output format", + default="columns", ) parser.add_argument( - '--pytorch', - help='path to local PyTorch clone', - default='.', + "--pytorch", + help="path to local PyTorch clone", + default=".", ) parser.add_argument( - '--ref', - help='starting point (most recent Git ref) to display history for', - default='master', + "--ref", + help="starting point (most recent Git ref) to display history for", + default="master", ) parser.add_argument( - '--delta', + "--delta", type=int, - help='minimum number of hours between commits', + help="minimum number of hours between commits", default=0, ) parser.add_argument( - '--sha-length', + "--sha-length", type=int, - help='length of the prefix of the SHA1 hash to show', + help="length of the prefix of the SHA1 hash to show", default=40, ) parser.add_argument( - '--digits', + "--digits", type=int, - help='(columns) number of digits to display before the decimal point', + help="(columns) number of digits to display before the decimal point", default=4, ) parser.add_argument( - '--all', - action='store_true', - help='(multiline) ignore listed jobs, show all jobs for each commit', - ) - parser.add_argument( - '--file', - help='name of the file containing the test', + "--all", + action="store_true", + help="(multiline) ignore listed jobs, show all jobs for each commit", ) parser.add_argument( - '--suite', - help='name of the suite containing the test', + "--file", + help="name of the file containing the test", ) parser.add_argument( - '--test', - help='name of the test', - 
required=True + "--suite", + help="name of the suite containing the test", ) + parser.add_argument("--test", help="name of the test", required=True) parser.add_argument( - '--job', - help='names of jobs to display columns for, in order', - action='append', + "--job", + help="names of jobs to display columns for, in order", + action="append", default=[], ) args = parser.parse_args(raw) @@ -313,7 +294,7 @@ def parse_args(raw: List[str]) -> argparse.Namespace: args.jobs = None if args.all else args.job # We dont allow implicit or empty "--jobs", unless "--all" is specified. if args.jobs == []: - parser.error('No jobs specified.') + parser.error("No jobs specified.") return args diff --git a/tools/stats/upload_binary_size_to_scuba.py b/tools/stats/upload_binary_size_to_scuba.py index adf1d5076867..aacaf627ec95 100644 --- a/tools/stats/upload_binary_size_to_scuba.py +++ b/tools/stats/upload_binary_size_to_scuba.py @@ -55,7 +55,9 @@ def build_message(size: int) -> Dict[str, Any]: "build_num": os.environ.get("CIRCLE_BUILD_NUM"), "sha1": os.environ.get("SHA1", os.environ.get("CIRCLE_SHA1")), "branch": os.environ.get("BRANCH", os.environ.get("CIRCLE_BRANCH")), - "workflow_id": os.environ.get("WORKFLOW_ID", os.environ.get("CIRCLE_WORKFLOW_ID")), + "workflow_id": os.environ.get( + "WORKFLOW_ID", os.environ.get("CIRCLE_WORKFLOW_ID") + ), }, "int": { "time": int(time.time()), @@ -118,13 +120,17 @@ def gen_messages() -> Generator[Dict[str, Any], None, None]: "pkg_type": "{}/{}/{}".format(android_build_type, arch, lib), "cu_ver": "", # dummy value for derived field `build_name` "py_ver": "", # dummy value for derived field `build_name` - "pr": os.environ.get("PR_NUMBER", os.environ.get("CIRCLE_PR_NUMBER")), + "pr": os.environ.get( + "PR_NUMBER", os.environ.get("CIRCLE_PR_NUMBER") + ), # This is the only place where we use directly CIRCLE_BUILD_NUM, everywhere else CIRCLE_* vars # are used as fallback, there seems to be no direct analogy between circle build number and GHA IDs "build_num": os.environ.get("CIRCLE_BUILD_NUM"), "sha1": os.environ.get("SHA1", os.environ.get("CIRCLE_SHA1")), "branch": os.environ.get("BRANCH", os.environ.get("CIRCLE_BRANCH")), - "workflow_id": os.environ.get("WORKFLOW_ID", os.environ.get("CIRCLE_WORKFLOW_ID")), + "workflow_id": os.environ.get( + "WORKFLOW_ID", os.environ.get("CIRCLE_WORKFLOW_ID") + ), }, "int": { "time": int(time.time()), diff --git a/tools/stats/upload_test_stats.py b/tools/stats/upload_test_stats.py new file mode 100644 index 000000000000..bdc9c9f319da --- /dev/null +++ b/tools/stats/upload_test_stats.py @@ -0,0 +1,210 @@ +import argparse +import os +import requests +import shutil +import zipfile +import xml.etree.ElementTree as ET +from pathlib import Path +from typing import Dict, List, Any + +import rockset # type: ignore[import] +import boto3 # type: ignore[import] + +PYTORCH_REPO = "https://api.github.com/repos/pytorch/pytorch" +GITHUB_TOKEN = os.environ["GITHUB_TOKEN"] +REQUEST_HEADERS = { + "Accept": "application/vnd.github.v3+json", + "Authorization": "token " + GITHUB_TOKEN, +} +S3_RESOURCE = boto3.resource("s3") +TEMP_DIR = Path(os.environ["RUNNER_TEMP"]) / "tmp-test-stats" + + +def parse_xml_report( + report: Path, workflow_id: int, workflow_run_attempt: int +) -> List[Dict[str, Any]]: + """Convert a test report xml file into a JSON-serializable list of test cases.""" + # [Job id in artifacts] + # Retrieve the job id from the report path. 
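For example, with the report layout described in this comment, the job-id extraction just below boils down to (using the same example path that the comment gives):

    from pathlib import Path
    report = Path("unzipped-test-reports-foo_5596745227/test/test-reports/foo/TEST-foo.xml")
    int(report.parts[0].rpartition("_")[2])   # -> 5596745227

And, as a rough sketch of what the process_xml_element helper defined further down in this file produces for a hypothetical JUnit-style element (the element itself is made up):

    import xml.etree.ElementTree as ET
    # made-up element for illustration only
    elem = ET.fromstring(
        '<testcase name="test_foo" classname="test_bar" time="0.001"><foo>hello</foo></testcase>'
    )
    # process_xml_element(elem) returns approximately:
    # {"name": "test_foo", "classname": "test_bar", "time": 0.001, "foo": {"text": "hello"}}
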
In our GHA workflows, we append + # the job id to the end of the report name, so `report` looks like: + # unzipped-test-reports-foo_5596745227/test/test-reports/foo/TEST-foo.xml + # and we want to get `5596745227` out of it. + job_id = int(report.parts[0].rpartition("_")[2]) + + print(f"Parsing test report: {report}, job id: {job_id}") + root = ET.parse(report) + + test_cases = [] + for test_case in root.iter("testcase"): + case = process_xml_element(test_case) + case["workflow_id"] = workflow_id + case["workflow_run_attempt"] = workflow_run_attempt + case["job_id"] = job_id + test_cases.append(case) + + return test_cases + + +def process_xml_element(element: ET.Element) -> Dict[str, Any]: + """Convert a test suite element into a JSON-serializable dict.""" + ret: Dict[str, Any] = {} + + # Convert attributes directly into dict elements. + # e.g. + # + # becomes: + # {"name": "test_foo", "classname": "test_bar"} + ret.update(element.attrib) + + # By default, all attributes are strings. Apply a few special conversions + # here for well-known attributes so that they are the right type in Rockset. + line = ret.get("line") + if line: + ret["line"] = int(line) + time = ret.get("time") + if time: + ret["time"] = float(time) + + # Convert inner and outer text into special dict elements. + # e.g. + # my_inner_text my_tail + # becomes: + # {"text": "my_inner_text", "tail": " my_tail"} + if element.text and element.text.strip(): + ret["text"] = element.text + if element.tail and element.tail.strip(): + ret["tail"] = element.tail + + # Convert child elements recursively, placing them at a key: + # e.g. + # + # hello + # + # becomes + # {"foo": {"text": "hello"}} + for child in element: + ret[child.tag] = process_xml_element(child) + return ret + + +def get_artifact_urls(workflow_run_id: int) -> Dict[Path, str]: + """Get all workflow artifacts with 'test-report' in the name.""" + response = requests.get( + f"{PYTORCH_REPO}/actions/runs/{workflow_run_id}/artifacts?per_page=100", + ) + artifacts = response.json()["artifacts"] + while "next" in response.links.keys(): + response = requests.get(response.links["next"]["url"], headers=REQUEST_HEADERS) + artifacts.extend(response.json()["artifacts"]) + + artifact_urls = {} + for artifact in artifacts: + if "test-report" in artifact["name"]: + artifact_urls[Path(artifact["name"])] = artifact["archive_download_url"] + return artifact_urls + + +def unzip(p: Path) -> None: + """Unzip the provided zipfile to a similarly-named directory. + + Returns None if `p` is not a zipfile. + + Looks like: /tmp/test-reports.zip -> /tmp/unzipped-test-reports/ + """ + assert p.is_file() + unzipped_dir = p.with_name("unzipped-" + p.stem) + + with zipfile.ZipFile(p, "r") as zip: + zip.extractall(unzipped_dir) + + +def download_and_extract_artifact( + artifact_name: Path, artifact_url: str, workflow_run_attempt: int +) -> None: + # [Artifact run attempt] + # All artifacts on a workflow share a single namespace. However, we can + # re-run a workflow and produce a new set of artifacts. To avoid name + # collisions, we add `-runattempt1-` somewhere in the artifact name. + # + # This code parses out the run attempt number from the artifact name. If it + # doesn't match the one specified on the command line, skip it. + atoms = str(artifact_name).split("-") + for atom in atoms: + if atom.startswith("runattempt"): + found_run_attempt = int(atom[len("runattempt") :]) + if workflow_run_attempt != found_run_attempt: + print( + f"Skipping {artifact_name} as it is an invalid run attempt. 
" + f"Expected {workflow_run_attempt}, found {found_run_attempt}." + ) + + print(f"Downloading and extracting {artifact_name}") + + response = requests.get(artifact_url, headers=REQUEST_HEADERS) + with open(artifact_name, "wb") as f: + f.write(response.content) + unzip(artifact_name) + + +def download_and_extract_s3_reports( + workflow_run_id: int, workflow_run_attempt: int +) -> None: + bucket = S3_RESOURCE.Bucket("gha-artifacts") + objs = bucket.objects.filter( + Prefix=f"pytorch/pytorch/{workflow_run_id}/{workflow_run_attempt}/artifact/test-reports" + ) + + for obj in objs: + p = Path(Path(obj.key).name) + print(f"Downloading and extracting {p}") + with open(p, "wb") as f: + f.write(obj.get()["Body"].read()) + unzip(p) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Upload test stats to Rockset") + parser.add_argument( + "--workflow-run-id", + required=True, + help="id of the workflow to get artifacts from", + ) + parser.add_argument( + "--workflow-run-attempt", + type=int, + required=True, + help="which retry of the workflow this is", + ) + args = parser.parse_args() + + if TEMP_DIR.exists(): + print("rm: ", TEMP_DIR) + shutil.rmtree(TEMP_DIR) + + print("mkdir: ", TEMP_DIR) + TEMP_DIR.mkdir() + print("cd to ", TEMP_DIR) + os.chdir(TEMP_DIR) + + # Download and extract all the reports (both GHA and S3) + download_and_extract_s3_reports(args.workflow_run_id, args.workflow_run_attempt) + artifact_urls = get_artifact_urls(args.workflow_run_id) + for name, url in artifact_urls.items(): + download_and_extract_artifact(Path(name), url, args.workflow_run_attempt) + + # Parse the reports and transform them to JSON + test_cases = [] + for xml_report in Path(".").glob("**/*.xml"): + test_cases.extend( + parse_xml_report( + xml_report, int(args.workflow_run_id), int(args.workflow_run_attempt) + ) + ) + + # Write the JSON to rockset + print(f"Writing {len(test_cases)} test cases to Rockset") + client = rockset.Client( + api_server="api.rs2.usw2.rockset.com", api_key=os.environ["ROCKSET_API_KEY"] + ) + client.Collection.retrieve("test_run").add_docs(test_cases) + print("Done!") diff --git a/tools/test/test_actions_local_runner.py b/tools/test/test_actions_local_runner.py deleted file mode 100644 index ba4e6fd2cdb9..000000000000 --- a/tools/test/test_actions_local_runner.py +++ /dev/null @@ -1,191 +0,0 @@ -# -*- coding: utf-8 -*- - -import textwrap -import unittest -import sys -import contextlib -import io -import os -import subprocess -import multiprocessing -from typing import List, Dict, Any - -from tools import actions_local_runner - - -if sys.version_info >= (3, 8): - # actions_local_runner uses asyncio features not available in 3.6, and - # IsolatedAsyncioTestCase was added in 3.8, so skip testing on - # unsupported systems - class TestRunner(unittest.IsolatedAsyncioTestCase): - def run(self, *args: List[Any], **kwargs: List[Dict[str, Any]]) -> Any: - return super().run(*args, **kwargs) - - def test_step_extraction(self) -> None: - fake_job = { - "steps": [ - {"name": "test1", "run": "echo hi"}, - {"name": "test2", "run": "echo hi"}, - {"name": "test3", "run": "echo hi"}, - ] - } - - actual = actions_local_runner.grab_specific_steps(["test2"], fake_job) - expected = [ - {"name": "test2", "run": "echo hi"}, - ] - self.assertEqual(actual, expected) - - async def test_runner(self) -> None: - fake_step = {"name": "say hello", "run": "echo hi"} - f = io.StringIO() - with contextlib.redirect_stdout(f): - await actions_local_runner.YamlStep(fake_step, "test", True).run() 
- - result = f.getvalue() - self.assertIn("say hello", result) - - class TestEndToEnd(unittest.TestCase): - expected = [ - "cmakelint: Run cmakelint", - "quick-checks: Ensure no direct cub include", - "quick-checks: Ensure no unqualified type ignore", - "quick-checks: Ensure no unqualified noqa", - "quick-checks: Ensure canonical include", - "quick-checks: Ensure no non-breaking spaces", - "quick-checks: Ensure no tabs", - "flake8", - "quick-checks: Ensure correct trailing newlines", - "quick-checks: Ensure no trailing spaces", - "shellcheck: Regenerate workflows", - "shellcheck: Assert that regenerating the workflows didn't change them", - "shellcheck: Extract scripts from GitHub Actions workflows", - "shellcheck: Run ShellCheck", - ] - - def test_lint(self): - cmd = ["make", "lint", "-j", str(multiprocessing.cpu_count())] - proc = subprocess.run( - cmd, cwd=actions_local_runner.REPO_ROOT, stdout=subprocess.PIPE - ) - stdout = proc.stdout.decode() - - for line in self.expected: - self.assertIn(line, stdout) - - self.assertIn("mypy", stdout) - - def test_quicklint(self): - cmd = ["make", "quicklint", "-j", str(multiprocessing.cpu_count())] - proc = subprocess.run( - cmd, cwd=actions_local_runner.REPO_ROOT, stdout=subprocess.PIPE - ) - stdout = proc.stdout.decode() - - for line in self.expected: - self.assertIn(line, stdout) - - # TODO: See https://github.com/pytorch/pytorch/issues/57967 - self.assertIn("mypy (skipped typestub generation)", stdout) - - class TestQuicklint(unittest.IsolatedAsyncioTestCase): - test_files = [ - os.path.join("caffe2", "some_cool_file.py"), - os.path.join("torch", "some_cool_file.py"), - os.path.join("aten", "some_cool_file.py"), - os.path.join("torch", "some_stubs.pyi"), - os.path.join("test.sh"), - ] - test_py_files = [ - f for f in test_files if f.endswith(".py") or f.endswith(".pyi") - ] - test_sh_files = [f for f in test_files if f.endswith(".sh")] - maxDiff = None - - def setUp(self, *args, **kwargs): - for name in self.test_files: - bad_code = textwrap.dedent( - """ - some_variable = '2' - some_variable = None - some_variable = 11.2 - """ - ).rstrip("\n") - - with open(name, "w") as f: - f.write(bad_code) - - def tearDown(self, *args, **kwargs): - for name in self.test_files: - os.remove(name) - - def test_file_selection(self): - files = actions_local_runner.find_changed_files() - for name in self.test_files: - self.assertIn(name, files) - - async def test_flake8(self): - f = io.StringIO() - with contextlib.redirect_stdout(f): - await actions_local_runner.Flake8(self.test_py_files, True).run() - - # Should exclude the caffe2/ file - expected = textwrap.dedent( - """ - x flake8 - torch/some_cool_file.py:4:21: W292 no newline at end of file - aten/some_cool_file.py:4:21: W292 no newline at end of file - """ - ).lstrip("\n") - self.assertEqual(expected, f.getvalue()) - - async def test_shellcheck(self): - f = io.StringIO() - with contextlib.redirect_stdout(f): - await actions_local_runner.ShellCheck(self.test_sh_files, True).run() - - self.assertIn("SC2148: Tips depend on target shell", f.getvalue()) - self.assertIn("SC2283: Remove spaces around = to assign", f.getvalue()) - - async def test_mypy(self): - self.maxDiff = None - f = io.StringIO() - with contextlib.redirect_stdout(f): - # Quicklint assumes this has been run already and doesn't work - # without it - _, _, _ = await actions_local_runner.shell_cmd( - [ - f"{sys.executable}", - "tools/actions_local_runner.py", - "--job", - "mypy", - "--file", - ".github/workflows/lint.yml", - "--step", - "Run 
autogen", - ], - redirect=True, - ) - - await actions_local_runner.Mypy(self.test_py_files, True).run() - - # Should exclude the aten/ file; also, apparently mypy - # typechecks files in reverse order - expected = textwrap.dedent( - """ - x mypy (skipped typestub generation) - torch/some_stubs.pyi:3:17: error: Incompatible types in assignment (expression has type "None", variable has type "str") [assignment] - torch/some_stubs.pyi:4:17: error: Incompatible types in assignment (expression has type "float", variable has type "str") [assignment] - torch/some_cool_file.py:3:17: error: Incompatible types in assignment (expression has type "None", variable has type "str") [assignment] - torch/some_cool_file.py:4:17: error: Incompatible types in assignment (expression has type "float", variable has type "str") [assignment] - caffe2/some_cool_file.py:3:17: error: Incompatible types in assignment (expression has type "None", variable has type "str") [assignment] - caffe2/some_cool_file.py:4:17: error: Incompatible types in assignment (expression has type "float", variable has type "str") [assignment] - """ # noqa: B950 - ).lstrip( - "\n" - ) - self.assertEqual(expected, f.getvalue()) - - -if __name__ == "__main__": - unittest.main() diff --git a/tools/test/test_cmake.py b/tools/test/test_cmake.py index ecbce07f52d2..2c4bead6db3b 100644 --- a/tools/test/test_cmake.py +++ b/tools/test/test_cmake.py @@ -9,49 +9,60 @@ import tools.setup_helpers.cmake -T = typing.TypeVar('T') +T = typing.TypeVar("T") class TestCMake(unittest.TestCase): - - @unittest.mock.patch('multiprocessing.cpu_count') + @unittest.mock.patch("multiprocessing.cpu_count") def test_build_jobs(self, mock_cpu_count: unittest.mock.MagicMock) -> None: """Tests that the number of build jobs comes out correctly.""" mock_cpu_count.return_value = 13 cases = [ # MAX_JOBS, USE_NINJA, IS_WINDOWS, want - (( '8', True, False), ['-j', '8']), # noqa: E201,E241 - (( None, True, False), None), # noqa: E201,E241 - (( '7', False, False), ['-j', '7']), # noqa: E201,E241 - (( None, False, False), ['-j', '13']), # noqa: E201,E241 - (( '6', True, True), ['-j', '6']), # noqa: E201,E241 - (( None, True, True), None), # noqa: E201,E241 - (( '11', False, True), ['/p:CL_MPCount=11']), # noqa: E201,E241 - (( None, False, True), ['/p:CL_MPCount=13']), # noqa: E201,E241 + (("8", True, False), ["-j", "8"]), # noqa: E201,E241 + ((None, True, False), None), # noqa: E201,E241 + (("7", False, False), ["-j", "7"]), # noqa: E201,E241 + ((None, False, False), ["-j", "13"]), # noqa: E201,E241 + (("6", True, True), ["-j", "6"]), # noqa: E201,E241 + ((None, True, True), None), # noqa: E201,E241 + (("11", False, True), ["/p:CL_MPCount=11"]), # noqa: E201,E241 + ((None, False, True), ["/p:CL_MPCount=13"]), # noqa: E201,E241 ] for (max_jobs, use_ninja, is_windows), want in cases: - with self.subTest(MAX_JOBS=max_jobs, USE_NINJA=use_ninja, IS_WINDOWS=is_windows): + with self.subTest( + MAX_JOBS=max_jobs, USE_NINJA=use_ninja, IS_WINDOWS=is_windows + ): with contextlib.ExitStack() as stack: - stack.enter_context(env_var('MAX_JOBS', max_jobs)) - stack.enter_context(unittest.mock.patch.object(tools.setup_helpers.cmake, 'USE_NINJA', use_ninja)) - stack.enter_context(unittest.mock.patch.object(tools.setup_helpers.cmake, 'IS_WINDOWS', is_windows)) + stack.enter_context(env_var("MAX_JOBS", max_jobs)) + stack.enter_context( + unittest.mock.patch.object( + tools.setup_helpers.cmake, "USE_NINJA", use_ninja + ) + ) + stack.enter_context( + unittest.mock.patch.object( + 
tools.setup_helpers.cmake, "IS_WINDOWS", is_windows + ) + ) cmake = tools.setup_helpers.cmake.CMake() - with unittest.mock.patch.object(cmake, 'run') as cmake_run: + with unittest.mock.patch.object(cmake, "run") as cmake_run: cmake.build({}) cmake_run.assert_called_once() - call, = cmake_run.mock_calls + (call,) = cmake_run.mock_calls build_args, _ = call.args if want is None: - self.assertNotIn('-j', build_args) + self.assertNotIn("-j", build_args) else: self.assert_contains_sequence(build_args, want) @staticmethod - def assert_contains_sequence(sequence: Sequence[T], subsequence: Sequence[T]) -> None: + def assert_contains_sequence( + sequence: Sequence[T], subsequence: Sequence[T] + ) -> None: """Raises an assertion if the subsequence is not contained in the sequence.""" if len(subsequence) == 0: return # all sequences contain the empty subsequence @@ -63,7 +74,7 @@ def assert_contains_sequence(sequence: Sequence[T], subsequence: Sequence[T]) -> assert len(candidate) == len(subsequence) # sanity check if candidate == subsequence: return # found it - raise AssertionError(f'{subsequence} not found in {sequence}') + raise AssertionError(f"{subsequence} not found in {sequence}") @contextlib.contextmanager diff --git a/tools/test/test_codegen.py b/tools/test/test_codegen.py index 0dded01cc6ed..22b5470f6326 100644 --- a/tools/test/test_codegen.py +++ b/tools/test/test_codegen.py @@ -4,72 +4,77 @@ from tools.autograd import gen_autograd_functions from tools.autograd import load_derivatives -import tools.codegen.model +import torchgen.model -class TestCreateDerivative(unittest.TestCase): +class TestCreateDerivative(unittest.TestCase): def test_named_grads(self) -> None: - schema = tools.codegen.model.FunctionSchema.parse( - 'func(Tensor a, Tensor b) -> (Tensor x, Tensor y)') - native_function = dataclasses.replace(DEFAULT_NATIVE_FUNCTION, - func=schema) + schema = torchgen.model.FunctionSchema.parse( + "func(Tensor a, Tensor b) -> (Tensor x, Tensor y)" + ) + native_function = dataclasses.replace(DEFAULT_NATIVE_FUNCTION, func=schema) derivative = load_derivatives.create_derivative( native_function, - formula='func_backward(grad_x, grad_y)', + formula="func_backward(grad_x, grad_y)", var_names=(), - available_named_gradients=['grad_x', 'grad_y']) - self.assertSetEqual(derivative.named_gradients, {'grad_x', 'grad_y'}) + available_named_gradients=["grad_x", "grad_y"], + ) + self.assertSetEqual(derivative.named_gradients, {"grad_x", "grad_y"}) def test_non_differentiable_output(self) -> None: - specification = 'func(Tensor a, Tensor b) -> (Tensor x, bool y, Tensor z)' - schema = tools.codegen.model.FunctionSchema.parse(specification) - native_function = dataclasses.replace(DEFAULT_NATIVE_FUNCTION, - func=schema) + specification = "func(Tensor a, Tensor b) -> (Tensor x, bool y, Tensor z)" + schema = torchgen.model.FunctionSchema.parse(specification) + native_function = dataclasses.replace(DEFAULT_NATIVE_FUNCTION, func=schema) differentiability_info = load_derivatives.create_differentiability_info( - defn={'name': specification, - 'a': 'grads[0]', - 'b': 'grads[2]', - }, + defn={ + "name": specification, + "a": "grads[0]", + "b": "grads[2]", + }, functions_by_signature={schema.signature(): [native_function]}, functions_by_schema={specification: native_function}, op_counter=typing.Counter[str](), ) - self.assertSequenceEqual(differentiability_info.available_named_gradients, - # grad_y is not present because y is a - # bool and thus not differentiable. 
- ['grad_x', 'grad_z']) + self.assertSequenceEqual( + differentiability_info.available_named_gradients, + # grad_y is not present because y is a + # bool and thus not differentiable. + ["grad_x", "grad_z"], + ) def test_indexed_grads(self) -> None: - schema = tools.codegen.model.FunctionSchema.parse( - 'func(Tensor a, Tensor b) -> (Tensor x, Tensor y)') - native_function = dataclasses.replace(DEFAULT_NATIVE_FUNCTION, - func=schema) + schema = torchgen.model.FunctionSchema.parse( + "func(Tensor a, Tensor b) -> (Tensor x, Tensor y)" + ) + native_function = dataclasses.replace(DEFAULT_NATIVE_FUNCTION, func=schema) derivative = load_derivatives.create_derivative( native_function, - formula='func_backward(grads[0], grads[1])', + formula="func_backward(grads[0], grads[1])", var_names=(), - available_named_gradients=['grad_x', 'grad_y']) + available_named_gradients=["grad_x", "grad_y"], + ) self.assertSetEqual(derivative.named_gradients, set()) def test_named_grads_and_indexed_grads(self) -> None: - specification = 'func(Tensor a, Tensor b) -> (Tensor x, Tensor y)' - schema = tools.codegen.model.FunctionSchema.parse(specification) - native_function = dataclasses.replace(DEFAULT_NATIVE_FUNCTION, - func=schema) + specification = "func(Tensor a, Tensor b) -> (Tensor x, Tensor y)" + schema = torchgen.model.FunctionSchema.parse(specification) + native_function = dataclasses.replace(DEFAULT_NATIVE_FUNCTION, func=schema) - with self.assertRaisesRegex(RuntimeError, - 'illegally mixes use of "grad_RETURN_NAME"'): + with self.assertRaisesRegex( + RuntimeError, 'illegally mixes use of "grad_RETURN_NAME"' + ): load_derivatives.create_differentiability_info( - defn={'name': specification, - # Uh-oh, the derivatives reference gradients by - # name and by index. - 'a': 'grad_x', - 'b': 'grads[1]', - }, + defn={ + "name": specification, + # Uh-oh, the derivatives reference gradients by + # name and by index. + "a": "grad_x", + "b": "grads[1]", + }, functions_by_signature={schema.signature(): [native_function]}, functions_by_schema={specification: native_function}, op_counter=typing.Counter[str](), @@ -78,60 +83,61 @@ def test_named_grads_and_indexed_grads(self) -> None: class TestGenAutogradFunctions(unittest.TestCase): def test_non_differentiable_output_invalid_type(self) -> None: - specification = 'func(Tensor a, Tensor b) -> (Tensor x, bool y, Tensor z)' - schema = tools.codegen.model.FunctionSchema.parse(specification) - native_function = dataclasses.replace(DEFAULT_NATIVE_FUNCTION, - func=schema) + specification = "func(Tensor a, Tensor b) -> (Tensor x, bool y, Tensor z)" + schema = torchgen.model.FunctionSchema.parse(specification) + native_function = dataclasses.replace(DEFAULT_NATIVE_FUNCTION, func=schema) differentiability_info = load_derivatives.create_differentiability_info( - defn={'name': specification, - 'a': 'grad_x', - 'b': 'grad_z', - }, + defn={ + "name": specification, + "a": "grad_x", + "b": "grad_z", + }, functions_by_signature={schema.signature(): [native_function]}, functions_by_schema={specification: native_function}, op_counter=typing.Counter[str](), ) definition = gen_autograd_functions.process_function( - differentiability_info, - gen_autograd_functions.FUNCTION_DEFINITION) + differentiability_info, gen_autograd_functions.FUNCTION_DEFINITION + ) # grad_z should map to grads[1], not grads[2] because output 1 # (y) is not differentiable. 
- assert 'grad_z = grads[2]' not in definition - assert 'grad_z = grads[1]' in definition - + assert "grad_z = grads[2]" not in definition + assert "grad_z = grads[1]" in definition def test_non_differentiable_output_output_differentiability(self) -> None: - specification = 'func(Tensor a, Tensor b) -> (Tensor x, Tensor y, Tensor z)' - schema = tools.codegen.model.FunctionSchema.parse(specification) - native_function = dataclasses.replace(DEFAULT_NATIVE_FUNCTION, - func=schema) + specification = "func(Tensor a, Tensor b) -> (Tensor x, Tensor y, Tensor z)" + schema = torchgen.model.FunctionSchema.parse(specification) + native_function = dataclasses.replace(DEFAULT_NATIVE_FUNCTION, func=schema) differentiability_info = load_derivatives.create_differentiability_info( - defn={'name': specification, - 'a': 'grad_x', - 'b': 'grad_z', - 'output_differentiability': [True, False, True], - }, + defn={ + "name": specification, + "a": "grad_x", + "b": "grad_z", + "output_differentiability": [True, False, True], + }, functions_by_signature={schema.signature(): [native_function]}, functions_by_schema={specification: native_function}, op_counter=typing.Counter[str](), ) definition = gen_autograd_functions.process_function( - differentiability_info, - gen_autograd_functions.FUNCTION_DEFINITION) + differentiability_info, gen_autograd_functions.FUNCTION_DEFINITION + ) # grad_z should map to grads[1], not grads[2] because output 1 # (y) is not differentiable. - assert 'grad_z = grads[2]' not in definition - assert 'grad_z = grads[1]' in definition + assert "grad_z = grads[2]" not in definition + assert "grad_z = grads[1]" in definition # Represents the most basic NativeFunction. Use dataclasses.replace() # to edit for use. -DEFAULT_NATIVE_FUNCTION, _ = tools.codegen.model.NativeFunction.from_yaml( - {'func': 'func() -> bool'}, - loc=tools.codegen.model.Location(__file__, 1)) +DEFAULT_NATIVE_FUNCTION, _ = torchgen.model.NativeFunction.from_yaml( + {"func": "func() -> bool"}, + loc=torchgen.model.Location(__file__, 1), + valid_tags=set(), +) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tools/test/test_codegen_model.py b/tools/test/test_codegen_model.py new file mode 100644 index 000000000000..710e90697116 --- /dev/null +++ b/tools/test/test_codegen_model.py @@ -0,0 +1,145 @@ +# Owner(s): ["module: codegen"] + +import expecttest +import unittest +import yaml +import textwrap + +from torchgen.model import NativeFunctionsGroup, DispatchKey +import torchgen.dest as dest +import torchgen.gen as gen +from torchgen.gen import LineLoader, parse_native_yaml_struct + + +class TestCodegenModel(expecttest.TestCase): + def assertParseErrorInline(self, yaml_str: str, expect: str) -> None: + es = yaml.load(yaml_str, Loader=LineLoader) + try: + parse_native_yaml_struct(es, set()) + except AssertionError as e: + # hack to strip out the context + msg, _ = str(e).split(" in ", 2) + self.assertExpectedInline("\n".join(textwrap.wrap(msg)), expect, skip=1) + return + self.fail(msg="Did not raise when expected to") + + def assertUfuncErrorInline(self, yaml_str: str, expect: str) -> None: + # parse a single structured group out of the yaml to g + es = yaml.load(yaml_str, Loader=LineLoader) + parsed_yaml = parse_native_yaml_struct(es, set()) + native_functions, backend_indices = ( + parsed_yaml.native_functions, + parsed_yaml.backend_indices, + ) + grouped_native_functions = gen.get_grouped_native_functions(native_functions) + assert len(grouped_native_functions) == 1 + g = 
grouped_native_functions[0] + assert isinstance(g, NativeFunctionsGroup) + assert g.out.ufunc_inner_loop + # this is not ufunc codegen per se, but it does some basic sanity tests for + # ufunc generation + gen.compute_meta_function_declaration(g) + dest.compute_native_function_declaration(g, backend_indices[DispatchKey.CPU]) + dest.compute_native_function_declaration(g, backend_indices[DispatchKey.CUDA]) + try: + # the real kahuna + dest.compute_ufunc_cpu(g) + dest.compute_ufunc_cpu_kernel(g) + dest.compute_ufunc_cuda(g) + except AssertionError as e: + # hack to strip out the context + msg, _ = str(e).split(" in ", 2) + self.assertExpectedInline("\n".join(textwrap.wrap(msg)), expect, skip=1) + return + self.fail(msg="Did not raise when expected to") + + # NB: indent is hardcoded to be two here, so format your yaml accordingly + binop_out = ( + "func: binop.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)" + ) + ti_binop_out = f"""{binop_out} + structured: True + structured_inherits: TensorIteratorBase""" + ti_binop = """func: binop(Tensor self, Tensor other) -> Tensor + structured_delegate: binop.out +""" + + ti_unop_out = """func: unop.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase""" + ti_unop = """func: unop(Tensor self) -> Tensor + structured_delegate: unop.out +""" + + def test_nonstructured_ufunc(self) -> None: + yaml_str = f"""\ +- {self.binop_out} + ufunc_inner_loop: + Generic: binop (Bool) +""" + self.assertParseErrorInline( + yaml_str, + """\ +ufunc must be structured""", + ) + + def test_overlapping_ufunc_and_dispatch(self) -> None: + yaml_str = f"""\ +- {self.ti_binop_out} + ufunc_inner_loop: + Generic: binop (Bool) + dispatch: + CPU: binop_cpu +""" + self.assertParseErrorInline( + yaml_str, + """\ +ufunc should not have explicit dispatch entry for CPU""", + ) + + # See https://github.com/pytorch/pytorch/pull/65851#discussion_r810238456 + @unittest.expectedFailure + def test_scalaronly_shadowed(self) -> None: + yaml_str = f"""\ +- {self.ti_binop_out} + ufunc_inner_loop: + Generic: binop (Bool) + ScalarOnly: binop (Bool) +""" + self.assertParseErrorInline( + yaml_str, + """\ +""", + ) + + def test_conflicting_ufunc(self) -> None: + yaml_str = f"""\ +- {self.ti_binop_out} + ufunc_inner_loop: + Generic: binop (Bool) + ScalarOnly: binop_scalar (Bool) +- {self.ti_binop} +""" + self.assertUfuncErrorInline( + yaml_str, + """\ +ScalarOnly and Generic must have same ufunc name""", + ) + + def test_invalid_cudafunctoronself_for_binary_op(self) -> None: + yaml_str = f"""\ +- {self.ti_unop_out} + ufunc_inner_loop: + Generic: unop (All) + CUDAFunctorOnSelf: unop_self_cuda (All) +- {self.ti_unop} +""" + self.assertUfuncErrorInline( + yaml_str, + """\ +cannot use CUDAFunctorOnSelf on non-binary function""", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tools/test/test_extract_scripts.py b/tools/test/test_extract_scripts.py deleted file mode 100644 index 3126893c4bb3..000000000000 --- a/tools/test/test_extract_scripts.py +++ /dev/null @@ -1,85 +0,0 @@ -import unittest - -from tools import extract_scripts - -requirements_sh = ''' -#!/usr/bin/env bash -set -eo pipefail -pip install -r requirements.txt -'''.strip() - -hello_sh = ''' -#!/usr/bin/env sh -set -e -echo hello world -'''.strip() - - -class TestExtractScripts(unittest.TestCase): - def test_extract_none(self) -> None: - self.assertEqual( - extract_scripts.extract({ - 'name': 'Checkout PyTorch', - 'uses': 
'zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9', - }), - None, - ) - - def test_extract_run_default_bash(self) -> None: - self.assertEqual( - extract_scripts.extract({ - 'name': 'Install requirements', - 'run': 'pip install -r requirements.txt', - }), - { - 'extension': '.sh', - 'script': requirements_sh, - }, - ) - - def test_extract_run_sh(self) -> None: - self.assertEqual( - extract_scripts.extract({ - 'name': 'Hello world', - 'run': 'echo hello world', - 'shell': 'sh', - }), - { - 'extension': '.sh', - 'script': hello_sh, - }, - ) - - def test_extract_run_py(self) -> None: - self.assertEqual( - extract_scripts.extract({ - 'name': 'Hello world', - 'run': 'print("Hello!")', - 'shell': 'python', - }), - { - 'extension': '.py', - 'script': 'print("Hello!")', - }, - ) - - def test_extract_github_script(self) -> None: - self.assertEqual( - # https://github.com/actions/github-script/tree/v3.1.1#reading-step-results - extract_scripts.extract({ - 'uses': 'actions/github-script@v3', - 'id': 'set-result', - 'with': { - 'script': 'return "Hello!"', - 'result-encoding': 'string', - }, - }), - { - 'extension': '.js', - 'script': 'return "Hello!"', - }, - ) - - -if __name__ == '__main__': - unittest.main() diff --git a/tools/test/test_gen_backend_stubs.py b/tools/test/test_gen_backend_stubs.py index ee2ee8a0f0b9..168ae8b1d7c7 100644 --- a/tools/test/test_gen_backend_stubs.py +++ b/tools/test/test_gen_backend_stubs.py @@ -5,233 +5,269 @@ import unittest import expecttest -from tools.codegen.gen_backend_stubs import run -from tools.codegen.gen import _GLOBAL_PARSE_NATIVE_YAML_CACHE # noqa: F401 +from torchgen.gen_backend_stubs import run +from torchgen.gen import _GLOBAL_PARSE_NATIVE_YAML_CACHE # noqa: F401 path = os.path.dirname(os.path.realpath(__file__)) -gen_backend_stubs_path = os.path.join(path, '../tools/codegen/gen_backend_stubs.py') +gen_backend_stubs_path = os.path.join(path, "../torchgen/gen_backend_stubs.py") # gen_backend_stubs.py is an integration point that is called directly by external backends. # The tests here are to confirm that badly formed inputs result in reasonable error messages. class TestGenBackendStubs(expecttest.TestCase): - def setUp(self) -> None: global _GLOBAL_PARSE_NATIVE_YAML_CACHE _GLOBAL_PARSE_NATIVE_YAML_CACHE.clear() - def assert_success_from_gen_backend_stubs(self, yaml_str: str) -> None: - with tempfile.NamedTemporaryFile(mode='w') as fp: + with tempfile.NamedTemporaryFile(mode="w") as fp: fp.write(yaml_str) fp.flush() - run(fp.name, '', True) + run(fp.name, "", True) def get_errors_from_gen_backend_stubs(self, yaml_str: str) -> str: - with tempfile.NamedTemporaryFile(mode='w') as fp: + with tempfile.NamedTemporaryFile(mode="w") as fp: fp.write(yaml_str) fp.flush() try: - run(fp.name, '', True) + run(fp.name, "", True) except AssertionError as e: # Scrub out the temp file name from any error messages to simplify assertions. - return str(e).replace(fp.name, '') - self.fail('Expected gen_backend_stubs to raise an AssertionError, but it did not.') + return str(e).replace(fp.name, "") + self.fail( + "Expected gen_backend_stubs to raise an AssertionError, but it did not." 
+ ) def test_valid_single_op(self) -> None: - yaml_str = '''\ + yaml_str = """\ backend: XLA cpp_namespace: torch_xla supported: -- abs''' +- abs""" self.assert_success_from_gen_backend_stubs(yaml_str) def test_valid_multiple_ops(self) -> None: - yaml_str = '''\ + yaml_str = """\ backend: XLA cpp_namespace: torch_xla supported: - add.Tensor -- abs''' +- abs""" self.assert_success_from_gen_backend_stubs(yaml_str) def test_valid_zero_ops(self) -> None: - yaml_str = '''\ + yaml_str = """\ backend: XLA cpp_namespace: torch_xla -supported:''' +supported:""" self.assert_success_from_gen_backend_stubs(yaml_str) def test_valid_zero_ops_doesnt_require_backend_dispatch_key(self) -> None: - yaml_str = '''\ + yaml_str = """\ backend: BAD_XLA cpp_namespace: torch_xla -supported:''' +supported:""" # External codegen on a yaml file with no operators is effectively a no-op, # so there's no reason to parse the backend self.assert_success_from_gen_backend_stubs(yaml_str) def test_valid_with_autograd_ops(self) -> None: - yaml_str = '''\ + yaml_str = """\ backend: XLA cpp_namespace: torch_xla supported: - abs autograd: -- add.Tensor''' +- add.Tensor""" # External codegen on a yaml file with no operators is effectively a no-op, # so there's no reason to parse the backend self.assert_success_from_gen_backend_stubs(yaml_str) def test_missing_backend(self) -> None: - yaml_str = '''\ + yaml_str = """\ cpp_namespace: torch_xla supported: -- abs''' +- abs""" output_error = self.get_errors_from_gen_backend_stubs(yaml_str) - self.assertExpectedInline(output_error, '''You must provide a value for "backend"''') + self.assertExpectedInline( + output_error, '''You must provide a value for "backend"''' + ) def test_empty_backend(self) -> None: - yaml_str = '''\ + yaml_str = """\ backend: cpp_namespace: torch_xla supported: -- abs''' +- abs""" output_error = self.get_errors_from_gen_backend_stubs(yaml_str) - self.assertExpectedInline(output_error, '''You must provide a value for "backend"''') + self.assertExpectedInline( + output_error, '''You must provide a value for "backend"''' + ) def test_backend_invalid_dispatch_key(self) -> None: - yaml_str = '''\ + yaml_str = """\ backend: NOT_XLA cpp_namespace: torch_xla supported: -- abs''' +- abs""" output_error = self.get_errors_from_gen_backend_stubs(yaml_str) - self.assertExpectedInline(output_error, '''\ + self.assertExpectedInline( + output_error, + """\ unknown dispatch key NOT_XLA - The provided value for "backend" must be a valid DispatchKey, but got NOT_XLA.''') # noqa: B950 + The provided value for "backend" must be a valid DispatchKey, but got NOT_XLA.""", + ) # noqa: B950 def test_missing_cpp_namespace(self) -> None: - yaml_str = '''\ + yaml_str = """\ backend: XLA supported: -- abs''' +- abs""" output_error = self.get_errors_from_gen_backend_stubs(yaml_str) - self.assertExpectedInline(output_error, '''You must provide a value for "cpp_namespace"''') + self.assertExpectedInline( + output_error, '''You must provide a value for "cpp_namespace"''' + ) def test_whitespace_cpp_namespace(self) -> None: - yaml_str = '''\ + yaml_str = """\ backend: XLA cpp_namespace:\t supported: -- abs''' +- abs""" output_error = self.get_errors_from_gen_backend_stubs(yaml_str) - self.assertExpectedInline(output_error, '''You must provide a value for "cpp_namespace"''') + self.assertExpectedInline( + output_error, '''You must provide a value for "cpp_namespace"''' + ) # supported is a single item (it should be a list) def test_nonlist_supported(self) -> None: - yaml_str = '''\ + yaml_str = 
"""\ backend: XLA cpp_namespace: torch_xla -supported: abs''' +supported: abs""" output_error = self.get_errors_from_gen_backend_stubs(yaml_str) - self.assertExpectedInline(output_error, '''expected "supported" to be a list, but got: abs (of type )''') + self.assertExpectedInline( + output_error, + """expected "supported" to be a list, but got: abs (of type )""", + ) # supported contains an op that isn't in native_functions.yaml def test_supported_invalid_op(self) -> None: - yaml_str = '''\ + yaml_str = """\ backend: XLA cpp_namespace: torch_xla supported: -- abs_BAD''' +- abs_BAD""" output_error = self.get_errors_from_gen_backend_stubs(yaml_str) - self.assertExpectedInline(output_error, '''Found an invalid operator name: abs_BAD''') + self.assertExpectedInline( + output_error, """Found an invalid operator name: abs_BAD""" + ) # The backend is valid, but doesn't have a valid autograd key. They can't override autograd kernels in that case. # Only using Vulkan here because it has a valid backend key but not an autograd key- if this changes we can update the test. def test_backend_has_no_autograd_key_but_provides_entries(self) -> None: - yaml_str = '''\ + yaml_str = """\ backend: Vulkan cpp_namespace: torch_vulkan supported: - add autograd: -- sub''' +- sub""" output_error = self.get_errors_from_gen_backend_stubs(yaml_str) - self.assertExpectedInline(output_error, '''Found an invalid operator name: add''') # noqa: B950 + self.assertExpectedInline( + output_error, """Found an invalid operator name: add""" + ) # noqa: B950 # in an operator group, currently all operators must either be registered to the backend or autograd kernel. # Here, functional and out mismatch def test_backend_autograd_kernel_mismatch_out_functional(self) -> None: - yaml_str = '''\ + yaml_str = """\ backend: XLA cpp_namespace: torch_xla supported: - add.Tensor autograd: -- add.out''' +- add.out""" output_error = self.get_errors_from_gen_backend_stubs(yaml_str) - self.assertExpectedInline(output_error, '''Currently, all variants of an op must either be registered to a backend key, or to a backend's autograd key. They cannot be mix and matched. If this is something you need, feel free to create an issue! add is listed under "supported", but add_out is listed under "autograd".''') # noqa: B950 + self.assertExpectedInline( + output_error, + """Currently, all variants of an op must either be registered to a backend key, or to a backend's autograd key. They cannot be mix and matched. If this is something you need, feel free to create an issue! add is listed under "supported", but add_out is listed under "autograd".""", # noqa: B950 + ) # in an operator group, currently all operators must either be registered to the backend or autograd kernel. # Here, functional and inplace mismatch def test_backend_autograd_kernel_mismatch_functional_inplace(self) -> None: - yaml_str = '''\ + yaml_str = """\ backend: XLA cpp_namespace: torch_xla supported: - add.Tensor autograd: -- add_.Tensor''' +- add_.Tensor""" output_error = self.get_errors_from_gen_backend_stubs(yaml_str) - self.assertExpectedInline(output_error, '''Currently, all variants of an op must either be registered to a backend key, or to a backend's autograd key. They cannot be mix and matched. If this is something you need, feel free to create an issue! 
add is listed under "supported", but add_ is listed under "autograd".''') # noqa: B950 + self.assertExpectedInline( + output_error, + """Currently, all variants of an op must either be registered to a backend key, or to a backend's autograd key. They cannot be mix and matched. If this is something you need, feel free to create an issue! add is listed under "supported", but add_ is listed under "autograd".""", # noqa: B950 + ) # Currently, the same operator can't be listed under both 'supported' and 'autograd', which would # involve registering the same kernel to both the XLA and AutogradXLA keys. # If we need that functionality in the future, we'll need to augment the codegen. def test_op_appears_in_supported_and_autograd_lists(self) -> None: - yaml_str = '''\ + yaml_str = """\ backend: XLA cpp_namespace: torch_xla supported: - add.Tensor autograd: -- add.Tensor''' +- add.Tensor""" output_error = self.get_errors_from_gen_backend_stubs(yaml_str) - self.assertExpectedInline(output_error, '''Currently, all variants of an op must either be registered to a backend key, or to a backend's autograd key. They cannot be mix and matched. If this is something you need, feel free to create an issue! add is listed under "supported", but add is listed under "autograd".''') # noqa: B950 + self.assertExpectedInline( + output_error, + """Currently, all variants of an op must either be registered to a backend key, or to a backend's autograd key. They cannot be mix and matched. If this is something you need, feel free to create an issue! add is listed under "supported", but add is listed under "autograd".""", # noqa: B950 + ) # unrecognized extra yaml key def test_unrecognized_key(self) -> None: - yaml_str = '''\ + yaml_str = """\ backend: XLA cpp_namespace: torch_xla supported: - abs -invalid_key: invalid_val''' +invalid_key: invalid_val""" output_error = self.get_errors_from_gen_backend_stubs(yaml_str) - self.assertExpectedInline(output_error, ''' contains unexpected keys: invalid_key. Only the following keys are supported: backend, cpp_namespace, extra_headers, supported, autograd, full_codegen''') # noqa: B950 + self.assertExpectedInline( + output_error, + """ contains unexpected keys: invalid_key. Only the following keys are supported: backend, class_name, cpp_namespace, extra_headers, supported, autograd, full_codegen""", # noqa: B950 + ) # if use_out_as_primary is provided, it must be a bool def test_use_out_as_primary_non_bool(self) -> None: - yaml_str = '''\ + yaml_str = """\ backend: XLA cpp_namespace: torch_xla use_out_as_primary: frue supported: -- abs''' +- abs""" output_error = self.get_errors_from_gen_backend_stubs(yaml_str) - self.assertExpectedInline(output_error, '''You must provide either True or False for use_out_as_primary. Provided: frue''') # noqa: B950 + self.assertExpectedInline( + output_error, + """You must provide either True or False for use_out_as_primary. Provided: frue""", + ) # noqa: B950 # if device_guard is provided, it must be a bool def test_device_guard_non_bool(self) -> None: - yaml_str = '''\ + yaml_str = """\ backend: XLA cpp_namespace: torch_xla device_guard: frue supported: -- abs''' +- abs""" output_error = self.get_errors_from_gen_backend_stubs(yaml_str) - self.assertExpectedInline(output_error, '''You must provide either True or False for device_guard. Provided: frue''') # noqa: B950 + self.assertExpectedInline( + output_error, + """You must provide either True or False for device_guard. 
Provided: frue""", + ) # noqa: B950 -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tools/test/test_import_test_stats.py b/tools/test/test_import_test_stats.py new file mode 100644 index 000000000000..ea9aad8df40d --- /dev/null +++ b/tools/test/test_import_test_stats.py @@ -0,0 +1,67 @@ +import os +import unittest +from tools.stats.import_test_stats import get_disabled_issues +from typing import List +from unittest.mock import patch + + +class TestGetDisabledIssues(unittest.TestCase): + def run_assert_disabled_issues( + self, pr_body: str, commit_messages: str, expected: List[str] + ) -> None: + with patch.dict( + os.environ, {"PR_BODY": pr_body, "COMMIT_MESSAGES": commit_messages} + ): + disabled_issues = get_disabled_issues() + self.assertEqual(disabled_issues, expected) + + # test variations of close in PR_BODY + def test_closes_pr_body(self) -> None: + pr_body = "closes #123 Close #143 ClOsE #345 closed #10283" + self.run_assert_disabled_issues(pr_body, "", ["123", "143", "345", "10283"]) + + # test variations of fix in COMMIT_MESSAGES + def test_fixes_commit_messages(self) -> None: + commit_messages = "fix #123 FixEd #143 fixes #345 FiXeD #10283" + self.run_assert_disabled_issues( + "", commit_messages, ["123", "143", "345", "10283"] + ) + + # test variations of resolve in PR_BODY and COMMIT_MESSAGES + def test_resolves_pr_commits(self) -> None: + pr_body = "resolve #123 resolveS #143" + commit_messages = "REsolved #345 RESOLVES #10283" + self.run_assert_disabled_issues( + pr_body, commit_messages, ["123", "143", "345", "10283"] + ) + + # test links + def test_issue_links(self) -> None: + pr_body = "closes https://github.com/pytorch/pytorch/issues/75198 fixes https://github.com/pytorch/pytorch/issues/75123" + self.run_assert_disabled_issues(pr_body, "", ["75198", "75123"]) + + # test strange spacing + def test_spacing(self) -> None: + pr_body = "resolve #123,resolveS #143Resolved #345\nRESOLVES #10283" + commit_messages = "Fixed #2348fixes https://github.com/pytorch/pytorch/issues/75123resolveS #2134" + self.run_assert_disabled_issues( + pr_body, + commit_messages, + ["123", "143", "345", "10283", "2348", "75123", "2134"], + ) + + # test bad things + def test_not_accepted(self) -> None: + pr_body = ( + "fixes189 fixeshttps://github.com/pytorch/pytorch/issues/75123 " + "closedhttps://githubcom/pytorch/pytorch/issues/75123" + ) + commit_messages = ( + "fix 234, fixes # 45, fixing #123, close 234, closes#45, closing #123 resolve 234, " + "resolves #45, resolving #123" + ) + self.run_assert_disabled_issues(pr_body, commit_messages, []) + + +if __name__ == "__main__": + unittest.main() diff --git a/tools/test/test_max_tokens_pragma.py b/tools/test/test_max_tokens_pragma.py deleted file mode 100644 index 746b51e39d03..000000000000 --- a/tools/test/test_max_tokens_pragma.py +++ /dev/null @@ -1,132 +0,0 @@ -import unittest -from tools.linter.clang_tidy.max_tokens_pragma import ( - add_max_tokens_pragma, - strip_max_tokens_pragmas, -) - - -def compare_code(a: str, b: str) -> bool: - a_lines = [line.strip() for line in a.splitlines()] - b_lines = [line.strip() for line in b.splitlines()] - return a_lines == b_lines - - -class TestMaxTokensPragma(unittest.TestCase): - def test_no_prior_pragmas(self) -> None: - input = """\ - // File without any prior pragmas - - int main() { - for (int i = 0; i < 10; i++); - return 0; - } - """ - - expected = """\ - #pragma clang max_tokens_total 42 - // File without any prior pragmas - - int main() { - for (int i = 0; i < 
10; i++); - return 0; - } - """ - output = add_max_tokens_pragma(input, 42) - self.assertTrue(compare_code(output, expected)) - - output = strip_max_tokens_pragmas(output) - self.assertTrue(compare_code(output, input)) - - def test_single_prior_pragma(self) -> None: - input = """\ - // File with prior pragmas - - #pragma clang max_tokens_total 1 - - int main() { - for (int i = 0; i < 10; i++); - return 0; - } - """ - - expected = """\ - // File with prior pragmas - - #pragma clang max_tokens_total 42 - - int main() { - for (int i = 0; i < 10; i++); - return 0; - } - """ - stripped = """\ - // File with prior pragmas - - - int main() { - for (int i = 0; i < 10; i++); - return 0; - } - """ - - output = add_max_tokens_pragma(input, 42) - self.assertTrue(compare_code(output, expected)) - - output = strip_max_tokens_pragmas(output) - self.assertTrue(compare_code(output, stripped)) - - def test_multiple_prior_pragmas(self) -> None: - input = """\ - // File with multiple prior pragmas - - #pragma clang max_tokens_total 1 - - // Different pragma; script should ignore this - #pragma clang max_tokens_here 20 - - int main() { - for (int i = 0; i < 10; i++); - return 0; - } - - #pragma clang max_tokens_total 1 - """ - - expected = """\ - // File with multiple prior pragmas - - #pragma clang max_tokens_total 42 - - // Different pragma; script should ignore this - #pragma clang max_tokens_here 20 - - int main() { - for (int i = 0; i < 10; i++); - return 0; - } - - #pragma clang max_tokens_total 42 - """ - stripped = """\ - // File with multiple prior pragmas - - - // Different pragma; script should ignore this - #pragma clang max_tokens_here 20 - - int main() { - for (int i = 0; i < 10; i++); - return 0; - } - - """ - - output = add_max_tokens_pragma(input, 42) - self.assertTrue(compare_code(output, expected)) - - output = strip_max_tokens_pragmas(output) - self.assertTrue(compare_code(output, stripped)) - - -if __name__ == "__main__": - unittest.main() diff --git a/tools/test/test_mypy_wrapper.py b/tools/test/test_mypy_wrapper.py deleted file mode 100644 index df7b0ab9e27d..000000000000 --- a/tools/test/test_mypy_wrapper.py +++ /dev/null @@ -1,158 +0,0 @@ -import unittest - -from tools.linter import mypy_wrapper - - -class TestMypyWrapper(unittest.TestCase): - configs = { - 'foo.ini': { - 'file1.abc', - 'dir2', - 'dir3/file4.xyz', - }, - 'bar/baz.ini': { - 'file1.abc', - 'dir2/dir5/file6.def', - 'dir3/file7.abc', - }, - } - - trie: mypy_wrapper.Trie = { - 'file1.abc': {None: {'foo.ini', 'bar/baz.ini'}}, - 'dir2': { - None: {'foo.ini'}, - 'dir5': {'file6.def': {None: {'bar/baz.ini'}}}, - }, - 'dir3': { - 'file4.xyz': {None: {'foo.ini'}}, - 'file7.abc': {None: {'bar/baz.ini'}}, - }, - } - - def test_config_files(self) -> None: - self.assertEqual(mypy_wrapper.config_files().keys(), { - 'mypy.ini', - 'mypy-strict.ini', - }) - - def test_split_path(self) -> None: - self.assertEqual(mypy_wrapper.split_path('file1.abc'), ['file1.abc']) - self.assertEqual( - mypy_wrapper.split_path('dir3/file4.xyz'), - ['dir3', 'file4.xyz'], - ) - self.assertEqual( - mypy_wrapper.split_path('dir2/dir5/file6.def'), - ['dir2', 'dir5', 'file6.def'], - ) - - def test_make_trie(self) -> None: - self.assertEqual(mypy_wrapper.make_trie(self.configs), self.trie) - - def test_lookup(self) -> None: - self.assertEqual( - mypy_wrapper.lookup(self.trie, 'file1.abc'), - {'foo.ini', 'bar/baz.ini'}, - ) - self.assertEqual( - mypy_wrapper.lookup(self.trie, 'dir2/dir5/file6.def'), - {'foo.ini', 'bar/baz.ini'}, - ) - self.assertEqual( - 
mypy_wrapper.lookup(self.trie, 'dir3/file4.xyz'), - {'foo.ini'}, - ) - self.assertEqual( - mypy_wrapper.lookup(self.trie, 'dir3/file7.abc'), - {'bar/baz.ini'}, - ) - self.assertEqual( - mypy_wrapper.lookup(self.trie, 'file8.xyz'), - set(), - ) - self.assertEqual( - mypy_wrapper.lookup(self.trie, 'dir2/dir9/file10.abc'), - {'foo.ini'}, - ) - self.assertEqual( - mypy_wrapper.lookup(self.trie, 'dir3/file11.abc'), - set(), - ) - - # non-leaves shouldn't ever be passed to lookup in practice, but - # still, good to consider/test these cases - self.assertEqual( - mypy_wrapper.lookup(self.trie, 'dir2'), - {'foo.ini'}, - ) - self.assertEqual( - mypy_wrapper.lookup(self.trie, 'dir2/dir5'), - {'foo.ini'}, - ) - self.assertEqual( - mypy_wrapper.lookup(self.trie, 'dir3'), - set(), - ) - self.assertEqual( - mypy_wrapper.lookup(self.trie, 'dir2/dir9'), - {'foo.ini'}, - ) - self.assertEqual( - mypy_wrapper.lookup(self.trie, 'dir4'), - set(), - ) - - def test_make_plan(self) -> None: - self.assertEqual( - mypy_wrapper.make_plan(configs=self.configs, files=[ - 'file8.xyz', - 'dir3/file11.abc', - ]), - {} - ) - self.assertEqual( - mypy_wrapper.make_plan(configs=self.configs, files=[ - 'file8.xyz', - 'dir2/dir9/file10.abc', - 'dir3/file4.xyz', - 'dir3/file11.abc', - ]), - { - 'foo.ini': ['dir2/dir9/file10.abc', 'dir3/file4.xyz'], - } - ) - self.assertEqual( - mypy_wrapper.make_plan(configs=self.configs, files=[ - 'file8.xyz', - 'dir3/file11.abc', - 'dir3/file7.abc', - ]), - { - 'bar/baz.ini': ['dir3/file7.abc'], - } - ) - self.assertEqual( - mypy_wrapper.make_plan(configs=self.configs, files=[ - 'dir2/dir9/file10.abc', - 'dir2/dir5/file6.def', - 'dir3/file7.abc', - 'file1.abc', - 'dir3/file11.abc', - ]), - { - 'foo.ini': [ - 'dir2/dir9/file10.abc', - 'dir2/dir5/file6.def', - 'file1.abc', - ], - 'bar/baz.ini': [ - 'dir2/dir5/file6.def', - 'dir3/file7.abc', - 'file1.abc', - ], - } - ) - - -if __name__ == '__main__': - unittest.main() diff --git a/tools/test/test_stats.py b/tools/test/test_stats.py index 46ad28748608..2718308f66da 100644 --- a/tools/test/test_stats.py +++ b/tools/test/test_stats.py @@ -3,10 +3,16 @@ from typing import Dict, List from tools.stats import print_test_stats -from tools.stats.s3_stat_parser import (Commit, Report, ReportMetaMeta, - Status, Version1Case, - Version1Report, Version2Case, - Version2Report) +from tools.stats.s3_stat_parser import ( + Commit, + Report, + ReportMetaMeta, + Status, + Version1Case, + Version1Report, + Version2Case, + Version2Report, +) def fakehash(char: str) -> str: @@ -15,14 +21,14 @@ def fakehash(char: str) -> str: def dummy_meta_meta() -> ReportMetaMeta: return { - 'build_pr': '', - 'build_tag': '', - 'build_sha1': '', - 'build_base_commit': '', - 'build_branch': '', - 'build_job': '', - 'build_workflow_id': '', - 'build_start_time_epoch': '', + "build_pr": "", + "build_tag": "", + "build_sha1": "", + "build_base_commit": "", + "build_branch": "", + "build_job": "", + "build_workflow_id": "", + "build_start_time_epoch": "", } @@ -35,202 +41,210 @@ def makecase( skipped: bool = False, ) -> Version1Case: return { - 'name': name, - 'seconds': seconds, - 'errored': errored, - 'failed': failed, - 'skipped': skipped, + "name": name, + "seconds": seconds, + "errored": errored, + "failed": failed, + "skipped": skipped, } def make_report_v1(tests: Dict[str, List[Version1Case]]) -> Version1Report: suites = { suite_name: { - 'total_seconds': sum(case['seconds'] for case in cases), - 'cases': cases, + "total_seconds": sum(case["seconds"] for case in cases), + "cases": 
cases, } for suite_name, cases in tests.items() } return { **dummy_meta_meta(), # type: ignore[misc] - 'total_seconds': sum(s['total_seconds'] for s in suites.values()), - 'suites': suites, + "total_seconds": sum(s["total_seconds"] for s in suites.values()), + "suites": suites, } def make_case_v2(seconds: float, status: Status = None) -> Version2Case: return { - 'seconds': seconds, - 'status': status, + "seconds": seconds, + "status": status, } -def make_report_v2(tests: Dict[str, Dict[str, Dict[str, Version2Case]]]) -> Version2Report: +def make_report_v2( + tests: Dict[str, Dict[str, Dict[str, Version2Case]]] +) -> Version2Report: files = {} for file_name, file_suites in tests.items(): suites = { suite_name: { - 'total_seconds': sum(case['seconds'] for case in cases.values()), - 'cases': cases, + "total_seconds": sum(case["seconds"] for case in cases.values()), + "cases": cases, } for suite_name, cases in file_suites.items() } files[file_name] = { - 'suites': suites, - 'total_seconds': sum(suite['total_seconds'] for suite in suites.values()), + "suites": suites, + "total_seconds": sum(suite["total_seconds"] for suite in suites.values()), # type: ignore[type-var] } return { **dummy_meta_meta(), # type: ignore[misc] - 'format_version': 2, - 'total_seconds': sum(s['total_seconds'] for s in files.values()), - 'files': files, + "format_version": 2, + "total_seconds": sum(s["total_seconds"] for s in files.values()), + "files": files, } + + maxDiff = None + class TestPrintTestStats(unittest.TestCase): - version1_report: Version1Report = make_report_v1({ - # input ordering of the suites is ignored - 'Grault': [ - # not printed: status same and time similar - makecase('test_grault0', 4.78, failed=True), - # status same, but time increased a lot - makecase('test_grault2', 1.473, errored=True), - ], - # individual tests times changed, not overall suite - 'Qux': [ - # input ordering of the test cases is ignored - makecase('test_qux1', 0.001, skipped=True), - makecase('test_qux6', 0.002, skipped=True), - # time in bounds, but status changed - makecase('test_qux4', 7.158, failed=True), - # not printed because it's the same as before - makecase('test_qux7', 0.003, skipped=True), - makecase('test_qux5', 11.968), - makecase('test_qux3', 23.496), - ], - # new test suite - 'Bar': [ - makecase('test_bar2', 3.742, failed=True), - makecase('test_bar1', 50.447), - ], - # overall suite time changed but no individual tests - 'Norf': [ - makecase('test_norf1', 3), - makecase('test_norf2', 3), - makecase('test_norf3', 3), - makecase('test_norf4', 3), - ], - # suite doesn't show up if it doesn't change enough - 'Foo': [ - makecase('test_foo1', 42), - makecase('test_foo2', 56), - ], - }) + version1_report: Version1Report = make_report_v1( + { + # input ordering of the suites is ignored + "Grault": [ + # not printed: status same and time similar + makecase("test_grault0", 4.78, failed=True), + # status same, but time increased a lot + makecase("test_grault2", 1.473, errored=True), + ], + # individual tests times changed, not overall suite + "Qux": [ + # input ordering of the test cases is ignored + makecase("test_qux1", 0.001, skipped=True), + makecase("test_qux6", 0.002, skipped=True), + # time in bounds, but status changed + makecase("test_qux4", 7.158, failed=True), + # not printed because it's the same as before + makecase("test_qux7", 0.003, skipped=True), + makecase("test_qux5", 11.968), + makecase("test_qux3", 23.496), + ], + # new test suite + "Bar": [ + makecase("test_bar2", 3.742, failed=True), + 
makecase("test_bar1", 50.447), + ], + # overall suite time changed but no individual tests + "Norf": [ + makecase("test_norf1", 3), + makecase("test_norf2", 3), + makecase("test_norf3", 3), + makecase("test_norf4", 3), + ], + # suite doesn't show up if it doesn't change enough + "Foo": [ + makecase("test_foo1", 42), + makecase("test_foo2", 56), + ], + } + ) version2_report: Version2Report = make_report_v2( { - 'test_a': { - 'Grault': { - 'test_grault0': make_case_v2(4.78, 'failed'), - 'test_grault2': make_case_v2(1.473, 'errored'), + "test_a": { + "Grault": { + "test_grault0": make_case_v2(4.78, "failed"), + "test_grault2": make_case_v2(1.473, "errored"), + }, + "Qux": { + "test_qux1": make_case_v2(0.001, "skipped"), + "test_qux6": make_case_v2(0.002, "skipped"), + "test_qux4": make_case_v2(7.158, "failed"), + "test_qux7": make_case_v2(0.003, "skipped"), + "test_qux8": make_case_v2(11.968), + "test_qux3": make_case_v2(23.496), }, - 'Qux': { - 'test_qux1': make_case_v2(0.001, 'skipped'), - 'test_qux6': make_case_v2(0.002, 'skipped'), - 'test_qux4': make_case_v2(7.158, 'failed'), - 'test_qux7': make_case_v2(0.003, 'skipped'), - 'test_qux8': make_case_v2(11.968), - 'test_qux3': make_case_v2(23.496), - } }, - 'test_b': { - 'Bar': { - 'test_bar2': make_case_v2(3.742, 'failed'), - 'test_bar1': make_case_v2(50.447), + "test_b": { + "Bar": { + "test_bar2": make_case_v2(3.742, "failed"), + "test_bar1": make_case_v2(50.447), }, # overall suite time changed but no individual tests - 'Norf': { - 'test_norf1': make_case_v2(3), - 'test_norf2': make_case_v2(3), - 'test_norf3': make_case_v2(3), - 'test_norf4': make_case_v2(3), + "Norf": { + "test_norf1": make_case_v2(3), + "test_norf2": make_case_v2(3), + "test_norf3": make_case_v2(3), + "test_norf4": make_case_v2(3), }, }, - 'test_c': { - 'Foo': { - 'test_foo1': make_case_v2(42), - 'test_foo2': make_case_v2(56), + "test_c": { + "Foo": { + "test_foo1": make_case_v2(42), + "test_foo2": make_case_v2(56), }, - } - }) + }, + } + ) def test_simplify(self) -> None: self.assertEqual( { - '': { - 'Bar': { - 'test_bar1': {'seconds': 50.447, 'status': None}, - 'test_bar2': {'seconds': 3.742, 'status': 'failed'}, + "": { + "Bar": { + "test_bar1": {"seconds": 50.447, "status": None}, + "test_bar2": {"seconds": 3.742, "status": "failed"}, }, - 'Foo': { - 'test_foo1': {'seconds': 42, 'status': None}, - 'test_foo2': {'seconds': 56, 'status': None}, + "Foo": { + "test_foo1": {"seconds": 42, "status": None}, + "test_foo2": {"seconds": 56, "status": None}, }, - 'Grault': { - 'test_grault0': {'seconds': 4.78, 'status': 'failed'}, - 'test_grault2': {'seconds': 1.473, 'status': 'errored'}, + "Grault": { + "test_grault0": {"seconds": 4.78, "status": "failed"}, + "test_grault2": {"seconds": 1.473, "status": "errored"}, }, - 'Norf': { - 'test_norf1': {'seconds': 3, 'status': None}, - 'test_norf3': {'seconds': 3, 'status': None}, - 'test_norf2': {'seconds': 3, 'status': None}, - 'test_norf4': {'seconds': 3, 'status': None}, + "Norf": { + "test_norf1": {"seconds": 3, "status": None}, + "test_norf3": {"seconds": 3, "status": None}, + "test_norf2": {"seconds": 3, "status": None}, + "test_norf4": {"seconds": 3, "status": None}, }, - 'Qux': { - 'test_qux1': {'seconds': 0.001, 'status': 'skipped'}, - 'test_qux3': {'seconds': 23.496, 'status': None}, - 'test_qux4': {'seconds': 7.158, 'status': 'failed'}, - 'test_qux5': {'seconds': 11.968, 'status': None}, - 'test_qux6': {'seconds': 0.002, 'status': 'skipped'}, - 'test_qux7': {'seconds': 0.003, 'status': 'skipped'}, + "Qux": { + 
"test_qux1": {"seconds": 0.001, "status": "skipped"}, + "test_qux3": {"seconds": 23.496, "status": None}, + "test_qux4": {"seconds": 7.158, "status": "failed"}, + "test_qux5": {"seconds": 11.968, "status": None}, + "test_qux6": {"seconds": 0.002, "status": "skipped"}, + "test_qux7": {"seconds": 0.003, "status": "skipped"}, }, }, }, - print_test_stats.simplify(self.version1_report) + print_test_stats.simplify(self.version1_report), ) self.assertEqual( { - 'test_a': { - 'Grault': { - 'test_grault0': {'seconds': 4.78, 'status': 'failed'}, - 'test_grault2': {'seconds': 1.473, 'status': 'errored'}, + "test_a": { + "Grault": { + "test_grault0": {"seconds": 4.78, "status": "failed"}, + "test_grault2": {"seconds": 1.473, "status": "errored"}, }, - 'Qux': { - 'test_qux1': {'seconds': 0.001, 'status': 'skipped'}, - 'test_qux3': {'seconds': 23.496, 'status': None}, - 'test_qux4': {'seconds': 7.158, 'status': 'failed'}, - 'test_qux6': {'seconds': 0.002, 'status': 'skipped'}, - 'test_qux7': {'seconds': 0.003, 'status': 'skipped'}, - 'test_qux8': {'seconds': 11.968, 'status': None}, + "Qux": { + "test_qux1": {"seconds": 0.001, "status": "skipped"}, + "test_qux3": {"seconds": 23.496, "status": None}, + "test_qux4": {"seconds": 7.158, "status": "failed"}, + "test_qux6": {"seconds": 0.002, "status": "skipped"}, + "test_qux7": {"seconds": 0.003, "status": "skipped"}, + "test_qux8": {"seconds": 11.968, "status": None}, }, }, - 'test_b': { - 'Bar': { - 'test_bar1': {'seconds': 50.447, 'status': None}, - 'test_bar2': {'seconds': 3.742, 'status': 'failed'}, + "test_b": { + "Bar": { + "test_bar1": {"seconds": 50.447, "status": None}, + "test_bar2": {"seconds": 3.742, "status": "failed"}, }, - 'Norf': { - 'test_norf1': {'seconds': 3, 'status': None}, - 'test_norf2': {'seconds': 3, 'status': None}, - 'test_norf3': {'seconds': 3, 'status': None}, - 'test_norf4': {'seconds': 3, 'status': None}, + "Norf": { + "test_norf1": {"seconds": 3, "status": None}, + "test_norf2": {"seconds": 3, "status": None}, + "test_norf3": {"seconds": 3, "status": None}, + "test_norf4": {"seconds": 3, "status": None}, }, }, - 'test_c': { - 'Foo': { - 'test_foo1': {'seconds': 42, 'status': None}, - 'test_foo2': {'seconds': 56, 'status': None}, + "test_c": { + "Foo": { + "test_foo1": {"seconds": 42, "status": None}, + "test_foo2": {"seconds": 56, "status": None}, }, }, }, @@ -242,95 +256,101 @@ def test_analysis(self) -> None: base_reports: Dict[Commit, List[Report]] = { # bbbb has no reports, so base is cccc instead - fakehash('b'): [], - fakehash('c'): [ - make_report_v1({ - 'Baz': [ - makecase('test_baz2', 13.605), - # no recent suites have & skip this test - makecase('test_baz1', 0.004, skipped=True), - ], - 'Foo': [ - makecase('test_foo1', 43), - # test added since dddd - makecase('test_foo2', 57), - ], - 'Grault': [ - makecase('test_grault0', 4.88, failed=True), - makecase('test_grault1', 11.967, failed=True), - makecase('test_grault2', 0.395, errored=True), - makecase('test_grault3', 30.460), - ], - 'Norf': [ - makecase('test_norf1', 2), - makecase('test_norf2', 2), - makecase('test_norf3', 2), - makecase('test_norf4', 2), - ], - 'Qux': [ - makecase('test_qux3', 4.978, errored=True), - makecase('test_qux7', 0.002, skipped=True), - makecase('test_qux2', 5.618), - makecase('test_qux4', 7.766, errored=True), - makecase('test_qux6', 23.589, failed=True), - ], - }), + fakehash("b"): [], + fakehash("c"): [ + make_report_v1( + { + "Baz": [ + makecase("test_baz2", 13.605), + # no recent suites have & skip this test + makecase("test_baz1", 
0.004, skipped=True), + ], + "Foo": [ + makecase("test_foo1", 43), + # test added since dddd + makecase("test_foo2", 57), + ], + "Grault": [ + makecase("test_grault0", 4.88, failed=True), + makecase("test_grault1", 11.967, failed=True), + makecase("test_grault2", 0.395, errored=True), + makecase("test_grault3", 30.460), + ], + "Norf": [ + makecase("test_norf1", 2), + makecase("test_norf2", 2), + makecase("test_norf3", 2), + makecase("test_norf4", 2), + ], + "Qux": [ + makecase("test_qux3", 4.978, errored=True), + makecase("test_qux7", 0.002, skipped=True), + makecase("test_qux2", 5.618), + makecase("test_qux4", 7.766, errored=True), + makecase("test_qux6", 23.589, failed=True), + ], + } + ), ], - fakehash('d'): [ - make_report_v1({ - 'Foo': [ - makecase('test_foo1', 40), - # removed in cccc - makecase('test_foo3', 17), - ], - 'Baz': [ - # not skipped, so not included in stdev - makecase('test_baz1', 3.14), - ], - 'Qux': [ - makecase('test_qux7', 0.004, skipped=True), - makecase('test_qux2', 6.02), - makecase('test_qux4', 20.932), - ], - 'Norf': [ - makecase('test_norf1', 3), - makecase('test_norf2', 3), - makecase('test_norf3', 3), - makecase('test_norf4', 3), - ], - 'Grault': [ - makecase('test_grault0', 5, failed=True), - makecase('test_grault1', 14.325, failed=True), - makecase('test_grault2', 0.31, errored=True), - ], - }), + fakehash("d"): [ + make_report_v1( + { + "Foo": [ + makecase("test_foo1", 40), + # removed in cccc + makecase("test_foo3", 17), + ], + "Baz": [ + # not skipped, so not included in stdev + makecase("test_baz1", 3.14), + ], + "Qux": [ + makecase("test_qux7", 0.004, skipped=True), + makecase("test_qux2", 6.02), + makecase("test_qux4", 20.932), + ], + "Norf": [ + makecase("test_norf1", 3), + makecase("test_norf2", 3), + makecase("test_norf3", 3), + makecase("test_norf4", 3), + ], + "Grault": [ + makecase("test_grault0", 5, failed=True), + makecase("test_grault1", 14.325, failed=True), + makecase("test_grault2", 0.31, errored=True), + ], + } + ), ], - fakehash('e'): [], - fakehash('f'): [ - make_report_v1({ - 'Foo': [ - makecase('test_foo3', 24), - makecase('test_foo1', 43), - ], - 'Baz': [ - makecase('test_baz2', 16.857), - ], - 'Qux': [ - makecase('test_qux2', 6.422), - makecase('test_qux4', 6.382, errored=True), - ], - 'Norf': [ - makecase('test_norf1', 0.9), - makecase('test_norf3', 0.9), - makecase('test_norf2', 0.9), - makecase('test_norf4', 0.9), - ], - 'Grault': [ - makecase('test_grault0', 4.7, failed=True), - makecase('test_grault1', 13.146, failed=True), - makecase('test_grault2', 0.48, errored=True), - ], - }), + fakehash("e"): [], + fakehash("f"): [ + make_report_v1( + { + "Foo": [ + makecase("test_foo3", 24), + makecase("test_foo1", 43), + ], + "Baz": [ + makecase("test_baz2", 16.857), + ], + "Qux": [ + makecase("test_qux2", 6.422), + makecase("test_qux4", 6.382, errored=True), + ], + "Norf": [ + makecase("test_norf1", 0.9), + makecase("test_norf3", 0.9), + makecase("test_norf2", 0.9), + makecase("test_norf4", 0.9), + ], + "Grault": [ + makecase("test_grault0", 4.7, failed=True), + makecase("test_grault1", 13.146, failed=True), + makecase("test_grault2", 0.48, errored=True), + ], + } + ), ], } @@ -344,7 +364,7 @@ def test_analysis(self) -> None: ) self.assertEqual( - '''\ + """\ - class Baz: - # was 15.23s ± 2.30s @@ -402,14 +422,14 @@ class Qux: + def test_bar2: ... 
+ # now 3.742s (failed) -''', +""", print_test_stats.anomalies(analysis), ) def test_graph(self) -> None: # HEAD is on master self.assertEqual( - '''\ + """\ Commit graph (base is most recent master ancestor with at least one S3 report): : (master) @@ -420,21 +440,21 @@ def test_graph(self) -> None: * dddddddddd 0 reports | : -''', +""", print_test_stats.graph( - head_sha=fakehash('a'), + head_sha=fakehash("a"), head_seconds=502.99, base_seconds={ - fakehash('b'): [47.84], - fakehash('c'): [332.50], - fakehash('d'): [], + fakehash("b"): [47.84], + fakehash("c"): [332.50], + fakehash("d"): [], }, on_master=True, - ) + ), ) self.assertEqual( - '''\ + """\ Commit graph (base is most recent master ancestor with at least one S3 report): : (master) @@ -446,21 +466,21 @@ def test_graph(self) -> None: * dddddddddd 1 report, total time 1234.56s | : -''', +""", print_test_stats.graph( - head_sha=fakehash('a'), + head_sha=fakehash("a"), head_seconds=9988.77, base_seconds={ - fakehash('b'): [7598.77] * 60 + [7654.32] + [7709.87] * 60, - fakehash('c'): [5308.77] * 10 + [5802.33] * 10, - fakehash('d'): [1234.56], + fakehash("b"): [7598.77] * 60 + [7654.32] + [7709.87] * 60, + fakehash("c"): [5308.77] * 10 + [5802.33] * 10, + fakehash("d"): [1234.56], }, on_master=False, - ) + ), ) self.assertEqual( - '''\ + """\ Commit graph (base is most recent master ancestor with at least one S3 report): : (master) @@ -474,22 +494,22 @@ def test_graph(self) -> None: * dddddddddd (base) 15 reports, total time 58.92s ± 25.82s | : -''', +""", print_test_stats.graph( - head_sha=fakehash('a'), + head_sha=fakehash("a"), head_seconds=25.52, base_seconds={ - fakehash('b'): [], - fakehash('c'): [], - fakehash('d'): [52.25] * 14 + [152.26], + fakehash("b"): [], + fakehash("c"): [], + fakehash("d"): [52.25] * 14 + [152.26], }, on_master=False, ancestry_path=5, - ) + ), ) self.assertEqual( - '''\ + """\ Commit graph (base is most recent master ancestor with at least one S3 report): : (master) @@ -503,22 +523,22 @@ def test_graph(self) -> None: * dddddddddd 3 reports, total time 0.10s ± 0.05s | : -''', +""", print_test_stats.graph( - head_sha=fakehash('a'), + head_sha=fakehash("a"), head_seconds=0.08, base_seconds={ - fakehash('b'): [], - fakehash('c'): [0.09], - fakehash('d'): [0.05, 0.10, 0.15], + fakehash("b"): [], + fakehash("c"): [0.09], + fakehash("d"): [0.05, 0.10, 0.15], }, on_master=False, other_ancestors=1, - ) + ), ) self.assertEqual( - '''\ + """\ Commit graph (base is most recent master ancestor with at least one S3 report): : (master) @@ -534,24 +554,24 @@ def test_graph(self) -> None: * dddddddddd 10 reports, total time 5.84s ± 0.92s | : -''', +""", print_test_stats.graph( - head_sha=fakehash('a'), + head_sha=fakehash("a"), head_seconds=5.98, base_seconds={ - fakehash('b'): [4.81, 7.23], - fakehash('c'): [], - fakehash('d'): [4.97] * 5 + [6.71] * 5, + fakehash("b"): [4.81, 7.23], + fakehash("c"): [], + fakehash("d"): [4.97] * 5 + [6.71] * 5, }, on_master=False, ancestry_path=1, other_ancestors=7, - ) + ), ) def test_regression_info(self) -> None: self.assertEqual( - '''\ + """\ ----- Historic stats comparison result ------ job: foo_job @@ -571,41 +591,48 @@ def test_regression_info(self) -> None: Removed (across 1 suite) 1 test, totaling - 1.00s Modified (across 1 suite) 1 test, totaling - 41.48s ± 2.12s Added (across 1 suite) 1 test, totaling + 3.00s -''', +""", print_test_stats.regression_info( - head_sha=fakehash('a'), - head_report=make_report_v1({ - 'Foo': [ - makecase('test_foo', 0.02, skipped=True), - 
makecase('test_baz', 3), - ]}), + head_sha=fakehash("a"), + head_report=make_report_v1( + { + "Foo": [ + makecase("test_foo", 0.02, skipped=True), + makecase("test_baz", 3), + ] + } + ), base_reports={ - fakehash('b'): [ - make_report_v1({ - 'Foo': [ - makecase('test_foo', 40), - makecase('test_bar', 1), - ], - }), + fakehash("b"): [ + make_report_v1( + { + "Foo": [ + makecase("test_foo", 40), + makecase("test_bar", 1), + ], + } + ), ], - fakehash('c'): [ - make_report_v1({ - 'Foo': [ - makecase('test_foo', 43), - ], - }), + fakehash("c"): [ + make_report_v1( + { + "Foo": [ + makecase("test_foo", 43), + ], + } + ), ], }, - job_name='foo_job', + job_name="foo_job", on_master=False, ancestry_path=0, other_ancestors=0, - ) + ), ) def test_regression_info_new_job(self) -> None: self.assertEqual( - '''\ + """\ ----- Historic stats comparison result ------ job: foo_job @@ -629,25 +656,28 @@ def test_regression_info_new_job(self) -> None: Removed (across 0 suites) 0 tests, totaling 0.00s Modified (across 0 suites) 0 tests, totaling 0.00s Added (across 1 suite) 2 tests, totaling + 3.02s -''', +""", print_test_stats.regression_info( - head_sha=fakehash('a'), - head_report=make_report_v1({ - 'Foo': [ - makecase('test_foo', 0.02, skipped=True), - makecase('test_baz', 3), - ]}), + head_sha=fakehash("a"), + head_report=make_report_v1( + { + "Foo": [ + makecase("test_foo", 0.02, skipped=True), + makecase("test_baz", 3), + ] + } + ), base_reports={ - fakehash('b'): [], - fakehash('c'): [], + fakehash("b"): [], + fakehash("c"): [], }, - job_name='foo_job', + job_name="foo_job", on_master=False, ancestry_path=3, other_ancestors=2, - ) + ), ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tools/test/test_test_history.py b/tools/test/test_test_history.py index 8863c24a5d55..7851ca3f510f 100644 --- a/tools/test/test_test_history.py +++ b/tools/test/test_test_history.py @@ -16,36 +16,33 @@ class Example(TypedDict): def parse_block(block: List[str]) -> Optional[Example]: if block: - match = re.match(r'^\$ ([^ ]+) (.*)$', block[0]) + match = re.match(r"^\$ ([^ ]+) (.*)$", block[0]) if match: cmd, first = match.groups() args = [] for i, line in enumerate([first] + block[1:]): - if line.endswith('\\'): + if line.endswith("\\"): args.append(line[:-1]) else: args.append(line) break return { - 'cmd': cmd, - 'args': shlex.split(''.join(args)), - 'lines': block[i + 1:] + "cmd": cmd, + "args": shlex.split("".join(args)), + "lines": block[i + 1 :], } return None def parse_description(description: str) -> List[Example]: examples: List[Example] = [] - for block in description.split('\n\n'): - matches = [ - re.match(r'^ (.*)$', line) - for line in block.splitlines() - ] + for block in description.split("\n\n"): + matches = [re.match(r"^ (.*)$", line) for line in block.splitlines()] if all(matches): lines = [] for match in matches: assert match - line, = match.groups() + (line,) = match.groups() lines.append(line) example = parse_block(lines) if example: @@ -53,6 +50,7 @@ def parse_description(description: str) -> List[Example]: return examples +@unittest.skip("Skipping as this test is fragile, issue #73083") class TestTestHistory(unittest.TestCase): maxDiff = None @@ -61,14 +59,16 @@ def test_help_examples(self) -> None: self.assertEqual(len(examples), 3) for i, example in enumerate(examples): with self.subTest(i=i): - self.assertTrue(test_history.__file__.endswith(example['cmd'])) - expected = example['lines'] - actual = list(itertools.islice( - test_history.run(example['args']), - 
len(expected), - )) + self.assertTrue(test_history.__file__.endswith(example["cmd"])) + expected = example["lines"] + actual = list( + itertools.islice( + test_history.run(example["args"]), + len(expected), + ) + ) self.assertEqual(actual, expected) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tools/test/test_test_selections.py b/tools/test/test_test_selections.py index 5ea6fa8b3c62..b846bb53c0cb 100644 --- a/tools/test/test_test_selections.py +++ b/tools/test/test_test_selections.py @@ -7,37 +7,37 @@ class TestCalculateShards(unittest.TestCase): tests: List[str] = [ - 'super_long_test', - 'long_test1', - 'long_test2', - 'normal_test1', - 'normal_test2', - 'normal_test3', - 'short_test1', - 'short_test2', - 'short_test3', - 'short_test4', - 'short_test5', + "super_long_test", + "long_test1", + "long_test2", + "normal_test1", + "normal_test2", + "normal_test3", + "short_test1", + "short_test2", + "short_test3", + "short_test4", + "short_test5", ] test_times: Dict[str, float] = { - 'super_long_test': 55, - 'long_test1': 22, - 'long_test2': 18, - 'normal_test1': 9, - 'normal_test2': 7, - 'normal_test3': 5, - 'short_test1': 1, - 'short_test2': 0.6, - 'short_test3': 0.4, - 'short_test4': 0.3, - 'short_test5': 0.01, + "super_long_test": 55, + "long_test1": 22, + "long_test2": 18, + "normal_test1": 9, + "normal_test2": 7, + "normal_test3": 5, + "short_test1": 1, + "short_test2": 0.6, + "short_test3": 0.4, + "short_test4": 0.3, + "short_test5": 0.01, } def assert_shards_equal( self, expected_shards: List[Tuple[float, List[str]]], - actual_shards: List[Tuple[float, List[str]]] + actual_shards: List[Tuple[float, List[str]]], ) -> None: for expected, actual in zip(expected_shards, actual_shards): self.assertAlmostEqual(expected[0], actual[0]) @@ -45,53 +45,140 @@ def assert_shards_equal( def test_calculate_2_shards_with_complete_test_times(self) -> None: expected_shards = [ - (60, ['super_long_test', 'normal_test3']), - (58.31, ['long_test1', 'long_test2', 'normal_test1', 'normal_test2', 'short_test1', 'short_test2', - 'short_test3', 'short_test4', 'short_test5']) + (60, ["super_long_test", "normal_test3"]), + ( + 58.31, + [ + "long_test1", + "long_test2", + "normal_test1", + "normal_test2", + "short_test1", + "short_test2", + "short_test3", + "short_test4", + "short_test5", + ], + ), ] - self.assert_shards_equal(expected_shards, calculate_shards(2, self.tests, self.test_times)) + self.assert_shards_equal( + expected_shards, calculate_shards(2, self.tests, self.test_times) + ) + def test_calculate_1_shard_with_complete_test_times(self) -> None: + expected_shards = [ + ( + 118.31, + [ + "super_long_test", + "long_test1", + "long_test2", + "normal_test1", + "normal_test2", + "normal_test3", + "short_test1", + "short_test2", + "short_test3", + "short_test4", + "short_test5", + ], + ), + ] + self.assert_shards_equal( + expected_shards, calculate_shards(1, self.tests, self.test_times) + ) def test_calculate_5_shards_with_complete_test_times(self) -> None: expected_shards = [ - (55.0, ['super_long_test']), - (22.0, ['long_test1', ]), - (18.0, ['long_test2', ]), - (11.31, ['normal_test1', 'short_test1', 'short_test2', 'short_test3', 'short_test4', 'short_test5']), - (12.0, ['normal_test2', 'normal_test3']), + (55.0, ["super_long_test"]), + ( + 22.0, + [ + "long_test1", + ], + ), + ( + 18.0, + [ + "long_test2", + ], + ), + ( + 11.31, + [ + "normal_test1", + "short_test1", + "short_test2", + "short_test3", + "short_test4", + "short_test5", + ], + ), + (12.0, 
["normal_test2", "normal_test3"]), ] - self.assert_shards_equal(expected_shards, calculate_shards(5, self.tests, self.test_times)) - + self.assert_shards_equal( + expected_shards, calculate_shards(5, self.tests, self.test_times) + ) def test_calculate_2_shards_with_incomplete_test_times(self) -> None: - incomplete_test_times = {k: v for k, v in self.test_times.items() if 'test1' in k} + incomplete_test_times = { + k: v for k, v in self.test_times.items() if "test1" in k + } expected_shards = [ - (22.0, ['long_test1', 'long_test2', 'normal_test3', 'short_test3', 'short_test5']), - (10.0, ['normal_test1', 'short_test1', 'super_long_test', 'normal_test2', 'short_test2', 'short_test4']), + ( + 22.0, + [ + "long_test1", + "long_test2", + "normal_test3", + "short_test3", + "short_test5", + ], + ), + ( + 10.0, + [ + "normal_test1", + "short_test1", + "super_long_test", + "normal_test2", + "short_test2", + "short_test4", + ], + ), ] - self.assert_shards_equal(expected_shards, calculate_shards(2, self.tests, incomplete_test_times)) - + self.assert_shards_equal( + expected_shards, calculate_shards(2, self.tests, incomplete_test_times) + ) def test_calculate_5_shards_with_incomplete_test_times(self) -> None: - incomplete_test_times = {k: v for k, v in self.test_times.items() if 'test1' in k} + incomplete_test_times = { + k: v for k, v in self.test_times.items() if "test1" in k + } expected_shards = [ - (22.0, ['long_test1', 'normal_test2', 'short_test5']), - (9.0, ['normal_test1', 'normal_test3']), - (1.0, ['short_test1', 'short_test2']), - (0.0, ['super_long_test', 'short_test3']), - (0.0, ['long_test2', 'short_test4']), + (22.0, ["long_test1", "normal_test2", "short_test5"]), + (9.0, ["normal_test1", "normal_test3"]), + (1.0, ["short_test1", "short_test2"]), + (0.0, ["super_long_test", "short_test3"]), + (0.0, ["long_test2", "short_test4"]), ] - self.assert_shards_equal(expected_shards, calculate_shards(5, self.tests, incomplete_test_times)) + self.assert_shards_equal( + expected_shards, calculate_shards(5, self.tests, incomplete_test_times) + ) def test_calculate_2_shards_against_optimal_shards(self) -> None: for _ in range(100): random.seed(120) random_times = {k: random.random() * 10 for k in self.tests} # all test times except first two - rest_of_tests = [i for k, i in random_times.items() if k != 'super_long_test' and k != 'long_test1'] + rest_of_tests = [ + i + for k, i in random_times.items() + if k != "super_long_test" and k != "long_test1" + ] sum_of_rest = sum(rest_of_tests) - random_times['super_long_test'] = max(sum_of_rest / 2, max(rest_of_tests)) - random_times['long_test1'] = sum_of_rest - random_times['super_long_test'] + random_times["super_long_test"] = max(sum_of_rest / 2, max(rest_of_tests)) + random_times["long_test1"] = sum_of_rest - random_times["super_long_test"] # An optimal sharding would look like the below, but we don't need to compute this for the test: # optimal_shards = [ # (sum_of_rest, ['super_long_test', 'long_test1']), @@ -103,10 +190,12 @@ def test_calculate_2_shards_against_optimal_shards(self) -> None: # The calculated shard should not have a ratio worse than 7/6 for num_shards = 2 self.assertGreaterEqual(7.0 / 6.0, max_shard_time / sum_of_rest) sorted_tests = sorted(self.tests) - sorted_shard_tests = sorted(calculated_shards[0][1] + calculated_shards[1][1]) + sorted_shard_tests = sorted( + calculated_shards[0][1] + calculated_shards[1][1] + ) # All the tests should be represented by some shard self.assertEqual(sorted_tests, sorted_shard_tests) -if __name__ == 
'__main__': +if __name__ == "__main__": unittest.main() diff --git a/tools/test/test_trailing_newlines.py b/tools/test/test_trailing_newlines.py deleted file mode 100644 index 4f4b662b1036..000000000000 --- a/tools/test/test_trailing_newlines.py +++ /dev/null @@ -1,49 +0,0 @@ -from tools.linter import trailing_newlines -import unittest -import tempfile - - -def correct_trailing_newlines(file_contents: str) -> bool: - with tempfile.NamedTemporaryFile(mode='w', delete=False) as tmp: - filename = tmp.name - tmp.write(file_contents) - return trailing_newlines.correct_trailing_newlines(filename) - - -class TestTrailingNewlines(unittest.TestCase): - def test_empty(self) -> None: - self.assertTrue(correct_trailing_newlines('')) - - def test_single_byte(self) -> None: - self.assertFalse(correct_trailing_newlines('a')) - - def test_single_newline(self) -> None: - self.assertFalse(correct_trailing_newlines('\n')) - - def test_two_newlines(self) -> None: - self.assertFalse(correct_trailing_newlines('\n\n')) - - def test_three_newlines(self) -> None: - self.assertFalse(correct_trailing_newlines('\n\n\n')) - - def test_hello_world(self) -> None: - self.assertFalse(correct_trailing_newlines('hello world')) - - def test_hello_world_newline(self) -> None: - self.assertTrue(correct_trailing_newlines('hello world\n')) - - def test_hello_world_two_newlines(self) -> None: - self.assertFalse(correct_trailing_newlines('hello world\n\n')) - - def test_hello_world_three_newlines(self) -> None: - self.assertFalse(correct_trailing_newlines('hello world\n\n\n')) - - def test_hello_world_multiline(self) -> None: - self.assertFalse(correct_trailing_newlines('hello\nworld')) - - def test_hello_world_multiline_gap(self) -> None: - self.assertTrue(correct_trailing_newlines('hello\n\nworld\n')) - - -if __name__ == '__main__': - unittest.main() diff --git a/tools/test/test_translate_annotations.py b/tools/test/test_translate_annotations.py deleted file mode 100644 index 867decc4af1a..000000000000 --- a/tools/test/test_translate_annotations.py +++ /dev/null @@ -1,280 +0,0 @@ -import re -import unittest - -from tools.linter.translate_annotations import parse_annotation, parse_diff, translate - -flake8_regex \ - = r'^(?P.*?):(?P\d+):(?P\d+): (?P\w+\d+) (?P.*)' -clang_tidy_regex \ - = r'^(?P.*?):(?P\d+):(?P\d+): (?P.*?) \[(?P.*)\]' - -# in the below example patch, note that the filenames differ, so the -# translation should reflect that as well as the line numbers - -# $ git clone -b 1.0.2 https://github.com/cscorley/whatthepatch.git -# $ cd whatthepatch/tests/casefiles -# $ git diff --no-index --unified=0 lao tzu -lao_tzu_diff = ''' -diff --git a/lao b/tzu -index 635ef2c..5af88a8 100644 ---- a/lao -+++ b/tzu -@@ -1,2 +0,0 @@ --The Way that can be told of is not the eternal Way; --The name that can be named is not the eternal name. -@@ -4 +2,2 @@ The Nameless is the origin of Heaven and Earth; --The Named is the mother of all things. -+The named is the mother of all things. -+ -@@ -11,0 +11,3 @@ But after they are produced, -+They both may be called deep and profound. -+Deeper and more profound, -+The door of all subtleties! -'''.lstrip() - -sparser_diff = ''' -diff --git a/foo.txt b/bar.txt -index 27a6dad..6fae323 100644 ---- a/foo.txt -+++ b/bar.txt -@@ -4,3 +4,2 @@ lines --lines --lines --lines -+A change!! 
-+Wow -@@ -10,2 +8,0 @@ more lines --even more --even more -'''.lstrip() - -new_file_diff = ''' -diff --git a/torch/csrc/jit/tensorexpr/operators/conv2d.h b/torch/csrc/jit/tensorexpr/operators/conv2d.h -new file mode 100644 -index 0000000000..a81eeae346 ---- /dev/null -+++ b/torch/csrc/jit/tensorexpr/operators/conv2d.h -@@ -0,0 +1,19 @@ -+#pragma once -+ -+#include -+ -+namespace torch { -+namespace jit { -+namespace tensorexpr { -+ -+TORCH_API Tensor* conv2d_depthwise( -+ BufHandle input, -+ BufHandle weight, -+ BufHandle bias, -+ int stride, -+ int pad, -+ int groups); -+ -+} // namespace tensorexpr -+} // namespace jit -+} // namespace torch -'''.lstrip() - -# fun fact, this example fools VS Code's diff syntax highlighter -haskell_diff = ''' -diff --git a/hello.hs b/hello.hs -index ffb8d4ad14..0872ac9db6 100644 ---- a/hello.hs -+++ b/hello.hs -@@ -1 +1 @@ ---- a/hello/world/example -+main = putStrLn "Hello, world!" -'''.lstrip() - - -class TestTranslateAnnotations(unittest.TestCase): - maxDiff = None - - def test_parse_diff_lao_tzu(self) -> None: - self.assertEqual( - parse_diff(lao_tzu_diff), - { - 'old_filename': 'lao', - 'hunks': [ - { - 'old_start': 1, - 'old_count': 2, - 'new_start': 0, - 'new_count': 0, - }, - { - 'old_start': 4, - 'old_count': 1, - 'new_start': 2, - 'new_count': 2, - }, - { - 'old_start': 11, - 'old_count': 0, - 'new_start': 11, - 'new_count': 3, - }, - ], - }, - ) - - def test_parse_diff_new_file(self) -> None: - self.assertEqual( - parse_diff(new_file_diff), - { - 'old_filename': None, - 'hunks': [ - { - 'old_start': 0, - 'old_count': 0, - 'new_start': 1, - 'new_count': 19, - }, - ], - }, - ) - - def test_parse_diff_haskell(self) -> None: - self.assertEqual( - parse_diff(haskell_diff), - { - 'old_filename': 'hello.hs', - 'hunks': [ - { - 'old_start': 1, - 'old_count': 1, - 'new_start': 1, - 'new_count': 1, - }, - ], - }, - ) - - def test_translate_lao_tzu(self) -> None: - # we'll pretend that this diff represents the file lao being - # renamed to tzu and also modified - diff = parse_diff(lao_tzu_diff) - - # line numbers less than 1 are invalid so they map to None - self.assertEqual(translate(diff, -1), None) - self.assertEqual(translate(diff, 0), None) - - # the first two lines of the file were removed, so the first - # line of the new version corresponds to the third line of the - # original - self.assertEqual(translate(diff, 1), 3) - - # the second and third lines of the new file were not present in - # the original version, so they map to None - self.assertEqual(translate(diff, 2), None) - self.assertEqual(translate(diff, 3), None) - - # at this point, we have a stretch of lines that are identical - # in both versions of the file, but the original version of the - # file had 4 lines before this section whereas the new version - # has only 3 lines before this section - self.assertEqual(translate(diff, 4), 5) - self.assertEqual(translate(diff, 5), 6) - self.assertEqual(translate(diff, 6), 7) - self.assertEqual(translate(diff, 7), 8) - self.assertEqual(translate(diff, 8), 9) - self.assertEqual(translate(diff, 9), 10) - self.assertEqual(translate(diff, 10), 11) - - # these three lines were added in the new version of the file, - # so they map to None - self.assertEqual(translate(diff, 11), None) - self.assertEqual(translate(diff, 12), None) - self.assertEqual(translate(diff, 13), None) - - # the diff doesn't say how long the file is, so we keep mapping - # line numbers back; since we can look back at the original - # files, though, we can see that the original 
is two lines - # shorter than the new version, which explains why we are - # subtracting 2 here - self.assertEqual(translate(diff, 14), 12) - self.assertEqual(translate(diff, 15), 13) - - def test_translate_empty(self) -> None: - diff = parse_diff('--- a/foo') - - # again, we start numbering at 1 - self.assertEqual(translate(diff, -1), None) - self.assertEqual(translate(diff, 0), None) - - # this diff says there are no changes, so all line numbers - # greater than zero map to themselves - self.assertEqual(translate(diff, 1), 1) - self.assertEqual(translate(diff, 2), 2) - self.assertEqual(translate(diff, 3), 3) - self.assertEqual(translate(diff, 4), 4) - self.assertEqual(translate(diff, 5), 5) - - def test_translate_sparser(self) -> None: - diff = parse_diff(sparser_diff) - - # again, we start numbering at 1 - self.assertEqual(translate(diff, -1), None) - self.assertEqual(translate(diff, 0), None) - - # the first three lines are unchanged - self.assertEqual(translate(diff, 1), 1) - self.assertEqual(translate(diff, 2), 2) - self.assertEqual(translate(diff, 3), 3) - - # we removed three lines here and added two, so the two lines we - # added don't map back to anything in the original file - self.assertEqual(translate(diff, 4), None) - self.assertEqual(translate(diff, 5), None) - - # we have some unchanged lines here, but in the preceding hunk - # we removed 3 and added only 2, so we have an offset of 1 - self.assertEqual(translate(diff, 6), 7) - self.assertEqual(translate(diff, 7), 8) - - # since the unified diff format essentially subtracts 1 from the - # starting line number when the count is 0, and since we use - # bisect.bisect_right to decide which hunk to look at, an - # earlier version of translate had a bug that caused it to get - # confused because it would look at the second hunk (which lists - # 8 as its start line number) rather than the first hunk - self.assertEqual(translate(diff, 8), 9) - - # after the two lines that we removed in the second hunk, we've - # reduced the total length of the file by 3 lines, so once we - # reach the end of the diff, we just add 3 to every line number - self.assertEqual(translate(diff, 9), 12) - self.assertEqual(translate(diff, 10), 13) - self.assertEqual(translate(diff, 11), 14) - self.assertEqual(translate(diff, 12), 15) - - def test_parse_annotation_flake8(self) -> None: - regex = re.compile(flake8_regex) - self.assertEqual( - parse_annotation(regex, 'README.md:1:3: R100 make a better title'), - { - 'filename': 'README.md', - 'lineNumber': 1, - 'columnNumber': 3, - 'errorCode': 'R100', - 'errorDesc': 'make a better title', - }, - ) - - def test_parse_annotation_clang_tidy(self) -> None: - regex = re.compile(clang_tidy_regex) - self.assertEqual( - parse_annotation(regex, 'README.md:2:1: improve description [R200]'), - { - 'filename': 'README.md', - 'lineNumber': 2, - 'columnNumber': 1, - 'errorCode': 'R200', - 'errorDesc': 'improve description', - }, - ) - - -if __name__ == '__main__': - unittest.main() diff --git a/tools/testing/explicit_ci_jobs.py b/tools/testing/explicit_ci_jobs.py index 5944d226b0bc..3de04e1a18e9 100755 --- a/tools/testing/explicit_ci_jobs.py +++ b/tools/testing/explicit_ci_jobs.py @@ -45,7 +45,13 @@ def add_job( if requires is not None: for requirement in requires: dependency = past_jobs[requirement] - add_job(workflows, dependency["workflow_name"], dependency["type"], dependency["job"], past_jobs) + add_job( + workflows, + dependency["workflow_name"], + dependency["type"], + dependency["job"], + past_jobs, + ) 
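# Editorial note (not part of the patch): in the deleted test_translate_annotations.py
# above, the flake8/clang-tidy regexes lost their named capture groups during text
# extraction (they appear as "(?P.*?)" etc.). Below is a minimal, self-contained sketch
# of what such an annotation parser plausibly looks like, with group names inferred
# from the expected dicts in test_parse_annotation_flake8; the exact regex and group
# names used by tools/linter/translate_annotations.py are assumptions, not the source.
import re
from typing import Any, Dict, Optional

FLAKE8_REGEX = (
    r"^(?P<filename>.*?):(?P<lineNumber>\d+):(?P<columnNumber>\d+): "
    r"(?P<errorCode>\w+\d+) (?P<errorDesc>.*)"
)


def parse_annotation_sketch(regex: "re.Pattern[str]", line: str) -> Optional[Dict[str, Any]]:
    # Return a dict shaped like the one asserted in the deleted tests, or None on no match.
    m = regex.match(line)
    if m is None:
        return None
    return {
        "filename": m.group("filename"),
        "lineNumber": int(m.group("lineNumber")),
        "columnNumber": int(m.group("columnNumber")),
        "errorCode": m.group("errorCode"),
        "errorDesc": m.group("errorDesc"),
    }


# Usage, mirroring the deleted test case:
#   parse_annotation_sketch(re.compile(FLAKE8_REGEX), "README.md:1:3: R100 make a better title")
# returns {"filename": "README.md", "lineNumber": 1, "columnNumber": 3,
#          "errorCode": "R100", "errorDesc": "make a better title"}.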
workflows[workflow_name]["jobs"].append({type: job}) @@ -88,13 +94,16 @@ def get_filtered_circleci_config( def commit_ci(files: List[str], message: str) -> None: # Check that there are no other modified files than the ones edited by this # tool - stdout = subprocess.run(["git", "status", "--porcelain"], stdout=subprocess.PIPE).stdout.decode() + stdout = subprocess.run( + ["git", "status", "--porcelain"], stdout=subprocess.PIPE + ).stdout.decode() for line in stdout.split("\n"): if line == "": continue if line[0] != " ": - raise RuntimeError(f"Refusing to commit while other changes are already staged: {line}") - + raise RuntimeError( + f"Refusing to commit while other changes are already staged: {line}" + ) # Make the commit subprocess.run(["git", "add"] + files) @@ -107,10 +116,12 @@ def commit_ci(files: List[str], message: str) -> None: ) parser.add_argument("--job", action="append", help="job name", default=[]) parser.add_argument( - "--filter-gha", help="keep only these github actions (glob match)", default='' + "--filter-gha", help="keep only these github actions (glob match)", default="" ) parser.add_argument( - "--make-commit", action="store_true", help="add change to git with to a do-not-merge commit" + "--make-commit", + action="store_true", + help="add change to git with to a do-not-merge commit", ) args = parser.parse_args() @@ -118,7 +129,9 @@ def commit_ci(files: List[str], message: str) -> None: with open(CONFIG_YML, "r") as f: config_yml = yaml.safe_load(f.read()) - config_yml["workflows"] = get_filtered_circleci_config(config_yml["workflows"], args.job) + config_yml["workflows"] = get_filtered_circleci_config( + config_yml["workflows"], args.job + ) with open(CONFIG_YML, "w") as f: yaml.dump(config_yml, f) @@ -131,13 +144,15 @@ def commit_ci(files: List[str], message: str) -> None: path.resolve().unlink() if args.make_commit: - jobs_str = '\n'.join([f" * {job}" for job in args.job]) - message = textwrap.dedent(f""" + jobs_str = "\n".join([f" * {job}" for job in args.job]) + message = textwrap.dedent( + f""" [skip ci][do not merge] Edit config.yml to filter specific jobs Filter CircleCI to only run: {jobs_str} See [Run Specific CI Jobs](https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md#run-specific-ci-jobs) for details. - """).strip() + """ + ).strip() commit_ci([str(f.relative_to(REPO_ROOT)) for f in touched_files], message) diff --git a/tools/testing/test_selections.py b/tools/testing/test_selections.py index c83b0619f030..67ca627cc2c0 100644 --- a/tools/testing/test_selections.py +++ b/tools/testing/test_selections.py @@ -6,16 +6,16 @@ from tools.stats.s3_stat_parser import ( get_previous_reports_for_branch, get_previous_reports_for_pr, - Report, Version2Report, - HAVE_BOTO3) -from tools.stats.import_test_stats import ( - get_disabled_tests, - get_slow_tests + Report, + Version2Report, + HAVE_BOTO3, ) +from tools.stats.import_test_stats import get_disabled_tests, get_slow_tests from typing import Any, Dict, List, Optional, Tuple, cast from typing_extensions import TypedDict + class JobTimeJSON(TypedDict): commit: str JOB_BASE_NAME: str @@ -23,50 +23,55 @@ class JobTimeJSON(TypedDict): def _get_stripped_CI_job() -> str: - """E.g. convert 'pytorch_windows_vs2019_py36_cuda10.1_build' to 'pytorch_windows_vs2019_py36_cuda10.1'. 
- """ - job = os.environ.get("JOB_BASE_NAME", "").rstrip('0123456789') - if job.endswith('_slow_test'): - job = job[:len(job) - len('_slow_test')] - elif job.endswith('_test') or job.endswith('-test'): - job = job[:len(job) - len('_test')] - elif job.endswith('_build') or job.endswith('-build'): - job = job[:len(job) - len('_build')] + """E.g. convert 'pytorch_windows_vs2019_py36_cuda10.1_build' to 'pytorch_windows_vs2019_py36_cuda10.1'.""" + job = os.environ.get("JOB_BASE_NAME", "").rstrip("0123456789") + if job.endswith("_slow_test"): + job = job[: len(job) - len("_slow_test")] + elif job.endswith("_test") or job.endswith("-test"): + job = job[: len(job) - len("_test")] + elif job.endswith("_build") or job.endswith("-build"): + job = job[: len(job) - len("_build")] return job def _get_job_times_json(job_times: Dict[str, float]) -> JobTimeJSON: return { - 'commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'], encoding="ascii").strip(), - 'JOB_BASE_NAME': _get_stripped_CI_job(), - 'job_times': job_times, + "commit": subprocess.check_output( + ["git", "rev-parse", "HEAD"], encoding="ascii" + ).strip(), + "JOB_BASE_NAME": _get_stripped_CI_job(), + "job_times": job_times, } def _calculate_job_times(reports: List["Report"]) -> Dict[str, float]: - """Compute test runtime by filename: ("test_file_name" -> (current_avg, # values)) - """ + """Compute test runtime by filename: ("test_file_name" -> (current_avg, # values))""" jobs_to_times: Dict[str, Tuple[float, int]] = dict() for report in reports: v_report = cast(Version2Report, report) - assert 'format_version' in v_report.keys() and v_report.get('format_version') == 2, \ - "S3 format currently handled is version 2 only" - files: Dict[str, Any] = v_report['files'] + assert ( + "format_version" in v_report.keys() and v_report.get("format_version") == 2 + ), "S3 format currently handled is version 2 only" + files: Dict[str, Any] = v_report["files"] for name, test_file in files.items(): if name not in jobs_to_times: - jobs_to_times[name] = (test_file['total_seconds'], 1) + jobs_to_times[name] = (test_file["total_seconds"], 1) else: curr_avg, curr_count = jobs_to_times[name] new_count = curr_count + 1 - new_avg = (curr_avg * curr_count + test_file['total_seconds']) / new_count + new_avg = ( + curr_avg * curr_count + test_file["total_seconds"] + ) / new_count jobs_to_times[name] = (new_avg, new_count) return {job: time for job, (time, _) in jobs_to_times.items()} -def calculate_shards(num_shards: int, tests: List[str], job_times: Dict[str, float]) -> List[Tuple[float, List[str]]]: +def calculate_shards( + num_shards: int, tests: List[str], job_times: Dict[str, float] +) -> List[Tuple[float, List[str]]]: filtered_job_times: Dict[str, float] = dict() - unknown_jobs : List[str] = [] + unknown_jobs: List[str] = [] for test in tests: if test in job_times: filtered_job_times[test] = job_times[test] @@ -75,13 +80,18 @@ def calculate_shards(num_shards: int, tests: List[str], job_times: Dict[str, flo # The following attempts to implement a partition approximation greedy algorithm # See more at https://en.wikipedia.org/wiki/Greedy_number_partitioning - sorted_jobs = sorted(filtered_job_times, key=lambda j: filtered_job_times[j], reverse=True) + sorted_jobs = sorted( + filtered_job_times, key=lambda j: filtered_job_times[j], reverse=True + ) sharded_jobs: List[Tuple[float, List[str]]] = [(0.0, []) for _ in range(num_shards)] for job in sorted_jobs: min_shard_index = sorted(range(num_shards), key=lambda i: sharded_jobs[i][0])[0] curr_shard_time, 
curr_shard_jobs = sharded_jobs[min_shard_index] curr_shard_jobs.append(job) - sharded_jobs[min_shard_index] = (curr_shard_time + filtered_job_times[job], curr_shard_jobs) + sharded_jobs[min_shard_index] = ( + curr_shard_time + filtered_job_times[job], + curr_shard_jobs, + ) # Round robin the unknown jobs starting with the smallest shard index = sorted(range(num_shards), key=lambda i: sharded_jobs[i][0])[0] @@ -94,14 +104,20 @@ def calculate_shards(num_shards: int, tests: List[str], job_times: Dict[str, flo def _pull_job_times_from_S3() -> Dict[str, float]: if HAVE_BOTO3: ci_job_prefix = _get_stripped_CI_job() - s3_reports: List["Report"] = get_previous_reports_for_branch('origin/viable/strict', ci_job_prefix) + s3_reports: List["Report"] = get_previous_reports_for_branch( + "origin/viable/strict", ci_job_prefix + ) else: - print('Uh oh, boto3 is not found. Either it is not installed or we failed to import s3_stat_parser.') - print('If not installed, please install boto3 for automatic sharding and test categorization.') + print( + "Uh oh, boto3 is not found. Either it is not installed or we failed to import s3_stat_parser." + ) + print( + "If not installed, please install boto3 for automatic sharding and test categorization." + ) s3_reports = [] if len(s3_reports) == 0: - print('Gathered no reports from S3. Please proceed without them.') + print("Gathered no reports from S3. Please proceed without them.") return dict() return _calculate_job_times(s3_reports) @@ -116,20 +132,26 @@ def _query_past_job_times(test_times_file: Optional[str] = None) -> Dict[str, fl with open(test_times_file) as file: test_times_json: JobTimeJSON = json.load(file) - curr_commit = subprocess.check_output(['git', 'rev-parse', 'HEAD'], encoding="ascii").strip() - file_commit = test_times_json.get('commit', '') + curr_commit = subprocess.check_output( + ["git", "rev-parse", "HEAD"], encoding="ascii" + ).strip() + file_commit = test_times_json.get("commit", "") curr_ci_job = _get_stripped_CI_job() - file_ci_job = test_times_json.get('JOB_BASE_NAME', 'N/A') + file_ci_job = test_times_json.get("JOB_BASE_NAME", "N/A") if curr_commit != file_commit: - print(f'Current test times file is from different commit {file_commit}.') + print(f"Current test times file is from different commit {file_commit}.") elif curr_ci_job != file_ci_job: - print(f'Current test times file is for different CI job {file_ci_job}.') + print(f"Current test times file is for different CI job {file_ci_job}.") else: - print(f'Found stats for current commit: {curr_commit} and job: {curr_ci_job}. Proceeding with those values.') - return test_times_json.get('job_times', {}) + print( + f"Found stats for current commit: {curr_commit} and job: {curr_ci_job}. Proceeding with those values." 
+ ) + return test_times_json.get("job_times", {}) # Found file, but commit or CI job in JSON doesn't match - print(f'Overwriting current file with stats based on current commit: {curr_commit} and CI job: {curr_ci_job}') + print( + f"Overwriting current file with stats based on current commit: {curr_commit} and CI job: {curr_ci_job}" + ) job_times = export_S3_test_times(test_times_file) @@ -142,21 +164,26 @@ def _query_failure_test_module(reports: List[Tuple["Report", str]]) -> List[str] return test_modules report = reports[0][0] v_report = cast(Version2Report, report) - assert 'format_version' in v_report.keys() and v_report.get('format_version') == 2, \ - "S3 format currently handled is version 2 only" - files: Dict[str, Any] = v_report['files'] + assert ( + "format_version" in v_report.keys() and v_report.get("format_version") == 2 + ), "S3 format currently handled is version 2 only" + files: Dict[str, Any] = v_report["files"] for fname, file in files.items(): contains_failure = any( - any(case['status'] == 'errored' or case['status'] == 'failed' - for _, case in suite['cases'].items()) - for _, suite in file['suites'].items()) + any( + case["status"] == "errored" or case["status"] == "failed" + for _, case in suite["cases"].items() + ) + for _, suite in file["suites"].items() + ) if contains_failure: test_modules.append(fname) return test_modules def _query_changed_test_files() -> List[str]: - cmd = ["git", "diff", "--name-only", "origin/master", "HEAD"] + default_branch = f"origin/{os.environ.get('GIT_DEFAULT_BRANCH', 'master')}" + cmd = ["git", "diff", "--name-only", default_branch, "HEAD"] proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) if proc.returncode != 0: @@ -167,14 +194,19 @@ def _query_changed_test_files() -> List[str]: return lines -def get_shard_based_on_S3(which_shard: int, num_shards: int, tests: List[str], test_times_file: str) -> List[str]: - """Get sharded test allocation based on historic S3 data. - """ +# Get sharded test allocation based on historic S3 data. +def get_shard_based_on_S3( + which_shard: int, num_shards: int, tests: List[str], test_times_file: str +) -> List[str]: + # Short circuit and don't do any work if there's only 1 shard + if num_shards == 1: + return tests + jobs_to_times = _query_past_job_times(test_times_file) # Got no stats from S3, returning early to save runtime if len(jobs_to_times) == 0: - print('Gathered no stats from S3. Proceeding with default sharding plan.') + print("Gathered no stats from S3. Proceeding with default sharding plan.") return tests[which_shard - 1 :: num_shards] shards = calculate_shards(num_shards, tests, jobs_to_times) @@ -182,14 +214,15 @@ def get_shard_based_on_S3(which_shard: int, num_shards: int, tests: List[str], t return tests_from_shard -def get_slow_tests_based_on_S3(test_list: List[str], td_list: List[str], slow_test_threshold: int) -> List[str]: - """Get list of slow tests based on historic S3 data. - """ +def get_slow_tests_based_on_S3( + test_list: List[str], td_list: List[str], slow_test_threshold: int +) -> List[str]: + """Get list of slow tests based on historic S3 data.""" jobs_to_times: Dict[str, float] = _query_past_job_times() # Got no stats from S3, returning early to save runtime if len(jobs_to_times) == 0: - print('Gathered no stats from S3. No new slow tests calculated.') + print("Gathered no stats from S3. 
No new slow tests calculated.") return [] slow_tests: List[str] = [] @@ -201,38 +234,42 @@ def get_slow_tests_based_on_S3(test_list: List[str], td_list: List[str], slow_te def get_specified_test_cases(filename: str, tests: List[str]) -> Dict[str, List[str]]: - """Get test cases from a specified test case file. Usually exported manually or through CI system. - """ + """Get test cases from a specified test case file. Usually exported manually or through CI system.""" if not os.path.exists(filename): - print(f'Could not find specified tests file: {filename}. Proceeding with default behavior.') + print( + f"Could not find specified tests file: {filename}. Proceeding with default behavior." + ) return dict() # The below encoding is utf-8-sig because utf-8 doesn't properly handle the byte-order-mark character - with open(filename, mode='r', encoding="utf-8-sig") as csv_file: + with open(filename, mode="r", encoding="utf-8-sig") as csv_file: csv_reader = csv.DictReader(csv_file) line_count = 0 specified_test_case_dict: Dict[str, List[str]] = dict() for row in csv_reader: line_count += 1 if line_count == 1: - if 'test_filename' not in row or 'test_case_name' not in row: - print('Data is missing necessary columns for test specification. Proceeding with default behavior.') + if "test_filename" not in row or "test_case_name" not in row: + print( + "Data is missing necessary columns for test specification. Proceeding with default behavior." + ) return dict() - test_filename = row['test_filename'] - test_case_name = row['test_case_name'] + test_filename = row["test_filename"] + test_case_name = row["test_case_name"] if test_filename not in tests: - print(f'Specified test_filename {test_filename} not found in TESTS. Skipping.') + print( + f"Specified test_filename {test_filename} not found in TESTS. Skipping." + ) continue if test_filename not in specified_test_case_dict: specified_test_case_dict[test_filename] = [] specified_test_case_dict[test_filename].append(test_case_name) - print(f'Processed {line_count} test cases.') + print(f"Processed {line_count} test cases.") return specified_test_case_dict def get_reordered_tests(tests: List[str], is_reordering_by_pr: bool) -> List[str]: - """Get the reordered test filename list based on github PR history or git changed file. - """ + """Get the reordered test filename list based on github PR history or git changed file.""" prioritized_tests = [] # Try using historic stats from PR. 
if is_reordering_by_pr and HAVE_BOTO3: @@ -240,7 +277,8 @@ def get_reordered_tests(tests: List[str], is_reordering_by_pr: bool) -> List[str if len(pr_number): ci_job_prefix = _get_stripped_CI_job() s3_reports: List[Tuple["Report", str]] = get_previous_reports_for_pr( - pr_number, ci_job_prefix) + pr_number, ci_job_prefix + ) prioritized_tests = _query_failure_test_module(s3_reports) print("Prioritized test from previous CI info.") @@ -253,9 +291,11 @@ def get_reordered_tests(tests: List[str], is_reordering_by_pr: bool) -> List[str return tests prefix = f"test{os.path.sep}" - prioritized_tests = [f for f in changed_files if f.startswith(prefix) and f.endswith(".py")] - prioritized_tests = [f[len(prefix):] for f in prioritized_tests] - prioritized_tests = [f[:-len(".py")] for f in prioritized_tests] + prioritized_tests = [ + f for f in changed_files if f.startswith(prefix) and f.endswith(".py") + ] + prioritized_tests = [f[len(prefix) :] for f in prioritized_tests] + prioritized_tests = [f[: -len(".py")] for f in prioritized_tests] print("Prioritized test from test file changes.") bring_to_front = [] @@ -267,12 +307,16 @@ def get_reordered_tests(tests: List[str], is_reordering_by_pr: bool) -> List[str else: the_rest.append(test) if len(tests) == len(bring_to_front) + len(the_rest): - print(f"reordering tests for PR:\n" - f"prioritized: {bring_to_front}\nthe rest: {the_rest}\n") + print( + f"reordering tests for PR:\n" + f"prioritized: {bring_to_front}\nthe rest: {the_rest}\n" + ) return bring_to_front + the_rest else: - print(f"Something went wrong in CI reordering, expecting total of {len(tests)}:\n" - f"but found prioritized: {len(bring_to_front)}\nthe rest: {len(the_rest)}\n") + print( + f"Something went wrong in CI reordering, expecting total of {len(tests)}:\n" + f"but found prioritized: {len(bring_to_front)}\nthe rest: {len(the_rest)}\n" + ) return tests @@ -280,13 +324,13 @@ def get_reordered_tests(tests: List[str], is_reordering_by_pr: bool) -> List[str def export_S3_test_times(test_times_filename: Optional[str] = None) -> Dict[str, float]: test_times: Dict[str, float] = _pull_job_times_from_S3() if test_times_filename is not None: - print(f'Exporting S3 test stats to {test_times_filename}.') + print(f"Exporting S3 test stats to {test_times_filename}.") if os.path.exists(test_times_filename): - print(f'Overwriting existent file: {test_times_filename}') - with open(test_times_filename, 'w+') as file: + print(f"Overwriting existent file: {test_times_filename}") + with open(test_times_filename, "w+") as file: job_times_json = _get_job_times_json(test_times) - json.dump(job_times_json, file, indent=' ', separators=(',', ': ')) - file.write('\n') + json.dump(job_times_json, file, indent=" ", separators=(",", ": ")) + file.write("\n") return test_times diff --git a/tools/ufunc_defs.bzl b/tools/ufunc_defs.bzl new file mode 100644 index 000000000000..4490f05be015 --- /dev/null +++ b/tools/ufunc_defs.bzl @@ -0,0 +1,25 @@ +load("@bazel_skylib//lib:paths.bzl", "paths") +load(":build_variables.bzl", "aten_ufunc_headers") + +aten_ufunc_names = [ + paths.split_extension(paths.basename(h))[0] + for h in aten_ufunc_headers +] + +def aten_ufunc_generated_cpu_sources(gencode_pattern = "{}"): + return [gencode_pattern.format(name) for name in [ + "UfuncCPU_{}.cpp".format(n) + for n in aten_ufunc_names + ]] + +def aten_ufunc_generated_cpu_kernel_sources(gencode_pattern = "{}"): + return [gencode_pattern.format(name) for name in [ + "UfuncCPUKernel_{}.cpp".format(n) + for n in aten_ufunc_names + ]] + 
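The Starlark helpers in this new ufunc_defs.bzl differ only in the file-name template they format with each ufunc name, which is derived from the header basenames loaded out of build_variables.bzl. A minimal Python sketch of that derivation, using a hypothetical header path ("add.h") in place of the real aten_ufunc_headers list:

import os

# Hypothetical entry; the real list comes from build_variables.bzl.
aten_ufunc_headers = ["aten/src/ATen/native/ufunc/add.h"]

# paths.split_extension(paths.basename(h))[0] in Starlark maps to the two os.path calls below.
aten_ufunc_names = [os.path.splitext(os.path.basename(h))[0] for h in aten_ufunc_headers]

def aten_ufunc_generated_cpu_sources(gencode_pattern="{}"):
    return [gencode_pattern.format("UfuncCPU_{}.cpp".format(n)) for n in aten_ufunc_names]

print(aten_ufunc_names)                                    # ['add']
print(aten_ufunc_generated_cpu_sources("$(location {})"))  # ['$(location UfuncCPU_add.cpp)']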
+def aten_ufunc_generated_cuda_sources(gencode_pattern = "{}"): + return [gencode_pattern.format(name) for name in [ + "UfuncCUDA_{}.cu".format(n) + for n in aten_ufunc_names + ]] diff --git a/tools/update_masked_docs.py b/tools/update_masked_docs.py new file mode 100644 index 000000000000..87ee0830e01b --- /dev/null +++ b/tools/update_masked_docs.py @@ -0,0 +1,61 @@ +"""This script updates the file torch/_masked/_docs.py that contains +the generated doc-strings for various masked operations. The update +should be triggered whenever a new masked operation is introduced to +torch._masked package. Running the script requires that torch package +is functional. +""" + +import os + + +def main() -> None: + + target = os.path.join("torch", "_masked", "_docs.py") + + try: + import torch + except ImportError as msg: + print(f"Failed to import torch required to build {target}: {msg}") + return + + if os.path.isfile(target): + with open(target) as _f: + current_content = _f.read() + else: + current_content = "" + + _new_content = [] + _new_content.append( + """\ +# -*- coding: utf-8 -*- +# This file is generated, do not modify it! +# +# To update this file, run the update masked docs script as follows: +# +# python tools/update_masked_docs.py +# +# The script must be called from an environment where the development +# version of torch package can be imported and is functional. +# +""" + ) + + for func_name in sorted(torch._masked.__all__): + func = getattr(torch._masked, func_name) + func_doc = torch._masked._generate_docstring(func) + _new_content.append(f'{func_name}_docstring = """{func_doc}"""\n') + + new_content = "\n".join(_new_content) + + if new_content == current_content: + print(f"Nothing to update in {target}") + return + + with open(target, "w") as _f: + _f.write(new_content) + + print(f"Successfully updated {target}") + + +if __name__ == "__main__": + main() diff --git a/tools/vscode_settings.py b/tools/vscode_settings.py index 88dbfb4fedf9..5c7fa8740c4f 100755 --- a/tools/vscode_settings.py +++ b/tools/vscode_settings.py @@ -5,17 +5,17 @@ def main() -> None: - folder = Path('.vscode') - recommended = json.loads((folder / 'settings_recommended.json').read_text()) - path = folder / 'settings.json' + folder = Path(".vscode") + recommended = json.loads((folder / "settings_recommended.json").read_text()) + path = folder / "settings.json" try: current = json.loads(path.read_text()) except Exception: current = {} - with open(path, 'w') as f: + with open(path, "w") as f: json.dump({**current, **recommended}, f, indent=2) - f.write('\n') + f.write("\n") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 00892ea09eae..15bad2039451 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -14,7 +14,7 @@ if(NOT BUILD_PYTHON) endif() if(USE_TBB) -include_directories(${TBB_INCLUDE_DIR}) + include_directories(${TBB_INCLUDE_DIR}) endif() set(TORCH_SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}") @@ -44,6 +44,9 @@ set(TORCH_PYTHON_SRCS ) append_filelist("libtorch_python_core_sources" TORCH_PYTHON_SRCS) +list(APPEND TORCH_PYTHON_SRCS + ${TORCH_SRC_DIR}/csrc/init_flatbuffer_module.cpp) + # NB: This has to match the condition under which the JIT test directory # is included (at the time of writing that's in caffe2/CMakeLists.txt). 
if(BUILD_TEST) @@ -151,8 +154,8 @@ if(USE_CUDNN OR USE_ROCM) endif() endif() -if(USE_MLCOMPUTE) - list(APPEND TORCH_PYTHON_SRCS ${MLC_PYTHON_SRCS}) +if(USE_MPS) + list(APPEND TORCH_PYTHON_SRCS ${MPS_PYTHON_SRCS}) endif() if(USE_VALGRIND AND NOT WIN32) @@ -190,6 +193,7 @@ add_custom_target(torch_python_stubs DEPENDS "${TORCH_SRC_DIR}/_C/__init__.pyi" "${TORCH_SRC_DIR}/_C/_VariableFunctions.pyi" "${TORCH_SRC_DIR}/nn/functional.pyi" + "${TORCH_SRC_DIR}/utils/data/datapipes/datapipe.pyi" ) add_custom_command( OUTPUT @@ -199,6 +203,7 @@ add_custom_command( COMMAND "${PYTHON_EXECUTABLE}" -mtools.pyi.gen_pyi --native-functions-path "aten/src/ATen/native/native_functions.yaml" + --tags-path "aten/src/ATen/native/tags.yaml" --deprecated-functions-path "tools/autograd/deprecated.yaml" DEPENDS "${TORCH_SRC_DIR}/_C/__init__.pyi.in" @@ -206,10 +211,23 @@ add_custom_command( "${TORCH_SRC_DIR}/nn/functional.pyi.in" "${TOOLS_PATH}/pyi/gen_pyi.py" "${TORCH_ROOT}/aten/src/ATen/native/native_functions.yaml" + "${TORCH_ROOT}/aten/src/ATen/native/tags.yaml" "${TORCH_ROOT}/tools/autograd/deprecated.yaml" WORKING_DIRECTORY "${TORCH_ROOT}" ) +file(GLOB_RECURSE datapipe_files "${TORCH_SRC_DIR}/utils/data/datapipes/*.py") +add_custom_command( + OUTPUT + "${TORCH_SRC_DIR}/utils/data/datapipes/datapipe.pyi" + COMMAND + "${PYTHON_EXECUTABLE}" ${TORCH_SRC_DIR}/utils/data/datapipes/gen_pyi.py + DEPENDS + "${TORCH_SRC_DIR}/utils/data/datapipes/datapipe.pyi.in" + ${datapipe_files} + WORKING_DIRECTORY + "${TORCH_ROOT}" +) if(USE_DISTRIBUTED) if(WIN32) append_filelist("libtorch_python_distributed_core_sources" TORCH_PYTHON_SRCS) @@ -346,7 +364,6 @@ if(USE_NUMPY) target_compile_definitions(torch_python PRIVATE USE_NUMPY) endif() -list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS BUILD_CAFFE2) if(HAVE_SOVERSION) set_target_properties(torch_python PROPERTIES VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION}) @@ -376,6 +393,9 @@ set_source_files_properties( # Disable certain warnings for GCC-9.X if(CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0.0)) set_source_files_properties(${TORCH_SRC_DIR}/csrc/Module.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + set_source_files_properties( + ${TORCH_SRC_DIR}/csrc/init_flatbuffer_module.cpp + PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") set_source_files_properties(${TORCH_SRC_DIR}/csrc/autograd/python_variable.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") endif() @@ -403,6 +423,10 @@ target_compile_options(torch_python PRIVATE ${TORCH_PYTHON_COMPILE_OPTIONS}) target_include_directories(torch_python PUBLIC ${TORCH_PYTHON_INCLUDE_DIRECTORIES}) +if(BUILD_ONEDNN_GRAPH) + target_compile_definitions(torch_python PRIVATE "-DBUILD_ONEDNN_GRAPH") + target_compile_definitions(torch_cpu PRIVATE "-DBUILD_ONEDNN_GRAPH") +endif() if(NOT TORCH_PYTHON_LINK_FLAGS STREQUAL "") set_target_properties(torch_python PROPERTIES LINK_FLAGS ${TORCH_PYTHON_LINK_FLAGS}) diff --git a/torch/_C/_VariableFunctions.pyi.in b/torch/_C/_VariableFunctions.pyi.in index 1b3a760c8cbd..75d566f131ab 100644 --- a/torch/_C/_VariableFunctions.pyi.in +++ b/torch/_C/_VariableFunctions.pyi.in @@ -5,13 +5,11 @@ from typing import List, Tuple, Optional, Union, Any, ContextManager, Callable, from typing_extensions import Literal from torch._six import inf -from torch.types import _int, _float, _bool, Number, _dtype, _device, _qscheme, _size, _layout +from torch.types import _int, _float, _bool, Number, _dtype, _device, _qscheme, _size, _layout, SymInt +import torch import builtins -# 
REDUNDANT! -${namedtuple_defs} - ${function_hints} ${all_directive} diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index ffa4da59707e..b986f87943b0 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -12,8 +12,8 @@ from typing import ( from typing_extensions import Literal from torch._six import inf -from torch.types import _int, _float, _bool, _dtype, _device, _qscheme, _size, _layout, Device, Number, Storage -from torch.storage import TypedStorage +from torch.types import _int, _float, _bool, _dtype, _device, _qscheme, _size, _layout, Device, Number, Storage, SymInt +from torch.storage import _TypedStorage import builtins @@ -22,6 +22,8 @@ import builtins from . import _nn as _nn from . import _onnx as _onnx from . import _VariableFunctions as _VariableFunctions +from . import _lazy as _lazy +from . import _lazy_ts_backend as _lazy_ts_backend T = TypeVar('T') @@ -52,7 +54,7 @@ class Stream: class Size(Tuple[_int, ...]): # TODO: __reduce__ - @overload + @overload # type: ignore[override] def __getitem__(self: Size, key: _int) -> _int: ... @overload @@ -107,6 +109,9 @@ def DisableTorchFunction(): ... strided : layout = ... sparse_coo : layout = ... sparse_csr : layout = ... +sparse_csc : layout = ... +sparse_bsr : layout = ... +sparse_bsc : layout = ... _mkldnn : layout = ... # Defined in torch/csrc/MemoryFormat.cpp @@ -195,7 +200,7 @@ def _is_tracing() -> _bool: ... def _jit_init() -> _bool: ... def _jit_flatten(arg: Any) -> Tuple[List[Tensor], IODescriptor]: ... def _jit_unflatten(vars: List[Tensor], desc: IODescriptor) -> Any: ... -def _jit_get_operation(op_name: str) -> Callable: ... +def _jit_get_operation(op_name: str) -> Tuple[Callable, List[str]]: ... def _get_operation_overload(op_name: str, op_overload_name: str) -> Callable: ... def _get_schema(op_name: str, overload_name: str) -> FunctionSchema: ... def _jit_pass_optimize_for_mobile(module: 'torch.jit.ScriptModule', @@ -211,9 +216,10 @@ def _jit_pass_metal_optimize_for_mobile(module: 'torch.jit.ScriptModule', def _jit_pass_inline(Graph) -> None: ... def _jit_pass_constant_propagation(Graph) -> None: ... def _jit_pass_propagate_shapes_on_graph(Graph) -> None: ... +def _jit_register_decomposition_for_schema(schema: FunctionSchema, Graph) -> None: ... def _jit_erase_non_input_shape_information(Graph) -> None: ... -def _jit_pass_common_expression_hoisting(Graph) -> None: ... def _jit_get_schemas_for_operator(name :str) -> List[FunctionSchema]: ... +def _jit_get_all_schemas() -> List[FunctionSchema]: ... def _jit_check_alias_annotation(g: Graph, args: Tuple[Any, ...], unqualified_op_name: str): ... def _jit_can_fuse_on_cpu() -> _bool: ... def _jit_can_fuse_on_gpu() -> _bool: ... @@ -222,6 +228,8 @@ def _debug_get_fusion_group_inlining() -> _bool: ... def _debug_set_fusion_group_inlining(enable: _bool): ... def _jit_texpr_fuser_enabled() -> _bool: ... def _jit_nvfuser_enabled() -> _bool: ... +def _jit_llga_enabled() -> _bool: ... +def _jit_set_llga_enabled(enable: _bool): ... def _llvm_enabled() -> _bool: ... def _jit_override_can_fuse_on_cpu(override: _bool): ... def _jit_override_can_fuse_on_gpu(override: _bool): ... @@ -233,7 +241,7 @@ def _jit_set_te_must_use_llvm_cpu(use_llvm: _bool): ... def _jit_set_nvfuser_enabled(enable: _bool) -> _bool: ... def _jit_cat_wo_conditionals(optimize_cat: _bool): ... def _jit_opt_conditionals(opt_conds: _bool): ... -def _jit_pass_canonicalize(graph: Graph): ... +def _jit_pass_canonicalize(graph: Graph, keep_unique_names: _bool = True): ... 
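The ${namedtuple_defs} block dropped from _VariableFunctions.pyi.in above is not lost: the generated named-return-type stubs move into the new torch/_C/return_types.pyi.in added later in this diff. At runtime those structured results are already reachable under torch.return_types; a small illustration (the tensor values are arbitrary):

import torch

out = torch.max(torch.tensor([[1.0, 3.0], [2.0, 0.5]]), dim=1)
print(type(out))                # <class 'torch.return_types.max'>
print(out.values, out.indices)  # tensor([3., 2.]) tensor([1, 0])
values, indices = out           # still unpacks like a plain tuple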
def _jit_pass_erase_shape_information(graph: Graph): ... def _jit_pass_fold_convbn(module: 'torch.jit.ScriptModule'): ... def _jit_pass_insert_observers(module: 'torch.jit.ScriptModule', @@ -260,7 +268,7 @@ ResolutionCallback = Callable[[str], Callable[..., Any]] # Defined in torch/csrc/jit/python/script_init.cpp # and torch/csrc/jit/python/init.cpp -def _create_function_from_graph(qualname: str, graph: Graph) -> Graph: ... +def _create_function_from_graph(qualname: str, graph: Graph) -> ScriptFunction: ... def _debug_set_autodiff_subgraph_inlining(disabled: _bool) -> None: ... def _ivalue_tags_match(lhs: ScriptModule, rhs: ScriptModule) -> _bool: ... def _jit_assert_is_instance(obj: Any, type: JitType): ... @@ -281,7 +289,7 @@ def _get_model_ops_and_info_from_buffer(buffer: BinaryIO): ... def _get_mobile_model_contained_types(filename: Union[str, Path]): ... def _get_mobile_model_contained_types_from_buffer(buffer: BinaryIO): ... def _logging_set_logger(logger: LoggerBase) -> LoggerBase: ... -def _get_graph_executor_optimize() -> _bool: ... +def _get_graph_executor_optimize(optimize: Optional[_bool] = None) -> _bool: ... def _set_graph_executor_optimize(optimize: _bool): ... def _export_opnames(module: ScriptModule) -> List[str]: ... def _create_function_from_trace( @@ -302,7 +310,7 @@ def _dump_upgraders_map() -> Dict[str, str]: ... def _test_only_populate_upgraders(content: Dict[str, str]) -> None: ... def _test_only_remove_upgraders(content: Dict[str, str]) -> None: ... def merge_type_from_type_comment(decl: Decl, type_annotation_decl: Decl, is_method: _bool) -> Decl: ... -def parse_ir(input: str) -> Graph: ... +def parse_ir(input: str, parse_tensor_constants: _bool) -> Graph: ... def parse_schema(schema: str) -> FunctionSchema: ... def get_device(input: Tensor) -> _int: ... def _resolve_type_from_object(obj: Any, range: SourceRange, rcb: ResolutionCallback) -> JitType: ... @@ -314,11 +322,11 @@ def _replace_overloaded_method_decl(overload_decl: Decl, implementation_def: Def def _jit_pass_lower_all_tuples(graph: Graph) -> None: ... def _jit_pass_onnx_set_dynamic_input_shape(graph: Graph, dynamic_axes: Dict[str, Dict[_int, str]], input_names: List[str]) -> None: ... def _jit_pass_onnx_graph_shape_type_inference(graph: Graph, paramsDict: Dict[str, IValue], opset_version: _int) -> None: ... -def _jit_pass_onnx_assign_output_shape(graph: Graph, tensors: List[Tensor], desc: IODescriptor, onnx_shape_inference: _bool = False) -> None: ... +def _jit_pass_onnx_assign_output_shape(graph: Graph, tensors: List[Tensor], desc: IODescriptor, onnx_shape_inference: _bool, is_script: _bool) -> None: ... def _jit_pass_onnx_remove_inplace_ops_for_onnx(graph: Graph, module: Module) -> None: ... def _jit_pass_remove_inplace_ops(graph: Graph) -> None: ... def _jit_pass_canonicalize_graph_fuser_ops(graph: Graph) -> None: ... -def _jit_pass_peephole(graph: Graph, addmm_fusion_enabled: _bool) -> None: ... +def _jit_pass_peephole(graph: Graph, disable_shape_peepholes: _bool = False) -> None: ... def _jit_pass_fuse_addmm(graph: Graph) -> None: ... def _jit_pass_onnx_preprocess(graph: Graph) -> None: ... def _jit_pass_prepare_division_for_onnx(graph: Graph) -> None: ... @@ -326,7 +334,8 @@ def _jit_pass_onnx_remove_print(graph: Graph) -> None: ... def _jit_pass_onnx_preprocess_caffe2(graph: Graph) -> None: ... def _jit_pass_onnx_unpack_quantized_weights( graph: Graph, - paramsDict: Dict[str, IValue] + paramsDict: Dict[str, IValue], + caffe2: _bool ) -> Dict[str, IValue]: ... 
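The _jit_pass_onnx_* stubs updated in this hunk describe the internal graph passes the ONNX exporter drives; only their signatures change here. For context, the public entry point that exercises them is torch.onnx.export, roughly as follows (the model, file name, and opset choice are arbitrary):

import torch

model = torch.nn.Linear(4, 2)
dummy_input = torch.randn(1, 4)
torch.onnx.export(model, dummy_input, "linear.onnx",
                  input_names=["x"], output_names=["y"], opset_version=13)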
def _jit_pass_onnx_quantization_insert_permutes( graph: Graph, @@ -342,8 +351,15 @@ def _jit_pass_onnx_peephole(graph: Graph, opset_version: _int, fixed_batch_size: def _jit_pass_dce_allow_deleting_nodes_with_side_effects(graph: Graph) -> None: ... def _jit_pass_onnx_function_substitution(graph: Graph) -> None: ... def _jit_pass_onnx_function_extraction(graph: Graph, module_names : Set[str], param_names : List[str]) -> Dict[Node, Dict[str, str]]: ... +def _jit_pass_onnx_clear_scope_records() -> None: ... +def _jit_pass_onnx_track_scope_attributes(graph: Graph, onnx_attrs: Dict[str, Any]) -> None: ... +def _jit_is_onnx_log_enabled() -> _bool: ... +def _jit_set_onnx_log_enabled(enabled: _bool) -> None: ... +def _jit_set_onnx_log_output_stream(stream_name: str) -> None: ... +def _jit_onnx_log(*args: Any) -> None: ... def _jit_pass_lower_graph(graph: Graph, m: Module) -> Tuple[Graph, List[IValue]]: ... def _jit_pass_inline_fork_wait(graph: Graph) -> None: ... +def _jit_pass_onnx_deduplicate_initializers(graph: Graph, params_dict: Dict[str, IValue], is_train: _bool) -> Dict[str, IValue]: ... def _jit_pass_onnx_eval_peephole(graph: Graph, paramsDict: Dict[str, IValue]) -> Dict[str, IValue]: ... def _jit_pass_onnx_constant_fold(graph: Graph, paramsDict: Dict[str, IValue], opset_version: _int) -> Dict[str, IValue]: ... def _jit_pass_onnx_eliminate_unused_items(graph: Graph, paramsDict: Dict[str, IValue]) -> Dict[str, IValue]: ... @@ -365,6 +381,10 @@ def _compile_graph_to_code_table(name: str, graph: Graph) -> IValue: ... def _generate_upgraders_graph() -> Dict[str, Graph]: ... +def _calculate_package_version_based_on_upgraders(val: _bool): ... + +def _get_version_calculator_flag() -> _bool: ... + def _jit_script_interface_compile(name: str, class_def: ClassDef, rcb: ResolutionCallback, is_module: _bool): ... def _jit_script_compile_overload( qualname: str, @@ -408,7 +428,7 @@ def _import_ir_module_from_package( ) -> ScriptModule: ... def _assign_output_shapes(graph: Graph, inputs: List[Tensor]) -> Graph: ... -def _check_onnx_proto(proto: str) -> None: ... +def _check_onnx_proto(proto: str, full_check: _bool = False) -> None: ... def _propagate_and_assign_input_shapes( graph: Graph, inputs: Tuple[Tensor, ...], @@ -426,12 +446,9 @@ class AliasDb: def __str__(self) -> str: ... ... -# Defined in torch/torch/csrc/jit/ir/ir.h -class Graph: - def eraseInput(self, i: _int) -> None: ... - def alias_db(self) -> AliasDb: ... - def inputs(self) -> List[Value]: ... - ... +class _InsertPoint: + def __enter__(self) -> None: ... + def __exit__(self, *args) -> None: ... # Defined in torch/csrc/jit/ir/ir.h class Value: @@ -442,10 +459,33 @@ class Value: # Defined in torch/csrc/jit/ir/ir.h class Block: + def inputs(self) -> List[Value]: ... + def outputs(self) -> List[Value]: ... ... # Defined in torch/csrc/jit/ir/ir.h class Node: + def schema(self) -> str: ... + def output(self) -> Value: ... + def outputs(self) -> List[Value]: ... + def outputsSize(self) -> _int: ... + def blocks(self) -> List[Block]: ... + def mustBeNone(self) -> _bool: ... + def kindOf(self, str) -> str: ... + def __getitem__(self, key: str) -> Any: ... + def namedInput(self, str) -> Value: ... + ... + +# Defined in torch/torch/csrc/jit/ir/ir.h +class Graph: + def eraseInput(self, i: _int) -> None: ... + def alias_db(self) -> AliasDb: ... + def inputs(self) -> List[Value]: ... + def setInsertPoint(self, n: Union[Block, Node]) -> None: ... + def insert_point_guard(self, n: Union[Block, Node]) -> _InsertPoint: ... 
+ def insertPoint(self) -> Node: ... + def insertGraph(self, callee: Graph, inputs: List[Value]) -> List[Value]: ... + def makeMultiOutputIntoTuple(self) -> None: ... ... @@ -460,6 +500,8 @@ class Argument: class FunctionSchema: arguments: List[Argument] returns: List[Argument] + name: str + overload_name: str ... class _UpgraderEntry: @@ -510,6 +552,8 @@ class ConcreteModuleTypeBuilder: def __init__(self, obj: Any) -> None: ... def set_module_dict(self): ... def set_module_list(self): ... + def set_parameter_list(self): ... + def set_parameter_dict(self): ... def add_attribute(self, name: str, ty: JitType, is_param: _bool, is_buffer: _bool): ... def add_module(self, name: str, meta: ConcreteModuleType): ... def add_constant(self, name: str, value: Any): ... @@ -636,6 +680,8 @@ def _get_cudnn_allow_tf32() -> _bool: ... # THPModule_allowTF32CuDNN def _set_cudnn_allow_tf32(arg: _bool) -> None: ... # THPModule_setAllowTF32CuDNN def _get_cublas_allow_tf32() -> _bool: ... # THPModule_allowTF32CuBLAS def _set_cublas_allow_tf32(arg: _bool) -> None: ... # THPModule_setAllowTF32CuBLAS +def _get_float32_matmul_precision() -> str: ... #THPModule_float32MatmulPrecision +def _set_float32_matmul_precision(arg: str) -> None: ... #THPModule_setFloat32MatmulPrecision def _get_cublas_allow_fp16_reduced_precision_reduction() -> _bool: ... #THPModule_allowFP16ReductionCuBLAS def _set_cublas_allow_fp16_reduced_precision_reduction(arg: _bool) -> None: ... #THPModule_setAllowFP16ReductionCuBLAS # NB: There is no Capsule type in typing, see @@ -660,6 +706,7 @@ def _vmapmode_decrement_nesting() -> _int: ... # THPModule_vmapmode_decrement_n def _log_api_usage_once(str) -> None: ... # LogAPIUsageOnceFromPython def _demangle(str) -> str: ... # c10::demangle def _disabled_torch_function_impl(func: Callable, types: Iterable[Type], args: Tuple, kwargs: Dict) -> Any: ... # THPModule_disable_torch_function +def _disabled_torch_dispatch_impl(func: Callable, types: Iterable[Type], args: Tuple, kwargs: Dict) -> Any: ... # THPModule_disable_dispatch_function def _get_linalg_preferred_backend() -> torch._C._LinalgBackend: ... def _set_linalg_preferred_backend(arg: torch._C._LinalgBackend): ... class _LinalgBackend: @@ -674,6 +721,8 @@ def _valgrind_toggle_and_dump_stats() -> None: ... # CALLGRIND_TOGGLE_COLLECT a has_openmp: _bool has_mkl: _bool +has_mps: _bool +_is_mps_available: _bool has_lapack: _bool has_cuda: _bool has_mkldnn: _bool @@ -709,8 +758,20 @@ def __set_forward_AD_enabled(enabled: _bool) -> None: ... def __is_forward_AD_enabled() -> _bool: ... def _register_default_hooks(pack_hook: Callable, unpack_hook: Callable) -> None: ... def _reset_default_hooks() -> None: ... -def _enter_python_mode(cls: Type) -> None: ... -def _exit_python_mode() -> None: ... + +# Defined in torch/overrides.py +class TorchFunctionMode(object): + ... + +def _set_torch_function_mode(cls: Optional[Union[type, TorchFunctionMode]]) -> None: ... +def _get_torch_function_mode() -> Optional[Union[type, TorchFunctionMode]]: ... + +# Defined in torch/utils/_python_dispatch.py +class TorchDispatchMode(object): + ... + +def _set_torch_dispatch_mode(cls: Optional[Union[type, TorchDispatchMode]]) -> None: ... +def _get_torch_dispatch_mode() -> Optional[Union[type, TorchDispatchMode]]: ... class _InferenceMode(object): def __init__(self, mode: _bool) -> None: ... @@ -777,6 +838,12 @@ class Generator(object): def seed(self) -> _int: ... def initial_seed(self) -> _int: ... 
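The TorchFunctionMode and TorchDispatchMode setters above replace the older _enter_python_mode/_exit_python_mode pair; they are private hooks for the __torch_function__ / __torch_dispatch__ mode machinery. A minimal sketch of the related, public __torch_function__ subclass protocol (not of the private setters themselves):

import torch

class LoggingTensor(torch.Tensor):
    # Intercept every torch API call involving this subclass, then defer to the
    # default implementation on the base class.
    @classmethod
    def __torch_function__(cls, func, types, args=(), kwargs=None):
        print(f"intercepted {func}")
        return super().__torch_function__(func, types, args, kwargs or {})

t = torch.randn(3).as_subclass(LoggingTensor)
out = t + 1   # prints the intercepted call, then computes normally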
+ +# Defined in torch/csrc/utils/python_dispatch.cpp +def _dispatch_library(kind: str, name: str, dispatch: str, file: str = "", linenum: Any = 0) -> Any: ... +def _dispatch_has_kernel_for_dispatch_key(name: str, dispatch: str) -> _bool: ... +def _dispatch_has_kernel(name: str) -> _bool: ... + # Defined in torch/csrc/utils/init.cpp class BenchmarkConfig(object): num_calling_threads: _int @@ -795,9 +862,6 @@ class ThroughputBenchmark(object): def run_once(self, *args: Any, **kwargs: Any) -> Any: ... def benchmark(self, config: BenchmarkConfig) -> BenchmarkExecutionStats: ... -# IDK if these are actually exposed here, hope they are -${namedtuple_defs} - # Defined in torch/csrc/generic/Storage.cpp ${legacy_storage_base_hints} @@ -869,6 +933,10 @@ def _cuda_memorySnapshot() -> List[Dict[str, Any]]: ... def _cuda_lock_mutex() -> None: ... def _cuda_unlock_mutex() -> None: ... def _cuda_canDeviceAccessPeer(device: _int, peer_device: _int) -> _bool: ... +def _cuda_jiterator_compile_and_launch_kernel(code_string: str, + kernel_name: str, + tensors: Tuple, + kwargs: Dict[str, Union[_int, _float, _bool]]) -> Tensor: ... def _nccl_version() -> _int: ... def _nccl_unique_id() -> bytes: ... def _nccl_init_rank(nranks: _int, comm_id: bytes, rank: _int) -> object: ... @@ -896,6 +964,7 @@ def _nccl_reduce_scatter(input: Sequence[Tensor], op: _int, streams: Optional[Sequence[_CudaStreamBase]], comms: Optional[Sequence[object]]) -> None: ... +def _rocm_is_backward_pass() -> _bool: ... class _CudaDeviceProperties: @@ -957,6 +1026,8 @@ class _CUDAGraph: def reset(self) -> None: ... def pool(self) -> Tuple[_int, _int]: ... +def _cuda_isCurrentStreamCapturing() -> _bool: ... + def _graph_pool_handle() -> Tuple[_int, _int]: ... # Defined in torch/csrc/DataLoader.cpp @@ -997,6 +1068,7 @@ class JitType: def isSubtypeOf(self, other: JitType) -> _bool: ... def with_dtype(self, dtype: _dtype) -> JitType: ... def with_sizes(self, sizes: List[Optional[_int]]) -> JitType: ... + def kind(self) -> str: ... class InferredType: def __init__(self, arg: Union[JitType, str]): ... @@ -1111,6 +1183,9 @@ class TensorType(JitType): def getInferred(cls) -> TensorType: ... def with_sizes(self, other: Optional[List[Optional[_int]]]) -> TensorType: ... def sizes(self) -> Optional[List[_int]]: ... + def strides(self) -> Optional[List[_int]]: ... + def device(self) -> Optional[_device]: ... + def dtype(self) -> Optional[_dtype]: ... @staticmethod def create_from_tensor(t: Tensor) -> TensorType: ... diff --git a/torch/_C/_autograd.pyi b/torch/_C/_autograd.pyi index 38ac7ccaea0c..b2a190c1e96c 100644 --- a/torch/_C/_autograd.pyi +++ b/torch/_C/_autograd.pyi @@ -28,13 +28,21 @@ class DeviceType(Enum): FPGA = ... ORT = ... XLA = ... - MLC = ... + MPS = ... HPU = ... Meta = ... Vulkan = ... Metal = ... ... +class _ExperimentalConfig: + def __init__( + self, + profiler_metrics: List[str] = ..., + profiler_measure_per_kernel: bool = ..., + ) -> None: ... + ... + class ProfilerConfig: def __init__( self, @@ -43,7 +51,8 @@ class ProfilerConfig: profile_memory: bool, with_stack: bool, with_flops: bool, - with_modules: bool + with_modules: bool, + experimental_config: _ExperimentalConfig, ) -> None: ... ... @@ -82,11 +91,15 @@ class _ProfilerResult: class SavedTensor: ... +class ActiveProfilerType: + ... + def _enable_profiler(config: ProfilerConfig, activities: Set[ProfilerActivity]) -> None: ... def _prepare_profiler(config: ProfilerConfig, activities: Set[ProfilerActivity]) -> None: ... def _disable_profiler() -> _ProfilerResult: ... 
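The ProfilerConfig and _ExperimentalConfig stubs above describe the private bindings behind the autograd profiler; the supported way to drive them is the public torch.profiler API, for example (workload size and sort key chosen arbitrarily):

import torch
from torch.profiler import profile, ProfilerActivity

with profile(activities=[ProfilerActivity.CPU]) as prof:
    torch.mm(torch.randn(64, 64), torch.randn(64, 64))

print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=5))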
def _profiler_enabled() -> bool: ... def _add_metadata_json(key: str, value: str) -> None: ... +def _kineto_step() -> None: ... def kineto_available() -> bool: ... def _record_function_with_args_enter(name: str, args: List[Any]) -> torch.Tensor: ... def _record_function_with_args_exit(handle: torch.Tensor) -> None: ... @@ -98,3 +111,4 @@ def _pop_saved_tensors_default_hooks() -> None: ... def _enable_profiler_legacy(config: ProfilerConfig) -> None: ... def _disable_profiler_legacy() -> List[List[ProfilerEvent]]: ... +def _profiler_type() -> ActiveProfilerType: ... diff --git a/torch/_C/_distributed_c10d.pyi b/torch/_C/_distributed_c10d.pyi index 741d4d5562a1..6192b1f04388 100644 --- a/torch/_C/_distributed_c10d.pyi +++ b/torch/_C/_distributed_c10d.pyi @@ -52,9 +52,11 @@ class Logger: ): ... ... -def _get_debug_mode(): ... +def get_debug_level(): ... +def set_debug_level(): ... +def set_debug_level_from_env(): ... -class _DistributedDebugLevel(Enum): +class DebugLevel(Enum): OFF = ... INFO = ... DETAIL = ... @@ -132,7 +134,7 @@ class TCPStore(Store): self, host_name: str, port: int, - world_size: int = ..., + world_size: Optional[int] = ..., is_master: bool = ..., timeout: timedelta = ..., wait_for_workers: bool = ..., @@ -393,5 +395,7 @@ def _broadcast_coalesced( ): ... def _test_python_store(store: Store): ... def _verify_params_across_processes( - process_group: ProcessGroup, params: List[Tensor] + process_group: ProcessGroup, + params: List[Tensor], + logger: Optional[Logger], ): ... diff --git a/torch/_C/_distributed_rpc.pyi b/torch/_C/_distributed_rpc.pyi index d89f614123e1..06d7a6fcba3f 100644 --- a/torch/_C/_distributed_rpc.pyi +++ b/torch/_C/_distributed_rpc.pyi @@ -2,6 +2,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union, overload from datetime import timedelta import enum import torch +from torch.types import Device from . import Future from ._autograd import ProfilerConfig, ProfilerState, ProfilerEvent from ._distributed_c10d import ProcessGroup, Store @@ -32,7 +33,7 @@ class WorkerInfo: def __repr__(self) -> str: ... class RpcAgent: - def join(self, shutdown: bool = False): ... + def join(self, shutdown: bool = False, timeout: float = 0): ... def sync(self): ... def shutdown(self): ... @overload @@ -68,6 +69,7 @@ class PyRRef: class _TensorPipeRpcBackendOptionsBase(RpcBackendOptions): num_worker_threads: int device_maps: Dict[str, Dict[torch.device, torch.device]] + devices: List[torch.device] def __init__( self, num_worker_threads: int, @@ -85,12 +87,12 @@ class TensorPipeAgent(RpcAgent): store: Store, name: str, worker_id: int, - world_size: int, + world_size: Optional[int], opts: _TensorPipeRpcBackendOptionsBase, reverse_device_maps: Dict[str, Dict[torch.device, torch.device]], devices: List[torch.device], ): ... - def join(self): ... + def join(self, shutdown: bool = False, timeout: float = 0): ... def shutdown(self): ... @overload def get_worker_info(self) -> WorkerInfo: ... @@ -100,6 +102,17 @@ class TensorPipeAgent(RpcAgent): def get_worker_info(self, id: int) -> WorkerInfo: ... def get_worker_infos(self) -> List[WorkerInfo]: ... def _get_device_map(self, dst: WorkerInfo) -> Dict[torch.device, torch.device]: ... + def _update_group_membership( + self, + worker_info: WorkerInfo, + my_devices: List[torch.device], + reverse_device_map: Dict[str, Dict[torch.device, torch.device]], + is_join: bool): ... + def _get_backend_options(self) -> _TensorPipeRpcBackendOptionsBase: ... + @property + def is_static_group(self) -> bool: ... 
+ @property + def store(self) -> Store: ... def _is_current_rpc_agent_set() -> bool: ... def _get_current_rpc_agent()-> RpcAgent: ... diff --git a/torch/_C/_lazy.pyi b/torch/_C/_lazy.pyi new file mode 100644 index 000000000000..e86b80837d58 --- /dev/null +++ b/torch/_C/_lazy.pyi @@ -0,0 +1,20 @@ +from typing import List +from torch import Tensor + +#defined in torch/csrc/lazy/python/init.cpp +def _mark_step(device: str, devices: List[str], wait: bool): ... +def _wait_device_ops(devices: List[str]): ... +def _reset_metrics(): ... +def _counter_names() -> List[str]: ... +def _counter_value(name: str) -> int: ... +def _get_graph_hash(tensors: List[Tensor]) -> str: ... +def _sync_multi(tensors: List[Tensor], devices: List[str], wait: bool = True, sync_ltc_data: bool = True): ... +def _get_tensor_id(tensor: Tensor) -> int: ... +def _get_tensors_text(tensors: List[Tensor]) -> str: ... +def _get_tensors_dot(tensors: List[Tensor]) -> str: ... +def _get_tensors_backend(tensors: List[Tensor]) -> str: ... +def _get_force_fallback() -> str: ... +def _set_force_fallback(newval: str): ... +def _clear_ir_cache(): ... +def _dump_ir_cache(filename: str): ... +def _set_reuse_ir(val: bool): ... diff --git a/torch/_C/_lazy_ts_backend.pyi b/torch/_C/_lazy_ts_backend.pyi new file mode 100644 index 000000000000..91575fe939bf --- /dev/null +++ b/torch/_C/_lazy_ts_backend.pyi @@ -0,0 +1,8 @@ +#defined in torch/csrc/lazy/python/init.cpp + +from typing import List, Tuple, Any +from torch import Tensor + +def _init(): ... +def _get_tensors_ts_device_data_node(tensors: List[Tensor]) -> Tuple[List[int], List[Any]]: ... +def _run_cached_graph(hash_str: str, graph_inputs: List[Any]) -> List[Tensor]: ... diff --git a/torch/_C/_nn.pyi.in b/torch/_C/_nn.pyi.in index b2b2bcbbefdd..1198c43da450 100644 --- a/torch/_C/_nn.pyi.in +++ b/torch/_C/_nn.pyi.in @@ -13,6 +13,9 @@ def mkldnn_linear(input: Tensor, weight: Tensor, bias: Optional[Tensor]) -> Tens def mkldnn_reorder_conv2d_weight(self: Tensor, padding: List, stride: List, dilatation: List, groups: int) -> Tensor: ... def mkldnn_reorder_conv3d_weight(self: Tensor, padding: List, stride: List, dilatation: List, groups: int) -> Tensor: ... +# Defined in aten/src/ATen/native/mkldnn/Prelu.cpp +def mkldnn_prelu(input: Tensor, weight: Tensor) -> Tensor: ... 
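The new mkldnn_prelu stub above only declares the oneDNN-backed kernel defined in aten/src/ATen/native/mkldnn/Prelu.cpp, presumably serving the public torch.nn.functional.prelu op. A tiny usage sketch of that public op (shapes and the 0.25 slope are arbitrary):

import torch
import torch.nn.functional as F

x = torch.randn(2, 3)
slope = torch.tensor([0.25])   # one shared negative slope
y = F.prelu(x, slope)          # y == x where x >= 0, 0.25 * x elsewhere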
+ # Defined at tools/autograd/templates/python_nn_functions.cpp @overload def _parse_to(device: _device, dtype: _dtype, non_blocking: _bool, copy: _bool, *, diff --git a/torch/_C/build.bzl b/torch/_C/build.bzl new file mode 100644 index 000000000000..230124eb69aa --- /dev/null +++ b/torch/_C/build.bzl @@ -0,0 +1,6 @@ +def define_targets(rules): + rules.filegroup( + name = "pyi.in", + srcs = rules.glob(["*.pyi.in"]), + visibility = ["//visibility:public"], + ) diff --git a/torch/_C/return_types.pyi.in b/torch/_C/return_types.pyi.in new file mode 100644 index 000000000000..aa540ea328b5 --- /dev/null +++ b/torch/_C/return_types.pyi.in @@ -0,0 +1,10 @@ +# ${generated_comment} + +from torch import Tensor, Generator, strided, memory_format, contiguous_format, strided +from typing import List, Tuple, Optional, Union, Any, ContextManager, Callable, overload, Iterator, NamedTuple, Sequence, TypeVar +from typing_extensions import Literal +from torch._six import inf + +from torch.types import _int, _float, _bool, Number, _dtype, _device, _qscheme, _size, _layout + +${namedtuple_defs} diff --git a/torch/_C_flatbuffer/__init__.pyi b/torch/_C_flatbuffer/__init__.pyi new file mode 100644 index 000000000000..3a2ff059b0ed --- /dev/null +++ b/torch/_C_flatbuffer/__init__.pyi @@ -0,0 +1,10 @@ +from torch._C import LiteScriptModule, ScriptModule + +def _load_mobile_module_from_file(filename: str): ... +def _load_mobile_module_from_bytes(bytes_: bytes): ... +def _load_jit_module_from_file(filename: str): ... +def _load_jit_module_from_bytes(bytes_: bytes): ... +def _save_mobile_module(m: LiteScriptModule, filename: str): ... +def _save_jit_module(m: ScriptModule, filename: str): ... +def _save_mobile_module_to_bytes(m: LiteScriptModule) -> bytes: ... +def _save_jit_module_to_bytes(m: ScriptModule) -> bytes: ... diff --git a/torch/__init__.py b/torch/__init__.py index 519ea3e607cd..6c1e5a88ab8f 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -15,6 +15,7 @@ import textwrap import ctypes import warnings +import inspect if sys.version_info < (3,): raise Exception("Python 2 has reached end-of-life and is no longer supported by PyTorch.") @@ -29,7 +30,7 @@ from ._six import string_classes as _string_classes -from typing import Set, Type, TYPE_CHECKING, Union +from typing import Set, Type, TYPE_CHECKING, Union, Callable import builtins __all__ = [ @@ -39,12 +40,14 @@ 'no_grad', 'enable_grad', 'rand', 'randn', 'inference_mode', 'DoubleStorage', 'FloatStorage', 'LongStorage', 'IntStorage', 'ShortStorage', 'CharStorage', 'ByteStorage', 'BoolStorage', + '_TypedStorage', 'DoubleTensor', 'FloatTensor', 'LongTensor', 'IntTensor', 'ShortTensor', 'CharTensor', 'ByteTensor', 'BoolTensor', 'Tensor', 'lobpcg', 'use_deterministic_algorithms', 'are_deterministic_algorithms_enabled', 'is_deterministic_algorithms_warn_only_enabled', 'set_deterministic_debug_mode', 'get_deterministic_debug_mode', + 'set_float32_matmul_precision', 'get_float32_matmul_precision', 'set_warn_always', 'is_warn_always_enabled', ] @@ -227,10 +230,15 @@ def _load_global_deps(): ''').strip()) from None raise # If __file__ is not None the cause is unknown, so just re-raise. 
- -__all__ += [name for name in dir(_C) - if name[0] != '_' and - not name.endswith('Base')] +for name in dir(_C): + if name[0] != '_' and not name.endswith('Base'): + __all__.append(name) + obj = getattr(_C, name) + if (isinstance(obj, Callable) or inspect.isclass(obj)): # type: ignore[arg-type] + if (obj.__module__ != 'torch'): + # TODO: fix their module from C++ side + if name not in ['DisableTorchFunction', 'Generator']: + obj.__module__ = 'torch' if not TYPE_CHECKING: # issue 38137 and python issue 43367. Submodules of a C extension are @@ -562,6 +570,23 @@ def get_deterministic_debug_mode() -> builtins.int: else: return 0 +def get_float32_matmul_precision() -> builtins.str: + r"""Returns the current value of float32 matrix multiplication precision. Refer to + :func:`torch.set_float32_matmul_precision` documentation for more details. + """ + return _C._get_float32_matmul_precision() + +def set_float32_matmul_precision(precision): + r"""Sets the precision of float32 matrix multiplication (one of HIGHEST, HIGH, MEDIUM). + Original RFC: https://github.com/pytorch/pytorch/issues/76440 + Args: + precision(str): default "highest": avoid internally reducing precision with + formats such as TF32. + If "high," allow TF32. + If "medium," allow TF32. + """ + _C._set_float32_matmul_precision(precision) + def set_warn_always(b): r"""When this flag is False (default) then some PyTorch warnings may only appear once per process. This helps avoid excessive warning information. @@ -594,104 +619,105 @@ def is_warn_always_enabled(): ################################################################################ from ._tensor import Tensor -from .storage import _StorageBase, TypedStorage +from .storage import _StorageBase, _TypedStorage, _LegacyStorage # NOTE: New Storage classes should never be added. When adding a new -# dtype, use torch.storage.TypedStorage directly. +# dtype, use torch.storage._TypedStorage directly. 
-class UntypedStorage(_C.ByteStorageBase, _StorageBase): +class _UntypedStorage(_C.ByteStorageBase, _StorageBase): pass -class ByteStorage(TypedStorage): +class ByteStorage(_LegacyStorage): @classproperty def dtype(self): return torch.uint8 -class DoubleStorage(TypedStorage): +class DoubleStorage(_LegacyStorage): @classproperty def dtype(self): return torch.double -class FloatStorage(TypedStorage): +class FloatStorage(_LegacyStorage): @classproperty def dtype(self): return torch.float -class HalfStorage(TypedStorage): +class HalfStorage(_LegacyStorage): @classproperty def dtype(self): return torch.half -class LongStorage(TypedStorage): +class LongStorage(_LegacyStorage): @classproperty def dtype(self): return torch.long -class IntStorage(TypedStorage): +class IntStorage(_LegacyStorage): @classproperty def dtype(self): return torch.int -class ShortStorage(TypedStorage): +class ShortStorage(_LegacyStorage): @classproperty def dtype(self): return torch.short -class CharStorage(TypedStorage): +class CharStorage(_LegacyStorage): @classproperty def dtype(self): return torch.int8 -class BoolStorage(TypedStorage): +class BoolStorage(_LegacyStorage): @classproperty def dtype(self): return torch.bool -class BFloat16Storage(TypedStorage): +class BFloat16Storage(_LegacyStorage): @classproperty def dtype(self): return torch.bfloat16 -class ComplexDoubleStorage(TypedStorage): +class ComplexDoubleStorage(_LegacyStorage): @classproperty def dtype(self): return torch.cdouble -class ComplexFloatStorage(TypedStorage): +class ComplexFloatStorage(_LegacyStorage): @classproperty def dtype(self): return torch.cfloat -class QUInt8Storage(TypedStorage): +class QUInt8Storage(_LegacyStorage): @classproperty def dtype(self): return torch.quint8 -class QInt8Storage(TypedStorage): +class QInt8Storage(_LegacyStorage): @classproperty def dtype(self): return torch.qint8 -class QInt32Storage(TypedStorage): +class QInt32Storage(_LegacyStorage): @classproperty def dtype(self): return torch.qint32 -class QUInt4x2Storage(TypedStorage): +class QUInt4x2Storage(_LegacyStorage): @classproperty def dtype(self): return torch.quint4x2 -class QUInt2x4Storage(TypedStorage): +class QUInt2x4Storage(_LegacyStorage): @classproperty def dtype(self): return torch.quint2x4 _storage_classes = { - UntypedStorage, DoubleStorage, FloatStorage, LongStorage, IntStorage, + _UntypedStorage, DoubleStorage, FloatStorage, LongStorage, IntStorage, ShortStorage, CharStorage, ByteStorage, HalfStorage, BoolStorage, QUInt8Storage, QInt8Storage, QInt32Storage, BFloat16Storage, ComplexFloatStorage, ComplexDoubleStorage, QUInt4x2Storage, QUInt2x4Storage, + _TypedStorage } # The _tensor_classes set is initialized by the call to _C._initialize_tensor_type_bindings() @@ -715,7 +741,7 @@ def manager_path(): raise RuntimeError("Unable to find torch_shm_manager at " + path) return path.encode('utf-8') -from .autocast_mode import autocast +from torch.amp import autocast # Shared memory manager needs to know the exact location of manager executable _C._initExtension(manager_path()) @@ -740,8 +766,11 @@ def manager_path(): for name in dir(_C._VariableFunctions): if name.startswith('__') or name in PRIVATE_OPS: continue - globals()[name] = getattr(_C._VariableFunctions, name) - __all__.append(name) + obj = getattr(_C._VariableFunctions, name) + obj.__module__ = 'torch' + globals()[name] = obj + if not name.startswith("_"): + __all__.append(name) ################################################################################ # Import interface functions defined in Python 
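For reference, the float32 matmul precision knob introduced in this file is used as shown below; "high" is only an example setting, and "highest" remains the default:

import torch

torch.set_float32_matmul_precision("high")     # allow TF32-style reduced precision
print(torch.get_float32_matmul_precision())    # 'high'
torch.set_float32_matmul_precision("highest")  # restore the default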
@@ -809,6 +838,7 @@ def _assert(condition, message): from torch import distributions as distributions from torch import testing as testing import torch.backends.cuda +import torch.backends.mps import torch.backends.cudnn import torch.backends.mkl import torch.backends.mkldnn @@ -871,6 +901,9 @@ def compiled_with_cxx11_abi(): # information. from . import _masked +# Import removed ops with error message about removal +from ._linalg_utils import solve + def _register_device_module(device_type, module): r"""Register an external runtime module of the specific :attr:`device_type` @@ -889,3 +922,6 @@ def _register_device_module(device_type, module): # expose return_types from . import return_types +if sys.executable != 'torch_deploy': + from . import library + from . import _meta_registrations diff --git a/torch/_decomp/__init__.py b/torch/_decomp/__init__.py new file mode 100644 index 000000000000..9d72a832538d --- /dev/null +++ b/torch/_decomp/__init__.py @@ -0,0 +1,105 @@ +import torch +import torch._ops +import torch.library +from typing import Callable, Union, Dict, Sequence, List +from torch.utils._pytree import tree_map +from collections import defaultdict + +__all__ = ["decomposition_table", "register_decomposition", "get_decompositions"] + +# TODO: relax key type here; torch registrations should be possible to; but +# right now this type is accurate +decomposition_table: Dict[torch._ops.OpOverload, Callable] = {} + + +meta_lib = torch.library.Library("aten", "IMPL", "Meta") + + +def register_decomposition(aten_op, registry=None, *, disable_meta: bool = False): + """ + A decorator to register a function as a decomposition to the Python + decomposition table. Use it like this:: + + @register_decomposition(torch.ops.aten.clamp_min) + def clamp_min(x): + return torch.clamp(self, min=min) + + If you are writing a new decomposition, consider contributing it + directly to PyTorch in torch._decomp.decompositions. + + This API is experimental; we are almost certainly going to extend + the API when we make decompositions eligible for use in transforms (e.g., + autograd) and not just backend tracing, where we then need to know if a + decomposition can be used to simulate a transform. + + By default, if the decomposition is for an operator that doesn't have + a Meta implementation, we will register it to the dispatcher. Use + `disable_meta` to disable this behavior. + """ + def decomposition_decorator(f): + nonlocal registry + if registry is None: + registry = decomposition_table + + def add_op_to_table(aten_op): + overloads = [] + if isinstance(aten_op, torch._ops.OpOverload): + overloads.append(aten_op) + else: + assert isinstance(aten_op, torch._ops.OpOverloadPacket) + for ol in aten_op.overloads(): + overloads.append(getattr(aten_op, ol)) + for op_overload in overloads: + if op_overload in registry: + raise RuntimeError(f"duplicate registrations for {op_overload}") + registry[op_overload] = f + # TODO: factor this logic into OpOverload or Library API + name = op_overload._schema.name + if op_overload._schema.overload_name: + name += "." 
+ op_overload._schema.overload_name + if ( + not disable_meta + # TorchScript dumps a bunch of extra nonsense overloads + # which don't have corresponding dispatcher entries, we need + # to filter those out + and torch._C._dispatch_has_kernel(name) + and not torch._C._dispatch_has_kernel_for_dispatch_key(name, 'Meta') + ): + meta_lib.impl(op_overload, f) + + # To handle allowing multiple aten_ops at once + tree_map(add_op_to_table, aten_op) + return f + + return decomposition_decorator + + +def get_decompositions( + aten_ops: Sequence[Union[torch._ops.OpOverload, torch._ops.OpOverloadPacket]] +) -> Dict[torch._ops.OpOverload, Callable]: + """ + Retrieve a dictionary of decompositions corresponding to the list of + operator overloads and overload packets passed as input. Overload + packets will include all decomposed overloads in the packet. If there is + no decomposition for a requested operator, it is silently ignored. + + This API is experimental; we are almost certainly going to give an alternate, + more recommended formulation, where a user provides the set of operators + they know how to implement, and we provide decompositions for everything + not in this set. + """ + packets_to_overloads = defaultdict(list) + for opo in decomposition_table: + packets_to_overloads[opo.overloadpacket].append(opo) + decompositions = {} + for op in aten_ops: + if isinstance(op, torch._ops.OpOverloadPacket) and op in packets_to_overloads: + for op_overload in packets_to_overloads[op]: + decompositions[op_overload] = decomposition_table[op_overload] + elif isinstance(op, torch._ops.OpOverload) and op in decomposition_table: + decompositions[op] = decomposition_table[op] + return decompositions + +# populate the table +import torch._decomp.decompositions +import torch._refs diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py new file mode 100644 index 000000000000..5541506b72a5 --- /dev/null +++ b/torch/_decomp/decompositions.py @@ -0,0 +1,1289 @@ +import torch +from torch import Tensor +from torch._decomp import register_decomposition +from enum import Enum +from typing import Tuple, Optional, List, Callable +import torch.nn.functional as F +import functools +from torch.utils._pytree import tree_map, tree_flatten +import torch._prims.utils as utils + +# None of these functions are publicly accessible; get at them +# from torch._decomps +__all__: List[str] = [] + +aten = torch.ops.aten + + +class Reduction(Enum): + NONE = 0 + MEAN = 1 + SUM = 2 + + +# This wraps a decomposition and performs various type promotion logic within it, depending on the strategy provided +# We're currently re-using ELEMENTWISE_TYPE_PROMOTION_KIND, although some of the usages are on non-elementwise ops +# Will need to validate the non-elementwise uses +def type_casts(f: Callable, type_promotion: utils.ELEMENTWISE_TYPE_PROMOTION_KIND): + @functools.wraps(f) + def inner(*args, **kwargs): + flat_args = [x for x in tree_flatten((args, kwargs))[0] if isinstance(x, Tensor)] + computation_dtype, result_dtype = utils.elementwise_dtypes(*flat_args, + type_promotion_kind=type_promotion) + + # TODO: pretty sure this is not quite right + def increase_prec(x): + if isinstance(x, Tensor): + return x.to(computation_dtype) + else: + return x + + def decrease_prec(x): + if isinstance(x, Tensor): + return x.to(result_dtype) + else: + return x + + r = f(*tree_map(increase_prec, args), **tree_map(increase_prec, kwargs)) + return tree_map(decrease_prec, r) + + return inner + +pw_cast_for_opmath = 
functools.partial(type_casts, type_promotion=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT) +reduction_complex_to_real = functools.partial(type_casts, type_promotion=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.COMPLEX_TO_FLOAT) +pw_cast_for_int_to_real = functools.partial(type_casts, type_promotion=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) + +# This expands x until x.dim() == dim. Might be useful as an operator +def _unsqueeze_to_dim(x: Tensor, dim: int): + for _ in range(dim - x.dim()): + x = x.unsqueeze(-1) + return x + + +@register_decomposition(aten.tanh_backward) +@pw_cast_for_opmath +def tanh_backward(out_grad: Tensor, y: Tensor): + return out_grad * (1 - y * y).conj_physical() + + +@register_decomposition(aten.sigmoid_backward) +@pw_cast_for_opmath +def sigmoid_backward(out_grad: Tensor, y: Tensor): + return out_grad * (y * (1 - y)).conj_physical() + + +@register_decomposition(aten.softplus_backward) +@pw_cast_for_opmath +def softplus_backward(out_grad: Tensor, x: Tensor, beta: float, threshold: float): + z = (x * beta).exp() + return torch.where((x * beta) > threshold, out_grad, out_grad * z / (z + 1.0)) + + +@register_decomposition(aten.elu) +@pw_cast_for_opmath +def elu( + self: Tensor, alpha: float = 1, scale: float = 1, input_scale: float = 1 +) -> Tensor: + negcoef = alpha * scale + poscoef = scale + negiptcoef = input_scale + return torch.where( + self > 0, self * poscoef, (torch.exp(self * negiptcoef) - 1) * negcoef + ) + + +@register_decomposition(aten.elu_backward) +@pw_cast_for_opmath +def elu_backward( + grad_output: Tensor, + alpha: float, + scale: float, + input_scale: float, + is_result: bool, + self_or_result: Tensor, +): + negcoef = alpha * scale + poscoef = scale + negiptcoef = input_scale + if is_result: + return torch.where( + self_or_result <= 0, + grad_output * negiptcoef * (self_or_result + negcoef), + self_or_result * poscoef, + ) + else: + return torch.where( + self_or_result <= 0, + grad_output * negiptcoef * negcoef * torch.exp(self_or_result * negiptcoef), + grad_output * poscoef, + ) + + +@register_decomposition(aten.hardsigmoid) +@pw_cast_for_opmath +def hardsigmoid(self: Tensor) -> Tensor: + return torch.clamp(torch.clamp(self + 3, min=0), max=6) / 6 + + +@register_decomposition(aten.hardsigmoid_backward) +@pw_cast_for_opmath +def hardsigmoid_backward(grad_output: Tensor, self: Tensor): + return torch.where( + (self > -3.0) & (self < 3.0), + grad_output * (1.0 / 6.0), + grad_output.new_zeros(()), + ) + + +@register_decomposition(aten.hardtanh) +@pw_cast_for_opmath +def hardtanh(self: Tensor, min_val: float = -1, max_val: float = 1) -> Tensor: + return torch.clamp(self, min_val, max_val) + + +@register_decomposition(aten.hardtanh_backward) +@pw_cast_for_opmath +def hardtanh_backward( + grad_output: Tensor, self: Tensor, min_val: float, max_val: float +): + return torch.where( + (self <= min_val) | (self >= max_val), grad_output.new_zeros(()), grad_output + ) + + +@register_decomposition(aten.hardshrink_backward) +@pw_cast_for_opmath +def hardshrink_backward(grad_out: Tensor, self: Tensor, lambd: float): + return torch.where( + (self >= -lambd) & (self <= lambd), grad_out.new_zeros(()), grad_out + ) + + +@register_decomposition(aten.hardswish) +@pw_cast_for_opmath +def hardswish(self: Tensor) -> Tensor: + return self * torch.clamp(torch.clamp(self + 3, min=0), max=6) / 6 + + +@register_decomposition(aten.hardswish_backward) +@pw_cast_for_opmath +def hardswish_backward(grad_output: Tensor, self: Tensor) -> Tensor: + return torch.where( + self < -3, + 
grad_output.new_zeros(()), + torch.where(self <= 3, grad_output * ((self / 3) + 0.5), grad_output), + ) + + +@register_decomposition(aten.threshold_backward) +@pw_cast_for_opmath +def threshold_backward(grad_output: Tensor, self: Tensor, threshold: float): + return torch.where(self <= threshold, grad_output.new_zeros(()), grad_output) + + +@register_decomposition(aten.leaky_relu) +@pw_cast_for_opmath +def leaky_relu(self: Tensor, negative_slope: float = 0.01) -> Tensor: + return torch.where(self > 0, self, self * negative_slope) + + +@register_decomposition(aten.leaky_relu_backward) +@pw_cast_for_opmath +def leaky_relu_backward( + grad_output: Tensor, self: Tensor, negative_slope: float, self_is_result: bool +): + return torch.where(self > 0, grad_output, grad_output * negative_slope) + + + +@register_decomposition(aten.gelu) +@pw_cast_for_opmath +def gelu(self: Tensor, approximate: str = 'none') -> Tensor: + M_SQRT2 = 1.41421356237309504880 + M_SQRT1_2 = 0.70710678118654752440 + M_2_SQRTPI = 1.12837916709551257390 + if approximate == 'tanh': + kBeta = M_SQRT2 * M_2_SQRTPI * 0.5 + kKappa = 0.044715 + x_cube = self * self * self + inner = kBeta * (self + kKappa * x_cube) + return 0.5 * self * (1 + torch.tanh(inner)) + else: + kAlpha = M_SQRT1_2 + return self * 0.5 * (1 + torch.erf(self * kAlpha)) + + +@register_decomposition(aten.gelu_backward) +@pw_cast_for_opmath +def gelu_backward(grad: Tensor, self: Tensor, approximate: str = "none"): + M_SQRT2 = 1.41421356237309504880 + M_SQRT1_2 = 0.70710678118654752440 + M_2_SQRTPI = 1.12837916709551257390 + if approximate == 'tanh': + kBeta = M_SQRT2 * M_2_SQRTPI * 0.5 + kKappa = 0.044715 + x_sq = self * self + x_cube = x_sq * self + inner = kBeta * (self + kKappa * x_cube) + tanh_inner = torch.tanh(inner) + + left = 0.5 * self + right = 1 + tanh_inner + + left_derivative = 0.5 * right + + tanh_derivative = 1 - tanh_inner * tanh_inner + inner_derivative = kBeta * (1 + 3 * kKappa * x_sq) + right_derivative = left * tanh_derivative * inner_derivative + + return grad * (left_derivative + right_derivative) + else: + kAlpha = M_SQRT1_2 + kBeta = M_2_SQRTPI * M_SQRT1_2 * 0.5 + cdf = 0.5 * (1 + torch.erf(self * kAlpha)) + pdf = kBeta * torch.exp(self * self * -0.5) + return grad * (cdf + self * pdf) + + +@register_decomposition(aten.mish_backward) +@pw_cast_for_opmath +def mish_backward(grad_output: Tensor, input: Tensor): + input_tanh_softplus = torch.tanh(F.softplus(input)) + input_sigmoid = torch.sigmoid(input) + out = input * input_sigmoid * (1 - input_tanh_softplus * input_tanh_softplus) + return grad_output * (input_tanh_softplus + out) + + +@register_decomposition(aten.silu) +@pw_cast_for_opmath +def silu(self: Tensor) -> Tensor: + return self * torch.sigmoid(self) + + +@register_decomposition(aten.silu_backward) +@pw_cast_for_opmath +def silu_backward(grad_output: Tensor, self: Tensor) -> Tensor: + sigmoid = 1 / (1 + torch.exp(-self)) + return grad_output * sigmoid * (1 + self * (1 - sigmoid)) + + +@register_decomposition(aten.softshrink_backward) +def softshrink_backward(grad_output: Tensor, self: Tensor, lambd: float) -> Tensor: + return torch.where( + (self >= -lambd) & (self <= lambd), grad_output.new_zeros(()), grad_output + ) + + +@register_decomposition(aten.prelu_backward) +@pw_cast_for_opmath +def prelu_backward( + grad_output: Tensor, self: Tensor, weight: Tensor +) -> Tuple[Tensor, Tensor]: + # Logic is more complicated than I would like. 
Basically, weight can either + # be a scalar or a vector of size [C], and in the forward pass it's + # broadcast against [N, C, ...]. So now, we need to do the corresponding + # reduction, which is harder than we'd like... + cur_weight = weight + for _ in range(2, grad_output.dim()): + cur_weight = cur_weight.unsqueeze(-1) + input_grad = torch.where(self > 0, grad_output, cur_weight * grad_output) + weight_grad_collector = torch.where( + self > 0, grad_output.new_zeros(()), self * grad_output + ) + out = weight_grad_collector.sum_to_size(cur_weight.shape) + while out.dim() > weight.dim(): + out = out.squeeze(-1) + return (input_grad, out) + + +@register_decomposition(aten.rrelu_with_noise_backward) +@pw_cast_for_opmath +def rrelu_with_noise_backward( + grad_output: Tensor, + self: Tensor, + noise: Tensor, + lower: float, + upper: float, + training: bool, + self_is_result: bool, +) -> Tensor: + if training and upper - lower > 1e-6: + return grad_output.mul(noise) + else: + negative_slope = (lower + upper) / 2 + return aten.leaky_relu_backward(grad_output, self, negative_slope, self_is_result) + + +@register_decomposition(aten.log_sigmoid_backward) +@pw_cast_for_opmath +def log_sigmoid_backward(grad_output: Tensor, self: Tensor, buffer: Tensor) -> Tensor: + in_negative = self < 0 + max_deriv = torch.where(in_negative, 1, 0) + sign = torch.where(in_negative, 1, -1) + z = torch.exp(-torch.abs(self)) + return grad_output * (max_deriv - sign * (z / (1 + z))) + # CPU has a special formula that uses buffer, but disabled for convenience sake + # return (max_deriv - sign * (buffer / (1 + buffer))) * grad_output + + +def apply_loss_reduction(loss: Tensor, reduction: int): + if reduction == Reduction.MEAN.value: + return torch.mean(loss) + elif reduction == Reduction.SUM.value: + return torch.sum(loss) + else: + return loss + + +def to_real_dtype(dtype: torch.dtype): + if dtype == torch.complex32: + return torch.float16 + elif dtype == torch.complex64: + return torch.float32 + elif dtype == torch.complex128: + return torch.float64 + +# TODO: None of these loss castings are quite correct, see +# https://github.com/pytorch/pytorch/issues/76870. Also, the ATen kernels +# perform the pointwise portion in opmath, but don't maintain it between the +# pointwise portion and the reduction + +@register_decomposition(aten.l1_loss) +def l1_loss( + self: Tensor, target: Tensor, reduction: int = Reduction.MEAN.value +) -> Tensor: + loss = (self - target).abs() + # PyTorch semantics result in the output of l1_loss having the corresponding + # real dtype to self. 
This may not happen without explicit casting if say + # self: complex64 and target: float64, which results in loss: float64 + float_type = to_real_dtype(self.dtype) + return apply_loss_reduction(loss, reduction).to(float_type) + + +@register_decomposition(aten.l1_loss_backward) +@pw_cast_for_opmath +def l1_loss_backward( + grad_output: Tensor, + self: Tensor, + target: Tensor, + reduction: int = Reduction.MEAN.value, +): + sign = torch.sign(self - target) + + norm = sign / self.numel() if reduction == Reduction.MEAN.value else sign + return grad_output * norm + + +@register_decomposition(aten.mse_loss) +@pw_cast_for_opmath +def mse_loss( + self: Tensor, target: Tensor, reduction: int = Reduction.MEAN.value +) -> Tensor: + loss = (self - target) ** 2 + return apply_loss_reduction(loss, reduction) + + +@register_decomposition(aten.mse_loss_backward) +@pw_cast_for_opmath +def mse_loss_backward( + grad_output: Tensor, input: Tensor, target: Tensor, reduction: int +): + norm = 2.0 / input.numel() if reduction == Reduction.MEAN.value else 2.0 + return norm * (input - target) * grad_output + + +@register_decomposition(aten.huber_loss) +@pw_cast_for_opmath +def huber_loss( + self: Tensor, + target: Tensor, + reduction: int = Reduction.MEAN.value, + delta: float = 1.0, +) -> Tensor: + assert delta > 0, "huber_loss does not support non-positive values for delta." + z = (self - target).abs() + loss = torch.where(z < delta, 0.5 * z * z, delta * (z - 0.5 * delta)) + return apply_loss_reduction(loss, reduction) + + +@register_decomposition(aten.huber_loss_backward) +@pw_cast_for_opmath +def huber_loss_backward( + grad_output: Tensor, self: Tensor, target: Tensor, reduction: int, delta: float +): + norm = 1.0 / self.numel() if reduction == Reduction.MEAN.value else 1.0 + x = self - target + return torch.where( + x < -delta, + -norm * grad_output * delta, + torch.where(x > delta, norm * grad_output * delta, norm * x * grad_output), + ) + + +def _nll_loss_backward( + grad_output: Tensor, + self: Tensor, + target: Tensor, + weight: Optional[Tensor], + reduction: int, + ignore_index: int, + total_weight: Tensor, +) -> Tensor: + channel_dim = 0 if self.dim() < 2 else 1 + if reduction == Reduction.MEAN.value: + grad_output = grad_output / total_weight + + target = target.unsqueeze(channel_dim) + grad_input = torch.zeros_like(self) + grad_input = torch.scatter(grad_input, channel_dim, target, -1.0) + + if grad_input.dim() > grad_output.dim() > 0: + grad_output = grad_output.unsqueeze(channel_dim) + + if weight is not None: + new_shape = [1 for _ in range(self.dim())] + new_shape[channel_dim] = weight.shape[0] + weight = weight.reshape(new_shape) + grad_output = grad_output * weight + + has_ignore_index = ignore_index >= 0 + if has_ignore_index: + ignore_index_mask = target != ignore_index + grad_output = grad_output * ignore_index_mask + + return grad_input * grad_output + +@register_decomposition(aten.nll_loss_backward) +def nll_loss_backward( + grad_output: Tensor, + self: Tensor, + target: Tensor, + weight: Optional[Tensor], + reduction: int, + ignore_index: int, + total_weight: Tensor, +) -> Tensor: + assert 0 <= self.dim() <= 2, "input tensor should be 1D or 2D" + assert ( + target.dim() <= 1 + ), "0D or 1D target tensor expected, multi-target not supported" + + no_batch_dim = self.dim() == 1 and target.dim() == 0 + assert no_batch_dim or ( + self.shape[0] == target.shape[0] + ), f"size mismatch (got input: {self.shape}, target: {target.shape})" + assert total_weight.numel() == 1, ( + "expected 
total_weight to be a single element tensor, got: ", + f"{total_weight.shape} ({total_weight.numel()} elements)", + ) + + assert ( + weight is None or weight.numel() == self.shape[-1] + ), "weight tensor should be defined either for all or no classes" + + if reduction == Reduction.NONE.value and self.dim() == 2: + assert grad_output.dim() == 1 and grad_output.shape[0] == self.shape[0], ( + f"Expected a tensor of dimension 1 and tensor.size[0] == {self.shape[0]} but " + f"got: dimension {grad_output.dim()} and tensor.size[0] == {grad_output.shape[0]}" + ) + else: + assert ( + grad_output.dim() <= 1 and grad_output.numel() == 1 + ), f"Expected a single element grad_output tensor, but got: {grad_output.shape}" + + return _nll_loss_backward(grad_output, self, target, weight, reduction, ignore_index, total_weight) + + +@register_decomposition(aten.nll_loss2d_backward) +def nll_loss2d_backward( + grad_output: Tensor, + self: Tensor, + target: Tensor, + weight: Optional[Tensor], + reduction: int, + ignore_index: int, + total_weight: Tensor, +) -> Tensor: + assert ( + self.dim() == 4 + ), f"only batches of spatial inputs supported (4D tensors), but got input of dimension: {self.dim()}" + + assert ( + target.dim() == 3 + ), f"only batches of spatial targets supported (3D tensors) but got targets of dimension: {target.dim()}" + + assert( + self.shape[0] == target.shape[0] and self.shape[2] == target.shape[1] and self.shape[3] == target.shape[2] + ), f"size mismatch (got input: {self.shape}, target: {target.shape}" + + assert ( + total_weight.numel() == 1 + ), ( + "expected total_weight to be a single element tensor, " + f"got: {total_weight.shape} ( {total_weight.numel()}, elements)" + ) + + return _nll_loss_backward(grad_output, self, target, weight, reduction, ignore_index, total_weight) + + +@register_decomposition(aten.binary_cross_entropy) +@pw_cast_for_opmath +def binary_cross_entropy( + self: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + reduction: int = Reduction.MEAN.value, +) -> Tensor: + # We cannot currently model this without introducing data-dependent control flow + # TORCH_CHECK( + # (input_val >= 0) && (input_val <= 1), + # "all elements of input should be between 0 and 1" + # ) + loss = (target - 1) * torch.maximum( + torch.log(1 - self), self.new_full((), -100) + ) - target * torch.maximum(torch.log(self), self.new_full((), -100)) + if weight is not None: + loss = loss * weight + return apply_loss_reduction(loss, reduction) + + +@register_decomposition(aten.binary_cross_entropy_backward) +@pw_cast_for_opmath +def binary_cross_entropy_backward( + grad_output: Tensor, + self: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + reduction: int = Reduction.MEAN.value, +) -> Tensor: + EPSILON = 1e-12 + result = grad_output * (self - target) / torch.clamp(self * (1 - self), min=EPSILON) + if weight is not None: + result = result * weight + if reduction == Reduction.MEAN.value: + result = result / self.numel() + return result + + +@register_decomposition(aten._euclidean_dist) +def _euclidean_dist(x1: Tensor, x2: Tensor) -> Tensor: + x1_norm = x1.pow(2).sum(-1, True) + x1_pad = torch.ones_like(x1_norm, memory_format=torch.contiguous_format) + x2_norm = x2.pow(2).sum(-1, True) + x2_pad = torch.ones_like(x2_norm, memory_format=torch.contiguous_format) + x1_ = torch.cat([x1.mul(-2), x1_norm, x1_pad], -1) + x2_ = torch.cat([x2, x2_pad, x2_norm], -1) + result = x1_.matmul(x2_.mT) + return result.clamp_min(0).sqrt() + + +@register_decomposition(aten.slice_backward) +def 
slice_backward( + grad_output: Tensor, + input_sizes: List[int], + dim: int, + start: int, + end: int, + step: int, +): + grad_input = grad_output.new_zeros(input_sizes) + return torch.slice_scatter(grad_input, grad_output, dim, start, end, step) + + +@register_decomposition(aten.select_backward) +def select_backward(grad_output: Tensor, input_sizes: List[int], dim: int, index: int): + grad_input = grad_output.new_zeros(input_sizes) + return torch.select_scatter(grad_input, grad_output, dim, index) + + +@register_decomposition(aten.diagonal_backward) +def diagonal_backward( + grad_output: Tensor, input_sizes: List[int], offset: int, dim1: int, dim2: int +): + grad_input = grad_output.new_zeros(input_sizes) + return torch.diagonal_scatter(grad_input, grad_output, offset, dim1, dim2) + + +@register_decomposition(aten._softmax_backward_data) +@pw_cast_for_opmath +def _softmax_backward_data( + grad_output: Tensor, output: Tensor, dim: int, input_dtype: int +): + new_grad = grad_output * output + return new_grad - output * torch.sum(new_grad, dim=dim, keepdim=True) + + +@register_decomposition(aten._log_softmax_backward_data) +@pw_cast_for_opmath +def _log_softmax_backward_data( + grad_output: Tensor, output: Tensor, dim: int, input_dtype: int +): + grad_input = grad_output - torch.exp(output) * torch.sum( + grad_output, dim=dim, keepdim=True + ) + return grad_input + + +# TODO: the type annotations on arguments are not quite right + + +@register_decomposition(aten.im2col_backward) +def im2col_backward( + grad_output: Tensor, + input_size: List[int], + kernel_size: List[int], + dilation: List[int], + padding: List[int], + stride: List[int], +) -> Tensor: + return F.fold(grad_output, input_size, kernel_size, dilation, padding, stride) # type: ignore[arg-type] + + +@register_decomposition(aten.col2im_backward) +def col2im_backward( + grad_output: Tensor, + kernel_size: List[int], + dilation: List[int], + padding: List[int], + stride: List[int], +) -> Tensor: + return F.unfold(grad_output, kernel_size, dilation, padding, stride) # type: ignore[arg-type] + + +@register_decomposition(aten.masked_fill.Scalar) +def masked_fill_Scalar(self: Tensor, mask: Tensor, value: float) -> Tensor: + return torch.where(mask, utils.dtype_to_type(self.dtype)(value), self) + + +@register_decomposition(aten.masked_fill.Tensor) +def masked_fill_Tensor(self: Tensor, mask: Tensor, value: Tensor) -> Tensor: + return torch.where(mask, value, self) + + +@register_decomposition(aten.native_dropout_backward) +@pw_cast_for_opmath +def native_dropout_backward(grad_output: Tensor, mask: Tensor, scale: float): + return grad_output * (mask.type_as(grad_output) * scale) + + +@register_decomposition(aten.logit) +@pw_cast_for_int_to_real +def logit(self: Tensor, eps: Optional[float] = None) -> Tensor: + if eps is None: + eps = -1.0 + lo = eps + hi = 1 - eps + self = torch.clamp(self, lo, hi) + return (self / (1 - self)).log() + + +@register_decomposition(aten.logit_backward) +@pw_cast_for_opmath +def logit_backward( + grad_output: Tensor, self: Tensor, eps: Optional[float] = None +) -> Tensor: + if eps is not None: + lo = eps + hi = 1.0 - lo + return torch.where( + torch.logical_and(self >= lo, self <= hi), + grad_output / (self * (1.0 - self)), + self.new_zeros(()), + ) + else: + return torch.where( + torch.logical_and(self >= 0.0, self <= 1.0), + grad_output / (self * (1.0 - self)), + self.new_full((), float("nan")), + ) + + +@register_decomposition(aten.native_dropout) +@pw_cast_for_opmath +def native_dropout(input: Tensor, p: 
float, train: Optional[bool]): + if train: + bool_mask = torch.rand_like(input) < p + res = bool_mask * input * float(1.0 / p) + return (res, bool_mask) + else: + return (input, torch.ones_like(input, dtype=torch.bool)) + + +# TODO: Correct the type promotion semantics +@register_decomposition(aten._softmax) +@pw_cast_for_opmath +def _softmax(x: Tensor, dim: int, half_to_float: bool): + x_max = torch.max(x, dim, keepdim=True)[0] + unnormalized = torch.exp(x - x_max) + return unnormalized / torch.sum(unnormalized, dim, keepdim=True) + + +# TODO: Correct the type promotion semantics +@register_decomposition(aten._log_softmax) +@pw_cast_for_opmath +def _log_softmax(x: Tensor, dim: int, half_to_float: bool): + x_max = torch.max(x, dim, keepdim=True)[0] + shifted = x - x_max + shifted_logsumexp = torch.log(torch.sum(torch.exp(shifted), dim, keepdim=True)) + return shifted - shifted_logsumexp + + +@register_decomposition(aten.addcdiv) +@pw_cast_for_opmath +def addcdiv(self: Tensor, tensor1: Tensor, tensor2: Tensor, value: float = 1): + return self + value * (tensor1 / tensor2) + + +# Remove special case when https://github.com/pytorch/pytorch/pull/72949 is landed. +@register_decomposition(aten.addcmul) +@pw_cast_for_opmath +def addcmul(self: Tensor, tensor1: Tensor, tensor2: Tensor, value: float = 1): + if self.is_floating_point() or self.is_complex(): + return self + value * tensor1 * tensor2 + else: + return self + int(value) * tensor1 * tensor2 + + +@register_decomposition(aten.rsub.Tensor) +def rsub_Tensor(self: Tensor, other: Tensor, alpha: float = 1) -> Tensor: + return torch.sub(other, self, alpha=alpha) + + +@register_decomposition(aten.rsub.Scalar) +def rsub_Scalar(self: Tensor, other: float, alpha: float = 1) -> Tensor: + return torch.sub(other, self, alpha=alpha) + + +@register_decomposition(aten.embedding) +def embedding( + weight: Tensor, + indices: Tensor, + padding_idx: int = -1, + scale_grad_by_freq: bool = False, + sparse: bool = False, +) -> Tensor: + assert weight.dim() == 2, "'weight' must be 2-D" + # TODO: Assert not ported over yet + # auto indices_arg = TensorArg(indices, "indices", 1); + # checkScalarTypes("embedding", indices_arg, {kLong, kInt}); + + if indices.dim() == 1: + return weight.index_select(0, indices) + + size = list(indices.shape) + for d in weight.shape[1:]: + size.append(d) + + return weight.index_select(0, indices.reshape(-1)).view(size) + +# TODO: Correct the type promotion semantics +@register_decomposition(aten.embedding_dense_backward) +def embedding_dense_backward( + grad_output: Tensor, + indices: Tensor, + num_weights: int, + padding_idx: int, + scale_grad_by_freq: bool, +): + numel = indices.numel() + grad = grad_output.view(numel, grad_output.size(-1)) + grad_weight = grad_output.new_zeros((num_weights, grad_output.shape[-1])) + indices_rank1 = indices.view(numel) + if scale_grad_by_freq: + counts = indices.new_zeros((num_weights,)) + ones = indices.new_ones((numel,)) + counts = counts.index_put([indices_rank1], ones, accumulate=True) + grad_weights_scale = counts[indices_rank1] + grad = grad / grad_weights_scale.unsqueeze(1) + skip_padding = (indices_rank1 != padding_idx).unsqueeze(1) + skip_padding = skip_padding.expand_as(grad) + zero_grad = torch.full_like(grad, 0) + return grad_weight.index_put( + [indices_rank1], torch.where(skip_padding, grad, zero_grad), accumulate=True + ) + + +def prod(x: List[int]): + r = 1 + for i in x: + r *= i + return r + + +@register_decomposition(aten.split_with_sizes) +def split_with_sizes( + self: Tensor, 
split_sizes: List[int], dim: int = 0 +) -> List[Tensor]: + num_splits = len(split_sizes) + splits = [] + start_idx = 0 + for i in range(num_splits): + length = split_sizes[i] + splits.append(self.narrow(dim, start_idx, length)) + start_idx += length + return splits + + +@register_decomposition(aten.split.Tensor) +def split(self: Tensor, split_size: int, dim: int = 0) -> List[Tensor]: + input_sizes = self.shape + dim_size = input_sizes[dim] + if split_size == 0: + assert dim_size == 0 + return [self] + chunks = (dim_size + split_size - 1) // split_size + split_sizes = [split_size for i in range(chunks)] + split_sizes[chunks - 1] = split_size - (split_size * chunks - dim_size) + return torch.split(self, split_sizes, dim) + + +# TODO: this doesn't appear to have enough precision in bfloat16 +@register_decomposition(aten.addmm) +@pw_cast_for_opmath +def addmm(self: Tensor, mat1: Tensor, mat2: Tensor, beta: int = 1, alpha: int = 1): + if not self.is_floating_point() and not self.is_complex(): + beta = int(beta) + alpha = int(alpha) + out = alpha * torch.mm(mat1, mat2) + if beta == 0: + return out + return beta * self + out + + +# TODO: Correct the type promotion semantics +@register_decomposition(aten.native_layer_norm) +@pw_cast_for_opmath +def native_layer_norm( + input: Tensor, + normalized_shape: List[int], + weight: Optional[Tensor], + bias: Optional[Tensor], + eps: float, +) -> Tuple[Tensor, Tensor, Tensor]: + input_shape = input.shape + input_ndim = input.dim() + + axis = input_ndim - len(normalized_shape) + M = prod(input_shape[:axis]) # type: ignore[arg-type] + + # Hmm... not sure how I get around this... + # Basically, native_batch_norm doesn't support 0-entry tensors, while + # native_layer_norm does (and is tested by OpInfos!) + if M > 0: + input_reshaped = input.view(1, M, -1) + else: + return (input, input.new_zeros((0,)), input.new_zeros((0,))) + + # Unlike Batch Normalization, which applies scalar scale and bias for each + # entire channel/plane with the affine option, Layer Normalization applies + # per-element scale and bias. E.g. For input {N, C, H, W}, weight for + # batchnorm has shape {C} while weight for layernorm has shape {H, W} or {W}. 
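As a rough standalone sanity check of the reshape trick described above (a minimal sketch with arbitrary shapes, not part of the decomposition itself), layer-normalizing an (N, C, H, W) input over its last two dims matches computing per-row statistics on a (1, N*C, H*W) view:

import torch
import torch.nn.functional as F

x = torch.randn(2, 3, 4, 5)  # (N, C, H, W)
eps = 1e-5

# Reference: layer norm over (H, W), no affine parameters.
ref = F.layer_norm(x, normalized_shape=(4, 5), eps=eps)

# Reshape trick: treat every (n, c) slice as one "channel" and normalize it.
xr = x.reshape(1, 2 * 3, -1)
mean = xr.mean(dim=-1, keepdim=True)
rstd = (xr.var(dim=-1, unbiased=False, keepdim=True) + eps).rsqrt()
out = ((xr - mean) * rstd).reshape(x.shape)

print(torch.allclose(ref, out, atol=1e-6))  # True, up to floating-point noise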
+ out, mean, rstd = aten.native_batch_norm( + input_reshaped, + weight=None, + bias=None, + running_mean=None, + running_var=None, + training=True, + momentum=0.0, + eps=eps, + ) + out = out.view(input_shape) + if weight is not None: + out = out * weight + if bias is not None: + out = out + bias + + stat_shape = list(input_shape[:axis]) + for _ in range(axis, input.dim()): + stat_shape.append(1) + mean = mean.view(stat_shape) + rstd = rstd.view(stat_shape) + return (out, mean, rstd) + + +# TODO: Correct the type promotion semantics +@register_decomposition(aten.native_layer_norm_backward) +@pw_cast_for_opmath +def native_layer_norm_backward( + grad_out: Tensor, + input: Tensor, + normalized_shape: List[int], + mean: Tensor, + rstd: Tensor, + weight: Optional[Tensor], + bias: Optional[Tensor], + output_mask: List[bool], +) -> Tuple[Optional[Tensor], Optional[Tensor], Optional[Tensor]]: + input_shape = input.shape + input_ndim = input.dim() + + axis = input_ndim - len(normalized_shape) + inner_dims = input_shape[axis:] + outer_dims = input_shape[:axis] + inner_dim_indices: List[int] = [] + outer_dim_indices: List[int] = [] + for i in range(input_ndim): + if i >= axis: + inner_dim_indices.append(i) + else: + outer_dim_indices.append(i) + + N = prod(inner_dims) # type: ignore[arg-type] + M = prod(outer_dims) # type: ignore[arg-type] + if M <= 0 or N <= 0: + return ( + input.new_zeros(input_shape), + input.new_zeros(input_shape[axis:]), + input.new_zeros(input_shape[axis:]), + ) + + x_hat = (input - mean) * rstd + if weight is not None: + grad_x_hat = grad_out * weight + else: + grad_x_hat = grad_out + a = grad_x_hat * N + b = torch.sum(grad_x_hat, inner_dim_indices, True) + c1 = torch.mul(grad_x_hat, x_hat) + c2 = torch.sum(c1, inner_dim_indices, True) + c3 = torch.mul(x_hat, c2) + + inner = a - b - c3 + + if output_mask[0]: + d_input: Optional[Tensor] = (rstd / N) * inner + else: + d_input = None + + if output_mask[1] and weight is not None: + if len(outer_dim_indices) > 0: + d_weight: Optional[Tensor] = torch.sum( + grad_out * x_hat, outer_dim_indices, False + ) + else: + d_weight = grad_out * x_hat + else: + d_weight = None + + if output_mask[2] and bias is not None: + if len(outer_dim_indices) > 0: + d_bias: Optional[Tensor] = torch.sum(grad_out, outer_dim_indices, False) + else: + d_bias = grad_out + else: + d_bias = None + return (d_input, d_weight, d_bias) + + +# TODO: Correct the type promotion semantics +@register_decomposition(aten.native_batch_norm) +@pw_cast_for_opmath +def native_batch_norm( + input: Tensor, + weight: Optional[Tensor], + bias: Optional[Tensor], + running_mean: Optional[Tensor], + running_var: Optional[Tensor], + training: bool, + momentum: float, + eps: float, +) -> Tuple[Tensor, Tensor, Tensor]: + reduction_dims = [0] + list(range(2, input.dim())) + if training: + # save_mean = torch.sum(input / (input.shape[0] * input.shape[2]), dim=reduction_dims) + biased_var, save_mean = torch.var_mean( + input, dim=reduction_dims, unbiased=False + ) + save_invstd = 1 / (torch.sqrt(biased_var + eps)) + + if running_mean is not None: + running_mean.copy_(momentum * save_mean + (1 - momentum) * running_mean) + if running_var is not None: + n = input.numel() / input.shape[1] + # This doesn't strictly match eager's numerics, which accumulates var sum and then directly applies the correction + # But... that would require re-implementing var here, for negligible numerics gain on a tensor whose + # numerics probably don't matter. 
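The rescaling applied just below is Bessel's correction from the biased to the unbiased variance estimate; a tiny standalone check (sample values chosen arbitrarily):

import torch

x = torch.tensor([1.0, 2.0, 4.0, 8.0])
n = x.numel()

biased = x.var(unbiased=False)   # divides by n
unbiased = x.var(unbiased=True)  # divides by n - 1

print(torch.allclose(unbiased, biased * n / (n - 1)))  # True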
+ unbiased_var = biased_var * (n / (n - 1)) + running_var.copy_(momentum * unbiased_var + (1 - momentum) * running_var) + mean = save_mean + invstd = save_invstd + else: + assert running_mean is not None and running_var is not None + mean = running_mean + invstd = 1 / (torch.sqrt(running_var + eps)) + # Very annoying inconsistency where CPU and CUDA give different shapes + if input.device.type == "cuda": + save_mean = running_mean + save_invstd = invstd + else: + save_mean = input.new_zeros((0,)) + save_invstd = input.new_zeros((0,)) + + if weight is None: + weight = input.new_ones(()) + + if bias is None: + bias = input.new_zeros(()) + + mean = _unsqueeze_to_dim(mean, input.dim() - 1) + invstd = _unsqueeze_to_dim(invstd, input.dim() - 1) + weight = _unsqueeze_to_dim(weight, input.dim() - 1) + bias = _unsqueeze_to_dim(bias, input.dim() - 1) + output = ((input - mean) * invstd) * weight + bias + return output, save_mean, save_invstd + + +@register_decomposition(aten.clamp_min) +def clamp_min(self: Tensor, min: float): + return torch.clamp(self, min=min) + + +@register_decomposition(aten.clamp_max) +def clamp_max(self: Tensor, max: float): + return torch.clamp(self, max=max) + + +@register_decomposition(aten._fused_dropout) +@pw_cast_for_opmath +def _fused_dropout_decomposition(input, p, generator=None): + mask = (torch.rand_like(input) < p).to(dtype=torch.uint8) + res = mask.type_as(input) * input * (1.0 / p) + return (res, mask) + + +# TODO: these logical decomps are buggy for complex inputs +@register_decomposition(aten.logical_xor) +def logical_xor(self: Tensor, other: Tensor) -> Tensor: + return self.to(dtype=torch.bool) ^ other.to(dtype=torch.bool) + + +@register_decomposition(aten.logical_not) +def logical_not(self: Tensor) -> Tensor: + return ~self.to(dtype=torch.bool) + + +@register_decomposition(aten.xlogy.Tensor) +@pw_cast_for_int_to_real +def xlogy(self: Tensor, other: Tensor) -> Tensor: + return aten.where(aten.isnan(self), + self, + aten.where(self == aten.new_zeros(self, ()), + aten.new_zeros(self, ()), + self * aten.log(other))) + + +@register_decomposition(aten.var.correction) +@reduction_complex_to_real +def var_correction( + x: Tensor, + dims: Optional[List[int]], + correction: Optional[int] = None, + keepdim: bool = False, +): + if dims is None: + dims = [] + + if x.is_complex(): + # For complex, calculate variance of real and imaginary components + # separately then add to get overall variance. + real_in = x.real + var_real = torch.var(real_in, dims, correction=correction, keepdim=keepdim) + imag_in = x.imag + var_imag = torch.var(imag_in, dims, correction=correction, keepdim=keepdim) + return var_real + var_imag + + if correction is None: + correction = 0 + + if len(dims) == 0: + n = prod(x.shape) # type: ignore[arg-type] + else: + n = 1 + for dim in dims: + n *= x.shape[dim] + + mean = torch.mean(x, dims, True) + sub = x - mean + sq = sub * sub + sum = torch.sum(sq, dims, keepdim) + + if correction: + n = n - correction + + return sum / n + + +@register_decomposition(aten.std.correction) +@reduction_complex_to_real +def std_decomposition( + x: Tensor, dims: List[int], correction: int = 0, keepdim: bool = False +): + return torch.sqrt(torch.var(x, dims, correction=correction, keepdim=keepdim)) + + +# Questionable decompositions +# This is only valid if we're running the graph without autograd, such as if the backward pass has been traced. 
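To see why the identity decomposition defined just below is restricted to the no-autograd case, compare it against the real detach on a leaf tensor (a small standalone example, not taken from this patch):

import torch

x = torch.randn(3, requires_grad=True)

y = x.detach()  # real detach: same values, but cut out of the autograd graph
z = x           # what an identity decomposition effectively returns

print(torch.equal(y, x))    # True  - values agree, so the decomposition is fine value-wise
print(y.requires_grad)      # False - real detach drops gradient tracking
print(z.requires_grad)      # True  - the identity version does not, hence the restriction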
+# Note that this decomposition causes issues with in-place ops +@register_decomposition(aten.detach, disable_meta=True) +def detach_decomposition(x): + return x + + +@register_decomposition(aten.cudnn_batch_norm) +def cudnn_batch_norm( + input: Tensor, + weight: Tensor, + bias: Optional[Tensor], + running_mean: Optional[Tensor], + running_var: Optional[Tensor], + training: bool, + exponential_average_factor: float, + epsilon: float, +): + a, b, c = aten.native_batch_norm( + input, + weight, + bias, + running_mean, + running_var, + training, + exponential_average_factor, + epsilon, + ) + # Cudnn return running mean and variance when training is True + if training: + return (a, b, c, input.new_zeros((0,), dtype=torch.uint8)) + return (a, input.new_zeros((0,)), input.new_zeros((0,)), input.new_zeros((0,), dtype=torch.uint8)) + + +@register_decomposition(aten.cudnn_batch_norm_backward) +def cudnn_batch_norm_backward( + input: Tensor, + grad_output: Tensor, + weight: Tensor, + running_mean: Optional[Tensor], + running_var: Optional[Tensor], + save_mean: Optional[Tensor], + save_var: Optional[Tensor], + epsilon: float, + reserveSpace: Tensor, +): + return aten.native_batch_norm_backward( + grad_output, + input, + weight, + running_mean, + running_var, + save_mean, + save_var, + True, + epsilon, + [True, True, True], + ) + + +@register_decomposition(aten.rot90.default) +def rot90(self: Tensor, k: int = 1, dims: List[int] = [0, 1]) -> Tensor: # noqa: B006 + total_dims = self.dim() + total_rot_dims = len(dims) + assert total_rot_dims == 2, f"expected total rotation dims == 2, but got dims = {total_rot_dims}" + assert total_dims >= 2, f"expected total dims >= 2, but got total dims = {total_dims}" + assert dims[0] != dims[1] and abs(dims[0] - dims[1]) != total_dims,\ + f"expected rotation dims to be different, but got dim0 = {dims[0]} and dim1 = {dims[1]}" + assert dims[0] < total_dims and dims[0] >= -total_dims, f"Rotation dim0 out of range, dim0 = {dims[0]}" + assert dims[1] < total_dims and dims[1] >= -total_dims, f"Rotation dim1 out of range, dim1 = {dims[1]}" + k = k % 4 + if k == 1: + return self.flip(dims[1]).transpose(dims[0], dims[1]) + elif k == 2: + return self.flip(dims) + elif k == 3: + return self.flip(dims[0]).transpose(dims[0], dims[1]) + else: + return self.clone(memory_format=torch.contiguous_format) + + +@register_decomposition(aten.transpose.int) +def transpose_int(self: Tensor, dim0: int, dim1: int) -> Tensor: + dim0, dim1 = utils.canonicalize_dims(self.dim(), (dim0, dim1)) # type: ignore[misc] + + if self.dim() <= 1: + return self + + if dim0 == dim1: + return self + perm = list(range(self.dim())) + perm[dim0], perm[dim1] = perm[dim1], perm[dim0] + return torch.permute(self, perm) + + +@register_decomposition(aten.t.default) +def t(self: Tensor) -> Tensor: + return self.transpose(0, 0 if self.dim() < 2 else 1) + + +def check_stack_inputs(tensors: List[Tensor]): + entry_shape = tensors[0].shape + for i in range(1, len(tensors)): + assert tensors[i].shape == entry_shape, (f"stack expects each tensor to be equal size, but got {entry_shape} at entry 0" + f"and {tensors[i].shape} at entry {i}") + + +def get_stack_inputs(tensors: List[Tensor], dim: int): + check_stack_inputs(tensors) + return [t.unsqueeze(dim) for t in tensors] + + +@register_decomposition(aten.stack.default) +def stack(tensors: List[Tensor], dim: int = 0) -> Tensor: + assert len(tensors) > 0, "stack expects a non-empty TensorList" + wrapped_dim = utils.canonicalize_dim(tensors[0].dim() + 1, dim) + if wrapped_dim < 
tensors[0].dim() and not tensors[0].is_sparse: + check_stack_inputs(tensors) + result_sizes = list(tensors[0].shape) + result_sizes.insert(wrapped_dim, len(tensors)) + out = torch.cat(tensors, wrapped_dim) + return out.view(result_sizes) + else: + return torch.cat(get_stack_inputs(tensors, wrapped_dim), dim) + + +def _squeeze_multiple(self: Tensor, dims: List[int]) -> Tensor: + ndim = self.dim() + wrapped_dims = utils.canonicalize_dims(ndim, dims) + assert isinstance(wrapped_dims, tuple) + for idx in range(ndim - 1, -1, -1): + if idx in wrapped_dims: + self = self.squeeze(idx) + return self + + +@register_decomposition(aten.logsumexp.default) +@pw_cast_for_int_to_real +def logsumexp(self: Tensor, dim: List[int], keepdim: bool = False) -> Tensor: + if self.numel() == 0: + return torch.sum(torch.exp(self), dim, keepdim).log() + maxes = torch.amax(self, dim, keepdim=True) + maxes_squeezed = maxes if keepdim else _squeeze_multiple(maxes, dim) + maxes_squeezed = torch.masked_fill(maxes_squeezed, maxes_squeezed.abs() == float('inf'), 0) + result = torch.sum(torch.exp(self - maxes), dim, keepdim) + return result.log().add(maxes_squeezed) + + +@register_decomposition(aten.trace.default) +def trace(self: Tensor) -> Tensor: + return torch.sum(torch.diag(self)) + + +# nb: Should use acc_t, not op_math +@register_decomposition(aten.log_sigmoid_forward.default) +@pw_cast_for_opmath +def log_sigmoid_forward(self: Tensor) -> Tuple[Tensor, Tensor]: + min = torch.minimum(self.new_zeros(()), self) + z = torch.exp(-torch.abs(self)) + if self.is_cuda: + buffer = self.new_zeros((0,)) + else: + buffer = z + return min - torch.log1p(z), buffer diff --git a/torch/_deploy.py b/torch/_deploy.py index 4a27e3753d3d..4cdb6f6f92e1 100644 --- a/torch/_deploy.py +++ b/torch/_deploy.py @@ -17,8 +17,8 @@ def _save_storages(importer, obj): importers = sys_importer def persistent_id(obj): - if torch.is_storage(obj) or isinstance(obj, torch.storage.TypedStorage): - if isinstance(obj, torch.storage.TypedStorage): + if torch.is_storage(obj) or isinstance(obj, torch.storage._TypedStorage): + if isinstance(obj, torch.storage._TypedStorage): # TODO: Once we decide to break serialization FC, we can # remove this case storage = obj._storage @@ -59,10 +59,10 @@ def persistent_load(saved_id): if typename == 'storage': # TODO: Once we decide to break serialization FC, we can - # stop wrapping with TypedStorage + # stop wrapping with _TypedStorage storage = serialized_storages[data[0]] dtype = serialized_dtypes[data[0]] - return torch.storage.TypedStorage( + return torch.storage._TypedStorage( wrap_storage=storage._untyped(), dtype=dtype) @@ -82,7 +82,7 @@ def persistent_load(saved_id): importer = sys_importer unpickler = PackageUnpickler(importer, io.BytesIO(obj_bytes)) - unpickler.persistent_load = persistent_load + unpickler.persistent_load = persistent_load # type: ignore[assignment] result = _deploy_objects[id] = unpickler.load() return result diff --git a/torch/_jit_internal.py b/torch/_jit_internal.py index 20616a978d45..3c067d5c1c53 100644 --- a/torch/_jit_internal.py +++ b/torch/_jit_internal.py @@ -18,6 +18,7 @@ import typing import io import pickle +import threading # This is needed. `torch._jit_internal` is imported before `torch.distributed.__init__`. # Explicitly ask to import `torch.distributed.__init__` first. # Otherwise, "AttributeError: module 'torch' has no attribute 'distributed'" is raised. @@ -977,7 +978,7 @@ def linear(x): # Retrieves a fully-qualified name (module hierarchy + classname) for a given obj. 
-def _qualified_name(obj) -> str: +def _qualified_name(obj, mangle_name=True) -> str: # This special case allows us to override the qualified name on a type. # It's currently used in conjunction with tracing, where we create a # fake module to filter only supported attributes. However, since this @@ -1026,13 +1027,16 @@ def _qualified_name(obj) -> str: module_name = module_name.replace("<", "_") module_name = module_name.replace(">", "_") - # __main__ is a builtin module, so rewrite it to "__torch__". - if module_name == "__main__": - module_name = "__torch__" - else: - # Everything else gets a "__torch__" prefix to avoid name collisions - # with the names of user values. - module_name = "__torch__." + module_name + # The PythonExceptionValue C++ class in torch/csrc/jit/python/python_sugared_value.h + # does not need mangle the python class name. + if mangle_name: + # __main__ is a builtin module, so rewrite it to "__torch__". + if module_name == "__main__": + module_name = "__torch__" + else: + # Everything else gets a "__torch__" prefix to avoid name collisions + # with the names of user values. + module_name = "__torch__." + module_name if "." in name: raise RuntimeError(f"Could not get qualified name for class '{name}': " @@ -1248,6 +1252,8 @@ def persistent_id(self, obj): return "" if isinstance(obj, torch.cuda.Event): return "" + if isinstance(obj, threading.Thread): + return "" return None diff --git a/torch/_lazy/__init__.py b/torch/_lazy/__init__.py new file mode 100644 index 000000000000..ff4e90c0edf2 --- /dev/null +++ b/torch/_lazy/__init__.py @@ -0,0 +1,33 @@ +import torch._C._lazy + + +def mark_step(device: str = "lazy:0", wait=False): + """Triggers a mark step, which amounts to + - collecting a group of 'live' lazy tensors to index into the compilation cache + (lowering/compiling their IR graphs if not cached) + - kicking off execution of the compiled function + - (optionally, wait=True) waiting for cpu-side execution to complete (does not sync the accelerator) + """ + # TODO(whc) expand this to include backend hooks and align with XLA backend needs + torch._C._lazy._mark_step(device, [], wait=wait) + +def wait_device_ops(devices=None): + """Waits for all the async operations on the given devices to complete. + Args: + devices (string..., optional): The devices whose async ops need to be waited + for. If empty, all the local devices will be waited for. + """ + if devices is None: + devices = [] + torch._C._lazy._wait_device_ops(devices=devices) + +def sync_multi(tensors, devices): + """ + Sync the list of lazy tensors so there IR get lowered for the activate backend + and the compiled computation graph get cached. + """ + torch._C._lazy._sync_multi(tensors, devices) + +def get_tensor_id(tensor): + """Return a unique id of the lazy tensor maintained by LTC""" + return torch._C._lazy._get_tensor_id(tensor) diff --git a/torch/_lazy/computation.py b/torch/_lazy/computation.py new file mode 100644 index 000000000000..7dd57cd7238d --- /dev/null +++ b/torch/_lazy/computation.py @@ -0,0 +1,23 @@ +import torch._C._lazy +import torch._C._lazy_ts_backend + +def get_tensors_ts_device_data_node(tensors): + """Return tensor ids and eager tensors for DeviceData nodes in the + IR for the passed in lazy tensors. + + TODO: This API is currently ts backend specific. We are working on + generalizing it to all backends including XLA. 
+ """ + return torch._C._lazy_ts_backend._get_tensors_ts_device_data_node(tensors) + +def get_graph_hash(tensors): + """Return the graph hash for the passed in lazy tensors""" + return torch._C._lazy._get_graph_hash(tensors) + +def run_cached_graph(hash_str, graph_inputs): + """Running the cached computation graph with the given inputs + + TODO: This API is currently ts backend specific. We are working on + generalizing it to all backends including XLA. + """ + return torch._C._lazy_ts_backend._run_cached_graph(hash_str, graph_inputs) diff --git a/torch/_lazy/config.py b/torch/_lazy/config.py new file mode 100644 index 000000000000..c2e72bd7d60b --- /dev/null +++ b/torch/_lazy/config.py @@ -0,0 +1,13 @@ +import torch._C._lazy + +def get_force_fallback(): + """Get the config used to force LTC fallback""" + return torch._C._lazy._get_force_fallback() + +def set_force_fallback(configval): + """Set the config used to force LTC fallback""" + torch._C._lazy._set_force_fallback(configval) + +def set_reuse_ir(val: bool): + """Set the config to reuse IR nodes for faster tracing""" + torch._C._lazy._set_reuse_ir(val) diff --git a/torch/_lazy/debug.py b/torch/_lazy/debug.py new file mode 100644 index 000000000000..882056ca9c0f --- /dev/null +++ b/torch/_lazy/debug.py @@ -0,0 +1,20 @@ +import torch._C._lazy + + +def render_ir_graph(tensors): + """Return a text dump of the LTC IR graph in dot format for the tensors. + The text can be processed by tools like dot to be rendered in pdf,png etc.""" + return torch._C._lazy._get_tensors_dot(tensors) + +def dump_ir(tensors, ir_format): + """Return a dump of the tensors in the specified format. + Valid format are + - text: for LTC IR + - backend: for the activate backend IR + """ + if ir_format == "text": + return torch._C._lazy._get_tensors_text(tensors) + elif ir_format == "backend": + return torch._C._lazy._get_tensors_backend(tensors) + else: + raise RuntimeError(f"Unrecognized IR format: {ir_format}") diff --git a/torch/_lazy/extract_compiled_graph.py b/torch/_lazy/extract_compiled_graph.py new file mode 100644 index 000000000000..37d0e67f31f3 --- /dev/null +++ b/torch/_lazy/extract_compiled_graph.py @@ -0,0 +1,199 @@ +import torch._lazy.metrics as metrics +from torch._lazy.tensor_factory_functions import tensor_factory_functions +from torch._lazy import computation +from torch._lazy import debug as lazy_debug +import torch._lazy as lazy +import dataclasses +from typing import List, Dict, Any, Callable +import copy +from torch import fx +import torch +import itertools +import os + +debug = os.environ.get("debug_extract_compiled_graph") is not None + +@dataclasses.dataclass +class GraphInputMatcher: + """ + The GraphInputMatcher class setup the graph inputs for future calls after lazy tracing. + Specifically, those graph inputs corresponding to method parameters should be replaced with the + arguments for the current call. + + tensor_id_to_arg_idx maps the tensor id to the parameter index. + graph_input_tensor_ids, graph_input_ivalues list the tensor_id and ivalue for each of the + TS/XLA graph inputs. + """ + tensor_id_to_arg_idx: Dict[int, int] + graph_input_tensor_ids: List[int] + # there are 2 categories of graph_input_tensors. + # Category 1: those whose id are not found in tensor_id_to_arg_idx. These are + # most likely const tensors and we can get its content from graph_input_tensors + # Category 2: those whose id are found in tensor_id_to_arg_idx. 
+    # We should get the tensor from the method arguments.
+    graph_input_ivalues: List[Any]
+
+    # get the real graph input tensors
+    def __call__(self, args):
+        real_input = []
+        for tensor_id, traced_ivalue in zip(self.graph_input_tensor_ids, self.graph_input_ivalues):
+            arg_idx = self.tensor_id_to_arg_idx.get(tensor_id, None)
+            if arg_idx is None:
+                inp = traced_ivalue
+            else:
+                inp = args[arg_idx]
+            real_input.append(inp)
+        return real_input
+
+class ReturnValueHandler:
+    r"""
+    When ltc_sync_multi is called on multiple tensors, the compiled graph
+    will contain output only for unique tensors - if a tensor appears multiple
+    times in the input to _ltc_sync_multi, only the first occurrence matters.
+
+    However, at the Python level we still expect multiple tensors to be returned,
+    with duplication, even if the TS graph dedups the output. E.g. for the method:
+
+      def forward(self, a):
+        return a, a
+
+    the TS graph captured by LTC will return a single tensor, but the Python method expects 2.
+
+    This class dedups the lazy tensors first to get the index that will be used
+    to duplicate the eager tensors later.
+    """
+    def __init__(self, lazy_out_list):
+        self.index: List[List[int]] = []
+        self.total_count = len(lazy_out_list)
+
+        tensor_id_to_idx: Dict[int, int] = dict()
+        for dup_idx, lazy_tensor in enumerate(lazy_out_list):
+            uniq_idx = tensor_id_to_idx.get(id(lazy_tensor), None)
+            if uniq_idx is not None:
+                self.index[uniq_idx].append(dup_idx)
+            else:
+                uniq_idx = len(self.index)
+                self.index.append([dup_idx])
+                tensor_id_to_idx[id(lazy_tensor)] = uniq_idx
+
+    def duplicate_eager_tensors(self, eager_tensor_list):
+        duplicated_list = [None] * self.total_count
+        assert len(eager_tensor_list) == len(self.index)
+
+        for uniq_idx, eager_tensor in enumerate(eager_tensor_list):
+            for dup_idx in self.index[uniq_idx]:
+                duplicated_list[dup_idx] = eager_tensor
+        return duplicated_list
+
+def force_lazy_device(model: fx.GraphModule):
+    """
+    Factory methods in an Fx graph may create tensors on a specific eager device.
+    If we take no action, those eager tensors will be mixed with lazy tensors and
+    cause a crash. This method overwrites those eager devices with the lazy device.
+    """
+    def tolazydevice(dev):
+        if isinstance(dev, torch.device):
+            return torch.device("lazy", index=dev.index)
+        return dev
+
+    def hasDeviceArg(args, kwargs):
+        return any(isinstance(arg, torch.device) for arg in itertools.chain(args, kwargs.values()))
+
+    for nd in model.graph.nodes:
+        nd.args = tuple(tolazydevice(arg) for arg in nd.args)
+        nd.kwargs = {k: tolazydevice(v) for k, v in nd.kwargs.items()}
+
+        # For torchbench models like yolov3 and hf_Bart, dynamo generates Fx graphs that
+        # return eager tensors on the default device
+        # (check https://gist.github.com/shunting314/eabdf6c769c59bc384469717b8f9bb7f for yolov3,
+        # and https://gist.github.com/shunting314/8d5e2d9348a3258959d3954186c48814 for hf_Bart).
+        # To force those tensors onto the lazy device, we cannot simply override
+        # the device argument since there is no explicit device argument.
+        # What we do here is: for the list of covered tensor factory methods, we add
+        # a lazy device argument explicitly.
+        #
+        # TODO: This solution is not ideal since we may miss some factory methods. In the
+        # future, when we support lazy mode, this method can be replaced by that.
+        if nd.target in tensor_factory_functions and not hasDeviceArg(nd.args, nd.kwargs):
+            kwargs = dict(nd.kwargs)  # nd.kwargs is immutable; make a mutable copy.
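The device rewrite performed by the surrounding lines can be reproduced on a toy torch.fx module (an illustrative standalone sketch with a made-up module; only torch.ones is checked here instead of the full tensor_factory_functions list, and the graph is printed rather than executed on a lazy device):

import torch
from torch import fx

class M(torch.nn.Module):
    def forward(self, x):
        # Factory call with no explicit device argument.
        return x + torch.ones(2, 2)

gm = fx.symbolic_trace(M())
for nd in gm.graph.nodes:
    if nd.target is torch.ones and "device" not in nd.kwargs:
        new_kwargs = dict(nd.kwargs)  # node kwargs are immutable, copy first
        new_kwargs["device"] = torch.device("lazy")
        nd.kwargs = new_kwargs
gm.recompile()
print(gm.code)  # the torch.ones call in the printed code now carries the lazy device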
+ kwargs["device"] = torch.device("lazy") + nd.kwargs = kwargs + + model.recompile() + +def get_fallback_ops(): + fallback_ops = [] + for opname in metrics.counter_names(): + if "aten::" not in opname: + continue + val = int(metrics.counter_value(opname)) + if val > 0: + fallback_ops.append(f"{opname}={val}") + + return fallback_ops + +def extract_compiled_graph(model: fx.GraphModule, example_inputs) -> Callable: + """ + Optimize an eager model with LTC and returns a wrapper to execute the + compiled graph directly without retracing. It depends on other mechanisms + like TorchDynamo guards to guarantee the returned wrapper is only called + when it's safe. + """ + lazy_args = [arg.to(device="lazy") for arg in example_inputs] + args_tensor_ids = [lazy.get_tensor_id(lazy_arg) for lazy_arg in lazy_args] + tensor_id_to_arg_idx = {tensor_id: i for i, tensor_id in enumerate(args_tensor_ids)} + lazy_model = copy.deepcopy(model).to(device=torch.device("lazy")) + force_lazy_device(lazy_model) + + # This line executes lazy tracing and enable us extracting compiled graph later + metrics.reset() + lazy_out = lazy_model(*lazy_args) + fallback_ops = get_fallback_ops() + metrics.reset() + + if len(fallback_ops) > 0: + raise RuntimeError(f"Fail to extact the compiled graph because of fallback: {','.join(fallback_ops)}") + + if not isinstance(lazy_out, (tuple, list)): + lazy_out = (lazy_out,) + + args_and_out = tuple(lazy_args) + tuple(lazy_out) + return_value_handler = ReturnValueHandler(args_and_out) + if debug: + print("Fx code:\n", model.code) + print("LTC IR:", lazy_debug.dump_ir(args_and_out, "text")) + + # TODO: this part is TS backend specific for now and will be generalized to + # support XLA + graph_input_tensor_ids, graph_input_ivalues = computation.get_tensors_ts_device_data_node(args_and_out) + assert len(graph_input_tensor_ids) == len(graph_input_ivalues) + graph_input_matcher = GraphInputMatcher(tensor_id_to_arg_idx, graph_input_tensor_ids, graph_input_ivalues) + + graph_hash = computation.get_graph_hash(args_and_out) + + if debug: + print("graph_hash", graph_hash) + print(f"args_tensor_ids {args_tensor_ids}") + print("tensor ids from device data:", graph_input_tensor_ids) + + # sync the list of output tensors so the computation graph for these + # tensors will be cached. Those computation graphs can be retrieved + # by graph hash later. + lazy.sync_multi(args_and_out, []) + + def optimized_mod(*args): + if len(args_and_out) == 0: + return () + graph_input = graph_input_matcher(args) + res = return_value_handler.duplicate_eager_tensors(computation.run_cached_graph(graph_hash, graph_input)) + + assert len(res) == len(args_and_out) + for i, arg in enumerate(args): + # only copy those tensors that get inplace updated + if arg is not res[i]: + arg.copy_(res[i]) + + # skip the args + return res[len(args):] + + return optimized_mod diff --git a/torch/_lazy/ir_cache.py b/torch/_lazy/ir_cache.py new file mode 100644 index 000000000000..04f1f103d286 --- /dev/null +++ b/torch/_lazy/ir_cache.py @@ -0,0 +1,11 @@ +import torch._C._lazy + +def dump(dot_file_name: str): + """Dump TrieCache in the dot format""" + return torch._C._lazy._dump_ir_cache(dot_file_name) + +def reset(): + """Clear TrieCache. This is needed in testing to avoid + node reusing between different tests. 
+ """ + return torch._C._lazy._clear_ir_cache() diff --git a/torch/_lazy/metrics.py b/torch/_lazy/metrics.py new file mode 100644 index 000000000000..043db981bb71 --- /dev/null +++ b/torch/_lazy/metrics.py @@ -0,0 +1,13 @@ +import torch._C._lazy + +def reset(): + """Resets all metric counters.""" + torch._C._lazy._reset_metrics() + +def counter_names(): + """Retrieves all the currently active counter names.""" + return torch._C._lazy._counter_names() + +def counter_value(name: str): + """Return the value of the counter with the speficied name""" + return torch._C._lazy._counter_value(name) diff --git a/torch/_lazy/tensor_factory_functions.py b/torch/_lazy/tensor_factory_functions.py new file mode 100644 index 000000000000..47aa9c500466 --- /dev/null +++ b/torch/_lazy/tensor_factory_functions.py @@ -0,0 +1,48 @@ +import torch + +""" +tensor_factory_functions defines the list of torch functions that create tensors. +The list is grabbed by searching thru native_functions.yaml by the following +regular expression: + + cat native_functions.yaml | grep 'func:' | grep -v "Tensor.*->" | grep "[-]>.*Tensor" + +It's possible that new tensor factory functions are added making this list stale. +Use at your own risk or regenerate the list. +""" +tensor_factory_functions = ( + torch._cudnn_init_dropout_state, + torch.arange, + torch.bartlett_window, + torch.blackman_window, + torch._empty_affine_quantized, + torch.empty_strided, + torch.eye, + torch.full, + torch.from_file, + torch.hann_window, + torch.hamming_window, + torch.kaiser_window, + torch.linspace, + torch.logspace, + torch.ones, + torch.scalar_tensor, + torch.rand, + torch.randint, + torch.randn, + torch.randperm, + torch.range, + torch._efficientzerotensor, + torch.zeros, + torch.tril_indices, + torch.triu_indices, + # Note: the following functions match the regular expression search above but + # they are not available in the torch module. Comment out. + # torch._sparse_coo_tensor_with_dims, + # torch.fft_fftfreq, + # torch.fft_rfftfreq, +) + ( + # torch.tensor is special since it's not in native_functions.yaml + # add it separately + torch.tensor, +) diff --git a/torch/_lazy/ts_backend.py b/torch/_lazy/ts_backend.py new file mode 100644 index 000000000000..118de2dbefca --- /dev/null +++ b/torch/_lazy/ts_backend.py @@ -0,0 +1,5 @@ +import torch._C._lazy_ts_backend + +def init(): + """Initializes the lazy Torchscript backend""" + torch._C._lazy_ts_backend._init() diff --git a/torch/_linalg_utils.py b/torch/_linalg_utils.py index 568ae8b74aae..faa79f7f0cdb 100644 --- a/torch/_linalg_utils.py +++ b/torch/_linalg_utils.py @@ -99,3 +99,10 @@ def symeig(A: Tensor, largest: Optional[bool] = False) -> Tuple[Tensor, Tensor]: E = torch.flip(E, dims=(-1,)) Z = torch.flip(Z, dims=(-1,)) return E, Z + +# This function was deprecated and removed +# This nice error message can be removed in version 1.13+ +def solve(input: Tensor, A: Tensor, *, out=None) -> Tuple[Tensor, Tensor]: + raise RuntimeError( + "This function was deprecated since version 1.9 and is now removed. 
Please use the `torch.linalg.solve` function instead.", + ) diff --git a/torch/_lobpcg.py b/torch/_lobpcg.py index 560d9579e61f..cb7a6723683a 100644 --- a/torch/_lobpcg.py +++ b/torch/_lobpcg.py @@ -652,17 +652,16 @@ class LOBPCG(object): """ def __init__(self, - A, # type: Optional[Tensor] - B, # type: Optional[Tensor] - X, # type: Tensor - iK, # type: Optional[Tensor] - iparams, # type: Dict[str, int] - fparams, # type: Dict[str, float] - bparams, # type: Dict[str, bool] - method, # type: str - tracker # type: None - ): - # type: (...) -> None + A: Optional[Tensor], + B: Optional[Tensor], + X: Tensor, + iK: Optional[Tensor], + iparams: Dict[str, int], + fparams: Dict[str, float], + bparams: Dict[str, bool], + method: str, + tracker: None + ) -> None: # constant parameters self.A = A @@ -681,10 +680,10 @@ def __init__(self, self.E = torch.zeros((n, ), dtype=X.dtype, device=X.device) self.R = torch.zeros((m, n), dtype=X.dtype, device=X.device) self.S = torch.zeros((m, 3 * n), dtype=X.dtype, device=X.device) - self.tvars = {} # type: Dict[str, Tensor] - self.ivars = {'istep': 0} # type: Dict[str, int] - self.fvars = {'_': 0.0} # type: Dict[str, float] - self.bvars = {'_': False} # type: Dict[str, bool] + self.tvars: Dict[str, Tensor] = {} + self.ivars: Dict[str, int] = {'istep': 0} + self.fvars: Dict[str, float] = {'_': 0.0} + self.bvars: Dict[str, bool] = {'_': False} def __str__(self): lines = ['LOPBCG:'] @@ -941,17 +940,15 @@ def _get_rayleigh_ritz_transform(self, S): SBS = _utils.qform(B, S) d_row = SBS.diagonal(0, -2, -1) ** -0.5 d_col = d_row.reshape(d_row.shape[0], 1) + # TODO use torch.linalg.cholesky_solve once it is implemented R = torch.linalg.cholesky((SBS * d_row) * d_col, upper=True) - Id = torch.eye(R.size(-1), dtype=R.dtype, device=R.device) - Rinv = torch.triangular_solve(Id, R, upper=True).solution - return Rinv * d_col + return torch.linalg.solve_triangular(R, d_row.diag_embed(), upper=True, left=False) def _get_svqb(self, - U, # Tensor - drop, # bool - tau # float - ): - # type: (Tensor, bool, float) -> Tensor + U: Tensor, # Tensor + drop: bool, # bool + tau: float # float + ) -> Tensor: """Return B-orthonormal U. .. note:: When `drop` is `False` then `svqb` is based on the diff --git a/torch/_masked/__init__.py b/torch/_masked/__init__.py index a1b398cb2f49..d679817c8304 100644 --- a/torch/_masked/__init__.py +++ b/torch/_masked/__init__.py @@ -2,8 +2,10 @@ from typing import Optional, Tuple, List, Union, Any +import warnings import torch from torch import Tensor +from . import _docs # A workaround to support both TorchScript and MyPy: from typing import TYPE_CHECKING @@ -27,6 +29,26 @@ def _apply_docstring_templates(func): """Decorator that applies docstring templates to function docstring and returns the function instance. """ + + doc_string = getattr(_docs, f'{func.__name__}_docstring', None) + if doc_string is None: + warnings.warn( + f'No documentation string available for {func.__name__}.' 
+ ' PyTorch team should run `python tools/update_masked_docs.py`' + ' to generate the missing docstrings.') + else: + func.__doc__ = doc_string + + # Expose function as public symbol + __all__.append(func.__name__) + + return func + + +def _generate_docstring(func): + """A utility function called from tools/update_masked_docs.py + script to update the module torch._masked._docs.py + """ docstring_templates = dict( reduction_signature='''\ {function_name}(input, {operation_args}, *, {operation_kwargs}) -> Tensor''', @@ -139,11 +161,16 @@ def _apply_docstring_templates(func): # be removed in the final documentation string. sum=(('dim',), ('keepdim=False', 'dtype=None', 'mask=None')), prod=(('dim',), ('keepdim=False', 'dtype=None', 'mask=None')), + cumsum=(('dim__as_int',), ('dtype=None', 'mask=None')), + cumprod=(('dim__as_int',), ('dtype=None', 'mask=None')), amin=(('dim',), ('keepdim=False', 'dtype=None', 'mask=None')), amax=(('dim',), ('keepdim=False', 'dtype=None', 'mask=None')), + argmin=(('dim__as_int',), ('keepdim=False', 'dtype=None', 'mask=None')), + argmax=(('dim__as_int',), ('keepdim=False', 'dtype=None', 'mask=None')), mean=(('dim',), ('keepdim=False', 'dtype=None', 'mask=None')), norm=(('ord', 'dim',), ('keepdim=False', 'dtype=None', 'mask=None')), var=(('dim', 'unbiased'), ('keepdim=False', 'dtype=None', 'mask=None')), + std=(('dim', 'unbiased'), ('keepdim=False', 'dtype=None', 'mask=None')), softmax=(('dim__as_int',), ('dtype=None', 'mask=None')), log_softmax=(('dim__as_int',), ('dtype=None', 'mask=None')), softmin=(('dim__as_int',), ('dtype=None', 'mask=None')), @@ -197,22 +224,35 @@ def _apply_docstring_templates(func): normalize='''\ Let ``x`` be a sequence of unmasked elements of one-dimensional slice of the :attr:`input` tensor. Normalize of i-th element in ``x`` is -defined as ``x[i]/max(norm(x, p), eps)``.''') +defined as ``x[i]/max(norm(x, p), eps)``.''', + cumsum='''\ +Let ``x`` be a sequence of unmasked elements of one-dimensional slice +of the :attr:`input` tensor. Cumsum of i-th element in ``x`` is +defined as ``sum(x[:i])``.''', + cumprod='''\ +Let ``x`` be a sequence of unmasked elements of one-dimensional slice +of the :attr:`input` tensor.
Cumprod of i-th element in ``x`` is +defined as ``prod(x[:i])``.''') reduction_names = dict( sum='sum', prod='product', amax='maximum', amin='minimum', + argmax='argmax', + argmin='argmin', mean='mean', norm='norm', - var='variance') + var='variance', + std='standard_deviation') normalization_names = dict( softmax='softmax', log_softmax='log_softmax', softmin='softmin', - normalize='normalize') + normalize='normalize', + cumsum='cumulative_sum', + cumprod='cumulative_prod') operation_names = dict() operation_names.update(reduction_names) @@ -226,7 +266,7 @@ def _apply_docstring_templates(func): if func.__name__ in {'norm', 'normalize'}: example_args = (2.0, example_dim) example_input = example_input.to(dtype=torch.float32) - elif func.__name__ in {'var'}: + elif func.__name__ in {'var', 'std'}: example_args = (example_dim, False) else: example_args = (example_dim,) @@ -297,12 +337,7 @@ def _apply_docstring_templates(func): doc_template = '\n\n'.join([f'{{{op_kind}_{sec}}}' for sec in doc_sections]) else: doc_template = func.__doc__ - func.__doc__ = doc_template.format_map(templates) - - # Expose function as public symbol - - __all__.append(func.__name__) - - return func + return doc_template.format_map(templates) def _reduction_identity(op_name: str, input: Tensor, *args): @@ -322,16 +357,16 @@ def _reduction_identity(op_name: str, input: Tensor, *args): dtype: DType = input.dtype device = input.device op_name = op_name.rsplit('.', 1)[-1] # lstrip module name when present - if op_name == 'sum': + if op_name in {'sum', 'cumsum'}: return torch.tensor(0, dtype=dtype, device=device) - elif op_name == 'prod': + elif op_name in {'prod', 'cumprod'}: return torch.tensor(1, dtype=dtype, device=device) - elif op_name == 'amax': + elif op_name in {'amax', 'argmax'}: if torch.is_floating_point(input): return torch.tensor(-torch.inf, dtype=dtype, device=device) elif torch.is_signed(input) or dtype == torch.uint8: return torch.tensor(torch.iinfo(dtype).min, dtype=dtype, device=device) - elif op_name == 'amin': + elif op_name in {'amin', 'argmin'}: if torch.is_floating_point(input): return torch.tensor(torch.inf, dtype=dtype, device=device) elif torch.is_signed(input) or dtype == torch.uint8: @@ -349,7 +384,7 @@ def _reduction_identity(op_name: str, input: Tensor, *args): assert torch.is_floating_point(input), input.dtype return torch.tensor(torch.inf, dtype=dtype, device=device) return torch.tensor(0, dtype=dtype, device=device) - elif op_name == 'var': + elif op_name in {'var', 'std'}: return None raise NotImplementedError(f'identity of {op_name} on {dtype} input') @@ -358,6 +393,12 @@ def _canonical_dim(dim: DimOrDims, ndim: int) -> Tuple[int, ...]: """Return dim argument as a tuple of sorted dim values. """ dims: List[int] = [] + if dim == (): + # Currently, `dim=()` in reduction operations means "reduce + # over all dimensions" while in the future, it will read "no + # reduce". See https://github.com/pytorch/pytorch/issues/29137 + # When gh-29137 is resolved, this if-block must be deleted.
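The `dim=()` workaround in the comment above changes how an empty dim tuple is interpreted: for now it means "reduce over all dimensions" (gh-29137). A self-contained sketch of that canonicalization behaviour follows; the function name and error messages are illustrative, not the actual torch._masked helper:

    from typing import Optional, Tuple, Union

    DimOrDims = Optional[Union[int, Tuple[int, ...]]]

    def canonical_dim_sketch(dim: DimOrDims, ndim: int) -> Tuple[int, ...]:
        if dim == ():        # current behaviour: treat () as "reduce over all dimensions"
            dim = None
        if dim is None:
            return tuple(range(ndim))
        ndim = max(ndim, 1)
        dims = []
        for d in ((dim,) if isinstance(dim, int) else dim):
            if d >= ndim or d < -ndim:
                raise IndexError(f'dim {d} is out of range for a {ndim}-dimensional input')
            d = d % ndim
            if d in dims:
                raise RuntimeError(f'dim {d} appears multiple times')
            dims.append(d)
        return tuple(sorted(dims))

    assert canonical_dim_sketch((), 2) == (0, 1)
    assert canonical_dim_sketch(-1, 3) == (2,)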
+ dim = None if dim is None: return tuple(range(ndim)) ndim = max(ndim, 1) @@ -371,31 +412,347 @@ def _canonical_dim(dim: DimOrDims, ndim: int) -> Tuple[int, ...]: return tuple(sorted(dims)) +def _sparse_coo_flatten_indices(indices: Tensor, shape: tuple): + # Flatten N-D indices to 1-D indices + flat_indices = indices.new_zeros(indices.size(1)) + for d, sz in enumerate(shape): + flat_indices.mul_(sz) + flat_indices.add_(indices[d]) + return flat_indices + + +def _any(input: Tensor, dim: tuple, keepdim: bool): + # Support torch.any with tuple dim argument. + # Workaround of https://github.com/pytorch/pytorch/issues/56586 + r = input + for d in reversed(dim): + r = r.any(dim=d, keepdim=keepdim) + return r + + +def _sparse_coo_where(mask: Tensor, input: Tensor, fill_value: Tensor) -> Tensor: + """Sparse variant of torch.where. Supports sparse COO and hybrid sparse COO tensors. + + _sparse_coo_where implements the following invariant: + + _sparse_coo_where(mask, input, fill_value).to_dense(fill_value) == + torch.where(mask.to_dense(), input.to_dense(), torch.full(input.shape, fill_value)) + + where `a == b` means `assertEqual(a, b)`, mask is boolean sparse + tensor, and `to_dense(fill_value)` is like `to_dense()` except + that the unspecified elements are mapped to `fill_value` rather + than to `0`. + + Returns a sparse COO tensor with the following features: + + - all specified elements correspond to masked-in elements that + have the values of the input tensor. If there exists a masked-in + element (as specified by mask) that is not specified in the + input, in the result tensor, the corresponding element has value + 0. In the dense part of the sparse tensor, the masked-out + elements are replaced with fill_value. + + - all unspecified elements correspond to masked-out elements. + """ + + assert input.layout == torch.sparse_coo + assert mask.layout == input.layout + assert mask.shape == input.shape + assert mask.dense_dim() == input.dense_dim() # TODO: eliminate this restriction + + input = input.coalesce() + + # For set operations on sparse tensor indices, we'll convert + # multi-dimensional indices to 1-D indices for efficiency.
+ input_flat_indices = _sparse_coo_flatten_indices(input.indices(), input.shape[:input.sparse_dim()]) + mask_flat_indices = _sparse_coo_flatten_indices(mask.indices(), mask.shape[:mask.sparse_dim()]) + + # the set of mask flat indices that define masked-in elements: + if mask.dense_dim() > 0: + mask_values = _any(mask.values(), tuple(range(1, input.sparse_dim() + 1)), False) + else: + mask_values = mask.values() + maskin_flat_indices = mask_flat_indices[mask_values.nonzero()[:, 0]] + + def intersection(i1, i2): + union, counts = torch.cat([i1, i2]).unique(return_counts=True) + return union, torch.where(counts.gt(1)) + + def minus(i1, i2): + union, counts = torch.cat([i1, i2]).unique(return_counts=True) + return intersection(union[torch.where(counts.eq(1))], i1) + + def _apply(a): + obj, w = a + return obj[w] + + # the set of input flat indices of specified and masked-in elements: + maskin_input_flat_indices = _apply(intersection(maskin_flat_indices, input_flat_indices)) + _, w = intersection(input_flat_indices, maskin_input_flat_indices) + + # the indices and values of masked-in elements + where_input_indices = input.indices()[(slice(None),) + w] + where_input_values = input.values()[w] + + if mask.dense_dim() > 0: + # apply mask to the dense part of the input values: + _, w1 = intersection(mask_flat_indices, maskin_input_flat_indices) + where_mask_values = mask.values()[w1] + where_input_values = torch.where(where_mask_values, where_input_values, + where_input_values.new_full([], fill_value.item())) + + # the set of flat indices of unspecified input and masked-in elements: + maskin_zero_flat_indices = _apply(minus(maskin_flat_indices, maskin_input_flat_indices)) + + # the indices of masked-in zero elements + _, w = intersection(mask_flat_indices, maskin_zero_flat_indices) + where_zero_indices = mask.indices()[(slice(None),) + w] + + # construct result + n = where_zero_indices.size(1) + if n == 0: + # the input is coalesced, hence input_flat_indices are ordered + # and the result is guaranteed to be coalesced: + result = torch.sparse_coo_tensor(where_input_indices, where_input_values, input.shape) + return result._coalesced_(True) + + where_indices = torch.cat([where_input_indices, where_zero_indices], dim=1) + where_values = torch.cat([where_input_values, where_input_values.new_zeros((n,) + where_input_values.shape[1:])]) + result = torch.sparse_coo_tensor(where_indices, where_values, input.shape) + + # appending zero elements leads to uncoalesced sparse tensor + return result.coalesce() + + +def _sparse_coo_scatter_reduction_helper(op, + mask_input: Tensor, + dims: Tuple[int, ...], + keepdim: bool, + dtype: Optional[DType] = None) -> Tensor: + reduce = op.__name__ + valid_reductions = ['sum', 'prod', 'amax', 'amin'] + if reduce not in valid_reductions: + raise ValueError(f"op must be one of {' '.join(valid_reductions)}, but got {reduce} instead") + + output_dtype = dtype + values, indices = mask_input._values(), mask_input._indices() + input_dims = mask_input.dim() + num_sparse_dims = mask_input.sparse_dim() + reduced_sparse_dims = [] + retained_sparse_dims = [] + reduced_dense_dims = [] + + # promote dtype if specified + if values.dtype != output_dtype: + values = values.to(output_dtype) + + if keepdim: + output_shape = tuple(1 if i in dims else si for (i, si) in enumerate(mask_input.shape)) + else: + output_shape = tuple(si for (i, si) in enumerate(mask_input.shape) if i not in dims) + + for d in dims: + if (d >= input_dims): + continue + + if d < num_sparse_dims: + 
reduced_sparse_dims.append(d) + else: + reduced_dense_dims.append(d + 1 - num_sparse_dims) + + # Reduce dense dimensions + if len(reduced_dense_dims) > 0: + if reduce == "sum": + new_values = values + new_values = op(new_values, dim=reduced_dense_dims, keepdim=bool(keepdim)) + else: + # FIXME: Implement reductions for dense dimensions for ops with non-zero reduction identities + return NotImplemented + else: + new_values = values.clone() + + # Reduce sparse dimensions + if len(reduced_sparse_dims) == num_sparse_dims: + if reduce in {'amax', 'amin'} and new_values.size(0) == 0: + # IndexError: amax(): Expected reduction dim 0 to have non-zero size. + # sum()/prod() return the reduction identity when dim has size 0 but amax()/amin() do not + # See https://github.com/pytorch/pytorch/issues/61901 + new_values = _reduction_identity(reduce, new_values) + else: + new_values = op(new_values, dim=0) + if (keepdim): + for _ in range(num_sparse_dims): + new_values = new_values.unsqueeze(0) + return new_values.to(dtype=output_dtype).to_sparse() + else: + new_indices = indices.clone() + if keepdim: + # zero out reduced sparse dimensions if keepdim = True + # ensures that the call to torch.unique folds duplicated indices together while preserving the dimension + new_indices[reduced_sparse_dims, :] = 0 + else: + # remove reduced sparse dimensions if keepdim = False + if (len(reduced_sparse_dims) > 0): + retained_sparse_dims = [i for i in range(num_sparse_dims) if i not in set(reduced_sparse_dims)] + new_indices = new_indices.index_select(0, torch.tensor(retained_sparse_dims).to(mask_input.device)) + + # Use scatter_reduce to reduce items in the new_values tensor that correspond to the same indices in new_indices + if (new_indices.numel() > 0): + # lexsort indices and get index tensor for scatter reduction + new_indices, inverse_indices = torch.unique(new_indices, return_inverse=True, dim=1) + out_shape = list(new_values.shape) + out_shape[0] = new_indices.shape[1] + for _ in range(new_values.ndim - 1): + inverse_indices = inverse_indices.unsqueeze(-1) + scatter_indices = inverse_indices.expand(new_values.shape) + # FIXME: temporary workaround for issue with bfloat16/float16 remove when acctype is implemented for scatter_reduce + if output_dtype in {torch.bfloat16, torch.float16}: + new_values = new_values.to(torch.float) + out = new_values.new_empty(out_shape) + new_values = out.scatter_reduce_(0, scatter_indices, new_values, reduce=reduce, include_self=False) + new_values = new_values.to(dtype=output_dtype) + else: + out = new_values.new_empty(out_shape) + new_values = out.scatter_reduce_(0, scatter_indices, new_values, reduce=reduce, include_self=False) + + return torch.sparse_coo_tensor(new_indices, new_values, output_shape, dtype=output_dtype, device=mask_input.device) + + +def _sparse_csr_where(mask: Tensor, input: Tensor, fill_value: Tensor) -> Tensor: + """Sparse variant of torch.where. Supports sparse CSR tensors. + """ + # TODO: implement sparse CSR specific where operator for efficiency + return _sparse_coo_where(mask.to_sparse_coo(), input.to_sparse_coo(), fill_value).to_sparse_csr() + + +def _where(mask: Tensor, input: Tensor, fill_value: Tensor) -> Tensor: + """torch.where with sparse inputs support. 
+ + _where implements the following invariant: + + _where(mask, input, fill_value).to_dense(fill_value) == + torch.where(mask.to_dense(), input.to_dense(), torch.full(input.shape, fill_value)) + + where `a == b` means `assertEqual(a, b)`, mask is boolean sparse + tensor, and `to_dense(fill_value)` is like `to_dense()` except + that the unspecified elements are mapped to `fill_value` rather + than to `0`. + + Returns a sparse tensor with the following features: + + - all specified elements correspond to masked-in elements that + have the values of the input tensor. If there exists a masked-in + element (as specified by mask) that is not specified in the + input, in the result tensor, the corresponding element has value + 0. In the dense part of the sparse tensor, the masked-out + elements are replaced with fill_value. + + - all unspecified elements correspond to masked-out elements. + """ + if mask.layout == torch.strided: + if fill_value.dtype == torch.bool: + # Workaround internal assert failure in + # test_nvfuser_correctness__masked_mean_cuda_bool: We + # don't have an op for aten::new_full but it isn't a + # special case. Argument types: Tensor, int[], bool, int, + # int, Device, bool + fill = input.new_full([], int(fill_value.item())).to(dtype=torch.bool) + else: + fill = input.new_full([], fill_value.item()) + return torch.where(mask, input, fill) + elif mask.layout == torch.sparse_coo: + return _sparse_coo_where(mask, input, fill_value) + elif mask.layout == torch.sparse_csr: + return _sparse_csr_where(mask, input, fill_value) + else: + raise ValueError(f'_where expects strided or sparse COO or sparse CSR tensor but got {mask.layout}') + + def _input_mask(input: Tensor, *args, **kwargs) -> Tensor: """Return canonical input mask. - Canonical input mask is a boolean tensor with the same shape as - input and with (broadcasted) content of mask, if specified. + + A canonical input mask is defined as a boolean mask tensor that + shape and layout matches with the shape and the layout of the + input. + + The canonical input mask is computed from the :attr:`mask` tensor + content to meet the following criteria: + + 1. The shape of the canonical input mask is the same as the shape + of :attr:`input` tensor. If the mask tensor has a smaller shape + than the shape of the :attr:`input`, broadcasting rules will be + applied. Downcasting of mask is not supported. + + 2. The layout of the canonical input mask is the same as the + layout of the :attr:`input` tensor. If the mask has different + layout, it will be converted to the expected layout. In the + case of sparse COO layout, the canonical input mask will be + coalesced. + + 3. The dtype of the canonical input mask is torch.bool. If the + mask dtype is not bool then it will be converted to bool dtype + using `.to(dtype=bool)` method call. + + 4. The elements of the canonical input mask have boolean values + copied from the content of the :attr:`mask` tensor (after + possible broadcasting and dtype conversion transforms). In + general, the sparsity pattern of the sparse canonical input + mask need not to be the same as the sparsity pattern of the + sparse :attr:`input` tensor. 
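To make the four criteria above concrete, here is what canonicalization looks like for a strided input with a smaller, non-boolean mask; this is an illustrative use of public ops, not the internal code path (which additionally handles sparse COO/CSR layouts):

    import torch

    input = torch.arange(6.).reshape(2, 3)
    mask = torch.tensor([1, 0, 1])   # smaller shape (criterion 1), non-bool dtype (criterion 3)

    canonical = torch.broadcast_to(mask, input.shape).to(dtype=torch.bool)
    print(canonical)
    # tensor([[ True, False,  True],
    #         [ True, False,  True]])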
+ """ + if input.layout not in {torch.strided, torch.sparse_coo, torch.sparse_csr}: + raise ValueError(f'_input_mask expects strided or sparse COO or sparse CSR tensor but got {input.layout}') + mask = kwargs.get('mask') + + # default mask if mask is None: - inmask = input.new_ones(input.shape, dtype=torch.bool) - elif mask.ndim < input.ndim: - inmask = torch.broadcast_to(mask.clone(), input.shape).to(dtype=torch.bool) - elif mask.ndim > input.ndim: - raise IndexError("_input_mask expected broadcastable mask (got mask dimensionality higher than of the input)") - elif mask.shape != input.shape: - inmask = torch.broadcast_to(mask.clone(), input.shape).to(dtype=torch.bool) - else: - inmask = mask.to(dtype=torch.bool) - return inmask + raise ValueError('_input_mask requires explicit mask') + + # mask shape must match with input shape + if mask.shape != input.shape: + if mask.ndim > input.ndim: + raise IndexError("_input_mask expected broadcastable mask (got mask dimensionality higher than of the input)") + if mask.layout == torch.strided: + mask = torch.broadcast_to(mask.clone(), input.shape).to(dtype=torch.bool) + elif mask.layout == torch.sparse_coo: + mask = torch._sparse_broadcast_to(mask, input.shape) + else: + assert mask.layout == torch.sparse_csr + # Broadcasting of CSR tensors is not implemented. Working + # around by using COO layout. + mask = torch._sparse_broadcast_to(mask.to_sparse(), input.shape).to_sparse_csr() + + # mask layout must match with input layout + if mask.layout != input.layout: + if input.layout == torch.strided: + mask = mask.to_dense() + elif input.layout == torch.sparse_coo: + if mask.layout == torch.strided: + mask = mask.to_sparse(input.sparse_dim()) + else: + mask = mask.to_sparse() + else: + assert input.layout == torch.sparse_csr + mask = mask.to_sparse_csr() + + # sparse mask must be coalesced + if mask.layout == torch.sparse_coo: + mask = mask.coalesce() + + # mask is a boolean tensor + mask = mask.to(dtype=torch.bool) + + return mask def _output_mask(op, input: Tensor, *args, **kwargs) -> Tensor: """Return output mask of masked operation applied to given arguments. """ if callable(op): - is_reduction = op.__name__ in {'sum', 'prod', 'amax', 'amin', 'mean', 'norm', 'var'} - is_normalization = op.__name__ in {'softmax', 'log_softmax', 'softmin', 'normalize'} + is_reduction = op.__name__ in {'sum', 'prod', 'amax', 'amin', 'argmax', 'argmin', 'mean', 'norm', 'var', 'std'} + is_normalization = op.__name__ in {'softmax', 'log_softmax', 'softmin', 'normalize', 'cumsum', 'cumprod'} if is_reduction: if op.__name__ == 'norm': if args: @@ -404,10 +761,7 @@ def _output_mask(op, input: Tensor, *args, **kwargs) -> Tensor: outmask = _input_mask(input, *args, **kwargs) keepdim = kwargs.get('keepdim', False) dim_ = _canonical_dim(dim, input.ndim) - # Workaround https://github.com/pytorch/pytorch/issues/56586 - for d in reversed(dim_): - outmask = outmask.any(dim=d, keepdim=bool(keepdim)) - return outmask + return _any(outmask, dim_, bool(keepdim)) elif is_normalization: return _input_mask(input, *args, **kwargs) else: @@ -416,6 +770,19 @@ def _output_mask(op, input: Tensor, *args, **kwargs) -> Tensor: raise ValueError(f'_output_mask expected masked operation (got {type(op).__name__} object)') +def _combine_input_and_mask(op, input: Tensor, mask, *args) -> Tensor: + """Return input with masked-out elements eliminated for the given operations. 
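Eliminating masked-out elements here concretely means filling them with the reduction identity of the target op (via _reduction_identity and _where in the body that follows), after which the ordinary reduction can be applied. A small equivalent sketch using only public ops:

    import torch

    x = torch.tensor([[1., 2., 3.],
                      [4., 5., 6.]])
    m = torch.tensor([[True, False, True],
                      [False, True, True]])

    # sum: identity 0, so masked-out entries contribute nothing
    print(torch.where(m, x, torch.zeros((), dtype=x.dtype)).sum(dim=1))   # tensor([ 4., 11.])

    # amax: identity -inf, so masked-out entries never win
    print(torch.where(m, x, torch.full((), float('-inf'))).amax(dim=1))   # tensor([3., 6.])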
+ """ + if mask is None: + return input + canonical_mask = _input_mask(input, mask=mask) + if callable(op): + fill_value = _reduction_identity(op.__name__, input, *args) + return _where(canonical_mask, input, fill_value) + else: + raise ValueError(f'_combine_input_and_mask expected masked operation (got {type(op).__name__} object)') + + @_apply_docstring_templates def sum(input: Tensor, dim: DimOrDims = None, @@ -425,16 +792,28 @@ def sum(input: Tensor, mask: Optional[Tensor] = None) -> Tensor: # __doc__ is generated by _apply_docstring_templates decorator if dtype is None: - dtype = input.dtype - # TODO: What follows is a reference implementation of a masked sum - # operation that is to be replaced with an optimized one and - # extended to support other layouts. + # promote integer types to int64 when output dtype is not specified + if input.layout == torch.sparse_csr: + if input.dtype in {torch.uint8, torch.bool, torch.int8, torch.int16, torch.int32}: + # csr.to(dtype=torch.int64) is not implemented, so + # using coo.to on input to ensure the promoted dtype + input = input.to_sparse_coo().to(dtype=torch.int64).to_sparse_csr() + else: + dtype = input.dtype + else: + dtype = input.dtype + if input.dtype in {torch.uint8, torch.bool, torch.int8, torch.int16, torch.int32}: + dtype = torch.int64 + dim_ = _canonical_dim(dim, input.ndim) + mask_input = _combine_input_and_mask(sum, input, mask) if input.layout == torch.strided: - mask_input = input if mask is None else torch.where(mask, input, input.new_zeros([])) - dim_ = _canonical_dim(dim, input.ndim) return torch.sum(mask_input, dim_, bool(keepdim), dtype=dtype) + elif input.layout == torch.sparse_coo: + return _sparse_coo_scatter_reduction_helper(torch.sum, mask_input, dim_, bool(keepdim), dtype) + elif input.layout == torch.sparse_csr: + return torch._sparse_csr_sum(mask_input, dim=list(dim_), keepdim=bool(keepdim), dtype=dtype) else: - raise ValueError(f'masked sum expects strided tensor (got {input.layout} tensor)') + raise ValueError(f'masked sum expects strided, sparse_coo or sparse_csr tensor (got {input.layout} tensor)') @_apply_docstring_templates @@ -445,19 +824,81 @@ def prod(input: Tensor, dtype: Optional[DType] = None, mask: Optional[Tensor] = None) -> Tensor: # __doc__ is generated by _apply_docstring_templates decorator + if dtype is None: + # promote integer types to int64 when output dtype is not specified + if input.layout == torch.sparse_csr: + if input.dtype in {torch.uint8, torch.bool, torch.int8, torch.int16, torch.int32}: + # csr.to(dtype=torch.int64) is not implemented, so + # using coo.to on input to ensure the promoted dtype + input = input.to_sparse_coo().to(dtype=torch.int64).to_sparse_csr() + else: + dtype = input.dtype + else: + dtype = input.dtype + if input.dtype in {torch.uint8, torch.bool, torch.int8, torch.int16, torch.int32}: + dtype = torch.int64 + dim_ = _canonical_dim(dim, input.ndim) + mask_input = _combine_input_and_mask(prod, input, mask) if input.layout == torch.strided: - mask_input = input if mask is None else torch.where(mask, input, torch.ones_like(input)) - dim_ = _canonical_dim(dim, input.ndim) - # Workaround https://github.com/pytorch/pytorch/issues/56586 result = mask_input + result = result.to(dtype=dtype) for d in reversed(dim_): result = result.prod(dim=d, keepdim=bool(keepdim)) - if dtype is not None: - result = result.to(dtype=dtype) return result + elif input.layout == torch.sparse_coo: + if mask is None: + # See comment in the sparse_csr branch, the same issue arises for sparse_coo 
tensors + raise ValueError('masked prod expects explicit mask for sparse_coo tensor input') + return _sparse_coo_scatter_reduction_helper(torch.prod, mask_input, dim_, bool(keepdim), dtype) + elif input.layout == torch.sparse_csr: + if mask is None: + # mask is None corresponds to all-True mask. The + # unspecified elements in the CSR tensor correspond to + # zero values. Hence, the prod reduction result is + # automatically zero unless all elements are specified. + # A semi-optimal way to take this into account is to use: + # + # masked_prod(csr, ..., mask=None) == torch._sparse_csr_prod(csr, ...) * all(csr.nonzero(), ...) + # + # but that requires implementing `all` and `nonzero` + # support for sparse csr tensors. + raise ValueError('masked prod expects explicit mask for sparse_csr tensor input') + return torch._sparse_csr_prod(mask_input, dim=list(dim_), keepdim=bool(keepdim), dtype=dtype) + else: + raise ValueError(f'masked prod expects strided, sparse_coo or sparse_csr tensor (got {input.layout} tensor)') + + +@_apply_docstring_templates +def cumsum(input: Tensor, + dim: int, + *, + dtype: Optional[DType] = None, + mask: Optional[Tensor] = None) -> Tensor: + if dtype is None: + dtype = input.dtype + dim_ = _canonical_dim(dim, input.ndim)[0] + mask_input = _combine_input_and_mask(sum, input, mask) + if input.layout == torch.strided: + return torch.cumsum(mask_input, dim_, dtype=dtype).to(dtype=dtype) + else: + raise ValueError(f'masked cumsum expects strided tensor (got {input.layout} tensor)') + + +@_apply_docstring_templates +def cumprod(input: Tensor, + dim: int, + *, + dtype: Optional[DType] = None, + mask: Optional[Tensor] = None) -> Tensor: + if dtype is None: + dtype = input.dtype + dim_ = _canonical_dim(dim, input.ndim)[0] + mask_input = _combine_input_and_mask(prod, input, mask) + if input.layout == torch.strided: + return torch.cumprod(mask_input, dim_, dtype=dtype).to(dtype=dtype) else: - raise ValueError(f'masked prod expects strided tensor (got {input.layout} tensor)') + raise ValueError(f'masked cumprod expects strided tensor (got {input.layout} tensor)') @_apply_docstring_templates @@ -479,16 +920,19 @@ def amax(input: Tensor, {reduction_example}""" if dtype is None: dtype = input.dtype + + mask_input = _combine_input_and_mask(amax, input, mask) + dim_ = _canonical_dim(dim, mask_input.ndim) if input.layout == torch.strided: - if mask is None: - mask_input = input - else: - identity = input.new_full([], _reduction_identity('amax', input)) - mask_input = torch.where(mask, input, identity) - dim_ = _canonical_dim(dim, mask_input.ndim) return torch.amax(mask_input, dim_, bool(keepdim)).to(dtype=dtype) + elif input.layout == torch.sparse_coo: + if mask is None: + # See comment in the sparse_csr branch of prod, a similar issue arises here + # where unspecified elements along a dimension may need to be reduced with the result + raise ValueError('masked amax expects explicit mask for sparse_coo tensor input') + return _sparse_coo_scatter_reduction_helper(torch.amax, mask_input, dim_, bool(keepdim), dtype) else: - raise ValueError(f'masked amax expects strided tensor (got {input.layout} tensor)') + raise ValueError(f'masked amax expects strided or sparse_coo tensor (got {input.layout} tensor)') @_apply_docstring_templates @@ -510,16 +954,63 @@ def amin(input: Tensor, {reduction_example}""" if dtype is None: dtype = input.dtype + + mask_input = _combine_input_and_mask(amin, input, mask) + dim_ = _canonical_dim(dim, mask_input.ndim) if input.layout == torch.strided: - if mask is 
None: - mask_input = input - else: - identity = input.new_full([], _reduction_identity('amin', input)) - mask_input = torch.where(mask, input, identity) - dim_ = _canonical_dim(dim, mask_input.ndim) return torch.amin(mask_input, dim_, bool(keepdim)).to(dtype=dtype) + elif input.layout == torch.sparse_coo: + if mask is None: + # See comment in the sparse_csr branch of prod, a similar issue arises here + # where unspecified elements along a dimension may need to be reduced with the result + raise ValueError('masked amin expects explicit mask for sparse_coo tensor input') + return _sparse_coo_scatter_reduction_helper(torch.amin, mask_input, dim_, bool(keepdim), dtype) + else: + raise ValueError(f'masked amin expects strided or sparse_coo tensor (got {input.layout} tensor)') + + +@_apply_docstring_templates +def argmax(input: Tensor, + dim: int = None, + *, + keepdim: Optional[bool] = False, + dtype: Optional[DType] = None, + mask: Optional[Tensor] = None) -> Tensor: + """\ +{reduction_signature} +{reduction_descr} +{reduction_identity_dtype} +{reduction_args} +{reduction_example}""" + if dtype is None: + dtype = input.dtype + mask_input = _combine_input_and_mask(argmax, input, mask) + if input.layout == torch.strided: + return torch.argmax(mask_input, dim, bool(keepdim)).to(dtype=dtype) + else: + raise ValueError(f'masked argmax expects strided tensor (got {input.layout} tensor)') + + +@_apply_docstring_templates +def argmin(input: Tensor, + dim: int = None, + *, + keepdim: Optional[bool] = False, + dtype: Optional[DType] = None, + mask: Optional[Tensor] = None) -> Tensor: + """\ +{reduction_signature} +{reduction_descr} +{reduction_identity_dtype} +{reduction_args} +{reduction_example}""" + if dtype is None: + dtype = input.dtype + mask_input = _combine_input_and_mask(argmin, input, mask) + if input.layout == torch.strided: + return torch.argmin(mask_input, dim, bool(keepdim)).to(dtype=dtype) else: - raise ValueError(f'masked amin expects strided tensor (got {input.layout} tensor)') @@ -547,9 +1038,14 @@ def mean(input: Tensor, if dtype is None: dtype = input.dtype if input.layout == torch.strided: - inmask = _input_mask(input, mask=mask) - count = sum(inmask.new_ones(input.shape, dtype=torch.int64), dim, keepdim=keepdim, mask=inmask) - total = sum(input, dim, keepdim=keepdim, dtype=dtype, mask=inmask) + if mask is None: + # TODO: compute count analytically + count = sum(torch.ones(input.shape, dtype=torch.int64, device=input.device), dim, keepdim=keepdim) + total = sum(input, dim, keepdim=keepdim, dtype=dtype) + else: + inmask = _input_mask(input, mask=mask) + count = sum(inmask.new_ones(input.shape, dtype=torch.int64), dim, keepdim=keepdim, mask=inmask) + total = sum(input, dim, keepdim=keepdim, dtype=dtype, mask=inmask) return total / count else: raise ValueError(f'masked sum expects strided tensor (got {input.layout} tensor)') @@ -577,35 +1073,22 @@ def norm(input: Tensor, {reduction_example}""" if dtype is None: dtype = input.dtype + mask_input = _combine_input_and_mask(norm, input, mask, ord) if input.layout == torch.strided: - identity = input.new_full([], _reduction_identity('norm', input, ord)) - mask_input = input if mask is None else torch.where(mask, input, identity) dim_ = _canonical_dim(dim, input.ndim) return torch.linalg.vector_norm(mask_input, ord, dim_, bool(keepdim), dtype=dtype) else: raise ValueError(f'masked norm expects strided tensor (got {input.layout}
tensor)') -@_apply_docstring_templates -def var(input: Tensor, - dim: DimOrDims = None, - unbiased: Optional[bool] = False, - *, - keepdim: Optional[bool] = False, - dtype: Optional[DType] = None, - mask: Optional[Tensor] = None) -> Tensor: - """\ -{reduction_signature} - -{reduction_descr} - -The identity value of sample variance operation is undefined. The -elements of output tensor with strided layout, that correspond to -fully masked-out elements, have ``nan`` values. - -{reduction_args} - -{reduction_example}""" +def std_var(input: Tensor, + dim: DimOrDims = None, + unbiased: Optional[bool] = False, + *, + keepdim: Optional[bool] = False, + dtype: Optional[DType] = None, + mask: Optional[Tensor] = None, + take_sqrt: Optional[bool] = False) -> Tensor: if dtype is None: dtype = input.dtype if not (dtype.is_floating_point or dtype.is_complex): @@ -614,23 +1097,88 @@ def var(input: Tensor, if not (compute_dtype.is_floating_point or compute_dtype.is_complex): compute_dtype = torch.float32 if input.layout == torch.strided: - inmask = _input_mask(input, mask=mask) - count = sum(inmask.new_ones(input.shape, dtype=torch.int64), dim, keepdim=True, mask=inmask) - sample_total = sum(input, dim, keepdim=True, dtype=dtype, mask=inmask) + if mask is None: + # TODO: compute count analytically + count = sum(torch.ones(input.shape, dtype=torch.int64, device=input.device), dim, keepdim=True) + sample_total = sum(input, dim, keepdim=True, dtype=dtype) + else: + inmask = _input_mask(input, mask=mask) + count = sum(inmask.new_ones(input.shape, dtype=torch.int64), dim, keepdim=True, mask=inmask) + sample_total = sum(input, dim, keepdim=True, dtype=dtype, mask=inmask) # TODO: replace torch.subtract/divide/square/maximum with # masked subtract/divide/square/maximum when these will be # available. sample_mean = torch.divide(sample_total, count) x = torch.subtract(input, sample_mean) - total = sum(x * x.conj(), dim, keepdim=keepdim, dtype=compute_dtype, mask=inmask) + if mask is None: + total = sum(x * x.conj(), dim, keepdim=keepdim, dtype=compute_dtype) + else: + total = sum(x * x.conj(), dim, keepdim=keepdim, dtype=compute_dtype, mask=inmask) if not keepdim: count = count.reshape(total.shape) if unbiased: count = torch.subtract(count, 1) count = torch.maximum(count, count.new_zeros([])) - return torch.divide(total, count).to(dtype=dtype) + output = torch.divide(total, count).to(dtype=dtype) + if take_sqrt: + output = torch.sqrt(output) + return output else: - raise ValueError(f'masked var expects strided tensor (got {input.layout} tensor)') + raise ValueError(f'masked std/var expects strided tensor (got {input.layout} tensor)') + + +@_apply_docstring_templates +def var(input: Tensor, + dim: DimOrDims = None, + unbiased: Optional[bool] = False, + *, + keepdim: Optional[bool] = False, + dtype: Optional[DType] = None, + mask: Optional[Tensor] = None) -> Tensor: + """\ +{reduction_signature} +{reduction_descr} +The identity value of sample variance operation is undefined. The +elements of output tensor with strided layout, that correspond to +fully masked-out elements, have ``nan`` values. 
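The std_var helper above computes both statistics in one pass: masked std is the square root of the masked variance (take_sqrt=True). A small numeric check of that relationship with public ops only, for unbiased=False and a mask that keeps the first two entries of each row (illustrative, not the internal code path):

    import torch

    x = torch.tensor([[1., 2., 10.],
                      [3., 7., 10.]])
    m = torch.tensor([[True, True, False],
                      [True, True, False]])

    count = m.sum(dim=1, keepdim=True)
    mean = (x * m).sum(dim=1, keepdim=True) / count
    var = (((x - mean) ** 2) * m).sum(dim=1) / count.squeeze(1)
    print(var)          # tensor([0.2500, 4.0000])
    print(var.sqrt())   # tensor([0.5000, 2.0000]), i.e. the masked std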
+{reduction_args} +{reduction_example}""" + return std_var( + input=input, + dim=dim, + unbiased=unbiased, + keepdim=keepdim, + dtype=dtype, + mask=mask, + take_sqrt=False, + ) + + +@_apply_docstring_templates +def std(input: Tensor, + dim: DimOrDims = None, + unbiased: Optional[bool] = False, + *, + keepdim: Optional[bool] = False, + dtype: Optional[DType] = None, + mask: Optional[Tensor] = None) -> Tensor: + """\ +{reduction_signature} +{reduction_descr} +The identity value of sample standard deviation operation is undefined. The +elements of output tensor with strided layout, that correspond to +fully masked-out elements, have ``nan`` values. +{reduction_args} +{reduction_example}""" + return std_var( + input=input, + dim=dim, + unbiased=unbiased, + keepdim=keepdim, + dtype=dtype, + mask=mask, + take_sqrt=True + ) @_apply_docstring_templates @@ -642,10 +1190,8 @@ def softmax(input: Tensor, if dtype is None: dtype = input.dtype dim_ = _canonical_dim(dim, input.ndim)[0] + mask_input = _combine_input_and_mask(amax, input, mask) if input.layout == torch.strided: - fill = input.new_full([], _reduction_identity('amax', input)) - inmask = _input_mask(input, mask=mask) - mask_input = torch.where(inmask, input, fill) return torch.nn.functional.softmax(mask_input, dim_, dtype=dtype) else: raise ValueError(f'masked softmax expects strided tensor (got {input.layout} tensor)') @@ -660,10 +1206,8 @@ def log_softmax(input: Tensor, if dtype is None: dtype = input.dtype dim_ = _canonical_dim(dim, input.ndim)[0] + mask_input = _combine_input_and_mask(amax, input, mask) if input.layout == torch.strided: - fill = input.new_full([], _reduction_identity('amax', input)) - inmask = _input_mask(input, mask=mask) - mask_input = torch.where(inmask, input, fill) return torch.nn.functional.log_softmax(mask_input, dim_, dtype=dtype) else: raise ValueError(f'masked log_softmax expects strided tensor (got {input.layout} tensor)') @@ -678,10 +1222,8 @@ def softmin(input: Tensor, if dtype is None: dtype = input.dtype dim_ = _canonical_dim(dim, input.ndim)[0] + mask_input = _combine_input_and_mask(amin, input, mask) if input.layout == torch.strided: - fill = input.new_full([], _reduction_identity('amin', input)) - inmask = _input_mask(input, mask=mask) - mask_input = torch.where(inmask, input, fill) return torch.nn.functional.softmin(mask_input, dim_, dtype=dtype) else: raise ValueError(f'masked softmin expects strided tensor (got {input.layout} tensor)') @@ -698,13 +1240,12 @@ def normalize(input: Tensor, if dtype is None: dtype = input.dtype dim_ = _canonical_dim(dim, input.ndim)[0] + # TODO: eliminate mask_input as unnecessary when using masked divide. + mask_input = _combine_input_and_mask(sum, input, mask) if input.layout == torch.strided: nrm_ = norm(input, ord, dim, keepdim=True, dtype=dtype, mask=mask) # TODO: replace torch.maximum with masked maximum when available. denom = torch.maximum(nrm_, nrm_.new_full([], eps)) - # TODO: eliminate mask_input as unnecessary when using masked divide. - inmask = _input_mask(input, mask=mask) - mask_input = input if mask is None else torch.where(inmask, input, input.new_zeros([])) # TODO: replace torch.divide with masked divide when available. return torch.divide(mask_input, denom) else: diff --git a/torch/_masked/_docs.py b/torch/_masked/_docs.py new file mode 100644 index 000000000000..0949c5b21c19 --- /dev/null +++ b/torch/_masked/_docs.py @@ -0,0 +1,1046 @@ +# -*- coding: utf-8 -*- +# This file is generated, do not modify it! 
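The softmax, log_softmax, softmin and normalize changes above all route through _combine_input_and_mask, so masked softmax amounts to filling masked-out entries with the amax identity (-inf) and applying a regular softmax. A quick check with public ops, reusing the example data that appears in the generated docstrings below:

    import torch

    input = torch.tensor([[-3., -2., -1.], [0., 1., 2.]])
    mask = torch.tensor([[True, False, True], [False, False, False]])

    filled = torch.where(mask, input, torch.full((), float('-inf')))
    print(torch.softmax(filled, dim=1))
    # tensor([[0.1192, 0.0000, 0.8808],
    #         [   nan,    nan,    nan]])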
+# +# To update this file, run the update masked docs script as follows: +# +# python tools/update_masked_docs.py +# +# The script must be called from an environment where the development +# version of torch package can be imported and is functional. +# + +amax_docstring = """amax(input, dim, *, keepdim=False, dtype=None, mask=None) -> Tensor + +Returns maximum of all the elements in the :attr:`input` +tensor along the given dimension(s) :attr:`dim` while the :attr:`input` +elements are masked out according to the boolean tensor +:attr:`mask`. + +The identity value of maximum operation, which is used to start the +reduction, depends on input dtype. For instance, for float32, uint8, +and int32 dtypes, the identity values are ``-inf``, ``0``, and ``-2147483648``, respectively. + +If :attr:`keepdim` is ``True``, the output tensor is of the same size +as :attr:`input` except in the dimension(s) :attr:`dim` where it is of +size 1. Otherwise, :attr:`dim` is squeezed (see +:func:`torch.squeeze`), resulting in the output tensor having 1 (or +``len(dim)``) fewer dimension(s). + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True +then the corresponding element in :attr:`input` tensor will be +included in maximum computation, otherwise the element is +ignored. + +When all elements of :attr:`input` along the given dimension +:attr:`dim` are ignored (fully masked-out), the corresponding element +of the output tensor will have undefined value: it may or may not +correspond to the identity value of maximum operation; the +choice may correspond to the value that leads to the most efficient +storage of :attr:`output` tensor. + +The mask of the output tensor can be computed as +``torch.any(torch.broadcast_to(mask, input.shape), dim, keepdim=keepdim, +dtype=torch.bool)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + Default: None that is equivalent to ``tuple(range(input.ndim))``. + +Keyword args: + keepdim (bool, optional): whether the output tensor has + :attr:`dim` retained or not. Default: False. + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. + +Example:: + + >>> input = tensor([[-3, -2, -1], [ 0, 1, 2]]) + >>> input + tensor([[-3, -2, -1], + [ 0, 1, 2]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.amax(input, 1, mask=mask) + tensor([ -1, -9223372036854775808]) +""" + +amin_docstring = """amin(input, dim, *, keepdim=False, dtype=None, mask=None) -> Tensor + +Returns minimum of all the elements in the :attr:`input` +tensor along the given dimension(s) :attr:`dim` while the :attr:`input` +elements are masked out according to the boolean tensor +:attr:`mask`. 
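The dtype-dependent identity values quoted in these docstrings (for example -inf, 0 and -2147483648 for amax above) come directly from the floating point and integer limits used by _reduction_identity. The integer cases can be checked with torch.iinfo:

    import torch

    # identities that start a masked max reduction (amax/argmax)
    print(torch.iinfo(torch.int32).min)   # -2147483648
    print(torch.iinfo(torch.uint8).min)   # 0

    # identities that start a masked min reduction (amin/argmin)
    print(torch.iinfo(torch.int32).max)   # 2147483647
    print(torch.iinfo(torch.uint8).max)   # 255

    # floating-point inputs use -inf / inf instead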
+ +The identity value of minimum operation, which is used to start the +reduction, depends on input dtype. For instance, for float32, uint8, +and int32 dtypes, the identity values are ``inf``, ``255``, and ``2147483647``, respectively. + +If :attr:`keepdim` is ``True``, the output tensor is of the same size +as :attr:`input` except in the dimension(s) :attr:`dim` where it is of +size 1. Otherwise, :attr:`dim` is squeezed (see +:func:`torch.squeeze`), resulting in the output tensor having 1 (or +``len(dim)``) fewer dimension(s). + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True +then the corresponding element in :attr:`input` tensor will be +included in minimum computation, otherwise the element is +ignored. + +When all elements of :attr:`input` along the given dimension +:attr:`dim` are ignored (fully masked-out), the corresponding element +of the output tensor will have undefined value: it may or may not +correspond to the identity value of minimum operation; the +choice may correspond to the value that leads to the most efficient +storage of :attr:`output` tensor. + +The mask of the output tensor can be computed as +``torch.any(torch.broadcast_to(mask, input.shape), dim, keepdim=keepdim, +dtype=torch.bool)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + Default: None that is equivalent to ``tuple(range(input.ndim))``. + +Keyword args: + keepdim (bool, optional): whether the output tensor has + :attr:`dim` retained or not. Default: False. + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. + +Example:: + + >>> input = tensor([[-3, -2, -1], [ 0, 1, 2]]) + >>> input + tensor([[-3, -2, -1], + [ 0, 1, 2]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.amin(input, 1, mask=mask) + tensor([ -3, 9223372036854775807]) +""" + +argmax_docstring = """argmax(input, dim, *, keepdim=False, dtype=None, mask=None) -> Tensor +Returns argmax of all the elements in the :attr:`input` +tensor along the given dimension(s) :attr:`dim` while the :attr:`input` +elements are masked out according to the boolean tensor +:attr:`mask`. +The identity value of argmax operation, which is used to start the +reduction, depends on input dtype. For instance, for float32, uint8, +and int32 dtypes, the identity values are ``-inf``, ``0``, and ``-2147483648``, respectively. +If :attr:`keepdim` is ``True``, the output tensor is of the same size +as :attr:`input` except in the dimension(s) :attr:`dim` where it is of +size 1. Otherwise, :attr:`dim` is squeezed (see +:func:`torch.squeeze`), resulting in the output tensor having 1 (or +``len(dim)``) fewer dimension(s). 
+ +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True +then the corresponding element in :attr:`input` tensor will be +included in argmax computation, otherwise the element is +ignored. + +When all elements of :attr:`input` along the given dimension +:attr:`dim` are ignored (fully masked-out), the corresponding element +of the output tensor will have undefined value: it may or may not +correspond to the identity value of argmax operation; the +choice may correspond to the value that leads to the most efficient +storage of :attr:`output` tensor. + +The mask of the output tensor can be computed as +``torch.any(torch.broadcast_to(mask, input.shape), dim, keepdim=keepdim, +dtype=torch.bool)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int): the dimension along which argmax is computed. + +Keyword args: + keepdim (bool, optional): whether the output tensor has + :attr:`dim` retained or not. Default: False. + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. +Example:: + + >>> input = tensor([[-3, -2, -1], [ 0, 1, 2]]) + >>> input + tensor([[-3, -2, -1], + [ 0, 1, 2]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.argmax(input, 1, mask=mask) + tensor([2, 0]) +""" + +argmin_docstring = """argmin(input, dim, *, keepdim=False, dtype=None, mask=None) -> Tensor +Returns argmin of all the elements in the :attr:`input` +tensor along the given dimension(s) :attr:`dim` while the :attr:`input` +elements are masked out according to the boolean tensor +:attr:`mask`. +The identity value of argmin operation, which is used to start the +reduction, depends on input dtype. For instance, for float32, uint8, +and int32 dtypes, the identity values are ``inf``, ``255``, and ``2147483647``, respectively. +If :attr:`keepdim` is ``True``, the output tensor is of the same size +as :attr:`input` except in the dimension(s) :attr:`dim` where it is of +size 1. Otherwise, :attr:`dim` is squeezed (see +:func:`torch.squeeze`), resulting in the output tensor having 1 (or +``len(dim)``) fewer dimension(s). + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True +then the corresponding element in :attr:`input` tensor will be +included in argmin computation, otherwise the element is +ignored. + +When all elements of :attr:`input` along the given dimension +:attr:`dim` are ignored (fully masked-out), the corresponding element +of the output tensor will have undefined value: it may or may not +correspond to the identity value of argmin operation; the +choice may correspond to the value that leads to the most efficient +storage of :attr:`output` tensor. 
+ +The mask of the output tensor can be computed as +``torch.any(torch.broadcast_to(mask, input.shape), dim, keepdim=keepdim, +dtype=torch.bool)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int): the dimension along which argmin is computed. + +Keyword args: + keepdim (bool, optional): whether the output tensor has + :attr:`dim` retained or not. Default: False. + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. +Example:: + + >>> input = tensor([[-3, -2, -1], [ 0, 1, 2]]) + >>> input + tensor([[-3, -2, -1], + [ 0, 1, 2]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.argmin(input, 1, mask=mask) + tensor([0, 0]) +""" + +cumprod_docstring = """cumprod(input, dim, *, dtype=None, mask=None) -> Tensor + +Returns cumulative_prod of all the slices in the :attr:`input` tensor +along :attr:`dim` while the :attr:`input` elements are masked out +according to the boolean tensor :attr:`mask`. + +Let ``x`` be a sequence of unmasked elements of one-dimensional slice +of the :attr:`input` tensor. Cumsum of i-th element in ``x`` is +defined as ``prod(x[:i])``. + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True then +the corresponding element in :attr:`input` tensor will be included in +cumulative_prod computation, otherwise the element is ignored. + +The values of masked-out elements of the output tensor have undefined +value: it may or may not be set to zero or nan; the choice may correspond to +the value that leads to the most efficient storage of :attr:`output` +tensor. + +The mask of the cumulative_prod output tensor can be computed as +``torch.broadcast_to(mask, input.shape)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int): the dimension along which cumulative_prod is computed. + +Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. 
+ +Example:: + + >>> input = tensor([[-3., -2., -1.], [ 0., 1., 2.]]) + >>> input + tensor([[-3., -2., -1.], + [ 0., 1., 2.]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.cumprod(input, 1, mask=mask) + tensor([[-3., -3., 3.], + [ 1., 1., 1.]]) +""" + +cumsum_docstring = """cumsum(input, dim, *, dtype=None, mask=None) -> Tensor + +Returns cumulative_sum of all the slices in the :attr:`input` tensor +along :attr:`dim` while the :attr:`input` elements are masked out +according to the boolean tensor :attr:`mask`. + +Let ``x`` be a sequence of unmasked elements of one-dimensional slice +of the :attr:`input` tensor. Cumsum of i-th element in ``x`` is +defined as ``sum(x[:i])``. + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True then +the corresponding element in :attr:`input` tensor will be included in +cumulative_sum computation, otherwise the element is ignored. + +The values of masked-out elements of the output tensor have undefined +value: it may or may not be set to zero or nan; the choice may correspond to +the value that leads to the most efficient storage of :attr:`output` +tensor. + +The mask of the cumulative_sum output tensor can be computed as +``torch.broadcast_to(mask, input.shape)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int): the dimension along which cumulative_sum is computed. + +Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. + +Example:: + + >>> input = tensor([[-3., -2., -1.], [ 0., 1., 2.]]) + >>> input + tensor([[-3., -2., -1.], + [ 0., 1., 2.]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.cumsum(input, 1, mask=mask) + tensor([[-3., -3., -4.], + [ 0., 0., 0.]]) +""" + +log_softmax_docstring = """log_softmax(input, dim, *, dtype=None, mask=None) -> Tensor + +Returns log_softmax of all the slices in the :attr:`input` tensor +along :attr:`dim` while the :attr:`input` elements are masked out +according to the boolean tensor :attr:`mask`. + +Let ``x`` be a sequence of unmasked elements of one-dimensional slice +of the :attr:`input` tensor. LogSoftmax of i-th element in ``x`` is +defined as ``log(exp(x[i])/sum(exp(x)))``. + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True then +the corresponding element in :attr:`input` tensor will be included in +log_softmax computation, otherwise the element is ignored. + +The values of masked-out elements of the output tensor have undefined +value: it may or may not be set to zero or nan; the choice may correspond to +the value that leads to the most efficient storage of :attr:`output` +tensor. 
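The cumsum example output above follows from the same identity-fill semantics: masked-out positions are treated as the additive identity 0 before a regular cumulative sum is taken. An equivalent computation with public ops (illustrative, not the internal code path):

    import torch

    input = torch.tensor([[-3., -2., -1.], [0., 1., 2.]])
    mask = torch.tensor([[True, False, True], [False, False, False]])

    filled = torch.where(mask, input, torch.zeros((), dtype=input.dtype))
    print(torch.cumsum(filled, dim=1))
    # tensor([[-3., -3., -4.],
    #         [ 0.,  0.,  0.]])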
+ +The mask of the log_softmax output tensor can be computed as +``torch.broadcast_to(mask, input.shape)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int): the dimension along which log_softmax is computed. + +Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. + +Example:: + + >>> input = tensor([[-3., -2., -1.], [ 0., 1., 2.]]) + >>> input + tensor([[-3., -2., -1.], + [ 0., 1., 2.]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.log_softmax(input, 1, mask=mask) + tensor([[-2.1269, -inf, -0.1269], + [ nan, nan, nan]]) +""" + +mean_docstring = """mean(input, dim, *, keepdim=False, dtype=None, mask=None) -> Tensor + +Returns mean of all the elements in the :attr:`input` +tensor along the given dimension(s) :attr:`dim` while the :attr:`input` +elements are masked out according to the boolean tensor +:attr:`mask`. + +By definition, the identity value of a mean operation is the mean +value of the tensor. If all elements of the input tensor along given +dimension(s) :attr:`dim` are masked-out, the identity value of the +mean is undefined. Due to this ambiguity, the elements of output +tensor with strided layout, that correspond to fully masked-out +elements, have ``nan`` values. + +If :attr:`keepdim` is ``True``, the output tensor is of the same size +as :attr:`input` except in the dimension(s) :attr:`dim` where it is of +size 1. Otherwise, :attr:`dim` is squeezed (see +:func:`torch.squeeze`), resulting in the output tensor having 1 (or +``len(dim)``) fewer dimension(s). + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True +then the corresponding element in :attr:`input` tensor will be +included in mean computation, otherwise the element is +ignored. + +When all elements of :attr:`input` along the given dimension +:attr:`dim` are ignored (fully masked-out), the corresponding element +of the output tensor will have undefined value: it may or may not +correspond to the identity value of mean operation; the +choice may correspond to the value that leads to the most efficient +storage of :attr:`output` tensor. + +The mask of the output tensor can be computed as +``torch.any(torch.broadcast_to(mask, input.shape), dim, keepdim=keepdim, +dtype=torch.bool)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + Default: None that is equivalent to ``tuple(range(input.ndim))``. + +Keyword args: + keepdim (bool, optional): whether the output tensor has + :attr:`dim` retained or not. Default: False. 
+ dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. + +Example:: + + >>> input = tensor([[-3, -2, -1], [ 0, 1, 2]]) + >>> input + tensor([[-3, -2, -1], + [ 0, 1, 2]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.mean(input, 1, mask=mask) + tensor([-2., nan]) +""" + +norm_docstring = """norm(input, ord, dim, *, keepdim=False, dtype=None, mask=None) -> Tensor + +Returns norm of all the elements in the :attr:`input` +tensor along the given dimension(s) :attr:`dim` while the :attr:`input` +elements are masked out according to the boolean tensor +:attr:`mask`. + +The identity value of norm operation, which is used to start the +reduction, is ``0.0``, except for ``ord=-inf`` it is +``inf``. + +If :attr:`keepdim` is ``True``, the output tensor is of the same size +as :attr:`input` except in the dimension(s) :attr:`dim` where it is of +size 1. Otherwise, :attr:`dim` is squeezed (see +:func:`torch.squeeze`), resulting in the output tensor having 1 (or +``len(dim)``) fewer dimension(s). + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True +then the corresponding element in :attr:`input` tensor will be +included in norm computation, otherwise the element is +ignored. + +When all elements of :attr:`input` along the given dimension +:attr:`dim` are ignored (fully masked-out), the corresponding element +of the output tensor will have undefined value: it may or may not +correspond to the identity value of norm operation; the +choice may correspond to the value that leads to the most efficient +storage of :attr:`output` tensor. + +The mask of the output tensor can be computed as +``torch.any(torch.broadcast_to(mask, input.shape), dim, keepdim=keepdim, +dtype=torch.bool)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + ord (int, float, optional): the order of vector norm. Default: 2. + See :func:`torch.linalg.vector_norm` for a list of supported norms. + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + Default: None that is equivalent to ``tuple(range(input.ndim))``. + +Keyword args: + keepdim (bool, optional): whether the output tensor has + :attr:`dim` retained or not. Default: False. + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. 
+ +Example:: + + >>> input = tensor([[-3., -2., -1.], [ 0., 1., 2.]]) + >>> input + tensor([[-3., -2., -1.], + [ 0., 1., 2.]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.norm(input, 2.0, 1, mask=mask) + tensor([3.1623, 0.0000]) +""" + +normalize_docstring = """normalize(input, ord, dim, *, eps=1e-12, dtype=None, mask=None) -> Tensor + +Returns normalize of all the slices in the :attr:`input` tensor +along :attr:`dim` while the :attr:`input` elements are masked out +according to the boolean tensor :attr:`mask`. + +Let ``x`` be a sequence of unmasked elements of one-dimensional slice +of the :attr:`input` tensor. Normalize of i-th element in ``x`` is +defined as ``x[i]/max(norm(x, p), eps)``. + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True then +the corresponding element in :attr:`input` tensor will be included in +normalize computation, otherwise the element is ignored. + +The values of masked-out elements of the output tensor have undefined +value: it may or may not be set to zero or nan; the choice may correspond to +the value that leads to the most efficient storage of :attr:`output` +tensor. + +The mask of the normalize output tensor can be computed as +``torch.broadcast_to(mask, input.shape)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + ord (int, float): the order of vector norm. Default: 2. + See :func:`torch.linalg.vector_norm` for a list of supported norms. + dim (int): the dimension along which normalize is computed. + +Keyword args: + eps (float, optional): small value to avoid division by zero. Default: 1e-12. + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. + +Example:: + + >>> input = tensor([[-3., -2., -1.], [ 0., 1., 2.]]) + >>> input + tensor([[-3., -2., -1.], + [ 0., 1., 2.]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.normalize(input, 2.0, 1, mask=mask) + tensor([[-0.9487, 0.0000, -0.3162], + [ 0.0000, 0.0000, 0.0000]]) +""" + +prod_docstring = """prod(input, dim, *, keepdim=False, dtype=None, mask=None) -> Tensor + +Returns product of all the elements in the :attr:`input` +tensor along the given dimension(s) :attr:`dim` while the :attr:`input` +elements are masked out according to the boolean tensor +:attr:`mask`. + +The identity value of product operation, which is used to start the reduction, is ``1``. + +If :attr:`keepdim` is ``True``, the output tensor is of the same size +as :attr:`input` except in the dimension(s) :attr:`dim` where it is of +size 1. Otherwise, :attr:`dim` is squeezed (see +:func:`torch.squeeze`), resulting in the output tensor having 1 (or +``len(dim)``) fewer dimension(s). 
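+
+Because the identity value of product is ``1``, replacing the masked-out
+elements with ``1`` before a regular product gives the same result for
+finite inputs (an informal illustration, not the actual implementation)::
+
+    >>> input = torch.tensor([[-3, -2, -1], [0, 1, 2]])
+    >>> mask = torch.tensor([[True, False, True], [False, False, False]])
+    >>> torch.prod(input.masked_fill(~mask, 1), 1)
+    tensor([3, 1])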
+ +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True +then the corresponding element in :attr:`input` tensor will be +included in product computation, otherwise the element is +ignored. + +When all elements of :attr:`input` along the given dimension +:attr:`dim` are ignored (fully masked-out), the corresponding element +of the output tensor will have undefined value: it may or may not +correspond to the identity value of product operation; the +choice may correspond to the value that leads to the most efficient +storage of :attr:`output` tensor. + +The mask of the output tensor can be computed as +``torch.any(torch.broadcast_to(mask, input.shape), dim, keepdim=keepdim, +dtype=torch.bool)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + Default: None that is equivalent to ``tuple(range(input.ndim))``. + +Keyword args: + keepdim (bool, optional): whether the output tensor has + :attr:`dim` retained or not. Default: False. + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. + +Example:: + + >>> input = tensor([[-3, -2, -1], [ 0, 1, 2]]) + >>> input + tensor([[-3, -2, -1], + [ 0, 1, 2]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.prod(input, 1, mask=mask) + tensor([3, 1]) +""" + +softmax_docstring = """softmax(input, dim, *, dtype=None, mask=None) -> Tensor + +Returns softmax of all the slices in the :attr:`input` tensor +along :attr:`dim` while the :attr:`input` elements are masked out +according to the boolean tensor :attr:`mask`. + +Let ``x`` be a sequence of unmasked elements of one-dimensional slice +of the :attr:`input` tensor. Softmax of i-th element in ``x`` is +defined as ``exp(x[i])/sum(exp(x))``. + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True then +the corresponding element in :attr:`input` tensor will be included in +softmax computation, otherwise the element is ignored. + +The values of masked-out elements of the output tensor have undefined +value: it may or may not be set to zero or nan; the choice may correspond to +the value that leads to the most efficient storage of :attr:`output` +tensor. + +The mask of the softmax output tensor can be computed as +``torch.broadcast_to(mask, input.shape)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int): the dimension along which softmax is computed. + +Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. 
If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. + +Example:: + + >>> input = tensor([[-3., -2., -1.], [ 0., 1., 2.]]) + >>> input + tensor([[-3., -2., -1.], + [ 0., 1., 2.]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.softmax(input, 1, mask=mask) + tensor([[0.1192, 0.0000, 0.8808], + [ nan, nan, nan]]) +""" + +softmin_docstring = """softmin(input, dim, *, dtype=None, mask=None) -> Tensor + +Returns softmin of all the slices in the :attr:`input` tensor +along :attr:`dim` while the :attr:`input` elements are masked out +according to the boolean tensor :attr:`mask`. + +Let ``x`` be a sequence of unmasked elements of one-dimensional slice +of the :attr:`input` tensor. Softmin of i-th element in ``x`` is +defined as ``exp(-x[i])/sum(exp(-x))``. + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True then +the corresponding element in :attr:`input` tensor will be included in +softmin computation, otherwise the element is ignored. + +The values of masked-out elements of the output tensor have undefined +value: it may or may not be set to zero or nan; the choice may correspond to +the value that leads to the most efficient storage of :attr:`output` +tensor. + +The mask of the softmin output tensor can be computed as +``torch.broadcast_to(mask, input.shape)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int): the dimension along which softmin is computed. + +Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. + +Example:: + + >>> input = tensor([[-3., -2., -1.], [ 0., 1., 2.]]) + >>> input + tensor([[-3., -2., -1.], + [ 0., 1., 2.]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.softmin(input, 1, mask=mask) + tensor([[0.8808, 0.0000, 0.1192], + [ nan, nan, nan]]) +""" + +std_docstring = """std(input, dim, unbiased, *, keepdim=False, dtype=None, mask=None) -> Tensor +Returns standard_deviation of all the elements in the :attr:`input` +tensor along the given dimension(s) :attr:`dim` while the :attr:`input` +elements are masked out according to the boolean tensor +:attr:`mask`. +The identity value of sample standard deviation operation is undefined. The +elements of output tensor with strided layout, that correspond to +fully masked-out elements, have ``nan`` values. +If :attr:`keepdim` is ``True``, the output tensor is of the same size +as :attr:`input` except in the dimension(s) :attr:`dim` where it is of +size 1. 
Otherwise, :attr:`dim` is squeezed (see +:func:`torch.squeeze`), resulting in the output tensor having 1 (or +``len(dim)``) fewer dimension(s). + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True +then the corresponding element in :attr:`input` tensor will be +included in standard_deviation computation, otherwise the element is +ignored. + +When all elements of :attr:`input` along the given dimension +:attr:`dim` are ignored (fully masked-out), the corresponding element +of the output tensor will have undefined value: it may or may not +correspond to the identity value of standard_deviation operation; the +choice may correspond to the value that leads to the most efficient +storage of :attr:`output` tensor. + +The mask of the output tensor can be computed as +``torch.any(torch.broadcast_to(mask, input.shape), dim, keepdim=keepdim, +dtype=torch.bool)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + Default: None that is equivalent to ``tuple(range(input.ndim))``. + unbiased (bool): when True, use Bessel’s correction, otherwise, compute + the uncorrected sample variance. + +Keyword args: + keepdim (bool, optional): whether the output tensor has + :attr:`dim` retained or not. Default: False. + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. +Example:: + + >>> input = tensor([[-3, -2, -1], [ 0, 1, 2]]) + >>> input + tensor([[-3, -2, -1], + [ 0, 1, 2]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.std(input, 1, False, mask=mask) + tensor([1., nan]) +""" + +sum_docstring = """sum(input, dim, *, keepdim=False, dtype=None, mask=None) -> Tensor + +Returns sum of all the elements in the :attr:`input` +tensor along the given dimension(s) :attr:`dim` while the :attr:`input` +elements are masked out according to the boolean tensor +:attr:`mask`. + +The identity value of sum operation, which is used to start the reduction, is ``0``. + +If :attr:`keepdim` is ``True``, the output tensor is of the same size +as :attr:`input` except in the dimension(s) :attr:`dim` where it is of +size 1. Otherwise, :attr:`dim` is squeezed (see +:func:`torch.squeeze`), resulting in the output tensor having 1 (or +``len(dim)``) fewer dimension(s). + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True +then the corresponding element in :attr:`input` tensor will be +included in sum computation, otherwise the element is +ignored. 
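+
+Because the identity value of sum is ``0``, zeroing out the masked-out
+elements before a regular sum gives the same result for finite inputs (an
+informal illustration, not the actual implementation)::
+
+    >>> input = torch.tensor([[-3, -2, -1], [0, 1, 2]])
+    >>> mask = torch.tensor([[True, False, True], [False, False, False]])
+    >>> torch.sum(input * mask, 1)
+    tensor([-4, 0])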
+ +When all elements of :attr:`input` along the given dimension +:attr:`dim` are ignored (fully masked-out), the corresponding element +of the output tensor will have undefined value: it may or may not +correspond to the identity value of sum operation; the +choice may correspond to the value that leads to the most efficient +storage of :attr:`output` tensor. + +The mask of the output tensor can be computed as +``torch.any(torch.broadcast_to(mask, input.shape), dim, keepdim=keepdim, +dtype=torch.bool)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + Default: None that is equivalent to ``tuple(range(input.ndim))``. + +Keyword args: + keepdim (bool, optional): whether the output tensor has + :attr:`dim` retained or not. Default: False. + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. + +Example:: + + >>> input = tensor([[-3, -2, -1], [ 0, 1, 2]]) + >>> input + tensor([[-3, -2, -1], + [ 0, 1, 2]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.sum(input, 1, mask=mask) + tensor([-4, 0]) +""" + +var_docstring = """var(input, dim, unbiased, *, keepdim=False, dtype=None, mask=None) -> Tensor + +Returns variance of all the elements in the :attr:`input` +tensor along the given dimension(s) :attr:`dim` while the :attr:`input` +elements are masked out according to the boolean tensor +:attr:`mask`. + +The identity value of sample variance operation is undefined. The +elements of output tensor with strided layout, that correspond to +fully masked-out elements, have ``nan`` values. + +If :attr:`keepdim` is ``True``, the output tensor is of the same size +as :attr:`input` except in the dimension(s) :attr:`dim` where it is of +size 1. Otherwise, :attr:`dim` is squeezed (see +:func:`torch.squeeze`), resulting in the output tensor having 1 (or +``len(dim)``) fewer dimension(s). + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True +then the corresponding element in :attr:`input` tensor will be +included in variance computation, otherwise the element is +ignored. + +When all elements of :attr:`input` along the given dimension +:attr:`dim` are ignored (fully masked-out), the corresponding element +of the output tensor will have undefined value: it may or may not +correspond to the identity value of variance operation; the +choice may correspond to the value that leads to the most efficient +storage of :attr:`output` tensor. + +The mask of the output tensor can be computed as +``torch.any(torch.broadcast_to(mask, input.shape), dim, keepdim=keepdim, +dtype=torch.bool)``. 
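+
+For instance, for the 2x3 mask used in the example below, only the first
+row contains any valid elements::
+
+    >>> mask = torch.tensor([[True, False, True], [False, False, False]])
+    >>> torch.any(torch.broadcast_to(mask, (2, 3)), 1)
+    tensor([ True, False])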
+ +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + Default: None that is equivalent to ``tuple(range(input.ndim))``. + unbiased (bool): when True, use Bessel’s correction, otherwise, compute + the uncorrected sample variance. + +Keyword args: + keepdim (bool, optional): whether the output tensor has + :attr:`dim` retained or not. Default: False. + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. + +Example:: + + >>> input = tensor([[-3, -2, -1], [ 0, 1, 2]]) + >>> input + tensor([[-3, -2, -1], + [ 0, 1, 2]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.var(input, 1, False, mask=mask) + tensor([1., nan]) +""" diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py new file mode 100644 index 000000000000..8fa3661e6714 --- /dev/null +++ b/torch/_meta_registrations.py @@ -0,0 +1,68 @@ +import torch + +meta_lib = torch.library.Library("aten", "IMPL", "Meta") + +def toRealValueType(dtype): + from_complex = { + torch.complex32: torch.half, + torch.cfloat: torch.float, + torch.cdouble: torch.double + } + return from_complex.get(dtype, dtype) + +# Implementations below are taken from https://github.com/albanD/subclass_zoo/blob/main/python_meta_tensor.py +@torch.library.impl(meta_lib, "index_select") +def meta_index_select(self, dim, index): + result_size = list(self.size()) + if self.dim() > 0: + result_size[dim] = index.numel() + return self.new_empty(result_size) + +@torch.library.impl(meta_lib, "index_select.out") +def meta_index_select_out(self, dim, index, out): + torch._resize_output_(out, self.size(), self.device) + return out.copy_(torch.index_select(self, dim, index)) + +@torch.library.impl(meta_lib, "abs") +def meta_abs(self): + if self.is_complex(): + float_type = toRealValueType(self.dtype) + return self.new_empty(self.size(), dtype=float_type) + else: + return self.new_empty(self.size()) + +@torch.library.impl(meta_lib, "abs.out") +def meta_abs_out(self, out): + torch._resize_output_(out, self.size(), self.device) + return out.copy_(torch.abs(self)) + +@torch.library.impl(meta_lib, "max") +def meta_max(self): + return self.new_empty(()) + +@torch.library.impl(meta_lib, "min") +def meta_min(self): + return self.new_empty(()) + +def squareCheckInputs(self, f_name): + assert self.dim() >= 2, f"{f_name}: The input tensor must have at least 2 dimensions." + # TODO: I think the error message has the -2 and -1 swapped. 
If you fix + # it fix the C++ squareCheckInputs too + assert self.size(-1) == self.size(-2), \ + f"{f_name}: A must be batches of square matrices, but they are {self.size(-1)} by {self.size(-2)} matrices" + +def checkUplo(uplo: str): + uplo_uppercase = uplo.upper() + assert len(uplo) == 1 and uplo_uppercase == 'U' or uplo_uppercase == 'L', \ + f"Expected UPLO argument to be 'L' or 'U', but got {uplo}" + +@torch.library.impl(meta_lib, "linalg_eigh") +def meta_linalg_eigh(self, uplo="L"): + squareCheckInputs(self, "linalg_eigh") + checkUplo(uplo) + real_dtype = toRealValueType(self.dtype) + assert self.dim() >= 2 + values = self.new_empty(self.shape, dtype=real_dtype) + values.transpose_(-2, -1) + vectors = self.new_empty(self.shape[:-1]) + return (values, vectors) diff --git a/torch/_ops.py b/torch/_ops.py index 9116d2256c03..9728998c5652 100644 --- a/torch/_ops.py +++ b/torch/_ops.py @@ -32,13 +32,17 @@ def __init__(self, overloadpacket, op, schema): self._op = op self._schema = schema self._overloadpacket = overloadpacket + self._overloadname = 'default' if schema.overload_name == '' else schema.overload_name + self.__name__ = "{}.{}".format(self._schema.name.split("::")[1], self._overloadname) + self.__module__ = overloadpacket.__module__ + op.__module__ = overloadpacket.__module__ # it's a no-op since OpOverload object is immutable and must be unique for a given op overload. def __deepcopy__(self, memo=None): return self - def __str__(self): - return "OpOverload(op='{}.{}', overload='{}')".format(*self._schema.name.split("::"), self.overload_name) + def __repr__(self): + return "".format(*self._schema.name.split("::"), self._overloadname) def __call__(self, *args, **kwargs): return self._op(*args, **kwargs or {}) @@ -46,17 +50,15 @@ def __call__(self, *args, **kwargs): def __getattr__(self, key): return getattr(self._op, key) - # `my_namespace::my_op` - @property - def name(self): - return "{}.{}".format(*self._schema.name.split("::")) + def __hash__(self): + return hash(self._op) - @property - def overload_name(self): - return self._schema.overload_name + # `my_namespace.my_op_name.overload_name` + def __str__(self): + return "{}.{}.{}".format(*self._schema.name.split("::"), self._overloadname) @property - def overload_packet(self): + def overloadpacket(self): return self._overloadpacket @property @@ -68,27 +70,26 @@ def op(self): # OpOverloadPacket class contains pointer to a base unresolved operator that doesn't correspond to a specific operator # You can obtain an OpOverload object through attribute query. class OpOverloadPacket: - def __init__(self, qualified_op_name, op_name, op): + def __init__(self, qualified_op_name, op_name, op, overload_names): # These attributes are accessible on the object through the properties # defined below but are immutable self._qualified_op_name = qualified_op_name - self._op_name = op_name + self.__name__ = op_name self._op = op + self._overload_names = overload_names # it's a no-op since OpOverloadPacket object is immutable and must be unique for a given op. 
def __deepcopy__(self, memo=None): return self - def __str__(self): - return "OpOverloadPacket(op='{}.{}')".format(*self._qualified_op_name.split("::")) + def __repr__(self): + return "".format(*self._qualified_op_name.split("::")) - @property - def qualified_op_name(self): - return "{}.{}".format(*self._qualified_op_name.split("::")) + def __hash__(self): + return hash(self._op) - @property - def op_name(self): - return self._op_name + def __str__(self): + return "{}.{}".format(*self._qualified_op_name.split("::")) @property def op(self): @@ -99,39 +100,67 @@ def __getattr__(self, key): if key == '__file__': return 'torch.ops' + # ensure that query for dunder attributes that does not exist on + # opoverloadpacket but instead exists on the self._op object does not unnecessarily call + # `_get_operation_overload` (which is an expensive operation). + # This is done to prevent any potential slowdown. This list can be extended + # if there exists other attributes like `__name__` that only exist on self._op and not on the + # opoverloadpacket. + # This is ok since we are guaranteed that an overload name for an aten op can't start with '__' try: + if key.startswith('__'): + return getattr(self._op, key) + except AttributeError: + # for consistency because it seems weird to + # throw an attribute error with a message containing + # an object name different from the one the attribute + # query was performed on. + raise AttributeError("'{}' can't have an overload name beginning with '__' and the " + "underlying op {} has no attribute {} either." + .format(str(self), str(self._op), key)) from None + + try: + # This is ok since we are guaranteed that an overload name for an aten op can't be 'default' use_key = '' if key == 'default' else key # TODO: disallow access to overloads registered by JIT - op_ = torch._C._get_operation_overload(self._qualified_op_name, use_key) + op_ = torch._C._get_operation_overload( + self._qualified_op_name, use_key) schema = torch._C._get_schema(self._qualified_op_name, use_key) overload = OpOverload(self, op_, schema) # cache the overload object setattr(self, key, overload) return overload except RuntimeError: - try: - # This is added to maintain bc in case the user queries an attribute that exists on `self._op` - # which used to be returned before instead of the OpOverloadPacket - out = getattr(self._op, key) - return out - except AttributeError: - raise AttributeError("'{}' object has no attribute '{}'".format(str(self), key)) from None + raise AttributeError( + "The underlying op of '{}' has no overload name '{}'".format(str(self), key) + ) from None def __call__(self, *args, **kwargs): - # overloading __call__ to ensure torch.ops.foo.bar() is still callable from JIT - # We save the function ptr as the `op` attribute on OpOverloadPacket to access it here. + # overloading __call__ to ensure torch.ops.foo.bar() + # is still callable from JIT + # We save the function ptr as the `op` attribute on + # OpOverloadPacket to access it here. return self._op(*args, **kwargs or {}) + # TODO: use this to make a __dir__ + def overloads(self): + return [n if n else "default" for n in self._overload_names] + # Resolution of torch.fn is different from torch.ops.aten.fn -# torch.fn uses the Python argparser, matches with the appropriate schema, and calls into the unboxed version of the method -# torch.ops.aten.fn resolution is done via the mechanism defined in JIT. 
JIT creates a stack of all the overloads and -# then tries to match the correct one at runtime and always calls into the boxed version of the method -# Autograd codegen creates VariableType, TracerType, inplace or view type and python bindings -# Aten codegen generates tensor methods for the the tensor class +# torch.fn uses the Python argparser, matches with the +# appropriate schema, and calls into the unboxed version of the method +# torch.ops.aten.fn resolution is done via the mechanism defined in JIT. +# JIT creates a stack of all the overloads and then tries to match the +# correct one at runtime and always calls into the boxed version of the method +# Autograd codegen creates VariableType, TracerType, +# inplace or view type and python bindings. +# Aten codegen generates tensor methods for the the tensor class. # _OpNamespace is a subclass of ModuleType because the torch script # allows attribute lookups on modules only. Since we want torch.ops.foo.bar() # to work from script, we need to ensure ops and foo are modules + + class _OpNamespace(types.ModuleType): """ An op namespace to dynamically bind Operators into Python. @@ -160,23 +189,29 @@ def __getattr__(self, op_name): # It is not a valid op_name when __file__ is passed in if op_name == '__file__': return 'torch.ops' + # Get the op `my_namespace::my_op` if available. This will also check # for overloads and raise an exception if there are more than one. namespace_name = self.name qualified_op_name = '{}::{}'.format(namespace_name, op_name) - op = torch._C._jit_get_operation(qualified_op_name) + try: + op, overload_names = torch._C._jit_get_operation(qualified_op_name) + except RuntimeError as e: + # Turn this into AttributeError so getattr(obj, key, default) + # works (this is called by TorchScript with __origin__) + raise AttributeError(f"'_OpNamespace' object has no attribute '{op_name}'") from e # let the script frontend know that op is identical to the builtin op # with qualified_op_name torch.jit._builtins._register_builtin(op, qualified_op_name) op.__module__ = self.__module__ + "." + namespace_name - # opoverloadpacket = OpOverloadPacket(qualified_op_name, op_name, op) - # opoverloadpacket.__module__ = self.__module__ + "." + namespace_name + opoverloadpacket = OpOverloadPacket(qualified_op_name, op_name, op, overload_names) + opoverloadpacket.__module__ = self.__module__ + "." 
+ namespace_name # cache the opoverloadpacket to ensure that each op corresponds to # a unique OpOverloadPacket object - # setattr(self, op_name, opoverloadpacket) - setattr(self, op_name, op) - return op + setattr(self, op_name, opoverloadpacket) + return opoverloadpacket + class _Ops(types.ModuleType): __file__ = '_ops.py' @@ -220,5 +255,6 @@ def load_library(self, path): ctypes.CDLL(path) self.loaded_libraries.add(path) + # The ops "namespace" ops = _Ops() diff --git a/torch/_prims/__init__.py b/torch/_prims/__init__.py new file mode 100644 index 000000000000..85242acd9120 --- /dev/null +++ b/torch/_prims/__init__.py @@ -0,0 +1,2020 @@ +import torch +from torch import Tensor, _TypedStorage + +import torch._prims.utils as utils +from torch._prims.utils import ( + TensorLike, + TensorLikeType, + TensorMeta, + ShapeType, + getnvFuserDtype, + DimsType, + DimsSequenceType, + StrideType, + Number, + NumberType, +) +from torch.overrides import has_torch_function, handle_torch_function +import torch.library +from torch.utils._pytree import tree_map + +from typing import Sequence, Optional, Union, Callable, List, Tuple, Any, Type +from functools import reduce, partial +from enum import Enum +import operator +import math + +prim = torch.library.Library("prims", "DEF") +prim_impl = torch.library.Library("prims", "IMPL", "CompositeExplicitAutograd") +prim_meta_impl = torch.library.Library("prims", "IMPL", "Meta") + +# Experimental module containing prototype "primitive" operations. + +__all__ = [ + # + # Common datastructures and helpers + # + "RETURN_TYPE", + # + # Elementwise unary prims + # + "abs", + "acos", + "acosh", + "asin", + "atan", + "cos", + "cosh", + "bessel_i0e", + "bessel_i1e", + "cbrt", + "ceil", + "digamma", + "erf", + "erf_inv", + "erfc", + "exp", + "expm1", + "floor", + "is_finite", + "lgamma", + "log", + "log1p", + "neg", + "reciprocal", + "round", + "sign", + "sin", + "sinh", + "sqrt", + "square", + "tan", + # + # Elementwise binary prims + # + "add", + "atan2", + "bitwise_and", + "bitwise_not", + "bitwise_or", + "bitwise_xor", + # 'complex', # needs custom meta + "div", + "eq", + "ge", + "gt", + "igamma", + "igammac", + "le", + "lt", + "maximum", + "minimum", + "mul", + "ne", + "nextafter", + "pow", + "rsqrt", + "shift_left", + "shift_right_arithmetic", + "shift_right_logical", # not implemented + # + # View prims + # + "as_strided", + "broadcast_in_dim", + "collapse_view", + "expand_dims", + "slice", + "slice_in_dim", # implemented using slice -- make this a ref? + "split_dim", + "squeeze", + "transpose", + "view_of", + # + # Shape prims + # + "collapse", + "concatenate", + "reshape", + "rev", + # + # Conditional prims + # + "select", + # + # Data conversion and movement prims + # + "clone", + "convert_element_type", + "device_put", + "to_dtype", + # + # Inplace prims + # + "copy_to", + "resize", + # "_set", # Commented out, see note below + # + # Reduction prims + # + "all", + "amax", + "amin", + "any", + "prod", + "sum", + # + # Tensor Creation + # + "empty", + "empty_like", + "full", + "full_like", +] + +# +# Common datastructures and helpers +# + +# Describes the return type of the primitive: +# +# - NEW, a new tensor is created +# - VIEW, a view of an input tensor is returned +# - INPLACE, one or more input tensors is modified +# +# these descriptors are mututally exclusive and exhaustive. 
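+#
+# For example, after the registrations below the descriptor can be read off a
+# generated prim (illustrative only):
+#
+#   torch.ops.prims.add.default.return_type         # RETURN_TYPE.NEW
+#   torch.ops.prims.as_strided.default.return_type  # RETURN_TYPE.VIEW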
+class RETURN_TYPE(Enum): + NEW = (0,) + VIEW = (1,) + INPLACE = (2,) + + +def _wrap_tensor_meta(f): + def wrap(t): + if isinstance(t, torch.Tensor): + return TensorMeta(t) + else: + return t + + def unwrap(t): + # TODO: doesn't setup aliasing relation on views correctly + if isinstance(t, TensorMeta): + return torch.empty_strided( + t.shape, t.stride(), dtype=t.dtype, device="meta" + ) + else: + return t + + def wrapper(*args, **kwargs): + wrapped_args = tree_map(wrap, args) + wrapped_kwargs = tree_map(wrap, kwargs) + return tree_map(unwrap, f(*wrapped_args, **wrapped_kwargs)) + + return wrapper + + +def _make_prim( + *, + schema: str, + meta: Callable, + impl_aten: Callable, + impl_nvfuser: Optional[Callable] = None, + return_type: RETURN_TYPE, + doc: str, +): + """ + Creates a primitive operation. + + """ + + prim.define(schema) + + def _prim_impl(*args, **kwargs): + # always run the meta function because aten implementation will + # typically accept more inputs (e.g., it will do promotion and + # broadcasting) which we want to reject + meta(*args, **kwargs) + return impl_aten(*args, **kwargs) + + name = schema.split("(")[0] + prim_impl.impl(name, _prim_impl) + prim_meta_impl.impl(name, _wrap_tensor_meta(meta)) + + _prim = getattr(torch.ops.prims, name).default + + _prim.__doc__ = doc + _prim.meta = meta # type: ignore[attr-defined] + _prim.impl_nvfuser = impl_nvfuser # type: ignore[attr-defined] + _prim.return_type = return_type # type: ignore[attr-defined] + + return _prim + + +class ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND(Enum): + DEFAULT = (0,) + ALWAYS_BOOL = (2,) + COMPLEX_TO_FLOAT = (3,) + + +# TODO: implement dtype validation here, too, or on the corresponding refs +def _elementwise_meta( + *args, type_promotion: ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND +) -> TensorMeta: + """ + Meta function for elementwise operations that produce outputs in the same dtype + as their inputs. + + Stride logic is currently incorrect. + """ + + assert len(args) > 0 + + utils.check_same_device(*args, allow_cpu_scalar_tensors=True) + utils.check_same_shape(*args, allow_cpu_scalar_tensors=True) + utils.check_same_dtype(*args) + + strides = utils.compute_elementwise_output_strides(*args) + + tensor = None + scalar_tensor = None + number = None + for arg in args: + if isinstance(arg, TensorLike): + if utils.is_cpu_scalar_tensor(arg) and scalar_tensor is None: + scalar_tensor = arg + if not utils.is_cpu_scalar_tensor(arg) and tensor is None: + tensor = arg + + elif isinstance(arg, Number): + if number is None: + number = arg + + # NOTE: type promotion behavior here is mostly hidden from tests because + # references will typically handle the type promotion properly even if this doesn't + # (but getting it wrong will cause too many casts to be inserted in traces!) 
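+    # A sketch of the selection below: prefer a non-CPU-scalar tensor argument
+    # as the template for the output meta, fall back to a CPU scalar tensor,
+    # and use the plain-number path only when no tensor argument was given.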
+ if tensor is not None or scalar_tensor is not None: + tensor = tensor if tensor is not None else scalar_tensor + assert tensor is not None # appease mypy + if type_promotion == ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT: + return TensorMeta(tensor, strides=strides) + if type_promotion == ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.ALWAYS_BOOL: + return TensorMeta(tensor, strides=strides, dtype=torch.bool) + if type_promotion == ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.COMPLEX_TO_FLOAT: + if utils.is_complex_dtype(tensor.dtype): + dtype = utils.corresponding_real_dtype(tensor.dtype) + else: + dtype = tensor.dtype + return TensorMeta(tensor, strides=strides, dtype=dtype) + + # Number case + # NOTE: this case is not currently exercised + # TODO: fix number type promotion (bool, complex->float) + return TensorMeta(number) + + +def _make_elementwise_unary_prim( + name: str, *, type_promotion: ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND, **kwargs +): + """ + Creates an elementwise unary prim. + """ + + return _make_prim( + schema=f"{name}(Tensor self) -> Tensor", + meta=partial(_elementwise_meta, type_promotion=type_promotion), + return_type=RETURN_TYPE.NEW, + **kwargs, + ) + + +def _make_elementwise_binary_prim( + name: str, *, type_promotion: ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND, **kwargs +): + """ + Creates an elementwise binary prim. + """ + + return _make_prim( + schema=f"{name}(Tensor self, Tensor other) -> Tensor", + meta=partial(_elementwise_meta, type_promotion=type_promotion), + return_type=RETURN_TYPE.NEW, + **kwargs, + ) + + +def _not_impl(*args, **kwargs): + raise NotImplementedError + + +# +# Elementwise unary operations +# + +abs = _make_elementwise_unary_prim( + "abs", + impl_aten=torch.abs, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.COMPLEX_TO_FLOAT, +) + +acos = _make_elementwise_unary_prim( + "acos", + impl_aten=torch.acos, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +acosh = _make_elementwise_unary_prim( + "acosh", + impl_aten=torch.acosh, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +asin = _make_elementwise_unary_prim( + "asin", + impl_aten=torch.asin, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +atan = _make_elementwise_unary_prim( + "atan", + impl_aten=torch.atan, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +cos = _make_elementwise_unary_prim( + "cos", + impl_aten=torch.cos, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +cosh = _make_elementwise_unary_prim( + "cosh", + impl_aten=torch.cosh, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +bessel_i0e = _make_elementwise_unary_prim( + "bessel_i0e", + impl_aten=torch.special.i0e, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +bessel_i1e = _make_elementwise_unary_prim( + "bessel_i1e", + impl_aten=torch.special.i1e, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + + +def _cbrt_aten(a: torch.Tensor): + return pow(a, (1 / 3)) + + +cbrt = _make_elementwise_unary_prim( + "cbrt", + impl_aten=_cbrt_aten, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +ceil = _make_elementwise_unary_prim( + "ceil", + impl_aten=torch.ceil, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +digamma = _make_elementwise_unary_prim( + "digamma", + impl_aten=torch.digamma, + doc="", + 
type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +erf = _make_elementwise_unary_prim( + "erf", + impl_aten=torch.erf, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +erf_inv = _make_elementwise_unary_prim( + "erf_inv", + impl_aten=torch.special.erfinv, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +erfc = _make_elementwise_unary_prim( + "erfc", + impl_aten=torch.special.erfc, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +exp = _make_elementwise_unary_prim( + "exp", + impl_aten=torch.exp, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +expm1 = _make_elementwise_unary_prim( + "expm1", + impl_aten=torch.special.expm1, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +floor = _make_elementwise_unary_prim( + "floor", + impl_aten=torch.floor, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +is_finite = _make_elementwise_unary_prim( + "is_finite", + impl_aten=torch.isfinite, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.ALWAYS_BOOL, +) + +lgamma = _make_elementwise_unary_prim( + "lgamma", + impl_aten=torch.lgamma, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +log = _make_elementwise_unary_prim( + "log", + impl_aten=torch.log, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +log1p = _make_elementwise_unary_prim( + "log1p", + impl_aten=torch.log1p, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +reciprocal = _make_elementwise_unary_prim( + "reciprocal", + impl_aten=torch.reciprocal, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +neg = _make_elementwise_unary_prim( + "neg", + impl_aten=torch.neg, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +round = _make_elementwise_unary_prim( + "round", + impl_aten=torch.round, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +sign = _make_elementwise_unary_prim( + "sign", + impl_aten=torch.sign, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +sin = _make_elementwise_unary_prim( + "sin", + impl_aten=torch.sin, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +sinh = _make_elementwise_unary_prim( + "sinh", + impl_aten=torch.sinh, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +sqrt = _make_elementwise_unary_prim( + "sqrt", + impl_aten=torch.sqrt, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +square = _make_elementwise_unary_prim( + "square", + impl_aten=torch.square, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +tan = _make_elementwise_unary_prim( + "tan", + impl_aten=torch.tan, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +# +# Elementwise binary operations +# +# TODO: we should be able to stamp these out but it's a little tricky with FX's name resolution +def _add_nvfuser(fd: Any, a: TensorLikeType, b: TensorLikeType): + return fd.Ops.add(a, b) # type: ignore[attr-defined] + + +add = _make_elementwise_binary_prim( + name="add", + impl_aten=torch.add, + impl_nvfuser=_add_nvfuser, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +atan2 = _make_elementwise_binary_prim( + name="atan2", + impl_aten=torch.atan2, + doc="", + 
type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +bitwise_and = _make_elementwise_binary_prim( + "bitwise_and", + impl_aten=torch.bitwise_and, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +bitwise_not = _make_elementwise_binary_prim( + "bitwise_not", + impl_aten=torch.bitwise_not, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +bitwise_or = _make_elementwise_binary_prim( + "bitwise_or", + impl_aten=torch.bitwise_or, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +bitwise_xor = _make_elementwise_binary_prim( + "bitwise_xor", + impl_aten=torch.bitwise_xor, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +# TODO: complex needs a special meta to account for its float -> complex behavior +# complex = _make_elementwise_binary_prim( +# impl_aten=torch.complex, +# doc="", +# ) + +# div prim performs truncation division on integer inputs +# and true division for floating and complex inputs +def _div_aten(a, b): + if isinstance(a, (bool, int)): + return torch.div(a, b, rounding_mode="trunc") + return torch.true_divide(a, b) + + +def _div_nvfuser(fd: Any, a: TensorLikeType, b: TensorLikeType): + return fd.Ops.div(a, b) # type: ignore[attr-defined] + + +div = _make_elementwise_binary_prim( + "div", + impl_aten=_div_aten, + impl_nvfuser=_div_nvfuser, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +eq = _make_elementwise_binary_prim( + "eq", + impl_aten=torch.eq, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.ALWAYS_BOOL, +) + + +def _ge_nvfuser(fd: Any, a: TensorLikeType, b: TensorLikeType): + return fd.Ops.ge(a, b) # type: ignore[attr-defined] + + +ge = _make_elementwise_binary_prim( + "ge", + impl_aten=torch.ge, + impl_nvfuser=_ge_nvfuser, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.ALWAYS_BOOL, +) + + +def _gt_nvfuser(fd: Any, a: TensorLikeType, b: TensorLikeType): + return fd.Ops.gt(a, b) # type: ignore[attr-defined] + + +gt = _make_elementwise_binary_prim( + "gt", + impl_aten=torch.gt, + impl_nvfuser=_gt_nvfuser, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.ALWAYS_BOOL, +) + +igamma = _make_elementwise_binary_prim( + "igamma", + impl_aten=torch.special.gammainc, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +igammac = _make_elementwise_binary_prim( + "igammac", + impl_aten=torch.special.gammaincc, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + + +def _le_nvfuser(fd: Any, a: TensorLikeType, b: TensorLikeType): + return fd.Ops.le(a, b) # type: ignore[attr-defined] + + +le = _make_elementwise_binary_prim( + "le", + impl_aten=torch.le, + impl_nvfuser=_le_nvfuser, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.ALWAYS_BOOL, +) + + +def _lt_nvfuser(fd: Any, a: TensorLikeType, b: TensorLikeType): + return fd.Ops.lt(a, b) # type: ignore[attr-defined] + + +lt = _make_elementwise_binary_prim( + "lt", + impl_aten=torch.lt, + impl_nvfuser=_lt_nvfuser, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.ALWAYS_BOOL, +) + + +def _wrap_scalar(a: NumberType, *, dtype: torch.dtype = None) -> torch.Tensor: + """ + Wraps a Number into a Tensor of corresponding dtype. + + Note: this should not generally be used, but some torch functions don't + accept scalars, so it's necessary for their prims to do so. 
+ """ + dtype = dtype if dtype is not None else utils.type_to_dtype(type(a)) + return torch.tensor(a, dtype=dtype) + + +# Note: the following impls are because torch.maximum and torch.mininum do not support scalar inputs +def _maximum_aten( + a: Union[TensorLikeType, NumberType], b: Union[TensorLikeType, NumberType] +) -> TensorLikeType: + if isinstance(a, TensorLike) and isinstance(b, Number): + b = _wrap_scalar(b, dtype=a.dtype) + elif isinstance(b, TensorLike) and isinstance(a, Number): + a = _wrap_scalar(a, dtype=b.dtype) + + return torch.maximum(a, b) # type: ignore[arg-type] + + +maximum = _make_elementwise_binary_prim( + "maximum", + impl_aten=_maximum_aten, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + + +def _minimum_aten( + a: Union[TensorLikeType, NumberType], b: Union[TensorLikeType, NumberType] +) -> TensorLikeType: + if isinstance(a, TensorLike) and isinstance(b, Number): + b = _wrap_scalar(b, dtype=a.dtype) + elif isinstance(b, TensorLike) and isinstance(a, Number): + a = _wrap_scalar(a, dtype=b.dtype) + + return torch.minimum(a, b) # type: ignore[arg-type] + + +minimum = _make_elementwise_binary_prim( + "minimum", + impl_aten=_minimum_aten, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + + +def _mul_nvfuser(fd: Any, a: TensorLikeType, b: TensorLikeType): + return fd.Ops.mul(a, b) # type: ignore[attr-defined] + + +mul = _make_elementwise_binary_prim( + "mul", + impl_aten=torch.mul, + impl_nvfuser=_mul_nvfuser, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +ne = _make_elementwise_binary_prim( + "ne", + impl_aten=torch.ne, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.ALWAYS_BOOL, +) + +nextafter = _make_elementwise_binary_prim( + "nextafter", + impl_aten=torch.nextafter, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +pow = _make_elementwise_binary_prim( + "pow", + impl_aten=torch.pow, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +rsqrt = _make_elementwise_binary_prim( + "rsqrt", + impl_aten=torch.rsqrt, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +shift_left = _make_elementwise_binary_prim( + "shift_left", + impl_aten=torch.bitwise_left_shift, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +shift_right_arithmetic = _make_elementwise_binary_prim( + "shift_right_arithmetic", + impl_aten=torch.bitwise_right_shift, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +shift_right_logical = _not_impl + +sub = _make_elementwise_binary_prim( + "sub", + impl_aten=torch.sub, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +# +# View operations +# +# TODO: model view relationships +# TODO: model storage +def _as_strided_meta( + a: TensorLikeType, size: ShapeType, stride: StrideType, storage_offset: int +) -> TensorLikeType: + assert len(size) == len(stride) + assert storage_offset >= 0 + utils.validate_strides(stride) + utils.validate_shape(size) + + if reduce(operator.mul, size) == 0: + # NOTE: This special case is to avoid having to acquire the storage below + # as_strided to shapes with no elements are trivially valid, so it's OK + pass + elif isinstance(a, torch.Tensor): + utils.check_in_bounds_for_storage(a.storage(), size, stride, storage_offset) + + return TensorMeta(a, shape=size, strides=stride) + + +def _as_strided_aten( + a: Tensor, size: ShapeType, stride: StrideType, 
storage_offset: int +) -> Tensor: + return torch.as_strided(a, size, stride, storage_offset) + + +_as_strided_doc = """ + Creates a view of the tensor with the given shape (size), strides (stride) and + storage offset (storage_offset). +""" + +as_strided = _make_prim( + schema="as_strided(Tensor(a!) a, int[] size, int[] stride, int storage_offset) -> Tensor(a!)", + meta=_as_strided_meta, + impl_aten=_as_strided_aten, + return_type=RETURN_TYPE.VIEW, + doc=_as_strided_doc, +) + + +def _broadcast_in_dim_meta( + a: TensorLikeType, shape: ShapeType, broadcast_dimensions: Sequence[int] +): + # Type checks + assert isinstance(a, TensorLike) + assert isinstance(shape, Sequence) + assert isinstance(broadcast_dimensions, Sequence) + + # every dimension must be accounted for + assert a.ndim == len(broadcast_dimensions) + + # broadcast shape must have weakly more dimensions + assert len(shape) >= a.ndim + + # broadcast_dimensions must be an ascending sequence + # (no relative reordering of dims) of integers and + # each dimension must be within the new shape + def _greater_than_reduce(acc, x): + assert isinstance(x, int) + assert x > acc + assert x < len(shape) + + return x + + reduce(lambda acc, x: _greater_than_reduce(acc, x), broadcast_dimensions, -1) + + # shape must be broadcastable to + for idx, new_idx in enumerate(broadcast_dimensions): + assert a.shape[idx] == 1 or a.shape[idx] == shape[new_idx] + + new_strides = [] + original_idx = 0 + for idx in range(len(shape)): + if idx in broadcast_dimensions: + new_strides.append(a.stride()[original_idx]) + original_idx = original_idx + 1 + else: + new_strides.append(0) + + return TensorMeta(a, shape=shape, strides=new_strides) + + +def _broadcast_in_dim_aten(a, shape, broadcast_dimensions): + s = list(shape) + for broadcast_dimension in broadcast_dimensions: + s[broadcast_dimension] = -1 + + v = a + for idx, x in enumerate(s): + if x != -1: + v = v.unsqueeze(idx) + + return v.expand(shape) + + +def _broadcast_in_dim_nvfuser( + fd: Any, + a: torch.Tensor, + shape: ShapeType, + broadcast_dimensions: ShapeType, +): + return fd.Ops.broadcast_in_dim(a, shape, broadcast_dimensions) # type: ignore[attr-defined] + + +_broadcast_in_dim_doc = """ + Creates a view of a with the specified shape. + + Allows adding dimensions of any length and broadcasting + dimensions of length one in a to any length. + + The location of the broadcast dimensions must be specified + using the broadcast_dimensions argument. Changing the + relative order of dimensions is not supported. 
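+
+    For example (an illustrative sketch): a tensor of shape (3, 1) can be
+    viewed as shape (2, 3, 4) with broadcast_dimensions=(1, 2); input
+    dimension 0 maps to output dimension 1, the length-one input dimension 1
+    maps to output dimension 2, and a new leading dimension of length 2 is
+    added::
+
+        a = torch.ones(3, 1)
+        b = torch.ops.prims.broadcast_in_dim(a, (2, 3, 4), (1, 2))
+        # b.shape == (2, 3, 4)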
+ """ + +broadcast_in_dim = _make_prim( + schema="broadcast_in_dim(Tensor(a) a, int[] shape, int[] broadcast_dimensions) -> Tensor(a)", + meta=_broadcast_in_dim_meta, + impl_aten=_broadcast_in_dim_aten, + impl_nvfuser=_broadcast_in_dim_nvfuser, + return_type=RETURN_TYPE.VIEW, + doc=_broadcast_in_dim_doc, +) + + +def _collapse_view_helper( + a: TensorLikeType, start: int, end: int +) -> Tuple[Optional[ShapeType], Optional[StrideType]]: + assert isinstance(a, TensorLike) + + # Special-case for zero dimensional tensors + if a.ndim == 0: + shape = (1,) + strides = (1,) + else: + shape = a.shape # type: ignore[assignment] + strides = a.stride() + + utils.validate_idx(len(shape), start) + utils.validate_exclusive_idx(len(shape), end) + + # Verifies end is strictly greater than start + # (Collapse requires a non-empty interval) + if end <= start: + msg = "Attempting to collapse but end, {0}, is less than or equal to start, {1}!".format( + end, start + ) + raise ValueError(msg) + + if a.ndim == 0 or (end - 1 == start): + return shape, strides + + length = shape[end - 1] + stride = strides[end - 1] + for idx in reversed(range(start, end - 1)): + if shape[idx] == 0 or shape[idx + 1] == 0: + length = 0 + stride = 0 + break + + if shape[idx] == 1: + continue + + length = length * shape[idx] + stride = min(stride, strides[idx]) + + if ( + a.numel() > 0 + and shape[idx + 1] != 1 + and not (strides[idx] == strides[idx + 1] * shape[idx + 1]) + ): + return None, None + + new_shape = shape[:start] + (length,) + shape[end:] + new_strides = strides[:start] + (stride,) + strides[end:] + + # NOTE: when the input has no elements it's restrided as if it were contiguous + if a.numel() == 0: + new_strides = utils.make_contiguous_strides_for(new_shape) + + return new_shape, new_strides + + +def _collapse_view_meta(a: TensorLikeType, start: int, end: int) -> TensorLikeType: + new_shape, new_strides = _collapse_view_helper(a, start, end) + + if new_shape is None: + msg = "Attempting to view a collapsed tensor, but no such view exists!" + raise ValueError(msg) + + return TensorMeta(a, shape=new_shape, strides=new_strides) + + +def _collapse_view_aten(a: Tensor, start: int, end: int) -> Tensor: + # Special-cases zero-dim tensors + if a.ndim == 0: + shape = (1,) + else: + shape = a.shape # type: ignore[assignment] + + dim_length = 1 + for idx in range(start, end): + dim_length = dim_length * shape[idx] + + new_shape = shape[0:start] + (dim_length,) + shape[end:] + + return a.view(new_shape) + + +_collapse_view_doc = """ + Creates a view of a with the dimensions between + start (inclusive) and end (exclusive) merged into a + single dimension. + + If it's not possible to take such a view then an error + is thrown. See collapse instead. + + The dimensions can be merged if and only if + they are all "nested" with each other. That is, they all + have the property that + + stride[i] = stride[i+1] * shape[i+1] + + for all i in [start, end - 1). + """ + +collapse_view = _make_prim( + schema="collapse_view(Tensor(a) a, int start, int end) -> Tensor(a)", + meta=_collapse_view_meta, + impl_aten=_collapse_view_aten, + return_type=RETURN_TYPE.VIEW, + doc=_collapse_view_doc, +) + + +def expand_dims(a: TensorLikeType, dimensions: DimsSequenceType) -> TensorLikeType: + """ + Creates a view of a with a.ndim + len(dimensions) dimensions, with new + dimensions of length one at the dimensions specified by dimensions. 
+ """ + dims = sorted(utils.canonicalize_dims(a.ndim, dimensions)) # type: ignore[arg-type] + if len(set(dims)) != len(dims): + msg = "Received duplicate dimensions to expand in {0}".format(str(dimensions)) + raise ValueError(msg) + + new_shape = list(a.shape) + for idx in dims: + new_shape.insert(idx, 1) + + broadcast_dimensions = [ + idx for idx in range(len(new_shape)) if idx not in dimensions + ] + return broadcast_in_dim(a, new_shape, broadcast_dimensions) + + +# Note: saves the Python slice object because we're about to clobber its name with the slice prim +pyslice: Type[slice] = slice + + +def _slice_meta( + a: TensorLikeType, + start_indices: DimsSequenceType, + limit_indices: DimsSequenceType, + strides: Optional[StrideType] = None, +) -> TensorLikeType: + _strides = strides if strides is not None else [1] * len(start_indices) + + if a.ndim != len(start_indices): + msg = "Attempting to slice tensor of rank {0} with start_indices of length {1}!".format( + a.ndim, len(start_indices) + ) + raise ValueError(msg) + + if a.ndim != len(limit_indices): + msg = "Attempting to slice tensor of rank {0} with limit_indices of length {1}!".format( + a.ndim, len(limit_indices) + ) + raise ValueError(msg) + + if a.ndim != len(_strides): + msg = ( + "Attempting to slice tensor of rank {0} with strides of length {1}!".format( + a.ndim, len(limit_indices) + ) + ) + raise ValueError(msg) + + for x, y in zip(start_indices, a.shape): + if x < 0: + msg = "Attempting to slice a tensor with a negative start index of {0}!".format( + x + ) + raise ValueError(msg) + if x > y: + msg = ( + "Attempting to slice a tensor but a start index in {0} is greater than" + " the length of its corresponding dimension in shape {1}".format( + start_indices, a.shape + ) + ) + raise ValueError(msg) + + for x, y, z in zip(limit_indices, a.shape, start_indices): + if x < 0: + msg = "Attempting to slice a tensor with a negative stop index of {0}!".format( + x + ) + raise ValueError(msg) + if x > y: + msg = ( + "Attempting to slice a tensor but a stop index in {0} is greater than the length of " + " its corresponding dimension in shape {1}".format( + limit_indices, a.shape + ) + ) + raise ValueError(msg) + if x < z: + msg = ( + "Attempting to slice a tensor but a start index in {0} is greater than " + " its corresponding stop index {1}".format(x, z) + ) + + for x in _strides: + if x <= 0: + msg = ( + "Attempting to slice a tensor with a non-positive step of {0}!".format( + x + ) + ) + raise ValueError(msg) + + new_shape = [] + for x, y, z in zip(start_indices, limit_indices, _strides): + new_shape.append(math.floor((y - x) / z)) + + new_strides = [] + for x, y in zip(a.stride(), _strides): + new_strides.append(x * y) + + return TensorMeta(a, shape=new_shape, strides=new_strides) + + +def _slice_aten( + a: Tensor, + start_indices: DimsSequenceType, + limit_indices: DimsSequenceType, + strides: Optional[StrideType] = None, +) -> Tensor: + _strides = strides if strides is not None else [1] * len(start_indices) + + slices = [] + for start, stop, step in zip(start_indices, limit_indices, _strides): + slices.append(pyslice(start, stop, step)) + + return operator.getitem(a, slices) # type: ignore[call-overload] + + +_slice_doc = """ + Creates a view of a "bounding box" within the tensor. + + The bounding box is specified independently in each of the tensor's dimensions. + start_indices and limit_indices describe the box's boundaries for their corresponding + dimensions. 
If strides is specified then they specify the step size between elements + in their corresponding dimension. + + This operation is analogous to slicing in NumPy, but does not permit slices where + the stop indices are less than the start indices. + """ + +slice = _make_prim( + schema="slice(Tensor(a) a, int[] start_indices, int[] limit_indices, int[]? strides=None) -> Tensor(a)", + meta=_slice_meta, + impl_aten=_slice_aten, + return_type=RETURN_TYPE.VIEW, + doc=_slice_doc, +) + + +def _slice_in_dim_meta( + a: TensorLikeType, + start_index: int, + limit_index: int, + stride: int = 1, + axis: int = 0, +) -> TensorLikeType: + if axis < 0: + msg = "slice_in_dim: received a negative axis {0}".format(axis) + raise ValueError(msg) + if axis >= a.ndim: + msg = "slice_in_dim: axis {0} is greater or equal to the rank {1} of the tensor".format( + axis, a.ndim + ) + raise ValueError(msg) + + if start_index < 0: + msg = "slice_in_dim: received a negative start_index {0}".format(start_index) + raise ValueError(msg) + + if start_index > a.shape[axis]: + msg = "slice_in_dim: start_index is greater than the length {0} of dimension {1}".format( + start_index, axis + ) + raise ValueError(msg) + + if limit_index > a.shape[axis]: + msg = "slice_in_dim: limit_index is greater than the length {0} of dimension {1}".format( + limit_index, axis + ) + raise ValueError(msg) + + if limit_index < start_index: + msg = "slice_in_dim: received a limit_index {0} less than the start_index {1}".format( + limit_index, start_index + ) + raise ValueError(msg) + + if stride < 0: + msg = "slice_in_dim: received a non-positive stride of {0}!".format(stride) + raise ValueError(msg) + + start_indices = [0] * a.ndim + limit_indices = list(a.shape) + strides = [1] * a.ndim + + start_indices[axis] = start_index + limit_indices[axis] = limit_index + strides[axis] = stride + + return _slice_meta(a, start_indices, limit_indices, strides) + + +def _slice_in_dim_aten( + a: Tensor, + start_index: int, + limit_index: int, + stride: int = 1, + axis: int = 0, +) -> Tensor: + start_indices = [0] * a.ndim + limit_indices = list(a.shape) + strides = [1] * a.ndim + + start_indices[axis] = start_index + limit_indices[axis] = limit_index + strides[axis] = stride + + return slice(a, start_indices, limit_indices, strides) + + +_slice_in_dim_doc = """ + Convenience wrapper for slicing just one dimension using slice. 
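+ For example, slicing dimension 1 of a tensor of shape (4, 5) with start_index 0 and limit_index 2 produces a view of shape (4, 2).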
+ """ + +slice_in_dim = _make_prim( + schema="slice_in_dim(Tensor(a) a, int start_index, int limit_index, int stride=1, int axis=0) -> Tensor(a)", + meta=_slice_in_dim_meta, + impl_aten=_slice_in_dim_aten, + return_type=RETURN_TYPE.VIEW, + doc=_slice_in_dim_doc, +) + + +def _split_dim_meta(a: TensorLikeType, dim: int, outer_length: int) -> TensorLikeType: + assert isinstance(a, TensorLike) + utils.validate_idx(a.ndim, dim) + utils.validate_dim_length(outer_length) + + # Verifies the dim can be split with the specified lhs_length + _inner_length = a.shape[dim] / outer_length + inner_length: int = int(_inner_length) + + if inner_length != _inner_length: + msg = "Attempting to split dimension of length {0}, but outer length of {1} divides it with a remainder!".format( + a.shape[dim], outer_length + ) + raise ValueError(msg) + + new_shape: List[int] = [] + new_strides: List[int] = [] + for idx in range(a.ndim): + if idx == dim: + new_shape.extend((outer_length, inner_length)) + new_strides.extend((a.stride()[idx] * inner_length, a.stride()[idx])) + else: + new_shape.append(a.shape[idx]) + new_strides.append(a.stride()[idx]) + + return TensorMeta(a, shape=new_shape, strides=new_strides) + + +def _split_dim_aten(a: Tensor, dim: int, outer_length: int) -> Tensor: + inner_length = int(a.shape[dim] / outer_length) + new_shape = a.shape[0:dim] + (outer_length, inner_length) + a.shape[dim + 1 :] + + return a.view(new_shape) + + +_split_dim_doc = """ + Creates a view of a with the given dimension (of length l) split + into two dimensions, with the outer of the two having + length outer_length and the inner of the two having computed + length inner_length such outer_length * inner_length = l. + """ + +# TODO: consider renaming split_dim_view +split_dim = _make_prim( + schema="split_dim(Tensor(a) a, int dim, int outer_length) -> Tensor(a)", + meta=_split_dim_meta, + impl_aten=_split_dim_aten, + return_type=RETURN_TYPE.VIEW, + doc=_split_dim_doc, +) + +# Note: allows dimensions to be specified redundantly +def _squeeze_meta(a: TensorLikeType, dimensions: Sequence) -> TensorLikeType: + assert isinstance(a, TensorLike) + + for idx in dimensions: + utils.validate_idx(a.ndim, idx) + assert a.shape[idx] == 1 + + new_shape = [] + new_strides = [] + for idx in range(len(a.shape)): + if idx in dimensions: + continue + + new_shape.append(a.shape[idx]) + new_strides.append(a.stride()[idx]) + + return TensorMeta(a, shape=new_shape, strides=new_strides) + + +def _squeeze_aten(a: Tensor, dimensions: Sequence) -> Tensor: + squeezes = 0 + for idx in dimensions: + a = torch.squeeze(a, dim=(idx - squeezes)) + squeezes = squeezes + 1 + + return a + + +_squeeze_doc = """ + Creates a view of the tensor with the specified dimensions removed. + + The removed dimensions must each have length one. 
+ """ + +squeeze = _make_prim( + schema="squeeze(Tensor(a) a, int[] dimensions) -> Tensor(a)", + meta=_squeeze_meta, + impl_aten=_squeeze_aten, + return_type=RETURN_TYPE.VIEW, + doc=_squeeze_doc, +) + + +def _transpose_meta(a: TensorLikeType, permutation: DimsSequenceType) -> TensorLikeType: + if a.ndim != len(permutation): + msg = "Attempting to permute a tensor of rank {0}, but received a permutation of length {1}!".format( + a.ndim, len(permutation) + ) + raise ValueError(msg) + + if not utils.is_valid_permutation(a.ndim, permutation): + msg = "Received an invalid permutation, {0}!".format(permutation) + raise ValueError(msg) + + new_shape = [0] * a.ndim + new_strides = [0] * a.ndim + for idx, dim in enumerate(permutation): + new_shape[idx] = a.shape[dim] + new_strides[idx] = a.stride()[dim] + + return TensorMeta(a, shape=tuple(new_shape), strides=tuple(new_strides)) + + +def _transpose_aten(a: Tensor, permutation: DimsSequenceType) -> Tensor: + return torch.permute(a, permutation) + + +_transpose_doc = """ + Creates a view of the tensor with its dimensions permuted. + + The length of the permutation must be the rank of the tensor, + and each element of the permutation specifies the new order + for the corresponding dimension. + """ + +transpose = _make_prim( + schema="transpose(Tensor(a) a, int[] permutation) -> Tensor(a)", + meta=_transpose_meta, + impl_aten=_transpose_aten, + return_type=RETURN_TYPE.VIEW, + doc=_transpose_doc, +) + + +def _view_of_meta(a: TensorLikeType) -> TensorLikeType: + return TensorMeta(a) + + +def _view_of_aten(a: Tensor) -> Tensor: + return a.view(a.shape) + + +_view_of_doc = """ + Creates a view of the tensor. + """ + +view_of = _make_prim( + schema="view_of(Tensor(a) a) -> Tensor", + meta=_view_of_meta, + impl_aten=_view_of_aten, + return_type=RETURN_TYPE.VIEW, + doc=_view_of_doc, +) + +# +# Shape operations +# +def collapse(a: Tensor, start: int, end: int) -> Tensor: + """ + Wrapper around reshape that collapses a span of dimensions. + + See collapse_view for the corresponding view operation. + """ + + dim_length = 1 + for idx in range(start, end): + dim_length = dim_length * a.shape[idx] + + new_shape = a.shape[0:start] + (dim_length,) + a.shape[end:] + return reshape(a, new_shape) + + +# TODO: review stride logic +def _concatenate_meta(tensors: Sequence[TensorLikeType], dim: int) -> TensorLikeType: + if len(tensors) == 0: + msg = "concatenate expects at least one tensor, but received zero!" + raise ValueError(msg) + + for tensor in tensors: + assert isinstance(tensor, TensorLike) + + utils.check_same_dtype(*tensors) + utils.check_same_device(*tensors, allow_cpu_scalar_tensors=False) + + shape = tensors[0].shape + utils.validate_idx(tensors[0].ndim, dim) + + # Verifies same shape (except in the concat dimension) + concat_length = 0 + for tensor in tensors: + for idx, (common_length, length) in enumerate(zip(shape, tensor.shape)): + if idx == dim: + concat_length = concat_length + length + else: + assert length == common_length + + new_shape = list(tensors[0].shape).copy() + new_shape[dim] = concat_length + return TensorMeta( + tensors[0], + shape=new_shape, + strides=utils.make_contiguous_strides_for(new_shape), + ) + + +def _concatenate_aten( + tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: int +) -> Tensor: + return torch.cat(tensors, dim) + + +_concatenate_doc = """ + Concatenates tensors along the specified dimension. + + The tensors' shapes must have the same rank and same length for other dimensions. 
+ """ + +concatenate = _make_prim( + schema="concatenate(Tensor[] tensors, int dim) -> Tensor", + meta=_concatenate_meta, + impl_aten=_concatenate_aten, + return_type=RETURN_TYPE.NEW, + doc=_concatenate_doc, +) + + +def _reshape_meta(a: TensorLikeType, shape: ShapeType): + assert isinstance(a, TensorLike) + utils.validate_shape(shape) + + # Validates the tensor and the requested shape have the + # same number of elements + numel = reduce(operator.mul, shape) + if numel != a.numel(): + msg = "Attempting to reshape a tensor with {0} elements to a shape with {1} elements!".format( + a.numel(), numel + ) + raise ValueError(msg) + + return TensorMeta(a, shape=shape, strides=utils.make_contiguous_strides_for(shape)) + + +def _reshape_aten(a: Tensor, shape: ShapeType) -> Tensor: + return a.reshape(shape).contiguous().clone() + + +_reshape_doc = """ + Creates a contiguous tensor with the specified shape + containing a copy of the data in a. + """ +reshape = _make_prim( + schema="reshape(Tensor a, int[] shape) -> Tensor", + meta=_reshape_meta, + impl_aten=_reshape_aten, + return_type=RETURN_TYPE.NEW, + doc=_reshape_doc, +) + + +def _rev_meta(a: TensorLikeType, dims: DimsSequenceType) -> TensorLikeType: + utils.validate_dimension_indices(a.ndim, dims) + return TensorMeta(a) + + +_rev_doc = """ + Reverses the order of elements along the given dimensions. + """ + +rev = _make_prim( + schema="rev(Tensor a, int[] dims) -> Tensor", + meta=_rev_meta, + impl_aten=torch.flip, + return_type=RETURN_TYPE.NEW, + doc=_rev_doc, +) + +# +# Conditional prims +# + + +def _select_meta( + pred: TensorLikeType, a: TensorLikeType, b: TensorLikeType +) -> TensorLikeType: + utils.check_same_device(pred, a, b, allow_cpu_scalar_tensors=True) + utils.check_same_shape(pred, a, b, allow_cpu_scalar_tensors=True) + assert pred.dtype is torch.bool + + return _elementwise_meta( + a, b, type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT + ) + + +def _select_aten(pred: Tensor, a: Tensor, b: Tensor) -> Tensor: + return torch.where(pred, a, b) + + +_select_doc = """ + Selects elements from a and b according to pred. + + Where pred is true the result contains the element from a, and + where pred is false the result contains the element from b. + """ + +select = _make_prim( + schema="select(Tensor pred, Tensor a, Tensor b) -> Tensor", + meta=_select_meta, + impl_aten=_select_aten, + return_type=RETURN_TYPE.NEW, + doc=_select_doc, +) + +# +# Type conversions +# +# TODO: model memory format on TensorMeta +# TODO: make clone a reference following its implementation in TensorFactories.cpp +def _clone_meta( + a: TensorLikeType, *, memory_format: torch.memory_format +) -> TensorLikeType: + strides = utils.compute_elementwise_output_strides(a) + return TensorMeta(a, strides=strides) + + +def _clone_aten(a: Tensor, *, memory_format: torch.memory_format) -> Tensor: + return torch.clone(a, memory_format=memory_format) + + +_clone_doc = """ + Creates a copy of a tensors. 
+""" + +clone = _make_prim( + schema="clone(Tensor a, *, MemoryFormat memory_format) -> Tensor", + meta=_clone_meta, + impl_aten=_clone_aten, + return_type=RETURN_TYPE.NEW, + doc=_clone_doc, +) + + +def _convert_element_type_meta(a: TensorLikeType, dtype: torch.dtype) -> TensorLikeType: + # Type checks + assert isinstance(a, TensorLike) + assert isinstance(dtype, torch.dtype) + + strides = utils.compute_elementwise_output_strides(a) + + return TensorMeta(a, strides=strides, dtype=dtype) + + +def _convert_element_type_aten(a: Tensor, dtype: torch.dtype) -> Tensor: + # TODO: update meta objects so this can be acquired directly + try: + requires_grad = a.requires_grad + except Exception as e: + requires_grad = False + + result = empty_like(a, device=a.device, dtype=dtype, requires_grad=requires_grad) + with torch.no_grad(): + return copy_to(result, a) + + +def _convert_element_type_nvfuser(fd: Any, a: Tensor, dtype: torch.dtype) -> Tensor: + nvfuser_dtype = getnvFuserDtype(dtype) + return fd.Ops.cast(nvfuser_dtype, a) # type: ignore[attr-defined] + + +_convert_element_type_doc = """ + Creates a copy of a tensor with the given dtype. + """ + +convert_element_type = _make_prim( + schema="convert_element_type(Tensor a, ScalarType dtype) -> Tensor", + meta=_convert_element_type_meta, + impl_aten=_convert_element_type_aten, + impl_nvfuser=_convert_element_type_nvfuser, + return_type=RETURN_TYPE.NEW, + doc=_convert_element_type_doc, +) + + +def _device_put_meta( + a: TensorLikeType, device: Union[str, torch.device] +) -> TensorLikeType: + assert isinstance(a, TensorLike) + assert isinstance(device, (str, torch.device)) + + return TensorMeta(a, device=utils.wrap_device(device)) + + +def _device_put_aten(a: Tensor, device: Union[str, torch.device]) -> Tensor: + return a.to(device) + + +_device_put_doc = """ + Creates a copy of a tensor on the given device. + """ + +device_put = _make_prim( + schema="device_put(Tensor a, Device device) -> Tensor", + meta=_device_put_meta, + impl_aten=_device_put_aten, + return_type=RETURN_TYPE.NEW, + doc=_device_put_doc, +) + +# TODO: FIXME: strides are incorrect +def _to_dtype_meta(a: TensorLikeType, dtype: torch.dtype) -> TensorLikeType: + strides = utils.make_contiguous_strides_for(a.shape) + return TensorMeta(a, strides=strides, dtype=dtype) + + +def _to_dtype_aten(a: Tensor, dtype: torch.dtype) -> Tensor: + return a.to(dtype) + + +_to_dtype_doc = """ + Creates a contiguous copy of a tensor with the given dtype. +""" + +to_dtype = _make_prim( + schema=("to_dtype(Tensor a, ScalarType dtype) -> Tensor"), + meta=_to_dtype_meta, + impl_aten=_to_dtype_aten, + return_type=RETURN_TYPE.NEW, + doc=_to_dtype_doc, +) + +# +# Inplace operators +# + + +def _copy_to_meta(a: TensorLikeType, b: TensorLikeType): + assert isinstance(a, TensorLike) + assert isinstance(b, TensorLike) + + # Validates the cast is safe + # TODO: move this as an option on the reference + # a_typ = utils.dtype_to_type(a.dtype) + # b_typ = utils.dtype_to_type(b.dtype) + # if a_typ is not utils.get_higher_type(a_typ, b_typ): + # raise RuntimeError(str(b.dtype), " can't be cast safely to ", str(a.dtype), "!") + + # Validates the tensors have the same number of elements + if a.numel() != b.numel(): + msg = "Attempting to copy {0} elements to a tensor with {1} elements!".format( + b.numel(), a.numel() + ) + raise RuntimeError(msg) + + return a + + +def _copy_to_aten(a: Tensor, b: Tensor) -> Tensor: + return a.copy_(b) + + +_copy_to_doc = """ + Copies the data in b to a and returns the modified a. 
+ """ + +# TODO: Remove safe casting and implement on reference instead +copy_to = _make_prim( + schema="copy_to(Tensor(a!) a, Tensor b) -> Tensor(a!)", + meta=_copy_to_meta, + impl_aten=_copy_to_aten, + return_type=RETURN_TYPE.INPLACE, + doc=_copy_to_doc, +) + + +def _resize_meta( + a: TensorLikeType, shape: Union[torch.Size, List[int], Tuple[int, ...]] +): + return TensorMeta(a, shape=shape, strides=utils.make_contiguous_strides_for(shape)) + + +def _resize_aten(a: Tensor, shape: ShapeType) -> Tensor: + return a.resize_(shape) + + +_resize_doc = """ + Gives a tensor with no elements a new shape, returning the modified tensor. + + The tensor's strides are contiguous and its values are unitialized. + """ + +# TODO: review support arbitrary resizes +resize = _make_prim( + schema="resize(Tensor(a!) a, int[] shape) -> Tensor(a!)", + meta=_resize_meta, + impl_aten=_resize_aten, + return_type=RETURN_TYPE.INPLACE, + doc=_resize_doc, +) + + +def _reduction_meta(inp, dims, *, output_dtype=None): + """ + Meta function for single output reduction operations + Stride logic is incorrect + """ + assert isinstance(inp, TensorLike) + if output_dtype is None: + output_dtype = inp.dtype + output_shape = utils.compute_reduction_output_shape(inp.shape, dims) + return TensorMeta( + shape=output_shape, + strides=utils.make_contiguous_strides_for(output_shape), + dtype=output_dtype, + device=inp.device, + ) + + +def _bool_return_reduction_meta(inp, dims): + return _reduction_meta(inp, dims, output_dtype=torch.bool) + + +_sum_doc = """ + Computes the sum of elements in the input tensor over the list of dimensions + specified in the dim argument + """ +_amax_doc = """ + Computes the maximum value of elements in the input tensor over the list of dimensions + specified in the dim argument + """ +_amin_doc = """ + Computes the minimum value of elements in the input tensor over the list of dimensions + specified in the dim argument + """ + + +def _make_reduction_prim(name: str, impl_aten, doc): + """Creates a reduction prim.""" + return _make_prim( + schema=f"{name}(Tensor inp, int[]? dims, *, ScalarType? output_dtype=None) -> Tensor", + meta=_reduction_meta, + impl_aten=impl_aten, + return_type=RETURN_TYPE.NEW, + doc=doc, + ) + + +def _make_bool_reduction_prim(name: str, impl_aten, doc): + """Creates a reduction prim that reduces to bool.""" + return _make_prim( + schema=f"{name}(Tensor inp, int[]? dims, *, ScalarType? 
output_dtype=None) -> Tensor", + meta=_bool_return_reduction_meta, + impl_aten=impl_aten, + return_type=RETURN_TYPE.NEW, + doc=doc, + ) + + +sum = _make_reduction_prim( + name="sum", + impl_aten=torch.sum, + doc=_sum_doc, +) + +prod = _make_reduction_prim( + name="prod", + impl_aten=torch.prod, + doc=_sum_doc, # TODO: fixme +) + +amax = _make_reduction_prim( + name="amax", + impl_aten=torch.amax, + doc=_amax_doc, +) + +amin = _make_reduction_prim( + name="amin", + impl_aten=torch.amin, + doc=_amin_doc, +) + +all = _make_bool_reduction_prim( + name="all", + impl_aten=torch.all, + doc="", +) + +any = _make_bool_reduction_prim( + name="any", + impl_aten=torch.any, + doc="", +) + +# TODO: layout, pin_memory, memory_format +# TODO: model requires_grad on TensorMeta +def _empty_meta( + shape: ShapeType, *, dtype: torch.dtype, device: torch.device, requires_grad: bool +) -> TensorLikeType: + strides = utils.make_contiguous_strides_for(shape) + return TensorMeta(shape=shape, strides=strides, dtype=dtype, device=device) + + +def _empty_aten( + shape: ShapeType, *, dtype: torch.dtype, device: torch.device, requires_grad: bool +) -> Tensor: + return torch.empty(shape, dtype=dtype, device=device, requires_grad=requires_grad) + + +_empty_doc = """ + Creates a tensor with uninitialized values and the specified shape, dtype, and device. +""" + +empty = _make_prim( + schema="empty(int[] shape, *, ScalarType dtype, Device device, bool requires_grad) -> Tensor", + meta=_empty_meta, + impl_aten=_empty_aten, + return_type=RETURN_TYPE.NEW, + doc=_empty_doc, +) + +# TODO: memory format +def _empty_like_meta( + a: TensorLikeType, *, dtype: torch.dtype, device: torch.device, requires_grad: bool +) -> TensorLikeType: + strides: Tuple[int, ...] + if a.numel() == 0: + strides = a.stride() + else: + strides = utils.compute_elementwise_output_strides(a) + + return TensorMeta(a, strides=strides, dtype=dtype, device=device) + + +def _empty_like_aten( + a: Tensor, *, dtype: torch.dtype, device: torch.device, requires_grad: bool +) -> Tensor: + return torch.empty_like(a, dtype=dtype, device=device, requires_grad=requires_grad) + + +_empty_like_doc = """ + Creates a tensor with uninitialized values, and the same shape, dtype, and device as the + given tensor by default. The dtype and device settings can be overridden + by specifying them explicitly. +""" + +empty_like = _make_prim( + schema="empty_like(Tensor a, *, ScalarType dtype, Device device, bool requires_grad) -> Tensor", + meta=_empty_like_meta, + impl_aten=_empty_like_aten, + return_type=RETURN_TYPE.NEW, + doc=_empty_like_doc, +) + + +def _full_meta( + shape: ShapeType, + fill_value: NumberType, + *, + dtype: torch.dtype, + device: torch.device, + requires_grad: bool, +) -> TensorLikeType: + strides = utils.make_contiguous_strides_for(shape) + return TensorMeta(shape=shape, strides=strides, dtype=dtype, device=device) + + +def _full_aten( + shape: ShapeType, + fill_value: NumberType, + *, + dtype: torch.dtype, + device: torch.device, + requires_grad: bool, +) -> Tensor: + # Note that Mypy thinks torch.full can't accept a complex fill_value + return torch.full( + shape, fill_value, dtype=dtype, device=device, requires_grad=requires_grad # type: ignore[arg-type] + ) + + +_full_doc = """ + Creates a tensor filled with the given fill value, and with the specified shape, dtype, and device. 
+""" + +# TODO: add layout +full = _make_prim( + schema="full(int[] shape, Scalar fill_value, *, ScalarType dtype, Device device, bool requires_grad) -> Tensor", + meta=_full_meta, + impl_aten=_full_aten, + return_type=RETURN_TYPE.NEW, + doc=_full_doc, +) + + +def _full_like_meta( + a: TensorLikeType, + fill_value: NumberType, + *, + dtype: torch.dtype, + device: torch.device, + requires_grad: bool, +) -> TensorLikeType: + strides = strides = utils.compute_elementwise_output_strides(a) + if a.numel() == 0: + strides = a.stride() + + return TensorMeta(a, strides=strides, dtype=dtype, device=device) + + +def _full_like_aten( + a: Tensor, + fill_value: NumberType, + *, + dtype: torch.dtype, + device: torch.device, + requires_grad: bool, +) -> Tensor: + # Note that Mypy thinks torch.full can't accept a complex fill_value + return torch.full_like( + a, fill_value, dtype=dtype, device=device, requires_grad=requires_grad # type: ignore[arg-type] + ) + + +_full_like_doc = """ + Creates a tensor filled with the given fill value, and the same shape, dtype, and device as the + given tensor by default. The dtype and device settings can be overridden + by specifying them explicitly. +""" + +full_like = _make_prim( + schema="full_like(Tensor a, Scalar fill_value, *, ScalarType dtype, Device device, bool requires_grad) -> Tensor", + meta=_full_like_meta, + impl_aten=_full_like_aten, + return_type=RETURN_TYPE.NEW, + doc=_full_like_doc, +) diff --git a/torch/_prims/context.py b/torch/_prims/context.py new file mode 100644 index 000000000000..fee316ff3af9 --- /dev/null +++ b/torch/_prims/context.py @@ -0,0 +1,158 @@ +import string +from typing import Callable, Sequence, Any, Dict +from itertools import chain + + +import torch +from torch.fx.graph import Graph, Node +import torch.overrides + +from torch._prims.utils import TensorMeta +import torch._refs as refs + + +# TODO: automap torch operations to references +# (need to throw a good assertion if the mapping doesn't exist) +_torch_to_reference_map = { + torch.add: refs.add, + # torch.div: refs.div, + torch.mul: refs.mul, + torch.ge: refs.ge, + torch.gt: refs.gt, + torch.le: refs.le, + torch.lt: refs.lt, +} + + +class PrimContext(torch.overrides.TorchFunctionMode): + """ + The prototype prim tracing context. + + Example usage: + + import torch._prims.utils as utils + from torch._prims.context import PrimContext + from torch._prims.executor import execute + from torch.overrides import push_torch_function_mode + + a = torch.randn((2, 2)) + b = torch.randn((2, 2)) + + with push_torch_function_mode(PrimContext): + meta_a = ctx.placeholder(utils.TensorMeta(a)) + meta_b = ctx.placeholder(utils.TensorMeta(b)) + result = torch.add(meta_a, meta_b) + ctx.output(result) + + exc_result = execute(ctx, a, b) + + Currently this only acquires a trace of prims, and + it does not account for control flow. As such, + execute must be called with tensors that have the + same metadata (dtype, device, shape...) as + the tensors used to trace the operations. + + The tracing context's FX graph can be acquired + using its graph attribute. 
+ """ + + def __init__(self): + self.graph = Graph() + + # Private attributes for generating names + self._tensor_name_counter = 0 + self._dim_name_counter = 0 + self._shape_name_counter = 0 + self._lowercase = tuple(string.ascii_lowercase) + self._uppercase = tuple(string.ascii_uppercase) + + @staticmethod + def _create_name(idx, chars): + name = "" + while idx >= len(chars): + name = chars[idx % len(chars)] + name + idx = idx - len(chars) + name = chars[idx] + name + + return name + + def _tensor_name(self): + idx = self._tensor_name_counter + self._tensor_name_counter = self._tensor_name_counter + 1 + + return self._create_name(idx, self._lowercase) + + def _add_user(self, tm: TensorMeta, node: Node) -> None: + assert tm.node is not None + tm.node.users[node] = None + + def placeholder(self, a: Any): + name = self._tensor_name() + node = self.graph.placeholder(name) + + if isinstance(a, TensorMeta): + if a.node is not None: + raise ValueError("Attempting to reuse a TensorMeta in a new trace!") + a.tname = name + a.node = node + + return a + + def output(self, tm: TensorMeta): + # TODO: allow other output types + assert isinstance(tm, TensorMeta) + + node = self.graph.output(tm) + self._add_user(tm, node) + + def __torch_function__( + self, + func: Callable, + types: Sequence, + args: Sequence[Any] = (), + kwargs: Dict = None, + ): + """ + Determines which function to call. The order of which + function is called is determined by: + + - func's "meta" attribute, if it exists + - if func is a torch operation, its corresponding reference + - func + """ + + if kwargs is None: + kwargs = {} + + if hasattr(func, "meta"): + # TODO: add check that all args/kwargs are 'registered' properly + # to this trace + + output = func.meta(*args, **kwargs) # type: ignore[attr-defined] + + # Updates graph + # TODO: handle outputs with multiple tensors + # TODO: handle non-tensor outputs + assert isinstance(output, TensorMeta) + output_name = self._tensor_name() + node = self.graph.create_node( + "call_function", func, name=output_name, args=args, kwargs=kwargs + ) + output.tname = output_name + output.node = node + + # Marks uses + for x in ( + x for x in chain(args, kwargs.values()) if isinstance(x, TensorMeta) + ): + self._add_user(x, node) + + return output + + # Remaps torch operations to their references + if func in _torch_to_reference_map: + fn = _torch_to_reference_map[func] + with torch.overrides.enable_torch_function_mode(self, replace=self.inner): + return fn(*args, **kwargs) # type: ignore[operator] + + return func(*args, **kwargs) diff --git a/torch/_prims/executor.py b/torch/_prims/executor.py new file mode 100644 index 000000000000..4675b520ac1c --- /dev/null +++ b/torch/_prims/executor.py @@ -0,0 +1,113 @@ +from typing import Callable + +import torch + +from torch.fx import GraphModule +from torch._prims.utils import TensorMeta, getnvFuserDtype +from torch._prims.context import PrimContext +import torch.overrides + +if torch.cuda.is_available(): + from torch._C._nvfuser import Fusion, FusionDefinition # type: ignore[import] + + +def execute(ctx: PrimContext, *args, executor: str = "aten", **kwargs): + """ + Prototype ATen executor. + + Just executes the context's graph. + """ + + if executor == "aten": + gm = GraphModule({}, ctx.graph) + return gm.forward(*args, **kwargs) + elif executor == "nvfuser": + if not torch.cuda.is_available(): + raise RuntimeError( + "Attempting to use nvFuser trace executor but CUDA is not available!" 
+ ) + + # PROTOTYPE nvfuser executor + # Only accepts tensor inputs and single tensor outputs + # Does not handle kwargs + # Does not support reusing the same ctx to execute! + assert len(kwargs) == 0 + # TODO: make this a proper trace -> trace transform that + # doesn't mutate the context + graph_fd = ctx.graph.placeholder("fd") + ctx.graph._root.append(graph_fd) + + fusion = Fusion() + with FusionDefinition(fusion) as fd: + # Transforms graph to call nvfuser lowerings + nv_args = [fd] + for arg in args: + if isinstance(arg, torch.Tensor): + x = fd.define_tensor( + arg.size(), arg.stride(), getnvFuserDtype(arg.dtype) + ) + fd.add_input(x) + nv_args.append(x) + else: + nv_args.append(x) + + for x in ctx.graph.nodes: + if x.op == "call_function": + x.target = x.target.impl_nvfuser + x.args = (graph_fd,) + x.args + + gm = GraphModule({}, ctx.graph) + out = gm.forward(*nv_args) + fd.add_output(out) + + return fusion.execute( + tuple(arg for arg in args if isinstance(arg, torch.Tensor)) + )[0] + + msg = "Received unexpected value for 'executor': {0}. Allowed values are: aten, nvfuser.".format( + executor + ) + raise ValueError(msg) + + +def make_traced(fn: Callable): + """ + Returns a function that, when called, will + trace its torch operations to prims and then + execute those prims on the requested trace executor + (possibly lowering them to that trace executor first). + + Only supports the torch operations defined in _torch_to_reference_map + in context.py and operations with positional args. All args must + be tensors and the function must return a single tensor. In the + near future all these restrictions will be lifted. + + Example usage: + + def foo(a, b): + return torch.add(a, b) + + traced_foo = make_traced(foo) + + a = torch.randn((1, 2, 3, 4, 5), device='cuda') + b = torch.randn((1, 2, 3, 4, 5), device='cuda') + result = traced_foo(a, b, executor='nvfuser') + + Executor may be either 'aten' or 'nvfuser'. 
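+ The 'nvfuser' executor requires CUDA to be available.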
+ """ + + def _traced(*args, executor="aten"): + ctx: PrimContext + with torch.overrides.push_torch_function_mode(PrimContext) as ctx: # type: ignore[attr-defined, assignment] + placeholders = [] + for arg in args: + if isinstance(arg, torch.Tensor): + placeholders.append(ctx.placeholder(TensorMeta(arg))) + else: + placeholders.append(ctx.placeholder(arg)) + + result = fn(*placeholders) + ctx.output(result) + return execute(ctx, *args, executor=executor) + + return _traced diff --git a/torch/_prims/utils.py b/torch/_prims/utils.py new file mode 100644 index 000000000000..beb94b2069f4 --- /dev/null +++ b/torch/_prims/utils.py @@ -0,0 +1,1104 @@ +from __future__ import annotations + +from typing import Any, Union, Sequence, Optional, Callable, Dict, Tuple, List +from enum import Enum +from functools import reduce, cmp_to_key +import operator + +import torch + +# nvFuser imports are conditional on CUDA being available +if torch.cuda.is_available(): + from torch._C._nvfuser import DataType # type: ignore[import] + + _torch_dtype_to_nvfuser_dtype_map = { + torch.cdouble: DataType.ComplexDouble, + torch.cfloat: DataType.ComplexFloat, + torch.double: DataType.Double, + torch.float: DataType.Float, + torch.half: DataType.Half, + torch.bfloat16: DataType.BFloat16, + torch.long: DataType.Int, + torch.int: DataType.Int32, + torch.bool: DataType.Bool, + } +else: + _torch_dtype_to_nvfuser_dtype_map = {} + + +def getnvFuserDtype(dtype: torch.dtype): + """ + Translates from torch.dtype to nvFuser's DataType enum + """ + return _torch_dtype_to_nvfuser_dtype_map[dtype] + + +ShapeType = Union[torch.Size, List[int], Tuple[int, ...]] +StrideType = Union[List[int], Tuple[int, ...]] +DimsType = Union[int, List[int], Tuple[int, ...]] +DimsSequenceType = Union[List[int], Tuple[int, ...]] +NumberType = Union[bool, int, float, complex] +Number = (bool, int, float, complex) + + +class TensorMeta(torch.Tensor): + """ + Model tensor metadata. Not a stock meta tensor because device is modeled + as the original device (not meta device), also we have different behavior + for some high level Python bindings + """ + + # Note: this will be an fx Node if it's ever + # populated, but some Meta-internal jobs don't include fx + node: Optional[Any] + tname: str + + @staticmethod + def __new__( + cls, + tensorlike: Optional[Union[TensorMeta, NumberType, torch.Tensor]] = None, + *, + shape: Optional[ShapeType] = None, + strides: Optional[StrideType] = None, + dtype: Optional[torch.dtype] = None, + device: Optional[Union[torch.device, str]] = None, + ): + + if isinstance(tensorlike, Number): + assert not shape and (shape is None or isinstance(shape, Sequence)) + assert not strides and (strides is None or isinstance(strides, Sequence)) + inferred_shape: Tuple[int, ...] = () + inferred_strides: Tuple[int, ...] 
= () + inferred_dtype = type_to_dtype(type(tensorlike)) + inferred_device = torch.device("cpu") + # TODO: This looks wrong, a number that is wrapped into a tensor + # needs to behave differently than a scalar tensor for type + # promotion purposes + elif tensorlike is not None: + assert isinstance(tensorlike, (TensorMeta, torch.Tensor)) + inferred_shape = tuple(tensorlike.shape) + inferred_strides = tuple(tensorlike.stride()) + inferred_dtype = tensorlike.dtype + inferred_device = tensorlike.device + else: + # If no tensorlike "example" is given then all metadata + # must be provided explicitly + assert shape is not None + assert strides is not None + assert dtype is not None + assert device is not None + + shape = inferred_shape if shape is None else tuple(shape) + strides = inferred_strides if strides is None else tuple(strides) + dtype = inferred_dtype if dtype is None else dtype + device = inferred_device if device is None else device + + if isinstance(device, str): + device = torch.device(device) + + r = torch.Tensor._make_wrapper_subclass( # type: ignore[attr-defined] + cls, + shape, + strides=strides, + storage_offset=0, # TODO: this is inaccurate + dtype=dtype, + device=device, + requires_grad=False, + ) + + r.tname = "" + r.node = None + return r + + @classmethod + def __torch_function__( + cls, + func: Callable, + types: Sequence, + args: Sequence[Any] = (), + kwargs: Optional[Dict] = None, + ): + if kwargs is None: + kwargs = {} + + if func in { + torch.Tensor.ndim.__get__, # type: ignore[attr-defined] + torch.Tensor.numel, + torch.Tensor.stride, + torch.Tensor.dtype.__get__, # type: ignore[attr-defined] + torch.Tensor.shape.__get__, # type: ignore[attr-defined] + torch.Tensor.device.__get__, # type: ignore[attr-defined] + }: + return super().__torch_function__(func, types, args, kwargs) + + if not hasattr(func, "meta"): + raise ValueError(f"Callable {func} has no meta function!") + + return func.meta(*args, **kwargs) # type: ignore[attr-defined] + + @classmethod + def __torch_dispatch__( + cls, + func, + types, + args=(), + kwargs=None, + ): + raise RuntimeError("this should be unreachable") + + # TODO: fx uses dunder repr to print objects in code + def __repr__(self): + return self.tname + # return f"TensorMeta(dtype={self.dtype}, device={self.device}, shape={self.shape}, strides={self.stride()})" + + def __format__(self, format_spec): + return self.tname + + +TensorLikeType = Union[torch.Tensor, TensorMeta] +TensorLike = (torch.Tensor, TensorMeta) +TensorSequenceType = Union[List[TensorLikeType], Tuple[TensorLikeType, ...]] + + +# TODO: look at using torch.testing.assert_close instead with an option +# to just compare metadata +def compare_tensor_meta(a: TensorLikeType, b: TensorLikeType): + """ + Checks that two tensor likes have the same shape, + dtype and device. + + In the future this will validate additional metadata, like + strides. 
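+ Note that significant strides (strides of dimensions with length greater than one) are already compared when either tensor is on a CUDA device.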
+ """ + assert isinstance(a, TensorLike) + assert isinstance(b, TensorLike) + + for x, y in zip(a.shape, b.shape): + if x != y: + msg = "Shapes {0} and {1} are not equal!".format(a.shape, b.shape) + raise AssertionError(msg) + + if a.dtype != b.dtype: + msg = "Dtypes {0} and {1} are not equal!".format(a.dtype, b.dtype) + raise AssertionError(msg) + + if a.device != b.device: + # Handles special cuda:0 vs cuda case + # TODO: we should review why this happens and see about fixing it + if (str(a.device) == "cuda:0" or str(a.device) == "cuda") and ( + str(b.device) == "cuda:0" or str(b.device) == "cuda" + ): + pass + else: + msg = "Devices {0} and {1} are not equal!".format(a.device, b.device) + raise AssertionError(msg) + + same_strides, idx = check_significant_strides(a, b) + if not same_strides: + msg = "Stride mismatch! Strides are {0} and {1} (mismatched at {2})!".format( + a.stride(), b.stride(), idx + ) + raise RuntimeError(msg) + + +def check_significant_strides( + a: TensorLikeType, b: TensorLikeType +) -> Tuple[bool, Optional[int]]: + # NOTE: only on CUDA because CPU elementwise strides are incorrect in PyTorch + # See https://github.com/pytorch/pytorch/issues/77553 + # Only compares strides that are "meaningful" -- strides for dimensions with length > 1 + # and for tensors with more than one element + if (a.device.type == "cuda" or b.device.type == "cuda") and a.numel() > 0: + for idx in range(a.ndim): + if a.stride()[idx] != b.stride()[idx] and a.shape[idx] > 1: + return False, idx + + return True, None + + +def is_contiguous(a: TensorLikeType) -> bool: + """ + Tests whether a tensor is contiguous or not. + + Tensors are contiguous when they have no elements, + or when they have "nested" strides. + """ + if a.numel() == 0: + return True + + expected_stride = 1 + for x, y in reversed(tuple(zip(a.shape, a.stride()))): + # Skips checking strides when a dimension has length 1 + if x == 1: + continue + + if y != expected_stride: + return False + expected_stride = expected_stride * x + + return True + + +# NOTE: Based on the implementation in TensorIterator.cpp, but note that +# the note [Computing output strides] is incorrect, because it +# says that strides will be preserved even if they are not +# "non overlapping and dense", but this is incorrect. The +# output of elementwise operations are always given +# non overlapping and dense strides. +# This is also INCORRECT because it does not model TensorIterator's +# short-circuit, which can cause different strides. +def compute_elementwise_output_strides(*tensors) -> Tuple[int, ...]: + """ + Computes the output strides for elementwise operations. + """ + + if len(tensors) == 0: + msg = "Can't compute elementwise output strides for zero tensors!" + raise ValueError(msg) + + check_same_shape(*tensors, allow_cpu_scalar_tensors=True) + + # Filters the tensors to actual tensors + all_tensors = all(isinstance(a, TensorLike) for a in tensors) + tensors = tuple( + a for a in tensors if isinstance(a, TensorLike) and not is_cpu_scalar_tensor(a) + ) + + # Short-circuits for CPU scalar case + if len(tensors) == 0: + return () + + # Short-circuits for shapes with zero or one dimensions + # TODO: are these necessary? 
+ ndim = tensors[0].ndim + if ndim == 0: + return () + if ndim == 1: + return (1,) + + shape = tensors[0].shape + + def _cmp(idx_a, idx_b): + for tensor in tensors: + stride_a = tensor.stride()[idx_a] + stride_b = tensor.stride()[idx_b] + + if stride_a == 0 or stride_b == 0: + continue + + if stride_a < stride_b: + return -1 + + if stride_a > stride_b: + return 1 + + # stride_a == stride_b + if shape[idx_a] > shape[idx_b]: + return 1 + + # NOTE: this case is missing in the C++ impl + if shape[idx_a] < shape[idx_b]: + return -1 + + # Note: this case is hit if all strides are zero, + # or all strides are equal and all dimensions have the same length + return 0 + + perm = tuple(range(ndim)) + perm = tuple(sorted(perm, key=cmp_to_key(_cmp), reverse=True)) + + permuted_shape = [-1] * ndim + for idx, x in enumerate(perm): + permuted_shape[idx] = shape[x] + + new_strides = make_contiguous_strides_for(permuted_shape) + # print(f"new_strides is {new_strides}") + # print(f"shape is {shape}") + # print(f"permuted_shape is {permuted_shape}") + permuted_strides = [-1] * ndim + for idx, x in enumerate(perm): + permuted_strides[x] = new_strides[idx] + + return tuple(permuted_strides) + + +# +# Common helper functions +# + + +def validate_dim_length(length: int): + """ + Validates that an object represents a valid + dimension length. + """ + + assert isinstance(length, int) + assert length >= 0 + + +def validate_shape(shape: ShapeType): + """ + Validates that a sequence represents a valid shape. + """ + + assert isinstance(shape, Sequence) + for l in shape: + validate_dim_length(l) + + +def validate_strides(strides: StrideType): + """ + Verifies the object specifies valid strides. + """ + + assert isinstance(strides, Sequence) + for stride in strides: + assert stride >= 0 + + +def validate_idx(rank: int, idx: int): + """ + Validates that idx is a valid index for the given shape. + Assumes the index is already canonicalized. + """ + + assert isinstance(idx, int) + assert isinstance(rank, int) + + assert idx >= 0 and idx < rank or idx == 0 + + +def validate_dimension_indices(rank: int, indices: DimsSequenceType): + for idx in indices: + validate_idx(rank, idx) + + +def validate_exclusive_idx(rank: int, ex_idx: int): + """ + Validates that ex_idx is a valid exclusive index + for the given shape. + """ + + assert isinstance(ex_idx, int) + assert isinstance(rank, int) + assert ex_idx > 0 and ex_idx <= rank + + +# "Wraps" a dim (up to one time) for the given rank, allowing +# dims to be specified using negative indices +def canonicalize_dim(rank: int, idx: int) -> int: + # TODO: add a comment for why this is + _rank = rank if rank != 0 else 1 + + if idx >= 0 and idx < _rank: + return idx + + if idx < 0: + _idx = idx + _rank + else: + _idx = idx + + if _idx < 0 or _idx > _rank: + msg = "Received out of bounds index {0} for tensor of rank {1}!".format( + idx, rank + ) + raise ValueError(msg) + + return _idx + + +# Takes a dimension or sequence of dimensions and "wraps" them, +# mapping negative offsets to positive ones +def canonicalize_dims(rank: int, indices: DimsType) -> DimsType: + if isinstance(indices, int): + return canonicalize_dim(rank, indices) + + return tuple(canonicalize_dim(rank, x) for x in indices) + + +def is_valid_permutation(rank: int, perm: DimsSequenceType) -> bool: + """ + Validates that perm is a permutation of length rank. 
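+ For example, (2, 0, 1) is a valid permutation for rank 3, while (0, 0, 1) is not.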
+ """ + + if not isinstance(perm, Sequence): + return False + + if not (tuple(sorted(perm)) == tuple(range(0, rank))): + return False + + return True + + +def is_same_shape(a: Sequence, b: Sequence) -> bool: + """ + Compares two shapes a and b, returning True if they are the same + (their ranks and corresponding lengths match) and False otherwise. + """ + + return tuple(a) == tuple(b) + + +def is_cpu_scalar_tensor(a: Any) -> bool: + return isinstance(a, TensorLike) and a.ndim == 0 and a.device.type == "cpu" + + +def check_same_device(*args, allow_cpu_scalar_tensors): + """ + Checks that all Tensors in args have the same device. + + Raises a RuntimeError when: + - args contains an object whose type is not Tensor or Number + - two Tensor objects in args have different devices, unless one is a CPU scalar tensor and allow_cpu_scalar_tensors is True + """ + # Short-circuits if all (one or fewer) arguments are trivially on the same device + if len(args) <= 1: + return + + # Note: cannot initialize device to the first arg's device (it may not have one) + device = None + for arg in args: + if isinstance(arg, Number): + continue + elif isinstance(arg, TensorLike): + if allow_cpu_scalar_tensors and is_cpu_scalar_tensor(arg): + continue + + if device is None: + device = arg.device + + if device != arg.device: + msg = ( + "Tensor on device " + + str(arg.device) + + " is not on the expected device " + + str(device) + + "!" + ) + raise RuntimeError(msg) + else: + msg = ( + "Unexpected type when checking for same device, " + str(type(arg)) + "!" + ) + raise RuntimeError(msg) + + +# Asserts if any of the following are true: +# - a non-scalar or non-Tensor is given +# - the shape of any tensors is distinct +def check_same_shape(*args, allow_cpu_scalar_tensors): + """ + Checks that all Tensors in args have the same shape. + + Raises a RuntimeError when: + - args contains an object whose type is not Tensor or Number + - two Tensor objects in args have different devices + """ + shape = None + + for arg in args: + if isinstance(arg, Number): + continue + elif isinstance(arg, TensorLike): + if allow_cpu_scalar_tensors and is_cpu_scalar_tensor(arg): + continue + + if shape is None: + shape = arg.shape + + if not is_same_shape(shape, arg.shape): + msg = "Shape {0} is not the expected shape {1}!".format( + arg.shape, shape + ) + raise RuntimeError(msg) + else: + msg = ( + "Unexpected type when checking for same shape, " + str(type(arg)) + "!" 
+ ) + raise RuntimeError(msg) + + +_integer_dtypes = (torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64) +_float_dtypes = (torch.float16, torch.bfloat16, torch.float32, torch.float64) +_complex_dtypes = (torch.complex32, torch.complex64, torch.complex128) + + +def is_boolean_dtype(dtype: torch.dtype) -> bool: + assert isinstance(dtype, torch.dtype) + return dtype is torch.bool + + +def is_integer_dtype(dtype: torch.dtype) -> bool: + assert isinstance(dtype, torch.dtype) + return dtype in _integer_dtypes + + +def is_float_dtype(dtype: torch.dtype) -> bool: + assert isinstance(dtype, torch.dtype) + return dtype in _float_dtypes + + +def is_complex_dtype(dtype: torch.dtype) -> bool: + assert isinstance(dtype, torch.dtype) + return dtype in _complex_dtypes + + +_complex_to_real_dtype_map = { + torch.complex128: torch.float64, + torch.complex64: torch.float32, + torch.complex32: torch.float16, +} + +_real_to_complex_dtype_map = { + torch.float16: torch.complex32, + torch.bfloat16: torch.complex64, + torch.float32: torch.complex64, + torch.float64: torch.complex128, +} + + +def corresponding_real_dtype(dtype: torch.dtype) -> torch.dtype: + return _complex_to_real_dtype_map[dtype] + + +def corresponding_complex_dtype(dtype: torch.dtype) -> torch.dtype: + return _real_to_complex_dtype_map[dtype] + + +def dtype_to_type(dtype: torch.dtype) -> type: + """ + Computes the corresponding Python type (AKA "type kind") for the + given dtype. + """ + assert isinstance(dtype, torch.dtype) + + if dtype is torch.bool: + return bool + if dtype in _integer_dtypes: + return int + if dtype in _float_dtypes: + return float + if dtype in _complex_dtypes: + return complex + + raise ValueError("Invalid dtype!") + + +_type_to_dtype_map = { + bool: torch.bool, + int: torch.int64, + float: torch.float64, + complex: torch.complex128, +} + + +def type_to_dtype(typ: type) -> torch.dtype: + """ + Computes the corresponding dtype for a Number type. + """ + return _type_to_dtype_map[typ] + + +_ordered_types = (bool, int, float, complex) + + +def get_higher_type(a: type, b: type) -> type: + """ + Returns the higher of the two given Number types. + + The types are ordered bool -> int -> float -> complex. + """ + # Type checking + assert a in _ordered_types + assert b in _ordered_types + + if a is b: + return a + + for typ in _ordered_types: + if a is typ: + return b + if b is typ: + return a + + raise ValueError("Unknown Python scalar type!") + + +# Returns the higher of two torch datatypes a and b or, if the two +# are not ordered relative to each other, the next +# higher datatype +def get_higher_dtype( + a: Optional[Union[torch.dtype, TensorLikeType, NumberType]], + b: Optional[Union[torch.dtype, TensorLikeType, NumberType]], +) -> Optional[torch.dtype]: + """ + Computes the "lowest" datatype that is weakly + "higher" than both a and b. 
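+ For example, torch.float16 and torch.bfloat16 are not ordered relative to each other, so for that pair the result is torch.float32.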
+ """ + + # Type checking + assert a is None or isinstance(a, (torch.dtype, TensorLike, Number)) + assert b is None or isinstance(b, (torch.dtype, TensorLike, Number)) + + def _extract_dtype( + x: Optional[Union[torch.dtype, TensorLikeType, NumberType]] + ) -> Optional[torch.dtype]: + if x is None: + return None + if isinstance(x, torch.dtype): + return x + if isinstance(x, TensorLike): + return x.dtype + if isinstance(x, Number): + return type_to_dtype(type(x)) + + raise RuntimeError("Unexpected type given to _extract_dtype!") + + a, b = _extract_dtype(a), _extract_dtype(b) + + if a is b: + return a + + if a is None: + return b + + if b is None: + return a + + ordered_datatypes = ( + (torch.bool,), + (torch.uint8, torch.int8), + (torch.int16,), + (torch.int32,), + (torch.int64,), + (torch.float16, torch.bfloat16), + (torch.float32,), + (torch.float64,), + (torch.complex32,), + (torch.complex64,), + (torch.complex128,), + ) + + for idx, dtypes in enumerate(ordered_datatypes): + if a in dtypes and b in dtypes: + return ordered_datatypes[idx + 1][0] + if a in dtypes: + return b + if b in dtypes: + return a + + raise RuntimeError("Unexpected termination!") + + +# TODO: maybe unify with can_cast_to? +def is_weakly_lesser_type(a: type, b: type) -> bool: + """ + Compares two types, a and b, returning True if a is weakly "less" than b. + + The comparison is determined by the following type ordering: bool, int, float, complex. + """ + ordered_types = ( + bool, + int, + float, + complex, + ) + + assert a in ordered_types + assert b in ordered_types + + for typ in ordered_types: + if a == typ: + return True + if b == typ: + return False + + raise RuntimeError("Unexpected termination!") + + +def can_safe_cast_to(*, cast_to: torch.dtype, cast_from: torch.dtype) -> bool: + for fn in (is_complex_dtype, is_float_dtype, is_integer_dtype, is_boolean_dtype): + if fn(cast_to): + return True + if fn(cast_from): + return False + + raise ValueError("Received unknown dtypes {0}, {1}!".format(cast_to, cast_from)) + + +def check_same_dtype(*args): + """ + Checks that all Tensors in args have the same device and that all Numbers have the + same corresponding Python type. + + Raises a RuntimeError when: + - args contains an object whose type is not Tensor or Number + - two Tensors objects in args have different dtypes + - two Number objects in args have different types + - there are Tensors and Numbers in args, and one of those Tensors corresponding + Python types is different from the type of one of those Numbers + """ + full_dtype = None + scalar_type = None + + for arg in args: + if isinstance(arg, Number): + # Scalar type checking is disabled (and may be removed in the future) + continue + # if scalar_type is None: + # scalar_type = type(arg) + + # if scalar_type is not type(arg): + # msg = ( + # "Scalar of type " + # + str(type(arg)) + # + " is not the expected type of " + # + str(scalar_type) + # + "!" + # ) + # raise RuntimeError(msg) + elif isinstance(arg, TensorLike): + if full_dtype is None: + full_dtype = arg.dtype + if scalar_type is None: + scalar_type = dtype_to_type(arg.dtype) + + if full_dtype is not arg.dtype: + msg = ( + "Tensor with dtype " + + str(arg.dtype) + + " is not the expected dtype of " + + str(full_dtype) + + "!" + ) + raise RuntimeError(msg) + + arg_type = dtype_to_type(arg.dtype) + if arg_type is not scalar_type: + msg = ( + "Tensor with corresponding Python type " + + str(arg_type) + + " is not the expected type of " + + str(scalar_type) + + "!" 
+ ) + raise RuntimeError(msg) + else: + msg = ( + "Unexpected type when checking for same dtype, " + str(type(arg)) + "!" + ) + raise RuntimeError(msg) + + +# Maps datatypes to their computation types for elementwise operations +_computation_dtype_map = { + torch.bfloat16: torch.float32, + torch.float16: torch.float32, + torch.complex32: torch.complex64, +} + + +def _get_computation_dtype(dtype: torch.dtype) -> torch.dtype: + return _computation_dtype_map.get(dtype, dtype) + + +class ELEMENTWISE_TYPE_PROMOTION_KIND(Enum): + DEFAULT = (0,) + NO_OPMATH = (1,) + INT_TO_FLOAT = (2,) + ALWAYS_BOOL = (3,) + COMPLEX_TO_FLOAT = (4,) + BOOL_TO_LONG = (5,) + + +# TODO: document type promotion kinds +def elementwise_dtypes( + *_args, + type_promotion_kind: ELEMENTWISE_TYPE_PROMOTION_KIND, +) -> Tuple[torch.dtype, torch.dtype]: + """ + Computes the computation and result dtypes for elementwise type promotion + on the given arguments and with the given elementwise type promotion kind. + + Note that not all inputs to an elementwise operation necessarily participate in type promotion. + For example, the "alpha" parameter of torch.add does not participate in type promotion, + although it may be cast to the Python type corresponding to the computation dtype that + the type promotion algorithm determines. + + Default elementwise type promotion, which all other type promotion kinds tweak (see below), + first decides which of four ordered types to use: + + bool -> integer -> floating point -> complex + + The selected type is the "lowest" type in the above list such that all number arguments + have a weakly "lower" type and all tensor arguments have a weakly lower corresponding + type for their dtype. + + Once the type is determined, the particular result dtype is found. 
The dtypes are + partially ordered as follows: + + bool -> uint8, int8 -> int16 -> int32 -> int64 -> + float16, bfloat16 -> float32 -> float64 -> complex32 -> complex64 -> complex128 + + The result dtype is selected by: + - if no tensor's dtype has the same corresponding type as the one selected, + then the result dtype is the (default) dtype corresponding to the selected type + (for example, 1.5 + an integer tensor has a result dtype of the default floating point dtype) + - if the result type is complex then the dtype is: + - the default complex dtype if there are no floating point or complex tensors + - if there are floating point or complex tensors with one or more dimensions, then + the complex dtype corresponding to the highest corresponding complex dtype among those tensors + (for example, double + cfloat -> cdouble) + - if there are only floating point or complex tensors with zero dimensions, then + the complex dtype corresponding to the highest corresponding complex dtype among those tensors + - if the first two cases do not apply, the result dtype is the highest dtype among + all tensors with one or more dimensions of the output type, and if there are no such + tensors then it's the highest dtype among all tensors with zero dimensions of the output type + (for example, long + half -> half, even if the half tensor has zero dimensions) + + The "corresponding complex dtypes" are: + float16 -> complex32 + bfloat16 -> complex64 + float32 -> complex64 + float64 -> complex128 + complex32 -> complex32 + complex64 -> complex64 + complex128 -> complex128 + + The DEFAULT type promotion kind computes per above, and then uses the result dtype to pick a computation + dtype by mapping low precision floating point and complex dtypes as follows: + + float16 -> float32 + bfloat16 -> float32 + complex32 -> complex64 + + This is referred to as "op math", and the NO_OPMATH type promotion kind disables this mapping, making the + computation dtype the same as the result dtype when it's selected. NO_OPMATH is appropriate for kernels + which perform no mathematical operations on their tensors (see below for examples). + + The INT_TO_FLOAT type promotion kind maps boolean and integer maps result dtypes to the default floating point dtype, + and computation dtypes to the appropriate op math dtype. + + The COMPLEX_TO_FLOAT type promotion kind maps complex result dtypes to the corresponding float dtype, following this + mapping: + + complex32 -> float16 + complex64 -> float32 + complex128 -> float64 + + Note that COMPLEX_TO_FLOAT derives the computation dtype as the DEFAULT setting does. + + The BOOL_TO_LONG type promotion kind maps boolean computation and result dtypes to long. + + The ALWAYS_BOOL type promotion kind always sets the result dtype to bool. 
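+ For example, under DEFAULT type promotion, adding a float16 tensor and a Python float gives a float32 computation dtype and a float16 result dtype.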
+ + Example operators for each type promotion option: + DEFAULT : add + NO_OPMATH : where, nextafter, cat + INT_TO_FLOAT : sin + COMPLEX_TO_FLOAT : abs + BOOL_TO_LONG : pow + ALWAYS_BOOL : eq + + """ + + args = tuple(x for x in _args if x is not None) + + highest_type: type = bool + for x in args: + if not isinstance(x, (Number, TensorLike)): + msg = ( + "Unexpected type {0} when computing elementwise type promotion!".format( + str(type(x)) + ) + ) + raise ValueError(msg) + + if isinstance(x, Number): + highest_type = get_higher_type(highest_type, type(x)) + else: + # x is a TensorLike + highest_type = get_higher_type(highest_type, dtype_to_type(x.dtype)) + + result_dtype = None + + def _find_highest_dtype_filtered( + args, filter, *, float_as_complex=False, all_tensors_equal=False + ) -> Optional[torch.dtype]: + zero_dim_tensor_dtype = None + one_plus_dim_tensor_dtype = None + for x in args: + if isinstance(x, TensorLike) and filter(x.dtype): + _dtype = x.dtype + if float_as_complex and is_float_dtype(_dtype): + _dtype = corresponding_complex_dtype(_dtype) + if x.ndim == 0 and not all_tensors_equal: + zero_dim_tensor_dtype = get_higher_dtype( + zero_dim_tensor_dtype, _dtype + ) + else: + # x.ndim > 0 or all_tensors_equal + one_plus_dim_tensor_dtype = get_higher_dtype( + one_plus_dim_tensor_dtype, _dtype + ) + + # Prefers dtype of tensors with one or more dimensions + if one_plus_dim_tensor_dtype is not None: + return one_plus_dim_tensor_dtype + + return zero_dim_tensor_dtype + + if highest_type is float: + result_dtype = _find_highest_dtype_filtered(args, is_float_dtype) + result_dtype = ( + torch.get_default_dtype() if result_dtype is None else result_dtype + ) + elif highest_type is complex: + # NOTE: complex x float type promotion is incorrectly implemented in PyTorch today + # it will treat zero dim and non-zero-dim float and complex tensors equally + # unless there's a non-zero-dim complex tensor + # the following captures this oddity + has_one_plus_dim_complex_tensor = False + for x in args: + if isinstance(x, TensorLike) and x.ndim > 0 and is_complex_dtype(x.dtype): + has_one_plus_dim_complex_tensor = True + break + + if has_one_plus_dim_complex_tensor: + result_dtype = _find_highest_dtype_filtered( + args, + lambda x: is_float_dtype(x) or is_complex_dtype(x), + float_as_complex=True, + ) + else: + # no complex tensors of rank 1+ + # NOTE: bugged case where all tensors are equal + result_dtype = _find_highest_dtype_filtered( + args, + lambda x: is_float_dtype(x) or is_complex_dtype(x), + float_as_complex=True, + all_tensors_equal=True, + ) + + if result_dtype is None: + result_dtype = corresponding_complex_dtype(torch.get_default_dtype()) + elif highest_type is int: + result_dtype = _find_highest_dtype_filtered(args, is_integer_dtype) + result_dtype = torch.long if result_dtype is None else result_dtype + else: + # highest_type is bool + result_dtype = torch.bool + + if type_promotion_kind is ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT: + return _get_computation_dtype(result_dtype), result_dtype + elif type_promotion_kind is ELEMENTWISE_TYPE_PROMOTION_KIND.NO_OPMATH: + return result_dtype, result_dtype + elif type_promotion_kind is ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT: + if is_integer_dtype(result_dtype) or is_boolean_dtype(result_dtype): + result_dtype = torch.get_default_dtype() + return _get_computation_dtype(result_dtype), result_dtype + elif type_promotion_kind is ELEMENTWISE_TYPE_PROMOTION_KIND.COMPLEX_TO_FLOAT: + # NOTE: computation can still occur in a complex dtype + 
computation_dtype = _get_computation_dtype(result_dtype) + if is_complex_dtype(result_dtype): + result_dtype = corresponding_real_dtype(result_dtype) + return computation_dtype, result_dtype + elif type_promotion_kind is ELEMENTWISE_TYPE_PROMOTION_KIND.BOOL_TO_LONG: + if is_boolean_dtype(result_dtype): + return torch.long, torch.long + return _get_computation_dtype(result_dtype), result_dtype + elif type_promotion_kind is ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL: + return _get_computation_dtype(result_dtype), torch.bool + else: + raise ValueError( + "Unknown type promotion kind {0}".format(str(type_promotion_kind)) + ) + + +def wrap_device(d: Union[str, torch.device]) -> torch.device: + """ + Wraps strings into torch.device objects. + + Given torch.device objects are returned unmodified. + """ + + assert isinstance(d, (str, torch.device)) + if isinstance(d, str): + return torch.device(d) + + return d + + +def make_contiguous_strides_for(shape: ShapeType) -> Tuple[int, ...]: + validate_shape(shape) + if not shape: + return () + + multiplier = 1 + strides = [] + for l in reversed(shape): + if l != 0: + strides.append(multiplier) + multiplier = l * multiplier + else: + strides.append(multiplier) + + result = tuple(reversed(strides)) + return result + + +def compute_reduction_output_shape( + shape: ShapeType, dimensions: Sequence +) -> Tuple[int, ...]: + for idx in dimensions: + validate_idx(len(shape), idx) + + new_shape = [] + for idx in range(len(shape)): + if idx in dimensions: + continue + + new_shape.append(shape[idx]) + + return tuple(new_shape) + + +def validate_no_repeating_dims(dims: Sequence): + if len(dims) != len(set(dims)): + raise RuntimeError("duplicate value in the list of dims") + + +def reduction_dims(shape: ShapeType, dims: Optional[Sequence]) -> Tuple[int, ...]: + if dims is None: + return tuple(range(len(shape))) + dims = tuple(canonicalize_dim(len(shape), idx) for idx in dims) + validate_no_repeating_dims(dims) + return dims + + +def check_in_bounds_for_storage( + a: torch._TypedStorage, shape: ShapeType, strides: StrideType, storage_offset: int +): + """ + Determines if the given shape, strides, and offset are valid for the given storage. 
+ """ + + # Short-circuits if the shape has no elements + if reduce(operator.mul, shape) == 0: + return + + length = a.size() - storage_offset + max_offset = 0 + for x, y in zip(shape, strides): + max_offset = max_offset + (x - 1) * y + + if max_offset >= length: + required_length = max_offset + storage_offset + msg = ( + "Can't view a storage of size {0} with an offset of {1}, shape of {2}, and strides of {3}, " + "which requires a storage of size {4}".format( + a.size(), storage_offset, str(shape), str(strides), required_length + ) + ) + raise ValueError(msg) diff --git a/torch/_prims/wrappers.py b/torch/_prims/wrappers.py new file mode 100644 index 000000000000..a4c358954fec --- /dev/null +++ b/torch/_prims/wrappers.py @@ -0,0 +1,195 @@ +import torch +import torch._prims as prims +from torch._prims.utils import ( + Number, + NumberType, + TensorLike, + TensorLikeType, + ELEMENTWISE_TYPE_PROMOTION_KIND, +) +import torch._prims.utils as utils +from torch.utils._pytree import tree_flatten + +from typing import Callable, Sequence, Union +import inspect +from functools import wraps, reduce +import operator +import warnings +from itertools import chain + +# TODO: implement ref.cast with an option to enforce safe casting +def _maybe_convert_to_dtype( + a: Union[TensorLikeType, NumberType, Sequence], dtype: torch.dtype +) -> Union[TensorLikeType, NumberType, Sequence]: + if isinstance(a, TensorLike): + if a.dtype != dtype: + # NOTE: this is incorrect on the CPU + # See https://github.com/pytorch/pytorch/issues/77553 + return prims.convert_element_type(a, dtype) + return a + if isinstance(a, Number): + return utils.dtype_to_type(dtype)(a) + if isinstance(a, Sequence): + return tuple(_maybe_convert_to_dtype(x, dtype) for x in a) + + raise ValueError( + "Received type {0} that is neither a tensor or a number!".format(type(a)) + ) + + +def _maybe_convert_to_type(a: NumberType, typ: type) -> NumberType: + if not isinstance(a, Number): + msg = "Found unknown type {0} when trying to convert scalars!".format(type(a)) + raise ValueError(msg) + if not utils.is_weakly_lesser_type(type(a), typ): + msg = "Scalar {0} of type {1} cannot be safely cast to type {2}!".format( + a, type(a), typ + ) + raise ValueError(msg) + + return typ(a) + + +def _annotation_has_type(*, typ, annotation): + if hasattr(annotation, "__args__"): + for a in annotation.__args__: + if _annotation_has_type(typ=typ, annotation=a): + return True + return False + + return typ is annotation + + +class elementwise_type_promotion_wrapper(object): + """ + Adds elementwise type promotion to a Python reference implementation. + + Takes two kwargs, type_promoting_args and type_promotion_kind. + + type_promoting_args must be a string Sequence specifiying the argument names of all + arguments that participate in type promotion (and should be type promoted). If the + arg specifies a Sequence-type then every element of the Sequence will participate in + type promotion. + + type_promotion_kind must be one of the kinds specified by ELEMENTWISE_TYPE_PROMOTION_KIND. + See its documentation for details. + + Other type promotion behavior, like validating the Python type of scalar arguments, must + be handled separately. 
+ """ + + def __init__( + self, + *, + type_promotion_kind: ELEMENTWISE_TYPE_PROMOTION_KIND, + type_promoting_args: Sequence[str] = None, + ): + self.type_promoting_arg_names = type_promoting_args + self.type_promotion_kind = type_promotion_kind + + def __call__(self, fn: Callable) -> Callable: + sig = inspect.signature(fn) + + @wraps(fn) + def _fn(*args, **kwargs): + bound = sig.bind(*args, **kwargs) + type_promoting_args = tuple( + bound.arguments[x] + for x in self.type_promoting_arg_names # type: ignore[union-attr] + if x in bound.arguments.keys() + ) + + flattened_type_promoting_args = tree_flatten(type_promoting_args)[0] + compute_dtype, result_dtype = utils.elementwise_dtypes( + *flattened_type_promoting_args, + type_promotion_kind=self.type_promotion_kind, + ) + + promoted_args = { + x: _maybe_convert_to_dtype(bound.arguments[x], compute_dtype) + for x in self.type_promoting_arg_names # type: ignore[union-attr] + if x in bound.arguments.keys() + } + bound.arguments.update(promoted_args) + + result = fn(**bound.arguments) + + # FIXME?: assumes result is a single tensor + assert isinstance(result, TensorLike) + return _maybe_convert_to_dtype(result, result_dtype) + + _fn.__signature__ = sig # type: ignore[attr-defined] + return _fn + + +# TODO: handle tuples of tensors +def _maybe_resize_out(out: TensorLikeType, shape): + if out.numel() == 0: + return prims.resize(out, shape) + + if out.numel() != reduce(operator.mul, shape, 1): + msg = ( + "An output with one or more elements was resized since it had shape {0} " + "which does not match the required output shape {1}. " + "This behavior is deprecated, and in a future PyTorch release outputs will not " + "be resized unless they have zero elements. " + "You can explicitly reuse an out tensor t by resizing it, inplace, to zero elements with t.resize_(0).".format( + str(out.shape), str(shape) + ) + ) + warnings.warn(msg) + return prims.resize(out, shape) + + return out + + +def _safe_copy_out(*, copy_from: TensorLikeType, copy_to: TensorLikeType): + # Checks same device + if copy_from.device != copy_to.device: + msg = "Attempting to copy from device {0} to device {1}, but cross-device copies are not allowed!".format( + copy_from.device, copy_to.device + ) + raise RuntimeError(msg) + + # Checks safe cast + if not utils.can_safe_cast_to(cast_from=copy_from.dtype, cast_to=copy_to.dtype): + msg = "Attempting to cast from {0} to out tensor with dtype {1}, but this can't be cast because it is not safe!".format( + copy_from.dtype, copy_to.dtype + ) + raise RuntimeError(msg) + + return prims.copy_to(copy_to, copy_from) + + +# FIXME: only supports single tensor out +def out_wrapper(fn: Callable) -> Callable: + """ + Adds the out parameter to a Python reference. + + Note that this currently only supports operations that return a single tensor. 
+ """ + + @wraps(fn) + def _fn(*args, out=None, **kwargs): + result = fn(*args, **kwargs) + if out is not None: + assert isinstance(out, TensorLike) + out = _maybe_resize_out(out, result.shape) + return _safe_copy_out(copy_from=result, copy_to=out) # type: ignore[arg-type] + return out + return result + + sig = inspect.signature(fn) + out_param = inspect.Parameter( + "out", + kind=inspect.Parameter.KEYWORD_ONLY, + default=None, + annotation=TensorLikeType, + ) + params = chain(sig.parameters.values(), (out_param,)) + _fn.__signature__ = inspect.Signature( # type: ignore[attr-defined] + parameters=params, return_annotation=sig.return_annotation # type: ignore[arg-type] + ) + _fn.__annotations__ = fn.__annotations__ + _fn.__annotations__["out"] = TensorLikeType + return _fn diff --git a/torch/_python_dispatcher.py b/torch/_python_dispatcher.py index aa19a18efb3b..ee2c7d279458 100644 --- a/torch/_python_dispatcher.py +++ b/torch/_python_dispatcher.py @@ -15,13 +15,13 @@ - CPU/AutogradCPU: represents in-tree backends which we usually have dedicated inference & autograd kernel in pytorch core library. E.g. CPU, CUDA -- QuantizedCPU/AutogradOther: represents in-tree backends which we usually have backend specific +- FPGA/AutogradOther: represents in-tree backends which we usually have backend specific inference kernels, but they share the same autograd kernel specified in AutogradOther. - E.g. QuantizedCPU, QuantizedCUDA + E.g. FPGA, SparseCsrCPU - XLA/AutogradXLA: represents out-of-tree backends which we don't have either inference or autograd kernel defined in pytorch core library. Backend owner is responsible for registering both inference & autograd kernels in their extensions(e.g. torch-xla) for the operators they support. - E.g. XLA, XPU, MLC + E.g. XLA, XPU, MPS - CompositeExplicitAutograd: alias key mapped to inference kernels of all backends like CPU, CUDA, XLA etc. Kernels registered to this key MUST work for inference for all backends. - Autograd: alias key mapped to autograd of all backends like AutogradCPU, AutogradXLA, AutogradOther. @@ -53,7 +53,7 @@ class PythonDispatcher: name = "foo" runtime_keys = [ "CPU", "AutogradCPU", - "QuantizedCPU", "AutogradOther", + "FPGA", "AutogradOther", "XLA", "AutogradXLA", "Lazy", "AutogradLazy", ] @@ -66,7 +66,7 @@ class PythonDispatcher: def __init__(self): C._dispatch_check_invariants(self.name) # type: ignore[attr-defined] - self.ref = C._dispatch_library("FRAGMENT", self.namespace, "") # type: ignore[attr-defined] + self.ref = C._dispatch_library("FRAGMENT", self.namespace, "") self.ref.def_("foo(Tensor x) -> Tensor") """ diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py new file mode 100644 index 000000000000..894baf3605bc --- /dev/null +++ b/torch/_refs/__init__.py @@ -0,0 +1,1448 @@ +import torch + +import torch._prims as prims +import torch._prims.utils as utils +from torch._prims.utils import ( + DimsType, + ShapeType, + StrideType, + TensorLike, + TensorLikeType, + DimsSequenceType, + TensorSequenceType, + Number, + NumberType, + ELEMENTWISE_TYPE_PROMOTION_KIND, +) +from torch._prims.wrappers import ( + elementwise_type_promotion_wrapper, + out_wrapper, + _maybe_convert_to_dtype, + _maybe_resize_out, +) + +from functools import reduce +from typing import Sequence, Optional, Union, Callable, List, Tuple +import operator +import warnings +import math +from enum import Enum + +# Experimental module containing prototype Python references for existing +# PyTorch operations. 
+ +__all__ = [ + # + # Elementwise Unary References + # + "abs", + "acos", + "acosh", + "asin", + "atan", + # "bessel_i0e", # special.i0e + # "bessel_i1e", # special.i1e + # "cbrt", # No corresponding torch operation + "ceil", + "cos", + "cosh", + "digamma", + "erf", + "erfinv", + "erfc", + "exp", + "expm1", + "floor", + "isfinite", + "isnan", + "lgamma", + "log", + "log1p", + "neg", + "reciprocal", + "round", # TODO: model kwargs + "sign", + "sin", + "sinh", + "sqrt", + "square", + "tan", + # + # Elementwise Binary References + # + "add", + "atan2", + "bitwise_and", + "bitwise_left_shift", + "bitwise_or", + "bitwise_right_shift", + "bitwise_xor", + # "complex", + # 'copysign', # where + # 'div', # need to implement all rounding modes first + "eq", + "float_power", + # 'floor_divide', # requires floor + # 'fmax', # requires where + # 'fmod', + # 'gcd', + "ge", + "gt", + # 'heaviside', + # 'hypot', + "igamma", + "igammac", + "isclose", + # 'lcm', + # 'ldexp', + "le", + "logical_and", + "logical_or", + # 'logical_xor', + "lt", + # 'max', # implement with reductions + "maximum", + # 'min', # implement with reductions + "minimum", + "mul", + "ne", + "nextafter", + # 'polar', # abs, cos, sin + "pow", + # 'remainder', + # 'rsub', # unblocked + # # special.xlog1py + # # special.zeta + "sub", + "true_divide", + # 'xlogy', # where?, log, mul + # + # Conditional references + # + "where", # TODO: add opinfo + # + # Data conversion and movement references + # + "clone", + "copy_to", # TODO: add opinfo + # + # Reduction ops + # + "sum", + "amax", + "amin", + # + # View & Shape Ops + # + "as_strided", + "cat", + "chunk", + "flatten", + "flip", + "narrow", + "permute", + "reshape", + "stack", + "swap_axes", # alias for transpose + "squeeze", + "tensor_split", + "transpose", + "unsqueeze", + "view", + # + # Tensor Creation + # + "empty", + "empty_like", + "full", + "full_like", + "ones_like", +] + +Tensor = torch.Tensor + + +class REDUCTION_OUTPUT_TYPE_KIND(Enum): + SAME = (0,) + SAME_OR_REAL = (1,) # for complex types outputs corresponding real type + OP_MATH = (2,) # keep output in opmath type, needed for mean + ALWAYS_BOOL = (3,) + + +def _broadcast_shapes(*_shapes): + shapes = tuple(filter(lambda x: x is not None, _shapes)) + + # Short-circuits on no input + if len(shapes) == 0: + return None + + # Type checking + # TODO: make common validations available as utils + for shape in shapes: + assert isinstance(shape, Sequence) + + # Computes common shape + common_shape = [ + 1, + ] * reduce(max, (len(shape) for shape in shapes)) + for shape in shapes: + for idx in range(-1, -1 - len(shape), -1): + if common_shape[idx] == 1: + if shape[idx] < 0: + raise ValueError( + "Attempting to broadcast a dimension with negative length!" 
+ ) + common_shape[idx] = shape[idx] + elif shape[idx] != 1: + if common_shape[idx] != shape[idx]: + raise RuntimeError( + "Attempting to broadcast a dimension of length ", + str(shape[idx]), + "!", + ) + + return common_shape + + +def _maybe_broadcast(*args, preserve_cpu_scalar_tensors=True): + # Computes common shape + common_shape = _broadcast_shapes( + *map(lambda t: t.shape if isinstance(t, TensorLike) else None, args) + ) + + def __maybe_broadcast(x, shape): + if x is None: + return None + elif isinstance(x, Number): + return x + elif isinstance(x, TensorLike): + if preserve_cpu_scalar_tensors and utils.is_cpu_scalar_tensor(x): + return x + + if tuple(x.shape) != common_shape: + common_rank = len(common_shape) + 1 + start = common_rank - (len(x.shape) + 1) + dims = tuple(range(start, len(x.shape) + start)) + return prims.broadcast_in_dim(x, common_shape, dims) + else: + raise RuntimeError( + "Unexpected type when broadcasting: " + str(type(x)) + "!" + ) + + return tuple(__maybe_broadcast(x, common_shape) for x in args) + + +# Utilities should come BEFORE this import +from torch._decomp import register_decomposition + +# +# Elementwise unary references +# + +infer_aten_op = object() + +# TODO: add type promotion support +def _make_elementwise_unary_reference( + prim: Callable, + *, + type_promotion_kind, + aten_op=infer_aten_op, + disable_meta=False, + extra_meta=None, +) -> Callable: + @out_wrapper + @elementwise_type_promotion_wrapper( + type_promoting_args=("a",), + type_promotion_kind=type_promotion_kind, + ) + def _ref(a: TensorLikeType) -> TensorLikeType: + if not isinstance(a, TensorLike): + raise RuntimeError( + "Expected a tensor input for an elementwise unary operation!" + ) + + if extra_meta is not None: + extra_meta(a) + + return prim(a) + + if aten_op is infer_aten_op: + aten_op = getattr(torch.ops.aten, prim.__name__.split(".")[0]) + if aten_op is not None: + register_decomposition(aten_op, disable_meta=disable_meta)(_ref) + + return _ref + + +abs = _make_elementwise_unary_reference( + prims.abs, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.COMPLEX_TO_FLOAT, +) + +acos = _make_elementwise_unary_reference( + prims.acos, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + +acosh = _make_elementwise_unary_reference( + prims.acosh, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + +asin = _make_elementwise_unary_reference( + prims.asin, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + +atan = _make_elementwise_unary_reference( + prims.atan, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + +ceil = _make_elementwise_unary_reference( + prims.ceil, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) + +cos = _make_elementwise_unary_reference( + prims.cos, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + +cosh = _make_elementwise_unary_reference( + prims.cosh, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + +digamma = _make_elementwise_unary_reference( + prims.digamma, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + +erf = _make_elementwise_unary_reference( + prims.erf, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + +erfinv = _make_elementwise_unary_reference( + prims.erf_inv, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + aten_op=torch.ops.aten.erfinv, # prim/aten name mismatch +) + +erfc = _make_elementwise_unary_reference( + prims.erfc, + 
type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + +exp = _make_elementwise_unary_reference( + prims.exp, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + +expm1 = _make_elementwise_unary_reference( + prims.expm1, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + +floor = _make_elementwise_unary_reference( + prims.floor, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) + + +def _isfinite(a: TensorLikeType) -> TensorLikeType: + if utils.is_float_dtype(a.dtype) or utils.is_complex_dtype(a.dtype): + return prims.is_finite(a) + + return ones_like(a, dtype=torch.bool) + + +isfinite = _make_elementwise_unary_reference( + _isfinite, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL, + aten_op=None, # CompositeImplicitAutograd +) + + +def _isnan(a: TensorLikeType) -> TensorLikeType: + return prims.ne(a, a) + + +isnan = _make_elementwise_unary_reference( + _isnan, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL, + aten_op=torch.ops.aten.isnan, # prim/aten name mismatch +) + +lgamma = _make_elementwise_unary_reference( + prims.lgamma, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + +log = _make_elementwise_unary_reference( + prims.log, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + +log1p = _make_elementwise_unary_reference( + prims.log1p, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + + +def _neg_meta(a: TensorLikeType): + if a.dtype is torch.bool: + msg = "neg is not supported on bool tensors." + raise RuntimeError(msg) + + +neg = _make_elementwise_unary_reference( + prims.neg, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, + extra_meta=_neg_meta, +) + +reciprocal = _make_elementwise_unary_reference( + prims.reciprocal, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + +# TODO: round takes additional kwargs +round = _make_elementwise_unary_reference( + prims.round, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, + aten_op=None, # TODO: this does need a decomp, but kwarg handling is needed +) + +sign = _make_elementwise_unary_reference( + prims.sign, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) + +sin = _make_elementwise_unary_reference( + prims.sin, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + +sinh = _make_elementwise_unary_reference( + prims.sinh, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + +sqrt = _make_elementwise_unary_reference( + prims.sqrt, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + +square = _make_elementwise_unary_reference( + prims.square, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.BOOL_TO_LONG, + aten_op=None, # CompositeImplicitAutograd, +) + +tan = _make_elementwise_unary_reference( + prims.tan, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + + +def _make_elementwise_binary_reference( + prim: Callable, + *, + type_promotion_kind, + aten_op=infer_aten_op, + has_out=True, + supports_lhs_python_scalar=True, + supports_rhs_python_scalar=True, + disable_meta=False, +) -> Callable: + @elementwise_type_promotion_wrapper( + type_promoting_args=("a", "b"), + type_promotion_kind=type_promotion_kind, + ) + def _ref( + a: Union[Tensor, NumberType], + b: Union[Tensor, NumberType], + ) -> Tensor: + if not supports_lhs_python_scalar and isinstance(a, Number): + raise ValueError( + "Received a lhs Python scalar to 
an elementwise binary operation that does not accept lhs scalars!" + ) + + if not supports_rhs_python_scalar and isinstance(b, Number): + raise ValueError( + "Received a rhs Python scalar to an elementwise binary operation that does not accept rhs scalars!" + ) + + # TODO: enable this for operations that support it, like add + if isinstance(a, Number) and isinstance(b, Number): + raise ValueError( + "Receive two Number inputs to an elementwise binary operation!" + ) + + a, b = _maybe_broadcast(a, b) + return prim(a, b) + + if has_out: + _ref = out_wrapper(_ref) + + if aten_op is infer_aten_op: + aten_op = getattr(torch.ops.aten, prim.__name__.split(".")[0]) + if aten_op is not None: + register_decomposition(aten_op, disable_meta=disable_meta)(_ref) + + return _ref + + +# Add has its own implementation because it has an alpha argument +@out_wrapper +@elementwise_type_promotion_wrapper( + type_promoting_args=("a", "b"), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def add( + a: Union[TensorLikeType, NumberType], + b: Union[TensorLikeType, NumberType], + *, + alpha: Optional[NumberType] = None, +): + """ + Reference implementation of torch.add + """ + + if isinstance(a, Number) and isinstance(b, Number): + raise ValueError( + "Receive two Number inputs to an elementwise binary operation!" + ) + + a, b = _maybe_broadcast(a, b) + + if alpha is not None: + dtype = a.dtype if isinstance(a, TensorLike) else b.dtype # type: ignore[union-attr] + python_type = utils.dtype_to_type(dtype) + if not utils.is_weakly_lesser_type(type(alpha), python_type): + msg = ( + "alpha argument of type {0} cannot be safely cast to type {1}!".format( + type(alpha), python_type + ) + ) + raise ValueError(msg) + b = prims.mul(b, alpha) + + return prims.add(a, b) + + +# TODO: add docstring +atan2 = _make_elementwise_binary_reference( + prims.atan2, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + supports_lhs_python_scalar=False, + supports_rhs_python_scalar=False, +) + +# TODO: add docstring +bitwise_and = _make_elementwise_binary_reference( + prims.bitwise_and, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) + +# TODO: add docstring +bitwise_left_shift = _make_elementwise_binary_reference( + prims.shift_left, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, + aten_op=torch.ops.aten.bitwise_left_shift, # prim/aten name mismatch +) + +# TODO: add docstring +bitwise_or = _make_elementwise_binary_reference( + prims.bitwise_or, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) + +# TODO: add docstring +bitwise_right_shift = _make_elementwise_binary_reference( + prims.shift_right_arithmetic, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, + aten_op=torch.ops.aten.bitwise_right_shift, # prim/aten name mismatch +) + +# TODO: add docstring +bitwise_xor = _make_elementwise_binary_reference( + prims.bitwise_xor, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) + +# TODO: add docstring +# complex = _make_elementwise_binary_reference(prims.complex, type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT) + +# TODO: add docstring +eq = _make_elementwise_binary_reference( + prims.eq, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL, + supports_lhs_python_scalar=False, +) + +# TODO: add docstring +# Float power has its own implementation because it has unique type promotion. 
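+# Editor's illustrative note: float_power always promotes to double precision, so two
+# float32 inputs produce a float64 result (and complex inputs produce complex128),
+# unlike pow, which keeps the promoted input dtype.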
+# NB: aten_op not registered because CompositeExplicitAutograd +@out_wrapper +def float_power( + a: Union[TensorLikeType, NumberType], + b: Union[TensorLikeType, NumberType], +) -> Tensor: + + if isinstance(a, Number) and isinstance(b, Number): + raise ValueError( + "Receive two Number inputs to an elementwise binary operation!" + ) + + # Handles type promotion + dtype = utils.get_higher_dtype(a, b) + assert dtype is not None + if utils.is_complex_dtype(dtype): + dtype = torch.complex128 + else: + dtype = torch.float64 + + # Float power has the following contiguous cast behavior to be + # consistent with its C++ impl + if isinstance(a, TensorLike) and a.dtype != dtype: + a = prims.to_dtype(a, dtype) + if isinstance(b, TensorLike) and b.dtype != dtype: + b = prims.to_dtype(b, dtype) + + a, b = _maybe_broadcast(a, b) + return prims.pow(a, b) + + +# TODO: add docstring +ge = _make_elementwise_binary_reference( + prims.ge, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL, + supports_lhs_python_scalar=False, +) + +# TODO: add docstring +gt = _make_elementwise_binary_reference( + prims.gt, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL, + supports_lhs_python_scalar=False, +) + +igamma = _make_elementwise_binary_reference( + prims.igamma, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + supports_lhs_python_scalar=False, + supports_rhs_python_scalar=False, +) + +igammac = _make_elementwise_binary_reference( + prims.igammac, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + supports_lhs_python_scalar=False, + supports_rhs_python_scalar=False, +) + + +def isclose( + a: TensorLikeType, + b: TensorLikeType, + rtol: float = 1e-05, + atol: float = 1e-08, + equal_nan: bool = False, +) -> TensorLikeType: + if a.dtype != b.dtype: + msg = "Attempting to compare tensors of different dtypes {0} and {1}!".format( + a.dtype, b.dtype + ) + raise ValueError(msg) + if rtol < 0: + msg = "rtol must be greater than or equal to zero, but got {0}!".format(rtol) + raise ValueError(msg) + if atol < 0: + msg = "atol must be greater than or equal to zero, but got {0}!".format(atol) + raise ValueError(msg) + + close = eq(a, b) + if equal_nan and (utils.is_float_dtype(a.dtype) or utils.is_complex_dtype(a.dtype)): + close = logical_or(close, logical_and(isnan(a), isnan(b))) + + # Note: In case of zero tolerances the closeness inequality degenerates to an equality check. + # In this case, the short-circuit prevents false positives as detailed in the paragraph below. + if atol == 0 and rtol == 0: + return close + + # Note [closeness error computation] + # atol and rtol are provided as doubles, so the computation + # rtol * other will produce a float or complex tensor. + # When the difference (self - other) is compared to it then the + # tensor representing the difference will also be cast to float or complex. + # However, since (self - other) in uint8 is very likely to produce a + # negative value, this moves the cast forward so the difference is + # always computed in a float or complex type. + # If the values of the integer tensors cannot be exactly represented + # by the default scalar type then this may cause an incorrect result.
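+    # Editor's illustrative example: for uint8 inputs a=0 and b=1, computing (a - b)
+    # directly in uint8 would wrap around to 255, so both inputs are cast to the
+    # default floating point dtype first and the difference is computed as -1.0.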
+ if not utils.is_float_dtype(a.dtype) and not utils.is_complex_dtype(a.dtype): + a = prims.convert_element_type(a, torch.get_default_dtype()) + b = prims.convert_element_type(b, torch.get_default_dtype()) + + allowed_error = add(atol, abs(mul(b, rtol))) + actual_error = abs(sub(a, b)) + + # Computes finite closeness + result = logical_or( + close, logical_and(isfinite(actual_error), le(actual_error, allowed_error)) + ) + + return result + + +# TODO: add docstring +le = _make_elementwise_binary_reference( + prims.le, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL, + supports_lhs_python_scalar=False, +) + + +def _logical_and(a: TensorLikeType, b: TensorLikeType): + if not utils.is_boolean_dtype(a.dtype): + a = ne(a, 0) + if not utils.is_boolean_dtype(b.dtype): + b = ne(b, 0) + return bitwise_and(a, b) + + +logical_and = _make_elementwise_binary_reference( + _logical_and, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL, + aten_op=torch.ops.aten.logical_and, +) + + +def _logical_or(a: TensorLikeType, b: TensorLikeType): + if not utils.is_boolean_dtype(a.dtype): + a = ne(a, 0) + if not utils.is_boolean_dtype(b.dtype): + b = ne(b, 0) + return bitwise_or(a, b) + + +logical_or = _make_elementwise_binary_reference( + _logical_or, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL, + aten_op=torch.ops.aten.logical_or, +) + +# TODO: add docstring +lt = _make_elementwise_binary_reference( + prims.lt, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL, + supports_lhs_python_scalar=False, +) + +# TODO: add docstring +maximum = _make_elementwise_binary_reference( + prims.maximum, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) + +# TODO: add docstring +minimum = _make_elementwise_binary_reference( + prims.minimum, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) + +# TODO: add docstring +mul = _make_elementwise_binary_reference( + prims.mul, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) + +# TODO: add docstring +ne = _make_elementwise_binary_reference( + prims.ne, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL, + supports_lhs_python_scalar=False, +) + +# TODO: add docstring +nextafter = _make_elementwise_binary_reference( + prims.nextafter, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.NO_OPMATH, + supports_lhs_python_scalar=False, + supports_rhs_python_scalar=False, +) + +# TODO: add docstring +pow = _make_elementwise_binary_reference( + prims.pow, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.BOOL_TO_LONG, +) + +# TODO: add docstring +# TODO: consider refactoring this with add impl +# sub has its own implementation because it has an alpha argument +@register_decomposition(torch.ops.aten.sub) +@out_wrapper +@elementwise_type_promotion_wrapper( + type_promoting_args=("a", "b"), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def sub( + a: Union[TensorLikeType, NumberType], + b: Union[TensorLikeType, NumberType], + *, + alpha: Optional[NumberType] = None, +): + """ + Reference implementation of torch.sub + """ + + if isinstance(a, Number) and isinstance(b, Number): + raise ValueError( + "Receive two Number inputs to an elementwise binary operation!"
+ ) + + a, b = _maybe_broadcast(a, b) + + if alpha is not None: + dtype = a.dtype if isinstance(a, TensorLike) else b.dtype # type: ignore[union-attr] + python_type = utils.dtype_to_type(dtype) + if not utils.is_weakly_lesser_type(type(alpha), python_type): + msg = ( + "alpha argument of type {0} cannot be safely cast to type {1}!".format( + type(alpha), python_type + ) + ) + raise ValueError(msg) + b = prims.mul(b, alpha) + + return prims.sub(a, b) + + +# TODO: add docstring +true_divide = _make_elementwise_binary_reference( + prims.div, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + aten_op=None, # CompositeImplicitAutograd +) + +# +# Conditional references +# + +# https://pytorch.org/docs/stable/generated/torch.where.html +# TODO: implement alternate where +@register_decomposition(torch.ops.aten.where) +@out_wrapper +@elementwise_type_promotion_wrapper( + type_promoting_args=("a", "b"), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.NO_OPMATH, +) +def where( + pred: Tensor, + a: Optional[Union[TensorLikeType, NumberType]] = None, + b: Optional[Union[TensorLikeType, NumberType]] = None, +): + """ """ + + if a is None or b is None: + raise NotImplementedError + + pred, a, b = _maybe_broadcast(pred, a, b) + return prims.select(pred, a, b) + + +# +# Data Movement References +# +def clone( + a: TensorLikeType, *, memory_format: torch.memory_format = torch.preserve_format +) -> TensorLikeType: + + return prims.clone(a, memory_format=memory_format) + + +def copy_to(a: Tensor, b: Tensor, *, allow_cross_device=True): + if not allow_cross_device and a.device != b.device: + msg = "Attempting to copy from device {0} to device {1}, but cross-device copies are not allowed!".format( + b.device, a.device + ) + raise RuntimeError(msg) + + return prims.copy_to(a, b) + + +# +# Reduction references +# + + +def _reduction( + a: Tensor, + prim: Callable, + *, + has_identity: bool = True, + accepts_dim_tuple: bool = True, # to handle min/argmin that accept single dim only + dims: Optional[DimsType] = None, + keepdims: bool = False, + dtype: Optional[torch.dtype] = None, # should be specified for ops that support it + out: Optional[Tensor] = None, + output_dtype_kind: REDUCTION_OUTPUT_TYPE_KIND, +): # it is usually SAME, but I want + # ref writers to actually think about what to put here + assert isinstance(a, TensorLike) + if out is not None: + assert isinstance(out, TensorLike) + if dtype is not None: + # TODO - this is true for eager mode currently, but it's wrong behavior for complex norms + if dtype != out.dtype: + raise RuntimeError( + "dtype argument and out dtype must match in reduction" + ) + if not accepts_dim_tuple: + assert dims is None or isinstance(dims, int) + if isinstance(dims, int): + dims = (dims,) # type: ignore[assignment] + dims = utils.reduction_dims(a.shape, dims) + if not has_identity: + valid_shape = all(a.shape[i] for i in range(a.ndim) if i in dims) + if not valid_shape: + raise RuntimeError( + "reducing over zero-size dimension for reduction operation without identity" + ) + # even though some reductions, like amin or amax, don't strictly require type promotion, + # all the math ops (including comparisons) are still defined only for a computation type, + # so promotion will still happen. 
We are doing it explicitly here + inp_dtype = dtype if dtype is not None else a.dtype + computation_dtype = utils._get_computation_dtype(inp_dtype) + a_converted = prims.convert_element_type(a, computation_dtype) + result = prim(a_converted, dims) + + if keepdims: + output_shape = [a.shape[i] if i not in dims else 1 for i in range(a.ndim)] + broadcast_dims = [i for i in range(a.ndim) if i not in dims] + result = prims.broadcast_in_dim(result, output_shape, broadcast_dims) + if out is not None: + if dtype is None: + if output_dtype_kind == REDUCTION_OUTPUT_TYPE_KIND.SAME: + if out.dtype != a.dtype: + raise RuntimeError("Expected the dtype for input and out to match") + elif output_dtype_kind == REDUCTION_OUTPUT_TYPE_KIND.ALWAYS_BOOL: + if out.dtype != torch.bool: + raise RuntimeError("Expected the dtype for input and out to match") + out = _maybe_resize_out(out, result.shape) + return copy_to(out, result, allow_cross_device=False) # type: ignore[arg-type] + + if output_dtype_kind == REDUCTION_OUTPUT_TYPE_KIND.SAME: + result_dtype = dtype if dtype else a.dtype + if result.dtype != result_dtype: + result = prims.convert_element_type(result, result_dtype) + return result + + +# TODO: register decomp after stride logic is fixed +def sum( + a: Tensor, + dim: Union[Optional[int], Optional[List[int]]] = None, + keepdim: bool = False, + *, + dtype=None, + out: Optional[Tensor] = None, +): + if dtype is None: + if utils.is_boolean_dtype(a.dtype) or utils.is_integer_dtype(a.dtype): + dtype = torch.int64 + else: + dtype = a.dtype + # reduces over all dimensions if dim=() is passed + if dim == () or dim == []: + dim = None + return _reduction( + a, + prims.sum, + dims=dim, + keepdims=keepdim, + dtype=dtype, + out=out, + output_dtype_kind=REDUCTION_OUTPUT_TYPE_KIND.SAME, + ) + + +def amin( + a: Tensor, + dim: Union[Optional[int], Optional[List[int]]] = None, + keepdim: bool = False, + *, + out: Optional[Tensor] = None, +): + # reduces over all dimensions if dim=() is passed + if dim == () or dim == []: + dim = None + + if a.ndim > 64: + raise RuntimeError( + "Received a tensor with {0} dimensions, but only tensors with up to 64 dims are supported!".format( + a.ndim + ) + ) + + return _reduction( + a, + prims.amin, + dims=dim, + keepdims=keepdim, + dtype=None, + out=out, + has_identity=False, + output_dtype_kind=REDUCTION_OUTPUT_TYPE_KIND.SAME, + ) + + +def amax( + a: Tensor, + dim: Union[Optional[int], Optional[List[int]]] = None, + keepdim: bool = False, + *, + out: Optional[Tensor] = None, +): + # reduces over all dimensions if dim=() is passed + if dim == () or dim == []: + dim = None + + if a.ndim > 64: + raise RuntimeError( + "Received a tensor with {0} dimensions, only tensors with up to 64 dims are supported!".format( + a.ndim + ) + ) + + return _reduction( + a, + prims.amax, + dims=dim, + keepdims=keepdim, + dtype=None, + out=out, + has_identity=False, + output_dtype_kind=REDUCTION_OUTPUT_TYPE_KIND.SAME, + ) + + +def as_strided( + a: TensorLikeType, size: ShapeType, stride: StrideType, storage_offset: int = 0 +) -> TensorLikeType: + return prims.as_strided(a, size, stride, storage_offset) + + +@out_wrapper +@elementwise_type_promotion_wrapper( + type_promoting_args=("tensors",), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.NO_OPMATH, +) +def cat(tensors: TensorSequenceType, dim: int = 0) -> TensorLikeType: + _dim = utils.canonicalize_dims(tensors[0].ndim, dim) + return prims.concatenate(tensors, _dim) + + +def chunk(a: TensorLikeType, chunks: int, dim: int = 0) -> Tuple[TensorLikeType, 
...]: + if chunks <= 0: + msg = "Expected at least one chunk, but got {0}!".format(chunks) + raise ValueError(msg) + + dim = utils.canonicalize_dim(a.ndim, dim) + length = a.shape[dim] + chunk_size = math.ceil(length / chunks) + full_chunks = math.floor(length / chunk_size) + tail_chunk_size = length % chunk_size + + result = [] + for i in range(full_chunks): + result.append(narrow(a, dim, i * chunk_size, chunk_size)) + + if tail_chunk_size != 0: + result.append(narrow(a, dim, full_chunks * chunk_size, tail_chunk_size)) + + return tuple(result) + + +# Note: flatten, unlike prim.collapse and prim.collapse_view has an inclusive end_dim +# Note: flatten, unlike other shape operators, returns the input tensor on a no-op (unless +# a 0D tensor is flattened, in which case it's returned in 1D) +def flatten(a: TensorLikeType, start_dim: int = 0, end_dim: int = -1) -> TensorLikeType: + start_dim = utils.canonicalize_dim(a.ndim, start_dim) + end_dim = utils.canonicalize_dim(a.ndim, end_dim) + + # Short-circuits on no-op + if start_dim == end_dim and a.ndim != 0: + return a + + # Tries to take a view + # TODO: we could look at directing collapse_view to skip its meta function here (unsafe_collapse_view) + new_shape, new_strides = prims._collapse_view_helper(a, start_dim, end_dim + 1) + if new_shape is not None: + return prims.collapse_view(a, start_dim, end_dim + 1) + + # Makes a copy if it can't make a view + return prims.collapse(a, start_dim, end_dim + 1) + + +@register_decomposition(torch.ops.aten.flip) +def flip(a: TensorLikeType, dims: DimsSequenceType) -> TensorLikeType: + if not isinstance(dims, tuple) and not isinstance(dims, list): + raise ValueError("dims has to be a sequence of ints") + dims = utils.canonicalize_dims(a.ndim, dims) # type: ignore[assignment] + utils.validate_no_repeating_dims(dims) + return prims.rev(a, dims) + + +def narrow(a: TensorLikeType, dim: int, start: int, length: int) -> TensorLikeType: + dim = utils.canonicalize_dim(a.ndim, dim) + return prims.slice_in_dim(a, start, start + length, axis=dim) + + +def permute(a: TensorLikeType, dims: DimsSequenceType) -> TensorLikeType: + _permutation = utils.canonicalize_dims(a.ndim, dims) + return prims.transpose(a, _permutation) + + +def _reshape_view_helper( + a: TensorLikeType, shape: ShapeType, *, allow_copy: bool +) -> TensorLikeType: + # NOTE: Reshape may be given a shape with a -1 length + # This indicates that the dimension's length should be inferred + # Creates a valid shape + + for idx in range(len(shape)): + if shape[idx] == -1: + # Verifies there's only one dimension of length -1 in the shape + if shape.count(-1) > 1: + msg = "Can only infer the length of one dimension, but got shape {0}!".format( + str(shape) + ) + raise ValueError(msg) + + # TODO: improve error message + if a.numel() > 0: + length = reduce( + operator.floordiv, (x for x in shape if x != -1), a.numel() + ) + else: + msg = "Cannot reshape a tensor of zero elements into shape {0} because the unspecified length is ambiguous!".format( + str(shape) + ) + raise ValueError(msg) + + shape = list(shape) + shape[idx] = length + break + + # Short-circuits if shape is the same + utils.validate_shape(shape) + if tuple(a.shape) == tuple(shape): + return prims.view_of(a) + + numel = reduce(operator.mul, shape) if len(shape) > 0 else 1 + if a.numel() != numel: + msg = "Attempting to reshape a tensor with shape {0} and {1} elements to a shape {2} with {3} elements!".format( + str(a.shape), a.numel(), str(shape), numel + ) + raise ValueError(msg) + + # 
Special-cases tensors with no elements + if a.numel() == 0: + return as_strided(a, shape, utils.make_contiguous_strides_for(shape)) + + # Special-cases reshaping zero dim tensors + if a.ndim == 0: + _a = a + for length in shape: + assert length == 1 + _a = unsqueeze(_a, -1) + return _a + + # Special-cases reshaping to zero dim tensors + if len(shape) == 0: + _a = a + for length in a.shape: + assert length == 1 + _a = squeeze(_a, -1) + return _a + + # Handles general case: a 1+D tensor reshaped into a distinct 1+D shape + + # NOTE [Reshape Algorithm] + # This algorithm works by attempting to greedily construct the desired dimensions in + # the output shape, left to right. It does this by, conceptually, accumulating + # dimensions of the original tensor, also left to right, until the dimension + # can be constructed using prims.split_dim. + # The algorithm also has special handling for tail squeezes/unsqueezes, like + # if a reshape from (5, 5) to (5, 5, 1) or vice versa. + # + # This algorithm does not flatten the original tensor and then split dims as appropriate + # because that would create copies more often than this algorithm. flatten is the only + # operation below which can create a view or a copy, and while it prefers creating + # views it may sometimes create a copy if the tensor's strides do not permit a view. + # As a result, this algorithm tries to minimize flattening. + # + # Note that a better version of this algorithm may exist. Regions which could be + # flattened without creating a copy can be identified in advance, and that might + # allow fewer flatten calls or faster short-circuiting to make a copy. + idx = 0 + a_ = a + for length in shape: + # Handles tail unsqueezes + if idx >= a_.ndim: + assert length == 1 + last_dim = a_.ndim - 1 + # NOTE: using split_dim instead of unsqueeze may seem silly here, + # but it's necessary to get the strides correct + a_ = prims.split_dim(a_, last_dim, a_.shape[last_dim]) + idx = idx + 1 + continue + + # Skips dimensions that are already the correct length + if length == a_.shape[idx]: + idx = idx + 1 + continue + + # Gathers enough original dimensions such that this new dimension can be created + # Note that this accumulation will terminate because we've verified a and the shape + # specify the same number of elements above + accum = a_.shape[idx] + end = idx + while accum % length != 0: + end = end + 1 + accum = accum * a_.shape[end] + if end != idx: + # NOTE: in this case multiple dimensions must be flatten to create the desired dimension + # This flattening is why reshape sometimes creates a copy -- because flattening + # may return a view of a copy + + # Checks if collapse can be a view and short-circuits to copying reshape if it can't + new_shape, new_strides = prims._collapse_view_helper(a_, idx, end + 1) + if new_shape is None: + if allow_copy: + return prims.reshape(a, shape) + + msg = "Cannot view a tensor with shape {0} and strides {1} as a tensor with shape {2}!".format( + a.shape, a.stride(), shape + ) + raise ValueError(msg) + + a_ = flatten(a_, idx, end) + + # Splits the (possibly flattened) dimension to create the desired dim length + if accum != length: + a_ = prims.split_dim(a_, idx, length) + + idx = idx + 1 + + # Squeezes tail + while idx < a_.ndim: + assert a_.shape[idx] == 1 + a_ = squeeze(a_, idx) + + return a_ + + +def reshape(a: TensorLikeType, shape: ShapeType) -> TensorLikeType: + return _reshape_view_helper(a, shape, allow_copy=True) + + +# update to cat then view instead of unsqueezing each tensor 
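+# Editor's illustrative note: stacking two (2, 3) tensors along dim=0 unsqueezes each
+# to (1, 2, 3) and concatenates them into a (2, 2, 3) result.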
+@out_wrapper +def stack(tensors: TensorSequenceType, dim: int = 0) -> TensorLikeType: + tensors = tuple(unsqueeze(a, dim) for a in tensors) + return cat(tensors, dim) + + +# Note: although squeeze is documented as having the out= kwarg it doesn't +def squeeze(a: TensorLikeType, dim: Optional[int] = None) -> TensorLikeType: + if dim is not None: + dim = utils.canonicalize_dim(a.ndim, dim) + # Short-circuits if the tensor has no dimensions + if len(a.shape) == 0: + assert dim == 0 + return prims.view_of(a) + + # Note: squeeze does not modify tensors when the given dim is not a dimension of length 1 + if a.shape[dim] != 1: + return prims.view_of(a) + return prims.squeeze(a, (dim,)) + + dims = tuple(idx for idx in range(len(a.shape)) if a.shape[idx] == 1) + return prims.squeeze(a, dims) + + +# Note: does not work with TensorMetas because of data-dependent control-flow +def tensor_split( + a: TensorLikeType, + indices_or_sections: Union[Tensor, DimsType], + dim: int = 0, +) -> Tuple[TensorLikeType, ...]: + _dim = utils.canonicalize_dim(a.ndim, dim) + if a.ndim == 0: + msg = "tensor_split: received a rank zero tensor, but expected a tensor of rank one or greater!" + raise ValueError(msg) + + # If indices_or_sections is a tensor, it must be a CPU Long tensor + if isinstance(indices_or_sections, TensorLike): + if indices_or_sections.device != torch.device("cpu"): + msg = "tensor_split: if indices_or_sections is a tensor it must be on the CPU, but received one on {0}".format( + indices_or_sections.device + ) + raise ValueError(msg) + if indices_or_sections.dtype != torch.long: + msg = "tensor_split: if indices_or_sections is a tensor it must have long dtype, " + " but received one with dtype {0}".format(indices_or_sections.dtype) + raise ValueError(msg) + + # Case 0 -- indices_or_sections is an integer or a scalar tensor n and a is split along dim into n parts of equal-ish length + if isinstance(indices_or_sections, int) or ( + isinstance(indices_or_sections, TensorLike) and indices_or_sections.ndim == 0 + ): + sections: int = ( + indices_or_sections # type: ignore[assignment] + if isinstance(indices_or_sections, Number) + else indices_or_sections.item() + ) + + if sections <= 0: + msg = "tensor_split: number of sections must be greater than 0, but was {0}".format( + sections + ) + raise ValueError(msg) + + splits = [] + dim_size = a.shape[_dim] + min_split_size = math.floor(dim_size / sections) + num_splits_one_extra = dim_size % sections + start_idx = 0 + for split_idx in range(sections): + split_size = ( + min_split_size + 1 + if (split_idx < num_splits_one_extra) + else min_split_size + ) + s = prims.slice_in_dim(a, start_idx, start_idx + split_size, axis=_dim) + splits.append(s) + start_idx = start_idx + split_size + + return tuple(splits) + # Case 1 -- indices_or_sections is a sequence of integers or a 1D tensor describing the splits + else: + indices = indices_or_sections + if isinstance(indices_or_sections, TensorLike): + if indices_or_sections.ndim != 1: + msg = "tensor_split: non-scalar indices_or_sections tensors must have only one dimension, " + "but received a tensor with {0} dimensions".format( + indices_or_sections.ndim + ) + raise ValueError(msg) + + indices = indices_or_sections.tolist() + + splits = [] + start_idx = 0 + for x in indices: + splits.append(prims.slice_in_dim(a, start_idx, x, axis=_dim)) + start_idx = x + splits.append(prims.slice_in_dim(a, start_idx, a.shape[_dim], axis=_dim)) + return tuple(splits) + + +def transpose(a: TensorLikeType, dim0: int, dim1: int) -> 
TensorLikeType: + _dim0, _dim1 = utils.canonicalize_dims(a.ndim, (dim0, dim1)) # type: ignore[misc] + + if a.ndim <= 1: + return prims.view_of(a) + + _permutation = list(range(0, a.ndim)) + _permutation[_dim0] = _dim1 + _permutation[_dim1] = _dim0 + return prims.transpose(a, _permutation) + + +# Aliases for transpose +swap_axes = transpose + + +def unsqueeze(a: TensorLikeType, dim: int) -> TensorLikeType: + # Note that unsqueeze canonicalizes with rank + 1 because it allows + # a new innermost dimension to be specified + dim = utils.canonicalize_dim(a.ndim + 1, dim) + return prims.expand_dims(a, (dim,)) + + +def view(a: TensorLikeType, shape: ShapeType) -> TensorLikeType: + return _reshape_view_helper(a, shape, allow_copy=False) + + +@out_wrapper +def empty( + *shape, + dtype: Optional[torch.dtype] = None, + device: Optional[torch.device] = None, + requires_grad: bool = False, +) -> TensorLikeType: + dtype = torch.get_default_dtype() if dtype is None else dtype + device = torch.device("cpu") if device is None else device + if len(shape) > 0 and isinstance(shape[0], tuple): + return prims.empty( + *shape, dtype=dtype, device=device, requires_grad=requires_grad + ) + return prims.empty(shape, dtype=dtype, device=device, requires_grad=requires_grad) + + +def empty_like( + a: TensorLikeType, + *, + dtype: Optional[torch.dtype] = None, + device: Optional[torch.device] = None, + requires_grad: bool = False, +) -> TensorLikeType: + dtype = a.dtype if dtype is None else dtype + device = a.device if device is None else device + return prims.empty_like(a, dtype=dtype, device=device, requires_grad=requires_grad) + + +@out_wrapper +def full( + shape: ShapeType, + fill_value: NumberType, + *, + dtype: torch.dtype, + device: torch.device, + requires_grad: bool, +) -> TensorLikeType: + dtype = torch.get_default_dtype() if dtype is None else dtype + device = torch.device("cpu") if device is None else device + return prims.full( + shape, fill_value, dtype=dtype, device=device, requires_grad=requires_grad + ) + + +def full_like( + a: TensorLikeType, + fill_value: NumberType, + *, + dtype: Optional[torch.dtype] = None, + device: Optional[torch.device] = None, + requires_grad: bool = False, +) -> TensorLikeType: + dtype = a.dtype if dtype is None else dtype + device = a.device if device is None else device + return prims.full_like( + a, fill_value, dtype=dtype, device=device, requires_grad=requires_grad + ) + + +def ones_like( + a: TensorLikeType, + *, + dtype: Optional[torch.dtype] = None, + device: Optional[torch.device] = None, + requires_grad: bool = False, +) -> TensorLikeType: + return full_like(a, 1, dtype=dtype, device=device, requires_grad=requires_grad) diff --git a/tools/codegen/api/__init__.py b/torch/_refs/nn/__init__.py similarity index 100% rename from tools/codegen/api/__init__.py rename to torch/_refs/nn/__init__.py diff --git a/torch/_refs/nn/functional/__init__.py b/torch/_refs/nn/functional/__init__.py new file mode 100644 index 000000000000..12ac19844c95 --- /dev/null +++ b/torch/_refs/nn/functional/__init__.py @@ -0,0 +1,47 @@ +import torch + +import torch._prims.utils as utils +from torch._prims.utils import ( + TensorLikeType, + NumberType, + ELEMENTWISE_TYPE_PROMOTION_KIND, +) +import torch._refs as refs +from torch._prims.wrappers import elementwise_type_promotion_wrapper + +from typing import Optional + +__all__ = [ + "elu", +] + +# elu is implemented specially because it has an alpha argument +@elementwise_type_promotion_wrapper( + type_promoting_args=("a",), + 
type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def elu( + a: TensorLikeType, alpha: Optional[NumberType] = None, inplace: bool = False +) -> TensorLikeType: + """ + Reference implementation of torch.nn.functional.elu + """ + + if inplace: + raise NotImplementedError + + rhs: TensorLikeType + if alpha is not None: + python_type = utils.dtype_to_type(a.dtype) + if not utils.is_weakly_lesser_type(type(alpha), python_type): + msg = ( + "alpha argument of type {0} cannot be safely cast to type {1}!".format( + type(alpha), python_type + ) + ) + raise ValueError(msg) + rhs = refs.mul(alpha, refs.expm1(a)) + else: + rhs = refs.expm1(a) + + return refs.where(refs.gt(a, 0), a, rhs) diff --git a/torch/_refs/special/__init__.py b/torch/_refs/special/__init__.py new file mode 100644 index 000000000000..ff8c92cd8fa4 --- /dev/null +++ b/torch/_refs/special/__init__.py @@ -0,0 +1,23 @@ +import torch + +import torch._prims as prims +import torch._prims.utils as utils +from torch._prims.utils import TensorLikeType +from torch._prims.wrappers import out_wrapper, elementwise_type_promotion_wrapper +from torch._refs import _make_elementwise_unary_reference + +__all__ = [ + "i0e", + "i1e", +] + +i0e = _make_elementwise_unary_reference( + prims.bessel_i0e, + type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + aten_op=torch.ops.aten.special_i0e, +) +i1e = _make_elementwise_unary_reference( + prims.bessel_i1e, + type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + aten_op=torch.ops.aten.special_i1e, +) diff --git a/torch/_tensor.py b/torch/_tensor.py index dc2f5c21624d..37383c17af28 100644 --- a/torch/_tensor.py +++ b/torch/_tensor.py @@ -18,16 +18,17 @@ import torch.utils.hooks as hooks -def _wrap_type_error_to_not_implemented(f): +def _handle_torch_function_and_wrap_type_error_to_not_implemented(f): # functools.wraps doesn't work well with methods in python 2 method_assignments = ('__name__', '__doc__') assigned = functools.WRAPPER_ASSIGNMENTS @functools.wraps(f, assigned=assigned) def wrapped(*args, **kwargs): - if has_torch_function(args): - return handle_torch_function(wrapped, args, *args, **kwargs) try: + # See https://github.com/pytorch/pytorch/issues/75462 + if has_torch_function(args): + return handle_torch_function(wrapped, args, *args, **kwargs) return f(*args, **kwargs) except TypeError: return NotImplemented @@ -46,7 +47,9 @@ def _rebuild_from_type_v2(func, new_type, args, state): if new_type is Tensor: return func(*args) - ret = func(*args).as_subclass(new_type) + ret = func(*args) + if type(ret) is not new_type: + ret = ret.as_subclass(new_type) # Tensor does define __setstate__ even though it doesn't define # __getstate__. 
So only use __setstate__ if it is NOT the one defined # on Tensor @@ -92,8 +95,17 @@ def __deepcopy__(self, memo): # does accurate alias tracking; however, the code below # doesn't work because of # https://github.com/pytorch/pytorch/issues/47442 - if self.is_sparse or self.device.type in ['xla', 'mlc', 'ort', 'meta', 'hpu']: + # Update the test in test_serialization if you remove 'meta' from here + if self.is_sparse or self.device.type in ['lazy', 'xla', 'mps', 'ort', 'meta', 'hpu'] or \ + (type(self) is not Tensor and self.data_ptr() == 0): new_tensor = self.clone() + if type(new_tensor) is not type(self): + raise RuntimeError("The default implementation of __deepcopy__() for wrapper subclasses " + "only works for subclass types that implement clone() and for which " + "cloning returns another instance of the same subclass. You should either " + "properly implement clone() for your subclass or override __deepcopy__() " + "if it is intended behavior for clone() to return an instance of a " + "different type.") else: new_storage = self.storage().__deepcopy__(memo) if self.is_quantized: @@ -109,9 +121,9 @@ def __deepcopy__(self, memo): else: raise RuntimeError(f"Unsupported qscheme {self.qscheme()} in deepcopy") # TODO: Once we decide to break serialization FC, no longer - # need to wrap with TypedStorage + # need to wrap with _TypedStorage new_tensor = torch._utils._rebuild_qtensor( - torch.storage.TypedStorage( + torch.storage._TypedStorage( wrap_storage=new_storage._untyped(), dtype=self.dtype), self.storage_offset(), @@ -120,19 +132,34 @@ def __deepcopy__(self, memo): quantizer_params, self.requires_grad, self._backward_hooks) + if type(new_tensor) is not type(self): + raise RuntimeError("The default implementation of __deepcopy__() for quantized tensors " + "expects the tensor returned by torch._utils._rebuild_qtensor() to " + "match the type of the instance being copied. If you encounter this, " + "please open an issue on PyTorch's GitHub.") else: new_tensor = self.new_empty([]) + if type(new_tensor) is not type(self): + raise RuntimeError("The default implementation of __deepcopy__() for non-wrapper subclasses " + "only works for subclass types that implement new_empty() and for which " + "that function returns another instance of the same subclass. You should " + "either properly implement new_empty() for your subclass or override " + "__deepcopy__() if it is intended behavior for new_empty() to return " + "an instance of a different type.") new_tensor.set_(new_storage, self.storage_offset(), self.size(), self.stride()) if self.is_conj(): new_tensor = new_tensor.conj_physical() if self.is_neg(): new_tensor = new_tensor.neg() - new_tensor.requires_grad = self.requires_grad + if self.requires_grad: + new_tensor.requires_grad_() if self.grad is not None: new_tensor.grad = self.grad.__deepcopy__(memo) if not type(self) is Tensor: - new_tensor = new_tensor.as_subclass(type(self)) # type: ignore[arg-type] + if type(new_tensor) is not type(self): + raise RuntimeError("Type of deepcopy result does not match the type of the source tensor. 
" + "If you encounter this, please open an issue on PyTorch's GitHub.") # Plain Tensors don't have slots slots_to_save = copyreg._slotnames(self.__class__) # type: ignore[attr-defined] @@ -175,21 +202,14 @@ def storage(self): if has_torch_function_unary(self): return handle_torch_function(Tensor.storage, (self,), self) - if self.dtype not in torch.storage._dtype_to_storage_type_map(): - raise RuntimeError(f'unsupported Storage type: {self.dtype}') - - storage = self._storage() - storage_name = torch.storage._dtype_to_storage_type_map()[self.dtype] - storage_class = eval(type(storage).__module__ + '.' + storage_name) - storage = storage_class(wrap_storage=storage) - return storage + return torch._TypedStorage(wrap_storage=self._storage(), dtype=self.dtype) def _reduce_ex_internal(self, proto): check_serializing_named_tensor(self) # See Note [Don't serialize hooks] torch.utils.hooks.warn_if_has_hooks(self) backward_hooks: Dict[Any, Any] = OrderedDict() - # Note: Numpy array is chosen to be the rebuild component for XLA, ORT, MLC Tensors. + # Note: Numpy array is chosen to be the rebuild component for XLA, ORT Tensors. # We considered a few options: # 1. CPU tensor can't be used here. # Otherwise in torch.load CPU storage is reconstructed with randomly @@ -199,7 +219,7 @@ def _reduce_ex_internal(self, proto): # 2. Python list is not a good fit due to performance reason. # `tolist()` converts every single element in the tensor into python objects # and serialize them one by one. - if self.device.type in ['xla', 'ort', 'mlc']: + if self.device.type in ['xla', 'ort', 'mps', 'hpu']: return (torch._utils._rebuild_device_tensor_from_numpy, (self.cpu().numpy(), self.dtype, str(self.device), @@ -232,9 +252,9 @@ def _reduce_ex_internal(self, proto): else: raise RuntimeError(f"Serialization is not supported for tensors of type {self.qscheme()}") # TODO: Once we decide to break serialization FC, no longer - # need to wrap with TypedStorage + # need to wrap with _TypedStorage args_qtensor = ( - torch.storage.TypedStorage( + torch.storage._TypedStorage( wrap_storage=self.storage()._untyped(), dtype=self.dtype), self.storage_offset(), @@ -265,11 +285,23 @@ def _reduce_ex_internal(self, proto): raise NotImplementedError( 'sparse csr tensor __reduce_ex__ for layout `%s`' % (self.layout)) return (torch._utils._rebuild_sparse_csr_tensor, args_sparse_csr) + elif self.data_ptr() == 0 and type(self) is not torch.Tensor: + arg_wrapper_subclass = ( + type(self), + self.dtype, + tuple(self.size()), + self.stride(), + self.storage_offset(), + self.layout, + self.device, + self.requires_grad + ) + return (torch._utils._rebuild_wrapper_subclass, arg_wrapper_subclass) else: # TODO: Once we decide to break serialization FC, no longer - # need to wrap with TypedStorage + # need to wrap with _TypedStorage args = ( - torch.storage.TypedStorage( + torch.storage._TypedStorage( wrap_storage=self.storage()._untyped(), dtype=self.dtype), self.storage_offset(), @@ -298,11 +330,12 @@ def __setstate__(self, state): # See Note [Don't serialize hooks] self.requires_grad, _, self._backward_hooks = state - def __repr__(self): + def __repr__(self, *, tensor_contents=None): if has_torch_function_unary(self): - return handle_torch_function(Tensor.__repr__, (self,), self) + return handle_torch_function(Tensor.__repr__, (self,), self, + tensor_contents=tensor_contents) # All strings are unicode in Python 3. 
- return torch._tensor_str._str(self) + return torch._tensor_str._str(self, tensor_contents=tensor_contents) def backward(self, gradient=None, retain_graph=None, create_graph=False, inputs=None): r"""Computes the gradient of current tensor w.r.t. graph leaves. @@ -497,6 +530,10 @@ def norm(self, p="fro", dim=None, keepdim=False, dtype=None): return handle_torch_function(Tensor.norm, (self,), self, p=p, dim=dim, keepdim=keepdim, dtype=dtype) return torch.norm(self, p, dim, keepdim, dtype=dtype) + def solve(self, other): + from ._linalg_utils import solve + return solve(self, other) + def lu(self, pivot=True, get_infos=False): r"""See :func:`torch.lu`""" # If get_infos is True, then we don't need to check for errors and vice versa @@ -597,47 +634,37 @@ def unique_consecutive(self, return_inverse=False, return_counts=False, dim=None ) return torch.unique_consecutive(self, return_inverse=return_inverse, return_counts=return_counts, dim=dim) - @_wrap_type_error_to_not_implemented + @_handle_torch_function_and_wrap_type_error_to_not_implemented def __rsub__(self, other): - if has_torch_function_variadic(self, other): - return handle_torch_function(Tensor.__rsub__, (self, other), self, other) return _C._VariableFunctions.rsub(self, other) - @_wrap_type_error_to_not_implemented + @_handle_torch_function_and_wrap_type_error_to_not_implemented def __rdiv__(self, other): - if has_torch_function_variadic(self, other): - return handle_torch_function(Tensor.__rdiv__, (self, other), self, other) return self.reciprocal() * other __rtruediv__ = __rdiv__ __itruediv__ = _C._TensorBase.__idiv__ - __pow__ = _wrap_type_error_to_not_implemented(_C._TensorBase.pow) + __pow__ = _handle_torch_function_and_wrap_type_error_to_not_implemented(_C._TensorBase.pow) + __ipow__ = _handle_torch_function_and_wrap_type_error_to_not_implemented(_C._TensorBase.pow_) - @_wrap_type_error_to_not_implemented + @_handle_torch_function_and_wrap_type_error_to_not_implemented def __rmod__(self, other): - if has_torch_function_variadic(self, other): - return handle_torch_function(Tensor.__rmod__, (self, other), self, other) return torch.remainder(other, self) def __format__(self, format_spec): if has_torch_function_unary(self): return handle_torch_function(Tensor.__format__, (self,), self, format_spec) - if self.dim() == 0: + if self.dim() == 0 and not self.is_meta: return self.item().__format__(format_spec) return object.__format__(self, format_spec) - def __ipow__(self, other): # type: ignore[misc] - if has_torch_function_variadic(self, other): - return handle_torch_function(Tensor.__ipow__, (self, other), self, other) - return NotImplemented - - @_wrap_type_error_to_not_implemented + @_handle_torch_function_and_wrap_type_error_to_not_implemented def __rpow__(self, other): dtype = torch.result_type(other, self) return torch.tensor(other, dtype=dtype, device=self.device) ** self - @_wrap_type_error_to_not_implemented + @_handle_torch_function_and_wrap_type_error_to_not_implemented def __floordiv__(self, other): warnings.warn("__floordiv__ is deprecated, and its behavior will change in a future version of pytorch. " "It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). 
" @@ -646,7 +673,7 @@ def __floordiv__(self, other): "or for actual floor division, use torch.div(a, b, rounding_mode='floor').", stacklevel=3) return torch.div(self, other, rounding_mode='trunc') - @_wrap_type_error_to_not_implemented + @_handle_torch_function_and_wrap_type_error_to_not_implemented def __rfloordiv__(self, other): warnings.warn("__rfloordiv__ is deprecated, and its behavior will change in a future version of pytorch. " "It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). " @@ -655,18 +682,16 @@ def __rfloordiv__(self, other): "or for actual floor division, use torch.div(a, b, rounding_mode='floor').", stacklevel=3) return torch.div(other, self, rounding_mode='trunc') - @_wrap_type_error_to_not_implemented + @_handle_torch_function_and_wrap_type_error_to_not_implemented def __rlshift__(self, other): return torch.bitwise_left_shift(other, self) - @_wrap_type_error_to_not_implemented + @_handle_torch_function_and_wrap_type_error_to_not_implemented def __rrshift__(self, other): return torch.bitwise_right_shift(other, self) - @_wrap_type_error_to_not_implemented + @_handle_torch_function_and_wrap_type_error_to_not_implemented def __rmatmul__(self, other): - if has_torch_function_variadic(self, other): - return handle_torch_function(Tensor.__rmatmul__, (self, other), self, other) return torch.matmul(other, self) __pos__ = _C._TensorBase.positive @@ -830,10 +855,10 @@ def storage_type(self): Returns the type of the underlying storage. """ - # NB: this returns old fashioned TypedStorage, e.g., FloatStorage, as it - # would be pretty pointless otherwise (it would always return - # UntypedStorage) - return type(self.storage()) + if has_torch_function_unary(self): + return handle_torch_function(Tensor.storage_type, (self,), self) + + return self.storage()._get_legacy_storage_class() def refine_names(self, *names): r"""Refines the dimension names of :attr:`self` according to :attr:`names`. @@ -1031,53 +1056,7 @@ def to_sparse_coo(self): 25 """ - if self.is_sparse: - return self - if self.is_sparse_csr: - crow_indices = self.crow_indices() - col_indices = self.col_indices() - indices = torch._convert_indices_from_csr_to_coo(crow_indices, col_indices, - out_int32=crow_indices.dtype == torch.int32) - return torch.sparse_coo_tensor(indices, - self.values(), - size=self.shape, - dtype=self.dtype, - device=self.device) - else: - return self.to_sparse() - - def to_sparse_csr(self): - """ Convert a tensor to compressed row storage format. Only works with 2D tensors. 
- - Examples:: - - >>> dense = torch.randn(5, 5) - >>> sparse = dense.to_sparse_csr() - >>> sparse._nnz() - 25 - - """ - shape = self.size() - fill_value = 0 - if len(shape) != 2: - raise RuntimeError("Only 2D tensors can be converted to the CSR format but got shape: ", shape) - - if self.is_sparse: - coalesced_self = self.coalesce() - row_indices = coalesced_self.indices()[0] - device = coalesced_self.values().device - crow_indices = torch._convert_indices_from_coo_to_csr( - row_indices, self.shape[0], out_int32=row_indices.dtype == torch.int32) - return torch.sparse_csr_tensor(crow_indices, - coalesced_self.indices()[1].contiguous(), - coalesced_self.values(), - size=coalesced_self.shape, - dtype=coalesced_self.dtype, - device=device) - elif self.is_sparse_csr: - return self - else: - return self.to_sparse().to_sparse_csr() + return self.to_sparse() def _update_names(self, names, inplace): if has_torch_function_unary(self): @@ -1145,6 +1124,8 @@ def __torch_function__(cls, func, types, args=(), kwargs=None): else: return _convert(ret, cls) + __torch_dispatch__ = _C._disabled_torch_dispatch_impl + def __dlpack__(self, stream=None): """ Creates a DLpack `capsule https://data-apis.org/array-api/latest/design_topics/data_interchange.html#data-interchange`_ diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index 66ffffec87b5..3fb5f706e1a4 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -1060,6 +1060,24 @@ def add_docstr_all(method, docstr): {memory_format} """.format(**common_args)) +add_docstr_all('ipu', + r""" +ipu(device=None, non_blocking=False, memory_format=torch.preserve_format) -> Tensor + +Returns a copy of this object in IPU memory. + +If this object is already in IPU memory and on the correct device, +then no copy is performed and the original object is returned. + +Args: + device (:class:`torch.device`): The destination IPU device. + Defaults to the current IPU device. + non_blocking (bool): If ``True`` and the source is in pinned memory, + the copy will be asynchronous with respect to the host. + Otherwise, the argument has no effect. Default: ``False``. + {memory_format} +""".format(**common_args)) + add_docstr_all('xpu', r""" xpu(device=None, non_blocking=False, memory_format=torch.preserve_format) -> Tensor @@ -1798,6 +1816,12 @@ def add_docstr_all(method, docstr): length of :attr:`index` (which must be a vector), and all other dimensions must match :attr:`self`, or an error will be raised. +For a 3-D tensor the output is given as:: + + self[index[i], :, :] += alpha * src[i, :, :] # if dim == 0 + self[:, index[i], :] += alpha * src[:, i, :] # if dim == 1 + self[:, :, index[i]] += alpha * src[:, :, i] # if dim == 2 + Note: {forward_reproducibility_note} @@ -1912,6 +1936,73 @@ def add_docstr_all(method, docstr): Out-place version of :meth:`~Tensor.index_put_`. """) +add_docstr_all('index_reduce_', + r""" +index_reduce_(dim, index, source, reduce, *, include_self=True) -> Tensor + +Accumulate the elements of ``source`` into the :attr:`self` +tensor by accumulating to the indices in the order given in :attr:`index` +using the reduction given by the ``reduce`` argument. For example, if ``dim == 0``, +``index[i] == j``, ``reduce == prod`` and ``include_self == True`` then the ``i``\ th +row of ``source`` is multiplied by the ``j``\ th row of :attr:`self`. 
If +:obj:`include_self="True"`, the values in the :attr:`self` tensor are included +in the reduction, otherwise, rows in the :attr:`self` tensor that are accumulated +to are treated as if they were filled with the reduction identites. + +The :attr:`dim`\ th dimension of ``source`` must have the same size as the +length of :attr:`index` (which must be a vector), and all other dimensions must +match :attr:`self`, or an error will be raised. + +For a 3-D tensor with :obj:`reduce="prod"` and :obj:`include_self=True` the +output is given as:: + + self[index[i], :, :] *= src[i, :, :] # if dim == 0 + self[:, index[i], :] *= src[:, i, :] # if dim == 1 + self[:, :, index[i]] *= src[:, :, i] # if dim == 2 + +Note: + {forward_reproducibility_note} + +.. note:: + + This function only supports floating point tensors. + +.. warning:: + + This function is in beta and may change in the near future. + +Args: + dim (int): dimension along which to index + index (Tensor): indices of ``source`` to select from, + should have dtype either `torch.int64` or `torch.int32` + source (FloatTensor): the tensor containing values to accumulate + reduce (str): the reduction operation to apply + (:obj:`"prod"`, :obj:`"mean"`, :obj:`"amax"`, :obj:`"amin"`) + +Keyword args: + include_self (bool): whether the elements from the ``self`` tensor are + included in the reduction + +Example:: + + >>> x = torch.empty(5, 3).fill_(2) + >>> t = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]], dtype=torch.float) + >>> index = torch.tensor([0, 4, 2, 0]) + >>> x.index_reduce_(0, index, t, 'prod') + tensor([[20., 44., 72.], + [ 2., 2., 2.], + [14., 16., 18.], + [ 2., 2., 2.], + [ 8., 10., 12.]]) + >>> x = torch.empty(5, 3).fill_(2) + >>> x.index_reduce_(0, index, t, 'prod', include_self=False) + tensor([[10., 22., 36.], + [ 2., 2., 2.], + [ 7., 8., 9.], + [ 2., 2., 2.], + [ 4., 5., 6.]]) +""".format(**reproducibility_notes)) + add_docstr_all('index_select', r""" index_select(dim, index) -> Tensor @@ -3374,6 +3465,69 @@ def callable(a, b) -> number """.format(**reproducibility_notes)) +add_docstr_all('scatter_reduce_', r""" +scatter_reduce_(dim, index, src, reduce, *, include_self=True) -> Tensor + +Reduces all values from the :attr:`src` tensor to the indices specified in +the :attr:`index` tensor in the :attr:`self` tensor using the applied reduction +defined via the :attr:`reduce` argument (:obj:`"sum"`, :obj:`"prod"`, :obj:`"mean"`, +:obj:`"amax"`, :obj:`"amin"`). For each value in :attr:`src`, it is reduced to an +index in :attr:`self` which is specified by its index in :attr:`src` for +``dimension != dim`` and by the corresponding value in :attr:`index` for +``dimension = dim``. If :obj:`include_self="True"`, the values in the :attr:`self` +tensor are included in the reduction. + +:attr:`self`, :attr:`index` and :attr:`src` should all have +the same number of dimensions. It is also required that +``index.size(d) <= src.size(d)`` for all dimensions ``d``, and that +``index.size(d) <= self.size(d)`` for all dimensions ``d != dim``. +Note that ``index`` and ``src`` do not broadcast. + +For a 3-D tensor with :obj:`reduce="sum"` and :obj:`include_self=True` the +output is given as:: + + self[index[i][j][k]][j][k] += src[i][j][k] # if dim == 0 + self[i][index[i][j][k]][k] += src[i][j][k] # if dim == 1 + self[i][j][index[i][j][k]] += src[i][j][k] # if dim == 2 + +Note: + {forward_reproducibility_note} + +.. note:: + + The backward pass is implemented only for ``src.shape == index.shape``. + +.. 
warning:: + + This function is in beta and may change in the near future. + +Args: + dim (int): the axis along which to index + index (LongTensor): the indices of elements to scatter and reduce. + src (Tensor): the source elements to scatter and reduce + reduce (str): the reduction operation to apply for non-unique indices + (:obj:`"sum"`, :obj:`"prod"`, :obj:`"mean"`, :obj:`"amax"`, :obj:`"amin"`) + include_self (bool): whether elements from the :attr:`self` tensor are + included in the reduction + +Example:: + + >>> src = torch.tensor([1., 2., 3., 4., 5., 6.]) + >>> index = torch.tensor([0, 1, 0, 1, 2, 1]) + >>> input = torch.tensor([1., 2., 3., 4.]) + >>> input.scatter_reduce(0, index, src, reduce="sum") + tensor([5., 14., 8., 4.]) + >>> input.scatter_reduce(0, index, src, reduce="sum", include_self=False) + tensor([4., 12., 5., 4.]) + >>> input2 = torch.tensor([5., 4., 3., 2.]) + >>> input2.scatter_reduce(0, index, src, reduce="amax") + tensor([5., 6., 5., 2.]) + >>> input2.scatter_reduce(0, index, src, reduce="amax", include_self=False) + tensor([3., 6., 5., 2.]) + + +""".format(**reproducibility_notes)) + add_docstr_all('select', r""" select(dim, index) -> Tensor @@ -3540,13 +3694,6 @@ def callable(a, b) -> number """) -add_docstr_all('solve', - r""" -solve(A) -> Tensor, Tensor - -See :func:`torch.solve` -""") - add_docstr_all('sort', r""" sort(dim=-1, descending=False) -> (Tensor, LongTensor) @@ -3972,6 +4119,16 @@ def callable(a, b) -> number {memory_format} """.format(**common_args)) +add_docstr_all('chalf', + r""" +chalf(memory_format=torch.preserve_format) -> Tensor + +``self.chalf()`` is equivalent to ``self.to(torch.complex32)``. See :func:`to`. + +Args: + {memory_format} + """.format(**common_args)) + add_docstr_all('half', r""" half(memory_format=torch.preserve_format) -> Tensor @@ -4140,6 +4297,35 @@ def callable(a, b) -> number size=(3, 3), nnz=1, layout=torch.sparse_coo) """) +add_docstr_all('to_sparse_csr', + r""" +to_sparse_csr() -> Tensor +Convert a tensor to compressed row storage format. Only works with 2D tensors. + +Example:: + + >>> dense = torch.randn(5, 5) + >>> sparse = dense.to_sparse_csr() + >>> sparse._nnz() + 25 + +""") + +add_docstr_all('to_sparse_bsr', + r""" +to_sparse_bsr(blocksize) -> Tensor +Convert a CSR tensor to a block sparse row (BSR) storage format of given blocksize. + +Example:: + + >>> dense = torch.randn(10, 10) + >>> sparse = dense.to_sparse_csr() + >>> sparse_bsr = sparse.to_sparse_bsr((5, 5)) + >>> sparse_bsr.col_indices() + tensor([0, 1, 0, 1]) + +""") + add_docstr_all('to_mkldnn', r""" to_mkldnn() -> Tensor @@ -4746,6 +4932,13 @@ def callable(a, b) -> number Out-of-place version of :meth:`torch.Tensor.scatter_add_` """) +add_docstr_all('scatter_reduce', + r""" +scatter_reduce(dim, index, src, reduce, *, include_self=True) -> Tensor + +Out-of-place version of :meth:`torch.Tensor.scatter_reduce_` +""") + add_docstr_all('masked_scatter', r""" masked_scatter(mask, tensor) -> Tensor @@ -4862,6 +5055,11 @@ def callable(a, b) -> number Is ``True`` if the Tensor is stored on the GPU, ``False`` otherwise. """) +add_docstr_all('is_ipu', + r""" +Is ``True`` if the Tensor is stored on the IPU, ``False`` otherwise. +""") + add_docstr_all('is_xpu', r""" Is ``True`` if the Tensor is stored on the XPU, ``False`` otherwise. 
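The new ``scatter_reduce_`` / ``scatter_reduce`` entries above state the reduction as an indexing formula. As an illustrative, hedged check (not part of this diff; it only assumes the out-of-place ``Tensor.scatter_reduce`` signature documented above), the ``dim == 0`` / ``reduce="sum"`` case can be compared against an explicit loop::

    >>> import torch
    >>> src = torch.tensor([[1., 2.], [3., 4.]])
    >>> index = torch.tensor([[0, 1], [0, 0]])
    >>> out = torch.zeros(2, 2).scatter_reduce(0, index, src, reduce="sum")
    >>> expected = torch.zeros(2, 2)
    >>> for i in range(2):
    ...     for j in range(2):
    ...         # documented formula: self[index[i][j]][j] += src[i][j]  (dim == 0)
    ...         expected[index[i][j]][j] += src[i][j]
    >>> torch.equal(out, expected)
    True

With ``include_self=True`` (the default) the zeros in the destination contribute only the additive identity, so both paths agree.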
diff --git a/torch/_tensor_str.py b/torch/_tensor_str.py index b0bb6e93aaee..b1c53091bf60 100644 --- a/torch/_tensor_str.py +++ b/torch/_tensor_str.py @@ -80,6 +80,9 @@ def set_printoptions( PRINT_OPTS.linewidth = linewidth PRINT_OPTS.sci_mode = sci_mode +def tensor_totype(t): + dtype = torch.float if t.is_mps else torch.double + return t.to(dtype=dtype) class _Formatter(object): def __init__(self, tensor): @@ -104,9 +107,9 @@ def __init__(self, tensor): return # Convert to double for easy calculation. HalfTensor overflows with 1e8, and there's no div() on CPU. - nonzero_finite_abs = nonzero_finite_vals.abs().double() - nonzero_finite_min = nonzero_finite_abs.min().double() - nonzero_finite_max = nonzero_finite_abs.max().double() + nonzero_finite_abs = tensor_totype(nonzero_finite_vals.abs()) + nonzero_finite_min = tensor_totype(nonzero_finite_abs.min()) + nonzero_finite_max = tensor_totype(nonzero_finite_abs.max()) for value in nonzero_finite_vals: if value != torch.ceil(value): @@ -254,6 +257,9 @@ def _tensor_str(self, indent): if self.dtype is torch.float16 or self.dtype is torch.bfloat16: self = self.float() + if self.dtype is torch.complex32: + self = self.cfloat() + if self.dtype.is_complex: # handle the conjugate bit self = self.resolve_conj() @@ -297,10 +303,19 @@ def get_summarized_data(self): else: return torch.stack([get_summarized_data(x) for x in self]) -def _str_intern(inp): - prefix = 'tensor(' +def _str_intern(inp, *, tensor_contents=None): + is_plain_tensor = type(inp) is torch.Tensor or type(inp) is torch.nn.Parameter + if inp.is_nested: + prefix = "nested_tensor(" + elif is_plain_tensor: + prefix = 'tensor(' + else: + prefix = f"{type(inp).__name__}(" indent = len(prefix) suffixes = [] + custom_contents_provided = tensor_contents is not None + if custom_contents_provided: + tensor_str = tensor_contents # This is used to extract the primal value and thus disable the forward AD # within this function. @@ -315,7 +330,8 @@ def _str_intern(inp): # In other cases, we don't have a way to set them as default yet, # and we should always print out device for them. 
if self.device.type != torch._C._get_default_device()\ - or (self.device.type == 'cuda' and torch.cuda.current_device() != self.device.index): + or (self.device.type == 'cuda' and torch.cuda.current_device() != self.device.index)\ + or (self.device.type == 'mps'): suffixes.append('device=\'' + str(self.device) + '\'') # Tensor printing performs tensor operations like slice, indexing, etc to make it in a @@ -332,40 +348,52 @@ def _str_intern(inp): suffixes.append('nnz=' + str(self._nnz())) if not has_default_dtype: suffixes.append('dtype=' + str(self.dtype)) - indices_prefix = 'indices=tensor(' - indices = self._indices().detach() - indices_str = _tensor_str(indices, indent + len(indices_prefix)) - if indices.numel() == 0: - indices_str += ', size=' + str(tuple(indices.shape)) - values_prefix = 'values=tensor(' - values = self._values().detach() - values_str = _tensor_str(values, indent + len(values_prefix)) - if values.numel() == 0: - values_str += ', size=' + str(tuple(values.shape)) - tensor_str = indices_prefix + indices_str + '),\n' + ' ' * indent + values_prefix + values_str + ')' - elif self.is_sparse_csr: + if not custom_contents_provided: + indices_prefix = 'indices=tensor(' + indices = self._indices().detach() + indices_str = _tensor_str(indices, indent + len(indices_prefix)) + if indices.numel() == 0: + indices_str += ', size=' + str(tuple(indices.shape)) + values_prefix = 'values=tensor(' + values = self._values().detach() + values_str = _tensor_str(values, indent + len(values_prefix)) + if values.numel() == 0: + values_str += ', size=' + str(tuple(values.shape)) + tensor_str = indices_prefix + indices_str + '),\n' + ' ' * indent + values_prefix + values_str + ')' + elif self.layout in {torch.sparse_csr, torch.sparse_csc, torch.sparse_bsr, torch.sparse_bsc}: suffixes.append('size=' + str(tuple(self.shape))) suffixes.append('nnz=' + str(self._nnz())) if not has_default_dtype: suffixes.append('dtype=' + str(self.dtype)) - crow_indices_prefix = 'crow_indices=tensor(' - crow_indices = self.crow_indices().detach() - crow_indices_str = _tensor_str(crow_indices, indent + len(crow_indices_prefix)) - if crow_indices.numel() == 0: - crow_indices_str += ', size=' + str(tuple(crow_indices.shape)) - col_indices_prefix = 'col_indices=tensor(' - col_indices = self.col_indices().detach() - col_indices_str = _tensor_str(col_indices, indent + len(col_indices_prefix)) - if col_indices.numel() == 0: - col_indices_str += ', size=' + str(tuple(col_indices.shape)) - values_prefix = 'values=tensor(' - values = self.values().detach() - values_str = _tensor_str(values, indent + len(values_prefix)) - if values.numel() == 0: - values_str += ', size=' + str(tuple(values.shape)) - tensor_str = crow_indices_prefix + crow_indices_str + '),\n' + ' ' * indent +\ - col_indices_prefix + col_indices_str + '),\n' + ' ' * indent +\ - values_prefix + values_str + ')' + if not custom_contents_provided: + compressed_indices_method, plain_indices_method = { + torch.sparse_csr: (torch.Tensor.crow_indices, torch.Tensor.col_indices), + torch.sparse_csc: (torch.Tensor.ccol_indices, torch.Tensor.row_indices), + torch.sparse_bsr: (torch.Tensor.crow_indices, torch.Tensor.col_indices), + torch.sparse_bsc: (torch.Tensor.ccol_indices, torch.Tensor.row_indices), + }[self.layout] + if self.layout in {torch.sparse_csr, torch.sparse_bsr}: + cdimname, pdimname = 'row', 'column' + else: + cdimname, pdimname = 'column', 'row' + compressed_indices_prefix = f'c{cdimname[:3]}_indices=tensor(' + compressed_indices = 
compressed_indices_method(self).detach() + compressed_indices_str = _tensor_str(compressed_indices, indent + len(compressed_indices_prefix)) + if compressed_indices.numel() == 0: + compressed_indices_str += ', size=' + str(tuple(compressed_indices.shape)) + plain_indices_prefix = f'{pdimname[:3]}_indices=tensor(' + plain_indices = plain_indices_method(self).detach() + plain_indices_str = _tensor_str(plain_indices, indent + len(plain_indices_prefix)) + if plain_indices.numel() == 0: + plain_indices_str += ', size=' + str(tuple(plain_indices.shape)) + values_prefix = 'values=tensor(' + values = self.values().detach() + values_str = _tensor_str(values, indent + len(values_prefix)) + if values.numel() == 0: + values_str += ', size=' + str(tuple(values.shape)) + tensor_str = compressed_indices_prefix + compressed_indices_str + '),\n' + ' ' * indent +\ + plain_indices_prefix + plain_indices_str + '),\n' + ' ' * indent +\ + values_prefix + values_str + ')' elif self.is_quantized: suffixes.append('size=' + str(tuple(self.shape))) if not has_default_dtype: @@ -379,7 +407,14 @@ def _str_intern(inp): suffixes.append('scale=' + str(self.q_per_channel_scales())) suffixes.append('zero_point=' + str(self.q_per_channel_zero_points())) suffixes.append('axis=' + str(self.q_per_channel_axis())) - tensor_str = _tensor_str(self.dequantize(), indent) + if not custom_contents_provided: + tensor_str = _tensor_str(self.dequantize(), indent) + elif self.is_nested: + if not custom_contents_provided: + def indented_str(s, indent): + return "\n".join(f" {line}" for line in s.split("\n")) + strs = ",\n".join(indented_str(str(t), indent + 1) for t in torch.ops.aten.unbind.int(self, 0)) + tensor_str = f"[\n{strs}\n]" else: if self.is_meta: suffixes.append('size=' + str(tuple(self.shape))) @@ -387,7 +422,8 @@ def _str_intern(inp): suffixes.append('dtype=' + str(self.dtype)) # TODO: This implies that ellipses is valid syntax for allocating # a meta tensor, which it could be, but it isn't right now - tensor_str = '...' + if not custom_contents_provided: + tensor_str = '...' else: if self.numel() == 0 and not self.is_sparse: # Explicitly print the shape if it is not (0,), to match NumPy behavior @@ -398,15 +434,17 @@ def _str_intern(inp): # should be int64, so it must be shown explicitly. if self.dtype != torch.get_default_dtype(): suffixes.append('dtype=' + str(self.dtype)) - tensor_str = '[]' + if not custom_contents_provided: + tensor_str = '[]' else: if not has_default_dtype: suffixes.append('dtype=' + str(self.dtype)) - if self.layout != torch.strided: - tensor_str = _tensor_str(self.to_dense(), indent) - else: - tensor_str = _tensor_str(self, indent) + if not custom_contents_provided: + if self.layout != torch.strided: + tensor_str = _tensor_str(self.to_dense(), indent) + else: + tensor_str = _tensor_str(self, indent) if self.layout != torch.strided: suffixes.append('layout=' + str(self.layout)) @@ -427,8 +465,17 @@ def _str_intern(inp): if tangent is not None: suffixes.append('tangent={}'.format(tangent)) - return _add_suffixes(prefix + tensor_str, suffixes, indent, force_newline=self.is_sparse) + string_repr = _add_suffixes(prefix + tensor_str, suffixes, indent, force_newline=self.is_sparse) + + # Check if this instance is flagged as a parameter and change the repr accordingly. + # Unfortunately, this function has to be aware of this detail. + # NB: This is currently skipped for plain tensor parameters to maintain BC. In the future, + # this should be done for those as well to produce a valid repr. 
+ if isinstance(self, torch.nn.Parameter) and not is_plain_tensor: + string_repr = f"Parameter({string_repr})" + + return string_repr -def _str(self): +def _str(self, *, tensor_contents=None): with torch.no_grad(): - return _str_intern(self) + return _str_intern(self, tensor_contents=tensor_contents) diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 39433bda3482..620e78ad43e3 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -111,6 +111,10 @@ def merge_dicts(*dicts): "tf32_note": """This operator supports :ref:`TensorFloat32`.""" } +rocm_fp16_notes = { + "rocm_fp16_note": """On certain ROCm devices, when using float16 inputs this module will use \ +:ref:`different precision` for backward.""" +} reproducibility_notes = { "forward_reproducibility_note": """This operation may behave nondeterministically when given tensors on \ @@ -225,6 +229,12 @@ def merge_dicts(*dicts): See :meth:`~Tensor.index_add_` for function description. """) +add_docstr(torch.index_reduce, r""" +index_reduce(input, dim, index, source, reduce, *, include_self=True, out=None) -> Tensor + +See :meth:`~Tensor.index_reduce_` for function description. +""") + add_docstr(torch.add, r""" add(input, other, *, alpha=1, out=None) -> Tensor @@ -298,6 +308,8 @@ def merge_dicts(*dicts): {tf32_note} +{rocm_fp16_note} + Args: batch1 (Tensor): the first batch of matrices to be multiplied batch2 (Tensor): the second batch of matrices to be multiplied @@ -317,7 +329,7 @@ def merge_dicts(*dicts): tensor([[ 6.6311, 0.0503, 6.9768, -12.0362, -2.1653], [ -4.8185, -1.4255, -6.6760, 8.9453, 2.5743], [ -3.8202, 4.3691, 1.0943, -1.1109, 5.4730]]) -""".format(**common_args, **tf32_notes)) +""".format(**common_args, **tf32_notes, **rocm_fp16_notes)) add_docstr(torch.addcdiv, r""" addcdiv(input, tensor1, tensor2, *, value=1, out=None) -> Tensor @@ -427,6 +439,8 @@ def merge_dicts(*dicts): {tf32_note} +{rocm_fp16_note} + Args: input (Tensor): matrix to be added mat1 (Tensor): the first matrix to be matrix multiplied @@ -445,7 +459,7 @@ def merge_dicts(*dicts): >>> torch.addmm(M, mat1, mat2) tensor([[-4.8716, 1.4671, -1.3746], [ 0.7573, -3.9555, -2.8681]]) -""".format(**common_args, **tf32_notes)) +""".format(**common_args, **tf32_notes, **rocm_fp16_notes)) add_docstr(torch.adjoint, r""" @@ -1031,9 +1045,6 @@ def merge_dicts(*dicts): CPU device, and not share its memory. .. seealso:: - :func:`torch.as_tensor` creates a tensor that always shares memory if the input is a - tensor or a NumPy array, copying otherwise. - :func:`torch.tensor` creates a tensor that always copies the data from the input object. :func:`torch.from_numpy` creates a tensor that always shares memory from NumPy arrays. @@ -1041,7 +1052,7 @@ def merge_dicts(*dicts): :func:`torch.frombuffer` creates a tensor that always shares memory from objects that implement the buffer protocol. - :func:`torch.utils.dlpack.from_dlpack` creates a tensor that always shares memory from + :func:`torch.from_dlpack` creates a tensor that always shares memory from DLPack capsules. 
Args: @@ -1130,6 +1141,8 @@ def merge_dicts(*dicts): {tf32_note} +{rocm_fp16_note} + Args: input (Tensor): the tensor to be added batch1 (Tensor): the first batch of matrices to be multiplied @@ -1147,7 +1160,7 @@ def merge_dicts(*dicts): >>> batch2 = torch.randn(10, 4, 5) >>> torch.baddbmm(M, batch1, batch2).size() torch.Size([10, 3, 5]) -""".format(**common_args, **tf32_notes)) +""".format(**common_args, **tf32_notes, **rocm_fp16_notes)) add_docstr(torch.bernoulli, r""" @@ -1284,6 +1297,8 @@ def merge_dicts(*dicts): """ + r""" {tf32_note} +{rocm_fp16_note} + .. note:: This function does not :ref:`broadcast `. For broadcasting matrix products, see :func:`torch.matmul`. @@ -1301,7 +1316,7 @@ def merge_dicts(*dicts): >>> res = torch.bmm(input, mat2) >>> res.size() torch.Size([10, 3, 5]) -""".format(**common_args, **tf32_notes)) +""".format(**common_args, **tf32_notes, **rocm_fp16_notes)) add_docstr(torch.bitwise_and, r""" @@ -1374,12 +1389,14 @@ def merge_dicts(*dicts): bitwise_left_shift(input, other, *, out=None) -> Tensor Computes the left arithmetic shift of :attr:`input` by :attr:`other` bits. -The result will have the same dtype as :attr:`input`. +The input tensor must be of integral type. This operator supports +:ref:`broadcasting to a common shape ` and +:ref:`type promotion `. The operation applied is: .. math:: - \text{{out}}_i = \text{{input}}_i \times 2 ^ {{\text{{other}}_i}} + \text{{out}}_i = \text{{input}}_i << \text{{other}}_i Args: input (Tensor or Scalar): the first input tensor @@ -1399,12 +1416,14 @@ def merge_dicts(*dicts): bitwise_right_shift(input, other, *, out=None) -> Tensor Computes the right arithmetic shift of :attr:`input` by :attr:`other` bits. -The result will have the same dtype as :attr:`input`. +The input tensor must be of integral type. This operator supports +:ref:`broadcasting to a common shape ` and +:ref:`type promotion `. The operation applied is: .. math:: - \text{{out}}_i = \text{{input}}_i / 2 ^ {{\text{{other}}_i}} + \text{{out}}_i = \text{{input}}_i >> \text{{other}}_i Args: input (Tensor or Scalar): the first input tensor @@ -4320,77 +4339,6 @@ def merge_dicts(*dicts): Use :func:`torch.outer` instead. """) -add_docstr(torch.solve, - r""" -torch.solve(input, A, *, out=None) -> (Tensor, Tensor) - -This function returns the solution to the system of linear -equations represented by :math:`AX = B` and the LU factorization of -A, in order as a namedtuple `solution, LU`. - -`LU` contains `L` and `U` factors for LU factorization of `A`. - -`torch.solve(B, A)` can take in 2D inputs `B, A` or inputs that are -batches of 2D matrices. If the inputs are batches, then returns -batched outputs `solution, LU`. - -Supports real-valued and complex-valued inputs. - -.. warning:: - - :func:`torch.solve` is deprecated in favor of :func:`torch.linalg.solve` - and will be removed in a future PyTorch release. - :func:`torch.linalg.solve` has its arguments reversed and does not return the - LU factorization of the input. To get the LU factorization see :func:`torch.lu`, - which may be used with :func:`torch.lu_solve` and :func:`torch.lu_unpack`. - - ``X = torch.solve(B, A).solution`` should be replaced with - - .. code:: python - - X = torch.linalg.solve(A, B) - -.. note:: - - Irrespective of the original strides, the returned matrices - `solution` and `LU` will be transposed, i.e. with strides like - `B.contiguous().mT.stride()` and - `A.contiguous().mT.stride()` respectively. 
- -Args: - input (Tensor): input matrix :math:`B` of size :math:`(*, m, k)` , where :math:`*` - is zero or more batch dimensions. - A (Tensor): input square matrix of size :math:`(*, m, m)`, where - :math:`*` is zero or more batch dimensions. - -Keyword args: - out ((Tensor, Tensor), optional): optional output tuple. - -Example:: - - >>> A = torch.tensor([[6.80, -2.11, 5.66, 5.97, 8.23], - ... [-6.05, -3.30, 5.36, -4.44, 1.08], - ... [-0.45, 2.58, -2.70, 0.27, 9.04], - ... [8.32, 2.71, 4.35, -7.17, 2.14], - ... [-9.67, -5.14, -7.26, 6.08, -6.87]]).t() - >>> B = torch.tensor([[4.02, 6.19, -8.22, -7.57, -3.03], - ... [-1.56, 4.00, -8.67, 1.75, 2.86], - ... [9.81, -4.09, -4.57, -8.61, 8.99]]).t() - >>> X, LU = torch.solve(B, A) - >>> torch.dist(B, torch.mm(A, X)) - tensor(1.00000e-06 * - 7.0977) - - >>> # Batched solver example - >>> A = torch.randn(2, 3, 1, 4, 4) - >>> B = torch.randn(2, 3, 1, 4, 6) - >>> X, LU = torch.solve(B, A) - >>> torch.dist(B, A.matmul(X)) - tensor(1.00000e-06 * - 3.6386) - -""") - add_docstr(torch.get_default_dtype, r""" get_default_dtype() -> torch.dtype @@ -4529,6 +4477,98 @@ def merge_dicts(*dicts): (tensor([ 0., 0.9524, 0.3810, 0.]), tensor([0., 0.75, 1.5, 2.25, 3.])) """.format(**common_args)) +add_docstr(torch.histogramdd, + r""" +histogramdd(input, bins, *, range=None, weight=None, density=False, out=None) -> (Tensor, Tensor[]) + +Computes a multi-dimensional histogram of the values in a tensor. + +Interprets the elements of an input tensor whose innermost dimension has size N +as a collection of N-dimensional points. Maps each of the points into a set of +N-dimensional bins and returns the number of points (or total weight) in each bin. + +:attr:`input` must be a tensor with at least 2 dimensions. +If input has shape (M, N), each of its M rows defines a point in N-dimensional space. +If input has three or more dimensions, all but the last dimension are flattened. + +Each dimension is independently associated with its own strictly increasing sequence +of bin edges. Bin edges may be specified explicitly by passing a sequence of 1D +tensors. Alternatively, bin edges may be constructed automatically by passing a +sequence of integers specifying the number of equal-width bins in each dimension. + +For each N-dimensional point in input: + - Each of its coordinates is binned independently among the bin edges + corresponding to its dimension + - Binning results are combined to identify the N-dimensional bin (if any) + into which the point falls + - If the point falls into a bin, the bin's count (or total weight) is incremented + - Points which do not fall into any bin do not contribute to the output + +:attr:`bins` can be a sequence of N 1D tensors, a sequence of N ints, or a single int. + +If :attr:`bins` is a sequence of N 1D tensors, it explicitly specifies the N sequences +of bin edges. Each 1D tensor should contain a strictly increasing sequence with at +least one element. A sequence of K bin edges defines K-1 bins, explicitly specifying +the left and right edges of all bins. Every bin is exclusive of its left edge. Only +the rightmost bin is inclusive of its right edge. + +If :attr:`bins` is a sequence of N ints, it specifies the number of equal-width bins +in each dimension. By default, the leftmost and rightmost bin edges in each dimension +are determined by the minimum and maximum elements of the input tensor in the +corresponding dimension. 
The :attr:`range` argument can be provided to manually +specify the leftmost and rightmost bin edges in each dimension. + +If :attr:`bins` is an int, it specifies the number of equal-width bins for all dimensions. + +.. note:: + See also :func:`torch.histogram`, which specifically computes 1D histograms. + While :func:`torch.histogramdd` infers the dimensionality of its bins and + binned values from the shape of :attr:`input`, :func:`torch.histogram` + accepts and flattens :attr:`input` of any shape. + +Args: + {input} + bins: Tensor[], int[], or int. + If Tensor[], defines the sequences of bin edges. + If int[], defines the number of equal-width bins in each dimension. + If int, defines the number of equal-width bins for all dimensions. +Keyword args: + range (sequence of float): Defines the leftmost and rightmost bin edges + in each dimension. + weight (Tensor): By default, each value in the input has weight 1. If a weight + tensor is passed, each N-dimensional coordinate in input + contributes its associated weight towards its bin's result. + The weight tensor should have the same shape as the :attr:`input` + tensor excluding its innermost dimension N. + density (bool): If False (default), the result will contain the count (or total weight) + in each bin. If True, each count (weight) is divided by the total count + (total weight), then divided by the volume of its associated bin. +Returns: + hist (Tensor): N-dimensional Tensor containing the values of the histogram. + bin_edges(Tensor[]): sequence of N 1D Tensors containing the bin edges. + +Example:: + >>> torch.histogramdd(torch.tensor([[0., 1.], [1., 0.], [2., 0.], [2., 2.]]), bins=[3, 3], + ... weight=torch.tensor([1., 2., 4., 8.])) + torch.return_types.histogramdd( + hist=tensor([[0., 1., 0.], + [2., 0., 0.], + [4., 0., 8.]]), + bin_edges=(tensor([0.0000, 0.6667, 1.3333, 2.0000]), + tensor([0.0000, 0.6667, 1.3333, 2.0000]))) + + >>> torch.histogramdd(torch.tensor([[0., 0.], [1., 1.], [2., 2.]]), bins=[2, 2], + ... range=[0., 1., 0., 1.], density=True) + torch.return_types.histogramdd( + hist=tensor([[2., 0.], + [0., 2.]]), + bin_edges=(tensor([0.0000, 0.5000, 1.0000]), + tensor([0.0000, 0.5000, 1.0000]))) + +""") +# TODO: Fix via https://github.com/pytorch/pytorch/issues/75798 +torch.histogramdd.__module__ = "torch" + add_docstr(torch.hypot, r""" hypot(input, other, *, out=None) -> Tensor @@ -6686,6 +6726,8 @@ def merge_dicts(*dicts): {tf32_note} +{rocm_fp16_note} + Args: input (Tensor): the first matrix to be matrix multiplied mat2 (Tensor): the second matrix to be matrix multiplied @@ -6700,7 +6742,7 @@ def merge_dicts(*dicts): >>> torch.mm(mat1, mat2) tensor([[ 0.4851, 0.5037, -0.3633], [-0.0760, -3.6705, 2.4784]]) -""".format(**common_args, **tf32_notes)) +""".format(**common_args, **tf32_notes, **rocm_fp16_notes)) add_docstr(torch.hspmm, r""" @@ -6752,6 +6794,8 @@ def merge_dicts(*dicts): {tf32_note} +{rocm_fp16_note} + .. note:: The 1-dimensional dot product version of this function does not support an :attr:`out` parameter. 
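The ``histogramdd`` entry above also accepts a single int for ``bins``, a form its examples do not show. A minimal hedged sketch (output shown as expected from the bin-edge conventions described there, not copied from this diff)::

    >>> points = torch.tensor([[0., 0.], [1., 1.]])
    >>> hist, edges = torch.histogramdd(points, bins=2)
    >>> hist
    tensor([[1., 0.],
            [0., 1.]])
    >>> edges[0]
    tensor([0.0000, 0.5000, 1.0000])

Each dimension gets two equal-width bins spanning that dimension's minimum and maximum input values, so each point lands on the diagonal of the 2x2 histogram.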
@@ -6791,7 +6835,7 @@ def merge_dicts(*dicts): >>> torch.matmul(tensor1, tensor2).size() torch.Size([10, 3, 5]) -""".format(**common_args, **tf32_notes)) +""".format(**common_args, **tf32_notes, **rocm_fp16_notes)) add_docstr(torch.mode, r""" @@ -8547,6 +8591,12 @@ def merge_dicts(*dicts): Out-of-place version of :meth:`torch.Tensor.scatter_add_` """) +add_docstr(torch.scatter_reduce, r""" +scatter_reduce(input, dim, index, src, reduce, *, include_self=True) -> Tensor + +Out-of-place version of :meth:`torch.Tensor.scatter_reduce_` +""") + add_docstr(torch.select, r""" select(input, dim, index) -> Tensor @@ -8956,6 +9006,68 @@ def merge_dicts(*dicts): [-0.0881, 0.4370, 0.2275, 1.0284]]) """.format(**common_args)) +add_docstr(torch.sparse_compressed_tensor, + r""" +sparse_compressed_tensor(compressed_indices, plain_indices, values, size=None, + *, dtype=None, layout=None, device=None, requires_grad=False) -> Tensor + +Constructs a :ref:`sparse tensor in Compressed Sparse format - CSR, +CSC, BSR, or BSC - ` with specified values at the +given :attr:`compressed_indices` and :attr:`plain_indices`. Sparse +matrix multiplication operations in Compressed Sparse format are +typically faster than that for sparse tensors in COO format. Make you +have a look at :ref:`the note on the data type of the indices +`. + +Args: + compressed_indices (array_like): One-dimensional array of size + size[cdim] + 1 where cdim is 0 or 1 depending on the layout. + The last element is the number of non-zeros. This tensor + encodes the index in values and plain_indices depending on + where the given compressed dimension (row or column) + starts. Each successive number in the tensor subtracted by the + number before it denotes the number of elements in a given + compressed dimension. + plain_indices (array_like): Plain dimension (column or row) + co-ordinates of each element in values. Strictly one + dimensional tensor with the same length as values. + values (array_list): Initial values for the tensor. Can be a list, + tuple, NumPy ``ndarray``, scalar, and other types. For block + sparse formats, the dimensionality of values must be two plus + the dimensionality of plain_indices. + size (list, tuple, :class:`torch.Size`, optional): Size of the + sparse tensor. If not provided, the size will be inferred as + the minimum size big enough to hold all non-zero elements. + +Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of + returned tensor. Default: if None, infers data type from + :attr:`values`. + layout (:class:`torch.layout`, required): the desired layout of + returned tensor: :attr:`torch.sparse_csr`, + :attr:`torch.sparse_csc`, :attr:`torch.sparse_bsr`, or + :attr:`torch.sparse_bsc`. + device (:class:`torch.device`, optional): the desired device of + returned tensor. Default: if None, uses the current device + for the default tensor type (see + :func:`torch.set_default_tensor_type`). :attr:`device` will be + the CPU for CPU tensor types and the current CUDA device for + CUDA tensor types. + {requires_grad} + +Example:: + >>> compressed_indices = [0, 2, 4] + >>> plain_indices = [0, 1, 0, 1] + >>> values = [1, 2, 3, 4] + >>> torch.sparse_compressed_tensor(torch.tensor(compressed_indices, dtype=torch.int64), + ... torch.tensor(plain_indices, dtype=torch.int64), + ... 
torch.tensor(values), dtype=torch.double, layout=torch.sparse_csr) + tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csr) +""".format(**factory_common_args)) + add_docstr(torch.sparse_csr_tensor, r""" sparse_csr_tensor(crow_indices, col_indices, values, size=None, *, dtype=None, device=None, requires_grad=False) -> Tensor @@ -8966,27 +9078,34 @@ def merge_dicts(*dicts): at :ref:`the note on the data type of the indices `. Args: - crow_indices (array_like): One-dimensional array of size size[0] + 1. The last element - is the number of non-zeros. This tensor encodes the index in values and col_indices - depending on where the given row starts. Each successive number in the tensor - subtracted by the number before it denotes the number of elements in a given row. - col_indices (array_like): Column co-ordinates of each element in values. Strictly one - dimensional tensor with the same length as values. - values (array_list): Initial values for the tensor. Can be a list, tuple, NumPy ``ndarray``, scalar, - and other types. - size (list, tuple, :class:`torch.Size`, optional): Size of the sparse tensor. If not provided, the - size will be inferred as the minimum size big enough to hold all non-zero elements. - -Keyword args: - dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. - Default: if None, infers data type from :attr:`values`. - device (:class:`torch.device`, optional): the desired device of returned tensor. - Default: if None, uses the current device for the default tensor type - (see :func:`torch.set_default_tensor_type`). :attr:`device` will be the CPU - for CPU tensor types and the current CUDA device for CUDA tensor types. + crow_indices (array_like): One-dimensional array of size size[0] + 1. + The last element is the number of non-zeros. This tensor + encodes the index in values and col_indices depending on where + the given row starts. Each successive number in the tensor + subtracted by the number before it denotes the number of + elements in a given row. + col_indices (array_like): Column co-ordinates of each element in + values. Strictly one dimensional tensor with the same length + as values. + values (array_list): Initial values for the tensor. Can be a list, + tuple, NumPy ``ndarray``, scalar, and other types. + size (list, tuple, :class:`torch.Size`, optional): Size of the + sparse tensor. If not provided, the size will be inferred as + the minimum size big enough to hold all non-zero elements. + +Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of + returned tensor. Default: if None, infers data type from + :attr:`values`. + device (:class:`torch.device`, optional): the desired device of + returned tensor. Default: if None, uses the current device + for the default tensor type (see + :func:`torch.set_default_tensor_type`). :attr:`device` will be + the CPU for CPU tensor types and the current CUDA device for + CUDA tensor types. 
{requires_grad} -Example :: +Example:: >>> crow_indices = [0, 2, 4] >>> col_indices = [0, 1, 0, 1] >>> values = [1, 2, 3, 4] @@ -8999,6 +9118,173 @@ def merge_dicts(*dicts): dtype=torch.float64, layout=torch.sparse_csr) """.format(**factory_common_args)) +add_docstr(torch.sparse_csc_tensor, + r""" +sparse_csc_tensor(ccol_indices, row_indices, values, size=None, *, dtype=None, device=None, requires_grad=False) -> Tensor + +Constructs a :ref:`sparse tensor in CSC (Compressed Sparse Column) +` with specified values at the given +:attr:`ccol_indices` and :attr:`row_indices`. Sparse matrix +multiplication operations in CSC format are typically faster than that +for sparse tensors in COO format. Make you have a look at :ref:`the +note on the data type of the indices `. + +Args: + ccol_indices (array_like): One-dimensional array of size size[1] + 1. + The last element is the number of non-zeros. This tensor + encodes the index in values and row_indices depending on where + the given column starts. Each successive number in the tensor + subtracted by the number before it denotes the number of + elements in a given column. + row_indices (array_like): Row co-ordinates of each element in + values. Strictly one dimensional tensor with the same length + as values. + values (array_list): Initial values for the tensor. Can be a list, + tuple, NumPy ``ndarray``, scalar, and other types. + size (list, tuple, :class:`torch.Size`, optional): Size of the + sparse tensor. If not provided, the size will be inferred as + the minimum size big enough to hold all non-zero elements. + +Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of + returned tensor. Default: if None, infers data type from + :attr:`values`. + device (:class:`torch.device`, optional): the desired device of + returned tensor. Default: if None, uses the current device + for the default tensor type (see + :func:`torch.set_default_tensor_type`). :attr:`device` will be + the CPU for CPU tensor types and the current CUDA device for + CUDA tensor types. + {requires_grad} + +Example:: + >>> ccol_indices = [0, 2, 4] + >>> row_indices = [0, 1, 0, 1] + >>> values = [1, 2, 3, 4] + >>> torch.sparse_csc_tensor(torch.tensor(ccol_indices, dtype=torch.int64), + ... torch.tensor(row_indices, dtype=torch.int64), + ... torch.tensor(values), dtype=torch.double) + tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csc) +""".format(**factory_common_args)) + +add_docstr(torch.sparse_bsr_tensor, + r""" +sparse_bsr_tensor(crow_indices, col_indices, values, size=None, *, dtype=None, device=None, requires_grad=False) -> Tensor + +Constructs a :ref:`sparse tensor in BSR (Block Compressed Sparse Row)) +` with specified 2-dimensional blocks at the given +:attr:`crow_indices` and :attr:`col_indices`. Sparse matrix +multiplication operations in BSR format are typically faster than that +for sparse tensors in COO format. Make you have a look at :ref:`the +note on the data type of the indices `. + +Args: + crow_indices (array_like): One-dimensional array of size size[0] + + 1. The last element is the number of non-zeros. This tensor + encodes the index in values and col_indices depending on where + the given row starts. Each successive number in the tensor + subtracted by the number before it denotes the number of + blocks in a given row. + col_indices (array_like): Column co-ordinates of each block in + values. 
Strictly one dimensional tensor with the same length + as values. + values (array_list): Initial values for the tensor. Can be a list, + tuple, NumPy ``ndarray``, scalar, and other types. The + dimensionality of values must be two plus the dimensionality + of col_indices. + size (list, tuple, :class:`torch.Size`, optional): Size of the + sparse tensor. If not provided, the size will be inferred as + the minimum size big enough to hold all non-zero blocks. + +Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of + returned tensor. Default: if None, infers data type from + :attr:`values`. + device (:class:`torch.device`, optional): the desired device of + returned tensor. Default: if None, uses the current device + for the default tensor type (see + :func:`torch.set_default_tensor_type`). :attr:`device` will be + the CPU for CPU tensor types and the current CUDA device for + CUDA tensor types. + {requires_grad} + +Example:: + >>> crow_indices = [0, 1, 2] + >>> col_indices = [0, 1] + >>> values = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]] + >>> torch.sparse_bsr_tensor(torch.tensor(crow_indices, dtype=torch.int64), + ... torch.tensor(col_indices, dtype=torch.int64), + ... torch.tensor(values), dtype=torch.double) + tensor(crow_indices=tensor([0, 1, 2]), + col_indices=tensor([0, 1]), + values=tensor([[[1., 2.], + [3., 4.]], + + [[5., 6.], + [7., 8.]]]), size=(2, 2), nnz=2, dtype=torch.float64, + layout=torch.sparse_bsr) +""".format(**factory_common_args)) + +add_docstr(torch.sparse_bsc_tensor, + r""" +sparse_bsc_tensor(ccol_indices, row_indices, values, size=None, *, dtype=None, device=None, requires_grad=False) -> Tensor + +Constructs a :ref:`sparse tensor in BSC (Block Compressed Sparse +Column)) ` with specified 2-dimensional blocks at the +given :attr:`ccol_indices` and :attr:`row_indices`. Sparse matrix +multiplication operations in BSC format are typically faster than that +for sparse tensors in COO format. Make you have a look at :ref:`the +note on the data type of the indices `. + +Args: + ccol_indices (array_like): One-dimensional array of size size[1] + + 1. The last element is the number of non-zeros. This tensor + encodes the index in values and row_indices depending on where + the given column starts. Each successive number in the tensor + subtracted by the number before it denotes the number of + elements in a given column. + row_indices (array_like): Row co-ordinates of each element in + values. Strictly one dimensional tensor with the same length + as values. + values (array_list): Initial blocks for the tensor. Can be a list, + tuple, NumPy ``ndarray``, and other types. The dimensionality + of values must be two plus the dimensionality of row_indices. + size (list, tuple, :class:`torch.Size`, optional): Size of the + sparse tensor. If not provided, the size will be inferred as + the minimum size big enough to hold all non-zero blocks. + +Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of + returned tensor. Default: if None, infers data type from + :attr:`values`. + device (:class:`torch.device`, optional): the desired device of + returned tensor. Default: if None, uses the current device + for the default tensor type (see + :func:`torch.set_default_tensor_type`). :attr:`device` will be + the CPU for CPU tensor types and the current CUDA device for + CUDA tensor types. 
+ {requires_grad} + +Example:: + >>> ccol_indices = [0, 1, 2] + >>> row_indices = [0, 1] + >>> values = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]] + >>> torch.sparse_bsc_tensor(torch.tensor(ccol_indices, dtype=torch.int64), + ... torch.tensor(row_indices, dtype=torch.int64), + ... torch.tensor(values), dtype=torch.double) + tensor(ccol_indices=tensor([0, 1, 2]), + row_indices=tensor([0, 1]), + values=tensor([[[1., 2.], + [3., 4.]], + + [[5., 6.], + [7., 8.]]]), size=(2, 2), nnz=2, dtype=torch.float64, + layout=torch.sparse_bsc) +""".format(**factory_common_args)) + add_docstr(torch.sparse_coo_tensor, r""" sparse_coo_tensor(indices, values, size=None, *, dtype=None, device=None, requires_grad=False) -> Tensor @@ -9747,10 +10033,10 @@ def merge_dicts(*dicts): r""" roll(input, shifts, dims=None) -> Tensor -Roll the tensor along the given dimension(s). Elements that are shifted beyond the -last position are re-introduced at the first position. If a dimension is not -specified, the tensor will be flattened before rolling and then restored -to the original shape. +Roll the tensor :attr:`input` along the given dimension(s). Elements that are +shifted beyond the last position are re-introduced at the first position. If +:attr:`dims` is `None`, the tensor will be flattened before rolling and then +restored to the original shape. Args: {input} @@ -9768,6 +10054,11 @@ def merge_dicts(*dicts): [3, 4], [5, 6], [7, 8]]) + >>> torch.roll(x, 1) + tensor([[8, 1], + [2, 3], + [4, 5], + [6, 7]]) >>> torch.roll(x, 1, 0) tensor([[7, 8], [1, 2], @@ -10760,12 +11051,6 @@ def merge_dicts(*dicts): .. note:: The tensors :attr:`condition`, :attr:`x`, :attr:`y` must be :ref:`broadcastable `. -.. note:: - Currently valid scalar and tensor combination are - 1. Scalar of floating dtype and torch.double - 2. Scalar of integral dtype and torch.long - 3. Scalar of complex dtype and torch.complex128 - Arguments: condition (BoolTensor): When True (nonzero), yield x, otherwise yield y x (Tensor or Scalar): value (if :attr:`x` is a scalar) or values selected at indices @@ -11955,3 +12240,141 @@ def merge_dicts(*dicts): tensor([[2, 3, 5], [2, 3, 5]]) """) + +add_docstr(torch.view_as_real_copy, + r""" +Performs the same operation as :func:`torch.view_as_real`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.view_as_complex_copy, + r""" +Performs the same operation as :func:`torch.view_as_complex`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.as_strided_copy, + r""" +Performs the same operation as :func:`torch.as_strided`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.diagonal_copy, + r""" +Performs the same operation as :func:`torch.diagonal`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.expand_copy, + r""" +Performs the same operation as :func:`torch.expand`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.permute_copy, + r""" +Performs the same operation as :func:`torch.permute`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.select_copy, + r""" +Performs the same operation as :func:`torch.select`, but all output tensors +are freshly created instead of aliasing the input. 
+""") + +add_docstr(torch.detach_copy, + r""" +Performs the same operation as :func:`torch.detach`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.slice_copy, + r""" +Performs the same operation as :func:`torch.slice`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.split_copy, + r""" +Performs the same operation as :func:`torch.split`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.split_with_sizes_copy, + r""" +Performs the same operation as :func:`torch.split_with_sizes`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.squeeze_copy, + r""" +Performs the same operation as :func:`torch.squeeze`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.t_copy, + r""" +Performs the same operation as :func:`torch.t`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.transpose_copy, + r""" +Performs the same operation as :func:`torch.transpose`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.unsqueeze_copy, + r""" +Performs the same operation as :func:`torch.unsqueeze`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.indices_copy, + r""" +Performs the same operation as :func:`torch.indices`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.values_copy, + r""" +Performs the same operation as :func:`torch.values`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.crow_indices_copy, + r""" +Performs the same operation as :func:`torch.crow_indices`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.col_indices_copy, + r""" +Performs the same operation as :func:`torch.col_indices`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.unbind_copy, + r""" +Performs the same operation as :func:`torch.unbind`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.view_copy, + r""" +Performs the same operation as :func:`torch.view`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.unfold_copy, + r""" +Performs the same operation as :func:`torch.unfold`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.alias_copy, + r""" +Performs the same operation as :func:`torch.alias`, but all output tensors +are freshly created instead of aliasing the input. 
+""") diff --git a/torch/_utils.py b/torch/_utils.py index 862727731419..e19baa0c2684 100644 --- a/torch/_utils.py +++ b/torch/_utils.py @@ -128,7 +128,7 @@ def _get_async_or_non_blocking(function_name, non_blocking, kwargs): # TODO: Once we decide to break serialization FC, `storage` no longer needs to -# be a TypedStorage +# be a _TypedStorage def _rebuild_tensor(storage, storage_offset, size, stride): # first construct a tensor with the correct dtype/device t = torch.tensor([], dtype=storage.dtype, device=storage._untyped().device) @@ -202,15 +202,20 @@ def _rebuild_device_tensor_from_numpy(data, dtype, device, requires_grad): # Should not be used, only here to be able to load Tensors serialized with older versions of pytorch _rebuild_xla_tensor = _rebuild_device_tensor_from_numpy -_rebuild_mlc_tensor = _rebuild_device_tensor_from_numpy def _rebuild_meta_tensor_no_storage(dtype, size, stride, requires_grad): return torch.empty_strided(size, stride, dtype=dtype, device='meta', requires_grad=requires_grad) +def _rebuild_wrapper_subclass(cls, dtype, size, stride, storage_offset, layout, device, requires_grad): + return torch.Tensor._make_wrapper_subclass( # type: ignore[attr-defined] + cls, size, strides=stride, storage_offset=storage_offset, layout=layout, + device=device, requires_grad=requires_grad) + + # TODO: Once we decide to break serialization FC, `storage` no longer needs to -# be a TypedStorage +# be a _TypedStorage def _rebuild_qtensor(storage, storage_offset, size, stride, quantizer_params, requires_grad, backward_hooks): qscheme = quantizer_params[0] if qscheme == torch.per_tensor_affine: diff --git a/torch/amp/__init__.py b/torch/amp/__init__.py new file mode 100644 index 000000000000..e4fe09f55632 --- /dev/null +++ b/torch/amp/__init__.py @@ -0,0 +1 @@ +from .autocast_mode import autocast diff --git a/torch/amp/autocast_mode.py b/torch/amp/autocast_mode.py new file mode 100644 index 000000000000..072be3b91859 --- /dev/null +++ b/torch/amp/autocast_mode.py @@ -0,0 +1,276 @@ +import torch +import functools +import warnings + +from typing import Any, Optional +from torch.types import _dtype + +def autocast_decorator(autocast_instance, func): + @functools.wraps(func) + def decorate_autocast(*args, **kwargs): + with autocast_instance: + return func(*args, **kwargs) + decorate_autocast.__script_unsupported = '@autocast() decorator is not supported in script mode' # type: ignore[attr-defined] + return decorate_autocast + +class autocast(object): + r""" + Instances of :class:`autocast` serve as context managers or decorators that + allow regions of your script to run in mixed precision. + + In these regions, ops run in an op-specific dtype chosen by autocast + to improve performance while maintaining accuracy. + See the :ref:`Autocast Op Reference` for details. + + When entering an autocast-enabled region, Tensors may be any type. + You should not call ``half()`` or ``bfloat16()`` on your model(s) or inputs when using autocasting. + + :class:`autocast` should wrap only the forward pass(es) of your network, including the loss + computation(s). Backward passes under autocast are not recommended. + Backward ops run in the same type that autocast used for corresponding forward ops. + + Example for CUDA Devices:: + + # Creates model and optimizer in default precision + model = Net().cuda() + optimizer = optim.SGD(model.parameters(), ...) 
+ + for input, target in data: + optimizer.zero_grad() + + # Enables autocasting for the forward pass (model + loss) + with autocast(): + output = model(input) + loss = loss_fn(output, target) + + # Exits the context manager before backward() + loss.backward() + optimizer.step() + + See the :ref:`CUDA Automatic Mixed Precision examples` for usage (along with gradient scaling) + in more complex scenarios (e.g., gradient penalty, multiple models/losses, custom autograd functions). + + :class:`autocast` can also be used as a decorator, e.g., on the ``forward`` method of your model:: + + class AutocastModel(nn.Module): + ... + @autocast() + def forward(self, input): + ... + + Floating-point Tensors produced in an autocast-enabled region may be ``float16``. + After returning to an autocast-disabled region, using them with floating-point + Tensors of different dtypes may cause type mismatch errors. If so, cast the Tensor(s) + produced in the autocast region back to ``float32`` (or other dtype if desired). + If a Tensor from the autocast region is already ``float32``, the cast is a no-op, + and incurs no additional overhead. + CUDA Example:: + + # Creates some tensors in default dtype (here assumed to be float32) + a_float32 = torch.rand((8, 8), device="cuda") + b_float32 = torch.rand((8, 8), device="cuda") + c_float32 = torch.rand((8, 8), device="cuda") + d_float32 = torch.rand((8, 8), device="cuda") + + with autocast(): + # torch.mm is on autocast's list of ops that should run in float16. + # Inputs are float32, but the op runs in float16 and produces float16 output. + # No manual casts are required. + e_float16 = torch.mm(a_float32, b_float32) + # Also handles mixed input types + f_float16 = torch.mm(d_float32, e_float16) + + # After exiting autocast, calls f_float16.float() to use with d_float32 + g_float32 = torch.mm(d_float32, f_float16.float()) + + CPU Training Example:: + + # Creates model and optimizer in default precision + model = Net() + optimizer = optim.SGD(model.parameters(), ...) + + for epoch in epochs: + for input, target in data: + optimizer.zero_grad() + + # Runs the forward pass with autocasting. + with torch.autocast(device_type="cpu", dtype=torch.bfloat16): + output = model(input) + loss = loss_fn(output, target) + + loss.backward() + optimizer.step() + + + CPU Inference Example:: + + # Creates model in default precision + model = Net().eval() + + with torch.autocast(device_type="cpu", dtype=torch.bfloat16): + for input in data: + # Runs the forward pass with autocasting. + output = model(input) + + CPU Inference Example with Jit Trace:: + + class TestModel(nn.Module): + def __init__(self, input_size, num_classes): + super(TestModel, self).__init__() + self.fc1 = nn.Linear(input_size, num_classes) + def forward(self, x): + return self.fc1(x) + + input_size = 2 + num_classes = 2 + model = TestModel(input_size, num_classes).eval() + + # For now, we suggest to disable the Jit Autocast Pass, + # As the issue: https://github.com/pytorch/pytorch/issues/75956 + torch._C._jit_set_autocast_mode(False) + + with torch.cpu.amp.autocast(cache_enabled=False): + model = torch.jit.trace(model, torch.randn(1, input_size)) + model = torch.jit.freeze(model) + # Models Run + for _ in range(3): + model(torch.randn(1, input_size)) + + Type mismatch errors *in* an autocast-enabled region are a bug; if this is what you observe, + please file an issue. + + ``autocast(enabled=False)`` subregions can be nested in autocast-enabled regions. 
+ Locally disabling autocast can be useful, for example, if you want to force a subregion + to run in a particular ``dtype``. Disabling autocast gives you explicit control over + the execution type. In the subregion, inputs from the surrounding region + should be cast to ``dtype`` before use:: + + # Creates some tensors in default dtype (here assumed to be float32) + a_float32 = torch.rand((8, 8), device="cuda") + b_float32 = torch.rand((8, 8), device="cuda") + c_float32 = torch.rand((8, 8), device="cuda") + d_float32 = torch.rand((8, 8), device="cuda") + + with autocast(): + e_float16 = torch.mm(a_float32, b_float32) + with autocast(enabled=False): + # Calls e_float16.float() to ensure float32 execution + # (necessary because e_float16 was created in an autocasted region) + f_float32 = torch.mm(c_float32, e_float16.float()) + + # No manual casts are required when re-entering the autocast-enabled region. + # torch.mm again runs in float16 and produces float16 output, regardless of input types. + g_float16 = torch.mm(d_float32, f_float32) + + The autocast state is thread-local. If you want it enabled in a new thread, the context manager or decorator + must be invoked in that thread. This affects :class:`torch.nn.DataParallel` and + :class:`torch.nn.parallel.DistributedDataParallel` when used with more than one GPU per process + (see :ref:`Working with Multiple GPUs`). + + Args: + device_type(string, required): Whether to use 'cuda' or 'cpu' device + enabled(bool, optional, default=True): Whether autocasting should be enabled in the region. + dtype(torch_dtype, optional): Whether to use torch.float16 or torch.bfloat16. + cache_enabled(bool, optional, default=True): Whether the weight cache inside autocast should be enabled. + """ + def __init__(self, device_type : str, + dtype : Optional[_dtype] = None, + enabled : bool = True, + cache_enabled : Optional[bool] = None): + if torch._jit_internal.is_scripting(): + self._enabled = enabled + self.device = device_type + self.fast_dtype = dtype + # TODO: support get_autocast_gpu/cpu_dtype + assert dtype is not None + return + self.device = device_type + if self.device == 'cuda': + self.fast_dtype = torch.get_autocast_gpu_dtype() + elif self.device == 'cpu': + self.fast_dtype = torch.get_autocast_cpu_dtype() + elif self.device == 'xpu': + self.fast_dtype = torch.xpu.get_autocast_xpu_dtype() # type: ignore[attr-defined] + else: + raise RuntimeError('User specified autocast device_type must be \'cuda\' or \'cpu\'') + self._cache_enabled = torch.is_autocast_cache_enabled() + if torch.cuda.amp.common.amp_definitely_not_available() and self.device == 'cuda': + warnings.warn('User provided device_type of \'cuda\', but CUDA is not available. Disabling') + enabled = False + if dtype is not None: + self.fast_dtype = dtype + if cache_enabled is not None: + self._cache_enabled = cache_enabled + + if self.device == 'cpu': + supported_dtype = [torch.bfloat16] + if self.fast_dtype not in supported_dtype: + error_message = 'In CPU autocast, but the target dtype is not supported. Disabling autocast.\n' + error_message += 'CPU Autocast only supports dtype of torch.bfloat16 currently.' + warnings.warn(error_message) + enabled = False + if self.device == 'xpu': + supported_dtype = [torch.bfloat16, torch.float16] + if self.fast_dtype not in supported_dtype: + error_message = 'In XPU autocast, but the target dtype is not supported. Disabling autocast.\n' + error_message += 'XPU Autocast only supports dtype of torch.bfloat16 currently.' 
+ warnings.warn(error_message) + enabled = False + if self.device == 'cuda': + if self.fast_dtype == torch.bfloat16 and not torch.cuda.is_bf16_supported(): + raise RuntimeError('Current CUDA Device does not support bfloat16. Please switch dtype to float16.') + self._enabled = enabled + + def __enter__(self): + if torch._jit_internal.is_scripting(): + assert self.fast_dtype is not None + return self + + self.prev_cache_enabled = torch.is_autocast_cache_enabled() + if self.device == 'cpu': + self.prev = torch.is_autocast_cpu_enabled() + self.prev_fastdtype = torch.get_autocast_cpu_dtype() + torch.set_autocast_cpu_enabled(self._enabled) + torch.set_autocast_cpu_dtype(self.fast_dtype) # type: ignore[arg-type] + torch.autocast_increment_nesting() + elif self.device == 'xpu': + self.prev = torch.xpu.is_autocast_xpu_enabled() # type: ignore[attr-defined] + self.prev_fastdtype = torch.xpu.get_autocast_xpu_dtype() # type: ignore[attr-defined] + torch.xpu.set_autocast_xpu_enabled(self._enabled) # type: ignore[attr-defined] + torch.xpu.set_autocast_xpu_dtype(self.fast_dtype) # type: ignore[attr-defined] + torch.autocast_increment_nesting() + else: + self.prev = torch.is_autocast_enabled() + self.prev_fastdtype = torch.get_autocast_gpu_dtype() + torch.set_autocast_gpu_dtype(self.fast_dtype) # type: ignore[arg-type] + torch.set_autocast_enabled(self._enabled) + torch.autocast_increment_nesting() + torch.set_autocast_cache_enabled(self._cache_enabled) + + def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any): # type: ignore[override] + if torch._jit_internal.is_scripting(): + return + + # Drop the cache when we exit to a nesting level that's outside any instance of autocast. + if self.device == 'cpu': + if torch.autocast_decrement_nesting() == 0: + torch.clear_autocast_cache() + torch.set_autocast_cpu_enabled(self.prev) + torch.set_autocast_cpu_dtype(self.prev_fastdtype) + elif self.device == 'xpu': + if torch.autocast_decrement_nesting() == 0: + torch.clear_autocast_cache() + torch.xpu.set_autocast_xpu_enabled(self.prev) # type: ignore[attr-defined] + torch.xpu.set_autocast_xpu_dtype(self.prev_fastdtype) # type: ignore[attr-defined] + else: + if torch.autocast_decrement_nesting() == 0: + torch.clear_autocast_cache() + torch.set_autocast_enabled(self.prev) + torch.set_autocast_gpu_dtype(self.prev_fastdtype) + torch.set_autocast_cache_enabled(self.prev_cache_enabled) + return False + + def __call__(self, func): + if torch._jit_internal.is_scripting(): + return func + return autocast_decorator(self, func) diff --git a/torch/ao/nn/sparse/quantized/linear.py b/torch/ao/nn/sparse/quantized/linear.py index dde8cd2563a8..c57122fbf411 100644 --- a/torch/ao/nn/sparse/quantized/linear.py +++ b/torch/ao/nn/sparse/quantized/linear.py @@ -169,14 +169,14 @@ def from_float(cls, mod): assert hasattr(mod, 'sparse_params'), \ ('Expecting the Linear to have `sparse_params`. Make sure you have provided arguments ' 'in the `sparsifier.squash_mask(params_to_save=("sparse_block_shape",))` method.') - sparse_block_shape = mod.sparse_params.get('sparse_block_shape', None) + sparse_block_shape = mod.sparse_params.get('sparse_block_shape', None) # type: ignore[operator, union-attr] assert isinstance(sparse_block_shape, (tuple, list)) assert len(sparse_block_shape) == 2 # TODO: Need to add options to qconfig to avoid the calibration. 
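Returning to the ``__enter__``/``__exit__`` bookkeeping in ``torch.amp.autocast`` above: the previous enabled flag, dtype and cache setting are saved on entry and restored on exit, so regions nest cleanly. A minimal sketch on CPU, assuming nothing else has enabled autocast in the current thread::

    import torch

    assert not torch.is_autocast_cpu_enabled()
    with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
        assert torch.is_autocast_cpu_enabled()
        with torch.autocast(device_type="cpu", enabled=False):
            assert not torch.is_autocast_cpu_enabled()  # locally disabled
        assert torch.is_autocast_cpu_enabled()           # outer state restored
    assert not torch.is_autocast_cpu_enabled()           # back to the prior state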
# TODO: Add calibration for the sparsity assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined' activation_post_process = mod.activation_post_process - weight_post_process = mod.qconfig.weight() + weight_post_process = mod.qconfig.weight() # type: ignore[operator, union-attr] # Assumption is that the weight is already sparsified by the # `sparsifier.convert` @@ -184,7 +184,7 @@ def from_float(cls, mod): weight_post_process(weight) dtype = weight_post_process.dtype - act_scale, act_zp = activation_post_process.calculate_qparams() + act_scale, act_zp = activation_post_process.calculate_qparams() # type: ignore[operator, union-attr] assert dtype == torch.qint8, 'Weight observer must have dtype torch.qint8' w_sc, w_zp = weight_post_process.calculate_qparams() if isinstance(w_zp, torch.Tensor): @@ -193,15 +193,15 @@ def from_float(cls, mod): assert w_zp == 0, 'Weight zero point must map to 0' qweight = _quantize_weight(weight.float(), weight_post_process) - row_block_size = mod.sparse_params['sparse_block_shape'][0] - col_block_size = mod.sparse_params['sparse_block_shape'][1] + row_block_size = mod.sparse_params['sparse_block_shape'][0] # type: ignore[index] + col_block_size = mod.sparse_params['sparse_block_shape'][1] # type: ignore[index] qlinear = cls(mod.in_features, mod.out_features, row_block_size, col_block_size, dtype=dtype) qlinear.set_weight_bias(qweight, mod.bias, - row_block_size, col_block_size) + row_block_size, col_block_size) # type: ignore[arg-type] qlinear.scale = float(act_scale) qlinear.zero_point = int(act_zp) return qlinear diff --git a/torch/ao/ns/_numeric_suite.py b/torch/ao/ns/_numeric_suite.py index 2db70b87a56a..2a54535678b2 100644 --- a/torch/ao/ns/_numeric_suite.py +++ b/torch/ao/ns/_numeric_suite.py @@ -436,6 +436,8 @@ def get_matching_activations( quantized_dict = get_logger_dict(q_module) act_dict: Dict[str, Dict] = {} for key in quantized_dict: + if len(quantized_dict[key]["tensor_val"]) == 0: + continue match_key = _find_match(sorted(float_dict, reverse=True), key, "stats") if match_key is not None: act_dict[key] = {} diff --git a/torch/ao/ns/_numeric_suite_fx.py b/torch/ao/ns/_numeric_suite_fx.py index 2e65fbec48f4..116b46240105 100644 --- a/torch/ao/ns/_numeric_suite_fx.py +++ b/torch/ao/ns/_numeric_suite_fx.py @@ -131,6 +131,9 @@ class OutputLogger(nn.Module): stats: List[torch.Tensor] stats_rnn: List[RNNReturnType] + # Mark as impure so that calls to it will not be removed during DCE. 
+ _is_impure = True + def __init__( self, ref_node_name: str, diff --git a/torch/ao/ns/fx/graph_passes.py b/torch/ao/ns/fx/graph_passes.py index bcebadc95d5c..23e235c891db 100644 --- a/torch/ao/ns/fx/graph_passes.py +++ b/torch/ao/ns/fx/graph_passes.py @@ -12,6 +12,7 @@ get_target_type_str, get_arg_indices_of_inputs_to_log, get_node_input_qparams, + op_type_supports_shadowing, ) from .ns_types import ( @@ -220,6 +221,8 @@ def _insert_dtype_cast_after_node( """ dtype_cast_op = None dtype_cast_mod_cls = None + dtype_cast_method = None + dtype_cast_method_dtype = None dtype_cast_scale = None dtype_cast_zero_point = None node_input_type_a, _node_output_type_a = \ @@ -257,6 +260,12 @@ def _insert_dtype_cast_after_node( if node_a_input_qparams is not None: dtype_cast_op = torch.quantize_per_tensor # type: ignore[assignment] dtype_cast_scale, dtype_cast_zero_point = node_a_input_qparams + elif ( + node_input_type_a == NodeInputOrOutputType.FP16 and + node_input_type_c == NodeInputOrOutputType.FP32 + ): + dtype_cast_method = 'to' + dtype_cast_method_dtype = torch.float16 else: raise AssertionError( f"dtype cast from {node_input_type_c} {node_c.format_node()} to " + @@ -274,6 +283,10 @@ def _insert_dtype_cast_after_node( return graph_c.create_node( 'call_function', dtype_cast_op, (prev_node_c,), {}, new_dtype_cast_name) + elif dtype_cast_method: + return graph_c.create_node( + 'call_method', dtype_cast_method, + (prev_node_c, dtype_cast_method_dtype), {}, new_dtype_cast_name) else: assert dtype_cast_mod_cls dtype_cast_mod = dtype_cast_mod_cls() @@ -345,7 +358,54 @@ def _copy_node_from_a_to_c( else: raise AssertionError( - f"handling of node with op {node_a.op} is not implemented") + f"handling of node {node_a.format_node()} with op {node_a.op} is not implemented") + +def _can_insert_copy_of_subgraph_a( + subgraph_a: NSSubgraph, + gm_a: GraphModule, + num_non_param_args_node_a: int, +) -> bool: + """ + This function returns `False` if the input subgraph cannot be copied by + `_insert_copy_of_subgraph_a_after_input_node_c`. This usually means + that there is a corner case logic for which copy is not yet implemented. + """ + # populate the list of nodes we need to check + nodes = [] + cur_node = subgraph_a.end_node + while cur_node != subgraph_a.start_node: + nodes.append(cur_node) + cur_node = cur_node.args[0] # type: ignore[assignment] + nodes.append(cur_node) + nodes.reverse() + + def _can_insert(node_a_arg, gm_a): + if isinstance(node_a_arg, Node): + arg_a = return_first_non_observer_node(node_a_arg, gm_a) + if arg_a.op == 'call_method': + return arg_a.target in ('dequantize', 'to') + elif arg_a.op == 'get_attr': + return True + else: + return False + elif isinstance(node_a_arg, (list, tuple)): + for el in node_a_arg: + if not isinstance(el, Node): + return False + return True + + # For each node, check if we handle the copy behavior. This follows the + # logic in `_insert_copy_of_subgraph_a_after_input_node_c`. 
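Referring back to the ``_is_impure = True`` flag added to ``OutputLogger`` above: ``torch.fx`` dead code elimination removes ``call_module`` nodes whose results are unused unless the target module is marked impure. A minimal sketch of that mechanism; ``SideLogger``, ``KeepLeafTracer`` and ``M`` are hypothetical names, not part of the numeric suite::

    import torch
    import torch.fx

    class SideLogger(torch.nn.Module):
        _is_impure = True  # asks Graph.eliminate_dead_code() to keep calls to this module

        def forward(self, x):
            return x

    class M(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.logger = SideLogger()

        def forward(self, x):
            self.logger(x)  # result unused: would normally be dead code
            return x + 1

    class KeepLeafTracer(torch.fx.Tracer):
        def is_leaf_module(self, m, module_qualified_name):
            # keep the logger as a call_module node instead of tracing through it
            return isinstance(m, SideLogger) or super().is_leaf_module(m, module_qualified_name)

    root = M()
    gm = torch.fx.GraphModule(root, KeepLeafTracer().trace(root))
    gm.graph.eliminate_dead_code()
    assert any(n.op == "call_module" and n.target == "logger" for n in gm.graph.nodes)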
+ for node_a_arg in nodes[0].args[num_non_param_args_node_a:]: + if not _can_insert(node_a_arg, gm_a): + return False + + for node in nodes[1:]: + for node_a_arg in node.args[1:]: + if not _can_insert(node_a_arg, gm_a): + return False + + return True def _insert_copy_of_subgraph_a_after_input_node_c( input_node_c: Union[Node, List[Node]], @@ -464,7 +524,7 @@ def _insert_copy_of_node_a_after_input_node_c( arg_a = return_first_non_observer_node(node_a_arg, gm_a) node_a_arg_copy = _copy_node_from_a_to_c(arg_a, gm_a, gm_b, graph_c) new_args.append(node_a_arg_copy) - elif isinstance(node_a_arg, (int, float)): + elif isinstance(node_a_arg, (int, float, torch.dtype)): new_args.append(node_a_arg) elif isinstance(node_a_arg, (list, tuple)): for el in node_a_arg: @@ -589,6 +649,26 @@ def load_arg(a): subgraph_a, ref_name, ref_node_type_a, ref_node_type_b = \ end_node_b_to_matched_subgraph_a_and_name[node_b] + if len(node_b.args) == 0: + print( + f'skipping shadow loggers for node_b: {get_target_type_str(node_b, gm_b)}' + + f', start_node_a: {get_target_type_str(subgraph_a.start_node, gm_a)}' + + ', kwargs-only node not handled yet') + env_c[node_b.name] = graph_c.node_copy(node_b, load_arg) + continue + + all_op_types_support_shadowing = ( + op_type_supports_shadowing(subgraph_a.start_node) and + op_type_supports_shadowing(node_b) + ) + if not all_op_types_support_shadowing: + print( + f'skipping shadow loggers for node_b: {get_target_type_str(node_b, gm_b)}' + + f', start_node_a: {get_target_type_str(subgraph_a.start_node, gm_a)}' + + ', unsupported') + env_c[node_b.name] = graph_c.node_copy(node_b, load_arg) + continue + # For both start_node and end_node verify that we know how to do # the dtype cast. If we do not, skip. node_input_type_a, node_output_type_a = \ @@ -630,6 +710,16 @@ def load_arg(a): env_c[node_b.name] = graph_c.node_copy(node_b, load_arg) continue + num_non_param_args_node_a = \ + get_number_of_non_param_args(subgraph_a.start_node, gm_a) + if not _can_insert_copy_of_subgraph_a(subgraph_a, gm_a, num_non_param_args_node_a): + print( + f'skipping shadow loggers for node_b: {get_target_type_str(node_b, gm_b)}' + + f', start_node_a: {get_target_type_str(subgraph_a.start_node, gm_a)}' + + ', unhandled logic in subgraph copy') + env_c[node_b.name] = graph_c.node_copy(node_b, load_arg) + continue + fqn_base_a = _maybe_get_fqn(subgraph_a.base_op_node, gm_a) fqn_base_b = _maybe_get_fqn(subgraph_b.base_op_node, gm_b) diff --git a/torch/ao/ns/fx/mappings.py b/torch/ao/ns/fx/mappings.py index d27c5d165ad8..fc53a24fc53c 100644 --- a/torch/ao/ns/fx/mappings.py +++ b/torch/ao/ns/fx/mappings.py @@ -13,6 +13,10 @@ import torch.nn.intrinsic as nni import torch.nn.qat as nnqat import torch.nn.qat.dynamic as nnqatd +from torch.ao.quantization.backend_config import get_native_backend_config_dict +import torch.ao.quantization.fx._lower_to_native_backend as \ + _lower_to_native_backend +import torch.ao.quantization.quantization_mappings as quantization_mappings from .ns_types import NSNodeTargetType @@ -20,73 +24,35 @@ def get_base_name_to_sets_of_related_ops() -> Dict[str, Set[NSNodeTargetType]]: + # note: this set is modified below by items from backend_config_dict sets_of_related_ops: List[Set[NSNodeTargetType]] = [ # conv modules set([ nn.Conv1d, - nnq.Conv1d, - nnqd.Conv1d, - nniqat.ConvBn1d, - nniqat.ConvBnReLU1d, - nniq.ConvReLU1d, - nni.ConvReLU1d, ]), set([ nn.Conv2d, - nnq.Conv2d, - nnqd.Conv2d, - nnqat.Conv2d, - nniqat.ConvBn2d, - nniqat.ConvBnReLU2d, - nniqat.ConvReLU2d, - nniq.ConvReLU2d, - 
nni.ConvReLU2d, ]), set([ nn.Conv3d, - nnq.Conv3d, - nnqd.Conv3d, - nnqat.Conv3d, - nniqat.ConvBn3d, - nniqat.ConvBnReLU3d, - nniqat.ConvReLU3d, - nniq.ConvReLU3d, - nni.ConvReLU3d, ]), # conv functionals set([ F.conv1d, - toq.conv1d, - toq.conv1d_relu, ]), set([ F.conv2d, - toq.conv2d, - toq.conv2d_relu, ]), set([ F.conv3d, - toq.conv3d, - toq.conv3d_relu, ]), # linear modules set([ nn.Linear, - nnq.Linear, - nni.LinearReLU, - nniq.LinearReLU, - nniqd.LinearReLU, - nnqat.Linear, - nnqatd.Linear, - nnqd.Linear, - nniqat.LinearReLU, - nn.modules.linear.NonDynamicallyQuantizableLinear, ]), # linear functionals set([ F.linear, - toq.linear, - toq.linear_relu, ]), # average pool set([ @@ -117,26 +83,20 @@ def get_base_name_to_sets_of_related_ops() -> Dict[str, Set[NSNodeTargetType]]: # LSTM set([ nn.LSTM, - nnqd.LSTM, ]), # add set([ torch.add, - toq.add, operator.add, # x + y - toq.add_relu, ]), # cat set([ torch.cat, - toq.cat, ]), # mul set([ torch.mul, - toq.mul, operator.mul, - toq.mul_relu, ]), # relu set([ @@ -170,121 +130,82 @@ def get_base_name_to_sets_of_related_ops() -> Dict[str, Set[NSNodeTargetType]]: # BatchNorm set([ nn.BatchNorm2d, - nnq.BatchNorm2d, ]), set([ nn.BatchNorm3d, - nnq.BatchNorm3d, ]), # ConvTranspose set([ nn.ConvTranspose1d, - nnq.ConvTranspose1d, - nnqd.ConvTranspose1d, ]), set([ nn.ConvTranspose2d, - nnq.ConvTranspose2d, - nnqd.ConvTranspose2d, ]), set([ nn.ConvTranspose3d, - nnq.ConvTranspose3d, - nnqd.ConvTranspose3d, - ]), - set([ - nn.ConvTranspose3d, - nnq.ConvTranspose3d, ]), # ELU set([ nn.ELU, - nnq.ELU, ]), # Embedding set([ nn.Embedding, - nnq.Embedding, - nnqat.Embedding, ]), # EmbeddingBag set([ nn.EmbeddingBag, - nnq.EmbeddingBag, - nnqat.EmbeddingBag, ]), # GroupNorm set([ nn.GroupNorm, - nnq.GroupNorm, ]), # Hardswish set([ nn.Hardswish, - nnq.Hardswish, ]), # InstanceNorm set([ nn.InstanceNorm1d, - nnq.InstanceNorm1d, ]), set([ nn.InstanceNorm2d, - nnq.InstanceNorm2d, ]), set([ nn.InstanceNorm3d, - nnq.InstanceNorm3d, ]), # LayerNorm set([ nn.LayerNorm, - nnq.LayerNorm, ]), # LeakyReLU set([ nn.LeakyReLU, - nnq.LeakyReLU, ]), # ReLU6 set([ nn.ReLU6, F.relu6, - nnq.ReLU6, - ]), - # BNReLU2d - set([ - nni.BNReLU2d, - nniq.BNReLU2d, - ]), - set([ - nni.BNReLU3d, - nniq.BNReLU3d, ]), # F.elu set([ F.elu, - toq.elu, ]), # F.hardswish set([ F.hardswish, - toq.hardswish, ]), # F.instance_norm set([ F.instance_norm, - toq.instance_norm, ]), # F.layer_norm set([ F.layer_norm, - toq.layer_norm, ]), # F.leaky_relu set([ F.leaky_relu, - toq.leaky_relu, ]), # F.silu set([ @@ -376,20 +297,116 @@ def get_base_name_to_sets_of_related_ops() -> Dict[str, Set[NSNodeTargetType]]: # dropout set([ nn.Dropout, - nnq.Dropout, ]), # F.dropout set([ F.dropout, - toq.dropout, ]), # matmul set([ torch.matmul, - toq.matmul, ]), + # Softmax + set([ + nn.Softmax, + ]), + ] + + # for each floating point op, add versions of the op added by + # backend_config_dict + backend_config_dict = get_native_backend_config_dict() + + new_connections = [ + # technical debt edge case + (nn.Linear, nn.modules.linear.NonDynamicallyQuantizableLinear), ] + for config in backend_config_dict['configs']: + + if 'pattern' not in config: + continue + + # format: (c, (b, a)) + pattern = config['pattern'] + first_element = pattern + # look from the end, because pattern is in reverse order + while isinstance(first_element, (list, tuple)): + first_element = first_element[-1] + + if 'fused_module' in config: + # case 1: pattern fuses a pattern of ops into an op + # example: nn.Conv1d, nn.ReLU fused into 
nni.ConvReLU1d + new_connections.append((first_element, config['fused_module'])) + + if 'qat_module' in config: + # case 2: pattern swaps a module into a QAT module + # example: nni.ConvReLU1d swapped into nniqat.ConvReLU1d + new_connections.append((first_element, config['qat_module'])) + + if 'reference_quantized_module_for_root' in config: + # case 3: reference version of floating point module, such as + # nn.Conv2d and nnqr.Conv2d + new_connections.append( + (first_element, config['reference_quantized_module_for_root']) + ) + + # + # Add reference module swaps from default lowering path + # + + for source_to_target in ( + _lower_to_native_backend.STATIC_LOWER_MODULE_MAP, + _lower_to_native_backend.DYNAMIC_LOWER_MODULE_MAP, + _lower_to_native_backend.WEIGHT_ONLY_LOWER_MODULE_MAP, + _lower_to_native_backend.SPECIAL_PATTERN_LOWER_MODULE_MAP, + ): + for source, target in source_to_target.items(): # type: ignore[attr-defined] + new_connections.append((source, target)) + + for source_to_double_target in ( + _lower_to_native_backend.STATIC_LOWER_FUSED_MODULE_MAP, + _lower_to_native_backend.DYNAMIC_LOWER_FUSED_MODULE_MAP, + ): + for source, (target1, target2) in source_to_double_target.items(): # type: ignore[attr-defined] + new_connections.append((source, target1)) + new_connections.append((source, target2)) + + # + # Add function swaps from default lowering path + # + + for source, (target1, target2) in \ + _lower_to_native_backend.STATIC_LOWER_FUNCTIONAL_MAP.items(): + new_connections.append((source, target1)) + new_connections.append((source, target2)) + + for source_to_target in ( + _lower_to_native_backend.QBIN_OP_MAPPING, + _lower_to_native_backend.QBIN_RELU_OP_MAPPING, + quantization_mappings.DEFAULT_FLOAT_TO_QUANTIZED_OPERATOR_MAPPINGS, + ): + for source, target in source_to_target.items(): + new_connections.append((source, target)) + + # + # Add other swaps, ideally in the future this could be removed + # after the lowering code stops using these. 
+ # + for source_to_target in ( + quantization_mappings.DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS, + ): + for source, target in source_to_target.items(): + new_connections.append((source, target)) + + + # add the new connections from backend_config_dict + for item1, item2 in new_connections: + for set_of_related_ops in sets_of_related_ops: + if item1 in set_of_related_ops or item2 in set_of_related_ops: + set_of_related_ops.add(item1) + set_of_related_ops.add(item2) + break + base_name_to_sets_of_related_ops: Dict[str, Set[NSNodeTargetType]] = {} counter = 0 @@ -446,10 +463,10 @@ def get_node_type_to_io_type_map() -> Dict[str, Set[NSNodeTargetType]]: F.dropout, F.silu, F.mish, - # TODO(future PR): implement shadowing for binary ops and - # uncomment below - # operator.add, - # operator.mul, + operator.add, + torch.add, + operator.mul, + torch.mul, torch.sum, ]) @@ -512,6 +529,7 @@ def get_node_type_to_io_type_map() -> Dict[str, Set[NSNodeTargetType]]: torch.squeeze, torch.stack, torch.unsqueeze, + operator.add, ]) MODS_IO_TYPE_FP32: Set[NSNodeTargetType] = set([ @@ -523,9 +541,7 @@ def get_node_type_to_io_type_map() -> Dict[str, Set[NSNodeTargetType]]: nn.Conv1d, nn.Conv2d, nn.Conv3d, - nnqd.Conv1d, - nnqd.Conv2d, - nnqd.Conv3d, + nnqat.Conv1d, nnqat.Conv2d, nnqat.Conv3d, nnqat.Embedding, @@ -540,9 +556,6 @@ def get_node_type_to_io_type_map() -> Dict[str, Set[NSNodeTargetType]]: nn.ConvTranspose1d, nn.ConvTranspose2d, nn.ConvTranspose3d, - nnqd.ConvTranspose1d, - nnqd.ConvTranspose2d, - nnqd.ConvTranspose3d, nn.ELU, nn.GroupNorm, nn.InstanceNorm1d, @@ -554,12 +567,14 @@ def get_node_type_to_io_type_map() -> Dict[str, Set[NSNodeTargetType]]: nn.ReLU6, nn.SiLU, nn.Mish, + nn.Softmax, nni.BNReLU2d, nni.BNReLU3d, nni.ConvReLU1d, nni.ConvReLU2d, nni.ConvReLU3d, nni.LinearReLU, + nni.LinearBn1d, nni.ConvBn1d, nni.ConvBn2d, nni.ConvBn3d, @@ -569,9 +584,11 @@ def get_node_type_to_io_type_map() -> Dict[str, Set[NSNodeTargetType]]: nniqat.ConvBnReLU1d, nniqat.ConvBnReLU2d, nniqat.ConvBnReLU3d, + nniqat.ConvReLU1d, nniqat.ConvReLU2d, nniqat.ConvReLU3d, nniqat.LinearReLU, + nniqat.LinearBn1d, nniqd.LinearReLU, ]) @@ -579,26 +596,23 @@ def get_node_type_to_io_type_map() -> Dict[str, Set[NSNodeTargetType]]: nnq.Linear, nnq.Conv1d, nnq.Conv2d, - nniq.ConvReLU2d, nnq.Conv3d, nnq.BatchNorm2d, nnq.BatchNorm3d, nnq.Dropout, nnq.ConvTranspose1d, nnq.ConvTranspose2d, - nnq.ConvTranspose3d, nnq.ELU, - nnq.GroupNorm, nnq.InstanceNorm1d, nnq.InstanceNorm2d, nnq.InstanceNorm3d, nnq.LayerNorm, nnq.Hardswish, nnq.LeakyReLU, - nnq.ReLU6, nnq.Embedding, nnq.EmbeddingBag, nnq.Dropout, + nnq.Softmax, nniq.BNReLU2d, nniq.BNReLU3d, nniq.ConvReLU1d, diff --git a/torch/ao/ns/fx/pattern_utils.py b/torch/ao/ns/fx/pattern_utils.py index b0adb5faf95d..b8e6a0ee4dc1 100644 --- a/torch/ao/ns/fx/pattern_utils.py +++ b/torch/ao/ns/fx/pattern_utils.py @@ -8,7 +8,7 @@ from torch.ao.quantization.utils import getattr_from_fqn from .ns_types import NSNodeTargetType -from torch.ao.quantization.fx.pattern_utils import get_default_quant_patterns +from torch.ao.quantization.fx.backend_config_utils import get_native_quant_patterns from torch.ao.quantization import ( ObserverBase, FakeQuantizeBase, @@ -66,9 +66,18 @@ def get_reversed_fusions() -> List[Tuple[NSFusionType, int]]: # * multiple ops: (torch.nn.ReLU, torch.nn.Conv2d) # For fusions, we only care about patterns composed of multiple ops. # TODO(future PR): allow customizations from default patterns. 
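As a small aside on the merge step above: each ``(item1, item2)`` pair from ``new_connections`` is folded into the first existing set of related ops that already contains either element. A self-contained sketch with illustrative strings standing in for the real op types::

    sets_of_related_ops = [{"nn.Conv2d"}, {"nn.Linear"}]
    new_connections = [("nn.Conv2d", "nnq.Conv2d"), ("nn.Linear", "nnqat.Linear")]

    for item1, item2 in new_connections:
        for set_of_related_ops in sets_of_related_ops:
            if item1 in set_of_related_ops or item2 in set_of_related_ops:
                set_of_related_ops.update((item1, item2))
                break

    assert sets_of_related_ops == [
        {"nn.Conv2d", "nnq.Conv2d"},
        {"nn.Linear", "nnqat.Linear"},
    ]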
- all_quant_patterns = get_default_quant_patterns() + all_quant_patterns = get_native_quant_patterns() + default_base_op_idx = 0 for quant_pattern, _quant_handler in all_quant_patterns.items(): + # TODO: this is a temporary hack to flatten the patterns from quantization so + # that it works with the ns matcher function, maybe we should use `is_match` + # in torch.ao.quantization.fx.match_utils to match the patterns + if isinstance(quant_pattern, tuple) and len(quant_pattern) == 2 and \ + isinstance(quant_pattern[1], tuple) and len(quant_pattern[1]) == 2: + # flatten the pattern with form (nn.ReLU, (nn.BatchNorm2d, nn.Conv2d)) + quant_pattern = (quant_pattern[0], quant_pattern[1][0], quant_pattern[1][1]) + # Only patterns of multiple ops are fusions, ignore # patterns which contain a single ops (they get matched # without caring about fusions). diff --git a/torch/ao/ns/fx/utils.py b/torch/ao/ns/fx/utils.py index 96a57c438e27..8f1f277aa8c4 100644 --- a/torch/ao/ns/fx/utils.py +++ b/torch/ao/ns/fx/utils.py @@ -60,10 +60,15 @@ def get_node_first_input_and_output_type( elif node.target in FUNS_IO_TYPE_INT8: return (NodeInputOrOutputType.INT8, NodeInputOrOutputType.INT8) elif node.target in FUNS_IO_TYPE_FP32_OR_INT8: - return ( - NodeInputOrOutputType.FP32_OR_INT8, - NodeInputOrOutputType.FP32_OR_INT8, + first_arg = node.args[0] + assert isinstance(first_arg, Node) + ( + _prev_node_input_type, + prev_node_output_type, + ) = get_node_first_input_and_output_type( + first_arg, gm, logger_cls, node_type_to_io_type_map ) + return (prev_node_output_type, prev_node_output_type) else: return (NodeInputOrOutputType.UNKNOWN, NodeInputOrOutputType.UNKNOWN) @@ -71,7 +76,13 @@ def get_node_first_input_and_output_type( assert node.op == "call_module" assert isinstance(node.target, str) mod = getattr_from_fqn(gm, node.target) - if isinstance(mod, (logger_cls, ObserverBase, FakeQuantizeBase)): # type: ignore[arg-type] + is_known_fp32_or_int8_input_module = any( + isinstance(mod, target_type) for target_type in MODS_IO_TYPE_FP32_OR_INT8 # type: ignore[arg-type] + ) + if ( + isinstance(mod, (logger_cls, ObserverBase, FakeQuantizeBase)) # type: ignore[arg-type] + or is_known_fp32_or_int8_input_module + ): # A logger or observer's input and output type is the output # type of the preceding node. 
first_arg = node.args[0] @@ -89,18 +100,10 @@ def get_node_first_input_and_output_type( is_known_int8_input_module = any( isinstance(mod, target_type) for target_type in MODS_IO_TYPE_INT8 # type: ignore[arg-type] ) - is_known_fp32_or_int8_input_module = any( - isinstance(mod, target_type) for target_type in MODS_IO_TYPE_FP32_OR_INT8 # type: ignore[arg-type] - ) if is_known_fp32_input_module: return (NodeInputOrOutputType.FP32, NodeInputOrOutputType.FP32) elif is_known_int8_input_module: return (NodeInputOrOutputType.INT8, NodeInputOrOutputType.INT8) - elif is_known_fp32_or_int8_input_module: - return ( - NodeInputOrOutputType.FP32_OR_INT8, - NodeInputOrOutputType.FP32_OR_INT8, - ) else: return (NodeInputOrOutputType.UNKNOWN, NodeInputOrOutputType.UNKNOWN) @@ -141,10 +144,15 @@ def get_node_first_input_and_output_type( return (prev_node_output_type, NodeInputOrOutputType.FP16) elif node.target in METHS_IO_TYPE_FP32_OR_INT8: - return ( - NodeInputOrOutputType.FP32_OR_INT8, - NodeInputOrOutputType.FP32_OR_INT8, + first_arg = node.args[0] + assert isinstance(first_arg, Node) + ( + _prev_node_input_type, + prev_node_output_type, + ) = get_node_first_input_and_output_type( + first_arg, gm, logger_cls, node_type_to_io_type_map ) + return (prev_node_output_type, prev_node_output_type) return (NodeInputOrOutputType.UNKNOWN, NodeInputOrOutputType.UNKNOWN) else: @@ -481,3 +489,10 @@ def compute_cosine_similarity(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: x = x.reshape(1, -1) y = y.reshape(1, -1) return torch.nn.functional.cosine_similarity(x, y) + +def op_type_supports_shadowing(node: Node) -> bool: + if node.op == 'call_function': + if node.target in (torch.add, torch.mul, operator.add, operator.mul, torch.cat, torch.stack): + # shadowing for ops with multiple tensor inputs is not implemented yet + return False + return True diff --git a/torch/ao/ns/fx/weight_utils.py b/torch/ao/ns/fx/weight_utils.py index 36e183efe1d8..2020593ddbfb 100644 --- a/torch/ao/ns/fx/weight_utils.py +++ b/torch/ao/ns/fx/weight_utils.py @@ -158,23 +158,27 @@ def get_op_to_type_to_weight_extraction_fn() -> Dict[str, Dict[Callable, Callabl op_to_type_to_weight_extraction_fn: Dict[str, Dict[Callable, Callable]] = { 'call_module': { - # Conv + # Conv1d nn.Conv1d: mod_weight_detach, - nn.Conv2d: mod_weight_detach, - nn.Conv3d: mod_weight_detach, nni.ConvReLU1d: mod_0_weight_detach, - nni.ConvReLU2d: mod_0_weight_detach, - nni.ConvReLU3d: mod_0_weight_detach, nnq.Conv1d: mod_weight_bias_0, + nnqat.Conv1d: mod_weight_detach, nniqat.ConvBn1d: mod_weight_detach, nniqat.ConvBnReLU1d: mod_weight_detach, + nniqat.ConvReLU1d: mod_weight_detach, nniq.ConvReLU1d: mod_weight_bias_0, + # Conv2d + nn.Conv2d: mod_weight_detach, + nni.ConvReLU2d: mod_0_weight_detach, nnq.Conv2d: mod_weight_bias_0, nnqat.Conv2d: mod_weight_detach, nniqat.ConvBn2d: mod_weight_detach, nniqat.ConvBnReLU2d: mod_weight_detach, nniqat.ConvReLU2d: mod_weight_detach, nniq.ConvReLU2d: mod_weight_bias_0, + # Conv3d + nn.Conv3d: mod_weight_detach, + nni.ConvReLU3d: mod_0_weight_detach, nnq.Conv3d: mod_weight_bias_0, nnqat.Conv3d: mod_weight_detach, nniqat.ConvBn3d: mod_weight_detach, @@ -189,6 +193,7 @@ def get_op_to_type_to_weight_extraction_fn() -> Dict[str, Dict[Callable, Callabl nnqat.Linear: mod_weight_detach, nnqd.Linear: mod_weight_bias_0, nniqat.LinearReLU: mod_weight_detach, + nniqat.LinearBn1d: mod_weight_detach, nn.modules.linear.NonDynamicallyQuantizableLinear: mod_weight_detach, # LSTM nn.LSTM: get_lstm_weight, diff --git 
a/torch/ao/quantization/_dbr/auto_trace.py b/torch/ao/quantization/_dbr/auto_trace.py index 86893a1ef4b8..c786c8628a7f 100644 --- a/torch/ao/quantization/_dbr/auto_trace.py +++ b/torch/ao/quantization/_dbr/auto_trace.py @@ -14,6 +14,8 @@ get_torch_function_hook_type, get_module_hook_type, OpQuantizeabilityType, + AutoQuantizationStateModuleDict, + get_fqn_valid_for_module_dict_key, ) from .model_utils import ( pack_weights_for_functionals, @@ -350,6 +352,8 @@ def _patched_module_call(self, *args, **kwargs): for _, child_child in child.named_modules(): leaves.add(child_child) + self._fqn_to_auto_quant_state_map = AutoQuantizationStateModuleDict() + for fqn, v in named_modules: # fqn is the global FQN, i.e. 'foo.bar.baz' @@ -366,14 +370,39 @@ def _patched_module_call(self, *args, **kwargs): if v is self: # for the top level module only, specify input # and output dtypes - v._auto_quant_state = AutoQuantizationState( + auto_quant_state = AutoQuantizationState( qconfig_dict, fqn, input_dtypes, output_dtypes) - pass else: - v._auto_quant_state = AutoQuantizationState( + auto_quant_state = AutoQuantizationState( qconfig_dict, fqn) + # The code below registers the auto_quant_state object + # of the child in the module hierarchy of the parent, + # and adds the auto_quant_state object to the child + # with a raw __setattr__, without registering it in + # the module hierarchy of the child. + # This is solving the problem of both storing extra state + # (observers) as well as not modifying the meaning of user + # code in child modules which iterates over all module + # children. + # + # This narrows down the issue of dynamically adding + # children to only affect the top level module and not + # the children. + + # On the parent, register this module in the FQN map + fqn_to_use_for_key = \ + get_fqn_valid_for_module_dict_key(fqn) + self._fqn_to_auto_quant_state_map[fqn_to_use_for_key] = \ + auto_quant_state + # On the child, manually set the attribute without + # going through the `torch.nn.Module.__setattr__` + # function, to prevent this object from appearing in + # the child's module hierarchy. + object.__setattr__( + v, '_auto_quant_state', auto_quant_state) + global_op_idx[0] = 0 output = super().__call__(*new_args, **new_kwargs) @@ -688,6 +717,6 @@ def rewrite_for_scripting(self): # checking the fix into `torch.nn.Sequential` to avoid the patch. 
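Back to the ``_auto_quant_state`` attachment strategy described above: the state is registered in the parent's ``AutoQuantizationStateModuleDict``, while the child receives it through a raw ``object.__setattr__`` so it never appears in the child's own module hierarchy. A minimal sketch of why that works, with a plain ``nn.Module`` standing in for the real quantization state::

    import torch.nn as nn

    child = nn.Linear(4, 4)
    state = nn.Module()  # stand-in for AutoQuantizationState

    # nn.Module.__setattr__ would register `state` under child._modules and make it
    # visible to named_children()/named_modules(); bypassing it stores the reference
    # in child.__dict__ only.
    object.__setattr__(child, "_auto_quant_state", state)

    assert child._auto_quant_state is state
    assert "_auto_quant_state" in child.__dict__
    assert "_auto_quant_state" not in dict(child.named_children())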
def _nn_sequential_patched_forward(cls, input): for module in cls: - if not isinstance(module, AutoQuantizationState): + if not isinstance(module, AutoQuantizationStateModuleDict): input = module(input) return input diff --git a/torch/ao/quantization/_dbr/auto_trace_rewriter.py b/torch/ao/quantization/_dbr/auto_trace_rewriter.py index 79d19f410c3c..1189dbc879c4 100644 --- a/torch/ao/quantization/_dbr/auto_trace_rewriter.py +++ b/torch/ao/quantization/_dbr/auto_trace_rewriter.py @@ -8,7 +8,10 @@ import torch.fx from .mappings import conv_ops from .quantization_state import AutoQuantizationState -from .utils import get_packable_arg_idxs +from .utils import ( + get_packable_arg_idxs, + AutoQuantizationStateModuleDict, +) class AllModuleTracer(torch.fx.Tracer): """ @@ -40,10 +43,10 @@ def _maybe_update_args_with_quants(self, args, arg_quant_infos, target): new_first_arg.append(args[0][idx]) else: # create a quant node - scale, zp = input_arg_quant_info + scale, zp, dtype = input_arg_quant_info quant = super().create_node( 'call_function', torch.quantize_per_tensor, - (args[0][idx], scale.item(), zp.item(), torch.quint8), {}, None, None) + (args[0][idx], scale.item(), zp.item(), dtype), {}, None, None) new_first_arg.append(quant) new_args = [new_first_arg, *args[1:]] elif target == torch.cat: @@ -58,10 +61,10 @@ def _maybe_update_args_with_quants(self, args, arg_quant_infos, target): new_args.append(args[idx]) else: # create a quant node - scale, zp = input_arg_quant_info + scale, zp, dtype = input_arg_quant_info quant = super().create_node( 'call_function', torch.quantize_per_tensor, - (args[idx], scale.item(), zp.item(), torch.quint8), {}, None, None) + (args[idx], scale.item(), zp.item(), dtype), {}, None, None) new_args.append(quant) args = tuple(new_args) return args @@ -207,7 +210,7 @@ def linear_rewrite_args(input, weight, bias=None): # class. 
# TODO(future): remove the hack def call_module(self, m: torch.nn.Module, forward: Callable[..., Any], args : Tuple[Any, ...], kwargs : Dict[str, Any]) -> Any: - if isinstance(m, AutoQuantizationState): + if isinstance(m, AutoQuantizationStateModuleDict): return args[0] return super().call_module(m, forward, args, kwargs) diff --git a/torch/ao/quantization/_dbr/mappings.py b/torch/ao/quantization/_dbr/mappings.py index 1fcad0b61119..89c963f8795a 100644 --- a/torch/ao/quantization/_dbr/mappings.py +++ b/torch/ao/quantization/_dbr/mappings.py @@ -6,6 +6,7 @@ from torch.ao.quantization.quantization_mappings import ( DEFAULT_STATIC_QUANT_MODULE_MAPPINGS, DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS, + DEFAULT_REFERENCE_STATIC_QUANT_MODULE_MAPPINGS, ) import operator @@ -67,6 +68,10 @@ set(DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS.keys()) module_types_supported_by_quantization |= \ set(DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS.values()) +module_types_supported_by_quantization |= \ + set(DEFAULT_REFERENCE_STATIC_QUANT_MODULE_MAPPINGS.keys()) +module_types_supported_by_quantization |= \ + set(DEFAULT_REFERENCE_STATIC_QUANT_MODULE_MAPPINGS.values()) module_types_supported_by_quantization |= set([ # these are quantizeable modules which do not need swaps nn.ReLU, @@ -144,6 +149,9 @@ for a, b in DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS.items(): a_related_to_b.add((a, b)) a_related_to_b.add((b, a)) +for a, b in DEFAULT_REFERENCE_STATIC_QUANT_MODULE_MAPPINGS.items(): + a_related_to_b.add((a, b)) + a_related_to_b.add((b, a)) for a, b in fp32_to_int8_fun_mapping.items(): a_related_to_b.add((a, b)) a_related_to_b.add((b, a)) diff --git a/torch/ao/quantization/_dbr/model_utils.py b/torch/ao/quantization/_dbr/model_utils.py index ca668edce8b7..cd60de8a1ba4 100644 --- a/torch/ao/quantization/_dbr/model_utils.py +++ b/torch/ao/quantization/_dbr/model_utils.py @@ -118,9 +118,9 @@ def attach_scale_zp_values_to_model( if hasattr(module, '_auto_quant_state'): qstate: AutoQuantizationState = module._auto_quant_state # type: ignore[assignment] for tensor_id, observer in qstate.tensor_id_to_observer.items(): - activation_int8_quantized = \ - observer.dtype in [torch.quint8, torch.qint8] - if activation_int8_quantized: + activation_int8_or_int32_quantized = \ + observer.dtype in [torch.quint8, torch.qint8, torch.qint32] + if activation_int8_or_int32_quantized: scale, zp = observer.calculate_qparams() # tensor_id_to_observer is a ModuleDict which has to have string keys # tensor_id_to_scale_zp is a normal dict which can have int keys diff --git a/torch/ao/quantization/_dbr/module_swap_utils.py b/torch/ao/quantization/_dbr/module_swap_utils.py index 59e495ef6760..a95f8210286e 100644 --- a/torch/ao/quantization/_dbr/module_swap_utils.py +++ b/torch/ao/quantization/_dbr/module_swap_utils.py @@ -1,18 +1,23 @@ -from typing import Dict, Callable, Any +from typing import Dict, Callable, Any, Optional import torch from torch.nn.intrinsic import _FusedModule from ..utils import ( activation_is_int8_quantized, + activation_is_int32_quantized, op_is_int8_dynamically_quantized, ) from torch.ao.quantization import swap_module +from torch.ao.quantization.quantization_mappings import ( + DEFAULT_REFERENCE_STATIC_QUANT_MODULE_MAPPINGS, +) def _swap_child_modules( module: torch.nn.Module, static_mappings: Dict[Callable, Any], dynamic_mappings: Dict[Callable, Any], + parent_fqn: Optional[str] = None, ) -> None: """ For each direct child of `module`, swaps it using `static_mappings` @@ -22,26 +27,52 @@ def _swap_child_modules( Recursively calls itself on 
each child. """ + qstate = getattr(module, '_auto_quant_state', None) + reassign = {} - for name, mod in module.named_children(): + for local_fqn, mod in module.named_children(): + if parent_fqn is None: + global_fqn = local_fqn + else: + global_fqn = f"{parent_fqn}.{local_fqn}" # both fused modules and observed custom modules are # swapped as one unit if not isinstance(mod, _FusedModule): - _swap_child_modules(mod, static_mappings, dynamic_mappings) + _swap_child_modules( + mod, static_mappings, dynamic_mappings, global_fqn) qconfig = getattr(mod, 'qconfig', None) if not qconfig: continue activation_int8_quantized = activation_is_int8_quantized(qconfig) op_int8_dynamically_quantized = op_is_int8_dynamically_quantized(qconfig) + activation_int32_quantized = activation_is_int32_quantized(qconfig) + + # Get the output observer from qstate and attach it to the module, + # to match the API for Eager mode module swaps + if qstate is not None: + output_obs = qstate.get_output_observer_from_fqn(global_fqn) + if output_obs is not None: + mod.activation_post_process = output_obs + if activation_int8_quantized: if not type(mod) in static_mappings: continue - reassign[name] = swap_module(mod, static_mappings, {}) + reassign[local_fqn] = swap_module(mod, static_mappings, {}) elif op_int8_dynamically_quantized: if not type(mod) in dynamic_mappings: continue - reassign[name] = swap_module(mod, dynamic_mappings, {}) + reassign[local_fqn] = swap_module(mod, dynamic_mappings, {}) + elif activation_int32_quantized: + # For now, only apply reference logic to modules quantized to + # int32. Do it automatically. + # TODO(future PR): extend this logic to more dtypes, and add + # the is_reference API flag instead of doing this automatically. + # Note: swap modules only does the swap if the mapping for this + # module exists. 
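The ``global_fqn`` bookkeeping above simply rebuilds the fully qualified name as the recursion descends; a tiny sketch with a hypothetical helper name::

    def _compose_fqn(parent_fqn, local_fqn):
        return local_fqn if parent_fqn is None else f"{parent_fqn}.{local_fqn}"

    assert _compose_fqn(None, "conv1") == "conv1"
    assert _compose_fqn("features.0", "conv1") == "features.0.conv1"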
+ reassign[local_fqn] = swap_module( + mod, DEFAULT_REFERENCE_STATIC_QUANT_MODULE_MAPPINGS, {}) + # TODO(future PR): add support for other dtypes for key, value in reassign.items(): diff --git a/torch/ao/quantization/_dbr/quantization_state.py b/torch/ao/quantization/_dbr/quantization_state.py index f7f20b1ce224..db0ab0111d52 100644 --- a/torch/ao/quantization/_dbr/quantization_state.py +++ b/torch/ao/quantization/_dbr/quantization_state.py @@ -30,7 +30,6 @@ iterate_and_apply, get_op_packing_only_uses_module_attributes, get_packable_tensor_kwarg_names, - get_producer_of_seen_q_op_info, clone_detach_tensor_without_dispatch, get_input_args_quant_dequant_info, get_cur_qconfig, @@ -43,11 +42,15 @@ get_seen_q_op_info_of_end_of_fusion, ) +from torch.ao.quantization.utils import ( + activation_is_int32_quantized, +) + OpConvertInfo = Tuple[ # quantized equivalent of original op (None means keep original) Optional[Callable], - # arg_quant_infos, each element is (scale, zp) for quantized and None otherwise - List[Optional[Tuple[float, int]]], + # arg_quant_infos, each element is (scale, zp, dtype) for quantized and None otherwise + List[Optional[Tuple[float, int, torch.dtype]]], # arg_dequant_infos, each element is True if this arg needs a dequant List[bool], # packed param name, if the op has a packed param @@ -455,9 +458,11 @@ def op_convert_before_hook( quant_info = arg_quant_infos[tensor_arg_idx] dequant_info = arg_dequant_infos[tensor_arg_idx] if quant_info is not None: - scale, zp = quant_info - arg = torch.quantize_per_tensor(arg, scale, zp, torch.quint8) - elif dequant_info is True: + scale, zp, dtype = quant_info + arg = torch.quantize_per_tensor(arg, scale, zp, dtype) + if dequant_info is True: + # Note: both quant and dequant paths are taken for + # reference ops. arg = arg.dequantize() new_first_arg.append(arg) tensor_arg_idx += 1 @@ -471,9 +476,11 @@ def op_convert_before_hook( quant_info = arg_quant_infos[tensor_arg_idx] dequant_info = arg_dequant_infos[tensor_arg_idx] if quant_info is not None: - scale, zp = quant_info - arg = torch.quantize_per_tensor(arg, scale, zp, torch.quint8) - elif dequant_info is True: + scale, zp, dtype = quant_info + arg = torch.quantize_per_tensor(arg, scale, zp, dtype) + if dequant_info is True: + # Note: both quant and dequant paths are taken for + # reference ops. arg = arg.dequantize() new_args.append(arg) tensor_arg_idx += 1 @@ -519,10 +526,22 @@ def op_convert_after_hook( global_op_idx: List[int], ) -> Any: """ - This function is called aftern an op call in a converted model. - - TODO: add dequant, if needed + This function is called after an op call in a converted model. 
""" + # TODO(future PR): improve performance by moving this out of the + # path of non-reference ops + seen_q_op_info = self._get_cur_seen_q_op_info() + + if seen_q_op_info.is_reference_op_at_inference: + # given the current reference module design, + # we need to quantize to the target dtype + output_tensor_info = seen_q_op_info.output_tensor_infos[0] + tensor_id, inf_dtype = \ + output_tensor_info.id, output_tensor_info.inf_dtype + scale, zp = self.tensor_id_to_scale_zp[tensor_id] + output = torch.quantize_per_tensor( + output, scale, zp, inf_dtype) + if self.log_op_outputs: output_clone = clone_detach_tensor_without_dispatch(output) seen_q_op_info = self._get_cur_seen_q_op_info() @@ -796,11 +815,15 @@ def _first_call_op_prepare_before_hook_create_subgraphs( op_type_is_module = isinstance(op, torch.nn.Module) op_type = type(op) if op_type_is_module else op # type: ignore[assignment] qconfig = get_cur_qconfig(self.qconfig_dict, fqn, op_type) + # TODO(future PR): use API flag instead of qconfig for is_reference + is_reference_op_at_inference = \ + qconfig is not None and activation_is_int32_quantized(qconfig) self.idx_to_seen_q_op_infos[self.idx] = SeenQOpInfo( self.idx, op_type, op_type_is_module, fqn, arg_tensor_infos, [], packable_tensor_idx_to_name, packable_nontensor_idx_to_arg, packable_tensor_kwarg_name_to_name, - op_packing_only_uses_module_attributes, qconfig, None) + op_packing_only_uses_module_attributes, qconfig, None, + is_reference_op_at_inference) return args, kwargs @@ -826,19 +849,13 @@ def _first_call_op_prepare_after_hook_adjust_subgraphs( seen_q_op_info = self._get_cur_seen_q_op_info() func_output_dtype_type = get_func_output_dtype_type(seen_q_op_info) if func_output_dtype_type == FuncOutputDTypeType.DTYPE_DEPENDS_ON_QCONFIG: - if isinstance(op, torch.nn.Module): - # For now, assume that eager mode convert has attached qconfig - # objects to any leaf module which needs quantization - if hasattr(op, 'activation_post_process'): - dtype_to_use = op.activation_post_process.dtype - else: - dtype_to_use = torch.float + qconfig = get_cur_qconfig( + self.qconfig_dict, seen_q_op_info.fqn, + seen_q_op_info.type) + if qconfig is None: + dtype_to_use = torch.float else: - qconfig = get_cur_qconfig(self.qconfig_dict, seen_q_op_info.fqn, op) - if qconfig is None: - dtype_to_use = torch.float - else: - dtype_to_use = qconfig.activation().dtype + dtype_to_use = qconfig.activation().dtype elif func_output_dtype_type == FuncOutputDTypeType.DTYPE_DEFAULT_BC_UNSUPPORTED_SYNTAX: dtype_to_use = torch.float @@ -939,42 +956,8 @@ def _maybe_insert_output_observers( assert seen_q_op_info.input_tensor_infos[0] is not None first_input_tensor_id = seen_q_op_info.input_tensor_infos[0].id - first_input_obs = None - if str(first_input_tensor_id) in self.tensor_id_to_observer: - first_input_obs = \ - self.tensor_id_to_observer[str(first_input_tensor_id)] - else: - # This observer may be in a module (handled by eager - # convert), in which case it's not in our map. For now, - # copy it from the module. In the future, we could look - # into having a soft link. 
- # TODO: make this handle more cases - # TODO: handle module -> add_scalar -> add_scalar - prev_op = get_producer_of_seen_q_op_info( - self.idx_to_seen_q_op_infos, seen_q_op_info) - assert prev_op is not None - # TODO: the following line needs to only check fqn - # for modules, not for functions - fqn_last_part = prev_op.fqn.split('.')[-1] - if hasattr(root_module, fqn_last_part): - first_input_mod = getattr(root_module, fqn_last_part) - else: - first_input_mod = None - # Currently, both tracing for module fusion and tracing for - # quantization go through this code path. When tracing - # for module fusion, quantizeable modules do not have - # observers yet. For this path to not crash, we create one. - # When tracing for quantization, this will be ignored. - # TODO(future PR): refactor to avoid this. - if first_input_mod and hasattr(first_input_mod, 'activation_post_process'): - first_input_obs = first_input_mod.activation_post_process - else: - # TODO(future PR): check qconfig is None - qconfig = get_cur_qconfig( - self.qconfig_dict, seen_q_op_info.fqn, seen_q_op_info.type) - assert qconfig is not None - first_input_obs = qconfig.activation() - + first_input_obs = \ + self.tensor_id_to_observer[str(first_input_tensor_id)] self.tensor_id_to_observer[str(output_tensor_id)] = first_input_obs def insert_observers(self, root_module: torch.nn.Module): @@ -982,6 +965,15 @@ def insert_observers(self, root_module: torch.nn.Module): self._maybe_insert_input_observers(seen_q_op_info) self._maybe_insert_output_observers(seen_q_op_info, root_module) + def get_output_observer_from_fqn(self, fqn: str) -> Optional[torch.nn.Module]: + for idx, seen_q_op_info in self.idx_to_seen_q_op_infos.items(): + if seen_q_op_info.fqn != fqn: + continue + output_tensor_id = seen_q_op_info.output_tensor_infos[0].id + if str(output_tensor_id) in self.tensor_id_to_observer: + return self.tensor_id_to_observer[str(output_tensor_id)] + return None + # This is a hack to enable nn.Sequential to properly work with # this class. # TODO(future): remove the hack diff --git a/torch/ao/quantization/_dbr/torchscript_utils.py b/torch/ao/quantization/_dbr/torchscript_utils.py new file mode 100644 index 000000000000..2efbbe5fd938 --- /dev/null +++ b/torch/ao/quantization/_dbr/torchscript_utils.py @@ -0,0 +1,15 @@ +import torch +from torch.jit._recursive import wrap_cpp_module + +def remove_redundant_aliases(scripted_module: torch.nn.Module): + """ + Running torch.jit.trace on a model with DBR quantization introduces + extra alias ops, because we use `torch.Tensor.as_subclass` and tracing + through this results in an `aten::alias` function call in TorchScript. + This pass removes these alias calls when it is safe to do so. 
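A possible usage sketch for the helper above; the tiny model here is only a placeholder for a DBR-converted model, and on a plain traced module the pass is expected to be a no-op::

    import torch
    import torch.nn as nn
    from torch.ao.quantization._dbr.torchscript_utils import remove_redundant_aliases

    model = nn.Sequential(nn.Linear(4, 4), nn.ReLU()).eval()
    example_input = torch.randn(1, 4)

    traced = torch.jit.trace(model, example_input)
    traced = remove_redundant_aliases(traced)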
+ """ + module_c = scripted_module._c + module_c = \ + torch._C._jit_pass_dbr_quant_remove_redundant_aliases(module_c) # type: ignore[attr-defined] + scripted_module = wrap_cpp_module(module_c) + return scripted_module diff --git a/torch/ao/quantization/_dbr/utils.py b/torch/ao/quantization/_dbr/utils.py index 4b3465c26150..83b641e80662 100644 --- a/torch/ao/quantization/_dbr/utils.py +++ b/torch/ao/quantization/_dbr/utils.py @@ -102,6 +102,8 @@ class SeenQOpInfo: qconfig: QConfigAny # fusion_info for the op, is None if no fusion is found fusion_info: Optional[FusionInfo] + # True if this op is a reference op during inference + is_reference_op_at_inference: bool def __repr__(self) -> str: s = f"(type): {self.type}\n" @@ -233,9 +235,6 @@ def get_func_output_obs_type( seen_q_op_info: SeenQOpInfo, ) -> FuncOutputObsType: op_type = seen_q_op_info.type - is_module = isinstance(op_type, type(torch.nn.Module)) - if is_module: - return FuncOutputObsType.NONE if seen_q_op_info.qconfig is None: return FuncOutputObsType.NONE @@ -267,6 +266,8 @@ def get_func_output_obs_type( seen_q_op_info.input_tensor_infos[0].inf_dtype in (torch.int32, torch.int64) ): return FuncOutputObsType.NONE + elif op_type in (torch.nn.LSTM,): + return FuncOutputObsType.NONE return FuncOutputObsType.NEW_OBS def converted_func_needs_scale_zp(seen_q_op_info: SeenQOpInfo) -> bool: @@ -583,10 +584,9 @@ def get_torch_function_hook_type( # the direct __dict__ accesses are for performance, because # the default `torch.nn.Module.__getattr__` has overhead. parent_module_has_qstate = parent_module is not None and \ - '_modules' in parent_module.__dict__ and \ - '_auto_quant_state' in parent_module.__dict__['_modules'] + '_auto_quant_state' in parent_module.__dict__ needs_op_hooks = parent_module_has_qstate and \ - parent_module.__dict__['_modules']['_auto_quant_state'].cur_op_needs_hooks(func) # type: ignore[union-attr, operator] + parent_module.__dict__['_auto_quant_state'].cur_op_needs_hooks(func) # type: ignore[union-attr, operator] if needs_op_hooks: return HookType.OP_HOOKS @@ -608,17 +608,15 @@ def get_module_hook_type( if cached_hook_type is not None: return cached_hook_type parent_module_has_qstate = parent_module is not None and \ - '_modules' in parent_module.__dict__ and \ - '_auto_quant_state' in parent_module.__dict__['_modules'] + '_auto_quant_state' in parent_module.__dict__ needs_op_hooks = parent_module_has_qstate and \ - parent_module.__dict__['_modules']['_auto_quant_state'].cur_op_needs_hooks(cur_module) # type: ignore[union-attr, operator] + parent_module.__dict__['_auto_quant_state'].cur_op_needs_hooks(cur_module) # type: ignore[union-attr, operator] # We need IO hooks if # * we are calling forward on a module (always True here) # * that module has quant state # * that module does not need op hooks for the parent needs_io_hooks = ( - '_modules' in cur_module.__dict__ and - '_auto_quant_state' in cur_module.__dict__['_modules'] and + '_auto_quant_state' in cur_module.__dict__ and (not needs_op_hooks) ) needs_arg_dequants = parent_module_has_qstate and not needs_op_hooks @@ -652,7 +650,7 @@ def clone_detach_tensor_without_dispatch(x: torch.Tensor) -> torch.Tensor: def get_input_args_quant_dequant_info( seen_q_op_info: SeenQOpInfo, tensor_id_to_scale_zp: Dict[int, Tuple[torch.Tensor, torch.Tensor]], -) -> Tuple[List[Optional[Tuple[float, int]]], List[bool], bool]: +) -> Tuple[List[Optional[Tuple[float, int, torch.dtype]]], List[bool], bool]: """ Returns a list of information about the tensor inputs to the current op. 
@@ -678,7 +676,7 @@ def get_input_args_quant_dequant_info( # dequants [False, False] """ - quant_infos: List[Optional[Tuple[float, int]]] = [] + quant_infos: List[Optional[Tuple[float, int, torch.dtype]]] = [] dequant_infos: List[bool] = [] # determine the expected output dtype @@ -694,12 +692,20 @@ def get_input_args_quant_dequant_info( tensor_id = input_arg.id if input_arg.inf_dtype != output_dtype: any_arg_quant_or_dequant_needed = True - if output_dtype == torch.quint8: + if output_dtype in (torch.quint8, torch.qint32): assert tensor_id in tensor_id_to_scale_zp scale, zp = tensor_id_to_scale_zp[tensor_id] # TODO: return this to the caller - quant_infos.append((scale, zp,)) # type: ignore[arg-type] - dequant_infos.append(False) + quant_infos.append((scale, zp, output_dtype)) # type: ignore[arg-type] + if output_dtype == torch.qint32: + # For now, we treat all qint32 ops as reference, so + # we add a dequant before the op. + # TODO(future PR): extend this to more dtypes + # TODO(future PR): use is_reference flag instead of + # assuming + dequant_infos.append(True) + else: + dequant_infos.append(False) else: quant_infos.append(None) dequant_infos.append(True) @@ -727,3 +733,18 @@ def get_cur_qconfig( qconfig_dict, cur_op_type, cur_fqn, global_qconfig) return qconfig + + +# We store quantization state for all children on the top level module in a +# ModuleDict. In order to properly special case this module from other +# ModuleDict instances, we create a marker class for it. +class AutoQuantizationStateModuleDict(torch.nn.ModuleDict): + pass + +def get_fqn_valid_for_module_dict_key(fqn: str) -> str: + """ + Modifies `fqn` to make it a valid key to a ModuleDict. + """ + if fqn == '': + fqn = ' ' + return fqn.replace('.', ':') diff --git a/torch/ao/quantization/_quantize_dbr.py b/torch/ao/quantization/_quantize_dbr.py index b0e7222a7839..dbcfac60a177 100644 --- a/torch/ao/quantization/_quantize_dbr.py +++ b/torch/ao/quantization/_quantize_dbr.py @@ -73,6 +73,14 @@ def prepare(model, qconfig_dict, example_inputs, inplace=False, allow_list=None, if len(module_fusion_fqns): model = torch.quantization.fuse_modules(model, module_fusion_fqns) + # Since we are reusing the auto_trace machinery to find fusion + # FQNs, we need to do some surgery to get qconfigs on modules + # after module fusion to be correct. 
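The new `AutoQuantizationStateModuleDict` keeps per-child quantization state on the top-level module, and `nn.ModuleDict` keys may not be empty or contain `'.'`, which is exactly what `get_fqn_valid_for_module_dict_key` works around. A quick demonstration of the escaping:

```python
from torch.ao.quantization._dbr.utils import get_fqn_valid_for_module_dict_key

print(repr(get_fqn_valid_for_module_dict_key('')))              # ' '
print(get_fqn_valid_for_module_dict_key('layer1.0.conv1'))      # layer1:0:conv1
```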
+ for _, child in model.named_modules(): + if isinstance(child, torch.nn.intrinsic._FusedModule): + if hasattr(child[0], 'qconfig'): + child.qconfig = child[0].qconfig + # delete all the DBR state from the model, so add_auto_observation # can start from a clean slate parents_to_delete_auto_quant_state = [] @@ -82,6 +90,15 @@ def prepare(model, qconfig_dict, example_inputs, inplace=False, allow_list=None, for v in parents_to_delete_auto_quant_state: del v._auto_quant_state + del model._fqn_to_auto_quant_state_map + + for p in model.parameters(): + if hasattr(p, '_qtensor_info'): + del p._qtensor_info + for b in model.buffers(): + if hasattr(b, '_qtensor_info'): + del b._qtensor_info + # the model hierarchy might have changed during fusion, so we # have to delete the cached module hook types for k, v in model.named_modules(): @@ -102,11 +119,10 @@ def prepare(model, qconfig_dict, example_inputs, inplace=False, allow_list=None, child.qconfig = None # type: ignore[assignment] elif isinstance(child, torch.nn.LSTM): # TODO: fix LSTM handling in eager mode static quant and remove this - child.qconfig = None + qconfig_dict['object_type'][torch.nn.LSTM] = None + + # TODO(future PR): do the QAT module swap - model = torch.quantization.prepare( - model, inplace, allow_list, observer_non_leaf_module_list, - prepare_custom_config_dict) assert not inplace model = add_auto_observation( model, qconfig_dict, example_inputs, diff --git a/torch/ao/quantization/_quantize_fx_do_not_use.py b/torch/ao/quantization/_quantize_fx_do_not_use.py deleted file mode 100644 index d39abe299393..000000000000 --- a/torch/ao/quantization/_quantize_fx_do_not_use.py +++ /dev/null @@ -1,34 +0,0 @@ -import torch -from torch.fx import GraphModule -from typing import Dict, Any, Optional -from .quantize_fx import ( - _check_is_graph_module, - check_is_valid_convert_custom_config_dict -) -from .fx._convert_do_not_use import _convert_do_not_use - -def _convert_fx_do_not_use( - graph_module: GraphModule, is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None, - _remove_qconfig: bool = True, - backend_config_dict: Optional[Dict[str, Any]] = None) -> torch.nn.Module: - """ - Please do not use, this is a temporary function to migrate convert_fx - to a new implementation - """ - assert is_reference - if convert_custom_config_dict is None: - convert_custom_config_dict = {} - - _check_is_graph_module(graph_module) - check_is_valid_convert_custom_config_dict(convert_custom_config_dict) - - quantized = _convert_do_not_use( - graph_module, is_reference, convert_custom_config_dict, - False, _remove_qconfig_flag=_remove_qconfig, - backend_config_dict=backend_config_dict) - - preserved_attributes = convert_custom_config_dict.get("preserved_attributes", []) - for attr_name in preserved_attributes: - setattr(quantized, attr_name, getattr(graph_module, attr_name)) - return quantized diff --git a/torch/ao/quantization/fx/backend_config/README.md b/torch/ao/quantization/backend_config/README.md similarity index 100% rename from torch/ao/quantization/fx/backend_config/README.md rename to torch/ao/quantization/backend_config/README.md diff --git a/torch/ao/quantization/backend_config/__init__.py b/torch/ao/quantization/backend_config/__init__.py new file mode 100644 index 000000000000..f62e344423d4 --- /dev/null +++ b/torch/ao/quantization/backend_config/__init__.py @@ -0,0 +1,11 @@ +from .tensorrt import get_tensorrt_backend_config_dict +from .native import get_native_backend_config_dict + +# TODO: add more validations +def 
validate_backend_config_dict(backend_config_dict): + return "configs" in backend_config_dict + +__all__ = [ + "get_native_backend_config_dict", + "get_tensorrt_backend_config_dict", +] diff --git a/torch/ao/quantization/backend_config/native.py b/torch/ao/quantization/backend_config/native.py new file mode 100644 index 000000000000..d1b254e08359 --- /dev/null +++ b/torch/ao/quantization/backend_config/native.py @@ -0,0 +1,722 @@ +from collections import namedtuple +from typing import List, Dict, Any +import operator +import torch +from torch.ao.quantization.backend_config.observation_type import ObservationType +import torch.nn.functional as F +import torch.nn as nn +import torch.nn.intrinsic as nni +import torch.nn.intrinsic.qat as nniqat +import torch.nn.qat as nnqat +import torch.nn.quantized._reference as nnqr +from ..observer import ( + default_fixed_qparams_range_0to1_observer, + default_fixed_qparams_range_neg1to1_observer, +) +from ..fake_quantize import FixedQParamsFakeQuantize +from ..fuser_method_mappings import ( + reverse_sequential_wrapper2, + reverse2, + reverse3, + fuse_conv_bn, + fuse_conv_bn_relu, + fuse_linear_bn, + fuse_convtranspose_bn, +) + +# TODO: rename to be more explict, e.g. qat_conv_relu +_ConvMetadata = namedtuple( + "_ConvMetadata", + ["root", "transpose", "bn", "reference", "transpose_reference", + "fused_conv_relu", "fused_conv_bn", "fused_conv_bn_relu", + "qat", "relu_qat", "bn_qat", "bn_relu_qat", + "func"]) +_Conv1dMetadata = _ConvMetadata( + nn.Conv1d, nn.ConvTranspose1d, nn.BatchNorm1d, nnqr.Conv1d, nnqr.ConvTranspose1d, + nni.ConvReLU1d, nni.ConvBn1d, nni.ConvBnReLU1d, + nnqat.Conv1d, nniqat.ConvReLU1d, nniqat.ConvBn1d, nniqat.ConvBnReLU1d, + F.conv1d) +_Conv2dMetadata = _ConvMetadata( + nn.Conv2d, nn.ConvTranspose2d, nn.BatchNorm2d, nnqr.Conv2d, nnqr.ConvTranspose2d, + nni.ConvReLU2d, nni.ConvBn2d, nni.ConvBnReLU2d, + nnqat.Conv2d, nniqat.ConvReLU2d, nniqat.ConvBn2d, nniqat.ConvBnReLU2d, + F.conv2d) +_Conv3dMetadata = _ConvMetadata( + nn.Conv3d, nn.ConvTranspose3d, nn.BatchNorm3d, nnqr.Conv3d, nnqr.ConvTranspose3d, + nni.ConvReLU3d, nni.ConvBn3d, nni.ConvBnReLU3d, + nnqat.Conv3d, nniqat.ConvReLU3d, nniqat.ConvBn3d, nniqat.ConvBnReLU3d, + F.conv3d) + +# =================== +# | DTYPE CONFIGS | +# =================== + +# weighted op int8 dtype config +# this is config for ops that has quantized weights, like linear, conv +weighted_op_int8_dtype_config = { + # optional, input activation dtype + "input_dtype": torch.quint8, + # optional, weight dtype + "weight_dtype": torch.qint8, + # optional, bias dtype + "bias_dtype": torch.float, + # optional, output activation dtype + "output_dtype": torch.quint8 +} + +default_op_quint8_dtype_config = { + # optional, input activation dtype + "input_dtype": torch.quint8, + # optional, output activation dtype + "output_dtype": torch.quint8, +} + +default_op_fp16_dtype_config = { + # optional, input activation dtype + "input_dtype": torch.float16, + # optional, weight dtype + "weight_dtype": torch.float16, + # optional, bias dtype + "bias_dtype": torch.float16, + # optional, output activation dtype + "output_dtype": torch.float16, +} + +default_dynamic_int8_dtype_config = { + "input_dtype": torch.quint8, + "weight_dtype": torch.qint8, + "bias_dtype": torch.float, + "output_dtype": torch.float, + # currently the dtype check is not yet enabled, so we provided the dtype_configs but + # it is not really used yet, + # we will enable it a bit later after we moved everything to backend_config_dict + "is_dynamic": True, +} + 
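The `weighted_op_int8_dtype_config` above mirrors what a quantized linear kernel actually consumes and produces. A small check of those dtypes on a stock quantized module (assumes a build with the fbgemm or qnnpack engine):

```python
import torch
import torch.nn.quantized as nnq

linear_q = nnq.Linear(4, 4)  # qint8 weight, float bias
x = torch.quantize_per_tensor(torch.randn(2, 4), 0.1, 0, torch.quint8)
y = linear_q(x)
print(linear_q.weight().dtype, linear_q.bias().dtype, x.dtype, y.dtype)
# torch.qint8 torch.float32 torch.quint8 torch.quint8
```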
+default_dynamic_float16_dtype_config = { + "input_dtype": torch.float16, + "weight_dtype": torch.float16, + "bias_dtype": torch.float, + "output_dtype": torch.float, + # currently the dtype check is not yet enabled, so we provided the dtype_configs but + # it is not really used yet, + # we will enable it a bit later after we moved everything to backend_config_dict + "is_dynamic": True, +} + +weight_only_quint8_dtype_config = { + "input_dtype": torch.float, + "weight_dtype": torch.quint8, + "output_dtype": torch.float, +} + +weight_only_quint4x2_dtype_config = { + "input_dtype": torch.float, + "weight_dtype": torch.quint4x2, + "output_dtype": torch.float, +} + +# ====================== +# | OPERATOR CONFIGS | +# ====================== + +def _get_default_op_backend_config(op, dtype_configs): + return { + "pattern": op, + "observation_type": ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT, + "dtype_configs": dtype_configs, + } + +_DEFAULT_OP_INT8_CONFIGS = [ + _get_default_op_backend_config(op, [default_op_quint8_dtype_config]) for op in [ + torch.nn.ELU, + torch.nn.LeakyReLU, + torch.nn.Hardswish, + torch.nn.InstanceNorm1d, + torch.nn.InstanceNorm2d, + torch.nn.InstanceNorm3d, + torch.nn.LayerNorm, + torch.nn.Dropout, + torch.nn.functional.elu, + torch.nn.functional.hardswish, + torch.nn.functional.instance_norm, + torch.nn.functional.leaky_relu, + torch.nn.functional.dropout, + torch.nn.functional.layer_norm, + ]] + +def _get_linear_configs(dtype_configs): + """ + Return all configs related to linear modules and ops. + """ + observation_type = ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT + linear_configs = [] + + # (1) Single linear modules/functions + # ------------------------------------- + # linear module + linear_configs.append({ + # Please see README under this folder for pattern format + "pattern": torch.nn.Linear, + "observation_type": observation_type, + "dtype_configs": dtype_configs, + # the root module for the pattern, used to query the reference quantized module + # e.g. 
for a (torch.nn.ReLU, torch.nn.Linear) pattern, the root will be torch.nn.Linear + "root_module": torch.nn.Linear, + # the corresponding reference quantized module for the root module + "reference_quantized_module_for_root": nnqr.Linear, + "qat_module": nnqat.Linear, + }) + # linear qat module + linear_configs.append({ + "pattern": nnqat.Linear, + "observation_type": observation_type, + "dtype_configs": dtype_configs, + "root_module": torch.nn.Linear, + "reference_quantized_module_for_root": nnqr.Linear, + }) + # functional linear + linear_configs.append({ + "pattern": torch.nn.functional.linear, + "observation_type": observation_type, + "dtype_configs": dtype_configs, + }) + + # (2) Linear + relu + # ------------------- + # 2.1 linear module + relu fusion config + # linear relu, linear module + relu module + linear_configs.append({ + "pattern": (torch.nn.ReLU, torch.nn.Linear), + "dtype_configs": dtype_configs, + "fuser_method": reverse_sequential_wrapper2(nni.LinearReLU), + "fused_module": nni.LinearReLU, + }) + # linear relu, linear module + functional relu + linear_configs.append({ + "pattern": (torch.nn.functional.relu, torch.nn.Linear), + "dtype_configs": dtype_configs, + "fuser_method": reverse_sequential_wrapper2(nni.LinearReLU), + "fused_module": nni.LinearReLU, + }) + + # 2.2 linear module + relu, fused module configs + # linear relu, fused module + linear_configs.append({ + "pattern": nni.LinearReLU, + "observation_type": observation_type, + "dtype_configs": dtype_configs, + "root_module": torch.nn.Linear, + "reference_quantized_module_for_root": nnqr.Linear, + "qat_module": nniqat.LinearReLU, + }) + # linear relu, qat fused module + linear_configs.append({ + "pattern": nniqat.LinearReLU, + "observation_type": observation_type, + "dtype_configs": dtype_configs, + "root_module": torch.nn.Linear, + "reference_quantized_module_for_root": nnqr.Linear, + }) + # 2.3 functional linear + relu configs + # linear relu, functional linear + relu module + linear_configs.append({ + "pattern": (torch.nn.ReLU, F.linear), + "observation_type": observation_type, + "dtype_configs": dtype_configs, + }) + # linear relu, functional linear + functional relu + linear_configs.append({ + "pattern": (F.relu, F.linear), + "observation_type": observation_type, + "dtype_configs": dtype_configs, + }) + + # (3) Linear + batchnorm + # ------------------------ + # 3.1 linear bn fusion + linear_configs.append({ + "pattern": (nn.BatchNorm1d, nn.Linear), + "dtype_configs": dtype_configs, + "fuser_method": reverse2(fuse_linear_bn), + "fused_module": nni.LinearBn1d, + }) + + # 3.2 linear bn fused + # linear bn, fused module + linear_configs.append({ + "pattern": nni.LinearBn1d, + "observation_type": observation_type, + "dtype_configs": dtype_configs, + "root_module": torch.nn.Linear, + "reference_quantized_module_for_root": nnqr.Linear, + "qat_module": nniqat.LinearBn1d, + }) + # linear bn, qat fused module + linear_configs.append({ + "pattern": nniqat.LinearBn1d, + "observation_type": observation_type, + "dtype_configs": dtype_configs, + "root_module": torch.nn.Linear, + "reference_quantized_module_for_root": nnqr.Linear, + }) + return linear_configs + +def _get_conv_configs(): + """ + Return all configs related to conv modules and ops. 
+ """ + conv_configs = [] + observation_type = ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT + dtype_configs = [weighted_op_int8_dtype_config] + for convs in [_Conv1dMetadata, _Conv2dMetadata, _Conv3dMetadata]: + + # (1) Single conv modules/functions + # ----------------------------------- + # conv module + conv_configs.append({ + "pattern": convs.root, + "observation_type": observation_type, + "dtype_configs": dtype_configs, + "root_module": convs.root, + "reference_quantized_module_for_root": convs.reference, + "qat_module": convs.qat, + }) + # conv qat module + conv_configs.append({ + "pattern": convs.qat, + "observation_type": observation_type, + "dtype_configs": dtype_configs, + "root_module": convs.root, + "reference_quantized_module_for_root": convs.reference, + }) + # functional conv + conv_configs.append({ + "pattern": convs.func, + "observation_type": observation_type, + "dtype_configs": dtype_configs, + }) + + # (2) Conv + relu + # ----------------- + # 2.1 conv module + relu fusion configs + # conv relu fusion, conv module + relu module + conv_configs.append({ + "pattern": (torch.nn.ReLU, convs.root), + "dtype_configs": dtype_configs, + "fuser_method": reverse_sequential_wrapper2(convs.fused_conv_relu), + "fused_module": convs.fused_conv_relu, + }) + # conv relu fusion, conv module + functional relu + conv_configs.append({ + "pattern": (F.relu, convs.root), + "dtype_configs": dtype_configs, + "fuser_method": reverse_sequential_wrapper2(convs.fused_conv_relu), + "fused_module": convs.fused_conv_relu, + }) + # 2.2 conv module + relu fused module configs + # conv relu, fused module + conv_configs.append({ + "pattern": convs.fused_conv_relu, + "observation_type": observation_type, + "dtype_configs": dtype_configs, + "root_module": convs.root, + "reference_quantized_module_for_root": convs.reference, + "qat_module": convs.relu_qat, + }) + # conv relu, qat fused module + conv_configs.append({ + "pattern": convs.relu_qat, + "observation_type": observation_type, + "dtype_configs": dtype_configs, + "root_module": convs.root, + "reference_quantized_module_for_root": convs.reference, + }) + # 2.3 functional conv + relu configs + # conv relu, functional conv + relu module + conv_configs.append({ + "pattern": (torch.nn.ReLU, convs.func), + "observation_type": observation_type, + "dtype_configs": dtype_configs, + }) + # conv relu, functional conv + functional relu + conv_configs.append({ + "pattern": (F.relu, convs.func), + "observation_type": observation_type, + "dtype_configs": dtype_configs, + }) + + # fused conv relu + conv_configs.append({ + "pattern": convs.fused_conv_relu, + "dtype_configs": dtype_configs, + "qat_module": convs.relu_qat, + }) + + conv_configs.append({ + "pattern": convs.relu_qat, + "dtype_configs": dtype_configs, + "root_module": convs.root, + "reference_quantized_module_for_root": convs.reference, + }) + + # (3) Conv + batchnorm (+ relu) + # ------------------------------- + # 3.1 conv bn fusion configs + # conv + bn fusion + conv_configs.append({ + "pattern": (convs.bn, convs.root), + "dtype_configs": dtype_configs, + "fuser_method": reverse2(fuse_conv_bn), + "fused_module": convs.fused_conv_bn, + }) + # conv + bn + relu module fusion + conv_configs.append({ + "pattern": (nn.ReLU, (convs.bn, convs.root)), + "dtype_configs": dtype_configs, + "fuser_method": reverse3(fuse_conv_bn_relu), + "fused_module": convs.fused_conv_bn_relu, + }) + # conv + bn + relu functional fusion + conv_configs.append({ + "pattern": (F.relu, (convs.bn, convs.root)), + 
"dtype_configs": dtype_configs, + "root_module": convs.root, + "fuser_method": reverse3(fuse_conv_bn_relu), + "fused_module": convs.fused_conv_bn_relu, + }) + # TODO: we can add fusion for torch.relu as well + + # 3.2 conv + bn (+ relu) fused module configs + # fused conv bn + conv_configs.append({ + "pattern": convs.fused_conv_bn, + "dtype_configs": dtype_configs, + "qat_module": convs.bn_qat, + }) + + # fused conv bn relu + conv_configs.append({ + "pattern": convs.fused_conv_bn_relu, + "dtype_configs": dtype_configs, + "qat_module": convs.bn_relu_qat, + }) + + # conv bn, qat fused module + conv_configs.append({ + "pattern": convs.bn_qat, + "observation_type": observation_type, + "dtype_configs": dtype_configs, + "root_module": convs.root, + "reference_quantized_module_for_root": convs.reference, + }) + # conv bn relu, qat fused module + conv_configs.append({ + "pattern": convs.bn_relu_qat, + "observation_type": observation_type, + "dtype_configs": dtype_configs, + "root_module": convs.root, + "reference_quantized_module_for_root": convs.reference, + }) + + # (4) conv transpose and its fusion + # 4.1 conv transpose config + conv_configs.append({ + "pattern": convs.transpose, + "dtype_configs": dtype_configs, + "root_module": convs.transpose, + "reference_quantized_module_for_root": convs.transpose_reference, + }) + + # 4.2 conv transpose + bn fusion + conv_configs.append({ + "pattern": (convs.bn, convs.transpose), + "dtype_configs": dtype_configs, + "fuser_method": reverse2(fuse_convtranspose_bn), + "root_module": convs.transpose, + "reference_quantized_module_for_root": convs.transpose_reference, + }) + + return conv_configs + +def _get_binary_op_configs(dtype_configs): + binary_op_configs: List[Dict[str, Any]] = [] + num_tensor_args_to_observation_type_mapping = { + # TODO: this is not used right now since we have extra check in prepare + # will need to change this to NO_OBSERVER later after we implemented + # Tensor dtype inference properly + 0: ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT, + 1: ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT, + 2: ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT, + } + for op_with_quantized_bop_scalar_variant in [ + operator.add, torch.add, operator.mul, torch.mul]: + binary_op_configs.append({ + "pattern": (torch.nn.ReLU, op_with_quantized_bop_scalar_variant), + "num_tensor_args_to_observation_type": num_tensor_args_to_observation_type_mapping, + "dtype_configs": dtype_configs, + }) + binary_op_configs.append({ + "pattern": (torch.nn.functional.relu, op_with_quantized_bop_scalar_variant), + "num_tensor_args_to_observation_type": num_tensor_args_to_observation_type_mapping, + "dtype_configs": dtype_configs, + }) + binary_op_configs.append({ + "pattern": (torch.relu, op_with_quantized_bop_scalar_variant), + "num_tensor_args_to_observation_type": num_tensor_args_to_observation_type_mapping, + "dtype_configs": dtype_configs, + }) + binary_op_configs.append({ + "pattern": op_with_quantized_bop_scalar_variant, + "num_tensor_args_to_observation_type": num_tensor_args_to_observation_type_mapping, + "dtype_configs": dtype_configs, + }) + return binary_op_configs + + +def _get_fixed_qparams_op_configs(): + fixed_qparams_op_configs = [] + for fixed_qparam_op, output_observer in [ + (torch.nn.Hardsigmoid, default_fixed_qparams_range_0to1_observer), + (torch.nn.functional.hardsigmoid, default_fixed_qparams_range_0to1_observer), + ("hardsigmoid", default_fixed_qparams_range_0to1_observer), + ("hardsigmoid_", 
default_fixed_qparams_range_0to1_observer), + (torch.nn.Sigmoid, default_fixed_qparams_range_0to1_observer), + (torch.sigmoid, default_fixed_qparams_range_0to1_observer), + ("sigmoid", default_fixed_qparams_range_0to1_observer), + ("sigmoid_", default_fixed_qparams_range_0to1_observer), + (torch.nn.Tanh, default_fixed_qparams_range_neg1to1_observer), + (torch.tanh, default_fixed_qparams_range_neg1to1_observer), + ("tanh", default_fixed_qparams_range_neg1to1_observer), + ("tanh_", default_fixed_qparams_range_neg1to1_observer), + (torch.nn.Softmax, default_fixed_qparams_range_0to1_observer), + ]: + fixed_qparams_op_configs.append({ + "pattern": fixed_qparam_op, + "observation_type": ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT, + # TODO: The following two keys are temporary, since we don't want to put observer in the configs + # we expect that it's provided by user + # What we want to put here is the requirement on observers, in this case dtype, + # quant_min, quant_max etc., but we need to first move all configs to + # backend_config_dict to do that, we'll remove these keys after we fully migrated + # everything to use backend_config_dict + "_overwrite_output_fake_quantizer": FixedQParamsFakeQuantize.with_args(observer=output_observer), + "_overwrite_output_observer": output_observer, + "dtype_configs": [ + weighted_op_int8_dtype_config, + default_op_fp16_dtype_config, + ], + }) + return fixed_qparams_op_configs + +_CAT_CONFIG = { + "pattern": torch.cat, + "observation_type": ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT, + "dtype_configs": [ + default_op_quint8_dtype_config, + ] +} + +def _get_bn_configs(): + """ Get configs related to batchnorm + """ + bn_configs = [] + bn_to_fused_bn = { + torch.nn.BatchNorm2d: nni.BNReLU2d, + torch.nn.BatchNorm3d: nni.BNReLU3d, + } + for bn in bn_to_fused_bn.keys(): + fused_bn = bn_to_fused_bn[bn] + # bn module + relu module fusion config + bn_configs.append({ + "pattern": (torch.nn.ReLU, bn), + "dtype_configs": [default_op_quint8_dtype_config], + "fuser_method": reverse_sequential_wrapper2(fused_bn), + "fused_module": fused_bn, + }) + # bn module + F.relu fusion config + bn_configs.append({ + "pattern": (torch.nn.functional.relu, bn), + "dtype_configs": [default_op_quint8_dtype_config], + "fuser_method": reverse_sequential_wrapper2(bn_to_fused_bn[bn]), + "fused_module": fused_bn, + }) + bn_configs.append({ + "pattern": bn, + "observation_type": ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT, + "dtype_configs": [default_op_quint8_dtype_config], + }) + + # fused bn configs + for fused_bn in bn_to_fused_bn.values(): + bn_configs.append({ + "pattern": fused_bn, + "observation_type": ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT, + "dtype_configs": [default_op_quint8_dtype_config], + }) + return bn_configs + +def _get_share_qparams_op_configs(dtype_configs): + """ Get the operator config for the operators that works for both float and quantized input + if input is quantized, the output Tensor shares the same quantization parameter + with input. 
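To see this qparam sharing concretely (a quick check outside of the config machinery): for these ops a quantized input's scale and zero_point carry over to the output unchanged.

```python
import torch
import torch.nn.functional as F

x = torch.quantize_per_tensor(torch.randn(1, 2, 4, 4), 0.05, 64, torch.quint8)
y = F.adaptive_avg_pool2d(x, (1, 1))
z = x.flatten()
print(x.q_scale(), y.q_scale(), z.q_scale())                 # all 0.05
print(x.q_zero_point(), y.q_zero_point(), z.q_zero_point())  # all 64
```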
+ Example operator: avgpool2d, reshape, transpose, maxpool2d + Example observed operator: + observer_0 - avgpool2d - observer_0 (same observer instance as input) + """ + + def _get_share_qprams_op_backend_config(op): + return { + "pattern": op, + "observation_type": ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT, + "dtype_configs": dtype_configs, + } + + share_qparams_ops = [ + torch.nn.AdaptiveAvgPool1d, + torch.nn.AdaptiveAvgPool2d, + torch.nn.AdaptiveAvgPool3d, + torch.nn.AvgPool1d, + torch.nn.AvgPool2d, + torch.nn.AvgPool3d, + torch.nn.Hardtanh, + torch.nn.Identity, + torch.nn.MaxPool1d, + torch.nn.MaxPool2d, + torch.nn.MaxPool3d, + torch.nn.ReLU, + torch.nn.ReLU6, + torch.adaptive_avg_pool1d, + torch.nn.functional.adaptive_avg_pool2d, + torch.nn.functional.adaptive_avg_pool3d, + torch.nn.functional.hardtanh, + torch.nn.functional.hardtanh_, + torch.nn.functional.interpolate, + torch.nn.functional.max_pool1d, + torch.nn.functional.max_pool2d, + torch.nn.functional.max_pool3d, + torch.nn.functional.relu, + torch.nn.functional.relu6, + torch.avg_pool1d, + torch._C._nn.avg_pool2d, + torch._C._nn.avg_pool3d, + torch.clamp, + torch.flatten, + torch.mean, + torch.repeat_interleave, + torch.transpose, + torch.squeeze, + torch.stack, + torch.unsqueeze, + operator.floordiv, + "contiguous", + "clamp", + "detach", + "detach_", + "mean", + "permute", + "repeat", + "repeat_interleave", + "reshape", + "resize_", + "relu", + "relu_", + "shape", + "size", + "squeeze", + "squeeze_", + "transpose", + "unsqueeze", + "unsqueeze_", + "view" + ] + return [_get_share_qprams_op_backend_config(op) for op in share_qparams_ops] + +def _get_rnn_op_configs(): + rnn_op_configs = [] + for rnn_op, ref_rnn_op in [ + (nn.GRUCell, nnqr.GRUCell), + (nn.LSTMCell, nnqr.LSTMCell), + (nn.RNNCell, nnqr.RNNCell), + (nn.LSTM, nnqr.LSTM) + ]: + rnn_op_configs.append({ + "pattern": rnn_op, + "observation_type": ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT, + "dtype_configs": [default_dynamic_int8_dtype_config, default_dynamic_float16_dtype_config], + "root_module": rnn_op, + "reference_quantized_module_for_root": ref_rnn_op, + }) + return rnn_op_configs + +def _get_embedding_op_configs(): + embedding_op_configs = [] + for embedding_op, qat_embedding_op, ref_embedding_op in [ + (nn.Embedding, nnqat.Embedding, nnqr.Embedding), + (nn.EmbeddingBag, nnqat.EmbeddingBag, nnqr.EmbeddingBag), + ]: + embedding_op_configs.append({ + "pattern": embedding_op, + "observation_type": ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT, + "dtype_configs": [ + weight_only_quint8_dtype_config, + weight_only_quint4x2_dtype_config + ], + "qat_module": qat_embedding_op, + "root_module": embedding_op, + "reference_quantized_module_for_root": ref_embedding_op, + # This is temporary, and will be removed soon + "_input_output_observed": False + }) + # config for qat op + embedding_op_configs.append({ + "pattern": qat_embedding_op, + "observation_type": ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT, + "dtype_configs": [ + weight_only_quint8_dtype_config, + weight_only_quint4x2_dtype_config + ], + "root_module": embedding_op, + "reference_quantized_module_for_root": ref_embedding_op, + # This is temporary, and will be removed soon + "_input_output_observed": False + }) + return embedding_op_configs + +def get_native_backend_config_dict(): + """ Get backend_config_dict for PyTorch Native backend (fbgemm/qnnpack). 
""" + linear_dtype_configs = [ + weighted_op_int8_dtype_config, + default_dynamic_int8_dtype_config, + default_dynamic_float16_dtype_config, + # TODO: maybe remove this since fbgemm/qnnpack doesn't have kernels for it + default_op_fp16_dtype_config, + ] + binary_op_dtype_configs = [ + weighted_op_int8_dtype_config, + default_op_fp16_dtype_config, + ] + share_qparams_op_dtype_configs = [ + default_op_quint8_dtype_config, + default_op_fp16_dtype_config + ] + return { + # optional + "name": "native", + "configs": [ + *_DEFAULT_OP_INT8_CONFIGS, + *_get_linear_configs(linear_dtype_configs), + *_get_conv_configs(), + *_get_binary_op_configs(binary_op_dtype_configs), + *_get_fixed_qparams_op_configs(), + _CAT_CONFIG, + *_get_bn_configs(), + *_get_share_qparams_op_configs(share_qparams_op_dtype_configs), + *_get_rnn_op_configs(), + *_get_embedding_op_configs(), + ], + } + +__all__ = [ + "get_native_backend_config_dict", +] diff --git a/torch/ao/quantization/fx/backend_config/observation_type.py b/torch/ao/quantization/backend_config/observation_type.py similarity index 100% rename from torch/ao/quantization/fx/backend_config/observation_type.py rename to torch/ao/quantization/backend_config/observation_type.py diff --git a/torch/ao/quantization/fx/backend_config/tensorrt.py b/torch/ao/quantization/backend_config/tensorrt.py similarity index 84% rename from torch/ao/quantization/fx/backend_config/tensorrt.py rename to torch/ao/quantization/backend_config/tensorrt.py index 6504ce7a9331..94895215bb10 100644 --- a/torch/ao/quantization/fx/backend_config/tensorrt.py +++ b/torch/ao/quantization/backend_config/tensorrt.py @@ -3,8 +3,12 @@ import torch.nn.qat as nnqat import torch.nn.intrinsic as nni import torch.nn.intrinsic.qat as nniqat +# TODO: maybe refactor this to a separate util function +from .native import _get_binary_op_configs +from .native import _get_linear_configs +from .native import _get_share_qparams_op_configs -from ...fuser_method_mappings import reverse2 +from ..fuser_method_mappings import reverse_sequential_wrapper2 def get_tensorrt_backend_config_dict(): """ Get the backend config dictionary for tensorrt backend @@ -31,20 +35,6 @@ def get_tensorrt_backend_config_dict(): } # operator (module/functional/torch ops) configs - linear_module_config = { - # Please see README under this folder for pattern format - "pattern": torch.nn.Linear, - "observation_type": ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT, - "dtype_configs": [ - weighted_op_qint8_dtype_config, - ], - # the root module for the pattern, used to query the reference quantized module - # e.g. 
for a (torch.nn.ReLU, torch.nn.Linear) pattern, the root will be torch.nn.Linear - "root_module": torch.nn.Linear, - # the corresponding reference quantized module for the root module - "reference_quantized_module_for_root": torch.nn.quantized._reference.Linear, - "qat_module": nnqat.Linear, - } linear_qat_config = { "pattern": nnqat.Linear, "observation_type": ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT, @@ -63,7 +53,8 @@ def get_tensorrt_backend_config_dict(): "dtype_configs": [ weighted_op_qint8_dtype_config, ], - "fuser_method": reverse2(nni.LinearReLU), + "fuser_method": reverse_sequential_wrapper2(nni.LinearReLU), + "fused_module": nni.LinearReLU, } linear_relu_mf_config = { "pattern": (torch.nn.functional.relu, torch.nn.Linear), @@ -71,7 +62,8 @@ def get_tensorrt_backend_config_dict(): "dtype_configs": [ weighted_op_qint8_dtype_config, ], - "fuser_method": reverse2(nni.LinearReLU), + "fuser_method": reverse_sequential_wrapper2(nni.LinearReLU), + "fused_module": nni.LinearReLU, } linear_relu_fused_config = { @@ -156,7 +148,8 @@ def get_tensorrt_backend_config_dict(): "dtype_configs": [ weighted_op_qint8_dtype_config, ], - "fuser_method": reverse2(nni.ConvReLU2d), + "fuser_method": reverse_sequential_wrapper2(nni.ConvReLU2d), + "fused_module": nni.ConvReLU2d, } conv2d_relu_mm_config = { "pattern": (torch.nn.ReLU, torch.nn.Conv2d), @@ -164,7 +157,8 @@ def get_tensorrt_backend_config_dict(): "dtype_configs": [ weighted_op_qint8_dtype_config, ], - "fuser_method": reverse2(nni.ConvReLU2d), + "fuser_method": reverse_sequential_wrapper2(nni.ConvReLU2d), + "fused_module": nni.ConvReLU2d, } addmm_config = { "pattern": torch.addmm, @@ -186,18 +180,19 @@ def get_tensorrt_backend_config_dict(): non_weighted_op_qint8_dtype_config, ] } - identity_config = { - "pattern": torch.nn.Identity, - "observation_type": ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT, - "dtype_configs": [ - non_weighted_op_qint8_dtype_config, - ] - } + linear_dtype_configs = [ + weighted_op_qint8_dtype_config, + ] + binary_op_dtype_configs = [ + weighted_op_qint8_dtype_config, + ] + share_qparams_op_dtype_configs = [ + non_weighted_op_qint8_dtype_config, + ] return { # optional "name": "tensorrt", "configs": [ - linear_module_config, linear_qat_config, linear_relu_fused_config, linear_relu_qat_config, @@ -215,6 +210,12 @@ def get_tensorrt_backend_config_dict(): # conv3d_relu_fused_config, addmm_config, cat_config, - identity_config, + *_get_linear_configs(linear_dtype_configs), + *_get_binary_op_configs(binary_op_dtype_configs), + *_get_share_qparams_op_configs(share_qparams_op_dtype_configs), ] } + +__all__ = [ + "get_tensorrt_backend_config_dict", +] diff --git a/torch/ao/quantization/backend_config/utils.py b/torch/ao/quantization/backend_config/utils.py new file mode 100644 index 000000000000..95df3bf310c3 --- /dev/null +++ b/torch/ao/quantization/backend_config/utils.py @@ -0,0 +1,202 @@ +from typing import Dict, Any, List, Callable, Union, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from ..quantization_types import Pattern + +def get_pattern_to_dtype_configs( + backend_config_dict: Dict[str, Any]) -> Dict[Pattern, List[Dict[str, Any]]]: + pattern_to_dtype_configs: Dict[Pattern, List[Dict[str, torch.dtype]]] = dict() + for config in backend_config_dict.get("configs", []): + pattern = config["pattern"] + dtype_configs = config["dtype_configs"] + pattern_to_dtype_configs[pattern] = dtype_configs + return pattern_to_dtype_configs + +def get_qat_module_classes( + 
backend_config_dict: Dict[str, Any]) -> Tuple[type, ...]: + qat_module_classes = [] + for config in backend_config_dict.get("configs", []): + pattern = config["pattern"] + qat_module = config.get("qat_module", None) + if qat_module is not None: + qat_module_classes.append(qat_module) + return tuple(set(qat_module_classes)) + +def get_fused_module_classes( + backend_config_dict: Dict[str, Any]) -> Tuple[type, ...]: + fused_module_classes = [] + for config in backend_config_dict.get("configs", []): + pattern = config["pattern"] + fused_module = config.get("fused_module", None) + if fused_module is not None: + fused_module_classes.append(fused_module) + return tuple(set(fused_module_classes)) + +def get_pattern_to_input_type_to_index( + backend_config_dict: Dict[str, Any]) -> Dict[Pattern, Dict[str, int]]: + pattern_to_input_type_to_index: Dict[Pattern, Dict[str, int]] = dict() + for config in backend_config_dict.get("configs", []): + pattern = config["pattern"] + input_type_to_index = config.get("input_type_to_index", {}) + pattern_to_input_type_to_index[pattern] = input_type_to_index + return pattern_to_input_type_to_index + +def get_root_module_to_quantized_reference_module( + backend_config_dict: Dict[str, Any]) -> Dict[Callable, Callable]: + mapping: Dict[Callable, Callable] = dict() + for config in backend_config_dict.get("configs", []): + if "root_module" in config and "reference_quantized_module_for_root" in config: + mapping[config["root_module"]] = config["reference_quantized_module_for_root"] + return mapping + +def get_fuser_method_mapping( + backend_config_dict: Dict[str, Any]) -> Dict[Pattern, Union[nn.Sequential, Callable]]: + fuser_method_mapping : Dict[Pattern, Union[nn.Sequential, Callable]] = dict() + for config in backend_config_dict.get("configs", []): + if "fuser_method" in config: + pattern = config["pattern"] + fuser_method = config["fuser_method"] + fuser_method_mapping[pattern] = fuser_method + + return fuser_method_mapping + +def get_module_to_qat_module( + backend_config_dict: Dict[str, Any]) -> Dict[Callable, Callable]: + module_to_qat_module: Dict[Callable, Callable] = dict() + for config in backend_config_dict.get("configs", []): + if "pattern" in config and "qat_module" in config: + pattern = config["pattern"] + qat_module = config["qat_module"] + module_to_qat_module[pattern] = qat_module + + return module_to_qat_module + +def get_fusion_pattern_to_root_node_getter( + backend_config_dict: Dict[str, Any]) -> Dict[Pattern, Callable]: + """ Get a map from fusion pattern to a function that returns the root node + from the fusion pattern, e.g. the most common one is: + def get_root_node(node_pattern): + while not isinstance(node_pattern[-1], Node): + node_pattern = node_pattern[-1] + return node_pattern[-1] + This can work for all patterns whose root node is the "last node" in the pattern, + e.g. (torch.add, MatchAllNode, (torch.ReLU, torch.Conv2d)) + """ + root_node_getter_mapping: Dict[Pattern, Callable] = dict() + for config in backend_config_dict.get("configs", []): + if "root_node_getter" in config: + pattern = config["pattern"] + root_node_getter = config["root_node_getter"] + root_node_getter_mapping[pattern] = root_node_getter + + return root_node_getter_mapping + +def get_fusion_pattern_to_extra_inputs_getter( + backend_config_dict: Dict[str, Any]) -> Dict[Pattern, Callable]: + """ Get a map from fusion pattern to a function that returns extra input nodes + from the fusion pattern, in the order required by the root node. 
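These accessors are plain extractors over `backend_config_dict["configs"]`. A toy single-entry config (not the real native config) is enough to see what the helpers above return:

```python
import torch.nn as nn
import torch.nn.qat as nnqat
import torch.nn.quantized._reference as nnqr
from torch.ao.quantization.backend_config.utils import (
    get_module_to_qat_module,
    get_root_module_to_quantized_reference_module,
    get_fuser_method_mapping,
)

toy_config = {
    "configs": [{
        "pattern": nn.Linear,
        "dtype_configs": [],
        "root_module": nn.Linear,
        "reference_quantized_module_for_root": nnqr.Linear,
        "qat_module": nnqat.Linear,
    }],
}
print(get_module_to_qat_module(toy_config))                       # {Linear: nnqat.Linear}
print(get_root_module_to_quantized_reference_module(toy_config))  # {Linear: nnqr.Linear}
print(get_fuser_method_mapping(toy_config))                       # {} (no "fuser_method" key)
```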
This is optional, + if not specified, we will not copy over any extra inputs for the root node. + Example: + # Let's say we have the pattern (torch.add, MatchAllNode, (torch.nn.BatchNorm2d, torch.nn.Conv2d)) + # and root node is torch.nn.Conv2d, and the node in MatchAllNode would be an extra + # argument to the fused module, we can unpack the pattern and return the node at + # MatchAllNode here + # we can implement extra_inputs_getter as follows: + def extra_inputs_getter(pattern) -> List[Any]: + add, extra_input, conv_pattern = pattern + return [extra_input] + """ + extra_inputs_getter_mapping: Dict[Pattern, Callable] = dict() + for config in backend_config_dict.get("configs", []): + if "extra_inputs_getter" in config: + pattern = config["pattern"] + extra_inputs_getter = config["extra_inputs_getter"] + extra_inputs_getter_mapping[pattern] = extra_inputs_getter + + return extra_inputs_getter_mapping + +def remove_boolean_dispatch_from_name(p) -> Any: + """ + Some ops have a default string representation such as + '.fn at 0x7ff1106bf280>', + this function replaces them with the hardcoded function names. + """ + if p is F.fractional_max_pool2d: + return "torch.nn.functional.fractional_max_pool2d" + elif p is F.fractional_max_pool3d: + return "torch.nn.functional.fractional_max_pool3d" + elif p is F.max_pool1d: + return "torch.nn.functional.max_pool1d" + elif p is F.max_pool2d: + return "torch.nn.functional.max_pool2d" + elif p is F.max_pool3d: + return "torch.nn.functional.max_pool3d" + elif p is F.adaptive_max_pool1d: + return "torch.nn.functional.adaptive_max_pool1d" + elif p is F.adaptive_max_pool2d: + return "torch.nn.functional.adaptive_max_pool2d" + elif p is F.adaptive_max_pool3d: + return "torch.nn.functional.adaptive_max_pool3d" + assert "boolean_dispatch" not in str(p), \ + f"{p} does not have a human readable representation in " + \ + "quantization documentation" + return p + +def pattern_to_human_readable(p) -> Any: + if isinstance(p, tuple): + # nested patterns, recurse + return tuple(pattern_to_human_readable(inner_p) for inner_p in p) + elif isinstance(p, str): + # method names are already human readable + return p + else: + p = remove_boolean_dispatch_from_name(p) + return p + +# TODO(future PR): move backend_config_dict to use dataclass and move this logic to +# the corresponding __str__ function +def entry_to_pretty_str(entry) -> str: + """ + Given a backend_config_dict entry, returns a string with the human readable + representation of it. 
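For instance, `pattern_to_human_readable` (defined above) recurses through nested patterns and swaps `boolean_dispatch` wrappers such as `F.max_pool2d` for readable names:

```python
import torch.nn as nn
import torch.nn.functional as F
from torch.ao.quantization.backend_config.utils import pattern_to_human_readable

print(pattern_to_human_readable((nn.ReLU, F.max_pool2d)))
# (<class 'torch.nn.modules.activation.ReLU'>, 'torch.nn.functional.max_pool2d')
```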
+ """ + s = "{\n" + + # always output the pattern first + if "pattern" in entry: + pattern_str = pattern_to_human_readable(entry["pattern"]) + + s += f" 'pattern': {pattern_str},\n" + + # custom output for dtype_configs to make it look nice + if "dtype_configs" in entry: + s += " 'dtype_configs': [\n" + for dtype_config in entry["dtype_configs"]: + s += " {\n" + for k, v in dtype_config.items(): + s += f" '{k}': {v},\n" + s += " },\n" + s += " ],\n" + + # custom output for num_tensor_args_to_observation_type to make it look nice + if "num_tensor_args_to_observation_type" in entry: + s += " 'num_tensor_args_to_observation_type': {\n" + for k, v in entry["num_tensor_args_to_observation_type"].items(): + s += f" {k}: {v},\n" + s += " },\n" + + # output all the other fields + custom_handled_fields = [ + "pattern", + "dtype_configs", + "num_tensor_args_to_observation_type", + ] + for field_name in entry: + if field_name in custom_handled_fields: + continue + s += f" '{field_name}': {entry[field_name]},\n" + + s += "}" + return s diff --git a/torch/ao/quantization/fake_quantize.py b/torch/ao/quantization/fake_quantize.py index 9e49a8392e3e..b4e295fbd4d0 100644 --- a/torch/ao/quantization/fake_quantize.py +++ b/torch/ao/quantization/fake_quantize.py @@ -6,14 +6,12 @@ import torch from torch.nn import Module from torch.ao.quantization.observer import ( - MinMaxObserver, MovingAverageMinMaxObserver, HistogramObserver, MovingAveragePerChannelMinMaxObserver, - PerChannelMinMaxObserver, FixedQParamsObserver, - default_affine_fixed_qparams_observer, - default_symmetric_fixed_qparams_observer, + default_fixed_qparams_range_0to1_observer, + default_fixed_qparams_range_neg1to1_observer, _with_args, ) import re @@ -92,30 +90,23 @@ class FakeQuantize(FakeQuantizeBase): * :attr:`zero_point` specifies the quantized value to which 0 in floating point maps to - * :attr:`quant_min` specifies the minimum allowable quantized value. - - * :attr:`quant_max` specifies the maximum allowable quantized value. - * :attr:`fake_quant_enabled` controls the application of fake quantization on tensors, note that statistics can still be updated. * :attr:`observer_enabled` controls statistics collection on tensors * :attr:`dtype` specifies the quantized dtype that is being emulated with fake-quantization, - allowable values are torch.qint8 and torch.quint8. The values of quant_min and - quant_max should be chosen to be consistent with the dtype + allowable values are torch.qint8 and torch.quint8. Args: observer (module): Module for observing statistics on input tensors and calculating scale and zero-point. - quant_min (int): The minimum allowable quantized value. - quant_max (int): The maximum allowable quantized value. observer_kwargs (optional): Arguments for the observer module Attributes: - observer (Module): User provided module that collects statistics on the input tensor and + activation_post_process (Module): User provided module that collects statistics on the input tensor and provides a method to calculate scale and zero-point. 
""" @@ -123,15 +114,27 @@ class FakeQuantize(FakeQuantizeBase): scale: torch.Tensor zero_point: torch.Tensor - def __init__(self, observer=MovingAverageMinMaxObserver, quant_min=0, quant_max=255, **observer_kwargs): + def __init__(self, observer=MovingAverageMinMaxObserver, quant_min=None, quant_max=None, **observer_kwargs): super().__init__() - assert quant_min <= quant_max, \ - 'quant_min must be less than or equal to quant_max' - self.quant_min = quant_min - self.quant_max = quant_max + # Populate quant_min/quant_max to observer_kwargs if valid + if quant_min is not None and quant_max is not None: + assert quant_min <= quant_max, \ + 'quant_min must be less than or equal to quant_max' + dtype = observer_kwargs.get("dtype", torch.quint8) + if hasattr(observer, "p"): + # In case observer is _PartialWrapper, dtype can be stored in + # observer.p.keywords["dtype"] + dtype = getattr(getattr(observer, "p", {}), "keywords", {}).get( + "dtype", dtype + ) + assert torch.iinfo(dtype).min <= quant_min, 'quant_min out of bound' + assert quant_max <= torch.iinfo(dtype).max, 'quant_max out of bound' + observer_kwargs.update({"quant_min": quant_min, "quant_max": quant_max}) self.activation_post_process = observer(**observer_kwargs) - assert torch.iinfo(self.activation_post_process.dtype).min <= quant_min, 'quant_min out of bound' - assert quant_max <= torch.iinfo(self.activation_post_process.dtype).max, 'quant_max out of bound' + # TODO: keeping self.quant_min/max for BC; remove after a couple releases + # Users should use self.activation_post_process.quant_min + self.quant_min = self.activation_post_process.quant_min + self.quant_max = self.activation_post_process.quant_max if _is_float_qparams(self.activation_post_process.qscheme): zero_point_dtype = torch.float else: @@ -167,11 +170,11 @@ def forward(self, X): if self.is_per_channel: X = torch.fake_quantize_per_channel_affine( X, self.scale, self.zero_point, - self.ch_axis, self.quant_min, self.quant_max) + self.ch_axis, self.activation_post_process.quant_min, self.activation_post_process.quant_max) else: X = torch.fake_quantize_per_tensor_affine( X, self.scale, self.zero_point, - self.quant_min, self.quant_max) + self.activation_post_process.quant_min, self.activation_post_process.quant_max) return X @torch.jit.export @@ -180,7 +183,7 @@ def extra_repr(self): 'quant_min={}, quant_max={}, dtype={}, qscheme={}, ch_axis={}, ' \ 'scale={}, zero_point={}'.format( self.fake_quant_enabled, self.observer_enabled, - self.quant_min, self.quant_max, + self.activation_post_process.quant_min, self.activation_post_process.quant_max, self.dtype, self.qscheme, self.ch_axis, self.scale, self.zero_point) def _save_to_state_dict(self, destination, prefix, keep_vars): @@ -233,8 +236,6 @@ def __init__(self, observer): assert type(self.activation_post_process) == FixedQParamsObserver,\ "%s's observer must be a %s" % (self.__class__.__name__, FixedQParamsObserver.__name__) self._observer_ctr = observer - self.quant_min = self.activation_post_process.quant_min - self.quant_max = self.activation_post_process.quant_max self.scale = self.activation_post_process.scale self.zero_point = self.activation_post_process.zero_point assert _is_per_tensor(self.qscheme), 'Only per tensor quantization is supported' + \ @@ -250,7 +251,7 @@ def extra_repr(self): 'dtype={}, quant_min={}, quant_max={}, qscheme={}'.format( self.fake_quant_enabled, self.observer_enabled, self.scale, self.zero_point, self.dtype, - self.quant_min, self.quant_max, self.qscheme) + 
self.activation_post_process.quant_min, self.activation_post_process.quant_max, self.qscheme) class FusedMovingAvgObsFakeQuantize(FakeQuantize): @@ -279,14 +280,10 @@ def __init__( super().__init__(observer, quant_min, quant_max, **observer_kwargs) assert isinstance(self.activation_post_process, (MovingAverageMinMaxObserver, MovingAveragePerChannelMinMaxObserver)),\ "Fused observer+fake_quant module only works with MovingAverageMinMaxObserver" - self.quant_min: int = quant_min - self.quant_max: int = quant_max self.register_buffer("fake_quant_enabled", torch.tensor([1], dtype=torch.long)) self.register_buffer("observer_enabled", torch.tensor([1], dtype=torch.long)) self.is_symmetric_quant = _is_symmetric_quant(self.activation_post_process.qscheme) - self.quant_min, self.quant_max = self.activation_post_process.quant_min, self.activation_post_process.quant_max - @torch.jit.export def calculate_qparams(self) -> Tuple[torch.Tensor, torch.Tensor]: return self.activation_post_process.calculate_qparams() @@ -301,8 +298,8 @@ def extra_repr(self) -> str: self.scale, self.zero_point, self.dtype, - self.quant_min, - self.quant_max, + self.activation_post_process.quant_min, + self.activation_post_process.quant_max, self.qscheme, self.activation_post_process.reduce_range, ) @@ -318,8 +315,8 @@ def forward(self, X: torch.Tensor) -> torch.Tensor: self.scale, self.zero_point, self.activation_post_process.averaging_constant, - self.quant_min, - self.quant_max, + self.activation_post_process.quant_min, + self.activation_post_process.quant_max, self.ch_axis, self.is_per_channel, self.is_symmetric_quant, @@ -335,16 +332,24 @@ def forward(self, X: torch.Tensor) -> torch.Tensor: dtype=torch.qint8, qscheme=torch.per_tensor_symmetric, reduce_range=False) """ Default fake_quant for weights. +Observer is memoryless since averaging_constant is 1. """ -default_dynamic_fake_quant = FakeQuantize.with_args(observer=MinMaxObserver, quant_min=0, quant_max=255, - dtype=torch.quint8, memoryless=True) +default_dynamic_fake_quant = FakeQuantize.with_args(observer=MovingAverageMinMaxObserver, quant_min=0, quant_max=255, + dtype=torch.quint8, averaging_constant=1) """ Default dynamic fake_quant for activations. """ -default_symmetric_fixed_qparams_fake_quant = FixedQParamsFakeQuantize.with_args(observer=default_symmetric_fixed_qparams_observer) -default_affine_fixed_qparams_fake_quant = FixedQParamsFakeQuantize.with_args(observer=default_affine_fixed_qparams_observer) +default_fixed_qparams_range_neg1to1_fake_quant = ( + FixedQParamsFakeQuantize.with_args(observer=default_fixed_qparams_range_neg1to1_observer) +) +default_fixed_qparams_range_0to1_fake_quant = ( + FixedQParamsFakeQuantize.with_args(observer=default_fixed_qparams_range_0to1_observer) +) +# TODO: the following 2 variables are kept for backwards compatibility; remove after a few releases +default_symmetric_fixed_qparams_fake_quant = default_fixed_qparams_range_neg1to1_fake_quant +default_affine_fixed_qparams_fake_quant = default_fixed_qparams_range_0to1_fake_quant default_per_channel_weight_fake_quant = FakeQuantize.with_args(observer=MovingAveragePerChannelMinMaxObserver, quant_min=-128, @@ -355,23 +360,25 @@ def forward(self, X: torch.Tensor) -> torch.Tensor: ch_axis=0) """ Default fake_quant for per-channel weights. +Observer is memoryless since averaging_constant is 1. 
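"Memoryless since averaging_constant is 1" means the moving average fully overwrites the previous range on every call, so the observer only ever reflects the most recent batch:

```python
import torch
from torch.ao.quantization.observer import MovingAverageMinMaxObserver

obs = MovingAverageMinMaxObserver(averaging_constant=1)
obs(torch.tensor([-10.0, 10.0]))
obs(torch.tensor([-1.0, 1.0]))
print(obs.min_val, obs.max_val)  # tensor(-1.) tensor(1.), earlier range forgotten
```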
""" -default_embedding_fake_quant = FakeQuantize.with_args(observer=PerChannelMinMaxObserver, +default_embedding_fake_quant = FakeQuantize.with_args(observer=MovingAveragePerChannelMinMaxObserver, qscheme=torch.per_channel_affine_float_qparams, dtype=torch.quint8, quant_min=0, quant_max=255, ch_axis=0, - memoryless=True) + averaging_constant=1) """ Default fake_quant for embeddings. +Observer is memoryless since averaging_constant is 1. """ -default_embedding_fake_quant_4bit = FakeQuantize.with_args(observer=PerChannelMinMaxObserver, +default_embedding_fake_quant_4bit = FakeQuantize.with_args(observer=MovingAveragePerChannelMinMaxObserver, qscheme=torch.per_channel_affine_float_qparams, ch_axis=0, dtype=torch.quint4x2, - memoryless=True) + averaging_constant=1) default_histogram_fake_quant = FakeQuantize.with_args(observer=HistogramObserver, quant_min=0, @@ -411,6 +418,27 @@ def forward(self, X: torch.Tensor) -> torch.Tensor: Fused version of `default_per_channel_weight_fake_quant`, with improved performance. """ +fused_wt_fake_quant_range_neg_127_to_127 = FusedMovingAvgObsFakeQuantize.with_args(observer=MovingAverageMinMaxObserver, + quant_min=-127, + quant_max=127, + dtype=torch.qint8, + qscheme=torch.per_tensor_symmetric, + eps=2 ** -12) +""" +Fused version of `default_weight_fake_quant`, with the 8-bit values restricted to [-127, +127], excluding -128. +""" + +fused_per_channel_wt_fake_quant_range_neg_127_to_127 = FusedMovingAvgObsFakeQuantize.with_args(observer=MovingAverageMinMaxObserver, + quant_min=-127, + quant_max=127, + dtype=torch.qint8, + qscheme=torch.per_channel_symmetric, + eps=2 ** -12) +""" +Fused version of `default_per_channel_weight_fake_quant`, with the 8-bit values restricted to [-127, +127], excluding -128. +""" + + def _is_fake_quant_script_module(mod): ''' Returns true if given mod is an instance of FakeQuantize script module. ''' diff --git a/torch/ao/quantization/fuse_modules.py b/torch/ao/quantization/fuse_modules.py index f276eea3c871..1f7027f5c8d5 100644 --- a/torch/ao/quantization/fuse_modules.py +++ b/torch/ao/quantization/fuse_modules.py @@ -7,6 +7,7 @@ # for backward compatiblity from torch.ao.quantization.fuser_method_mappings import fuse_conv_bn # noqa: F401 from torch.ao.quantization.fuser_method_mappings import fuse_conv_bn_relu # noqa: F401 +from torch.nn.utils.parametrize import type_before_parametrizations from typing import List, Optional @@ -41,7 +42,7 @@ def fuse_known_modules(mod_list, is_qat, additional_fuser_method_mapping=None): For these sequences, the first element in the output module list performs the fused operation. 
The rest of the elements are set to nn.Identity() """ - types = tuple(type(m) for m in mod_list) + types = tuple(type_before_parametrizations(m) for m in mod_list) fuser_method = get_fuser_method(types, additional_fuser_method_mapping) if fuser_method is None: raise NotImplementedError("Cannot fuse modules: {}".format(types)) diff --git a/torch/ao/quantization/fuser_method_mappings.py b/torch/ao/quantization/fuser_method_mappings.py index 23e5a1f4c35a..a2882f136047 100644 --- a/torch/ao/quantization/fuser_method_mappings.py +++ b/torch/ao/quantization/fuser_method_mappings.py @@ -5,7 +5,8 @@ from torch.ao.quantization.utils import Pattern from torch.ao.quantization.utils import get_combined_dict - +from torch.ao.quantization.utils import MatchAllNode +import itertools def fuse_conv_bn(is_qat, conv, bn): r"""Given the conv and bn modules, fuses them and returns the fused module @@ -32,8 +33,6 @@ def fuse_conv_bn(is_qat, conv, bn): } if is_qat: - # TODO: remove the assert later - assert conv.training, "qat is only supported when conv.training is True currently" assert bn.num_features == conv.out_channels, 'Output channel of Conv2d must match num_features of BatchNorm2d' assert bn.affine, 'Only support fusing BatchNorm2d with affine set to True' assert bn.track_running_stats, 'Only support fusing BatchNorm2d with tracking_running_stats set to True' @@ -65,8 +64,6 @@ def fuse_conv_bn_relu(is_qat, conv, bn, relu): "Conv and BN both must be in the same mode (train or eval)." fused_module : Optional[Type[nn.Sequential]] = None if is_qat: - # TODO: remove the assert later - assert conv.training, "qat is only supported when conv.training is True currently" map_to_fused_module_train = { nn.Conv1d: nni.ConvBnReLU1d, nn.Conv2d: nni.ConvBnReLU2d, @@ -112,9 +109,12 @@ def fuse_linear_bn(is_qat, linear, bn): "Linear and BN both must be in the same mode (train or eval)." if is_qat: - # TODO: remove the assert later - assert linear.training, "qat is only supported when linear.training is True currently" - raise Exception("Fusing Linear+BatchNorm not yet supported in training.") + assert bn.num_features == linear.out_features,\ + "Output features of Linear must match num_features of BatchNorm1d" + assert bn.affine, "Only support fusing BatchNorm1d with affine set to True" + assert bn.track_running_stats,\ + "Only support fusing BatchNorm1d with tracking_running_stats set to True" + return nni.LinearBn1d(linear, bn) else: return nn.utils.fusion.fuse_linear_bn_eval(linear, bn) @@ -136,8 +136,7 @@ def fuse_convtranspose_bn(is_qat, convt, bn): "ConvTranspose and BN both must be in the same mode (train or eval)." if is_qat: - assert convt.training, "qat is only supported when convt.training is True currently" - raise Exception("Fusing ConvTranspose+BatchNorm not yet supported in training.") + raise Exception("Fusing ConvTranspose+BatchNorm not yet supported in QAT.") else: return nn.utils.fusion.fuse_conv_bn_eval(convt, bn, transpose=True) @@ -221,6 +220,37 @@ def reversed(is_qat, x, w): (nn.BatchNorm3d, nn.ConvTranspose3d): reverse2(fuse_convtranspose_bn), } +def get_valid_patterns(op_pattern): + """ + Returns a list of valid patterns generated from the op_pattern, + since MatchAllNode can match all types of nodes, + e.g. 
pattern (torch.nn.Conv2d, torch.add) should also be able to match keys like + (MatchAllNode, torch.add) and (torch.nn.Conv2d, MatchAllNode) + + Example Input: + (torch.add, (torch.nn.ReLU, torch.nn.Conv2d)) + + Example Output: + [(torch.add, (torch.nn.ReLU, torch.nn.Conv2d)), + (torch.add, (torch.nn.ReLU, MatchAllNode)), + (torch.add, (MatchAllNode, torch.nn.Conv2d)), + (torch.add, (MatchAllNode, MatchAllNode)), + (MatchAllNode, (torch.nn.ReLU, torch.nn.Conv2d)), + (MatchAllNode, (torch.nn.ReLU, MatchAllNode)), + (MatchAllNode, (MatchAllNode, torch.nn.Conv2d)), + (MatchAllNode, (MatchAllNode, MatchAllNode)), + ] + """ + result = [] + if isinstance(op_pattern, (tuple, list)): + sub_combs = [] + for sub_pattern in op_pattern: + sub_combs.append(get_valid_patterns(sub_pattern)) + result = list(itertools.product(*sub_combs)) + else: + result = [op_pattern, MatchAllNode] + return result + def get_fuser_method_new( op_pattern: Pattern, fuser_method_mapping: Optional[Dict[Pattern, Union[nn.Sequential, Callable]]] = None): @@ -230,6 +260,11 @@ def get_fuser_method_new( if fuser_method_mapping is None: fuser_method_mapping = DEFAULT_PATTERN_TO_FUSER_METHOD - fuser_method = fuser_method_mapping.get(op_pattern, None) + op_patterns = get_valid_patterns(op_pattern) + fuser_method = None + for op_pattern in op_patterns: + fuser_method = fuser_method_mapping.get(op_pattern, None) + if fuser_method is not None: + break assert fuser_method is not None, "did not find fuser method for: {} ".format(op_pattern) return fuser_method diff --git a/torch/ao/quantization/fx/__init__.py b/torch/ao/quantization/fx/__init__.py index 08d613fae771..0e37eaaded97 100644 --- a/torch/ao/quantization/fx/__init__.py +++ b/torch/ao/quantization/fx/__init__.py @@ -1,4 +1,3 @@ from .prepare import prepare from .convert import convert -from .fuse import Fuser -from .backend_config import get_tensorrt_backend_config_dict +from .fuse import fuse diff --git a/torch/ao/quantization/fx/_convert_do_not_use.py b/torch/ao/quantization/fx/_convert_do_not_use.py deleted file mode 100644 index 29e8b71c2a93..000000000000 --- a/torch/ao/quantization/fx/_convert_do_not_use.py +++ /dev/null @@ -1,316 +0,0 @@ -from typing import Any, Dict, List, Optional -import torch -from torch.fx import ( - GraphModule, -) -from torch.fx.graph import ( - Graph, - Node, -) -from ..qconfig import QConfigAny -from ..utils import ( - activation_is_int8_quantized, - weight_is_statically_quantized, - get_qparam_dict, - _parent_name, -) -from .backend_config.utils import get_quantized_reference_module_mapping - -from .graph_module import ( - QuantizedGraphModule, - is_observed_standalone_module, -) -from ._equalize import update_obs_for_equalization, convert_eq_obs -from .utils import ( - get_custom_module_class_keys, - get_quantize_node_info, - create_getattr_from_value, -) - -from torch.ao.quantization.quantize import ( - _remove_qconfig, - is_activation_post_process, -) - -from .convert import restore_state - -# these are tuples so that they can work with isinstance(module, tuple_of_classes) -FUSED_MODULE_CLASSES = ( - torch.nn.intrinsic.LinearReLU, - torch.nn.intrinsic.ConvReLU1d, - torch.nn.intrinsic.ConvReLU2d, - torch.nn.intrinsic.ConvReLU3d, -) - -QAT_MODULE_CLASSES = ( - torch.nn.qat.Linear, - torch.nn.qat.Conv2d, - torch.nn.qat.Conv3d, - torch.nn.intrinsic.qat.LinearReLU, - torch.nn.intrinsic.qat.ConvBn2d, - torch.nn.intrinsic.qat.ConvBnReLU2d, - torch.nn.intrinsic.qat.ConvReLU2d, - torch.nn.intrinsic.qat.ConvBn3d, - torch.nn.intrinsic.qat.ConvBnReLU3d, - 
torch.nn.intrinsic.qat.ConvReLU3d -) - -def insert_dequantize_node( - node: Node, - graph: Graph): - """ Inserts dequantize node for `node` in `graph` - """ - with graph.inserting_after(node): - dequantize_node = graph.call_method("dequantize", (node,)) - for user_node in dict(node.users): - if user_node is not dequantize_node: - user_node.replace_input_with(node, dequantize_node) - -def _convert_do_not_use( - model: GraphModule, is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None, - is_standalone_module: bool = False, - _remove_qconfig_flag: bool = True, - backend_config_dict: Optional[Dict[str, Any]] = None) -> torch.nn.Module: - """ - We will convert an observed model (a module with observer calls) to a reference - quantized model, the rule is simple: - 1. for each observer module call in the graph, we'll convert it to calls to - quantize and dequantize functions based on the observer instance - 2. for weighted operations like linear/conv, we need to convert them to reference - quantized module, this requires us to know whether the dtype configured for the - weight is supported in the backend, this is done in prepare step and the result - is stored in observed_node_names, we can decide whether we need to swap the - module based on this set - - standalone_module means it a submodule that is not inlined in - parent module, and will be quantized separately as one unit. - - Returns a quantized standalone module, whether input/output is quantized is - specified by prepare_custom_config_dict, with - input_quantized_idxs, output_quantized_idxs, please - see docs for prepare_fx for details - """ - if convert_custom_config_dict is None: - convert_custom_config_dict = {} - patterns, node_name_to_scope, prepare_custom_config_dict, observed_node_names = restore_state(model) - qconfig_map: Dict[str, QConfigAny] = model._qconfig_map # type: ignore[assignment] - - assert is_reference, "_convert_do_not_use only supports reference option" - - # mapping from fully qualified module name to module instance - # for example, - # { - # '': Model(...), - # 'linear': Linear(...), - # 'linear.weight_fake_quant': PerChannelMinMaxObserver(...), - # } - # We use remove_duplicate=False here because torch.cat uses - # the same activation_post_process module instance but different names - modules = dict(model.named_modules(remove_duplicate=False)) - - custom_module_classes = get_custom_module_class_keys( - convert_custom_config_dict, - "observed_to_quantized_custom_module_class") - - if model._equalization_qconfig_map is not None: - # If we want to do equalization then do the following: - # Calculate the equalization scale, update the observers with the scaled - # inputs, and scale the weight - weight_eq_obs_dict = update_obs_for_equalization(model, modules) - convert_eq_obs(model, modules, weight_eq_obs_dict) - - graph_inputs: List[str] = [] - for node in model.graph.nodes: - if node.op == 'placeholder': - graph_inputs.append(node.name) - - def replace_observer_with_quantize_dequantize_node(graph: Graph, node: Node, modules: Dict[str, torch.nn.Module]) -> None: - """ Replace activation_post_process module call node with quantize and - dequantize node - - Before: - ... -> observer_0(x) -> ... - After: - ... -> torch.quantize_per_tensor(x, ...) -> x.dequantize() -> ... 
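The deleted insert_dequantize_node helper is a small torch.fx graph rewrite: create a dequantize call after a node and reroute the node's existing users through it. Below is a standalone sketch of the same rewrite on a toy traced module, using only the public torch.fx API; the module and tensor values are made up for illustration and this is not the current convert path.

import torch
from torch import fx

class M(torch.nn.Module):
    def forward(self, x):
        return torch.relu(x)

gm = fx.symbolic_trace(M())

def insert_dequantize_after(node, graph):
    # reroute every existing user of `node` through a new dequantize call,
    # mirroring the deleted insert_dequantize_node helper
    with graph.inserting_after(node):
        dq = graph.call_method("dequantize", (node,))
    for user in list(node.users):
        if user is not dq:
            user.replace_input_with(node, dq)

placeholder = next(n for n in gm.graph.nodes if n.op == "placeholder")
insert_dequantize_after(placeholder, gm.graph)
gm.recompile()

x_q = torch.quantize_per_tensor(torch.randn(3), 0.1, 0, torch.quint8)
print(gm.code)   # relu now consumes x.dequantize()
print(gm(x_q))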
- """ - assert modules is not None - assert isinstance(node.target, str) - observer_module = modules[node.target] - root_module = modules[""] - if observer_module.dtype == torch.float32: - # remove the node for now - # TODO: support dynamic quant - with graph.inserting_before(node): - node.replace_all_uses_with(node.args[0]) - graph.erase_node(node) - elif observer_module.dtype in [torch.quint8, torch.qint8, torch.float16]: - node_type, quantize_op, qparams = get_quantize_node_info(observer_module) - # replace observer node with quant - dequant node - with graph.inserting_before(node): - input_node = node.args[0] - inputs = [input_node] - for key, value in qparams.items(): - if key in ['_scale_', '_zero_point_']: - # For scale and zero_point values we register them as buffers in the root module. - # TODO: maybe need more complex attr name here - qparam_node = create_getattr_from_value(root_module, graph, key, value) - inputs.append(qparam_node) - else: - # for qparams that are not scale/zero_point (like axis, dtype) we store them as literals in the graph. - inputs.append(value) - - quantized_node = graph.create_node(node_type, quantize_op, tuple(inputs), {}) - dequantized_node = graph.call_method("dequantize", args=(quantized_node,)) - node.replace_all_uses_with(dequantized_node) - graph.erase_node(node) - - - # additional state to override inputs to be quantized, if specified - # by the user - placeholder_node_seen_cnt = 0 - output_node_seen_cnt = 0 - input_quantized_idxs: List[int] = prepare_custom_config_dict.get( - "input_quantized_idxs", []) - output_quantized_idxs: List[int] = prepare_custom_config_dict.get( - "output_quantized_idxs", []) - - if backend_config_dict is None: - backend_config_dict = {} - quantized_reference_module_mapping = get_quantized_reference_module_mapping(backend_config_dict) - # convert tuples so that it can work with isinstance(module, tuple_of_classes) - weighted_module_classes = tuple(quantized_reference_module_mapping.keys()) - - for node in list(model.graph.nodes): - if node.op == 'placeholder': - cur_placeholder_node_idx = placeholder_node_seen_cnt - placeholder_node_seen_cnt += 1 - if cur_placeholder_node_idx in input_quantized_idxs: - # Inputs are assumed to be quantized if the user specifid the - # input_quantized_idxs override. - # we need to dequantize the inputs since all operators took - # floating point inputs in reference quantized models - insert_dequantize_node(node, model.graph) - elif node.op == "output": - cur_output_node_idx = output_node_seen_cnt - output_node_seen_cnt += 1 - if cur_output_node_idx in output_quantized_idxs: - # Result are kept quantized if the user specified the - # output_quantized_idxs override. 
- # Remove the dequantize operator in the end - maybe_dequantize_node = node.args[0] - if isinstance(maybe_dequantize_node, Node) and \ - maybe_dequantize_node.op == "call_method" and \ - maybe_dequantize_node.target == "dequantize": - quantize_node = maybe_dequantize_node.args[0] - maybe_dequantize_node.replace_all_uses_with(quantize_node) - model.graph.erase_node(maybe_dequantize_node) - elif node.op == "call_module": - if is_activation_post_process(modules[node.target]): - replace_observer_with_quantize_dequantize_node(model.graph, node, modules) - elif is_observed_standalone_module(modules[node.target]): - # TODO: move this to a separate function - convert = torch.ao.quantization._quantize_fx_do_not_use._convert_do_not_use # type: ignore[attr-defined] - # We know that observed standalone module is a GraphModule since - # it's produced by us - observed_standalone_module : GraphModule = modules[str(node.target)] # type: ignore[assignment] - sm_input_quantized_idxs = \ - observed_standalone_module \ - ._standalone_module_input_quantized_idxs\ - .tolist() # type: ignore[operator] - # remove the dequantize nodes for inputs - args = list(node.args) - for idx in range(len(args)): - if idx in sm_input_quantized_idxs: - arg = args[idx] - if arg.op == "call_method" and arg.target == "dequantize": - quantize_node = arg.args[0] - node.replace_input_with(arg, quantize_node) - if len(arg.users) == 0: - model.graph.erase_node(arg) - # add dequantize node for output - sm_output_quantized_idxs = \ - observed_standalone_module \ - ._standalone_module_output_quantized_idxs \ - .tolist() # type: ignore[operator] - if len(sm_output_quantized_idxs) > 0: - assert sm_output_quantized_idxs[0] == 0, "Currently only quantized" - "output idxs = [0] is supported" - - # if it's non-empty, then it means the output is kept in quantized form - # we'll just add a dequantize node after this node - insert_dequantize_node(node, model.graph) - - # TODO: allow convert_custom_config_dict to override backend_config_dict - # for standalone module - quantized_standalone_module = convert( - observed_standalone_module, - is_reference=True, - backend_config_dict=backend_config_dict) - parent_name, name = _parent_name(node.target) - # update the modules dict - setattr(modules[parent_name], name, quantized_standalone_module) - modules[str(node.target)] = quantized_standalone_module - elif type(modules[node.target]) in set( - weighted_module_classes).union(QAT_MODULE_CLASSES).union(FUSED_MODULE_CLASSES): - # TODO: refactor this part to a function - original_module = modules[node.target] - qconfig = original_module.qconfig - - is_observed = node.name in observed_node_names - is_activation_quantized = activation_is_int8_quantized(qconfig) - is_weight_quantized = weight_is_statically_quantized(qconfig) - # TODO: rename weight_is_statically_quantized to weight_is_int8_quantized - if qconfig is None or \ - not is_observed or \ - not is_weight_quantized or \ - not is_activation_quantized: - continue - - float_module = original_module - fused_module = None - if isinstance( - original_module, - QAT_MODULE_CLASSES): - # case 1. 
converting qat module to - # a float module, we need to attch - # weight fake_quant to the module, - # weight fake_quant is assumed to be run during - # QAT so we don't need to run it again here - float_module = original_module.to_float() # type: ignore[operator] - # change qat conv to conv - parent_name, name = _parent_name(node.target) - setattr(modules[parent_name], name, float_module) - if isinstance(float_module, torch.nn.intrinsic._FusedModule): - fused_module = float_module - float_module = fused_module[0] - weight_post_process = original_module.weight_fake_quant - else: - # case 2. converting a float module/fused float module - # to float module, we need to attach - # weight observer to the conv module and run it - # with conv weight - if isinstance(original_module, torch.nn.intrinsic._FusedModule): - fused_module = original_module - float_module = fused_module[0] # type: ignore[index] - assert qconfig is not None - weight_post_process = qconfig.weight() - # run weight observer - weight_post_process(float_module.weight) # type: ignore[operator] - weight_qparams = get_qparam_dict(weight_post_process) - # TODO: may need to change the mapping when we support dynamic quantization - ref_qmodule_cls = quantized_reference_module_mapping.get(type(float_module), None) - assert ref_qmodule_cls is not None, f"No reference quantized module class configured for {type(float_module)}" - ref_qmodule = ref_qmodule_cls.from_float(float_module, weight_qparams) # type: ignore[attr-defined] - if fused_module is not None: - fused_module[0] = ref_qmodule - else: - parent_name, name = _parent_name(node.target) - setattr(modules[parent_name], name, ref_qmodule) - - # removes qconfig and activation_post_process modules - if _remove_qconfig_flag: - _remove_qconfig(model) - preserved_attributes = set(convert_custom_config_dict.get("preserved_attributes", [])) - model = QuantizedGraphModule(model, model.graph, preserved_attributes) - return model diff --git a/torch/ao/quantization/fx/_lower_to_native_backend.py b/torch/ao/quantization/fx/_lower_to_native_backend.py index cc8cfa1cb01c..41fbb366934e 100644 --- a/torch/ao/quantization/fx/_lower_to_native_backend.py +++ b/torch/ao/quantization/fx/_lower_to_native_backend.py @@ -1,115 +1,854 @@ import torch +from torch.fx import map_arg, Node +from torch.fx.graph import Graph import torch.nn as nn +import torch.nn.functional as F import torch.nn.intrinsic as nni import torch.nn.intrinsic.quantized as nniq +import torch.nn.intrinsic.quantized.dynamic as nniqd import torch.nn.quantized as nnq +import torch.nn.quantized.dynamic as nnqd import torch.nn.quantized._reference as nnqr -from torch.nn.quantized.modules.utils import ReferenceableQuantizedModule -from . 
import subgraph_rewriter_FORKED_DO_NOT_USE +from torch.nn.quantized.modules.utils import WeightedQuantizedModule from .graph_module import QuantizedGraphModule -from .quantized_fusion_patterns_and_replacements import get_fbgemm_patterns_and_replacements -from .match_utils import is_match -from .match_utils import MatchAllNode -from ..utils import _parent_name, check_node -from typing import Dict, Tuple, Type, List -from torch.fx import Node - -# Mapping from reference module class to the replacement quantized module class for lowering -LOWER_MODULE_MAP: Dict[Type[nn.Module], Type[ReferenceableQuantizedModule]] = { +from .utils import ( + collect_producer_nodes, + get_linear_prepack_op_for_dtype, + get_new_attr_name_with_prefix, + get_qconv_prepack_op, + graph_module_from_producer_nodes, +) +from ..utils import _parent_name +from ..qconfig import QConfigAny +from ..quantization_mappings import get_quantized_operator +from .utils import create_node_from_old_node_preserve_meta +from typing import Dict, Tuple, Type, List, Callable, Any, Union, Set, Optional +import operator + +QOP_TO_ARG_NAMES_TO_SKIP = { + torch._ops.ops.quantized.hardswish: ['inplace'], + torch._ops.ops.quantized.elu: ['inplace'], + torch._ops.ops.quantized.dropout: ['inplace'], + torch._ops.ops.quantized.instance_norm: + ['running_mean', 'running_var', 'use_input_stats', 'momentum'], +} + +def _is_node_in_list(node, modules, func_list, method_list, module_type_list): + is_call_function = node.op == "call_function" and node.target in func_list + is_call_method = node.op == "call_method" and node.target in method_list + is_call_module = node.op == "call_module" and type(modules[str(node.target)]) in module_type_list + return is_call_function, is_call_method, is_call_module + +def is_fixed_qparams_node(node, modules): + func_list = [ + torch.nn.functional.hardsigmoid, + torch.nn.functional.sigmoid, + torch.sigmoid, + torch.tanh, + ] + method_list = [ + "hardsigmoid", + "hardsigmoid_", + "sigmoid", + "sigmoid_", + "tanh", + "tanh_", + ] + module_type_list = [ + torch.nn.Hardsigmoid, + torch.nn.Sigmoid, + torch.nn.Tanh, + torch.nn.Softmax, + ] + return _is_node_in_list(node, modules, func_list, method_list, module_type_list) + +def is_default_node(node, modules): + func_list = [ + torch.nn.functional.elu, + torch.nn.functional.hardswish, + torch.nn.functional.instance_norm, + torch.nn.functional.layer_norm, + torch.nn.functional.leaky_relu, + torch.nn.functional.dropout, + ] + method_list: List[Any] = [] + module_type_list = [ + nnqr.ConvTranspose1d, + nnqr.ConvTranspose2d, + torch.nn.ELU, + torch.nn.LeakyReLU, + torch.nn.Hardswish, + torch.nn.InstanceNorm1d, + torch.nn.InstanceNorm2d, + torch.nn.InstanceNorm3d, + torch.nn.LayerNorm, + torch.nn.Dropout, + torch.nn.BatchNorm2d, + torch.nn.BatchNorm3d, + torch.nn.intrinsic.BNReLU2d, + torch.nn.intrinsic.BNReLU3d, + ] + return _is_node_in_list(node, modules, func_list, method_list, module_type_list) + +def is_copy_node(node, modules): + func_list = [ + torch.adaptive_avg_pool1d, + torch.nn.functional.adaptive_avg_pool2d, + torch.nn.functional.adaptive_avg_pool3d, + torch.nn.functional.hardtanh, + torch.nn.functional.hardtanh_, + torch.nn.functional.interpolate, + torch.nn.functional.max_pool1d, + torch.nn.functional.max_pool2d, + torch.nn.functional.max_pool3d, + torch.nn.functional.relu, + torch.nn.functional.relu6, + torch.avg_pool1d, + torch._C._nn.avg_pool2d, + torch._C._nn.avg_pool3d, + torch.clamp, + torch.flatten, + torch.mean, + operator.floordiv, + ] + method_list = [ + 
"clamp", + "mean", + "relu", + "relu_", + ] + module_type_list = [ + torch.nn.AdaptiveAvgPool1d, + torch.nn.AdaptiveAvgPool2d, + torch.nn.AdaptiveAvgPool3d, + torch.nn.AvgPool1d, + torch.nn.AvgPool2d, + torch.nn.AvgPool3d, + torch.nn.Hardtanh, + torch.nn.MaxPool1d, + torch.nn.MaxPool2d, + torch.nn.MaxPool3d, + torch.nn.ReLU, + torch.nn.ReLU6, + ] + return _is_node_in_list(node, modules, func_list, method_list, module_type_list) + +def is_general_tensor_shape_node(node, modules): + func_list = [ + torch.transpose, + torch.repeat_interleave, + torch.squeeze, + torch.stack, + torch.unsqueeze, + ] + method_list = [ + "contiguous", + "detach", + "detach_", + "permute", + "repeat", + "repeat_interleave", + "reshape", + "resize_", + "shape", + "size", + "squeeze", + "squeeze_", + "transpose", + "unsqueeze", + "unsqueeze_", + "view", + ] + module_type_list = [ + torch.nn.Identity, + ] + return _is_node_in_list(node, modules, func_list, method_list, module_type_list) + +def is_other_node(node, modules): + func_list = [ + torch.cat, + ] + method_list: List[Any] = [] + module_type_list: List[Any] = [] + return _is_node_in_list(node, modules, func_list, method_list, module_type_list) + +def is_special_pattern_node(node, modules): + res_function, res_method, res_module = False, False, False + for checker in [is_fixed_qparams_node, is_default_node, is_copy_node, is_general_tensor_shape_node, is_other_node]: + is_call_function, is_call_method, is_call_module = checker(node, modules) + res_function = res_function or is_call_function + res_method = res_method or is_call_method + res_module = res_module or is_call_module + return res_function, res_method, res_module + +def is_dequantize_node(node): + return isinstance(node, Node) and node.op == "call_method" and node.target == "dequantize" + +def is_getattr_tensor_metadata_node(node): + return node.op == "call_function" and \ + node.target == getattr and \ + node.args[1] in ["shape"] + +def should_skip_lowering(op: torch.fx.node.Node, qconfig_map: Dict[str, QConfigAny]): + """ + Return True if the op is configured with a None qconfig, False otherwise. + Note: maybe need to generalize this to also check for the dtype, and we + only lower when dtype matches, but right now fbgemm/qnnpack only support + a single dtype, so it is OK for now. 
+ """ + return op.name in qconfig_map and qconfig_map[op.name] is None + +# Mapping from reference module class to the replacement static quantized module class for lowering +STATIC_LOWER_MODULE_MAP: Dict[Type[nn.Module], Type[WeightedQuantizedModule]] = { nnqr.Linear: nnq.Linear, nnqr.Conv1d: nnq.Conv1d, nnqr.Conv2d: nnq.Conv2d, nnqr.Conv3d: nnq.Conv3d, } +# Mapping from reference module class to the replacement dynamic quantized module class for lowering +DYNAMIC_LOWER_MODULE_MAP: Dict[Type[nn.Module], Type[nn.Module]] = { + nnqr.Linear: nnqd.Linear, + nnqr.GRUCell: nnqd.GRUCell, + nnqr.LSTMCell: nnqd.LSTMCell, + nnqr.RNNCell: nnqd.RNNCell, + nnqr.LSTM: nnqd.LSTM, +} + +# Mapping from reference module class to the replacement weight only quantized module class for lowering +# TODO: correct the namespace for these modules +WEIGHT_ONLY_LOWER_MODULE_MAP: Dict[Type[nn.Module], Type[nn.Module]] = { + nnqr.Embedding: nnq.Embedding, + nnqr.EmbeddingBag: nnq.EmbeddingBag, +} + +# TODO: merge with STATIC_LOWER_MODULE_MAP after we merge +# _lower_static_weighted_ref_module and special_pattern_replacement +SPECIAL_PATTERN_LOWER_MODULE_MAP = { + nn.BatchNorm2d: nnq.BatchNorm2d, + nn.BatchNorm3d: nnq.BatchNorm3d, + nnqr.ConvTranspose1d: nnq.ConvTranspose1d, + nnqr.ConvTranspose2d: nnq.ConvTranspose2d, + nn.ELU: nnq.ELU, + nn.LeakyReLU: nnq.LeakyReLU, + nn.Hardswish: nnq.Hardswish, + nn.InstanceNorm1d: nnq.InstanceNorm1d, + nn.InstanceNorm2d: nnq.InstanceNorm2d, + nn.InstanceNorm3d: nnq.InstanceNorm3d, + nn.LayerNorm: nnq.LayerNorm, + nn.Dropout: nnq.Dropout, + nn.Softmax: nnq.Softmax, + nni.BNReLU2d: nniq.BNReLU2d, + nni.BNReLU3d: nniq.BNReLU3d, +} + # Mapping from fused module class to a 2-tuple of: # 1) The inner reference module class -# 2) The replacement quantized module class for lowering -LOWER_FUSED_MODULE_MAP: Dict[Type[nn.Module], Tuple[Type[nn.Module], Type[ReferenceableQuantizedModule]]] = { - nni.LinearReLU: (nnqr.Linear, nniq.LinearReLU) +# 2) The replacement static quantized module class for lowering +STATIC_LOWER_FUSED_MODULE_MAP: Dict[Type[nn.Module], Tuple[Type[nn.Module], Type[WeightedQuantizedModule]]] = { + nni.LinearReLU: (nnqr.Linear, nniq.LinearReLU), + nni.ConvReLU1d: (nnqr.Conv1d, nniq.ConvReLU1d), + nni.ConvReLU2d: (nnqr.Conv2d, nniq.ConvReLU2d), + nni.ConvReLU3d: (nnqr.Conv3d, nniq.ConvReLU3d), +} + +# Mapping from fused module class to a 2-tuple of: +# 1) The inner reference module class +# 2) The replacement dynamic quantized module class for lowering +DYNAMIC_LOWER_FUSED_MODULE_MAP: Dict[Type[nn.Module], Tuple[Type[nn.Module], Type[nn.Module]]] = { + nni.LinearReLU: (nnqr.Linear, nniqd.LinearReLU), +} + +# Mapping from a functional to lower to a 2-tuple of +# 1) The quantized version of the op +# 2) The quantized version of the op fused with relu, if it exists, else None +STATIC_LOWER_FUNCTIONAL_MAP: Dict[Callable, Tuple[Callable, Callable]] = { + F.linear: (torch.ops.quantized.linear, torch.ops.quantized.linear_relu), + F.conv1d: (torch.ops.quantized.conv1d, torch.ops.quantized.conv1d_relu), + F.conv2d: (torch.ops.quantized.conv2d, torch.ops.quantized.conv2d_relu), + F.conv3d: (torch.ops.quantized.conv3d, torch.ops.quantized.conv3d_relu), +} + +WEIGHT_PREPACK_OPS: Set[Callable] = { + torch._ops.ops.quantized.linear_prepack, + torch._ops.ops.quantized.linear_prepack_fp16, + torch._ops.ops.quantized.conv1d_prepack, + torch._ops.ops.quantized.conv2d_prepack, + torch._ops.ops.quantized.conv3d_prepack, +} + +# Mapping from a functional to a dictionary, where the key is a 
2-tuple of +# (activation_compute_dtype, weight_dtype) and the value is a 2-tuple of +# 1) The dynamically quantized version of the op +# 2) The dynamically quantized version of the op fused with relu, if it exists, else None +DYNAMIC_LOWER_FUNCTIONAL_MAP: Dict[Callable, Dict[Tuple[torch.dtype, torch.dtype], Tuple[Callable, Optional[Callable]]]] = { + F.linear: { + (torch.quint8, torch.qint8): (torch.ops.quantized.linear_dynamic, + torch.ops.quantized.linear_relu_dynamic), + (torch.float16, torch.float16): (torch.ops.quantized.linear_dynamic_fp16, + torch.ops.quantized.linear_relu_dynamic_fp16) + }, + # dynamic conv + relu is not available yet + F.conv1d: { + (torch.quint8, torch.qint8): (torch.ops.quantized.conv1d_dynamic, None), + }, + F.conv2d: { + (torch.quint8, torch.qint8): (torch.ops.quantized.conv2d_dynamic, None), + }, + F.conv3d: { + (torch.quint8, torch.qint8): (torch.ops.quantized.conv3d_dynamic, None), + }, +} + +CONV_FUNCTIONAL_OPS: Set[Callable] = { + F.conv1d, + F.conv2d, + F.conv3d, } -def _lower_weighted_ref_module(model: QuantizedGraphModule) -> QuantizedGraphModule: +QBIN_OP_MAPPING: Dict[Union[Callable, str], Callable] = { + operator.add: torch.ops.quantized.add, + torch.add: torch.ops.quantized.add, + operator.mul: torch.ops.quantized.mul, + torch.mul: torch.ops.quantized.mul, + torch.matmul: torch.ops.quantized.matmul, +} +QBIN_RELU_OP_MAPPING: Dict[Union[Callable, str], Callable] = { + operator.add: torch.ops.quantized.add_relu, + torch.add: torch.ops.quantized.add_relu, + operator.mul: torch.ops.quantized.mul_relu, + torch.mul: torch.ops.quantized.mul_relu, +} + +def fold_weight( + quantized: QuantizedGraphModule, + node_name_to_scope: Dict[str, Tuple[str, type]] +) -> QuantizedGraphModule: + """ + Trace back from the weight node util we hit getattr, reconstruct the + graph module with the traced nodes and run the graph module to pack the + weight. then replace the original chain of ops with the packed weight. 
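DYNAMIC_LOWER_FUNCTIONAL_MAP is a nested dict keyed first by the functional op and then by the (activation_compute_dtype, weight_dtype) pair, and the lowering pass skips combinations that are absent. Here is a toy lookup with placeholder string values standing in for the real quantized ops (the names and helper are illustrative only, not PyTorch API).

import torch

# toy stand-in for DYNAMIC_LOWER_FUNCTIONAL_MAP: op name -> dtype pair -> (q_op, q_relu_op)
lowering_map = {
    "linear": {
        (torch.quint8, torch.qint8): ("linear_dynamic", "linear_relu_dynamic"),
        (torch.float16, torch.float16): ("linear_dynamic_fp16", "linear_relu_dynamic_fp16"),
    },
}

def pick_dynamic_op(op, activation_compute_dtype, weight_dtype, has_relu):
    entry = lowering_map.get(op, {}).get((activation_compute_dtype, weight_dtype))
    if entry is None:
        return None                  # unsupported dtype combination: the pass skips the node
    q_op, q_relu_op = entry
    return q_relu_op if has_relu else q_op

print(pick_dynamic_op("linear", torch.quint8, torch.qint8, has_relu=True))    # linear_relu_dynamic
print(pick_dynamic_op("linear", torch.float16, torch.qint8, has_relu=False))  # None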
+ """ + packed_weights = dict() + # map from folded node name to the prepacked weight name + folded_nodes = dict() + # get packed weights + for node in quantized.graph.nodes: + if node.op == 'call_function' and node.target in WEIGHT_PREPACK_OPS: + nodes_to_fold = collect_producer_nodes(node) + if nodes_to_fold is not None: + for node_to_fold in nodes_to_fold: + folded_nodes[node_to_fold.name] = node + + prepacking_module = graph_module_from_producer_nodes( + quantized, nodes_to_fold) + packed_weight = prepacking_module() + packed_weights[node.name] = packed_weight + + # remove folded nodes and replace the prepacking node with getattr + folded_graph = Graph() + env: Dict[Any, Any] = {} + + def load_arg(a): + return map_arg(a, lambda node: env[node.name]) + quantized_root = quantized + quantized_graph = quantized.graph + + for node in quantized_graph.nodes: + prepack_node = folded_nodes.get(node.name, None) + if prepack_node is node: + packed_weight = packed_weights[node.name] + # add a prepacked attribute to root + op_node = list(prepack_node.users)[0] + module_path, _ = node_name_to_scope[op_node.name] + get_new_packed_weight_name = \ + get_new_attr_name_with_prefix(module_path + '_packed_weight_') + packed_weight_name = get_new_packed_weight_name(quantized_root) + setattr(quantized_root, packed_weight_name, packed_weight) + # replace prepack node with a getattr node + env[node.name] = folded_graph.create_node( + 'get_attr', packed_weight_name, (), {}) + elif prepack_node is not None: + # remove the foled node + continue + else: + # copy other nodes + env[node.name] = folded_graph.node_copy(node, load_arg) + return QuantizedGraphModule(quantized_root, folded_graph, quantized_root.preserved_attr_names) + +def _get_module(node: Node, modules: Dict[str, nn.Module]) -> Optional[nn.Module]: + """ + Return the `torch.nn.Module` that corresponds to the specified node's target. + If no such node exists, return None. + """ + if node.op == "call_module" and str(node.target) in modules: + return modules[str(node.target)] + else: + return None + +def _match_static_pattern( + node: Node, + modules: Dict[str, nn.Module], + qconfig_map: Dict[str, QConfigAny], + matching_modules_or_ops: List[Callable], + dequantize_node_arg_indices: List[int] +) -> Union[Tuple[Node, Node, Node], Tuple[None, None, None]]: + """ + Match the pattern (dequantize - ref node - quantize) against the node provided. + + If there is a match, return a 3-tuple of: + 1) q_node: the quantize node, + 2) relu_node: a relu node wrapping the ref_node, and + 3) ref_node: a reference module or functional node to replace with its quantized counterpart + Otherwise, if there is no match, return a 3-tuple of (None, None, None). + + Parameters: + node: The `torch.fx.Node` to match against. + modules: A mapping from node names to modules in the model graph, used for module lookup. + qconfig_map: A mapping from node names to the qconfigs associated with the nodes. + If the corresponding qconfig for the reference node is None, then return no match. + matching_modules_or_ops: Either a list of functions or a list of `torch.nn.Module`s. + If the reference node is not in this list, then return no match. + dequantize_node_arg_indices: A list of indices in the reference node args where dequantize + nodes may be present. An empty list means skipping the check for dequantize nodes. 
+ """ + SKIP_LOWERING_VALUE = (None, None, None) + + # Match quantize node + if node.op != "call_function" or node.target != torch.quantize_per_tensor: + return SKIP_LOWERING_VALUE + q_node = node + ref_node = q_node.args[0] + assert(isinstance(ref_node, Node)) + + # Handle cases where the node is wrapped in a ReLU + if (ref_node.op == "call_function" and ref_node.target in (F.relu, torch.relu)) or\ + (ref_node.op == "call_module" and type(_get_module(ref_node, modules)) == nn.ReLU): + relu_node = ref_node + ref_node = relu_node.args[0] + assert(isinstance(ref_node, Node)) + else: + relu_node = None + if should_skip_lowering(ref_node, qconfig_map): + return SKIP_LOWERING_VALUE + + # Match reference module or functional + if isinstance(matching_modules_or_ops[0], type) and issubclass(matching_modules_or_ops[0], nn.Module): + expected_op = "call_module" + match_key = type(_get_module(ref_node, modules)) + else: + expected_op = "call_function" + match_key = ref_node.target + if ref_node.op != expected_op or match_key not in matching_modules_or_ops: + return SKIP_LOWERING_VALUE + + # Match dequantize node(s). Both of the following conditions must pass: + # (1) All `torch.fx.Node`s at the matching indices must be a dequantize node + # (2) There must be at least one dequantize node + matched_dequantize = False + for i in dequantize_node_arg_indices: + assert i < len(ref_node.args),\ + "Dequantize index %s exceeded reference node's arg length %s" % (i, len(ref_node.args)) + arg = ref_node.args[i] + if is_dequantize_node(arg): + matched_dequantize = True + elif isinstance(arg, Node): + return SKIP_LOWERING_VALUE + if not matched_dequantize: + return SKIP_LOWERING_VALUE + + return (q_node, relu_node, ref_node) + +def _lower_static_weighted_ref_module( + model: QuantizedGraphModule, + qconfig_map: Dict[str, QConfigAny]): """ Traverse the graph and find dequantize - ref module - quantize patterns and replace them with the quantized version of the ref module. """ - for ref_class in list(LOWER_MODULE_MAP.keys()) + list(LOWER_FUSED_MODULE_MAP.keys()): - pattern = (torch.quantize_per_tensor, - (ref_class, "dequantize"), - MatchAllNode, MatchAllNode, MatchAllNode) - modules = dict(model.named_modules(remove_duplicate=False)) - nodes = list(model.graph.nodes) - # TODO: maybe orgnize this better (e.g. 
break down to more functions) - # to make this function more readable - for n in model.graph.nodes: - if not is_match(modules, n, pattern): - continue - q_node = n - ref_node = q_node.args[0] - dq_node = ref_node.args[0] - # get output scale/zero_point/dtype from the quantize node - scale_node = q_node.args[1] - zero_point_node = q_node.args[2] - dtype = q_node.args[3] - - # this can be removed if we add support for "get_attr" in is_match - if scale_node.op != "get_attr" or zero_point_node.op != "get_attr": - print("Find the pattern but scale_node and zero_point node are not `get_attr`," - f"got: {scale_node.format_node} {zero_point_node.format_node()}") + modules = dict(model.named_modules(remove_duplicate=False)) + nodes = list(model.graph.nodes) + for n in model.graph.nodes: + # Step 0: Find nodes that match this pattern (dequantize - ref module - quantize) + matching_modules = list(STATIC_LOWER_MODULE_MAP.keys()) + list(STATIC_LOWER_FUSED_MODULE_MAP.keys()) + (q_node, relu_node, ref_node) = _match_static_pattern( + n, modules, qconfig_map, matching_modules, dequantize_node_arg_indices=[0]) # type: ignore[arg-type] + if q_node is None: + continue + assert(ref_node is not None) + (_, scale_node, zero_point_node, _) = q_node.args + ref_module = _get_module(ref_node, modules) + ref_class = type(ref_module) + assert(isinstance(scale_node, Node)) + assert(isinstance(zero_point_node, Node)) + assert(issubclass(ref_class, nn.Module)) + + # Step 1: Change this pattern to use the corresponding quantized module + # For fused modules, we also check whether the inner module is a reference module + # If so, we replace the entire fused module with the corresponding quantized module + if ref_class in STATIC_LOWER_FUSED_MODULE_MAP: + inner_ref_class, q_class = STATIC_LOWER_FUSED_MODULE_MAP[ref_class] + if type(ref_module[0]) != inner_ref_class: # type: ignore[index] continue + else: + q_class = STATIC_LOWER_MODULE_MAP[ref_class] + output_scale = getattr(model, scale_node.target) + output_zero_point = getattr(model, zero_point_node.target) + q_module = q_class.from_reference(ref_module, output_scale, output_zero_point) + # replace reference module with quantized module + parent_name, module_name = _parent_name(ref_node.target) + setattr(modules[parent_name], module_name, q_module) + + # Step 2: Remove dq_node, q_node and its args + dq_node = ref_node.args[0] + assert(isinstance(dq_node, Node)) + dq_node.replace_all_uses_with(dq_node.args[0]) + model.graph.erase_node(dq_node) + q_node.replace_all_uses_with(ref_node) + model.graph.erase_node(q_node) + model.graph.erase_node(scale_node) + model.graph.erase_node(zero_point_node) + +def _lower_dynamic_weighted_ref_module(model: QuantizedGraphModule): + """ + Traverse the graph and find quantize_per_tensor_dynamic - dequantize - ref_module patterns + and replace them with the dynamically quantized version of the ref module. 
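The module swap in Step 1 is plain attribute surgery: split the node target into a parent path and an attribute name, then setattr the quantized replacement onto the parent. Below is a self-contained sketch of that idiom using get_submodule, with an nn.Identity standing in for the quantized module; the helper is my own illustration, not the _parent_name utility itself.

import torch.nn as nn

def swap_submodule(model: nn.Module, target: str, new_module: nn.Module):
    # split "block.linear" into parent path "block" and attribute name "linear"
    *parent_path, name = target.split(".")
    parent = model.get_submodule(".".join(parent_path)) if parent_path else model
    setattr(parent, name, new_module)

model = nn.Sequential()
model.add_module("block", nn.Sequential())
model.block.add_module("linear", nn.Linear(4, 4))

swap_submodule(model, "block.linear", nn.Identity())  # Identity stands in for the quantized module
print(model)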
+ """ + named_modules = dict(model.named_modules(remove_duplicate=False)) + for n in model.graph.nodes: + if n.op != "call_module" or \ + type(named_modules[str(n.target)]) not in \ + set(DYNAMIC_LOWER_MODULE_MAP.keys()).union( + set(DYNAMIC_LOWER_FUSED_MODULE_MAP.keys())): + continue + ref_node = n + dq_node = ref_node.args[0] + if dq_node.op != "call_method" or dq_node.target != "dequantize": + continue + # don't support lowering the pattern when the result of dequantize is used by + # multiple nodes + if len(dq_node.users) > 1: + continue + + input_dynamic_q_node = dq_node.args[0] + # don't support lowering the pattern when the result of quantize is used by + # multiple nodes + if len(input_dynamic_q_node.users) > 1: + continue - # this can be removed if we add support for constants in is_match - if dtype != torch.quint8: - print(f"Only qint8 output for quantized op is supported, got: {dtype}") + if input_dynamic_q_node.op != "call_function" or \ + input_dynamic_q_node.target != torch.quantize_per_tensor_dynamic: + continue + + activation_compute_dtype = input_dynamic_q_node.args[1] + is_fp16 = activation_compute_dtype == torch.float16 + is_int8 = activation_compute_dtype in [torch.quint8, torch.qint8] + if not is_int8 and not is_fp16: + continue + + ref_module = named_modules[str(ref_node.target)] + ref_class = type(ref_module) + if ref_class in DYNAMIC_LOWER_FUSED_MODULE_MAP: + inner_ref_class, q_class = DYNAMIC_LOWER_FUSED_MODULE_MAP[ref_class] + if type(ref_module[0]) != inner_ref_class: continue + else: + q_class = DYNAMIC_LOWER_MODULE_MAP.get(ref_class) # type: ignore[assignment] + # TODO: maybe define a WeightedDynamicallyQuantizedModule + q_module = q_class.from_reference(ref_module) # type: ignore[attr-defined] - # change this pattern to use the corresponding quantized module - ref_module = modules[ref_node.target] - output_scale = getattr(model, scale_node.target) - output_zero_point = getattr(model, zero_point_node.target) - # For fused modules, we also check whether the inner module is a reference module - # If so, we replace the entire fused module with the corresponding quantized module - if ref_class in LOWER_FUSED_MODULE_MAP: - inner_ref_class, q_class = LOWER_FUSED_MODULE_MAP[ref_class] - if type(ref_module[0]) != inner_ref_class: - continue - else: - q_class = LOWER_MODULE_MAP[type(ref_module)] - assert issubclass(q_class, ReferenceableQuantizedModule) # suppress mypy warnings - q_module = q_class.from_reference(ref_module, output_scale, output_zero_point) - - # replace reference module with quantized module - parent_name, module_name = _parent_name(ref_node.target) - setattr(modules[parent_name], module_name, q_module) - # remove dq node: - dq_node_input = dq_node.args[0] - - dq_node.replace_all_uses_with(dq_node_input) - model.graph.erase_node(dq_node) + # replace reference moduel with dynamically quantized module + parent_name, module_name = _parent_name(ref_node.target) + setattr(named_modules[parent_name], module_name, q_module) - # remove q node and args: - q_node.replace_all_uses_with(ref_node) - model.graph.erase_node(q_node) - model.graph.erase_node(scale_node) - model.graph.erase_node(zero_point_node) - model.recompile() - return model + # remove q - dq node + dq_node.replace_all_uses_with(input_dynamic_q_node) + model.graph.erase_node(dq_node) + input_dynamic_q_node.replace_all_uses_with(input_dynamic_q_node.args[0]) + model.graph.erase_node(input_dynamic_q_node) -def special_pattern_replacement(model: QuantizedGraphModule) -> QuantizedGraphModule: +def 
_lower_weight_only_weighted_ref_module(model: QuantizedGraphModule): + """ + Traverse the graph and find ref_module patterns + and replace them with the weight only quantized version of the ref module. + """ + named_modules = dict(model.named_modules(remove_duplicate=False)) + for n in model.graph.nodes: + if n.op != "call_module" or \ + type(named_modules[str(n.target)]) not in \ + set(WEIGHT_ONLY_LOWER_MODULE_MAP.keys()): + continue + ref_node = n + ref_module = named_modules[str(ref_node.target)] + ref_class = type(ref_module) + q_class = WEIGHT_ONLY_LOWER_MODULE_MAP.get(ref_class) + # TODO: WeightedQuantizedModule is currently assuming static quant apis + # with output_scale, output_zero_point in from_reference, we may want to + # relax that, or rename this + # TODO: maybe define a WeightedWeightOnlyQuantizedModule + q_module = q_class.from_reference(ref_module) # type: ignore[union-attr] + + # replace reference moduel with dynamically quantized module + parent_name, module_name = _parent_name(ref_node.target) + setattr(named_modules[parent_name], module_name, q_module) + +def _lower_static_weighted_ref_functional( + model: QuantizedGraphModule, + qconfig_map: Dict[str, QConfigAny]): + """ + Traverse the graph and replace functional reference patterns with their quantized versions. + """ modules = dict(model.named_modules(remove_duplicate=False)) nodes = list(model.graph.nodes) + for n in model.graph.nodes: + # Step 0: Find nodes that match this pattern (dequantize - functional op - quantize) + matching_ops = list(STATIC_LOWER_FUNCTIONAL_MAP.keys()) + (q_node, relu_node, func_node) = _match_static_pattern( + n, modules, qconfig_map, matching_ops, dequantize_node_arg_indices=[0, 1]) + if q_node is None: + continue + assert(func_node is not None) + (_, output_scale_node, output_zp_node, _) = q_node.args + (input_dq_node, weight_dq_node, *remaining_func_args) = func_node.args + assert(isinstance(output_zp_node, Node)) + assert(isinstance(input_dq_node, Node)) + assert(isinstance(weight_dq_node, Node)) + quantized_weight = weight_dq_node.args[0] + assert(isinstance(quantized_weight, Node)) + if quantized_weight.op != "call_function" or\ + quantized_weight.target not in (torch.quantize_per_tensor, torch.quantize_per_channel): + continue + + # Step 1: Replace quantized weights with packed weights, which will be folded later + # Use the right prepack op and prepare the corresponding args + # Linear prepack args: (quantized weights[, bias]) + # Conv prepack args: (quantized weights[, bias, stride, padding, dilation, groups]) + prepack_args = [quantized_weight] + remaining_func_args + if func_node.target == F.linear: + weight_dtype = quantized_weight.args[-1] + prepack_op = get_linear_prepack_op_for_dtype(weight_dtype) + elif func_node.target in CONV_FUNCTIONAL_OPS: + prepack_op = get_qconv_prepack_op(func_node.target) # type: ignore[arg-type] + # For conv1d, the stride, padding, and dilation args may be ints, + # in which case we need to convert them to tuples + if func_node.target == F.conv1d: + for i in [2, 3, 4]: + if len(prepack_args) > i and isinstance(prepack_args[i], int): + prepack_args[i] = (prepack_args[i],) + else: + raise ValueError("Lowering is not supported for op '%s'" % func_node.target) + with model.graph.inserting_before(output_scale_node): + packed_weight = model.graph.create_node("call_function", prepack_op, tuple(prepack_args), {}) + + # Step 2: Replace reference pattern with the corresponding quantized op + (q_func, q_relu_func) = 
STATIC_LOWER_FUNCTIONAL_MAP[func_node.target] # type: ignore[index] + func_node.target = q_relu_func if relu_node is not None else q_func + func_node.args = (input_dq_node.args[0], packed_weight, output_scale_node, output_zp_node) + q_node.replace_all_uses_with(func_node) + # Move func_node after output_zp_node in the graph + output_zp_node.append(func_node) + + # Clean up: Remove dequantize and quantize nodes, and the relu node if it exists + for dqn in [input_dq_node, weight_dq_node]: + dqn_input = dqn.args[0] + dqn.replace_all_uses_with(dqn_input) + model.graph.erase_node(dqn) + model.graph.erase_node(q_node) + if relu_node is not None: + model.graph.erase_node(relu_node) + +def _lower_dynamic_weighted_ref_functional( + model: QuantizedGraphModule, + qconfig_map: Dict[str, QConfigAny]): + """ + Traverse the graph and replace functional reference patterns with their dynamically + quantized versions. + Examples: + quantize_per_tensor_dynamic - dequantize - functional linear --> linear_dynamic + to(torch.float16) - dequantize - functional linear --> linear_dynamic_fp16 + """ + modules = dict(model.named_modules(remove_duplicate=False)) + nodes = list(model.graph.nodes) + # we want to search in reserved order so that we can match the larger patterns first + # e.g. we want to match linear - relu before linear. + for n in reversed(model.graph.nodes): + + # Step 0: Find nodes that match this pattern + # (quantize_per_tensor_dynamic - dequantize - dynamically quantized op) + # We search for the pattern backwards, starting with the quantize node + # Quantize node args: (func, scale, zp, dtype) + func_node = n + # Handle cases where the functional op is wrapped in a ReLU + if func_node.op == "call_function" and func_node.target == F.relu or \ + func_node.op == "call_module" and \ + type(modules[str(func_node.target)]) == torch.nn.ReLU: + relu_node = func_node + func_node = relu_node.args[0] + else: + relu_node = None + if should_skip_lowering(func_node, qconfig_map): + continue + # Linear args: (dequantized inputs, dequantized weights[, bias]) + # Conv args: (dequantized inputs, dequantized weights[, bias, stride, padding, dilation, groups]) + if func_node.op != "call_function" or func_node.target not in DYNAMIC_LOWER_FUNCTIONAL_MAP: + continue + (input_dq_node, weight_dq_node, *remaining_func_args) = func_node.args + if input_dq_node.op != "call_method" or input_dq_node.target != "dequantize" or \ + weight_dq_node.op != "call_method" or weight_dq_node.target != "dequantize": + continue + + input_dynamic_q_node = input_dq_node.args[0] + # don't support lowering the pattern when the result of quantize is used by + # multiple nodes + if len(input_dynamic_q_node.users) > 1: + continue + + if input_dynamic_q_node.op != "call_function" or \ + input_dynamic_q_node.target != torch.quantize_per_tensor_dynamic: + continue + + reduce_range_node = None + (pattern_input, activation_compute_dtype, reduce_range_node) = input_dynamic_q_node.args + is_fp16 = activation_compute_dtype == torch.float16 + is_int8 = activation_compute_dtype in [torch.quint8, torch.qint8] + if not is_int8 and not is_fp16: + continue + + quantized_weight = weight_dq_node.args[0] + weight_dtype = quantized_weight.args[-1] + + # Step 1: Try to select reference pattern with the corresponding quantized op + dynamic_quant_dtype_key = (activation_compute_dtype, weight_dtype) + if dynamic_quant_dtype_key not in DYNAMIC_LOWER_FUNCTIONAL_MAP[func_node.target]: + print(f"Didn't find dtype combination {dynamic_quant_dtype_key} during " + 
f"dynamic quantized op lowering for {func_node.target}") + continue + (q_func, q_relu_func) = DYNAMIC_LOWER_FUNCTIONAL_MAP[func_node.target][dynamic_quant_dtype_key] + + if q_func is None or q_relu_func is None: + print("Didn't find corresponding quantized function or quantized relu function " + f"for {func_node.target}, {dynamic_quant_dtype_key}") + continue + + # Step 2: Replace quantized weights with packed weights, which will be folded later + # Use the right prepack op and prepare the corresponding args + # Linear prepack args: (quantized weights[, bias]) + # Conv prepack args: (quantized weights[, bias, stride, padding, dilation, groups]) + prepack_args = [quantized_weight] + remaining_func_args + if func_node.target == F.linear: + prepack_op = get_linear_prepack_op_for_dtype(weight_dtype) + elif func_node.target in CONV_FUNCTIONAL_OPS: + prepack_op = get_qconv_prepack_op(func_node.target) + # For conv1d, the stride, padding, and dilation args may be ints, + # in which case we need to convert them to tuples + if func_node.target == F.conv1d: + for i in [2, 3, 4]: + if len(prepack_args) > i and isinstance(prepack_args[i], int): + prepack_args[i] = (prepack_args[i],) + else: + raise ValueError("Lowering is not supported for op '%s'" % func_node.target) + with model.graph.inserting_before(func_node): + packed_weight = model.graph.create_node("call_function", prepack_op, tuple(prepack_args), {}) + + # Step 3: Replace reference pattern with the corresponding quantized op + func_node.target = q_relu_func if relu_node is not None else q_func + if is_int8: + func_node.args = (pattern_input, packed_weight, reduce_range_node) + else: + func_node.args = (pattern_input, packed_weight) + + if relu_node is not None: + relu_node.replace_all_uses_with(func_node) + + # Step 4: Remove dequantize and quantize nodes, and the relu node if it exists + for dqn in [input_dq_node, weight_dq_node]: + dqn_input = dqn.args[0] + dqn.replace_all_uses_with(dqn_input) + model.graph.erase_node(dqn) + model.graph.erase_node(input_dynamic_q_node) + if relu_node is not None: + model.graph.erase_node(relu_node) + +def _lower_quantized_binary_op( + model: QuantizedGraphModule, + qconfig_map: Dict[str, QConfigAny]): + binary_ops_to_lower: List[Callable] = [operator.add, torch.add, operator.mul, torch.mul, torch.matmul] + modules = dict(model.named_modules(remove_duplicate=False)) + for n in model.graph.nodes: + # Step 0: Find nodes that match this pattern (dequantize - ref module - quantize) + (q_node, relu_node, bop_node) = _match_static_pattern( + n, modules, qconfig_map, binary_ops_to_lower, dequantize_node_arg_indices=[0, 1]) + if q_node is None: + continue + assert(bop_node is not None) + (_, scale_node, zero_point_node, _) = q_node.args + + # Step 1: Remove dequant nodes + num_dq_nodes = 0 + for arg in bop_node.args: + if not is_dequantize_node(arg): + continue + dq_node = arg + assert(isinstance(dq_node, Node)) + dn_input = dq_node.args[0] + dq_node.replace_all_uses_with(dn_input) + model.graph.erase_node(dq_node) + num_dq_nodes += 1 + assert(num_dq_nodes > 0) + + # Step 2: Swap binary op to quantized binary op + assert bop_node.target in QBIN_OP_MAPPING + binop_to_qbinop = QBIN_OP_MAPPING if relu_node is None else QBIN_RELU_OP_MAPPING + qbin_op = binop_to_qbinop[bop_node.target] + # prepare the args for quantized bianry op + # (x, y) + qop_node_args = list(bop_node.args) + # (x, y, scale, zero_point) + # add scale and zero_point arguments for Tensor - Tensor operation + if num_dq_nodes == 2: + 
qop_node_args.extend([scale_node, zero_point_node]) + # insert a call to quantized binary op and remove the original binary op + with model.graph.inserting_after(q_node): + qop_node = create_node_from_old_node_preserve_meta( + model.graph, + ("call_function", qbin_op, tuple(qop_node_args), {}), + bop_node) + q_node.replace_all_uses_with(qop_node) + + # Step 3: Remove quantize node, binary op node, and relu node if any + model.graph.erase_node(q_node) + if relu_node is not None: + model.graph.erase_node(relu_node) + model.graph.erase_node(bop_node) + +def special_pattern_replacement(model: QuantizedGraphModule): + modules = dict(model.named_modules(remove_duplicate=False)) for n in model.graph.nodes: q_node = n - if not (q_node.target == torch.quantize_per_tensor or - (q_node.op == "call_method" and q_node.target == "to" and q_node.args[1] == torch.float16)): + is_quantize = q_node.target == torch.quantize_per_tensor + is_to_fp16 = q_node.op == "call_method" and q_node.target == "to" and \ + len(q_node.args) == 2 and q_node.args[1] == torch.float16 + if not (is_quantize or is_to_fp16): continue ref_node = q_node.args[0] # get output scale/zero_point/dtype from the quantize node # ref_node, scale_node, zero_point_node, dtype = q_node.args # TODO: add safety checks that users for the ref_node and dq_node needs to be one + is_call_function, is_call_method, is_call_module = is_fixed_qparams_node(ref_node, modules) + if is_to_fp16 and (is_call_function or is_call_method or is_call_module): + # TODO: add a warning or error out here? (bc-breaking if error out) + # warnings.warn( + # "Only reference patterns are currently supported for {dtype} dtype with {op} op" + # "".format(dtype=dtypes, op=ref_node)) + continue + + is_call_function, is_call_method, is_call_module = is_default_node(ref_node, modules) + if is_to_fp16 and (is_call_function or is_call_method or is_call_module): + # TODO: add a warning or error out here? 
(bc-breaking if error out) + continue - is_call_function, is_call_method, is_call_module = check_node(ref_node, modules) + # This check includes all supported ops + is_call_function, is_call_method, is_call_module = is_special_pattern_node(ref_node, modules) if not (is_call_module or is_call_function or is_call_method): continue dq_node_or_nodes = ref_node.args[0] @@ -127,12 +866,19 @@ def special_pattern_replacement(model: QuantizedGraphModule) -> QuantizedGraphMo continue # TODO: enable we have patterns that needs to swap the modules - # if is_call_module: - # ref_module = modules[ref_node.target] - # # change this pattern to use the corresponding quantized module - # # replace reference module with quantized module - # parent_name, module_name = _parent_name(ref_node.target) - # setattr(modules[parent_name], module_name, ref_module) + if is_call_module: + ref_module = modules[ref_node.target] + if type(ref_module) in SPECIAL_PATTERN_LOWER_MODULE_MAP and is_quantize: + qmodule_cls = SPECIAL_PATTERN_LOWER_MODULE_MAP.get(type(ref_module)) + scale_node = q_node.args[1] + zero_point_node = q_node.args[2] + output_scale = getattr(model, scale_node.target) + output_zero_point = getattr(model, zero_point_node.target) + + qmodule = qmodule_cls.from_reference(ref_module, output_scale, output_zero_point) # type:ignore[union-attr] + # replace reference module with quantized module + parent_name, module_name = _parent_name(ref_node.target) + setattr(modules[parent_name], module_name, qmodule) # remove dq node: dq_nodes: List[Node] = [] @@ -147,30 +893,75 @@ def special_pattern_replacement(model: QuantizedGraphModule) -> QuantizedGraphMo model.graph.erase_node(dq_node) # store q node args - q_node_args = list(q_node.args)[1:] - + qnode_qparams = list(q_node.args)[1:] # replace uses of q node with input and remove q node q_node_input = q_node.args[0] q_node.replace_all_uses_with(q_node_input) model.graph.erase_node(q_node) - # remove q node args - for n in q_node_args: - if isinstance(n, Node): - model.graph.erase_node(n) - + is_call_function, is_call_method, is_call_module = is_default_node(ref_node, modules) + if is_call_function: + # pass scale/zer_point arguments from quantize_per_tensor to the default node operator + # insert an op after the zero_point node so that the scale/zero_point + # nodes are is available + qop = get_quantized_operator(ref_node.target) + args = list(ref_node.args) + kwargs = dict(ref_node.kwargs) + if qop in QOP_TO_ARG_NAMES_TO_SKIP: + args_to_skip = QOP_TO_ARG_NAMES_TO_SKIP[qop] + for arg in args_to_skip: + if arg in kwargs: + kwargs.pop(arg) + kwargs["output_scale"] = qnode_qparams[0] + kwargs["output_zero_point"] = qnode_qparams[1] + with model.graph.inserting_after(qnode_qparams[1]): + qop_node = create_node_from_old_node_preserve_meta( + model.graph, + ("call_function", qop, tuple(args), kwargs), + ref_node) + ref_node.replace_all_uses_with(qop_node) + model.graph.erase_node(ref_node) + else: + # remove scale/zero_point node for quantize node + for n in qnode_qparams: + if isinstance(n, Node): + model.graph.erase_node(n) - model.recompile() return model -def _lower_to_native_backend(model: QuantizedGraphModule) -> QuantizedGraphModule: +def _lower_getattr_tensor_metadta_op(model: QuantizedGraphModule): + """ Modified the graph of the model inplace, to skip extra dequantize op before + the general tensor shape ops when possible + """ + for n in model.graph.nodes: + if is_getattr_tensor_metadata_node(n): + maybe_dq = n.args[0] + if maybe_dq.op != "call_method" or 
maybe_dq.target != "dequantize": + continue + # skip the dequantize node + args = list(n.args) + args[0] = n.args[0].args[0] + n.args = tuple(args) + +def _lower_to_native_backend( + model: QuantizedGraphModule, + qconfig_map: Dict[str, QConfigAny], + node_name_to_scope: Dict[str, Tuple[str, type]] +) -> QuantizedGraphModule: """ Lower a quantized reference model (with reference quantized operator patterns) to the native backend in PyTorch (fbgemm/qnnpack), both backends shares the same operator signature so they can be lowered with the same function """ - model = _lower_weighted_ref_module(model) - for pattern, replacement in get_fbgemm_patterns_and_replacements(): - subgraph_rewriter_FORKED_DO_NOT_USE.replace_pattern(model, pattern, replacement) + _lower_static_weighted_ref_module(model, qconfig_map) + _lower_dynamic_weighted_ref_module(model) + _lower_weight_only_weighted_ref_module(model) + _lower_static_weighted_ref_functional(model, qconfig_map) + _lower_dynamic_weighted_ref_functional(model, qconfig_map) + _lower_quantized_binary_op(model, qconfig_map) + _lower_getattr_tensor_metadta_op(model) special_pattern_replacement(model) + model = fold_weight(model, node_name_to_scope) + model.graph.eliminate_dead_code() + model.recompile() model.graph.lint() return model diff --git a/torch/ao/quantization/fx/backend_config/__init__.py b/torch/ao/quantization/fx/backend_config/__init__.py deleted file mode 100644 index b595b660344e..000000000000 --- a/torch/ao/quantization/fx/backend_config/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .tensorrt import get_tensorrt_backend_config_dict - -# TODO: add more validations -def validate_backend_config_dict(backend_config_dict): - return "configs" in backend_config_dict diff --git a/torch/ao/quantization/fx/backend_config/fuse_handler.py b/torch/ao/quantization/fx/backend_config/fuse_handler.py deleted file mode 100644 index f98a40fa51e6..000000000000 --- a/torch/ao/quantization/fx/backend_config/fuse_handler.py +++ /dev/null @@ -1,5 +0,0 @@ -from ..fusion_patterns import DefaultFuseHandler - -# TODO: move DefaultFuseHandler -def get_fuse_handler_cls(): - return DefaultFuseHandler diff --git a/torch/ao/quantization/fx/backend_config/quantize_handler.py b/torch/ao/quantization/fx/backend_config/quantize_handler.py deleted file mode 100644 index fe932e31bd21..000000000000 --- a/torch/ao/quantization/fx/backend_config/quantize_handler.py +++ /dev/null @@ -1,18 +0,0 @@ -import torch -from typing import Dict -from torch.fx.graph import Node -from .observation_type import ObservationType -from ..quantization_patterns import QuantizeHandler - -def get_quantize_handler_cls(observation_type, dtype_configs): - - class ConfigurableQuantizeHandler(QuantizeHandler): - def __init__(self, node: Node, modules: Dict[str, torch.nn.Module]): - super().__init__(node, modules) - self.observation_type = observation_type - self.dtype_configs = dtype_configs - - def is_general_tensor_value_op(self) -> bool: - return observation_type == ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT - - return ConfigurableQuantizeHandler diff --git a/torch/ao/quantization/fx/backend_config/utils.py b/torch/ao/quantization/fx/backend_config/utils.py deleted file mode 100644 index 7affd58476ee..000000000000 --- a/torch/ao/quantization/fx/backend_config/utils.py +++ /dev/null @@ -1,83 +0,0 @@ -import torch -import torch.nn as nn -from .quantize_handler import get_quantize_handler_cls -from .fuse_handler import get_fuse_handler_cls -from typing import Dict, Any, List, Callable, Union -from 
..quantization_types import Pattern, QuantizerCls - -def get_pattern_to_quantize_handlers( - backend_config_dict: Dict[str, Any]) -> Dict[Pattern, QuantizerCls]: - """ - Note: Quantize handler is just a holder for some check methods like - (should_insert_observer_for_output), maybe this can be a enum as well, - we can refactor this after we convert the path for fbgemm/qnnpack fully to the - new path, this is not exposed to backend developers - """ - pattern_to_quantize_handlers = dict() - for config in backend_config_dict.get("configs", []): - pattern = config["pattern"] - observation_type = config["observation_type"] - dtype_configs = config["dtype_configs"] - pattern_to_quantize_handlers[pattern] = \ - get_quantize_handler_cls(observation_type, dtype_configs) - - return pattern_to_quantize_handlers - -def get_pattern_to_dtype_configs( - backend_config_dict: Dict[str, Any]) -> Dict[Pattern, List[Dict[str, torch.dtype]]]: - pattern_to_dtype_configs: Dict[Pattern, List[Dict[str, torch.dtype]]] = dict() - for config in backend_config_dict.get("configs", []): - pattern = config["pattern"] - dtype_configs = config["dtype_configs"] - pattern_to_dtype_configs[pattern] = dtype_configs - return pattern_to_dtype_configs - -def get_pattern_to_input_type_to_index( - backend_config_dict: Dict[str, Any]) -> Dict[Pattern, Dict[str, int]]: - pattern_to_input_type_to_index: Dict[Pattern, Dict[str, int]] = dict() - for config in backend_config_dict.get("configs", []): - pattern = config["pattern"] - input_type_to_index = config.get("input_type_to_index", {}) - pattern_to_input_type_to_index[pattern] = input_type_to_index - return pattern_to_input_type_to_index - -def get_quantized_reference_module_mapping( - backend_config_dict: Dict[str, Any]) -> Dict[Callable, Callable]: - mapping: Dict[Callable, Callable] = dict() - for config in backend_config_dict.get("configs", []): - if "root_module" in config and "reference_quantized_module_for_root" in config: - mapping[config["root_module"]] = config["reference_quantized_module_for_root"] - return mapping - -def get_fusion_pattern_to_fuse_handler_cls( - backend_config_dict: Dict[str, Any]) -> Dict[Pattern, Callable]: - fusion_pattern_to_fuse_handlers = dict() - for config in backend_config_dict.get("configs", []): - if "fuser_method" in config: - pattern = config["pattern"] - fusion_pattern_to_fuse_handlers[pattern] = \ - get_fuse_handler_cls() - - return fusion_pattern_to_fuse_handlers - -def get_fuser_method_mapping( - backend_config_dict: Dict[str, Any]) -> Dict[Pattern, Union[nn.Sequential, Callable]]: - fuser_method_mapping : Dict[Pattern, Union[nn.Sequential, Callable]] = dict() - for config in backend_config_dict.get("configs", []): - if "fuser_method" in config: - pattern = config["pattern"] - fuser_method = config["fuser_method"] - fuser_method_mapping[pattern] = fuser_method - - return fuser_method_mapping - -def get_module_to_qat_module( - backend_config_dict: Dict[str, Any]) -> Dict[Callable, Callable]: - module_to_qat_module: Dict[Callable, Callable] = dict() - for config in backend_config_dict.get("configs", []): - if "pattern" in config and "qat_module" in config: - pattern = config["pattern"] - qat_module = config["qat_module"] - module_to_qat_module[pattern] = qat_module - - return module_to_qat_module diff --git a/torch/ao/quantization/fx/backend_config_utils.py b/torch/ao/quantization/fx/backend_config_utils.py new file mode 100644 index 000000000000..68a4823823e5 --- /dev/null +++ b/torch/ao/quantization/fx/backend_config_utils.py @@ -0,0 
+1,141 @@ +import torch +from torch.ao.quantization.fx.pattern_utils import get_default_quant_patterns, sorted_patterns_dict +from torch.ao.quantization.backend_config import get_native_backend_config_dict +from torch.ao.quantization.backend_config.observation_type import ObservationType +from torch.ao.quantization.quantization_types import ( + Pattern, + NodePattern, + QuantizerCls, +) +from torch.ao.quantization.utils import ( + activation_dtype, + get_combined_dict, +) + +from .quantization_patterns import QuantizeHandler +from .fusion_patterns import DefaultFuseHandler + +from typing import Dict, Any, Callable, Optional + +def get_quantize_handler_cls( + observation_type, + dtype_configs, + num_tensor_args_to_observation_type, + overwrite_output_fake_quantizer, + overwrite_output_observer, + input_output_observed): + + class ConfigurableQuantizeHandler(QuantizeHandler): + def __init__( + self, + node_pattern: NodePattern, + modules: Dict[str, torch.nn.Module], + root_node_getter: Callable = None): + super().__init__(node_pattern, modules, root_node_getter) + if num_tensor_args_to_observation_type: + assert self.num_tensor_args in num_tensor_args_to_observation_type, \ + f"Must provide observation_type config for tensor number {self.num_tensor_args}" \ + f" in num_tensor_args_to_observation_type for {node_pattern}" + self.observation_type = num_tensor_args_to_observation_type[self.num_tensor_args] + else: + self.observation_type = observation_type + self.dtype_configs = dtype_configs + self.overwrite_output_fake_quantizer = overwrite_output_fake_quantizer + self.overwrite_output_observer = overwrite_output_observer + self.input_output_observed_ = input_output_observed + + def is_general_tensor_value_op(self) -> bool: + return self.observation_type == ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT + + # TODO: change this to output activation + def get_activation_ctr( + self, + qconfig: Any, + pattern: Pattern, + is_training: bool, + ) -> Optional[Callable]: + """ + Returns the constructor for the activation observer which should be + used for the pattern matched to this handler. Some handlers override + this to a different value than what is specified in the qconfig. 
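# Illustrative sketch of the override rule the docstring above describes and
# that the body just below implements: prefer a backend-mandated fake-quant /
# observer constructor for quint8 activations, otherwise fall back to
# qconfig.activation.  The FakeQConfig namedtuple is a simplified stand-in,
# not the real torch.ao.quantization.QConfig.
from collections import namedtuple
import torch

FakeQConfig = namedtuple("FakeQConfig", ["activation", "weight"])

def pick_activation_ctr(qconfig, act_dtype, is_training,
                        overwrite_fake_quant=None, overwrite_observer=None):
    if is_training:
        if act_dtype == torch.quint8 and overwrite_fake_quant is not None:
            return overwrite_fake_quant
    else:
        if act_dtype == torch.quint8 and overwrite_observer is not None:
            return overwrite_observer
    return qconfig.activation

qconfig = FakeQConfig(activation=lambda: "default_observer", weight=lambda: "weight_observer")
ctr = pick_activation_ctr(qconfig, torch.quint8, is_training=False,
                          overwrite_observer=lambda: "fixed_qparams_observer")
assert ctr() == "fixed_qparams_observer"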
+ """ + act_dtype = activation_dtype(qconfig) + # TODO: change to is_qat + if is_training: + if act_dtype == torch.quint8 and self.overwrite_output_fake_quantizer is not None: + return self.overwrite_output_fake_quantizer + else: + if act_dtype == torch.quint8 and self.overwrite_output_observer is not None: + return self.overwrite_output_observer + return qconfig.activation + + # This is temporary, and will be removed soon + def input_output_observed(self): + return self.input_output_observed_ + + + return ConfigurableQuantizeHandler + +def get_pattern_to_quantize_handlers( + backend_config_dict: Dict[str, Any]) -> Dict[Pattern, QuantizerCls]: + """ + Note: Quantize handler is just a holder for some check methods like + (should_insert_observer_for_output), maybe this can be a enum as well, + we can refactor this after we convert the path for fbgemm/qnnpack fully to the + new path, this is not exposed to backend developers + """ + pattern_to_quantize_handlers = dict() + for config in backend_config_dict.get("configs", []): + pattern = config["pattern"] + observation_type = config.get("observation_type", None) + dtype_configs = config["dtype_configs"] + num_tensor_args_to_observation_type = config.get("num_tensor_args_to_observation_type", {}) + overwrite_fake_quantizer = config.get("_overwrite_output_fake_quantizer", None) + overwrite_observer = config.get("_overwrite_output_observer", None) + input_output_observed = config.get("_input_output_observed", True) + pattern_to_quantize_handlers[pattern] = \ + get_quantize_handler_cls( + observation_type, + dtype_configs, + num_tensor_args_to_observation_type, + overwrite_fake_quantizer, + overwrite_observer, + input_output_observed) + + return pattern_to_quantize_handlers + +def get_fusion_pattern_to_fuse_handler_cls( + backend_config_dict: Dict[str, Any]) -> Dict[Pattern, Callable]: + fusion_pattern_to_fuse_handlers: Dict[Pattern, Callable] = dict() + for config in backend_config_dict.get("configs", []): + if "fuser_method" in config: + pattern = config["pattern"] + fusion_pattern_to_fuse_handlers[pattern] = DefaultFuseHandler + + return fusion_pattern_to_fuse_handlers + +# TODO: remove when all uses are changed to backend_config_dict +def get_native_quant_patterns(additional_quant_patterns: Dict[Pattern, QuantizerCls] = None) -> Dict[Pattern, QuantizerCls]: + """ + Return a map from pattern to quantize handlers based on the default patterns and the native backend_config_dict. + The returned map is sorted such that longer patterns will be encountered first when iterating through it. 
+ """ + patterns = get_default_quant_patterns() + if additional_quant_patterns is not None: + patterns = get_combined_dict(patterns, additional_quant_patterns) + # TODO: currently we just extend the quantize handlers generated from + # `get_native_backend_config_dict` + # in the future we can just assign backend_config_dict when everything is defined + for pattern, quantize_handler in get_pattern_to_quantize_handlers(get_native_backend_config_dict()).items(): + patterns[pattern] = quantize_handler + return sorted_patterns_dict(patterns) + +get_fusion_pattern_to_fuse_handler_cls.__module__ = "torch.ao.quantization.fx.backend_config_utils" +get_native_quant_patterns.__module__ = "torch.ao.quantization.fx.backend_config_utils" +get_pattern_to_quantize_handlers.__module__ = "torch.ao.quantization.fx.backend_config_utils" + +__all__ = [ + "get_fusion_pattern_to_fuse_handler_cls", + "get_native_quant_patterns", + "get_pattern_to_quantize_handlers", +] diff --git a/torch/ao/quantization/fx/common_quantization_patterns.py b/torch/ao/quantization/fx/common_quantization_patterns.py index a6e687cc6e91..a863c18a383e 100644 --- a/torch/ao/quantization/fx/common_quantization_patterns.py +++ b/torch/ao/quantization/fx/common_quantization_patterns.py @@ -1,73 +1,8 @@ -import torch -from torch.fx.graph import ( - Node, - Graph, -) - -from ..utils import ( - get_qconfig_dtypes, - activation_dtype, -) - -from .utils import ( - quantize_node, -) - from .quantization_patterns import ( QuantizeHandler, ) - -from ..qconfig import QConfigAny - -from typing import Any, Callable, Dict, Tuple - +# TODO: remove class CommonQuantizeHandler(QuantizeHandler): """ Common quantized op, first input and first output will be quantized """ - def __init__( - self, - node: Node, - modules: Dict[str, torch.nn.Module]): - super().__init__(node, modules) - if node.op == "call_function" or node.op == "call_method": - self.op = node.target - elif node.op == "call_module": - self.op = type(modules[str(node.target)]) - - def convert(self, - node: Node, - qconfig: QConfigAny, - modules: Dict[str, torch.nn.Module], - quantized_graph: Graph, - node_name_to_scope: Dict[str, Tuple[str, type]], - load_arg: Callable, - is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None) -> Node: - if not self.all_node_args_are_tensors: - return NotImplemented - assert node.op in ['call_module', 'call_function'], 'Only call_module and ' + \ - 'call_function are handled in DefaultNode' - assert is_reference - if convert_custom_config_dict is None: - convert_custom_config_dict = {} - additional_static_quant_mapping = convert_custom_config_dict.get("static", {}) - - dtypes = get_qconfig_dtypes(qconfig) - # We can produce reference for a dtypes including - # (torch.quint8, torch.qint8, torch.qint32, torch.float16) - act_dtype = activation_dtype(qconfig) - if act_dtype == torch.float: - op_out = quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - return op_out - else: - activation_post_process = \ - self._maybe_get_last_node_only_observer(modules) - assert activation_post_process is not None - # make sure the input is quantized to act_dtype - load_arg(quantized={0: act_dtype})(node.args) - args = load_arg(quantized=torch.float)(node.args) - kwargs = load_arg(quantized=torch.float)(node.kwargs) - op_out = quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - return quantize_node( - op_out, activation_post_process, - node, modules, quantized_graph, node_name_to_scope, is_input=False) + pass diff --git 
a/torch/ao/quantization/fx/convert.py b/torch/ao/quantization/fx/convert.py index b27b68bed8b3..04d7a76fdbf7 100644 --- a/torch/ao/quantization/fx/convert.py +++ b/torch/ao/quantization/fx/convert.py @@ -1,29 +1,25 @@ -from typing import Any, Dict, Tuple, List, Callable, Optional, Union, Set -from collections import defaultdict -import copy +from typing import Any, Dict, List, Optional, Set, Callable, Tuple import torch +import copy +import warnings from torch.fx import ( GraphModule, - Proxy, - map_arg ) from torch.fx.graph import ( Graph, Node, + Argument, ) -from torch.fx.node import Argument -from .quantization_types import Pattern -from ..qconfig import QConfigAny, qconfig_equals -from .match_utils import ( - find_matches, -) -from .graph_module import ( - is_observed_module, - is_observed_standalone_module, - QuantizedGraphModule, +from ..utils import ( + activation_is_statically_quantized, + weight_is_quantized, + get_qparam_dict, + _parent_name, + get_swapped_custom_module_class, ) -from .quantization_patterns import ( - QuantizeHandler, +from ..qconfig import ( + QConfigAny, + qconfig_equals ) from ..qconfig_dict_utils import ( convert_dict_to_ordered_dict, @@ -33,16 +29,27 @@ generate_qconfig_map, compare_prepare_convert_qconfig_dict, update_qconfig_for_fusion, + is_qconfig_supported_by_dtype_configs, +) +from torch.ao.quantization.backend_config.utils import ( + get_root_module_to_quantized_reference_module, + get_pattern_to_dtype_configs, + get_fused_module_classes, + get_qat_module_classes, +) +from torch.ao.quantization.backend_config import get_native_backend_config_dict +from .graph_module import ( + QuantizedGraphModule, + is_observed_module, + is_observed_standalone_module, ) from ._equalize import update_obs_for_equalization, convert_eq_obs from .utils import ( - is_get_tensor_info_node, - node_return_type_is_int, - quantize_node, - get_new_attr_name_with_prefix, + get_custom_module_class_keys, + get_quantize_node_info, + create_getattr_from_value, collect_producer_nodes, graph_module_from_producer_nodes, - get_custom_module_class_keys, WEIGHT_INDEX_DICT, ) @@ -50,114 +57,65 @@ _remove_qconfig, is_activation_post_process, ) -from ..utils import ( - activation_is_statically_quantized, - activation_dtype, -) - from .lower_to_fbgemm import lower_to_fbgemm -from ..quantization_mappings import ( - DEFAULT_QAT_MODULE_MAPPINGS, -) -# weight prepacking ops -WEIGHT_PREPACK_OPS = { - torch._ops.ops.quantized.linear_prepack, - torch._ops.ops.quantized.linear_prepack_fp16, - torch._ops.ops.quantized.conv1d_prepack, - torch._ops.ops.quantized.conv2d_prepack, - torch._ops.ops.quantized.conv3d_prepack, -} +def restore_state( + observed: torch.nn.Module +) -> Tuple[Dict[str, Tuple[str, type]], + Dict[str, Any], + Set[str]]: + assert is_observed_module(observed), \ + 'incoming model must be produced by prepare_fx' + prepare_custom_config_dict: Dict[str, Any] = \ + observed._prepare_custom_config_dict # type: ignore[assignment] + node_name_to_scope: Dict[str, Tuple[str, type]] = observed._node_name_to_scope # type: ignore[assignment] + observed_node_names: Set[str] = observed._observed_node_names # type: ignore[assignment] + return node_name_to_scope, prepare_custom_config_dict, observed_node_names + +def has_none_qconfig(node: Argument, qconfig_map: Dict[str, QConfigAny]) -> bool: + """ Check if a node has a qconfig of None, i.e. 
user requested to not quantize + the node + """ + return isinstance(node, Node) and node.name in qconfig_map and qconfig_map[node.name] is None def run_weight_observers(observed: GraphModule) -> None: - r''' Extract the subgraph that produces the weight for dynamic quant + """ Extract the subgraph that produces the weight for dynamic quant or weight only quant node and run the subgraph to observe the weight. Note that the observers of dynamic quant or weight only quant ops are run during the convert step. - ''' - for node in observed.graph.nodes: - if node.op == 'call_function' and node.target in WEIGHT_INDEX_DICT: - for i, node_arg in enumerate(node.args): - if i in WEIGHT_INDEX_DICT[node.target]: - # node_arg is weight - weight_observer_nodes = collect_producer_nodes(node_arg) - if weight_observer_nodes is not None: - weight_observer_module = \ - graph_module_from_producer_nodes( - observed, weight_observer_nodes) - # run the weight observer - weight_observer_module() - -def fold_weight( - quantized: QuantizedGraphModule, - node_name_to_scope: Dict[str, Tuple[str, type]]) -> QuantizedGraphModule: """ - Trace back from the weight node util we hit getattr, reconstruct the - graph module with the traced nodes and run the graph module to pack the - weight. then replace the original chain of ops with the packed weight. - """ - packed_weights = dict() - # map from folded node name to the prepacked weight name - folded_nodes = dict() - # get packed weights - for node in quantized.graph.nodes: - if node.op == 'call_function' and node.target in WEIGHT_PREPACK_OPS: - nodes_to_fold = collect_producer_nodes(node) - if nodes_to_fold is not None: - for node_to_fold in nodes_to_fold: - folded_nodes[node_to_fold.name] = node - - prepacking_module = graph_module_from_producer_nodes( - quantized, nodes_to_fold) - packed_weight = prepacking_module() - packed_weights[node.name] = packed_weight - - # remove folded nodes and replace the prepacking node with getattr - folded_graph = Graph() - env: Dict[Any, Any] = {} - - def load_arg(a): - return map_arg(a, lambda node: env[node.name]) - quantized_root = quantized - quantized_graph = quantized.graph - - for node in quantized_graph.nodes: - prepack_node = folded_nodes.get(node.name, None) - if prepack_node is node: - packed_weight = packed_weights[node.name] - # add a prepacked attribute to root - op_node = list(prepack_node.users)[0] - module_path, _ = node_name_to_scope[op_node.name] - get_new_packed_weight_name = \ - get_new_attr_name_with_prefix(module_path + '_packed_weight_') - packed_weight_name = get_new_packed_weight_name(quantized_root) - setattr(quantized_root, packed_weight_name, packed_weight) - # replace prepack node with a getattr node - env[node.name] = folded_graph.create_node( - 'get_attr', packed_weight_name, (), {}) - elif prepack_node is not None: - # remove the foled node + for node in observed.graph.nodes: + if node.op != 'call_function' or node.target not in WEIGHT_INDEX_DICT: continue - else: - # copy other nodes - env[node.name] = folded_graph.node_copy(node, load_arg) - quantized = QuantizedGraphModule(quantized_root, folded_graph, quantized_root.preserved_attr_names) - return quantized - -def remove_quant_dequant_pairs(quantized: QuantizedGraphModule) -> QuantizedGraphModule: + for i, node_arg in enumerate(node.args): + if i not in WEIGHT_INDEX_DICT[node.target]: + continue + # node_arg is weight + weight_observer_nodes = collect_producer_nodes(node_arg) + if weight_observer_nodes is None: + continue + weight_observer_module = \ + 
graph_module_from_producer_nodes( + observed, weight_observer_nodes) + # run the weight observer + weight_observer_module() + +# this method is temporary will be removed soon +def duplicate_quantize_dynamic_node(quantized: QuantizedGraphModule) -> QuantizedGraphModule: quantized_root = quantized for node in quantized.graph.nodes: - if node.op == "call_function" and node.target in [torch.quantize_per_tensor, torch.quantize_per_channel]: + if (node.op == "call_function" and node.target == torch.quantize_per_tensor_dynamic): users = list(node.users) - user = users[0] if users else None - if len(users) == 1 and user.op == "call_method" and user.target == "dequantize": - user.replace_all_uses_with(node.args[0]) - quantized.graph.erase_node(user) - orig_args = list(node.args) + if len(users) > 1: + for user in users: + with quantized.graph.inserting_before(node): + new_node = quantized.graph.create_node( + "call_function", + torch.quantize_per_tensor_dynamic, + node.args, + node.kwargs) + user.replace_input_with(node, new_node) quantized.graph.erase_node(node) - for arg in orig_args: - if isinstance(arg, Node) and len(list(arg.users)) == 0: - quantized.graph.erase_node(arg) quantized = QuantizedGraphModule(quantized_root, quantized.graph, quantized_root.preserved_attr_names) return quantized @@ -204,28 +162,371 @@ def remove_extra_dequantize(quantized: QuantizedGraphModule) -> QuantizedGraphMo quantized = QuantizedGraphModule(quantized_root, quantized.graph, quantized_root.preserved_attr_names) return quantized +def remove_quant_dequant_pairs(quantized: QuantizedGraphModule) -> QuantizedGraphModule: + quantized_root = quantized + for node in quantized.graph.nodes: + if node.op == "call_function" and node.target in [torch.quantize_per_tensor, torch.quantize_per_channel]: + users = list(node.users) + user = users[0] if users else None + if len(users) == 1 and user.op == "call_method" and user.target == "dequantize": + user.replace_all_uses_with(node.args[0]) + quantized.graph.erase_node(user) + orig_args = list(node.args) + quantized.graph.erase_node(node) + for arg in orig_args: + if isinstance(arg, Node) and len(list(arg.users)) == 0: + quantized.graph.erase_node(arg) -def restore_state( - observed: torch.nn.Module -) -> Tuple[Dict[Pattern, QuantizeHandler], - Dict[str, Tuple[str, type]], - Dict[str, Any], - Set[str]]: - assert is_observed_module(observed), \ - 'incoming model must be produced by prepare_fx' - prepare_custom_config_dict: Dict[str, Any] = \ - observed._prepare_custom_config_dict # type: ignore[assignment] - node_name_to_scope: Dict[str, Tuple[str, type]] = observed._node_name_to_scope # type: ignore[assignment] - patterns: Dict[Pattern, QuantizeHandler] = observed._patterns # type: ignore[assignment] - observed_node_names: Set[str] = observed._observed_node_names # type: ignore[assignment] - return patterns, node_name_to_scope, prepare_custom_config_dict, observed_node_names - -def convert(model: GraphModule, is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None, - is_standalone_module: bool = False, - _remove_qconfig_flag: bool = True, - convert_qconfig_dict: Dict[str, Any] = None) -> torch.nn.Module: - """ standalone_module means it a submodule that is not inlined in + quantized = QuantizedGraphModule(quantized_root, quantized.graph, quantized_root.preserved_attr_names) + return quantized + +def maybe_recursive_remove_dequantize(arg: Any, node: Node, graph: Graph): + """ If the arg is a dequantize Node, or a list/tuple/dict of dequantize Node, + we'll 
recursively remove the dequantize Node + """ + if isinstance(arg, Node) and \ + arg.op == "call_method" and \ + arg.target == "dequantize": + quantize_node = arg.args[0] + # we only replace the specific use since dequantize could be used by other nodes + # as well + node.replace_input_with(arg, quantize_node) + elif isinstance(arg, (list, tuple)): + for arg_element in arg: + maybe_recursive_remove_dequantize(arg_element, node, graph) + elif isinstance(arg, dict): + for arg_element in arg.values(): + maybe_recursive_remove_dequantize(arg_element, node, graph) + else: + warnings.warn(f"Unsupported node type in recursive remove dequantize: {type(arg)}") + +def get_module_path_and_prefix( + obs_node: Node, + node_name_to_scope: Dict[str, Tuple[str, type]], + qconfig_map: Dict[str, QConfigAny]): + """ Given and observer node, get the `Scope` or the fully qualified name for + the submodule containing the observed node, also return a prefix of "_input" + when the observed node is an input of a F.linear op, and not the output of another + quantized op. + TODO: this logic is hacky, we should think about how to remove it or make it more + general + """ + observed_node = obs_node.args[0] + # an observer can be inserted for both input of the next operator or output of the previous + # operator (they can be the same) + # this flag identifies if the observer is inserted only because the observed node is + # the input of the next operator + assert isinstance(observed_node, Node), \ + f"Expecting observed node to be a Node, but got {observed_node}" + is_input_observer_only = qconfig_map[observed_node.name] is None if observed_node.name in qconfig_map else None + if is_input_observer_only: + # if the quantize function is at the input of op, then we find the first user of the observer_node + # to get the path. If a linear call_function is in the user list, we return the first instance + # of linear node to get the FQN. + users = list(obs_node.users) + first_linear_use_or_first_use = users[0] if users else None + linear_node = None + for n in users: + if n.op == "call_function" and n.target == torch.nn.functional.linear: + linear_node = n + break + if linear_node: + first_linear_use_or_first_use = linear_node + prefix = "_input" + else: + # if the quantize function is at the output of the op, we use the observer input node to get the path + first_linear_use_or_first_use = observed_node + prefix = "" + + if first_linear_use_or_first_use and first_linear_use_or_first_use.name in node_name_to_scope: + module_path, _ = node_name_to_scope[first_linear_use_or_first_use.name] + else: + # TODO: it's not used, so actually we can skip quantization + # but this requires changing return type of quantize_node + # we can fix it later if needed + module_path = "" + return module_path, prefix + +def insert_dequantize_node( + node: Node, + graph: Graph): + """ Inserts dequantize node for `node` in `graph` + """ + with graph.inserting_after(node): + dequantize_node = graph.call_method("dequantize", (node,)) + for user_node in dict(node.users): + if user_node is not dequantize_node: + user_node.replace_input_with(node, dequantize_node) + +def maybe_get_observer_for_node( + node: Node, + modules: Dict[str, torch.nn.Module] +) -> Optional[torch.nn.Module]: + """ + If the node is observed, return the observer + instance. Otherwise, return None. 
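# Illustrative sketch of the torch.fx graph-surgery idiom that
# insert_dequantize_node above relies on (inserting_after + call_method +
# replace_input_with), shown on a toy module so it runs on plain float
# tensors; "relu" stands in for "dequantize" here.
import operator
import torch
import torch.fx

class Toy(torch.nn.Module):
    def forward(self, x):
        return x + 1.0

gm = torch.fx.symbolic_trace(Toy())
for node in list(gm.graph.nodes):
    if node.op == "call_function" and node.target is operator.add:
        with gm.graph.inserting_after(node):
            post = gm.graph.call_method("relu", (node,))
        # redirect every other user of `node` to the newly inserted node
        for user in dict(node.users):
            if user is not post:
                user.replace_input_with(node, post)
gm.recompile()
assert torch.equal(gm(torch.tensor([-3.0])), torch.tensor([0.0]))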
+ """ + for maybe_obs_node, _ in node.users.items(): + if maybe_obs_node.op == 'call_module': + maybe_obs = modules[str(maybe_obs_node.target)] + if is_activation_post_process(maybe_obs): + return maybe_obs + return None + +def convert_standalone_module( + node: Node, + modules: Dict[str, torch.nn.Module], + model: torch.fx.GraphModule, + is_reference: bool, + backend_config_dict: Optional[Dict[str, Any]]): + """ Converts a observed standalone module to a quantized standalone module by calling + the fx convert api, currently using the same `is_reference` flag as parent, but we may + changing this behavior in the future (e.g. separating quantization and lowering for + standalone module as well) + + Args: + - node: The call_module node of the observed standalone module + - modules: named_module of original model + - model: original model + - is_reference: a flag from parent provided by user to decide if we want to + produce a reference model or a fbgemm/qnnpack model + - backend_config_dict: backend configuration of the target backend of quantization + """ + convert = torch.ao.quantization.quantize_fx.convert_fx # type: ignore[attr-defined] + # We know that observed standalone module is a GraphModule since + # it's produced by us + observed_standalone_module : GraphModule = modules[str(node.target)] # type: ignore[assignment] + sm_input_quantized_idxs = \ + observed_standalone_module \ + ._standalone_module_input_quantized_idxs\ + .tolist() # type: ignore[operator] + # remove the dequantize nodes for inputs + args = list(node.args) + for idx in range(len(args)): + if idx in sm_input_quantized_idxs: + arg = args[idx] + if arg.op == "call_method" and arg.target == "dequantize": # type: ignore[union-attr] + quantize_node = arg.args[0] # type: ignore[union-attr] + node.replace_input_with(arg, quantize_node) + if len(arg.users) == 0: # type: ignore[union-attr] + model.graph.erase_node(arg) + # add dequantize node for output + sm_output_quantized_idxs = \ + observed_standalone_module \ + ._standalone_module_output_quantized_idxs \ + .tolist() # type: ignore[operator] + if len(sm_output_quantized_idxs) > 0: + assert sm_output_quantized_idxs[0] == 0, "Currently only quantized" + "output idxs = [0] is supported" + + # if it's non-empty, then it means the output is kept in quantized form + # we'll just add a dequantize node after this node + insert_dequantize_node(node, model.graph) + + # TODO: allow convert_custom_config_dict to override backend_config_dict + # for standalone module + # TODO: think about how to handle `is_reference` here + quantized_standalone_module = convert( + observed_standalone_module, + is_reference=is_reference, + backend_config_dict=backend_config_dict) + parent_name, name = _parent_name(node.target) + # update the modules dict + setattr(modules[parent_name], name, quantized_standalone_module) + modules[str(node.target)] = quantized_standalone_module + +def convert_weighted_module( + node: Node, + modules: Dict[str, torch.nn.Module], + observed_node_names: Set[str], + qconfig_map: Dict[str, QConfigAny], + backend_config_dict: Dict[str, Any]): + """ Convert a weighted module to reference quantized module in the model + If the QConfig of a QAT module is not set, the module will still be converted to + a float module. 
+ + Args: + - node: The call_module node of the observed standalone module + - modules: named_module of original model + - observed_node_names: names for the set of observed fx node, we can skip + this conversion if the node is not observed + """ + original_module = modules[str(node.target)] + qconfig: QConfigAny = original_module.qconfig # type: ignore[assignment] + weight_post_process = None + qat_module_classes = get_qat_module_classes(backend_config_dict) + + if isinstance( + original_module, + qat_module_classes): + # Converting qat module to a float module, we need to attch + # weight fake_quant to the module, weight fake_quant is assumed to be run during + # QAT so we don't need to run it again here + weight_post_process = original_module.weight_fake_quant + original_module = original_module.to_float() # type: ignore[operator] + # change qat module to float module + parent_name, name = _parent_name(node.target) + setattr(modules[parent_name], name, original_module) + + is_observed = node.name in observed_node_names + # If a qconfig is not defined for this node, then skip converting to a reference module + if qconfig is None or has_none_qconfig(node, qconfig_map) or not is_observed: + return + + # skip converting to reference quantized module if the qconfig is not supported + pattern_to_dtype_configs = get_pattern_to_dtype_configs(backend_config_dict) + dtype_configs = pattern_to_dtype_configs.get(type(original_module), []) + if not is_qconfig_supported_by_dtype_configs(qconfig, dtype_configs): + return + + # TODO: rename weight_is_statically_quantized to weight_is_int8_quantized + is_weight_quantized = weight_is_quantized(qconfig) + + # the condition for swapping the module to reference quantized module is: + # weights need to be quantized + if not is_weight_quantized: + return + + fused_module = None + float_module = original_module + # extract the inidividual float_module and fused module + if isinstance(original_module, torch.nn.intrinsic._FusedModule): + fused_module = float_module + float_module = fused_module[0] # type: ignore[index] + + # TODO: move this to the reference quantized module + # weight_qparams or weight_qparams dict + wq_or_wq_dict = {} + if isinstance(float_module, torch.nn.RNNCellBase): + weight_post_process_ih = qconfig.weight() # type: ignore[union-attr, operator] + weight_post_process_hh = qconfig.weight() # type: ignore[union-attr, operator] + weight_post_process_ih(float_module.weight_ih) + weight_post_process_hh(float_module.weight_hh) + weight_qparams_ih = get_qparam_dict(weight_post_process_ih) + weight_qparams_hh = get_qparam_dict(weight_post_process_hh) + wq_or_wq_dict = { + "weight_ih": weight_qparams_ih, + "weight_hh": weight_qparams_hh, + } + elif isinstance(float_module, torch.nn.LSTM): + # format for wq_or_wq_dict (flattened attributes): + # {"weight_ih_l0_scale": ..., "weight_ih_l0_qscheme": ..., ...} + for wn in float_module._flat_weights_names: + if hasattr(float_module, wn) and wn.startswith("weight"): + weight = getattr(float_module, wn) + weight_post_process = qconfig.weight() # type: ignore[union-attr, operator] + if weight_post_process.dtype == torch.qint8: # type: ignore[union-attr] + weight_post_process(weight) # type: ignore[operator, misc] + wq_or_wq_dict[wn] = get_qparam_dict(weight_post_process) + else: + # weight_post_process is None means the original module is not a QAT module + # we need to get weight_post_process from qconfig in this case + if weight_post_process is None: + weight_post_process = qconfig.weight() # type: 
ignore[union-attr, operator] + # run weight observer + # TODO: This is currently a hack for QAT to get the right shapes for scale and zero point. + # In the future, we should require the user to calibrate the model after calling prepare + # Issue: https://github.com/pytorch/pytorch/issues/73941 + weight_post_process(float_module.weight) # type: ignore[operator] + wq_or_wq_dict = get_qparam_dict(weight_post_process) + + # We use the same reference module for all modes of quantization: static, dynamic, weight_only + # root_module_to_quantized_reference_module: module mapping from root (floating point) module class + # to quantized reference module class, e.g. nn.Conv2d to nn.quantized._reference.Conv2d + root_module_to_quantized_reference_module = get_root_module_to_quantized_reference_module(backend_config_dict) + ref_qmodule_cls = root_module_to_quantized_reference_module.get(type(float_module), None) + assert ref_qmodule_cls is not None, f"No reference quantized module class configured for {type(float_module)}" + ref_qmodule = ref_qmodule_cls.from_float(float_module, wq_or_wq_dict) # type: ignore[attr-defined] + if fused_module is not None: + fused_module[0] = ref_qmodule # type: ignore[operator] + else: + parent_name, name = _parent_name(node.target) + setattr(modules[parent_name], name, ref_qmodule) + +def convert_custom_module( + node: Node, + graph: Graph, + modules: Dict[str, torch.nn.Module], + custom_module_class_mapping: Dict[Callable, Callable], + statically_quantized_custom_module_nodes: Set[Node]): + """ Converts an observed custom module to a quantized custom module based on + `custom_module_class_mapping` + For static quantization, we'll also remove the previous `dequantize` node and + attach the observer node for output to the module, the observer for the node + will be converted to a dequantize node instead of quantize-dequantize pairs + later in the graph. In the end we would have a quantized custom module that + has the same interface as a default quantized module in nn.quantized namespace, + i.e. quantized input and quantized output. + + Args: + - node: The call_module node of the observed standalone module + - graph: The graph containing the node + - modules: named_module of original model + - custom_module_class_mapping: mapping from observed custom module class to + quantized custom module class, used to swap custom modules + - statically_quantized_custom_module_nodes: we'll add the custom module node + if we find it is statically quantized, this will be used later when converting + observers to quant/dequant node pairs, if the observed node is a statically + quantized custom module nodes, we'll convert the observer to a dequantize node, + this is to keep the interface the same as the default quantized module. + TODO: maybe we want to redesign this part to align with reference model design + as well, but there has been some discussions around the interface, so we can do + it later. 
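# Illustrative sketch of the interface the custom-module conversion above aims
# for: the swapped-in module consumes the quantized tensor directly, so the
# intermediate dequantize can be dropped (quantize -> dequantize -> module
# becomes quantize -> module).  The two "modules" are plain functions standing
# in for real custom modules; only core quantized-tensor ops are used.
import torch

x = torch.randn(2, 3)
x_q = torch.quantize_per_tensor(x, scale=0.1, zero_point=0, dtype=torch.quint8)

def float_module(t):            # stand-in for a module that needs float input
    return t * 2.0

def quantized_module(t_q):      # stand-in for the swapped-in quantized module
    return float_module(t_q.dequantize())    # dequantize absorbed internally

before = float_module(x_q.dequantize())      # explicit dequantize in the graph
after = quantized_module(x_q)                # dequantize hidden in the module
assert torch.allclose(before, after)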
+ """ + observed_custom_module = modules[str(node.target)] + maybe_obs = maybe_get_observer_for_node(node, modules) + qconfig = observed_custom_module.qconfig + if activation_is_statically_quantized(qconfig): + statically_quantized_custom_module_nodes.add(node) + # remove the previous dequant node + prev_node = node.args[0] + # expecting the input node for a custom module node to be a Node + assert isinstance(prev_node, Node), \ + f"Expecting the argument for custom module node to be a Node, but got {prev_node}" + if prev_node.op == "call_method" and prev_node.target == "dequantize": + # change the connection for custom module, we'll change the input + # of custom module node to quantize node: + # Before: quantize - dequantize - custom - module + # After: quantize - custom - module + # \ - dequantize + node.replace_input_with(prev_node, prev_node.args[0]) + + # Remove the dequantize node if it doesn't have other users + if len(prev_node.users) == 0: + graph.erase_node(prev_node) + + # absorb the following observer into the module conversion + activation_post_process = maybe_get_observer_for_node(node, modules) + assert activation_post_process is not None + observed_custom_module.activation_post_process = activation_post_process + + # swap the observed custom module to quantized custom module + quantized_custom_module_class = get_swapped_custom_module_class( + observed_custom_module, custom_module_class_mapping, qconfig) + quantized_custom_module = \ + quantized_custom_module_class.from_observed(observed_custom_module) + parent_name, name = _parent_name(node.target) + setattr(modules[parent_name], name, quantized_custom_module) + +def convert( + model: GraphModule, is_reference: bool = False, + convert_custom_config_dict: Dict[str, Any] = None, + is_standalone_module: bool = False, + _remove_qconfig_flag: bool = True, + convert_qconfig_dict: Dict[str, Any] = None, + backend_config_dict: Optional[Dict[str, Any]] = None) -> torch.nn.Module: + """ + We will convert an observed model (a module with observer calls) to a reference + quantized model, the rule is simple: + 1. for each observer module call in the graph, we'll convert it to calls to + quantize and dequantize functions based on the observer instance + 2. for weighted operations like linear/conv, we need to convert them to reference + quantized module, this requires us to know whether the dtype configured for the + weight is supported in the backend, this is done in prepare step and the result + is stored in observed_node_names, we can decide whether we need to swap the + module based on this set + + standalone_module means it a submodule that is not inlined in parent module, and will be quantized separately as one unit. Returns a quantized standalone module, whether input/output is quantized is @@ -235,7 +536,7 @@ def convert(model: GraphModule, is_reference: bool = False, """ if convert_custom_config_dict is None: convert_custom_config_dict = {} - patterns, node_name_to_scope, prepare_custom_config_dict, _ = restore_state(model) + node_name_to_scope, prepare_custom_config_dict, observed_node_names = restore_state(model) qconfig_map: Dict[str, QConfigAny] = model._qconfig_map # type: ignore[assignment] # TODO this should be removed now that gpu support for quantization is being supported. 
@@ -264,9 +565,7 @@ def convert(model: GraphModule, is_reference: bool = False, modules_copy = copy.deepcopy(modules) convert_dict_to_ordered_dict(convert_qconfig_dict) if model._is_qat: - additional_qat_module_mapping = prepare_custom_config_dict.get( - "additional_qat_module_mapping", {}) - convert_qconfig_dict = update_qconfig_for_qat(convert_qconfig_dict, additional_qat_module_mapping) + convert_qconfig_dict = update_qconfig_for_qat(convert_qconfig_dict, {}) convert_qconfig_dict = update_qconfig_for_fusion(model, convert_qconfig_dict) compare_prepare_convert_qconfig_dict(prepare_qconfig_dict, convert_qconfig_dict) # type: ignore[arg-type] @@ -283,10 +582,7 @@ def convert(model: GraphModule, is_reference: bool = False, custom_module_classes = get_custom_module_class_keys( convert_custom_config_dict, "observed_to_quantized_custom_module_class") - matches = find_matches( - model.graph, modules, patterns, - qconfig_map, - custom_module_classes=custom_module_classes) + custom_module_class_mapping = convert_custom_config_dict.get("observed_to_quantized_custom_module_class", {}) if model._equalization_qconfig_map is not None: # If we want to do equalization then do the following: @@ -299,354 +595,168 @@ def convert(model: GraphModule, is_reference: bool = False, # for dynamic quant ops or weight only quant ops run_weight_observers(model) - quantized_graph = Graph() - env: Dict[str, Dict[Optional[torch.dtype], Node]] = defaultdict(lambda: defaultdict(Node)) # type: ignore[arg-type] - graph_inputs: List[str] = [] for node in model.graph.nodes: if node.op == 'placeholder': graph_inputs.append(node.name) - def load_non_quantized(n: Node) -> Node: - assert n.name in env, \ - 'trying to load float node but did not find ' + \ - 'node:' + n.name + \ - ' in env: ' + \ - str(env) - dtype_to_node = env[n.name] - if torch.float in dtype_to_node: - return dtype_to_node[torch.float] - elif None in dtype_to_node: - return dtype_to_node[None] - else: - quantized_node = None - for dtype in [torch.quint8, torch.qint8, torch.float16]: - if dtype in dtype_to_node: - quantized_node = dtype_to_node[dtype] - break - assert quantized_node is not None, "Did not find a supported quantized dtype:{}".format(dtype_to_node) - env[n.name][torch.float] = Proxy(quantized_node).dequantize().node - return env[n.name][torch.float] - - def load_quantized(dtype: torch.dtype): - def load_quantized_impl(n: Node): - assert n.name in env, \ - 'trying to load quantized node but did not find node:' + \ - n.name + ' in environment:' + str(env) - dtype_to_node = env[n.name] - local_dtype : Optional[torch.dtype] = dtype - if local_dtype == torch.float and local_dtype not in dtype_to_node: - local_dtype = None - if local_dtype in [torch.float, None]: - return load_non_quantized(n) - assert local_dtype in dtype_to_node, f'Expecting {dtype} in {dtype_to_node}' - return dtype_to_node[local_dtype] - - return load_quantized_impl - - def load_x(n: Node) -> Node: - assert n.name in env, \ - 'node ' + n.name + ' does not exist in environment' - dtype_to_node = env[n.name] - dtypes = [torch.quint8, torch.qint8, torch.float16, torch.float32, None] - for dtype in dtypes: - if dtype in dtype_to_node: - return dtype_to_node[dtype] - raise Exception(f'dtype {dtype} not found in environment: {dtype_to_node} for node {n.name}') - - def load_arg( - quantized: Optional[Union[List[int], Dict[int, torch.dtype], torch.dtype, Tuple[int, ...]]] - ) -> Callable[[Node], Argument]: - """ - Input: quantized, which can be None, torch.dtype, list or tuple - - if 
quantized is None, then we'll load the node as long as it - exists - - if quantized is a dtype, then all args will be - quantized to the specific dtype - - if quantized is an empty list or tuple, then it is the same as load_arg(quantized=torch.float) - - if quantized is a list or tuple, then arg should be a list and - the args with corresponding indexes will be quantized to torch.quint8 - - - Output: fn which takes arg_or_args, and loads them from the - corresponding environment depending on the value of quantized. + # TODO: move this outside of this function + def replace_observer_with_quantize_dequantize_node( + model: torch.nn.Module, + graph: Graph, + node: Node, + modules: Dict[str, torch.nn.Module], + node_name_to_scope: Dict[str, Tuple[str, type]], + qconfig_map: Dict[str, QConfigAny]) -> None: + """ Replace activation_post_process module call node with quantize and + dequantize node + + Before: + ... -> observer_0(x) -> ... + After: + ... -> torch.quantize_per_tensor(x, ...) -> x.dequantize() -> ... """ - assert quantized is None or \ - isinstance(quantized, (tuple, list, dict, torch.dtype)), type(quantized) - if isinstance(quantized, (tuple, list, dict)) and len(quantized) == 0: - # empty tuple or list means nothing is quantized - quantized = torch.float - - def load_arg_impl(arg_or_args): - # we'll update the format of `quantized` - # to better match arg_or_args - updated_quantized: Optional[Union[List[int], torch.dtype, Dict[int, torch.dtype], Tuple[int, ...]]] = quantized - - if isinstance(quantized, (tuple, list)) and \ - len(quantized) == 1 and isinstance(arg_or_args, Node): - # when argument is one Node instead of tuple, we just need to check - # 0 is in the quantized list - if 0 in quantized: - updated_quantized = torch.quint8 - - if updated_quantized is None: - return map_arg(arg_or_args, load_x) - if isinstance(updated_quantized, torch.dtype): - return map_arg( - arg_or_args, - load_quantized(updated_quantized)) - elif isinstance(updated_quantized, (tuple, list)): - assert isinstance(arg_or_args, (tuple, list)), arg_or_args - loaded_args = [] - # for now, we only support quantizing positional arguments - for i, a in enumerate(arg_or_args): - if i in updated_quantized: - # Currently it's hardcoded to torch.quint8, we can extend this - # in the future to support all quantized - # dtypes - loaded_args.append(map_arg(a, load_quantized(torch.quint8))) - else: - loaded_args.append(map_arg(a, load_non_quantized)) - return type(arg_or_args)(loaded_args) - elif isinstance(updated_quantized, dict): - loaded_args = [] - for i, a in enumerate(arg_or_args): - if i in updated_quantized: - loaded_args.append(map_arg(a, load_quantized(updated_quantized[i]))) - else: - loaded_args.append(map_arg(a, load_non_quantized)) - return type(arg_or_args)(loaded_args) - return load_arg_impl - - def node_arg_is_quantized(node_arg: Any) -> bool: - if isinstance(node_arg, Node): - assert node_arg.name in env, \ - 'Expecting node_arg to be in the environment' - if node_arg.name in env: - dtype_to_node = env[node_arg.name] - return any([x in dtype_to_node for x in [torch.quint8, torch.qint8, torch.float16]]) - else: - return False - elif isinstance(node_arg, list): - quantized = map(node_arg_is_quantized, node_arg) - if all(quantized): - return True - elif not any(quantized): - return False - else: - raise Exception( - "partially quantized inputs in list not handled yet") - else: - return False - - def is_output_quantized( - node: Node, obj: QuantizeHandler, qconfig: QConfigAny, - modules: Dict[str, 
torch.nn.Module]) -> bool: - """ Check if output node is quantized or not """ - assert modules is not None - # for some ops the output is quantized only when `is_reference` is True - # and when `is_reference` is False, it has limited qconfig - # support, for example `add` - # ideally this check should not happen here, it should happen either in - # prepare or during lowering, we don't need this check - # after the default path is changed to produce reference patterns - quantized = obj.is_output_quantized(qconfig) - - # Need to get correct quantized/non-quantized state forn the output - # of FixedQParamsQuantizeHandler - # TODO: we may want to try to remove the special case here - # as well - if obj.should_mark_output_quantized_from_input_quantized_status(qconfig): - assert node.op in [ - 'call_module', - 'call_function', - 'call_method'], \ - 'FixedQParamsQuantizeHandler of type ' + node.op + ' is not handled' - # TODO: need to extend this to consider all relevant args instead of just arg[0] - quantized = node_arg_is_quantized(node.args[0]) - - # the output is unquantized if the node is not a CopyNode - # or the activation is not statically quantized - if not activation_is_statically_quantized(qconfig) or \ - not obj.input_output_observed(): - quantized = False - if node_return_type_is_int(node): - quantized = False - - return quantized - - def insert_quantize_node(node: Node, modules: Dict[str, torch.nn.Module]) -> None: - """ Given a activation_post_process module call node, insert a - quantize node""" assert modules is not None assert isinstance(node.target, str) + module_path, prefix = get_module_path_and_prefix(node, node_name_to_scope, qconfig_map) observer_module = modules[node.target] - prev_node = node.args[0] - if observer_module.dtype == torch.float32: - # copy the observer for fp32 dtype - env[node.name][torch.float] = quantized_graph.node_copy( - node, load_non_quantized) - elif isinstance(prev_node, Node) and prev_node.name in env: - # if previous node is already quantized, we'll just remove the - # activation_post_process - prev_dtype_to_node: Dict[Optional[torch.dtype], Node] = env[prev_node.name] - current_dtype: Optional[torch.dtype] = observer_module.dtype # type: ignore[assignment] - if current_dtype in prev_dtype_to_node: - env[node.name][current_dtype] = prev_dtype_to_node[current_dtype] - else: - root_module = modules[""] - assert isinstance(prev_node, Node) - observer_dtype: torch.dtype = observer_module.dtype # type: ignore[assignment] - env[node.name][observer_dtype] = \ - quantize_node( - load_non_quantized(prev_node), - observer_module, node, modules, quantized_graph, - node_name_to_scope, is_input=True) + maybe_quantize_node_info = get_quantize_node_info(observer_module) + # Skip replacing observers to quant/dequant nodes if the qconfigs of all + # consumers and producers of this observer are None + skip_replacement = all([ + has_none_qconfig(n, qconfig_map) for n in + list(node.args) + list(node.users.keys())]) + if skip_replacement or maybe_quantize_node_info is None: + # didn't find correponding quantize op and info for the observer_module + # so we just remove the observer + with graph.inserting_before(node): + node.replace_all_uses_with(node.args[0]) + graph.erase_node(node) else: - # replace activation post process with quantization ops - root_module = modules[""] - assert isinstance(node.args[0], Node) - dtype: torch.dtype = observer_module.dtype # type: ignore[assignment] - env[node.name][dtype] = \ - quantize_node( - load_non_quantized(node.args[0]), - 
observer_module, node, modules, - quantized_graph, - node_name_to_scope, is_input=True) + # otherwise, we can convert the observer moduel call to quantize/dequantize node + node_type, quantize_op, qparams = maybe_quantize_node_info + # replace observer node with quant - dequant node + with graph.inserting_before(node): + input_node = node.args[0] + inputs = [input_node] + for key, value in qparams.items(): + # TODO: we can add the information of whether a value needs to + # be registered as an attribute in qparams dict itself + if key in ['_scale_', '_zero_point_']: + # For scale and zero_point values we register them as buffers in the root module. + # TODO: maybe need more complex attr name here + qparam_node = create_getattr_from_value(model, graph, module_path + prefix + key, value) + inputs.append(qparam_node) + else: + # for qparams that are not scale/zero_point (like axis, dtype) we store them as literals in the graph. + inputs.append(value) + + quantized_node = graph.create_node(node_type, quantize_op, tuple(inputs), {}) + dequantized_node = graph.call_method("dequantize", args=(quantized_node,)) + node.replace_all_uses_with(dequantized_node) + graph.erase_node(node) + + # this is a temporary hack for custom module, we may want to implement + # this properly after the custom module class design is finalized + def replace_observer_with_dequantize_node(node: Node, graph: Graph): + call_custom_module_node = node.args[0] + assert isinstance(call_custom_module_node, Node), \ + f"Expecting the for call custom module node to be a Node, but got {call_custom_module_node}" + node.replace_all_uses_with(call_custom_module_node) + graph.erase_node(node) + insert_dequantize_node(call_custom_module_node, graph) # additional state to override inputs to be quantized, if specified # by the user placeholder_node_seen_cnt = 0 - output_node_seen_cnt = 0 input_quantized_idxs: List[int] = prepare_custom_config_dict.get( "input_quantized_idxs", []) output_quantized_idxs: List[int] = prepare_custom_config_dict.get( "output_quantized_idxs", []) - for node in model.graph.nodes: - if node.op == "output": - cur_output_node_idx = output_node_seen_cnt - output_node_seen_cnt += 1 - if cur_output_node_idx in output_quantized_idxs: - # Result are kept quantized if the user specified the - # output_quantized_idxs override. - graph_output = map_arg(node.args[0], load_x) - else: - graph_output = map_arg(node.args[0], load_non_quantized) - quantized_graph.output(graph_output) - continue - root_node, matched, matched_pattern, obj, qconfig = \ - matches.get(node.name, (None, None, None, None, None)) - if root_node is node: - is_observed_standalone_module_node = ( - node.op == 'call_module' and - is_observed_standalone_module( - modules[node.target]) - ) - if qconfig is None and not is_observed_standalone_module_node: - result = quantized_graph.node_copy( - node, load_non_quantized) - quantized = False - # If there are QAT swapped modules in the graph that we don't want to quantize, rever them back to FP32 ones. 
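# Illustrative sketch of what the quantize/dequantize pair emitted by
# replace_observer_with_quantize_dequantize_node computes at runtime for one
# observer: read the observer's qparams and replace the observer call with
# quantize_per_tensor followed by dequantize.  MinMaxObserver is used as a
# concrete observer; the tolerance is a loose bound on the quantization error.
import torch
from torch.ao.quantization.observer import MinMaxObserver

obs = MinMaxObserver(dtype=torch.quint8)
x = torch.randn(16)
obs(x)                                      # what the prepared model does during calibration
scale, zero_point = obs.calculate_qparams()
x_q = torch.quantize_per_tensor(x, float(scale), int(zero_point), torch.quint8)
x_dq = x_q.dequantize()                     # the pair that replaces observer(x)
assert torch.allclose(x, x_dq, atol=2 * float(scale))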
- if node.op == 'call_module' and type(modules[node.target]) in DEFAULT_QAT_MODULE_MAPPINGS.values(): - float_mod = modules[node.target].to_float() - setattr(model, node.name, float_mod) - with model.graph.inserting_before(node): - new_float_node = model.graph.create_node('call_module', node.name, node.args, node.kwargs) - else: - assert obj is not None - # We will get whether the output is quantized or not before - # convert for standalone module and after convert - # for non-standalone module, since _standalone_module_output_quantized_idxs - # is only available in observed standalone module - if is_observed_standalone_module_node: - out_quant_idxs = modules[node.target]._standalone_module_output_quantized_idxs.tolist() # noqa: B950 - assert len(out_quant_idxs) <= 1, "Currently standalone only support one output" - quantized = 0 in out_quant_idxs - - qconfig = qconfig_map[node.name] - # Note: load_arg can be overwritten in the convert method when used to - # create Node in graph - result = obj.convert( - node, qconfig, modules, quantized_graph, node_name_to_scope, load_arg, is_reference=is_reference, - convert_custom_config_dict=convert_custom_config_dict) - if not is_observed_standalone_module_node: - quantized = is_output_quantized(node, obj, qconfig, modules) - - if quantized: - env[node.name][activation_dtype(qconfig)] = result - else: - env[node.name][torch.float] = result - continue - elif root_node is not None: - if qconfig is None: - # This branch is hit if all of these conditions are met: - # 1. we are in a fusion pattern of multiple nodes (i.e. add-relu) - # 2. the current node is not the "root_node" of the pattern - # 3. quantization for this pattern is disabled - # - # In this case, we need to make sure to populate the env with - # intermediate nodes manually, because the QuantizeHandler.convert - # function will not be called. 
- result = quantized_graph.node_copy( - node, load_non_quantized) - env[node.name][torch.float] = result - continue + if backend_config_dict is None: + backend_config_dict = get_native_backend_config_dict() + root_module_to_quantized_reference_module = get_root_module_to_quantized_reference_module(backend_config_dict) + # convert tuples so that it can work with isinstance(module, tuple_of_classes) + root_module_classes = tuple(root_module_to_quantized_reference_module.keys()) + qat_module_classes = get_qat_module_classes(backend_config_dict) + fused_module_classes = get_fused_module_classes(backend_config_dict) + statically_quantized_custom_module_nodes: Set[Node] = set() - # handle activation post process calls - if node.op == 'call_module' and \ - is_activation_post_process(modules[node.target]): - insert_quantize_node(node, modules) - elif node.op == 'placeholder': + for node in list(model.graph.nodes): + if node.op == 'placeholder': cur_placeholder_node_idx = placeholder_node_seen_cnt placeholder_node_seen_cnt += 1 if cur_placeholder_node_idx in input_quantized_idxs: - env[node.name][torch.quint8] = quantized_graph.node_copy( - node, load_non_quantized) - else: - env[node.name][torch.float] = \ - quantized_graph.node_copy(node, load_non_quantized) - else: - # copy quantized or non-quantized node - # get_tensor_info_node like shape works for both - # quantized and non-quantized input and output a non-Tensor - # (we use None for dtype currently for non-Tensors) - if is_get_tensor_info_node(node): - env[node.name][None] = \ - quantized_graph.node_copy(node, load_x) + # Inputs are assumed to be quantized if the user specifid the + # input_quantized_idxs override. + # we need to dequantize the inputs since all operators took + # floating point inputs in reference quantized models + insert_dequantize_node(node, model.graph) + elif node.op == "output": + # If the argument is empty we don't need to do anything + if len(output_quantized_idxs) == 0: + continue + # Result are kept quantized if the user specified the + # output_quantized_idxs override. 
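# Illustrative sketch of the recursive walk that maybe_recursive_remove_dequantize
# performs for the output handling just below: visit Nodes as well as
# list/tuple elements and dict values.  Strings stand in for fx Nodes and the
# visit callback stands in for the replace_input_with rewrite.
from typing import Any, Callable

def walk(arg: Any, visit: Callable[[Any], None]) -> None:
    if isinstance(arg, (list, tuple)):
        for item in arg:
            walk(item, visit)
    elif isinstance(arg, dict):
        for item in arg.values():
            walk(item, visit)
    else:
        visit(arg)      # in convert.py this is where the dequantize producer is spliced out

seen = []
walk(["a", ("b", {"k": "c"})], seen.append)
assert seen == ["a", "b", "c"]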
+ # Remove the dequantize operator for the node in the end if any + return_node = node + output = node.args[0] + # outputs can be Node, list, tuple, dict, other cases are not supported yet + if isinstance(output, (list, tuple)): + for idx in output_quantized_idxs: + maybe_recursive_remove_dequantize(output[idx], return_node, model.graph) + elif isinstance(output, (Node, dict)): + # we treat dict as a single argument currently, but it can be extended + # to support {"key": dtype} after we change output_quantized_idxs to + # dict + if 0 in output_quantized_idxs: + maybe_recursive_remove_dequantize(output, return_node, model.graph) else: - env[node.name][torch.float] = \ - quantized_graph.node_copy(node, load_non_quantized) - - # remove activation post process - act_post_process_removed_graph = Graph() - remove_env: Dict[str, Node] = {} + warnings.warn(f"Unsupported node type for output_quantized_idxs: {type(output)}") + elif node.op == "call_module": + if is_activation_post_process(modules[node.target]): + observed_node = node.args[0] + if observed_node in statically_quantized_custom_module_nodes: + replace_observer_with_dequantize_node(node, model.graph) + else: + replace_observer_with_quantize_dequantize_node( + model, model.graph, node, modules, node_name_to_scope, + qconfig_map) + elif is_observed_standalone_module(modules[node.target]): + convert_standalone_module( + node, modules, model, is_reference, backend_config_dict) + elif type(modules[node.target]) in set( + root_module_classes).union(qat_module_classes).union(fused_module_classes): + # extra check for fused module classes to make sure they are fused module classes + # of target modules + if type(modules[node.target]) in fused_module_classes and \ + type(modules[node.target][0]) not in root_module_classes: + continue + convert_weighted_module( + node, modules, observed_node_names, qconfig_map, backend_config_dict) + elif type(modules[node.target]) in custom_module_classes: + convert_custom_module( + node, model.graph, modules, custom_module_class_mapping, + statically_quantized_custom_module_nodes) - def load_arg_remove(a: Argument) -> Argument: - return map_arg(a, lambda node: remove_env[node.name]) + preserved_attributes = set(convert_custom_config_dict.get("preserved_attributes", [])) + model = QuantizedGraphModule(model, copy.deepcopy(model.graph), preserved_attributes) - for node in quantized_graph.nodes: - if node.op == 'output': - act_post_process_removed_graph.output( - map_arg(node.args[0], load_arg_remove)) - continue - if node.op == 'call_module' and \ - is_activation_post_process(modules[node.target]): - # remove activation post process node - remove_env[node.name] = remove_env[node.args[0].name] - else: - remove_env[node.name] = act_post_process_removed_graph.node_copy( - node, load_arg_remove) + # remove deadcode after converting observers to quant/dequant ops + model.graph.eliminate_dead_code() + model.recompile() - # removes qconfig and activation_post_process modules - if _remove_qconfig_flag: - _remove_qconfig(model) - preserved_attributes = set(convert_custom_config_dict.get("preserved_attributes", [])) - model = QuantizedGraphModule(model, act_post_process_removed_graph, preserved_attributes) + # TODO: maybe move this to quantize_fx.py if not is_reference: model = duplicate_dequantize_node(model) - model = fold_weight(model, node_name_to_scope) - model = lower_to_fbgemm(model) + model = duplicate_quantize_dynamic_node(model) + model = lower_to_fbgemm(model, qconfig_map, node_name_to_scope) model = 
remove_quant_dequant_pairs(model) model = remove_extra_dequantize(model) + # TODO: this looks hacky, we want to check why we need this and see if we can + # remove this + # removes qconfig and activation_post_process modules + if _remove_qconfig_flag: + _remove_qconfig(model) return model diff --git a/torch/ao/quantization/fx/fuse.py b/torch/ao/quantization/fx/fuse.py index 60e7ccd28a59..0736f8273541 100644 --- a/torch/ao/quantization/fx/fuse.py +++ b/torch/ao/quantization/fx/fuse.py @@ -4,111 +4,130 @@ map_arg ) from torch.fx.graph import Graph -from ..utils import ( - get_combined_dict -) from .graph_module import ( FusedGraphModule ) -from .match_utils import is_match +from .match_utils import ( + is_match, + MatchAllNode, +) from .pattern_utils import ( - get_default_fusion_patterns, + sorted_patterns_dict, ) -from .backend_config.utils import get_fusion_pattern_to_fuse_handler_cls -from .backend_config.utils import get_fuser_method_mapping +from ..backend_config.utils import get_fuser_method_mapping +from ..backend_config.utils import get_fusion_pattern_to_root_node_getter +from ..backend_config.utils import get_fusion_pattern_to_extra_inputs_getter +from ..backend_config import get_native_backend_config_dict +from .backend_config_utils import get_fusion_pattern_to_fuse_handler_cls from .fusion_patterns import * # noqa: F401,F403 from typing import Callable, Tuple, Dict, Any, Optional, List -from .quantization_types import Pattern, NodePattern - -class Fuser: - def fuse( - self, - model: GraphModule, - is_qat: bool, - fuse_custom_config_dict: Optional[Dict[str, Any]] = None, - backend_config_dict: Optional[Dict[str, Any]] = None, - ) -> GraphModule: - if fuse_custom_config_dict is None: - fuse_custom_config_dict = {} - - input_root = model - input_graph = model.graph - self.modules = dict(input_root.named_modules()) - - if backend_config_dict is None: - additional_fusion_patterns = \ - fuse_custom_config_dict.get("additional_fusion_pattern", {}) - fusion_pattern_to_fuse_handler_cls = get_combined_dict( - get_default_fusion_patterns(), additional_fusion_patterns) - fuser_method_mapping = None +from torch.ao.quantization.quantization_types import Pattern, NodePattern + +def fuse( + model: GraphModule, + is_qat: bool, + fuse_custom_config_dict: Optional[Dict[str, Any]] = None, + backend_config_dict: Optional[Dict[str, Any]] = None, +) -> GraphModule: + if fuse_custom_config_dict is None: + fuse_custom_config_dict = {} + + input_root = model + input_graph = model.graph + named_modules = dict(input_root.named_modules()) + + if backend_config_dict is None: + backend_config_dict = get_native_backend_config_dict() + + fusion_pattern_to_fuse_handler_cls = sorted_patterns_dict(get_fusion_pattern_to_fuse_handler_cls(backend_config_dict)) + fuser_method_mapping = get_fuser_method_mapping(backend_config_dict) + fusion_pattern_to_root_node_getter = get_fusion_pattern_to_root_node_getter(backend_config_dict) + fusion_pattern_to_extra_inputs_getter = get_fusion_pattern_to_extra_inputs_getter(backend_config_dict) + + # find fusion + fusion_pairs = _find_matches( + input_root, input_graph, fusion_pattern_to_fuse_handler_cls) + fused_graph = Graph() + env: Dict[Any, Any] = {} + + def load_arg(a): + return map_arg(a, lambda node: env[node.name]) + + def default_root_node_getter(node_pattern): + while not isinstance(node_pattern[-1], Node): + node_pattern = node_pattern[-1] + return node_pattern[-1] + + for node in input_graph.nodes: + maybe_last_node, pattern, matched_node_pattern, obj, 
node_to_subpattern = \ + fusion_pairs.get(node.name, (None, None, None, None, None)) + # get the corresponding subpattern for the current node + if node_to_subpattern is not None: + node_subpattern = node_to_subpattern.get(node, None) else: - fusion_pattern_to_fuse_handler_cls = get_fusion_pattern_to_fuse_handler_cls(backend_config_dict) - fuser_method_mapping = get_fuser_method_mapping(backend_config_dict) - # find fusion - fusion_pairs = self._find_matches( - input_root, input_graph, fusion_pattern_to_fuse_handler_cls) - self.fused_graph = Graph() - env: Dict[Any, Any] = {} - - def load_arg(a): - return map_arg(a, lambda node: env[node.name]) - - def get_root_node(node_pattern): - while not isinstance(node_pattern[-1], Node): - node_pattern = node_pattern[-1] - return node_pattern[-1] - - for node in input_graph.nodes: - maybe_last_node, pattern, matched_node_pattern, obj = \ - fusion_pairs.get(node.name, (None, None, None, None)) - if maybe_last_node is node: - assert obj is not None - # TODO: currently we hard code the root node, which only works for - # a sequence of ops and assume the root node is the last node, - # we want to make this more general to support more complex patterns - root_node = get_root_node(matched_node_pattern) # type: ignore[index] - env[node.name] = obj.fuse( - self, load_arg, root_node, matched_node_pattern, # type: ignore[arg-type] - fuse_custom_config_dict, fuser_method_mapping, is_qat) - elif maybe_last_node is None: - env[node.name] = self.fused_graph.node_copy(node, load_arg) - # node matched in patterns and is not root is removed here - - preserved_attributes = set(fuse_custom_config_dict.get("preserved_attributes", [])) - model = FusedGraphModule(input_root, self.fused_graph, preserved_attributes) - return model - - def _find_matches( - self, root: GraphModule, graph: Graph, - patterns: Dict[Pattern, Callable] - ) -> Dict[str, Tuple[Node, Pattern, NodePattern, FuseHandler]]: - modules = dict(root.named_modules()) - match_map : Dict[str, Tuple[Node, Pattern, NodePattern, FuseHandler]] = {} # node name -> (root_node, match_value) - - def apply_match(pattern, node, match, matched_node_pattern): - if isinstance(pattern, tuple): - s, *args = pattern - current_node_pattern: List[Node] = [] - apply_match(s, node, match, current_node_pattern) - for subpattern, arg in zip(args, node.args): - apply_match(subpattern, arg, match, current_node_pattern) - matched_node_pattern.append(tuple(current_node_pattern)) - else: - # the first pattern matches will take precedence - if node.name not in match_map: - matched_node_pattern.append(node) - root_node, pattern, handler = match - match_map[node.name] = (root_node, pattern, matched_node_pattern, handler) + node_subpattern = None + if maybe_last_node is node: + assert obj is not None + root_node_getter = fusion_pattern_to_root_node_getter.get(pattern, default_root_node_getter) + root_node = root_node_getter(matched_node_pattern) # type: ignore[index] + extra_inputs_getter = fusion_pattern_to_extra_inputs_getter.get(pattern, None) + extra_inputs = [] + if extra_inputs_getter is not None: + extra_inputs = extra_inputs_getter(matched_node_pattern) + # TODO: add validation that root_node is a module and has the same type + # as the root_module in the configuration + env[node.name] = obj.fuse( + load_arg, named_modules, fused_graph, root_node, extra_inputs, matched_node_pattern, # type: ignore[arg-type] + fuse_custom_config_dict, fuser_method_mapping, is_qat) + elif maybe_last_node is None or node_subpattern is MatchAllNode: + 
env[node.name] = fused_graph.node_copy(node, load_arg) + # node matched in patterns and is not root is removed here + + preserved_attributes = set(fuse_custom_config_dict.get("preserved_attributes", [])) + model = FusedGraphModule(input_root, fused_graph, preserved_attributes) + return model - for node in reversed(graph.nodes): +def _find_matches( + root: GraphModule, graph: Graph, + patterns: Dict[Pattern, Callable] +) -> Dict[str, Tuple[Node, Pattern, NodePattern, FuseHandler, Dict[Node, Any]]]: + modules = dict(root.named_modules()) + # node name -> (root_node, match_value) + match_map : Dict[ + str, Tuple[Node, Pattern, NodePattern, FuseHandler, Dict[Node, Any]]] = {} + # a map from node to the matched subpattern + node_to_subpattern: Dict[Node, Any] = {} + + # TODO: dedup with quantization matching function in match_utils.py + def apply_match(pattern, node, match, matched_node_pattern, node_to_subpattern): + if isinstance(pattern, tuple): + s, *args = pattern + current_node_pattern: List[Node] = [] + apply_match(s, node, match, current_node_pattern, node_to_subpattern) + for subpattern, arg in zip(args, node.args): + apply_match(subpattern, arg, match, current_node_pattern, node_to_subpattern) + matched_node_pattern.append(tuple(current_node_pattern)) + else: + # the first pattern matches will take precedence if node.name not in match_map: - for pattern, value in patterns.items(): - matched_node_pattern: List[Node] = [] - if is_match(modules, node, pattern): - apply_match(pattern, node, (node, pattern, value(self, node)), matched_node_pattern) + matched_node_pattern.append(node) + # MatchAllNode here is actually MatchAllInputNode which should not + # be added to match_map + if pattern is not MatchAllNode: + node_to_subpattern[node] = pattern + root_node, pattern, handler = match + match_map[node.name] = (root_node, pattern, matched_node_pattern, handler, node_to_subpattern) + + for node in reversed(graph.nodes): + if node.name not in match_map: + for pattern, value in patterns.items(): + matched_node_pattern: List[Node] = [] + if is_match(modules, node, pattern): + apply_match(pattern, node, (node, pattern, value(node)), matched_node_pattern, node_to_subpattern) + break - return match_map + return match_map diff --git a/torch/ao/quantization/fx/fusion_patterns.py b/torch/ao/quantization/fx/fusion_patterns.py index 2a0b9ff6f1e5..95b0c96693a5 100644 --- a/torch/ao/quantization/fx/fusion_patterns.py +++ b/torch/ao/quantization/fx/fusion_patterns.py @@ -1,10 +1,7 @@ import torch -from torch.fx.graph import Node -from .pattern_utils import ( - register_fusion_pattern, -) +from torch.fx.graph import Node, Graph from ..utils import _parent_name -from .quantization_types import QuantizerCls, NodePattern, Pattern +from torch.ao.quantization.quantization_types import NodePattern, Pattern from ..fuser_method_mappings import get_fuser_method_new from abc import ABC, abstractmethod from typing import Any, Callable, Dict, Optional, Union, List @@ -18,97 +15,76 @@ class FuseHandler(ABC): """ Base handler class for the fusion patterns """ - def __init__(self, quantizer: QuantizerCls, node: Node): + def __init__(self, node: Node): pass @abstractmethod def fuse(self, - quantizer: QuantizerCls, load_arg: Callable, + named_modules: Dict[str, torch.nn.Module], + fused_graph: Graph, root_node: Node, + extra_inputs: List[Any], matched_node_pattern: NodePattern, fuse_custom_config_dict: Dict[str, Any], fuser_method_mapping: Optional[Dict[Pattern, Union[torch.nn.Sequential, Callable]]], is_qat: bool) -> Node: 
pass -@register_fusion_pattern((torch.nn.ReLU, torch.nn.Conv1d)) -@register_fusion_pattern((torch.nn.ReLU, torch.nn.Conv2d)) -@register_fusion_pattern((torch.nn.ReLU, torch.nn.Conv3d)) -@register_fusion_pattern((torch.nn.functional.relu, torch.nn.Conv1d)) -@register_fusion_pattern((torch.nn.functional.relu, torch.nn.Conv2d)) -@register_fusion_pattern((torch.nn.functional.relu, torch.nn.Conv3d)) -@register_fusion_pattern((torch.nn.functional.relu, torch.nn.Linear)) -@register_fusion_pattern((torch.nn.ReLU, torch.nn.Linear)) -@register_fusion_pattern((torch.nn.functional.relu, torch.nn.BatchNorm2d)) -@register_fusion_pattern((torch.nn.ReLU, torch.nn.BatchNorm2d)) -@register_fusion_pattern((torch.nn.functional.relu, torch.nn.BatchNorm3d)) -@register_fusion_pattern((torch.nn.ReLU, torch.nn.BatchNorm3d)) -@register_fusion_pattern((torch.nn.BatchNorm1d, torch.nn.Conv1d)) -@register_fusion_pattern((torch.nn.BatchNorm2d, torch.nn.Conv2d)) -@register_fusion_pattern((torch.nn.BatchNorm3d, torch.nn.Conv3d)) -@register_fusion_pattern((torch.nn.BatchNorm1d, torch.nn.Linear)) -@register_fusion_pattern((torch.nn.ReLU, (torch.nn.BatchNorm1d, torch.nn.Conv1d))) -@register_fusion_pattern((torch.nn.ReLU, (torch.nn.BatchNorm2d, torch.nn.Conv2d))) -@register_fusion_pattern((torch.nn.ReLU, (torch.nn.BatchNorm3d, torch.nn.Conv3d))) -@register_fusion_pattern((torch.nn.functional.relu, (torch.nn.BatchNorm1d, torch.nn.Conv1d))) -@register_fusion_pattern((torch.nn.functional.relu, (torch.nn.BatchNorm2d, torch.nn.Conv2d))) -@register_fusion_pattern((torch.nn.functional.relu, (torch.nn.BatchNorm3d, torch.nn.Conv3d))) -@register_fusion_pattern((torch.nn.BatchNorm1d, torch.nn.ConvTranspose1d)) -@register_fusion_pattern((torch.nn.BatchNorm2d, torch.nn.ConvTranspose2d)) -@register_fusion_pattern((torch.nn.BatchNorm3d, torch.nn.ConvTranspose3d)) +# TODO: move this to backend_config.fuse_handler class DefaultFuseHandler(FuseHandler): def __init__( self, - quantizer: QuantizerCls, node: Node): - super().__init__(quantizer, node) + super().__init__(node) def fuse(self, - quantizer: QuantizerCls, load_arg: Callable, + named_modules: Dict[str, torch.nn.Module], + fused_graph: Graph, root_node: Node, + extra_inputs: List[Any], matched_node_pattern: NodePattern, fuse_custom_config_dict: Dict[str, Any], fuser_method_mapping: Optional[Dict[Pattern, Union[torch.nn.Sequential, Callable]]], is_qat: bool) -> Node: - additional_fuser_method_mapping = fuse_custom_config_dict.get("additional_fuser_method_mapping", {}) assert root_node.op == "call_module", "Expecting module node to be a call_module Node" - root_module = quantizer.modules[root_node.target] - assert len(additional_fuser_method_mapping) == 0, "Fusion implementation is " - "undergoing changes, additoinal_fuser_method_mapping is not supported currently." - def get_modules(pattern, modules): + root_module = named_modules[str(root_node.target)] + + def get_modules(pattern): """ Given a node pattern, extract the corresponding modules e.g. 
input: (relu_node, (bn_node, conv_node)) output: (relu_module, (bn_module, conv_module)) """ if isinstance(pattern, (tuple, list)): n, *args = pattern - get_modules(n, modules) - arg_modules: List[torch.nn.Module] = [] + modules: List[torch.nn.Module] = [] + modules.append(get_modules(n)) for a in args: - get_modules(a, arg_modules) - arg_modules = tuple(arg_modules) if len(arg_modules) > 1 else arg_modules[0] # type: ignore[assignment] - modules.append(arg_modules) + modules.append(get_modules(a)) + return tuple(modules) else: n = pattern if n.op == "call_module": - modules.append(quantizer.modules[n.target]) + return named_modules[n.target] elif n.op == "call_function" and n.target == torch.nn.functional.relu: relu = torch.nn.ReLU() relu.training = root_module.training - modules.append(relu) + return relu + elif n.op == "call_function" or n.op == "call_method": + return n.target else: - modules.append(MatchAllNode) - return tuple(modules) + return MatchAllNode # since relu can be used multiple times, we'll need to create a relu module for each match - matched_modules = get_modules(matched_node_pattern, []) + matched_modules = get_modules(matched_node_pattern) def get_matched_types(m): if isinstance(m, tuple): return tuple(map(get_matched_types, m)) - return type(m) + if isinstance(m, torch.nn.Module): + return type(m) + return m matched_module_types = get_matched_types(matched_modules) module_parent_name, module_name = _parent_name(root_node.target) @@ -116,6 +92,12 @@ def get_matched_types(m): # TODO: change the signature for fuser_method to take matched module patterns # as input fused_module = fuser_method(is_qat, *matched_modules) - # TODO: maybe add a pass to cleanup bn modules? - setattr(quantizer.modules[module_parent_name], module_name, fused_module) - return quantizer.fused_graph.node_copy(root_node, load_arg) + setattr(named_modules[module_parent_name], module_name, fused_module) + extra_args = [] + for input in extra_inputs: + extra_args.append(load_arg(input)) + node = fused_graph.node_copy(root_node, load_arg) + args = list(node.args) + args.extend(extra_args) + node.args = tuple(args) + return node diff --git a/torch/ao/quantization/fx/graph_module.py b/torch/ao/quantization/fx/graph_module.py index ef43a42d030f..2e37e4a557e4 100644 --- a/torch/ao/quantization/fx/graph_module.py +++ b/torch/ao/quantization/fx/graph_module.py @@ -18,7 +18,7 @@ def __init__(self, root: Union[torch.nn.Module, Dict[str, Any]], graph: Graph, p def __deepcopy__(self, memo): fake_mod = torch.nn.Module() fake_mod.__dict__ = copy.deepcopy(self.__dict__) - return FusedGraphModule(fake_mod, self.graph, self.preserved_attr_names) + return FusedGraphModule(fake_mod, copy.deepcopy(self.graph), copy.deepcopy(self.preserved_attr_names)) class ObservedGraphModule(GraphModule): @@ -45,7 +45,7 @@ def __init__(self, root: Union[torch.nn.Module, Dict[str, Any]], graph: Graph, p def __deepcopy__(self, memo): fake_mod = torch.nn.Module() fake_mod.__dict__ = copy.deepcopy(self.__dict__) - return ObservedGraphModule(fake_mod, self.graph, self.preserved_attr_names) + return ObservedGraphModule(fake_mod, copy.deepcopy(self.graph), copy.deepcopy(self.preserved_attr_names)) def is_observed_module(module: Any) -> bool: return isinstance(module, ObservedGraphModule) @@ -60,7 +60,7 @@ def __init__(self, root: Union[torch.nn.Module, Dict[str, Any]], graph: Graph, p def __deepcopy__(self, memo): fake_mod = torch.nn.Module() fake_mod.__dict__ = copy.deepcopy(self.__dict__) - return ObservedStandaloneGraphModule(fake_mod, 
self.graph, self.preserved_attr_names) + return ObservedStandaloneGraphModule(fake_mod, copy.deepcopy(self.graph), copy.deepcopy(self.preserved_attr_names)) def is_observed_standalone_module(module: Any) -> bool: return isinstance(module, ObservedStandaloneGraphModule) @@ -104,4 +104,4 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, def __deepcopy__(self, memo): fake_mod = torch.nn.Module() fake_mod.__dict__ = copy.deepcopy(self.__dict__) - return QuantizedGraphModule(fake_mod, self.graph, self.preserved_attr_names) + return QuantizedGraphModule(fake_mod, copy.deepcopy(self.graph), copy.deepcopy(self.preserved_attr_names)) diff --git a/torch/ao/quantization/fx/lower_to_fbgemm.py b/torch/ao/quantization/fx/lower_to_fbgemm.py index fc76d135ee80..c8c413cacfee 100644 --- a/torch/ao/quantization/fx/lower_to_fbgemm.py +++ b/torch/ao/quantization/fx/lower_to_fbgemm.py @@ -1,8 +1,14 @@ from ._lower_to_native_backend import _lower_to_native_backend from .graph_module import QuantizedGraphModule +from ..qconfig import QConfigAny +from typing import Dict, Tuple -def lower_to_fbgemm(model: QuantizedGraphModule) -> QuantizedGraphModule: +def lower_to_fbgemm( + model: QuantizedGraphModule, + qconfig_map: Dict[str, QConfigAny], + node_name_to_scope: Dict[str, Tuple[str, type]] +) -> QuantizedGraphModule: """ Lower a quantized reference model (with reference quantized operator patterns) to fbgemm """ - return _lower_to_native_backend(model) + return _lower_to_native_backend(model, qconfig_map, node_name_to_scope) diff --git a/torch/ao/quantization/fx/lower_to_qnnpack.py b/torch/ao/quantization/fx/lower_to_qnnpack.py index 0a0ea9cd248c..e79de696e5e0 100644 --- a/torch/ao/quantization/fx/lower_to_qnnpack.py +++ b/torch/ao/quantization/fx/lower_to_qnnpack.py @@ -1,8 +1,14 @@ from ._lower_to_native_backend import _lower_to_native_backend from .graph_module import QuantizedGraphModule +from ..qconfig import QConfigAny +from typing import Dict, Tuple -def lower_to_qnnpack(model: QuantizedGraphModule) -> QuantizedGraphModule: +def lower_to_qnnpack( + model: QuantizedGraphModule, + qconfig_map: Dict[str, QConfigAny], + node_name_to_scope: Dict[str, Tuple[str, type]] +) -> QuantizedGraphModule: """ Lower a quantized reference model (with reference quantized operator patterns) to qnnpack """ - return _lower_to_native_backend(model) + return _lower_to_native_backend(model, qconfig_map, node_name_to_scope) diff --git a/torch/ao/quantization/fx/match_utils.py b/torch/ao/quantization/fx/match_utils.py index e759583b44a8..46f7b515e860 100644 --- a/torch/ao/quantization/fx/match_utils.py +++ b/torch/ao/quantization/fx/match_utils.py @@ -4,15 +4,16 @@ Graph, Node, ) -from .quantization_types import Pattern +from torch.ao.quantization.quantization_types import Pattern from .quantization_patterns import ( QuantizeHandler, - CustomModuleQuantizeHandler, - StandaloneModuleQuantizeHandler, ) from ..qconfig import ( QConfigAny, ) +from ..utils import ( + MatchAllNode +) from .graph_module import ( is_observed_standalone_module, ) @@ -22,12 +23,6 @@ MatchResult = Tuple[Node, List[Node], Optional[Pattern], QuantizeHandler, QConfigAny] -# TODO: maybe rename this to MatchInputNode -class MatchAllNode: - """ A node pattern that matches all nodes - """ - pass - # Note: The order of patterns is important! match function will take whatever is matched first, so we'll # need to put the fusion patterns before single patterns. For example, add_relu should be registered come before relu. 
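The ordering requirement in the note above is what the sorted_patterns_dict helper (added to pattern_utils.py later in this patch) enforces: candidate patterns are ranked by their total node count so that a fused pattern is tried before any of its sub-patterns. A small self-contained sketch of the same idea, with placeholder handler strings standing in for the real QuantizeHandler classes:

    import torch
    from collections import OrderedDict

    def pattern_len(pattern):
        # count every leaf entry of a (possibly nested) pattern tuple
        if isinstance(pattern, tuple):
            return sum(pattern_len(p) for p in pattern)
        return 1

    patterns = {
        torch.nn.ReLU: "relu_handler",
        (torch.nn.ReLU, torch.nn.Conv2d): "conv_relu_handler",
        (torch.nn.ReLU, (torch.nn.BatchNorm2d, torch.nn.Conv2d)): "conv_bn_relu_handler",
    }
    # longer (fused) patterns first, mirroring sorted_patterns_dict
    ordered = OrderedDict(
        sorted(patterns.items(),
               key=lambda kv: -pattern_len(kv[0]) if isinstance(kv[0], tuple) else 1))
    # iteration order is now conv+bn+relu, then conv+relu, then bare relu, so
    # the matcher sees fused patterns before the single-op pattern
    print(list(ordered))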
# decorators are applied in the reverse order we see. Also when we match the nodes in the graph with these patterns, @@ -79,6 +74,7 @@ def find_matches( graph: Graph, modules: Dict[str, torch.nn.Module], patterns: Dict[Pattern, QuantizeHandler], + root_node_getter_mapping: Dict[Pattern, Callable], qconfig_map: Dict[str, QConfigAny], standalone_module_names: List[str] = None, standalone_module_classes: List[Callable] = None, @@ -117,29 +113,80 @@ def find_matches( match_map: Dict[str, MatchResult] = {} all_matched : Set[str] = set() - def record_match(pattern, node, matched): + def _recursive_record_node_in_match_map( + last_node, + match_map, + node_pattern, + matched_node_pattern, + pattern, + match_value, + qconfig): + if isinstance(node_pattern, Node): + match_map[node_pattern.name] = ( + last_node, matched_node_pattern, pattern, match_value, qconfig) + else: + for n in node_pattern: + _recursive_record_node_in_match_map(last_node, match_map, n, matched_node_pattern, pattern, match_value, qconfig) + + # TODO: 1. merge with fuse matcher 2. document the code + def record_match( + pattern, + node, + last_node, + matched_node_pattern, + match_map): if isinstance(pattern, tuple): s, *args = pattern - record_match(s, node, matched) + current_node_pattern: List[Node] = [] + record_match( + s, + node, + last_node, + matched_node_pattern, + match_map) if pattern[0] is not getattr: for subpattern, arg in zip(args, node.args): - record_match(subpattern, arg, matched) + record_match( + subpattern, + arg, + node, + current_node_pattern, + match_map) + if len(current_node_pattern) > 1: + matched_node_pattern.append(tuple(current_node_pattern)) + else: + matched_node_pattern.append(current_node_pattern[0]) else: - matched.append(node) + matched_node_pattern.append(node) - cache_for_no_tensor_check: Dict[Node, bool] = dict() for node in reversed(graph.nodes): if node.name not in match_map and node.name not in all_matched: - for pattern, value in patterns.items(): - if is_match(modules, node, pattern): - matched: List[Any] = [] - record_match(pattern, node, matched) - for n in matched: - match_map[n.name] = ( - node, matched, pattern, value(node, modules), # type: ignore[operator] - qconfig_map[n.name]) - all_matched.add(n.name) - # break after finding the first match + for pattern, quantize_handler_cls in patterns.items(): + root_node_getter = root_node_getter_mapping.get(pattern, None) + if is_match(modules, node, pattern) and node.name not in match_map: + matched_node_pattern: List[Node] = [] + record_match( + pattern, + node, + node, + matched_node_pattern, + match_map) + quantize_handler = quantize_handler_cls( # type: ignore[operator] + matched_node_pattern, + modules, + root_node_getter) + last_node = node + # record the match for all nodes in the pattern + _recursive_record_node_in_match_map( + last_node, + match_map, + # we need to record all nodes in the matched pattern in the match_map + matched_node_pattern, + # this is a part of the value corresponding to the node + matched_node_pattern, + pattern, + quantize_handler, + qconfig_map[node.name]) break # add custom module instances to the match result @@ -149,7 +196,7 @@ def record_match(pattern, node, matched): type(modules[node.target]) in custom_module_classes: custom_module_qconfig = qconfig_map[node.name] match_map[node.name] = ( - node, [node], None, CustomModuleQuantizeHandler(node, modules), + node, node, None, QuantizeHandler(node, modules, is_custom_module=True), custom_module_qconfig) def is_standalone_module(node_target: str, 
modules: Dict[str, torch.nn.Module]): @@ -165,10 +212,10 @@ def is_standalone_module(node_target: str, modules: Dict[str, torch.nn.Module]): (is_standalone_module(node.target, modules) or is_observed_standalone_module(modules[node.target])): # add node to matched nodes - custom_module_qconfig = qconfig_map[node.name] + standalone_module_qconfig = qconfig_map[node.name] match_map[node.name] = ( - node, [node], None, - StandaloneModuleQuantizeHandler(node, modules), - custom_module_qconfig) + node, node, None, + QuantizeHandler(node, modules, is_standalone_module=True), + standalone_module_qconfig) return match_map diff --git a/torch/ao/quantization/fx/pattern_utils.py b/torch/ao/quantization/fx/pattern_utils.py index bba17d730d6a..e7c4d70fc7f3 100644 --- a/torch/ao/quantization/fx/pattern_utils.py +++ b/torch/ao/quantization/fx/pattern_utils.py @@ -3,12 +3,12 @@ from torch.fx.graph import ( Node, ) -from .quantization_types import Pattern +from torch.ao.quantization.quantization_types import Pattern from ..qconfig import QConfigAny from ..fake_quantize import FixedQParamsFakeQuantize # from .quantization_patterns import BinaryOpQuantizeHandler from ..observer import ObserverBase - +import copy # TODO(future PR): fix the typing on QuantizeHandler (currently a circular dependency) QuantizeHandler = Any @@ -25,13 +25,13 @@ def insert(fn): return insert def get_default_fusion_patterns() -> Dict[Pattern, QuantizeHandler]: - return DEFAULT_FUSION_PATTERNS + return copy.copy(DEFAULT_FUSION_PATTERNS) DEFAULT_QUANTIZATION_PATTERNS = OrderedDict() # Mapping from pattern to activation_post_process(observer/fake_quant) constructor for output activation # e.g. pattern: torch.sigmoid, -# output_activation_post_process: default_affine_fixed_qparams_fake_quant +# output_activation_post_process: default_fixed_qparams_range_0to1_fake_quant DEFAULT_OUTPUT_FAKE_QUANTIZE_MAP = dict() DEFAULT_OUTPUT_OBSERVER_MAP = dict() @@ -47,15 +47,15 @@ def insert(fn): # Get patterns for both static quantization and qat def get_default_quant_patterns() -> Dict[Pattern, QuantizeHandler]: - return DEFAULT_QUANTIZATION_PATTERNS + return copy.copy(DEFAULT_QUANTIZATION_PATTERNS) # a map from pattern to output activation post process constructor # e.g. torch.sigmoid -> default_affine_fixed_qparam_fake_quant def get_default_output_activation_post_process_map(is_training) -> Dict[Pattern, ObserverBase]: if is_training: - return DEFAULT_OUTPUT_FAKE_QUANTIZE_MAP + return copy.copy(DEFAULT_OUTPUT_FAKE_QUANTIZE_MAP) else: - return DEFAULT_OUTPUT_OBSERVER_MAP + return copy.copy(DEFAULT_OUTPUT_OBSERVER_MAP) # Example use of register pattern function: # @register_fusion_pattern(torch.nn.ReLU, (torch.nn.BatchNorm2d, torch.nn.Conv2d))) @@ -63,3 +63,27 @@ def get_default_output_activation_post_process_map(is_training) -> Dict[Pattern, # def __init__(...): # ... # + +def sorted_patterns_dict(patterns_dict: Dict[Pattern, QuantizeHandler]) -> Dict[Pattern, QuantizeHandler]: + """ + Return a sorted version of the patterns dictionary such that longer patterns are matched first, + e.g. match (F.relu, F.linear) before F.relu. + This works for current use cases, but we may need to have a more clever way to sort + things to address more complex patterns + """ + + def get_len(pattern): + """ this will calculate the length of the pattern by counting all the entries + in the pattern. 
+ this will make sure (nn.ReLU, (nn.BatchNorm, nn.Conv2d)) comes before + (nn.BatchNorm, nn.Conv2d) so that we can match the former first + """ + len = 0 + if isinstance(pattern, tuple): + for item in pattern: + len += get_len(item) + else: + len += 1 + return len + + return OrderedDict(sorted(patterns_dict.items(), key=lambda kv: -get_len(kv[0]) if isinstance(kv[0], tuple) else 1)) diff --git a/torch/ao/quantization/fx/prepare.py b/torch/ao/quantization/fx/prepare.py index d0d951ce7aa3..086b65e13c90 100644 --- a/torch/ao/quantization/fx/prepare.py +++ b/torch/ao/quantization/fx/prepare.py @@ -30,11 +30,12 @@ from .quantization_patterns import ( QuantizeHandler, - CustomModuleQuantizeHandler, - StandaloneModuleQuantizeHandler, ) -from .quantization_types import Pattern +from torch.ao.quantization.quantization_types import ( + Pattern, + NodePattern +) from ._equalize import ( is_equalization_observer, @@ -48,7 +49,7 @@ from .pattern_utils import ( MatchResult, - get_default_quant_patterns, + sorted_patterns_dict, ) from .match_utils import ( @@ -60,40 +61,44 @@ get_custom_module_class_keys, all_node_args_have_no_tensors, assert_and_get_unique_device, - node_bool_tensor_arg_indexes, + get_non_observable_arg_indexes_and_types, get_new_attr_name_with_prefix, NON_QUANTIZABLE_WEIGHT_OPS, WEIGHT_INDEX_DICT, BIAS_INDEX_DICT, ) -from ..quantization_mappings import ( - get_default_qat_module_mappings, -) - from torch.ao.quantization.quantize import ( is_activation_post_process, convert ) from ..utils import ( - get_combined_dict, get_qconfig_dtypes, get_swapped_custom_module_class, activation_is_statically_quantized, activation_is_int8_quantized, ) -from .backend_config.utils import ( - get_pattern_to_quantize_handlers, +from ..backend_config.utils import ( get_pattern_to_dtype_configs, get_pattern_to_input_type_to_index, get_module_to_qat_module, + get_fusion_pattern_to_root_node_getter, +) +from ..backend_config import ( + get_native_backend_config_dict, +) +from .backend_config_utils import ( + get_pattern_to_quantize_handlers, ) from typing import Any, Callable, Dict, List, Optional, Tuple, Union, Set from collections import defaultdict +# list of dtypes to not add observers to +DO_NOT_OBS_DTYPE_LIST = [int, float, torch.bool, None] + def is_activation_post_process_node(node: Node, modules: Dict[str, torch.nn.Module]) -> bool: return isinstance(node, torch.fx.Node) and node.op == "call_module" and \ is_activation_post_process(modules[str(node.target)]) @@ -125,7 +130,7 @@ def node_arg_is_bias(node: Node, arg: Any) -> bool: def is_input_arg_dtype_supported_by_backend( arg: Argument, node: Node, - node_name_to_target_dtype: Dict[str, Dict[str, Optional[torch.dtype]]], + node_name_to_target_dtype: Dict[str, Dict[str, Optional[Union[torch.dtype, type]]]], dtype_config: Dict[str, torch.dtype], ) -> bool: """ Check if the configured qconfig for the argument @@ -140,9 +145,17 @@ def is_input_arg_dtype_supported_by_backend( is_bias = node_arg_is_bias(node, arg) is_activation = not is_weight and not is_bias if is_activation: - input_activation_dtype = dtype_config.get("input_activation_dtype", None) - return input_activation_dtype is None or \ - node_name_to_target_dtype[node.name]["input_activation_dtype"] == input_activation_dtype + is_dynamic = dtype_config.get("is_dynamic", False) + if is_dynamic: + input_activation_dtype = dtype_config.get("input_dtype", None) + # TODO: change this after the is_dynamic refactor is landed + compute_dtype = 
node_name_to_target_dtype[node.name].get("input_activation_compute_dtype", None) + return input_activation_dtype is None or \ + compute_dtype == input_activation_dtype + else: + input_activation_dtype = dtype_config.get("input_dtype", None) + return input_activation_dtype is None or \ + node_name_to_target_dtype[node.name]["input_activation_dtype"] == input_activation_dtype elif is_weight: weight_dtype = dtype_config.get("weight_dtype", None) return weight_dtype is None or node_name_to_target_dtype[node.name]["weight_dtype"] == weight_dtype @@ -152,7 +165,7 @@ def is_input_arg_dtype_supported_by_backend( def is_output_dtype_supported_by_backend( node: Node, - node_name_to_target_dtype: Dict[str, Dict[str, Optional[torch.dtype]]], + node_name_to_target_dtype: Dict[str, Dict[str, Optional[Union[torch.dtype, type]]]], dtype_config: Dict[str, torch.dtype], ) -> bool: """ Check if the configured qconfig for the output @@ -162,10 +175,22 @@ def is_output_dtype_supported_by_backend( return output_dtype is None or \ output_dtype == node_name_to_target_dtype[node.name]["output_activation_dtype"] +def is_observer_in_same_graph(node, modules, node_name_to_target_dtype): + """ Check if observer in same graph + when the node output is not fp32 and input is 'placeholder' + the input is assumed to be quantized, so it is observed + in a different place rather than not observed. + """ + node_output_dtype = get_arg_target_dtype_as_output(node, modules, node_name_to_target_dtype) + if len(node.args) > 0 and isinstance(node.args[0], Node): + if node_output_dtype == torch.quint8 and node.args[0].op == 'placeholder': + return False + return True + def is_pattern_dtype_config_supported_by_backend( pattern: Optional[Pattern], - matched_nodes: Optional[List[Node]], - node_name_to_target_dtype: Dict[str, Dict[str, Optional[torch.dtype]]], + matched_node_pattern: Optional[NodePattern], + node_name_to_target_dtype: Dict[str, Dict[str, Optional[Union[torch.dtype, type]]]], backend_config_dict: Optional[Dict[str, Any]] ) -> bool: """ Check is the dtype configuration of a pattern is supported by @@ -173,14 +198,15 @@ def is_pattern_dtype_config_supported_by_backend( """ if backend_config_dict is None or pattern is None: return True - assert matched_nodes is not None and len(matched_nodes) >= 1 + assert matched_node_pattern is not None and len(matched_node_pattern) >= 1 pattern_to_dtype_configs = get_pattern_to_dtype_configs(backend_config_dict) - dtype_configs: List[Dict[str, torch.dtype]] = pattern_to_dtype_configs.get(pattern, []) + dtype_configs: List[Dict[str, Any]] = pattern_to_dtype_configs.get(pattern, []) - # TODO: this only checks one input and one output, need to generalize to multiple + # TODO: this only works for one input and one output patterns, need to generalize to multiple # inputs/output - input_node = matched_nodes[-1] - output_node = matched_nodes[0] + root_node = _default_root_node_getter(matched_node_pattern) + input_node = root_node + output_node = matched_node_pattern[0] for dtype_config in dtype_configs: # check if arg dtype are supported supported = True @@ -231,10 +257,21 @@ def qat_swap_modules( module_to_qat_module: Dict[Callable, Callable]) -> None: convert(root, mapping=module_to_qat_module, inplace=True, remove_qconfig=False) -# TODO: remove observed_op, looks like it's not used +def add_matched_node_name_to_set(matched_node_pattern: NodePattern, s: Set[str]): + if isinstance(matched_node_pattern, Node): + s.add(matched_node_pattern.name) + elif isinstance(matched_node_pattern, (list, 
tuple)): + for maybe_node in matched_node_pattern: + add_matched_node_name_to_set(maybe_node, s) + +# this is temporary, will be removed soon +def _default_root_node_getter(node_pattern): + while not isinstance(node_pattern, Node): + node_pattern = node_pattern[-1] + return node_pattern + def insert_observer( node: Node, - observed_op: Node, observer: ObserverBase, model: torch.nn.Module, modules: Dict[str, torch.nn.Module], @@ -271,7 +308,7 @@ def get_target_activation_dtype_for_node( qhandler: Optional[QuantizeHandler], modules: Dict[str, torch.nn.Module], cache_for_no_tensor_check: Dict[Node, bool], -) -> Dict[str, Optional[torch.dtype]]: +) -> Dict[str, Optional[Union[torch.dtype, type]]]: """ Returns the expected dtype of the input and output of this node after convert. If the value is not None, it represents the dtype of the @@ -317,7 +354,7 @@ def get_target_activation_dtype_for_node( # get qconfig to determine the eventual dtype of this node if qconfig is not None: - if qhandler is not None and qhandler.input_output_observed() and qhandler.is_output_quantized(qconfig): + if qhandler is not None and qhandler.input_output_observed(): act_dtype, weight_dtype, act_compute_dtype = \ get_qconfig_dtypes(qconfig) bias_dtype = torch.float16 \ @@ -325,6 +362,7 @@ def get_target_activation_dtype_for_node( else torch.float return { "input_activation_dtype": act_dtype, + "input_activation_compute_dtype": act_compute_dtype, "weight_dtype": weight_dtype, "bias_dtype": bias_dtype, "output_activation_dtype": act_dtype, @@ -360,8 +398,8 @@ def get_target_activation_dtype_for_node( def get_arg_target_dtype_as_output( arg: Node, modules: Dict[str, torch.nn.Module], - node_name_to_target_dtype: Dict[str, Dict[str, Optional[torch.dtype]]], -) -> Optional[torch.dtype]: + node_name_to_target_dtype: Dict[str, Dict[str, Optional[Union[torch.dtype, type]]]], +) -> Optional[Union[torch.dtype, type]]: """ Get the target output activation dtype for the argumnet in the original graph, skipping inserted observers We are assuming that the observers are inserted correctly, and the dtype for @@ -379,8 +417,8 @@ def get_arg_target_dtype_as_input_to_node( arg: Node, node: Node, modules: Dict[str, torch.nn.Module], - node_name_to_target_dtype: Dict[str, Dict[str, Optional[torch.dtype]]], -) -> Optional[torch.dtype]: + node_name_to_target_dtype: Dict[str, Dict[str, Optional[Union[torch.dtype, type]]]], +) -> Optional[Union[torch.dtype, type]]: """ Get the target argument dtype for the argument `arg`, as input to node `node` """ @@ -398,6 +436,24 @@ def get_arg_target_dtype_as_input_to_node( else: return node_name_to_target_dtype[node.name]["bias_dtype"] +def get_arg_target_compute_dtype_as_input_to_node( + arg: Node, + node: Node, + modules: Dict[str, torch.nn.Module], + node_name_to_target_dtype: Dict[str, Dict[str, Union[torch.dtype, type, None]]], +) -> Union[torch.dtype, type, None]: + """ Get the target argument dtype for the argument `arg`, as input + to node `node` + """ + assert isinstance(arg, Node) + is_weight = node_arg_is_weight(node, arg) + is_bias = node_arg_is_bias(node, arg) + is_activation = not is_weight and not is_bias + if is_activation and \ + "input_activation_compute_dtype" in node_name_to_target_dtype[node.name]: + return node_name_to_target_dtype[node.name]["input_activation_compute_dtype"] + else: + return None def maybe_insert_input_observer_for_arg_or_kwarg( node: Union[Node, Any], @@ -406,7 +462,7 @@ def maybe_insert_input_observer_for_arg_or_kwarg( model: torch.nn.Module, modules: Dict[str, 
torch.nn.Module], graph: Graph, - node_name_to_target_dtype: Dict[str, Dict[str, Optional[torch.dtype]]], + node_name_to_target_dtype: Dict[str, Dict[str, Optional[Union[torch.dtype, type]]]], qhandler: Optional[QuantizeHandler], prepare_custom_config_dict: Dict[str, Any], backend_config_dict: Optional[Dict[str, Any]], @@ -435,8 +491,7 @@ def maybe_insert_input_observer_for_arg_or_kwarg( # default (no observer) new_arg = arg - is_standalone_module = qhandler is not None and \ - isinstance(qhandler, StandaloneModuleQuantizeHandler) + is_standalone_module = qhandler is not None and qhandler.is_standalone_module() assert qconfig is not None if not is_standalone_module: # regular flow for most nodes, except standalone modules @@ -449,6 +504,9 @@ def maybe_insert_input_observer_for_arg_or_kwarg( arg_as_output_target_dtype = get_arg_target_dtype_as_output(arg, modules, node_name_to_target_dtype) arg_as_input_target_dtype = get_arg_target_dtype_as_input_to_node(arg, node, modules, node_name_to_target_dtype) + arg_as_input_target_compute_dtype = \ + get_arg_target_compute_dtype_as_input_to_node( + arg, node, modules, node_name_to_target_dtype) needs_obs = ( # if the dtypes are different, we need an observer (arg_as_output_target_dtype != arg_as_input_target_dtype) and @@ -457,10 +515,16 @@ def maybe_insert_input_observer_for_arg_or_kwarg( # TODO(future PR): change this so a placeholder is inserted for # future dequants, to make the logic easier to understand (arg_as_input_target_dtype != torch.float) and - # if arg is a bool tensor or not a tensor, do not insert observer - (arg_as_output_target_dtype not in (torch.bool, None)) and + # if arg output dtype is in DO_NOT_OBS_DTYPE_LIST do not insert observer + (arg_as_output_target_dtype not in DO_NOT_OBS_DTYPE_LIST) and # if qconfig is reuse_input qconfig, we won't insert extra observer for input - not is_reuse_input_qconfig_ + not is_reuse_input_qconfig_ or + # need to add input observer for dynamic quantization + # only add observer for first input for now, we may need to extend + # qconfig_dict and backend_config_dict to support more general configurations + # of dynamic quantization, e.g. dynamically quantizing second input, third + # input etc. 
+ (arg_as_input_target_compute_dtype in [torch.quint8, torch.int8, torch.float16]) and arg is node.args[0] ) else: @@ -517,7 +581,7 @@ def maybe_insert_input_observer_for_arg_or_kwarg( if existing_obs_node is None: new_obs_node = insert_observer( - arg, node, new_obs_mod, model, modules, graph) + arg, new_obs_mod, model, modules, graph) # override this arg to be the observed arg new_arg = new_obs_node else: @@ -532,7 +596,7 @@ def maybe_insert_input_observers_for_node( model: torch.nn.Module, modules: Dict[str, torch.nn.Module], graph: Graph, - node_name_to_target_dtype: Dict[str, Dict[str, Optional[torch.dtype]]], + node_name_to_target_dtype: Dict[str, Dict[str, Optional[Union[torch.dtype, type]]]], qhandler: Optional[QuantizeHandler], prepare_custom_config_dict: Dict[str, Any], backend_config_dict: Optional[Dict[str, Any]], @@ -587,7 +651,7 @@ def maybe_insert_input_equalization_observers_for_node( model: torch.nn.Module, modules: Dict[str, torch.nn.Module], graph: Graph, - node_name_to_target_dtype: Dict[str, Dict[str, Optional[torch.dtype]]], + node_name_to_target_dtype: Dict[str, Dict[str, Optional[Union[torch.dtype, type]]]], is_branch: bool, ) -> None: """ @@ -618,7 +682,7 @@ def maybe_insert_input_equalization_observers_for_node( new_eq_obs_mod = act_eq_process_ctr() new_eq_obs_node = insert_observer( - arg, node, new_eq_obs_mod, model, modules, graph) + arg, new_eq_obs_mod, model, modules, graph) new_args.append(new_eq_obs_node) @@ -631,7 +695,7 @@ def maybe_insert_output_observer_for_node( modules: Dict[str, torch.nn.Module], graph: Graph, matches: Dict[str, MatchResult], - node_name_to_target_dtype: Dict[str, Dict[str, Optional[torch.dtype]]], + node_name_to_target_dtype: Dict[str, Dict[str, Optional[Union[torch.dtype, type]]]], matched_pattern: Any, qhandler: Optional[QuantizeHandler], is_qat: bool, @@ -642,7 +706,7 @@ def maybe_insert_output_observer_for_node( If `node` does not need an output observer, returns None. 
""" - root_node, matched_nodes, pattern, qhandler, qconfig = matches.get( + root_node, _, pattern, qhandler, qconfig = matches.get( node.name, (None, None, None, None, None)) if qhandler is None: @@ -651,13 +715,10 @@ def maybe_insert_output_observer_for_node( assert qconfig is not None assert node.op != 'output', 'observer insertion for outputs is handled elsewhere' - is_standalone_module = qhandler is not None and \ - isinstance(qhandler, StandaloneModuleQuantizeHandler) + is_standalone_module = qhandler is not None and qhandler.is_standalone_module() dtype = node_name_to_target_dtype[node.name]["output_activation_dtype"] - should_insert_observer = \ - qhandler.should_insert_observer_for_output( - qconfig, is_qat) and dtype not in (torch.bool, None, torch.float) + should_insert_observer = dtype not in DO_NOT_OBS_DTYPE_LIST + [torch.float] # TODO(future PR): move the following logic to # should_insert_observer_for_output should_insert_observer = should_insert_observer and \ @@ -676,7 +737,7 @@ def maybe_insert_output_observer_for_node( matched_pattern, is_qat) observer = act_post_process_ctr() - new_obs = insert_observer(node, node, observer, model, modules, graph) + new_obs = insert_observer(node, observer, model, modules, graph) return new_obs else: return None @@ -684,7 +745,7 @@ def maybe_insert_output_observer_for_node( def maybe_insert_observers_before_graph_output( graph_output_node: Node, output_quantized_idxs: List[int], - node_name_to_target_dtype: Dict[str, Dict[str, Optional[torch.dtype]]], + node_name_to_target_dtype: Dict[str, Dict[str, Optional[Union[torch.dtype, type]]]], qconfig_map: Dict[str, QConfigAny], model: torch.nn.Module, modules: Dict[str, torch.nn.Module], @@ -713,7 +774,7 @@ def maybe_insert_observers_before_graph_output( def _recursive_maybe_replace_node_with_obs( maybe_node: Argument, target_dtype: torch.dtype, - node_name_to_target_dtype: Dict[str, Dict[str, Optional[torch.dtype]]], + node_name_to_target_dtype: Dict[str, Dict[str, Optional[Union[torch.dtype, type]]]], qconfig_map: Dict[str, QConfigAny], model: torch.nn.Module, modules: Dict[str, torch.nn.Module], @@ -748,7 +809,7 @@ def _recursive_maybe_replace_node_with_obs( 'Quantizing the output node without a qconfig is not supported' observer_mod = qconfig.activation() observer_node = insert_observer( - maybe_node, maybe_node, observer_mod, model, modules, graph) + maybe_node, observer_mod, model, modules, graph) return observer_node else: return maybe_node @@ -784,8 +845,8 @@ def _recursive_maybe_replace_node_with_obs( def maybe_propagate_dtype_for_node( node: Node, - target_dtype: torch.dtype, - node_name_to_target_dtype: Dict[str, Dict[str, Optional[torch.dtype]]], + target_dtype: Union[torch.dtype, type], + node_name_to_target_dtype: Dict[str, Dict[str, Optional[Union[torch.dtype, type]]]], matches: Dict[str, MatchResult], ) -> None: """ @@ -797,9 +858,9 @@ def maybe_propagate_dtype_for_node( node_name_to_target_dtype[node.name]["input_activation_dtype"] = target_dtype node_name_to_target_dtype[node.name]["output_activation_dtype"] = target_dtype # if this is a copy node, propagate to first arg - root_node, matched_nodes, pattern, qhandler, qconfig = matches.get( + root_node, _, pattern, qhandler, qconfig = matches.get( node.name, (None, None, None, None, None)) - if qhandler is not None and qhandler.is_general_tensor_shape_op(): + if qhandler is not None and qhandler.is_general_tensor_value_op(): prev_node = node.args[0] if isinstance(prev_node, Node): maybe_propagate_dtype_for_node( @@ -807,7 
+868,7 @@ def maybe_propagate_dtype_for_node( def propagate_dtypes_for_known_nodes( graph: Graph, - node_name_to_target_dtype: Dict[str, Dict[str, Optional[torch.dtype]]], + node_name_to_target_dtype: Dict[str, Dict[str, Optional[Union[torch.dtype, type]]]], matches: Dict[str, MatchResult], ) -> None: """ @@ -821,11 +882,26 @@ def propagate_dtypes_for_known_nodes( replace this with a better way to reason about dtypes of tensors. """ for node in graph.nodes: - bool_arg_idxs = node_bool_tensor_arg_indexes(node) - for bool_arg_idx in bool_arg_idxs: - cur_node = node.args[bool_arg_idx] - maybe_propagate_dtype_for_node( - cur_node, torch.bool, node_name_to_target_dtype, matches) + non_observable_arg_dict = get_non_observable_arg_indexes_and_types(node) + + for arg_type in non_observable_arg_dict: + non_observable_indices = non_observable_arg_dict[arg_type](node) + + for index in non_observable_indices: + arg = node.args[index] + + # when an argument is a tuple, it does not show up as another node so we need to go through + # all elements of the tuple manually + if isinstance(arg, tuple) or isinstance(arg, list): + arg_list = list(arg) + else: + arg_list = [arg] + + for cur_arg in arg_list: + # hard coded arguments show up but aren't `Node` typed and do not need dtype propgated + if isinstance(cur_arg, torch.fx.node.Node): + maybe_propagate_dtype_for_node( + cur_arg, arg_type, node_name_to_target_dtype, matches) def maybe_make_input_output_share_observers( node: Node, @@ -900,6 +976,9 @@ def maybe_make_input_output_share_observers( continue iteration_guard = 0 while not is_activation_post_process_node(input_arg, modules): + # failed to trace back since no input arg for the current node + if len(input_arg.args) < 1: + return False input_arg = input_arg.args[0] iteration_guard += 1 if iteration_guard > 10000: @@ -1009,7 +1088,7 @@ def insert_observers_for_model( # } # # TODO: rename this to node_name_to_target_dtype_info - node_name_to_target_dtype: Dict[str, Dict[str, Optional[torch.dtype]]] = defaultdict(dict) + node_name_to_target_dtype: Dict[str, Dict[str, Optional[Union[torch.dtype, type]]]] = defaultdict(dict) cache_for_no_tensor_check: Dict[Node, bool] = dict() inputs_seen_counter = 0 @@ -1021,7 +1100,7 @@ def insert_observers_for_model( # other nodes output dtype is specified by the qconfig modules = dict(model.named_modules(remove_duplicate=False)) for node in model.graph.nodes: - root_node, matched_nodes, pattern, qhandler, qconfig = matches.get( + root_node, _, pattern, qhandler, qconfig = matches.get( node.name, (None, None, None, None, None)) node_name_to_target_dtype[node.name] = get_target_activation_dtype_for_node( node, qconfig, inputs_seen_counter, outputs_seen_counter, @@ -1062,7 +1141,7 @@ def insert_observers_for_model( elif node.op in ('call_module', 'call_method', 'call_function', 'output'): # check for matches - root_node, matched_nodes, pattern, qhandler, qconfig = matches.get( + last_node, matched_node_pattern, pattern, qhandler, qconfig = matches.get( node.name, (None, None, None, None, None)) equalization_qconfig = equalization_config_map.get(node.name, None) @@ -1081,15 +1160,14 @@ def insert_observers_for_model( ) is_supported_by_backend = is_pattern_dtype_config_supported_by_backend( - pattern, matched_nodes, node_name_to_target_dtype, backend_config_dict) + pattern, matched_node_pattern, node_name_to_target_dtype, backend_config_dict) if not skip_inserting_observers and is_supported_by_backend: modules = dict(model.named_modules(remove_duplicate=False)) if node.op 
!= 'output': - assert matched_nodes is not None + assert matched_node_pattern is not None # add matched nodes to the observed node name set - for n in matched_nodes: - observed_node_names.add(n.name) + add_matched_node_name_to_set(matched_node_pattern, observed_node_names) # This is currently only used for equalization. # Checks if the current node is in a branch in which the two @@ -1116,26 +1194,28 @@ def insert_observers_for_model( if user != node and is_user_quantized: is_quantized_branch = True - # this modifies node inplace - maybe_insert_input_observers_for_node( - node, qconfig, model, modules, graph, - node_name_to_target_dtype, - qhandler, - prepare_custom_config_dict, - backend_config_dict) - - # Insert equalization input observers if needed - maybe_insert_input_equalization_observers_for_node( - node, equalization_qconfig, model, modules, graph, - node_name_to_target_dtype, is_quantized_branch) - - is_last_node_of_pattern = root_node is node + # TODO: this only works for sequential fusion right now, extend it + # it to automatically detect all input nodes based on the pattern + # need to change find_matches function to return this information + root_node = _default_root_node_getter(matched_node_pattern) + is_input_node_of_the_pattern = node is root_node + if is_input_node_of_the_pattern: + # this modifies node inplace + maybe_insert_input_observers_for_node( + node, qconfig, model, modules, graph, + node_name_to_target_dtype, + qhandler, + prepare_custom_config_dict, + backend_config_dict) + + # Insert equalization input observers if needed + maybe_insert_input_equalization_observers_for_node( + node, equalization_qconfig, model, modules, graph, + node_name_to_target_dtype, is_quantized_branch) + + is_last_node_of_pattern = node is last_node is_general_tensor_value_op = \ (qhandler is not None and qhandler.is_general_tensor_value_op()) - - is_general_tensor_shape_op = \ - (qhandler is not None and qhandler.is_general_tensor_shape_op()) - is_reuse_input_qconfig_ = is_reuse_input_qconfig(qconfig) if is_last_node_of_pattern: @@ -1165,14 +1245,17 @@ def insert_observers_for_model( continue user_node.replace_input_with(node, maybe_output_obs_node) + is_observer_in_same_graph_ = is_observer_in_same_graph(node, modules, node_name_to_target_dtype) + # for general tensor value ops, we modify the graph # to make all inputs and outputs use the first input's # observer - if is_general_tensor_value_op or is_general_tensor_shape_op or is_reuse_input_qconfig_: + if (is_general_tensor_value_op and is_observer_in_same_graph_) or \ + is_reuse_input_qconfig_: if not maybe_make_input_output_share_observers(node, model, modules): remove_output_observer(node, model, modules) - if isinstance(qhandler, CustomModuleQuantizeHandler): + if qhandler is not None and qhandler.is_custom_module(): swap_custom_module_to_observed(node, qconfig, modules, prepare_custom_config_dict) else: # output @@ -1211,11 +1294,11 @@ def run_prepare_fx_on_standalone_modules( """ for ( node_name, - (root_node, matched_nodes, pattern, qhandler, qconfig), + (root_node, _, pattern, qhandler, qconfig), ) in matches.items(): if qhandler is None: continue - elif not isinstance(qhandler, StandaloneModuleQuantizeHandler): + elif not qhandler.is_standalone_module(): continue sm_qconfig_dict, sm_prepare_config_dict, sm_backend_config_dict = \ @@ -1246,14 +1329,12 @@ def save_state( observed: GraphModule, qconfig_map: Dict[str, QConfigAny], node_name_to_scope: Dict[str, Tuple[str, type]], - patterns: Dict[Pattern, QuantizeHandler], 
prepare_custom_config_dict: Dict[str, Any], equalization_qconfig_map: Dict[str, Any], qconfig_dict: Dict[str, Dict[Any, Any]], is_qat: bool, observed_node_names: Set[str], ) -> None: - observed._patterns = patterns # type: ignore[assignment] observed._qconfig_map = qconfig_map # type: ignore[assignment] observed._prepare_custom_config_dict = \ prepare_custom_config_dict # type: ignore[assignment] @@ -1297,8 +1378,6 @@ def prepare( if equalization_qconfig_dict is None: equalization_qconfig_dict = {} - additional_quant_patterns = \ - prepare_custom_config_dict.get("additional_quant_pattern", {}) # mapping from a tuple of nodes in reverse order to uninitialized # QuantizeHandler subclass. For example, # { @@ -1309,31 +1388,33 @@ def prepare( # ((, ): # ), # } + # TODO: rename to pattern_to_quantize_handler patterns: Dict[Pattern, QuantizeHandler] = {} if backend_config_dict is None: - quant_patterns = get_default_quant_patterns() - patterns = get_combined_dict( - quant_patterns, additional_quant_patterns) - else: - patterns = get_pattern_to_quantize_handlers(backend_config_dict) - - # TODO: make WEIGHT_INDEX_DICT and BIAS_INDEX_DICT an argument to the functions that needs them - # TODO: refactor this part to return WEIGHT_INDEX_DICT and BIAS_INDEX_DICT - pattern_to_input_type_to_index = get_pattern_to_input_type_to_index(backend_config_dict) - for pattern, input_type_to_index in pattern_to_input_type_to_index.items(): - for input_type, index in input_type_to_index.items(): - index_dicts = { - "weight": WEIGHT_INDEX_DICT, - "bias": BIAS_INDEX_DICT, - "input": {} # not used right now - } - assert input_type in index_dicts.keys(), \ - f"input type must be one of {index_dicts.keys()} but got: {input_type}" - index_dict = index_dicts[input_type] - if pattern in index_dict: # type: ignore[operator] - index_dict[pattern].append(index) # type: ignore[index] - else: - index_dict[pattern] = [index] # type: ignore[index] + backend_config_dict = get_native_backend_config_dict() + patterns = get_pattern_to_quantize_handlers(backend_config_dict) + patterns = sorted_patterns_dict(patterns) + + # TODO: make WEIGHT_INDEX_DICT and BIAS_INDEX_DICT an argument to the functions that needs them + # TODO: refactor this part to return WEIGHT_INDEX_DICT and BIAS_INDEX_DICT + pattern_to_input_type_to_index = get_pattern_to_input_type_to_index(backend_config_dict) + for pattern, input_type_to_index in pattern_to_input_type_to_index.items(): + for input_type, index in input_type_to_index.items(): + index_dicts = { + "weight": WEIGHT_INDEX_DICT, + "bias": BIAS_INDEX_DICT, + "input": {} # not used right now + } + assert input_type in index_dicts.keys(), \ + f"input type must be one of {index_dicts.keys()} but got: {input_type}" + index_dict = index_dicts[input_type] + if pattern in index_dict: # type: ignore[operator] + index_dict[pattern].append(index) # type: ignore[index] + else: + index_dict[pattern] = [index] # type: ignore[index] + + root_node_getter_mapping = \ + get_fusion_pattern_to_root_node_getter(backend_config_dict) convert_dict_to_ordered_dict(qconfig_dict) convert_dict_to_ordered_dict(equalization_qconfig_dict) @@ -1341,21 +1422,12 @@ def prepare( equalization_qconfig_dict = update_qconfig_for_fusion(model, equalization_qconfig_dict) flattened_qconfig_dict = get_flattened_qconfig_dict(qconfig_dict) # TODO: support regex as well - propagate_qconfig_(model, flattened_qconfig_dict) + propagate_qconfig_(model, flattened_qconfig_dict, prepare_custom_config_dict) if is_qat: - additional_qat_module_mapping = 
prepare_custom_config_dict.get( - "additional_qat_module_mapping", {}) - # this path will be deprecated after we fully migrate the convert path - # of fbgemm/qnnpack to use the reference path, it will stay - # here for a few months - if backend_config_dict is None: - module_to_qat_module = get_combined_dict( - get_default_qat_module_mappings(), additional_qat_module_mapping) - else: - module_to_qat_module = get_module_to_qat_module(backend_config_dict) + module_to_qat_module = get_module_to_qat_module(backend_config_dict) qat_swap_modules(model, module_to_qat_module) - qconfig_dict = update_qconfig_for_qat(qconfig_dict, additional_qat_module_mapping) + qconfig_dict = update_qconfig_for_qat(qconfig_dict, {}) # mapping from fully qualified module name to module instance # for example, @@ -1381,8 +1453,8 @@ def prepare( custom_module_classes = get_custom_module_class_keys( prepare_custom_config_dict, "float_to_observed_custom_module_class") matches = find_matches( - model.graph, modules, patterns, qconfig_map, standalone_module_names, - standalone_module_classes, custom_module_classes) + model.graph, modules, patterns, root_node_getter_mapping, qconfig_map, + standalone_module_names, standalone_module_classes, custom_module_classes) input_quantized_idxs: List[int] = prepare_custom_config_dict.get( "input_quantized_idxs", []) @@ -1407,7 +1479,7 @@ def prepare( observed_node_names, is_qat) - save_state(model, qconfig_map, node_name_to_scope, patterns, + save_state(model, qconfig_map, node_name_to_scope, prepare_custom_config_dict, equalization_qconfig_map, qconfig_dict, is_qat, observed_node_names) preserved_attributes = set(prepare_custom_config_dict.get("preserved_attributes", [])) diff --git a/torch/ao/quantization/fx/qconfig_utils.py b/torch/ao/quantization/fx/qconfig_utils.py index 80afa562a10f..4884ef08d0d6 100644 --- a/torch/ao/quantization/fx/qconfig_utils.py +++ b/torch/ao/quantization/fx/qconfig_utils.py @@ -1,6 +1,7 @@ import torch from collections import defaultdict -from typing import Callable, Any, Dict, Tuple, Set, Optional +from typing import Callable, Any, Dict, Tuple, Set, Optional, List +from torch.ao.quantization import QConfig from torch.ao.quantization.qconfig import add_module_to_qconfig_obs_ctr, QConfigAny, qconfig_equals from torch.ao.quantization.quantize import ( is_activation_post_process, @@ -13,7 +14,10 @@ ) from torch.nn.intrinsic import _FusedModule -from ..utils import _parent_name +from ..utils import ( + _parent_name, + get_qconfig_dtypes, +) from ..qconfig_dict_utils import ( get_object_type_qconfig, maybe_adjust_qconfig_for_module_type_or_name, @@ -213,10 +217,6 @@ def check_is_valid_prepare_custom_config_dict(prepare_custom_config_dict: Option "float_to_observed_custom_module_class", "non_traceable_module_name", "non_traceable_module_class", - "additional_fuser_method_mapping", - "additional_qat__module_mapping", - "additional_fusion_pattern", - "additional_quant_pattern", "input_quantized_idxs", "output_quantized_idxs", "preserved_attributes"} @@ -234,8 +234,7 @@ def check_is_valid_convert_custom_config_dict(convert_custom_config_dict: Option if not convert_custom_config_dict: return - convert_custom_config_dict_allowed_keys = {"additional_object_mapping", - "observed_to_quantized_custom_module_class", + convert_custom_config_dict_allowed_keys = {"observed_to_quantized_custom_module_class", "preserved_attributes"} check_is_valid_config_dict(convert_custom_config_dict, convert_custom_config_dict_allowed_keys, "convert_custom_config_dict") @@ -250,8 +249,7 
@@ def check_is_valid_fuse_custom_config_dict(fuse_custom_config_dict: Optional[Dic if not fuse_custom_config_dict: return - fuse_custom_config_dict_allowed_keys = {"additional_fuser_method_mapping", - "preserved_attributes"} + fuse_custom_config_dict_allowed_keys = {"preserved_attributes"} check_is_valid_config_dict(fuse_custom_config_dict, fuse_custom_config_dict_allowed_keys, "fuse_custom_config_dict") @@ -284,6 +282,34 @@ def compare_prepare_convert_qconfig_dict(prepare_qconfig_dict: Dict[str, Dict[An else: assert "Unsupported key in convert_qconfig_dict {}".format(k) + +def is_qconfig_supported_by_dtype_configs(qconfig: QConfig, dtype_configs: List[Dict[str, Any]]): + for dtype_config in dtype_configs: + is_dynamic = dtype_config.get("is_dynamic", False) + input_dtype = dtype_config.get("input_dtype", torch.float) + weight_dtype = dtype_config.get("weight_dtype", torch.float) + bias_dtype = dtype_config.get("bias_dtype", torch.float) + output_dtype = dtype_config.get("output_dtype", torch.float) + qconfig_activation_dtype, qconfig_weight_dtype, qconfig_compute_dtype = \ + get_qconfig_dtypes(qconfig) + qconfig_bias_dtype = torch.float16 \ + if qconfig_activation_dtype == torch.float16 and \ + qconfig_weight_dtype == torch.float16 \ + else torch.float + + if is_dynamic: + is_match = input_dtype == qconfig_compute_dtype and \ + output_dtype == torch.float and \ + weight_dtype == qconfig_weight_dtype + else: + is_match = input_dtype == qconfig_activation_dtype and \ + output_dtype == qconfig_activation_dtype and \ + weight_dtype == qconfig_weight_dtype and \ + bias_dtype == qconfig_bias_dtype + if is_match: + return True + return False + # TODO: rename this file to config_utils def get_standalone_module_configs( module_name: str, diff --git a/torch/ao/quantization/fx/quantization_patterns.py b/torch/ao/quantization/fx/quantization_patterns.py index ccb2ae98f9f8..bacec65d0337 100644 --- a/torch/ao/quantization/fx/quantization_patterns.py +++ b/torch/ao/quantization/fx/quantization_patterns.py @@ -1,58 +1,25 @@ import torch -from torch.fx import GraphModule from torch.fx.graph import ( Node, - Graph, -) -from ..observer import ( - default_affine_fixed_qparams_observer, - default_symmetric_fixed_qparams_observer, -) - -from ..quantization_mappings import ( - get_static_quant_module_class, - get_dynamic_quant_module_class, - get_quantized_operator, -) -from ..utils import ( - get_swapped_custom_module_class, - activation_is_statically_quantized, - activation_is_int8_quantized, - weight_is_statically_quantized, - get_qconfig_dtypes, - activation_dtype, - get_qparam_dict, - check_node, -) - -from torch.ao.quantization.quantize import ( - is_activation_post_process, ) -from .pattern_utils import ( - register_quant_pattern, - get_default_output_activation_post_process_map, - Pattern, -) -from ..utils import _parent_name from .utils import ( all_node_args_have_no_tensors, - quantize_node, - get_per_tensor_qparams, - get_linear_prepack_op_for_dtype, - create_qparam_nodes, - get_qconv_prepack_op, - get_qconv_op, - create_node_from_old_node_preserve_meta, ) - -from ..qconfig import QConfigAny +from torch.ao.quantization.quantization_types import ( + Pattern, + NodePattern, +) from abc import ABC -import operator -import warnings +from typing import Any, Callable, Dict, Optional -from typing import Any, Callable, Dict, Union, Optional, Tuple, List +def _default_root_node_getter(node_pattern): + if node_pattern is None: + return node_pattern + while not isinstance(node_pattern, Node): + node_pattern = 
node_pattern[-1] + return node_pattern # ------------------------- # Pattern Registrations @@ -64,33 +31,37 @@ class QuantizeHandler(ABC): """ Base handler class for the quantizer patterns """ - def __init__(self, node: Node, modules: Dict[str, torch.nn.Module]): + def __init__( + self, + node_pattern: NodePattern, + modules: Dict[str, torch.nn.Module], + root_node_getter: Callable = None, + is_custom_module=False, + is_standalone_module=False): """ Records pattern information in __init__, which will be used in convert """ - # this is an indicator of whether all the inputs are Node or not - # since some op might be quantized differently depending on whether - # all inputs are tensors or not, e.g. add/mul - self.num_tensor_args = len(node.args) - self.all_node_args_are_tensors = True - # the last node of the matched pattern - self.last_node = node - - def _maybe_get_last_node_only_observer( - self, - modules: Dict[str, torch.nn.Module] - ) -> Optional[torch.nn.Module]: - """ - If the last node of the pattern is observed, return the observer - instance. Otherwise, return None. - """ - for maybe_obs_node, _ in self.last_node.users.items(): - if maybe_obs_node.op == 'call_module': - maybe_obs = modules[str(maybe_obs_node.target)] - if is_activation_post_process(maybe_obs): - return maybe_obs - return None - + self.node_pattern = node_pattern + self.modules = modules + if root_node_getter is None: + root_node_getter = _default_root_node_getter + self.root_node = root_node_getter(node_pattern) + self.is_custom_module_ = is_custom_module + self.is_standalone_module_ = is_standalone_module + self.num_tensor_args = 0 + # determine how many of the first two args are Tensors (versus scalars) + # this distinguishes things like "x + y" from "x + 2" or "2 + x" + if isinstance(self.root_node, Node): + cache_for_no_tensor_check: Dict[Node, bool] = dict() + for arg_idx in range(len(self.root_node.args)): + arg = self.root_node.args[arg_idx] + if isinstance(arg, Node) and ( + not all_node_args_have_no_tensors( + arg, self.modules, cache_for_no_tensor_check)): + self.num_tensor_args += 1 + + # TODO: can remove after the is_dynamic flag is defined, so that we can + # move embedding op to backend_config_dict def input_output_observed(self) -> bool: """ Returns True if the pattern matched to this qhandler could be @@ -102,44 +73,16 @@ def is_general_tensor_value_op(self) -> bool: """ Returns True if the operator works for both floating point and quantized input, and does some computation based on the input Tensor, + or the ops that only re-arranges the Tensor values or query some metadata + about the Tensor so we need to insert observer/fake_quant for the output of the - operator since the distribution of values is different for input and output - Tensors (for HistogramObserver) - while they share the same quantization parameters - Example: avgpool2d - """ - return False - - def is_general_tensor_shape_op(self) -> bool: - """ Similar to is_general_tensor_value_op, this is a check - for ops that works for both floating point and quantized input, - that only re-arranges the Tensor values or query some metadata about the Tensor - We don't insert observer/fake_quant for the output of these operators - Example: reshape, transpose, maxpool2d - """ - return False - - def should_insert_observer_for_output( - self, - qconfig: Any, - model_is_training: bool, - ) -> bool: - """ - Returns true if an observer should be inserted for the output of - the pattern matched to this QuantizeHandler instance during the - prepare 
step. - """ - # TODO(future PR): potentially clean up and deduplicate these - # mappings. - return self.all_node_args_are_tensors and self.input_output_observed() - - def should_mark_output_quantized_from_input_quantized_status( - self, - qconfig: QConfigAny - ) -> bool: - """ - Returns true if after convert, the output of the matched pattern is - quantized iff the first input is also quantized. + operator (same observer instance as input) + since the distribution of values is different for input and output + Tensors (for HistogramObserver) while they share the same quantization + parameters + Example operator: avgpool2d, reshape, transpose, maxpool2d + Example observed operator: + observer_0 - avgpool2d - observer_0 (same observer instance as input) """ return False @@ -156,1633 +99,62 @@ def get_activation_ctr( """ return qconfig.activation - def is_output_quantized(self, qconfig): - """ Returns true if the output node of convert is quantized - when is_reference is False, we would return float node when a certain dtype - combination is not supported (since fbgemm/qnnpack only support certain dtype - combinations), so the output may be float, but when is_reference is True, - we support all dtype combinations so the output will always be quantized. - - TODO: This is fragile, whether output is quantized should not depend on `is_reference` since - we want to make sure whether a Tensor is quantized - should be the same in prepare and convert and is_reference - is only available in convert currently + def is_custom_module(self): + return self.is_custom_module_ - """ - return True - - def convert(self, - node: Node, - qconfig: QConfigAny, - modules: Dict[str, torch.nn.Module], - quantized_graph: Graph, - node_name_to_scope: Dict[str, Tuple[str, type]], - load_arg: Callable, - is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None) -> Node: - """ Convert the given node to a quantized node and insert - it to the quantized graph - """ - return NotImplemented + def is_standalone_module(self): + return self.is_standalone_module_ - -# Binary op configs - -# Supported combinations are: -# quant_type | activation (compute_type) | weight -# static quint8 qint8 - -# tuple (activation_dtype, weight_dtype, compute_dtype) -# these are supported types for common binary ops like add/mul etc. 
-all_dtypes = [ - (torch.qint8, torch.qint8, None), - (torch.quint8, torch.qint8, None), - (torch.float16, torch.float16, None), -] -fp16_dtypes = [ - (torch.float16, torch.float16, None) -] -int8_dtypes = [ - (torch.qint8, torch.qint8, None), - (torch.quint8, torch.qint8, None), -] -binary_op_supported_dtypes : Dict[Union[Callable, str], List[Tuple[torch.dtype, torch.dtype, None]]] = { - operator.add: all_dtypes, - torch.add: all_dtypes, - operator.mul: all_dtypes, - torch.mul: all_dtypes, - torch.bmm: fp16_dtypes, - torch.sub: fp16_dtypes, - operator.sub: fp16_dtypes, - torch.div: fp16_dtypes, - operator.truediv: fp16_dtypes, - torch.matmul: int8_dtypes, -} - -default_op_supported_dtypes = { - torch.nn.ConvTranspose1d: int8_dtypes, - torch.nn.ConvTranspose2d: int8_dtypes, - torch.nn.ELU: int8_dtypes, - torch.nn.LeakyReLU: int8_dtypes, - torch.nn.Hardswish: int8_dtypes, - torch.nn.InstanceNorm1d: int8_dtypes, - torch.nn.InstanceNorm2d: int8_dtypes, - torch.nn.InstanceNorm3d: int8_dtypes, - torch.nn.LayerNorm: all_dtypes, - torch.nn.SiLU: fp16_dtypes, - torch.nn.Mish: fp16_dtypes, - torch.nn.GELU: int8_dtypes, - torch.nn.Dropout: int8_dtypes, - torch.nn.Softmax: int8_dtypes, - torch.nn.functional.elu: int8_dtypes, - torch.nn.functional.hardswish: int8_dtypes, - torch.nn.functional.instance_norm: int8_dtypes, - torch.nn.functional.layer_norm: all_dtypes, - torch.nn.functional.leaky_relu: int8_dtypes, - torch.nn.functional.silu: fp16_dtypes, - torch.nn.functional.mish: fp16_dtypes, - torch.nn.functional.gelu: int8_dtypes, - torch.nn.functional.softmax: int8_dtypes, - torch.nn.functional.dropout: int8_dtypes, - torch.sum: fp16_dtypes, -} - -QAT_CONV_MODULE_CLASSES = \ - (torch.nn.qat.Conv2d, - torch.nn.qat.Conv3d, - torch.nn.intrinsic.qat.ConvBn2d, - torch.nn.intrinsic.qat.ConvBnReLU2d, - torch.nn.intrinsic.qat.ConvReLU2d, - torch.nn.intrinsic.qat.ConvBn3d, - torch.nn.intrinsic.qat.ConvBnReLU3d, - torch.nn.intrinsic.qat.ConvReLU3d) - - -########################## -# Helper Functions -########################## - -def _load_weight_qparams( - self, state_dict, prefix, local_metadata, strict, - missing_keys, unexpected_keys, error_msgs): - key = prefix + "_weight_qparams" - if key in state_dict: - self._weight_qparams = state_dict[key] - state_dict.pop(key) - -def _save_weight_qparams(self, destination, prefix, keep_vars): - for attr_name in dir(self): - if "_weight_qparams" == attr_name and \ - isinstance(getattr(self, attr_name), dict): - weight_qparams = getattr(self, attr_name) - destination[prefix + attr_name] = weight_qparams - - -def _to_reference(float_module, weight_qparams): - """ Make a weighted float module (e.g. 
conv and linear )a reference module by - attaching _weight_qparams that records the qparams for weight - and change the name for the module so that it's recognized - when people print the model - """ - float_module._weight_qparams = weight_qparams - float_module._register_state_dict_hook(_save_weight_qparams) - float_module._register_load_state_dict_pre_hook(_load_weight_qparams, with_module=True) - - float_module_name = float_module._get_name() - - def _get_name(): - return float_module_name + "(Reference)" - - float_module._get_name = _get_name - -@register_quant_pattern(operator.add) -@register_quant_pattern(operator.sub) -@register_quant_pattern(operator.mul) -@register_quant_pattern(operator.truediv) -@register_quant_pattern(torch.add) -@register_quant_pattern(torch.sub) -@register_quant_pattern(torch.mul) -@register_quant_pattern(torch.div) -@register_quant_pattern(torch.bmm) -@register_quant_pattern((torch.nn.ReLU, operator.add)) -@register_quant_pattern((torch.nn.ReLU, operator.mul)) -@register_quant_pattern((torch.nn.ReLU, torch.add)) -@register_quant_pattern((torch.nn.ReLU, torch.mul)) -@register_quant_pattern((torch.nn.functional.relu, operator.add)) -@register_quant_pattern((torch.nn.functional.relu, operator.mul)) -@register_quant_pattern((torch.nn.functional.relu, torch.add)) -@register_quant_pattern((torch.nn.functional.relu, torch.mul)) -@register_quant_pattern((torch.relu, operator.add)) -@register_quant_pattern((torch.relu, operator.mul)) -@register_quant_pattern(torch.matmul) +# TODO: remove this class, this is still exposed in torch.quantization +# but we should be able to break bc class BinaryOpQuantizeHandler(QuantizeHandler): - def __init__( - self, - node: Node, - modules: Dict[str, torch.nn.Module]): - super().__init__(node, modules) - self.relu_node = None - if ( - node.op == 'call_function' and - node.target in (torch.nn.functional.relu, torch.relu) - ) or ( - node.op == 'call_module' and - isinstance(modules[str(node.target)], torch.nn.ReLU) - ): - self.relu_node = node - node = node.args[0] # type: ignore[assignment] - self.binary_op_node = node - self.binary_op = node.target - - # determine how many of the first two args are Tensors (versus scalars) - # this distinguishes things like "x + y" from "x + 2" or "2 + x" - self.num_tensor_args = 0 - cache_for_no_tensor_check: Dict[Node, bool] = dict() - for arg_idx in range(len(self.binary_op_node.args)): - arg = self.binary_op_node.args[arg_idx] - if isinstance(arg, Node) and (not all_node_args_have_no_tensors(arg, modules, cache_for_no_tensor_check)): - self.num_tensor_args += 1 - self.all_node_args_are_tensors = \ - (self.num_tensor_args == len(self.binary_op_node.args)) - - qbin_op_mapping: Dict[Union[Callable, str], Callable] = { - operator.add: torch.ops.quantized.add, - torch.add: torch.ops.quantized.add, - operator.mul: torch.ops.quantized.mul, - torch.mul: torch.ops.quantized.mul, - torch.matmul: torch.ops.quantized.matmul, - } - qbin_relu_op_mapping: Dict[Union[Callable, str], Callable] = { - operator.add: torch.ops.quantized.add_relu, - torch.add: torch.ops.quantized.add_relu, - operator.mul: torch.ops.quantized.mul_relu, - torch.mul: torch.ops.quantized.mul_relu, - } - # corresponding quantized op - self.quantized_binary_op: Optional[Callable] = None - if self.binary_op in qbin_op_mapping: - self.quantized_binary_op = qbin_relu_op_mapping[self.binary_op] \ - if self.relu_node is not None \ - else qbin_op_mapping[self.binary_op] - - def should_insert_observer_for_output( - self, - qconfig: Any, - 
model_is_training: bool, - ) -> bool: - """ - Returns true if an observer should be inserted for the output of - the pattern matched to this QuantizeHandler instance during the - prepare step. - """ - dtypes = get_qconfig_dtypes(qconfig) - if not (self.binary_op in binary_op_supported_dtypes and dtypes in binary_op_supported_dtypes[self.binary_op]): - return False - if self.num_tensor_args == 1: - return True - elif self.all_node_args_are_tensors and self.input_output_observed(): - return True - else: - return False - - def is_general_tensor_value_op(self) -> bool: - return self.num_tensor_args == 1 - - def input_output_observed(self): - # for x + y where x and y are scalars, we do not observe anything - return self.num_tensor_args > 0 - - def is_output_quantized(self, qconfig): - dtypes = get_qconfig_dtypes(qconfig) - return self.binary_op in binary_op_supported_dtypes and \ - dtypes in binary_op_supported_dtypes[self.binary_op] - - def convert(self, - node: Node, - qconfig: QConfigAny, - modules: Dict[str, torch.nn.Module], - quantized_graph: Graph, - node_name_to_scope: Dict[str, Tuple[str, type]], - load_arg: Callable, - is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None) -> Node: - - if self.num_tensor_args == 0: - # example: x + y, when x and y are scalars - return quantized_graph.node_copy( - node, load_arg(quantized=None)) - - dtypes = get_qconfig_dtypes(qconfig) - - if is_reference: - act_dtype = activation_dtype(qconfig) - dtypes = get_qconfig_dtypes(qconfig) - if act_dtype == torch.float or \ - not (self.binary_op in binary_op_supported_dtypes and dtypes in binary_op_supported_dtypes[self.binary_op]): - return quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - else: - if self.num_tensor_args == 2: - # make sure both inputs are quantized to act_dtype - load_arg(quantized={0: act_dtype, 1: act_dtype})(self.binary_op_node.args) - args = load_arg(quantized=torch.float)(self.binary_op_node.args) - kwargs = load_arg(quantized=torch.float)(self.binary_op_node.kwargs) - op_out = quantized_graph.node_copy(self.binary_op_node, load_arg(quantized=torch.float)) - - def modified_load_arg(n: Node): - if n.name == self.binary_op_node.name: - return op_out - else: - return load_arg(quantized=torch.float)(n) - - if self.relu_node: - op_out = quantized_graph.node_copy(self.relu_node, modified_load_arg) - activation_post_process = \ - self._maybe_get_last_node_only_observer(modules) - assert activation_post_process is not None - return quantize_node( - op_out, activation_post_process, - node, modules, quantized_graph, node_name_to_scope, is_input=False) - elif not is_reference and self.binary_op in binary_op_supported_dtypes and \ - dtypes in binary_op_supported_dtypes[self.binary_op]: - if dtypes in [(torch.quint8, torch.qint8, None)]: - assert self.quantized_binary_op is not None - if self.num_tensor_args == 1: - # add/mul scalar - first_arg = self.binary_op_node.args[0] - cache_for_no_tensor_check: Dict[Node, bool] = dict() - if isinstance(first_arg, Node) and ( - not all_node_args_have_no_tensors( - first_arg, modules, cache_for_no_tensor_check)): - quantized_index = 0 - else: - quantized_index = 1 + pass - return create_node_from_old_node_preserve_meta( - quantized_graph, - ( - 'call_function', self.quantized_binary_op, - load_arg(quantized=[quantized_index])(self.binary_op_node.args), - self.binary_op_node.kwargs - ), - self.binary_op_node) - else: - activation_post_process = \ - self._maybe_get_last_node_only_observer(modules) - assert 
activation_post_process is not None - scale, zero_point = activation_post_process.calculate_qparams() # type: ignore[operator] - scale = float(scale) - zero_point = int(zero_point) - scale_arg, zero_point_arg = \ - create_qparam_nodes( - node.name, scale, zero_point, modules, - quantized_graph, node_name_to_scope) - kwargs = {**self.binary_op_node.kwargs} - add_args = (*load_arg(quantized=activation_dtype(qconfig))(self.binary_op_node.args), scale_arg, zero_point_arg) - op = create_node_from_old_node_preserve_meta( - quantized_graph, - ('call_function', self.quantized_binary_op, add_args, kwargs), - self.binary_op_node) - return op - else: - assert dtypes == (torch.float16, torch.float16, None) - # TODO (refactor) this is duplicated, maybe have a helper function - if self.relu_node: - op_out = quantized_graph.node_copy(self.binary_op_node, load_arg(quantized=torch.float)) - relu_args = [op_out] - relu_args.extend(load_arg(quantized=torch.float)(self.relu_node.args[1:])) - relu_kwargs = load_arg(quantized=torch.float)(self.relu_node.kwargs) - op_out = create_node_from_old_node_preserve_meta( - quantized_graph, - ("call_function", torch.nn.functional.relu, tuple(relu_args), relu_kwargs), - self.relu_node) - else: - op_out = quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - return quantized_graph.create_node( - "call_method", "to", (op_out, torch.float16,), {} - ) - else: - # leave the op unquantized if the dtype,reference combination is not supported - warnings.warn( - "dtype combination: {} is not " - "supported by {} for is_reference={}. " - "Supported non-reference dtype combinations are: {} " - "".format(dtypes, - self.binary_op, - is_reference, - binary_op_supported_dtypes[self.binary_op] - ) - ) - if self.relu_node: - op_out = quantized_graph.node_copy(self.binary_op_node, load_arg(quantized=torch.float)) - relu_args = [op_out] - relu_args.extend(load_arg(quantized=torch.float)(self.relu_node.args[1:])) - relu_kwargs = load_arg(quantized=torch.float)(self.relu_node.kwargs) - return create_node_from_old_node_preserve_meta( - quantized_graph, - ("call_function", torch.nn.functional.relu, tuple(relu_args), relu_kwargs), - self.relu_node) - else: - return quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - - -@register_quant_pattern(torch.cat) class CatQuantizeHandler(QuantizeHandler): - def is_general_tensor_value_op(self) -> bool: - return True - - def convert(self, - node: Node, - qconfig: QConfigAny, - modules: Dict[str, torch.nn.Module], - quantized_graph: Graph, - node_name_to_scope: Dict[str, Tuple[str, type]], - load_arg: Callable, - is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None) -> Node: - if not self.all_node_args_are_tensors: - return NotImplemented - act_dtype = activation_dtype(qconfig) - if act_dtype == torch.float: - op_out = quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - return op_out - else: - activation_post_process = \ - self._maybe_get_last_node_only_observer(modules) - assert activation_post_process is not None - # make sure the first argument is quantized to act_dtype - load_arg(quantized={0: act_dtype})(node.args) - args = list(load_arg(quantized=torch.float)(node.args)) - kwargs = load_arg(quantized=torch.float)(node.kwargs) - op_out = quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - return quantize_node( - op_out, - activation_post_process, - node, - modules, - quantized_graph, - node_name_to_scope, - is_input=False) + pass -# handle conv, maybe followed by relu 
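# The registrations just below write fused patterns outermost-op-first and are
# matched in reverse, e.g. (torch.nn.ReLU, torch.nn.Conv2d) matches relu(conv(x))
# and the root of the match is the last element (consistent with
# _default_root_node_getter added earlier in this diff). A small illustrative
# helper (assumed name, not part of the diff) that flattens such a reversed
# pattern back into execution order:
import torch

def pattern_execution_order(pattern):
    # (ReLU, Conv2d)                -> [Conv2d, ReLU]
    # (ReLU, (BatchNorm2d, Conv2d)) -> [Conv2d, BatchNorm2d, ReLU]
    if not isinstance(pattern, tuple):
        return [pattern]
    outer, *inputs = pattern
    order = []
    for sub in inputs:
        order.extend(pattern_execution_order(sub))
    order.append(outer)
    return order

assert pattern_execution_order(
    (torch.nn.ReLU, (torch.nn.BatchNorm2d, torch.nn.Conv2d))
) == [torch.nn.Conv2d, torch.nn.BatchNorm2d, torch.nn.ReLU]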
-# NB: matching order is reversed, that is we match from the bottom of this list to the beginning -@register_quant_pattern(torch.nn.Conv1d) -@register_quant_pattern(torch.nn.Conv2d) -@register_quant_pattern(torch.nn.Conv3d) -@register_quant_pattern(torch.nn.functional.conv1d) -@register_quant_pattern(torch.nn.functional.conv2d) -@register_quant_pattern(torch.nn.functional.conv3d) -# TODO: add qat.Conv1d -@register_quant_pattern(torch.nn.qat.Conv2d) -@register_quant_pattern(torch.nn.qat.Conv3d) -@register_quant_pattern(torch.nn.intrinsic.ConvReLU1d) -@register_quant_pattern(torch.nn.intrinsic.ConvReLU2d) -@register_quant_pattern(torch.nn.intrinsic.ConvReLU3d) -@register_quant_pattern(torch.nn.intrinsic.qat.ConvBn1d) -@register_quant_pattern(torch.nn.intrinsic.qat.ConvBn2d) -@register_quant_pattern(torch.nn.intrinsic.qat.ConvBn3d) -@register_quant_pattern(torch.nn.intrinsic.qat.ConvBnReLU1d) -@register_quant_pattern(torch.nn.intrinsic.qat.ConvBnReLU2d) -@register_quant_pattern(torch.nn.intrinsic.qat.ConvBnReLU3d) -@register_quant_pattern(torch.nn.intrinsic.qat.ConvReLU2d) -@register_quant_pattern(torch.nn.intrinsic.qat.ConvReLU3d) -@register_quant_pattern((torch.nn.functional.relu, torch.nn.functional.conv1d)) -@register_quant_pattern((torch.nn.functional.relu, torch.nn.functional.conv2d)) -@register_quant_pattern((torch.nn.functional.relu, torch.nn.functional.conv3d)) -@register_quant_pattern((torch.nn.ReLU, torch.nn.functional.conv1d)) -@register_quant_pattern((torch.nn.ReLU, torch.nn.functional.conv2d)) -@register_quant_pattern((torch.nn.ReLU, torch.nn.functional.conv3d)) -# just for error checks -@register_quant_pattern((torch.nn.ReLU, torch.nn.Conv1d)) -@register_quant_pattern((torch.nn.ReLU, torch.nn.Conv2d)) -@register_quant_pattern((torch.nn.ReLU, torch.nn.Conv3d)) -@register_quant_pattern((torch.nn.functional.relu, torch.nn.Conv2d)) -@register_quant_pattern((torch.nn.functional.relu, torch.nn.Conv3d)) -# TODO: rename Relu -> ReLU to be more consistent with other classes +# TODO: remove this class class ConvReluQuantizeHandler(QuantizeHandler): - def __init__(self, node: Node, modules: Dict[str, torch.nn.Module]): - super().__init__(node, modules) - self.relu_node = None - if (node.op == 'call_function' and node.target is torch.nn.functional.relu) or \ - (node.op == 'call_module' and isinstance(modules[str(node.target)], torch.nn.ReLU)): - self.relu_node = node - node = node.args[0] # type: ignore[assignment] - self.conv_node = node - if node.op == "call_module": - self.conv = modules[str(self.conv_node.target)] - elif node.op == "call_function": - self.conv = node.target # type: ignore[assignment] - - def convert(self, - node: Node, - qconfig: QConfigAny, - modules: Dict[str, torch.nn.Module], - quantized_graph: Graph, - node_name_to_scope: Dict[str, Tuple[str, type]], - load_arg: Callable, - is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None) -> Node: - # Supported combinations are: - # quant_type | activation (compute_type) | weight - # static quint8 qint8 - - # tuple (activation_dtype, weight_dtype, compute_dtype) - supported_dtypes = [ - (torch.quint8, torch.qint8, None), - ] - - # TODO: is_reference option for conv module - dtypes = get_qconfig_dtypes(qconfig) - # leave the op unquantized if the dtype combination is not supported - if not is_reference and dtypes not in supported_dtypes: - warnings.warn( - "dtype combination: {} is not " - "supported by Conv " - "supported dtype combinations are: {}".format(dtypes, supported_dtypes)) - if 
self.relu_node: - conv_out = quantized_graph.node_copy(self.conv_node, load_arg(quantized=torch.float)) - relu_args = [conv_out] - relu_args.extend(load_arg(quantized=torch.float)(self.relu_node.args[1:])) - relu_kwargs = load_arg(quantized=torch.float)(self.relu_node.kwargs) - return create_node_from_old_node_preserve_meta( - quantized_graph, - ("call_function", torch.nn.functional.relu, tuple(relu_args), relu_kwargs), - self.relu_node) - else: - return quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - - activation_int8_quantized = activation_is_int8_quantized(qconfig) - - if self.conv_node.op == 'call_module': - # note that relu should already be fused into conv module in the fusion step - assert self.relu_node is None, 'conv module and relu fusion is not executed, ' \ - 'please make sure to run fusion before prepare' - output_activation_post_process = \ - self._maybe_get_last_node_only_observer(modules) - assert output_activation_post_process is not None + pass - # We'll always produce reference pattern for torch.nn.Conv*d, - # will remove the else branch after we migrated all use cases - if is_reference or \ - type(self.conv) in [torch.nn.Conv1d, torch.nn.Conv2d, torch.nn.Conv3d] and \ - dtypes in [(torch.quint8, torch.qint8, None)]: - # produce dequant - float_op - quant pattern - dtype = torch.float - if activation_int8_quantized: - dtype = activation_dtype(qconfig) - activation = load_arg(quantized=dtype)(self.conv_node.args[0]) - args = load_arg(quantized=torch.float)(self.conv_node.args) - # Get the float conv and attach quantization scheme and quantization - # parameters of weight to the module - # and qparam is a dictionary of - # {"qscheme": ..., "scale": ..., "zero_point": ...} for per tensor quantization or - # {"qscheme": ..., "scale": ..., "zero_point": ..., "axis": ...} for per channel quantization - float_conv = self.conv - fused_conv = None - if isinstance( - float_conv, - QAT_CONV_MODULE_CLASSES): - # case 1. converting qat conv module to - # a float conv module, we need to attch - # weight fake_quant to the conv module, - # weight fake_quant is assumed to be run during - # QAT so we don't need to run it again here - float_conv = self.conv.to_float() # type: ignore[operator] - # change qat conv to conv - parent_name, name = _parent_name(self.conv_node.target) - setattr(modules[parent_name], name, float_conv) - if isinstance(float_conv, torch.nn.intrinsic._FusedModule): - fused_conv = float_conv - float_conv = float_conv[0] - weight_post_process = self.conv.weight_fake_quant - else: - # case 2. 
converting a conv module/fused conv module - # to float conv module, we need to attach - # weight observer to the conv module and run it - # with conv weight - if isinstance(float_conv, torch.nn.intrinsic._FusedModule): - fused_conv = float_conv - float_conv = float_conv[0] # type: ignore[index] - assert qconfig is not None - weight_post_process = qconfig.weight() - # run weight observer - weight_post_process(float_conv.weight) # type: ignore[operator] - weight_qparams = get_qparam_dict(weight_post_process) - # hardcoded for now, TODO: expose the api to user, - # we can have a map from module to reference module - # and allow user to register new ones - qconv_cls = get_static_quant_module_class( - type(float_conv), is_reference=True) - ref_conv = qconv_cls.from_float(float_conv, weight_qparams) # type: ignore[attr-defined] - # if the parent is a fused conv (Sequential), we can replace the first - # item to ref conv, otherwise we can update - # the conv instance in the module tree - if fused_conv is not None: - fused_conv[0] = ref_conv - else: - parent_name, name = _parent_name(self.conv_node.target) - setattr(modules[parent_name], name, ref_conv) - op_out = create_node_from_old_node_preserve_meta( - quantized_graph, - ('call_module', self.conv_node.target, args, {}), - self.conv_node) - if output_activation_post_process: - op_out = quantize_node( - op_out, - output_activation_post_process, - node, - modules, - quantized_graph, - node_name_to_scope, - is_input=False) - return op_out - else: - if convert_custom_config_dict is None: - convert_custom_config_dict = {} - additional_static_quant_mapping = convert_custom_config_dict.get("static", {}) - # 1. attach activation post process to module - self.conv.activation_post_process = output_activation_post_process - # 2. 
select quantized class - qconv_cls = get_static_quant_module_class( - type(self.conv), additional_static_quant_mapping, is_reference=is_reference) - quantized = qconv_cls.from_float(self.conv) - parent_name, name = _parent_name(self.conv_node.target) - setattr(modules[parent_name], name, quantized) - return create_node_from_old_node_preserve_meta( - quantized_graph, - ( - 'call_module', - self.conv_node.target, - (load_arg(quantized=torch.quint8)(self.conv_node.args[0]),), - {}, - ), - self.conv_node) - else: # call_function - assert self.conv_node.op == "call_function" - if is_reference: - # make sure the input and weight are quantized to torch.quint8, torch.qint8, respectively - load_arg(quantized={0: torch.quint8, 1: torch.qint8})(self.conv_node.args) - args = load_arg(quantized=torch.float)(self.conv_node.args) - kwargs = load_arg(quantized=torch.float)(self.conv_node.kwargs) - op_out = create_node_from_old_node_preserve_meta( - quantized_graph, - ("call_function", self.conv, args, kwargs), - self.conv_node) - if self.relu_node: - relu_args = [op_out] - relu_args.extend(load_arg(quantized=torch.float)(self.relu_node.args[1:])) - relu_kwargs = load_arg(quantized=torch.float)(self.relu_node.kwargs) - op_out = create_node_from_old_node_preserve_meta( - quantized_graph, - ("call_function", torch.nn.functional.relu, tuple(relu_args), relu_kwargs), - self.relu_node) - - if activation_int8_quantized: - root_module = modules[''] - act_post_process_name = self.relu_node.name if self.relu_node else self.conv_node.name - act_post_process_node = self.relu_node if self.relu_node else self.conv_node - activation_post_process = \ - self._maybe_get_last_node_only_observer(modules) - assert activation_post_process is not None - return quantize_node( - op_out, - activation_post_process, - act_post_process_node, - modules, - quantized_graph, - node_name_to_scope, - is_input=False) - else: - # output for dynamically quantized conv op is not quantized - return op_out - else: - assert len(self.conv_node.args) >= 7, \ - "only conv2d calls with all arguments specified is supported right now in is_reference=False option" - # make sure the input and weight are quantized to torch.quint8, torch.qint8, respectively - args = load_arg(quantized={0: torch.quint8, 1: torch.qint8})(self.conv_node.args) - # pack weight - weight = load_arg(quantized=torch.qint8)(self.conv_node.args[1]) - other_args = load_arg(quantized=torch.float)(self.conv_node.args[2:]) - bias, stride, padding, dilation, groups = other_args - if self.conv == torch.nn.functional.conv1d: - # F.conv1d can take `int` as well as `list[int]` for stride, - # padding, dilation, but the prepack op cannot. Convert - # these to lists if needed. 
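# Tiny illustrative helper (assumed name, not part of the diff) for the
# normalization described in the comment above: F.conv1d accepts either an int
# or a list of ints for stride/padding/dilation, while the prepack op only
# accepts lists.
from typing import List, Union

def _as_int_list(value: Union[int, List[int]]) -> List[int]:
    return [value] if isinstance(value, int) else value

# _as_int_list(1)   -> [1]
# _as_int_list([2]) -> [2]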
- stride = [stride] if isinstance(stride, int) else stride - padding = [padding] if isinstance(padding, int) else padding - dilation = [dilation] if isinstance(dilation, int) else dilation - prepack_args = (weight, bias, stride, padding, dilation, groups) - prepack_op = get_qconv_prepack_op(self.conv) - packed_weight = quantized_graph.create_node( - "call_function", prepack_op, prepack_args, {}) - assert activation_int8_quantized, \ - "currently only static quantization is supported for conv" - # construct conv input - if activation_int8_quantized: - qconv_op = get_qconv_op(self.conv, self.relu_node is not None) - conv_input = load_arg(quantized=torch.quint8)(self.conv_node.args[0]) - - activation_post_process = \ - self._maybe_get_last_node_only_observer(modules) - assert activation_post_process is not None - - scale, zero_point, _ = get_per_tensor_qparams(activation_post_process) - scale_node, zero_point_node = \ - create_qparam_nodes( - self.conv_node.name, scale, zero_point, modules, - quantized_graph, node_name_to_scope) - qconv_args = (conv_input, packed_weight, scale_node, zero_point_node) - kwargs = load_arg(quantized=torch.float)(self.conv_node.kwargs) - op = create_node_from_old_node_preserve_meta( - quantized_graph, - ('call_function', qconv_op, qconv_args, kwargs), - self.conv_node) - # Store the name of the fused op to get the path of node after fusion as well. - # TODO: may need to change the key to Node regenerate the map in each transformation, - # since we might not be able to rely on the name - node_name_to_scope[op.name] = node_name_to_scope[self.conv_node.name] - return op - else: - # conv2d_dyanmic branch - raise Exception("Only static quant is supported for conv") - -@register_quant_pattern(torch.nn.Linear) -@register_quant_pattern(torch.nn.functional.linear) -@register_quant_pattern(torch.nn.qat.Linear) -@register_quant_pattern(torch.nn.intrinsic.LinearReLU) -@register_quant_pattern(torch.nn.intrinsic.qat.LinearReLU) -@register_quant_pattern((torch.nn.functional.relu, torch.nn.functional.linear)) -@register_quant_pattern((torch.nn.ReLU, torch.nn.functional.linear)) -# for error checks -@register_quant_pattern((torch.nn.ReLU, torch.nn.Linear)) -@register_quant_pattern((torch.nn.functional.relu, torch.nn.Linear)) +# TODO: remove this class class LinearReLUQuantizeHandler(QuantizeHandler): - def __init__( - self, - node: Node, - modules: Dict[str, torch.nn.Module]): - super().__init__(node, modules) - self.relu_node = None - if (node.op == 'call_function' and node.target is torch.nn.functional.relu) or \ - (node.op == 'call_module' and isinstance(modules[str(node.target)], torch.nn.ReLU)): - self.relu_node = node - node = node.args[0] # type: ignore[assignment] - self.linear_node = node - if node.op == 'call_module': - self.linear = modules[str(self.linear_node.target)] - - def convert(self, - node: Node, - qconfig: QConfigAny, - modules: Dict[str, torch.nn.Module], - quantized_graph: Graph, - node_name_to_scope: Dict[str, Tuple[str, type]], - load_arg: Callable, - is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None) -> Node: - if convert_custom_config_dict is None: - convert_custom_config_dict = {} - # Supported combinations are: - # quant_type | activation (compute_type) | weight - # static quint8 qint8 - # dynamic float32 (quint8) qint8 - # weight_only float32 float16 - # tuple (activation_dtype, weight_dtype, compute_dtype) - supported_dtypes = [ - (torch.quint8, torch.qint8, None), - (torch.float32, torch.qint8, torch.quint8), - 
(torch.float32, torch.float16, None), - # static float16 quantization - (torch.float16, torch.float16, None), - ] - dtypes = get_qconfig_dtypes(qconfig) - # leave the op unquantized if the dtype combination is not supported - if not is_reference and dtypes not in supported_dtypes: - warnings.warn( - "dtype combination: {} is not " - "supported by Linear " - "supported dtype combinations are: {}".format(dtypes, supported_dtypes)) - if self.relu_node: - op_out = quantized_graph.node_copy(self.linear_node, load_arg(quantized=torch.float)) - relu_args = [op_out] - relu_args.extend(load_arg(quantized=torch.float)(self.relu_node.args[1:])) - relu_kwargs = load_arg(quantized=torch.float)(self.relu_node.kwargs) - return create_node_from_old_node_preserve_meta( - quantized_graph, - ("call_function", torch.nn.functional.relu, tuple(relu_args), relu_kwargs), - self.relu_node) - else: - return quantized_graph.node_copy(node, load_arg(quantized=None)) - - activation_int8_quantized = activation_is_int8_quantized(qconfig) - activation_statically_quantized = activation_is_statically_quantized(qconfig) - weight_dtype = dtypes[1] - if self.linear_node.op == 'call_module': - - output_activation_post_process = \ - self._maybe_get_last_node_only_observer(modules) + pass - # note that relu should already be fused into linear modul in the fusion step - assert self.relu_node is None, 'linear module and relu fusion is not executed, ' \ - 'please make sure to run fusion before prepare' - # we'll always produce reference pattern for the following modules - # will remove the else branch after we migrated all use cases - module_allowlist = [ - torch.nn.Linear, - torch.nn.qat.Linear, - torch.nn.intrinsic.modules.fused.LinearReLU, - torch.nn.intrinsic.qat.modules.linear_relu.LinearReLU - ] - if is_reference or type(self.linear) in module_allowlist and dtypes in [(torch.quint8, torch.qint8, None)]: - # produce dequant - float_op - quant pattern - dtype = torch.float - if activation_int8_quantized: - dtype = activation_dtype(qconfig) - activation = load_arg(quantized=dtype)(self.linear_node.args[0]) - args = load_arg(quantized=torch.float)(self.linear_node.args) - - # Get the float linear and attach qscheme and qparams the the module - float_linear = self.linear - fused_linear = None - if isinstance(float_linear, (torch.nn.qat.Linear, torch.nn.intrinsic.qat.LinearReLU)): - float_linear = float_linear.to_float() - # change qat linear to linear - parent_name, name = _parent_name(self.linear_node.target) - setattr(modules[parent_name], name, float_linear) - # Attach weight fake quant to the linear module - if isinstance(float_linear, torch.nn.intrinsic.LinearReLU): - fused_linear = float_linear - float_linear = float_linear[0] - weight_post_process = self.linear.weight_fake_quant - else: - if isinstance(float_linear, torch.nn.intrinsic.LinearReLU): - fused_linear = float_linear - float_linear = self.linear[0] # type: ignore[index] - # Attach the weight observer to the module - weight_post_process = qconfig.weight() # type: ignore[union-attr] - - # Run weight observer - # TODO: This is currently a hack for QAT to get the right shapes for scale and zero point. 
- # In the future, we should require the user to calibrate the model after calling prepare - weight_post_process(float_linear.weight) # type: ignore[operator] - - weight_qparams = get_qparam_dict(weight_post_process) - # TODO: include the configuration in backend_config_dict - # we can have a map from module to reference module - # and allow user to register new ones - qlinear_cls = get_static_quant_module_class( - type(float_linear), is_reference=True) - ref_linear = qlinear_cls.from_float(float_linear, weight_qparams) - - # if the parent is a fused linear (Sequential), we can replace the first - # item to ref linear, otherwise we can update - # the linear instance in the module tree - if fused_linear is not None: - fused_linear[0] = ref_linear - else: - parent_name, name = _parent_name(self.linear_node.target) - setattr(modules[parent_name], name, ref_linear) - op_out = create_node_from_old_node_preserve_meta( - quantized_graph, - ('call_module', self.linear_node.target, args, {}), - self.linear_node) - if output_activation_post_process: - op_out = quantize_node( - op_out, - output_activation_post_process, - node, - modules, - quantized_graph, - node_name_to_scope, - is_input=False) - return op_out - # non-reference option - else: - # 1. attach output activation post process to linear module - if output_activation_post_process: - self.linear.activation_post_process = output_activation_post_process - - # 2. select corresponding quantized linear class for the float linear class - if activation_int8_quantized: - additional_static_quant_mapping = convert_custom_config_dict.get("static", {}) - qlinear = get_static_quant_module_class( - type(self.linear), additional_static_quant_mapping) - else: - assert dtypes in [ - (torch.float32, torch.qint8, torch.quint8), - (torch.float32, torch.float16, None), - ], f"dtype {dtypes} not supported yet" - additional_dynamic_quant_mapping = convert_custom_config_dict.get("dynamic", {}) - qlinear = get_dynamic_quant_module_class(type(self.linear), additional_dynamic_quant_mapping) - - quantized = qlinear.from_float(self.linear) - parent_name, name = _parent_name(self.linear_node.target) - setattr(modules[parent_name], name, quantized) - # activation needs to be quantized for static quantization - dtype = torch.float - if activation_int8_quantized: - dtype = activation_dtype(qconfig) - return create_node_from_old_node_preserve_meta( - quantized_graph, - ( - 'call_module', - self.linear_node.target, - (load_arg(quantized=dtype)(self.linear_node.args[0]),), {}, - ), - self.linear_node) - else: # call_function - assert self.linear_node.op == 'call_function' - if is_reference: - quantized_input_dtypes = [torch.float, torch.float] - if activation_int8_quantized: - quantized_input_dtypes[0] = torch.quint8 - if weight_is_statically_quantized(qconfig): - quantized_input_dtypes[1] = torch.qint8 - args = load_arg(quantized=quantized_input_dtypes)(self.linear_node.args) - args = load_arg(quantized=torch.float)(self.linear_node.args) - kwargs = load_arg(quantized=torch.float)(self.linear_node.kwargs) - op_out = create_node_from_old_node_preserve_meta( - quantized_graph, - ("call_function", torch.nn.functional.linear, args, kwargs), - self.linear_node) - if self.relu_node: - relu_args = [op_out] - relu_args.extend(load_arg(quantized=torch.float)(self.relu_node.args[1:])) - relu_kwargs = load_arg(quantized=torch.float)(self.relu_node.kwargs) - op_out = create_node_from_old_node_preserve_meta( - quantized_graph, - ("call_function", torch.nn.functional.relu, 
tuple(relu_args), relu_kwargs), - self.relu_node) - - if activation_statically_quantized: - # quantize output for statically quantized linear op - root_module = modules[''] - act_post_process_name = self.relu_node.name if self.relu_node else self.linear_node.name - act_post_process_node = self.relu_node if self.relu_node else self.linear_node - activation_post_process = \ - self._maybe_get_last_node_only_observer(modules) - assert activation_post_process is not None - return quantize_node( - op_out, - activation_post_process, - act_post_process_node, - modules, - quantized_graph, - node_name_to_scope, - is_input=False) - else: - # output for dynamically quantized linear op is not quantized - return op_out - else: # non-reference option - # prepacking weights for static int8 quant and dynamic quant - if dtypes != (torch.float16, torch.float16, None): - # linear args - # (x, weight, bias, ...) - # TODO: the name should be weight is int8 quantized - weight_quantized = weight_is_statically_quantized(qconfig) - dtype = weight_dtype if weight_quantized else torch.float - linear_weight = load_arg(quantized=dtype)(self.linear_node.args[1]) - - # get other arguments - kwargs = {**load_arg(quantized=torch.float)(self.linear_node.kwargs)} - # all args after bias, including bias - other_args = load_arg(quantized=torch.float)(self.linear_node.args[2:]) - # bias might be either positional, or a keyword argument - if len(self.linear_node.args) > 2: - bias = load_arg(quantized=torch.float)(self.linear_node.args[2]) - other_args = other_args[1:] # remove the bias argument - else: - bias = kwargs.pop('bias', None) - - prepack_args = (linear_weight, bias) - prepack_op = get_linear_prepack_op_for_dtype(weight_dtype) - packed_weight = quantized_graph.create_node( - 'call_function', prepack_op, prepack_args, {}) - # construct linear input - if activation_int8_quantized: - qlinear_op = torch.ops.quantized.linear_relu if self.relu_node else torch.ops.quantized.linear - linear_input = load_arg(quantized=torch.quint8)(self.linear_node.args[0]) - activation_post_process = \ - self._maybe_get_last_node_only_observer(modules) - assert activation_post_process is not None - scale, zero_point, _ = get_per_tensor_qparams(activation_post_process) - scale_node, zero_point_node = \ - create_qparam_nodes( - self.linear_node.name, scale, zero_point, modules, - quantized_graph, node_name_to_scope) - - qlinear_args = (linear_input, packed_weight, scale_node, zero_point_node) - op = create_node_from_old_node_preserve_meta( - quantized_graph, - ("call_function", qlinear_op, qlinear_args, kwargs), - self.linear_node) - # Store the name of the fused op to get the path of node after fusion as well. 
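# A minimal sketch (hypothetical names and values) of the bookkeeping mentioned
# in the comment above: node_name_to_scope maps a node name to the fully
# qualified module path and module type it came from, and a node created to
# replace an existing one simply inherits that entry so later passes can still
# resolve its path after fusion.
from typing import Dict, Tuple

import torch

node_name_to_scope: Dict[str, Tuple[str, type]] = {
    "linear_1": ("encoder.fc", torch.nn.Linear),  # hypothetical traced entry
}

def inherit_scope(new_node_name: str, old_node_name: str) -> None:
    node_name_to_scope[new_node_name] = node_name_to_scope[old_node_name]

inherit_scope("quantized_linear_1", "linear_1")
assert node_name_to_scope["quantized_linear_1"] == ("encoder.fc", torch.nn.Linear)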
- # TODO: may need to change the key to Node regenerate the map in each transformation, - # since we might not be able to rely on the name - node_name_to_scope[op.name] = node_name_to_scope[self.linear_node.name] - return op - elif dtypes in [(torch.float32, torch.qint8, torch.quint8), - (torch.float32, torch.float16, None)]: - # choose linear dynamic or linear dynamic fp16 op based on weight dtype - if weight_dtype == torch.qint8: - if self.relu_node: - qlinear_op = torch.ops.quantized.linear_relu_dynamic - else: - qlinear_op = torch.ops.quantized.linear_dynamic - else: - if self.relu_node: - qlinear_op = torch.ops.quantized.linear_relu_dynamic_fp16 - else: - qlinear_op = torch.ops.quantized.linear_dynamic_fp16 - - linear_input = load_arg(quantized=torch.float)(self.linear_node.args[0]) - qlinear_args = (linear_input, packed_weight) # type: ignore[assignment] - op_out = create_node_from_old_node_preserve_meta( - quantized_graph, - ("call_function", qlinear_op, qlinear_args, kwargs), - self.linear_node) - # Store the name of the dynamic op to get the path of node after replacement as well. - # TODO: may need to change the key to Node regenerate the map in each transformation, - # since we might not be able to rely on the name - node_name_to_scope[op_out.name] = node_name_to_scope[self.linear_node.name] - return op_out - else: - assert dtypes == (torch.float16, torch.float16, None) - # TODO (refactor) this is duplicated, maybe have a helper function - if self.relu_node: - op_out = quantized_graph.node_copy(self.linear_node, load_arg(quantized=torch.float)) - relu_args = [op_out] - relu_args.extend(load_arg(quantized=torch.float)(self.relu_node.args[1:])) - relu_kwargs = load_arg(quantized=torch.float)(self.relu_node.kwargs) - op_out = create_node_from_old_node_preserve_meta( - quantized_graph, - ("call_function", torch.nn.functional.relu, tuple(relu_args), relu_kwargs), - self.relu_node) - else: - op_out = quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - return quantized_graph.create_node( - "call_method", "to", (op_out, torch.float16), {}) - -@register_quant_pattern(torch.nn.BatchNorm2d) -@register_quant_pattern(torch.nn.BatchNorm3d) -@register_quant_pattern(torch.nn.intrinsic.BNReLU2d) -@register_quant_pattern(torch.nn.intrinsic.BNReLU3d) +# TODO: remove this class class BatchNormQuantizeHandler(QuantizeHandler): - def __init__( - self, - node: Node, - modules: Dict[str, torch.nn.Module]): - super().__init__(node, modules) - assert node.op == 'call_module' - self.bn_node = node - self.bn = modules[str(self.bn_node.target)] - - def convert(self, - node: Node, - qconfig: QConfigAny, - modules: Dict[str, torch.nn.Module], - quantized_graph: Graph, - node_name_to_scope: Dict[str, Tuple[str, type]], - load_arg: Callable, - is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None) -> Node: - if convert_custom_config_dict is None: - convert_custom_config_dict = {} - additional_static_quant_mapping = convert_custom_config_dict.get("static", {}) - # 1. 
attach activation post process to module - output_activation_post_process = \ - self._maybe_get_last_node_only_observer(modules) - assert output_activation_post_process is not None - if is_reference: - # produce dequant - float_op - quant pattern - dtype = activation_dtype(qconfig) - activation = load_arg(quantized=dtype)(self.bn_node.args[0]) - args = load_arg(quantized=torch.float)(self.bn_node.args) - op_out = create_node_from_old_node_preserve_meta( - quantized_graph, - ("call_module", self.bn_node.target, args, {}), - self.bn_node) - if output_activation_post_process: - op_out = quantize_node( - op_out, - output_activation_post_process, - node, - modules, - quantized_graph, - node_name_to_scope, - is_input=False) - return op_out - else: - self.bn.activation_post_process = output_activation_post_process - qbn_cls = get_static_quant_module_class(type(self.bn), additional_static_quant_mapping) - quantized = qbn_cls.from_float(self.bn) - parent_name, name = _parent_name(self.bn_node.target) - setattr(modules[parent_name], name, quantized) - return create_node_from_old_node_preserve_meta( - quantized_graph, - ( - 'call_module', - self.bn_node.target, - load_arg(quantized=[0])(self.bn_node.args), - load_arg(quantized=torch.float)(self.bn_node.kwargs), - ), - self.bn_node) + pass -@register_quant_pattern(torch.nn.qat.Embedding) -@register_quant_pattern(torch.nn.qat.EmbeddingBag) -@register_quant_pattern(torch.nn.Embedding) -@register_quant_pattern(torch.nn.EmbeddingBag) +# TODO: remove this class class EmbeddingQuantizeHandler(QuantizeHandler): - def __init__( - self, - node: Node, - modules: Dict[str, torch.nn.Module]): - super().__init__(node, modules) - - def input_output_observed(self) -> bool: - return False - - def convert(self, - node: Node, - qconfig: QConfigAny, - modules: Dict[str, torch.nn.Module], - quantized_graph: Graph, - node_name_to_scope: Dict[str, Tuple[str, type]], - load_arg: Callable, - is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None) -> Node: - # Supported combinations are: - # quant_type | activation | weight | activation_compute_type - # weight_only | float32 | quint8 | None - # weight_only | float32 | quint4x2 | None - # tuple (activation_dtype, weight_dtype, compute_dtype) - supported_dtypes = [ - (torch.float32, torch.quint8, None), - (torch.float32, torch.quint4x2, None), - ] - assert node.op == 'call_module' - emb_node = node - dtypes = get_qconfig_dtypes(qconfig) - # leave the op unquantized if the dtype combination is not supported - if dtypes not in supported_dtypes: - warnings.warn( - "dtype combination: {} is not " - "supported by Embedding/EmbeddingBag, " - "supported dtype combinations are: {}".format(dtypes, supported_dtypes)) - return quantized_graph.node_copy(node, load_arg(quantized=None)) - - emb = modules[str(emb_node.target)] - qemb = get_static_quant_module_class(type(emb)) - quantized = qemb.from_float(emb) - parent_name, name = _parent_name(emb_node.target) - setattr(modules[parent_name], name, quantized) - return create_node_from_old_node_preserve_meta( - quantized_graph, - ( - 'call_module', - emb_node.target, - load_arg(quantized=torch.float)(emb_node.args), - load_arg(quantized=torch.float)(emb_node.kwargs), - ), - emb_node) + pass -# TODO (maybe): merge with embedding quantize handler -@register_quant_pattern(torch.nn.GRUCell) -@register_quant_pattern(torch.nn.LSTMCell) -@register_quant_pattern(torch.nn.RNNCell) -@register_quant_pattern(torch.nn.LSTM) +# TODO: remove this class class 
RNNDynamicQuantizeHandler(QuantizeHandler): - def __init__( - self, - node: Node, - modules: Dict[str, torch.nn.Module]): - super().__init__(node, modules) - - def input_output_observed(self) -> bool: - return False - - def convert(self, - node: Node, - qconfig: QConfigAny, - modules: Dict[str, torch.nn.Module], - quantized_graph: Graph, - node_name_to_scope: Dict[str, Tuple[str, type]], - load_arg: Callable, - is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None) -> Node: - # Supported combinations are: - # quant_type | activation | weight | activation_compute_type - # dynamic | float32 | qint8 | quint8 - # dynamic | float32 | float16 | None - # tuple (activation_dtype, weight_dtype, compute_dtype) - supported_dtypes = [ - (torch.float32, torch.qint8, torch.quint8), - (torch.float32, torch.float16, None), - ] - assert node.op == 'call_module' - dtypes = get_qconfig_dtypes(qconfig) - # leave the op unquantized if the dtype combination is not supported - if dtypes not in supported_dtypes: - warnings.warn( - "dtype combination: {} is not " - "supported by Embedding/EmbeddingBag, " - "supported dtype combinations are: {}".format(dtypes, supported_dtypes)) - return quantized_graph.node_copy(node, load_arg(quantized=None)) - - module = modules[str(node.target)] - qmodule_cls = get_dynamic_quant_module_class(type(module)) - qmodule = qmodule_cls.from_float(module) - parent_name, name = _parent_name(node.target) - setattr(modules[parent_name], name, qmodule) - return create_node_from_old_node_preserve_meta( - quantized_graph, - ( - 'call_module', - node.target, - load_arg(quantized=torch.float)(node.args), - load_arg(quantized=torch.float)(node.kwargs), - ), - node) + pass -ARGS_TO_SKIP = { - torch._ops.ops.quantized.hardswish: ['inplace'], - torch._ops.ops.quantized.elu: ['inplace'], - torch._ops.ops.quantized.dropout: ['inplace'], - torch._ops.ops.quantized.instance_norm: - ['running_mean', 'running_var', 'use_input_stats', 'momentum'], -} -@register_quant_pattern(torch.nn.ConvTranspose1d) -@register_quant_pattern(torch.nn.ConvTranspose2d) -@register_quant_pattern(torch.nn.ELU) -@register_quant_pattern(torch.nn.LeakyReLU) -@register_quant_pattern(torch.nn.Hardswish) -@register_quant_pattern(torch.nn.InstanceNorm1d) -@register_quant_pattern(torch.nn.InstanceNorm2d) -@register_quant_pattern(torch.nn.InstanceNorm3d) -@register_quant_pattern(torch.nn.LayerNorm) -@register_quant_pattern(torch.nn.SiLU) -@register_quant_pattern(torch.nn.Mish) -@register_quant_pattern(torch.nn.Dropout) -# we currently only support reference patterns for these ops so they have been removed -# until they receive a proper fp16 kernel. To use the reference pattern, use a custom qconfig -# @register_quant_pattern(torch.nn.GELU) -# @register_quant_pattern(torch.nn.Softmax) -@register_quant_pattern(torch.nn.functional.elu) -@register_quant_pattern(torch.nn.functional.hardswish) -@register_quant_pattern(torch.nn.functional.instance_norm) -@register_quant_pattern(torch.nn.functional.layer_norm) -@register_quant_pattern(torch.nn.functional.leaky_relu) -@register_quant_pattern(torch.nn.functional.silu) -@register_quant_pattern(torch.nn.functional.mish) -@register_quant_pattern(torch.nn.functional.dropout) -# we currently only support reference patterns for these ops so they have been removed -# until they receive a proper fp16 kernel. 
To use the reference pattern, use a custom qconfig -# @register_quant_pattern(torch.nn.functional.gelu) -# @register_quant_pattern(torch.nn.functional.softmax) -@register_quant_pattern(torch.sum) +# TODO: remove this class class DefaultNodeQuantizeHandler(QuantizeHandler): """ Common quantized op, first input and first output will be quantized """ - def __init__( - self, - node: Node, - modules: Dict[str, torch.nn.Module]): - super().__init__(node, modules) - if node.op == "call_function" or node.op == "call_method": - self.op = node.target - elif node.op == "call_module": - self.op = type(modules[str(node.target)]) - - def is_output_quantized(self, qconfig): - dtypes = get_qconfig_dtypes(qconfig) - return self.op in default_op_supported_dtypes and \ - dtypes in default_op_supported_dtypes[self.op] - - def convert(self, - node: Node, - qconfig: QConfigAny, - modules: Dict[str, torch.nn.Module], - quantized_graph: Graph, - node_name_to_scope: Dict[str, Tuple[str, type]], - load_arg: Callable, - is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None) -> Node: - if not self.all_node_args_are_tensors: - return NotImplemented - assert node.op in ['call_module', 'call_function'], 'Only call_module and ' + \ - 'call_function are handled in DefaultNode' - if convert_custom_config_dict is None: - convert_custom_config_dict = {} - additional_static_quant_mapping = convert_custom_config_dict.get("static", {}) - - dtypes = get_qconfig_dtypes(qconfig) - if not is_reference and dtypes not in default_op_supported_dtypes[self.op]: - warnings.warn( - "dtype combination: {} is not " - "supported by {} " - "supported dtype combinations are: {}".format(dtypes, self.op, default_op_supported_dtypes[self.op])) - return quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - # TODO: make helper functions for (torch.quint8, torch.qint8, None) - if not is_reference: - if dtypes in [(torch.quint8, torch.qint8, None)]: - activation_post_process = \ - self._maybe_get_last_node_only_observer(modules) - assert activation_post_process is not None - if node.op == 'call_module': - module = modules[str(node.target)] - module.activation_post_process = activation_post_process - quantized_module_cls = get_static_quant_module_class( - type(module), additional_static_quant_mapping) - quantized_module = quantized_module_cls.from_float(module) - parent_name, name = _parent_name(node.target) - setattr(modules[parent_name], name, quantized_module) - return create_node_from_old_node_preserve_meta( - quantized_graph, - ( - 'call_module', - node.target, - load_arg(quantized=[0])(node.args), - load_arg(quantized=torch.float)(node.kwargs), - ), - node) - else: - assert node.op == "call_function" - # call_function - scale, zero_point = activation_post_process.calculate_qparams() # type: ignore[operator] - scale = float(scale) - zero_point = int(zero_point) - scale_arg, zero_point_arg = \ - create_qparam_nodes( - node.name, scale, zero_point, modules, - quantized_graph, node_name_to_scope) + pass - assert not isinstance(node.target, str), "Expecting node.target for " - "call_function to be a function instead of a string" - quantized_op = get_quantized_operator(node.target) - args = load_arg(quantized=[0])(node.args) - kwargs = {**load_arg(quantized=torch.float)(node.kwargs), "output_scale": scale_arg, - "output_zero_point": zero_point_arg} - if quantized_op in ARGS_TO_SKIP: - args_to_skip = ARGS_TO_SKIP[quantized_op] - for arg in args_to_skip: - if arg in kwargs: - kwargs.pop(arg) - return 
create_node_from_old_node_preserve_meta( - quantized_graph, - ("call_function", quantized_op, args, kwargs), # type: ignore[arg-type] - node) - else: - assert dtypes in [(torch.float16, torch.float16, None)] - # Generally fp16 kernels don't exist for fp16 ops - warnings.warn( - "Only reference patterns are currently supported for {dtype} dtype with {op} op" - "".format(dtype=dtypes, op=self.op)) - op_out = quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - return quantized_graph.create_node( - "call_method", "to", (op_out, torch.float16), {}) - else: - assert is_reference - # We can produce reference for a dtypes including - # (torch.quint8, torch.qint8, torch.qint32, torch.float16) - act_dtype = activation_dtype(qconfig) - if act_dtype == torch.float: - op_out = quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - return op_out - else: - activation_post_process = \ - self._maybe_get_last_node_only_observer(modules) - assert activation_post_process is not None - # make sure the input is quantized to act_dtype - load_arg(quantized={0: act_dtype})(node.args) - args = load_arg(quantized=torch.float)(node.args) - kwargs = load_arg(quantized=torch.float)(node.kwargs) - op_out = quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - return quantize_node( - op_out, activation_post_process, - node, modules, quantized_graph, node_name_to_scope, is_input=False) - -@register_quant_pattern(torch.nn.Hardsigmoid, default_affine_fixed_qparams_observer) -@register_quant_pattern(torch.nn.functional.hardsigmoid, default_affine_fixed_qparams_observer) -@register_quant_pattern('hardsigmoid', default_affine_fixed_qparams_observer) -@register_quant_pattern('hardsigmoid_', default_affine_fixed_qparams_observer) -@register_quant_pattern(torch.nn.Sigmoid, default_affine_fixed_qparams_observer) -@register_quant_pattern(torch.sigmoid, default_affine_fixed_qparams_observer) -@register_quant_pattern('sigmoid', default_affine_fixed_qparams_observer) -@register_quant_pattern('sigmoid_', default_affine_fixed_qparams_observer) -@register_quant_pattern(torch.nn.Tanh, default_symmetric_fixed_qparams_observer) -@register_quant_pattern(torch.tanh, default_symmetric_fixed_qparams_observer) -@register_quant_pattern('tanh', default_symmetric_fixed_qparams_observer) -@register_quant_pattern('tanh_', default_symmetric_fixed_qparams_observer) +# TODO: remove this class class FixedQParamsOpQuantizeHandler(QuantizeHandler): - def __init__(self, - node: Node, - modules: Dict[str, torch.nn.Module]): - super().__init__(node, modules) - self.node = node - - def should_mark_output_quantized_from_input_quantized_status( - self, - qconfig: QConfigAny - ) -> bool: - # FixQParamOps are the same as CopyNode in int8 quantization - return activation_dtype(qconfig) in [torch.quint8, torch.qint8] - - # some qhandlers override the activations constructor - def get_activation_ctr(self, qconfig, pattern, is_training) -> Optional[Callable]: - act_dtype = activation_dtype(qconfig) - if act_dtype == torch.quint8: - return get_default_output_activation_post_process_map(is_training).get( - pattern, qconfig.activation) - else: - return qconfig.activation + pass - def convert(self, - node: Node, - qconfig: QConfigAny, - modules: Dict[str, torch.nn.Module], - quantized_graph: Graph, - node_name_to_scope: Dict[str, Tuple[str, type]], - load_arg: Callable, - is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None) -> Node: - if not is_reference: - dtypes = get_qconfig_dtypes(qconfig) - if dtypes 
== (torch.float16, torch.float16, None): - op_out = quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - return quantized_graph.create_node( - "call_method", "to", (op_out, torch.float16,), {} - ) - else: - return quantized_graph.node_copy(node, load_arg(quantized=None)) - else: - act_dtype = activation_dtype(qconfig) - if act_dtype == torch.float: - op_out = quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - return op_out - else: - activation_post_process = \ - self._maybe_get_last_node_only_observer(modules) - assert activation_post_process is not None - # make sure the input is quantized to act_dtype - load_arg(quantized={0: act_dtype})(node.args) - args = load_arg(quantized=torch.float)(node.args) - kwargs = load_arg(quantized=torch.float)(node.kwargs) - op_out = quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - return quantize_node( - op_out, activation_post_process, - node, modules, quantized_graph, node_name_to_scope, is_input=False) - -@register_quant_pattern(torch.nn.AdaptiveAvgPool1d) -@register_quant_pattern(torch.nn.AdaptiveAvgPool2d) -@register_quant_pattern(torch.nn.AdaptiveAvgPool3d) -@register_quant_pattern(torch.nn.AvgPool1d) -@register_quant_pattern(torch.nn.AvgPool2d) -@register_quant_pattern(torch.nn.AvgPool3d) -@register_quant_pattern(torch.nn.Hardtanh) -@register_quant_pattern(torch.nn.MaxPool1d) -@register_quant_pattern(torch.nn.MaxPool2d) -@register_quant_pattern(torch.nn.MaxPool3d) -@register_quant_pattern(torch.nn.ReLU) -@register_quant_pattern(torch.nn.ReLU6) -@register_quant_pattern(torch.adaptive_avg_pool1d) -@register_quant_pattern(torch.nn.functional.adaptive_avg_pool2d) -@register_quant_pattern(torch.nn.functional.adaptive_avg_pool3d) -@register_quant_pattern(torch.nn.functional.hardtanh) -@register_quant_pattern(torch.nn.functional.hardtanh_) -@register_quant_pattern(torch.nn.functional.interpolate) -@register_quant_pattern(torch.nn.functional.max_pool1d) -@register_quant_pattern(torch.nn.functional.max_pool2d) -@register_quant_pattern(torch.nn.functional.max_pool3d) -@register_quant_pattern(torch.nn.functional.relu) -@register_quant_pattern(torch.nn.functional.relu6) -@register_quant_pattern(torch.avg_pool1d) -@register_quant_pattern(torch._C._nn.avg_pool2d) -@register_quant_pattern(torch._C._nn.avg_pool3d) -@register_quant_pattern(torch.clamp) -@register_quant_pattern(torch.flatten) -@register_quant_pattern(torch.mean) -@register_quant_pattern(operator.floordiv) -@register_quant_pattern('clamp') -@register_quant_pattern('mean') -@register_quant_pattern('relu') -@register_quant_pattern('relu_') +# TODO: remove class CopyNodeQuantizeHandler(QuantizeHandler): - """ Operators that works on both float and quantized input - if input is quantized, the output Tensor shares - the same quantization parameter with input. - These ops will do computation on the input Tensor, e.g. average pool, so we will - insert extra observer/fake_quant for the output of these operators. 
- TODO: maybe rename this to TensorValueOpQuantizeHandler - """ - def should_mark_output_quantized_from_input_quantized_status( - self, - qconfig: QConfigAny - ) -> bool: - return True - - def is_general_tensor_value_op(self) -> bool: - return True - - def convert(self, - node: Node, - qconfig: QConfigAny, - modules: Dict[str, torch.nn.Module], - quantized_graph: Graph, - node_name_to_scope: Dict[str, Tuple[str, type]], - load_arg: Callable, - is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None) -> Node: + pass - is_call_function, is_call_method, is_call_module = check_node(node, modules) - if is_reference or (is_call_function or is_call_method or is_call_module): - # when activation dtype is torch.float, the node does not require - # observation - # e.g. dynamic quantization or weight_only quantization - act_dtype = activation_dtype(qconfig) - if act_dtype == torch.float: - op_out = quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - return op_out - else: - activation_post_process = \ - self._maybe_get_last_node_only_observer(modules) - assert activation_post_process is not None - # make sure the input is quantized to act_dtype - load_arg(quantized={0: act_dtype})(node.args) - args = list(load_arg(quantized=torch.float)(node.args)) - kwargs = load_arg(quantized=torch.float)(node.kwargs) - op_out = quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - return quantize_node( - op_out, - activation_post_process, - node, modules, quantized_graph, node_name_to_scope, is_input=False) - else: - return quantized_graph.node_copy(node, load_arg(quantized=None)) - -class CustomModuleQuantizeHandler(QuantizeHandler): - def convert(self, - node: Node, - qconfig: QConfigAny, - modules: Dict[str, torch.nn.Module], - quantized_graph: Graph, - node_name_to_scope: Dict[str, Tuple[str, type]], - load_arg: Callable, - is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None) -> Node: - """ Convert a float custom module to quantized custom module - """ - assert node.op == 'call_module' - assert convert_custom_config_dict is not None - custom_module_class_mapping = convert_custom_config_dict.get("observed_to_quantized_custom_module_class", None) - assert custom_module_class_mapping is not None - observed_custom_module = modules[str(node.target)] - if activation_is_statically_quantized(qconfig): - activation_post_process = \ - self._maybe_get_last_node_only_observer(modules) - assert activation_post_process is not None - observed_custom_module.activation_post_process = activation_post_process - quantized_custom_module_class = get_swapped_custom_module_class( - observed_custom_module, custom_module_class_mapping, qconfig) - quantized_custom_module = \ - quantized_custom_module_class.from_observed(observed_custom_module) - parent_name, name = _parent_name(node.target) - setattr(modules[parent_name], name, quantized_custom_module) - # hardcoded the quntized input to be None (take whatever is in the environemnt), - # we can extend this - # if there is a need, e.g. 
get the indexes of quantized inputs from some - # module attribute like module._QUANTIZED_INPUT_INDEXES - return quantized_graph.node_copy(node, load_arg(quantized=None)) - -@register_quant_pattern(torch.nn.Identity) -@register_quant_pattern(torch.transpose) -@register_quant_pattern(torch.repeat_interleave) -@register_quant_pattern(torch.squeeze) -@register_quant_pattern(torch.stack) -@register_quant_pattern(torch.unsqueeze) -@register_quant_pattern('contiguous') -@register_quant_pattern('detach') -@register_quant_pattern('detach_') -@register_quant_pattern('permute') -@register_quant_pattern('repeat') -@register_quant_pattern('repeat_interleave') -@register_quant_pattern('reshape') -@register_quant_pattern('resize_') -@register_quant_pattern('shape') -@register_quant_pattern('size') -@register_quant_pattern('squeeze') -@register_quant_pattern('squeeze_') -@register_quant_pattern('transpose') -@register_quant_pattern('unsqueeze') -@register_quant_pattern('unsqueeze_') -@register_quant_pattern('view') +# TODO: remove class GeneralTensorShapeOpQuantizeHandler(QuantizeHandler): - """ Operators that works on both float and quantized input - if input is quantized, the output Tensor shares - the same quantization parameter with input. - These ops only do rearrangement of Tensor values, for - example reshape, or just query the information about Tensor - e.g. size, and we do not insert extra observer/fake_quant - for the output of the operator. - """ - def is_general_tensor_shape_op(self) -> bool: - return True + pass - def should_mark_output_quantized_from_input_quantized_status( - self, - qconfig: QConfigAny - ) -> bool: - return True - - def convert(self, - node: Node, - qconfig: QConfigAny, - modules: Dict[str, torch.nn.Module], - quantized_graph: Graph, - node_name_to_scope: Dict[str, Tuple[str, type]], - load_arg: Callable, - is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None) -> Node: - # when activation dtype is torch.float, the node does not require - # observation - # e.g. dynamic quantization or weight_only quantization - act_dtype = activation_dtype(qconfig) - if act_dtype == torch.float: - op_out = quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - return op_out - else: - activation_post_process = \ - self._maybe_get_last_node_only_observer(modules) - if activation_post_process is not None: - args = list(load_arg(quantized=torch.float)(node.args)) - kwargs = load_arg(quantized=torch.float)(node.kwargs) - op_out = quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - return quantize_node( - op_out, - activation_post_process, - node, modules, quantized_graph, node_name_to_scope, is_input=False) - else: - return quantized_graph.node_copy(node, load_arg(quantized=torch.float)) +# TODO: not used, can be removed after torch.quantization namespace is deprecated +class CustomModuleQuantizeHandler(QuantizeHandler): + pass +# TODO: not used, can be removed after torch.quantization namespace is deprecated class StandaloneModuleQuantizeHandler(QuantizeHandler): - """ Converts an observed standalone module to quantized standalone module - by calling convert_fx on the observed standalone module. 
- """ - def convert(self, - node: Node, - qconfig: QConfigAny, - modules: Dict[str, torch.nn.Module], - quantized_graph: Graph, - node_name_to_scope: Dict[str, Tuple[str, type]], - load_arg: Callable, - is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None) -> Node: - assert node.op == 'call_module' - convert = torch.ao.quantization.quantize_fx._convert_standalone_module_fx # type: ignore[attr-defined] - # We know that observed standalone module is a GraphModule since - # it's produced by us - observed_standalone_module : GraphModule = modules[str(node.target)] # type: ignore[assignment] - input_quantized_idxs = observed_standalone_module._standalone_module_input_quantized_idxs.tolist() # type: ignore[operator] - quantized_standalone_module = convert(observed_standalone_module, is_reference=is_reference) - parent_name, name = _parent_name(node.target) - # update the modules dict - setattr(modules[parent_name], name, quantized_standalone_module) - modules[str(node.target)] = quantized_standalone_module - return quantized_graph.node_copy(node, load_arg(quantized=input_quantized_idxs)) + pass diff --git a/torch/ao/quantization/fx/quantization_types.py b/torch/ao/quantization/fx/quantization_types.py deleted file mode 100644 index 859f4b2d456a..000000000000 --- a/torch/ao/quantization/fx/quantization_types.py +++ /dev/null @@ -1,10 +0,0 @@ -from typing import Any, Tuple, Union -from torch.fx import Node -from ..utils import Pattern # noqa: F401 - -NodePattern = Union[Tuple[Node, Node], Tuple[Node, Tuple[Node, Node]], Any] - -# This is the Quantizer class instance from torch/quantization/fx/quantize.py. -# Define separately to prevent circular imports. -# TODO(future PR): improve this. -QuantizerCls = Any diff --git a/torch/ao/quantization/fx/quantized_fusion_patterns_and_replacements.py b/torch/ao/quantization/fx/quantized_fusion_patterns_and_replacements.py deleted file mode 100644 index ce23f17db71d..000000000000 --- a/torch/ao/quantization/fx/quantized_fusion_patterns_and_replacements.py +++ /dev/null @@ -1,152 +0,0 @@ -import torch - -def relu_inplace_pattern(x, scale, zero_point): - x = x.dequantize() - x = torch.nn.functional.relu(x, inplace=True) - x = torch.quantize_per_tensor(x, scale, zero_point, torch.quint8) - return x - -def relu_non_inplace_pattern(x, scale, zero_point): - x = x.dequantize() - x = torch.nn.functional.relu(x, inplace=False) - x = torch.quantize_per_tensor(x, scale, zero_point, torch.quint8) - return x - -def relu_replacement(x, scale, zero_point): - x = torch.nn.functional.relu(x) - return x - -def relu_method_pattern(x, scale, zero_point): - x = x.dequantize() - x = x.relu() - x = torch.quantize_per_tensor(x, scale, zero_point, torch.quint8) - return x - -def relu_method_replacement(x, scale, zero_point): - x = x.relu() - return x - -def relu_inplace_method_pattern(x, scale, zero_point): - x = x.dequantize() - x = x.relu_() - x = torch.quantize_per_tensor(x, scale, zero_point, torch.quint8) - return x - -def relu_inplace_method_replacement(x, scale, zero_point): - x = x.relu_() - return x - -def relu6_inplace_pattern(x, scale, zero_point): - x = x.dequantize() - x = torch.nn.functional.relu6(x, inplace=True) - x = torch.quantize_per_tensor(x, scale, zero_point, torch.quint8) - return x - -def relu6_non_inplace_pattern(x, scale, zero_point): - x = x.dequantize() - x = torch.nn.functional.relu6(x, inplace=False) - x = torch.quantize_per_tensor(x, scale, zero_point, torch.quint8) - return x - -def relu6_replacement(x, scale, zero_point): - 
x = torch.nn.functional.relu6(x) - return x - - -def hardtanh_pattern(x, scale, zero_point): - x = x.dequantize() - x = torch.nn.functional.hardtanh(x, inplace=True) - x = torch.quantize_per_tensor(x, scale, zero_point, torch.quint8) - return x - -def hardtanh_non_inplace_pattern(x, scale, zero_point): - x = x.dequantize() - x = torch.nn.functional.hardtanh(x, inplace=False) - x = torch.quantize_per_tensor(x, scale, zero_point, torch.quint8) - return x - -def hardtanh_replacement(x, scale, zero_point): - x = torch.nn.functional.hardtanh(x) - return x - -def hardtanh_inplace_pattern(x, scale, zero_point): - x = x.dequantize() - x = torch.nn.functional.hardtanh_(x) - x = torch.quantize_per_tensor(x, scale, zero_point, torch.quint8) - return x - -def hardtanh_inplace_replacement(x, scale, zero_point): - x = torch.nn.functional.hardtanh_(x) - return x - -def min_pattern(x, scale, zero_point): - x = x.dequantize() - x = torch.min(x) - x = torch.quantize_per_tensor(x, scale, zero_point, torch.quint8) - return x - -def min_replacement(x, scale, zero_point): - x = torch.min(x) - return x - -def max_pattern(x, scale, zero_point): - x = x.dequantize() - x = torch.max(x) - x = torch.quantize_per_tensor(x, scale, zero_point, torch.quint8) - return x - -def max_replacement(x, scale, zero_point): - x = torch.max(x) - return x - -def mean_pattern(x, scale, zero_point): - x = x.dequantize() - x = torch.mean(x) - x = torch.quantize_per_tensor(x, scale, zero_point, torch.quint8) - return x - -def mean_replacement(x, scale, zero_point): - x = torch.mean(x) - return x - -def mean_method_pattern(x, scale, zero_point): - x = x.dequantize() - x = x.mean() - x = torch.quantize_per_tensor(x, scale, zero_point, torch.quint8) - return x - -def mean_method_replacement(x, scale, zero_point): - x = x.mean() - return x - -def flatten_pattern(x, scale, zero_point): - x = x.dequantize() - x = torch.flatten(x) - x = torch.quantize_per_tensor(x, scale, zero_point, torch.quint8) - return x - -def flatten_replacement(x, scale, zero_point): - x = torch.flatten(x) - return x - -def _get_all_patterns_and_replacements(): - return [ - (relu_inplace_pattern, relu_replacement), - (relu_non_inplace_pattern, relu_replacement), - (relu_method_pattern, relu_method_replacement), - (relu_inplace_method_pattern, relu_inplace_method_replacement), - (relu6_inplace_pattern, relu6_replacement), - (relu6_non_inplace_pattern, relu6_replacement), - (hardtanh_pattern, hardtanh_replacement), - (hardtanh_non_inplace_pattern, hardtanh_replacement), - (hardtanh_inplace_pattern, hardtanh_inplace_replacement), - (mean_pattern, mean_replacement), - (mean_method_pattern, mean_method_replacement), - ] - - -def get_fbgemm_patterns_and_replacements(): - return _get_all_patterns_and_replacements() - -def get_qnnpack_patterns_and_replacements(): - return _get_all_patterns_and_replacements() diff --git a/torch/ao/quantization/fx/subgraph_rewriter_FORKED_DO_NOT_USE.py b/torch/ao/quantization/fx/subgraph_rewriter_FORKED_DO_NOT_USE.py deleted file mode 100644 index a64b537173a9..000000000000 --- a/torch/ao/quantization/fx/subgraph_rewriter_FORKED_DO_NOT_USE.py +++ /dev/null @@ -1,445 +0,0 @@ -from torch.fx.graph_module import GraphModule -from torch.fx.graph import Graph -from torch.fx.node import Node -from torch.fx._symbolic_trace import symbolic_trace -from torch.fx._compatibility import compatibility - -import copy -from typing import Callable, Dict, List, NamedTuple, Optional, Set -import torch - -@compatibility(is_backward_compatible=True) -class 
Match(NamedTuple): - # Node from which the match was found - anchor: Node - # Maps nodes in the pattern subgraph to nodes in the larger graph - nodes_map: Dict[Node, Node] - -class _SubgraphMatcher: - def __init__(self, pattern: Graph) -> None: - self.pattern = pattern - if len(pattern.nodes) == 0: - raise ValueError("_SubgraphMatcher cannot be initialized with an " - "empty pattern") - # `self.pattern_anchor` is the output Node in `pattern` - self.pattern_anchor = next(iter(reversed(pattern.nodes))) - # Ensure that there is only a single output value in the pattern - # since we don't support multiple outputs - assert len(self.pattern_anchor.all_input_nodes) == 1, \ - "Pattern matching on multiple outputs is not supported" - # Maps nodes in the pattern subgraph to nodes in the larger graph - self.nodes_map: Dict[Node, Node] = {} - - def matches_subgraph_from_anchor(self, anchor: Node) -> bool: - """ - Checks if the whole pattern can be matched starting from - ``anchor`` in the larger graph. - - Pattern matching is done by recursively comparing the pattern - node's use-def relationships against the graph node's. - """ - self.nodes_map = {} - return self._match_nodes(self.pattern_anchor, anchor) - - # Compare the pattern node `pn` against the graph node `gn` - def _match_nodes(self, pn: Node, gn: Node) -> bool: - - # Check if we've already matched these nodes in the current - # traversal - if pn in self.nodes_map: - return self.nodes_map[pn] == gn - - def attributes_are_equal(pn: Node, gn: Node) -> bool: - # Use placeholder and output nodes as wildcards. The - # only exception is that an output node can't match - # a placeholder - if (pn.op == "placeholder" - or (pn.op == "output" and gn.op != "placeholder")): - return True - return pn.op == gn.op and pn.target == gn.target - - # Terminate early if the node attributes are not equal - if not attributes_are_equal(pn, gn): - return False - - # Optimistically mark `pn` as a match for `gn` - self.nodes_map[pn] = gn - - # Traverse the use-def relationships to ensure that `pn` is a true - # match for `gn` - if pn.op == "placeholder": - return True - if (pn.op != "output" - and len(pn.all_input_nodes) != len(gn.all_input_nodes)): - return False - if pn.op == "output": - match_found = any(self._match_nodes(pn.all_input_nodes[0], gn_) - for gn_ in gn.all_input_nodes) - else: - match_found = (len(pn.all_input_nodes) == len(gn.all_input_nodes) - and all(self._match_nodes(pn_, gn_) for pn_, gn_ - in zip(pn.all_input_nodes, gn.all_input_nodes))) - if not match_found: - self.nodes_map.pop(pn) - return False - - return True - - -def _replace_submodules(gm: GraphModule, replacement: torch.nn.Module) -> None: - gm.delete_all_unused_submodules() - - if isinstance(replacement, GraphModule): - replacement.graph.lint() - - def try_get_submodule(mod: torch.nn.Module, target: str) -> Optional[torch.nn.Module]: - try: - mod_match = mod.get_submodule(target) - return mod_match - except AttributeError: - return None - - for node in gm.graph.nodes: - if node.op == "call_module" or node.op == "get_attr": - - gm_submod = try_get_submodule(gm, node.target) - - replacement_submod = try_get_submodule(replacement, node.target) - - # CASE 1: This target already exists as a submodule in our - # result GraphModule. Whether or not it exists in - # `replacement`, the existing submodule takes precedence. - if gm_submod is not None: - continue - - # CASE 2: The target exists as a submodule in `replacement` - # only, so we need to copy it over. 
- elif replacement_submod is not None: - new_submod = copy.deepcopy(getattr(replacement, node.target)) - gm.add_submodule(node.target, new_submod) - - # CASE 3: The target doesn't exist as a submodule in `gm` - # or `replacement` - else: - raise RuntimeError("Attempted to create a \"", node.op, - "\" node during subgraph rewriting " - f"with target {node.target}, but " - "the referenced submodule does not " - "exist in either the original " - "GraphModule `gm` or the replacement" - " GraphModule `replacement`") - - gm.graph.lint() - -@compatibility(is_backward_compatible=True) -def replace_pattern(gm: GraphModule, pattern: Callable, replacement: Callable) -> List[Match]: - """ - Matches all possible non-overlapping sets of operators and their - data dependencies (``pattern``) in the Graph of a GraphModule - (``gm``), then replaces each of these matched subgraphs with another - subgraph (``replacement``). - - Args: - ``gm``: The GraphModule that wraps the Graph to operate on - ``pattern``: The subgraph to match in ``gm`` for replacement - ``replacement``: The subgraph to replace ``pattern`` with - - Returns: - List[Match]: A list of ``Match`` objects representing the places - in the original graph that ``pattern`` was matched to. The list - is empty if there are no matches. ``Match`` is defined as: - - .. code-block:: python - - class Match(NamedTuple): - # Node from which the match was found - anchor: Node - # Maps nodes in the pattern subgraph to nodes in the larger graph - nodes_map: Dict[Node, Node] - - Examples: - - .. code-block:: python - - import torch - from torch.fx import symbolic_trace, subgraph_rewriter - - class M(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, w1, w2): - m1 = torch.cat([w1, w2]).sum() - m2 = torch.cat([w1, w2]).sum() - return x + torch.max(m1) + torch.max(m2) - - def pattern(w1, w2): - return torch.cat([w1, w2]).sum() - - def replacement(w1, w2): - return torch.stack([w1, w2]) - - traced_module = symbolic_trace(M()) - - subgraph_rewriter.replace_pattern(traced_module, pattern, replacement) - - The above code will first match ``pattern`` in the ``forward`` - method of ``traced_module``. Pattern-matching is done based on - use-def relationships, not node names. For example, if you had - ``p = torch.cat([a, b])`` in ``pattern``, you could match - ``m = torch.cat([a, b])`` in the original ``forward`` function, - despite the variable names being different (``p`` vs ``m``). - - The ``return`` statement in ``pattern`` is matched based on its - value only; it may or may not match to the ``return`` statement in - the larger graph. In other words, the pattern doesn't have to extend - to the end of the larger graph. - - When the pattern is matched, it will be removed from the larger - function and replaced by ``replacement``. If there are multiple - matches for ``pattern`` in the larger function, each non-overlapping - match will be replaced. In the case of a match overlap, the first - found match in the set of overlapping matches will be replaced. - ("First" here being defined as the first in a topological ordering - of the Nodes' use-def relationships. In most cases, the first Node - is the parameter that appears directly after ``self``, while the - last Node is whatever the function returns.) - - One important thing to note is that the parameters of the - ``pattern`` Callable must be used in the Callable itself, - and the parameters of the ``replacement`` Callable must match - the pattern. 
The first rule is why, in the above code block, the - ``forward`` function has parameters ``x, w1, w2``, but the - ``pattern`` function only has parameters ``w1, w2``. ``pattern`` - doesn't use ``x``, so it shouldn't specify ``x`` as a parameter. - As an example of the second rule, consider replacing - - .. code-block:: python - - def pattern(x, y): - return torch.neg(x) + torch.relu(y) - - with - - .. code-block:: python - - def replacement(x, y): - return torch.relu(x) - - In this case, ``replacement`` needs the same number of parameters - as ``pattern`` (both ``x`` and ``y``), even though the parameter - ``y`` isn't used in ``replacement``. - - After calling ``subgraph_rewriter.replace_pattern``, the generated - Python code looks like this: - - .. code-block:: python - - def forward(self, x, w1, w2): - stack_1 = torch.stack([w1, w2]) - sum_1 = stack_1.sum() - stack_2 = torch.stack([w1, w2]) - sum_2 = stack_2.sum() - max_1 = torch.max(sum_1) - add_1 = x + max_1 - max_2 = torch.max(sum_2) - add_2 = add_1 + max_2 - return add_2 - """ - # Get the graphs for `gm`, `pattern`, `replacement` - original_graph = gm.graph - pattern_graph = symbolic_trace(pattern).graph - replacement_graph = symbolic_trace(replacement).graph - - # Find all possible pattern matches in original_graph. Note that - # pattern matches may overlap with each other. - matcher = _SubgraphMatcher(pattern_graph) - matches: List[Match] = [] - - # Consider each node as an "anchor" (deepest matching graph node) - for anchor in original_graph.nodes: - - if matcher.matches_subgraph_from_anchor(anchor): - - def pattern_is_contained(nodes_map: Dict[Node, Node]) -> bool: - # `lookup` represents all the nodes in `original_graph` - # that are part of `pattern` - lookup: Dict[Node, Node] = {v: k for k, v in nodes_map.items()} - for n in lookup.keys(): - - # Nodes that can "leak"... 
- - # Placeholders (by definition) - if n.op == "placeholder": - continue - # Pattern output (acts as a container) - if lookup[n].op == "output": - continue - # Result contained by pattern output (what we'll - # hook in to the new Graph, thus what we'll - # potentially use in other areas of the Graph as - # an input Node) - if (len(lookup[n].users) == 1 - and list(lookup[n].users.keys())[0].op == "output"): - continue - - for user in n.users: - # If this node has users that were not in - # `lookup`, then it must leak out of the - # pattern subgraph - if user not in lookup: - return False - return True - - # It's not a match if the pattern leaks out into the rest - # of the graph - if pattern_is_contained(matcher.nodes_map): - # Shallow copy nodes_map - matches.append(Match(anchor=anchor, - nodes_map=copy.copy({ - key: value - for key, value in matcher.nodes_map.items() - }))) - - # The set of all nodes in `original_graph` that we've seen thus far - # as part of a pattern match - replaced_nodes: Set[Node] = set() - # As we progressively replace nodes, we'll need to keep track of how the match results should change - match_changed_node: Dict[Node, Node] = dict() - - # Return True if one of the nodes in the current match has already - # been used as part of another match - def overlaps_with_prev_match(match: Match) -> bool: - for pn, gn in match.nodes_map.items(): - if pn.op in ["placeholder", "output"]: - continue - if gn in replaced_nodes and gn.op != "placeholder": - return True - return False - - for match in matches: - # Skip overlapping matches - if overlaps_with_prev_match(match): - continue - - # Map replacement graph nodes to their copy in `original_graph` - val_map: Dict[Node, Node] = {} - - pattern_placeholders = [n for n in pattern_graph.nodes - if n.op == "placeholder"] - assert len(pattern_placeholders) > 0 - replacement_placeholders = [n for n in replacement_graph.nodes - if n.op == "placeholder"] - assert len(pattern_placeholders) == len(replacement_placeholders) - placeholder_map = {r: p for r, p - in zip(replacement_placeholders, pattern_placeholders)} - - # node from `original_graph` that matched with the output node - # in `pattern` - subgraph_output: Node = match.anchor - - def mark_node_as_replaced(n: Node) -> None: - if n not in match.nodes_map.values(): - return - for n_ in n.all_input_nodes: - mark_node_as_replaced(n_) - replaced_nodes.add(n) - - for input_node in subgraph_output.all_input_nodes: - mark_node_as_replaced(input_node) - - # Initialize `val_map` with mappings from placeholder nodes in - # `replacement` to their corresponding node in `original_graph` - for replacement_node in replacement_placeholders: - # Get the `original_graph` placeholder node - # corresponding to the current `replacement_node` - pattern_node = placeholder_map[replacement_node] - original_graph_node = match_changed_node.get(match.nodes_map[pattern_node], match.nodes_map[pattern_node]) - - # Populate `val_map` - val_map[replacement_node] = original_graph_node - - # Copy the stack trace from the original graph to the replacement graph. - # Currently this is using a naive strategy: - # 1. find the first node with non-null stack trace in the original graph - # 2. 
if found, copy this stack trace to every node in the replacement graph - first_stack_trace = None - for pn, gn in match.nodes_map.items(): - if gn.stack_trace is not None: - first_stack_trace = gn.stack_trace - break - if first_stack_trace is not None: - for node in replacement_graph.nodes: - node.stack_trace = first_stack_trace - - # Copy the replacement graph over - with original_graph.inserting_before(subgraph_output): - copied_output = original_graph.graph_copy(replacement_graph, - val_map) - - # Clear out stack traces to prevent interference with next match - for node in replacement_graph.nodes: - node.stack_trace = None - - # Hook the output Node of the replacement subgraph in to the - # original Graph at the correct location - - # CASE 1: We need to hook the replacement subgraph in somewhere - # in the middle of the graph. We replace the Node in the - # original graph that corresponds to the end of the pattern - # subgraph - if subgraph_output.op != "output": - pattern_outputs = [n for n in pattern_graph.nodes - if n.op == "output"] - assert len(pattern_outputs) > 0 - replacement_outputs = [n for n in replacement_graph.nodes - if n.op == "output"] - assert len(replacement_outputs) == len(pattern_outputs) - outputs_map = {p: r for r, p - in zip(replacement_outputs, pattern_outputs)} - - for pn, gn in match.nodes_map.items(): - if gn.op == "placeholder": - continue - - # Search for the node corresponding to the output of the pattern - if pn.op != "output": - continue - assert subgraph_output == gn - - # Update all anchor inputs to the new nodes - rn = outputs_map[pn] - for pn_input, rn_input in zip(pn.all_input_nodes, rn.all_input_nodes): - gn_input = match.nodes_map[pn_input] - rn_input_in_original_graph = val_map[rn_input] - gn_input.replace_all_uses_with(rn_input_in_original_graph) - # We store the updated node point in case other nodes want to use it - match_changed_node[gn_input] = rn_input_in_original_graph - - assert subgraph_output.op != "output" - # CASE 2: The pattern subgraph match extends to the end of the - # original graph, so we need to change the current graph's - # output Node to reflect the insertion of the replacement graph. 
- # We'll keep the current output Node, but update its args and - # `_input_nodes` as necessary - else: - subgraph_output.args = ((copied_output,)) - if isinstance(copied_output, Node): - subgraph_output._input_nodes = {copied_output: None} - - assert isinstance(copied_output, Node) - # Erase the `pattern` nodes - for node in reversed(original_graph.nodes): - if len(node.users) == 0 and node.op != "output": - original_graph.erase_node(node) - - # Update the passed-in GraphModule to reflect the new state of - # `original_graph` - gm.recompile() - - # If `replacement` was an nn.Module, we'll need to make sure that - # all the submodules have been copied over correctly - if isinstance(replacement, torch.nn.Module): - _replace_submodules(gm, replacement) - - return matches diff --git a/torch/ao/quantization/fx/utils.py b/torch/ao/quantization/fx/utils.py index 83b0caf5e531..70b852395ca9 100644 --- a/torch/ao/quantization/fx/utils.py +++ b/torch/ao/quantization/fx/utils.py @@ -12,7 +12,9 @@ ) from typing import Callable, Optional, List, Dict, Any, Set, Tuple, Union, Type +from collections import namedtuple import operator +import warnings # A dictionary for querying the weight index for a given op WEIGHT_INDEX_DICT = { @@ -111,12 +113,15 @@ def get_per_tensor_qparams(activation_post_process): dtype = activation_post_process.dtype return scale, zero_point, dtype -def get_quantize_node_info(activation_post_process: Callable) -> Tuple[str, Union[Callable, str], Dict[str, Any]]: +def get_quantize_node_info(activation_post_process: Callable) -> Optional[Tuple[str, Union[Callable, str], Dict[str, Any]]]: ''' Given an activation_post_process module, return node_type(e.g. call_function), quantize op(e.g. quantize_per_tensor) and a dictionary of extracted qparams from the module ''' dtype = activation_post_process.dtype # type: ignore[attr-defined] + compute_dtype = None + if hasattr(activation_post_process, "compute_dtype"): + compute_dtype = activation_post_process.compute_dtype # type: ignore[attr-defined] quantize_op : Optional[Union[Callable, str]] = None if dtype in [torch.quint8, torch.qint8]: node_type = "call_function" @@ -134,9 +139,17 @@ def get_quantize_node_info(activation_post_process: Callable) -> Tuple[str, Unio node_type = "call_method" quantize_op = "to" qparams = {"_dtype_": dtype} + elif dtype == torch.float32 and compute_dtype in [torch.quint8, torch.qint8, torch.float16]: + # dynamic quantization + node_type = "call_function" + quantize_op = torch.quantize_per_tensor_dynamic + # TODO: get reduce range from observer + # reduce_range = activation_post_process.reduce_range + reduce_range = torch.backends.quantized.engine == "fbgemm" + qparams = {"_dtype_": compute_dtype, "_reduce_range_": reduce_range} else: - raise Exception("Unsupported dtype in get_quantize_node_info:" + str(dtype)) - assert quantize_op is not None + warnings.warn(f"Unsupported activation_post_process in get_quantize_node_info: {activation_post_process}") + return None return node_type, quantize_op, qparams def quantize_node( @@ -146,7 +159,8 @@ def quantize_node( modules: Dict[str, torch.nn.Module], quantized_graph: Graph, node_name_to_scope: Dict[str, Tuple[str, type]], - is_input: bool) -> Node: + is_input: bool, + output_prefix: str = "_output") -> Node: ''' Add quantization nodes (eg. quantize_per_tensor/per_channel) for given node to graph with the qparams calculated from activation_post_process (obs_module). The observer node (obs_node) is used to find the FQN of the user of act_post_process. 
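The `get_quantize_node_info` hunks above now return `None` for unsupported observers (callers warn and leave the node unquantized) and add a dynamic-quantization branch: a float32 activation observer that carries a `compute_dtype` maps to `torch.quantize_per_tensor_dynamic`. A minimal sketch of that dispatch, with an illustrative function name and qparam extraction omitted (the real helper also pulls scale/zero_point, per-tensor or per-channel, out of the observer):

```python
import torch

def sketch_quantize_node_info(observer):
    """Hypothetical re-statement of the dispatch in get_quantize_node_info."""
    dtype = observer.dtype
    compute_dtype = getattr(observer, "compute_dtype", None)
    if dtype in (torch.quint8, torch.qint8):
        # Static quantization; the real helper also extracts scale/zero_point
        # (and switches to quantize_per_channel for per-channel qschemes).
        return "call_function", torch.quantize_per_tensor, {"_dtype_": dtype}
    if dtype == torch.float16:
        # fp16 "quantization" is just a cast, emitted as Tensor.to(torch.float16).
        return "call_method", "to", {"_dtype_": dtype}
    if dtype == torch.float32 and compute_dtype in (torch.quint8, torch.qint8, torch.float16):
        # New dynamic-quantization branch: qparams are computed at runtime.
        reduce_range = torch.backends.quantized.engine == "fbgemm"
        return ("call_function", torch.quantize_per_tensor_dynamic,
                {"_dtype_": compute_dtype, "_reduce_range_": reduce_range})
    # Unsupported observer: the patched helper warns and returns None so the
    # caller can leave the node unquantized.
    return None

# For example, the dynamic activation observer defined in observer.py
# (PlaceholderObserver.with_args(dtype=torch.float, compute_dtype=torch.quint8))
# would be routed to torch.quantize_per_tensor_dynamic by this dispatch.
```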
@@ -173,7 +187,7 @@ def quantize_node( else: # if the quantize function is at the output of the op, we use the observer input node to get the path first_linear_use_or_first_use = in_node - prefix = "_output" + prefix = output_prefix if first_linear_use_or_first_use and first_linear_use_or_first_use.name in node_name_to_scope: module_path, _ = node_name_to_scope[first_linear_use_or_first_use.name] @@ -184,7 +198,10 @@ def quantize_node( module_path = "" root_module = modules[''] graph = quantized_graph - node_type, quantize_op, qparams = get_quantize_node_info(obs_module) + maybe_quantize_node_info = get_quantize_node_info(obs_module) + assert maybe_quantize_node_info is not None, \ + f"Expecting quantize node info not to be None, observer: {obs_module}" + node_type, quantize_op, qparams = maybe_quantize_node_info inputs = [in_node] for key, value in qparams.items(): @@ -455,6 +472,74 @@ def all_node_args_have_no_tensors(node: Node, modules: Dict[str, torch.nn.Module cache[node] = result return result +def all_node_args_except_first(node: Node) -> List[int]: + """ + Returns all node arg indices after first + """ + return list(range(1, len(node.args))) + +def return_arg_list(arg_indices: List[int]) -> Callable[[Node], List[int]]: + """ + Constructs a function that takes a node as arg and returns the arg_indices + that are valid for node.args + """ + def arg_indices_func(node: Node) -> List[int]: + return [i for i in arg_indices if i < len(node.args)] + return arg_indices_func + +NodeInfo = namedtuple("NodeInfo", "op target") + +# this dict identifies which indices of a node are non tensors +# so that they can be propagated correctly since inserting observers +# for them would cause errors + +NON_OBSERVABLE_ARG_DICT: Dict[NodeInfo, Dict[Union[type, torch.dtype], Callable[[Node], List[int]]]] = { + NodeInfo("call_method", "masked_fill") : { + torch.bool: return_arg_list([1]), + float: return_arg_list([2]) + }, + NodeInfo("call_method", "permute") : { + int: all_node_args_except_first + }, + NodeInfo("call_method", "repeat") : { + int: all_node_args_except_first + }, + NodeInfo("call_method", "reshape") : { + int: all_node_args_except_first + }, + NodeInfo("call_method", "size") : { + int: return_arg_list([1]) + }, + NodeInfo("call_method", "transpose") : { + int: all_node_args_except_first + }, + NodeInfo("call_method", torch.transpose) : { + int: all_node_args_except_first + }, + NodeInfo("call_method", "unsqueeze") : { + int: return_arg_list([1]) + }, + NodeInfo("call_method", "unsqueeze_") : { + int: return_arg_list([1]) + }, + NodeInfo("call_method", torch.unsqueeze) : { + int: return_arg_list([1]) + }, + NodeInfo("call_method", "view") : { + int: all_node_args_except_first + }, +} + +EMPTY_ARG_DICT: Dict[Union[type, torch.dtype], Callable[[Node], List[int]]] = {} + +def get_non_observable_arg_indexes_and_types(node: Node) -> Dict[Union[type, torch.dtype], Callable[[Node], List[int]]]: + """ + Returns a dict with of non float tensor types as keys and values which correspond to a + function to retrieve the list (which takes the node as an argument) + """ + info = NodeInfo(node.op, node.target) + + return NON_OBSERVABLE_ARG_DICT.get(info, EMPTY_ARG_DICT) def node_return_type_is_int(node: Node) -> bool: """ @@ -463,13 +548,6 @@ def node_return_type_is_int(node: Node) -> bool: """ return node.op == 'call_method' and node.target == 'size' -def node_bool_tensor_arg_indexes(node: Node) -> List[int]: - """ - Returns indexes of boolean Tensor args - """ - if node.op == "call_method" and node.target 
== "masked_fill": - return [1] - return [] def is_get_tensor_info_node(node: Node) -> bool: """ Returns True if this node is a node that takes a Tensor as input and output some diff --git a/torch/ao/quantization/observer.py b/torch/ao/quantization/observer.py index 4263f4e40b68..7e86a39f1b17 100644 --- a/torch/ao/quantization/observer.py +++ b/torch/ao/quantization/observer.py @@ -8,7 +8,7 @@ from abc import ABCMeta, abstractmethod from collections import OrderedDict from functools import partial -from typing import Any, List, Tuple, Optional, Dict, Union +from typing import Any, List, Tuple, Optional, Dict import torch import torch.nn as nn @@ -114,12 +114,9 @@ def calculate_qparams(self, **kwargs): with_callable_args = classmethod(_with_callable_args) -class _ObserverBase(ObserverBase): - r"""Internal common base for all qint/quint8 observers. - - This base is for commonly used parameters used internally. - Users should use `~torch.ao.quantization.observer.ObserverBase` as a base class - for custom observers. +class UniformQuantizationObserverBase(ObserverBase): + r"""Common base for all observers using uniform quantization to calculate + scale and zero_point. Args: dtype: Quantized data type. @@ -128,6 +125,7 @@ class _ObserverBase(ObserverBase): This is sometimes required to avoid instruction overflow. quant_min: Minimum quantization value. If unspecified, it will follow the 8-bit setup. quant_max: Maximum quantization value. If unspecified, it will follow the 8-bit setup. + eps: Epsilon value for float32, Defaults to `torch.finfo(torch.float32).eps`. .. warning:: @@ -169,9 +167,10 @@ def __init__( quant_min=None, quant_max=None, factory_kwargs=None, + eps=torch.finfo(torch.float32).eps, ) -> None: factory_kwargs = torch.nn.factory_kwargs(factory_kwargs) - super(_ObserverBase, self).__init__(dtype=dtype) + super().__init__(dtype=dtype) self.qscheme = qscheme if reduce_range: warnings.warn( @@ -180,7 +179,7 @@ def __init__( ) self.reduce_range = reduce_range self.register_buffer( - "eps", torch.tensor([torch.finfo(torch.float32).eps], **factory_kwargs) + "eps", torch.tensor([eps], **factory_kwargs) ) assert self.qscheme in ( torch.per_tensor_affine, @@ -195,6 +194,7 @@ def __init__( torch.qint8, torch.quint8, torch.quint4x2, + torch.qint32, ), "Default Observer only works for qint8, quint8 and quint4x2 data type" self.has_customized_qrange = (quant_min is not None) and (quant_max is not None) if self.has_customized_qrange: @@ -331,7 +331,13 @@ def reset_min_max_vals(self): raise NotImplementedError("Cannot reset min/max values in the given observer.") -class MinMaxObserver(_ObserverBase): +# Originally, this class was called `_ObserverBase`. Keeping the old name around +# for backwards compatibility. +# TODO(after v1.13): delete this +_ObserverBase = UniformQuantizationObserverBase + + +class MinMaxObserver(UniformQuantizationObserverBase): r"""Observer module for computing the quantization parameters based on the running min and max values. @@ -345,8 +351,7 @@ class MinMaxObserver(_ObserverBase): reduce_range: Reduces the range of the quantized data type by 1 bit quant_min: Minimum quantization value. If unspecified, it will follow the 8-bit setup. quant_max: Maximum quantization value. If unspecified, it will follow the 8-bit setup. - memoryless: Boolean that controls whether observer removes old data when a new input is seen. - This is most useful for simulating dynamic quantization, especially during QAT. 
+ eps: Epsilon value for float32, Defaults to `torch.finfo(torch.float32).eps`. Given running min/max as :math:`x_\text{min}` and :math:`x_\text{max}`, scale :math:`s` and zero point :math:`z` are computed as: @@ -405,7 +410,7 @@ def __init__( quant_min=None, quant_max=None, factory_kwargs=None, - memoryless=False, + eps=torch.finfo(torch.float32).eps, ) -> None: # For x86 quantized kernels, we need to ensure that the vpmaddubsw @@ -421,8 +426,8 @@ def __init__( quant_min=quant_min, quant_max=quant_max, factory_kwargs=factory_kwargs, + eps=eps, ) - self.memoryless = memoryless factory_kwargs = torch.nn.factory_kwargs(factory_kwargs) self.register_buffer("min_val", torch.tensor(float("inf"), **factory_kwargs)) self.register_buffer("max_val", torch.tensor(float("-inf"), **factory_kwargs)) @@ -440,8 +445,6 @@ def forward(self, x_orig): r"""Records the running minimum and maximum of ``x``.""" if x_orig.numel() == 0: return x_orig - elif self.memoryless: - self.reset_min_max_vals() x = x_orig.detach() # avoid keeping autograd tape x = x.to(self.min_val.dtype) min_val_cur, max_val_cur = torch.aminmax(x) @@ -482,6 +485,7 @@ class MovingAverageMinMaxObserver(MinMaxObserver): reduce_range: Reduces the range of the quantized data type by 1 bit quant_min: Minimum quantization value. If unspecified, it will follow the 8-bit setup. quant_max: Maximum quantization value. If unspecified, it will follow the 8-bit setup. + eps: Epsilon value for float32, Defaults to `torch.finfo(torch.float32).eps`. The moving average min/max is computed as follows @@ -518,6 +522,7 @@ def __init__( reduce_range=False, quant_min=None, quant_max=None, + eps=torch.finfo(torch.float32).eps, **kwargs ) -> None: self.averaging_constant = averaging_constant @@ -527,6 +532,7 @@ def __init__( reduce_range=reduce_range, quant_min=quant_min, quant_max=quant_max, + eps=eps, **kwargs ) @@ -548,7 +554,7 @@ def forward(self, x_orig): return x_orig -class PerChannelMinMaxObserver(_ObserverBase): +class PerChannelMinMaxObserver(UniformQuantizationObserverBase): r"""Observer module for computing the quantization parameters based on the running per channel min and max values. @@ -564,8 +570,7 @@ class PerChannelMinMaxObserver(_ObserverBase): reduce_range: Reduces the range of the quantized data type by 1 bit quant_min: Minimum quantization value. If unspecified, it will follow the 8-bit setup. quant_max: Maximum quantization value. If unspecified, it will follow the 8-bit setup. - memoryless: Boolean that controls whether observer removes old data when a new input is seen. - This is most useful for simulating dynamic quantization, especially during QAT. + eps: Epsilon value for float32, Defaults to `torch.finfo(torch.float32).eps`. 
The quantization parameters are computed the same way as in :class:`~torch.ao.quantization.observer.MinMaxObserver`, with the difference @@ -587,7 +592,7 @@ def __init__( quant_min=None, quant_max=None, factory_kwargs=None, - memoryless=False, + eps=torch.finfo(torch.float32).eps, ) -> None: super(PerChannelMinMaxObserver, self).__init__( dtype=dtype, @@ -596,8 +601,8 @@ def __init__( quant_min=quant_min, quant_max=quant_max, factory_kwargs=factory_kwargs, + eps=eps, ) - self.memoryless = memoryless factory_kwargs = torch.nn.factory_kwargs(factory_kwargs) self.ch_axis = ch_axis self.register_buffer("min_val", torch.tensor([], **factory_kwargs)) @@ -630,7 +635,7 @@ def _forward(self, x_orig): # are done in place and types need to match for comparisons y = y.to(self.min_val.dtype) y = torch.flatten(y, start_dim=1) - if min_val.numel() == 0 or max_val.numel() == 0 or self.memoryless: + if min_val.numel() == 0 or max_val.numel() == 0: min_val, max_val = torch.aminmax(y, dim=1) else: min_val_cur, max_val_cur = torch.aminmax(y, dim=1) @@ -651,7 +656,7 @@ def extra_repr(self): def _load_from_state_dict( self, - state_dict: Union[Dict[str, torch.Tensor], Dict[str, torch.Tensor]], + state_dict: Dict[str, Any], prefix: str, local_metadata: Dict[str, torch.Tensor], strict: bool, @@ -707,7 +712,7 @@ def _load_from_state_dict( def _load_from_state_dict_script( self, - state_dict: Union[Dict[str, torch.Tensor], Dict[str, torch.Tensor]], + state_dict: Dict[str, Any], prefix: str, local_metadata: Dict[str, torch.Tensor], strict: bool, @@ -750,6 +755,7 @@ class MovingAveragePerChannelMinMaxObserver(PerChannelMinMaxObserver): reduce_range: Reduces the range of the quantized data type by 1 bit quant_min: Minimum quantization value. If unspecified, it will follow the 8-bit setup. quant_max: Maximum quantization value. If unspecified, it will follow the 8-bit setup. + eps: Epsilon value for float32, Defaults to `torch.finfo(torch.float32).eps`. The quantization parameters are computed the same way as in :class:`~torch.ao.quantization.observer.MovingAverageMinMaxObserver`, with the @@ -769,6 +775,7 @@ def __init__( reduce_range=False, quant_min=None, quant_max=None, + eps=torch.finfo(torch.float32).eps, **kwargs ) -> None: super(MovingAveragePerChannelMinMaxObserver, self).__init__( @@ -778,6 +785,7 @@ def __init__( reduce_range=reduce_range, quant_min=quant_min, quant_max=quant_max, + eps=eps, **kwargs ) self.averaging_constant = averaging_constant @@ -809,7 +817,7 @@ def forward(self, x_orig): return x_orig -class HistogramObserver(_ObserverBase): +class HistogramObserver(UniformQuantizationObserverBase): r""" The module records the running histogram of tensor values along with min/max values. ``calculate_qparams`` will calculate scale and zero_point. @@ -821,6 +829,7 @@ class HistogramObserver(_ObserverBase): dtype: Quantized data type qscheme: Quantization scheme to be used reduce_range: Reduces the range of the quantized data type by 1 bit + eps: Epsilon value for float32, Defaults to `torch.finfo(torch.float32).eps`. The scale and zero point are computed as follows: @@ -847,6 +856,7 @@ def __init__( quant_min=None, quant_max=None, factory_kwargs=None, + eps=torch.finfo(torch.float32).eps, ) -> None: # bins: The number of bins used for histogram calculation. 
super(HistogramObserver, self).__init__( @@ -856,6 +866,7 @@ def __init__( quant_min=quant_min, quant_max=quant_max, factory_kwargs=factory_kwargs, + eps=eps, ) factory_kwargs = torch.nn.factory_kwargs(factory_kwargs) self.bins = bins @@ -1258,7 +1269,7 @@ def calculate_qparams(self): ) -class RecordingObserver(_ObserverBase): +class RecordingObserver(ObserverBase): r""" The module is mainly for debug and records the tensor values during runtime. @@ -1269,8 +1280,8 @@ class RecordingObserver(_ObserverBase): """ __annotations__ = {"tensor_val": List[Optional[torch.Tensor]]} - def __init__(self, **kwargs): - super(RecordingObserver, self).__init__(**kwargs) + def __init__(self, dtype=torch.quint8, **kwargs): + super(RecordingObserver, self).__init__(dtype=dtype, **kwargs) # type: ignore[call-arg] self.tensor_val = [] def forward(self, x): @@ -1434,6 +1445,13 @@ def load_observer_state_dict(mod, obs_dict): Default weight observer. """ +weight_observer_range_neg_127_to_127 = MinMaxObserver.with_args( + dtype=torch.qint8, qscheme=torch.per_tensor_symmetric, + quant_min=-127, quant_max=127, eps=2 ** -12) +""" +Symmetric weight observer with the 8-bit values restricted to [-127, +127], excluding -128. +""" + default_histogram_observer = HistogramObserver.with_args(quant_min=0, quant_max=127) """ Default histogram observer, usually used for PTQ. @@ -1447,6 +1465,13 @@ def load_observer_state_dict(mod, obs_dict): weight quantization is supported, such as `fbgemm`. """ +per_channel_weight_observer_range_neg_127_to_127 = MinMaxObserver.with_args( + dtype=torch.qint8, qscheme=torch.per_channel_symmetric, + quant_min=-127, quant_max=127, eps=2 ** -12) +""" +Per-channel, symmetric weight observer with the 8-bit values restricted to [-127, +127], excluding -128. +""" + default_dynamic_quant_observer = PlaceholderObserver.with_args( dtype=torch.float, compute_dtype=torch.quint8 ) @@ -1470,10 +1495,14 @@ def load_observer_state_dict(mod, obs_dict): # TODO(future PR): remove these defaults and enforce activation functions # to explicitly specify their output range -default_symmetric_fixed_qparams_observer = FixedQParamsObserver.with_args( +default_fixed_qparams_range_neg1to1_observer = FixedQParamsObserver.with_args( scale=2.0 / 256.0, zero_point=128, dtype=torch.quint8, quant_min=0, quant_max=255) -default_affine_fixed_qparams_observer = FixedQParamsObserver.with_args( +default_fixed_qparams_range_0to1_observer = FixedQParamsObserver.with_args( scale=1.0 / 256.0, zero_point=0, dtype=torch.quint8, quant_min=0, quant_max=255) +# TODO: the following 2 variables are kept for backwards compatibility; remove after a few releases +default_symmetric_fixed_qparams_observer = default_fixed_qparams_range_neg1to1_observer +default_affine_fixed_qparams_observer = default_fixed_qparams_range_0to1_observer + """ Default observers for fixed qparams operations. 
""" diff --git a/torch/ao/quantization/qconfig.py b/torch/ao/quantization/qconfig.py index bf8d185cfdb0..c093d71a6b00 100644 --- a/torch/ao/quantization/qconfig.py +++ b/torch/ao/quantization/qconfig.py @@ -16,6 +16,8 @@ default_fused_per_channel_wt_fake_quant, default_embedding_fake_quant, default_embedding_fake_quant_4bit, + fused_wt_fake_quant_range_neg_127_to_127, + fused_per_channel_wt_fake_quant_range_neg_127_to_127, ) from .observer import ( @@ -32,6 +34,8 @@ default_per_channel_weight_observer, default_placeholder_observer, default_weight_observer, + weight_observer_range_neg_127_to_127, + per_channel_weight_observer_range_neg_127_to_127, default_reuse_input_observer, ) import warnings @@ -113,7 +117,7 @@ def __new__(cls, activation=torch.nn.Identity, weight=torch.nn.Identity): Default dynamic qconfig. """ -float16_dynamic_qconfig = QConfig(activation=PlaceholderObserver.with_args(dtype=torch.float32), +float16_dynamic_qconfig = QConfig(activation=PlaceholderObserver.with_args(dtype=torch.float32, compute_dtype=torch.float16), weight=PlaceholderObserver.with_args(dtype=torch.float16)) """ Dynamic qconfig with weights quantized to `torch.float16`. @@ -179,28 +183,71 @@ def __new__(cls, activation=torch.nn.Identity, weight=torch.nn.Identity): Default qconfig for operators that reuse the observers from input Tensor, e.g. reshape """ -def get_default_qconfig(backend='fbgemm'): +def get_default_qconfig(backend='fbgemm', version=0): """ Returns the default PTQ qconfig for the specified backend. Args: - * `backend`: a string representing the target backend. Currently supports `fbgemm` - and `qnnpack`. + * `backend`: a string representing the target backend. Currently supports `fbgemm`, + `qnnpack` and `onednn`. Return: qconfig """ - - if backend == 'fbgemm': - qconfig = QConfig(activation=HistogramObserver.with_args(reduce_range=True), - weight=default_per_channel_weight_observer) - elif backend == 'qnnpack': - qconfig = QConfig(activation=HistogramObserver.with_args(reduce_range=False), - weight=default_weight_observer) + if version == 0: + if backend == 'fbgemm': + qconfig = QConfig(activation=HistogramObserver.with_args(reduce_range=True), + weight=default_per_channel_weight_observer) + elif backend == 'qnnpack': + qconfig = QConfig(activation=HistogramObserver.with_args(reduce_range=False), + weight=default_weight_observer) + elif backend == 'onednn': + qconfig = QConfig(activation=HistogramObserver.with_args(reduce_range=False), + weight=default_per_channel_weight_observer) + else: + qconfig = default_qconfig else: - qconfig = default_qconfig + raise AssertionError("Version number: " + str(version) + + " in get_default_qconfig is not supported. Version number must be 0") + return qconfig +""" +Default, symmetric PTQ qconfig for the specified backend. And a per_channel +variant of the same. + +Symmetric here applies to signed weights with zero point = 0, and additional +value restrictions. The activations are also signed 8-bit integers with this +qconfig. + + * Once this change is merged [as of 3/17/22], with backend or qengine = + 'qnnpack', some quantized operators with this symmetric qconfig may use + operators from xnnpack library. + + ** Support to use xnnpack ops with `qnnpack` backed for asymmetric + qconfig (returned by get_default_qconfig()) is not available yet. + + * This qconfig uses signed activations and weights. Weights have added + restrictions such as zero point is forced to be 0, making the weights + symmetric, hence the name. 
And the 8-bit quantized values are + restricting to to [-127, +127], excluding -128. + + * xnnpack has a requantization scale value restriction, 0x1p-32 <= + requantization_scale < 256.0 where, `requantization_scale = (input_scale + * kernel_scale) / (output_scale)`. Using this eps (w/ assumed max value + of 256) is to prevent requantization_scale to go below xnnpack lower + threshold. +""" +default_symmetric_qnnpack_qconfig = QConfig(activation=HistogramObserver.with_args(dtype=torch.qint8, + reduce_range=False, + eps=2 ** -12), + weight=weight_observer_range_neg_127_to_127) + +default_per_channel_symmetric_qnnpack_qconfig = QConfig(activation=HistogramObserver.with_args(dtype=torch.qint8, + reduce_range=False, + eps=2 ** -12), + weight=per_channel_weight_observer_range_neg_127_to_127) + default_embedding_qat_qconfig = QConfig(activation=NoopObserver.with_args(dtype=torch.float32), weight=default_embedding_fake_quant) @@ -212,15 +259,15 @@ def get_default_qat_qconfig(backend='fbgemm', version=1): Returns the default QAT qconfig for the specified backend. Args: - * `backend`: a string representing the target backend. Currently supports `fbgemm` - and `qnnpack`. + * `backend`: a string representing the target backend. Currently supports `fbgemm`, + `qnnpack` and `onednn`. * `version`: version, for backwards compatibility. Can be `None` or `1`. Return: qconfig """ # Histogram observer is too slow for quantization aware training - if version is None: + if version == 0: if backend == 'fbgemm': qconfig = QConfig(activation=FakeQuantize.with_args(observer=MovingAverageMinMaxObserver, quant_min=0, @@ -233,10 +280,15 @@ def get_default_qat_qconfig(backend='fbgemm', version=1): quant_max=255, reduce_range=False), weight=default_weight_fake_quant) + elif backend == 'onednn': + qconfig = QConfig(activation=FakeQuantize.with_args(observer=MovingAverageMinMaxObserver, + quant_min=0, + quant_max=255), + weight=default_per_channel_weight_fake_quant) else: qconfig = default_qat_qconfig - # Use the fused observer + fake_quant modules for doing QAT. - if version == 1: + # Use the fused observe + fake_quant modules for doing QAT. + elif version == 1: if backend == 'fbgemm': qconfig = QConfig(activation=FusedMovingAvgObsFakeQuantize.with_args(observer=MovingAverageMinMaxObserver, quant_min=0, @@ -249,23 +301,86 @@ def get_default_qat_qconfig(backend='fbgemm', version=1): quant_max=255, reduce_range=False), weight=default_fused_wt_fake_quant) + elif backend == 'onednn': + qconfig = QConfig(activation=FusedMovingAvgObsFakeQuantize.with_args(observer=MovingAverageMinMaxObserver, + quant_min=0, + quant_max=255), + weight=default_fused_per_channel_wt_fake_quant) else: qconfig = default_qat_qconfig_v2 + else: + raise AssertionError("Version number: " + str(version) + + "in get_default_qat_qconfig is not supported. Version number must be 0 or 1") + return qconfig -def get_default_qconfig_dict(backend='fbgemm', version=0): - qconfig = get_default_qconfig(backend) +""" +Default symmetric QAT qconfig for qnnpack. And its per channel weight variant. 
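The eps=2 ** -12 used by the qnnpack-symmetric qconfigs above can be checked against the xnnpack constraint quoted in the note; a small illustrative calculation, taking the stated 256.0 as the assumed upper bound on a single scale:

eps = 2 ** -12      # lower bound enforced on every observer scale by these qconfigs
max_scale = 256.0   # assumed maximum for output_scale, per the note above

# requantization_scale = (input_scale * kernel_scale) / output_scale,
# so the smallest value it can take under these bounds is:
worst_case = (eps * eps) / max_scale
assert worst_case >= 2 ** -32   # 0x1p-32, xnnpack's lower threshold
print(worst_case)               # 2.3283064365386963e-10, i.e. exactly 2 ** -32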
+""" +default_symmetric_qnnpack_qat_qconfig = QConfig( + activation=FusedMovingAvgObsFakeQuantize.with_args(observer=MovingAverageMinMaxObserver, + quant_min=-128, + quant_max=127, + dtype=torch.qint8, + reduce_range=False, + eps=2 ** -12), + weight=fused_wt_fake_quant_range_neg_127_to_127) + +default_per_channel_symmetric_qnnpack_qat_qconfig = QConfig( + activation=FusedMovingAvgObsFakeQuantize.with_args(observer=MovingAverageMinMaxObserver, + quant_min=-128, + quant_max=127, + dtype=torch.qint8, + reduce_range=False, + eps=2 ** -12), + weight=fused_per_channel_wt_fake_quant_range_neg_127_to_127) + +def _get_default_qconfig_dict_helper(qconfig, qconfig_transpose): return { "": qconfig, - "object_type": [("reshape", default_reuse_input_qconfig)] - } + "object_type": [("reshape", default_reuse_input_qconfig), + (torch.nn.Conv1d, qconfig), + (torch.nn.Conv2d, qconfig), + (torch.nn.Conv3d, qconfig), + (torch.nn.ConvTranspose1d, qconfig_transpose), + (torch.nn.ConvTranspose2d, qconfig_transpose), + (torch.nn.ConvTranspose3d, qconfig_transpose), + (torch.nn.Linear, qconfig), + (torch.nn.functional.conv1d, qconfig), + (torch.nn.functional.conv2d, qconfig), + (torch.nn.functional.conv3d, qconfig), + (torch.nn.functional.conv_transpose1d, qconfig_transpose), + (torch.nn.functional.conv_transpose2d, qconfig_transpose), + (torch.nn.functional.conv_transpose3d, qconfig_transpose), + (torch.nn.functional.linear, qconfig), + (torch.nn.ReLU, qconfig), + (torch.nn.functional.relu, qconfig), + (torch.relu, qconfig), + (torch.nn.BatchNorm1d, qconfig), + (torch.nn.BatchNorm2d, qconfig), + (torch.nn.BatchNorm3d, qconfig)]} + +def get_default_qconfig_dict(backend='fbgemm', version=0): + qconfig = get_default_qconfig(backend, version) + qconfig_transpose = qconfig + # default_per_channel_weight_observer is not currently compatible with fbgemm backend + # so we have to modify the weight observer to default_weight_observer or another + # per tensor supported observer. + # see https://github.com/pytorch/pytorch/issues/47535 + if backend == "fbgemm": + qconfig_transpose = QConfig(activation=qconfig.activation, weight=default_weight_observer) + return _get_default_qconfig_dict_helper(qconfig, qconfig_transpose) def get_default_qat_qconfig_dict(backend='fbgemm', version=1): - qconfig = get_default_qat_qconfig(backend, version=version) - return { - "": qconfig, - "object_type": [("reshape", default_reuse_input_qconfig)] - } + qconfig = get_default_qat_qconfig(backend, version) + qconfig_transpose = qconfig + # default_per_channel_weight_observer is not currently compatible with fbgemm backend + # so we have to modify the weight observer to default_weight_observer or another + # per tensor supported observer + # see https://github.com/pytorch/pytorch/issues/47535 + if backend == "fbgemm": + qconfig_transpose = QConfig(activation=qconfig.activation, weight=default_weight_fake_quant) + return _get_default_qconfig_dict_helper(qconfig, qconfig_transpose) def assert_valid_qconfig(qconfig: Optional[QConfig], mod: torch.nn.Module) -> None: @@ -369,9 +484,10 @@ def partial_equals(p1, p2): def activation_is_memoryless(qconfig: QConfig): """ Return whether the observer for activations defined in the given QConfig is memoryless. + This means a MovingAverage observer with averaging constant equal to 1. 
""" def _is_memoryless(observer): - return hasattr(observer, "memoryless") and observer.memoryless + return hasattr(observer, "averaging_constant") and observer.averaging_constant == 1 act = qconfig.activation() if isinstance(act, FakeQuantizeBase) and hasattr(act, "activation_post_process"): return _is_memoryless(act.activation_post_process) diff --git a/torch/ao/quantization/quantization_mappings.py b/torch/ao/quantization/quantization_mappings.py index 3f3ce8fff5df..ebaa693c7477 100644 --- a/torch/ao/quantization/quantization_mappings.py +++ b/torch/ao/quantization/quantization_mappings.py @@ -19,17 +19,29 @@ import torch.ao.nn as ao_nn from torch.ao.quantization.stubs import QuantStub, DeQuantStub from torch.ao.quantization.fake_quantize import ( - default_affine_fixed_qparams_fake_quant, - default_symmetric_fixed_qparams_fake_quant, + default_fixed_qparams_range_0to1_fake_quant, + default_fixed_qparams_range_neg1to1_fake_quant, ) from torch.ao.quantization.utils import get_combined_dict +from torch.nn.utils.parametrize import type_before_parametrizations # Default map for swapping float module to reference quantized modules DEFAULT_REFERENCE_STATIC_QUANT_MODULE_MAPPINGS : Dict[Callable, Any] = { + QuantStub: nnq.Quantize, + DeQuantStub: nnq.DeQuantize, nn.Linear: nnqr.Linear, nn.Conv1d: nnqr.Conv1d, nn.Conv2d: nnqr.Conv2d, nn.Conv3d: nnqr.Conv3d, + nn.ConvTranspose1d: nnqr.ConvTranspose1d, + nn.ConvTranspose2d: nnqr.ConvTranspose2d, + nn.ConvTranspose3d: nnqr.ConvTranspose3d, + nn.Embedding: nnqr.Embedding, + nn.EmbeddingBag: nnqr.EmbeddingBag, + nn.GRUCell: nnqr.GRUCell, + nn.LSTMCell: nnqr.LSTMCell, + nn.RNNCell: nnqr.RNNCell, + nn.LSTM: nnqr.LSTM, } # Default map for swapping float module to quantized ones @@ -77,6 +89,7 @@ nniqat.ConvReLU2d: nniq.ConvReLU2d, nniqat.ConvReLU3d: nniq.ConvReLU3d, nniqat.LinearReLU: nniq.LinearReLU, + nniqat.LinearBn1d: nnq.Linear, # QAT modules: nnqat.Linear: nnq.Linear, nnqat.Conv2d: nnq.Conv2d, @@ -99,6 +112,7 @@ nni.ConvReLU2d: nniqat.ConvReLU2d, nni.ConvReLU3d: nniqat.ConvReLU3d, nni.LinearReLU: nniqat.LinearReLU, + nni.LinearBn1d: nniqat.LinearBn1d, } # Default map for swapping dynamic modules @@ -142,9 +156,10 @@ # mapping from module to output activation post process class DEFAULT_MODULE_TO_ACT_POST_PROCESS : Dict[Callable, Callable] = { - nn.Hardsigmoid: default_affine_fixed_qparams_fake_quant, - nn.Sigmoid: default_affine_fixed_qparams_fake_quant, - nn.Tanh: default_symmetric_fixed_qparams_fake_quant, + nn.Hardsigmoid: default_fixed_qparams_range_0to1_fake_quant, + nn.Sigmoid: default_fixed_qparams_range_0to1_fake_quant, + nn.Softmax: default_fixed_qparams_range_0to1_fake_quant, + nn.Tanh: default_fixed_qparams_range_neg1to1_fake_quant, } # Default map for swapping float module to static sparse quantized ones @@ -170,6 +185,11 @@ def get_default_static_quant_module_mappings() -> Dict[Callable, Any]: ''' return copy.deepcopy(DEFAULT_STATIC_QUANT_MODULE_MAPPINGS) +def get_default_static_quant_reference_module_mappings() -> Dict[Callable, Any]: + ''' Get reference module mapping for post training static quantization + ''' + return copy.deepcopy(DEFAULT_REFERENCE_STATIC_QUANT_MODULE_MAPPINGS) + def get_embedding_static_quant_module_mappings() -> Dict[Callable, Any]: ''' Get module mapping, including mapping for embedding QAT ''' @@ -288,7 +308,7 @@ def _get_special_act_post_process(module: torch.nn.Module) -> Optional[Callable] input: torch.nn.Sigmoid output: default_affine_fixed_qparam_fake_quant """ - return 
DEFAULT_MODULE_TO_ACT_POST_PROCESS.get(type(module), None) + return DEFAULT_MODULE_TO_ACT_POST_PROCESS.get(type_before_parametrizations(module), None) def _has_special_act_post_process(module: torch.nn.Module) -> bool: return module.training and type(module) in DEFAULT_MODULE_TO_ACT_POST_PROCESS diff --git a/torch/ao/quantization/quantization_types.py b/torch/ao/quantization/quantization_types.py new file mode 100644 index 000000000000..b6cb5bef434e --- /dev/null +++ b/torch/ao/quantization/quantization_types.py @@ -0,0 +1,18 @@ +# TODO: the name of this file is probably confusing, remove this file and move the type +# definitions to somewhere else, e.g. to .utils +from typing import Any, Tuple, Union +from torch.fx import Node +from .utils import Pattern # noqa: F401 + +NodePattern = Union[Tuple[Node, Node], Tuple[Node, Tuple[Node, Node]], Any] + +# This is the Quantizer class instance from torch/quantization/fx/quantize.py. +# Define separately to prevent circular imports. +# TODO(future PR): improve this. +QuantizerCls = Any + +__all__ = [ + "Pattern", + "NodePattern", + "QuantizerCls", +] diff --git a/torch/ao/quantization/quantize.py b/torch/ao/quantization/quantize.py index 5afff09b64b8..f5aa195c94dd 100644 --- a/torch/ao/quantization/quantize.py +++ b/torch/ao/quantization/quantize.py @@ -10,13 +10,14 @@ from torch.ao.quantization.quantization_mappings import ( get_default_dynamic_quant_module_mappings, get_default_static_quant_module_mappings, + get_default_static_quant_reference_module_mappings, get_default_qat_module_mappings, get_default_qconfig_propagation_list, no_observer_set, _has_special_act_post_process, _get_special_act_post_process, ) - +from .utils import get_qparam_dict, has_no_children_ignoring_parametrizations from torch.ao.quantization.stubs import DeQuantStub, QuantWrapper from torch.ao.quantization.qconfig import ( add_module_to_qconfig_obs_ctr, @@ -25,6 +26,7 @@ float_qparams_weight_only_qconfig, float_qparams_weight_only_qconfig_4bit, activation_is_memoryless) +from torch.nn.utils.parametrize import type_before_parametrizations def is_activation_post_process(module): return (isinstance(module, torch.ao.quantization.ObserverBase) or @@ -32,7 +34,7 @@ def is_activation_post_process(module): def _propagate_qconfig_helper(module, qconfig_dict, - qconfig_parent=None, prefix=''): + qconfig_parent=None, prefix='', prepare_custom_config_dict=None): r"""This is a helper function for `propagate_qconfig_` Args: @@ -44,12 +46,14 @@ def _propagate_qconfig_helper(module, qconfig_dict, module prefix: corresponding prefix of the current module, used as key in qconfig_dict + prepare_custom_config_dict: dictionary for custom handling of modules + see docs for :func:`~torch.ao.quantization.prepare_fx` Return: None, module is modified inplace with qconfig attached """ - module_qconfig = qconfig_dict.get(type(module), qconfig_parent) + module_qconfig = qconfig_dict.get(type_before_parametrizations(module), qconfig_parent) module_qconfig = qconfig_dict.get(prefix, module_qconfig) module_qconfig = getattr(module, 'qconfig', module_qconfig) @@ -60,10 +64,16 @@ def _propagate_qconfig_helper(module, qconfig_dict, for name, child in module.named_children(): module_prefix = prefix + '.' 
+ name if prefix else name - _propagate_qconfig_helper(child, qconfig_dict, - qconfig_with_device_check, module_prefix) + # do no not propagate qconfig to child if child is non traceable + if prepare_custom_config_dict is None or not ( + name in prepare_custom_config_dict.get("non_traceable_module_name", []) + or type(child) in prepare_custom_config_dict.get("non_traceable_module_class", []) + ): + _propagate_qconfig_helper( + child, qconfig_dict, qconfig_with_device_check, module_prefix + ) -def propagate_qconfig_(module, qconfig_dict=None): +def propagate_qconfig_(module, qconfig_dict=None, prepare_custom_config_dict=None): r"""Propagate qconfig through the module hierarchy and assign `qconfig` attribute on each leaf module @@ -73,13 +83,17 @@ def propagate_qconfig_(module, qconfig_dict=None): quantization configuration, qconfig applies to all submodules of a given module unless qconfig for the submodules are specified (when the submodule already has qconfig attribute) + prepare_custom_config_dict: dictionary for custom handling of modules + see docs for :func:`~torch.ao.quantization.prepare_fx` Return: None, module is modified inplace with qconfig attached """ if qconfig_dict is None: qconfig_dict = {} - _propagate_qconfig_helper(module, qconfig_dict) + if prepare_custom_config_dict is None: + prepare_custom_config_dict = {} + _propagate_qconfig_helper(module, qconfig_dict, prepare_custom_config_dict=prepare_custom_config_dict) def _observer_forward_hook(self, input, output): r"""Forward hook that calls observer on the output @@ -157,9 +171,9 @@ def insert_activation_post_process(m, special_act_post_process=None): for name, child in module.named_children(): # TODO remove Dropout special after codebase stable - if type(child) in [nn.Dropout]: + if type_before_parametrizations(child) in [nn.Dropout]: continue - elif type(child) in [nnq.FloatFunctional, nnq.QFunctional]: + elif type_before_parametrizations(child) in [nnq.FloatFunctional, nnq.QFunctional]: if needs_observation(child): child.activation_post_process = get_activation_post_process(child.qconfig, device) elif isinstance(child, _FusedModule): @@ -169,23 +183,23 @@ def insert_activation_post_process(m, special_act_post_process=None): elif _has_special_act_post_process(child): special_act_post_process = _get_special_act_post_process(child) insert_activation_post_process(child, special_act_post_process) - elif non_leaf_module_list is not None and type(child) in non_leaf_module_list: + elif non_leaf_module_list is not None and type_before_parametrizations(child) in non_leaf_module_list: if needs_observation(child): insert_activation_post_process(child) - elif needs_observation(child) and type(child) in custom_module_class_mapping: - observed_child = custom_module_class_mapping[type(child)].from_float(child) + elif needs_observation(child) and type_before_parametrizations(child) in custom_module_class_mapping: + observed_child = custom_module_class_mapping[type_before_parametrizations(child)].from_float(child) setattr(module, name, observed_child) # TODO: These are the modules that cannot be observed # Once there are more, we should move them to a separate list - if custom_module_class_mapping[type(child)] not in no_observer_set(): + if custom_module_class_mapping[type_before_parametrizations(child)] not in no_observer_set(): insert_activation_post_process(observed_child) else: add_observer_(child, qconfig_propagation_list, non_leaf_module_list, device, custom_module_class_mapping) # Insert observers only for leaf nodes, note that 
this observer is for # the output of the module, for input QuantStub will observe them - if len(module._modules) == 0 and not isinstance(module, torch.nn.Sequential) \ - and type(module) in qconfig_propagation_list: + if has_no_children_ignoring_parametrizations(module) and not isinstance(module, torch.nn.Sequential) \ + and type_before_parametrizations(module) in qconfig_propagation_list: insert_activation_post_process(module) def get_unique_devices_(module): @@ -207,7 +221,7 @@ def add_quant_dequant(module): wraps the input module, the latter case only happens when the input module is a leaf module and we want to quantize it. """ - if len(module._modules) == 0 and hasattr(module, 'qconfig') and module.qconfig: + if has_no_children_ignoring_parametrizations(module) and hasattr(module, 'qconfig') and module.qconfig: return QuantWrapper(module) for name, child in module.named_children(): @@ -472,7 +486,7 @@ def quantize_qat(model, run_fn, run_args, inplace=False): def convert( module, mapping=None, inplace=False, remove_qconfig=True, - convert_custom_config_dict=None): + is_reference=False, convert_custom_config_dict=None): r"""Converts submodules in input module to a different module according to `mapping` by calling `from_float` method on the target module class. And remove qconfig at the end if remove_qconfig is set to True. @@ -503,7 +517,7 @@ def convert( if not inplace: module = copy.deepcopy(module) _convert( - module, mapping, inplace=True, + module, mapping, inplace=True, is_reference=is_reference, convert_custom_config_dict=convert_custom_config_dict) if remove_qconfig: _remove_qconfig(module) @@ -511,7 +525,7 @@ def convert( def _convert( module, mapping=None, inplace=False, - convert_custom_config_dict=None): + is_reference=False, convert_custom_config_dict=None): r"""Converts submodules in input module to a different module according to `mapping` by calling `from_float` method on the target module class @@ -522,10 +536,12 @@ def _convert( Modules inplace: carry out model transformations in-place, the original module is mutated + is_reference: a flag to enable quantized reference module """ if mapping is None: - mapping = get_default_static_quant_module_mappings() + mapping = get_default_static_quant_reference_module_mappings() if is_reference \ + else get_default_static_quant_module_mappings() if convert_custom_config_dict is None: convert_custom_config_dict = {} custom_module_class_mapping = convert_custom_config_dict.get("observed_to_quantized_custom_module_class", {}) @@ -537,9 +553,9 @@ def _convert( # both fused modules and observed custom modules are # swapped as one unit if not isinstance(mod, _FusedModule) and \ - type(mod) not in custom_module_class_mapping: + type_before_parametrizations(mod) not in custom_module_class_mapping: _convert(mod, mapping, True, # inplace - convert_custom_config_dict) + is_reference, convert_custom_config_dict) reassign[name] = swap_module(mod, mapping, custom_module_class_mapping) for key, value in reassign.items(): @@ -561,11 +577,19 @@ def swap_module(mod, mapping, custom_module_class_mapping): new_mod = mod if hasattr(mod, 'qconfig') and mod.qconfig is not None: swapped = False - if type(mod) in custom_module_class_mapping: - new_mod = custom_module_class_mapping[type(mod)].from_observed(mod) + if type_before_parametrizations(mod) in custom_module_class_mapping: + new_mod = custom_module_class_mapping[type_before_parametrizations(mod)].from_observed(mod) swapped = True - elif type(mod) in mapping: - new_mod = 
mapping[type(mod)].from_float(mod) + elif type_before_parametrizations(mod) in mapping: + qmod = mapping[type_before_parametrizations(mod)] + if hasattr(qmod, '_IS_REFERENCE') and qmod._IS_REFERENCE: + assert mod.qconfig is not None + weight_post_process = mod.qconfig.weight() + weight_post_process(mod.weight) + weight_qparams = get_qparam_dict(weight_post_process) + new_mod = qmod.from_float(mod, weight_qparams) + else: + new_mod = qmod.from_float(mod) swapped = True if swapped: diff --git a/torch/ao/quantization/quantize_fx.py b/torch/ao/quantization/quantize_fx.py index 07a1eb6755b2..64de11818bd1 100644 --- a/torch/ao/quantization/quantize_fx.py +++ b/torch/ao/quantization/quantize_fx.py @@ -5,9 +5,10 @@ from torch.fx._symbolic_trace import Tracer from torch.fx.node import Target, Node, Argument from torch.nn.intrinsic import _FusedModule -from .fx import Fuser # noqa: F401 -from .fx import prepare, convert # noqa: F401 -from .fx import get_tensorrt_backend_config_dict # noqa: F401 +from .fx import fuse # noqa: F401 +from .fx import prepare # noqa: F401 +from .fx.convert import convert +from .backend_config import get_tensorrt_backend_config_dict # noqa: F401 from .fx.graph_module import ObservedGraphModule from .fx.qconfig_utils import ( check_is_valid_convert_custom_config_dict, @@ -57,9 +58,8 @@ def _fuse_fx( graph_module: GraphModule object from symbolic tracing (torch.fx.symbolic_trace) """ _check_is_graph_module(graph_module) - fuser = Fuser() - return fuser.fuse( - graph_module, is_qat, fuse_custom_config_dict, backend_config_dict) + return fuse( + graph_module, is_qat, fuse_custom_config_dict, backend_config_dict) # type: ignore[operator] class Scope(object): @@ -251,7 +251,7 @@ def _prepare_fx( equalization_qconfig_dict=equalization_qconfig_dict, backend_config_dict=backend_config_dict, is_standalone_module=is_standalone_module, - ) + ) # type: ignore[operator] for attr_name in preserved_attributes: setattr(prepared, attr_name, getattr(model, attr_name)) @@ -298,7 +298,8 @@ def _prepare_standalone_module_fx( def fuse_fx( - model: torch.nn.Module, fuse_custom_config_dict: Optional[Dict[str, Any]] = None + model: torch.nn.Module, fuse_custom_config_dict: Optional[Dict[str, Any]] = None, + backend_config_dict: Optional[Dict[str, Any]] = None, ) -> GraphModule: r""" Fuse modules like conv+bn, conv+bn+relu etc, model must be in eval mode. 
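The fuse_fx entry point, whose signature gains an optional backend_config_dict here, is typically driven as below; a minimal sketch with a toy module (the module itself is illustrative, not taken from the patch):

import torch
from torch.ao.quantization.quantize_fx import fuse_fx

class ConvBnReLU(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv2d(3, 3, 3)
        self.bn = torch.nn.BatchNorm2d(3)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        return self.relu(self.bn(self.conv(x)))

m = ConvBnReLU().eval()
fused = fuse_fx(m)        # conv + bn + relu collapse into a single fused module
print(fused)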
Fusion rules are defined in torch.quantization.fx.fusion_pattern.py @@ -309,10 +310,6 @@ def fuse_fx( * `fuse_custom_config_dict`: Dictionary for custom configurations for fuse_fx, e.g.:: fuse_custom_config_dict = { - "additional_fuser_method_mapping": { - (Module1, Module2): fuse_module1_module2 - } - # Attributes that are not used in forward function will # be removed when constructing GraphModule, this is a list of attributes # to preserve as an attribute of the GraphModule even when they are @@ -328,7 +325,6 @@ def fuse_fx( """ torch._C._log_api_usage_once("quantization_api.quantize_fx.fuse_fx") - assert not model.training, "fuse_fx only works on models in eval mode" check_is_valid_fuse_custom_config_dict(fuse_custom_config_dict) graph_module = torch.fx.symbolic_trace(model) preserved_attributes: Set[str] = set() @@ -338,7 +334,7 @@ def fuse_fx( ) for attr_name in preserved_attributes: setattr(graph_module, attr_name, getattr(model, attr_name)) - return _fuse_fx(graph_module, False, fuse_custom_config_dict) + return _fuse_fx(graph_module, False, fuse_custom_config_dict, backend_config_dict) def prepare_fx( @@ -439,27 +435,6 @@ def prepare_fx( NonTraceableModule ], - # Additional fuser_method mapping - "additional_fuser_method_mapping": { - (torch.nn.Conv2d, torch.nn.BatchNorm2d): fuse_conv_bn - }, - - # Additioanl module mapping for qat - "additional_qat_module_mapping": { - torch.nn.intrinsic.ConvBn2d: torch.nn.qat.ConvBn2d - }, - - # Additional fusion patterns - "additional_fusion_pattern": { - (torch.nn.BatchNorm2d, torch.nn.Conv2d): ConvReluFusionhandler - }, - - # Additional quantization patterns - "additional_quant_pattern": { - torch.nn.Conv2d: ConvReluQuantizeHandler, - (torch.nn.ReLU, torch.nn.Conv2d): ConvReluQuantizeHandler, - } - # By default, inputs and outputs of the graph are assumed to be in # fp32. Providing `input_quantized_idxs` will set the inputs with the # corresponding indices to be quantized. 
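With the additional_* hooks removed from the docstrings above, the remaining core FX flow is compact enough to restate as a sketch; this uses the qconfig_dict form current at the time of this patch (later releases move to QConfigMapping), and the toy model is illustrative:

import torch
from torch.ao.quantization import get_default_qconfig
from torch.ao.quantization.quantize_fx import prepare_fx, convert_fx

model = torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3), torch.nn.ReLU()).eval()
qconfig_dict = {"": get_default_qconfig("fbgemm")}

prepared = prepare_fx(model, qconfig_dict)       # insert observers
prepared(torch.randn(1, 3, 32, 32))              # calibrate on representative data
quantized = convert_fx(prepared)                 # swap to quantized modules/ops
print(quantized(torch.randn(1, 3, 32, 32)).shape)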
Providing @@ -511,7 +486,6 @@ def calibrate(model, data_loader): """ torch._C._log_api_usage_once("quantization_api.quantize_fx.prepare_fx") - assert not model.training, "prepare_fx only works for models in " + "eval mode" return _prepare_fx( model, qconfig_dict, @@ -560,7 +534,6 @@ def train_loop(model, train_data): """ torch._C._log_api_usage_once("quantization_api.quantize_fx.prepare_qat_fx") - assert model.training, "prepare_qat_fx only works for models in " + "train mode" return _prepare_fx( model, qconfig_dict, @@ -577,6 +550,7 @@ def _convert_fx( is_standalone_module: bool = False, _remove_qconfig: bool = True, qconfig_dict: Dict[str, Any] = None, + backend_config_dict: Dict[str, Any] = None, ) -> torch.nn.Module: """ `is_standalone_module`: see docs in :func:`~torch.ao.quantization.prepare_standalone_module_fx` """ @@ -593,6 +567,7 @@ def _convert_fx( is_standalone_module, _remove_qconfig_flag=_remove_qconfig, convert_qconfig_dict=qconfig_dict, + backend_config_dict=backend_config_dict, ) preserved_attributes = convert_custom_config_dict.get("preserved_attributes", []) @@ -607,6 +582,7 @@ def convert_fx( convert_custom_config_dict: Optional[Dict[str, Any]] = None, _remove_qconfig: bool = True, qconfig_dict: Dict[str, Any] = None, + backend_config_dict: Dict[str, Any] = None, ) -> torch.nn.Module: r""" Convert a calibrated or trained model to a quantized model @@ -618,20 +594,6 @@ def convert_fx( * `convert_custom_config_dict`: dictionary for custom configurations for convert function:: convert_custom_config_dict = { - - # additional object (module/operator) mappings that will overwrite the default - # module mappinng - "additional_object_mapping": { - "static": { - FloatModule: QuantizedModule, - float_op: quantized_op - }, - "dynamic": { - FloatModule: DynamicallyQuantizedModule, - float_op: dynamically_quantized_op - }, - }, - # user will manually define the corresponding quantized # module class which has a from_observed class method that converts # observed custom module to quantized custom module @@ -677,6 +639,11 @@ def convert_fx( ], } + * `backend_config_dict`: A configuration for the backend which describes how + operators should be quantized in the backend, this includes quantization + mode support (static/dynamic/weight_only), dtype support (quint8/qint8 etc.), + observer placement for each operators and fused operators. Detailed + documentation can be found in torch/ao/quantization/backend_config/README.md Return: A quantized model (GraphModule) @@ -694,6 +661,7 @@ def convert_fx( convert_custom_config_dict, _remove_qconfig=_remove_qconfig, qconfig_dict=qconfig_dict, + backend_config_dict=backend_config_dict, ) diff --git a/torch/ao/quantization/stubs.py b/torch/ao/quantization/stubs.py index 1f4c462e56e2..7ae526a8921e 100644 --- a/torch/ao/quantization/stubs.py +++ b/torch/ao/quantization/stubs.py @@ -21,9 +21,15 @@ def forward(self, x): class DeQuantStub(nn.Module): r"""Dequantize stub module, before calibration, this is same as identity, this will be swapped as `nnq.DeQuantize` in `convert`. 
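The stub change above lets DeQuantStub carry its own qconfig, mirroring QuantStub; a minimal eager-mode module using both (passing a qconfig to DeQuantStub assumes this patch):

import torch
from torch.ao.quantization import QuantStub, DeQuantStub, get_default_qconfig

class M(torch.nn.Module):
    def __init__(self):
        super().__init__()
        qconfig = get_default_qconfig("fbgemm")
        self.quant = QuantStub(qconfig)
        self.dequant = DeQuantStub(qconfig)   # accepting a qconfig is new in this change
        self.linear = torch.nn.Linear(4, 4)

    def forward(self, x):
        return self.dequant(self.linear(self.quant(x)))

m = M().eval()
print(m(torch.randn(2, 4)).shape)   # stubs act as identity until prepare/convert run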
+ + Args: + qconfig: quantization configuration for the tensor, + if qconfig is not provided, we will get qconfig from parent modules """ - def __init__(self): + def __init__(self, qconfig=None): super(DeQuantStub, self).__init__() + if qconfig: + self.qconfig = qconfig def forward(self, x): return x @@ -48,7 +54,7 @@ def __init__(self, module): super(QuantWrapper, self).__init__() qconfig = module.qconfig if hasattr(module, 'qconfig') else None self.add_module('quant', QuantStub(qconfig)) - self.add_module('dequant', DeQuantStub()) + self.add_module('dequant', DeQuantStub(qconfig)) self.add_module('module', module) self.train(module.training) diff --git a/torch/ao/quantization/utils.py b/torch/ao/quantization/utils.py index e81c88993c4c..f42b5c1ce723 100644 --- a/torch/ao/quantization/utils.py +++ b/torch/ao/quantization/utils.py @@ -6,12 +6,20 @@ import torch from torch.ao.quantization.quant_type import QuantType, quant_type_to_str from typing import Tuple, Any, Union, Callable +from torch.nn.utils.parametrize import is_parametrized # Type for fusion patterns, it can be more complicated than the following actually, # see pattern.md for docs # TODO: not sure if typing supports recursive data types Pattern = Union[Callable, Tuple[Callable, Callable], Tuple[Callable, Tuple[Callable, Callable]], Any] +# TODO: maybe rename this to MatchInputNode +class MatchAllNode: + """ A node pattern that matches all nodes, used in defining + fusion patterns in FX Graph Mode Quantization + """ + pass + module_type_list = { torch.nn.ReLU, torch.nn.ReLU6, @@ -25,21 +33,37 @@ torch.nn.MaxPool2d, torch.nn.MaxPool3d, torch.nn.Identity, + torch.nn.Hardsigmoid, + torch.nn.Sigmoid, + torch.nn.Tanh, } func_list = { torch.nn.functional.adaptive_avg_pool1d, torch.nn.functional.adaptive_avg_pool2d, torch.nn.functional.adaptive_avg_pool3d, + torch.nn.functional.elu, + torch.nn.functional.hardswish, + torch.nn.functional.instance_norm, + torch.nn.functional.layer_norm, + torch.nn.functional.leaky_relu, + torch.nn.functional.silu, + torch.nn.functional.mish, + torch.nn.functional.dropout, torch.nn.functional.max_pool1d, torch.nn.functional.max_pool2d, torch.nn.functional.max_pool3d, torch.nn.functional.relu, torch.nn.functional.hardtanh, torch.nn.functional.hardtanh_, + torch.nn.functional.hardsigmoid, + torch.nn.functional.sigmoid, torch.transpose, torch.repeat_interleave, + torch.sigmoid, torch.squeeze, torch.stack, + torch.sum, + torch.tanh, torch.unsqueeze, torch.cat, } @@ -50,15 +74,21 @@ 'contiguous', 'detach', 'detach_', + 'hardsigmoid', + 'hardsigmoid_', 'permute', 'repeat', 'repeat_interleave', 'reshape', 'resize_', 'shape', + 'sigmoid', + 'sigmoid_', 'size', 'squeeze', 'squeeze_', + 'tanh', + 'tanh_', 'transpose', 'unsqueeze', 'unsqueeze_', @@ -66,6 +96,7 @@ } def check_node(node, modules): + # TODO: reuse is_fixed_qparam_node after we move this function to _lower_to_native_backend.py is_call_function = node.op == "call_function" and node.target in func_list is_call_method = node.op == "call_method" and node.target in method_list is_call_module = node.op == "call_module" and type(modules[str(node.target)]) in module_type_list @@ -154,17 +185,33 @@ def activation_is_statically_quantized(qconfig): """ return activation_dtype(qconfig) in [torch.quint8, torch.qint8, torch.float16] +def activation_is_dynamically_quantized(qconfig): + """ Given a qconfig, decide if the activation needs to be + dynamically quantized or not, this includes dynamically quantizing to + quint8, qint8 and float16 + """ + activation_dtype, 
_, activation_compute_dtype = \ + get_qconfig_dtypes(qconfig) + return activation_dtype == torch.float and \ + activation_compute_dtype in [torch.quint8, torch.qint8, torch.float16] + def activation_is_int8_quantized(qconfig): """ Given a qconfig, decide if the activation needs to be quantized to int8 or not, this includes quantizing to quint8, qint8 """ return activation_dtype(qconfig) in [torch.quint8, torch.qint8] +def activation_is_int32_quantized(qconfig): + """ Given a qconfig, decide if the activation needs to be + quantized to int32 or not + """ + return activation_dtype(qconfig) == torch.qint32 + def weight_is_quantized(qconfig): """ Given a qconfig, decide if the weight needs to be quantized or not """ - return weight_dtype(qconfig) in [torch.quint8, torch.qint8, torch.float16] + return weight_dtype(qconfig) in [torch.quint8, torch.qint8, torch.float16, torch.quint4x2] def weight_is_statically_quantized(qconfig): """ Given a qconfig, decide if the weight needs to be statically @@ -199,7 +246,7 @@ def get_quant_type(qconfig): assert qconfig is not None activation = qconfig.activation() weight = qconfig.weight() - static_dtypes = [torch.quint8, torch.qint8] + static_dtypes = [torch.quint8, torch.qint8, torch.quint4x2] if weight.dtype in static_dtypes: if activation.dtype in static_dtypes: return QuantType.STATIC @@ -253,11 +300,15 @@ def calculate_qmin_qmax(quant_min: int, quant_max: int, has_customized_qrange: b r"""Calculates actual qmin and qmax based on the quantization range, observer datatype and if range is reduced. """ + # TODO(jerryzh): Figure out why custom quant_min/quant_max are still adjusted. if has_customized_qrange: # This initialization here is to be resolve TorchScript compilation issues and allow # using of refinement to decouple initial_qmin and initial_qmax from quantization range. # The actual values of initial_qmin and initial_qmax will be reset below. - initial_quant_min, initial_quant_max = 0, 255 + if dtype == torch.qint32: + initial_quant_min, initial_quant_max = 0, 2**31 - 1 + else: + initial_quant_min, initial_quant_max = 0, 255 # The following assignment of self.qmin and self.qmax to the local variables and the if check refine the # attribute from Optional valid integers for use, based on TorchScript's requirements. custom_quant_min, custom_quant_max = quant_min, quant_max @@ -268,13 +319,14 @@ def calculate_qmin_qmax(quant_min: int, quant_max: int, has_customized_qrange: b ) qrange_len = initial_quant_max - initial_quant_min + 1 - assert ( - 0 < qrange_len <= 256 - ), "quantization range should be positive and not exceed the maximum bit range (=256)." if dtype == torch.qint8: - quant_min, quant_max = -qrange_len // 2, qrange_len // 2 - 1 - else: - quant_min, quant_max = 0, qrange_len - 1 + assert ( + 0 < qrange_len <= 256 + ), "quantization range should be positive and not exceed the maximum bit range (=256)." + elif dtype == torch.qint32: + assert ( + 0 < qrange_len <= 2**31 + ), "quantization range should be positive and not exceed the maximum bit range (=4294967296)." 
if reduce_range: quant_min, quant_max = quant_min // 2, quant_max // 2 else: @@ -289,6 +341,8 @@ def calculate_qmin_qmax(quant_min: int, quant_max: int, has_customized_qrange: b quant_min, quant_max = 0, 127 else: quant_min, quant_max = 0, 255 + elif dtype == torch.qint32: + quant_min, quant_max = -1 * (2 ** 31), (2 ** 31) - 1 else: quant_min, quant_max = 0, 15 return quant_min, quant_max @@ -303,3 +357,16 @@ def _parent_name(target): return '', r[0] else: return r[0], r[1] + +def has_no_children_ignoring_parametrizations(module): + """ + Checks if module._modules is empty or + if module is a parametrization, checks that module._modules only has + the 'parametrizations' module + """ + if len(module._modules) == 0: + return True + elif is_parametrized(module): + return len(module._modules) == 1 and 'parametrizations' in module._modules + else: + return False diff --git a/torch/autocast_mode.py b/torch/autocast_mode.py deleted file mode 100644 index daf2a34383fb..000000000000 --- a/torch/autocast_mode.py +++ /dev/null @@ -1,222 +0,0 @@ -import torch -import functools -import warnings - -from typing import Any, Optional -from .types import _dtype - -def autocast_decorator(autocast_instance, func): - @functools.wraps(func) - def decorate_autocast(*args, **kwargs): - with autocast_instance: - return func(*args, **kwargs) - decorate_autocast.__script_unsupported = '@autocast() decorator is not supported in script mode' # type: ignore[attr-defined] - return decorate_autocast - -class autocast(object): - r""" - Instances of :class:`autocast` serve as context managers or decorators that - allow regions of your script to run in mixed precision. - - In these regions, ops run in an op-specific dtype chosen by autocast - to improve performance while maintaining accuracy. - See the :ref:`Autocast Op Reference` for details. - - When entering an autocast-enabled region, Tensors may be any type. - You should not call ``half()`` or ``bfloat16()`` on your model(s) or inputs when using autocasting. - - :class:`autocast` should wrap only the forward pass(es) of your network, including the loss - computation(s). Backward passes under autocast are not recommended. - Backward ops run in the same type that autocast used for corresponding forward ops. - - Example for CUDA Devices:: - - # Creates model and optimizer in default precision - model = Net().cuda() - optimizer = optim.SGD(model.parameters(), ...) - - for input, target in data: - optimizer.zero_grad() - - # Enables autocasting for the forward pass (model + loss) - with autocast(): - output = model(input) - loss = loss_fn(output, target) - - # Exits the context manager before backward() - loss.backward() - optimizer.step() - - See the :ref:`Automatic Mixed Precision examples` for usage (along with gradient scaling) - in more complex scenarios (e.g., gradient penalty, multiple models/losses, custom autograd functions). - - :class:`autocast` can also be used as a decorator, e.g., on the ``forward`` method of your model:: - - class AutocastModel(nn.Module): - ... - @autocast() - def forward(self, input): - ... - - Floating-point Tensors produced in an autocast-enabled region may be ``float16``. - After returning to an autocast-disabled region, using them with floating-point - Tensors of different dtypes may cause type mismatch errors. If so, cast the Tensor(s) - produced in the autocast region back to ``float32`` (or other dtype if desired). - If a Tensor from the autocast region is already ``float32``, the cast is a no-op, - and incurs no additional overhead. 
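Several hunks above (in quantize.py and utils.py) replace type(...) with type_before_parametrizations(...) and len(module._modules) == 0 with has_no_children_ignoring_parametrizations(...). The behaviour they compensate for is easy to reproduce; an illustrative sketch with a toy symmetric parametrization:

import torch
from torch import nn
from torch.nn.utils import parametrize
from torch.nn.utils.parametrize import type_before_parametrizations

class Symmetric(nn.Module):
    def forward(self, w):
        return (w + w.t()) / 2          # toy parametrization: keep the weight symmetric

lin = nn.Linear(4, 4)
parametrize.register_parametrization(lin, "weight", Symmetric())

print(type(lin).__name__)                  # ParametrizedLinear, a generated subclass
print(type_before_parametrizations(lin))   # <class 'torch.nn.modules.linear.Linear'>
print(list(lin._modules))                  # ['parametrizations'], now a child module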
- CUDA Example:: - - # Creates some tensors in default dtype (here assumed to be float32) - a_float32 = torch.rand((8, 8), device="cuda") - b_float32 = torch.rand((8, 8), device="cuda") - c_float32 = torch.rand((8, 8), device="cuda") - d_float32 = torch.rand((8, 8), device="cuda") - - with autocast(): - # torch.mm is on autocast's list of ops that should run in float16. - # Inputs are float32, but the op runs in float16 and produces float16 output. - # No manual casts are required. - e_float16 = torch.mm(a_float32, b_float32) - # Also handles mixed input types - f_float16 = torch.mm(d_float32, e_float16) - - # After exiting autocast, calls f_float16.float() to use with d_float32 - g_float32 = torch.mm(d_float32, f_float16.float()) - - CPU Example:: - - # Creates some tensors in default dtype (here assumed to be float32) - a_float32 = torch.rand((8, 8), device="cpu") - b_float32 = torch.rand((8, 8), device="cpu") - c_float32 = torch.rand((8, 8), device="cpu") - d_float32 = torch.rand((8, 8), device="cpu") - - with autocast(dtype=torch.bfloat16, device_type="cpu"): - # torch.mm is on autocast's list of ops that should run in bfloat16. - # Inputs are float32, but the op runs in bfloat16 and produces bfloat16 output. - # No manual casts are required. - e_bfloat16 = torch.mm(a_float32, b_float32) - # Also handles mixed input types - f_bfloat16 = torch.mm(d_float32, e_bfloat16) - - # After exiting autocast, calls f_float16.float() to use with d_float32 - g_float32 = torch.mm(d_float32, f_bfloat16.float()) - - Type mismatch errors *in* an autocast-enabled region are a bug; if this is what you observe, - please file an issue. - - ``autocast(enabled=False)`` subregions can be nested in autocast-enabled regions. - Locally disabling autocast can be useful, for example, if you want to force a subregion - to run in a particular ``dtype``. Disabling autocast gives you explicit control over - the execution type. In the subregion, inputs from the surrounding region - should be cast to ``dtype`` before use:: - - # Creates some tensors in default dtype (here assumed to be float32) - a_float32 = torch.rand((8, 8), device="cuda") - b_float32 = torch.rand((8, 8), device="cuda") - c_float32 = torch.rand((8, 8), device="cuda") - d_float32 = torch.rand((8, 8), device="cuda") - - with autocast(): - e_float16 = torch.mm(a_float32, b_float32) - with autocast(enabled=False): - # Calls e_float16.float() to ensure float32 execution - # (necessary because e_float16 was created in an autocasted region) - f_float32 = torch.mm(c_float32, e_float16.float()) - - # No manual casts are required when re-entering the autocast-enabled region. - # torch.mm again runs in float16 and produces float16 output, regardless of input types. - g_float16 = torch.mm(d_float32, f_float32) - - The autocast state is thread-local. If you want it enabled in a new thread, the context manager or decorator - must be invoked in that thread. This affects :class:`torch.nn.DataParallel` and - :class:`torch.nn.parallel.DistributedDataParallel` when used with more than one GPU per process - (see :ref:`Working with Multiple GPUs`). - - Args: - device_type(string, required): Whether to use 'cuda' or 'cpu' device - enabled(bool, optional, default=True): Whether autocasting should be enabled in the region. - dtype(torch_dtype, optional): Whether to use torch.float16 or torch.bfloat16. - cache_enabled(bool, optional, default=True): Whether the weight cache inside autocast should be enabled. 
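Although this diff deletes torch/autocast_mode.py outright (the implementation presumably moves elsewhere), the behaviour the removed docstring describes stays reachable through the public torch.autocast; a minimal CPU sketch, using bfloat16 since that is the only CPU dtype the deleted check accepts:

import torch

a = torch.rand(8, 8)
b = torch.rand(8, 8)

with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
    c = torch.mm(a, b)      # mm is on the lower-precision cast list, so c is bfloat16

print(c.dtype)              # torch.bfloat16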
- """ - def __init__(self, device_type : str, - dtype : Optional[_dtype] = None, - enabled : bool = True, - cache_enabled : Optional[bool] = None): - if torch._jit_internal.is_scripting(): - self._enabled = enabled - self.device = device_type - self.fast_dtype = dtype - # TODO: support get_autocast_gpu/cpu_dtype - assert dtype is not None - return - self.device = device_type - if self.device == 'cuda': - self.fast_dtype = torch.get_autocast_gpu_dtype() - elif self.device == 'cpu': - self.fast_dtype = torch.get_autocast_cpu_dtype() - else: - raise RuntimeError('User specified autocast device_type must be \'cuda\' or \'cpu\'') - self._cache_enabled = torch.is_autocast_cache_enabled() - if torch.cuda.amp.common.amp_definitely_not_available() and self.device == 'cuda': - warnings.warn('User provided device_type of \'cuda\', but CUDA is not available. Disabling') - enabled = False - if dtype is not None: - self.fast_dtype = dtype - if cache_enabled is not None: - self._cache_enabled = cache_enabled - - if self.device == 'cpu': - supported_dtype = [torch.bfloat16] - if self.fast_dtype not in supported_dtype: - error_message = 'In CPU autocast, but the target dtype is not supported. Disabling autocast.\n' - error_message += 'CPU Autocast only supports dtype of torch.bfloat16 currently.' - warnings.warn(error_message) - enabled = False - if self.device == 'cuda': - if self.fast_dtype == torch.bfloat16 and not torch.cuda.is_bf16_supported(): - raise RuntimeError('Current CUDA Device does not support bfloat16. Please switch dtype to float16.') - self._enabled = enabled - - def __enter__(self): - if torch._jit_internal.is_scripting(): - assert self.fast_dtype is not None - return self - - self.prev_cache_enabled = torch.is_autocast_cache_enabled() - if self.device == 'cpu': - self.prev = torch.is_autocast_cpu_enabled() - self.prev_fastdtype = torch.get_autocast_cpu_dtype() - torch.set_autocast_cpu_enabled(self._enabled) - torch.set_autocast_cpu_dtype(self.fast_dtype) # type: ignore[arg-type] - torch.autocast_increment_nesting() - else: - self.prev = torch.is_autocast_enabled() - self.prev_fastdtype = torch.get_autocast_gpu_dtype() - torch.set_autocast_gpu_dtype(self.fast_dtype) # type: ignore[arg-type] - torch.set_autocast_enabled(self._enabled) - torch.autocast_increment_nesting() - torch.set_autocast_cache_enabled(self._cache_enabled) - - def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any): # type: ignore[override] - if torch._jit_internal.is_scripting(): - return - - # Drop the cache when we exit to a nesting level that's outside any instance of autocast. 
- if self.device == 'cpu': - if torch.autocast_decrement_nesting() == 0: - torch.clear_autocast_cache() - torch.set_autocast_cpu_enabled(self.prev) - torch.set_autocast_cpu_dtype(self.prev_fastdtype) - else: - if torch.autocast_decrement_nesting() == 0: - torch.clear_autocast_cache() - torch.set_autocast_enabled(self.prev) - torch.set_autocast_gpu_dtype(self.prev_fastdtype) - torch.set_autocast_cache_enabled(self.prev_cache_enabled) - return False - - def __call__(self, func): - if torch._jit_internal.is_scripting(): - return func - return autocast_decorator(self, func) diff --git a/torch/autograd/__init__.py b/torch/autograd/__init__.py index 28eb729ffcba..3fb02767efba 100644 --- a/torch/autograd/__init__.py +++ b/torch/autograd/__init__.py @@ -10,14 +10,14 @@ import warnings from torch.types import _TensorOrTensors -from typing import Any, Callable, List, Optional, Sequence, Tuple, Union +from typing import Any, Callable, List, Optional, Sequence, Tuple, Union, cast from .variable import Variable from .function import Function, NestedIOFunction from .gradcheck import gradcheck, gradgradcheck from .grad_mode import no_grad, enable_grad, set_grad_enabled, inference_mode from .anomaly_mode import detect_anomaly, set_detect_anomaly -from ..overrides import has_torch_function, handle_torch_function +from ..overrides import has_torch_function, handle_torch_function, is_tensor_like from . import functional from . import forward_ad from . import graph @@ -235,20 +235,21 @@ def grad( to show any performance warnings and file an issue on github if warnings exist for your use case. Defaults to ``False``. """ - outputs = (outputs,) if isinstance(outputs, torch.Tensor) else tuple(outputs) - inputs = (inputs,) if isinstance(inputs, torch.Tensor) else tuple(inputs) - overridable_args = outputs + inputs + t_outputs = cast(Tuple[torch.Tensor, ...], (outputs,) if is_tensor_like(outputs) else tuple(outputs)) + t_inputs = cast(Tuple[torch.Tensor, ...], (inputs,) if is_tensor_like(inputs) else tuple(inputs)) + overridable_args = t_outputs + t_inputs if has_torch_function(overridable_args): return handle_torch_function( grad, overridable_args, - outputs, - inputs, + t_outputs, + t_inputs, grad_outputs=grad_outputs, retain_graph=retain_graph, create_graph=create_graph, only_inputs=only_inputs, allow_unused=allow_unused, + is_grads_batched=is_grads_batched, ) if not only_inputs: @@ -256,8 +257,8 @@ def grad( "(defaults to True). 
To accumulate gradient for other " "parts of the graph, please use torch.autograd.backward.") - grad_outputs_ = _tensor_or_tensors_to_tuple(grad_outputs, len(outputs)) - grad_outputs_ = _make_grads(outputs, grad_outputs_, is_grads_batched=is_grads_batched) + grad_outputs_ = _tensor_or_tensors_to_tuple(grad_outputs, len(t_outputs)) + grad_outputs_ = _make_grads(t_outputs, grad_outputs_, is_grads_batched=is_grads_batched) if retain_graph is None: retain_graph = create_graph @@ -268,12 +269,12 @@ def grad( if is_grads_batched: def vjp(gO): return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass - outputs, gO, retain_graph, create_graph, inputs, + t_outputs, gO, retain_graph, create_graph, t_inputs, allow_unused, accumulate_grad=False) # Calls into the C++ engine to run the backward pass - return _vmap_internals._vmap(vjp, 0, 0, allow_none_pass_through=True)(grad_outputs) + return _vmap_internals._vmap(vjp, 0, 0, allow_none_pass_through=True)(grad_outputs_) else: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass - outputs, grad_outputs_, retain_graph, create_graph, inputs, + t_outputs, grad_outputs_, retain_graph, create_graph, t_inputs, allow_unused, accumulate_grad=False) # Calls into the C++ engine to run the backward pass @@ -295,8 +296,13 @@ def _is_checkpoint_valid(): def variable(*args, **kwargs): - warnings.warn("torch.autograd.variable(...) is deprecated, use torch.tensor(...) instead") - return torch.tensor(*args, **kwargs) + raise RuntimeError("torch.autograd.variable(...) is deprecated, use torch.tensor(...) instead") + +# Monkey patching variable.Variable to fix FX codegen. FX generates a call by roughly doing +# f"{fn.__module__}.{fn.__name__}(...). This yields torch.autograd.variable.Variable(...) in the +# output of an FX graph. Unfortunately the module name torch.autograd.variable is shadowed by the +# deprecated function - variable(...). +variable.Variable = Variable # type: ignore[attr-defined] if not torch._C._autograd_init(): raise RuntimeError("autograd initialization failed") @@ -309,7 +315,7 @@ def variable(*args, **kwargs): _supported_activities, _add_metadata_json, SavedTensor, _push_saved_tensors_default_hooks, _pop_saved_tensors_default_hooks) -from torch._C._autograd import (_ProfilerResult, _KinetoEvent, +from torch._C._autograd import (_ProfilerResult, _KinetoEvent, _kineto_step, _prepare_profiler, _enable_profiler, _disable_profiler) from . import profiler diff --git a/torch/autograd/anomaly_mode.py b/torch/autograd/anomaly_mode.py index f6ec3612674c..cca0ece338d0 100644 --- a/torch/autograd/anomaly_mode.py +++ b/torch/autograd/anomaly_mode.py @@ -3,6 +3,8 @@ from typing import Any +__all__ = ["detect_anomaly", "set_detect_anomaly"] + class detect_anomaly(object): r"""Context-manager that enable anomaly detection for the autograd engine. 
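One of the grad() fixes above forwards is_grads_batched through handle_torch_function and feeds the normalized grad_outputs_ into the vmapped path. The flag itself is used as in this small sketch (independent of the diff; the vmapped path may emit an experimental-feature warning):

import torch

x = torch.randn(3, requires_grad=True)
y = x ** 2

# Each row of the identity is one cotangent; is_grads_batched computes all three
# vector-Jacobian products in a single batched backward call.
(jac,) = torch.autograd.grad(y, x, grad_outputs=torch.eye(3), is_grads_batched=True)
print(jac)    # diag(2 * x), the Jacobian of the elementwise square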
diff --git a/torch/autograd/forward_ad.py b/torch/autograd/forward_ad.py index 08f633d1fb75..b12b6c646276 100644 --- a/torch/autograd/forward_ad.py +++ b/torch/autograd/forward_ad.py @@ -4,6 +4,8 @@ from typing import Any +__all__ = ["UnpackedDualTensor", "enter_dual_level", "exit_dual_level", "make_dual", "unpack_dual", "dual_level"] + # Global variable used to make the python API simpler to use _current_level = -1 @@ -72,7 +74,12 @@ def make_dual(tensor, tangent, *, level=None): return torch._VF._make_dual(tensor, tangent, level=level) -UnpackedDualTensor = namedtuple('UnpackedDualTensor', ['primal', 'tangent']) +_UnpackedDualTensor = namedtuple('_UnpackedDualTensor', ['primal', 'tangent']) + +class UnpackedDualTensor(_UnpackedDualTensor): + r"""Namedtuple returned by :func:`unpack_dual` containing the primal and tangent components of the dual tensor. + See :func:`unpack_dual` for more details.""" + pass def unpack_dual(tensor, *, level=None): r"""Unpacks a "dual tensor" to get both its Tensor value and its forward AD gradient. diff --git a/torch/autograd/functional.py b/torch/autograd/functional.py index 6fe0b5ee09f3..1b941967875f 100644 --- a/torch/autograd/functional.py +++ b/torch/autograd/functional.py @@ -416,11 +416,12 @@ def _construct_standard_basis_for(tensors: Tuple[torch.Tensor, ...], tensor_nume assert len(tensors) == len(tensor_numels) assert len(tensors) > 0 total_numel = sum(tensor_numels) - diag_start_indices = (0, *torch.tensor(tensor_numels).cumsum(dim=0)[:-1].neg().unbind()) chunks = tuple(tensor.new_zeros(total_numel, tensor_numel) for tensor, tensor_numel in zip(tensors, tensor_numels)) - for chunk, diag_start_idx in zip(chunks, diag_start_indices): + diag_start_idx = 0 + for chunk, numel in zip(chunks, tensor_numels): chunk.diagonal(diag_start_idx).fill_(1) + diag_start_idx -= numel return chunks @@ -685,7 +686,7 @@ def vjp(grad_output): raise RuntimeError(msg) jac_i_el.append(torch.zeros_like(inp_el)) - jacobian += (tuple(torch.stack(jac_i_el, dim=0).view(out.size() + jacobian += (tuple(torch.stack(jac_i_el, dim=0).view(out.size() # type: ignore[operator] + inputs[el_idx].size()) for (el_idx, jac_i_el) in enumerate(jac_i)), ) jacobian = _grad_postprocess(jacobian, create_graph) diff --git a/torch/autograd/grad_mode.py b/torch/autograd/grad_mode.py index c57a16f80d76..552afa4e5243 100644 --- a/torch/autograd/grad_mode.py +++ b/torch/autograd/grad_mode.py @@ -111,7 +111,7 @@ class no_grad(_DecoratorContextManager): Example:: - >>> x = torch.tensor([1], requires_grad=True) + >>> x = torch.tensor([1.], requires_grad=True) >>> with torch.no_grad(): ... y = x * 2 >>> y.requires_grad @@ -123,12 +123,12 @@ class no_grad(_DecoratorContextManager): >>> z.requires_grad False """ - def __init__(self): + def __init__(self) -> None: if not torch._jit_internal.is_scripting(): super().__init__() self.prev = False - def __enter__(self): + def __enter__(self) -> None: self.prev = torch.is_grad_enabled() torch.set_grad_enabled(False) @@ -206,7 +206,7 @@ class set_grad_enabled(_DecoratorContextManager): Example:: - >>> x = torch.tensor([1], requires_grad=True) + >>> x = torch.tensor([1.], requires_grad=True) >>> is_train = False >>> with torch.set_grad_enabled(is_train): ... 
y = x * 2 diff --git a/torch/autograd/gradcheck.py b/torch/autograd/gradcheck.py index 3aac0fca5f6f..3ae3e208978f 100644 --- a/torch/autograd/gradcheck.py +++ b/torch/autograd/gradcheck.py @@ -9,9 +9,14 @@ from torch._vmap_internals import vmap, _vmap import functools +# Note: `get_*_jacobian` functions are added here even though we didn't intend to make them public +# since they have been exposed from before we added `__all__` and we already maintain BC for them +# We should eventually deprecate them and remove them from `__all__` +__all__ = ["gradcheck", "gradgradcheck", "GradcheckError", "get_numerical_jacobian", + "get_analytical_jacobian", "get_numerical_jacobian_wrt_specific_input"] class GradcheckError(RuntimeError): - # Custom error so that user errors are not caught in the gradcheck's try-catch + r"""Error raised by :func:`gradcheck` and :func:`gradgradcheck`""" pass @@ -257,7 +262,7 @@ def _prepare_input(input: torch.Tensor, maybe_perturbed_input: Optional[torch.Te return input -def check_outputs_same_dtype_and_shape(output1, output2, eps, idx=None) -> None: +def _check_outputs_same_dtype_and_shape(output1, output2, eps, idx=None) -> None: # Check that the returned outputs don't have different dtype or shape when you # perturb the input on_index = "on index {idx} " if idx is not None else "" @@ -284,7 +289,7 @@ def get_numerical_jacobian_wrt_specific_input(fn, input_idx, inputs, outputs, ep for x, idx, d_idx in _iter_tensor(input): wrapped_fn = _with_prepare_inputs(fn, inputs, input_idx, x) input_to_perturb = x[idx] - nbhd_checks_fn = functools.partial(check_outputs_same_dtype_and_shape, idx=idx, eps=eps) + nbhd_checks_fn = functools.partial(_check_outputs_same_dtype_and_shape, idx=idx, eps=eps) jvp_fn = _get_numerical_jvp_fn(wrapped_fn, input_to_perturb, eps, nbhd_checks_fn) jacobian_cols[d_idx] = _compute_numerical_jvps_wrt_specific_input(jvp_fn, eps, x.is_complex(), is_forward_ad) return _combine_jacobian_cols(jacobian_cols, outputs, input, input.numel()) @@ -428,7 +433,7 @@ def _get_numerical_jvp_wrt_specific_input(fn, input_idx, inputs, u, eps, is_forw input = inputs[input_idx] input_to_perturb = _get_input_to_perturb(input) wrapped_fn = _with_prepare_inputs(fn, inputs, input_idx, input_to_perturb, True) - nbhd_checks_fn = functools.partial(check_outputs_same_dtype_and_shape, eps=eps) + nbhd_checks_fn = functools.partial(_check_outputs_same_dtype_and_shape, eps=eps) jvp_fn = _get_numerical_jvp_fn(wrapped_fn, input_to_perturb, eps, nbhd_checks_fn) u = _reshape_tensor_or_tuple(u, input_to_perturb.shape) u = _mul_tensor_or_tuple(u, eps) @@ -504,7 +509,7 @@ def _stack_and_check_tensors(list_of_list_of_tensors, inputs, If the test - manually invokes gradcheck/gradgradcheck, then call gradcheck/gradgradcheck with `nondet_tol=` as a keyword argument. -- is OpInfo-based (e.g., in test_ops.py), then modify the OpInfo for the test +- is OpInfo-based (e.g., in test_ops_gradients.py), then modify the OpInfo for the test to have `gradcheck_nondet_tol=`. 
- is a Module test (e.g., in common_nn.py), then modify the corresponding module_test entry to have `gradcheck_nondet_tol=` @@ -637,7 +642,7 @@ def _get_analytical_vjps_wrt_specific_output(vjp_fn, sample_output, v) -> List[L def _check_inputs(tupled_inputs, check_sparse_nnz) -> bool: - if not check_sparse_nnz and any(t.is_sparse for t in tupled_inputs if isinstance(t, torch.Tensor)): + if not check_sparse_nnz and any(t.is_sparse or t.is_sparse_csr for t in tupled_inputs if isinstance(t, torch.Tensor)): raise GradcheckError('gradcheck expects all tensor inputs are dense when check_sparse_nnz is set to False.') # Make sure that gradients are saved for at least one input any_input_requiring_grad = False @@ -649,7 +654,12 @@ def _check_inputs(tupled_inputs, check_sparse_nnz) -> bool: 'is not a double precision floating point or complex. ' 'This check will likely fail if all the inputs are ' 'not of double precision floating point or complex. ') - content = inp._values() if inp.is_sparse else inp + if inp.is_sparse: + content = inp._values() + elif inp.is_sparse_csr: + content = inp.values() + else: + content = inp # TODO: To cover more problematic cases, replace stride = 0 check with # "any overlap in memory" once we have a proper function to check it. if content.layout is not torch._mkldnn: # type: ignore[attr-defined] @@ -712,7 +722,7 @@ def _check_no_differentiable_outputs_fast(func, func_out, all_inputs, inputs_ind If the test - manually invokes gradcheck/gradgradcheck, then call gradcheck/gradgradcheck with `check_batched_grad=False` as a keyword argument. -- is OpInfo-based (e.g., in test_ops.py), then modify the OpInfo for the test +- is OpInfo-based (e.g., in test_ops_gradients.py), then modify the OpInfo for the test to have `check_batched_grad=False` and/or `check_batched_gradgrad=False`. If you're modifying an existing operator that supports batched grad computation, @@ -738,7 +748,7 @@ def _check_no_differentiable_outputs_fast(func, func_out, all_inputs, inputs_ind If the test - manually invokes gradcheck/gradgradcheck, then call gradcheck/gradgradcheck with `check_batched_forward_grad=False` as a keyword argument. -- is OpInfo-based (e.g., in test_ops.py), then modify the OpInfo for the test +- is OpInfo-based (e.g., in test_ops_gradients.py), then modify the OpInfo for the test to have `check_batched_forward_grad=False` """ @@ -1191,7 +1201,7 @@ def _adjusted_atol(atol, u, v): If the test - manually invokes gradcheck/gradgradcheck, then call gradcheck/gradgradcheck with `fast_mode=False` as a keyword argument. 
-- is OpInfo-based (e.g., in test_ops.py), then modify the OpInfo for the test +- is OpInfo-based (e.g., in test_ops_gradients.py), then modify the OpInfo for the test to have `gradcheck_fast_mode=False` - is a Module test (e.g., in common_nn.py), then modify the corresponding module_test entry to have `gradcheck_fast_mode=False` @@ -1521,16 +1531,21 @@ def gradgradcheck( tupled_inputs = _as_tuple(inputs) if grad_outputs is None: - # If grad_outputs is not specified, create random Tensors of the same - # shape, type, and device as the outputs - def randn_like(x): - y = torch.randn_like( - x if (x.is_floating_point() or x.is_complex()) else x.double(), memory_format=torch.legacy_contiguous_format) - if gen_non_contig_grad_outputs: - y = torch.testing.make_non_contiguous(y) - return y.requires_grad_() + # If grad_outputs is not specified, create random Tensors of the same shape, type, and device as the outputs + outputs = _as_tuple(func(*tupled_inputs)) - tupled_grad_outputs = tuple(randn_like(x) for x in outputs) + tupled_grad_outputs = tuple( + torch.testing.make_tensor( + x.shape, + dtype=x.dtype if x.is_floating_point() or x.is_complex() else torch.double, + device=x.device, + low=-1, + high=1, + requires_grad=True, + noncontiguous=gen_non_contig_grad_outputs, + ) + for x in outputs + ) else: tupled_grad_outputs = _as_tuple(grad_outputs) diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 91c8d40c0cd1..eb8c46f8f124 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -6,8 +6,9 @@ from torch.autograd import ( DeviceType, ProfilerActivity, ProfilerConfig, ProfilerState, kineto_available, _ProfilerResult, _disable_profiler, _enable_profiler, - _prepare_profiler, _supported_activities + _prepare_profiler, _supported_activities, _kineto_step, ) +from torch._C._autograd import _ExperimentalConfig import torch import torch.cuda from torch.futures import Future @@ -83,6 +84,10 @@ class profile(object): use_cpu (bool, optional): profile CPU events; setting to ``False`` requires ``use_kineto=True`` and can be used to lower the overhead for GPU-only profiling. + experimental_config (_ExperimentalConfig) : A set of experimental options + used by profiler libraries like Kineto. Note, backward compatibility is not guaranteed. + + .. warning: Enabling memory profiling or source attribution incurs additional profiler overhead @@ -127,7 +132,8 @@ def __init__( with_stack=False, with_modules=False, use_kineto=False, - use_cpu=True): + use_cpu=True, + experimental_config=None): self.enabled: bool = enabled if not self.enabled: return @@ -141,6 +147,9 @@ def __init__( self.with_stack = with_stack self.with_modules = with_modules self.use_cpu = use_cpu + if experimental_config is None: + experimental_config = _ExperimentalConfig() + self.experimental_config = experimental_config self.kineto_results: Optional[_ProfilerResult] = None if not self.use_cpu: @@ -175,7 +184,8 @@ def config(self): self.profile_memory, self.with_stack, self.with_flops, - self.with_modules) + self.with_modules, + self.experimental_config) def __enter__(self): if not self.enabled: @@ -569,7 +579,8 @@ def __enter__(self): False, False, False, - False), + False, + _ExperimentalConfig()), set() ) return self @@ -664,3 +675,10 @@ def parse_nvprof_trace(path): functions.sort(key=lambda evt: evt.time_range.start) return functions + + +def kineto_step(): + """ Notify kineto so it is aware of iteration boundaries for asynchronous + trace requests. 
+ """ + _kineto_step() diff --git a/torch/autograd/profiler_legacy.py b/torch/autograd/profiler_legacy.py index 445decf333e5..0211ec8a2809 100644 --- a/torch/autograd/profiler_legacy.py +++ b/torch/autograd/profiler_legacy.py @@ -55,7 +55,10 @@ def config(self): self.profile_memory, self.with_stack, self.with_flops, - self.with_modules) + self.with_modules, + # avoid exposing _ExperimentalConfig this in legacy public API + torch._C._autograd._ExperimentalConfig(), + ) def __enter__(self): if not self.enabled: diff --git a/torch/autograd/profiler_util.py b/torch/autograd/profiler_util.py index 6062c097b253..dc505fbc210a 100644 --- a/torch/autograd/profiler_util.py +++ b/torch/autograd/profiler_util.py @@ -642,6 +642,7 @@ def _filter_name(name): filtered_out_names = [ MEMORY_EVENT_NAME, # used only for the top-level memory events "profiler::_record_function_enter", + "profiler::_record_function_enter_new", "profiler::_record_function_exit", "aten::is_leaf", "aten::output_nr", diff --git a/torch/backends/_coreml/preprocess.py b/torch/backends/_coreml/preprocess.py index 7f27e60e5acb..3884058cd0ec 100644 --- a/torch/backends/_coreml/preprocess.py +++ b/torch/backends/_coreml/preprocess.py @@ -1,7 +1,6 @@ import hashlib import json -from dataclasses import dataclass, astuple, field -from typing import Dict, Tuple, List +from typing import Dict, Tuple import coremltools as ct # type: ignore[import] import torch @@ -35,86 +34,56 @@ class CoreMLComputeUnit: ALL = "all" -@dataclass -class _TensorSpec: - shape: List[int] = field(default_factory=List[int]) - dtype: int = ScalarType.Float - - -def TensorSpec(*args, **kwargs): - """ - TensorSpec specifies the tensor information. The default dtype is float32 - Example: - ts = TensorSpec( - shape = [1, 3, 224, 224], - dtype = ScalarType.Float - ) - """ - return astuple(_TensorSpec(*args, **kwargs)) - - -@dataclass -class _CompileSpec: - inputs: Tuple[_TensorSpec] = () # type: ignore[assignment] - outputs: Tuple[_TensorSpec] = () # type: ignore[assignment] - backend: str = CoreMLComputeUnit.CPU - allow_low_precision: bool = True - - -def CompileSpec(*args, **kwargs): - """ - CompileSpec specifies the model information. 
- Example: - cs = CompileSpec( - inputs=( - TensorSpec( - shape=[1, 3, 224, 224], - ), - ), - outputs=( - TensorSpec( - shape=[1, 1000], - ), - ), - backend=CoreMLComputeUnit.CPU, - allow_low_precision=True, - ), - """ - return astuple(_CompileSpec(*args, **kwargs)) - - -def _convert_to_mil_type(spec: _TensorSpec, name: str): - ml_type = TensorType(shape=spec.shape, dtype=torch_to_mil_types[spec.dtype]) +def TensorSpec(shape, dtype=ScalarType.Float): + return (shape, dtype) + + +def CompileSpec(inputs, outputs, backend=CoreMLComputeUnit.CPU, allow_low_precision=True): + return (inputs, outputs, backend, allow_low_precision) + + +def _check_enumerated_shape(shape): + for s in shape: + if not isinstance(s, (list, tuple)): + return False + return True + + +def _convert_to_mil_type(shape, dtype, name: str): + mil_shape = shape + if _check_enumerated_shape(shape): + mil_shape = ct.EnumeratedShapes(shape) + ml_type = TensorType(shape=mil_shape, dtype=torch_to_mil_types[dtype]) ml_type.name = name return ml_type def preprocess(script_module: torch._C.ScriptObject, compile_spec: Dict[str, Tuple]): spec = compile_spec["forward"] - forward_spec = _CompileSpec(*spec) + input_specs, output_specs, backend, allow_low_precision = spec mil_inputs = [] inputs = [] - for index, input_spec in enumerate(forward_spec.inputs): - input_spec = _TensorSpec(*input_spec) # type: ignore[misc] + for index, input in enumerate(input_specs): + shape, dtype = input name = "input_" + str(index) - inputs.append([name, str(input_spec.dtype), str(input_spec.shape)]) - ml_type = _convert_to_mil_type(input_spec, name) + inputs.append([name, str(dtype), str(shape)]) + ml_type = _convert_to_mil_type(shape, dtype, name) mil_inputs.append(ml_type) model = torch.jit.RecursiveScriptModule._construct(script_module, lambda x: None) mlmodel = ct.convert(model, inputs=mil_inputs) spec = mlmodel.get_spec() - output_specs = forward_spec.outputs assert len(spec.description.output) == len(output_specs) # type: ignore[attr-defined] outputs = [] - for index, output_spec in enumerate(output_specs): - output_spec = _TensorSpec(*output_spec) # type: ignore[misc] + for index, output in enumerate(output_specs): + shape, dtype = output name = spec.description.output[index].name # type: ignore[attr-defined] - outputs.append([name, str(output_spec.dtype), str(output_spec.shape)]) + outputs.append([name, str(dtype), str(shape)]) mlmodel = ct.models.model.MLModel(spec) + print(mlmodel) config = { "spec_ver": str(spec.specificationVersion), # type: ignore[attr-defined] - "backend": forward_spec.backend, - "allow_low_precision": str(forward_spec.allow_low_precision), + "backend": backend, + "allow_low_precision": str(allow_low_precision), } metadata = { "coremltool_ver": mlmodel.user_defined_metadata[CT_METADATA_VERSION], diff --git a/torch/backends/_nnapi/serializer.py b/torch/backends/_nnapi/serializer.py index d29b5987295c..4bbf9b5e8530 100644 --- a/torch/backends/_nnapi/serializer.py +++ b/torch/backends/_nnapi/serializer.py @@ -1549,11 +1549,28 @@ def add_adaptive_avg_pool2d(self, node): self.add_operation(NNAPI_OperationCode.AVERAGE_POOL_2D, inputs, outputs) def add_upsample_nearest2d(self, node): - assert node.inputsSize() == 3 + assert node.inputsSize() == 3 or node.inputsSize() == 4 assert node.outputsSize() == 1 - image, size_jit, scale_jit = node.inputs() + if node.inputsSize() == 3: + image, size_jit, scale_jit = node.inputs() + else: + image, size_jit, scale_h_jit, scale_w_jit = node.inputs() size_ctype, size_arg = 
self.get_constant_value(size_jit) - scale_ctype, scale_arg = self.get_constant_value(scale_jit) + + if node.inputsSize() == 3: + scale_ctype, scale_arg = self.get_constant_value(scale_jit) + else: + scale_h_ctype, scale_h_arg = self.get_constant_value(scale_h_jit) + scale_w_ctype, scale_w_arg = self.get_constant_value(scale_w_jit) + + # The only way for the 4-argument overload of upsample_nearest2d to + # have been added to the graph without error is if the scale_h and + # scale_w arguments are None + assert scale_h_ctype.kind() == "NoneType" + assert scale_w_ctype.kind() == "NoneType" + + scale_ctype = scale_h_ctype + scale_arg = scale_h_arg image_id, image_oper = self.get_tensor_operand_by_jitval(image) assert len(image_oper.shape) == 4 diff --git a/torch/backends/cudnn/__init__.py b/torch/backends/cudnn/__init__.py index 4f05e06225bd..d89049b5f3ca 100644 --- a/torch/backends/cudnn/__init__.py +++ b/torch/backends/cudnn/__init__.py @@ -133,3 +133,4 @@ def __init__(self, m, name): enabled: bool deterministic: bool benchmark: bool +allow_tf32: bool diff --git a/torch/backends/mps/__init__.py b/torch/backends/mps/__init__.py new file mode 100644 index 000000000000..b0b9f90ac77a --- /dev/null +++ b/torch/backends/mps/__init__.py @@ -0,0 +1,13 @@ +import sys +import torch + +def is_built(): + r"""Returns whether PyTorch is built with MPS support. Note that this + doesn't necessarily mean MPS is available; just that if this PyTorch + binary were run on a machine with working MPS drivers and devices, we + would be able to use it.""" + return torch._C.has_mps + +def is_available(): + r"""Returns a bool indicating if MPS is currently available.""" + return torch._C._is_mps_available diff --git a/torch/backends/quantized/__init__.py b/torch/backends/quantized/__init__.py index a24d88bcc6e6..6f7d479e90c4 100644 --- a/torch/backends/quantized/__init__.py +++ b/torch/backends/quantized/__init__.py @@ -11,6 +11,8 @@ def _get_qengine_id(qengine: str) -> int: ret = 1 elif qengine == 'qnnpack': ret = 2 + elif qengine == 'onednn': + ret = 3 else: ret = -1 raise RuntimeError("{} is not a valid value for quantized engine".format(qengine)) @@ -18,7 +20,7 @@ # This function should correspond to the enums present in c10/core/QEngine.h def _get_qengine_str(qengine: int) -> str: - all_engines = {0 : 'none', 1 : 'fbgemm', 2 : 'qnnpack'} + all_engines = {0 : 'none', 1 : 'fbgemm', 2 : 'qnnpack', 3 : 'onednn'} return all_engines.get(qengine, '*undefined') class _QEngineProp(object): diff --git a/torch/cpu/amp/autocast_mode.py b/torch/cpu/amp/autocast_mode.py index 49ffb5c11b42..03cbcdcda0fc 100644 --- a/torch/cpu/amp/autocast_mode.py +++ b/torch/cpu/amp/autocast_mode.py @@ -1,7 +1,7 @@ import torch from typing import Any -class autocast(torch.autocast_mode.autocast): +class autocast(torch.amp.autocast_mode.autocast): r""" See :class:`torch.autocast`.
``torch.cpu.amp.autocast(args...)`` is equivalent to ``torch.autocast("cpu", args...)`` diff --git a/torch/csrc/DynamicTypes.cpp b/torch/csrc/DynamicTypes.cpp index a2bf143aede5..502bb0fa29b0 100644 --- a/torch/csrc/DynamicTypes.cpp +++ b/torch/csrc/DynamicTypes.cpp @@ -66,7 +66,7 @@ PyTypeObject* getPyTypeObject(const at::Storage& storage) { scalarType); auto it = attype_to_py_storage_type.find(attype); TORCH_INTERNAL_ASSERT(it != attype_to_py_storage_type.end(), - "Failed to get the Python type of `UntypedStorage`."); + "Failed to get the Python type of `_UntypedStorage`."); return it->second; } } // namespace @@ -115,10 +115,10 @@ PyTypeObject* loadTypedStorageTypeObject() { PyObject* storage_module = PyImport_ImportModule("torch.storage"); TORCH_INTERNAL_ASSERT(storage_module && PyModule_Check(storage_module)); - PyObject* typed_storage_obj = PyObject_GetAttrString(storage_module, "TypedStorage"); + PyObject* typed_storage_obj = PyObject_GetAttrString(storage_module, "_TypedStorage"); TORCH_INTERNAL_ASSERT(typed_storage_obj && PyType_Check(typed_storage_obj)); return reinterpret_cast( - PyObject_GetAttrString(storage_module, "TypedStorage")); + PyObject_GetAttrString(storage_module, "_TypedStorage")); } PyTypeObject* getTypedStorageTypeObject() { @@ -169,7 +169,7 @@ at::Storage createStorageGetType(PyObject* obj, at::ScalarType& scalar_type, boo } if (obj_type == storage_type) { auto& type = *item.second; - // UntypedStorage should always be interpreted with byte dtype + // _UntypedStorage should always be interpreted with byte dtype scalar_type = at::kByte; return type.unsafeStorageFromTH(((THPVoidStorage*)obj)->cdata, true); } diff --git a/torch/csrc/Exceptions.cpp b/torch/csrc/Exceptions.cpp index 8bf89ae7cbd7..2eb8ae41898e 100644 --- a/torch/csrc/Exceptions.cpp +++ b/torch/csrc/Exceptions.cpp @@ -43,26 +43,6 @@ could not be completed because the input matrix is singular.", PyExc_RuntimeErro namespace torch { -static bool compute_cpp_stack_traces_enabled() { - auto envar = std::getenv("TORCH_SHOW_CPP_STACKTRACES"); - if (envar) { - if (strcmp(envar, "0") == 0) { - return false; - } - if (strcmp(envar, "1") == 0) { - return true; - } - TORCH_WARN("ignoring invalid value for TORCH_SHOW_CPP_STACKTRACES: ", envar, - " valid values are 0 or 1."); - } - return false; -} - -bool get_cpp_stacktraces_enabled() { - static bool enabled = compute_cpp_stack_traces_enabled(); - return enabled; -} - void replaceAll(std::string & str, const std::string & old_str, const std::string & new_str) { diff --git a/torch/csrc/Exceptions.h b/torch/csrc/Exceptions.h index 348dba3de064..4a644a0b45e8 100644 --- a/torch/csrc/Exceptions.h +++ b/torch/csrc/Exceptions.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -105,6 +106,11 @@ static inline void PyErr_SetString(PyObject* type, const std::string& message) { auto msg = torch::processErrorMsg(e.what()); \ PyErr_SetString(PyExc_TimeoutError, msg); \ retstmnt; \ + } \ + catch (const c10d::C10dError& e) { \ + auto msg = torch::processErrorMsg(e.what()); \ + PyErr_SetString(PyExc_RuntimeError, msg); \ + retstmnt; \ } #else #define CATCH_C10D_ERRORS(retstmnt) @@ -275,8 +281,6 @@ TORCH_PYTHON_API void translate_exception_to_python(const std::exception_ptr &); TORCH_PYTHON_API std::string processErrorMsg(std::string str); -TORCH_PYTHON_API bool get_cpp_stacktraces_enabled(); - // Abstract base class for exceptions which translate to specific Python types struct PyTorchError : public std::exception { // 
NOLINTNEXTLINE(modernize-pass-by-value) diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index faf132753c25..186dab63c4f1 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -55,16 +55,15 @@ #include #include #include -#include #include #include +#include #include #include #include #include #include #include -#include #include #ifdef USE_DISTRIBUTED @@ -76,10 +75,6 @@ #endif #endif -#if defined(USE_MLCOMPUTE) -#include -#endif - #if defined(USE_VALGRIND) #include #endif @@ -398,6 +393,27 @@ PyObject *THPModule_allowTF32CuDNN(PyObject *_unused, PyObject *noargs) else Py_RETURN_FALSE; } +PyObject *THPModule_setFloat32MatmulPrecision(PyObject *_unused, PyObject *arg) +{ + THPUtils_assert(THPUtils_checkString(arg), "set_float32_matmul_precision expects a str, " + "but got %s", THPUtils_typename(arg)); + std::string s = THPUtils_unpackString(arg); + at::globalContext().setFloat32MatmulPrecision(s); + Py_RETURN_NONE; +} + +PyObject *THPModule_float32MatmulPrecision(PyObject *_unused, PyObject *noargs) +{ + std::string s = "highest"; + auto p = at::globalContext().float32MatmulPrecision(); + if (p == at::Float32MatmulPrecision::HIGH) { + s = "high"; + } else if (p == at::Float32MatmulPrecision::MEDIUM) { + s = "medium"; + } + return THPUtils_packString(s); +} + PyObject *THPModule_setUserEnabledCuDNN(PyObject *_unused, PyObject *arg) { THPUtils_assert(PyBool_Check(arg), "set_enabled_cudnn expects a bool, " @@ -588,11 +604,10 @@ PyObject *THPModule_supportedQEngines(PyObject *_unused, PyObject *noargs) { auto qengines = at::globalContext().supportedQEngines(); auto list = THPObjectPtr(PyList_New(qengines.size())); + if (!list) return nullptr; for (const auto i : c10::irange(qengines.size())) { PyObject *i64 = THPUtils_packInt64(static_cast(qengines[i])); - if (!i64) { - throw python_error(); - } + if (!i64) return nullptr; PyList_SET_ITEM(list.get(), i, i64); } return list.release(); @@ -606,22 +621,18 @@ PyObject *THPModule_isEnabledXNNPACK(PyObject *_unused, PyObject *noargs) PyObject *THPModule_setDefaultMobileCPUAllocator(PyObject *_unused, PyObject *noargs) { - try { - at::globalContext().setDefaultMobileCPUAllocator(); - } catch (c10::Error& e) { - THPUtils_setError(e.what()); - } + HANDLE_TH_ERRORS + at::globalContext().setDefaultMobileCPUAllocator(); Py_RETURN_NONE; + END_HANDLE_TH_ERRORS } PyObject *THPModule_unsetDefaultMobileCPUAllocator(PyObject *_unused, PyObject *noargs) { - try { - at::globalContext().unsetDefaultMobileCPUAllocator(); - } catch (c10::Error& e) { - THPUtils_setError(e.what()); - } + HANDLE_TH_ERRORS + at::globalContext().unsetDefaultMobileCPUAllocator(); Py_RETURN_NONE; + END_HANDLE_TH_ERRORS } static PyObject * THPModule_vmapmode_increment_nesting(PyObject* _unused, PyObject *arg) { @@ -696,6 +707,8 @@ static PyMethodDef TorchMethods[] = { {"_set_warnAlways", THPModule_setWarnAlways, METH_O, nullptr}, {"_get_cublas_allow_tf32", THPModule_allowTF32CuBLAS, METH_NOARGS, nullptr}, {"_set_cublas_allow_tf32", THPModule_setAllowTF32CuBLAS, METH_O, nullptr}, + {"_get_float32_matmul_precision", THPModule_float32MatmulPrecision, METH_NOARGS, nullptr}, + {"_set_float32_matmul_precision", THPModule_setFloat32MatmulPrecision, METH_O, nullptr}, {"_get_cublas_allow_fp16_reduced_precision_reduction", THPModule_allowFP16ReductionCuBLAS, METH_NOARGS, nullptr}, {"_set_cublas_allow_fp16_reduced_precision_reduction", THPModule_setAllowFP16ReductionCuBLAS, METH_O, nullptr}, {"_vmapmode_increment_nesting", THPModule_vmapmode_increment_nesting, METH_NOARGS, 
nullptr}, @@ -715,6 +728,7 @@ static PyMethodDef TorchMethods[] = { {"_unset_default_mobile_cpu_allocator", THPModule_unsetDefaultMobileCPUAllocator, METH_NOARGS, nullptr}, {"_is_torch_function_enabled", THPModule_isEnabledTorchFunction, METH_NOARGS, nullptr}, {"_disabled_torch_function_impl", THPModule_disable_torch_function, METH_VARARGS, nullptr}, + {"_disabled_torch_dispatch_impl", THPModule_disable_torch_dispatch, METH_VARARGS, nullptr}, {"_has_torch_function", THPModule_has_torch_function, METH_O, nullptr}, {"_has_torch_function_unary", THPModule_has_torch_function_unary, METH_O, nullptr}, {"_has_torch_function_variadic", MAYBE_WRAP_FASTCALL(THPModule_has_torch_function_variadic), MAYBE_METH_FASTCALL, nullptr}, @@ -735,15 +749,6 @@ void initModule(PyObject *module); }} // namespace torch::cuda #endif -#ifdef USE_MLCOMPUTE -PyMethodDef* ModuleMLC_methods(); -namespace torch { namespace mlc { - -void initBindings(PyObject *module); - -}} // namespace torch::mlc -#endif - bool THDPByteStorage_init(PyObject *module); static std::vector methods; @@ -780,6 +785,9 @@ TORCH_API PyObject* initModule(); // separate decl and defn for msvc error C2491 PyObject* initModule() { HANDLE_TH_ERRORS + + c10::initLogging(); + at::internal::lazy_init_num_threads(); C10_LOG_API_USAGE_ONCE("torch.python.import"); @@ -794,9 +802,6 @@ PyObject* initModule() { #ifdef USE_CUDA THPUtils_addPyMethodDefs(methods, THCPModule_methods()); #endif -#ifdef USE_MLCOMPUTE - THPUtils_addPyMethodDefs(methods, ModuleMLC_methods()); -#endif #if defined(USE_DISTRIBUTED) && defined(USE_C10D) THPUtils_addPyMethodDefs(methods, torch::distributed::c10d::python_functions()); #ifndef _WIN32 @@ -836,7 +841,6 @@ PyObject* initModule() { torch::monitor::initMonitorBindings(module); torch::impl::dispatch::initDispatchBindings(module); torch::throughput_benchmark::initThroughputBenchmarkBindings(module); - torch::crash_handler::initCrashHandlerBindings(module); torch::autograd::initReturnTypes(module); torch::autograd::initNNFunctions(module); torch::autograd::initFFTFunctions(module); @@ -845,11 +849,9 @@ PyObject* initModule() { torch::autograd::initSpecialFunctions(module); torch::autograd::init_legacy_variable(module); torch::python::init_bindings(module); + torch::lazy::initLazyBindings(module); #ifdef USE_CUDA torch::cuda::initModule(module); -#endif -#ifdef USE_MLCOMPUTE - torch::mlc::init_bindings(module); #endif ASSERT_TRUE(THPByteStorage_init(module)); @@ -892,10 +894,6 @@ PyObject* initModule() { // Automatically translate errors thrown from pybind11 functions py::register_exception_translator([](std::exception_ptr e) { // NOLINT - if (torch::crash_handler::is_enabled_on_exceptions()) { - torch::crash_handler::write_minidump(); - } - try { if (e) { std::rethrow_exception(e); @@ -1019,15 +1017,16 @@ Call this whenever a new thread is created in order to propagate values from #else PyObject *has_cuda = Py_False; #endif -#ifdef USE_MLCOMPUTE - PyObject *has_mlc = Py_True; + +#ifdef USE_MPS + PyObject *has_mps = Py_True; #else - PyObject *has_mlc = Py_False; + PyObject *has_mps = Py_False; #endif - ASSERT_TRUE(set_module_attr("has_mlc", has_mlc)); - ASSERT_TRUE(set_module_attr("has_cuda", has_cuda)); + ASSERT_TRUE(set_module_attr("has_mps", has_mps)); + ASSERT_TRUE(set_module_attr("_is_mps_available", at::hasMPS() ? Py_True : Py_False)); ASSERT_TRUE(set_module_attr("has_mkldnn", at::hasMKLDNN() ? 
Py_True : Py_False)); @@ -1060,6 +1059,13 @@ Call this whenever a new thread is created in order to propagate values from #endif #undef SET_STR_DEFINE + py_module.def("_set_conj", [](const at::Tensor & x, bool conj) { + x._set_conj(conj); + }); + py_module.def("_set_neg", [](const at::Tensor & x, bool neg) { + x._set_neg(neg); + }); + const auto& defaultGenerator = at::detail::getDefaultCPUGenerator(); THPDefaultCPUGenerator = (THPGenerator*)THPGenerator_initDefaultGenerator(defaultGenerator); // This reference is meant to be given away, so no need to incref here. @@ -1067,6 +1073,8 @@ Call this whenever a new thread is created in order to propagate values from ASSERT_TRUE(set_module_attr("DisableTorchFunction", (PyObject*)THPModule_DisableTorchFunctionType(), /* incref= */ false)); torch::set_disabled_torch_function_impl(PyObject_GetAttrString(module, "_disabled_torch_function_impl")); ASSERT_TRUE(torch::disabled_torch_function_impl() != nullptr); + torch::set_disabled_torch_dispatch_impl(PyObject_GetAttrString(module, "_disabled_torch_dispatch_impl")); + ASSERT_TRUE(torch::disabled_torch_dispatch_impl() != nullptr); return module; END_HANDLE_TH_ERRORS } diff --git a/torch/csrc/THCGenerateByteType.h b/torch/csrc/THCGenerateByteType.h index e2c5f35d9118..23648de8025f 100644 --- a/torch/csrc/THCGenerateByteType.h +++ b/torch/csrc/THCGenerateByteType.h @@ -1,5 +1,5 @@ #ifndef THC_GENERIC_FILE -#error "You must define THC_GENERIC_FILE before including THGenerateByteType.h" +#error "You must define THC_GENERIC_FILE before including THCGenerateByteType.h" #endif #define scalar_t uint8_t diff --git a/torch/csrc/TypeInfo.cpp b/torch/csrc/TypeInfo.cpp index b75f4fee4236..08fd03236428 100644 --- a/torch/csrc/TypeInfo.cpp +++ b/torch/csrc/TypeInfo.cpp @@ -21,7 +21,7 @@ PyObject* THPFInfo_New(const at::ScalarType& type) { if (!self) throw python_error(); auto self_ = reinterpret_cast(self.get()); - self_->type = c10::toValueType(type); + self_->type = c10::toRealValueType(type); return self.release(); } diff --git a/torch/csrc/api/include/torch/data/example.h b/torch/csrc/api/include/torch/data/example.h index b43ef2ca1955..f302cdd9ed87 100644 --- a/torch/csrc/api/include/torch/data/example.h +++ b/torch/csrc/api/include/torch/data/example.h @@ -25,7 +25,7 @@ namespace example { using NoTarget = void; } // namespace example -/// A specialization for `Example` that does not have have a target. +/// A specialization for `Example` that does not have a target. /// /// This class exists so that code can be written for a templated `Example` /// type, and work both for labeled and unlabeled datasets. 
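Aside: the hunks above wire up an MPS query surface on both sides of the binding, with torch.backends.mps.is_built() / is_available() in Python (see the new torch/backends/mps/__init__.py earlier in this diff) backed by torch._C._is_mps_available and the has_mps attribute registered in Module.cpp. A minimal usage sketch, assuming a build that includes these changes; the "mps" device string is an assumption for illustration and is not introduced by this diff:

import torch

# Prefer MPS when this build has it compiled in and a usable device is present;
# otherwise fall back to CPU. Both helpers come from torch.backends.mps above.
if torch.backends.mps.is_built() and torch.backends.mps.is_available():
    device = torch.device("mps")  # assumed device string, for illustration only
else:
    device = torch.device("cpu")

x = torch.randn(4, 4, device=device)  # tensor allocated on the selected device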
diff --git a/torch/csrc/api/include/torch/fft.h b/torch/csrc/api/include/torch/fft.h index 23ecbf1be0c6..71a3146c990f 100644 --- a/torch/csrc/api/include/torch/fft.h +++ b/torch/csrc/api/include/torch/fft.h @@ -44,7 +44,7 @@ inline Tensor ifft(const Tensor& self, /// torch::fft::fft2(t); /// ``` inline Tensor fft2(const Tensor& self, - c10::optional s=c10::nullopt, + OptionalIntArrayRef s=c10::nullopt, IntArrayRef dim={-2, -1}, c10::optional norm=c10::nullopt) { return torch::fft_fft2(self, s, dim, norm); @@ -59,7 +59,7 @@ inline Tensor fft2(const Tensor& self, /// torch::fft::ifft2(t); /// ``` inline Tensor ifft2(const Tensor& self, - c10::optional s=c10::nullopt, + at::OptionalIntArrayRef s=c10::nullopt, IntArrayRef dim={-2, -1}, c10::optional norm=c10::nullopt) { return torch::fft_ifft2(self, s, dim, norm); @@ -74,8 +74,8 @@ inline Tensor ifft2(const Tensor& self, /// torch::fft::fftn(t); /// ``` inline Tensor fftn(const Tensor& self, - c10::optional s=c10::nullopt, - c10::optional dim=c10::nullopt, + at::OptionalIntArrayRef s=c10::nullopt, + at::OptionalIntArrayRef dim=c10::nullopt, c10::optional norm=c10::nullopt) { return torch::fft_fftn(self, s, dim, norm); } @@ -89,8 +89,8 @@ inline Tensor fftn(const Tensor& self, /// torch::fft::ifftn(t); /// ``` inline Tensor ifftn(const Tensor& self, - c10::optional s=c10::nullopt, - c10::optional dim=c10::nullopt, + at::OptionalIntArrayRef s=c10::nullopt, + at::OptionalIntArrayRef dim=c10::nullopt, c10::optional norm=c10::nullopt) { return torch::fft_ifftn(self, s, dim, norm); } @@ -138,7 +138,7 @@ inline Tensor irfft(const Tensor& self, /// torch::fft::rfft2(t); /// ``` inline Tensor rfft2(const Tensor& self, - c10::optional s=c10::nullopt, + at::OptionalIntArrayRef s=c10::nullopt, IntArrayRef dim={-2, -1}, c10::optional norm=c10::nullopt) { return torch::fft_rfft2(self, s, dim, norm); @@ -153,7 +153,7 @@ inline Tensor rfft2(const Tensor& self, /// torch::fft::irfft2(t); /// ``` inline Tensor irfft2(const Tensor& self, - c10::optional s=c10::nullopt, + at::OptionalIntArrayRef s=c10::nullopt, IntArrayRef dim={-2, -1}, c10::optional norm=c10::nullopt) { return torch::fft_irfft2(self, s, dim, norm); @@ -168,8 +168,8 @@ inline Tensor irfft2(const Tensor& self, /// torch::fft::rfftn(t); /// ``` inline Tensor rfftn(const Tensor& self, - c10::optional s=c10::nullopt, - c10::optional dim=c10::nullopt, + at::OptionalIntArrayRef s=c10::nullopt, + at::OptionalIntArrayRef dim=c10::nullopt, c10::optional norm=c10::nullopt) { return torch::fft_rfftn(self, s, dim, norm); } @@ -183,8 +183,8 @@ inline Tensor rfftn(const Tensor& self, /// torch::fft::irfftn(t); /// ``` inline Tensor irfftn(const Tensor& self, - c10::optional s=c10::nullopt, - c10::optional dim=c10::nullopt, + at::OptionalIntArrayRef s=c10::nullopt, + at::OptionalIntArrayRef dim=c10::nullopt, c10::optional norm=c10::nullopt) { return torch::fft_irfftn(self, s, dim, norm); } @@ -238,7 +238,7 @@ inline Tensor ihfft(const Tensor& self, /// assert(T.is_floating_point() && T.numel() == 128 * 128); /// ``` inline Tensor hfft2(const Tensor& self, - c10::optional s=c10::nullopt, + at::OptionalIntArrayRef s=c10::nullopt, IntArrayRef dim={-2, -1}, c10::optional norm=c10::nullopt) { return torch::fft_hfft2(self, s, dim, norm); @@ -256,7 +256,7 @@ inline Tensor hfft2(const Tensor& self, /// assert(t.is_complex() && t.size(1) == 65); /// ``` inline Tensor ihfft2(const Tensor& self, - c10::optional s=c10::nullopt, + at::OptionalIntArrayRef s=c10::nullopt, IntArrayRef dim={-2, -1}, c10::optional 
norm=c10::nullopt) { return torch::fft_ihfft2(self, s, dim, norm); @@ -274,7 +274,7 @@ inline Tensor ihfft2(const Tensor& self, /// assert(T.is_floating_point() && T.numel() == 128 * 128); /// ``` inline Tensor hfftn(const Tensor& self, - c10::optional s=c10::nullopt, + at::OptionalIntArrayRef s=c10::nullopt, IntArrayRef dim={-2, -1}, c10::optional norm=c10::nullopt) { return torch::fft_hfftn(self, s, dim, norm); @@ -292,7 +292,7 @@ inline Tensor hfftn(const Tensor& self, /// assert(t.is_complex() && t.size(1) == 65); /// ``` inline Tensor ihfftn(const Tensor& self, - c10::optional s=c10::nullopt, + at::OptionalIntArrayRef s=c10::nullopt, IntArrayRef dim={-2, -1}, c10::optional norm=c10::nullopt) { return torch::fft_ihfftn(self, s, dim, norm); @@ -341,7 +341,7 @@ inline Tensor rfftfreq(int64_t n, const TensorOptions& options) { /// auto x = torch::randn({127, 4}); /// auto centred_fft = torch::fft::fftshift(torch::fft::fftn(x)); /// ``` -inline Tensor fftshift(const Tensor& x, c10::optional dim=c10::nullopt) { +inline Tensor fftshift(const Tensor& x, at::OptionalIntArrayRef dim=c10::nullopt) { return torch::fft_fftshift(x, dim); } @@ -356,7 +356,7 @@ inline Tensor fftshift(const Tensor& x, c10::optional dim=c10::null /// auto unshift = torch::fft::ifftshift(shift); /// assert(torch::allclose(x, unshift)); /// ``` -inline Tensor ifftshift(const Tensor& x, c10::optional dim=c10::nullopt) { +inline Tensor ifftshift(const Tensor& x, at::OptionalIntArrayRef dim=c10::nullopt) { return torch::fft_ifftshift(x, dim); } diff --git a/torch/csrc/api/include/torch/linalg.h b/torch/csrc/api/include/torch/linalg.h index e16c1f61e503..fe015c8320f3 100644 --- a/torch/csrc/api/include/torch/linalg.h +++ b/torch/csrc/api/include/torch/linalg.h @@ -76,6 +76,14 @@ inline std::tuple lu_factor_out(Tensor& LU, Tensor& pivots, co return torch::linalg_lu_factor_out(LU, pivots, self, pivot); } +inline std::tuple lu(const Tensor& self, const bool pivot) { + return torch::linalg_lu(self, pivot); +} + +inline std::tuple lu_out(Tensor& P, Tensor& L, Tensor& U, const Tensor& self, const bool pivot) { + return torch::linalg_lu_out(P, L, U, self, pivot); +} + inline std::tuple lstsq(const Tensor& self, const Tensor& b, c10::optional cond, c10::optional driver) { return torch::linalg_lstsq(self, b, cond, driver); } @@ -84,27 +92,27 @@ inline Tensor matrix_exp(const Tensor& self) { return torch::linalg_matrix_exp(self); } -inline Tensor norm(const Tensor& self, const optional& opt_ord, optional opt_dim, bool keepdim, optional opt_dtype) { +inline Tensor norm(const Tensor& self, const optional& opt_ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { return torch::linalg_norm(self, opt_ord, opt_dim, keepdim, opt_dtype); } -inline Tensor norm(const Tensor& self, c10::string_view ord, optional opt_dim, bool keepdim, optional opt_dtype) { +inline Tensor norm(const Tensor& self, c10::string_view ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { return torch::linalg_norm(self, ord, opt_dim, keepdim, opt_dtype); } -inline Tensor& norm_out(Tensor& result, const Tensor& self, const optional& opt_ord, optional opt_dim, bool keepdim, optional opt_dtype) { +inline Tensor& norm_out(Tensor& result, const Tensor& self, const optional& opt_ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { return torch::linalg_norm_out(result, self, opt_ord, opt_dim, keepdim, opt_dtype); } -inline Tensor& norm_out(Tensor& result, const Tensor& self, c10::string_view ord, optional opt_dim, bool keepdim, 
optional opt_dtype) { +inline Tensor& norm_out(Tensor& result, const Tensor& self, c10::string_view ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { return torch::linalg_norm_out(result, self, ord, opt_dim, keepdim, opt_dtype); } -inline Tensor vector_norm(const Tensor& self, Scalar ord, optional opt_dim, bool keepdim, optional opt_dtype) { +inline Tensor vector_norm(const Tensor& self, Scalar ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { return torch::linalg_vector_norm(self, ord, opt_dim, keepdim, opt_dtype); } -inline Tensor& vector_norm_out(Tensor& result, const Tensor& self, Scalar ord, optional opt_dim, bool keepdim, optional opt_dtype) { +inline Tensor& vector_norm_out(Tensor& result, const Tensor& self, Scalar ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { return torch::linalg_vector_norm_out(result, self, ord, opt_dim, keepdim, opt_dtype); } @@ -228,11 +236,11 @@ inline Tensor& tensorinv_out(Tensor& result,const Tensor& self, int64_t ind) { return torch::linalg_tensorinv_out(result, self, ind); } -inline Tensor tensorsolve(const Tensor& self, const Tensor& other, optional dims) { +inline Tensor tensorsolve(const Tensor& self, const Tensor& other, OptionalIntArrayRef dims) { return torch::linalg_tensorsolve(self, other, dims); } -inline Tensor& tensorsolve_out(Tensor& result, const Tensor& self, const Tensor& other, optional dims) { +inline Tensor& tensorsolve_out(Tensor& result, const Tensor& self, const Tensor& other, OptionalIntArrayRef dims) { return torch::linalg_tensorsolve_out(result, self, other, dims); } @@ -354,26 +362,26 @@ inline Tensor matrix_exp(const Tensor& input) { } // C10_DEPRECATED_MESSAGE("linalg_norm is deprecated, use norm instead.") -inline Tensor linalg_norm(const Tensor& self, const optional& opt_ord, optional opt_dim, bool keepdim, optional opt_dtype) { +inline Tensor linalg_norm(const Tensor& self, const optional& opt_ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { return detail::norm(self, opt_ord, opt_dim, keepdim, opt_dtype); } // C10_DEPRECATED_MESSAGE("linalg_norm is deprecated, use norm instead.") -inline Tensor linalg_norm(const Tensor& self, c10::string_view ord, optional opt_dim, bool keepdim, optional opt_dtype) { +inline Tensor linalg_norm(const Tensor& self, c10::string_view ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { return detail::norm(self, ord, opt_dim, keepdim, opt_dtype); } // C10_DEPRECATED_MESSAGE("linalg_norm_out is deprecated, use norm_out instead.") -inline Tensor& linalg_norm_out(Tensor& result, const Tensor& self, const optional& opt_ord, optional opt_dim, bool keepdim, optional opt_dtype) { +inline Tensor& linalg_norm_out(Tensor& result, const Tensor& self, const optional& opt_ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { return detail::norm_out(result, self, opt_ord, opt_dim, keepdim, opt_dtype); } // C10_DEPRECATED_MESSAGE("linalg_norm_out is deprecated, use norm_out instead.") -inline Tensor& linalg_norm_out(Tensor& result, const Tensor& self, c10::string_view ord, optional opt_dim, bool keepdim, optional opt_dtype) { +inline Tensor& linalg_norm_out(Tensor& result, const Tensor& self, c10::string_view ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { return detail::norm_out(result, self, ord, opt_dim, keepdim, opt_dtype); } -/// Computes the pivoted LU factorization +/// Computes the LU factorization with partial pivoting /// /// See 
https://pytorch.org/docs/master/linalg.html#torch.linalg.lu_factor inline std::tuple lu_factor(const Tensor& input, const bool pivot=true) { @@ -384,28 +392,39 @@ inline std::tuple lu_factor_out(Tensor& LU, Tensor& pivots, co return detail::lu_factor_out(LU, pivots, self, pivot); } -inline Tensor norm(const Tensor& self, const optional& opt_ord, optional opt_dim, bool keepdim, optional opt_dtype) { +/// Computes the LU factorization with partial pivoting +/// +/// See https://pytorch.org/docs/master/linalg.html#torch.linalg.lu +inline std::tuple lu(const Tensor& input, const bool pivot=true) { + return detail::lu(input, pivot); +} + +inline std::tuple lu_out(Tensor& P, Tensor& L, Tensor& U, const Tensor& self, const bool pivot=true) { + return detail::lu_out(P, L, U, self, pivot); +} + +inline Tensor norm(const Tensor& self, const optional& opt_ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { return detail::norm(self, opt_ord, opt_dim, keepdim, opt_dtype); } -inline Tensor norm(const Tensor& self, std::string ord, optional opt_dim, bool keepdim, optional opt_dtype) { +inline Tensor norm(const Tensor& self, std::string ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { return detail::norm(self, ord, opt_dim, keepdim, opt_dtype); } -inline Tensor& norm_out(Tensor& result, const Tensor& self, const optional& opt_ord, optional opt_dim, bool keepdim, optional opt_dtype) { +inline Tensor& norm_out(Tensor& result, const Tensor& self, const optional& opt_ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { return detail::norm_out(result, self, opt_ord, opt_dim, keepdim, opt_dtype); } -inline Tensor& norm_out(Tensor& result, const Tensor& self, std::string ord, optional opt_dim, bool keepdim, optional opt_dtype) { +inline Tensor& norm_out(Tensor& result, const Tensor& self, std::string ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { return detail::norm_out(result, self, ord, opt_dim, keepdim, opt_dtype); } /// See https://pytorch.org/docs/master/linalg.html#torch.linalg.vector_norm -inline Tensor vector_norm(const Tensor& self, Scalar ord, optional opt_dim, bool keepdim, optional opt_dtype) { +inline Tensor vector_norm(const Tensor& self, Scalar ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { return detail::vector_norm(self, ord, opt_dim, keepdim, opt_dtype); } -inline Tensor& vector_norm_out(Tensor& result, const Tensor& self, Scalar ord, optional opt_dim, bool keepdim, optional opt_dtype) { +inline Tensor& vector_norm_out(Tensor& result, const Tensor& self, Scalar ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { return detail::vector_norm_out(result, self, ord, opt_dim, keepdim, opt_dtype); } @@ -501,6 +520,48 @@ inline std::tuple qr_out(Tensor& Q, Tensor& R, const Tensor& i return detail::qr_out(Q, R, input, mode); } +/// Computes the LDL decomposition +/// +/// See https://pytorch.org/docs/master/linalg.html#torch.linalg.ldl_factor_ex +inline std::tuple ldl_factor_ex( + const Tensor& input, + bool hermitian, + bool check_errors) { + return torch::linalg_ldl_factor_ex(input, hermitian, check_errors); +} + +inline std::tuple ldl_factor_ex_out( + Tensor& LD, + Tensor& pivots, + Tensor& info, + const Tensor& input, + bool hermitian, + bool check_errors) { + return torch::linalg_ldl_factor_ex_out( + LD, pivots, info, input, hermitian, check_errors); +} + +/// Solve a system of linear equations using the LDL decomposition +/// +/// See 
https://pytorch.org/docs/master/linalg.html#torch.linalg.ldl_solve +inline Tensor ldl_solve( + const Tensor& LD, + const Tensor& pivots, + const Tensor& B, + bool hermitian) { + return torch::linalg_ldl_solve(LD, pivots, B, hermitian); +} + +inline Tensor& ldl_solve_out( + Tensor& result, + const Tensor& LD, + const Tensor& pivots, + const Tensor& B, + bool hermitian) { + return torch::linalg_ldl_solve_out( + result, LD, pivots, B, hermitian); +} + /// Computes a tensor `x` such that `matmul(input, x) = other`. /// /// See https://pytorch.org/docs/master/linalg.html#torch.linalg.solve @@ -574,11 +635,11 @@ inline Tensor& tensorinv_out(Tensor& result, const Tensor& self, int64_t ind) { /// auto b = torch::randn(2*3, 4); /// auto x = torch::linalg::tensorsolve(a, b); /// ``` -inline Tensor tensorsolve(const Tensor& input, const Tensor& other, optional dims) { +inline Tensor tensorsolve(const Tensor& input, const Tensor& other, OptionalIntArrayRef dims) { return detail::tensorsolve(input, other, dims); } -inline Tensor& tensorsolve_out(Tensor& result, const Tensor& input, const Tensor& other, optional dims) { +inline Tensor& tensorsolve_out(Tensor& result, const Tensor& input, const Tensor& other, OptionalIntArrayRef dims) { return detail::tensorsolve_out(result, input, other, dims); } diff --git a/torch/csrc/api/include/torch/nn/functional/activation.h b/torch/csrc/api/include/torch/nn/functional/activation.h index b038f1bce6ba..2258dd0c4317 100644 --- a/torch/csrc/api/include/torch/nn/functional/activation.h +++ b/torch/csrc/api/include/torch/nn/functional/activation.h @@ -336,8 +336,16 @@ inline Tensor glu(const Tensor& input, const GLUFuncOptions& options = {}) { // ============================================================================ -inline Tensor gelu(const Tensor& input) { - return torch::gelu(input); +#ifndef DOXYGEN_SHOULD_SKIP_THIS +namespace detail { +inline Tensor gelu(const Tensor& input, string approximate) { + return torch::gelu(input, approximate); +} +} // namespace detail +#endif /* DOXYGEN_SHOULD_SKIP_THIS */ + +inline Tensor gelu(const Tensor& input, const GELUFuncOptions& options = {}) { + return detail::gelu(input, options.approximate()); } // ============================================================================ diff --git a/torch/csrc/api/include/torch/nn/functional/batchnorm.h b/torch/csrc/api/include/torch/nn/functional/batchnorm.h index bb8bddfcf83c..5603ec189e91 100644 --- a/torch/csrc/api/include/torch/nn/functional/batchnorm.h +++ b/torch/csrc/api/include/torch/nn/functional/batchnorm.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include @@ -20,7 +21,7 @@ inline Tensor batch_norm(const Tensor& input, if (training) { auto size = input.sizes(); int64_t size_prods = size[0]; - for (size_t i = 0; i < size.size() - 2; i++) { + for (const auto i : c10::irange(size.size() - 2)) { size_prods *= size[i + 2]; } TORCH_CHECK(size_prods != 1, diff --git a/torch/csrc/api/include/torch/nn/functional/padding.h b/torch/csrc/api/include/torch/nn/functional/padding.h index 611f407d9b7a..1b2f77626cdb 100644 --- a/torch/csrc/api/include/torch/nn/functional/padding.h +++ b/torch/csrc/api/include/torch/nn/functional/padding.h @@ -1,83 +1,36 @@ #pragma once #include +#include namespace torch { namespace nn { namespace functional { -inline Tensor _narrow_with_range(const Tensor& input, int64_t dim, int64_t start, int64_t end) { - return input.narrow(dim, start, end - start); -} - -inline Tensor _pad_circular(Tensor input, IntArrayRef padding) { - int padding_size = 
padding.size(); - input = torch::cat({input, _narrow_with_range(input, 2, 0, padding[-1 + padding_size])}, /*dim=*/2); - input = torch::cat({_narrow_with_range(input, 2, -(padding[-1 + padding_size] + padding[-2 + padding_size]), -padding[-1 + padding_size]), input}, /*dim=*/2); - - if (padding_size > 2) { - input = torch::cat({input, _narrow_with_range(input, 3, 0, padding[-3 + padding_size])}, /*dim=*/3); - input = torch::cat({_narrow_with_range(input, 3, -(padding[-3 + padding_size] + padding[-4 + padding_size]), -padding[-3 + padding_size]), input}, /*dim=*/3); - } - - if (padding_size > 4) { - input = torch::cat({input, _narrow_with_range(input, 4, 0, padding[-5 + padding_size])}, /*dim=*/4); - input = torch::cat({_narrow_with_range(input, 4, -(padding[-5 + padding_size] + padding[-6 + padding_size]), -padding[-5 + padding_size]), input}, /*dim=*/4); - } - - return input; -} - #ifndef DOXYGEN_SHOULD_SKIP_THIS namespace detail { inline Tensor pad(const Tensor& input, IntArrayRef pad, PadFuncOptions::mode_t mode, double value) { - TORCH_CHECK(pad.size() % 2 == 0, "Padding length must be divisible by 2"); - TORCH_CHECK(((int64_t)(pad.size() / 2)) <= input.dim(), "Padding length too large"); - if (c10::get_if(&mode)) { - return torch::constant_pad_nd(input, pad, value); - } else { - TORCH_CHECK( - value == 0, - "Padding mode \"", - torch::enumtype::get_enum_name(mode), - "\" doesn't take in value argument"); - if (pad.size() == 2 && (input.dim() == 2 || input.dim() == 3)) { - if (c10::get_if(&mode)) { - return torch::reflection_pad1d(input, pad); - } else if (c10::get_if(&mode)) { - return torch::replication_pad1d(input, pad); - } else if (c10::get_if(&mode)) { - return _pad_circular(input, pad); - } else { - TORCH_CHECK(false, "NotImplementedError"); - } - } else if(pad.size() == 4 && (input.dim() == 3 || input.dim() == 4)) { - if (c10::get_if(&mode)) { - return torch::reflection_pad2d(input, pad); - } else if (c10::get_if(&mode)) { - return torch::replication_pad2d(input, pad); - } else if (c10::get_if(&mode)) { - return _pad_circular(input, pad); - } else { - TORCH_CHECK(false, "NotImplementedError"); - } - } else if (pad.size() == 6 && (input.dim() == 4 || input.dim() == 5)) { - if (c10::get_if(&mode)) { - return torch::reflection_pad3d(input, pad); - } else if (c10::get_if(&mode)) { - return torch::replication_pad3d(input, pad); - } else if (c10::get_if(&mode)) { - return _pad_circular(input, pad); - } else { - TORCH_CHECK(false, "NotImplementedError"); - } - } else { - TORCH_CHECK(false, "Only 2D, 3D, 4D, 5D padding with non-constant padding are supported for now"); + const auto mode_enum = [&] { + if (c10::get_if(&mode)) { + return at::padding_mode::constant; + } else if (c10::get_if(&mode)) { + return at::padding_mode::reflect; + } else if (c10::get_if(&mode)) { + return at::padding_mode::replicate; + } else if (c10::get_if(&mode)) { + return at::padding_mode::circular; } + TORCH_CHECK(false, "Unrecognised padding mode"); + }(); + + c10::optional fill_value; + if (value != 0.0) { + fill_value = value; } + return at::_pad_enum(input, pad, static_cast(mode_enum), fill_value); } } // namespace detail #endif /* DOXYGEN_SHOULD_SKIP_THIS */ diff --git a/torch/csrc/api/include/torch/nn/functional/pooling.h b/torch/csrc/api/include/torch/nn/functional/pooling.h index 9da99e9fa33c..ae325fc8113e 100644 --- a/torch/csrc/api/include/torch/nn/functional/pooling.h +++ b/torch/csrc/api/include/torch/nn/functional/pooling.h @@ -770,8 +770,8 @@ inline std::tuple fractional_max_pool2d_with_indices( 
c10::optional> output_size_ = output_size; if (output_size_ == c10::nullopt) { TORCH_INTERNAL_ASSERT(output_ratio != c10::nullopt); - output_size_ = {(int64_t)(input.size(-2) * (*output_ratio.value())[0]), - (int64_t)(input.size(-1) * (*output_ratio.value())[1])}; + output_size_ = {(int64_t)(static_cast(input.size(-2)) * (*output_ratio.value())[0]), + (int64_t)(static_cast(input.size(-1)) * (*output_ratio.value())[1])}; } Tensor _random_samples_ = _random_samples; @@ -849,9 +849,9 @@ inline std::tuple fractional_max_pool3d_with_indices( c10::optional> output_size_ = output_size; if (output_size_ == c10::nullopt) { TORCH_INTERNAL_ASSERT(output_ratio != c10::nullopt); - output_size_ = {(int64_t)(input.size(-3) * (*output_ratio.value())[0]), - (int64_t)(input.size(-2) * (*output_ratio.value())[1]), - (int64_t)(input.size(-1) * (*output_ratio.value())[2])}; + output_size_ = {(int64_t)(static_cast(input.size(-3)) * (*output_ratio.value())[0]), + (int64_t)(static_cast(input.size(-2)) * (*output_ratio.value())[1]), + (int64_t)(static_cast(input.size(-1)) * (*output_ratio.value())[2])}; } Tensor _random_samples_ = _random_samples; diff --git a/torch/csrc/api/include/torch/nn/functional/upsampling.h b/torch/csrc/api/include/torch/nn/functional/upsampling.h index faa6e73368a8..fac6a9c6239b 100644 --- a/torch/csrc/api/include/torch/nn/functional/upsampling.h +++ b/torch/csrc/api/include/torch/nn/functional/upsampling.h @@ -64,7 +64,7 @@ inline std::vector _interp_output_size( std::vector ret; for (const auto i : c10::irange(dim)) { - ret.emplace_back(static_cast(floor(input.size(i + 2) * scale_factors[i]))); + ret.emplace_back(static_cast(floor(static_cast(input.size(i + 2)) * scale_factors[i]))); } return ret; } diff --git a/torch/csrc/api/include/torch/nn/modules/activation.h b/torch/csrc/api/include/torch/nn/modules/activation.h index 28225ee0f68b..e4fc02f310d5 100644 --- a/torch/csrc/api/include/torch/nn/modules/activation.h +++ b/torch/csrc/api/include/torch/nn/modules/activation.h @@ -570,12 +570,17 @@ TORCH_MODULE(GLU); // NOLINTNEXTLINE(bugprone-exception-escape) class TORCH_API GELUImpl : public torch::nn::Cloneable { public: + explicit GELUImpl(GELUOptions options_ = {}); + Tensor forward(const Tensor& input); void reset() override; /// Pretty prints the `GELU` module into the given `stream`. void pretty_print(std::ostream& stream) const override; + + /// The options with which this `Module` was constructed. + GELUOptions options; }; /// A `ModuleHolder` subclass for `GELUImpl`. 
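Aside: the activation hunks above thread an approximate option through the C++ GELU frontend (GELUImpl now stores GELUOptions and forwards to F::detail::gelu(input, approximate), on top of the widened torch::gelu(input, approximate) ATen signature). A short sketch of the same knob from the Python side, assuming the matching approximate= keyword on torch.nn.functional.gelu and that "tanh" selects the tanh approximation; neither is shown in this diff:

import torch
import torch.nn.functional as F

x = torch.randn(8)
# "none" requests the exact erf-based GELU (the default), mirroring
# GELUOptions().approximate("none") in the C++ frontend above.
y_exact = F.gelu(x, approximate="none")
# "tanh" requests the tanh approximation instead.
y_tanh = F.gelu(x, approximate="tanh")
print((y_exact - y_tanh).abs().max())  # small numerical difference between the two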
diff --git a/torch/csrc/api/include/torch/nn/modules/container/modulelist.h b/torch/csrc/api/include/torch/nn/modules/container/modulelist.h index 067da4094c6f..dde24f2230dd 100644 --- a/torch/csrc/api/include/torch/nn/modules/container/modulelist.h +++ b/torch/csrc/api/include/torch/nn/modules/container/modulelist.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include @@ -207,8 +208,10 @@ class ModuleListImpl : public Cloneable { modules_.begin() + Iterator::difference_type(index), std::move(module)); - for (size_t i = index; i < size() - 1; ++i) + for (const auto i : c10::irange(index, size() - 1)) { + (void)i; // Suppress unused variable warning replace_module(c10::to_string(index), modules_[index]); + } register_module(c10::to_string(size() - 1), modules_.back()); } } diff --git a/torch/csrc/api/include/torch/nn/options/activation.h b/torch/csrc/api/include/torch/nn/options/activation.h index 651c800a84cb..16ab0245fbb6 100644 --- a/torch/csrc/api/include/torch/nn/options/activation.h +++ b/torch/csrc/api/include/torch/nn/options/activation.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include @@ -95,6 +96,33 @@ using GLUFuncOptions = GLUOptions; // ============================================================================ +/// Options for the `GELU` module. +/// +/// Example: +/// ``` +/// GELU model(GELUOptions().approximate("none")); +/// ``` +struct TORCH_API GELUOptions { + /// Specifies the approximation to apply to the output. + TORCH_ARG(std::string, approximate) = "none"; +}; + +namespace functional { +/// Options for `torch::nn::functional::gelu`. +/// +/// See the documentation for `torch::nn::GELUOptions` class to learn what +/// arguments are supported. +/// +/// Example: +/// ``` +/// namespace F = torch::nn::functional; +/// F::gelu(input, F::GELUFuncOptions().approximate("none")); +/// ``` +using GELUFuncOptions = GELUOptions; +} // namespace functional + +// ============================================================================ + /// Options for the `Hardshrink` module. /// /// Example: diff --git a/torch/csrc/api/include/torch/special.h b/torch/csrc/api/include/torch/special.h index 6e0ecc0fbcad..d667e094f993 100644 --- a/torch/csrc/api/include/torch/special.h +++ b/torch/csrc/api/include/torch/special.h @@ -215,6 +215,15 @@ inline Tensor& logsumexp_out(Tensor& result, const Tensor& self, IntArrayRef dim return torch::special_logsumexp_out(result, self, dims, keepdim); } +/// Computes the argument, x, for which the area under the Gaussian probability density +/// function (integrated from minus infinity to x) is equal to input, elementwise. 
+/// See https://pytorch.org/docs/master/special.html#torch.special.ndtri +/// +/// Example: +/// ``` +/// auto t = torch::rand(128, dtype=kDouble); +/// torch::special::ndtri(t); +/// ``` inline Tensor ndtri(const Tensor& self) { return torch::special_ndtri(self); } @@ -223,6 +232,23 @@ inline Tensor& ndtri_out(Tensor& result, const Tensor& self) { return torch::special_ndtri_out(result, self); } +/// Computes the log of area under the standard Gaussian probability density function, +/// integrated from minus infinity to :attr:`input`, elementwise +/// See https://pytorch.org/docs/master/special.html#torch.special.log_ndtr +/// +/// Example: +/// ``` +/// auto t = torch::randn(128, dtype=kDouble); +/// torch::special::log_ndtr(t); +/// ``` +inline Tensor log_ndtr(const Tensor& self) { + return torch::special_log_ndtr(self); +} + +inline Tensor& log_ndtr_out(Tensor& result, const Tensor& self) { + return torch::special_log_ndtr_out(result, self); +} + /// Computes the logit of input, elementwise. /// See https://pytorch.org/docs/master/special.html#torch.special.logit. /// diff --git a/torch/csrc/api/include/torch/utils.h b/torch/csrc/api/include/torch/utils.h index f664074deb03..3bb6363a4ced 100644 --- a/torch/csrc/api/include/torch/utils.h +++ b/torch/csrc/api/include/torch/utils.h @@ -5,7 +5,6 @@ #include #include #include -#include #include namespace torch { diff --git a/torch/csrc/api/src/nn/modules/activation.cpp b/torch/csrc/api/src/nn/modules/activation.cpp index 677c9e1cc836..001199e98edd 100644 --- a/torch/csrc/api/src/nn/modules/activation.cpp +++ b/torch/csrc/api/src/nn/modules/activation.cpp @@ -284,8 +284,10 @@ void GLUImpl::pretty_print(std::ostream& stream) const { // ============================================================================ +GELUImpl::GELUImpl(GELUOptions options_) : options(std::move(options_)) {} + Tensor GELUImpl::forward(const Tensor& input) { - return F::gelu(input); + return F::detail::gelu(input, options.approximate()); } void GELUImpl::reset() {} diff --git a/torch/csrc/api/src/nn/modules/adaptive.cpp b/torch/csrc/api/src/nn/modules/adaptive.cpp index 1f28d0c82816..6842b14550cd 100644 --- a/torch/csrc/api/src/nn/modules/adaptive.cpp +++ b/torch/csrc/api/src/nn/modules/adaptive.cpp @@ -94,7 +94,7 @@ ASMoutput AdaptiveLogSoftmaxWithLossImpl::forward(const Tensor& input_, const Te auto cutoff_values = cutoffs; cutoff_values.insert(cutoff_values.begin(), 0); - for (size_t i = 0; i < cutoff_values.size() - 1; ++i) { + for (const auto i : c10::irange(cutoff_values.size() - 1)) { int64_t low_idx = cutoff_values[i]; int64_t high_idx = cutoff_values[i + 1]; @@ -148,7 +148,7 @@ Tensor AdaptiveLogSoftmaxWithLossImpl::_get_full_log_prob(const Tensor& input, c out.index_put_({Slice(), Slice(None, shortlist_size)}, head_logprob.index({Slice(), Slice(None, shortlist_size)})); - for (size_t i = 0; i < cutoffs.size() - 1; ++i) { + for (const auto i : c10::irange(cutoffs.size() - 1)) { int64_t start_idx = cutoffs[i]; int64_t stop_idx = cutoffs[i+1]; const Tensor cluster_output = tail[i]->as()->forward(input); diff --git a/torch/csrc/api/src/optim/lbfgs.cpp b/torch/csrc/api/src/optim/lbfgs.cpp index d7d8dd002eb8..d3143b07ccdd 100644 --- a/torch/csrc/api/src/optim/lbfgs.cpp +++ b/torch/csrc/api/src/optim/lbfgs.cpp @@ -232,7 +232,6 @@ std::tuple _strong_wolfe(const Function& obj_fu auto d_norm = val(d.abs().max()); g = g.clone(at::MemoryFormat::Contiguous); // evaluate objective and gradient using initial step - auto obj_func_res = obj_func(x, t, d); // 
NOLINTNEXTLINE(cppcoreguidelines-init-variables) double f_new; Tensor g_new; @@ -285,7 +284,6 @@ std::tuple _strong_wolfe(const Function& obj_fu f_prev = f_new; g_prev = g_new.clone(at::MemoryFormat::Contiguous); gtd_prev = gtd_new; - obj_func_res = obj_func(x, t, d); std::tie(f_new, g_new) = obj_func(x, t, d); ls_func_evals += 1; gtd_new = g_new.dot(d); @@ -335,9 +333,7 @@ std::tuple _strong_wolfe(const Function& obj_fu } // Evaluate new point - obj_func_res = obj_func(x, t, d); - f_new = std::get<0>(obj_func_res); - g_new = std::get<1>(obj_func_res); + std::tie(f_new, g_new) = obj_func(x, t, d); ls_func_evals += 1; gtd_new = g_new.dot(d); ls_iter += 1; diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index b4bcc4e4316c..bcafabea3b4b 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include @@ -12,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -188,7 +190,12 @@ Tensor norm_backward(const Tensor& grad, const Tensor& self, const optional & p_, Tensor norm, IntArrayRef dim, bool keepdim) { +Tensor norm_backward( + Tensor grad, const Tensor& self, const optional & p_, Tensor norm, IntArrayRef dim, bool keepdim) { + // NB: We mask fill the NaNs in the output to be zero but still do float division + // by zero, which ASAN complains about. One way to appease ASAN is to fill the problematic + // values with something arbitrary before the division, but we decide not to due to + // the perf hit. Instead we just silence ASAN where necessary size_t ndim = self.sizes().size(); double p = p_.value_or(2.0).toDouble(); Tensor self_scaled; @@ -204,34 +211,104 @@ Tensor norm_backward(Tensor grad, const Tensor& self, const optional & p } else if (p == 1.0) { return self.sgn() * grad; } else if (p == 2.0) { - self_scaled = self; - scale_v = grad / norm; + return self * (grad / norm).masked_fill_(norm == 0, 0); } else if (std::isinf(p)) { const auto self_isnan = self.isnan(); const auto norm_isnan = norm.isnan(); const auto& self_and_norm_isnan = areAnyTensorSubclassLike({self, norm}) ? 
self_isnan.logical_and(norm_isnan) : self_isnan.logical_and_(norm_isnan); - Tensor is_eq_max = (self.abs() == norm).logical_or_(self_and_norm_isnan).type_as(self); + auto is_eq_max = (self.abs() == norm).logical_or_(self_and_norm_isnan).type_as(self); self_scaled = self.sgn() * is_eq_max; - Tensor nb_max = is_eq_max.count_nonzero(dim); + auto nb_max = is_eq_max.count_nonzero(dim); if (self.dim() != 0) { nb_max = unsqueeze_multiple(nb_max, dim, ndim); } scale_v = grad / nb_max; + return self_scaled * scale_v; + } else if (p < 1.0) { + self_scaled = self.sgn() * self.abs().pow_(p - 1).masked_fill_(self == 0, 0); + return self_scaled * grad * norm.pow(1 - p); } else if (p < 2.0) { - self_scaled = self.sgn() * self.abs().pow(p - 1); + self_scaled = self.sgn() * self.abs().pow_(p - 1); scale_v = grad / norm.pow(p - 1); + scale_v.masked_fill_(norm == 0, 0); + return self_scaled * scale_v; } else { - self_scaled = self * self.abs().pow(p - 2); + self_scaled = self * self.abs().pow_(p - 2); scale_v = grad / norm.pow(p - 1); + scale_v.masked_fill_(norm == 0, 0); + return self_scaled * scale_v; + } +} + +// See norm_backward above for a note on ignoring the sanitizer +Tensor norm_jvp( + const Tensor& self_p, const Tensor& self_t, + const optional & p_, + Tensor norm, + IntArrayRef dim, + bool keepdim +) { + // NB: currently norm_jvp is also reused for dist's jvp (which haas two differentiable inputs) + // but self_t still cannot be a ZT because that would require both self_t and other_t to be ZT + TORCH_INTERNAL_ASSERT(!self_t._is_zerotensor()); + size_t ndim = self_p.dim(); // composite compliance? + double p = p_.value_or(2.0).toDouble(); + + if (p == 0.0) { + return at::zeros_like(norm); + } else if (p == 1.0) { + auto result = self_p.sgn(); + result = areAnyTensorSubclassLike({self_t}) ? result.mul(self_t.conj()) : result.mul_(self_t.conj()); + result = at::real(result); + return result.sum(dim, keepdim); + } else if (p == 2.0) { + auto result = self_p.mul(self_t.conj()); + result = at::real(result); + result = result.sum(dim, keepdim); + return result.div_(norm).masked_fill_(norm == 0, 0); + } else if (std::isinf(p)) { + if (!keepdim && self_p.dim() != 0) { + norm = unsqueeze_multiple(norm, dim, ndim); + } + const auto self_isnan = self_p.isnan(); + const auto norm_isnan = norm.isnan(); + const auto& self_and_norm_isnan = areAnyTensorSubclassLike({norm}) ? 
+ self_isnan.logical_and(norm_isnan) : + self_isnan.logical_and_(norm_isnan); + const auto is_eq_max = (self_p.abs() == norm).logical_or_(self_and_norm_isnan).type_as(norm); + auto nb_max = is_eq_max.count_nonzero(dim); + if (self_p.dim() != 0) { + nb_max = unsqueeze_multiple(nb_max, dim, ndim); + } + return (at::real(self_p.sgn() * self_t.conj()) * is_eq_max / nb_max).sum(dim, keepdim); + } else if (p < 1.0) { + auto sumpow_t = (self_p.abs().pow_(p - 1).masked_fill_(self_p == 0, 0) * at::real(self_p.sgn() * self_t.conj())).sum(dim, keepdim); + return sumpow_t * norm.pow(1 - p); + } else if (p < 2.0) { + auto sumpow_t = (self_p.abs().pow_(p - 1) * at::real(self_p.sgn() * self_t.conj())).sum(dim, keepdim); + auto out = sumpow_t / norm.pow(p - 1); + return out.masked_fill_(norm == 0, 0); + } else { + auto sumpow_t = (self_p.abs().pow_(p - 2) * at::real(self_p * self_t.conj())).sum(dim, keepdim); + auto out = sumpow_t / norm.pow(p - 1); + return out.masked_fill_(norm == 0, 0); } - // handle case at 0 where we return a subgradient containing 0 - scale_v.masked_fill_(norm == 0, 0); - return self_scaled * scale_v; } -Tensor linalg_vector_norm_backward(Tensor grad, const Tensor& self, const Scalar& scalar_ord, Tensor norm, const optional& opt_dim, bool keepdim) { +Tensor norm_jvp(const Tensor& self_p, const Tensor& self_t, const optional & p_, Tensor norm) { + return norm_jvp(self_p, self_t, p_, norm, {}, true); +} + +Tensor linalg_vector_norm_jvp(const Tensor& self_p, const Tensor& self_t, const Scalar& scalar_ord, Tensor norm, const at::OptionalIntArrayRef& opt_dim, bool keepdim) { + // No need to handle the dtype arg as it's handled via broadcasting in the function + auto dim = opt_dim.value_or(IntArrayRef({})); + return norm_jvp(self_p, self_t, scalar_ord, norm, dim, keepdim); +} + +Tensor linalg_vector_norm_backward(Tensor grad, const Tensor& self, const Scalar& scalar_ord, Tensor norm, const at::OptionalIntArrayRef& opt_dim, bool keepdim) { + // No need to handle the dtype arg as it's handled via broadcasting in the function auto dim = opt_dim.value_or(IntArrayRef({})); return norm_backward(grad, self, scalar_ord, norm, dim, keepdim); } @@ -513,6 +590,7 @@ Tensor solve_backward_self(const Tensor & grad, const Tensor & self, const Tenso } Tensor solve_backward_A(const Tensor & grad, const Tensor & self, const Tensor & A, const Tensor & solution) { + at::NoTF32Guard disable_tf32; Tensor grad_self = solve_backward_self(grad, self, A); if (self.ndimension() == 2 && A.ndimension() == 2) { return -at::mm(grad_self, solution.mH()); @@ -548,7 +626,7 @@ Tensor logcumsumexp_backward(Tensor grad, const Tensor & self, Tensor result, in // Reference: https://github.com/tensorflow/tensorflow/blob/ // 2a5910906a0e0f3dbc186ff9db6386d81a63448c/tensorflow/python/ops/math_grad.py#L1832-L1863 - return AT_DISPATCH_FLOATING_TYPES( + return AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, at::typeMetaToScalarType(grad.dtype()), "logcumsumexp_backward", [grad, self, result, dim]() { @@ -715,6 +793,22 @@ std::tuple clamp_backward_min_max( return ret; } +at::Tensor clamp_jvp( + const Tensor& self_p, const Tensor& self_t, + const Tensor& min_p, const Tensor& min_t, + const Tensor& max_p, const Tensor& max_t +) { + if (min_p.defined() && max_p.defined()) { + return where(min_p > max_p, max_t, where(self_p < min_p, min_t, where(self_p > max_p, max_t, self_t))); + } else if (min_p.defined()) { + return where(self_p > min_p, self_t, min_t); + } else if (max_p.defined()) { + return where(self_p < max_p, self_t, 
max_t); + } else { + return self_t; + } +} + Tensor convolution_jvp( const Tensor& input_p, const Tensor& input_t, const Tensor& weight_p, const Tensor& weight_t, @@ -762,7 +856,7 @@ Tensor convolution_backward_jvp_grad_bias( } else { TORCH_INTERNAL_ASSERT( false, - "convolution_backward_jvp_grad_bias expected dim of grad_out_t to be 3, 4, or 4, but got: ", + "convolution_backward_jvp_grad_bias expected dim of grad_out_t to be 3, 4, or 5, but got: ", grad_out_t.dim()); } } @@ -795,46 +889,51 @@ at::IntArrayRef strides_or_error(const Tensor & input, c10::string_view const & "Please either use a strided tensor or set requires_grad=False for '", input_name, "'"); if (input.is_mkldnn()) return IntArrayRef({}); + if (input.is_sparse_csr()) return IntArrayRef({}); return input.strides(); } else { return IntArrayRef({}); } } -Tensor mm_mat1_backward(const Tensor & grad, const Tensor & mat2, at::IntArrayRef mat1_sizes, at::IntArrayRef mat1_strides, const Scalar & alpha) { - // if input was column-major, return grad as column-order for efficiency - if (mat1_strides[0] == 1 && mat1_strides[1] == mat1_sizes[0]) { - return maybe_multiply(mat2.conj().mm(grad.t()).t(), alpha.conj()); - } else { - return maybe_multiply(grad.mm(mat2.t().conj()), alpha.conj()); - } -} - -Tensor mm_mat2_backward(const Tensor & grad, const Tensor & mat1, IntArrayRef sizes, IntArrayRef strides, const Scalar & alpha) { - // if input was column-major, return grad as column-order for efficiency - if (strides[0] == 1 && strides[1] == sizes[0]) { - if (mat1.is_sparse()) { - // Since mm(dense, sparse) doesn't exist, - // pass a transposed output matrix to the underlying "addmm" - // function directly. - int64_t out_rows = mat1.size(1); - int64_t out_cols = grad.size(1); - Tensor t = at::zeros({}, grad.options()).expand({out_rows, out_cols}, true); - Tensor r = at::empty({out_cols, out_rows}, grad.options()).t(); - at::addmm_out(r, t, mat1.t(), grad, alpha, 1); - return r; +Tensor mm_mat1_backward(const Tensor& grad, const Tensor& mat2, at::IntArrayRef mat1_sizes, at::IntArrayRef mat1_strides, c10::Layout mat1_layout, const Scalar& alpha) { + if (grad.layout() == c10::kStrided && mat2.layout() == c10::kStrided && mat1_layout == c10::kStrided) { + // if input was column-major, return grad as column-order for efficiency + if (mat1_strides[0] == 1 && mat1_strides[1] == mat1_sizes[0]) { + return maybe_multiply(mat2.conj().mm(grad.t()).t(), alpha.conj()); } - return maybe_multiply(grad.t().mm(mat1.conj()).t(), alpha.conj()); - } else { - return maybe_multiply(mat1.t().conj().mm(grad), alpha.conj()); } + + // General fallback, should work for any layout + return maybe_multiply(grad.mm(mat2.t().conj()), alpha.conj()); } -Tensor _sparse_addmm_sparse_backward(const Tensor& grad, const Tensor& sparse_, const Tensor& dense, const Scalar& alpha) { - AT_ASSERT(sparse_.is_sparse()); - auto sparse = sparse_.coalesce(); - Tensor grad_sparse = maybe_multiply(grad.mm(dense.conj().t()), alpha); - return grad_sparse.sparse_mask(sparse); +Tensor mm_mat2_backward(const Tensor& grad, const Tensor& mat1, IntArrayRef mat2_sizes, IntArrayRef mat2_strides, c10::Layout mat2_layout, const Scalar& alpha) { + if (grad.layout() == c10::kStrided && mat1.layout() == c10::kStrided && mat2_layout == c10::kStrided) { + // if input was column-major, return grad as column-order for efficiency + if (mat2_strides[0] == 1 && mat2_strides[1] == mat2_sizes[0]) { + return maybe_multiply(grad.t().mm(mat1.conj()).t(), alpha.conj()); + } + } + + // General fallback, should work 
for any layout + return maybe_multiply(mat1.t().conj().mm(grad), alpha.conj()); +} + +Tensor mm_mat1_sparse_backward(const Tensor& grad, const Tensor& mat1, const Tensor& mat2, const Scalar& alpha) { + if (grad.layout() == c10::kStrided && mat2.layout() == c10::kStrided && mat1.is_sparse()) { + auto sparse = mat1.coalesce(); + Tensor grad_sparse = maybe_multiply(grad.mm(mat2.conj().t()), alpha); + return grad_sparse.sparse_mask(sparse); + } else if (grad.layout() == c10::kStrided && mat2.layout() == c10::kStrided && mat1.is_sparse_csr()) { + return at::sparse_sampled_addmm(at::zeros_like(mat1, mat1.options()), grad, mat2.mH(), 1.0, alpha); + } else if (grad.layout() == c10::kStrided && mat2.layout() == c10::kStrided && mat1.layout() == c10::kStrided) { + return maybe_multiply(grad.mm(mat2.mH()), alpha); + } + TORCH_CHECK(false, "sparse_addmm_sparse_backward: unsupported combination of layouts", + ", grad: ", grad.layout(), + ", mat1: ", mat1.layout(), + ", mat2: ", mat2.layout()); } // This function return a new SparseTensor with values from Tensor `input` filtered by indices of `mask` @@ -907,7 +1006,7 @@ Tensor renorm_backward(const Tensor & grad, const Tensor & self, const Scalar& p self, p, reduce_dims, /*keepdim=*/true); } - const auto real_acc_type = c10::toValueType(acc_type); + const auto real_acc_type = c10::toRealValueType(acc_type); auto grad_output = (self.conj() * grad); // vector_norm output is real, so grad_output must also be real if (real_acc_type != acc_type) { @@ -915,8 +1014,7 @@ Tensor renorm_backward(const Tensor & grad, const Tensor & self, const Scalar& p } grad_output = grad_output.sum( reduce_dims, /*keepdim=*/true, /*dtype=*/real_acc_type); - auto nb = linalg_vector_norm_backward( - grad_output, self, p, norm, reduce_dims, /*keepdim=*/true); + auto nb = norm_backward(grad_output, self, p, norm, reduce_dims, /*keepdim=*/true); auto invnorm = (norm + 1e-7).reciprocal(); auto grad_norm = maxnorm * invnorm * (grad - invnorm * nb); @@ -1048,7 +1146,7 @@ static Tensor var_backward(const Tensor & grad, const Tensor & self, int64_t cor return (2.0 / (self.numel() - correction)) * grad * (self - self.mean()); } -Tensor var_backward(Tensor grad, const Tensor& self, c10::optional dim_opt, +Tensor var_backward(Tensor grad, const Tensor& self, at::OptionalIntArrayRef dim_opt, c10::optional correction_opt, bool keepdim) { auto correction = correction_opt.value_or(1); if (self.dim() == 0 || !dim_opt.has_value()) { @@ -1063,7 +1161,7 @@ Tensor var_backward(Tensor grad, const Tensor& self, c10::optional return (2.0 / dof) * grad * (self - self.mean(dim, /*keepdim=*/true)); } -Tensor var_jvp(const Tensor& self_t, const Tensor& self_p, const Tensor& result, c10::optional dim_opt, +Tensor var_jvp(const Tensor& self_t, const Tensor& self_p, const Tensor& result, at::OptionalIntArrayRef dim_opt, c10::optional correction_opt, bool keepdim) { auto correction = correction_opt.value_or(1); if (self_p.dim() == 0 || !dim_opt.has_value()) { @@ -1076,7 +1174,7 @@ Tensor var_jvp(const Tensor& self_t, const Tensor& self_p, const Tensor& result, Tensor std_backward( const Tensor& result, const Tensor& grad, const Tensor& self, - c10::optional dim, c10::optional correction, bool keepdim) { + at::OptionalIntArrayRef dim, c10::optional correction, bool keepdim) { auto grad_var = (grad / (result * 2)).masked_fill_(result == 0, 0); return var_backward(grad_var, self, dim, correction, keepdim); } @@ -1091,7 +1189,7 @@ Tensor mean_backward(Tensor grad, const IntArrayRef sizes, int64_t numel) { static 
Tensor mean_backward( const Tensor& grad, const IntArrayRef sizes, int64_t numel, - c10::optional dim, bool keepdim) { + at::OptionalIntArrayRef dim, bool keepdim) { if (dim.has_value()) { return mean_backward(grad, sizes, *dim, keepdim); } else { @@ -1101,7 +1199,7 @@ static Tensor mean_backward( Tensor var_std_mean_backward( const variable_list& grads, const Tensor& self, const Tensor& r1, - const Tensor& r2, c10::optional dim, + const Tensor& r2, at::OptionalIntArrayRef dim, c10::optional correction, bool keepdim, bool is_std) { Tensor grad; if (grads[0].defined()) { @@ -1131,59 +1229,88 @@ Tensor masked_scatter_backward(const Tensor & grad, const Tensor & mask, IntArra return mask_selected.view(sizes); } -Tensor cholesky_jvp(const Tensor& input_tangent, const Tensor& L, bool upper) { - // Differentiation of the Cholesky decomposition, Iain Murray - // https://arxiv.org/abs/1602.07527 - // equation 8 - auto input_tangent_ = upper ? input_tangent.mH() : input_tangent; +Tensor cholesky_jvp(const Tensor& dA, const Tensor& L, bool upper) { + at::NoTF32Guard disable_tf32; + // Let A = LL^H + // dA = dLL^H + L(dL)^H + // L^{-1}dA(L^{-H}) = L^{-1}dL + (L^{-1}dL)^H + // = sym(L^{-1}dL) + // where sym(X) = X + X^H + // A short computaiton gives that the inverse of sym is given by + // \pi(X) = X.tril() - 0.5*diag(X) + // so + // dL = L\pi(L^{-1}dA(L^{-H})) + + // Precondition: dA is symmetric/Hermitian auto L_ = upper ? L.mH() : L; - - auto L_inverse = at::linalg_solve_triangular(L_, at::eye(L.size(-1), L.options()), /*upper=*/false); - auto phi = at::matmul(at::matmul(L_inverse, input_tangent_), L_inverse.mH()); - phi.tril_().diagonal(/*offset=*/0, /*dim1=*/-2, /*dim2=*/-1).mul_(0.5); - auto L_tangent = L_.matmul(phi); - return upper ? L_tangent.mH() : L_tangent; + auto dL = at::linalg_solve_triangular(L_, dA, /*upper=*/false, /*left=*/true); + dL = at::linalg_solve_triangular(L_.mH(), dL, /*upper=*/true, /*left=*/false); + dL = dL.tril() - dL.diagonal(0, -2, -1).mul(0.5).diag_embed(); + dL = L_.matmul(dL); + return upper ? dL.mH() : dL; } -Tensor cholesky_backward(Tensor grad, bool upper, Tensor L) { - // cf. Iain Murray (2016); arXiv 1602.07527 - // This gradient is symmetric, and not triangular. - // Cholesky additionally assumes that the input is symmetric, which is a subspace of - // R^{n x n}, and hence the derivative is not well-defined for off-diagonal - // elements. We resolve this by taking the gradient of the functionally independent - // elements of the matrix (i.e., the lower triangular portion of the input) and then - // reflect it on the upper triangular portion, thereby symmetrizing the gradient of - // the cholesky operation. The motivation behind this choice is that symmetric gradient - // leads to stable gradient updates, and retains symmetry of the updated matrix if it - // were updated by a gradient based algorithm. 
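Restated in standard notation (nothing beyond what the comment in the hunk already says, under its precondition that dA is Hermitian), the identity implemented by the new cholesky_jvp is:

```latex
% Cholesky JVP identity (restated from the comment above; dA assumed Hermitian)
A = L L^{H}, \qquad dA = dL\,L^{H} + L\,dL^{H}
\;\Longrightarrow\;
L^{-1}\,dA\,L^{-H} = \Phi + \Phi^{H} =: \operatorname{sym}(\Phi), \qquad \Phi = L^{-1}dL .
% \Phi is lower triangular, and on that subspace sym is inverted by
% \pi(X) = \operatorname{tril}(X) - \tfrac{1}{2}\operatorname{diag}(X), so
dL = L\,\pi\!\bigl(L^{-1}\,dA\,L^{-H}\bigr).
```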
- if (upper) { - L = L.mH(); - grad = grad.mH(); - } - auto L_inverse = at::linalg_solve_triangular(L, at::eye(L.size(-1), L.options()), /*upper=*/false); - auto phi = at::matmul(L.mH(), grad); - phi.tril_().diagonal(/*offset=*/0, /*dim1=*/-2, /*dim2=*/-1).mul_(0.5); - - auto grad_input = at::matmul(at::matmul(L_inverse.mH(), phi), L_inverse); - return grad_input.add(grad_input.mH()).mul_(0.5); // Symmetrizing the gradient +Tensor cholesky_backward(const Tensor& gL, bool upper, const Tensor& L) { + at::NoTF32Guard disable_tf32; + // From cholesky_jvp we have that + // dL = L\pi(L^{-1}dA(L^-H)) + // + // Let gL be the projection into the lower-triangular gradient wrt L. Taking adjoints we have + // gA = L^{-H}\pi^*((L^HgL).tril())L^{-1} + // where \pi^*(X) = 0.5 * (X + X^H - diag(X)) + // The only non-standard point of this derivation is noting that the adjoint to multiplying + // on the left by a lower triangular matrix L is multiplying by L^H and then projecting back to + // the lower triangular matrices (hence the .tril() projection) + // Note that the gradient is symmetric and not triangular. + auto L_ = upper ? L.mH() : L; + auto gL_ = upper ? gL.mH() : gL; + + // Nb. We don't need to compute gL_ = gL.tril() as + // tril(L^H gL) = tril(L^H (triu(gL, 1) + tril(gL))) + // = tril(L^H tril(gL)) + tril(L^H triu(gL, 1)) + // = tril(L^H tril(gL)) + // since tril(L^H triu(gL, 1)) = 0, as L^H triu(gL, 1) is upper triangular + auto gA = L_.mH().matmul(gL_).tril(); + // Equivalent to 0.5 * (gA + gA^H - diag(gA)) + gA = 0.5 * (gA + gA.tril(-1).mH()); + gA = at::linalg_solve_triangular(L_.mH(), gA, /*upper=*/true, /*left=*/true); + gA = at::linalg_solve_triangular(L_, gA, /*upper=*/false, /*left=*/false); + return gA; } Tensor cholesky_inverse_backward(Tensor grad, Tensor L, bool upper, Tensor inverse) { + at::NoTF32Guard disable_tf32; Tensor grad_L; if (grad.defined()) { - Tensor common_term = grad + grad.mT(); + Tensor common_term = grad + grad.mH(); common_term = at::matmul(inverse, at::matmul(common_term, inverse)); if (upper) { grad_L = -at::matmul(L, common_term); } else { grad_L = -at::matmul(common_term, L); } - } else { - grad_L = at::zeros({1}, L.options()).expand_as(L); } + return grad_L; } +// If X = (L L^H)^{-1} with L lower-triangular with a real positive diagonal, +// then dX = K^H + K, where +// K = L^{-H} dL^{-1} [dL^{-1} = -L^{-1} dL L^{-1}] +// = -L^{-H} L^{-1} dL L^{-1} [L^{-H} L^{-1} = X] +// = -X dL L^{-1} [X = X^H = L^{-H} L^{-1} = L^{-1} L^{-H}] +// = -X dL X L^{H}. +// If X = (U^H U)^{-1} with U upper-triangular with a real positive diagonal, +// then K becomes +// K = -X dU^H X U +Tensor cholesky_inverse_jvp(const Tensor& F, const Tensor& dF, const Tensor& X, bool upper) { + at::NoTF32Guard disable_tf32; + const auto CF = upper ? F : F.mH(); + const auto dCF = upper ? dF.mH() : dF; + const auto partial_dX = -X.matmul(dCF).matmul(X).matmul(CF); + return partial_dX + partial_dX.mH(); +} + // The formula for forward AD is adapted from // // Golub, Gene H., and Victor Pereyra. 
"The Differentiation of Pseudo-Inverses and Nonlinear @@ -1308,11 +1435,18 @@ Tensor split_backward(const std::vector &grads, Tensor max_pool_double_backward(const Tensor & grad, const Tensor & indices, int dim) { AT_ASSERT(indices.dim() >= dim); - auto size = indices.sizes().slice(0, indices.dim() - dim).vec(); - size.push_back(-1); - auto indices_view = indices.view(size); - const auto memory_format = indices.suggest_memory_format(); - return grad.contiguous(memory_format).view(size).gather(-1, indices_view).view(indices.sizes()); + // handle non-empty inputs + if (indices.numel()) { + auto size = indices.sizes().slice(0, indices.dim() - dim).vec(); + size.push_back(-1); + auto indices_view = indices.view(size); + const auto memory_format = indices.suggest_memory_format(); + return grad.contiguous(memory_format).view(size).gather(-1, indices_view).view(indices.sizes()); + } + // handle empty inputs + else { + return at::empty_like(indices, grad.options()); + } } Tensor glu_double_backward(const Tensor & grad, const Tensor & grad_output, const Tensor & input, int64_t dim) { @@ -1445,6 +1579,45 @@ Tensor binary_cross_entropy_target_backward( return grad_target; } +Tensor binary_cross_entropy_double_backward_target( + const Tensor& grad, + const Tensor& grad_output, + const Tensor& self, + const Tensor& target, + const c10::optional& weight, + int64_t reduction +) { + auto res = -grad * grad_output; + + if (isDefined(weight)) { + res = isTensorSubclassLike(weight.value()) + ? res.mul(weight.value()) + : res.mul_(weight.value()); + } + + auto neg_self = 1 - self; + auto denom = isTensorSubclassLike(self) + ? neg_self.mul(self) + : neg_self.mul_(self); + { + at::NoGradGuard guard; + // Default eps in binary_cross_entropy for ALL dtypes + // TODO: probably change this to a dtype-dependent value + double eps = 1e-12; + denom.clamp_min_(eps); + } + + res = isTensorSubclassLike(denom) + ? res.div(denom) + : res.div_(denom); + + if (reduction == at::Reduction::Mean) { + res.div_(target.numel()); + } + + return res; +} + Tensor binary_cross_entropy_with_logits_target_backward(const Tensor& grad_output, const Tensor& self, const Tensor& target, const c10::optional& weight, const c10::optional& pos_weight, int64_t reduction) { Tensor grad_target; @@ -1497,8 +1670,8 @@ Tensor binary_cross_entropy_with_logits_jvp(const Tensor& input_t, const Tensor& } if (weight.defined()) { - grad_input.mul_(weight); - grad_target.mul_(weight); + grad_input = grad_input.mul(weight); + grad_target = grad_target.mul(weight); } return apply_loss_reduction(grad_target + grad_input, reduction); } @@ -2249,6 +2422,22 @@ std::tuple atan2_backward(const Tensor& grad, const Tensor& self output_mask[1] ? grad * -self * recip : Tensor() }; } +Tensor prelu_jvp(const Tensor& x, const Tensor& dx, const Tensor& w, const Tensor& dw) { + const auto ndim = x.dim(); + auto as_nd = [ndim](const Tensor& t) { + std::vector sizes(ndim, 1), strides(ndim, 0); + if (ndim >= 2) { + sizes[1] = t.dim() == 1 ? t.sizes()[0] : 1; + strides[1] = t.dim() == 1 ? t.strides()[0] : 0; + return t.as_strided(sizes, strides); + } + return t.as_strided(sizes, strides); + }; + auto w_ = as_nd(w); + auto dw_ = as_nd(dw); + return at::where(x >= 0, dx, w_ * dx + dw_ * x); +} + // TODO: Seriously consider writing the derivative formulas for // each output separately; there is not all that much sharing // of computation going on here. 
@@ -2338,6 +2527,47 @@ std::tuple prelu_double_backward( } } +Tensor gelu_double_backward( + const Tensor & ggI, + const Tensor & gO, + const Tensor & input, + c10::string_view approximate) { + //if (at::native::get_gelutype_enum(approximate) == at::native::GeluType::Tanh) { + if (approximate == "tanh") { + constexpr auto kBeta = M_SQRT2 * M_2_SQRTPI * 0.5; + constexpr auto kKappa = 0.044715; + + auto inner = kBeta * (input + kKappa * pow(input, 3)); + auto tanh_inner = tanh(inner); + auto sech_inner = 1 / cosh(inner); + + auto f = 0.5 * input; + auto g = 1 - tanh_inner * tanh_inner; + auto h = kBeta * (1 + 3 * kKappa * input * input); + + auto f_prime_gh = 0.5 * g * h; + + auto g_prime = (2 * sech_inner) * (-sech_inner * tanh_inner) * h; + auto g_prime_fh = f * h * g_prime; + + auto h_prime = 6 * kKappa * input * kBeta; + auto h_prime_fg = f * g * h_prime; + + // left_derivative = f_prime_gh + // right_derivative = f_prime_gh + g_prime_fh + h_prime_fg + // dgrad_dX = left_derivative + right_derivative + auto gI = ggI * gO * (2 * f_prime_gh + g_prime_fh + h_prime_fg); + return gI; + } else { + constexpr auto kBeta = M_2_SQRTPI * M_SQRT1_2 * 0.5; + auto input_sq = input * input; + auto pdf = kBeta * at::exp(-0.5 * input_sq); + auto dgrad_dInput = 2 * pdf - input_sq * pdf; + auto gI = ggI * gO * dgrad_dInput; + return gI; + } +} + Tensor elu_double_backward( const Tensor& grad, const Tensor& grad_output, @@ -2372,6 +2602,7 @@ std::tuple linalg_svd_jvp(const Tensor& dA, const Tensor& S, const Tensor& Vh_, const bool full_matrices) { + at::NoTF32Guard disable_tf32; // See svd_backward for the derivation // With sym(X) = X + X^H, we implement // dU = U (sym(dX S) / E + i Im(diag(dX)) / (2S)) @@ -2475,6 +2706,7 @@ Tensor svd_backward(const Tensor& gU, const Tensor& U, const Tensor& S, const Tensor& Vh) { + at::NoTF32Guard disable_tf32; // Throughout both the real and complex case we assume A has distinct singular values. // Furthermore, if A is rectangular or complex, we assume it's full-rank. // @@ -2684,6 +2916,7 @@ Tensor svd_backward(const Tensor& gU, // See the details below. Tensor eig_backward(const std::vector &grads, const Tensor& self, bool is_eigvec_tensor_nonempty, const Tensor& eigenvalues, const Tensor& eigenvectors) { + at::NoTF32Guard disable_tf32; TORCH_CHECK(is_eigvec_tensor_nonempty, "eig_backward: torch.eig(eigenvalues=False) is not differentiable. ", "Please use torch.linalg.eigvals"); @@ -2823,6 +3056,7 @@ Tensor linalg_eig_backward(const Tensor& gL, const Tensor& V, const bool is_hermitian, const bool symeig_eigenvectors) { + at::NoTF32Guard disable_tf32; // https://arxiv.org/pdf/1701.00392.pdf Eq 4.77 // For A = VLV^{-1}, denoting the gradients gA, gV and gL, we have // gA = V^{-H}(diag_embed(gL) + (V^H gV -V^HV diag(real(V^H gV))) / E*)V^H @@ -2905,6 +3139,7 @@ std::tuple linalg_eig_jvp(const Tensor& dA, const Tensor& L, const Tensor& V, const bool is_hermitian) { + at::NoTF32Guard disable_tf32; // https://people.maths.ox.ac.uk/gilesm/files/NA-08-01.pdf // see also https://arxiv.org/pdf/1701.00392.pdf Eqs. 
(4.60) and (4.63) // Note that neither of the formulas in these pdfs are correct, as they do not assume that @@ -2917,12 +3152,7 @@ std::tuple linalg_eig_jvp(const Tensor& dA, // E_{ij} = L_j - L_i if i != j // 1 otherwise - // Note: The Hermitian case is a simplification of this formula using that V^{-1} = V^H and that L is real - if (is_hermitian) { - TORCH_CHECK(at::allclose(dA, dA.mH(), /*rtol=*/1e-2, /*atol=*/1e-2), - "linalg_eig_jvp: The tangent part of the matrix A should also be ", (dA.is_complex() ? "Hermitian" : "symmetric.")); - } - + // Precondition: if is_hermitian == true, then dA is Hermitian const auto to_complex = [](const Tensor& A){ return A.to(c10::toComplexType(A.scalar_type())); }; const auto dP = is_hermitian ? at::matmul(at::matmul(V.mH(), dA), V) @@ -2952,6 +3182,7 @@ Tensor linalg_lstsq_jvp( const Tensor& dA, const Tensor& dB ) { + at::NoTF32Guard disable_tf32; auto pinvA = at::linalg_pinv(A); auto dpinvA = pinv_jvp(A, pinvA, dA); auto dX = dpinvA.matmul(B) + pinvA.matmul(dB); @@ -2966,6 +3197,7 @@ std::tuple linalg_lstsq_backward( const c10::optional driver, const std::array& grad_input_mask ) { + at::NoTF32Guard disable_tf32; Tensor A_grad, B_grad; if (!grad.defined()) { return std::make_tuple(A_grad, B_grad); @@ -2997,168 +3229,179 @@ std::tuple linalg_lstsq_backward( std::tuple linalg_qr_jvp( const Tensor& dA, const Tensor& Q, - const Tensor& R + const Tensor& R, + const c10::string_view mode ) { - auto m = dA.size(-2); - auto n = dA.size(-1); - auto k = std::min(m, n); - - auto dA1 = dA.narrow(-1, 0, k); - auto R1 = R.narrow(-1, 0, k); - - // dB1 = Q^H dA1 R1^{-1} - auto dB1 = at::linalg_solve_triangular(R1, Q.mH().matmul(dA1), /*upper=*/true, /*left=*/false); - - // dC1 = (dB1 + dB1^H).triu(-1) + (dB1 + dB1^H) * 0.5 I - auto dC1 = (dB1 + dB1.mH()).triu(); - dC1.diagonal(0, -2, -1).mul_(0.5); + // dA = dQR + QdR + // + // Case m >= n + // We can put dQ in terms of dR + // dQ = dAR^{-1} - QdRR^{-1} + // Then we have + // Q^H dA R^{-1} = Q^HdQ + dRR^{-1} + // where Q^HdQ is skew Hermitian and dRR^{-1} is upper triangular + // Define sym(X) = X + X^H + // sym(dRR^{-1}) = sym(Q^H dA R^{-1}) + // and define syminv(X) = triu(X) - 0.5 * diag(X) the inverse of + // sym : Triu(k, diag \in \mathbb{R}) -> Her(k) to give + // dR = syminv(sym(Q^H dA R^{-1}))R + // + // Case m < n + // Put dR as a function of dQ + // dR = Q^H dA - Q^H dQ R + // Let X_1 be the main m x m submatrix of a matrix X \in C^{m x n} + // Q^H A_1 R_1^{-1} = Q^H dQ + dR_1 R_1^{-1} + // Define trilIm(X) = X.tril(-1) + i * Im diag(X) + // trilIm(Q^H dQ) = trilIm(Q^H A_1 R_1^{-1}) + // and define trilIminv(X) = X - X^H - i*Im diag(X). This is the inverse of + // trilIm : Skew_C(m) -> Tril(m, imaginary diag) + // Note that it is just the inverse when the inputs are skew-Hermitian, not necessarily + // when the inputs are arbitrary matrices. We then get + // dQ = Q trilImInv(trilIm(Q^H A_1 R_1^{-1})) + at::NoTF32Guard disable_tf32; - auto dR1 = dC1.matmul(R1); + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + bool compute_q, reduced; + std::tie(compute_q, reduced) = at::native::_parse_qr_mode(mode); - // dQ = (dA1 - Q dR1) R1^{-1} - auto dQ = at::linalg_solve_triangular(R1, dA1 - Q.matmul(dR1), /*upper=*/true, /*left=*/false); + TORCH_CHECK(compute_q, "The derivative of linalg.qr depends on Q, which is not computed when " + "mode='r'. 
Please use linalg.qr(A, mode='reduced') if you are " + "going to differentiate through linalg.qr."); + auto m = dA.size(-2); + auto n = dA.size(-1); - Tensor dR; + TORCH_CHECK(reduced || m <= n, "The QR decomposition is not differentiable when " + "mode='complete' and nrows > ncols."); if (m >= n) { - dR = dR1; - } - else { - auto dA2 = dA.narrow(-1, k, n - k); - auto R2 = R.narrow(-1, k, n - k); - auto dR2 = Q.mH().matmul(dA2 - dQ.matmul(R2)); - dR = at::cat({dR1, dR2}, -1); - } - - return std::make_tuple(dQ, dR); -} + const auto sym = [](const Tensor& X) { return X + X.mH(); }; + const auto syminv = [](const Tensor& X) { + auto ret = X.triu(); + ret.diagonal(0, -2, -1).mul_(0.5); + return ret; + }; + auto dARinv = at::linalg_solve_triangular(R, dA, /*upper=*/true, /*left=*/false); + auto dR = syminv(sym(Q.mH().matmul(dARinv))); + auto dQ = dARinv - Q.matmul(dR); + dR = dR.matmul(R); + return std::make_tuple(std::move(dQ), std::move(dR)); + } else { + const auto trilim = [](const Tensor& X) { + if (X.is_complex()) { + auto ret = X.tril(); + at::real(ret.diagonal(0, -2, -1)).zero_(); + return ret; + } else { + return X.tril(-1); + } + }; + const auto triliminv = [](const Tensor& X) { + if (X.is_complex()) { + auto ret = X - X.mH(); + ret.diagonal(0, -2, -1).mul_(0.5); + return ret; + } else { + return X - X.mT() ; + } + }; -Tensor linalg_qr_jvp_Q( - const Tensor& dA, - const Tensor& Q, - const Tensor& R -) { - return std::get<0>(linalg_qr_jvp(dA, Q, R)); + auto QHdA = Q.mH().matmul(dA); + auto QHdA1Rinv = at::linalg_solve_triangular(R.narrow(-1, 0, m), QHdA.narrow(-1, 0, m), /*upper=*/true, /*left=*/false); + auto dQ = triliminv(trilim(QHdA1Rinv)); + auto dR = QHdA - dQ.matmul(R); + dQ = Q.matmul(dQ); + return std::make_tuple(std::move(dQ), std::move(dR)); + } } -Tensor linalg_qr_jvp_R( - const Tensor& dA, - const Tensor& Q, - const Tensor& R -) { - return std::get<1>(linalg_qr_jvp(dA, Q, R)); -} +Tensor linalg_qr_backward(const Tensor& gQ, const Tensor& gR, + const Tensor& Q, const Tensor& R, + const c10::string_view mode) { + // Nb. We won't be too formal below, as writing this proof formaly is a pain + // We'll link here a formal writing of all this at some point in the future + // + // Case m >= n + // dQ = dAR^{-1} - Qsyminv(sym(Q^H dA R^{-1})) + // dR = syminv(sym(Q^H dA R^{-1}))R + // + // With the notation from the JVP formla, the only two computations that we need are + // syminv*(R) = 0.5 * (R.triu() + R.triu()^H - Re diag(R)) + // sym*(X) = 2 * X + // Using these, after a few simplifications we get that + // gA = (gQ + syminvadj(triu(gR R^H - Q^H gQ)))R^{-H} + // + // Case m < n + // dR = Q^H dA - Q^H dQ R + // dQ = Q trilImInv(trilIm(Q^H A_1 R_1^{-1})) + // + // In this case trilIm*(X) = X (it's the trivial embedding) + // while trilImInv*(X) = tril(Y) - 0.5 * diag(Y) + // with Y = X - X^H + // + // We also have that if X \in C^{m, n} an dpi(X) = X_1, + // projects X into its leading m x m submatrix, + // pi*(X) = cat(X, 0_{m,n-m}, dim=-1) + // + // Using this, we get that + // gA = QgR + pi*(Q trilImInv*(Q^H gQ - gR R^H)R_1^{-H}) + at::NoTF32Guard disable_tf32; -Tensor linalg_qr_backward(const std::vector &grads, const Tensor& self, - c10::string_view mode, const Tensor& q, const Tensor& r){ // NOLINTNEXTLINE(cppcoreguidelines-init-variables) bool compute_q, reduced; std::tie(compute_q, reduced) = at::native::_parse_qr_mode(mode); - TORCH_CHECK(compute_q, "The derivative of qr is not implemented when mode='r'. 
" - "Please use torch.linalg.qr(..., mode='reduced')"); - - auto square_deep_case_backward = [](const Tensor& grad_Q, - const Tensor& grad_R, - const Tensor& A, - const Tensor& Q, - const Tensor& R) -> Tensor { - // For square and deep (tall) case we refer: - // Matthias Seeger, Asmus Hetzel, Zhenwen Dai, Eric Meissner, Neil D. Lawrence (2018). Auto-Differentiating Linear Algebra. - // https://arxiv.org/abs/1710.08717 Section 4.3 LQ Decomposition (Note that LQ decomposition is the transpose of QR decomposition) - // Hai-Jun Liao, Jin-Guo Liu, Lei Wang, Tao Xiang (2019). Differentiable Programming Tensor Networks. - // https://arxiv.org/abs/1903.09650 Section 3. QR factorization - // For derivations of complex-valued input case, see https://giggleliu.github.io/2019/04/02/einsumbp.html - - // Compute R grad_R^H - Tensor R_term; - if (grad_R.defined()) { - R_term = at::matmul(R, grad_R.mH()); - } else { - // R is ... x N x N, grad_R is ... x N x N and grad_R.T is ... x N x N - R_term = at::zeros_like(R, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - } - // Compute grad_Q^H Q - Tensor Q_term; - if (grad_Q.defined()) { - Q_term = at::matmul(grad_Q.mH(), Q); - } else { - // Q is ... x M x N, Q.T is ... x N x M and grad_Q is ... x M x N - Q_term = at::zeros_like(R, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - } + TORCH_CHECK(compute_q, "The derivative of linalg.qr depends on Q, which is not computed when " + "mode='r'. Please use linalg.qr(A, mode='reduced') if you are " + "going to differentiate through linalg.qr."); - Tensor M = R_term - Q_term; + auto m = Q.size(-2); + auto n = R.size(-1); - // Compute M = (tril(M) + tril(M).mH()) * 0.5 Identity - Tensor M_tril = at::tril(M); - M = M_tril + M_tril.mH(); - M.diagonal(0, -2, -1).mul_(0.5); + TORCH_CHECK(reduced || m <= n, "The QR decomposition is not differentiable when " + "mode='complete' and nrows > ncols."); - Tensor rhs_term; - if (grad_Q.defined()) { - rhs_term = grad_Q + at::matmul(Q, M); + if (!gQ.defined() && !gR.defined()) { + return {}; + } + + Tensor gA; + if (gQ.defined()) { + if (gR.defined()) { + gA = gR.matmul(R.mH()) - Q.mH().matmul(gQ); } else { - rhs_term = at::matmul(Q, M); + gA = -Q.mH().matmul(gQ); } - - // Compute rhs_term @ R^{-H} - Tensor grad_A = at::linalg_solve_triangular( - R.transpose(-2, -1).conj(), - rhs_term, - /*upper=*/false, - /*left=*/false, - /*unitriangular=*/false); - - return grad_A; - }; - - auto m = self.size(-2); - auto n = self.size(-1); - - TORCH_CHECK( - ((m <= n && (!reduced)) || reduced), - "The derivative of qr is not implemented when mode='complete' and nrows > ncols."); - - auto grad_Q = grads[0]; - auto grad_R = grads[1]; - - if (m >= n) { - return square_deep_case_backward(grad_Q, grad_R, self, q, r); } else { - // For wide (m < n) input matrices A, partition A = [X|Y] and R = [U|V] - // X and U are square full rank matrices. We will partition grads, - // grad_R = [grad_U | grad_V] and grad_A = [grad_X | grad_Y]. - // To obtain grad_X we reuse the gradient formula from the square case. - // Formulae: grad_X = square_case_grad(grad_Q_prime, grad_U, Q, U), - // where grad_Q_prime = grad_Q + Y @ grad_V^H - // and grad_Y = Q @ grad_V. - // Then concatenate grads to get grad_A = [grad_X | grad_Y]. 
- - auto Y = self.narrow(-1, m, n - m); - auto U = r.narrow(-1, 0, m); - Tensor grad_Y, grad_X, grad_V, grad_Q_prime; - - if (grad_R.defined()) { - grad_V = grad_R.narrow(-1, m, n - m); - // reuse grad_R to store grad_U - grad_R = grad_R.narrow(-1, 0, m); - // grad_Q_prime starts with the value of Y @ grad_V^H - grad_Q_prime = at::matmul(Y, grad_V.mH()); - } else { - // when grad_R is not defined then grad_V and grad_Q_prime - // get initialized with zeros - grad_V = at::zeros_like(Y, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - grad_Q_prime = at::zeros_like(q, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + gA = gR.matmul(R.mH()); + } + if (m >= n) { + const auto syminvadj = [](const Tensor& X) { + auto ret = X + X.mH(); + at::real(ret.diagonal(0, -2, -1)).mul_(0.5); + return ret; + }; + gA = Q.matmul(syminvadj(gA.triu())); + if (gQ.defined()) { + gA = gA + gQ; } - - if (grad_Q.defined()) { - // add the grad_Q term into grad_Q_prime when defined o/w is 0 - grad_Q_prime = grad_Q_prime + grad_Q; + gA = at::linalg_solve_triangular(R.mH(), gA, /*upper*/false, /*left*/false); + return gA; + } else { + auto trilImInvAdjSkew = [](const Tensor& X) { + auto ret = (X - X.mH()).tril(); + if (X.is_complex()) { + at::imag(ret.diagonal(0, -2, -1)).mul_(0.5); + } + return ret; + }; + gA = Q.matmul(trilImInvAdjSkew(-gA)); + gA = at::linalg_solve_triangular(R.narrow(-1, 0, m).mH(), gA, /*upper*/false, /*left*/false); + auto shape = R.sizes().vec(); + shape.end()[-1] = n - m; + gA = at::cat({gA, gA.new_zeros(shape)}, /*dim=*/-1); + if (gR.defined()) { + gA = gA + Q.matmul(gR); } - // Calculate grad_X using the helper. Grad_R contains the grad_U value - grad_X = square_deep_case_backward(grad_Q_prime, grad_R, self, q, U); - grad_Y = at::matmul(q, grad_V); - // Concatenate grad_X and grad_Y to get grad_A. 
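Many of the hunks that follow prepend `at::NoTF32Guard disable_tf32;` to linear-algebra backward/JVP formulas. A minimal sketch of the pattern, with a hypothetical `example_backward` standing in for any of them; the rationale given in the comment (keeping matmul-heavy gradients at full float32 precision) is an inference rather than something spelled out in the diff.

```cpp
#include <ATen/ATen.h>

// Hypothetical backward formula illustrating the RAII guard used in the hunks below:
// TF32 is disabled for the lifetime of `disable_tf32`, so the matmul runs at full
// float32 precision; the previous TF32 setting is restored on scope exit.
at::Tensor example_backward(const at::Tensor& grad, const at::Tensor& A) {
  at::NoTF32Guard disable_tf32;
  return at::matmul(grad, A.mH());
}
```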
- return at::cat({grad_X, grad_Y}, -1); + return gA; } } @@ -3239,7 +3482,7 @@ Tensor det_backward(const Tensor & grad, const Tensor& self, const Tensor& det) return svd_backward(u_grad, s_grad, vh_grad, u, s, vh); }; - auto eps = at::native::_get_epsilon(c10::toValueType(self.scalar_type())); + auto eps = at::native::_get_epsilon(c10::toRealValueType(self.scalar_type())); auto singular_det_cutoff = eps * at::linalg_matrix_norm(self); if (self.dim() == 2) { @@ -3440,6 +3683,7 @@ std::tuple triangular_solve_backward( const Tensor & b, const Tensor & a, const Tensor & x, const bool upper, const bool transpose, const bool unitriangular, std::array output_mask) { + at::NoTF32Guard disable_tf32; Tensor grad_b, grad_a; if (grad_x.defined() || grad_m.defined()) { if (grad_x.defined()) { @@ -3489,6 +3733,7 @@ Tensor linalg_solve_triangular_forward_AD( const bool upper, const bool left, const bool unitriangular) { + at::NoTF32Guard disable_tf32; // The forward AD formula (for left = true) is A^{-1}(B_t - A_tX) // For the derivation see: // [Note: Forward / Backward AD solve_triangular] @@ -3506,6 +3751,7 @@ std::tuple linalg_solve_triangular_backward( const bool left, const bool unitriangular, std::array output_mask) { + at::NoTF32Guard disable_tf32; const bool A_requires_grad = output_mask[0]; const bool B_requires_grad = output_mask[1]; // [Note: Forward / Backward AD solve_triangular] @@ -3556,6 +3802,7 @@ std::tuple linalg_solve_triangular_backward( std::tuple cholesky_solve_backward( const Tensor& grad_x, const Tensor& self, const Tensor& input2, const Tensor& result, const bool upper) { + at::NoTF32Guard disable_tf32; Tensor grad_self, grad_input2; if (grad_x.defined()) { grad_self = grad_x.cholesky_solve(input2, /*upper=*/upper); @@ -3579,6 +3826,7 @@ Tensor cholesky_solve_jvp( const Tensor& dB, const bool upper ) { + at::NoTF32Guard disable_tf32; auto dK = upper ? dU.mH().matmul(U) : dU.matmul(U.mH()); auto dA = dK + dK.mH(); @@ -3649,7 +3897,7 @@ Tensor fft_r2c_backward(const Tensor& grad, IntArrayRef dim, int64_t normalizati new_grad_shape[last_dim] = last_dim_size; const auto zero_length = last_dim_size - grad.size(dim.back()); - auto complex_full_grad = zero_length > 0 ? at::zeros(new_grad_shape, grad.options()) : grad; + auto complex_full_grad = zero_length > 0 ? 
grad.new_zeros(new_grad_shape) : grad; if (zero_length > 0) { complex_full_grad.slice(last_dim, 0, half_sizes[last_dim]).copy_(grad); } @@ -4505,6 +4753,7 @@ std::tuple lu_solve_backward( const Tensor& LU_data, const Tensor& LU_pivots, const std::array& grad_input_mask) { + at::NoTF32Guard disable_tf32; const bool B_requires_grad = grad_input_mask[0]; const bool LU_data_requires_grad = grad_input_mask[1]; if (!grad.defined() || (!B_requires_grad && !LU_data_requires_grad)) { @@ -4572,6 +4821,7 @@ Tensor lu_solve_jvp( const Tensor& dB, const Tensor& LU_pivots ) { + at::NoTF32Guard disable_tf32; Tensor L, U, dL, dU; std::tie(std::ignore, L, U) = at::lu_unpack(LU_data, LU_pivots, /*unpack_data=*/true, /*unpack_pivots=*/false); dL = dLU_data.tril(-1); @@ -4596,35 +4846,50 @@ Tensor lu_solve_jvp( } Tensor lu_unpack_backward( - const variable_list& grads, - const Tensor& LU_data, - bool unpack_data + const Tensor& L_grad, + const Tensor& U_grad, + const int64_t m, + const int64_t n ) { - auto L_grad = grads[1]; - auto U_grad = grads[2]; - - auto m = LU_data.size(-2); - auto n = LU_data.size(-1); - auto k = std::min(m, n); - - TORCH_CHECK(unpack_data, "lu_unpack_backward: cannot compute gradients unless unpack_data=True"); + if (!L_grad.defined() && !U_grad.defined()) { + return {}; + } + const auto k = std::min(m, n); - auto res = at::zeros(LU_data.sizes(), LU_data.options()); + // Getters for the principal and complementary part of the matrices + const auto get_L1 = [m, k](const Tensor& L) { return m == k ? L.tril(-1) : L.narrow(-2, 0, k).tril(-1); }; + const auto get_L2 = [m, k](const Tensor& L) { return L.narrow(-2, k, m - k); }; + const auto get_U1 = [n, k](const Tensor& U) { return n == k ? U.triu() : U.narrow(-1, 0, k).triu(); }; + const auto get_U2 = [n, k](const Tensor& U) { return U.narrow(-1, k, n - k); }; - Tensor L_grad_contrib; if (L_grad.defined()) { - L_grad_contrib = L_grad.tril(); - L_grad_contrib.diagonal(0, -2, -1).fill_(0); - res.narrow(-2, 0, m).narrow(-1, 0, k).add_(L_grad_contrib); - } - - Tensor U_grad_contrib; - if (U_grad.defined()) { - U_grad_contrib = U_grad.triu(); - res.narrow(-2, 0, k).narrow(-1, 0, n).add_(U_grad_contrib); + if (U_grad.defined()) { + if (m == n) { + return L_grad.tril(-1) + U_grad.triu(); + } else { + auto A1_grad = get_L1(L_grad) + get_U1(U_grad); + auto A2_grad = m > n ? get_L2(L_grad) : get_U2(U_grad); + const auto dim = m > n ? -2 : -1; + return at::cat({std::move(A1_grad), std::move(A2_grad)}, /*dim=*/dim); + } + } else { + if (m >= n) { + return L_grad.tril(-1); + } else { + auto size = L_grad.sizes().vec(); + size.end()[-1] = n - m; + return at::cat({L_grad.tril(-1), at::zeros(size, L_grad.options())}, /*dim=*/-1); + } + } + } else { + if (n >= m) { + return U_grad.triu(); + } else { + auto size = U_grad.sizes().vec(); + size.end()[-2] = m - n; + return at::cat({U_grad.triu(), at::zeros(size, U_grad.options())}, /*dim=*/-2); + } } - - return res; } Tensor cat_jvp(at::TensorList tensors, int64_t dim) { @@ -4639,7 +4904,7 @@ Tensor cat_jvp(at::TensorList tensors, int64_t dim) { std::vector fw_grads; for (auto& t: tensors) { - fw_grads.push_back(isFwGradDefined(t)? t._fw_grad(/*level*/ 0): at::zeros_like(t)); + fw_grads.push_back(isFwGradDefined(t)? t._fw_grad(/*level*/ 0): at::_efficientzerotensor(t.sizes(), t.options())); } out_fw_grad = at::cat(fw_grads, dim); @@ -4662,7 +4927,7 @@ Tensor stack_jvp(at::TensorList tensors, int64_t dim) { std::vector fw_grads; for (auto& t: tensors) { - fw_grads.push_back(isFwGradDefined(t)? 
t._fw_grad(/*level*/ 0): at::zeros_like(t)); + fw_grads.push_back(isFwGradDefined(t)? t._fw_grad(/*level*/ 0): at::_efficientzerotensor(t.sizes(), t.options())); } out_fw_grad = at::stack(fw_grads, dim); } @@ -4799,6 +5064,9 @@ Tensor batch_norm_jvp( TORCH_INTERNAL_ASSERT( running_mean.has_value() && running_var.has_value(), "Expect running_mean and running_var to have value when train=false"); + TORCH_CHECK( + !running_mean.value()._fw_grad(/*level=*/0).defined() && !running_var.value()._fw_grad(/*level=*/0).defined(), + "batch_norm is not differentiable wrt running_mean and running_var, they cannot have forward grad defined"); mean_p = running_mean.value().view(view_size); invstd_p = (1 / at::sqrt(running_var.value() + at::Scalar(eps))).view(view_size); result_t = input_t * invstd_p; @@ -4885,7 +5153,6 @@ Tensor group_norm_jvp( Tensor group_norm_mean_jvp( const Tensor& input_t, const Tensor& mean_p, int64_t groups) { int64_t N = input_t.size(0); - int64_t C = input_t.size(1); std::array view_shape = {1, N * groups, N ? -1 : 1}; auto input_t_reshaped = input_t.view(view_shape); return input_t_reshaped.mean({2}, false).view_as(mean_p); @@ -4896,7 +5163,6 @@ Tensor group_norm_invstd_jvp( const Tensor& mean_p, const Tensor& invstd_p, int64_t groups) { int64_t N = input_p.size(0); - int64_t C = input_p.size(1); std::vector view_shape = {1, N * groups, N ? -1 : 1}; @@ -4921,8 +5187,8 @@ Tensor gather_with_keepdimed_indices(const Tensor& input, int64_t dim, const Ten return out_fw_grad; } -// Let X in \C^{m \times n}, then its pivoted LU decomposition is -// X = P L U, where P is a permutation matrix. +// Let A in \C^{m \times n}, then its pivoted LU decomposition is +// A = P L U, where P is a permutation matrix. // // Useful notation: // Let o denote the elementwise, or Hadamard, product. @@ -4934,159 +5200,193 @@ Tensor gather_with_keepdimed_indices(const Tensor& input, int64_t dim, const Ten // // Below we derive the backward algorithm for the case when m <= n. // The case m > n could be obtained using the same idea. -// Since we assume m <= n, the LU decomposition of X could be written as -// X = (X1 | X2) = P L (U1 | U2) where X1, U1 in \C^{m \times m}, X2, U2 in \C^{m, n - m} +// Since we assume m <= n, the LU decomposition of A could be written as +// A = (A1 | A2) = P L (U1 | U2) where A1, U1 in \C^{m \times m}, A2, U2 in \C^{m, n - m} // // Forward AD: // -// dX = P dL U + P L dU => [left-multiply P^T] -// (P^T dX1 | P^T dX2) = (dL U1 + L dU1 | dL U2 + L dU2) (*) +// dA = P dL U + P L dU => [left-multiply P^T] +// (P^T dA1 | P^T dA2) = (dL U1 + L dU1 | dL U2 + L dU2) (*) // From (*): -// P^T dX1 = dL U1 + L dU1 => [left-multiply by L^{-1}, right-multiply by U1^{-1}] -// L^{-1} P^T dX1 U1^{-1} = L^{-1} dL + dU1 U1^{-1} (**). +// P^T dA1 = dL U1 + L dU1 => [left-multiply by L^{-1}, right-multiply by U1^{-1}] +// L^{-1} P^T dA1 U1^{-1} = L^{-1} dL + dU1 U1^{-1} (**). // Note, L is lower-triangular, and so is its inverse, hence L^{-1} dL is lower-triangular. // Also, since the diagonal of L (all ones) is never exposed explicity (packed representation), // the diagonal of dL is zero, and hence diag(L^{-1} dL) = 0. // Assuming that U1 is full-rank, similarly, dU1 U1^{-1} is upper-triangular. // Combining these observations we conclude: // -// L^{-1} dL = (L^{-1} P^T dX1 U1^{-1}) o 1_L, -// dU1 U1^{-1} = (L^{-1} P^T dX1 U1^{-1}) o 1_U. +// L^{-1} dL = (L^{-1} P^T dA1 U1^{-1}) o 1_L, +// dU1 U1^{-1} = (L^{-1} P^T dA1 U1^{-1}) o 1_U. 
// // Hence, -// dL = L [(L^{-1} P^T dX1 U1^{-1}) o 1_L], -// dU1 = [(L^{-1} P^T dX1 U1^{-1}) o 1_U] U1. +// dL = L [(L^{-1} P^T dA1 U1^{-1}) o 1_L], +// dU1 = [(L^{-1} P^T dA1 U1^{-1}) o 1_U] U1. // As for dU2, from (*) it follows -// P^T dX2 = dL U2 + L dU2 => -// dU2 = L^{-1} (P^T dX2 - dL U2). +// P^T dA2 = dL U2 + L dU2 => +// dU2 = L^{-1} (P^T dA2 - dL U2). // // Backward AD: // // The following equality comes very handy: // Tr(A (B o C)) = Tr((A o B^T) C) (!) +// or in other words, given that X -> B o X is a pointwise operation +// its Jacobian is diagonal, so its differential is self-adjoint +// = // -// Tr(X_grad^H dX) = Tr(L_grad^H dL) + Tr(U_grad^H dU), then +// Tr(A_grad^H dA) = Tr(L_grad^H dL) + Tr(U_grad^H dU), then // -// Tr(L_grad^H dL) = Tr(L_grad^H L [(L^{-1} P^T dX1 U1^{-1}) o 1_L] = [using (!)] -// = Tr((L_grad^H L o 1_L^T) L^{-1} P^T dX1 U1^{-1}) = [using the cyclic property of Tr] -// = Tr(U1^{-1} (L_grad^H L o 1_L^T) L^{-1} P^T dX1) +// Tr(L_grad^H dL) = Tr(L_grad^H L [(L^{-1} P^T dA1 U1^{-1}) o 1_L] = [using (!)] +// = Tr((L_grad^H L o 1_L^T) L^{-1} P^T dA1 U1^{-1}) = [using the cyclic property of Tr] +// = Tr(U1^{-1} (L_grad^H L o 1_L^T) L^{-1} P^T dA1) // // Similar, using (!) and the cyclic property of the trace operator: // Tr(U_grad^H dU) = Tr(U1_grad^H dU1) + Tr(U2_grad^H dU2) -// = Tr(U1^{-1} (U1 U1_grad^H o 1_U^T) L^{-1} P^T dX1) -// + Tr(U2_grad^H L^{-1} P^T dX2) -// - Tr(U1^{-1} (U2 U2_grad^H o 1_L^T) L^{-1} P^T dX1) +// = Tr(U1^{-1} (U1 U1_grad^H o 1_U^T) L^{-1} P^T dA1) +// + Tr(U2_grad^H L^{-1} P^T dA2) +// - Tr(U1^{-1} (U2 U2_grad^H o 1_L^T) L^{-1} P^T dA1) // -// By combining the matrices to the left from dX1 and dX2 and then applying conjugate transposition, +// By combining the matrices to the left from dA1 and dA2 and then applying conjugate transposition, // we finally arrive at: // -// X1_grad = P L^{-H} [L^H L_grad o 1_L + U1_grad U1^H o 1_U - U2_grad U2^H o 1_L] U1^{-H}, -// X2_grad = P L^{-H} U2_grad -Tensor plu_backward_base( - const variable_list& grads, - const Tensor& self, +// A1_grad = P L^{-H} [L^H L_grad o 1_L + U1_grad U1^H o 1_U - U2_grad U2^H o 1_L] U1^{-H}, +// A2_grad = P L^{-H} U2_grad +Tensor linalg_lu_backward( + const Tensor& L_grad, + const Tensor& U_grad, const Tensor& P, const Tensor& L, - const Tensor& U) { - auto L_grad = grads[0]; - auto U_grad = grads[1]; + const Tensor& U, + const bool pivot) { + at::NoTF32Guard disable_tf32; + // Return early if there's nothing to do + if (!L_grad.defined() && !U_grad.defined()) { + return {}; + } - auto m = self.size(-2); - auto n = self.size(-1); + // L.shape == (..., m, k) + // U.shape == (..., k, n) + auto m = L.size(-2); + auto n = U.size(-1); auto k = std::min(m, n); - auto L_principal = L.narrow(-2, 0, k).narrow(-1, 0, k); - auto L_principal_H = L_principal.mH(); - auto L_grad_principal = L_grad.narrow(-2, 0, k).narrow(-1, 0, k); - auto U_principal = U.narrow(-2, 0, k).narrow(-1, 0, k); - auto U_principal_H = U_principal.mH(); - auto U_grad_principal = U_grad.narrow(-2, 0, k).narrow(-1, 0, k); + if (m == n) { + // A_grad = P L^{-H} [L^H L_grad o 1_L + U_grad U^H o 1_U] U^{-H}, + auto A_grad = L_grad.defined() ? L.mH().matmul(L_grad).tril(-1) : Tensor{}; + if (U_grad.defined()) { + A_grad = A_grad.defined() ? 
A_grad + U_grad.matmul(U.mH()).triu() + : U_grad.matmul(U.mH()).triu(); + } + A_grad = at::linalg_solve_triangular(U.mH(), A_grad, + /*upper=*/false, + /*left=*/false); + A_grad = at::linalg_solve_triangular(L.mH(), A_grad, + /*upper=*/true, + /*left=*/true, + /*unitriangular=*/true); + + return pivot ? P.matmul(std::move(A_grad)) : A_grad; + } else if (m < n) { + // Wide case + // A1_grad = P L^{-H} [U1_grad + (L^H L_grad o 1_L - U_grad U^H o 1_U) U1^{-H}) U^{-H}] + // A2_grad = P L^{-H} U2_grad + const auto get_U1 = [n, k] (const Tensor& U) { return n == k ? U : U.narrow(-1, 0, k); }; + const auto get_U2 = [n, k] (const Tensor& U) { return U.narrow(-1, k, n - k); }; + + auto A_grad = L_grad.defined() ? L.mH().matmul(L_grad) : Tensor{}; + if (U_grad.defined()) { + A_grad = A_grad.defined() ? A_grad - U_grad.triu().matmul(U.mH()) + : - U_grad.triu().matmul(U.mH()); + } + A_grad = at::linalg_solve_triangular(get_U1(U).mH(), A_grad.tril(-1), + /*upper=*/false, + /*left=*/false); - auto phi_L = L_principal_H.matmul(L_grad_principal).tril(-1); - auto phi_U = U_grad_principal.matmul(U_principal_H).triu(); + if (U_grad.defined()) { + A_grad = at::cat({A_grad + get_U1(U_grad).triu(), get_U2(U_grad)}, /*dim=*/-1); + } - auto phi = phi_L + phi_U; + A_grad = at::linalg_solve_triangular(L.mH(), A_grad, + /*upper=*/true, + /*left=*/true, + /*unitriangular=*/true); - Tensor self_grad; - if (m <= n) { - auto U_complement = U.narrow(-2, 0, k).narrow(-1, k, n - k); - auto U_grad_complement = U_grad.narrow(-2, 0, k).narrow(-1, k, n - k); - - auto phi_complement = U_grad_complement.matmul(U_complement.mH()).tril(-1); - - // recall the result for X1_grad and X2_grad from above. - // It can be rewritten as - // (X1_grad | X2_grad) = P L^{-H} psi, where - // psi = (psi1 | psi2) - // = ([L^H L_grad o 1_L + U1_grad U1^H o 1_U - U2_grad U2^H o 1_L] U1^{-H} | U2_grad), - // so it is filled in parts. - - // solve for psi1 to avoid the inversion of U1^H - auto psi_principal = at::linalg_solve_triangular(U_principal_H, phi - phi_complement, - /*upper=*/false, - /*left=*/false, - /*unitriangular=*/false); - auto psi = at::cat({psi_principal, U_grad_complement}, /*dim=*/-1); - - self_grad = P.matmul(at::linalg_solve_triangular(L_principal_H, psi, - /*upper=*/true, - /*left=*/true, - /*unitriangular=*/true)); - } - else { - // variables psi and phi carry the same meaning as in the case (m <= n), - // albeit they are differently defined. - auto L_complement = L.narrow(-2, k, m - k).narrow(-1, 0, k); - auto L_grad_complement = L_grad.narrow(-2, k, m - k).narrow(-1, 0, k); + if (!U_grad.defined()) { + A_grad = at::cat({A_grad, at::zeros_like(get_U2(U))}, /*dim=*/-1); + } + if (pivot) { + A_grad = P.matmul(A_grad); + } + return A_grad; + } else { + // Tall case + // A1_grad = P [L1_grad + L^{-H} (U_grad U^H o 1_U - L^H L_grad o 1_L)]U^{-H} + // A2_grad = P L2_grad U^{-H} - auto phi_complement = L_complement.mH().matmul(L_grad_complement).triu(); + const auto get_L1 = [m, k] (const Tensor& L) { return m == k ? L : L.narrow(-2, 0, k); }; + const auto get_L2 = [m, k] (const Tensor& L) { return L.narrow(-2, k, m - k); }; + auto A_grad = U_grad.defined() ? U_grad.matmul(U.mH()) : Tensor{}; + if (L_grad.defined()) { + A_grad = A_grad.defined() ? 
A_grad - L.mH().matmul(L_grad.tril(-1)) + : - L.mH().matmul(L_grad.tril(-1)); + } + A_grad = at::linalg_solve_triangular(get_L1(L).mH(), A_grad.triu(), + /*upper=*/true, + /*left=*/true, + /*unitriangular=*/true); - auto psi_principal = at::linalg_solve_triangular(L_principal_H, phi - phi_complement, - /*upper=*/true, - /*left=*/true, - /*unitriangular=*/true); - auto psi = at::cat({psi_principal, L_grad_complement}, -2); + if (L_grad.defined()) { + A_grad = at::cat({A_grad + get_L1(L_grad).tril(-1), get_L2(L_grad)}, /*dim=*/-2); + } - self_grad = at::linalg_solve_triangular(U_principal_H, P.matmul(psi), - /*upper=*/false, - /*left=*/false, - /*unitriangular=*/false); - } + A_grad = at::linalg_solve_triangular(U.mH(), A_grad, + /*upper=*/false, + /*left=*/false); - return self_grad; + if (!L_grad.defined()) { + A_grad = at::cat({A_grad, at::zeros_like(get_L2(L))}, /*dim=*/-2); + } + if (pivot) { + A_grad = P.matmul(A_grad); + } + return A_grad; + } } Tensor lu_factor_ex_backward( const Tensor& grad, - const Tensor& self, const Tensor& LU, - const Tensor& pivs) { + const Tensor& pivs, + const bool pivot) { Tensor P, L, U; - std::tie(P, L, U) = at::lu_unpack(LU, pivs); - // Note that packed LU could be represented as - // LU = L + U - I, hence - // L_grad = LU_grad, - // U_grad = LU_grad. - return plu_backward_base({/*L_grad=*/grad, /*U_grad=*/grad}, self, P, L, U); + std::tie(P, L, U) = at::lu_unpack(LU, pivs, /*unpack_data=*/true, /*unpack_pivots*/pivot); + + // L.shape == (..., m, k) + // U.shape == (..., k, n) + const auto m = LU.size(-2); + const auto n = LU.size(-1); + const auto k = std::min(m, n); + const auto L_grad = grad.narrow(-1, 0, k); + const auto U_grad = grad.narrow(-2, 0, k); + return linalg_lu_backward(/*L_grad=*/L_grad, /*U_grad=*/U_grad, P, L, U, pivot); } -Tensor lu_factor_ex_jvp( +// This function is based on the forward AD derivations outlined +// in the description to the linalg_lu_backward function. +std::tuple linalg_lu_jvp( const Tensor& dA, - const Tensor& LU, - const Tensor& pivs -) { - // This function is based on the forward AD derivations outlined - // in the description to the plu_backward_base function. - - Tensor P, L, U; - std::tie(P, L, U) = at::lu_unpack(LU, pivs); + const Tensor& P, + const Tensor& L, + const Tensor& U, + const bool pivot) { + at::NoTF32Guard disable_tf32; - auto m = LU.size(-2); - auto n = LU.size(-1); + auto m = dA.size(-2); + auto n = dA.size(-1); auto k = std::min(m, n); - auto PdA = P.mT().matmul(dA); + auto PdA = pivot ? P.mT().matmul(dA) : dA; // similar to the backward implementation, we also consider block structures such as: // for a matrix A of size m x n we decompose it as @@ -5096,40 +5396,79 @@ Tensor lu_factor_ex_jvp( auto L1 = L.narrow(-2, 0, k).narrow(-1, 0, k); auto U1 = U.narrow(-2, 0, k).narrow(-1, 0, k); - // dK = L1^{-1} PdA1 + // We form using two triangular_solve the matrix, the second one in place + // dK = L1^{-1} PdA1 U2^{-1} auto dK = at::linalg_solve_triangular(L1, PdA1, /*upper=*/false, /*left=*/true, /*unitriangular*/true); - // dK <- dK U1^{-1} + + // TODO We should be able to do this in-place. At the moment it raises: + // RuntimeError: linalg_solve_triangular(): functions with out=... + // arguments don't support automatic differentiation, but one of the arguments requires grad. 
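The square-case computation in linalg_lu_jvp above can be summarized as follows (a restatement of the derivation in the long comment earlier in this file; the wide and tall cases add the dU2/dL2 blocks handled just below):

```latex
% Square case of the LU JVP: A = P L U with unit-diagonal L
dA = P\,(dL\,U + L\,dU)
\;\Longrightarrow\;
K := L^{-1} P^{T} dA\, U^{-1} = L^{-1} dL + dU\, U^{-1},
% L^{-1} dL is strictly lower triangular (dL has zero diagonal) and dU U^{-1}
% is upper triangular, so the two summands are recovered by masking:
dL = L \cdot \operatorname{tril}_{-1}(K), \qquad dU = \operatorname{triu}(K)\cdot U .
```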
+ + // at::linalg_solve_triangular_out(dK, U1, dK, /*upper=*/true, /*left=*/false); dK = at::linalg_solve_triangular(U1, dK, /*upper=*/true, /*left=*/false); auto dL1 = L1.matmul(dK.tril(-1)); auto dU1 = dK.triu().matmul(U1); - // since LU = L + U - I, we have that dLU = dL + dU - // if LU is of size m x n, we always have - // dLU1 = dL1 + dU1, where the block indexing follows the rules - // outlined above. if (m == n) { - return dL1 + dU1; + return std::make_tuple(std::move(dL1), std::move(dU1)); + } else if (m < n) { + // we only need to update dU2 defined as + // dU2 := L1^{-1} PdA2 - dK.tril(-1) U2) + const auto PdA2 = PdA.narrow(-1, k, n - k); + const auto U2 = U.narrow(-1, k, n - k); + auto dU2 = at::linalg_solve_triangular(L1, PdA2, /*upper=*/false, /*left=*/true, /*unitriangular*/true) - dK.tril(-1).matmul(U2); + return std::make_tuple(std::move(dL1), at::cat({dU1, dU2}, /*dim=*/-1)); + } else { + // we only need to update dL2 defined as + // dL2 := PdA2 U^{-1} - L2 dK.triu() + const auto PdA2 = PdA.narrow(-2, k, m - k); + const auto L2 = L.narrow(-2, k, m - k); + auto dL2 = at::linalg_solve_triangular(U1, PdA2, /*upper=*/true, /*left=*/false) - L2.matmul(dK.triu()); + return std::make_tuple(at::cat({dL1, dL2}, /*dim=*/-2), std::move(dU1)); } - else { - auto dLU1 = dL1 + dU1; - - if (m < n) { - // we only need to update dLU2 defined as - // dLU2 := L1^{-1} PdA2 - dK.tril(-1) U2 - auto PdA2 = PdA.narrow(-1, k, n - k); - auto U2 = U.narrow(-1, k, n - k); - auto dLU2 = at::linalg_solve_triangular(L1, PdA2, /*upper=*/false, /*left=*/true, /*unitriangular*/true) - dK.tril(-1).matmul(U2); - return at::cat({dLU1, dLU2}, /*dim=*/-1); - } - else { - // we only need to update dLU2 defined as - // dLU2 := PdA2 U1^{-1} - L2 dK.triu() - auto PdA2 = PdA.narrow(-2, k, m - k); - auto L2 = L.narrow(-2, k, m - k); - auto dLU2 = at::linalg_solve_triangular(U1, PdA2, /*upper=*/true, /*left=*/false) - L2.matmul(dK.triu()); - return at::cat({dLU1, dLU2}, /*dim=*/-2); - } +} + +Tensor lu_factor_ex_jvp( + const Tensor& dA, + const Tensor& LU, + const Tensor& pivs, + const bool pivot +) { + Tensor dL, dU; + { + Tensor P, L, U; + std::tie(P, L, U) = at::lu_unpack(LU, pivs, /*unpack_data=*/true, /*unpack_pivots=*/pivot); + std::tie(dL, dU) = linalg_lu_jvp(dA, P, L, U, pivot); + } + + auto m = dA.size(-2); + auto n = dA.size(-1); + if (m >= n) { + dL.narrow(-2, 0, n).add_(dU); + return dL; + } else { + dU.narrow(-1, 0, m).add_(dL); + return dU; + } +} + +Tensor logsumexp_jvp(const Tensor& self_p, const Tensor& self_t, IntArrayRef dim, bool keepdim) { + // NB: for simplicitly, we recompute some values that can be reused from forward + auto self_p_exp = (self_p - at::amax(self_p, dim, true)).exp(); // Use the exp-normalize trick + auto sumexp_p = self_p_exp.sum(dim, keepdim); + + // NB: it's OK for logsumexp_jvp to be reused for formulas like softmax/log_softmax + // that only have one differentiable input, because that means self_t are never zerotensors + TORCH_INTERNAL_ASSERT(!self_t._is_zerotensor()) + if (areAnyTensorSubclassLike({self_p, self_t})) { + auto result = (self_p_exp * self_t).sum(dim, keepdim); + result /= sumexp_p; + return result; + } else { + self_p_exp *= self_t; + auto sumexp_t = self_p_exp.sum(dim, keepdim); + return sumexp_t /= sumexp_p; } } @@ -5157,41 +5496,157 @@ std::tuple _cudnn_convolution_backward( return result; } -Tensor scatter_reduce_backward(const Tensor & grad, - const Tensor& input, - int dim, - const Tensor & index, - c10::string_view reduce, - const Tensor & result){ - 
Tensor grad_input; - +std::tuple scatter_reduce_backward( + const Tensor& grad, + const Tensor& self, + int dim, + const Tensor& index, + const Tensor& src, + c10::string_view reduce, + bool include_self, + const Tensor& result) { + Tensor grad_self, grad_src; + + // FIXME: complex gradients not handled correctly + // For now this is ok as scatter_reduce isn't added to the whitelist + // in tools/autograd/gen_variable_type.py - // TODO: gather doesn't support broadcasting of input and index - // currently this works because scatter_reduce doesn't support broadcasting yet but - // this needs to be fixed when scatter_reduce is upgraded to support broadcasting - // by broadcasting index here too. + if (!grad.defined()) { + return std::make_tuple(grad_self, grad_src); + } if (reduce == "sum") { - grad_input = grad.gather(dim, index); + grad_self = grad; + grad_src = grad.gather(dim, index); } else if (reduce == "prod") { - grad_input = (grad * result).gather(dim, index) / input; - // handle nans in above computation when input = 0, we know result = 0 (0 / 0 -> nan) - // so just replace with 0 - grad_input.masked_fill_(input == 0, 0); + // Explicitly compute exclusive prod for elements in self/src that are 0 + Tensor masked_self = self.masked_fill(self == 0, 1); + Tensor masked_self_result = masked_self.scatter_reduce(dim, index, src, reduce, include_self); + grad_self = grad * masked_self_result / masked_self; + Tensor src_zero = src == 0; + Tensor src_num_zeros = zeros_like(self).scatter_add(dim, index, src_zero.to(self.dtype())).gather(dim, index); + Tensor src_single_zero = bitwise_and(src_zero, src_num_zeros == 1); + // For src positions with src_single_zero, grad * result.gather(dim,index) / src.masked_fill(src_zero, 1) + // would incorrectly propagate zeros as the gradient + Tensor masked_src = src.masked_fill(src_single_zero, 1); + Tensor masked_src_result = self.scatter_reduce(dim, index, masked_src, reduce, include_self); + Tensor grad_src1 = where(src_single_zero, + (grad * masked_src_result).gather(dim, index), + (grad * result).gather(dim, index) / src.masked_fill(src_zero, 1)); + if ((src_num_zeros > 1).any().item()) { + auto node = std::make_shared( + "scatter_reduce(): Double backward is unsupported for src when >1 zeros in src are scattered to the same position in self", + /* num inputs */ 1); + auto result = node->apply({ grad_src1 }); + grad_src = result[0]; + } else { + grad_src = grad_src1; + } } else if (reduce == "mean") { - Tensor N = zeros_like(grad); - N.scatter_add_(dim, index, ones_like(input)); - Tensor N_input = N.gather(dim, index); - grad_input = grad.gather(dim, index) / N_input; - grad_input.masked_fill_(N_input == 0, 0); + Tensor N = include_self ? 
ones_like(grad) : zeros_like(grad); + N = N.scatter_add(dim, index, ones_like(src)); + N.masked_fill_(N == 0, 1); + grad_self = grad / N; + Tensor N_src = N.gather(dim, index); + grad_src = grad.gather(dim, index) / N_src; } else if (reduce == "amax" || reduce == "amin") { + // Evenly distribute gradient when there are multiple max/mins Tensor value = result.gather(dim, index); - grad_input = (input == value) * grad.gather(dim, index); + Tensor self_is_result = (self == result).to(self.scalar_type()); + Tensor src_is_result = (src == value).to(self.scalar_type()); + Tensor N_to_distribute = self_is_result.scatter_add(dim, index, src_is_result); + Tensor grad_distributed = grad / N_to_distribute; + grad_self = (self == result) * grad_distributed; + grad_src = (src == value) * grad_distributed.gather(dim, index); } else { AT_ERROR("Expected 'reduce' to be one of 'sum', 'prod', 'mean', 'amax', 'amin' but got ", reduce, "."); } - return grad_input; + if (!include_self) { + grad_self = grad_self.scatter(dim, index, 0); + } + + return std::make_tuple(grad_self, grad_src); + +} + +Tensor _to_copy_backward(const Tensor &grad_, const c10::TensorOptions &self_options) { + // Handle R->C copies without raising a warning + const auto self_type = self_options.dtype().toScalarType(); + auto grad = c10::MaybeOwned::borrowed(grad_); + if (!c10::isComplexType(self_type) && grad->is_complex()) { + grad = c10::MaybeOwned::owned(at::real(grad_)); + } + + return grad->to(self_options, /*non_blocking=*/false, /*copy=*/false); +} + +std::tuple index_reduce_backward( + const Tensor& grad, + const Tensor& self, + int dim, + const Tensor& index, + const Tensor& source, + c10::string_view reduce, + bool include_self, + const Tensor& result) { + Tensor grad_self, grad_src; + + // FIXME: index_add's backward formula has a special case for source.dim == 0 + // but this case seems to throw the error "IndexError: dimension specified as 0 but tensor has no dimensions" + // look into whether this case is reachable and should be covered here + + if (!grad.defined()) { + return std::make_tuple(grad_self, grad_src); + } + + if (reduce == "prod") { + Tensor masked_self = self.masked_fill(self == 0, 1); + Tensor masked_self_result = masked_self.index_reduce(dim, index, source, reduce, include_self); + grad_self = grad * masked_self_result / masked_self; + Tensor src_zero = source == 0; + Tensor src_num_zeros = zeros_like(self).index_add(dim, index, src_zero.to(self.dtype())).index_select(dim, index); + Tensor src_single_zero = bitwise_and(src_zero, src_num_zeros == 1); + // For src positions with src_single_zero, (grad * result).index_select(dim,index) / source.masked_fill(src_zero, 1) + // would incorrectly propagate zeros as the gradient + Tensor masked_src = source.masked_fill(src_single_zero, 1); + Tensor masked_src_result = self.index_reduce(dim, index, masked_src, reduce, include_self); + Tensor grad_src1 = where(src_single_zero, + (grad * masked_src_result).index_select(dim, index), + (grad * result).index_select(dim, index) / source.masked_fill(src_zero, 1)); + if ((src_num_zeros > 1).any().item()) { + auto node = std::make_shared( + "index_reduce(): Double backward is unsupported for source when >1 zeros in source are scattered to the same position in self", + /* num inputs */ 1); + auto result = node->apply({ grad_src1 }); + grad_src = result[0]; + } else { + grad_src = grad_src1; + } + } else if (reduce == "mean") { + Tensor N = include_self ? 
ones_like(grad) : zeros_like(grad); + N = N.index_add(dim, index, ones_like(source)); + N.masked_fill_(N == 0, 1); + grad_self = grad / N; + Tensor N_src = N.index_select(dim, index); + grad_src = grad.index_select(dim, index) / N_src; + } else if (reduce == "amax" || reduce == "amin") { + Tensor value = result.index_select(dim, index); + Tensor self_is_result = (self == result).to(self.scalar_type()); + Tensor source_is_result = (source == value).to(self.scalar_type()); + Tensor N_to_distribute = self_is_result.index_add(dim, index, source_is_result); + Tensor grad_distributed = grad / N_to_distribute; + grad_self = self_is_result * grad_distributed; + grad_src = source_is_result * grad_distributed.index_select(dim, index); + } else { + AT_ERROR("Expected 'reduce' to be one of 'prod', 'amax', 'amin' or 'mean' but got ", reduce, "."); + } + + if (!include_self) { + grad_self = grad_self.index_fill(dim, index, 0); + } + + return std::make_tuple(grad_self, grad_src); } diff --git a/torch/csrc/autograd/FunctionsManual.h b/torch/csrc/autograd/FunctionsManual.h index 739b44b4d62f..3f8f162ad5b1 100644 --- a/torch/csrc/autograd/FunctionsManual.h +++ b/torch/csrc/autograd/FunctionsManual.h @@ -49,7 +49,16 @@ Tensor restore_reduced_dims(const Tensor &output, IntArrayRef dims, bool keepdim Tensor scale_grad_by_count(const Tensor &grad, const Tensor &mask, IntArrayRef dims); at::Tensor norm_backward(const at::Tensor & grad, const at::Tensor & self, const optional & p_, const at::Tensor & norm); at::Tensor norm_backward(at::Tensor grad, const at::Tensor & self, const optional & p_, at::Tensor norm, at::IntArrayRef dim, bool keepdim); -at::Tensor linalg_vector_norm_backward(at::Tensor grad, const at::Tensor & self, const at::Scalar & ord, at::Tensor norm, const c10::optional & opt_dim, bool keepdim); +Tensor norm_jvp( + const Tensor& self_p, const Tensor& self_t, + const optional & p_, + Tensor norm, + IntArrayRef dim, + bool keepdim +); +Tensor norm_jvp(const Tensor& grad, const Tensor& self, const optional & p_, Tensor norm); +Tensor linalg_vector_norm_jvp(const Tensor& self_p, const Tensor& self_t, const Scalar& scalar_ord, Tensor norm, const at::OptionalIntArrayRef& opt_dim, bool keepdim); +at::Tensor linalg_vector_norm_backward(at::Tensor grad, const at::Tensor & self, const at::Scalar & ord, at::Tensor norm, const at::OptionalIntArrayRef & opt_dim, bool keepdim); at::Tensor pow_backward(at::Tensor grad, const at::Tensor & self, const at::Scalar & exponent_); at::Tensor pow_backward_self(at::Tensor grad, const at::Tensor & self, const at::Tensor & exponent); at::Tensor pow_backward_exponent(at::Tensor grad, const at::Tensor& self, const at::Tensor& exponent, at::Tensor result); @@ -77,6 +86,7 @@ at::Tensor solve_backward_self(const at::Tensor & grad, const at::Tensor & self, at::Tensor solve_backward_A(const at::Tensor & grad, const at::Tensor & self, const at::Tensor & A, const at::Tensor & solution); at::Tensor cumsum_backward(const at::Tensor & grad, int64_t dim); at::Tensor logsumexp_backward(at::Tensor grad, const at::Tensor & self, at::Tensor result, at::IntArrayRef dim, bool keepdim); +at::Tensor logsumexp_jvp(const at::Tensor& self_p, const at::Tensor& self_t, IntArrayRef dim, bool keepdim); at::Tensor logcumsumexp_backward(at::Tensor grad, const at::Tensor & self, at::Tensor result, int64_t dim); at::Tensor unbind_backward(const variable_list& grads, int64_t dim); at::Tensor unsqueeze_to(const at::Tensor & self, at::IntArrayRef sizes); @@ -85,10 +95,15 @@ std::vector 
cat_tensors_backward(const at::Tensor & grad, const std: at::Tensor clamp_backward(const at::Tensor & grad, const at::Tensor &self, const optional& min, const optional& max); at::Tensor clamp_backward(const at::Tensor & grad, const at::Tensor &self, const at::Tensor& min, const at::Tensor& max); std::tuple clamp_backward_min_max(const at::Tensor& grad, const at::Tensor& self, const at::Tensor& min, const at::Tensor& max, const std::array&); +at::Tensor clamp_jvp( + const Tensor& self_p, const Tensor& self_t, + const Tensor& min_p, const Tensor& min_t, + const Tensor& max_p, const Tensor& max_t +); at::IntArrayRef strides_or_error(const Tensor & input, c10::string_view const & input_name); -at::Tensor mm_mat1_backward(const Tensor & grad, const Tensor & mat2, at::IntArrayRef mat1_sizes, at::IntArrayRef mat1_strides, const Scalar & alpha); -at::Tensor mm_mat2_backward(const at::Tensor & grad, const at::Tensor & mat1, at::IntArrayRef sizes, at::IntArrayRef strides, const at::Scalar & alpha); -at::Tensor _sparse_addmm_sparse_backward(const at::Tensor& grad, const at::Tensor& sparse_, const at::Tensor& dense, const at::Scalar& alpha); +at::Tensor mm_mat1_backward(const Tensor & grad, const Tensor & mat2, at::IntArrayRef mat1_sizes, at::IntArrayRef mat1_strides, c10::Layout mat1_layout, const Scalar & alpha); +at::Tensor mm_mat2_backward(const at::Tensor & grad, const at::Tensor & mat1, at::IntArrayRef sizes, at::IntArrayRef strides, c10::Layout layout, const at::Scalar & alpha); +at::Tensor mm_mat1_sparse_backward(const at::Tensor& grad, const at::Tensor& mat1, const at::Tensor& mat2, const at::Scalar& alpha); at::Tensor sparse_sparse_matmul_backward(const at::Tensor& grad, const at::Tensor& mat1, const at::Tensor& mat2,int64_t grad_order); at::Tensor renorm_backward(const at::Tensor & grad, const at::Tensor & self, const at::Scalar& p, int64_t dim, const at::Scalar& maxnorm); at::Tensor repeat_backward(at::Tensor grad, at::IntArrayRef repeats, at::IntArrayRef input_shape); @@ -97,16 +112,17 @@ at::Tensor infinitely_differentiable_native_dropout_backward(const at::Tensor& g at::Tensor native_dropout_double_backward(const at::Tensor& ggI, const at::Tensor& grad, const at::Tensor& mask, double scale); at::Tensor evenly_distribute_backward(at::Tensor grad, const at::Tensor & input, const at::Tensor & value); at::Tensor sgn_backward(Tensor result, Tensor grad, Tensor self); -at::Tensor var_backward(at::Tensor grad, const at::Tensor& self, c10::optional dim, c10::optional correction, bool keepdim); -at::Tensor var_jvp(const at::Tensor& self_t, const at::Tensor& self_p, const at::Tensor& result, c10::optional dim_opt, c10::optional correction_opt, bool keepdim); -at::Tensor std_backward(const at::Tensor& result, const at::Tensor& grad, const at::Tensor& self, c10::optional dim, c10::optional correction, bool keepdim); +at::Tensor var_backward(at::Tensor grad, const at::Tensor& self, at::OptionalIntArrayRef dim, c10::optional correction, bool keepdim); +at::Tensor var_jvp(const at::Tensor& self_t, const at::Tensor& self_p, const at::Tensor& result, at::OptionalIntArrayRef dim_opt, c10::optional correction_opt, bool keepdim); +at::Tensor std_backward(const at::Tensor& result, const at::Tensor& grad, const at::Tensor& self, at::OptionalIntArrayRef dim, c10::optional correction, bool keepdim); at::Tensor mean_backward(at::Tensor grad, const at::IntArrayRef sizes, at::IntArrayRef dim, bool keepdim); at::Tensor mean_backward(at::Tensor grad, const at::IntArrayRef sizes, int64_t numel); -at::Tensor 
var_std_mean_backward(const variable_list& grads, const at::Tensor& self, const at::Tensor& r1, const at::Tensor& r2, c10::optional dim, c10::optional correction, bool keepdim, bool is_std); +at::Tensor var_std_mean_backward(const variable_list& grads, const at::Tensor& self, const at::Tensor& r1, const at::Tensor& r2, at::OptionalIntArrayRef dim, c10::optional correction, bool keepdim, bool is_std); at::Tensor masked_scatter_backward(const at::Tensor & grad, const at::Tensor & mask, at::IntArrayRef sizes); -at::Tensor cholesky_backward(at::Tensor grad, bool upper, at::Tensor L); +at::Tensor cholesky_backward(const at::Tensor& grad, bool upper, const at::Tensor& L); at::Tensor cholesky_jvp(const at::Tensor& input_tangent, const at::Tensor& L, bool upper); at::Tensor cholesky_inverse_backward(at::Tensor grad, at::Tensor L, bool upper, at::Tensor inverse); +at::Tensor cholesky_inverse_jvp(const at::Tensor& F, const at::Tensor& dF, const at::Tensor& X, bool upper); Tensor pinv_jvp( const Tensor& A, const Tensor& pinvA, @@ -133,6 +149,14 @@ Tensor binary_cross_entropy_target_backward( const Tensor& target, const c10::optional& weight, int64_t reduction); +Tensor binary_cross_entropy_double_backward_target( + const Tensor& grad, + const Tensor& grad_output, + const Tensor& self, + const Tensor& target, + const c10::optional& weight, + int64_t reduction +); at::Tensor binary_cross_entropy_with_logits_target_backward(const at::Tensor& grad_output, const at::Tensor& self, const at::Tensor& target, const c10::optional& weight, const c10::optional& pos_weight, int64_t reduction); at::Tensor binary_cross_entropy_with_logits_jvp(const Tensor& input_t, const Tensor& target_t, const Tensor& input_p, const Tensor& target_p, const c10::optional& weight_opt, const c10::optional& pos_weight_opt, int64_t reduction); at::Tensor log_sigmoid_double_backward(const at::Tensor & grad, const at::Tensor & input); @@ -226,23 +250,9 @@ std::tuple linalg_solve_triangular_backward( std::tuple _trilinear_backward(const Tensor& grad_out, const Tensor& i1, const Tensor& i2, const Tensor& i3, IntArrayRef expand1, IntArrayRef expand2, IntArrayRef expand3, IntArrayRef sumdim, std::array grad_mask); -std::tuple linalg_qr_jvp( - const Tensor& dA, - const Tensor& Q, - const Tensor& R -); -Tensor linalg_qr_jvp_Q( - const Tensor& dA, - const Tensor& Q, - const Tensor& R -); -Tensor linalg_qr_jvp_R( - const Tensor& dA, - const Tensor& Q, - const Tensor& R -); -Tensor linalg_qr_backward(const std::vector &grads, const Tensor& self, - c10::string_view mode, const Tensor& Q, const Tensor& R); +std::tuple linalg_qr_jvp(const Tensor& dA, const Tensor& Q, const Tensor& R, + const c10::string_view mode); +Tensor linalg_qr_backward(const Tensor& gQ, const Tensor& gR, const Tensor& Q, const Tensor& R, const c10::string_view mode); Tensor eig_backward(const std::vector &grads, const Tensor& self, bool eigenvectors, const Tensor& lambda, const Tensor& v); Tensor linalg_matrix_exp_differential(const Tensor& self, const Tensor& grad, bool adjoint); @@ -297,12 +307,18 @@ infinitely_differentiable_native_group_norm_backward( int64_t group, double eps, std::array grad_input_mask); +Tensor prelu_jvp(const Tensor& x, const Tensor& dx, const Tensor& w, const Tensor& dw); std::tuple prelu_double_backward( const Tensor & grad_grad_input, const Tensor & grad_grad_weight, const Tensor & grad_out, const Tensor & input_, const Tensor & weight_); +Tensor gelu_double_backward( + const Tensor & ggI, + const Tensor & gO, + const Tensor & input, + 
c10::string_view approximate); Tensor as_strided_backward(Tensor grad, TensorGeometry input_geometry, IntArrayRef sizes, IntArrayRef strides, optional storage_offset_); std::tuple atan2_backward(const Tensor& grad, const Tensor& self, const Tensor& other, std::array output_mask); std::tuple layer_norm_double_backward( @@ -351,9 +367,10 @@ Tensor lu_solve_jvp( const Tensor& LU_pivots ); Tensor lu_unpack_backward( - const variable_list& grads, - const Tensor& LU_data, - bool unpack_data + const Tensor& L_grad, + const Tensor& U_grad, + const int64_t m, + const int64_t n ); Tensor _det_lu_based_helper_backward( @@ -373,23 +390,32 @@ std::tuple linalg_lstsq_backward( const std::array& grad_input_mask ); -Tensor lu_backward_base( - const variable_list& grads, - const Tensor& self, +Tensor linalg_lu_backward( + const Tensor& L_grad, + const Tensor& U_grad, const Tensor& P, const Tensor& L, - const Tensor& U -); + const Tensor& U, + const bool pivot); + +std::tuple linalg_lu_jvp( + const Tensor& dA, + const Tensor& P, + const Tensor& L, + const Tensor& U, + const bool pivot); + Tensor lu_factor_ex_backward( const Tensor& grad, - const Tensor& self, const Tensor& LU, - const Tensor& pivs + const Tensor& pivs, + const bool pivot ); Tensor lu_factor_ex_jvp( const Tensor& dX, const Tensor& LU, - const Tensor& pivs + const Tensor& pivs, + const bool pivot ); Tensor batch_norm_jvp( @@ -460,15 +486,29 @@ std::tuple _cudnn_convolution_backward( at::IntArrayRef output_padding, at::IntArrayRef stride, at::IntArrayRef dilation, bool transposed, int64_t groups, ::std::array output_mask); -Tensor scatter_reduce_backward( +std::tuple scatter_reduce_backward( const Tensor& grad, - const Tensor& input, + const Tensor& self, int dim, const Tensor& index, + const Tensor& src, c10::string_view reduce, + bool include_self, const Tensor& result ); +Tensor _to_copy_backward(const Tensor &grad, const c10::TensorOptions &self_options); + +std::tuple index_reduce_backward( + const Tensor& grad, + const Tensor& self, + int dim, + const Tensor& index, + const Tensor& source, + c10::string_view reduce, + bool include_self, + const Tensor& result +); } // namespace details } // namespace generated diff --git a/torch/csrc/autograd/TraceTypeManual.cpp b/torch/csrc/autograd/TraceTypeManual.cpp index 031b50215d8c..a96fa42abd17 100644 --- a/torch/csrc/autograd/TraceTypeManual.cpp +++ b/torch/csrc/autograd/TraceTypeManual.cpp @@ -283,7 +283,9 @@ void general_trace_function( AT_ASSERT(iter->isObject()); tracer::addOutput(node, iter->toObject()); } else { - throw std::runtime_error("unsupported output type: " + type->str()); + throw std::runtime_error( + "unsupported output type: " + type->str() + + ", from operator: " + toString(op.operator_name())); } } } diff --git a/torch/csrc/autograd/autograd_meta.cpp b/torch/csrc/autograd/autograd_meta.cpp index b3bb488c9641..bf8cccf62ea7 100644 --- a/torch/csrc/autograd/autograd_meta.cpp +++ b/torch/csrc/autograd/autograd_meta.cpp @@ -90,7 +90,7 @@ namespace { if (base.sizes()[i] != other.sizes()[i]) { return false; } - if (base.strides()[i] != other.strides()[i] && base.sizes()[i] != 1) { + if (base.strides()[i] != other.strides()[i] && base.sizes()[i] != 1 && base.sizes()[i] != 0) { return false; } } diff --git a/torch/csrc/autograd/custom_function.cpp b/torch/csrc/autograd/custom_function.cpp index 4acebe1266e7..553e8aa67470 100644 --- a/torch/csrc/autograd/custom_function.cpp +++ b/torch/csrc/autograd/custom_function.cpp @@ -112,7 +112,7 @@ void _process_forward_mode_AD(const 
variable_list &inputs, const auto num_forward_grads = forward_grads.size(); // contrary to backward mode, we don't allow returning too many gradients TORCH_CHECK(num_forward_grads == num_outputs, "Function's jvp returned " - "an invalid number of of forward gradients (expected ", num_outputs, + "an invalid number of forward gradients (expected ", num_outputs, " but got ", num_forward_grads, ")"); for (const auto i : c10::irange(num_outputs)) { diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp index 22f138e2a14f..401f679d3d89 100644 --- a/torch/csrc/autograd/engine.cpp +++ b/torch/csrc/autograd/engine.cpp @@ -50,12 +50,20 @@ static void forked_autograd_child() { in_bad_autograd_fork = true; } // Should be called before unsafe for forks (thread pool) calls static void track_bad_autograd_forks() { -#if !defined(WIN32) && !defined(__XROS__) +#if !defined(WIN32) static std::once_flag flag; std::call_once( flag, [&] { pthread_atfork(nullptr, nullptr, forked_autograd_child); }); #endif } + +inline bool should_run_in_cpu_ready_queue(c10::DeviceType device) { + if (device == c10::kCPU || device == c10::kMeta || device == c10::kLazy) { + return true; + } else { + return false; + } +} } // Threads spawned by the engine are assigned a 'worker_device' specifying @@ -92,9 +100,10 @@ C10_DEFINE_TLS_static(std::shared_ptr, tls_current_graph_task); // Engine::init_local_ready_queue() call in each corresponding thread before execution. // // The CUDA, XLA threads are shared among all invocations of backwards via -// device_ready_queues_, while CPU threads are dedicated to processing CPU work for -// the backward they invoked. So any given graph task maintains its own cpu_ready_queue_ -// where you should send work for it to be done +// device_ready_queues_, while the caller thread is dedicated to processing work for +// devices returning true in should_run_in_cpu_ready_queue (most notably the CPU device). +// So any given graph task maintains its own cpu_ready_queue_ where you should send work +// for it to be done. // // For reentrant backward calls, if we spawn new thread from the current thread // because we reached the maximum depth, the new thread will just reuse the same @@ -380,6 +389,11 @@ auto Engine::thread_main(const std::shared_ptr& graph_task) -> void { // backwards, user thread), this function is expected to exit once that // graph_task complete. +#ifdef USE_ROCM + // Keep track of backward pass for rocblas. + at::ROCmBackwardPassGuard in_backward; +#endif + // local_ready_queue should already been initialized when we get into thread_main TORCH_INTERNAL_ASSERT(local_ready_queue != nullptr); while (graph_task == nullptr || !graph_task->future_result_->completed()) { @@ -424,7 +438,7 @@ auto Engine::thread_main(const std::shared_ptr& graph_task) -> void { c10::str( "autograd::engine::evaluate_function: ", task.fn_.get()->name()), - std::vector()); + c10::ArrayRef()); evaluate_function( local_graph_task, task.fn_.get(), @@ -706,7 +720,8 @@ void validate_outputs( // In future, there will be an oppportunity to support more combinations of layouts if they are composable // (example., operations like addition etc., are well defined between tensors of different layouts.), // as well as all parts of autograd like AccumulateGrad correctly handle this. 
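The at::ROCmBackwardPassGuard added to Engine::thread_main above is an RAII wrapper around a thread-local flag, so code much deeper in the call stack (the rocblas dispatch) can ask whether it is running inside a backward pass. A minimal, self-contained sketch of that pattern, using hypothetical names (BackwardPassGuard, is_backward_pass) that are not part of the real API:

#include <cassert>

namespace sketch {
// Thread-local flag set for the lifetime of the guard.
thread_local bool in_backward_pass = false;

struct BackwardPassGuard {
  bool prev_;
  BackwardPassGuard() : prev_(in_backward_pass) { in_backward_pass = true; }
  ~BackwardPassGuard() { in_backward_pass = prev_; }
};

bool is_backward_pass() { return in_backward_pass; }
}  // namespace sketch

int main() {
  assert(!sketch::is_backward_pass());
  {
    sketch::BackwardPassGuard guard;     // analogous to entering thread_main
    assert(sketch::is_backward_pass());  // visible to anything called from here
  }
  assert(!sketch::is_backward_pass());   // restored on scope exit
}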
- if (!grad.is_sparse()) { + // We allow grad to be Strided when metadata is SparseCsr + if (!grad.is_sparse() && !(grad.layout() == at::kStrided && metadata.layout() == at::kSparseCsr)) { std::stringstream ss; ss << "invalid gradient at index " << i << " - expected layout "; ss << metadata.layout() << " but got " << grad.layout(); @@ -1044,7 +1059,6 @@ auto Engine::execute(const edge_list& roots, } void Engine::initialize_device_threads_pool() { - track_bad_autograd_forks(); TORCH_CHECK(!in_bad_autograd_fork, "Unable to handle autograd's threading in combination with fork-based multiprocessing. " "See https://github.com/pytorch/pytorch/wiki/Autograd-and-Fork"); @@ -1167,23 +1181,14 @@ void Engine::init_local_ready_queue(std::shared_ptr ready_queue) { } } -size_t Engine::ready_queue_size(const std::shared_ptr& graph_task, at::Device device) { - if (device_ready_queues_.empty()) { - // The vector device_ready_queues_ is initialized in start_device_threads, but this method - // can be called before start_device_threads. Adding this check to avoid index - // out of bound error. - return 0; - } - return ready_queue(graph_task->cpu_ready_queue_, device)->size(); -} - // CPU ready queue is per GraphTask, but CUDA device ready queues are shared across all graph tasks auto Engine::ready_queue(std::shared_ptr cpu_ready_queue, at::Device device) -> std::shared_ptr{ - if (device.type() == at::kCPU || device.type() == at::DeviceType::Meta) { + if (should_run_in_cpu_ready_queue(device.type())) { // return the cpu ready queue passed in TORCH_INTERNAL_ASSERT(cpu_ready_queue); return cpu_ready_queue; } else { + TORCH_INTERNAL_ASSERT(0 <= device.index() && device.index() < static_cast(device_ready_queues_.size())); // See Note [Allocating GPUs to autograd threads] return device_ready_queues_.at(device.index()); } @@ -1195,8 +1200,7 @@ auto Engine::ready_queue_by_index(std::shared_ptr cpu_ready_queue, i TORCH_INTERNAL_ASSERT(cpu_ready_queue); return cpu_ready_queue; } else { - // Static cast is ok here as the number of device should never overflow an int. - TORCH_INTERNAL_ASSERT(0 <= device_index && device_index < static_cast(device_ready_queues_.size())); + TORCH_INTERNAL_ASSERT(0 <= device_index && device_index < static_cast(device_ready_queues_.size())); // See Note [Allocating GPUs to autograd threads] // NB: This function would become obsolete if we truly allocated a CPU thread // per device, rather than colocate. @@ -1205,15 +1209,29 @@ auto Engine::ready_queue_by_index(std::shared_ptr cpu_ready_queue, i } auto Engine::start_device_threads() -> void { + // First always initialize the thread pool for re-entrant threads + thread_pool_shared_ = std::make_shared(); + + // Second, create special threads for each non-CPU device // See Note [Allocating GPUs to autograd threads] c10::DeviceIndex num_devices = 0; for (const auto& impl_atomic : c10::impl::device_guard_impl_registry) { auto* impl = impl_atomic.load(); - if (impl) { + // Only record the number of devices for device that don't run on the + // cpu ready queue. 
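The routing rule introduced here can be summarized in a small standalone sketch: work for CPU-like devices (CPU, Meta, Lazy) stays on the per-GraphTask queue owned by the calling thread, while every other device index maps to a shared worker queue. The types below (ReadyQueue, route) are hypothetical stand-ins for illustration, not the engine's real classes:

#include <memory>
#include <vector>

enum class DeviceType { CPU, CUDA, Meta, Lazy };
struct ReadyQueue { /* work items would live here */ };

// Mirrors the should_run_in_cpu_ready_queue predicate in the diff.
bool runs_on_cpu_ready_queue(DeviceType t) {
  return t == DeviceType::CPU || t == DeviceType::Meta || t == DeviceType::Lazy;
}

std::shared_ptr<ReadyQueue> route(
    const std::shared_ptr<ReadyQueue>& cpu_ready_queue,             // owned by the graph task
    const std::vector<std::shared_ptr<ReadyQueue>>& device_queues,  // shared, one per device index
    DeviceType type,
    size_t index) {
  return runs_on_cpu_ready_queue(type) ? cpu_ready_queue
                                       : device_queues.at(index);   // throws if out of range
}

int main() {
  auto cpu_q = std::make_shared<ReadyQueue>();
  std::vector<std::shared_ptr<ReadyQueue>> device_qs = {std::make_shared<ReadyQueue>()};
  bool ok = route(cpu_q, device_qs, DeviceType::Lazy, 0) == cpu_q &&
            route(cpu_q, device_qs, DeviceType::CUDA, 0) == device_qs[0];
  return ok ? 0 : 1;
}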
+ if (impl && !should_run_in_cpu_ready_queue(impl->type())) { num_devices = std::max(num_devices, impl->deviceCount()); } } + // If there are no device except cpu, no need to create worker threads + if (num_devices == 0) { + return; + } + + // Since we're about to create threads, forking is not possible anymore + track_bad_autograd_forks(); + // allocate one thread for every GPU device (but colocate GPUs of different // types), and pre-allocate the device_ready_queues_ to ensure safe reading on it. device_ready_queues_ = std::vector>(num_devices); @@ -1221,8 +1239,6 @@ auto Engine::start_device_threads() -> void { queue = std::make_shared(); } - thread_pool_shared_ = std::make_shared(); - for (const auto i : c10::irange(num_devices)) { std::thread t(&Engine::thread_init, this, i, device_ready_queues_[i], true); t.detach(); @@ -1246,6 +1262,8 @@ void Engine::add_thread_pool_task(const std::weak_ptr& graph_task) { // Don't need to be holding the lock while actually creating the thread lck.unlock(); if (create_thread) { + // If we're creating a new thread, forking is not allowed anymore + track_bad_autograd_forks(); std::thread t(&Engine::reentrant_thread_init, this); t.detach(); } diff --git a/torch/csrc/autograd/engine.h b/torch/csrc/autograd/engine.h index ae0b32932184..6aae048432ce 100644 --- a/torch/csrc/autograd/engine.h +++ b/torch/csrc/autograd/engine.h @@ -340,8 +340,6 @@ struct TORCH_API Engine { bool is_checkpoint_valid(); - size_t ready_queue_size(const std::shared_ptr& graph_task, at::Device device); - // Should be called after fork to notify that worker threads are gone void release_workers(); diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h index cc5fa59e9ed6..dfeb1c973df5 100644 --- a/torch/csrc/autograd/function.h +++ b/torch/csrc/autograd/function.h @@ -151,24 +151,21 @@ struct TORCH_API Node : std::enable_shared_from_this { // probably operate with names. 
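track_bad_autograd_forks, now invoked only when worker threads are actually about to be created, boils down to registering a pthread_atfork child handler exactly once. A minimal POSIX-only sketch of that idiom; the flag and function names are illustrative rather than the engine's:

#include <mutex>
#include <pthread.h>

namespace sketch {
bool in_bad_fork = false;                 // set in the child process after fork()

void child_handler() { in_bad_fork = true; }

void track_bad_forks() {
  static std::once_flag flag;
  std::call_once(flag, [] {
    // Only the child callback is needed; prepare/parent handlers stay null.
    pthread_atfork(/*prepare=*/nullptr, /*parent=*/nullptr, /*child=*/child_handler);
  });
}
}  // namespace sketch

int main() {
  // Called right before spawning workers, mirroring start_device_threads()
  // and add_thread_pool_task() in the diff.
  sketch::track_bad_forks();
  return sketch::in_bad_fork ? 1 : 0;
}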
at::NoNamesGuard no_names_guard; - bool pre_sampled = false; - if (at::shouldRunRecordFunction(&pre_sampled)) { - // Using RecordFunction to trigger observers in the backward pass - at::RecordFunction guard(at::RecordScope::BACKWARD_FUNCTION, pre_sampled); - if (guard.isActive()) { - // Using sequence number and thread id to correlate with - // the forward pass function - guard.setForwardThreadId(thread_id_); - if (guard.needsInputs()) { - guard.before( - name(), - std::vector(inputs.begin(), inputs.end()), - sequence_nr()); - } else { - guard.before(name(), sequence_nr()); - } + auto step_callbacks = at::getStepCallbacks(at::RecordScope::BACKWARD_FUNCTION); + if (!step_callbacks.empty()) { + at::RecordFunction guard(std::move(step_callbacks)); + // Using sequence number and thread id to correlate with + // the forward pass function + guard.setForwardThreadId(thread_id_); + if (guard.needsInputs()) { + std::vector inputs_vec(inputs.begin(), inputs.end()); + guard.before( + name(), + c10::ArrayRef(inputs_vec.data(), inputs_vec.size()), + sequence_nr()); + } else { + guard.before(name(), sequence_nr()); } - // keeping stack guard object alive during the call return apply(std::move(inputs)); } else { return apply(std::move(inputs)); diff --git a/torch/csrc/autograd/functions/accumulate_grad.h b/torch/csrc/autograd/functions/accumulate_grad.h index 20074c57008d..4e269d9f4e55 100644 --- a/torch/csrc/autograd/functions/accumulate_grad.h +++ b/torch/csrc/autograd/functions/accumulate_grad.h @@ -18,7 +18,7 @@ namespace torch { namespace autograd { #define CHECK_RESULT(RESULT, VAR) \ - if (!(RESULT.is_sparse() || VAR.is_sparse())) { \ + if (!(RESULT.is_sparse() || VAR.is_sparse() || RESULT.is_sparse_csr() || VAR.is_sparse_csr())) { \ if (!utils::obeys_layout_contract(RESULT, VAR)) { \ TORCH_WARN_ONCE("grad and param do not obey the gradient layout contract. 
" \ "This is not an error, but may impair performance.\n" \ @@ -105,7 +105,8 @@ struct TORCH_API AccumulateGrad : public Node { const T& update_grad) { if (!variable_grad.defined()) { if (!GradMode::is_enabled() && - !new_grad.is_sparse() && + !new_grad.is_sparse() && !new_grad.is_sparse_csr() && + !(variable.is_sparse_csr() && new_grad.layout() == at::kStrided) && new_grad.use_count() <= num_expected_refs && (new_grad.is_mkldnn() || utils::obeys_layout_contract(new_grad, variable))) { // we aren't setting up for double-backward @@ -139,7 +140,7 @@ struct TORCH_API AccumulateGrad : public Node { new_grad.sizes(), new_grad.options())); } else { - if (new_grad.is_sparse()) { + if (new_grad.is_sparse() || new_grad.is_sparse_csr()) { update_grad(new_grad.clone()); } else { if (new_grad.is_mkldnn()) { diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index 890b7f715eae..2a5ec74f26e4 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -1,5 +1,6 @@ #include +#include #include #include #include @@ -8,8 +9,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -18,22 +19,18 @@ #include #include #include -#include #include +#include #include #include +#include #include #include -struct DisableTorchDispatch { - DisableTorchDispatch() : guard_(c10::DispatchKey::Python) { - } - c10::impl::ExcludeDispatchKeyGuard guard_; -}; - PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { using namespace torch::autograd::profiler; + using namespace torch::profiler::impl; auto tensor_module = THPObjectPtr(PyImport_ImportModule("torch._tensor")); if (!tensor_module) return nullptr; @@ -75,17 +72,67 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { .value("KINETO", ProfilerState::KINETO) .value("KINETO_GPU_FALLBACK", ProfilerState::KINETO_GPU_FALLBACK); + using torch::profiler::impl::ActiveProfilerType; + py::enum_(m, "ActiveProfilerType") + .value("NONE", ActiveProfilerType::NONE) + .value("LEGACY", ActiveProfilerType::LEGACY) + .value("KINETO", ActiveProfilerType::KINETO) + .value("NVTX", ActiveProfilerType::NVTX); + py::enum_(m, "ProfilerActivity") .value("CPU", ActivityType::CPU) .value("CUDA", ActivityType::CUDA); + py::class_(m, "_ExperimentalConfig") + .def(py::init< + std::vector /* profiler_metrics */, + bool /* profiler_measure_per_kernel */ + >(), + "An experimental config for Kineto features. 
Please note that" + "backward compatibility is not guaranteed.\n" + " profiler_metrics : a list of CUPTI profiler metrics used\n" + " to measure GPU performance events.\n" + " If this list contains values Kineto runs in CUPTI profiler mode\n" + " profiler_measure_per_kernel (bool) : whether to profile metrics per kernel\n" + " or for the entire measurement duration.", + py::arg("profiler_metrics") = std::vector(), + py::arg("profiler_measure_per_kernel") = false) + .def(py::pickle( + [](const ExperimentalConfig &p) { // __getstate__ + py::list py_metrics; + for (const auto& metric : p.profiler_metrics) { + py::bytes mbytes(metric); + py_metrics.append(mbytes); + } + /* Return a tuple that fully encodes the state of the config */ + return py::make_tuple( + py_metrics, p.profiler_measure_per_kernel); + }, + [](py::tuple t) { // __setstate__ + if (t.size() != 2) { + throw std::runtime_error("Expected 2 values in state"); + } + + py::list py_metrics = t[0].cast(); + std::vector metrics{py_metrics.size()}; + + for (const auto& py_metric : py_metrics) { + metrics.push_back(py::str(py_metric)); + } + + return ExperimentalConfig(std::move(metrics), t[1].cast()); + } + )); + + py::class_(m, "ProfilerConfig") .def(py::init()); py::class_(m, "ProfilerEvent") @@ -123,7 +170,7 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { .value("ORT", c10::DeviceType::ORT) .value("XLA", c10::DeviceType::XLA) .value("Lazy", c10::DeviceType::Lazy) - .value("MLC", c10::DeviceType::MLC) + .value("MPS", c10::DeviceType::MPS) .value("HPU", c10::DeviceType::HPU) .value("Meta", c10::DeviceType::Meta) .value("Vulkan", c10::DeviceType::Vulkan) @@ -238,6 +285,7 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { m.def("_disable_profiler", disableProfiler); m.def("_prepare_profiler", prepareProfiler); m.def("_add_metadata_json", addMetadataJson); // Only if `USE_KINETO` is set + m.def("_kineto_step", profilerStep); // Only if `USE_KINETO` is set m.def("kineto_available", []() { return torch::profiler::kKinetoAvailable; }); // NOTICE: These record functions are not torch operators and may not show up @@ -246,28 +294,33 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { // Creates a new profiling scope using RecordFunction and invokes its starting // callbacks. m.def("_record_function_with_args_enter", [](const std::string& name, py::args args) { - auto rec = std::make_unique(at::RecordScope::USER_SCOPE); + using torch::autograd::profiler::PythonRecordFunction; + auto python_rec = c10::make_intrusive(at::RecordScope::USER_SCOPE); + auto *rec = &python_rec->record; if (rec->isActive()) { if (rec->needsInputs()) { auto iv_inputs = std::vector(); for (const auto& arg : args) { iv_inputs.push_back(torch::jit::toTypeInferredIValue(arg)); } - rec->before(name, iv_inputs); + rec->before(name, c10::ArrayRef(iv_inputs.data(), iv_inputs.size())); } else { rec->before(name); } } - return at::cpp_custom_type_hack::create(std::move(rec), at::TensorOptions()); + return torch::jit::toPyObject(std::move(python_rec)); }); // Ends the profiling scope created with record_function_with_param_enter. - m.def("_record_function_with_args_exit", [](const at::Tensor& handle) { - // We don't actually need to do anything with handle just need to persist the - // lifetime until now. 
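The _ExperimentalConfig binding above relies on pybind11's py::pickle, which takes a __getstate__ lambda packing the C++ fields into a tuple and a __setstate__ lambda rebuilding the object from it. A standalone example of the same idiom on a hypothetical Config struct (not part of the PyTorch API):

#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

namespace py = pybind11;

struct Config {
  std::vector<std::string> metrics;
  bool per_kernel;
  Config(std::vector<std::string> m, bool p) : metrics(std::move(m)), per_kernel(p) {}
};

PYBIND11_MODULE(pickle_example, m) {
  py::class_<Config>(m, "Config")
      .def(py::init<std::vector<std::string>, bool>(),
           py::arg("metrics") = std::vector<std::string>(),
           py::arg("per_kernel") = false)
      .def(py::pickle(
          [](const Config& c) {          // __getstate__: encode everything in a tuple
            return py::make_tuple(c.metrics, c.per_kernel);
          },
          [](py::tuple t) {              // __setstate__: validate and rebuild
            if (t.size() != 2) {
              throw std::runtime_error("Expected 2 values in state");
            }
            return Config(t[0].cast<std::vector<std::string>>(),
                          t[1].cast<bool>());
          }));
}

From Python, pickle.dumps/loads (or copy.deepcopy) on a Config instance then round-trips through exactly these two lambdas.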
- auto& rec = at::cpp_custom_type_hack::cast(handle); - rec.end(); - }); + m.def("_record_function_with_args_exit", + [](const py::object &obj) { + using torch::autograd::profiler::PythonRecordFunction; + auto python_record = torch::jit::toCustomClass(obj); + + // We don't actually need to do anything with handle just need to persist the + // lifetime until now. + python_record->record.end(); + }); m.def("_supported_activities", []() { std::set activities {ActivityType::CPU}; @@ -287,6 +340,7 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { disableProfilerLegacy, py::arg("profiler_disable_options") = ProfilerDisableOptions()); m.def("_profiler_enabled", profilerEnabled); + m.def("_profiler_type", torch::profiler::impl::profilerType); m.def("_enable_record_function", [](bool enable) { at::enableRecordFunction(enable); }); @@ -318,7 +372,11 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { py::class_(_C_m, "_InferenceMode") .def(py::init()); - py::class_(_C_m, "_DisableTorchDispatch") + py::class_(_C_m, "_RestorePythonTLSSnapshot") + .def(py::init<>()); + + // TODO: line up this binding with DisableTorchFunction + py::class_(_C_m, "_DisableTorchDispatch") .def(py::init<>()); py::class_(m, "SavedTensor") @@ -544,20 +602,57 @@ static PyObject * python_exit_dual_level(PyObject* _unused, PyObject* args, PyOb END_HANDLE_TH_ERRORS } -static PyObject * enter_python_mode(PyObject* _unused, PyObject* arg) { +static PyObject* set_torch_dispatch_mode(PyObject* _unused, PyObject* arg) { HANDLE_TH_ERRORS - PythonMode::enter(arg); + if (arg == Py_None) { + at::impl::TorchDispatchModeTLS::set_state(nullptr); + } else { + Py_INCREF(arg); + at::impl::TorchDispatchModeTLS::set_state( + std::make_shared(arg, getPyInterpreter())); + } Py_RETURN_NONE; END_HANDLE_TH_ERRORS } -static PyObject * exit_python_mode(PyObject* _unused, PyObject* arg) { +static PyObject* get_torch_dispatch_mode(PyObject* _unused, PyObject* _unused2) { HANDLE_TH_ERRORS - PythonMode::exit(); + const auto& mode = at::impl::TorchDispatchModeTLS::get_state(); + if (!mode) { + Py_RETURN_NONE; + } else { + auto* r = mode->ptr(getPyInterpreter()); + Py_INCREF(r); + return r; + } + END_HANDLE_TH_ERRORS +} + +static PyObject * set_torch_function_mode(PyObject* _unused, PyObject* arg) { + HANDLE_TH_ERRORS + if (arg == Py_None) { + at::impl::PythonTorchFunctionTLS::set_mode(nullptr); + } else { + Py_INCREF(arg); + at::impl::PythonTorchFunctionTLS::set_mode(std::make_shared(arg, getPyInterpreter())); + } Py_RETURN_NONE; END_HANDLE_TH_ERRORS } +static PyObject * get_torch_function_mode(PyObject* _unused, PyObject* _unused2) { + HANDLE_TH_ERRORS + const auto& mode = at::impl::PythonTorchFunctionTLS::get_mode(); + if (!mode) { + Py_RETURN_NONE; + } else { + auto* r = mode->ptr(getPyInterpreter()); + Py_INCREF(r); + return r; + } + END_HANDLE_TH_ERRORS +} + // autograd methods on torch._C static PyMethodDef methods[] = { // NOLINT {"_set_grad_enabled", set_grad_enabled, METH_O, nullptr}, @@ -580,8 +675,10 @@ static PyMethodDef methods[] = { // NOLINT {"is_anomaly_enabled", is_anomaly_mode_enabled, METH_NOARGS, nullptr}, {"_enter_dual_level", python_enter_dual_level, METH_NOARGS, nullptr}, {"_exit_dual_level", castPyCFunctionWithKeywords(python_exit_dual_level), METH_VARARGS | METH_KEYWORDS, nullptr}, - {"_enter_python_mode", enter_python_mode, METH_O, nullptr}, - {"_exit_python_mode", exit_python_mode, METH_NOARGS, nullptr}, + {"_set_torch_dispatch_mode", set_torch_dispatch_mode, METH_O, nullptr}, 
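_set_torch_dispatch_mode and _get_torch_dispatch_mode follow a common CPython-extension pattern: a METH_O setter that clears the slot when given None and otherwise stores an owned reference, plus a METH_NOARGS getter that returns either None or a new reference. A stripped-down, hypothetical module showing just the reference-count discipline; the real implementation additionally wraps the object in a SafePyObject tagged with the interpreter and stores it in ATen's dispatch-mode TLS rather than a bare thread_local:

#include <Python.h>

static thread_local PyObject* tls_mode = nullptr;

static PyObject* set_mode(PyObject* /*self*/, PyObject* arg) {
  Py_XDECREF(tls_mode);                  // drop the previously stored mode, if any
  if (arg == Py_None) {
    tls_mode = nullptr;
  } else {
    Py_INCREF(arg);                      // keep the new mode alive while stored
    tls_mode = arg;
  }
  Py_RETURN_NONE;
}

static PyObject* get_mode(PyObject* /*self*/, PyObject* /*unused*/) {
  if (tls_mode == nullptr) {
    Py_RETURN_NONE;
  }
  Py_INCREF(tls_mode);                   // caller receives a new reference
  return tls_mode;
}

static PyMethodDef mode_methods[] = {
    {"set_mode", set_mode, METH_O, nullptr},
    {"get_mode", get_mode, METH_NOARGS, nullptr},
    {nullptr, nullptr, 0, nullptr}};

static struct PyModuleDef mode_module = {
    PyModuleDef_HEAD_INIT, "mode_example", nullptr, -1, mode_methods};

PyMODINIT_FUNC PyInit_mode_example(void) { return PyModule_Create(&mode_module); }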
+ {"_get_torch_dispatch_mode", get_torch_dispatch_mode, METH_NOARGS, nullptr}, + {"_set_torch_function_mode", set_torch_function_mode, METH_O, nullptr}, + {"_get_torch_function_mode", get_torch_function_mode, METH_NOARGS, nullptr}, {nullptr, nullptr, 0, nullptr} }; diff --git a/torch/csrc/autograd/input_buffer.cpp b/torch/csrc/autograd/input_buffer.cpp index 3663e1ee9915..71cc6e06d4d6 100644 --- a/torch/csrc/autograd/input_buffer.cpp +++ b/torch/csrc/autograd/input_buffer.cpp @@ -34,11 +34,14 @@ namespace { } else { switch (var.layout()) { case c10::kSparseCsr: + case c10::kSparseCsc: + case c10::kSparseBsr: + case c10::kSparseBsc: { auto* impl = at::sparse_csr::get_sparse_csr_impl(var); guard.recordDataPtrOnStream(impl->values().storage().data_ptr(), stream); - guard.recordDataPtrOnStream(impl->crow_indices().storage().data_ptr(), stream); - guard.recordDataPtrOnStream(impl->col_indices().storage().data_ptr(), stream); + guard.recordDataPtrOnStream(impl->compressed_indices().storage().data_ptr(), stream); + guard.recordDataPtrOnStream(impl->plain_indices().storage().data_ptr(), stream); break; } case c10::kSparse: diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index b8bba50c4063..db48e94e079f 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -4,11 +4,13 @@ #include #include #include +#include +#include +#include -#include -#include -#include #include +#include +#include #include #include @@ -116,56 +118,132 @@ void _push_reverse_order(PyTraceEvent* e, std::vector& names) { namespace { using torch::profiler::impl::ProfilerThreadLocalStateBase; using torch::profiler::impl::ActiveProfilerType; +using torch::profiler::impl::Result; +using torch::profiler::impl::kineto::annotation_t; +using torch::profiler::impl::shapesToStr; +using torch::profiler::impl::dtypesToStr; +using torch::profiler::impl::stacksToStr; + +struct MemoryEventData { + torch::profiler::impl::approx_time_t start_time; + void* ptr; + int64_t alloc_size; + int64_t total_allocated; + int64_t total_reserved; + uint64_t threadID; + torch::profiler::impl::kineto::DeviceAndResource kineto_info; + c10::DeviceType device_type; + c10::DeviceIndex device_index; +}; +static_assert(std::is_pod::value, "Non-POD member of MemoryEventData."); + +struct EventFieldsVisitor { + EventFieldsVisitor(const Result& result, KinetoEvent& kineto_event) + : result_{result}, kineto_event_{kineto_event} { + handleJIT(result_.get().jit_stack_, result_.get().jit_modules_); + c10::visit(*this, result.event_); + } + + void operator()(const torch::profiler::impl::OpEvent& op_event) { + kineto_event_.get() + .endThreadId(op_event.end_thread_id_) + .scope(op_event.record_function_scope_) + .setAsync(op_event.is_async_) + .debugHandle(op_event.debug_handle_); + + auto& shapes = result_.get().inputs_.shapes_; + if (!shapes.empty()) { + kineto_event_.get().shapes(shapes); + annotations_.emplace_back("Input Dims", shapesToStr(shapes)); + } + + auto& dtypes = result_.get().inputs_.dtypes_; + if (!dtypes.empty()) { + kineto_event_.get().dtypes(dtypes); + annotations_.emplace_back("Input type", dtypesToStr(dtypes)); + } + + if (!result_.get().extra_args_.empty()) { + kineto_event_.get().flops( + computeFlops(result_.get().name(), result_.get().extra_args_)); + } + kineto_event_.get().cuda_event_start_ = + result_.get().gpu_fallback_.cuda_event_start_; + kineto_event_.get().cuda_event_end_ = + result_.get().gpu_fallback_.cuda_event_end_; + + // add information about an 
associated forward op, if a sequence number + // is available (e.g. during training) + if (op_event.sequence_number_ >= 0) { + kineto_event_.get() + .sequenceNr(op_event.sequence_number_) + .fwdThreadId(op_event.forward_thread_id_); + annotations_.emplace_back( + "Fwd thread id", std::to_string(op_event.forward_thread_id_)); + annotations_.emplace_back( + "Sequence number", std::to_string(op_event.sequence_number_)); + } + } + + void operator()(const torch::profiler::impl::BackendEvent& backend_event) { + kineto_event_.get() + .endThreadId(result_.get().start_tid_) + .scope(backend_event.record_function_scope_) + .debugHandle(backend_event.debug_handle_) + .backend(backend_event.backend_); + + if (!backend_event.backend_.empty()) { + annotations_.emplace_back( + "Backend", "\"" + backend_event.backend_ + "\""); + } + } -// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) -struct OpEventData { - // POD members - int64_t start_us_; - int64_t end_us_; - uint64_t correlation_id_; - uint64_t start_thread_id_; - uint64_t end_thread_id_; - int64_t sequence_number_; - uint64_t forward_thread_id_; - uint8_t record_function_scope_; - bool is_async_; - int64_t debug_handle_; - torch::profiler::impl::kineto::DeviceAndResource kineto_info_; - - std::string name_; - - // report_input_shapes - std::vector> shapes_; - std::vector dtypes_; - - // with_stack - std::vector stack_; - - // with_modules - c10::optional> module_hierarchy_; - - // with_flops - std::unordered_map extra_args_; - - // reportBackendEventToActiveKinetoProfiler - c10::optional backend_; - - // ProfilerState::KINETO_GPU_FALLBACK - torch::profiler::impl::CUDAEventStub cuda_event_start_ = nullptr; - torch::profiler::impl::CUDAEventStub cuda_event_end_ = nullptr; + void handleJIT( + const std::vector& jit_stack, + const std::vector& jit_modules) { + if (!jit_stack.empty()) { + // NB: This is only for the JIT stack. The python stack (if applicable) + // is constructed later. + kineto_event_.get().stack(jit_stack); + annotations_.emplace_back( + "Call stack", torch::profiler::impl::stacksToStr(jit_stack, ";")); + } + + if (!jit_modules.empty()) { + kineto_event_.get().moduleHierarchy(jit_modules); + annotations_.emplace_back( + "Module Hierarchy", + torch::profiler::impl::stacksToStr(jit_modules, ".")); + } + } + + std::reference_wrapper result_; + std::reference_wrapper kineto_event_; + annotation_t annotations_; }; +auto getAnnotations(const MemoryEventData& event) { + torch::profiler::impl::kineto::annotation_t out{ + {"Device Type", std::to_string((int8_t)event.device_type)}, + {"Device Id", std::to_string(event.device_index)}, + {"Addr", std::to_string(reinterpret_cast(event.ptr))}, + {"Bytes", std::to_string(event.alloc_size)}}; + + if (event.total_allocated >= 0) { + out.emplace_back("Total Allocated", std::to_string(event.total_allocated)); + } + if (event.total_reserved >= 0) { + out.emplace_back("Total Reserved", std::to_string(event.total_reserved)); + } + return out; +} + // Assumption: Total threads number will not exceed 2^16-1, and total ops will // not exceed 2^48 -1. 
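EventFieldsVisitor dispatches on the event payload via c10::visit with one operator() per alternative (OpEvent versus BackendEvent). The same shape in standard C++17, as a self-contained example with simplified, hypothetical event structs:

#include <cstdint>
#include <iostream>
#include <string>
#include <variant>

struct OpEvent { int64_t sequence_number; };
struct BackendEvent { std::string backend; };
using Event = std::variant<OpEvent, BackendEvent>;

// One overload per alternative; std::visit picks the right one at runtime.
struct FieldsVisitor {
  void operator()(const OpEvent& e) const {
    std::cout << "op event, seq=" << e.sequence_number << "\n";
  }
  void operator()(const BackendEvent& e) const {
    std::cout << "backend event, backend=" << e.backend << "\n";
  }
};

int main() {
  Event a = OpEvent{42};
  Event b = BackendEvent{"XNNPACK"};
  std::visit(FieldsVisitor{}, a);
  std::visit(FieldsVisitor{}, b);
}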
static inline uint64_t getForwardThreadKey(uint64_t tid, uint64_t seqNr) { return (((tid) << 48) | ((seqNr) & (((uint64_t)1 << 48) - 1))); } -struct KinetoObserverContext : public at::ObserverContext { - explicit KinetoObserverContext(OpEventData* data) : data_(data) {} - OpEventData* data_; -}; - struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase { explicit KinetoThreadLocalState( const ProfilerConfig& config, @@ -173,6 +251,7 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase { : ProfilerThreadLocalStateBase(config), start_time_(getTimeUs()), activities_(std::move(activities)), + record_queue_(config), cpu_trace_(start_time_, "PyTorch Profiler") {} ~KinetoThreadLocalState() override = default; @@ -191,12 +270,6 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase { return config().with_stack && activities_.count(ActivityType::CPU); } - std::unique_ptr newOpEvent() { - std::lock_guard guard(state_mutex_); - op_events_.emplace_back(); - return std::make_unique(&op_events_.back()); - } - void reportMemoryUsage( void* ptr, int64_t alloc_size, @@ -205,38 +278,24 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase { c10::Device device) override { if (config_.profile_memory && config_.state != ProfilerState::Disabled) { std::lock_guard guard(state_mutex_); - auto start_time = getTimeUs(); - if (cpu_trace_) { - torch::profiler::impl::kineto::recordThreadInfo(); - cpu_trace_.addMemoryUsageActivity( - kMemoryEventName, - torch::profiler::impl::kineto::kineto_ids(), - start_time, - device, - ptr, - alloc_size, - total_allocated, - total_reserved); - } - - kineto_events_.emplace_back(); - auto& evt = kineto_events_.back(); - evt.name(kMemoryEventName) - .startUs(start_time) - .deviceIndex(device.index()) - .deviceType(device.type()) - .nBytes(alloc_size) - .startThreadId(at::RecordFunction::currentThreadId()); + memory_events_.emplace_back( + torch::profiler::impl::getApproximateTime(), + ptr, + alloc_size, + total_allocated, + total_reserved, + at::RecordFunction::currentThreadId(), + torch::profiler::impl::kineto::kineto_ids(), + device.type(), + device.index()); } } - const std::function&)>& - getEventPostProcessingCallback() const { + const post_process_t& getEventPostProcessingCallback() const { return event_post_process_cb_; } - void setEventPostProcessingCallback( - std::function&)>&& cb) { + void setEventPostProcessingCallback(post_process_t&& cb) { event_post_process_cb_ = std::move(cb); } @@ -244,82 +303,90 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase { auto end_time = getTimeUs(); materializeOpEvents(); - // Call events post processing callback before finalizing trace, if there is - // one. 
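getForwardThreadKey puts the start thread id in the top 16 bits and the sequence number in the low 48 bits of a single uint64_t, which is exactly where the 2^16-1 and 2^48-1 assumptions in the comment come from. A small self-contained check of that packing (the helper names are illustrative):

#include <cassert>
#include <cstdint>

constexpr uint64_t kSeqMask = (uint64_t(1) << 48) - 1;

constexpr uint64_t pack(uint64_t tid, uint64_t seq) { return (tid << 48) | (seq & kSeqMask); }
constexpr uint64_t unpack_tid(uint64_t key) { return key >> 48; }
constexpr uint64_t unpack_seq(uint64_t key) { return key & kSeqMask; }

int main() {
  constexpr uint64_t key = pack(/*tid=*/7, /*seq=*/123456789);
  static_assert(unpack_tid(key) == 7, "tid round-trips");
  static_assert(unpack_seq(key) == 123456789, "seq round-trips");
  // A tid at or above 2^16 silently collides with a smaller one, hence the assumption.
  assert(unpack_tid(pack(0x1FFFF, 0)) == 0xFFFF);
}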
- if (getEventPostProcessingCallback()) { - getEventPostProcessingCallback()(kineto_events_); - } - finalizeCPUTrace(cpu_trace_.get()); { std::lock_guard guard(state_mutex_); cpu_trace_.transferCpuTrace(end_time); } - auto trace = torch::profiler::impl::kineto::stopTrace(); - TORCH_CHECK(trace || !torch::profiler::kKinetoAvailable); - addTraceEvents(trace); - return trace; + if (config().state != ProfilerState::KINETO_ONDEMAND) { + auto trace = torch::profiler::impl::kineto::stopTrace(); + TORCH_CHECK(trace || !torch::profiler::kKinetoAvailable); + addTraceEvents(trace); + return trace; + } else { + return torch::profiler::impl::kineto::ActivityTraceWrapper(); + } } void materializeOpEvents() { std::lock_guard guard(state_mutex_); - for (const auto& e : op_events_) { - if (e.end_us_ < e.start_us_) { + auto converter = clock_converter_.makeConverter(); + + for (const auto& e : memory_events_) { + auto start_time_us = converter(e.start_time) / 1000; + cpu_trace_.addCPUActivity( + kMemoryEventName, + torch::profiler::impl::kineto::KinetoActivityType::CPU_INSTANT_EVENT, + e.kineto_info, + /*correlation_id=*/0, + start_time_us, + start_time_us, + getAnnotations(e)); + + kineto_events_.emplace_back(); + auto& evt = kineto_events_.back(); + evt.name(kMemoryEventName) + .startUs(start_time_us) + .deviceIndex(e.device_index) + .deviceType(e.device_type) + .nBytes(e.alloc_size) + .startThreadId(e.threadID); + } + memory_events_.clear(); + + for (auto& e : record_queue_.getRecords(converter)) { + // `take_data` handles time conversion. + int64_t start_us = e.start_time_us_; + int64_t end_us = e.end_time_us_; + + if (end_us < start_us) { // We initialize end_us_ to the smallest int64_t, so this means that // the op did not finish before we stopped profiling. continue; } - cpu_trace_.addCPUActivity( - e.name_, - e.kineto_info_, - e.correlation_id_, - e.start_us_, - e.end_us_); + // Call events post processing callback before finalizing trace, if there + // is one. + if (getEventPostProcessingCallback()) { + getEventPostProcessingCallback()( + c10::visit([](const auto& i) { return i.debug_handle_; }, e.event_), + e.jit_stack_, + e.jit_modules_); + } kineto_events_.emplace_back(); kineto_events_.back() - .name(e.name_) - .startUs(e.start_us_) - .durationUs(e.end_us_ - e.start_us_) - .correlationId(e.correlation_id_) + .name(e.name()) + .startUs(start_us) + .durationUs(end_us - start_us) + .correlationId(e.correlation_id()) .deviceType(c10::DeviceType::CPU) - .startThreadId(e.start_thread_id_) - .endThreadId(e.end_thread_id_) - .sequenceNr(e.sequence_number_) - .fwdThreadId(e.forward_thread_id_) - .scope(e.record_function_scope_) - .setAsync(e.is_async_) - .debugHandle(e.debug_handle_); - - if (!e.shapes_.empty()) { - kineto_events_.back().shapes(e.shapes_); - } - - if (!e.dtypes_.empty()) { - kineto_events_.back().dtypes(e.dtypes_); - } + .startThreadId(e.start_tid_); - if (!e.stack_.empty()) { - kineto_events_.back().stack(e.stack_); - } + // NB: also sets fields on `kineto_events_.back()`. 
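materializeOpEvents drops any record whose end time is still earlier than its start: end_time_ is initialized to the smallest int64_t, so such a record never received an exit callback before profiling stopped. A tiny sketch of that sentinel check, with a hypothetical RawEvent struct:

#include <cstdint>
#include <iostream>
#include <limits>
#include <vector>

struct RawEvent {
  int64_t start_us;
  int64_t end_us;
};

int main() {
  const int64_t kUnfinished = std::numeric_limits<int64_t>::min();  // the sentinel
  std::vector<RawEvent> events = {{10, 25}, {30, kUnfinished}};
  for (const auto& e : events) {
    if (e.end_us < e.start_us) {
      continue;  // profiler stopped before this op completed; skip it
    }
    std::cout << "materialize [" << e.start_us << ", " << e.end_us << "] us\n";
  }
}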
+ auto annotations = + EventFieldsVisitor(e, kineto_events_.back()).annotations_; - if (e.module_hierarchy_) { - kineto_events_.back().moduleHierarchy(*e.module_hierarchy_); - } - - if (!e.extra_args_.empty()) { - kineto_events_.back().flops( - computeFlops(std::string(e.name_), e.extra_args_)); - } - if (e.backend_) { - kineto_events_.back().backend(*e.backend_); - } - kineto_events_.back().cuda_event_start_ = e.cuda_event_start_; - kineto_events_.back().cuda_event_end_ = e.cuda_event_end_; + cpu_trace_.addCPUActivity( + e.name(), + e.kinetoType(), + e.kineto_info_, + e.correlation_id(), + start_us, + end_us, + annotations); } - op_events_.clear(); } void finalizeCPUTrace(std::unique_ptr& cpu_trace) { @@ -331,46 +398,29 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase { // startThreadId_seqNum to pointer of activity. // Low-16bits of startThreadId and low-48bits seqNum are concatenated into // one uint64_t variable as key. + + // From the time being, we need disable the forward/backward correlation feature to + // workaround the crash bug. + // TODO: by Mike Guo + // reenable the forward/backward correlation when kineto fix the following raw pointer + // GenericTraceActivity.flow.linkedActivity + + /* std::unordered_map tidSeq2activity; - uint64_t fwd_bwd_link_id = 1; for (const auto idx : c10::irange(cpu_trace->activities.size())) { auto& kineto_event = kineto_events_[idx]; auto& activity = cpu_trace->activities[idx]; - if (kineto_event.hasShapes()) { - activity.addMetadata("Input Dims", torch::profiler::impl::shapesToStr(kineto_event.shapes())); - } - if (kineto_event.hasStack()) { - // NB: This is only for the JIT stack. The python stack (if applicable) - // is constructed later. - activity.addMetadata( - "Call stack", torch::profiler::impl::stacksToStr(kineto_event.stack(), ";")); - } - if (kineto_event.hasModuleHierarchy()) { - activity.addMetadata( - "Module Hierarchy", - torch::profiler::impl::stacksToStr(kineto_event.moduleHierarchy(), ".")); - } - if (kineto_event.hasTypes()) { - activity.addMetadata("Input type", torch::profiler::impl::dtypesToStr(kineto_event.dtypes())); - } - if (!kineto_event.backend().empty()) { - activity.addMetadata("Backend", "\"" + kineto_event.backend() + "\""); - } - // add information about an associated forward op, if a sequence number // is available (e.g. during training) if (kineto_event.sequenceNr() >= 0) { - activity.addMetadata( - "Fwd thread id", std::to_string(kineto_event.fwdThreadId())); - activity.addMetadata( - "Sequence number", std::to_string(kineto_event.sequenceNr())); generateForwardBackwardLink( kineto_event, fwd_bwd_link_id, activity, tidSeq2activity); } } + */ addPythonEvents(cpu_trace); } @@ -406,7 +456,7 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase { py_event_indices_{ { nullptr, std::string("null") }}; - for (size_t i = 0; i < py_events.size(); i++) { + for (const auto i : c10::irange(py_events.size())) { py_event_indices_.insert({py_events[i].get(), std::to_string(i)}); } @@ -450,7 +500,7 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase { op_py_map.insert({t, py_stack.size() ? 
py_stack.back() : nullptr}); } - auto activities = std::move(cpu_trace->activities); + std::vector py_activities; auto py_events_it = py_events.begin(); auto py_device = libkineto::processId(); auto main_thread = libkineto::systemThreadId(); @@ -471,13 +521,13 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase { op.addMetadata("Python module id", module_id_map_.at(e->module_id_)); } - cpu_trace->activities.push_back(op); + py_activities.push_back(op); py_events_it++; }; - TORCH_INTERNAL_ASSERT(activities.size() == kineto_events_.size()); - for (const auto idx : c10::irange(activities.size())) { - auto& activity = activities[idx]; + TORCH_INTERNAL_ASSERT(cpu_trace->activities.size() == kineto_events_.size()); + for (const auto idx : c10::irange(cpu_trace->activities.size())) { + auto& activity = cpu_trace->activities[idx]; // Add any python events that occurred between this Kineto event and the // previous Kineto event. @@ -498,14 +548,14 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase { kineto_events_[idx].stack(py_names); activity.addMetadata("Call stack", torch::profiler::impl::stacksToStr(py_names, ";")); } - - cpu_trace->activities.push_back(activity); } // Add any Python events which finish after the last Kineto event. while (py_events_it != py_events.end()) { push_py_event(); } + + cpu_trace->activities.insert(cpu_trace->activities.end(), py_activities.begin(), py_activities.end()); } void generateForwardBackwardLink( @@ -521,12 +571,7 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase { auto iter = tidSeq2activity.find(key); if (iter != tidSeq2activity.end()) { libkineto::GenericTraceActivity* fwd = iter->second; -#ifdef USE_KINETO_UPDATED fwd->flow.start = true; -#else - activity.flow.linkedActivity = fwd; // Only destination side set this, - // to distinguish with start side. -#endif activity.flow.id = fwd->flow.id = fwd_bwd_link_id; activity.flow.type = fwd->flow.type = libkineto::kLinkFwdBwd; ++fwd_bwd_link_id; @@ -558,6 +603,9 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase { #ifdef USE_KINETO const auto& events = *(trace.get()->activities()); for (const auto& ev_ptr : events) { + if (ev_ptr == nullptr) { + continue; + } const auto& activity = *ev_ptr; // These events are already processed if (activity.type() != libkineto::ActivityType::CPU_OP && @@ -583,101 +631,100 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase { } uint64_t start_time_; + torch::profiler::impl::ApproximateClockToUnixTimeConverter clock_converter_; std::set activities_; - std::deque op_events_; + torch::profiler::impl::RecordQueue record_queue_; + torch::profiler::impl::AppendOnlyList memory_events_; torch::profiler::impl::kineto::TraceWrapper cpu_trace_; std::vector kineto_events_; // Optional, if event post-processing is enabled. - std::function&)> event_post_process_cb_; + post_process_t event_post_process_cb_; }; +static std::unique_ptr globalStatePtr; + +template +static void initGlobalState(Args... 
args) { + if (globalStatePtr) { + LOG(WARNING) << "GlobalStatePtr already exists!"; + } else { + globalStatePtr = std::make_unique(std::forward(args)...); + } +} + +static void resetGlobalState() { + TORCH_INTERNAL_ASSERT(globalStatePtr != nullptr, "Global state ptr cannot be null before resetting"); + globalStatePtr.reset(); +} + +template +static KinetoThreadLocalState* getStatePtr() { + return c10::guts::if_constexpr( + [] { return globalStatePtr.get(); }, + [] { return KinetoThreadLocalState::getTLS(); }); +} + +template +std::unique_ptr onFunctionEnter(const at::RecordFunction& fn) { + auto state_ptr = getStatePtr(); + if (!state_ptr) { + return nullptr; + } + auto corr_id = next_correlation_id(); + if (fn.scope() == at::RecordScope::USER_SCOPE) { + torch::profiler::impl::kineto::pushUserCorrelationId(corr_id); + } else { + torch::profiler::impl::kineto::pushCorrelationId(corr_id); + } + return state_ptr->record_queue_.getSubqueue()->begin_op(fn, corr_id); +} + +// @lint-ignore CLANGTIDY clang-diagnostic-unused-parameter +template +void onFunctionExit(const at::RecordFunction& fn, at::ObserverContext* ctx_ptr) { + auto state_ptr = getStatePtr(); + if (!state_ptr) { + return; + } + const auto& config = state_ptr->config(); + auto* kineto_ctx_ptr = + static_cast(ctx_ptr); + TORCH_INTERNAL_ASSERT(kineto_ctx_ptr != nullptr); + kineto_ctx_ptr->event_->end_time_ = torch::profiler::impl::getApproximateTime(); + kineto_ctx_ptr->event_->end_thread_id_ = at::RecordFunction::currentThreadId(); + if (config.state == ProfilerState::KINETO_GPU_FALLBACK) { + try { + auto fallback = kineto_ctx_ptr->fallback_; + TORCH_INTERNAL_ASSERT(fallback != nullptr); + torch::profiler::impl::cudaStubs()->record( + nullptr, &fallback->cuda_event_end_, nullptr); + } catch (const std::exception& e) { + LOG(WARNING) << "Failed to record CUDA event. 
" << e.what(); + } + } + + if (fn.scope() == at::RecordScope::USER_SCOPE) { + torch::profiler::impl::kineto::popUserCorrelationId(); + } else { + torch::profiler::impl::kineto::popCorrelationId(); + } +} + +template void pushProfilingCallbacks(const std::unordered_set& scopes) { - auto registration_state_ptr = KinetoThreadLocalState::getTLS(); + auto registration_state_ptr = getStatePtr(); TORCH_INTERNAL_ASSERT(registration_state_ptr, "Expected profiler state set"); - auto handle = at::addThreadLocalCallback( + auto recordFunctionCallback = at::RecordFunctionCallback( - [](const at::RecordFunction& fn) - -> std::unique_ptr { - auto state_ptr = KinetoThreadLocalState::getTLS(); - if (!state_ptr) { - return nullptr; - } - const auto& config = state_ptr->config(); - auto corr_id = next_correlation_id(); - torch::profiler::impl::kineto::pushCorrelationId(corr_id); - - auto ctx_ptr = state_ptr->newOpEvent(); - auto data_ptr = ctx_ptr->data_; - - data_ptr->end_us_ = std::numeric_limits::min(); - data_ptr->correlation_id_ = corr_id; - data_ptr->start_thread_id_ = fn.threadId(); - data_ptr->sequence_number_ = fn.seqNr(); - data_ptr->forward_thread_id_ = fn.forwardThreadId(); - data_ptr->record_function_scope_ = (uint8_t)fn.scope(); - data_ptr->is_async_ = fn.isAsync(); - data_ptr->debug_handle_ = fn.debugHandle(); - data_ptr->kineto_info_ = torch::profiler::impl::kineto::kineto_ids(); - data_ptr->name_ = fn.name(); - if (config.report_input_shapes) { - data_ptr->shapes_ = torch::profiler::impl::inputSizes(fn); - data_ptr->dtypes_ = torch::profiler::impl::inputTypes(fn); - } -#if !defined BUILD_LITE_INTERPRETER && !defined C10_MOBILE - // backward nodes source range corresponds to the forward node - // TODO: consider using C++ stack trace - if (config.with_stack && - fn.scope() != at::RecordScope::BACKWARD_FUNCTION) { - auto cs = torch::profiler::impl::prepareCallstack(jit::currentCallstack()); - data_ptr->stack_ = callstackStr(cs); - } - if (config.with_modules && - fn.scope() != at::RecordScope::BACKWARD_FUNCTION) { - data_ptr->module_hierarchy_ = jit::currentModuleHierarchy(); - } -#endif - if (config.with_flops) { - data_ptr->extra_args_ = torch::profiler::impl::saveExtraArgs(fn); - } - data_ptr->start_us_ = getTimeUs(); - - if (config.state == ProfilerState::KINETO_GPU_FALLBACK) { - try { - torch::profiler::impl::cudaStubs()->record( - nullptr, &data_ptr->cuda_event_start_, nullptr); - } catch (const std::exception& e) { - LOG(WARNING) << "Failed to record CUDA event. " << e.what(); - } - } - return ctx_ptr; - }, - [](const at::RecordFunction& fn, at::ObserverContext* ctx_ptr) { - auto state_ptr = KinetoThreadLocalState::getTLS(); - if (!state_ptr) { - return; - } - const auto& config = state_ptr->config(); - auto* kineto_ctx_ptr = - static_cast(ctx_ptr); - TORCH_INTERNAL_ASSERT(kineto_ctx_ptr != nullptr); - auto data_ptr = kineto_ctx_ptr->data_; - data_ptr->end_us_ = getTimeUs(); - data_ptr->end_thread_id_ = at::RecordFunction::currentThreadId(); - - if (config.state == ProfilerState::KINETO_GPU_FALLBACK) { - try { - torch::profiler::impl::cudaStubs()->record( - nullptr, &data_ptr->cuda_event_end_, nullptr); - } catch (const std::exception& e) { - LOG(WARNING) << "Failed to record CUDA event. 
" << e.what(); - } - } - - torch::profiler::impl::kineto::popCorrelationId(); - torch::profiler::impl::kineto::recordThreadInfo(); - }) + onFunctionEnter, + onFunctionExit) .needsInputs(registration_state_ptr->config().report_input_shapes) - .scopes(scopes)); + .scopes(scopes); + + auto handle = c10::guts::if_constexpr( + [&] { return at::addGlobalCallback(recordFunctionCallback); }, + [&] { return at::addThreadLocalCallback(recordFunctionCallback); + }); registration_state_ptr->setCallbackHandle(handle); } @@ -690,26 +737,21 @@ void reportBackendEventToActiveKinetoProfiler( const at::RecordScope scope, const std::string& event_name, const std::string& backend_name) { + TORCH_INTERNAL_ASSERT(globalStatePtr == nullptr, "On-demand profiling does not support post processing callback"); + auto state_ptr = KinetoThreadLocalState::getTLS(); if (!state_ptr) { return; } - auto ctx_ptr = state_ptr->newOpEvent(); - auto data_ptr = ctx_ptr->data_; - data_ptr->start_us_ = start_time_us; - data_ptr->end_us_ = end_time_us; - data_ptr->correlation_id_ = std::numeric_limits::max(); - data_ptr->start_thread_id_ = at::RecordFunction::currentThreadId(); - data_ptr->end_thread_id_ = data_ptr->start_thread_id_; - data_ptr->sequence_number_ = -1; - data_ptr->forward_thread_id_ = data_ptr->start_thread_id_; - data_ptr->record_function_scope_ = (uint8_t)scope; - data_ptr->is_async_ = false; - data_ptr->debug_handle_ = debug_handle; - data_ptr->kineto_info_ = torch::profiler::impl::kineto::kineto_ids(); - data_ptr->name_ = event_name; - data_ptr->backend_ = backend_name; + state_ptr->record_queue_.getSubqueue()->emplace_backend_event( + torch::profiler::impl::BackendEvent { + start_time_us, + end_time_us, + (uint8_t)scope, + debug_handle, + event_name, + backend_name}); /* no support for input shapes now? 
if (config.report_input_shapes) { @@ -717,8 +759,6 @@ void reportBackendEventToActiveKinetoProfiler( ctx_ptr->dtypes = inputTypes(fn); } */ - - torch::profiler::impl::kineto::recordThreadInfo(); } void prepareProfiler( @@ -732,17 +772,19 @@ void prepareProfiler( config.state == ProfilerState::KINETO_GPU_FALLBACK, "Supported only in Kineto profiler"); torch::profiler::impl::kineto::prepareTrace( - /*cpuOnly=*/!at::hasCUDA(), activities); + /*cpuOnly=*/!at::hasCUDA(), activities, config.experimental_config); } void enableProfilerWithEventPostProcess( const torch::profiler::impl::ProfilerConfig& config, const std::set& activities, - std::function&)>&& cb, + post_process_t&& cb, const std::unordered_set& scopes) { TORCH_CHECK( config.state != ProfilerState::NVTX, "NVTX does not support post processing callback."); + TORCH_INTERNAL_ASSERT(globalStatePtr == nullptr, "On-demand profiling does not support post processing callback"); + enableProfiler(config, activities, scopes); auto state_ptr = KinetoThreadLocalState::getTLS(); state_ptr->setEventPostProcessingCallback(std::move(cb)); @@ -760,36 +802,44 @@ void enableProfiler( TORCH_CHECK( config.state == ProfilerState::KINETO || - config.state == ProfilerState::KINETO_GPU_FALLBACK); + config.state == ProfilerState::KINETO_GPU_FALLBACK || + config.state == ProfilerState::KINETO_ONDEMAND); TORCH_CHECK( !activities.empty(), "No activities specified for Kineto profiler"); - auto state = std::make_shared(config, activities); - c10::ThreadLocalDebugInfo::_push(c10::DebugInfoKind::PROFILER_STATE, state); + if (config.state == ProfilerState::KINETO || + config.state == ProfilerState::KINETO_GPU_FALLBACK) { + auto state = std::make_shared(config, activities); + c10::ThreadLocalDebugInfo::_push(c10::DebugInfoKind::PROFILER_STATE, state); - if (state->tracePython()) { - python_tracer::call(python_tracer::Command::kStartOne); - } + if (state->tracePython()) { + python_tracer::call(python_tracer::Command::kStartOne); + } - if (activities.count(ActivityType::CPU)) { - pushProfilingCallbacks(scopes); + if (activities.count(ActivityType::CPU)) { + pushProfilingCallbacks(scopes); + } + torch::profiler::impl::kineto::startTrace(); } - torch::profiler::impl::kineto::startTrace(); + if (config.state == ProfilerState::KINETO_ONDEMAND) { + initGlobalState(config, activities); + + TORCH_INTERNAL_ASSERT(activities.count(ActivityType::CPU), "Ondemand profiling must enable CPU tracing"); + pushProfilingCallbacks(scopes); + } } std::unique_ptr disableProfiler() { - // all the DebugInfoBase objects are scope based and supposed to use - // DebugInfoGuard - auto state = - c10::ThreadLocalDebugInfo::_pop(c10::DebugInfoKind::PROFILER_STATE); + auto state_ptr = static_cast( + (globalStatePtr == nullptr) ? 
getStatePtr() : getStatePtr()); - auto state_ptr = static_cast(state.get()); const auto& config = state_ptr->config(); TORCH_CHECK( state_ptr && (config.state == ProfilerState::KINETO || config.state == ProfilerState::KINETO_GPU_FALLBACK || + config.state == ProfilerState::KINETO_ONDEMAND || config.state == ProfilerState::NVTX), "Can't disable Kineto profiler when it's not running"); @@ -797,24 +847,42 @@ std::unique_ptr disableProfiler() { at::removeCallback(state_ptr->callbackHandle()); } - if (state_ptr->config().state == ProfilerState::NVTX) { + // Traces are converged via libkineto automatically for ondemand flow + if (state_ptr->config().state == ProfilerState::KINETO_ONDEMAND) { + auto kineto_state_ptr = static_cast(state_ptr); + auto trace = kineto_state_ptr->finalizeTrace(); + resetGlobalState(); return std::make_unique(); } - auto kineto_state_ptr = static_cast(state_ptr); - if (kineto_state_ptr->tracePython()) { - python_tracer::call(python_tracer::Command::kStop); + // Shared among NVTX, KINETO, KINETO_GPU_FALLBACK + std::unique_ptr result; + if (state_ptr->config().state == ProfilerState::NVTX) { + result = std::make_unique(); } - auto trace = kineto_state_ptr->finalizeTrace(); - if (kineto_state_ptr->tracePython()) { - python_tracer::call(python_tracer::Command::kClear); + if (config.state == ProfilerState::KINETO || + config.state == ProfilerState::KINETO_GPU_FALLBACK) { + auto kineto_state_ptr = static_cast(state_ptr); + if (kineto_state_ptr->tracePython()) { + python_tracer::call(python_tracer::Command::kStop); + } + + auto trace = kineto_state_ptr->finalizeTrace(); + if (kineto_state_ptr->tracePython()) { + python_tracer::call(python_tracer::Command::kClear); + } + + result = std::make_unique( + kineto_state_ptr->start_time_, + std::move(kineto_state_ptr->kineto_events_), + std::move(trace)); } - return std::make_unique( - kineto_state_ptr->start_time_, - std::move(kineto_state_ptr->kineto_events_), - std::move(trace)); + // Disable thread-local profiler. We can't pop until the very end as it would invalidate + // the `state_ptr` reference which we need to process the traces. + (void)c10::ThreadLocalDebugInfo::_pop(c10::DebugInfoKind::PROFILER_STATE); + return result; } int64_t KinetoEvent::cudaElapsedUs() const { diff --git a/torch/csrc/autograd/profiler_kineto.h b/torch/csrc/autograd/profiler_kineto.h index c7b130c9c250..c98009631766 100644 --- a/torch/csrc/autograd/profiler_kineto.h +++ b/torch/csrc/autograd/profiler_kineto.h @@ -344,10 +344,14 @@ TORCH_API void enableProfiler( * callback, via enableProfilerWithEventPostProcess, that takes these debug handles * and generates stack trace and module hierarchy information, once profiling is done. 
*/ +using post_process_t = std::function&, + /*jit_modules */ std::vector&)>; TORCH_API void enableProfilerWithEventPostProcess( const torch::profiler::impl::ProfilerConfig& config, const std::set& activities, - std::function&)>&& cb, + post_process_t&& cb, const std::unordered_set& scopes = {}); TORCH_API std::unique_ptr disableProfiler(); diff --git a/torch/csrc/autograd/profiler_legacy.h b/torch/csrc/autograd/profiler_legacy.h index b0ebc3649cd9..5dad7de1a250 100644 --- a/torch/csrc/autograd/profiler_legacy.h +++ b/torch/csrc/autograd/profiler_legacy.h @@ -122,15 +122,15 @@ struct TORCH_API LegacyEvent { double cpuElapsedUs(const LegacyEvent& e) const { // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions,cppcoreguidelines-avoid-magic-numbers) - return (e.cpu_ns_ - cpu_ns_)/(1000.0); + return static_cast(e.cpu_ns_ - cpu_ns_)/(1000.0); } void setCpuUs(int64_t cpu_us) { - cpu_ns_ = cpu_us * 1000.0; + cpu_ns_ = static_cast(cpu_us) * 1000.0; } double cpuUs() const { - return cpu_ns_ / (1000.0); + return static_cast(cpu_ns_) / (1000.0); } double cudaElapsedUs(const LegacyEvent& e) const; diff --git a/torch/csrc/autograd/profiler_python.cpp b/torch/csrc/autograd/profiler_python.cpp index 8ca06a3674bb..6c1675f121b0 100644 --- a/torch/csrc/autograd/profiler_python.cpp +++ b/torch/csrc/autograd/profiler_python.cpp @@ -31,69 +31,69 @@ namespace { // It is passed as the second argument when enabling tracing via // `PyEval_SetProfile`. struct TraceContext { - PyObject_HEAD - - // It is wasteful to store an entire PyThreadState* in RawEvent. So - // instead, we map thread ids down to a compact space that we can store in - // a single byte. - uint8_t thread_id_; - PyThreadState* thread_state_; - - // Likewise, int64_t is more precision than we need. By tracking when the - // profiler starts we can store "time since profile begin" which can fit - // into less space. - int64_t initial_us_; - - // TODO: - // Wall time is actually fairly expensive to compute. Empirically, it - // takes ~600 ns to call `now()`. This puts a hard lower bound on the - // overhead of the tracer. If we collected wall time less frequently, and - // used TSC (e.g. through __rdtsc) to interpolate it should be possible - // to reduce time spent on timestamps while retaining the same level of - // accuracy. + PyObject_HEAD + + // It is wasteful to store an entire PyThreadState* in RawEvent. So + // instead, we map thread ids down to a compact space that we can store in + // a single byte. + uint8_t thread_id_; + PyThreadState* thread_state_; + + // Likewise, int64_t is more precision than we need. By tracking when the + // profiler starts we can store "time since profile begin" which can fit + // into less space. + int64_t initial_us_; + + // TODO: + // Wall time is actually fairly expensive to compute. Empirically, it + // takes ~600 ns to call `now()`. This puts a hard lower bound on the + // overhead of the tracer. If we collected wall time less frequently, and + // used TSC (e.g. through __rdtsc) to interpolate it should be possible + // to reduce time spent on timestamps while retaining the same level of + // accuracy. }; // CPython boilerplate to define `TraceContext` as a proper python object. 
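The TraceContext comments above describe storing per-event times as small offsets from `initial_us_` rather than full 64-bit timestamps. A self-contained sketch of that compaction, using illustrative names only (a 32-bit microsecond offset covers roughly 71 minutes of tracing), not part of the patch:

    #include <chrono>
    #include <cstdint>
    #include <iostream>
    #include <limits>

    int main() {
      using namespace std::chrono;
      // Stand-in for TraceContext::initial_us_: the profile start time in microseconds.
      const std::int64_t initial_us =
          duration_cast<microseconds>(steady_clock::now().time_since_epoch()).count();

      // Encode: an event only stores the 32-bit offset from the profile start.
      const std::int64_t event_us = initial_us + 1234;  // pretend the event fired 1234 us later
      const auto offset = static_cast<std::uint32_t>(event_us - initial_us);

      // Decode: the full timestamp is recovered during post processing.
      const std::int64_t recovered_us = initial_us + offset;

      // A 32-bit microsecond offset covers roughly 71.6 minutes of tracing.
      const double max_minutes =
          std::numeric_limits<std::uint32_t>::max() / 1e6 / 60.0;

      std::cout << "offset=" << offset << " recovered=" << recovered_us
                << " max_session_minutes=" << max_minutes << "\n";
    }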
static PyTypeObject TraceContextType = { - PyVarObject_HEAD_INIT(nullptr, 0) - "TraceContext", /* tp_name */ - sizeof(TraceContext), /* tp_basicsize */ - 0, /* tp_itemsize */ - nullptr, /* tp_dealloc */ - 0, /* tp_vectorcall_offset */ // NOLINT: modernize-use-nullptr - nullptr, /* tp_getattr */ - nullptr, /* tp_setattr */ - nullptr, /* tp_reserved */ - nullptr, /* tp_repr */ - nullptr, /* tp_as_number */ - nullptr, /* tp_as_sequence */ - nullptr, /* tp_as_mapping */ - nullptr, /* tp_hash */ - nullptr, /* tp_call */ - nullptr, /* tp_str */ - nullptr, /* tp_getattro */ - nullptr, /* tp_setattro */ - nullptr, /* tp_as_buffer */ - Py_TPFLAGS_DEFAULT, /* tp_flags */ - "Python tracer TLS", /* tp_doc */ - nullptr, /* tp_traverse */ - nullptr, /* tp_clear */ - nullptr, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ - nullptr, /* tp_iter */ - nullptr, /* tp_iternext */ - nullptr, /* tp_methods */ - nullptr, /* tp_members */ - nullptr, /* tp_getset */ - nullptr, /* tp_base */ - nullptr, /* tp_dict */ - nullptr, /* tp_descr_get */ - nullptr, /* tp_descr_set */ - 0, /* tp_dictoffset */ - nullptr, /* tp_init */ - nullptr, /* tp_alloc */ - PyType_GenericNew, /* tp_new */ - nullptr /* tp_free */ + PyVarObject_HEAD_INIT(nullptr, 0) + "TraceContext", /* tp_name */ + sizeof(TraceContext), /* tp_basicsize */ + 0, /* tp_itemsize */ + nullptr, /* tp_dealloc */ + 0, /* tp_vectorcall_offset */ // NOLINT: modernize-use-nullptr + nullptr, /* tp_getattr */ + nullptr, /* tp_setattr */ + nullptr, /* tp_reserved */ + nullptr, /* tp_repr */ + nullptr, /* tp_as_number */ + nullptr, /* tp_as_sequence */ + nullptr, /* tp_as_mapping */ + nullptr, /* tp_hash */ + nullptr, /* tp_call */ + nullptr, /* tp_str */ + nullptr, /* tp_getattro */ + nullptr, /* tp_setattro */ + nullptr, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT, /* tp_flags */ + "Python tracer TLS", /* tp_doc */ + nullptr, /* tp_traverse */ + nullptr, /* tp_clear */ + nullptr, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + nullptr, /* tp_iter */ + nullptr, /* tp_iternext */ + nullptr, /* tp_methods */ + nullptr, /* tp_members */ + nullptr, /* tp_getset */ + nullptr, /* tp_base */ + nullptr, /* tp_dict */ + nullptr, /* tp_descr_get */ + nullptr, /* tp_descr_set */ + 0, /* tp_dictoffset */ + nullptr, /* tp_init */ + nullptr, /* tp_alloc */ + PyType_GenericNew, /* tp_new */ + nullptr /* tp_free */ }; // CPython has a more expressive set of events for tracing / profiling: @@ -105,12 +105,7 @@ static PyTypeObject TraceContextType = { // our replay stack), and we are not interested in `PyTrace_LINE` or // `PyTrace_OPCODE`. To simplify things we store our own enum when tracefunc is // called, and then use for all subsequent processing. -enum TraceTag { - kPy_Call = 0, - kPy_Return, - kC_Call, - kC_Return -}; +enum TraceTag { kPy_Call = 0, kPy_Return, kC_Call, kC_Return }; // When we are tracing a Python program, the general procedure is to record // every time we enter or exit a function and later replay these events during @@ -156,78 +151,76 @@ enum TraceTag { // `RawEvent` would grow to three words. (Not just 50% bigger, but also less // cache friendly.) 
struct RawEvent { - RawEvent(TraceTag tag, int lasti, TraceContext* ctx) - : tag_(static_cast(tag)), - thread_id_(ctx->thread_id_), - lasti_(static_cast(lasti)), - misc_() { - int64_t t = now() - ctx->initial_us_; - t_ = static_cast(t); - - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(lasti <= std::numeric_limits::max()); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(t <= std::numeric_limits::max()); - } - - RawEvent(TraceTag tag, int lasti, TraceContext* ctx, PyCodeObject* f_code) - : RawEvent(tag, lasti, ctx) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(tag == TraceTag::kPy_Call); - misc_.f_code_ = f_code; - } - - RawEvent(TraceTag tag, int lasti, TraceContext* ctx, PyObject* arg) - : RawEvent(tag, lasti, ctx) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(tag == TraceTag::kC_Call); - misc_.arg_ = arg; - } - - uint8_t tag_; - uint8_t thread_id_; - uint16_t lasti_; - uint32_t t_; - union { - // TraceTag::kPy_Call - PyCodeObject* f_code_; - - // TraceTag::kC_Call - PyObject* arg_; - - // TraceTag::kPy_Return - // TraceTag::kC_Return - // ** Unused (placeholder) ** - void* null_; - } misc_; - - C10_NODISCARD TraceTag tag() const { - return static_cast(tag_); - } - - C10_NODISCARD int lasti() const { - // f_lasti is positive, with one exception: CPython intializes frames - // with `f_lasti = -1`. We don't want to give up half of the range by - // switching to int16_t. So instead we do the fast (underflowing) cast - // in the ctor, and rectify the value in this accessor which should - // only be called during trace post processing. - return lasti_ == std::numeric_limits::max() - ? (int)(-1) - : (int)lasti_; - } + RawEvent(TraceTag tag, int lasti, TraceContext* ctx) + : tag_(static_cast(tag)), + thread_id_(ctx->thread_id_), + lasti_(static_cast(lasti)), + misc_() { + int64_t t = now() - ctx->initial_us_; + t_ = static_cast(t); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + lasti <= std::numeric_limits::max()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(t <= std::numeric_limits::max()); + } + + RawEvent(TraceTag tag, int lasti, TraceContext* ctx, PyCodeObject* f_code) + : RawEvent(tag, lasti, ctx) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(tag == TraceTag::kPy_Call); + misc_.f_code_ = f_code; + } + + RawEvent(TraceTag tag, int lasti, TraceContext* ctx, PyObject* arg) + : RawEvent(tag, lasti, ctx) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(tag == TraceTag::kC_Call); + misc_.arg_ = arg; + } + + uint8_t tag_; + uint8_t thread_id_; + uint16_t lasti_; + uint32_t t_; + union { + // TraceTag::kPy_Call + PyCodeObject* f_code_; + + // TraceTag::kC_Call + PyObject* arg_; + + // TraceTag::kPy_Return + // TraceTag::kC_Return + // ** Unused (placeholder) ** + void* null_; + } misc_; + + C10_NODISCARD TraceTag tag() const { + return static_cast(tag_); + } + + C10_NODISCARD int lasti() const { + // f_lasti is positive, with one exception: CPython intializes frames + // with `f_lasti = -1`. We don't want to give up half of the range by + // switching to int16_t. So instead we do the fast (underflowing) cast + // in the ctor, and rectify the value in this accessor which should + // only be called during trace post processing. + return lasti_ == std::numeric_limits::max() ? (int)(-1) + : (int)lasti_; + } }; // Make sure the bit packing that we do in RawEvent actually results in the // desired size reduction. static_assert(sizeof(RawEvent) <= 16, "RawEvent is too large"); - // std::hash doesn't have a specialization for pairs so we have to define one. // A simple XOR is good enough for our purposes. 
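A standalone illustration of the `f_lasti` narrowing used by `RawEvent::lasti()` above, with illustrative names; it only demonstrates the wrap-and-rectify round trip, not the surrounding event machinery:

    #include <cstdint>
    #include <iostream>
    #include <limits>

    // Mirrors the rectify step in RawEvent::lasti(): -1 wraps to 0xFFFF when
    // narrowed to uint16_t, and the accessor maps that sentinel back to -1.
    int rectify(std::uint16_t stored) {
      return stored == std::numeric_limits<std::uint16_t>::max()
          ? -1
          : static_cast<int>(stored);
    }

    int main() {
      const int f_lasti = -1;  // CPython initializes frames with f_lasti = -1
      const auto stored = static_cast<std::uint16_t>(f_lasti);  // wraps to 65535
      std::cout << stored << " -> " << rectify(stored) << "\n";                      // 65535 -> -1
      std::cout << 42 << " -> " << rectify(static_cast<std::uint16_t>(42)) << "\n";  // 42 -> 42
    }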
struct hash_pair { - template - size_t operator() (const std::pair& pair) const { - return std::hash()(pair.first) ^ std::hash()(pair.second); - } + template + size_t operator()(const std::pair& pair) const { + return std::hash()(pair.first) ^ std::hash()(pair.second); + } }; - // ============================================================================ // == Tracing implementation ================================================== // ============================================================================ @@ -235,211 +228,221 @@ constexpr size_t max_py_threads = std::numeric_limits::max() + 1; class PythonTracer final { public: - // Static methods serve as external interfaces (which expect raw pointers) - // and handle forwarding to the singleton. - static void call(Command c); + // Static methods serve as external interfaces (which expect raw pointers) + // and handle forwarding to the singleton. + static void call(Command c); - static int pyProfileFn( - PyObject* obj, - PyFrameObject* frame, - int what, - PyObject* arg); + static int pyProfileFn( + PyObject* obj, + PyFrameObject* frame, + int what, + PyObject* arg); private: - PythonTracer(); - static PythonTracer& singleton(); - friend class PyTraceReplay; - - void start(size_t max_threads = max_py_threads); - void stop(); - void clear(); - - void recordPyCall(TraceContext* ctx, PyFrameObject* frame); - void recordCCall(TraceContext* ctx, PyFrameObject* frame, PyObject* arg); - void recordReturn(TraceContext* ctx, PyFrameObject* frame, TraceTag tag); - - void storeDescription(PyFrameObject* frame); - void trackModule(PyFrameObject* frame); - - // It is imperitive that we do not store strings for each python function, - // as that would do terrible things to our profiling overhead. So instead - // we store the much cheaper pair of `PyCodeObject*` and `int` which we can - // pack into `RawEvent`, and then store a mapping to the full strings the - // first time we see a function. - // - // TODO: - // In theory we should be able to use a combination of Py_INCREF on - // `f_code` and string interning to skip this step. (Effectively reusing - // work that the CPython interpreter has already done.) However it tends - // to segfault and simply caching the strings is inexpensive. - struct CodeDescription { - CodeDescription(int line_no, std::string filename, std::string funcname) - : line_no_(line_no), - filename_(std::move(filename)), - funcname_(std::move(funcname)) {} - int line_no_; - std::string filename_; - std::string funcname_; - }; - - struct ModuleForward { - ModuleForward(size_t event_index, PyObject* self) - : event_index_(event_index), self_(self) {} - size_t event_index_; - - // NB: - // This is a non-owning reference to keep `ModuleForward` POD; - // `PythonTracer` owns the contents instead. We Py_INCREF in - // `trackModule`, and `reset` is responsible for calling Py_DECREF - // when clearing `module_calls_`. 
- PyObject* self_; - }; - - bool active_; - PyObject* module_call_code_; - std::vector path_prefixes_; - std::vector trace_contexts_; - - std::vector events_; - std::vector module_calls_; - - using DescriptionKey = std::pair; - ska::flat_hash_map code_descriptions_; - ska::flat_hash_map c_function_reprs_; + PythonTracer(); + static PythonTracer& singleton(); + friend class PyTraceReplay; + + void start(size_t max_threads = max_py_threads); + void stop(); + void clear(); + + void recordPyCall(TraceContext* ctx, PyFrameObject* frame); + void recordCCall(TraceContext* ctx, PyFrameObject* frame, PyObject* arg); + void recordReturn(TraceContext* ctx, PyFrameObject* frame, TraceTag tag); + + void storeDescription(PyFrameObject* frame); + void trackModule(PyFrameObject* frame); + + // It is imperitive that we do not store strings for each python function, + // as that would do terrible things to our profiling overhead. So instead + // we store the much cheaper pair of `PyCodeObject*` and `int` which we can + // pack into `RawEvent`, and then store a mapping to the full strings the + // first time we see a function. + // + // TODO: + // In theory we should be able to use a combination of Py_INCREF on + // `f_code` and string interning to skip this step. (Effectively reusing + // work that the CPython interpreter has already done.) However it tends + // to segfault and simply caching the strings is inexpensive. + struct CodeDescription { + CodeDescription(int line_no, std::string filename, std::string funcname) + : line_no_(line_no), + filename_(std::move(filename)), + funcname_(std::move(funcname)) {} + int line_no_; + std::string filename_; + std::string funcname_; + }; + + struct ModuleForward { + ModuleForward(size_t event_index, PyObject* self) + : event_index_(event_index), self_(self) {} + size_t event_index_; + + // NB: + // This is a non-owning reference to keep `ModuleForward` POD; + // `PythonTracer` owns the contents instead. We Py_INCREF in + // `trackModule`, and `reset` is responsible for calling Py_DECREF + // when clearing `module_calls_`. 
+ PyObject* self_; + }; + + bool active_; + PyObject* module_call_code_; + std::vector path_prefixes_; + std::vector trace_contexts_; + + std::vector events_; + std::vector module_calls_; + + using DescriptionKey = std::pair; + ska::flat_hash_map + code_descriptions_; + ska::flat_hash_map c_function_reprs_; }; PythonTracer& PythonTracer::singleton() { - static PythonTracer singleton_; - return singleton_; + static PythonTracer singleton_; + return singleton_; } PythonTracer::PythonTracer() : active_(false) { - path_prefixes_ = py::module::import("torch.profiler.python_tracer") - .attr("_prefix_regex")().cast>(); - - module_call_code_ = py::module::import("torch.nn") - .attr("Module") - .attr("__call__") - .attr("__code__") - .ptr(); + path_prefixes_ = py::module::import("torch.profiler.python_tracer") + .attr("_prefix_regex")().cast>(); + + module_call_code_ = py::module::import("torch.nn") + .attr("Module") + .attr("__call__") + .attr("__code__") + .ptr(); } void PythonTracer::start(size_t max_threads) { - TORCH_CHECK(!active_, "PythonTracer is already active") - TORCH_CHECK(!trace_contexts_.size(), "PythonTracer should not have active contexts"); - TORCH_CHECK(max_threads > 0, "max_threads must be positive, got ", max_threads); - TORCH_CHECK( - max_threads <= max_py_threads, - "max_threads must be less than or equal to ", max_py_threads); - - pybind11::gil_scoped_acquire gil; - auto t0 = now(); - - // Loop over all threads within the current interpreter. We will need to - // register a trace function with each thread. We set the current thread to - // position zero to ensure that it is traced, and so we can restore the - // thread state after registration. - std::vector thread_states { PyThreadState_Get() }; - if (max_threads > 1) { - auto thread_state = thread_states[0]; - while (thread_state != nullptr) { - if (thread_state != thread_states[0]) { - thread_states.push_back(thread_state); - } - thread_state = PyThreadState_Next(thread_state); - } - - if (thread_states.size() > max_threads) { - std::cout << "Warning: can only trace " << max_threads << " threads. " - << thread_states.size() << " are currently active." << std::endl; - thread_states.resize(max_threads); - } + TORCH_CHECK(!active_, "PythonTracer is already active") + TORCH_CHECK( + !trace_contexts_.size(), "PythonTracer should not have active contexts"); + TORCH_CHECK( + max_threads > 0, "max_threads must be positive, got ", max_threads); + TORCH_CHECK( + max_threads <= max_py_threads, + "max_threads must be less than or equal to ", + max_py_threads); + + pybind11::gil_scoped_acquire gil; + auto t0 = now(); + + // Loop over all threads within the current interpreter. We will need to + // register a trace function with each thread. We set the current thread to + // position zero to ensure that it is traced, and so we can restore the + // thread state after registration. + std::vector thread_states{PyThreadState_Get()}; + if (max_threads > 1) { + auto thread_state = thread_states[0]; + while (thread_state != nullptr) { + if (thread_state != thread_states[0]) { + thread_states.push_back(thread_state); + } + thread_state = PyThreadState_Next(thread_state); } - // Register the tracer in each thread. 
- for (const auto i : c10::irange(thread_states.size())) { - PyThreadState* thread_state = thread_states[i]; - PyThreadState_Swap(thread_state); - - auto ctx = (TraceContext*) TraceContextType.tp_alloc(&TraceContextType, 0); - ctx->thread_id_ = (uint8_t)i; - ctx->thread_state_ = thread_state; - ctx->initial_us_ = t0; - trace_contexts_.push_back(ctx); - - // When we begin profiling there are already frames on the Python - // interpreter stack. To ensure a complete trace, we must push calls - // to all the prior frames onto our event stack. (We stop at depth=128) - std::vector current_stack; - auto frame = PyEval_GetFrame(); - size_t depth = 0; // Make sure we can't infinite loop. - while (frame != nullptr && depth <= 128) { - current_stack.push_back(frame); - frame = frame->f_back; - depth++; - } - for (auto it = current_stack.rbegin(); it != current_stack.rend(); it++) { - recordPyCall(ctx, *it); - } - - // Note: - // This profile will not compose with other CPython profilers, and - // cannot be round tripped via `sys.settrace(sys.gettrace())` - PyEval_SetProfile(PythonTracer::pyProfileFn, (PyObject*)ctx); + if (thread_states.size() > max_threads) { + std::cout << "Warning: can only trace " << max_threads << " threads. " + << thread_states.size() << " are currently active." + << std::endl; + thread_states.resize(max_threads); + } + } + + // Register the tracer in each thread. + for (const auto i : c10::irange(thread_states.size())) { + PyThreadState* thread_state = thread_states[i]; + PyThreadState_Swap(thread_state); + + auto ctx = (TraceContext*)TraceContextType.tp_alloc(&TraceContextType, 0); + ctx->thread_id_ = (uint8_t)i; + ctx->thread_state_ = thread_state; + ctx->initial_us_ = t0; + trace_contexts_.push_back(ctx); + + // When we begin profiling there are already frames on the Python + // interpreter stack. To ensure a complete trace, we must push calls + // to all the prior frames onto our event stack. (We stop at depth=128) + std::vector current_stack; + auto frame = PyEval_GetFrame(); + size_t depth = 0; // Make sure we can't infinite loop. + while (frame != nullptr && depth <= 128) { + current_stack.push_back(frame); + frame = frame->f_back; + depth++; + } + for (auto it = current_stack.rbegin(); it != current_stack.rend(); it++) { + recordPyCall(ctx, *it); } - // Restore the thread state to its initial value. - PyThreadState_Swap(thread_states[0]); + // Note: + // This profile will not compose with other CPython profilers, and + // cannot be round tripped via `sys.settrace(sys.gettrace())` + PyEval_SetProfile(PythonTracer::pyProfileFn, (PyObject*)ctx); + } - active_ = true; + // Restore the thread state to its initial value. 
+ PyThreadState_Swap(thread_states[0]); + + active_ = true; }; void PythonTracer::stop() { - TORCH_INTERNAL_ASSERT(active_, "PythonTracer is not running.") + TORCH_INTERNAL_ASSERT(active_, "PythonTracer is not running.") - pybind11::gil_scoped_acquire gil; + pybind11::gil_scoped_acquire gil; - PyThreadState* initial_thread_state = PyThreadState_Get(); - for (const auto i : trace_contexts_) { - PyThreadState_Swap(i->thread_state_); - PyEval_SetProfile(nullptr, nullptr); - } - PyThreadState_Swap(initial_thread_state); - active_ = false; + PyThreadState* initial_thread_state = PyThreadState_Get(); + for (const auto i : trace_contexts_) { + PyThreadState_Swap(i->thread_state_); + PyEval_SetProfile(nullptr, nullptr); + } + PyThreadState_Swap(initial_thread_state); + active_ = false; } - void PythonTracer::clear() { - TORCH_CHECK(!active_, "Cannot clear state while PythonTracer is active."); - for (auto i : trace_contexts_) { - Py_DECREF((PyObject*) i); - } - trace_contexts_.clear(); - events_.clear(); - code_descriptions_.clear(); - c_function_reprs_.clear(); - for (auto& i : module_calls_) { - Py_DECREF(i.self_); - } - module_calls_.clear(); + TORCH_CHECK(!active_, "Cannot clear state while PythonTracer is active."); + for (auto i : trace_contexts_) { + Py_DECREF((PyObject*)i); + } + trace_contexts_.clear(); + events_.clear(); + code_descriptions_.clear(); + c_function_reprs_.clear(); + for (auto& i : module_calls_) { + Py_DECREF(i.self_); + } + module_calls_.clear(); } void PythonTracer::recordPyCall(TraceContext* ctx, PyFrameObject* frame) { - events_.emplace_back(TraceTag::kPy_Call, frame->f_lasti, ctx, frame->f_code); - storeDescription(frame); - trackModule(frame); + events_.emplace_back(TraceTag::kPy_Call, frame->f_lasti, ctx, frame->f_code); + storeDescription(frame); + trackModule(frame); } -void PythonTracer::recordCCall(TraceContext* ctx, PyFrameObject* frame, PyObject* arg) { - events_.emplace_back(TraceTag::kC_Call, frame->f_lasti, ctx, arg); - const auto& it = c_function_reprs_.find(arg); - if C10_UNLIKELY(it == c_function_reprs_.end()) { - c_function_reprs_[arg] = py::repr(arg); - } +void PythonTracer::recordCCall( + TraceContext* ctx, + PyFrameObject* frame, + PyObject* arg) { + events_.emplace_back(TraceTag::kC_Call, frame->f_lasti, ctx, arg); + const auto& it = c_function_reprs_.find(arg); + if C10_UNLIKELY (it == c_function_reprs_.end()) { + c_function_reprs_[arg] = py::repr(arg); + } } -void PythonTracer::recordReturn(TraceContext* ctx, PyFrameObject* frame, TraceTag tag) { - events_.emplace_back(tag, frame->f_lasti, ctx); +void PythonTracer::recordReturn( + TraceContext* ctx, + PyFrameObject* frame, + TraceTag tag) { + events_.emplace_back(tag, frame->f_lasti, ctx); } // NB: @@ -448,272 +451,265 @@ void PythonTracer::recordReturn(TraceContext* ctx, PyFrameObject* frame, TraceTa // call rather than the return. (Otherwise we would get the line with the // return stmt.) 
void PythonTracer::storeDescription(PyFrameObject* frame) { - const auto& it = code_descriptions_.find({ frame->f_code, frame->f_lasti }); - if C10_UNLIKELY(it == code_descriptions_.end()) { - code_descriptions_.insert({ - { frame->f_code, frame->f_lasti }, - { - /*line_no=*/ PyCode_Addr2Line(frame->f_code, frame->f_lasti), - /*filename=*/ THPUtils_unpackString(frame->f_code->co_filename), - /*funcname=*/ THPUtils_unpackString(frame->f_code->co_name) - } - }); - } + const auto& it = code_descriptions_.find({frame->f_code, frame->f_lasti}); + if C10_UNLIKELY (it == code_descriptions_.end()) { + code_descriptions_.insert( + {{frame->f_code, frame->f_lasti}, + {/*line_no=*/PyCode_Addr2Line(frame->f_code, frame->f_lasti), + /*filename=*/THPUtils_unpackString(frame->f_code->co_filename), + /*funcname=*/THPUtils_unpackString(frame->f_code->co_name)}}); + } } void PythonTracer::trackModule(PyFrameObject* frame) { - if ((PyObject*)(frame->f_code) == module_call_code_) { - // By default, CPython stores locals in a "fast" format, with an array - // of names and an array of values. Consequently, frame->f_locals is - // NULL since the interpreter has no need to populate it. - // - // If these arrays were part of the public API then we could very - // quickly access `self`. Unfortunately they are not, and moreover are - // not stable across versions. As a result, we are forced to call - // `PyFrame_FastToLocals` which forces the interpreter to materialize - // the full dict of locals. - PyFrame_FastToLocals(frame); - auto self = PyDict_GetItemString(frame->f_locals, "self"); - Py_INCREF(self); - module_calls_.emplace_back( - /*event_index=*/events_.size() - 1, - /*self=*/self - ); - PyFrame_LocalsToFast(frame, 0); - } + if ((PyObject*)(frame->f_code) == module_call_code_) { + // By default, CPython stores locals in a "fast" format, with an array + // of names and an array of values. Consequently, frame->f_locals is + // NULL since the interpreter has no need to populate it. + // + // If these arrays were part of the public API then we could very + // quickly access `self`. Unfortunately they are not, and moreover are + // not stable across versions. As a result, we are forced to call + // `PyFrame_FastToLocals` which forces the interpreter to materialize + // the full dict of locals. 
+ PyFrame_FastToLocals(frame); + auto self = PyDict_GetItemString(frame->f_locals, "self"); + Py_INCREF(self); + module_calls_.emplace_back( + /*event_index=*/events_.size() - 1, + /*self=*/self); + PyFrame_LocalsToFast(frame, 0); + } }; - // ============================================================================ // == Post processing ========================================================= // ============================================================================ class PyTraceReplay { public: - static std::vector> getEvents() { - return PyTraceReplay().replayStack(); - } + static std::vector> getEvents() { + return PyTraceReplay().replayStack(); + } private: - PyTraceReplay(); - std::vector> replayStack() const; + PyTraceReplay(); + std::vector> replayStack() const; - struct ReplayFrame { - std::unique_ptr event_; - size_t id_; - size_t parent_id_; - }; + struct ReplayFrame { + std::unique_ptr event_; + size_t id_; + size_t parent_id_; + }; - ska::flat_hash_map module_self_map_; - ska::flat_hash_map module_name_map_; + ska::flat_hash_map module_self_map_; + ska::flat_hash_map module_name_map_; }; PyTraceReplay::PyTraceReplay() { - ska::flat_hash_map module_names; - for (const auto& call : PythonTracer::singleton().module_calls_) { - if (module_names.find(call.self_) == module_names.end()) { - std::stringstream name_stream; - auto py_class_name = py::handle(call.self_) - .attr("__class__") - .attr("__name__"); - name_stream << "nn.Module: " << py::str(py_class_name); - module_names.insert({ call.self_, name_stream.str() }); - } - - module_self_map_.insert({ call.event_index_, call.self_ }); - module_name_map_.insert({ call.event_index_, module_names.at(call.self_) }); + ska::flat_hash_map module_names; + for (const auto& call : PythonTracer::singleton().module_calls_) { + if (module_names.find(call.self_) == module_names.end()) { + std::stringstream name_stream; + auto py_class_name = + py::handle(call.self_).attr("__class__").attr("__name__"); + name_stream << "nn.Module: " << py::str(py_class_name); + module_names.insert({call.self_, name_stream.str()}); } -} + module_self_map_.insert({call.event_index_, call.self_}); + module_name_map_.insert({call.event_index_, module_names.at(call.self_)}); + } +} // TODO: Use re2. void trimPrefix(std::string& s, const std::vector& prefixes) { - for (const auto& p : prefixes) { - if (s.compare(0, p.size(), p) == 0) { - s.erase(0, p.size()); - return; - } + for (const auto& p : prefixes) { + if (s.compare(0, p.size(), p) == 0) { + s.erase(0, p.size()); + return; } + } } - std::vector> PyTraceReplay::replayStack() const { - const auto& tracer = PythonTracer::singleton(); - - // We want to prune paths to a sensible prefix. For example - // `/foo/bar/baz/site-packages/torch/__init__.py` -> `torch/__init__.py` - // Pruning the path prefix is somewhat expensive, so we cache it. - ska::flat_hash_map filename_map; - for (const auto& i : tracer.code_descriptions_) { - if (filename_map.find(i.second.filename_) == filename_map.end()) { - std::string s(i.second.filename_); - trimPrefix(s, tracer.path_prefixes_); - filename_map[i.second.filename_] = s; - } + const auto& tracer = PythonTracer::singleton(); + + // We want to prune paths to a sensible prefix. For example + // `/foo/bar/baz/site-packages/torch/__init__.py` -> `torch/__init__.py` + // Pruning the path prefix is somewhat expensive, so we cache it. 
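One possible shape for the `// TODO: Use re2.` above, sketched with `std::regex` as a stand-in since re2 itself is not pulled in here; it assumes the configured prefixes are valid regex fragments and keeps the strip-first-match behaviour of `trimPrefix`:

    #include <cstddef>
    #include <iostream>
    #include <regex>
    #include <string>
    #include <vector>

    // Joins the prefixes into one anchored alternation and strips the first match.
    std::string trimPrefixRe(const std::string& s, const std::regex& prefix_re) {
      return std::regex_replace(
          s, prefix_re, "", std::regex_constants::format_first_only);
    }

    int main() {
      const std::vector<std::string> prefixes = {
          "/foo/bar/baz/site-packages/", "/usr/lib/python3/dist-packages/"};

      std::string pattern = "^(?:";
      for (std::size_t i = 0; i < prefixes.size(); ++i) {
        pattern += (i ? "|" : "") + prefixes[i];
      }
      pattern += ")";
      const std::regex prefix_re(pattern);

      std::cout << trimPrefixRe("/foo/bar/baz/site-packages/torch/__init__.py", prefix_re)
                << "\n";  // torch/__init__.py
    }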
+ ska::flat_hash_map filename_map; + for (const auto& i : tracer.code_descriptions_) { + if (filename_map.find(i.second.filename_) == filename_map.end()) { + std::string s(i.second.filename_); + trimPrefix(s, tracer.path_prefixes_); + filename_map[i.second.filename_] = s; } - - auto py_name = [&](const RawEvent& e) { - const auto& desc_it = tracer.code_descriptions_.find({e.misc_.f_code_, e.lasti()}); - if (desc_it != tracer.code_descriptions_.end()) { - std::stringstream name_stream; - name_stream << filename_map.at(desc_it->second.filename_) << "(" - << desc_it->second.line_no_ << "): " << desc_it->second.funcname_; - return name_stream.str(); - } - return std::string("Python: ???"); - }; - - size_t id_counter = 0; - std::vector> stacks(tracer.trace_contexts_.size()); - std::vector results; - - // Match calls and returns. - size_t event_idx = 0; - for (auto& raw_event : tracer.events_) { - auto& stack = stacks[raw_event.thread_id_]; - auto ctx = tracer.trace_contexts_[raw_event.thread_id_]; - auto t = static_cast(raw_event.t_) + ctx->initial_us_; - - auto push_frame = [&](std::string name, CallType call_type, size_t module_id = 0) { - stack.push_back(ReplayFrame { - /*event_=*/ std::make_unique(PyTraceEvent{ - /*startTime_=*/ t, - /*endTime_=*/ -1, // Placeholder - /*name_=*/ name, - /*thread_id_=*/ raw_event.thread_id_, - /*parent_=*/ nullptr, // Placeholder - /*call_type_=*/ call_type, - /*module_id_=*/ module_id, - /*call_idx_=*/ event_idx, - /*return_idx_=*/ 0 // Placeholder - }), - /*id_=*/ id_counter++, - /*parent_id_=*/ stack.size() ? stack.back().id_ : 0, - }); - }; - - switch (raw_event.tag()) { - case TraceTag::kPy_Call: - if (module_name_map_.find(event_idx) != module_name_map_.end()) { - push_frame( - module_name_map_.at(event_idx), - CallType::kPyModuleCall, - reinterpret_cast(module_self_map_.at(event_idx))); - } else { - push_frame(py_name(raw_event), CallType::kPyCall); - } - break; - - case TraceTag::kC_Call: - push_frame(tracer.c_function_reprs_.at(raw_event.misc_.arg_), CallType::kCCall); - break; - - case TraceTag::kPy_Return: - case TraceTag::kC_Return: - TORCH_INTERNAL_ASSERT(stack.size(), "Python replay stack is empty.") - stack.back().event_->endTime_ = t; - stack.back().event_->return_idx_ = event_idx; - results.push_back(std::move(stack.back())); - stack.pop_back(); - break; - } - event_idx++; + } + + auto py_name = [&](const RawEvent& e) { + const auto& desc_it = + tracer.code_descriptions_.find({e.misc_.f_code_, e.lasti()}); + if (desc_it != tracer.code_descriptions_.end()) { + std::stringstream name_stream; + name_stream << filename_map.at(desc_it->second.filename_) << "(" + << desc_it->second.line_no_ + << "): " << desc_it->second.funcname_; + return name_stream.str(); } + return std::string("Python: ???"); + }; + + size_t id_counter = 0; + std::vector> stacks(tracer.trace_contexts_.size()); + std::vector results; + + // Match calls and returns. 
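Before the matching loop below, a minimal, self-contained sketch of the same call/return replay idea, reduced to one thread and synthetic events (names here are illustrative and not part of the tracer):

    #include <cstddef>
    #include <iostream>
    #include <string>
    #include <utility>
    #include <vector>

    // Calls push a frame; returns pop it and keep the parent as an id so pointers
    // can be resolved after every frame has been materialized. Ids start at 1 so
    // 0 can serve purely as the "no parent" sentinel.
    enum class Tag { kCall, kReturn };

    struct Frame {
      std::string name;
      std::size_t id;
      std::size_t parent_id;
    };

    int main() {
      const std::vector<std::pair<Tag, std::string>> events = {
          {Tag::kCall, "outer"}, {Tag::kCall, "inner"},
          {Tag::kReturn, ""},    {Tag::kReturn, ""}};

      std::size_t id_counter = 0;
      std::vector<Frame> stack;
      std::vector<Frame> results;

      for (const auto& e : events) {
        if (e.first == Tag::kCall) {
          stack.push_back(
              Frame{e.second, ++id_counter, stack.empty() ? 0 : stack.back().id});
        } else {
          results.push_back(stack.back());
          stack.pop_back();
        }
      }

      for (const auto& f : results) {
        std::cout << f.name << ": id=" << f.id << " parent=" << f.parent_id << "\n";
      }
      // Prints "inner: id=2 parent=1" then "outer: id=1 parent=0".
    }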
+ size_t event_idx = 0; + for (auto& raw_event : tracer.events_) { + auto& stack = stacks[raw_event.thread_id_]; + auto ctx = tracer.trace_contexts_[raw_event.thread_id_]; + auto t = static_cast(raw_event.t_) + ctx->initial_us_; + + auto push_frame = + [&](std::string name, CallType call_type, size_t module_id = 0) { + stack.push_back(ReplayFrame{ + /*event_=*/std::make_unique(PyTraceEvent{ + /*startTime_=*/t, + /*endTime_=*/-1, // Placeholder + /*name_=*/name, + /*thread_id_=*/raw_event.thread_id_, + /*parent_=*/nullptr, // Placeholder + /*call_type_=*/call_type, + /*module_id_=*/module_id, + /*call_idx_=*/event_idx, + /*return_idx_=*/0 // Placeholder + }), + /*id_=*/id_counter++, + /*parent_id_=*/stack.size() ? stack.back().id_ : 0, + }); + }; - // Cleanup by feining return to close out the stack. This is needed so - // frames above the one that called the profiler still appear in the trace. - const auto t_final = now(); - for (auto& stack : stacks) { - while (stack.size()) { - stack.back().event_->endTime_ = t_final; - stack.back().event_->return_idx_ = event_idx; - results.push_back(std::move(stack.back())); - stack.pop_back(); - event_idx++; + switch (raw_event.tag()) { + case TraceTag::kPy_Call: + if (module_name_map_.find(event_idx) != module_name_map_.end()) { + push_frame( + module_name_map_.at(event_idx), + CallType::kPyModuleCall, + reinterpret_cast(module_self_map_.at(event_idx))); + } else { + push_frame(py_name(raw_event), CallType::kPyCall); } + break; + + case TraceTag::kC_Call: + push_frame( + tracer.c_function_reprs_.at(raw_event.misc_.arg_), + CallType::kCCall); + break; + + case TraceTag::kPy_Return: + case TraceTag::kC_Return: + TORCH_INTERNAL_ASSERT(stack.size(), "Python replay stack is empty.") + stack.back().event_->endTime_ = t; + stack.back().event_->return_idx_ = event_idx; + results.push_back(std::move(stack.back())); + stack.pop_back(); + break; } - - // Convert to `PyTraceEvent`, and map id to pointer. - ska::flat_hash_map event_id_map {{0, nullptr}}; - std::vector> out; - for (auto& r : results) { - out.push_back(std::move(r.event_)); - event_id_map.insert({r.id_, out.back().get()}); + event_idx++; + } + + // Cleanup by feining return to close out the stack. This is needed so + // frames above the one that called the profiler still appear in the trace. + const auto t_final = now(); + for (auto& stack : stacks) { + while (stack.size()) { + stack.back().event_->endTime_ = t_final; + stack.back().event_->return_idx_ = event_idx; + results.push_back(std::move(stack.back())); + stack.pop_back(); + event_idx++; } - - // Link parents to children. - for (const auto i : c10::irange(results.size())) { - out[i]->parent_ = event_id_map.at(results[i].parent_id_); - } - return out; + } + + // Convert to `PyTraceEvent`, and map id to pointer. + ska::flat_hash_map event_id_map{{0, nullptr}}; + std::vector> out; + for (auto& r : results) { + out.push_back(std::move(r.event_)); + event_id_map.insert({r.id_, out.back().get()}); + } + + // Link parents to children. 
+ for (const auto i : c10::irange(results.size())) { + out[i]->parent_ = event_id_map.at(results[i].parent_id_); + } + return out; } - // ============================================================================ // == API ===================================================================== // ============================================================================ - int PythonTracer::pyProfileFn( - PyObject* obj, - PyFrameObject* frame, - int what, - PyObject* arg) { - auto ctx = reinterpret_cast(obj); - switch (what) { - case PyTrace_CALL: - PythonTracer::singleton().recordPyCall(ctx, frame); - break; - - case PyTrace_C_CALL: - PythonTracer::singleton().recordCCall(ctx, frame, arg); - break; - - case PyTrace_EXCEPTION: - case PyTrace_RETURN: - PythonTracer::singleton().recordReturn(ctx, frame, TraceTag::kPy_Return); - break; - - case PyTrace_C_EXCEPTION: - case PyTrace_C_RETURN: - PythonTracer::singleton().recordReturn(ctx, frame, TraceTag::kC_Return); - break; - } - return 0; + PyObject* obj, + PyFrameObject* frame, + int what, + PyObject* arg) { + auto ctx = reinterpret_cast(obj); + switch (what) { + case PyTrace_CALL: + PythonTracer::singleton().recordPyCall(ctx, frame); + break; + + case PyTrace_C_CALL: + PythonTracer::singleton().recordCCall(ctx, frame, arg); + break; + + case PyTrace_EXCEPTION: + case PyTrace_RETURN: + PythonTracer::singleton().recordReturn(ctx, frame, TraceTag::kPy_Return); + break; + + case PyTrace_C_EXCEPTION: + case PyTrace_C_RETURN: + PythonTracer::singleton().recordReturn(ctx, frame, TraceTag::kC_Return); + break; + } + return 0; } void PythonTracer::call(Command c) { - switch (c) { - case Command::kStartOne: - PythonTracer::singleton().start(1); - break; - - case Command::kStartAll: - PythonTracer::singleton().start(); - break; - - case Command::kStop: - PythonTracer::singleton().stop(); - break; - - case Command::kClear: - PythonTracer::singleton().clear(); - break; - - default: - break; - } + switch (c) { + case Command::kStartOne: + PythonTracer::singleton().start(1); + break; + + case Command::kStartAll: + PythonTracer::singleton().start(); + break; + + case Command::kStop: + PythonTracer::singleton().stop(); + break; + + case Command::kClear: + PythonTracer::singleton().clear(); + break; + + default: + break; + } }; -} // namespace +} // namespace void init() { - pybind11::gil_scoped_acquire gil; - TORCH_CHECK(PyType_Ready(&TraceContextType) == 0); + pybind11::gil_scoped_acquire gil; + TORCH_CHECK(PyType_Ready(&TraceContextType) == 0); - registerFunctions( - /*call=*/&PythonTracer::call, - /*get_events=*/&PyTraceReplay::getEvents - ); + registerFunctions( + /*call=*/&PythonTracer::call, + /*get_events=*/&PyTraceReplay::getEvents); } - }}}} // namespace torch::autograd::profiler::python_tracer diff --git a/torch/csrc/autograd/python_function.cpp b/torch/csrc/autograd/python_function.cpp index 9a6221130ed0..43911fe18b99 100644 --- a/torch/csrc/autograd/python_function.cpp +++ b/torch/csrc/autograd/python_function.cpp @@ -167,10 +167,16 @@ auto PyNode::is_traceable() -> bool { } auto PyNode::release_variables() -> void { - pybind11::gil_scoped_acquire gil; - auto f = (THPFunction*) obj; - f->saved_variables.clear(); - f->has_freed_buffers = 1; + // This function is called as part of the Node destructor! + // Since this object might be kept alive by C++, it is possible + // that the python interpreter is already dead here. In that case + // we just leak the saved objects. 
+ if (Py_IsInitialized()) { + pybind11::gil_scoped_acquire gil; + auto f = (THPFunction*) obj; + f->saved_variables.clear(); + f->has_freed_buffers = 1; + } } auto PyNode::name() const -> std::string { @@ -564,6 +570,11 @@ static void _trace_post_record( } node->i_(jit::attr::inplace, is_inplace); + if (PyObject* module_name = PyDict_GetItemString(((PyTypeObject*)op_obj)->tp_dict, "__module__")) { + if (auto ptr = PyUnicode_AsUTF8(module_name)) { + node->s_(jit::attr::module, std::string(ptr)); + } + } // Isolate C variable ptrs in a vector int num_outputs = PyTuple_GET_SIZE(output_objects); @@ -671,10 +682,19 @@ PyObject* THPFunction_name(PyObject *self, PyObject* noargs) { PyObject *THPFunction_apply(PyObject *cls, PyObject *inputs) { HANDLE_TH_ERRORS + + // save a local copy of seq_id before it gets incremented + int seq_id = at::sequence_number::peek(); + auto info_pair = unpack_input(inputs); + UnpackedInput& unpacked_input = info_pair.first; + InputFlags& input_info = info_pair.second; + + // Call record function after all the inputs have been decoded, but + // before context has been allocated. RECORD_FUNCTION( ((PyTypeObject*)cls)->tp_name, - std::vector(), - at::sequence_number::peek()); + std::vector(unpacked_input.input_vars.begin(), unpacked_input.input_vars.end()), + seq_id); // Temporary hack to improve functorch UX. We'll find a better solution. const auto& functorch_tls = at::functorch::functorchTLSAccessor(); @@ -691,11 +711,6 @@ PyObject *THPFunction_apply(PyObject *cls, PyObject *inputs) auto cdata = std::shared_ptr(new PyNode(std::move(ctx_obj)), deleteNode); ctx->cdata = cdata; - // Prepare inputs and allocate context (grad fn) - auto info_pair = unpack_input(inputs); - UnpackedInput& unpacked_input = info_pair.first; - InputFlags& input_info = info_pair.second; - // Record input nodes if tracing auto* node = _trace_pre_record(cls, inputs, unpacked_input.input_vars); @@ -705,6 +720,7 @@ PyObject *THPFunction_apply(PyObject *cls, PyObject *inputs) ctx->needs_input_grad = input_info.needs_input_grad.release(); ctx->is_variable_input = std::move(input_info.is_variable_input); + // Prepend ctx to input_tuple, in preparation for static method call auto num_args = PyTuple_GET_SIZE(inputs); THPObjectPtr ctx_input_tuple(PyTuple_New(num_args + 1)); diff --git a/torch/csrc/autograd/python_mode.cpp b/torch/csrc/autograd/python_mode.cpp deleted file mode 100644 index cda38bdb7dff..000000000000 --- a/torch/csrc/autograd/python_mode.cpp +++ /dev/null @@ -1,27 +0,0 @@ -#include -#include -#include -#include -#include - -namespace torch { namespace autograd { - -void PythonMode::enter(PyObject* type) { - if (at::impl::PythonModeTLS::get_state()) { - TORCH_CHECK( - false, - "python mode has already been set. We do not yet support nested python ", - "mode. Please file us an issue and reset it before setting it again.") - } - // TorchDispatchTypeObject steals a reference, See NOTE [What is TorchDispatchTypeObject?] 
- Py_INCREF(type); - auto state = std::make_shared(type, getPyInterpreter()); - at::impl::PythonModeTLS::set_state(state); -} - -void PythonMode::exit() { - TORCH_INTERNAL_ASSERT(at::impl::PythonModeTLS::get_state(), "exiting Python Mode but it wasn't set!"); - at::impl::PythonModeTLS::reset_state(); -} - -}} diff --git a/torch/csrc/autograd/python_mode.h b/torch/csrc/autograd/python_mode.h deleted file mode 100644 index 03da51c1c49e..000000000000 --- a/torch/csrc/autograd/python_mode.h +++ /dev/null @@ -1,17 +0,0 @@ -#pragma once - -#include -#include - -namespace torch { namespace autograd { - -struct TORCH_API PythonMode { - // Enter python mode, causing all operators to dispatch to the type's __torch_dispatch__. - // `type` is the type of a Tensor subclass that has __torch_dispatch__. - static void enter(PyObject* type); - - // Exit the current python mode. - static void exit(); -}; - -}} diff --git a/torch/csrc/autograd/python_torch_functions_manual.cpp b/torch/csrc/autograd/python_torch_functions_manual.cpp index 5af6f3cc640b..e35af63ccf2e 100644 --- a/torch/csrc/autograd/python_torch_functions_manual.cpp +++ b/torch/csrc/autograd/python_torch_functions_manual.cpp @@ -359,8 +359,21 @@ static PyObject * THPVariable_randint(PyObject* self_, PyObject* args, PyObject* static PyObject * THPVariable_as_tensor(PyObject* self, PyObject* args, PyObject* kwargs) { HANDLE_TH_ERRORS + static PythonArgParser parser({ + "as_tensor(PyObject* data, *, ScalarType dtype=None, Device? device=None)", + }); + + ParsedArgs<3> parsed_args; + auto r = parser.parse(args, kwargs, parsed_args); + if (r.has_torch_function()) { + return handle_torch_function( + r, nullptr, args, kwargs, THPVariableFunctionsModule, "torch"); + } jit::tracer::warn("torch.as_tensor", jit::tracer::WARN_CONSTRUCTOR); - return THPVariable_Wrap(torch::utils::as_tensor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs)); + return THPVariable_Wrap(torch::utils::as_tensor( + torch::tensors::get_default_dispatch_key(), + torch::tensors::get_default_scalar_type(), + r)); END_HANDLE_TH_ERRORS } @@ -394,35 +407,90 @@ static std::vector dispatch_nonzero_numpy(const Tensor & self) { static PyObject * THPVariable_nonzero(PyObject* self, PyObject* args, PyObject* kwargs); -static PyObject * THPVariable_sparse_csr_tensor(PyObject* self, PyObject* args, PyObject* kwargs) -{ - HANDLE_TH_ERRORS - jit::tracer::warn("torch.sparse_csr_tensor", jit::tracer::WARN_CONSTRUCTOR); - return THPVariable_Wrap(torch::utils::sparse_csr_tensor_ctor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs)); - END_HANDLE_TH_ERRORS -} - -static PyObject * THPVariable__sparse_csr_tensor_unsafe(PyObject* self, PyObject* args, PyObject* kwargs) -{ - HANDLE_TH_ERRORS - jit::tracer::warn("torch._sparse_csr_tensor_unsafe", jit::tracer::WARN_CONSTRUCTOR); - return THPVariable_Wrap(torch::utils::_sparse_csr_tensor_unsafe_ctor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs)); - END_HANDLE_TH_ERRORS -} +#define THPVARIABLE_SPARSE_COMPRESSED_CTOR(NAME, NARGS, SIGNATURES) \ +static PyObject * THPVariable_ ## NAME(PyObject* self, PyObject* args, PyObject* kwargs) \ +{ \ + HANDLE_TH_ERRORS \ + static PythonArgParser parser SIGNATURES ; \ + ParsedArgs parsed_args; \ + auto r = parser.parse(args, kwargs, parsed_args); \ + if (r.has_torch_function()) { \ + return handle_torch_function(r, nullptr, args, kwargs, THPVariableFunctionsModule, 
"torch"); \ + } \ + jit::tracer::warn("torch." # NAME, jit::tracer::WARN_CONSTRUCTOR); \ + return THPVariable_Wrap(torch::utils::NAME ## _ctor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), r)); \ + END_HANDLE_TH_ERRORS \ +} + +THPVARIABLE_SPARSE_COMPRESSED_CTOR(sparse_compressed_tensor, 9, + ({"sparse_compressed_tensor(PyObject* compressed_indices, PyObject* plain_indices, PyObject* values, IntArrayRef size, *, ScalarType dtype=None, Layout? layout=None, Device? device=None, bool pin_memory=False, bool requires_grad=False)", + "sparse_compressed_tensor(PyObject* compressed_indices, PyObject* plain_indices, PyObject* values, *, ScalarType dtype=None, Layout? layout=None, Device? device=None, bool pin_memory=False, bool requires_grad=False)"})) +THPVARIABLE_SPARSE_COMPRESSED_CTOR(sparse_csr_tensor, 9, + ({"sparse_csr_tensor(PyObject* crow_indices, PyObject* col_indices, PyObject* values, IntArrayRef size, *, ScalarType dtype=None, Layout? layout=None, Device? device=None, bool pin_memory=False, bool requires_grad=False)", + "sparse_csr_tensor(PyObject* crow_indices, PyObject* col_indices, PyObject* values, *, ScalarType dtype=None, Layout? layout=None, Device? device=None, bool pin_memory=False, bool requires_grad=False)"})) +THPVARIABLE_SPARSE_COMPRESSED_CTOR(sparse_csc_tensor, 9, + ({"sparse_csc_tensor(PyObject* ccol_indices, PyObject* row_indices, PyObject* values, IntArrayRef size, *, ScalarType dtype=None, Layout? layout=None, Device? device=None, bool pin_memory=False, bool requires_grad=False)", + "sparse_csc_tensor(PyObject* ccol_indices, PyObject* row_indices, PyObject* values, *, ScalarType dtype=None, Layout? layout=None, Device? device=None, bool pin_memory=False, bool requires_grad=False)"})) +THPVARIABLE_SPARSE_COMPRESSED_CTOR(sparse_bsr_tensor, 9, + ({"sparse_bsr_tensor(PyObject* crow_indices, PyObject* col_indices, PyObject* values, IntArrayRef size, *, ScalarType dtype=None, Layout? layout=None, Device? device=None, bool pin_memory=False, bool requires_grad=False)", + "sparse_bsr_tensor(PyObject* crow_indices, PyObject* col_indices, PyObject* values, *, ScalarType dtype=None, Layout? layout=None, Device? device=None, bool pin_memory=False, bool requires_grad=False)"})) +THPVARIABLE_SPARSE_COMPRESSED_CTOR(sparse_bsc_tensor, 9, + ({"sparse_bsc_tensor(PyObject* ccol_indices, PyObject* row_indices, PyObject* values, IntArrayRef size, *, ScalarType dtype=None, Layout? layout=None, Device? device=None, bool pin_memory=False, bool requires_grad=False)", + "sparse_bsc_tensor(PyObject* ccol_indices, PyObject* row_indices, PyObject* values, *, ScalarType dtype=None, Layout? layout=None, Device? device=None, bool pin_memory=False, bool requires_grad=False)"})) + + +THPVARIABLE_SPARSE_COMPRESSED_CTOR(_sparse_compressed_tensor_unsafe, 8, + ({"_sparse_compressed_tensor_unsafe(PyObject* compressed_indices, PyObject* plain_indices, PyObject* values, IntArrayRef size, *, ScalarType dtype=None, Layout? layout=None, Device? device=None, bool requires_grad=False)"})) +THPVARIABLE_SPARSE_COMPRESSED_CTOR(_sparse_csr_tensor_unsafe, 7, + ({"_sparse_csr_tensor_unsafe(PyObject* crow_indices, PyObject* col_indices, PyObject* values, IntArrayRef size, *, ScalarType dtype=None, Device? device=None, bool requires_grad=False)"})) +THPVARIABLE_SPARSE_COMPRESSED_CTOR(_sparse_csc_tensor_unsafe, 7, + ({"_sparse_csc_tensor_unsafe(PyObject* ccol_indices, PyObject* row_indices, PyObject* values, IntArrayRef size, *, ScalarType dtype=None, Device? 
device=None, bool requires_grad=False)"})) +THPVARIABLE_SPARSE_COMPRESSED_CTOR(_sparse_bsr_tensor_unsafe, 7, + ({"_sparse_bsr_tensor_unsafe(PyObject* crow_indices, PyObject* col_indices, PyObject* values, IntArrayRef size, *, ScalarType dtype=None, Device? device=None, bool requires_grad=False)"})) +THPVARIABLE_SPARSE_COMPRESSED_CTOR(_sparse_bsc_tensor_unsafe, 7, + ({"_sparse_bsc_tensor_unsafe(PyObject* ccol_indices, PyObject* row_indices, PyObject* values, IntArrayRef size, *, ScalarType dtype=None, Device? device=None, bool requires_grad=False)"})) static PyObject * THPVariable_sparse_coo_tensor(PyObject* self, PyObject* args, PyObject* kwargs) { HANDLE_TH_ERRORS + static PythonArgParser parser({ + "sparse_coo_tensor(PyObject* indices, PyObject* values, *, ScalarType dtype=None, Device? device=None, bool requires_grad=False)", + "sparse_coo_tensor(PyObject* indices, PyObject* values, IntArrayRef size, *, ScalarType dtype=None, Device? device=None, bool requires_grad=False)", + "sparse_coo_tensor(IntArrayRef size, *, ScalarType dtype=None, Device? device=None, bool requires_grad=False)", + }); + + ParsedArgs<6> parsed_args; + auto r = parser.parse(args, kwargs, parsed_args); + if (r.has_torch_function()) { + return handle_torch_function( + r, nullptr, args, kwargs, THPVariableFunctionsModule, "torch"); + } jit::tracer::warn("torch.sparse_coo_tensor", jit::tracer::WARN_CONSTRUCTOR); - return THPVariable_Wrap(torch::utils::sparse_coo_tensor_ctor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs)); + return THPVariable_Wrap(torch::utils::sparse_coo_tensor_ctor( + torch::tensors::get_default_dispatch_key(), + torch::tensors::get_default_scalar_type(), + r)); END_HANDLE_TH_ERRORS } static PyObject * THPVariable__sparse_coo_tensor_unsafe(PyObject* self, PyObject* args, PyObject* kwargs) { HANDLE_TH_ERRORS + static PythonArgParser parser({ + "_sparse_coo_tensor_unsafe(PyObject* indices, PyObject* values, IntArrayRef size, *, ScalarType dtype=None, Device? device=None, bool requires_grad=False)", + }); + + ParsedArgs<6> parsed_args; + auto r = parser.parse(args, kwargs, parsed_args); + if (r.has_torch_function()) { + return handle_torch_function( + r, nullptr, args, kwargs, THPVariableFunctionsModule, "torch"); + } jit::tracer::warn("torch._sparse_coo_tensor_unsafe", jit::tracer::WARN_CONSTRUCTOR); - return THPVariable_Wrap(torch::utils::_sparse_coo_tensor_unsafe_ctor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs)); + return THPVariable_Wrap(torch::utils::_sparse_coo_tensor_unsafe_ctor( + torch::tensors::get_default_dispatch_key(), + torch::tensors::get_default_scalar_type(), + r)); END_HANDLE_TH_ERRORS } @@ -431,8 +499,22 @@ static PyObject * THPVariable__sparse_coo_tensor_unsafe(PyObject* self, PyObject static PyObject * THPVariable_tensor(PyObject* self, PyObject* args, PyObject* kwargs) { HANDLE_TH_ERRORS + static PythonArgParser parser({ + "tensor(PyObject* data, *, ScalarType dtype=None, Device? device=None, bool pin_memory=False, bool requires_grad=False, DimnameList? 
names=None)", + }); + + constexpr int ctor_num_args = 6; + ParsedArgs parsed_args; + auto r = parser.parse(args, kwargs, parsed_args); + if (r.has_torch_function()) { + return handle_torch_function( + r, nullptr, args, kwargs, THPVariableFunctionsModule, "torch"); + } jit::tracer::warn("torch.tensor", jit::tracer::WARN_CONSTRUCTOR); - return THPVariable_Wrap(torch::utils::tensor_ctor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs)); + return THPVariable_Wrap(torch::utils::tensor_ctor( + torch::tensors::get_default_dispatch_key(), + torch::tensors::get_default_scalar_type(), + r)); END_HANDLE_TH_ERRORS } @@ -661,7 +743,19 @@ static PyObject * THPVariable__sync(PyObject *self, PyObject* args, PyObject* kw static PyObject * THPVariable__enable_functionalization(PyObject *self, PyObject* args, PyObject* kwargs) { HANDLE_TH_ERRORS + static PythonArgParser parser({"_enable_functionalization(*, bool reapply_views=False)"}, /*traceable=*/true); + ParsedArgs<1> parsed_args; + auto r = parser.parse(args, kwargs, parsed_args); + const auto reapply_views = r.toBool(0); + + if (c10::impl::tls_is_dispatch_key_included(at::DispatchKey::Functionalize)) { + TORCH_INTERNAL_ASSERT(false, "multiple layers of mode-style functionalization nesting is not" + " currently supported, outside of the functionalize() transform"); + } c10::impl::tls_set_dispatch_key_included(at::DispatchKey::Functionalize, true); + if (reapply_views) { + at::functionalization::impl::setFunctionalizationReapplyViewsTLS(true); + } Py_RETURN_NONE; END_HANDLE_TH_ERRORS } @@ -670,6 +764,7 @@ static PyObject * THPVariable__disable_functionalization(PyObject *self, PyObjec { HANDLE_TH_ERRORS c10::impl::tls_set_dispatch_key_included(at::DispatchKey::Functionalize, false); + at::functionalization::impl::setFunctionalizationReapplyViewsTLS(false); Py_RETURN_NONE; END_HANDLE_TH_ERRORS } @@ -701,8 +796,16 @@ static PyMethodDef torch_functions_manual[] = { {"range", castPyCFunctionWithKeywords(THPVariable_range), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, {"sparse_coo_tensor", castPyCFunctionWithKeywords(THPVariable_sparse_coo_tensor), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, {"_sparse_coo_tensor_unsafe", castPyCFunctionWithKeywords(THPVariable__sparse_coo_tensor_unsafe), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, + {"_sparse_compressed_tensor_unsafe", castPyCFunctionWithKeywords(THPVariable__sparse_compressed_tensor_unsafe), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, + {"sparse_compressed_tensor", castPyCFunctionWithKeywords(THPVariable_sparse_compressed_tensor), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, {"sparse_csr_tensor", castPyCFunctionWithKeywords(THPVariable_sparse_csr_tensor), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, + {"sparse_csc_tensor", castPyCFunctionWithKeywords(THPVariable_sparse_csc_tensor), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, + {"sparse_bsr_tensor", castPyCFunctionWithKeywords(THPVariable_sparse_bsr_tensor), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, + {"sparse_bsc_tensor", castPyCFunctionWithKeywords(THPVariable_sparse_bsc_tensor), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, {"_sparse_csr_tensor_unsafe", castPyCFunctionWithKeywords(THPVariable__sparse_csr_tensor_unsafe), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, + {"_sparse_csc_tensor_unsafe", castPyCFunctionWithKeywords(THPVariable__sparse_csc_tensor_unsafe), METH_VARARGS | METH_KEYWORDS | METH_STATIC, 
nullptr}, + {"_sparse_bsr_tensor_unsafe", castPyCFunctionWithKeywords(THPVariable__sparse_bsr_tensor_unsafe), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, + {"_sparse_bsc_tensor_unsafe", castPyCFunctionWithKeywords(THPVariable__sparse_bsc_tensor_unsafe), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, {"tensor", castPyCFunctionWithKeywords(THPVariable_tensor), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, {"get_device", castPyCFunctionWithKeywords(THPVariable_get_device), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, {"numel", castPyCFunctionWithKeywords(THPVariable_numel), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp index eca3fce4a1da..cf3e263bb365 100644 --- a/torch/csrc/autograd/python_variable.cpp +++ b/torch/csrc/autograd/python_variable.cpp @@ -1,44 +1,43 @@ -#include - -#include +#include +#include +#include +#include +#include +#include +#include #include #include -#include #include +#include #include #include #include +#include +#include +#include #include #include #include -#include -#include -#include -#include #include #include +#include +#include #include -#include #include -#include #include -#include #include +#include #include -#include -#include -#include -#include -#include - +#include +#include +#include +#include #include #include -#include #include -#include #include #include @@ -53,6 +52,116 @@ using namespace at; using namespace torch; using namespace torch::autograd; +std::pair parseIValuesToPyArgsKwargs(const c10::OperatorHandle& op, const std::vector& arguments) { + TORCH_CHECK(PyGILState_Check(), "GIL must be held before you call parseIValuesToPyArgsKwargs"); + const auto& schema = op.schema(); + py::dict kwargs; + // About all the pointers: + // + // f(int x, int y = 0, *, int z = 0) + // ^- arguments.size() + // ^- kwarg_only_start + // ^- positional_default_start + // ^- 0 + + // Find the split point between kwarg-only and regular. Since most functions + // don't have kwarg-only arguments, it is more efficient to scan from the + // right (but ideally, this would just be precomputed in FunctionSchema + // itself). 
(NB: minus one in the loop is because we're testing if the + // *next* argument is kwarg-only before we advance the starting index) + int64_t kwarg_only_start = arguments.size(); + for (; kwarg_only_start > 0; kwarg_only_start--) { + const auto& arg = schema.arguments()[kwarg_only_start - 1]; + if (!arg.kwarg_only()) { + break; + } + } + + // Find the first positional argument that isn't defaulted + auto is_default = [&](int64_t idx) -> bool { + const auto& arg = schema.arguments()[idx]; + if (!arg.default_value().has_value()) { + return false; + } + const auto& default_ivalue = *arg.default_value(); + const auto& ivalue = arguments[idx]; + if (default_ivalue != ivalue) { + return false; + } + return true; + }; + + int64_t positional_default_start = kwarg_only_start; + for (; positional_default_start > 0; positional_default_start--) { + if (!is_default(positional_default_start - 1)) { + break; + } + } + + auto args = py::reinterpret_steal(PyTuple_New(positional_default_start)); + + auto schemaAwareToPyObject = [&](int64_t idx) -> py::object { + const auto& arg = schema.arguments()[idx]; + auto match = [&](c10::TypeKind kind) { + const auto& t = arg.real_type(); + if (t->kind() == kind) return true; + if (auto opt_t = t->cast()) { + if (opt_t->getElementType()->kind() == kind) return true; + } + return false; + }; + if (arguments[idx].isNone()) { + return py::none(); + } else if (match(c10::ScalarTypeType::Kind)) { + auto* obj = getTHPDtype(static_cast(arguments[idx].toInt())); + return py::reinterpret_borrow(reinterpret_cast(obj)); + } else if (match(c10::LayoutType::Kind)) { + auto* obj = getTHPLayout(static_cast(arguments[idx].toInt())); + return py::reinterpret_borrow(reinterpret_cast(obj)); + } else if (match(c10::MemoryFormatType::Kind)) { + return torch::utils::getTHPMemoryFormat(static_cast(arguments[idx].toInt())); + } else { + return torch::jit::toPyObject(arguments[idx]); + } + }; + + // Populate positional arguments + for (const auto idx : c10::irange(positional_default_start)) { + PyTuple_SET_ITEM(args.ptr(), idx, schemaAwareToPyObject(idx).release().ptr()); + } + + // Populate keyword arguments + for (const auto idx : c10::irange(kwarg_only_start, arguments.size())) { + // But don't populate default keyword arguments + if (is_default(idx)) continue; + const auto& arg = schema.arguments()[idx]; + kwargs[py::cast(arg.name())] = schemaAwareToPyObject(idx); + } + return std::make_pair(std::move(args), std::move(kwargs)); +} + +void pushPyOutToStack( + const c10::OperatorHandle& op, + torch::jit::Stack* stack, + py::object out, + const char* msg) { + TORCH_CHECK(PyGILState_Check(), "GIL must be held before you call pushPyOutToStack"); + auto schema_returns = op.schema().returns(); + const auto num_returns = schema_returns.size(); + if (num_returns == 0) { + // Check that we got a None return from Python. Anything else is an error. 
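+  // (Concretely: a schema with zero declared returns requires a Python None;
+  // exactly one declared return is converted directly to an IValue of that
+  // return's type; multiple declared returns are unpacked element-by-element
+  // from the returned Python sequence against the schema's return types.)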
+ TORCH_CHECK(out.is(py::none()), "Expected ", msg, " for ", op.operator_name(), + " to return None but it returned something else instead."); + } else if (num_returns == 1) { + torch::jit::push(stack, torch::jit::toIValue(out.ptr(), schema_returns[0].type())); + } else { + auto outs = py::cast(out); + for (const auto idx : c10::irange(outs.size())) { + torch::jit::push(stack, torch::jit::toIValue(outs[idx].ptr(), schema_returns[idx].type())); + } + } +} + namespace { std::string concrete_name_fn(const c10::impl::PyInterpreter* self) { @@ -104,7 +213,8 @@ void concrete_dispatch_fn( const c10::impl::PyInterpreter*, const c10::OperatorHandle& op, torch::jit::Stack* stack, - const std::shared_ptr& type); + const std::shared_ptr& type); +bool concrete_is_contiguous_fn(const c10::impl::PyInterpreter*, const c10::TensorImpl* self); class PyInterpreterHolder { public: @@ -113,7 +223,8 @@ class PyInterpreterHolder { &concrete_name_fn, &concrete_decref_fn, &concrete_detach_fn, - &concrete_dispatch_fn)) {} + &concrete_dispatch_fn, + &concrete_is_contiguous_fn)) {} // NB: intentionally leaks the memory ~PyInterpreterHolder() { impl_->disarm(); @@ -133,8 +244,6 @@ c10::impl::PyInterpreter* getPyInterpreter() { return self_interpreter.get(); } -namespace py = pybind11; - PyObject *THPVariableClass = nullptr; PyObject *ParameterClass = nullptr; @@ -151,10 +260,12 @@ static const char* VOLATILE_WARNING = static bool check_has_torch_dispatch(PyObject *obj) { PyTypeObject *tp = Py_TYPE(obj); + py::object attr = PyObject_FastGetAttrString(obj, "__torch_dispatch__"); return ( !THPVariable_CheckTypeExact(tp) && // TODO: test if Python key is disabled - PyObject_FastGetAttrString(obj, "__torch_dispatch__").ptr() != nullptr + attr.ptr() != nullptr && + attr.ptr() != torch::disabled_torch_dispatch_impl() ); } @@ -236,7 +347,106 @@ PyObject * THPVariable_Wrap(at::TensorBase var) (PyTypeObject*)THPVariableClass, std::move(var), status); } +bool isResurrectable(THPVariable* self) { + // We want to divide this check into 2 cases. + + // 1. C++ owns PyObject (in this case, self->cdata.unsafeIsBorrowed() is + // true). You might think that in this case, it is impossible for tp_clear to + // be called: surely the C++ reference to the PyObject is keeping it live? And + // you'd be right! In fact, when C++ owns the PyObject, we have an invariant + // that the refcount on the PyObject should be precisely one (because if you + // take out another reference to the PyObject, we're supposed to flip the + // ownership pointer back). In reality, you can violate this invariant + // temporarily with weak references, so we don't test for it in asserts. + + // 2. PyObject owns C++ (in this case, self->cdata.unsafeIsBorrowed() is + // false). In this case, tp_clear can get called if the PyObject is referenced + // from a dead cycle, and nowhere else. But if resurrection did not occur, + // then the reference to C++ from the PyObject must be the ONLY reference to + // the C++ object. + if (self->cdata.unsafeIsBorrowed()) { + return false; + } + auto const& tensor = THPVariable_Unpack(self); + if (!tensor.defined() || tensor.use_count() <= 1) { + return false; + } + return true; +} + +// returns true if successfully rezzed; if so, cancel the +// rest of deallocation +static bool THPVariable_tryResurrect(THPVariable* self) { + const auto& tensor = THPVariable_Unpack(self); + + if (!isResurrectable(self)) { + return false; + } + + // At this point, we are definitely going to resurrect the tensor. 
So, the + // tensor better be defined :) + TORCH_INTERNAL_ASSERT(tensor.defined()); + + // There are other C++ owners of the tensor. Flip ownership + // so that C++ owns this Python object, and cancel deallocation. + TORCH_INTERNAL_ASSERT(!tensor.unsafeGetTensorImpl()->owns_pyobj()); + + tensor.unsafeGetTensorImpl()->set_owns_pyobj(true); + +// Resurrect the Python object. This is something CPython does +// internally occasionally, see +// https://github.com/python/cpython/blob/b98eba5bc2ffbe7a0ed49d540ebc4f756ae61985/Objects/object.c#L248-L259 +// so we just copy the pattern here. Note that we don't have to worry +// about saving and restoring the refcount (as the quoted code does) +// because we actually DO need to reset the refcount to one here, we +// can't assume that some other code has taken care of it. +// NB: this will overreport _Py_RefTotal but based on inspection of object.c +// there is no way to avoid this +#ifdef Py_TRACE_REFS + _Py_AddToAllObjects(reinterpret_cast(self), 1); +#endif + Py_INCREF(self); + + // Flip THPVariable to be non-owning + // (near use-after-free miss here: fresh MaybeOwned is created breaking + // reference on Tensor in struct BEFORE we overwrite the old one) + self->cdata = MaybeOwned::borrowed(tensor); + + // NB: At this point, tensor *could* be dead (e.g., some other C++ thread + // decrefed it.) At this point, it is probably waiting on the GIL to + // deallocate the Python object and will kill self, BUT NOT YET. + + return true; +} + + static int THPVariable_clear(THPVariable* self) { + // Is it OK for an object to still be live after running + // tp_clear? Yes. When Python is breaking reference cycles, it can't assume + // that an object will dealloc after it's cleared. The source code explicitly + // handles this case: + // https://github.com/python/cpython/blob/4e661cd69164318c1f871faa476c68a04092ddc4/Modules/gcmodule.c#L1010-L1025 + + // Note that we don't need to actually resurrect here. There are 2 cases: + // 1. The PyObject is not part of a reference cycle. In this case, we don't + // need to do anything. The GC will move on to try and break the reference + // cycle on another object, which will eventually trigger tp_dealloc (and thus + // resurrection). + + // 2. The PyObject is part of a reference cycle. This case should not actually + // be possible, due to the logic in our tp_traverse (THPVariable_subclass_traverse). + + // In fact, resurrecting here breaks the invariant that "C++ owns Python only + // when PyObject's refcount would otherwise be 0". Most immediately, as we're + // merely breaking reference cycles here, there can be other references to the + // PyObject. *However*, if other objects in the refcycle resurrect, then we + // will be in a state where the PyObject has multiple Python references, yet + // C++ owns the PyObject. + + // See https://github.com/pytorch/pytorch/pull/75933 for more discussion. + if (isResurrectable((THPVariable*)self)) { + return 0; + } Py_CLEAR(self->backward_hooks); const auto& tensor = THPVariable_Unpack(self); if (tensor.defined()) { @@ -289,54 +499,11 @@ static int THPVariable_clear(THPVariable* self) { } } } + TORCH_INTERNAL_ASSERT(!isResurrectable((THPVariable*)self)); self->cdata = MaybeOwned(); return 0; } -// returns true if successfully rezzed; if so, cancel the -// rest of deallocation -static bool THPVariable_tryResurrect(THPVariable* self) { - const auto& tensor = THPVariable_Unpack(self); - - // Is this true or not??? 
Triggered by TestAutograd.test_variable_traverse - // TORCH_INTERNAL_ASSERT(tensor.defined()); - - // Check if there are other C++ owners - if (tensor.use_count() <= 1) { - return false; - } - - // There are other C++ owners of the tensor. Flip ownership - // so that C++ owns this Python object, and cancel deallocation. - TORCH_INTERNAL_ASSERT(!tensor.unsafeGetTensorImpl()->owns_pyobj()); - - tensor.unsafeGetTensorImpl()->set_owns_pyobj(true); - -// Resurrect the Python object. This is something CPython does -// internally occasionally, see -// https://github.com/python/cpython/blob/b98eba5bc2ffbe7a0ed49d540ebc4f756ae61985/Objects/object.c#L248-L259 -// so we just copy the pattern here. Note that we don't have to worry -// about saving and restoring the refcount (as the quoted code does) -// because we actually DO need to reset the refcount to one here, we -// can't assume that some other code has taken care of it. -// NB: this will overreport _Py_RefTotal but based on inspection of object.c -// there is no way to avoid this -#ifdef Py_TRACE_REFS - _Py_AddToAllObjects(reinterpret_cast(self), 1); -#endif - Py_INCREF(self); - - // Flip THPVariable to be non-owning - // (near use-after-free miss here: fresh MaybeOwned is created breaking - // reference on Tensor in struct BEFORE we overwrite the old one) - self->cdata = MaybeOwned::borrowed(tensor); - - // NB: At this point, tensor *could* be dead (e.g., some other C++ thread - // decrefed it.) At this point, it is probably waiting on the GIL to - // deallocate the Python object and will kill self, BUT NOT YET. - - return true; -} PyObject *THPVariable_pynew(PyTypeObject *type, PyObject *args, PyObject *kwargs); @@ -369,9 +536,9 @@ static PyObject* THPVariable_as_subclass(PyObject* _self, PyObject* args, PyObje static PyObject* THPVariable_make_subclass(PyObject* _ignored, PyObject* args, PyObject* kwargs) { HANDLE_TH_ERRORS static PythonArgParser parser({ - "_make_subclass(PyObject* cls, Tensor data, bool require_grad=False)", + "_make_subclass(PyObject* cls, Tensor data, bool require_grad=False, *, bool dispatch_strides=False)", }); - ParsedArgs<3> parsed_args{}; + ParsedArgs<4> parsed_args{}; auto r = parser.parse(args, kwargs, parsed_args); PyObject* cls = r.pyobject(0); if (!PyType_Check(cls)) { @@ -390,6 +557,9 @@ static PyObject* THPVariable_make_subclass(PyObject* _ignored, PyObject* args, P // ``` data.unsafeGetTensorImpl()->set_allow_tensor_metadata_change(true); data.set_requires_grad(r.toBool(2)); + if (r.toBool(3)) { + data.unsafeGetTensorImpl()->set_sizes_strides_policy(c10::TensorImpl::SizesStridesPolicy::CustomStrides); + } return THPVariable_NewWithVar( (PyTypeObject*)cls, std::move(data), @@ -402,9 +572,9 @@ static PyObject* THPVariable_make_wrapper_subclass(PyObject*, PyObject* args, Py // NB: pin_memory doesn't actually do anything // TODO: strides variant? static PythonArgParser parser({ - "_make_wrapper_subclass(PyObject* cls, IntArrayRef size, *, IntArrayRef? strides=None, int64_t? storage_offset=None, MemoryFormat? memory_format=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool pin_memory=False, bool requires_grad=False)", + "_make_wrapper_subclass(PyObject* cls, IntArrayRef size, *, IntArrayRef? strides=None, int64_t? storage_offset=None, MemoryFormat? 
memory_format=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool pin_memory=False, bool requires_grad=False, bool dispatch_strides=False)", }); - ParsedArgs<10> parsed_args{}; + ParsedArgs<11> parsed_args{}; auto r = parser.parse(args, kwargs, parsed_args); PyObject* cls = r.pyobject(0); @@ -414,7 +584,12 @@ static PyObject* THPVariable_make_wrapper_subclass(PyObject*, PyObject* args, Py // to continue on to the underlying CPU/CUDA kernel advertised by the dispatch // key, which will immediately segfault because the data pointer is null. By // forcing users to define __torch_dispatch__ we ensure this does not happen - TORCH_CHECK_TYPE(PyObject_FastGetAttrString(cls, "__torch_dispatch__").ptr() != nullptr, + // TODO: This check is not complete; because the user can disable torch + // dispatch and then go again, triggering segfault. TBH I'm thinking I want + // to delete this function entirely + py::object attr = PyObject_FastGetAttrString(cls, "__torch_dispatch__"); + TORCH_CHECK_TYPE(attr.ptr() != nullptr && attr.ptr() != torch::disabled_torch_dispatch_impl() +, ((PyTypeObject*)cls)->tp_name, " must define __torch_dispatch__"); const auto options = TensorOptions() @@ -439,6 +614,10 @@ static PyObject* THPVariable_make_wrapper_subclass(PyObject*, PyObject* args, Py .make_tensor(); data.set_requires_grad(r.toBool(9)); + if (r.toBool(10)) { + data.unsafeGetTensorImpl()->set_sizes_strides_policy(c10::TensorImpl::SizesStridesPolicy::CustomStrides); + } + return THPVariable_NewWithVar( (PyTypeObject*)cls, std::move(data), @@ -894,6 +1073,16 @@ PyObject *THPVariable_is_cuda(THPVariable *self, void *unused) END_HANDLE_TH_ERRORS } +PyObject* THPVariable_is_ipu(THPVariable* self, void* unused) { + HANDLE_TH_ERRORS + if (check_has_torch_function((PyObject*)self)) { + return handle_torch_function_getter(self, "is_ipu"); + } + auto& self_ = THPVariable_Unpack(self); + return torch::autograd::utils::wrap(self_.is_ipu()); + END_HANDLE_TH_ERRORS +} + PyObject* THPVariable_is_xpu(THPVariable* self, void* unused) { HANDLE_TH_ERRORS if (check_has_torch_function((PyObject*)self)) { @@ -937,14 +1126,14 @@ PyObject *THPVariable_is_mkldnn(THPVariable *self, void *unused) END_HANDLE_TH_ERRORS } -PyObject *THPVariable_is_mlc(THPVariable *self, void *unused) +PyObject *THPVariable_is_mps(THPVariable *self, void *unused) { HANDLE_TH_ERRORS if (check_has_torch_function((PyObject *)self)) { - return handle_torch_function_getter(self, "is_mlc"); + return handle_torch_function_getter(self, "is_mps"); } auto& self_ = THPVariable_Unpack(self); - return torch::autograd::utils::wrap(self_.is_mlc()); + return torch::autograd::utils::wrap(self_.is_mps()); END_HANDLE_TH_ERRORS } @@ -1003,6 +1192,17 @@ PyObject *THPVariable_is_complex(THPVariable *self, void *unused) END_HANDLE_TH_ERRORS } +PyObject *THPVariable_is_nested(THPVariable *self, void *unused) +{ + HANDLE_TH_ERRORS + if (check_has_torch_function((PyObject *)self)) { + return handle_torch_function_getter(self, "is_nested"); + } + auto& self_ = THPVariable_Unpack(self); + return torch::autograd::utils::wrap(self_.is_nested()); + END_HANDLE_TH_ERRORS +} + static PyObject *THPVariable_dtype(THPVariable *self, void *unused) { HANDLE_TH_ERRORS @@ -1057,28 +1257,28 @@ PyObject *THPVariable_get_imag(THPVariable* self, void *unused) END_HANDLE_TH_ERRORS } -int THPVariable_set_real(THPVariable *self, THPVariable *real, void *unused) +int THPVariable_set_real(PyObject* self, PyObject* real, void *unused) { HANDLE_TH_ERRORS auto& self_ = 
THPVariable_Unpack(self); - auto& real_ = THPVariable_Unpack(real); + auto self_real = at::real(self_); + auto real_ = valueToTensor(self_real.options(), real, self_real.device()); { pybind11::gil_scoped_release no_gil; - auto self_real = at::real(self_); self_real.copy_(real_); return 0; } END_HANDLE_TH_ERRORS_RET(-1) } -int THPVariable_set_imag(THPVariable* self, THPVariable *imag, void *unused) +int THPVariable_set_imag(PyObject* self, PyObject* imag, void *unused) { HANDLE_TH_ERRORS auto& self_ = THPVariable_Unpack(self); - auto& imag_ = THPVariable_Unpack(imag); + auto self_imag = at::imag(self_); + auto imag_ = valueToTensor(self_imag.options(), imag, self_imag.device()); { pybind11::gil_scoped_release no_gil; - auto self_imag = at::imag(self_); self_imag.copy_(imag_); return 0; } @@ -1112,15 +1312,17 @@ static struct PyGetSetDef THPVariable_properties[] = { {"shape", (getter)THPVariable_get_shape, nullptr, nullptr, nullptr}, {"is_cuda", (getter)THPVariable_is_cuda, nullptr, nullptr, nullptr}, {"is_xpu", (getter)THPVariable_is_xpu, nullptr, nullptr, nullptr}, + {"is_ipu", (getter)THPVariable_is_ipu, nullptr, nullptr, nullptr}, {"is_sparse", (getter)THPVariable_is_sparse, nullptr, nullptr, nullptr}, {"is_sparse_csr", (getter)THPVariable_is_sparse_csr, nullptr, nullptr, nullptr}, {"is_mkldnn", (getter)THPVariable_is_mkldnn, nullptr, nullptr, nullptr}, - {"is_mlc", (getter)THPVariable_is_mlc, nullptr, nullptr, nullptr}, + {"is_mps", (getter)THPVariable_is_mps, nullptr, nullptr, nullptr}, {"is_ort", (getter)THPVariable_is_ort, nullptr, nullptr, nullptr}, {"is_vulkan", (getter)THPVariable_is_vulkan, nullptr, nullptr, nullptr}, {"is_complex", (getter)THPVariable_is_complex, nullptr, nullptr, nullptr}, {"is_quantized", (getter)THPVariable_is_quantized, nullptr, nullptr, nullptr}, {"is_meta", (getter)THPVariable_is_meta, nullptr, nullptr, nullptr}, + {"is_nested", (getter)THPVariable_is_nested, nullptr, nullptr, nullptr}, {"dtype", (getter)THPVariable_dtype, nullptr, nullptr, nullptr}, {"layout", (getter)THPVariable_layout, nullptr, nullptr, nullptr}, {"device", (getter)THPVariable_device, nullptr, nullptr, nullptr}, @@ -1260,7 +1462,7 @@ PyObject *THPVariable_pynew(PyTypeObject *type, PyObject *args, PyObject *kwargs HANDLE_TH_ERRORS TORCH_CHECK(type != &THPVariableType, "Cannot directly construct _TensorBase; subclass it and then construct that"); jit::tracer::warn("torch.Tensor", jit::tracer::WARN_CONSTRUCTOR); - auto tensor = torch::utils::legacy_tensor_ctor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs); + auto tensor = torch::utils::base_tensor_ctor(args, kwargs); // WARNING: tensor is NOT guaranteed to be a fresh tensor; e.g., if it was // given a raw pointer that will refcount bump return THPVariable_NewWithVar( @@ -1515,10 +1717,8 @@ static int THPVariable_subclass_traverse( // self is live, and nothing will get GC'ed anyway (resurrection cannot happen // if the C++ objects owns the PyObject) THPVariable* var = reinterpret_cast(self); - if (!var->cdata.unsafeIsBorrowed()) { - const auto& tensor = THPVariable_Unpack(self); - if (tensor.defined() && tensor.use_count() > 1) - return 0; + if (isResurrectable(var)) { + return 0; } // Crappy version of subtype_traverse; same deal as @@ -1655,10 +1855,38 @@ bool isPythonTensor(const Tensor& tensor) { return tensor.unsafeGetTensorImpl()->key_set().has(c10::DispatchKey::Python); } + +py::object torchDispatchFromTensorImpl(const c10::TensorImpl* self, const char* func_name, PyObject* 
torch_api_function, const char* module_name) { + TORCH_CHECK(PyGILState_Check(), "GIL must be held before you call parseIValuesToPyArgsKwargs"); + + // Setup the arguments expected for the detach call + std::vector overloaded_args; + // TODO: there should be a shorter way to spell this + // TODO: fix the constness of target + Tensor self_t = Tensor(c10::intrusive_ptr::unsafe_reclaim_from_nonowning(const_cast(self))); + auto self_p = py::reinterpret_steal(THPVariable_Wrap(self_t)); + TORCH_INTERNAL_ASSERT(isPythonTensor(self_t)); + append_overloaded_tensor(&overloaded_args, self_p.ptr()); + auto args = py::reinterpret_steal(PyTuple_New(1)); + PyTuple_SET_ITEM(args.ptr(), 0, self_p.release().ptr()); + + py::dict kwargs; + + return py::reinterpret_steal( + handle_torch_function_no_python_arg_parser( + overloaded_args, + args.ptr(), + kwargs.ptr(), + func_name, + torch_api_function, + module_name, + TorchFunctionName::TorchDispatch)); +} + // NOTE [dispatch_fn's type argument] -// `type` is nullable and represents the PythonMode going on. -// Right now we only support a single PythonMode, but in the future we could -// change this to a stack of PythonModes. +// `type` is nullable and represents the TorchDispatchMode going on. +// Right now we only support a single TorchDispatchMode, but in the future we could +// change this to a stack of TorchDispatchModes. // // If `type` isn't null, then we consider the type for dispatch by prepending // it to the overloaded_args list. `handle_torch_funciton_no_python_arg_parser` @@ -1667,16 +1895,15 @@ void concrete_dispatch_fn( const c10::impl::PyInterpreter*, const c10::OperatorHandle& op, torch::jit::Stack* stack, - const std::shared_ptr& type) { + const std::shared_ptr& type) { const auto& schema = op.schema(); - const auto num_returns = schema.returns().size(); - const auto num_arguments = schema.arguments().size(); auto arguments = torch::jit::pop(*stack, num_arguments); // Parse the name into namespace and name (no overload_name) // TODO: put this into the library const auto& qualified_name = op.operator_name().name; + const auto& overload_name = schema.overload_name(); auto pos = qualified_name.find("::"); TORCH_INTERNAL_ASSERT(pos != std::string::npos, qualified_name); // Make me some null terminated strings @@ -1693,59 +1920,17 @@ void concrete_dispatch_fn( py::gil_scoped_acquire g; std::vector overloaded_args; - // For now, overloads get coalesced. Might be easier for users if they get - // overload resolution but is more complicated (need to expose separate - // functions per overload) py::handle torch_api_function = py::module::import("torch").attr("ops").attr(ns).attr(func_name); - std::string module_name_str = "torch.ops." + ns_str; - - // About all the pointers: - // - // f(int x, int y = 0, *, int z = 0) - // ^- arguments.size() - // ^- kwarg_only_start - // ^- positional_default_start - // ^- 0 - - // Find the split point between kwarg-only and regular. Since most functions - // don't have kwarg-only arguments, it is more efficient to scan from the - // right (but ideally, this would just be precomputed in FunctionSchema - // itself). 
(NB: minus one in the loop is because we're testing if the - // *next* argument is kwarg-only before we advance the starting index) - int64_t kwarg_only_start = arguments.size(); - for (; kwarg_only_start > 0; kwarg_only_start--) { - const auto& arg = schema.arguments()[kwarg_only_start - 1]; - if (!arg.kwarg_only()) { - break; - } - } - - // Find the first positional argument that isn't defaulted - auto is_default = [&](int64_t idx) -> bool { - const auto& arg = schema.arguments()[idx]; - if (!arg.default_value().has_value()) { - return false; - } - const auto& default_ivalue = *arg.default_value(); - const auto& ivalue = arguments[idx]; - if (default_ivalue != ivalue) { - return false; - } - return true; - }; - - int64_t positional_default_start = kwarg_only_start; - for (; positional_default_start > 0; positional_default_start--) { - if (!is_default(positional_default_start - 1)) { - break; - } + py::handle torch_api_function_overload; + if (overload_name == "") { + torch_api_function_overload = torch_api_function.attr("default"); + } else { + torch_api_function_overload = torch_api_function.attr(overload_name.c_str()); } - - auto args = py::reinterpret_steal(PyTuple_New(positional_default_start)); - py::dict kwargs; + std::string module_name_str = "torch.ops." + ns_str; if (type) { - append_overloaded_type(&overloaded_args, type->ptr()); + append_overloaded_type(&overloaded_args, type->ptr(getPyInterpreter())); } // Find overloaded tensors @@ -1770,72 +1955,59 @@ void concrete_dispatch_fn( } } - // Populate positional arguments - for (const auto idx : c10::irange(positional_default_start)) { - PyTuple_SET_ITEM(args.ptr(), idx, torch::jit::toPyObject(std::move(arguments[idx])).release().ptr()); - } - - // Populate keyword arguments - for (const auto idx : c10::irange(kwarg_only_start, arguments.size())) { - // But don't populate default keyword arguments - if (is_default(idx)) continue; - const auto& arg = schema.arguments()[idx]; - kwargs[py::cast(arg.name())] = torch::jit::toPyObject(std::move(arguments[idx])); - } - - auto out = py::reinterpret_steal(handle_torch_function_no_python_arg_parser( - overloaded_args, - args.ptr(), - kwargs.ptr(), - func_name, - torch_api_function.ptr(), - module_name_str.c_str(), - "__torch_dispatch__" - )); + auto args_kwargs = parseIValuesToPyArgsKwargs(op, arguments); + auto args = std::move(args_kwargs.first); + auto kwargs = std::move(args_kwargs.second); - if (num_returns == 0) { - // Check that we got a None return from Python. Anything else is an error. 
- TORCH_CHECK(out.is(py::none()), "Expected __torch_dispatch__ for ", op.operator_name(), - " to return None but it returned something else instead."); - } else if (num_returns == 1) { - torch::jit::push(stack, torch::jit::toIValue(out.ptr(), op.schema().returns()[0].type())); - } else { - auto outs = py::cast(out); - for (const auto idx : c10::irange(outs.size())) { - torch::jit::push(stack, torch::jit::toIValue(outs[idx].ptr(), op.schema().returns()[idx].type())); - } - } + PyObject* obj = handle_torch_function_no_python_arg_parser( + overloaded_args, + args.ptr(), + kwargs.ptr(), + func_name, + torch_api_function_overload.ptr(), + module_name_str.c_str(), + TorchFunctionName::TorchDispatch); + pushPyOutToStack(op, stack, py::reinterpret_steal(obj), "__torch_dispatch__"); } c10::intrusive_ptr concrete_detach_fn(const c10::impl::PyInterpreter*, const c10::TensorImpl* self) { pybind11::gil_scoped_acquire gil; - - // Setup the arguments expected for the detach call - std::vector overloaded_args; - // TODO: there should be a shorter way to spell this - // TODO: fix the constness of target - Tensor self_t = Tensor(c10::intrusive_ptr::unsafe_reclaim_from_nonowning(const_cast(self))); - auto self_p = py::reinterpret_steal(THPVariable_Wrap(self_t)); - TORCH_INTERNAL_ASSERT(isPythonTensor(self_t)); - append_overloaded_tensor(&overloaded_args, self_p.ptr()); - auto args = py::reinterpret_steal(PyTuple_New(1)); - PyTuple_SET_ITEM(args.ptr(), 0, self_p.release().ptr()); - - py::dict kwargs; - - auto out = py::reinterpret_steal(handle_torch_function_no_python_arg_parser( - overloaded_args, - args.ptr(), - kwargs.ptr(), - "detach", - py::module::import("torch").attr("ops").attr("aten").attr("detach").ptr(), - "torch.ops.aten", - "__torch_dispatch__" - )); + at::impl::MaybeSetTLSOnEntryGuard guard; + + auto out = torchDispatchFromTensorImpl( + self, + "detach", + py::module::import("torch") + .attr("ops") + .attr("aten") + .attr("detach") + .attr("default") + .ptr(), + "torch.ops.aten"); TORCH_CHECK(THPVariable_Check(out.ptr()), "detach returned invalid type ", py::detail::get_fully_qualified_tp_name(Py_TYPE(out.ptr())), ", expected Tensor"); const Tensor& res_t = THPVariable_Unpack(out.ptr()); return res_t.getIntrusivePtr(); } +bool concrete_is_contiguous_fn(const c10::impl::PyInterpreter*, const c10::TensorImpl* self) { + pybind11::gil_scoped_acquire gil; + at::impl::MaybeSetTLSOnEntryGuard guard; + + auto out = torchDispatchFromTensorImpl( + self, + "is_contiguous", + py::module::import("torch") + .attr("ops") + .attr("aten") + .attr("is_contiguous") + .attr("default") + .ptr(), + "torch.ops.aten"); + + TORCH_CHECK(PyBool_Check(out.ptr()), "is_contiguous returned invalid type ", py::detail::get_fully_qualified_tp_name(Py_TYPE(out.ptr())), ", expected bool"); + + return PyObject_IsTrue(out.ptr()); +} + } // anonymous namespace diff --git a/torch/csrc/autograd/python_variable.h b/torch/csrc/autograd/python_variable.h index b55e5c05127d..c4856cdd4d12 100644 --- a/torch/csrc/autograd/python_variable.h +++ b/torch/csrc/autograd/python_variable.h @@ -7,6 +7,10 @@ #include #include #include +#include +#include + +namespace py = pybind11; // Python object that backs torch.autograd.Variable // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) @@ -62,3 +66,7 @@ inline const at::Tensor& THPVariable_Unpack(PyObject* obj) { } TORCH_PYTHON_API c10::impl::PyInterpreter* getPyInterpreter(); + +std::pair parseIValuesToPyArgsKwargs(const c10::OperatorHandle& op, const std::vector& arguments); + +void 
pushPyOutToStack(const c10::OperatorHandle& op, torch::jit::Stack* stack, py::object out, const char* msg); diff --git a/torch/csrc/autograd/python_variable_indexing.cpp b/torch/csrc/autograd/python_variable_indexing.cpp index 8faa07066ead..27016f4edcc6 100644 --- a/torch/csrc/autograd/python_variable_indexing.cpp +++ b/torch/csrc/autograd/python_variable_indexing.cpp @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include @@ -88,7 +87,7 @@ static inline Variable sequenceToVariable(c10::TensorOptions options, PyObject* return torch::utils::indexing_tensor_from_data(options, kLong, c10::nullopt, seq); } -static inline Variable valueToTensor(c10::TensorOptions options, PyObject* value, const at::Device& device) { +inline Variable valueToTensor(c10::TensorOptions options, PyObject* value, const at::Device& device) { if (THPVariable_Check(value)) { return THPVariable_Unpack(value); } @@ -369,7 +368,7 @@ int THPVariable_setitem(PyObject* self, PyObject* index, PyObject* py_value) { } const auto& self_ = THPVariable_Unpack(self); - if (self_.is_sparse()) + if (self_.is_sparse() || self_.is_sparse_csr()) { throw TypeError("Cannot assign to a sparse tensor"); } diff --git a/torch/csrc/autograd/python_variable_indexing.h b/torch/csrc/autograd/python_variable_indexing.h index 398b77293810..027bffb6dc8a 100644 --- a/torch/csrc/autograd/python_variable_indexing.h +++ b/torch/csrc/autograd/python_variable_indexing.h @@ -1,6 +1,7 @@ #pragma once #include +#include namespace torch { namespace autograd { @@ -8,4 +9,6 @@ Py_ssize_t THPVariable_length(PyObject* self); PyObject* THPVariable_getitem(PyObject* self, PyObject* index); int THPVariable_setitem(PyObject* self, PyObject* index, PyObject* value); +Variable valueToTensor(c10::TensorOptions options, PyObject* value, const at::Device& device); + }} // namespace torch::autograd diff --git a/torch/csrc/autograd/record_function_ops.cpp b/torch/csrc/autograd/record_function_ops.cpp index 2cf427e04f60..f5f09b3fe940 100644 --- a/torch/csrc/autograd/record_function_ops.cpp +++ b/torch/csrc/autograd/record_function_ops.cpp @@ -1,8 +1,10 @@ +#include #include #include #include -#include +#include +#include namespace caffe2 { // Required for cpp_custom_type_hack to work @@ -16,47 +18,68 @@ namespace profiler { // Creates a new profiling scope using RecordFunction and invokes its starting // callbacks. 
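+// The scope-starting logic is factored out below so it can be shared by two
+// entry points: the legacy one, which keeps the RecordFunction alive behind a
+// cpp_custom_type_hack Tensor handle, and the new one, which holds it in the
+// custom class registered as profiler::_RecordFunction.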
-at::Tensor record_function_enter( +void record_function_enter( const std::string& name, - const c10::optional& args) { - auto rec = std::make_unique(at::RecordScope::USER_SCOPE); - if (rec->isActive()) { - if (rec->needsInputs() && args.has_value()) { - rec->before(name, std::vector{c10::IValue{args.value()}}); + const c10::optional& args, + at::RecordFunction &rec) { + if (rec.isActive()) { + if (rec.needsInputs() && args.has_value()) { + rec.before(name, c10::ArrayRef{c10::IValue{args.value()}}); } else { - rec->before(name); + rec.before(name); } } +} + +// Legacy signature using cpp_custom_type_hack +at::Tensor record_function_enter_legacy( + const std::string& name, + const c10::optional& args) { + auto rec = std::make_unique(at::RecordScope::USER_SCOPE); + record_function_enter(name, args, *rec); return at::cpp_custom_type_hack::create(std::move(rec), at::TensorOptions()); } +// New signature using custom_class +c10::intrusive_ptr record_function_enter_new( + const std::string &name, const c10::optional &args) { + auto rec = c10::make_intrusive(at::RecordScope::USER_SCOPE); + record_function_enter(name, args, rec->record); + return rec; +} + at::RecordFunction& getRecordFunctionFromTensor(const at::Tensor& handle) { auto& rec = at::cpp_custom_type_hack::cast(handle); return rec; } // Ends the profiling scope created with record_function_enter. -void record_function_exit(const at::Tensor& handle) { +void record_function_exit(at::RecordFunction &rec) { + rec.end(); +} + +// Legacy signature using cpp_custom_type_hack +void record_function_exit_legacy(const at::Tensor &handle) { // We don't actually need to do anything with handle just need to persist the // lifetime until now. auto& rec = getRecordFunctionFromTensor(handle); - rec.end(); + record_function_exit(rec); +} + +// New signature using custom_class +void record_function_exit_new(const c10::intrusive_ptr &record) { + record_function_exit(record->record); } +template c10::intrusive_ptr _call_end_callbacks_on_fut( - const at::Tensor& handle, + Func get_record, const c10::intrusive_ptr& fut) { // Profiling callback that ends the associated record_function // and returns the value of the passed in future. std::function futureProfilingFunc = - [handle](c10::ivalue::Future& fut) { - TORCH_INTERNAL_ASSERT( - handle.defined(), - "Undefined RecordFunction handle. This can happen if the handle is " - "not correctly persisted and is destroyed before the future is " - "realized."); - - auto& rec = getRecordFunctionFromTensor(handle); + [get_record = std::move(get_record)](c10::ivalue::Future& fut) { + auto& rec = get_record(); rec.end(); // Note: this future is returned to the user to ensure that a call to wait() // ensures that profiling callbacks have ran. To ensure that this is @@ -67,36 +90,74 @@ c10::intrusive_ptr _call_end_callbacks_on_fut( }; // Define a future that completes after the profiling callbacks are run. auto profiledFut = fut->then(at::wrapPropagateTLSState( - futureProfilingFunc), + std::move(futureProfilingFunc)), fut->elementType() ); return profiledFut; } -// Internal only, do not use directly, use Python's record_function() -TORCH_LIBRARY_FRAGMENT(profiler, m) { - m.def("_record_function_enter(str name, str? 
args=None) -> Tensor", &record_function_enter); - m.def("_record_function_exit", &record_function_exit); +// Legacy signature using cpp_custom_type_hack +c10::intrusive_ptr _call_end_callbacks_on_fut_legacy( + const at::Tensor &handle, + const c10::intrusive_ptr& fut) { + return _call_end_callbacks_on_fut( + [handle] () -> at::RecordFunction& { + TORCH_INTERNAL_ASSERT( + handle.defined(), + "Undefined RecordFunction handle. This can happen if the handle is " + "not correctly persisted and is destroyed before the future is " + "realized."); + + return getRecordFunctionFromTensor(handle); + }, + fut + ); } -// Needed to register JIT operator in operator registry below -c10::AliasAnalysisKind aliasAnalysisFromSchema() { - return c10::AliasAnalysisKind::FROM_SCHEMA; +// New signature using custom_class +c10::intrusive_ptr _call_end_callbacks_on_fut_new( + const c10::intrusive_ptr &record, + const c10::intrusive_ptr& fut) { + return _call_end_callbacks_on_fut( + [record] () -> at::RecordFunction& { return record->record; }, fut); } -jit::RegisterOperators reg_fut_ops({ - jit::Operator( +// Internal only, do not use directly, use Python's record_function() +TORCH_LIBRARY_FRAGMENT(profiler, m) { + m.class_("_RecordFunction"); + + m.def("_record_function_enter(str name, str? args=None) -> Tensor", + &record_function_enter_legacy); + m.def("_record_function_enter_new(str name, str? args=None) -> " + "__torch__.torch.classes.profiler._RecordFunction", + &record_function_enter_new); + m.def("_record_function_exit", &record_function_exit_legacy); + m.def("_record_function_exit._RecordFunction", &record_function_exit_new); + + torch::jit::registerOperator(torch::jit::Operator( "profiler::_call_end_callbacks_on_jit_fut(Tensor x, Future(t) y) -> Future(t)", [](jit::Stack& stack) { // Pop inputs, which should be a future and a tensor auto fut = jit::pop(stack).toFuture(); auto tensor = jit::pop(stack).toTensor(); - auto profiledFut = _call_end_callbacks_on_fut(tensor, fut); + auto profiledFut = _call_end_callbacks_on_fut_legacy(tensor, fut); // return future that completes when profiling callbacks have run. jit::push(stack, std::move(profiledFut)); }, - aliasAnalysisFromSchema()), -}); + c10::AliasAnalysisKind::FROM_SCHEMA)); + torch::jit::registerOperator(torch::jit::Operator( + "profiler::_call_end_callbacks_on_jit_fut._RecordFunction(" + "__torch__.torch.classes.profiler._RecordFunction x, Future(t) y) -> Future(t)", + [](c10::Stack &stack) { + // Pop inputs, which should be a future and a PythonRecordFunction + auto fut = torch::jit::pop(stack).toFuture(); + auto tensor = torch::jit::pop(stack).toCustomClass(); + auto profiledFut = _call_end_callbacks_on_fut_new(tensor, fut); + // return future that completes when profiling callbacks have run. 
+ torch::jit::push(stack, std::move(profiledFut)); + }, + c10::AliasAnalysisKind::FROM_SCHEMA)); +} } // namespace profiler } // namespace autograd diff --git a/torch/csrc/autograd/record_function_ops.h b/torch/csrc/autograd/record_function_ops.h index 9042537aeabc..2c074f2dfe5b 100644 --- a/torch/csrc/autograd/record_function_ops.h +++ b/torch/csrc/autograd/record_function_ops.h @@ -1,17 +1,28 @@ #pragma once #include #include +#include namespace torch { namespace autograd { namespace profiler { + +struct PythonRecordFunction : public torch::CustomClassHolder { + at::RecordFunction record; + + explicit PythonRecordFunction( + at::RecordScope scope = at::RecordScope::FUNCTION) + : record(scope) {} +}; + // Creates a new profiling scope using RecordFunction and invokes its starting // callbacks. -TORCH_API at::Tensor record_function_enter(const std::string& name, const c10::optional& args = c10::nullopt); +TORCH_API c10::intrusive_ptr record_function_enter_new( + const std::string &name, const c10::optional &args = c10::nullopt); // Schedules RecordFunction's end callbacks to be run on completion of a future. -TORCH_API c10::intrusive_ptr _call_end_callbacks_on_fut( - const at::Tensor& handle, +TORCH_API c10::intrusive_ptr _call_end_callbacks_on_fut_new( + const c10::intrusive_ptr &record, const c10::intrusive_ptr& fut); } // namespace profiler diff --git a/torch/csrc/autograd/utils/grad_layout_contract.h b/torch/csrc/autograd/utils/grad_layout_contract.h index 4d1787d55c79..c7e1bad9fb8a 100644 --- a/torch/csrc/autograd/utils/grad_layout_contract.h +++ b/torch/csrc/autograd/utils/grad_layout_contract.h @@ -7,15 +7,39 @@ namespace autograd { namespace utils { // Helper functions to enforce the "Gradient Layout Contract" described in -// torch/csrc/autograd/AccumulateGrad.h. +// torch/csrc/autograd/functions/accumulate_grad.h. // Checks if grad obeys the contract with variable. inline bool obeys_layout_contract(const at::Tensor& grad, const at::Tensor& variable) { TORCH_INTERNAL_ASSERT(!grad.is_sparse()); TORCH_INTERNAL_ASSERT(!variable.is_sparse()); - return variable.is_non_overlapping_and_dense() ? - (grad.strides() == variable.strides()) : - grad.is_contiguous(at::MemoryFormat::Contiguous); + TORCH_INTERNAL_ASSERT(!grad.is_sparse_csr()); + TORCH_INTERNAL_ASSERT(!variable.is_sparse_csr()); + if (variable.is_non_overlapping_and_dense()) { + // Only look at stride for dimensions that are not of size 1. + const auto& grad_sizes = grad.sizes(); + const auto& grad_strides = grad.strides(); + const auto& variable_strides = variable.strides(); + for (const auto idx : c10::irange(grad_sizes.size())) { + if (grad_sizes[idx] != 1) { + if (grad_strides[idx] != variable_strides[idx]) { + return false; + } + } else { + // This should not be needed but we don't check if a Tensor has views + // before stashing it. And 0-strided Tensors of size 1 are actually views + // for ops like cat. + // TODO: Actually detect views in the accumulateGrad function so that this + // Tensor is not considered at all. + if (grad_strides[idx] == 0) { + return false; + } + } + } + return true; + } else { + return grad.is_contiguous(at::MemoryFormat::Contiguous); + } } // Creates a clone of new_grad that obeys the contract with variable. 
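For reference, the relaxed layout check above reduces to the following standalone sketch (illustrative only: the helper name is invented here, there are no torch dependencies, and sizes and strides are assumed to have equal length, as they do for the dense tensors this branch handles):

#include <cstddef>
#include <cstdint>
#include <vector>

// Strides must agree wherever a dimension has extent > 1; a size-1 dimension
// may carry any stride except 0, which is treated as the mark of a view-like
// gradient (e.g. one stashed by cat) and is therefore rejected.
bool strides_match_ignoring_size_one_dims(
    const std::vector<int64_t>& sizes,
    const std::vector<int64_t>& grad_strides,
    const std::vector<int64_t>& variable_strides) {
  for (std::size_t i = 0; i < sizes.size(); ++i) {
    if (sizes[i] != 1) {
      if (grad_strides[i] != variable_strides[i]) {
        return false;
      }
    } else if (grad_strides[i] == 0) {
      return false;
    }
  }
  return true;
}

Variables that are not non-overlapping-and-dense keep the old fallback: the gradient only has to be contiguous in the default memory format.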
diff --git a/torch/csrc/autograd/utils/wrap_outputs.h b/torch/csrc/autograd/utils/wrap_outputs.h index 10439553fcc5..114b53487368 100644 --- a/torch/csrc/autograd/utils/wrap_outputs.h +++ b/torch/csrc/autograd/utils/wrap_outputs.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -77,117 +78,6 @@ inline PyObject* wrap(at::QScheme qscheme) { return thp_qscheme; } -inline PyObject* wrap(std::tuple tensors) { - auto r = THPObjectPtr{PyTuple_New(2)}; - if (!r) throw python_error(); - PyTuple_SET_ITEM(r.get(), 0, wrap(std::get<0>(tensors))); - PyTuple_SET_ITEM(r.get(), 1, wrap(std::get<1>(tensors))); - return r.release(); -} - -inline PyObject* wrap(PyTypeObject *type, std::tuple tensors) { - auto r = THPObjectPtr{PyStructSequence_New(type)}; - if (!r) throw python_error(); - PyStructSequence_SET_ITEM(r.get(), 0, wrap(std::get<0>(tensors))); - PyStructSequence_SET_ITEM(r.get(), 1, wrap(std::get<1>(tensors))); - return r.release(); -} - -inline PyObject* wrap(std::tuple tensors) { - auto r = THPObjectPtr{PyTuple_New(3)}; - if (!r) throw python_error(); - PyTuple_SET_ITEM(r.get(), 0, wrap(std::move(std::get<0>(tensors)))); - PyTuple_SET_ITEM(r.get(), 1, wrap(std::move(std::get<1>(tensors)))); - PyTuple_SET_ITEM(r.get(), 2, wrap(std::move(std::get<2>(tensors)))); - return r.release(); -} - -inline PyObject* wrap(PyTypeObject *type, std::tuple tensors) { - auto r = THPObjectPtr{PyStructSequence_New(type)}; - if (!r) throw python_error(); - PyStructSequence_SET_ITEM(r.get(), 0, wrap(std::get<0>(tensors))); - PyStructSequence_SET_ITEM(r.get(), 1, wrap(std::get<1>(tensors))); - PyStructSequence_SET_ITEM(r.get(), 2, wrap(std::get<2>(tensors))); - return r.release(); -} - -inline PyObject* wrap(PyTypeObject *type, std::tuple tensors) { - auto r = THPObjectPtr{PyStructSequence_New(type)}; - if (!r) throw python_error(); - PyStructSequence_SET_ITEM(r.get(), 0, wrap(std::get<0>(tensors))); - PyStructSequence_SET_ITEM(r.get(), 1, wrap(std::get<1>(tensors))); - PyStructSequence_SET_ITEM(r.get(), 2, wrap(std::get<2>(tensors))); - PyStructSequence_SET_ITEM(r.get(), 3, wrap(std::get<3>(tensors))); - return r.release(); -} - -inline PyObject* wrap(std::tuple tensors) { - auto r = THPObjectPtr{PyTuple_New(4)}; - if (!r) throw python_error(); - PyTuple_SET_ITEM(r.get(), 0, wrap(std::move(std::get<0>(tensors)))); - PyTuple_SET_ITEM(r.get(), 1, wrap(std::move(std::get<1>(tensors)))); - PyTuple_SET_ITEM(r.get(), 2, wrap(std::move(std::get<2>(tensors)))); - PyTuple_SET_ITEM(r.get(), 3, wrap(std::get<3>(tensors))); - return r.release(); -} - -inline PyObject* wrap(std::tuple tensors) { - auto r = THPObjectPtr{PyTuple_New(4)}; - if (!r) throw python_error(); - PyTuple_SET_ITEM(r.get(), 0, wrap(std::move(std::get<0>(tensors)))); - PyTuple_SET_ITEM(r.get(), 1, wrap(std::move(std::get<1>(tensors)))); - // NOLINTNEXTLINE(performance-move-const-arg) - PyTuple_SET_ITEM(r.get(), 2, wrap(std::move(std::get<2>(tensors)))); - // NOLINTNEXTLINE(performance-move-const-arg) - PyTuple_SET_ITEM(r.get(), 3, wrap(std::move(std::get<3>(tensors)))); - return r.release(); -} - -inline PyObject* wrap(std::tuple tensors) { - auto r = THPObjectPtr{PyTuple_New(5)}; - if (!r) throw python_error(); - PyTuple_SET_ITEM(r.get(), 0, wrap(std::move(std::get<0>(tensors)))); - PyTuple_SET_ITEM(r.get(), 1, wrap(std::move(std::get<1>(tensors)))); - PyTuple_SET_ITEM(r.get(), 2, wrap(std::move(std::get<2>(tensors)))); - PyTuple_SET_ITEM(r.get(), 3, wrap(std::move(std::get<3>(tensors)))); - PyTuple_SET_ITEM(r.get(), 4, 
wrap(std::get<4>(tensors))); - return r.release(); -} - -inline PyObject* wrap(std::tuple tensors) { - auto r = THPObjectPtr{PyTuple_New(5)}; - if (!r) throw python_error(); - PyTuple_SET_ITEM(r.get(), 0, wrap(std::move(std::get<0>(tensors)))); - PyTuple_SET_ITEM(r.get(), 1, wrap(std::move(std::get<1>(tensors)))); - // NOLINTNEXTLINE(performance-move-const-arg) - PyTuple_SET_ITEM(r.get(), 2, wrap(std::move(std::get<2>(tensors)))); - PyTuple_SET_ITEM(r.get(), 3, wrap(std::move(std::get<3>(tensors)))); - // NOLINTNEXTLINE(performance-move-const-arg) - PyTuple_SET_ITEM(r.get(), 4, wrap(std::move(std::get<4>(tensors)))); - return r.release(); -} - -inline PyObject* wrap(std::tuple tensors) { - auto r = THPObjectPtr{PyTuple_New(4)}; - if (!r) throw python_error(); - PyTuple_SET_ITEM(r.get(), 0, wrap(std::move(std::get<0>(tensors)))); - PyTuple_SET_ITEM(r.get(), 1, wrap(std::move(std::get<1>(tensors)))); - PyTuple_SET_ITEM(r.get(), 2, wrap(std::move(std::get<2>(tensors)))); - PyTuple_SET_ITEM(r.get(), 3, wrap(std::move(std::get<3>(tensors)))); - return r.release(); -} - -inline PyObject* wrap(std::tuple tensors) { - auto r = THPObjectPtr{PyTuple_New(5)}; - if (!r) throw python_error(); - PyTuple_SET_ITEM(r.get(), 0, wrap(std::move(std::get<0>(tensors)))); - PyTuple_SET_ITEM(r.get(), 1, wrap(std::move(std::get<1>(tensors)))); - PyTuple_SET_ITEM(r.get(), 2, wrap(std::move(std::get<2>(tensors)))); - PyTuple_SET_ITEM(r.get(), 3, wrap(std::move(std::get<3>(tensors)))); - PyTuple_SET_ITEM(r.get(), 4, wrap(std::move(std::get<4>(tensors)))); - return r.release(); -} - inline PyObject* wrap(at::TensorList tl) { auto r = THPObjectPtr{PyTuple_New(tl.size())}; if (!r) throw python_error(); @@ -206,13 +96,38 @@ inline PyObject* wrap(at::IntArrayRef list) { return r.release(); } -inline PyObject* wrap(std::tuple tensors) { - auto r = THPObjectPtr{PyTuple_New(2)}; +namespace detail { +template +void apply_with_idx_impl(const F &f, Tuple &t, std::index_sequence /*indices*/) { + (void)std::initializer_list { + (f(std::get(t), Is), 0)... 
+ }; +} + +// For tuple(a, b, c), calls f(a, 0), f(b, 1), f(c, 2) +template +void apply_with_idx(const F & f, std::tuple &t) { + apply_with_idx_impl(f, t, std::index_sequence_for{}); +} +} // namespace detail + +template +PyObject* wrap(std::tuple values) { + auto r = THPObjectPtr{PyTuple_New(sizeof...(Ts))}; + if (!r) throw python_error(); + detail::apply_with_idx([&](auto &value, size_t idx) { + PyTuple_SET_ITEM(r.get(), idx, wrap(std::move(value))); + }, values); + return r.release(); +} + +template +PyObject* wrap(PyTypeObject *type, std::tuple values) { + auto r = THPObjectPtr{PyStructSequence_New(type)}; if (!r) throw python_error(); - // NOLINTNEXTLINE(performance-move-const-arg) - PyTuple_SET_ITEM(r.get(), 0, wrap(std::move(std::get<0>(tensors)))); - // NOLINTNEXTLINE(performance-move-const-arg) - PyTuple_SET_ITEM(r.get(), 1, wrap(std::move(std::get<1>(tensors)))); + detail::apply_with_idx([&](auto &value, size_t idx) { + PyStructSequence_SET_ITEM(r.get(), idx, wrap(std::move(value))); + }, values); return r.release(); } diff --git a/torch/csrc/cuda/Event.cpp b/torch/csrc/cuda/Event.cpp index 20821636a774..4312b3aaf7b0 100644 --- a/torch/csrc/cuda/Event.cpp +++ b/torch/csrc/cuda/Event.cpp @@ -119,7 +119,7 @@ static PyObject * THCPEvent_wait(PyObject *_self, PyObject *_stream) { { auto self = (THCPEvent*)_self; auto stream = (THCPStream*)_stream; - pybind11::gil_scoped_release no_gil; + pybind11::gil_scoped_release no_gil{}; self->cuda_event.block(stream->cuda_stream); } Py_RETURN_NONE; @@ -145,7 +145,7 @@ static PyObject * THCPEvent_synchronize(PyObject *_self, PyObject *noargs) { HANDLE_TH_ERRORS { auto self = (THCPEvent*)_self; - pybind11::gil_scoped_release no_gil; + pybind11::gil_scoped_release no_gil{}; self->cuda_event.synchronize(); } Py_RETURN_NONE; diff --git a/torch/csrc/cuda/Module.cpp b/torch/csrc/cuda/Module.cpp index 0b76eefe92c4..6c00332c21a4 100644 --- a/torch/csrc/cuda/Module.cpp +++ b/torch/csrc/cuda/Module.cpp @@ -4,8 +4,10 @@ #include #include #include +#include #include #include +#include #ifdef USE_NCCL #include #endif @@ -217,6 +219,71 @@ PyObject * THCPModule_cudaCachingAllocator_raw_alloc(PyObject *_unused, PyObject END_HANDLE_TH_ERRORS } +// Unpack a PyObject to at::Scalar, throw an exception if it fails +at::Scalar as_scalar(PyObject* arg) { + // Zero-dim tensors are converted to Scalars as-is. Note this doesn't currently + // handle most NumPy scalar types except np.float64. 
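// Illustrative sketch (not part of the patch): the std::index_sequence expansion
// that the new variadic wrap(std::tuple<...>) overloads above rely on, applied to
// a plain std::tuple with std::printf instead of PyTuple_SET_ITEM. The names here
// are illustrative only.
#include <cstdio>
#include <initializer_list>
#include <string>
#include <tuple>
#include <utility>

template <typename F, typename Tuple, size_t... Is>
void for_each_with_index_impl(const F& f, Tuple& t, std::index_sequence<Is...>) {
  // Comma-operator trick: evaluates f(element, index) once per pack member.
  (void)std::initializer_list<int>{(f(std::get<Is>(t), Is), 0)...};
}

template <typename F, typename... Ts>
void for_each_with_index(const F& f, std::tuple<Ts...>& t) {
  for_each_with_index_impl(f, t, std::index_sequence_for<Ts...>{});
}

int main() {
  std::tuple<int, double, std::string> values{7, 2.5, "grad"};
  for_each_with_index(
      [](const auto& v, size_t idx) {
        // One generic lambda replaces N nearly identical hand-written overloads.
        std::printf("slot %zu\n", idx);
        (void)v;
      },
      values);
}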
+ if (THPVariable_Check(arg)) { + return THPVariable_Unpack(arg).item(); + } + + if (THPUtils_checkLong(arg)) { + return at::Scalar(static_cast(THPUtils_unpackLong(arg))); + } + + if (PyBool_Check(arg)) { + return at::Scalar(THPUtils_unpackBool(arg)); + } + + if (PyComplex_Check(arg)) { + return at::Scalar(THPUtils_unpackComplexDouble(arg)); + } + return at::Scalar(THPUtils_unpackDouble(arg)); +} + +// Entrypoint for the callable created by torch.cuda.jiterator +// See jiterator.py for more details +PyObject * THCPModule_cudaJiteratorCompileAndLaunchKernel(PyObject *_unused, PyObject *args){ + HANDLE_TH_ERRORS + + PyObject* code_string_o = nullptr; + PyObject* kernel_name_o = nullptr; + PyObject* tensors_o = nullptr; + PyObject* kwargs_o = nullptr; + if(!PyArg_ParseTuple(args, "OOO|O", &code_string_o, &kernel_name_o, &tensors_o, &kwargs_o)) { + return nullptr; + } + + std::string code_string = THPUtils_unpackString(code_string_o); + std::string kernel_name = THPUtils_unpackString(kernel_name_o); + + THPUtils_assert(PyTuple_Check(tensors_o), "tensors argument is expected to " + "be a tuple, but got %s", THPUtils_typename(tensors_o)); + Py_ssize_t num_tensors = PyTuple_GET_SIZE(tensors_o); + + std::vector tensors; + for(const auto i : c10::irange(num_tensors)) { + PyObject *_tensor = PyTuple_GET_ITEM(tensors_o, i); + THPUtils_assert(THPVariable_Check(_tensor), "element %d of tensors " + "tuple is not a Tensor", i); + + tensors.emplace_back(THPVariable_Unpack(_tensor)); + } + + std::vector extra_args; + PyObject *key = nullptr; + PyObject *value = nullptr; + Py_ssize_t pos = 0; + while (PyDict_Next(kwargs_o, &pos, &key, &value)) { + extra_args.emplace_back(as_scalar(value)); + } + + at::Tensor output = at::cuda::CompileAndLaunchKernel(code_string, kernel_name, tensors, extra_args); + + return THPVariable_Wrap(output); + END_HANDLE_TH_ERRORS +} + PyObject * THCPModule_cudaCachingAllocator_raw_delete(PyObject *_unused, PyObject *obj){ HANDLE_TH_ERRORS void* mem_ptr = PyLong_AsVoidPtr(obj); @@ -564,6 +631,35 @@ PyObject * THCPModule_getCurrentBlasHandle_wrap(PyObject *self, PyObject *noargs END_HANDLE_TH_ERRORS } +PyObject * THCPModule_rocm_is_backward_pass(PyObject *_unused, PyObject *noargs) +{ + HANDLE_TH_ERRORS +#if USE_ROCM + if (at::ROCmBackwardPassGuard::is_backward_pass()) { + Py_RETURN_TRUE; + } + else { + Py_RETURN_FALSE; + } +#else + Py_RETURN_FALSE; +#endif + END_HANDLE_TH_ERRORS +} + +static PyObject * THCPModule_isCurrentStreamCapturing_wrap(PyObject *self, PyObject *noargs) +{ + HANDLE_TH_ERRORS + // If there's no cuda context, at::cuda::currentStreamCaptureStatus returns + // CaptureStatus::None without initializing a context. 
+ if (at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None) { + Py_RETURN_FALSE; + } else { + Py_RETURN_TRUE; + } + END_HANDLE_TH_ERRORS +} + // NOLINTNEXTLINE(modernize-avoid-c-arrays, cppcoreguidelines-avoid-non-const-global-variables, cppcoreguidelines-avoid-c-arrays) static struct PyMethodDef _THCPModule_methods[] = { {"_cuda_init", THCPModule_initExtension, METH_NOARGS, nullptr}, @@ -578,6 +674,7 @@ static struct PyMethodDef _THCPModule_methods[] = { {"_cuda_getDefaultStream", THCPModule_getDefaultStream_wrap, METH_O, nullptr}, {"_cuda_getCurrentBlasHandle", THCPModule_getCurrentBlasHandle_wrap, METH_NOARGS, nullptr}, + {"_cuda_isCurrentStreamCapturing", THCPModule_isCurrentStreamCapturing_wrap, METH_NOARGS, nullptr}, {"_cuda_setStream", THCPModule_setStream_wrap, METH_O, nullptr}, {"_cuda_getCompiledVersion", THCPModule_getCompiledVersion, METH_NOARGS, nullptr}, {"_cuda_hasPrimaryContext", THCPModule_hasPrimaryContext, METH_O, nullptr}, @@ -597,6 +694,7 @@ static struct PyMethodDef _THCPModule_methods[] = { {"_cuda_unlock_mutex", THCPModule_cudaUnlockMutex, METH_NOARGS, nullptr}, {"_cuda_set_sync_debug_mode", THCPModule_cudaSetSyncDebugMode, METH_O, nullptr}, {"_cuda_get_sync_debug_mode", THCPModule_cudaGetSyncDebugMode, METH_NOARGS, nullptr}, + {"_cuda_jiterator_compile_and_launch_kernel", THCPModule_cudaJiteratorCompileAndLaunchKernel, METH_VARARGS, nullptr}, #ifdef USE_NCCL {"_nccl_version", THCPModule_nccl_version, METH_NOARGS, nullptr}, {"_nccl_unique_id", THCPModule_nccl_unique_id, METH_NOARGS, nullptr}, @@ -607,6 +705,7 @@ static struct PyMethodDef _THCPModule_methods[] = { {"_nccl_all_gather", THCPModule_nccl_all_gather, METH_VARARGS, nullptr}, {"_nccl_reduce_scatter", THCPModule_nccl_reduce_scatter, METH_VARARGS, nullptr}, #endif + {"_rocm_is_backward_pass", THCPModule_rocm_is_backward_pass, METH_NOARGS, nullptr}, {nullptr} }; diff --git a/torch/csrc/cuda/nccl.cpp b/torch/csrc/cuda/nccl.cpp index ae61392ab542..5817449c1a49 100644 --- a/torch/csrc/cuda/nccl.cpp +++ b/torch/csrc/cuda/nccl.cpp @@ -650,6 +650,9 @@ void all2all_single_equal_split(at::Tensor& input, const auto* sendbuff = reinterpret_cast(input.data_ptr()); auto* recvbuff = reinterpret_cast(output.data_ptr()); auto comm = to_nccl_comm(_comm); +#if defined(USE_ROCM) && ROCM_VERSION >= 50000 + NCCL_CHECK(ncclAllToAll(sendbuff , recvbuff , count, type, comm, stream)); +#else NCCL_CHECK(ncclCommCount(comm, &numranks)); NCCL_CHECK(ncclGroupStart()); for(const auto r : c10::irange(numranks)) { @@ -661,6 +664,7 @@ void all2all_single_equal_split(at::Tensor& input, } } NCCL_CHECK(ncclGroupEnd()); +#endif #else AT_ERROR("all2all is only supported for NCCL lib version >= 2.7.0"); #endif @@ -833,8 +837,7 @@ void gather( if (cur_rank == root) { - for (int r = 0; r < numranks; r++) - { + for (const auto r : c10::irange(numranks)) { if (r != root) { auto* recvbuff = reinterpret_cast(outputs[r].data_ptr()); NCCL_CHECK(ncclRecv(recvbuff, count, type, r, comm, stream)); @@ -874,8 +877,7 @@ void scatter( NCCL_CHECK(ncclGroupStart()); if (cur_rank == root) { - for (int r = 0; r < numranks; r++) - { + for (const auto r : c10::irange(numranks)) { if (r != root) { size_t send_count = inputs[r].numel(); auto send_type = to_nccl_data_type(inputs[r]); diff --git a/torch/csrc/cuda/shared/cudart.cpp b/torch/csrc/cuda/shared/cudart.cpp index b93d921a16a9..b0af4c0884e9 100644 --- a/torch/csrc/cuda/shared/cudart.cpp +++ b/torch/csrc/cuda/shared/cudart.cpp @@ -49,8 +49,8 @@ void initCudartBindings(PyObject* module) { #endif 
cudart.def("cuda" "MemGetInfo", [](int device) -> std::pair { c10::cuda::CUDAGuard guard(device); - size_t device_free; - size_t device_total; + size_t device_free = 0; + size_t device_total = 0; cudaMemGetInfo(&device_free, &device_total); return {device_free, device_total}; }); diff --git a/torch/csrc/deploy/CMakeLists.txt b/torch/csrc/deploy/CMakeLists.txt index f8aa997eb109..61fe8c1bb892 100644 --- a/torch/csrc/deploy/CMakeLists.txt +++ b/torch/csrc/deploy/CMakeLists.txt @@ -1,6 +1,13 @@ set(DEPLOY_DIR "${CMAKE_CURRENT_SOURCE_DIR}") add_subdirectory(interpreter) +if(DEFINED GLIBCXX_USE_CXX11_ABI) + if(${GLIBCXX_USE_CXX11_ABI} EQUAL 1) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1") + set(TORCH_CXX_FLAGS "-D_GLIBCXX_USE_CXX11_ABI=1") + endif() +endif() + # we do not want to have torch_deployinterpreter linked against libstdc++ or libc because # when loading it with RTLD_DEEPBIND it will resolve std::cout/stdout to the copy in libc++/libc instead of the # ones in the main process (see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=42679). @@ -33,10 +40,23 @@ caffe2_interface_library(torch_deploy_internal torch_deploy) set(INTERPRETER_TEST_SOURCES ${DEPLOY_DIR}/test_deploy.cpp ) +set(INTERPRETER_TEST_SOURCES_GPU + ${DEPLOY_DIR}/test_deploy_gpu.cpp +) + add_executable(test_deploy ${INTERPRETER_TEST_SOURCES}) target_compile_definitions(test_deploy PUBLIC TEST_CUSTOM_LIBRARY) target_include_directories(test_deploy PRIVATE ${PYTORCH_ROOT}/torch) -target_link_libraries(test_deploy PUBLIC "-Wl,--no-as-needed" gtest dl torch_deploy) +target_link_libraries(test_deploy + PUBLIC "-Wl,--no-as-needed -rdynamic" gtest dl torch_deploy +) + +add_executable(test_deploy_gpu ${INTERPRETER_TEST_SOURCES_GPU}) +target_compile_definitions(test_deploy_gpu PUBLIC TEST_CUSTOM_LIBRARY) +target_include_directories(test_deploy_gpu PRIVATE ${PYTORCH_ROOT}/torch) +target_link_libraries(test_deploy_gpu + PUBLIC "-Wl,--no-as-needed -rdynamic" gtest dl torch_deploy +) add_library(test_deploy_lib SHARED test_deploy_lib.cpp) add_dependencies(test_deploy_lib cpython) @@ -45,14 +65,19 @@ target_link_libraries(test_deploy_lib PRIVATE pybind::pybind11) add_executable(deploy_benchmark ${DEPLOY_DIR}/example/benchmark.cpp) target_include_directories(deploy_benchmark PRIVATE ${PYTORCH_ROOT}/torch) -target_link_libraries(deploy_benchmark PUBLIC "-Wl,--no-as-needed" torch_deploy) +target_link_libraries(deploy_benchmark + PUBLIC "-Wl,--no-as-needed -rdynamic" torch_deploy +) add_executable(interactive_embedded_interpreter ${DEPLOY_DIR}/interactive_embedded_interpreter.cpp) target_include_directories(interactive_embedded_interpreter PRIVATE ${PYTORCH_ROOT}/torch) -target_link_libraries(interactive_embedded_interpreter PUBLIC "-Wl,--no-as-needed" torch_deploy) +target_link_libraries(interactive_embedded_interpreter + PUBLIC "-Wl,--no-as-needed -rdynamic" torch_deploy +) if(INSTALL_TEST) install(TARGETS test_deploy DESTINATION bin) + install(TARGETS test_deploy_gpu DESTINATION bin) endif() install(TARGETS torch_deploy DESTINATION lib) diff --git a/torch/csrc/deploy/Exception.h b/torch/csrc/deploy/Exception.h new file mode 100644 index 000000000000..f4311debeebc --- /dev/null +++ b/torch/csrc/deploy/Exception.h @@ -0,0 +1,47 @@ +#ifndef MULTIPY_EXCEPTION_H +#define MULTIPY_EXCEPTION_H + +#include + +#define MULTIPY_INTERNAL_ASSERT_WITH_MESSAGE(condition, message) \ + if (!(condition)) { \ + throw std::runtime_error( \ + "Internal Assertion failed: (" + std::string(#condition) + "), " + \ + "function " + __FUNCTION__ 
+ ", file " + __FILE__ + ", line " + \ + std::to_string(__LINE__) + ".\n" + "Please report bug to Pytorch.\n" + \ + message + "\n"); \ + } + +#define MULTIPY_INTERNAL_ASSERT_NO_MESSAGE(condition) \ + MULTIPY_INTERNAL_ASSERT_WITH_MESSAGE(#condition, "") + +#define MULTIPY_INTERNAL_ASSERT_(x, condition, message, FUNC, ...) FUNC + +#define MULTIPY_INTERNAL_ASSERT(...) \ + MULTIPY_INTERNAL_ASSERT_( \ + , \ + ##__VA_ARGS__, \ + MULTIPY_INTERNAL_ASSERT_WITH_MESSAGE(__VA_ARGS__), \ + MULTIPY_INTERNAL_ASSERT_NO_MESSAGE(__VA_ARGS__)); + +#define MULTIPY_CHECK_WITH_MESSAGE(condition, message) \ + if (!(condition)) { \ + throw std::runtime_error( \ + "Check failed: (" + std::string(#condition) + "), " + "function " + \ + __FUNCTION__ + ", file " + __FILE__ + ", line " + \ + std::to_string(__LINE__) + ".\n" + message + "\n"); \ + } + +#define MULTIPY_CHECK_NO_MESSAGE(condition) \ + MULTIPY_CHECK_WITH_MESSAGE(#condition, "") + +#define MULTIPY_CHECK_(x, condition, message, FUNC, ...) FUNC + +#define MULTIPY_CHECK(...) \ + MULTIPY_CHECK_( \ + , \ + ##__VA_ARGS__, \ + MULTIPY_CHECK_WITH_MESSAGE(__VA_ARGS__), \ + MULTIPY_CHECK_NO_MESSAGE(__VA_ARGS__)); + +#endif // MULTIPY_EXCEPTION_H diff --git a/torch/csrc/deploy/README.md b/torch/csrc/deploy/README.md index 43f6f2c85fc5..dfe436ba79fa 100644 --- a/torch/csrc/deploy/README.md +++ b/torch/csrc/deploy/README.md @@ -20,3 +20,8 @@ Because CPython builds successfully when optional dependencies are missing, the To be safe, install the [complete list of dependencies for CPython](https://devguide.python.org/setup/#install-dependencies) for your platform, before trying to build torch with USE_DEPLOY=1. If you already built CPython without all the dependencies and want to fix it, just blow away the CPython folder under torch/csrc/deploy/third_party, install the missing system dependencies, and re-attempt the pytorch build command. + +# Example + +Read the [getting started guide](https://github.com/pytorch/pytorch/blob/master/docs/source/deploy.rst) for an +example on how to use `torch::deploy`. diff --git a/torch/csrc/deploy/benchmark.cpp b/torch/csrc/deploy/benchmark.cpp new file mode 100644 index 000000000000..82296a5e1a1d --- /dev/null +++ b/torch/csrc/deploy/benchmark.cpp @@ -0,0 +1,336 @@ +#include + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +typedef void (*function_type)(const char*); + +bool cuda = false; + +constexpr auto latency_p = { + 25., + 50., + 95.}; //{1., 5., 25., 50., 75., 90., 95., 99., 99.25, 99.5, 99.75, 99.9}; + +// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) +struct Report { + std::string benchmark; + std::string strategy; + size_t n_threads; + size_t items_completed; + double work_items_per_second; + std::vector latencies; + static void report_header(std::ostream& out) { + out << "benchmark, strategy, n_threads, work_items_completed, work_items_per_second"; + for (double l : latency_p) { + out << ", p" << l << "_latency"; + } + out << ", device\n"; + } + void report(std::ostream& out) { + out << benchmark << ", " << strategy << ", " << n_threads << ", " + << items_completed << ", " << work_items_per_second; + for (double l : latencies) { + out << ", " << l; + } + out << ", " << (cuda ? 
"cuda" : "cpu") << "\n"; + } +}; + +const int min_items_to_complete = 1; + +struct RunPython { + static torch::deploy::ReplicatedObj load_and_wrap( + torch::deploy::Package& package) { + auto I = package.acquireSession(); + auto obj = I.self.attr("load_pickle")({"model", "model.pkl"}); + if (cuda) { + obj = I.global("gpu_wrapper", "GPUWrapper")({obj}); + } + return I.createMovable(obj); + } + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) + RunPython( + torch::deploy::Package& package, + std::vector eg, + const torch::deploy::Interpreter* interps) + : obj_(load_and_wrap(package)), eg_(std::move(eg)), interps_(interps) {} + void operator()(int i) { + auto I = obj_.acquireSession(); + if (cuda) { + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + std::vector eg2 = {i}; + eg2.insert(eg2.end(), eg_.begin(), eg_.end()); + I.self(eg2); + } else { + I.self(eg_); + } + } + torch::deploy::ReplicatedObj obj_; + std::vector eg_; + const torch::deploy::Interpreter* interps_; +}; + +// def to_device(i, d): +// if isinstance(i, torch.Tensor): +// return i.to(device=d) +// elif isinstance(i, (tuple, list)): +// return tuple(to_device(e, d) for e in i) +// else: +// raise RuntimeError('inputs are weird') + +static torch::IValue to_device(const torch::IValue& v, torch::Device to); + +static std::vector to_device_vec( + at::ArrayRef vs, + torch::Device to) { + std::vector results; + for (const torch::IValue& v : vs) { + results.push_back(to_device(v, to)); + } + return results; +} + +static torch::IValue to_device(const torch::IValue& v, torch::Device to) { + if (v.isTensor()) { + return v.toTensor().to(to); + } else if (v.isTuple()) { + auto tup = v.toTuple(); + return c10::ivalue::Tuple::create(to_device_vec(tup->elements(), to)); + } else if (v.isList()) { + auto converted = to_device_vec(v.toListRef(), to); + torch::List result(v.toList().elementType()); + for (const torch::IValue& v : converted) { + result.push_back(v); + } + return result; + } else { + MULTIPY_INTERNAL_ASSERT(false, "cannot to_device"); + } +} + +static bool exists(const std::string& fname) { + std::fstream jit_file(fname); + return jit_file.good(); +} + +struct RunJIT { + RunJIT(const std::string& file_to_run, std::vector eg) + : eg_(std::move(eg)) { + if (!cuda) { + models_.push_back(torch::jit::load(file_to_run + "_jit")); + } else { + for (const auto i : c10::irange(2)) { + auto d = torch::Device(torch::DeviceType::CUDA, i); + std::stringstream qualified; + qualified << file_to_run << "_jit_" << i; + auto loaded = exists(qualified.str()) + ? 
torch::jit::load(qualified.str(), d) + : torch::jit::load(file_to_run + "_jit", d); + loaded.to(d); + models_.push_back(loaded); + } + } + } + void operator()(int i) { + if (cuda) { + const auto device_id = i % models_.size(); + auto d = torch::Device(torch::DeviceType::CUDA, device_id); + to_device( + models_[device_id].forward(to_device_vec(eg_, d)), + torch::DeviceType::CPU); + } else { + models_[0].forward(eg_); + } + } + std::vector eg_; + std::vector models_; +}; + +struct Benchmark { + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) + Benchmark( + torch::deploy::InterpreterManager& manager, + size_t n_threads, + std::string strategy, + // NOLINTNEXTLINE(modernize-pass-by-value) + std::string file_to_run, + size_t n_seconds = 5) + : manager_(manager), + n_threads_(n_threads), + strategy_(strategy), + file_to_run_(file_to_run), + n_seconds_(n_seconds), + should_run_(true), + items_completed_(0), + reached_min_items_completed_(0) { + // NOLINTNEXTLINE(bugprone-branch-clone) + if (strategy == "one_python") { + manager.debugLimitInterpreters(1); + } else if (strategy == "multi_python") { + manager.debugLimitInterpreters(n_threads_); + } + } + + Report run() { + pthread_barrier_init(&first_run_, nullptr, n_threads_ + 1); + + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + torch::deploy::Package package = manager_.loadPackage(file_to_run_); + + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + std::vector eg; + { + auto I = package.acquireSession(); + + eg = I.global("builtins", "tuple")( + I.self.attr("load_pickle")({"model", "example.pkl"})) + .toIValue() + .toTupleRef() + .elements(); + } + + // NOLINTNEXTLINE(bugprone-branch-clone) + if (strategy_ == "jit") { + run_one_work_item = RunJIT(file_to_run_, std::move(eg)); + } else { + run_one_work_item = + RunPython(package, std::move(eg), manager_.allInstances().data()); + } + + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + std::vector> latencies(n_threads_); + + for (const auto i : c10::irange(n_threads_)) { + threads_.emplace_back([this, &latencies, i] { + torch::NoGradGuard guard; + // do initial work + run_one_work_item(i); + + pthread_barrier_wait(&first_run_); + size_t local_items_completed = 0; + while (should_run_) { + auto begin = std::chrono::steady_clock::now(); + run_one_work_item(i); + auto end = std::chrono::steady_clock::now(); + double work_seconds = + std::chrono::duration(end - begin).count(); + latencies[i].push_back(work_seconds); + local_items_completed++; + if (local_items_completed == min_items_to_complete) { + reached_min_items_completed_++; + } + } + items_completed_ += local_items_completed; + }); + } + + pthread_barrier_wait(&first_run_); + auto begin = std::chrono::steady_clock::now(); + auto try_stop_at = begin + std::chrono::seconds(n_seconds_); + std::this_thread::sleep_until(try_stop_at); + for (int i = 0; reached_min_items_completed_ < n_threads_; ++i) { + std::this_thread::sleep_until( + begin + (i + 2) * std::chrono::seconds(n_seconds_)); + } + should_run_ = false; + for (std::thread& thread : threads_) { + thread.join(); + } + auto end = std::chrono::steady_clock::now(); + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + double total_seconds = std::chrono::duration(end - begin).count(); + Report report; + report.benchmark = file_to_run_; + report.strategy = strategy_; + report.n_threads = n_threads_; + report.items_completed = items_completed_; + report.work_items_per_second = items_completed_ / total_seconds; + reportLatencies(report.latencies, latencies); + 
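// Illustrative sketch (not part of the patch): the percentile computation used
// by the benchmark's reportLatencies helper, on a fixed set of latencies so the
// p25/p50/p95 indexing is easy to follow. Standard library only.
#include <algorithm>
#include <cstdio>
#include <initializer_list>
#include <vector>

int main() {
  std::vector<double> latencies = {0.9, 0.2, 0.4, 0.8, 0.1, 0.3, 0.7, 0.5, 0.6, 1.0};
  std::sort(latencies.begin(), latencies.end());
  for (double p : {25., 50., 95.}) {
    // Same indexing as reportLatencies: size * p / 100, clamped to the last element.
    size_t idx = static_cast<size_t>(latencies.size() * p / 100.0);
    double value = latencies.empty()
        ? 0.0
        : latencies.at(std::min(latencies.size() - 1, idx));
    std::printf("p%.0f latency: %.2f s\n", p, value);
  }
}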
run_one_work_item = nullptr; + return report; + } + + private: + void reportLatencies( + std::vector& results, + const std::vector>& latencies) { + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + std::vector flat_latencies; + for (const auto& elem : latencies) { + flat_latencies.insert(flat_latencies.end(), elem.begin(), elem.end()); + } + std::sort(flat_latencies.begin(), flat_latencies.end()); + for (double target : latency_p) { + size_t idx = size_t(flat_latencies.size() * target / 100.0); + double time = flat_latencies.size() == 0 + ? 0 + : flat_latencies.at(std::min(flat_latencies.size() - 1, idx)); + results.push_back(time); + } + } + torch::deploy::InterpreterManager& manager_; + size_t n_threads_; + std::string strategy_; + std::string file_to_run_; + size_t n_seconds_; + pthread_barrier_t first_run_; + std::atomic should_run_; + std::atomic items_completed_; + std::atomic reached_min_items_completed_; + std::vector threads_; + std::function run_one_work_item; +}; + +// NOLINTNEXTLINE(bugprone-exception-escape) +int main(int argc, char* argv[]) { + int max_thread = atoi(argv[1]); + cuda = std::string(argv[2]) == "cuda"; + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + bool jit_enable = std::string(argv[3]) == "jit"; + Report::report_header(std::cout); + torch::deploy::InterpreterManager manager(max_thread); + + // make sure gpu_wrapper.py is in the import path + for (auto& interp : manager.allInstances()) { + auto I = interp.acquireSession(); + I.global("sys", "path").attr("append")({"torch/csrc/deploy/example"}); + } + + auto n_threads = {1, 2, 4, 8, 16, 32, 40}; + for (const auto i : c10::irange(4, argc)) { + std::string model_file = argv[i]; + for (int n_thread : n_threads) { + if (n_thread > max_thread) { + continue; + } + for (std::string strategy : {"one_python", "multi_python", "jit"}) { + if (strategy == "jit") { + if (!jit_enable) { + continue; + } + if (!exists(model_file + "_jit")) { + continue; + } + } + Benchmark b(manager, n_thread, strategy, model_file); + Report r = b.run(); + r.report(std::cout); + } + } + } + return 0; +} diff --git a/torch/csrc/deploy/deploy.cpp b/torch/csrc/deploy/deploy.cpp index 647c9a4e810b..680b8541873f 100644 --- a/torch/csrc/deploy/deploy.cpp +++ b/torch/csrc/deploy/deploy.cpp @@ -1,6 +1,8 @@ -#include +#include #include #include +#include + #include #include @@ -54,12 +56,13 @@ static bool writeDeployInterpreter(FILE* dst) { std::ifstream("/proc/self/cmdline") >> exePath; ElfFile elfFile(exePath.c_str()); for (const auto& s : pythonInterpreterSection) { - at::optional

<Section> payloadSection = elfFile.findSection(s.sectionName); - if (payloadSection != at::nullopt) { + multipy::optional<Section>
payloadSection = + elfFile.findSection(s.sectionName); + if (payloadSection != multipy::nullopt) { payloadStart = payloadSection->start; customLoader = s.customLoader; size = payloadSection->len; - TORCH_CHECK(payloadSection.has_value(), "Missing the payload section"); + MULTIPY_CHECK(payloadSection.has_value(), "Missing the payload section"); break; } } @@ -74,10 +77,10 @@ static bool writeDeployInterpreter(FILE* dst) { break; } } - TORCH_CHECK( + MULTIPY_CHECK( libStart != nullptr && libEnd != nullptr, - "torch::deploy requires a build-time dependency on embedded_interpreter or embedded_interpreter_cuda, neither of which were found. torch::cuda::is_available()=", - torch::cuda::is_available()); + "torch::deploy requires a build-time dependency on embedded_interpreter or embedded_interpreter_cuda, neither of which were found. torch::cuda::is_available()=" + + std::to_string(torch::cuda::is_available())); size = libEnd - libStart; payloadStart = libStart; @@ -91,6 +94,8 @@ InterpreterManager::InterpreterManager( size_t nInterp, std::shared_ptr env) : resources_(nInterp) { + C10_LOG_API_USAGE_ONCE("torch.deploy.InterpreterManager"); + TORCH_DEPLOY_TRY for (const auto i : c10::irange(nInterp)) { instances_.emplace_back(this, env); @@ -99,12 +104,12 @@ InterpreterManager::InterpreterManager( // can be used for balancing work across GPUs I.global("torch", "version").attr("__setattr__")({"interp", int(i)}); instances_.back().pImpl_->setFindModule( - [this](const std::string& name) -> at::optional { + [this](const std::string& name) -> multipy::optional { auto it = registeredModuleSource_.find(name); if (it != registeredModuleSource_.end()) { return it->second; } else { - return at::nullopt; + return multipy::nullopt; } }); } @@ -189,11 +194,11 @@ void ReplicatedObj::unload(const Interpreter* onThisInterpreter) { ReplicatedObj InterpreterSession::createMovable(Obj obj) { TORCH_DEPLOY_TRY - TORCH_CHECK( + MULTIPY_CHECK( manager_, "Can only create a movable object when the session was created from an interpreter that is part of a InterpreterManager"); - TORCH_CHECK( + MULTIPY_CHECK( impl_->isOwner(obj), "Cannot create movable from an object that lives in different session"); @@ -214,6 +219,11 @@ using dlopen_t = void* (*)(const char*, int); // function. static dlopen_t find_real_dlopen() { void* libc = dlopen("libdl.so.2", RTLD_NOLOAD | RTLD_LAZY | RTLD_LOCAL); + // libdl is gone on some newer systems. + if (!libc) { + // libc.so won't open with dlopen because it's a linker script. 
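// Illustrative sketch (not part of the patch): the libdl -> libc fallback
// described in the comments above, as a standalone program. On newer systems
// libdl may be unavailable, so the lookup falls back to the versioned libc
// soname (dlopen("libc.so", ...) would fail because that file is a linker
// script). Link with -ldl on older glibc.
#include <dlfcn.h>
#include <cstdio>

typedef void* (*dlopen_t)(const char*, int);

int main() {
  void* handle = dlopen("libdl.so.2", RTLD_NOLOAD | RTLD_LAZY | RTLD_LOCAL);
  if (!handle) {
    handle = dlopen("libc.so.6", RTLD_NOLOAD | RTLD_LAZY | RTLD_LOCAL);
  }
  if (!handle) {
    std::fprintf(stderr, "could not locate an already-loaded libdl/libc\n");
    return 1;
  }
  auto real_dlopen = reinterpret_cast<dlopen_t>(dlsym(handle, "dlopen"));
  std::printf("dlopen symbol resolved: %s\n", real_dlopen ? "yes" : "no");
  return 0;
}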
+ libc = dlopen("libc.so.6", RTLD_NOLOAD | RTLD_LAZY | RTLD_LOCAL); + } TORCH_INTERNAL_ASSERT(libc); auto dlopen_ = (dlopen_t)dlsym(libc, "dlopen"); TORCH_INTERNAL_ASSERT(dlopen_); @@ -293,8 +303,7 @@ int LoadBalancer::acquire() { size_t minusers = SIZE_MAX; int minIdx = 0; for (size_t i = 0; i < n_; ++i, ++last) { - // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - if (last >= n_) { + if (last >= static_cast(n_)) { last = 0; } uint64_t prev = 0; diff --git a/torch/csrc/deploy/deploy.h b/torch/csrc/deploy/deploy.h index c6a4794a932d..b986093ed020 100644 --- a/torch/csrc/deploy/deploy.h +++ b/torch/csrc/deploy/deploy.h @@ -1,7 +1,7 @@ #pragma once -#include #include #include +#include #include #include #include @@ -95,7 +95,7 @@ struct TORCH_API LoadBalancer { } void setResourceLimit(size_t n) { TORCH_DEPLOY_TRY - TORCH_INTERNAL_ASSERT(n <= allocated_); + MULTIPY_INTERNAL_ASSERT(n <= allocated_); n_ = n; TORCH_DEPLOY_SAFE_CATCH_RETHROW } diff --git a/torch/csrc/deploy/elf_file.cpp b/torch/csrc/deploy/elf_file.cpp index 85eaaa19cc26..ca1e749868e5 100644 --- a/torch/csrc/deploy/elf_file.cpp +++ b/torch/csrc/deploy/elf_file.cpp @@ -1,5 +1,7 @@ #include +#include #include +#include namespace torch { namespace deploy { @@ -13,7 +15,7 @@ ElfFile::ElfFile(const char* filename) : memFile_(filename) { shdrList_ = (Elf64_Shdr*)(fileData + ehdr_->e_shoff); auto strtabSecNo = ehdr_->e_shstrndx; - TORCH_CHECK( + MULTIPY_CHECK( strtabSecNo >= 0 && strtabSecNo < numSections_, "e_shstrndx out of range"); @@ -25,9 +27,9 @@ ElfFile::ElfFile(const char* filename) : memFile_(filename) { } } -at::optional
<Section> ElfFile::findSection(const char* name) const { - TORCH_CHECK(name != nullptr, "Null name"); - at::optional<Section> found = at::nullopt; +multipy::optional<Section> ElfFile::findSection(const char* name) const { + MULTIPY_CHECK(name != nullptr, "Null name"); + multipy::optional<Section>
found = multipy::nullopt; for (const auto& section : sections_) { if (strcmp(name, section.name) == 0) { found = section; @@ -40,13 +42,13 @@ at::optional
ElfFile::findSection(const char* name) const { void ElfFile::checkFormat() const { // check the magic numbers - TORCH_CHECK( + MULTIPY_CHECK( (ehdr_->e_ident[EI_MAG0] == ELFMAG0) && (ehdr_->e_ident[EI_MAG1] == ELFMAG1) && (ehdr_->e_ident[EI_MAG2] == ELFMAG2) && (ehdr_->e_ident[EI_MAG3] == ELFMAG3), "Unexpected magic numbers"); - TORCH_CHECK( + MULTIPY_CHECK( ehdr_->e_ident[EI_CLASS] == ELFCLASS64, "Only support 64bit ELF file"); } diff --git a/torch/csrc/deploy/elf_file.h b/torch/csrc/deploy/elf_file.h index e27750c01139..31ea7976af88 100644 --- a/torch/csrc/deploy/elf_file.h +++ b/torch/csrc/deploy/elf_file.h @@ -1,7 +1,8 @@ #pragma once -#include #include +#include +#include #include #include @@ -30,7 +31,7 @@ struct Section { class ElfFile { public: explicit ElfFile(const char* filename); - at::optional
<Section> findSection(const char* name) const; + multipy::optional<Section>
findSection(const char* name) const; private: Section toSection(Elf64_Shdr* shdr) { @@ -40,7 +41,7 @@ class ElfFile { const char* name = ""; if (strtabSection_) { - TORCH_CHECK(nameOff >= 0 && nameOff < strtabSection_.len); + MULTIPY_CHECK(nameOff >= 0 && nameOff < strtabSection_.len); name = strtabSection_.start + nameOff; } const char* start = memFile_.data() + shOff; @@ -48,7 +49,7 @@ class ElfFile { } [[nodiscard]] const char* str(size_t off) const { - TORCH_CHECK(off < strtabSection_.len, "String table index out of range"); + MULTIPY_CHECK(off < strtabSection_.len, "String table index out of range"); return strtabSection_.start + off; } void checkFormat() const; diff --git a/torch/csrc/deploy/environment.h b/torch/csrc/deploy/environment.h index 4485a4e1d031..433ce6bcb3f6 100644 --- a/torch/csrc/deploy/environment.h +++ b/torch/csrc/deploy/environment.h @@ -1,5 +1,6 @@ #pragma once #include +#include #include #include #include @@ -27,7 +28,7 @@ class Environment { // load the zipped torch modules constexpr const char* ZIPPED_TORCH_NAME = ".torch_python_modules"; auto zippedTorchSection = elfFile.findSection(ZIPPED_TORCH_NAME); - TORCH_CHECK( + MULTIPY_CHECK( zippedTorchSection.has_value(), "Missing the zipped torch section"); const char* zippedTorchStart = zippedTorchSection->start; auto zippedTorchSize = zippedTorchSection->len; @@ -35,7 +36,7 @@ class Environment { std::string zipArchive = std::string(pythonAppDir) + "/torch_python_modules.zip"; auto zippedFile = fopen(zipArchive.c_str(), "wb"); - TORCH_CHECK( + MULTIPY_CHECK( zippedFile != nullptr, "Fail to create file: ", strerror(errno)); fwrite(zippedTorchStart, 1, zippedTorchSize, zippedFile); fclose(zippedFile); diff --git a/torch/csrc/deploy/example/examples.py b/torch/csrc/deploy/example/examples.py index 25bb54a0c606..73eeb2149b54 100644 --- a/torch/csrc/deploy/example/examples.py +++ b/torch/csrc/deploy/example/examples.py @@ -146,8 +146,7 @@ class MultiReturn(torch.nn.Module): def __init__(self): super(MultiReturn, self).__init__() - def forward(self, t): - # type: (Tuple[Tensor, Tensor]) -> Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, Tensor]] + def forward(self, t: Tuple[Tensor, Tensor]) -> Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, Tensor]]: a, b = t result = ((a.masked_fill_(b, 0.1), b), (torch.ones_like(a), b)) return result diff --git a/torch/csrc/deploy/interpreter/CMakeLists.txt b/torch/csrc/deploy/interpreter/CMakeLists.txt index 7f808335c82d..33b71e348396 100644 --- a/torch/csrc/deploy/interpreter/CMakeLists.txt +++ b/torch/csrc/deploy/interpreter/CMakeLists.txt @@ -1,8 +1,11 @@ SET(INTERPRETER_DIR "${DEPLOY_DIR}/interpreter" ) SET(INTERPRETER_DIR "${DEPLOY_DIR}/interpreter" PARENT_SCOPE) - SET(PYTORCH_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../../../../") +if(NOT TORCH_INSTALL_LIB_DIR) + set(TORCH_INSTALL_LIB_DIR lib) +endif() + # Build cpython SET(PYTHON_INSTALL_DIR "${INTERPRETER_DIR}/cpython") SET(PYTHON_INC_DIR "${PYTHON_INSTALL_DIR}/include/python3.8") @@ -109,3 +112,6 @@ target_link_libraries(torch_deployinterpreter PRIVATE ${PYTHON_LIB} ${PYTHON_STD target_link_libraries(torch_deployinterpreter PRIVATE fmt::fmt-header-only protobuf::libprotobuf-lite) target_link_libraries(torch_deployinterpreter PRIVATE ${PYTHON_INSTALL_DIR}/lib/libssl.a ${PYTHON_INSTALL_DIR}/lib/libcrypto.a) target_link_libraries(torch_deployinterpreter PRIVATE pybind::pybind11) + +# expose torch_python_static for multipy +install(TARGETS torch_python_static DESTINATION "${TORCH_INSTALL_LIB_DIR}") diff --git 
a/torch/csrc/deploy/interpreter/Optional.hpp b/torch/csrc/deploy/interpreter/Optional.hpp new file mode 100644 index 000000000000..92b73d7f6fbb --- /dev/null +++ b/torch/csrc/deploy/interpreter/Optional.hpp @@ -0,0 +1,1107 @@ +// Copyright (C) 2011 - 2012 Andrzej Krzemienski. +// +// Use, modification, and distribution is subject to the Boost Software +// License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +// +// The idea and interface is based on Boost.Optional library +// authored by Fernando Luis Cacciola Carballal +// +// Source: https://github.com/akrzemi1/Optional + +#ifndef ___OPTIONAL_HPP___ +#define ___OPTIONAL_HPP___ + +#include +#include +#include +#include +#include +#include +#include + +#define TR2_OPTIONAL_REQUIRES(...) \ + typename std::enable_if<__VA_ARGS__::value, bool>::type = false + +#if defined __GNUC__ // NOTE: GNUC is also defined for Clang +#if (__GNUC__ == 4) && (__GNUC_MINOR__ >= 8) +#define TR2_OPTIONAL_GCC_4_8_AND_HIGHER___ +#elif (__GNUC__ > 4) +#define TR2_OPTIONAL_GCC_4_8_AND_HIGHER___ +#endif + +#if (__GNUC__ == 4) && (__GNUC_MINOR__ >= 7) +#define TR2_OPTIONAL_GCC_4_7_AND_HIGHER___ +#elif (__GNUC__ > 4) +#define TR2_OPTIONAL_GCC_4_7_AND_HIGHER___ +#endif + +#if (__GNUC__ == 4) && (__GNUC_MINOR__ == 8) && (__GNUC_PATCHLEVEL__ >= 1) +#define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +#elif (__GNUC__ == 4) && (__GNUC_MINOR__ >= 9) +#define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +#elif (__GNUC__ > 4) +#define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +#endif +#endif + +#if defined __clang_major__ +#if (__clang_major__ == 3 && __clang_minor__ >= 5) +#define TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ +#elif (__clang_major__ > 3) +#define TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ +#endif +#if defined TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ +#define TR2_OPTIONAL_CLANG_3_4_2_AND_HIGHER_ +#elif ( \ + __clang_major__ == 3 && __clang_minor__ == 4 && __clang_patchlevel__ >= 2) +#define TR2_OPTIONAL_CLANG_3_4_2_AND_HIGHER_ +#endif +#endif + +#if defined _MSC_VER +#if (_MSC_VER >= 1900) +#define TR2_OPTIONAL_MSVC_2015_AND_HIGHER___ +#endif +#endif + +#if defined __clang__ +#if (__clang_major__ > 2) || (__clang_major__ == 2) && (__clang_minor__ >= 9) +#define OPTIONAL_HAS_THIS_RVALUE_REFS 1 +#else +#define OPTIONAL_HAS_THIS_RVALUE_REFS 0 +#endif +#elif defined TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +#define OPTIONAL_HAS_THIS_RVALUE_REFS 1 +#elif defined TR2_OPTIONAL_MSVC_2015_AND_HIGHER___ +#define OPTIONAL_HAS_THIS_RVALUE_REFS 1 +#else +#define OPTIONAL_HAS_THIS_RVALUE_REFS 0 +#endif + +#if defined TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +#define OPTIONAL_HAS_CONSTEXPR_INIT_LIST 1 +#define OPTIONAL_CONSTEXPR_INIT_LIST constexpr +#else +#define OPTIONAL_HAS_CONSTEXPR_INIT_LIST 0 +#define OPTIONAL_CONSTEXPR_INIT_LIST +#endif + +#if defined TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ && (defined __cplusplus) && \ + (__cplusplus != 201103L) +#define OPTIONAL_HAS_MOVE_ACCESSORS 1 +#else +#define OPTIONAL_HAS_MOVE_ACCESSORS 0 +#endif + +// In C++11 constexpr implies const, so we need to make non-const members also +// non-constexpr +#if (defined __cplusplus) && (__cplusplus == 201103L) +#define OPTIONAL_MUTABLE_CONSTEXPR +#else +#define OPTIONAL_MUTABLE_CONSTEXPR constexpr +#endif + +namespace multipy { + +// BEGIN workaround for missing std::is_trivially_destructible +#if defined TR2_OPTIONAL_GCC_4_8_AND_HIGHER___ +// leave it: it is already there +#elif defined TR2_OPTIONAL_CLANG_3_4_2_AND_HIGHER_ +// leave it: it is already there +#elif defined 
TR2_OPTIONAL_MSVC_2015_AND_HIGHER___ +// leave it: it is already there +#elif defined TR2_OPTIONAL_DISABLE_EMULATION_OF_TYPE_TRAITS +// leave it: the user doesn't want it +#else +template +using std::is_trivially_destructible = std::has_trivial_destructor; +#endif +// END workaround for missing std::is_trivially_destructible + +#if (defined TR2_OPTIONAL_GCC_4_7_AND_HIGHER___) +// leave it; our metafunctions are already defined. +#elif defined TR2_OPTIONAL_CLANG_3_4_2_AND_HIGHER_ +// leave it; our metafunctions are already defined. +#elif defined TR2_OPTIONAL_MSVC_2015_AND_HIGHER___ +// leave it: it is already there +#elif defined TR2_OPTIONAL_DISABLE_EMULATION_OF_TYPE_TRAITS +// leave it: the user doesn't want it +#else + +// workaround for missing traits in GCC and CLANG +template +struct std::is_nothrow_move_constructible { + constexpr static bool value = std::is_nothrow_constructible::value; +}; + +template +struct is_assignable { + template + constexpr static bool has_assign(...) { + return false; + } + + template < + class X, + class Y, + size_t S = sizeof((std::declval() = std::declval(), true))> + // the comma operator is necessary for the cases where operator= returns void + constexpr static bool has_assign(bool) { + return true; + } + + constexpr static bool value = has_assign(true); +}; + +template +struct std::is_nothrow_move_assignable { + template + struct has_nothrow_move_assign { + constexpr static bool value = false; + }; + + template + struct has_nothrow_move_assign { + constexpr static bool value = + noexcept(std::declval() = std::declval()); + }; + + constexpr static bool value = + has_nothrow_move_assign::value>::value; +}; +// end workaround + +#endif + +// 20.5.4, optional for object types +template +class optional; + +// 20.5.5, optional for lvalue reference types +template +class optional; + +// workaround: std utility functions aren't constexpr yet +template +inline constexpr T&& constexpr_forward( + typename std::remove_reference::type& t) noexcept { + return static_cast(t); +} + +template +inline constexpr T&& constexpr_forward( + typename std::remove_reference::type&& t) noexcept { + static_assert(!std::is_lvalue_reference::value, "!!"); + return static_cast(t); +} + +template +inline constexpr typename std::remove_reference::type&& constexpr_move( + T&& t) noexcept { + return static_cast::type&&>(t); +} + +#if defined NDEBUG +#define TR2_OPTIONAL_ASSERTED_EXPRESSION(CHECK, EXPR) (EXPR) +#else +#define TR2_OPTIONAL_ASSERTED_EXPRESSION(CHECK, EXPR) \ + ((CHECK) ? (EXPR) : ([] { assert(!#CHECK); }(), (EXPR))) +#endif + +namespace detail_ { + +// static_addressof: a constexpr version of addressof +template +struct has_overloaded_addressof { + template + constexpr static bool has_overload(...) 
{ + return false; + } + + template ().operator&())> + constexpr static bool has_overload(bool) { + return true; + } + + constexpr static bool value = has_overload(true); +}; + +template )> +constexpr T* static_addressof(T& ref) { + return &ref; +} + +template )> +T* static_addressof(T& ref) { + return std::addressof(ref); +} + +// the call to convert(b) has return type A and converts b to type A iff b +// decltype(b) is implicitly convertible to A +template +constexpr U convert(U v) { + return v; +} + +namespace swap_ns { +using std::swap; + +template +void adl_swap(T& t, T& u) noexcept(noexcept(swap(t, u))) { + swap(t, u); +} + +} // namespace swap_ns + +} // namespace detail_ + +constexpr struct trivial_init_t { +} trivial_init{}; + +// 20.5.6, In-place construction +constexpr struct in_place_t { +} in_place{}; + +// 20.5.7, Disengaged state indicator +struct nullopt_t { + struct init {}; + constexpr explicit nullopt_t(init) {} +}; +constexpr nullopt_t nullopt{nullopt_t::init()}; + +// 20.5.8, class bad_optional_access +class bad_optional_access : public std::logic_error { + public: + explicit bad_optional_access(const std::string& what_arg) + : std::logic_error{what_arg} {} + explicit bad_optional_access(const char* what_arg) + : std::logic_error{what_arg} {} +}; + +template +union storage_t { + unsigned char dummy_; + T value_; + + constexpr storage_t(trivial_init_t) noexcept : dummy_(){}; + + template + constexpr storage_t(Args&&... args) + : value_(constexpr_forward(args)...) {} + + ~storage_t() {} +}; + +template +union constexpr_storage_t { + unsigned char dummy_; + T value_; + + constexpr constexpr_storage_t(trivial_init_t) noexcept : dummy_(){}; + + template + constexpr constexpr_storage_t(Args&&... args) + : value_(constexpr_forward(args)...) {} + + ~constexpr_storage_t() = default; +}; + +template +struct optional_base { + bool init_; + storage_t storage_; + + constexpr optional_base() noexcept : init_(false), storage_(trivial_init){}; + + explicit constexpr optional_base(const T& v) : init_(true), storage_(v) {} + + explicit constexpr optional_base(T&& v) + : init_(true), storage_(constexpr_move(v)) {} + + template + explicit optional_base(in_place_t, Args&&... args) + : init_(true), storage_(constexpr_forward(args)...) {} + + template < + class U, + class... Args, + TR2_OPTIONAL_REQUIRES(std::is_constructible>)> + explicit optional_base( + in_place_t, + std::initializer_list il, + Args&&... args) + : init_(true), storage_(il, std::forward(args)...) {} + + ~optional_base() { + if (init_) + storage_.value_.T::~T(); + } +}; + +template +struct constexpr_optional_base { + bool init_; + constexpr_storage_t storage_; + + constexpr constexpr_optional_base() noexcept + : init_(false), storage_(trivial_init){}; + + explicit constexpr constexpr_optional_base(const T& v) + : init_(true), storage_(v) {} + + explicit constexpr constexpr_optional_base(T&& v) + : init_(true), storage_(constexpr_move(v)) {} + + template + explicit constexpr constexpr_optional_base(in_place_t, Args&&... args) + : init_(true), storage_(constexpr_forward(args)...) {} + + template < + class U, + class... Args, + TR2_OPTIONAL_REQUIRES(std::is_constructible>)> + OPTIONAL_CONSTEXPR_INIT_LIST explicit constexpr_optional_base( + in_place_t, + std::initializer_list il, + Args&&... args) + : init_(true), storage_(il, std::forward(args)...) 
{} + + ~constexpr_optional_base() = default; +}; + +template +using OptionalBase = typename std::conditional< + std::is_trivially_destructible::value, // if possible + constexpr_optional_base::type>, // use base with trivial destructor + optional_base::type>>::type; + +template +class optional : private OptionalBase { + static_assert( + !std::is_same::type, nullopt_t>::value, + "bad T"); + static_assert( + !std::is_same::type, in_place_t>::value, + "bad T"); + + constexpr bool initialized() const noexcept { + return OptionalBase::init_; + } + typename std::remove_const::type* dataptr() { + return std::addressof(OptionalBase::storage_.value_); + } + constexpr const T* dataptr() const { + return detail_::static_addressof(OptionalBase::storage_.value_); + } + +#if OPTIONAL_HAS_THIS_RVALUE_REFS == 1 + constexpr const T& contained_val() const& { + return OptionalBase::storage_.value_; + } +#if OPTIONAL_HAS_MOVE_ACCESSORS == 1 + OPTIONAL_MUTABLE_CONSTEXPR T&& contained_val() && { + return std::move(OptionalBase::storage_.value_); + } + OPTIONAL_MUTABLE_CONSTEXPR T& contained_val() & { + return OptionalBase::storage_.value_; + } +#else + T& contained_val() & { + return OptionalBase::storage_.value_; + } + T&& contained_val() && { + return std::move(OptionalBase::storage_.value_); + } +#endif +#else + constexpr const T& contained_val() const { + return OptionalBase::storage_.value_; + } + T& contained_val() { + return OptionalBase::storage_.value_; + } +#endif + + void clear() noexcept { + if (initialized()) + dataptr()->T::~T(); + OptionalBase::init_ = false; + } + + template + void initialize(Args&&... args) noexcept( + noexcept(T(std::forward(args)...))) { + assert(!OptionalBase::init_); + ::new (static_cast(dataptr())) T(std::forward(args)...); + OptionalBase::init_ = true; + } + + template + void initialize(std::initializer_list il, Args&&... args) noexcept( + noexcept(T(il, std::forward(args)...))) { + assert(!OptionalBase::init_); + ::new (static_cast(dataptr())) T(il, std::forward(args)...); + OptionalBase::init_ = true; + } + + public: + typedef T value_type; + + // 20.5.5.1, constructors + constexpr optional() noexcept : OptionalBase(){}; + constexpr optional(nullopt_t) noexcept : OptionalBase(){}; + + optional(const optional& rhs) : OptionalBase() { + if (rhs.initialized()) { + ::new (static_cast(dataptr())) T(*rhs); + OptionalBase::init_ = true; + } + } + + optional(optional&& rhs) noexcept( + std::is_nothrow_move_constructible::value) + : OptionalBase() { + if (rhs.initialized()) { + ::new (static_cast(dataptr())) T(std::move(*rhs)); + OptionalBase::init_ = true; + } + } + + constexpr optional(const T& v) : OptionalBase(v) {} + + constexpr optional(T&& v) : OptionalBase(constexpr_move(v)) {} + + template + explicit constexpr optional(in_place_t, Args&&... args) + : OptionalBase(in_place_t{}, constexpr_forward(args)...) {} + + template < + class U, + class... Args, + TR2_OPTIONAL_REQUIRES(std::is_constructible>)> + OPTIONAL_CONSTEXPR_INIT_LIST explicit optional( + in_place_t, + std::initializer_list il, + Args&&... args) + : OptionalBase(in_place_t{}, il, constexpr_forward(args)...) 
{} + + // 20.5.4.2, Destructor + ~optional() = default; + + // 20.5.4.3, assignment + optional& operator=(nullopt_t) noexcept { + clear(); + return *this; + } + + optional& operator=(const optional& rhs) { + if (initialized() == true && rhs.initialized() == false) + clear(); + else if (initialized() == false && rhs.initialized() == true) + initialize(*rhs); + else if (initialized() == true && rhs.initialized() == true) + contained_val() = *rhs; + return *this; + } + + optional& operator=(optional&& rhs) noexcept( + std::is_nothrow_move_assignable::value&& + std::is_nothrow_move_constructible::value) { + if (initialized() == true && rhs.initialized() == false) + clear(); + else if (initialized() == false && rhs.initialized() == true) + initialize(std::move(*rhs)); + else if (initialized() == true && rhs.initialized() == true) + contained_val() = std::move(*rhs); + return *this; + } + + template + auto operator=(U&& v) -> typename std::enable_if< + std::is_same::type, T>::value, + optional&>::type { + if (initialized()) { + contained_val() = std::forward(v); + } else { + initialize(std::forward(v)); + } + return *this; + } + + template + void emplace(Args&&... args) { + clear(); + initialize(std::forward(args)...); + } + + template + void emplace(std::initializer_list il, Args&&... args) { + clear(); + initialize(il, std::forward(args)...); + } + + // 20.5.4.4, Swap + void swap(optional& rhs) noexcept( + std::is_nothrow_move_constructible::value&& noexcept( + detail_::swap_ns::adl_swap(std::declval(), std::declval()))) { + if (initialized() == true && rhs.initialized() == false) { + rhs.initialize(std::move(**this)); + clear(); + } else if (initialized() == false && rhs.initialized() == true) { + initialize(std::move(*rhs)); + rhs.clear(); + } else if (initialized() == true && rhs.initialized() == true) { + using std::swap; + swap(**this, *rhs); + } + } + + // 20.5.4.5, Observers + + explicit constexpr operator bool() const noexcept { + return initialized(); + } + constexpr bool has_value() const noexcept { + return initialized(); + } + + constexpr T const* operator->() const { + return TR2_OPTIONAL_ASSERTED_EXPRESSION(initialized(), dataptr()); + } + +#if OPTIONAL_HAS_MOVE_ACCESSORS == 1 + + OPTIONAL_MUTABLE_CONSTEXPR T* operator->() { + assert(initialized()); + return dataptr(); + } + + constexpr T const& operator*() const& { + return TR2_OPTIONAL_ASSERTED_EXPRESSION(initialized(), contained_val()); + } + + OPTIONAL_MUTABLE_CONSTEXPR T& operator*() & { + assert(initialized()); + return contained_val(); + } + + OPTIONAL_MUTABLE_CONSTEXPR T&& operator*() && { + assert(initialized()); + return constexpr_move(contained_val()); + } + + constexpr T const& value() const& { + return initialized() + ? contained_val() + : (throw bad_optional_access("bad optional access"), contained_val()); + } + + OPTIONAL_MUTABLE_CONSTEXPR T& value() & { + return initialized() + ? contained_val() + : (throw bad_optional_access("bad optional access"), contained_val()); + } + + OPTIONAL_MUTABLE_CONSTEXPR T&& value() && { + if (!initialized()) + throw bad_optional_access("bad optional access"); + return std::move(contained_val()); + } + +#else + + T* operator->() { + assert(initialized()); + return dataptr(); + } + + constexpr T const& operator*() const { + return TR2_OPTIONAL_ASSERTED_EXPRESSION(initialized(), contained_val()); + } + + T& operator*() { + assert(initialized()); + return contained_val(); + } + + constexpr T const& value() const { + return initialized() + ? 
contained_val() + : (throw bad_optional_access("bad optional access"), contained_val()); + } + + T& value() { + return initialized() + ? contained_val() + : (throw bad_optional_access("bad optional access"), contained_val()); + } + +#endif + +#if OPTIONAL_HAS_THIS_RVALUE_REFS == 1 + + template + constexpr T value_or(V&& v) const& { + return *this ? **this : detail_::convert(constexpr_forward(v)); + } + +#if OPTIONAL_HAS_MOVE_ACCESSORS == 1 + + template + OPTIONAL_MUTABLE_CONSTEXPR T value_or(V&& v) && { + return *this + ? constexpr_move(const_cast&>(*this).contained_val()) + : detail_::convert(constexpr_forward(v)); + } + +#else + + template + T value_or(V&& v) && { + return *this + ? constexpr_move(const_cast&>(*this).contained_val()) + : detail_::convert(constexpr_forward(v)); + } + +#endif + +#else + + template + constexpr T value_or(V&& v) const { + return *this ? **this : detail_::convert(constexpr_forward(v)); + } + +#endif + + // 20.6.3.6, modifiers + void reset() noexcept { + clear(); + } +}; + +template +class optional { + static_assert(!std::is_same::value, "bad T"); + static_assert(!std::is_same::value, "bad T"); + T* ref; + + public: + // 20.5.5.1, construction/destruction + constexpr optional() noexcept : ref(nullptr) {} + + constexpr optional(nullopt_t) noexcept : ref(nullptr) {} + + constexpr optional(T& v) noexcept : ref(detail_::static_addressof(v)) {} + + optional(T&&) = delete; + + constexpr optional(const optional& rhs) noexcept : ref(rhs.ref) {} + + explicit constexpr optional(in_place_t, T& v) noexcept + : ref(detail_::static_addressof(v)) {} + + explicit optional(in_place_t, T&&) = delete; + + ~optional() = default; + + // 20.5.5.2, mutation + optional& operator=(nullopt_t) noexcept { + ref = nullptr; + return *this; + } + + // optional& operator=(const optional& rhs) noexcept { + // ref = rhs.ref; + // return *this; + // } + + // optional& operator=(optional&& rhs) noexcept { + // ref = rhs.ref; + // return *this; + // } + + template + auto operator=(U&& rhs) noexcept -> typename std::enable_if< + std::is_same::type, optional>::value, + optional&>::type { + ref = rhs.ref; + return *this; + } + + template + auto operator=(U&& rhs) noexcept -> typename std::enable_if< + !std::is_same::type, optional>::value, + optional&>::type = delete; + + void emplace(T& v) noexcept { + ref = detail_::static_addressof(v); + } + + void emplace(T&&) = delete; + + void swap(optional& rhs) noexcept { + std::swap(ref, rhs.ref); + } + + // 20.5.5.3, observers + constexpr T* operator->() const { + return TR2_OPTIONAL_ASSERTED_EXPRESSION(ref, ref); + } + + constexpr T& operator*() const { + return TR2_OPTIONAL_ASSERTED_EXPRESSION(ref, *ref); + } + + constexpr T& value() const { + return ref ? *ref + : (throw bad_optional_access("bad optional access"), *ref); + } + + explicit constexpr operator bool() const noexcept { + return ref != nullptr; + } + + constexpr bool has_value() const noexcept { + return ref != nullptr; + } + + template + constexpr typename std::decay::type value_or(V&& v) const { + return *this ? **this + : detail_::convert::type>( + constexpr_forward(v)); + } + + // x.x.x.x, modifiers + void reset() noexcept { + ref = nullptr; + } +}; + +template +class optional { + static_assert(sizeof(T) == 0, "optional rvalue references disallowed"); +}; + +// 20.5.8, Relational operators +template +constexpr bool operator==(const optional& x, const optional& y) { + return bool(x) != bool(y) ? false : bool(x) == false ? 
true : *x == *y; +} + +template +constexpr bool operator!=(const optional& x, const optional& y) { + return !(x == y); +} + +template +constexpr bool operator<(const optional& x, const optional& y) { + return (!y) ? false : (!x) ? true : *x < *y; +} + +template +constexpr bool operator>(const optional& x, const optional& y) { + return (y < x); +} + +template +constexpr bool operator<=(const optional& x, const optional& y) { + return !(y < x); +} + +template +constexpr bool operator>=(const optional& x, const optional& y) { + return !(x < y); +} + +// 20.5.9, Comparison with nullopt +template +constexpr bool operator==(const optional& x, nullopt_t) noexcept { + return (!x); +} + +template +constexpr bool operator==(nullopt_t, const optional& x) noexcept { + return (!x); +} + +template +constexpr bool operator!=(const optional& x, nullopt_t) noexcept { + return bool(x); +} + +template +constexpr bool operator!=(nullopt_t, const optional& x) noexcept { + return bool(x); +} + +template +constexpr bool operator<(const optional&, nullopt_t) noexcept { + return false; +} + +template +constexpr bool operator<(nullopt_t, const optional& x) noexcept { + return bool(x); +} + +template +constexpr bool operator<=(const optional& x, nullopt_t) noexcept { + return (!x); +} + +template +constexpr bool operator<=(nullopt_t, const optional&) noexcept { + return true; +} + +template +constexpr bool operator>(const optional& x, nullopt_t) noexcept { + return bool(x); +} + +template +constexpr bool operator>(nullopt_t, const optional&) noexcept { + return false; +} + +template +constexpr bool operator>=(const optional&, nullopt_t) noexcept { + return true; +} + +template +constexpr bool operator>=(nullopt_t, const optional& x) noexcept { + return (!x); +} + +// 20.5.10, Comparison with T +template +constexpr bool operator==(const optional& x, const T& v) { + return bool(x) ? *x == v : false; +} + +template +constexpr bool operator==(const T& v, const optional& x) { + return bool(x) ? v == *x : false; +} + +template +constexpr bool operator!=(const optional& x, const T& v) { + return bool(x) ? *x != v : true; +} + +template +constexpr bool operator!=(const T& v, const optional& x) { + return bool(x) ? v != *x : true; +} + +template +constexpr bool operator<(const optional& x, const T& v) { + return bool(x) ? *x < v : true; +} + +template +constexpr bool operator>(const T& v, const optional& x) { + return bool(x) ? v > *x : true; +} + +template +constexpr bool operator>(const optional& x, const T& v) { + return bool(x) ? *x > v : false; +} + +template +constexpr bool operator<(const T& v, const optional& x) { + return bool(x) ? v < *x : false; +} + +template +constexpr bool operator>=(const optional& x, const T& v) { + return bool(x) ? *x >= v : false; +} + +template +constexpr bool operator<=(const T& v, const optional& x) { + return bool(x) ? v <= *x : false; +} + +template +constexpr bool operator<=(const optional& x, const T& v) { + return bool(x) ? *x <= v : true; +} + +template +constexpr bool operator>=(const T& v, const optional& x) { + return bool(x) ? v >= *x : true; +} + +// Comparison of optional with T +template +constexpr bool operator==(const optional& x, const T& v) { + return bool(x) ? *x == v : false; +} + +template +constexpr bool operator==(const T& v, const optional& x) { + return bool(x) ? v == *x : false; +} + +template +constexpr bool operator!=(const optional& x, const T& v) { + return bool(x) ? 
*x != v : true; +} + +template +constexpr bool operator!=(const T& v, const optional& x) { + return bool(x) ? v != *x : true; +} + +template +constexpr bool operator<(const optional& x, const T& v) { + return bool(x) ? *x < v : true; +} + +template +constexpr bool operator>(const T& v, const optional& x) { + return bool(x) ? v > *x : true; +} + +template +constexpr bool operator>(const optional& x, const T& v) { + return bool(x) ? *x > v : false; +} + +template +constexpr bool operator<(const T& v, const optional& x) { + return bool(x) ? v < *x : false; +} + +template +constexpr bool operator>=(const optional& x, const T& v) { + return bool(x) ? *x >= v : false; +} + +template +constexpr bool operator<=(const T& v, const optional& x) { + return bool(x) ? v <= *x : false; +} + +template +constexpr bool operator<=(const optional& x, const T& v) { + return bool(x) ? *x <= v : true; +} + +template +constexpr bool operator>=(const T& v, const optional& x) { + return bool(x) ? v >= *x : true; +} + +// Comparison of optional with T +template +constexpr bool operator==(const optional& x, const T& v) { + return bool(x) ? *x == v : false; +} + +template +constexpr bool operator==(const T& v, const optional& x) { + return bool(x) ? v == *x : false; +} + +template +constexpr bool operator!=(const optional& x, const T& v) { + return bool(x) ? *x != v : true; +} + +template +constexpr bool operator!=(const T& v, const optional& x) { + return bool(x) ? v != *x : true; +} + +template +constexpr bool operator<(const optional& x, const T& v) { + return bool(x) ? *x < v : true; +} + +template +constexpr bool operator>(const T& v, const optional& x) { + return bool(x) ? v > *x : true; +} + +template +constexpr bool operator>(const optional& x, const T& v) { + return bool(x) ? *x > v : false; +} + +template +constexpr bool operator<(const T& v, const optional& x) { + return bool(x) ? v < *x : false; +} + +template +constexpr bool operator>=(const optional& x, const T& v) { + return bool(x) ? *x >= v : false; +} + +template +constexpr bool operator<=(const T& v, const optional& x) { + return bool(x) ? v <= *x : false; +} + +template +constexpr bool operator<=(const optional& x, const T& v) { + return bool(x) ? *x <= v : true; +} + +template +constexpr bool operator>=(const T& v, const optional& x) { + return bool(x) ? v >= *x : true; +} + +// 20.5.12, Specialized algorithms +template +void swap(optional& x, optional& y) noexcept(noexcept(x.swap(y))) { + x.swap(y); +} + +template +constexpr optional::type> make_optional(T&& v) { + return optional::type>(constexpr_forward(v)); +} + +template +constexpr optional make_optional(std::reference_wrapper v) { + return optional(v.get()); +} + +} // namespace multipy + +namespace std { +template +struct hash> { + typedef typename hash::result_type result_type; + typedef multipy::optional argument_type; + + constexpr result_type operator()(argument_type const& arg) const { + return arg ? std::hash{}(*arg) : result_type{}; + } +}; + +template +struct hash> { + typedef typename hash::result_type result_type; + typedef multipy::optional argument_type; + + constexpr result_type operator()(argument_type const& arg) const { + return arg ? 
std::hash{}(*arg) : result_type{}; + } +}; +} // namespace std + +#undef TR2_OPTIONAL_REQUIRES +#undef TR2_OPTIONAL_ASSERTED_EXPRESSION + +#endif //___OPTIONAL_HPP___ diff --git a/torch/csrc/deploy/interpreter/builtin_registry.cpp b/torch/csrc/deploy/interpreter/builtin_registry.cpp index a34768c2a009..611def2e7490 100644 --- a/torch/csrc/deploy/interpreter/builtin_registry.cpp +++ b/torch/csrc/deploy/interpreter/builtin_registry.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include namespace torch { @@ -44,7 +45,7 @@ BuiltinRegistryItem::BuiltinRegistryItem( fprintf( stderr, - "torch::deploy builtin %s contains %d modules\n", + "torch::deploy builtin %s contains %u modules\n", name, numModules); } @@ -65,6 +66,7 @@ void BuiltinRegistry::runPreInitialization() { const char* metaPathSetupTemplate = R"PYTHON( import sys +from importlib.metadata import DistributionFinder, Distribution # We need to register a custom meta path finder because we are registering # `torch._C` as a builtin module. # @@ -73,12 +75,36 @@ import sys # are top-level imports. Since `torch._C` is a submodule of `torch`, the # BuiltinImporter skips it. class F: + MODULES = {<<>>} + def find_spec(self, fullname, path, target=None): - if fullname in [<<>>]: + if fullname in self.MODULES: # Load this module using `BuiltinImporter`, but set `path` to None # in order to trick it into loading our module. return sys.meta_path[1].find_spec(fullname, path=None, target=None) return None + + def find_distributions(self, context=DistributionFinder.Context()): + modules = {"torch"} | self.MODULES + # Insert dummy distribution records for each builtin module so + # importlib.metadata.version(...) works. + if context.name is None: + for name in modules: + yield DummyDistribution(name) + if context.name in modules: + yield DummyDistribution(context.name) + +class DummyDistribution(Distribution): + def __init__(self, name): + self._metadata = { + "Name": name, + "Version": "0.0.1+fake_multipy", + } + + @property + def metadata(self): + return self._metadata + sys.meta_path.insert(0, F()) )PYTHON"; @@ -86,9 +112,9 @@ void BuiltinRegistry::runPostInitialization() { TORCH_INTERNAL_ASSERT(Py_IsInitialized()); std::string metaPathSetupScript(metaPathSetupTemplate); std::string replaceKey = "<<>>"; - auto itr = metaPathSetupScript.find(replaceKey); - if (itr != std::string::npos) { - metaPathSetupScript.replace(itr, replaceKey.size(), getBuiltinModulesCSV()); + size_t pos = metaPathSetupScript.find(replaceKey); + if (pos != std::string::npos) { + metaPathSetupScript.replace(pos, replaceKey.size(), getBuiltinModulesCSV()); } int r = PyRun_SimpleString(metaPathSetupScript.c_str()); TORCH_INTERNAL_ASSERT(r == 0); @@ -109,8 +135,8 @@ BuiltinRegistryItem* BuiltinRegistry::getItem(const std::string& name) { : get()->items_[itr->second].get(); } -int BuiltinRegistry::totalNumModules() { - int tot = 0; +unsigned BuiltinRegistry::totalNumModules() { + unsigned tot = 0; for (const auto& itemptr : get()->items_) { tot += itemptr->numModules; } @@ -119,7 +145,7 @@ int BuiltinRegistry::totalNumModules() { struct _frozen* BuiltinRegistry::getAllFrozenModules() { /* Allocate new memory for the combined table */ - int totNumModules = totalNumModules(); + size_t totNumModules = totalNumModules(); struct _frozen* p = nullptr; if (totNumModules > 0 && totNumModules <= SIZE_MAX / sizeof(struct _frozen) - 1) { @@ -134,7 +160,7 @@ struct _frozen* BuiltinRegistry::getAllFrozenModules() { memset(&p[0], 0, sizeof(p[0])); /* Copy the tables into the new 
memory */ - int off = 0; + unsigned off = 0; for (const auto& itemptr : items()) { if (itemptr->numModules > 0) { memcpy( diff --git a/torch/csrc/deploy/interpreter/builtin_registry.h b/torch/csrc/deploy/interpreter/builtin_registry.h index da7eb372de84..5f2726db67b6 100644 --- a/torch/csrc/deploy/interpreter/builtin_registry.h +++ b/torch/csrc/deploy/interpreter/builtin_registry.h @@ -22,7 +22,7 @@ * BuiltinRegisterer object. The constructor of BuiltinRegisterer does the real * registration work. */ -#include +#include #include #include #include @@ -49,7 +49,7 @@ struct BuiltinRegistryItem { std::vector>&& _builtinModules); const char* name; const struct _frozen* frozenModules; - int numModules; + unsigned numModules; std::vector> builtinModules; }; @@ -77,7 +77,7 @@ class BuiltinRegistry { static const std::vector>& items() { return get()->items_; } - static int totalNumModules(); + static unsigned totalNumModules(); static BuiltinRegistry* get(); static BuiltinRegistryItem* getItem(const std::string& name); static std::vector> getAllBuiltinModules(); diff --git a/torch/csrc/deploy/interpreter/import_find_sharedfuncptr.cpp b/torch/csrc/deploy/interpreter/import_find_sharedfuncptr.cpp index b8af5de3db20..2a89a96c623d 100644 --- a/torch/csrc/deploy/interpreter/import_find_sharedfuncptr.cpp +++ b/torch/csrc/deploy/interpreter/import_find_sharedfuncptr.cpp @@ -1,4 +1,5 @@ #include +#include #include using torch::deploy::CustomLibrary; diff --git a/torch/csrc/deploy/interpreter/interpreter_impl.cpp b/torch/csrc/deploy/interpreter/interpreter_impl.cpp index 1ff30f0afbb0..2af33582aa6d 100644 --- a/torch/csrc/deploy/interpreter/interpreter_impl.cpp +++ b/torch/csrc/deploy/interpreter/interpreter_impl.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -219,8 +220,8 @@ struct __attribute__((visibility("hidden"))) ConcreteInterpreterImpl } void setFindModule( - std::function(const std::string&)> find_module) - override { + std::function(const std::string&)> + find_module) override { std::function wrapped_find_module = [=](const std::string& name) -> py::object { auto r = find_module(name); diff --git a/torch/csrc/deploy/interpreter/interpreter_impl.h b/torch/csrc/deploy/interpreter/interpreter_impl.h index 10a1489740ec..a2dd57e9beeb 100644 --- a/torch/csrc/deploy/interpreter/interpreter_impl.h +++ b/torch/csrc/deploy/interpreter/interpreter_impl.h @@ -3,6 +3,7 @@ #include #include #include +#include /* Torch Deploy intentionally embeds multiple copies of c++ libraries providing python bindings necessary for torch::deploy users in the same @@ -15,8 +16,8 @@ the client application. It is safe to throw exception types that are defined once in - the context of the client application, such as c10::Error, which is defined - in libtorch, which isn't duplicated in torch::deploy interpreters. + the context of the client application, such as std::runtime_error, + which isn't duplicated in torch::deploy interpreters. ==> Use TORCH_DEPLOY_TRY, _SAFE_CATCH_RETHROW around _ALL_ torch::deploy APIs @@ -30,20 +31,17 @@ */ #define TORCH_DEPLOY_TRY try { -#define TORCH_DEPLOY_SAFE_CATCH_RETHROW \ - } \ - catch (std::exception & err) { \ - throw c10::Error( \ - std::string( \ - "Exception Caught inside torch::deploy embedded library: \n") + \ - err.what(), \ - ""); \ - } \ - catch (...) 
{ \ - throw c10::Error( \ - std::string( \ - "Unknown Exception Caught inside torch::deploy embedded library"), \ - ""); \ +#define TORCH_DEPLOY_SAFE_CATCH_RETHROW \ + } \ + catch (std::exception & err) { \ + throw std::runtime_error( \ + std::string( \ + "Exception Caught inside torch::deploy embedded library: \n") + \ + err.what()); \ + } \ + catch (...) { \ + throw std::runtime_error(std::string( \ + "Unknown Exception Caught inside torch::deploy embedded library")); \ } namespace torch { namespace deploy { @@ -132,7 +130,7 @@ struct InterpreterSessionImpl { struct InterpreterImpl { virtual InterpreterSessionImpl* acquireSession() = 0; virtual void setFindModule( - std::function(const std::string&)> + std::function(const std::string&)> find_module) = 0; virtual ~InterpreterImpl() = default; // this will uninitialize python }; diff --git a/torch/csrc/deploy/loader.cpp b/torch/csrc/deploy/loader.cpp index f03a2d299a55..ab4d0c7c329e 100644 --- a/torch/csrc/deploy/loader.cpp +++ b/torch/csrc/deploy/loader.cpp @@ -53,8 +53,8 @@ // Get PAGE_SIZE and PAGE_MASK. #include -#include #include +#include #include #include @@ -300,15 +300,15 @@ struct __attribute__((visibility("hidden"))) SystemLibraryImpl SystemLibraryImpl(void* handle, bool steal) : handle_(handle), own_handle_(steal && handle != RTLD_DEFAULT) {} - at::optional sym(const char* name) const override { + multipy::optional sym(const char* name) const override { void* r = dlsym(handle_, name); if (!r) { - return at::nullopt; + return multipy::nullopt; } return (Elf64_Addr)r; } - at::optional tls_sym(const char* name) const override; + multipy::optional tls_sym(const char* name) const override; ~SystemLibraryImpl() override { if (own_handle_) { @@ -534,11 +534,11 @@ struct ElfDynamicInfo { } } - at::optional sym( + multipy::optional sym( const char* name, GnuHash* precomputed_hash = nullptr) const { if (!gnu_bucket_) { - return at::nullopt; // no hashtable was loaded + return multipy::nullopt; // no hashtable was loaded } GnuHash hash_obj = precomputed_hash ? *precomputed_hash : GnuHash(name); auto hash = hash_obj.hash; @@ -551,12 +551,12 @@ struct ElfDynamicInfo { const uint32_t h2 = (hash >> gnu_shift2_) % kBloomMaskBits; if ((1 & (bloom_word >> h1) & (bloom_word >> h2)) != 1) { - return at::nullopt; + return multipy::nullopt; } uint32_t sym_idx = gnu_bucket_[hash % gnu_nbucket_]; if (sym_idx == 0) { - return at::nullopt; + return multipy::nullopt; } uint32_t chain_value = 0; @@ -574,12 +574,12 @@ struct ElfDynamicInfo { ((ELF64_ST_TYPE(sym->st_info) == STT_TLS) ? 0 : load_bias_); } // symbol isn't defined - return at::nullopt; + return multipy::nullopt; } } ++sym_idx; } while ((chain_value & 1) == 0); - return at::nullopt; + return multipy::nullopt; } }; @@ -613,7 +613,7 @@ struct AlreadyLoadedSymTable { dyninfo_.initialize_from_dynamic_section(name, dynamic, load_bias, true); } - at::optional sym(const char* name) { + multipy::optional sym(const char* name) { return dyninfo_.sym(name); } }; @@ -626,8 +626,8 @@ static int iterate_cb(struct dl_phdr_info* info, size_t size, void* data) { // with a normal dlsym call. Instead we iterate through all loaded libraries and // check their symbol tables for the symbol. The value of the symbol is the TLS // offset. When we find the library we also get the module id. 
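// Illustrative sketch (not part of the patch): the dl_iterate_phdr() walk that
// the comment above relies on, reduced to a standalone example. The callback
// runs once per object the process has mapped; returning non-zero stops the
// iteration, so a real symbol lookup would return 1 as soon as a match is
// found. Assumes glibc (<link.h>); the names below are hypothetical.
#include <link.h>
#include <cstdio>

static int printLoadedObject(struct dl_phdr_info* info, size_t /*size*/, void* /*data*/) {
  std::printf(
      "loaded object: %s (load bias %#lx, %u program headers)\n",
      info->dlpi_name,
      static_cast<unsigned long>(info->dlpi_addr),
      static_cast<unsigned>(info->dlpi_phnum));
  return 0; // keep iterating over the remaining loaded objects
}

void listLoadedObjects() {
  dl_iterate_phdr(printLoadedObject, /*data=*/nullptr);
}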
-at::optional slow_find_tls_symbol_offset(const char* sym_name) { - at::optional result = at::nullopt; +multipy::optional slow_find_tls_symbol_offset(const char* sym_name) { + multipy::optional result = multipy::nullopt; std::function cb = [&](struct dl_phdr_info* info, size_t size) { // std::cout << "SEARCHING .. " << info->dlpi_name << "\n"; @@ -650,10 +650,11 @@ at::optional slow_find_tls_symbol_offset(const char* sym_name) { return result; } -at::optional SystemLibraryImpl::tls_sym(const char* name) const { +multipy::optional SystemLibraryImpl::tls_sym(const char* name) const { if (!sym(name)) { - return at::nullopt; // before we do a bunch of slow lookups to find the - // module_id, check that this even defines the symbol + return multipy::nullopt; // before we do a bunch of slow lookups to find the + // module_id, check that this even defines the + // symbol } if (handle_ == RTLD_DEFAULT) { return slow_find_tls_symbol_offset(name); @@ -675,7 +676,7 @@ at::optional SystemLibraryImpl::tls_sym(const char* name) const { "failed to query dlinfo for module_id"); return TLSIndex{module_id, *r}; } - return at::nullopt; + return multipy::nullopt; } // dlopen does not accept additional search paths as an argument. @@ -966,7 +967,7 @@ struct __attribute__((visibility("hidden"))) CustomLibraryImpl dyninfo_.needed_); } - at::optional lookup_symbol(Elf64_Xword r_info) { + multipy::optional lookup_symbol(Elf64_Xword r_info) { const uint32_t r_type = ELF64_R_TYPE(r_info); const uint32_t r_sym = ELF64_R_SYM(r_info); @@ -999,10 +1000,10 @@ struct __attribute__((visibility("hidden"))) CustomLibraryImpl name_.c_str(), sym_name); } - return at::nullopt; + return multipy::nullopt; } - at::optional tls_lookup_symbol(Elf64_Xword r_info) { + multipy::optional tls_lookup_symbol(Elf64_Xword r_info) { const uint32_t r_sym = ELF64_R_SYM(r_info); if (r_sym == 0) { @@ -1030,7 +1031,7 @@ struct __attribute__((visibility("hidden"))) CustomLibraryImpl name_.c_str(), sym_name); } - return at::nullopt; + return multipy::nullopt; } void relocate_one(const Elf64_Rela& reloc) { @@ -1177,16 +1178,16 @@ struct __attribute__((visibility("hidden"))) CustomLibraryImpl f(argc_, argv_, environ); } - at::optional sym(const char* name) const override { + multipy::optional sym(const char* name) const override { return dyninfo_.sym(name); } - at::optional tls_sym(const char* name) const override { + multipy::optional tls_sym(const char* name) const override { auto r = dyninfo_.sym(name); if (r) { return TLSIndex{module_id(), *r}; } - return at::nullopt; + return multipy::nullopt; } void* tls_addr(size_t offset) { diff --git a/torch/csrc/deploy/loader.h b/torch/csrc/deploy/loader.h index eeff1a30174e..9e5a7fd4571d 100644 --- a/torch/csrc/deploy/loader.h +++ b/torch/csrc/deploy/loader.h @@ -1,7 +1,7 @@ #pragma once -#include #include #include +#include #include namespace torch { @@ -19,8 +19,8 @@ struct TLSIndex { struct SymbolProvider { SymbolProvider() = default; - virtual at::optional sym(const char* name) const = 0; - virtual at::optional tls_sym(const char* name) const = 0; + virtual multipy::optional sym(const char* name) const = 0; + virtual multipy::optional tls_sym(const char* name) const = 0; SymbolProvider(const SymbolProvider&) = delete; SymbolProvider& operator=(const SymbolProvider&) = delete; virtual ~SymbolProvider() = default; diff --git a/torch/csrc/deploy/mem_file.h b/torch/csrc/deploy/mem_file.h index c50889f8353b..df4fe941ca58 100644 --- a/torch/csrc/deploy/mem_file.h +++ b/torch/csrc/deploy/mem_file.h @@ -1,9 
+1,9 @@ #pragma once -#include #include #include #include +#include #include #include #include @@ -20,18 +20,21 @@ namespace deploy { struct MemFile { explicit MemFile(const char* filename_) : fd_(0), mem_(nullptr), n_bytes_(0) { fd_ = open(filename_, O_RDONLY); - TORCH_CHECK(fd_ != -1, "failed to open {}: {}", filename_, strerror(errno)); + MULTIPY_CHECK( + fd_ != -1, "failed to open {}: {}" + filename_ + strerror(errno)); // NOLINTNEXTLINE struct stat s; if (-1 == fstat(fd_, &s)) { close(fd_); // destructors don't run during exceptions - TORCH_CHECK(false, "failed to stat {}: {}", filename_, strerror(errno)); + MULTIPY_CHECK( + false, "failed to stat {}: {}" + filename_ + strerror(errno)); } n_bytes_ = s.st_size; mem_ = mmap(nullptr, n_bytes_, PROT_READ, MAP_SHARED, fd_, 0); if (MAP_FAILED == mem_) { close(fd_); - TORCH_CHECK(false, "failed to mmap {}: {}", filename_, strerror(errno)); + MULTIPY_CHECK( + false, "failed to mmap {}: {}" + filename_ + strerror(errno)); } } MemFile(const MemFile&) = delete; diff --git a/torch/csrc/deploy/remove_dt_needed.cpp b/torch/csrc/deploy/remove_dt_needed.cpp index 5f4bb28c7c29..8b1cad535814 100644 --- a/torch/csrc/deploy/remove_dt_needed.cpp +++ b/torch/csrc/deploy/remove_dt_needed.cpp @@ -10,6 +10,7 @@ #include #include +#include #include #define ERROR(msg_fmt, ...) \ @@ -47,7 +48,7 @@ int main(int argc, const char** argv) { auto program_headers = (Elf64_Phdr*)(data + header->e_phoff); auto n_program_headers = header->e_phnum; Elf64_Dyn* dynamic = nullptr; - for (size_t i = 0; i < n_program_headers; ++i) { + for (const auto i : c10::irange(n_program_headers)) { const Elf64_Phdr* phdr = &program_headers[i]; if (phdr->p_type == PT_DYNAMIC) { dynamic = reinterpret_cast(data + phdr->p_offset); diff --git a/torch/csrc/deploy/test_deploy.cpp b/torch/csrc/deploy/test_deploy.cpp index 840720cc01f8..780937a51e7c 100644 --- a/torch/csrc/deploy/test_deploy.cpp +++ b/torch/csrc/deploy/test_deploy.cpp @@ -182,13 +182,14 @@ TEST(TorchpyTest, ErrorsReplicatingObj) { auto obj = session1.fromMovable(replicatedObj); // should throw an error when trying to access obj from different session // NOLINTNEXTLINE(hicpp-avoid-goto,cppcoreguidelines-avoid-goto) - EXPECT_THROW(session2.createMovable(obj), c10::Error); + EXPECT_THROW(session2.createMovable(obj), std::runtime_error); try { session2.createMovable(obj); - } catch (c10::Error& error) { + } catch (std::runtime_error& error) { EXPECT_TRUE( - error.msg().find( - "Cannot create movable from an object that lives in different session") != + std::string(error.what()) + .find( + "Cannot create movable from an object that lives in different session") != std::string::npos); } } @@ -197,15 +198,15 @@ TEST(TorchpyTest, ThrowsSafely) { // See explanation in deploy.h torch::deploy::InterpreterManager manager(3); // NOLINTNEXTLINE(hicpp-avoid-goto,cppcoreguidelines-avoid-goto) - EXPECT_THROW(manager.loadPackage("some garbage path"), c10::Error); + EXPECT_THROW(manager.loadPackage("some garbage path"), std::runtime_error); torch::deploy::Package p = manager.loadPackage(path("SIMPLE", simple)); // NOLINTNEXTLINE(hicpp-avoid-goto,cppcoreguidelines-avoid-goto) - EXPECT_THROW(p.loadPickle("some other", "garbage path"), c10::Error); + EXPECT_THROW(p.loadPickle("some other", "garbage path"), std::runtime_error); auto model = p.loadPickle("model", "model.pkl"); // NOLINTNEXTLINE(hicpp-avoid-goto,cppcoreguidelines-avoid-goto) - EXPECT_THROW(model(at::IValue("unexpected input")), c10::Error); + EXPECT_THROW(model(at::IValue("unexpected 
input")), std::runtime_error); } TEST(TorchpyTest, AcquireMultipleSessionsInTheSamePackage) { @@ -238,7 +239,7 @@ TEST(TorchpyTest, TensorSharingNotAllowed) { auto t = obj.toIValue().toTensor(); // try to feed it to the other interpreter, should error // NOLINTNEXTLINE(hicpp-avoid-goto,cppcoreguidelines-avoid-goto) - ASSERT_THROW(I1.global("torch", "sigmoid")({t}), c10::Error); + ASSERT_THROW(I1.global("torch", "sigmoid")({t}), std::runtime_error); } TEST(TorchpyTest, TaggingRace) { @@ -259,7 +260,7 @@ TEST(TorchpyTest, TaggingRace) { try { I.fromIValue(t); success++; - } catch (const c10::Error& e) { + } catch (const std::runtime_error& e) { failed++; } } @@ -279,7 +280,7 @@ TEST(TorchpyTest, DisarmHook) { torch::deploy::InterpreterManager m(1); auto I = m.acquireOne(); // NOLINTNEXTLINE(hicpp-avoid-goto,cppcoreguidelines-avoid-goto) - ASSERT_THROW(I.fromIValue(t), c10::Error); // NOT a segfault + ASSERT_THROW(I.fromIValue(t), std::runtime_error); // NOT a segfault } TEST(TorchpyTest, RegisterModule) { @@ -291,6 +292,7 @@ TEST(TorchpyTest, RegisterModule) { } } +#ifdef FBCODE_CAFFE2 TEST(TorchpyTest, FxModule) { size_t nthreads = 3; torch::deploy::InterpreterManager manager(nthreads); @@ -317,6 +319,7 @@ TEST(TorchpyTest, FxModule) { ASSERT_TRUE(ref_output.equal(outputs[i])); } } +#endif // Moving a tensor between interpreters should share the underlying storage. TEST(TorchpyTest, TensorSerializationSharing) { @@ -448,6 +451,18 @@ result = torch.Tensor([1,2,3]) EXPECT_TRUE(w_grad0.equal(w_grad1)); } +TEST(TorchpyTest, ImportlibMetadata) { + torch::deploy::InterpreterManager m(1); + m.registerModuleSource("importlib_test", R"PYTHON( +from importlib.metadata import version + +result = version("torch") +)PYTHON"); + auto I = m.allInstances()[0].acquireSession(); + auto ver = I.global("importlib_test", "result").toIValue().toString(); + ASSERT_EQ(ver->string(), "0.0.1+fake_multipy"); +} + // OSS build does not have bultin numpy support yet. Use this flag to guard the // test case. #if HAS_NUMPY @@ -479,6 +494,42 @@ TEST(TorchpyTest, TestPyYAML) { } #endif +TEST(TorchpyTest, PrintInstruction) { + const auto jit_script_with_print = R"JIT( + def forward(self, a): + print(a) + return a + a + )JIT"; + + auto input = torch::autograd::make_variable(at::randn({2, 3})); + auto expected_forward = input + input; + + auto module = std::make_shared( + "Module", std::make_shared()); + module->define(jit_script_with_print); + + std::vector inputs{at::IValue(input)}; + + // Checking that a module containing prim::Print() works fine. + auto result1 = (*module)(inputs); + EXPECT_TRUE(result1.toTensor().equal(expected_forward)); + + { + auto interpreterManager = + std::make_shared(1); + + // Checking that a module containing prim::Print() still works fine + // after Python environment was created. + auto result2 = (*module)(inputs); + EXPECT_TRUE(result2.toTensor().equal(expected_forward)); + } + + // Checking that a module containing prim::Print() still works fine + // after Python environment was created and then destroyed. 
+ auto result3 = (*module)(inputs); + EXPECT_TRUE(result3.toTensor().equal(expected_forward)); +} + int main(int argc, char* argv[]) { ::testing::InitGoogleTest(&argc, argv); int rc = RUN_ALL_TESTS(); diff --git a/torch/csrc/deploy/test_deploy_gpu.cpp b/torch/csrc/deploy/test_deploy_gpu.cpp index 8fa154b80709..48660c79fefa 100644 --- a/torch/csrc/deploy/test_deploy_gpu.cpp +++ b/torch/csrc/deploy/test_deploy_gpu.cpp @@ -67,6 +67,7 @@ TEST(TorchDeployGPUTest, UsesDistributed) { } } +#ifdef FBCODE_CAFFE2 TEST(TorchDeployGPUTest, TensorRT) { if (!torch::cuda::is_available()) { GTEST_SKIP(); @@ -85,6 +86,7 @@ TEST(TorchDeployGPUTest, TensorRT) { output.allclose(model(at::IValue{input}).toIValue().toTensor())); } } +#endif // OSS build does not have bultin numpy support yet. Use this flag to guard the // test case. diff --git a/torch/csrc/deploy/test_deploy_missing_interpreter.cpp b/torch/csrc/deploy/test_deploy_missing_interpreter.cpp index 8ac602a3f2fc..b47f4556ad78 100644 --- a/torch/csrc/deploy/test_deploy_missing_interpreter.cpp +++ b/torch/csrc/deploy/test_deploy_missing_interpreter.cpp @@ -10,5 +10,5 @@ int main(int argc, char* argv[]) { TEST(TorchDeployMissingInterpreter, Throws) { // NOLINTNEXTLINE(hicpp-avoid-goto,cppcoreguidelines-avoid-goto) - EXPECT_THROW(torch::deploy::InterpreterManager(1), c10::Error); + EXPECT_THROW(torch::deploy::InterpreterManager(1), std::runtime_error); } diff --git a/torch/csrc/deploy/unity/xar_environment.cpp b/torch/csrc/deploy/unity/xar_environment.cpp index 3ff233b0c420..4bb764374525 100644 --- a/torch/csrc/deploy/unity/xar_environment.cpp +++ b/torch/csrc/deploy/unity/xar_environment.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include @@ -59,7 +60,7 @@ bool _fileExists(const std::string& filePath) { } void XarEnvironment::setupPythonApp() { - TORCH_CHECK( + MULTIPY_CHECK( !alreadySetupPythonApp_, "Already setup the python application. It should only been done once!"); @@ -67,7 +68,8 @@ void XarEnvironment::setupPythonApp() { constexpr const char* SECTION_NAME = ".torch_deploy_payload.unity"; ElfFile elfFile(exePath_.c_str()); auto payloadSection = elfFile.findSection(SECTION_NAME); - TORCH_CHECK(payloadSection != at::nullopt, "Missing the payload section"); + MULTIPY_CHECK( + payloadSection != multipy::nullopt, "Missing the payload section"); const char* pythonAppPkgStart = payloadSection->start; auto pythonAppPkgSize = payloadSection->len; LOG(INFO) << "Embedded binary size " << pythonAppPkgSize; @@ -107,23 +109,26 @@ void XarEnvironment::setupPythonApp() { * past runs. It should be pretty safe to discard them. 
*/ std::string rmCmd = fmt::format("rm -rf {}", pythonAppDir_); - TORCH_CHECK(system(rmCmd.c_str()) == 0, "Fail to remove the directory."); + MULTIPY_CHECK(system(rmCmd.c_str()) == 0, "Fail to remove the directory."); // recreate the directory auto r = mkdir(pythonAppDir_.c_str(), 0777); - TORCH_CHECK(r == 0, "Failed to create directory: ", strerror(errno)); + MULTIPY_CHECK(r == 0, "Failed to create directory: " + strerror(errno)); std::string pythonAppArchive = std::string(pythonAppDir_) + "/python_app.xar"; auto fp = fopen(pythonAppArchive.c_str(), "wb"); - TORCH_CHECK(fp != nullptr, "Fail to create file: ", strerror(errno)); + MULTIPY_CHECK(fp != nullptr, "Fail to create file: " + strerror(errno)); auto written = fwrite(pythonAppPkgStart, 1, pythonAppPkgSize, fp); - TORCH_CHECK(written == pythonAppPkgSize, "Expected written == size"); + MULTIPY_CHECK(written == pythonAppPkgSize, "Expected written == size"); fclose(fp); std::string extractCommand = fmt::format( "unsquashfs -o 4096 -d {} {}", pythonAppRoot_, pythonAppArchive); r = system(extractCommand.c_str()); - TORCH_CHECK(r == 0, "Fail to extract the python package"); + MULTIPY_CHECK( + r == 0, + "Fail to extract the python package" + std::to_string(r) + + extractCommand.c_str()); alreadySetupPythonApp_ = true; } @@ -143,12 +148,9 @@ void XarEnvironment::preloadSharedLibraries() { << " does not exist in the python app root, skip loading it"; continue; } - TORCH_CHECK( + MULTIPY_CHECK( dlopen(preloadList[i], RTLD_GLOBAL | RTLD_LAZY) != nullptr, - "Fail to open the shared library ", - preloadList[i], - ": ", - dlerror()); + "Fail to open the shared library " + preloadList[i] + ": " + dlerror()); } } diff --git a/torch/csrc/distributed/autograd/rpc_messages/rpc_with_profiling_resp.cpp b/torch/csrc/distributed/autograd/rpc_messages/rpc_with_profiling_resp.cpp index 369f9f1242b3..c82d940cf3d3 100644 --- a/torch/csrc/distributed/autograd/rpc_messages/rpc_with_profiling_resp.cpp +++ b/torch/csrc/distributed/autograd/rpc_messages/rpc_with_profiling_resp.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -121,9 +122,9 @@ std::unique_ptr RpcWithProfilingResp::fromMessage( int profiledEventsSize = tupleElements[2].toInt(); std::vector remoteEvents; remoteEvents.reserve(profiledEventsSize); - for (int i = kProfileEventsStartIdx; - i < kProfileEventsStartIdx + profiledEventsSize; - ++i) { + for (const auto i : c10::irange( + kProfileEventsStartIdx, + kProfileEventsStartIdx + profiledEventsSize)) { // NOLINTNEXTLINE(clang-diagnostic-sign-compare) TORCH_CHECK(i < tupleElements.size()); // Reconstruct remote event from the ivalues. diff --git a/torch/csrc/distributed/c10d/NCCLUtils.cpp b/torch/csrc/distributed/c10d/NCCLUtils.cpp index 0c1dd97a1468..568c23ef7a20 100644 --- a/torch/csrc/distributed/c10d/NCCLUtils.cpp +++ b/torch/csrc/distributed/c10d/NCCLUtils.cpp @@ -36,9 +36,12 @@ std::string getNcclVersion() { if (status != ncclSuccess || version < 100) { versionString = "Unknown NCCL version"; } else { - auto ncclMajor = version / 1000; - auto ncclMinor = (version % 1000) / 100; - auto ncclPatch = version % (ncclMajor * 1000 + ncclMinor * 100); + // NCCL changed version coding starting 2.9 + const int majorBase = version < 2900 ? 1000 : 10000; + const int minorBase = 100; + auto ncclMajor = version / majorBase; + auto ncclMinor = (version % majorBase) / minorBase; + auto ncclPatch = version % (ncclMajor * majorBase + ncclMinor * minorBase); versionString = std::to_string(ncclMajor) + "." + std::to_string(ncclMinor) + "." 
+ std::to_string(ncclPatch); } diff --git a/torch/csrc/distributed/c10d/NCCLUtils.hpp b/torch/csrc/distributed/c10d/NCCLUtils.hpp index 9dabc0c8c3fc..7ca54d167ead 100644 --- a/torch/csrc/distributed/c10d/NCCLUtils.hpp +++ b/torch/csrc/distributed/c10d/NCCLUtils.hpp @@ -25,7 +25,8 @@ const inline char* getNcclErrorDetailStr(ncclResult_t error, c10::optional #include +#include namespace c10d { @@ -49,7 +50,8 @@ std::string opTypeToString(OpType opType) { return "UNKNOWN"; } -bool isP2POp(OpType opType) { +bool isP2POp(OpType opType, bool batchP2P /*= false*/) { + if (batchP2P) return false; return opType == OpType::SEND || opType == OpType::RECV || opType == OpType::RECVANYSOURCE; } @@ -76,7 +78,7 @@ ProcessGroup::Work::Work( inputs.emplace_back(tensor); } } - recordingFunction->before(profilingTitle, inputs); + recordingFunction->before(profilingTitle, c10::ArrayRef(inputs.data(), inputs.size())); std::function end_handler = [recordingFunction]() { recordingFunction->end(); }; @@ -174,10 +176,14 @@ void ProcessGroup::Work::finishAndThrow(std::exception_ptr exception) { } ProcessGroup::ProcessGroup(int rank, int size) - : rank_(rank), size_(size), dist_debug_level_(parseDistDebugLevel()) { + : rank_(rank), size_(size), dist_debug_level_(debug_level()) { C10_LOG_API_USAGE_ONCE("c10d.process_group"); } ProcessGroup::~ProcessGroup() {} +void ProcessGroup::init() { + C10_LOG_API_USAGE_ONCE(fmt::format("c10d.process_group_{}", getBackendName())); +} + } // namespace c10d diff --git a/torch/csrc/distributed/c10d/ProcessGroup.hpp b/torch/csrc/distributed/c10d/ProcessGroup.hpp index 999189b1fe90..af97bdc9bd8a 100644 --- a/torch/csrc/distributed/c10d/ProcessGroup.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroup.hpp @@ -12,6 +12,7 @@ #include #include +#include #include // ************************************************************************* @@ -54,7 +55,7 @@ enum class OpType : std::uint8_t { TORCH_API std::string opTypeToString(OpType opType); // Whether or not an OP is an p2p op (SEND, RECV, RECVANYSOURCE) -TORCH_API bool isP2POp(OpType opType); +TORCH_API bool isP2POp(OpType opType, bool batchP2P = false); // ProcessGroup is a base class that captures collective and point to // point communication in a fixed set of processes. @@ -426,13 +427,17 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { } protected: + // Implementations of this interface need to call this to setup + // appropriate logging etc. + void init(); + const int rank_; const int size_; // Optional sequence number structure for matching collectives. c10::optional sequenceNum_ = c10::nullopt; // Debug level setting. It is parsed once when ProcessGroup is constructed and // remains the same across use of this process group. 
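// Illustrative sketch (not part of the patch): querying the cached debug level
// through the c10d::DebugLevel API that this patch introduces in
// torch/csrc/distributed/c10d/debug.h, instead of re-parsing the
// TORCH_DISTRIBUTED_DEBUG environment variable on every call. The include
// path is an assumption about how the new header is reached.
#include <c10d/debug.h>

void maybeLogCollectiveDetails() {
  if (c10d::debug_level() == c10d::DebugLevel::Detail) {
    // emit the extra per-collective diagnostics only in DETAIL mode
  }
}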
- DistributedDebugLevel dist_debug_level_; + DebugLevel dist_debug_level_; }; } // namespace c10d diff --git a/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp b/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp index d95afa32ec8e..f2b553ba1cc8 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp @@ -490,7 +490,7 @@ inline void ProcessGroupGloo::AsyncWork::recordAsyncWorkProfilingInfo( inputs.emplace_back(tensor); } } - recordingFunction->before(profilingTitle, inputs); + recordingFunction->before(profilingTitle, c10::ArrayRef(inputs.data(), inputs.size())); }; recordFunctionBeforeCallback_ = at::wrapPropagateTLSState(before_handler); std::function end_handler = [recordingFunction]() { @@ -763,6 +763,8 @@ ProcessGroupGloo::ProcessGroupGloo( for(const auto i : c10::irange(threads_.size())) { threads_[i] = std::thread(&ProcessGroupGloo::runLoop, this, i); } + + init(); } ProcessGroupGloo::~ProcessGroupGloo() { @@ -2814,7 +2816,7 @@ void ProcessGroupGloo::monitoredBarrier( TORCH_INTERNAL_ASSERT(!failedRanks.empty()); const std::string ranksStr = c10::Join(", ", failedRanks); const std::string error = c10::str( - "Ranks ", + "[Rank 0]: Ranks ", ranksStr, " failed to pass monitoredBarrier in ", monitoredBarrierTimeout.count(), @@ -2834,8 +2836,9 @@ void ProcessGroupGloo::monitoredBarrier( waitLoop(sendWorkMap); - auto elapsedTime = std::chrono::duration_cast( - std::chrono::steady_clock::now() - startTime); + using namespace std::chrono; + C10_UNUSED auto elapsedTime = duration_cast( + steady_clock::now() - startTime); } void ProcessGroupGloo::setSequenceNumberForGroup() { diff --git a/torch/csrc/distributed/c10d/ProcessGroupMPI.cpp b/torch/csrc/distributed/c10d/ProcessGroupMPI.cpp index 714f3a84deb6..556ab1388712 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupMPI.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupMPI.cpp @@ -310,6 +310,8 @@ ProcessGroupMPI::ProcessGroupMPI(int rank, int size, MPI_Comm pgComm) // Start the worker thread accepting MPI calls workerThread_ = std::thread(&ProcessGroupMPI::runLoop, this); + + init(); } ProcessGroupMPI::~ProcessGroupMPI() { @@ -695,7 +697,7 @@ c10::intrusive_ptr ProcessGroupMPI::alltoall_base( "Tensor's dim 0 does not divide equally across group size"); std::function&)> runFunc = - [opts, this](std::unique_ptr& entry) { + [this](std::unique_ptr& entry) { auto srcdata = (entry->src)[0]; auto dstdata = (entry->dst)[0]; c10::DeviceGuard guard(srcdata.device()); @@ -722,7 +724,7 @@ c10::intrusive_ptr ProcessGroupMPI::alltoall_base( c10d::checkSplitSizes(inputSplitSizes, inputTensor, size_); c10d::checkSplitSizes(outputSplitSizes, outputTensor, size_); std::function&)> runFunc = - [opts, this, inputSplitSizes, outputSplitSizes]( + [this, inputSplitSizes, outputSplitSizes]( std::unique_ptr& entry) { auto srcdata = (entry->src)[0]; auto dstdata = (entry->dst)[0]; @@ -769,7 +771,7 @@ c10::intrusive_ptr ProcessGroupMPI::alltoall( outputTensors.size() == size_, "Number of output tensors are not equal to group size"); std::function&)> runFunc = - [opts, this](std::unique_ptr& entry) { + [this](std::unique_ptr& entry) { std::vector send_lengths(size_); std::vector recv_lengths(size_); std::vector send_offsets(size_); diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp index a48435c8f5a3..8d248b0571bc 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp @@ -1,4 +1,5 @@ 
#include +#include #include #ifdef USE_C10D_NCCL @@ -30,8 +31,6 @@ constexpr const char* const kNCCLAbortedCommStoreKey = "NCCLABORTEDCOMM"; namespace { -constexpr int kBytes = 8; - // RAII helper class to manage NCCL group API and CUDA free mutex. // The destructor is allowed to throw since this helper class only // manages group and lock lifetimes. @@ -376,11 +375,20 @@ bool ProcessGroupNCCL::WorkNCCL::startedGPUExecutionInternal() const { } bool ProcessGroupNCCL::WorkNCCL::finishedGPUExecutionInternal() const { - for (const auto i : c10::irange(devices_.size())) { - // Checking the work's corresponding CUDA events' status - if (!(*ncclEndEvents_)[i].query()) { - return false; + try { + for (const auto i : c10::irange(devices_.size())) { + // Checking the work's corresponding CUDA events' status + if (!(*ncclEndEvents_)[i].query()) { + return false; + } + } + } catch (const std::exception& e) { + if (std::string(e.what()).find("driver shutting down") == std::string::npos) { + throw; } + LOG(INFO) << "[Rank " << rank_ + << "] Event query failed with exception: " + << e.what(); } return true; } @@ -430,10 +438,6 @@ void ProcessGroupNCCL::WorkNCCL::synchronizeInternal( // In case of blocking, wait for the operation to complete. if (blockingWait_) { - // Use the passed in timeout if provided, otherwise use the default - // opTimeout for each WorkNCCL object. - std::chrono::milliseconds workTimeout = - timeout == kNoTimeout ? opTimeout_ : timeout; // Wait for the operation to complete. while (!isCompleted()) { if (timedOut()) { @@ -579,12 +583,7 @@ ProcessGroupNCCL::ProcessGroupNCCL( workCleanupThread_ = std::thread(&ProcessGroupNCCL::workCleanupLoop, this); } - const char* ncclDebugLevel = std::getenv("NCCL_DEBUG"); - - if (!ncclDebugLevel) { - ncclDebugLevel = "UNSET"; - } - + init(); LOG(INFO) << "[Rank " << rank_ << "] ProcessGroupNCCL initialized with following options:" << "\nNCCL_ASYNC_ERROR_HANDLING: " << asyncErrorHandling_ @@ -592,8 +591,27 @@ ProcessGroupNCCL::ProcessGroupNCCL( << "\nNCCL_BLOCKING_WAIT: " << blockingWait_ << "\nTIMEOUT(ms): " << options_->timeout.count() << "\nUSE_HIGH_PRIORITY_STREAM: " - << options_->is_high_priority_stream - << "\nNCCL_DEBUG: " << ncclDebugLevel; + << options_->is_high_priority_stream; + +#ifdef USE_NCCL_WITH_UCC + static std::once_flag initialize_ucc_lib_flag; + std::call_once(initialize_ucc_lib_flag, [&]{ + uccLib_ = loadTorchUCC(); + if (uccLib_ != nullptr) { + LOG(INFO) << "[Rank " << rank_ << "] torch_ucc.so loaded"; + } + }); + + if (uccLib_ != nullptr) { + LOG(INFO) << "[Rank " << rank_ << "] torch_ucc.so loaded"; + typedef c10::intrusive_ptr fn(const c10::intrusive_ptr& store, int rank, int size); + auto createProcessGroupUCC = reinterpret_cast(uccLib_->sym("createProcessGroupUCC")); + if (createProcessGroupUCC != nullptr) { + uccPG_ = createProcessGroupUCC(store, rank_, size_); + LOG(INFO) << "[Rank " << rank_ << "] ProcessGroupUCC created."; + } + } +#endif } void ProcessGroupNCCL::runHealthCheck() { @@ -983,7 +1001,7 @@ std::exception_ptr ProcessGroupNCCL::checkForNCCLErrorsInternal( void ProcessGroupNCCL::broadcastUniqueNCCLID( ncclUniqueId* ncclID, - OpType opType, + bool isSingleP2POp, const std::string& p2pKey, int p2pRank) { // For collective operations: @@ -993,7 +1011,7 @@ void ProcessGroupNCCL::broadcastUniqueNCCLID( // retrieving the contents of that key. A single process group // may create multiple NCCL communicators, so we use a sequence // number to differentiate between them. 
- // For point-to-point operations: + // For single point-to-point operations: // The sequence number will only be increased on 2 out of all the // processes in a Process Group. So all following collective // operations will see different sequence numbers which will cause @@ -1001,12 +1019,12 @@ void ProcessGroupNCCL::broadcastUniqueNCCLID( // of sequence number for p2p communications. std::string storeKey; - if (!isP2POp(opType)) { + if (!isSingleP2POp) { storeKey = std::to_string(ncclCommCounter_++); } else { storeKey = p2pKey; } - if (rank_ == 0 || (isP2POp(opType) && p2pRank == 0)) { + if (rank_ == 0 || (isSingleP2POp && p2pRank == 0)) { auto vec = std::vector( reinterpret_cast(ncclID), reinterpret_cast(ncclID) + NCCL_UNIQUE_ID_BYTES); @@ -1097,15 +1115,18 @@ std::vector>& ProcessGroupNCCL::getNCCLComm( // Create the unique NCCL ID and broadcast it ncclUniqueId ncclID; + // For batch_isend_irecv, ncclGroupStart() would be called upfront + bool batchP2P = ncclActiveGroupCounter_ > 0; + bool singleP2POp = isP2POp(opType, batchP2P); // For point-to-point communication, lower rank of the two will get unique id. - if (rank_ == 0 || (isP2POp(opType) && p2pRank == 0)) { + if (rank_ == 0 || (singleP2POp && p2pRank == 0)) { C10D_NCCL_CHECK(ncclGetUniqueId(&ncclID), c10::nullopt); } // For point-to-point communication on the same process, don't need broadcast. if (!isSendRecvSelf) { // Broadcast so that each process can have a unique NCCL ID - broadcastUniqueNCCLID(&ncclID, opType, devicesKey, p2pRank); + broadcastUniqueNCCLID(&ncclID, singleP2POp, devicesKey, p2pRank); } at::cuda::OptionalCUDAGuard gpuGuard; @@ -1141,7 +1162,8 @@ std::vector>& ProcessGroupNCCL::getNCCLComm( // GPU world size and GPU rank int numRanks, rank; - if (!isP2POp(opType)) { + if (!singleP2POp) { + // Collective, all-to-all, or batch P2P numRanks = getSize() * devices.size(); rank = getRank() * devices.size() + i; } else if (isSendRecvSelf) { @@ -1149,7 +1171,7 @@ std::vector>& ProcessGroupNCCL::getNCCLComm( numRanks = 1; rank = 0; } else { - // For point-to-point operation, there are only 2 processes involved so + // For single point-to-point operation, there are only 2 processes involved so // the GPU rank is either 0 or 1. numRanks = 2; rank = p2pRank; @@ -1168,6 +1190,12 @@ std::vector>& ProcessGroupNCCL::getNCCLComm( // [Note 2 ] C10D_NCCL_CHECK(ncclGroupEnd(), c10::nullopt); + // At this point NCCL should have been initialized, hence we can accurately get + // the env value even if NCCL sets it by reading from nccl.conf file + if (getRank() == 0) { + LOG(INFO) << "NCCL_DEBUG: " << parse_env("NCCL_DEBUG"); + } + // See [Group Start/End Note] for (const auto i : c10::irange(ncclActiveGroupCounter_)) { (void)i; @@ -1503,9 +1531,25 @@ c10::intrusive_ptr ProcessGroupNCCL::pointToPoint( PostProcess post, const char* profilingTitle) { const auto devices = getDeviceList(tensors); - const auto key = getKeySendRecv(rank_, peer); - int p2pRank = rank_ <= peer ? 
0 : 1; - auto isSendRecvSelf = rank_ == peer; + std::string key; + int p2pRank = 0, p2pTargetRank = 0; + bool isSendRecvSelf = false; + // For batch_isend_irecv, ncclGroupStart() would be called upfront + bool batchP2P = ncclActiveGroupCounter_ > 0; + if (batchP2P) { + // For batch P2P, we need to treat it like a collective when selecting + // communicator, because other ranks can call into this batch other than my + // rank and my peer + key = getKeyFromDevices(devices); + p2pRank = rank_; + p2pTargetRank = peer; + } else { + // For single P2P, preserve the old two-rank behavior (to avoid perf diff) + key = getKeySendRecv(rank_, peer); + p2pRank = rank_ <= peer ? 0 : 1; + isSendRecvSelf = rank_ == peer; + p2pTargetRank = isSendRecvSelf ? 0 : 1 - p2pRank; + } auto& ncclComms = getNCCLComm(key, devices, opType, p2pRank, isSendRecvSelf); // First let NCCL streams wait for input tensors allocation streams @@ -1557,9 +1601,6 @@ c10::intrusive_ptr ProcessGroupNCCL::pointToPoint( for (const auto i : c10::irange(tensors.size())) { gpuGuard.set_index(devices[i].index()); at::cuda::CUDAStream& ncclStream = ncclStreams_[key][i]; - // For point-to-point communication, NCCL ranks can only - // be 0 or 1. - int p2pTargetRank = isSendRecvSelf ? 0 : 1 - p2pRank; C10D_NCCL_CHECK(fn( tensors[i], ncclComms[i]->getNcclComm(), ncclStream, p2pTargetRank), ncclComms[i]->getNcclCommFailureReason()); } @@ -2262,6 +2303,9 @@ c10::intrusive_ptr ProcessGroupNCCL::gather( invalidArgument("requires empty output on non-root"); } outputs = {}; + // append a empty tensor to the list, we don't use it but the + // `collective` template function requires it to invoke its function + outputs.emplace_back(); } return collective( @@ -2337,6 +2381,9 @@ c10::intrusive_ptr ProcessGroupNCCL::scatter( invalidArgument("requires empty input on non-root"); } inputs = {}; + // append a empty tensor to the list, we don't use it but the + // `collective` template function requires it to invoke its function + inputs.emplace_back(); } return collective( @@ -2408,6 +2455,18 @@ c10::intrusive_ptr ProcessGroupNCCL::_allgather_base( "nccl:_all_gather_base"); } +#ifdef USE_NCCL_WITH_UCC +std::shared_ptr ProcessGroupNCCL::uccLib_ = nullptr; +#endif + +bool ProcessGroupNCCL::isUCCAvailable() const { +#ifdef USE_NCCL_WITH_UCC + return (uccPG_ != nullptr); +#else + return false; +#endif +} + } // namespace c10d #endif // USE_C10D_NCCL diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp index 77d9bb3dd596..f86cf5e9d576 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp @@ -12,7 +12,9 @@ #include #include #include +#include +#include #include #include #include @@ -368,11 +370,14 @@ class TORCH_API ProcessGroupNCCL : public ProcessGroup { // may indicate that there is some sort of collective desynchronization. uint64_t getSequenceNumberForGroup() override; + // Tests if the UCC fallback path is available + bool isUCCAvailable() const; + protected: // Helper that broadcasts nccl unique ID to all ranks through the store void broadcastUniqueNCCLID( ncclUniqueId* ncclID, - OpType opType, + bool isSingleP2POp, const std::string& devicesKey, int p2pRank); @@ -623,6 +628,12 @@ class TORCH_API ProcessGroupNCCL : public ProcessGroup { // Counting for the sequential number of NCCL collective call. 
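// Illustrative sketch (not part of the patch): the rank arithmetic that the
// updated pointToPoint() above applies to a non-batched send/recv. Both
// endpoints derive the same dedicated two-rank communicator: the lower global
// rank becomes NCCL rank 0, the higher one rank 1, and each targets the other
// (self send/recv collapses to a single rank 0). The helper name is
// hypothetical.
#include <utility>

inline std::pair<int, int> singleP2PRanks(int myRank, int peer) {
  const bool isSendRecvSelf = (myRank == peer);
  const int p2pRank = myRank <= peer ? 0 : 1;                  // my rank inside the 2-rank comm
  const int p2pTargetRank = isSendRecvSelf ? 0 : 1 - p2pRank;  // the peer's rank inside it
  return {p2pRank, p2pTargetRank};
}
// e.g. singleP2PRanks(1, 3) == {0, 1} on rank 1 and singleP2PRanks(3, 1) == {1, 0}
// on rank 3; in the batched case the global rank and peer are used directly.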
uint64_t seq_{0}; + +#ifdef USE_NCCL_WITH_UCC + // ProcessGroupUCC shared library handle and ProcessGroup pointer + static std::shared_ptr uccLib_; + c10::intrusive_ptr uccPG_; +#endif }; } // namespace c10d diff --git a/torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp b/torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp index a2f03f84501e..118ee3e19c3b 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp @@ -69,11 +69,11 @@ struct CollectiveFingerPrint { // Create output tensor data structure to pass into allgather. std::vector> output_tensors; output_tensors.reserve(tensors_to_verify.size()); - for (auto& tensor_shape : tensors_to_verify) { + for (const auto& tensor_shape : tensors_to_verify) { std::vector outputs; outputs.reserve(pg->getSize()); for (const auto i : c10::irange(pg->getSize())) { - (void)i; //Suppress unused variable warning + (void)i; // Suppress unused variable warning outputs.emplace_back(at::zeros_like(tensor_shape)); } output_tensors.emplace_back(outputs); @@ -143,12 +143,12 @@ std::ostream& operator<<( std::vector dtype_strs; std::vector device_type_strs; for (const auto& tensor_dtype : collective_fingerprint.tensor_dtypes_) { - dtype_strs.push_back( + dtype_strs.emplace_back( c10::toString(static_cast(tensor_dtype))); } for (const auto& tensor_device_type : collective_fingerprint.tensor_device_types_) { - device_type_strs.push_back( + device_type_strs.emplace_back( c10::toString(static_cast(tensor_device_type))); } diff --git a/torch/csrc/distributed/c10d/TCPStore.cpp b/torch/csrc/distributed/c10d/TCPStore.cpp index 111701738684..46dc29ec0f65 100644 --- a/torch/csrc/distributed/c10d/TCPStore.cpp +++ b/torch/csrc/distributed/c10d/TCPStore.cpp @@ -128,6 +128,7 @@ void BackgroundThread::closeStopSignal() { void BackgroundThread::stop() { if (controlPipeFd_[1] != -1) { + ::write(controlPipeFd_[1], "\0", 1); // close the write end of the pipe ::close(controlPipeFd_[1]); controlPipeFd_[1] = -1; @@ -534,8 +535,16 @@ void TCPStoreMasterDaemon::run() { void TCPStoreMasterDaemon::run() { std::vector fds; tcputil::addPollfd(fds, storeListenSocket_.handle(), POLLIN); - // Push the read end of the pipe to signal the stopping of the daemon run - tcputil::addPollfd(fds, controlPipeFd_[0], POLLHUP); + // Although we haven't found any documentation or literature describing this, + // we've seen cases that, under certain circumstances, the read end of the + // pipe won't receive POLLHUP when the write end is closed. However, under + // the same circumstances, writing to the pipe will guarantee POLLIN to be + // received on the read end. + // + // For more reliable termination, the main thread will write a byte to the + // pipe before closing it, and the background thread will poll for both + // POLLIN and POLLHUP. 
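// Illustrative sketch (not part of the patch): the shutdown handshake the
// comment above describes, reduced to plain POSIX calls. Writing one byte
// before closing the write end guarantees the poller wakes up with POLLIN,
// even when closing the pipe alone fails to deliver POLLHUP.
#include <poll.h>
#include <unistd.h>

inline void pipeShutdownExample() {
  int pipeFd[2]; // [0] = read end polled by the background thread, [1] = write end
  if (pipe(pipeFd) != 0) {
    return;
  }

  // Main thread, at shutdown time:
  (void)write(pipeFd[1], "\0", 1); // guarantees POLLIN on the read end
  close(pipeFd[1]);

  // Background thread's poll loop (a single iteration shown):
  struct pollfd pfd;
  pfd.fd = pipeFd[0];
  pfd.events = POLLIN | POLLHUP;
  pfd.revents = 0;
  if (poll(&pfd, 1, /*timeout ms=*/1000) > 0 &&
      (pfd.revents & (POLLIN | POLLHUP)) != 0) {
    // stop signal received: exit the loop and let the thread be joined
  }
  close(pipeFd[0]);
}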
+ tcputil::addPollfd(fds, controlPipeFd_[0], POLLIN | POLLHUP); // receive the queries bool finished = false; @@ -564,8 +573,9 @@ void TCPStoreMasterDaemon::run() { // The pipe receives an event which tells us to shutdown the daemon if (fds[1].revents != 0) { - // Will be POLLUP when the pipe is closed - if (fds[1].revents ^ POLLHUP) { + // The main thread will write a byte to the pipe then close it before + // joining the background thread + if (fds[1].revents & ~(POLLIN | POLLHUP)) { throw std::system_error( ECONNABORTED, std::system_category(), @@ -700,7 +710,16 @@ void TCPStoreWorkerDaemon::run() { #else void TCPStoreWorkerDaemon::run() { std::vector fds; - tcputil::addPollfd(fds, controlPipeFd_[0], POLLHUP); + // Although we haven't found any documentation or literature describing this, + // we've seen cases that, under certain circumstances, the read end of the + // pipe won't receive POLLHUP when the write end is closed. However, under + // the same circumstances, writing to the pipe will guarantee POLLIN to be + // received on the read end. + // + // For more reliable termination, the main thread will write a byte to the + // pipe before closing it, and the background thread will poll for both + // POLLIN and POLLHUP. + tcputil::addPollfd(fds, controlPipeFd_[0], POLLIN | POLLHUP); tcputil::addPollfd(fds, storeListenSocket_.handle(), POLLIN); while (true) { @@ -709,8 +728,9 @@ void TCPStoreWorkerDaemon::run() { // Check control and exit early if triggered // The pipe receives an event which tells us to shutdown the listener thread if (fds[0].revents != 0) { - // Will be POLLUP when the pipe is closed - if (fds[0].revents ^ POLLHUP) { + // The main thread will write a byte to the pipe then close it before + // joining the background thread + if (fds[0].revents & ~(POLLIN | POLLHUP)) { throw std::system_error( ECONNABORTED, std::system_category(), @@ -893,7 +913,9 @@ void TCPClient::setTimeout(std::chrono::milliseconds value) { static_cast((value.count() % 1000) * 1000)}; #else struct timeval timeoutTV = { - .tv_sec = value.count() / 1000, .tv_usec = (value.count() % 1000) * 1000}; + .tv_sec = value.count() / 1000, + .tv_usec = static_cast((value.count() % 1000) * 1000), + }; #endif SYSCHECK_ERR_RETURN_NEG1(::setsockopt( socket_.handle(), diff --git a/torch/csrc/distributed/c10d/UCCForNCCL.hpp b/torch/csrc/distributed/c10d/UCCForNCCL.hpp new file mode 100644 index 000000000000..ce38894faebc --- /dev/null +++ b/torch/csrc/distributed/c10d/UCCForNCCL.hpp @@ -0,0 +1,25 @@ +#pragma once + +#include +#include +#include +#include + +#include + +namespace c10d { + +inline std::shared_ptr loadTorchUCC() { + const char *path = std::getenv("TORCH_UCC_LIBRARY_PATH"); + if (path != nullptr) { + try { + return std::make_shared(path); + } catch (const c10::DynamicLibraryError &e) { + TORCH_WARN("TORCH_UCC_LIBRARY_PATH is set, " + "but the loading of torch_ucc.so failed with:", e.msg()); + } + } + return nullptr; +} + +} // namespace c10d diff --git a/torch/csrc/distributed/c10d/Utils.cpp b/torch/csrc/distributed/c10d/Utils.cpp index f8a38c8625d0..924d0a233682 100644 --- a/torch/csrc/distributed/c10d/Utils.cpp +++ b/torch/csrc/distributed/c10d/Utils.cpp @@ -8,11 +8,6 @@ namespace c10d { -const char* kDistDebugEnvVar = "TORCH_DISTRIBUTED_DEBUG"; -const char* kDistDebugDetailLogLevel = "DETAIL"; -const char* kDistDebugInfoLogLevel = "INFO"; -const char* kDistDebugOffLogLevel = "OFF"; - std::string parse_env(const char* env_var_name) { char* stringValue = std::getenv(env_var_name); std::string res = 
"N/A"; @@ -22,65 +17,15 @@ std::string parse_env(const char* env_var_name) { return res; } -DistributedDebugLevel parseDistDebugLevel() { - std::string debugLevel = parse_env(kDistDebugEnvVar); - const char* levelStr{nullptr}; - if (debugLevel.compare("N/A") == 0) { - levelStr = kDistDebugOffLogLevel; - } else { - levelStr = debugLevel.c_str(); - TORCH_CHECK( - strncmp( - levelStr, - kDistDebugDetailLogLevel, - strlen(kDistDebugDetailLogLevel)) == 0 || - strncmp( - levelStr, - kDistDebugInfoLogLevel, - strlen(kDistDebugInfoLogLevel)) == 0 || - strncmp( - levelStr, - kDistDebugOffLogLevel, - strlen(kDistDebugOffLogLevel)) == 0, - c10::str( - "Expected environment variable TORCH_DISTRIBUTED_DEBUG to be one of ", - kDistDebugDetailLogLevel, - " ", - kDistDebugInfoLogLevel, - " ", - kDistDebugOffLogLevel, - " ")); - C10_LOG_FIRST_N(INFO, 1) - << "TORCH_DISTRIBUTED_DEBUG level parsed as " << levelStr; - } - - static std::unordered_map mapping = { - {kDistDebugOffLogLevel, DistributedDebugLevel::OFF}, - {kDistDebugInfoLogLevel, DistributedDebugLevel::INFO}, - {kDistDebugDetailLogLevel, DistributedDebugLevel::DETAIL}}; - - auto it = mapping.find(levelStr); - TORCH_CHECK( - it != mapping.end(), - "Invalid string value for distributed debug mode: ", - levelStr); - return it->second; -} - std::vector getTensorShapes( const std::vector& tensors) { std::vector shapeTensors; shapeTensors.reserve(tensors.size()); for (const auto& tensor : tensors) { - auto shapesVec = tensor.sizes().vec(); - int64_t shapes_size = shapesVec.size(); - // Need to clone here otherwise the shapesVec.data() memory is not copied - // and can be released under the hood. - at::Tensor shapesTensor = at::from_blob( - shapesVec.data(), - {shapes_size}, - at::TensorOptions().dtype(at::kLong)) - .clone(); + // Use `at::tensor()` to copy the data underlying `sizes()` since it may be + // released elsewhere. + at::Tensor shapesTensor = + at::tensor(tensor.sizes(), at::TensorOptions().dtype(at::kLong)); shapeTensors.emplace_back(std::move(shapesTensor)); } return shapeTensors; diff --git a/torch/csrc/distributed/c10d/Utils.hpp b/torch/csrc/distributed/c10d/Utils.hpp index a8e5b1a83052..501993a728b7 100644 --- a/torch/csrc/distributed/c10d/Utils.hpp +++ b/torch/csrc/distributed/c10d/Utils.hpp @@ -32,23 +32,8 @@ typedef SSIZE_T ssize_t; namespace c10d { -// Distributed c10d debug levels -enum DistributedDebugLevel { - OFF = 0, - DETAIL = 1, - INFO = 2, -}; - -// String debug log levels -extern const char* kDistDebugEnvVar; -extern const char* kDistDebugDetailLogLevel; -extern const char* kDistDebugInfoLogLevel; -extern const char* kDistDebugOffLogLevel; - TORCH_API std::string parse_env(const char* env_var_name); -TORCH_API DistributedDebugLevel parseDistDebugLevel(); - // Retrieve tensor shapes from a given tensor. TORCH_API std::vector getTensorShapes(const std::vector& tensors); @@ -422,7 +407,7 @@ inline void checkSplitSizes( "Tensor's dim 0 does not divide equally across group size"); } else { TORCH_CHECK( - split_sizes.size() == group_size, + split_sizes.size() == static_cast(group_size), "Number of tensor splits not equal to group size"); const auto sum = c10::sum_integers(split_sizes); TORCH_CHECK( diff --git a/torch/csrc/distributed/c10d/debug.cpp b/torch/csrc/distributed/c10d/debug.cpp new file mode 100644 index 000000000000..a22f322576cd --- /dev/null +++ b/torch/csrc/distributed/c10d/debug.cpp @@ -0,0 +1,73 @@ +// Copyright (c) Meta Platforms, Inc. and its affiliates. +// All rights reserved. 
+// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include +#include +#include +#include + +#include +#include + +namespace c10d { +namespace detail { +namespace { + +DebugLevel loadDebugLevelFromEnvironment() { + char* env_value = std::getenv("TORCH_DISTRIBUTED_DEBUG"); + + if (env_value == nullptr) { + return DebugLevel::Off; + } + + DebugLevel level{}; + + std::string level_str{env_value}; + + std::transform(level_str.begin(), level_str.end(), level_str.begin(), + [](unsigned char c) { + return toupper(c); + }); + + if (level_str == "OFF") { + level = DebugLevel::Off; + } else if (level_str == "INFO") { + level = DebugLevel::Info; + } else if (level_str == "DETAIL") { + level = DebugLevel::Detail; + } else { + throw C10dError{"The value of TORCH_DISTRIBUTED_DEBUG must be OFF, INFO, or DETAIL."}; + } + + C10D_INFO("The debug level is set to {}.", level_str); + + return level; +} + +} // namespace +} // namespace detail + +namespace { + +DebugLevel g_debug_level = DebugLevel::Off; + +} // namespace + +void setDebugLevel(DebugLevel level) { + g_debug_level = level; +} + +void setDebugLevelFromEnvironment() { + g_debug_level = detail::loadDebugLevelFromEnvironment(); +} + +DebugLevel debug_level() noexcept { + return g_debug_level; +} + +} // namespace c10d diff --git a/torch/csrc/distributed/c10d/debug.h b/torch/csrc/distributed/c10d/debug.h new file mode 100644 index 000000000000..ecfb49448295 --- /dev/null +++ b/torch/csrc/distributed/c10d/debug.h @@ -0,0 +1,27 @@ +// Copyright (c) Meta Platforms, Inc. and its affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once + +#include + +namespace c10d { + +enum class DebugLevel { + Off, + Info, + Detail +}; + +TORCH_API void setDebugLevel(DebugLevel level); + +// Sets the debug level based on the value of the `TORCH_DISTRIBUTED_DEBUG` +// environment variable. +TORCH_API void setDebugLevelFromEnvironment(); + +TORCH_API DebugLevel debug_level() noexcept; + +} // namespace c10d diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 0084e4523a98..873b6b35f168 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -29,6 +29,7 @@ #include #include +#include #include #include @@ -449,6 +450,7 @@ An enum-like class for built-in communication hooks: ``ALLREDUCE`` and ``FP16_CO py::arg("output_device"), py::arg("broadcast_buffers"), py::arg("has_sync_bn"), + py::arg("static_graph"), py::call_guard()) .def( "set_runtime_stats_and_log", @@ -478,20 +480,24 @@ An enum-like class for built-in communication hooks: ``ALLREDUCE`` and ``FP16_CO &::c10d::Logger::set_static_graph, py::call_guard()); - py::enum_<::c10d::DistributedDebugLevel>(module, "_DistributedDebugLevel", R"( - An enum whose values correspond to different debug settings of the - torch.distributed package. Currently supporting settings are OFF, INFO, - and DETAIL, which can be set via the TORCH_DISTRIBUTED_DEBUG environment - variable. + py::enum_<::c10d::DebugLevel>(module, "DebugLevel", R"( + An enum whose values correspond to different debug levels of the + torch.distributed package. Currently supporting OFF, INFO, and DETAIL, + which can be set via the TORCH_DISTRIBUTED_DEBUG environment variable + or via ``set_debug_level()`` function. 
)") - .value("OFF", ::c10d::DistributedDebugLevel::OFF) - .value("INFO", ::c10d::DistributedDebugLevel::INFO) - .value("DETAIL", ::c10d::DistributedDebugLevel::DETAIL); + .value("OFF", ::c10d::DebugLevel::Off) + .value("INFO", ::c10d::DebugLevel::Info) + .value("DETAIL", ::c10d::DebugLevel::Detail); - module.def( - "_get_debug_mode", - &::c10d::parseDistDebugLevel, - py::call_guard()); + module + .def("get_debug_level", ::c10d::debug_level, + R"(Gets the debug level of the torch.distributed package.)") + .def("set_debug_level", ::c10d::setDebugLevel, + R"(Sets the debug level of the torch.distributed package.)") + .def("set_debug_level_from_env", ::c10d::setDebugLevelFromEnvironment, + R"(Sets the debug level of the torch.distributed package from the + ``TORCH_DISTRIBUTED_DEBUG`` environment variable.)"); py::enum_<::c10d::ReduceOp>(module, "ReduceOp", R"( An enum-like class for available reduction operations: ``SUM``, ``AVG``, @@ -649,11 +655,13 @@ Example:: .def( "get", [](::c10d::Store& store, const std::string& key) -> py::bytes { - auto value = store.get(key); + auto value = [&]() { + py::gil_scoped_release guard; + return store.get(key); + }(); return py::bytes( reinterpret_cast(value.data()), value.size()); }, - py::call_guard(), R"( Retrieves the value associated with the given ``key`` in the store. If ``key`` is not present in the store, the function will wait for ``timeout``, which is defined @@ -887,7 +895,7 @@ the server to establish a connection. Arguments: host_name (str): The hostname or IP Address the server store should run on. port (int): The port on which the server store should listen for incoming requests. - world_size (int, optional): The total number of store users (number of clients + 1 for the server). Default is -1 (a negative value indicates a non-fixed number of store users). + world_size (int, optional): The total number of store users (number of clients + 1 for the server). Default is None (None indicates a non-fixed number of store users). is_master (bool, optional): True when initializing the server store and False for client stores. Default is False. timeout (timedelta, optional): Timeout used by the store during initialization and for methods such as :meth:`~torch.distributed.store.get` and :meth:`~torch.distributed.store.wait`. Default is timedelta(seconds=300) wait_for_worker (bool, optional): Whether to wait for all the workers to connect with the server store. This is only applicable when world_size is a fixed value. Default is True. @@ -906,14 +914,14 @@ Example:: .def( py::init([](const std::string& host, uint16_t port, - int worldSize, + c10::optional worldSize, bool isServer, std::chrono::milliseconds timeout, bool waitWorkers, bool multiTenant) { c10::optional numWorkers = c10::nullopt; - if (worldSize > -1) { - numWorkers = static_cast(worldSize); + if (worldSize.has_value() && worldSize.value() > -1) { + numWorkers = static_cast(worldSize.value()); } ::c10d::TCPStoreOptions opts{ @@ -923,7 +931,7 @@ Example:: }), py::arg("host_name"), py::arg("port"), - py::arg("world_size") = -1, + py::arg("world_size") = py::none(), // using noconvert() requires this argument to be True or False // prevents accidental implicit conversion to bool py::arg("is_master").noconvert() = false, @@ -1423,7 +1431,9 @@ options :class:`~torch.distributed.ProcessGroupNCCL.Options`). 
py::arg("timeout") = kProcessGroupDefaultTimeout, py::call_guard()) .def_property_readonly( - "options", &::c10d::ProcessGroupNCCL::getOptions); + "options", &::c10d::ProcessGroupNCCL::getOptions) + .def_property_readonly( + "is_ucc_available", &::c10d::ProcessGroupNCCL::isUCCAvailable); intrusive_ptr_class_<::c10d::ProcessGroupNCCL::Options>( processGroupNCCL, diff --git a/torch/csrc/distributed/c10d/logger.cpp b/torch/csrc/distributed/c10d/logger.cpp index bd6de0cfee81..93e8d05f2655 100644 --- a/torch/csrc/distributed/c10d/logger.cpp +++ b/torch/csrc/distributed/c10d/logger.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -21,7 +22,7 @@ std::ostream& operator<<(std::ostream& output, const Logger& logger) { auto& ddp_logging_data = (*logger.ddp_logging_data_); std::string loggerInfo = fmt::format( - "[Rank {} / {}] [iteration {}] Training {} unused_parameter_size={} \n " + "[Rank {} / {}] [before iteration {}] Training {} unused_parameter_size={} \n " "Avg forward compute time: {} \n Avg backward compute time: {} \n" "Avg backward comm. time: {} \n Avg backward comm/comp overlap time: {}", ddp_logging_data.ints_map["rank"], @@ -124,11 +125,11 @@ std::vector> Logger::get_per_bucket_variable_indices() { return per_bucket_variable_indices; } -std::vector Logger::get_bucket_sizes() { - std::vector bucket_sizes; +std::vector Logger::get_bucket_sizes() { + std::vector bucket_sizes; for (const auto& bucket : reducer_->buckets_) { - const auto& variables = bucket.replicas[0].variables; - int bucket_size = 0; + const auto& variables = bucket.variables; + int64_t bucket_size = 0; for (const auto& v : variables) { bucket_size += v.numel() * v.element_size(); } @@ -137,14 +138,6 @@ std::vector Logger::get_bucket_sizes() { return bucket_sizes; } -std::vector Logger::get_bucket_size_limits() { - std::vector bucket_size_limits; - for (const auto& bucket : reducer_->buckets_) { - bucket_size_limits.push_back(bucket.bucket_size_limit); - } - return bucket_size_limits; -} - // Communication hook. Empty string if not set, in which case it will not be // logged. void Logger::set_comm_hook(const std::string& hook) { @@ -167,9 +160,13 @@ void Logger::set_construction_data_and_log( const std::vector& device_ids, int output_device, bool broadcast_buffers, - bool has_sync_bn) { + bool has_sync_bn, + bool static_graph) { // No lock is needed, as it will be called in DistributedDataParallel // constructor. 
+ if (static_graph) { + set_static_graph(); + } ddp_logging_data_->strs_map["module_name"] = module_name; ddp_logging_data_->ints_map["world_size"] = reducer_->process_group_->getSize(); @@ -185,9 +182,6 @@ void Logger::set_construction_data_and_log( // A list of bucket sizes (Bytes) calculated during construction time ddp_logging_data_->strs_map["bucket_sizes"] = c10::Join(", ", get_bucket_sizes()); - // A list of bucket size limits (bytes) specified during construction time - ddp_logging_data_->strs_map["initial_bucket_size_limits"] = - c10::Join(", ", get_bucket_size_limits()); set_env_variables(); // DistributedDataParallel constructor input parameters @@ -203,7 +197,7 @@ void Logger::set_construction_data_and_log( ddp_logging_data_->strs_map["backend_name"] = reducer_->process_group_->getBackendName(); - if (parseDistDebugLevel() != DistributedDebugLevel::OFF) { + if (debug_level() != DebugLevel::Off) { std::string initInfo = fmt::format( "[Rank {}]: DDP Initialized with: \n", ddp_logging_data_->ints_map["rank"]); @@ -294,8 +288,6 @@ void Logger::set_runtime_stats_and_log() { reducer_->has_rebuilt_bucket_; ddp_logging_data_->strs_map["rebuilt_bucket_sizes"] = c10::Join(", ", get_bucket_sizes()); - ddp_logging_data_->strs_map["rebuilt_bucket_size_limits"] = - c10::Join(", ", get_bucket_size_limits()); // Log per-bucket variable indices std::vector per_bucket_variable_indices; auto indices = get_per_bucket_variable_indices(); @@ -324,6 +316,14 @@ void Logger::set_runtime_stats_and_log() { ); return; } + if (!reducer_->params_[0].is_cuda() && !reducer_->params_[0].is_cpu()) { + TORCH_WARN_ONCE( + "Time stats are currently only collected for CPU and CUDA devices. " + "Please refer to CpuTimer or CudaTimer for how to register timer " + "for other device type." + ); + return; + } TORCH_INTERNAL_ASSERT(reducer_->timer_); calculate_avg_time( ddp_logging_data_->ints_map["avg_forward_compute_time"], @@ -377,7 +377,7 @@ void Logger::set_runtime_stats_and_log() { ); // Log runtime stats to stderr if TORCH_DISTRIBUTED_DEBUG=DETAIL is enabled. - if (parseDistDebugLevel() == DistributedDebugLevel::DETAIL) { + if (debug_level() == DebugLevel::Detail) { LOG(INFO) << *this; } diff --git a/torch/csrc/distributed/c10d/logger.hpp b/torch/csrc/distributed/c10d/logger.hpp index d47157805660..cd32c573a21e 100644 --- a/torch/csrc/distributed/c10d/logger.hpp +++ b/torch/csrc/distributed/c10d/logger.hpp @@ -15,7 +15,9 @@ class TORCH_API Logger { const std::vector& device_ids, int output_device, bool broadcast_buffers, - bool has_sync_bn); + bool has_sync_bn, + bool static_graph + ); void set_static_graph(); @@ -39,9 +41,7 @@ class TORCH_API Logger { // Set parameters stats. void set_parameter_stats(); // Get size of each bucket (Bytes). - std::vector get_bucket_sizes(); - // Get bucket size limits specified during DDP construction. - std::vector get_bucket_size_limits(); + std::vector get_bucket_sizes(); // Get variable indices for each bucket. std::vector> get_per_bucket_variable_indices(); // Set comm. hook, if used diff --git a/torch/csrc/distributed/c10d/logging.cpp b/torch/csrc/distributed/c10d/logging.cpp new file mode 100644 index 000000000000..c079906b878a --- /dev/null +++ b/torch/csrc/distributed/c10d/logging.cpp @@ -0,0 +1,39 @@ +// Copyright (c) Meta Platforms, Inc. and its affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include + +namespace c10d { +namespace detail { + +bool isLogLevelEnabled(LogLevel level) noexcept { + // c10 logger does not support debug and trace levels. In order to map higher + // levels we adjust our ordinal value. + int level_int = static_cast(level) - 2; + + if (level_int >= 0) { + return FLAGS_caffe2_log_level <= level_int; + } + + // Debug and trace levels are only enabled when c10 log level is set to INFO. + if (FLAGS_caffe2_log_level != 0) { + return false; + } + + if (level_int == -1) { + return debug_level() != DebugLevel::Off; + } + if (level_int == -2) { + return debug_level() == DebugLevel::Detail; + } + + return false; +} + +} // namespace detail +} // namespace c10d diff --git a/torch/csrc/distributed/c10d/logging.h b/torch/csrc/distributed/c10d/logging.h index 9e6d328d324c..57ee974a0d35 100644 --- a/torch/csrc/distributed/c10d/logging.h +++ b/torch/csrc/distributed/c10d/logging.h @@ -1,4 +1,4 @@ -// Copyright (c) Facebook, Inc. and its affiliates. +// Copyright (c) Meta Platforms, Inc. and its affiliates. // All rights reserved. // // This source code is licensed under the BSD-style license found in the @@ -6,24 +6,49 @@ #pragma once -#include +#include +#include #include +#include namespace c10d { namespace detail { + +enum class LogLevel { + Trace, + Debug, + Info, + Warning, + Error +}; + +TORCH_API bool isLogLevelEnabled(LogLevel level) noexcept; + template -std::string log_vformat(fmt::string_view fmt, T&&... args) { +std::string formatLogMessage(fmt::string_view fmt, T&&... args) { return fmt::vformat(fmt, fmt::make_format_args(args...)); } -} // namespace detail -} // namespace c10d + +} // namespace detail +} // namespace c10d #define C10D_ERROR(...)\ - LOG_IF(ERROR, FLAGS_caffe2_log_level <= 2) << c10d::detail::log_vformat(__VA_ARGS__) + LOG_IF(ERROR, c10d::detail::isLogLevelEnabled(c10d::detail::LogLevel::Error))\ + << "[c10d] " << c10d::detail::formatLogMessage(__VA_ARGS__) #define C10D_WARNING(...)\ - LOG_IF(WARNING, FLAGS_caffe2_log_level <= 1) << c10d::detail::log_vformat(__VA_ARGS__) + LOG_IF(WARNING, c10d::detail::isLogLevelEnabled(c10d::detail::LogLevel::Warning))\ + << "[c10d] " << c10d::detail::formatLogMessage(__VA_ARGS__) #define C10D_INFO(...)\ - LOG_IF(INFO, FLAGS_caffe2_log_level <= 0) << c10d::detail::log_vformat(__VA_ARGS__) + LOG_IF(INFO, c10d::detail::isLogLevelEnabled(c10d::detail::LogLevel::Info))\ + << "[c10d] " << c10d::detail::formatLogMessage(__VA_ARGS__) + +#define C10D_DEBUG(...)\ + LOG_IF(INFO, c10d::detail::isLogLevelEnabled(c10d::detail::LogLevel::Debug))\ + << "[c10d - debug] " << c10d::detail::formatLogMessage(__VA_ARGS__) + +#define C10D_TRACE(...)\ + LOG_IF(INFO, c10d::detail::isLogLevelEnabled(c10d::detail::LogLevel::Trace))\ + << "[c10d - trace] " << c10d::detail::formatLogMessage(__VA_ARGS__) diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp index 00b1b5cb3c0e..31d376b13a24 100644 --- a/torch/csrc/distributed/c10d/reducer.cpp +++ b/torch/csrc/distributed/c10d/reducer.cpp @@ -80,7 +80,7 @@ Reducer::Reducer( int64_t bucket_bytes_cap, bool find_unused_parameters, bool gradient_as_bucket_view, - std::unordered_map paramNames, + std::unordered_map param_names, int64_t first_bucket_bytes_cap) : params_(std::move(params)), process_group_(std::move(process_group)), @@ -99,14 +99,14 @@ Reducer::Reducer( div_factor_(kUnsetDivFactor), static_graph_(false), comm_hook_(nullptr), - ddp_debug_level_(parseDistDebugLevel()), - param_names_(std::move(paramNames)), + 
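// How isLogLevelEnabled() above maps the five c10d levels onto the three
// glog-style c10 levels (enum ordinal minus 2), with Debug/Trace additionally
// gated on the distributed debug level:
//
//   c10d level | enabled when
//   -----------+----------------------------------------------------------
//   Error      | FLAGS_caffe2_log_level <= 2
//   Warning    | FLAGS_caffe2_log_level <= 1
//   Info       | FLAGS_caffe2_log_level <= 0
//   Debug      | FLAGS_caffe2_log_level == 0 && debug_level() != DebugLevel::Off
//   Trace      | FLAGS_caffe2_log_level == 0 && debug_level() == DebugLevel::Detail
//
// So the new C10D_DEBUG()/C10D_TRACE() macros defined in logging.h above only
// produce output when the process logs at INFO and TORCH_DISTRIBUTED_DEBUG is
// INFO/DETAIL (Debug) or DETAIL (Trace); both still log at INFO severity,
// distinguished by their "[c10d - debug]" / "[c10d - trace]" prefixes.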
ddp_debug_level_(debug_level()), + param_names_(std::move(param_names)), first_bucket_bytes_cap_(first_bucket_bytes_cap) { C10_LOG_API_USAGE_ONCE("torch.distributed.ddp.reducer"); TORCH_INTERNAL_ASSERT( params_.size() >= 1, "Expected at least one parameter."); - if (ddp_debug_level_ != c10d::DistributedDebugLevel::OFF) { + if (ddp_debug_level_ != c10d::DebugLevel::Off) { LOG(INFO) << "Reducer initialized with bucket_bytes_cap: " << bucket_bytes_cap_ << " first_bucket_bytes_cap: " << first_bucket_bytes_cap; @@ -143,8 +143,7 @@ Reducer::Reducer( // This can be reinitialized later after capturing runtime information. { std::lock_guard lock(mutex_); - initialize_buckets( - std::move(bucket_indices), std::move(per_bucket_size_limits)); + initialize_buckets(std::move(bucket_indices)); } // All variables are expected to have their `grad_fn` set to the gradient @@ -332,17 +331,16 @@ void Reducer::check_grad_layout( } void Reducer::mark_variable_ready_dense(size_t variable_index) { - const auto replica_index = 0; const auto& bucket_index = variable_locators_[variable_index]; auto& bucket = buckets_[bucket_index.bucket_index]; - auto& replica = bucket.replicas[replica_index]; - auto& variable = replica.variables[bucket_index.intra_bucket_index]; - auto& bucket_view = replica.bucket_views_in[bucket_index.intra_bucket_index]; - - // Copy contents of gradient tensor to bucket tensor. - // If the gradient is not set, we assume it wasn't computed - // as part of the current backwards pass, and zero the part - // of the bucket it would otherwise hold. + auto& variable = bucket.variables[bucket_index.intra_bucket_index]; + auto& bucket_view = bucket.bucket_views_in[bucket_index.intra_bucket_index]; + + // Copy the contents of the gradient tensor to the corresponding part of the + // bucket's flattened gradient tensor. + // If the gradient is not set, we assume it wasn't computed as part of the + // current backwards pass, and we zero the part of the bucket it would + // otherwise hold. runGradCallbackForVariable(variable, [&](auto& grad) { if (grad.defined()) { this->check_grad_layout(grad, bucket_view); @@ -414,11 +412,9 @@ void Reducer::mark_variable_ready_dense(size_t variable_index) { } void Reducer::mark_variable_ready_sparse(size_t variable_index) { - const auto replica_index = 0; const auto& bucket_index = variable_locators_[variable_index]; auto& bucket = buckets_[bucket_index.bucket_index]; - auto& replica = bucket.replicas[replica_index]; - auto& variable = replica.variables[bucket_index.intra_bucket_index]; + auto& variable = bucket.variables[bucket_index.intra_bucket_index]; runGradCallbackForVariable(variable, [&](auto& grad) { REDUCER_CHECK( @@ -428,17 +424,16 @@ void Reducer::mark_variable_ready_sparse(size_t variable_index) { logger_, "Expected variable to have sparse gradient."); - // Sparse tensors cannot be grouped together with other sparse tensors - // in a single reduction operation like we can for dense tensors. - // Therefore, the `offsets` and `lengths` vectors in the bucket replica - // struct are empty, and there is no pre-existing accumulation tensor. - // Directly assign the sparse tensor to the `contents` field. - replica.contents = grad; - // If no DDP comm hook is registered, - // the allreduce only sums up the value, and a separate division is - // required. + // Sparse tensors cannot be grouped together with other sparse tensors in a + // single reduction operation like we can for dense tensors. 
Therefore, the + // `offsets` and `lengths` vectors in the bucket struct are empty, and + // there is no pre-existing accumulation tensor. + // Directly assign the sparse tensor to the `gradients` field. + bucket.gradients = grad; + // If no DDP comm hook is registered, the allreduce only sums up the + // value, and a separate division is required. if (comm_hook_ == nullptr) { - replica.contents.div_(div_factor_); + bucket.gradients.div_(div_factor_); } // The grad is modified in place and needs to be written back. return true; @@ -456,11 +451,11 @@ std::vector Reducer::get_grad_buckets( gradBuckets.emplace_back( i, buckets_.size(), - return_zero_tensors ? at::zeros_like(bucket.replicas[0].contents) - : bucket.replicas[0].contents, - bucket.replicas[0].offsets, - bucket.replicas[0].lengths, - bucket.replicas[0].sizes_vec, + return_zero_tensors ? at::zeros_like(bucket.gradients) + : bucket.gradients, + bucket.offsets, + bucket.lengths, + bucket.sizes_vec, variables_for_bucket); } return gradBuckets; @@ -693,16 +688,15 @@ void Reducer::all_reduce_local_used_map() { at::Tensor& Reducer::get_param_from_index(size_t index) { const auto& bucket_index = variable_locators_[index]; auto& bucket = buckets_[bucket_index.bucket_index]; - auto& replica = bucket.replicas[0]; - // Cannot simply access variable via replicas_[0][variable_index] since return - // value is used in runGradCallbackForVariable which does not accept const - // tensors. - auto& variable = replica.variables[bucket_index.intra_bucket_index]; + // Cannot simply access variable via `bucket.variables[variable_index]` since + // return value is used in `runGradCallbackForVariable()` which does not + // accept const tensors. + auto& variable = bucket.variables[bucket_index.intra_bucket_index]; return variable; } void Reducer::checkAndRaiseMarkedTwiceError(size_t index) { - // Something is wrong if all variables contained in this bucket replica have + // Something is wrong if all variables contained in this bucket have // already been marked as ready. // We don't expect the same variable to be marked ready twice. bool marked_twice = @@ -714,7 +708,7 @@ void Reducer::checkAndRaiseMarkedTwiceError(size_t index) { auto param_name = param_names_.find(index); const bool found_param_name = param_name != param_names_.end(); TORCH_INTERNAL_ASSERT( - ddp_debug_level_ == c10d::DistributedDebugLevel::OFF || + ddp_debug_level_ == c10d::DebugLevel::Off || found_param_name, "Expected to find parameter name in debug mode."); std::string paramInfo = c10::str( @@ -790,7 +784,6 @@ void Reducer::mark_variable_ready(size_t variable_index) { const auto& bucket_index = variable_locators_[variable_index]; auto& bucket = buckets_[bucket_index.bucket_index]; - auto& replica = bucket.replicas[0]; set_divide_factor(); @@ -802,16 +795,13 @@ void Reducer::mark_variable_ready(size_t variable_index) { // TODO(@pietern): Make this work for both CPU/CUDA tensors. // When using CPU tensors we don't need to do this. - // // Record event so that we can wait for all of them. - // auto& event = replica.events[bucket_index.intra_bucket_index]; + // Record event so that we can wait for all of them. + // auto& event = bucket.events[bucket_index.intra_bucket_index]; // event.record(); // Check if this was the final gradient for this bucket. - if (--replica.pending == 0) { - // Kick off reduction if all replicas for this bucket are ready. 
- if (--bucket.pending == 0) { - mark_bucket_ready(bucket_index.bucket_index); - } + if (--bucket.pending == 0) { + mark_bucket_ready(bucket_index.bucket_index); } // Run finalizer function and kick off reduction for local_used_map once the @@ -849,31 +839,24 @@ c10::intrusive_ptr Reducer::run_comm_hook( } void Reducer::all_reduce_bucket(Bucket& bucket) { - std::vector tensors; - tensors.reserve(bucket.replicas.size()); - for (const auto& replica : bucket.replicas) { - // TODO(@pietern): Ensure proper synchronization with the CUDA events - // that recorded copies into this contents tensor. If these copies are - // executed on non-default streams, the current stream for the device - // that holds the contents tensor must wait on these events. - // - // As long as autograd uses the default stream for every device, - // these operations are implicitly sequenced, and we don't need to - // do any extra synchronization here. - // - tensors.push_back(replica.contents); - } - auto variables_for_bucket = get_variables_for_bucket(next_bucket_, bucket); + // TODO(@pietern): Ensure proper synchronization with the CUDA events + // that recorded copies into this `gradients` tensor. If these copies are + // executed on non-default streams, the current stream for the device + // that holds the `gradients` tensor must wait on these events. + // + // As long as autograd uses the default stream for every device, + // these operations are implicitly sequenced, and we don't need to + // do any extra synchronization here. + const auto& tensor = bucket.gradients; + GradBucket grad_bucket( next_bucket_, buckets_.size(), - tensors[0], - // Since we only support single-process single-device - // mode, there is always only one replica in the bucket. - bucket.replicas[0].offsets, - bucket.replicas[0].lengths, - bucket.replicas[0].sizes_vec, + tensor, + bucket.offsets, + bucket.lengths, + bucket.sizes_vec, variables_for_bucket); bucket.future_work = run_comm_hook(grad_bucket); } @@ -890,12 +873,11 @@ std::vector Reducer::get_variables_for_bucket( std::vector variables_for_bucket; variables_for_bucket.reserve(bucket.variable_indices.size()); for (const auto& variable_index : bucket.variable_indices) { - auto& replica = bucket.replicas[0]; // Grab bucket index where gradient is located using variable_locators_. auto& bucket_index_for_variable = variable_locators_[variable_index]; // Grab the actual model parameter. auto& variable = - replica.variables[bucket_index_for_variable.intra_bucket_index]; + bucket.variables[bucket_index_for_variable.intra_bucket_index]; variables_for_bucket.emplace_back(variable); } @@ -945,9 +927,7 @@ void Reducer::install_futures(c10::List> } } -void Reducer::initialize_buckets( - std::vector> bucket_indices, - std::vector per_bucket_sizes) { +void Reducer::initialize_buckets(std::vector> bucket_indices) { // If initialize_buckets is called inside DDP constructor, then // it does not matter rpc context ptr is nullptr or not, as grad // will not be mutated. @@ -977,10 +957,8 @@ void Reducer::initialize_buckets( // Iterate over buckets. const auto bucket_count = bucket_indices.size(); buckets_.reserve(bucket_count); - TORCH_INTERNAL_ASSERT(bucket_count == per_bucket_sizes.size()); for (const auto bucket_index : c10::irange(bucket_count)) { Bucket bucket; - bucket.bucket_size_limit = per_bucket_sizes[bucket_index]; // TODO(@pietern): Validate indices. // Must be non-empty, unique, and unique across buckets. 
@@ -1004,24 +982,23 @@ void Reducer::initialize_buckets( } } - BucketReplica replica; if (bucket.expect_sparse_gradient) { const auto variable_index = bucket_indices[bucket_index].front(); const auto& variable = params_[variable_index]; TORCH_INTERNAL_ASSERT(bucket_indices[bucket_index].size() == 1); - replica.variables = {variable}; + bucket.variables = {variable}; } else { at::TensorOptions options; // The start index of the variable in the flattened tensor. size_t offset = 0; - // Reserve enough space for the per-variable fields stored in bucket - // replica for efficiency. + // Reserve enough space for the per-variable fields stored in the bucket + // for efficiency. const size_t num_variables = bucket_indices[bucket_index].size(); - replica.variables.reserve(num_variables); - replica.offsets.reserve(num_variables); - replica.lengths.reserve(num_variables); - replica.sizes_vec.reserve(num_variables); + bucket.variables.reserve(num_variables); + bucket.offsets.reserve(num_variables); + bucket.lengths.reserve(num_variables); + bucket.sizes_vec.reserve(num_variables); // Iterate over bucket variables. for (const auto variable_index : bucket_indices[bucket_index]) { @@ -1047,29 +1024,29 @@ void Reducer::initialize_buckets( "All parameters in a bucket must have the same dtype."); } const auto length = variable.numel(); - replica.variables.push_back(variable); - replica.offsets.push_back(offset); - replica.lengths.push_back(length); - replica.sizes_vec.push_back(variable.sizes()); + bucket.variables.push_back(variable); + bucket.offsets.push_back(offset); + bucket.lengths.push_back(length); + bucket.sizes_vec.push_back(variable.sizes()); offset += length; } - // Allocate bucket contents tensor. - replica.contents = at::empty({static_cast(offset)}, options); + // Allocate the bucket's flattened `gradients` tensor. + bucket.gradients = at::empty({static_cast(offset)}, options); // Note: "Gradient Layout Contract" // - // Here, create views into the contents tensor for each variable's grad. - // Views serve as entry points to copy_ each grad's data in/out of the - // flat contents tensor. + // Here, create views into the `gradients` tensor for each variable's + // grad. Views serve as entry points to `copy_()` each grad's data in/out + // of the flattened `gradients` tensor. // // Gradients may have dense memory but non-row-major-contiguous strides // (e.g. channels_last or channels_last_3d). For coalesced accesses // during copy_s, it's beneficial for each view's layout to match its // grad's layout. // - // Specifically, we expect torch/csrc/autograd/AccumulateGrad.h produces - // grads that obey there "Gradient Layout Contract": + // Specifically, we expect torch/csrc/autograd/functions/accumulate_grad.h + // produces grads that obey the "Gradient Layout Contract": // (1) if variable.is_non_overlapping_and_dense(), the stashed grad's // strides match variable. // (2) else, stashed grad is rowmajor contiguous. @@ -1095,14 +1072,10 @@ void Reducer::initialize_buckets( // Checking just once won't catch if someone messes with // param layouts over time, but not messing with params after DDP // construction is already a documented constraint. - initialize_bucket_views(replica, replica.contents); + initialize_bucket_views(bucket); } - // Add bucket replica to enclosing bucket. - bucket.replicas.push_back(std::move(replica)); - // Map participating variables to this bucket. - // This is identical across replicas so we only need to do this once. 
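// Illustrative sketch of the offset/length bookkeeping performed by
// initialize_buckets() above: variables are laid out back-to-back and a single
// flat tensor sized to the running offset is allocated with the bucket's
// common dtype/device. (FlatBucket is a stand-in, not the real Bucket struct.)
#include <vector>
#include <ATen/ATen.h>

struct FlatBucket {
  at::Tensor gradients;                  // flattened 1-D storage
  std::vector<size_t> offsets, lengths;  // per-variable slice bookkeeping
};

FlatBucket flatten(const std::vector<at::Tensor>& variables) {
  FlatBucket bucket;
  size_t offset = 0;
  for (const auto& v : variables) {
    bucket.offsets.push_back(offset);
    bucket.lengths.push_back(v.numel());
    offset += v.numel();
  }
  // One allocation large enough to hold every gradient in the bucket.
  bucket.gradients =
      at::empty({static_cast<int64_t>(offset)}, variables.front().options());
  return bucket;
}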
size_t intra_bucket_index = 0; for (const auto variable_index : bucket_indices[bucket_index]) { TORCH_INTERNAL_ASSERT( @@ -1118,29 +1091,28 @@ void Reducer::initialize_buckets( } // (see Note: "Gradient Layout Contract" in initialize_buckets). -void Reducer::initialize_bucket_views( - Reducer::BucketReplica& replica, - at::Tensor& contents) { - for (const auto i : c10::irange(replica.variables.size())) { - auto& v = replica.variables[i]; - const auto offset = replica.offsets[i]; - const auto length = replica.lengths[i]; +void Reducer::initialize_bucket_views(Reducer::Bucket& bucket) { + const auto& gradients = bucket.gradients; + for (const auto i : c10::irange(bucket.variables.size())) { + auto& v = bucket.variables[i]; + const auto offset = bucket.offsets[i]; + const auto length = bucket.lengths[i]; if (v.is_non_overlapping_and_dense()) { // If the param's memory is dense, match its layout, anticipating // the autograd engine (AccumulateGrad) will also create gradients // matching its layout. - replica.bucket_views_in.push_back( - contents.as_strided(v.sizes(), v.strides(), offset)); + bucket.bucket_views_in.push_back( + gradients.as_strided(v.sizes(), v.strides(), offset)); } else { // Fall back to a C-style contiguous view, again anticipating // AccumulateGrad will do the same when stashing grads for non-dense // params. - replica.bucket_views_in.push_back( - contents.narrow(0, offset, length).view(v.sizes())); + bucket.bucket_views_in.push_back( + gradients.narrow(0, offset, length).view(v.sizes())); } // By default `bucket_views_out` and `bucket_views_in` are // essentially the same thing. - replica.bucket_views_out = replica.bucket_views_in; + bucket.bucket_views_out = bucket.bucket_views_in; // If gradient_as_bucket_view_ is set as true, then there are two cases to // handle: initialize_bucket_views could be called inside initialize_buckets @@ -1152,7 +1124,7 @@ void Reducer::initialize_bucket_views( // bucket_view, because grads should be kept as being undefined for globally // unused parameters. if (gradient_as_bucket_view_) { - auto& bucket_view = replica.bucket_views_in.back(); + auto& bucket_view = bucket.bucket_views_in.back(); runGradCallbackForVariable(v, [&](auto& grad) { if (grad.defined() && !grad.is_alias_of(bucket_view)) { bucket_view.copy_(grad); @@ -1169,24 +1141,24 @@ void Reducer::initialize_bucket_views( // (see Note: "Gradient Layout Contract" in initialize_buckets). void Reducer::populate_bucket_views_out( - Reducer::BucketReplica& replica, + Reducer::Bucket& bucket, at::Tensor& tensor) { - replica.bucket_views_out.clear(); - for (const auto i : c10::irange(replica.variables.size())) { - const auto& v = replica.variables[i]; - const auto offset = replica.offsets[i]; - const auto length = replica.lengths[i]; + bucket.bucket_views_out.clear(); + for (const auto i : c10::irange(bucket.variables.size())) { + const auto& v = bucket.variables[i]; + const auto offset = bucket.offsets[i]; + const auto length = bucket.lengths[i]; if (v.is_non_overlapping_and_dense()) { // If the param's memory is dense, match its layout, anticipating // the autograd engine (AccumulateGrad) will also create gradients // matching its layout. - replica.bucket_views_out.push_back( + bucket.bucket_views_out.push_back( tensor.as_strided(v.sizes(), v.strides(), offset)); } else { // Fall back to a C-style contiguous view, again anticipating // AccumulateGrad will do the same when stashing grads for non-dense // params. 
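// Illustrative sketch of the per-variable view choice made by
// initialize_bucket_views() above: match the parameter's own layout when it is
// non-overlapping and dense (per the "Gradient Layout Contract"), otherwise
// fall back to a row-major contiguous slice.
#include <ATen/ATen.h>

at::Tensor make_grad_view(
    const at::Tensor& flat_gradients,
    const at::Tensor& param,
    int64_t offset) {
  if (param.is_non_overlapping_and_dense()) {
    // copy_() between this view and the grad produced by AccumulateGrad is
    // then a coalesced, layout-preserving copy.
    return flat_gradients.as_strided(param.sizes(), param.strides(), offset);
  }
  return flat_gradients.narrow(0, offset, param.numel()).view(param.sizes());
}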
- replica.bucket_views_out.push_back( + bucket.bucket_views_out.push_back( tensor.narrow(0, offset, length).view(v.sizes())); } } @@ -1207,10 +1179,7 @@ void Reducer::reset_bucket_counting() { num_buckets_ready_ = 0; for (auto& bucket : buckets_) { - for (auto& replica : bucket.replicas) { - replica.pending = replica.variables.size(); - } - bucket.pending = bucket.replicas.size(); + bucket.pending = bucket.variables.size(); } if (static_graph_) { @@ -1260,7 +1229,7 @@ void Reducer::search_unused_parameters( // If the accumulator function is present in the graph, we know // a gradient will be computed for the corresponding parameter. if (seen.count(it.first) == 0) { - if (ddp_debug_level_ == c10d::DistributedDebugLevel::DETAIL) { + if (ddp_debug_level_ == c10d::DebugLevel::Detail) { const auto param_info = param_names_.find(it.second); TORCH_INTERNAL_ASSERT( param_info != param_names_.end(), @@ -1339,16 +1308,16 @@ void Reducer::prepare_for_backward( void Reducer::copy_bucket_to_grad( at::Tensor& variable, - Reducer::BucketReplica& replica, + Reducer::Bucket& bucket, size_t intra_bucket_index, bool global_unused) { - const auto& bucket_view = replica.bucket_views_out[intra_bucket_index]; + const auto& bucket_view = bucket.bucket_views_out[intra_bucket_index]; runGradCallbackForVariable(variable, [&](auto& grad) { // If a parameter is globally unused, we keep its grad untouched. if (!global_unused) { if (!grad.defined()) { // Creates grad according to the "Gradient Layout Contract" - // (see torch/csrc/grad/AccumulateGrad.h) + // (see torch/csrc/autograd/functions/accumulate_grad.h) grad = torch::autograd::utils::clone_obey_contract(bucket_view, variable); } else { @@ -1387,10 +1356,8 @@ std::vector Reducer::getUnmarkedParamIndicesForIteration() { // A bucket with one or more dense tensors needs to be unflattened. void Reducer::finalize_bucket_dense(Bucket& bucket) { - size_t replica_index = 0; - auto& replica = bucket.replicas[replica_index]; - for (const auto intra_bucket_index : c10::irange(replica.variables.size())) { - auto& variable = replica.variables[intra_bucket_index]; + for (const auto intra_bucket_index : c10::irange(bucket.variables.size())) { + auto& variable = bucket.variables[intra_bucket_index]; bool global_unused = false; // See Note [Skip allreducing local_used_map_dev] @@ -1434,15 +1401,14 @@ void Reducer::finalize_bucket_dense(Bucket& bucket) { RECORD_FUNCTION( "torch.distributed.ddp.reducer::copy_bucket_to_grad", std::vector({variable})); - copy_bucket_to_grad(variable, replica, intra_bucket_index, global_unused); + copy_bucket_to_grad(variable, bucket, intra_bucket_index, global_unused); } else { const auto& bucket_view_out = - replica.bucket_views_out[intra_bucket_index]; - auto& bucket_view_in = replica.bucket_views_in[intra_bucket_index]; - // If communication_hook is registered, bucket_view_out stores - // allreduced results in a newly allocated tensor, copy bucket_view_out - // back to bucket_view_in that referring to replica.content tensor and - // grad. + bucket.bucket_views_out[intra_bucket_index]; + auto& bucket_view_in = bucket.bucket_views_in[intra_bucket_index]; + // If a communication hook is registered, then `bucket_view_out` stores + // the allreduced results in a newly allocated tensor, so we copy + // `bucket_view_out` back to `bucket_view_in` for this gradient. 
if (!bucket_view_in.is_alias_of(bucket_view_out)) { bucket_view_in.copy_(bucket_view_out); } @@ -1484,7 +1450,8 @@ void Reducer::finalize_backward() { TORCH_INTERNAL_ASSERT(require_finalize_); require_finalize_ = false; - // Wait for asynchronous reduction to complete and unflatten contents. + // Wait for asynchronous reduction to complete, and unflatten the bucket's + // flattened `gradients` tensor. for (auto& bucket : buckets_) { // See Note [DDP Communication Hook] TORCH_INTERNAL_ASSERT( @@ -1495,13 +1462,12 @@ void Reducer::finalize_backward() { auto future_result = comm_hook_ == nullptr ? detail::parseCppCommHookResult(bucket.future_work->value()) : comm_hook_->parseHookResult(bucket.future_work->value()); - auto& replica = bucket.replicas[0]; if (bucket.expect_sparse_gradient) { - replica.contents.copy_(future_result); + bucket.gradients.copy_(future_result); } else { // Reinitialize only `bucket_views_out` with the future_result by // following the same logic in `initialize_buckets`. - populate_bucket_views_out(replica, future_result); + populate_bucket_views_out(bucket, future_result); } // Unset allreduce division factor, as it may change in next backwards pass @@ -1708,7 +1674,7 @@ bool Reducer::rebuild_buckets() { std::reverse(per_bucket_size_limits.begin(), per_bucket_size_limits.end()); } - if (ddp_debug_level_ != c10d::DistributedDebugLevel::OFF) { + if (ddp_debug_level_ != c10d::DebugLevel::Off) { TORCH_INTERNAL_ASSERT( rebuilt_bucket_indices.size() == per_bucket_size_limits.size()) LOG(INFO) << rebuilt_bucket_indices.size() @@ -1726,8 +1692,7 @@ bool Reducer::rebuild_buckets() { rebuilt_params_.clear(); rebuilt_param_indices_.clear(); - initialize_buckets( - std::move(rebuilt_bucket_indices), std::move(per_bucket_size_limits)); + initialize_buckets(std::move(rebuilt_bucket_indices)); return true; } @@ -1833,7 +1798,7 @@ void Reducer::ensure_prior_reduction_finished() { ": ", unmarked_param_indices); - if (ddp_debug_level_ == DistributedDebugLevel::OFF) { + if (ddp_debug_level_ == DebugLevel::Off) { // Without debug mode, log unmarked_param_indices, as well as // recommendation to use debug mode to print parameter names. kBaseErrorMsg += unmarked_param_indices_info; @@ -1932,7 +1897,7 @@ namespace { // composite key of a tensor's type identifier and its device. struct BucketKey { BucketKey(c10::ScalarType type, c10::Device device) - : type(std::move(type)), device(std::move(device)) {} + : type(type), device(device) {} const c10::ScalarType type; const c10::Device device; @@ -2068,7 +2033,7 @@ compute_bucket_assignment_by_size( bucket_indices.reserve(result.size()); std::vector per_bucket_size_limits; per_bucket_size_limits.reserve(result.size()); - for (const auto & bucket_indices_with_size : result) { + for (const auto& bucket_indices_with_size : result) { bucket_indices.emplace_back(std::get<0>(bucket_indices_with_size)); per_bucket_size_limits.emplace_back(std::get<1>(bucket_indices_with_size)); } @@ -2081,6 +2046,47 @@ void verify_params_across_processes( const c10::intrusive_ptr& process_group, const std::vector& params, const c10::optional>& logger) { + + // First verify number of parameters to avoid inconsistent inputs into + // broadcast which can cause a crash. 
+ // See https://github.com/pytorch/pytorch/issues/73547 + + at::TensorOptions param_size_options; + param_size_options = param_size_options.dtype(at::kLong); + param_size_options = param_size_options.device(params[0].device()); + // Note: Not using tensor building API because of + // https://github.com/pytorch/pytorch/issues/74114 + at::Tensor param_size_tensor = at::tensor( + {static_cast(params.size())}, param_size_options); + + // Allgather and verify parameter size. + std::vector> param_size_output_tensors; + param_size_output_tensors.emplace_back(std::vector{}); + auto world_size = process_group->getSize(); + for (size_t i = 0 ; i < world_size ; ++i) { + param_size_output_tensors.front().emplace_back( + at::empty_like(param_size_tensor) + ); + } + + std::vector param_size_vec{param_size_tensor}; + process_group->allgather(param_size_output_tensors, param_size_vec)->wait(); + auto result_size_tensors = param_size_output_tensors.front(); + for (size_t i = 0; i < world_size ; ++i ) { + auto param_size_for_rank = result_size_tensors[i][0].item(); + TORCH_CHECK( + param_size_for_rank == params.size(), + c10::str( + "DDP expects same model across all ranks, but Rank ", + process_group->getRank(), + " has ", params.size(), " params, while rank ", i, + " has inconsistent ", param_size_for_rank, + " params." + ) + ); + } + + // Continue with parameter shape verification. size_t i = 0; for (const auto& t : params) { i += 2 * t.dim(); @@ -2114,10 +2120,9 @@ void verify_params_across_processes( i = 0; for (const auto p : c10::irange(params.size())) { const auto& t = params[p]; - // I'd like to include which process we are in the message, - // but ProcessGroup::getRank is not public! for (const auto& sz : t.sizes()) { - auto msg = c10::str("params[", p, "] in this process", + auto msg = c10::str("[", process_group->getRank(), + "]: params[", p, "] in this process", " with sizes ", t.sizes(), " appears not to match sizes of the same param in process 0."); diff --git a/torch/csrc/distributed/c10d/reducer.hpp b/torch/csrc/distributed/c10d/reducer.hpp index 541e2c0802a8..cc14a1eb2be6 100644 --- a/torch/csrc/distributed/c10d/reducer.hpp +++ b/torch/csrc/distributed/c10d/reducer.hpp @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include #include @@ -27,77 +29,10 @@ constexpr int kDefaultFirstBucketBytes = int(1024 * 1024); constexpr int kDefaultBucketBytesCap = int(25 * 1024 * 1024); // Collect runtime stats once for every kDDPRuntimeLoggingSampleRate iterations. constexpr int kDDPRuntimeLoggingSampleRate = 100; -constexpr int kUnsetTime = -1; - -inline int64_t current_time_in_nanos() { - return torch::profiler::impl::getTime(); -} // Forward declaration class Logger; -class TORCH_API Timer { - private: - // The timestamp of forward call start time in each iteration. - int64_t forward_start_time = kUnsetTime; - // The timestamp of backward computation start and end time in each - // iteration. - int64_t backward_compute_start_time = kUnsetTime; - int64_t backward_compute_end_time = kUnsetTime; - // The timestamp of first communication call start time in each iteration. - int64_t backward_comm_start_time = kUnsetTime; - // The timestamp of last communication call end time in each iteration. - int64_t backward_comm_end_time = kUnsetTime; - public: - enum class Event { - kForwardStart, - kBackwardComputeStart, - kBackwardComputeEnd, - kBackwardCommStart, - kBackwardCommEnd, - }; - - // Record the current event, i.e., mark it as having occurred now. 
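// Distilled sketch of the parameter-count verification added above: every
// rank allgathers its local count as a one-element kLong tensor and checks
// that all counts agree before the per-parameter shape broadcast runs.
// (The include path and the default CPU device here are assumptions; the real
// check places the tensor on params[0].device().)
#include <vector>
#include <ATen/ATen.h>
#include <c10d/ProcessGroup.hpp>

void check_param_count_consistency(
    const c10::intrusive_ptr<c10d::ProcessGroup>& pg,
    int64_t local_count) {
  auto options = at::TensorOptions().dtype(at::kLong);
  at::Tensor local = at::tensor({local_count}, options);

  std::vector<std::vector<at::Tensor>> outputs(1);
  for (int i = 0; i < pg->getSize(); ++i) {
    outputs.front().push_back(at::empty_like(local));
  }
  std::vector<at::Tensor> inputs{local};
  pg->allgather(outputs, inputs)->wait();

  for (int i = 0; i < pg->getSize(); ++i) {
    TORCH_CHECK(
        outputs.front()[i].item<int64_t>() == local_count,
        "Rank ", i, " reports a different number of parameters.");
  }
}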
Default - // CPU implementation. - virtual void record(Event event) { - getTimeRef(event) = current_time_in_nanos(); - } - - // Return the difference between when two events occurred, in nanoseconds. - // Or nullopt if one of them hasn't been recorded. - virtual c10::optional measureDifference(Event start, Event end) = 0; - - virtual ~Timer() = default; - - // Return host-side timestamp, or nullopt if it has not yet been recorded. - c10::optional getTimestamp(Event event) { - auto time = getTimeRef(event); - if (time == kUnsetTime) { - return c10::nullopt; - } else { - return time; - } - } - - // Return host-side time member variable corresponding to the given event. - int64_t& getTimeRef(Event event) { - switch (event) { - case Event::kForwardStart: - return forward_start_time; - case Event::kBackwardComputeStart: - return backward_compute_start_time; - case Event::kBackwardComputeEnd: - return backward_compute_end_time; - case Event::kBackwardCommStart: - return backward_comm_start_time; - case Event::kBackwardCommEnd: - return backward_comm_end_time; - default: - TORCH_INTERNAL_ASSERT(false); - } - } -}; - // Local accumulator type for a single bucket. struct BucketAccumulator { std::vector indices; @@ -105,14 +40,13 @@ struct BucketAccumulator { size_t size_limit = 0; }; -C10_DECLARE_TYPED_REGISTRY(TimerRegistry, c10::DeviceType, Timer, std::unique_ptr, c10::Device); - class TORCH_API Reducer { public: - // The constructor takes a list of variables for every model replica. - // The bucket assignment for this reducer is specified as a list of - // buckets, each of which is specified as a list of indices into the - // variables list for **a single replica** (i.e. `variables[0]`). + // The constructor takes a list of variables (i.e. parameters) for this + // process's single model replica (as DDP assumes single-process + // single-device). The bucket assignment for this reducer, `bucket_indices`, + // is specified as a list of buckets, each of which is specified as a list of + // indices into the bucket's `variables` list. explicit Reducer( std::vector params, std::vector> bucket_indices, @@ -122,18 +56,16 @@ class TORCH_API Reducer { int64_t bucket_bytes_cap, bool find_unused_parameters, bool gradient_as_bucket_view, - std::unordered_map paramNames, + std::unordered_map param_names, int64_t first_bucket_bytes_cap); ~Reducer() noexcept(false); - // To (re-)initialize bucket assignment, pass a list of buckets, each - // of which is specified by a list of indices in the variables list. + // To (re-)initialize bucket assignment, pass a list of buckets, each of + // which is specified by a list of indices in the bucket's `variables` list. // This function performs validation that the variables within a bucket // all live on the same device and have the same dimensionality. - void initialize_buckets( - std::vector> bucket_indices, - std::vector per_bucket_sizes); + void initialize_buckets(std::vector> bucket_indices); // This function is called when the forward function has produced an output, // and the user wishes to reduce gradients in the backwards pass. 
@@ -311,7 +243,7 @@ class TORCH_API Reducer { void mark_bucket_ready(size_t bucket_index); - void finalize_bucket_dense(Bucket& replica); + void finalize_bucket_dense(Bucket& bucket); void finalize_backward(); @@ -344,117 +276,102 @@ class TORCH_API Reducer { #endif void runGradCallbackForVariable(at::Tensor& variable, GradCallback&& cb); - // A bucket replica represents [1..N] gradients to be reduced, - // with the same dtype, on the same device. - // - // Batching gradients together before reducing them can result in lower - // overhead and/or faster time to completion. Only gradients of the same type - // and on the same device can be batched. The tensor that represents the - // flattened gradient uses the same type and is placed on the same device. - // Buckets are filled as the gradients they hold are computed (triggered by - // autograd hooks). Buckets are reduced in a predetermined order that is - // identical across processes. - struct BucketReplica { - // Flattened (1 dimensional) contents of bucket. - at::Tensor contents; - - // Views into contents for each grad. Each view will be created with - // layout (sizes + strides) matching the grad's expected layout - // ("Gradient Layout Contract" in torch/csrc/autograd/AccumulateGrad.h). - // `bucket_views_in[i].copy_(grad)` and - // `grad.copy_(bucket_views_out[i])` - // provide convenient ways to move grad data in/out of contents. - // The reason we keep two states for bucket_views is that if DDP - // communication hook was registered, `bucket_views_out` could be - // re-initialized with the value of hook's `future_work`. We still need to - // keep a separate view reference to replica's original contents for - // `bucket_views_in[i].copy_(grad)` call. - std::vector bucket_views_in; - std::vector bucket_views_out; - - // Variables that contribute to this bucket replica. Use refcounted value - // here so that we can easily unflatten the bucket contents into the - // participating variables after reduction has completed. - std::vector variables; - - // Per-variable offset/length into the flat bucket contents tensor and grad - // bucket. - std::vector offsets; - std::vector lengths; - - // Per-variable sizes into the grad bucekt. - std::vector sizes_vec; - - // Number of tensors to be added before this bucket is complete. - // This is reset to `variables.size()` every iteration. - size_t pending; - - // TODO(@pietern) - // Memory copies from gradient tensors into the bucket are potentially - // done on different CUDA streams. We record an event for every copy - // so that we can synchronize with them prior to kicking off the reduction. - // std::vector events; - }; - - // This function is called inside `initialize_buckets`, it initializes both - // bucket_views_in and bucket_views_out into the contents tensor for each - // variable's grad. Views serve as entry points to copy_ each grad's data - // in/out of the flat contents tensor. - void initialize_bucket_views(BucketReplica& replica, at::Tensor& contents); + // This function is called inside `initialize_buckets()`. It initializes both + // `bucket_views_in` and `bucket_views_out` with views for each variable's + // gradient into the bucket's flattened `gradients` tensor. Views serve as + // entry points to `copy_()` each grad's data in/out of the flattened + // `gradients` tensor. 
+ void initialize_bucket_views(Bucket& bucket); // This function is called inside `finalize_backward`, it happens only if // DDP communication hook was registered to recreate just bucket_views_out // with the result of `future_work`. - void populate_bucket_views_out(BucketReplica& replica, at::Tensor& tensor); + void populate_bucket_views_out(Bucket& bucket, at::Tensor& tensor); // If gradient_as_bucket_view_ is false, after allreduce buckets, // copy bucket results back to grads. void copy_bucket_to_grad( at::Tensor& variable, - Reducer::BucketReplica& replica, + Reducer::Bucket& bucket, size_t intra_bucket_index, bool global_unused); // Check layout of grad and bucket_view before copying the grad to bucket. void check_grad_layout(const at::Tensor& grad, const at::Tensor& bucket_view); - // A bucket holds N bucket replicas (1 per model replica). - // - // If every bucket in this struct is ready, the reduction can be kicked off. - // One bucket per replica. Reduction is kicked off when every bucket is ready. - // + // A bucket contains [1..N] gradients to be reduced, where the gradients + // have the same dtype and device. + // Coalescing gradients together before reducing can result in lower overhead + // and/or faster time to completion. Coalescing requires the constituent + // gradients to have the same dtype and device, and the resulting flattened + // tensor uses that common dtype and device. The flattened tensor is filled + // as the corresponding gradients are computed (triggered by autograd hooks), + // and the buckets are reduced in a predetermined order consistent across + // processes. struct Bucket { - std::vector replicas; + // Gradients of the bucket flattened into a 1-dimensional tensor + at::Tensor gradients; + + // Views into the `gradients` tensor for each individual gradient + // Each view is created with layout (size and stride) matching the + // gradient's expected layout (see the "Gradient Layout Contract" in + // torch/csrc/autograd/functions/accumulate_grad.h). + // `bucket_views_in[i].copy_(grad)` and `grad.copy_(bucket_views_out[i])` + // provide convenient ways to copy gradient data in/out of `gradients`, + // respectively. + // We keep both `bucket_views_in` and `bucket_views_out` because + // registering a DDP communication hook may re-initialize + // `bucket_views_out` with the value of the hook's `future_work` but we + // still need separate views into the bucket's original flattened gradient + // to copy in gradient data. + std::vector bucket_views_in; + std::vector bucket_views_out; - // Global indices of participating variables in the bucket - std::vector variable_indices; + // Variables whose gradients are held in this bucket + // We use refcounted tensors here so that we can easily unflatten the + // bucket's flattened `gradients` tensor into the participating variables + // after reduction has completed. + std::vector variables; - // Number of replicas to be marked done before this bucket is ready. + // Per-variable offset/length into the flattened `gradients` tensor and + // the corresponding `GradBucket` instance for communication hooks + std::vector offsets; + std::vector lengths; + + // Per-variable sizes slicing into the bucket's `gradients` tensor + std::vector sizes_vec; + + // Number of gradients left to be computed before the bucket is ready to + // be reduced size_t pending; - // Keep future work handle around DDP comm hook. - // If no hook is registered, a temporary vanilla allreduce hook will be - // used. 
+ // Global indices of participating variables in the bucket + std::vector variable_indices; + + // Future work handle for DDP communication hook + // If no hook is registered, a temporary vanilla allreduce hook is used. c10::intrusive_ptr future_work; - // If this bucket should expect a single sparse gradient. - // Implies: replicas[i].variables.size() == 1. + // If this bucket should expect a single sparse gradient + // If `true`, then this implies that `bucket.variables.size() == 1`. bool expect_sparse_gradient = false; - // "Limit" of cumulative parameter sizes that this bucket manages. It is - // actually a soft limit because we don't shard parameters across buckets - // so a single parameter may push it over the cap. - size_t bucket_size_limit; + + // TODO(@pietern) + // Memory copies from gradient tensors into the bucket are potentially + // done on different CUDA streams. We record an event for every copy + // so that we can synchronize with them prior to kicking off the reduction. + // std::vector events; + }; std::vector buckets_; - // A variable locator locates a particular variable in the bucket - // structure. The `bucket_index` field points to the bucket in the `buckets_` - // vector. The `intra_bucket_index` field points to the index of the variable - // in any of the vector fields in the bucket replica. + // A variable locator locates a particular variable in the reducer's buckets struct VariableLocator { - // Index into the `buckets_` variable. + // Index of the bucket containing the variable in the `buckets_` vector size_t bucket_index; - // Index of parameter in single bucket replica. + // Index of the variable in the bucket, which may be used consistently + // across `bucket_views_in`, `bucket_views_out`, `variables`, `offsets`, + // `lengths`, `sizes_vec`, and `variable_indices` in `Bucket` size_t intra_bucket_index; VariableLocator() = default; @@ -568,7 +485,7 @@ class TORCH_API Reducer { std::unique_ptr comm_hook_; // Debug level setting. It is parsed once when Reducer is constructed, and // remains the same across a single invocation of DDP training. - DistributedDebugLevel ddp_debug_level_; + DebugLevel ddp_debug_level_; // Mapping of variable index to fully qualified name of model to notify users // about errors when certain parameters do not get gradient. std::unordered_map param_names_; diff --git a/torch/csrc/distributed/c10d/reducer_cuda.cpp b/torch/csrc/distributed/c10d/reducer_cuda.cpp index b836cddd8017..a1c570da5d59 100644 --- a/torch/csrc/distributed/c10d/reducer_cuda.cpp +++ b/torch/csrc/distributed/c10d/reducer_cuda.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include diff --git a/torch/csrc/distributed/c10d/reducer_timer.hpp b/torch/csrc/distributed/c10d/reducer_timer.hpp new file mode 100644 index 000000000000..ba696383b88e --- /dev/null +++ b/torch/csrc/distributed/c10d/reducer_timer.hpp @@ -0,0 +1,75 @@ +#pragma once +#include + +namespace c10d { +constexpr int kUnsetTime = -1; + +inline int64_t current_time_in_nanos() { + return torch::profiler::impl::getTime(); +} + +class TORCH_API Timer { + private: + // The timestamp of forward call start time in each iteration. + int64_t forward_start_time = kUnsetTime; + // The timestamp of backward computation start and end time in each + // iteration. + int64_t backward_compute_start_time = kUnsetTime; + int64_t backward_compute_end_time = kUnsetTime; + // The timestamp of first communication call start time in each iteration. 
+ int64_t backward_comm_start_time = kUnsetTime; + // The timestamp of last communication call end time in each iteration. + int64_t backward_comm_end_time = kUnsetTime; + + public: + enum class Event { + kForwardStart, + kBackwardComputeStart, + kBackwardComputeEnd, + kBackwardCommStart, + kBackwardCommEnd, + }; + + // Record the current event, i.e., mark it as having occurred now. Default + // CPU implementation. + virtual void record(Event event) { + getTimeRef(event) = current_time_in_nanos(); + } + + // Return the difference between when two events occurred, in nanoseconds. + // Or nullopt if one of them hasn't been recorded. + virtual c10::optional measureDifference(Event start, Event end) = 0; + + virtual ~Timer() = default; + + // Return host-side timestamp, or nullopt if it has not yet been recorded. + c10::optional getTimestamp(Event event) { + auto time = getTimeRef(event); + if (time == kUnsetTime) { + return c10::nullopt; + } else { + return time; + } + } + + // Return host-side time member variable corresponding to the given event. + int64_t& getTimeRef(Event event) { + switch (event) { + case Event::kForwardStart: + return forward_start_time; + case Event::kBackwardComputeStart: + return backward_compute_start_time; + case Event::kBackwardComputeEnd: + return backward_compute_end_time; + case Event::kBackwardCommStart: + return backward_comm_start_time; + case Event::kBackwardCommEnd: + return backward_comm_end_time; + default: + TORCH_INTERNAL_ASSERT(false); + } + } +}; + +C10_DECLARE_TYPED_REGISTRY(TimerRegistry, c10::DeviceType, Timer, std::unique_ptr, c10::Device); +} // namespace c10d diff --git a/torch/csrc/distributed/c10d/socket.cpp b/torch/csrc/distributed/c10d/socket.cpp index c99950f85895..acd819ab631c 100644 --- a/torch/csrc/distributed/c10d/socket.cpp +++ b/torch/csrc/distributed/c10d/socket.cpp @@ -1,4 +1,4 @@ -// Copyright (c) Facebook, Inc. and its affiliates. +// Copyright (c) Meta Platforms, Inc. and its affiliates. // All rights reserved. 
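// Rough sketch of registering a device-specific Timer with the TimerRegistry
// declared in reducer_timer.hpp above, assuming the usual c10 typed-registry
// macros (C10_DEFINE_TYPED_REGISTRY elsewhere plus C10_REGISTER_TYPED_CLASS
// here). MyDeviceTimer and the PrivateUse1 key are hypothetical.
#include <torch/csrc/distributed/c10d/reducer_timer.hpp>

namespace c10d {
namespace {

class MyDeviceTimer : public Timer {
 public:
  explicit MyDeviceTimer(c10::Device /* device */) {}

  c10::optional<int64_t> measureDifference(Event start, Event end) override {
    auto start_ts = getTimestamp(start);
    auto end_ts = getTimestamp(end);
    if (!start_ts || !end_ts) {
      return c10::nullopt;  // one of the two events was never recorded
    }
    return *end_ts - *start_ts;
  }
};

} // namespace

// Keyed by device type; the reducer can then create a timer for parameters
// living on that device via the registry.
C10_REGISTER_TYPED_CLASS(
    TimerRegistry, c10::DeviceType::PrivateUse1, MyDeviceTimer);

} // namespace c10d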
// // This source code is licensed under the BSD-style license found in the @@ -273,7 +273,7 @@ std::unique_ptr SocketImpl::accept() const { addr.ai_addr = addr_ptr; addr.ai_addrlen = addr_len; - C10D_INFO("The server socket on {} has accepted a connection from {}.", *this, addr); + C10D_DEBUG("The server socket on {} has accepted a connection from {}.", *this, addr); auto impl = std::make_unique(hnd); @@ -414,17 +414,17 @@ SocketListenOp::SocketListenOp(std::uint16_t port, const SocketOptions& opts) std::unique_ptr SocketListenOp::run() { if (opts_->prefer_ipv6()) { - C10D_INFO("The server socket will attempt to listen on an IPv6 address."); + C10D_DEBUG("The server socket will attempt to listen on an IPv6 address."); if (tryListen(AF_INET6)) { return std::move(socket_); } - C10D_INFO("The server socket will attempt to listen on an IPv4 address."); + C10D_DEBUG("The server socket will attempt to listen on an IPv4 address."); if (tryListen(AF_INET)) { return std::move(socket_); } } else { - C10D_INFO("The server socket will attempt to listen on an IPv4 or IPv6 address."); + C10D_DEBUG("The server socket will attempt to listen on an IPv4 or IPv6 address."); if (tryListen(AF_UNSPEC)) { return std::move(socket_); } @@ -459,7 +459,7 @@ bool SocketListenOp::tryListen(int family) { addrinfo_ptr result{naked_result}; for (::addrinfo* addr = naked_result; addr != nullptr; addr = addr->ai_next) { - C10D_INFO("The server socket is attempting to listen on {}.", *addr); + C10D_DEBUG("The server socket is attempting to listen on {}.", *addr); if (tryListen(*addr)) { return true; } @@ -534,8 +534,7 @@ class SocketConnectOp { enum class ConnectResult { Success, Error, - Retry, - TimeOut + Retry }; public: @@ -550,6 +549,8 @@ class SocketConnectOp { ConnectResult tryConnectCore(const ::addrinfo& addr); + [[noreturn]] void throwTimeoutError() const; + template void recordError(fmt::string_view format, Args&&... args) { auto msg = fmt::vformat(format, fmt::make_format_args(args...)); @@ -576,25 +577,25 @@ SocketConnectOp::SocketConnectOp(const std::string& host, std::unique_ptr SocketConnectOp::run() { if (opts_->prefer_ipv6()) { - C10D_INFO("The client socket will attempt to connect to an IPv6 address of ({}, {}).", - host_, - port_); + C10D_DEBUG("The client socket will attempt to connect to an IPv6 address of ({}, {}).", + host_, + port_); if (tryConnect(AF_INET6)) { return std::move(socket_); } - C10D_INFO("The client socket will attempt to connect to an IPv4 address of ({}, {}).", - host_, - port_); + C10D_DEBUG("The client socket will attempt to connect to an IPv4 address of ({}, {}).", + host_, + port_); if (tryConnect(AF_INET)) { return std::move(socket_); } } else { - C10D_INFO("The client socket will attempt to connect to an IPv4 or IPv6 address of ({}, {}).", - host_, - port_); + C10D_DEBUG("The client socket will attempt to connect to an IPv4 or IPv6 address of ({}, {}).", + host_, + port_); if (tryConnect(AF_UNSPEC)) { return std::move(socket_); @@ -612,58 +613,66 @@ std::unique_ptr SocketConnectOp::run() { } bool SocketConnectOp::tryConnect(int family) { - ::addrinfo hints{}, *naked_result = nullptr; - + ::addrinfo hints{}; hints.ai_flags = AI_V4MAPPED | AI_ALL | AI_NUMERICSERV; hints.ai_family = family; hints.ai_socktype = SOCK_STREAM; - int r = ::getaddrinfo(host_, port_.c_str(), &hints, &naked_result); - if (r != 0) { - const char* gai_err = ::gai_strerror(r); - - recordError("The {}network addresses of ({}, {}) cannot be retrieved (gai error: {} - {}).", - family == AF_INET ? 
"IPv4 " : family == AF_INET6 ? "IPv6 " : "", - host_, - port_, - r, - gai_err); - - return false; - } - - addrinfo_ptr result{naked_result}; - deadline_ = Clock::now() + opts_->connect_timeout(); + std::size_t retry_attempt = 1; + bool retry; // NOLINT(cppcoreguidelines-init-variables) do { retry = false; errors_.clear(); - for (::addrinfo* addr = naked_result; addr != nullptr; addr = addr->ai_next) { - C10D_INFO("The client socket is attempting to connect to {}.", *addr); + ::addrinfo *naked_result = nullptr; + // patternlint-disable cpp-dns-deps + int r = ::getaddrinfo(host_, port_.c_str(), &hints, &naked_result); + if (r != 0) { + const char* gai_err = ::gai_strerror(r); + + recordError("The {}network addresses of ({}, {}) cannot be retrieved (gai error: {} - {}).", + family == AF_INET ? "IPv4 " : family == AF_INET6 ? "IPv6 " : "", + host_, + port_, + r, + gai_err); + retry = true; + } else { + addrinfo_ptr result{naked_result}; + + for (::addrinfo* addr = naked_result; addr != nullptr; addr = addr->ai_next) { + C10D_TRACE("The client socket is attempting to connect to {}.", *addr); + + ConnectResult cr = tryConnect(*addr); + if (cr == ConnectResult::Success) { + return true; + } - ConnectResult cr = tryConnect(*addr); - if (cr == ConnectResult::Success) { - return true; + if (cr == ConnectResult::Retry) { + retry = true; + } } + } - if (cr == ConnectResult::TimeOut) { - auto msg = fmt::format( - "The client socket has timed out after {} while trying to connect to ({}, {}).", - opts_->connect_timeout(), - host_, - port_); + if (retry) { + if (Clock::now() < deadline_ - delay_duration_) { + // Prevent our log output to be too noisy, warn only every 30 seconds. + if (retry_attempt == 30) { + C10D_INFO("No socket on ({}, {}) is listening yet, will retry.", host_, port_); - C10D_ERROR(msg); + retry_attempt = 0; + } - throw TimeoutError{msg}; - } + // Wait one second to avoid choking the server. + delay(delay_duration_); - if (cr == ConnectResult::Retry) { - retry = true; + retry_attempt++; + } else { + throwTimeoutError(); } } } while (retry); @@ -673,7 +682,7 @@ bool SocketConnectOp::tryConnect(int family) { SocketConnectOp::ConnectResult SocketConnectOp::tryConnect(const ::addrinfo& addr) { if (Clock::now() >= deadline_) { - return ConnectResult::TimeOut; + throwTimeoutError(); } SocketImpl::Handle hnd = ::socket(addr.ai_family, addr.ai_socktype, addr.ai_protocol); @@ -698,16 +707,9 @@ SocketConnectOp::ConnectResult SocketConnectOp::tryConnect(const ::addrinfo& add // Retry if the server is not yet listening or if its backlog is exhausted. if (err == std::errc::connection_refused || err == std::errc::connection_reset) { - C10D_WARNING("The server socket on {} is not yet listening {}, will retry.", addr, err); + C10D_TRACE("The server socket on {} is not yet listening {}, will retry.", addr, err); - if (Clock::now() < deadline_ - delay_duration_) { - // Wait a little to avoid choking the server. - delay(delay_duration_); - - return ConnectResult::Retry; - } else { - return ConnectResult::TimeOut; - } + return ConnectResult::Retry; } else { recordError("The client socket has failed to connect to {} {}.", addr, err); @@ -715,10 +717,6 @@ SocketConnectOp::ConnectResult SocketConnectOp::tryConnect(const ::addrinfo& add } } - if (cr == ConnectResult::TimeOut) { - return cr; - } - socket_->closeOnExec(); // TODO: Remove once we fully migrate to non-blocking mode. 
@@ -750,7 +748,7 @@ SocketConnectOp::ConnectResult SocketConnectOp::tryConnectCore(const ::addrinfo& Duration remaining = deadline_ - Clock::now(); if (remaining <= Duration::zero()) { - return ConnectResult::TimeOut; + throwTimeoutError(); } ::pollfd pfd{}; @@ -761,7 +759,7 @@ SocketConnectOp::ConnectResult SocketConnectOp::tryConnectCore(const ::addrinfo& r = pollFd(&pfd, 1, static_cast(ms.count())); if (r == 0) { - return ConnectResult::TimeOut; + throwTimeoutError(); } if (r == -1) { return ConnectResult::Error; @@ -785,6 +783,18 @@ SocketConnectOp::ConnectResult SocketConnectOp::tryConnectCore(const ::addrinfo& } } +void SocketConnectOp::throwTimeoutError() const { + auto msg = fmt::format( + "The client socket has timed out after {} while trying to connect to ({}, {}).", + opts_->connect_timeout(), + host_, + port_); + + C10D_ERROR(msg); + + throw TimeoutError{msg}; +} + } // namespace void Socket::initialize() { diff --git a/torch/csrc/distributed/c10d/socket.h b/torch/csrc/distributed/c10d/socket.h index e247a2a0816a..c26900760fbe 100644 --- a/torch/csrc/distributed/c10d/socket.h +++ b/torch/csrc/distributed/c10d/socket.h @@ -1,4 +1,4 @@ -// Copyright (c) Facebook, Inc. and its affiliates. +// Copyright (c) Meta Platforms, Inc. and its affiliates. // All rights reserved. // // This source code is licensed under the BSD-style license found in the diff --git a/torch/csrc/distributed/rpc/agent_utils.cpp b/torch/csrc/distributed/rpc/agent_utils.cpp index 45ffb2903bb0..dae9c162fe9d 100644 --- a/torch/csrc/distributed/rpc/agent_utils.cpp +++ b/torch/csrc/distributed/rpc/agent_utils.cpp @@ -41,6 +41,113 @@ std::unordered_map collectNames( return nameToId; } +std::vector splitString( + const std::string& s, + const std::string& delim) { + std::vector tokens; + size_t start = 0; + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + size_t end; + // Iterate through each delimiter + while ((end = s.find(delim, start)) != std::string::npos) { + tokens.emplace_back(s.substr(start, end - start)); + start = end + delim.length(); + } + tokens.emplace_back(s.substr(start)); + return tokens; +} + +const std::string allWorkerInfosKey = "_ALL_WORKER_INFOS"; + +std::unordered_map collectCurrentNames( + ::c10d::PrefixStore store, + const worker_id_t selfId, + const std::string& selfName) { + std::vector selfNameVector( + (uint8_t*)selfName.c_str(), + (uint8_t*)selfName.c_str() + selfName.length()); + + // Check that ID does not already exist and set {ID : NAME} + std::vector resultVector = store.compareSet( + c10::to_string(selfId), std::vector(), selfNameVector); + TORCH_CHECK( + resultVector == selfNameVector, + "RPC worker id ", + selfId, + " is not unique. Worker ", + resultVector, + " and already has ID and ", + selfNameVector, + " cannot be added."); + + store.set(c10::to_string(selfId), selfNameVector); + + std::unordered_map nameToId; + nameToId.emplace(selfName, selfId); + + // Check to see if there is list of worker names in the store + bool worker_names_available = + store.check(std::vector{allWorkerInfosKey}); + std::string allWorkerInfos; + if (worker_names_available) { + // Get the current list of workers + std::vector allWorkerInfosKeyVector = store.get(allWorkerInfosKey); + allWorkerInfos = std::string( + (char*)allWorkerInfosKeyVector.data(), allWorkerInfosKeyVector.size()); + // workerInfos are comma separated with a comma at the end (e.g. + // "Name1-Rank1,Name2-Rank2,Name3-Rank2,") parse list of workers. 
+ if (!allWorkerInfos.empty()) { + for (const std::string& workerInfoString : splitString( + allWorkerInfos.substr(0, allWorkerInfos.size() - 1), ",")) { + auto workerInfoVec = splitString(workerInfoString, "-"); + std::string workerName = workerInfoVec.at(0); + int workerId = std::stoi(workerInfoVec.at(1)); + + TORCH_CHECK( + nameToId.find(workerName) == nameToId.end(), + "RPC worker name ", + workerName, + " is not unique. Workers ", + nameToId.find(workerName)->second, + " and ", + workerId, + " share the same name."); + + nameToId.emplace(workerName, workerId); + } + } + } + // Add own name to worker list + allWorkerInfos = fmt::format("{}{}-{},", allWorkerInfos, selfName, selfId); + std::vector allWorkerInfosVector( + (uint8_t*)allWorkerInfos.c_str(), + (uint8_t*)allWorkerInfos.c_str() + allWorkerInfos.length()); + store.set(allWorkerInfosKey, allWorkerInfosVector); + + return nameToId; +} + +void removeCurrentName( + ::c10d::PrefixStore store, + const worker_id_t selfId, + const std::string& selfName) { + // Get current list of names/ranks + std::vector allWorkerInfosKeyVector = store.get(allWorkerInfosKey); + std::string allWorkerInfos = std::string( + (char*)allWorkerInfosKeyVector.data(), allWorkerInfosKeyVector.size()); + + // Remove the current name and rank + std::string str_to_erase = fmt::format("{}-{},", selfName, selfId); + int start_position_to_erase = allWorkerInfos.find(str_to_erase); + allWorkerInfos.erase(start_position_to_erase, str_to_erase.length()); + + // Set the new data + std::vector newAllWorkerInfosVector( + (uint8_t*)allWorkerInfos.c_str(), + (uint8_t*)allWorkerInfos.c_str() + allWorkerInfos.length()); + store.set(allWorkerInfosKey, newAllWorkerInfosVector); +} + const string storeKeyBarrierId = "_ID_"; const string storeKeyProcessCount = "PROCESS_COUNT"; const string storeKeyActiveCallCount = "ACTIVE_CALLS"; diff --git a/torch/csrc/distributed/rpc/agent_utils.h b/torch/csrc/distributed/rpc/agent_utils.h index befa26b86037..0288e0c063bb 100644 --- a/torch/csrc/distributed/rpc/agent_utils.h +++ b/torch/csrc/distributed/rpc/agent_utils.h @@ -16,6 +16,24 @@ std::unordered_map collectNames( const std::string& selfName, const int worldSize); +// Ranks in dynamic RPC groups will initially call into this to establish the +// name-to-id mapping for the current peers in the group. The current rank will +// put its own worker info in the store and discover all the ranks that came +// before it. NOTE: This needs to be called with the Dynamic RPC group +// membership management token held. +std::unordered_map collectCurrentNames( + ::c10d::PrefixStore store, + const worker_id_t selfId, + const std::string& selfName); + +// Remove name frmo Store, used in dynamic RPC groups. +// NOTE: This needs to be called with the Dynamic RPC group +// membership management token held. +void removeCurrentName( + ::c10d::PrefixStore store, + const worker_id_t selfId, + const std::string& selfName); + // This performs a synchronization of all call counts by using store. // All RPC peers wait for others to join to exit at the same time. int syncCallCount( diff --git a/torch/csrc/distributed/rpc/init.cpp b/torch/csrc/distributed/rpc/init.cpp index fd6f3aca9485..7b8a2d1f18da 100644 --- a/torch/csrc/distributed/rpc/init.cpp +++ b/torch/csrc/distributed/rpc/init.cpp @@ -110,11 +110,26 @@ PyObject* rpc_init(PyObject* _unused, PyObject* noargs) { // c10::hash, so we need to use the qualified name // py::detail::hash, which unfortunately is in a detail namespace. 
.def(py::detail::hash(py::self)) // NOLINT - .def("__repr__", [](const WorkerInfo& workerInfo) { - std::ostringstream os; - os << workerInfo; - return os.str(); - }); + .def( + "__repr__", + [](const WorkerInfo& workerInfo) { + std::ostringstream os; + os << workerInfo; + return os.str(); + }) + .def(py::pickle( + /* __getstate__ */ + [](const WorkerInfo& workerInfo) { + return py::make_tuple(workerInfo.name_, workerInfo.id_); + }, + /* __setstate__ */ + [](py::tuple t) { + TORCH_CHECK(t.size() == 2, "Invalid WorkerInfo state."); + + WorkerInfo info( + t[0].cast(), t[1].cast()); + return info; + })); auto rpcAgent = shared_ptr_class_(module, "RpcAgent") @@ -122,7 +137,8 @@ PyObject* rpc_init(PyObject* _unused, PyObject* noargs) { "join", &RpcAgent::join, py::call_guard(), - py::arg("shutdown") = false) + py::arg("shutdown") = false, + py::arg("timeout") = 0) .def( "sync", &RpcAgent::sync, py::call_guard()) .def( @@ -561,7 +577,7 @@ PyObject* rpc_init(PyObject* _unused, PyObject* noargs) { [](const c10::intrusive_ptr<::c10d::Store>& store, std::string selfName, worker_id_t selfId, - int worldSize, + optional worldSize, TensorPipeRpcBackendOptions opts, std::unordered_map reverseDeviceMaps, std::vector devices) { @@ -588,7 +604,8 @@ PyObject* rpc_init(PyObject* _unused, PyObject* noargs) { "join", &TensorPipeAgent::join, py::call_guard(), - py::arg("shutdown") = false) + py::arg("shutdown") = false, + py::arg("timeout") = 0) .def( "shutdown", &TensorPipeAgent::shutdown, @@ -617,7 +634,17 @@ PyObject* rpc_init(PyObject* _unused, PyObject* noargs) { "_get_device_map", (DeviceMap(TensorPipeAgent::*)(const WorkerInfo& dst) const) & TensorPipeAgent::getDeviceMap, - py::call_guard()); + py::call_guard()) + .def( + "_get_backend_options", + &TensorPipeAgent::getBackendOptions, + py::call_guard()) + .def( + "_update_group_membership", + &TensorPipeAgent::updateGroupMembership, + py::call_guard()) + .def_readonly("is_static_group", &TensorPipeAgent::isStaticGroup_) + .def_property_readonly("store", &TensorPipeAgent::getStore); #endif // USE_TENSORPIPE diff --git a/torch/csrc/distributed/rpc/rpc_agent.h b/torch/csrc/distributed/rpc/rpc_agent.h index e50100e331f4..cd427d4a90ea 100644 --- a/torch/csrc/distributed/rpc/rpc_agent.h +++ b/torch/csrc/distributed/rpc/rpc_agent.h @@ -213,7 +213,7 @@ class TORCH_API RpcAgent { // Call sync and join all internal threads. This method should be called // before every RPC process exits. - virtual void join(bool shutdown = false) = 0; + virtual void join(bool shutdown = false, float timeout = 0) = 0; // Synchronize the this process with other ``RpcAgent`` processes. Block until // all ``RpcAgent``s reach this method and send all pending messages. diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp index aaaf3c673f75..7426eb20807a 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp +++ b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp @@ -41,7 +41,7 @@ std::vector getDevicesForTensors( const std::string& remoteName) { // If the deviceMap is overridden, use that instead. const auto errStr = c10::str( - "TensorPipe RPC backend only supports CPU tensors by default, please " + "TensorPipe RPC backend only supports CPU and Meta tensors by default, please " "move your tensors to CPU before sending them over RPC, or call " "`set_device_map` on `TensorPipeRpcBackendOptions` to explicitly " "configure device mapping. 
", @@ -51,7 +51,9 @@ std::vector getDevicesForTensors( devices.reserve(tensors.size()); bool hasMappedDevice = false; for (const auto& t : tensors) { - if (t.device().is_cpu()) { + if (t.device().is_meta()) { + devices.emplace_back(c10::kMeta); + } else if (t.device().is_cpu()) { const auto deviceIter = deviceMap.find(c10::kCPU); if (deviceIter == deviceMap.end()) { devices.emplace_back(c10::kCPU); @@ -113,7 +115,7 @@ std::vector getDevicesOfTensors( size_t deviceCount = 0; std::vector indexBitset; for (const torch::Tensor& tensor : tensors) { - if (!tensor.is_cpu()) { + if (!tensor.is_cpu() && !tensor.is_meta()) { c10::Device device = tensor.device(); if (!impl.has_value()) { impl.emplace(device.type()); @@ -342,9 +344,15 @@ void TensorPipeAgent::removeFromTimeoutMap(uint64_t messageId) { } } -void TensorPipeAgent::prepareNames() { - auto nameToId = collectNames( - rankToNameStore_, workerInfo_.id_, workerInfo_.name_, worldSize_); +void TensorPipeAgent::prepareNames(bool isStaticGroup) { + std::unordered_map nameToId; + if (isStaticGroup) { + nameToId = collectNames( + rankToNameStore_, workerInfo_.id_, workerInfo_.name_, worldSize_); + } else { + nameToId = collectCurrentNames( + rankToNameStore_, workerInfo_.id_, workerInfo_.name_); + } for (const auto& entry : nameToId) { const auto& workerName = entry.first; @@ -354,11 +362,35 @@ void TensorPipeAgent::prepareNames() { } } +void TensorPipeAgent::checkAndSetStaticGroup( + const c10::intrusive_ptr<::c10d::Store>& store) { + std::string isStaticGroupKey("rpcIsStaticGroup"); + + std::string isStaticGroupStr = isStaticGroup_ ? "true" : "false"; + std::vector isStaticGroupVec( + (uint8_t*)isStaticGroupStr.c_str(), + (uint8_t*)isStaticGroupStr.c_str() + isStaticGroupStr.length()); + std::vector returnedVec; + returnedVec = store->compareSet( + isStaticGroupKey, std::vector(), isStaticGroupVec); + std::string returnedVal = std::string(returnedVec.begin(), returnedVec.end()); + // In both cases, the returned value should be the value of isStaticGroupStr, + // otherwise there is a discrepency with initialization among one of the + // members + TORCH_CHECK( + returnedVal == isStaticGroupStr, + fmt::format( + "RPC group mixes statically and dynamically initialized members which is not supported. 
", + "Static group property is initialized as {} and is trying to be set as {} ", + isStaticGroup_, + returnedVal)); +} + TensorPipeAgent::TensorPipeAgent( const c10::intrusive_ptr<::c10d::Store>& store, std::string selfName, worker_id_t selfId, - int worldSize, + optional worldSize, TensorPipeRpcBackendOptions opts, std::unordered_map reverseDeviceMaps, std::vector devices, @@ -368,6 +400,8 @@ TensorPipeAgent::TensorPipeAgent( std::move(cb), std::chrono::milliseconds( (long)(opts.rpcTimeoutSeconds * kSecToMsConversion))), + isStaticGroup_(worldSize.has_value()), + store_(store), opts_(std::move(opts)), reverseDeviceMaps_(std::move(reverseDeviceMaps)), devices_(std::move(devices)), @@ -376,10 +410,16 @@ TensorPipeAgent::TensorPipeAgent( tensorpipe::ContextOptions().name(workerInfo_.name_))), rankToNameStore_("names", store), nameToAddressStore_("addrs", store), - shutdownStore_("shutdown", store), - worldSize_(worldSize) { + shutdownStore_("shutdown", store) { + if (isStaticGroup_) { + worldSize_ = worldSize.value(); + } + + // check the static group attribute against store + checkAndSetStaticGroup(store); + // collect worker names - prepareNames(); + prepareNames(isStaticGroup_); // Initialize the time-series metrics tracking map timeSeriesMetrics_.emplace(kGilAverageWaitTime, TimeSeriesMetricsTracker()); @@ -524,7 +564,11 @@ void TensorPipeAgent::pipeRead( return; } - std::vector streams = getStreamsFromPoolForDevices(devices_); + std::vector streams; + { + GroupMembershipLockGuard guard(groupMembershipMutex_, isStaticGroup_); + streams = getStreamsFromPoolForDevices(devices_); + } tensorpipe::Allocation tpAllocation; TensorpipeReadBuffers tpBuffers; std::tie(tpAllocation, tpBuffers) = @@ -604,24 +648,26 @@ void TensorPipeAgent::sendCompletedResponseMessage( for (const auto& tensor : responseMessage->tensors()) { const auto device = tensor.device(); - if (!device.is_cpu() && - std::find(devices_.begin(), devices_.end(), device) == - devices_.end()) { - std::ostringstream oss; - std::copy( - devices_.begin(), - devices_.end(), - std::ostream_iterator(oss, ", ")); - responseMessage = createExceptionResponse( - c10::str( - "RPC detected that a user-function output tensor on device ", - device, - ". This device is not one of the input tensor devices: ", - oss.str(), - "which is not yet supported. Please file a feature request " - "issue in PyTorch GitHub repo."), - messageId); - break; + if (!device.is_cpu() && !device.is_meta()) { + GroupMembershipLockGuard guard(groupMembershipMutex_, isStaticGroup_); + if (std::find(devices_.begin(), devices_.end(), device) == + devices_.end()) { + std::ostringstream oss; + std::copy( + devices_.begin(), + devices_.end(), + std::ostream_iterator(oss, ", ")); + responseMessage = createExceptionResponse( + c10::str( + "RPC detected that a user-function output tensor on device ", + device, + ". This device is not one of the input tensor devices: ", + oss.str(), + "which is not yet supported. 
Please file a feature request " + "issue in PyTorch GitHub repo."), + messageId); + break; + } } } @@ -784,7 +830,12 @@ c10::intrusive_ptr TensorPipeAgent::send( } ClientPipe& clientPipe = it->second; - auto futureResponseMessage = std::make_shared(devices_); + std::shared_ptr + futureResponseMessage; + { + GroupMembershipLockGuard guard(groupMembershipMutex_, isStaticGroup_); + futureResponseMessage = std::make_shared(devices_); + } uint64_t messageId = nextMessageID_++; requestMessage->setId(messageId); @@ -844,7 +895,11 @@ c10::intrusive_ptr TensorPipeAgent::send( VLOG(1) << "RPC agent for " << workerInfo_.name_ << " is sending request #" << messageId << " to " << clientPipe.pipe_->getRemoteName(); - std::vector streams = getStreamsFromPoolForDevices(devices_); + std::vector streams; + { + GroupMembershipLockGuard guard(groupMembershipMutex_, isStaticGroup_); + streams = getStreamsFromPoolForDevices(devices_); + } makeStreamsWaitOnOthers( streams, getCurrentStreamsForDevices( @@ -1011,9 +1066,27 @@ void TensorPipeAgent::pollTimeoutRpcs() { } } +void TensorPipeAgent::leaveGroup() { + std::unique_lock lock(callCountMutex_); + // local worker ActiveCallCount is 0 at this point and we will shutdown + // (any future calls will be dropped) + callCountCV_.wait(lock, [this] { return clientActiveCalls_ == 0; }); + + // Remove this agent's WorkerInfo from store + removeCurrentName(rankToNameStore_, workerInfo_.id_, workerInfo_.name_); + + // Set internal variable to be used during destructor + shuttingDown_ = true; +} + // TODO: Remove join() -void TensorPipeAgent::join(bool shutdown) { +void TensorPipeAgent::join(bool shutdown, float /* unused */) { VLOG(1) << "RPC agent for " << workerInfo_.name_ << " is joining"; + if (!isStaticGroup_) { + leaveGroup(); + return; + } + // This method behaves like a barrier, as it can only return once all workers // have no more requests pending, including "nested" requests (triggered from // within the remote code of another call) and "follow-up" requests (triggered @@ -1024,6 +1097,7 @@ void TensorPipeAgent::join(bool shutdown) { // It is enough to wait for there to be no more active client calls, since // each server call corresponds to a client call for some other worker. callCountCV_.wait(lock, [this] { return clientActiveCalls_ == 0; }); + // We'd like to immediately proceed with the allreduce, but it's a call // that may block for some time, as it waits for other workers to also // complete all their active client calls. 
While we call allreduce we must @@ -1096,16 +1170,34 @@ void TensorPipeAgent::shutdownImpl() { const WorkerInfo& TensorPipeAgent::getWorkerInfo( const std::string& workerName) const { - const auto& it = workerNameToInfo_.find(workerName); + std::unordered_map::const_iterator it; + { + GroupMembershipLockGuard guard(groupMembershipMutex_, isStaticGroup_); + it = workerNameToInfo_.find(workerName); + } TORCH_CHECK( - it != workerNameToInfo_.end(), "Unknown destination worker ", workerName); + it != workerNameToInfo_.end(), + fmt::format( + "name:{},rank:{} could not find destination name {}", + workerInfo_.name_, + workerInfo_.id_, + workerName)); return it->second; } const WorkerInfo& TensorPipeAgent::getWorkerInfo(worker_id_t workerId) const { - const auto& it = workerIdToInfo_.find(workerId); + std::unordered_map::const_iterator it; + { + GroupMembershipLockGuard guard(groupMembershipMutex_, isStaticGroup_); + it = workerIdToInfo_.find(workerId); + } TORCH_CHECK( - it != workerIdToInfo_.end(), "Unknown destination worker ", workerId); + it != workerIdToInfo_.end(), + fmt::format( + "name:{},rank:{} could not find destination id {}", + workerInfo_.name_, + workerInfo_.id_, + workerId)); return it->second; } @@ -1119,12 +1211,74 @@ std::vector TensorPipeAgent::getWorkerInfos() const { const std::string& TensorPipeAgent::findWorkerURL( const WorkerInfo& worker) const { - const auto it = workerNameToURL_.find(worker.name_); + std::unordered_map::const_iterator it; + { + GroupMembershipLockGuard guard(groupMembershipMutex_, isStaticGroup_); + it = workerNameToURL_.find(worker.name_); + } TORCH_CHECK( - it != workerNameToURL_.end(), "Unknown worker name: ", worker.name_); + it != workerNameToURL_.end(), + fmt::format( + "name:{},rank:{} could not find destination url for name {}", + workerInfo_.name_, + workerInfo_.id_, + worker.name_)); return it->second; } +void TensorPipeAgent::updateGroupMembership( + const WorkerInfo& workerInfo, + const std::vector devices, + const std::unordered_map reverseDeviceMaps, + bool isJoin) { + std::string name = workerInfo.name_; + worker_id_t id = workerInfo.id_; + // Rank with workerInfo is joining the group, update internal mappings + if (isJoin) { + GroupMembershipLockGuard guard(groupMembershipMutex_, isStaticGroup_); + workerIdToInfo_.emplace(id, workerInfo); + workerNameToInfo_.emplace(name, workerInfo); + + // TODO: we should get nodeAddrStr in the joining process, then pass in as + // an argument rather than getting from store each time + auto nodeAddrData = nameToAddressStore_.get(name); + auto nodeAddrStr = + std::string((const char*)nodeAddrData.data(), nodeAddrData.size()); + workerNameToURL_.insert({name, nodeAddrStr}); + + for (const auto& it : reverseDeviceMaps) { + if (reverseDeviceMaps_.find(it.first) == reverseDeviceMaps_.end()) { + reverseDeviceMaps_[it.first] = it.second; + } + } + // TODO: clean up mutex for devices_ usage + // Add devices that have not been added yet + for (const auto& it : devices) { + if (std::find(devices_.begin(), devices_.end(), it) == devices_.end()) { + devices_.push_back(it); + } + } + } else { + workerIdToInfo_.erase(id); + workerNameToInfo_.erase(name); + workerNameToURL_.erase(name); + + for (const auto& it : reverseDeviceMaps_) { + if (reverseDeviceMaps.find(it.first) == reverseDeviceMaps.end()) { + reverseDeviceMaps_.erase(it.first); + } + } + + auto iter = devices_.begin(); + while (iter != devices_.end()) { + if (std::find(devices.begin(), devices.end(), *iter) == devices.end()) { + iter = 
devices_.erase(iter); + } else { + iter++; + } + } + } +} std::unordered_map TensorPipeAgent::getMetrics() { std::unordered_map metrics; metrics[kThreadPoolSize] = c10::to_string(threadPool_.size()); @@ -1252,11 +1406,14 @@ void TensorPipeAgent::markFutureWithError( std::vector TensorPipeAgent::getDevicesForRemote( const std::string& remoteName, const Message& message) const { - const auto& deviceMaps = - message.isRequest() ? opts_.deviceMaps : reverseDeviceMaps_; + std::unordered_map deviceMaps; + { + GroupMembershipLockGuard guard(groupMembershipMutex_, isStaticGroup_); + deviceMaps = message.isRequest() ? opts_.deviceMaps : reverseDeviceMaps_; + } const auto errStr = c10::str( - "TensorPipe RPC backend only supports CPU tensors by default, please " + "TensorPipe RPC backend only supports CPU and Meta tensors by default, please " "move your tensors to CPU before sending them over RPC, or call " "`set_device_map` on `TensorPipeRpcBackendOptions` to explicitly " "configure device mapping. ", @@ -1268,7 +1425,7 @@ std::vector TensorPipeAgent::getDevicesForRemote( if (iter == deviceMaps.end()) { for (const auto& t : message.tensors()) { TORCH_CHECK( - t.device().is_cpu(), + t.device().is_cpu() || t.device().is_meta(), errStr, ", but found tensor on device: ", t.device()); @@ -1287,7 +1444,16 @@ DeviceMap TensorPipeAgent::getDeviceMap(const WorkerInfo& dst) const { return it->second; } +const c10::intrusive_ptr<::c10d::Store> TensorPipeAgent::getStore() const { + return store_; +} + +TensorPipeRpcBackendOptions TensorPipeAgent::getBackendOptions() const { + return opts_; +} + const std::vector& TensorPipeAgent::getDevices() const { + GroupMembershipLockGuard guard(groupMembershipMutex_, isStaticGroup_); return devices_; } diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.h b/torch/csrc/distributed/rpc/tensorpipe_agent.h index b76e1a099beb..2ad3ef6a0d75 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_agent.h +++ b/torch/csrc/distributed/rpc/tensorpipe_agent.h @@ -165,7 +165,7 @@ class TORCH_API TensorPipeAgent : public RpcAgent { const c10::intrusive_ptr<::c10d::Store>& store, std::string selfName, worker_id_t selfId, - int worldSize, + optional worldSize, TensorPipeRpcBackendOptions opts, std::unordered_map reverseDeviceMaps, std::vector devices, @@ -182,7 +182,7 @@ class TORCH_API TensorPipeAgent : public RpcAgent { // join() and sync() would be deprecated - // https://github.com/pytorch/pytorch/issues/27647 - void join(bool shutdown = false) override; + void join(bool shutdown = false, float timeout = 0) override; void sync() override{}; void startImpl() override; void shutdownImpl() override; @@ -192,11 +192,20 @@ class TORCH_API TensorPipeAgent : public RpcAgent { const WorkerInfo& getWorkerInfo(const std::string& workerName) const override; const WorkerInfo& getWorkerInfo(worker_id_t workerId) const override; std::vector getWorkerInfos() const override; + void updateGroupMembership( + const WorkerInfo& workerInfo, + const std::vector devices, + const std::unordered_map reverseDeviceMaps, + bool isJoin); std::unordered_map getMetrics() override; void addGilWaitTime(const std::chrono::microseconds gilWaitTime) override; + TensorPipeRpcBackendOptions getBackendOptions() const; + + const c10::intrusive_ptr<::c10d::Store> getStore() const; + DeviceMap getDeviceMap(const WorkerInfo& dest) const override; const std::vector& getDevices() const override; @@ -216,6 +225,8 @@ class TORCH_API TensorPipeAgent : public RpcAgent { size_t numPendingResponses(); size_t 
messageIdToTimeoutMapSize(); + const bool isStaticGroup_; + protected: // TensorPipe write function that could be used to write response // messages by server, and write request messages by client. This @@ -233,10 +244,16 @@ class TORCH_API TensorPipeAgent : public RpcAgent { void removeFromTimeoutMap(uint64_t messageId); // Populates workerIdToInfo_ and workerNameToInfo_ using addressStore_ - void prepareNames(); + void prepareNames(bool isStaticGroup); + + // Check the static group attribute with the value set in store + void checkAndSetStaticGroup(const c10::intrusive_ptr<::c10d::Store>& store); const std::string& findWorkerURL(const WorkerInfo& worker) const; + // Only use for Dynamic RPC groups, method to have worker leave group + void leaveGroup(); + // TensorPipe read function that could be used to read response messages // by client, and read request messages by server. void pipeRead( @@ -307,12 +324,16 @@ class TORCH_API TensorPipeAgent : public RpcAgent { pendingResponseMessage_; }; + const c10::intrusive_ptr<::c10d::Store> store_; + const TensorPipeRpcBackendOptions opts_; - const std::unordered_map reverseDeviceMaps_; + // For dynamic RPC, the reverse device maps are updated whenever a new rank + // joins or leaves the group + std::unordered_map reverseDeviceMaps_; // Local devices used by this agent. If application didn't specify this // field, it will be initialized using corresponding local devices in // opts_.deviceMaps and reverseDeviceMaps_; - const std::vector devices_; + std::vector devices_; ThreadPool threadPool_; std::shared_ptr context_; @@ -331,8 +352,7 @@ class TORCH_API TensorPipeAgent : public RpcAgent { // Store keys that will used to count joined processes and active calls during // the shutdown process ::c10d::PrefixStore shutdownStore_; - const int worldSize_; - + int worldSize_ = 0; std::atomic nextMessageID_{0}; // Metadata used for tracking of whether certain RPCs have timed out or not. @@ -410,6 +430,31 @@ class TORCH_API TensorPipeAgent : public RpcAgent { // Mutex to guard timeSeriesMetrics_ std::mutex metricsMutex_; + // Custom lock guard used to check if the RPC group is dynamic and lock the + // mutex if so + struct GroupMembershipLockGuard { + GroupMembershipLockGuard(std::mutex& mutex, bool isStaticGroup) + : ref_(mutex), isStaticGroup_(isStaticGroup) { + if (isStaticGroup_) { + ref_.lock(); + } + } + + ~GroupMembershipLockGuard() { + if (isStaticGroup_) { + ref_.unlock(); + } + } + + private: + GroupMembershipLockGuard(const GroupMembershipLockGuard&); + std::mutex& ref_; + bool isStaticGroup_; + }; + // Mutex to guard access to group membership data + // e.g. updates to (workerIdToInfo_, workerNameToInfo_, workerNameToURL_) + mutable std::mutex groupMembershipMutex_; + // Map to Track Network Data NetworkDataDict networkData_; // Mutex to guard networkData_ diff --git a/torch/csrc/distributed/rpc/tensorpipe_utils.cpp b/torch/csrc/distributed/rpc/tensorpipe_utils.cpp index ee66f3108e52..e59ba06044d4 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_utils.cpp +++ b/torch/csrc/distributed/rpc/tensorpipe_utils.cpp @@ -163,12 +163,20 @@ std::tuple tensorpipeSerialize( buffers.tensors = cloneSparseTensors(rpcMessage->tensors()).vec(); } + // The corresponding unpickler in `tensorpipeDeserialize` uses tensor id as an index in `buffers.tensors` + // (see tensorReadFunc). 
Meta tensors don't have data and are not present in buffers.tensors, so to skip + // meta tensors `non_meta_idx` is used to generate consecutive indices for non-meta tensors in `buffers.tensors`. + // The `meta_idx` is used to generate unique ids for the remaining meta tensors, but they are unused. + int non_meta_idx = 0; + int meta_idx = std::count_if(buffers.tensors.begin(), buffers.tensors.end(), [](auto& t) { return !t.is_meta(); }); torch::jit::Pickler pickler([&](const void* buf, size_t sz) -> size_t { buffers.pickle.insert( buffers.pickle.end(), static_cast(buf), static_cast(buf) + sz); return sz; + }, nullptr, nullptr, nullptr, [&](const at::Tensor& t) -> std::string { + return std::to_string(!t.is_meta() ? non_meta_idx++ : meta_idx++); }); pickler.protocol(); pickler.pushIValue(buffers.tensors); @@ -177,10 +185,19 @@ std::tuple tensorpipeSerialize( tpMessage.payloads.push_back(tensorpipe::Message::Payload{ buffers.pickle.data(), buffers.pickle.size()}); const std::vector& tensorDataVec = pickler.tensorData(); - tpMessage.tensors.reserve(tensorDataVec.size()); + // meta tensors don't have data and are not serialized to tpMessage.tensors + int nonMetaTensorsSize = std::count_if(tensorDataVec.begin(), tensorDataVec.end(), + [](auto& t) { return !t.is_meta(); }); + tpMessage.tensors.reserve(nonMetaTensorsSize); + int metaTensorsCounter = 0; for (const auto i : c10::irange(tensorDataVec.size())) { const torch::Tensor& tensor = tensorDataVec[i]; + if (tensor.is_meta()) { + metaTensorsCounter++; + continue; + } + const TensorpipeDeviceTypeConverter* converter = getDeviceTypeConverter(tensor.device().type()); TORCH_CHECK( @@ -188,11 +205,11 @@ std::tuple tensorpipeSerialize( "Attempting to send a Tensor with unexpected device type ", tensor.device()); - TORCH_INTERNAL_ASSERT(tpMessage.tensors.size() == i); + TORCH_INTERNAL_ASSERT(tpMessage.tensors.size() == i - metaTensorsCounter); c10::optional> maybeCopiedTensor = converter->prepareTensorForSending( tensor.storage(), streams, tpMessage); - TORCH_INTERNAL_ASSERT(tpMessage.tensors.size() == i + 1); + TORCH_INTERNAL_ASSERT(tpMessage.tensors.size() == i + 1 - metaTensorsCounter); tensorpipe::Device targetDevice = devices.empty() || devices[i].is_cpu() ? 
tensorpipe::Device{tensorpipe::kCpuDeviceType, 0} @@ -311,8 +328,13 @@ c10::intrusive_ptr tensorpipeDeserialize( tensors.emplace_back(std::move(t)); } - for (const auto i : c10::irange(tpDescriptor.tensors.size())) { - auto& tensor = tpDescriptor.tensors[i]; + int metaTensorsCounter = 0; + for (const auto i : c10::irange(tensors.size())) { + if (tensors[i].is_meta()) { + metaTensorsCounter++; + continue; + } + auto& tensor = tpDescriptor.tensors[i - metaTensorsCounter]; if (tensor.targetDevice.has_value() && tensor.targetDevice->type == tensorpipe::kCudaDeviceType) { TORCH_INTERNAL_ASSERT( diff --git a/torch/csrc/distributed/rpc/testing/init.cpp b/torch/csrc/distributed/rpc/testing/init.cpp index ae40f0897ce0..fc2dc156f7d5 100644 --- a/torch/csrc/distributed/rpc/testing/init.cpp +++ b/torch/csrc/distributed/rpc/testing/init.cpp @@ -98,7 +98,8 @@ PyObject* faulty_agent_init(PyObject* _unused, PyObject* noargs) { "join", &TensorPipeAgent::join, py::call_guard(), - py::arg("shutdown") = false) + py::arg("shutdown") = false, + py::arg("timeout") = 0) .def( "shutdown", &TensorPipeAgent::shutdown, diff --git a/torch/csrc/distributed/rpc/torchscript_functions.cpp b/torch/csrc/distributed/rpc/torchscript_functions.cpp index 464a290de1dc..8afbc8135914 100644 --- a/torch/csrc/distributed/rpc/torchscript_functions.cpp +++ b/torch/csrc/distributed/rpc/torchscript_functions.cpp @@ -21,10 +21,7 @@ c10::intrusive_ptr rpcTorchscript( std::vector& stack, const float rpcTimeoutSeconds, const bool isAsyncExecution) { - // This dummy tensor holds an at::RecordFunction when profiling is enabled. - // This is because at::RecordFunction is not yet registered as a TorchScript - // custom class (https://github.com/pytorch/pytorch/issues/35026) - at::Tensor handle = at::zeros(1); + c10::intrusive_ptr record; auto shouldProfile = torch::autograd::profiler::profilerEnabled() && !torch::distributed::rpc::RemoteProfilerManager::getInstance() .isCurrentKeySet(); @@ -35,7 +32,8 @@ c10::intrusive_ptr rpcTorchscript( .qualifiedName(), /* name of torchscript function being run */ RpcAgent::getCurrentRpcAgent()->getWorkerInfo().name_, dstWorkerName); - handle = torch::autograd::profiler::record_function_enter(rpcAsyncJitKey); + record = + torch::autograd::profiler::record_function_enter_new(rpcAsyncJitKey); auto& remoteProfilerManager = torch::distributed::rpc::RemoteProfilerManager::getInstance(); remoteProfilerManager.setCurrentKey(rpcAsyncJitKey); @@ -75,7 +73,8 @@ c10::intrusive_ptr rpcTorchscript( })); if (shouldProfile) { auto profiledFutPtr = - torch::autograd::profiler::_call_end_callbacks_on_fut(handle, futPtr); + torch::autograd::profiler::_call_end_callbacks_on_fut_new( + record, futPtr); return profiledFutPtr; } return futPtr; diff --git a/torch/csrc/generic/Storage.cpp b/torch/csrc/generic/Storage.cpp index 539c01cad245..4743ba1a8627 100644 --- a/torch/csrc/generic/Storage.cpp +++ b/torch/csrc/generic/Storage.cpp @@ -144,7 +144,7 @@ static PyObject * THPStorage_(get)(THPStorage *self, PyObject *index) int64_t nindex = THPUtils_unpackLong(index); if (nindex < 0) nindex += (self->cdata->nbytes() / sizeof(scalar_t)); - if (nindex < 0 || nindex >= (self->cdata->nbytes() / sizeof(scalar_t))) { + if (nindex < 0 || nindex >= static_cast(self->cdata->nbytes() / sizeof(scalar_t))) { PyErr_SetString(PyExc_IndexError, fmt::format( "index {} out of range for storage of size {}", nindex, self->cdata->nbytes() / sizeof(scalar_t))); @@ -344,7 +344,7 @@ bool THPStorage_(init)(PyObject *module) void 
THPStorage_(postInit)(PyObject *module) { - THPStorageClass = PyObject_GetAttrString(module, "UntypedStorage"); + THPStorageClass = PyObject_GetAttrString(module, "_UntypedStorage"); if (!THPStorageClass) throw python_error(); at::Backend backend = at::Backend::CPU; diff --git a/torch/csrc/generic/StorageSharing.cpp b/torch/csrc/generic/StorageSharing.cpp index 01cd5c49998b..701df7daaa0c 100644 --- a/torch/csrc/generic/StorageSharing.cpp +++ b/torch/csrc/generic/StorageSharing.cpp @@ -282,13 +282,9 @@ static PyObject * THPStorage_(shareCuda)(PyObject *_self, PyObject *noargs) // NOLINTNEXTLINE(cppcoreguidelines-init-variables) cudaIpcEventHandle_t ipc_event_handle; -#if !defined(USE_ROCM) if (sent_data->event_sync_required_) { C10_CUDA_CHECK(cudaIpcGetEventHandle(&ipc_event_handle, sent_data->event_)); } -#else - // ipc_event_handle unused in storage receiver, we can leave it uninitialized. -#endif _event_handle = PyBytes_FromStringAndSize((char *)&ipc_event_handle, CUDA_IPC_HANDLE_SIZE); _event_sync_required = PyBool_FromLong(sent_data->event_sync_required_); @@ -400,7 +396,6 @@ static PyObject * THPStorage_(newSharedCuda)(PyObject *_unused, PyObject *args) int64_t device = THPUtils_unpackLong(_device); at::cuda::CUDAGuard device_guard(device); -#if !defined(USE_ROCM) if (PyObject_IsTrue(_event_sync_required)) { // Ensure that producer prepared all tensor's data std::string s_ipc_event_handle = @@ -413,9 +408,6 @@ static PyObject * THPStorage_(newSharedCuda)(PyObject *_unused, PyObject *args) AT_CUDA_CHECK( cudaStreamWaitEvent(c10::cuda::getCurrentCUDAStream(device), event, 0)); } -#else - // Already synchronized inside producer stream -#endif std::string s_handle = THPStorage_(bytesAsHandleString)(_handle); std::shared_ptr basePtr = c10::cuda::CUDACachingAllocator::getIpcDevPtr(s_handle); diff --git a/torch/csrc/init_flatbuffer_module.cpp b/torch/csrc/init_flatbuffer_module.cpp new file mode 100644 index 000000000000..77bb302423fe --- /dev/null +++ b/torch/csrc/init_flatbuffer_module.cpp @@ -0,0 +1,116 @@ +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include // NOLINT +#include +#include +#include +#include +#include +#include + +namespace py = pybind11; + +static std::shared_ptr copyStr(const std::string& bytes) { + size_t size = (bytes.size() / FLATBUFFERS_MAX_ALIGNMENT + 1) * + FLATBUFFERS_MAX_ALIGNMENT; +#ifdef _WIN32 + std::shared_ptr bytes_copy( + static_cast(_aligned_malloc(size, FLATBUFFERS_MAX_ALIGNMENT)), + _aligned_free); +#elif defined(__APPLE__) + void* p; + ::posix_memalign(&p, FLATBUFFERS_MAX_ALIGNMENT, size); + TORCH_INTERNAL_ASSERT(p, "Could not allocate memory for flatbuffer"); + std::shared_ptr bytes_copy(static_cast(p), free); +#else + std::shared_ptr bytes_copy( + static_cast(aligned_alloc(FLATBUFFERS_MAX_ALIGNMENT, size)), free); +#endif + memcpy(bytes_copy.get(), bytes.data(), bytes.size()); + return bytes_copy; +} + +extern "C" +#ifdef _WIN32 + __declspec(dllexport) +#endif + PyObject* initModuleFlatbuffer() { + using namespace torch::jit; + PyMethodDef m[] = {{nullptr, nullptr, 0, nullptr}}; // NOLINT + static struct PyModuleDef torchmodule = { + PyModuleDef_HEAD_INIT, + "torch._C_flatbuffer", + nullptr, + -1, + m, + }; // NOLINT + PyObject* module = PyModule_Create(&torchmodule); + auto pym = py::handle(module).cast(); + pym.def("_load_mobile_module_from_file", [](const std::string& filename) { + return torch::jit::load_mobile_module_from_file(filename); + }); + pym.def("_load_mobile_module_from_bytes", 
[](const std::string& bytes) { + auto bytes_copy = copyStr(bytes); + return torch::jit::parse_and_initialize_mobile_module( + bytes_copy, bytes.size()); + }); + pym.def("_load_jit_module_from_file", [](const std::string& filename) { + ExtraFilesMap extra_files = ExtraFilesMap(); + return torch::jit::load_jit_module_from_file(filename, extra_files); + }); + pym.def("_load_jit_module_from_bytes", [](const std::string& bytes) { + auto bytes_copy = copyStr(bytes); + ExtraFilesMap extra_files = ExtraFilesMap(); + return torch::jit::parse_and_initialize_jit_module( + bytes_copy, bytes.size(), extra_files); + }); + pym.def( + "_save_mobile_module", + [](const torch::jit::mobile::Module& module, + const std::string& filename) { + return torch::jit::save_mobile_module(module, filename); + }); + pym.def( + "_save_jit_module", + [](const torch::jit::Module& module, const std::string& filename) { + return torch::jit::save_jit_module(module, filename); + }); + pym.def( + "_save_mobile_module_to_bytes", + [](const torch::jit::mobile::Module& module) { + auto detached_buffer = torch::jit::save_mobile_module_to_bytes(module); + return py::bytes( + reinterpret_cast(detached_buffer.data()), + detached_buffer.size()); + }); + pym.def("_save_jit_module_to_bytes", [](const torch::jit::Module& module) { + auto detached_buffer = torch::jit::save_jit_module_to_bytes(module); + return py::bytes( + reinterpret_cast(detached_buffer.data()), + detached_buffer.size()); + }); + pym.def("_get_module_info_from_flatbuffer", [](std::string flatbuffer_content) { + py::gil_scoped_acquire acquire; + py::dict result; + mobile::ModuleInfo minfo = torch::jit::get_module_info_from_flatbuffer( + &flatbuffer_content[0]); + result["bytecode_version"] = minfo.bytecode_version; + result["operator_version"] = minfo.operator_version; + result["function_names"] = minfo.function_names; + result["type_names"] = minfo.type_names; + result["opname_to_num_args"] = minfo.opname_to_num_args; + return result; + }); + + return module; +} diff --git a/torch/csrc/jit/api/function_impl.cpp b/torch/csrc/jit/api/function_impl.cpp index 774136f3f455..356a67b9dfe9 100644 --- a/torch/csrc/jit/api/function_impl.cpp +++ b/torch/csrc/jit/api/function_impl.cpp @@ -88,6 +88,9 @@ const c10::FunctionSchema& GraphFunction::getSchema() const { } GraphFunction::SpecializationKey GraphFunction::currentSpecialization() const { + if (force_no_amp_) { + return SpecializationKey::AutocastOff; + } #ifdef C10_MOBILE // disabling autodiff pass for mobile build since autocast APIs don't exist return SpecializationKey::AutocastOff; @@ -105,7 +108,7 @@ GraphFunction::SpecializationKey GraphFunction::currentSpecialization() const { #endif } -void preoptimizeGraph(std::shared_ptr& graph) { +void preoptimizeGraph(std::shared_ptr& graph, bool disable_autocast) { Inline(*graph); // Peephole Optimize cleans up many "is None" checks and creates constant prop @@ -125,7 +128,9 @@ void preoptimizeGraph(std::shared_ptr& graph) { // of the any optimizations // 2. 
AMP transformations would benefit from followup passes's cleanup // - Autocast(graph); + if (!disable_autocast) { + Autocast(graph); + } #endif ConstantPooling(graph); diff --git a/torch/csrc/jit/api/function_impl.h b/torch/csrc/jit/api/function_impl.h index c92e46a352e3..fb68e3e648d0 100644 --- a/torch/csrc/jit/api/function_impl.h +++ b/torch/csrc/jit/api/function_impl.h @@ -13,10 +13,14 @@ struct TORCH_API GraphFunction : public Function { GraphFunction( c10::QualifiedName name, std::shared_ptr graph, - std::function function_creator) + std::function function_creator, + c10::optional executor_execution_mode = + c10::nullopt) : name_(std::move(name)), graph_(std::move(graph)), - function_creator_(std::move(function_creator)) {} + function_creator_(std::move(function_creator)) { + executor_execution_mode_ = executor_execution_mode; + } bool isGraphFunction() const override { return true; @@ -44,7 +48,7 @@ struct TORCH_API GraphFunction : public Function { } optimized_graph = graph_->copy(); if (getGraphExecutorOptimize()) { - preoptimizeGraph(*optimized_graph); + preoptimizeGraph(*optimized_graph, force_no_amp_); } return *optimized_graph; } @@ -53,6 +57,19 @@ struct TORCH_API GraphFunction : public Function { return name_; } + // private/unstable api. sets the initial execution mode + // will not affect executor if there is an existing executor + // created for this function + void _set_initial_executor_execution_mode(ExecutorExecutionMode mode) { + executor_execution_mode_ = mode; + } + // private/unstable api. sets flag of whether or not to ignore amp. + // will not affect executor if there is an existing executor + // created for this function + void _set_ignore_amp(bool ignore_amp) { + force_no_amp_ = ignore_amp; + } + // if this isn't yet defined, run its method_creator function void ensure_defined() override; @@ -92,14 +109,20 @@ struct TORCH_API GraphFunction : public Function { return *executor; } check_single_output(); - executor = GraphExecutor(optimized_graph(), name_.name()); + const std::string& name = name_.name(); + std::shared_ptr opt_graph = optimized_graph(); + if (!executor_execution_mode_) { + executor = GraphExecutor(opt_graph, name); + } else { + executor = GraphExecutor(opt_graph, name, *executor_execution_mode_); + } return *executor; } using Function::call; bool call( Stack& stack, - size_t bailOut, + c10::optional bailOut, c10::function_ref f) override { f(get_executor().getPlanFor(stack, bailOut).code); return true; @@ -128,6 +151,13 @@ struct TORCH_API GraphFunction : public Function { // The original, non-optimized graph std::shared_ptr graph_; // for debugging and for inlining + // allows users to specify Simple/Profiling Executor for function + // TODO: add more executors + mutable c10::optional executor_execution_mode_; + + // if invoked on a graph that has already traced through amp + // don't invoke amp pass + mutable bool force_no_amp_ = false; // Optimized graph, computed lazily. Used for inlining. 
mutable std::array< c10::optional>, diff --git a/torch/csrc/jit/api/module.h b/torch/csrc/jit/api/module.h index c2506c6a9ecb..a6aa49278cbe 100644 --- a/torch/csrc/jit/api/module.h +++ b/torch/csrc/jit/api/module.h @@ -223,12 +223,14 @@ struct TORCH_API Module : public Object { void _save_for_mobile( std::ostream& out, const ExtraFilesMap& extra_files = ExtraFilesMap(), - bool save_mobile_debug_info = false) const; + bool save_mobile_debug_info = false, + bool use_flatbuffer = false) const; void _save_for_mobile( const std::string& filename, const ExtraFilesMap& extra_files = ExtraFilesMap(), - bool save_mobile_debug_info = false) const; + bool save_mobile_debug_info = false, + bool use_flatbuffer = false) const; Module copy() const; @@ -265,6 +267,10 @@ struct TORCH_API Module : public Object { return _ivalue() == y._ivalue(); } + void set_delete_memory(std::shared_ptr delete_mem) { + mem_to_delete_ = delete_mem; + } + private: Module clone_impl( std::unordered_map& type_remap, @@ -286,6 +292,9 @@ struct TORCH_API Module : public Object { const c10::optional& device, const c10::optional& dtype, bool non_blocking); + + // Extra handle for the module to delete when itself is deleted + std::shared_ptr mem_to_delete_; }; // C++ equivalent api of `torch.jit.freeze`. See documentation there for @@ -301,6 +310,45 @@ TORCH_API Module optimize_for_inference( Module& module, const std::vector& other_methods = {}); +enum class FusionBehavior { STATIC, DYNAMIC }; + +using FusionStrategy = std::vector>; +// clang-format off +/* +Sets the type and number of specializations that can occur during fusion. + +Usage: provide a list of pairs (type, depth) where type is one of STATIC or DYNAMIC +and depth is an integer. + +Behavior - static vs dynamic: + In STATIC fusion, fused ops are compiled to have fixed input shapes. The shape is determined + based on some initial profiling runs. + In DYNAMIC fusion, fused ops are compiled to have variable input shapes, so that multiple + shapes are possible. + +In both cases, we also recompile on new striding behavior, device, or dtype. + +Behavior - fallback functions & depth: + When an input doesn't match the format required by the specialized compiled op, it will run + a fallback function. Fallback functions are recursively be compiled and specialized based + on the observed tensor shapes. Since compilation can be slow, the "depth" parameter is provided to + limit the number of specializations that can be compiled, before giving up on recompiling and + falling back to a completely un-fused, un-specialized implementation. + +The list of (type, depth) pairs controls the type of specializations and the number of +specializations. For example: [(STATIC, 2), (DYNAMIC, 2)] indicates that the first +two specializations will use static fusions, the following two specializations will use +dynamic fusion, and any inputs that satisfy none of the 4 options will run an +unfused implementation. + +NB: in the future, if more as more fusion backends are added there may be more granular +apis for specific fusers. 
+*/ +// clang-format on +TORCH_API FusionStrategy getFusionStrategy(); +// returns previous strategy +TORCH_API FusionStrategy setFusionStrategy(FusionStrategy& fusion_strategy); + namespace detail { struct TORCH_API SlotCursor { diff --git a/torch/csrc/jit/api/module_save.cpp b/torch/csrc/jit/api/module_save.cpp index c8afa5efaf35..912c38612c35 100644 --- a/torch/csrc/jit/api/module_save.cpp +++ b/torch/csrc/jit/api/module_save.cpp @@ -16,25 +16,29 @@ void Module::save(const std::string& filename, const ExtraFilesMap& extra_files) void Module::_save_for_mobile( std::ostream& out, const ExtraFilesMap& extra_files, - bool save_mobile_debug_info) const { + bool save_mobile_debug_info, + bool use_flatbuffer) const { ExportModule( *this, out, extra_files, true /* bytecode_format */, - save_mobile_debug_info); + save_mobile_debug_info, + use_flatbuffer); } void Module::_save_for_mobile( const std::string& filename, const ExtraFilesMap& extra_files, - bool save_mobile_debug_info) const { + bool save_mobile_debug_info, + bool use_flatbuffer) const { ExportModule( *this, filename, extra_files, true /* bytecode_format */, - save_mobile_debug_info); + save_mobile_debug_info, + use_flatbuffer); } } // namespace jit diff --git a/torch/csrc/jit/backends/coreml/objc/PTMCoreMLBackend.mm b/torch/csrc/jit/backends/coreml/objc/PTMCoreMLBackend.mm index 8492e1608b21..e395326e28ca 100644 --- a/torch/csrc/jit/backends/coreml/objc/PTMCoreMLBackend.mm +++ b/torch/csrc/jit/backends/coreml/objc/PTMCoreMLBackend.mm @@ -136,7 +136,7 @@ struct API_AVAILABLE(ios(11.0), macos(10.13)) CoreMLExecutorWrapper inputs_(inputs), outputs_(outputs), config_(config) {} - c10::List execute(c10::impl::GenericList inputs) { + c10::List execute(const c10::impl::GenericList& inputs) { std::vector inputSpecs; std::vector outputSpecs; int inputSpecIndex = 0; @@ -144,7 +144,7 @@ struct API_AVAILABLE(ios(11.0), macos(10.13)) CoreMLExecutorWrapper for (int i = 0; i < inputs.size(); ++i) { auto val = inputs.get(i); if (val.isTuple()) { - auto tuples = val.toTupleRef().elements(); + auto& tuples = val.toTupleRef().elements(); for (auto& ival : tuples) { TORCH_CHECK(ival.isTensor()); auto tensor = ival.toTensor(); diff --git a/torch/csrc/jit/backends/coreml/objc/PTMCoreMLExecutor.mm b/torch/csrc/jit/backends/coreml/objc/PTMCoreMLExecutor.mm index ab79bbbd8995..fbb7abe87b52 100644 --- a/torch/csrc/jit/backends/coreml/objc/PTMCoreMLExecutor.mm +++ b/torch/csrc/jit/backends/coreml/objc/PTMCoreMLExecutor.mm @@ -7,10 +7,31 @@ #import #endif +// Observer +#import + #include #include #include +// This is a utility macro that can be used to throw an exception when a CoreML +// API function produces a NSError. The exception will contain a message with +// useful info extracted from the NSError. 
+#define COREML_THROW_IF_ERROR(error, preamble) \ + do { \ + if C10_LIKELY(error) { \ + throw c10::Error( \ + {__func__, __FILE__, static_cast(__LINE__)}, \ + c10::str( \ + preamble, \ + " Error details: ", \ + " Localized_description: ", error.localizedDescription.UTF8String, \ + " Domain: ", error.domain.UTF8String, \ + " Code: ", error.code, \ + " User Info: ", error.userInfo.description.UTF8String)); \ + } \ + } while (false) + @implementation PTMCoreMLFeatureProvider { NSUInteger _coremlVersion; std::vector _specs; @@ -68,6 +89,14 @@ @implementation PTMCoreMLExecutor { MLModel* _mlModel; NSURL* _modelPath; NSURL* _compiledModelPath; + + int32_t _model_load_id; + int32_t _inferences; + + int32_t _sample_thresh; + int32_t _sample_every; + + size_t _init_mem_limit; } + (void)setModelCacheDirectory:(NSString*)dir { @@ -110,6 +139,24 @@ - (BOOL)compileMLModel:(const std::string&)modelSpecs [self _saveModel:modelSpecs]; NSError* error = nil; _compiledModelPath = [self _compiledModelFilePath:_modelPath.path]; + + // Get observer and create an instance key + PTMCoreMLObserver* observer = coreMLObserverConfig().getCoreMLObserver(); + int32_t instance_key = std::rand(); + _model_load_id = std::rand(); + _inferences = 0; + + _init_mem_limit = 0; + + _sample_thresh = + static_cast(1.0 / 1000.0 * static_cast(RAND_MAX)); + _sample_every = 500; + + if (observer) { + _init_mem_limit = observer->getRemainingMemory(); + observer->onEnterCompileModel(instance_key, _model_load_id); + } + // Compile the model when OS version changes if ([self _shouldRecompileModel]) { if (@available(iOS 11.0, macOS 10.13, *)) { @@ -128,17 +175,24 @@ - (BOOL)compileMLModel:(const std::string&)modelSpecs } } } else { + // Always log on failure + if (observer) { + observer->onExitCompileModel(instance_key, false, true); + } TORCH_CHECK(false, "CoreML is not available on your deivce"); } } if (error) { + // Always log on failure + if (observer) { + observer->onExitCompileModel(instance_key, false, true); + } + // remove cached models if compalition failed. 
[self cleanup]; - TORCH_CHECK( - false, - "Error compiling the MLModel", - [error localizedDescription].UTF8String); + + COREML_THROW_IF_ERROR(error, "Error compiling the MLModel file!"); return NO; } if (@available(iOS 12.0, macOS 10.14, *)) { @@ -158,40 +212,72 @@ - (BOOL)compileMLModel:(const std::string&)modelSpecs _mlModel = [MLModel modelWithContentsOfURL:_compiledModelPath error:&error]; } if (error || !_mlModel) { - TORCH_CHECK( - false, - "Error loading the MLModel", - error.localizedDescription.UTF8String); + // Always log on failure + if (observer) { + observer->onExitCompileModel(instance_key, false, true); + } + + COREML_THROW_IF_ERROR(error, "Error loading the MLModel file!"); + } + + if (observer) { + bool should_log = _model_load_id < _sample_thresh; + observer->onExitCompileModel(instance_key, true, should_log); } + return YES; } - (id)forwardWithInputs: (const std::vector&)inputs { - NSError* error = nil; - PTMCoreMLFeatureProvider* inputFeature = [[PTMCoreMLFeatureProvider alloc] - initWithFeatureSpecs:inputs - CoreMLVersion:self.coreMLVersion]; - if (inputFeature == nil) { - return nil; - } - if (@available(iOS 11.0, macOS 10.13, *)) { - MLPredictionOptions* options = [[MLPredictionOptions alloc] init]; - id outputFeature = - [_mlModel predictionFromFeatures:inputFeature - options:options - error:&error]; - if (error) { - TORCH_CHECK( - false, - "Error running the prediction", - error.localizedDescription.UTF8String); + @autoreleasepool { + // Get observer and create an instance key + PTMCoreMLObserver* observer = coreMLObserverConfig().getCoreMLObserver(); + int32_t instance_key = std::rand(); + + if (observer) { + observer->onEnterExecuteModel( + instance_key, _model_load_id, _init_mem_limit, _inferences); } - return outputFeature; - } else { - TORCH_CHECK(false, "Core ML is not available on your device"); - return nil; + NSError* error = nil; + PTMCoreMLFeatureProvider* inputFeature = [[PTMCoreMLFeatureProvider alloc] + initWithFeatureSpecs:inputs + CoreMLVersion:self.coreMLVersion]; + if (inputFeature == nil) { + return nil; + } + if (@available(iOS 11.0, macOS 10.13, *)) { + MLPredictionOptions* options = [[MLPredictionOptions alloc] init]; + id outputFeature = + [_mlModel predictionFromFeatures:inputFeature + options:options + error:&error]; + + COREML_THROW_IF_ERROR(error, "Error running CoreML inference!"); + + ++_inferences; + if (observer) { + // Check if this inference session is being logged. 
+ // If so, only log every N inferences + bool should_log = _model_load_id < _sample_thresh && _inferences > 1; + if (should_log) { + should_log = _inferences % _sample_every == 0; + } + observer->onExitExecuteModel( + instance_key, _inferences, true, should_log); + } + + return outputFeature; + } else { + // Always log on failure + if (observer) { + observer->onExitExecuteModel(instance_key, _inferences, true, true); + } + + TORCH_CHECK(false, "Core ML is not available on your device"); + return nil; + } } } diff --git a/torch/csrc/jit/backends/coreml/observer/PTMCoreMLObserver.h b/torch/csrc/jit/backends/coreml/observer/PTMCoreMLObserver.h new file mode 100644 index 000000000000..57d11527ac9c --- /dev/null +++ b/torch/csrc/jit/backends/coreml/observer/PTMCoreMLObserver.h @@ -0,0 +1,47 @@ +#include + +class PTMCoreMLObserver { + public: + virtual ~PTMCoreMLObserver() = default; + + virtual size_t getRemainingMemory() { + return 0; + } + + virtual void onEnterCompileModel(const int32_t, const int32_t) {} + virtual void onExitCompileModel(const int32_t, bool, bool) {} + + virtual void onEnterExecuteModel( + const int32_t, + const int32_t, + const size_t, + const int32_t) {} + virtual void onExitExecuteModel(const int32_t, const int32_t, bool, bool) {} +}; + +class PTMCoreMLObserverConfig { + public: + PTMCoreMLObserverConfig(); + + // Do not allow copying/moving. + // There should be only one global instance of this class. + PTMCoreMLObserverConfig(const PTMCoreMLObserverConfig&) = delete; + PTMCoreMLObserverConfig& operator=(const PTMCoreMLObserverConfig&) = delete; + + PTMCoreMLObserverConfig(PTMCoreMLObserverConfig&&) = delete; + PTMCoreMLObserverConfig& operator=(PTMCoreMLObserverConfig&&) = delete; + + private: + std::unique_ptr observer_; + + public: + void setCoreMLObserver(std::unique_ptr observer) { + observer_ = std::move(observer); + } + + PTMCoreMLObserver* getCoreMLObserver() { + return observer_.get(); + } +}; + +PTMCoreMLObserverConfig& coreMLObserverConfig(); diff --git a/torch/csrc/jit/backends/coreml/observer/PTMCoreMLObserver.mm b/torch/csrc/jit/backends/coreml/observer/PTMCoreMLObserver.mm new file mode 100644 index 000000000000..372fc53622f7 --- /dev/null +++ b/torch/csrc/jit/backends/coreml/observer/PTMCoreMLObserver.mm @@ -0,0 +1,8 @@ +#import + +PTMCoreMLObserverConfig::PTMCoreMLObserverConfig() : observer_{nullptr} {} + +PTMCoreMLObserverConfig& coreMLObserverConfig() { + static PTMCoreMLObserverConfig global_instance; + return global_instance; +} diff --git a/torch/csrc/jit/backends/nnapi/nnapi_backend_lib.cpp b/torch/csrc/jit/backends/nnapi/nnapi_backend_lib.cpp index 7d9dc18c1258..ba4a2b25c23a 100644 --- a/torch/csrc/jit/backends/nnapi/nnapi_backend_lib.cpp +++ b/torch/csrc/jit/backends/nnapi/nnapi_backend_lib.cpp @@ -31,7 +31,7 @@ class NnapiBackend : public PyTorchBackendInterface { c10::impl::GenericDict compile( c10::IValue processed, c10::impl::GenericDict method_compile_spec) override { - // Wrap procesed in dictionary: {"forward": processed} + // Wrap processed in dictionary: {"forward": processed} auto dict = processed.toGenericDict(); c10::Dict handles( c10::StringType::get(), c10::AnyType::get()); @@ -64,7 +64,7 @@ class NnapiBackend : public PyTorchBackendInterface { auto inp_mem_fmts = dict.at("inp_mem_fmts").toIntList(); TORCH_CHECK(tensorInp.size() == inp_mem_fmts.size()); std::vector fixed_inputs; - for (int i = 0; i < tensorInp.size(); i++) { + for (auto i = 0U; i < tensorInp.size(); i++) { int fmt = inp_mem_fmts[i]; // These constants match the 
values in DimOrder in serializer.py // 0: NCHW, 1: NHWC @@ -84,7 +84,7 @@ class NnapiBackend : public PyTorchBackendInterface { // Adjust output memory formats auto out_mem_fmts = dict.at("out_mem_fmts").toIntList(); TORCH_CHECK(outputs.size() == out_mem_fmts.size()); - for (int i = 0; i < outputs.size(); i++) { + for (auto i = 0U; i < outputs.size(); i++) { int fmt = out_mem_fmts[i]; // These constants match the values in DimOrder in serializer.py // 0: NCHW, 1: NHWC diff --git a/torch/csrc/jit/backends/nnapi/nnapi_backend_preprocess.cpp b/torch/csrc/jit/backends/nnapi/nnapi_backend_preprocess.cpp index be0dbe18d90d..a787ecc6cbfd 100644 --- a/torch/csrc/jit/backends/nnapi/nnapi_backend_preprocess.cpp +++ b/torch/csrc/jit/backends/nnapi/nnapi_backend_preprocess.cpp @@ -96,7 +96,7 @@ c10::IValue preprocess( // transform Python lists to C++ c10::List c10::List weights( py::cast>(nnapi_processed[2])); - for (int i = 0; i < weights.size(); i++) { + for (auto i = 0U; i < weights.size(); i++) { weights.set(i, weights.get(i).contiguous()); } c10::List inp_mem_fmts( diff --git a/torch/csrc/jit/codegen/cuda/README.md b/torch/csrc/jit/codegen/cuda/README.md new file mode 100644 index 000000000000..0ea084905cc1 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/README.md @@ -0,0 +1,239 @@ +# NVFuser - A Fusion Code Generator for NVIDIA GPUs +_NVFuser is integrated as a backend for TorchScript's Profiling Graph Executor_ + +## Enabling NVFuser +_NVFuser is not currently the default fuser for NVIDIA GPUs._ + +**Fusions will only show up around the ~3rd iteration of execution; the exact number depends on the profiling executor's optimization phases.** + +### Enable by Context Manager + +``` +jit_model = torch.jit.script(model) + +with torch.jit.fuser("fuser2") : + for _ in range(5) : + outputs = jit_model(inputs) +``` + +### Enable by Specific Functions + +1. Disable CPU/GPU fusion for the native/nnc fusers +``` +torch._C._jit_override_can_fuse_on_cpu(False) +torch._C._jit_override_can_fuse_on_gpu(False) +``` +2. Disable the nnc fuser +``` +torch._C._jit_set_texpr_fuser_enabled(False) +``` +3. Enable nvfuser +``` +torch._C._jit_set_nvfuser_enabled(True) +``` + +## Simple knobs to change fusion behavior + +1. Allow single-node fusion: `torch._C._jit_set_nvfuser_single_node_mode(True)` +A fusion group is normally only created when two or more compatible ops are grouped together. Turning on single-node fusion allows the fusion pass to create a fusion group with a single node; this is very handy for testing and can be useful when a single-node generated kernel outperforms the framework's native CUDA kernels. + +2. Allow horizontal fusion: `torch._C._jit_set_nvfuser_horizontal_mode(True)` +The fusion pass fuses producers to consumers; horizontal mode additionally allows sibling nodes that share a tensor input to be fused together, which can save input memory bandwidth. + +3. Turn off the guard for fusion: `torch._C._jit_set_nvfuser_guard_mode(False)` +This disables the runtime check of fusion-group pre-assumptions (tensor meta information / constant inputs / profiled constants). It is really only meant for testing, since we want to ensure generated kernels are indeed checked, and you should avoid using it in training scripts. + +4. Turn off fusion for certain node kinds: `torch._C._jit_set_nvfuser_skip_node_kind("aten::add", True)` +This disables fusion for certain nodes, but allows other nodes to continue being fused. The first parameter is the node kind, and the second parameter is whether to toggle the node on or off in fusion. A combined usage sketch is shown after this list.
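+
+The snippet below is a minimal sketch of how the knobs above can be combined in a script. It assumes a CUDA-capable device; the toy `forward` function and the tensor shape are illustrative only, and only the functions listed in this section are used.
+
+```
+import torch
+
+def forward(x):
+    return (x + 1.0).relu()
+
+jit_model = torch.jit.script(forward)
+inputs = torch.rand(2, 32, 128, 512, device="cuda")
+
+# Allow fusion groups that contain a single node (handy for testing).
+torch._C._jit_set_nvfuser_single_node_mode(True)
+# Allow sibling nodes that share a tensor input to be fused horizontally.
+torch._C._jit_set_nvfuser_horizontal_mode(True)
+
+with torch.jit.fuser("fuser2"):
+    # Run several iterations so the profiling executor's optimizations kick in.
+    for _ in range(5):
+        outputs = jit_model(inputs)
+
+# Inspect the optimized graph and look for prim::CudaFusionGroup nodes.
+print(jit_model.graph_for(inputs))
+```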
+ +## Fusion Debugging + +Given the following script as an example: + +``` +import torch + +def forward(x): + o = x + 1.0 + o = o.relu() + return o + +shape = (2, 32, 128, 512) +input = torch.rand(*shape).cuda() +t = torch.jit.script(forward) + +with torch.jit.fuser("fuser2"): + for k in range(4): + o = t(input) +``` + +### TorchScript Based Debugging + +#### 1. TorchScript IR Graph + +##### Usage + +There are two easy ways to check fusion for a graph. The first is to print out the graph in the Python script after a few runs (so that optimization has kicked in). + +`print(t.graph_for(input))` + +The second is to turn on graph dumping in the profiling executor via the command line below: + +``` +PYTORCH_JIT_LOG_LEVEL="profiling_graph_executor_impl" python your_script.py +``` + +##### Example Output + +The graph printout is straightforward: look for `prim::CudaFusionGroup_X` for fused kernels. The profiling executor dumps many things, but the most important part is the `Optimized Graph`. In this example it shows a Fusion Group, which is an indication that fusion is happening and you should expect a fused kernel! + +``` + Optimized Graph: + graph(%x.1 : Tensor): + %12 : bool = prim::CudaFusionGuard[types=[Float(2, 32, 128, 512, strides=[2097152, 65536, 512, 1], requires_grad=0, device=cuda:0)]](%x.1) + %11 : Tensor = prim::If(%12) + block0(): + %o.8 : Tensor = prim::CudaFusionGroup_0[cache_id=0](%x.1) + -> (%o.8) + block1(): + %18 : Function = prim::Constant[name="fallback_function", fallback=1]() + %19 : (Float(2, 32, 128, 512, strides=[2097152, 65536, 512, 1], requires_grad=0, device=cuda:0)) = prim::CallFunction(%18, %x.1) + %20 : Float(2, 32, 128, 512, strides=[2097152, 65536, 512, 1], requires_grad=0, device=cuda:0) = prim::TupleUnpack(%19) + -> (%20) + return (%11) + with prim::CudaFusionGroup_0 = graph(%2 : Float(2, 32, 128, 512, strides=[2097152, 65536, 512, 1], requires_grad=0, device=cuda:0)): + %4 : int = prim::Constant[value=1]() + %3 : float = prim::Constant[value=1.]() # test.py:6:12 + %o.1 : Float(2, 32, 128, 512, strides=[2097152, 65536, 512, 1], requires_grad=0, device=cuda:0) = aten::add(%2, %3, %4) # test.py:6:8 + %o.5 : Float(2, 32, 128, 512, strides=[2097152, 65536, 512, 1], requires_grad=0, device=cuda:0) = aten::relu(%o.1) # test.py:7:8 + return (%o.5) +``` + +Note that one thing that can prevent fusion when you are running training is autodiff. The fusion pass only runs within `prim::DifferentiableGraph`, so the first thing to check is that the targeted ops are inside differentiable graph subgraphs. +The graph dump can be quite confusing to look at, since it naively dumps all graphs executed by the profiling executor, and differentiable graphs are executed via a nested graph executor. So for each graph, you might see a few segmented `Optimized Graph` sections, each of which corresponds to a differentiable node in the original graph. + +#### 2. Cuda Fusion Graphs + +##### Usage + +The CUDA fusion dump gives the input and output graphs of the fusion pass. This is a good place to check the fusion pass logic. + +``` +PYTORCH_JIT_LOG_LEVEL="graph_fuser" python your_script.py +``` + +##### Example Output + +Running the same script as above, look for two graphs in the log: `Before Fusion` shows the subgraph the fusion pass runs on; `Before Compilation` shows the graph sent to the codegen backend, where each `CudaFusionGroup` triggers the codegen runtime system to generate kernel(s) to execute the subgraph.
+ +``` + Before Fusion: + graph(%x.1 : Tensor): + %2 : float = prim::Constant[value=1.]() + %1 : int = prim::Constant[value=1]() + %3 : Tensor = prim::profile[profiled_type=Float(2, 32, 128, 512, strides=[2097152, 65536, 512, 1], requires_grad=0, device=cuda:0)](%x.1) + %o.10 : Tensor = aten::add(%3, %2, %1) # test.py:6:8 + %5 : Tensor = prim::profile[profiled_type=Float(2, 32, 128, 512, strides=[2097152, 65536, 512, 1], requires_grad=0, device=cuda:0)](%o.10) + %o.7 : Tensor = aten::relu(%5) # test.py:7:8 + %7 : Tensor = prim::profile[profiled_type=Float(2, 32, 128, 512, strides=[2097152, 65536, 512, 1], requires_grad=0, device=cuda:0)](%o.7) + %8 : Tensor = prim::profile[profiled_type=Float(2, 32, 128, 512, strides=[2097152, 65536, 512, 1], requires_grad=0, device=cuda:0)](%o.7) + return (%7, %8) + + Before Compilation: + graph(%x.1 : Tensor): + %13 : bool = prim::CudaFusionGuard[types=[Float(2, 32, 128, 512, strides=[2097152, 65536, 512, 1], requires_grad=0, device=cuda:0)]](%x.1) + %12 : Tensor = prim::If(%13) + block0(): + %o.11 : Tensor = prim::CudaFusionGroup_0(%x.1) + -> (%o.11) + block1(): + %o.7 : Tensor = prim::FallbackGraph_1(%x.1) + -> (%o.7) + return (%12, %12) + with prim::CudaFusionGroup_0 = graph(%2 : Float(2, 32, 128, 512, strides=[2097152, 65536, 512, 1], requires_grad=0, device=cuda:0)): + %4 : int = prim::Constant[value=1]() + %3 : float = prim::Constant[value=1.]() + %o.10 : Float(2, 32, 128, 512, strides=[2097152, 65536, 512, 1], requires_grad=0, device=cuda:0) = aten::add(%2, %3, %4) # test.py:6:8 + %o.7 : Float(2, 32, 128, 512, strides=[2097152, 65536, 512, 1], requires_grad=0, device=cuda:0) = aten::relu(%o.10) # test.py:7:8 + return (%o.7) + with prim::FallbackGraph_1 = graph(%x.1 : Float(2, 32, 128, 512, strides=[2097152, 65536, 512, 1], requires_grad=0, device=cuda:0)): + %1 : int = prim::Constant[value=1]() + %2 : float = prim::Constant[value=1.]() + %o.10 : Float(2, 32, 128, 512, strides=[2097152, 65536, 512, 1], requires_grad=0, device=cuda:0) = aten::add(%x.1, %2, %1) # test.py:6:8 + %o.7 : Float(2, 32, 128, 512, strides=[2097152, 65536, 512, 1], requires_grad=0, device=cuda:0) = aten::relu(%o.10) # test.py:7:8 + return (%o.7) +``` + +### General ideas of debugging no-fusion + +Currently there are a few consumers that use nvfuser by lowering computations to TorchScript and executing them through a ProfilingExecutor. + +Without going into too much detail about how the integration is done, here are a few notes on debugging no-fusion with the ProfilingExecutor: + +1. Run the TorchScript module multiple times (5 could be a lucky number) to enable fusion. + Because the ProfilingExecutor takes the first (few) runs for profiling, later optimization (including the fusion pass that enables nvfuser) relies on profiling information, so your initial runs are not going to trigger fused kernels. + Note that the number of profiling runs depends on your model. + +2. Fused kernels show up in TorchScript IR as `prim::CudaFusionGroup`. You can look at your optimized TorchScript graph to see if fusion is happening: `jit_model.graph_for(*inputs)`. + +3. If your scripted model has inputs requiring gradients, fusion only happens for graphs inside `prim::DifferentiableGraph`. + There are many reasons why your graph might not be autodiff-able. Take a look at `/torch/csrc/jit/runtime/symbolic_scripts.cpp`, which lists all autodiff-able ops (note that this is a different list from autograd-supported ops).
There's also a threshold where tiny autodiff graphs are inlined/reverted; this can be disabled via `torch._C._debug_set_autodiff_subgraph_inlining(False)`. + +### General ideas of debugging nvfuser malfunctions + +Assuming things on the ProfilingExecutor side worked out properly, that is, you see a region that is supposed to be fused but did not end up in a fused kernel, here are some ways to dig deeper: + +1. Dump the fusion pass result: + `PYTORCH_JIT_LOG_LEVEL=graph_fuser python your_script.py &> log` + + Look for the graphs dumped with `Before Fusion` & `Before Compilation`, which show the portion of the graph the fusion pass runs on and the result of fusion (`CudaFusionGroup`). + +2. Check which ops are not fused and roughly why: + `PYTORCH_JIT_LOG_LEVEL=">partition:graph_fuser" python your_script.py &> log` + + Enabling GRAPH_UPDATE from partition.cpp dumps a log when a given node is rejected by fusion. + +3. Disable the FALLBACK path: + If you see a warning that a FALLBACK path has been taken while executing your model with nvfuser enabled, it indicates that either codegen or the fusion pass has failed unexpectedly. This is likely to cause a regression in model performance, even though the result is still functionally correct. We recommend disabling the FALLBACK path so the error is reported properly and you can open an informative issue. + + `PYTORCH_NVFUSER_DISABLE=fallback python your_script.py &> log` + +4. Pinpoint the kernel/fusion pattern that's causing the error: + With a larger model that includes multiple fusion patterns, it can be tricky to figure out which exact fusion is causing the FALLBACK and to build a minimal Python repro. + One quick thing to try is to run the example with a few knobs turned on: + + ``` + PYTORCH_NVFUSER_DISABLE=fallback \ + PYTORCH_JIT_LOG_LEVEL=">partition:graph_fuser:>>kernel_cache" \ + python your_script.py &> log + ``` + + This logs all TorchScript IR parsed to codegen IR as well as the kernels generated and executed by nvfuser. Since the fallback path is disabled, the last log entry is likely to indicate the failing fusion. + + Hint: look for the last `Before Compilation:`, which indicates a parsing failure, or `running GraphCache: xxxxx`, which indicates a jit compilation/execution failure (also search for the GraphCache address, which should have dumped a TorchScript IR earlier). + +### Query nvfuser codegen kernels + +There are a few debug dumps that can be turned on via environment variables. Look for `PYTORCH_NVFUSER_DUMP` inside `[pytorch_source_path]/torch/csrc/jit/codegen/cuda/utils.cpp`. A few useful ones are: +1. `dump_eff_bandwidth`: prints the effective bandwidth of each generated kernel. This naively measures the kernel time divided by the I/O buffer size, and is a good, simple metric of performance for bandwidth-bound kernels +2. `cuda_kernel`: prints the generated CUDA kernels +3. `launch_param`: prints the launch configuration of generated kernels +4. `print_args`: prints the input and output tensors of executed codegen kernels + +### FAQs + +1. There's a regression after turning on nvfuser. + +The first thing is to check that you have fused kernels running properly. Try running your model with the fallback disabled, via `export PYTORCH_NVFUSER_DISABLE=fallback`, to see if you hit any errors that caused the fallback.
+ +If turning on NVFuser produces unexpected outputs, set the `PYTORCH_NVFUSER_DISABLE` environment variable to disable some of the optional features, e.g.: +- `fma`: disable using FMA instructions +- `index_hoist`: disable the optimization that hoists common index expressions +- `predicate_elimination`: disable the optimization that eliminates redundant predicates +- `unroll_with_rng`: disable unrolling when RNG is used + +For example, `export PYTORCH_NVFUSER_DISABLE=fma,index_hoist` would disable FMA and index hoisting. + +2. I didn't see any speedup with nvfuser. + +Check whether there is fusion in your scripted model. Run your script with `PYTORCH_JIT_LOG_LEVEL="graph_fuser"`; you should see some log dump of the before/after graphs for the fusion pass. If nothing shows up in the log, something in TorchScript is not right and the fusion pass is not executed. Check [General ideas of debugging no-fusion] for more details. diff --git a/torch/csrc/jit/codegen/cuda/arith.cpp b/torch/csrc/jit/codegen/cuda/arith.cpp index 2c9925cf8933..d8a9fc9751b9 100644 --- a/torch/csrc/jit/codegen/cuda/arith.cpp +++ b/torch/csrc/jit/codegen/cuda/arith.cpp @@ -1,8 +1,11 @@ #include +#include #include +#include #include #include +#include #include #include #include @@ -23,14 +26,18 @@ Val* newScalar(ValType vtype, DataType dtype) { case (ValType::Scalar): switch (dtype) { case DataType::Bool: - return new Bool(); + return IrBuilder::create(); case DataType::Double: case DataType::Float: case DataType::Half: case DataType::BFloat16: - return new Double(); + return IrBuilder::create(); + case DataType::Int32: case DataType::Int: - return new Int(); + return IrBuilder::create(); + case DataType::ComplexFloat: + case DataType::ComplexDouble: + return IrBuilder::create(); default: break; } @@ -103,10 +110,10 @@ TensorView* newOutputTV(const std::vector& vals, DataType dtype) { } for (const auto dim_i : c10::irange(out_domain.size())) { if (extent_vals[dim_i] != nullptr) { - out_domain[dim_i] = new IterDomain( - new Int(start_offsets[dim_i]), + out_domain[dim_i] = IrBuilder::create( + IrBuilder::create(start_offsets[dim_i]), extent_vals[dim_i], - new Int(stop_offsets[dim_i]), + IrBuilder::create(stop_offsets[dim_i]), ParallelType::Serial, iter_types[dim_i]); } else { @@ -121,13 +128,17 @@ TensorView* newOutputTV(const std::vector& vals, DataType dtype) { break; } } - out_domain[dim_i] = - new IterDomain(new Int(0), new Int(1), ParallelType::Serial, itype); + out_domain[dim_i] = IrBuilder::create( + FusionGuard::getCurFusion()->zeroVal(), + FusionGuard::getCurFusion()->oneVal(), + ParallelType::Serial, + itype); } } - return new TensorView( - new TensorDomain(out_domain, std::vector(out_domain.size(), true)), + return IrBuilder::create( + IrBuilder::create( + out_domain, std::vector(out_domain.size(), true)), dtype); } @@ -177,11 +188,84 @@ Val* newValLike(Val* val, DataType dtype) { return newScalar(vtype, dtype); } +// returns the minimum init value for reduction: +// -inf for floating type; +// lowest value for integer type; +// false for bool.
+Val* getMinimumValue(DataType v) { + switch (v) { + case (DataType::Double): + return IrBuilder::create( + -std::numeric_limits::infinity()); + break; + case (DataType::Float): + return IrBuilder::create(-std::numeric_limits::infinity()); + break; + case (DataType::Half): + return IrBuilder::create( + static_cast(-std::numeric_limits::infinity())); + break; + case DataType::BFloat16: + return IrBuilder::create( + static_cast(-std::numeric_limits::infinity())); + break; + case (DataType::Int): + return IrBuilder::create(std::numeric_limits::lowest()); + break; + case (DataType::Int32): + return IrBuilder::create(std::numeric_limits::lowest()); + break; + case (DataType::Bool): + return IrBuilder::create(false); + break; + default: + TORCH_CHECK( + false, "Could not generate a min op for tensor with type: ", v); + } + return nullptr; +} + +// returns the maximum init value for reduction: +// inf for floating type; +// highest value for integer type; +// true for bool. +Val* getMaximumValue(DataType v) { + switch (v) { + case (DataType::Double): + return IrBuilder::create(std::numeric_limits::infinity()); + break; + case (DataType::Float): + return IrBuilder::create(std::numeric_limits::infinity()); + break; + case (DataType::Half): + return IrBuilder::create( + static_cast(std::numeric_limits::infinity())); + break; + case DataType::BFloat16: + return IrBuilder::create( + static_cast(std::numeric_limits::infinity())); + break; + case (DataType::Int): + return IrBuilder::create(std::numeric_limits::max()); + break; + case (DataType::Int32): + return IrBuilder::create(std::numeric_limits::max()); + break; + case (DataType::Bool): + return IrBuilder::create(true); + break; + default: + TORCH_CHECK( + false, "Could not generate a max op for tensor with type: ", v); + } + return nullptr; +} + } // namespace Val* castOp(DataType dtype, Val* v1) { if (v1->getDataType().value() == dtype) { - return v1; + return set(v1); } if (cast_func_str(std::make_pair(v1->getDataType().value(), dtype)) == @@ -195,7 +279,7 @@ Val* castOp(DataType dtype, Val* v1) { } Val* out = newValLike(v1, dtype); - new UnaryOp(UnaryOpType::Cast, out, v1); + IrBuilder::create(UnaryOpType::Cast, out, v1); return out; } @@ -203,6 +287,24 @@ TensorView* castOp(DataType dtype, TensorView* v1) { return castOp(dtype, v1->as())->as(); } +Val* bitCastOp(DataType dtype, Val* v1) { + if (v1->getDataType().value() == dtype) { + return v1; + } + + TORCH_CHECK( + dataTypeSize(v1->getDataType().value()) == dataTypeSize(dtype), + "BitCast only works for types of the same size"); + + Val* out = newValLike(v1, dtype); + IrBuilder::create(UnaryOpType::BitCast, out, v1); + return out; +} + +TensorView* bitCastOp(DataType dtype, TensorView* v1) { + return bitCastOp(dtype, v1->as())->as(); +} + Val* unaryOp(UnaryOpType type, Val* v1) { TORCH_INTERNAL_ASSERT( type != UnaryOpType::Address, @@ -219,7 +321,7 @@ Val* unaryOp(UnaryOpType type, Val* v1) { // } Val* out = newValLike(v1, v1->getDataType().value()); - new UnaryOp(type, out, v1); + IrBuilder::create(type, out, v1); return out; } @@ -227,17 +329,27 @@ TensorView* unaryOp(UnaryOpType type, TensorView* v1) { return unaryOp(type, v1->as())->as(); } +Val* unaryIsOp(UnaryOpType type, Val* v) { + Val* out = newValLike(v, DataType::Bool); + IrBuilder::create(type, out, v); + return out; +} + +TensorView* unaryIsOp(UnaryOpType type, TensorView* v) { + return unaryOp(type, v->asVal())->as(); +} + Val* unaryOp(UnaryOpType type, Val* v1, const TypePromotionConfig& config) { - auto casted_v1 = 
promoteValues(config, {v1}).front(); - return unaryOp(type, casted_v1); + auto cast_v1 = promoteValues(config, {v1}).front(); + return unaryOp(type, cast_v1); } TensorView* unaryOp( UnaryOpType type, TensorView* v1, const TypePromotionConfig& config) { - auto casted_v1 = promoteValues(config, {v1}).front(); - return unaryOp(type, casted_v1)->as(); + auto cast_v1 = promoteValues(config, {v1}).front(); + return unaryOp(type, cast_v1)->as(); } // UNARY OPERATIONS @@ -252,12 +364,9 @@ TensorView* unaryOp( NVFUSER_DEFINE_UNARY_OP(set, Set) NVFUSER_DEFINE_UNARY_OP(randlike, RandLike) -NVFUSER_DEFINE_UNARY_OP(abs, Abs) -NVFUSER_DEFINE_UNARY_OP(notOp, Not) NVFUSER_DEFINE_UNARY_OP(ceil, Ceil) NVFUSER_DEFINE_UNARY_OP(floor, Floor) NVFUSER_DEFINE_UNARY_OP(frac, Frac) -NVFUSER_DEFINE_UNARY_OP(gelu, Gelu) NVFUSER_DEFINE_UNARY_OP(neg, Neg) NVFUSER_DEFINE_UNARY_OP(relu, Relu) NVFUSER_DEFINE_UNARY_OP(round, Round) @@ -265,6 +374,41 @@ NVFUSER_DEFINE_UNARY_OP(silu, Silu) NVFUSER_DEFINE_UNARY_OP(trunc, Trunc) #undef NVFUSER_DEFINE_UNARY_OP +Val* bitwise_not(Val* v) { + TORCH_CHECK( + isIntegralType(v->dtype()) || v->dtype() == DataType::Bool, + "input must have integral or boolean type, but got ", + v->dtype()); + return unaryOp(UnaryOpType::Not, v); +} + +TensorView* bitwise_not(TensorView* tv) { + TORCH_CHECK( + isIntegralType(tv->dtype()) || tv->dtype() == DataType::Bool, + "input must have integral or boolean type, but got ", + tv->dtype()); + return unaryOp(UnaryOpType::Not, tv); +} + +// The output of abs(complex_tensor) are real numbers +Val* abs(Val* v) { + if (v->getDataType() == DataType::ComplexDouble) { + Val* out = newValLike(v, DataType::Double); + IrBuilder::create(UnaryOpType::Abs, out, v); + return out; + } + if (v->getDataType() == DataType::ComplexFloat) { + Val* out = newValLike(v, DataType::Float); + IrBuilder::create(UnaryOpType::Abs, out, v); + return out; + } + return unaryOp(UnaryOpType::Abs, v); +} + +TensorView* abs(TensorView* tv) { + return abs(tv->as())->as(); +} + // UNARY FLOAT CAST OPERATIONS #define NVFUSER_DEFINE_UNARY_FLOAT_OP(op_name, op_type) \ @@ -300,6 +444,22 @@ NVFUSER_DEFINE_UNARY_FLOAT_OP(tan, Tan) NVFUSER_DEFINE_UNARY_FLOAT_OP(tanh, Tanh) #undef NVFUSER_DEFINE_UNARY_FLOAT_OP +#define NVFUSER_DEFINE_UNARY_IS_OP(op_name, op_type) \ + Val* op_name(Val* v) { \ + return unaryIsOp(UnaryOpType::op_type, v); \ + } \ + TensorView* op_name(TensorView* tv) { \ + return unaryIsOp(UnaryOpType::op_type, tv); \ + } + +NVFUSER_DEFINE_UNARY_IS_OP(isfinite, IsFinite) +NVFUSER_DEFINE_UNARY_IS_OP(isinf, IsInf) +NVFUSER_DEFINE_UNARY_IS_OP(isnan, IsNan) +NVFUSER_DEFINE_UNARY_IS_OP(isneginf, IsNegInf) +NVFUSER_DEFINE_UNARY_IS_OP(isposinf, IsPosInf) +NVFUSER_DEFINE_UNARY_IS_OP(isreal, IsReal) +#undef NVFUSER_DEFINE_UNARY_IS_OP + // BINARY OPERATIONS namespace { @@ -379,7 +539,7 @@ Val* binaryOp(BinaryOpType type, Val* v1, Val* v2, DataType common_dtype) { } else { out = newScalar(out_vtype, out_dtype); } - new BinaryOp(type, out, vals[0], vals[1]); + IrBuilder::create(type, out, vals[0], vals[1]); return out; } @@ -414,9 +574,8 @@ Val* binaryOp( const TypePromotionConfig& config) { std::vector operands = {v1, v2}; auto common_dtype = computeTypes(config, operands); - auto casted_values = promoteValues(operands, common_dtype); - return binaryOp( - type, casted_values.front(), casted_values.back(), common_dtype); + auto cast_values = promoteValues(operands, common_dtype); + return binaryOp(type, cast_values.front(), cast_values.back(), common_dtype); } TensorView* binaryOp( @@ -426,11 
+585,11 @@ TensorView* binaryOp( const TypePromotionConfig& config) { std::vector operands = {v1, v2}; auto common_dtype = computeTypes(config, operands); - auto casted_values = promoteValues(operands, common_dtype); + auto cast_values = promoteValues(operands, common_dtype); return binaryOp( type, - casted_values.front()->as(), - casted_values.back(), + cast_values.front()->as(), + cast_values.back(), common_dtype); } @@ -441,11 +600,11 @@ TensorView* binaryOp( const TypePromotionConfig& config) { std::vector operands = {v1, v2}; auto common_dtype = computeTypes(config, operands); - auto casted_values = promoteValues(operands, common_dtype); + auto cast_values = promoteValues(operands, common_dtype); return binaryOp( type, - casted_values.front(), - casted_values.back()->as(), + cast_values.front(), + cast_values.back()->as(), common_dtype); } @@ -456,11 +615,11 @@ TensorView* binaryOp( const TypePromotionConfig& config) { std::vector operands = {v1, v2}; auto common_dtype = computeTypes(config, operands); - auto casted_values = promoteValues(operands, common_dtype); + auto cast_values = promoteValues(operands, common_dtype); return binaryOp( type, - casted_values.front()->as(), - casted_values.back()->as(), + cast_values.front()->as(), + cast_values.back()->as(), common_dtype); } @@ -507,20 +666,111 @@ NVFUSER_DEFINE_BINARY_FLOAT_OP(atan2, Atan2) // Integer binary ops NVFUSER_DEFINE_BINARY_CAST_OP(mod, Mod) NVFUSER_DEFINE_BINARY_CAST_OP(ceilDiv, CeilDiv) - NVFUSER_DEFINE_BINARY_CAST_OP(add, Add) NVFUSER_DEFINE_BINARY_CAST_OP(fmod, Fmod) NVFUSER_DEFINE_BINARY_CAST_OP(mul, Mul) NVFUSER_DEFINE_BINARY_CAST_OP(pow, Pow) NVFUSER_DEFINE_BINARY_CAST_OP(remainder, Remainder) NVFUSER_DEFINE_BINARY_CAST_OP(sub, Sub) -NVFUSER_DEFINE_BINARY_CAST_OP(lshift, Lshift) -NVFUSER_DEFINE_BINARY_CAST_OP(rshift, Rshift) -NVFUSER_DEFINE_BINARY_CAST_OP(andOp, And) -NVFUSER_DEFINE_BINARY_CAST_OP(orOp, Or) -NVFUSER_DEFINE_BINARY_CAST_OP(xorOp, Xor) #undef NVFUSER_DEFINE_BINARY_CAST_OP +#define NVFUSER_DEFINE_BITWISE_OP(op_name, op_type) \ + Val* op_name(Val* v1, Val* v2) { \ + TORCH_CHECK( \ + (isIntegralType(v1->dtype()) || v1->dtype() == DataType::Bool) && \ + (isIntegralType(v2->dtype()) || v2->dtype() == DataType::Bool), \ + "input must have integral or boolean type, but got ", \ + v1->dtype(), \ + " and ", \ + v2->dtype()); \ + return binaryOp( \ + BinaryOpType::op_type, v1, v2, TypePromotion::default_op_config); \ + } \ + TensorView* op_name(TensorView* v1, Val* v2) { \ + TORCH_CHECK( \ + (isIntegralType(v1->dtype()) || v1->dtype() == DataType::Bool) && \ + (isIntegralType(v2->dtype()) || v2->dtype() == DataType::Bool), \ + "input must have integral or boolean type, but got ", \ + v1->dtype(), \ + " and ", \ + v2->dtype()); \ + return binaryOp( \ + BinaryOpType::op_type, v1, v2, TypePromotion::default_op_config); \ + } \ + TensorView* op_name(Val* v1, TensorView* v2) { \ + TORCH_CHECK( \ + (isIntegralType(v1->dtype()) || v1->dtype() == DataType::Bool) && \ + (isIntegralType(v2->dtype()) || v2->dtype() == DataType::Bool), \ + "input must have integral or boolean type, but got ", \ + v1->dtype(), \ + " and ", \ + v2->dtype()); \ + return binaryOp( \ + BinaryOpType::op_type, v1, v2, TypePromotion::default_op_config); \ + } \ + TensorView* op_name(TensorView* v1, TensorView* v2) { \ + TORCH_CHECK( \ + (isIntegralType(v1->dtype()) || v1->dtype() == DataType::Bool) && \ + (isIntegralType(v2->dtype()) || v2->dtype() == DataType::Bool), \ + "input must have integral or boolean type, but got ", \ + v1->dtype(), \ + " 
and ", \ + v2->dtype()); \ + return binaryOp( \ + BinaryOpType::op_type, v1, v2, TypePromotion::default_op_config); \ + } + +NVFUSER_DEFINE_BITWISE_OP(bitwise_and, And) +NVFUSER_DEFINE_BITWISE_OP(bitwise_or, Or) +NVFUSER_DEFINE_BITWISE_OP(bitwise_xor, Xor) +#undef NVFUSER_DEFINE_BITWISE_OP + +#define NVFUSER_DEFINE_BITWISE_SHIFT_OP(op_name, op_type) \ + Val* op_name(Val* v1, Val* v2) { \ + TORCH_CHECK( \ + isIntegralType(v1->dtype()) && isIntegralType(v2->dtype()), \ + "input must have integral type, but got ", \ + v1->dtype(), \ + " and ", \ + v2->dtype()); \ + return binaryOp( \ + BinaryOpType::op_type, v1, v2, TypePromotion::default_op_config); \ + } \ + TensorView* op_name(TensorView* v1, Val* v2) { \ + TORCH_CHECK( \ + isIntegralType(v1->dtype()) && isIntegralType(v2->dtype()), \ + "input must have integral type, but got ", \ + v1->dtype(), \ + " and ", \ + v2->dtype()); \ + return binaryOp( \ + BinaryOpType::op_type, v1, v2, TypePromotion::default_op_config); \ + } \ + TensorView* op_name(Val* v1, TensorView* v2) { \ + TORCH_CHECK( \ + isIntegralType(v2->dtype()) && isIntegralType(v2->dtype()), \ + "input must have integral type, but got ", \ + v1->dtype(), \ + " and ", \ + v2->dtype()); \ + return binaryOp( \ + BinaryOpType::op_type, v1, v2, TypePromotion::default_op_config); \ + } \ + TensorView* op_name(TensorView* v1, TensorView* v2) { \ + TORCH_CHECK( \ + isIntegralType(v1->dtype()) && isIntegralType(v2->dtype()), \ + "input must have integral type, but got ", \ + v1->dtype(), \ + " and ", \ + v2->dtype()); \ + return binaryOp( \ + BinaryOpType::op_type, v1, v2, TypePromotion::default_op_config); \ + } + +NVFUSER_DEFINE_BITWISE_SHIFT_OP(bitwise_left_shift, Lshift) +NVFUSER_DEFINE_BITWISE_SHIFT_OP(bitwise_right_shift, Rshift) +#undef NVFUSER_DEFINE_BITWISE_SHIFT_OP + #define NVFUSER_DEFINE_BINARY_COMPARE_OP(op_name, op_type) \ Val* op_name(Val* v1, Val* v2) { \ return binaryOp( \ @@ -589,7 +839,7 @@ static TensorView* newForReduction( " of tensor ", tv); - new_domain.push_back(new IterDomain( + new_domain.push_back(IrBuilder::create( id->start(), id->extent(), id->stopOffset(), @@ -597,12 +847,12 @@ static TensorView* newForReduction( isReduction ? IterType::Reduction : id->getIterType())); } - TensorDomain* td = - new TensorDomain(new_domain, std::vector(new_domain.size(), true)); + TensorDomain* td = IrBuilder::create( + new_domain, std::vector(new_domain.size(), true)); data_type = data_type == DataType::Null ? 
tv->getDataType().value() : data_type; - return new TensorView(td, data_type); + return IrBuilder::create(td, data_type); } TensorView* reductionOp( @@ -610,7 +860,8 @@ TensorView* reductionOp( const std::vector& axes, Val* init, TensorView* tv, - bool keep_dim /*=false*/) { + bool keep_dim /*=false*/, + DataType dtype /* DataType::Null */) { TORCH_CHECK( init->isConstScalar(), "Cannot create a reduction operation where the initial value is not a const scalar."); @@ -641,21 +892,22 @@ TensorView* reductionOp( uint_axes.push_back((unsigned int)axis); } - TensorView* out = newForReduction(tv, uint_axes); + TensorView* out = newForReduction(tv, uint_axes, dtype); const auto out_type = out->getDataType().value(); const auto init_type = init->getDataType().value(); TORCH_CHECK( (isFloatingPointType(out_type) && isFloatingPointType(init_type)) || + (isComplexType(out_type) && isComplexType(init_type)) || (isIntegralType(out_type) && isIntegralType(init_type)) || - (out_type == DataType::Bool && init_type == DataType::Bool), + (isBooleanType(out_type) && isBooleanType(init_type)), "Types should match for reduction ops but received: ", out_type, " and ", init_type); - new ReductionOp(reduction_op_type, init, out, tv); + IrBuilder::create(reduction_op_type, init, out, tv); if (keep_dim) { - auto tv_root = TensorDomain::noReductions(tv->getRootDomain()); + auto tv_root = TensorDomain::noReductions(tv->getMaybeRFactorDomain()); std::vector is_broadcast(tv_root.size(), false); for (auto axis : uint_axes) { is_broadcast.at(axis) = true; @@ -669,45 +921,44 @@ TensorView* reductionOp( TensorView* sum( TensorView* v1, const std::vector& axes, - bool keep_dim /*=false*/) { + bool keep_dim /*=false*/, + DataType dtype /* DataType::Null */) { + if (dtype == DataType::Null) { + auto initial_v1_dtype = v1->getDataType().value(); + if (isBooleanType(initial_v1_dtype) || isIntegralType(initial_v1_dtype)) { + dtype = DataType::Int; + } + } + + // Cast input tensor to dtype before the operation is performed + if (dtype != DataType::Null) { + v1 = optionalCastStrict(dtype, v1)->as(); + } + Val* init = nullptr; - auto dtype = v1->getDataType().value(); - if (isFloatingPointType(dtype)) { - init = new Double(0.0); - } else if (isIntegralType(dtype)) { - init = new Int(0); + auto v1_dtype = v1->getDataType().value(); + if (isFloatingPointType(v1_dtype)) { + init = IrBuilder::create(0.0); + } else if (isComplexType(v1_dtype)) { + init = IrBuilder::create(c10::complex(0.0, 0.0)); + } else if (isIntegralType(v1_dtype)) { + init = FusionGuard::getCurFusion()->zeroVal(); + } else if (isBooleanType(v1_dtype)) { + init = IrBuilder::create(false); } else { TORCH_CHECK( - false, - "Could not generate a sum op for tensor with type: ", - v1->getDataType().value()); + false, "Could not generate a sum op for tensor with type: ", v1_dtype); } - return reductionOp(BinaryOpType::Add, axes, init, v1, keep_dim); + return reductionOp(BinaryOpType::Add, axes, init, v1, keep_dim, dtype); } TensorView* max( TensorView* v1, const std::vector& axes, bool keep_dim /*=false*/) { - Val* init = nullptr; - switch (v1->getDataType().value()) { - case (DataType::Double): - init = new Double(std::numeric_limits::lowest()); - break; - case (DataType::Float): - init = new Double(std::numeric_limits::lowest()); - break; - case (DataType::Int): - init = new Int(INT_MIN); - break; - default: - TORCH_CHECK( - false, - "Could not generate a max op for tensor with type: ", - v1->getDataType().value()); - } - + Val* init = 
getMinimumValue(v1->getDataType().value()); + TORCH_CHECK(init != nullptr, "Missing initial value"); return reductionOp(BinaryOpType::Max, axes, init, v1, keep_dim); } @@ -715,24 +966,8 @@ TensorView* min( TensorView* v1, const std::vector& axes, bool keep_dim /*=false*/) { - Val* init = nullptr; - switch (v1->getDataType().value()) { - case (DataType::Double): - init = new Double(DBL_MAX); - break; - case (DataType::Float): - init = new Double(FLT_MAX); - break; - case (DataType::Int): - init = new Int(INT_MAX); - break; - default: - TORCH_CHECK( - false, - "Could not generate a min op for tensor with type: ", - v1->getDataType().value()); - } - + Val* init = getMaximumValue(v1->getDataType().value()); + TORCH_CHECK(init != nullptr, "Missing initial value"); return reductionOp(BinaryOpType::Min, axes, init, v1, keep_dim); } @@ -742,9 +977,12 @@ TensorView* broadcast( auto nBCastDims = is_broadcast_dim.size(); // Validate is_broadcast_dim unsigned int n_broadcasts = 0; - for (auto ent : is_broadcast_dim) - if (ent) + for (auto ent : is_broadcast_dim) { + if (ent) { n_broadcasts++; + } + } + TORCH_CHECK( nBCastDims - n_broadcasts == TensorDomain::noReductions(inp->getMaybeRFactorDomain()).size(), @@ -767,22 +1005,28 @@ TensorView* broadcast( size_t iinp = 0, ibdim = 0; while (ibdim < is_broadcast_dim.size()) { if (is_broadcast_dim[ibdim]) { - out_domain.push_back(new IterDomain( - new Int(0), - new Int(1), + out_domain.push_back(IrBuilder::create( + FusionGuard::getCurFusion()->zeroVal(), + FusionGuard::getCurFusion()->oneVal(), ParallelType::Serial, IterType::BroadcastWithoutStride)); } else { - out_domain.push_back(inp_domain[iinp]->clone()); + out_domain.push_back(IrBuilder::create( + inp_domain[iinp]->start(), + inp_domain[iinp]->extent(), + inp_domain[iinp]->stopOffset(), + inp_domain[iinp]->getParallelType(), + inp_domain[iinp]->getIterType())); iinp++; } ibdim++; } - TensorView* out_tensor = new TensorView( - new TensorDomain(out_domain, std::vector(out_domain.size(), true)), + TensorView* out_tensor = IrBuilder::create( + IrBuilder::create( + out_domain, std::vector(out_domain.size(), true)), inp->getDataType().value()); - new BroadcastOp(out_tensor, inp, is_broadcast_dim); + IrBuilder::create(out_tensor, inp, is_broadcast_dim); return out_tensor; } @@ -799,6 +1043,10 @@ WelfordResult Welford( TORCH_CHECK(tv->nDims() > 0, "Tried to reduce a 0-dim tensor"); TORCH_CHECK(axes.size() > 0, "No reduction axis specified"); + if (init_N == nullptr) { + init_N = FusionGuard::getCurFusion()->zeroVal(); + } + // Initial values for welford op are tensors, so their dims have to match the // output dim, // i.e. 
original_dims - dims_to_be_reduced @@ -819,8 +1067,8 @@ WelfordResult Welford( init_avg_val = init_avg; init_var_val = init_var; } else { - init_avg_val = new Double(0); - init_var_val = new Double(0); + init_avg_val = IrBuilder::create(0); + init_var_val = IrBuilder::create(0); } // Check and collect reduction axes @@ -845,9 +1093,9 @@ WelfordResult Welford( // Create tensor outputs TensorView* out_avg = newForReduction(tv, uint_axes); TensorView* out_var = newForReduction(tv, uint_axes); - TensorView* out_N = newForReduction(tv, uint_axes, DataType::Int); + TensorView* out_N = newForReduction(tv, uint_axes, DataType::Index); - new WelfordOp( + IrBuilder::create( out_avg, out_var, out_N, /*out var/avg/count */ @@ -855,8 +1103,8 @@ WelfordResult Welford( init_var_val, init_N, /*init var/avg/count */ tv, - nullptr, - new Int(1)); /*in var/avg/count */ + FusionGuard::getCurFusion()->zeroVal(), + FusionGuard::getCurFusion()->oneVal()); /*in var/avg/count */ return WelfordResult(out_avg, out_var, out_N); } @@ -872,26 +1120,28 @@ WelfordResult::WelfordResult( WelfordResult WelfordResult::rFactor(const std::vector& axes) { auto o_tv = avg->definition()->as()->out()->as(); - return o_tv->rFactor(axes, avg, var_sum, n); + auto rf_tvs = o_tv->rFactor(axes, std::vector{avg, var_sum, n}); + return WelfordResult{rf_tvs.at(0), rf_tvs.at(1), rf_tvs.at(2)}; } TensorView* transpose( TensorView* inp, const std::unordered_map& old2new) { - auto inp_domain = TensorDomain::noReductions(inp->getRootDomain()); + auto inp_domain = TensorDomain::noReductions(inp->getMaybeRFactorDomain()); std::vector out_domain(inp_domain.size()); auto new2old = ir_utils::normalizeOld2New(old2new, inp_domain.size()); for (const auto i : c10::irange(out_domain.size())) { auto in_id = inp_domain[new2old[i]]; - out_domain[i] = in_id->clone(); + out_domain[i] = in_id->cloneWithoutRFactor(); } - TensorView* out_tensor = new TensorView( - new TensorDomain(out_domain, std::vector(out_domain.size(), true)), + TensorView* out_tensor = IrBuilder::create( + IrBuilder::create( + out_domain, std::vector(out_domain.size(), true)), inp->getDataType().value()); - new TransposeOp(out_tensor, inp, new2old); + IrBuilder::create(out_tensor, inp, new2old); return out_tensor; } @@ -904,7 +1154,10 @@ Val* add_alpha(Val* v1, Val* v2, Val* s) { "Alpha value should be a Scalar Valtype and not ", s->getValType().value()); - auto vals = maybeBroadcast({v1, v2, s}); + std::vector operands = {v1, v2}; + auto common_dtype = computeTypes(TypePromotion::default_op_config, operands); + auto cast_values = promoteValues({v1, v2, s}, common_dtype); + auto vals = maybeBroadcast(cast_values); Val* intrm = mul(vals[1], vals[2]); return add(vals[0], intrm); } @@ -924,7 +1177,10 @@ Val* sub_alpha(Val* v1, Val* v2, Val* s) { "Alpha value should be a Scalar Valtype and not ", s->getValType().value()); - auto vals = maybeBroadcast({v1, v2, s}); + std::vector operands = {v1, v2}; + auto common_dtype = computeTypes(TypePromotion::default_op_config, operands); + auto cast_values = promoteValues({v1, v2, s}, common_dtype); + auto vals = maybeBroadcast(cast_values); Val* intrm = mul(vals[1], vals[2]); return sub(vals[0], intrm); } @@ -938,11 +1194,29 @@ TensorView* sub_alpha(TensorView* v1, TensorView* v2, Val* v3) { return arithOpOverloads(sub_alpha, v1, v2, v3); } // lerp -TORCH_CUDA_CU_API Val* lerp(Val* start, Val* end, Val* weight) { +Val* lerp(Val* start, Val* end, Val* weight) { + auto cast_values = + promoteValues(TypePromotion::default_op_config, {start, end, weight}); 
+ start = cast_values[0]; + end = cast_values[1]; + weight = cast_values[2]; + + auto out_dtype = + promote_type(start->getDataType().value(), end->getDataType().value()); + auto out_vtype = + promote_type(start->getValType().value(), end->getValType().value()); + auto vals = maybeBroadcast({start, end, weight}); - Val* intrm1 = sub(vals[1], vals[0]); - Val* intrm2 = mul(vals[2], intrm1); - return add(vals[0], intrm2); + Val* out = nullptr; + if (out_vtype == ValType::TensorView) { + out = newOutputTV(vals, out_dtype); + } else { + out = newScalar(out_vtype, out_dtype); + } + + IrBuilder::create( + TernaryOpType::Lerp, out, vals[0], vals[1], vals[2]); + return out; } TensorView* lerp(TensorView* v1, Val* v2, Val* v3) { return arithOpOverloads(lerp, v1, v2, v3); @@ -972,7 +1246,10 @@ Val* addcmul(Val* v1, Val* v2, Val* v3, Val* s) { "Alpha value should be a Scalar Valtype and not ", s->getValType().value()); - auto vals = maybeBroadcast({v1, v2, v3, s}); + std::vector operands = {v1, v2, v3}; + auto common_dtype = computeTypes(TypePromotion::default_op_config, operands); + auto cast_values = promoteValues({v1, v2, v3, s}, common_dtype); + auto vals = maybeBroadcast(cast_values); Val* intrm1 = mul(vals[2], vals[3]); Val* intrm2 = mul(vals[1], intrm1); return add(vals[0], intrm2); @@ -1007,10 +1284,9 @@ Val* where(Val* c, Val* v1, Val* v2) { "Condition should be of DataType Bool, not ", c->getDataType().value()); - auto casted_values = - promoteValues(TypePromotion::default_op_config, {v1, v2}); - v1 = casted_values[0]; - v2 = casted_values[1]; + auto cast_values = promoteValues(TypePromotion::default_op_config, {v1, v2}); + v1 = cast_values[0]; + v2 = cast_values[1]; TORCH_CHECK(c->getDataType().value() == DataType::Bool); auto out_dtype = @@ -1024,7 +1300,8 @@ Val* where(Val* c, Val* v1, Val* v2) { } else { out = newScalar(out_vtype, out_dtype); } - new TernaryOp(TernaryOpType::Where, out, vals[0], vals[1], vals[2]); + IrBuilder::create( + TernaryOpType::Where, out, vals[0], vals[1], vals[2]); return out; } @@ -1064,7 +1341,8 @@ Val* threshold(Val* in, Val* thresh, Val* value) { value = optionalCast(in->getDataType().value(), value); Val* out = newValLike(in, in->getDataType().value()); - new TernaryOp(TernaryOpType::Threshold, out, in, thresh, value); + IrBuilder::create( + TernaryOpType::Threshold, out, in, thresh, value); return out; } @@ -1074,17 +1352,25 @@ TensorView* threshold(TensorView* in, Val* thresh, Val* value) { Val* clamp(Val* in, Val* min_val, Val* max_val) { TORCH_CHECK( - (min_val->getValType().value() == ValType::Scalar || + (min_val == nullptr || min_val->getValType().value() == ValType::Scalar || min_val->getValType().value() == ValType::NamedScalar) && - (max_val->getValType().value() == ValType::Scalar || + (max_val == nullptr || + max_val->getValType().value() == ValType::Scalar || max_val->getValType().value() == ValType::NamedScalar), "For Clamp operation: Min and Max values should be Scalars."); - min_val = optionalCast(in->getDataType().value(), min_val); - max_val = optionalCast(in->getDataType().value(), max_val); - Val* out = newValLike(in, in->getDataType().value()); + min_val = (min_val == nullptr) + ? getMinimumValue(in->getDataType().value()) + : optionalCast(in->getDataType().value(), min_val); + TORCH_CHECK(min_val != nullptr, "Missing minimum value"); - new TernaryOp(TernaryOpType::Clamp, out, in, min_val, max_val); + max_val = (max_val == nullptr) + ? 
getMaximumValue(in->getDataType().value()) + : optionalCast(in->getDataType().value(), max_val); + TORCH_CHECK(max_val != nullptr, "Missing maximum value"); + + Val* out = newValLike(in, in->getDataType().value()); + IrBuilder::create(TernaryOpType::Clamp, out, in, min_val, max_val); return out; } @@ -1095,7 +1381,7 @@ TensorView* clamp(TensorView* in, Val* min_val, Val* max_val) { // sum_to operator TensorView* sum_to(TensorView* in, const std::vector& sum_to_size) { - const auto& root = TensorDomain::noReductions(in->getRootDomain()); + const auto& root = TensorDomain::noReductions(in->getMaybeRFactorDomain()); TORCH_CHECK( root.size() >= sum_to_size.size(), @@ -1141,7 +1427,7 @@ TensorView* sum_to(TensorView* in, const std::vector& sum_to_size) { } TensorView* sum_to(TensorView* in, const std::vector& sum_to_size) { - const auto& root = TensorDomain::noReductions(in->getRootDomain()); + const auto& root = TensorDomain::noReductions(in->getMaybeRFactorDomain()); TORCH_CHECK( root.size() >= sum_to_size.size(), @@ -1186,125 +1472,157 @@ TensorView* sum_to(TensorView* in, const std::vector& sum_to_size) { } TensorView* shift(TensorView* inp, const std::vector& offsets, bool pad) { + // When pad is false, no padding is given. When it is true, padding + // sizes are set so that output domains have the same extents as + // input domains. + std::vector pad_width(offsets.size(), 0); + if (pad) { + for (const auto i : c10::irange(offsets.size())) { + pad_width[i] = std::abs(offsets[i]); + } + } + return shift(inp, offsets, pad_width); +} + +TensorView* shift( + TensorView* inp, + const std::vector& offsets, + const std::vector& pad_width_param) { + auto inp_dom = TensorDomain::noReductions(inp->getRootDomain()); + const auto ndims = inp_dom.size(); + + auto pad_width = pad_width_param; + // Default padding is set so that the extent is kept unchanged + if (pad_width.empty()) { + pad_width = offsets; + for (auto& p : pad_width) { + p = std::abs(p); + } + } + TORCH_CHECK( - TensorDomain::noReductions(inp->getRootDomain()).size() == offsets.size(), + ndims == offsets.size(), "Invalid shift offsets, number of entries in offsets expected to be ", - TensorDomain::noReductions(inp->getRootDomain()).size(), + ndims, " but received ", offsets.size()); + TORCH_CHECK( + ndims == pad_width.size(), + "Invalid padding width list, number of entries in pad_width expected to be ", + ndims, + " but received ", + pad_width.size()); + + std::for_each(pad_width.begin(), pad_width.end(), [](const auto& pad) { + TORCH_CHECK(pad >= 0, "Padding width must be >= 0: ", pad); + }); + TensorView* out = nullptr; - if (pad) { - out = newValLike(inp, inp->getDataType().value())->as(); - } else { - auto inp_dom = TensorDomain::noReductions(inp->getRootDomain()); - const auto ndims = inp_dom.size(); - std::vector out_dom; - for (const auto i : c10::irange(ndims)) { - const auto inp_axis = inp_dom[i]; - const auto offset = offsets[i]; - if (offset == 0) { - out_dom.push_back(inp_axis->clone()); - continue; - } + std::vector out_dom; + for (const auto i : c10::irange(ndims)) { + const auto inp_axis = inp_dom[i]; + const auto offset = offsets[i]; + const auto pad = pad_width[i]; - Int* current_start_offset = dynamic_cast(inp_axis->start()); - TORCH_INTERNAL_ASSERT( - current_start_offset != nullptr && current_start_offset->isConst(), - "Invalid IterDomain start value:", - current_start_offset); + if (offset == 0) { + out_dom.push_back(inp_axis->cloneWithoutRFactor()); + continue; + } - Int* current_stop_offset = 
dynamic_cast(inp_axis->stopOffset()); - TORCH_INTERNAL_ASSERT( - current_stop_offset != nullptr && current_stop_offset->isConst(), - "Invalid IterDomain stop offset value:", - current_stop_offset); - - const auto cur_start_offset_value = current_start_offset->value().value(); - const auto cur_stop_offset_value = current_stop_offset->value().value(); - - Val* out_start_offset = nullptr; - Val* out_stop_offset = nullptr; - - if (offset > 0) { - // shift to right; extent remains the same, start and stop - // positions are moved right - out_start_offset = new Int(cur_start_offset_value + offset); - out_stop_offset = - new Int(std::max(cur_stop_offset_value - offset, int64_t(0))); - } else { - // shift to left; extent remains the same, start and stop - // positions are moved left - out_start_offset = - new Int(std::max(cur_start_offset_value + offset, int64_t(0))); - out_stop_offset = new Int(cur_stop_offset_value - offset); - } + Int* current_start_offset = dynamic_cast(inp_axis->start()); + TORCH_INTERNAL_ASSERT( + current_start_offset != nullptr && current_start_offset->isConst(), + "Invalid IterDomain start value:", + current_start_offset); - out_dom.push_back(new IterDomain( - out_start_offset, - inp_axis->extent(), - out_stop_offset, - ParallelType::Serial, - inp_axis->getIterType())); + Int* current_stop_offset = dynamic_cast(inp_axis->stopOffset()); + TORCH_INTERNAL_ASSERT( + current_stop_offset != nullptr && current_stop_offset->isConst(), + "Invalid IterDomain stop offset value:", + current_stop_offset); + + const auto cur_start_offset_value = current_start_offset->value().value(); + const auto cur_stop_offset_value = current_stop_offset->value().value(); + + int64_t out_start_offset = 0; + int64_t out_stop_offset = 0; + + if (offset > 0) { + // shift to right; extent remains the same, start and stop + // positions are moved right + out_start_offset = cur_start_offset_value + offset - pad; + out_stop_offset = std::max(cur_stop_offset_value - offset, int64_t(0)); + // If pad > offset, the extent of the output ID could be larger than the + // input, and the start offset of the output domain could become + // negative, which is not supported. + TORCH_CHECK( + out_start_offset >= 0, + "Invalid shift offset and padding. Padding must not be larger than the absolute extent of shift offset. Padding: ", + pad, + ". Shift: ", + offset, + "."); + } else { + // shift to left; extent remains the same, start and stop + // positions are moved left + out_start_offset = std::max(cur_start_offset_value + offset, int64_t(0)); + out_stop_offset = cur_stop_offset_value - offset - pad; + // Similar to the above case whwere offset is positive, if pad > + // -offset (note offset is negative), the extent of the output + // ID could be larger than the input, and the stop offset of the + // output domain could become negative. + TORCH_CHECK( + out_stop_offset >= 0, + "Invalid shift offset and padding. Padding must not be larger than the absolute extent of shift offset. Padding: ", + pad, + ". 
Shift: ", + offset, + "."); } - out = new TensorView( - new TensorDomain(out_dom, std::vector(out_dom.size(), true)), - inp->getDataType().value()); + out_dom.push_back(IrBuilder::create( + IrBuilder::create(out_start_offset), + inp_axis->extent(), + IrBuilder::create(out_stop_offset), + ParallelType::Serial, + inp_axis->getIterType())); } - new ShiftOp(out, inp, offsets, pad); - return out; -} - -namespace { -std::vector convertToIntVector(const std::vector& x) { - std::vector converted; - std::transform(x.begin(), x.end(), std::back_inserter(converted), [](int x) { - return new Int(x); - }); - return converted; -} -} // namespace + out = IrBuilder::create( + IrBuilder::create( + out_dom, std::vector(out_dom.size(), true)), + inp->getDataType().value()); -TensorView* gather( - TensorView* inp, - const std::vector& window_shape, - const std::vector>& pad_width, - const std::vector& strides) { - std::vector window_shape_int = convertToIntVector(window_shape); - std::vector> pad_width_int; - std::transform( - pad_width.begin(), - pad_width.end(), - std::back_inserter(pad_width_int), - [](const std::vector& x) { return convertToIntVector(x); }); - return gather(inp, window_shape_int, pad_width_int, strides); + IrBuilder::create(out, inp, offsets, pad_width); + return out; } namespace { -// Return a new TensorDomain with given root domains. Apply strides if -// necessary. With non-unit strides, strided domains become an rfactor -// domain. +// Return a new TensorDomain with given root domains. Apply +// strides if necessary. With non-unit strides, strided domains become an +// rfactor domain. TensorDomain* generateTensorDomainWithStrides( const std::vector& root_domains, - const std::vector& strides) { + const std::vector& strides, + bool skip_unit_stride) { std::vector strided_domains; // If strides are just unit strides, don't apply striding - if (strides.empty() || std::all_of(strides.begin(), strides.end(), [](int s) { - return s == 1; - })) { - return new TensorDomain( + if (strides.empty() || + (skip_unit_stride && + std::all_of( + strides.begin(), strides.end(), [](int s) { return s == 1; }))) { + return IrBuilder::create( root_domains, std::vector(root_domains.size(), true)); } for (const auto i : c10::irange(root_domains.size())) { auto root_dom = root_domains.at(i); - if (i >= strides.size() || strides[i] == 1) { + if (i >= strides.size() || (skip_unit_stride && strides[i] == 1)) { strided_domains.push_back(root_dom); continue; } @@ -1317,7 +1635,7 @@ TensorDomain* generateTensorDomainWithStrides( auto contig_vector_size = strided_domains.size(); - auto strided_td = new TensorDomain( + auto strided_td = IrBuilder::create( root_domains, strided_domains, strided_domains, @@ -1330,10 +1648,11 @@ TensorDomain* generateTensorDomainWithStrides( TensorView* gather( TensorView* inp, - const std::vector& window_shape, - const std::vector>& pad_width, - const std::vector& strides) { - auto inp_dom = TensorDomain::noReductions(inp->getRootDomain()); + const std::vector& window_shape, + const std::vector>& pad_width, + const std::vector& strides, + bool trim_out_of_bounds) { + auto inp_dom = TensorDomain::noReductions(inp->getMaybeRFactorDomain()); const auto ndims = inp_dom.size(); TORCH_CHECK( @@ -1343,6 +1662,10 @@ TensorView* gather( " but received ", window_shape.size()); + std::for_each(window_shape.begin(), window_shape.end(), [](const auto& w) { + TORCH_CHECK(w > 0, "Window size must be > 0: ", w); + }); + TORCH_CHECK( ndims == pad_width.size(), "Invalid pad width: number of entries 
expected to be ", @@ -1354,6 +1677,10 @@ TensorView* gather( TORCH_CHECK( p.size() == 2, "Each entry of pad_width must have two non-negative integers."); + std::for_each(p.begin(), p.end(), [](const auto& p_left_or_right) { + TORCH_CHECK( + p_left_or_right >= 0, "Padding must be >= 0: ", p_left_or_right); + }); }); TORCH_CHECK( @@ -1363,6 +1690,10 @@ TensorView* gather( " but received ", strides.size()); + std::for_each(strides.begin(), strides.end(), [](const auto& s) { + TORCH_CHECK(s > 0, "Stride must be > 0: ", s); + }); + std::vector out_root_domains; std::vector out_gather_dom; @@ -1371,43 +1702,225 @@ TensorView* gather( const auto window_dim = window_shape[i]; const auto pad_left = pad_width[i][0]; const auto pad_right = pad_width[i][1]; + // This may be over-conservative TORCH_INTERNAL_ASSERT(inp_axis->start()->isZeroInt()); - Val* out_axis_dim = nullptr; - if (window_dim->isConst() && pad_left->isConst() && pad_right->isConst()) { - const int64_t extent_adjustment = - -(-window_dim->value().value() + 1 + pad_left->value().value() + - pad_right->value().value()); - out_axis_dim = extent_adjustment == 0 - ? inp_axis->extent() - : sub(inp_axis->extent(), new Int(extent_adjustment)); - } else { - out_axis_dim = - add(add(sub(inp_axis->extent(), window_dim), new Int(1)), - add(pad_left, pad_right)); - } - // TODO: out_axis_dim is assumed to be the same as the extent of - // the input domain. Throw an error if it isn't the case. - out_root_domains.push_back(new IterDomain( - new Int(0), - out_axis_dim, + const auto inp_stop_offset = inp_axis->stopOffset()->getInt(); + TORCH_INTERNAL_ASSERT( + inp_stop_offset.has_value(), + "Dynamic stop offset not supported: ", + inp_axis); + const auto extent_adjustment = window_dim - 1 - pad_left - pad_right; + TORCH_CHECK( + extent_adjustment >= 0, + "Invalid gather window and padding as output extent would be larger than input.", + " Window: ", + window_dim, + ". Padding left: ", + pad_left, + ". Padding right: ", + pad_right); + const auto out_stop_offset = inp_stop_offset.value() + extent_adjustment; + out_root_domains.push_back(IrBuilder::create( + FusionGuard::getCurFusion()->zeroVal(), + inp_axis->extent(), + IrBuilder::create(out_stop_offset), ParallelType::Serial, inp_axis->getIterType())); // create a new axis for the gathered domain - out_gather_dom.push_back(new IterDomain( - new Int(0), window_dim, ParallelType::Serial, IterType::Gather)); + out_gather_dom.push_back(IrBuilder::create( + FusionGuard::getCurFusion()->zeroVal(), + IrBuilder::create(window_dim), + ParallelType::Serial, + IterType::Gather)); } out_root_domains.insert( out_root_domains.end(), out_gather_dom.begin(), out_gather_dom.end()); - auto out_td = generateTensorDomainWithStrides(out_root_domains, strides); + TensorDomain* out_td = nullptr; - auto out_tv = new TensorView(out_td, inp->getDataType().value()); + if (trim_out_of_bounds) { + // If no stride vector is given, just use stride 1. It does not do + // any striding effect, but out-of-bounds values are trimmed. + auto s = strides.empty() ? 
std::vector(ndims, 1) : strides; + out_td = generateTensorDomainWithStrides(out_root_domains, strides, false); + } else { + out_td = generateTensorDomainWithStrides(out_root_domains, strides, true); + } + + auto out_tv = + IrBuilder::create(out_td, inp->getDataType().value()); - new GatherOp(out_tv, inp, window_shape, pad_width); + IrBuilder::create(out_tv, inp, window_shape, pad_width); return out_tv; } +TORCH_CUDA_CU_API TensorView* viewAsScalar(TensorView* inp) { + auto inp_type = inp->getDataType().value(); + TORCH_CHECK( + isVectorType(inp_type), + "Invalid type to viewAsScalar. A vector type is expected but ", + inp_type, + " is given."); + int vec_size = getVectorSizeFromType(inp_type); + auto out_type = getTypeFromVectorType(inp_type); + + std::vector out_domain; + auto inp_domain = TensorDomain::noReductions(inp->getMaybeRFactorDomain()); + out_domain.reserve(inp_domain.size()); + for (auto d : inp_domain) { + out_domain.push_back(d->cloneWithoutRFactor()); + } + + IterDomain* id = IrBuilder::create( + inp_domain[0]->container(), + inp_domain[0]->container()->zeroVal(), + IrBuilder::create(vec_size), + ParallelType::Serial, + IterType::VectorComponent, + false); + out_domain.push_back(id); + + auto out = IrBuilder::create( + inp->container(), + IrBuilder::create( + out_domain, std::vector(out_domain.size(), true)), + out_type); + + IrBuilder::create(inp->container(), out, inp, id); + + return out; +} + +namespace { + +//! Create new output for mma +static TensorView* newForMma( + TensorView* tv_a, + TensorView* tv_b, + const std::vector& axes, + DataType data_type = DataType::Float) { + auto orig_domain_a = + TensorDomain::noReductions(tv_a->getMaybeRFactorDomain()); + auto orig_domain_b = + TensorDomain::noReductions(tv_b->getMaybeRFactorDomain()); + + TORCH_INTERNAL_ASSERT( + orig_domain_a.size() == orig_domain_b.size(), + "MMA op: need matching dim input"); + + std::set axes_set(axes.begin(), axes.end()); + std::vector new_domain; + + TORCH_INTERNAL_ASSERT( + !axes_set.empty(), + "Asked for ouput of reduction, but no reduction axis provided."); + + TORCH_INTERNAL_ASSERT( + (*(axes_set.rbegin())) < orig_domain_a.size(), + "Error setting up reduction, reduction axis (", + *(axes_set.rbegin()), + ") is outside nDims (", + orig_domain_a.size(), + "). Keep in mind reductions are relative to root domains, not modified views."); + + auto axis_iter = axes_set.begin(); + for (const auto dim : c10::irange(orig_domain_a.size())) { + bool isReduction = false; + if (axis_iter != axes_set.end() && *axis_iter == dim) { + isReduction = true; + axis_iter++; + } + + const IterDomain* id = orig_domain_a[dim]->isBroadcast() + ? orig_domain_b[dim] + : orig_domain_a[dim]; + + TORCH_CHECK( + !(isReduction && id->isBroadcast() && !id->isImplicitBroadcast()), + "Cannot reduce an axis that is marked as broadcasted as it has an undetermined size. Tried to reduce ID = ", + id, + " of tensor ", + tv_a, + "and", + tv_b); + + new_domain.push_back(IrBuilder::create( + id->start(), + id->extent(), + id->stopOffset(), + ParallelType::Serial, + isReduction ? 
IterType::Reduction : id->getIterType())); + } + + TensorDomain* td = IrBuilder::create( + new_domain, std::vector(new_domain.size(), true)); + + return IrBuilder::create(td, data_type); +} + +} // namespace + +TensorView* fusedMultiplySum( + TensorView* tv_a, + TensorView* tv_b, + const std::vector& axes, + Val* init) { + if (init == nullptr) { + init = IrBuilder::create(0); + } + + // TODO: + // We will want to support initialize and rfactor with + // mma as well, for maybe fusing bias in prolog. + // TODO: check init type if given a tv, + // not supported currently though. + TORCH_CHECK( + init->isConstScalar(), + "Cannot create a reduction operation where the initial value is not a const scalar."); + + // TODO: + // Validate axis relationships between a and b + TORCH_CHECK(tv_a->nDims() > 0, "Tried to reduce a 0-dim tensor"); + + // TODO: + // Add tf32 and other mma data types + // Add fallback path for non-mma data types. + TORCH_CHECK(tv_a->getDataType().value() == DataType::Half); + TORCH_CHECK(tv_b->getDataType().value() == DataType::Half); + + TORCH_CHECK(axes.size() > 0, "No reduction axis specified"); + + // TODO: + // will lift this in a follow up when we have a + // more generic axes matching. + TORCH_CHECK( + axes.size() == 1, "Single axis reduction only for mma op instantiation.") + + std::vector uint_axes; + const int ndims = tv_a->domain()->noReductions().size(); + for (int axis : axes) { + if (axis < 0) { + axis += ndims; + } + + TORCH_CHECK( + axis >= 0 && axis < ndims, + "Reduction on invalid axis, recieved: ", + axis, + " however tensor view only has ", + ndims, + " non-reduction dims."); + + uint_axes.push_back((unsigned int)axis); + } + + TensorView* out = newForMma(tv_a, tv_b, uint_axes); + IrBuilder::create(out, tv_a, tv_b, init); + + return out; +} + } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/arith.h b/torch/csrc/jit/codegen/cuda/arith.h index 5652d68eab8e..53efba8f7301 100644 --- a/torch/csrc/jit/codegen/cuda/arith.h +++ b/torch/csrc/jit/codegen/cuda/arith.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -24,10 +24,14 @@ namespace cuda { TORCH_CUDA_CU_API Val* castOp(DataType dtype, Val* v1); TORCH_CUDA_CU_API TensorView* castOp(DataType dtype, TensorView* v1); +TORCH_CUDA_CU_API Val* bitCastOp(DataType dtype, Val* v1); +TORCH_CUDA_CU_API TensorView* bitCastOp(DataType dtype, TensorView* v1); + // Perform unary op type and return the output TORCH_CUDA_CU_API Val* unaryOp(UnaryOpType type, Val* v1); TORCH_CUDA_CU_API TensorView* unaryOp(UnaryOpType type, TensorView* v1); - +TORCH_CUDA_CU_API Val* unaryIsOp(UnaryOpType type, Val* v1); +TORCH_CUDA_CU_API TensorView* unaryIsOp(UnaryOpType type, TensorView* v1); TORCH_CUDA_CU_API Val* unaryOp( UnaryOpType type, Val* v1, @@ -88,7 +92,8 @@ TORCH_CUDA_CU_API TensorView* reductionOp( const std::vector& axes, Val* init, TensorView* v1, - bool keep_dim = false); + bool keep_dim = false, + DataType dtype = DataType::Null); //! Auxiliary Struct holding result of //! a single welford op in ternsorview @@ -114,7 +119,9 @@ TORCH_CUDA_CU_API WelfordResult Welford( const std::vector& axes, TensorView* init_avg = nullptr, TensorView* init_var = nullptr, - Int* init_N = new Int(0)); + // Initializes to 0 in function definition, doing this so we don't have to + // import IrBuilder just for this one interface. 
+ Int* init_N = nullptr); // UNARY OPERATIONS // abs @@ -159,9 +166,6 @@ TORCH_CUDA_CU_API TensorView* floor(TensorView*); // frac TORCH_CUDA_CU_API Val* frac(Val*); TORCH_CUDA_CU_API TensorView* frac(TensorView*); -// gelu -TORCH_CUDA_CU_API Val* gelu(Val*); -TORCH_CUDA_CU_API TensorView* gelu(TensorView*); // silu TORCH_CUDA_CU_API Val* silu(Val*); TORCH_CUDA_CU_API TensorView* silu(TensorView*); @@ -222,9 +226,27 @@ TORCH_CUDA_CU_API TensorView* tanh(TensorView*); // trunc TORCH_CUDA_CU_API Val* trunc(Val*); TORCH_CUDA_CU_API TensorView* trunc(TensorView*); -// not -TORCH_CUDA_CU_API Val* notOp(Val*); -TORCH_CUDA_CU_API TensorView* notOp(TensorView*); +// bitwise_not +TORCH_CUDA_CU_API Val* bitwise_not(Val*); +TORCH_CUDA_CU_API TensorView* bitwise_not(TensorView*); +// isfinite +TORCH_CUDA_CU_API Val* isfinite(Val*); +TORCH_CUDA_CU_API TensorView* isfinite(TensorView*); +// isinf +TORCH_CUDA_CU_API Val* isinf(Val*); +TORCH_CUDA_CU_API TensorView* isinf(TensorView*); +// isnan +TORCH_CUDA_CU_API Val* isnan(Val*); +TORCH_CUDA_CU_API TensorView* isnan(TensorView*); +// isneginf +TORCH_CUDA_CU_API Val* isneginf(Val*); +TORCH_CUDA_CU_API TensorView* isneginf(TensorView*); +// isposinf +TORCH_CUDA_CU_API Val* isposinf(Val*); +TORCH_CUDA_CU_API TensorView* isposinf(TensorView*); +// isreal +TORCH_CUDA_CU_API Val* isreal(Val*); +TORCH_CUDA_CU_API TensorView* isreal(TensorView*); // Broadcasts v1 based on bool vector. Size of broadcast bool vector should be // the number of dims desired in the broadcasted tensor. This vector should be @@ -298,16 +320,36 @@ TORCH_CUDA_CU_API Val* ceilDiv(Val* v1, Val* v2); TORCH_CUDA_CU_API TensorView* ceilDiv(TensorView* v1, Val* v2); TORCH_CUDA_CU_API TensorView* ceilDiv(Val* v1, TensorView* v2); TORCH_CUDA_CU_API TensorView* ceilDiv(TensorView* v1, TensorView* v2); -// lshift -TORCH_CUDA_CU_API Val* lshift(Val* v1, Val* v2); -TORCH_CUDA_CU_API TensorView* lshift(TensorView* v1, Val* v2); -TORCH_CUDA_CU_API TensorView* lshift(Val* v1, TensorView* v2); -TORCH_CUDA_CU_API TensorView* lshift(TensorView* v1, TensorView* v2); -// rshift -TORCH_CUDA_CU_API Val* rshift(Val* v1, Val* v2); -TORCH_CUDA_CU_API TensorView* rshift(TensorView* v1, Val* v2); -TORCH_CUDA_CU_API TensorView* rshift(Val* v1, TensorView* v2); -TORCH_CUDA_CU_API TensorView* rshift(TensorView* v1, TensorView* v2); +// Bitwise binary ops +// bitwise_and +TORCH_CUDA_CU_API Val* bitwise_and(Val* v1, Val* v2); +TORCH_CUDA_CU_API TensorView* bitwise_and(TensorView* v1, Val* v2); +TORCH_CUDA_CU_API TensorView* bitwise_and(Val* v1, TensorView* v2); +TORCH_CUDA_CU_API TensorView* bitwise_and(TensorView* v1, TensorView* v2); +// bitwise_left_shift +TORCH_CUDA_CU_API Val* bitwise_left_shift(Val* v1, Val* v2); +TORCH_CUDA_CU_API TensorView* bitwise_left_shift(TensorView* v1, Val* v2); +TORCH_CUDA_CU_API TensorView* bitwise_left_shift(Val* v1, TensorView* v2); +TORCH_CUDA_CU_API TensorView* bitwise_left_shift( + TensorView* v1, + TensorView* v2); +// bitwise_right_shift +TORCH_CUDA_CU_API Val* bitwise_right_shift(Val* v1, Val* v2); +TORCH_CUDA_CU_API TensorView* bitwise_right_shift(TensorView* v1, Val* v2); +TORCH_CUDA_CU_API TensorView* bitwise_right_shift(Val* v1, TensorView* v2); +TORCH_CUDA_CU_API TensorView* bitwise_right_shift( + TensorView* v1, + TensorView* v2); +// bitwise_or +TORCH_CUDA_CU_API Val* bitwise_or(Val* v1, Val* v2); +TORCH_CUDA_CU_API TensorView* bitwise_or(TensorView* v1, Val* v2); +TORCH_CUDA_CU_API TensorView* bitwise_or(Val* v1, TensorView* v2); +TORCH_CUDA_CU_API TensorView* 
bitwise_or(TensorView* v1, TensorView* v2); +// bitwise_xor +TORCH_CUDA_CU_API Val* bitwise_xor(Val* v1, Val* v2); +TORCH_CUDA_CU_API TensorView* bitwise_xor(TensorView* v1, Val* v2); +TORCH_CUDA_CU_API TensorView* bitwise_xor(Val* v1, TensorView* v2); +TORCH_CUDA_CU_API TensorView* bitwise_xor(TensorView* v1, TensorView* v2); // Logical binary ops // eq TORCH_CUDA_CU_API Val* eq(Val* v1, Val* v2); @@ -340,27 +382,12 @@ TORCH_CUDA_CU_API TensorView* ne(TensorView* v1, Val* v2); TORCH_CUDA_CU_API TensorView* ne(Val* v1, TensorView* v2); TORCH_CUDA_CU_API TensorView* ne(TensorView* v1, TensorView* v2); -// andOp -TORCH_CUDA_CU_API Val* andOp(Val* v1, Val* v2); -TORCH_CUDA_CU_API TensorView* andOp(TensorView* v1, Val* v2); -TORCH_CUDA_CU_API TensorView* andOp(Val* v1, TensorView* v2); -TORCH_CUDA_CU_API TensorView* andOp(TensorView* v1, TensorView* v2); -// orOp -TORCH_CUDA_CU_API Val* orOp(Val* v1, Val* v2); -TORCH_CUDA_CU_API TensorView* orOp(TensorView* v1, Val* v2); -TORCH_CUDA_CU_API TensorView* orOp(Val* v1, TensorView* v2); -TORCH_CUDA_CU_API TensorView* orOp(TensorView* v1, TensorView* v2); -// xorOp -TORCH_CUDA_CU_API Val* xorOp(Val* v1, Val* v2); -TORCH_CUDA_CU_API TensorView* xorOp(TensorView* v1, Val* v2); -TORCH_CUDA_CU_API TensorView* xorOp(Val* v1, TensorView* v2); -TORCH_CUDA_CU_API TensorView* xorOp(TensorView* v1, TensorView* v2); - // REDUCTION OPERATIONS TORCH_CUDA_CU_API TensorView* sum( TensorView* v1, const std::vector& reduction_axes, - bool keep_dim = false); + bool keep_dim = false, + DataType dtype = DataType::Null); TORCH_CUDA_CU_API TensorView* max( TensorView* v1, @@ -484,19 +511,27 @@ TORCH_CUDA_CU_API TensorView* sum_to( //! t1[i, j] = 0, otherwise //! //! The pad option controls how out-of-boundary accesses are -//! handled. When pad is true, shifting works as if the source tensor -//! is padded by zero. Otherwise, it does not modify the output tensor -//! region whose source coordinates are out-of-boundry. In both cases, -//! the size of output tensor does not change. However, when pad is -//! false, the start or stop value of the shifted axis is adjusted -//! accordingly. For example, when a shift offset is one, the axis start -//! value would be incremented by one. +//! handled. It specifies how many zeros are logically padded. If no +//! pad option is given, it automatically pads the input tensor so +//! that the output tensor has the same extent for each axis. //! -//! \param pad If true, out-of-boundary access returns zero. +//! When a padding value is smaller than the absolute value of a shift +//! offset, the output axis still has the same extent but its start or +//! stop offset is moved inward to signify those outside of the offset +//! are invalid. +//! +//! It is not allowed to use padding values that are larger than shift +//! offsets, which would mean output extentes would be larger than +//! input extents +TORCH_CUDA_CU_API TensorView* shift( + TensorView* inp, + const std::vector& offsets, + const std::vector& pad_width = {}); + TORCH_CUDA_CU_API TensorView* shift( TensorView* inp, const std::vector& offsets, - bool pad = true); + bool pad); //! Gather a window of nearby elements for each element. //! @@ -508,8 +543,13 @@ TORCH_CUDA_CU_API TensorView* shift( //! implemented with strided split, whose outer output domain becomes //! the root domain for subsequent consumers. The inner output domain //! becomes a Stride domain, which is ignored by subsequent consumers. +//! Only valid input ranges are fed into strided splits. //! -//! Example: +//! 
When trim_out_of_bounds is true, the values at the first and last +//! ends that are outside of the start and stop offsets are +//! effetively trimmed by partial split by 1. +//! +//! Example 1: //! t0: 2D tensor of [N, M] //! t1 = gather(t0, {1, 3}, {{0, 0}, {1, 1}}); //! @@ -517,23 +557,61 @@ TORCH_CUDA_CU_API TensorView* shift( //! t1: [N, M, 1, 3] //! t1[i, j, k, l] = The value at the window position of [k, l] //! for t0[i, j] +//! +//! Example 2.1 (without trimming): +//! t0: 2D tensor of [N, M] +//! t1 = gather(t0, {2, 2}, {{0, 0}, {0, 0}}); +//! +//! then: +//! t1: [N (stop offset: 1), M (stop offset: 1, 2, 2)] +//! +//! Example 2.1 (with trimming) +//! t0: 2D tensor of [N, M] +//! t1 = gather(t0, {2, 2}, {{0, 0}, {0, 0}}, true); +//! +//! then: +//! t1: [ceilDiv(N - 1, 1), ceilDiv(M - 1, 1), 2, 2] +//! +//! Example 3: +//! t0: 2D tensor of [N, M] +//! t1 = gather(t0, {3, 3}, {{0, 0}, {0, 0}}, {3, 3}); +//! +//! then: +//! t1: [ceilDiv(N - 2, 3), ceilDiv(M - 2, 3), 2, 2] +//! TORCH_CUDA_CU_API TensorView* gather( TensorView* inp, const std::vector& window_shape, const std::vector>& pad_width, - const std::vector& strides = {}); - -//! Gather a window of nearby elements for each element. + const std::vector& strides = {}, + bool trim_out_of_bounds = false); + +// Append a new IterDomain to the end of a TenorView to allow +// iterating on a vector type. The input tensor must have +// vector dtype. +TORCH_CUDA_CU_API TensorView* viewAsScalar(TensorView* inp); + +//! A fused pointwise multiply and sum +//! operator that instantiates the following +//! fused pattern: +//! c = mul(tv_a, tv_b); +//! return sum(c, axes) //! -//! Same as the another gather interface but with Int* parameters. +//! \param tv_a first multiply operand +//! \param tv_b second multiply operand +//! \param axes axes to sum over +//! \param init sum initial value //! -//! TODO: Remove this interface as we do not intend to support dynamic -//! window shapes at this moment. -TORCH_CUDA_CU_API TensorView* gather( - TensorView* inp, - const std::vector& window_shape, - const std::vector>& pad_width, - const std::vector& strides = {}); +//! Note & TODO: +//! currently only support lowering to a mma op +//! through this interface and only support fp16 inputs. +//! will support converting back to multiply and reduce in +//! a follow up. +TORCH_CUDA_CU_API TensorView* fusedMultiplySum( + TensorView* tv_a, + TensorView* tv_b, + const std::vector& axes, + Val* init = nullptr); } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/codegen.cpp b/torch/csrc/jit/codegen/cuda/codegen.cpp index 709c810efe3e..ef223bae6d5b 100644 --- a/torch/csrc/jit/codegen/cuda/codegen.cpp +++ b/torch/csrc/jit/codegen/cuda/codegen.cpp @@ -3,6 +3,8 @@ #include #include #include +#include +#include #include #include @@ -19,7 +21,106 @@ namespace codegen { namespace { -class CudaKernelGenerator : private kir::IrVisitor { +std::string ptrType(DataType dt) { + std::stringstream ss; + ss << dt << "*"; + return ss.str(); +} + +std::string refType(DataType dt) { + std::stringstream ss; + ss << dt << "&"; + return ss.str(); +} + +//! Utility class to build an argument list +class ArgumentBuilder { + public: + //! Build an argument list where each argument is separated with a comma + ArgumentBuilder() = default; + + //! 
Build an argument list where each argument has its own line + ArgumentBuilder(int indent_level, const char* tab) { + std::stringstream ss; + for (const auto i : c10::irange(indent_level)) { + (void)i; // Suppress unused variable warning + ss << tab; + } + sep_ = ",\n" + ss.str(); + } + + //! Add a new argument + template + ArgumentBuilder& arg(const T& x) { + addSeparator(); + return append(x); + } + + //! Append to the last argument + template + ArgumentBuilder& append(const T& arg) { + ss_ << arg; + return *this; + } + + //! Get a string of the argument list + std::string str() const { + return ss_.str(); + } + + friend std::ostream& operator<<(std::ostream& os, const ArgumentBuilder& ab) { + return os << ab.str(); + } + + private: + void addSeparator() { + if (ss_.tellp() != 0) { + ss_ << sep_; + } + } + + private: + std::string sep_ = ", "; + std::stringstream ss_; +}; + +//! Append to the last argument +template <> +ArgumentBuilder& ArgumentBuilder::append(const bool& arg) { + ss_ << (arg ? "true" : "false"); + return *this; +} + +//! Returns "template_name" +template +std::string genTemplate( + const TemplateNameT& template_name, + const TemplateArgT& template_arg) { + std::stringstream ss; + ss << template_name << "<" << template_arg << ">"; + return ss.str(); +} + +//! Returns "func_name(func_arg)" +template +std::string genCall(const FuncNameT& func_name, const FuncArgT& func_arg) { + std::stringstream ss; + ss << func_name << "(" << func_arg << ")"; + return ss.str(); +} + +//! Returns "func_name(func_arg)" +template +std::string genCall( + const FuncNameT& func_name, + const TemplateArgT& template_arg, + const FuncArgT& func_arg) { + std::stringstream ss; + ss << func_name << "<" << template_arg << ">(" << func_arg << ")"; + return ss.str(); +} + +class CudaKernelGenerator : private OptOutConstDispatch { static constexpr const char* kTab = " "; public: @@ -45,48 +146,70 @@ class CudaKernelGenerator : private kir::IrVisitor { code_ << "__global__ void " << kernel_name << "("; - std::vector params; + std::unordered_set unique_args; + + std::vector params; // Inputs & Outputs for (auto val : kernel_->inputs()) { params.push_back(val); } for (auto val : kernel_->outputs()) { + TORCH_INTERNAL_ASSERT( + !val->isScalar(), "No scalar output is allowed: ", val->toString()); params.push_back(val); } // Generate parameter declarations - for (kir::Val* val : params) { - if (const auto tv = dynamic_cast(val)) { - code_ << "Tensor<" << val->dtype() << ", " - << TensorDomain::noReductions( - tv->fuserTv()->getMaybeRFactorDomain()) - .size() - << "> " << varName(tv); + unsigned int duplicate_counter = 0; + for (auto i : c10::irange(params.size())) { + std::stringstream var_name_ss; + if (params[i]->isA()) { + var_name_ss << varName(params[i]->as()); + } else { + var_name_ss << gen(params[i]); + } + + // If value is duplicate in arguments change the name to avoid name + // conflicts in args. 
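A minimal, self-contained sketch of the duplicate-argument renaming used here, assuming the same "_duplicate_<n>" suffix scheme as above but operating on plain strings instead of kir::Val* parameters (the helper name renameDuplicates is illustrative only):

#include <string>
#include <unordered_set>
#include <vector>

// Rename repeated parameter names so the generated kernel signature never
// declares two parameters with the same identifier, mirroring the
// unique_args.emplace(...) check in the generator.
std::vector<std::string> renameDuplicates(const std::vector<std::string>& params) {
  std::unordered_set<std::string> seen;
  std::vector<std::string> renamed;
  unsigned int duplicate_counter = 0;
  for (const auto& name : params) {
    // emplace(...).second is false when the name was already inserted,
    // i.e. the same value is passed to the kernel more than once.
    if (seen.emplace(name).second) {
      renamed.push_back(name);
    } else {
      renamed.push_back(name + "_duplicate_" + std::to_string(duplicate_counter++));
    }
  }
  return renamed;
}

// renameDuplicates({"T0", "T1", "T0"}) yields {"T0", "T1", "T0_duplicate_0"}.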
+ if (!unique_args.emplace(params[i]).second) { + var_name_ss << "_duplicate_" << duplicate_counter++; + } + + if (const auto tv = dynamic_cast(params[i])) { + if (tv->isCpuScalar()) { + code_ << " CpuScalarTensor<" << params[i]->dtype() << "> " + << var_name_ss.str(); + } else { + code_ + << "Tensor<" << params[i]->dtype() << ", " + << TensorDomain::noReductions(tv->getMaybeRFactorDomain()).size() + << "> " << var_name_ss.str(); + } } else { - TORCH_INTERNAL_ASSERT(val->isScalar()); // NOLINT (LLVM bug 48525) - TORCH_INTERNAL_ASSERT(val->definition() == nullptr); - code_ << val->dtype() << " " << gen(val); + TORCH_INTERNAL_ASSERT(params[i]->isScalar()); // NOLINT (LLVM bug 48525) + TORCH_INTERNAL_ASSERT(params[i]->definition() == nullptr); + code_ << params[i]->dtype() << " " << var_name_ss.str(); } - if (val != params.back()) { + if (i + 1 != params.size()) { code_ << ", "; } } // Global buffers for (auto allocate : kernel_summary.global_allocations) { - TORCH_INTERNAL_ASSERT(allocate->buffer()->isA()); - const auto tv = allocate->buffer()->as(); + TORCH_INTERNAL_ASSERT(allocate->buffer()->isA()); + const auto tv = allocate->buffer()->as(); const auto& maybe_rfactor_domain = tv->domain()->hasRFactor() - ? tv->domain()->rfactorDomain() - : tv->domain()->rootDomain(); + ? tv->domain()->getRFactorDomain() + : tv->domain()->getRootDomain(); const auto nDims = std::count_if( maybe_rfactor_domain.begin(), maybe_rfactor_domain.end(), - [](const kir::IterDomain* id) { + [](const IterDomain* id) { return !id->isReduction() && - id->iterType() != IterType::BroadcastWithoutStride; + id->getIterType() != IterType::BroadcastWithoutStride; }); code_ << ", Tensor<" << tv->dtype() << ", " << nDims << "> " << varName(tv); @@ -129,7 +252,7 @@ class CudaKernelGenerator : private kir::IrVisitor { if (has_dynamic_smem || has_reductions || has_parallel_welford) { indent() << "alignas(" #ifndef __HIP_PLATFORM_HCC__ - << dataTypeSize(kernel_summary.largest_smem_data_type) + << 16 // always align to 16B for any shared mem allocation #else << 8 // for HIP, we want 8-aligned even for smaller datatypes #endif @@ -177,7 +300,7 @@ class CudaKernelGenerator : private kir::IrVisitor { void genBody() { for (auto expr : kernel_->topLevelExprs()) { - expr->accept(this); + OptOutConstDispatch::handle(expr); } } @@ -204,139 +327,182 @@ class CudaKernelGenerator : private kir::IrVisitor { return code_; } - std::string gen(const kir::Node* node) { + std::string gen(const Statement* stmt) { std::stringstream tmp_code; std::swap(tmp_code, code_); - auto replacement = replacement_map_.find(node); - if (replacement != replacement_map_.end()) { - node = replacement->second; - } - node->accept(this); + OptOutConstDispatch::handle(stmt); std::swap(tmp_code, code_); return tmp_code.str(); } - // TODO(kir): consider automatic var naming - std::string varName(const kir::Val* val) { - std::string prefix = ""; - if (val->isA()) { - prefix = "T"; + std::string varName(const Val* val) { + std::stringstream name; + if (val->isA()) { + name << "T"; } else { - prefix = typePrefix(val->dtype()); + name << typePrefix(val->dtype()); } - - std::stringstream value_name; - if (val->name() != kInvalidStmName) { - value_name << prefix << val->name(); - } else { - value_name << "k" << prefix << val->id(); - } - return value_name.str(); + name << val->name(); + return name.str(); } - std::string genInline(const kir::Node* node) { + std::string genInline(const Statement* stmt) { const bool saved_inline = print_inline_; print_inline_ = true; - auto 
result = gen(node); + auto result = gen(stmt); print_inline_ = saved_inline; // NOLINTNEXTLINE(performance-no-automatic-move) return result; } - void visit(const kir::Predicate* node) final { - TORCH_INTERNAL_ASSERT(node->hasValue()); - code_ << gen(node->value()); + void handle(const kir::Predicate* pred) final { + TORCH_INTERNAL_ASSERT(pred->hasValue()); + code_ << gen(pred->value()); } - void visit(const kir::Bool* node) final { - const auto def = node->definition(); - if (print_inline_ && def != nullptr) { + void handle(const Bool* pred) final { + const auto def = pred->definition(); + const bool has_alloc = alloc_map_.find(pred) != alloc_map_.end(); + if (def != nullptr && !has_alloc) { code_ << "(" << gen(def) << ")"; - } else if (node->isConst()) { - code_ << (*node->value() ? "true" : "false"); + } else if (pred->isConst()) { + code_ << (*pred->value() ? "true" : "false"); } else { - code_ << varName(node); + code_ << varName(pred); } } - void visit(const kir::Double* node) final { - const auto def = node->definition(); - if (print_inline_ && def != nullptr) { + void handle(const Double* d) final { + const auto def = d->definition(); + const bool has_alloc = alloc_map_.find(d) != alloc_map_.end(); + if (def != nullptr && !has_alloc) { code_ << "(" << gen(def) << ")"; - } else if (node->isConst()) { - const int digits = std::numeric_limits::max_digits10; - code_ << std::setprecision(digits) << *node->value(); + } else if (d->isConst()) { + auto val = *d->value(); + // note: default inf/nan doesn't work and should be replaced with macros + // `NAN`, `POS_INFINITY` and `NEG_INFINITY` instead. + if (std::isinf(val)) { + if (val > 0) { + code_ << "POS_INFINITY"; + } else { + code_ << "NEG_INFINITY"; + } + } else if (std::isnan(val)) { + code_ << "NAN"; + } else { + const int digits = + std::numeric_limits::max_digits10; + code_ << std::setprecision(digits) << val; + } } else { - code_ << varName(node); + code_ << varName(d); } } - void visit(const kir::Int* node) final { - const auto def = node->definition(); - if (print_inline_ && def != nullptr) { + void handle(const Int* i) final { + const auto def = i->definition(); + const bool has_alloc = alloc_map_.find(i) != alloc_map_.end(); + if (def != nullptr && !has_alloc) { + code_ << "(" << genInline(def) << ")"; + } else if (i->isConst()) { + code_ << *i->value(); + } else { + code_ << varName(i); + } + } + + void handle(const ComplexDouble* c) final { + const auto def = c->definition(); + const bool has_alloc = alloc_map_.find(c) != alloc_map_.end(); + if (def != nullptr && !has_alloc) { code_ << "(" << gen(def) << ")"; - } else if (node->isConst()) { - code_ << *node->value(); + } else if (c->isConst()) { + const int digits = std::numeric_limits::max_digits10; + code_ << "std::complex" << std::setprecision(digits) + << *c->value(); } else { - code_ << varName(node); + code_ << varName(c); } } - void visit(const kir::NamedScalar* node) final { + void handle(const NamedScalar* ns) final { // dim3 components are unsigned int. 
Cast to signed integer to // support negative indexing - if (node->getParallelIndex().has_value() || - node->getParallelDim().has_value()) { - code_ << "((nvfuser_index_t)" << node->name() << ")"; + if (ns->getParallelIndex().has_value() || + ns->getParallelDim().has_value()) { + code_ << "((nvfuser_index_t)" << ns->name() << ")"; } else { - code_ << node->name(); + code_ << ns->name(); } } - void visit(const kir::TensorIndex* node) final { - code_ << varName(node->view()) << "["; - + void handle(const kir::TensorIndex* ti) final { bool first = true; - for (auto* ind : node->indices()) { + std::stringstream index; + for (auto* ind : ti->indices()) { if (!ind->isZeroInt()) { if (!first) { - code_ << " + "; + index << " + "; } - code_ << genInline(ind); + index << genInline(ind); first = false; } } if (first) { - code_ << "0"; + index << "0"; } + bool is_volatile = ti->view()->getMemoryType() == MemoryType::Global && + kernel_->summary().sync_map.needsRawSync(ti->view()).hasBID(); + if (is_volatile) { + code_ << "*(volatile " << ti->getDataType().value() << "*)&"; + } + code_ << varName(ti->view()) << "[" << index.str() << "]"; + } - code_ << "]"; + void handle(const ViewAsScalar* sv) final { + indent() << gen(sv->output(0)) << " = " << gen(sv->input(0)) << "[" + << gen(sv->index()) << "];\n"; } - void visit(const kir::IterDomain* node) final { - TORCH_INTERNAL_ASSERT(false && "Unreachable"); + void handle(const IterDomain*) final { + TORCH_INTERNAL_ASSERT(false, "Unreachable"); } - void visit(const kir::TensorDomain* node) final { - TORCH_INTERNAL_ASSERT(false && "Unreachable"); + void handle(const TensorDomain*) final { + TORCH_INTERNAL_ASSERT(false, "Unreachable"); } - void visit(const kir::TensorView* tv) final { - TORCH_INTERNAL_ASSERT(false && "Unreachable"); + void handle(const TensorView*) final { + TORCH_INTERNAL_ASSERT(false, "Unreachable"); } - void visit(const kir::UnaryOp* node) final { + void handle(const UnaryOp* uop) final { bool is_vector_op = false; size_t vector_word_size = 1; - if (vectorize_scope_ && node->out()->isA()) { - auto ti = node->out()->as(); + if (uop->out()->isA()) { + auto out_tv = uop->out()->as()->view(); + if (std::any_of( + out_tv->domain()->domain().begin(), + out_tv->domain()->domain().end(), + [&](IterDomain* id) { return id->isMma(); })) { + auto mma = dynamic_cast( + uop->out()->as()->view()->definition()); + TORCH_INTERNAL_ASSERT( + mma != nullptr, "CodeGen: mma op not in mma loop"); + genMmaInitialization(mma, uop); + return; + } + } + + if (vectorize_scope_ && uop->out()->isA()) { + auto ti = uop->out()->as(); bool vectorize_op = false; bool misaligned_op = false; - for (auto id : ti->view()->fuserTv()->domain()->domain()) { + for (auto id : ti->view()->domain()->domain()) { if (!isParallelTypeVectorize(id->getParallelType())) { continue; } @@ -358,84 +524,135 @@ class CudaKernelGenerator : private kir::IrVisitor { if (vectorize_op) { TORCH_INTERNAL_ASSERT( - node->operation() == UnaryOpType::Set, + uop->getUnaryOpType() == UnaryOpType::Set, "Cannot vectorize operations that are not sets. 
", - "Use cache_before and cache_after to store/load with vectorized reads into buffers."); + "Use cacheBefore and cacheAfter to store/load with vectorized reads into buffers."); is_vector_op = true; } if (misaligned_op) { - is_vector_op = (node->operation() == UnaryOpType::Set); + is_vector_op = (uop->getUnaryOpType() == UnaryOpType::Set); } - if (is_vector_op && !node->in()->isScalar()) { + if (is_vector_op && !uop->in()->isScalar()) { TORCH_INTERNAL_ASSERT( - node->out()->dtype() == node->in()->dtype(), + uop->out()->dtype() == uop->in()->dtype(), "Vectorized store/load requires input and output datatypes match."); } - } - if (is_vector_op) { - if (node->in()->isScalar()) { - indent() << "reinterpret_cast<" - << "Array<" << node->out()->dtype() << ", " << vector_word_size - << ">*>" - << "(&" << gen(node->out()) << ")->set(" << gen(node->in()) - << ");\n"; - } else { - indent() << "*reinterpret_cast<" - << "Array<" << node->out()->dtype() << ", " << vector_word_size - << ">*>" - << "(&" << gen(node->out()) << ")" - << " = *reinterpret_cast<" - << "Array<" << node->in()->dtype() << ", " << vector_word_size - << ">*>" - << "(&" << gen(node->in()) << ");\n"; + if (is_vector_op) { + auto out_tv = uop->out()->as()->view(); + if (uop->in()->isScalar()) { + // Note: + // Double buffered local tensors need indexed initialization, + // so will need to use `arraySet` option. + if (out_tv->getMemoryType() == MemoryType::Local && + !out_tv->isDoubleBuffered()) { + // Vectorized initialization + indent() << varName(out_tv) << ".set(" << gen(uop->in()) << ");\n"; + } else { + // Note: currently arraySet option is not vectorized, so it will + // rely on auto vectorization pass of cuda compiler. + indent() << "arraySet<" << out_tv->getDataType().value() << ", " + << vector_word_size << ">(&" << gen(uop->out()) << ", " + << "(" << out_tv->getDataType().value() << ")" + << gen(uop->in()) << ");\n"; + } + } else { + // Vectorized load + TORCH_INTERNAL_ASSERT( + uop->in()->isA(), + "Invalid input to unary op with tensor output, found: ", + uop->in()->toString()); + + auto in_tv = uop->in()->as()->view(); + bool localToGlobal = out_tv->getMemoryType() == MemoryType::Global && + in_tv->getMemoryType() == MemoryType::Local; + + bool globalToLocal = out_tv->getMemoryType() == MemoryType::Local && + in_tv->getMemoryType() == MemoryType::Global; + + bool globalToGlobal = out_tv->getMemoryType() == MemoryType::Global && + in_tv->getMemoryType() == MemoryType::Global; + + bool is_volatile_to = out_tv->getMemoryType() == MemoryType::Global && + kernel_->summary().sync_map.needsRawSync(out_tv).hasBID(); + + bool is_volatile_from = + in_tv->getMemoryType() == MemoryType::Global && + kernel_->summary().sync_map.needsRawSync(in_tv).hasBID(); + + if (localToGlobal) { + indent() << "loadLocalToGlobal<" << uop->out()->dtype() << ", " + << vector_word_size << ", " + << (is_volatile_to ? "true" : "false") << ">("; + code_ << " &" << gen(uop->out()) << ", &" << gen(uop->in()) + << ");\n"; + } else if (globalToLocal) { + indent() << "loadGlobalToLocal<" << uop->out()->dtype() << ", " + << vector_word_size << ", " + << (is_volatile_from ? "true" : "false") << ">(&" + << gen(uop->out()) << ", "; + code_ << " &" << gen(uop->in()) << ");\n"; + } else if (globalToGlobal) { + indent() << "loadGlobalToGlobal<" << uop->out()->dtype() << ", " + << vector_word_size << ", " + << (is_volatile_to ? "true" : "false") << ", " + << (is_volatile_from ? 
"true" : "false") << ">("; + code_ << " &" << gen(uop->out()) << ", "; + code_ << " &" << gen(uop->in()) << ");\n"; + } else { + indent() << "loadGeneric<" << uop->out()->dtype() << ", " + << vector_word_size << ">("; + code_ << " &" << gen(uop->out()) << ", "; + code_ << " &" << gen(uop->in()) << ");\n"; + } + } + return; } - return; } - if (node->out()->isA()) { - const auto op_type = node->operation(); + if (uop->out()->isA()) { + const auto op_type = uop->getUnaryOpType(); if (auto op = inline_op_str(op_type)) { - indent() << gen(node->out()) << " = " << *op << genInline(node->in()) + indent() << gen(uop->out()) << " = " << *op << genInline(uop->in()) << ";\n"; } return; } if (!print_inline_) { - indent() << gen(node->out()); - if (!node->out()->isScalar() && !node->in()->isScalar()) { + indent() << gen(uop->out()); + if (!uop->out()->isScalar() && !uop->in()->isScalar()) { code_ << "\n"; indent() << kTab; } code_ << " = "; } - const auto op_type = node->operation(); + const auto op_type = uop->getUnaryOpType(); if (auto op = inline_op_str(op_type)) { if (alsoBooleanOperator(op_type) && - node->out()->dtype() == DataType::Bool) { - code_ << stringifyBooleanOp(op_type) << gen(node->in()); + uop->out()->dtype() == DataType::Bool) { + code_ << stringifyBooleanOp(op_type) << gen(uop->in()); } else { - code_ << *op << gen(node->in()); + code_ << *op << gen(uop->in()); } } else { if (op_type == UnaryOpType::Cast) { const auto cast_str = - cast_func_str({node->in()->dtype(), node->out()->dtype()}); + cast_func_str({uop->in()->dtype(), uop->out()->dtype()}); TORCH_INTERNAL_ASSERT( cast_str.has_value(), "Invalid cast. Input type: ", - node->in()->dtype(), + uop->in()->dtype(), ", output type: ", - node->out()->dtype()); + uop->out()->dtype()); code_ << cast_str.value(); } else { code_ << op_type; if (needFloatSuffix(op_type) && - node->out()->dtype() == DataType::Float) { + uop->out()->dtype() == DataType::Float) { code_ << "f"; } } @@ -444,7 +661,7 @@ class CudaKernelGenerator : private kir::IrVisitor { if (op_type == UnaryOpType::RandLike) { code_ << "rnd"; } else { - code_ << gen(node->in()); + code_ << gen(uop->in()); } code_ << ")"; } @@ -456,25 +673,28 @@ class CudaKernelGenerator : private kir::IrVisitor { std::string genBinaryOp( BinaryOpType op_type, - kir::Val* out, + DataType data_type, const std::string& lhs, const std::string& rhs) { std::stringstream expr; if (auto op = inline_op_str(op_type)) { expr << lhs << " "; - if (alsoBooleanOperator(op_type) && out->dtype() == DataType::Bool) { + if (alsoBooleanOperator(op_type) && data_type == DataType::Bool) { expr << stringifyBooleanOp(op_type); } else { expr << *op; } expr << " " << rhs; } else { - if (integer_op_str(op_type) && isIntegralType(out->dtype())) { + if (integer_op_str(op_type) && isIntegralType(data_type)) { auto int_op = integer_op_str(op_type); expr << *int_op; + } else if (bool_op_str(op_type) && isBooleanType(data_type)) { + auto bool_op = bool_op_str(op_type); + expr << *bool_op; } else { expr << op_type; - if (needFloatSuffix(op_type) && out->dtype() == DataType::Float) { + if (needFloatSuffix(op_type) && data_type == DataType::Float) { expr << "f"; } } @@ -485,7 +705,7 @@ class CudaKernelGenerator : private kir::IrVisitor { // If one argument is a tensorview and the other is a scalar, make sure we // cast the scalar to the tensorview type - std::string scalarCast(kir::Val* lhs, kir::Val* rhs) { + std::string scalarCast(Val* lhs, Val* rhs) { // If neither are scalars return if (!((lhs->isScalar() || rhs->isScalar()) 
&& (lhs->isA() || rhs->isA()))) { @@ -520,18 +740,18 @@ class CudaKernelGenerator : private kir::IrVisitor { } // If possible, replace pow with mul. Return true when successful. - bool genPowerWithMul(const kir::BinaryOp* node) { - if (node->operation() != BinaryOpType::Pow) { + bool genPowerWithMul(const BinaryOp* bop) { + if (bop->getBinaryOpType() != BinaryOpType::Pow) { return false; } - auto rhs = node->rhs(); + auto rhs = bop->rhs(); c10::optional exponent; - if (auto val_int = dynamic_cast(rhs)) { + if (auto val_int = dynamic_cast(rhs)) { if (val_int->isConst()) { exponent = val_int->value().value(); } - } else if (auto val_float = dynamic_cast(rhs)) { + } else if (auto val_float = dynamic_cast(rhs)) { if (val_float->isConst()) { auto fp_exp = val_float->value().value(); double int_exp = 0; @@ -550,7 +770,7 @@ class CudaKernelGenerator : private kir::IrVisitor { return false; } - auto lhs = gen(node->lhs()); + auto lhs = gen(bop->lhs()); if (print_inline_) { code_ << lhs << " * " << lhs; @@ -558,8 +778,8 @@ class CudaKernelGenerator : private kir::IrVisitor { code_ << " * " << lhs; } } else { - indent() << gen(node->out()); - if (node->out()->isScalar()) { + indent() << gen(bop->out()); + if (bop->out()->isScalar()) { code_ << " = " << lhs << " * " << lhs; if (exponent.value() == 3) { code_ << " * " << lhs; @@ -579,24 +799,27 @@ class CudaKernelGenerator : private kir::IrVisitor { return true; } - void visit(const kir::BinaryOp* node) final { + void handle(const BinaryOp* bop) final { // Try replacing pow with mul - if (genPowerWithMul(node)) { + if (genPowerWithMul(bop)) { return; } - const auto op_type = node->operation(); + const auto op_type = bop->getBinaryOpType(); if (print_inline_) { // Inline expression: `lhs op rhs` code_ << genBinaryOp( - op_type, node->out(), gen(node->lhs()), gen(node->rhs())); + op_type, bop->out()->dtype(), gen(bop->lhs()), gen(bop->rhs())); } else { - indent() << gen(node->out()); - if (node->out()->isScalar()) { + indent() << gen(bop->out()); + if (bop->out()->isScalar()) { // Single line: `out = lhs op rhs;` code_ << " = " << genBinaryOp( - op_type, node->out(), gen(node->lhs()), gen(node->rhs())); + op_type, + bop->out()->dtype(), + gen(bop->lhs()), + gen(bop->rhs())); } else { // Split TensorView expressions across multiple lines: // @@ -605,64 +828,68 @@ class CudaKernelGenerator : private kir::IrVisitor { // op rhs; // - auto cast = scalarCast(node->lhs(), node->rhs()); + auto cast = scalarCast(bop->lhs(), bop->rhs()); if (auto op = inline_op_str(op_type)) { code_ << "\n"; - indent() << kTab << "= " << (node->lhs()->isScalar() ? cast : "") - << gen(node->lhs()) << "\n"; + indent() << kTab << "= " << (bop->lhs()->isScalar() ? cast : "") + << gen(bop->lhs()) << "\n"; indent() << kTab; if (alsoBooleanOperator(op_type) && - node->out()->dtype() == DataType::Bool) { + bop->out()->dtype() == DataType::Bool) { code_ << stringifyBooleanOp(op_type); } else { code_ << *op; } - code_ << " " << (node->rhs()->isScalar() ? cast : "") - << gen(node->rhs()); + code_ << " " << (bop->rhs()->isScalar() ? 
cast : "") + << gen(bop->rhs()); } else { - if (integer_op_str(op_type) && isIntegralType(node->out()->dtype())) { + if (integer_op_str(op_type) && isIntegralType(bop->out()->dtype())) { auto int_op = integer_op_str(op_type); code_ << " = " << *int_op << "(\n"; + } else if ( + bool_op_str(op_type) && isBooleanType(bop->out()->dtype())) { + auto bool_op = bool_op_str(op_type); + code_ << " = " << *bool_op << "(\n"; } else { std::stringstream op_str; op_str << op_type; if (needFloatSuffix(op_type) && - node->out()->dtype() == DataType::Float) { + bop->out()->dtype() == DataType::Float) { op_str << "f"; } code_ << " = " << op_str.str() << "(\n"; } - indent() << kTab << (node->lhs()->isScalar() ? cast : "") - << gen(node->lhs()) << ",\n"; - indent() << kTab << (node->rhs()->isScalar() ? cast : "") - << gen(node->rhs()) << ")"; + indent() << kTab << (bop->lhs()->isScalar() ? cast : "") + << gen(bop->lhs()) << ",\n"; + indent() << kTab << (bop->rhs()->isScalar() ? cast : "") + << gen(bop->rhs()) << ")"; } } code_ << ";\n"; } } - void visit(const kir::TernaryOp* node) final { + void handle(const TernaryOp* top) final { if (!print_inline_) { - indent() << gen(node->out()); - if (!node->out()->isScalar()) { + indent() << gen(top->out()); + if (!top->out()->isScalar()) { code_ << "\n"; indent() << kTab; } code_ << " = "; } - code_ << node->operation() << "(" << gen(node->in1()) << ", "; + code_ << top->getTernaryOpType() << "(" << gen(top->in1()) << ", "; // Make sure the two operands of where has the same // type. Note that compiling "where(0.0f, 0.0)" fails because of // the overloading ambiguity. - if (node->operation() == TernaryOpType::Where) { - auto cast = scalarCast(node->in2(), node->in3()); - code_ << (node->in2()->isScalar() ? cast : "") << gen(node->in2()) << ", " - << (node->in3()->isScalar() ? cast : "") << gen(node->in3()) << ")"; + if (top->getTernaryOpType() == TernaryOpType::Where) { + auto cast = scalarCast(top->in2(), top->in3()); + code_ << (top->in2()->isScalar() ? cast : "") << gen(top->in2()) << ", " + << (top->in3()->isScalar() ? 
cast : "") << gen(top->in3()) << ")"; } else { - code_ << gen(node->in2()) << ", " << gen(node->in3()) << ")"; + code_ << gen(top->in2()) << ", " << gen(top->in3()) << ")"; } if (!print_inline_) { @@ -670,56 +897,134 @@ class CudaKernelGenerator : private kir::IrVisitor { } } - std::string genReductionOp(BinaryOpType op_type, kir::Val* out) { + std::string genArchString(MmaOptions options) { + std::stringstream ss; + if (isVolta(options.macro)) { + ss << "Volta"; + } else if (isTuring(options.macro)) { + ss << "Turing"; + } else if (isAmpere(options.macro)) { + ss << "Ampere"; + } else { + TORCH_INTERNAL_ASSERT(false, "mma macro unknown arch"); + } + return ss.str(); + } + + std::string genMmaOp(const MmaOp* mma, bool init = false) { + std::stringstream ss; + auto options = mma->options(); + ss << genArchString(options) << "::"; + if (init) { + ss << "init"; + } + ss << toString(options.macro) << toString(options.operand_layout); + // TODO: additional parameter could be removed by swizzling iterdomain + auto acc_stride = mma->accStride(); + TORCH_INTERNAL_ASSERT(acc_stride > 0); + ss << "<" << acc_stride << ">"; + return ss.str(); + } + + void genMmaOperands(const MmaOp* mma) { + std::stringstream ss; + auto options = mma->options(); + auto in_a = mma->inA()->as()->view(); + auto dtype = in_a->getDataType().value(); + indent() << kTab << "reinterpret_cast*>(&" + << gen(mma->inA()) << "),\n"; + indent() << kTab << "reinterpret_cast*>(&" + << gen(mma->inB()) << ")"; + } + + void genMmaInitialization(const MmaOp* mma, const UnaryOp* uop) { + auto options = mma->options(); + + indent() << genMmaOp(mma, true) << "(reinterpret_castout()->getDataType().value() << "," + << getOutputRegisterSize(options.macro) << "," + << getOutputRegisterSize(options.macro) << ">*>" + << "(&" << gen(uop->out()) << "));\n"; + } + + void handle(const MmaOp* mma) final { + auto options = mma->options(); + auto out = mma->out()->as(); + indent() << genMmaOp(mma) << "(\n"; + indent() << kTab << "reinterpret_castview()->getDataType().value() << "," + << getOutputRegisterSize(options.macro) << "," + << getOutputRegisterSize(options.macro) << ">*>(&" + << gen(mma->out()) << "),\n"; + genMmaOperands(mma); + code_ << ");\n"; + } + + std::string genReductionOp(BinaryOpType op_type, DataType data_type) { std::stringstream lambda; - DataType data_type = out->dtype(); lambda << "[](" << data_type << " &a, " << data_type << " b) " - << "{ a = " << genBinaryOp(op_type, out, "a", "b") << "; }"; + << "{ a = " << genBinaryOp(op_type, data_type, "a", "b") << "; }"; return lambda.str(); } - void visit(const kir::BroadcastOp* node) final { - TORCH_INTERNAL_ASSERT(node->out()->isA()); - const auto tensor_index = node->out()->as(); - - const ParallelTypeBitmap domains = - kernel_->predicateMap().getParallelBroadcastDomains( - tensor_index->view()->fuserTv()); + void handle(const BroadcastOp* stmt) final { + TORCH_INTERNAL_ASSERT(stmt->out()->isA()); - const bool thread_x = domains.get(ParallelType::TIDx); - const bool thread_y = domains.get(ParallelType::TIDy); - const bool thread_z = domains.get(ParallelType::TIDz); - const bool block_x = domains.get(ParallelType::BIDx); - const bool block_y = domains.get(ParallelType::BIDy); - const bool block_z = domains.get(ParallelType::BIDz); + const ParallelTypeBitmap parallel_types = + kernel_->summary().broadcast_parallel_types.at(stmt); - const bool grid_broadcast_needed = block_x || block_y || block_z; - const bool block_broadcast_needed = thread_x || thread_y || thread_z; + if 
(parallel_types.none()) { + // Not parallelized + indent() << gen(stmt->out()) << "\n"; + indent() << kTab << " = " << gen(stmt->in()) << ";\n"; + return; + } TORCH_INTERNAL_ASSERT( - !grid_broadcast_needed, - "Parallel broadcast across blocks not supported"); - - if (block_broadcast_needed) { - const auto data_type = node->out()->dtype(); - indent() << "broadcast::blockBroadcast<" << (thread_x ? "true" : "false") - << ", " << (thread_y ? "true" : "false") << ", " - << (thread_z ? "true" : "false") << ">(\n"; - indent() << kTab << gen(node->out()) << ",\n"; - indent() << kTab << gen(node->in()) << ",\n"; - indent() << kTab << "static_cast<" << data_type << "*>(shared_mem),\n"; - TORCH_INTERNAL_ASSERT( - node->predicate() != nullptr && node->predicate()->hasValue()); - indent() << kTab << genInline(node->predicate()) << ");\n"; - } else { - indent() << gen(node->out()) << "\n"; - indent() << kTab << " = " << gen(node->in()) << ";\n"; + !parallel_types.hasBID(), + "Parallel broadcast across blocks should have been translated to a GridBroadcast IR node"); + + std::stringstream flags_str; + for (const ParallelType pt : kParallelTypeTIDs) { + const bool parallel_bcast = parallel_types.get(pt); + if (pt != kParallelTypeTIDs[0]) { + flags_str << ", "; + } + flags_str << (parallel_bcast ? "true" : "false"); } + + const auto data_type = stmt->out()->dtype(); + indent() << "broadcast::blockBroadcast<" << flags_str.str() << ">(\n"; + indent() << kTab << gen(stmt->out()) << ",\n"; + indent() << kTab << gen(stmt->in()) << ",\n"; + indent() << kTab << "static_cast<" << data_type << "*>(shared_mem),\n"; + TORCH_INTERNAL_ASSERT( + stmt->predicate() != nullptr && stmt->predicate()->hasValue()); + indent() << kTab << genInline(stmt->predicate()) << ");\n"; + } + + void genSerialReduction( + const kir::TensorIndex* output, + const Val* input, + BinaryOpType reduction_op_type) { + const auto gen_out = gen(output); + indent() << gen_out << " = " + << genBinaryOp( + reduction_op_type, output->dtype(), gen_out, gen(input)) + << ";\n"; + return; } - void genWarpReductionOp( - const kir::ReductionOp* node, - const IterDomain* reduction_id) { + void genWarpReduction( + const kir::TensorIndex* output, + const kir::TensorIndex* input, + const Val* init, + BinaryOpType reduction_op_type, + kir::Predicate* read_pred) { bool is_single_warp = kernel_->getWarpPaddedParallelInfo().is_tidx_single_warp; @@ -729,43 +1034,27 @@ class CudaKernelGenerator : private kir::IrVisitor { } else { code_ << "(\n"; } - indent() << kTab << gen(node->out()) << ",\n"; - indent() << kTab << gen(node->in()) << ",\n"; - indent() << kTab << genReductionOp(node->operation(), node->out()) << ",\n"; + indent() << kTab << gen(output) << ",\n"; + indent() << kTab << gen(input) << ",\n"; + indent() << kTab << genReductionOp(reduction_op_type, output->dtype()) + << ",\n"; indent() << kTab << "threadIdx,\n"; indent() << kTab << "blockDim,\n"; - indent() << kTab << "static_cast<" << node->out()->dtype() + indent() << kTab << "static_cast<" << output->dtype() << "*>(shared_mem),\n"; - TORCH_INTERNAL_ASSERT( - node->predicate() != nullptr && node->predicate()->hasValue()); - indent() << kTab << genInline(node->predicate()) << ",\n"; - indent() << kTab << node->out()->dtype() << "(" << genInline(node->init()) - << "));\n"; + TORCH_INTERNAL_ASSERT(read_pred != nullptr && read_pred->hasValue()); + indent() << kTab << genInline(read_pred) << ",\n"; + indent() << kTab << output->dtype() << "(" << genInline(init) << "));\n"; } - void visit(const 
kir::ReductionOp* node) final { - TORCH_INTERNAL_ASSERT(node->out()->isA()); - - const auto out = node->out()->as(); - const auto domain = out->view()->domain(); - - const bool has_block_reduce = domain->hasBlockReduction(); - const bool has_grid_reduce = domain->hasGridReduction(); - - if (!has_block_reduce && !has_grid_reduce) { - const auto gen_out = gen(out); - const auto op_type = node->operation(); - indent() << gen_out << " = " - << genBinaryOp(op_type, out, gen_out, gen(node->in())) << ";\n"; - return; - } - - if (auto reduction_id = ir_utils::getMaybeWarpReductionDim(node)) { - genWarpReductionOp(node, reduction_id.value()); - return; - } - - const auto par_domains = ir_utils::getParallelDomains(node->out()); + void genBlockReduction( + const kir::TensorIndex* output, + const kir::TensorIndex* input, + const Val* init, + BinaryOpType reduction_op_type, + kir::Predicate* read_pred, + kir::Predicate* write_pred) { + const auto par_domains = ir_utils::getParallelDomains(output); // Get parallel reduction domains const bool tidx = par_domains.find(ParallelType::TIDx) != par_domains.end() && @@ -777,59 +1066,80 @@ class CudaKernelGenerator : private kir::IrVisitor { par_domains.find(ParallelType::TIDz) != par_domains.end() && par_domains.at(ParallelType::TIDz)->isReduction(); - const auto data_type = node->out()->dtype(); - const auto op_type = node->operation(); + const auto data_type = output->dtype(); - if (has_block_reduce) { - if (has_grid_reduce) { - indent() << data_type << " " - << "block_result_" << block_reduce_name_ << "=" - << gen(node->init()) << ";\n"; - } - indent() << "blockReduce<" << (tidx ? "true" : "false") << ", " - << (tidy ? "true" : "false") << ", " << (tidz ? "true" : "false") - << ">(\n"; - if (has_grid_reduce) { - indent() << kTab << "block_result_" << block_reduce_name_ << ",\n"; - } else { - indent() << kTab << gen(node->out()) << ",\n"; - } - indent() << kTab << gen(node->in()) << ",\n"; - indent() << kTab << genReductionOp(op_type, node->out()) << ",\n"; - indent() << kTab << "threadIdx,\n"; - indent() << kTab << "blockDim,\n"; - indent() << kTab << "static_cast<" << data_type << "*>(shared_mem),\n"; - TORCH_INTERNAL_ASSERT( - node->predicate() != nullptr && node->predicate()->hasValue()); - auto read_pred = genInline(node->predicate()); - indent() << kTab << read_pred << ",\n"; - // Pass the write predicate if available and different from the - // default predicate. The blockReduce runtime function uses the - // default predicate for both read and write when only the - // default one is given. - if (node->writePredicate() != nullptr) { - TORCH_INTERNAL_ASSERT(node->writePredicate()->hasValue()); - auto write_pred = genInline(node->writePredicate()); - indent() << kTab << write_pred << ",\n"; - } - indent() << kTab << data_type << "(" << genInline(node->init()) - << "));\n"; + indent() << "blockReduce<" << (tidx ? "true" : "false") << ", " + << (tidy ? "true" : "false") << ", " << (tidz ? "true" : "false") + << ">(\n"; + indent() << kTab << gen(output) << ",\n"; + indent() << kTab << gen(input) << ",\n"; + indent() << kTab << genReductionOp(reduction_op_type, output->dtype()) + << ",\n"; + indent() << kTab << "threadIdx,\n"; + indent() << kTab << "blockDim,\n"; + indent() << kTab << "static_cast<" << data_type << "*>(shared_mem),\n"; + TORCH_INTERNAL_ASSERT(read_pred != nullptr && read_pred->hasValue()); + indent() << kTab << genInline(read_pred) << ",\n"; + // Pass the write predicate if available and different from the + // default predicate. 
The blockReduce runtime function uses the + // default predicate for both read and write when only the + // default one is given. + if (write_pred != nullptr) { + TORCH_INTERNAL_ASSERT(write_pred->hasValue()); + indent() << kTab << genInline(write_pred) << ",\n"; + } + indent() << kTab << data_type << "(" << genInline(init) << "));\n"; + } + + void handle(const ReductionOp* rop) final { + TORCH_INTERNAL_ASSERT(rop->out()->isA()); + + const auto output = rop->out()->as(); + const auto input = rop->in()->as(); + const auto domain = output->view()->domain(); + const auto op_type = rop->getReductionOpType(); + + const bool has_block_reduce = domain->hasBlockReduction(); + const bool has_grid_reduce = domain->hasGridReduction(); + + TORCH_INTERNAL_ASSERT( + !has_grid_reduce, + "ReductionOp does not support block parallelization. GridReductionOp must be used. ", + rop->toString()); + + if (!has_block_reduce) { + genSerialReduction(output, input, op_type); + } else if ( + auto reduction_id = ir_utils::getMaybeWarpReductionDim(output, input)) { + genWarpReduction(output, input, rop->init(), op_type, rop->predicate()); + } else { + genBlockReduction( + output, + input, + rop->init(), + op_type, + rop->predicate(), + rop->writePredicate()); } } - void visit(const kir::WelfordOp* node) final { - TORCH_INTERNAL_ASSERT(node->out()->isA()); + void handle(const WelfordOp* wop) final { + TORCH_INTERNAL_ASSERT(wop->out()->isA()); - const auto out = node->out()->as(); + const auto out = wop->out()->as(); const auto domain = out->view()->domain(); - const auto out_var = node->outVar(); - const auto out_avg = node->outAvg(); - const auto out_N = node->outN(); + const auto out_var = wop->outVar(); + const auto out_avg = wop->outAvg(); + const auto out_N = wop->outN(); + + const auto in_var = wop->inVar(); + const auto in_avg = wop->inAvg(); + const auto in_N = wop->inN(); - const auto in_var = node->inVar(); - const auto in_avg = node->inAvg(); - const auto in_N = node->inN(); + // inVar was allowed to be nullptr. Make sure it isn't. 
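The serial path emitted just below calls welfordCombine on an (avg, var, N) triplet. As a hedged sketch, assuming the standard Chan et al. combination that such a helper typically implements (the struct and function names here are illustrative, not the runtime function itself):

// Merge partial aggregate b into a, where M2 is the running sum of squared
// deviations (the "var" buffer before normalization) and N is the count.
struct WelfordTriplet {
  double avg = 0.0;
  double M2 = 0.0;
  long long N = 0;
};

void welfordCombineSketch(WelfordTriplet& a, const WelfordTriplet& b) {
  if (b.N == 0) {
    return; // nothing to merge
  }
  const long long n = a.N + b.N;
  const double delta = b.avg - a.avg;
  a.avg += delta * static_cast<double>(b.N) / static_cast<double>(n);
  a.M2 += b.M2 +
      delta * delta * static_cast<double>(a.N) * static_cast<double>(b.N) /
          static_cast<double>(n);
  a.N = n;
}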
+ TORCH_INTERNAL_ASSERT( + in_var != nullptr, "Welford var input nullptr not allowed"); const bool has_block_reduce = domain->hasBlockReduction(); const bool has_grid_reduce = domain->hasGridReduction(); @@ -838,21 +1148,17 @@ class CudaKernelGenerator : private kir::IrVisitor { if (!has_block_reduce && !has_grid_reduce) { indent() << "welfordCombine (" << "\n"; - indent() << " " << gen(out_avg) << ",\n"; - indent() << " " << gen(out_var) << ",\n"; - indent() << " " << gen(out_N) << ",\n"; - indent() << " " << gen(in_avg) << ",\n"; - if (in_var) { - indent() << " " << gen(in_var) << ",\n"; - } else { - indent() << " (" << in_avg->dtype() << ") 0" - << ",\n"; - } - indent() << " (" << out_N->dtype() << ")" << gen(in_N) << ");\n"; + indent() << kTab << gen(out_avg) << ",\n"; + indent() << kTab << gen(out_var) << ",\n"; + indent() << kTab << gen(out_N) << ",\n"; + indent() << kTab << gen(in_avg) << ",\n"; + indent() << kTab << "(" << out_avg->dtype() << ")" << gen(in_var) + << ",\n"; + indent() << kTab << "(" << out_N->dtype() << ")" << gen(in_N) << ");\n"; return; } - const auto par_domains = ir_utils::getParallelDomains(node->out()); + const auto par_domains = ir_utils::getParallelDomains(wop->out()); // Get parallel reduction domains const bool tidx = par_domains.find(ParallelType::TIDx) != par_domains.end() && @@ -864,57 +1170,52 @@ class CudaKernelGenerator : private kir::IrVisitor { par_domains.find(ParallelType::TIDz) != par_domains.end() && par_domains.at(ParallelType::TIDz)->isReduction(); - const auto data_type = node->out()->dtype(); + const auto data_type = wop->out()->dtype(); if (has_block_reduce) { if (has_grid_reduce) { // allocate block result indent() << data_type << " " << "block_result_avg_" << block_reduce_name_ << " = " - << gen(node->initAvg()) << ";\n"; + << gen(wop->initAvg()) << ";\n"; indent() << data_type << " " << "block_result_var_" << block_reduce_name_ << " = " - << gen(node->initVar()) << ";\n"; - indent() << DataType::Int << " " + << gen(wop->initVar()) << ";\n"; + indent() << out_N->dtype() << " " << "block_result_n_" << block_reduce_name_ << " = " - << gen(node->initN()) << ";\n"; + << gen(wop->initN()) << ";\n"; } indent() << "blockWelford<" << (tidx ? "true" : "false") << ", " << (tidy ? "true" : "false") << ", " << (tidz ? 
"true" : "false") << ">(\n"; if (has_grid_reduce) { - indent() << kTab << "block_result_avg_" << block_reduce_name_ << ",\n" - << kTab << "block_result_var_" << block_reduce_name_ << ",\n" - << kTab << "block_result_n_" << block_reduce_name_ << ",\n"; + indent() << kTab << "block_result_avg_" << block_reduce_name_ << ",\n"; + indent() << kTab << "block_result_var_" << block_reduce_name_ << ",\n"; + indent() << kTab << "block_result_n_" << block_reduce_name_ << ",\n"; } else { - indent() << kTab << gen(node->outAvg()) << ",\n"; - indent() << kTab << gen(node->outVar()) << ",\n"; - indent() << kTab << gen(node->outN()) << ",\n"; + indent() << kTab << gen(wop->outAvg()) << ",\n"; + indent() << kTab << gen(wop->outVar()) << ",\n"; + indent() << kTab << gen(wop->outN()) << ",\n"; } - indent() << " " << gen(in_avg) << ",\n"; - if (in_var) { - indent() << " " << gen(in_var) << ",\n"; - } else { - indent() << " (" << in_avg->dtype() << ") 0" - << ",\n"; - } - indent() << out_N->dtype() << "(" << gen(in_N) << "),\n"; + indent() << kTab << gen(in_avg) << ",\n"; + indent() << kTab << out_avg->dtype() << "(" << gen(in_var) << "),\n"; + indent() << kTab << out_N->dtype() << "(" << gen(in_N) << "),\n"; indent() << kTab << "threadIdx,\n"; indent() << kTab << "blockDim,\n"; indent() << kTab << "reinterpret_cast<" << data_type << "*>(shared_mem_avg),\n"; indent() << kTab << "reinterpret_cast<" << data_type << "*>(shared_mem_var),\n"; - indent() << kTab << "reinterpret_cast<" << DataType::Int + indent() << kTab << "reinterpret_cast<" << out_N->dtype() << "*>(shared_mem_n),\n"; - TORCH_INTERNAL_ASSERT(node->predicate() != nullptr); + TORCH_INTERNAL_ASSERT(wop->predicate() != nullptr); TORCH_INTERNAL_ASSERT( - node->predicate() != nullptr && node->predicate()->hasValue()); - auto read_pred = genInline(node->predicate()); + wop->predicate() != nullptr && wop->predicate()->hasValue()); + auto read_pred = genInline(wop->predicate()); indent() << kTab << read_pred << ",\n"; - if (node->writePredicate() != nullptr) { - TORCH_INTERNAL_ASSERT(node->writePredicate()->hasValue()); - auto write_pred = genInline(node->writePredicate()); + if (wop->writePredicate() != nullptr) { + TORCH_INTERNAL_ASSERT(wop->writePredicate()->hasValue()); + auto write_pred = genInline(wop->writePredicate()); indent() << kTab << write_pred << ",\n"; } indent() << kTab << data_type << "(0));\n"; @@ -926,8 +1227,12 @@ class CudaKernelGenerator : private kir::IrVisitor { std::string generateGridReduceTemplateFlags( const REDUCTION_OP* rop, const ParallelTypeBitmap& thread_pred) { + TORCH_INTERNAL_ASSERT( + !rop->isAllreduce(), + "This is not for the allreduce reduction kernel\n"); + const auto par_domains = ir_utils::getParallelDomains(rop->outputs()[0]); - std::stringstream flags; + ArgumentBuilder flags; for (const ParallelType pt : kParallelTypeThreads) { const bool parallel_reduction = par_domains.find(pt) != par_domains.end() && @@ -946,94 +1251,324 @@ class CudaKernelGenerator : private kir::IrVisitor { } else { flag = !pred && !parallel_reduction; } - if (pt != kParallelTypeThreads[0]) { - flags << ", "; + flags.arg(flag); + } + return flags.str(); + } + + // TODO: This should replace generateGridReduceTemplateFlags once + // GridWelford is refactored as GridReduction. 
+ template + std::string generateGridReduceTemplateFlags2( + const REDUCTION_OP* rop, + const ParallelTypeBitmap& thread_pred) { + TORCH_INTERNAL_ASSERT( + !rop->isAllreduce(), + "This is not for the allreduce reduction kernel\n"); + + const auto par_domains = + ir_utils::getParallelDomains(ir_utils::getTvOutput(rop)); + ArgumentBuilder flags; + for (const ParallelType pt : kParallelTypeThreads) { + const bool parallel_reduction = + par_domains.find(pt) != par_domains.end() && + par_domains.at(pt)->isReduction(); + const bool pred = thread_pred.get(pt); + TORCH_INTERNAL_ASSERT( + !(parallel_reduction && pred), "Cannot reduce predicated axis: ", pt); + // Currently assumed that no dimensions parallelized with blocks + // are predicated. This assumption may be lifted, but + // gridReduction would need some changes. + if (isParallelTypeBlockDim(pt)) { + TORCH_INTERNAL_ASSERT( + !pred, "Predication on block dimensions not allowed: ", pt); } - flags << (flag ? "true" : "false"); + flags.arg(parallel_reduction); } return flags.str(); } - void visit(const kir::GridReduction* node) final { - const auto rop = node->reduction_op(); - TORCH_INTERNAL_ASSERT(rop->out()->isA()); + void handle(const kir::GridReduction* grop) final { + TORCH_INTERNAL_ASSERT(grop->out()->isA()); - const auto out = rop->out()->as(); + const auto out = grop->out()->as(); const auto domain = out->view()->domain(); TORCH_INTERNAL_ASSERT(domain->hasGridReduction()); - const auto data_type = rop->out()->dtype(); - const auto op_type = rop->operation(); + const auto data_type = grop->out()->dtype(); + const auto op_type = grop->getReductionOpType(); TORCH_INTERNAL_ASSERT( - node->reduction_buffer()->buffer()->isA()); + grop->reduction_buffer()->buffer()->isA()); + TORCH_INTERNAL_ASSERT(grop->sync_buffer()->buffer()->isA()); + const auto work_buffer = + grop->reduction_buffer()->buffer()->as(); + const auto sync_buffer = grop->sync_buffer()->buffer()->as(); + + if (grop->isAllreduce()) { + generateGridAllreduce(grop); + return; + } + + const std::string flags_str = + generateGridReduceTemplateFlags2(grop, grop->threadPredicate()); + + const bool persistent_sync = + kernel_->summary().has_cooperative_grid_reduction; + + // Since block-level reduction is already done, those dimensions + // with tidx/y/z being true do not participate in the grid + // reduction. 
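The grid reduction call assembled below passes both a read predicate and a write predicate; when no separate write predicate exists the read predicate is reused, and out-of-range threads contribute the init value instead of reading. A minimal sketch of that convention, with illustrative names only (the real reduction::gridReduce runtime entry point takes a much longer argument list):

// Sketch of the read/write-predicate convention used when assembling the
// reduction::gridReduce call below. Names are illustrative assumptions.
template <typename T, typename Func>
void predicatedReduceSketch(
    T& out,
    T in,
    T init,
    bool read_pred,   // false for threads that would read out of bounds
    bool write_pred,  // false for threads that must not write the result
    Func reduction_op) {
  // Threads whose read predicate is false contribute the init value, so they
  // do not perturb the reduction; this mirrors the init argument passed below.
  T val = read_pred ? in : init;
  // (the cross-thread and cross-block combination itself happens in the runtime)
  if (write_pred) {
    reduction_op(out, val); // only in-bounds threads publish the result
  }
}
// When grop->writePredicate() is null, the generated call simply passes the
// read predicate twice, as the handler below does.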
+ ArgumentBuilder template_args; + template_args.arg(flags_str).arg(persistent_sync); + + ArgumentBuilder func_args(block_nest_level_ + 1, kTab); + func_args.arg(gen(grop->out())); + func_args.arg(gen(grop->in())); + func_args.arg(genReductionOp(op_type, out->dtype())); + func_args.arg("&").append(varName(work_buffer)).append("[0]"); + func_args.arg("&").append(varName(sync_buffer)).append("[0]"); + func_args.arg(genCall("static_cast", ptrType(data_type), "shared_mem")); + // read and write predicates TORCH_INTERNAL_ASSERT( - node->sync_buffer()->buffer()->isA()); + grop->predicate() != nullptr && grop->predicate()->hasValue()); + const auto read_pred = genInline(grop->predicate()); + func_args.arg(read_pred); + if (grop->writePredicate() != nullptr) { + TORCH_INTERNAL_ASSERT(grop->writePredicate()->hasValue()); + func_args.arg(genInline(grop->writePredicate())); + } else { + func_args.arg(read_pred); + } + // Init val + func_args.arg(genCall(data_type, genInline(grop->init()))); + func_args.arg(genInline(grop->entrance_index())); + func_args.arg(genInline(grop->entrances())); + + indent() << "reduction::gridReduce<" << template_args << ">(\n"; + indent() << kTab << func_args << ");\n"; + } + + std::string genFusedReductionName(const TensorView* reduction_out) { + return varName(reduction_out) + "_reduction"; + } + + void generateGridAllreduce(const kir::GridReduction* grop) { + TORCH_INTERNAL_ASSERT(grop->isAllreduce()); + + const auto out = grop->out()->as(); + + const auto data_type = grop->out()->dtype(); + const auto op_type = grop->getReductionOpType(); + const auto work_buffer = - node->reduction_buffer()->buffer()->as(); + grop->reduction_buffer()->buffer()->as(); + const auto sync_buffer = grop->sync_buffer()->buffer()->as(); + + const auto reduction_name = genFusedReductionName(out->view()); + + // template + // __device__ __inline__ void reduce( + // RefTuple out, + // const LocalTuple& inp, + // VolatilePtrTuple global_work_buffer, + // int64_t* global_sync_buffer, // Allocated as product of all + // // non-participating Grid dimension + // PtrTuple shared_buf, + // bool read_pred, // Prevent reading from out of bounds memory + // bool write_pred, // Prevent from writing out of bounds + // const LocalTuple& init_val, + // Func reduction_op); + + indent() << reduction_name << ".reduce(\n"; + + ArgumentBuilder func_args(block_nest_level_ + 1, kTab); + // out + func_args.arg(genCall("RefTuple", data_type, gen(grop->out()))); + // inp + func_args.arg(genCall("ConstRefTuple", data_type, gen(grop->in()))); + // global_work_buffer + func_args.arg(genCall( + "VolatilePtrTuple", data_type, "&" + varName(work_buffer) + "[0]")); + // global_sync_buffer + func_args.arg("&").append(varName(sync_buffer)).append("[0]"); + // shared_buf + func_args.arg(genCall( + "PtrTuple", + data_type, + genCall("static_cast", ptrType(data_type), "shared_mem"))); + // read and write predicates + TORCH_INTERNAL_ASSERT( + grop->predicate() != nullptr && grop->predicate()->hasValue()); + const auto read_pred = genInline(grop->predicate()); + auto write_pred = read_pred; + if (grop->writePredicate() != nullptr) { + TORCH_INTERNAL_ASSERT(grop->writePredicate()->hasValue()); + write_pred = genInline(grop->writePredicate()); + } + func_args.arg(read_pred).arg(write_pred); + // init_val + func_args.arg(genCall("LocalTuple", data_type, genInline(grop->init()))); + // reduction_op + func_args.arg(genReductionOp(op_type, out->dtype())); + + indent() << kTab << func_args << ");\n"; + } + + void handle(const 
kir::GroupedGridReduction* grouped_grop) final { + const auto out = ir_utils::getTvOutput(grouped_grop); + const auto domain = out->domain(); + TORCH_INTERNAL_ASSERT(domain->hasGridReduction()); + + TORCH_INTERNAL_ASSERT( + grouped_grop->sync_buffer()->buffer()->isA()); const auto sync_buffer = - node->sync_buffer()->buffer()->as(); + grouped_grop->sync_buffer()->buffer()->as(); - const std::string flags_str = - generateGridReduceTemplateFlags(rop, node->threadPredicate()); + TORCH_INTERNAL_ASSERT( + grouped_grop->numReductions() == 2, + "Only grouping of 2 reductions is supported. ", + grouped_grop->toString()); + + if (grouped_grop->isAllreduce()) { + generateGridAllreduce(grouped_grop); + return; + } + + const std::string flags_str = generateGridReduceTemplateFlags2( + grouped_grop, grouped_grop->threadPredicate()); const bool persistent_sync = kernel_->summary().has_cooperative_grid_reduction; // Since block-level reduction is already done, those dimensions - // with tidx/y/z being true do not participate in the grid reduction. - indent() << "reduction::gridReduce<" << flags_str << ", " - << (persistent_sync ? "true" : "false") << ">(\n"; - indent() << kTab << gen(rop->out()) << ",\n"; - if (domain->hasBlockReduction()) { - indent() << kTab << "block_result_" << block_reduce_name_ << ",\n"; - block_reduce_name_++; + // with tidx/y/z being true do not participate in the grid + // reduction. + ArgumentBuilder template_args; + template_args.arg(flags_str).arg(persistent_sync); + + ArgumentBuilder func_args(block_nest_level_ + 1, kTab); + + // Apped arguments for each reduction + for (const auto i : c10::irange(grouped_grop->numReductions())) { + TORCH_INTERNAL_ASSERT( + grouped_grop->reduction_buffers().at(i)->buffer()->isA()); + const auto work_buffer = + grouped_grop->reduction_buffers().at(i)->buffer()->as(); + + func_args.arg(gen(grouped_grop->output(i))); + func_args.arg(gen(grouped_grop->input(i))); + func_args.arg(genCall( + grouped_grop->output(i)->dtype(), + genInline(grouped_grop->initVal(i)))); + func_args.arg(genReductionOp( + grouped_grop->getReductionOpType(i), + grouped_grop->output(i)->dtype())); + func_args.arg("&").append(varName(work_buffer)).append("[0]"); + } + + // The rest of the arguments are common between the reductions + func_args.arg("&").append(varName(sync_buffer)).append("[0]"); + func_args.arg("shared_mem"); + // read and write predicates + TORCH_INTERNAL_ASSERT( + grouped_grop->predicate() != nullptr && + grouped_grop->predicate()->hasValue()); + const auto read_pred = genInline(grouped_grop->predicate()); + func_args.arg(read_pred); + if (grouped_grop->writePredicate() != nullptr) { + TORCH_INTERNAL_ASSERT(grouped_grop->writePredicate()->hasValue()); + func_args.arg(genInline(grouped_grop->writePredicate())); } else { - indent() << kTab << gen(rop->in()) << ",\n"; + func_args.arg(read_pred); } - indent() << kTab << genReductionOp(op_type, out) << ",\n"; - indent() << kTab << "&" << varName(work_buffer) << "[0],\n"; - indent() << kTab << varName(sync_buffer) << ",\n"; - indent() << kTab << "static_cast<" << data_type << "*>(shared_mem),\n"; + + indent() << "reduction::gridReduceGroup<" << template_args << ">(\n"; + indent() << kTab << func_args << ");\n"; + } + + void generateGridAllreduce(const kir::GroupedGridReduction* grouped_grop) { + TORCH_INTERNAL_ASSERT(grouped_grop->isAllreduce()); + + // First, build a list of function arguments + ArgumentBuilder func_args(block_nest_level_ + 1, kTab); + + for (const auto i : 
c10::irange(grouped_grop->numReductions())) { + const auto data_type = grouped_grop->outputs().at(i)->dtype(); + TORCH_INTERNAL_ASSERT( + grouped_grop->reduction_buffers().at(i)->buffer()->isA()); + + // out + func_args.arg( + genCall("RefTuple", data_type, gen(grouped_grop->outputs().at(i)))); + + // inp + func_args.arg(genCall( + "ConstRefTuple", data_type, gen(grouped_grop->inputs().at(i)))); + + // global_work_buffer + const auto work_buffer = + grouped_grop->reduction_buffers().at(i)->buffer()->as(); + func_args.arg(genCall( + "VolatilePtrTuple", data_type, "&" + varName(work_buffer) + "[0]")); + + // init + func_args.arg(genCall( + "LocalTuple", data_type, genInline(grouped_grop->initVal(i)))); + + // reduction op + func_args.arg(genReductionOp( + grouped_grop->getReductionOpType(i), + grouped_grop->output(i)->dtype())); + } + + // global_sync_buffer + const auto sync_buffer = + grouped_grop->sync_buffer()->buffer()->as(); + func_args.arg("&").append(varName(sync_buffer)).append("[0]"); + + // shared_buf + func_args.arg("shared_mem"); + + // read and write predicates TORCH_INTERNAL_ASSERT( - node->predicate() != nullptr && node->predicate()->hasValue()); - auto read_pred = genInline(node->predicate()); - indent() << kTab << read_pred << ",\n"; - if (node->writePredicate() != nullptr) { - TORCH_INTERNAL_ASSERT(node->writePredicate()->hasValue()); - auto write_pred = genInline(node->writePredicate()); - indent() << kTab << write_pred << ",\n"; + grouped_grop->predicate() != nullptr && + grouped_grop->predicate()->hasValue()); + const auto read_pred = genInline(grouped_grop->predicate()); + func_args.arg(read_pred); + if (grouped_grop->writePredicate() != nullptr) { + TORCH_INTERNAL_ASSERT(grouped_grop->writePredicate()->hasValue()); + func_args.arg(genInline(grouped_grop->writePredicate())); } else { - indent() << kTab << read_pred << ",\n"; + func_args.arg(read_pred); } - indent() << kTab << data_type << "(" - << genInline(node->reduction_op()->init()) << "));\n"; + + indent() << genFusedReductionName(ir_utils::getTvOutput(grouped_grop)) + << ".reduceGroup(\n"; + indent() << kTab << func_args << ");\n"; } - void visit(const kir::GridBroadcast* node) final { - const auto bop = node->broadcast_op(); + void handle(const kir::GridBroadcast* grop) final { + const auto bop = grop->broadcast_op(); TORCH_INTERNAL_ASSERT(bop->out()->isA()); - const auto out = bop->out()->as(); - const auto domain = out->view()->domain(); - TORCH_INTERNAL_ASSERT(domain->hasGridBroadcast()); - - const auto data_type = bop->out()->dtype(); + const ParallelTypeBitmap parallel_types = + kernel_->summary().broadcast_parallel_types.at(bop); TORCH_INTERNAL_ASSERT( - node->broadcast_buffer()->buffer()->isA()); + parallel_types.hasBID(), + "GridBroadcast needs to be used with a broadcast op that is parallelized with the BID parallel types"); + TORCH_INTERNAL_ASSERT( - node->sync_buffer()->buffer()->isA()); + grop->broadcast_buffer()->buffer()->isA()); + TORCH_INTERNAL_ASSERT(grop->sync_buffer()->buffer()->isA()); const auto work_buffer = - node->broadcast_buffer()->buffer()->as(); - const auto sync_buffer = - node->sync_buffer()->buffer()->as(); + grop->broadcast_buffer()->buffer()->as(); + const auto sync_buffer = grop->sync_buffer()->buffer()->as(); - const auto par_domains = ir_utils::getParallelDomains(out); std::stringstream flags_str; for (const ParallelType pt : kParallelTypeThreads) { - const bool parallel_bcast = par_domains.find(pt) != par_domains.end() && - par_domains.at(pt)->isBroadcast(); + const bool 
parallel_bcast = parallel_types.get(pt); if (pt != kParallelTypeThreads[0]) { flags_str << ", "; } @@ -1041,7 +1576,7 @@ class CudaKernelGenerator : private kir::IrVisitor { } // Since block-level broadcast has not necessarily been performed before - // this function call, so grid broadcast may be broadcasting across both + // this function call, so grid broadcast may be broadcasting across both // the grid and the block level. indent() << "grid_broadcast::broadcast<" << flags_str.str() << ">(\n"; indent() << kTab << gen(bop->out()) << ",\n"; @@ -1049,12 +1584,12 @@ class CudaKernelGenerator : private kir::IrVisitor { indent() << kTab << "&" << varName(work_buffer) << "[0],\n"; indent() << kTab << varName(sync_buffer) << ",\n"; TORCH_INTERNAL_ASSERT( - node->predicate() != nullptr && node->predicate()->hasValue()); - indent() << kTab << genInline(node->predicate()) << ");\n"; + grop->predicate() != nullptr && grop->predicate()->hasValue()); + indent() << kTab << genInline(grop->predicate()) << ");\n"; } - void visit(const kir::GridWelford* node) final { - const auto wop = node->welford_op(); + void handle(const kir::GridWelford* gwop) final { + const auto wop = gwop->welford_op(); TORCH_INTERNAL_ASSERT(wop->outAvg()->isA()); const auto out = wop->out()->as(); @@ -1063,41 +1598,43 @@ class CudaKernelGenerator : private kir::IrVisitor { const auto data_type = out->dtype(); - TORCH_INTERNAL_ASSERT(node->var_buffer()->buffer()->isA()); - TORCH_INTERNAL_ASSERT( - node->sync_buffer()->buffer()->isA()); + TORCH_INTERNAL_ASSERT(gwop->var_buffer()->buffer()->isA()); + TORCH_INTERNAL_ASSERT(gwop->sync_buffer()->buffer()->isA()); - const auto avg_buffer = node->avg_buffer()->buffer()->as(); - const auto var_buffer = node->var_buffer()->buffer()->as(); - const auto n_buffer = node->N_buffer()->buffer()->as(); - const auto sync_buffer = - node->sync_buffer()->buffer()->as(); + const auto avg_buffer = gwop->avg_buffer()->buffer()->as(); + const auto var_buffer = gwop->var_buffer()->buffer()->as(); + const auto n_buffer = gwop->N_buffer()->buffer()->as(); + const auto sync_buffer = gwop->sync_buffer()->buffer()->as(); + + if (wop->isAllreduce()) { + generateGridAllreduce(gwop); + return; + } const bool persistent_sync = kernel_->summary().has_cooperative_grid_reduction; const std::string flags_str = - generateGridReduceTemplateFlags(wop, node->threadPredicate()); + generateGridReduceTemplateFlags(wop, gwop->threadPredicate()); // Since block-level reduction is already done, those dimensions // with tidx/y/z being true do not participate in the grid reduction. indent() << "welford::gridWelford<" << flags_str << ", " << (persistent_sync ? 
"true" : "false") << ">(\n"; - indent() << kTab << gen(wop->outAvg()) << ",\n" - << kTab << gen(wop->outVar()) << ",\n" - << kTab << gen(wop->outN()) << ",\n"; + indent() << kTab << gen(wop->outAvg()) << ",\n"; + indent() << kTab << gen(wop->outVar()) << ",\n"; + indent() << kTab << gen(wop->outN()) << ",\n"; if (domain->hasBlockReduction()) { - indent() << kTab << "block_result_avg_" << block_reduce_name_ << ",\n" - << kTab << "block_result_var_" << block_reduce_name_ << ",\n" - << kTab << "block_result_n_" << block_reduce_name_ << ",\n"; + indent() << kTab << "block_result_avg_" << block_reduce_name_ << ",\n"; + indent() << kTab << "block_result_var_" << block_reduce_name_ << ",\n"; + indent() << kTab << "block_result_n_" << block_reduce_name_ << ",\n"; block_reduce_name_++; } else { indent() << kTab << gen(wop->inAvg()) << ",\n"; - if (wop->inVar() == nullptr) { - indent() << kTab << "(" << data_type << ") 0,\n"; - } else { - indent() << kTab << gen(wop->inVar()) << ",\n"; - } + TORCH_INTERNAL_ASSERT( + wop->inVar() != nullptr, "Welford var input nullptr not allowed"); + indent() << kTab << "(" << wop->outVar()->dtype() << ")" + << gen(wop->inVar()) << ",\n"; indent() << kTab << "(" << wop->outN()->dtype() << ")" << gen(wop->inN()) << ",\n"; } @@ -1112,112 +1649,291 @@ class CudaKernelGenerator : private kir::IrVisitor { indent() << kTab << "reinterpret_cast<" << wop->outN()->dtype() << "*>(shared_mem_n),\n"; TORCH_INTERNAL_ASSERT( - node->predicate() != nullptr && node->predicate()->hasValue()); - auto read_pred = genInline(node->predicate()); + gwop->predicate() != nullptr && gwop->predicate()->hasValue()); + auto read_pred = genInline(gwop->predicate()); indent() << kTab << read_pred << ",\n"; - if (node->writePredicate() != nullptr) { - TORCH_INTERNAL_ASSERT(node->writePredicate()->hasValue()); - auto write_pred = genInline(node->writePredicate()); + if (gwop->writePredicate() != nullptr) { + TORCH_INTERNAL_ASSERT(gwop->writePredicate()->hasValue()); + auto write_pred = genInline(gwop->writePredicate()); indent() << kTab << write_pred << ",\n"; } else { indent() << kTab << read_pred << ",\n"; } // TODO : init value support or remove. 
- indent() << kTab << data_type << "(0));\n"; + indent() << kTab << data_type << "(0),\n"; + indent() << kTab << genInline(gwop->entrance_index()) << ",\n"; + indent() << kTab << genInline(gwop->entrances()); + code_ << ");\n"; + } + + void generateGridAllreduce(const kir::GridWelford* gwop) { + const auto wop = gwop->welford_op(); + TORCH_INTERNAL_ASSERT(wop->isAllreduce()); + + const auto out = wop->out()->as(); + + const auto data_type = wop->outAvg()->dtype(); + const auto index_type = wop->outN()->dtype(); + TORCH_INTERNAL_ASSERT(wop->outAvg()->dtype() == wop->outVar()->dtype()); + + ArgumentBuilder data_type_args; + data_type_args.arg(data_type).arg(data_type).arg(index_type); + + const auto sync_buffer = gwop->sync_buffer()->buffer()->as(); + + const auto reduction_name = genFusedReductionName(out->view()); + + // template + // __device__ __inline__ void reduce( + // RefTuple out, + // const LocalTuple& inp, + // VolatilePtrTuple global_work_buffer, + // int64_t* global_sync_buffer, // Allocated as product of all + // // non-participating Grid dimension + // PtrTuple shared_buf, + // bool read_pred, // Prevent reading from out of bounds memory + // bool write_pred, // Prevent from writing out of bounds + // const LocalTuple& init_val, + // Func reduction_op); + + ArgumentBuilder out_args; + out_args.arg(gen(wop->outAvg())); + out_args.arg(gen(wop->outVar())); + out_args.arg(gen(wop->outN())); + + ArgumentBuilder in_args; + in_args.arg(gen(wop->inAvg())); + if (wop->inVar() != nullptr) { + in_args.arg(gen(wop->inVar())); + } else { + in_args.arg("(").append(data_type).append(")0"); + } + in_args.arg(gen(wop->inN())); + + ArgumentBuilder init_args; + init_args.arg(gen(wop->initAvg())); + init_args.arg(gen(wop->initVar())); + init_args.arg(gen(wop->initN())); + + ArgumentBuilder work_buffer_args; + work_buffer_args.arg("&") + .append(varName(gwop->avg_buffer()->buffer()->as())) + .append("[0]"); + work_buffer_args.arg("&") + .append(varName(gwop->var_buffer()->buffer()->as())) + .append("[0]"); + work_buffer_args.arg("&") + .append(varName(gwop->N_buffer()->buffer()->as())) + .append("[0]"); + + ArgumentBuilder smem_buffer_args; + smem_buffer_args.arg( + genCall("reinterpret_cast", ptrType(data_type), "shared_mem_avg")); + smem_buffer_args.arg( + genCall("reinterpret_cast", ptrType(data_type), "shared_mem_var")); + smem_buffer_args.arg( + genCall("reinterpret_cast", ptrType(index_type), "shared_mem_n")); + + ArgumentBuilder func_args(block_nest_level_ + 1, kTab); + // out + func_args.arg(genCall("RefTuple", data_type_args, out_args)); + // inp + func_args.arg(genCall("ConstRefTuple", data_type_args, in_args)); + // global_work_buffer + func_args.arg( + genCall("VolatilePtrTuple", data_type_args, work_buffer_args)); + // global_sync_buffer + func_args.arg("&").append(varName(sync_buffer)).append("[0]"); + // shared_buf + func_args.arg(genCall("PtrTuple", data_type_args, smem_buffer_args)); + // read and write predicates + TORCH_INTERNAL_ASSERT( + gwop->predicate() != nullptr && gwop->predicate()->hasValue()); + const auto read_pred = genInline(gwop->predicate()); + auto write_pred = read_pred; + if (gwop->writePredicate() != nullptr) { + TORCH_INTERNAL_ASSERT(gwop->writePredicate()->hasValue()); + write_pred = genInline(gwop->writePredicate()); + } + func_args.arg(read_pred).arg(write_pred); + // init_val + func_args.arg(genCall("LocalTuple", data_type_args, init_args)); + // reduction_op + func_args.arg(genTemplate( + "welfordCombine", 
ArgumentBuilder().arg(data_type).arg(index_type))); + + indent() << reduction_name << ".reduce(\n"; + indent() << kTab << func_args << ");\n"; + } + + void handle(const kir::AllocateFusedReduction* alloc_fused_reduction) final { + // See the runtime file of the fused reduction + enum class ReductionParallelTypeState { Reduce, Iter, Pred, Inactive }; + + using ReductionParallelTypeStateArray = + ParallelTypeMap; + + ReductionParallelTypeStateArray states( + ReductionParallelTypeState::Inactive); + + for (const ParallelType pt : kParallelTypeThreads) { + // It may be better to predicate grid reductions on dimensions they don't + // actively use, however since that should generally be discouraged (they + // should be part of the iter portion of the operation, or they should be + // predciated out) we're just going to assume they're part of the iter + // dimension. This would cause more communication than strictly necessary + // but should not be a common use case. + auto pt_dim = kernel_->summary().parallel_dimension_map_.get(pt); + if (pt_dim == nullptr || pt_dim->isOneInt()) { + continue; + } + // Initialize pt_dim if used to an iter dimension. It may change to a + // reduction or predicated dimension later. + states[pt] = ReductionParallelTypeState::Iter; + } + + for (auto id : alloc_fused_reduction->out()->view()->domain()->domain()) { + auto pt = id->getParallelType(); + if (isParallelTypeThread(pt)) { + auto state = id->isReduction() ? ReductionParallelTypeState::Reduce + : ReductionParallelTypeState::Iter; + states[pt] = state; + } + } + + for (const auto predicated_pt : alloc_fused_reduction->threadPredicate()) { + auto& state = states[predicated_pt]; + TORCH_INTERNAL_ASSERT( + state != ReductionParallelTypeState::Reduce, + "Invalid thread predication: ", + predicated_pt); + state = ReductionParallelTypeState::Pred; + } + + ArgumentBuilder flags; + for (auto pt : kParallelTypeThreads) { + flags.arg(static_cast(states[pt])); + } + + // Persistent + flags.arg(true); + + // Broadcast is fused + flags.arg(true); + + const auto reduction_name = + genFusedReductionName(alloc_fused_reduction->out()->view()); + + indent() << genTemplate("fused_reduction::ParallelReduce", flags) << " " + << reduction_name << ";\n"; } void handleScope(const kir::Scope& scope) { for (auto expr : scope.exprs()) { - expr->accept(this); + OptOutConstDispatch::handle(expr); } } - void visit(const kir::ForLoop* node) final { - // TODO(kir): handle this during lowering - if (node->iter_domain()->isBroadcast()) { - handleScope(node->body()); - return; - } else if (node->vectorize()) { - vectorize_scope_ = node->vectorize(); - handleScope(node->body()); + void handleTrivialLoop(const kir::ForLoop* loop) { + if (loop->vectorize()) { + vectorize_scope_ = loop->vectorize(); + } + handleScope(loop->body()); + if (loop->vectorize()) { vectorize_scope_ = false; - return; - } else if (node->iter_domain()->isStride()) { - // A stride domain only executes the loop body with the loop - // index being zero. - indent() << "constexpr " - << "nvfuser_index_t" - << " " << gen(node->index()) << " = 0;\n"; - handleScope(node->body()); - return; } + } - // By default, a parallelized loop would look like: - // - // for (int x = threadIdx.x; x < stop; x += blockDim.x) { - // do_some_comp(x); - // } - // - // When stop is guaranteed to be smaller or equal to the number of - // threads, the for-loop is not necessary. 
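The AllocateFusedReduction handler above classifies every thread/block parallel type as reducing, iterating, predicated, or inactive and passes those states, plus the persistence and fused-broadcast flags, as template parameters of fused_reduction::ParallelReduce. A simplified sketch of just that encoding step (the six-entry array standing in for the BIDx/y/z and TIDx/y/z states is an assumption about the parallel-type count):

#include <array>
#include <ios>
#include <sstream>
#include <string>

// Mirrors the flag encoding in the AllocateFusedReduction handler above: one
// integer state per parallel type, followed by the persistent and
// fused-broadcast booleans. Simplified sketch; the real states come from the
// parallel dimension map and the output tensor's leaf domains.
enum class ReductionParallelTypeStateSketch { Reduce, Iter, Pred, Inactive };

std::string parallelReduceFlagsSketch(
    const std::array<ReductionParallelTypeStateSketch, 6>& states,
    bool persistent,
    bool broadcast_fused) {
  std::stringstream flags;
  flags << std::boolalpha;
  for (const auto& state : states) {
    flags << static_cast<int>(state) << ", ";
  }
  flags << persistent << ", " << broadcast_fused;
  // e.g. "0, 1, 3, 3, 3, 3, true, true" becomes the template parameter list
  // of the fused_reduction::ParallelReduce object declared in the kernel.
  return flags.str();
}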
In the above case, we - // would just generate the loop body without the for clause but - // references to the loop index replaced by the loop start value. - // - // When the loop end is the same as the IterDomain extent, the - // assumption can be safely made. This is more conservative than - // necessary since the loop stop value just needs to be <= the - // IterDomain extent. However, at this point, this conservative - // analysis seems sufficient. - if (node->stop() == node->iter_domain()->extent() && - node->iter_domain()->isThread()) { - // Register a replacement of references to the loop index with - // the loop start value. - replacement_map_.insert({node->index(), node->start()}); - handleScope(node->body()); - replacement_map_.erase(node->index()); - return; + void handle(const GroupedReductionOp* grouped_rop) final { + for (const auto i : c10::irange(grouped_rop->numReductions())) { + TORCH_INTERNAL_ASSERT(grouped_rop->output(i)->isA()); + + const auto output = grouped_rop->output(i)->as(); + const auto input = grouped_rop->input(i)->as(); + const auto domain = output->view()->domain(); + const auto op_type = grouped_rop->getReductionOpType(i); + + const bool has_block_reduce = domain->hasBlockReduction(); + const bool has_grid_reduce = domain->hasGridReduction(); + + TORCH_INTERNAL_ASSERT( + !has_grid_reduce, + "GroupedReductionOp does not support block parallelization. GroupedGridReductionOp must be used. ", + grouped_rop->toString()); + + if (!has_block_reduce) { + genSerialReduction(output, input, op_type); + } else if ( + auto reduction_id = + ir_utils::getMaybeWarpReductionDim(output, input)) { + genWarpReduction( + output, + input, + grouped_rop->initVal(i), + op_type, + grouped_rop->predicate()); + } else { + genBlockReduction( + output, + input, + grouped_rop->initVal(i), + op_type, + grouped_rop->predicate(), + grouped_rop->writePredicate()); + } } + } - if (node->start()->isZeroInt() && node->stop()->isOneInt()) { - indent() << "constexpr " - << "nvfuser_index_t" - << " " << gen(node->index()) << " = 0;\n"; - handleScope(node->body()); + void handle(const kir::ForLoop* loop) final { + if (loop->isTrivial()) { + handleTrivialLoop(loop); return; } - const auto gen_index = gen(node->index()); - const auto gen_start = genInline(node->start()); - const auto gen_stop = genInline(node->stop()); - const auto gen_step = genInline(node->step()); + const auto gen_index = gen(loop->index()); + const auto gen_start = genInline(loop->start()); + const auto gen_stop = genInline(loop->stop()); + const auto gen_step = genInline(loop->step()); std::stringstream step_code; - if (node->step()->isOneInt()) { + if (loop->step()->isOneInt()) { step_code << "++" << gen_index; } else { step_code << gen_index << " += " << gen_step; } - if (node->isUnrolled()) { + if (loop->isUnrolled()) { indent() << "#pragma unroll\n"; } else { indent() << "#pragma unroll 1\n"; } - indent() << "for(nvfuser_index_t " << gen_index << " = " << gen_start - << "; " << gen_index << " < " << gen_stop << "; " - << step_code.str() << ") "; + + indent() << "for(nvfuser_index_t " << gen_index; + if (loop->iter_domain()->isParallelized()) { + code_ << " = " << gen_start << "; "; + } else { + // Do not start at the start of the ID when not parallelized. Instead, + // start at 0. Predicates will protect buffers between 0 and ID->start(), + // however if we started at ID->start and extent == ID->start, we could + // have a "degenerate" loop (loop with no iterations). 
It may not be an + // issue to have a 0-sized loop, but all potential consequences haven't + // been covered. One example is WAR analysis which could incorrectly think + // a barrier inside a 0-sized loop actually provides protection. + code_ << " = 0; "; + } + code_ << gen_index << " < " << gen_stop << "; " << step_code.str() << ") "; startBlock(true); - handleScope(node->body()); + handleScope(loop->body()); endBlock(); } - void visit(const kir::IfThenElse* node) final { - auto conditional = node->predicate()->value(); + void handle(const kir::IfThenElse* ite) final { + auto conditional = ite->predicate()->value(); if (conditional->isConst()) { // If the conditional is a constant, then the IfThenElse is not required if (conditional->value().value()) { - handleScope(node->thenBody()); + handleScope(ite->thenBody()); } else { - handleScope(node->elseBody()); + handleScope(ite->elseBody()); } return; } @@ -1226,73 +1942,77 @@ class CudaKernelGenerator : private kir::IrVisitor { // "then" block startBlock(true); - handleScope(node->thenBody()); + handleScope(ite->thenBody()); // "else" block (optional) - if (node->hasElse()) { + if (ite->hasElse()) { endBlock(" else "); startBlock(true); - handleScope(node->elseBody()); + handleScope(ite->elseBody()); } endBlock(); } - // TODO(kir): fold initialization into Allocate - void visit(const kir::Allocate* node) final { - const auto buffer_dtype = node->buffer()->dtype(); + void handle(const kir::Allocate* alloc) final { + const auto buffer_dtype = alloc->buffer()->dtype(); - if (!node->buffer()->isA()) { - indent() << buffer_dtype << " " << gen(node->buffer()) << ";\n"; + TORCH_INTERNAL_ASSERT(alloc->buffer() != nullptr); + alloc_map_.emplace(alloc->buffer(), alloc); + + if (!alloc->buffer()->isA()) { + indent() << buffer_dtype << " " << gen(alloc->buffer()) << ";\n"; return; } - const auto tv = node->buffer()->as(); + const auto tv = alloc->buffer()->as(); - const auto size = node->size(); + const auto size = alloc->size(); TORCH_INTERNAL_ASSERT(size != nullptr); - if (node->alias() != nullptr) { - // Allocate alias another Allocate node - const auto alias_tv = node->alias()->buffer()->as(); - indent() << "// Alias Allocation - " << node->memoryType() << "\n"; - indent() << buffer_dtype << "* " << varName(tv) << " = " - << varName(alias_tv) << ";\n"; + if (alloc->alias() != nullptr) { + // Allocate alias another Allocate stmt + const auto alias_tv = alloc->alias()->buffer()->as(); + indent() << "// Alias Allocation - " << alloc->memoryType() << "\n"; + indent() << "auto& " << varName(tv) << " = " << varName(alias_tv) + << ";\n"; + } else { // Standard Memory Allocation - switch (tv->memoryType()) { + switch (tv->getMemoryType()) { case MemoryType::Global: indent() << "// Allocate global tensor " << varName(tv) << "\n"; break; case MemoryType::Shared: - if (kir::ExpressionEvaluator::isConst(size)) { - // Static shared memory - indent() << "__shared__ " << buffer_dtype << " " << varName(tv) - << "[" << genInline(size) << "];\n"; + // Align Offset Position + indent() << "offset = alignBufferSize(offset, " + // Always align to 128b / 16B + << 16 << ");\n"; + // Shared Memory Pointer + indent() << buffer_dtype << "* " << varName(tv) + << " = reinterpret_cast<" << buffer_dtype << "*>" + << "(array + offset);\n"; + // Increment Offset Position + indent() << "offset += (" << genInline(size) << " * sizeof(" + << buffer_dtype << "));\n"; + break; + case MemoryType::Local: { + auto va = kernel_->summary().vectorized_accesses; + if (va.find(tv) != va.end()) 
{ + indent() << "Array<" << buffer_dtype << ", " << genInline(size) + << ", " << va.at(tv) << "> " << varName(tv) << ";\n"; } else { - // Align Offset Position - indent() << "offset = alignBufferSize(offset," - << dataTypeSize(buffer_dtype) << ");\n"; - // Shared Memory Pointer - indent() << buffer_dtype << "* " << varName(tv) - << " = reinterpret_cast<" << buffer_dtype << "*>" - << "(array + offset);\n"; - // Increment Offset Position - indent() << "offset += (" << genInline(size) << " * sizeof(" - << buffer_dtype << "));\n"; + indent() << buffer_dtype << " " << varName(tv) << "[" + << genInline(size) << "];\n"; } - break; - case MemoryType::Local: - indent() << buffer_dtype << " " << varName(tv) << "[" - << genInline(size) << "];\n"; - break; + } break; default: TORCH_INTERNAL_ASSERT(false, "Unexpected memory type"); } } } - void visit(const kir::Sync* node) final { + void handle(const kir::BlockSync*) final { // Use a custom synchronization method if enabled if (std::getenv("PYTORCH_NVFUSER_USE_BLOCK_SYNC_ATOMIC")) { indent() << "block_sync::sync();\n"; @@ -1301,11 +2021,43 @@ class CudaKernelGenerator : private kir::IrVisitor { } } - void visit(const kir::InitMagicZero* node) final { + void handle(const kir::GridSync* sync) final { + // Use a custom synchronization method if enabled + bool bidx = sync->syncDims().get(ParallelType::BIDx); + bool bidy = sync->syncDims().get(ParallelType::BIDy); + bool bidz = sync->syncDims().get(ParallelType::BIDz); + + ArgumentBuilder sync_call_template_parms; + sync_call_template_parms.arg(bidx).arg(bidy).arg(bidz).arg(true); + + auto sync_idx = genCall( + "index_utils::maskedOffset", + ArgumentBuilder().arg(!bidx).arg(!bidy).arg(!bidz), + ArgumentBuilder().arg("blockIdx").arg("gridDim")); + + auto sync_segment_size = genCall( + "index_utils::maskedSize", + ArgumentBuilder().arg(bidx).arg(bidy).arg(bidz), + ArgumentBuilder().arg("gridDim")); + + ArgumentBuilder sync_call_args; + sync_call_args.arg(varName(sync->syncBuffer())) + .append("[") + .append(sync_idx) + .append("]"); + sync_call_args.arg(sync_segment_size); + + auto sync_call = + genCall("grid_sync::sync", sync_call_template_parms, sync_call_args); + + indent() << sync_call << ";\n"; + } + + void handle(const kir::InitMagicZero*) final { indent() << "NVFUSER_DEFINE_MAGIC_ZERO\n"; } - void visit(const kir::UpdateMagicZero* node) final { + void handle(const kir::UpdateMagicZero*) final { indent() << "NVFUSER_UPDATE_MAGIC_ZERO\n"; } @@ -1314,15 +2066,14 @@ class CudaKernelGenerator : private kir::IrVisitor { const kir::Kernel* kernel_; int block_nest_level_ = 0; int block_reduce_name_ = 0; - - // TODO(kir): replace with explicit assignment statements bool print_inline_ = false; // Mark when we are inside of a vectorized for-loop bool vectorize_scope_ = false; - //! Holds active replacement mappings during codegen - std::unordered_map replacement_map_; + //! Keep track of Allocate node for Val. Used to determine if Val + //! should be inlined. 
+ std::unordered_map alloc_map_; }; } // namespace diff --git a/torch/csrc/jit/codegen/cuda/codegen.h b/torch/csrc/jit/codegen/cuda/codegen.h index 2ffbb872155a..31e4fb707363 100644 --- a/torch/csrc/jit/codegen/cuda/codegen.h +++ b/torch/csrc/jit/codegen/cuda/codegen.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/torch/csrc/jit/codegen/cuda/compute_at.cpp b/torch/csrc/jit/codegen/cuda/compute_at.cpp index 45f744d7e2f1..77fc51363829 100644 --- a/torch/csrc/jit/codegen/cuda/compute_at.cpp +++ b/torch/csrc/jit/codegen/cuda/compute_at.cpp @@ -59,14 +59,8 @@ bool validateDomain(TensorView* tv, TensorDomain* new_td) { unsigned int getReplayablePosPasC( TensorView* producer, TensorView* consumer, - const ComputeAtRootDomainMap& root_map_, + const std::unordered_set& unmappable_producer_dims, ComputeAtMode mode) { - // Grab dimensions in producer and consumer that are mappable to eachother - // based on the computeAtRootDomainMap. This will tell us which dimensions - // can be inlined based on avoiding trying to inline reduction structures. - auto mappable_roots = - root_map_.getMappableDims(producer->domain(), consumer->domain()); - // Check if any consumer dimensions are marked as vectorize as producer can // not be inlined to vectorized dimensions in consumer. auto c_dom = consumer->domain()->domain(); @@ -124,9 +118,14 @@ unsigned int getReplayablePosPasC( if (std::any_of( consumer_root_dim_ids.begin(), consumer_root_dim_ids.end(), - [&mappable_roots, &c2p_root_map](IterDomain* root_id) { - return mappable_roots.find(root_id) == mappable_roots.end() && - c2p_root_map.find(root_id) != c2p_root_map.end(); + [&unmappable_producer_dims, &c2p_root_map](IterDomain* c_root_id) { + auto p_root_id_it = c2p_root_map.find(c_root_id); + if (p_root_id_it == c2p_root_map.end()) { + return false; + } + auto p_id = p_root_id_it->second; + return unmappable_producer_dims.find(p_id) != + unmappable_producer_dims.end(); })) { continue; } @@ -146,14 +145,8 @@ unsigned int getReplayablePosPasC( unsigned int getReplayablePosCasP( TensorView* consumer, TensorView* producer, - const ComputeAtRootDomainMap& root_map_, + const std::unordered_set& unmappable_producer_dims, ComputeAtMode mode) { - // Grab dimensions in producer and consumer that are mappable to eachother - // based on the computeAtRootDomainMap. This will tell us which dimensions - // can be inlined based on avoiding trying to inline reduction structures. 
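The GridSync and shared-memory Allocate handlers above lean on a few small runtime helpers: index_utils::maskedOffset and index_utils::maskedSize select the grid dimensions that do or do not take part in a sync, and alignBufferSize rounds the running shared-memory offset up to a 16-byte boundary. The sketch below shows the semantics implied by those call sites; the exact implementations, and the dimension ordering in maskedOffset, are assumptions, since the real versions live in the codegen's CUDA runtime headers:

#include <cstdint>

// Assumed semantics of three runtime helpers referenced by the generated code
// above; inferred from the call sites, not quoted from the runtime headers.
struct Dim3Sketch { // stand-in for blockIdx/gridDim style triples
  int64_t x, y, z;
};

// maskedOffset<X, Y, Z>(blockIdx, gridDim): linear index over the flagged
// dimensions. Above it is called with the negated BID flags, so it indexes
// the sync buffer by the dimensions that do NOT participate in the sync.
// The x/y/z ordering here is an assumption.
template <bool X, bool Y, bool Z>
int64_t maskedOffsetSketch(Dim3Sketch idx, Dim3Sketch dim) {
  int64_t offset = 0;
  if (Z) offset = offset * dim.z + idx.z;
  if (Y) offset = offset * dim.y + idx.y;
  if (X) offset = offset * dim.x + idx.x;
  return offset;
}

// maskedSize<X, Y, Z>(gridDim): number of blocks participating in the sync,
// the product of the flagged dimensions.
template <bool X, bool Y, bool Z>
int64_t maskedSizeSketch(Dim3Sketch dim) {
  return (X ? dim.x : 1) * (Y ? dim.y : 1) * (Z ? dim.z : 1);
}

// alignBufferSize(offset, 16): round the running shared-memory offset up to
// the next multiple of `align` before carving out the next buffer.
inline int64_t alignBufferSizeSketch(int64_t offset, int64_t align) {
  return (offset + align - 1) / align * align;
}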
- auto mappable_roots = - root_map_.getMappableDims(producer->domain(), consumer->domain()); - auto p_dom = producer->domain()->domain(); auto first_reduction = std::find_if(p_dom.begin(), p_dom.end(), [](IterDomain* id) { @@ -208,10 +201,11 @@ unsigned int getReplayablePosCasP( if (std::any_of( producer->getMaybeRFactorDomain().begin(), producer->getMaybeRFactorDomain().end(), - [&mappable_roots, &all_vals](IterDomain* root_id) { - return std::find(all_vals.begin(), all_vals.end(), root_id) != + [&unmappable_producer_dims, &all_vals](IterDomain* p_root_id) { + return std::find(all_vals.begin(), all_vals.end(), p_root_id) != all_vals.end() && - mappable_roots.find(root_id) == mappable_roots.end(); + unmappable_producer_dims.find(p_root_id) != + unmappable_producer_dims.end(); })) { continue; } @@ -446,7 +440,8 @@ unsigned int ComputeAt::backwardComputeAt_impl( FUSER_PERF_SCOPE("backwardComputeAt_impl"); auto max_consumer_compute_at_pos = - getReplayablePosPasC(producer, consumer, root_map_, mode_); + getReplayablePosPasC(producer, consumer, unmappable_dims_, mode_); + if (mode_ == ComputeAtMode::BestEffort) { consumer_compute_at_pos = std::min(consumer_compute_at_pos, max_consumer_compute_at_pos); @@ -477,7 +472,10 @@ unsigned int ComputeAt::backwardComputeAt_impl( } auto replay_producer_pair = TransformReplay::replayPasC( - producer, consumer, (int)consumer_compute_at_pos, root_map_); + producer, + consumer, + (int)consumer_compute_at_pos, + PairwiseRootDomainMap(producer, consumer)); if (replay_producer_pair.second == 0) { return 0; @@ -517,7 +515,7 @@ unsigned int ComputeAt::forwardComputeAt_impl( FUSER_PERF_SCOPE("forwardComputeAt_impl"); auto max_producer_compute_at_pos = - getReplayablePosCasP(consumer, producer, root_map_, mode_); + getReplayablePosCasP(consumer, producer, unmappable_dims_, mode_); if (mode_ == ComputeAtMode::BestEffort) { producer_compute_at_pos = @@ -549,7 +547,10 @@ unsigned int ComputeAt::forwardComputeAt_impl( } auto replay_consumer_pair = TransformReplay::replayCasP( - consumer, producer, (int)producer_compute_at_pos, root_map_); + consumer, + producer, + (int)producer_compute_at_pos, + PairwiseRootDomainMap(producer, consumer)); if (producer_compute_at_pos > producer->getComputeAtPosition()) { if (!producer->isFusionInput()) { @@ -657,7 +658,6 @@ void ComputeAt::traverseBackward() { running_consumer = running_producer; running_producer = tv_chain.back(); tv_chain.pop_back(); - running_consumer_pos = backwardComputeAt_impl( running_producer, running_consumer, running_consumer_pos); } @@ -790,16 +790,14 @@ void ComputeAt::updateSiblings() { id->parallelize(sibling_id->getParallelType()); } } - if (tv->getComputeAtPosition() > sibling_tv->getComputeAtPosition()) { - auto sibling_domain = TransformReplay::fullSelfReplay( - sibling_tv->domain(), tv->domain()); - validateDomain(sibling_tv, sibling_domain); - sibling_tv->setDomain(sibling_domain); - sibling_tv->setComputeAt(tv->getComputeAtPosition()); - sibling_tv->setMaxProducer(tv->getMaxProducerPosition()); - auto consumer_tvs = ir_utils::consumerTvsOf(sibling_tv); - consumers_to_update.insert(consumer_tvs.begin(), consumer_tvs.end()); - } + auto sibling_domain = + TransformReplay::fullSelfReplay(sibling_tv->domain(), tv->domain()); + validateDomain(sibling_tv, sibling_domain); + sibling_tv->setDomain(sibling_domain); + sibling_tv->setComputeAt(tv->getComputeAtPosition()); + sibling_tv->setMaxProducer(tv->getMaxProducerPosition()); + auto consumer_tvs = ir_utils::consumerTvsOf(sibling_tv); + 
consumers_to_update.insert(consumer_tvs.begin(), consumer_tvs.end()); } } @@ -865,6 +863,27 @@ void ComputeAt::runPass() { } } +void ComputeAt::buildUnmappableDims() { + auto all_tvs = ir_utils::allTvs(producer_->fusion()); + for (auto tv : all_tvs) { + auto consumers = ir_utils::consumerTvsOf(tv); + for (auto consumer : consumers) { + // Grab dimensions in producer and consumer that are mappable to eachother + // based on the computeAtRootDomainMap. This will tell us which dimensions + // can be inlined based on avoiding trying to inline non-trivial + // reduction structures. + auto mappable_roots = + root_map_.getMappableDims(tv->domain(), consumer->domain()); + for (auto tv_root_id : tv->getMaybeRFactorDomain()) { + if (mappable_roots.find(tv_root_id) == mappable_roots.end() && + !tv_root_id->isTrivialReduction()) { + unmappable_dims_.emplace(tv_root_id); + } + } + } + } +} + ComputeAt::ComputeAt( TensorView* _producer, TensorView* _consumer, @@ -903,6 +922,8 @@ ComputeAt::ComputeAt( setCommonConsumer(); root_map_.build(); + + buildUnmappableDims(); } } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/compute_at.h b/torch/csrc/jit/codegen/cuda/compute_at.h index 391225218db9..75fca5705ed9 100644 --- a/torch/csrc/jit/codegen/cuda/compute_at.h +++ b/torch/csrc/jit/codegen/cuda/compute_at.h @@ -2,11 +2,12 @@ #include +#include #include -#include #include #include +#include #include namespace torch { @@ -68,6 +69,10 @@ class ComputeAt { // call. void setCommonConsumer(); + // Iterate through all TVs and collect the dimensions of each TV that don't + // map to all its consumer TVs. + void buildUnmappableDims(); + // Propagate backward from consumer to producer, check if it increase // computeAt position on tensors, if so take it! void traverseBackward(); @@ -106,6 +111,9 @@ class ComputeAt { // Producer use chains set in, used in a few spots. std::deque> producer_use_chains_; + // Root domains in producer that's unmappable to any of its consumers + std::unordered_set unmappable_dims_; + ComputeAt( TensorView* _producer, TensorView* _consumer, diff --git a/torch/csrc/jit/codegen/cuda/compute_at_map.cpp b/torch/csrc/jit/codegen/cuda/compute_at_map.cpp index 6671fc375463..43382f865d43 100644 --- a/torch/csrc/jit/codegen/cuda/compute_at_map.cpp +++ b/torch/csrc/jit/codegen/cuda/compute_at_map.cpp @@ -1,7 +1,7 @@ #include +#include #include -#include #include #include #include @@ -12,255 +12,87 @@ namespace fuser { namespace cuda { namespace { -//! Class to figure out how many non-broadcast axes and how many broadcast axes -//! were used to produce an iter domain. This is important for figuring out what -//! the correct broadcasted extent is of an iteration domain. -//! -//! When GpuLower is available, trivial reductions are not counted as -//! concrete domains so that they should not be used to generate -//! for-loops. -class InputDomainCounter : public IterVisitor { - public: - // Returns number of {non-braodcast non-reduction iteration domains, broadcast - // and trivial reduction domains} used to generate the iteration domains in - // provided target domain. 
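buildUnmappableDims above walks every tensor, asks the root-domain map which of its root/rfactor IDs can be mapped into each consumer, and records those that cannot (skipping trivial reductions) so the compute-at passes refuse to inline past them. A simplified, self-contained analogue of that collection step, with plain strings standing in for IterDomain pointers and a precomputed mappable set standing in for the ComputeAtRootDomainMap query:

#include <string>
#include <unordered_set>
#include <vector>

// Simplified analogue of buildUnmappableDims: any producer root ID that the
// root-domain map cannot map to the consumer, and that is not a trivial
// reduction, is recorded as un-inlinable. Types and names are placeholders.
std::unordered_set<std::string> collectUnmappableDimsSketch(
    const std::vector<std::string>& producer_root_ids,
    const std::unordered_set<std::string>& mappable,
    const std::unordered_set<std::string>& trivial_reductions) {
  std::unordered_set<std::string> unmappable;
  for (const auto& id : producer_root_ids) {
    if (mappable.count(id) == 0 && trivial_reductions.count(id) == 0) {
      unmappable.insert(id);
    }
  }
  return unmappable;
}
// A reduction root axis of a producer has no counterpart in its consumer, so
// it lands in the unmappable set and caps how far the producer can be inlined.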
- static std::unordered_map> produceCounts( - const std::vector& domain, - GpuLower* gpu_lower) { - if (domain.empty()) { - return std::unordered_map>(); - } - - InputDomainCounter counter(domain); - - std::unordered_map> count_map; - for (const auto& entry : counter.domain_set_) { - auto id = entry.first; - auto input_id_set = entry.second; - int concrete_counts = 0; - int broadcast_counts = 0; - for (auto input_id : input_id_set) { - if (input_id->isBroadcast() || - (gpu_lower && - gpu_lower->trivialReductionInfo().isDerived(input_id))) { - broadcast_counts++; - } else { - concrete_counts++; - } - } - count_map[id] = {concrete_counts, broadcast_counts}; - } - - // Inputs may be root domains which wouldn't have any entries if no exprs - // were traversed, so manually insert their count - for (auto id : domain) { - if (count_map.find(id) == count_map.end()) { - count_map[id] = - (id->isBroadcast() || - (gpu_lower && gpu_lower->trivialReductionInfo().isDerived(id))) - ? std::make_pair(0, 1) - : std::make_pair(1, 0); - } - } - return count_map; - } - - private: - InputDomainCounter(const std::vector& domain_) { - traverseFrom( - domain_[0]->fusion(), - std::vector(domain_.begin(), domain_.end())); - } - - private: - std::unordered_set& getEntry(IterDomain* id) { - auto domain_set_it = domain_set_.find(id); - if (domain_set_it == domain_set_.end()) { - domain_set_it = - domain_set_ - .emplace(std::make_pair(id, std::unordered_set())) - .first; - domain_set_it->second.emplace(id); - } - - return domain_set_it->second; - } - - void handle(Expr* expr) override { - // If we end up moving swizzle to an Expr it would be identity here, instead - // of outputs being a function of all inputs - switch (expr->getExprType().value()) { - case (ExprType::Split): - case (ExprType::Merge): - break; - default: - TORCH_INTERNAL_ASSERT( - false, "Invalid expr type found in transform traversal."); - } - - // Gather all non-broadcast input domains - std::unordered_set resulting_set; - for (auto input_id : ir_utils::filterByType(expr->inputs())) { - auto input_entry = getEntry(input_id); - resulting_set.insert(input_entry.begin(), input_entry.end()); - } - for (auto output_id : ir_utils::filterByType(expr->outputs())) { - domain_set_.emplace(std::make_pair(output_id, resulting_set)); - } - } - - std::unordered_map> domain_set_; -}; - -// Only used once, consider removing. 
-template -std::deque deduplicateDeque(const std::deque& deque) { - std::unordered_set used; - std::deque deduped; - for (auto entry : deque) { - if (used.find(entry) == used.end()) { - deduped.push_back(entry); - used.emplace(entry); - } - } - return deduped; +// Is the provided IterDomain an Leaf of provided TensorView and within its +// computeAtPosition +bool idIsAComputeAtLeafDomain(IterDomain* id, TensorView* tv) { + auto begin = tv->domain()->domain().begin(); + auto end = tv->domain()->domain().begin() + tv->getComputeAtPosition(); + return std::find(begin, end, id) != end; } -void assertLowered(bool lowered) { - TORCH_INTERNAL_ASSERT( - lowered, - "Tried to accessed lowered values of compute at map,", - " however a valid lowering was not set when compute at map was created."); +// Is the provided IterDomain an Leaf of provided TensorView +bool idIsALeafDomain(IterDomain* id, TensorView* tv) { + auto begin = tv->domain()->domain().begin(); + auto end = tv->domain()->domain().end(); + return std::find(begin, end, id) != end; } } // namespace -void ComputeAtMap::mapIds(IterDomain* id0, IterDomain* id1) { - auto set_it_0 = disjoint_iter_set_maps_.find(id0); - auto set_it_1 = disjoint_iter_set_maps_.find(id1); - if (set_it_0 == disjoint_iter_set_maps_.end() && - set_it_1 == disjoint_iter_set_maps_.end()) { - // Neither iter domain has been mapped, so make a new disjoint set - auto new_set = std::make_shared>(); - new_set.get()->push_back(id0); - new_set.get()->push_back(id1); - disjoint_iter_set_maps_.emplace(std::make_pair(id0, new_set)); - disjoint_iter_set_maps_.emplace(std::make_pair(id1, new_set)); - disjoint_iter_sets_.push_back(new_set); - - // Update parallel type map - if (mapping_mode_ == MappingMode::PARALLEL) { - if (id0->isParallelized() && id1->isParallelized()) { - // Both are parallelized, make sure they're the same, set entry for - // parallel map - TORCH_INTERNAL_ASSERT( - id0->getParallelType() == id1->getParallelType(), - "Parallel type of ", - id0, - " should match ", - id1); - parallel_type_map_[new_set] = id0->getParallelType(); - } else if (id0->isParallelized() || id1->isParallelized()) { - // Only one is parallelized, set entry for parallel map - parallel_type_map_[new_set] = id0->isParallelized() - ? 
id0->getParallelType() - : id1->getParallelType(); - } - } - - } else if ( - set_it_0 != disjoint_iter_set_maps_.end() && - set_it_1 != disjoint_iter_set_maps_.end()) { - // Both iter domains have been mapped, so join their sets together - auto set0_ptr = set_it_0->second; - auto set1_ptr = set_it_1->second; - - // If the sets are already the same, do nothing - if (set0_ptr == set1_ptr) { - return; - } - - // Place everything in set1 into set0 and remap all ID's in set1 to set0 - auto& set1 = *set1_ptr; - for (auto id : set1) { - set0_ptr->push_back(id); - disjoint_iter_set_maps_[id] = set0_ptr; - } - - // set1 no longer needed as its IDs are copied into set0 - disjoint_iter_sets_.erase(std::find( - disjoint_iter_sets_.begin(), disjoint_iter_sets_.end(), set1_ptr)); - - // Update parallel type map - if (mapping_mode_ == MappingMode::PARALLEL) { - auto parallel_type_0_it = parallel_type_map_.find(set0_ptr); - auto parallel_type_1_it = parallel_type_map_.find(set1_ptr); - if (parallel_type_0_it != parallel_type_map_.end() && - parallel_type_1_it != parallel_type_map_.end()) { - // If both sets had a parallel type associated with them, make sure they - // are the same - TORCH_INTERNAL_ASSERT( - parallel_type_0_it->second == parallel_type_1_it->second); - } else if (parallel_type_1_it != parallel_type_map_.end()) { - // Set 1 has a parallel type, set 0 does not, set parallel entry - parallel_type_map_[set0_ptr] = parallel_type_1_it->second; - } - // Else set 0 already has the right parallel type set in the map, if at - // all - - // Remove set1 from the parallel type map as it shouldn't exist anymore - parallel_type_map_.erase(set1_ptr); - } +IterDomainGraph::IterDomainGraph(Fusion* fusion) { + build(fusion); +} - } else { - auto existing_set = set_it_0 != disjoint_iter_set_maps_.end() - ? set_it_0->second - : set_it_1->second; - auto missing_id = set_it_0 != disjoint_iter_set_maps_.end() ? id1 : id0; - existing_set->push_back(missing_id); - disjoint_iter_set_maps_[missing_id] = existing_set; - - // Update parallel type map - if (mapping_mode_ == MappingMode::PARALLEL) { - auto parallel_type_it = parallel_type_map_.find(existing_set); - if (parallel_type_it != parallel_type_map_.end() && - missing_id->isParallelized()) { - // existing_set has a parallel type already and missing_id has a - // parallel type, make sure they match. 
No need to update map - TORCH_INTERNAL_ASSERT( - parallel_type_it->second == missing_id->getParallelType()); - } else if ( - parallel_type_it == parallel_type_map_.end() && - id1->isParallelized()) { - // Set parallel type of existing_set as the newly added missing_id is - // parallel - parallel_type_map_[existing_set] = missing_id->getParallelType(); +void IterDomainGraph::build(Fusion* fusion) { + // Initialize a node for every iteration domain + for (auto tv : ir_utils::allTvs(fusion)) { + const auto& root_domain = tv->getRootDomain(); + const auto& domain = tv->domain()->domain(); + + // Grab all values in the history of the tensor view's domain + auto all_vals = DependencyCheck::getAllValsBetween( + {root_domain.begin(), root_domain.end()}, + {domain.begin(), domain.end()}); + + // Filter so we only have iteration domains (ignore Ints used in split) + auto all_ids = ir_utils::filterByType(all_vals); + + // Check is this domain is a consumer of a view-like operation + bool view_like_domain = tv->domain()->hasViewLikeRFactor(); + + for (auto id : all_ids) { + // Check if this id is a view like rfactor id + bool is_view_rfactor_id = false; + if (view_like_domain && id->isRFactorProduct()) { + // If the tensor domain is a view like domain, and the iteration domain + // is marked as an rfactor product and is in the rfactor domain, it's a + // view like rfactor iteration domain + const auto& rfactor_domain = tv->domain()->getMaybeRFactorDomain(); + if (std::find(rfactor_domain.begin(), rfactor_domain.end(), id) != + rfactor_domain.end()) { + is_view_rfactor_id = true; + } } + bool is_leaf_id = + std::find(domain.begin(), domain.end(), id) != domain.end(); + initializeId(id, is_view_rfactor_id, is_leaf_id); } } -} -void ComputeAtMap::build(Fusion* fusion, GpuLower* gpu_lower) { - // Consumers can only show up once in an expression, keep track of all of them - std::vector consumer_tvs; + // All ID's are initialized, start connecting them on the permissive, exact, + // and loop dimensions. for (auto expr : fusion->exprs()) { - if (!expr->outputs()[0]->isA()) { + if (!ir_utils::isTvOp(expr)) { continue; } auto tv_outputs = ir_utils::filterByType(expr->outputs()); TensorView* first_output_tv = nullptr; - for (auto c_tv : tv_outputs) { - consumer_tvs.push_back(c_tv); + for (auto c_tv : tv_outputs) { if (first_output_tv == nullptr) { first_output_tv = c_tv; } else { - // Map multi outputs of an expression to eachother. c is current output, - // and f as first output. Keep consistent with the later section of - // producer and consumers. Which here producer is now "first output", - // and consumer is still consumer. + // Map multi outputs of an expression to each other. c is current + // output, and f as first output. Keep consistent with the later section + // of producer and consumers. Which here producer is now "first output", + // and consumer is still consumer. One exception is how the + // domains left of CA positions are handled in the Parallel + // map. Those domains are not mapped in producer and consumer + // mappings as they do not share loops, but are mapped in the + // case of mapping multiple outputs since they do share the + // same loops. 
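The permissive_nodes_, exact_nodes_ and loop_nodes_ containers populated below expose an initializeSet/mapEntries interface that behaves like a disjoint-set (union-find) structure keyed by IterDomain*: initializeSet creates a singleton set and mapEntries unions the sets of its two arguments. A minimal union-find sketch of that pattern (simplified; the real container in this codebase also keeps the ordered membership of each set, which the disjointSets() iteration later relies on):

#include <unordered_map>

// Minimal union-find sketch of the initializeSet()/mapEntries() pattern used
// by the permissive/exact/loop node maps below. Method names other than those
// two are illustrative only.
template <typename T>
class DisjointSetsSketch {
 public:
  void initializeSet(T entry) {
    parent_.emplace(entry, entry); // each entry starts in its own set
  }
  void mapEntries(T a, T b) {
    T root_a = find(a);
    T root_b = find(b);
    parent_[root_a] = root_b; // union the two sets
  }
  bool areMapped(T a, T b) {
    return find(a) == find(b); // same representative, same set
  }

 private:
  T find(T entry) {
    auto it = parent_.find(entry);
    if (it == parent_.end()) {
      parent_.emplace(entry, entry);
      return entry;
    }
    if (it->second == entry) {
      return entry;
    }
    T root = find(it->second); // path compression on the way back up
    parent_[entry] = root;
    return root;
  }
  std::unordered_map<T, T> parent_;
};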
TORCH_INTERNAL_ASSERT( c_tv->getRootDomain().size() == @@ -275,7 +107,10 @@ void ComputeAtMap::build(Fusion* fusion, GpuLower* gpu_lower) { c_tv->getRootDomain()[i], first_output_tv->getRootDomain()[i])); } - // Multi output mapping + // Multi output mapping, outputs are required to have the same domain + // and same transformations, so they can be mapped in permissive/exact, + // and when within compute at position of domain()->domain() in the + // parallel map. auto replay_FasC = BestEffortReplay( first_output_tv->domain()->domain(), c_tv->domain()->domain(), @@ -283,35 +118,19 @@ void ComputeAtMap::build(Fusion* fusion, GpuLower* gpu_lower) { auto c2f_map = replay_FasC.getReplay(); - // If we're creating parallel map, only map the leaf - // axes. Also, the producer axis must be left of the CA - // point. - // Otherwise, map the entire replay map. - if (mapping_mode_ == MappingMode::PARALLEL) { - // Mark axes left of compute at point for parallel type tracking - std::unordered_set producer_axes_to_map( - first_output_tv->domain()->domain().begin(), - first_output_tv->domain()->domain().begin() + - first_output_tv->getComputeAtPosition()); - - for (auto c_id : c_tv->domain()->domain()) { - auto it = c2f_map.find(c_id); - if (it == c2f_map.end()) { - continue; - } - auto f_id = it->second; - if (producer_axes_to_map.find(f_id) == producer_axes_to_map.end()) { - continue; - } - mapIds(f_id, c_id); - } - } else { - for (auto entry : c2f_map) { - auto c_id = entry.first; - auto f_id = entry.second; - // Map the id's together - mapIds(f_id, c_id); + // Map the entire replay map between the multiple + // consumers even for the Parallel map as they share the same + // loop. + for (auto entry : c2f_map) { + auto c_id = entry.first; + auto f_id = entry.second; + // Map the id's together + permissive_nodes_.mapEntries(f_id, c_id); + exact_nodes_.mapEntries(f_id, c_id); + if (idIsALeafDomain(f_id, first_output_tv)) { + loop_nodes_.mapEntries(f_id, c_id); } + sibling_sets_.mapEntries(f_id, c_id); } } @@ -322,24 +141,9 @@ void ComputeAtMap::build(Fusion* fusion, GpuLower* gpu_lower) { // consumer/producer as their thread mappings could change as long as // it's across shared/global memory. auto pairwise_map = PairwiseRootDomainMap(p_tv, c_tv); - auto c2p_root_map = + const auto& permissive_c2p_root_map = pairwise_map.mapConsumerToProducer(c_tv->domain(), p_tv->domain()); - // For index map do not map any broadcast dimensions to non-broadcast - // dimensions - if (mapping_mode_ == MappingMode::INDEX) { - // Prevent any broadcasted axes being mapped to non-broadcasted axes. - for (auto it = c2p_root_map.begin(); it != c2p_root_map.end();) { - auto c_id = it->first; - auto p_id = it->second; - if (p_id->isBroadcast() != c_id->isBroadcast()) { - it = c2p_root_map.erase(it); - } else { - ++it; - } - } - } - // Look for matching ID transformations in producer and consumer, replay // producer as consumer. We want to replay producer as consumer instead // of the other way around since consumer may have some broadcasted axes @@ -348,304 +152,354 @@ void ComputeAtMap::build(Fusion* fusion, GpuLower* gpu_lower) { // mapping. If we're using this map for indexing, we do not want to // propagate broadcast mismatches. If we're using it to identify loop // nests, we do want to propagate mismatches. - auto replay_PasC = mapping_mode_ == MappingMode::LOOP || - mapping_mode_ == MappingMode::PARALLEL - ? 
BestEffortReplay::replayPasC(p_tv, c_tv, -1, pairwise_map) - : BestEffortReplay( - p_tv->domain()->domain(), - c_tv->domain()->domain(), - c2p_root_map); - - auto c2p_map = replay_PasC.getReplay(); - - // If we're creating parallel map, only map the leaf - // axes. Also, the producer axis must be left of the CA - // point. - // Otherwise, map the entire replay map. - if (mapping_mode_ == MappingMode::PARALLEL) { - // Mark axes left of compute at point for parallel type tracking - std::unordered_set producer_axes_to_map( - p_tv->domain()->domain().begin(), - p_tv->domain()->domain().begin() + p_tv->getComputeAtPosition()); - - for (auto c_id : c_tv->domain()->domain()) { - auto it = c2p_map.find(c_id); - if (it == c2p_map.end()) { - continue; - } - auto p_id = it->second; - if (producer_axes_to_map.find(p_id) == producer_axes_to_map.end()) { - continue; - } - mapIds(p_id, c_id); - } - } else { - for (auto entry : c2p_map) { - auto c_id = entry.first; - auto p_id = entry.second; - // Map the id's together - mapIds(p_id, c_id); - } + auto permissive_replay_PasC = + BestEffortReplay::replayPasC(p_tv, c_tv, -1, pairwise_map); + + const auto& permissive_c2p_map = permissive_replay_PasC.getReplay(); + + // For exact mapings do not map any broadcast dimensions to + // non-broadcast dimensions. Prevent any broadcasted axes being mapped + // to non-broadcasted axes. + auto exact_c2p_root_map = + PairwiseRootDomainMap(p_tv, c_tv, true) + .mapConsumerToProducer(c_tv->domain(), p_tv->domain()); + + // Same as permissive above but for exact + auto exact_replay_PasC = BestEffortReplay( + p_tv->domain()->domain(), + c_tv->domain()->domain(), + exact_c2p_root_map); + + const auto& exact_c2p_map = exact_replay_PasC.getReplay(); - // Make sure we always get root mapping for the loop map. Because of - // forwarding we could otherwise miss some root mappings. - if (mapping_mode_ == MappingMode::LOOP) { - for (auto entry : c2p_root_map) { - auto c_id = entry.first; - auto p_id = entry.second; - // Map the id's together - mapIds(p_id, c_id); - } + for (auto entry : exact_c2p_map) { + auto c_id = entry.first; + auto p_id = entry.second; + exact_nodes_.mapEntries(c_id, p_id); + consumers_.at(p_id).pushBack(c_id); + producers_.at(c_id).pushBack(p_id); + } + + for (auto entry : permissive_c2p_map) { + auto c_id = entry.first; + auto p_id = entry.second; + if (idIsAComputeAtLeafDomain(p_id, p_tv)) { + loop_nodes_.mapEntries(c_id, p_id); } + permissive_nodes_.mapEntries(c_id, p_id); + consumers_.at(p_id).pushBack(c_id); + producers_.at(c_id).pushBack(p_id); + } + + // Make sure we always get root mapping for the permissive map. Because + // of forwarding we could otherwise miss some root mappings. + for (auto entry : permissive_c2p_root_map) { + auto c_id = entry.first; + auto p_id = entry.second; + // Map the id's together + permissive_nodes_.mapEntries(c_id, p_id); + consumers_.at(p_id).pushBack(c_id); + producers_.at(c_id).pushBack(p_id); } } } } +} - // deduplicate iter domain entries in each set - for (const auto& iter_set : disjoint_iter_sets_) { - *iter_set = deduplicateDeque(*iter_set); +void IterDomainGraph::initializeId( + IterDomain* id, + bool is_view_rfactor_id, + bool is_leaf_id) { + permissive_nodes_.initializeSet(id); + exact_nodes_.initializeSet(id); + if (is_leaf_id) { + loop_nodes_.initializeSet(id); } + consumers_[id] = {}; + producers_[id] = {}; + sibling_sets_.initializeSet(id); - // For each IterDomain set we will track how many concrete root domains were - // used to generate the IterDomain. 
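The comment above describes the heuristic that both the removed map and the new computeConcreteId below rely on: within a disjoint set, the concrete ID is the candidate covering the most non-broadcast root domains, with the broadcast-root count only used to break ties. A minimal standalone sketch of just that selection rule; CandidateId and pickConcrete are hypothetical names for illustration, not part of this codebase:

```cpp
// Simplified stand-in for the concrete-ID selection rule: each candidate is
// summarized by how many iteration roots and broadcast roots it covers; the
// winner maximizes the iteration-root count, ties broken by broadcast count.
#include <cassert>
#include <vector>

struct CandidateId {
  int iter_root_count;   // non-broadcast root domains reachable from this ID
  int bcast_root_count;  // broadcast (or trivially reduced) root domains
};

// Returns the index of the candidate that should act as the concrete ID.
int pickConcrete(const std::vector<CandidateId>& candidates) {
  assert(!candidates.empty());
  int best = 0;
  for (int i = 1; i < (int)candidates.size(); ++i) {
    const auto& c = candidates[i];
    const auto& b = candidates[best];
    if (c.iter_root_count > b.iter_root_count ||
        (c.iter_root_count == b.iter_root_count &&
         c.bcast_root_count > b.bcast_root_count)) {
      best = i;
    }
  }
  return best;
}

int main() {
  // {i0*i1} covers two iteration roots and beats {i0*b1}, which covers one
  // iteration root plus one broadcast root.
  std::vector<CandidateId> candidates = {{1, 1}, {2, 0}};
  assert(pickConcrete(candidates) == 1);
  return 0;
}
```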
Used to populate conrete_id_map. Concrete - // ID has maximum of concrete ids, ties are decided based on n_broadcast_ids. - // Refer to AdvancedLowering5 for why we need to split ties with broadcast - // dims. - std::unordered_map n_concrete_ids_; - std::unordered_map n_broadcast_ids_; - - for (auto c_tv : consumer_tvs) { - auto counts = - InputDomainCounter::produceCounts(c_tv->domain()->domain(), gpu_lower); - std::transform( - counts.begin(), - counts.end(), - std::inserter(n_concrete_ids_, n_concrete_ids_.end()), - [](auto counts_entry) { - return std::make_pair(counts_entry.first, counts_entry.second.first); - }); - std::transform( - counts.begin(), - counts.end(), - std::inserter(n_broadcast_ids_, n_broadcast_ids_.end()), - [](auto counts_entry) { - return std::make_pair(counts_entry.first, counts_entry.second.second); - }); - } + all_ids_.pushBack(id); - for (auto inp_tv : ir_utils::filterByType(fusion->inputs())) { - auto counts = InputDomainCounter::produceCounts( - inp_tv->domain()->domain(), gpu_lower); - std::transform( - counts.begin(), - counts.end(), - std::inserter(n_concrete_ids_, n_concrete_ids_.end()), - [](auto counts_entry) { - return std::make_pair(counts_entry.first, counts_entry.second.first); - }); - std::transform( - counts.begin(), - counts.end(), - std::inserter(n_broadcast_ids_, n_broadcast_ids_.end()), - [](auto counts_entry) { - return std::make_pair(counts_entry.first, counts_entry.second.second); - }); + if (is_view_rfactor_id) { + view_rfactor_ids_.emplace(id); } +} - // Populate concrete id map - for (const auto& set : disjoint_iter_sets_) { - int max_concrete_count = -1; - int max_broadcast_count = -1; - IterDomain* concrete_id = nullptr; - for (auto id : *set) { - int concrete_count = n_concrete_ids_.at(id); - if (concrete_count >= max_concrete_count) { - int broadcast_count = n_broadcast_ids_.at(id); - if (concrete_count > max_concrete_count || - broadcast_count > max_broadcast_count) { - max_concrete_count = concrete_count; - max_broadcast_count = broadcast_count; - concrete_id = id; - } - } +ComputeAtMap::ComputeAtMap(Fusion* fusion) : id_graph_(fusion) { + build(fusion); +} + +void ComputeAtMap::build(Fusion* fusion) { + trivial_reduction_info_.build(fusion); + buildConcreteIds(); +} + +void ComputeAtMap::validateAndPropagatePType() { + for (const auto& loop_disjoint_set : id_graph_.loopNodes().disjointSets()) { + ParallelType common_ptype = ParallelType::Serial; + for (auto id : loop_disjoint_set->vector()) { + auto id_ptype = id->getParallelType(); + TORCH_INTERNAL_ASSERT( + id_ptype == common_ptype || id_ptype == ParallelType::Serial || + common_ptype == ParallelType::Serial, + "Issue validating parallel type disjoint ptype is, ", + common_ptype, + " but found in the set the id: ", + id->toString()); + common_ptype = + common_ptype == ParallelType::Serial ? 
id_ptype : common_ptype; } - TORCH_INTERNAL_ASSERT( - concrete_id != nullptr, "Could not concretize an IterDomain set."); - - for (auto id : *set) { - concrete_id_map_[id] = concrete_id; - if (mapping_mode_ == MappingMode::PARALLEL) { - auto parallel_map_it = parallel_type_map_.find(set); - // Parallelize all IterDomains to simplify lowering and codegen - if (parallel_map_it != parallel_type_map_.end()) { - // Don't propogate vectorize like other parallel types - if (parallel_map_it->second != ParallelType::Vectorize) { - id->parallelize(parallel_map_it->second); - } - } - } + for (auto id : loop_disjoint_set->vector()) { + id->parallelize(common_ptype); } } +} - if (gpu_lower != nullptr) { - convertToKir(fusion, gpu_lower); - } +bool ComputeAtMap::areMapped( + IterDomain* id0, + IterDomain* id1, + IdMappingMode mode) const { + return disjointSetOf(id0, mode)->has(id1); } -void ComputeAtMap::convertToKir(Fusion* fusion, GpuLower* gpu_lower) { - TORCH_INTERNAL_ASSERT(fusion != nullptr); - TORCH_INTERNAL_ASSERT(gpu_lower != nullptr); - - has_lowered_kir_ = true; - - std::unordered_map< - std::shared_ptr>, - std::shared_ptr>> - disjoint_set_2_kir; - - for (const auto& disjoint_iter_set : disjoint_iter_set_maps_) { - auto fusion_set = disjoint_iter_set.second; - auto kir_set_it = disjoint_set_2_kir.find(fusion_set); - std::shared_ptr> kir_set; - if (kir_set_it == disjoint_set_2_kir.end()) { - kir_set = std::make_shared>(); - std::transform( - fusion_set->begin(), - fusion_set->end(), - std::inserter(*kir_set, kir_set->begin()), - [&gpu_lower](IterDomain* id) { - return gpu_lower->lowerValue(id)->as(); - }); - disjoint_set_2_kir.emplace(std::make_pair(fusion_set, kir_set)); - } else { - kir_set = kir_set_it->second; - } - kir_disjoint_iter_set_maps_.emplace(std::make_pair( - gpu_lower->lowerValue(disjoint_iter_set.first)->as(), - kir_set)); +IterDomain* ComputeAtMap::computeConcreteId( + IterDomain* id, + IdMappingMode mode) { + const auto& disjoint_set_shared_ptr = disjointSetOf(id, mode); + + TORCH_INTERNAL_ASSERT( + disjoint_set_shared_ptr->vector().size(), + "Empty disjoint set found for ", + id->toString()); + + if (disjoint_set_shared_ptr->vector().size() == 1) { + return disjoint_set_shared_ptr->vector().front(); } - for (auto entry : concrete_id_map_) { - kir_concrete_id_map_.emplace(std::make_pair( - gpu_lower->lowerValue(entry.first)->as(), - gpu_lower->lowerValue(entry.second)->as())); + VectorOfUniqueEntries maybe_concrete_ids; + for (auto id : disjoint_set_shared_ptr->vector()) { + bool id_output = true; + for (auto consumer_id : id_graph_.consumers().at(id).vector()) { + if (disjoint_set_shared_ptr->has(consumer_id)) { + id_output = false; + break; + } + } + if (id_output) { + maybe_concrete_ids.pushBack(id); + } } - for (const auto& entry : disjoint_iter_set_maps_) { - kir_2_fusion_[gpu_lower->lowerValue(entry.first)->as()] = - entry.first; + TORCH_INTERNAL_ASSERT( + maybe_concrete_ids.vector().size(), + "No potential concrete_id's found for ", + id->toString()); + + if (maybe_concrete_ids.vector().size() == 1) { + return maybe_concrete_ids.vector().front(); } - // Make sure we have all IterDomains that could be used to generate a ForLoop - for (auto expr : fusion->exprs()) { - if (!expr->outputs()[0]->isA()) { - continue; - } + IterDomain* concrete_id = nullptr; + int max_iter_root_count = 0; + int max_bcast_root_count = 0; + + for (auto maybe_concrete_id : maybe_concrete_ids.vector()) { + std::unordered_set root_ids; + std::deque to_visit; + + 
to_visit.push_back(maybe_concrete_id); + while (to_visit.size()) { + auto current_id = to_visit.front(); + to_visit.pop_front(); + if (isViewRfactor(current_id)) { + root_ids.emplace(current_id); + continue; + } - auto tv_outputs = ir_utils::filterByType(expr->outputs()); + // push back producer IterDomains or add root if they don't exist + auto producer_vals = ir_utils::producerValsOf(current_id); + auto producer_ids = ir_utils::filterByType(producer_vals); - for (auto out : tv_outputs) { - for (auto entry : out->domain()->domain()) { - kir_2_fusion_[gpu_lower->lowerValue(entry)->as()] = - entry; + if (producer_ids.empty()) { + root_ids.emplace(current_id); + } else { + to_visit.insert( + to_visit.end(), producer_ids.begin(), producer_ids.end()); } } - } -} -bool ComputeAtMap::areMapped(IterDomain* id0, IterDomain* id1) const { - if (id0 == id1) { - return true; - } - auto set0_it = disjoint_iter_set_maps_.find(id0); - auto set1_it = disjoint_iter_set_maps_.find(id1); - if (set0_it == disjoint_iter_set_maps_.end() || - set1_it == disjoint_iter_set_maps_.end()) { - return false; - } - return (set0_it->second.get() == set1_it->second.get()); + int bcast_root_count = std::count_if( + root_ids.begin(), root_ids.end(), [&](IterDomain* root_id) { + return root_id->isBroadcast() + // TODO: This shouldn't have a negative impact, but (emperically) + // might not be necessary + || trivial_reduction_info_.isDerived(root_id); + }); + int iter_root_count = (int)root_ids.size() - bcast_root_count; + if (iter_root_count > max_iter_root_count || + (iter_root_count == max_iter_root_count && + bcast_root_count > max_bcast_root_count)) { + max_iter_root_count = iter_root_count; + max_bcast_root_count = bcast_root_count; + concrete_id = maybe_concrete_id; + } + } // end maybe_concrete_id + TORCH_INTERNAL_ASSERT( + concrete_id != nullptr, + "Something went wrong, could not find a concrete id."); + + return concrete_id; } -bool ComputeAtMap::areMapped(kir::IterDomain* id0, kir::IterDomain* id1) const { - assertLowered(has_lowered_kir_); - if (id0 == id1) { - return true; - } - auto set0_it = kir_disjoint_iter_set_maps_.find(id0); - auto set1_it = kir_disjoint_iter_set_maps_.find(id1); - if (set0_it == kir_disjoint_iter_set_maps_.end() || - set1_it == kir_disjoint_iter_set_maps_.end()) { - return false; +void ComputeAtMap::buildConcreteIds() { + for (const auto& disjoint_set_shared_ptr : + id_graph_.permissiveNodes().disjointSets()) { + TORCH_INTERNAL_ASSERT( + disjoint_set_shared_ptr->vector().size(), + "Cannot compute concrete id of empty set."); + auto first_id = disjoint_set_shared_ptr->vector().front(); + auto concrete_id = computeConcreteId(first_id, IdMappingMode::PERMISSIVE); + concrete_id_cache_[disjoint_set_shared_ptr] = concrete_id; } - return (set0_it->second.get() == set1_it->second.get()); -} -IterDomain* ComputeAtMap::getConcreteMappedID(IterDomain* id) const { - auto it = concrete_id_map_.find(id); - if (it != concrete_id_map_.end()) { - return it->second; + for (const auto& disjoint_set_shared_ptr : + id_graph_.exactNodes().disjointSets()) { + TORCH_INTERNAL_ASSERT( + disjoint_set_shared_ptr->vector().size(), + "Cannot compute concrete id of empty set."); + auto first_id = disjoint_set_shared_ptr->vector().front(); + auto concrete_id = computeConcreteId(first_id, IdMappingMode::EXACT); + concrete_id_cache_[disjoint_set_shared_ptr] = concrete_id; } - return id; -} -kir::IterDomain* ComputeAtMap::getConcreteMappedID(kir::IterDomain* id) const { - assertLowered(has_lowered_kir_); - auto it = 
kir_concrete_id_map_.find(id); - if (it != kir_concrete_id_map_.end()) { - return it->second; + for (const auto& disjoint_set_shared_ptr : + id_graph_.loopNodes().disjointSets()) { + TORCH_INTERNAL_ASSERT( + disjoint_set_shared_ptr->vector().size(), + "Cannot compute concrete id of empty set."); + auto first_id = disjoint_set_shared_ptr->vector().front(); + auto concrete_id = computeConcreteId(first_id, IdMappingMode::LOOP); + concrete_id_cache_[disjoint_set_shared_ptr] = concrete_id; } - return id; } -IterDomain* ComputeAtMap::toFusion(kir::IterDomain* kir) const { - assertLowered(has_lowered_kir_); - auto kir_2_fusion_it = kir_2_fusion_.find(kir); +IterDomain* ComputeAtMap::getConcreteMappedID( + IterDomain* id, + IdMappingMode mode) const { + auto disjoint_set_shared_ptr = disjointSetOf(id, mode); + TORCH_INTERNAL_ASSERT( - kir_2_fusion_it != kir_2_fusion_.end(), - "Kernel ir is not guarneteed to be reversible into fusion ir, could not find fusion entry. ", - kir::toString(kir, false)); - return kir_2_fusion_it->second; -} + disjoint_set_shared_ptr->vector().size() > 0, + "Empty disjoint set found for ", + id->toString()); -std::string ComputeAtMap::toString() const { - std::stringstream ss; + auto cache_it = concrete_id_cache_.find(disjoint_set_shared_ptr); - // We may not have cleaned up non active sets as this is intended for debug, - // so first grab unique entries and iterate over them. - std::unordered_set>> disjoint_sets; + TORCH_INTERNAL_ASSERT( + cache_it != concrete_id_cache_.end(), + "Could not find concrete id for: ", + id->toString(), + " with mode ", + mode); - for (const auto& entry : disjoint_iter_set_maps_) { - disjoint_sets.emplace(entry.second); - } + return cache_it->second; +} - for (const auto& disjoint_set : disjoint_sets) { - ss << " disjoint_set{ "; - TORCH_INTERNAL_ASSERT(disjoint_set->size() > 0); - auto concrete_id = concrete_id_map_.at(disjoint_set->front()); - for (auto it = disjoint_set->begin(); it != disjoint_set->end(); it++) { - if (it != disjoint_set->begin()) { - ss << ", "; - } - ss << (*it); - if (*it == concrete_id) { +namespace { + +std::string idGraphNodesToString( + const ComputeAtMap& ca_map, + IdMappingMode mode) { + std::stringstream ss; + const auto& disjoint_sets = ca_map.getIdSets(mode); + for (const auto& s_ptr : disjoint_sets.disjointSets()) { + const auto& set = *s_ptr; + IterDomain* concrete_id = nullptr; + if (!set.empty()) { + auto id = set.front(); + concrete_id = ca_map.getConcreteMappedID(id, mode); + } + ss << " {"; + for (auto entry : set.vector()) { + ss << abstractToString(entry); + if (entry == concrete_id) { ss << "*"; } - } - ss << " }"; - if (mapping_mode_ == MappingMode::PARALLEL) { - if (parallel_type_map_.find(disjoint_set) != parallel_type_map_.end()) { - ss << " -> " << parallel_type_map_.at(disjoint_set); - } else { - ss << " -> " << ParallelType::Serial; + if (entry != set.back()) { + ss << "; "; } } - ss << "\n"; + ss << " }\n"; } return ss.str(); } +} // namespace + +std::string ComputeAtMap::toString() const { + std::stringstream ss; + ss << "Compute at map { \n"; + ss << "Permissive map:\n" + << idGraphNodesToString(*this, IdMappingMode::PERMISSIVE); + ss << "Exact map:\n" << idGraphNodesToString(*this, IdMappingMode::EXACT); + ss << "Loop map:\n" << idGraphNodesToString(*this, IdMappingMode::LOOP); + ss << "Consumer maps:\n"; + for (auto entry : id_graph_.consumers()) { + ss << " " << entry.first->toString() << " :: " << entry.second.toString() + << "\n"; + } + + ss << "Producer maps:\n"; + for (auto entry : 
id_graph_.producers()) { + ss << " " << entry.first->toString() << " :: " << entry.second.toString() + << "\n"; + } + + ss << "Sibling map:\n" << id_graph_.siblings().toString() << "\n"; + + ss << "} compute at map" << std::endl; + return ss.str(); +} + +bool ComputeAtMap::isViewRfactor(IterDomain* ref_id) const { + return id_graph_.viewRfactorIds().find(ref_id) != + id_graph_.viewRfactorIds().end(); +} + +std::vector ComputeAtMap::getViewRfactorDomainsOfIdGroup( + IterDomain* ref_id, + IdMappingMode mode) const { + auto disjoint_set = disjointSetOf(ref_id, mode); + std::vector rfactor_ids; + for (auto disjoint_id : disjoint_set->vector()) { + if (id_graph_.viewRfactorIds().find(disjoint_id) != + id_graph_.viewRfactorIds().end()) { + rfactor_ids.push_back(disjoint_id); + } + } + return rfactor_ids; +} + +const std::shared_ptr>& ComputeAtMap:: + disjointSetOf(IterDomain* id, IdMappingMode mode) const { + return getIdSets(mode).disjointSetMap().at(id); +} + +const DisjointSets& ComputeAtMap::getIdSets( + IdMappingMode mode) const { + switch (mode) { + case IdMappingMode::PERMISSIVE: + return id_graph_.permissiveNodes(); + case IdMappingMode::EXACT: + return id_graph_.exactNodes(); + case IdMappingMode::LOOP: + return id_graph_.loopNodes(); + } + TORCH_INTERNAL_ASSERT(false, "Error with mapping mode provided."); +} + } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/compute_at_map.h b/torch/csrc/jit/codegen/cuda/compute_at_map.h index b2b70f8997d4..54bb7537a3f1 100644 --- a/torch/csrc/jit/codegen/cuda/compute_at_map.h +++ b/torch/csrc/jit/codegen/cuda/compute_at_map.h @@ -1,7 +1,9 @@ #pragma once +#include #include #include +#include #include #include @@ -11,126 +13,171 @@ namespace jit { namespace fuser { namespace cuda { -class GpuLower; - -class TORCH_CUDA_CU_API ComputeAtMap { +// There's three modes of these iter domain mappings all uniquely important in +// the lowering process. +// +// For EXACT/PERMISSIVE mode consider: +// +// consumer[i0, b1] = producer[i0] +// consumer->merge(0) (consumer will now be [i0 * b1]) +// When producer is replayed as consumer (the direction we use for mapping) +// with BestEffortReplay forward_bcast_mismatch = True the producer to +// consumer map will have both a mapping of consumer(i0) to producer(i0) as +// well as consumer(i0*b1) to producer(i0). This latter mapping is important +// for loop nest mappings as the consumer will generate a loop based on i0*b1 +// and the producer may be computeAt inside this loop nest. However, for +// indexing we do not want these two maps as producer may be indexed as i0*i1 +// depending on the loop nest structure and how it was built. Therefore we +// really need to carry (at least) two sets of maps around for lowering. +// +// LOOP mode is important if we have something like: +// consumer[i0o, threadIdx.x{i0i}] = producer[i0o, threadIdx.y{i0i}](computeAt +// = 1) which can easily happen when using shared memory. We want to make sure +// that the iteration domain used for loop construction (concreteId) has the +// proper parallelization strategy. In parallel mode we do typical iteration +// domain mapping, however we remove from it any iteration domains outside the +// computeAt of producer when mapping. This guarentees we won't map +// IterDomains that could have different parallelization strategies. We also +// propagate the parallel strategy in parallel mode so all mapped IDs that +// must have the same parallel type, do. 
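A sketch of how the consumer[i0, b1] = producer[i0] example above could be exercised against the new ComputeAtMap interface. FusionGuard, makeSymbolicTensor, and broadcast are the usual nvFuser test helpers and are assumed here rather than taken from this diff; the expected results in the comments follow the EXACT/PERMISSIVE description above rather than a verified run:

```cpp
// Sketch only: assumes the usual nvFuser test helpers and the
// torch::jit::fuser::cuda namespace; results are illustrative.
Fusion fusion;
FusionGuard fg(&fusion);

auto producer = makeSymbolicTensor(1);               // producer[i0]
fusion.addInput(producer);
auto consumer = broadcast(producer, {false, true});  // consumer[i0, b1]
fusion.addOutput(consumer);

consumer->merge(0);  // consumer is now [i0 * b1]

ComputeAtMap ca_map(&fusion);

auto* p_i0 = producer->getRootDomain()[0];
auto* c_merged = consumer->axis(0);

// Broadcasts are forwarded in the permissive map, so consumer(i0*b1) maps to
// producer(i0) there, but never in the exact map, which refuses to map
// broadcast to non-broadcast dimensions.
bool permissive =
    ca_map.areMapped(c_merged, p_i0, IdMappingMode::PERMISSIVE);  // true
bool exact =
    ca_map.areMapped(c_merged, p_i0, IdMappingMode::EXACT);       // false
```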
+// +// IdMappingMode::LOOP +// Only maps leaf axes to left of compute at +// Forward broadcast axes in replay +// IdMappingMode::PERMISSIVE +// Forward broadcast axes in replay +// Map all iteration domains +// Always contain root mappings (otherwise they could have been forwarded in +// broadcast) +// IdMappingMode::EXACT +// Don't map any broadcast axes to non-broadcast axes +// Do not forward through any broadcast IDs +class TORCH_CUDA_CU_API IterDomainGraph { public: - // There's three modes of these iter domain mappings. For indexing, for loop - // nest mapping/generation, and to figure out the parallelization strategy. - // - // For index/loop mode consider: - // - // consumer[i0, b1] = producer[i0] - // consumer->merge(0) (consumer will now be [i0 * b1]) - // When producer is replayed as consumer (the direction we use for mapping) - // with BestEffortReplay forward_bcast_mismatch = True the producer to - // consumer map will have both a mapping of consumer(i0) to producer(i0) as - // well as consumer(i0*b1) to producer(i0). This latter mapping is important - // for loop nest mappings as the consumer will generate a loop based on i0*b1 - // and the producer may be computeAt inside this loop nest. However, for - // indexing we do not want these two maps as producer may be indexed as i0*i1 - // depending on the loop nest structure and how it was built. Therefore we - // really need to carry two sets of maps around for lowering. - // - // Parallel mode is important if we have something like: - // consumer[i0o, threadIdx.x{i0i}] = producer[i0o, threadIdx.y{i0i}](computeAt - // = 1) which can easily happen when using shared memory. We want to make sure - // that the iteration domain used for loop construction (concreteId) has the - // proper parallelization strategy. In parallel mode we do typical iteration - // domain mapping, however we remove from it any iteration domains outside the - // computeAt of producer when mapping. This guarentees we won't map - // IterDomains that could have different parallelization strategies. We also - // propagate the parallel strategy in parallel mode so all mapped IDs that - // must have the same parallel type, do. - // - // MappingMode::PARALLEL - // Only maps leaf axes to left of compute at - // Forward broadcast axes in replay - // MappingMode::LOOP - // Forward broadcast axes in replay - // Map all iteration domains - // Always contain root mappings (otherwise they could have been forwarded in - // broadcast) - // MappingMode::INDEX - // Don't map any broadcast axes to non-broadcast axes - // Do not forward through any broadcast IDs - enum class MappingMode { PARALLEL, LOOP, INDEX }; - - ComputeAtMap() = default; - ComputeAtMap(MappingMode mapping_mode) : mapping_mode_(mapping_mode) {} - - //! Builds all valid mappings. When gpu_lower is not nullptr, - //! equivalent mappings for KIR are also created. - void build(Fusion* fusion, GpuLower* gpu_lower = nullptr); - - //! Returns if id0 and id1 are mapped to eachother, meaning they represent the - //! 
same loop nest in the lowered code - bool areMapped(IterDomain* id0, IterDomain* id1) const; - - bool areMapped(kir::IterDomain* id0, kir::IterDomain* id1) const; + IterDomainGraph(Fusion* fusion); + + const DisjointSets& permissiveNodes() const { + return permissive_nodes_; + } + const DisjointSets& exactNodes() const { + return exact_nodes_; + } + const DisjointSets& loopNodes() const { + return loop_nodes_; + } + + // Consumers and producers is not symmetric like the other sets + const std::unordered_map>& + consumers() const { + return consumers_; + } + const std::unordered_map>& + producers() const { + return producers_; + } + + const DisjointSets& siblings() const { + return sibling_sets_; + } + + const VectorOfUniqueEntries& allIds() const { + return all_ids_; + } + + const std::unordered_set& viewRfactorIds() const { + return view_rfactor_ids_; + } - //! Returns an iter domain that is the maximum expanded size of all iter - //! domains the one provided maps to. Useful for opening loops to the correct - //! iteration size. Not guarenteed to return the same ID every call, but is - //! guarenteed to return iter domains in the same disjoint set. - IterDomain* getConcreteMappedID(IterDomain* id) const; + private: + void build(Fusion* fusion); - kir::IterDomain* getConcreteMappedID(kir::IterDomain* id) const; + void initializeId(IterDomain* id, bool is_view_rfactor_id, bool is_leaf_id); - // TODO: Would be great if we didn't need this, but we have nice functionality - // in iter_visitor that isn't moved over. Use of this is limited to indexing - // and this should definitely be removed by building out kernel ir to have - // better parity with fusion ir. - IterDomain* toFusion(kir::IterDomain* kir) const; + DisjointSets permissive_nodes_; + DisjointSets exact_nodes_; + DisjointSets loop_nodes_; - // Prints mapping information via Fusion IR - std::string toString() const; + // Consumers and producers is not symmetric like the other sets + std::unordered_map> + consumers_; + std::unordered_map> + producers_; - private: - bool has_lowered_kir_ = false; + DisjointSets sibling_sets_; - void mapIds(IterDomain* id0, IterDomain* id1); + VectorOfUniqueEntries all_ids_; - //! Convert everything to lowered structures (kernel ir), as we will use - //! this class frequently during lowering. - void convertToKir(Fusion* fusion, GpuLower* gpu_lower); + std::unordered_set view_rfactor_ids_; +}; - private: - MappingMode mapping_mode_ = MappingMode::LOOP; +class TrivialReductionInfo; - // This is actually only used when mapping mode == LOOP. Only used in expr - // sorting, it's actually maximum position where a loop is shared across any - // neighbor. - std::unordered_map produce_at_map_; +class TORCH_CUDA_CU_API ComputeAtMap { + public: + ComputeAtMap() = delete; + ComputeAtMap(Fusion* fusion); - // Disjoint sets of iter domains, only defined if iter domain is within - // compute at of a tensor view. Maps these iter domains to a set containing - // all other iter domains in the fusion that map to the same loop nest. - std::unordered_map>> - disjoint_iter_set_maps_; + //! Run through disjoint sets in the LOOP map, make sure there's only one + //! non-serial parallel type in each disjoint set, set the parallel type of + //! all IterDomains in the disjoint set to that PType. + void validateAndPropagatePType(); - std::unordered_map< - kir::IterDomain*, - std::shared_ptr>> - kir_disjoint_iter_set_maps_; + //! 
Returns if id0 and id1 are mapped to eachother with provided IdMappingMode + bool areMapped(IterDomain* id0, IterDomain* id1, IdMappingMode mode) const; - // Keep a list of disjoint_iter_sets that's deterministic to iterate over - std::deque>> disjoint_iter_sets_; + //! Returns an iter domain that is the maximum expanded size of all iter + //! domains the one provided maps to. Useful for opening loops to the correct + //! iteration size. Not guarenteed to return the same ID every call, but is + //! guarenteed to return iter domains in the same disjoint set. + IterDomain* getConcreteMappedID(IterDomain* id, IdMappingMode mode) const; - // Tracks if there's a parallel iter domain associated a disjoint iter domain - // set - std::unordered_map>, ParallelType> - parallel_type_map_; + // Prints mapping information, forwards to an internal IterDomainGraph + std::string toString() const; + + // Returns if the provided ID is a view like rfactor id + bool isViewRfactor(IterDomain* ref_id) const; + + // Returns all rfactor domains in rfactor_concrete_count_reset_domains_ that + // are in the disjoint set of the provided IterDomain. This will be every view + // like rfactor ID the provided ID "depends" on in the map. + std::vector getViewRfactorDomainsOfIdGroup( + IterDomain* ref_id, + IdMappingMode mode) const; - // For each IterDomain set we will track how many concrete root domains were - // used to generate the IterDomain - std::unordered_map concrete_id_map_; + const IterDomainGraph& idGraph() const { + return id_graph_; + } - std::unordered_map kir_concrete_id_map_; + //! Get the ID sets for a provided IdMappingMode + const DisjointSets& getIdSets(IdMappingMode mode) const; - // Map kir::IterDomain* back to the fusion IR IterDomain*. - // TODO: Would be great if we didn't need this. - std::unordered_map kir_2_fusion_; + private: + // Build id_graph_ + void build(Fusion* fusion); + + // Build concrete_id_cache_ + // Build a single entry in concrete_cache_id_ + IterDomain* computeConcreteId(IterDomain* id, IdMappingMode mode); + void buildConcreteIds(); + + // Produce the disjoint set containing provided id with mapping mode. + const std::shared_ptr>& disjointSetOf( + IterDomain* id, + IdMappingMode mode) const; + + // Should be built once and never modified again. + const IterDomainGraph id_graph_; + TrivialReductionInfo trivial_reduction_info_; + + // Prevent needing to recompute concrete_id's in compute at map. + // VectorOfUniqueEntries is unique across mapping modes, so don't need to use + // mapping mode directly in this cache. const + // VectorOfUniqueEntries& is what's returned by + // ComputeAtMap::disjointSetOf which can be used directly. 
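A small standalone illustration of why the cache described above does not need the mapping mode in its key: an unordered_map over a shared_ptr key hashes and compares the pointer itself, so sets owned by different maps can never collide even if their contents happen to be equal. IdSet is a hypothetical stand-in for the VectorOfUniqueEntries used in the real cache:

```cpp
// Demonstrates that std::shared_ptr keys compare by pointer identity, not by
// the pointee's contents, which is what makes the per-set cache mode-agnostic.
#include <cassert>
#include <memory>
#include <unordered_map>
#include <vector>

int main() {
  using IdSet = std::vector<int>;  // stand-in for a disjoint set of IDs
  auto exact_set = std::make_shared<IdSet>(IdSet{1, 2});
  auto loop_set = std::make_shared<IdSet>(IdSet{1, 2});  // same contents

  std::unordered_map<std::shared_ptr<IdSet>, int> concrete_cache;
  concrete_cache[exact_set] = 10;
  concrete_cache[loop_set] = 20;

  // Two distinct entries: keys hash and compare by pointer identity.
  assert(concrete_cache.size() == 2);
  assert(concrete_cache.at(exact_set) == 10);
  return 0;
}
```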
+ std::unordered_map< + std::shared_ptr>, + IterDomain*> + concrete_id_cache_; }; } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/contiguity.cpp b/torch/csrc/jit/codegen/cuda/contiguity.cpp new file mode 100644 index 000000000000..dbcc160bb8c6 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/contiguity.cpp @@ -0,0 +1,207 @@ +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +ContigIDs::ContigIDs( + const std::vector& ids, + const std::vector& root_domain, + const std::vector& root_contiguity, + std::unordered_map concrete_to_ref, + std::unordered_map p2c_id_map, + bool ignore_halo_constraint, + bool ignore_indexability) + : root_domain_(root_domain), + root_contiguity_(root_contiguity), + concrete_to_ref_(std::move(concrete_to_ref)), + p2c_id_map_(std::move(p2c_id_map)), + ignore_indexability_(ignore_indexability) { + if (ids.empty()) { + return; + } + + TORCH_INTERNAL_ASSERT( + root_domain_.size() == root_contiguity_.size(), + "Arguments don't match ", + root_domain_.size(), + " != ", + root_contiguity_.size()); + + // GpuLower is required to honor halo constraints + if (!ignore_halo_constraint) { + TORCH_INTERNAL_ASSERT(GpuLower::hasCurrent(), "GpuLower not found"); + } + + for (const auto i : c10::irange(root_domain_.size())) { + auto root_domain_i = root_domain_[i]->as(); + root_to_indexed_id_[root_domain_i] = root_domain_i; + // Initialize to false + is_contig_root_[root_domain_i] = false; + // If a root domain has halo, can't use merged domain even if + // both inputs are contiguous. HaloInfo is also initialized for + // rfactor root domains, which should just return "zero" + // RootAxisInfo. This should be safe as no rfactor tensor should + // need halo. + if (root_contiguity_[i] && + (ignore_halo_constraint || + !GpuLower::current() + ->haloInfo() + .getRootAxisInfo(root_domain_i) + .hasHalo())) { + contig_ids_.emplace(root_domain_i); + is_contig_root_[root_domain_i] = true; + within_contig_ids_[root_domain_i] = std::unordered_set(); + } + } + + if (!contig_ids_.empty()) { + auto exprs = StmtSort::getExprs(ids[0]->fusion(), {ids.begin(), ids.end()}); + for (auto expr : exprs) { + handle(expr); + } + } +} + +void ContigIDs::handle(Merge* merge) { + // If either input is non-contiguous so is output. + const auto inner = merge->inner(); + const auto outer = merge->outer(); + const auto out = merge->out(); + + if (!isContig(inner) || !isContig(outer)) { + return; + } + + // Stop contig merging if the merge output is not indexable. + if (!ignore_indexability_ && !isIndexable(out)) { + return; + } + + // Grab inputs, make sure they're in root domain, check if they're + // contiguous. 
+ + auto lhs_inputs = + ir_utils::iterDomainInputsOfOrderedAs({outer}, root_domain_); + auto rhs_inputs = + ir_utils::iterDomainInputsOfOrderedAs({inner}, root_domain_); + + TORCH_INTERNAL_ASSERT( + inRoot(lhs_inputs) && inRoot(rhs_inputs), + "Found an invalid merge operation, inputs of its arguments are not in the root domain."); + + std::deque ordered_inputs(lhs_inputs.begin(), lhs_inputs.end()); + ordered_inputs.insert( + ordered_inputs.end(), rhs_inputs.begin(), rhs_inputs.end()); + + // If any root input is not contig, output is not contig + if (!(std::all_of( + ordered_inputs.begin(), ordered_inputs.end(), [this](IterDomain* id) { + // Allow reduction tensors in contiguity check since we're using + // this to check contiguous vectors of reference tensors in + // schedulers (to set vectorization sizes), those reference tensors + // may have reduction dims, don't bail on contiguity just because + // it's a reduction dimension. + return is_contig_root_.at(id); + }))) { + return; + } + + std::deque root_copy(root_domain_.begin(), root_domain_.end()); + + // Forward to first matching argument + while (!root_copy.empty() && !ordered_inputs.empty()) { + if (root_copy.front() != ordered_inputs.front()) { + root_copy.pop_front(); + } else { + break; + } + } + + // Forward through all matching arguments + while (!root_copy.empty() && !ordered_inputs.empty()) { + if (root_copy.front() == ordered_inputs.front()) { + root_copy.pop_front(); + ordered_inputs.pop_front(); + } else if ( + root_copy.front()->isReduction() || root_copy.front()->isBroadcast()) { + // This was a cause of an error with + // ReductionSchedulerMultiDimNonFastest. The test no longer + // fails. + root_copy.pop_front(); + } else { + break; + } + } + + // If we matched all inputs, the output is contiguous. Only want to keep the + // top contig ID, lower ids should be placed in the "within_contig_ids" map + // of top id. 
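A minimal standalone sketch of the contiguity rule this handler implements: a merge output is indexable as one flat domain only when its root inputs are marked contiguous and are adjacent in the root domain, with broadcast or reduction axes allowed as ignorable gaps between them. RootAxis and mergeIsContiguous are hypothetical illustration-only names; the real ContigIDs additionally tracks merge history, halo, and indexability:

```cpp
// Simplified contiguous-merge check over a flat description of the root
// domain; not the ContigIDs class itself.
#include <cassert>
#include <vector>

struct RootAxis {
  bool contiguous;
  bool broadcast_or_reduction;  // gap axes that don't affect the layout
};

bool mergeIsContiguous(
    const std::vector<RootAxis>& root,
    int outer,
    int inner) {
  // Both merge inputs must themselves be contiguous root axes.
  if (!root[outer].contiguous || !root[inner].contiguous) {
    return false;
  }
  // The outer input must be to the left of the inner input, and every axis
  // between them must be a broadcast or reduction axis.
  if (outer >= inner) {
    return false;
  }
  for (int i = outer + 1; i < inner; ++i) {
    if (!root[i].broadcast_or_reduction) {
      return false;
    }
  }
  return true;
}

int main() {
  // [i0, b1, i2] with i0 and i2 contiguous: merging i0 with i2 is still
  // treated as contiguous because the broadcast b1 is an ignorable gap.
  std::vector<RootAxis> root = {{true, false}, {false, true}, {true, false}};
  assert(mergeIsContiguous(root, 0, 2));
  // A non-contiguous input breaks the chain.
  root[2].contiguous = false;
  assert(!mergeIsContiguous(root, 0, 2));
  return 0;
}
```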
+ if (ordered_inputs.empty()) { + if (contig_ids_.find(inner) != contig_ids_.end()) { + contig_ids_.erase(inner); + } + + if (contig_ids_.find(outer) != contig_ids_.end()) { + contig_ids_.erase(outer); + } + + contig_ids_.emplace(out); + + std::unordered_set within_out; + within_out.emplace(inner); + if (within_contig_ids_.find(inner) != within_contig_ids_.end()) { + auto in_inner = within_contig_ids_.at(inner); + within_out.insert(in_inner.begin(), in_inner.end()); + within_contig_ids_.erase(inner); + } + + within_out.emplace(outer); + if (within_contig_ids_.find(outer) != within_contig_ids_.end()) { + auto in_outer = within_contig_ids_.at(outer); + within_out.insert(in_outer.begin(), in_outer.end()); + within_contig_ids_.erase(outer); + } + + within_contig_ids_[out] = within_out; + + for (auto root : lhs_inputs) { + root_to_indexed_id_[root] = out; + } + for (auto root : rhs_inputs) { + root_to_indexed_id_[root] = out; + } + } +} + +IterDomain* ContigIDs::getMappedId(IterDomain* id) const { + auto it = p2c_id_map_.find(id); + if (it != p2c_id_map_.end()) { + return it->second; + } else { + return id; + } +} + +IterDomain* ContigIDs::getCAIndexConcreteId(IterDomain* id) const { + TORCH_INTERNAL_ASSERT( + GpuLower::current() != nullptr, "GpuLower is not found"); + + auto c_id = GpuLower::current()->caMap()->getConcreteMappedID( + getMappedId(id), IdMappingMode::EXACT); + return c_id; +} + +bool ContigIDs::isIndexable(IterDomain* id) const { + auto c_id = getCAIndexConcreteId(id); + return concrete_to_ref_.find(c_id) != concrete_to_ref_.end(); +} + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/contiguity.h b/torch/csrc/jit/codegen/cuda/contiguity.h new file mode 100644 index 000000000000..24f0ffa6c7e5 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/contiguity.h @@ -0,0 +1,130 @@ +#pragma once + +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +// A merge is contiguous if: +// Inputs of outer are to the left in the root domain of the inputs of RHS. +// All inputs are contiguous in the root domain: +// - All marked as contiguous +// - Only gaps between inputs are broadcast or reductoin dims +// There are no split transformations performed on outer or inner +// All transformations on outer or inner are contiguous merges +// If this criteria holds, then we can index the input root domains of this +// merge with the indexing provided to the output of the merge in the backward +// index pass + +class ContigIDs : public OptInDispatch { + public: + ContigIDs() = delete; + + //! Check through the history of ids whose inputs map to root_domain with + //! contiguity root_contiguity. Return unordered_set of all merges that are + //! contiguous. Ignore root order is primarily used for predicate generation. + //! In this case we can linearize indexing of any ID that only consists of + //! merge operations. + //! + //! Mapping information from CA Index concrete to reference domains + //! is used to find if merged output domains can be indexed. If there's + //! no mapping to a reference domain, there's no corresponding + //! index, so it isn't marked as conting merge. + //! + //! p2c_id_map can be used when replayed producer domains are + //! analyzed, in which case producer-to-consumer maps should be + //! passed. + //! + //! If ignore_indexability and ignore_halo_constraint are true, + //! ignore the constraint on indexing and halo, respectively. It is + //! 
the caller that is responsible for its correctness. + //! + //! The function interface with many parameters looks ugly, but it + //! is also important to make ignore_indexability and + //! ignore_halo_constraint explicit to avoid any surprise. + //! + //! Not really sure why but clang-tidy only complains about + //! std::unordered_map if passed as a const reference. + ContigIDs( + const std::vector& ids, + const std::vector& root_domain, + const std::vector& root_contiguity, + std::unordered_map concrete_to_ref, + std::unordered_map p2c_id_map = {}, + bool ignore_indexability = false, + bool ignore_halo_constraint = false); + + const std::unordered_set& contigIDs() const { + return contig_ids_; + } + + const std::unordered_map>& + withinContigIDs() const { + return within_contig_ids_; + } + + const std::unordered_map& rootToIndexedID() const { + return root_to_indexed_id_; + } + + private: + using OptInDispatch::handle; + + bool inRoot(const std::vector& ids) { + return std::all_of(ids.begin(), ids.end(), [this](IterDomain* id) { + return is_contig_root_.find(id) != is_contig_root_.end(); + }); + } + + bool isContig(IterDomain* id) { + return contig_ids_.find(id) != contig_ids_.end(); + } + + // Split outputs are not contiguous, don't need to do anything. + void handle(Split*) override {} + + void handle(Merge* merge) override; + + IterDomain* getCAIndexConcreteId(IterDomain* id) const; + + //! True if an ID is indexable. + //! E.g., a merged domain with broadcast may not be indexable when + //! its corresponding reference tensor has non-broadcast domains. + bool isIndexable(IterDomain* id) const; + + //! Return an ID mapped with id_map_ or itself + IterDomain* getMappedId(IterDomain* id) const; + + private: + //! Root domains to analyze contiguity + const std::vector& root_domain_; + //! Contiguity of root_domain_ + const std::vector& root_contiguity_; + //! Mapping of concrete to reference domains. If a concrete domain + //! is not mapped, it is not indexable as there's no mapped index. + const std::unordered_map concrete_to_ref_; + //! Producer-to-consumer index map in the case of analyzing replayed + //! producer tensors + const std::unordered_map p2c_id_map_; + const bool ignore_indexability_ = false; + + //! Mapping of root domain to bool indicating contiguity + std::unordered_map is_contig_root_; + // Mark if ids are result of contigous merges + std::unordered_set contig_ids_; + // Given contiguous domain, return all iter domains within its history. + std::unordered_map> + within_contig_ids_; + //! Mapping of root domain to the actual indexed domain, which can + //! be itself or a contig merged domain if found. + std::unordered_map root_to_indexed_id_; +}; + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/disjoint_set.h b/torch/csrc/jit/codegen/cuda/disjoint_set.h index 99647a05496f..2b4dea404d74 100644 --- a/torch/csrc/jit/codegen/cuda/disjoint_set.h +++ b/torch/csrc/jit/codegen/cuda/disjoint_set.h @@ -3,169 +3,278 @@ #include #include +#include #include #include #include +// For printing of the set when using a Statement as the type for the set +#include + namespace torch { namespace jit { namespace fuser { namespace cuda { -//! Container class DisjointSet models equivalence relationships -//! -//! Each instance of this class keeps a set of equivalent classes -//! DisjointSet::join(a,b) makes the full class of a and b equivalent -//! 
DisjointSet::areEqual(a,b) checks if a and b belong same class +namespace { + +template +std::string abstractToString(T* ptr) { + return ptr->toString(); +} + +template +std::string abstractToString(T ref) { + return ref.toString(); +} + +} // namespace + +// Vector like class that will prevent adding duplicate entries by also +// maintaing a set template > -class DisjointSet { +class VectorOfUniqueEntries { public: - DisjointSet() = default; - - //! Joins the equivalent class that a and b belong to - //! areEqual(a',b') will be true for each a'=a and b'=b - //! - //! \param a An element from a equivalent class - //! will create a new equivalent class if a does - //! not belong to any - //! \param b An element from another equivalent class - //! will create a new equivalent class if b does - //! not belong to any - void join(T a, T b) { - // cases where either of the quiv class doesn't exist - if (!entry_map.count(a) && !entry_map.count(b)) { - createPoint(a); - entry_map[b] = fixedPoint(a); - } else if (!entry_map.count(a)) { - entry_map[a] = fixedPoint(b); - } else if (!entry_map.count(b)) { - entry_map[b] = fixedPoint(a); - } else { - // case where both equiv classes exist and need to join - const int i0 = fixedPoint(a); - const int i1 = fixedPoint(b); - int new_parent = 0; - int new_child = 0; - - // Either order here is correct but joining larger class to smaller class - // tend to be faster - std::tie(new_parent, new_child) = (weights[i0] < weights[i1]) - ? std::make_pair(i0, i1) - : std::make_pair(i1, i0); - weights[new_parent] += weights[new_child]; - set_map[new_child] = new_parent; + VectorOfUniqueEntries() = default; + + VectorOfUniqueEntries(const std::initializer_list& x) + : vector_(x), set_(x) {} + + // Returns if a node was actually added + bool pushBack(T entry) { + if (set_.emplace(entry).second) { + vector_.push_back(entry); + return true; } + return false; } - //! Checks if a and b belong to the same equivalent class - //! - //! \param a An element from a equivalent class - //! \param b An element from another equivalent class - //! \returns Boolean value representing if a and b are - //! recorded to be in the same equivalent class - //! will return false if any of a or b doesn't - //! have an equivalent class recorded - bool areEquivalent(T a, T b) const { - if (!entry_map.count(a) || !entry_map.count(b)) { - return false; + // Returns if any node was added + bool pushBack(const VectorOfUniqueEntries& other) { + bool any_added = false; + for (auto entry : other) { + any_added = any_added | pushBack(entry); } - return fixedPoint(a) == fixedPoint(b); + return any_added; } - //! Queries if an element exists in this set - bool contains(T a) const { - return entry_map.count(a) > 0; + // Returns a const vector useful for iterating on + const std::vector& vector() const { + return vector_; } - //! Returns all elements added to this set - std::vector getAllElements() const { - std::vector elms(entry_map.size()); - std::transform( - entry_map.begin(), - entry_map.end(), - elms.begin(), - [](const auto& entry_map_kv) { return entry_map_kv.first; }); - return elms; + // Returns first element in vector + T front() const { + return vector_.front(); } - //! Clears the equivalence relationships - void clear() { - set_map.clear(); - weights.clear(); - entry_map.clear(); - next_index_ = 0; - } - - //! 
Dumps the equivalent relationships - std::ostream& print(std::ostream& os) const { - std::unordered_map> fixedPointMap; - for (const auto& kv : entry_map) { - int fixed_point = fixedPoint(kv.first); - auto it = fixedPointMap.find(fixed_point); - if (it == fixedPointMap.end()) { - it = fixedPointMap.insert({fixed_point, {}}).first; - } - it->second.insert(kv.first); - } - os << "{\n"; - for (const auto& kv : fixedPointMap) { - os << "\t{ "; - for (const auto& val : kv.second) { - os << toString(val) << " "; + // Returns last element in vector + T back() const { + return vector_.back(); + } + + // Remove and returns the last element in vector + T popBack() { + T v = vector_.back(); + set_.erase(v); + vector_.pop_back(); + return v; + } + + // Returns if this container is empty + bool empty() const { + return vector_.empty(); + } + + // Returns if entry is in this vector + bool has(T entry) const { + return set_.find(entry) != set_.end(); + } + + std::string toString() { + std::stringstream ss; + ss << "{ "; + for (auto entry : vector()) { + ss << abstractToString(entry); + if (entry != vector().back()) { + ss << "; "; } - os << "}\n"; } - os << "}\n"; - return os; + ss << " }"; + return ss.str(); } private: - // Internal fixed point implementation: - // Returns the equivalent class that e belongs to - int getFixedPointForClass(int e) const { - TORCH_INTERNAL_ASSERT(static_cast(set_map.size()) > e); - while (set_map[e] != e) { - // Chasing to fixed point - e = set_map[e]; + std::vector vector_; + std::unordered_set set_; +}; + +//! Container class DisjointSet models equivalence relationships +//! +//! Each instance of this class keeps equivalence sets +//! DisjointSet::mapEntries(a,b) makes the full set of a and b equivalent +//! DisjointSet::*AreMapped(a,b) checks if a and b belong to the same disjoint +//! set +template > +class DisjointSets { + public: + DisjointSets() = default; + + // Warning: returned values should never be modified. This accessor isn't + // strictly safe as VectorOfUniqueEntries is not returned as a const. + const std:: + unordered_map>, Hash>& + disjointSetMap() const { + return disjoint_set_maps_; + } + + // Warning: returned values should never be modified. This accessor isn't + // strictly safe as VectorOfUniqueEntries is not returned as a const. + const std::vector>>& + disjointSets() const { + return disjoint_sets_; + } + + // Return the entire disjoint set of provided entry + const VectorOfUniqueEntries& getDisjointSetOf(T entry) const { + auto set_it = disjoint_set_maps_.find(entry); + TORCH_INTERNAL_ASSERT( + set_it != disjoint_set_maps_.end(), + "Could not find entry for ", + entry->toString()); + return *(set_it->second); + } + + // Initializes a new set for provided entry + // + // TODO: Return iterator + void initializeSet(T entry) { + disjoint_sets_.push_back( + std::make_shared>()); + disjoint_sets_.back()->pushBack(entry); + disjoint_set_maps_.emplace(std::make_pair(entry, disjoint_sets_.back())); + } + + // Adds all of the disjoint set belonging to entry1 to the disjoint set + // belonging to entry0, maps all entries of disjoint set belonging to entry1 + // to entry0, removes original disjoint set belonging to entry1. 
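A hedged usage sketch of the DisjointSets API declared in this header. The Id struct and the include path are assumptions made for illustration; in lowering the template parameter is IterDomain*:

```cpp
// Usage sketch: mapEntries unions sets (creating them on demand),
// strictAreMapped asserts the first entry exists, permissiveAreMapped
// tolerates entries that were never added.
#include <torch/csrc/jit/codegen/cuda/disjoint_set.h>  // assumed include path

#include <iostream>
#include <string>

using namespace torch::jit::fuser::cuda;

struct Id {
  std::string name;
  std::string toString() const {
    return name;
  }
};

int main() {
  Id a{"a"}, b{"b"}, c{"c"}, d{"d"};

  DisjointSets<Id*> sets;
  sets.mapEntries(&a, &b);  // {a, b}
  sets.mapEntries(&c, &d);  // {a, b} {c, d}
  sets.mapEntries(&b, &c);  // {a, b, c, d}

  // a and d are now in the same disjoint set.
  std::cout << std::boolalpha << sets.strictAreMapped(&a, &d) << "\n";  // true

  // permissiveAreMapped returns false instead of asserting for unseen entries.
  Id e{"e"};
  std::cout << sets.permissiveAreMapped(&e, &a) << "\n";  // false

  std::cout << sets.toString() << std::endl;
  return 0;
}
```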
+ void mapEntries(T entry0, T entry1) { + auto set_it_0 = disjoint_set_maps_.find(entry0); + auto set_it_1 = disjoint_set_maps_.find(entry1); + + // Track if we need to reset iterators, optimize for case where both entries + // exist + bool invalid_iterators = false; + if (set_it_0 == disjoint_set_maps_.end()) { + initializeSet(entry0); + invalid_iterators = true; + } + + if (set_it_1 == disjoint_set_maps_.end()) { + initializeSet(entry1); + invalid_iterators = true; } - return e; + + // TODO: We can avoid refinding one iterator if initialize set returns an + // iterator, though if we insert entry1 we'd have to refind entry0 as it + // could invalidate all iterators + if (invalid_iterators) { + set_it_0 = disjoint_set_maps_.find(entry0); + set_it_1 = disjoint_set_maps_.find(entry1); + } + + auto set0_shared_ptr = set_it_0->second; + auto set1_shared_ptr = set_it_1->second; + + // If the sets are already the same, do nothing + if (set0_shared_ptr == set1_shared_ptr) { + return; + } + + // Place everything in set1 into set0 and remap all entries in set1 to set0 + for (auto entry : set1_shared_ptr->vector()) { + set0_shared_ptr->pushBack(entry); + disjoint_set_maps_[entry] = set0_shared_ptr; + } + + // set1 no longer needed as its entries are copied into set0 + disjoint_sets_.erase(std::find( + disjoint_sets_.begin(), disjoint_sets_.end(), set1_shared_ptr)); } - //! Utility to check the class e belongs to: - //! - //! \param e element e to find the equiv class for - //! \returns the equivalent class that e belongs to - //! - int fixedPoint(T e) const { - // Handles case when i doesn't have an equivalence class - TORCH_INTERNAL_ASSERT(entry_map.count(e)); + // Will assert if provided entry0 is not in any disjoint set, otherwise + // returns if entry0 and entry1 are in the same disjoint set. + bool strictAreMapped(T entry0, T entry1) const { + auto entry_it = disjointSetMap().find(entry0); + TORCH_INTERNAL_ASSERT( + entry_it != disjointSetMap().end(), + "Strict mapping failed on element: ", + abstractToString(entry0), + " either an error occured, or non strict mapping should have been used."); + return entry_it->second->has(entry1); + } + + // If entry0 doesn't have a disjoint set returns false, otherwise returns if + // entry0 and entry1 are in the same disjoint set. + bool permissiveAreMapped(T entry0, T entry1) const { + auto entry_it = disjointSetMap().find(entry0); + if (entry_it == disjointSetMap().end()) { + return false; + } + return entry_it->second->has(entry1); + } - // Use fixed point as a representation for the equiv class - return getFixedPointForClass(entry_map.at(e)); + // Returns if a set exists with provided entry + bool mappingExists(T entry) const { + return disjoint_set_maps_.find(entry) != disjoint_set_maps_.end(); } - //! Utility to create a new equiv class for i + // Returns a deterministic list of all entries that have been added to any + // disjoint set. // - //! \param i Element i to create the equiv class for - void createPoint(T i) { - entry_map[i] = next_index_; - set_map.push_back(next_index_++); - weights.push_back(1); + // Warning: constructed on every call, consider caching result. 
+ VectorOfUniqueEntries getAllElements() const { + VectorOfUniqueEntries all_elements; + for (auto set : disjoint_sets_) { + for (auto entry : set->vector()) { + all_elements.pushBack(entry); + } + } + return all_elements; + } + + // Completely clears all disjoint sets + void clear() { + disjoint_set_maps_.clear(); + disjoint_sets_.clear(); + } + + std::string toString() const { + std::stringstream ss; + ss << "disjoint sets{\n"; + for (auto s_ptr : disjoint_sets_) { + auto& set = *s_ptr; + ss << " { "; + for (auto entry : set.vector()) { + ss << abstractToString(entry); + // DomainKey defines == but not != + if (!(entry == set.back())) { + ss << "; "; + } + } + ss << " }\n"; + } + ss << "}"; + return ss.str(); } private: - // Internal representation of the equivalence class as integers - // set_map implements the "parent" relationship - std::vector set_map; - // Weights is used for preliminary perf optimization - std::vector weights; - - // Map the input of type T to its equivalence class - std::unordered_map entry_map; - - // Running counter for generating new index when - // Creating new equiv classes - int next_index_ = 0; + // Disjoint sets + std::unordered_map>, Hash> + disjoint_set_maps_; + + // Keep a list of disjoint_sets that's deterministic to iterate over + std::vector>> disjoint_sets_; }; } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/dispatch.cpp b/torch/csrc/jit/codegen/cuda/dispatch.cpp index cea8b24e7ff7..1306440d915e 100644 --- a/torch/csrc/jit/codegen/cuda/dispatch.cpp +++ b/torch/csrc/jit/codegen/cuda/dispatch.cpp @@ -37,7 +37,7 @@ T* ptr(T* obj) { * } * * And therefore dispatch should never call: - * ptr(mutator)->handle(this->as()); + * ptr(mutator)->mutate(this->as()); */ template @@ -52,12 +52,22 @@ void Val::dispatch(T handler, Val* val) { ptr(handler)->handle(val->as()); return; case DataType::Int: + case DataType::Int32: + // Dispatch to Int even with Int32 as we don't have Int32 IR + // node. 
ptr(handler)->handle(val->as()); return; + case DataType::ComplexDouble: + ptr(handler)->handle(val->as()); + return; default: break; } break; + case ValType::NamedScalar: + ptr(handler)->handle(val->as()); + return; + case ValType::IterDomain: ptr(handler)->handle(val->as()); return; @@ -67,8 +77,11 @@ void Val::dispatch(T handler, Val* val) { case ValType::TensorView: ptr(handler)->handle(val->as()); return; - case ValType::NamedScalar: - ptr(handler)->handle(val->as()); + case ValType::Predicate: + ptr(handler)->handle(val->as()); + return; + case ValType::TensorIndex: + ptr(handler)->handle(val->as()); return; default: break; @@ -79,12 +92,6 @@ void Val::dispatch(T handler, Val* val) { template void Expr::dispatch(T handler, Expr* expr) { switch (*(expr->getExprType())) { - case ExprType::Split: - ptr(handler)->handle(expr->as()); - return; - case ExprType::Merge: - ptr(handler)->handle(expr->as()); - return; case ExprType::UnaryOp: ptr(handler)->handle(expr->as()); return; @@ -97,12 +104,25 @@ void Expr::dispatch(T handler, Expr* expr) { case ExprType::ReductionOp: ptr(handler)->handle(expr->as()); return; + case ExprType::GroupedReductionOp: + ptr(handler)->handle(expr->as()); + return; case ExprType::WelfordOp: ptr(handler)->handle(expr->as()); return; + case ExprType::MmaOp: + ptr(handler)->handle(expr->as()); + return; case ExprType::BroadcastOp: ptr(handler)->handle(expr->as()); return; + + case ExprType::Split: + ptr(handler)->handle(expr->as()); + return; + case ExprType::Merge: + ptr(handler)->handle(expr->as()); + return; case ExprType::TransposeOp: ptr(handler)->handle(expr->as()); return; @@ -112,9 +132,49 @@ void Expr::dispatch(T handler, Expr* expr) { case ExprType::GatherOp: ptr(handler)->handle(expr->as()); return; + case ExprType::ViewAsScalar: + ptr(handler)->handle(expr->as()); + return; case ExprType::ViewOp: ptr(handler)->handle(expr->as()); return; + + case ExprType::Allocate: + ptr(handler)->handle(expr->as()); + return; + case ExprType::BlockSync: + ptr(handler)->handle(expr->as()); + return; + case ExprType::GridSync: + ptr(handler)->handle(expr->as()); + return; + case ExprType::InitMagicZero: + ptr(handler)->handle(expr->as()); + return; + case ExprType::UpdateMagicZero: + ptr(handler)->handle(expr->as()); + return; + case ExprType::ForLoop: + ptr(handler)->handle(expr->as()); + return; + case ExprType::IfThenElse: + ptr(handler)->handle(expr->as()); + return; + case ExprType::GridReduction: + ptr(handler)->handle(expr->as()); + return; + case ExprType::GroupedGridReduction: + ptr(handler)->handle(expr->as()); + return; + case ExprType::GridBroadcast: + ptr(handler)->handle(expr->as()); + return; + case ExprType::GridWelford: + ptr(handler)->handle(expr->as()); + return; + case ExprType::AllocateFusedReduction: + ptr(handler)->handle(expr->as()); + return; default: TORCH_INTERNAL_ASSERT(false, "Unknown exprtype in dispatch!"); } @@ -142,12 +202,22 @@ void Val::constDispatch(T handler, const Val* val) { ptr(handler)->handle(val->as()); return; case DataType::Int: + case DataType::Int32: + // Dispatch to Int even with Int32 as we don't have Int32 IR + // node. 
ptr(handler)->handle(val->as()); return; + case DataType::ComplexDouble: + ptr(handler)->handle(val->as()); + return; default: break; } break; + case ValType::NamedScalar: + ptr(handler)->handle(val->as()); + return; + case ValType::IterDomain: ptr(handler)->handle(val->as()); return; @@ -157,8 +227,11 @@ void Val::constDispatch(T handler, const Val* val) { case ValType::TensorView: ptr(handler)->handle(val->as()); return; - case ValType::NamedScalar: - ptr(handler)->handle(val->as()); + case ValType::Predicate: + ptr(handler)->handle(val->as()); + return; + case ValType::TensorIndex: + ptr(handler)->handle(val->as()); return; default: break; @@ -169,12 +242,6 @@ void Val::constDispatch(T handler, const Val* val) { template void Expr::constDispatch(T handler, const Expr* expr) { switch (*(expr->getExprType())) { - case ExprType::Split: - ptr(handler)->handle(expr->as()); - return; - case ExprType::Merge: - ptr(handler)->handle(expr->as()); - return; case ExprType::UnaryOp: ptr(handler)->handle(expr->as()); return; @@ -187,12 +254,25 @@ void Expr::constDispatch(T handler, const Expr* expr) { case ExprType::ReductionOp: ptr(handler)->handle(expr->as()); return; + case ExprType::GroupedReductionOp: + ptr(handler)->handle(expr->as()); + return; case ExprType::WelfordOp: ptr(handler)->handle(expr->as()); return; + case ExprType::MmaOp: + ptr(handler)->handle(expr->as()); + return; case ExprType::BroadcastOp: ptr(handler)->handle(expr->as()); return; + + case ExprType::Split: + ptr(handler)->handle(expr->as()); + return; + case ExprType::Merge: + ptr(handler)->handle(expr->as()); + return; case ExprType::TransposeOp: ptr(handler)->handle(expr->as()); return; @@ -202,9 +282,49 @@ void Expr::constDispatch(T handler, const Expr* expr) { case ExprType::GatherOp: ptr(handler)->handle(expr->as()); return; + case ExprType::ViewAsScalar: + ptr(handler)->handle(expr->as()); + return; case ExprType::ViewOp: ptr(handler)->handle(expr->as()); return; + + case ExprType::Allocate: + ptr(handler)->handle(expr->as()); + return; + case ExprType::BlockSync: + ptr(handler)->handle(expr->as()); + return; + case ExprType::GridSync: + ptr(handler)->handle(expr->as()); + return; + case ExprType::InitMagicZero: + ptr(handler)->handle(expr->as()); + return; + case ExprType::UpdateMagicZero: + ptr(handler)->handle(expr->as()); + return; + case ExprType::ForLoop: + ptr(handler)->handle(expr->as()); + return; + case ExprType::IfThenElse: + ptr(handler)->handle(expr->as()); + return; + case ExprType::GridReduction: + ptr(handler)->handle(expr->as()); + return; + case ExprType::GroupedGridReduction: + ptr(handler)->handle(expr->as()); + return; + case ExprType::GridBroadcast: + ptr(handler)->handle(expr->as()); + return; + case ExprType::GridWelford: + ptr(handler)->handle(expr->as()); + return; + case ExprType::AllocateFusedReduction: + ptr(handler)->handle(expr->as()); + return; default: TORCH_INTERNAL_ASSERT(false, "Unknown exprtype in dispatch!"); } @@ -232,28 +352,45 @@ void Statement::constDispatch(T handler, const Statement* stmt) { * ptr(mutator)->mutate(this->as()); */ template -Statement* Val::mutatorDispatch(T mutator, Val* val) { +void Val::mutatorDispatch(T mutator, Val* val) { switch (*(val->getValType())) { case ValType::Scalar: switch (*(val->getDataType())) { case DataType::Bool: - return ptr(mutator)->mutate(val->as()); + ptr(mutator)->mutate(val->as()); + return; case DataType::Double: - return ptr(mutator)->mutate(val->as()); + ptr(mutator)->mutate(val->as()); + return; case DataType::Int: - return 
ptr(mutator)->mutate(val->as()); + ptr(mutator)->mutate(val->as()); + return; + case DataType::ComplexDouble: + ptr(mutator)->mutate(val->as()); + return; default: break; } break; + case ValType::NamedScalar: + ptr(mutator)->mutate(val->as()); + return; + case ValType::IterDomain: - return ptr(mutator)->mutate(val->as()); + ptr(mutator)->mutate(val->as()); + return; case ValType::TensorDomain: - return ptr(mutator)->mutate(val->as()); + ptr(mutator)->mutate(val->as()); + return; case ValType::TensorView: - return ptr(mutator)->mutate(val->as()); - case ValType::NamedScalar: - return ptr(mutator)->mutate(val->as()); + ptr(mutator)->mutate(val->as()); + return; + case ValType::Predicate: + ptr(mutator)->mutate(val->as()); + return; + case ValType::TensorIndex: + ptr(mutator)->mutate(val->as()); + return; default: break; } @@ -261,44 +398,105 @@ Statement* Val::mutatorDispatch(T mutator, Val* val) { } template -Statement* Expr::mutatorDispatch(T mutator, Expr* expr) { +void Expr::mutatorDispatch(T mutator, Expr* expr) { switch (*(expr->getExprType())) { - case ExprType::Split: - return ptr(mutator)->mutate(expr->as()); - case ExprType::Merge: - return ptr(mutator)->mutate(expr->as()); case ExprType::UnaryOp: - return ptr(mutator)->mutate(expr->as()); + ptr(mutator)->mutate(expr->as()); + return; case ExprType::BinaryOp: - return ptr(mutator)->mutate(expr->as()); + ptr(mutator)->mutate(expr->as()); + return; case ExprType::TernaryOp: - return ptr(mutator)->mutate(expr->as()); + ptr(mutator)->mutate(expr->as()); + return; case ExprType::ReductionOp: - return ptr(mutator)->mutate(expr->as()); + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::GroupedReductionOp: + ptr(mutator)->mutate(expr->as()); + return; case ExprType::WelfordOp: - return ptr(mutator)->mutate(expr->as()); + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::MmaOp: + ptr(mutator)->mutate(expr->as()); + return; case ExprType::BroadcastOp: - return ptr(mutator)->mutate(expr->as()); + ptr(mutator)->mutate(expr->as()); + return; + + case ExprType::Split: + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::Merge: + ptr(mutator)->mutate(expr->as()); + return; case ExprType::TransposeOp: - return ptr(mutator)->mutate(expr->as()); + ptr(mutator)->mutate(expr->as()); + return; case ExprType::ShiftOp: - return ptr(mutator)->mutate(expr->as()); + ptr(mutator)->mutate(expr->as()); + return; case ExprType::GatherOp: - return ptr(mutator)->mutate(expr->as()); + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::ViewAsScalar: + ptr(mutator)->mutate(expr->as()); + return; case ExprType::ViewOp: - return ptr(mutator)->mutate(expr->as()); + ptr(mutator)->mutate(expr->as()); + return; + + case ExprType::Allocate: + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::BlockSync: + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::GridSync: + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::InitMagicZero: + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::UpdateMagicZero: + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::ForLoop: + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::IfThenElse: + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::GridReduction: + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::GroupedGridReduction: + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::GridBroadcast: + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::GridWelford: + ptr(mutator)->mutate(expr->as()); 
+ return; + case ExprType::AllocateFusedReduction: + ptr(mutator)->mutate(expr->as()); + return; default: TORCH_INTERNAL_ASSERT(false, "Unknown exprtype in dispatch!"); } } template -Statement* Statement::mutatorDispatch(T mutator, Statement* stmt) { +void Statement::mutatorDispatch(T mutator, Statement* stmt) { if (stmt->isVal()) { - return ptr(mutator)->mutate(stmt->as()); + ptr(mutator)->mutate(stmt->as()); + return; } if (stmt->isExpr()) { - return ptr(mutator)->mutate(stmt->as()); + ptr(mutator)->mutate(stmt->as()); + return; } TORCH_INTERNAL_ASSERT(false, "Unknown stmttype in dispatch!"); } @@ -308,11 +506,11 @@ Statement* Statement::mutatorDispatch(T mutator, Statement* stmt) { * classes. Actual visitors/mutators should inhereit from these classes and call * ->dispatch(this) to avoid needing an explicit instantiation. */ -template void Statement::dispatch(OptOutDispatch, Statement*); +template void Statement::dispatch(OptOutDispatch&, Statement*); template void Statement::dispatch(OptOutDispatch*, Statement*); -template void Val::dispatch(OptOutDispatch, Val*); +template void Val::dispatch(OptOutDispatch&, Val*); template void Val::dispatch(OptOutDispatch*, Val*); -template void Expr::dispatch(OptOutDispatch, Expr*); +template void Expr::dispatch(OptOutDispatch&, Expr*); template void Expr::dispatch(OptOutDispatch*, Expr*); template void Statement::dispatch(OptInDispatch, Statement*); @@ -322,33 +520,26 @@ template void Val::dispatch(OptInDispatch*, Val*); template void Expr::dispatch(OptInDispatch, Expr*); template void Expr::dispatch(OptInDispatch*, Expr*); -template void Statement::constDispatch(OptOutConstDispatch, const Statement*); +template void Statement::constDispatch(OptOutConstDispatch&, const Statement*); template void Statement::constDispatch(OptOutConstDispatch*, const Statement*); -template void Val::constDispatch(OptOutConstDispatch, const Val*); +template void Val::constDispatch(OptOutConstDispatch&, const Val*); template void Val::constDispatch(OptOutConstDispatch*, const Val*); -template void Expr::constDispatch(OptOutConstDispatch, const Expr*); +template void Expr::constDispatch(OptOutConstDispatch&, const Expr*); template void Expr::constDispatch(OptOutConstDispatch*, const Expr*); -template void Statement::constDispatch(OptInConstDispatch, const Statement*); +template void Statement::constDispatch(OptInConstDispatch&, const Statement*); template void Statement::constDispatch(OptInConstDispatch*, const Statement*); -template void Val::constDispatch(OptInConstDispatch, const Val*); +template void Val::constDispatch(OptInConstDispatch&, const Val*); template void Val::constDispatch(OptInConstDispatch*, const Val*); -template void Expr::constDispatch(OptInConstDispatch, const Expr*); +template void Expr::constDispatch(OptInConstDispatch&, const Expr*); template void Expr::constDispatch(OptInConstDispatch*, const Expr*); -template Statement* Statement::mutatorDispatch(OptOutMutator, Statement*); -template Statement* Statement::mutatorDispatch(OptOutMutator*, Statement*); -template Statement* Val::mutatorDispatch(OptOutMutator, Val*); -template Statement* Val::mutatorDispatch(OptOutMutator*, Val*); -template Statement* Expr::mutatorDispatch(OptOutMutator, Expr*); -template Statement* Expr::mutatorDispatch(OptOutMutator*, Expr*); - -template Statement* Statement::mutatorDispatch(OptInMutator, Statement*); -template Statement* Statement::mutatorDispatch(OptInMutator*, Statement*); -template Statement* Val::mutatorDispatch(OptInMutator, Val*); -template Statement* 
Val::mutatorDispatch(OptInMutator*, Val*); -template Statement* Expr::mutatorDispatch(OptInMutator, Expr*); -template Statement* Expr::mutatorDispatch(OptInMutator*, Expr*); +template void Statement::mutatorDispatch(OptOutMutator&, Statement*); +template void Statement::mutatorDispatch(OptOutMutator*, Statement*); +template void Val::mutatorDispatch(OptOutMutator&, Val*); +template void Val::mutatorDispatch(OptOutMutator*, Val*); +template void Expr::mutatorDispatch(OptOutMutator&, Expr*); +template void Expr::mutatorDispatch(OptOutMutator*, Expr*); void OptOutDispatch::handle(Statement* s) { Statement::dispatch(this, s); @@ -362,18 +553,6 @@ void OptOutDispatch::handle(Val* v) { Val::dispatch(this, v); } -void OptInDispatch::handle(Statement* s) { - Statement::dispatch(this, s); -} - -void OptInDispatch::handle(Expr* e) { - Expr::dispatch(this, e); -} - -void OptInDispatch::handle(Val* v) { - Val::dispatch(this, v); -} - void OptOutConstDispatch::handle(const Statement* s) { Statement::constDispatch(this, s); } @@ -386,46 +565,266 @@ void OptOutConstDispatch::handle(const Val* v) { Val::constDispatch(this, v); } -void OptInConstDispatch::handle(const Statement* s) { - Statement::constDispatch(this, s); +void OptInConstDispatch::unhandled(const Statement* stmt) { + if (stmt->isExpr()) { + TORCH_INTERNAL_ASSERT( + false, "Handle not overriden for ", stmt->getExprType().value(), "."); + } else if (stmt->isVal()) { + TORCH_INTERNAL_ASSERT( + false, "Handle not overriden for ", stmt->getValType().value(), "."); + } else { + TORCH_INTERNAL_ASSERT(false, "Unrecognized statement type."); + } } -void OptInConstDispatch::handle(const Expr* e) { - Expr::constDispatch(this, e); +void OptInDispatch::unhandled(Statement* stmt) { + if (stmt->isExpr()) { + TORCH_INTERNAL_ASSERT( + false, "Handle not overriden for ", stmt->getExprType().value(), "."); + } else if (stmt->isVal()) { + TORCH_INTERNAL_ASSERT( + false, "Handle not overriden for ", stmt->getValType().value(), "."); + } else { + TORCH_INTERNAL_ASSERT(false, "Unrecognized statement type."); + } } -void OptInConstDispatch::handle(const Val* v) { - Val::constDispatch(this, v); +// Vals +void OptOutConstDispatch::handle(const Bool* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const Double* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const Int* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const ComplexDouble* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const NamedScalar* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const IterDomain* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const TensorDomain* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const TensorView* stmt) { + unhandled(stmt); +} + +void OptOutConstDispatch::handle(const kir::Predicate* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const kir::TensorIndex* stmt) { + unhandled(stmt); +} + +// Exprs +void OptOutConstDispatch::handle(const UnaryOp* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const BinaryOp* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const TernaryOp* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const ReductionOp* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const GroupedReductionOp* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const WelfordOp* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const MmaOp* stmt) { + 
unhandled(stmt); +} +void OptOutConstDispatch::handle(const BroadcastOp* stmt) { + unhandled(stmt); } -Statement* OptInMutator::mutate(Statement* s) { - return Statement::mutatorDispatch(this, s); +void OptOutConstDispatch::handle(const Split* stmt) { + unhandled(stmt); } +void OptOutConstDispatch::handle(const Merge* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const TransposeOp* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const ShiftOp* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const GatherOp* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const ViewAsScalar* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const ViewOp* stmt) { + unhandled(stmt); +} + +void OptOutConstDispatch::handle(const kir::Allocate* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const kir::BlockSync* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const kir::GridSync* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const kir::InitMagicZero* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const kir::UpdateMagicZero* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const kir::ForLoop* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const kir::IfThenElse* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const kir::GridReduction* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const kir::GroupedGridReduction* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const kir::GridBroadcast* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const kir::GridWelford* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const kir::AllocateFusedReduction* stmt) { + unhandled(stmt); +} + +void OptOutDispatch::unhandled(Statement*) {} -Statement* OptInMutator::mutate(Expr* e) { - return Expr::mutatorDispatch(this, e); +// Vals +void OptOutDispatch::handle(Bool* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(Double* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(Int* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(ComplexDouble* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(NamedScalar* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(IterDomain* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(TensorDomain* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(TensorView* stmt) { + unhandled(stmt); } -Statement* OptInMutator::mutate(Val* v) { - // If value is already mutated, return the mutation - if (mutations.find(v) != mutations.end()) - return mutations[v]; - return Val::mutatorDispatch(this, v); +void OptOutDispatch::handle(kir::Predicate* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(kir::TensorIndex* stmt) { + unhandled(stmt); } -Statement* OptOutMutator::mutate(Statement* s) { - return Statement::mutatorDispatch(this, s); +// Exprs +void OptOutDispatch::handle(UnaryOp* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(BinaryOp* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(TernaryOp* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(ReductionOp* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(GroupedReductionOp* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(WelfordOp* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(MmaOp* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(BroadcastOp* stmt) { + unhandled(stmt); } 
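A minimal usage sketch of the reworked mutator API that the next hunks introduce (hypothetical helper and value names, assuming the registerMutation/maybeMutated interface declared later in this patch in dispatch.h; the mutate entry points now return void instead of Statement*):

// Register a single Val replacement, then walk statements in topological
// order so producers are rewritten before the expressions that consume them.
class ReplaceVal : public OptOutMutator {
 public:
  ReplaceVal(Val* old_val, Val* new_val) {
    registerMutation(old_val, new_val);
  }
};

// Hypothetical driver (old_extent, new_extent, topo_sorted_statements are
// placeholders, not names from this patch):
//   ReplaceVal replacer(old_extent, new_extent);
//   for (Statement* stmt : topo_sorted_statements) {
//     replacer.mutate(stmt);  // returns void now; results land in replacer.mutations
//   }
//   Val* updated = replacer.maybeMutated(old_extent);  // == new_extent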
-Statement* OptOutMutator::mutate(Expr* e) { - return Expr::mutatorDispatch(this, e); +void OptOutDispatch::handle(Split* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(Merge* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(TransposeOp* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(ShiftOp* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(GatherOp* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(ViewAsScalar* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(ViewOp* stmt) { + unhandled(stmt); } -Statement* OptOutMutator::mutate(Val* v) { - // If value is already mutated, return the mutation - if (mutations.find(v) != mutations.end()) - return mutations[v]; - return Val::mutatorDispatch(this, v); +void OptOutDispatch::handle(kir::Allocate* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(kir::BlockSync* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(kir::GridSync* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(kir::InitMagicZero* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(kir::UpdateMagicZero* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(kir::ForLoop* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(kir::IfThenElse* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(kir::GridReduction* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(kir::GroupedGridReduction* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(kir::GridBroadcast* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(kir::GridWelford* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(kir::AllocateFusedReduction* stmt) { + unhandled(stmt); } } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/dispatch.h b/torch/csrc/jit/codegen/cuda/dispatch.h index c1be76eb950e..8c0b78702217 100644 --- a/torch/csrc/jit/codegen/cuda/dispatch.h +++ b/torch/csrc/jit/codegen/cuda/dispatch.h @@ -1,9 +1,9 @@ #pragma once -#include - +#include #include -#include + +#include #include @@ -48,7 +48,7 @@ namespace torch { namespace jit { namespace fuser { namespace cuda { - +class IrContainer; class Fusion; // Hierarchal dispatch functions for handle @@ -60,28 +60,56 @@ class Val; class IterDomain; class TensorDomain; class TensorView; + class Bool; class Double; class Int; +class ComplexDouble; class NamedScalar; // Exprs -class Split; -class Merge; class UnaryOp; class BinaryOp; class TernaryOp; class ReductionOp; +class GroupedReductionOp; class WelfordOp; +class MmaOp; class BroadcastOp; class TransposeOp; class ShiftOp; class GatherOp; +class ViewAsScalar; class ViewOp; +// Exprs +class Split; +class Merge; + +namespace kir { +class Predicate; +class TensorIndex; + +class Allocate; +class BlockSync; +class GridSync; +class ForLoop; +class IfThenElse; +class GridReduction; +class GroupedGridReduction; +class GridBroadcast; +class GridWelford; +class AllocateFusedReduction; +class InitMagicZero; +class UpdateMagicZero; +} // namespace kir + // By default, all IR nodes are handled in this dispatch, and will call an empty // function on all nodes. 
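For instance (a hypothetical visitor sketched here for illustration, not code from this patch), a pass that only cares about TensorView nodes can override that single overload and let every other node type fall through to the no-op unhandled():

class CountTensorViews : public OptOutConstDispatch {
 public:
  size_t count = 0;

  using OptOutConstDispatch::handle;  // keep the other handle() overloads visible
  void handle(const TensorView*) override {
    // Only TensorView nodes are counted; all other node types reach
    // OptOutConstDispatch::unhandled(), which does nothing.
    ++count;
  }
};

// Hypothetical usage: counter.handle(val) goes through Val::constDispatch and
// lands in the override above only when val is a TensorView.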
class TORCH_CUDA_CU_API OptOutConstDispatch : public PolymorphicBase { + protected: + virtual void unhandled(const Statement*) {} + public: // Hierarchal dispatch functions for handle virtual void handle(const Statement*); @@ -89,30 +117,54 @@ class TORCH_CUDA_CU_API OptOutConstDispatch : public PolymorphicBase { virtual void handle(const Val*); // Vals - virtual void handle(const IterDomain*) {} - virtual void handle(const TensorDomain*) {} - virtual void handle(const TensorView*) {} - virtual void handle(const Bool*) {} - virtual void handle(const Double*) {} - virtual void handle(const Int*) {} - virtual void handle(const NamedScalar*) {} + virtual void handle(const IterDomain* stmt); + virtual void handle(const TensorDomain* stmt); + virtual void handle(const TensorView* stmt); + virtual void handle(const Bool* stmt); + virtual void handle(const Double* stmt); + virtual void handle(const Int* stmt); + virtual void handle(const ComplexDouble* stmt); + virtual void handle(const NamedScalar* stmt); + + virtual void handle(const kir::Predicate*); + virtual void handle(const kir::TensorIndex*); // Exprs - virtual void handle(const Split*) {} - virtual void handle(const Merge*) {} - virtual void handle(const UnaryOp*) {} - virtual void handle(const BinaryOp*) {} - virtual void handle(const TernaryOp*) {} - virtual void handle(const ReductionOp*) {} - virtual void handle(const WelfordOp*) {} - virtual void handle(const BroadcastOp*) {} - virtual void handle(const TransposeOp*) {} - virtual void handle(const ShiftOp*) {} - virtual void handle(const GatherOp*) {} - virtual void handle(const ViewOp*) {} + virtual void handle(const UnaryOp* stmt); + virtual void handle(const BinaryOp* stmt); + virtual void handle(const TernaryOp* stmt); + virtual void handle(const ReductionOp* stmt); + virtual void handle(const GroupedReductionOp* stmt); + virtual void handle(const WelfordOp* stmt); + virtual void handle(const MmaOp* stmt); + virtual void handle(const BroadcastOp* stmt); + + virtual void handle(const Split* stmt); + virtual void handle(const Merge* stmt); + virtual void handle(const TransposeOp* stmt); + virtual void handle(const ShiftOp* stmt); + virtual void handle(const GatherOp* stmt); + virtual void handle(const ViewAsScalar* stmt); + virtual void handle(const ViewOp* stmt); + + virtual void handle(const kir::Allocate*); + virtual void handle(const kir::BlockSync*); + virtual void handle(const kir::GridSync*); + virtual void handle(const kir::InitMagicZero*); + virtual void handle(const kir::UpdateMagicZero*); + virtual void handle(const kir::ForLoop*); + virtual void handle(const kir::IfThenElse*); + virtual void handle(const kir::GridReduction*); + virtual void handle(const kir::GroupedGridReduction*); + virtual void handle(const kir::GridBroadcast*); + virtual void handle(const kir::GridWelford*); + virtual void handle(const kir::AllocateFusedReduction*); }; class TORCH_CUDA_CU_API OptOutDispatch : public PolymorphicBase { + protected: + virtual void unhandled(Statement*); + public: // Hierarchal dispatch functions for handle virtual void handle(Statement*); @@ -120,190 +172,95 @@ class TORCH_CUDA_CU_API OptOutDispatch : public PolymorphicBase { virtual void handle(Val*); // Vals - virtual void handle(IterDomain*) {} - virtual void handle(TensorDomain*) {} - virtual void handle(TensorView*) {} - virtual void handle(Bool*) {} - virtual void handle(Double*) {} - virtual void handle(Int*) {} - virtual void handle(NamedScalar*) {} + virtual void handle(Bool* stmt); + virtual void 
handle(Double* stmt); + virtual void handle(Int* stmt); + virtual void handle(ComplexDouble* stmt); + virtual void handle(NamedScalar* stmt); + virtual void handle(IterDomain* stmt); + virtual void handle(TensorDomain* stmt); + virtual void handle(TensorView* stmt); + + virtual void handle(kir::Predicate*); + virtual void handle(kir::TensorIndex*); // Exprs - virtual void handle(Split*) {} - virtual void handle(Merge*) {} - virtual void handle(UnaryOp*) {} - virtual void handle(BinaryOp*) {} - virtual void handle(TernaryOp*) {} - virtual void handle(ReductionOp*) {} - virtual void handle(WelfordOp*) {} - virtual void handle(BroadcastOp*) {} - virtual void handle(TransposeOp*) {} - virtual void handle(ShiftOp*) {} - virtual void handle(GatherOp*) {} - virtual void handle(ViewOp*) {} + virtual void handle(UnaryOp* stmt); + virtual void handle(BinaryOp* stmt); + virtual void handle(TernaryOp* stmt); + virtual void handle(ReductionOp* stmt); + virtual void handle(GroupedReductionOp* stmt); + virtual void handle(WelfordOp* stmt); + virtual void handle(MmaOp* stmt); + virtual void handle(BroadcastOp* stmt); + + virtual void handle(Split* stmt); + virtual void handle(Merge* stmt); + virtual void handle(TransposeOp* stmt); + virtual void handle(ShiftOp* stmt); + virtual void handle(GatherOp* stmt); + virtual void handle(ViewAsScalar* stmt); + virtual void handle(ViewOp* stmt); + + virtual void handle(kir::Allocate* stmt); + virtual void handle(kir::BlockSync* stmt); + virtual void handle(kir::GridSync* stmt); + virtual void handle(kir::InitMagicZero* stmt); + virtual void handle(kir::UpdateMagicZero* stmt); + virtual void handle(kir::ForLoop* stmt); + virtual void handle(kir::IfThenElse* stmt); + virtual void handle(kir::GridReduction* stmt); + virtual void handle(kir::GroupedGridReduction* stmt); + virtual void handle(kir::GridBroadcast* stmt); + virtual void handle(kir::GridWelford* stmt); + virtual void handle(kir::AllocateFusedReduction* stmt); }; -class TORCH_CUDA_CU_API OptInConstDispatch : public PolymorphicBase { +class TORCH_CUDA_CU_API OptInConstDispatch : public OptOutConstDispatch { public: - // Hierarchal dispatch functions for handle - virtual void handle(const Statement*); - virtual void handle(const Expr*); - virtual void handle(const Val*); - - // Vals - virtual void handle(const IterDomain*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for IterDomain."); - } - virtual void handle(const TensorDomain*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for TensorDomain."); - } - virtual void handle(const TensorView*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for TensorView."); - } - virtual void handle(const Bool*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Bool."); - } - virtual void handle(const Double*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Double."); - } - virtual void handle(const Int*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Int."); - } - virtual void handle(const NamedScalar*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for NamedScalar."); - } + using OptOutConstDispatch::handle; - // Exprs - virtual void handle(const Split*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Split."); - } - virtual void handle(const Merge*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Merge."); - } - virtual void handle(const UnaryOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for UnaryOp."); - } - virtual void handle(const BinaryOp*) { - 
TORCH_INTERNAL_ASSERT(false, "Handle not overriden for BinaryOp."); - } - virtual void handle(const WelfordOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for WelfordOp."); - } - virtual void handle(const TernaryOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for TernaryOp."); - } - virtual void handle(const ReductionOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for ReductionOp."); - } - virtual void handle(const BroadcastOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for BroadcastOp."); - } - virtual void handle(const TransposeOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for TransposeOp."); - } - virtual void handle(const ShiftOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for ShiftOp."); - } - virtual void handle(const GatherOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for GatherOp."); - } - virtual void handle(const ViewOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for ViewOp."); - } + protected: + virtual void unhandled(const Statement* stmt) final; }; -class TORCH_CUDA_CU_API OptInDispatch : public PolymorphicBase { +class TORCH_CUDA_CU_API OptInDispatch : public OptOutDispatch { public: - // Hierarchal dispatch functions for handle - virtual void handle(Statement* s); - virtual void handle(Expr* e); - virtual void handle(Val* v); + using OptOutDispatch::handle; - // Vals - virtual void handle(IterDomain*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for IterDomain."); - } - virtual void handle(TensorDomain*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for TensorDomain."); - } - virtual void handle(TensorView*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for TensorView."); - } - virtual void handle(Bool*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Bool."); - } - virtual void handle(Double*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Double."); - } - virtual void handle(Int*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Int."); - } - virtual void handle(NamedScalar*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for NamedScalar."); - } - - // Exprs - virtual void handle(Split*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Split."); - } - virtual void handle(Merge*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Merge."); - } - virtual void handle(UnaryOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for UnaryOp."); - } - virtual void handle(BinaryOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for BinaryOp."); - } - virtual void handle(TernaryOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for TernaryOp."); - } - virtual void handle(ReductionOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for ReductionOp."); - } - virtual void handle(WelfordOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for WelfordOp."); - } - virtual void handle(BroadcastOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for BroadcastOp."); - } - virtual void handle(TransposeOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for TransposeOp."); - } - virtual void handle(ShiftOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for ShiftOp."); - } - virtual void handle(GatherOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for GatherOp."); - } - virtual void handle(ViewOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for ViewOp."); - } + protected: + virtual void unhandled(Statement* stmt) final; 
}; +// Class to perform mutations on Fusion IR. Exprs can simply be redefined, but +// when mutating values they have to be registered through registerMutation so +// that exprs can detect there's been a muatation and know to modify all +// instances of that Val. This means each Val should be mutated "consistently". +// Otherwise behavior may be difficult to understand as it depends on which +// order mutate is called in. This class expects user to topologically call the +// statments of interest so inputs are called and mutated before exprs depending +// on them. +// +// Warning: TensorViews need to be treated carefully. As we don't generally +// register their mutation when their tensor domains only change. If a TV needs +// to be swapped out, it needs to be registered as a "proper" mutation like +// other vals, on top of TensorDomain being updated in the mutated TensorView. +// // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) class TORCH_CUDA_CU_API OptOutMutator : public PolymorphicBase { public: // Hierarchal dispatch functions for handle - virtual Statement* mutate(Statement* s); - virtual Statement* mutate(Expr* e); - virtual Statement* mutate(Val* v); - - // We always want to dispatch through a Val, so we can capture and dispatch - // correctly members of nodes like Split->TensorDomain If we don't call the - // below function or manually cast to use mutate(Val* v) we can't intercept - // and mutate by capturing mutate(Val* v), which is what we do when we want to - // replace all instances of a value. - Statement* mutateAsVal(Val* v) { - return mutate(v); - } + virtual void mutate(Statement* s); + virtual void mutate(Expr* e); + virtual void mutate(Val* v); + + void registerMutation(Val* val, Val* mutation); - void registerMutation(Val* val, Val* mutation) { - TORCH_INTERNAL_ASSERT( - mutations.find(val) == mutations.end(), - " The same value is incorrectly being mutated twice.", - " One mutation per mutation pass is allowed."); - mutations[val] = mutation; + Val* maybeMutated(Val* val) { + if (mutations.find(val) == mutations.end()) { + return val; + } + return mutations.at(val); } std::unordered_map mutations; @@ -311,105 +268,51 @@ class TORCH_CUDA_CU_API OptOutMutator : public PolymorphicBase { //****Functions below defined in mutator.cpp***** // Vals - virtual Statement* mutate(IterDomain*); - virtual Statement* mutate(TensorDomain*); - virtual Statement* mutate(TensorView*); - virtual Statement* mutate(Bool*); - virtual Statement* mutate(Double*); - virtual Statement* mutate(Int*); - virtual Statement* mutate(NamedScalar*); + virtual void mutate(Bool*); + virtual void mutate(Double*); + virtual void mutate(Int*); + virtual void mutate(ComplexDouble*); + virtual void mutate(NamedScalar*); + virtual void mutate(IterDomain*); + virtual void mutate(TensorDomain*); + virtual void mutate(TensorView*); + + virtual void mutate(kir::Predicate*); + virtual void mutate(kir::TensorIndex*); // Exprs - virtual Statement* mutate(Split*); - virtual Statement* mutate(Merge*); - virtual Statement* mutate(UnaryOp*); - virtual Statement* mutate(BinaryOp*); - virtual Statement* mutate(TernaryOp*); - virtual Statement* mutate(ReductionOp*); - virtual Statement* mutate(WelfordOp*); - virtual Statement* mutate(BroadcastOp*); - virtual Statement* mutate(TransposeOp*); - virtual Statement* mutate(ShiftOp*); - virtual Statement* mutate(GatherOp*); - virtual Statement* mutate(ViewOp*); -}; - -// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) -class TORCH_CUDA_CU_API OptInMutator : public 
PolymorphicBase { - public: - std::unordered_map mutations; - - public: - void registerMutation(Val* val, Val* mutation) { - TORCH_INTERNAL_ASSERT( - mutations.find(val) == mutations.end(), - " The same value is incorrectly being mutated twice.", - " One mutation per mutation pass is allowed."); - mutations[val] = mutation; - } - - // Hierarchal dispatch functions for mutate - virtual Statement* mutate(Statement*); - virtual Statement* mutate(Expr*); - virtual Statement* mutate(Val*); - - // Vals - virtual Statement* mutate(IterDomain*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for IterDomain."); - } - virtual Statement* mutate(TensorDomain*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for TensorDomain."); - } - virtual Statement* mutate(TensorView*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for TensorView."); - } - virtual Statement* mutate(Bool*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for Bool."); - } - virtual Statement* mutate(Int*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for Int."); - } - virtual Statement* mutate(NamedScalar*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for NamedScalar."); - } - - // Exprs - virtual Statement* mutate(Split*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for Split."); - } - virtual Statement* mutate(Merge*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for Merge."); - } - virtual Statement* mutate(UnaryOp*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for UnaryOp."); - } - virtual Statement* mutate(BinaryOp*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for BinaryOp."); - } - virtual Statement* mutate(TernaryOp*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for TernaryOp."); - } - virtual Statement* mutate(ReductionOp*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for ReductionOp."); - } - virtual Statement* mutate(WelfordOp*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for WelfordOp."); - } - virtual Statement* mutate(BroadcastOp*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for BroadcastOp."); - } - virtual Statement* mutate(TransposeOp*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for TransposeOp."); - } - virtual Statement* mutate(ShiftOp*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for ShiftOp."); - } - virtual Statement* mutate(GatherOp*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for GatherOp."); - } - virtual Statement* mutate(ViewOp*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for ViewOp."); - } + virtual void mutate(UnaryOp*); + virtual void mutate(BinaryOp*); + virtual void mutate(TernaryOp*); + virtual void mutate(ReductionOp*); + virtual void mutate(GroupedReductionOp*); + virtual void mutate(WelfordOp*); + virtual void mutate(MmaOp*); + virtual void mutate(BroadcastOp*); + + virtual void mutate(Split*); + virtual void mutate(Merge*); + virtual void mutate(TransposeOp*); + virtual void mutate(ShiftOp*); + virtual void mutate(GatherOp*); + virtual void mutate(ViewAsScalar*); + virtual void mutate(ViewOp*); + + virtual void mutate(kir::Allocate*); + virtual void mutate(kir::BlockSync*); + virtual void mutate(kir::GridSync*); + virtual void mutate(kir::InitMagicZero*); + virtual void mutate(kir::UpdateMagicZero*); + virtual void mutate(kir::ForLoop*); + virtual void mutate(kir::IfThenElse*); + virtual void mutate(kir::GridReduction*); + virtual void mutate(kir::GroupedGridReduction*); + virtual void mutate(kir::GridBroadcast*); + virtual 
void mutate(kir::GridWelford*); + virtual void mutate(kir::AllocateFusedReduction*); + + protected: + void removeExpr(IrContainer*, Expr*); }; } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/evaluator_common.cpp b/torch/csrc/jit/codegen/cuda/evaluator_common.cpp index 288dbb198b00..83107569dc54 100644 --- a/torch/csrc/jit/codegen/cuda/evaluator_common.cpp +++ b/torch/csrc/jit/codegen/cuda/evaluator_common.cpp @@ -1,9 +1,11 @@ -#include #include #include +#include #include #include +#include + namespace torch { namespace jit { namespace fuser { @@ -68,8 +70,8 @@ std::vector makeSortedEvaluationList(std::vector input) { //! Kernel IR utility, collects all the symbolic integers //! used in allocation nodes. void collectBufferSizes( - std::vector& into, - const std::vector& exprs) { + std::vector& into, + const std::vector& exprs) { for (auto expr : exprs) { if (auto allocate = dynamic_cast(expr)) { into.push_back(allocate->size()); @@ -82,56 +84,44 @@ void collectBufferSizes( } } -//! Kernel IR utility, collects all the kir symbolic +//! Kernel IR utility, collects all the kernel symbolic //! integers we will need at runtime, i.e. after the //! generated cuda kernel has already been compiled. //! The values are to be used for runtime logic, like //! `computeLaunchparams`. -std::vector collectRuntimeUsedIntegers( - Fusion* fusion, - GpuLower* lower) { - std::vector ret; - +std::vector collectRuntimeUsedIntegers(kir::Kernel* kernel) { + std::vector ret; + auto all_tvs = ir_utils::allTvs(kernel); // Collect extent and integer inputs - for (auto val : fusion->usedMathVals()) { - auto kir_val = lower->lowerValue(val); - if (auto kir_tv = dynamic_cast(kir_val)) { - for (auto id : kir_tv->domain()->domain()) { - ret.push_back(id->extent()); - } - } else if (val->isFusionInput()) { - if (kir_val->isA()) { - ret.push_back(kir_val); - } + for (auto tv : all_tvs) { + for (auto id : tv->domain()->domain()) { + ret.push_back(id->extent()); + } + } + for (auto inp : kernel->inputs()) { + if (inp->isA()) { + ret.push_back(inp); } } - // Collect allocation sizes: - collectBufferSizes(ret, lower->kernel()->topLevelExprs()); - + collectBufferSizes(ret, kernel->topLevelExprs()); return makeSortedEvaluationList(ret); } -//! Fusion IR utility, collects all the fusionIR symbolic -//! integers we will need at runtime, i.e. after the -//! generated cuda kernel has already been compiled. -//! The values are to be used for runtime logic, like -//! `canSchedule` in heuristic look up. 
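A rough usage sketch of the kernel-side workspace collected above (hypothetical helper name, assuming the KernelPrecomputedIntegers interface declared in evaluator_common.h further down in this patch, and that getMaybeValueFor yields an optional 64-bit integer as the int64_t bindings in this file suggest):

// Builds the workspace once from the lowered kernel, binds concrete sizes from
// the runtime inputs, runs the integer machine, then queries a single extent.
int64_t evaluateExtentOrDefault(
    kir::Kernel* kernel,
    const at::ArrayRef<c10::IValue>& aten_inputs,
    Val* extent,
    int64_t fallback) {
  KernelPrecomputedIntegers precomputed(kernel);
  precomputed.bindKernelInputs(kernel, aten_inputs);
  precomputed.evaluate();
  auto maybe_value = precomputed.getMaybeValueFor(extent);
  return maybe_value.has_value() ? maybe_value.value() : fallback;
}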
+ std::vector collectRuntimeUsedIntegers(Fusion* fusion) { std::vector ret; - + auto all_tvs = ir_utils::allTvs(fusion); // Collect extent and integer inputs - for (auto val : fusion->usedMathVals()) { - if (auto tv = dynamic_cast(val)) { - for (auto id : tv->domain()->domain()) { - ret.push_back(id->extent()); - } - } else if (val->isFusionInput()) { - if (val->isA()) { - ret.push_back(val); - } + for (auto tv : all_tvs) { + for (auto id : tv->domain()->domain()) { + ret.push_back(id->extent()); + } + } + for (auto inp : fusion->inputs()) { + if (inp->isA()) { + ret.push_back(inp); } } - return makeSortedEvaluationList(ret); } @@ -140,7 +130,7 @@ std::vector collectRuntimeUsedIntegers(Fusion* fusion) { template void PrecomputedIntegersBase::initializeValueList( typename IRContext::EVALUATOR_TYPE& const_evaluator, - const std::vector& sorted_value_list) { + const std::vector& sorted_value_list) { // Initialize workspace num_of_values_ = sorted_value_list.size(); defined_ = std::vector(num_of_values_, false); @@ -161,7 +151,7 @@ void PrecomputedIntegersBase::initializeValueList( template c10::optional PrecomputedIntegersBase::getMaybeValueFor( - const IR_VAL* val) { + const Val* val) { auto index = val->evaluatorIndex(); if (index < 0) { return c10::nullopt; @@ -172,6 +162,17 @@ c10::optional PrecomputedIntegersBase::getMaybeValueFor( return values_[index]; } +template +void PrecomputedIntegersBase::print() const { + std::cout << "Precomputed Integers:\n"; + for (auto i : c10::irange(symbols_.size())) { + if (defined_[i]) { + std::cout << symbols_[i]->toInlineString() << " = " << values_[i] + << std::endl; + } + } +} + template void PrecomputedIntegersBase::evaluate() { FUSER_PERF_SCOPE("PrecomputedIntegers::Evaluate"); @@ -208,10 +209,9 @@ NaiveIntegerMachine::NaiveIntegerMachine( for (auto val : precomputed_integers_.symbols_) { auto def = val->definition(); if (def) { - if (auto uop = dynamic_cast(def)) { + if (auto uop = dynamic_cast(def)) { makeUnaryOp(uop); - } else if ( - auto bop = dynamic_cast(def)) { + } else if (auto bop = dynamic_cast(def)) { makeBinaryOp(bop); } else { TORCH_INTERNAL_ASSERT(false, "Unsupported expr"); @@ -234,8 +234,7 @@ void NaiveIntegerMachine::run() { } template -void NaiveIntegerMachine::makeUnaryOp( - typename IRContext::UNARY_OP_TYPE* uop) { +void NaiveIntegerMachine::makeUnaryOp(UnaryOp* uop) { int in = uop->inputs()[0]->evaluatorIndex(); int out = uop->outputs()[0]->evaluatorIndex(); TORCH_INTERNAL_ASSERT(in >= 0, "Integer Machine: unknown input: ", uop); @@ -249,8 +248,7 @@ void NaiveIntegerMachine::makeUnaryOp( } template -void NaiveIntegerMachine::makeBinaryOp( - typename IRContext::BINARY_OP_TYPE* bop) { +void NaiveIntegerMachine::makeBinaryOp(BinaryOp* bop) { int in0 = bop->inputs()[0]->evaluatorIndex(); int in1 = bop->inputs()[1]->evaluatorIndex(); int out = bop->outputs()[0]->evaluatorIndex(); @@ -377,11 +375,8 @@ void NaiveIntegerMachine::runBinaryOp(int index) { precomputed_integers_.defined_[dest_index] = true; } -KernelPrecomputedIntegers::KernelPrecomputedIntegers( - Fusion* fusion, - GpuLower& lower) - : lower_(&lower) { - loadSymbols(collectRuntimeUsedIntegers(fusion, lower_)); +KernelPrecomputedIntegers::KernelPrecomputedIntegers(kir::Kernel* kernel) { + loadSymbols(collectRuntimeUsedIntegers(kernel)); kir::ExpressionEvaluator evaluator; initializeValueList(evaluator, symbols()); initializeNamedScalars(); @@ -389,11 +384,11 @@ KernelPrecomputedIntegers::KernelPrecomputedIntegers( } void KernelPrecomputedIntegers::bindTensorMetaData( - 
kir::TensorView* tv, + TensorView* tv, const at::Tensor& at_tensor) { - std::vector> ret; + std::vector> ret; const auto root_domain = - kir::TensorDomain::noReductions(tv->domain()->rootDomain()); + TensorDomain::noReductions(tv->domain()->getMaybeRFactorDomain()); TORCH_INTERNAL_ASSERT( at_tensor.ndimension() == static_cast(root_domain.size()), "Something went wrong configuring launch. Inputs do not match."); @@ -411,7 +406,7 @@ namespace { //! and returns the corresponding parallel type if a match //! is found. c10::optional getMaybeThreadSizeParallelType( - kir::NamedScalar* named_scalar) { + NamedScalar* named_scalar) { auto& var_name = named_scalar->name(); for (auto ptype : kParallelTypeThreads) { if (var_name == stringifyThreadSize(ptype)) { @@ -425,7 +420,7 @@ c10::optional getMaybeThreadSizeParallelType( void KernelPrecomputedIntegers::initializeNamedScalars() { for (auto val : symbols()) { - if (auto named_scalar = dynamic_cast(val)) { + if (auto named_scalar = dynamic_cast(val)) { auto maybe_parallel_type = getMaybeThreadSizeParallelType(named_scalar); if (maybe_parallel_type.has_value()) { auto& index_list = @@ -440,17 +435,17 @@ void KernelPrecomputedIntegers::initializeNamedScalars() { } void KernelPrecomputedIntegers::bindKernelInputs( + kir::Kernel* kernel, const at::ArrayRef& aten_inputs) { if (hasValidValues()) { invalidate(); } - auto kernel = lower_->kernel(); const auto& inputs = kernel->inputs(); for (const auto i : c10::irange(inputs.size())) { const auto input = inputs[i]; - if (auto tensor_input = dynamic_cast(input)) { + if (auto tensor_input = dynamic_cast(input)) { const auto aten_tensor = aten_inputs[i].toTensor(); bindTensorMetaData(tensor_input, aten_tensor); } else if (input->isScalar() && input->dtype() == DataType::Int) { diff --git a/torch/csrc/jit/codegen/cuda/evaluator_common.h b/torch/csrc/jit/codegen/cuda/evaluator_common.h index 0c16e2a8b046..7cbe37c602b9 100644 --- a/torch/csrc/jit/codegen/cuda/evaluator_common.h +++ b/torch/csrc/jit/codegen/cuda/evaluator_common.h @@ -35,18 +35,14 @@ class ExpressionEvaluator; //! Context for using generic logic on FusionIR class FusionIRContext { public: - using VAL_TYPE = Val; - using EXPR_TYPE = Expr; using TV_TYPE = TensorView; using EVALUATOR_TYPE = ExpressionEvaluator; - using BINARY_OP_TYPE = BinaryOp; - using UNARY_OP_TYPE = UnaryOp; - static BinaryOpType getOpType(BINARY_OP_TYPE* bop) { + static BinaryOpType getOpType(BinaryOp* bop) { return bop->getBinaryOpType(); } - static UnaryOpType getOpType(UNARY_OP_TYPE* uop) { + static UnaryOpType getOpType(UnaryOp* uop) { return uop->getUnaryOpType(); } }; @@ -54,19 +50,14 @@ class FusionIRContext { //! Context for using generic logic on KernelIR class KernelIRContext { public: - using VAL_TYPE = kir::Val; - using EXPR_TYPE = kir::Expr; - using TV_TYPE = kir::TensorView; using EVALUATOR_TYPE = kir::ExpressionEvaluator; - using BINARY_OP_TYPE = kir::BinaryOp; - using UNARY_OP_TYPE = kir::UnaryOp; - static BinaryOpType getOpType(BINARY_OP_TYPE* bop) { - return bop->operation(); + static BinaryOpType getOpType(BinaryOp* bop) { + return bop->getBinaryOpType(); } - static UnaryOpType getOpType(UNARY_OP_TYPE* uop) { - return uop->operation(); + static UnaryOpType getOpType(UnaryOp* uop) { + return uop->getUnaryOpType(); } }; @@ -97,10 +88,10 @@ class NaiveIntegerMachine { private: //! Convert an unary IR expr to an instruction - void makeUnaryOp(typename IRContext::UNARY_OP_TYPE* uop); + void makeUnaryOp(UnaryOp* uop); //! 
Convert an binary IR expr to an instruction - void makeBinaryOp(typename IRContext::BINARY_OP_TYPE* bop); + void makeBinaryOp(BinaryOp* bop); //! Create an empty instruction with all default values //! and place it at the end of the instruction buffer. @@ -169,11 +160,6 @@ class NaiveIntegerMachine { //! integers and store them in the workspace ahead of time. template class PrecomputedIntegersBase { - using IR_UNARY_OP = typename IRContext::UNARY_OP_TYPE; - using IR_BINARY_OP = typename IRContext::BINARY_OP_TYPE; - using IR_VAL = typename IRContext::VAL_TYPE; - using IR_EXPR = typename IRContext::EXPR_TYPE; - using IR_TV = typename IRContext::TV_TYPE; using INTEGER_MACHINE = NaiveIntegerMachine; public: @@ -190,7 +176,10 @@ class PrecomputedIntegersBase { //! Returns value for the given IR node if it's stored //! in the workspace and has been evaluated. - c10::optional getMaybeValueFor(const IR_VAL* val); + c10::optional getMaybeValueFor(const Val* val); + + //! Debugging helper, prints all the currently known values + void print() const; protected: //! Initialize the workspace before first use. @@ -198,7 +187,7 @@ class PrecomputedIntegersBase { //! been topologically sorted. void initializeValueList( typename IRContext::EVALUATOR_TYPE& evaluator, - const std::vector& sorted_value_list); + const std::vector& sorted_value_list); //! Bind concrete value to the given index //! if the index is valid. @@ -215,12 +204,12 @@ class PrecomputedIntegersBase { void invalidate(); //! Interface for subclasses to access symbols_ - void loadSymbols(std::vector symbols) { + void loadSymbols(std::vector symbols) { symbols_ = std::move(symbols); } //! Interface for subclasses to access symbols_ - std::vector& symbols() { + std::vector& symbols() { return symbols_; } @@ -267,7 +256,7 @@ class PrecomputedIntegersBase { std::vector values_; //! Stores the IR nodes corresponding to each index. - std::vector symbols_; + std::vector symbols_; //! An internal log to keep track of all the bindings //! used in each evaluation cycle. To be used for @@ -308,12 +297,14 @@ class KernelPrecomputedIntegers public: using ParallelExtentMap = - std::unordered_map, TypeHash>; + std::unordered_map, TypeHash>; - KernelPrecomputedIntegers(Fusion* fusion, GpuLower& lower); + KernelPrecomputedIntegers(kir::Kernel* kernel); //! Bind concrete values from fusion runtime inputs - void bindKernelInputs(const at::ArrayRef& aten_inputs); + void bindKernelInputs( + kir::Kernel* kernel, + const at::ArrayRef& aten_inputs); //! Bind concrete values from launch constraints void bindParallelExtents( @@ -326,7 +317,7 @@ class KernelPrecomputedIntegers void bindConcreteParallelTypeValue(ParallelType pt, int64_t value); private: - void bindTensorMetaData(kir::TensorView* tv, const at::Tensor& at_tensor); + void bindTensorMetaData(TensorView* tv, const at::Tensor& at_tensor); //! Iterate through all the named scalars corresponding //! to thread sizes and pre-group them by their parallel @@ -334,8 +325,6 @@ class KernelPrecomputedIntegers void initializeNamedScalars(); private: - GpuLower* lower_ = nullptr; - //! Contains all the named scalars correspond //! to thread size of each parallel type. 
std::unordered_map>, TypeHash> diff --git a/torch/csrc/jit/codegen/cuda/executor.cpp b/torch/csrc/jit/codegen/cuda/executor.cpp index 647cf4ec0e2f..98bbb9e0324e 100644 --- a/torch/csrc/jit/codegen/cuda/executor.cpp +++ b/torch/csrc/jit/codegen/cuda/executor.cpp @@ -1,3 +1,4 @@ + #include #include @@ -8,21 +9,12 @@ #include #include #include -#include #include #include #include +#include #include - -#ifndef AT_PER_OPERATOR_HEADERS -#include -#include -#else -#include -#include -#endif - #include #include #include @@ -65,6 +57,18 @@ typedef unsigned long long int uint64_t; )"; } +static const std::string& defineComplexTypes() { + static std::string result = std::string(R"ESCAPE( +#define POS_INFINITY __int_as_float(0x7f800000) +#define INFINITY POS_INFINITY +#define NEG_INFINITY __int_as_float(0xff800000) +#define NAN __int_as_float(0x7fffffff) +)ESCAPE") + + at::cuda::get_traits_string() + at::cuda::get_complex_body_string() + + at::cuda::get_cmath_string() + at::cuda::get_complex_math_string(); + return result; +} + } // namespace std::string FusionExecutor::getStructuredCode(const std::string& kernel) { @@ -79,7 +83,7 @@ std::string FusionExecutor::getStructuredCode(const std::string& kernel) { #endif code += std::string("namespace ") + FusionExecutor::kernelNamespace() + " {\n" + defineIntegerTypes() + defineIndexMode(options_.index_mode) + - executor_utils::kernelPreamble() + kernel + "}\n"; + defineComplexTypes() + executor_utils::kernelPreamble() + kernel + "}\n"; if (isDebugDumpEnabled(DebugDumpOption::CudaKernel)) { std::cout << "\n======= Codegen output for kernel: " << kernelName() @@ -108,8 +112,6 @@ void FusionExecutor::debugCompileFusionFromStr( const std::string& name, int id, CompileOptions options) { - fusion_ = *fusion; - FusionGuard fg(&fusion_); options_ = options; if (isDebugDumpEnabled(DebugDumpOption::FusionIr)) { @@ -126,11 +128,12 @@ void FusionExecutor::debugCompileFusionFromStr( << std::endl; } - setUsedTVs(); + lowered_ = std::make_unique(fusion); + const auto kernel = lowered_->kernel(); + fusion_ = lowered_->kernel(); fusion_id_ = id; - lowered_ = GpuLower(&fusion_); - const auto kernel = lowered_.kernel(); + setUsedTVs(); if (isDebugDumpEnabled(DebugDumpOption::KernelIr)) { kernel->print(); @@ -144,20 +147,21 @@ void FusionExecutor::debugCompileFusionFromStr( const auto static_smem_size = computeSharedMemory( static_evaluator, kernel_summary.static_smem_allocations); TORCH_INTERNAL_ASSERT( - static_smem_size < max_device_smem, + static_smem_size < max_static_smem_, "The static shared memory allocation is larger than available memory."); } - compiled_kernel_ = executor_utils::nvrtcCompile(code, name, fusion_id_); + std::tie(compiled_kernel_, last_compiler_log_) = + executor_utils::nvrtcCompile(code, name, fusion_id_); TORCH_INTERNAL_ASSERT( fusion_id_ > 0, "assign a fusion_id_ <= 0 is not accepted."); } void FusionExecutor::compileFusion( Fusion* fusion, - CompileOptions options, const at::ArrayRef& inputs, - const LaunchParams& launch_constraints) { + const LaunchParams& launch_constraints, + CompileOptions options) { FUSER_PERF_SCOPE("compileFusion"); TORCH_INTERNAL_ASSERT( @@ -175,40 +179,50 @@ void FusionExecutor::compileFusion( fusion->printMath(); } - // Clone the fusion so we can store it - fusion_ = *fusion; - FusionGuard fg(&fusion_); options_ = options; c10::DeviceGuard dg(options_.device); TORCH_INTERNAL_ASSERT( - options.device.is_cuda(), "Provided device to CUDA fuser is the CPU."); - auto properties = 
at::cuda::getDeviceProperties(options.device.index()); - max_device_smem = properties->sharedMemPerBlock; + options_.device.is_cuda(), "Provided device to CUDA fuser is the CPU."); + auto properties = at::cuda::getDeviceProperties(options_.device.index()); + configured_device_smem_ = properties->sharedMemPerBlock; +#ifndef __HIP_PLATFORM_HCC__ + device_smem_limit_ = properties->sharedMemPerBlockOptin; +#else + // don't know if rocm supports opt-in shared memroy reconfiguration + device_smem_limit_ = properties->sharedMemPerBlock; +#endif warp_size_ = properties->warpSize; - setUsedTVs(); + lowered_ = std::make_unique( + fusion, + options_.index_mode == KernelIndexMode::INT64 ? DataType::Int + : DataType::Int32); + const auto kernel = lowered_->kernel(); + fusion_ = lowered_->kernel()->as(); fusion_id_ = ++fusion_id_counter_; - lowered_ = GpuLower(&fusion_); - const auto kernel = lowered_.kernel(); + setUsedTVs(); if (isDebugDumpEnabled(DebugDumpOption::KernelIr)) { kernel->print(); } - const auto kernel_code = codegen::generateCudaKernel(kernel, kernelName()); - const auto structured_code = getStructuredCode(kernel_code); + kernel_code_ = codegen::generateCudaKernel(kernel, kernelName()); + const auto structured_code = getStructuredCode(kernel_code_); const auto& kernel_summary = kernel->summary(); + // We currently shouldn't allocate any more shared mem + // tensors statically but could keep this path if + // needed in later development. if (!kernel_summary.static_smem_allocations.empty()) { kir::ExpressionEvaluator static_evaluator; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) const auto static_smem_size = computeSharedMemory( static_evaluator, kernel_summary.static_smem_allocations); TORCH_INTERNAL_ASSERT( - static_smem_size < max_device_smem, + static_smem_size < max_static_smem_, "The static shared memory allocation is larger than available memory."); } @@ -216,7 +230,7 @@ void FusionExecutor::compileFusion( std::stringstream ss; ss << "Allocations must be based on constant integers for local memory. However, found: "; for (auto alloc : kernel_summary.dynamic_lmem_allocations) { - ss << toString(alloc->buffer(), false) << ", "; + ss << alloc->buffer()->toString() << ", "; } ss << " have dynamic allocations but are placed in local memory."; TORCH_INTERNAL_ASSERT(false, ss.str()); @@ -233,20 +247,32 @@ void FusionExecutor::compileFusion( block_size > 0, "launch param inferred block size < 0"); } - compiled_kernel_ = executor_utils::nvrtcCompile( + block_size_high_water_mark = + block_size.has_value() ? block_size.value() : block_size_high_water_mark; + std::tie(compiled_kernel_, last_compiler_log_) = executor_utils::nvrtcCompile( structured_code, (kernelNamespace() + "::" + kernelName()).c_str(), fusion_id_, block_size); TORCH_INTERNAL_ASSERT( fusion_id_ > 0, "failed to assign a fusion_id_ after compilation."); + +#ifndef __HIP_PLATFORM_HCC__ + // The driver API call requires an int argument. 
+ int max_dynamic_smem = 0; + AT_CUDA_DRIVER_CHECK(at::globalContext().getNVRTC().cuFuncGetAttribute( + &max_dynamic_smem, + CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, + compiled_kernel_.function)); + maybe_available_dynamic_smem_ = max_dynamic_smem; +#endif } namespace { at::Tensor inferAndAlloc( - const kir::TensorView* tv, - const std::vector& sizes, + const TensorView* tv, + const std::vector& sizes, kir::ExpressionEvaluator& expr_eval, const CompileOptions& options, bool zero_init = false) { @@ -260,9 +286,11 @@ at::Tensor inferAndAlloc( TORCH_INTERNAL_ASSERT( inferred_val.has_value(), "Could not launch kernel as program could not infer ", - kir::toString(size), - " for the buffer ", - kir::toString(tv)); + size->toString(), + "(", + size->name(), + ") for the buffer ", + tv->toString()); inferred_sizes.push_back(inferred_val.value()); } @@ -283,19 +311,20 @@ at::Tensor inferAndAlloc( } at::Tensor inferAndAllocOutput( - const kir::TensorView* tv, + const TensorView* tv, kir::ExpressionEvaluator& expr_eval, const CompileOptions& options, bool zero_init = false) { const auto domain = tv->domain(); - const auto maybe_rfactor_domain = - domain->hasRFactor() ? domain->rfactorDomain() : domain->rootDomain(); + const auto maybe_rfactor_domain = domain->hasRFactor() + ? domain->getRFactorDomain() + : domain->getRootDomain(); - std::vector sizes; + std::vector sizes; for (const auto id : maybe_rfactor_domain) { if (id->isReduction() || id->isStride() || - id->iterType() == IterType::BroadcastWithoutStride) { + id->getIterType() == IterType::BroadcastWithoutStride) { continue; } sizes.push_back(id->extent()); @@ -321,7 +350,8 @@ uint64_t FusionExecutor::computeSharedMemory( const uint64_t data_size = dataTypeSize(smem_alloc->buffer()->dtype()); // Add padding to align dynamic shared memory if (align_padding) { - total = ceilDiv(total, data_size) * data_size; + const int align_size = 16; // always align to 16B/128b. 
+ total = ceilDiv(total, align_size) * align_size; } total += inferred_val.value() * data_size; } else { @@ -348,8 +378,7 @@ LaunchParams FusionExecutor::computeLaunchParams( auto data_cache = compileTimeDataCache(); - auto& lower = lowered_; - + auto lower = lowered_.get(); auto& used_tvs = getUsedTVs(); auto parallel_binding_ids_entry = executor_utils::caching::ExecutorCompileTimeEntry< @@ -364,9 +393,8 @@ LaunchParams FusionExecutor::computeLaunchParams( auto parallel_iter_extent_entry = executor_utils::caching::ExecutorCompileTimeEntry< executor_utils::caching::ParallelIterExtentMap>( - data_cache, [¶llel_binding_ids, &lower]() { - return executor_utils::getParallelIterExtents( - lower, parallel_binding_ids); + data_cache, [¶llel_binding_ids]() { + return executor_utils::getParallelIterExtents(parallel_binding_ids); }); auto& parallel_iter_extents = parallel_iter_extent_entry.get(); @@ -385,7 +413,7 @@ LaunchParams FusionExecutor::computeLaunchParams( executor_utils::caching::WarpPaddedParallelExtents>( data_cache, [¶llel_binding_ids, &lower]() { return executor_utils::getWarpPaddedExtentsInfo( - lower, parallel_binding_ids); + lower->kernel(), parallel_binding_ids); }); auto& warp_padded_extent_set = warp_padded_parallel_entry.get().warp_padded_extent_set; @@ -446,7 +474,9 @@ LaunchParams FusionExecutor::computeLaunchParams( auto val = expr_eval.evaluate(extent); TORCH_INTERNAL_ASSERT( val.has_value(), - "Tried to evaluate the extent of ", + "Tried to evaluate the extent, ", + extent->toInlineString(), + " for the ptype: ", p_type, " to set launch bounds but could not."); @@ -471,8 +501,12 @@ LaunchParams FusionExecutor::computeLaunchParams( } maximum_value = std::max(maximum_value, *val); } - expr_eval.bind(p_type, maximum_value); - launch_params.bind(maximum_value, p_type); + // Protect for size-0 tensors, they still have a value so would prefer to + // bind nothing than 0 + if (maximum_value > 0) { + expr_eval.bind(p_type, maximum_value); + launch_params.bind(maximum_value, p_type); + } } // Re-run the integer machine with all @@ -481,14 +515,15 @@ LaunchParams FusionExecutor::computeLaunchParams( expr_eval.precomputedIntegers()->evaluate(); } - const auto kernel = lowered_.kernel(); + const auto kernel = lowered_->kernel(); const auto& kernel_summary = kernel->summary(); // Calculate Dynamic Shared Memory Size // Add workspace for reduction and broadcast uint64_t reduction_broadcast_workspace = 0; const bool has_workspace = kernel_summary.has_block_reductions || - kernel_summary.has_grid_reductions || kernel_summary.has_block_broadcasts; + kernel_summary.has_grid_reductions || + kernel_summary.has_block_broadcasts || kernel_summary.has_grid_broadcasts; if (has_workspace && kernel_summary.largest_smem_data_type != DataType::Null) { // Not using nThreads here since it does not handle uninitialized value @@ -511,19 +546,35 @@ LaunchParams FusionExecutor::computeLaunchParams( true, reduction_broadcast_workspace); - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - const uint64_t static_smem_size = - computeSharedMemory(expr_eval, kernel_summary.static_smem_allocations); + // Check that requested smem size can be dynamically allocated. + // This check is only done once a kernel has been compiled, since + // maybe_available_dynamic_smem_ needs to be evaluated on + // a compiled kernel. + if (maybe_available_dynamic_smem_.has_value()) { + // Dynamic shared memory space that we can allocate without + // carving more space from L1. 
+ const uint64_t available_dynamic_smem_without_reconfiguration = + maybe_available_dynamic_smem_.value(); + // Maximum additional shared memory size we could request + // if we do re-configuration. + const uint64_t additional_dynamic_smem_available_through_reconfiguration = + device_smem_limit_ - configured_device_smem_; + + TORCH_INTERNAL_ASSERT( + (dynamic_smem_size) < + (available_dynamic_smem_without_reconfiguration + + additional_dynamic_smem_available_through_reconfiguration), + "The total shared memory allocation is larger than available memory.", + " Dynamic size: ", + dynamic_smem_size, + ". Available size: ", + maybe_available_dynamic_smem_.value(), + ". Configured smem size: ", + configured_device_smem_, + ". Device limit size: ", + device_smem_limit_); + } - TORCH_INTERNAL_ASSERT( - (dynamic_smem_size + static_smem_size) < max_device_smem, - "The total shared memory allocation is larger than available memory.", - " Dynamic size: ", - dynamic_smem_size, - ". Static size: ", - static_smem_size, - ". Available size: ", - max_device_smem); launch_params.setSmem(dynamic_smem_size); return launch_params; @@ -533,14 +584,14 @@ FusionExecutor::GlobalBuffers FusionExecutor::allocGlobalVals( kir::ExpressionEvaluator& expr_eval) { FUSER_PERF_SCOPE("FusionExecutor::AllocGlobalVals"); GlobalBuffers global_buffers; - const auto kernel = lowered_.kernel(); - const auto& kernel_summary = lowered_.kernel()->summary(); + const auto kernel = lowered_->kernel(); + const auto& kernel_summary = kernel->summary(); for (auto alloc : kernel_summary.global_allocations) { TORCH_INTERNAL_ASSERT( - alloc->buffer()->isA(), + alloc->buffer()->isA(), "Cannot allocate global buffers that are not tensors."); - auto tv = alloc->buffer()->as(); - if (kernel->isOutput(tv)) { + auto tv = alloc->buffer()->as(); + if (tv->isFusionOutput()) { continue; } if (alloc->zeroInit()) { @@ -558,30 +609,48 @@ FusionExecutor::GlobalBuffers FusionExecutor::allocGlobalVals( } std::vector FusionExecutor::allocOutputs( + const at::ArrayRef& inputs, kir::ExpressionEvaluator& expr_eval, const std::unordered_set& alias_indices) { FUSER_PERF_SCOPE("FusionExecutor::AllocOutputs"); - const auto kernel = lowered_.kernel(); + const auto kernel = lowered_->kernel(); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) std::vector outputs; - for (const auto i : c10::irange(kernel->outputs().size())) { - TORCH_INTERNAL_ASSERT( - kernel->outputs()[i]->isA(), - "Cannot allocate outputs that are not tensors."); - auto output = kernel->outputs()[i]->as(); - if (alias_indices.count(i) == 0) { - outputs.push_back( - inferAndAllocOutput(output, expr_eval, options_, false)); + for (const auto out_i : c10::irange(kernel->outputs().size())) { + // Dummy output. 
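The assertion above compares the requested dynamic shared memory against two budgets: what the compiled kernel can already receive, and what could still be gained by reconfiguring the shared-memory/L1 split. A condensed standalone sketch of that check is shown below; the parameter names are hypothetical, and in the patch the corresponding values come from cuFuncGetAttribute and the device properties (their initialization is not part of this hunk):

#include <cstdint>

// True when `requested` dynamic smem fits within what the kernel can use now
// plus what a smem/L1 reconfiguration could still provide. Assumes
// device_smem_limit >= configured_device_smem.
bool fitsDynamicSmemBudget(
    uint64_t requested,
    uint64_t available_without_reconfiguration, // queried from the compiled kernel
    uint64_t configured_device_smem,            // current per-block smem configuration
    uint64_t device_smem_limit) {               // absolute per-block smem limit
  const uint64_t additional_through_reconfiguration =
      device_smem_limit - configured_device_smem;
  return requested <
      available_without_reconfiguration + additional_through_reconfiguration;
}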
+ if (kernel->outputs()[out_i]->isFusionInput()) { + for (auto inp_i : c10::irange(kernel->inputs().size())) { + if (kernel->inputs()[inp_i] == kernel->outputs()[out_i]) { + TORCH_INTERNAL_ASSERT( + inp_i < inputs.size(), + "Issue with an input showing up as output, couldn't find input."); + TORCH_INTERNAL_ASSERT( + inputs[inp_i].isTensor(), + "Cannot register a scalar as an output in a fusion."); + outputs.push_back(inputs[inp_i].toTensor()); + break; + } + } } else { - // aliasing to inputs, no need to allocate real output - outputs.push_back(inferAndAlloc(output, {}, expr_eval, options_, false)); + TORCH_INTERNAL_ASSERT( + kernel->outputs()[out_i]->isA(), + "Cannot allocate outputs that are not tensors."); + auto output = kernel->outputs()[out_i]->as(); + if (alias_indices.count(out_i) == 0) { + outputs.push_back( + inferAndAllocOutput(output, expr_eval, options_, false)); + } else { + // aliasing to inputs, no need to allocate real output + outputs.push_back( + inferAndAlloc(output, {}, expr_eval, options_, false)); + } } } return outputs; } void FusionExecutor::setUsedTVs() { - auto used_vals = fusion_.usedMathVals(); + auto used_vals = fusion_->usedMathVals(); auto used_tvs = ir_utils::filterByType(used_vals); used_tvs_.clear(); @@ -595,24 +664,43 @@ std::vector FusionExecutor::runFusion( const LaunchParams& launch_constraints, const c10::optional& opt_code) { FUSER_PERF_SCOPE("FusionExecutor::RunFusion"); - + TORCH_INTERNAL_ASSERT(compiled()); TORCH_INTERNAL_ASSERT( fusion_id_ > 0, "Cannot run fusion, it was not compiled."); TORCH_INTERNAL_ASSERT( !opt_code.has_value() || outputs.empty(), "short cut input cache is not compatible with pre-allocated output"); + if (isDebugDumpEnabled(DebugDumpOption::FusionArgs)) { + std::cout << "Arguments for fusion" << fusion_id_ << ":" << std::endl + << "Inputs:" << std::endl; + for (const auto& input : inputs) { + if (input.isTensor()) { + const auto& input_tensor = input.toTensor(); + std::cout << " " << input_tensor.scalar_type() << " " + << input.toTensor().sizes() + << " (strides = " << input.toTensor().strides() << ")" + << std::endl; + } + } + std::cout << "Outputs:" << std::endl; + for (const auto& output : outputs) { + std::cout << " " << output.scalar_type() << " " << output.sizes() + << " (strides = " << output.strides() << ")" << std::endl; + } + std::cout << launch_constraints.toString(); + } + ExecutorEntry* executor_entry = nullptr; if (opt_code.has_value()) { executor_entry = &executor_entry_lookup_[*opt_code]; } - FusionGuard fg(&fusion_); c10::DeviceGuard dg(options_.device); auto stream = at::cuda::getCurrentCUDAStream(); executor_utils::initializeCudaContext(); - - LaunchParams launch_params; + TORCH_INTERNAL_ASSERT(lowered_); + launch_params_ = LaunchParams(); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) std::vector allocated_outputs = outputs; GlobalBuffers global_buffers; @@ -623,7 +711,7 @@ std::vector FusionExecutor::runFusion( // context manager to disable auto grad for `empty_cuda` calls later at::AutoDispatchBelowADInplaceOrView non_variable_type_mode; // take the short-cut for launch if we see a recorded input set again - launch_params = executor_entry->launch_params; + launch_params_ = executor_entry->launch_params; // only allocate outputs when not given if (outputs.empty()) { FUSER_PERF_SCOPE("ExecutorRunFusion::OutputAlloc"); @@ -642,7 +730,7 @@ std::vector FusionExecutor::runFusion( } } else { TORCH_INTERNAL_ASSERT( - outputs.size() == fusion_.outputs().size(), + outputs.size() == fusion_->outputs().size(), 
__func__, " provided number of outputs does match fusion output"); } @@ -672,38 +760,55 @@ std::vector FusionExecutor::runFusion( // code path to take when either: // 1. no opt_code is provided or // 2. `executor_entry` is not initialized - executor_utils::validateKernelInputs(&fusion_, inputs, options_.device); + executor_utils::validateKernelInputs(fusion_, inputs, options_.device); if (!evaluator_precomputed_integers_) { evaluator_precomputed_integers_ = - std::make_unique(&fusion_, lowered_); + std::make_unique(lowered_->kernel()); } kir::ExpressionEvaluator expr_eval; - evaluator_precomputed_integers_->bindKernelInputs(inputs); + evaluator_precomputed_integers_->bindKernelInputs( + lowered_->kernel(), inputs); expr_eval.precomputedIntegers() = evaluator_precomputed_integers_.get(); - launch_params = + launch_params_ = computeLaunchParams(launch_constraints, expr_eval, warp_size_); + // Recompile the kernel if the number of threads in the block has increased + if (launch_params_.nThreads() > block_size_high_water_mark) { + const auto kernel = lowered_->kernel(); + kernel_code_ = codegen::generateCudaKernel(kernel, kernelName()); + const auto structured_code = getStructuredCode(kernel_code_); + block_size_high_water_mark = launch_params_.nThreads(); + + std::tie(compiled_kernel_, last_compiler_log_) = + executor_utils::nvrtcCompile( + structured_code, + (kernelNamespace() + "::" + kernelName()).c_str(), + fusion_id_, + block_size_high_water_mark); + } + if (kernel()->summary().has_cooperative_grid_reduction) { #ifndef __HIP_PLATFORM_HCC__ int num_blocks_per_SM = -1; at::globalContext().getNVRTC().cuOccupancyMaxActiveBlocksPerMultiprocessor( &num_blocks_per_SM, compiled_kernel_.function, - (int)(launch_params.bdimx() * launch_params.bdimy() * launch_params.bdimz()), - (size_t)launch_params.smem()); + (int)(launch_params_.bdimx() * launch_params_.bdimy() * launch_params_.bdimz()), + (size_t)launch_params_.smem()); TORCH_INTERNAL_ASSERT( (int64_t)( num_blocks_per_SM * at::cuda::getDeviceProperties(options_.device.index()) - ->multiProcessorCount) >= launch_params.gdimx() * - launch_params.gdimy() * launch_params.gdimz(), + ->multiProcessorCount) >= launch_params_.gdimx() * + launch_params_.gdimy() * launch_params_.gdimz(), "Wanted to launch a cooperative kernel, however the number of blocks is greater than ", "what can be resident on the GPU at once. 
Need: ", - launch_params.gdimx() * launch_params.gdimy() * launch_params.gdimz(), + launch_params_.gdimx() * launch_params_.gdimy() * + launch_params_.gdimz(), " but limited to ", num_blocks_per_SM, " * ", @@ -716,16 +821,18 @@ std::vector FusionExecutor::runFusion( } executor_utils::validateVectorizedTensors( - &fusion_, inputs, outputs, lowered_, compileTimeDataCache(), expr_eval); - - auto& fusion = fusion_; + lowered_.get()->kernel(), + inputs, + outputs, + compileTimeDataCache(), + expr_eval); auto alias_indices_entry = executor_utils::caching::ExecutorCompileTimeEntry< executor_utils::caching::InputAliasIndices>( - compileTimeDataCache(), [&fusion]() { + compileTimeDataCache(), [&]() { return std::make_unique>>( - fusion.getInputAliasIndices()); + fusion_->getInputAliasIndices()); }); auto& alias_indices = alias_indices_entry.get(); @@ -736,14 +843,14 @@ std::vector FusionExecutor::runFusion( auto output_alias_indices_entry = executor_utils::caching::ExecutorCompileTimeEntry< executor_utils::caching::OutputAliasIndices>( - compileTimeDataCache(), [&fusion]() { + compileTimeDataCache(), [&]() { return std::make_unique>( - fusion.getOutputAliasIndices()); + fusion_->getOutputAliasIndices()); }); auto& output_alias_indices = output_alias_indices_entry.get(); - allocated_outputs = allocOutputs(expr_eval, output_alias_indices); + allocated_outputs = allocOutputs(inputs, expr_eval, output_alias_indices); for (const auto& entry : alias_indices) { TORCH_INTERNAL_ASSERT( @@ -753,7 +860,7 @@ std::vector FusionExecutor::runFusion( } else { // TODO: Update this as well; executor_utils::validateKernelOutputs( - &fusion_, allocated_outputs, options_.device); + fusion_, allocated_outputs, options_.device); } global_buffers = allocGlobalVals(expr_eval); @@ -768,7 +875,7 @@ std::vector FusionExecutor::runFusion( rand_offset = 4 * (std::ceil( allocated_outputs[0].numel() / - (4.0 * 128 * launch_params.gdimx())) + // NOLINT + (4.0 * 128 * launch_params_.gdimx())) + // NOLINT 1); } @@ -777,7 +884,7 @@ std::vector FusionExecutor::runFusion( if (executor_entry) { FUSER_PERF_SCOPE("ExecutorRunFusion::FillCacheEntry"); // record the the short-cut executor entry for the given input set; - executor_entry->launch_params = launch_params; + executor_entry->launch_params = launch_params_; executor_entry->io_alias_indices = alias_indices; for (const auto& output : allocated_outputs) { executor_entry->output_sizes.push_back(output.sizes().vec()); @@ -802,28 +909,31 @@ std::vector FusionExecutor::runFusion( kernel_arguments.push(inputs); kernel_arguments.push(allocated_outputs); kernel_arguments.push(global_buffers.buffers); - if (lowered_.kernel()->summary().is_stochastic) { + if (lowered_->kernel()->summary().is_stochastic) { kernel_arguments.appendPhiloxRNGSeed(rand_offset); } } if (isDebugDumpEnabled(DebugDumpOption::LaunchParam)) { - launch_params.print(); + launch_params_.print(); } - if (isDebugDumpEnabled(DebugDumpOption::PrintRuntimeArgs)) { + if (isDebugDumpEnabled(DebugDumpOption::KernelArgs)) { std::cout << "Arguments for kernel" << fusion_id_ << ":" << std::endl << "Inputs:" << std::endl; for (const auto& input : inputs) { if (input.isTensor()) { - std::cout << input.toTensor().scalar_type() << " " - << input.toTensor().sizes() << std::endl; + const auto& input_tensor = input.toTensor(); + std::cout << " " << input_tensor.scalar_type() << " " + << input.toTensor().sizes() + << " (strides = " << input.toTensor().strides() << ")" + << std::endl; } } std::cout << "Outputs:" << std::endl; for (const auto& 
output : allocated_outputs) { std::cout << " " << output.scalar_type() << " " << output.sizes() - << std::endl; + << " (strides = " << output.strides() << ")" << std::endl; } std::cout << "Reduction and semaphore buffers:" << std::endl; for (const auto& buffer : global_buffers.buffers) { @@ -836,24 +946,38 @@ std::vector FusionExecutor::runFusion( cudaEvent_t finish_event = {}; if (measure_kernel_time_ || - isDebugDumpEnabled(DebugDumpOption::EffectiveBandwidth)) { + isDebugDumpEnabled(DebugDumpOption::EffectiveBandwidth) || + isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) { cudaEventCreate(&start_event); cudaEventCreate(&finish_event); cudaEventRecord(start_event); } if (execute_kernel_) { + if (maybe_available_dynamic_smem_.has_value() && + launch_params_.smem() > maybe_available_dynamic_smem_.value()) { +#ifndef __HIP_PLATFORM_HCC__ + // Increase limit of dynamic shared memory if needed. + AT_CUDA_DRIVER_CHECK(at::globalContext().getNVRTC().cuFuncSetAttribute( + compiled_kernel_.function, + CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, + launch_params_.smem())); +#else + TORCH_INTERNAL_ASSERT( + false, "cuFuncSetAttribute not supported with HIP."); +#endif + } if (!kernel()->summary().has_cooperative_grid_reduction) { FUSER_PERF_SCOPE("ExecutorRunFusion::cuLaunchKernel"); AT_CUDA_DRIVER_CHECK(at::globalContext().getNVRTC().cuLaunchKernel( compiled_kernel_.function, - launch_params.gdimx(), - launch_params.gdimy(), - launch_params.gdimz(), - launch_params.bdimx(), - launch_params.bdimy(), - launch_params.bdimz(), - launch_params.smem(), + launch_params_.gdimx(), + launch_params_.gdimy(), + launch_params_.gdimz(), + launch_params_.bdimx(), + launch_params_.bdimy(), + launch_params_.bdimz(), + launch_params_.smem(), stream, kernel_arguments.getBuffer(), nullptr)); @@ -863,13 +987,13 @@ std::vector FusionExecutor::runFusion( AT_CUDA_DRIVER_CHECK( at::globalContext().getNVRTC().cuLaunchCooperativeKernel( compiled_kernel_.function, - launch_params.gdimx(), - launch_params.gdimy(), - launch_params.gdimz(), - launch_params.bdimx(), - launch_params.bdimy(), - launch_params.bdimz(), - launch_params.smem(), + launch_params_.gdimx(), + launch_params_.gdimy(), + launch_params_.gdimz(), + launch_params_.bdimx(), + launch_params_.bdimy(), + launch_params_.bdimz(), + launch_params_.smem(), stream, kernel_arguments.getBuffer())); #else @@ -880,7 +1004,8 @@ std::vector FusionExecutor::runFusion( } if (measure_kernel_time_ || - isDebugDumpEnabled(DebugDumpOption::EffectiveBandwidth)) { + isDebugDumpEnabled(DebugDumpOption::EffectiveBandwidth) || + isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) { cudaEventRecord(finish_event); cudaEventSynchronize(start_event); cudaEventSynchronize(finish_event); @@ -888,21 +1013,23 @@ std::vector FusionExecutor::runFusion( cudaEventDestroy(start_event); cudaEventDestroy(finish_event); - if (isDebugDumpEnabled(DebugDumpOption::EffectiveBandwidth)) { - size_t bytes = 0; - // Figure how many bytes are inputs, outputs, and temporary buffers - for (auto input : inputs) { - if (input.isTensor()) { - bytes += input.toTensor().numel() * - dataTypeSize(aten_to_data_type(input.toTensor().scalar_type())); - } - } - for (const auto& output : allocated_outputs) { - bytes += output.numel() * - dataTypeSize(aten_to_data_type(output.scalar_type())); + bytes_processed_ = 0; + // Figure how many bytes are inputs, outputs, and temporary buffers + for (auto input : inputs) { + if (input.isTensor()) { + bytes_processed_ += input.toTensor().numel() * + 
dataTypeSize(aten_to_data_type(input.toTensor().scalar_type())); } + } + for (const auto& output : allocated_outputs) { + bytes_processed_ += output.numel() * + dataTypeSize(aten_to_data_type(output.scalar_type())); + } + + if (isDebugDumpEnabled(DebugDumpOption::EffectiveBandwidth)) { double gb_per_s = - ((double)bytes / ((double)kernel_time_ms_ / 1000)) / (double)1.0e9; + ((double)bytes_processed_ / ((double)kernel_time_ms_ / 1000)) / + (double)1.0e9; std::cout << "kernel" << fusion_id_ << " run in " << kernel_time_ms_ << " ms, achieved: " << gb_per_s << " GB/s" << std::endl; } @@ -924,7 +1051,9 @@ void FusionExecutor::compileRtc( } fusion_id_ = 1; options_ = CompileOptions(); - compiled_kernel_ = executor_utils::nvrtcCompile(scode, name, fusion_id_); + + std::tie(compiled_kernel_, last_compiler_log_) = + executor_utils::nvrtcCompile(scode, name, fusion_id_); } void FusionExecutor::runRtc( diff --git a/torch/csrc/jit/codegen/cuda/executor.h b/torch/csrc/jit/codegen/cuda/executor.h index 523f2aa0e4b2..ab5175b22a13 100644 --- a/torch/csrc/jit/codegen/cuda/executor.h +++ b/torch/csrc/jit/codegen/cuda/executor.h @@ -35,9 +35,9 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable { void compileFusion( Fusion* fusion, - CompileOptions options = CompileOptions(), const at::ArrayRef& inputs = {}, - const LaunchParams& launch_constraints = LaunchParams()); + const LaunchParams& launch_constraints = LaunchParams(), + CompileOptions options = CompileOptions()); std::vector runFusion( const at::ArrayRef& inputs, @@ -55,7 +55,7 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable { // function to query whether a `FusionExecutor` has a compiled kernel to // execute bool compiled() const { - return fusion_id_ != -1; + return fusion_id_ != -1 && lowered_; }; void evictCache(size_t cache_id) { @@ -85,7 +85,8 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable { executor_utils::caching::ExecutorCompileTimeInfoCache; kir::Kernel* kernel() const { - return lowered_.kernel(); + TORCH_INTERNAL_ASSERT(lowered_); + return lowered_->kernel(); } //! Internal knob used for debugging/profiling only @@ -107,6 +108,32 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable { return measure_kernel_time_ ? kernel_time_ms_ : 0; } + //! Returns the number of bytes processed last kernel execution + int64_t bytesProcessed() const { + return bytes_processed_; + } + + //! Returns the launch parameters from the last kernel execution + LaunchParams lastLaunchParams() const { + return launch_params_; + } + + //! Returns the string of the compiled kernel + std::string kernelString() const { + return kernel_code_; + } + + //! Returns the latest compile log + std::string compilerLog() const { + return last_compiler_log_; + } + + std::string kernelName() const { + std::stringstream ss; + ss << "kernel" << fusion_id_; + return ss.str(); + } + //! Internal tests only. Compiles CUDA code with NVRTC directly from //! string. This util provides a path to test runtime code, i.e. the resource //! strings. 
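Two driver-API patterns used in the launch path above are worth spelling out: raising a kernel's dynamic shared-memory limit only when the requested size exceeds the attribute queried at compile time, and bounding a cooperative launch by the number of blocks that can be resident at once. The sketch below uses cuda.h directly with plain error returns, whereas the patch routes the same calls through at::globalContext().getNVRTC() and AT_CUDA_DRIVER_CHECK; the helper names here are illustrative:

#include <cstdint>
#include <cuda.h>

// Opt in to more dynamic shared memory only when a launch actually needs it.
CUresult maybeRaiseDynamicSmemLimit(CUfunction func, int requested_smem_bytes) {
  int current_limit = 0;
  CUresult err = cuFuncGetAttribute(
      &current_limit, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, func);
  if (err != CUDA_SUCCESS || requested_smem_bytes <= current_limit) {
    return err;
  }
  return cuFuncSetAttribute(
      func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, requested_smem_bytes);
}

// Upper bound on resident blocks for a cooperative launch: occupancy per SM
// times the SM count must cover the whole grid, or the launch is rejected.
CUresult maxResidentBlocks(
    CUfunction func,
    int block_size,
    size_t dynamic_smem_bytes,
    int sm_count,
    int64_t* out_max_blocks) {
  int blocks_per_sm = 0;
  CUresult err = cuOccupancyMaxActiveBlocksPerMultiprocessor(
      &blocks_per_sm, func, block_size, dynamic_smem_bytes);
  *out_max_blocks = static_cast<int64_t>(blocks_per_sm) * sm_count;
  return err;
}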
@@ -132,12 +159,6 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable { std::vector zero_init; }; - std::string kernelName() const { - std::stringstream ss; - ss << "kernel" << fusion_id_; - return ss.str(); - } - static std::string kernelNamespace() { return "CudaCodeGen"; } @@ -164,6 +185,7 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable { // skip allocating real storage for those, but still maintain its spot to // maintain the indexing from output aliases to inputs std::vector allocOutputs( + const at::ArrayRef& inputs, kir::ExpressionEvaluator& expr_eval, const std::unordered_set& alias_indices = {}); @@ -178,10 +200,24 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable { } private: - Fusion fusion_; - CompileOptions options_; - size_t max_device_smem = std::numeric_limits().max(); + + //! Current configured total shared mem size from cudaDeviceProp + size_t configured_device_smem_ = std::numeric_limits().max(); + + //! Available shared memory space for dynamic allocation for the current + //! compiled kernel at the current shared memory/L1 configuration + c10::optional maybe_available_dynamic_smem_ = c10::nullopt; + + //! Absolute limit of all available shared mem space from cudaDeviceProp + size_t device_smem_limit_ = std::numeric_limits().max(); + + // Assuming sm70 or above: + // limit of statically allocated smem is 48 KB: + // See: + // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x + // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-8-x + const int max_static_smem_ = 48 << 10; int warp_size_ = 0; executor_utils::NvrtcFunction compiled_kernel_; @@ -192,12 +228,28 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable { int fusion_id_ = -1; static int fusion_id_counter_; - GpuLower lowered_; + std::unique_ptr lowered_; + // Copy of lowered_->kernel() + Fusion* fusion_ = nullptr; + + // Track the block size this kernel was compiled with. If the block size + // increases, recompile to adjust maxregister count. + int64_t block_size_high_water_mark = 1; // lookup table to take short cut to retrieve recorded information in order to // launch kernels without re-inference parameters. std::unordered_map executor_entry_lookup_; + // Compile time information caching. This is used for shape inference + // support. The cache stores graph information that are available + // without shape information so that each shape inference call will + // not need to re-compute them. + ExecutorCompileTimeInfoCache compile_time_info_cache_; + + // Cached expr eval + std::unique_ptr evaluator_precomputed_integers_ = + nullptr; + // Profiling support: knob to control wheter we actually execute the // kernel on the GPU or not bool execute_kernel_ = true; @@ -205,21 +257,24 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable { // Profiling support: knob to enable measuring kernel execution time bool measure_kernel_time_ = false; - // The last kernel execution time, if measure_kernel_time_ is true + // Profiling support: the last kernel execution time, if measure_kernel_time_ + // is true float kernel_time_ms_ = 0; + // Profiling support: the last kernel Bytes processed + int64_t bytes_processed_ = 0; + + // Profiling support: the last launch param used + LaunchParams launch_params_; + // Profiling support: knob to disable caching of launch params bool disable_parameter_cache_ = false; - // Compile time information caching. This is used for shape inference - // support. 
The cache stores graph information that are available - // without shape information so that each shape inference call will - // not need to re-compute them. - ExecutorCompileTimeInfoCache compile_time_info_cache_; + // Profiling support: kept copy of the cuda kernel + std::string kernel_code_; - // Cached expr eval - std::unique_ptr evaluator_precomputed_integers_ = - nullptr; + // Profiling support: nvrtc log for debugging + std::string last_compiler_log_; }; } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/executor_kernel_arg.cpp b/torch/csrc/jit/codegen/cuda/executor_kernel_arg.cpp index 968570c1086d..da5667f9facc 100644 --- a/torch/csrc/jit/codegen/cuda/executor_kernel_arg.cpp +++ b/torch/csrc/jit/codegen/cuda/executor_kernel_arg.cpp @@ -1,4 +1,3 @@ -#include #include // Extract size and strides @@ -65,7 +64,7 @@ std::unique_ptr getTensorArg(int nDims) { false, "Tried to generate a tensor to run a generated kernel with ", nDims, - " dimensions, however it must be a size 0 to 8 dimensional tensor."); + " dimensions, however only 0 to 8 dimensional tensor are supported."); } return nullptr; } @@ -89,6 +88,10 @@ std::unique_ptr getTensorArg( return getTensorArg(nDims); case c10::ScalarType::Int: return getTensorArg(nDims); + case c10::ScalarType::ComplexFloat: + return getTensorArg, INDEX_MODE>(nDims); + case c10::ScalarType::ComplexDouble: + return getTensorArg, INDEX_MODE>(nDims); default: TORCH_CHECK( false, @@ -98,8 +101,6 @@ std::unique_ptr getTensorArg( } } -} // namespace - std::unique_ptr getTensorArg( c10::ScalarType dtype, int nDims, @@ -117,20 +118,73 @@ std::unique_ptr getTensorArg( return nullptr; } +} // namespace + // Push a tensor to the arguments void KernelArgumentHolder::push(const at::Tensor& tensor) { changed_ = true; - int nDims = tensor.ndimension(); - - c10::ScalarType dtype = tensor.scalar_type(); - std::unique_ptr tensor_arg = - getTensorArg(dtype, nDims, index_mode_); - tensor_arg->setPointer(tensor.data_ptr()); - for (const auto i : c10::irange(nDims)) { - tensor_arg->setSize(i, tensor.sizes()[i]); - tensor_arg->setStride(i, tensor.strides()[i]); + if (is_cpu_scalar(tensor)) { + switch (tensor.scalar_type()) { + case c10::ScalarType::Double: + arguments_.push_back( + std::make_unique< + CpuScalarTensorArg>>( + tensor.data_ptr()[0])); + break; + case c10::ScalarType::Float: + arguments_.push_back( + std::make_unique>>( + tensor.data_ptr()[0])); + break; + case c10::ScalarType::Half: + arguments_.push_back( + std::make_unique< + CpuScalarTensorArg>>( + tensor.data_ptr()[0])); + break; + case c10::ScalarType::BFloat16: + arguments_.push_back( + std::make_unique< + CpuScalarTensorArg>>( + tensor.data_ptr()[0])); + break; + case c10::ScalarType::Bool: + arguments_.push_back( + std::make_unique>>( + tensor.data_ptr()[0])); + break; + case c10::ScalarType::Long: + arguments_.push_back( + std::make_unique< + CpuScalarTensorArg>>( + tensor.data_ptr()[0])); + break; + case c10::ScalarType::Int: + arguments_.push_back( + std::make_unique< + CpuScalarTensorArg>>( + tensor.data_ptr()[0])); + break; + default: + TORCH_CHECK( + false, + "Dtype: ", + tensor.scalar_type(), + " not currently supported in code generated kernels."); + } + } else { + int nDims = tensor.ndimension(); + + c10::ScalarType dtype = tensor.scalar_type(); + std::unique_ptr tensor_arg = + getTensorArg(dtype, nDims, index_mode_); + tensor_arg->setPointer(tensor.data_ptr()); + for (const auto i : c10::irange(nDims)) { + tensor_arg->setSize(i, tensor.sizes()[i]); + tensor_arg->setStride(i, 
tensor.strides()[i]); + } + arguments_.push_back(std::move(tensor_arg)); } - arguments_.push_back(std::move(tensor_arg)); } // Push a scalar or integer to the arguments @@ -143,6 +197,10 @@ void KernelArgumentHolder::push(const IValue& val) { auto scalar_val = val.toScalar(); switch (scalar_val.type()) { // NOLINTNEXTLINE(bugprone-branch-clone) + case c10::ScalarType::ComplexDouble: + arguments_.push_back( + std::make_unique(scalar_val.toComplexDouble())); + return; case c10::ScalarType::Double: arguments_.push_back(std::make_unique(scalar_val.toDouble())); return; diff --git a/torch/csrc/jit/codegen/cuda/executor_kernel_arg.h b/torch/csrc/jit/codegen/cuda/executor_kernel_arg.h index d306683c43dc..c135328a3acc 100644 --- a/torch/csrc/jit/codegen/cuda/executor_kernel_arg.h +++ b/torch/csrc/jit/codegen/cuda/executor_kernel_arg.h @@ -4,6 +4,7 @@ #include #include #include +#include namespace torch { namespace jit { @@ -18,10 +19,8 @@ struct TensorArgCodegen { }; T* data; - // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays) - nvfuser_index_t size[N]; - // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays) - nvfuser_index_t stride[N]; + std::array size; + std::array stride; constexpr int nDims() { return N; } @@ -33,6 +32,7 @@ struct TensorArgCodegen { } }; +// 0-Dim GPU based tensor template struct TensorArgCodegen { T& operator[](nvfuser_index_t ind) { @@ -51,6 +51,17 @@ struct TensorArgCodegen { } }; +// Specialization for 0-dim case that's easy to pass in a CPU based tensor +// without memcpy +template +struct CpuScalarTensorCodegen { + T& operator[](int) { + return data; + }; + + T data; +}; + struct ArgAbstract { virtual ~ArgAbstract() = default; virtual void* arg() = 0; @@ -59,35 +70,39 @@ struct ArgAbstract { struct PhiloxCudaStateArg : public ArgAbstract { at::PhiloxCudaState val_; PhiloxCudaStateArg(at::PhiloxCudaState _val) : val_(_val){}; - // NOLINTNEXTLINE(modernize-use-override,cppcoreguidelines-explicit-virtual-functions) - void* arg() { + void* arg() override { return &val_; } }; struct LongArg : public ArgAbstract { int64_t val_; - explicit LongArg(int64_t _val) : val_(_val){}; - // NOLINTNEXTLINE(modernize-use-override,cppcoreguidelines-explicit-virtual-functions) - void* arg() { + explicit LongArg(int64_t _val) : val_(_val) {} + void* arg() override { return &val_; } }; struct DoubleArg : public ArgAbstract { double val_; - explicit DoubleArg(double _val) : val_(_val){}; - // NOLINTNEXTLINE(modernize-use-override,cppcoreguidelines-explicit-virtual-functions) - void* arg() { + explicit DoubleArg(double _val) : val_(_val) {} + void* arg() override { + return &val_; + } +}; + +struct ComplexDoubleArg : public ArgAbstract { + c10::complex val_; + explicit ComplexDoubleArg(c10::complex _val) : val_(_val) {} + void* arg() override { return &val_; } }; struct BoolArg : public ArgAbstract { bool val_; - explicit BoolArg(bool _val) : val_(_val){}; - // NOLINTNEXTLINE(modernize-use-override,cppcoreguidelines-explicit-virtual-functions) - void* arg() { + explicit BoolArg(bool _val) : val_(_val) {} + void* arg() override { return &val_; } }; @@ -119,9 +134,20 @@ struct TensorArg : public TensorArgAbstract { } }; -std::unique_ptr getTensorArg( - c10::ScalarType dtype, - int nDims); +template +struct CpuScalarTensorArg : public ArgAbstract { + CPU_TENSOR_TYPE instance_; + + CpuScalarTensorArg() = delete; + + explicit CpuScalarTensorArg(decltype(CPU_TENSOR_TYPE::data) _data) { + instance_.data = _data; + } + + void* arg() override { + return 
&instance_; + } +}; class KernelArgumentHolder { public: diff --git a/torch/csrc/jit/codegen/cuda/executor_utils.cpp b/torch/csrc/jit/codegen/cuda/executor_utils.cpp index 13cdc29099ed..ef3d48aeb234 100644 --- a/torch/csrc/jit/codegen/cuda/executor_utils.cpp +++ b/torch/csrc/jit/codegen/cuda/executor_utils.cpp @@ -5,21 +5,24 @@ #include #include +#include #include #include #include #include -#include +#include #include #include #include +#include #include #include #include #include #include #include +#include #include #include #include @@ -27,6 +30,9 @@ #include #include #include +#include +#include +#include #include #include @@ -69,9 +75,12 @@ std::string kernelPreamble() { // Base classes and helpers ss << nvfuser_resources::tensor_cu; + ss << nvfuser_resources::type_traits_cu; + ss << nvfuser_resources::array_cu; ss << nvfuser_resources::random_numbers_cu; ss << nvfuser_resources::helpers_cu; ss << nvfuser_resources::index_utils_cu; + ss << nvfuser_resources::tuple_cu; // Synchronization classes if (std::getenv("PYTORCH_NVFUSER_USE_BLOCK_SYNC_ATOMIC")) { @@ -88,6 +97,8 @@ ss << nvfuser_resources::broadcast_cu; ss << nvfuser_resources::welford_cu; ss << nvfuser_resources::warp_cu; + ss << nvfuser_resources::tensorcore_cu; + ss << nvfuser_resources::fused_reduction_cu; // Random utilities ss << nvfuser_resources::PhiloxCudaStateRaw_cu; @@ -110,13 +121,23 @@ bool validateKernelArgTensor( return false; } + if (is_cpu_scalar(arg) && !param->as()->isCpuScalar()) { + msg << "Argument is CPU Scalar Tensor, but parameter is not.\n"; + return false; + } + + if (!is_cpu_scalar(arg) && !arg.is_cuda()) { + msg << "Argument is a CPU tensor which is not supported in fusions.\n"; + return false; + } + // Check the rank of the tensors. size_t arg_dim = arg.dim(); // Note: This requires current Fusion to be active. // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - size_t param_dim = - TensorDomain::noReductions(param->as()->getRootDomain()) - .size(); + size_t param_dim = TensorDomain::noReductions( + param->as()->getMaybeRFactorDomain()) + .size(); // see [Note - broadcast support in integration] // Because of broadcasting support handled in integration, we relax the rank // check as necessary. @@ -126,7 +147,7 @@ return false; } - if (arg.device() != device) { + if (!is_cpu_scalar(arg) && arg.device() != device) { msg << "Argument is on device that is not compiled for." << "\n"; return false; @@ -157,6 +178,12 @@ case at::ScalarType::Bool: match = param_data_type == DataType::Bool; break; + case at::ScalarType::ComplexFloat: + match = param_data_type == DataType::ComplexFloat; + break; + case at::ScalarType::ComplexDouble: + match = param_data_type == DataType::ComplexDouble; + break; default: msg << "Argument element type, " << arg_data_type << ", is not supported."
<< "\n"; @@ -184,6 +211,10 @@ bool validateKernelArgScalar( case c10::ScalarType::Long: match = param_type == DataType::Int || param_type == DataType::Int32; break; + case c10::ScalarType::ComplexDouble: + match = param_type == DataType::ComplexDouble || + param_type == DataType::ComplexFloat; + break; case c10::ScalarType::Double: match = param_type == DataType::Double || param_type == DataType::Float || param_type == DataType::Half || param_type == DataType::BFloat16; @@ -245,6 +276,10 @@ bool checkSameStride(const std::vector& tensors) { // Return true if all the tensors are contiguous and have the same striding bool checkSameContiguity(const std::vector& tensors) { + if (tensors.size() < 2) { + return true; + } + auto reference = tensors.front(); if (!reference.isTensor()) { return false; } @@ -277,6 +312,7 @@ bool checkValidMisalignedTensors( // Only check input tensors return checkSameStride(inp_tensors); } else if (!out_tv.empty() && out_tensors.empty()) { + // out_tensors is empty unless outputs are given to runFusion. // Assume out tensors are contiguous return checkSameContiguity(inp_tensors); } else { @@ -339,146 +375,289 @@ void validateKernelOutputs( !mismatch, "Found one or more invalid arguments: ", msg.str()); } -bool canVectorize(const IValue& aten_val, int word_size) { - if (!aten_val.isTensor()) { - return false; - } +namespace { - const auto& aten_tensor = aten_val.toTensor(); +// Finds a fusion input or output tensor to validate its strides +// for vectorization. +// Returns pairs, each consisting of a flag indicating it's a fusion input +// and an integer position within the input or output tensor list. +std::vector> getVectorizedFusionInputOutput( + TensorView* producer_tv, + TensorView* consumer_tv, + Fusion* fusion) { + std::vector> vectorized_input_output; - if (reinterpret_cast(aten_tensor.data_ptr()) % - (word_size * aten_tensor.dtype().itemsize()) != - 0) { - return false; - } + // When the producer is a fusion input, validate only the producer + // and assume the consumer is contiguous. Similarly, when the + // consumer is a fusion output, validate the consumer and assume the + // producer is contiguous. - for (size_t i = aten_tensor.ndimension(); i > 0; i--) { - if (aten_tensor.size(i - 1) != 1) { - if (aten_tensor.size(aten_tensor.ndimension() - 1) % word_size != 0 || - aten_tensor.stride(aten_tensor.ndimension() - 1) != 1) { - return false; - } - break; - } + if (producer_tv->isFusionInput()) { + auto producer_it = std::find( + fusion->inputs().begin(), fusion->inputs().end(), producer_tv); + TORCH_INTERNAL_ASSERT( + producer_it != fusion->inputs().end(), + "Could not find ", + producer_tv, + " in fusion inputs."); + auto pos = std::distance(fusion->inputs().begin(), producer_it); + vectorized_input_output.push_back( + std::make_pair(true, static_cast(pos))); + } else { + // If not fusion input, assume it's fully contiguous, so nothing + // to check with respect to strides.
+ TORCH_INTERNAL_ASSERT( + std::all_of( + producer_tv->domain()->contiguity().begin(), + producer_tv->domain()->contiguity().end(), + [](bool contig) { return contig; }), + "Unsupported pattern of vectorization: ", + consumer_tv->definition()->toString()); } - for (auto stride : aten_tensor.strides()) { - if (stride != 1 && stride % word_size != 0) { - return false; - } + if (consumer_tv->isFusionOutput()) { + auto consumer_it = std::find( + fusion->outputs().begin(), fusion->outputs().end(), consumer_tv); + TORCH_INTERNAL_ASSERT( + consumer_it != fusion->outputs().end(), + "Could not find ", + consumer_tv, + " in fusion outputs."); + auto pos = std::distance(fusion->outputs().begin(), consumer_it); + vectorized_input_output.push_back( + std::make_pair(false, static_cast(pos))); + } else { + // If not fusion input, assume it's fully contiguous, so nothing + // to check with respect to strides. + TORCH_INTERNAL_ASSERT( + std::all_of( + consumer_tv->domain()->contiguity().begin(), + consumer_tv->domain()->contiguity().end(), + [](bool contig) { return contig; }), + "Unsupported pattern of vectorization: ", + consumer_tv->definition()->toString()); } - return true; + return vectorized_input_output; } -bool canVectorize( - TensorView* fusion_tv, - int word_size, - GpuLower& lower, - kir::ExpressionEvaluator& expr_eval) { - IterDomain* last_root_dim = nullptr; - // TODO: Should this be rfactor instead of root?? - for (size_t i = fusion_tv->getRootDomain().size(); i > 0; i--) { - auto r_id = fusion_tv->getRootDomain()[i - 1]; - if (r_id->isReduction() || r_id->isBroadcast()) { +//! Returns the information of vectorized input/output tensors +//! in the given fusion. +std::unique_ptr getVectorizedTensorValidationInfo( + kir::Kernel* kernel) { + auto vectorized_tensor_info_ptr = + std::make_unique(); + + for (const auto& vector_info : kernel->summary().vectorized_set_info) { + auto consumer_tv = vector_info.consumer_tv; + auto producer_tv = vector_info.producer_tv; + + auto vector_dim = vector_info.vectorized_leaf_id; + const auto is_aligned = + vector_dim->getParallelType() == ParallelType::Vectorize; + + // Find fusion inputs and outputs that are used with misaligned + // vectorization. + if (!is_aligned) { + TORCH_INTERNAL_ASSERT( + producer_tv->isFusionInput() || consumer_tv->isFusionOutput(), + "MisalignedVectorize is assumed to be used with either input or output tensor"); + if (consumer_tv->getMemoryType() == MemoryType::Global && + producer_tv->getMemoryType() == MemoryType::Local) { + vectorized_tensor_info_ptr->global_out_misaligned_tv.insert( + consumer_tv); + } else if ( + producer_tv->getMemoryType() == MemoryType::Global && + consumer_tv->getMemoryType() == MemoryType::Local) { + vectorized_tensor_info_ptr->global_inp_misaligned_tv.insert( + producer_tv); + } else { + TORCH_INTERNAL_ASSERT( + false, + "Unsupported memory configuration for misaligned vectorization."); + } + } + + // Collect information on corresponding fusion input and output + // tensors to verify strides. + auto inp_or_out_info = + getVectorizedFusionInputOutput(producer_tv, consumer_tv, kernel); + + // If both producer and consumer are contig and intermediate, + // nothing to validate with respect to strides. 
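As a side note on the classification above: misaligned vectorization is only recognized for a global producer read into registers, or a register buffer written to a global output; any other memory configuration is rejected. A hypothetical standalone classifier mirroring that rule (the enums here are local to the sketch, not the fuser's own types):

#include <stdexcept>

enum class MemType { Local, Shared, Global };
enum class MisalignedVecKind { GlobalInput, GlobalOutput };

// Mirrors the producer/consumer memory-type rule for misaligned vectorization.
inline MisalignedVecKind classifyMisalignedVectorize(
    MemType producer, MemType consumer) {
  if (producer == MemType::Global && consumer == MemType::Local) {
    return MisalignedVecKind::GlobalInput; // global -> registers
  }
  if (producer == MemType::Local && consumer == MemType::Global) {
    return MisalignedVecKind::GlobalOutput; // registers -> global
  }
  throw std::runtime_error(
      "Unsupported memory configuration for misaligned vectorization.");
}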
+ if (inp_or_out_info.empty()) { continue; } - last_root_dim = r_id; - break; - } - if (last_root_dim == nullptr) { - return false; - } + // Misaligned vectorize only allows from input to local or local + // to output + if (!is_aligned) { + TORCH_INTERNAL_ASSERT(inp_or_out_info.size() == 1); + } - auto last_dim_size = - expr_eval.evaluate(lower.lowerValue(last_root_dim->extent())); + for (const auto& inp_or_out : inp_or_out_info) { + const bool is_input = inp_or_out.first; + const int pos = inp_or_out.second; - if (!last_dim_size.has_value()) { - return false; + if (is_aligned) { + auto& pos_list = is_input + ? vectorized_tensor_info_ptr->aligned_vectorized_inp_tensor_pos + : vectorized_tensor_info_ptr->aligned_vectorized_out_tensor_pos; + pos_list.push_back(pos); + } else { + auto& map = is_input + ? vectorized_tensor_info_ptr->inp_misaligned_tensors_pos + : vectorized_tensor_info_ptr->out_misaligned_tensors_pos; + map.emplace_back(pos); + } + } } - if (last_dim_size.value() % word_size != 0) { - return false; + return vectorized_tensor_info_ptr; +} + +// Make sure the root domain(s) comprising the vectorized leaf domain +// have the (merged) extent that is divisible by the vectorization +// word size. +void validateAlignedVectorizeExtents( + const VectorizedSetInfo& info, + kir::ExpressionEvaluator& expr_eval) { + TORCH_INTERNAL_ASSERT( + !info.contig_root_ids.empty(), + "No root ID found for vectorization with ", + info.consumer_tv->toString(), + " and ", + info.producer_tv->toString()); + + int64_t vectorized_merged_domain_extent = 1; + for (auto id : info.contig_root_ids) { + auto extent_val = expr_eval.evaluate(id->extent()); + TORCH_INTERNAL_ASSERT( + extent_val.has_value(), + "Error vectorizing, ", + info.consumer_tv->toString(), + " as the extent of a vectorized root domain, ", + id->toString(), + ", is unknown."); + vectorized_merged_domain_extent *= extent_val.value(); } - return true; + TORCH_INTERNAL_ASSERT( + vectorized_merged_domain_extent % info.word_size == 0, + "Error vectorizing, ", + info.consumer_tv->toString(), + " as the extent of the indexed domain, ", + vectorized_merged_domain_extent, + ", is not divisible by vector word size ", + info.word_size); } -namespace { +void validateAlignedVectorizedFusionInputOutput( + const IValue& aten_val, + int word_size, + TensorView* tv) { + TORCH_INTERNAL_ASSERT(aten_val.isTensor()); -// Check if there's any split that is non-divisible and vectorized. If -// found, Vectorize is illegal. -void validateVectorizedSplits( - kir::Kernel* kernel, - kir::ExpressionEvaluator& expr_eval) { - for (const auto& extent_factor : kernel->summary().splits_to_validate) { - auto input_extent = expr_eval.evaluate(extent_factor.first); - auto split_factor = expr_eval.evaluate(extent_factor.second); - TORCH_INTERNAL_ASSERT( - input_extent.has_value(), - "Could not check if a split with vectorization is divisible because the extent, ", - kir::toString(extent_factor.first), - ", is not possible to evaluate."); - TORCH_INTERNAL_ASSERT( - input_extent.has_value(), - "Could not check if a split with vectorization is divisible because the split factor, ", - kir::toString(extent_factor.second), - ", is not possible to evaluate."); + const auto& aten_tensor = aten_val.toTensor(); + + TORCH_INTERNAL_ASSERT( + reinterpret_cast(aten_tensor.data_ptr()) % + (word_size * aten_tensor.dtype().itemsize()) == + 0, + "Vectorization of ", + tv->toString(), + " not possible as the memory address is not aligned. 
", + "Address: ", + aten_tensor.data_ptr(), + ", vector word size: ", + word_size, + ", data type: ", + aten_tensor.dtype()); + + // Traverse strides from the right-most domains. The rightmost + // domain must have stride 1. + int64_t cur_contig_stride = 1; + bool still_rightmost = true; + for (auto i = aten_tensor.ndimension() - 1; i >= 0; --i) { + const auto stride = aten_tensor.strides().at(i); + // If this domain is contiguous, then not necessary to check the + // stride. Otherwise, stride must be 1 if it's rightmost or + // divisible by word_size. TORCH_INTERNAL_ASSERT( - input_extent.value() % split_factor.value() == 0, - "Non-divisible split with vectorization is detected. ", - "Extent: ", - input_extent.value(), - ". Factor: ", - split_factor.value()); + stride == cur_contig_stride || (still_rightmost && stride == 1) || + (!still_rightmost && stride % word_size == 0), + "Vectorization of ", + tv->toString(), + " with word size ", + word_size, + " not possible due to invalid stride.", + " Domain: ", + tv->axis(i)->toString(), + ", stride: ", + stride) + // If the domain is size-1, the next domain is still considered + // rightmost. + const auto size = aten_tensor.sizes().at(i); + still_rightmost = still_rightmost && size == 1; + cur_contig_stride = stride * size; } } -} // namespace - -// Misaligned vectorization check. Currently misaligned vectorization is limited -// to global-register and register-global load/store patterns. However, this -// could be improved to include shared memory. -void validateVectorizedTensors( - Fusion* fusion, +void validateAlignedVectorizedTensors( + kir::Kernel* kernel, const at::ArrayRef& inputs, const std::vector& outputs, - GpuLower& lower, caching::ExecutorCompileTimeInfoCache* data_cache, kir::ExpressionEvaluator& expr_eval) { - FUSER_PERF_SCOPE("FusionExecutor::validateVectorizedTensors"); - auto tensor_vectorization_validation_entry = executor_utils::caching::ExecutorCompileTimeEntry< executor_utils::caching::VectorizedTensorValidation>( - data_cache, [fusion, &lower]() { - return executor_utils::getVectorizedTensorValidationInfo( - fusion, lower); + data_cache, [kernel]() { + return executor_utils::getVectorizedTensorValidationInfo(kernel); }); - // Validate all the canVectorizes: - for (auto it : tensor_vectorization_validation_entry.get() - .inp_pos_to_word_size_map_to_verify) { - TORCH_INTERNAL_ASSERT( - canVectorize(inputs[it.first], it.second), - "Error vectorizing, ", - fusion->inputs()[it.first], - " as input provided does not allowed vectorization by word size, ", - it.second); + // Verify extents of aligned vectorized tensors + for (const auto& vec_info : kernel->summary().vectorized_set_info) { + if (vec_info.vectorized_leaf_id->getParallelType() == + ParallelType::Vectorize) { + validateAlignedVectorizeExtents(vec_info, expr_eval); + } } - if (outputs.size() > 0) { - for (auto it : tensor_vectorization_validation_entry.get() - .out_pos_to_word_size_map_to_verify) { - TORCH_INTERNAL_ASSERT( - canVectorize(outputs[it.first], it.second), - "Error vectorizing, ", - fusion->outputs()[it.first], - " as output provided does not allowed vectorization by word size, ", - it.second); + // Validate input and output tensors with aligend + // vectorization. 
+ for (auto pos : tensor_vectorization_validation_entry.get() + .aligned_vectorized_inp_tensor_pos) { + auto tv = kernel->inputs().at(pos)->as(); + auto word_size = kernel->summary().vectorized_accesses.at(tv); + validateAlignedVectorizedFusionInputOutput(inputs[pos], word_size, tv); + } + + if (!outputs.empty()) { + for (auto pos : tensor_vectorization_validation_entry.get() + .aligned_vectorized_out_tensor_pos) { + auto tv = kernel->outputs().at(pos)->as(); + auto word_size = kernel->summary().vectorized_accesses.at(tv); + validateAlignedVectorizedFusionInputOutput(outputs[pos], word_size, tv); } } +} + +// Misaligned vectorization check. Currently misaligned vectorization is limited +// to global-register and register-global load/store patterns. However, this +// could be improved to include shared memory. +void validateMisalignedVectorizedTensors( + kir::Kernel* kernel, + const at::ArrayRef& inputs, + const std::vector& outputs, + caching::ExecutorCompileTimeInfoCache* data_cache, + kir::ExpressionEvaluator& expr_eval) { + auto tensor_vectorization_validation_entry = + executor_utils::caching::ExecutorCompileTimeEntry< + executor_utils::caching::VectorizedTensorValidation>( + data_cache, [kernel]() { + return executor_utils::getVectorizedTensorValidationInfo(kernel); + }); std::vector inp_misaligned_tensors; std::vector out_misaligned_tensors; @@ -510,8 +689,53 @@ void validateVectorizedTensors( inp_misaligned_tensors, out_misaligned_tensors), "All global tensors must have the same stride for misaligned vectorization."); +} - validateVectorizedSplits(lower.kernel(), expr_eval); +// Check if there's any split that is non-divisible and vectorized. If +// found, Vectorize is illegal. +void validateVectorizedSplits( + kir::Kernel* kernel, + kir::ExpressionEvaluator& expr_eval) { + for (const auto& extent_factor : kernel->summary().splits_to_validate) { + auto input_extent = expr_eval.evaluate(extent_factor.first); + auto split_factor = expr_eval.evaluate(extent_factor.second); + TORCH_INTERNAL_ASSERT( + input_extent.has_value(), + "Could not check if a split with vectorization is divisible because the extent, ", + extent_factor.first->toString(), + ", is not possible to evaluate."); + TORCH_INTERNAL_ASSERT( + input_extent.has_value(), + "Could not check if a split with vectorization is divisible because the split factor, ", + extent_factor.second->toString(), + ", is not possible to evaluate."); + TORCH_INTERNAL_ASSERT( + input_extent.value() % split_factor.value() == 0, + "Non-divisible split with vectorization is detected. ", + "Extent: ", + input_extent.value(), + ". 
Factor: ", + split_factor.value()); + } +} + +} // namespace + +void validateVectorizedTensors( + kir::Kernel* kernel, + const at::ArrayRef& inputs, + const std::vector& outputs, + caching::ExecutorCompileTimeInfoCache* data_cache, + kir::ExpressionEvaluator& expr_eval) { + FUSER_PERF_SCOPE("FusionExecutor::validateVectorizedTensors"); + + validateAlignedVectorizedTensors( + kernel, inputs, outputs, data_cache, expr_eval); + + validateMisalignedVectorizedTensors( + kernel, inputs, outputs, data_cache, expr_eval); + + validateVectorizedSplits(kernel, expr_eval); } kir::ExpressionEvaluator bindKernelInputs( @@ -530,15 +754,15 @@ kir::ExpressionEvaluator bindKernelInputs( for (const auto i : c10::irange(inputs.size())) { const auto input = inputs[i]; - if (auto tensor_input = dynamic_cast(input)) { + if (auto tensor_input = dynamic_cast(input)) { TORCH_INTERNAL_ASSERT( aten_inputs[i].isTensor(), "Something went wrong configuring launch. Inputs no longer match at index:", i); const auto aten_tensor = aten_inputs[i].toTensor(); - const auto root_domain = - kir::TensorDomain::noReductions(tensor_input->domain()->rootDomain()); + const auto root_domain = TensorDomain::noReductions( + tensor_input->domain()->getMaybeRFactorDomain()); TORCH_INTERNAL_ASSERT( aten_tensor.ndimension() == static_cast(root_domain.size()), "Something went wrong configuring launch. Inputs no longer match."); @@ -546,6 +770,11 @@ kir::ExpressionEvaluator bindKernelInputs( for (const auto dim : c10::irange(root_domain.size())) { const auto extent = root_domain[dim]->extent(); const auto value = aten_tensor.sizes()[dim]; + if (value == 0 && tensor_input->uses().empty()) { + // If there's no uses, ignore there's a size-0 dimension. + continue; + } + TORCH_INTERNAL_ASSERT(value != 0, "Cannot handle size-0 dimensions"); bool should_bind = true; if (check_consistency) { const auto prev_value = expr_eval.evaluate(extent); @@ -553,7 +782,7 @@ kir::ExpressionEvaluator bindKernelInputs( TORCH_CHECK( *prev_value == value, "Attempting to bind ", - kir::toString(extent), + extent->toString(), " to ", value, "but it's already set to ", @@ -561,14 +790,16 @@ kir::ExpressionEvaluator bindKernelInputs( should_bind = false; } } - if (should_bind && !extent->isConst()) { + if (should_bind && !extent->isConstScalar()) { expr_eval.bind(extent, value); } } // NOLINTNEXTLINE: https://bugs.llvm.org/show_bug.cgi?id=48525 } else if (input->isScalar() && input->dtype() == DataType::Int) { TORCH_INTERNAL_ASSERT( - aten_inputs[i].type()->kind() == c10::TypeKind::IntType); + aten_inputs[i].type()->kind() == c10::TypeKind::IntType, + "kernel expected Scalar Int inputs, but found", + aten_inputs[i].type()->str()); expr_eval.bind(input, aten_inputs[i].toInt()); } } @@ -599,14 +830,19 @@ ExpressionEvaluator bindFusionInputs( "Something went wrong configuring launch. Inputs do not match."); auto aten_tensor = aten_inputs[i].toTensor(); - auto root_dom = TensorDomain::noReductions(cg_tensor->getRootDomain()); + auto root_dom = + TensorDomain::noReductions(cg_tensor->getMaybeRFactorDomain()); TORCH_INTERNAL_ASSERT( aten_tensor.ndimension() == (int64_t)root_dom.size(), "Something went wrong configuring launch. Inputs do not match."); - for (const auto dim : c10::irange(root_dom.size())) { const auto extent = root_dom[dim]->extent(); const auto value = aten_tensor.sizes()[dim]; + if (value == 0 && cg_tensor->uses().empty()) { + // If there's no uses, ignore there's a size-0 dimension. 
+ continue; + } + TORCH_INTERNAL_ASSERT(value != 0, "Cannot handle size-0 dimensions"); const auto prev_value = evaluator.evaluate(extent); if (prev_value.has_value()) { TORCH_CHECK( @@ -625,7 +861,9 @@ ExpressionEvaluator bindFusionInputs( inputs[i]->getValType().value() == ValType::Scalar && inputs[i]->getDataType().value() == DataType::Int) { TORCH_INTERNAL_ASSERT( - aten_inputs[i].type()->kind() == c10::TypeKind::IntType); + aten_inputs[i].type()->kind() == c10::TypeKind::IntType, + "fusion expected Scalar Int inputs, but found", + aten_inputs[i].type()->str()); evaluator.bind(inputs[i], aten_inputs[i].toInt()); } } @@ -644,7 +882,7 @@ void initializeCudaContext() { } } -NvrtcFunction nvrtcCompile( +std::pair nvrtcCompile( const std::string& code, const std::string& func_name, int id, @@ -652,6 +890,8 @@ NvrtcFunction nvrtcCompile( FUSER_PERF_SCOPE("executor_utils::NVRTC"); initializeCudaContext(); + std::stringstream ptxas_log; + const auto prop = at::cuda::getCurrentDeviceProperties(); int major = 0, minor = 0; @@ -697,24 +937,19 @@ NvrtcFunction nvrtcCompile( "--std=c++14", compute.c_str(), "-default-device"}; #endif - const char* disable_fastmath = getenv("PYTORCH_NVFUSER_DISABLE_FASTMATH"); - if (!disable_fastmath || (atoi(disable_fastmath) == 0)) { - args.push_back("--use_fast_math"); - } else { - TORCH_WARN_ONCE( - "fast math disabled in nvfuser, try set `PYTORCH_NVFUSER_DISABLE_FASTMATH=0`"); - } - - const char* disable_fma = getenv("PYTORCH_NVFUSER_DISABLE_FMA"); - // int disable_fma_flag = disable_fma ? atoi(disable_fma) : 0; - if (disable_fma && atoi(disable_fma)) { + const bool disable_fma = isDisabled(DisableOption::Fma); #ifdef __HIP_PLATFORM_HCC__ + if (disable_fma) { TORCH_WARN_ONCE( "PYTORCH_CUDA_FUSER_DISABLE_FMA is not supported on ROCm, ignoring"); + } #else + if (disable_fma) { args.push_back("--fmad=false"); -#endif + } else { + args.push_back("--fmad=true"); } +#endif #ifndef NDEBUG // Add line info to generated kernels @@ -734,7 +969,8 @@ NvrtcFunction nvrtcCompile( std::vector info_log; unsigned int log_size = 8196; - if (isDebugDumpEnabled(DebugDumpOption::PrintPtxasLog)) { + if (isDebugDumpEnabled(DebugDumpOption::PrintPtxasLog) || + isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) { // show register usage in compilation log if (compile_to_sass) { args.push_back("--ptxas-options"); @@ -796,14 +1032,20 @@ NvrtcFunction nvrtcCompile( // The maximum possible count allowed by ptxas is 255 max_register = static_cast( std::min(effective_max_reg_per_warp / warp_size, 255)); - if (compile_to_sass) { max_register_usage += std::to_string(max_register); + args.push_back("--ptxas-options"); args.push_back(max_register_usage.c_str()); } else { options.push_back(CU_JIT_MAX_REGISTERS); option_vals.push_back((void*)(intptr_t)max_register); } + + ptxas_log << "\nCompile options: "; + for (auto arg : args) { + ptxas_log << arg << " "; + } + ptxas_log << " ; block size=" << opt_block_size.value() << "\n"; } #endif @@ -816,26 +1058,21 @@ NvrtcFunction nvrtcCompile( const auto result = at::globalContext().getNVRTC().nvrtcCompileProgram( program, args.size(), args.data()); - if (result != NVRTC_SUCCESS) { - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - size_t logsize; - at::globalContext().getNVRTC().nvrtcGetProgramLogSize(program, &logsize); - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - std::vector log(logsize); - at::globalContext().getNVRTC().nvrtcGetProgramLog(program, log.data()); + size_t logsize = 0; + 
at::globalContext().getNVRTC().nvrtcGetProgramLogSize(program, &logsize); + std::vector log(logsize); + at::globalContext().getNVRTC().nvrtcGetProgramLog(program, log.data()); + + if (result != NVRTC_SUCCESS) { TORCH_INTERNAL_ASSERT( false, code.c_str(), "\nCUDA NVRTC compile error: ", log.data()); - } else if (isDebugDumpEnabled(DebugDumpOption::PrintPtxasLog)) { - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - size_t logsize; - at::globalContext().getNVRTC().nvrtcGetProgramLogSize(program, &logsize); - std::vector log(logsize); - at::globalContext().getNVRTC().nvrtcGetProgramLog(program, log.data()); + } + ptxas_log << log.data() << std::endl; + if (isDebugDumpEnabled(DebugDumpOption::PrintPtxasLog)) { std::cout << log.data() << std::endl; } - AT_CUDA_NVRTC_CHECK(result); } @@ -976,7 +1213,7 @@ NvrtcFunction nvrtcCompile( compiled_kernel_.module, lowered_kernel_name)); - return compiled_kernel_; + return {compiled_kernel_, ptxas_log.str()}; } namespace caching { @@ -1037,7 +1274,7 @@ template class ExecutorCompileTimeEntry; } // namespace caching std::vector getParallelBindingsIterDomains( - GpuLower& lower, + GpuLower* lower, const std::vector& used_tvs) { std::vector parallel_ids; for (auto tv : used_tvs) { @@ -1047,8 +1284,8 @@ std::vector getParallelBindingsIterDomains( // Want to keep the broadcast dimensions if they are not resolved // TODO: piping down the parallel dimension map here would // be helpful - auto& parallel_map = lower.caParallelMap(); - if (parallel_map.getConcreteMappedID(id) == id) { + if (lower->caMap()->getConcreteMappedID(id, IdMappingMode::LOOP) == + id) { parallel_ids.push_back(id); } } else { @@ -1062,46 +1299,46 @@ std::vector getParallelBindingsIterDomains( return parallel_ids; } +namespace { + void insertParallelExtent( - GpuLower& lower, IterDomain* binding_id, const std::unique_ptr& parallel_iter_extents_ptr) { - auto kir_extent = lower.lowerValue(binding_id->extent()); + auto extent = binding_id->extent(); const auto it = parallel_iter_extents_ptr->find(binding_id->getParallelType()); if (it != parallel_iter_extents_ptr->end()) { - it->second.push_back(kir_extent); + it->second.push_back(extent); } else { parallel_iter_extents_ptr->operator[](binding_id->getParallelType()) = { - kir_extent}; + extent}; } } +} // namespace + std::unique_ptr getParallelIterExtents( - GpuLower& lower, std::vector& parallel_binding_ids) { auto parallel_iter_extents_ptr = std::make_unique(); for (auto id : parallel_binding_ids) { - insertParallelExtent(lower, id, parallel_iter_extents_ptr); + insertParallelExtent(id, parallel_iter_extents_ptr); } return parallel_iter_extents_ptr; } std::unique_ptr getSimplifiedParallelIterExtents( - GpuLower& lower, + GpuLower* lower, std::vector& parallel_binding_ids) { auto parallel_iter_extents_ptr = std::make_unique(); - auto& parallel_map = lower.caParallelMap(); + const auto& ca_map = lower->caMap(); std::vector mapped; - bool is_tidx_warp_padded = lower.getWarpPaddedParallelInfo().is_tidx_padded; + bool is_tidx_warp_padded = lower->getWarpPaddedParallelInfo().is_tidx_padded; for (auto id : parallel_binding_ids) { if (std::any_of( - mapped.begin(), - mapped.end(), - [id, ¶llel_map](IterDomain* mapped_id) { - return parallel_map.areMapped(mapped_id, id); + mapped.begin(), mapped.end(), [id, &ca_map](IterDomain* mapped_id) { + return ca_map->areMapped(mapped_id, id, IdMappingMode::LOOP); })) { if (id->getParallelType() != ParallelType::TIDx || !is_tidx_warp_padded) { continue; @@ -1109,7 +1346,8 @@ std::unique_ptr 
getSimplifiedParallelIterExtents( } insertParallelExtent( - lower, parallel_map.getConcreteMappedID(id), parallel_iter_extents_ptr); + ca_map->getConcreteMappedID(id, IdMappingMode::LOOP), + parallel_iter_extents_ptr); mapped.push_back(id); } @@ -1117,7 +1355,7 @@ std::unique_ptr getSimplifiedParallelIterExtents( } std::unique_ptr getWarpPaddedExtentsInfo( - GpuLower& lower, + kir::Kernel* kernel, std::vector& parallel_binding_ids) { auto warp_padded_extent_info_ptr = std::make_unique(); @@ -1125,7 +1363,6 @@ std::unique_ptr getWarpPaddedExtentsInfo( warp_padded_extent_info_ptr->warp_padded_extent_set; auto& warp_padded_constant = warp_padded_extent_info_ptr->warp_padded_constant; - auto kernel = lower.kernel(); bool has_warp_reduction = kernel->getWarpPaddedParallelInfo().has_warp_reduction; @@ -1135,11 +1372,11 @@ std::unique_ptr getWarpPaddedExtentsInfo( if (has_warp_reduction) { if (id->hasPaddingToMultipleOfWarp() || kernel->isParallelTypePadded(id->getParallelType())) { - auto kir_extent = lower.lowerValue(id->extent()); - warp_padded_extent_set.insert(kir_extent); + auto extent = id->extent(); + warp_padded_extent_set.insert(extent); auto padded_value = id->getMaybeSizeAfterPadding(); if (padded_value.has_value()) { - warp_padded_constant[kir_extent] = padded_value.value(); + warp_padded_constant[extent] = padded_value.value(); } } } @@ -1147,122 +1384,6 @@ std::unique_ptr getWarpPaddedExtentsInfo( return warp_padded_extent_info_ptr; } -std::unique_ptr getVectorizedTensorValidationInfo( - Fusion* fusion, - GpuLower& lower) { - auto vectorized_tensor_info_ptr = - std::make_unique(); - auto& tv_to_vector_word_size = - vectorized_tensor_info_ptr->tv_to_vector_word_size; - auto& global_inp_misaligned_tv = - vectorized_tensor_info_ptr->global_inp_misaligned_tv; - auto& global_out_misaligned_tv = - vectorized_tensor_info_ptr->global_out_misaligned_tv; - - kir::ExpressionEvaluator expr_eval; - - // Find all vectorized tensors and their word size - for (auto expr : fusion->exprs()) { - if (!expr->isA() || - expr->as()->getUnaryOpType() != UnaryOpType::Set) { - continue; - } - auto uop = expr->as(); - if (!uop->out()->isA() || !uop->in()->isA()) { - continue; - } - auto out_tv = uop->out()->as(); - auto in_tv = uop->in()->as(); - IterDomain* vector_dim = nullptr; - for (auto id : out_tv->domain()->domain()) { - if (id->getParallelType() == ParallelType::Vectorize || - id->getParallelType() == ParallelType::MisalignedVectorize) { - TORCH_INTERNAL_ASSERT( - vector_dim == nullptr, - "Found multiple vectorized dimensions on tensor ", - out_tv); - vector_dim = id; - } - } - if (vector_dim == nullptr) { - continue; - } - auto vector_word_size = - expr_eval.evaluate(lower.lowerValue(vector_dim->extent())); - TORCH_INTERNAL_ASSERT( - vector_word_size.has_value(), - "Non constant vector dimension found in ", - out_tv); - tv_to_vector_word_size[out_tv] = vector_word_size.value(); - tv_to_vector_word_size[in_tv] = vector_word_size.value(); - - if (vector_dim->getParallelType() == ParallelType::MisalignedVectorize) { - if (out_tv->getMemoryType() == MemoryType::Global && - in_tv->getMemoryType() == MemoryType::Local) { - global_out_misaligned_tv.insert(out_tv); - } else if ( - in_tv->getMemoryType() == MemoryType::Global && - out_tv->getMemoryType() == MemoryType::Local) { - global_inp_misaligned_tv.insert(in_tv); - } else { - TORCH_INTERNAL_ASSERT( - false, - "Unsupported memory configuration for misaligned vectorization."); - } - } - } - - // Check striding information on input and outputs as well as 
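The de-duplication in getSimplifiedParallelIterExtents above keeps one extent per loop-mapped equivalence class. A generic, standalone sketch of that idiom, with an arbitrary predicate standing in for ca_map->areMapped(...) (placeholder types, illustrative names):

#include <algorithm>
#include <vector>

// Keep one representative per equivalence class, where equivalence is decided
// by a caller-supplied predicate equiv(a, b).
template <typename T, typename Equiv>
std::vector<T> uniqueByEquivalence(const std::vector<T>& in, Equiv equiv) {
  std::vector<T> reps;
  for (const T& x : in) {
    const bool seen = std::any_of(
        reps.begin(), reps.end(), [&](const T& r) { return equiv(r, x); });
    if (!seen) {
      reps.push_back(x);
    }
  }
  return reps;
}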
size information - // of all - auto& inp_misaligned_tensors_pos = - vectorized_tensor_info_ptr->inp_misaligned_tensors_pos; - auto& out_misaligned_tensors_pos = - vectorized_tensor_info_ptr->out_misaligned_tensors_pos; - auto& inp_pos_to_word_size_map_to_verify = - vectorized_tensor_info_ptr->inp_pos_to_word_size_map_to_verify; - auto& out_pos_to_word_size_map_to_verify = - vectorized_tensor_info_ptr->out_pos_to_word_size_map_to_verify; - - for (auto entry : tv_to_vector_word_size) { - auto tv = entry.first; - auto word_size = entry.second; - if (tv->isFusionInput()) { - auto inp_it = - std::find(fusion->inputs().begin(), fusion->inputs().end(), tv); - TORCH_INTERNAL_ASSERT( - inp_it != fusion->inputs().end(), - "Could not find ", - tv, - " in fusion inputs."); - auto inp_pos = std::distance(fusion->inputs().begin(), inp_it); - - if (global_inp_misaligned_tv.find(tv) != global_inp_misaligned_tv.end()) { - inp_misaligned_tensors_pos.emplace_back(inp_pos); - } else { - // Shouldn't visit same pos twice here, assert ? - inp_pos_to_word_size_map_to_verify[inp_pos] = word_size; - } - } else if (tv->isFusionOutput()) { - auto out_it = - std::find(fusion->outputs().begin(), fusion->outputs().end(), tv); - TORCH_INTERNAL_ASSERT( - out_it != fusion->outputs().end(), - "Could not find ", - tv, - " in provided fusion outputs."); - auto out_pos = std::distance(fusion->outputs().begin(), out_it); - - if (global_out_misaligned_tv.find(tv) != global_out_misaligned_tv.end()) { - out_misaligned_tensors_pos.emplace_back(out_pos); - } else { - out_pos_to_word_size_map_to_verify[out_pos] = word_size; - } - } - } - - return vectorized_tensor_info_ptr; -} - } // namespace executor_utils } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/executor_utils.h b/torch/csrc/jit/codegen/cuda/executor_utils.h index d851be48991f..37817838f386 100644 --- a/torch/csrc/jit/codegen/cuda/executor_utils.h +++ b/torch/csrc/jit/codegen/cuda/executor_utils.h @@ -28,28 +28,16 @@ namespace executor_utils { // Include all the functions we might need in generated code std::string kernelPreamble(); -// TODO(kir): rewrite in terms of Kernel inputs void validateKernelInputs( Fusion* fusion, const at::ArrayRef& inputs, const c10::Device& device); -// TODO(kir): rewrite in terms of Kernel outputs void validateKernelOutputs( Fusion* fusion, const std::vector& outputs, const c10::Device& device); -// Returns if vectorizing the aten value by word size is possible -bool canVectorize(const IValue& aten_val, int word_size); - -// Returns if vectorizing the aten value by word size is possible -bool canVectorize( - TensorView* fusion_tv, - int word_size, - GpuLower& lower, - kir::ExpressionEvaluator& expr_eval); - //! 
Bind kernel input values to runtime values kir::ExpressionEvaluator bindKernelInputs( const at::ArrayRef& aten_inputs, @@ -67,7 +55,8 @@ struct NvrtcFunction { void initializeCudaContext(); -NvrtcFunction nvrtcCompile( +// Returns executable function and the ptxas log from compilation +std::pair nvrtcCompile( const std::string& code, const std::string& func_name, int id, @@ -112,7 +101,7 @@ class ParallelBindingIterDomains { class ParallelIterExtentMap { public: using DataType = - std::unordered_map, TypeHash>; + std::unordered_map, TypeHash>; static const CompileTimeEntryType EntryType = CompileTimeEntryType::PARALLEL_ITER_EXTENT_MAP; }; @@ -133,7 +122,7 @@ class ParallelIterExtentMap { class SimplifiedParallelIterExtentMap { public: using DataType = - std::unordered_map, TypeHash>; + std::unordered_map, TypeHash>; static const CompileTimeEntryType EntryType = CompileTimeEntryType::SIMPLIFIED_PARALLEL_ITER_EXTENT_MAP; }; @@ -141,8 +130,8 @@ class SimplifiedParallelIterExtentMap { //! WarpPaddedExtentsInfo: //! Auxiliary data type for entry class WarpPaddedParallelExtents struct WarpPaddedExtentsInfo { - std::unordered_set warp_padded_extent_set; - std::unordered_map warp_padded_constant; + std::unordered_set warp_padded_extent_set; + std::unordered_map warp_padded_constant; }; //! Compile-time info to be cached in each FusionExecutor: @@ -159,13 +148,18 @@ class WarpPaddedParallelExtents { //! VectorizedTensorInfo: //! Auxiliary data type for entry class VectorizedTensorValidation struct VectorizedTensorInfo { + //! Aligned vectorized fusion inputs + std::vector aligned_vectorized_inp_tensor_pos; + //! Aligned vectorized fusion outputs + std::vector aligned_vectorized_out_tensor_pos; + //! Misaligned vectorized input tensors std::unordered_set global_inp_misaligned_tv; + //! Misaligned vectorized output tensors std::unordered_set global_out_misaligned_tv; - std::unordered_map tv_to_vector_word_size; + //! Positions of misaligned input tensors std::vector inp_misaligned_tensors_pos; + //! Positions of misaligned output tensors std::vector out_misaligned_tensors_pos; - std::unordered_map inp_pos_to_word_size_map_to_verify; - std::unordered_map out_pos_to_word_size_map_to_verify; }; //! Compile-time info to be cached in each FusionExecutor: @@ -284,42 +278,33 @@ class ExecutorCompileTimeEntry { //! Returns the vector of tensorviews that will be used to bind parallel //! dimensions. std::vector getParallelBindingsIterDomains( - GpuLower& lower, + GpuLower* lower, const std::vector& used_tvs); using ParallelExtentMap = - std::unordered_map, TypeHash>; + std::unordered_map, TypeHash>; //! Returns the extents of all parallel binding iterdomains corresponding //! to each parallel type. std::unique_ptr getParallelIterExtents( - GpuLower& lower, std::vector& parallel_binding_ids); //! Returns the simplified set of extents necessary for launch parameter //! binding. std::unique_ptr getSimplifiedParallelIterExtents( - GpuLower& lower, + GpuLower* lower, std::vector& parallel_binding_ids); //! Returns the symbolic or constant extetns of warp padded parallel //! iterdomains in the given vector. std::unique_ptr getWarpPaddedExtentsInfo( - GpuLower& lower, + kir::Kernel* lower, std::vector& parallel_binding_ids); -//! Returns the position information of vectorized input/output tensors -//! in the given fusion. 
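Since the header now declares nvrtcCompile as returning both the executable function and the ptxas log, call sites can unpack the two with structured bindings. A self-contained sketch of that return convention (placeholder types, not the real NvrtcFunction):

#include <iostream>
#include <string>
#include <utility>

struct CompiledKernel {
  void* function = nullptr; // stands in for the CUfunction handle
};

// Return the compiled artifact together with its build log.
std::pair<CompiledKernel, std::string> compileWithLog(const std::string& src) {
  CompiledKernel kernel;
  std::string log =
      "ptxas info    : compiled " + std::to_string(src.size()) + " bytes\n";
  return {kernel, log};
}

int main() {
  auto [kernel, log] = compileWithLog("__global__ void k() {}");
  (void)kernel; // only the log is used in this sketch
  std::cout << log;
  return 0;
}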
-std::unique_ptr getVectorizedTensorValidationInfo( - Fusion* fusion, - GpuLower& lower); - -// TODO(kir): rewrite in terms of Kernel tensors void validateVectorizedTensors( - Fusion* fusion, + kir::Kernel* kernel, const at::ArrayRef& inputs, const std::vector& outputs, - GpuLower& lower, caching::ExecutorCompileTimeInfoCache* data_cache, kir::ExpressionEvaluator& expr_eval); diff --git a/torch/csrc/jit/codegen/cuda/expr_evaluator.h b/torch/csrc/jit/codegen/cuda/expr_evaluator.h index ced4b59a7831..5630743b6f69 100644 --- a/torch/csrc/jit/codegen/cuda/expr_evaluator.h +++ b/torch/csrc/jit/codegen/cuda/expr_evaluator.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/torch/csrc/jit/codegen/cuda/fusion.cpp b/torch/csrc/jit/codegen/cuda/fusion.cpp index d9d71e53c414..33cf499bc18b 100644 --- a/torch/csrc/jit/codegen/cuda/fusion.cpp +++ b/torch/csrc/jit/codegen/cuda/fusion.cpp @@ -8,10 +8,9 @@ #include #include #include +#include #include -#include - namespace torch { namespace jit { namespace fuser { @@ -31,19 +30,16 @@ FusionGuard::~FusionGuard() { Fusion* FusionGuard::getCurFusion() { return ACTIVE_FUSION; } +void FusionGuard::setCurFusion(Fusion* fusion) { + ACTIVE_FUSION = fusion; +} void swap(Fusion& a, Fusion& b) noexcept { FUSER_PERF_SCOPE("Fusion swap"); using std::swap; - // Swap the content - swap(a.val_set_, b.val_set_); - swap(a.expr_set_, b.expr_set_); - swap(a.val_deque_, b.val_deque_); - - swap(a.val_type_name_map_, b.val_type_name_map_); - swap(a.expr_name_counter_, b.expr_name_counter_); + swap(static_cast(a), static_cast(b)); swap(a.inputs_, b.inputs_); swap(a.outputs_, b.outputs_); @@ -51,27 +47,6 @@ void swap(Fusion& a, Fusion& b) noexcept { swap(a.io_alias_, b.io_alias_); swap(a.permuted_input_map_, b.permuted_input_map_); swap(a.permuted_output_map_, b.permuted_output_map_); - - // Fixup the Statement::fusion_ links for a - for (auto val : a.val_set_) { - val->fusion_ = &a; - } - for (auto expr : a.expr_set_) { - expr->fusion_ = &a; - } - - // Fixup the Statement::fusion_ links for b - for (auto val : b.val_set_) { - val->fusion_ = &b; - } - for (auto expr : b.expr_set_) { - expr->fusion_ = &b; - } -} - -Fusion::Fusion(const Fusion& other) { - FUSER_PERF_SCOPE("Fusion copy"); - Fusion::copy(&other, this); } std::unique_ptr Fusion::segment( @@ -82,30 +57,21 @@ std::unique_ptr Fusion::segment( IrCloner Fusion::copy(const Fusion* from, Fusion* to) { to->clear(); - IrCloner ir_cloner(to); + auto ir_cloner = IrContainer::copy(from, to); - for (auto val : from->val_set_) { - to->val_set_.insert(ir_cloner.clone(val)); - } - - for (auto expr : from->expr_set_) { - to->expr_set_.insert(ir_cloner.clone(expr)); - } - - for (auto val : from->val_deque_) { - to->val_deque_.push_back(ir_cloner.clone(val)); - } - - for (auto val : from->val_set_) { + for (auto val : from->vals_) { ir_cloner.clone(val)->setDefinition(ir_cloner.clone(val->definition_)); ir_cloner.clone(val)->setUses(ir_cloner.clone(val->uses_)); } - to->val_type_name_map_ = from->val_type_name_map_; - to->expr_name_counter_ = from->expr_name_counter_; - to->inputs_ = ir_cloner.clone(from->inputs_); to->outputs_ = ir_cloner.clone(from->outputs_); + for (auto inp : to->inputs_) { + inp->setIsFusionInput(true); + } + for (auto out : to->outputs_) { + out->setIsFusionOutput(true); + } // TODO: put this into ir_cloner instead for (const auto& entry : from->io_alias_) { @@ -117,9 +83,22 @@ IrCloner Fusion::copy(const Fusion* from, Fusion* to) { to->permuted_input_map_ = from->permuted_input_map_; 
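The rewritten swap(Fusion&, Fusion&) above forwards the base subobject (presumably via static_cast to IrContainer&) before swapping Fusion's own members. A minimal standalone sketch of that base-class swap idiom:

#include <utility>
#include <vector>

struct Base {
  std::vector<int> owned_nodes;
  friend void swap(Base& a, Base& b) noexcept {
    std::swap(a.owned_nodes, b.owned_nodes);
  }
};

struct Derived : Base {
  std::vector<int> inputs;
  std::vector<int> outputs;
  friend void swap(Derived& a, Derived& b) noexcept {
    // Swap the Base subobjects first, then the Derived-only members.
    swap(static_cast<Base&>(a), static_cast<Base&>(b));
    std::swap(a.inputs, b.inputs);
    std::swap(a.outputs, b.outputs);
  }
};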
to->permuted_output_map_ = from->permuted_output_map_; + to->all_tv_uses_valid_ = from->all_tv_uses_valid_; + // This should never be true on copy, but copying for completeness. + to->is_during_update_uses_ = from->is_during_update_uses_; + return ir_cloner; } +// Clang tidy complains when using default constructor for IrContainer instead +// of copy constructor. Fusion::copy has a call to IrContainer::copy, so it's +// redundant to use the IrContainer copy constructor, but it is harmless since +// Fusion::copy starts by calling clear(). +Fusion::Fusion(const Fusion& other) : IrContainer(other) { + FUSER_PERF_SCOPE("Fusion copy"); + Fusion::copy(&other, this); +} + Fusion::Fusion(Fusion&& other) noexcept { FUSER_PERF_SCOPE("Fusion move"); swap(*this, other); @@ -147,36 +126,22 @@ Fusion::~Fusion() { void Fusion::clear() noexcept { FUSER_PERF_SCOPE("Fusion clear"); - // Free the owned values - for (auto ptr : val_set_) { - delete ptr; - } - - // Free the owned expressions - for (auto ptr : expr_set_) { - delete ptr; - } - - val_set_.clear(); - val_deque_.clear(); - expr_set_.clear(); - - for (auto& kv : val_type_name_map_) { - kv.second = 0; - } - - expr_name_counter_ = 0; + IrContainer::clear(); inputs_.clear(); outputs_.clear(); io_alias_.clear(); + permuted_input_map_.clear(); permuted_output_map_.clear(); + + all_tv_uses_valid_ = false; + is_during_update_uses_ = false; } void Fusion::removeExpr(Expr* expr) { - assertInFusion(expr, "Cannot remove expr "); + assertInContainer(expr, "Cannot remove expr "); // If we hit this error too frequently, we could lighten the restrictions so // that removing something that doesn't exist simply does nothing. For now, // we're going with the strictest model which errors. @@ -194,13 +159,11 @@ void Fusion::removeExpr(Expr* expr) { } } - expr_set_.erase(expr); - - delete expr; + IrContainer::removeExpr(expr); } void Fusion::removeVal(Val* val) { - assertInFusion(val, "Cannot remove val "); + assertInContainer(val, "Cannot remove val "); TORCH_CHECK( !val->isFusionInput(), @@ -213,26 +176,26 @@ void Fusion::removeVal(Val* val) { if (orig != nullptr) removeExpr(val->definition()); - for (Expr* use : unordered_uses(val)) + for (Expr* use : unordered_uses(val)) { removeExpr(use); - - val_set_.erase(val); - - for (auto it = val_deque_.begin(); it != val_deque_.end(); it++) - if (*it == val) { - val_deque_.erase(it); - break; - } - - delete val; + } + IrContainer::removeVal(val); } void Fusion::addInput(Val* input) { - assertInFusion(input, "Cannot register input "); + assertInContainer(input, "Cannot register input "); + + TORCH_INTERNAL_ASSERT( + input->getDataType() != DataType::Index, + "Data type Index is a local compile time data type only, it cannot be used as an input in case it was generated from another kernel."); if (input->getValType().value() == ValType::TensorView) { auto tv = input->as(); tv->setMemoryType(MemoryType::Global); + } else if (input->getValType().value() == ValType::Scalar) { + TORCH_CHECK( + !input->isConst(), + "Immediate scalar value cannot be added as an input. It is not necessary to pass it as an input."); } inputs_.push_back(input); @@ -242,7 +205,20 @@ void Fusion::addInput(Val* input) { } void Fusion::addOutput(Val* output) { - assertInFusion(output, "Cannot register output "); + // We currently don't support explicitly outputing aliased inputs. This is + // because they are already marked as output for in-place update. 
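Fusion::copy above clones every Val through the IrCloner returned by IrContainer::copy and then rewires each clone's definition and uses through the old-to-new mapping. A simplified, standalone sketch of that two-pass clone-and-remap pattern (placeholder node type; the destination vector owns the raw clones):

#include <unordered_map>
#include <vector>

struct Node {
  Node* definition = nullptr;
  std::vector<Node*> uses;
};

// Pass 1: allocate a clone per source node. Pass 2: remap internal pointers so
// clones reference other clones rather than the source graph. Assumes every
// referenced definition/use is itself part of `src`.
std::unordered_map<const Node*, Node*> cloneGraph(
    const std::vector<Node*>& src,
    std::vector<Node*>& dst /* takes ownership of the clones */) {
  std::unordered_map<const Node*, Node*> old_to_new;
  for (const Node* n : src) {
    dst.push_back(new Node{});
    old_to_new[n] = dst.back();
  }
  for (const Node* n : src) {
    Node* copy = old_to_new[n];
    copy->definition =
        n->definition != nullptr ? old_to_new[n->definition] : nullptr;
    for (Node* use : n->uses) {
      copy->uses.push_back(old_to_new[use]);
    }
  }
  return old_to_new;
}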
It's tricky + // to allow marking them explicitly as real output, since that requires us to + // register/identify output not only by `Val*` pointer, but also by indices; + // it also requires us to magically arrange `outputs_` entries in proper order + // ^^^ this doesn't look intuitive on `outputs_` in fusion. + // I think we can solve this by marking addOutput on io_alias_ keys after + // fusion is fully defined. Tracking this in #1488 + // Apparently we can't do this neither at the time. I think segmentation + // unfortunately would call addOutput after we marked io_alias_ map. + // TORCH_CHECK(io_alias_.count(output) == 0, + // "can't register aliased output as real output"); + + assertInContainer(output, "Cannot register output "); if (output->getValType().value() == ValType::TensorView) { auto tv = output->as(); tv->setMemoryType(MemoryType::Global); @@ -285,7 +261,11 @@ void Fusion::replaceOutput(Val* output, Val* replacement) { TORCH_CHECK(find_output != outputs_.end(), "Unable to find output in Fusion"); if (find_output != outputs_.end()) { - *find_output = replacement; + std::replace_if( + outputs_.begin(), + outputs_.end(), + [&output](Val* v) { return v == output; }, + replacement); if (replacement->getValType().value() == ValType::TensorView) { replacement->setIsFusionOutput(true); @@ -307,27 +287,8 @@ void Fusion::replaceOutput(Val* output, Val* replacement) { } } -bool Fusion::inFusion(const Statement* stmt) const { - bool in_fusion = stmt->fusion() == this; - Statement* nonconst_stmt = const_cast(stmt); // NOLINT - - if (stmt->isExpr()) { - in_fusion &= expr_set_.find(nonconst_stmt->as()) != expr_set_.end(); - } - if (stmt->isVal()) { - in_fusion &= val_set_.find(nonconst_stmt->as()) != val_set_.end(); - } - - return in_fusion; -} - -void Fusion::assertInFusion(const Statement* stmt, const std::string& msg) - const { - TORCH_CHECK(inFusion(stmt), msg, " it was not found in the active fusion."); -} - std::vector Fusion::exprs() { - return ExprSort::getExprs(this); + return StmtSort::getExprs(this); } std::vector Fusion::inputsOf(Val* val) { @@ -341,12 +302,24 @@ void Fusion::validateInputs() { all_inputs.insert(input); } } + + std::unordered_set input_dims; + auto inp_tvs = ir_utils::filterByType(inputs()); + for (auto tv : inp_tvs) { + for (auto id : tv->getMaybeRFactorDomain()) { + input_dims.emplace(id->extent()); + } + } for (Val* input : all_inputs) { if (!input->isConstScalar()) { TORCH_CHECK( - hasInput(input) || inFusion(input), + input->isFusionInput() || + // TODO: Switch: + inContainer(input), + // to: input_dims.find(input) != input_dims.end(), + // https://github.com/csarofeen/pytorch/issues/1365 "Could not figure out how ", - input, + input->toString(), " is generated, however it was not specified as an input."); } } @@ -365,9 +338,13 @@ void Fusion::print() { std::cout << "}\n\n"; } -void Fusion::printKernel() { +void Fusion::printKernel(DataType index_type) { FUSER_PERF_SCOPE("Fusion::printKernel"); - std::cout << codegen::generateCudaKernel(GpuLower(this).kernel()); + TORCH_INTERNAL_ASSERT( + !this->isA(), + "Cannot \"print kernel\" of a kernel container. 
", + "This would require lowering during lowering."); + std::cout << codegen::generateCudaKernel(GpuLower(this, index_type).kernel()); } void Fusion::printMath(bool from_outputs_only) { @@ -394,7 +371,7 @@ void Fusion::printMath(bool from_outputs_only) { leaf_vals.push_back(val); } } - exprs_for_print = ExprSort::getExprs(this, leaf_vals); + exprs_for_print = StmtSort::getExprs(this, leaf_vals); } std::cout << "\n%kernel_math {\n"; @@ -412,33 +389,36 @@ void Fusion::printTransforms() { t_exprs.handle(this); } -StmtNameType Fusion::registerVal(Val* val) { +void Fusion::registerVal(Val* val) { + if (inContainer(val)) { + return; + } + if (val->fusion()) { - if (val->fusion() != this) { - TORCH_CHECK(false, val, " was not found in the active fusion."); - } - if (inFusion(val)) { - return val->name(); - } + TORCH_CHECK( + val->fusion() == this, val, " was not found in the active fusion."); } - val_set_.emplace(val); - val_deque_.push_back(val); - return getValName(*(val->getValType())); + IrContainer::registerVal(val); } -StmtNameType Fusion::registerExpr(Expr* expr) { +void Fusion::registerExpr(Expr* expr) { + if (inContainer(expr)) { + return; + } + if (expr->fusion()) { - if (expr->fusion() != this) { - TORCH_CHECK(false, expr, " was not found in the active fusion."); - } - if (inFusion(expr)) { - return expr->name(); - } + TORCH_CHECK( + expr->fusion() == this, expr, " was not found in the active fusion."); } + IrContainer::registerExpr(expr); + + bool has_tv = false; + for (Val* input : expr->inputs()) { - assertInFusion(input, "Input to expr is invalid, "); + has_tv = has_tv || input->isA(); + assertInContainer(input, "Input to expr is invalid, "); auto uses_copy = input->uses(); if (std::find(uses_copy.begin(), uses_copy.end(), expr) == uses_copy.end()) { @@ -447,34 +427,25 @@ StmtNameType Fusion::registerExpr(Expr* expr) { } } + // Kernel is the only container type that is non-ssa. This is mainly (maybe + // only) because of initialization expressions which would overwrite tensor + // view definitions. + bool is_ssa = !this->isA(); + for (Val* output : expr->outputs()) { - assertInFusion(output, "Output to expr is invalid, "); - if (output->definition() != nullptr) { + has_tv = has_tv || output->isA(); + assertInContainer(output, "Output to expr is invalid, "); + if (output->definition() != nullptr && is_ssa) { removeExpr(output->definition()); } - output->setDefinition(expr); + if (is_ssa || (!is_ssa && output->definition() == nullptr)) { + output->setDefinition(expr); + } } - expr_set_.emplace(expr); - - resetTvUses(); - return getExprName(); -} - -StmtNameType Fusion::registerStatement(Statement* stmt) { - if (inFusion(stmt)) - return stmt->name(); - - if (stmt->isVal()) { - return registerVal(stmt->as()); - } else if (stmt->isExpr()) { - return registerExpr(stmt->as()); + if (has_tv) { + resetTvUses(); } - - TORCH_INTERNAL_ASSERT( - false, - "Could not register statement as Fusion could not recognize its type."); - return kInvalidStmName; } void Fusion::resetTvUses() { @@ -484,8 +455,8 @@ void Fusion::resetTvUses() { // getExprs only uses definition, so even if we've modified uses already to // remove dead exprs, this could reinsert them. getExprs is also boundeds by // inputs as registered inputs will return nullptr as their definition. 
- const auto all_tvs = ir_utils::filterByType(val_set_); - const auto used_exprs = ExprSort::getExprs(this); + const auto all_tvs = ir_utils::filterByType(vals_); + const auto used_exprs = StmtSort::getExprs(this); for (auto tv : all_tvs) { tv->setUses({}); @@ -507,14 +478,6 @@ void Fusion::resetTvUses() { is_during_update_uses_ = false; } -const std::unordered_set& Fusion::vals() const noexcept { - return val_set_; -} - -const std::deque& Fusion::deterministic_vals() const noexcept { - return val_deque_; -} - std::vector Fusion::usedMathVals() { // Note that using fusion->inputs() as the argument for the first // parameter of getAllValsBetween does not grab all used vals as @@ -553,37 +516,15 @@ std::vector Fusion::usedMathVals() { return used_math_vals; } -const std::unordered_set& Fusion::unordered_exprs() const noexcept { - return expr_set_; -} - std::unordered_set Fusion::unordered_uses(Val* val) const { return std::unordered_set(val->uses().begin(), val->uses().end()); } Expr* Fusion::definition(const Val* val) const { - assertInFusion(val, "Cannot detect the definition of val, "); + assertInContainer(val, "Cannot detect the definition of val, "); return val->definition(); } -bool Fusion::hasInput(const Val* val) const { - assertInFusion(val, "Cannot check if val is an input, "); - return val->isFusionInput(); -} - -bool Fusion::hasOutput(const Val* val) const { - assertInFusion(val, "Cannot check if val is an output, "); - return val->isFusionOutput(); -} - -StmtNameType Fusion::getValName(ValType vtype) { - return val_type_name_map_[vtype]++; -} - -StmtNameType Fusion::getExprName() { - return expr_name_counter_++; -} - // Indicate to kernel to set itself up to generate random numbers bool Fusion::isStochastic() { for (auto expr : exprs()) @@ -593,28 +534,6 @@ bool Fusion::isStochastic() { return false; } -bool Fusion::hasReduction() { - FUSER_PERF_SCOPE("Fusion::hasReduction"); - - for (auto expr : exprs()) - for (auto out : expr->outputs()) - if (out->getValType() == ValType::TensorView) - if (out->as()->hasReduction()) - return true; - - return false; -} - -bool Fusion::hasWelford() { - FUSER_PERF_SCOPE("Fusion::hasWelford"); - for (auto expr : exprs()) { - if (expr->isA()) { - return true; - } - } - return false; -} - std::vector Fusion::getTerminatingOutputs() { FUSER_PERF_SCOPE("getTerminatingOutputs"); @@ -682,6 +601,33 @@ bool Fusion::isAliasCompatible(Val* left, Val* right) { } void Fusion::aliasOutputToInput(Val* output, Val* input) { + // Because we could cast output when input is cast. 
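resetTvUses above rebuilds use lists purely from definitions: every tensor's uses are cleared, then each live expression re-registers itself as a use of its inputs. A standalone sketch of that rebuild pass (placeholder types):

#include <vector>

struct Expr;
struct Tensor {
  std::vector<Expr*> uses;
};
struct Expr {
  std::vector<Tensor*> inputs;
  std::vector<Tensor*> outputs;
};

// Wipe every use list, then walk the live expressions and re-add each
// expression as a use of its inputs.
void resetUses(std::vector<Tensor*>& all_tvs, std::vector<Expr*>& live_exprs) {
  for (Tensor* tv : all_tvs) {
    tv->uses.clear();
  }
  for (Expr* e : live_exprs) {
    for (Tensor* in : e->inputs) {
      in->uses.push_back(e);
    }
  }
}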
+ TORCH_INTERNAL_ASSERT( + !output->isFusionOutput(), + "Do NOT add aliased output to fusion output outside of `aliasOutputToInput"); + + if (!input->isFusionInput()) { + auto input_expr = input->definition(); + // TORCH_INTERNAL_ASSERT(input_def.etype() == ExprType::UnaryOp, "expected + // unary op for aliased input"); + TORCH_INTERNAL_ASSERT( + input_expr->isA(), "expected unary op for aliased input"); + auto input_uop = input_expr->as(); + TORCH_INTERNAL_ASSERT( + input_uop->getUnaryOpType() == UnaryOpType::Cast, + "expected aliased input to be output of cast op"); + input = input_uop->in(); + } + TORCH_INTERNAL_ASSERT( + input->getDataType().has_value() && output->getDataType().has_value(), + "requires DataType to be available for aliased output to input"); + + if (input->getDataType().value() != output->getDataType().value()) { + output = castOp(input->getDataType().value(), output); + } + // TODO: output should be marked at the end of fusion definition #1488 + addOutput(output); + TORCH_INTERNAL_ASSERT( isAliasCompatible(input, output), "The input and output values are not alias-compatible."); diff --git a/torch/csrc/jit/codegen/cuda/fusion.h b/torch/csrc/jit/codegen/cuda/fusion.h index c892bd8171c8..d67d0e2fea9a 100644 --- a/torch/csrc/jit/codegen/cuda/fusion.h +++ b/torch/csrc/jit/codegen/cuda/fusion.h @@ -1,10 +1,11 @@ #pragma once #include +#include #include -#include #include +#include #include #include @@ -65,18 +66,19 @@ class TORCH_CUDA_CU_API FusionGuard { ~FusionGuard(); static Fusion* getCurFusion(); + static void setCurFusion(Fusion* fusion); }; //! Fusion is mutable but unique. Nodes cannot be copied in any way from one //! Fusion to another. If anything like that is desired, it would require -//! duplicating all associated values and exprs. Fusion is considered to SSA, +//! duplicating all associated values and exprs. Fusion is considered to be SSA, //! though this could also change in the future if there is a good reason to do //! so. //! //! The Fusion owns the whole IR graph (Vals and Exprs) //! // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) -class TORCH_CUDA_CU_API Fusion final { +class TORCH_CUDA_CU_API Fusion : public IrContainer { typedef std::unordered_map> PermutationMap; public: @@ -96,45 +98,30 @@ class TORCH_CUDA_CU_API Fusion final { //! Break dependency chains associated with Expr, remove references to expr //! delete expr - void removeExpr(Expr* expr); + void removeExpr(Expr* expr) override; //! Completely remove val from the fusion, break all dependencies associated //! with it - void removeVal(Val* val); + void removeVal(Val* val) override; //! Register input as an input of the fusion - // TODO: Rename to register void addInput(Val* input); //! Register output as an output of the fusion - // TODO: Rename to register void addOutput(Val* output); //! Register output as an output of the fusion - // TODO: Rename to register void addOutput(WelfordResult& output); //! Deregister input as an input of the fusion - // TODO: Rename to register void removeInput(Val* input); //! Deregister output as an output of the fusion - // TODO: Rename to register void removeOutput(Val* output); //! Replace output with another value void replaceOutput(Val* output, Val* replacement); - //! Clear Expr's from TV uses that are not required to produce outputs from - //! inputs - void resetTvUses(); - - //! Check if stmt is properly registered with this fusion - bool inFusion(const Statement* stmt) const; - - //! 
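The new aliasOutputToInput logic above unwraps a cast on the aliased input and, if the data types still differ, casts the output to match before registering it. A simplified standalone sketch of that reconciliation (placeholder types; castTo is a hypothetical helper, not the nvfuser castOp):

enum class DType { Float, Half };

struct Value {
  DType dtype;
  Value* cast_source = nullptr; // non-null if this value is a cast of another
};

// Hypothetical cast insertion: produce a new value of the requested dtype.
Value* castTo(DType dtype, Value* v) {
  return new Value{dtype, v};
}

// Returns the value that should actually be registered as the fusion output.
Value* reconcileAliasPair(Value*& input, Value* output) {
  if (input->cast_source != nullptr) {
    input = input->cast_source; // alias against the pre-cast input value
  }
  if (input->dtype != output->dtype) {
    output = castTo(input->dtype, output); // match the (unwrapped) input dtype
  }
  return output;
}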
Throw an error if stmt is not in this fusion - void assertInFusion(const Statement* stmt, const std::string& msg = "") const; - //! Assert that all leaves found from outputs are registered as an input void validateInputs(); @@ -149,18 +136,7 @@ class TORCH_CUDA_CU_API Fusion final { void printTransforms(); //! Lower the fusion and print a kernel - void printKernel(); - - //! Register the Val with this fusion - StmtNameType registerVal(Val* val); - - //! Register expr with this fusion. - //! When we register an expression, we want to update the dependency tracking - //! of Vals. We add expr to our general expr_set_, - StmtNameType registerExpr(Expr* expr); - - //! Register stmt with this fusion - StmtNameType registerStatement(Statement* stmt); + void printKernel(DataType index_type = DataType::Int); //! Return a list of topologically sorted expressions. This only includes //! exprs required to genereate registered outputs. @@ -169,12 +145,6 @@ class TORCH_CUDA_CU_API Fusion final { //! Return a vector of fusion inputs that feed this Val std::vector inputsOf(Val* val); - //! Return the set of Vals registered with this fusion - const std::unordered_set& vals() const noexcept; - - //! Return in insertion order - const std::deque& deterministic_vals() const noexcept; - //! Return all Vals in math expressions that cannot be eliminated. //! //! It is generally equivalent to vals that are used to generate @@ -183,11 +153,6 @@ class TORCH_CUDA_CU_API Fusion final { //! also included as they must show up in the final code. std::vector usedMathVals(); - //! Return the set of Exprs registered with this fusion. Warning: This will - //! return exprs outside inputs/outputs, so can be unsafe for use with - //! segmented fusions. - const std::unordered_set& unordered_exprs() const noexcept; - //! Return all Exprs that use val std::unordered_set unordered_uses(Val* val) const; @@ -197,12 +162,6 @@ class TORCH_CUDA_CU_API Fusion final { //! Indicate to kernel to set itself up to generate random numbers bool isStochastic(); - //! Indicate that the fusion contains reduction operations - bool hasReduction(); - - //! Indicate that the fusion contains welford operations - bool hasWelford(); - //! Run fusion segmentation algorithm to create a segmented fusion std::unique_ptr segment( const at::ArrayRef& inputs); @@ -217,9 +176,6 @@ class TORCH_CUDA_CU_API Fusion final { std::vector getTerminatingOutputs(); - bool hasInput(const Val* val) const; - bool hasOutput(const Val* val) const; - // Aliasing output to input value, this is a WAR to allow inplace update on // input tensor. // Note: this is not always safe and should be used with extra caution. @@ -262,36 +218,40 @@ class TORCH_CUDA_CU_API Fusion final { return is_during_update_uses_; } + const auto& ioAlias() const { + return io_alias_; + } + protected: friend SegmentCandidateFinder; friend SegmentedFusion; friend class TranslateApplicableWelford; + friend Val; static IrCloner copy(const Fusion* from, Fusion* to); - private: - // Return an int that monotonically increases for each val/expr, some are - // explicitly incremented by type. - StmtNameType getValName(ValType vtype); - StmtNameType getExprName(); + //! Register the Val with this fusion + virtual void registerVal(Val* val) override; + + //! Register expr with this fusion. + //! When we register an expression, we want to update the dependency tracking + //! of Vals. If this container is a not a Kernel, it will remove previous + //! definitions of outputs and register this Expr as the definition. 
Otherwise + //! will update definition if not previously set, but will not remove old + //! definitions. + virtual void registerExpr(Expr* expr) override; + //! Clear Expr's from TV uses that are not required to produce outputs from + //! inputs. Only other place this is used (other than Fusion) is in + //! Val::uses() + void resetTvUses(); + + private: // Determine if the two values are compatible for aliasing // Same DataType, ValType, and number of dimensions bool isAliasCompatible(Val* left, Val* right); private: - // Sets of all Vals/Exprs registered with this fusion - // (val_deque_ is not owning the objects) - std::unordered_set val_set_; - std::deque val_deque_; - std::unordered_set expr_set_; - - // Values names counters - std::unordered_map val_type_name_map_; - - // Expression names counter - StmtNameType expr_name_counter_ = 0; - // Fusion inputs and outputs std::vector inputs_; std::vector outputs_; diff --git a/torch/csrc/jit/codegen/cuda/fusion_segmenter.cpp b/torch/csrc/jit/codegen/cuda/fusion_segmenter.cpp index 9ff257808141..1138af0ca363 100644 --- a/torch/csrc/jit/codegen/cuda/fusion_segmenter.cpp +++ b/torch/csrc/jit/codegen/cuda/fusion_segmenter.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -322,7 +323,7 @@ void SegmentedFusion::draw() { for (auto group : groups()) { for (auto expr : group->exprs()) { - if (ir_utils::isTVOp(expr)) { + if (ir_utils::isTvOp(expr)) { expr_color_map[expr] = group_index; } } @@ -559,7 +560,10 @@ std::vector groupExprPrintSorting(const std::vector& exprs) { std::unordered_set exprs_to_print_set(exprs.begin(), exprs.end()); std::unordered_set exprs_visited; std::vector sorted_list; - while (sorted_list.size() != exprs_to_print.size()) { + while (!std::all_of( + exprs_to_print.begin(), + exprs_to_print.end(), + [&exprs_visited](auto expr) { return exprs_visited.count(expr); })) { bool expr_added_to_sorted_list = false; for (auto expr : exprs_to_print) { if (!exprs_visited.count(expr)) { @@ -652,15 +656,15 @@ TensorView* castIntermediateValueInCompleteFusion( // Keep broadcast axes and remove reduction axes size_t i = 0; auto no_reduction_root_domain = - TensorDomain::noReductions(original_tv->getRootDomain()); + TensorDomain::noReductions(original_tv->getMaybeRFactorDomain()); std::vector new_root_domain(no_reduction_root_domain.size()); for (const auto& dom : no_reduction_root_domain) { - new_root_domain[i++] = dom->clone(); + new_root_domain[i++] = dom->cloneWithoutRFactor(); } // Create the actual domain and tv. - return new TensorView( - new TensorDomain( + return IrBuilder::create( + IrBuilder::create( new_root_domain, std::vector(new_root_domain.size(), true)), data_type); }; @@ -680,8 +684,8 @@ TensorView* castIntermediateValueInCompleteFusion( } // Insert the cast ops. - new UnaryOp(UnaryOpType::Cast, half_precision_tv, original_tv); - new UnaryOp(UnaryOpType::Cast, fp32_tv, half_precision_tv); + IrBuilder::create(UnaryOpType::Cast, half_precision_tv, original_tv); + IrBuilder::create(UnaryOpType::Cast, fp32_tv, half_precision_tv); // Return the new tv to replace original tv with // on the segmented edges. @@ -721,7 +725,7 @@ void SegmentedFusion::finalize() { // \ -> half2float -> other uses in group // The conversion back and forth from half precision can hurt numerics. // Collect expressions that use the edge value of concern within the from - // group to avoid replacing with the casted tensor. + // group to avoid replacing with the cast tensor. 
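The reworked loop condition in groupExprPrintSorting above terminates once std::all_of reports that every expression has been visited. A generic standalone sketch of that "emit whatever is ready until everything is visited" pattern (assumes the dependency graph is acyclic, otherwise the outer loop would not terminate):

#include <algorithm>
#include <unordered_set>
#include <vector>

struct Item {
  std::vector<Item*> deps;
};

std::vector<Item*> sortByDeps(const std::vector<Item*>& items) {
  std::unordered_set<Item*> visited;
  std::vector<Item*> sorted;
  while (!std::all_of(items.begin(), items.end(), [&](Item* it) {
    return visited.count(it) != 0;
  })) {
    for (Item* it : items) {
      if (visited.count(it) != 0) {
        continue;
      }
      // Ready once every dependency has already been emitted.
      const bool ready = std::all_of(
          it->deps.begin(), it->deps.end(), [&](Item* d) {
            return visited.count(d) != 0;
          });
      if (ready) {
        visited.insert(it);
        sorted.push_back(it);
      }
    }
  }
  return sorted;
}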
std::unordered_set uses_in_from_group; // All expressions in the from group of the edge @@ -1125,6 +1129,7 @@ std::ostream& operator<<( return group_order.at(edge_a->from) < group_order.at(edge_b->from); }); + os << "Segmented_Fusion Dump: -- fusion segments:\n"; os << "Segmented_Fusion{ \n"; os << "groups: \n"; for (const auto g : sorted_groups_to_print) { @@ -1143,6 +1148,9 @@ std::ostream& operator<<( } void SegmentedFusion::print() const { + std::cout << "Segmented_Fusion Dump: -- Re-written complete fusion:{\n"; + completeFusion()->printMath(); + std::cout << "} // {Re-written complete fusion}\n"; std::cout << this << "\n"; } @@ -1170,14 +1178,24 @@ std::unique_ptr SegmentedFusion::makeFusion(SegmentedGroup* sg) { fusion_segment->removeOutput(out); } + std::vector view_tvs; for (auto inp : getAllInputs(sg)) { - fusion_segment->addInput(complete_to_segment_map.clone(inp)); + auto clone_tv = complete_to_segment_map.clone(inp); + fusion_segment->addInput(clone_tv); + if (inp->isDefinitionType(ExprType::ViewOp)) { + TORCH_INTERNAL_ASSERT(clone_tv != nullptr && clone_tv->isA()); + view_tvs.push_back(clone_tv->as()); + } } for (auto out : getAllOutputs(sg)) { fusion_segment->addOutput(complete_to_segment_map.clone(out)); } + for (auto tv : view_tvs) { + tv->convertRfactorToRootDomain(); + } + return fusion_segment; } @@ -1570,6 +1588,8 @@ c10::optional tryMerge( SegmentedGroup* b = nullptr) { FusionSegmentGuard fsg(fusion, getAllInputs(a, b), getAllOutputs(a, b)); + scheduler_debug_utils::canScheduleMessage( + "\n**Segmenter** Considering fusion:\n", fusion); return SchedulerEntry::proposeHeuristics(fusion, runtime_info); } @@ -1581,6 +1601,8 @@ c10::optional tryMerge( fusion, allInputsIfTrueElseOutputs(segmented_groups, true), allInputsIfTrueElseOutputs(segmented_groups, false)); + scheduler_debug_utils::canScheduleMessage( + "\n**Segmenter** Considering fusion:\n", fusion); return SchedulerEntry::proposeHeuristics(fusion, runtime_info); } @@ -1740,9 +1762,10 @@ TranslateApplicableWelford::TranslateApplicableWelford( Fusion* fusion, const at::ArrayRef& runtime_inputs) : runtime_inputs_(runtime_inputs) { + auto exprs = fusion->exprs(); std::vector orignal_welfords( - ir_utils::filterByType(fusion->unordered_exprs()).begin(), - ir_utils::filterByType(fusion->unordered_exprs()).end()); + ir_utils::filterByType(exprs).begin(), + ir_utils::filterByType(exprs).end()); if (wouldTranslateToPersistent(orignal_welfords)) { for (auto welford : orignal_welfords) { @@ -1829,6 +1852,14 @@ bool TranslateApplicableWelford::wouldTranslateToPersistent( [&original_to_test_map](auto welford) { return original_to_test_map.clone(welford); }); + // Copied welfords will be invalidated on translation, but Vals will be + // reused, keep a reference to them. + std::vector welford_avgs; + std::vector welford_vars; + for (auto welford : copied_welfords) { + welford_avgs.push_back(welford->outAvg()); + welford_vars.push_back(welford->outVar()); + } // Translate the welford ops for (auto welford_to_translate : copied_welfords) { @@ -1860,6 +1891,21 @@ bool TranslateApplicableWelford::wouldTranslateToPersistent( return original_to_test_map.clone(out); }); + // If only average is used from welford, we should still translate, but we + // might not detect persistence if variance isn't actually used/marked as an + // output in the test. 
+ for (auto outs_i : c10::irange(welford_avgs.size())) { + auto avg = welford_avgs[outs_i]; + auto var = welford_vars[outs_i]; + if (avg->uses().empty()) { + test_group_outputs_.push_back(avg); + } + + if (var->uses().empty()) { + test_group_outputs_.push_back(var); + } + } + // Temporarily localize test copy around // the group boundary FusionSegmentGuard fsg( @@ -1891,29 +1937,40 @@ void TranslateApplicableWelford::translateSingleWelford(WelfordOp* welford) { auto out_N = welford->outN()->as(); fusion->removeExpr(welford); + // Not safe to use welford anymore + welford = nullptr; // Create normalization based welford graph // largely taken from batchnorm cpp benchmark - auto& in_root = in_val->getRootDomain(); - auto& out_root = out_avg->getRootDomain(); + const auto& in_root = + TensorDomain::noReductions(in_val->getMaybeRFactorDomain()); + const auto& out_root = out_avg->getRootDomain(); std::vector red_axes; + TORCH_INTERNAL_ASSERT( + in_root.size() == out_root.size(), + "Invalid root domains of Welford input and output.", + " Input: ", + ir_utils::toString(in_root), + ". Output: ", + ir_utils::toString(out_root)); + // Create scalar version of the feature element // counting. - Val* num_features = new Double(1); + Val* num_features = IrBuilder::create(1); std::vector broadcast_mask(in_root.size(), false); for (const auto i : c10::irange(in_root.size())) { - if (out_root[i]->isReduction()) { + if (out_root.at(i)->isReduction()) { red_axes.push_back(i); broadcast_mask[i] = true; - num_features = mul(num_features, out_root[i]->extent()); + num_features = mul(num_features, out_root.at(i)->extent()); } } // Build a normalization expression group that is // equivalent to a welford operation. auto x_sum = sum(in_val, red_axes); - new BinaryOp(BinaryOpType::Div, out_avg, x_sum, num_features); + IrBuilder::create(BinaryOpType::Div, out_avg, x_sum, num_features); // welford.avg may be broadcast. Reuse it if found. TensorView* x_avg_bcast = nullptr; for (auto& use_expr : out_avg->uses()) { @@ -1949,8 +2006,12 @@ void TranslateApplicableWelford::translateSingleWelford(WelfordOp* welford) { } auto x_mean_sub_pow = mul(x_mean_sub, x_mean_sub); - new ReductionOp(BinaryOpType::Add, new Double(0.0), out_var, x_mean_sub_pow); - new UnaryOp(UnaryOpType::Set, out_N, num_features); + IrBuilder::create( + BinaryOpType::Add, + IrBuilder::create(0.0), + out_var, + x_mean_sub_pow); + IrBuilder::create(UnaryOpType::Set, out_N, num_features); // out_avg, out_N are now outputs of a pointwise ops and we // need to clear out its reduction domains. 
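Numerically, the translated graph above computes the same triple a WelfordOp produces: the mean, the sum of squared deviations from the mean (assuming, as the translation suggests, the variance output is kept unnormalized), and the element count. A plain standalone sketch of that two-pass equivalent over a flat buffer:

#include <tuple>
#include <vector>

// Returns (avg, var_sum, n). Assumes x is non-empty.
std::tuple<double, double, double> twoPassWelford(const std::vector<double>& x) {
  const double n = static_cast<double>(x.size()); // "num_features"
  double sum = 0.0;
  for (double v : x) {
    sum += v;
  }
  const double avg = sum / n; // BinaryOpType::Div in the translated graph
  double var_sum = 0.0;
  for (double v : x) {
    const double d = v - avg; // x - broadcast(avg)
    var_sum += d * d;         // ReductionOp(Add) over (x - avg)^2
  }
  return {avg, var_sum, n};
}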
@@ -2584,7 +2645,8 @@ void SegmentCandidateFinder::findSegments() { while (!to_visit.empty()) { auto expr = to_visit.front(); to_visit.pop_front(); - if (expr->getExprType().value() != ExprType::UnaryOp) { + if (expr->getExprType().value() != ExprType::UnaryOp || + expr->output(0)->isFusionOutput()) { continue; } @@ -2687,14 +2749,20 @@ void SegmentCandidateFinder::findSegments() { } } + auto reduction_ops = ir_utils::getReductionOps( + segmented_fusion_->completeFusion(), true /* ignore_trivial */); + auto welford_ops = ir_utils::filterByType(reduction_ops); + if (options_.run_translate_welford && - segmented_fusion_->completeFusion()->hasWelford()) { + (welford_ops.begin() != welford_ops.end())) { TranslateApplicableWelford::run(segmented_fusion_.get(), runtime_inputs_); } for (auto group : groups()) { - // Set heuristics in case single reduction kernels were left out - group->setHeuristic(deriveHeuristic(group)); + if (!group->outputs().empty()) { + // Set heuristics in case single reduction kernels were left out + group->setHeuristic(deriveHeuristic(group)); + } } // Remove all scalar edges since they do not represent actual @@ -2764,12 +2832,12 @@ void SegmentCandidateFinder::findSegments() { if (options_.run_final_merge) { // TODO: consider interleaving herrmman merge and bruteforce merge, as - // bruteforce merge can introduce - // opportunities for more herrmann merge + // bruteforce merge can introduce opportunities for more herrmann merge finalMerge(); } finalize(); + if (isDebugDumpEnabled(DebugDumpOption::FusionSegmentsDrawing)) { segmented_fusion_->draw(); } @@ -2913,7 +2981,7 @@ void SegmentCandidateFinder::resolveInputsInGroup(SegmentedGroup* group) { group->input_vals = IterVisitor::getInputsTo(group->inputs()); // Grab all expressions needed to produce to_visit - auto input_exprs = ExprSort::getExprs(completeFusion(), to_visit); + auto input_exprs = StmtSort::getExprs(completeFusion(), to_visit); // Insert those expressions at the beginning of the group group->exprs_.insert( @@ -2978,6 +3046,7 @@ void SegmentCandidateFinder::finalize() { // Finalize each group, fill in the missing inputs, i.e. tensor dims. for (auto g : groups()) { + g->setHeuristic(deriveHeuristic(g)); g->finalize(); } } @@ -3102,8 +3171,7 @@ void SegmentedFusion::annotateFP16IntermediateTensors() { } } -TORCH_CUDA_CU_API std::string toString( - const SegmentCandidateFinderOptions& segment_options) { +std::string toString(const SegmentCandidateFinderOptions& segment_options) { std::stringstream ss; ss << "segmentation phases {\n"; if (segment_options.run_combine_reductions) { diff --git a/torch/csrc/jit/codegen/cuda/fusion_segmenter.h b/torch/csrc/jit/codegen/cuda/fusion_segmenter.h index 61fa966348e3..d9c4dfbd86af 100644 --- a/torch/csrc/jit/codegen/cuda/fusion_segmenter.h +++ b/torch/csrc/jit/codegen/cuda/fusion_segmenter.h @@ -129,7 +129,7 @@ class TORCH_CUDA_CU_API SegmentedGroup { int group_id_ = -1; //! The scheduler to use for compiling this group - ScheduleHeuristic heuristic_ = ScheduleHeuristic::PointWise; + ScheduleHeuristic heuristic_ = ScheduleHeuristic::None; //! Exprs that make up the group std::vector exprs_; @@ -275,7 +275,7 @@ class TORCH_CUDA_CU_API SegmentedFusion { } //! 
Returns the original un-segmented fusion - Fusion* completeFusion() { + Fusion* completeFusion() const { return complete_fusion_.get(); } @@ -288,11 +288,11 @@ class TORCH_CUDA_CU_API SegmentedFusion { } Val* findAlias(Val* val) const { - Val* alias_val = nullptr; - if (complete_fusion_->io_alias_.count(val) != 0) { - alias_val = complete_fusion_->io_alias_[val]; + auto alias_it = complete_fusion_->ioAlias().find(val); + if (alias_it != complete_fusion_->ioAlias().end()) { + return alias_it->second; } - return alias_val; + return nullptr; } //! Make a clone of the group and convert to fusion @@ -442,7 +442,8 @@ class TORCH_CUDA_CU_API SegmentCandidateFinder { SegmentCandidateFinderOptions options = SegmentCandidateFinderOptions()) { auto fusion_copy = std::make_unique(*fusion); if (isDebugDumpEnabled(DebugDumpOption::FusionSegments)) { - std::cout << "Segment the fusion: " << std::endl; + std::cout << "Segment the fusion (Original Fusion Un-modified): " + << std::endl; fusion_copy->printMath(); } SegmentCandidateFinder scf(std::move(fusion_copy), inputs, options); @@ -456,7 +457,8 @@ class TORCH_CUDA_CU_API SegmentCandidateFinder { SegmentCandidateFinderOptions options = SegmentCandidateFinderOptions()) { SegmentCandidateFinder scf(std::move(fusion), inputs, options); if (isDebugDumpEnabled(DebugDumpOption::FusionSegments)) { - std::cout << "Segment the fusion: " << std::endl; + std::cout << "Segment the fusion (Original Fusion Un-modified): " + << std::endl; scf.completeFusion()->printMath(); } return std::move(scf.segmented_fusion_); @@ -606,6 +608,7 @@ class TORCH_CUDA_CU_API SegmentCandidateFinder { const at::ArrayRef& runtime_inputs_; }; +// TODO: Make as member functions on classes instead of global scope TORCH_CUDA_CU_API std::string toString(const SegmentedGroup* group); TORCH_CUDA_CU_API std::string toString(const SegmentedEdge* edge); TORCH_CUDA_CU_API std::string toString(const SegmentedFusion* segmented_fusion); diff --git a/torch/csrc/jit/codegen/cuda/graph_fuser.cpp b/torch/csrc/jit/codegen/cuda/graph_fuser.cpp index 08d3e89d21c5..c6ca212ccc29 100644 --- a/torch/csrc/jit/codegen/cuda/graph_fuser.cpp +++ b/torch/csrc/jit/codegen/cuda/graph_fuser.cpp @@ -5,6 +5,8 @@ #include #include #include +#include +#include #include #include #include @@ -17,8 +19,10 @@ #include #include #include +#include #include +#include #include #include @@ -46,6 +50,13 @@ bool usedOnlyInDtype(Value* v) { Value* broadcastSizes(at::ArrayRef sizes) { AT_ASSERT(!sizes.empty()); Graph* graph = sizes[0]->owningGraph(); + Node* insertion_point = sizes[0]->node()->next(); + for (size_t i = 1; i < sizes.size(); i++) { + if (insertion_point->isBefore(sizes[i]->node()->next())) { + insertion_point = sizes[i]->node()->next(); + } + } + WithInsertPoint guard(insertion_point); Node* broadcast_n = graph->insertNode(graph->create(prim::BroadcastSizes, sizes)); broadcast_n->output()->setType(ListType::ofInts()); @@ -66,9 +77,13 @@ Value* createConditionalConstant(Node* profile_ivalue) { auto int_list = profile_ivalue->is(Symbol::attr("profiled_bool_list")); std::vector bool_list(int_list.begin(), int_list.end()); val = IValue(bool_list); - } else if (profile_ivalue->hasAttribute(Symbol::attr("profiled_size"))) { + } else if (profile_ivalue->hasAttribute( + Symbol::attr("profiled_reduction_size"))) { // int[] - val = IValue(profile_ivalue->is(Symbol::attr("profiled_size"))); + val = IValue(profile_ivalue->is(Symbol::attr("profiled_reduction_size"))); + } else if 
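findAlias above switches from count() plus operator[] to a single find(): find() does one lookup, never inserts, and works against the const ioAlias() accessor, whereas operator[] requires a non-const map and inserts a default value for a missing key. A tiny standalone illustration of the same lookup shape:

#include <string>
#include <unordered_map>

// Look up an alias without mutating the map; returns nullptr when absent.
const std::string* findAlias(
    const std::unordered_map<std::string, std::string>& io_alias,
    const std::string& key) {
  auto it = io_alias.find(key);
  if (it != io_alias.end()) {
    return &it->second;
  }
  return nullptr;
}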
(profile_ivalue->hasAttribute(Symbol::attr("profiled_view_size"))) { + // int[] + val = IValue(profile_ivalue->is(Symbol::attr("profiled_view_size"))); } else if (profile_ivalue->hasAttribute(Symbol::attr("profiled_bool"))) { // bool val = IValue( @@ -77,6 +92,13 @@ Value* createConditionalConstant(Node* profile_ivalue) { // int val = IValue( static_cast(profile_ivalue->i(Symbol::attr("profiled_int")))); + } else if (profile_ivalue->hasAttribute(Symbol::attr("profiled_str"))) { + // str + val = IValue(static_cast( + profile_ivalue->s(Symbol::attr("profiled_str")))); + } else if (profile_ivalue->hasAttribute(Symbol::attr("profiled_ival"))) { + // ival + val = IValue(profile_ivalue->ival(Symbol::attr("profiled_ival"))); } else { GRAPH_DEBUG("profile_ivalue: ", *profile_ivalue); TORCH_WARN( @@ -97,6 +119,7 @@ struct CudaGraphFuser { std::unique_ptr aliasDb_; std::shared_ptr graph_; Symbol kind_ = prim::CudaFusionGroup; + std::unordered_map fusion_value_to_runtime_shape_; // nvrtc has a limit on the number of arguments allowed in a CUDA kernel. // The specific limit is a function of constant memory size, amount available @@ -556,7 +579,7 @@ struct CudaGraphFuser { Value* producer_for_chunk = *it; size_t producer_index = it - chunk->inputs().begin(); - // all uses of the chunk must be in in this consumer + // all uses of the chunk must be in this consumer for (auto s : chunk->outputs()) { for (auto u : s->uses()) { if (u.user != consumer) @@ -644,7 +667,7 @@ struct CudaGraphFuser { auto input_c_strides = input_strides.concrete_sizes().value(); auto output_c_sizes = producer_output_sizes.concrete_sizes().value(); int output_index = int(output_c_sizes.size()) - 1; - strides.resize(output_index); + strides.resize(output_index + 1); AT_ASSERT(output_index >= int(input_c_sizes.size()) - 1); for (int input_index = int(input_c_sizes.size()) - 1; input_index >= 0; input_index--, output_index--) { @@ -760,9 +783,11 @@ struct CudaGraphFuser { // longer valid so we rescan the new FusionGroup for more fusions... return std::make_pair(fusion_group.value()->reverseIterator(), true); } - // horizontal fusion only applies on tensor inputs + + // horizontal fusion only applies on non-scalar tensor inputs if (getHorizontalFusion() && - producer->type()->isSubtypeOf(*TensorType::get())) { + producer->type()->isSubtypeOf(*TensorType::get()) && + !is_cpu_scalar(*producer->type()->cast())) { // fusing nodes sharing inputs, this could save memory bandwidth by // reducing number of tensor read. for (const auto& u : producer->uses()) { @@ -834,6 +859,7 @@ struct CudaGraphFuser { // Builds up expressions that compute shapes of all intermediates (and // outputs) of the fusion group, based on the sizes of inputs. You should run // DCE to remove those that you end up not using. 
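The strides.resize fix above is a plain off-by-one: output_index is the index of the last output dimension, so the vector needs output_index + 1 entries. A tiny self-contained check of that arithmetic:

#include <cassert>
#include <vector>

int main() {
  const int output_rank = 4;
  const int output_index = output_rank - 1; // index of the last dimension
  std::vector<long> strides;
  strides.resize(output_index + 1); // resize(output_index) would drop one slot
  assert(static_cast<int>(strides.size()) == output_rank);
  return 0;
}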
+ // TODO: Add shape support for view, reshape, unsqueeze, and squeeze std::unordered_map buildShapeExpressions(Node* fusion_group) { WithInsertPoint insert_guard{fusion_group->next()}; std::unordered_map shape_of; @@ -846,7 +872,9 @@ struct CudaGraphFuser { AT_ASSERT(inputs.size() == sinputs.size()); for (const auto i : c10::irange(inputs.size())) { if (inputs[i]->type()->isSubtypeOf(*TensorType::get())) { - shape_of[sinputs[i]] = graph->insert(aten::size, {inputs[i]}); + auto sinput_value = graph->insert(aten::size, {inputs[i]}); + shape_of[sinputs[i]] = sinput_value; + sinput_value->node()->moveBefore(fusion_group); } } @@ -865,6 +893,26 @@ struct CudaGraphFuser { } } + // Place all the shape expressions for intermediates in fusion + // before the CudaFusionGroup + graph->setInsertPoint(fusion_group); + + // hmmm, do I need to setInsertPoint... + const auto map_inputs = [&](Value* v) -> Value* { + // if constant ever has an input, it has to come from + // profile_ivalue dependency + if (v->node()->kind() == prim::Param && + fusion_group->input(v->offset())->node()->kind() == + prim::profile_ivalue) { + // we need to map it along profile_ivalue dependency + return fusion_group->input(v->offset()); + } else { + throw std::runtime_error( + std::string("unexpected input from node") + + v->node()->kind().toDisplayString()); + } + }; + for (Node* n : subgraph->nodes()) { // XXX: Use of shape_of.emplace is crucial to the output shape // optimization! @@ -900,7 +948,11 @@ struct CudaGraphFuser { // extended shape expression support to reduction operations // TODO: `aten::sum` is too flexible, we should restrict for a better // match - if (n->kind() == aten::sum) { + // TODO: Add python tests where we check for existing ops and their + // shape expression logic. + static std::unordered_set reduction_ops( + {aten::sum, aten::mean, aten::var, aten::std}); + if (reduction_ops.find(n->kind()) != reduction_ops.end()) { // TODO: expand support to wire non-constant inputs, this is currently // blocked by profiling executor not capable of profiling scalar inputs. TORCH_INTERNAL_ASSERT( @@ -908,21 +960,6 @@ struct CudaGraphFuser { n->input(2)->node()->kind() == prim::Constant, "only supports reduction axes and keepdim being constant"); - // hmmm, do I need to setInsertPoint... 
- const auto map_inputs = [&](Value* v) -> Value* { - // if constant ever has an input, it has to come from - // profile_ivalue dependency - if (v->node()->kind() == prim::Param && - fusion_group->input(v->offset())->node()->kind() == - prim::profile_ivalue) { - // we need to map it along profile_ivalue dependency - return fusion_group->input(v->offset()); - } else { - throw std::runtime_error( - std::string("unexpected input from node") + - v->node()->kind().toDisplayString()); - } - }; Node* in1_const = graph->createClone(n->input(1)->node(), map_inputs); graph->insertNode(in1_const); Node* in2_const = graph->createClone(n->input(2)->node(), map_inputs); @@ -996,6 +1033,57 @@ struct CudaGraphFuser { } continue; } + if (n->kind() == aten::native_dropout) { + TORCH_INTERNAL_ASSERT( + shape_of.count(n->input(0)) > 0, + "buildShapeExpressions failed at accessing input shapes"); + shape_of.emplace(n->output(0), shape_of.at(n->input(0))); + shape_of.emplace(n->output(1), shape_of.at(n->input(0))); + continue; + } + if (n->kind() == prim::unsqueeze_copy) { + TORCH_INTERNAL_ASSERT( + shape_of.count(n->input(0)) > 0, + "buildShapeExpressions failed at accessing input shapes"); + TORCH_INTERNAL_ASSERT( + n->input(1)->node()->kind() == prim::Constant, + "only supports unsqueeze axes being constant"); + Node* dim_const = graph->createClone(n->input(1)->node(), map_inputs); + graph->insertNode(dim_const); + std::vector inputs = { + shape_of.at(n->input(0)), dim_const->output()}; + Node* size_node = graph->insertNode(graph->create( + Symbol::fromQualString("prim::infer_unsqueeze_size"), inputs, 1)); + Value* size = size_node->output(0); + size->setType(ListType::ofInts()); + shape_of.emplace(n->output(), size); + continue; + } + if (n->kind() == prim::squeeze_copy) { + TORCH_INTERNAL_ASSERT( + shape_of.count(n->input(0)) > 0, + "buildShapeExpressions failed at accessing input shapes"); + TORCH_INTERNAL_ASSERT( + n->inputs().size() == 2 || n->inputs().size() == 1, + "prim::squeeze_copy expects one or two inputs"); + std::vector inputs = {shape_of.at(n->input(0))}; + + if (n->inputs().size() == 2) { + TORCH_INTERNAL_ASSERT( + n->input(1)->node()->kind() == prim::Constant, + "only supports squeeze axes being constant"); + Node* dim_const = graph->createClone(n->input(1)->node(), map_inputs); + graph->insertNode(dim_const); + inputs.push_back(dim_const->output()); + } + Node* size_node = graph->insertNode(graph->create( + Symbol::fromQualString("prim::infer_squeeze_size"), inputs, 1)); + Value* size = size_node->output(0); + size->setType(ListType::ofInts()); + shape_of.emplace(n->output(), size); + continue; + } + auto tensor_inputs = filter(n->inputs(), [](Value* v) { return v->type()->isSubtypeOf(*TensorType::get()); }); @@ -1021,8 +1109,10 @@ struct CudaGraphFuser { // TODO: failure in buildShapeExpressions should not break fusion execution, // we can add a try/catch here to bailout from removeOutputsUsedOnlyInSize. GRAPH_DEBUG("before build shape expression: ", *graph_); - auto shape_of = buildShapeExpressions(fusion_group); + auto shape_map = buildShapeExpressions(fusion_group); + fusion_value_to_runtime_shape_.insert(shape_map.begin(), shape_map.end()); GRAPH_DEBUG("after build shape expression: ", *graph_); + auto outputs = fusion_group->outputs().vec(); auto soutputs = subgraph->outputs().vec(); // XXX: Iterating in this order is not only good for performance reasons! 
@@ -1031,12 +1121,12 @@ struct CudaGraphFuser { for (int64_t i = static_cast(outputs.size()) - 1; i >= 0; --i) { auto output = outputs[i]; auto soutput = soutputs[i]; - if (usedOnlyInDtypeAndSize(output) && shape_of.count(soutput) > 0) { + if (usedOnlyInDtypeAndSize(output) && shape_map.count(soutput) > 0) { bool has_dtype = usedInDtype(output); auto uses = output->uses(); for (Use u : uses) { if (u.user->matches("aten::size(Tensor self) -> int[]")) { - u.user->output()->replaceAllUsesWith(shape_of.at(soutput)); + u.user->output()->replaceAllUsesWith(shape_map.at(soutput)); u.user->destroy(); } else if (u.user->matches("prim::dtype(Tensor a) -> int")) { continue; @@ -1126,7 +1216,12 @@ struct CudaGraphFuser { for (Node* node : block_->nodes()) { for (Block* sub_block : node->blocks()) { - CudaGraphFuser(sub_block, graph_).run(); + CudaGraphFuser sub_block_cfg(sub_block, graph_); + sub_block_cfg.run(); + // Accumulate runtime shapes for all sub-blocks + fusion_value_to_runtime_shape_.insert( + sub_block_cfg.fusion_value_to_runtime_shape_.begin(), + sub_block_cfg.fusion_value_to_runtime_shape_.end()); } } } @@ -1282,6 +1377,55 @@ void PeepholeOptimizeShapeExpressions(Block* block) { } } +// view_sizes_runtime is the profiled-ivalue argument for view-size. +// view_sizes_constant_list is the constant list recorded during profiling runs. +Value* guardView( + Node* fusion, + std::unordered_map& fusion_value_to_runtime_size, + Node* versioning_if, + Node* view, + Value* view_sizes_runtime) { + // 1. Get self tensor sizes and view_sizes + auto self_value = view->inputs().front(); + auto self_type = self_value->type()->cast(); + auto self_sizes_constant_list = getTensorSizes(self_type); + + auto view_sizes_constant_list = + constant_as>(view->inputs().back()); + TORCH_INTERNAL_ASSERT(view_sizes_constant_list.has_value()); + + // 2. Get constraints for self tensor and view_sizes + auto constraints = analyzeViewConstraint( + self_sizes_constant_list, view_sizes_constant_list->vec()); + + // 3. Add constraints as constant to graph + auto self_tensor_constraint = fusion->owningGraph()->insertConstant( + IValue(constraints.original_constraint)); + self_tensor_constraint->node()->moveBefore(versioning_if); + auto view_sizes_constraint = + fusion->owningGraph()->insertConstant(IValue(constraints.new_constraint)); + view_sizes_constraint->node()->moveBefore(versioning_if); + + // 4. Create CudaFusionViewGuard using input tensor, profile_ivalue + // for view_sizes list, and constraints + TORCH_INTERNAL_ASSERT( + fusion_value_to_runtime_size.find(self_value) != + fusion_value_to_runtime_size.end(), + "Failed to find runtime size for fusion value:\t", + self_value->node()->kind().toDisplayString()); + Node* viewcheck_node = + fusion->owningGraph() + ->create( + c10::Symbol::fromQualString("prim::CudaFusionViewGuard"), + {fusion_value_to_runtime_size.at(self_value), + view_sizes_runtime, + self_tensor_constraint, + view_sizes_constraint}, + 1) + ->insertBefore(versioning_if); + return viewcheck_node->output(); +} + //! [ Note -- CudaFusionGuard implementation ] //! //! shamelessly copying code from NNC (tensorexpr_fuser) with very little @@ -1320,7 +1464,9 @@ void PeepholeOptimizeShapeExpressions(Block* block) { //! //! TODO: we also need to assert/check reduction axes and replace it with //! 
constants in `CudaFusionGroup` -void guardFusionGroup(Node* fusion) { +void guardFusionGroup( + Node* fusion, + std::unordered_map& fusion_value_to_runtime_size) { // Fixup types of the subgraph inputs std::vector guard_types; std::vector tensor_inputs_to_check; @@ -1371,10 +1517,12 @@ void guardFusionGroup(Node* fusion) { versioning_if->insertAfter(typecheck_node); + auto fusion_graph = fusion->g(attr::Subgraph); + std::vector check_flags = {}; + // Fill in the false block. It should contain the unoptimized // copy of the fused subgraph, unless we have conditional constants from // profiled_ivalue; - auto fusion_graph = fusion->g(attr::Subgraph); std::shared_ptr fb_graph; // resource holder; // Restore the dependency for constant introduced by profiled_ivalue within // the graph. @@ -1421,11 +1569,10 @@ void guardFusionGroup(Node* fusion) { // 2. REMOVE conditional constant dependency in fusion group size_t compensation = 0; - // get a constant false, which is used by `and` pattern later + // get a constant true, which is used by `and` pattern later auto const_true = fusion->owningGraph()->insertConstant(IValue(true)); const_true->node()->moveBefore(versioning_if); - std::vector check_flags = {}; for (const auto& original_offset : profiled_ivalue_indices) { size_t offset = original_offset - compensation; @@ -1453,7 +1600,7 @@ void guardFusionGroup(Node* fusion) { ->insertBefore(versioning_if) ->output(); } else if (fusion->input(offset)->node()->hasAttribute( - Symbol::attr("profiled_size"))) { + Symbol::attr("profiled_reduction_size"))) { // TODO(profile_size): check sizes here with special size comparison op // TORCH_INTERNAL_ASSERT(false, "not implemented yet"); ivalue_check = @@ -1464,6 +1611,40 @@ void guardFusionGroup(Node* fusion) { 1) ->insertBefore(versioning_if) ->output(); + } else if (fusion->input(offset)->node()->hasAttribute( + Symbol::attr("profiled_view_size"))) { + // TODO: Add support for dynamic split to view guard + + // Path from profile-ivalue to prim::view_copy operation + // profile-ivalue -> Constant -> CudaFusionGroup + // Get argument position in CudaFusionGroup + // Get argument in subgraph for CudaFusionGroup + // CudaFusionGroup argument -> Constant List -> prim::view_copy + auto subgraph_arg = fusion_graph->inputs()[offset]; + auto constant = subgraph_arg->uses().front().user->output(); + + TORCH_INTERNAL_ASSERT(!constant->uses().empty()); + auto view = constant->uses().front().user; + TORCH_INTERNAL_ASSERT( + view->kind() == prim::view_copy || + view->kind() == prim::reshape_copy); + + ivalue_check = guardView( + fusion, + fusion_value_to_runtime_size, + versioning_if, + view, + profiled_ival); + } else if (fusion->input(offset)->node()->hasAttribute( + Symbol::attr("profiled_ival"))) { + ivalue_check = + fusion->owningGraph() + ->create( + c10::Symbol::fromQualString("prim::CudaFusionIvalGuard"), + {profiled_ival, const_o}, + 1) + ->insertBefore(versioning_if) + ->output(); } else { ivalue_check = fusion->owningGraph() ->create(aten::eq, {profiled_ival, const_o}, 1) @@ -1491,22 +1672,24 @@ void guardFusionGroup(Node* fusion) { fusion_graph->eraseInput(offset); compensation++; } - - if (!check_flags.empty()) { - // attaching output from CudaFusionGuard to profile ivalue checks - check_flags.emplace_back(typecheck_result); - auto graph = fusion->owningGraph(); - auto bool_list_node = - graph->insertNode(graph->createList(BoolType::get(), check_flags)); - bool_list_node->moveBefore(versioning_if); - Value* bool_list = bool_list_node->output(); - // new 
typecheck_result - typecheck_result = graph->insert(aten::all, {bool_list}); - typecheck_result->node()->moveBefore(versioning_if); - } // update graph in fusion node fusion->g_(attr::Subgraph, fusion_graph); - } else { + } + + if (!check_flags.empty()) { + // attaching output from CudaFusionGuard to profile ivalue checks + check_flags.emplace_back(typecheck_result); + auto graph = fusion->owningGraph(); + auto bool_list_node = + graph->insertNode(graph->createList(BoolType::get(), check_flags)); + bool_list_node->moveBefore(versioning_if); + Value* bool_list = bool_list_node->output(); + // new typecheck_result + typecheck_result = graph->insert(aten::all, {bool_list}); + typecheck_result->node()->moveBefore(versioning_if); + } + + if (profiled_ivalue_indices.empty()) { WithInsertPoint guard(false_block->return_node()); const auto subgraph_outputs = insertGraph(*fusion->owningGraph(), *fusion_graph, fusion->inputs()); @@ -1532,11 +1715,13 @@ void guardFusionGroup(Node* fusion) { } } -void guardFusionGroups(Block* block) { +void guardFusionGroups( + Block* block, + std::unordered_map& fusion_value_to_runtime_size) { std::vector fusions; for (Node* n : block->nodes()) { for (Block* b : n->blocks()) { - guardFusionGroups(b); + guardFusionGroups(b, fusion_value_to_runtime_size); } if (n->kind() == prim::CudaFusionGroup) { fusions.push_back(n); @@ -1546,7 +1731,18 @@ void guardFusionGroups(Block* block) { // step 1: a. add prim::CudaFusionGuard and fallback logic // b. insert guard logic of profile_ivalue with if block // c. restore conditional constant to non-constant for fallback - guardFusionGroup(fusion); + guardFusionGroup(fusion, fusion_value_to_runtime_size); + } +} + +void dumpFusionGroups(std::shared_ptr& g) { + DepthFirstGraphNodeIterator it(g); + Node* n = nullptr; + GRAPH_DEBUG("Exporting all NVFuser fusions:"); + while ((n = it.next()) != nullptr) { + if (n->kind() == prim::FallbackGraph) { + GRAPH_EXPORT("", n->g(attr::Subgraph)); + } } } @@ -1840,23 +2036,6 @@ void ExtractProfileIValue(Node* profile_ivalue) { } } -void traverseProfileIValues( - Block* block, - const std::function& func) { - std::vector profile_ivalues; - for (Node* n : block->nodes()) { - for (Block* b : n->blocks()) { - traverseProfileIValues(b, func); - } - if (n->kind() == prim::profile_ivalue) { - profile_ivalues.push_back(n); - } - } - for (Node* profile_ivalue : profile_ivalues) { - func(profile_ivalue); - } -} - // break `linear` layer into `matmul` and `add_optional`. This allows us to fuse // the binary operation without supporting gemm. // Note that we are not breaking `linear` layer without bias. @@ -1866,7 +2045,7 @@ void decomposeLinearOps(Block* block) { for (Block* b : n->blocks()) { decomposeLinearOps(b); } - // only decompose `linear` layer with bias. 
+ // only decompose `linear` layer with bias if (n->kind() == aten::linear && !n->input(2)->type()->isSubtypeOf( static_cast(NoneType::get()))) { @@ -1881,16 +2060,30 @@ void decomposeLinearOps(Block* block) { auto matmul = graph->insertNode( graph->create(aten::matmul, {n->input(0), weight_t->output()}, 1)); auto input_tensor_type = n->input(0)->type()->cast(); + if (!input_tensor_type) { + TORCH_WARN_ONCE( + "linear input 0 is required to be tensor for linear decompose"); + continue; + } auto mat0_size = input_tensor_type->sizes().concrete_sizes(); auto mat1_size = n->input(1)->type()->cast()->sizes().concrete_sizes(); - // TODO: The assert is not necessary when we can handle matmul, right now we - // are splitting the linear between matmul & bias_add. Our fuser can only - // take the second half and we would need the size information. - TORCH_INTERNAL_ASSERT( - mat0_size.has_value() && mat1_size.has_value(), - "concrete shape for linear input & weight are required"); + // TODO: Continuing here is not necessary when we can handle matmul, right + // now we are splitting the linear between matmul & bias_add. Our fuser can + // only take the second half and we would need the size information. + if (!mat0_size.has_value() || !mat1_size.has_value()) { + TORCH_WARN_ONCE( + "concrete shape for linear input & weight are required to decompose into matmul + bias"); + continue; + } + + // only decompose for input with nDims >= 4. since lower rank linear eager + // is already fused + if (mat0_size->size() < 4) { + continue; + } + auto out_size = mat0_size.value(); TORCH_INTERNAL_ASSERT( mat1_size->size() == 2 || mat1_size->size() == 1, @@ -1914,6 +2107,101 @@ void decomposeLinearOps(Block* block) { } } +// Replace 'operation' with 'operation_copy' to guard alias operations. 
+// Supports View, Reshape, Squeeze, and Unsqueeze +void replaceAliasOpsWithCopy(std::shared_ptr& graph, Block* block) { + static std::unordered_map alias_to_copy_mapping; + // TODO: revert disabled aten::view + // ({{aten::view, prim::view_copy}, + // {aten::reshape, prim::reshape_copy}, + // {aten::squeeze, prim::squeeze_copy}, + // {aten::unsqueeze, prim::unsqueeze_copy}, + // {aten::flatten, prim::flatten_copy}}); + + std::vector maybe_safe_alias_nodes; + for (Node* n : block->nodes()) { + for (Block* b : n->blocks()) { + replaceAliasOpsWithCopy(graph, b); + } + if (alias_to_copy_mapping.find(n->kind()) != alias_to_copy_mapping.end()) { + maybe_safe_alias_nodes.push_back(n); + } + } + + auto alias_db = std::make_unique(graph); + + auto safeToChangeAliasToCopy = [&alias_db](Node* n) { + return !alias_db->hasWriters(n->input(0)) && + !alias_db->hasWriters(n->output(0)); + }; + + auto replaceAliasWithCopy = [&graph, &alias_db](Node* n) { + WithInsertPoint guard(n); + auto copy_op = graph->insertNode( + graph->create(alias_to_copy_mapping[n->kind()], n->inputs(), 1)); + copy_op->output()->setType(n->output(0)->type()); + + // adding newly created value into alias_db; + alias_db->createValue(copy_op->output()); + + n->output()->replaceAllUsesWith(copy_op->output()); + n->destroy(); + }; + + for (Node* n : maybe_safe_alias_nodes) { + if (!safeToChangeAliasToCopy(n)) { + continue; + } + replaceAliasWithCopy(n); + } +} + +// Revert all 'operation_copy' with 'operation' except in CudaFusionGroup +// e.g., Any non-fused alias operation including within the prim::FallbackGraph +// Supports View, Reshape, Squeeze, and Unsqueeze +void revertAliasCopyOps(std::shared_ptr& graph, Block* block) { + static std::unordered_map copy_to_alias_mapping; + // TODO: revert disabled aten::view + // ({{prim::view_copy, aten::view}, + // {prim::flatten_copy, aten::flatten}, + // {prim::reshape_copy, aten::reshape}, + // {prim::squeeze_copy, aten::squeeze}, + // {prim::unsqueeze_copy, aten::unsqueeze}}); + + std::vector alias_copy_ops; + for (Node* n : block->nodes()) { + // Allow alias copy ops in CudaFusionGroup + if (n->kind() == prim::CudaFusionGroup) { + continue; + } + // Revert alias copy ops within FallbackGraph + if (n->kind() == prim::FallbackGraph) { + auto subgraph = n->g(attr::Subgraph); + revertAliasCopyOps(subgraph, subgraph->block()); + } + for (Block* b : n->blocks()) { + revertAliasCopyOps(graph, b); + } + // Revert any non-fused alias copy ops + if (copy_to_alias_mapping.find(n->kind()) != copy_to_alias_mapping.end()) { + alias_copy_ops.push_back(n); + } + } + + auto replaceCopyWithAlias = [&graph](Node* n) { + WithInsertPoint guard(n); + auto alias_op = graph->insertNode( + graph->create(copy_to_alias_mapping[n->kind()], n->inputs(), 1)); + alias_op->output()->setType(n->output(0)->type()); + n->output()->replaceAllUsesWith(alias_op->output()); + n->destroy(); + }; + + for (Node* n : alias_copy_ops) { + replaceCopyWithAlias(n); + } +} + // break `conv2d` layer into `conv2d` and `add_optional`. This allows us to fuse // the binary operation without supporting gemm. // Note that we are not breaking `conv2d` layer without bias. 
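// The decomposition that decomposeLinearOps / decomposeConvOps rely on is the
// numerical identity sketched below: the bias term is only a broadcast add,
// which the fuser can take even when it cannot fuse the gemm / conv itself.
// This is an illustrative standalone check, not code from this patch.
#include <torch/torch.h>

void convBiasDecompositionIdentity() {
  auto x = torch::randn({2, 3, 8, 8});
  auto w = torch::randn({4, 3, 3, 3});
  auto b = torch::randn({4});

  auto fused = torch::conv2d(x, w, b);
  // conv without bias, then a broadcast add over the channel dimension; the
  // second half corresponds to the add_optional node the pass emits.
  auto decomposed = torch::conv2d(x, w) + b.view({1, 4, 1, 1});

  TORCH_CHECK(torch::allclose(fused, decomposed, /*rtol=*/1e-5, /*atol=*/1e-6));
}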
@@ -1941,9 +2229,11 @@ void decomposeConvOps(Block* block) {
       auto bias_tensor_type = n->input(2)->type()->cast<TensorType>();
       auto bias_size_opt = bias_tensor_type->sizes().concrete_sizes();
-      TORCH_INTERNAL_ASSERT(
-          bias_size_opt.has_value(),
-          "concrete shape for bias input to conv2d are required");
+      if (!bias_size_opt.has_value()) {
+        TORCH_WARN_ONCE(
+            "concrete shape for bias input is required to decompose into conv + bias");
+        continue;
+      }
       // bias shape (C)
       auto bias_size = bias_size_opt.value();
 
@@ -1994,6 +2284,67 @@ bool removeInplaceOperations(const std::shared_ptr<Graph>& graph) {
       graph, [&](Node* node) { return inplace_ops.count(node->kind()) != 0; });
 }
 
+// Recursively traverse blocks, gather all nodes with given symbol,
+// and then apply mutator function.
+void mutateNode(
+    Block* block,
+    Symbol symbol,
+    const std::function<void(Node*)>& func) {
+  // Recursively call mutateNode on blocks
+  // Gather all nodes with given symbol
+  std::vector<Node*> nodes;
+  for (Node* n : block->nodes()) {
+    for (Block* b : n->blocks()) {
+      mutateNode(b, symbol, func);
+    }
+    if (n->kind() == symbol) {
+      nodes.push_back(n);
+    }
+  }
+
+  // Apply mutator function to every node
+  for (Node* n : nodes) {
+    func(n);
+  }
+}
+
+// For the given CudaFusionGroup, separate nested views and remove any unused,
+// intermediate views
+void separateNestedViews(Node* cuda_fusion_group) {
+  TORCH_INTERNAL_ASSERT(cuda_fusion_group->kind() == prim::CudaFusionGroup);
+
+  auto isView = [](Node* node) {
+    static std::unordered_set<Symbol> alias_op_set(
+        {prim::view_copy, prim::reshape_copy});
+    return alias_op_set.find(node->kind()) != alias_op_set.end();
+  };
+
+  // node -> input / output values
+  auto isNestedView = [&isView](Node* node) {
+    return isView(node) && isView(node->input(0)->node());
+  };
+
+  auto subgraph = cuda_fusion_group->g(attr::Subgraph);
+  for (auto node : subgraph->block()->nodes()) {
+    if (isNestedView(node)) {
+      // grandparent -> (view / reshape) parent -> (view / reshape) node
+      auto parent_value = node->input(0);
+      auto parent = parent_value->node();
+
+      auto grandparent_value = parent->input(0);
+      C10_UNUSED auto grandparent = grandparent_value->node();
+
+      // Before: gp -> x -> n
+      // After: gp -> x / gp -> n
+      // Delete x if no more uses
+      node->replaceInputWith(parent_value, grandparent_value);
+      if (!parent->hasUses()) {
+        parent->destroy();
+      }
+    }
+  }
+}
+
 } // anonymous namespace
 
 void CudaFuseGraph(std::shared_ptr<Graph>& graph) {
@@ -2004,7 +2355,7 @@ void CudaFuseGraph(std::shared_ptr<Graph>& graph) {
   // I don't know how to store edge/node in attribute. so let's abuse data flow
   // dependency and add inputs to conditional constant generated by
   // aten::profile_ivalue
-  traverseProfileIValues(graph->block(), ExtractProfileIValue);
+  mutateNode(graph->block(), prim::profile_ivalue, ExtractProfileIValue);
   GRAPH_DEBUG("insert conditional constant from profile_ivalue: ", *graph);
 
   // TODO: we need to properly restore shape information after fusion.
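// mutateNode above is a generic gather-then-apply traversal; this patch uses
// it for ExtractProfileIValue / RemoveProfileIValue and for
// separateNestedViews. A minimal sketch of another possible use, assuming the
// same translation unit; the helper name dumpAllFusionGroupSubgraphs is
// hypothetical.
void dumpAllFusionGroupSubgraphs(std::shared_ptr<torch::jit::Graph>& graph) {
  using namespace torch::jit;
  mutateNode(graph->block(), prim::CudaFusionGroup, [](Node* fusion) {
    // Nodes are gathered before the functor runs, so the functor may freely
    // restructure the block it visits.
    GRAPH_DEBUG("CudaFusionGroup subgraph: ", *fusion->g(attr::Subgraph));
  });
}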
@@ -2026,12 +2377,16 @@ void CudaFuseGraph(std::shared_ptr& graph) { decomposeConvOps(graph->block()); GRAPH_DEBUG("After decompose decompose Conv Ops by nvfuser: ", *graph); - CudaGraphFuser(graph->block(), graph).run(); + replaceAliasOpsWithCopy(graph, graph->block()); + GRAPH_DEBUG("replace alias_op with alias_copy by nvfuser: ", *graph); + + CudaGraphFuser cgf(graph->block(), graph); + cgf.run(); GRAPH_DEBUG("After Fusion: ", *graph); // guard input types as well as conditional constants from // aten::profile_ivalue - guardFusionGroups(graph->block()); + guardFusionGroups(graph->block(), cgf.fusion_value_to_runtime_shape_); GRAPH_DEBUG("After Guard Fusion: ", *graph); // mutate `aten::_batch_norm_impl_index` and @@ -2040,7 +2395,7 @@ void CudaFuseGraph(std::shared_ptr& graph) { alterBatchNormImpls(graph->block()); GRAPH_DEBUG("After _batch_norm_impl_index: ", *graph); - traverseProfileIValues(graph->block(), RemoveProfileIValue); + mutateNode(graph->block(), prim::profile_ivalue, RemoveProfileIValue); GRAPH_DEBUG("Before remove missing profiling: ", *graph); removeFusionWithMissingProfilingInformation(graph->block()); @@ -2049,6 +2404,16 @@ void CudaFuseGraph(std::shared_ptr& graph) { // optimization targeting AMP removeOutputUsedOnlyInDtype(graph->block()); GRAPH_DEBUG("After removeOutputUsedOnlyInDtype: ", *graph); + + mutateNode(graph->block(), prim::CudaFusionGroup, separateNestedViews); + GRAPH_DEBUG( + "separate nested and delete redundant views in CudaFusionGroup:", *graph); + + revertAliasCopyOps(graph, graph->block()); + GRAPH_DEBUG("revert alias_copy ops by nvfuser: ", *graph); + + dumpFusionGroups(graph); + // After FuseGraph some common subexpressions may come back EliminateCommonSubexpression(graph); // We might have emitted a fair amount of useless shape propagating code, so diff --git a/torch/csrc/jit/codegen/cuda/grouped_reduction.cpp b/torch/csrc/jit/codegen/cuda/grouped_reduction.cpp new file mode 100644 index 000000000000..5931eb3427aa --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/grouped_reduction.cpp @@ -0,0 +1,210 @@ +#include +#include +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +namespace { + +// Return if ref and other are transformed in the same way. +bool hasMatchingTransformations(TensorView* ref, TensorView* other) { + std::unordered_map ref_2_other; + for (const auto i : c10::irange(ref->getRootDomain().size())) { + ref_2_other.emplace( + ref->getRootDomain().at(i), other->getRootDomain().at(i)); + } + + auto replay = + BestEffortReplay( + other->domain()->domain(), ref->domain()->domain(), ref_2_other) + .getReplay(); + + for (const auto i : c10::irange(ref->nDims())) { + auto ref_id = ref->axis(i); + auto other_id = other->axis(i); + auto it = replay.find(ref_id); + if (it == replay.end() || it->second != other_id) { + return false; + } + } + + return true; +} + +// Validate grouping of reductions and return a new max producer position +unsigned int validateReductionGrouping( + const std::vector& inputs, + const std::vector& outputs) { + TORCH_INTERNAL_ASSERT(inputs.size() == outputs.size()); + TORCH_INTERNAL_ASSERT(!inputs.empty()); + + auto fusion = dynamic_cast(outputs[0]->container()); + TORCH_INTERNAL_ASSERT( + fusion != nullptr, "Grouping of reductions must be done within a Fusion"); + + ExactRootDomainMap exact_map(fusion); + + // Pick the first output TV as a reference and compare it with the + // rest. Do not allow grouping if any mismatch is detected. 
+ auto ref_tv = outputs[0]->as(); + const auto ref_domain = ref_tv->getRootDomain(); + const auto num_root_dims = ref_domain.size(); + const auto num_dims = ref_tv->nDims(); + const auto ref_ca_pos = ref_tv->getComputeAtPosition(); + auto max_producer_pos = ref_tv->getMaxProducerPosition(); + for (const auto i : c10::irange(inputs.size())) { + auto output_tv = outputs.at(i)->as(); + const auto& output_domain = output_tv->getRootDomain(); + if (ref_tv == output_tv) { + continue; + } + TORCH_INTERNAL_ASSERT( + output_domain.size() == num_root_dims, + "Invalid grouped reduction due to mismatched number of root dimensions. " + "Expected: ", + num_root_dims, + ". Detected: ", + output_domain.size(), + ". Invalid output tensor: ", + output_tv->toString()); + TORCH_INTERNAL_ASSERT( + output_tv->nDims() == num_dims, + "Invalid grouped reduction due to mismatched number of dimensions. " + "Expected: ", + num_dims, + ". Detected: ", + output_tv->nDims(), + ". Invalid output tensor: ", + output_tv->toString()); + for (const auto i : c10::irange(num_root_dims)) { + auto ref_id = ref_domain.at(i); + auto output_id = output_domain.at(i); + // If an IterDomain is broadcast, require the other + // corresponding IterDomains are also broadcast. This may not be + // necessary but not completely certain. + TORCH_INTERNAL_ASSERT( + ref_id->isBroadcast() == output_id->isBroadcast(), + "Invalid grouped reduction due to mismatched broadcast root domains. ", + "Reference domain: ", + ref_id->toString(), + ". Mismatched domain: ", + output_id->toString(), + ". Invalid tensor: ", + output_tv->toString()); + if (ref_id->isBroadcast()) { + continue; + } + TORCH_INTERNAL_ASSERT( + ref_id->isReduction() == output_id->isReduction(), + "Invalid grouped reduction due to mismatched reduction root domains. ", + "Reference domain: ", + ref_id->toString(), + ". Mismatched domain: ", + output_id->toString(), + ". Invalid tensor: ", + output_tv->toString()); + TORCH_INTERNAL_ASSERT( + exact_map.areMapped(ref_id, output_id) || ref_id->sameAs(output_id), + "Invalid grouped reduction due to mismatched root domains. ", + "Reference domain: ", + ref_id->toString(), + ". Mismatched domain: ", + output_id->toString(), + ". Invalid tensor: ", + output_tv->toString()); + } + + TORCH_INTERNAL_ASSERT( + hasMatchingTransformations(ref_tv, output_tv), + "Invalid grouped reduction due to mismatched transformations. ", + "Reference tensor: ", + ref_tv->toString(), + ". Mismatched tensor: ", + output_tv->toString()); + + // Must have the same computeAt position + TORCH_INTERNAL_ASSERT( + output_tv->getComputeAtPosition() == ref_ca_pos, + "Invalid grouped reduction due to mismatched computeAt position. ", + "Reference tensor: ", + ref_tv->toString(), + ". 
Mismatched tensor: ", + output_tv->toString()); + + max_producer_pos = + std::max(max_producer_pos, output_tv->getMaxProducerPosition()); + } + + // Must not have any data dependency from outputs to inputs + const auto all_dep_vals = DependencyCheck::getAllValsBetween( + {outputs.begin(), outputs.end()}, inputs); + if (!all_dep_vals.empty()) { + std::stringstream ss; + ss << "Invalid dependency:"; + for (auto val : all_dep_vals) { + ss << " " << val->toString(); + } + TORCH_INTERNAL_ASSERT(all_dep_vals.empty(), ss.str()); + } + + return max_producer_pos; +} + +} // namespace + +void groupReductions(const std::vector& reduction_outputs) { + TORCH_CHECK(!reduction_outputs.empty(), "No tensor is given"); + + auto container = reduction_outputs[0]->container(); + + const auto num_reductions = reduction_outputs.size(); + + std::vector op_types(num_reductions); + std::vector init_vals(num_reductions); + std::vector outputs(num_reductions); + std::vector inputs(num_reductions); + + for (const auto i : c10::irange(num_reductions)) { + auto reduction_out = reduction_outputs.at(i); + TORCH_CHECK( + reduction_out->definition() != nullptr, + "Invalid tensor to group: ", + reduction_out->toString(), + ". Definition not found"); + auto rop = dynamic_cast(reduction_out->definition()); + TORCH_CHECK( + rop != nullptr, + "Invalid tensor to group: ", + reduction_out->toString(), + ". Not an output of a ReductionOp: ", + reduction_out->definition()->toString()); + // Fused reduction is only enabled during the lowering, so at this + // point it should be false. + TORCH_INTERNAL_ASSERT( + !rop->isAllreduce(), "Invalid ReductionOp: ", rop->toString()); + op_types.at(i) = rop->getReductionOpType(); + init_vals.at(i) = rop->init(); + outputs.at(i) = rop->out(); + inputs.at(i) = rop->in(); + } + + auto max_producer_pos = validateReductionGrouping(inputs, outputs); + + for (auto output : ir_utils::filterByType(outputs)) { + output->setMaxProducer(max_producer_pos); + } + + IrBuilder::create( + container, op_types, init_vals, outputs, inputs); +} + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/grouped_reduction.h b/torch/csrc/jit/codegen/cuda/grouped_reduction.h new file mode 100644 index 000000000000..39e6e0850e67 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/grouped_reduction.h @@ -0,0 +1,37 @@ +#pragma once + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +//! Horizontally fuse multiple reductions. +//! +//! Given a list of tensors produced by ReductionOp, create a new +//! GroupedReductionOp expression that takes the input tensors of the +//! original reductions and produces the given tensors, replacing +//! their defining expressions. +//! +//! GroupedReductionOp works just like ReductionOp with a potential +//! benefit of aggregating synchronizations across individual +//! reductions. See the reduction::gridReduce2 runtime function for a +//! two-input version of grid reduction. +//! +//! The grouped reductions must follow several constraints, which +//! include: +//! - There must not exist any data dependency between individual +//! reductions. +//! - All reduction output tensors must have the same number of +//! dimensions, the same transformations and the same axes to +//! reduce. +//! +//! 
\param reduction_outputs Tensors produced by ReductionOp +TORCH_CUDA_CU_API void groupReductions( + const std::vector& reduction_outputs); + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/index_compute.cpp b/torch/csrc/jit/codegen/cuda/index_compute.cpp index 39176a60c537..a000dca87a15 100644 --- a/torch/csrc/jit/codegen/cuda/index_compute.cpp +++ b/torch/csrc/jit/codegen/cuda/index_compute.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -10,13 +11,13 @@ #include #include #include -#include -#include #include +#include #include #include #include #include +#include #include #include #include @@ -28,212 +29,6 @@ namespace cuda { namespace { -// A merge is contiguous if: -// Inputs of outer are to the left in the root domain of the inputs of RHS. -// All inputs are contiguous in the root domain: -// - All marked as contiguous -// - Only gaps between inputs are broadcast or reductoin dims -// There are no split transformations performed on outer or inner -// All transformations on outer or inner are contiguous merges -// If this criteria holds, then we can index the input root domains of this -// merge with the indexing provided to the output of the merge in the backward -// index pass - -class ContigIDs : public OptInDispatch { - private: - using OptInDispatch::handle; - - // Mark if ids are result of contigous merges - std::unordered_set contig_ids; - // Given contiguous domain, return all iter domains within its history. - std::unordered_map> - within_contig_ids; - const std::vector& root_domain_; - const std::vector& root_contiguity_; - std::unordered_map is_contig_root; - - bool inRoot(const std::vector& ids) { - return std::all_of(ids.begin(), ids.end(), [this](IterDomain* id) { - return is_contig_root.find(id) != is_contig_root.end(); - }); - } - - bool isContig(kir::IterDomain* id) { - return contig_ids.find(id) != contig_ids.end(); - } - - // Split outputs are not contiguous, don't need to do anything. - void handle(Split*) override {} - - void handle(Merge* merge) override { - const auto gpu_lower = GpuLower::current(); - - // If either input is non-contiguous so is output. - const auto inner = merge->inner(); - const auto outer = merge->outer(); - - if ((!isContig(gpu_lower->lowerValue(inner)->as()) || - !isContig(gpu_lower->lowerValue(outer)->as()))) { - return; - } - - // Grab inputs, make sure they're in root domain, check if they're - // contiguous. 
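// Returning to the groupReductions API declared in grouped_reduction.h above,
// a rough usage sketch under the stated constraints: two reductions of the
// same producer over the same axis, with no dependency between them. The
// TensorViewBuilder helper and the include set are assumed from the nvfuser
// C++ API and may differ in detail.
#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/grouped_reduction.h>

using namespace torch::jit::fuser::cuda;

void groupTwoReductionsOfSameInput() {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // A single symbolic 2-D float input.
  TensorView* tv0 =
      TensorViewBuilder().ndims(2).dtype(DataType::Float).build();
  fusion.addInput(tv0);

  // Two independent reductions of the same producer over the same axis; their
  // root domains are exactly mapped and neither depends on the other.
  TensorView* tv1 = sum(tv0, {1});
  TensorView* tv2 = sum(tv0, {1});
  fusion.addOutput(tv1);
  fusion.addOutput(tv2);

  // Replaces the two ReductionOps with a single GroupedReductionOp so their
  // synchronizations can be aggregated during lowering.
  groupReductions({tv1, tv2});
}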
- - auto lhs_inputs = - ir_utils::iterDomainInputsOfOrderedAs({outer}, root_domain_); - auto rhs_inputs = - ir_utils::iterDomainInputsOfOrderedAs({inner}, root_domain_); - - TORCH_INTERNAL_ASSERT( - inRoot(lhs_inputs) && inRoot(rhs_inputs), - "Found an invalid merge operation, inputs of its arguments are not in the root domain."); - - std::deque ordered_inputs( - lhs_inputs.begin(), lhs_inputs.end()); - ordered_inputs.insert( - ordered_inputs.end(), rhs_inputs.begin(), rhs_inputs.end()); - - // If any root input is not contig, output is not contig - if (!(std::all_of( - ordered_inputs.begin(), - ordered_inputs.end(), - [this](IterDomain* id) { - return is_contig_root.at(id) && !id->isBroadcast() && - !id->isReduction(); - }))) { - return; - } - - std::deque root_copy(root_domain_.begin(), root_domain_.end()); - - // Forward to first matching argument - while (!root_copy.empty() && !ordered_inputs.empty()) { - if (root_copy.front() != ordered_inputs.front()) { - root_copy.pop_front(); - } else { - break; - } - } - - // Forward through all matching arguments - while (!root_copy.empty() && !ordered_inputs.empty()) { - if (root_copy.front() == ordered_inputs.front()) { - root_copy.pop_front(); - ordered_inputs.pop_front(); - // This is no longer causing an error in: - // ReductionSchedulerMultiDimNonFastest TODO: test reenablement to make - // sure it does what's expected - // } else if ( - // root_copy.front()->isReduction() || - // root_copy.front()->isBroadcast()) { - // root_copy.pop_front(); - } else { - break; - } - } - - // If we matched all inputs, the output is contiguous. Only want to keep the - // top contig ID, lower ids should be placed in the "within_contig_ids" map - // of top id. - auto kir_inner = - gpu_lower->lowerValue(merge->inner())->as(); - auto kir_outer = - gpu_lower->lowerValue(merge->outer())->as(); - auto kir_out = gpu_lower->lowerValue(merge->out())->as(); - if (ordered_inputs.empty()) { - if (contig_ids.find(kir_inner) != contig_ids.end()) { - contig_ids.erase(kir_inner); - } - - if (contig_ids.find(kir_outer) != contig_ids.end()) { - contig_ids.erase(kir_outer); - } - - contig_ids.emplace(kir_out); - - std::unordered_set within_out; - within_out.emplace(kir_inner); - if (within_contig_ids.find(kir_inner) != within_contig_ids.end()) { - auto in_inner = within_contig_ids.at(kir_inner); - within_out.insert(in_inner.begin(), in_inner.end()); - within_contig_ids.erase(kir_inner); - } - - within_out.emplace(kir_outer); - if (within_contig_ids.find(kir_outer) != within_contig_ids.end()) { - auto in_outer = within_contig_ids.at(kir_outer); - within_out.insert(in_outer.begin(), in_outer.end()); - within_contig_ids.erase(kir_outer); - } - - within_contig_ids[kir_out] = within_out; - } - } - - public: - ContigIDs() = delete; - - // Check through the history of ids whose inputs map to root_domain with - // contiguity root_contiguity. Return unordered_set of all merges that are - // contiguous. Ignore root order is primarily used for predicate generation. - // In this case we can linearize indexing of any ID that only consists of - // merge operations. 
- ContigIDs( - const std::vector& ids, - const std::vector& root_domain, - const std::vector& root_contiguity) - : root_domain_(root_domain), root_contiguity_(root_contiguity) { - if (ids.empty()) { - return; - } - - TORCH_INTERNAL_ASSERT( - root_domain_.size() == root_contiguity_.size(), - "Arguments don't match ", - root_domain_.size(), - " != ", - root_contiguity_.size()); - - const auto gpu_lower = GpuLower::current(); - - for (const auto i : c10::irange(root_domain_.size())) { - // If a root domain has halo, can't use merged domain even if - // both inputs are contiguous. HaloInfo is also initialized for - // rfactor root domains, which should just return "zero" - // RootAxisInfo. This should be safe as no rfactor tensor should - // need halo. - if (root_contiguity_[i] && - !gpu_lower->haloInfo().getRootAxisInfo(root_domain_[i]).hasHalo()) { - auto kir_root_domain_i = - gpu_lower->lowerValue(root_domain_[i])->as(); - contig_ids.emplace(kir_root_domain_i); - within_contig_ids[kir_root_domain_i] = - std::unordered_set(); - is_contig_root[root_domain_[i]] = true; - } else { - is_contig_root[root_domain_[i]] = false; - } - } - - auto exprs = ExprSort::getExprs(ids[0]->fusion(), {ids.begin(), ids.end()}); - - for (auto expr : exprs) { - handle(expr); - } - } - - const std::unordered_set contigIDs() const { - return contig_ids; - } - - const std:: - unordered_map> - withinContigIDs() const { - return within_contig_ids; - } -}; - // Update the HaloInfo mappings for a reference tensor by propagating // the halo information from the consumer tensor. void updateHaloInfoForReference( @@ -248,8 +43,8 @@ void updateHaloInfoForReference( // First, propagate the halo information of the consumer root domain // to the reference root domain. for (auto consumer_root_id : consumer_tv->getRootDomain()) { - auto consumer_index_concrete_id = - gpu_lower->caIndexMap().getConcreteMappedID(consumer_root_id); + auto consumer_index_concrete_id = gpu_lower->caMap()->getConcreteMappedID( + consumer_root_id, IdMappingMode::EXACT); auto reference_it = reference.concrete_to_id.find(consumer_index_concrete_id); if (reference_it == reference.concrete_to_id.end()) { @@ -276,21 +71,18 @@ void updateHaloInfoForReference( // // ref_map: ref-to-consumer in consumer indexing; ref-to-producer in // producer indexing -std::unordered_map getReferenceHaloExtentMap( +std::unordered_map getReferenceHaloExtentMap( const ReferenceTensor& reference, const std::unordered_map& index_map_from_ref) { - const auto gpu_lower = GpuLower::current(); - - const auto& halo_info = gpu_lower->haloInfo(); + const auto& halo_info = GpuLower::current()->haloInfo(); - std::unordered_map reference_halo_extent_map; + std::unordered_map reference_halo_extent_map; // Propagate halo extents of the reference to the consumer or // producer tensor for (auto kv : index_map_from_ref) { - auto ref_id = gpu_lower->lowerValue(kv.first)->as(); - auto producer_or_consumer_id = - gpu_lower->lowerValue(kv.second)->as(); + auto ref_id = kv.first; + auto producer_or_consumer_id = kv.second; auto extent = halo_info.getExtent(ref_id); if (extent != nullptr) { reference_halo_extent_map[producer_or_consumer_id] = extent; @@ -302,7 +94,7 @@ std::unordered_map getReferenceHaloExtentMap( //! Offset of an index of a producer axis with respect to its //! 
corresponding consumer index -kir::Val* getProducerHaloOffset( +int getProducerHaloOffset( const TensorView* producer_tv, size_t producer_axis, const TensorView* consumer_tv) { @@ -325,41 +117,31 @@ kir::Val* getProducerHaloOffset( const auto p_pad = halo_map.getRootAxisInfo(producer_id).width(0); const auto c_pad = halo_map.getRootAxisInfo(consumer_id).width(0); - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - - kir::Val* offset = (p_pad->isConst() && c_pad->isConst()) - ? ir_builder.create( - p_pad->value().value() - c_pad->value().value()) - : ir_builder.subExpr(p_pad, c_pad); + auto offset = p_pad - c_pad; // If the consumer is a result of shifting the producer, adjust the // producer index per the offsets argument of the shift op. if (auto shift_op = dynamic_cast(consumer_tv->definition())) { - offset = ir_builder.subExpr( - offset, ir_builder.create(shift_op->offset(producer_axis))); + offset -= shift_op->offset(producer_axis); } return offset; } //! Offset producer index when necessary -kir::Val* getProducerIndexWithHalo( +Val* getProducerIndexWithHalo( const TensorView* producer_tv, size_t producer_axis, - kir::Val* producer_index, + Val* producer_index, const TensorView* consumer_tv) { const auto offset = getProducerHaloOffset(producer_tv, producer_axis, consumer_tv); - if (offset->isZeroInt()) { + if (offset == 0) { return producer_index; } - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - - producer_index = ir_builder.addExpr(producer_index, offset); + producer_index = SimplifyingIrBuilder::addExpr(producer_index, offset); return producer_index; } @@ -368,58 +150,58 @@ kir::Val* getProducerIndexWithHalo( //! //! \param consumer_root_axis Position of corresponding consumer axis //! \param consumer_tv Consumer TensorView +//! \param index_map Mappings from consumer or reference to indices +//! \param use_reference_map True when index_map maps reference domains //! \param concrete_to_ref_map Mappings from concrete to reference domains -//! \param ref_index_map Mappings from reference domains to indices -kir::Val* getProducerOffsetWithGather( +Val* getProducerOffsetWithGather( size_t consumer_root_axis, const TensorView* consumer_tv, - const std::unordered_map& concrete_to_ref_map, - const std::unordered_map& ref_index_map) { + const std::unordered_map& index_map, + bool use_reference_map = false, + const std::unordered_map& concrete_to_ref_map = + {}) { const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); const auto gather_expr = dynamic_cast(consumer_tv->definition()); if (gather_expr == nullptr) { - return ir_builder.zeroVal(); + return gpu_lower->kernel()->zeroVal(); } // If the window extent is one, no specific offsetting // is necessary if (consumer_root_axis >= gather_expr->windowShape().size() || - gather_expr->windowShape()[consumer_root_axis]->isOneInt()) { - return ir_builder.zeroVal(); + gather_expr->windowShape()[consumer_root_axis] == 1) { + return gpu_lower->kernel()->zeroVal(); } // Basically, the goal is to build an expression of producer_index + // window_index, so we first need to locate the index expression // that corresponds to the window axis of this producer axis. 
- // Locate the root IterDomain of the reference that corresponds to the gather - // axis const auto window_axis = gather_expr->gatherAxis(consumer_root_axis); auto window_id = consumer_tv->getRootDomain().at(window_axis); - auto concrete_window_id = - gpu_lower->caIndexMap().getConcreteMappedID(window_id); - auto concrete_2_ref_it = concrete_to_ref_map.find(concrete_window_id); - TORCH_INTERNAL_ASSERT(concrete_2_ref_it != concrete_to_ref_map.end()); - IterDomain* reference_root_of_gather_axis = concrete_2_ref_it->second; - - // Now that reference_root_of_gather_axis is the IterDomain for the - // window axis, take its corresponding index from the index map - auto window_idx = - ref_index_map.at(gpu_lower->lowerValue(reference_root_of_gather_axis) - ->as()); - - // Positive (or negative) padding at offset zero means the indexing - // shifted to the negative (or positive) direction. + + // When index_map maps a reference tensor, find the corresponding + // reference ID of window_id. + if (use_reference_map) { + auto concrete_window_id = gpu_lower->caMap()->getConcreteMappedID( + window_id, IdMappingMode::EXACT); + auto concrete_2_ref_it = concrete_to_ref_map.find(concrete_window_id); + TORCH_INTERNAL_ASSERT(concrete_2_ref_it != concrete_to_ref_map.end()); + window_id = concrete_2_ref_it->second; + } + + auto window_idx = index_map.at(window_id); + + // Positive padding at offset zero means the indexing shifted to the + // negative direction. auto pad_width = gather_expr->padWidth()[consumer_root_axis][0]; // producer offset: window_index - padding - auto producer_offset = - ir_builder.subExpr(window_idx, ir_builder.create(pad_width)); + auto producer_offset = SimplifyingIrBuilder::subExpr( + window_idx, SimplifyingIrBuilder::create(pad_width)); return producer_offset; - ; } //! Offset a producer index of a gather expression @@ -428,13 +210,13 @@ kir::Val* getProducerOffsetWithGather( //! expression that accesses a window position that the current loop //! structure refers to. Use getGatherProducerOffset to create an //! offset Val. -kir::Val* getProducerIndexWithGather( - kir::Val* producer_index, +Val* getProducerIndexWithGather( + Val* producer_index, size_t producer_root_axis, const TensorView* producer_tv, const TensorView* consumer_tv, const std::unordered_map& concrete_to_ref_map, - const std::unordered_map& ref_index_map) { + const std::unordered_map& ref_index_map) { auto gather_op = dynamic_cast(consumer_tv->definition()); // Just return the producer index as is if this is not a gather @@ -460,22 +242,18 @@ kir::Val* getProducerIndexWithGather( ", producer_axis: ", producer_root_axis); - kir::SimplifyingIrBuilder ir_builder(GpuLower::current()->kernel()); auto offset = getProducerOffsetWithGather( - consumer_axis, consumer_tv, concrete_to_ref_map, ref_index_map); - return ir_builder.addExpr(producer_index, offset); + consumer_axis, consumer_tv, ref_index_map, true, concrete_to_ref_map); + return SimplifyingIrBuilder::addExpr(producer_index, offset); } // Adjusts a global consumer index when its root domain is partially // split. Note that non-global consumer indices don't need any // adjustment. 
-kir::Val* getGlobalConsumerOffsetWithPartialSplit(kir::IterDomain* root_id) { - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - - auto offset = gpu_lower->partialSplitMap().getStartOffset(root_id); +Val* getGlobalConsumerOffsetWithPartialSplit(IterDomain* root_id) { + auto offset = GpuLower::current()->partialSplitMap().getStartOffset(root_id); if (offset == nullptr) { - return ir_builder.zeroVal(); + return GpuLower::current()->kernel()->zeroVal(); } else { return offset; } @@ -488,13 +266,12 @@ kir::Val* getGlobalConsumerOffsetWithPartialSplit(kir::IterDomain* root_id) { // it needs to be added to the index. Also, when the producer itself // also has a non-zero split offset, that needs to be subtracted from // the index. -kir::Val* getProducerIndexWithPartialSplit( - kir::Val* producer_index, +Val* getProducerIndexWithPartialSplit( + Val* producer_index, IterDomain* producer_root_id, const TensorView* producer_tv, const TensorView* consumer_tv) { const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); auto p2c = PairwiseRootDomainMap(producer_tv, consumer_tv) @@ -509,31 +286,29 @@ kir::Val* getProducerIndexWithPartialSplit( auto consumer_offset = gpu_lower->partialSplitMap().getStartOffset(consumer_root_id); - auto consumer_offset_kir = consumer_offset == nullptr - ? ir_builder.zeroVal() - : gpu_lower->lowerValue(consumer_offset); + consumer_offset = consumer_offset == nullptr ? gpu_lower->kernel()->zeroVal() + : consumer_offset; auto producer_offset = gpu_lower->partialSplitMap().getStartOffset(producer_root_id); - auto producer_offset_kir = producer_offset == nullptr - ? ir_builder.zeroVal() - : gpu_lower->lowerValue(producer_offset); + producer_offset = producer_offset == nullptr ? gpu_lower->kernel()->zeroVal() + : producer_offset; // If the producer is on global memory, it's always allocated // without trimming the out-of-bounds region, so the consumer offset // should be added to the index. if (producer_tv->getMemoryType() == MemoryType::Global) { - if (consumer_offset_kir->isZeroInt()) { + if (consumer_offset->isZeroInt()) { return producer_index; } else { - return ir_builder.addExpr(producer_index, consumer_offset_kir); + return SimplifyingIrBuilder::addExpr(producer_index, consumer_offset); } } // Non-global case. Difference of the split offsets must be // accounted. 
- auto diff = ir_builder.subExpr(consumer_offset_kir, producer_offset_kir); + auto diff = SimplifyingIrBuilder::subExpr(consumer_offset, producer_offset); kir::ExpressionEvaluator ee; auto diff_eval = ee.evaluate(diff); // We currently only allow constant offsetting @@ -543,19 +318,16 @@ kir::Val* getProducerIndexWithPartialSplit( return producer_index; } - return ir_builder.addExpr( - producer_index, ir_builder.create(diff_eval.value())); + return SimplifyingIrBuilder::addExpr( + producer_index, SimplifyingIrBuilder::create(diff_eval.value())); } } // namespace void IndexCompute::handle(Split* split) { - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - - auto in_id = gpu_lower->lowerValue(split->in())->as(); - auto outer_id = gpu_lower->lowerValue(split->outer())->as(); - auto inner_id = gpu_lower->lowerValue(split->inner())->as(); + auto in_id = split->in()->as(); + auto outer_id = split->outer()->as(); + auto inner_id = split->inner()->as(); auto outer_it = index_map_.find(outer_id); auto inner_it = index_map_.find(inner_id); @@ -588,8 +360,8 @@ void IndexCompute::handle(Split* split) { } if (isZero(in_id)) { - index_map_[in_id] = ir_builder.create(0); - extent_map_[in_id] = ir_builder.create(0); + index_map_[in_id] = GpuLower::current()->kernel()->zeroVal(); + extent_map_[in_id] = GpuLower::current()->kernel()->zeroVal(); } else if (zero_merged_in && outer_zero) { index_map_[in_id] = inner_ind; extent_map_[in_id] = getExtent(inner_id); @@ -597,24 +369,22 @@ void IndexCompute::handle(Split* split) { index_map_[in_id] = outer_ind; extent_map_[in_id] = getExtent(outer_id); } else { - index_map_[in_id] = ir_builder.addExpr( - ir_builder.mulExpr(outer_ind, getExtent(inner_id)), inner_ind); + index_map_[in_id] = SimplifyingIrBuilder::addExpr( + SimplifyingIrBuilder::mulExpr(outer_ind, getExtent(inner_id)), + inner_ind); // The extent should be updated only when its allocation is // partial, i.e., zero_merged_in is true. See PR #1270. if (zero_merged_in) { - extent_map_[in_id] = - ir_builder.mulExpr(getExtent(outer_id), getExtent(inner_id)); + extent_map_[in_id] = SimplifyingIrBuilder::mulExpr( + getExtent(outer_id), getExtent(inner_id)); } } } void IndexCompute::handle(Merge* merge) { - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - - auto out_id = gpu_lower->lowerValue(merge->out())->as(); - auto outer_id = gpu_lower->lowerValue(merge->outer())->as(); - auto inner_id = gpu_lower->lowerValue(merge->inner())->as(); + auto out_id = merge->out(); + auto outer_id = merge->outer(); + auto inner_id = merge->inner(); auto out_it = index_map_.find(out_id); if (out_it == index_map_.end()) { @@ -622,7 +392,7 @@ void IndexCompute::handle(Merge* merge) { } auto out_ind = out_it->second; - auto zero = ir_builder.zeroVal(); + auto zero = GpuLower::current()->kernel()->zeroVal(); if (isZero(out_id)) { index_map_[outer_id] = zero; @@ -634,7 +404,7 @@ void IndexCompute::handle(Merge* merge) { return; } - if (!hasZeroMerged(out_id) && contig_ids.find(out_id) != contig_ids.end()) { + if (!hasZeroMerged(out_id) && contig_ids_.find(out_id) != contig_ids_.end()) { // Contiguous indexing path auto input_ids = ir_utils::iterDomainInputsOfOrderedAs( {merge->out()}, td_->getMaybeRFactorDomain()); @@ -642,18 +412,40 @@ void IndexCompute::handle(Merge* merge) { // Shouldn't hit this, but don't want to segfault if somehow we do. 
TORCH_INTERNAL_ASSERT(!input_ids.empty()); + // Try to find the last non broadcast entry to put the index in if it's a + // contiguous merge. This isn't strictly necessary but there's implicit + // assumptions in the indexing logic that assume broadcasted root domains + // can be ignored. This logic is just to try and match that logic. + // Initialize everything to zero. for (auto root_id : input_ids) { - index_map_[gpu_lower->lowerValue(root_id)->as()] = zero; + index_map_[root_id] = zero; + } + + // If all are broadcast we can just send the index to the last entry. + if (std::all_of(input_ids.begin(), input_ids.end(), [](IterDomain* id) { + // I don't think reductions can be in here, but strictly matching the + // logic in the indexing functions like + // getNonGlobalConsumerStridedIndices + return id->isBroadcast() || id->isReduction() || id->isStride(); + })) { + index_map_[*(input_ids.end() - 1)] = out_ind; + } else { + for (auto id_it = input_ids.rbegin(); id_it != input_ids.rend(); + id_it++) { + auto id = *id_it; + if (id->isBroadcast() || id->isReduction() || id->isStride()) { + continue; + } else { + index_map_[id] = out_ind; + break; + } + } } - index_map_[gpu_lower - ->lowerValue(*(input_ids.end() - 1)) - // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) - ->as()] = out_ind; return; } - kir::Val* inner_extent = getExtent(inner_id); + Val* inner_extent = getExtent(inner_id); // When the reference has halo extent for inner_id, that extent needs to // be used to un-merge @@ -718,8 +510,8 @@ void IndexCompute::handle(Merge* merge) { zero_merged_in_.emplace(inner_id); zero_merged_in_.emplace(outer_id); } else { - index_map_[outer_id] = ir_builder.divExpr(out_ind, inner_extent); - index_map_[inner_id] = ir_builder.modExpr(out_ind, inner_extent); + index_map_[outer_id] = SimplifyingIrBuilder::divExpr(out_ind, inner_extent); + index_map_[inner_id] = SimplifyingIrBuilder::modExpr(out_ind, inner_extent); } } @@ -735,17 +527,37 @@ void IndexCompute::handle(Expr* e) { BackwardVisitor::handle(e); } -// Otherwise warning on runBackward as it hides an overloaded virtual -// using TransformIter::runBackward; IndexCompute::IndexCompute( const TensorDomain* _td, - std::unordered_map initial_index_map, - std::unordered_map extent_map, - std::unordered_set zero_domains, - std::unordered_set zero_merged_in, - const std::vector& root_contiguity, - std::unordered_set preferred_paths, - std::unordered_map reference_halo_extent_map) + std::unordered_map initial_index_map, + std::unordered_map extent_map, + std::unordered_set zero_domains, + std::unordered_set zero_merged_in, + std::unordered_set preferred_paths, + std::unordered_map reference_halo_extent_map) + : IndexCompute( + _td, + std::move(initial_index_map), + std::move(extent_map), + std::move(zero_domains), + std::move(zero_merged_in), + ContigIDs( + _td->domain(), + _td->getMaybeRFactorDomain(), + std::vector(_td->getMaybeRFactorDomain().size(), false), + {}), + std::move(preferred_paths), + std::move(reference_halo_extent_map)) {} + +IndexCompute::IndexCompute( + const TensorDomain* _td, + std::unordered_map initial_index_map, + std::unordered_map extent_map, + std::unordered_set zero_domains, + std::unordered_set zero_merged_in, + const ContigIDs& contig_finder, + std::unordered_set preferred_paths, + std::unordered_map reference_halo_extent_map) : td_(_td), index_map_(std::move(initial_index_map)), extent_map_(std::move(extent_map)), @@ -757,20 +569,15 @@ IndexCompute::IndexCompute( // Make sure we recompute any indices we can 
that map to a contiguous access // in physical memory. - if (std::any_of(root_contiguity.begin(), root_contiguity.end(), [](bool b) { - return b; - })) { - ContigIDs contig_finder( - td_->domain(), td_->getMaybeRFactorDomain(), root_contiguity); - contig_ids = contig_finder.contigIDs(); - auto within_contig = contig_finder.withinContigIDs(); - for (auto contig_id : contig_ids) { - if (index_map_.find(contig_id) != index_map_.end()) { - TORCH_INTERNAL_ASSERT( - within_contig.find(contig_id) != within_contig.end()); - for (auto id : within_contig.at(contig_id)) { - index_map_.erase(id); - } + contig_ids_ = contig_finder.contigIDs(); + root_to_indexed_id_ = contig_finder.rootToIndexedID(); + const auto& within_contig = contig_finder.withinContigIDs(); + for (auto contig_id : contig_ids_) { + if (index_map_.find(contig_id) != index_map_.end()) { + TORCH_INTERNAL_ASSERT( + within_contig.find(contig_id) != within_contig.end()); + for (auto id : within_contig.at(contig_id)) { + index_map_.erase(id); } } } @@ -783,7 +590,7 @@ void IndexCompute::run() { traverseFrom(td_->fusion(), domain_vals, false); } -kir::Val* IndexCompute::getExtent(kir::IterDomain* id) { +Val* IndexCompute::getExtent(IterDomain* id) const { // Pick from extent_map_ if available. Previously parallel // dimensions were ued (e.g., blockDim.x), however, it would result // in out-of-bounds errors when the extent of IterDomain is smaller @@ -795,34 +602,30 @@ kir::Val* IndexCompute::getExtent(kir::IterDomain* id) { } } -bool IndexCompute::hasZeroMerged(kir::IterDomain* id) const { +bool IndexCompute::hasZeroMerged(IterDomain* id) const { return zero_merged_in_.find(id) != zero_merged_in_.end() || isZero(id); } -bool IndexCompute::isZero(kir::IterDomain* id) const { +bool IndexCompute::isZero(IterDomain* id) const { return zero_domains_.find(id) != zero_domains_.end(); } IndexCompute IndexCompute::updateIndexCompute( const TensorDomain* new_td, const std::unordered_map& id_map, - const std::vector& root_contiguity, - const std::unordered_map& - reference_halo_extent_map) { + const ContigIDs& contig_finder, + const std::unordered_map& reference_halo_extent_map) + const { FUSER_PERF_SCOPE("GpuLower::Lower::updateIndexCompute"); - const auto gpu_lower = GpuLower::current(); - - std::unordered_map updated_index_map; - std::unordered_map updated_extent_map; - std::unordered_set updated_zero_domains; - std::unordered_set updated_zero_merged_in; + std::unordered_map updated_index_map; + std::unordered_map updated_extent_map; + std::unordered_set updated_zero_domains; + std::unordered_set updated_zero_merged_in; for (auto id_entry : id_map) { - kir::IterDomain* prev_id = - gpu_lower->lowerValue(id_entry.first)->as(); - kir::IterDomain* new_id = - gpu_lower->lowerValue(id_entry.second)->as(); + IterDomain* prev_id = id_entry.first; + IterDomain* new_id = id_entry.second; if (index_map_.find(prev_id) != index_map_.end()) { updated_index_map[new_id] = index_map_.at(prev_id); @@ -845,7 +648,7 @@ IndexCompute IndexCompute::updateIndexCompute( updated_extent_map, updated_zero_domains, updated_zero_merged_in, - root_contiguity, + contig_finder, {}, reference_halo_extent_map); updated_index_compute.run(); @@ -859,8 +662,8 @@ class UpdateLeafIndices : public IterVisitor { public: UpdateLeafIndices( const TensorDomain* td, - std::unordered_map initial_index_map, - std::unordered_map extent_map) + std::unordered_map initial_index_map, + std::unordered_map extent_map) : td_(td), index_map_(std::move(initial_index_map)), 
extent_map_(std::move(extent_map)) { @@ -870,11 +673,11 @@ class UpdateLeafIndices : public IterVisitor { traverseFrom(td_->fusion(), domain_vals, false); } - const std::unordered_map& indexMap() const { + const std::unordered_map& indexMap() const { return index_map_; } - const std::unordered_map& extentMap() const { + const std::unordered_map& extentMap() const { return extent_map_; } @@ -882,13 +685,9 @@ class UpdateLeafIndices : public IterVisitor { using IterVisitor::handle; void handle(Split* split) override { - const auto gpu_lower = GpuLower::current(); - - auto in_id = gpu_lower->lowerValue(split->in())->as(); - auto outer_id = - gpu_lower->lowerValue(split->outer())->as(); - auto inner_id = - gpu_lower->lowerValue(split->inner())->as(); + auto in_id = split->in(); + auto outer_id = split->outer(); + auto inner_id = split->inner(); // Nothing need to be done when mappings for the output axes // already exist. @@ -899,22 +698,20 @@ class UpdateLeafIndices : public IterVisitor { return; } - kir::IrBuilder ir_builder(gpu_lower->kernel()); - auto factor = gpu_lower->lowerValue(split->factor()); - index_map_[inner_id] = ir_builder.modExpr(index_map_[in_id], factor); + auto factor = split->factor(); + index_map_[inner_id] = + SimplifyingIrBuilder::modExpr(index_map_[in_id], factor); extent_map_[inner_id] = factor; - index_map_[outer_id] = ir_builder.divExpr(index_map_[in_id], factor); - extent_map_[outer_id] = ir_builder.ceilDivExpr(getExtent(in_id), factor); + index_map_[outer_id] = + SimplifyingIrBuilder::divExpr(index_map_[in_id], factor); + extent_map_[outer_id] = + SimplifyingIrBuilder::ceilDivExpr(getExtent(in_id), factor); } void handle(Merge* merge) override { - const auto gpu_lower = GpuLower::current(); - - auto out_id = gpu_lower->lowerValue(merge->out())->as(); - auto outer_id = - gpu_lower->lowerValue(merge->outer())->as(); - auto inner_id = - gpu_lower->lowerValue(merge->inner())->as(); + auto out_id = merge->out(); + auto outer_id = merge->outer(); + auto inner_id = merge->inner(); // Nothing need to be done when mappings for the output axes // already exist. @@ -927,17 +724,17 @@ class UpdateLeafIndices : public IterVisitor { TORCH_INTERNAL_ASSERT( index_map_.find(inner_id) != index_map_.end(), "Inner ID not found"); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - index_map_[out_id] = ir_builder.mulExpr( + index_map_[out_id] = SimplifyingIrBuilder::mulExpr( index_map_[inner_id], - ir_builder.mulExpr(index_map_[outer_id], getExtent(inner_id))); + SimplifyingIrBuilder::mulExpr( + index_map_[outer_id], getExtent(inner_id))); extent_map_[out_id] = - ir_builder.mulExpr(getExtent(outer_id), getExtent(inner_id)); + SimplifyingIrBuilder::mulExpr(getExtent(outer_id), getExtent(inner_id)); } // return extent_map_[id] if exists, else return id->extent() - kir::Val* getExtent(kir::IterDomain* id) { + Val* getExtent(IterDomain* id) { if (extent_map_.find(id) != extent_map_.end()) { return extent_map_.at(id); } else { @@ -947,25 +744,21 @@ class UpdateLeafIndices : public IterVisitor { private: const TensorDomain* td_; - std::unordered_map index_map_; - std::unordered_map extent_map_; + std::unordered_map index_map_; + std::unordered_map extent_map_; }; // Returns halo-extended extent if id has halo. Otherwise, just // returns id->extent. 
-kir::Val* getHaloExtentOfRootAxis( - IterDomain* id, - kir::Val* normal_extent = nullptr) { - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - +Val* getHaloExtentOfRootAxis(IterDomain* id, Val* normal_extent = nullptr) { if (normal_extent == nullptr) { - normal_extent = gpu_lower->lowerValue(id->extent()); + normal_extent = id->extent(); } - const auto& halo = gpu_lower->haloInfo().getRootAxisInfo(id); + const auto& halo = GpuLower::current()->haloInfo().getRootAxisInfo(id); if (halo.hasHalo()) { - auto halo_extent = ir_builder.addExpr(normal_extent, halo.width()); + auto halo_extent = SimplifyingIrBuilder::addExpr( + normal_extent, SimplifyingIrBuilder::create(halo.width())); return halo_extent; } else { return normal_extent; @@ -976,17 +769,16 @@ kir::Val* getHaloExtentOfRootAxis( IndexSwizzle::IndexSwizzle( const TensorView* tv, - std::unordered_map initial_index_map, - std::unordered_map extent_map, - std::unordered_set zero_domains, - std::unordered_set zero_merged_in) + std::unordered_map initial_index_map, + std::unordered_map extent_map, + std::unordered_set zero_domains, + std::unordered_set zero_merged_in) : IndexCompute( tv->domain(), std::move(initial_index_map), std::move(extent_map), std::move(zero_domains), - std::move(zero_merged_in), - std::vector(tv->getRootDomain().size(), false)), + std::move(zero_merged_in)), tv_(tv), swizzle_type_(tv->swizzleType()), ids_to_swizzle_(tv->axesToSwizzle()) {} @@ -996,8 +788,6 @@ void IndexSwizzle::run() { swizzle_type_ == SwizzleType::NoSwizzle || swizzle_type_ == SwizzleType::Transpose, "Invalid swizzle type"); - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); if (swizzle_type_ == SwizzleType::Transpose) { // Shifts the second axis by the first axis as ((idx_1 + idx_2) % // ext). Alternatively, ((idx_1 - idx_2) & (ext - 1)) would also @@ -1013,20 +803,16 @@ void IndexSwizzle::run() { IterDomain* id_to_swizzle_i = ids_to_swizzle_.at(0); IterDomain* id_to_swizzle_j = ids_to_swizzle_.at(1); - kir::IterDomain* id_to_swizzle_i_kir = - gpu_lower->lowerValue(id_to_swizzle_i)->as(); - kir::IterDomain* id_to_swizzle_j_kir = - gpu_lower->lowerValue(id_to_swizzle_j)->as(); - - if (indexMap().find(id_to_swizzle_i_kir) != indexMap().end() && - indexMap().find(id_to_swizzle_j_kir) != indexMap().end()) { - auto idx_to_swizzle_i = indexMap().at(id_to_swizzle_i_kir); - auto idx_to_swizzle_j = indexMap().at(id_to_swizzle_j_kir); - - auto swizzled_idx = ir_builder.modExpr( - ir_builder.addExpr(idx_to_swizzle_i, idx_to_swizzle_j), - id_to_swizzle_j_kir->extent()); - index_map_[id_to_swizzle_j_kir] = swizzled_idx; + + if (indexMap().find(id_to_swizzle_i) != indexMap().end() && + indexMap().find(id_to_swizzle_j) != indexMap().end()) { + auto idx_to_swizzle_i = indexMap().at(id_to_swizzle_i); + auto idx_to_swizzle_j = indexMap().at(id_to_swizzle_j); + + auto swizzled_idx = SimplifyingIrBuilder::modExpr( + SimplifyingIrBuilder::addExpr(idx_to_swizzle_i, idx_to_swizzle_j), + id_to_swizzle_j->extent()); + index_map_[id_to_swizzle_j] = swizzled_idx; swizzled_ids_.insert(id_to_swizzle_j); IndexCompute::run(); } @@ -1055,18 +841,14 @@ namespace { // to loop indices as well as a set of loops that do not contribute to // indexing. 
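IndexSwizzle::run above rewrites the second swizzled index as (idx_i + idx_j) % extent_j. A standalone sketch (tile extents hypothetical) showing that, for each fixed idx_i, the rewrite is a permutation of the columns, so no two elements of a row collide:

#include <cassert>
#include <vector>

int main() {
  const int extent_i = 8, extent_j = 8;
  for (int i = 0; i < extent_i; ++i) {
    std::vector<bool> seen(extent_j, false);
    for (int j = 0; j < extent_j; ++j) {
      const int swizzled_j = (i + j) % extent_j;  // the rewrite applied above
      assert(!seen[swizzled_j]);                  // each swizzled column is hit exactly once
      seen[swizzled_j] = true;
    }
  }
  return 0;
}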
std::pair< - std::unordered_map, + std::unordered_map, std::unordered_set> indexMapFromTV( const TensorView* tv, const std::vector& loops, - const std::pair& alloc_point, - bool as_consumer) { - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - - auto alloc_loop = alloc_point.first; - + kir::ForLoop* alloc_loop, + bool as_consumer, + kir::ForLoop* double_buffer_loop = nullptr) { bool within_alloc = false; if (alloc_loop == nullptr) { within_alloc = true; @@ -1076,7 +858,14 @@ indexMapFromTV( const bool is_shared = tv->getMemoryType() == MemoryType::Shared; const bool is_local = tv->getMemoryType() == MemoryType::Local; - std::unordered_map loop_to_ind_map; + std::unordered_map loop_to_ind_map; + + // Check if the current op has an implicit loop implemented + // within an mma instruction. + bool within_mma_loops = + std::any_of(loops.begin(), loops.end(), [](kir::ForLoop* fl) { + return fl->iter_domain()->isMma(); + }); // When indexed as a producer, the parallel types of the the // producer domains may not be the same as those of the loops, but @@ -1085,17 +874,18 @@ indexMapFromTV( // with zero isn't valid. That's only valid when there's a matching // IterDomain in the producer tensor that has the same parallel // type. - auto find_matching_parallel_domain = [tv](kir::IterDomain* id) -> bool { + auto find_matching_parallel_domain = [tv](IterDomain* id) -> bool { const auto gpu_lower = GpuLower::current(); auto it = std::find_if( tv->domain()->domain().begin(), tv->domain()->domain().end(), [&](IterDomain* tv_id) { - auto kir_tv_id = gpu_lower->lowerValue(tv_id)->as(); // Matching is done using the index and loop maps. See // validateParallelize as well. - return gpu_lower->caIndexMap().areMapped(id, kir_tv_id) || - (gpu_lower->caLoopMap().areMapped(id, kir_tv_id) && + return gpu_lower->caMap()->areMapped( + id, tv_id, IdMappingMode::EXACT) || + (GpuLower::current()->caMap()->areMapped( + id, tv_id, IdMappingMode::PERMISSIVE) && ir_utils::derivedFromRootCAAxes(tv, tv_id)); }); if (it == tv->domain()->domain().end()) { @@ -1103,7 +893,7 @@ indexMapFromTV( } auto corresponding_domain = *it; - return corresponding_domain->getParallelType() == id->parallelType(); + return corresponding_domain->getParallelType() == id->getParallelType(); }; // Track domains that do not contibute to the resulting @@ -1113,9 +903,16 @@ indexMapFromTV( std::unordered_set zero_loops; for (auto loop : loops) { - kir::Val* idx = nullptr; - const auto same_parallel_type = - as_consumer || find_matching_parallel_domain(loop->iter_domain()); + Val* idx = nullptr; + const auto same_parallel_type = as_consumer || + find_matching_parallel_domain(loop->iter_domain()) || + // Note && TODO: + // mma swizzled lane_id does not map naturally from producer + // to consumer but they should still be detected as same + // parallel type. In a follow up may want to extent + // find_matching_parallel_domain to cover this case. + (within_mma_loops && + loop->iter_domain()->getParallelType() == ParallelType::TIDx); // See also LoopNestGenerator::pushAlloc. 
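indexMapFromTV above decides, per for-loop, whether the loop index contributes to a tensor's offset or is replaced with zero. Ignoring the allocation-point, MMA and matching-parallel-type refinements handled in the hunk, the core policy can be sketched as follows (helper name and signature are hypothetical, not part of the diff):

enum class MemoryType { Global, Shared, Local };

// Whether a parallelized loop's index should appear in the tensor's offset.
bool loopIndexContributes(MemoryType mt, bool is_block_dim, bool is_thread_dim) {
  switch (mt) {
    case MemoryType::Global:
      return true;                             // global offsets use all loop indices
    case MemoryType::Shared:
      return !is_block_dim;                    // each block owns its own shared buffer
    case MemoryType::Local:
      return !is_block_dim && !is_thread_dim;  // registers are private per thread
  }
  return true;
}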
// NOLINTNEXTLINE(bugprone-branch-clone) if (!within_alloc) { @@ -1123,7 +920,7 @@ indexMapFromTV( (loop->iter_domain()->isThread() && is_global)) { idx = loop->index(); } else { - idx = ir_builder.zeroVal(); + idx = GpuLower::current()->kernel()->zeroVal(); zero_loops.insert(loop); } } else if ( @@ -1143,16 +940,24 @@ indexMapFromTV( // Similarly for local memory tensors, zero replacement can be // only done when there's a matching domain with the same // parallel type - (loop->iter_domain()->isThread() && is_local && same_parallel_type) || - loop->vectorize()) { - idx = ir_builder.zeroVal(); - if (!loop->vectorize()) { - zero_loops.insert(loop); - } + (loop->iter_domain()->isThread() && is_local && same_parallel_type)) { + idx = GpuLower::current()->kernel()->zeroVal(); + zero_loops.insert(loop); } else { idx = loop->index(); } + // If the loop is trivial, the loop index can only be the loop + // start value. + if (idx == loop->index() && loop->isTrivial()) { + idx = loop->start(); + } + + if (loop == double_buffer_loop) { + idx = SimplifyingIrBuilder::addExpr( + idx, GpuLower::current()->kernel()->oneVal()); + } + loop_to_ind_map[loop] = idx; if (!within_alloc && loop == alloc_loop) { @@ -1184,8 +989,6 @@ void ensureStaticIndexing( within_alloc = true; } - const auto gpu_lower = GpuLower::current(); - for (auto loop : loops) { if (!within_alloc) { if (loop == alloc_loop) { @@ -1193,7 +996,7 @@ void ensureStaticIndexing( } continue; } - kir::IterDomain* loop_id = loop->iter_domain(); + IterDomain* loop_id = loop->iter_domain(); if (loop->vectorize() || loop_id->isThread()) { continue; } @@ -1203,7 +1006,7 @@ void ensureStaticIndexing( auto it = std::find_if( tv->domain()->domain().begin(), tv->domain()->domain().end(), - [loop_id, gpu_lower, &id_map](IterDomain* id) { + [loop_id, &id_map](IterDomain* id) { if (id->isBroadcast() || id->isReduction() || id->isStride()) { return false; } @@ -1211,8 +1014,8 @@ void ensureStaticIndexing( if (id_replacement != id_map.end()) { id = id_replacement->second; } - auto kir_id = gpu_lower->lowerValue(id)->as(); - return gpu_lower->caLoopMap().areMapped(loop_id, kir_id); + return GpuLower::current()->caMap()->areMapped( + loop_id, id, IdMappingMode::PERMISSIVE); }); if (it != tv->domain()->domain().end()) { loop->requireUnroll(); @@ -1229,7 +1032,7 @@ void ensureStaticIndexing( // operation. 
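When the loop is the double-buffer loop, the hunk above indexes the producer with (loop index + 1): the next stage is loaded while the current one is consumed. A plain C++ sketch of that pipeline shape, with hypothetical buffers and a trivial compute step; it assumes in is non-empty and out.size() <= in.size():

#include <vector>

void pipelined_copy(const std::vector<float>& in, std::vector<float>& out) {
  float staged[2];                        // two copies of the staged element
  staged[0] = in[0];                      // prologue: load stage 0
  for (size_t i = 0; i < out.size(); ++i) {
    if (i + 1 < in.size()) {
      staged[(i + 1) % 2] = in[i + 1];    // producer read indexed with (loop index + 1)
    }
    out[i] = staged[i % 2] * 2.0f;        // consumer uses the current stage
  }
}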
std::unordered_map indexMapReferenceTo( const TensorView* tv, - const ComputeAtMap& ca_map, + const std::unique_ptr& ca_map, const std::unordered_map& reference_concrete_to_id_map, bool root_only = false) { @@ -1237,7 +1040,8 @@ std::unordered_map indexMapReferenceTo( auto gen_map = [&](const auto& pids) { for (auto p_id : pids) { - auto concrete_id = ca_map.getConcreteMappedID(p_id); + auto concrete_id = + ca_map->getConcreteMappedID(p_id, IdMappingMode::EXACT); auto ref_id_it = reference_concrete_to_id_map.find(concrete_id); if (ref_id_it != reference_concrete_to_id_map.end()) { index_map_ref_to_producer[ref_id_it->second] = p_id; @@ -1258,18 +1062,153 @@ std::unordered_map indexMapReferenceTo( return index_map_ref_to_producer; } +Val* hoistConsumerIndex( + IterDomain* consumer_root_id, + const TensorView* consumer_tv, + const IndexCompute& consumer_indexing, + TensorDomain* ref_td, + const IndexCompute& ref_indexing, + const std::vector& loops, + Val* index) { + // If index has no defining expression, there's nothing to hoist + if (isDisabled(DisableOption::IndexHoist) || index->definition() == nullptr) { + return index; + } + + // The old swizzle interface, which should be deprecated, is not + // supported. + if (consumer_tv->swizzleType() != SwizzleType::NoSwizzle) { + return index; + } + + // auto indexed_consumer_id = consumer_root_id; + // Find the true indexed domain, which can be a merged contiguous domain. + auto contig_id_it = consumer_indexing.rootToContigID().find(consumer_root_id); + TORCH_INTERNAL_ASSERT( + contig_id_it != consumer_indexing.rootToContigID().end(), + "Consumer indexed ID not found: ", + consumer_root_id->toString()); + auto indexed_consumer_id = contig_id_it->second; + // Make sure this contig ID is indeed indexed + TORCH_INTERNAL_ASSERT( + consumer_indexing.indexMap().find(contig_id_it->second) != + consumer_indexing.indexMap().end(), + "Invalid contig index: ", + contig_id_it->second->toString()); + + // Insert the index into the common index map. A previously inserted + // val can be returned. + auto common_index = GpuLower::current() + ->commonIndexMap() + .insert( + indexed_consumer_id, + consumer_tv->domain(), + ref_td, + ref_indexing.indexMap(), + loops, + index) + .first; + + return common_index; +} + +std::unordered_map invertOneToOneMap( + const std::unordered_map& map) { + std::unordered_map inverted; + for (const auto& kv : map) { + bool inserted = inverted.emplace(kv.second, kv.first).second; + TORCH_INTERNAL_ASSERT( + inserted, + "Multiple mappings to the same value detected: ", + kv.second->toString()); + } + return inverted; +} + +Val* hoistProducerIndex( + IterDomain* producer_root_id, + const TensorView* producer_tv, + const IndexCompute& producer_indexing, + const TensorView* consumer_tv, + const std::unordered_map& p2c_map, + TensorDomain* ref_td, + const IndexCompute& ref_indexing, + const std::vector& loops, + Val* index) { + // If index has no defining expression, there's nothing to hoist + if (isDisabled(DisableOption::IndexHoist) || index->definition() == nullptr) { + return index; + } + + // The old swizzle interface, which should be deprecated, is not + // supported. 
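hoistConsumerIndex above funnels the per-root index through GpuLower's common index map so that an identical index expression built earlier can be reused instead of re-emitted. A much-simplified standalone sketch of that memoization idea, with strings standing in for IR values (struct and member names are hypothetical):

#include <string>
#include <unordered_map>
#include <utility>

struct CommonIndexCache {
  // Key identifying the indexed domain and loop structure -> cached expression.
  std::unordered_map<std::string, std::string> cache;

  // Returns the expression to use and whether a previously inserted one was reused.
  std::pair<std::string, bool> insert(const std::string& key, std::string index_expr) {
    auto it = cache.find(key);
    if (it != cache.end()) {
      return {it->second, true};   // hoisted: reuse the earlier expression
    }
    cache.emplace(key, index_expr);
    return {std::move(index_expr), false};
  }
};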
+ if (producer_tv->swizzleType() != SwizzleType::NoSwizzle) { + return index; + } + + // auto indexed_producer_id = producer_root_id; + auto contig_id_it = producer_indexing.rootToContigID().find(producer_root_id); + TORCH_INTERNAL_ASSERT( + contig_id_it != producer_indexing.rootToContigID().end(), + "Producer indexed ID not found: ", + producer_root_id->toString()); + auto indexed_producer_id = contig_id_it->second; + // Make sure this contig ID is indeed indexed + TORCH_INTERNAL_ASSERT( + producer_indexing.indexMap().find(indexed_producer_id) != + producer_indexing.indexMap().end(), + "Invalid contig id: ", + indexed_producer_id->toString()); + + // Use the corresponding consumer domain to find matching + // for-loops. Note that there's no CA mapping with the producer + // domains as the producer TensorDomain is a temporary replay + // domain. + + auto indexed_consumer_id_it = p2c_map.find(indexed_producer_id); + + // There can be no corresponding consumer ID. For example, consider: + // consumer: [b1, i2, i3] + // producer: [i2, i3]. + // Suppose the consumer is transformed as: + // consumer: [(b1*i2)*i3] + // Then the producer would be transformed when indexed: + // producer: [i2*i3] + // Assuming i2 and i3 are contiguous, the producer indexing is done + // with the mreged i2*i3 domain, but there's no domain in the + // cosumer that maps with the producer indexed domain. + // It seems non-trivial to support patterns like this. Skip for now. + if (indexed_consumer_id_it == p2c_map.end()) { + return index; + } + + IterDomain* indexed_consumer_id = indexed_consumer_id_it->second; + + auto common_index = GpuLower::current() + ->commonIndexMap() + .insert( + indexed_consumer_id, + consumer_tv->domain(), + ref_td, + ref_indexing.indexMap(), + loops, + index) + .first; + + return common_index; +} + } // namespace -std::vector Index::getGlobalProducerStridedIndices( +std::vector Index::getGlobalProducerStridedIndices( TensorView* producer_tv, const TensorView* consumer_tv, const std::vector& loops) { FUSER_PERF_SCOPE("GpuLower::Lower::getGlobalProducerIndex"); const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); // Get a reference tensor replayed as existing loop structure - auto reference = IndexReferenceReplay::getReference(loops); + auto reference = IndexReferenceReplay::getReference(loops, consumer_tv); auto reference_domain = reference.domain; auto reference_id_map = reference.concrete_to_id; @@ -1286,19 +1225,24 @@ std::vector Index::getGlobalProducerStridedIndices( // Map everything we can from reference to producer using compute at index // map. Use consumer as a proxy between producer and the generated reference. std::unordered_map index_map_ref_to_producer; - { - // This replay has to be consistent with compute at index map. - BestEffortReplay replay_producer_as_consumer( - producer_tv->domain()->domain(), - consumer_tv->domain()->domain(), - pairwise_map.mapConsumerToProducer( - consumer_tv->domain(), producer_tv->domain())); - const auto& c2p_map = replay_producer_as_consumer.getReplay(); + // Map sent to best effort replay needs to match the exact incantation for + // compute_at_mode.cpp with MappingMode::Index + auto c2p_root_map = + PairwiseRootDomainMap(producer_tv, consumer_tv, true) + .mapConsumerToProducer(consumer_tv->domain(), producer_tv->domain()); + // This replay has to be consistent with compute at index map. 
+ BestEffortReplay replay_producer_as_consumer( + producer_tv->domain()->domain(), + consumer_tv->domain()->domain(), + c2p_root_map); + + const auto& c2p_map = replay_producer_as_consumer.getReplay(); + const auto p2c_map = invertOneToOneMap(c2p_map); + { std::unordered_map index_map_ref_to_consumer = - indexMapReferenceTo( - consumer_tv, gpu_lower->caIndexMap(), reference_id_map); + indexMapReferenceTo(consumer_tv, gpu_lower->caMap(), reference_id_map); for (auto entry : index_map_ref_to_consumer) { auto r_id = entry.first; @@ -1311,9 +1255,12 @@ std::vector Index::getGlobalProducerStridedIndices( } } + kir::ForLoop* db_loop = gpu_lower->doubleBufferInfo().getDoubleBufferLoop( + consumer_tv, loops, true); + // Index into the reference tensor. Reference indexing will handle vectorized // dims where index should be set to 0 - auto ref_compute = getReferenceIndexing(loops, reference_domain); + auto ref_compute = getReferenceIndexing(loops, reference_domain, db_loop); // Forward vectorized IDs to index into producer correctly // We want p_id to be vectorized like consumer just for the indexing, then we @@ -1338,11 +1285,18 @@ std::vector Index::getGlobalProducerStridedIndices( const auto reference_halo_extent_map = getReferenceHaloExtentMap(reference, index_map_ref_to_producer); + ContigIDs contig_finder( + producer_tv->domain()->domain(), + producer_tv->getMaybeRFactorDomain(), + producer_tv->domain()->contiguity(), + reference_id_map, + p2c_map); + // Index into producer using reference indexing auto producer_indexing = ref_compute.updateIndexCompute( producer_tv->domain(), index_map_ref_to_producer, - producer_tv->domain()->contiguity(), + contig_finder, reference_halo_extent_map); // Revert p_ids @@ -1355,25 +1309,25 @@ std::vector Index::getGlobalProducerStridedIndices( auto root_dom = producer_tv->getMaybeRFactorDomain(); // TODO: Abstract stride logic to reuse with consumer indexing - auto zero = ir_builder.create(0); - std::vector strides(root_dom.size(), nullptr); + std::vector strides(root_dom.size(), nullptr); { int stride_i = 0; for (const auto i : c10::irange(root_dom.size())) { if (root_dom[i]->isReduction() || root_dom[i]->getIterType() == IterType::BroadcastWithoutStride) { - strides[i] = zero; + strides[i] = GpuLower::current()->kernel()->oneVal(); continue; } std::stringstream ss; ss << "T" << producer_tv->name() << ".stride[" << stride_i++ << "]"; - strides[i] = ir_builder.create(ss.str(), DataType::Int); + strides[i] = + SimplifyingIrBuilder::create(ss.str(), DataType::Int); } } TORCH_INTERNAL_ASSERT( root_dom.size() == producer_tv->domain()->contiguity().size()); - kir::Val* cur_contig_stride = ir_builder.create(1); + Val* cur_contig_stride = GpuLower::current()->kernel()->oneVal(); for (const auto i : c10::irange(root_dom.size())) { auto dim = root_dom.size() - i - 1; if (root_dom[dim]->isReduction()) { @@ -1383,24 +1337,26 @@ std::vector Index::getGlobalProducerStridedIndices( continue; } - kir::Val* root_ind = nullptr; - auto kir_root_dom = - gpu_lower->lowerValue(root_dom[dim])->as(); - if (producer_indexing.indexMap().find(kir_root_dom) != + Val* root_ind = nullptr; + if (producer_indexing.indexMap().find(root_dom[dim]) != producer_indexing.indexMap().end()) { - root_ind = producer_indexing.indexMap().at(kir_root_dom); + root_ind = producer_indexing.indexMap().at(root_dom[dim]); } else if (root_dom[dim]->getIterType() == IterType::BroadcastWithStride) { - root_ind = zero; + root_ind = GpuLower::current()->kernel()->zeroVal(); } TORCH_INTERNAL_ASSERT( root_ind != 
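The p2c map above is produced by inverting the replay's c2p map. A templated standalone version of that inversion, mirroring the invertOneToOneMap helper added earlier in this diff (assert stands in for TORCH_INTERNAL_ASSERT):

#include <cassert>
#include <unordered_map>

template <typename K, typename V>
std::unordered_map<V, K> invertOneToOneMap(const std::unordered_map<K, V>& map) {
  std::unordered_map<V, K> inverted;
  for (const auto& kv : map) {
    const bool inserted = inverted.emplace(kv.second, kv.first).second;
    assert(inserted && "multiple mappings to the same value detected");
    (void)inserted;  // silence unused-variable warnings when asserts are compiled out
  }
  return inverted;
}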
nullptr, - "Couldn't find root mapping for TV", - producer_tv->name(), + "Couldn't find root mapping for ", + producer_tv->toString(), " dim: ", - i, + dim, " id: ", - root_dom[dim]); + root_dom[dim]->toString(), + ", reference domain: ", + reference_domain->toString(), + ", reference root: ", + ir_utils::toString(reference_domain->getRootDomain())); if (producer_tv->domain()->contiguity()[dim]) { // If contig, used the stored stride which may be the previous @@ -1410,12 +1366,13 @@ std::vector Index::getGlobalProducerStridedIndices( // by extent of this dimension auto root_dim_extent = getHaloExtentOfRootAxis(root_dom[dim]); cur_contig_stride = - ir_builder.mulExpr(cur_contig_stride, root_dim_extent); + SimplifyingIrBuilder::mulExpr(cur_contig_stride, root_dim_extent); } else { // If non contiguous dimension, keep local stride information, set cur // stride to local stride * local raw extent auto root_dim_extent = getHaloExtentOfRootAxis(root_dom[dim]); - cur_contig_stride = ir_builder.mulExpr(strides[dim], root_dim_extent); + cur_contig_stride = + SimplifyingIrBuilder::mulExpr(strides[dim], root_dim_extent); } } @@ -1423,7 +1380,8 @@ std::vector Index::getGlobalProducerStridedIndices( loops.empty() ? nullptr : loops.back()->vectorize_shift(); // Global striding - std::vector strided_inds(root_dom.size(), ir_builder.zeroVal()); + std::vector strided_inds( + root_dom.size(), GpuLower::current()->kernel()->zeroVal()); for (const auto i : c10::irange(root_dom.size())) { // If the domain is derived from a trivial reduction, no indexing // to create. @@ -1434,20 +1392,33 @@ std::vector Index::getGlobalProducerStridedIndices( continue; } - auto kir_root_dom_i = - gpu_lower->lowerValue(root_dom[i])->as(); - TORCH_INTERNAL_ASSERT( - producer_indexing.indexMap().find(kir_root_dom_i) != + producer_indexing.indexMap().find(root_dom[i]) != producer_indexing.indexMap().end(), "Couldn't find root mapping for TV", producer_tv->name(), " dim: ", i, " id: ", - kir::toString(kir_root_dom_i)); + root_dom[i]->toString(), + ", reference domain: ", + reference_domain->toString(), + ", reference root: ", + ir_utils::toString(reference_domain->getRootDomain())); + + auto root_ind = producer_indexing.indexMap().at(root_dom[i]); - auto root_ind = producer_indexing.indexMap().at(kir_root_dom_i); + // index hoist must be done before the adjustments for halo + root_ind = hoistProducerIndex( + root_dom[i], + producer_tv, + producer_indexing, + consumer_tv, + p2c_map, + reference.domain, + ref_compute, + loops, + root_ind); root_ind = getProducerIndexWithHalo(producer_tv, i, root_ind, consumer_tv); @@ -1465,28 +1436,71 @@ std::vector Index::getGlobalProducerStridedIndices( if (root_ind->isZeroInt()) { continue; } else { - auto strided_ind = ir_builder.mulExpr(root_ind, strides[i]); + auto strided_ind = SimplifyingIrBuilder::mulExpr(root_ind, strides[i]); if (i == root_dom.size() - 1 && vectorize_shift != nullptr) { - strided_inds[i] = ir_builder.addExpr(strided_ind, vectorize_shift); + strided_inds[i] = + SimplifyingIrBuilder::addExpr(strided_ind, vectorize_shift); } else { strided_inds[i] = strided_ind; } } } + // Save indexing info necessary for validating vectorization at launch time + fillProducerVectorizedContigRootDomains( + producer_tv, consumer_tv, c2p_map, contig_finder); + return strided_inds; } +namespace { + +// Maps all producer domains to consumer with broadcast +// forwarding. Used to find the allocation position. 
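The stride loop above walks root domains from the innermost outwards, carrying cur_contig_stride so that runs of contiguous dimensions collapse onto one accumulated stride while non-contiguous dimensions fall back to the runtime T.stride[...] value. A plain-integer sketch of the same walk (extents, contiguity flags and runtime strides are hypothetical inputs):

#include <vector>

std::vector<long> resolveStrides(const std::vector<long>& extents,
                                 const std::vector<bool>& contiguity,
                                 const std::vector<long>& runtime_strides) {
  const size_t n = extents.size();
  std::vector<long> strides(n, 1);
  long cur_contig_stride = 1;
  for (size_t i = 0; i < n; ++i) {
    const size_t dim = n - 1 - i;  // innermost to outermost
    if (contiguity[dim]) {
      strides[dim] = cur_contig_stride;         // contiguous: use the accumulated product
      cur_contig_stride *= extents[dim];
    } else {
      strides[dim] = runtime_strides[dim];      // keep the runtime stride
      cur_contig_stride = runtime_strides[dim] * extents[dim];
    }
  }
  return strides;
}

For a fully contiguous tensor this reduces to the familiar row-major strides.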
+std::unordered_map mapAllProducerDomainsToConsumer( + TensorView* producer_tv, + const TensorView* consumer_tv) { + // This map has forwarded broadcast axes, it should only be used to compute + // the allocation position of the producer, and to figure out which producer + // indices are mapped to consumer trivial reductions. + std::unordered_map p2c_alloc_map; + + // We want to replay producer as consumer instead of the other way around + // since consumer may have some broadcasted axes producer doesn't have + // merged into loops producer may use. If we did consumer as producer we + // wouldn't have this information in the mapping. + auto replay_PasC = BestEffortReplay::replayPasC( + producer_tv, + consumer_tv, + -1, + PairwiseRootDomainMap(producer_tv, consumer_tv)); + + // Grab consumer domain entries and reverse replay map. TODO: Maybe + // TransformReplay::replayPasC could return this map + for (auto id : consumer_tv->domain()->domain()) { + const auto& c2p_map = replay_PasC.getReplay(); + auto c2p_it = c2p_map.find(id); + if (c2p_it != c2p_map.end()) { + auto c_id = c2p_it->first; + auto p_id = c2p_it->second; + p2c_alloc_map[p_id] = c_id; + } + } + + return p2c_alloc_map; +} + +} // namespace + // Producer index for either shared or local memory -std::vector Index::getNonGlobalProducerStridedIndices( +std::vector Index::getNonGlobalProducerStridedIndices( TensorView* producer_tv, const TensorView* consumer_tv, const std::vector& loops) { const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); // Get a reference tensor replayed as existing loop structure - auto reference = IndexReferenceReplay::getReference(loops); + auto reference = IndexReferenceReplay::getReference(loops, consumer_tv); auto reference_domain = reference.domain; auto reference_id_map = reference.concrete_to_id; @@ -1500,57 +1514,38 @@ std::vector Index::getNonGlobalProducerStridedIndices( ir_utils::TVDomainGuard domain_guard( producer_tv, producer_replayed_as_consumer); - // This map has forwarded broadcast axes, it should only be used to compute - // the allocation position of the producer, and to figure out which producer - // indices are mapped to consumer trivial reductions. - std::unordered_map p2c_alloc_map; - { - // We want to play producer as consumer instead of the other way around - // since consumer may have some broadcasted axes producer doesn't have - // merged into loops producer may use. If we did consumer as producer we - // wouldn't have this information in the mapping. - auto replay_PasC = BestEffortReplay::replayPasC( - producer_tv, consumer_tv, -1, pairwise_map); - - auto c2p_map = replay_PasC.getReplay(); - - // Grab consumer domain entries and reverse replay map. TODO: Maybe - // TransformReplay::replayPasC could return this map - for (auto id : consumer_tv->domain()->domain()) { - auto c2p_it = c2p_map.find(id); - if (c2p_it != c2p_map.end()) { - auto c_id = c2p_it->first; - auto p_id = c2p_it->second; - p2c_alloc_map[p_id] = c_id; - } - } - } + const auto p2c_alloc_map = + mapAllProducerDomainsToConsumer(producer_tv, consumer_tv); + + kir::ForLoop* consumer_db_loop = + gpu_lower->doubleBufferInfo().getDoubleBufferLoop( + consumer_tv, loops, true); // Find allocation point of producer relative to loop nests. P2C map is // required because producer was replayed as consumer, so we can't use the // regular compute at maps to line up its iter domains with the for loops. 
- auto alloc_point = - loop_utils::getAllocPoint(producer_tv, loops, p2c_alloc_map, true); - std::unordered_map loop_to_ind_map; + auto alloc_info = + loop_utils::getAllocInformation(producer_tv, loops, p2c_alloc_map, true); + std::unordered_map loop_to_ind_map; std::unordered_set zero_loops; - std::tie(loop_to_ind_map, zero_loops) = - indexMapFromTV(producer_tv, loops, alloc_point, false); + std::tie(loop_to_ind_map, zero_loops) = indexMapFromTV( + producer_tv, loops, alloc_info.init_for_loop, false, consumer_db_loop); - ensureStaticIndexing(producer_tv, alloc_point.first, loops, p2c_alloc_map); + ensureStaticIndexing( + producer_tv, alloc_info.init_for_loop, loops, p2c_alloc_map); // Map loop nests to indicies, zeroing out those not used due to locality of // memory - std::unordered_map ref_id_to_ind_map; + std::unordered_map ref_id_to_ind_map; // Track which domains are not used - std::unordered_set ref_zero_domains; + std::unordered_set ref_zero_domains; // Due to rfactor/initialization reference_domain may be bigger than loop nest // structure, ignore IterDomains that aren't present in the loop nest when // indexing reference. TORCH_INTERNAL_ASSERT(loops.size() <= reference_domain->nDims()); for (const auto loop_i : c10::irange(loops.size())) { - auto ref_axis = gpu_lower->lowerValue(reference_domain->axis(loop_i)) - ->as(); + auto ref_axis = reference_domain->axis(loop_i); ref_id_to_ind_map[ref_axis] = loop_to_ind_map[loops[loop_i]]; if (zero_loops.count(loops[loop_i]) > 0) { ref_zero_domains.insert(ref_axis); @@ -1563,25 +1558,32 @@ std::vector Index::getNonGlobalProducerStridedIndices( // more conservative approach, which is to use the consumer as a proxy between // producer to reference. std::unordered_map index_map_ref_to_producer; + std::unordered_map c2p_index_map; + std::unordered_map p2c_index_map; { + // Map sent to best effort replay needs to match the exact incantation for + // compute_at_mode.cpp with MappingMode::Index + auto c2p_root_map = PairwiseRootDomainMap(producer_tv, consumer_tv, true) + .mapConsumerToProducer( + consumer_tv->domain(), producer_tv->domain()); + // This replay has to be consistent with compute at index map. 
BestEffortReplay replay_producer_as_consumer( producer_tv->domain()->domain(), consumer_tv->domain()->domain(), - pairwise_map.mapConsumerToProducer( - consumer_tv->domain(), producer_tv->domain())); + c2p_root_map); - const auto& c2p_map = replay_producer_as_consumer.getReplay(); + c2p_index_map = replay_producer_as_consumer.getReplay(); + p2c_index_map = invertOneToOneMap(c2p_index_map); std::unordered_map index_map_ref_to_consumer = - indexMapReferenceTo( - consumer_tv, gpu_lower->caIndexMap(), reference_id_map); + indexMapReferenceTo(consumer_tv, gpu_lower->caMap(), reference_id_map); for (auto entry : index_map_ref_to_consumer) { auto r_id = entry.first; auto c_id = entry.second; - auto c2p_it = c2p_map.find(c_id); - if (c2p_it != c2p_map.end()) { + auto c2p_it = c2p_index_map.find(c_id); + if (c2p_it != c2p_index_map.end()) { auto p_id = c2p_it->second; index_map_ref_to_producer[r_id] = p_id; } @@ -1637,10 +1639,17 @@ std::vector Index::getNonGlobalProducerStridedIndices( const auto reference_halo_extent_map = getReferenceHaloExtentMap(reference, index_map_ref_to_producer); + ContigIDs contig_finder( + producer_tv->domain()->domain(), + producer_tv->getMaybeRFactorDomain(), + producer_tv->domain()->contiguity(), + reference_id_map, + p2c_index_map); + auto producer_indexing = ref_compute.updateIndexCompute( producer_tv->domain(), index_map_ref_to_producer, - producer_tv->domain()->contiguity(), + contig_finder, reference_halo_extent_map); // Revert p_ids @@ -1677,8 +1686,7 @@ std::vector Index::getNonGlobalProducerStridedIndices( } // Already an entry for this root domain, continue - if (index_map.find(gpu_lower->lowerValue(root_id)->as()) != - index_map.end()) { + if (index_map.find(root_id) != index_map.end()) { continue; } @@ -1690,25 +1698,39 @@ std::vector Index::getNonGlobalProducerStridedIndices( } } - std::vector strided_inds(root_dom.size(), ir_builder.zeroVal()); + std::vector strided_inds( + root_dom.size(), GpuLower::current()->kernel()->zeroVal()); for (const auto i : c10::irange(root_dom.size())) { if (skip_indexing.count(root_dom[i])) { continue; } - auto kir_root_dom_i = - gpu_lower->lowerValue(root_dom[i])->as(); - TORCH_INTERNAL_ASSERT( - index_map.find(kir_root_dom_i) != index_map.end(), - "Couldn't find root mapping for TV", - producer_tv->name(), + index_map.find(root_dom[i]) != index_map.end(), + "Couldn't find root mapping for ", + producer_tv->toString(), " dim: ", i, " id: ", - kir::toString(kir_root_dom_i)); + root_dom[i]->toString(), + ", reference domain: ", + reference_domain->toString(), + ", reference root: ", + ir_utils::toString(reference_domain->getRootDomain())); + + auto root_ind_i = index_map.at(root_dom[i]); - auto root_ind_i = index_map.at(kir_root_dom_i); + // index hoist must be done before the adjustments for halo + root_ind_i = hoistProducerIndex( + root_dom[i], + producer_tv, + producer_indexing, + consumer_tv, + p2c_index_map, + reference.domain, + ref_compute, + loops, + root_ind_i); root_ind_i = getProducerIndexWithHalo(producer_tv, i, root_ind_i, consumer_tv); @@ -1729,66 +1751,85 @@ std::vector Index::getNonGlobalProducerStridedIndices( } // Compute striding for this index. 
- kir::Val* stride = nullptr; + Val* stride = nullptr; for (const auto j : c10::irange(i + 1, root_dom.size())) { if (skip_indexing.count(root_dom[j])) { continue; } - auto kir_root_dom_j = - gpu_lower->lowerValue(root_dom[j])->as(); - TORCH_INTERNAL_ASSERT( - index_map.find(kir_root_dom_j) != index_map.end(), - "Couldn't find root mapping for TV", - consumer_tv->name(), + index_map.find(root_dom[j]) != index_map.end(), + "Couldn't find root mapping for ", + producer_tv->name(), " dim: ", - i, + j, " id: ", - root_dom[i]); + root_dom[j]->toString(), + ", reference domain: ", + reference_domain->toString(), + ", reference root: ", + ir_utils::toString(reference_domain->getRootDomain())); - auto root_ext_j = extent_map.find(kir_root_dom_j) == extent_map.end() - ? kir_root_dom_j->extent() - : extent_map.at(kir_root_dom_j); + auto root_ext_j = extent_map.find(root_dom[j]) == extent_map.end() + ? root_dom[j]->extent() + : extent_map.at(root_dom[j]); root_ext_j = getHaloExtentOfRootAxis(root_dom[j], root_ext_j); - if (zero_domain_map.count(kir_root_dom_j) == 0) { + if (zero_domain_map.count(root_dom[j]) == 0) { if (stride == nullptr) { stride = root_ext_j; } else { - stride = ir_builder.mulExpr(stride, root_ext_j); + stride = SimplifyingIrBuilder::mulExpr(stride, root_ext_j); } } } if (stride != nullptr) { - strided_inds[i] = ir_builder.mulExpr(root_ind_i, stride); + strided_inds[i] = SimplifyingIrBuilder::mulExpr(root_ind_i, stride); } else { strided_inds[i] = root_ind_i; } } + if (producer_tv->isDoubleBuffered()) { + auto db_loop = gpu_lower->doubleBufferInfo().getDoubleBufferLoop( + producer_tv, loops, true); + if (db_loop != nullptr) { + auto loop_index = + db_loop->isTrivial() ? db_loop->start() : db_loop->index(); + auto db_switch_index = SimplifyingIrBuilder::modExpr( + loop_index, SimplifyingIrBuilder::create(2)); + auto original_alloc_size = + gpu_lower->doubleBufferInfo().getOriginalAllocSize(producer_tv); + auto db_strided_index = + SimplifyingIrBuilder::mulExpr(db_switch_index, original_alloc_size); + strided_inds.push_back(db_strided_index); + } + } + + // Save indexing info necessary for validating vectorization at launch time + fillProducerVectorizedContigRootDomains( + producer_tv, consumer_tv, c2p_index_map, contig_finder); + return strided_inds; } -std::vector Index::getGlobalConsumerStridedIndices( +std::vector Index::getGlobalConsumerStridedIndices( const TensorView* consumer_tv, const std::vector& loops) { FUSER_PERF_SCOPE("GpuLower::Lower::getGlobalConsumerIndex"); const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); // Get a reference tensor replayed as existing loop structure - auto reference = IndexReferenceReplay::getReference(loops); + auto reference = IndexReferenceReplay::getReference(loops, consumer_tv); auto reference_domain = reference.domain; auto reference_id_map = reference.concrete_to_id; // Map everything we can from reference to consumer using compute at index // map. std::unordered_map index_map_ref_to_consumer = - indexMapReferenceTo( - consumer_tv, gpu_lower->caIndexMap(), reference_id_map); + indexMapReferenceTo(consumer_tv, gpu_lower->caMap(), reference_id_map); // Index into the reference tensor. 
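For shared and local tensors, the stride of root index i above is the product of the (halo-adjusted) extents of the inner root domains that still contribute; zeroed, broadcast and trivially reduced domains are skipped. A plain-integer sketch of the resulting offset computation (all inputs hypothetical):

#include <vector>

long stridedOffset(const std::vector<long>& indices,
                   const std::vector<long>& extents,
                   const std::vector<bool>& contributes) {
  long offset = 0;
  for (size_t i = 0; i < indices.size(); ++i) {
    if (!contributes[i]) {
      continue;  // e.g. broadcast, trivial reduction, or zero-merged domain
    }
    long stride = 1;
    for (size_t j = i + 1; j < indices.size(); ++j) {
      if (contributes[j]) {
        stride *= extents[j];  // only inner contributing dims grow the stride
      }
    }
    offset += indices[i] * stride;
  }
  return offset;
}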
Reference indexing will handle vectorized // dims where index should be set to 0 @@ -1802,10 +1843,16 @@ std::vector Index::getGlobalConsumerStridedIndices( const auto reference_halo_extent_map = getReferenceHaloExtentMap(reference, index_map_ref_to_consumer); + ContigIDs contig_finder( + consumer_tv->domain()->domain(), + consumer_tv->getMaybeRFactorDomain(), + consumer_tv->domain()->contiguity(), + reference_id_map); + auto consumer_indexing = ref_compute.updateIndexCompute( consumer_tv->domain(), index_map_ref_to_consumer, - consumer_tv->domain()->contiguity(), + contig_finder, reference_halo_extent_map); // Indices should now be mapped onto IterDomains in consumer, so just grab @@ -1813,26 +1860,27 @@ std::vector Index::getGlobalConsumerStridedIndices( auto root_dom = consumer_tv->getMaybeRFactorDomain(); // TODO: Abstract stride logic to reuse with producer indexing - auto zero = ir_builder.zeroVal(); - std::vector strides(root_dom.size(), zero); + std::vector strides( + root_dom.size(), GpuLower::current()->kernel()->oneVal()); { int stride_i = 0; for (const auto i : c10::irange(root_dom.size())) { if (root_dom[i]->isReduction() || root_dom[i]->getIterType() == IterType::BroadcastWithoutStride || root_dom[i]->isStride()) { - strides[i] = zero; + strides[i] = GpuLower::current()->kernel()->oneVal(); continue; } std::stringstream ss; ss << "T" << consumer_tv->name() << ".stride[" << stride_i++ << "]"; - strides[i] = ir_builder.create(ss.str(), DataType::Int); + strides[i] = + SimplifyingIrBuilder::create(ss.str(), DataType::Int); } } TORCH_INTERNAL_ASSERT( root_dom.size() == consumer_tv->domain()->contiguity().size()); - kir::Val* cur_contig_stride = ir_builder.oneVal(); + Val* cur_contig_stride = GpuLower::current()->kernel()->oneVal(); for (const auto i : c10::irange(root_dom.size())) { auto dim = root_dom.size() - i - 1; if (root_dom[dim]->isReduction() || root_dom[dim]->isStride()) { @@ -1842,24 +1890,26 @@ std::vector Index::getGlobalConsumerStridedIndices( continue; } - kir::Val* root_ind = nullptr; - auto kir_root_dom = - gpu_lower->lowerValue(root_dom[dim])->as(); - if (consumer_indexing.indexMap().find(kir_root_dom) != + Val* root_ind = nullptr; + if (consumer_indexing.indexMap().find(root_dom[dim]) != consumer_indexing.indexMap().end()) { - root_ind = consumer_indexing.indexMap().at(kir_root_dom); + root_ind = consumer_indexing.indexMap().at(root_dom[dim]); } else if (root_dom[dim]->getIterType() == IterType::BroadcastWithStride) { - root_ind = zero; + root_ind = GpuLower::current()->kernel()->zeroVal(); } TORCH_INTERNAL_ASSERT( root_ind != nullptr, - "Couldn't find root mapping for TV", - consumer_tv->name(), + "Couldn't find root mapping for ", + consumer_tv->toString(), " dim: ", - i, + dim, " id: ", - root_dom[dim]); + root_dom[dim]->toString(), + ", reference domain: ", + reference_domain->toString(), + ", reference root: ", + ir_utils::toString(reference_domain->getRootDomain())); if (consumer_tv->domain()->contiguity()[dim]) { // If contig, used the stored stride which may be the previous @@ -1869,11 +1919,11 @@ std::vector Index::getGlobalConsumerStridedIndices( // by extent of this dimension auto root_dim_extent = getHaloExtentOfRootAxis(root_dom[dim]); cur_contig_stride = - ir_builder.mulExpr(cur_contig_stride, root_dim_extent); + SimplifyingIrBuilder::mulExpr(cur_contig_stride, root_dim_extent); } else { // If non contiguous dimension, keep local stride information, set cur // stride to local stride * local raw extent - cur_contig_stride = ir_builder.mulExpr( + 
cur_contig_stride = SimplifyingIrBuilder::mulExpr( strides[dim], getHaloExtentOfRootAxis(root_dom[dim])); } } @@ -1882,7 +1932,8 @@ std::vector Index::getGlobalConsumerStridedIndices( loops.empty() ? nullptr : loops.back()->vectorize_shift(); // Global striding - std::vector strided_inds(root_dom.size(), ir_builder.zeroVal()); + std::vector strided_inds( + root_dom.size(), GpuLower::current()->kernel()->zeroVal()); for (const auto i : c10::irange(root_dom.size())) { // See a comment in indexing to root domains in getGlobalProducerIndex. if (root_dom[i]->isReduction() || @@ -1893,71 +1944,87 @@ std::vector Index::getGlobalConsumerStridedIndices( continue; } - auto kir_root_dom_i = - gpu_lower->lowerValue(root_dom[i])->as(); - TORCH_INTERNAL_ASSERT( - consumer_indexing.indexMap().find(kir_root_dom_i) != + consumer_indexing.indexMap().find(root_dom[i]) != consumer_indexing.indexMap().end(), - "Couldn't find root mapping for TV", - consumer_tv->name(), + "Couldn't find root mapping for ", + consumer_tv->toString(), " dim: ", i, " id: ", - kir::toString(kir_root_dom_i)); + root_dom[i]->toString(), + ", reference domain: ", + reference_domain->toString(), + ", reference root: ", + ir_utils::toString(reference_domain->getRootDomain())); - auto root_ind = consumer_indexing.indexMap().at(kir_root_dom_i); + auto root_ind = consumer_indexing.indexMap().at(root_dom[i]); - root_ind = ir_builder.addExpr( - root_ind, getGlobalConsumerOffsetWithPartialSplit(kir_root_dom_i)); + // index hoist must be done before the adjustments for halo + root_ind = hoistConsumerIndex( + root_dom[i], + consumer_tv, + consumer_indexing, + reference.domain, + ref_compute, + loops, + root_ind); + + root_ind = SimplifyingIrBuilder::addExpr( + root_ind, getGlobalConsumerOffsetWithPartialSplit(root_dom[i])); if (root_ind->isZeroInt()) { continue; } else { - auto strided_ind = ir_builder.mulExpr(root_ind, strides[i]); + auto strided_ind = SimplifyingIrBuilder::mulExpr(root_ind, strides[i]); if (i == root_dom.size() - 1 && vectorize_shift != nullptr) { - strided_inds[i] = ir_builder.addExpr(strided_ind, vectorize_shift); + strided_inds[i] = + SimplifyingIrBuilder::addExpr(strided_ind, vectorize_shift); } else { strided_inds[i] = strided_ind; } } } + TORCH_INTERNAL_ASSERT( + strided_inds.size() == consumer_tv->getMaybeRFactorDomain().size()); + + fillConsumerVectorizedContigRootDomains(consumer_tv, contig_finder); + return strided_inds; } // Consumer index for either shared or local memory -std::vector Index::getNonGlobalConsumerStridedIndices( +std::vector Index::getNonGlobalConsumerStridedIndices( const TensorView* consumer_tv, const std::vector& loops) { const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); // Get a reference tensor replayed as existing loop structure - auto reference = IndexReferenceReplay::getReference(loops); + auto reference = IndexReferenceReplay::getReference(loops, consumer_tv); + auto reference_domain = reference.domain; auto reference_id_map = reference.concrete_to_id; - auto alloc_point = loop_utils::getAllocPoint(consumer_tv, loops); - std::unordered_map loop_to_ind_map; + auto alloc_info = loop_utils::getAllocInformation(consumer_tv, loops); + std::unordered_map loop_to_ind_map; std::unordered_set zero_loops; std::tie(loop_to_ind_map, zero_loops) = - indexMapFromTV(consumer_tv, loops, alloc_point, true); + indexMapFromTV(consumer_tv, loops, alloc_info.init_for_loop, true); - ensureStaticIndexing(consumer_tv, alloc_point.first, loops); + 
ensureStaticIndexing(consumer_tv, alloc_info.init_for_loop, loops); // Map loop nests to indicies, zeroing out those not used due to locality of // memory - std::unordered_map ref_id_to_ind_map; - std::unordered_set ref_zero_domains; + std::unordered_map ref_id_to_ind_map; + std::unordered_set ref_zero_domains; // Due to rfactor/initialization reference_domain may be bigger than loop nest // structure, ignore IterDomains that aren't present in the loop nest when // indexing reference. TORCH_INTERNAL_ASSERT(loops.size() <= reference_domain->nDims()); for (const auto loop_i : c10::irange(loops.size())) { - auto ref_axis = gpu_lower->lowerValue(reference_domain->axis(loop_i)) - ->as(); + auto ref_axis = reference_domain->axis(loop_i); ref_id_to_ind_map[ref_axis] = loop_to_ind_map[loops[loop_i]]; if (zero_loops.count(loops[loop_i]) > 0) { ref_zero_domains.insert(ref_axis); @@ -1967,8 +2034,7 @@ std::vector Index::getNonGlobalConsumerStridedIndices( // Map everything we can from reference to consumer using compute at index // map. std::unordered_map index_map_ref_to_consumer = - indexMapReferenceTo( - consumer_tv, gpu_lower->caIndexMap(), reference_id_map); + indexMapReferenceTo(consumer_tv, gpu_lower->caMap(), reference_id_map); // Grab roots that map into consumer and save them into the preferred roots // set for references indexing @@ -1999,11 +2065,17 @@ std::vector Index::getNonGlobalConsumerStridedIndices( const auto reference_halo_extent_map = getReferenceHaloExtentMap(reference, index_map_ref_to_consumer); + ContigIDs contig_finder( + consumer_tv->domain()->domain(), + consumer_tv->getMaybeRFactorDomain(), + consumer_tv->domain()->contiguity(), + reference_id_map); + // Index into consumer using reference indexing auto consumer_indexing = ref_compute.updateIndexCompute( consumer_tv->domain(), index_map_ref_to_consumer, - consumer_tv->domain()->contiguity(), + contig_finder, reference_halo_extent_map); IndexSwizzle index_swizzle( @@ -2022,7 +2094,8 @@ std::vector Index::getNonGlobalConsumerStridedIndices( // Indices should now be mapped onto IterDomains in consumer, so just grab // and use them. auto root_dom = consumer_tv->getMaybeRFactorDomain(); - std::vector strided_inds(root_dom.size(), ir_builder.zeroVal()); + std::vector strided_inds( + root_dom.size(), GpuLower::current()->kernel()->zeroVal()); for (const auto i : c10::irange(root_dom.size())) { if (root_dom[i]->isReduction() || root_dom[i]->isBroadcast() || gpu_lower->trivialReductionInfo().isDerived(root_dom[i]) || @@ -2030,25 +2103,36 @@ std::vector Index::getNonGlobalConsumerStridedIndices( continue; } - auto kir_root_dom_i = - gpu_lower->lowerValue(root_dom[i])->as(); - TORCH_INTERNAL_ASSERT( - index_map.find(kir_root_dom_i) != index_map.end(), - "Couldn't find root mapping for TV", - consumer_tv->name(), + index_map.find(root_dom[i]) != index_map.end(), + "Couldn't find root mapping for ", + consumer_tv->toString(), " dim: ", i, " id: ", - kir::toString(kir_root_dom_i)); + root_dom[i]->toString(), + ", reference domain: ", + reference_domain->toString(), + ", reference root: ", + ir_utils::toString(reference_domain->getRootDomain())); - const auto root_ind_i = index_map.at(kir_root_dom_i); + auto root_ind_i = index_map.at(root_dom[i]); if (root_ind_i->isZeroInt()) { continue; } + // index hoist must be done before the adjustments for halo + root_ind_i = hoistConsumerIndex( + root_dom[i], + consumer_tv, + consumer_indexing, + reference.domain, + ref_compute, + loops, + root_ind_i); + // Compute striding for this index. 
- kir::Val* stride = nullptr; + Val* stride = nullptr; for (const auto j : c10::irange(i + 1, root_dom.size())) { if (root_dom[j]->isBroadcast() || root_dom[j]->isReduction() || gpu_lower->trivialReductionInfo().isDerived(root_dom[j]) || @@ -2056,57 +2140,81 @@ std::vector Index::getNonGlobalConsumerStridedIndices( continue; } - auto kir_root_dom_j = - gpu_lower->lowerValue(root_dom[j])->as(); - TORCH_INTERNAL_ASSERT( - index_map.find(kir_root_dom_j) != index_map.end(), - "Couldn't find root mapping for TV", - consumer_tv->name(), + index_map.find(root_dom[j]) != index_map.end(), + "Couldn't find root mapping for ", + consumer_tv->toString(), " dim: ", - i, + j, " id: ", - root_dom[i]); + root_dom[j]->toString(), + ", reference domain: ", + reference_domain->toString(), + ", reference root: ", + ir_utils::toString(reference_domain->getRootDomain())); - auto root_ext_j = extent_map.find(kir_root_dom_j) == extent_map.end() - ? kir_root_dom_j->extent() - : extent_map.at(kir_root_dom_j); + auto root_ext_j = extent_map.find(root_dom[j]) == extent_map.end() + ? root_dom[j]->extent() + : extent_map.at(root_dom[j]); root_ext_j = getHaloExtentOfRootAxis(root_dom[j], root_ext_j); - if (zero_domain_map.count(kir_root_dom_j) == 0) { + if (zero_domain_map.count(root_dom[j]) == 0) { if (stride == nullptr) { stride = root_ext_j; } else { - stride = ir_builder.mulExpr(stride, root_ext_j); + stride = SimplifyingIrBuilder::mulExpr(stride, root_ext_j); } } } if (stride != nullptr) { - strided_inds[i] = ir_builder.mulExpr(root_ind_i, stride); + strided_inds[i] = SimplifyingIrBuilder::mulExpr(root_ind_i, stride); } else { strided_inds[i] = root_ind_i; } } + // This check was originally done in getConsumerStridedIndices, but + // the number of strided index values depends on the loop where the + // consumer tensor is located. If it's double buffered and not in + // the prologue loop, strided_inds ends up having one more + // index, so it's just much simpler to check here before adding the + // additional index for double buffering. 
+ TORCH_INTERNAL_ASSERT( + strided_inds.size() == consumer_tv->getMaybeRFactorDomain().size()); + + if (consumer_tv->isDoubleBuffered()) { + auto db_loop = gpu_lower->doubleBufferInfo().getDoubleBufferLoop( + consumer_tv, loops, true); + if (db_loop != nullptr) { + auto db_switch_index = SimplifyingIrBuilder::subExpr( + gpu_lower->kernel()->oneVal(), + SimplifyingIrBuilder::modExpr( + db_loop->index(), SimplifyingIrBuilder::create(2))); + auto original_alloc_size = + gpu_lower->doubleBufferInfo().getOriginalAllocSize(consumer_tv); + auto db_strided_index = + SimplifyingIrBuilder::mulExpr(db_switch_index, original_alloc_size); + strided_inds.push_back(db_strided_index); + } + } + return strided_inds; } -std::vector Index::getProducerStridedIndices( +std::vector Index::getProducerStridedIndices( TensorView* producer, const TensorView* consumer, const std::vector& loops) { FUSER_PERF_SCOPE("GpuLower::Lower::Index::getProducerStridedIndices"); - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - if (producer->domain()->noReductions().size() == 0) { - return std::vector( - producer->getMaybeRFactorDomain().size(), ir_builder.zeroVal()); + return std::vector( + producer->getMaybeRFactorDomain().size(), + GpuLower::current()->kernel()->zeroVal()); } - std::vector strided_indices; + std::vector strided_indices; if (producer->getMemoryType() == MemoryType::Global) { strided_indices = getGlobalProducerStridedIndices(producer, consumer, loops); @@ -2116,7 +2224,9 @@ std::vector Index::getProducerStridedIndices( } TORCH_INTERNAL_ASSERT( - strided_indices.size() == producer->getMaybeRFactorDomain().size()); + strided_indices.size() == + producer->getMaybeRFactorDomain().size() + + (producer->isDoubleBuffered() ? 1 : 0)); return strided_indices; } @@ -2126,35 +2236,28 @@ kir::TensorIndex* Index::getProducerIndex( TensorView* producer, const TensorView* consumer, const std::vector& loops) { - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - auto strided_indices = getProducerStridedIndices(producer, consumer, loops); - return ir_builder.create(producer, strided_indices); + return SimplifyingIrBuilder::create( + producer, strided_indices); } -std::vector Index::getConsumerStridedIndices( +std::vector Index::getConsumerStridedIndices( const TensorView* consumer, const std::vector& loops) { FUSER_PERF_SCOPE("GpuLower::Lower::Index::getConsumerStridedIndices"); - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - if (consumer->domain()->noReductions().size() == 0) { - return std::vector( - consumer->getMaybeRFactorDomain().size(), ir_builder.zeroVal()); + return std::vector( + consumer->getMaybeRFactorDomain().size(), + GpuLower::current()->kernel()->zeroVal()); } - std::vector strided_indices; + std::vector strided_indices; if (consumer->getMemoryType() == MemoryType::Global) { strided_indices = getGlobalConsumerStridedIndices(consumer, loops); } else { strided_indices = getNonGlobalConsumerStridedIndices(consumer, loops); } - TORCH_INTERNAL_ASSERT( - strided_indices.size() == consumer->getMaybeRFactorDomain().size()); - return strided_indices; } @@ -2162,11 +2265,9 @@ std::vector Index::getConsumerStridedIndices( kir::TensorIndex* Index::getConsumerIndex( const TensorView* consumer, const std::vector& loops) { - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - auto strided_indices = getConsumerStridedIndices(consumer, loops); 
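The consumer-side switch above is 1 - (index % 2), the complement of the producer-side index % 2 used earlier, so within one iteration the read and the write land in different halves of the doubled allocation. A small plain-integer check of that invariant (allocation size and trip count hypothetical):

#include <cassert>

int main() {
  const long original_alloc_size = 256;  // one stage of the doubled buffer
  for (long i = 0; i < 8; ++i) {
    const long read_offset = (i % 2) * original_alloc_size;          // producer-side switch
    const long write_offset = (1 - (i % 2)) * original_alloc_size;   // consumer-side switch
    assert(read_offset != write_offset);  // the two halves never alias in one iteration
  }
  return 0;
}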
- return ir_builder.create(consumer, strided_indices); + return SimplifyingIrBuilder::create( + consumer, strided_indices); } namespace { @@ -2184,37 +2285,23 @@ struct PredicateDomainInfo { bool is_non_divisible_split = false; }; -// Find iteration domains in the history of reference comprised only of -// merge operations. Only return iteration domains that are subsequently fed -// into a split, or are in the provided domain. In other words, we don't want to -// return every IterDomain that's contiguous, just the one closest to the -// leaves. Predicates are not associated with physical memory so we can treat -// all of them as contiguous merges. +// Find iteration domains in the history of a consumer to predicate comprised +// only of merge operations. Only return iteration domains that are subsequently +// fed into a split, or are in the provided domain. In other words, we don't +// want to return every IterDomain that's contiguous, just the one closest to +// the leaves. Predicates are not associated with physical memory so we can +// treat all of them as contiguous merges. +// +// TODO: This seems to have a large overlap with ContigIDs. Consider +// refactoring. std::vector getPredicateContigIds( - const ReferenceTensor& reference, TensorView* consumer_tv, - const std::unordered_map& ref_2_consumer) { + const std::unordered_map& consumer_index_map) { const auto gpu_lower = GpuLower::current(); - std::vector reference_predicated_root_domain; - for (const auto consumer_root : consumer_tv->getRootDomain()) { - if (consumer_root->isBroadcast()) { - continue; - } - auto consumer_root_concrete = - gpu_lower->caIndexMap().getConcreteMappedID(consumer_root); - auto it = reference.concrete_to_id.find(consumer_root_concrete); - // When initializing a reduction buffer, the reduction axis - // doesn't have a loop, so the reference tensor doesn't have a - // mapped domain. The reduction axis can be safely ignored. - if (it == reference.concrete_to_id.end()) { - continue; - } - auto reference_root = it->second; - reference_predicated_root_domain.emplace_back(reference_root); - } + const auto& consumer_root_domain = consumer_tv->getRootDomain(); - std::vector contiguous_ids = reference_predicated_root_domain; + std::vector contiguous_ids = consumer_root_domain; if (contiguous_ids.empty()) { return std::vector(); @@ -2227,20 +2314,25 @@ std::vector getPredicateContigIds( // about halo to do correct predication, so they must be excluded. std::unordered_set excluded_ids; - for (auto reference_predicated_id : reference_predicated_root_domain) { - if (GpuLower::current() - ->haloInfo() - .getRootAxisInfo(reference_predicated_id) - .hasHalo()) { + for (auto consumer_root_id : consumer_root_domain) { + if (gpu_lower->haloInfo().getRootAxisInfo(consumer_root_id).hasHalo()) { + excluded_ids.insert(consumer_root_id); continue; } - auto it = ref_2_consumer.find(reference_predicated_id); - if (it == ref_2_consumer.end()) { + if (consumer_root_id->maybePartial()) { + excluded_ids.insert(consumer_root_id); continue; } - auto consumer_root_id = it->second; - if (consumer_root_id->maybePartial()) { - excluded_ids.insert(reference_predicated_id); + // When consumer_root_id is a broadcast domain, do not allow contig + // predication as the merged output is not mapped with the + // reference unless the concrete domain is also a broadcast + // domain. 
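getPredicateContigIds above treats root domains as contiguous for predication and looks for merge-only outputs closest to the leaves, because a single bound on a merged domain already implies the per-root bounds. A plain-integer check of that implication (extents hypothetical):

#include <cassert>

int main() {
  const int e0 = 3, e1 = 5;
  for (int i = 0; i < e0 * e1; ++i) {    // the single merged predicate
    assert(i / e1 >= 0 && i / e1 < e0);  // outer root stays in range
    assert(i % e1 >= 0 && i % e1 < e1);  // inner root stays in range
  }
  return 0;
}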
+ if (consumer_root_id->isBroadcast() && + !GpuLower::current() + ->caMap() + ->getConcreteMappedID(consumer_root_id, IdMappingMode::PERMISSIVE) + ->isBroadcast()) { + excluded_ids.insert(consumer_root_id); continue; } // Shifted or gathered axes need to be predicated at the root domain @@ -2252,15 +2344,16 @@ std::vector getPredicateContigIds( auto consumer_root_pos = consumer_tv->domain()->rootPosOf(consumer_root_id); if ((shift_expr && shift_expr->offset(consumer_root_pos) != 0) || (gather_expr && consumer_root_pos < gather_expr->windowShape().size() && - !gather_expr->windowShape().at(consumer_root_pos)->isOneInt())) { - excluded_ids.insert(reference_predicated_id); + gather_expr->windowShape().at(consumer_root_pos) != 1)) { + excluded_ids.insert(consumer_root_id); } } // Run through iteration domain history - auto exprs = ExprSort::getExprs( + auto exprs = StmtSort::getExprs( consumer_tv->fusion(), - {reference.domain->domain().begin(), reference.domain->domain().end()}); + {consumer_tv->domain()->domain().begin(), + consumer_tv->domain()->domain().end()}); for (auto expr : exprs) { // If not a merge, output is not contiguous @@ -2276,6 +2369,13 @@ std::vector getPredicateContigIds( continue; } + // Do not try to predicate the merge output domain if the output + // domain has not a predicate that is mapped from the reference. + // See FusionContigPredicate_CUDA for a concrete example. + if (consumer_index_map.find(merge->out()) == consumer_index_map.end()) { + continue; + } + if (inner_contig_it != contiguous_ids.end() && outer_contig_it != contiguous_ids.end()) { // If inner and outer are contiguous, out must be contiguous. Remove @@ -2296,8 +2396,7 @@ std::vector getPredicateContigIds( // reference_predicated_root_domain. auto contig_root_vals = IterVisitor::getInputsTo( {contig_id}, - {reference_predicated_root_domain.begin(), - reference_predicated_root_domain.end()}); + {consumer_root_domain.begin(), consumer_root_domain.end()}); auto contig_root_ids = ir_utils::filterByType(contig_root_vals); PredicateDomainInfo contig_id_info; contig_id_info.id = contig_id; @@ -2312,8 +2411,8 @@ IterDomain* getMappedReferenceDomain( IterDomain* id, const ReferenceTensor& reference) { // Partially overlaps with getPredicateContigIds() - const auto gpu_lower = GpuLower::current(); - auto concrete_id = gpu_lower->caIndexMap().getConcreteMappedID(id); + auto concrete_id = GpuLower::current()->caMap()->getConcreteMappedID( + id, IdMappingMode::EXACT); auto it = reference.concrete_to_id.find(concrete_id); if (it == reference.concrete_to_id.end()) { return nullptr; @@ -2321,9 +2420,8 @@ IterDomain* getMappedReferenceDomain( return it->second; } -std::vector getNonDivisibleReferenceDomainsToPredicate( - TensorView* consumer_tv, - const ReferenceTensor& reference) { +std::vector getNonDivisibleConsumerDomainsToPredicate( + TensorView* consumer_tv) { const auto& non_divisible_split_info = GpuLower::current()->nonDivisibleSplitInfo(); @@ -2337,11 +2435,7 @@ std::vector getNonDivisibleReferenceDomainsToPredicate( const auto& splits_to_predicate = it->second; for (auto split : splits_to_predicate) { - auto ref_id = getMappedReferenceDomain(split->in(), reference); - if (ref_id == nullptr) { - continue; - } - PredicateDomainInfo info{ref_id, {ref_id}, true}; + PredicateDomainInfo info{split->in(), {split->in()}, true}; pred_info_vec.emplace_back(info); } @@ -2352,9 +2446,8 @@ bool needsPadding(TensorView* tv) { auto shift_expr = dynamic_cast(tv->definition()); auto gather_expr = 
dynamic_cast(tv->definition()); - // Padding is only necessary for padded shift and - // gather - return (shift_expr != nullptr && shift_expr->pad()) || gather_expr != nullptr; + return (shift_expr != nullptr && shift_expr->hasPadding()) || + (gather_expr != nullptr && gather_expr->hasPadding()); } // Get an additional offset of a stop index when building a predicate @@ -2364,11 +2457,10 @@ bool needsPadding(TensorView* tv) { // compared with each other by just looking at the additional offsets. // // consumer_root_id: the domain for which a stop predicate is being built. -kir::Val* getUnswitchStopOffset( +int getUnswitchStopOffset( IterDomain* consumer_root_id, TensorView* consumer_tv) { const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); AxisHaloInfo halo_info = gpu_lower->haloInfo().getRootAxisInfo(consumer_root_id); @@ -2376,7 +2468,7 @@ kir::Val* getUnswitchStopOffset( // If the consumer root domain to predicate does not have halo, no // adjustment is required. if (!halo_info.hasHalo()) { - return ir_builder.zeroVal(); + return 0; } // Find if this contig_id is used in the unswitched domains @@ -2400,22 +2492,14 @@ kir::Val* getUnswitchStopOffset( })) { return halo_info.width(); } else { - return ir_builder.zeroVal(); + return 0; } } -// Get offsets for the start and stop predicates. Similar to the -// gather case, but it's a little simpler as it does not (yet) -// dynamic shifting. -void adjustStartAndStopOffsetsForShift( - std::vector& start_offsets, - std::vector& stop_offsets, +std::pair getStartAndStopOffsetsForShift( TensorView* consumer_tv, IterDomain* consumer_id, bool padding_predicate) { - const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); - TORCH_INTERNAL_ASSERT(consumer_id != nullptr); auto shift_expr = dynamic_cast(consumer_tv->definition()); @@ -2423,105 +2507,124 @@ void adjustStartAndStopOffsetsForShift( // Adjustment is not necessary if not shift. // Even so, padding predicate does not need any adjustment. if (shift_expr == nullptr || padding_predicate) { - return; + return { + GpuLower::current()->kernel()->zeroVal(), + GpuLower::current()->kernel()->zeroVal()}; } const auto root_axis_pos = consumer_tv->domain()->rootPosOf(consumer_id); - // Assume this adjustment is done first, so start and stop offsets - // just contain zeroVal. - TORCH_INTERNAL_ASSERT( - start_offsets.size() == 1 && start_offsets[0]->isZeroInt() && - stop_offsets.size() == 1 && stop_offsets[0]->isZeroInt()); - start_offsets.clear(); - stop_offsets.clear(); - - // The consumer offset is zero. - auto consumer_offset = 0; - // The producer offset is based off the consumer offset. - auto producer_offset = 0; - - // When the shift operation is not padded, the start and stop positions of the - // consumer axis, i.e., consumer_id->start and - // consumer_id->stop_ofset, are adjusted accordingly, which includes - // the effect of the shift offset, so using the consumer offset is - // sufficient as the only predicate is sufficient. - - if (shift_expr->pad()) { - // Positive shift offset means shifting the input tensor to the - // positive direction, so the producer offset becomes negative. - auto shift_offset = shift_expr->offset(root_axis_pos); - producer_offset = -shift_offset; - } - - // Since shift doesn't allow dynamic offsets, we can statically - // choose more restrictive offsets between the producer and consumer - // offsets. 
The start predicate uses greater-than, so using the - // smaller offset is sufficient. Similarly, for the stop predicate, - // using the larger offset is sufficient. - auto start_offset = std::min(consumer_offset, producer_offset); - auto stop_offset = std::max(consumer_offset, producer_offset); - - start_offsets.push_back(ir_builder.create(start_offset)); - stop_offsets.push_back(ir_builder.create(stop_offset)); + // The first or last N elements, where N is the padding width, + // correspond to the padding predicate. + + const auto shift_offset = shift_expr->offset(root_axis_pos); + const auto pad_width = shift_expr->padWidth().at(root_axis_pos); + + int start_offset = 0; + int stop_offset = 0; + + if (shift_offset > 0) { + start_offset = -pad_width; + } else if (shift_offset < 0) { + stop_offset = pad_width; + } + + return { + SimplifyingIrBuilder::create(start_offset), + SimplifyingIrBuilder::create(stop_offset)}; } -// Get offsets for the start and stop predicates. There can be two -// offsets because the shift offset is determined by a loop index. -void adjustStartAndStopOffsetsForGather( - std::vector& start_offsets, - std::vector& stop_offsets, +std::pair getStartAndStopOffsetsForGather( TensorView* consumer_tv, IterDomain* consumer_id, - const ReferenceTensor& reference, - const std::unordered_map& ref_start_index_map, - const std::unordered_map& ref_stop_index_map, + const std::unordered_map& ref_start_index_map, + const std::unordered_map& ref_stop_index_map, bool padding_predicate) { - const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); - TORCH_INTERNAL_ASSERT(consumer_id != nullptr); // Adjustment is not necessary if not gather. Even so, padding // predicate does not need any adjustment. if (!consumer_tv->definition()->isA() || padding_predicate) { - return; + return { + GpuLower::current()->kernel()->zeroVal(), + GpuLower::current()->kernel()->zeroVal()}; } const auto root_axis_pos = consumer_tv->domain()->rootPosOf(consumer_id); - // Assume this adjustment is done first, so start and stop offsets - // just contain zeroVal. - TORCH_INTERNAL_ASSERT( - start_offsets.size() == 1 && start_offsets[0]->isZeroInt() && - stop_offsets.size() == 1 && stop_offsets[0]->isZeroInt()); - start_offsets.clear(); - stop_offsets.clear(); - auto producer_start_offset = getProducerOffsetWithGather( - root_axis_pos, - consumer_tv, - reference.concrete_to_id, - ref_start_index_map); + root_axis_pos, consumer_tv, ref_start_index_map); auto producer_stop_offset = getProducerOffsetWithGather( - root_axis_pos, consumer_tv, reference.concrete_to_id, ref_stop_index_map); + root_axis_pos, consumer_tv, ref_stop_index_map); + + auto consumer_start_offset = GpuLower::current()->kernel()->zeroVal(); + auto consumer_stop_offset = GpuLower::current()->kernel()->zeroVal(); - // The producer and consumer accesses must be predicated as it is - // not statically determined which is more restrictive. + if (producer_start_offset->isZeroInt() && producer_stop_offset->isZeroInt()) { + return {consumer_start_offset, consumer_stop_offset}; + } + + Val* start_offset = nullptr; + Val* stop_offset = nullptr; - // Consumer offsets are just zero. - start_offsets.push_back(ir_builder.zeroVal()); - stop_offsets.push_back(ir_builder.zeroVal()); + // In the normal case, take the minimum of the start and the + // maximum of the stop offsets. If there's no padding, the producer + // offset must be always larger than the consumer + // offset. 
So, the consumer and producer offsets can always be used + // for the start and stop offsets, respectively. + const auto pad_left = + consumer_tv->definition()->as()->padWidth()[root_axis_pos][0]; + const auto pad_right = + consumer_tv->definition()->as()->padWidth()[root_axis_pos][1]; + const auto window_size = + consumer_tv->definition()->as()->windowShape()[root_axis_pos]; - // Adds producer offsets if they are not zero. - if (!producer_start_offset->isZeroInt()) { - start_offsets.push_back(producer_start_offset); + // consumer index: index + // producer index: index + window_index - pad_left + // + // consumer extent: ext + // producer extent: ext + window_size - 1 - pad_left - pad_right + // + // consumer stop pred: index < ext + // producer stop pred: index + window_index - pad_left < ext + window_size - 1 + // - pad_left - pad_right + // -> index + window_index - pad_left - (window_size - 1 - + // pad_left - pad_right) < ext + // -> index + window_index - (window_size - 1 - pad_right) < + // ext + // + // consumer start pred: index >= 0 + // producer start pred: index + window_index - pad_left >= 0 + + const auto producer_ext_adj = window_size - 1 - pad_left - pad_right; + producer_stop_offset = SimplifyingIrBuilder::subExpr( + producer_stop_offset, + SimplifyingIrBuilder::create(producer_ext_adj)); + + // As commented above, when pad_left is zero, the consumer predicate + // is always more restrictive than the producer predicate. + if (pad_left == 0) { + start_offset = consumer_start_offset; + } else { + start_offset = SimplifyingIrBuilder::minExpr( + consumer_start_offset, producer_start_offset); } - if (!producer_stop_offset->isZeroInt()) { - stop_offsets.push_back(producer_stop_offset); + // As commented above, when pad_right is zero, the consumer + // predicate is always more restrictive than the producer + // predicate. + if (pad_right == 0) { + stop_offset = consumer_stop_offset; + } else { + stop_offset = SimplifyingIrBuilder::maxExpr( + consumer_stop_offset, producer_stop_offset); } + + TORCH_INTERNAL_ASSERT(start_offset != nullptr); + TORCH_INTERNAL_ASSERT(stop_offset != nullptr); + + return {start_offset, stop_offset}; } // Get the start and stop limit offsets that define the valid range to @@ -2530,18 +2633,16 @@ void adjustStartAndStopOffsetsForGather( // stop that's different from extent. Also, when IterDomain has halo, // the actual offsets of the logical start and stop positions are // shifted.
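The derivation in the comment above can be traced with plain integers. The sketch below is only an illustration: window_index stands in for the loop index that the real lowering keeps as an IR expression, and the min/max selection mirrors the pad_left/pad_right special cases rather than calling any nvfuser API.

#include <algorithm>
#include <iostream>

struct GatherPredOffsets {
  int start_offset;
  int stop_offset;
};

GatherPredOffsets gatherOffsets(
    int window_index, int window_size, int pad_left, int pad_right) {
  // Producer offsets relative to the consumer index.
  const int producer_start_offset = window_index - pad_left;
  const int producer_ext_adj = window_size - 1 - pad_left - pad_right;
  const int producer_stop_offset = producer_start_offset - producer_ext_adj;
  // Consumer offsets are zero. When a side has no padding, the consumer
  // predicate is already the more restrictive one on that side.
  const int start_offset =
      pad_left == 0 ? 0 : std::min(0, producer_start_offset);
  const int stop_offset =
      pad_right == 0 ? 0 : std::max(0, producer_stop_offset);
  return {start_offset, stop_offset};
}

int main() {
  // 3-wide window, one element of padding on each side, first window position.
  const auto off = gatherOffsets(/*window_index=*/0, /*window_size=*/3,
                                 /*pad_left=*/1, /*pad_right=*/1);
  // Final predicates have the form:
  //   index + start_offset >= 0 && index + stop_offset < extent
  std::cout << "start_offset=" << off.start_offset
            << " stop_offset=" << off.stop_offset << "\n";
  return 0;
}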
-std::pair getStartAndStopLimitOffsets( +std::pair getStartAndStopLimitOffsets( IterDomain* consumer_id, bool padding_predicate, bool non_divisible_pred) { const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); TORCH_INTERNAL_ASSERT(consumer_id != nullptr); - kir::Val* start_limit = gpu_lower->lowerValue(consumer_id->start()); - kir::Val* stop_limit = - ir_builder.negExpr(gpu_lower->lowerValue(consumer_id->stopOffset())); + Val* start_limit = consumer_id->start(); + Val* stop_limit = SimplifyingIrBuilder::negExpr(consumer_id->stopOffset()); if (!non_divisible_pred) { AxisHaloInfo halo_info = gpu_lower->haloInfo().getRootAxisInfo(consumer_id); @@ -2554,12 +2655,14 @@ std::pair getStartAndStopLimitOffsets( // [0, left halo)[start_limit, stop_limit)[0, right halo) // if (!padding_predicate) { - start_limit = ir_builder.addExpr(start_limit, halo_info.width(0)); - stop_limit = ir_builder.addExpr(stop_limit, halo_info.width(0)); + start_limit = + SimplifyingIrBuilder::addExpr(start_limit, halo_info.width(0)); + stop_limit = + SimplifyingIrBuilder::addExpr(stop_limit, halo_info.width(0)); } else { // In case of the padding predicate, the whole range, including both left // and right halo regions, is computed. - stop_limit = ir_builder.addExpr(stop_limit, halo_info.width()); + stop_limit = SimplifyingIrBuilder::addExpr(stop_limit, halo_info.width()); } } else { // For non-divisible predicates, the index must be predicated such @@ -2568,28 +2671,26 @@ std::pair getStartAndStopLimitOffsets( // isn't a root domain. if (gpu_lower->haloInfo().hasHaloWidth(consumer_id)) { auto halo = gpu_lower->haloInfo().getHaloWidth(consumer_id); - stop_limit = ir_builder.addExpr(stop_limit, halo); + stop_limit = SimplifyingIrBuilder::addExpr(stop_limit, halo); } } return {start_limit, stop_limit}; } -// Return an index map for a predicate reference tensor. Two different +// Return an IndexCompute for a predicate reference tensor. Two different // maps are used when generating predicates for unswitched expressions // as start and stop conditions need to use different loop-to-index // mappings. -std::unordered_map getPredicateReferenceIndexing( +auto getPredicateReferenceIndexing( const std::vector& loops, const ReferenceTensor& reference, kir::ForLoop* unswitch_or_vec_loop, + IterDomain* double_buffer_axis, bool start) { - const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); - auto reference_domain = reference.domain; - std::unordered_map loop_to_ind_map; + std::unordered_map loop_to_ind_map; std::transform( loops.begin(), @@ -2606,7 +2707,7 @@ std::unordered_map getPredicateReferenceIndexing( // vectorized loop should be like this. 
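For the unswitch and vectorize cases handled next, the loop variable cannot appear in the predicate, so the most conservative index is substituted: zero for the start predicate and the last possible value for the stop predicate (extent minus one, or the parallel dimension minus one for thread-parallel loops). A minimal sketch of that selection, with an invented Loop descriptor rather than kir::ForLoop, might look like this.

#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

// Hypothetical, simplified loop descriptor.
struct Loop {
  std::string name;
  int extent = 0;
  bool thread_parallel = false;
  int parallel_dim = 0;  // launch-time dimension for thread-parallel loops
};

// Pick the index to use for each unswitched loop when building either the
// start (lower-bound) or stop (upper-bound) predicate.
std::unordered_map<std::string, int> unswitchIndices(
    const std::vector<Loop>& loops, bool for_start_predicate) {
  std::unordered_map<std::string, int> loop_to_ind;
  for (const auto& loop : loops) {
    if (for_start_predicate) {
      loop_to_ind[loop.name] = 0;
    } else if (loop.thread_parallel) {
      // The whole parallel dimension participates, so use its last index.
      loop_to_ind[loop.name] = loop.parallel_dim - 1;
    } else {
      loop_to_ind[loop.name] = loop.extent - 1;
    }
  }
  return loop_to_ind;
}

int main() {
  const std::vector<Loop> loops = {{"i", 16, false, 0}, {"tx", 8, true, 32}};
  for (const auto& kv : unswitchIndices(loops, /*for_start_predicate=*/false)) {
    std::cout << kv.first << " -> " << kv.second << "\n";
  }
  return 0;
}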
bool vectorized_pred = - unswitch_or_vec_loop->iter_domain()->parallelType() == + unswitch_or_vec_loop->iter_domain()->getParallelType() == ParallelType::Vectorize; TORCH_INTERNAL_ASSERT( @@ -2614,12 +2715,11 @@ std::unordered_map getPredicateReferenceIndexing( "Invalid reference generated."); bool within_unswitch = false; - const auto one = ir_builder.oneVal(); for (const auto loop_i : c10::irange(loops.size())) { auto loop = loops[loop_i]; auto loop_id = loop->iter_domain(); - auto loop_pt = loop_id->parallelType(); + auto loop_pt = loop_id->getParallelType(); auto ref_id = reference_domain->axis(loop_i); if (loop == unswitch_or_vec_loop) { @@ -2668,20 +2768,21 @@ std::unordered_map getPredicateReferenceIndexing( if (loop->stop() == loop_id->extent()) { loop_to_ind_map[loop] = loop->start(); } else if (start) { - loop_to_ind_map[loop] = ir_builder.zeroVal(); + loop_to_ind_map[loop] = GpuLower::current()->kernel()->zeroVal(); } else { // Note that the parallel dimension is used rather than // loop-stop(). See the above comment. - loop_to_ind_map[loop] = ir_builder.subExpr( - gpu_lower->parallelDimensionMap().get(loop_pt), - ir_builder.create(1)); + loop_to_ind_map[loop] = SimplifyingIrBuilder::subExpr( + GpuLower::current()->parallelDimensionMap().get(loop_pt), + GpuLower::current()->kernel()->oneVal()); } } else if (start) { - loop_to_ind_map[loop] = ir_builder.zeroVal(); + loop_to_ind_map[loop] = GpuLower::current()->kernel()->zeroVal(); } else { // Similar to the above, loop_id()->extent() is // used here instead of loop->stop(). See the above comment. - loop_to_ind_map[loop] = ir_builder.subExpr(loop_id->extent(), one); + loop_to_ind_map[loop] = SimplifyingIrBuilder::subExpr( + loop_id->extent(), GpuLower::current()->kernel()->oneVal()); } } @@ -2693,9 +2794,36 @@ std::unordered_map getPredicateReferenceIndexing( } } + for (const auto loop : loops) { + auto& idx = loop_to_ind_map.at(loop); + // If the loop is trivial, the loop index can only be the loop + // start value. + if (idx == loop->index() && loop->isTrivial()) { + idx = loop->start(); + } + } + + if (double_buffer_axis != nullptr) { + auto db_loop = GpuLower::current()->doubleBufferInfo().getDoubleBufferLoop( + double_buffer_axis, loops, true); + if (db_loop != nullptr) { + auto loop_to_ind_map_it = loop_to_ind_map.find(db_loop); + TORCH_INTERNAL_ASSERT(loop_to_ind_map_it != loop_to_ind_map.end()); + auto cur_index = loop_to_ind_map_it->second; + // if cur_index is not the same as the index of db_loop, it must + // be true that that index has been modified to support + // unswitch. In that case, it is not necessary to move ahead the + // index for double buffering.
+ if (cur_index == db_loop->index()) { + loop_to_ind_map[db_loop] = SimplifyingIrBuilder::addExpr( + cur_index, GpuLower::current()->kernel()->oneVal()); + } + } + } + // Add magic zero to a loop pretty far inside in indexing - kir::IterDomain* magic_zero_loop = nullptr; - std::unordered_map ref_id_to_ind_map; + IterDomain* magic_zero_loop = nullptr; + std::unordered_map ref_id_to_ind_map; // Due to rfactor/initialization reference_domain may be bigger than loop nest // structure TORCH_INTERNAL_ASSERT(loops.size() <= reference_domain->nDims()); @@ -2703,19 +2831,19 @@ std::unordered_map getPredicateReferenceIndexing( auto loop = loops[loop_i]; auto ind = loop_to_ind_map[loops[loop_i]]; auto ref_axis = reference_domain->axis(loop_i); - auto kir_ref_axis = gpu_lower->lowerValue(ref_axis)->as(); if (Index::protectWithMagicZero(loop, ref_axis, ind)) { - magic_zero_loop = kir_ref_axis; + magic_zero_loop = ref_axis; } - ref_id_to_ind_map[kir_ref_axis] = loop_to_ind_map[loop]; + ref_id_to_ind_map[ref_axis] = loop_to_ind_map[loop]; } if (ref_id_to_ind_map.count(magic_zero_loop)) { auto& ind = ref_id_to_ind_map[magic_zero_loop]; if (!ind->isConstScalar()) { - ind = ir_builder.addExpr(ind, ir_builder.magicZeroVal()); + ind = SimplifyingIrBuilder::addExpr( + ind, GpuLower::current()->kernel()->magicZeroVal()); } } @@ -2729,7 +2857,7 @@ std::unordered_map getPredicateReferenceIndexing( ref_self_map.insert({id, id}); }); - std::unordered_map reference_halo_extent_map = + std::unordered_map reference_halo_extent_map = getReferenceHaloExtentMap(reference, ref_self_map); // Index into the reference tensor @@ -2741,64 +2869,55 @@ std::unordered_map getPredicateReferenceIndexing( {}, reference_halo_extent_map); - return index_compute.indexMap(); + return index_compute; } // Get the offsets for the start and stop predicates. The offsets // are to be added to the index. -std::pair, std::vector> getStartAndStopOffsets( +std::pair getStartAndStopOffsets( IterDomain* consumer_id, TensorView* consumer_tv, const ReferenceTensor& reference, - const std::unordered_map& ref_start_index_map, - const std::unordered_map& ref_stop_index_map, + const std::unordered_map& consumer_start_index_map, + const std::unordered_map& consumer_stop_index_map, bool padding_predicate, bool unswitch, bool non_divisible_pred) { - const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); - // By default, the offsets for the start and stop predicates are - // just zero. - std::vector start_offsets{ir_builder.zeroVal()}; - std::vector stop_offsets{ir_builder.zeroVal()}; - - if (consumer_id == nullptr) { - return {start_offsets, stop_offsets}; + // just zero. All halo-related adjustments are done at root domains, + // so consumer_id is not a root domain, no adjustment is required. 
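The double-buffer adjustment above boils down to a simple rule: advance the predicate index of the double-buffered loop by one, but only if that index is still the plain loop variable and has not already been replaced for unswitch. A standalone sketch of the rule, with hypothetical types, could be:

#include <iostream>
#include <map>
#include <string>

// Hypothetical stand-ins: a loop identified by name, its current predicate
// index (here just an int), and whether that index is still the plain loop
// variable (i.e. was not overridden for unswitch).
struct PredIndex {
  int value = 0;
  bool is_plain_loop_index = true;
};

void advanceDoubleBufferIndex(
    std::map<std::string, PredIndex>& loop_to_ind,
    const std::string& double_buffer_loop) {
  auto it = loop_to_ind.find(double_buffer_loop);
  if (it == loop_to_ind.end()) {
    return;  // no double-buffered loop in this loop nest
  }
  // Only advance when the index is still the loop variable itself; an
  // unswitch-adjusted index must not be shifted again.
  if (it->second.is_plain_loop_index) {
    it->second.value += 1;
  }
}

int main() {
  std::map<std::string, PredIndex> loop_to_ind = {
      {"i_outer", {0, true}}, {"i_db", {0, true}}};
  advanceDoubleBufferIndex(loop_to_ind, "i_db");
  std::cout << "i_db predicate index offset: " << loop_to_ind["i_db"].value
            << "\n";
  return 0;
}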
+ if (consumer_id->definition() != nullptr && !non_divisible_pred) { + return { + GpuLower::current()->kernel()->zeroVal(), + GpuLower::current()->kernel()->zeroVal()}; } auto consumer_def = consumer_tv->definition(); + Val* start_offset = GpuLower::current()->kernel()->zeroVal(); + Val* stop_offset = GpuLower::current()->kernel()->zeroVal(); + // These adjustments are not required when predicating non-divisible splits if (!non_divisible_pred) { if (consumer_def->isA()) { - adjustStartAndStopOffsetsForShift( - start_offsets, - stop_offsets, - consumer_tv, - consumer_id, - padding_predicate); + std::tie(start_offset, stop_offset) = getStartAndStopOffsetsForShift( + consumer_tv, consumer_id, padding_predicate); } else if (consumer_def->isA()) { - adjustStartAndStopOffsetsForGather( - start_offsets, - stop_offsets, + std::tie(start_offset, stop_offset) = getStartAndStopOffsetsForGather( consumer_tv, consumer_id, - reference, - ref_start_index_map, - ref_stop_index_map, + consumer_start_index_map, + consumer_stop_index_map, padding_predicate); } // Adjustment for partial split - auto partial_split_offset = getGlobalConsumerOffsetWithPartialSplit( - gpu_lower->lowerValue(consumer_id)->as()); - for (auto& start_offset : start_offsets) { - start_offset = ir_builder.addExpr(start_offset, partial_split_offset); - } - for (auto& stop_offset : stop_offsets) { - stop_offset = ir_builder.addExpr(stop_offset, partial_split_offset); - } + auto partial_split_offset = + getGlobalConsumerOffsetWithPartialSplit(consumer_id); + start_offset = + SimplifyingIrBuilder::addExpr(start_offset, partial_split_offset); + stop_offset = + SimplifyingIrBuilder::addExpr(stop_offset, partial_split_offset); // If generating a predicate for unswitch, adjust the stop offset to // accommodate the addition of halo to the loop stop. See the @@ -2808,9 +2927,8 @@ std::pair, std::vector> getStartAndStopOffsets !padding_predicate, "Unswitch should not use the padding predicate"); auto stop_unswitch_offset = getUnswitchStopOffset(consumer_id, consumer_tv); - for (auto& stop_offset : stop_offsets) { - stop_offset = ir_builder.addExpr(stop_offset, stop_unswitch_offset); - } + stop_offset = + SimplifyingIrBuilder::addExpr(stop_offset, stop_unswitch_offset); } } @@ -2830,39 +2948,48 @@ std::pair, std::vector> getStartAndStopOffsets // index + (start_offset - start_limit) >= 0 // index + (stop_offset - stop_limit) < extent - for (auto& start_offset : start_offsets) { - start_offset = ir_builder.subExpr(start_offset, limits.first); - } - for (auto& stop_offset : stop_offsets) { - stop_offset = ir_builder.subExpr(stop_offset, limits.second); - } + start_offset = SimplifyingIrBuilder::subExpr(start_offset, limits.first); + stop_offset = SimplifyingIrBuilder::subExpr(stop_offset, limits.second); - return {start_offsets, stop_offsets}; + return {start_offset, stop_offset}; } -bool canOmitStartPredicate(kir::Val* start_offset) { +// A partial value of a start offset is returned if determined to be +// safe. Nullptr is returned if it can be omitted completely. +Val* simplifyStartOffset(Val* start_offset) { // Start predicate can be omitted when start_offset >= 0. - auto offset_val = start_offset->as()->value(); - return offset_val.has_value() && offset_val.value() >= 0; + auto offset_val = start_offset->as()->value(); + if (offset_val.has_value() && offset_val.value() >= 0) { + return nullptr; + } + + // start_offset may look like min(0, window_index - pad). Then, can + // remove min and leave the rhs only. 
+ auto def = dynamic_cast(start_offset->definition()); + if (def != nullptr && def->getBinaryOpType() == BinaryOpType::Min && + def->lhs()->isZeroInt()) { + return def->rhs(); + } + + return start_offset; } bool canOmitStopPredicate( - kir::Val* stop_index, - kir::Val* stop_offset, - kir::IterDomain* kir_contig_id) { + Val* stop_index, + Val* stop_offset, + IterDomain* contig_id) { bool index_simple = stop_index->definition() == nullptr; // The definition may be just adding the magic zero, which can be // effectively considered "simple" if (!index_simple && isProtectedWithMagicZero(stop_index)) { // Make sure the lhs of stop_index is simple. - auto lhs = stop_index->definition()->as()->lhs(); + auto lhs = stop_index->definition()->as()->lhs(); if (lhs->definition() == nullptr) { index_simple = true; } } - // Omit only when both the index and extent are "simple". - if (!(index_simple && kir_contig_id->extent()->definition() == nullptr)) { + if (!index_simple) { return false; } @@ -2873,33 +3000,38 @@ bool canOmitStopPredicate( // omitted if extent + halo + stop_offset < extent, i.e., halo + // stop_offset <= 0. - auto stop_offset_val = stop_offset->as()->value(); - - auto halo_ext = - gpu_lower->haloInfo().getRootAxisInfo(kir_contig_id).width()->value(); + auto stop_offset_val = stop_offset->as()->value(); // If they are not compile-time constant, can't prove the // condition. - if (!stop_offset_val.has_value() || !halo_ext.has_value()) { + if (!stop_offset_val.has_value()) { return false; } - if (halo_ext.value() + stop_offset_val.value() > 0) { + // Note that when a root domain is halo extended, it is the domain + // to be predicated, not its merged contig id even if it exists. So, + // if contig_id does not have root axis info, contig_id is + // guaranteed to have no halo. + auto halo_ext = gpu_lower->haloInfo().hasRootAxisInfo(contig_id) + ? gpu_lower->haloInfo().getRootAxisInfo(contig_id).width() + : 0; + + if (halo_ext + stop_offset_val.value() > 0) { return false; } // When the domain is parallelized, the parallel dimension must be // exact. Otherwise, there would be extra threads/blocks that need // to be predicated out. - if (isParallelTypeThread(kir_contig_id->parallelType())) { + if (isParallelTypeThread(contig_id->getParallelType())) { if (!gpu_lower->parallelDimensionMap().isExact( - kir_contig_id->parallelType())) { + contig_id->getParallelType())) { return false; } // If the domain has halo, the loop is expanded by the halo // extent, so we can't prove the loop extent is the same as the // parallel dimension. 
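Taken together, the checks above mean the stop predicate can be omitted only when the index is a bare loop variable (possibly protected by magic zero), the halo width plus the stop offset is a known non-positive constant, and any thread-parallel domain has an exact, halo-free parallel dimension. A small boolean sketch of that decision, with the inputs flattened to plain values, is shown below.

#include <iostream>
#include <optional>

// Flattened inputs for the decision; in the real pass these come from the
// index expression, halo info, and the parallel dimension map.
struct StopPredInputs {
  bool index_is_simple = false;    // bare loop index (or index + magic zero)
  std::optional<int> stop_offset;  // compile-time constant if known
  int halo_width = 0;
  bool thread_parallel = false;
  bool parallel_dim_exact = false;
};

bool canOmitStopPredicate(const StopPredInputs& in) {
  if (!in.index_is_simple) {
    return false;
  }
  // Without a constant offset we cannot prove halo + offset <= 0.
  if (!in.stop_offset.has_value()) {
    return false;
  }
  if (in.halo_width + *in.stop_offset > 0) {
    return false;
  }
  if (in.thread_parallel) {
    // Extra threads would need predication unless the dimension is exact,
    // and halo expands the loop beyond the parallel dimension.
    if (!in.parallel_dim_exact || in.halo_width != 0) {
      return false;
    }
  }
  return true;
}

int main() {
  StopPredInputs in;
  in.index_is_simple = true;
  in.stop_offset = 0;
  std::cout << std::boolalpha << canOmitStopPredicate(in) << "\n";
  return 0;
}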
- if (!(halo_ext.has_value() && halo_ext.value() == 0)) { + if (halo_ext != 0) { return false; } } @@ -2907,55 +3039,143 @@ bool canOmitStopPredicate( return true; } +std::pair hoistPredicates( + Val* start_index, + Val* stop_index, + const std::vector& loops, + kir::ForLoop* unswitch_or_vec_loop, + IterDomain* predicated_consumer_id, + TensorView* predicated_consumer_tv, + TensorDomain* ref_td, + const std::unordered_map& ref_start_index_map, + const std::unordered_map& ref_stop_index_map) { + const std::pair same_indices{start_index, stop_index}; + + if (isDisabled(DisableOption::IndexHoist)) { + return same_indices; + } + + const auto start_is_same_as_stop = stop_index == start_index; + + Val* hoisted_stop_index = nullptr; + + if (stop_index->definition() == nullptr) { + // If the index doens't have an expression, nothing to hoist + hoisted_stop_index = stop_index; + } else { + bool inserted = false; + std::tie(hoisted_stop_index, inserted) = + GpuLower::current()->commonIndexMap().insert( + predicated_consumer_id, + predicated_consumer_tv->domain(), + ref_td, + ref_stop_index_map, + loops, + stop_index); + } + + Val* hoisted_start_index = nullptr; + if (start_is_same_as_stop) { + hoisted_start_index = hoisted_stop_index; + } else if (start_index->definition() == nullptr) { + hoisted_start_index = start_index; + } else { + bool inserted = false; + std::tie(hoisted_start_index, inserted) = + GpuLower::current()->commonIndexMap().insert( + predicated_consumer_id, + predicated_consumer_tv->domain(), + ref_td, + ref_start_index_map, + loops, + start_index); + } + + return {hoisted_start_index, hoisted_stop_index}; +} + } // namespace // Returns predicates and the concrete (by loop map) root domains they cover std::pair, ReferenceTensor> Index:: getReferenceRootPredicates( - const kir::TensorView* kir_consumer_tv, + TensorView* consumer_tv, const std::vector& loops, kir::ForLoop* unswitch_or_vec_loop, bool shift_padding) { FUSER_PERF_SCOPE("GpuLower::Lower::Index::getReferenceRootPredicates"); const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); + + const bool is_unswitch = unswitch_or_vec_loop != nullptr; // Nothing needs to be done when padding is not required. - if (shift_padding && !needsPadding(kir_consumer_tv->fuserTv())) { + if (shift_padding && !needsPadding(consumer_tv)) { return {{RootPredicateInfo::getFalseInfo()}, ReferenceTensor{}}; } - auto consumer_tv = kir_consumer_tv->fuserTv(); - // Get a reference tensor replayed as existing loop structure - ReferenceTensor reference = IndexReferenceReplay::getReference(loops); + ReferenceTensor reference = + IndexReferenceReplay::getReference(loops, consumer_tv); // Generate halo information for reference. updateHaloInfoForReference(reference, consumer_tv); + const auto ref_2_consumer = indexMapReferenceTo( + consumer_tv, gpu_lower->caMap(), reference.concrete_to_id); + + const auto reference_halo_extent_map = + getReferenceHaloExtentMap(reference, ref_2_consumer); + + auto db_axis = gpu_lower->doubleBufferInfo().getDoubleBufferAxis(consumer_tv); + + // Indexing is done without considering contig merging. Actual + // predicated domains are determined by considering contiguity. + const ContigIDs contig_finder( + consumer_tv->domain()->domain(), + consumer_tv->getMaybeRFactorDomain(), + std::vector(consumer_tv->getMaybeRFactorDomain().size(), false), + {}); + // Both start and stop positions may need to be predicated. Indexing // differs when generating predicates for unswitch. 
// NOTE: If we could find-and-replace KIR nodes, we could just // generate one index map, clone it and replace the loop-to-index // mappings of unswitched loops for the start predicate. - const auto ref_stop_index_map = getPredicateReferenceIndexing( - loops, reference, unswitch_or_vec_loop, false); - // If not unswitch, share the same indexing map as the stop index map - const auto& ref_start_index_map = unswitch_or_vec_loop != nullptr - ? getPredicateReferenceIndexing( - loops, reference, unswitch_or_vec_loop, true) - : ref_stop_index_map; + auto ref_stop_indexing = getPredicateReferenceIndexing( + loops, reference, unswitch_or_vec_loop, db_axis, false); + const auto consumer_stop_indexing = ref_stop_indexing.updateIndexCompute( + consumer_tv->domain(), + ref_2_consumer, + contig_finder, + reference_halo_extent_map); + const auto& consumer_stop_index_map = consumer_stop_indexing.indexMap(); - auto ref_2_consumer = indexMapReferenceTo( - consumer_tv, gpu_lower->caIndexMap(), reference.concrete_to_id); + // If not unswitch, share the same indexing map as the stop index + // map + const auto& ref_start_indexing = is_unswitch + ? getPredicateReferenceIndexing( + loops, reference, unswitch_or_vec_loop, db_axis, true) + : ref_stop_indexing; + + std::unordered_map consumer_start_index_map; + if (is_unswitch) { + const auto consumer_start_indexing = ref_start_indexing.updateIndexCompute( + consumer_tv->domain(), + ref_2_consumer, + contig_finder, + reference_halo_extent_map); + consumer_start_index_map = consumer_start_indexing.indexMap(); + } else { + consumer_start_index_map = consumer_stop_index_map; + } // Get the contiguous ids we need to generate predicates for auto contig_id_infos = - getPredicateContigIds(reference, consumer_tv, ref_2_consumer); + getPredicateContigIds(consumer_tv, consumer_stop_index_map); auto non_divisible_splits = - getNonDivisibleReferenceDomainsToPredicate(consumer_tv, reference); + getNonDivisibleConsumerDomainsToPredicate(consumer_tv); contig_id_infos.insert( contig_id_infos.end(), non_divisible_splits.begin(), @@ -2972,52 +3192,22 @@ std::pair, ReferenceTensor> Index:: } auto root_ids = contig_id_entry.covered_ids; - auto kir_contig_id = - gpu_lower->lowerValue(contig_id)->as(); - const auto ref_stop_indexing_it = ref_stop_index_map.find(kir_contig_id); + const auto consumer_stop_indexing_it = + consumer_stop_index_map.find(contig_id); - // First condition below is due to broadcasts in consumers of consumer that - // are not in consumer there can be unresolved indexing in the reference - // tensor. This can happen when we have something like: TV3[i1o*i2, i1i] and - // TV1[i2] where tv3 and tv1 share their outer dimension. i1 will be part of - // reference tensors root domain, but when indexing into TV1 there aren't - // enough indices to resolve it. - // - // The condition also happens with Misaligned predicates, where + // First condition below happens with Misaligned predicates, where // inner-most vectorized loops are not included in the loops // parameter. Predicates involving vectorized loops are separately // generated in lower_misaligned_vectorization. // - // It can also happens with rfactored reductions. The reference - // tensor may include rfactored domains, so the contig id may be - // a root domain of the reference, not a rfactor root. Since - // there is no loop for rfactor domains, there's no indexing - // mapping for root domains. This seems safe as it can only happen - // with rfactor and rfactored tensors do not need predicates. 
- // // Second condition is simply to avoid predication on broadcasting axes as // it's not required. - if (ref_stop_indexing_it == ref_stop_index_map.end() || - ref_stop_indexing_it->second->isZeroInt()) { + if (consumer_stop_indexing_it == consumer_stop_index_map.end() || + consumer_stop_indexing_it->second->isZeroInt()) { continue; } - // Find a corresponding consumer root id if exists. Used to - // support shift. If a contig_id is a merged non-root domain, nothing - // is required to do for shift as shift-related domains are - // excluded from contig domains. - IterDomain* consumer_id = nullptr; - if (contig_id->definition() == nullptr || - contig_id_entry.is_non_divisible_split) { - auto it = ref_2_consumer.find(contig_id); - if (it != ref_2_consumer.end()) { - consumer_id = it->second; - } else { - continue; - } - } - RootPredicateInfo info; // Compute offsets for start and stop predicate. For non-shift, @@ -3032,53 +3222,61 @@ std::pair, ReferenceTensor> Index:: // The final predicates will look like: // (index + start_offset) >= 0 && (index + stop_offset) < extent. - std::tie(info.start_offsets_, info.stop_offsets_) = getStartAndStopOffsets( - consumer_id, + std::tie(info.start_offset_, info.stop_offset_) = getStartAndStopOffsets( + contig_id, consumer_tv, reference, - ref_start_index_map, - ref_stop_index_map, + consumer_start_index_map, + consumer_stop_index_map, shift_padding, unswitch_or_vec_loop != nullptr, contig_id_entry.is_non_divisible_split); - auto stop_index = ref_stop_indexing_it->second; - auto start_index = ref_start_index_map.at(kir_contig_id); + auto stop_index = consumer_stop_indexing_it->second; + auto start_index = consumer_start_index_map.at(contig_id); + + std::tie(start_index, stop_index) = hoistPredicates( + start_index, + stop_index, + loops, + unswitch_or_vec_loop, + contig_id, + consumer_tv, + reference.domain, + ref_start_indexing.indexMap(), + ref_stop_indexing.indexMap()); // Build predicates for start positions as: // start_index + start_offset >= 0 - for (auto start_offset : info.start_offsets_) { - if (canOmitStartPredicate(start_offset)) { - info.start_predicates_.push_back(ir_builder.trueVal()); - continue; - } + auto start_offset = simplifyStartOffset(info.start_offset_); + if (start_offset == nullptr) { + info.start_predicate_ = GpuLower::current()->kernel()->trueVal(); + } else { auto offsetted_start_index = - ir_builder.addExpr(start_index, start_offset); - auto pred = - ir_builder.geExpr(offsetted_start_index, ir_builder.zeroVal()) - ->as(); - info.start_predicates_.push_back(pred); + SimplifyingIrBuilder::addExpr(start_index, start_offset); + auto start_pred = + SimplifyingIrBuilder::geExpr( + offsetted_start_index, GpuLower::current()->kernel()->zeroVal()) + ->as(); + info.start_predicate_ = start_pred; } // Build predicates for stop positions as: // stop_index + stop_offset < IterDomain::extent - for (auto stop_offset : info.stop_offsets_) { - if (canOmitStopPredicate(stop_index, stop_offset, kir_contig_id)) { - info.stop_predicates_.push_back(ir_builder.trueVal()); - continue; - } - auto offsetted_stop_index = ir_builder.addExpr(stop_index, stop_offset); - auto pred = - ir_builder.ltExpr(offsetted_stop_index, kir_contig_id->extent()) - ->as(); - info.stop_predicates_.push_back(pred); + auto stop_offset = info.stop_offset_; + if (canOmitStopPredicate(stop_index, stop_offset, contig_id)) { + info.stop_predicate_ = GpuLower::current()->kernel()->trueVal(); + } else { + auto offsetted_stop_index = + SimplifyingIrBuilder::addExpr(stop_index, 
stop_offset); + auto stop_pred = SimplifyingIrBuilder::ltExpr( + offsetted_stop_index, contig_id->extent()) + ->as(); + info.stop_predicate_ = stop_pred; } - // Transform ids from reference to concrete and consumer domains - // (based on loop compute at map) - for (auto ref_id : contig_id_entry.covered_ids) { - info.root_ids_.insert(reference.id_to_concrete.at(ref_id)); - info.consumer_ids_.insert(ref_2_consumer.at(ref_id)); + for (auto consumer_id : contig_id_entry.covered_ids) { + info.root_ids_.insert(consumer_id); } pred_info_vec.emplace_back(info); } @@ -3089,7 +3287,7 @@ std::pair, ReferenceTensor> Index:: bool Index::protectWithMagicZero( kir::ForLoop* loop, IterDomain* reference_domain, - kir::Val* ind) { + Val* ind) { bool ref_dom_simple = (reference_domain == nullptr ? true : reference_domain->definition() != nullptr); @@ -3100,16 +3298,9 @@ bool Index::protectWithMagicZero( } RootPredicateInfo RootPredicateInfo::getFalseInfo() { - const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); - RootPredicateInfo info; - info.start_predicates_.push_back(ir_builder.falseVal()); - info.stop_predicates_.push_back(ir_builder.falseVal()); - // These are just placeholder. When the predicate is false, the - // offset should not be used. - info.start_offsets_.push_back(nullptr); - info.stop_offsets_.push_back(nullptr); + info.start_predicate_ = GpuLower::current()->kernel()->falseVal(); + info.stop_predicate_ = GpuLower::current()->kernel()->falseVal(); return info; } diff --git a/torch/csrc/jit/codegen/cuda/index_compute.h b/torch/csrc/jit/codegen/cuda/index_compute.h index 83536067c19e..1a88b00fa25c 100644 --- a/torch/csrc/jit/codegen/cuda/index_compute.h +++ b/torch/csrc/jit/codegen/cuda/index_compute.h @@ -60,6 +60,8 @@ namespace jit { namespace fuser { namespace cuda { +class ContigIDs; + class IndexCompute : public BackwardVisitor { protected: using BackwardVisitor::handle; @@ -69,30 +71,30 @@ class IndexCompute : public BackwardVisitor { void handle(Expr*) override; // return extent_map_[id] if exists, else return id->extent() - kir::Val* getExtent(kir::IterDomain* id); + Val* getExtent(IterDomain* id) const; //! True if a domain is not used to index - bool isZero(kir::IterDomain* id) const; + bool isZero(IterDomain* id) const; //! True if any dependent of a domain is not used to index - bool hasZeroMerged(kir::IterDomain* id) const; + bool hasZeroMerged(IterDomain* id) const; // Tensor domain we're mapping back to root const TensorDomain* td_; // NOLINT // Map we update as we propagate backward, containing all IDs in the // propagation. Initial indices are mapped with this map at tv->domain() - // and are back propagated to tv->rootDomain(). This index_map_ keeps the + // and are back propagated to tv->getRootDomain(). This index_map_ keeps the // indices at intermediate IterDomain's in that back propagation. - std::unordered_map index_map_; // NOLINT + std::unordered_map index_map_; // NOLINT // Map from IterDomain to their broadcasted extent. If a TV has I0*I1 but its // producer has B0*I1 this map will contain a mapping from the ID{B0*I1} to // the extent I0*I1. Also contains updated extents if we merge in a 0 index. // See zero_merged_in_. 
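Stepping back, every predicate assembled in getReferenceRootPredicates reduces to the same shape, (index + start_offset) >= 0 and (index + stop_offset) < extent, with either half replaced by true when it is proven redundant. A tiny sketch evaluating that form on plain integers (the offsets here are made up, e.g. a padded shift contributing a -1 start offset):

#include <iostream>

// Evaluate the canonical root-domain predicate on plain integers. In the
// lowering these are IR expressions; true is substituted for a half that
// can be statically omitted.
bool rootPredicate(
    int index,
    int extent,
    int start_offset,
    int stop_offset,
    bool omit_start = false,
    bool omit_stop = false) {
  const bool start_ok = omit_start || (index + start_offset >= 0);
  const bool stop_ok = omit_stop || (index + stop_offset < extent);
  return start_ok && stop_ok;
}

int main() {
  std::cout << std::boolalpha;
  // Example: a -1 start offset marks the first element as the padded region.
  for (int i = 0; i < 4; ++i) {
    std::cout << "i=" << i << " in-bounds="
              << rootPredicate(i, /*extent=*/4, /*start_offset=*/-1,
                               /*stop_offset=*/0)
              << "\n";
  }
  return 0;
}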
- std::unordered_map extent_map_; // NOLINT + std::unordered_map extent_map_; // NOLINT // Keeps track of domains that do not contribute to indexing - std::unordered_set zero_domains_; // NOLINT + std::unordered_set zero_domains_; // NOLINT // This set keeps track of IterDomain's that have had a zero index merged into // them. This happens if we do something like tv->axis(0)->split(4) then @@ -100,56 +102,71 @@ class IndexCompute : public BackwardVisitor { // indexing would be (0, i) then when we do the backward computation that zero // and i would attempt to be merged together. We handle indices like these // specially. - std::unordered_set zero_merged_in_; + std::unordered_set zero_merged_in_; // IDs that are a result of contiguous merges - std::unordered_set contig_ids; + std::unordered_set contig_ids_; + + // Map from root to indexed domains + std::unordered_map root_to_indexed_id_; // Mentions if we should propagate an index down a particular IterDomain path // if there's an option - std::unordered_set preferred_paths_; + std::unordered_set preferred_paths_; // Map from IterDomains to halo-extended extents in corresponding // reference tensor - std::unordered_map reference_halo_extent_map_; + std::unordered_map reference_halo_extent_map_; public: - const std::unordered_map& indexMap() const { + const std::unordered_map& indexMap() const { return index_map_; } - const std::unordered_map& extentMap() const { + const std::unordered_map& extentMap() const { return extent_map_; } - const std::unordered_set& zeroDomains() const { + const std::unordered_set& zeroDomains() const { return zero_domains_; } - const std::unordered_set& zeroMergedIn() const { + const std::unordered_set& zeroMergedIn() const { return zero_merged_in_; } + const std::unordered_map& rootToContigID() const { + return root_to_indexed_id_; + } + // Propagate back from _td using initial_index_map IndexCompute( const TensorDomain* _td, - std::unordered_map initial_index_map, - std::unordered_map _extent_map, - std::unordered_set zero_domains, - std::unordered_set _zero_merged_in, - const std::vector& _root_contiguity, - std::unordered_set preferred_paths = {}, - std::unordered_map - reference_halo_extent_map = {}); + std::unordered_map initial_index_map, + std::unordered_map _extent_map, + std::unordered_set zero_domains, + std::unordered_set _zero_merged_in, + std::unordered_set preferred_paths = {}, + std::unordered_map reference_halo_extent_map = {}); + + IndexCompute( + const TensorDomain* _td, + std::unordered_map initial_index_map, + std::unordered_map _extent_map, + std::unordered_set zero_domains, + std::unordered_set _zero_merged_in, + const ContigIDs& contig_finder, + std::unordered_set preferred_paths = {}, + std::unordered_map reference_halo_extent_map = {}); // Updates index_map, extent_map, and zero_merged_in based on id_map and // returns a new IndexCompute ready to be used. 
IndexCompute updateIndexCompute( const TensorDomain* new_td, const std::unordered_map& id_map, - const std::vector& _root_contiguity, - const std::unordered_map& - reference_halo_extent_map = {}); + const ContigIDs& contig_finder, + const std::unordered_map& reference_halo_extent_map = + {}) const; virtual void run(); }; @@ -159,10 +176,10 @@ class IndexSwizzle : public IndexCompute { public: IndexSwizzle( const TensorView* tv, - std::unordered_map initial_index_map, - std::unordered_map extent_map, - std::unordered_set zero_domains, - std::unordered_set zero_merged_in); + std::unordered_map initial_index_map, + std::unordered_map extent_map, + std::unordered_set zero_domains, + std::unordered_set zero_merged_in); void run() override; @@ -183,51 +200,45 @@ class RootPredicateInfo { friend class Index; public: - const auto& startPredicates() const { - return start_predicates_; + const auto& startPredicate() const { + return start_predicate_; } - auto& startPredicates() { - return start_predicates_; + auto& startPredicate() { + return start_predicate_; } - const auto& startOffsets() const { - return start_offsets_; + const auto& startOffset() const { + return start_offset_; } - const auto& stopPredicates() const { - return stop_predicates_; + const auto& stopPredicate() const { + return stop_predicate_; } - const auto& stopOffsets() const { - return stop_offsets_; + const auto& stopOffset() const { + return stop_offset_; } const auto& rootIds() const { return root_ids_; } - const auto& consumerIds() const { - return consumer_ids_; - } - //! Return a false RootPredicateInfo, i.e., both start and stop //! predicates are false. static RootPredicateInfo getFalseInfo(); private: - // prdicates for lower end - std::vector start_predicates_; - // prdicates for upper end - std::vector stop_predicates_; - // Offsets of the start predicate - std::vector start_offsets_; - // Offsets of the stop predicate - std::vector stop_offsets_; + // prdicate for lower end + Bool* start_predicate_ = nullptr; + // prdicate for upper end + Bool* stop_predicate_ = nullptr; + // Offset of the start predicate + Val* start_offset_ = nullptr; + // Offset of the stop predicate + Val* stop_offset_ = nullptr; // Track which roots have been handled by the generated predicates std::unordered_set root_ids_; - // Consumer IDs that correspond to root_ids_ - std::unordered_set consumer_ids_; }; // Simple interface for IndexCompute @@ -236,24 +247,24 @@ class RootPredicateInfo { class Index { private: // Producer indexing if it's in shared or local memory - static std::vector getNonGlobalProducerStridedIndices( + static std::vector getNonGlobalProducerStridedIndices( TensorView* producer, const TensorView* consumer, const std::vector& loops); // Consumer indexing if it's in shared or local memory - static std::vector getNonGlobalConsumerStridedIndices( + static std::vector getNonGlobalConsumerStridedIndices( const TensorView* consumer, const std::vector& loops); // Producer if it's in global memory - static std::vector getGlobalProducerStridedIndices( + static std::vector getGlobalProducerStridedIndices( TensorView* producer, const TensorView* consumer, const std::vector& loops); // Consumer indexing if it's in global memory - static std::vector getGlobalConsumerStridedIndices( + static std::vector getGlobalConsumerStridedIndices( const TensorView* consumer, const std::vector& loops); @@ -276,7 +287,7 @@ class Index { //! root domain of a producer tensor. The size of the returned //! 
vector is guaranteed to be equal to the number of axes of the //! indexing root domain. - static std::vector getProducerStridedIndices( + static std::vector getProducerStridedIndices( TensorView* producer, const TensorView* consumer, const std::vector& loops); @@ -285,7 +296,7 @@ class Index { //! root domain of a consumer tensor. The size of the returned //! vector is guaranteed to be equal to the number of axes of the //! indexing root domain. - static std::vector getConsumerStridedIndices( + static std::vector getConsumerStridedIndices( const TensorView* consumer, const std::vector& loops); @@ -313,7 +324,7 @@ class Index { //! vectorized loop. static std::pair, ReferenceTensor> getReferenceRootPredicates( - const kir::TensorView* kir_consumer_tv, + TensorView* consumer_tv, const std::vector& loops, kir::ForLoop* unswitch_or_vec_loop, bool padding_predicate); @@ -328,7 +339,7 @@ class Index { static bool protectWithMagicZero( kir::ForLoop* loop, IterDomain* reference_domain = nullptr, - kir::Val* ind = nullptr); + Val* ind = nullptr); }; } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/index_reference_replay.cpp b/torch/csrc/jit/codegen/cuda/index_reference_replay.cpp index fcd0a8937ed8..bdb334ab044a 100644 --- a/torch/csrc/jit/codegen/cuda/index_reference_replay.cpp +++ b/torch/csrc/jit/codegen/cuda/index_reference_replay.cpp @@ -1,11 +1,11 @@ #include +#include #include +#include #include #include #include -#include -#include namespace torch { namespace jit { @@ -41,18 +41,15 @@ IterDomain* IndexReferenceReplay::idCopy(IterDomain* id) { // reduction. All we care about are the transformations, and trying to make // sure we track correctly a replaying with consistent reduction/broadcast // domains is challenging and unnecessary. - auto copied_id = - new IterDomain(id->start(), id->extent(), id->getParallelType()); + auto copied_id = SimplifyingIrBuilder::create( + id->container(), id->start(), id->extent(), id->getParallelType()); replayed_ids_.emplace_back(copied_id); return copied_id; } -IterDomain* IndexReferenceReplay::toFusionID(kir::IterDomain* kir_id) { - return ca_map_.toFusion(kir_id); -} - IterDomain* IndexReferenceReplay::toConcrete(IterDomain* id) { - return ca_map_.getConcreteMappedID(id); + return GpuLower::current()->caMap()->getConcreteMappedID( + id, IdMappingMode::EXACT); } void IndexReferenceReplay::handle(Split* split) { @@ -64,13 +61,14 @@ void IndexReferenceReplay::handle(Split* split) { // Don't produce the same values multiple times auto ref_outer = concreteToRefId(toConcrete(split->outer())); auto ref_inner = concreteToRefId(toConcrete(split->inner())); - if (ref_id_produced_.find(ref_outer) != ref_id_consumed_.end() || - ref_id_produced_.find(ref_inner) != ref_id_consumed_.end()) { + if (ref_id_produced_.find(ref_outer) != ref_id_produced_.end() || + ref_id_produced_.find(ref_inner) != ref_id_produced_.end()) { return; } // Replay the provided split operation and add it to the reference DAG - new Split( + SimplifyingIrBuilder::create( + split->container(), ref_outer, ref_inner, ref_in, @@ -96,12 +94,13 @@ void IndexReferenceReplay::handle(Merge* merge) { // Don't produce the same values multiple times auto ref_out = concreteToRefId(toConcrete(merge->out())); - if (ref_id_produced_.find(ref_out) != ref_id_consumed_.end()) { + if (ref_id_produced_.find(ref_out) != ref_id_produced_.end()) { return; } // Replay the provided merge operation and add it to the reference DAG - new Merge(ref_out, ref_outer, ref_inner); + SimplifyingIrBuilder::create( + 
merge->container(), ref_out, ref_outer, ref_inner); // Mark producers and consumers ref_id_consumed_.emplace(ref_outer); @@ -122,6 +121,56 @@ void IndexReferenceReplay::handle(Expr* e) { OptInDispatch::handle(e); } +namespace { + +bool isMappedWithAny(IterDomain* id, const std::vector& ids) { + return std::any_of(ids.begin(), ids.end(), [&](Val* val) { + return val->isA() && + GpuLower::current()->caMap()->areMapped( + id, val->as(), IdMappingMode::PERMISSIVE); + }); +} + +// Get an rfactor IterDomain that is mapped with an IterDomain. If +// multiple such IDs exist, select one whose input IDs are mapped with +// the consumer IDs. This is to ensure the path from the leaf +// IterDomains to the root matches with the consumer tensor. +IterDomain* getRfactorIDToTraverse( + IterDomain* id, + const std::vector& consumer_all_ids) { + const auto& rfactor_ids = + GpuLower::current()->caMap()->getViewRfactorDomainsOfIdGroup( + id, IdMappingMode::PERMISSIVE); + + if (rfactor_ids.empty()) { + return nullptr; + } + + for (auto rfactor_id : rfactor_ids) { + auto def = rfactor_id->definition(); + if (def == nullptr) { + continue; + } + + auto rfactor_id_inputs = ir_utils::filterByType(def->inputs()); + if (std::all_of( + rfactor_id_inputs.begin(), + rfactor_id_inputs.end(), + [&](IterDomain* rfactor_id_input) { + return isMappedWithAny(rfactor_id_input, consumer_all_ids); + })) { + return rfactor_id; + } + } + + // No mapped ID found, which means the consumer is a post-view + // tensor. In that case, it shouldn't matter which view path to + // traverse, so just return the first one. + return rfactor_ids.at(0); +} + +} // namespace + TensorDomain* IndexReferenceReplay::computeReplay() { // Throw an error when two loops are mapped with each other, which // violates an assumption that unique mappings between concrete @@ -139,7 +188,10 @@ TensorDomain* IndexReferenceReplay::computeReplay() { ++it_i) { for (auto it_j = it_i + 1; it_j != loop_structure_.end(); ++it_j) { TORCH_INTERNAL_ASSERT( - !ca_map_.areMapped((*it_i)->iter_domain(), (*it_j)->iter_domain()), + !GpuLower::current()->caMap()->areMapped( + (*it_i)->iter_domain(), + (*it_j)->iter_domain(), + IdMappingMode::EXACT), "Unsupported loop structure. Two loops are mapped together."); } } @@ -149,7 +201,13 @@ TensorDomain* IndexReferenceReplay::computeReplay() { loop_structure_.begin(), loop_structure_.end(), std::back_inserter(domain_ids), - [this](kir::ForLoop* fl) { return toFusionID(fl->iter_domain()); }); + [](kir::ForLoop* fl) { return fl->iter_domain(); }); + + const auto consumer_all_ids = DependencyCheck::getAllValsBetween( + {consumer_tv_->getRootDomain().begin(), + consumer_tv_->getRootDomain().end()}, + {consumer_tv_->domain()->domain().begin(), + consumer_tv_->domain()->domain().end()}); // IterVisitor based traversals don't work because we don't have all outputs. // backward traversal's traverseFrom(domain_ids) will throw "Invalid backward @@ -161,13 +219,21 @@ TensorDomain* IndexReferenceReplay::computeReplay() { // so their broadcast dimensions are "more" resolved than those towards the // inner most loops. 
std::deque to_visit(domain_ids.begin(), domain_ids.end()); - std::unordered_set visited; + std::unordered_set visited_exprs; + std::unordered_set visited_ids; while (!to_visit.empty()) { auto out_id = to_visit.front(); to_visit.pop_front(); + if (!visited_ids.emplace(out_id).second) { + continue; + } auto expr = out_id->definition(); + if (auto rfactor_id = getRfactorIDToTraverse(out_id, consumer_all_ids)) { + to_visit.emplace_front(rfactor_id); + } + // ID's will be copied for the reference as we replay transformations. If // there was no transformations on an iteration domain, a copy of the // iteration domain for the reference is made here. @@ -179,7 +245,7 @@ TensorDomain* IndexReferenceReplay::computeReplay() { continue; } - if (!visited.emplace(expr).second) { + if (!visited_exprs.emplace(expr).second) { continue; } @@ -194,14 +260,14 @@ TensorDomain* IndexReferenceReplay::computeReplay() { // Construct a tensor that's representitive of the replayed loop structure. std::vector loops_replayed_domain; for (auto loop : loop_structure_) { - auto loop_id = toFusionID(loop->iter_domain()); + auto loop_id = loop->iter_domain(); // Map to loops with the loop map, but make sure the replayed id is actually // a leaf in the replay. auto ref_id_it = std::find_if( replayed_ids_.begin(), replayed_ids_.end(), [&](IterDomain* ref_id) { return ref_id->uses().empty() && - GpuLower::current()->caLoopMap().areMapped( - refIdToConcrete(ref_id), loop_id); + GpuLower::current()->caMap()->areMapped( + refIdToConcrete(ref_id), loop_id, IdMappingMode::PERMISSIVE); }); TORCH_INTERNAL_ASSERT( @@ -216,16 +282,16 @@ TensorDomain* IndexReferenceReplay::computeReplay() { ref_id->parallelize(loop_id->getParallelType()); } + TensorDomain* domain = nullptr; // If no domains were replayed to make the reference, just return the root // domain. if (std::none_of( loops_replayed_domain.begin(), loops_replayed_domain.end(), [](IterDomain* id) { return id->definition() != nullptr; })) { - auto domain = new TensorDomain( + domain = SimplifyingIrBuilder::create( // If there was no replay only return a domain with a root domain. loops_replayed_domain); - return domain; } else { // Construct the root domain as the inputs of the replayed domain auto loops_replayed_domain_vals = @@ -257,35 +323,83 @@ TensorDomain* IndexReferenceReplay::computeReplay() { } // Create and return the reference. - auto domain = new TensorDomain( - {root_domain_ids.begin(), root_domain_ids.end()}, + domain = SimplifyingIrBuilder::create( + std::vector( + root_domain_ids.begin(), root_domain_ids.end()), loops_replayed_domain); - return domain; + } + + cleanUpMappingsOfUnusedDomains(domain); + return domain; +} + +void IndexReferenceReplay::cleanUpMappingsOfUnusedDomains( + TensorDomain* ref_domain) { + // The ref-to-concrete and concrete-to-ref maps can have mappings of + // domains that do not end up being used in the final reference + // domain. Drop them as they are not really part of reference + // tensor. 
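The cleanup described above is essentially an erase-if over the two maps, keeping only entries whose reference ID is reachable between the final domain's roots and leaves. A generic sketch of that pruning, independent of the IR types, might be:

#include <iostream>
#include <string>
#include <unordered_map>
#include <unordered_set>

// Drop every map entry whose reference ID is not among the IDs actually used
// by the final reference domain.
template <typename Map, typename GetId>
void pruneUnused(
    Map& map, const std::unordered_set<std::string>& used_ids, GetId get_id) {
  for (auto it = map.begin(); it != map.end();) {
    if (used_ids.count(get_id(*it)) == 0) {
      it = map.erase(it);
    } else {
      ++it;
    }
  }
}

int main() {
  const std::unordered_set<std::string> used = {"r0", "r1"};
  std::unordered_map<std::string, std::string> ref_to_concrete = {
      {"r0", "c0"}, {"r1", "c1"}, {"r2", "c2"}};
  std::unordered_map<std::string, std::string> concrete_to_ref = {
      {"c0", "r0"}, {"c1", "r1"}, {"c2", "r2"}};
  // ref-to-concrete is keyed by the reference ID; concrete-to-ref stores it
  // as the value.
  pruneUnused(ref_to_concrete, used, [](const auto& kv) { return kv.first; });
  pruneUnused(concrete_to_ref, used, [](const auto& kv) { return kv.second; });
  std::cout << ref_to_concrete.size() << " " << concrete_to_ref.size() << "\n";
  return 0;
}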
+ + const auto all_vals = DependencyCheck::getAllValsBetween( + {ref_domain->getRootDomain().begin(), ref_domain->getRootDomain().end()}, + {ref_domain->domain().begin(), ref_domain->domain().end()}); + + const std::unordered_set all_id_set( + ir_utils::filterByType(all_vals).begin(), + ir_utils::filterByType(all_vals).end()); + for (auto it = ref_id_to_concrete_.begin(); + it != ref_id_to_concrete_.end();) { + IterDomain* ref_id = it->first; + if (all_id_set.find(ref_id) == all_id_set.end()) { + it = ref_id_to_concrete_.erase(it); + } else { + ++it; + } + } + + for (auto it = concrete_to_ref_id_.begin(); + it != concrete_to_ref_id_.end();) { + IterDomain* ref_id = it->second; + if (all_id_set.find(ref_id) == all_id_set.end()) { + it = concrete_to_ref_id_.erase(it); + } else { + ++it; + } } } IndexCompute getReferenceIndexing( const std::vector& loop_structure, - TensorDomain* reference_tensor) { - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - + TensorDomain* reference_tensor, + kir::ForLoop* double_buffer_loop) { // Create a simple index mapping from loop iter domains to their local index. // This is only applicable to global memory buffers. - std::unordered_map initial_index_map; + std::unordered_map initial_index_map; TORCH_INTERNAL_ASSERT(loop_structure.size() <= reference_tensor->nDims()); int magic_zero_loop = -1; for (const auto loop_i : c10::irange(loop_structure.size())) { auto ref_axis = reference_tensor->axis(loop_i); - auto kir_ref_axis = gpu_lower->lowerValue(ref_axis)->as(); auto loop = loop_structure[loop_i]; auto ind = loop->index(); - ; - initial_index_map[kir_ref_axis] = ind; - if (loop->vectorize()) { - initial_index_map[kir_ref_axis] = ir_builder.create(0); + // If the loop is trivial, only the start value is used + if (loop->isTrivial()) { + initial_index_map[ref_axis] = loop->start(); + } else { + initial_index_map[ref_axis] = ind; + } + + if (double_buffer_loop == loop) { + TORCH_INTERNAL_ASSERT( + !loop->isTrivial(), "The double buffer loop must be materialized"); + // This version of getReferenceIndexing is only used for + // indexing global tensors. When indexing global producers, the + // index for a double buffered loop needs to be incremented. The + // parameter double_buffer_loop should be nullptr when indexing + // global consumers tensors. 
+ initial_index_map[ref_axis] = SimplifyingIrBuilder::addExpr( + initial_index_map[ref_axis], GpuLower::current()->kernel()->oneVal()); } if (Index::protectWithMagicZero(loop, ref_axis, ind)) { @@ -295,10 +409,9 @@ IndexCompute getReferenceIndexing( // Add magic zero to a fairly inner most index if (magic_zero_loop >= 0) { - auto ref_id = gpu_lower->lowerValue(reference_tensor->axis(magic_zero_loop)) - ->as(); - initial_index_map[ref_id] = ir_builder.addExpr( - initial_index_map[ref_id], ir_builder.magicZeroVal()); + auto ref_id = reference_tensor->axis(magic_zero_loop); + initial_index_map[ref_id] = SimplifyingIrBuilder::addExpr( + initial_index_map[ref_id], FusionGuard::getCurFusion()->magicZeroVal()); } // Send to the other version of reference indexing that directly takes the @@ -310,19 +423,17 @@ IndexCompute getReferenceIndexing( IndexCompute getReferenceIndexing( const std::vector& loop_structure, TensorDomain* reference_tensor, - std::unordered_map index_map, - std::unordered_set zero_domains, + std::unordered_map index_map, + std::unordered_set zero_domains, std::unordered_set preferred_paths, - std::unordered_map halo_extent_map) { - auto gpu_lower = GpuLower::current(); - + std::unordered_map halo_extent_map) { // I thought this might be necesasry, but turns out it's not. I think it's // because of the root ordering above, however leaving it in case we find // out it is necessary in some cases. At the time of commiting, cuda-memcheck // passed without this. // - // std::unordered_map reference_extent_map; for (auto loop : loop_structure) { + // std::unordered_map reference_extent_map; for (auto loop : loop_structure) { // // If there's a broadcast merged in the for loop ID we want to track its // // extent // auto inputs = InputsOf::outputs( @@ -342,15 +453,13 @@ IndexCompute getReferenceIndexing( // } // } - // Convert to preferred_path to kir::IterDomain for IndexCompute - std::unordered_set kir_preferred_path; - std::transform( - preferred_paths.begin(), - preferred_paths.end(), - std::inserter(kir_preferred_path, kir_preferred_path.begin()), - [&gpu_lower](IterDomain* id) { - return gpu_lower->lowerValue(id)->as(); - }); + // No contig indexing is done in reference indexing + ContigIDs contig_finder( + reference_tensor->domain(), + reference_tensor->getMaybeRFactorDomain(), + std::vector( + reference_tensor->getMaybeRFactorDomain().size(), false), + {}); IndexCompute compute( reference_tensor, @@ -359,9 +468,9 @@ IndexCompute getReferenceIndexing( // in this function {}, zero_domains, - std::unordered_set(), - reference_tensor->contiguity(), - kir_preferred_path, + std::unordered_set(), + contig_finder, + preferred_paths, halo_extent_map); compute.run(); diff --git a/torch/csrc/jit/codegen/cuda/index_reference_replay.h b/torch/csrc/jit/codegen/cuda/index_reference_replay.h index c4626213e76b..144b295faa7e 100644 --- a/torch/csrc/jit/codegen/cuda/index_reference_replay.h +++ b/torch/csrc/jit/codegen/cuda/index_reference_replay.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -17,9 +17,10 @@ namespace cuda { class IndexReferenceReplay : public OptInDispatch { private: - IndexReferenceReplay(const std::vector& loop_structure) - : loop_structure_(loop_structure), - ca_map_(GpuLower::current()->caIndexMap()) {} + IndexReferenceReplay( + const std::vector& loop_structure, + const TensorView* consumer_tv) + : loop_structure_(loop_structure), consumer_tv_(consumer_tv) {} // Generate the replay. 
TensorDomain* computeReplay(); @@ -34,13 +35,13 @@ class IndexReferenceReplay : public OptInDispatch { // Make a new id for the reference replay based on the provided id IterDomain* idCopy(IterDomain* id); - // Use the compute at map to get the fusion IterDomain from the - // kir::IterDomain - IterDomain* toFusionID(kir::IterDomain* kir_id); - // Return the concrete entry of the non-reference id IterDomain* toConcrete(IterDomain* id); + //! Remove mappings of reference IDs that do not end up being used + //! in the final reference domain + void cleanUpMappingsOfUnusedDomains(TensorDomain* reference_domain); + using OptInDispatch::handle; void handle(Split* split) override; @@ -50,9 +51,8 @@ class IndexReferenceReplay : public OptInDispatch { private: // Hold the loop structure we're generating a reference for. const std::vector& loop_structure_; - - // Hold the compute at map used for the replay (index map) - const ComputeAtMap& ca_map_; + // The indexed or predicated consumer tensor + const TensorView* consumer_tv_ = nullptr; // Keep a vector of all iteration domains used in the reference (includes all // transformations) @@ -73,8 +73,9 @@ class IndexReferenceReplay : public OptInDispatch { public: // Generate the reference of the provided loop nest structure static ReferenceTensor getReference( - const std::vector& loop_structure) { - auto replay = IndexReferenceReplay(loop_structure); + const std::vector& loop_structure, + const TensorView* consumer_tv) { + auto replay = IndexReferenceReplay(loop_structure, consumer_tv); ReferenceTensor ref; ref.domain = replay.computeReplay(); ref.concrete_to_id = replay.concrete_to_ref_id_; @@ -87,16 +88,17 @@ class IndexReferenceReplay : public OptInDispatch { IndexCompute getReferenceIndexing( const std::vector& loop_structure, TensorDomain* reference_domain, - std::unordered_map index_map, - std::unordered_set zero_domains, + std::unordered_map index_map, + std::unordered_set zero_domains, std::unordered_set preferred_path, - std::unordered_map halo_extent_map = {}); + std::unordered_map halo_extent_map = {}); // Short cut for global TVs. Index into the reference based on all loop indicies // in the loop structure. IndexCompute getReferenceIndexing( const std::vector& loop_structure, - TensorDomain* reference_domain); + TensorDomain* reference_domain, + kir::ForLoop* double_buffer_loop = nullptr); // When indexing there are sometimes an option to propagate an index down // multiple paths. 
This will return the IterDomains in the history of the diff --git a/torch/csrc/jit/codegen/cuda/instrumentation.cpp b/torch/csrc/jit/codegen/cuda/instrumentation.cpp index 52e16b3a7afe..16b7f33a8e3a 100644 --- a/torch/csrc/jit/codegen/cuda/instrumentation.cpp +++ b/torch/csrc/jit/codegen/cuda/instrumentation.cpp @@ -1,6 +1,6 @@ #include -#include +#include #ifdef _WIN32 #include @@ -32,7 +32,7 @@ Trace::Trace() { logEvent('I', "TRACE_START"); } - if (getenv("PYTORCH_NVFUSER_DISABLE_NVTX")) { + if (isDisabled(DisableOption::Nvtx)) { record_nvtx_range_ = false; } } diff --git a/torch/csrc/jit/codegen/cuda/interface.cpp b/torch/csrc/jit/codegen/cuda/interface.cpp index bd54d30811dd..b6a1c4ab84da 100644 --- a/torch/csrc/jit/codegen/cuda/interface.cpp +++ b/torch/csrc/jit/codegen/cuda/interface.cpp @@ -15,13 +15,132 @@ C10_DEFINE_bool( C10_DEFINE_bool( torch_jit_nvfuser_horizontal_fusion, true, - "enable single node fusion for nvfuser"); + "enable horizontal fusion for nvfuser"); namespace torch { namespace jit { namespace fuser { namespace cuda { +static std::atomic cuda_fusion_guard_mode{true}; + +// There are 3 sources of information on whether to enable nvfuser: +// 1. assigned value from setEnabled() - takes precendence if it has been set +// 2. value from environment variable - only used if setEnabled() is unset +// 3. default value - used if both 1 and 2 are unset. +// +// If 1 or 2 tries to enable nvfuser when it cannot be enabled (e.g. cuda not +// available), then an error will be thrown. The default will not error. +class NVFuserEnabler { + private: + c10::optional runtime_assigned_fuser_enabled_ = c10::nullopt; + std::once_flag enabled_check_flag_; + std::mutex mutex_; + + static bool nvfuserCanBeEnabled() { +#ifdef USE_ROCM + return false; +#else + return at::globalContext().hasCUDA() && + NVFuserPassManager::isRegistered() && getExecutorMode(); +#endif + } + + static void assertFuserCanBeEnabled(bool is_enabled) { + if (!is_enabled) { + return; + } + TORCH_CHECK( + nvfuserCanBeEnabled(), + "Running CUDA fuser is only supported on CUDA builds."); + } + + static c10::optional getFuserEnabledEnvVar() { + static const char* enable_c_str = std::getenv("PYTORCH_JIT_ENABLE_NVFUSER"); + if (!enable_c_str) { + return c10::nullopt; + } + std::string enable(enable_c_str); + if (enable == "0" || enable == "OFF") { + return false; + } + return true; + } + + static c10::optional getCachedFuserEnabledEnvVar() { + static c10::optional default_enabled = getFuserEnabledEnvVar(); + return default_enabled; + } + + static bool getNNCNotNVFuser() { + static const char* env_c_str = + std::getenv("PYTORCH_JIT_USE_NNC_NOT_NVFUSER"); + if (!env_c_str) { + return false; + } + std::string env(env_c_str); + if (env == "1" || env == "ON") { + return true; + } + return false; + } + + static bool getCachedNNCNotNVFuser() { + static bool force_disable = getNNCNotNVFuser(); + return force_disable; + } + + bool isEnabledImpl() { + // 0. opportunity to force disable NVFuser + if (getCachedNNCNotNVFuser()) { + return false; + } + std::call_once(enabled_check_flag_, [&]() { + // if environment variable is setting the value, we must + if (!runtime_assigned_fuser_enabled_.has_value() && + getCachedFuserEnabledEnvVar().has_value()) { + assertFuserCanBeEnabled(*getCachedFuserEnabledEnvVar()); + } + }); + // 1. if user has explicitly assigned fuser value, that value takes + // precedence. + if (runtime_assigned_fuser_enabled_.has_value()) { + return *runtime_assigned_fuser_enabled_; + } + // 2. 
next precedence is any value assigned by + if (getCachedFuserEnabledEnvVar().has_value()) { + return *getCachedFuserEnabledEnvVar(); + } + // 3. default value (if you switch this to true, make sure + // to check nvfuserCanBeEnabled()) + return false; + } + + public: + bool setEnabled(bool is_enabled) { + std::lock_guard lock(mutex_); + assertFuserCanBeEnabled(is_enabled); + bool old_value = isEnabledImpl(); + runtime_assigned_fuser_enabled_ = is_enabled; + return old_value; + } + + bool isEnabled() { + std::lock_guard lock(mutex_); + return isEnabledImpl(); + } +}; + +static NVFuserEnabler nvfuser_enabler; + +bool isEnabled() { + return nvfuser_enabler.isEnabled(); +} + +bool setEnabled(bool is_enabled) { + return nvfuser_enabler.setEnabled(is_enabled); +} + bool getSingletonFusion() { return FLAGS_torch_jit_nvfuser_singleton_fusion; } @@ -42,8 +161,6 @@ bool setHorizontalFusion(bool value) { return old_value; } -static std::atomic cuda_fusion_guard_mode{true}; - std::atomic& getCudaFusionGuardMode() { return cuda_fusion_guard_mode; } @@ -68,6 +185,10 @@ void runFusionGroup(const Node* fusion_node, Stack& stack) { } void fuseGraph(std::shared_ptr& graph) { + if (!isEnabled()) { + return; + } + TORCH_CHECK( getFuserInterface()->fn_fuse_graph != nullptr, "Running the CUDA fuser requires a CUDA build."); @@ -90,6 +211,11 @@ bool profileNode(const Node* node) { getFuserInterface()->fn_profile_n(node); } +bool skipNode(const std::string& symbol_str, bool flip) { + return getFuserInterface()->fn_skip_n != nullptr && + getFuserInterface()->fn_skip_n(symbol_str, flip); +} + //! [ Note -- type guard logic in CudaFusionGuard ] //! //! CudaFusionGuard is used to Guard input tensor to `CudaFusionGroup` so that @@ -117,11 +243,15 @@ bool profileNode(const Node* node) { //! extra attention should be paid to contiguity across size-1 //! dimensions. //! c. size check: +//! c.1 broadcast check: //! making sure that broadcast semantics are identical. So we want to //! make sure a given dimension either are both size-1 for `tensor` & //! `guard_tensor_type`, or are both non-size-1. //! This is due to the fact that we specialize size-1 dimension as //! broadcasted dimension while translating PyTorch tensor to Fusion IR. +//! c.1 size-0 check: +//! we don't specialize this on codegen, but we do specialize fusion +//! logic for size-0 on reductoins, hence the check //! bool complyWith( const at::Tensor& tensor, @@ -133,13 +263,19 @@ bool complyWith( // check a. if num_dimension check fails or scalar type check fails if (*guard_tensor_type->dim() != static_cast(tensor.ndimension()) || (guard_tensor_type->scalarType().has_value() && - (guard_tensor_type->scalarType().value() != tensor.scalar_type()))) { + (guard_tensor_type->scalarType().value() != tensor.scalar_type())) || + (guard_tensor_type->device().has_value() && + (guard_tensor_type->device().value() != tensor.device())) || + (guard_tensor_type->requiresGrad().has_value() && + guard_tensor_type->requiresGrad().value() != + (tensor.requires_grad() && at::GradMode::is_enabled()))) { return false; } // TODO: should we get symbolic_size instead and check for size // consistency across tensors as well? 
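+  // Editor's illustrative note (not part of the original change): a guard
+  // recorded for a 3-D float CUDA tensor with requires_grad == false now also
+  // rejects inputs on a different device and, while grad mode is enabled,
+  // inputs whose requires_grad flag differs, in addition to the existing rank
+  // and dtype checks.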
const auto& sizes = guard_tensor_type->sizes(); + // see [ Note -- stirde_properties in tensor type ] const auto& stride_properties = guard_tensor_type->stride_properties(); const auto& t_sizes = tensor.sizes(); @@ -207,12 +343,18 @@ bool complyWith( } } - // check c, we go along semantic ordered dimensions + // check c.1, we go along semantic ordered dimensions // check broadcast / size-1: bool guard_bcast = sizes[j].has_value() && sizes[j].value() == 1; if (guard_bcast != (t_sizes[j] == 1)) { return false; } + + // check c.2, check for size-0 + bool guard_size_0 = sizes[j].has_value() && sizes[j].value() == 0; + if (guard_size_0 != (t_sizes[j] == 0)) { + return false; + } } return true; @@ -329,6 +471,238 @@ RegisterOperators reg_guard({ aliasAnalysisFromSchema()), }); +// Infer dynamic axis (-1) in view_sizes given tensor_sizes +bool inferViewShape( + c10::List tensor_sizes, + c10::List view_sizes) { + int64_t dynamic_index = -1; + size_t view_size_num_elements = 1; + for (size_t idx = 0; idx < view_sizes.size(); ++idx) { + if (view_sizes[idx] == -1) { + TORCH_INTERNAL_ASSERT( + dynamic_index == -1, "Only one dimension can by inferred.") + dynamic_index = idx; + } else { + TORCH_INTERNAL_ASSERT(view_sizes[idx] > 0); + view_size_num_elements *= view_sizes[idx]; + } + } + const size_t kNumElements = std::accumulate( + tensor_sizes.begin(), tensor_sizes.end(), 1, std::multiplies<>()); + + if (kNumElements % view_size_num_elements != 0) { + return false; + } + + if (dynamic_index != -1) { + view_sizes[dynamic_index] = kNumElements / view_size_num_elements; + } + + return true; +} + +//! [ Note -- type guard logic in CudaFusionViewGuard ] +//! +//! CudaFusionViewGuard is used to guard input tensors to a `CudaFusionGroup` +//! that contains view operations, so that we would not feed inputs that +//! violate the graph defined in `GraphCache`. +//! +//! output = view(self, view-sizes) +//! +//! View Guard Inputs: +//! 1. self tensor_sizes - dynamic size List[Int] +//! 2. view_sizes - profile_ivalue List[Int] +//! 3. tensor_constraint - Constant List[Int] +//! 4. view_sizes_constraint - Constant List[Int] +//! +//! Things that we check: +//! 1. The #dimensions are the same for self tensor and its constraint +//! 2. The #dimensions are the same for view-sizes and its constraint +//! 3. Self tensor does not violate its constraint +//! a. Queue unrestricted sizes +//! b. Calculate #elements in self tensor +//! 4. view-sizes does not violate its constraint +//! a. Pop unrestricted sizes from queue +//! b. Calculate #elements in view-sizes +//! 5. The #elements is the same for self tensor and view-sizes +//! +//! Constraints: +//! A restricted axis creates a graph constraint, so its sizes is static. +//! An unrestricted axis is allowed to have a dynamic size, if it is consistent +//! between self tensor and view-sizes. It is marked with -1 in the constraint. +//! Only iterDomains with the Keep transform are dynamic. All other transforms +//! create a static constraint. +//! +bool checkViewGuard( + c10::List tensor_sizes, + c10::List view_sizes, + c10::List tensor_constraint, + c10::List view_sizes_constraint) { + // 1: Num Dimensions Check + if (tensor_constraint.size() != tensor_sizes.size() || + view_sizes_constraint.size() != view_sizes.size()) { + return false; + } + + // If axis allows dynamic sizes, then add tensor size to this queue. + // For dynamic axes in view_sizes, check that it is consistent with + // the corresponding tensor size. + std::queue dynamic_axis_queue; + + // 2. 
Tensor Static Check + int64_t tensor_size_product = 1; + for (const auto idx : c10::irange(tensor_sizes.size())) { + if (tensor_constraint[idx] == -1) { + dynamic_axis_queue.push(tensor_sizes[idx]); + } else if (tensor_constraint[idx] != tensor_sizes[idx]) { + return false; + } + tensor_size_product *= tensor_sizes[idx]; + } + + // 3. View-Sizes Static Check + int64_t view_size_product = 1; + for (const auto idx : c10::irange(view_sizes.size())) { + auto dynamic_size = (view_sizes_constraint[idx] == -1) + ? dynamic_axis_queue.front() + : view_sizes_constraint[idx]; + if (dynamic_size != view_sizes[idx]) { + return false; + } + view_size_product *= dynamic_size; + if (view_sizes_constraint[idx] == -1) { + dynamic_axis_queue.pop(); + } + } + + // 4. Check view invariant + // The number of elements in the input and output tensors are the same. + return tensor_size_product == view_size_product; +} + +//! +//! CudaFusionViewGuard Example Graph: +//! +//! graph(%self : __torch__.BiasViewRelu, +//! %inputs.1 : Tensor): +//! %2 : int = prim::Constant[value=-1]() # dynamic_bvg.py:50:40 +//! %3 : int = prim::Constant[value=1]() # dynamic_bvg.py:50:25 +//! %4 : NoneType = prim::Constant() +//! %5 : int[] = prim::Constant[value=[2, 3]]() +//! %6 : int[] = aten::size(%inputs.1) # dynamic_bvg.py:50:25 +//! %7 : int[] = aten::slice(%6, %4, %2, %3) # dynamic_bvg.py:50:25 +//! %view_shape.1 : int[] = aten::add(%7, %5) # dynamic_bvg.py:50:25 +//! %bias : Tensor = prim::GetAttr[name="bias"](%self) +//! %10 : int[] = aten::size(%bias) +//! %11 : int[] = prim::BroadcastSizes(%6, %10) +//! %12 : bool = prim::CudaFusionGuard[types=[...]](%inputs.1, %bias) +//! %13 : int[] = prim::Constant[value=[-1, -1, -1, 6]]() +//! %14 : int[] = prim::Constant[value=[-1, -1, -1, 2, 3]]() +//! %15 : bool = prim::CudaFusionViewGuard(%11, %view_shape.1, %13, %14) +//! %16 : bool[] = prim::ListConstruct(%15, %12) +//! %17 : bool = aten::all(%16) +//! %18 : Tensor = prim::If(%17) +//! block0(): +//! %19 : Tensor = prim::CudaFusionGroup_0[cache_id=0](%inputs.1, %bias) +//! -> (%19) +//! block1(): +//! %20 : Function = prim::Constant[name="fallback_fn", fallback=1]() +//! %21 : (...) = prim::CallFunction(%20, %inputs.1, %bias, %view_shape.1) +//! %22 : Float(...) = prim::TupleUnpack(%21) +//! -> (%22) +//! return (%18) +//! with prim::CudaFusionGroup_0 = graph(%0 : Float(...), +//! %1 : Float(...)): +//! %2 : int[] = prim::Constant[value=[2, 3, 4, 2, 3]]() +//! %3 : int = prim::Constant[value=1]() # dynamic_bvg.py:50:25 +//! %o.1 : Float(...) = aten::add(%0, %1, %3) # dynamic_bvg.py:51:16 +//! %5 : Float(...) = prim::view_copy(%o.1, %2) +//! %6 : Float(...) = aten::relu(%5) # dynamic_bvg.py:53:19 +//! return (%6) +//! +RegisterOperators view_guard({ + Operator( + "prim::CudaFusionViewGuard(...) -> bool", + // prim::CudaFusionViewGuard returns a fresh Boolean type without + // aliasing. if we would ever return refined tensor, which would change + // aliasing analysis, we should update aliasdb pass. 
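+        // Editor's illustrative example (not part of the original change):
+        // tensor_sizes = [8, 4, 6] with tensor_constraint = [-1, -1, 6] and
+        // view_sizes = [8, 4, 2, 3] with view_sizes_constraint = [-1, -1, 2, 3]
+        // pass the guard: the dynamic sizes 8 and 4 are queued from the tensor
+        // and matched in order against the view, the static sizes equal their
+        // constraints, and 8 * 4 * 6 == 8 * 4 * 2 * 3.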
+ [](const Node* node) -> Operation { + return [](Stack& stack) { + // view_sizes_constraint - Constant List[Int] + at::ArrayRef inputs = last(stack, 4); + + // tensor_sizes is the runtime size for the self tensor + // tensor_sizes - dynamic size List[Int] + TORCH_INTERNAL_ASSERT( + inputs[0].isIntList(), "tensor_sizes needs to be Int List"); + auto tensor_sizes = inputs[0].toIntList(); + + // profiled_view_sizes is the runtime view size + // profiled_view_sizes - profile_ivalue List[Int] + TORCH_INTERNAL_ASSERT( + inputs[1].isIntList(), + "profiled_view_sizes needs to be Int list"); + auto profiled_view_sizes = inputs[1].toIntList(); + + // tensor_constraint is a constant List[Int] + // used to guard tensor_sizes + TORCH_INTERNAL_ASSERT( + inputs[2].isIntList(), + "tensor constraint needs to be Int List"); + auto tensor_constraint = inputs[2].toIntList(); + + // view_sizes_constraint is a constant List[Int] + // used to guard profiled_view_sizes + TORCH_INTERNAL_ASSERT( + inputs[3].isIntList(), + "view_sizes constraint needs to be Int List"); + auto view_sizes_constraint = inputs[3].toIntList(); + + // Drop after gather all input arguments + // If an argument is moved, it is destroyed when dropped from stack + drop(stack, 4); + + auto status = inferViewShape(tensor_sizes, profiled_view_sizes); + if (!status) { + push(stack, IValue(false)); + return; + } + + if (!fuser::cuda::getCudaFusionGuardMode()) { + push(stack, IValue(true)); + return; + } + + auto guard_status = checkViewGuard( + tensor_sizes, + profiled_view_sizes, + tensor_constraint, + view_sizes_constraint); + push(stack, IValue(guard_status)); + return; + }; + }, + aliasAnalysisFromSchema()), +}); + +RegisterOperators ivalue_guard({ + Operator( + "prim::CudaFusionIvalGuard(...) -> bool", + [](const Node* node) -> Operation { + return [](Stack& stack) { + at::ArrayRef inputs = last(stack, 2); + drop(stack, 2); + if (!fuser::cuda::getCudaFusionGuardMode()) { + push(stack, IValue(true)); + return; + } + push(stack, inputs[0].equals(inputs[1])); + return; + }; + }, + aliasAnalysisFromSchema()), +}); + // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) RegisterOperators reg_add_optional({ Operator( @@ -346,6 +720,181 @@ RegisterOperators reg_add_optional({ }, aliasAnalysisFromSchema()), }); + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +RegisterOperators reg_view_copy({ + Operator( + "prim::view_copy(Tensor self, int[] size) -> Tensor", + [](const Node* node) -> Operation { + return [node](Stack& stack) { + TORCH_CHECK( + node->s(attr::name) == "CudaFusionGroup", + "view_copy is only used by nvfuser to identify non-mutating ", + "alias ops, should be restored after fusion pass!"); + IValue self, size; + pop(stack, self, size); + push(stack, at::native::view(self.toTensor(), size.toIntVector())); + }; + }, + aliasAnalysisFromSchema()), +}); + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +RegisterOperators reg_flatten_copy({ + Operator( + "prim::flatten_copy(Tensor self, int start_dim, int end_dim) -> Tensor", + [](const Node* node) -> Operation { + return [node](Stack& stack) { + TORCH_CHECK( + node->s(attr::name) == "CudaFusionGroup", + "flatten_copy is only used by nvfuser to identify non-mutating ", + "alias ops, should be restored after fusion pass!"); + IValue self, start_dim, end_dim; + pop(stack, self, start_dim, end_dim); + push( + stack, + at::native::flatten( + self.toTensor(), start_dim.toInt(), end_dim.toInt())); + }; + }, + 
aliasAnalysisFromSchema()), +}); + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +RegisterOperators reg_reshape_copy({ + Operator( + "prim::reshape_copy(Tensor self, int[] shape) -> Tensor", + [](const Node* node) -> Operation { + return [node](Stack& stack) { + TORCH_CHECK( + node->s(attr::name) == "CudaFusionGroup", + "reshape_copy is only used by nvfuser to identify non-mutating ", + "alias ops, should be restored after fusion pass!"); + IValue self, shape; + pop(stack, self, shape); + push( + stack, + at::native::reshape(self.toTensor(), shape.toIntVector())); + }; + }, + aliasAnalysisFromSchema()), +}); + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +RegisterOperators reg_squeeze_copy({ + Operator( + "prim::squeeze_copy(Tensor self) -> Tensor", + [](const Node* node) -> Operation { + return [node](Stack& stack) { + TORCH_CHECK( + node->s(attr::name) == "CudaFusionGroup", + "squeeze_copy is only used by nvfuser to identify non-mutating ", + "alias ops, should be restored after fusion pass!"); + IValue self; + pop(stack, self); + push(stack, at::squeeze(self.toTensor())); + }; + }, + aliasAnalysisFromSchema()), +}); + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +RegisterOperators reg_squeeze_dim_copy({ + Operator( + "prim::squeeze_copy.dim(Tensor self, int dim) -> Tensor", + [](const Node* node) -> Operation { + return [node](Stack& stack) { + TORCH_CHECK( + node->s(attr::name) == "CudaFusionGroup", + "squeeze_dim_copy is only used by nvfuser to identify non-mutating ", + "alias ops, should be restored after fusion pass!"); + IValue self, dim; + pop(stack, self, dim); + push(stack, at::squeeze(self.toTensor(), dim.toInt())); + }; + }, + aliasAnalysisFromSchema()), +}); + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +RegisterOperators reg_unsqueeze_copy({ + Operator( + "prim::unsqueeze_copy(Tensor self, int dim) -> Tensor", + [](const Node* node) -> Operation { + return [node](Stack& stack) { + TORCH_CHECK( + node->s(attr::name) == "CudaFusionGroup", + "unsqueeze_copy is only used by nvfuser to identify non-mutating ", + "alias ops, should be restored after fusion pass!"); + IValue self, dim; + pop(stack, self, dim); + push(stack, at::unsqueeze(self.toTensor(), dim.toInt())); + }; + }, + aliasAnalysisFromSchema()), +}); + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +RegisterOperators reg_infer_unsqueeze_size({ + Operator( + "prim::infer_unsqueeze_size(int[] a, int dim) -> int[]", + [](const Node* node) -> Operation { + return [](Stack& stack) { + auto dim = pop(stack).toInt(); + auto size = pop(stack).toIntVector(); + if (dim < 0) { + dim = dim + 1 + size.size(); + } + auto it = size.begin() + dim; + size.insert(it, 1); + push(stack, IValue(size)); + }; + }, + aliasAnalysisFromSchema()), +}); + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +RegisterOperators reg_infer_squeeze_dim_size({ + Operator( + "prim::infer_squeeze_size.dim(int[] a, int dim) -> int[]", + [](const Node* node) -> Operation { + return [](Stack& stack) { + auto dim = pop(stack).toInt(); + auto size = pop(stack).toIntVector(); + if (dim < 0) { + dim = dim + size.size(); + } + auto it = size.begin() + dim; + if (*it == 1) { + size.erase(it); + } + push(stack, IValue(size)); + }; + }, + aliasAnalysisFromSchema()), +}); + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +RegisterOperators reg_infer_squeeze_size({ + Operator( + 
"prim::infer_squeeze_size(int[] a) -> int[]", + [](const Node* node) -> Operation { + return [](Stack& stack) { + auto size = pop(stack).toIntVector(); + + for (auto it = size.begin(); it != size.end(); it++) { + if (*it == 1) { + auto pre = it - 1; + size.erase(it); + it = pre; + } + } + push(stack, IValue(size)); + }; + }, + aliasAnalysisFromSchema()), +}); + } // namespace } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/interface.h b/torch/csrc/jit/codegen/cuda/interface.h index 1ab9e6d80086..61daad880c4c 100644 --- a/torch/csrc/jit/codegen/cuda/interface.h +++ b/torch/csrc/jit/codegen/cuda/interface.h @@ -1,7 +1,8 @@ #pragma once -#include +#include #include +#include #include /* @@ -19,10 +20,10 @@ namespace cuda { TORCH_API std::atomic& getCudaFusionGuardMode(); -C10_EXPORT bool getSingletonFusion(); -C10_EXPORT bool setSingletonFusion(bool value); -C10_EXPORT bool getHorizontalFusion(); -C10_EXPORT bool setHorizontalFusion(bool value); +TORCH_API bool getSingletonFusion(); +TORCH_API bool setSingletonFusion(bool value); +TORCH_API bool getHorizontalFusion(); +TORCH_API bool setHorizontalFusion(bool value); // dummy struct to allow API registration struct CudaFuserInterface { @@ -32,22 +33,44 @@ struct CudaFuserInterface { bool (*fn_can_fuse_n)(const Node*) = nullptr; void (*fn_insert_profile_inodes)(ProfilingRecord* pr) = nullptr; bool (*fn_profile_n)(const Node*) = nullptr; + bool (*fn_skip_n)(const std::string&, bool flip) = nullptr; }; // Get interface, this is used by registration and user facing API internally -C10_EXPORT CudaFuserInterface* getFuserInterface(); +TORCH_API CudaFuserInterface* getFuserInterface(); -C10_EXPORT void compileFusionGroup(Node* fusion_node); -C10_EXPORT void runFusionGroup(const Node* fusion_node, Stack& stack); -C10_EXPORT void fuseGraph(std::shared_ptr&); -C10_EXPORT bool canFuseNode(const Node* node); -C10_EXPORT void InsertProfileNodesForCUDAFuser(ProfilingRecord* pr); -C10_EXPORT bool profileNode(const Node* node); +TORCH_API void compileFusionGroup(Node* fusion_node); +TORCH_API void runFusionGroup(const Node* fusion_node, Stack& stack); +TORCH_API void fuseGraph(std::shared_ptr&); +TORCH_API bool canFuseNode(const Node* node); +TORCH_API void InsertProfileNodesForCUDAFuser(ProfilingRecord* pr); +TORCH_API bool profileNode(const Node* node); -C10_EXPORT bool complyWith( +TORCH_API bool skipNode(const std::string& symbol_str, bool flip = true); + +TORCH_API bool complyWith( const at::Tensor& tensor, const c10::TensorTypePtr& guard_tensor_type); +TORCH_API bool isEnabled(); +TORCH_API bool setEnabled(bool is_enabled); + +struct TORCH_API NVFuserPassManager : public PassManager { + static bool registerPass(bool enabled) { + bool old_value = PassManager::isRegistered(); + if (enabled) { + PassManager::registerPass(fuseGraph); + } else { + PassManager::clearPass(); + } + return old_value; + } + + static bool isRegistered() { + return PassManager::isRegistered(); + } +}; + } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp b/torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp index cf3d9c7a8c75..0d67f780886b 100644 --- a/torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp +++ b/torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp @@ -1,8 +1,12 @@ #include #include #include +#include #include #include +#include +#include +#include #include #include @@ -20,16 +24,20 @@ namespace jit { namespace fuser { namespace cuda { +Statement::Statement(IrBuilderPasskey passkey) { + ir_container_ = 
passkey.ir_container_; +} + Statement::Statement(const Statement* src, IrCloner* ir_cloner) { - // IRCloner when cloning to a new fusion will copy the names of the original - // fusion. If we're cloning into the same fusion, we let Val and Expr get - // their names as usual by registering with the current fusion in their - // constructors, so don't overwrite that here. - if (src->fusion() != ir_cloner->fusion()) { - name_ = src->name_; - } - fusion_ = ir_cloner->fusion(); - ir_cloner->registerClone(src, this); + ir_container_ = ir_cloner->container(); +} + +void Statement::setName(IrContainerPasskey, StmtNameType name) { + name_ = name; +} + +void Statement::setName(IrBuilderPasskey, StmtNameType name) { + name_ = name; } Val* Statement::asVal() { @@ -42,24 +50,37 @@ Expr* Statement::asExpr() { return this->as(); } -void Statement::print() const { - IrPrinter ir_printer(std::cout); +std::string Statement::toString() const { + std::stringstream ss; + IrPrinter ir_printer(ss); ir_printer.handle(this); - std::cout << std::endl; + return ss.str(); } -// When we create a Val we immediately register them with the active fusion. -Val::Val(ValType _vtype, DataType _dtype, bool register_val) - : vtype_(_vtype), dtype_(_dtype) { - Fusion* fusion = FusionGuard::getCurFusion(); - TORCH_CHECK( - fusion != nullptr, "No active fusion group found when creating a Val."); - fusion_ = fusion; - if (register_val) { - name_ = fusion_->registerVal(this); - } +std::string Statement::toInlineString() const { + std::stringstream ss; + IrPrinter ir_printer(ss); + ir_printer.print_inline(this); + return ss.str(); +} + +Fusion* Statement::fusion() const { + TORCH_INTERNAL_ASSERT( + ir_container_->isA(), "Statement does not belong to a fusion."); + return ir_container_->as(); } +kir::Kernel* Statement::kernel() const { + TORCH_INTERNAL_ASSERT( + ir_container_->isA(), + "Statement does not belong to a kernel."); + return ir_container_->as(); +} + +// When we create a Val we immediately register them with the active fusion. +Val::Val(IrBuilderPasskey passkey, ValType _vtype, DataType _dtype) + : Statement(passkey), vtype_(_vtype), dtype_(_dtype) {} + // NOTE: we don't clone the definition_ and uses_ here // since they may introduce cloning cycles. Instead, we copy // the original pointers and we'll fix them up later part of the @@ -67,16 +88,7 @@ Val::Val(ValType _vtype, DataType _dtype, bool register_val) // this constructor now leaving them to be resolved by later stages // Val::Val(const Val* src, IrCloner* ir_cloner) - : Statement(src, ir_cloner), - vtype_(src->vtype_), - dtype_(src->dtype_), - is_fusion_input_(src->is_fusion_input_), - is_fusion_output_(src->is_fusion_output_) { - // If we're "cloning" into the same fusion, register with the fusion - if (src->fusion() == ir_cloner->fusion()) { - name_ = src->fusion()->registerVal(this); - } -} + : Statement(src, ir_cloner), vtype_(src->vtype_), dtype_(src->dtype_) {} const std::vector& Val::uses() const { if (vtype_ == ValType::TensorView) { @@ -87,38 +99,59 @@ const std::vector& Val::uses() const { return uses_; } +// Converts the data type of TensorView or Scalar representing index +// values. The data type of the original input should be +// DataType::Index, but DataType::Int is also allowed as it is used +// for index expressions. +void Val::resolveIndexDtype() { + TORCH_INTERNAL_ASSERT( + vtype_ == ValType::TensorView || vtype_ == ValType::Scalar, + "Resolving index type is currently only supported on tensor view or scalar values. 
" + "Value type: ", + vtype_); + TORCH_INTERNAL_ASSERT( + dtype_ == DataType::Index || dtype_ == DataType::Int, + "Can only resolve index type if a Val has an Index or Int DataType. ", + "Data type: ", + dtype_); + TORCH_INTERNAL_ASSERT( + container()->isA(), + "Index type can only be resolved at compile time."); + dtype_ = container()->as()->indexType(); +} + namespace { // Traverse definition of all values involved in constructing the provided val. // Check if all values involved are constant values, meaning the provided // val is also a constant value. -class ConstCheck : OptOutConstDispatch { +class ConstCheck : private OptOutConstDispatch { private: bool is_const_ = true; - void handle(const Bool* b) override { + void handle(const Bool* b) final { is_const_ = is_const_ && b->isConst(); } - void handle(const Double* d) override { + void handle(const Double* d) final { is_const_ = is_const_ && d->isConst(); } - void handle(const Int* i) override { + void handle(const Int* i) final { is_const_ = is_const_ && i->isConst(); } - void handle(const NamedScalar* ns) override { + void handle(const NamedScalar* ns) final { is_const_ = is_const_ && false; } - void handle(const Expr* expr) override { + void handle(const Expr* expr) final { for (auto inp : expr->inputs()) { handle(inp); } } - void handle(const Val* val) override { + void handle(const Val* val) final { if (val->definition() != nullptr) { handle(val->definition()); } else { @@ -137,15 +170,18 @@ class ConstCheck : OptOutConstDispatch { } // namespace bool Val::isConstScalar() const { - if (!isScalar()) + if (!isScalar()) { return false; + } return ConstCheck::isConst(this); } c10::optional Val::getInt() const { if (isConstScalar() && isAnInt()) { if (this->getValType() == ValType::Scalar) { - return this->as()->value(); + if (this->isA()) { + return this->as()->value(); + } } } return c10::optional(); @@ -161,6 +197,16 @@ bool Val::isOneInt() const { return int_val.has_value() && int_val.value() == 1; } +bool Val::isDefinitionType(ExprType expression_type) const { + if (definition() != nullptr) { + auto def_expr_type = definition()->getExprType(); + if (def_expr_type.has_value() && def_expr_type.value() == expression_type) { + return true; + } + } + return false; +} + c10::optional Val::getDataType() const { TORCH_INTERNAL_ASSERT( dtype_ != DataType::Null, "Value does not have a data type."); @@ -169,7 +215,7 @@ c10::optional Val::getDataType() const { bool Val::isProducerOf(const Val* other) const { TORCH_INTERNAL_ASSERT(other != nullptr); - TORCH_INTERNAL_ASSERT(fusion() == other->fusion()); + TORCH_INTERNAL_ASSERT(container() == other->container()); if (definition() == nullptr) { return false; @@ -186,23 +232,14 @@ bool Val::isConsumerOf(const Val* other) const { // We don't register with the active fusion in Expr as this needs to be done // after inputs and outputs are registered with the Expr -Expr::Expr(ExprType type) : type_{type} { - Fusion* fusion = FusionGuard::getCurFusion(); - if (fusion == nullptr) - TORCH_CHECK(false, "No active fusion group found when creating an Expr."); - fusion_ = fusion; -} +Expr::Expr(IrBuilderPasskey passkey, ExprType etype) + : Statement(passkey), etype_{etype} {} Expr::Expr(const Expr* src, IrCloner* ir_cloner) : Statement(src, ir_cloner), - type_(src->type_), + etype_(src->etype_), inputs_(ir_cloner->clone(src->inputs_)), - outputs_(ir_cloner->clone(src->outputs_)) { - // If we're "cloning" into the same fusion, register with the fusion - if (src->fusion() == ir_cloner->fusion()) { - name_ = 
src->fusion()->registerExpr(this); - } -} + outputs_(ir_cloner->clone(src->outputs_)) {} bool Expr::sameAs(const Statement* other) const { if (this == other) { @@ -227,6 +264,30 @@ bool Expr::sameAs(const Statement* other) const { return true; } +kir::Predicate* Expr::predicate() const { + TORCH_INTERNAL_ASSERT( + container()->isA(), "Function invalid for fusion."); + return predicate_; +} + +void Expr::setPredicate(kir::Predicate* predicate) { + TORCH_INTERNAL_ASSERT( + container()->isA(), "Function invalid for fusion."); + predicate_ = predicate; +} + +kir::Predicate* Expr::writePredicate() const { + TORCH_INTERNAL_ASSERT( + container()->isA(), "Function invalid for fusion."); + return write_predicate_; +} + +void Expr::setWritePredicate(kir::Predicate* write_predicate) { + TORCH_INTERNAL_ASSERT( + container()->isA(), "Function invalid for fusion."); + write_predicate_ = write_predicate; +} + } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/ir_base_nodes.h b/torch/csrc/jit/codegen/cuda/ir_base_nodes.h index 2e0fa0885bd6..70f0b8f80fe5 100644 --- a/torch/csrc/jit/codegen/cuda/ir_base_nodes.h +++ b/torch/csrc/jit/codegen/cuda/ir_base_nodes.h @@ -1,9 +1,9 @@ #pragma once #include +#include #include #include -#include #include #include @@ -35,6 +35,8 @@ namespace jit { namespace fuser { namespace cuda { +using ValueId = int32_t; + using StmtNameType = unsigned int; constexpr StmtNameType kInvalidStmName = @@ -48,6 +50,22 @@ class UnaryOp; class BinaryOp; class IterDomain; class IrCloner; +class IrContainer; +class IrBuilderPasskey; +class IrContainerPasskey; + +namespace kir { +class Kernel; +class Predicate; +} // namespace kir + +// Passkey for container to register names with statements +class ExprPasskey { + friend class Expr; + + private: + explicit ExprPasskey() {} +}; TORCH_CUDA_CU_API void swap(Fusion& a, Fusion& b) noexcept; @@ -60,12 +78,12 @@ TORCH_CUDA_CU_API void swap(Fusion& a, Fusion& b) noexcept; //! is also important for the design to have a dispatch system for a Statment. //! Basically beinng able to succienctly traverse down the inhereitance stack of //! a Statment at runtime. This is currently implemented in dispatch.h -//! class TORCH_CUDA_CU_API Statement : public NonCopyable, public PolymorphicBase { friend void swap(Fusion&, Fusion&) noexcept; + friend void swap(IrContainer& a, IrContainer& b) noexcept; public: - Statement() = default; + Statement() = delete; // Cloning constructor Statement(const Statement* src, IrCloner* ir_cloner); @@ -78,7 +96,7 @@ class TORCH_CUDA_CU_API Statement : public NonCopyable, public PolymorphicBase { static void constDispatch(T handler, const Statement* const); template - static Statement* mutatorDispatch(T mutator, Statement*); + static void mutatorDispatch(T mutator, Statement*); // Accessor functions to types. 
Vals always have a DataType, Exprs never do virtual c10::optional getValType() const { @@ -106,8 +124,14 @@ class TORCH_CUDA_CU_API Statement : public NonCopyable, public PolymorphicBase { Expr* asExpr(); // Return the fusion this statement belongs to - Fusion* fusion() const { - return fusion_; + Fusion* fusion() const; + + // Return the kernel this statement belongs to + kir::Kernel* kernel() const; + + // Return the container this statement belongs to + IrContainer* container() const { + return ir_container_; } // Return the int that represents its name @@ -115,6 +139,13 @@ class TORCH_CUDA_CU_API Statement : public NonCopyable, public PolymorphicBase { return name_; } + // Set the statements' name. Typically the container will set the name, + // however if we're dealing with cloning, IrBuilder will set the name, this + // maybe should be from IrCloner, however I didn't want to add another + // passkey. + void setName(IrContainerPasskey, StmtNameType name); + void setName(IrBuilderPasskey, StmtNameType name); + virtual bool sameType(const Statement* const other) { if (isVal() && other->isVal()) return getValType().value() == other->getValType().value(); @@ -129,13 +160,17 @@ class TORCH_CUDA_CU_API Statement : public NonCopyable, public PolymorphicBase { return this == other; } - void print() const; + std::string toString() const; + std::string toInlineString() const; protected: + Statement(IrBuilderPasskey); + // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes) StmtNameType name_ = kInvalidStmName; + // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes) - Fusion* fusion_ = nullptr; + IrContainer* ir_container_ = nullptr; }; //! A Val represents a "value." These are objects, like tensors, scalars, and @@ -169,34 +204,43 @@ class TORCH_CUDA_CU_API Statement : public NonCopyable, public PolymorphicBase { //! class TORCH_CUDA_CU_API Val : public Statement { public: - // We may not want to register this value during Val's constructor. The reason - // for this is that if we register the val, then in a derived constructor try - // to throw, fusion's destructor will get called, but the pointer to this Val - // will be invalid. When fusion tries to delete this value it will cause a seg - // fault, instead of showing the thrown error. explicit Val( + IrBuilderPasskey, ValType _vtype, - DataType _dtype = DataType::Null, - bool register_val = true); + DataType _dtype = DataType::Null); Val(const Val* src, IrCloner* ir_cloner); - // TODO: why is this optional? - // + // Dispatch functions, definitions in dispatch.cpp + template + static void dispatch(T handler, Val*); + + template + static void constDispatch(T handler, const Val* const); + + template + static void mutatorDispatch(T mutator, Val*); + c10::optional getValType() const override { return vtype_; } + ValType vtype() const { + return vtype_; + } + + DataType dtype() const { + return dtype_; + } + // Throws if no DataType is found. Vals must have a DataType - // - // TODO: why is this optional? - // c10::optional getDataType() const override; bool isScalar() const { return vtype_ == ValType::Scalar || vtype_ == ValType::NamedScalar; } + // Returns if all dependencies are constant scalars bool isConstScalar() const; bool isAnInt() const { @@ -205,6 +249,11 @@ class TORCH_CUDA_CU_API Val : public Statement { c10::optional getInt() const; + // Returns if no dependencies and is a constant scalar. 
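+  // (Editor's note: contrast with isConstScalar() above, which also returns
+  // true for a Val that has a definition as long as every dependency folds to
+  // a constant; isConst() requires this Val itself to be a literal constant.)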
+ virtual bool isConst() const { + return false; + } + bool isZeroInt() const; bool isOneInt() const; @@ -217,6 +266,9 @@ class TORCH_CUDA_CU_API Val : public Statement { return definition_; } + // Determine if value definition matches given expression type + bool isDefinitionType(ExprType expression_type) const; + const std::vector& uses() const; bool isFusionInput() const { @@ -254,42 +306,41 @@ class TORCH_CUDA_CU_API Val : public Statement { return evaluator_index_; } - // Dispatch functions, definitions in dispatch.cpp - template - static void dispatch(T handler, Val*); - - template - static void constDispatch(T handler, const Val* const); + // Following is managed by Fusion (or kirIrBuilder) and can change. + // TODO: Protect with a passkey. + void setDefinition(Expr* expr) { + definition_ = expr; + } - template - static Statement* mutatorDispatch(T mutator, Val*); + void resolveIndexDtype(); protected: friend Fusion; // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes) const ValType vtype_; - // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes) - const DataType dtype_; - - // Following is managed by Fusion and can change. - void setDefinition(Expr* expr) { - definition_ = expr; - } + // TODO: Add fusion passkey for this void setIsFusionInput(bool is_fusion_input) { is_fusion_input_ = is_fusion_input; } + // TODO: Add fusion passkey for this void setIsFusionOutput(bool is_fusion_output) { is_fusion_output_ = is_fusion_output; } + // TODO: Add fusion or container passkey for this void setUses(const std::vector& uses) { uses_ = uses; } private: + // There's only one instance where dtype can change, and that's through + // resolving the index data type from nvfuser to either Int or Int32 for + // welford operations. + DataType dtype_; + // Following is managed by Fusion and can change. bool is_fusion_input_ = false; bool is_fusion_output_ = false; @@ -297,6 +348,7 @@ class TORCH_CUDA_CU_API Val : public Statement { Expr* definition_ = nullptr; std::vector uses_; + // Expr evaluator idx; int evaluator_index_ = -1; }; @@ -342,15 +394,16 @@ class TORCH_CUDA_CU_API Val : public Statement { //! 
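+//! Construction sketch (editor's illustration, based on the IrBuilder API that
+//! this change introduces): Exprs are no longer allocated with new directly;
+//! for example, IrBuilder::create<BinaryOp>(BinaryOpType::Add, out, lhs, rhs)
+//! builds the node with an IrBuilderPasskey and registers it with the active
+//! IrContainer.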
class TORCH_CUDA_CU_API Expr : public Statement { public: - explicit Expr(ExprType type); + explicit Expr(IrBuilderPasskey, ExprType type); + Expr(const Expr* src, IrCloner* ir_cloner); c10::optional getExprType() const override { - return type_; + return etype_; } - ExprType type() const { - return type_; + ExprType etype() const { + return etype_; } bool sameAs(const Statement* other) const override; @@ -380,23 +433,46 @@ class TORCH_CUDA_CU_API Expr : public Statement { static void constDispatch(T handler, const Expr* const); template - static Statement* mutatorDispatch(T mutator, Expr*); + static void mutatorDispatch(T mutator, Expr*); + + // TODO: Protect based on being in kernel container + kir::Predicate* predicate() const; + + // TODO: Protect based on being in kernel container + void setPredicate(kir::Predicate* predicate); + + // TODO: Protect based on being in kernel container + kir::Predicate* writePredicate() const; + + // TODO: Protect based on being in kernel container + void setWritePredicate(kir::Predicate* write_predicate); protected: + // TODO: Add Fusion passkey void addInput(Val* input) { TORCH_INTERNAL_ASSERT(input != nullptr); inputs_.push_back(input); } + // TODO: Add Fusion passkey void addOutput(Val* output) { TORCH_INTERNAL_ASSERT(output != nullptr); outputs_.push_back(output); } + ExprPasskey exprPasskey() { + return ExprPasskey(); + } + private: - ExprType type_ = ExprType::Invalid; + ExprType etype_ = ExprType::Invalid; std::vector inputs_; std::vector outputs_; + + kir::Predicate* predicate_ = nullptr; + + // Only used for reduction-related expressions + kir::Predicate* write_predicate_ = nullptr; }; } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/ir_builder.cpp b/torch/csrc/jit/codegen/cuda/ir_builder.cpp new file mode 100644 index 000000000000..6b990a2ea7be --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/ir_builder.cpp @@ -0,0 +1,429 @@ +#include +#include +#include +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +//! Clone an IR node, forwarding the arguments to the IrCloner constructor. +template +T* IrBuilder::clone(const T* src, IrCloner* ir_cloner) { + TORCH_INTERNAL_ASSERT( + ir_cloner != nullptr, + "Cannot use create when a cloner object is set. 
Use clone."); + + TORCH_INTERNAL_ASSERT( + ir_cloner->container() != nullptr, + "Cloner doesn't have a valid container to store cloned object."); + + T* dest = new T(src, ir_cloner); + const Statement* src_stmt = dynamic_cast(src); + Statement* dest_stmt = dynamic_cast(dest); + + auto dest_container = ir_cloner->container(); + auto src_container = src_stmt->container(); + + dest_container->registerStmt(IrBuilderPasskey(dest_container), dest_stmt); + + if (src_container != dest_container) { + dest_stmt->setName(IrBuilderPasskey(dest_container), src_stmt->name()); + } + + ir_cloner->registerClone(src_stmt, dest_stmt); + + return dest; +} + +#define IR_BUILDER_INSTANTIATE(T) \ + template T* IrBuilder::clone(const T* src, IrCloner* ir_cloner); + +// Vals +IR_BUILDER_INSTANTIATE(IterDomain) +IR_BUILDER_INSTANTIATE(TensorDomain) +IR_BUILDER_INSTANTIATE(TensorView) +IR_BUILDER_INSTANTIATE(Bool) +IR_BUILDER_INSTANTIATE(Double) +IR_BUILDER_INSTANTIATE(Int) +IR_BUILDER_INSTANTIATE(ComplexDouble) +IR_BUILDER_INSTANTIATE(NamedScalar) + +// Exprs +IR_BUILDER_INSTANTIATE(Split) +IR_BUILDER_INSTANTIATE(Merge) +IR_BUILDER_INSTANTIATE(TransposeOp) +IR_BUILDER_INSTANTIATE(ShiftOp) +IR_BUILDER_INSTANTIATE(GatherOp) +IR_BUILDER_INSTANTIATE(ViewAsScalar) +IR_BUILDER_INSTANTIATE(ViewOp) +IR_BUILDER_INSTANTIATE(UnaryOp) +IR_BUILDER_INSTANTIATE(BinaryOp) +IR_BUILDER_INSTANTIATE(TernaryOp) +IR_BUILDER_INSTANTIATE(ReductionOp) +IR_BUILDER_INSTANTIATE(GroupedReductionOp) +IR_BUILDER_INSTANTIATE(WelfordOp) +IR_BUILDER_INSTANTIATE(MmaOp) +IR_BUILDER_INSTANTIATE(BroadcastOp) + +Val* IrBuilder::newResult(DataType dtype) { + switch (dtype) { + case DataType::Bool: + return IrBuilder::create(c10::nullopt); + case DataType::Double: + return IrBuilder::create(c10::nullopt); + case DataType::Int: + return IrBuilder::create(c10::nullopt); + default: + TORCH_CHECK(false, "Unexpected data type"); + } +} + +Val* IrBuilder::newArithmeticExpr(BinaryOpType op_type, Val* lhs, Val* rhs) { + TORCH_CHECK( + lhs->dtype() == rhs->dtype(), + "Incompatible operand types: ", + lhs->dtype(), + " and ", + rhs->dtype()); + auto result = newResult(lhs->dtype()); + IrBuilder::create(op_type, result, lhs, rhs); + // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) + return result; +} + +Val* IrBuilder::newLogicExpr(BinaryOpType op_type, Val* lhs, Val* rhs) { + auto result = IrBuilder::create(c10::nullopt); + IrBuilder::create(op_type, result, lhs, rhs); + // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) + return result; +} + +Val* IrBuilder::whereExpr(Val* pred, Val* lhs, Val* rhs) { + TORCH_CHECK(lhs->dtype() == rhs->dtype(), "Incompatible operand types"); + auto result = newResult(lhs->dtype()); + IrBuilder::create(TernaryOpType::Where, result, pred, lhs, rhs); + return result; +} + +Val* IrBuilder::negExpr(Val* val) { + auto result = newResult(val->dtype()); + IrBuilder::create(UnaryOpType::Neg, result, val); + return result; +} + +Val* IrBuilder::notExpr(Val* val) { + auto result = newResult(val->dtype()); + IrBuilder::create(UnaryOpType::Not, result, val); + return result; +} + +Val* IrBuilder::setExpr(Val* val) { + auto result = newResult(val->dtype()); + IrBuilder::create(UnaryOpType::Set, result, val); + return result; +} + +Val* IrBuilder::setExprNamedScalar(const std::string& name, Val* val) { + auto result = IrBuilder::create(name, val->dtype()); + IrBuilder::create(UnaryOpType::Set, result, val); + return result; +} + +Val* IrBuilder::addressExprNamedScalar(const std::string& name, Val* val) { + auto result = 
IrBuilder::create(name, DataType::Int); + IrBuilder::create(UnaryOpType::Address, result, val); + return result; +} + +Val* IrBuilder::andExpr(Val* lhs, Val* rhs) { + return newLogicExpr(BinaryOpType::And, lhs, rhs); +} + +Val* IrBuilder::eqExpr(Val* lhs, Val* rhs) { + return newLogicExpr(BinaryOpType::Eq, lhs, rhs); +} + +Val* IrBuilder::gtExpr(Val* lhs, Val* rhs) { + return newLogicExpr(BinaryOpType::GT, lhs, rhs); +} + +Val* IrBuilder::ltExpr(Val* lhs, Val* rhs) { + return newLogicExpr(BinaryOpType::LT, lhs, rhs); +} + +Val* IrBuilder::leExpr(Val* lhs, Val* rhs) { + return newLogicExpr(BinaryOpType::LE, lhs, rhs); +} + +Val* IrBuilder::geExpr(Val* lhs, Val* rhs) { + return newLogicExpr(BinaryOpType::GE, lhs, rhs); +} + +Val* IrBuilder::addExpr(Val* lhs, Val* rhs) { + return newArithmeticExpr(BinaryOpType::Add, lhs, rhs); +} + +Val* IrBuilder::subExpr(Val* lhs, Val* rhs) { + return newArithmeticExpr(BinaryOpType::Sub, lhs, rhs); +} + +Val* IrBuilder::mulExpr(Val* lhs, Val* rhs) { + return newArithmeticExpr(BinaryOpType::Mul, lhs, rhs); +} + +Val* IrBuilder::divExpr(Val* lhs, Val* rhs) { + return newArithmeticExpr(BinaryOpType::Div, lhs, rhs); +} + +Val* IrBuilder::ceilDivExpr(Val* lhs, Val* rhs) { + return newArithmeticExpr(BinaryOpType::CeilDiv, lhs, rhs); +} + +Val* IrBuilder::modExpr(Val* lhs, Val* rhs) { + return newArithmeticExpr(BinaryOpType::Mod, lhs, rhs); +} + +Val* IrBuilder::maxExpr(Val* lhs, Val* rhs) { + return newArithmeticExpr(BinaryOpType::Max, lhs, rhs); +} + +Val* IrBuilder::minExpr(Val* lhs, Val* rhs) { + return newArithmeticExpr(BinaryOpType::Min, lhs, rhs); +} + +Val* SimplifyingIrBuilder::negExpr(Val* val) { + if (auto int_val = dynamic_cast(val)) { + if (int_val->isConst()) { + return IrBuilder::create(-int_val->value().value()); + } + } + return IrBuilder::negExpr(val); +} + +Val* SimplifyingIrBuilder::notExpr(Val* val) { + if (auto bool_val = dynamic_cast(val)) { + if (bool_val->isConst()) { + if (bool_val->value().value()) { + return FusionGuard::getCurFusion()->falseVal(); + } else { + return FusionGuard::getCurFusion()->trueVal(); + } + } + } + return IrBuilder::notExpr(val); +} + +Val* SimplifyingIrBuilder::addExpr(Int* lhs, Int::ScalarType rhs) { + if (rhs == 0) { + return lhs; + } else if (lhs == nullptr) { + return IrBuilder::IrBuilder::create(rhs); + } else if (lhs->isConst()) { + return IrBuilder::IrBuilder::create(lhs->value().value() + rhs); + } else if (rhs > 0) { + return IrBuilder::addExpr(lhs, IrBuilder::IrBuilder::create(rhs)); + } else { + return IrBuilder::subExpr(lhs, IrBuilder::IrBuilder::create(-rhs)); + } +} + +Val* SimplifyingIrBuilder::addExpr(Int* lhs, Int* rhs) { + if (rhs == nullptr) { + return lhs; + } else if (lhs == nullptr) { + return rhs; + } else if (lhs->isConst()) { + return addExpr(rhs, lhs->value().value()); + } else if (rhs->isConst()) { + return addExpr(lhs, rhs->value().value()); + } else { + return IrBuilder::addExpr(lhs, rhs); + } +} + +Val* SimplifyingIrBuilder::addExpr(Val* lhs, Val* rhs) { + TORCH_INTERNAL_ASSERT(lhs != nullptr || rhs != nullptr); + if (lhs == nullptr || lhs->isZeroInt()) { + return rhs; + } else if (rhs == nullptr || rhs->isZeroInt()) { + return lhs; + } + auto lhs_int = dynamic_cast(lhs); + auto rhs_int = dynamic_cast(rhs); + if (lhs_int != nullptr && rhs_int != nullptr) { + return addExpr(lhs_int, rhs_int); + } else { + return IrBuilder::addExpr(lhs, rhs); + } +} + +Val* SimplifyingIrBuilder::addExpr(Val* lhs, Int::ScalarType rhs) { + auto lhs_int = dynamic_cast(lhs); + if (lhs_int != nullptr) { + 
return addExpr(lhs_int, rhs); + } else { + return addExpr(lhs, IrBuilder::create(rhs)); + } +} + +Val* SimplifyingIrBuilder::subExpr(Val* lhs, Val* rhs) { + return addExpr(lhs, negExpr(rhs)); +} + +Val* SimplifyingIrBuilder::mulExpr(Int* lhs, Int::ScalarType rhs) { + if (rhs == 0) { + return lhs->container()->zeroVal(); + } else if (rhs == 1) { + return lhs; + } else if (lhs == nullptr) { + return IrBuilder::create(rhs); + } else if (lhs->isConst()) { + return IrBuilder::create(lhs->value().value() * rhs); + } else { + return IrBuilder::mulExpr(lhs, IrBuilder::create(rhs)); + } +} + +Val* SimplifyingIrBuilder::mulExpr(Val* lhs, Int::ScalarType rhs) { + auto lhs_int = dynamic_cast(lhs); + if (lhs_int != nullptr) { + return mulExpr(lhs_int, rhs); + } else { + return IrBuilder::mulExpr(lhs, IrBuilder::create(rhs)); + } +} + +Val* SimplifyingIrBuilder::mulExpr(Int* lhs, Int* rhs) { + if (rhs == nullptr) { + return lhs; + } else if (lhs == nullptr) { + return rhs; + } else if (lhs->isConst()) { + return mulExpr(rhs, lhs->value().value()); + } else if (rhs->isConst()) { + return mulExpr(lhs, rhs->value().value()); + } else { + return IrBuilder::mulExpr(lhs, rhs); + } +} + +Val* SimplifyingIrBuilder::mulExpr(Val* lhs, Val* rhs) { + TORCH_INTERNAL_ASSERT(lhs != nullptr || rhs != nullptr); + if (lhs == nullptr || lhs->isOneInt()) { + return rhs; + } else if (rhs == nullptr || rhs->isOneInt()) { + return lhs; + } else if (lhs->isZeroInt() || rhs->isZeroInt()) { + return lhs->container()->zeroVal(); + } + auto lhs_int = dynamic_cast(lhs); + auto rhs_int = dynamic_cast(rhs); + if (lhs_int != nullptr && rhs_int != nullptr) { + return mulExpr(lhs_int, rhs_int); + } else { + return IrBuilder::mulExpr(lhs, rhs); + } +} + +Val* SimplifyingIrBuilder::andExpr(Val* lhs, Val* rhs) { + TORCH_INTERNAL_ASSERT(!(lhs == nullptr && rhs == nullptr)); + + if (lhs == nullptr) { + return rhs; + } else if (rhs == nullptr) { + return lhs; + } + + bool lhs_definitely_true = false; + bool lhs_definitely_false = false; + auto lhs_bool = dynamic_cast(lhs); + if (lhs_bool && lhs_bool->isConst()) { + lhs_definitely_true = lhs_bool->value().value(); + lhs_definitely_false = !lhs_bool->value().value(); + } + auto rhs_bool = dynamic_cast(rhs); + bool rhs_definitely_true = false; + bool rhs_definitely_false = false; + if (rhs_bool && rhs_bool->isConst()) { + rhs_definitely_true = rhs_bool->value().value(); + rhs_definitely_false = !rhs_bool->value().value(); + } + + if (lhs_definitely_true && rhs_definitely_true) { + return FusionGuard::getCurFusion()->trueVal(); + } else if (lhs_definitely_false || rhs_definitely_false) { + return FusionGuard::getCurFusion()->falseVal(); + } else if (lhs_definitely_true) { + return rhs; + } else if (rhs_definitely_true) { + return lhs; + } + + return IrBuilder::andExpr(lhs, rhs); +} + +namespace { + +template +Val* minOrMaxExpr( + Int* lhs, + Int* rhs, + IrBuilderFunc ir_builder_func, + IntFunc int_func) { + if (rhs == nullptr) { + return lhs; + } else if (lhs == nullptr) { + return rhs; + } else if (lhs->isConst() && rhs->isConst()) { + return IrBuilder::create( + int_func(lhs->value().value(), rhs->value().value())); + } else { + return ir_builder_func(lhs, rhs); + } +} + +template +Val* minOrMaxExpr( + Val* lhs, + Val* rhs, + IrBuilderFunc ir_builder_func, + IntFunc int_func) { + TORCH_INTERNAL_ASSERT(lhs != nullptr || rhs != nullptr); + if (lhs == nullptr) { + return rhs; + } else if (rhs == nullptr || lhs == rhs) { + return lhs; + } + auto lhs_int = dynamic_cast(lhs); + auto rhs_int = 
dynamic_cast(rhs); + if (lhs_int != nullptr && rhs_int != nullptr) { + return minOrMaxExpr(lhs_int, rhs_int, ir_builder_func, int_func); + } else { + return ir_builder_func(lhs, rhs); + } +} + +} // namespace + +Val* SimplifyingIrBuilder::maxExpr(Val* lhs, Val* rhs) { + return minOrMaxExpr( + lhs, + rhs, + [](Val* lhs, Val* rhs) { return IrBuilder::maxExpr(lhs, rhs); }, + [](int64_t lhs, int64_t rhs) { return std::max(lhs, rhs); }); +} + +Val* SimplifyingIrBuilder::minExpr(Val* lhs, Val* rhs) { + return minOrMaxExpr( + lhs, + rhs, + [](Val* lhs, Val* rhs) { return IrBuilder::minExpr(lhs, rhs); }, + [](int64_t lhs, int64_t rhs) { return std::min(lhs, rhs); }); +} + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/ir_builder.h b/torch/csrc/jit/codegen/cuda/ir_builder.h new file mode 100644 index 000000000000..f122232f8fb8 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/ir_builder.h @@ -0,0 +1,131 @@ +#pragma once + +#include +#include +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +namespace kir { +class Kernel; +} + +class IrCloner; + +// Passkey for builder to register properties with statements, and to call +// functions in IrContainer +class TORCH_CUDA_CU_API IrBuilderPasskey { + friend class IrBuilder; + + public: + // TODO: Collapse ir_container and Kernel once Kernel inherits from + // IrContainer + IrContainer* const ir_container_ = nullptr; + + private: + explicit IrBuilderPasskey(IrContainer* ir_container); +}; + +//! IR builder interface +class TORCH_CUDA_CU_API IrBuilder { + public: + //! Allocate a new IR node, forwarding the arguments to the appropriate + //! constructor and registering with the container + template + static T* create(Args&&... args) { + auto container = FusionGuard::getCurFusion(); + // return create(container, std::forward(args)...); + TORCH_INTERNAL_ASSERT( + container != nullptr, "Need an active container to build IR."); + T* node = new T(IrBuilderPasskey(container), std::forward(args)...); + + container->registerStmt(IrBuilderPasskey(container), node); + + return node; + } + + //! Allocate a new IR node, forwarding the arguments to the appropriate + //! constructor and registering with the container + template + static T* create(IrContainer* container, Args&&... args) { + TORCH_INTERNAL_ASSERT( + container != nullptr, "Need an active container to build IR."); + T* node = new T(IrBuilderPasskey(container), std::forward(args)...); + + container->registerStmt(IrBuilderPasskey(container), node); + + return node; + } + + //! Clone an IR node, forwarding the arguments to the IrCloner constructor. + //! Register clones with IrCloner's target container. 
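+  //! Usage sketch (editor's illustration, assuming the API defined in this
+  //! file and in ir_cloner.cpp; the names are placeholders): given
+  //! IrCloner cloner(dst_container), calling IrBuilder::clone(src_tv, &cloner)
+  //! allocates the copy, registers it with cloner.container(), and, when the
+  //! source lives in a different container, carries the source statement's
+  //! name over to the clone.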
+ template + static T* clone(const T* src, IrCloner* ir_cloner); + + // Unary operations + static Val* negExpr(Val* val); + static Val* notExpr(Val* val); + static Val* setExpr(Val* val); + static Val* setExprNamedScalar(const std::string& name, Val* val); + static Val* addressExprNamedScalar(const std::string& name, Val* val); + + // Binary operations + static Val* andExpr(Val* lhs, Val* rhs); + static Val* eqExpr(Val* lhs, Val* rhs); + static Val* gtExpr(Val* lhs, Val* rhs); + static Val* ltExpr(Val* lhs, Val* rhs); + static Val* leExpr(Val* lhs, Val* rhs); + static Val* geExpr(Val* lhs, Val* rhs); + static Val* addExpr(Val* lhs, Val* rhs); + static Val* subExpr(Val* lhs, Val* rhs); + static Val* mulExpr(Val* lhs, Val* rhs); + static Val* divExpr(Val* lhs, Val* rhs); + static Val* ceilDivExpr(Val* lhs, Val* rhs); + static Val* modExpr(Val* lhs, Val* rhs); + static Val* maxExpr(Val* lhs, Val* rhs); + static Val* minExpr(Val* lhs, Val* rhs); + + // Ternary operations + static Val* whereExpr(Val* pred, Val* lhs, Val* rhs); + + private: + static Val* newResult(DataType dtype); + static Val* newArithmeticExpr(BinaryOpType op_type, Val* lhs, Val* rhs); + static Val* newLogicExpr(BinaryOpType op_type, Val* lhs, Val* rhs); +}; + +//! A wrapper builder with static expression simplification +//! +//! Example: +//! - addExpr(new Int(1), new Int(2)) -> Int(3) +//! - addExpr(new Int(0), new NamedScalar("foo")) -> NamedScalar("foo") +//! +//! Designed to be used to simplify predicate and index expressions in +//! generated code. Also, the shift validation may fail without +//! this simplification. +class TORCH_CUDA_CU_API SimplifyingIrBuilder : public IrBuilder { + public: + static Val* negExpr(Val* val); + static Val* notExpr(Val* val); + + static Val* addExpr(Int* lhs, Int::ScalarType rhs); + static Val* addExpr(Val* lhs, Int::ScalarType rhs); + static Val* addExpr(Int* lhs, Int* rhs); + static Val* addExpr(Val* lhs, Val* rhs); + static Val* subExpr(Val* lhs, Val* rhs); + static Val* mulExpr(Int* lhs, Int::ScalarType rhs); + static Val* mulExpr(Val* lhs, Int::ScalarType rhs); + static Val* mulExpr(Int* lhs, Int* rhs); + static Val* mulExpr(Val* lhs, Val* rhs); + static Val* andExpr(Val* lhs, Val* rhs); + static Val* maxExpr(Val* lhs, Val* rhs); + static Val* minExpr(Val* lhs, Val* rhs); +}; + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/ir_cloner.cpp b/torch/csrc/jit/codegen/cuda/ir_cloner.cpp index 7e5a9cfa8bc3..5ad17fbe1930 100644 --- a/torch/csrc/jit/codegen/cuda/ir_cloner.cpp +++ b/torch/csrc/jit/codegen/cuda/ir_cloner.cpp @@ -2,12 +2,15 @@ #include #include +#include namespace torch { namespace jit { namespace fuser { namespace cuda { +IrCloner::IrCloner(IrContainer* container) : ir_container_(container) {} + Statement* IrCloner::clone(const Statement* statement) { if (statement == nullptr) { return nullptr; @@ -30,7 +33,6 @@ Statement* IrCloner::clone(const Statement* statement) { // that something went horribly wrong. 
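// [Editor's note] Illustrative sketch only, not part of this patch: the
// typical way clone() is driven. `src` and `dst` are hypothetical
// containers; the cloned statement ends up owned by the cloner's target
// container rather than by the original one.
//
//   Fusion src;
//   Fusion dst;
//   Double* d = IrBuilder::create<Double>(&src, 1.0);
//   IrCloner cloner(&dst);
//   Statement* d_copy = cloner.clone(d);
//   TORCH_INTERNAL_ASSERT(d_copy->container() == &dst);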
TORCH_INTERNAL_ASSERT(new_node != nullptr); TORCH_INTERNAL_ASSERT(clones_map_[statement] == new_node); - TORCH_INTERNAL_ASSERT(new_node->fusion() == fusion_); return new_node; } @@ -39,7 +41,6 @@ Statement* IrCloner::clone(const Statement* statement) { void IrCloner::registerClone(const Statement* src, Statement* clone) { TORCH_CHECK(src != nullptr); TORCH_CHECK(clone != nullptr); - TORCH_CHECK(clone->fusion() == fusion_); TORCH_CHECK(clones_map_.insert({src, clone}).second); } @@ -56,79 +57,95 @@ void IrCloner::handle(const Expr* e) { } void IrCloner::handle(const TensorDomain* td) { - clone_ = new TensorDomain(td, this); + clone_ = IrBuilder::clone(td, this); } void IrCloner::handle(const IterDomain* id) { - clone_ = new IterDomain(id, this); + clone_ = IrBuilder::clone(id, this); } void IrCloner::handle(const Bool* b) { - clone_ = new Bool(b, this); + clone_ = IrBuilder::clone(b, this); } void IrCloner::handle(const Double* d) { - clone_ = new Double(d, this); + clone_ = IrBuilder::clone(d, this); } void IrCloner::handle(const Int* i) { - clone_ = new Int(i, this); + clone_ = IrBuilder::clone(i, this); +} + +void IrCloner::handle(const ComplexDouble* c) { + clone_ = IrBuilder::clone(c, this); } void IrCloner::handle(const NamedScalar* named_scalar) { - clone_ = new NamedScalar(named_scalar, this); + clone_ = IrBuilder::clone(named_scalar, this); } void IrCloner::handle(const TensorView* tv) { - clone_ = new TensorView(tv, this); + clone_ = IrBuilder::clone(tv, this); } void IrCloner::handle(const UnaryOp* op) { - clone_ = new UnaryOp(op, this); + clone_ = IrBuilder::clone(op, this); } void IrCloner::handle(const BinaryOp* op) { - clone_ = new BinaryOp(op, this); + clone_ = IrBuilder::clone(op, this); } void IrCloner::handle(const TernaryOp* op) { - clone_ = new TernaryOp(op, this); + clone_ = IrBuilder::clone(op, this); } void IrCloner::handle(const BroadcastOp* op) { - clone_ = new BroadcastOp(op, this); + clone_ = IrBuilder::clone(op, this); } void IrCloner::handle(const ReductionOp* op) { - clone_ = new ReductionOp(op, this); + clone_ = IrBuilder::clone(op, this); +} + +void IrCloner::handle(const GroupedReductionOp* op) { + clone_ = IrBuilder::clone(op, this); } void IrCloner::handle(const WelfordOp* op) { - clone_ = new WelfordOp(op, this); + clone_ = IrBuilder::clone(op, this); +} + +void IrCloner::handle(const MmaOp* op) { + clone_ = IrBuilder::clone(op, this); } void IrCloner::handle(const TransposeOp* op) { - clone_ = new TransposeOp(op, this); + clone_ = IrBuilder::clone(op, this); } void IrCloner::handle(const ShiftOp* op) { - clone_ = new ShiftOp(op, this); + clone_ = IrBuilder::clone(op, this); } void IrCloner::handle(const GatherOp* op) { - clone_ = new GatherOp(op, this); + clone_ = IrBuilder::clone(op, this); +} + +void IrCloner::handle(const ViewAsScalar* op) { + clone_ = IrBuilder::clone(op, this); } void IrCloner::handle(const ViewOp* op) { - clone_ = new ViewOp(op, this); + clone_ = IrBuilder::clone(op, this); } void IrCloner::handle(const Split* split) { - clone_ = new Split(split, this); + clone_ = IrBuilder::clone(split, this); } void IrCloner::handle(const Merge* merge) { - clone_ = new Merge(merge, this); + clone_ = IrBuilder::clone(merge, this); } TensorView* RecomputeTv::recompute(TensorView* tv) { @@ -141,7 +158,7 @@ TensorView* RecomputeTv::recompute(TensorView* tv) { "Cannot recompute buffers that are inputs of the fusion."); // Grab all the expressions used to generate the TensorView - auto exprs = ExprSort::getExprs(tv->fusion(), {tv}); + auto exprs = 
StmtSort::getExprs(tv->fusion(), {tv}, false); // Run the replicator RecomputeTv replicator(tv->fusion(), exprs); @@ -161,7 +178,7 @@ TensorView* RecomputeTv::recompute(TensorView* tv) { } RecomputeTv::RecomputeTv(Fusion* fusion, std::vector exprs) - : IrCloner(fusion) { + : IrCloner(fusion), fusion_(fusion) { // Add inputs to the clones map to prevent cloning them. for (const auto inp : fusion->inputs()) { clones_map_[inp] = inp; @@ -183,7 +200,7 @@ void RecomputeTv::handle(const TensorDomain* td) { // Make sure to recompute the history of the iteration domains, explicitly go // through the expressions and send them to IrCloner. auto exprs = - ExprSort::getExprs(fusion(), {td->domain().begin(), td->domain().end()}); + StmtSort::getExprs(fusion_, {td->domain().begin(), td->domain().end()}); for (auto expr : exprs) { IrCloner::handle(expr); diff --git a/torch/csrc/jit/codegen/cuda/ir_cloner.h b/torch/csrc/jit/codegen/cuda/ir_cloner.h index ac83d9edb097..5b70b0fd048f 100644 --- a/torch/csrc/jit/codegen/cuda/ir_cloner.h +++ b/torch/csrc/jit/codegen/cuda/ir_cloner.h @@ -1,7 +1,8 @@ #pragma once -#include +#include #include +#include #include #include @@ -11,7 +12,7 @@ namespace jit { namespace fuser { namespace cuda { -class Fusion; +class IrContainer; //! Clones nodes from an exiting Fusion //! @@ -21,10 +22,11 @@ class Fusion; //! class TORCH_CUDA_CU_API IrCloner : private OptInConstDispatch { friend class Statement; + friend class IrBuilder; public: // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) - explicit IrCloner(Fusion* new_fusion) : fusion_(new_fusion) {} + explicit IrCloner(IrContainer* container); Statement* clone(const Statement* statement); @@ -45,8 +47,8 @@ class TORCH_CUDA_CU_API IrCloner : private OptInConstDispatch { return copy; } - Fusion* fusion() const { - return fusion_; + IrContainer* container() const { + return ir_container_; } protected: @@ -63,6 +65,7 @@ class TORCH_CUDA_CU_API IrCloner : private OptInConstDispatch { void handle(const Bool*) override; void handle(const Double*) override; void handle(const Int*) override; + void handle(const ComplexDouble*) override; void handle(const NamedScalar*) override; void handle(const UnaryOp*) override; @@ -70,10 +73,13 @@ class TORCH_CUDA_CU_API IrCloner : private OptInConstDispatch { void handle(const TernaryOp*) override; void handle(const BroadcastOp*) override; void handle(const ReductionOp*) override; + void handle(const GroupedReductionOp*) override; void handle(const WelfordOp*) override; + void handle(const MmaOp*) override; void handle(const TransposeOp*) override; void handle(const ShiftOp*) override; void handle(const GatherOp*) override; + void handle(const ViewAsScalar*) override; void handle(const ViewOp*) override; void handle(const Split*) override; @@ -86,12 +92,15 @@ class TORCH_CUDA_CU_API IrCloner : private OptInConstDispatch { private: // The destination Fusion container - Fusion* fusion_ = nullptr; + IrContainer* ir_container_ = nullptr; // The dispatch interface doesn't allow returning values from // individual `handle()` methods, so they are storing the // result here Statement* clone_ = nullptr; + + // Builder to make all the new nodes + IrBuilder builder_; }; // Replicates all expressions used to generate the provided TensorView. 
Does not
@@ -105,7 +114,9 @@ class RecomputeTv : private IrCloner {
  private:
   RecomputeTv(Fusion* fusion, std::vector<Expr*> exprs);
-  void handle(const TensorDomain*) override;
+  void handle(const TensorDomain*) final;
+
+  Fusion* fusion_;
 };
 
 } // namespace cuda
diff --git a/torch/csrc/jit/codegen/cuda/ir_container.cpp b/torch/csrc/jit/codegen/cuda/ir_container.cpp
new file mode 100644
index 000000000000..e84418eb9733
--- /dev/null
+++ b/torch/csrc/jit/codegen/cuda/ir_container.cpp
@@ -0,0 +1,279 @@
+#include
+#include
+#include
+#include
+
+namespace torch {
+namespace jit {
+namespace fuser {
+namespace cuda {
+
+void swap(IrContainer& a, IrContainer& b) noexcept {
+  FUSER_PERF_SCOPE("Fusion swap");
+
+  using std::swap;
+
+  // Swap the content
+  swap(a.vals_up_, b.vals_up_);
+  swap(a.vals_, b.vals_);
+
+  swap(a.exprs_up_, b.exprs_up_);
+  swap(a.exprs_, b.exprs_);
+
+  swap(a.raw_ptrs_, b.raw_ptrs_);
+
+  swap(a.val_type_name_map_, b.val_type_name_map_);
+  swap(a.expr_name_counter_, b.expr_name_counter_);
+
+  // Fixup the Statement::fusion_ links for a
+  for (auto val : a.vals_) {
+    val->ir_container_ = &a;
+  }
+  for (auto expr : a.exprs_) {
+    expr->ir_container_ = &a;
+  }
+
+  // Fixup the Statement::fusion_ links for b
+  for (auto val : b.vals_) {
+    val->ir_container_ = &b;
+  }
+  for (auto expr : b.exprs_) {
+    expr->ir_container_ = &b;
+  }
+}
+
+IrCloner IrContainer::copy(const IrContainer* from, IrContainer* to) {
+  to->clear();
+  IrCloner ir_cloner(to);
+
+  for (auto val : from->vals_) {
+    to->vals_.insert(ir_cloner.clone(val));
+  }
+
+  for (auto expr : from->exprs_) {
+    to->exprs_.insert(ir_cloner.clone(expr));
+  }
+
+  to->val_type_name_map_ = from->val_type_name_map_;
+  to->expr_name_counter_ = from->expr_name_counter_;
+
+  return ir_cloner;
+}
+
+IrContainer::IrContainer() = default;
+
+IrContainer::IrContainer(const IrContainer& other) {
+  FUSER_PERF_SCOPE("IrContainer copy");
+  IrContainer::copy(&other, this);
+}
+
+IrContainer::IrContainer(IrContainer&& other) noexcept {
+  FUSER_PERF_SCOPE("IrContainer move");
+  swap(*this, other);
+}
+
+IrContainer& IrContainer::operator=(const IrContainer& other) {
+  FUSER_PERF_SCOPE("IrContainer copy assign");
+  IrContainer copy(other);
+  clear();
+  swap(*this, copy);
+  return *this;
+}
+
+IrContainer& IrContainer::operator=(IrContainer&& other) noexcept {
+  FUSER_PERF_SCOPE("IrContainer move assign");
+  clear();
+  swap(*this, other);
+  return *this;
+}
+
+IrContainer::~IrContainer() {
+  clear();
+}
+
+//! Register the Statement with this container
+void IrContainer::registerStmt(IrBuilderPasskey, Statement* stmt) {
+  if (stmt->isVal()) {
+    registerVal(stmt->asVal());
+  } else {
+    registerExpr(stmt->asExpr());
+  }
+}
+
+//! Register the Val with this container
+void IrContainer::registerVal(IrBuilderPasskey, Val* val) {
+  registerVal(val);
+}
+
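// [Editor's note] Illustrative sketch only, not part of this patch: the
// passkey idiom used by IrBuilderPasskey, ExprPasskey and IrContainerPasskey
// around these registration entry points. The names below are generic
// placeholders, not APIs from this codebase.
//
//   class Builder;                 // the only class allowed to mint keys
//   class BuilderKey {
//     friend class Builder;
//     BuilderKey() = default;      // private to everyone except Builder
//   };
//   class Container {
//    public:
//     // Callable by anyone, but only Builder can construct a BuilderKey,
//     // so effectively only Builder can register nodes.
//     void registerNode(BuilderKey, void* node);
//   };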
+//! Register expr with this container.
+void IrContainer::registerExpr(IrBuilderPasskey, Expr* expr) {
+  registerExpr(expr);
+}
+
+void IrContainer::registerExpr(ExprPasskey, Expr* expr) {
+  registerExpr(expr);
+}
+
+void IrContainer::removeExpr(Expr* expr) {
+  TORCH_INTERNAL_ASSERT(
+      exprs_.find(expr) != exprs_.end(),
+      "Wanted to remove an expression but it doesn't exist in this container.");
+  auto expr_in_deque = std::find_if(
+      exprs_up_.begin(),
+      exprs_up_.end(),
+      [expr](std::unique_ptr<Expr>& expr_up) { return expr_up.get() == expr; });
+
+  TORCH_INTERNAL_ASSERT(
+      expr_in_deque != exprs_up_.end(),
+      "Wanted to remove an expression but its unique ptr is missing.");
+
+  exprs_.erase(expr);
+  exprs_up_.erase(expr_in_deque);
+  raw_ptrs_.erase((void*)expr);
+}
+
+//! Completely remove val from the fusion, break all dependencies associated
+//! with it
+void IrContainer::removeVal(Val* val) {
+  // Don't remove shortcuts
+  if (val == true_val_.get() || val == false_val_.get() ||
+      val == one_val_.get() || val == zero_val_.get() ||
+      val == magic_zero_val_.get()) {
+    return;
+  }
+
+  TORCH_INTERNAL_ASSERT(
+      vals_.find(val) != vals_.end(),
+      "Wanted to remove a value but it doesn't exist in this container.");
+  auto val_in_deque = std::find_if(
+      vals_up_.begin(), vals_up_.end(), [val](std::unique_ptr<Val>& val_up) {
+        return val_up.get() == val;
+      });
+
+  TORCH_INTERNAL_ASSERT(
+      val_in_deque != vals_up_.end(),
+      "Wanted to remove a value but its unique ptr is missing.");
+
+  vals_.erase(val);
+  vals_up_.erase(val_in_deque);
+  raw_ptrs_.erase((void*)val);
+}
+
+//! Register the Val with this container
+void IrContainer::registerVal(Val* val) {
+  if (inContainer(val)) {
+    return;
+  }
+
+  vals_up_.emplace_back(std::unique_ptr<Val>(val));
+  vals_.emplace(vals_up_.back().get());
+  val->setName(IrContainerPasskey(), getValName(vals_up_.back()->vtype()));
+  raw_ptrs_.emplace((void*)vals_up_.back().get());
+}
+
+//! Register expr with this container.
+void IrContainer::registerExpr(Expr* expr) { + if (inContainer(expr)) { + return; + } + exprs_up_.emplace_back(std::unique_ptr(expr)); + exprs_.emplace(exprs_up_.back().get()); + expr->setName(IrContainerPasskey(), getExprName()); + raw_ptrs_.emplace((void*)exprs_up_.back().get()); +} + +void IrContainer::clear() noexcept { + FUSER_PERF_SCOPE("IrContainer clear"); + vals_.clear(); + vals_up_.clear(); + exprs_.clear(); + exprs_up_.clear(); + raw_ptrs_.clear(); + + val_type_name_map_.clear(); + expr_name_counter_ = 0; +} + +bool IrContainer::inContainer(const Statement* stmt) const { + const void* const_void = (const void*)(stmt); + void* nonconst_void = const_cast(const_void); // NOLINT + if (raw_ptrs_.find(nonconst_void) == raw_ptrs_.end()) { + return false; + } + + TORCH_INTERNAL_ASSERT( + stmt->container() == this, + "Container claims to own stmt, but stmt disagrees."); + + Statement* nonconst_stmt = const_cast(stmt); // NOLINT + if (stmt->isExpr()) { + TORCH_INTERNAL_ASSERT( + exprs_.find(nonconst_stmt->as()) != exprs_.end(), + "Somehow container claims to and not to own an Expr."); + } + if (stmt->isVal()) { + TORCH_INTERNAL_ASSERT( + vals_.find(nonconst_stmt->as()) != vals_.end(), + "Somehow container claims to and not to own an Val."); + } + + return true; +} + +// Shortcuts for frequently used vals +Int* IrContainer::zeroVal() { + if (!zero_val_) { + auto zero_val = IrBuilder::create(this, 0); + TORCH_INTERNAL_ASSERT(vals_up_.back().get() == zero_val); + zero_val_ = std::unique_ptr(vals_up_.back().release()->as()); + vals_up_.pop_back(); + } + return zero_val_.get(); +} + +Int* IrContainer::oneVal() { + if (!one_val_) { + auto one_val = IrBuilder::create(this, 1); + TORCH_INTERNAL_ASSERT(vals_up_.back().get() == one_val); + one_val_ = std::unique_ptr(vals_up_.back().release()->as()); + vals_up_.pop_back(); + } + return one_val_.get(); +} + +Bool* IrContainer::falseVal() { + if (!false_val_) { + auto false_val = IrBuilder::create(this, false); + TORCH_INTERNAL_ASSERT(vals_up_.back().get() == false_val); + false_val_ = std::unique_ptr(vals_up_.back().release()->as()); + vals_up_.pop_back(); + } + return false_val_.get(); +} + +Bool* IrContainer::trueVal() { + if (!true_val_) { + auto true_val = IrBuilder::create(this, true); + TORCH_INTERNAL_ASSERT(vals_up_.back().get() == true_val); + true_val_ = std::unique_ptr(vals_up_.back().release()->as()); + vals_up_.pop_back(); + } + return true_val_.get(); +} + +NamedScalar* IrContainer::magicZeroVal() { + if (!magic_zero_val_) { + auto magic_zero = + IrBuilder::create(kMagicZeroName, DataType::Int); + TORCH_INTERNAL_ASSERT(vals_up_.back().get() == magic_zero); + magic_zero_val_ = std::unique_ptr( + vals_up_.back().release()->as()); + vals_up_.pop_back(); + } + return magic_zero_val_.get(); +} + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/ir_container.h b/torch/csrc/jit/codegen/cuda/ir_container.h new file mode 100644 index 000000000000..fb1aaeaf383c --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/ir_container.h @@ -0,0 +1,174 @@ +#pragma once + +#include + +#include +#include + +#include +#include +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +class IrBuilderPasskey; +class ExprPasskey; +class OptOutMutator; + +class Int; +class Bool; +class NamedScalar; + +// Passkey for container to register names with statements +class IrContainerPasskey { + friend class IrContainer; + + private: + explicit IrContainerPasskey() {} 
+}; + +class TORCH_CUDA_CU_API IrContainer : public PolymorphicBase { + public: + IrContainer(); + + IrContainer(const IrContainer& other); + IrContainer(IrContainer&& other) noexcept; + + IrContainer& operator=(const IrContainer& other); + IrContainer& operator=(IrContainer&& other) noexcept; + + virtual ~IrContainer(); + + bool inContainer(const Statement* stmt) const; + + void assertInContainer(const Statement* stmt, const std::string& msg) const { + TORCH_CHECK( + inContainer(stmt), msg, " it was not found in the active container."); + } + + //! Return in insertion order + const std::deque deterministic_vals() const noexcept { + std::deque vals_deque; + std::transform( + vals_up_.begin(), + vals_up_.end(), + std::back_inserter(vals_deque), + [](const std::unique_ptr& val_up) { return val_up.get(); }); + return vals_deque; + } + + //! Register the Statement with this container + virtual void registerStmt(IrBuilderPasskey, Statement* stmt); + + //! Register the Val with this container + virtual void registerVal(IrBuilderPasskey, Val* val); + + //! Register expr with this container. + virtual void registerExpr(IrBuilderPasskey, Expr* expr); + + //! Allow expr's to register themselves with a container, this is only used + //! for broadcastOp so it can register itself in its constructor so root maps + //! can be built. + virtual void registerExpr(ExprPasskey, Expr* expr); + + //! Return the set of Exprs registered with this fusion. Warning: This will + //! return exprs outside inputs/outputs, so can be unsafe for use with + //! segmented fusions. + const std::unordered_set& unordered_exprs() const noexcept { + return exprs_; + } + + //! Return the set of Vals registered with this fusion + const std::unordered_set& vals() const noexcept { + return vals_; + } + + // Shortcuts for frequently used vals + Int* zeroVal(); + Int* oneVal(); + Bool* falseVal(); + Bool* trueVal(); + NamedScalar* magicZeroVal(); + + protected: + static IrCloner copy(const IrContainer* from, IrContainer* to); + + friend void swap(IrContainer& a, IrContainer& b) noexcept; + + // Let mutator remove Exprs. + friend OptOutMutator; + + virtual void removeExpr(Expr* expr); + + //! Completely remove val from the fusion, break all dependencies associated + //! with it + virtual void removeVal(Val* val); + + //! Register the Val with this container + virtual void registerVal(Val* val); + + //! Register expr with this container. + virtual void registerExpr(Expr* expr); + + StmtNameType getValName(ValType vtype) { + if (val_type_name_map_.find(vtype) == val_type_name_map_.end()) { + val_type_name_map_[vtype] = 0; + } + return val_type_name_map_[vtype]++; + } + + StmtNameType getExprName() { + return expr_name_counter_++; + } + + void clear() noexcept; + + // Deque of unique pointer is the memory owning data structure + std::deque> vals_up_; + + // A convenient set to return when we just need an unordered set to do + // something like check if a Val is in this container + std::unordered_set vals_; + + // Deque of unique pointer is the memory owning data structure + std::deque> exprs_up_; + + // A convenient set to return when we just need an unordered set to do + // something like check if an Expr is in this container + std::unordered_set exprs_; + + // Used to implement a generic "inContainer" that can be passed an invalid + // pointer. Specifically a pointer to a Statement owned by another container + // that has been freed. 
We can't check normally with the unordered_sets we + // already have because it would require a const_cast from a constant + // expr/val, or a dynamic cast from a Statement. + std::unordered_set raw_ptrs_; + + // Values names counters + std::unordered_map val_type_name_map_; + + // Expression names counter + StmtNameType expr_name_counter_ = 0; + + // Manually store some persistent, frequently used nodes. It's very + // challenging to do this anything but manually as detecting when a container + // may or may not have one of these vals is tricky. Specifically because if + // the container doesn't own it, it's hard to understand from the outside if + // the node may have been removed then re-registered. It could also be tricky + // to know when we're using a different container as in FusionCopy_test + // demonstrates deleting then creating containers can result in the same + // pointer for the container. + std::unique_ptr true_val_; + std::unique_ptr false_val_; + std::unique_ptr one_val_; + std::unique_ptr zero_val_; + std::unique_ptr magic_zero_val_; +}; + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/ir_graphviz.cpp b/torch/csrc/jit/codegen/cuda/ir_graphviz.cpp index 5ca8d54aaa9d..941bf22dea76 100644 --- a/torch/csrc/jit/codegen/cuda/ir_graphviz.cpp +++ b/torch/csrc/jit/codegen/cuda/ir_graphviz.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -303,13 +304,13 @@ void IrGraphGenerator::generateScheduleGraph() { // Maybe not the best way to handle the root domain, but should be okay addArc( tv, - new TensorDomain(tv->getRootDomain()), + IrBuilder::create(tv->getRootDomain()), "[style=dashed, color=green, arrowhead=none]"); if (tv->domain()->hasRFactor()) addArc( tv, - new TensorDomain(tv->domain()->getRFactorDomain()), + IrBuilder::create(tv->domain()->getRFactorDomain()), "[style=dashed, color=green, arrowhead=none]"); } } @@ -370,6 +371,10 @@ void IrGraphGenerator::handle(const Int* i) { printValue(i, IrNodeLabel::gen(i, detail_level_)); } +void IrGraphGenerator::handle(const ComplexDouble* i) { + printValue(i, IrNodeLabel::gen(i, detail_level_)); +} + void IrGraphGenerator::handle(const NamedScalar* i) { printValue(i, IrNodeLabel::gen(i, detail_level_)); } diff --git a/torch/csrc/jit/codegen/cuda/ir_graphviz.h b/torch/csrc/jit/codegen/cuda/ir_graphviz.h index 1144d95eb152..e5bbcac9157d 100644 --- a/torch/csrc/jit/codegen/cuda/ir_graphviz.h +++ b/torch/csrc/jit/codegen/cuda/ir_graphviz.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -79,6 +79,7 @@ class TORCH_CUDA_CU_API IrGraphGenerator : private OptInConstDispatch { void handle(const Bool*) override; void handle(const Double*) override; void handle(const Int*) override; + void handle(const ComplexDouble*) override; void handle(const NamedScalar*) override; void handle(const UnaryOp*) override; diff --git a/torch/csrc/jit/codegen/cuda/ir_interface_nodes.h b/torch/csrc/jit/codegen/cuda/ir_interface_nodes.h index 02c319d36653..0584e2f33743 100644 --- a/torch/csrc/jit/codegen/cuda/ir_interface_nodes.h +++ b/torch/csrc/jit/codegen/cuda/ir_interface_nodes.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -19,6 +19,9 @@ namespace cuda { class WelfordResult; class ViewTransform; +class IrCloner; +class IrBuilderPasskey; + //! A Bool value //! //! This value can be a symbolic value (defined after the kernel @@ -26,17 +29,18 @@ class ViewTransform; //! 
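// [Editor's note] Illustrative sketch only, not part of this patch: with the
// passkey-taking constructors declared below, scalar values are no longer
// created with bare `new` but through IrBuilder inside an active container
// (see the earlier sketches for the container setup).
//
//   Bool* flag = IrBuilder::create<Bool>(true);       // constant
//   Double* pi = IrBuilder::create<Double>(3.14159);  // constant
//   Int* n = IrBuilder::create<Int>();                // symbolic, no value
//   TORCH_INTERNAL_ASSERT(n->isSymbolic() && !n->isConst());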
class TORCH_CUDA_CU_API Bool : public Val { public: - Bool() : Val(ValType::Scalar, DataType::Bool), maybe_value_{c10::nullopt} {} + Bool(IrBuilderPasskey passkey); + + explicit Bool(IrBuilderPasskey passkey, bool value); - explicit Bool(bool value) - : Val(ValType::Scalar, DataType::Bool), maybe_value_{value} {} + explicit Bool(IrBuilderPasskey passkey, c10::optional value); Bool(const Bool* src, IrCloner* ir_cloner); bool isSymbolic() const { return !(maybe_value_.has_value()); } - bool isConst() const { + bool isConst() const final { return maybe_value_.has_value(); } c10::optional value() const { @@ -49,25 +53,25 @@ class TORCH_CUDA_CU_API Bool : public Val { const c10::optional maybe_value_; }; -//! A Float64 value. For now we don't have any other type besides -//! Float64. This value can be a symbolic value (defined after the kernel -//! is compiled) or a constant value (inlined into the kernel definition). +//! A Float64 value. This value can be a symbolic value (defined after the +//! kernel is compiled) or a constant value (inlined into the kernel +//! definition). class TORCH_CUDA_CU_API Double : public Val { public: using ScalarType = double; - Double() - : Val(ValType::Scalar, DataType::Double), maybe_value_{c10::nullopt} {} + Double(IrBuilderPasskey passkey); - explicit Double(ScalarType value) - : Val(ValType::Scalar, DataType::Double), maybe_value_{value} {} + explicit Double(IrBuilderPasskey passkey, ScalarType value); + + explicit Double(IrBuilderPasskey passkey, c10::optional value); Double(const Double* src, IrCloner* ir_cloner); bool isSymbolic() const { return !(maybe_value_.has_value()); } - bool isConst() const { + bool isConst() const final { return maybe_value_.has_value(); } c10::optional value() const { @@ -86,17 +90,51 @@ class TORCH_CUDA_CU_API Int : public Val { public: using ScalarType = int64_t; - Int() : Val(ValType::Scalar, DataType::Int), maybe_value_{c10::nullopt} {} + Int(IrBuilderPasskey passkey); + + explicit Int(IrBuilderPasskey passkey, ScalarType value); - explicit Int(ScalarType value) - : Val(ValType::Scalar, DataType::Int), maybe_value_{value} {} + explicit Int(IrBuilderPasskey passkey, c10::optional value); Int(const Int* src, IrCloner* ir_cloner); bool isSymbolic() const { return !(maybe_value_.has_value()); } - bool isConst() const { + bool isConst() const final { + return maybe_value_.has_value(); + } + c10::optional value() const { + return maybe_value_; + } + + bool sameAs(const Statement* other) const override; + + private: + const c10::optional maybe_value_; +}; + +//! An c10::complex value. This value can be a symbolic value (defined +//! after the kernel is compiled) or a constant value (inlined into the kernel +//! definition). 
+class TORCH_CUDA_CU_API ComplexDouble : public Val { + public: + using ScalarType = c10::complex; + + ComplexDouble(IrBuilderPasskey passkey); + + explicit ComplexDouble(IrBuilderPasskey passkey, ScalarType value); + + explicit ComplexDouble( + IrBuilderPasskey passkey, + c10::optional value); + + ComplexDouble(const ComplexDouble* src, IrCloner* ir_cloner); + + bool isSymbolic() const { + return !(maybe_value_.has_value()); + } + bool isConst() const final { return maybe_value_.has_value(); } c10::optional value() const { @@ -152,14 +190,18 @@ class TVDomainGuard; class TORCH_CUDA_CU_API TensorView : public Val { public: TensorView( + IrBuilderPasskey passkey, TensorDomain* domain, DataType dtype, MemoryType mtype = MemoryType::Local); - explicit TensorView(const std::shared_ptr& tensor_type); + explicit TensorView( + IrBuilderPasskey passkey, + const std::shared_ptr& tensor_type); - explicit TensorView(const std::shared_ptr& jit_value) - : TensorView(jit_value->type()->cast()) {} + explicit TensorView( + IrBuilderPasskey passkey, + const std::shared_ptr& jit_value); TensorView(const TensorView* src, IrCloner* ir_cloner); @@ -167,6 +209,13 @@ class TORCH_CUDA_CU_API TensorView : public Val { return domain_; } + //! This is for a TensorView with an rFactor domain that is an input to a + //! fusion segment. We convert the rfactor domain into a new root domain. + //! Any dynamic-sized rfactor iterDomains are given a new symbolic extent. + //! Concrete integer extents are kept. Output TensorViews of any subsequent + //! expressions that use this TensorView are also updated. + void convertRfactorToRootDomain(); + void setContiguity(const std::vector& contig) { domain()->setContiguity(contig); } @@ -187,6 +236,16 @@ class TORCH_CUDA_CU_API TensorView : public Val { //! trivial reductions bool hasAnyReduction() const; + //! Returns true if this tensor is zero dimensional, + //! i.e. a wrapped scalar or an empty placeholder. + bool isZeroDim() const { + return nDims() == 0; + } + + //! Returns true if this tensor does not contain + //! any value. + bool isEmptyTensor() const; + c10::optional getReductionAxis() const; const std::vector& getRootDomain() const; @@ -210,6 +269,24 @@ class TORCH_CUDA_CU_API TensorView : public Val { size_t nDims() const; + // sets cpu_scalar_ value, which is special handling for CPU based zero-dim + // tensors (i.e. CPU Tensors that only have one value). This is only used if + // on an input value, otherwise ignored. This is important as special handling + // because these "scalars" should be type promoted as a tensor, but we want to + // avoid explicit copying of the data, so we want to pass the data value as a + // standard kernel argument value. + void setCpuScalar(bool is_cpu_scalar); + + // returns cpu_scalar_ value, which is special handling for CPU based zero-dim + // tensors (i.e. CPU Tensors that only have one value). This is only used if + // on an input value, otherwise ignored. This is important as special handling + // because these "scalars" should be type promoted as a tensor, but we want to + // avoid explicit copying of the data, so we want to pass the data value as a + // standard kernel argument value. + bool isCpuScalar() const { + return cpu_scalar_; + } + // Returns the position that this tensor is produced at relative to its axes. unsigned int getComputeAtPosition() const { return compute_at_pos_; @@ -318,29 +395,27 @@ class TORCH_CUDA_CU_API TensorView : public Val { // TensorView* rFactor(const std::vector& axes); - //! 
Welford Version of rFactor, semantically similar with - //! the reduction version except that the rfactor is done - //! in a multi-output scan pattern - WelfordResult rFactor( + //! Multi-output version of rFactor, semantically similar with + //! the reduction version except that the rfactor is done + //! for all outputs in a consistent way + std::vector rFactor( const std::vector& axes, - TensorView* avg, - TensorView* var, - TensorView* n); + const std::vector& tvs); // Create a TensorView before the original tensor. A common use case is to // write results into shared memory or registers before moving to global // memory. Analogous to TVM Cache_Write - TensorView* cache_before(); + TensorView* cacheBefore(); // Create a TensorView after the original tensor. A common use case is to // read tensor into shared memory or registers. Analogous to TVM Cache_Read - TensorView* cache_after(); + TensorView* cacheAfter(); // For a fusion output with other uses, we want to avoid writing to global // memory and then reading the output again. We write to global memory // separately after an operation. We replace this fusion output with the // direct write TensorView. - TensorView* cache_fork(); + TensorView* cacheFork(); MemoryType getMemoryType() const { return memory_type_; @@ -356,12 +431,38 @@ class TORCH_CUDA_CU_API TensorView : public Val { return axes_to_swizzle_; } + // Apply double buffering transformation + void doubleBuffer(); + + bool isDoubleBuffered() const { + return is_double_buffered_; + } + + //! Fill in mma options in scheduling time. + //! Each mma op in Fusion IR must be configured once before lowering. + //! Mma options are configuration parameters used in lowering to mma + //! instrinsics, mainly the type of mma macro to use and input data layout + //! etc. + //! + //! TODO: This step will very likely be removed in a follow up PR. All of + //! the options configured here could actually be inferred from fusion IR + //! once we are feature complete. + void configureMma(MmaOptions options); + + //! Transforms the innermost iterdomains according to the given mma swizzle, + //! this should be used on the tvs that are either inputs/outputs of an + //! MmaOp, or any tv's that are involved in prolog/epilog fusions and need to + //! have a matching thread swizzle with the mma operand/result. + //! More detail on usage see [WarpMmaSwizzler] in scheduler/mma_utils.h . + void applyMmaSwizzle(MmaOptions options); + friend TORCH_CUDA_CU_API TransformPropagator; friend TORCH_CUDA_CU_API TransformReplay; friend TORCH_CUDA_CU_API OptOutMutator; friend ComputeAt; - friend void adjustMemoryTypes(Fusion* fusion); friend class ir_utils::TVDomainGuard; + friend TORCH_CUDA_CU_API void groupReductions( + const std::vector&); protected: void setDomain(TensorDomain* td) { @@ -380,9 +481,9 @@ class TORCH_CUDA_CU_API TensorView : public Val { return pos; } - //! A helper function to maintain the consistency of welford output - //! schedules when doing rfactor on welford ops. - TensorView* welfordRfactorHelper( + //! A helper function to maintain the consistency of schedules of + //! multiple outputs wheen doing rfactor on multi-output reduction ops. 
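// [Editor's note] Illustrative sketch only, not part of this patch: typical
// use of the renamed caching entry points and of the double buffering added
// above. `input_tv` is a hypothetical TensorView read by this fusion.
//
//   TensorView* staged = input_tv->cacheAfter();   // was cache_after()
//   staged->setMemoryType(MemoryType::Shared);
//   staged->doubleBuffer();                        // new in this patch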
+ TensorView* multiOutputRfactorHelper( TensorView* tv, const std::vector& axes); @@ -393,6 +494,14 @@ class TORCH_CUDA_CU_API TensorView : public Val { MemoryType memory_type_ = MemoryType::Local; SwizzleType swizzle_type_ = SwizzleType::NoSwizzle; std::vector axes_to_swizzle_; + bool is_double_buffered_ = false; + // special handling for CPU based zero-dim tensors (i.e. CPU Tensors that only + // have one value). This is only used if on an input value, otherwise ignored. + // This is important as special handling because these "scalars" should be + // type promoted as a tensor, but we want to avoid explicit copying of the + // data, so we want to pass the data value as a standard kernel argument + // value. + bool cpu_scalar_ = false; }; //! A simple TensorView builder diff --git a/torch/csrc/jit/codegen/cuda/ir_internal_nodes.h b/torch/csrc/jit/codegen/cuda/ir_internal_nodes.h index 8fd4475d2ddc..bf9d37867ee3 100644 --- a/torch/csrc/jit/codegen/cuda/ir_internal_nodes.h +++ b/torch/csrc/jit/codegen/cuda/ir_internal_nodes.h @@ -1,10 +1,12 @@ #pragma once -#include +#include #include #include #include +#include +#include //! Nodes in here should generally not be used by users. They should be behind //! the scenes and users shouldn't have to be aware of what they do to use the @@ -20,6 +22,8 @@ namespace fuser { namespace cuda { class ViewTransform; +class Scope; +class IrCloner; //! Returns true if both v1 and v2 are scalars, are the same type of scalars, //! and dispatches to the inherited Val type's `->sameAs` call. e.g. if both @@ -34,7 +38,7 @@ bool areEqualScalars(Val* v1, Val* v2); //! 4) split/merge class TORCH_CUDA_CU_API UnaryOp : public Expr { public: - UnaryOp(UnaryOpType type, Val* out, Val* in); + UnaryOp(IrBuilderPasskey, UnaryOpType type, Val* out, Val* in); UnaryOp(const UnaryOp* src, IrCloner* ir_cloner); @@ -63,7 +67,7 @@ class TORCH_CUDA_CU_API UnaryOp : public Expr { //! 2) LT (A < B) class TORCH_CUDA_CU_API BinaryOp : public Expr { public: - BinaryOp(BinaryOpType type, Val* out, Val* lhs, Val* rhs); + BinaryOp(IrBuilderPasskey, BinaryOpType type, Val* out, Val* lhs, Val* rhs); BinaryOp(const BinaryOp* src, IrCloner* ir_cloner); @@ -97,7 +101,11 @@ class TORCH_CUDA_CU_API BroadcastOp : public Expr { //! \param out The output tensor //! \param in The input tensor //! \param is_broadcast_dims True when output dim is a new broadcast domain - BroadcastOp(Val* out, Val* in, std::vector is_broadcast_dims); + BroadcastOp( + IrBuilderPasskey, + Val* out, + Val* in, + std::vector is_broadcast_dims); BroadcastOp(const BroadcastOp* src, IrCloner* ir_cloner); @@ -138,7 +146,14 @@ class TORCH_CUDA_CU_API BroadcastOp : public Expr { //! non-reduction/non-broadcast dimensions. class TORCH_CUDA_CU_API ReductionOp : public Expr { public: - ReductionOp(BinaryOpType reduction_op_type, Val* init, Val* out, Val* in); + ReductionOp( + IrBuilderPasskey, + BinaryOpType reduction_op_type, + Val* init, + Val* out, + Val* in, + bool is_allreduce = false, + ExprType expr_type = ExprType::ReductionOp); ReductionOp(const ReductionOp* src, IrCloner* ir_cloner); @@ -156,6 +171,10 @@ class TORCH_CUDA_CU_API ReductionOp : public Expr { return reduction_op_type_; } + bool isAllreduce() const { + return is_allreduce_; + } + bool sameAs(const Statement* other) const override; private: @@ -163,12 +182,67 @@ class TORCH_CUDA_CU_API ReductionOp : public Expr { Val* const init_ = nullptr; Val* const out_ = nullptr; Val* const in_ = nullptr; + //! 
True if broadcast is fused + bool is_allreduce_ = false; +}; + +//! Grouped reduction operation for horizontal fusions. It works like +//! batched GEMMs in the sense that multiple independent reductions are +//! performed together. The main benefit is when reducing tensors across thread +//! blocks, a single grid sync can be done for all individual +//! reductions. As grid sync is very expensive, this can be a +//! significant performance impact. +class TORCH_CUDA_CU_API GroupedReductionOp : public Expr { + public: + GroupedReductionOp( + IrBuilderPasskey, + std::vector reduction_op_type, + std::vector init, + std::vector out, + std::vector in, + bool is_allreduce = false, + ExprType expr_type = ExprType::GroupedReductionOp); + + GroupedReductionOp(const GroupedReductionOp* src, IrCloner* ir_cloner); + + size_t numReductions() const { + return reduction_op_types_.size(); + } + + const std::vector& initVals() const { + return init_vals_; + } + + Val* initVal(size_t index) const { + return init_vals_.at(index); + } + + const std::vector& getReductionOpTypes() const { + return reduction_op_types_; + } + + BinaryOpType getReductionOpType(size_t index) const { + return reduction_op_types_.at(index); + } + + bool isAllreduce() const { + return is_allreduce_; + } + + bool sameAs(const Statement* other) const override; + + private: + const std::vector reduction_op_types_; + const std::vector init_vals_; + //! True if using the fused reduction kernel + bool is_allreduce_ = false; }; //! Welford Scan operation. class TORCH_CUDA_CU_API WelfordOp : public Expr { public: WelfordOp( + IrBuilderPasskey, Val* out_avg, Val* out_var, Val* out_N, @@ -177,7 +251,8 @@ class TORCH_CUDA_CU_API WelfordOp : public Expr { Val* init_N, Val* in_avg, Val* in_var, - Val* in_N); + Val* in_N, + bool is_fused = false); WelfordOp(const WelfordOp* src, IrCloner* ir_cloner); @@ -189,10 +264,6 @@ class TORCH_CUDA_CU_API WelfordOp : public Expr { return in_avg_; } - Val* init() const { - return init_avg_; - } - bool sameAs(const Statement* const other) const override; // Welford Accessors @@ -241,6 +312,12 @@ class TORCH_CUDA_CU_API WelfordOp : public Expr { return !init_N_->isZeroInt(); } + bool isAllreduce() const { + return is_allreduce_; + } + + std::vector getInitVals() const; + private: Val* const out_avg_; Val* const out_var_; @@ -251,11 +328,72 @@ class TORCH_CUDA_CU_API WelfordOp : public Expr { Val* const in_avg_; Val* const in_var_; Val* const in_N_; + //! True if using the fused reduction kernel (not implemented yet) + bool is_allreduce_ = false; +}; + +//! 
Fused Matmul operation +class TORCH_CUDA_CU_API MmaOp : public Expr { + public: + MmaOp(IrBuilderPasskey, Val* out, Val* in_a, Val* in_b, Val* init); + + MmaOp( + IrBuilderPasskey, + Val* out, + Val* in_a, + Val* in_b, + Val* init, + MmaOptions options); + + MmaOp(const MmaOp* src, IrCloner* ir_cloner); + + Val* out() const { + return out_; + } + + Val* inA() const { + return in_a_; + } + + Val* inB() const { + return in_b_; + } + + Val* init() const { + return init_; + } + + const auto& options() const { + TORCH_INTERNAL_ASSERT(options_.has_value(), "MmaOp not configured:", this); + return options_.value(); + } + + bool sameAs(const Statement* const other) const override; + + auto accStride() const { + TORCH_INTERNAL_ASSERT(options_.has_value(), "MmaOp not configured:", this); + return options_->accumulator_stride; + } + + void configureOptions(MmaOptions options) { + options_ = options; + } + + private: + Val* const out_ = nullptr; + Val* const in_a_ = nullptr; + Val* const in_b_ = nullptr; + Val* const init_ = nullptr; + c10::optional options_ = c10::nullopt; }; class TORCH_CUDA_CU_API TransposeOp : public Expr { public: - TransposeOp(TensorView* out, TensorView* in, std::vector new2old); + TransposeOp( + IrBuilderPasskey, + TensorView* out, + TensorView* in, + std::vector new2old); TransposeOp(const TransposeOp* src, IrCloner* ir_cloner); @@ -279,7 +417,13 @@ class TORCH_CUDA_CU_API TransposeOp : public Expr { class TORCH_CUDA_CU_API TernaryOp : public Expr { public: - TernaryOp(TernaryOpType type, Val* out, Val* in1, Val* in2, Val* in3); + TernaryOp( + IrBuilderPasskey, + TernaryOpType type, + Val* out, + Val* in1, + Val* in2, + Val* in3); TernaryOp(const TernaryOp* src, IrCloner* ir_cloner); @@ -317,7 +461,12 @@ class TORCH_CUDA_CU_API ShiftOp : public Expr { //! \param out //! \param in //! \param offsets - ShiftOp(Val* out, Val* in, std::vector offsets, bool pad); + ShiftOp( + IrBuilderPasskey, + Val* out, + Val* in, + std::vector offsets, + std::vector pad_width); ShiftOp(const ShiftOp* src, IrCloner* ir_cloner); @@ -336,8 +485,14 @@ class TORCH_CUDA_CU_API ShiftOp : public Expr { return offsets_; } - bool pad() const { - return pad_; + const std::vector& padWidth() const { + return pad_width_; + } + + bool hasPadding() const { + return std::any_of(pad_width_.begin(), pad_width_.end(), [](const auto p) { + return p > 0; + }); } bool sameAs(const Statement* other) const override; @@ -349,17 +504,18 @@ class TORCH_CUDA_CU_API ShiftOp : public Expr { //! offsets_. The sign of each value indicates the direction of //! shifting. const std::vector offsets_; - const bool pad_; + const std::vector pad_width_; }; //! Gather a window around each element. class TORCH_CUDA_CU_API GatherOp : public Expr { public: GatherOp( + IrBuilderPasskey, Val* out, Val* in, - std::vector window_shape, - std::vector> pad_width); + std::vector window_shape, + std::vector> pad_width); GatherOp(const GatherOp* src, IrCloner* ir_cloner); @@ -381,20 +537,64 @@ class TORCH_CUDA_CU_API GatherOp : public Expr { return pad_width_; } + bool hasPadding() const { + return std::any_of(pad_width_.begin(), pad_width_.end(), [](const auto& p) { + return p[0] > 0 || p[1] > 0; + }); + } + bool sameAs(const Statement* other) const override; private: Val* const out_ = nullptr; Val* const in_ = nullptr; //! Shape of a window gathered for each element. - std::vector window_shape_; + std::vector window_shape_; //! The size of zero-padding of each axis. 
- std::vector> pad_width_; + std::vector> pad_width_; +}; + +class TORCH_CUDA_CU_API ViewAsScalar : public Expr { + public: + ViewAsScalar( + IrBuilderPasskey, + Val* out, + Val* in, + IterDomain* vector_id, + Val* index = nullptr); + + ViewAsScalar(const ViewAsScalar* src, IrCloner* ir_cloner); + + Val* out() const { + return out_; + } + + Val* in() const { + return in_; + } + + IterDomain* vector_id() const { + return vector_id_; + } + + Val* index() const { + return index_; + } + + private: + Val* const out_ = nullptr; + Val* const in_ = nullptr; + + // The IterDomain of type VectorComponent newly appended to the output + IterDomain* vector_id_ = nullptr; + + // The index that vector_id_ is lowered into + Val* index_ = nullptr; }; class TORCH_CUDA_CU_API ViewOp : public Expr { public: - ViewOp(TensorView* out, TensorView* in); + ViewOp(IrBuilderPasskey, TensorView* out, TensorView* in); ViewOp(const ViewOp* src, IrCloner* ir_cloner); @@ -422,39 +622,37 @@ class IndexReferenceReplay; class TORCH_CUDA_CU_API IterDomain : public Val { public: IterDomain( + IrBuilderPasskey, Val* start, Val* extent, ParallelType parallel_type = ParallelType::Serial, IterType iter_type = IterType::Iteration, - bool is_rfactor_domain = false); + bool is_rfactor_domain = false, + bool is_padded_dimension = false, + c10::optional padded_to_size_ = c10::nullopt, + bool is_mma_swizzled = false); + // Same as the above but can set the offset of the stop point IterDomain( + IrBuilderPasskey, Val* start, Val* extent, Val* stop_offset, ParallelType parallel_type = ParallelType::Serial, IterType iter_type = IterType::Iteration, - bool is_rfactor_domain = false); + bool is_rfactor_domain = false, + bool is_padded_dimension = false, + c10::optional padded_to_size_ = c10::nullopt, + bool is_mma_swizzled = false); IterDomain(const IterDomain* src, IrCloner* ir_cloner); bool sameAs(const Statement* other) const override; - // Returns a new IterDomain matching properties of this - // TODO: parallel_method->getParallelType - IterDomain* clone() const { - auto cloned = new IterDomain( - start(), - extent(), - stopOffset(), - getParallelType(), - getIterType(), - isRFactorProduct()); - - cloned->is_padded_dimension_ = is_padded_dimension_; - cloned->padded_to_size_ = padded_to_size_; - return cloned; - } + //! Returns a new IterDomain matching properties of this + //! + //! This does NOT copy the is_rfactor_domain flag. + IterDomain* cloneWithoutRFactor() const; //! Clone a vector domains static std::vector clone( @@ -504,6 +702,10 @@ class TORCH_CUDA_CU_API IterDomain : public Val { return getIterType() == IterType::Stride; } + bool isVectorComponent() const { + return getIterType() == IterType::VectorComponent; + } + bool isParallelized() const { return getParallelType() != ParallelType::Serial; } @@ -631,6 +833,55 @@ class TORCH_CUDA_CU_API IterDomain : public Val { //! domain. std::pair stridedSplit(int factor); + // TODO: Remove + bool isSimple() const { + return definition() == nullptr; + } + + //! Marks that this id represents a + //! instruction loop, mma use only. + //! + //! An instruction loop can be considered a generalization of + //! vectorization. It also represents a loop that's implemented + //! by an instruction and should not be realized by codegen and + //! cannot be inlined with. + //! As an example, if a mma macro, call it mma_eg implements: + //! for m in M + //! for n in N + //! for k in K + //! C[m,n] += A[m,k]*B[k,n], + //! But the generated code should simply be: + //! mma_eg(C,A,B) + //! 
without the 3 level loopnest, i.e. they're instruction loops. + //! + //! In the actual mma macros, the loopnests it implements is a + //! transformed version of above to match the mma swizzle. + //! So it's different implicit loopnest for different macros. + //! WarpMmaSwizzler will label the instruction loops case-by-case. + bool isMma() const { + return parallel_type_ == ParallelType::Mma; + } + + bool isMmaSwizzled() const { + return is_mma_swizzled_; + } + + //! Used by WarpMmaSwizzler, this is an utility for WarpMmaSwizzler + //! to lock the thread swizzled iterdomains. + //! Only true for the iterdomains produced by WarpMmaSwizzler. + //! Mma ops require specific swizzle patterns + //! and this label utility is to prevent any further transform on the + //! iterdomains involved in the swizzle so that the pattern remain correct in + //! generated code. + //! + //! Note: + //! Used only through WarpMmaSwizzler only and mma validation relies on + //! this + //! flag being set on the correct iterdomains. + void toMmaSwizzled() { + is_mma_swizzled_ = true; + } + protected: friend TensorDomain; friend ReplayTransformations; @@ -647,6 +898,15 @@ class TORCH_CUDA_CU_API IterDomain : public Val { bool is_rfactor_domain_ = false; bool is_padded_dimension_ = false; c10::optional padded_to_size_ = c10::nullopt; + + // TODO: Remove only used in kernel IR because IterDomains don't maintain + // definitions of split/merge. + bool is_simple_ = true; + + //! Tracks if this id represents a thread swizzled loop or + //! models an implicit loop within instructions. Should not make + //! any changes once an id is warp mapped. + bool is_mma_swizzled_ = false; }; //! TensorDomain holds a vector of IterDomains. It holds an IterDomain for every @@ -666,15 +926,18 @@ class TORCH_CUDA_CU_API IterDomain : public Val { class TORCH_CUDA_CU_API TensorDomain : public Val { public: explicit TensorDomain( + IrBuilderPasskey, std::vector root_domain, std::vector contiguity = std::vector()); TensorDomain( + IrBuilderPasskey, std::vector root_domain, std::vector domain, std::vector contiguity = std::vector()); TensorDomain( + IrBuilderPasskey, std::vector root_domain, std::vector rfactor_domain, std::vector domain, @@ -718,8 +981,14 @@ class TORCH_CUDA_CU_API TensorDomain : public Val { bool hasReduction() const; bool hasBlockReduction() const; bool hasGridReduction() const; + bool hasBlockBroadcast() const; + bool hasGridBroadcast() const; bool hasBroadcast() const; bool hasRFactor() const; + + // Returns if rfactor domain only consists of id's of iter type. + bool hasViewLikeRFactor() const; + bool hasVectorize() const; c10::optional getReductionAxis() const; @@ -786,6 +1055,8 @@ class TORCH_CUDA_CU_API TensorDomain : public Val { TensorDomain* view( const std::vector>& transforms); + TensorDomain* flatten(int64_t start_dim, int64_t end_dim); + static std::vector orderedAs( const std::vector& td, const std::unordered_map& old2new); @@ -821,6 +1092,7 @@ class TORCH_CUDA_CU_API Split : public Expr { // start_offset and stop_offset are distance from the left end and // right ends, respectively. Split( + IrBuilderPasskey, IterDomain* outer, IterDomain* inner, IterDomain* in, @@ -881,12 +1153,13 @@ class TORCH_CUDA_CU_API Split : public Expr { //! dictate which will be traversed first (inner). Both IterDomains must be of //! the same iter or reduction type, as well as the same parallelization //! strategy if there is one -//! -//! \todo Should this be a unary op type? -//! 
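// [Editor's note] Illustrative sketch only, not part of this patch: Split and
// Merge expressions are normally not constructed directly; they are recorded
// on a TensorView's domain by the scheduling calls. `tv` is a hypothetical
// two-dimensional TensorView with root domain [I0, I1].
//
//   tv->merge(0, 1);   // leaves a Merge expr:  I0, I1  ->  I0*I1
//   tv->split(0, 32);  // leaves a Split expr:  I0*I1   ->  I0*I1/32, 32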
class TORCH_CUDA_CU_API Merge : public Expr { public: - Merge(IterDomain* out, IterDomain* outer, IterDomain* inner); + Merge( + IrBuilderPasskey, + IterDomain* out, + IterDomain* outer, + IterDomain* inner); Merge(const Merge* src, IrCloner* ir_cloner); @@ -918,9 +1191,7 @@ class TORCH_CUDA_CU_API Merge : public Expr { //! class TORCH_CUDA_CU_API NamedScalar : public Val { public: - // NOLINTNEXTLINE(modernize-pass-by-value) - NamedScalar(std::string name, DataType dtype) - : Val(ValType::NamedScalar, dtype), name_(name) {} + NamedScalar(IrBuilderPasskey passkey, std::string name, DataType dtype); NamedScalar(const NamedScalar* src, IrCloner* ir_cloner); @@ -931,9 +1202,11 @@ class TORCH_CUDA_CU_API NamedScalar : public Val { bool sameAs(const Statement* other) const override; //! Return the named scalar extent of a parallel dimension (e.g. blockDim.x) + //! WARNING: Only works with Fusion container at the moment static NamedScalar* getParallelDim(ParallelType p_type); //! Return the named scalar index of a parallel dimension (e.g. threadIdx.x) + //! WARNING: Only works with Fusion container at the moment static NamedScalar* getParallelIndex(ParallelType p_type); //! Return the parallel type of this NamedScalar if it is an extent of a diff --git a/torch/csrc/jit/codegen/cuda/ir_iostream.cpp b/torch/csrc/jit/codegen/cuda/ir_iostream.cpp index a553c59fc2b0..0b83e07f784b 100644 --- a/torch/csrc/jit/codegen/cuda/ir_iostream.cpp +++ b/torch/csrc/jit/codegen/cuda/ir_iostream.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -14,6 +15,23 @@ namespace jit { namespace fuser { namespace cuda { +namespace { +const char* boolLiteral(bool value) { + return value ? "true" : "false"; +} + +std::string varName(const Val* val) { + std::stringstream value_name; + if (val == nullptr) { + value_name << "$nullptr"; + } else { + value_name << val->name(); + } + return value_name.str(); +} + +} // namespace + // Make sure we can inline something, before we attempt to. 
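// [Editor's note] Illustrative sketch only, not part of this patch: IrPrinter
// is usually reached indirectly. Assuming the stream operators declared in
// ir_iostream.h and a populated Fusion `fusion` with an output TensorView
// `tv` (both hypothetical here):
//
//   fusion.printMath();           // prints the whole fusion via IrPrinter
//   std::cout << tv << "\n";      // prints a single statement
//   IrPrinter printer(std::cout);
//   printer.handle(&fusion);      // equivalent explicit form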
static void checkInlineable(const Expr* expr) { for (auto input : expr->inputs()) { @@ -49,55 +67,52 @@ void IrPrinter::handle(Fusion* fusion) { } } -void IrPrinter::handle(const TensorDomain* td) { - if (td->nDims() == 0) { - os_ << "[ 0 ]"; - return; +void IrPrinter::handle(const kir::Kernel* kernel) { + TORCH_CHECK(kernel != nullptr); + + // kernel declaration + os_ << "\nKERNEL ("; + for (auto in : kernel->inputs()) { + handle(in); + if (in != kernel->inputs().back()) { + os_ << ", "; + } } - os_ << "[ "; - for (const auto i : c10::irange(td->nDims())) { - handle(td->axis(i)); - if (i != td->nDims() - 1) + os_ << ") -> ("; + for (auto out : kernel->outputs()) { + handle(out); + if (out != kernel->outputs().back()) { os_ << ", "; + } } - os_ << " ]"; + os_ << ") :\n"; + + // kernel body + indent_size_++; + for (auto expr : kernel->topLevelExprs()) { + handle(expr); + } + indent_size_--; + os_ << "END.\n\n"; } -void IrPrinter::handle(const TensorView* tv) { - if (tv->nDims() == 0) { - os_ << typePrefix(tv->getDataType().value()) << tv->name(); - } else { - os_ << "T" << tv->name(); - switch (tv->getMemoryType()) { - case MemoryType::Global: - os_ << "_g"; - break; - case MemoryType::Shared: - os_ << "_s"; - break; - case MemoryType::Local: - os_ << "_l"; - break; - } - handle(tv->domain()); +void IrPrinter::handle(kir::Kernel& kernel) { + handle(&kernel); +} - if (tv->getComputeAtPosition() > 0) { - os_ << " ca_pos( "; - os_ << tv->getComputeAtPosition(); - os_ << " )"; - } - if (tv->getMaxProducerPosition() > 0) { - os_ << " produce_pos( "; - os_ << tv->getMaxProducerPosition(); - os_ << ")"; - } +void IrPrinter::handleScope(const kir::Scope& scope) { + // Save the uses of the parent scope + indent_size_++; + for (auto expr : scope.exprs()) { + handle(expr); } + indent_size_--; } void IrPrinter::handle(const IterDomain* id) { os_ << id->getIterType(); os_ << id->getParallelType(); - os_ << id->name(); + os_ << varName(id); os_ << "{"; if (!id->start()->isZeroInt()) { print_inline(id->start()); @@ -116,6 +131,47 @@ void IrPrinter::handle(const IterDomain* id) { } } +void IrPrinter::handle(const TensorDomain* td) { + if (td->nDims() == 0) { + os_ << "[ 0 ]"; + return; + } + os_ << "[ "; + for (const auto i : c10::irange(td->nDims())) { + handle(td->axis(i)); + if (i != td->nDims() - 1) + os_ << ", "; + } + os_ << " ]"; +} + +void IrPrinter::handle(const TensorView* tv) { + os_ << "T" << varName(tv); + switch (tv->getMemoryType()) { + case MemoryType::Global: + os_ << "_g"; + break; + case MemoryType::Shared: + os_ << "_s"; + break; + case MemoryType::Local: + os_ << "_l"; + break; + } + handle(tv->domain()); + + if (tv->getComputeAtPosition() > 0) { + os_ << " ca_pos( "; + os_ << tv->getComputeAtPosition(); + os_ << " )"; + } + if (tv->getMaxProducerPosition() > 0) { + os_ << " produce_pos( "; + os_ << tv->getMaxProducerPosition(); + os_ << ")"; + } +} + void IrPrinter::handle(const Bool* b) { if (print_inline_ && b->definition() != nullptr) { os_ << "( "; @@ -124,10 +180,9 @@ void IrPrinter::handle(const Bool* b) { return; } - if (b->isSymbolic()) { - os_ << "b" << b->name(); - } else { - os_ << "bool(" << *(b->value()) << ")"; + os_ << "b" << varName(b); + if (b->isConst()) { + os_ << "(" << (b->value().value() ? 
"true" : "false") << ")"; } } @@ -140,7 +195,7 @@ void IrPrinter::handle(const Double* d) { } if (d->isSymbolic()) { - os_ << "d" << d->name(); + os_ << "d" << varName(d); } else { os_ << "double(" << std::setprecision( @@ -160,30 +215,39 @@ void IrPrinter::handle(const Int* i) { } if (i->isSymbolic()) { - os_ << "i" << i->name(); + os_ << "i" << varName(i); } else { os_ << *(i->value()); } } -void IrPrinter::handle(const NamedScalar* i) { - os_ << i->name(); -} +void IrPrinter::handle(const ComplexDouble* c) { + if (print_inline_) { + if (auto def = c->definition()) { + os_ << "( "; + handle(def); + os_ << " )"; + return; + } + } -static bool isTV(const Val* val) { - return val->getValType().value() == ValType::TensorView; + if (c->isSymbolic()) { + os_ << "c" << varName(c); + } else { + os_ << "std::complex" + << std::setprecision(std::numeric_limits::max_digits10) + << *(c->value()); + } } -// Check if we're a TensorView op that we can generate code for. -static bool isTVOp(const Expr* expr) { - return expr->outputs().size() == 1 && isTV(expr->outputs().front()); +void IrPrinter::handle(const NamedScalar* ns) { + os_ << ns->name(); } void IrPrinter::handle(const UnaryOp* uop) { - bool istvop = isTVOp(uop); + bool istvop = ir_utils::isTvOp(uop); if (!print_inline_) { - indent(); - os_ << uop->out(); + indent() << uop->out(); if (istvop) { os_ << "\n"; indent_size_++; @@ -230,10 +294,9 @@ void IrPrinter::handle(const UnaryOp* uop) { } void IrPrinter::handle(const BinaryOp* bop) { - bool istvop = isTVOp(bop); + bool istvop = ir_utils::isTvOp(bop); if (!print_inline_) { - indent(); - os_ << bop->out(); + indent() << bop->out(); // tensor operations tend to be long, break them up into multiple lines if (istvop) { @@ -286,7 +349,7 @@ void IrPrinter::handle(const BinaryOp* bop) { } void IrPrinter::handle(const TernaryOp* top) { - bool istvop = isTVOp(top); + bool istvop = ir_utils::isTvOp(top); if (!print_inline_) { indent(); os_ << top->out(); @@ -327,18 +390,32 @@ void IrPrinter::handle(const TernaryOp* top) { } void IrPrinter::handle(const ReductionOp* rop) { - indent(); - os_ << rop->out() << " = reduction( " << rop->in() - << ", op = " << rop->getReductionOpType() - << ", initial value = " << rop->init() << " )\n"; + indent() << rop->out() << "\n"; + indent() << " = reduction( " << rop->in() + << ", op = " << rop->getReductionOpType() + << ", initial value = " << rop->init() + << ", allreduce = " << rop->isAllreduce() << " )\n"; +} + +void IrPrinter::handle(const GroupedReductionOp* grouped_rop) { + indent() << "Grouped reduction(\n"; + ++indent_size_; + for (const auto i : c10::irange(grouped_rop->numReductions())) { + indent() << grouped_rop->output(i) << " = reduction( " + << grouped_rop->input(i) + << ", op = " << grouped_rop->getReductionOpType(i) + << ", initial value = " << grouped_rop->initVal(i) << " )\n"; + } + indent() << "allreduce = " << (grouped_rop->isAllreduce() ? 
"true" : "false") + << " )\n"; + --indent_size_; } void IrPrinter::handle(const WelfordOp* wop) { - indent(); - os_ << wop->outAvg() << "(Avg),\n" - << wop->outVar() << "(Var),\n" - << wop->outN() << "(Count)" - << "\n = Welford ( "; + indent() << wop->outAvg() << "(Avg),\n" + << wop->outVar() << "(Var),\n" + << wop->outN() << "(Count)" + << "\n = Welford ( "; if (wop->singleValue()) { os_ << wop->inAvg() << "(Avg), "; } else { @@ -349,28 +426,59 @@ void IrPrinter::handle(const WelfordOp* wop) { os_ << "\n initial value = " << wop->initAvg() << "(Avg)\n " << wop->initVar() << "(Var)\n " << wop->initN() << "(N)"; } + os_ << "\n allreduce = " << wop->isAllreduce(); os_ << " )\n"; } void IrPrinter::handle(const BroadcastOp* bop) { - indent(); - os_ << bop->out() << " = broadcast( " << bop->in() << " )\n"; + indent() << bop->out() << "\n"; + indent() << " = broadcast( " << bop->in() << " )\n"; +} + +void IrPrinter::handle(const Split* s) { + os_ << (s->innerSplit() ? "Split: " : "Outer split: "); + handle(s->in()); + os_ << " by factor " << s->factor() << " -> "; + handle(s->outer()); + os_ << ", "; + handle(s->inner()); + if (s->startOffset()) { + os_ << ", start offset: "; + handle(s->startOffset()); + } + if (s->stopOffset()) { + os_ << ", stop offset: "; + handle(s->stopOffset()); + } + os_ << "\n"; +} + +void IrPrinter::handle(const Merge* m) { + os_ << "Merge: "; + handle(m->outer()); + os_ << " and "; + handle(m->inner()); + os_ << " -> "; + handle(m->out()); + os_ << "\n"; } void IrPrinter::handle(const TransposeOp* top) { - indent(); - os_ << top->out() << " = transpose( " << top->in() << " )\n"; + indent() << top->out() << " = transpose( " << top->in() << " )\n"; } void IrPrinter::handle(const ShiftOp* sop) { - indent(); - os_ << sop->out() << " = shift( " << sop->in() << ", {" << sop->offsets() - << "}, padding = " << (sop->pad() ? 
"true" : "false") << " )\n"; + indent() << sop->out() << " = shift( " << sop->in() << ", {" << sop->offsets() + << "}, {" << sop->padWidth() << "} )\n"; +} + +void IrPrinter::handle(const MmaOp* mma) { + indent() << mma->out() << " = mma(" << mma->inA() << "," << mma->inB(); + os_ << ")\n"; } void IrPrinter::handle(const GatherOp* op) { - indent(); - os_ << op->out() << " = gather( " << op->in() << ", {"; + indent() << op->out() << " = gather( " << op->in() << ", {"; bool no_comma = true; for (const auto& s : op->windowShape()) { if (!no_comma) { @@ -391,37 +499,261 @@ void IrPrinter::handle(const GatherOp* op) { os_ << "} )\n"; } +void IrPrinter::handle(const ViewAsScalar* top) { + indent() << top->out() << " = view_as_scalar( " << top->in() << ", " + << top->vector_id() << " )\n"; +} + void IrPrinter::handle(const ViewOp* top) { + indent() << top->out() << " = view( " << top->in() << " )\n"; +} + +void IrPrinter::handle(const kir::Predicate* node) { + switch (node->predicate_type()) { + case PredicateType::Manual: { + os_ << node->value(); + break; + } + default: + os_ << node->predicate_type(); + break; + } +} + +void IrPrinter::handle(const kir::TensorIndex* ti) { + os_ << "T" << varName(ti); + switch (ti->view()->getMemoryType()) { + case MemoryType::Global: + os_ << "_g"; + break; + case MemoryType::Shared: + os_ << "_s"; + break; + case MemoryType::Local: + os_ << "_l"; + break; + } + os_ << "["; + for (auto index : ti->indices()) { + print_inline(index); + if (index != ti->indices().back()) { + os_ << ", "; + } + } + os_ << "]"; + os_ << " view( T" << varName(ti->view()) << " )"; +} + +void IrPrinter::handle(const kir::Allocate* node) { indent(); - os_ << top->out() << " = view( " << top->in() << " )\n"; + handle(node->buffer()); + os_ << " = ALLOCATE(" + << "mem_type=" << node->memoryType() << ", " + << "size="; + print_inline(node->size()); + os_ << ", " + << "zero_init=" << boolLiteral(node->zeroInit()) << ")\n"; + if (node->alias() != nullptr) { + indent() << kTab << ".alias="; + handle(node->alias()->buffer()); + os_ << "\n"; + } } -void IrPrinter::handle(const Split* s) { - os_ << (s->innerSplit() ? 
"Split: " : "Outer split: "); - handle(s->in()); - os_ << " by factor " << s->factor() << " -> "; - handle(s->outer()); - os_ << ", "; - handle(s->inner()); - if (s->startOffset()) { - os_ << ", start offset: "; - handle(s->startOffset()); +void IrPrinter::handle(const kir::BlockSync* node) { + indent() << "BLOCKSYNC(war_hazard=" << boolLiteral(node->isWarHazardSync()) + << ")\n"; +} + +void IrPrinter::handle(const kir::GridSync* node) { + indent() << "GRIDSYNC(" << node->syncDims().toString() << ", "; + handle(node->syncBuffer()); + os_ << ")\n"; +} + +void IrPrinter::handle(const kir::ForLoop* node) { + indent() << "FOR "; + handle(node->index()); + os_ << " in "; + handle(node->iter_domain()); + os_ << ":\n"; + handleScope(node->body()); +} + +void IrPrinter::handle(const kir::IfThenElse* node) { + indent() << "IF "; + handle(node->predicate()); + os_ << ":\n"; + handleScope(node->thenBody()); + if (node->hasElse()) { + indent() << "ELSE:\n"; + handleScope(node->elseBody()); } - if (s->stopOffset()) { - os_ << ", stop offset: "; - handle(s->stopOffset()); +} + +void IrPrinter::handle(const kir::GridBroadcast* node) { + const auto* broadcast_op = node->broadcast_op(); + indent(); + handle(broadcast_op->out()); + os_ << " = " + << "GRID_BROADCAST(in="; + handle(broadcast_op->in()); + os_ << ")\n"; + indent() << kTab << ".broadcast_buffer="; + handle(node->broadcast_buffer()->buffer()); + os_ << "\n"; + indent() << kTab << ".sync_buffer="; + handle(node->sync_buffer()->buffer()); + os_ << "\n"; +} + +void IrPrinter::handle(const kir::GridReduction* node) { + indent(); + handle(node->out()); + os_ << " = " + << "GRID_REDUCTION(op='" << node->getReductionOpType() << "'" + << ", in="; + handle(node->in()); + os_ << ", init="; + handle(node->init()); + os_ << ", read_pred="; + if (node->predicate() != nullptr) { + handle(node->predicate()); + } else { + os_ << "nullptr"; + } + os_ << ")\n"; + os_ << ", write_pred="; + if (node->writePredicate() != nullptr) { + handle(node->writePredicate()); + } else { + os_ << "nullptr"; } + os_ << ")\n"; + indent() << kTab << ".reduction_buffer="; + handle(node->reduction_buffer()->buffer()); + os_ << "\n"; + indent() << kTab << ".sync_buffer="; + handle(node->sync_buffer()->buffer()); os_ << "\n"; } -void IrPrinter::handle(const Merge* m) { - os_ << "Merge: "; - handle(m->outer()); - os_ << " and "; - handle(m->inner()); - os_ << " -> "; - handle(m->out()); +void IrPrinter::handle(const kir::GroupedGridReduction* node) { + indent() << "Grouped grid reduction(\n"; + ++indent_size_; + for (const auto i : c10::irange(node->numReductions())) { + indent(); + handle(node->output(i)); + os_ << " = " + << "reduction(op='" << node->getReductionOpType(i) << "'" + << ", in="; + handle(node->input(i)); + os_ << ", init="; + handle(node->initVal(i)); + os_ << "\n"; + } + indent() << kTab << ".read_pred="; + if (node->predicate() != nullptr) { + handle(node->predicate()); + } else { + os_ << "nullptr"; + } + os_ << "\n"; + indent() << kTab << ".write_pred="; + if (node->writePredicate() != nullptr) { + handle(node->writePredicate()); + } else { + os_ << "nullptr"; + } + os_ << "\n"; + for (const auto i : c10::irange(node->numReductions())) { + indent() << kTab << ".reduction_buffer="; + handle(node->reduction_buffers().at(i)->buffer()); + os_ << "\n"; + } + indent() << kTab << ".sync_buffer="; + handle(node->sync_buffer()->buffer()); + os_ << "\n"; +} + +void IrPrinter::handle(const kir::GridWelford* node) { + const auto* welford_op = node->welford_op(); + indent(); + 
handle(welford_op->outVar()); + os_ << ","; + handle(welford_op->outAvg()); + os_ << ","; + handle(welford_op->outN()); + os_ << " = " + << "GRID_WELFORD(" + << "inAvg="; + handle(welford_op->inAvg()); + if (!welford_op->inN()->isOneInt()) { + indent() << ", inVar="; + handle(welford_op->inVar()); + } + indent() << ", inN="; + handle(welford_op->inN()); + if (!welford_op->initN()->isZeroInt()) { + indent() << ", initVar="; + handle(welford_op->initVar()); + os_ << " initAvg="; + handle(welford_op->initAvg()); + os_ << " initN="; + handle(welford_op->initN()); + } + indent() << ", read_pred="; + if (welford_op->predicate() != nullptr) { + handle(welford_op->predicate()); + } else { + os_ << "nullptr"; + } + os_ << ")\n"; + indent() << ", write_pred="; + if (welford_op->writePredicate() != nullptr) { + handle(welford_op->writePredicate()); + } else { + os_ << "nullptr"; + } + os_ << ")\n"; + indent() << kTab << ".var_buffer="; + handle(node->var_buffer()->buffer()); + os_ << ".avg_buffer="; + handle(node->avg_buffer()->buffer()); + os_ << ".n_buffer="; + handle(node->N_buffer()->buffer()); + os_ << "\n"; + indent() << kTab << ".sync_buffer="; + handle(node->sync_buffer()->buffer()); os_ << "\n"; + indent() << kTab << ".grid_read_pred="; + if (node->predicate() != nullptr) { + handle(node->predicate()); + } else { + os_ << "nullptr"; + } + os_ << "\n"; + indent() << kTab << ".grid_write_pred="; + if (node->writePredicate() != nullptr) { + handle(node->writePredicate()); + } else { + os_ << "nullptr"; + } + os_ << "\n"; +} + +void IrPrinter::handle(const kir::InitMagicZero* node) { + indent() << "NVFUSER_DEFINE_MAGIC_ZERO\n"; +} + +void IrPrinter::handle(const kir::UpdateMagicZero* node) { + indent() << "NVFUSER_UPDATE_MAGIC_ZERO\n"; +} + +void IrPrinter::handle(const kir::AllocateFusedReduction* node) { + indent() << "AllocateFusedReduction(reduction buffer="; + handle(node->out()); + os_ << ")\n"; } void IrTransformPrinter::handle(Fusion* f) { @@ -450,7 +782,7 @@ void IrTransformPrinter::printTransforms(TensorView* tv) { os() << ")\n"; for (auto exp : all_exp) { - os() << " "; + os() << " "; IrPrinter::handle(exp); } } diff --git a/torch/csrc/jit/codegen/cuda/ir_iostream.h b/torch/csrc/jit/codegen/cuda/ir_iostream.h index c080c3f8f993..f5ccf6fc5ac9 100644 --- a/torch/csrc/jit/codegen/cuda/ir_iostream.h +++ b/torch/csrc/jit/codegen/cuda/ir_iostream.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include @@ -13,21 +13,30 @@ namespace jit { namespace fuser { namespace cuda { +class Fusion; +namespace kir { +class Kernel; +class Scope; +} // namespace kir + //! Define pretty printing functions for IR nodes //! //! This class is intended for debug printing, so it attempts //! to handle invalid states as well. //! 
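Concretely, the printer declared just below is meant to be constructed over any std::ostream and then pointed at the IR to dump. A minimal sketch of that use (not taken from this diff; it assumes a populated Fusion named `fusion` and that <iostream> is available):

    #include <iostream>

    void debugDump(Fusion& fusion) {
      IrPrinter printer(std::cout);
      printer.handle(&fusion);  // pretty-prints the fusion math IR
      // After lowering, the kir::Kernel* overload added in this change
      // accepts a kernel pointer in the same way.
    }
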
class TORCH_CUDA_CU_API IrPrinter : public OptInConstDispatch { + static constexpr char const* kTab = " "; + public: explicit IrPrinter(std::ostream& os) : os_(os) {} // Indent the generated code - void indent() { + std::ostream& indent() { for (const auto i : c10::irange(indent_size_)) { (void)i; // Suppress unused variable warning os_ << " "; } + return os_; } void resetIndent() { @@ -38,6 +47,8 @@ class TORCH_CUDA_CU_API IrPrinter : public OptInConstDispatch { return print_inline_; } + using OptInConstDispatch::handle; + virtual void handle(Fusion* f); // handle calls some non const fusion ops, @@ -52,30 +63,57 @@ class TORCH_CUDA_CU_API IrPrinter : public OptInConstDispatch { handle(&f); } - void handle(const Statement* s) override; - void handle(const Val* v) override; - void handle(const Expr* e) override; - - void handle(const TensorDomain*) override; - void handle(const TensorView*) override; - void handle(const IterDomain*) override; - - void handle(const Bool*) override; - void handle(const Double*) override; - void handle(const Int*) override; - void handle(const NamedScalar*) override; - - void handle(const UnaryOp*) override; - void handle(const BinaryOp*) override; - void handle(const TernaryOp*) override; - void handle(const ReductionOp*) override; - void handle(const WelfordOp*) override; - void handle(const BroadcastOp*) override; - void handle(const TransposeOp*) override; - void handle(const ShiftOp*) override; - void handle(const GatherOp*) override; - void handle(const ViewOp*) override; - + virtual void handle(const kir::Kernel* kernel); + virtual void handle(kir::Kernel& kernel); + + void handleScope(const kir::Scope& scope); + + void handle(const Statement* s) final; + void handle(const Val* v) final; + void handle(const Expr* e) final; + + void handle(const IterDomain*) final; + void handle(const TensorDomain*) final; + void handle(const TensorView*) final; + + void handle(const Bool*) final; + void handle(const Double*) final; + void handle(const Int*) final; + void handle(const ComplexDouble*) final; + void handle(const NamedScalar*) final; + + void handle(const UnaryOp*) final; + void handle(const BinaryOp*) final; + void handle(const TernaryOp*) final; + void handle(const ReductionOp*) final; + void handle(const GroupedReductionOp*) final; + void handle(const WelfordOp*) final; + void handle(const MmaOp*) final; + void handle(const BroadcastOp*) final; + void handle(const TransposeOp*) final; + void handle(const ShiftOp*) final; + void handle(const GatherOp*) final; + void handle(const ViewAsScalar*) final; + void handle(const ViewOp*) final; + + void handle(const kir::Predicate*) final; + void handle(const kir::TensorIndex*) final; + + void handle(const kir::GridBroadcast*) final; + void handle(const kir::GridReduction*) final; + void handle(const kir::GroupedGridReduction*) final; + void handle(const kir::GridWelford*) final; + void handle(const kir::ForLoop*) final; + void handle(const kir::IfThenElse*) final; + void handle(const kir::Allocate*) final; + void handle(const kir::BlockSync*) final; + void handle(const kir::GridSync*) final; + void handle(const kir::InitMagicZero*) final; + void handle(const kir::UpdateMagicZero*) final; + void handle(const kir::AllocateFusedReduction*) final; + + // IR math printer overrides these to prevent them from printing, keep + // override void handle(const Split*) override; void handle(const Merge*) override; diff --git a/torch/csrc/jit/codegen/cuda/ir_nodes.cpp b/torch/csrc/jit/codegen/cuda/ir_nodes.cpp index 
1465a88bef32..543fdd0941fa 100644 --- a/torch/csrc/jit/codegen/cuda/ir_nodes.cpp +++ b/torch/csrc/jit/codegen/cuda/ir_nodes.cpp @@ -4,7 +4,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -38,19 +40,19 @@ class ScalarCheck : OptInConstDispatch { } private: - void handle(const Bool* b) override { + void handle(const Bool* b) final { same_ = v1_->as()->sameAs(v2_->as()); } - void handle(const Double* d) override { + void handle(const Double* d) final { same_ = v1_->as()->sameAs(v2_->as()); } - void handle(const Int* i) override { + void handle(const Int* i) final { same_ = v1_->as()->sameAs(v2_->as()); } - void handle(const NamedScalar* ns) override { + void handle(const NamedScalar* ns) final { same_ = v1_->as()->sameAs(v2_->as()); } @@ -70,6 +72,16 @@ bool areEqualScalars(Val* v1, Val* v2) { return ScalarCheck::sameAs(v1, v2); } +Bool::Bool(IrBuilderPasskey passkey) + : Val(passkey, ValType::Scalar, DataType::Bool), + maybe_value_{c10::nullopt} {} + +Bool::Bool(IrBuilderPasskey passkey, bool value) + : Val(passkey, ValType::Scalar, DataType::Bool), maybe_value_{value} {} + +Bool::Bool(IrBuilderPasskey passkey, c10::optional value) + : Val(passkey, ValType::Scalar, DataType::Bool), maybe_value_{value} {} + Bool::Bool(const Bool* src, IrCloner* ir_cloner) : Val(src, ir_cloner), maybe_value_(src->maybe_value_) {} @@ -87,6 +99,16 @@ bool Bool::sameAs(const Statement* other) const { return false; } +Double::Double(IrBuilderPasskey passkey) + : Val(passkey, ValType::Scalar, DataType::Double), + maybe_value_{c10::nullopt} {} + +Double::Double(IrBuilderPasskey passkey, ScalarType value) + : Val(passkey, ValType::Scalar, DataType::Double), maybe_value_{value} {} + +Double::Double(IrBuilderPasskey passkey, c10::optional value) + : Val(passkey, ValType::Scalar, DataType::Double), maybe_value_{value} {} + Double::Double(const Double* src, IrCloner* ir_cloner) : Val(src, ir_cloner), maybe_value_(src->maybe_value_) {} @@ -103,6 +125,16 @@ bool Double::sameAs(const Statement* other) const { return false; } +Int::Int(IrBuilderPasskey passkey) + : Val(passkey, ValType::Scalar, DataType::Int), + maybe_value_{c10::nullopt} {} + +Int::Int(IrBuilderPasskey passkey, ScalarType value) + : Val(passkey, ValType::Scalar, DataType::Int), maybe_value_{value} {} + +Int::Int(IrBuilderPasskey passkey, c10::optional value) + : Val(passkey, ValType::Scalar, DataType::Int), maybe_value_{value} {} + Int::Int(const Int* src, IrCloner* ir_cloner) : Val(src, ir_cloner), maybe_value_(src->maybe_value_) {} @@ -120,11 +152,43 @@ bool Int::sameAs(const Statement* other) const { return false; } -UnaryOp::UnaryOp(UnaryOpType type, Val* out, Val* in) - : Expr(ExprType::UnaryOp), unary_op_type_{type}, out_{out}, in_{in} { +ComplexDouble::ComplexDouble(IrBuilderPasskey passkey) + : Val(passkey, ValType::Scalar, DataType::ComplexDouble), + maybe_value_{c10::nullopt} {} + +ComplexDouble::ComplexDouble(IrBuilderPasskey passkey, ScalarType value) + : Val(passkey, ValType::Scalar, DataType::ComplexDouble), + maybe_value_{value} {} + +ComplexDouble::ComplexDouble( + IrBuilderPasskey passkey, + c10::optional value) + : Val(passkey, ValType::Scalar, DataType::ComplexDouble), + maybe_value_{value} {} + +ComplexDouble::ComplexDouble(const ComplexDouble* src, IrCloner* ir_cloner) + : Val(src, ir_cloner), maybe_value_(src->maybe_value_) {} + +bool ComplexDouble::sameAs(const Statement* other) const { + if (this == other) { + return true; + } + if (!other->isA()) { + return false; + } + const auto 
other_complex = other->as(); + if (isConst() && other_complex->isConst()) + return *value() == *(other_complex->value()); + return false; +} + +UnaryOp::UnaryOp(IrBuilderPasskey passkey, UnaryOpType type, Val* out, Val* in) + : Expr(passkey, ExprType::UnaryOp), + unary_op_type_{type}, + out_{out}, + in_{in} { addOutput(out); addInput(in); - name_ = FusionGuard::getCurFusion()->registerExpr(this); } UnaryOp::UnaryOp(const UnaryOp* src, IrCloner* ir_cloner) @@ -146,8 +210,13 @@ bool UnaryOp::sameAs(const Statement* other) const { return Expr::sameAs(other); } -BinaryOp::BinaryOp(BinaryOpType type, Val* out, Val* lhs, Val* rhs) - : Expr(ExprType::BinaryOp), +BinaryOp::BinaryOp( + IrBuilderPasskey passkey, + BinaryOpType type, + Val* out, + Val* lhs, + Val* rhs) + : Expr(passkey, ExprType::BinaryOp), binary_op_type_{type}, out_{out}, lhs_{lhs}, @@ -155,7 +224,6 @@ BinaryOp::BinaryOp(BinaryOpType type, Val* out, Val* lhs, Val* rhs) addOutput(out); addInput(lhs); addInput(rhs); - name_ = FusionGuard::getCurFusion()->registerExpr(this); } BinaryOp::BinaryOp(const BinaryOp* src, IrCloner* ir_cloner) @@ -178,8 +246,14 @@ bool BinaryOp::sameAs(const Statement* other) const { return Expr::sameAs(other); } -TernaryOp::TernaryOp(TernaryOpType type, Val* out, Val* in1, Val* in2, Val* in3) - : Expr(ExprType::TernaryOp), +TernaryOp::TernaryOp( + IrBuilderPasskey passkey, + TernaryOpType type, + Val* out, + Val* in1, + Val* in2, + Val* in3) + : Expr(passkey, ExprType::TernaryOp), ternary_op_type_{type}, out_{out}, in1_{in1}, @@ -189,7 +263,6 @@ TernaryOp::TernaryOp(TernaryOpType type, Val* out, Val* in1, Val* in2, Val* in3) addInput(in1); addInput(in2); addInput(in3); - name_ = FusionGuard::getCurFusion()->registerExpr(this); } TernaryOp::TernaryOp(const TernaryOp* src, IrCloner* ir_cloner) @@ -213,8 +286,12 @@ bool TernaryOp::sameAs(const Statement* other) const { return Expr::sameAs(other); } -BroadcastOp::BroadcastOp(Val* out, Val* in, std::vector is_broadcast_dims) - : Expr(ExprType::BroadcastOp), +BroadcastOp::BroadcastOp( + IrBuilderPasskey passkey, + Val* out, + Val* in, + std::vector is_broadcast_dims) + : Expr(passkey, ExprType::BroadcastOp), out_(out), in_(in), is_broadcast_dims_(std::move(is_broadcast_dims)) { @@ -226,12 +303,18 @@ BroadcastOp::BroadcastOp(Val* out, Val* in, std::vector is_broadcast_dims) auto in_type = in->getValType().value(); TORCH_INTERNAL_ASSERT( - out_type == ValType::TensorView && in_type == ValType::TensorView, + (out_type == ValType::TensorView && in_type == ValType::TensorView) || + (out_type == ValType::TensorIndex && in_type == ValType::TensorIndex), "Cannot braodcast a non-tensor object."); addOutput(out); addInput(in); - name_ = FusionGuard::getCurFusion()->registerExpr(this); + + if (!out->isA() || !in->isA()) { + return; + } + + passkey.ir_container_->registerExpr(exprPasskey(), this); // This is a generic check that root dims of a consumer and producer match. // Maybe we shouldn't relegate it to this constructor. 
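The same refactoring repeats through the rest of this file: constructors gain an IrBuilderPasskey parameter and the trailing `name_ = FusionGuard::getCurFusion()->registerExpr(this);` lines disappear, because registration now happens when a node is created through the builder. A rough sketch of what creation looks like after this change (hypothetical values; it assumes an active FusionGuard and already-created Vals `out`, `lhs`, `rhs`):

    // Previously: auto* add = new BinaryOp(BinaryOpType::Add, out, lhs, rhs);
    // Now the builder supplies the passkey and registers the expression:
    auto* add = IrBuilder::create<BinaryOp>(BinaryOpType::Add, out, lhs, rhs);
    // An explicit container can also be passed as the first argument, as the
    // ir_utils.cpp hunks later in this diff do with expr->container().
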
@@ -294,37 +377,100 @@ bool BroadcastOp::sameAs(const Statement* other) const { } ReductionOp::ReductionOp( + IrBuilderPasskey passkey, BinaryOpType reduction_op_type, Val* init, Val* out, - Val* in) - : Expr(ExprType::ReductionOp), + Val* in, + bool is_allreduce, + ExprType expr_type) + : Expr(passkey, expr_type), reduction_op_type_(reduction_op_type), init_(init), out_(out), - in_(in) { - TORCH_CHECK(out->getValType().value() == ValType::TensorView); + in_(in), + is_allreduce_(is_allreduce) { + TORCH_CHECK( + out->getValType().value() == ValType::TensorView || + out->getValType().value() == ValType::TensorIndex); TORCH_INTERNAL_ASSERT( - in->getValType() == ValType::TensorView && - out->getValType() == ValType::TensorView, + (in->getValType() == ValType::TensorView && + out->getValType() == ValType::TensorView) || + (in->getValType() == ValType::TensorIndex && + out->getValType() == ValType::TensorIndex), "Reduction operation was created that does not have tensor inputs and outputs."); - TORCH_INTERNAL_ASSERT( - TensorDomain::noReductions(in->as()->getMaybeRFactorDomain()) - .size() == out->as()->getRootDomain().size(), - "Reduction operation created with mismatched domains."); - + if (in->isA()) { + TORCH_INTERNAL_ASSERT( + TensorDomain::noReductions( + in->as()->getMaybeRFactorDomain()) + .size() == out->as()->getRootDomain().size(), + "Reduction operation created with mismatched domains."); + } TORCH_INTERNAL_ASSERT( init->isConstScalar(), "Tried to create a reduction operation whith an initial value that isn't a constant."); addOutput(out); addInput(in); - name_ = FusionGuard::getCurFusion()->registerExpr(this); +} + +GroupedReductionOp::GroupedReductionOp( + IrBuilderPasskey passkey, + std::vector reduction_op_types, + std::vector init_vals, + std::vector outputs, + std::vector inputs, + bool is_fused, + ExprType expr_type) + : Expr(passkey, expr_type), + reduction_op_types_(std::move(reduction_op_types)), + init_vals_(std::move(init_vals)), + is_allreduce_(is_fused) { + for (auto out : outputs) { + addOutput(out); + } + + for (auto in : inputs) { + addInput(in); + } +} + +GroupedReductionOp::GroupedReductionOp( + const GroupedReductionOp* src, + IrCloner* ir_cloner) + : Expr(src, ir_cloner), + reduction_op_types_(src->reduction_op_types_), + init_vals_(ir_cloner->clone(src->init_vals_)), + is_allreduce_(src->is_allreduce_) {} + +bool GroupedReductionOp::sameAs(const Statement* other) const { + if (this == other) { + return true; + } + + auto grouped_rop = dynamic_cast(other); + if (grouped_rop == nullptr) { + return false; + } + + if (!Expr::sameAs(other) || + getReductionOpTypes() != grouped_rop->getReductionOpTypes()) { + return false; + } + + for (const auto i : c10::irange(numReductions())) { + if (!initVal(i)->sameAs(grouped_rop->initVal(i))) { + return false; + } + } + + return true; } WelfordOp::WelfordOp( + IrBuilderPasskey passkey, Val* out_avg, Val* out_var, Val* out_N, @@ -333,8 +479,9 @@ WelfordOp::WelfordOp( Val* init_N, Val* in_avg, Val* in_var, - Val* in_N) - : Expr(ExprType::WelfordOp), + Val* in_N, + bool is_fused) + : Expr(passkey, ExprType::WelfordOp), out_avg_(out_avg), out_var_(out_var), out_N_(out_N), @@ -342,12 +489,19 @@ WelfordOp::WelfordOp( init_var_(init_var), init_N_(init_N), in_avg_(in_avg), - in_var_(in_var), - in_N_(in_N) { + in_var_(in_var == nullptr ? 
in_avg->container()->zeroVal() : in_var), + in_N_(in_N), + is_allreduce_(is_fused) { // Check output type - TORCH_INTERNAL_ASSERT(out_avg->getValType().value() == ValType::TensorView); - TORCH_INTERNAL_ASSERT(out_var->getValType().value() == ValType::TensorView); - TORCH_INTERNAL_ASSERT(out_N->getValType().value() == ValType::TensorView); + TORCH_INTERNAL_ASSERT( + out_avg->getValType().value() == ValType::TensorView || + out_avg->getValType().value() == ValType::TensorIndex); + TORCH_INTERNAL_ASSERT( + out_var->getValType().value() == ValType::TensorView || + out_var->getValType().value() == ValType::TensorIndex); + TORCH_INTERNAL_ASSERT( + out_N->getValType().value() == ValType::TensorView || + out_N->getValType().value() == ValType::TensorIndex); // check initial value TORCH_INTERNAL_ASSERT(init_N->getValType().value() == ValType::Scalar); @@ -356,36 +510,48 @@ WelfordOp::WelfordOp( // initial value with a count of 1 is un-common enough that I'll push // the responsibility of creating all-zero var tensors to the user TORCH_INTERNAL_ASSERT( - init_avg && init_avg->getValType().value() == ValType::TensorView); + init_avg && + (init_avg->getValType().value() == ValType::TensorView || + init_avg->getValType().value() == ValType::TensorIndex)); TORCH_INTERNAL_ASSERT( - init_var && init_var->getValType().value() == ValType::TensorView); + init_var && + (init_var->getValType().value() == ValType::TensorView || + init_var->getValType().value() == ValType::TensorIndex)); } TORCH_INTERNAL_ASSERT( - in_avg && in_avg->getValType().value() == ValType::TensorView); + in_avg && + (in_avg->getValType().value() == ValType::TensorView || + in_avg->getValType().value() == ValType::TensorIndex), + in_avg->getValType().value()); // check input TORCH_INTERNAL_ASSERT( in_N->getValType().value() == ValType::Scalar || - in_N->getValType().value() == ValType::TensorView); + in_N->getValType().value() == ValType::TensorView || + in_N->getValType().value() == ValType::TensorIndex); if (!in_N->isOneInt()) { // when input is only one value, only the value is required through avg // input the var part is implicitly 0 and codegen will handle that. TORCH_INTERNAL_ASSERT( - in_var && in_var->getValType().value() == ValType::TensorView); + in_var && + (in_var->getValType().value() == ValType::TensorView || + in_var->getValType().value() == ValType::TensorIndex)); + } else { + TORCH_INTERNAL_ASSERT( + in_var == nullptr || in_var->isZeroInt(), + "Invalid var input, which must be either nullptr or scalar zero when the N input is one."); } - addOutput(out_avg); - addOutput(out_var); - addOutput(out_N); + addOutput(out_avg_); + addOutput(out_var_); + addOutput(out_N_); - addInput(in_avg); - // Conditionally adding this input? - if (!in_N->isOneInt()) { - addInput(in_var); - } - addInput(in_N); - - name_ = FusionGuard::getCurFusion()->registerExpr(this); + addInput(in_avg_); + // Previously in_var_ was allowed to be null + TORCH_INTERNAL_ASSERT( + in_var_ != nullptr, "Welford var input nullptr not allowed"); + addInput(in_var_); + addInput(in_N_); } WelfordOp::WelfordOp(const WelfordOp* src, IrCloner* ir_cloner) @@ -398,7 +564,8 @@ WelfordOp::WelfordOp(const WelfordOp* src, IrCloner* ir_cloner) init_N_(ir_cloner->clone(src->init_N_)), in_avg_(ir_cloner->clone(src->in_avg_)), in_var_(src->in_var_ ? 
ir_cloner->clone(src->in_var_) : nullptr), - in_N_(ir_cloner->clone(src->in_N_)) {} + in_N_(ir_cloner->clone(src->in_N_)), + is_allreduce_(src->is_allreduce_) {} namespace { inline bool sameOptionalVal(Val* a, Val* b) { @@ -421,12 +588,80 @@ bool WelfordOp::sameAs(const Statement* other) const { return false; } +std::vector WelfordOp::getInitVals() const { + std::vector init_vals({init_avg_, init_var_, init_N_}); + return init_vals; +} + +MmaOp::MmaOp( + IrBuilderPasskey passkey, + Val* out, + Val* in_a, + Val* in_b, + Val* init) + : Expr(passkey, ExprType::MmaOp), + out_(out), + in_a_(in_a), + in_b_(in_b), + init_(init) { + // Check output type + TORCH_INTERNAL_ASSERT( + out->getValType().value() == ValType::TensorView || + out->getValType().value() == ValType::TensorIndex); + + TORCH_INTERNAL_ASSERT( + in_a->getValType().value() == ValType::TensorView || + in_a->getValType().value() == ValType::TensorIndex, + in_a->getValType().value()); + + TORCH_INTERNAL_ASSERT( + in_b->getValType().value() == ValType::TensorView || + in_b->getValType().value() == ValType::TensorIndex, + in_b->getValType().value()); + + addOutput(out); + addInput(in_a); + addInput(in_b); +} + +MmaOp::MmaOp( + IrBuilderPasskey passkey, + Val* out, + Val* in_a, + Val* in_b, + Val* init, + MmaOptions options) + : MmaOp(passkey, out, in_a, in_b, init) { + options_ = options; +} + +MmaOp::MmaOp(const MmaOp* src, IrCloner* ir_cloner) + : Expr(src, ir_cloner), + out_(ir_cloner->clone(src->out_)), + in_a_(ir_cloner->clone(src->in_a_)), + in_b_(ir_cloner->clone(src->in_b_)), + init_(ir_cloner->clone(src->init_)), + options_(src->options_) {} + +bool MmaOp::sameAs(const Statement* other) const { + if (this == other) { + return true; + } + if (auto other_mma = dynamic_cast(other)) { + return out_->sameAs(other_mma->out_) && in_a_->sameAs(other_mma->in_a_) && + in_b_->sameAs(other_mma->in_b_) && init_->sameAs(other_mma->init_) && + options_ == other_mma->options_; + } + return false; +} + ReductionOp::ReductionOp(const ReductionOp* src, IrCloner* ir_cloner) : Expr(src, ir_cloner), reduction_op_type_(src->reduction_op_type_), init_(ir_cloner->clone(src->init_)), out_(ir_cloner->clone(src->out_)), - in_(ir_cloner->clone(src->in_)) {} + in_(ir_cloner->clone(src->in_)), + is_allreduce_(src->is_allreduce_) {} bool ReductionOp::sameAs(const Statement* other) const { if (this == other) { @@ -444,10 +679,11 @@ bool ReductionOp::sameAs(const Statement* other) const { } TransposeOp::TransposeOp( + IrBuilderPasskey passkey, TensorView* out, TensorView* in, std::vector new2old) - : Expr(ExprType::TransposeOp), + : Expr(passkey, ExprType::TransposeOp), out_(out), in_(in), new2old_(std::move(new2old)) { @@ -481,7 +717,6 @@ TransposeOp::TransposeOp( addOutput(out); addInput(in); - name_ = FusionGuard::getCurFusion()->registerExpr(this); } TransposeOp::TransposeOp(const TransposeOp* src, IrCloner* ir_cloner) @@ -490,12 +725,17 @@ TransposeOp::TransposeOp(const TransposeOp* src, IrCloner* ir_cloner) in_(ir_cloner->clone(src->in_)), new2old_(src->new2old_) {} -ShiftOp::ShiftOp(Val* out, Val* in, std::vector offsets, bool pad) - : Expr(ExprType::ShiftOp), +ShiftOp::ShiftOp( + IrBuilderPasskey passkey, + Val* out, + Val* in, + std::vector offsets, + std::vector pad_width) + : Expr(passkey, ExprType::ShiftOp), out_(out), in_(in), offsets_(std::move(offsets)), - pad_(pad) { + pad_width_(std::move(pad_width)) { // clang-tidy complains about out_ that it may be null. 
TORCH_INTERNAL_ASSERT(out_ != nullptr); TORCH_INTERNAL_ASSERT(in_ != nullptr); @@ -514,9 +754,15 @@ ShiftOp::ShiftOp(Val* out, Val* in, std::vector offsets, bool pad) "Invalid offset vector: ", offsets_); + TORCH_INTERNAL_ASSERT( + pad_width_.size() == + TensorDomain::noReductions(in_->as()->getRootDomain()) + .size(), + "Invalid padding width vector: ", + pad_width_); + addOutput(out); addInput(in); - name_ = FusionGuard::getCurFusion()->registerExpr(this); } ShiftOp::ShiftOp(const ShiftOp* src, IrCloner* ir_cloner) @@ -524,7 +770,7 @@ ShiftOp::ShiftOp(const ShiftOp* src, IrCloner* ir_cloner) out_(ir_cloner->clone(src->out_)), in_(ir_cloner->clone(src->in_)), offsets_(src->offsets_), - pad_(src->pad_) {} + pad_width_(src->pad_width_) {} bool ShiftOp::sameAs(const Statement* other) const { if (this == other) { @@ -541,11 +787,12 @@ bool ShiftOp::sameAs(const Statement* other) const { } GatherOp::GatherOp( + IrBuilderPasskey passkey, Val* out, Val* in, - std::vector window_shape, - std::vector> pad_width) - : Expr(ExprType::GatherOp), + std::vector window_shape, + std::vector> pad_width) + : Expr(passkey, ExprType::GatherOp), out_(out), in_(in), window_shape_(std::move(window_shape)), @@ -578,28 +825,14 @@ GatherOp::GatherOp( addOutput(out); addInput(in); - name_ = FusionGuard::getCurFusion()->registerExpr(this); } GatherOp::GatherOp(const GatherOp* src, IrCloner* ir_cloner) : Expr(src, ir_cloner), out_(ir_cloner->clone(src->out_)), - in_(ir_cloner->clone(src->in_)) { - std::transform( - src->window_shape_.begin(), - src->window_shape_.end(), - std::back_inserter(window_shape_), - [&ir_cloner](const auto& x) { return ir_cloner->clone(x); }); - for (const auto& pad : src->pad_width_) { - std::vector pad_clone; - std::transform( - pad.begin(), - pad.end(), - std::back_inserter(pad_clone), - [&ir_cloner](const auto& x) { return ir_cloner->clone(x); }); - pad_width_.push_back(pad_clone); - } -} + in_(ir_cloner->clone(src->in_)), + window_shape_(src->window_shape_), + pad_width_(src->pad_width_) {} bool GatherOp::sameAs(const Statement* other) const { if (this == other) { @@ -609,23 +842,10 @@ bool GatherOp::sameAs(const Statement* other) const { return false; } const auto other_op = other->as(); - if (windowShape().size() != other_op->windowShape().size()) { - return false; - } - for (const auto i : c10::irange(windowShape().size())) { - if (!windowShape()[i]->sameAs(other_op->windowShape()[i])) { - return false; - } - } - if (padWidth().size() != other_op->padWidth().size()) { + if (windowShape() != other_op->windowShape() || + padWidth() != other_op->padWidth()) { return false; } - for (const auto i : c10::irange(padWidth().size())) { - if (!padWidth()[i][0]->sameAs(other_op->padWidth()[i][0]) || - !padWidth()[i][1]->sameAs(other_op->padWidth()[i][1])) { - return false; - } - } return Expr::sameAs(other); } @@ -638,11 +858,32 @@ int GatherOp::gatherAxis(int axis) const { return int(windowShape().size()) + axis; } -ViewOp::ViewOp(TensorView* out, TensorView* in) - : Expr(ExprType::ViewOp), out_(out), in_(in) { +ViewAsScalar::ViewAsScalar( + IrBuilderPasskey passkey, + Val* out, + Val* in, + IterDomain* vector_id, + Val* index) + : Expr(passkey, ExprType::ViewAsScalar), + out_(out), + in_(in), + vector_id_(vector_id), + index_(index) { + addOutput(out); + addInput(in); +} + +ViewAsScalar::ViewAsScalar(const ViewAsScalar* src, IrCloner* ir_cloner) + : Expr(src, ir_cloner), + out_(ir_cloner->clone(src->out_)), + in_(ir_cloner->clone(src->in_)), + vector_id_(ir_cloner->clone(src->vector_id_)), + 
index_(ir_cloner->clone(src->index_)) {} + +ViewOp::ViewOp(IrBuilderPasskey passkey, TensorView* out, TensorView* in) + : Expr(passkey, ExprType::ViewOp), out_(out), in_(in) { addOutput(out); addInput(in); - name_ = FusionGuard::getCurFusion()->registerExpr(this); } ViewOp::ViewOp(const ViewOp* src, IrCloner* ir_cloner) @@ -651,33 +892,50 @@ ViewOp::ViewOp(const ViewOp* src, IrCloner* ir_cloner) in_(ir_cloner->clone(src->in_)) {} IterDomain::IterDomain( + IrBuilderPasskey passkey, Val* start, Val* extent, ParallelType parallel_type, IterType iter_type, - bool is_rfactor_domain) + bool is_rfactor_domain, + bool is_padded_dimension, + c10::optional padded_to_size, + bool is_mma_swizzled) : IterDomain( + passkey, start, extent, nullptr, parallel_type, iter_type, - is_rfactor_domain) {} + is_rfactor_domain, + is_padded_dimension, + padded_to_size, + is_mma_swizzled) {} IterDomain::IterDomain( + IrBuilderPasskey passkey, Val* start, Val* extent, Val* stop_offset, ParallelType parallel_type, IterType iter_type, - bool is_rfactor_domain) - : Val(ValType::IterDomain, DataType::Int, false), + bool is_rfactor_domain, + bool is_padded_dimension, + c10::optional padded_to_size, + bool is_mma_swizzled) + : Val(passkey, ValType::IterDomain, DataType::Int), start_(start), extent_(extent), - stop_offset_(stop_offset == nullptr ? new Int(0) : stop_offset), + stop_offset_( + stop_offset == nullptr ? passkey.ir_container_->zeroVal() + : stop_offset), parallel_type_(parallel_type), iter_type_(iter_type), - is_rfactor_domain_(is_rfactor_domain) { + is_rfactor_domain_(is_rfactor_domain), + is_padded_dimension_(is_padded_dimension), + padded_to_size_(padded_to_size), + is_mma_swizzled_(is_mma_swizzled) { TORCH_CHECK( !(isRFactorProduct() && isBroadcast()), "IterDomain cannot be both a broadcast and rfactor domain."); @@ -693,8 +951,6 @@ IterDomain::IterDomain( "Cannot create an iter domain with a start that is not an int but received ", start, " ."); - - name_ = fusion_->registerVal(this); } IterDomain::IterDomain(const IterDomain* src, IrCloner* ir_cloner) @@ -706,7 +962,8 @@ IterDomain::IterDomain(const IterDomain* src, IrCloner* ir_cloner) iter_type_(src->iter_type_), is_rfactor_domain_(src->is_rfactor_domain_), is_padded_dimension_(src->is_padded_dimension_), - padded_to_size_(src->padded_to_size_) {} + padded_to_size_(src->padded_to_size_), + is_mma_swizzled_(src->is_mma_swizzled_) {} bool IterDomain::sameAs(const Statement* other) const { if (other == this) { @@ -720,7 +977,8 @@ bool IterDomain::sameAs(const Statement* other) const { const IterDomain* other_id = other->as(); bool is_same = isReduction() == other_id->isReduction() && - getParallelType() == other_id->getParallelType(); + getParallelType() == other_id->getParallelType() && + isVectorComponent() == other_id->isVectorComponent(); is_same = is_same && ScalarCheck::sameAs(extent(), other_id->extent()); is_same = is_same && ScalarCheck::sameAs(start(), other_id->start()); is_same = @@ -729,6 +987,24 @@ bool IterDomain::sameAs(const Statement* other) const { return is_same; } +// Returns a new IterDomain matching properties of this except for +// is_rfactor_domain_ +IterDomain* IterDomain::cloneWithoutRFactor() const { + auto cloned = IrBuilder::create( + ir_container_, + start(), + extent(), + stopOffset(), + getParallelType(), + getIterType(), + false, + is_padded_dimension_, + padded_to_size_, + is_mma_swizzled_); + + return cloned; +} + std::vector IterDomain::clone( const std::vector& domains) { std::vector cloned_domains; @@ -736,7 +1012,7 @@ 
std::vector IterDomain::clone( domains.begin(), domains.end(), std::back_inserter(cloned_domains), - [](auto id) { return id->clone(); }); + [](auto id) { return id->cloneWithoutRFactor(); }); return cloned_domains; } @@ -781,14 +1057,15 @@ IterDomain* IterDomain::merge(IterDomain* outer, IterDomain* inner) { itype = IterType::Iteration; } - IterDomain* merged_id = new IterDomain( - new Int(0), + IterDomain* merged_id = IrBuilder::create( + outer->container(), + outer->container()->zeroVal(), merged_id_size->as(), outer->getParallelType(), itype, outer->isRFactorProduct() || inner->isRFactorProduct()); - new Merge(merged_id, outer, inner); + IrBuilder::create(outer->container(), merged_id, outer, inner); return merged_id; } @@ -811,7 +1088,8 @@ std::pair IterDomain::split( if (factor->getValType() == ValType::Scalar) { TORCH_CHECK( factor->isConstScalar() || - FusionGuard::getCurFusion()->hasInput(factor), + (FusionGuard::getCurFusion() == factor->fusion() && + factor->isFusionInput()), factor, " is not a constant nor an input. It must be one or the other to be used in a split.", " If you want a symbolic split based on a thread dimension please use IterDomain::split(IterDomain*, ParallelType);"); @@ -832,24 +1110,33 @@ std::pair IterDomain::split( in->definition() == nullptr, "Partial split is only allowed with root domains"); } - // outer loop IterDomain - IterDomain* ido = new IterDomain( - new Int(0), + IterDomain* ido = IrBuilder::create( + in->container(), + in->container()->zeroVal(), inner_split ? remainder->as() : factor, in->getParallelType(), in->getIterType(), in->isRFactorProduct()); // inner loop IterDomain - IterDomain* idi = new IterDomain( - new Int(0), + IterDomain* idi = IrBuilder::create( + in->container(), + in->container()->zeroVal(), inner_split ? factor : remainder->as(), in->getParallelType(), in->getIterType(), in->isRFactorProduct()); - new Split(ido, idi, in, factor, inner_split, start_offset, stop_offset); + IrBuilder::create( + in->container(), + ido, + idi, + in, + factor, + inner_split, + start_offset, + stop_offset); return {ido, idi}; } @@ -864,7 +1151,9 @@ std::pair IterDomain::split( } std::pair IterDomain::stridedSplit(int factor) { - auto split_out = IterDomain::split(this, new Int(factor), true); + // Use partial split so that only valid values are retained + auto split_out = IterDomain::split( + this, IrBuilder::create(container(), factor), true, true); split_out.second->iter_type_ = IterType::Stride; split_out.first->is_rfactor_domain_ = true; @@ -877,7 +1166,11 @@ std::pair IterDomain::stridedSplit(int factor) { // vectorize to the left of the computeAt domain, and could allow us to do some // simple validation of vectorize as it's inputs are right most and contiguous. void IterDomain::parallelize(ParallelType t) { - parallel_type_ = t; + if (parallel_type_ == t) { + // No op, don't do any more checks, it was already set to this value. 
+ return; + } + if (t == ParallelType::Unroll || isParallelTypeVectorize(t)) { TORCH_CHECK( start()->isZeroInt() && extent()->isConstScalar(), @@ -888,6 +1181,14 @@ void IterDomain::parallelize(ParallelType t) { extent(), " ."); } + + if (isMmaSwizzled()) { + TORCH_CHECK( + t == ParallelType::Vectorize, + "Parallel type other than vectorize not allowed for warp mapped ids"); + } + + parallel_type_ = t; } bool IterDomain::maybePartial() const { @@ -907,9 +1208,10 @@ Val* IterDomain::stop() const { } TensorDomain::TensorDomain( + IrBuilderPasskey passkey, std::vector root_domain, std::vector contiguity) - : Val(ValType::TensorDomain, DataType::Null, false), + : Val(passkey, ValType::TensorDomain, DataType::Null), root_domain_(std::move(root_domain)), contiguity_( contiguity.empty() ? std::vector(root_domain_.size(), false) @@ -925,14 +1227,14 @@ TensorDomain::TensorDomain( has_nontrivial_reduction_ = false; domain_ = root_domain_; resetDomains(); - name_ = fusion_->registerVal(this); } TensorDomain::TensorDomain( + IrBuilderPasskey passkey, std::vector root_domain, std::vector domain, std::vector contiguity) - : Val(ValType::TensorDomain, DataType::Null, false), + : Val(passkey, ValType::TensorDomain, DataType::Null), root_domain_(std::move(root_domain)), domain_(std::move(domain)), contiguity_( @@ -963,15 +1265,15 @@ TensorDomain::TensorDomain( // Just due to clang-tidy, correct value set in resetDomains has_nontrivial_reduction_ = false; resetDomains(); - name_ = fusion_->registerVal(this); } TensorDomain::TensorDomain( + IrBuilderPasskey passkey, std::vector root_domain, std::vector rfactor_domain, std::vector domain, std::vector contiguity) - : Val(ValType::TensorDomain, DataType::Null, false), + : Val(passkey, ValType::TensorDomain, DataType::Null), root_domain_(std::move(root_domain)), domain_(std::move(domain)), rfactor_domain_(std::move(rfactor_domain)), @@ -1013,7 +1315,6 @@ TensorDomain::TensorDomain( // Just due to clang-tidy, correct value set in resetDomains has_nontrivial_reduction_ = false; resetDomains(); - name_ = fusion_->registerVal(this); } TensorDomain::TensorDomain(const TensorDomain* src, IrCloner* ir_cloner) @@ -1026,6 +1327,30 @@ TensorDomain::TensorDomain(const TensorDomain* src, IrCloner* ir_cloner) contiguity_(src->contiguity()), has_nontrivial_reduction_(src->has_nontrivial_reduction_) {} +namespace { +std::vector lowerIterDomains( + const std::vector& domains) { + std::vector lowered_domains; + lowered_domains.reserve(domains.size()); + for (const auto iter_domain : domains) { + lowered_domains.push_back(iter_domain); + } + return lowered_domains; +}; +} // namespace + +bool TensorDomain::hasBlockBroadcast() const { + return std::any_of(domain_.begin(), domain_.end(), [](IterDomain* id) { + return id->isBroadcast() && id->isThreadDim(); + }); +} + +bool TensorDomain::hasGridBroadcast() const { + return std::any_of(domain_.begin(), domain_.end(), [](IterDomain* id) { + return id->isBroadcast() && id->isBlockDim(); + }); +} + bool TensorDomain::operator==(const TensorDomain& other) const { // Checks equality of each class field. 
Should not be necessary to // check no_bcast_domain_ and no_reduction_domain_ as they are just @@ -1123,6 +1448,22 @@ bool TensorDomain::hasRFactor() const { return !rfactor_domain_.empty(); } +bool TensorDomain::hasViewLikeRFactor() const { + if (!hasRFactor()) { + // Can't have view like rfactor if there is no rfactor domain + return false; + } + + // If there's an rfactor domain and no rfactor product is a reduction, this is + // a view like rfactor + return std::none_of( + getMaybeRFactorDomain().begin(), + getMaybeRFactorDomain().end(), + [](IterDomain* id) { + return id->isReduction() && id->isRFactorProduct(); + }); +} + bool TensorDomain::hasVectorize() const { return std::any_of(domain_.begin(), domain_.end(), [](IterDomain* id) { return id->getParallelType() == ParallelType::Vectorize || @@ -1200,6 +1541,10 @@ void TensorDomain::split( "Partial split is only allowed with root domains"); } + TORCH_INTERNAL_ASSERT( + !id->isMmaSwizzled(), + "Further transformation on warp mapped id's not allowed."); + auto split_ids = IterDomain::split(id, factor, inner_split, trim_out_of_bounds); domain_.erase(domain_.begin() + axis_); @@ -1235,6 +1580,10 @@ void TensorDomain::merge(int axis_o, int axis_i) { IterDomain* first = axis(axis_o); IterDomain* second = axis(axis_i); + TORCH_INTERNAL_ASSERT( + !first->isMmaSwizzled() && !second->isMmaSwizzled(), + "Further transformation on warp mapped id's not allowed."); + IterDomain* merged_id = IterDomain::merge(first, second); domain_.erase(domain_.begin() + axis_i); @@ -1339,6 +1688,52 @@ TensorDomain* TensorDomain::view( return transformView(this, transforms); } +TensorDomain* TensorDomain::flatten(int64_t start_dim, int64_t end_dim) { + if (start_dim < 0) { + start_dim += nDims(); + } + if (end_dim < 0) { + end_dim += nDims(); + } + + std::vector new_root_domain; + auto inp_domain = noReductions(getMaybeRFactorDomain()); + new_root_domain.reserve(inp_domain.size()); + for (auto id : inp_domain) { + new_root_domain.push_back(id->cloneWithoutRFactor()); + } + + std::vector rfactor_domain; + rfactor_domain.reserve(new_root_domain.size() - (end_dim - start_dim)); + for (auto i : c10::irange(start_dim)) { + rfactor_domain.push_back(new_root_domain[i]); + } + + IterDomain* merged_id = new_root_domain[start_dim]; + for (auto i : c10::irange(start_dim + 1, end_dim + 1)) { + IterDomain* new_merged_id = IrBuilder::create( + merged_id->container(), + merged_id->container()->zeroVal(), + mul(merged_id->extent(), new_root_domain[i]->extent()), + ParallelType::Serial, + IterType::Iteration, + true); + IrBuilder::create(new_merged_id, merged_id, new_root_domain[i]); + merged_id = new_merged_id; + } + rfactor_domain.push_back(merged_id); + + for (auto i : c10::irange(end_dim + 1, nDims())) { + rfactor_domain.push_back(new_root_domain[i]); + } + + return IrBuilder::create( + new_root_domain, + rfactor_domain, + rfactor_domain, + std::vector(rfactor_domain.size(), true)); +} + // TODO: Rfactor a Welford // pair is in order where second is the consumer of first @@ -1389,6 +1784,7 @@ std::pair TensorDomain::rFactor( } Split::Split( + IrBuilderPasskey passkey, IterDomain* outer, IterDomain* inner, IterDomain* in, @@ -1396,14 +1792,18 @@ Split::Split( bool inner_split, Val* start_offset, Val* stop_offset) - : Expr(ExprType::Split), + : Expr(passkey, ExprType::Split), outer_{outer}, inner_{inner}, in_{in}, factor_{factor}, inner_split_{inner_split}, - start_offset_{start_offset != nullptr ? start_offset : new Int(0)}, - stop_offset_{stop_offset != nullptr ? 
stop_offset : new Int(0)} { + start_offset_{ + start_offset != nullptr ? start_offset + : passkey.ir_container_->zeroVal()}, + stop_offset_{ + stop_offset != nullptr ? stop_offset + : passkey.ir_container_->zeroVal()} { TORCH_INTERNAL_ASSERT( factor_->isAnInt(), "Attempted to create a Split node with a non-integer factor."); @@ -1412,7 +1812,6 @@ Split::Split( addInput(in); // TODO add factor as an input, need to check Split::Split during validation // and need to check BestEffortReplay::findFirstMismatchedID addInput(factor); - name_ = FusionGuard::getCurFusion()->registerExpr(this); } Split::Split(const Split* src, IrCloner* ir_cloner) @@ -1453,12 +1852,15 @@ bool Split::sameAs(const Statement* other) const { stopOffset()->sameAs(other->as()->stopOffset()); } -Merge::Merge(IterDomain* out, IterDomain* outer, IterDomain* inner) - : Expr(ExprType::Merge), out_{out}, outer_{outer}, inner_{inner} { +Merge::Merge( + IrBuilderPasskey passkey, + IterDomain* out, + IterDomain* outer, + IterDomain* inner) + : Expr(passkey, ExprType::Merge), out_{out}, outer_{outer}, inner_{inner} { addOutput(out); addInput(outer); addInput(inner); - name_ = FusionGuard::getCurFusion()->registerExpr(this); } Merge::Merge(const Merge* src, IrCloner* ir_cloner) @@ -1477,6 +1879,12 @@ bool Merge::sameAs(const Statement* other) const { return Expr::sameAs(other); } +NamedScalar::NamedScalar( + IrBuilderPasskey passkey, + std::string name, + DataType dtype) + : Val(passkey, ValType::NamedScalar, dtype), name_(std::move(name)) {} + NamedScalar::NamedScalar(const NamedScalar* src, IrCloner* ir_cloner) : Val(src, ir_cloner), name_(src->name_) {} @@ -1495,13 +1903,15 @@ NamedScalar* NamedScalar::getParallelDim(ParallelType p_type) { isParallelTypeThread(p_type), "Cannot get parallel dim of non thread type, received: ", p_type); + TORCH_INTERNAL_ASSERT(FusionGuard::getCurFusion() != nullptr); std::string parallel_dim = stringifyThreadSize(p_type); - return new NamedScalar(parallel_dim, DataType::Int); + return IrBuilder::create(parallel_dim, DataType::Int); } NamedScalar* NamedScalar::getParallelIndex(ParallelType p_type) { + TORCH_INTERNAL_ASSERT(FusionGuard::getCurFusion() != nullptr); std::string parallel_ind = stringifyThread(p_type); - return new NamedScalar(parallel_ind, DataType::Int); + return IrBuilder::create(parallel_ind, DataType::Int); } c10::optional NamedScalar::getParallelDim() const { diff --git a/torch/csrc/jit/codegen/cuda/ir_printer.h b/torch/csrc/jit/codegen/cuda/ir_printer.h index a2c14386147e..91d07b76b805 100644 --- a/torch/csrc/jit/codegen/cuda/ir_printer.h +++ b/torch/csrc/jit/codegen/cuda/ir_printer.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/torch/csrc/jit/codegen/cuda/ir_utils.cpp b/torch/csrc/jit/codegen/cuda/ir_utils.cpp index 5bf05b0f516f..6415733ba39e 100644 --- a/torch/csrc/jit/codegen/cuda/ir_utils.cpp +++ b/torch/csrc/jit/codegen/cuda/ir_utils.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -140,7 +141,8 @@ struct SubstituteInExpr : public OptInDispatch { reference_->sameAs(unary_expr->in()) ? substitute_ : unary_expr->in(); auto out = reference_->sameAs(unary_expr->out()) ? substitute_ : unary_expr->out(); - expr_ = new UnaryOp(unary_expr->getUnaryOpType(), out, in); + expr_ = IrBuilder::create( + unary_expr->container(), unary_expr->getUnaryOpType(), out, in); } void handle(BinaryOp* binary_expr) final { @@ -151,7 +153,12 @@ struct SubstituteInExpr : public OptInDispatch { auto out = reference_->sameAs(binary_expr->out()) ? 
substitute_ : binary_expr->out(); - expr_ = new BinaryOp(binary_expr->getBinaryOpType(), out, lhs, rhs); + expr_ = IrBuilder::create( + binary_expr->container(), + binary_expr->getBinaryOpType(), + out, + lhs, + rhs); } void handle(TernaryOp* ternary_expr) final { @@ -163,7 +170,13 @@ struct SubstituteInExpr : public OptInDispatch { : ternary_expr->in3(); auto out = reference_->sameAs(ternary_expr->out()) ? substitute_ : ternary_expr->out(); - expr_ = new TernaryOp(ternary_expr->getTernaryOpType(), out, in1, in2, in3); + expr_ = IrBuilder::create( + ternary_expr->container(), + ternary_expr->getTernaryOpType(), + out, + in1, + in2, + in3); } void handle(ReductionOp* reduction_expr) final { @@ -176,8 +189,42 @@ struct SubstituteInExpr : public OptInDispatch { auto in = reference_->sameAs(reduction_expr->in()) ? substitute_ : reduction_expr->in(); - expr_ = - new ReductionOp(reduction_expr->getReductionOpType(), init, out, in); + expr_ = IrBuilder::create( + reduction_expr->container(), + reduction_expr->getReductionOpType(), + init, + out, + in); + } + + void handle(GroupedReductionOp* grouped_reduction_expr) final { + std::vector outputs; + std::transform( + grouped_reduction_expr->outputs().begin(), + grouped_reduction_expr->outputs().end(), + std::back_inserter(outputs), + [&](Val* val) { return reference_->sameAs(val) ? substitute_ : val; }); + + std::vector inputs; + std::transform( + grouped_reduction_expr->inputs().begin(), + grouped_reduction_expr->inputs().end(), + std::back_inserter(inputs), + [&](Val* val) { return reference_->sameAs(val) ? substitute_ : val; }); + + std::vector init_vals; + std::transform( + grouped_reduction_expr->initVals().begin(), + grouped_reduction_expr->initVals().end(), + std::back_inserter(init_vals), + [&](Val* val) { return reference_->sameAs(val) ? substitute_ : val; }); + + expr_ = IrBuilder::create( + grouped_reduction_expr->container(), + grouped_reduction_expr->getReductionOpTypes(), + init_vals, + outputs, + inputs); } void handle(BroadcastOp* broadcast_expr) final { @@ -187,7 +234,11 @@ struct SubstituteInExpr : public OptInDispatch { auto in = reference_->sameAs(broadcast_expr->in()) ? substitute_ : broadcast_expr->in(); - expr_ = new BroadcastOp(out, in, broadcast_expr->getBroadcastDimFlags()); + expr_ = IrBuilder::create( + broadcast_expr->container(), + out, + in, + broadcast_expr->getBroadcastDimFlags()); } void handle(TransposeOp* transpose_expr) final { @@ -201,7 +252,8 @@ struct SubstituteInExpr : public OptInDispatch { auto in = reference_->sameAs(transpose_expr->in()) ? substitute_->as() : transpose_expr->in(); - expr_ = new TransposeOp(out, in, transpose_expr->new2old()); + expr_ = IrBuilder::create( + transpose_expr->container(), out, in, transpose_expr->new2old()); } void handle(ShiftOp* shift_expr) final { @@ -210,7 +262,12 @@ struct SubstituteInExpr : public OptInDispatch { auto in = reference_->sameAs(shift_expr->in()) ? substitute_ : shift_expr->in(); - expr_ = new ShiftOp(out, in, shift_expr->offsets(), shift_expr->pad()); + expr_ = IrBuilder::create( + shift_expr->container(), + out, + in, + shift_expr->offsets(), + shift_expr->padWidth()); } void handle(GatherOp* gather_expr) final { @@ -219,8 +276,25 @@ struct SubstituteInExpr : public OptInDispatch { auto in = reference_->sameAs(gather_expr->in()) ? 
substitute_ : gather_expr->in(); - expr_ = new GatherOp( - out, in, gather_expr->windowShape(), gather_expr->padWidth()); + expr_ = IrBuilder::create( + gather_expr->container(), + out, + in, + gather_expr->windowShape(), + gather_expr->padWidth()); + } + + void handle(ViewAsScalar* expr) final { + TORCH_INTERNAL_ASSERT( + substitute_->isA(), + "All args to view must be TensorView, but received a non-TensorView for replacement: ", + substitute_); + auto in = reference_->sameAs(expr->in()) ? substitute_->as() + : expr->in(); + auto out = reference_->sameAs(expr->out()) ? substitute_->as() + : expr->out(); + expr_ = IrBuilder::create( + expr->container(), out, in, expr->vector_id(), expr->index()); } void handle(ViewOp* view_expr) final { @@ -234,7 +308,7 @@ struct SubstituteInExpr : public OptInDispatch { auto out = reference_->sameAs(view_expr->out()) ? substitute_->as() : view_expr->out(); - expr_ = new ViewOp(out, in); + expr_ = IrBuilder::create(view_expr->container(), out, in); } void handle(WelfordOp* welford_expr) final { @@ -268,7 +342,8 @@ struct SubstituteInExpr : public OptInDispatch { welford_expr->initN() && reference_->sameAs(welford_expr->initN()) ? substitute_ : welford_expr->initN(); - expr_ = new WelfordOp( + expr_ = IrBuilder::create( + welford_expr->container(), out_avg, out_var, out_N, @@ -277,7 +352,29 @@ struct SubstituteInExpr : public OptInDispatch { init_N, in_avg, in_var, - in_N); + in_N, + welford_expr->isAllreduce()); + } + + void handle(MmaOp* mma_expr) final { + TORCH_INTERNAL_ASSERT( + substitute_->isA(), + "All args to MmaOp must be TensorView, but received a non-TensorView for replacement: ", + substitute_); + auto in_a = reference_->sameAs(mma_expr->inA()) + ? substitute_->as() + : mma_expr->inA(); + auto in_b = reference_->sameAs(mma_expr->inB()) + ? substitute_->as() + : mma_expr->inB(); + auto out = reference_->sameAs(mma_expr->out()) + ? substitute_->as() + : mma_expr->out(); + auto init = reference_->sameAs(mma_expr->init()) + ? 
substitute_->as() + : mma_expr->init(); + expr_ = IrBuilder::create( + mma_expr->container(), out, in_a, in_b, init, mma_expr->options()); } private: @@ -307,23 +404,24 @@ TensorView* rfactorHelper( auto w_var = welford->outVar()->as(); auto w_n = welford->outN()->as(); - WelfordResult rtvs = reduction_tv->rFactor(axes, w_avg, w_var, w_n); + auto rtvs = + reduction_tv->rFactor(axes, std::vector{w_avg, w_var, w_n}); if (reduction_tv == w_n) { - return rtvs.n; + return rtvs.at(2); } else if (reduction_tv == w_var) { - return rtvs.var_sum; + return rtvs.at(1); } else { - return rtvs.avg; + return rtvs.at(0); } } namespace { -std::vector uniqueEntries( - const std::vector& tv_deuqe) { - std::vector unique_entries; - std::unordered_set inserted; +template +std::vector uniqueEntries(const std::vector& tv_deuqe) { + std::vector unique_entries; + std::unordered_set inserted; for (auto tv_entry : tv_deuqe) { if (inserted.emplace(tv_entry).second) { unique_entries.emplace_back(tv_entry); @@ -334,13 +432,59 @@ std::vector uniqueEntries( } // namespace +// Return immediate producers of val +TORCH_CUDA_CU_API std::vector producerValsOf(Val* val) { + if (val->definition() == nullptr) { + return {}; + } + auto producer_vals = val->definition()->inputs(); + return uniqueEntries({producer_vals.begin(), producer_vals.end()}); +} + +// Return immediate consumers of val +TORCH_CUDA_CU_API std::vector consumerValsOf(Val* val) { + std::vector consumer_vals; + for (auto use_expr : val->uses()) { + auto outputs = use_expr->outputs(); + consumer_vals.insert(consumer_vals.end(), outputs.begin(), outputs.end()); + } + return uniqueEntries(consumer_vals); +} + +// Return immediate producers of val +TORCH_CUDA_CU_API std::vector producerValsOf( + const std::vector& vals) { + std::vector all_producer_vals; + for (auto val : vals) { + auto producer_vals = producerValsOf(val); + all_producer_vals.insert( + all_producer_vals.end(), producer_vals.begin(), producer_vals.end()); + } + + return uniqueEntries(all_producer_vals); +} + +// Return immediate consumers of val +TORCH_CUDA_CU_API std::vector consumerValsOf( + const std::vector& vals) { + std::vector all_consumer_vals; + for (auto val : vals) { + auto consumer_vals = consumerValsOf(val); + all_consumer_vals.insert( + all_consumer_vals.end(), consumer_vals.begin(), consumer_vals.end()); + } + + return uniqueEntries(all_consumer_vals); +} + std::vector producerTvsOf(TensorView* tv) { if (tv->definition() == nullptr) { return {}; } auto producer_vals = ir_utils::filterByType(tv->definition()->inputs()); - return uniqueEntries({producer_vals.begin(), producer_vals.end()}); + return uniqueEntries( + {producer_vals.begin(), producer_vals.end()}); } std::vector consumerTvsOf(TensorView* tv) { @@ -349,7 +493,7 @@ std::vector consumerTvsOf(TensorView* tv) { auto outputs = ir_utils::filterByType(use_expr->outputs()); consumer_tvs.insert(consumer_tvs.end(), outputs.begin(), outputs.end()); } - return uniqueEntries(consumer_tvs); + return uniqueEntries(consumer_tvs); } std::vector producerTvsOf(const std::vector& tvs) { @@ -360,7 +504,7 @@ std::vector producerTvsOf(const std::vector& tvs) { all_producer_tvs.end(), producer_tvs.begin(), producer_tvs.end()); } - return uniqueEntries(all_producer_tvs); + return uniqueEntries(all_producer_tvs); } std::vector consumerTvsOf(const std::vector& tvs) { @@ -371,7 +515,7 @@ std::vector consumerTvsOf(const std::vector& tvs) { all_consumer_tvs.end(), consumer_tvs.begin(), consumer_tvs.end()); } - return uniqueEntries(all_consumer_tvs); + 
return uniqueEntries(all_consumer_tvs); } std::vector inputTvsOf(TensorView* tv) { @@ -386,29 +530,177 @@ std::vector inputTvsOf(std::vector tvs) { auto inp_vals = IterVisitor::getInputsTo({tvs.begin(), tvs.end()}); auto filtered = ir_utils::filterByType(inp_vals); std::vector inp_tvs(filtered.begin(), filtered.end()); - return uniqueEntries(inp_tvs); + return uniqueEntries(inp_tvs); } std::vector outputTvsOf(std::vector tvs) { auto out_vals = DependencyCheck::getAllOutputsOf({tvs.begin(), tvs.end()}); auto filtered = ir_utils::filterByType(out_vals); std::vector out_tvs(filtered.begin(), filtered.end()); - return uniqueEntries(out_tvs); + return uniqueEntries(out_tvs); } std::vector allTvs(Fusion* fusion) { auto used_vals = fusion->usedMathVals(); auto used_tvs = ir_utils::filterByType(used_vals); - return uniqueEntries({used_tvs.begin(), used_tvs.end()}); + + // This shouldn't be necessary but FusionSegmentIoAlias_CUDA due to aliasing + // is having an input disconnected from outputs, and these iter domains are + // being checked in compute at maps in scheduling logic. This shouldn't hurt + // AFAICT. + auto tv_inputs = ir_utils::filterByType(fusion->inputs()); + + std::vector all_tvs({used_tvs.begin(), used_tvs.end()}); + // Sometimes inputs are not connected to outputs, however, we still include + // them when returning allTvs because they are registered as an input. + all_tvs.insert(all_tvs.end(), tv_inputs.begin(), tv_inputs.end()); + + // all_tvs has duplicates, to deduplicate it and return + return uniqueEntries(all_tvs); } -std::vector historyOf(TensorDomain* td) { - return ExprSort::getExprs( - td->fusion(), {td->domain().begin(), td->domain().end()}); +std::vector getReductionOps(Fusion* fusion, bool ignore_trivial) { + std::vector red_ops; + + auto isReduction = [&ignore_trivial](Val* out_val) { + if (out_val == nullptr || !out_val->isA()) { + return false; + } + auto out_tv = out_val->as(); + return std::any_of( + out_tv->getRootDomain().begin(), + out_tv->getRootDomain().end(), + [&ignore_trivial](IterDomain* id) { + return id->isReduction() && + !(ignore_trivial && id->isTrivialReduction()); + }); + }; + + for (auto expr : fusion->exprs()) { + bool is_reduction = false; + if (expr->isA()) { + is_reduction = isReduction(expr->as()->out()); + } else if (expr->isA()) { + is_reduction = std::any_of( + expr->as()->outputs().begin(), + expr->as()->outputs().end(), + isReduction); + } else if (expr->isA()) { + is_reduction = isReduction(expr->as()->outAvg()); + } + if (is_reduction) { + red_ops.push_back(expr); + } + } + + return red_ops; } -std::vector historyOf(TensorView* tv) { - return historyOf(tv->domain()); +namespace { + +class ValReplacementMutator : private OptOutMutator { + public: + ValReplacementMutator( + Fusion* fusion, + const std::unordered_map& replacement_map) + : replacement_map_(replacement_map) { + FusionGuard fg(fusion); + + // Welford makes this a little annoying since it holds a count which is + // typically not used by anything else. If we don't grab that count, then it + // would be a tensorview that doesn't get updated extents. Therefore, first + // grab all leaves towards outputs and grab stmts from there. 
+ auto stmts = StmtSort::getStmts(fusion, allLeafOuts(fusion), true); + for (auto stmt : stmts) { + mutate(stmt); + } + } + + private: + using OptOutMutator::mutate; + void mutate(Val* val) final { + if (replacement_map_.find(val) == replacement_map_.end()) { + return OptOutMutator::mutate(val); + } + auto replaced_val = replacement_map_.at(val); + registerMutation(val, replaced_val); + } + + std::vector allLeafOuts(Fusion* fusion) { + auto exprs = StmtSort::getExprs(fusion, true); + std::unordered_set inputs; + std::unordered_set outputs; + std::vector ordered_outputs; + for (auto expr : exprs) { + inputs.insert(expr->inputs().begin(), expr->inputs().end()); + outputs.insert(expr->outputs().begin(), expr->outputs().end()); + ordered_outputs.insert( + ordered_outputs.end(), + expr->outputs().begin(), + expr->outputs().end()); + } + for (auto input : inputs) { + outputs.erase(input); + } + + std::vector ordered_leaf_outs; + for (auto out : ordered_outputs) { + if (outputs.find(out) != outputs.end()) { + ordered_leaf_outs.push_back(out); + } + } + return ordered_leaf_outs; + } + + const std::unordered_map& replacement_map_; +}; + +} // namespace + +void replaceValue( + Fusion* fusion, + const std::unordered_map& replacement_map) { + ValReplacementMutator(fusion, replacement_map); +} + +Val* getReductionInitValOf(TensorView* tv) { + auto def = tv->definition(); + if (def == nullptr) { + return nullptr; + } + + Val* init = nullptr; + if (auto rop = dynamic_cast(def)) { + init = rop->init(); + } else if (auto grop = dynamic_cast(def)) { + int output_idx = -1; + for (const auto i : c10::irange(grop->numReductions())) { + if (tv == grop->output(i)) { + output_idx = static_cast(i); + break; + } + } + TORCH_INTERNAL_ASSERT( + output_idx >= 0, + "Matching output not found for GroupedReductionOp: ", + tv->toString(), + ". Defined by: ", + def->toString()); + init = grop->initVal(output_idx); + } else if (auto wop = dynamic_cast(def)) { + if (tv == wop->outAvg()) { + init = wop->initAvg(); + } else if (tv == wop->outVar()) { + init = wop->initVar(); + } else { + TORCH_INTERNAL_ASSERT(tv == wop->outN()); + init = wop->initN(); + } + } else if (auto mma = dynamic_cast(def)) { + init = mma->init(); + } + + return init; } } // namespace ir_utils diff --git a/torch/csrc/jit/codegen/cuda/ir_utils.h b/torch/csrc/jit/codegen/cuda/ir_utils.h index c8dc2e6f6796..0b05b6fb5e86 100644 --- a/torch/csrc/jit/codegen/cuda/ir_utils.h +++ b/torch/csrc/jit/codegen/cuda/ir_utils.h @@ -10,8 +10,14 @@ namespace torch { namespace jit { namespace fuser { namespace cuda { + namespace ir_utils { +// Replace values in fusion using ValReplacementMutator +void replaceValue( + Fusion*, + const std::unordered_map& replacement_map); + template class FilterIterator { public: @@ -110,6 +116,9 @@ auto filterByType(InputIt first, InputIt last) { return FilteredView(first, last); } +template +auto filterByType(const ContainerType&& inputs) = delete; + template auto filterByType(const ContainerType& inputs) { return filterByType(inputs.cbegin(), inputs.cend()); @@ -144,17 +153,87 @@ TORCH_CUDA_CU_API TensorView* rfactorHelper( TensorView* red_tv, const std::vector& axes); -// Return immediate producers of tv +// Return immediate producers of val, this function can be used on any Val and +// will return producers through Exprs. +// +// Warning: returned val's are not guaranteed to be between fusion inputs and +// outputs. 
This function simply uses val->definition() or val->uses() which is +// limited to not go through fusion inputs/outputs, but if on a path that isn't +// strictly between fusion inputs/outputs, it could effectively return dead +// code. +TORCH_CUDA_CU_API std::vector producerValsOf(Val* val); + +// Return immediate consumers of val, this function can be used on any Val and +// will return consumers through Exprs. +// +// Warning: returned val's are not guaranteed to be between fusion inputs and +// outputs. This function simply uses val->definition() or val->uses() which is +// limited to not go through fusion inputs/outputs, but if on a path that isn't +// strictly between fusion inputs/outputs, it could effectively return dead +// code. +TORCH_CUDA_CU_API std::vector consumerValsOf(Val* val); + +// Return immediate producers of vals, this function can be used on any vals and +// will return producers through Exprs. +// +// Warning: returned val's are not guaranteed to be between fusion inputs and +// outputs. This function simply uses val->definition() or val->uses() which is +// limited to not go through fusion inputs/outputs, but if on a path that isn't +// strictly between fusion inputs/outputs, it could effectively return dead +// code. +TORCH_CUDA_CU_API std::vector producerValsOf( + const std::vector& vals); + +// Return immediate consumers of vals, this function can be used on any vals and +// will return consumers through Exprs. +// +// Warning: returned val's are not guaranteed to be between fusion inputs and +// outputs. This function simply uses val->definition() or val->uses() which is +// limited to not go through fusion inputs/outputs, but if on a path that isn't +// strictly between fusion inputs/outputs, it could effectively return dead +// code. +TORCH_CUDA_CU_API std::vector consumerValsOf( + const std::vector& vals); + +// Return immediate producers of tv, this function will return all immediate +// producers of tv through Exprs. +// +// Warning: returned tv's are not guaranteed to be between fusion inputs and +// outputs. This function simply uses tv->definition() or tv->uses() which is +// limited to not go through fusion inputs/outputs, but if on a path that isn't +// strictly between fusion inputs/outputs, it could effectively return dead +// code. TORCH_CUDA_CU_API std::vector producerTvsOf(TensorView* tv); -// Return immediate consumers of tv +// Return immediate consumers of tv, this function will return all immediate +// consumers of tv through Exprs. +// +// Warning: returned tv's are not guaranteed to be between fusion inputs and +// outputs. This function simply uses tv->definition() or tv->uses() which is +// limited to not go through fusion inputs/outputs, but if on a path that isn't +// strictly between fusion inputs/outputs, it could effectively return dead +// code. TORCH_CUDA_CU_API std::vector consumerTvsOf(TensorView* tv); -// Return immediate producers of tvs (can return tvs input) +// Return immediate producers of tvs, this function will return all immediate +// producers of tvs through Exprs. +// +// Warning: returned tv's are not guaranteed to be between fusion inputs and +// outputs. This function simply uses tv->definition() or tv->uses() which is +// limited to not go through fusion inputs/outputs, but if on a path that isn't +// strictly between fusion inputs/outputs, it could effectively return dead +// code. 
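A minimal usage sketch of the producer/consumer helpers documented above (illustrative only, not part of the patch): the element types elided by the diff are assumed to be Val* / TensorView*, `tv` is a hypothetical TensorView*, and the code assumes the relevant headers are included inside namespace torch::jit::fuser::cuda.

// Hypothetical example: walk one Expr step around a TensorView `tv`.
// producerValsOf() follows tv->definition()->inputs(); consumerValsOf()
// follows tv->uses() and collects each use's outputs; both deduplicate.
void inspectNeighbors(TensorView* tv) {
  std::vector<Val*> producer_vals = ir_utils::producerValsOf(tv);
  std::vector<Val*> consumer_vals = ir_utils::consumerValsOf(tv);
  // TensorView-only variants of the same one-step traversal:
  std::vector<TensorView*> producer_tvs = ir_utils::producerTvsOf(tv);
  std::vector<TensorView*> consumer_tvs = ir_utils::consumerTvsOf(tv);
}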
TORCH_CUDA_CU_API std::vector producerTvsOf( const std::vector& tvs); -// Return immediate consumers of tvs (can return tvs input) +// Return immediate consumers of tvs, this function will return all immediate +// consumers of tvs through Exprs. +// +// Warning: returned tv's are not guaranteed to be between fusion inputs and +// outputs. This function simply uses tv->definition() or tv->uses() which is +// limited to not go through fusion inputs/outputs, but if on a path that isn't +// strictly between fusion inputs/outputs, it could effectively return dead +// code. TORCH_CUDA_CU_API std::vector consumerTvsOf( const std::vector& tvs); @@ -175,11 +254,24 @@ TORCH_CUDA_CU_API std::vector outputTvsOf( // returns all tensor views in fusion that are used between outputs and inputs. TORCH_CUDA_CU_API std::vector allTvs(Fusion* fusion); -// Returns the history of expressions applied to the domains of td -TORCH_CUDA_CU_API std::vector historyOf(TensorDomain* td); - -// Returns the history of expressions applied to the domains of tv -TORCH_CUDA_CU_API std::vector historyOf(TensorView* tv); +TORCH_CUDA_CU_API std::vector getReductionOps( + Fusion* fusion, + bool ignore_trivial = true); + +// Returns the initialization value of tv or nullptr if not initialized. +TORCH_CUDA_CU_API Val* getReductionInitValOf(TensorView* tv); + +template +std::string toString(const T& nodes) { + std::stringstream ss; + for (Statement* stmt : nodes) { + if (ss.tellp() != 0) { + ss << ", "; + } + ss << stmt->toString(); + } + return ss.str(); +} } // namespace ir_utils } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/iter_visitor.cpp b/torch/csrc/jit/codegen/cuda/iter_visitor.cpp index 344df98f5a75..6ae4e7374df5 100644 --- a/torch/csrc/jit/codegen/cuda/iter_visitor.cpp +++ b/torch/csrc/jit/codegen/cuda/iter_visitor.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include namespace torch { @@ -31,21 +32,98 @@ void remove_visited( } } +// Return all dependencies of a node including members of the node. 
+class RecursiveDependencies : public OptInDispatch { + public: + static std::vector next(Statement* stmt) { + RecursiveDependencies find_next(stmt); + return find_next.next_stmts_; + } + + private: + RecursiveDependencies() = default; + + RecursiveDependencies(Statement* stmt) { + handle(stmt); + } + + using OptInDispatch::handle; + + void handle(Expr* expr) final { + FusionGuard::getCurFusion()->assertInContainer( + expr, + "IterVisitor.cpp::RecursiveDependencies::handle(Expr*) Cannot traverse expr, "); + next_stmts_.insert( + next_stmts_.end(), expr->inputs().begin(), expr->inputs().end()); + } + + void handle(Val* val) final { + FusionGuard::getCurFusion()->assertInContainer( + val, + "IterVisitor.cpp::RecursiveDependencies::handle(Val*) Cannot traverse val, "); + OptInDispatch::handle(val); + } + + void simpleVal(Val* val) { + if (val->definition() == nullptr) { + return; + } + next_stmts_.push_back(val->definition()); + } + + void handle(Bool* stmt) final { + simpleVal(stmt); + } + + void handle(Double* stmt) final { + simpleVal(stmt); + } + + void handle(Int* stmt) final { + simpleVal(stmt); + } + + void handle(ComplexDouble* stmt) final { + simpleVal(stmt); + } + + void handle(NamedScalar* stmt) final { + simpleVal(stmt); + } + + void handle(IterDomain* stmt) final { + next_stmts_.push_back(stmt->start()); + next_stmts_.push_back(stmt->extent()); + next_stmts_.push_back(stmt->stopOffset()); + simpleVal(stmt); + } + + void handle(TensorDomain* stmt) final { + next_stmts_.insert( + next_stmts_.end(), stmt->domain().begin(), stmt->domain().end()); + simpleVal(stmt); + } + + void handle(TensorView* tv) final { + next_stmts_.push_back(tv->domain()); + simpleVal(tv); + } + + std::vector next_stmts_; +}; + } // namespace std::vector IterVisitor::next(Statement* stmt) { if (stmt->isVal()) { return next(stmt->as()); - } else if (stmt->isExpr()) { - return next(stmt->as()); } else { - TORCH_INTERNAL_ASSERT( - false, "IterVisitor could not detect type in next_dispatch."); + return next(stmt->as()); } } std::vector IterVisitor::next(Val* v) { - FusionGuard::getCurFusion()->assertInFusion(v, "Cannot traverse val, "); + FusionGuard::getCurFusion()->assertInContainer(v, "Cannot traverse val, "); if (v->definition() != nullptr) { return {v->definition()}; } @@ -53,7 +131,8 @@ std::vector IterVisitor::next(Val* v) { } std::vector IterVisitor::next(Expr* expr) { - FusionGuard::getCurFusion()->assertInFusion(expr, "Cannot traverse expr, "); + FusionGuard::getCurFusion()->assertInContainer( + expr, "Cannot traverse expr, "); std::vector next_stmts{ expr->inputs().begin(), expr->inputs().end()}; return next_stmts; @@ -93,7 +172,8 @@ void IterVisitor::handle(Val* v) { void IterVisitor::traverseFrom( Fusion* fusion, const std::vector& from, - bool traverseAllPaths) { + bool traverseAllPaths, + bool traverseIntoMembers) { FusionGuard fg(fusion); std::unordered_set visited; @@ -137,7 +217,8 @@ void IterVisitor::traverseFrom( } else { // We're not ready to process this node, so add all its inputs to be // checked Visit input nodes. - auto next_stmts = next(stmt); + auto next_stmts = + traverseIntoMembers ? RecursiveDependencies::next(stmt) : next(stmt); // We may want to retraverse nodes, in that case revisit everything! if (!traverseAllPaths) { // If we don't want to retraverse, remove nodes we already visisted. @@ -180,17 +261,29 @@ namespace { // expressions. class Inputs : public IterVisitor { private: - //! Optional list of all input vals. If empty, vals with no defining - //! 
expression are considered as inputs. + //! Optional list of input vals. While traversing to inputs if a value in the + //! all_inputs list is found, that value will be added to the inputs_ and + //! traversal will not go into its definition. Otherwise traversal follows + //! definition paths until hitting a definition that is a nullptr (i.e. a + //! terminating input). const std::vector& all_inputs_; std::vector inputs_; Inputs(const std::vector& all_inputs) : all_inputs_(all_inputs) {} + std::vector next(Val* v) override { + if (std::find(inputs_.begin(), inputs_.end(), v) != inputs_.end()) { + return {}; + } + return IterVisitor::next(v); + } + void handle(Val* val) override { - if ((all_inputs_.empty() && val->definition() == nullptr) || + // If there's no definition to val, or val is within the provided inputs + if (val->definition() == nullptr || std::find(all_inputs_.begin(), all_inputs_.end(), val) != all_inputs_.end()) { + // if not already placed in the inputs if (std::find(inputs_.begin(), inputs_.end(), val) == inputs_.end()) { inputs_.push_back(val); } @@ -308,7 +401,7 @@ void BackwardVisitor::traverseFrom( auto vals = AllVals::get(fusion, from); - auto exprs = ExprSort::getExprs(fusion, from); + auto exprs = StmtSort::getExprs(fusion, from); { size_t pos = 0; @@ -516,6 +609,9 @@ class DependentVals : public IterVisitor { std::unordered_set outs_; // Boundary where we want to stop searching beyond + // TODO: Based on the todo below, shouldn't we stop just at the definition of? + // If we really wanted to make this traverse left, wouldn't we first check + // which outputs are outputs dependent on of? std::unordered_set boundary_; std::vector next(Val* v) override { @@ -539,6 +635,11 @@ class DependentVals : public IterVisitor { } // optimization to limit search path + // TODO: Is this valid? Couldn't something like: + // out0 = of + val0 + // out1 = out0 + val1 + // out2 = TernaryOp(out1, val0, of) + // Hide the dep of out1 on of? 
void createBoundary() { for (auto v_of : of_) { for (auto v_expr : v_of->uses()) { @@ -704,22 +805,41 @@ std::unordered_set DependencyCheck::getAllDependentVals( return DependentVals::getAllDependentVals(of); } -void ExprSort::handle(Expr* expr) { - exprs.push_back(expr); +void StmtSort::handle(Statement* stmt) { + stmts.push_back(stmt); } -std::vector ExprSort::getExprs(Fusion* fusion) { - ExprSort es; - es.traverse(fusion); - return es.exprs; +std::vector StmtSort::getExprs(Fusion* fusion, bool traverse_members) { + auto terminating_outputs = fusion->getTerminatingOutputs(); + return StmtSort::getExprs(fusion, terminating_outputs, traverse_members); } -std::vector ExprSort::getExprs( +std::vector StmtSort::getExprs( Fusion* fusion, - const std::vector& from) { - ExprSort es; - es.traverseFrom(fusion, from, false); - return es.exprs; + const std::vector& from, + bool traverse_members) { + StmtSort es; + es.traverseFrom(fusion, from, false, traverse_members); + auto stmts = StmtSort::getStmts(fusion, from, traverse_members); + auto filter = ir_utils::filterByType(stmts.begin(), stmts.end()); + std::vector exprs(filter.begin(), filter.end()); + return exprs; +} + +std::vector StmtSort::getStmts( + Fusion* fusion, + bool traverse_members) { + auto terminating_outputs = fusion->getTerminatingOutputs(); + return StmtSort::getStmts(fusion, terminating_outputs, traverse_members); +} + +std::vector StmtSort::getStmts( + Fusion* fusion, + const std::vector& from, + bool traverse_members) { + StmtSort es; + es.traverseFrom(fusion, from, false, traverse_members); + return es.stmts; } void InputsOf::handle(Val* v) { diff --git a/torch/csrc/jit/codegen/cuda/iter_visitor.h b/torch/csrc/jit/codegen/cuda/iter_visitor.h index d4aa56ea2fef..2447933d7373 100644 --- a/torch/csrc/jit/codegen/cuda/iter_visitor.h +++ b/torch/csrc/jit/codegen/cuda/iter_visitor.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -83,18 +83,21 @@ class TORCH_CUDA_CU_API IterVisitor : public OptOutDispatch { void traverseHelper(Fusion* fusion, bool traverse_all_paths = false); public: - // Starts at nodes provided in from, traverses from these nodes to inputs. - // Calls handle on all Statement*s in topological sorted order. - // traverseAllPaths = false only call handle on each Statement* once - // traverseAllPaths = true traverses all paths from nodes in from to inputs. - // Handle on a Statement* for every path from "from" nodes, to inputs. - // to argument allows specification of nodes to stop at if we want to stop - // beffore we hit all leaf nodes. This can be helpful when we want to traverse - // from TensorView::domain(), to the rfactor domain, instead of root domain. + //! Starts at nodes provided in from, traverses from these nodes to inputs. + //! Calls handle on all Statement*s in topological sorted order. + //! \param traverseAllPaths = false only call handle on each Statement* once + //! traverseAllPaths = true traverses all paths from nodes in from to + //! inputs. Calls handle on a Statement* for every path from "from" nodes, + //! to inputs. + //! \param traverseIntoMembers = When hitting nodes like TensorView, + //! TensorDomain, or IterDomain where there are members of the nodes that are + //! Val's a value of "true" will also traverse into those member Val's, a + //! value of "false" will not traverse into the members. 
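A brief sketch of the new traverse_members flag threaded through traverseFrom and StmtSort above (illustrative only, not part of the patch; `fusion` is a hypothetical Fusion* and the return element types elided by the diff are assumed to be Expr* / Statement*).

// Hypothetical example: topological sorts with and without member traversal.
// With traverse_members == true, RecursiveDependencies::next() is used, so
// IterDomains, extents, and their defining expressions are also visited.
void sortedStatements(Fusion* fusion) {
  auto exprs_only = StmtSort::getExprs(fusion, /*traverse_members=*/false);
  auto with_members = StmtSort::getStmts(fusion, /*traverse_members=*/true);
  // with_members additionally contains TensorDomain/IterDomain/extent nodes.
}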
void traverseFrom( Fusion* fusion, const std::vector& from, - bool traverseAllPaths = false); + bool traverseAllPaths = false, + bool traverseIntoMembers = false); // Iterates from terminating outputs registered with the fusion. Terminating // means value is not used to generate any other value used in producing @@ -246,18 +249,40 @@ class TORCH_CUDA_CU_API DependencyCheck { // Expr sort will take a fusion and return a topologically sorted list of // expressions. -class ExprSort : public IterVisitor { +class StmtSort : public IterVisitor { protected: - std::vector exprs; + std::vector stmts; - void handle(Expr* expr) override; + void handle(Statement* stmt) override; public: - static std::vector getExprs(Fusion* fusion); + // If traverse_members it will also extract all member nodes in the sorted + // expr list in the fusion. i.e. all expressions on IterDomains, extents, etc + static std::vector getExprs( + Fusion* fusion, + bool traverse_members = false); + // If traverse_members it will also extract all member nodes in the sorted + // expr list in the fusion. i.e. all expressions on IterDomains, extents, etc static std::vector getExprs( Fusion* fusion, - const std::vector& from); + const std::vector& from, + bool traverse_members = false); + + // If traverse_members it will also extract all member nodes in the sorted + // statement list in the fusion. i.e. all IterDomains, extents, and associated + // expressions of them + static std::vector getStmts( + Fusion* fusion, + bool traverse_members = false); + + // If traverse_members it will also extract all member nodes in the sorted + // expr list in the fusion. i.e. all IterDomains, extents, and associated + // expressions of them + static std::vector getStmts( + Fusion* fusion, + const std::vector& from, + bool traverse_members = false); }; class InputsOf : public IterVisitor { diff --git a/torch/csrc/jit/codegen/cuda/kernel.cpp b/torch/csrc/jit/codegen/cuda/kernel.cpp index d3ef9eeb95d5..cbbc4f53462e 100644 --- a/torch/csrc/jit/codegen/cuda/kernel.cpp +++ b/torch/csrc/jit/codegen/cuda/kernel.cpp @@ -1,7 +1,8 @@ #include +#include #include #include -#include +#include #include #include @@ -11,22 +12,24 @@ namespace torch { namespace jit { namespace fuser { namespace cuda { + +IrBuilderPasskey::IrBuilderPasskey(IrContainer* ir_container) + : ir_container_(ir_container) {} + namespace kir { namespace { //! Scan all primary expressions in the Kernel IR and build //! 
lists of specialized nodes and other interesting information -class KernelIrScanner : private kir::IrVisitor { +class KernelIrScanner : private IrVisitor { public: explicit KernelIrScanner(const Kernel* kernel) { - for (const auto& ir_node : kernel->irNodes()) { - ir_node->accept(this); - } + IrVisitor::handle(kernel->topLevelExprs()); const auto gpu_lower = GpuLower::current(); for (auto split : gpu_lower->nonDivisibleSplitInfo().splitsToValidate()) { - auto extent = gpu_lower->lowerValue(split->in()->extent()); - auto factor = gpu_lower->lowerValue(split->factor()); + auto extent = split->in()->extent(); + auto factor = split->factor(); summary_.splits_to_validate.emplace_back(extent, factor); } } @@ -36,7 +39,17 @@ class KernelIrScanner : private kir::IrVisitor { } private: - void visit(const kir::Sync* sync) final { + using IrVisitor::handle; + void handle(Expr* expr) final { + IrVisitor::handle(expr); + for (auto inp : expr->inputs()) { + handle(inp); + } + for (auto out : expr->outputs()) { + handle(out); + } + } + void handle(BlockSync* sync) final { // TODO: Move to a dedicated validation pass // which is not on the common execution/compilation path if (sync->isWarHazardSync()) { @@ -44,17 +57,17 @@ class KernelIrScanner : private kir::IrVisitor { } } - void visit(const kir::Allocate* allocate) final { + void handle(GridSync* sync) final { + summary_.has_cooperative_grid_reduction = true; + } + + void handle(Allocate* allocate) final { switch (allocate->memoryType()) { case MemoryType::Global: summary_.global_allocations.push_back(allocate); break; case MemoryType::Shared: - if (ExpressionEvaluator::isConst(allocate->size())) { - summary_.static_smem_allocations.push_back(allocate); - } else { - summary_.dynamic_smem_allocations.push_back(allocate); - } + summary_.dynamic_smem_allocations.push_back(allocate); break; case MemoryType::Local: if (!ExpressionEvaluator::isConst(allocate->size())) { @@ -65,28 +78,23 @@ class KernelIrScanner : private kir::IrVisitor { } } - void visit(const kir::UnaryOp* unary_op) final { - if (unary_op->operation() == UnaryOpType::RandLike) { + void handle(UnaryOp* unary_op) final { + if (unary_op->getUnaryOpType() == UnaryOpType::RandLike) { // This kernel is using random numbers summary_.is_stochastic = true; } } - void visit(const kir::TensorIndex* tensor_index) final { + void handle(TensorIndex* tensor_index) final { const auto tv = tensor_index->view(); const auto domain = tv->domain(); - // Do we have any reductions? summary_.has_block_reductions = summary_.has_block_reductions || domain->hasBlockReduction(); - // Do we have block broadcasts? 
- summary_.has_block_broadcasts = - summary_.has_block_broadcasts || domain->hasBlockBroadcast(); - // Update the largest smem data type if (domain->hasBlockReduction() || domain->hasGridReduction() || - tv->memoryType() == MemoryType::Shared) { + tv->getMemoryType() == MemoryType::Shared) { const auto data_type = tv->dtype(); const size_t type_size = dataTypeSize(data_type); if (type_size > max_smem_type_size_) { @@ -94,38 +102,54 @@ class KernelIrScanner : private kir::IrVisitor { summary_.largest_smem_data_type = data_type; } } + } - // Update Welford - if (tensor_index->definition() != nullptr && - tensor_index->definition()->isA()) { - summary_.has_welford = true; - summary_.has_block_welford = - summary_.has_block_welford || domain->hasBlockReduction(); - summary_.has_grid_welford = - summary_.has_grid_welford || domain->hasGridReduction(); + void handle(WelfordOp* welford_op) final { + summary_.has_welford = true; + TORCH_INTERNAL_ASSERT(welford_op->outAvg()->isA()); + auto out_dom = welford_op->outAvg()->as()->view()->domain(); + summary_.has_block_welford = + summary_.has_block_welford || out_dom->hasBlockReduction(); + } + + void handle(GridWelford* grid_welford) final { + summary_.has_welford = true; + summary_.has_grid_welford = true; + summary_.has_grid_reductions = true; + if (grid_welford->welford_op()->isAllreduce()) { + summary_.has_cooperative_grid_reduction = true; } } - void visit(const kir::GridWelford* grid_welford) final { - const auto dom = grid_welford->welford_op() - ->out() - ->as() - ->view() - ->domain(); - updateGridReductionInLoop(dom); + void handle(GridReduction* grid_reduction) final { + summary_.has_grid_reductions = true; + if (grid_reduction->isAllreduce()) { + summary_.has_cooperative_grid_reduction = true; + } } - void visit(const kir::GridReduction* grid_reduction) final { - const auto dom = grid_reduction->reduction_op() - ->out() - ->as() - ->view() - ->domain(); + void handle(GroupedGridReduction* grid_reduction) final { + summary_.has_grid_reductions = true; + const auto dom = ir_utils::getTvOutput(grid_reduction)->domain(); updateGridReductionInLoop(dom); } - void visit(const kir::GridBroadcast*) final { + void handle(GridBroadcast* grid_broadcast) final { summary_.has_cooperative_grid_reduction = true; + handle(grid_broadcast->broadcast_op()); + } + + void handle(BroadcastOp* bop) final { + const ParallelTypeBitmap parallel_types = + GpuLower::current()->threadPredMap().getParallelBroadcastDomains( + bop->out()->as()->view()); + summary_.broadcast_parallel_types.emplace(bop, parallel_types); + // Do we have block broadcasts? + summary_.has_block_broadcasts = + summary_.has_block_broadcasts || parallel_types.hasTID(); + // Do we have grid broadcasts? + summary_.has_grid_broadcasts = + summary_.has_grid_broadcasts || parallel_types.hasBID(); } private: @@ -134,12 +158,9 @@ class KernelIrScanner : private kir::IrVisitor { private: void updateGridReductionInLoop(TensorDomain* dom) { - summary_.has_grid_reductions = true; - - const auto gpu_lower = GpuLower::current(); for (const auto i : c10::irange(dom->nDims())) { - const auto id = - gpu_lower->caParallelMap().getConcreteMappedID(dom->domain()[i]); + const auto id = GpuLower::current()->caMap()->getConcreteMappedID( + dom->domain()[i], IdMappingMode::LOOP); summary_.has_cooperative_grid_reduction = summary_.has_cooperative_grid_reduction || @@ -169,7 +190,7 @@ class KernelIrScanner : private kir::IrVisitor { //! MemoryType::Global for tensors parallelized with blockIdx), it is //! 
assumed that allocation is properly extended for the iteration //! count. -class ValidateAllocation : private kir::IrVisitor { +class ValidateAllocation : private OptOutConstDispatch { public: static void validate(const Kernel* kernel) { ValidateAllocation validate_allocation(kernel); @@ -178,14 +199,14 @@ class ValidateAllocation : private kir::IrVisitor { private: explicit ValidateAllocation(const Kernel* kernel) { live_allocations_.emplace_back(std::vector()); - for (const auto& ir_node : kernel->topLevelExprs()) { - ir_node->accept(this); + for (const auto& expr : kernel->topLevelExprs()) { + OptOutConstDispatch::handle(expr); } live_allocations_.pop_back(); TORCH_INTERNAL_ASSERT(live_allocations_.empty()); } - void visit(const kir::Allocate* allocate) final { + void handle(const Allocate* allocate) final { TORCH_INTERNAL_ASSERT(!live_allocations_.empty()); live_allocations_.back().push_back(allocate); } @@ -195,53 +216,53 @@ class ValidateAllocation : private kir::IrVisitor { // during in the allocation lowering if it's thread-parallel and not // allocated on shared or global memories, or if it's block-parallel // ando not allocated on global memory. - void validate(const kir::ForLoop* for_loop) { + void validate(const ForLoop* for_loop) { const auto loop_id = for_loop->iter_domain(); - const auto gpu_lower = GpuLower::current(); for (const auto& allocations : live_allocations_) { for (const auto& allocate : allocations) { - const auto tv = dynamic_cast(allocate->buffer()); + const auto tv = dynamic_cast(allocate->buffer()); if (tv == nullptr) { continue; } for (const auto& axis : tv->domain()->domain()) { - if (!gpu_lower->caParallelMap().areMapped(loop_id, axis)) { + if (!GpuLower::current()->caMap()->areMapped( + loop_id, axis, IdMappingMode::LOOP)) { continue; } - if (isParallelTypeThreadDim(loop_id->parallelType())) { + if (isParallelTypeThreadDim(loop_id->getParallelType())) { TORCH_INTERNAL_ASSERT( - tv->memoryType() == MemoryType::Shared || - tv->memoryType() == MemoryType::Global, + tv->getMemoryType() == MemoryType::Shared || + tv->getMemoryType() == MemoryType::Global, "Tensor t", tv->name(), " must be allocated on SMEM or GMEM."); - } else if (isParallelTypeBlockDim(loop_id->parallelType())) { - TORCH_INTERNAL_ASSERT(tv->memoryType() == MemoryType::Global); + } else if (isParallelTypeBlockDim(loop_id->getParallelType())) { + TORCH_INTERNAL_ASSERT(tv->getMemoryType() == MemoryType::Global); } } } } } - void visit(const kir::ForLoop* for_loop) final { + void handle(const ForLoop* for_loop) final { if (for_loop->stop() != for_loop->iter_domain()->extent() && - isParallelTypeThread(for_loop->iter_domain()->parallelType())) { + isParallelTypeThread(for_loop->iter_domain()->getParallelType())) { validate(for_loop); } live_allocations_.emplace_back(std::vector()); for (const auto& expr : for_loop->body().exprs()) { - expr->accept(this); + OptOutConstDispatch::handle(expr); } live_allocations_.pop_back(); } - void visit(const kir::IfThenElse* ite) final { + void handle(const IfThenElse* ite) final { for (const auto& expr : ite->thenBody().exprs()) { - expr->accept(this); + OptOutConstDispatch::handle(expr); } for (const auto& expr : ite->elseBody().exprs()) { - expr->accept(this); + OptOutConstDispatch::handle(expr); } } @@ -252,14 +273,18 @@ class ValidateAllocation : private kir::IrVisitor { } // namespace // TODO(kir): Kernel IR validation -void Kernel::finalize(std::vector top_level_exprs) { - TORCH_CHECK(top_level_exprs_.empty()); +void Kernel::finalize(std::vector 
top_level_exprs) { + TORCH_INTERNAL_ASSERT(top_level_exprs_.empty()); top_level_exprs_ = std::move(top_level_exprs); - predicate_map_ = std::make_unique( - GpuLower::current()->threadPredMap()); warp_padded_parallel_info_ = GpuLower::current()->getWarpPaddedParallelInfo(); ValidateAllocation::validate(this); analyze(); + // Make sure this is after analyze as it sets summary_ + summary_.vectorized_accesses = GpuLower::current()->vectorizedAccesses(); + summary_.vectorized_set_info = GpuLower::current()->vectorizedSetInfo(); + summary_.sync_map = GpuLower::current()->syncMap(); + summary_.parallel_dimension_map_ = + GpuLower::current()->parallelDimensionMap(); } void Kernel::analyze() { @@ -270,8 +295,67 @@ void Kernel::analyze() { } void Kernel::print() const { - kir::IrPrinter ir_printer(std::cout); - ir_printer.printKernel(this); + IrPrinter ir_printer(std::cout); + ir_printer.handle(this); +} + +//! Register the Val with this fusion +void Kernel::registerVal(Val* val) { + if (inContainer(val)) { + return; + } + if (val->kernel()) { + TORCH_CHECK( + val->kernel() == this, + val->toString(), + " was not found in the active kernel."); + } + + Fusion::registerVal(val); +} + +//! Register expr with this fusion. +//! When we register an expression, we want to update the dependency tracking +//! of Vals. We add expr to our general expr_set_, +void Kernel::registerExpr(Expr* expr) { + if (inContainer(expr)) { + return; + } + + if (expr->kernel()) { + TORCH_CHECK( + expr->kernel() == this, + expr->toString(), + " was not found in the active kernel."); + } + + for (Val* input : expr->inputs()) { + TORCH_INTERNAL_ASSERT( + inContainer(input), + "Input\n", + input->toString(), + " to expr,\n", + expr->toString(), + ",\n is invalid because it is not in the same kernel."); + } + + for (Val* output : expr->outputs()) { + TORCH_INTERNAL_ASSERT( + inContainer(output), + "Output\n", + output->toString(), + " to expr,\n", + expr->toString(), + ",\n is invalid because it is not in the same kernel."); + } + + // Register expr is explicitly non-SSA when coming from a kernel. This is + // detected inside Fusion::registerExpr + Fusion::registerExpr(expr); +} + +std::vector& KernelInternalProxy::topLevelExprs() { + return kernel_->top_level_exprs_; } } // namespace kir diff --git a/torch/csrc/jit/codegen/cuda/kernel.h b/torch/csrc/jit/codegen/cuda/kernel.h index b273324e1e24..4930da1a2872 100644 --- a/torch/csrc/jit/codegen/cuda/kernel.h +++ b/torch/csrc/jit/codegen/cuda/kernel.h @@ -1,12 +1,18 @@ #pragma once -#include -#include -#include +#include + +#include +#include +#include +#include #include +#include #include +#include #include +#include #include #include @@ -47,6 +53,9 @@ struct KernelSummary { //! Do we have any block broadcasts? bool has_block_broadcasts = false; + //! Do we have any grid broadcasts? + bool has_grid_broadcasts = false; + //! Do we have any welford op? bool has_welford = false; @@ -67,85 +76,69 @@ struct KernelSummary { std::vector dynamic_lmem_allocations; //! ceilDiv extents that must be divisible - std::vector> splits_to_validate; + std::vector> splits_to_validate; + + //! Effective ParallelTypes of broadcast ops + std::unordered_map + broadcast_parallel_types; + + //! Track which tensor views are inputs or outputs of a vectorized operation + //! and their maximum vectorized access size + std::unordered_map vectorized_accesses; + + // Sync map is needed to figure out if global memory buffers need to be marked + // as volatile because they're used for communication. 
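A hedged sketch of how the expanded KernelSummary might be consumed after lowering (illustrative only, not part of the patch; `kernel` is a hypothetical kir::Kernel* and the fields are the ones added above).

// Hypothetical example: query the summary built by KernelIrScanner.
void reportKernelTraits(const kir::Kernel* kernel) {
  const auto& summary = kernel->summary();
  if (summary.has_cooperative_grid_reduction) {
    // Allreduce-style grid reductions/welfords require a cooperative launch.
  }
  if (summary.has_block_broadcasts || summary.has_grid_broadcasts) {
    // Effective parallel types per BroadcastOp are recorded in
    // summary.broadcast_parallel_types.
  }
}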
+ SyncMap sync_map; + + // Parallel dimension map needed to set the correct properties of grid buffers + // (is a dim inactive) + ParallelDimensionMap parallel_dimension_map_; + + //! Track information on vectorized set operations for runtime validation + std::vector vectorized_set_info; }; +class KernelInternalProxy; + //! Container for a lowered Kernel IR //! -//! TODO(kir): currently, it is just pointing to nodes owned -//! by a Fusion object. The goal is to have the Kernel object -//! own the Kernel IR nodes -//! // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) -class TORCH_CUDA_CU_API Kernel final : public NonCopyable { +class TORCH_CUDA_CU_API Kernel final : public Fusion { + friend KernelInternalProxy; + public: - Kernel() = default; + // Kernel starts by grabbing all the nodes from the provided fusion. + // Kernel is not SSA, if a definition is not set, we should update it, but + // not remove previous definition if it is set. This is primarily because when + // we do something like generate an initialization statement for a reduction + // TV, we may want to continue to do fusion like analysis on the original + // expression. + // TODO: Assert index type is int or int32 + Kernel(Fusion* fusion, DataType index_type = DataType::Int) + : Fusion(*fusion), index_type_(index_type) {} + + Kernel() = delete; + + // No move or copy semantics + Kernel(const Kernel&) = delete; + Kernel& operator=(const Kernel&) = delete; //! Finalize a kernel definition //! //! At this point we have a complete kernel definition and we can - //! run analysis passes to build a KernelSummary - //! - void finalize(std::vector top_level_exprs); - - //! Register input as an input of the kernel - void addInput(Val* input) { - inputs_.push_back(input); - input_set_.insert(input); - } - - //! Register output as an output of the kernel - void addOutput(Val* output) { - outputs_.push_back(output); - output_set_.insert(output); - } - - const auto& inputs() const { - return inputs_; - } - - const auto& outputs() const { - return outputs_; - } - - bool isInput(Val* val) const { - return input_set_.find(val) != input_set_.end(); - } - - bool isOutput(Val* val) const { - return output_set_.find(val) != output_set_.end(); - } + //! run analysis passes to build a KernelSummary. + void finalize(std::vector top_level_exprs); - const auto& topLevelExprs() const { + const std::vector& topLevelExprs() const { return top_level_exprs_; } - const auto& irNodes() const { - return ir_nodes_; - } - const KernelSummary& summary() const { return summary_; } - const ThreadPredicateMap& predicateMap() const { - return *predicate_map_; - } - - //! Register a new Kernel IR node - //! - //! \note This is a specialized helper for kir::IrBuilder, not - //! intendted for general use - //! - void registerIrNode(kir::Passkey passkey, std::unique_ptr node) { - TORCH_CHECK(passkey.kernel == this); - ir_nodes_.push_back(std::move(node)); - } - - //! Allocates a new value identifier - kir::ValueId newValueId(kir::Passkey passkey) { - TORCH_CHECK(passkey.kernel == this); - return next_value_id_++; + DataType indexType() const { + return index_type_; } //! Checks if parallel type is padded @@ -161,35 +154,45 @@ class TORCH_CUDA_CU_API Kernel final : public NonCopyable { //! Debug dump of the Kernel IR void print() const; + protected: + //! Register the Val with this fusion + void registerVal(Val* val) override; + + //! Register expr with this fusion. + //! When we register an expression, we want to update the dependency tracking + //! of Vals. 
We add expr to our general expr_set_, + void registerExpr(Expr* expr) override; + private: // Analyze the kernel IR and caches the summary of interesting data void analyze(); - private: - // Kernel IR nodes - std::vector> ir_nodes_; - // Top level statements - std::vector top_level_exprs_; - - // Kernel inputs and outputs - std::vector inputs_; - std::vector outputs_; - std::unordered_set input_set_; - std::unordered_set output_set_; - - // Used to allocate unique value IDs - kir::ValueId next_value_id_ = 1; + std::vector top_level_exprs_; // Summary of interesting kernel data KernelSummary summary_; - // Predicate map - // TODO(kir): consider a simpler, kernel IR based version - std::unique_ptr predicate_map_; + // Is this kernel being compiled with int32 or int64 indexing. This + // information is required to resolve DataType::Index + DataType index_type_ = DataType::Int; + WarpPaddedParallelInfo warp_padded_parallel_info_; }; +//! A special debugging proxy for Kernel. +//! +//! Should not be used for other than testing and debugging. +class TORCH_CUDA_CU_API KernelInternalProxy { + public: + KernelInternalProxy(Kernel* kernel) : kernel_(kernel) {} + + std::vector& topLevelExprs(); + + private: + Kernel* kernel_ = nullptr; +}; + } // namespace kir } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp index 39350876bd2b..ccdbb2eb1d9b 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp +++ b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp @@ -3,10 +3,13 @@ #include #include #include +#include #include +#include #include #include +#include namespace torch { namespace jit { @@ -25,6 +28,10 @@ int getCommonDeviceCUDA(const at::ArrayRef& inputs) { continue; } const auto& device = input.toTensor().device(); + // skip cpu scalar tensor as they'll be promoted to scalar later + if (device.is_cpu() && is_cpu_scalar(input.toTensor())) { + continue; + } TORCH_CHECK(device.is_cuda(), "nvfuser only supports cuda device"); auto cur_index = device.index(); if (index != -1 && index != cur_index) { @@ -72,6 +79,11 @@ InputsIdLookup::IdLookupReturn InputsIdLookup::lookupId( encodeBuffer(stride, encoding_); encoding_.push_back(' '); } + encoding_.push_back('a'); + encodeBuffer( + SchedulerRuntimeInfo::computeAlignmentSize( + (size_t)input_tensor.data_ptr()), + encoding_); encoding_.push_back('d'); encodeBuffer(input_tensor.device().index(), encoding_); } else { @@ -80,9 +92,6 @@ InputsIdLookup::IdLookupReturn InputsIdLookup::lookupId( } encoding_.push_back(';'); } - if (additional_info) { - encodeBuffer(additional_info->getCommonAlignmentSize(), encoding_); - } auto& entry = encoding_lookup_[encoding_]; @@ -113,7 +122,11 @@ InputsIdLookup::IdLookupReturn InputsIdLookup::lookupId( } FusionExecutorCache::FusionExecutorCache(std::unique_ptr fusion) - : fusion_(std::move(fusion)) {} + : fusion_(std::move(fusion)) { + for (const auto& indices : fusion_->getOutputAliasIndices()) { + aliased_output_indices_.insert(indices); + } +} // Note [ Permutation support in nvfuser ] // @@ -182,6 +195,12 @@ std::vector FusionExecutorCache::runFusionWithInputs( outputs[pair.first] = outputs[pair.first].permute(pair.second); } + int offset = 0; + for (const auto& v : aliased_output_indices_) { + outputs.erase(outputs.begin() + v - offset); + offset++; + } + return outputs; } @@ -202,9 +221,9 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor( } // Access kernels associated with the common device id - auto dev_id = 
getCommonDeviceCUDA(inputs); - TORCH_INTERNAL_ASSERT(dev_id >= 0); - auto& kernel_runtimes = kernel_runtimes_[dev_id]; + auto device_index = getCommonDeviceCUDA(inputs); + TORCH_CHECK(device_index >= 0, "device is not coherent for fusion inputs"); + auto& kernel_runtimes = kernel_runtimes_[device_index]; // Check for re-use hit case // a kernel runtime is re-usable if all the compiled @@ -258,6 +277,8 @@ FusionKernelRuntime::FusionKernelRuntime( std::make_unique(fusion_copy.get()); //! Try to schedule the complete fusion + scheduler_debug_utils::canScheduleMessage( + "***Runtime***: Try to schedule fusion un-segmented:\n"); const auto maybe_complete_fusion_heuristic = SchedulerEntry::proposeHeuristics(fusion_copy.get(), runtime_info); @@ -277,14 +298,6 @@ FusionKernelRuntime::FusionKernelRuntime( } else { auto complete_fusion_heuristic = maybe_complete_fusion_heuristic.value(); - // Translate welfords if apply - if (fusion_copy->hasWelford()) { - bool translated = SegmentCandidateFinder::TranslateWelfordInFusion( - fusion_copy.get(), inputs); - if (translated) { - complete_fusion_heuristic = ScheduleHeuristic::Persistent; - } - } // Take ownership of the transformed fusion single_kernel_fusion_ = std::move(fusion_copy); @@ -358,7 +371,7 @@ std::vector FusionKernelRuntime::runKernelWithInput( launch_params = scheduler_entry->pointwiseParams().lparams; } executors_[group_id].compileFusion( - fusion_to_run.get(), options, inputs, launch_params); + fusion_to_run.get(), inputs, launch_params, options); } else { // Load launch params for reduction and normalization kernels if (scheduler_entry->hasReductionParam()) { @@ -370,7 +383,6 @@ std::vector FusionKernelRuntime::runKernelWithInput( if (profiling_) { most_recent_executor_log_.fusion_executor = &executors_[group_id]; - most_recent_executor_log_.launch_constraints = launch_params; if (scheduler_entry->hasReductionParam()) { most_recent_executor_log_.reduction_params = scheduler_entry->reductionParams(); @@ -380,7 +392,49 @@ std::vector FusionKernelRuntime::runKernelWithInput( } } - return executors_[group_id].runFusion(inputs, launch_params, input_id); + auto& executor = executors_[group_id]; + if (isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) { + executor.setMeasureKernelTimeFlag(true); + } + + auto outputs = executor.runFusion(inputs, launch_params, input_id); + + // Print relevant information all at once for easy debuging of perf + if (isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) { + std::cout << "\nRun kernel:\n"; + if (sg) { + segmented_fusion_->makeFusion(sg)->printMath(); + } else { + single_kernel_fusion_->printMath(); + } + std::cout << "With inputs:\n"; + for (auto inp : inputs) { + if (inp.isTensor()) { + auto inp_tensor = inp.toTensor(); + std::cout << " " << inp_tensor.dtype() << " " << inp_tensor.sizes() + << " " << inp_tensor.strides() << "\n"; + } else { + std::cout << " " << inp << "\n"; + } + } + std::cout << "Compiler log: " << executor.compilerLog() << "\n"; + if (scheduler_entry->hasReductionParam()) { + std::cout << scheduler_entry->reductionParams().toString() << "\n"; + } else { + std::cout << scheduler_entry->pointwiseParams().toString() << "\n"; + } + std::cout << "With arguments: " << executor.lastLaunchParams().toString(); + std::cout << executor.kernelName() << " " << executor.bytesProcessed() + << " bytes/ " << std::setprecision(3) << executor.kernelTimeMs() + << " ms " + << ((double)executor.bytesProcessed() / + ((double)executor.kernelTimeMs() / 1000)) / + (double)1.0e9 + << " GB/s" << 
std::endl; + executor.setMeasureKernelTimeFlag(false); + } + + return outputs; } void FusionKernelRuntime::prepareRuntimeOrder() { @@ -443,7 +497,9 @@ void FusionKernelRuntime::prepareRuntimeOrder() { std::vector FusionKernelRuntime::runWithInput( const at::ArrayRef& inputs, size_t input_id) { - if (is_segmented_) { + if (!is_segmented_) { + return runKernelWithInput(inputs, input_id); + } else { FUSER_PERF_SCOPE("FusionKernelRuntime::runMultiKernelWithInput"); TORCH_INTERNAL_ASSERT( @@ -453,6 +509,7 @@ std::vector FusionKernelRuntime::runWithInput( " inputs but expecting ", segmented_fusion_->inputs().size()); + c10::Device device(c10::DeviceType::CUDA, 0); int extent_index_ = 0; // Bind input in the tensor_map for (const auto i : c10::irange(inputs.size())) { @@ -466,6 +523,7 @@ std::vector FusionKernelRuntime::runWithInput( // more convenient and safer than replication if (inputs[i].isTensor()) { auto aten_tensor = inputs[i].toTensor(); + device = aten_tensor.device(); for (auto dim_size : aten_tensor.sizes()) { runtime_workspace_.tensor_map.emplace( runtime_workspace_.group_extent_binding_order[extent_index_++], @@ -474,6 +532,10 @@ std::vector FusionKernelRuntime::runWithInput( } } + if (isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) { + std::cout << "=================RUNNING FUSION SEGMENTS=================" + << std::endl; + } for (auto group_to_run : runtime_workspace_.group_run_order) { // Prepare input vector for (auto input : group_to_run->inputs()) { @@ -497,6 +559,10 @@ std::vector FusionKernelRuntime::runWithInput( runtime_workspace_.group_runtime_outputs.clear(); } + if (isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) { + std::cout << "=============FINISHED RUNNING FUSION SEGMENTS============" + << std::endl; + } // Produce final global output std::vector fusion_outputs; for (auto output : segmented_fusion_->outputs()) { @@ -504,14 +570,36 @@ std::vector FusionKernelRuntime::runWithInput( if (iter != runtime_workspace_.tensor_map.end()) { fusion_outputs.push_back(iter->second); } else { + bool empty_type_check = output->getDataType().has_value() && + output->getDataType().value() == DataType::Float; + + // Only support two cases of empty tensor here, since + // this is hot path. + auto out_tv = output->as(); + + // TODO: should be only one of the two once the "empty" + // definition has been unified throughout the ops. + bool empty_tensor_check = + out_tv->isZeroDim() || out_tv->isEmptyTensor(); + // This is the check for an empty tensor; TORCH_INTERNAL_ASSERT( - output->as()->nDims() == 0 && - output->getDataType().has_value() && - output->getDataType().value() == DataType::Float, - "Non empty tensor cannot be found at tensor_map in ", + empty_tensor_check && empty_type_check, + "Is empty tensor? ", + !empty_tensor_check, + " Is empty type check? ", + !empty_type_check, + " Output empty tensor check failed for tensor: ", + out_tv->toString(), + " In function: ", __FUNCTION__); - fusion_outputs.emplace_back(at::Tensor()); + + // TODO: would need to clean up this part when + // we have a unified and consistent way to generate + // size-0 tensors. 
+ const auto tensor_options = + at::TensorOptions().dtype(at::kFloat).device(device); + fusion_outputs.emplace_back(at::empty({0}, tensor_options)); } } @@ -529,8 +617,6 @@ std::vector FusionKernelRuntime::runWithInput( runtime_workspace_.tensor_map.clear(); return fusion_output_tensors; - } else { - return runKernelWithInput(inputs, input_id); } } @@ -619,6 +705,8 @@ void GraphCache::createFusion(const std::shared_ptr& graph) { fusion_executor_cache_ = std::make_unique(parseJitIR(graph)); + + num_of_outputs_ = graph->outputs().size(); } // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) @@ -627,6 +715,8 @@ GraphCache::GraphCache(const std::shared_ptr& graph) { TORCH_INTERNAL_ASSERT( IsNewExecutorEnabled(), "legacy executor is not supported by nvfuser"); + GRAPH_DEBUG("GraphCache constructor: ", this); + GRAPH_DUMP("GraphCache created for graph", graph); createFusion(graph); } @@ -634,7 +724,16 @@ std::vector GraphCache::runGraphWithInputs( const at::ArrayRef& inputs) { FUSER_PERF_SCOPE("GraphCache::runGraphWithInputs"); - return fusion_executor_cache_->runFusionWithInputs(inputs); + GRAPH_DEBUG("running GraphCache: ", this); + auto outputs = fusion_executor_cache_->runFusionWithInputs(inputs); + TORCH_INTERNAL_ASSERT( + outputs.size() == num_of_outputs_, + "FusionExecutorCache returned ", + outputs.size(), + " outputs, doesn't match computational graph, which requires ", + num_of_outputs_); + + return outputs; } } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/kernel_cache.h b/torch/csrc/jit/codegen/cuda/kernel_cache.h index ae84c25e4f23..2958822a2f81 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_cache.h +++ b/torch/csrc/jit/codegen/cuda/kernel_cache.h @@ -7,8 +7,8 @@ #include #include +#include #include -#include #include #include @@ -27,7 +27,6 @@ class SchedulerRuntimeInfo; struct ExecutorLog { c10::optional reduction_params = c10::nullopt; c10::optional pointwise_params = c10::nullopt; - c10::optional launch_constraints = c10::nullopt; FusionExecutor* fusion_executor = nullptr; }; @@ -127,9 +126,8 @@ class TORCH_CUDA_CU_API FusionKernelRuntime { private: //! Interface to run a single kernel, either one kernel for single-kernel - //! fusions, - //! or a kernel for a segmentedGrouup in a segmented fusion. Returns the - //! kernel outputs. + //! fusions, or a kernel for a segmentedGrouup in a segmented fusion. Returns + //! the kernel outputs. std::vector runKernelWithInput( const at::ArrayRef& inputs, size_t input_id, @@ -410,6 +408,11 @@ class TORCH_CUDA_CU_API FusionExecutorCache { //! TODO: this can be largely expanded to look at complete //! caching profiles. Currently it just makes it easier to test FusionKernelRuntime* most_recent_runtime_ = nullptr; + + //! indices of fusion outputs that are aliased to inputs. These are used only + //! to support in-place update and should have been dropped before pushing + //! outputs to stack. + std::set aliased_output_indices_; }; class GraphCache { @@ -426,15 +429,15 @@ class GraphCache { const at::ArrayRef& inputs); private: - //! Computation graph; - std::shared_ptr graph_; - //! construct FusionExecutorCache void createFusion(const std::shared_ptr& graph); private: //! FusionExecutorCache that performs schedule and kernel execution; std::unique_ptr fusion_executor_cache_; + + //! 
num of outputs + size_t num_of_outputs_ = 0; }; } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.cpp b/torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.cpp index 7421d2e235a6..3605f7a4155f 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.cpp +++ b/torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.cpp @@ -1,7 +1,6 @@ #include #include -#include #include @@ -16,11 +15,11 @@ void ExpressionEvaluator::bind( Int::ScalarType concrete_value) { TORCH_CHECK(value->isScalar()); TORCH_CHECK(value->dtype() == DataType::Int); - TORCH_CHECK(!value->isConst(), "Tried to bind to a constant value"); + TORCH_CHECK(!value->isConstScalar(), "Tried to bind to a constant value"); TORCH_CHECK( value->definition() == nullptr, "Tried to bind to a value that is computed in the kernel IR: ", - toString(value), + value->toString(), " with ", concrete_value); known_values_[value] = concrete_value; @@ -41,14 +40,18 @@ void ExpressionEvaluator::bind( c10::optional ExpressionEvaluator::evaluate(const Val* value) { if (precomputed_integers_ && precomputed_integers_->ready()) { - return precomputed_integers_->getMaybeValueFor(value); - } else if (value->isScalar() && value->isConst()) { + if (precomputed_integers_->getMaybeValueFor(value).has_value()) { + return precomputed_integers_->getMaybeValueFor(value); + } + } + + if (value->isScalar() && value->isConst()) { return value->as()->value(); } else { FUSER_PERF_SCOPE("kir::ExpressionEvaluator::evaluate"); - TORCH_CHECK(value->isScalar()); - TORCH_CHECK(value->dtype() == DataType::Int); + TORCH_CHECK(value->isScalar(), value->toString()); + TORCH_CHECK(value->dtype() == DataType::Int, value->toString()); // Is the value known (either explicit binding or memoized)? const auto pre_eval_it = known_values_.find(value); @@ -56,7 +59,7 @@ c10::optional ExpressionEvaluator::evaluate(const Val* value) { return pre_eval_it->second; } - value->accept(this); + OptOutConstDispatch::handle(value); const auto post_eval_it = known_values_.find(value); return post_eval_it != known_values_.end() @@ -74,24 +77,23 @@ void ExpressionEvaluator::print() const { std::cout << "\nEvaluation context\n"; std::cout << "--------------------\n"; for (const auto& kv : known_values_) { - std::cout << toString(kv.first) << " = " << kv.second << "\n"; + std::cout << kv.first->toString() << " = " << kv.second << "\n"; + } + std::cout << "\nPre-computed Values\n"; + if (precomputed_integers_ != nullptr) { + precomputed_integers_->print(); } std::cout << "--------------------\n\n"; } -void ExpressionEvaluator::unhandled(const void*) { - TORCH_INTERNAL_ASSERT( - false, "Kernel IR expression evaluation reached an unsupported node"); -} - -void ExpressionEvaluator::visit(const Int* value) { +void ExpressionEvaluator::handle(const Int* value) { TORCH_INTERNAL_ASSERT(!value->isConst()); if (auto def = value->definition()) { - def->accept(this); + OptOutConstDispatch::handle(def); } } -void ExpressionEvaluator::visit(const NamedScalar* named_scalar) { +void ExpressionEvaluator::handle(const NamedScalar* named_scalar) { const auto& name = named_scalar->name(); for (auto pt : kParallelTypeThreads) { auto pt_val_it = known_parallel_dimensions_.find(pt); @@ -105,10 +107,10 @@ void ExpressionEvaluator::visit(const NamedScalar* named_scalar) { } } -void ExpressionEvaluator::visit(const UnaryOp* unary_op) { +void ExpressionEvaluator::handle(const UnaryOp* unary_op) { const auto in = evaluate(unary_op->in()); if (in.has_value()) { - switch (unary_op->operation()) { + switch 
(unary_op->getUnaryOpType()) { case UnaryOpType::Neg: known_values_[unary_op->out()] = -*in; break; @@ -121,11 +123,11 @@ void ExpressionEvaluator::visit(const UnaryOp* unary_op) { } } -void ExpressionEvaluator::visit(const BinaryOp* binary_op) { +void ExpressionEvaluator::handle(const BinaryOp* binary_op) { const auto lhs = evaluate(binary_op->lhs()); const auto rhs = evaluate(binary_op->rhs()); if (lhs.has_value() && rhs.has_value()) { - switch (binary_op->operation()) { + switch (binary_op->getBinaryOpType()) { case BinaryOpType::Add: known_values_[binary_op->out()] = *lhs + *rhs; break; diff --git a/torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h b/torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h index 647913875430..63586857ad85 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h +++ b/torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h @@ -1,7 +1,9 @@ #pragma once -#include +#include + +#include #include #include @@ -34,7 +36,7 @@ namespace kir { //! } //! ``` //! -class TORCH_CUDA_CU_API ExpressionEvaluator : private IrVisitor { +class TORCH_CUDA_CU_API ExpressionEvaluator : private OptInConstDispatch { public: //! Set a concrete value for a symbolic value void bind(const Val* value, Int::ScalarType concrete_value); @@ -56,11 +58,10 @@ class TORCH_CUDA_CU_API ExpressionEvaluator : private IrVisitor { } private: - void unhandled(const void*) final; - void visit(const Int* value) final; - void visit(const NamedScalar* named_scalar) final; - void visit(const UnaryOp* unary_op) final; - void visit(const BinaryOp* binary_op) final; + void handle(const Int* value) final; + void handle(const NamedScalar* named_scalar) final; + void handle(const UnaryOp* unary_op) final; + void handle(const BinaryOp* binary_op) final; private: std::unordered_map known_values_; diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir.cpp b/torch/csrc/jit/codegen/cuda/kernel_ir.cpp index eebfd41729cd..35537f7a4fcb 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_ir.cpp +++ b/torch/csrc/jit/codegen/cuda/kernel_ir.cpp @@ -1,8 +1,7 @@ +#include #include #include #include -#include -#include #include #include #include @@ -15,369 +14,52 @@ namespace fuser { namespace cuda { namespace kir { -void Node::print() const { - std::cout << "\n"; - IrPrinter(std::cout).printNode(this); - std::cout << "\n"; -} - -Val::Val(Passkey passkey, DataType dtype) : Node(passkey), dtype_(dtype) { - // NOLINTNEXTLINE: https://bugs.llvm.org/show_bug.cgi?id=48534 - id_ = passkey.kernel->newValueId(passkey); -} - -namespace { - -// Traverse definition of all values involved in constructing the provided val. -// Check if all values involved are constant values, meaning the provided -// val is also a constant value. 
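// The kernel_expr_evaluator.cpp/.h hunks above swap the old IrVisitor
// accept() dispatch for OptOutConstDispatch::handle(), but the evaluation
// strategy itself is unchanged: bind concrete integers to symbolic scalars,
// then fold expressions bottom-up while memoizing results. A minimal sketch
// of that strategy follows; the Toy* types are illustrative stand-ins, not
// nvFuser's actual classes.
#include <cstdint>
#include <optional>
#include <unordered_map>

enum class ToyOpType { Add, Mul };

struct ToyBinaryOp;

struct ToyVal {
  std::optional<int64_t> constant;          // set for literal scalars
  const ToyBinaryOp* definition = nullptr;  // producing expression, if any
};

struct ToyBinaryOp {
  ToyOpType op;
  const ToyVal* lhs;
  const ToyVal* rhs;
};

class ToyExpressionEvaluator {
 public:
  // Mirrors ExpressionEvaluator::bind: attach a concrete value to a symbol.
  void bind(const ToyVal* v, int64_t value) {
    known_values_[v] = value;
  }

  // Mirrors ExpressionEvaluator::evaluate: constants and previously bound or
  // memoized values are returned directly, everything else is folded through
  // its definition (cf. the switch on getBinaryOpType() above).
  std::optional<int64_t> evaluate(const ToyVal* v) {
    if (v->constant.has_value()) {
      return v->constant;
    }
    auto it = known_values_.find(v);
    if (it != known_values_.end()) {
      return it->second;
    }
    if (v->definition == nullptr) {
      return std::nullopt;
    }
    auto lhs = evaluate(v->definition->lhs);
    auto rhs = evaluate(v->definition->rhs);
    if (!lhs.has_value() || !rhs.has_value()) {
      return std::nullopt;
    }
    const int64_t result =
        v->definition->op == ToyOpType::Add ? *lhs + *rhs : *lhs * *rhs;
    known_values_[v] = result;  // memoize, like known_values_ in the real class
    return result;
  }

 private:
  std::unordered_map<const ToyVal*, int64_t> known_values_;
};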
-class ConstCheck : IrVisitor { - private: - bool is_const_ = true; - - using IrVisitor::visit; - - void visit(const Bool* b) override { - is_const_ = is_const_ && b->isConst(); - } - - void visit(const Double* d) override { - is_const_ = is_const_ && d->isConst(); - } - - void visit(const Int* i) override { - is_const_ = is_const_ && i->isConst(); - } - - void visit(const NamedScalar* ns) override { - is_const_ = is_const_ && false; - } - - void visit(const Expr* expr) { - for (auto inp : expr->inputs()) { - visit(inp); - } - } - - void visit(const Val* val) { - if (val->definition() != nullptr) { - visit(val->definition()); - } else { - val->accept(this); - } - } - - public: - static bool isConst(const Val* val) { - ConstCheck cc; - cc.visit(val); - return cc.is_const_; - } -}; - -} // namespace - -bool Val::isConstScalar() const { - if (!isScalar()) - return false; - return ConstCheck::isConst(this); -} - -Expr* Expr::parentScope() const { - if (scope()) { - return scope()->owner(); - } else { - return nullptr; - } -} - -NamedScalar* NamedScalar::getParallelDim(ParallelType p_type) { - std::string parallel_dim = stringifyThreadSize(p_type); - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - return ir_builder.create(parallel_dim, DataType::Int); -} - -NamedScalar* NamedScalar::getParallelIndex(ParallelType p_type) { - std::string parallel_ind = stringifyThread(p_type); - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - return ir_builder.create(parallel_ind, DataType::Int); -} - -c10::optional NamedScalar::getParallelDim() const { - if (stringifyThreadSize(ParallelType::TIDx).compare(name()) == 0) { - return c10::optional(ParallelType::TIDx); - } else if (stringifyThreadSize(ParallelType::TIDy).compare(name()) == 0) { - return c10::optional(ParallelType::TIDy); - } else if (stringifyThreadSize(ParallelType::TIDz).compare(name()) == 0) { - return c10::optional(ParallelType::TIDz); - } else if (stringifyThreadSize(ParallelType::BIDx).compare(name()) == 0) { - return c10::optional(ParallelType::BIDx); - } else if (stringifyThreadSize(ParallelType::BIDy).compare(name()) == 0) { - return c10::optional(ParallelType::BIDy); - } else if (stringifyThreadSize(ParallelType::BIDz).compare(name()) == 0) { - return c10::optional(ParallelType::BIDz); - } - return c10::nullopt; -} - -c10::optional NamedScalar::getParallelIndex() const { - if (stringifyThread(ParallelType::TIDx).compare(name()) == 0) { - return c10::optional(ParallelType::TIDx); - } else if (stringifyThread(ParallelType::TIDy).compare(name()) == 0) { - return c10::optional(ParallelType::TIDy); - } else if (stringifyThread(ParallelType::TIDz).compare(name()) == 0) { - return c10::optional(ParallelType::TIDz); - } else if (stringifyThread(ParallelType::BIDx).compare(name()) == 0) { - return c10::optional(ParallelType::BIDx); - } else if (stringifyThread(ParallelType::BIDy).compare(name()) == 0) { - return c10::optional(ParallelType::BIDy); - } else if (stringifyThread(ParallelType::BIDz).compare(name()) == 0) { - return c10::optional(ParallelType::BIDz); - } - return c10::nullopt; -} - -IterDomain::IterDomain(Passkey passkey, Val* start, Val* extent) - : Val(passkey, DataType::Int), - start_(start), - stop_(extent), - extent_(extent) {} - -IterDomain::IterDomain( - Passkey passkey, - const fuser::cuda::IterDomain* iter_domain) - : Val(passkey, iter_domain->getDataType().value()), - start_(GpuLower::current()->lowerValue(iter_domain->start())), - stop_(GpuLower::current()->lowerValue(iter_domain->stop())), - 
extent_(GpuLower::current()->lowerValue(iter_domain->extent())), - parallel_type_(iter_domain->getParallelType()), - iter_type_(iter_domain->getIterType()), - is_rfactor_domain_(iter_domain->isRFactorProduct()), - is_simple_(iter_domain->definition() == nullptr), - is_padded_dimension_(iter_domain->hasPaddingToMultipleOfWarp()) { - // preserve the fusion node's name - setName(iter_domain->name()); -} - -//! Note that the parallel dimension, if available, may be different -//! from the actual extent of this IterDomain as the parallel -//! dimension is determined by the largest extent of IterDomains -//! sharing the same loop. -Val* IterDomain::extent() const { - TORCH_INTERNAL_ASSERT(extent_ != nullptr); - return extent_; -} - -TensorDomain::TensorDomain(Passkey passkey, std::vector domain) - : Val(passkey, DataType::Null), root_domain_(std::move(domain)) { - domain_ = root_domain_; - resetDomains(); -} - -TensorDomain::TensorDomain( - Passkey passkey, - const fuser::cuda::TensorDomain* tensor_domain) - : Val(passkey, DataType::Null), contiguity_(tensor_domain->contiguity()) { - // preserve the fusion node's name - setName(tensor_domain->name()); - - const auto lowerIterDomains = - [](const std::vector& domains) { - std::vector lowered_domains; - lowered_domains.reserve(domains.size()); - for (const auto iter_domain : domains) { - lowered_domains.push_back( - GpuLower::current()->lowerValue(iter_domain)->as()); - } - return lowered_domains; - }; - - root_domain_ = lowerIterDomains(tensor_domain->getRootDomain()); - domain_ = lowerIterDomains(tensor_domain->domain()); - no_bcast_domain_ = lowerIterDomains(tensor_domain->noBroadcasts()); - no_reduction_domain_ = lowerIterDomains(tensor_domain->noReductions()); - rfactor_domain_ = lowerIterDomains(tensor_domain->getRFactorDomain()); -} - -bool TensorDomain::hasReduction() const { - return no_reduction_domain_.size() != domain_.size(); -} - -bool TensorDomain::hasBlockReduction() const { - return std::any_of(domain_.begin(), domain_.end(), [](IterDomain* id) { - return id->isReduction() && id->isThreadDim(); - }); -} - -bool TensorDomain::hasGridReduction() const { - return std::any_of(domain_.begin(), domain_.end(), [](IterDomain* id) { - return id->isReduction() && id->isBlockDim(); - }); -} - -bool TensorDomain::hasBlockBroadcast() const { - return std::any_of(domain_.begin(), domain_.end(), [](IterDomain* id) { - return id->isBroadcast() && id->isThreadDim(); - }); -} - -bool TensorDomain::hasGridBroadcast() const { - return std::any_of(domain_.begin(), domain_.end(), [](IterDomain* id) { - return id->isBroadcast() && id->isBlockDim(); - }); -} - -bool TensorDomain::hasBroadcast() const { - return no_bcast_domain_.size() != domain_.size(); -} - -bool TensorDomain::hasRFactor() const { - return !rfactor_domain_.empty(); -} - -bool TensorDomain::hasVectorize() const { - return std::any_of(domain_.begin(), domain_.end(), [](IterDomain* id) { - return id->parallelType() == ParallelType::Vectorize || - id->parallelType() == ParallelType::MisalignedVectorize; - }); -} - -IterDomain* TensorDomain::axis(int i) const { - TORCH_INTERNAL_ASSERT(i >= 0 && i < int(domain_.size())); - return domain_[i]; -} - -std::vector TensorDomain::noReductions( - const std::vector& td) { - std::vector no_reduction_domains; - for (auto id : td) { - if (!id->isReduction()) { - no_reduction_domains.push_back(id); - } - } - return no_reduction_domains; -} - -std::vector TensorDomain::noBroadcasts( - const std::vector& td) { - std::vector no_broadcast_domains; - for (auto 
id : td) { - if (!id->isBroadcast()) { - no_broadcast_domains.push_back(id); - } - } - return no_broadcast_domains; -} - -TensorView::TensorView(Passkey passkey, const fuser::cuda::TensorView* tv) - : Val(passkey, tv->getDataType().value()), fuser_tv_(tv) { - setName(tv->name()); - domain_ = GpuLower::current()->lowerValue(tv->domain())->as(); - memory_type_ = tv->getMemoryType(); -} - -TensorView::TensorView( - Passkey passkey, - DataType dtype, - TensorDomain* domain, - MemoryType memory_type) - : Val(passkey, dtype), domain_(domain), memory_type_(memory_type) {} - -UnaryOp::UnaryOp(Passkey passkey, UnaryOpType operation, Val* out, Val* in) - : Expr(passkey), operation_(operation), out_(out), in_(in) { - addOutput(out); - addInput(in); -} - -BinaryOp::BinaryOp( - Passkey passkey, - BinaryOpType operation, - Val* out, - Val* lhs, - Val* rhs) - : Expr(passkey), operation_(operation), out_(out), lhs_(lhs), rhs_(rhs) { - addOutput(out); - addInput(lhs); - addInput(rhs); +Predicate::Predicate( + IrBuilderPasskey passkey, + PredicateType ptype, + const Expr* expr, + Bool* thread_pred) + : Val(passkey, ValType::Predicate, DataType::Bool), + ptype_(ptype), + expr_(expr), + thread_pred_(thread_pred) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); + TORCH_INTERNAL_ASSERT( + ptype != PredicateType::Unswitch && ptype != PredicateType::Manual); } -TernaryOp::TernaryOp( - Passkey passkey, - TernaryOpType operation, - Val* out, - Val* in1, - Val* in2, - Val* in3) - : Expr(passkey), - operation_(operation), - out_(out), - in1_(in1), - in2_(in2), - in3_(in3) { - addOutput(out); - addInput(in1); - addInput(in2); - addInput(in3); -} - -ReductionOp::ReductionOp( - Passkey passkey, - BinaryOpType operation, - Val* init, - Val* out, - Val* in) - : Expr(passkey), operation_(operation), init_(init), out_(out), in_(in) { - addOutput(out); - addInput(in); -} - -WelfordOp::WelfordOp( - Passkey passkey, - Val* out_var, - Val* out_avg, - Val* out_N, - Val* init_var, - Val* init_avg, - Val* init_N, - Val* in_var, - Val* in_avg, - Val* in_N) - : Expr(passkey), - out_var_(out_var), - out_avg_(out_avg), - out_N_(out_N), - init_var_(init_var), - init_avg_(init_avg), - init_N_(init_N), - in_var_(in_var), - in_avg_(in_avg), - in_N_(in_N) { - addOutput(out_avg); - addOutput(out_var); - addOutput(out_N); - - if (!in_N->isOneInt()) { - addInput(in_var); - } - addInput(in_avg); - addInput(in_N); +Predicate::Predicate(IrBuilderPasskey passkey, ForLoop* unrolled_loop) + : Val(passkey, ValType::Predicate, DataType::Bool), + ptype_(PredicateType::Unswitch), + unrolled_loop_(unrolled_loop) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); + TORCH_INTERNAL_ASSERT(unrolled_loop != nullptr); } -BroadcastOp::BroadcastOp(Passkey passkey, Val* out, Val* in) - : Expr(passkey), out_(out), in_(in) { - TORCH_CHECK(in->isA() || in->isA()); - TORCH_CHECK(out->isA() || out->isA()); - addOutput(out); - addInput(in); +Predicate::Predicate(IrBuilderPasskey passkey, Bool* value) + : Val(passkey, ValType::Predicate, DataType::Bool), + ptype_(PredicateType::Manual), + value_(value) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); + TORCH_INTERNAL_ASSERT(value != nullptr); } TensorIndex::TensorIndex( - Passkey passkey, - const fuser::cuda::TensorView* view, + IrBuilderPasskey passkey, + const TensorView* view, std::vector indices) - : Val(passkey, view->getDataType().value()), - 
view_(GpuLower::current()->lowerValue(view)->as()), + : Val(passkey, ValType::TensorIndex, view->getDataType().value()), + view_(view), indices_(indices) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); TORCH_INTERNAL_ASSERT( std::all_of( indices.begin(), @@ -392,20 +74,41 @@ TensorIndex::TensorIndex( indices_.end()); // If indices becomes empty, just put one ZeroInt if (indices_.empty()) { - indices_.push_back(kir::IrBuilder(GpuLower::current()->kernel()).zeroVal()); + indices_.push_back(FusionGuard::getCurFusion()->zeroVal()); } } -Sync::Sync(Passkey passkey, bool war_sync) - : Expr(passkey), war_sync_(war_sync) {} +BlockSync::BlockSync(IrBuilderPasskey passkey, bool war_sync) + : Expr(passkey, ExprType::BlockSync), war_sync_(war_sync) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} + +GridSync::GridSync( + IrBuilderPasskey passkey, + ParallelTypeBitmap sync_dims, + Val* sync_buffer) + : Expr(passkey, ExprType::GridSync), + sync_dims_(sync_dims), + sync_buffer_(sync_buffer) {} -InitMagicZero::InitMagicZero(Passkey passkey) : Expr(passkey) {} +InitMagicZero::InitMagicZero(IrBuilderPasskey passkey) + : Expr(passkey, ExprType::InitMagicZero) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} -UpdateMagicZero::UpdateMagicZero(Passkey passkey) : Expr(passkey) {} +UpdateMagicZero::UpdateMagicZero(IrBuilderPasskey passkey) + : Expr(passkey, ExprType::UpdateMagicZero) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} void Scope::insert(std::vector::const_iterator pos, Expr* expr) { exprs_.insert(pos, expr); - expr->setScope(this); } void Scope::insert_before(Expr* ref, Expr* expr) { @@ -439,12 +142,7 @@ void Scope::insert(size_t pos, Expr* expr) { void Scope::erase(std::vector::const_iterator pos) { // Remove the scope of the expr if this is the scope - auto expr = *pos; - TORCH_INTERNAL_ASSERT( - expr->scope() == this, - "Inconsistent scoping of expression detected: ", - kir::toString(expr)); - expr->setScope(nullptr); + C10_UNUSED auto expr = *pos; exprs_.erase(pos); } @@ -470,7 +168,7 @@ void Scope::clear() { } ForLoop::ForLoop( - Passkey passkey, + IrBuilderPasskey passkey, IterDomain* iter_domain, Val* index, Val* start, @@ -479,7 +177,7 @@ ForLoop::ForLoop( bool vectorize, Val* vectorize_shift, bool unroll_required) - : Expr(passkey), + : Expr(passkey, ExprType::ForLoop), iter_domain_{iter_domain}, index_(index), start_(start), @@ -489,43 +187,43 @@ ForLoop::ForLoop( vectorize_shift_(vectorize_shift), unroll_required_(unroll_required), body_(this) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); TORCH_INTERNAL_ASSERT(index->dtype() == DataType::Int); addInput(index); addInput(iter_domain); if (start_ == nullptr && iter_domain->isThread()) { - start_ = - IrBuilder(GpuLower::current()->kernel()) - .create( - stringifyThread(iter_domain->parallelType()), DataType::Int); + start_ = NamedScalar::getParallelIndex(iter_domain->getParallelType()); } if (step_ == nullptr) { if (iter_domain->isThread()) { - step_ = IrBuilder(GpuLower::current()->kernel()) - .create( - stringifyThreadSize(iter_domain->parallelType()), - DataType::Int); + step_ = NamedScalar::getParallelDim(iter_domain->getParallelType()); } else { - step_ = IrBuilder(GpuLower::current()->kernel()).oneVal(); + step_ = 
FusionGuard::getCurFusion()->oneVal(); } } } -ForLoop::ForLoop(Passkey passkey, IterDomain* iter_domain) +ForLoop::ForLoop(IrBuilderPasskey passkey, IterDomain* iter_domain) : ForLoop( passkey, iter_domain, - iter_domain->isBroadcast() - ? IrBuilder(GpuLower::current()->kernel()).zeroVal() - : IrBuilder(GpuLower::current()->kernel()) - .create(c10::nullopt), + iter_domain->isBroadcast() ? FusionGuard::getCurFusion()->zeroVal() + : IrBuilder::create(c10::nullopt), nullptr, nullptr, nullptr, - isParallelTypeVectorize(iter_domain->parallelType()), + !iter_domain->isBroadcast() && + isParallelTypeVectorize(iter_domain->getParallelType()), nullptr, - false) {} + false) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} -ForLoop::ForLoop(Passkey passkey, const ForLoop* other) +ForLoop::ForLoop(IrBuilderPasskey passkey, const ForLoop* other) : ForLoop( passkey, other->iter_domain(), @@ -535,7 +233,11 @@ ForLoop::ForLoop(Passkey passkey, const ForLoop* other) other->step(), other->vectorize(), other->vectorize_shift(), - other->isUnrollRequired()) {} + other->isUnrollRequired()) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} bool ForLoop::isUnrollable() const { // Start and stop must be constant, must not be a broadcast @@ -550,7 +252,7 @@ bool ForLoop::isUnrolled() const { if (isUnrollRequired() && !isUnrollable()) { TORCH_WARN( "Unroll required but not possible. Register allocation disabled. Loop index: ", - kir::toString(index_)); + index_->toString()); return false; } @@ -570,7 +272,7 @@ bool ForLoop::isUnrolled() const { } // Unrolling is technically possible but avoided - if (iter_domain()->parallelType() == ParallelType::Unswitch) { + if (iter_domain()->getParallelType() == ParallelType::Unswitch) { // Use ParallelType::Unroll if unrolling is desired. Note that // unswitched size-one loops are not unrolled as they are not // materialized as actual for-loops. @@ -605,8 +307,53 @@ Val* ForLoop::step() const { return step_; } -IfThenElse::IfThenElse(Passkey passkey, Predicate* cond) - : Expr(passkey), then_body_(this), else_body_(this) { +bool ForLoop::isTrivial() const { + // These loops are not materialized + if (vectorize() || iter_domain()->isBroadcast() || + iter_domain()->isStride() || iter_domain()->isMma()) { + return true; + } + + // By default, a parallelized loop would look like: + // + // for (int x = threadIdx.x; x < stop; x += blockDim.x) { + // do_some_comp(x); + // } + // + // When stop is guaranteed to be smaller or equal to the number of + // threads, the for-loop is not necessary. In the above case, we + // would just generate the loop body without the for clause but + // references to the loop index replaced by the loop start value. + // + // When the loop end is the same as the IterDomain extent, the + // assumption can be safely made. This is more conservative than + // necessary since the loop stop value just needs to be <= the + // IterDomain extent. However, at this point, this conservative + // analysis seems sufficient. 
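  // As a concrete example (blockDim.x == 128 and stop == 128 are assumed
  // values, and do_some_comp is the placeholder from the comment above): the
  // parallelized loop
  //
  //   for (int x = threadIdx.x; x < 128; x += blockDim.x) {
  //     do_some_comp(x);
  //   }
  //
  // is trivial and can be emitted as just
  //
  //   do_some_comp(threadIdx.x);
  //
  // with the loop index replaced by the loop start value.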
+ if (stop() == iter_domain()->extent() && iter_domain()->isThread()) { + return true; + } + + // Extent-1 loop: for (int i = 0; i < 1; ++i) { + if (start()->isZeroInt() && stop()->isOneInt() && step()->isOneInt()) { + return true; + } + + // Another extent-1 loop: for (int i = N - 1; i < N; ++i) { + if (start()->definition() != nullptr && + start()->definition()->isA() && + start()->definition()->as()->getBinaryOpType() == + BinaryOpType::Sub && + start()->definition()->as()->lhs() == stop() && + start()->definition()->as()->rhs()->isOneInt()) { + return true; + } + + return false; +} + +IfThenElse::IfThenElse(IrBuilderPasskey passkey, Predicate* cond) + : Expr(passkey, ExprType::IfThenElse), then_body_(this), else_body_(this) { setPredicate(cond); addInput(cond); } @@ -621,17 +368,19 @@ Val* TensorIndex::index(int i) const { } Allocate::Allocate( - Passkey passkey, + IrBuilderPasskey passkey, Val* buffer, MemoryType memory_type, std::vector shape, bool zero_init) - : Expr(passkey), + : Expr(passkey, ExprType::Allocate), buffer_(buffer), memory_type_(memory_type), shape_(std::move(shape)), zero_init_(zero_init) { - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); if (!shape_.empty()) { TORCH_INTERNAL_ASSERT( (shape_.size() == 1 && shape_[0]->isOneInt()) || @@ -639,7 +388,7 @@ Allocate::Allocate( } else { TORCH_INTERNAL_ASSERT(buffer_->isA()); TORCH_INTERNAL_ASSERT( - buffer_->as()->memoryType() == memory_type_); + buffer_->as()->getMemoryType() == memory_type_); const auto domain = buffer_->as()->domain(); for (auto axis : domain->noReductions()) { shape_.push_back(axis->extent()); @@ -650,19 +399,19 @@ Allocate::Allocate( if (size_ == nullptr) { size_ = s; } else { - size_ = ir_builder.mulExpr(size_, s); + size_ = IrBuilder::mulExpr(size_, s); } } if (size_ == nullptr) { - size_ = ir_builder.oneVal(); + size_ = FusionGuard::getCurFusion()->oneVal(); } addInput(size_); } Allocate::Allocate( - Passkey passkey, + IrBuilderPasskey passkey, Val* buffer, MemoryType memory_type, Val* size, @@ -672,31 +421,158 @@ Allocate::Allocate( buffer, memory_type, size == nullptr ? 
std::vector{} : std::vector{size}, - zero_init) {} + zero_init) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} GridReduction::GridReduction( - Passkey passkey, - ReductionOp* reduction_op, + IrBuilderPasskey passkey, + BinaryOpType reduction_op_type, + Val* init, + Val* out, + Val* in, Allocate* reduction_buffer, - Allocate* sync_buffer) - : Expr(passkey), - reduction_op_(reduction_op), + Allocate* sync_buffer, + Val* entrance_index, + Val* entrances, + bool is_allreduce) + : ReductionOp( + passkey, + reduction_op_type, + init, + out, + in, + is_allreduce, + ExprType::GridReduction), reduction_buffer_(reduction_buffer), - sync_buffer_(sync_buffer) {} + sync_buffer_(sync_buffer), + entrance_index_(entrance_index), + entrances_(entrances) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} + +GroupedGridReduction::GroupedGridReduction( + IrBuilderPasskey passkey, + std::vector reduction_op_types, + std::vector init_vals, + std::vector outputs, + std::vector inputs, + std::vector reduction_buffers, + Allocate* sync_buffer, + bool is_fused) + : GroupedReductionOp( + passkey, + std::move(reduction_op_types), + std::move(init_vals), + std::move(outputs), + std::move(inputs), + is_fused, + ExprType::GroupedGridReduction), + reduction_buffers_(std::move(reduction_buffers)), + sync_buffer_(sync_buffer) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} + +GridBroadcast::GridBroadcast( + IrBuilderPasskey passkey, + BroadcastOp* broadcast_op, + Allocate* broadcast_buffer, + Allocate* sync_buffer) + : Expr(passkey, ExprType::GridBroadcast), + broadcast_op_(broadcast_op), + broadcast_buffer_(broadcast_buffer), + sync_buffer_(sync_buffer) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} GridWelford::GridWelford( - Passkey passkey, + IrBuilderPasskey passkey, WelfordOp* welford_op, Allocate* var_buffer, Allocate* avg_buffer, Allocate* n_buffer, - Allocate* sync_buffer) - : Expr(passkey), + Allocate* sync_buffer, + Val* entrance_index, + Val* entrances) + : Expr(passkey, ExprType::GridWelford), welford_op_(welford_op), var_buffer_(var_buffer), avg_buffer_(avg_buffer), n_buffer_(n_buffer), - sync_buffer_(sync_buffer) {} + sync_buffer_(sync_buffer), + entrance_index_(entrance_index), + entrances_(entrances) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} + +AllocateFusedReduction::AllocateFusedReduction( + IrBuilderPasskey passkey, + GridReduction* grid_reduction) + : Expr(passkey, ExprType::AllocateFusedReduction), + grid_expr_(grid_reduction) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} + +AllocateFusedReduction::AllocateFusedReduction( + IrBuilderPasskey passkey, + GridWelford* grid_welford) + : Expr(passkey, ExprType::AllocateFusedReduction), + grid_expr_(grid_welford) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} + +AllocateFusedReduction::AllocateFusedReduction( + IrBuilderPasskey passkey, + GroupedGridReduction* grouped_grid_reduction) + : Expr(passkey, ExprType::AllocateFusedReduction), + grid_expr_(grouped_grid_reduction) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} + +TensorIndex* AllocateFusedReduction::out() const { 
+ TORCH_INTERNAL_ASSERT(grid_expr_ != nullptr); + if (grid_expr_->isA() || + grid_expr_->isA()) { + return grid_expr_->outputs().at(0)->as(); + } else if (auto grid_welford = dynamic_cast(grid_expr_)) { + return grid_welford->welford_op()->out()->as(); + } else { + TORCH_INTERNAL_ASSERT( + false, "Invalid grid expression: ", grid_expr_->toString()); + } +} + +const ParallelTypeBitmap& AllocateFusedReduction::threadPredicate() const { + TORCH_INTERNAL_ASSERT(grid_expr_ != nullptr); + if (auto grid_reduction = dynamic_cast(grid_expr_)) { + return grid_reduction->threadPredicate(); + } else if (auto grid_welford = dynamic_cast(grid_expr_)) { + return grid_welford->threadPredicate(); + } else if ( + auto grouped_grid_reduction = + dynamic_cast(grid_expr_)) { + return grouped_grid_reduction->threadPredicate(); + } else { + TORCH_INTERNAL_ASSERT( + false, "Invalid grid expression: ", grid_expr_->toString()); + } +} } // namespace kir } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir.h b/torch/csrc/jit/codegen/cuda/kernel_ir.h index c1ac6052783d..99ebdba5bab3 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_ir.h +++ b/torch/csrc/jit/codegen/cuda/kernel_ir.h @@ -1,1163 +1,154 @@ #pragma once -#include -#include - -// TODO(kir): remove these once the Kernel IR is separated from Fusion IR -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include - -namespace torch { -namespace jit { -namespace fuser { -namespace cuda { -namespace kir { - -class IrBuilder; -class Kernel; - -// Abstract nodes -class Node; -class Val; -class Expr; - -// Values -class NamedScalar; -class Predicate; -class Bool; -class Double; -class Int; -class IterDomain; -class TensorDomain; -class TensorView; -class TensorIndex; - -// Expressions -class UnaryOp; -class BinaryOp; -class TernaryOp; -class ReductionOp; -class WelfordOp; -class BroadcastOp; - -// Statements -class Allocate; -class Sync; -class InitMagicZero; -class UpdateMagicZero; -class ForLoop; -class IfThenElse; -class GridReduction; -class GridBroadcast; -class GridWelford; - -// Expr container -class Scope; - -using ValueId = int32_t; - -//! Token used to restrict the access to Kernel IR creation -//! -//! A token is associated with a kernel, which is passed with the key -//! (Passkey::kernel) -//! -//! It is a "granular friendship" token, used to implement the "passkey" idiom: -//! https://www.spiria.com/en/blog/desktop-software/passkey-idiom-and-better-friendship-c -//! https://arne-mertz.de/2016/10/passkey-idiom -//! -class Passkey { - friend class IrBuilder; - - public: - Kernel* const kernel = nullptr; - - private: - explicit Passkey(Kernel* kernel) : kernel(kernel) {} -}; - -//! 
Kernel IR visitor interface -class TORCH_CUDA_CU_API IrVisitor : public PolymorphicBase { - public: - // TODO(kir): use Node* instead of void* - virtual void unhandled(const void* node) {} - - // Values - virtual void visit(const NamedScalar* named_scalar) { - unhandled(named_scalar); - } - virtual void visit(const Predicate* value) { - unhandled(value); - } - virtual void visit(const Bool* value) { - unhandled(value); - } - virtual void visit(const Double* value) { - unhandled(value); - } - virtual void visit(const Int* value) { - unhandled(value); - } - virtual void visit(const IterDomain* iter_domain) { - unhandled(iter_domain); - } - virtual void visit(const TensorDomain* tensor_domain) { - unhandled(tensor_domain); - } - virtual void visit(const TensorView* tensor_view) { - unhandled(tensor_view); - } - virtual void visit(const TensorIndex* tensor_index) { - unhandled(tensor_index); - } - - // Expressions - virtual void visit(const UnaryOp* node) { - unhandled(node); - } - virtual void visit(const BinaryOp* node) { - unhandled(node); - } - virtual void visit(const TernaryOp* node) { - unhandled(node); - } - virtual void visit(const ReductionOp* node) { - unhandled(node); - } - virtual void visit(const WelfordOp* node) { - unhandled(node); - } - virtual void visit(const BroadcastOp* node) { - unhandled(node); - } - - // Statements - virtual void visit(const Allocate* node) { - unhandled(node); - } - virtual void visit(const Sync* node) { - unhandled(node); - } - virtual void visit(const InitMagicZero* node) { - unhandled(node); - } - virtual void visit(const UpdateMagicZero* node) { - unhandled(node); - } - virtual void visit(const ForLoop* node) { - unhandled(node); - } - virtual void visit(const IfThenElse* node) { - unhandled(node); - } - virtual void visit(const GridReduction* node) { - unhandled(node); - } - virtual void visit(const GridBroadcast* node) { - unhandled(node); - } - virtual void visit(const GridWelford* node) { - unhandled(node); - } -}; - -//! 
Kernel IR visitor interface -class TORCH_CUDA_CU_API MutableIrVisitor : public PolymorphicBase { - public: - // TODO(kir): use Node* instead of void* - virtual void unhandled(const void*) {} - - // Values - virtual void visit(NamedScalar* named_scalar) { - unhandled(named_scalar); - } - virtual void visit(Predicate* value) { - unhandled(value); - } - virtual void visit(Bool* value) { - unhandled(value); - } - virtual void visit(Double* value) { - unhandled(value); - } - virtual void visit(Int* value) { - unhandled(value); - } - virtual void visit(IterDomain* iter_domain) { - unhandled(iter_domain); - } - virtual void visit(TensorDomain* tensor_domain) { - unhandled(tensor_domain); - } - virtual void visit(TensorView* tensor_view) { - unhandled(tensor_view); - } - virtual void visit(TensorIndex* tensor_index) { - unhandled(tensor_index); - } - - // Expressions - virtual void visit(UnaryOp* node) { - unhandled(node); - } - virtual void visit(BinaryOp* node) { - unhandled(node); - } - virtual void visit(TernaryOp* node) { - unhandled(node); - } - virtual void visit(ReductionOp* node) { - unhandled(node); - } - virtual void visit(BroadcastOp* node) { - unhandled(node); - } - - virtual void visit(WelfordOp* node) { - unhandled(node); - } - - // Statements - virtual void visit(Allocate* node) { - unhandled(node); - } - virtual void visit(Sync* node) { - unhandled(node); - } - virtual void visit(InitMagicZero* node) { - unhandled(node); - } - virtual void visit(UpdateMagicZero* node) { - unhandled(node); - } - virtual void visit(ForLoop* node) { - unhandled(node); - } - virtual void visit(IfThenElse* node) { - unhandled(node); - } - virtual void visit(GridReduction* node) { - unhandled(node); - } - virtual void visit(GridBroadcast* node) { - unhandled(node); - } - virtual void visit(GridWelford* node) { - unhandled(node); - } -}; - -//! Base class for Kernel IR nodes -class TORCH_CUDA_CU_API Node : public NonCopyable, public PolymorphicBase { - public: - explicit Node(Passkey) {} - - //! IR Visitor double-dispatch interface - //! (https://en.wikipedia.org/wiki/Visitor_pattern) - virtual void accept(IrVisitor* visitor) const = 0; - - //! Non constant IR Visitor - virtual void accept(MutableIrVisitor* visitor) = 0; - - //! Debug helper, prints the textual representation of an IR node - void print() const; -}; - -//! Generic value (scalar or tensor) -class TORCH_CUDA_CU_API Val : public Node { - public: - Val(Passkey passkey, DataType dtype); - - // TODO(kir): consider renaming - StmtNameType name() const { - return name_; - } - - void setName(StmtNameType name) { - name_ = name; - } - - ValueId id() const { - return id_; - } - - DataType dtype() const { - return dtype_; - } - - Expr* definition() const { - return definition_; - } - - void setDefinition(Expr* expr) { - // TODO(kir): extra checks on changing existing definitions? 
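// The Passkey token deleted above (and the IrBuilderPasskey that replaces it
// throughout this diff) implements the "passkey" idiom the old comment links
// to: node constructors are public but require a key type that only the
// builder can mint. A minimal sketch of the pattern with illustrative names
// (MyIrBuilder / MyNode / MyPasskey are not nvFuser classes):
class MyIrBuilder;

class MyPasskey {
  friend class MyIrBuilder;  // only the builder may construct a key
 private:
  MyPasskey() {}  // user-provided so aggregate-init cannot bypass the check
};

class MyNode {
 public:
  // Publicly callable, but callers must present a MyPasskey, which only
  // MyIrBuilder can create, so node creation is funneled through the builder.
  explicit MyNode(MyPasskey) {}
};

class MyIrBuilder {
 public:
  MyNode makeNode() {
    return MyNode(MyPasskey());
  }
};
// Compared with making every node class a friend of the builder, the key type
// keeps the friendship "granular", as the deleted comment puts it.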
- definition_ = expr; - } - - virtual bool isScalar() const { - return false; - } - - bool isConstScalar() const; - - virtual bool isConst() const { - return false; - } - - // TODO(kir): revisit and find a better interface - virtual bool isZeroInt() const { - return false; - } - - virtual bool isOneInt() const { - return false; - } - - void setEvaluatorIndex(int to) { - TORCH_INTERNAL_ASSERT(evaluator_index_ == -1); - evaluator_index_ = to; - } - - int evaluatorIndex() const { - return evaluator_index_; - } - - private: - const DataType dtype_; - - // The expression which defines this value, or nullptr - Expr* definition_ = nullptr; - - // This is a value name preserved from the Fusion IR (optional) - StmtNameType name_ = kInvalidStmName; - - // All Kernel IR values have IDs (unique within the same Kernel) - ValueId id_ = -1; - - // Expr evaluator idx; - int evaluator_index_ = -1; -}; - -//! Base class for expressions and statements -//! -//! Expressions consume inputs and produce outputs (depending on the context -//! this may imply assignments). Currently some of the expressions -//! don't actually produce any outputs (ForLoop, IfThenElse) and they -//! model statements to be executed. -//! -//! TODO(kir): split the expressions, assignments and statements? -//! -class TORCH_CUDA_CU_API Expr : public Node { - public: - explicit Expr(Passkey passkey) : Node(passkey) {} - - const auto& inputs() const { - return inputs_; - } - - const auto& outputs() const { - return outputs_; - } - - Scope* scope() const { - return scope_; - } - - //! Set the current scope - void setScope(Scope* scope) { - scope_ = scope; - } - - Expr* parentScope() const; - - Predicate* predicate() const { - return predicate_; - } - - void setPredicate(Predicate* predicate) { - predicate_ = predicate; - } - - Predicate* writePredicate() const { - return write_predicate_; - } - - void setWritePredicate(Predicate* write_predicate) { - write_predicate_ = write_predicate; - } - - protected: - // TODO(kir): try to avoid this protected interface - void addInput(Val* input) { - inputs_.push_back(input); - } - - void addOutput(Val* output) { - output->setDefinition(this); - outputs_.push_back(output); - } - - private: - // TODO(kir): can we avoid this? - std::vector inputs_; - std::vector outputs_; - - // TODO(kir): revisit scope/nesting data structures - Scope* scope_ = nullptr; - - Predicate* predicate_ = nullptr; - // Only used for reduction-related expressions - Predicate* write_predicate_ = nullptr; -}; - -class TORCH_CUDA_CU_API NamedScalar final : public Val { - public: - // NOLINTNEXTLINE(modernize-pass-by-value) - NamedScalar(Passkey passkey, std::string name, DataType dtype) - : Val(passkey, dtype), name_(name) {} - - explicit NamedScalar(Passkey passkey, const fuser::cuda::NamedScalar* node) - : Val(passkey, node->getDataType().value()) { - name_ = node->name(); - } - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - bool isScalar() const override { - return true; - } - - // TODO(kir): this is hiding and redefining Val::name() - const std::string& name() const { - return name_; - } - - // Return the named scalar extent of a parallel dimension (e.g. blockDim.x) - static NamedScalar* getParallelDim(ParallelType p_type); - - // Return the named scalar index of a parallel dimension (e.g. 
threadIdx.x) - static NamedScalar* getParallelIndex(ParallelType p_type); - - // Return the parallel type of this NamedScalar if it is an extent of a - // parallel dimension - c10::optional getParallelDim() const; - - // Return the parallel type of this NamedScalar if it is an index of a - // parallel dimension - c10::optional getParallelIndex() const; - - private: - std::string name_; -}; - -class TORCH_CUDA_CU_API Predicate final : public Val { - public: - explicit Predicate( - Passkey passkey, - PredicateType ptype, - const Expr* expr = nullptr, - Bool* thread_pred = nullptr) - : Val(passkey, DataType::Bool), - ptype_(ptype), - expr_(expr), - thread_pred_(thread_pred) { - TORCH_INTERNAL_ASSERT( - ptype != PredicateType::Unswitch && ptype != PredicateType::Manual); - } - - explicit Predicate(Passkey passkey, ForLoop* unrolled_loop) - : Val(passkey, DataType::Bool), - ptype_(PredicateType::Unswitch), - unrolled_loop_(unrolled_loop) { - TORCH_INTERNAL_ASSERT(unrolled_loop != nullptr); - } - - explicit Predicate(Passkey passkey, Bool* value) - : Val(passkey, DataType::Bool), - ptype_(PredicateType::Manual), - value_(value) { - TORCH_INTERNAL_ASSERT(value != nullptr); - } - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - PredicateType predicate_type() const { - return ptype_; - } - - const Expr* expr() const { - TORCH_INTERNAL_ASSERT( - ptype_ != PredicateType::Unswitch && - ptype_ != PredicateType::Vectorize && ptype_ != PredicateType::Manual); - return expr_; - } - - Bool* thread_pred() { - TORCH_INTERNAL_ASSERT( - ptype_ == PredicateType::Inline || - ptype_ == PredicateType::Misaligned || ptype_ == PredicateType::Shift || - ptype_ == PredicateType::Padding || - ptype_ == PredicateType::ReductionWrite); - return thread_pred_; - } - - ForLoop* unrolled_loop() const { - TORCH_INTERNAL_ASSERT(ptype_ == PredicateType::Unswitch); - return unrolled_loop_; - } - - bool hasValue() const { - return value_ != nullptr; - } - - Bool* value() const { - TORCH_INTERNAL_ASSERT( - value_ != nullptr, - "The conditional expression for this Predicate is invalid."); - return value_; - } - - void setValue(Bool* value) { - TORCH_INTERNAL_ASSERT(value != nullptr, "The Bool expression is invalid."); - value_ = value; - } - - private: - PredicateType ptype_ = PredicateType::Manual; - - // For PredicateCompute::getInlinePredicate, - // ShiftPredicateInserter::getShiftPredicate and getPaddingPredicate - const Expr* expr_ = nullptr; - - // For PredicateCompute::getInlinePredicate - Bool* thread_pred_ = nullptr; - - // For ParallelType::Unswitch - UnswitchPredicate::get - ForLoop* unrolled_loop_ = nullptr; - - // The Bool conditional value - // The value is nullptr until lower_predicate pass - Bool* value_ = nullptr; -}; - -class TORCH_CUDA_CU_API Bool final : public Val { - public: - explicit Bool(Passkey passkey, const c10::optional& value) - : Val(passkey, DataType::Bool), maybe_value_(value) {} - - explicit Bool(Passkey passkey, const fuser::cuda::Bool* node) - : Val(passkey, DataType::Bool), maybe_value_(node->value()) { - setName(node->name()); - } - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - bool isScalar() const override { - return true; - } - - bool isConst() const override { - return maybe_value_.has_value(); - } - - c10::optional value() const { - return maybe_value_; - } - 
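// The NamedScalar helpers above name parallel extents and indices after the
// CUDA built-in variables ("blockDim.x", "threadIdx.x", ...). A rough sketch
// of that mapping; ToyParallelType and the two free functions are hypothetical
// illustrations of what stringifyThreadSize / stringifyThread provide, and the
// gridDim.* strings for the BID extents are assumed.
#include <string>

enum class ToyParallelType { BIDx, BIDy, BIDz, TIDx, TIDy, TIDz };

// Index of the parallel dimension (cf. NamedScalar::getParallelIndex).
std::string toyParallelIndex(ToyParallelType pt) {
  switch (pt) {
    case ToyParallelType::TIDx: return "threadIdx.x";
    case ToyParallelType::TIDy: return "threadIdx.y";
    case ToyParallelType::TIDz: return "threadIdx.z";
    case ToyParallelType::BIDx: return "blockIdx.x";
    case ToyParallelType::BIDy: return "blockIdx.y";
    case ToyParallelType::BIDz: return "blockIdx.z";
  }
  return "";
}

// Extent of the parallel dimension (cf. NamedScalar::getParallelDim).
std::string toyParallelExtent(ToyParallelType pt) {
  switch (pt) {
    case ToyParallelType::TIDx: return "blockDim.x";
    case ToyParallelType::TIDy: return "blockDim.y";
    case ToyParallelType::TIDz: return "blockDim.z";
    case ToyParallelType::BIDx: return "gridDim.x";
    case ToyParallelType::BIDy: return "gridDim.y";
    case ToyParallelType::BIDz: return "gridDim.z";
  }
  return "";
}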
- private: - const c10::optional maybe_value_; -}; - -class TORCH_CUDA_CU_API Double final : public Val { - public: - using ScalarType = double; - - explicit Double(Passkey passkey, const c10::optional& value) - : Val(passkey, DataType::Double), maybe_value_(value) {} - - explicit Double(Passkey passkey, const fuser::cuda::Double* node) - : Val(passkey, DataType::Double), maybe_value_(node->value()) { - setName(node->name()); - } - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - bool isScalar() const override { - return true; - } - - bool isConst() const override { - return maybe_value_.has_value(); - } - - c10::optional value() const { - return maybe_value_; - } - - private: - const c10::optional maybe_value_; -}; - -class TORCH_CUDA_CU_API Int final : public Val { - public: - using ScalarType = int64_t; - - explicit Int(Passkey passkey, const c10::optional& value) - : Val(passkey, DataType::Int), maybe_value_(value) {} - - // SFINAE constructor to avoid 0 constant pointer ambiguity - template < - typename T, - typename = typename std::enable_if< - std::is_pointer::value && - std::is_convertible::value>::type> - explicit Int(Passkey passkey, T node) - : Val(passkey, DataType::Int), maybe_value_(node->value()) { - setName(node->name()); - } - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - bool isScalar() const override { - return true; - } - - bool isConst() const override { - return maybe_value_.has_value(); - } - - bool isZeroInt() const override { - return maybe_value_.has_value() && *maybe_value_ == 0; - } - - bool isOneInt() const override { - return maybe_value_.has_value() && *maybe_value_ == 1; - } - - c10::optional value() const { - return maybe_value_; - } - - private: - const c10::optional maybe_value_; -}; - -class TORCH_CUDA_CU_API IterDomain final : public Val { - public: - IterDomain(Passkey passkey, Val* start, Val* extent); - - explicit IterDomain(Passkey, const fuser::cuda::IterDomain* iter_domain); - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - bool isReduction() const { - return iterType() == IterType::Reduction; - } - - bool isRFactorProduct() const { - return is_rfactor_domain_; - } - - bool isBroadcast() const { - return iterType() == IterType::BroadcastWithStride || - iterType() == IterType::BroadcastWithoutStride; - } - - bool isGather() const { - return iterType() == IterType::Gather; - } - - bool isStride() const { - return iterType() == IterType::Stride; - } - - bool isParallelized() const { - return parallelType() != ParallelType::Serial; - } - - // Return if this iter domain is mapped to a grid dimension - bool isBlockDim() const { - return parallelType() == ParallelType::BIDz || - parallelType() == ParallelType::BIDy || - parallelType() == ParallelType::BIDx; - } - - // Return if this iter domain is mapped to a block dimension - bool isThreadDim() const { - return parallelType() == ParallelType::TIDz || - parallelType() == ParallelType::TIDy || - parallelType() == ParallelType::TIDx; - } - - // Return if this iter domain is either mapped to a block or grid dimension - bool isThread() const { - return isBlockDim() || isThreadDim(); - } - - ParallelType parallelType() const { - return parallel_type_; - } - - 
IterType iterType() const { - return iter_type_; - } - - Val* start() const { - return start_; - } - - Val* stop() const { - return stop_; - } - - Val* extent() const; - - bool isSimple() const { - return is_simple_; - } - - bool hasPaddingToMultipleOfWarp() const { - return is_padded_dimension_; - } - - private: - Val* const start_ = nullptr; - Val* const stop_ = nullptr; - Val* const extent_ = nullptr; - ParallelType parallel_type_ = ParallelType::Serial; - IterType iter_type_ = IterType::Iteration; - bool is_rfactor_domain_ = false; - - // An IterDomain is "simple" if the original Fusion IterDomain - // doesn't have a definition ("definition" expression) - // - // TODO(kir): this feels like a hack, revisit - // - bool is_simple_ = true; - - //! Indicates if this iterdomain is a padded parallel dimension - bool is_padded_dimension_ = false; -}; - -// TODO(kir): is this really a value? -class TORCH_CUDA_CU_API TensorDomain final : public Val { - public: - explicit TensorDomain(Passkey, std::vector domain); - - explicit TensorDomain( - Passkey passkey, - const fuser::cuda::TensorDomain* tensor_domain); - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - std::vector::size_type nDims() const { - return domain_.size(); - } - - // TODO(kir): rename this - const std::vector& domain() const { - return domain_; - } - - const std::vector& contiguity() const { - return contiguity_; - } - - std::string getContiguityString() const { - std::stringstream ss; - for (auto b : contiguity()) { - ss << (b ? "t" : "f"); - } - return ss.str(); - } - - bool hasReduction() const; - bool hasBlockReduction() const; - bool hasGridReduction() const; - bool hasBlockBroadcast() const; - bool hasGridBroadcast() const; - bool hasBroadcast() const; - bool hasRFactor() const; - bool hasVectorize() const; - - const std::vector& noReductions() const { - return no_reduction_domain_; - } - - const std::vector& noBroadcasts() const { - return no_bcast_domain_; - } - - const std::vector& rootDomain() const { - return root_domain_; - }; - - const std::vector& rfactorDomain() const { - return rfactor_domain_; - }; - - void resetDomains() { - no_reduction_domain_ = noReductions(domain_); - no_bcast_domain_ = noBroadcasts(domain_); - } - - IterDomain* axis(int i) const; - - // TODO(kir): overloading non-static and static methods is not a good idea - static std::vector noReductions(const std::vector&); - static std::vector noBroadcasts(const std::vector&); - - private: - std::vector root_domain_; - std::vector domain_; - std::vector no_bcast_domain_; - std::vector no_reduction_domain_; - std::vector rfactor_domain_; - const std::vector contiguity_; -}; - -class TORCH_CUDA_CU_API TensorView final : public Val { - public: - explicit TensorView(Passkey, const fuser::cuda::TensorView* tv); - - TensorView( - Passkey, - DataType dtype, - TensorDomain* domain, - MemoryType memory_type); - - TensorDomain* domain() const { - return domain_; - } - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - MemoryType memoryType() const { - return memory_type_; - } - - fuser::cuda::TensorView* fuserTv() const { - TORCH_INTERNAL_ASSERT(fuser_tv_ != nullptr); - // TODO(kir): remove the need for const_cast - return const_cast(fuser_tv_); // NOLINT - } - - private: - TensorDomain* domain_ = nullptr; - MemoryType memory_type_ = 
MemoryType::Local; - - // TODO(kir): remove temporary hack - const fuser::cuda::TensorView* fuser_tv_ = nullptr; -}; - -class TORCH_CUDA_CU_API UnaryOp final : public Expr { - public: - UnaryOp(Passkey passkey, UnaryOpType operation, Val* out, Val* in); - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - Val* out() const { - return out_; - } - - Val* in() const { - return in_; - } - - UnaryOpType operation() const { - return operation_; - } - - private: - const UnaryOpType operation_; - Val* const out_ = nullptr; - Val* const in_ = nullptr; -}; - -class TORCH_CUDA_CU_API BinaryOp final : public Expr { - public: - BinaryOp( - Passkey passkey, - BinaryOpType operation, - Val* out, - Val* lhs, - Val* rhs); - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - Val* out() const { - return out_; - } - - Val* lhs() const { - return lhs_; - } +#include +#include +#include +#include +#include - Val* rhs() const { - return rhs_; - } +#include +#include - BinaryOpType operation() const { - return operation_; - } +#include +#include +#include +#include - private: - const BinaryOpType operation_; - Val* const out_ = nullptr; - Val* const lhs_ = nullptr; - Val* const rhs_ = nullptr; -}; +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { -class TORCH_CUDA_CU_API TernaryOp final : public Expr { - public: - TernaryOp( - Passkey passkey, - TernaryOpType operation, - Val* out, - Val* in1, - Val* in2, - Val* in3); +class IrBuilderPasskey; - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } +// Abstract nodes +class Val; +class Expr; - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } +// Values +class Bool; +class Double; +class Int; +class NamedScalar; - Val* out() const { - return out_; - } +class IterDomain; +class TensorDomain; +class TensorView; - Val* in1() const { - return in1_; - } +// Expressions +class UnaryOp; +class BinaryOp; +class TernaryOp; +class ReductionOp; +class WelfordOp; +class BroadcastOp; - Val* in2() const { - return in2_; - } +namespace kir { +class Kernel; - Val* in3() const { - return in3_; - } +// Values +class Predicate; +class TensorIndex; - TernaryOpType operation() const { - return operation_; - } +// Expressions +class Allocate; +class BlockSync; +class GridSync; +class InitMagicZero; +class UpdateMagicZero; +class ForLoop; +class IfThenElse; +class GridReduction; +class GroupedGridReduction; +class GridBroadcast; +class GridWelford; +class AllocateFusedReduction; - private: - const TernaryOpType operation_; - Val* const out_ = nullptr; - Val* const in1_ = nullptr; - Val* const in2_ = nullptr; - Val* const in3_ = nullptr; -}; +// Expr container +class Scope; -class TORCH_CUDA_CU_API ReductionOp final : public Expr { +class TORCH_CUDA_CU_API Predicate final : public Val { public: - ReductionOp( - Passkey passkey, - BinaryOpType operation, - Val* init, - Val* out, - Val* in); - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - Val* out() const { - return out_; - } - - Val* in() const { - return in_; - } - - Val* init() const { - return init_; - } - - BinaryOpType operation() const { - return operation_; - } - - private: - const BinaryOpType operation_; - Val* const 
init_ = nullptr; - Val* const out_ = nullptr; - Val* const in_ = nullptr; -}; + explicit Predicate( + IrBuilderPasskey passkey, + PredicateType ptype, + const Expr* expr = nullptr, + Bool* thread_pred = nullptr); -class TORCH_CUDA_CU_API WelfordOp final : public Expr { - public: - WelfordOp( - Passkey passkey, - Val* out_var, - Val* out_avg, - Val* out_N, - Val* init_var, - Val* init_avg, - Val* init_N, - Val* in_var, - Val* in_avg, - Val* in_N); + explicit Predicate(IrBuilderPasskey passkey, ForLoop* unrolled_loop); - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } + explicit Predicate(IrBuilderPasskey passkey, Bool* value); - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); + PredicateType predicate_type() const { + return ptype_; } - Val* out() const { - return out_avg_; + const Expr* expr() const { + TORCH_INTERNAL_ASSERT( + ptype_ != PredicateType::Unswitch && + ptype_ != PredicateType::Vectorize && ptype_ != PredicateType::Manual); + return expr_; } - Val* in() const { - return in_avg_; + Bool* thread_pred() { + TORCH_INTERNAL_ASSERT( + ptype_ == PredicateType::Inline || + ptype_ == PredicateType::Misaligned || ptype_ == PredicateType::Shift || + ptype_ == PredicateType::Padding || + ptype_ == PredicateType::ReductionWrite); + return thread_pred_; } - // Welford Specific accessors - // Almost wanted to add a new struct for {var, avg, N} - Val* outVar() const { - return out_var_; + ForLoop* unrolled_loop() const { + TORCH_INTERNAL_ASSERT(ptype_ == PredicateType::Unswitch); + return unrolled_loop_; } - Val* outAvg() const { - return out_avg_; + bool hasValue() const { + return value_ != nullptr; } - Val* outN() const { - return out_N_; + Bool* value() const { + TORCH_INTERNAL_ASSERT( + value_ != nullptr, + "The conditional expression for this Predicate is invalid."); + return value_; } - Val* initVar() const { - return init_var_; + void setValue(Bool* value) { + TORCH_INTERNAL_ASSERT(value != nullptr, "The Bool expression is invalid."); + value_ = value; } - Val* initAvg() const { - return init_avg_; + bool isConst() const final { + return hasValue() && value_->isConst(); } - Val* initN() const { - return init_N_; - } + private: + PredicateType ptype_ = PredicateType::Manual; - Val* inVar() const { - return in_var_; - } + // For PredicateCompute::getInlinePredicate, + // ShiftPredicateInserter::getShiftPredicate and getPaddingPredicate + const Expr* expr_ = nullptr; - Val* inAvg() const { - return in_avg_; - } + // For PredicateCompute::getInlinePredicate + Bool* thread_pred_ = nullptr; - Val* inN() const { - return in_N_; - } + // For ParallelType::Unswitch - UnswitchPredicate::get + ForLoop* unrolled_loop_ = nullptr; - private: - Val* const out_var_; - Val* const out_avg_; - Val* const out_N_; - Val* const init_var_; - Val* const init_avg_; - Val* const init_N_; - Val* const in_var_; - Val* const in_avg_; - Val* const in_N_; + // The Bool conditional value + // The value is nullptr until lower_predicate pass + Bool* value_ = nullptr; }; class TORCH_CUDA_CU_API TensorIndex final : public Val { public: TensorIndex( - Passkey, - const fuser::cuda::TensorView* view, + IrBuilderPasskey, + const TensorView* view, std::vector indices); - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - std::vector::size_type nDims() const { return indices_.size(); } @@ -1170,8 +161,7 @@ class TORCH_CUDA_CU_API TensorIndex final : public Val { 
TensorView* view() const { TORCH_INTERNAL_ASSERT(view_ != nullptr); - // TODO(kir): remove the need for const_cast - return const_cast(view_); // NOLINT + return const_cast(view_); // NOLINT } private: @@ -1179,46 +169,17 @@ class TORCH_CUDA_CU_API TensorIndex final : public Val { std::vector indices_; }; -class TORCH_CUDA_CU_API BroadcastOp final : public Expr { - public: - BroadcastOp(Passkey passkey, Val* out, Val* in); - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - Val* out() const { - return out_; - } - - Val* in() const { - return in_; - } - - private: - Val* const out_ = nullptr; - Val* const in_ = nullptr; -}; - //! Allocate is a lower level Node that describes a buffer of memory that //! is required as an intermediate within a kernel. The extent is the expression //! of the size of the buffer that is generated from the TensorView that //! describes the output of an operation. -//! -//! TODO(kir): The components of Allocate like Type and Name could be separated -//! from the the assocated TensorView. Perhaps that is more appropriate? -//! class TORCH_CUDA_CU_API Allocate final : public Expr { public: //! Allocation of a multi-dimensional buffer //! //! param shape Size of each dimension explicit Allocate( - Passkey passkey, + IrBuilderPasskey passkey, Val* buffer, MemoryType memory_type, std::vector shape = {}, @@ -1228,20 +189,12 @@ class TORCH_CUDA_CU_API Allocate final : public Expr { //! //! param size Size of allocation explicit Allocate( - Passkey passkey, + IrBuilderPasskey passkey, Val* buffer, MemoryType memory_type, Val* size, bool zero_init = false); - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - Val* buffer() const { return buffer_; } @@ -1290,17 +243,9 @@ class TORCH_CUDA_CU_API Allocate final : public Expr { // // TODO(kir): change name to SyncThreads as we could have other barriers. // -class TORCH_CUDA_CU_API Sync final : public Expr { +class TORCH_CUDA_CU_API BlockSync final : public Expr { public: - explicit Sync(Passkey passkey, bool war_sync = false); - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } + explicit BlockSync(IrBuilderPasskey passkey, bool war_sync = false); bool isWarHazardSync() const { return war_sync_; @@ -1311,34 +256,40 @@ class TORCH_CUDA_CU_API Sync final : public Expr { bool war_sync_ = false; }; -// Simply prints "DEFINE_MAGIC_ZERO" in the code in accordance with magic_zero -// in helpers.cu -class TORCH_CUDA_CU_API InitMagicZero final : public Expr { +// Synchronize all blocks in device, implies cooperative group launch is +// required. 
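// The GridSync comment above describes a device-wide barrier, which requires
// a cooperative kernel launch. A minimal sketch of such a barrier using CUDA
// cooperative groups (illustrative only, not what nvFuser emits; it assumes
// the kernel is launched via cudaLaunchCooperativeKernel on supporting
// hardware):
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

__global__ void grid_sync_sketch(float* buf, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    buf[i] *= 2.0f;        // phase 1: every block updates its elements
  }
  cg::this_grid().sync();  // device-wide barrier across all blocks
  if (i == 0) {
    buf[0] += buf[n - 1];  // phase 2: safe to read other blocks' results
  }
}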
+class TORCH_CUDA_CU_API GridSync final : public Expr { public: - explicit InitMagicZero(Passkey passkey); + explicit GridSync( + IrBuilderPasskey passkey, + ParallelTypeBitmap sync_dims, + Val* sync_buffer); - void accept(IrVisitor* visitor) const override { - visitor->visit(this); + ParallelTypeBitmap syncDims() const { + return sync_dims_; } - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); + Val* syncBuffer() const { + return sync_buffer_; } + + private: + ParallelTypeBitmap sync_dims_; + Val* sync_buffer_ = nullptr; +}; + +// Simply prints "DEFINE_MAGIC_ZERO" in the code in accordance with magic_zero +// in helpers.cu +class TORCH_CUDA_CU_API InitMagicZero final : public Expr { + public: + explicit InitMagicZero(IrBuilderPasskey passkey); }; // Simply prints "UPDATE_MAGIC_ZERO" in the code in accordance with magic_zero // in helpers.cu class TORCH_CUDA_CU_API UpdateMagicZero final : public Expr { public: - explicit UpdateMagicZero(Passkey passkey); - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } + explicit UpdateMagicZero(IrBuilderPasskey passkey); }; // TODO(kir): promote to IR node @@ -1377,7 +328,6 @@ class TORCH_CUDA_CU_API Scope { void push_back(Expr* e) { exprs_.push_back(e); - e->setScope(this); } // Erase expr at pos @@ -1425,7 +375,7 @@ class TORCH_CUDA_CU_API ForLoop final : public Expr { //! //! TODO: cleaner way to set options? ForLoop( - Passkey passkey, + IrBuilderPasskey passkey, IterDomain* iter_domain, Val* index, Val* start, @@ -1435,17 +385,9 @@ class TORCH_CUDA_CU_API ForLoop final : public Expr { Val* vectorize_shift, bool unroll_required); - ForLoop(Passkey passkey, IterDomain* iter_domain); - - ForLoop(Passkey passkey, const ForLoop* other); - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } + ForLoop(IrBuilderPasskey passkey, IterDomain* iter_domain); - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } + ForLoop(IrBuilderPasskey passkey, const ForLoop* other); Val* index() const { return index_; @@ -1465,6 +407,7 @@ class TORCH_CUDA_CU_API ForLoop final : public Expr { return iter_domain_; } + // TODO: Return pointer instead of reference to be more consistent Scope& body() { return body_; } @@ -1490,6 +433,9 @@ class TORCH_CUDA_CU_API ForLoop final : public Expr { unroll_required_ = true; } + //! True if no actual for-loop is materialized + bool isTrivial() const; + private: //! Returns if a loop could be unrolled. bool isUnrollable() const; @@ -1524,15 +470,7 @@ class TORCH_CUDA_CU_API ForLoop final : public Expr { //! class TORCH_CUDA_CU_API IfThenElse final : public Expr { public: - explicit IfThenElse(Passkey passkey, Predicate* cond); - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } + explicit IfThenElse(IrBuilderPasskey passkey, Predicate* cond); Scope& thenBody() { return then_body_; @@ -1565,28 +503,75 @@ class TORCH_CUDA_CU_API IfThenElse final : public Expr { //! //! This node provides FusionExecutor the information it needs to allocate the //! reduction and sync buffers. 
-class TORCH_CUDA_CU_API GridReduction final : public Expr { +class TORCH_CUDA_CU_API GridReduction final : public ReductionOp { public: - void accept(IrVisitor* visitor) const override { - visitor->visit(this); + GridReduction( + IrBuilderPasskey passkey, + BinaryOpType reduction_op_type, + Val* init, + Val* out, + Val* in, + Allocate* reduction_buffer, + Allocate* sync_buffer, + Val* entrance_index, + Val* entrances, + bool is_fused = false); + + Allocate* reduction_buffer() const { + return reduction_buffer_; } - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); + Allocate* sync_buffer() const { + return sync_buffer_; } - GridReduction( - Passkey passkey, - ReductionOp* reduction_op, - Allocate* reduction_buffer, - Allocate* sync_buffer); + // Which instance of entering this grid reduction is this iteration? + Val* entrance_index() const { + return entrance_index_; + } - ReductionOp* reduction_op() const { - return reduction_op_; + // How many times will this grid reduction be entered + Val* entrances() const { + return entrances_; } - Allocate* reduction_buffer() const { - return reduction_buffer_; + const ParallelTypeBitmap& threadPredicate() const { + return thread_predicate_; + } + + void setThreadPredicate(const ParallelTypeBitmap& thread_predicate) { + thread_predicate_ = thread_predicate; + } + + private: + Allocate* reduction_buffer_ = nullptr; + Allocate* sync_buffer_ = nullptr; + // gridReduce has template flags for thread predicates. In order to + // use them, the thread predicate is held here separately from + // Expr::predicate_. + ParallelTypeBitmap thread_predicate_; + Val* entrance_index_ = nullptr; + Val* entrances_ = nullptr; +}; + +class TORCH_CUDA_CU_API GroupedGridReduction final : public GroupedReductionOp { + public: + GroupedGridReduction( + IrBuilderPasskey passkey, + std::vector reduction_op_type, + std::vector init, + std::vector out, + std::vector in, + std::vector reduction_buffers, + Allocate* sync_buffer, + bool is_allreduce = false); + + const std::vector& reduction_buffers() const { + return reduction_buffers_; + } + + Allocate* reduction_buffer(size_t i) const { + return reduction_buffers_.at(i); } Allocate* sync_buffer() const { @@ -1602,8 +587,7 @@ class TORCH_CUDA_CU_API GridReduction final : public Expr { } private: - ReductionOp* reduction_op_ = nullptr; - Allocate* reduction_buffer_ = nullptr; + std::vector reduction_buffers_; Allocate* sync_buffer_ = nullptr; // gridReduce has template flags for thread predicates. In order to // use them, the thread predicate is held here separately from @@ -1620,23 +604,11 @@ class TORCH_CUDA_CU_API GridReduction final : public Expr { //! broadcast and sync buffers. class TORCH_CUDA_CU_API GridBroadcast final : public Expr { public: - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - GridBroadcast( - Passkey passkey, + IrBuilderPasskey passkey, BroadcastOp* broadcast_op, Allocate* broadcast_buffer, - Allocate* sync_buffer) - : Expr(passkey), - broadcast_op_(broadcast_op), - broadcast_buffer_(broadcast_buffer), - sync_buffer_(sync_buffer){}; + Allocate* sync_buffer); BroadcastOp* broadcast_op() const { return broadcast_op_; @@ -1665,21 +637,15 @@ class TORCH_CUDA_CU_API GridBroadcast final : public Expr { //! reduction and sync buffers. 
class TORCH_CUDA_CU_API GridWelford final : public Expr { public: - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - GridWelford( - Passkey passkey, + IrBuilderPasskey passkey, WelfordOp* welford_op, Allocate* var_buffer, Allocate* avg_buffer, Allocate* n_buffer, - Allocate* sync_buffer); + Allocate* sync_buffer, + Val* entrance_index, + Val* entrances); WelfordOp* welford_op() const { return welford_op_; @@ -1701,6 +667,16 @@ class TORCH_CUDA_CU_API GridWelford final : public Expr { return sync_buffer_; } + // Which instance of entering this grid reduction is this iteration? + Val* entrance_index() const { + return entrance_index_; + } + + // How many times will this grid reduction be entered + Val* entrances() const { + return entrances_; + } + const ParallelTypeBitmap& threadPredicate() const { return thread_predicate_; } @@ -1715,12 +691,42 @@ class TORCH_CUDA_CU_API GridWelford final : public Expr { Allocate* avg_buffer_ = nullptr; Allocate* n_buffer_ = nullptr; Allocate* sync_buffer_ = nullptr; + Val* entrance_index_ = nullptr; + Val* entrances_ = nullptr; // gridReduce has template flags for thread predicates. In order to // use them, the thread predicate is held here separately from // Expr::predicate_. ParallelTypeBitmap thread_predicate_; }; +// Allocate an instance of the fused reduction class. +class TORCH_CUDA_CU_API AllocateFusedReduction final : public Expr { + public: + explicit AllocateFusedReduction( + IrBuilderPasskey passkey, + GridReduction* grid_reduction); + + explicit AllocateFusedReduction( + IrBuilderPasskey passkey, + GridWelford* grid_welford); + + explicit AllocateFusedReduction( + IrBuilderPasskey passkey, + GroupedGridReduction* grouped_grid_reduction); + + Expr* gridExpr() const { + return grid_expr_; + } + + TensorIndex* out() const; + + const ParallelTypeBitmap& threadPredicate() const; + + private: + //! 
GridReduction, GridWelford or GroupedGridReduction + Expr* grid_expr_ = nullptr; +}; + } // namespace kir } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir_builder.cpp b/torch/csrc/jit/codegen/cuda/kernel_ir_builder.cpp deleted file mode 100644 index ce3e17d74d22..000000000000 --- a/torch/csrc/jit/codegen/cuda/kernel_ir_builder.cpp +++ /dev/null @@ -1,276 +0,0 @@ -#include - -namespace torch { -namespace jit { -namespace fuser { -namespace cuda { -namespace kir { - -Val* IrBuilder::newResult(DataType dtype) { - switch (dtype) { - case DataType::Bool: - return create(c10::nullopt); - case DataType::Double: - return create(c10::nullopt); - case DataType::Int: - return create(c10::nullopt); - default: - TORCH_CHECK(false, "Unexpected data type"); - } -} - -Val* IrBuilder::newArithmeticExpr(BinaryOpType op_type, Val* lhs, Val* rhs) { - TORCH_CHECK(lhs->dtype() == rhs->dtype(), "Incompatible operand types"); - auto result = newResult(lhs->dtype()); - create(op_type, result, lhs, rhs); - // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) - return result; -} - -Val* IrBuilder::newLogicExpr(BinaryOpType op_type, Val* lhs, Val* rhs) { - auto result = create(c10::nullopt); - create(op_type, result, lhs, rhs); - // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) - return result; -} - -Val* IrBuilder::whereExpr(Val* pred, Val* lhs, Val* rhs) { - TORCH_CHECK(lhs->dtype() == rhs->dtype(), "Incompatible operand types"); - auto result = newResult(lhs->dtype()); - create(TernaryOpType::Where, result, pred, lhs, rhs); - return result; -} - -Val* IrBuilder::negExpr(Val* val) { - auto result = newResult(val->dtype()); - create(UnaryOpType::Neg, result, val); - return result; -} - -Val* IrBuilder::notExpr(Val* val) { - auto result = newResult(val->dtype()); - create(UnaryOpType::Not, result, val); - return result; -} - -Val* IrBuilder::setExpr(Val* val) { - auto result = newResult(val->dtype()); - create(UnaryOpType::Set, result, val); - return result; -} - -Val* IrBuilder::setExprNamedScalar(const std::string& name, Val* val) { - auto result = create(name, val->dtype()); - create(UnaryOpType::Set, result, val); - return result; -} - -Val* IrBuilder::addressExprNamedScalar(const std::string& name, Val* val) { - auto result = create(name, DataType::Int); - create(UnaryOpType::Address, result, val); - return result; -} - -Val* IrBuilder::andExpr(Val* lhs, Val* rhs) { - return newLogicExpr(BinaryOpType::And, lhs, rhs); -} - -Val* IrBuilder::eqExpr(Val* lhs, Val* rhs) { - return newLogicExpr(BinaryOpType::Eq, lhs, rhs); -} - -Val* IrBuilder::gtExpr(Val* lhs, Val* rhs) { - return newLogicExpr(BinaryOpType::GT, lhs, rhs); -} - -Val* IrBuilder::ltExpr(Val* lhs, Val* rhs) { - return newLogicExpr(BinaryOpType::LT, lhs, rhs); -} - -Val* IrBuilder::leExpr(Val* lhs, Val* rhs) { - return newLogicExpr(BinaryOpType::LE, lhs, rhs); -} - -Val* IrBuilder::geExpr(Val* lhs, Val* rhs) { - return newLogicExpr(BinaryOpType::GE, lhs, rhs); -} - -Val* IrBuilder::addExpr(Val* lhs, Val* rhs) { - return newArithmeticExpr(BinaryOpType::Add, lhs, rhs); -} - -Val* IrBuilder::subExpr(Val* lhs, Val* rhs) { - return newArithmeticExpr(BinaryOpType::Sub, lhs, rhs); -} - -Val* IrBuilder::mulExpr(Val* lhs, Val* rhs) { - return newArithmeticExpr(BinaryOpType::Mul, lhs, rhs); -} - -Val* IrBuilder::divExpr(Val* lhs, Val* rhs) { - return newArithmeticExpr(BinaryOpType::Div, lhs, rhs); -} - -Val* IrBuilder::ceilDivExpr(Val* lhs, Val* rhs) { - return newArithmeticExpr(BinaryOpType::CeilDiv, lhs, 
rhs); -} - -Val* IrBuilder::modExpr(Val* lhs, Val* rhs) { - return newArithmeticExpr(BinaryOpType::Mod, lhs, rhs); -} - -Val* IrBuilder::maxExpr(Val* lhs, Val* rhs) { - return newArithmeticExpr(BinaryOpType::Max, lhs, rhs); -} - -Val* IrBuilder::minExpr(Val* lhs, Val* rhs) { - return newArithmeticExpr(BinaryOpType::Min, lhs, rhs); -} - -Int* IrBuilder::zeroVal() { - if (zero_ == nullptr) { - zero_ = create(0); - } - return zero_; -} - -Int* IrBuilder::oneVal() { - if (one_ == nullptr) { - one_ = create(1); - } - return one_; -} - -Bool* IrBuilder::falseVal() { - if (false_ == nullptr) { - false_ = create(false); - } - return false_; -} - -Bool* IrBuilder::trueVal() { - if (true_ == nullptr) { - true_ = create(true); - } - return true_; -} - -NamedScalar* IrBuilder::magicZeroVal() { - if (magic_zero_ == nullptr) { - magic_zero_ = create(kMagicZeroName, DataType::Int); - } - return magic_zero_; -} - -Val* SimplifyingIrBuilder::negExpr(Val* val) { - if (auto int_val = dynamic_cast(val)) { - if (int_val->isConst()) { - return create(-int_val->value().value()); - } - } - return IrBuilder::negExpr(val); -} - -Val* SimplifyingIrBuilder::notExpr(Val* val) { - if (auto bool_val = dynamic_cast(val)) { - if (bool_val->isConst()) { - if (bool_val->value().value()) { - return falseVal(); - } else { - return trueVal(); - } - } - } - return IrBuilder::notExpr(val); -} - -Val* SimplifyingIrBuilder::addExpr(Int* lhs, Int::ScalarType rhs) { - if (rhs == 0) { - return lhs; - } else if (lhs == nullptr) { - return IrBuilder::create(rhs); - } else if (lhs->isConst()) { - return IrBuilder::create(lhs->value().value() + rhs); - } else if (rhs > 0) { - return IrBuilder::addExpr(lhs, IrBuilder::create(rhs)); - } else { - return IrBuilder::subExpr(lhs, IrBuilder::create(-rhs)); - } -} - -Val* SimplifyingIrBuilder::addExpr(Int* lhs, Int* rhs) { - if (rhs == nullptr) { - return lhs; - } else if (lhs == nullptr) { - return rhs; - } else if (lhs->isConst()) { - return addExpr(rhs, lhs->value().value()); - } else if (rhs->isConst()) { - return addExpr(lhs, rhs->value().value()); - } else { - return IrBuilder::addExpr(lhs, rhs); - } -} - -Val* SimplifyingIrBuilder::addExpr(Val* lhs, Val* rhs) { - TORCH_INTERNAL_ASSERT(lhs != nullptr || rhs != nullptr); - if (lhs == nullptr || lhs->isZeroInt()) { - return rhs; - } else if (rhs == nullptr || rhs->isZeroInt()) { - return lhs; - } - auto lhs_int = dynamic_cast(lhs); - auto rhs_int = dynamic_cast(rhs); - if (lhs_int != nullptr && rhs_int != nullptr) { - return addExpr(lhs_int, rhs_int); - } else { - return IrBuilder::addExpr(lhs, rhs); - } -} - -Val* SimplifyingIrBuilder::subExpr(Val* lhs, Val* rhs) { - return addExpr(lhs, negExpr(rhs)); -} - -Val* SimplifyingIrBuilder::andExpr(Val* lhs, Val* rhs) { - TORCH_INTERNAL_ASSERT(!(lhs == nullptr && rhs == nullptr)); - - if (lhs == nullptr) { - return rhs; - } else if (rhs == nullptr) { - return lhs; - } - - bool lhs_definitely_true = false; - bool lhs_definitely_false = false; - auto lhs_bool = dynamic_cast(lhs); - if (lhs_bool && lhs_bool->isConst()) { - lhs_definitely_true = lhs_bool->value().value(); - lhs_definitely_false = !lhs_bool->value().value(); - } - auto rhs_bool = dynamic_cast(rhs); - bool rhs_definitely_true = false; - bool rhs_definitely_false = false; - if (rhs_bool && rhs_bool->isConst()) { - rhs_definitely_true = rhs_bool->value().value(); - rhs_definitely_false = !rhs_bool->value().value(); - } - - if (lhs_definitely_true && rhs_definitely_true) { - return trueVal(); - } else if (lhs_definitely_false || 
rhs_definitely_false) { - return falseVal(); - } else if (lhs_definitely_true) { - return rhs; - } else if (rhs_definitely_true) { - return lhs; - } - - return IrBuilder::andExpr(lhs, rhs); -} - -} // namespace kir -} // namespace cuda -} // namespace fuser -} // namespace jit -} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir_builder.h b/torch/csrc/jit/codegen/cuda/kernel_ir_builder.h deleted file mode 100644 index 17a095baf120..000000000000 --- a/torch/csrc/jit/codegen/cuda/kernel_ir_builder.h +++ /dev/null @@ -1,131 +0,0 @@ -#pragma once - -#include -#include -#include - -#include - -namespace torch { -namespace jit { -namespace fuser { -namespace cuda { -namespace kir { - -//! Kernel IR builder interface -//! -//! The only way to create new Kernel IR nodes is through the -//! kir::IrBuilder interface. An IrBuilder instance is attached to a -//! particular Kernel instance and it provides methods for creating -//! single nodes (kir::IrBuilder::create()) or basic composite expressions -//! (ex. kir::IrBuilder::addExpr()). -//! -//! If the Kernel object is readily available, an IrBuilder can be "wrapped" -//! around it directly: -//! -//! kir::IrBuilder ir_builder(kernel); -//! -//! During lowering, another option is to create an IrBuilder for the -//! kernel that is being created: -//! -//! kir::IrBuilder ir_builder(GpuLower::current()->kernel()); -//! -//! Once we have an IR builder instance, creating nodes looks like: -//! -//! auto new_node = ir_builder.create(1)); -//! auto result = ir_builder.mulExpr(lhs, rhs); -//! -class TORCH_CUDA_CU_API IrBuilder { - public: - explicit IrBuilder(Kernel* kernel) : kernel_(kernel) {} - - //! Allocate a new Kernel IR node, forwarding the arguments - //! to the appropriate constructor - template - T* create(Args&&... args) { - const kir::Passkey passkey(kernel_); - const auto node = new T(passkey, std::forward(args)...); - kernel_->registerIrNode(passkey, std::unique_ptr(node)); - return node; - } - - // Unary operations - Val* negExpr(Val* val); - Val* notExpr(Val* val); - Val* setExpr(Val* val); - Val* setExprNamedScalar(const std::string& name, Val* val); - Val* addressExprNamedScalar(const std::string& name, Val* val); - - // Binary operations - Val* andExpr(Val* lhs, Val* rhs); - Val* eqExpr(Val* lhs, Val* rhs); - Val* gtExpr(Val* lhs, Val* rhs); - Val* ltExpr(Val* lhs, Val* rhs); - Val* leExpr(Val* lhs, Val* rhs); - Val* geExpr(Val* lhs, Val* rhs); - Val* addExpr(Val* lhs, Val* rhs); - Val* subExpr(Val* lhs, Val* rhs); - Val* mulExpr(Val* lhs, Val* rhs); - Val* divExpr(Val* lhs, Val* rhs); - Val* ceilDivExpr(Val* lhs, Val* rhs); - Val* modExpr(Val* lhs, Val* rhs); - Val* maxExpr(Val* lhs, Val* rhs); - Val* minExpr(Val* lhs, Val* rhs); - - // Ternary operations - Val* whereExpr(Val* pred, Val* lhs, Val* rhs); - - // Shortcuts for frequently used vals - Int* zeroVal(); - Int* oneVal(); - Bool* falseVal(); - Bool* trueVal(); - - NamedScalar* magicZeroVal(); - - private: - Val* newResult(DataType dtype); - Val* newArithmeticExpr(BinaryOpType op_type, Val* lhs, Val* rhs); - Val* newLogicExpr(BinaryOpType op_type, Val* lhs, Val* rhs); - - private: - // Non-owning pointer to the kernel to be modified - Kernel* kernel_ = nullptr; - // Frequently used constant vals - Int* zero_ = nullptr; - Int* one_ = nullptr; - Bool* false_ = nullptr; - Bool* true_ = nullptr; - - // Magic zero corresponds to runtime/helpers.cu magic_zero - NamedScalar* magic_zero_ = nullptr; -}; - -//! A wrapper builder with static expression simplification -//! 
-//! Example: -//! - addExpr(new Int(1), new Int(2)) -> Int(3) -//! - addExpr(new Int(0), new NamedScalar("foo")) -> NamedScalar("foo") -//! -//! Designed to be used to simplify predicate and index expressions in -//! generated code. Also, the shift validation may fail without -//! this simplification. -class TORCH_CUDA_CU_API SimplifyingIrBuilder : public IrBuilder { - public: - explicit SimplifyingIrBuilder(Kernel* kernel) : IrBuilder(kernel) {} - - Val* negExpr(Val* val); - Val* notExpr(Val* val); - - Val* addExpr(Int* lhs, Int::ScalarType rhs); - Val* addExpr(Int* lhs, Int* rhs); - Val* addExpr(Val* lhs, Val* rhs); - Val* subExpr(Val* lhs, Val* rhs); - Val* andExpr(Val* lhs, Val* rhs); -}; - -} // namespace kir -} // namespace cuda -} // namespace fuser -} // namespace jit -} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.cpp b/torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.cpp new file mode 100644 index 000000000000..a64b07da4a05 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.cpp @@ -0,0 +1,213 @@ +#include +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { +namespace kir { +std::vector IrVisitor::handle(const std::vector& exprs) { + exprs_ = std::vector(exprs); + for (auto expr : exprs) { + handle(expr); + } + return exprs_; +} + +void IrVisitor::handle(ForLoop* fl) { + for_loops_.push_back(fl); + scope_.push_back(&fl->body()); + scope_exprs_.push_back(fl); + auto body_exprs = std::vector(fl->body().exprs()); + for (auto expr : body_exprs) { + handle(expr); + } + scope_exprs_.pop_back(); + scope_.pop_back(); + for_loops_.pop_back(); +} + +void IrVisitor::handle(IfThenElse* ite) { + scope_exprs_.push_back(ite); + scope_.push_back(&ite->thenBody()); + auto then_exprs = std::vector(ite->thenBody().exprs()); + for (auto expr : then_exprs) { + handle(expr); + } + scope_.pop_back(); + + scope_.push_back(&ite->elseBody()); + auto else_exprs = std::vector(ite->elseBody().exprs()); + for (auto expr : else_exprs) { + handle(expr); + } + scope_.pop_back(); + scope_exprs_.pop_back(); +} + +std::vector ExprMutator::mutate(bool reverse_order) { + if (insertions_.empty() && replacements_.empty() && removal_.empty()) { + return exprs_; + } + + auto run_insertion = [&](MutationInformation info) { + if (info.scope == nullptr) { + // If reference is nullptr and there are no expressions, simply insert the + // expr + if (exprs_.empty() && info.reference == nullptr) { + exprs_.push_back(info.new_expr); + return; + } + auto pos_it = std::find(exprs_.begin(), exprs_.end(), info.reference); + TORCH_INTERNAL_ASSERT( + pos_it != exprs_.end(), + "Issue finding reference expression for insertion."); + if (info.mode == MutationMode::BEFORE) { + exprs_.insert(pos_it, info.new_expr); + } else { + exprs_.insert(pos_it + 1, info.new_expr); + } + } else { + // If reference is nullptr and there are no expressions, simply insert the + // expr + if (info.scope->exprs().empty() && info.reference == nullptr) { + info.scope->push_back(info.new_expr); + return; + } + if (info.mode == MutationMode::BEFORE) { + info.scope->insert_before(info.reference, info.new_expr); + } else { + info.scope->insert_after(info.reference, info.new_expr); + } + } + }; + + if (reverse_order) { + for (auto it = insertions_.rbegin(); it != insertions_.rend(); ++it) { + run_insertion(*it); + } + } else { + for (auto insertion_info : insertions_) { + run_insertion(insertion_info); + } + } + + for (auto replacement_info : replacements_) { + if 
(replacement_info.scope == nullptr) { + auto pos_it = + std::find(exprs_.begin(), exprs_.end(), replacement_info.reference); + TORCH_INTERNAL_ASSERT( + pos_it != exprs_.end(), + "Issue finding reference expression for replacement."); + exprs_.insert(pos_it, replacement_info.new_expr); + // iterator can be invalidated from insertion + pos_it = + std::find(exprs_.begin(), exprs_.end(), replacement_info.reference); + exprs_.erase(pos_it); + } else { + replacement_info.scope->insert_before( + replacement_info.reference, replacement_info.new_expr); + replacement_info.scope->erase(replacement_info.reference); + } + } + + for (auto removal_info : removal_) { + if (removal_info.scope == nullptr) { + auto pos_it = + std::find(exprs_.begin(), exprs_.end(), removal_info.reference); + TORCH_INTERNAL_ASSERT( + pos_it != exprs_.end(), "Issue finding expression to remove."); + exprs_.erase(pos_it); + } else { + TORCH_INTERNAL_ASSERT( + removal_info.scope->contains(removal_info.reference), + "Expression to remove is not found in the given scope: ", + removal_info.reference->toString()); + removal_info.scope->erase(removal_info.reference); + } + } + + insertions_.clear(); + replacements_.clear(); + + return exprs_; +} + +std::vector ExprMutator::traverseAndInsert( + const std::vector& exprs, + bool reverse_order) { + IrVisitor::handle(exprs); + return mutate(reverse_order); +} + +void ExprMutator::registerMutation( + Expr* reference, + Expr* new_expr, + Scope* scope, + MutationMode mode) { + MutationInformation mutation; + mutation.reference = reference; + mutation.new_expr = new_expr; + mutation.scope = scope; + mutation.mode = mode; + if (mode == MutationMode::BEFORE || mode == MutationMode::AFTER) { + insertions_.push_back(mutation); + } else if (mode == MutationMode::REPLACE) { + replacements_.push_back(mutation); + } else if (mode == MutationMode::REMOVE) { + removal_.push_back(mutation); + } else { + TORCH_INTERNAL_ASSERT(false, "Invalid mutation type"); + } +} + +void ExprMutator::registerInsertBefore( + Expr* reference, + Expr* new_expr, + Scope* scope) { + registerMutation(reference, new_expr, scope, MutationMode::BEFORE); +} + +void ExprMutator::registerInsertAfter( + Expr* reference, + Expr* new_expr, + Scope* scope) { + registerMutation(reference, new_expr, scope, MutationMode::AFTER); +} + +void ExprMutator::registerReplace( + Expr* reference, + Expr* new_expr, + Scope* scope) { + registerMutation(reference, new_expr, scope, MutationMode::REPLACE); +} + +void ExprMutator::registerRemove(Expr* expr_to_remove, Scope* scope) { + registerMutation(expr_to_remove, nullptr, scope, MutationMode::REMOVE); +} + +void ExprMutator::registerInsertBefore(Expr* reference, Expr* new_expr) { + Scope* scope = scope_.empty() ? nullptr : scope_.back(); + registerInsertBefore(reference, new_expr, scope); +} + +void ExprMutator::registerInsertAfter(Expr* reference, Expr* new_expr) { + Scope* scope = scope_.empty() ? nullptr : scope_.back(); + registerInsertAfter(reference, new_expr, scope); +} + +void ExprMutator::registerReplace(Expr* reference, Expr* new_expr) { + Scope* scope = scope_.empty() ? nullptr : scope_.back(); + registerReplace(reference, new_expr, scope); +} + +void ExprMutator::registerRemove(Expr* expr_to_remove) { + Scope* scope = scope_.empty() ? 
nullptr : scope_.back(); + registerRemove(expr_to_remove, scope); +} + +} // namespace kir +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h b/torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h new file mode 100644 index 000000000000..d665c4a6fdf5 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h @@ -0,0 +1,126 @@ +#pragma once + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +class Expr; + +namespace kir { +class Predicate; +class TensorIndex; +class ForLoop; +class IfThenElse; +class Scope; + +// Base visitor class that visits all nodes in provided vector. +// +// Includes visiting through scopes like IfThenElse and ForLoop, and tracks +// them in scopes_ and for_loops_. +// +// Makes a copy of exprs at exprs_, which can be modified and returned. +// +// When traversing through ITE/FLs it will use a copy +// of the provided expressions to make it safe to insert/delete nodes. +// +// Provides a simple base class to inherit from for typical lowering passes on +// Expr list +class TORCH_CUDA_CU_API IrVisitor : public OptOutDispatch { + public: + std::vector handle(const std::vector& expr); + + protected: + using OptOutDispatch::handle; + + virtual void handle(ForLoop*) override; + virtual void handle(IfThenElse*) override; + + protected: + std::vector for_loops_; + std::vector scope_; + std::vector scope_exprs_; + std::vector exprs_; +}; + +// Base Expr Mutator class that visits all nodes with IrVisitor, and then +// inserts new expressions, replaces expressions based on insertion/replace +// maps provided or removes existing expressions. These replacement +// maps are expected to accumulate during an initial traversal, then +// runs an insertion based on them after the overloaded traversal. +// +// Order of mutations may be important; mutations are ordered according to the +// following rules: +// Before/After insertions are ordered as registered when reverse_order == +// false, +// +// Before/After insertions are in reverse order as registered when +// reverse_order == true, +// +// Before/After insertions are done before Expr replacements, so reference for +// insertions must be on pre-replaced Exprs +// +// Removal of expressions is done after replacements. +// +// To place in a scope that is empty, simply provide a nullptr reference +// Since insertions are done in order, it's possible to insert an expression in +// an empty scope, and then use that inserted scope as a reference for +// subsequent mutations. +class ExprMutator : public IrVisitor { + protected: + std::vector traverseAndInsert( + const std::vector& expr, + bool reverse_order = false); + + std::vector mutate(bool reverse_order = false); + + using IrVisitor::handle; + // Registration functions which *don't* need to be called "in place" during + // visiting. + void registerInsertBefore(Expr* reference, Expr* new_expr, Scope* scope); + void registerInsertAfter(Expr* reference, Expr* new_expr, Scope* scope); + void registerReplace(Expr* reference, Expr* new_expr, Scope* scope); + void registerRemove(Expr* expr_to_remove, Scope* scope); + + // Registration functions which need to be called "in place" during visiting. + // I.E. + // if you want to insert before/after or replace an Expr, you must register + // when in handle(Expr*) of that expr.
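  // Illustrative sketch (not taken from this diff; the pass name
  // "InsertSyncAfterLoops" and the IrBuilder::create<kir::BlockSync>() call are
  // assumptions about the surrounding builder API): a pass built on
  // ExprMutator typically overrides handle(), registers mutations "in place"
  // while visiting, and lets traverseAndInsert() apply them afterwards, e.g.
  //
  //   class InsertSyncAfterLoops : private kir::ExprMutator {
  //    public:
  //     std::vector<Expr*> run(const std::vector<Expr*>& exprs) {
  //       return traverseAndInsert(exprs);
  //     }
  //
  //    protected:
  //     using kir::ExprMutator::handle;
  //     void handle(kir::ForLoop* fl) final {
  //       // Recurse into the loop body first, then register an insertion
  //       // while fl is being visited, as described above.
  //       kir::ExprMutator::handle(fl);
  //       registerInsertAfter(fl, IrBuilder::create<kir::BlockSync>(false));
  //     }
  //   };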
+ void registerInsertBefore(Expr* reference, Expr* new_expr); + void registerInsertAfter(Expr* reference, Expr* new_expr); + void registerReplace(Expr* reference, Expr* new_expr); + void registerRemove(Expr* expr_to_remove); + + private: + enum class MutationMode { BEFORE, AFTER, REPLACE, REMOVE }; + + void registerMutation( + Expr* ref, + Expr* new_expr, + Scope* scope, + MutationMode mode); + + struct MutationInformation { + Expr* reference = nullptr; + Expr* new_expr = nullptr; + Scope* scope = nullptr; + MutationMode mode = MutationMode::BEFORE; + }; + + // Track insertions as they're registered + std::vector insertions_; + + // Track replacements as they're registered + std::vector replacements_; + + // Track removal as they're registered + std::vector removal_; +}; + +} // namespace kir +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir_printer.cpp b/torch/csrc/jit/codegen/cuda/kernel_ir_printer.cpp deleted file mode 100644 index e00da31423c1..000000000000 --- a/torch/csrc/jit/codegen/cuda/kernel_ir_printer.cpp +++ /dev/null @@ -1,451 +0,0 @@ -#include -#include - -#include -#include - -#include - -namespace torch { -namespace jit { -namespace fuser { -namespace cuda { -namespace kir { - -namespace { - -const char* boolLiteral(bool value) { - return value ? "true" : "false"; -} - -std::string varName(const kir::Val* val, const char* prefix) { - std::stringstream value_name; - if (val == nullptr) { - value_name << "$nullptr"; - } else if (val->name() != kInvalidStmName) { - value_name << prefix << val->name(); - } else { - value_name << "k" << prefix << val->id(); - } - return value_name.str(); -} - -} // namespace - -void IrPrinter::printNode(const kir::Node* node) { - os_ << gen(node, true); -} - -void IrPrinter::printKernel(const Kernel* kernel) { - TORCH_CHECK(kernel != nullptr); - - // kernel declaration - os_ << "\nKERNEL ("; - for (auto in : kernel->inputs()) { - os_ << gen(in); - if (in != kernel->inputs().back()) { - os_ << ", "; - } - } - os_ << ") -> ("; - for (auto out : kernel->outputs()) { - os_ << gen(out); - if (out != kernel->outputs().back()) { - os_ << ", "; - } - } - os_ << ") :\n"; - - // kernel body - startBlock(); - for (auto expr : kernel->topLevelExprs()) { - os_ << gen(expr, true); - } - endBlock(); - os_ << "END.\n\n"; -} - -std::ostream& IrPrinter::indent() { - for (const auto i : c10::irange(indent_level_)) { - (void)i; // Suppress unused variable warning - ir_str_ << kTab; - } - ir_str_ << margin_; - return ir_str_; -} - -std::string IrPrinter::gen(const kir::Node* node, bool top_level) { - if (node == nullptr) { - return "$nullptr"; - } - - // If we're generatign a top level statement we expect to start - // with an empty set of uses - TORCH_INTERNAL_ASSERT(!implicit_definition_ || uses_.empty() || !top_level); - - // Mark the node as generated - visited_.insert(node); - - // Generate the node itself - std::stringstream node_str; - std::swap(node_str, ir_str_); - node->accept(this); - std::swap(node_str, ir_str_); - - if (!implicit_definition_) { - return node_str.str(); - } - - if (top_level) { - // Implicitly mark top level nodes as used, so we - // get their definitions printed (useful for debugging) - if (auto val = dynamic_cast(node)) { - uses_.insert(val); - } - - // Make a copy of the node uses (and reset global state) - const auto node_uses = uses_; - uses_.clear(); - - std::stringstream top_level_str; - - // Hoist implicit definitions - for (auto use : 
node_uses) { - const auto def = use->definition(); - if (def && visited_.find(def) == visited_.end()) { - margin_ = "~ "; - top_level_str << gen(def, true); - margin_ = ""; - } - } - - top_level_str << node_str.str(); - return top_level_str.str(); - } else { - return node_str.str(); - } -} - -std::string IrPrinter::use(const kir::Val* val) { - if (val != nullptr) { - uses_.insert(val); - } - return gen(val); -} - -void IrPrinter::startBlock() { - ++indent_level_; -} - -void IrPrinter::endBlock() { - TORCH_CHECK(indent_level_ > 0); - --indent_level_; -} - -void IrPrinter::handleBlock(const kir::Scope& scope) { - // Save the uses of the parent scope - decltype(uses_) outer_uses; - std::swap(uses_, outer_uses); - - startBlock(); - for (auto expr : scope.exprs()) { - ir_str_ << gen(expr, true); - } - endBlock(); - - // Restore parent's uses - std::swap(uses_, outer_uses); -} - -void IrPrinter::visit(const kir::Bool* node) { - if (node->isConst()) { - ir_str_ << boolLiteral(*node->value()); - } else { - ir_str_ << varName(node, "b"); - } -} - -void IrPrinter::visit(const kir::Double* node) { - if (node->isConst()) { - const int digits = std::numeric_limits::max_digits10; - ir_str_ << "double(" << std::setprecision(digits) << *node->value() << ")"; - } else { - ir_str_ << varName(node, "d"); - } -} - -void IrPrinter::visit(const kir::Int* node) { - if (node->isConst()) { - ir_str_ << *node->value(); - } else { - ir_str_ << varName(node, "i"); - } -} - -void IrPrinter::visit(const kir::NamedScalar* node) { - ir_str_ << node->name(); -} - -void IrPrinter::visit(const kir::Predicate* node) { - switch (node->predicate_type()) { - case PredicateType::Inline: { - ir_str_ << "Inline"; - break; - } - case PredicateType::Manual: { - ir_str_ << node->value(); - break; - } - case PredicateType::Misaligned: { - ir_str_ << "Misaligned"; - break; - } - case PredicateType::Padding: { - ir_str_ << "Padding"; - break; - } - case PredicateType::Shift: { - ir_str_ << "Shift"; - break; - } - case PredicateType::Unswitch: { - ir_str_ << "Unswitch"; - break; - } - case PredicateType::Vectorize: { - ir_str_ << "Vectorize"; - break; - } - default: - break; - } -} - -void IrPrinter::visit(const kir::TensorIndex* node) { - ir_str_ << gen(node->view()) << "["; - for (auto index : node->indices()) { - ir_str_ << use(index); - if (index != node->indices().back()) { - ir_str_ << ", "; - } - } - ir_str_ << "]"; -} - -void IrPrinter::visit(const kir::IterDomain* node) { - ir_str_ << varName(node, "id") << "["; - if (node->isRFactorProduct()) { - ir_str_ << "rfactor."; - } - ir_str_ << node->parallelType() << "." << node->iterType() << "(" - << use(node->start()) << " .. " << use(node->extent()) << ")]"; -} - -void IrPrinter::visit(const kir::TensorDomain*) { - // TODO(kir): print Tensor shapes? - ir_str_ << "kir::TensorDomain"; -} - -void IrPrinter::visit(const kir::TensorView* node) { - // TODO(kir): print memory type too? 
- ir_str_ << varName(node, "T"); -} - -void IrPrinter::visit(const kir::UnaryOp* node) { - indent() << gen(node->out()) << " = "; - - auto op_type = node->operation(); - - if (auto op = inline_op_str(op_type)) { - if (alsoBooleanOperator(op_type) && - node->out()->dtype() == DataType::Bool) { - ir_str_ << stringifyBooleanOp(op_type) << gen(node->in()); - } else { - ir_str_ << *op << gen(node->in()); - } - } else { - if (op_type == UnaryOpType::Cast) { - const auto cast_str = - cast_func_str({node->in()->dtype(), node->out()->dtype()}); - ir_str_ << cast_str.value(); - } else { - ir_str_ << op_type; - if (needFloatSuffix(op_type) && node->out()->dtype() == DataType::Float) { - ir_str_ << "f"; - } - } - - if (op_type == UnaryOpType::RandLike) { - ir_str_ << "(RND"; - } else { - ir_str_ << "("; - ir_str_ << use(node->in()); - } - ir_str_ << ")"; - } - - ir_str_ << "\n"; -} - -void IrPrinter::visit(const kir::BinaryOp* node) { - indent() << gen(node->out()) << " = "; - - const auto op_type = node->operation(); - const auto lhs = use(node->lhs()); - const auto rhs = use(node->rhs()); - - if (auto op = inline_op_str(op_type)) { - ir_str_ << lhs << " "; - if (alsoBooleanOperator(op_type) && - node->out()->dtype() == DataType::Bool) { - ir_str_ << stringifyBooleanOp(op_type); - } else { - ir_str_ << *op; - } - ir_str_ << " " << rhs; - } else { - ir_str_ << op_type; - if (needFloatSuffix(op_type) && node->out()->dtype() == DataType::Float) { - ir_str_ << "f"; - } - ir_str_ << "(" << lhs << ", " << rhs << ")"; - } - - ir_str_ << "\n"; -} - -void IrPrinter::visit(const kir::TernaryOp* node) { - indent() << gen(node->out()) << " = " << node->operation() << "(" - << use(node->in1()) << ", " << use(node->in2()) << ", " - << use(node->in3()) << ")\n"; -} - -void IrPrinter::visit(const kir::ReductionOp* node) { - indent() << gen(node->out()) << " = " - << "REDUCTION(op='" << node->operation() << "'" - << ", in=" << use(node->in()) << ", init=" << use(node->init()) - << ", pred=" << use(node->predicate()) << ")\n"; -} - -void IrPrinter::visit(const kir::WelfordOp* node) { - indent() << gen(node->outVar()) << "," << gen(node->outAvg()) << "," - << gen(node->outN()) << " = " - << "Welford( inAvg=" << use(node->inAvg()); - if (!node->inN()->isOneInt()) { - indent() << " inVar=" << use(node->inVar()); - } - indent() << " inN=" << use(node->inN()); - if (!node->initN()->isZeroInt()) { - indent() << ", initVar=" << use(node->initVar()) - << " initAvg=" << use(node->initAvg()) - << " initN=" << use(node->initN()); - } - indent() << ", pred=" << use(node->predicate()) << ")\n"; -} - -void IrPrinter::visit(const kir::GridReduction* node) { - const auto* reduction_op = node->reduction_op(); - indent() << gen(reduction_op->out()) << " = " - << "GRID_REDUCTION(op='" << reduction_op->operation() << "'" - << ", in=" << use(reduction_op->in()) - << ", init=" << use(reduction_op->init()) - << ", pred=" << use(reduction_op->predicate()) << ")\n"; - indent() << kTab << kTab - << ".reduction_buffer=" << use(node->reduction_buffer()->buffer()) - << "\n"; - indent() << kTab << kTab - << ".sync_buffer=" << use(node->sync_buffer()->buffer()) << "\n"; - indent() << kTab << kTab << ".grid_pred=" << use(node->predicate()) << "\n"; -} - -void IrPrinter::visit(const kir::GridWelford* node) { - const auto* welford_op = node->welford_op(); - indent() << gen(welford_op->outVar()) << "," << gen(welford_op->outAvg()) - << "," << gen(welford_op->outN()) << " = " - << "GRID_WELFORD(" - << "inAvg=" << use(welford_op->inAvg()); - if 
(!welford_op->inN()->isOneInt()) { - indent() << ", inVar=" << use(welford_op->inVar()); - } - indent() << ", inN=" << use(welford_op->inN()); - if (!welford_op->initN()->isZeroInt()) { - indent() << ", initVar=" << use(welford_op->initVar()) - << " initAvg=" << use(welford_op->initAvg()) - << " initN=" << use(welford_op->initN()); - } - indent() << ", pred=" << use(welford_op->predicate()) << ")\n"; - indent() << kTab << kTab - << ".var_buffer=" << use(node->var_buffer()->buffer()) - << ".avg_buffer=" << use(node->avg_buffer()->buffer()) - << ".n_buffer=" << use(node->N_buffer()->buffer()) << "\n"; - indent() << kTab << kTab - << ".sync_buffer=" << use(node->sync_buffer()->buffer()) << "\n"; - indent() << kTab << kTab << ".grid_pred=" << use(node->predicate()) << "\n"; -} - -void IrPrinter::visit(const kir::BroadcastOp* node) { - indent() << gen(node->out()) << " = BROADCAST(" << use(node->in()) << ")\n"; -} - -void IrPrinter::visit(const kir::ForLoop* node) { - indent() << "FOR " << gen(node->index()) << " in " << gen(node->iter_domain()) - << ":\n"; - handleBlock(node->body()); -} - -void IrPrinter::visit(const kir::IfThenElse* node) { - indent() << "IF " << use(node->predicate()) << ":\n"; - handleBlock(node->thenBody()); - if (node->hasElse()) { - indent() << "ELSE:\n"; - handleBlock(node->elseBody()); - } -} - -void IrPrinter::visit(const kir::Allocate* node) { - indent() << gen(node->buffer()) << " = ALLOCATE(" - << "mem_type=" << node->memoryType() << ", " - << "size=" << use(node->size()) << ", " - << "zero_init=" << boolLiteral(node->zeroInit()) << ")\n"; - if (node->alias() != nullptr) { - indent() << kTab << kTab << ".alias=" << gen(node->alias()->buffer()) - << "\n"; - } -} - -void IrPrinter::visit(const kir::Sync* node) { - indent() << "SYNC(war_hazard=" << boolLiteral(node->isWarHazardSync()) - << ")\n"; -} - -void IrPrinter::visit(const kir::InitMagicZero* node) { - indent() << "NVFUSER_DEFINE_MAGIC_ZERO\n"; -} - -void IrPrinter::visit(const kir::UpdateMagicZero* node) { - indent() << "NVFUSER_UPDATE_MAGIC_ZERO\n"; -} - -std::string toString(const kir::Node* stmt, bool implicit_definitions) { - std::stringstream ss; - IrPrinter ir_printer(ss, implicit_definitions); - ir_printer.printNode(stmt); - return ss.str(); -} - -std::string toString( - const std::vector& exprs, - bool implicit_definitions) { - std::stringstream ss; - IrPrinter ir_printer(ss, implicit_definitions); - for (auto expr : exprs) { - ir_printer.printNode(expr); - } - return ss.str(); -} - -} // namespace kir -} // namespace cuda -} // namespace fuser -} // namespace jit -} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir_printer.h b/torch/csrc/jit/codegen/cuda/kernel_ir_printer.h deleted file mode 100644 index 115901a031a9..000000000000 --- a/torch/csrc/jit/codegen/cuda/kernel_ir_printer.h +++ /dev/null @@ -1,129 +0,0 @@ -#pragma once - -#include - -#include -#include - -#include -#include -#include -#include - -namespace torch { -namespace jit { -namespace fuser { -namespace cuda { -namespace kir { - -//! Define pretty printing functions for Kernel IR nodes -//! -//! This class is intended for debug printing, so it attempts -//! to handle invalid IR states as much as possible. -//! -//! implicit_definition_ = true will recurisvely print the definition of all -//! inputs to an expression if they haven't been printed. -class TORCH_CUDA_CU_API IrPrinter : private kir::IrVisitor { - static constexpr char const* kTab = " "; - - public: - //! 
Constructs a new IrPrinter which outputs to the specified stream - explicit IrPrinter(std::ostream& os, bool implicit_definition = true) - : os_(os), implicit_definition_(implicit_definition) {} - - //! Print a single Kernel IR node - void printNode(const kir::Node* node); - - //! Print a complete Kernel definition - void printKernel(const Kernel* kernel); - - private: - // Generates a string representation of an IR node - // - // If `top_level` is true, all the value uses are tracked and - // their definitions are implicitly printed before the node itself - // - std::string gen(const kir::Node* node, bool top_level = false); - - // Generate a string representation of an used value - // (this helps automatically tracking the value uses) - std::string use(const kir::Val* val); - - std::ostream& indent(); - - void startBlock(); - void endBlock(); - void handleBlock(const kir::Scope& scope); - - void visit(const kir::Bool*) final; - void visit(const kir::Double*) final; - void visit(const kir::Int*) final; - void visit(const kir::NamedScalar*) final; - void visit(const kir::Predicate*) final; - - void visit(const kir::TensorIndex*) final; - void visit(const kir::IterDomain*) final; - void visit(const kir::TensorDomain*) final; - void visit(const kir::TensorView*) final; - - void visit(const kir::UnaryOp*) final; - void visit(const kir::BinaryOp*) final; - void visit(const kir::TernaryOp*) final; - void visit(const kir::ReductionOp*) final; - void visit(const kir::WelfordOp*) final; - void visit(const kir::BroadcastOp*) final; - - void visit(const kir::GridReduction*) final; - void visit(const kir::GridWelford*) final; - void visit(const kir::ForLoop*) final; - void visit(const kir::IfThenElse*) final; - void visit(const kir::Allocate*) final; - void visit(const kir::Sync*) final; - void visit(const kir::InitMagicZero*) final; - void visit(const kir::UpdateMagicZero*) final; - - private: - std::ostream& os_; - - // Current indentation level - int indent_level_ = 0; - - // Internal IR generation stream - std::stringstream ir_str_; - - // Tracks the set of nodes which have been printed - std::unordered_set visited_; - - // Optional left margin printed after the indentation - const char* margin_ = ""; - - // The set of values used by the current top-level IR node - std::unordered_set uses_; - - // If the definition of all inputs to an expression haven't been printed - // already implicit_definition_ = true will print them before printing the - // requested node. - bool implicit_definition_ = true; -}; - -//! Returns the string representation of a Kernel IR node. If the definition of -//! all inputs to an expression haven't been printed already -//! implicit_definition_ = true will print them before printing the requested -//! node. -TORCH_CUDA_CU_API std::string toString( - const kir::Node* stmt, - bool implicit_definitions = true); - -//! Returns the string representation of a vector of kir::Expr, convenient -//! debugm echanism during lowering. If the definition of all inputs to an -//! expression haven't been printed already implicit_definition_ = true will -//! print them before printing the requested node. 
-TORCH_CUDA_CU_API std::string toString( - const std::vector& exprs, - bool implicit_definitions = true); - -} // namespace kir -} // namespace cuda -} // namespace fuser -} // namespace jit -} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower2device.cpp b/torch/csrc/jit/codegen/cuda/lower2device.cpp index 036eee58206a..3e644fc9a44d 100644 --- a/torch/csrc/jit/codegen/cuda/lower2device.cpp +++ b/torch/csrc/jit/codegen/cuda/lower2device.cpp @@ -6,18 +6,19 @@ #include #include #include -#include #include #include +#include #include +#include #include #include #include #include #include #include +#include #include -#include #include #include #include @@ -33,152 +34,15 @@ namespace jit { namespace fuser { namespace cuda { -// TODO(kir): revisit this thread_local GpuLower* active_gpu_lower = nullptr; // NOLINT namespace { -// Going to generate a map of tensor view root domain extents to reduce the -// number used during lowering. For example if we have: -// -// T2[i0, i1] = T1[i0, i1] + T2[i2, i3] -// -// We know it would be safe to use: -// -// T2[i0, i1] = T1[i0, i1] + T2[i0, i1] -// -// And that way we don't generate T2.size[0] and T2.size[1], instead we will -// reuse T1.size[0] and T1.size[1] -// This is important when doing CSE as T2 and T1 would otherwise look like -// they're using different values, even though we know they're the same -// -// There's some duplicate logic here that's in computeAt map, but it's not so -// concice there to pull out. May want to consider making this mapping its own -// class especially as it may be useful during scheduling. -std::unordered_map getSimplificationMap(Fusion* fusion) { - std::list> disjoint_root_sets; - std::unordered_map*> - id_to_disjoint_root_set; - - auto map_root_ids = [&disjoint_root_sets, &id_to_disjoint_root_set]( - IterDomain* id0, IterDomain* id1) { - if (id0->isBroadcast() || id1->isBroadcast()) { - return; - } - - auto disjoint_set_0_it = id_to_disjoint_root_set.find(id0); - auto disjoint_set_1_it = id_to_disjoint_root_set.find(id1); - bool set_0_found = disjoint_set_0_it != id_to_disjoint_root_set.end(); - bool set_1_found = disjoint_set_1_it != id_to_disjoint_root_set.end(); - - if (set_0_found && set_1_found) { - if (disjoint_set_0_it->second == disjoint_set_1_it->second) { - return; - } - // merge second disjoint set into first - auto* set_0 = disjoint_set_0_it->second; - auto* set_1 = disjoint_set_1_it->second; - for (auto id : *set_1) { - set_0->emplace(id); - id_to_disjoint_root_set[id] = set_0; - } - // remove second set from disjoint_root_sets - disjoint_root_sets.erase(std::find( - disjoint_root_sets.begin(), disjoint_root_sets.end(), *set_1)); - } else if (set_0_found || set_1_found) { - auto existing_set = - set_0_found ? disjoint_set_0_it->second : disjoint_set_1_it->second; - auto to_add_id = set_0_found ? 
id1 : id0; - existing_set->emplace(to_add_id); - id_to_disjoint_root_set[to_add_id] = existing_set; - // add entry into existing set - } else { - // create new set entry - disjoint_root_sets.emplace_back(std::unordered_set()); - auto* new_set = &disjoint_root_sets.back(); - new_set->emplace(id0); - new_set->emplace(id1); - id_to_disjoint_root_set[id0] = new_set; - id_to_disjoint_root_set[id1] = new_set; - } - }; - - auto fusion_vals = fusion->usedMathVals(); - for (auto producer_tv : ir_utils::filterByType(fusion_vals)) { - auto consumer_tvs = ir_utils::consumerTvsOf(producer_tv); - for (auto consumer_tv : consumer_tvs) { - auto pairwise_map = PairwiseRootDomainMap(producer_tv, consumer_tv); - auto c2p_root_map = pairwise_map.mapConsumerToProducer( - consumer_tv->domain(), producer_tv->domain()); - for (auto entry : c2p_root_map) { - auto c_id = entry.first; - auto p_id = entry.second; - map_root_ids(p_id, c_id); - } - } - } - - // Map each set to an input ID (if it exists) that has the smallest ->name() - // entry value - std::unordered_map*, IterDomain*> - set_to_input_id; - - // Loop over the root domains, of the inputs to the fusion. Pick an input ID - // to use as the representative ID of the collected sets. Only consider inputs - // as those are the ones that map to values like "T0.size[1]". They are he - // ID's that propagated their extents into the problem. We could also check - // the outputs as we do have C++ examples of using output dimensions for the - // problem size instead of inputs. However, we don't do anything where we can - // translate to those kinds of kernels integrated into PyTorch. - for (auto input_tv : ir_utils::filterByType(fusion->inputs())) { - for (auto id : - TensorDomain::noReductions(input_tv->getMaybeRFactorDomain())) { - auto id_set_it = id_to_disjoint_root_set.find(id); - if (id_set_it == id_to_disjoint_root_set.end()) { - continue; - } - auto* id_set = id_set_it->second; - if (set_to_input_id.find(id_set) == set_to_input_id.end()) { - set_to_input_id[id_set] = id; - } else { - auto input_id_of_set = set_to_input_id.at(id_set); - // Swap id's if new name is less than previously set - bool swap_ids = id->name() < input_id_of_set->name(); - // If new id is a const scalar but previously was'nt use the const - // scalar - swap_ids = swap_ids || - (id->extent()->isConstScalar() && - !input_id_of_set->extent()->isConstScalar()); - // If previous scalar was const and new isn't, don't swap - swap_ids = swap_ids && - !(input_id_of_set->extent()->isConstScalar() && - !id->extent()->isConstScalar()); - - if (swap_ids) { - set_to_input_id[id_set] = id; - } - } - } - } - - // Finally make map from ID extents to the representitive ID extent. - std::unordered_map extent_to_min_input_id_extent; - for (auto entry : set_to_input_id) { - auto* set = entry.first; - auto input_id = entry.second; - for (auto id : *set) { - extent_to_min_input_id_extent[id->extent()] = input_id->extent(); - } - } - return extent_to_min_input_id_extent; -} - -class KIRCleaner : public kir::MutableIrVisitor { +class KIRCleaner : public OptOutDispatch { public: //! 
Remove nop IR nodes - static std::vector cleanUp( - const std::vector& loop_nests) { + static std::vector cleanUp(const std::vector& loop_nests) { KIRCleaner cleaner; - std::vector out_loop_nests; + std::vector out_loop_nests; for (auto loop_nest : loop_nests) { cleaner.handle(loop_nest); // No need to keep the loop nest if it's determined to be nop @@ -190,16 +54,17 @@ class KIRCleaner : public kir::MutableIrVisitor { } private: - void handle(kir::Expr* expr) { + using OptOutDispatch::handle; + void handle(Expr* expr) final { if (expr->isA() || expr->isA()) { - expr->accept(this); + OptOutDispatch::handle(expr); } else { // Any non-scoping expr is not considered nop is_nop_ = false; } } - void visit(kir::ForLoop* fl) final { + void handle(kir::ForLoop* fl) final { auto exprs = fl->body().exprs(); fl->body().clear(); for (auto expr : exprs) { @@ -213,7 +78,7 @@ class KIRCleaner : public kir::MutableIrVisitor { is_nop_ = fl->body().empty(); } - void visit(kir::IfThenElse* ite) final { + void handle(kir::IfThenElse* ite) final { const auto conditional = ite->predicate()->value(); // Visit the then block @@ -248,9 +113,8 @@ class KIRCleaner : public kir::MutableIrVisitor { // conditional and move the exprs in the else block to the then // block. if (then_nop && !else_nop) { - kir::SimplifyingIrBuilder ir_builder(GpuLower::current()->kernel()); - kir::Bool* pred = ite->predicate()->value(); - kir::Bool* not_pred = ir_builder.notExpr(pred)->as(); + Bool* pred = ite->predicate()->value(); + Bool* not_pred = SimplifyingIrBuilder::notExpr(pred)->as(); ite->predicate()->setValue(not_pred); for (auto expr : ite->elseBody().exprs()) { ite->thenBody().push_back(expr); @@ -269,84 +133,6 @@ class KIRCleaner : public kir::MutableIrVisitor { } // namespace -void GpuLower::replaceSymbolicSizes() { - FUSER_PERF_SCOPE("GpuLower::Lower::replaceSymbolicSizes"); - - kir::IrBuilder ir_builder(kernel()); - - // Grab inputs and outputs - std::vector inputs_and_outputs; - for (auto val : fusion_->inputs()) { - if (ir_utils::isTV(val)) { - inputs_and_outputs.push_back(val->as()); - } - } - // Symbolic size is necessary for outputs if there are no inputs. - // Otherwise infer output sizes from the inputs via expression evaluation. - if (fusion_->inputs().empty()) { - for (auto val : fusion_->outputs()) { - if (ir_utils::isTV(val)) { - inputs_and_outputs.push_back(val->as()); - } - } - } - - // Generate map for all tensorview root domain values to map them to symbolic - // values. i.e. T0->getRootDomain()[0] would map to a named scalar - // "T0.size[0]". This map will be used when lowering fusion ir to kernel ir. - for (TensorView* tv : inputs_and_outputs) { - // Replace the domain with one based on Ti.size[j] - const std::vector& root_td = tv->getRootDomain(); - - size_t dim = 0; - for (auto id : root_td) { - const Val* orig_size = id->extent(); - - // Output sizes could have reduction axes, which isn't what gets output. 
- // NOLINTNEXTLINE(bugprone-branch-clone) - if (id->isReduction() || - (id->getIterType() == IterType::BroadcastWithoutStride)) { - continue; - } else if ( - id->isRFactorProduct() || - // NOLINTNEXTLINE(bugprone-branch-clone) - (id->getIterType() == IterType::BroadcastWithStride) || - orig_size->isConstScalar()) { - dim++; - continue; - } - - // TODO(kir): consider a different implementation which doesn't - // hijack the kir_val_map_ - // Currently turn off this part for inputs of segmented fusion, - // since FusionKernelRuntime will provide these as integer inputs - if (kir_val_map_.find(orig_size) == kir_val_map_.end() && - !orig_size->isFusionInput() && !orig_size->isConstScalar()) { - std::stringstream ss; - ss << "T" << tv->name() << ".size[" << dim++ << "]"; - kir_val_map_[orig_size] = ir_builder.create( - ss.str(), orig_size->getDataType().value()); - } else { - dim++; - } - } - } - - // Use a minimal number of sizes from provided tensors. - auto extent_simplification_map = getSimplificationMap(fusion_); - for (auto extent_entry : extent_simplification_map) { - auto orig_extent = extent_entry.first; - auto simplified_extent = extent_entry.second; - if (kir_val_map_.count(orig_extent)) { - if (kir_val_map_.count(simplified_extent)) { - kir_val_map_[orig_extent] = kir_val_map_[simplified_extent]; - } else { - kir_val_map_[orig_extent] = lowerValue(simplified_extent); - } - } - } -} - void GpuLower::collectPaddedParallelDims() { ExpressionEvaluator ee(fusion_); bool can_be_single_warp = true; @@ -357,8 +143,11 @@ void GpuLower::collectPaddedParallelDims() { for (auto tv : ir_utils::filterByType(used_vals)) { for (auto id : tv->domain()->domain()) { if (tv->definition()) { + // TODO: Support GroupedReductionOp if (auto reduction = dynamic_cast(tv->definition())) { - if (ir_utils::getMaybeWarpReductionDim(reduction).has_value()) { + if (ir_utils::getMaybeWarpReductionDim( + reduction->out(), reduction->in()) + .has_value()) { warp_pad_info_.has_warp_reduction = true; } } @@ -398,14 +187,12 @@ void GpuLower::collectPaddedParallelDims() { } } -void GpuLower::lower() { +void GpuLower::lower(Fusion* fusion, DataType index_type) { FUSER_PERF_SCOPE("GpuLower::lower"); - - TORCH_INTERNAL_ASSERT(fusion_ != nullptr); + TORCH_INTERNAL_ASSERT(fusion != nullptr); TORCH_INTERNAL_ASSERT( active_gpu_lower == nullptr, "Nested lowering passes are not supported"); - // TODO(kir): revisit this struct LowerGuard { LowerGuard(GpuLower* gpu_lower) { active_gpu_lower = gpu_lower; @@ -414,116 +201,158 @@ void GpuLower::lower() { active_gpu_lower = nullptr; } } lower_guard(this); + // Copy fusion into a new kernel for processing + kernel_ = std::make_unique(fusion, index_type); + // Alias the fusion kernel caries around as a view of itself. + fusion_ = kernel_.get(); + + // Convert tensor views of DataType::Index type to either Int or Int32 + for (auto tv : ir_utils::allTvs(fusion_)) { + if (tv->dtype() == DataType::Index) { + tv->resolveIndexDtype(); + } + } FusionGuard fg(fusion_); - - // Start with a fresh kernel - kernel_ = std::make_unique(); - // prepare for lowering validateIr(fusion_); - replaceSymbolicSizes(); + + // Checks if any TIDx dim is marked as padded to a warp. Also checks if we can + // determine the padding is explicitly a single warp. collectPaddedParallelDims(); - trivial_reduction_info_.build(fusion_, this); - // In the future we may directly use this map, but for now it will propagate - // and validate (to some extent) the parallelization strategy. 
- // This is the first time nodes will be lowered to kir nodes. Since for now we - // propagate the parallel strategy in some instances, we need to do it before - // lowering. - ca_parallel_map_ = ComputeAtMap(ComputeAtMap::MappingMode::PARALLEL); - ca_parallel_map_.build(fusion_, current()); + // Replaces integers that are tensor sizes by named scalars as "T0.size[0]" + replaceSymbolicSizes(fusion_); + + // Traverse through reductions and termine if any iteration domains are + // trivial reductions. Add these iteration domains to trivial_reduction_info_ + // which simply holds a map of which axes are trivial and which are not. + trivial_reduction_info_.build(fusion_); + // Replaces trivial reduction expressions (all id's being reduced are trivial) + // with set unary op + trivialReductionReplacement(fusion_, trivial_reduction_info_); + + // Build what's refered to as the compute at map. This map contains the + // mappings of all iteration domains across the fusion. There are three types + // of mappings Permissive, Exact, and Loop, see compute_at_map.h/cpp for more + // information. + compute_at_map_ = std::make_unique(fusion_); - // Want to run this after parallel map is created - validateVectorize(fusion_); + if (isDebugDumpEnabled(DebugDumpOption::ComputeAtMap)) { + std::cout << compute_at_map_->toString() << std::endl; + } - // Generate mappings to generate indices - ca_index_map_ = ComputeAtMap(ComputeAtMap::MappingMode::INDEX); - ca_index_map_.build(fusion_, current()); + compute_at_map_->validateAndPropagatePType(); - // Generate mappings to generate and map to loop nests - ca_loop_map_ = ComputeAtMap(ComputeAtMap::MappingMode::LOOP); - ca_loop_map_.build(fusion_, current()); + // Used in parallel dimension map + concretized_broadcast_domains_.build(fusion_); parallelDimensionMap().build(fusion_); if (isDebugDumpEnabled(DebugDumpOption::ParallelDimensions)) { - std::cout << parallelDimensionMap().toString(); + std::cout << "Parallel dimension map:" << std::endl; + std::cout << parallel_dimension_map_.toString() << std::endl; } + // Validate mma data format and compatibility if any on the fusion. + validateMma(fusion_); + // Compute thread predicates. Depends on parallel_dimension_map_ thread_pred_map_.build(fusion_); - // Depends on thread_pred_map_ - validateParallelize(fusion_); + // Fuse cetain patterns of reductions, such as a grid reduction + // followed by a grid broadcast. Only depends on parallelization and + // thread predicate map. + fuseReductionsAndBroadcasts(fusion_); // Scan the whole fusion and build mappings about halo extensions of // all IterDomains haloInfo().build(fusion_); + // Want to run this after parallel map and halo info map are + // created. vectorized_accesses_ and vectorized_set_info_ are filled. + validateAndCollectVectorizeInfo(fusion_); + + // Depends on thread_pred_map_, validates parallelization collects which + // tensor views need WAR or RAW syncs + sync_map_.build(fusion_); + partialSplitMap().build(fusion_); validatePartialSplit(fusion_); - // Detects all exprssions that don't need predicates - predicateElimination().build(fusion_); - nonDivisibleSplitInfo().build(fusion_); - // Set the kernel inputs & outputs - for (auto input : fusion_->inputs()) { - kernel_->addInput(GpuLower::lowerValue(input)); - } + // Detects all exprssions that don't need predicates. Depends on + // nonDivisibleSplitInfo. 
+ predicateElimination().build(fusion_); - for (auto output : fusion_->outputs()) { - kernel_->addOutput(GpuLower::lowerValue(output)); - } + doubleBufferInfo().build(fusion_); // Run our passes keeping the lowered expressions and forwarding // them // Reorder expressions for loop-nest generation respecting computeAt // relationships - auto sorted_exprs = reorderExprsForComputeAt(); + const auto exprs_sorted = reorderExprsForComputeAt(); // Generate loop-nests and place each expression at its // corresponding loop - const auto lowered_exprs = LoopNestGenerator::loweredExprs(sorted_exprs); + const auto exprs_lowered = LoopNestGenerator::loweredExprs(exprs_sorted); + + // Replace trivial reductions, Transpose, Shift, Gather, and View ops with + // unary ops since they're not separately processed in lowering. + const auto exprs_unary_replaced = unarySetOpInserter(exprs_lowered); // Insert allocations - const auto alloced_exprs = insertAllocations(lowered_exprs); + const auto exprs_alloced = insertAllocations(exprs_unary_replaced); // Insert read after write smem syncs - const auto raw_sync_exprs = insertRawThreadSynchronization(alloced_exprs); + const auto exprs_raw_sync = insertRawThreadSynchronization(exprs_alloced); // Reuse memory locations - const auto reuse_mem_exprs = reuseMemoryAllocations(raw_sync_exprs); + const auto exprs_reuse_mem = reuseMemoryAllocations(exprs_raw_sync); - // Inserts predicates after this, need to be careful in later passes when - // inserting in loop nest structure as insertions could be on if then else - // instead of directly on a for loop - const auto unrolled_loops = UnrollPass::runPass(fusion_, reuse_mem_exprs); + // Insert SyncThreads at end of for-loop to avoid WAR race condition + const auto exprs_war_sync = insertWarThreadSynchronization(exprs_reuse_mem); - const auto unrolled_mv_loops = - processMisalignedVectorization(fusion_, unrolled_loops); + const auto exprs_double_buffered = DoubleBufferPass::run(exprs_war_sync); - // Insert SyncThreads at end of for-loop to avoid WAR race condition - const auto war_sync_exprs = insertWarThreadSynchronization(unrolled_mv_loops); + // This pass inserts predicates as well as branches in the code. Up until now + // the code is explicitly single shot for loop based. Need to be careful in + // later passes when doing any kind of insertions in loop nest structure as + // insertions could be on if then or else instead of directly on a for loop. + const auto exprs_unrolled_loops = + UnrollPass::runPass(fusion_, exprs_double_buffered); - const auto indexed_loops = IndexLowering::getIndexedExprs(war_sync_exprs); + const auto exprs_unrolled_mv_loops = + processMisalignedVectorization(exprs_unrolled_loops); - const auto exprs_with_fused_broadcast = fuseWarpReduce(indexed_loops); + const auto exprs_indexed_loops = + IndexLowering::getIndexedExprs(exprs_unrolled_mv_loops); - const auto conditional_loops = - generateConditionalFromPredicate(fusion_, exprs_with_fused_broadcast); + // TODO: It seems this type of optimization would be far easier to implement + // on fusion ir than kernel ir. We should likely refactor this to at least run + // before allocation insertion. 
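Each lowering step in this pipeline follows the same functional shape: it consumes the current std::vector<Expr*> and returns a new one, so the passes compose by straightforward chaining inside GpuLower::lower(). Below is a minimal sketch of how an additional pass would slot in; the pass name and body are hypothetical, and Expr is the IR expression type provided by this codebase's headers.

#include <vector>

// Hypothetical pass following the same signature as insertAllocations(),
// reuseMemoryAllocations(), insertMagicZero(), etc.
std::vector<Expr*> myExtraPass(const std::vector<Expr*>& exprs) {
  std::vector<Expr*> out;
  out.reserve(exprs.size());
  for (auto expr : exprs) {
    // Inspect or rewrite expr here; this sketch passes everything through.
    out.push_back(expr);
  }
  return out;
}

// In GpuLower::lower() it would chain like the existing passes, e.g.:
//   const auto exprs_extra = myExtraPass(exprs_register_adjusted);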
+ const auto exprs_with_fused_broadcast = fuseWarpReduce(exprs_indexed_loops); + + const auto exprs_conditional_loops = + generateConditionalFromPredicate(exprs_with_fused_broadcast); + + const auto exprs_common_index_allocated = + allocateCommonIndices(exprs_conditional_loops); // Insert fake zero updates to make sure nvrtc doesn't blow out register use // on index and predicate reuse - const auto register_adjusted = insertMagicZero(conditional_loops); + const auto exprs_register_adjusted = + insertMagicZero(exprs_common_index_allocated); - const auto cleaned_up_loops = KIRCleaner::cleanUp(register_adjusted); + const auto exprs_cleaned_up_loops = + KIRCleaner::cleanUp(exprs_register_adjusted); - // We now have the lowered expressions, finalize the kernel IR - kernel_->finalize(cleaned_up_loops); + // We now have the lowered expressions, finalize the kernel IR. This function + // will also copy over some relevant information for code generation from + // GpuLower. + kernel_->finalize(exprs_cleaned_up_loops); } kir::Kernel* GpuLower::kernel() const { @@ -531,214 +360,18 @@ kir::Kernel* GpuLower::kernel() const { return kernel_.get(); } -// Maps Fusion IR nodes to the Kernel IR counterparts -class GpuLower::KernelIrMapper : private OptInConstDispatch { - public: - explicit KernelIrMapper(GpuLower* gpu_lower) - : gpu_lower_(gpu_lower), ir_builder_(gpu_lower->kernel()) {} - - kir::Val* lowerValue(const Val* value) { - const auto it = gpu_lower_->kir_val_map_.find(value); - if (it != gpu_lower_->kir_val_map_.end()) { - return it->second; - } else { - handle(value); - const auto kir_value = gpu_lower_->kir_val_map_[value]; - TORCH_CHECK(kir_value != nullptr); - - // Lower the value definition, if any - if (value->isScalar()) { - if (auto def = value->definition()) { - const auto kir_def = lowerExpr(def); - TORCH_INTERNAL_ASSERT(kir_value->definition() == kir_def); - } - } - - return kir_value; - } - } - - kir::Expr* lowerExpr(const Expr* expr) { - const auto it = gpu_lower_->kir_expr_map_.find(expr); - if (it != gpu_lower_->kir_expr_map_.end()) { - return it->second; - } else { - handle(expr); - const auto lowered_node = gpu_lower_->kir_expr_map_[expr]; - TORCH_CHECK(lowered_node != nullptr); - return lowered_node; - } - // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) - } - - private: - void handle(const Statement* node) final { - OptInConstDispatch::handle(node); - } - - void handle(const Val* node) final { - OptInConstDispatch::handle(node); - } - - void handle(const Expr* node) final { - OptInConstDispatch::handle(node); - } - - void handle(const TensorDomain* node) final { - const auto lowered_node = ir_builder_.create(node); - TORCH_CHECK(gpu_lower_->kir_val_map_.insert({node, lowered_node}).second); - } - - void handle(const IterDomain* node) final { - const auto lowered_node = ir_builder_.create(node); - TORCH_CHECK(gpu_lower_->kir_val_map_.insert({node, lowered_node}).second); - } - - void handle(const TensorView* node) final { - const auto lowered_node = ir_builder_.create(node); - TORCH_CHECK(gpu_lower_->kir_val_map_.insert({node, lowered_node}).second); - } - - void handle(const Bool* node) final { - const auto lowered_node = ir_builder_.create(node); - TORCH_CHECK(gpu_lower_->kir_val_map_.insert({node, lowered_node}).second); - } - - void handle(const Double* node) final { - const auto lowered_node = ir_builder_.create(node); - TORCH_CHECK(gpu_lower_->kir_val_map_.insert({node, lowered_node}).second); - } - - void handle(const Int* node) final { - const auto lowered_node = 
ir_builder_.create(node); - TORCH_CHECK(gpu_lower_->kir_val_map_.insert({node, lowered_node}).second); - } - - void handle(const NamedScalar* node) final { - const auto lowered_node = ir_builder_.create( - node->name(), node->getDataType().value()); - TORCH_CHECK(gpu_lower_->kir_val_map_.insert({node, lowered_node}).second); - } - - void handle(const UnaryOp* node) final { - const auto lowered_node = ir_builder_.create( - node->getUnaryOpType(), - lowerValue(node->out()), - lowerValue(node->in())); - TORCH_CHECK(gpu_lower_->kir_expr_map_.insert({node, lowered_node}).second); - } - - void handle(const BinaryOp* node) final { - const auto lowered_node = ir_builder_.create( - node->getBinaryOpType(), - lowerValue(node->out()), - lowerValue(node->lhs()), - lowerValue(node->rhs())); - TORCH_CHECK(gpu_lower_->kir_expr_map_.insert({node, lowered_node}).second); - } - - void handle(const TernaryOp* node) final { - const auto lowered_node = ir_builder_.create( - node->getTernaryOpType(), - lowerValue(node->out()), - lowerValue(node->in1()), - lowerValue(node->in2()), - lowerValue(node->in3())); - TORCH_CHECK(gpu_lower_->kir_expr_map_.insert({node, lowered_node}).second); - } - - void handle(const ReductionOp* node) final { - auto out_tv = node->out()->as(); - // If trivial reduction operation lower to set operation. - if (std::all_of( - out_tv->domain()->domain().begin(), - out_tv->domain()->domain().end(), - [&](IterDomain* id) { - // If id is a reduction axis, is it a trivial reduction? - if (id->isReduction()) { - return gpu_lower_->trivialReductionInfo().isDerived(id); - } else { - return true; - } - })) { - const auto lowered_node = ir_builder_.create( - UnaryOpType::Set, lowerValue(node->out()), lowerValue(node->in())); - TORCH_CHECK( - gpu_lower_->kir_expr_map_.insert({node, lowered_node}).second); - return; - } - - const auto lowered_node = ir_builder_.create( - node->getReductionOpType(), - lowerValue(node->init()), - lowerValue(node->out()), - lowerValue(node->in())); - TORCH_CHECK(gpu_lower_->kir_expr_map_.insert({node, lowered_node}).second); - } - - void handle(const WelfordOp* node) final { - auto lowerOptional = [&](Val* v) { return v ? 
lowerValue(v) : nullptr; }; - const auto lowered_node = ir_builder_.create( - lowerValue(node->outVar()), - lowerValue(node->outAvg()), - lowerValue(node->outN()), - lowerValue(node->initVar()), - lowerValue(node->initAvg()), - lowerValue(node->initN()), - lowerOptional(node->inVar()), - lowerValue(node->inAvg()), - lowerValue(node->inN())); - - TORCH_CHECK(gpu_lower_->kir_expr_map_.insert({node, lowered_node}).second); - } - - void handle(const BroadcastOp* node) final { - const auto lowered_node = ir_builder_.create( - lowerValue(node->out()), lowerValue(node->in())); - TORCH_CHECK(gpu_lower_->kir_expr_map_.insert({node, lowered_node}).second); - } - - void handle(const TransposeOp* node) final { - const auto lowered_node = ir_builder_.create( - UnaryOpType::Set, lowerValue(node->out()), lowerValue(node->in())); - TORCH_CHECK(gpu_lower_->kir_expr_map_.insert({node, lowered_node}).second); - } - - void handle(const ShiftOp* node) final { - const auto lowered_node = ir_builder_.create( - UnaryOpType::Set, lowerValue(node->out()), lowerValue(node->in())); - TORCH_CHECK(gpu_lower_->kir_expr_map_.insert({node, lowered_node}).second); - } - - void handle(const GatherOp* node) final { - const auto lowered_node = ir_builder_.create( - UnaryOpType::Set, lowerValue(node->out()), lowerValue(node->in())); - TORCH_CHECK(gpu_lower_->kir_expr_map_.insert({node, lowered_node}).second); - } - - void handle(const ViewOp* node) final { - const auto lowered_node = ir_builder_.create( - UnaryOpType::Set, lowerValue(node->out()), lowerValue(node->in())); - TORCH_CHECK(gpu_lower_->kir_expr_map_.insert({node, lowered_node}).second); - } - - private: - GpuLower* gpu_lower_ = nullptr; - kir::IrBuilder ir_builder_; -}; - -kir::Val* GpuLower::lowerValue(const Val* val) { - KernelIrMapper kir_mapper(this); - return kir_mapper.lowerValue(val); +GpuLower* GpuLower::current() { + TORCH_INTERNAL_ASSERT( + active_gpu_lower != nullptr, "No active GpuLower available"); + return active_gpu_lower; } -kir::Expr* GpuLower::lowerExpr(const Expr* expr) { - KernelIrMapper kir_mapper(this); - return kir_mapper.lowerExpr(expr); +bool GpuLower::hasCurrent() { + return active_gpu_lower != nullptr; } -GpuLower* GpuLower::current() { - return active_gpu_lower; +void GpuLower::propagateExprInfo(const Expr* old_expr, const Expr* new_expr) { + pred_elimination_.propagateRemovalInfo(old_expr, new_expr); } } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/lower2device.h b/torch/csrc/jit/codegen/cuda/lower2device.h index b807bb4d480a..686b5db1ebd5 100644 --- a/torch/csrc/jit/codegen/cuda/lower2device.h +++ b/torch/csrc/jit/codegen/cuda/lower2device.h @@ -1,23 +1,33 @@ #pragma once -#include +#include #include #include #include #include #include +#include +#include +#include #include +#include #include +#include +#include +#include #include #include #include #include #include #include +#include #include #include +#include +#include namespace torch { namespace jit { @@ -29,46 +39,48 @@ namespace cuda { // container for this information that we can reuse. Would be nice to generate // such a structure and propagate it through lowering. // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) -class TORCH_CUDA_CU_API GpuLower { +class TORCH_CUDA_CU_API GpuLower : public NonCopyable { class KernelIrMapper; public: - GpuLower() = default; + GpuLower() = delete; + // GpuLower lowers the provided fusion into a kernel which can be translated + // into cuda code. 
index_type allows to compile the kernel based on int32 + // indexing instead of int64 for additional performance. // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) - explicit GpuLower(Fusion* fusion) : fusion_(fusion) { - lower(); + explicit GpuLower(Fusion* fusion, DataType index_type = DataType::Int) { + lower(fusion, index_type); } kir::Kernel* kernel() const; - //! Converts a Fusion IR value into the Kernel IR equivalent - kir::Val* lowerValue(const Val* val); + //! Returns the currently active lowering object. + //! It's an error if no lowering is in progress. + static GpuLower* current(); - //! Converts a Fusion IR expression into the Kernel IR equivalent - kir::Expr* lowerExpr(const Expr* expr); + //! Query if lowering is in progress + static bool hasCurrent(); - //! Returns the currently active lowering object - //! (or nullptr if no lowering is in progress) - static GpuLower* current(); + ConcretizedBroadcastDomains& concretizedBroadcastDomains() { + return concretized_broadcast_domains_; + } const ThreadPredicateMap& threadPredMap() const { return thread_pred_map_; } - const ComputeAtMap& caLoopMap() const { - return ca_loop_map_; - } - - const ComputeAtMap& caIndexMap() const { - return ca_index_map_; + // Returns non-const reference. Necessary to reset a predicate flag + // when a broadcast expression is fused into a reduction. + ThreadPredicateMap& threadPredMap() { + return thread_pred_map_; } - const ComputeAtMap& caParallelMap() const { - return ca_parallel_map_; + const std::unique_ptr& caMap() const { + return compute_at_map_; } - const auto& trivialReductionInfo() const { + const TrivialReductionInfo& trivialReductionInfo() const { return trivial_reduction_info_; } @@ -120,16 +132,53 @@ class TORCH_CUDA_CU_API GpuLower { return non_divisible_split_info_; } - private: - void lower(); + DoubleBufferInfo& doubleBufferInfo() { + return double_buffer_info_; + } + + CommonIndexMap& commonIndexMap() { + return common_index_map_; + } - // TensorViews are all based on symbolic sizes. When we first initialize them - // we don't know if they're inputs or outputs which would mean that they have - // runtime shapes. Intermediate tensors (those not going to global memory) do - // not have this information. Since we need to have the correct information in - // the kernel being fetched for shapes, we want to replace input and output - // tensors to reference the runtime structure containing sizes. - void replaceSymbolicSizes(); + const auto& vectorizedAccesses() const { + return vectorized_accesses_; + } + + auto& vectorizedAccesses() { + return vectorized_accesses_; + } + + const auto& vectorizedSetInfo() const { + return vectorized_set_info_; + } + + auto& vectorizedSetInfo() { + return vectorized_set_info_; + } + + FusedReductionInfo& fusedReductionInfo() { + return fused_reduction_info_; + } + + const SyncMap& syncMap() const { + return sync_map_; + } + + // This is an interface to propagate information after expression + // replacement on the kernel IR. E.g.: + // for ... + // c = a + b (expr 0) + // after any pass that does replacement: + // for ... + // c1 = a1 + b1 (expr1) + // The previous analysis that was performed on expr0 might still + // be valid on expr1 but that info would be lost after replacement. + // This function provides an interface to manually update the info + // in any pass that performs replacement. 
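To make the comment above concrete, here is a hedged sketch of the call site such a replacement pass would contain. The helper name finalizeReplacement is hypothetical; only GpuLower::current() and propagateExprInfo() come from this header.

// Hypothetical helper a replacement pass might invoke once it has built
// new_expr as a semantically equivalent rewrite of old_expr (e.g. via a
// kir::ExprMutator-based pass).
void finalizeReplacement(const Expr* old_expr, const Expr* new_expr) {
  // Keep GpuLower's cached analyses (currently the predicate-elimination
  // removal info) consistent with the rewritten kernel IR.
  GpuLower::current()->propagateExprInfo(old_expr, new_expr);
}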
+ void propagateExprInfo(const Expr* old_expr, const Expr* new_expr); + + private: + void lower(Fusion* fusion, DataType index_type); // Goes through the parallelized iterdomains of the used TVs and find // the parallel dimensions that need to be padded to a multiples of @@ -140,16 +189,15 @@ class TORCH_CUDA_CU_API GpuLower { // Lowered Kernel IR std::unique_ptr kernel_; - // Fusion IR node to Kernel IR node mapping - std::unordered_map kir_val_map_; - std::unordered_map kir_expr_map_; - // Some stateful information during lowering + // TODO: A lot of this information uses a define class then call build. It + // would be safer to wrap all of these in unique pointers and remove the build + // interface and default constructor. That way they couldn't be accessed + // without being initialized. + ConcretizedBroadcastDomains concretized_broadcast_domains_; ThreadPredicateMap thread_pred_map_; PredicateElimination pred_elimination_; - ComputeAtMap ca_loop_map_; - ComputeAtMap ca_index_map_; - ComputeAtMap ca_parallel_map_; + std::unique_ptr compute_at_map_; TrivialReductionInfo trivial_reduction_info_; HaloInfo halo_info_; LocalAllocationInfoMap local_allocation_info_map_; @@ -157,6 +205,17 @@ class TORCH_CUDA_CU_API GpuLower { ParallelDimensionMap parallel_dimension_map_; PartialSplitMap partial_split_map_; NonDivisibleSplitInfo non_divisible_split_info_; + DoubleBufferInfo double_buffer_info_; + CommonIndexMap common_index_map_; + FusedReductionInfo fused_reduction_info_; + SyncMap sync_map_; + + // Track which tensor views are inputs or outputs of a vectorized operation + // and their maximum vectorized access size + // std::unordered_map vectorized_accesses_; + std::unordered_map vectorized_accesses_; + // Info on each vectorized set op + std::vector vectorized_set_info_; Fusion* fusion_ = nullptr; }; diff --git a/torch/csrc/jit/codegen/cuda/lower_alias_memory.cpp b/torch/csrc/jit/codegen/cuda/lower_alias_memory.cpp index 80e2e58c9cf2..ac1272c929af 100644 --- a/torch/csrc/jit/codegen/cuda/lower_alias_memory.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_alias_memory.cpp @@ -1,10 +1,10 @@ #include #include +#include #include #include #include -#include #include #include @@ -22,40 +22,42 @@ namespace { //! Get string representation of Allocate size for symbolic comparison //! //! 
TODO: Some expr simplifications could also be helpful -class SymbolicSizePrinter : private kir::IrVisitor { +class SymbolicSizePrinter : private OptOutConstDispatch { public: static std::string printSize(const kir::Allocate* allocate) { SymbolicSizePrinter printer; - allocate->size()->accept(&printer); + printer.handle(allocate->size()); return printer.os_.str(); } private: - void visit(const kir::Int* node) final { + using OptOutConstDispatch::handle; + + void handle(const Int* node) final { if (auto def = node->definition()) { - def->accept(this); + OptOutConstDispatch::handle(def); } else if (node->isConst()) { os_ << *node->value(); } else { - os_ << "ki" << node->id(); + os_ << "ki" << node->name(); } } - void visit(const kir::NamedScalar* named_scalar) final { + void handle(const NamedScalar* named_scalar) final { os_ << "@" << named_scalar->name(); } - void visit(const kir::UnaryOp* unary_op) final { - os_ << unary_op->operation() << "("; - unary_op->in()->accept(this); + void handle(const UnaryOp* unary_op) final { + os_ << unary_op->getUnaryOpType() << "("; + OptOutConstDispatch::handle(unary_op); os_ << ")"; } - void visit(const kir::BinaryOp* binary_op) final { - os_ << binary_op->operation() << "("; - binary_op->lhs()->accept(this); + void handle(const BinaryOp* binary_op) final { + os_ << binary_op->getBinaryOpType() << "("; + OptOutConstDispatch::handle(binary_op->lhs()); os_ << ","; - binary_op->rhs()->accept(this); + OptOutConstDispatch::handle(binary_op->rhs()); os_ << ")"; } @@ -74,11 +76,11 @@ class BufferReuseDebugPrinter { DebugLineType line_type = DebugLineType::EXPR; }; - using DebugEntry = std::pair; + using DebugEntry = std::pair; using DebugEntryPtr = std::unique_ptr; public: - BufferReuseDebugPrinter() : ir_printer_(os_, false){}; + BufferReuseDebugPrinter() : ir_printer_(os_){}; std::string dumpDebugInfo() { os_.clear(); @@ -105,7 +107,7 @@ class BufferReuseDebugPrinter { private: friend class BufferUseDefInfo; - void pushBack(int lineno, kir::Expr* expr) { + void pushBack(int lineno, Expr* expr) { makeExprEntry(lineno, expr); } @@ -117,7 +119,7 @@ class BufferReuseDebugPrinter { makeScopeEntry(DebugLineType::END_BLOCK); } - void makeExprEntry(int lineno, kir::Expr* expr) { + void makeExprEntry(int lineno, Expr* expr) { auto debug_entry_ptr = std::make_unique(); debug_entry_ptr->first.lineno = lineno; debug_entry_ptr->second = expr; @@ -134,14 +136,14 @@ class BufferReuseDebugPrinter { debug_info_.emplace_back(std::move(debug_entry_ptr)); } - void handle(const kir::Expr* node) { + void handle(const Expr* node) { if (auto for_loop = dynamic_cast(node)) { handle(for_loop); } else if (auto ite = dynamic_cast(node)) { handle(ite); } else { indent(); - ir_printer_.printNode(node); + ir_printer_.handle(node); } if (auto alloc = dynamic_cast(node)) { printAllocInfo(alloc); @@ -151,9 +153,9 @@ class BufferReuseDebugPrinter { void handle(const kir::ForLoop* node) { indent(); os_ << "FOR "; - ir_printer_.printNode(node->index()); + ir_printer_.handle(node->index()); os_ << " in "; - ir_printer_.printNode(node->iter_domain()); + ir_printer_.handle(node->iter_domain()); os_ << ":\n"; } @@ -186,7 +188,7 @@ class BufferReuseDebugPrinter { private: std::stringstream os_; - kir::IrPrinter ir_printer_; + IrPrinter ir_printer_; int indent_level_ = 0; std::vector debug_info_; @@ -340,7 +342,7 @@ class BufferUseDefInfo { static constexpr long kRegisterSizeThreshold = 1; BufferUseDefInfo( - const std::vector& exprs, + const std::vector& exprs, BufferReuseDebugPrinter* debug_printer 
= nullptr) : debug_printer_(debug_printer) { if (debug_printer) { @@ -410,7 +412,7 @@ class BufferUseDefInfo { } private: - void handle(kir::Expr* expr) { + void handle(Expr* expr) { current_pos_++; if (debug_printer_) { debug_printer_->pushBack(current_pos_, expr); @@ -426,7 +428,7 @@ class BufferUseDefInfo { } } - void handleScope(const std::vector& exprs) { + void handleScope(const std::vector& exprs) { if (debug_printer_) { debug_printer_->pushScope(); } @@ -460,15 +462,15 @@ class BufferUseDefInfo { return; } - auto kir_tv = dynamic_cast(alloc->buffer()); - if (!kir_tv) { + auto tv = dynamic_cast(alloc->buffer()); + if (!tv) { return; } // Collect the allocate info data // Collect memory type, skip global buffers - auto mem_type = kir_tv->memoryType(); + auto mem_type = tv->getMemoryType(); if (mem_type != MemoryType::Local && mem_type != MemoryType::Shared) { return; } @@ -487,12 +489,12 @@ class BufferUseDefInfo { } } - auto data_type = kir_tv->dtype(); + auto data_type = tv->dtype(); auto size_print = SymbolicSizePrinter::printSize(alloc); // Make sure we don't have conflicting information on record TORCH_INTERNAL_ASSERT(!map_allocate_to_info_.count(alloc)); - TORCH_INTERNAL_ASSERT(!map_tv_to_allocations_.count(kir_tv->name())); + TORCH_INTERNAL_ASSERT(!map_tv_to_allocations_.count(tv->name())); // make AllocationUseDefInfo: auto alloc_info = makeUseDefInfo(); @@ -505,10 +507,10 @@ class BufferUseDefInfo { // record short cuts map_allocate_to_info_[alloc] = alloc_info; - map_tv_to_allocations_[kir_tv->name()] = alloc_info; + map_tv_to_allocations_[tv->name()] = alloc_info; } - void collectScopeUseDefInfo(const std::vector& exprs) { + void collectScopeUseDefInfo(const std::vector& exprs) { // Reset position pointer resetExprCounter(); TORCH_INTERNAL_ASSERT(global_scope_info_ != nullptr); @@ -516,14 +518,14 @@ class BufferUseDefInfo { handleScope(exprs); } - void collectScopeInfo(const std::vector& exprs) { + void collectScopeInfo(const std::vector& exprs) { // Reset position pointer resetExprCounter(); collectScopeInfoWithinLoop(exprs, nullptr); } void collectScopeInfoWithinLoop( - const std::vector& exprs, + const std::vector& exprs, kir::ForLoop* current_loop) { auto loop_info = makeScopeInfo(current_loop); for (auto expr : exprs) { @@ -584,22 +586,20 @@ class BufferUseDefInfo { // Iterate over the inputs and outputs of exprs and update // the liveness info of local buffers if applicaable. - void collectLivenessInfo(const kir::Expr* expr) { - if (!ir_utils::isTVOp(expr)) { + void collectLivenessInfo(const Expr* expr) { + if (!ir_utils::isTvOp(expr)) { return; } - auto out_tv = expr->outputs()[0]->as(); - auto fuser_out_tv = out_tv->fuserTv(); + auto out_tv = expr->outputs()[0]->as(); // Collect all tv's that resolves broadcast in this // expr. The current analysis isn't enough to capture // their liveness range. 
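A short aside on why the line-number based inner liveness is not enough once a broadcast is resolved serially: the broadcast producer is re-read on every iteration of the serial loop, so its buffer must stay live for the whole loop rather than only up to its last textual use. A standalone C++ analogue follows (illustration only, with made-up names; it is neither the pass nor kernel IR).

#include <array>

constexpr int kN = 8;

float serialBroadcastResolution(const std::array<float, kN>& t2) {
  float t1 = 1.0f;               // broadcast-like value, written once up front
  float acc = 0.0f;
  for (int i = 0; i < kN; ++i) { // serial loop resolving the broadcast:
    acc += t1 + t2[i];           // t1 is re-read on every iteration, so its
  }                              // storage must remain live across the loop
  return acc;
}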
- for (auto input_tv : - ir_utils::filterByType(expr->inputs())) { + for (auto input_tv : ir_utils::filterByType(expr->inputs())) { auto maybe_alloc_info = getMaybeAllocInfoFromTV(input_tv); if (maybe_alloc_info.has_value()) { - if (isSerialBroadcastResolution(input_tv->fuserTv(), fuser_out_tv)) { + if (isSerialBroadcastResolution(input_tv, out_tv)) { maybe_alloc_info.value()->inner_live_interval->markRead(current_pos_); } else { // Disable inner alias info for this buffer, since line number based @@ -621,8 +621,7 @@ class BufferUseDefInfo { } } } - for (auto output_tv : - ir_utils::filterByType(expr->outputs())) { + for (auto output_tv : ir_utils::filterByType(expr->outputs())) { auto maybe_alloc_info = getMaybeAllocInfoFromTV(output_tv); if (maybe_alloc_info.has_value()) { maybe_alloc_info.value()->inner_live_interval->markWrite(current_pos_); @@ -675,8 +674,7 @@ class BufferUseDefInfo { return nullptr; } - c10::optional getMaybeAllocInfoFromTV( - kir::TensorView* tv) { + c10::optional getMaybeAllocInfoFromTV(TensorView* tv) { auto alloc_it = map_tv_to_allocations_.find(tv->name()); if (alloc_it == map_tv_to_allocations_.end()) { return c10::nullopt; @@ -810,11 +808,11 @@ void BufferReuseDebugPrinter::printAllocInfo(const kir::Allocate* alloc) { //! Reuse Allocation nodes via pointer aliasing class AllocateReuseModifier { public: - static void modify(const std::vector& exprs) { + static void modify(const std::vector& exprs) { AllocateReuseModifier modifier(exprs); } - static void debugPrint(const std::vector& exprs) { + static void debugPrint(const std::vector& exprs) { BufferReuseDebugPrinter debug_printer; AllocateReuseModifier modifier(exprs, &debug_printer); std::cout << debug_printer.dumpDebugInfo(); @@ -822,7 +820,7 @@ class AllocateReuseModifier { private: AllocateReuseModifier( - const std::vector& exprs, + const std::vector& exprs, BufferReuseDebugPrinter* debug_printer_ = nullptr) : buffer_info_(exprs, debug_printer_) { // Perform in-place sharing first and then outer liveness @@ -922,6 +920,31 @@ class AllocateReuseModifier { continue; } + if (alloc_info->alloc_expr->buffer()->isA()) { + if (!alloc_info->alloc_expr->buffer()->isA()) { + continue; + } + auto this_tv = alloc_info->alloc_expr->buffer()->as(); + auto reuse_tv = alloc_info->alloc_expr->buffer()->as(); + // Check that either both tv's are vectorized acceses, or neither are. 
+ // Vectorized allocations require correct alignment so they can only + // alias with other allocations with the right alignment + const auto& va = GpuLower::current()->vectorizedAccesses(); + if ((va.find(this_tv) == va.end()) != + (va.find(reuse_tv) == va.end())) { + return false; + } + + // Shared memory is all aligned to 128 bits, local memory might not be + if (this_tv->getMemoryType() == MemoryType::Local && + va.find(this_tv) != va.end()) { + // Make sure alignment matches + if (va.at(this_tv) != va.at(reuse_tv)) { + return false; + } + } + } + // TODO: // Outer interval based sharing supports arbitrary re-indexing into // the same buffer and would require additional syncs if fully @@ -941,7 +964,7 @@ class AllocateReuseModifier { return false; } - void handle(kir::Expr* expr) { + void handle(Expr* expr) { if (auto ite = dynamic_cast(expr)) { handle(ite); } else if (auto for_loop = dynamic_cast(expr)) { @@ -961,7 +984,7 @@ class AllocateReuseModifier { "lower_alias_memory: IfThenElse before unrolling is not yet supported"); } - void handleScope(const std::vector& exprs) { + void handleScope(const std::vector& exprs) { current_visible_buffer_stack_.emplace_back( std::make_unique()); for (auto expr : exprs) { @@ -990,10 +1013,8 @@ class AllocateReuseModifier { } // Assume inputs are TV allocations, which should have been checked // before reaching this point. - auto this_tv = - alloc_info->alloc_expr->buffer()->as()->fuserTv(); - auto reuse_tv = - to_reuse->alloc_expr->buffer()->as()->fuserTv(); + auto this_tv = alloc_info->alloc_expr->buffer()->as(); + auto reuse_tv = to_reuse->alloc_expr->buffer()->as(); // Check the values in between the two buffers. auto vals_between_this_and_reuse = @@ -1055,7 +1076,7 @@ class AllocateReuseModifier { if (!tv_def) { continue; } - if (!isPointwiseTvOp(tv_def) && !isReductionTvOp(tv_def)) { + if (!isPointwiseTvOp(tv_def) && !ir_utils::isReductionTvOp(tv_def)) { if (isBroadcastTvOp(tv_def)) { info.has_broadcast_between = true; } else { @@ -1068,8 +1089,8 @@ class AllocateReuseModifier { } bool allocationDomainsIndexMapped( - std::vector& alloc_domains, - std::vector& reuse_domains) { + std::vector& alloc_domains, + std::vector& reuse_domains) { // Require that the allocated domains are exactly mapped. if (alloc_domains.size() != reuse_domains.size()) { return false; @@ -1077,8 +1098,10 @@ class AllocateReuseModifier { // Check index map for the corresponding axes. for (const auto id_it : c10::irange(alloc_domains.size())) { - if (!GpuLower::current()->caIndexMap().areMapped( - alloc_domains[id_it], reuse_domains[id_it])) { + if (!GpuLower::current()->caMap()->areMapped( + alloc_domains[id_it], + reuse_domains[id_it], + IdMappingMode::EXACT)) { return false; } } @@ -1099,24 +1122,16 @@ class AllocateReuseModifier { // Do we have a true pointwise op? // (ie. 
a TV op, excluding direct assignments and reductions) bool isPointwiseTvOp(const Expr* expr) { - if (ir_utils::isTVOp(expr)) { + if (ir_utils::isTvOp(expr)) { return expr->isA() || expr->isA() || expr->isA(); } return false; } - // Utility to capture reduction ops - bool isReductionTvOp(const Expr* expr) { - if (!ir_utils::isTVOp(expr)) { - return false; - } - return expr->isA() || expr->isA(); - } - // Utility to capture reduction ops bool isBroadcastTvOp(const Expr* expr) { - if (!ir_utils::isTVOp(expr)) { + if (!ir_utils::isTvOp(expr)) { return false; } return expr->isA(); @@ -1138,8 +1153,7 @@ class AllocateReuseModifier { } // namespace -std::vector reuseMemoryAllocations( - const std::vector& exprs) { +std::vector reuseMemoryAllocations(const std::vector& exprs) { FUSER_PERF_SCOPE("reuseMemoryAllocations"); bool debug_print = isDebugDumpEnabled(DebugDumpOption::BufferReuseInfo); if (debug_print) { diff --git a/torch/csrc/jit/codegen/cuda/lower_alias_memory.h b/torch/csrc/jit/codegen/cuda/lower_alias_memory.h index 26b33b6d5dc7..0d144b9f2f40 100644 --- a/torch/csrc/jit/codegen/cuda/lower_alias_memory.h +++ b/torch/csrc/jit/codegen/cuda/lower_alias_memory.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -28,8 +28,7 @@ namespace cuda { //! is not used after this op: //! then alias output Allocate to input Allocate. //! -std::vector reuseMemoryAllocations( - const std::vector& exprs); +std::vector reuseMemoryAllocations(const std::vector& exprs); } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/lower_allocation.cpp b/torch/csrc/jit/codegen/cuda/lower_allocation.cpp index 2f70c2758328..85d09e4ca080 100644 --- a/torch/csrc/jit/codegen/cuda/lower_allocation.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_allocation.cpp @@ -1,10 +1,8 @@ -#include #include #include #include #include -#include -#include +#include #include #include @@ -17,8 +15,12 @@ namespace cuda { namespace { -class AllocationInserter : public kir::MutableIrVisitor { +class AllocationInserter : public kir::ExprMutator { private: + using kir::ExprMutator::handle; + + // Expanded version of BasicAllocInfo in lower_utils.h helps to track + // additional information struct AllocationInformation { // The for loop that the initialization of this allocation must be // placed in, nullptr if not within a loop @@ -26,7 +28,7 @@ class AllocationInserter : public kir::MutableIrVisitor { // The expression that the initialization of this allocation must // be placed before - kir::Expr* init_place_before = nullptr; + Expr* init_place_before = nullptr; // Keep track of the actual allocation loop. This can be different // from init_for_loop only with unswitched shared memory allocations, @@ -37,143 +39,93 @@ class AllocationInserter : public kir::MutableIrVisitor { // The expression that this allocation must be placed // before. Similar to alloc_for_loop, this is different from // init_place_before only with unswitched shared memory allocations. 
- kir::Expr* alloc_place_before = nullptr; + Expr* alloc_place_before = nullptr; // The allocation position relative to buffer size_t alloc_pos = 0; // The buffer this allocation is for - kir::TensorView* buffer = nullptr; - - // The allocation expression - kir::Allocate* alloc_expr = nullptr; - - // Initialization - kir::Expr* init_expr = nullptr; + TensorView* buffer = nullptr; // Info to transfer to GPU lower bool has_halo = false; // Local Iterdomains that this allocation covers - std::unique_ptr> allocation_domains; + std::unique_ptr> allocation_domains; }; // Find allocation point - void findAllocationPosition(AllocationInformation& info, kir::Expr* expr) { - size_t alloc_pos = 0; - kir::ForLoop* init_for_loop = nullptr; - auto fuser_tv = info.buffer->fuserTv(); - size_t fl_idx_next = 0; - - bool outer_alloc_found = false; - kir::ForLoop* alloc_for_loop = nullptr; - size_t alloc_fl_idx_next = 0; - - for (auto fl : for_loops) { - if (alloc_pos == fuser_tv->getComputeAtPosition()) { - break; - } - - if (fuser_tv->axis(alloc_pos)->isReduction()) { - const auto outputs = - FusionGuard::getCurFusion()->getTerminatingOutputs(); - TORCH_INTERNAL_ASSERT( - std::find(outputs.begin(), outputs.end(), fuser_tv) != - outputs.end(), - "Invalid computeAt of T", - fuser_tv->name(), - ". A reducation axis is detected within computeAt axes even though it is not an output tensor."); - break; - } - - auto fl_id = fl->iter_domain(); - - if (fl_id->parallelType() == ParallelType::Unroll) { - break; - } - - // Shared memory must be allocated outside of unswitched - // domains. See issue #1133. - if (fl_id->parallelType() == ParallelType::Unswitch && - fuser_tv->getMemoryType() == MemoryType::Shared) { - outer_alloc_found = true; - } - - auto local_id = gpu_lower->lowerValue(fuser_tv->axis(alloc_pos)) - ->as(); - - if (gpu_lower->caLoopMap().areMapped(local_id, fl_id)) { - alloc_pos++; - } - - init_for_loop = fl; - ++fl_idx_next; - - if (!outer_alloc_found) { - alloc_for_loop = fl; - ++alloc_fl_idx_next; + // Fills info.buffer, info.alloc_pos, info.init_for_loop, + // info.init_place_before, info.alloc_for_loop, info.alloc_place_before + void fillAllocationInformation(AllocationInformation& info, Expr* expr) { + auto loop_alloc_info = + loop_utils::getAllocInformation(info.buffer, for_loops_); + + info.init_for_loop = loop_alloc_info.init_for_loop; + info.alloc_for_loop = loop_alloc_info.alloc_for_loop; + info.alloc_pos = loop_alloc_info.alloc_pos; + + auto next_fl = [](kir::ForLoop* fl, const std::vector fls) { + for (auto i : c10::irange(fls.size())) { + if (fl == fls[i]) { + if (i + 1 < fls.size()) { + return fls[i + 1]; + } + } } - } - - info.alloc_pos = alloc_pos; - info.init_for_loop = init_for_loop; + TORCH_INTERNAL_ASSERT(false, "Could not find desired loop."); + }; if (info.init_for_loop == nullptr) { - info.init_place_before = for_loops.size() > 0 ? for_loops[0] : expr; + info.init_place_before = for_loops_.size() > 0 ? 
for_loops_[0] : expr; } else { - if (info.init_for_loop == for_loops.back()) { + if (info.init_for_loop == for_loops_.back()) { // Inline allocation, place before expr info.init_place_before = expr; } else { // Place allocation after the last computeAt axis // TODO: may be more efficient to place before the first non-computeAt // axis - info.init_place_before = for_loops.at(fl_idx_next); + info.init_place_before = next_fl(info.init_for_loop, for_loops_); } } // Set the allocation loop and the place_before expression in the // same way as the initialization loop and place_before expression - if (!outer_alloc_found) { + if (info.alloc_for_loop == info.init_for_loop) { info.alloc_for_loop = info.init_for_loop; info.alloc_place_before = info.init_place_before; } else { - info.alloc_for_loop = alloc_for_loop; if (info.alloc_for_loop == nullptr) { - info.alloc_place_before = for_loops.size() > 0 ? for_loops[0] : expr; + info.alloc_place_before = for_loops_.size() > 0 ? for_loops_[0] : expr; } else { // Since there must be an inner unswitched domain, // alloc_for_loop should never be the inner-most loop. - TORCH_INTERNAL_ASSERT(info.alloc_for_loop != for_loops.back()); - info.alloc_place_before = for_loops.at(alloc_fl_idx_next); + TORCH_INTERNAL_ASSERT(info.alloc_for_loop != for_loops_.back()); + info.alloc_place_before = next_fl(info.alloc_for_loop, for_loops_); } } } // Create initialization expression if init_val is non-null. - void createInitExpr(AllocationInformation& info, kir::Val* init_val) { + Expr* createInitExpr(AllocationInformation& info, Val* init_val) { if (init_val == nullptr) { - info.init_expr = nullptr; - return; + return nullptr; } - auto fuser_tv = info.buffer->fuserTv(); - - std::vector init_dims; - for (const auto axis_i : c10::irange(info.alloc_pos, fuser_tv->nDims())) { - if (info.buffer->fuserTv()->axis(axis_i)->isReduction() || - info.buffer->fuserTv()->axis(axis_i)->isBroadcast()) { + std::vector init_dims; + for (const auto axis_i : + c10::irange(info.alloc_pos, info.buffer->nDims())) { + if (info.buffer->axis(axis_i)->isReduction() || + info.buffer->axis(axis_i)->isBroadcast()) { continue; } - auto concrete_id = - gpu_lower - ->lowerValue(gpu_lower->caParallelMap().getConcreteMappedID( - fuser_tv->axis(axis_i))) - ->as(); + auto concrete_id = gpu_lower->caMap()->getConcreteMappedID( + info.buffer->axis(axis_i), IdMappingMode::LOOP); init_dims.push_back(concrete_id); } - kir::Expr* init_expr = ir_builder.create( - UnaryOpType::Set, info.buffer, init_val); + Expr* init_expr = + IrBuilder::create(UnaryOpType::Set, info.buffer, init_val); for (auto init_loop_it = init_dims.rbegin(); init_loop_it != init_dims.rend(); ++init_loop_it) { @@ -181,9 +133,9 @@ class AllocationInserter : public kir::MutableIrVisitor { kir::ForLoop* new_loop = nullptr; auto extent_with_halo = gpu_lower->haloInfo().getExtent(id); if (extent_with_halo) { - new_loop = ir_builder.create( + new_loop = IrBuilder::create( id, - ir_builder.create(c10::nullopt), + IrBuilder::create(c10::nullopt), nullptr, extent_with_halo, nullptr, @@ -191,31 +143,33 @@ class AllocationInserter : public kir::MutableIrVisitor { nullptr, false); } else { - new_loop = ir_builder.create(id); + new_loop = IrBuilder::create(id); } new_loop->body().push_back(init_expr); init_expr = new_loop; } - info.init_expr = init_expr; + return init_expr; } - std::vector getGlobalAllocationSizes(AllocationInformation& info) { + std::vector getGlobalAllocationSizes(AllocationInformation& info) { const auto& domain = info.buffer->domain(); - 
const auto& maybe_rfactor_domain = - domain->hasRFactor() ? domain->rfactorDomain() : domain->rootDomain(); + const auto& maybe_rfactor_domain = domain->hasRFactor() + ? domain->getRFactorDomain() + : domain->getRootDomain(); - std::vector alloc_dims; + std::vector alloc_dims; for (const auto id : maybe_rfactor_domain) { if (id->isReduction() || id->isStride() || - id->iterType() == IterType::BroadcastWithoutStride) { + id->getIterType() == IterType::BroadcastWithoutStride) { continue; } auto extent = id->extent(); // Use halo-extended extent if found auto halo_extent = gpu_lower->haloInfo().getRootAxisInfo(id); if (halo_extent.hasHalo()) { - extent = ir_builder.addExpr(extent, halo_extent.width()); + extent = IrBuilder::addExpr( + extent, IrBuilder::create(halo_extent.width())); } alloc_dims.push_back(extent); } @@ -244,7 +198,7 @@ class AllocationInserter : public kir::MutableIrVisitor { // fall back to the leaf-based allocation. // // See the FusionShiftDoubleSplit test for an example case. - std::vector getNonGlobalAllocExprWithHalo( + std::vector getNonGlobalAllocExprWithHalo( TensorView* tv, const std::vector& alloc_domains) { std::vector start_vals; @@ -255,18 +209,18 @@ class AllocationInserter : public kir::MutableIrVisitor { [](IterDomain* dom) { return dom->as(); }); // Get all exprs involved in generating the allocation IDs - auto exprs = ExprSort::getExprs(tv->fusion(), start_vals); + auto exprs = StmtSort::getExprs(tv->fusion(), start_vals); // Get the halo extent if found auto getExtent = [this](IterDomain* id) { auto extent = gpu_lower->haloInfo().getExtent(id); if (extent == nullptr) { - extent = gpu_lower->lowerValue(id->extent()); + extent = id->extent(); } return extent; }; - std::unordered_map known_extents; + std::unordered_map known_extents; // IterDomains that are allocated fully. 
For example, if an ID is // split and only one of them is used for allocation, that's not @@ -314,7 +268,7 @@ class AllocationInserter : public kir::MutableIrVisitor { } else { known_extents.insert( {split->in(), - ir_builder.mulExpr(outer_it->second, inner_it->second)}); + IrBuilder::mulExpr(outer_it->second, inner_it->second)}); } known_extents.erase(inner_it); known_extents.erase(outer_it); @@ -330,7 +284,7 @@ class AllocationInserter : public kir::MutableIrVisitor { } } - std::vector alloc_dims; + std::vector alloc_dims; for (auto root_axis : tv->getRootDomain()) { auto it = known_extents.find(root_axis); @@ -355,24 +309,22 @@ class AllocationInserter : public kir::MutableIrVisitor { return alloc_dims; } - std::vector getNonGlobalAllocExpr(AllocationInformation& info) { - auto fuser_tv = info.buffer->fuserTv(); - const auto memory_type = info.buffer->memoryType(); + std::vector getNonGlobalAllocExpr(AllocationInformation& info) { + const auto memory_type = info.buffer->getMemoryType(); TORCH_INTERNAL_ASSERT( memory_type != MemoryType::Global, "Invalid memory type: ", memory_type); - std::vector alloc_dims; + std::vector alloc_dims; bool has_halo = false; std::vector alloc_domains; - info.allocation_domains = std::make_unique>(); + info.allocation_domains = std::make_unique>(); - for (const auto axis_i : c10::irange(fuser_tv->nDims())) { - const auto local_id = - gpu_lower->lowerValue(fuser_tv->axis(axis_i))->as(); + for (const auto axis_i : c10::irange(info.buffer->nDims())) { + const auto local_id = info.buffer->axis(axis_i); // Don't use reduction/stride/broadcast axis in the allocation // computation @@ -381,16 +333,14 @@ class AllocationInserter : public kir::MutableIrVisitor { continue; } - auto concrete_id = - gpu_lower - ->lowerValue(gpu_lower->caParallelMap().getConcreteMappedID( - fuser_tv->axis(axis_i))) - ->as(); + auto concrete_id = gpu_lower->caMap()->getConcreteMappedID( + info.buffer->axis(axis_i), IdMappingMode::LOOP); const bool is_block_dim = - isParallelTypeBlockDim(concrete_id->parallelType()); + isParallelTypeBlockDim(concrete_id->getParallelType()); const bool is_thread_dim = - isParallelTypeThreadDim(concrete_id->parallelType()); - const bool is_thread = isParallelTypeThread(concrete_id->parallelType()); + isParallelTypeThreadDim(concrete_id->getParallelType()); + const bool is_thread = + isParallelTypeThread(concrete_id->getParallelType()); if (axis_i < info.alloc_pos) { // Even when the axis is outside the allocation position, if the @@ -403,7 +353,7 @@ class AllocationInserter : public kir::MutableIrVisitor { (memory_type == MemoryType::Global && is_thread))) { continue; } - alloc_domains.push_back(fuser_tv->axis(axis_i)); + alloc_domains.push_back(info.buffer->axis(axis_i)); } else { if ( // If shared memory, don't use any IDs bound to a grid dimension @@ -413,12 +363,13 @@ class AllocationInserter : public kir::MutableIrVisitor { (memory_type == MemoryType::Local && is_thread)) { continue; } - alloc_domains.push_back(fuser_tv->axis(axis_i)); + alloc_domains.push_back(info.buffer->axis(axis_i)); } auto extent = concrete_id->extent(); - if (gpu_lower->haloInfo().getExtent(fuser_tv->axis(axis_i)) != nullptr) { + if (gpu_lower->haloInfo().getExtent(info.buffer->axis(axis_i)) != + nullptr) { has_halo = true; } @@ -430,20 +381,19 @@ class AllocationInserter : public kir::MutableIrVisitor { // the halo extents from leaf IDs to root IDs if (has_halo) { info.has_halo = true; - return getNonGlobalAllocExprWithHalo(fuser_tv, alloc_domains); + return 
getNonGlobalAllocExprWithHalo(info.buffer, alloc_domains); } return alloc_dims; } - void createAllocExpr(AllocationInformation& info, bool is_output) { + kir::Allocate* createAllocExpr(AllocationInformation& info, bool is_output) { if (is_output) { - info.alloc_expr = nullptr; - return; + return nullptr; } - std::vector alloc_dims; - const MemoryType memory_type = info.buffer->memoryType(); + std::vector alloc_dims; + const MemoryType memory_type = info.buffer->getMemoryType(); if (memory_type == MemoryType::Global) { alloc_dims = getGlobalAllocationSizes(info); @@ -453,60 +403,82 @@ class AllocationInserter : public kir::MutableIrVisitor { if (alloc_dims.size() == 0 && info.buffer->domain()->noReductions().size() != 0) { - alloc_dims.push_back(ir_builder.create(1)); + alloc_dims.push_back(info.buffer->container()->oneVal()); + } + + // Double the allocation size if double-buffered. Record the + // original size for indexing. + if (info.buffer->isDoubleBuffered()) { + Val* original_alloc_size = nullptr; + for (auto alloc_dim : alloc_dims) { + if (original_alloc_size == nullptr) { + original_alloc_size = alloc_dim; + } else { + original_alloc_size = + IrBuilder::mulExpr(original_alloc_size, alloc_dim); + } + } + GpuLower::current()->doubleBufferInfo().setOriginalAllocSize( + info.buffer, original_alloc_size); + alloc_dims.push_back(IrBuilder::create(2)); } // Create the allocation node - info.alloc_expr = ir_builder.create( - info.buffer, info.buffer->memoryType(), alloc_dims); + return IrBuilder::create( + info.buffer, info.buffer->getMemoryType(), alloc_dims); } - void handle(kir::Expr* expr) { - if (!ir_utils::isTVOp(expr) || expr->isA()) { - expr->accept(this); + void handle(Expr* expr) override { + if (!ir_utils::isTvOp(expr) || expr->isA()) { + ExprMutator::handle(expr); return; } // // Found where the allocation needs to be inserted - for (auto out : expr->outputs()) { - if (!out->isA()) { + for (const auto i : c10::irange(expr->outputs().size())) { + auto out = expr->output(i); + if (!out->isA()) { continue; } - auto out_tv = out->as(); - auto default_val = - gpu_lower->predicateElimination().getInitValue(out_tv->fuserTv()); + auto out_tv = out->as(); + auto default_val = gpu_lower->predicateElimination().getInitValue(out_tv); - kir::Val* init = nullptr; - if (expr->isA() && out_tv->fuserTv()->hasReduction()) { + Val* init = nullptr; + if (expr->isA() && out_tv->hasReduction()) { TORCH_INTERNAL_ASSERT( default_val == nullptr, "Reduction should not have a default initialization value for predicate elimination."); - init = expr->as()->init(); - } else if (expr->isA()) { + init = expr->as()->init(); + } else if (expr->isA() && out_tv->hasReduction()) { + TORCH_INTERNAL_ASSERT( + default_val == nullptr, + "Reduction should not have a default initialization value for predicate elimination."); + init = expr->as()->initVal(i); + } else if (expr->isA()) { + init = expr->as()->init(); + } else if (expr->isA()) { TORCH_INTERNAL_ASSERT( default_val == nullptr, "Welford should not have a default initialization value for predicate elimination."); - const auto welford = expr->as(); - if (out->id() == welford->outVar()->id()) { - init = welford->initVar() == nullptr - ? ir_builder.create(0) - : welford->initVar(); - } else if (out->id() == welford->outAvg()->id()) { - init = welford->initAvg() == nullptr - ? ir_builder.create(0) - : welford->initAvg(); + const auto welford = expr->as(); + if (out->name() == welford->outVar()->name()) { + init = welford->initVar() == nullptr ? 
IrBuilder::create(0) + : welford->initVar(); + } else if (out->name() == welford->outAvg()->name()) { + init = welford->initAvg() == nullptr ? IrBuilder::create(0) + : welford->initAvg(); } else { TORCH_INTERNAL_ASSERT( - out->id() == welford->outN()->id(), "Unreachable"); + out->name() == welford->outN()->name(), "Unreachable"); init = welford->initN(); } } else if (default_val != nullptr) { init = default_val; } - const bool is_output = gpu_lower->kernel()->isOutput(out); + const bool is_output = out->isFusionOutput(); // Don't need to alloc outputs, and if we don't need to initialize we're // done. @@ -516,150 +488,91 @@ class AllocationInserter : public kir::MutableIrVisitor { AllocationInformation allocation; allocation.buffer = out_tv; - findAllocationPosition(allocation, expr); - createAllocExpr(allocation, is_output); - createInitExpr(allocation, init); + fillAllocationInformation(allocation, expr); + + auto alloc_expr = createAllocExpr(allocation, is_output); + auto init_expr = createInitExpr(allocation, init); // Write information to GPULower - writeInfoToGPULower(allocation); + writeInfoToGPULower(allocation, alloc_expr); + + // Register allocations before initializations to keep them in the right + // order + if (alloc_expr != nullptr) { + if (allocation.buffer->getMemoryType() == MemoryType::Shared) { + // Shared allocations go at the begining of scope + TORCH_INTERNAL_ASSERT(!exprs_.empty()); + registerInsertBefore(exprs_[0], alloc_expr, nullptr); + } else { + TORCH_INTERNAL_ASSERT(allocation.alloc_place_before != nullptr); + kir::Scope* scope = allocation.alloc_for_loop == nullptr + ? nullptr + : &allocation.alloc_for_loop->body(); + registerInsertBefore( + allocation.alloc_place_before, alloc_expr, scope); + } + } - allocs.push_back(std::move(allocation)); + if (init_expr != nullptr) { + TORCH_INTERNAL_ASSERT(allocation.init_place_before != nullptr); + kir::Scope* scope = allocation.init_for_loop == nullptr + ? nullptr + : &allocation.init_for_loop->body(); + registerInsertBefore(allocation.init_place_before, init_expr, scope); + } } } - void writeInfoToGPULower(const AllocationInformation& allocation) { + // Sends alloc_expr, info.has_halo, info.allocation_domains to GpuLower + void writeInfoToGPULower( + const AllocationInformation& allocation, + kir::Allocate* alloc_expr) { auto& lower_alloc_info_map = GpuLower::current()->localAllocationInfoMap(); - if (allocation.alloc_expr == nullptr) { + if (alloc_expr == nullptr) { // Skip output allocation. 
return; } TORCH_INTERNAL_ASSERT( - !lower_alloc_info_map.count(allocation.alloc_expr), + !lower_alloc_info_map.count(alloc_expr), "duplicated allocation info entry"); // Create info entry for GPULower auto lower_alloc_info_ptr = std::make_unique(); - lower_alloc_info_ptr->alloc_expr = allocation.alloc_expr; + lower_alloc_info_ptr->alloc_expr = alloc_expr; lower_alloc_info_ptr->has_halo = allocation.has_halo; if (allocation.allocation_domains) { lower_alloc_info_ptr->alloc_domains = *(allocation.allocation_domains); } // Write entry to the stored map - lower_alloc_info_map[allocation.alloc_expr] = - std::move(lower_alloc_info_ptr); - } - - void visit(kir::ForLoop* fl) final { - for_loops.push_back(fl); - // Modifying in place, make a copy of the vector - const std::vector exprs = fl->body().exprs(); - for (auto expr : exprs) { - handle(expr); - } - for_loops.pop_back(); + lower_alloc_info_map[alloc_expr] = std::move(lower_alloc_info_ptr); } - void visit(kir::IfThenElse*) final { + void handle(kir::IfThenElse*) final { TORCH_INTERNAL_ASSERT( false, "Pass does not support conditional statements, ", "this pass should be run before any conditionals are placed in code."); } - AllocationInserter(std::vector _loop_nests) - : loop_nests_(std::move(_loop_nests)), - gpu_lower(GpuLower::current()), - ir_builder(gpu_lower->kernel()) { - // Compute all allocations - const std::vector exprs = loop_nests_; - for (auto expr : exprs) { - handle(expr); - } - - // First, place allocations of dynamic smem tensors at the very - // beginning of the expr list. Traverse backward as they should be - // placed in topological order. - for (auto it = allocs.rbegin(); it != allocs.rend(); ++it) { - const auto& alloc = *it; - if (alloc.alloc_expr == nullptr) { - continue; - } - // Dynamic smem exprs need to be at the begining of the kernel outside for - // loops - if (alloc.buffer->memoryType() == MemoryType::Shared && - !kir::ExpressionEvaluator::isConst(alloc.alloc_expr->size())) { - loop_nests_.insert(loop_nests_.begin(), alloc.alloc_expr); - } - } - - // Place the remaining allocations. - for (const auto& alloc : allocs) { - if (alloc.alloc_expr == nullptr) { - continue; - } - if (alloc.buffer->memoryType() == MemoryType::Shared && - !kir::ExpressionEvaluator::isConst(alloc.alloc_expr->size())) { - continue; - } - if (alloc.alloc_for_loop == nullptr) { - auto place_before_it = std::find( - loop_nests_.begin(), loop_nests_.end(), alloc.alloc_place_before); - TORCH_INTERNAL_ASSERT( - place_before_it != loop_nests_.end(), - "Could not figure out where to place allocation. 
", - "Use of the buffer, ", - toString(alloc.buffer), - ", could not be found.", - toString(alloc.alloc_place_before)); - loop_nests_.insert(place_before_it, alloc.alloc_expr); - } else { - alloc.alloc_for_loop->body().insert_before( - alloc.alloc_place_before, alloc.alloc_expr); - } - } - - // Now that allocations are in place, place the initializations - for (const auto& alloc : allocs) { - if (alloc.init_expr == nullptr) { - continue; - } - if (alloc.init_for_loop == nullptr) { - auto place_before_it = std::find( - loop_nests_.begin(), loop_nests_.end(), alloc.init_place_before); - // Don't need a check here as if the allocation placement succeeded - // this will too - loop_nests_.insert(place_before_it, alloc.init_expr); - } else { - alloc.init_for_loop->body().insert_before( - alloc.init_place_before, alloc.init_expr); - } - } + AllocationInserter(const std::vector& exprs) + : gpu_lower(GpuLower::current()) { + kir::ExprMutator::traverseAndInsert(exprs); } private: - std::deque allocs; - - std::vector for_loops; - - std::vector loop_nests_; - GpuLower* gpu_lower; - kir::IrBuilder ir_builder; - public: - static std::vector insert( - const std::vector& loop_nests) { - AllocationInserter inserter(loop_nests); - return inserter.loop_nests_; + static std::vector insert(const std::vector& exprs) { + AllocationInserter inserter(exprs); + return inserter.exprs_; } }; } // namespace -std::vector insertAllocations( - const std::vector& exprs) { +std::vector insertAllocations(const std::vector& exprs) { FUSER_PERF_SCOPE("GpuLower::Lower::insertAllocations"); return AllocationInserter::insert(exprs); } diff --git a/torch/csrc/jit/codegen/cuda/lower_allocation.h b/torch/csrc/jit/codegen/cuda/lower_allocation.h index bc0344ca19f6..45ebeac03f77 100644 --- a/torch/csrc/jit/codegen/cuda/lower_allocation.h +++ b/torch/csrc/jit/codegen/cuda/lower_allocation.h @@ -1,8 +1,7 @@ #pragma once -#include +#include -#include #include #include @@ -17,7 +16,7 @@ namespace cuda { //! logic duplication struct LocalAllocationInfo { kir::Allocate* alloc_expr = nullptr; - std::vector alloc_domains; + std::vector alloc_domains; bool has_halo = false; }; @@ -25,7 +24,7 @@ using LocalAllocationInfoMap = std::unordered_map>; //! Insert buffer allocations -std::vector insertAllocations(const std::vector& exprs); +std::vector insertAllocations(const std::vector& exprs); } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/lower_double_buffer.cpp b/torch/csrc/jit/codegen/cuda/lower_double_buffer.cpp new file mode 100644 index 000000000000..b069f4cc8ebc --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_double_buffer.cpp @@ -0,0 +1,506 @@ +#include +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +unsigned int getDoubleBufferAxisPosition(const TensorView* tv) { + // Double-buffering prefetches the next subregion of the tensor by + // doubling the allocation. The subregion is defined by the axes + // at the CA position till the inner-most position. There must be + // at least one axis that is outside (left) of the CA position, + // which defines the loop where prefetching is applied. Therefore, + // the CA position must be larger than 0. 
+ + TORCH_INTERNAL_ASSERT(tv->getComputeAtPosition() > 0); + + // Unroll must not exist outside of double-buffer axis + auto first_unroll_it = std::find_if( + tv->domain()->domain().begin(), + tv->domain()->domain().end(), + [](const auto axis) { + return axis->getParallelType() == ParallelType::Unroll; + }); + + const int first_unroll_pos = + std::distance(tv->domain()->domain().begin(), first_unroll_it); + + const int unroll_or_ca_pos = + std::min((int)tv->getComputeAtPosition(), first_unroll_pos); + + TORCH_INTERNAL_ASSERT( + unroll_or_ca_pos > 0, + "Invalid tensor to double-buffer. Valid double buffer axis not found due to Unroll. ", + tv->toString()); + + int valid_pos = -1; + // Skip parallelized or broadcast axes + for (int i = unroll_or_ca_pos - 1; i >= 0; --i) { + auto pt = tv->axis(i)->getParallelType(); + if (!isParallelTypeThread(pt) && !tv->axis(i)->isBroadcast()) { + valid_pos = i; + break; + } + } + + TORCH_INTERNAL_ASSERT( + valid_pos >= 0, + "Invalid tensor to double-buffer. Valid double buffer axis not found. ", + tv->toString()); + + return valid_pos; +} + +IterDomain* getDoubleBufferAxis(const TensorView* tv) { + return tv->axis((int)getDoubleBufferAxisPosition(tv)); +} + +void validateDoubleBufferedTensor(const TensorView* tv) { + auto double_buffer_pos = getDoubleBufferAxisPosition(tv); + + // Like vectorization, only UnaryOp::Set with another TensorView is + // considered. + auto def = tv->definition(); + TORCH_INTERNAL_ASSERT( + def->isA() && + def->as()->getUnaryOpType() == UnaryOpType::Set, + "Invalid tensor to double-buffer. Only tensor defined by UnaryOp::Set is supported: ", + def->toString()); + + TORCH_INTERNAL_ASSERT( + def->as()->in()->isA(), + "Invalid tensor to double-buffer. Only tensor defined by UnaryOp::Set with TensorView is supported: ", + def->toString()); + + // Require the producer tensor to have been computed entirely for + // the double-buffering loop. Otherwise, the producer itself would + // also need to be double-buffered. + auto producer = def->as()->in()->as(); + TORCH_INTERNAL_ASSERT( + producer->getComputeAtPosition() <= double_buffer_pos, + "Invalid tensor to double-buffer. The computeAt position of the producer tensor must be moved left: ", + producer->toString()); + + // Not strictly necessary, but only gmem -> smem or local and smem -> local + // are allowed. + const auto p_mem_type = producer->getMemoryType(); + const auto c_mem_type = tv->getMemoryType(); + TORCH_INTERNAL_ASSERT( + (p_mem_type == MemoryType::Global && + (c_mem_type == MemoryType::Shared || c_mem_type == MemoryType::Local)) || + (p_mem_type == MemoryType::Shared && c_mem_type == MemoryType::Local), + "Invalid tensor to double-buffer: ", + tv->toString(), + ". Producer memory type: ", + p_mem_type, + ". 
Consumer memory type: ", + c_mem_type); + + return; +} + +namespace { + +// Initial inspection of a fusion to find and validate double buffered tensors +class DoubleBufferFusionInspector : private IterVisitor { + public: + DoubleBufferFusionInspector(Fusion* fusion, DoubleBufferInfo& db_info) + : db_info_(db_info) { + traverse(fusion); + } + + private: + using IterVisitor::handle; + + void handle(TensorView* tv) final { + if (!tv->isDoubleBuffered()) { + return; + } + + validateDoubleBufferedTensor(tv); + + auto db_axis = getDoubleBufferAxis(tv); + + db_info_.setDoubleBufferAxis(tv, db_axis); + } + + private: + DoubleBufferInfo& db_info_; +}; + +// The type of replicated double-buffer loops +enum class LoopType { Prologue, Main, Epilogue }; + +// The epilogue loop is only created when the producer of a double +// buffer tensor is on smem, in which case it would otherwise require +// an additional predicate to guard buffer overruns. When it's on +// gmem, that isn't the case, so it does not need to create an +// epilogue loop. +bool requireEpilogue(const std::vector& exprs) { + return std::any_of(exprs.begin(), exprs.end(), [](const UnaryOp* uop) { + return uop->in()->as()->getMemoryType() == MemoryType::Shared; + }); +} + +// Replicates double buffer loops for Prologue, Main, and +// Epilogue. Prologue only copies the load expressions of double +// buffered tensors, whereas Epilogue does any expression other than +// the loads. Main copies everything. +class DoubleBufferLoopCloner : public kir::IrVisitor { + public: + static kir::ForLoop* clone( + kir::ForLoop* double_buffer_loop, + const std::vector& double_buffer_load_exprs, + LoopType loop_type) { + DoubleBufferLoopCloner cloner( + double_buffer_loop, double_buffer_load_exprs, loop_type); + cloner.clone(); + return cloner.cloned_top_level_loop_; + } + + private: + DoubleBufferLoopCloner( + kir::ForLoop* double_buffer_loop, + const std::vector& double_buffer_load_exprs, + LoopType loop_type) + : double_buffer_loop_(double_buffer_loop), + double_buffer_load_exprs_(double_buffer_load_exprs), + loop_type_(loop_type) {} + + using kir::IrVisitor::handle; + + void clone() { + const auto gpu_lower = GpuLower::current(); + + // Cloning the double buffer loop as follows: + // + // Prologue: 0 to 1 + // Main: 0 to (extent-1) + // Epilogue: (extent-1) to extent + + auto index = IrBuilder::create(c10::nullopt); + auto start = double_buffer_loop_->start(); + auto stop = double_buffer_loop_->stop(); + + if (loop_type_ == LoopType::Prologue) { + TORCH_INTERNAL_ASSERT(start->isZeroInt()); + stop = gpu_lower->kernel()->oneVal(); + } else if ( + loop_type_ == LoopType::Main && + requireEpilogue(double_buffer_load_exprs_)) { + stop = IrBuilder::subExpr( + double_buffer_loop_->stop(), gpu_lower->kernel()->oneVal()); + } else if (loop_type_ == LoopType::Epilogue) { + TORCH_INTERNAL_ASSERT(requireEpilogue(double_buffer_load_exprs_)); + start = IrBuilder::subExpr( + double_buffer_loop_->stop(), gpu_lower->kernel()->oneVal()); + } + + cloned_top_level_loop_ = IrBuilder::create( + double_buffer_loop_->iter_domain(), + index, + start, + stop, + gpu_lower->kernel()->oneVal(), + false, + nullptr, + double_buffer_loop_->isUnrollRequired()); + + handle(double_buffer_loop_); + } + + void handle(kir::ForLoop* fl) final { + kir::ForLoop* cloned_loop = fl == double_buffer_loop_ + ? 
cloned_top_level_loop_ + : IrBuilder::create(fl); + + cloned_scopes_.push_back(&cloned_loop->body()); + + kir::IrVisitor::handle(fl); + + cloned_scopes_.pop_back(); + + // Add the cloned loop into the parent loop body only when the + // cloned loop contains expressions. + if (!cloned_loop->body().empty() && !cloned_scopes_.empty()) { + cloned_scopes_.back()->push_back(cloned_loop); + } + } + + void handle(kir::IfThenElse* ite) final { + TORCH_INTERNAL_ASSERT(false, "No IfThenElse should exist yet"); + } + + void handle(Expr* expr) final { + if (expr->isA() || expr->isA()) { + kir::IrVisitor::handle(expr); + return; + } + + TORCH_INTERNAL_ASSERT(!cloned_scopes_.empty()); + + if (loop_type_ == LoopType::Main) { + cloned_scopes_.back()->push_back(expr); + return; + } + + // In Prologue and Epilogue, either load expressions or anything + // else are copied. Note that there can be multiple exprs defining + // double buffered TVs (e.g., buffer initialization). + + auto out_tv = ir_utils::getTvOutput(expr); + const auto is_double_buffer_load_expr = std::any_of( + double_buffer_load_exprs_.begin(), + double_buffer_load_exprs_.end(), + [out_tv](const auto load_expr) { + auto double_buffer_tv = ir_utils::getTvOutput(load_expr); + TORCH_INTERNAL_ASSERT(double_buffer_tv != nullptr); + return out_tv == double_buffer_tv; + }); + if ((loop_type_ == LoopType::Prologue && is_double_buffer_load_expr) || + (loop_type_ == LoopType::Epilogue && !is_double_buffer_load_expr)) { + cloned_scopes_.back()->push_back(expr); + } + } + + private: + kir::ForLoop* double_buffer_loop_ = nullptr; + const std::vector& double_buffer_load_exprs_; + const LoopType loop_type_; + + kir::ForLoop* cloned_top_level_loop_ = nullptr; + std::deque cloned_scopes_; +}; + +using InsertionInfo = std::unordered_map>; + +// Traverse lowered loop-nests and find all double buffer loops and +// associated load expressions. 
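// Editorial sketch of the filtering rule implemented by
// DoubleBufferLoopCloner::handle(Expr*) above, collapsed into a single
// helper for readability. keepExprInClone is an invented name and is not
// part of this change; it reuses the LoopType enum defined earlier.
bool keepExprInClone(LoopType loop_type, bool is_double_buffer_load) {
  switch (loop_type) {
    case LoopType::Main:
      return true; // Main copies every expression
    case LoopType::Prologue:
      return is_double_buffer_load; // Prologue copies only the loads
    case LoopType::Epilogue:
      return !is_double_buffer_load; // Epilogue copies everything else
  }
  return false;
}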
+class DoubleBufferLoopNestInspector : private kir::IrVisitor { + public: + static InsertionInfo run(const std::vector& exprs) { + DoubleBufferLoopNestInspector inspector(exprs); + return inspector.insertion_info_; + } + + private: + DoubleBufferLoopNestInspector(const std::vector& exprs) { + handle(exprs); + } + + using kir::IrVisitor::handle; + + void handle(UnaryOp* uop) final { + const auto gpu_lower = GpuLower::current(); + + auto out_tv = ir_utils::getTvOutput(uop); + + if (out_tv == nullptr) { + return; + } + + // Ignore init loop + if (!out_tv->isDoubleBuffered() || !uop->in()->isA()) { + return; + } + + auto double_buffer_loop = + gpu_lower->doubleBufferInfo().getDoubleBufferLoop(out_tv, for_loops_); + + TORCH_INTERNAL_ASSERT( + double_buffer_loop != nullptr, + "No double buffer loop found for a double buffered tensor: ", + out_tv->toString()); + + validateDoubleBufferLoop(double_buffer_loop); + + insertion_info_[double_buffer_loop].push_back(uop); + } + + static void validateDoubleBufferLoop(kir::ForLoop* loop) { + TORCH_INTERNAL_ASSERT( + loop->start()->isZeroInt(), "Unsupported loop: ", loop->toString()); + TORCH_INTERNAL_ASSERT( + loop->step()->isOneInt(), "Unsupported loop: ", loop->toString()); + TORCH_INTERNAL_ASSERT( + !loop->vectorize(), + "Vectorized loop should not be the allocation loop for double-buffered tensor: ", + loop->toString()); + TORCH_INTERNAL_ASSERT( + !loop->vectorize_shift(), + "Vectorize shift loop should not be the allocation loop for double-buffered tensor: ", + loop->toString()); + } + + InsertionInfo insertion_info_; +}; + +// Apply double buffering transformations +class DoubleBufferInserter : private kir::ExprMutator { + public: + // When there exist multiple double buffer loops, apply + // transformations to inner-most loops first. A single ExprMutator + // pass can only process one loop. + static std::vector run( + const std::vector& exprs, + InsertionInfo insertion_info) { + auto inserted_exprs = exprs; + while (!insertion_info.empty()) { + DoubleBufferInserter inserter(inserted_exprs, insertion_info); + inserted_exprs = inserter.exprs_; + } + return inserted_exprs; + } + + private: + DoubleBufferInserter( + const std::vector& exprs, + InsertionInfo& insertion_info) + : insertion_info_(insertion_info) { + auto num_double_buffer_loops = insertion_info.size(); + traverseAndInsert(exprs); + TORCH_INTERNAL_ASSERT(processed_loop_ != nullptr); + TORCH_INTERNAL_ASSERT(insertion_info.size() == num_double_buffer_loops - 1); + } + + using kir::ExprMutator::handle; + + void handle(kir::ForLoop* loop) final { + kir::ExprMutator::handle(loop); + + // If another loop is already taken care of, no more loop should + // be done in the same pass + if (processed_loop_ != nullptr) { + return; + } + + auto it = insertion_info_.find(loop); + if (it == insertion_info_.end()) { + return; + } + + insert(loop, it->second); + processed_loop_ = loop; + insertion_info_.erase(loop); + } + + void insert( + kir::ForLoop* double_buffer_loop, + const std::vector& loads) { + auto prologue_loop = DoubleBufferLoopCloner::clone( + double_buffer_loop, loads, LoopType::Prologue); + registerInsertBefore(double_buffer_loop, prologue_loop); + + auto write_to_smem = + std::any_of(loads.begin(), loads.end(), [](const UnaryOp* uop) { + return uop->out()->as()->getMemoryType() == + MemoryType::Shared; + }); + + // RAW sync is not inserted for double buffered tensors. The only + // exception is the prologue load. 
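// In other words: if any of the loads writes to shared memory, a sync is
// registered immediately before the loop being replaced, so the first
// iteration of the Main loop only reads shared-memory values that the
// Prologue has completely written. Within the Main and Epilogue loops
// themselves no RAW sync is emitted for the double-buffered tensors.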
+ if (write_to_smem) { + auto sync = IrBuilder::create(); + registerInsertBefore(double_buffer_loop, sync); + } + + auto main_loop = DoubleBufferLoopCloner::clone( + double_buffer_loop, loads, LoopType::Main); + registerReplace(double_buffer_loop, main_loop); + + if (requireEpilogue(loads)) { + auto epilogue_loop = DoubleBufferLoopCloner::clone( + double_buffer_loop, loads, LoopType::Epilogue); + registerInsertAfter(double_buffer_loop, epilogue_loop); + } + } + + private: + InsertionInfo& insertion_info_; + kir::ForLoop* processed_loop_ = nullptr; +}; + +} // namespace + +void DoubleBufferInfo::build(Fusion* fusion) { + DoubleBufferFusionInspector inspector(fusion, *this); +} + +DoubleBufferInfo::TvInfo& DoubleBufferInfo::getTvInfo(const TensorView* tv) { + TORCH_INTERNAL_ASSERT( + tv->isDoubleBuffered(), "Not a double-buffered tensor: ", tv->toString()); + return map_[tv]; +} + +void DoubleBufferInfo::setDoubleBufferAxis( + const TensorView* tv, + IterDomain* axis) { + getTvInfo(tv).double_buffer_axis = axis; +} + +IterDomain* DoubleBufferInfo::getDoubleBufferAxis(const TensorView* tv) { + if (!tv->isDoubleBuffered()) { + return nullptr; + } + + return getTvInfo(tv).double_buffer_axis; +} + +kir::ForLoop* DoubleBufferInfo::getDoubleBufferLoop( + IterDomain* axis, + const std::vector& loops, + bool ignore_prologue) { + auto loop_it = std::find_if(loops.begin(), loops.end(), [&](const auto loop) { + return GpuLower::current()->caMap()->areMapped( + loop->iter_domain(), axis, IdMappingMode::EXACT) && + (!ignore_prologue || !loop->stop()->isOneInt()); + }); + + if (loop_it != loops.end()) { + return *loop_it; + } else { + return nullptr; + } +} + +kir::ForLoop* DoubleBufferInfo::getDoubleBufferLoop( + const TensorView* tv, + const std::vector& loops, + bool ignore_prologue) { + auto axis = getDoubleBufferAxis(tv); + + if (axis == nullptr) { + return nullptr; + } + + return getDoubleBufferLoop(axis, loops, ignore_prologue); +} + +void DoubleBufferInfo::setOriginalAllocSize( + const TensorView* tv, + Val* original_alloc_size) { + getTvInfo(tv).original_alloc_size = original_alloc_size; +} + +Val* DoubleBufferInfo::getOriginalAllocSize(const TensorView* tv) { + if (!tv->isDoubleBuffered()) { + return nullptr; + } + + return getTvInfo(tv).original_alloc_size; +} + +std::vector DoubleBufferPass::run(const std::vector& exprs) { + auto insertion_info = DoubleBufferLoopNestInspector::run(exprs); + return DoubleBufferInserter::run(exprs, insertion_info); +} + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_double_buffer.h b/torch/csrc/jit/codegen/cuda/lower_double_buffer.h new file mode 100644 index 000000000000..96bc247f4ff6 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_double_buffer.h @@ -0,0 +1,142 @@ +#pragma once + +#include + +#include +#include +#include + +// Double buffering a tensor doubles its allocation size and uses two +// buffers to facilitate computation and memory access +// overlapping. The basic form of code looks like as follows: +// +// Before: +// for i +// x[S]; // allocation +// for j: +// x[j] = y[i, j] +// for j: +// ... = x[j] +// +// After: +// X[S * 2]; // allocation +// for i in 0 to 1: // Prologue +// for j: +// x[j] = y[i, j] +// +// for i in 0 to N-1: // Main +// for j: +// x[j + (1 - i % 2) * S] = y[i + 1, j] +// for j: +// ... = x[j + (i % 2) * S] +// +// for i in N-1 to N: // Epilogue +// for j: +// ... = x[j + (i % 2) * S] +// +// Here, S is the original size of tensor x. 
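// As a concrete walk-through of the indexing above (illustrative sizes
// only): with S = 4 and N = 3, the Prologue loads y[0, *] into the first
// half, x[0..3]. Main iteration i = 0 prefetches y[1, *] into the second
// half, x[4..7], while reading y[0, *] from x[0..3]; iteration i = 1
// prefetches y[2, *] into x[0..3] while reading y[1, *] from x[4..7]. The
// Epilogue iteration i = 2 then only reads x[0..3], which holds y[2, *].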
+// +// The i loop is the double buffer loop of tensor x, where double +// buffering is applied to the tensor. The first step of lowering is +// to find the double buffering axis for each double buffered +// tensor. It must not be parallelized as it isn't possible to double +// buffer parallelized loops. Also, an unrolled axis expands the +// allocation and is intended to make the loop completely unrolled, +// which also conflicts with double buffering. So, basically, the double +// buffering axis is the inner-most axis within the axes left +// of the CA position. However, when it is parallelized or unrolled, a +// further left axis is picked. +// +// Once the double buffer axis is determined, the main task is to +// replicate the corresponding double buffer loop as illustrated +// above. The Prologue loop is to just fetch the first element to +// populate the buffer. The main loop is mostly the same as the +// original loop, except for the indexing change to switch the two +// buffers. When used as a consumer, an offset of (1 - i % 2) * S is +// added, whereas (i % 2) * S is added when used as a producer. Here, +// i is the index of the double buffer loop. The Epilogue loop is just +// for the last iteration of the loop. Since the main loop reads one +// element ahead of the producer of the double buffered tensor, it +// would require an additional guard to prevent buffer overruns with +// the producer if the main loop were also used for the last +// iteration. However, the value loaded by the invalid load would not +// be used, so instead of adding the additional predicate, the Epilogue +// loop is replicated from the original loop, except for the load +// expression since it's not used. Note that this overrun does not +// happen when the producer is on gmem, so in that case, this +// additional replication is not done. +// +// When creating those three types of loops, additional care must be +// taken when multiple tensors are double buffered. When multiple +// tensors use the same loop as their double buffer loop, one pass of +// replication takes care of them at once, meaning the same Prologue, +// Main, Epilogue loops are used for the multiple tensors. +// +// Other tasks to do for a double buffer tensor include: +// - Move allocation to outside of the double buffer loop +// - Double the allocation size +// - Omit the RAW sync in the Main and Epilogue loops + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +unsigned int getDoubleBufferAxisPosition(const TensorView* tv); + +IterDomain* getDoubleBufferAxis(const TensorView* tv); + +void validateDoubleBufferedTensor(const TensorView* tv); + +class TORCH_CUDA_CU_API DoubleBufferPass { + public: + //! Apply double buffering transformations + static std::vector run(const std::vector& exprs); +}; + +class TORCH_CUDA_CU_API DoubleBufferInfo { + // Lowering information of double buffered tensors. + struct TvInfo { + IterDomain* double_buffer_axis = nullptr; + Val* original_alloc_size = nullptr; + }; + + public: + void build(Fusion* fusion); + + void setDoubleBufferAxis(const TensorView* tv, IterDomain* id); + + IterDomain* getDoubleBufferAxis(const TensorView* tv); + + //! Get a loop that matches with a given double-buffer axis. If + //! ignore_prologue is true, a matched loop is ignored if it's a + //! prologue loop. + static kir::ForLoop* getDoubleBufferLoop( + IterDomain* axis, + const std::vector& loops, + bool ignore_prologue = false); + + //! Get a loop that matches with the double-buffer axis of a given + //! 
double-buffered tensor. If ignore_prologue is true, a matched + //! loop is ignored if it's a prologue loop. + kir::ForLoop* getDoubleBufferLoop( + const TensorView* tv, + const std::vector& loops, + bool ignore_prologue = false); + + void setOriginalAllocSize(const TensorView* tv, Val* size); + + Val* getOriginalAllocSize(const TensorView* tv); + + private: + TvInfo& getTvInfo(const TensorView* tv); + + private: + //! Keeps track of information for lowering double buffered tensors + std::unordered_map map_; +}; + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp b/torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp index 2353ea9bbf50..281fa05bb2bd 100644 --- a/torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp @@ -305,7 +305,7 @@ class ExprSegmentationSorter { std::deque to_visit_; - std::unordered_set to_merge_; + std::vector> to_merge_; Fusion* fusion_; @@ -541,7 +541,7 @@ ExprGroup* ExprSegmentationSorter::makeEmptyGroup() { ExprGroup* ExprSegmentationSorter::makeEmptyGroup(Expr* expr) { auto group = makeEmptyGroup(); group->exprs().push_back(expr); - if (ir_utils::isTVOp(expr)) { + if (ir_utils::isTvOp(expr)) { auto out_tv = expr->outputs()[0]->as(); // Grab all id's that are shared with other tensors. for (const auto tv_i : c10::irange(out_tv->getComputeAtPosition())) { @@ -649,68 +649,8 @@ ExprGroup* getProducer(ExprGroup* sg1, ExprGroup* sg2) { return nullptr; } -// Go through all expressions and compute a local ordering of loops. operator< -// is implemented based on the concrete_id_dependencies analysis done. If -// there's no dependency between two IDs then order doesn't mater, otherwise we -// can tell which is inner most by checking if there's any dependency -// relationships. -// -// Dependency relationships in concrete_id_dependencies has a "global" view in -// the fusion, so it can resolve ordering by only looking at id's and the -// dependency map. -// -// For example two expressions may have domains: [I0], [I1] Yet we -// won't know the ordering unless we see a domain with: [I0, I1]. This happened -// in advancedIndexing9 (also see AdvancedLowering6) test when merging T5 with -// the group containing T10 (cache of T5, which is post broadcasted output) and -// T6(pre broadcasted output). -// T5 had the domain [0, 1, 2, 3, 4] produce at 3 -// T6 had the domain [0, 3, 4] compute at 3 -// Merging [0, 1, 2] and [0, 3, 4] resulted in the domain [0, 3, 4, 1, 2] -// -// If ID's are not in filter, we don't care about their ordering and ignore -// them. This is because we're only focused on loops we will have to merge -// across groups. If the domain is not in a produce at position in the producer -// edges, or a compute at position in the consumer edges, the expressions we -// look at may not have a unique ordering. 
- -struct LocalDomainSorter { - LocalDomainSorter( - const std::unordered_map>& - concrete_id_dependencies) - : concrete_id_dependencies_(concrete_id_dependencies) {} - - // Return if id0 should be before id1 - inline bool operator()(IterDomain* id0, IterDomain* id1) { - auto concrete_id_0 = - GpuLower::current()->caLoopMap().getConcreteMappedID(id0); - auto concrete_id_1 = - GpuLower::current()->caLoopMap().getConcreteMappedID(id1); - - if (concrete_id_dependencies_.find(concrete_id_0) != - concrete_id_dependencies_.end()) { - const auto& dependencies_0 = concrete_id_dependencies_.at(concrete_id_0); - // if id0 depends on id1 it means id1 is inside id0, so id0 < id1 - return dependencies_0.count(concrete_id_1); - } - - if (concrete_id_dependencies_.find(concrete_id_1) != - concrete_id_dependencies_.end()) { - const auto& dependencies_1 = concrete_id_dependencies_.at(concrete_id_1); - // if id1 depends on id0 it means id0 is inside id1, so id1 < id0 - return !dependencies_1.count(concrete_id_0); - } - - return true; - } - - const std::unordered_map>& - concrete_id_dependencies_; -}; - std::vector getLocalDomainOrdering( const std::vector& exprs, - const ComputeAtMap& map, const std::unordered_set filter, const std::unordered_map>& concrete_id_dependencies) { @@ -718,10 +658,12 @@ std::vector getLocalDomainOrdering( return std::vector(); } + const auto& ca_map = GpuLower::current()->caMap(); + std::unordered_set domains; for (auto expr : exprs) { - if (!ir_utils::isTVOp(expr)) { + if (!ir_utils::isTvOp(expr)) { continue; } @@ -736,14 +678,17 @@ std::vector getLocalDomainOrdering( tv_input->getComputeAtPosition(), tv_input->getMaxProducerPosition()), std::back_inserter(domain), - [&map](IterDomain* id) { return map.getConcreteMappedID(id); }); + [&ca_map](IterDomain* id) { + return ca_map->getConcreteMappedID(id, IdMappingMode::LOOP); + }); domain.erase( std::remove_if( domain.begin(), domain.end(), - [&filter, &map](IterDomain* id) { - return filter.find(map.getConcreteMappedID(id)) == filter.end(); + [&filter, &ca_map](IterDomain* id) { + return filter.find(ca_map->getConcreteMappedID( + id, IdMappingMode::LOOP)) == filter.end(); }), domain.end()); @@ -755,7 +700,8 @@ std::vector getLocalDomainOrdering( std::sort( merged_domain.begin(), merged_domain.end(), - LocalDomainSorter(concrete_id_dependencies)); + IterDomainDependencySorter( + concrete_id_dependencies, GpuLower::current()->caMap())); return merged_domain; } } // namespace @@ -840,8 +786,8 @@ ExprGroup* ExprSegmentationSorter::makeMergedNode( if (producer_of_consumer_edge->isA()) { auto tv = producer_of_consumer_edge->as(); for (const auto tv_i : c10::irange(tv->getComputeAtPosition())) { - ca_ids.emplace(GpuLower::current()->caLoopMap().getConcreteMappedID( - tv->axis(tv_i))); + ca_ids.emplace(GpuLower::current()->caMap()->getConcreteMappedID( + tv->axis(tv_i), IdMappingMode::LOOP)); } } } @@ -855,8 +801,8 @@ ExprGroup* ExprSegmentationSorter::makeMergedNode( if (consumer_of_producer_edge->isA()) { auto tv = consumer_of_producer_edge->as(); for (const auto tv_i : c10::irange(tv->getMaxProducerPosition())) { - pa_ids.emplace(GpuLower::current()->caLoopMap().getConcreteMappedID( - tv->axis(tv_i))); + pa_ids.emplace(GpuLower::current()->caMap()->getConcreteMappedID( + tv->axis(tv_i), IdMappingMode::LOOP)); } } } @@ -865,10 +811,7 @@ ExprGroup* ExprSegmentationSorter::makeMergedNode( all_ca_pa_ids.insert(pa_ids.begin(), pa_ids.end()); auto ordered_ids = getLocalDomainOrdering( - joined_groups->exprs(), - 
GpuLower::current()->caLoopMap(), - all_ca_pa_ids, - concrete_id_dependencies); + joined_groups->exprs(), all_ca_pa_ids, concrete_id_dependencies); for (auto id : ordered_ids) { if (ca_ids.count(id)) { @@ -914,8 +857,8 @@ bool canReducePA(ExprGroup* group) { // it can't decide if it can be reduced bool has_matching_pa = false; for (const auto i : c10::irange(consumer_tv->getMaxProducerPosition())) { - if (GpuLower::current()->caLoopMap().areMapped( - consumer_tv->axis(i), group_pa_last_id)) { + if (GpuLower::current()->caMap()->areMapped( + consumer_tv->axis(i), group_pa_last_id, IdMappingMode::LOOP)) { has_matching_pa = true; break; } @@ -931,8 +874,10 @@ bool canReducePA(ExprGroup* group) { static_cast(producer_tv->getComputeAtPosition()); producer_pos_i > 0; producer_pos_i--) { - if (GpuLower::current()->caLoopMap().areMapped( - producer_tv->axis(producer_pos_i - 1), group_pa_last_id)) { + if (GpuLower::current()->caMap()->areMapped( + producer_tv->axis(producer_pos_i - 1), + group_pa_last_id, + IdMappingMode::LOOP)) { return false; } } @@ -990,10 +935,12 @@ void ExprSegmentationSorter::mergeNodes() { std::unordered_set clean_up_edges; while (!to_merge_.empty()) { - auto group1 = *to_merge_.begin(); - auto group2 = group1->payload()->merge_with; - to_merge_.erase(group1); - to_merge_.erase(group2); + ExprGroup *group1 = nullptr, *group2 = nullptr; + std::tie(group1, group2) = to_merge_.back(); + to_merge_.pop_back(); + TORCH_INTERNAL_ASSERT( + group2 == group1->payload()->merge_with, + "Expression Sorter: inconsistent to_merge packing"); clean_up_groups.emplace(group1); clean_up_groups.emplace(group2); makeMergedNode(group1, group2); @@ -1026,8 +973,8 @@ void ExprSegmentationSorter::initializeForLoopDependencies() { tv_id_i > 0; tv_id_i--) { auto tv_id = tv->axis((int)(tv_id_i - 1)); - auto concrete_id = - GpuLower::current()->caLoopMap().getConcreteMappedID(tv_id); + auto concrete_id = GpuLower::current()->caMap()->getConcreteMappedID( + tv_id, IdMappingMode::LOOP); if (concrete_id_dependencies.find(concrete_id) == concrete_id_dependencies.end()) { @@ -1038,8 +985,8 @@ void ExprSegmentationSorter::initializeForLoopDependencies() { } // Loops after tv_id are dependent on tv_id - dependencies.emplace( - GpuLower::current()->caLoopMap().getConcreteMappedID(tv_id)); + dependencies.emplace(GpuLower::current()->caMap()->getConcreteMappedID( + tv_id, IdMappingMode::LOOP)); } } @@ -1067,27 +1014,62 @@ void ExprSegmentationSorter::initializeForLoopDependencies() { std::back_inserter(to_visit), [](const auto& concrete_dep_entry) { return concrete_dep_entry.first; }); + size_t inf_loop_counter = to_visit.size(); + bool failed = false; + while (!to_visit.empty()) { auto id = to_visit.front(); to_visit.pop_front(); + if (inf_loop_counter-- == 0) { + failed = true; + break; + } + auto& dependencies = concrete_id_dependencies.at(id); - bool ready = std::all_of( - dependencies.begin(), dependencies.end(), [&visited](IterDomain* id) { - return visited.count(id); - }); + bool ready = dependencies.empty() || + std::all_of(dependencies.begin(), + dependencies.end(), + [&visited](IterDomain* id) { return visited.count(id); }); if (!ready) { to_visit.push_back(id); continue; } + inf_loop_counter = to_visit.size(); + for (auto dependency : dependencies) { auto dep_of_dep = concrete_id_dependencies.at(dependency); dependencies.insert(dep_of_dep.begin(), dep_of_dep.end()); } visited.emplace(id); } + if (failed) { + std::cerr + << "ERROR: Iteration domain sorting has failed, infinite loop detected." 
+ << std::endl; + std::cerr << "Failed to sort out: " << std::endl; + for (auto entry : to_visit) { + std::cerr << entry->toString(); + if (entry != to_visit.back()) { + std::cerr << ", "; + } + } + + std::cerr << "Depdencies: " << std::endl; + for (const auto& dep_entry : concrete_id_dependencies) { + std::cerr << " Deps of " << dep_entry.first->toString() << std::endl + << " "; + + for (auto dep : dep_entry.second) { + std::cerr << dep->toString() << ", "; + } + std::cerr << std::endl; + } + + TORCH_INTERNAL_ASSERT(false); + } } // Checks if the for loop associated with the concrete ID is ready to be @@ -1145,8 +1127,6 @@ bool ExprSegmentationSorter::supportedMerge(ExprGroup* sg1, ExprGroup* sg2) { return false; } - const auto& loop_map = GpuLower::current()->caLoopMap(); - // If inner loop dependencies have not been resolved, cannot merge. if (!loopReady(producer_ca_domain.back()) || !loopReady(consumer_pa_domain.back())) { @@ -1182,11 +1162,13 @@ bool ExprSegmentationSorter::supportedMerge(ExprGroup* sg1, ExprGroup* sg2) { continue; } - if (!loop_map.areMapped(compute_at_dim, producer_ca_domain.back())) { + if (!GpuLower::current()->caMap()->areMapped( + compute_at_dim, producer_ca_domain.back(), IdMappingMode::LOOP)) { continue; } - if (loop_map.areMapped(compute_at_dim, consumer_pa_domain.back())) { + if (GpuLower::current()->caMap()->areMapped( + compute_at_dim, consumer_pa_domain.back(), IdMappingMode::LOOP)) { return true; } } @@ -1297,8 +1279,7 @@ void ExprSegmentationSorter::sort() { continue; } - to_merge_.emplace(group.get()); - to_merge_.emplace(*candidate_it); + to_merge_.emplace_back(std::make_pair(group.get(), *candidate_it)); group->payload()->merged = true; group->payload()->merge_with = *candidate_it; @@ -1350,8 +1331,7 @@ void ExprSegmentationSorter::sort() { if (testStillDag(group.get(), *candidate_it)) { // Mark in same style as default algorithm for convenience even // though we will only merge once with the fallback - to_merge_.emplace(group.get()); - to_merge_.emplace(*candidate_it); + to_merge_.emplace_back(std::make_pair(group.get(), *candidate_it)); group->payload()->merged = true; group->payload()->merge_with = *candidate_it; diff --git a/torch/csrc/jit/codegen/cuda/lower_fused_reduction.cpp b/torch/csrc/jit/codegen/cuda/lower_fused_reduction.cpp new file mode 100644 index 000000000000..213abda029a6 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_fused_reduction.cpp @@ -0,0 +1,344 @@ +#include +#include +#include +#include + +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +namespace { + +//! An instance of reduction patterns to fuse +class FusedReductionBroadcastInfo : public PolymorphicBase { + public: + FusedReductionBroadcastInfo(ReductionOp* reduction, bool with_broadcast) + : reductions_({reduction}), with_broadcast_({with_broadcast}) {} + + FusedReductionBroadcastInfo(WelfordOp* welford, bool with_broadcast) + : reductions_({welford}), with_broadcast_({with_broadcast}) {} + + FusedReductionBroadcastInfo( + GroupedReductionOp* grouped_rop, + bool with_broadcast) + : reductions_({grouped_rop}), with_broadcast_({with_broadcast}) {} + + const std::vector& reductions() const { + return reductions_; + } + + const std::vector& withBroadcast() const { + return with_broadcast_; + } + + private: + // Holds ReductionOp, WelfordOp or GroupedReductionOp. + std::vector reductions_; + // True each reduction also broadcasts + std::vector with_broadcast_; +}; + +//! 
Inspect a fusion to detect eligible sequences of expressions to +//! use the fused reduction kernel +class FusionInspector : private IterVisitor { + public: + static std::vector run(Fusion* fusion) { + FusionInspector inspector(fusion); + return inspector.fusion_list_; + } + + private: + FusionInspector(Fusion* fusion) { + traverse(fusion); + } + + using IterVisitor::handle; + + void handle(ReductionOp* rop) final { + /// If it's a grid reduction, keep track of tensors that depend on + /// this reduction. + // Only consider when out is on register as that is assumed in the + // fused reduction kernel. + auto out = ir_utils::getTvOutput(rop); + if (out->getMemoryType() == MemoryType::Local && + out->domain()->hasGridReduction()) { + reduction_dep_[out].insert(rop); + } + } + + void handle(WelfordOp* wop) final { + /// If it's a grid reduction, keep track of tensors that depend on + /// this reduction. + // Only consider when out is on register as that is assumed in the + // fused reduction kernel. + auto out = ir_utils::getTvOutput(wop); + if (out->getMemoryType() == MemoryType::Local && + out->domain()->hasGridReduction()) { + reduction_dep_[out].insert(wop); + } + } + + void handle(GroupedReductionOp* grouped_rop) final { + auto out = ir_utils::getTvOutput(grouped_rop); + if (out->getMemoryType() == MemoryType::Local && + out->domain()->hasGridReduction()) { + reduction_dep_[out].insert(grouped_rop); + } + } + + void handle(Expr* expr) final { + IterVisitor::handle(expr); + for (auto in_tv : ir_utils::filterByType(expr->inputs())) { + for (auto reduction_op : reduction_dep_[in_tv]) { + if (fused_exprs_.find(reduction_op) != fused_exprs_.end()) { + continue; + } + for (auto out_tv : + ir_utils::filterByType(expr->outputs())) { + reduction_dep_[out_tv].insert(reduction_op); + } + } + } + } + + // In the case of welford, use the fused broadcast reduction when at + // least one of the outputs is broadcast. + void handle(BroadcastOp* bop) final { + // Detect a pattern where a reduction is followed by a broadcast + auto bop_out = bop->out()->as(); + auto bop_in = bop->in()->as(); + + for (Expr* preceding_expr : reduction_dep_[bop_in]) { + auto parallel_reduction_axes = + getReductionParallelTypeStates(preceding_expr); + + // If not matching, propagate the reduction further down to + // subsequent expressions + if (!isBroadcastFuseable(bop_out, parallel_reduction_axes)) { + continue; + } + + if (fused_exprs_.find(preceding_expr) != fused_exprs_.end()) { + // Already added to the fusion list. This can happen with + // welford as there can be multiple broadcast consumer + // expressions. 
+ continue; + } + + if (preceding_expr->isA()) { + fusion_list_.emplace_back(preceding_expr->as(), true); + } else if (preceding_expr->isA()) { + fusion_list_.emplace_back( + preceding_expr->as(), true); + } else if (preceding_expr->isA()) { + fusion_list_.emplace_back(preceding_expr->as(), true); + } else { + TORCH_INTERNAL_ASSERT( + false, "Invalid preceding expr: ", preceding_expr->toString()); + } + + fused_exprs_.insert(preceding_expr); + } + } + + ParallelTypeBitmap getReductionParallelTypeStates(Expr* expr) { + ParallelTypeBitmap parallel_reduction_axes; + + for (auto id : ir_utils::getTvOutput(expr)->domain()->domain()) { + auto pt = id->getParallelType(); + if (id->isReduction() && isParallelTypeThread(pt)) { + parallel_reduction_axes.set(pt); + } + } + + return parallel_reduction_axes; + } + + // Requires reduction parallel dimensions to exactly match parallel broadcast + // dimensions + bool isBroadcastFuseable( + TensorView* broadcast_out, + const ParallelTypeBitmap& parallel_reduction_axes) { + const auto broadcast_parallel_types = + GpuLower::current()->threadPredMap().getParallelBroadcastDomains( + broadcast_out); + + // If no parallel broadcast, nothing to fuse + if (broadcast_parallel_types.none()) { + return false; + } + + // Make sure the broadcast parallel types are the types reduced by + // the preceding reduction op + for (auto id : broadcast_out->domain()->domain()) { + auto pt = id->getParallelType(); + if (!isParallelTypeThread(pt)) { + continue; + } + // Parallel broadcast must be included in reduction_states + if (id->isBroadcast() && broadcast_parallel_types.get(pt)) { + if (!parallel_reduction_axes.get(pt)) { + return false; + } + } + } + + return true; + } + + private: + //! List of expression sequences to fuse + std::vector fusion_list_; + //! Keep track of fused reduction/welford exprs to avoid duplication + std::unordered_set fused_exprs_; + //! Keep track of ReductionOp/WelfordOp expressions that are + //! (indirectly) input to a tensor + std::unordered_map> reduction_dep_; +}; + +//! Transform a fusion to use the fused reduction kernel. +class FusionTransformer { + public: + static void run( + Fusion* fusion, + const std::vector& fusion_list) { + FusionTransformer transformer(fusion, fusion_list); + } + + private: + FusionTransformer( + Fusion* fusion, + const std::vector& fusion_list) + : fusion_(fusion), fusion_list_(fusion_list) { + transform(); + } + + void transform() { + for (const auto& info : fusion_list_) { + transform(info); + } + // If the thread predicate map is modified, rebuild the + // map. build() only updates mappings that need to be updated. 
+ if (thread_pred_map_modified_) { + GpuLower::current()->threadPredMap().build(fusion_); + } + } + + void transform(const FusedReductionBroadcastInfo& info) { + TORCH_INTERNAL_ASSERT( + info.reductions().size() == 1, "Horizontal fusion not supported yet"); + + for (const auto i : c10::irange(info.reductions().size())) { + const auto expr = info.reductions().at(i); + const auto with_broadcast = info.withBroadcast().at(i); + Expr* fused_expr = nullptr; + + if (auto reduction = dynamic_cast(expr)) { + TORCH_INTERNAL_ASSERT(!reduction->isAllreduce()); + + auto red_op_type = reduction->getReductionOpType(); + auto init = reduction->init(); + auto out = reduction->out(); + auto in = reduction->in(); + + fusion_->removeExpr(reduction); + + fused_expr = + IrBuilder::create(red_op_type, init, out, in, true); + } else if (auto welford = dynamic_cast(expr)) { + TORCH_INTERNAL_ASSERT(!welford->isAllreduce()); + + auto out_avg = welford->outAvg(); + auto out_var = welford->outVar(); + auto out_n = welford->outN(); + auto init_avg = welford->initAvg(); + auto init_var = welford->initVar(); + auto init_n = welford->initN(); + auto in_avg = welford->inAvg(); + auto in_var = welford->inVar(); + auto in_n = welford->inN(); + + fusion_->removeExpr(welford); + + fused_expr = IrBuilder::create( + out_avg, + out_var, + out_n, + init_avg, + init_var, + init_n, + in_avg, + in_var, + in_n, + true); + } else if (auto grouped_rop = dynamic_cast(expr)) { + TORCH_INTERNAL_ASSERT(!grouped_rop->isAllreduce()); + + auto op_types = grouped_rop->getReductionOpTypes(); + auto init_vals = grouped_rop->initVals(); + auto outputs = grouped_rop->outputs(); + auto inputs = grouped_rop->inputs(); + + fusion_->removeExpr(grouped_rop); + + fused_expr = IrBuilder::create( + op_types, init_vals, outputs, inputs, true); + } else { + TORCH_INTERNAL_ASSERT(false, "Invalid expr: ", expr->toString()); + } + + TORCH_INTERNAL_ASSERT(fused_expr != nullptr); + + // Do not just remove the broadcast but just reset the thread + // predicate of the broadcast op. Since fusion is applied only + // when all parallel broadcast domains are to be parallel + // reduction, all parallel types can be reset. + if (with_broadcast) { + // It may be just fine to remove the broadcast expr, but + // technically speaking that would violate the root domain mapping + // as broadcast domains would appear in the consumer of the + // broadcast output tensor without a broadcast expression. 
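// Concretely (hypothetical fusion, for illustration only): for
// tv1 = sum(tv0) with the reduction axes bound to TIDx and BIDx, followed
// by a broadcast of tv1 across those same parallel types, the reduction is
// re-created above with the allreduce flag set, the parallel reduction
// domains are marked as allreduce below, and the broadcast keeps its
// expression but loses its thread predicate, effectively becoming a set.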
+ for (auto reduction_out : + ir_utils::filterByType(fused_expr->outputs())) { + for (auto id : reduction_out->domain()->domain()) { + if (id->isReduction()) { + GpuLower::current()->fusedReductionInfo().markAsAllreduce(id); + GpuLower::current()->threadPredMap().markAsUpdated(reduction_out); + thread_pred_map_modified_ = true; + } + } + } + } + } + } + + private: + Fusion* fusion_ = nullptr; + const std::vector& fusion_list_; + bool thread_pred_map_modified_ = false; +}; + +} // namespace + +void fuseReductionsAndBroadcasts(Fusion* fusion) { + auto fusion_list = FusionInspector::run(fusion); + FusionTransformer::run(fusion, fusion_list); +} + +void FusedReductionInfo::markAsAllreduce(IterDomain* id) { + allreduce_ids_.insert(id); +} + +bool FusedReductionInfo::isAllreduce(IterDomain* id) const { + return allreduce_ids_.find(id) != allreduce_ids_.end(); +} + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_fused_reduction.h b/torch/csrc/jit/codegen/cuda/lower_fused_reduction.h new file mode 100644 index 000000000000..4307a30bc512 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_fused_reduction.h @@ -0,0 +1,34 @@ +#pragma once + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +//! Keep track of certain patterns of reductions. +//! +//! - Allreduce IterDomain: reduced and broadcast domain. +class FusedReductionInfo { + public: + void markAsAllreduce(IterDomain* id); + + bool isAllreduce(IterDomain* id) const; + + private: + // Reduction IterDomains that are also broadcast + std::unordered_set allreduce_ids_; +}; + +//! Detect reductions and broadcasts that are eligible for the fused +//! reduction kernel. When found, the predicate flags of the broadcast +//! is unset, which effectively makes the broadcast just a unary set +//! op. +//! TODO: Consider moving the warp-based fused reduction here. +void fuseReductionsAndBroadcasts(Fusion*); + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.cpp b/torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.cpp new file mode 100644 index 000000000000..b3e9b1776acf --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.cpp @@ -0,0 +1,149 @@ +#include +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +namespace { + +// Replace trivial reductions with unary ops. +class TrivialReductionReplacement : private OptOutMutator { + public: + TrivialReductionReplacement( + Fusion* fusion, + const TrivialReductionInfo& trivial_reduction_info) + : trivial_reduction_info_(trivial_reduction_info) { + FusionGuard fg(fusion); + auto exprs = StmtSort::getExprs(fusion); + for (auto expr : exprs) { + mutate(expr); + } + } + + private: + using OptOutMutator::mutate; + void mutate(ReductionOp* rop) final { + if (ir_utils::isTvOp(rop)) { + auto out_tv = ir_utils::getTvOutput(rop); + if (std::all_of( + out_tv->domain()->domain().begin(), + out_tv->domain()->domain().end(), + [&](IterDomain* id) { + // If id is a reduction axis, is it a trivial reduction? 
+ if (id->isReduction()) { + return trivial_reduction_info_.isDerived(id); + } else { + return true; + } + })) { + auto out = rop->out(); + auto in = rop->in(); + auto container = out->container(); + removeExpr(container, rop); + IrBuilder::create(container, UnaryOpType::Set, out, in); + } + } + } + + void mutate(GroupedReductionOp* grouped_rop) final { + if (ir_utils::isTvOp(grouped_rop)) { + // The inputs and outputs are all uniform in grouped reductions, + // so just checking one of the input and output pair should be + // sufficient. + auto out_tv = ir_utils::getTvOutput(grouped_rop); + if (std::all_of( + out_tv->domain()->domain().begin(), + out_tv->domain()->domain().end(), + [&](IterDomain* id) { + // If id is a reduction axis, is it a trivial reduction? + if (id->isReduction()) { + return trivial_reduction_info_.isDerived(id); + } else { + return true; + } + })) { + auto outputs = grouped_rop->outputs(); + auto inputs = grouped_rop->inputs(); + auto container = out_tv->container(); + removeExpr(container, grouped_rop); + for (const auto i : c10::irange(outputs.size())) { + IrBuilder::create( + container, UnaryOpType::Set, outputs.at(i), inputs.at(i)); + } + } + } + } + + const TrivialReductionInfo& trivial_reduction_info_; +}; + +// Replaces Transpose, Shift, Gather, and View Ops with Unary Ops. +class UnaryOpInserter : private kir::ExprMutator { + public: + static std::vector insert(const std::vector& exprs) { + UnaryOpInserter inserter(exprs); + return inserter.exprs_; + } + + private: + using kir::ExprMutator::handle; + + UnaryOpInserter(const std::vector& exprs) { + kir::ExprMutator::traverseAndInsert(exprs); + } + + void handle(TransposeOp* top) final { + auto out = top->out(); + auto in = top->in(); + auto container = out->container(); + registerReplace( + top, IrBuilder::create(container, UnaryOpType::Set, out, in)); + } + + void handle(ShiftOp* sop) final { + auto out = sop->out(); + auto in = sop->in(); + auto container = out->container(); + registerReplace( + sop, IrBuilder::create(container, UnaryOpType::Set, out, in)); + } + + void handle(GatherOp* gop) final { + auto out = gop->out(); + auto in = gop->in(); + auto container = out->container(); + registerReplace( + gop, IrBuilder::create(container, UnaryOpType::Set, out, in)); + } + + void handle(ViewOp* vop) final { + auto out = vop->out(); + auto in = vop->in(); + auto container = out->container(); + registerReplace( + vop, IrBuilder::create(container, UnaryOpType::Set, out, in)); + } +}; + +} // namespace + +void trivialReductionReplacement( + Fusion* fusion, + const TrivialReductionInfo& trivial_reduction_info) { + TrivialReductionReplacement replacement(fusion, trivial_reduction_info); +} + +// Transpose, Shift, Gather, and View Ops with Unary Set Ops +std::vector unarySetOpInserter(const std::vector& exprs) { + return UnaryOpInserter::insert(exprs); +} + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.h b/torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.h new file mode 100644 index 000000000000..e18f4a8f0778 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.h @@ -0,0 +1,26 @@ +#pragma once + +#include + +#include +#include +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +// Replaces trivial reductions with Unary Set Ops +void trivialReductionReplacement(Fusion*, const TrivialReductionInfo&); + +// Transpose, Shift, Gather, and 
View Ops with Unary Set Ops +std::vector unarySetOpInserter(const std::vector& exprs); + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_index.cpp b/torch/csrc/jit/codegen/cuda/lower_index.cpp index d92dd279b179..a1a658f580a0 100644 --- a/torch/csrc/jit/codegen/cuda/lower_index.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_index.cpp @@ -1,7 +1,7 @@ #include #include #include -#include +#include #include #include #include @@ -13,30 +13,24 @@ namespace jit { namespace fuser { namespace cuda { -IndexLowering::IndexLowering() : ir_builder_(GpuLower::current()->kernel()) {} - -kir::Val* IndexLowering::lowerSrcIndex(kir::Val* src, kir::Val* dst) const { - if (auto tv = dynamic_cast(src)) { - TORCH_INTERNAL_ASSERT(dst->isA()); - return Index::getProducerIndex( - tv->fuserTv(), - dst->as()->fuserTv(), - scope_utils::getLoops(active_scope_expr_)); +Val* IndexLowering::lowerSrcIndex(Val* src, Val* dst) const { + if (auto tv = dynamic_cast(src)) { + TORCH_INTERNAL_ASSERT(dst->isA()); + return Index::getProducerIndex(tv, dst->as(), for_loops_); } else { return src; } } -kir::Val* IndexLowering::lowerDstIndex(kir::Val* dst) const { - if (auto tv = dynamic_cast(dst)) { - return Index::getConsumerIndex( - tv->fuserTv(), scope_utils::getLoops(active_scope_expr_)); +Val* IndexLowering::lowerDstIndex(Val* dst) const { + if (auto tv = dynamic_cast(dst)) { + return Index::getConsumerIndex(tv, for_loops_); } else { return dst; } } -void IndexLowering::pushBack(kir::Expr* expr) { +void IndexLowering::pushBack(Expr* expr) { if (active_scope_ == nullptr) { lowered_exprs_.push_back(expr); } else { @@ -44,78 +38,113 @@ void IndexLowering::pushBack(kir::Expr* expr) { } } -void IndexLowering::visit(const kir::IfThenElse* ite) { - const auto prev_scope_expr = active_scope_expr_; +Expr* IndexLowering::back() const { + if (active_scope_ == nullptr) { + TORCH_INTERNAL_ASSERT( + !lowered_exprs_.empty(), "IndexLowering::back: empty scope."); + return lowered_exprs_.back(); + } + TORCH_INTERNAL_ASSERT( + !active_scope_->empty(), "IndexLowering::back: empty scope."); + return active_scope_->exprs().back(); +} + +void IndexLowering::insertAtTopLevel(Expr* expr) { + TORCH_INTERNAL_ASSERT(!lowered_exprs_.empty()); + lowered_exprs_.insert(lowered_exprs_.end() - 1, expr); +} + +void IndexLowering::handle(const kir::IfThenElse* ite) { const auto prev_scope = active_scope_; - // TODO(kir): try to avoid recreating new nodes and leaving old ones around - auto new_ite = ir_builder_.create(ite->predicate()); + auto new_ite = IrBuilder::create(ite->predicate()); pushBack(new_ite); - active_scope_expr_ = new_ite; active_scope_ = &new_ite->thenBody(); for (auto expr : ite->thenBody().exprs()) { - expr->accept(this); + OptOutConstDispatch::handle(expr); } active_scope_ = &new_ite->elseBody(); for (auto expr : ite->elseBody().exprs()) { - expr->accept(this); + OptOutConstDispatch::handle(expr); } active_scope_ = prev_scope; - active_scope_expr_ = prev_scope_expr; } -void IndexLowering::visit(const kir::ForLoop* for_loop) { - const auto prev_scope_expr = active_scope_expr_; +void IndexLowering::handle(const kir::ForLoop* for_loop) { const auto prev_scope = active_scope_; - auto new_for_loop = ir_builder_.create(for_loop); + auto new_for_loop = IrBuilder::create(for_loop); pushBack(new_for_loop); - active_scope_expr_ = new_for_loop; active_scope_ = &new_for_loop->body(); + for_loops_.push_back(new_for_loop); for (auto expr : for_loop->body().exprs()) { - 
expr->accept(this); + OptOutConstDispatch::handle(expr); } + for_loops_.pop_back(); active_scope_ = prev_scope; - active_scope_expr_ = prev_scope_expr; } -void IndexLowering::visit(const kir::UnaryOp* uop) { +void IndexLowering::handle(const UnaryOp* uop) { const auto in = lowerSrcIndex(uop->in(), uop->out()); const auto out = lowerDstIndex(uop->out()); - pushBack(ir_builder_.create(uop->operation(), out, in)); + pushBack(IrBuilder::create(uop->getUnaryOpType(), out, in)); + GpuLower::current()->propagateExprInfo(uop, back()); } -void IndexLowering::visit(const kir::BinaryOp* bop) { +void IndexLowering::handle(const BinaryOp* bop) { const auto lhs = lowerSrcIndex(bop->lhs(), bop->out()); const auto rhs = lowerSrcIndex(bop->rhs(), bop->out()); const auto out = lowerDstIndex(bop->out()); - pushBack(ir_builder_.create(bop->operation(), out, lhs, rhs)); + pushBack(IrBuilder::create(bop->getBinaryOpType(), out, lhs, rhs)); + GpuLower::current()->propagateExprInfo(bop, back()); } -void IndexLowering::visit(const kir::TernaryOp* top) { +void IndexLowering::handle(const TernaryOp* top) { const auto in1 = lowerSrcIndex(top->in1(), top->out()); const auto in2 = lowerSrcIndex(top->in2(), top->out()); const auto in3 = lowerSrcIndex(top->in3(), top->out()); const auto out = lowerDstIndex(top->out()); - pushBack( - ir_builder_.create(top->operation(), out, in1, in2, in3)); + pushBack(IrBuilder::create( + top->getTernaryOpType(), out, in1, in2, in3)); + GpuLower::current()->propagateExprInfo(top, back()); +} + +void IndexLowering::handle(const ViewAsScalar* uop) { + const auto in = lowerSrcIndex(uop->in(), uop->out()); + const auto out = lowerDstIndex(uop->out()); + for (auto loop : for_loops_) { + if (GpuLower::current()->caMap()->areMapped( + loop->iter_domain(), + uop->vector_id()->as(), + IdMappingMode::LOOP)) { + Val* index = loop->index(); + pushBack( + IrBuilder::create(out, in, uop->vector_id(), index)); + GpuLower::current()->propagateExprInfo(uop, back()); + return; + } + } + TORCH_INTERNAL_ASSERT(false, "Can not find index for vector dim"); } namespace { // Get the size of the temporary work buffer for grid communication, this can be // grid reduction, broadcast, or grid welford. -kir::Val* getGridCommWorkBufferSize( - kir::IrBuilder& ir_builder, - const kir::TensorDomain* td) { +// expansion_factor can be optionally passed to expand the allocation +// size. For example, FusedReduction should double the work buffer size. +Val* getGridCommWorkBufferSize( + const TensorDomain* td, + const std::vector& for_loops = {}, + int expansion_factor = 1) { // The buffer size is the number of thread blocks multiplied by the // number of threads not used for reduction domains. // Note: Previously it was calculated based on the shape of the @@ -125,7 +154,11 @@ kir::Val* getGridCommWorkBufferSize( // size if the parallel dimensions are exact, but otherwise, just // computing the buffer size based on the tensor shape isn't // sufficient since there could be extra threads/blocks. - kir::Val* buffer_size = ir_builder.create(1); + TORCH_INTERNAL_ASSERT( + expansion_factor >= 1, "Invalid expansion factor: ", expansion_factor); + Val* buffer_size = expansion_factor == 1 + ? 
GpuLower::current()->kernel()->oneVal() + : IrBuilder::create(expansion_factor); for (auto pt : kParallelTypeThreads) { auto pt_dim = GpuLower::current()->parallelDimensionMap().get(pt); if (pt_dim == nullptr || pt_dim->isOneInt()) { @@ -133,172 +166,415 @@ kir::Val* getGridCommWorkBufferSize( } if (isParallelTypeThreadDim(pt) && std::any_of(td->domain().begin(), td->domain().end(), [&](auto out_id) { - return out_id->parallelType() == pt && + return out_id->getParallelType() == pt && (out_id->isReduction() || out_id->isBroadcast()); })) { continue; } - buffer_size = ir_builder.mulExpr(buffer_size, pt_dim); + buffer_size = SimplifyingIrBuilder::mulExpr(buffer_size, pt_dim); + } + + // All iteration domains require a separate entry in the buffer for re-entrant + // grid reductions. + for (auto fl : for_loops) { + if (fl->isTrivial()) { + continue; + } + if (fl->iter_domain()->isThread()) { + // already accounted for. + continue; + } + buffer_size = + SimplifyingIrBuilder::mulExpr(buffer_size, fl->iter_domain()->extent()); } + return buffer_size; } -kir::Val* getGridSyncBufferSize( - kir::IrBuilder& ir_builder, - const kir::TensorDomain* td) { +Val* getGridSyncBufferSize( + const TensorDomain* td, + const std::vector& for_loops = {}) { // See the comment above for getGridCommWorkBufferSize. - kir::Val* buffer_size = ir_builder.create(1); + Val* buffer_size = GpuLower::current()->kernel()->oneVal(); for (auto pt : kParallelTypeBIDs) { auto pt_dim = GpuLower::current()->parallelDimensionMap().get(pt); if (pt_dim == nullptr || pt_dim->isOneInt()) { continue; } if (std::any_of(td->domain().begin(), td->domain().end(), [&](auto out_id) { - return out_id->parallelType() == pt && + return out_id->getParallelType() == pt && (out_id->isReduction() || out_id->isBroadcast()); })) { continue; } - buffer_size = ir_builder.mulExpr(buffer_size, pt_dim); + buffer_size = SimplifyingIrBuilder::mulExpr(buffer_size, pt_dim); + } + + // All iteration domains require a separate semaphore for re-entrant grid + // reductions + for (auto fl : for_loops) { + if (fl->isTrivial()) { + continue; + } + if (fl->iter_domain()->isThread()) { + // already accounted for. + continue; + } + + buffer_size = + SimplifyingIrBuilder::mulExpr(buffer_size, fl->iter_domain()->extent()); } + return buffer_size; } -// Allocate global buffer for a grid communication calls, i.e. grid reduce, grid -// welford reduce, grid broadcast. -kir::Allocate* allocGlobalBufferForGridComm( - kir::IrBuilder& ir_builder, - kir::Val* buffer_size, - DataType dtype, - bool zero_init) { - const std::vector new_buffer_ids = { - ir_builder.create(ir_builder.zeroVal(), buffer_size)}; - const auto buffer_domain = - ir_builder.create(new_buffer_ids); - const auto buffer_tv = ir_builder.create( - dtype, buffer_domain, MemoryType::Global); - return ir_builder.create( - buffer_tv, buffer_tv->memoryType(), nullptr, zero_init); +Val* getEntranceCountGridReduce(std::vector& for_loops) { + Val* grid_reduction_entrances = GpuLower::current()->kernel()->oneVal(); + + for (const auto loop : for_loops) { + if (loop->isTrivial()) { + continue; + } + if (loop->iter_domain()->isThread()) { + // already accounted for. + continue; + } + // TODO: Does this work for shift/gather? 
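// Worked example for this helper and getEntranceLinIndGridReduce below
// (illustrative extents only): with two non-trivial serial loops of
// extents 4 and 3 and indices i0 and i1 enclosing the grid reduction, the
// entrance count is 4 * 3 = 12 and the linear entrance index is
// i0 * 3 + i1, so every entrance addresses its own slice of the grid work
// and sync buffers, whose sizes are scaled by the same extents above.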
+ grid_reduction_entrances = SimplifyingIrBuilder::mulExpr( + grid_reduction_entrances, loop->iter_domain()->extent()); + } + return grid_reduction_entrances; +} + +// Linear indexing of for loops for multiple entrances into grid reduce +// TODO: What happens if there's a broadcast that's resolved (not present in the +// grid reduce) but the global buffer isn't expanded? +Val* getEntranceLinIndGridReduce(std::vector& for_loops) { + Val* linear_index = GpuLower::current()->kernel()->zeroVal(); + + for (const auto loop : for_loops) { + if (loop->isTrivial()) { + continue; + } + if (loop->iter_domain()->isThread()) { + // already accounted for. + continue; + } + // TODO: Does this work for shift/gather? + linear_index = SimplifyingIrBuilder::addExpr( + SimplifyingIrBuilder::mulExpr( + linear_index, loop->iter_domain()->extent()), + loop->index()); + } + return linear_index; } } // namespace -void IndexLowering::visit(const kir::ReductionOp* rop) { - TORCH_INTERNAL_ASSERT(ir_utils::isTVOp(rop)); +void IndexLowering::handle(const ReductionOp* rop) { + TORCH_INTERNAL_ASSERT(ir_utils::isTvOp(rop)); - const auto out_tv = rop->out()->as(); + const auto out_tv = rop->out()->as(); const auto out_domain = out_tv->domain(); - const bool is_block_reduce = out_domain->hasBlockReduction(); - const bool is_grid_reduce = out_domain->hasGridReduction(); + const bool has_block_reduce = out_domain->hasBlockReduction(); + const bool has_grid_reduce = out_domain->hasGridReduction(); + + const auto out = lowerDstIndex(rop->out()); + const auto in = lowerSrcIndex(rop->in(), rop->out()); + + if (has_grid_reduce) { + handleGridReduction(rop, out, in); + } else if (has_block_reduce) { + handleBlockReduction(rop, out, in); + } else { + pushBack( + IrBuilder::create(rop->getReductionOpType(), out, out, in)); + GpuLower::current()->propagateExprInfo(rop, back()); + } +} + +void IndexLowering::handleBlockReduction( + const ReductionOp* rop, + Val* out, + Val* in) { + TORCH_INTERNAL_ASSERT(ir_utils::isTvOp(rop)); + + ReductionOp* indexed_rop = IrBuilder::create( + rop->getReductionOpType(), rop->init(), out, in, rop->isAllreduce()); + if (rop->predicate()) { + indexed_rop->setPredicate(rop->predicate()); + } + if (rop->writePredicate()) { + indexed_rop->setWritePredicate(rop->writePredicate()); + } + + pushBack(indexed_rop); + GpuLower::current()->propagateExprInfo(rop, back()); +} + +void IndexLowering::handleGridReduction( + const ReductionOp* rop, + Val* out, + Val* in) { + const auto out_tv = out->as()->view(); + const auto out_domain = out_tv->domain(); + + TORCH_INTERNAL_ASSERT(out_domain->hasGridReduction()); // If we do a grid reduction we can't have a reduction axis that is not bound - // to a grid or block dim () - if (is_grid_reduce) { - TORCH_INTERNAL_ASSERT( - std::none_of( - out_domain->domain().begin(), - out_domain->domain().end(), - [](kir::IterDomain* id) { - return !id->isThread() && id->isReduction() && - !id->extent()->isOneInt(); - }), - "Found a reduction stage that has both a non-parallelized ", - "reduction and a grid reduction. This is not supported, ", - "please use rfactor to do the serialized reduction first, ", - "then the grid reduction."); + // to a grid or block dim. + TORCH_INTERNAL_ASSERT( + std::none_of( + out_domain->domain().begin(), + out_domain->domain().end(), + [](IterDomain* id) { + return !id->isThread() && id->isReduction() && + !id->extent()->isOneInt(); + }), + "Found a reduction stage that has both a non-parallelized ", + "reduction and a grid reduction. 
This is not supported, ", + "please use rfactor to do the serialized reduction first, ", + "then the grid reduction."); + + // When using the fused reduction in a loop, the global work buffer + // is double buffered to save global synchronizations. + auto is_within_a_loop = std::any_of( + out_domain->domain().begin(), + out_domain->domain().end(), + [](IterDomain* id) { return !isTrivialIterDomain(id); }); + + // Use a unique buffer for work and sync flag when called within a + // loop unless it's persistent. Grid all reduce means persistence is + // required. However, not being a grid all reduce does not mean + // non-persistence. Currently, if a cooperative grid reduction is + // required anywhere in the kernel, all grid reducitons are done in + // a persistent manner, so all grid reductions should be consulted. + // TODO: fix this + const bool privatize_buffer = !rop->isAllreduce(); + + const auto reduce_buffer = ir_utils::allocGlobalBufferForGridComm( + getGridCommWorkBufferSize( + out_domain, + privatize_buffer ? for_loops_ : std::vector(), + rop->isAllreduce() && is_within_a_loop ? 2 : 1), + out->dtype(), + false); + + const auto sync_buffer = ir_utils::allocGlobalBufferForGridComm( + getGridSyncBufferSize( + out_domain, + privatize_buffer ? for_loops_ : std::vector()), + DataType::Int, + true); + + const auto entrance_ind = privatize_buffer + ? getEntranceLinIndGridReduce(for_loops_) + : GpuLower::current()->kernel()->zeroVal(); + const auto n_entrances = privatize_buffer + ? getEntranceCountGridReduce(for_loops_) + : GpuLower::current()->kernel()->oneVal(); + + // The thread predicate for GridReduction needs to be set + // separately from the main predicate. Do not combine them like + // other expressions. + const auto& thread_pred = + GpuLower::current()->threadPredMap().getPredicatedParallelTypes(out_tv); + + auto grid_reduction = IrBuilder::create( + rop->getReductionOpType(), + rop->init(), + out, + in, + reduce_buffer, + sync_buffer, + entrance_ind, + n_entrances, + rop->isAllreduce()); + + grid_reduction->setThreadPredicate(thread_pred); + + if (rop->predicate()) { + grid_reduction->setPredicate(rop->predicate()); + } + if (rop->writePredicate()) { + grid_reduction->setWritePredicate(rop->writePredicate()); } - const auto out = lowerDstIndex(rop->out()); - const auto in = lowerSrcIndex(rop->in(), rop->out()); + pushBack(reduce_buffer); + pushBack(sync_buffer); + pushBack(grid_reduction); + GpuLower::current()->propagateExprInfo(rop, back()); + + if (rop->isAllreduce()) { + // When using the fused reduction, allocate the reduction object at + // the outer-most scope + auto fused_reduction_alloc_reduction = + IrBuilder::create(grid_reduction); + insertAtTopLevel(fused_reduction_alloc_reduction); + } +} - kir::ReductionOp* block_reduction_op = nullptr; +void IndexLowering::handle(const GroupedReductionOp* grouped_rop) { + TORCH_INTERNAL_ASSERT(ir_utils::isTvOp(grouped_rop)); - if (is_block_reduce) { - block_reduction_op = ir_builder_.create( - rop->operation(), rop->init(), out, in); - if (rop->predicate()) { - block_reduction_op->setPredicate(rop->predicate()); - } - if (rop->writePredicate()) { - block_reduction_op->setWritePredicate(rop->writePredicate()); - } - pushBack(block_reduction_op); - } - - if (is_grid_reduce) { - const auto reduce_buffer = allocGlobalBufferForGridComm( - ir_builder_, - getGridCommWorkBufferSize(ir_builder_, out_domain), - out->dtype(), - false); - - const auto sync_buffer = allocGlobalBufferForGridComm( - ir_builder_, - 
getGridSyncBufferSize(ir_builder_, out_domain), - DataType::Int, - true); - - const auto grid_reduction_op = (block_reduction_op == nullptr) - ? ir_builder_.create( - rop->operation(), rop->init(), out, in) - : block_reduction_op; - - // The thread predicate for GridReduction needs to be set - // separately from the main predicate. Do not combine them like - // other expressions. - const auto& thread_pred = - GpuLower::current()->threadPredMap().getPredicatedParallelTypes( - out_tv->fuserTv()); - auto grid_reduction = ir_builder_.create( - grid_reduction_op, reduce_buffer, sync_buffer); - grid_reduction->setThreadPredicate(thread_pred); - - if (rop->predicate()) { - // If preceded by a blockReduce, all thread blocks should have - // valid inputs to gridReduce. In fact, using the original - // predicate does not work when the write predicate of the - // blockReduce is different from the read predicate. - if (is_block_reduce) { - grid_reduction->setPredicate( - ir_builder_.create(ir_builder_.trueVal())); - } else { - grid_reduction->setPredicate(rop->predicate()); - } - } + const auto out_tv = ir_utils::getTvOutput(grouped_rop); + const auto out_domain = out_tv->domain(); + + const bool has_block_reduce = out_domain->hasBlockReduction(); + const bool has_grid_reduce = out_domain->hasGridReduction(); + + std::vector indexed_outputs(grouped_rop->numReductions()); + std::vector indexed_inputs(grouped_rop->numReductions()); + + for (const auto i : c10::irange(grouped_rop->numReductions())) { + indexed_outputs.at(i) = lowerDstIndex(grouped_rop->output(i)); + indexed_inputs.at(i) = + lowerSrcIndex(grouped_rop->input(i), grouped_rop->output(i)); + } - if (rop->writePredicate()) { - grid_reduction->setWritePredicate(rop->writePredicate()); + if (has_grid_reduce) { + handleGridReduction(grouped_rop, indexed_outputs, indexed_inputs); + } else if (has_block_reduce) { + handleBlockReduction(grouped_rop, indexed_outputs, indexed_inputs); + } else { + for (const auto i : c10::irange(grouped_rop->numReductions())) { + pushBack(IrBuilder::create( + grouped_rop->getReductionOpType(i), + indexed_outputs.at(i), + indexed_outputs.at(i), + indexed_inputs.at(i))); } + } +} +void IndexLowering::handleBlockReduction( + const GroupedReductionOp* grouped_rop, + const std::vector& outputs, + const std::vector& inputs) { + TORCH_INTERNAL_ASSERT(ir_utils::isTvOp(grouped_rop)); + + GroupedReductionOp* indexed_rop = IrBuilder::create( + grouped_rop->getReductionOpTypes(), + grouped_rop->initVals(), + outputs, + inputs, + grouped_rop->isAllreduce()); + if (grouped_rop->predicate()) { + indexed_rop->setPredicate(grouped_rop->predicate()); + } + if (grouped_rop->writePredicate()) { + indexed_rop->setWritePredicate(grouped_rop->writePredicate()); + } + + pushBack(indexed_rop); + GpuLower::current()->propagateExprInfo(grouped_rop, back()); +} + +void IndexLowering::handleGridReduction( + const GroupedReductionOp* grouped_rop, + const std::vector& outputs, + const std::vector& inputs) { + const auto out_tv = ir_utils::getTvOutput(grouped_rop); + const auto out_domain = out_tv->domain(); + + TORCH_INTERNAL_ASSERT(out_domain->hasGridReduction()); + + // If we do a grid reduction we can't have a reduction axis that is not bound + // to a grid or block dim. 
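As a rough standalone illustration of the constraint stated in the comment above, a check along these lines rejects stages that mix a serial reduction axis with a grid reduction; the Axis struct and validateGridReduction below are simplified stand-ins, not the real IterDomain API.

#include <algorithm>
#include <stdexcept>
#include <vector>

struct Axis {
  bool is_thread = false;    // bound to threadIdx/blockIdx
  bool is_reduction = false;
  long extent = 1;
};

// A grid reduction cannot coexist with a serial (non-parallelized) reduction
// axis of extent > 1 in the same stage; such an axis has to be rfactor'ed out
// so the serial part runs before the grid part.
void validateGridReduction(const std::vector<Axis>& axes) {
  const bool has_serial_reduction = std::any_of(
      axes.begin(), axes.end(), [](const Axis& a) {
        return !a.is_thread && a.is_reduction && a.extent != 1;
      });
  if (has_serial_reduction) {
    throw std::runtime_error(
        "Serial reduction mixed with a grid reduction; rfactor the serial "
        "reduction first.");
  }
}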
+ TORCH_INTERNAL_ASSERT( + std::none_of( + out_domain->domain().begin(), + out_domain->domain().end(), + [](IterDomain* id) { + return !id->isThread() && id->isReduction() && + !id->extent()->isOneInt(); + }), + "Found a reduction stage that has both a non-parallelized ", + "reduction and a grid reduction. This is not supported, ", + "please use rfactor to do the serialized reduction first, ", + "then the grid reduction."); + + // When using the fused reduction in a loop, the global work buffer + // is double buffered to save global synchronizations. + auto is_within_a_loop = std::any_of( + out_domain->domain().begin(), + out_domain->domain().end(), + [](IterDomain* id) { return !isTrivialIterDomain(id); }); + + std::vector reduce_buffers; + std::transform( + outputs.begin(), + outputs.end(), + std::back_inserter(reduce_buffers), + [&](Val* output) { + return ir_utils::allocGlobalBufferForGridComm( + getGridCommWorkBufferSize( + out_domain, + for_loops_, + (grouped_rop->isAllreduce() && is_within_a_loop ? 2 : 1)), + output->dtype(), + false); + }); + + const auto sync_buffer = ir_utils::allocGlobalBufferForGridComm( + getGridSyncBufferSize(out_domain, for_loops_), DataType::Int, true); + + // The thread predicate for GridReduction needs to be set + // separately from the main predicate. Do not combine them like + // other expressions. + const auto& thread_pred = + GpuLower::current()->threadPredMap().getPredicatedParallelTypes(out_tv); + + auto grid_reduction = IrBuilder::create( + grouped_rop->getReductionOpTypes(), + grouped_rop->initVals(), + outputs, + inputs, + reduce_buffers, + sync_buffer, + grouped_rop->isAllreduce()); + + grid_reduction->setThreadPredicate(thread_pred); + + if (grouped_rop->predicate()) { + grid_reduction->setPredicate(grouped_rop->predicate()); + } + if (grouped_rop->writePredicate()) { + grid_reduction->setWritePredicate(grouped_rop->writePredicate()); + } + + for (auto reduce_buffer : reduce_buffers) { pushBack(reduce_buffer); - pushBack(sync_buffer); - pushBack(grid_reduction); } + pushBack(sync_buffer); + pushBack(grid_reduction); + GpuLower::current()->propagateExprInfo(grouped_rop, back()); - if (!is_block_reduce && !is_grid_reduce) { - // TODO(kir): this breaks our "SSA" form - pushBack(ir_builder_.create(rop->operation(), out, out, in)); + if (grouped_rop->isAllreduce()) { + auto fused_reduction_alloc_reduction = + IrBuilder::create(grid_reduction); + insertAtTopLevel(fused_reduction_alloc_reduction); } } -void IndexLowering::visit(const kir::WelfordOp* wop) { - TORCH_INTERNAL_ASSERT(ir_utils::isTVOp(wop)); +void IndexLowering::handle(const WelfordOp* wop) { + TORCH_INTERNAL_ASSERT(ir_utils::isTvOp(wop)); - const auto out_tv = wop->outAvg()->as(); + const auto out_tv = wop->outAvg()->as(); const auto out_domain = out_tv->domain(); - const bool is_block_reduce = out_domain->hasBlockReduction(); - const bool is_grid_reduce = out_domain->hasGridReduction(); + const bool has_block_reduce = out_domain->hasBlockReduction(); + const bool has_grid_reduce = out_domain->hasGridReduction(); // If we do a grid reduction we can't have a reduction axis that is not bound // to a grid or block dim () - if (is_grid_reduce) { + if (has_grid_reduce) { TORCH_INTERNAL_ASSERT( std::none_of( out_domain->domain().begin(), out_domain->domain().end(), - [](kir::IterDomain* id) { + [](IterDomain* id) { return !id->isThread() && id->isReduction(); }), "Found a reduction stage that has both a non-parallelized ", @@ -322,96 +598,159 @@ void IndexLowering::visit(const kir::WelfordOp* 
wop) { auto out_var = lowerDstIndex(wop->outVar()); auto out_N = lowerDstIndex(wop->outN()); - kir::WelfordOp* welford_op = ir_builder_.create( - out_var, + WelfordOp* indexed_wop = IrBuilder::create( out_avg, + out_var, out_N, - wop->initVar(), wop->initAvg(), + wop->initVar(), wop->initN(), - in_var, in_avg, - in_N); + in_var, + in_N, + wop->isAllreduce()); - kir::WelfordOp* block_welford_op = nullptr; + if (wop->predicate()) { + indexed_wop->setPredicate(wop->predicate()); + } + if (wop->writePredicate()) { + indexed_wop->setWritePredicate(wop->writePredicate()); + } - if (is_block_reduce) { - block_welford_op = welford_op; - if (wop->predicate()) { - block_welford_op->setPredicate(wop->predicate()); - } - if (wop->writePredicate()) { - block_welford_op->setWritePredicate(wop->writePredicate()); - } - pushBack(block_welford_op); - } - - if (is_grid_reduce) { - // Buffer allocation - const auto work_buffer_size = - getGridCommWorkBufferSize(ir_builder_, out_domain); - - const auto out_var_buffer = allocGlobalBufferForGridComm( - ir_builder_, work_buffer_size, out_var->dtype(), false); - const auto out_avg_buffer = allocGlobalBufferForGridComm( - ir_builder_, work_buffer_size, out_avg->dtype(), false); - const auto out_N_buffer = allocGlobalBufferForGridComm( - ir_builder_, work_buffer_size, out_N->dtype(), false); - - const auto sync_buffer = allocGlobalBufferForGridComm( - ir_builder_, - getGridSyncBufferSize(ir_builder_, out_domain), - DataType::Int, - true); - - // Grid Welford instantiation - const auto grid_welford_op = - (block_welford_op == nullptr) ? welford_op : block_welford_op; - - // The thread predicate for GridReduction needs to be set - // separately from the main predicate. Do not combine them like - // other expressions. - const auto& thread_pred = - GpuLower::current()->threadPredMap().getPredicatedParallelTypes( - out_tv->fuserTv()); - - auto grid_welford = ir_builder_.create( - grid_welford_op, - out_var_buffer, - out_avg_buffer, - out_N_buffer, - sync_buffer); - - grid_welford->setThreadPredicate(thread_pred); - - if (wop->predicate()) { - grid_welford->setPredicate(wop->predicate()); + // Serial welford + if (!has_block_reduce && !has_grid_reduce) { + pushBack(indexed_wop); + GpuLower::current()->propagateExprInfo(wop, back()); + return; + } + + // Block-only welford + if (!has_grid_reduce) { + pushBack(indexed_wop); + GpuLower::current()->propagateExprInfo(wop, back()); + return; + } + + handleGridWelford(indexed_wop); +} + +void IndexLowering::handleGridWelford(WelfordOp* indexed_wop) { + const auto out_tv = indexed_wop->out()->as()->view(); + const auto out_domain = out_tv->domain(); + + // Buffer allocation + // When using the fused reduction in a loop, the global work buffer + // is double buffered to save global synchronizations. + auto is_within_a_loop = std::any_of( + out_domain->domain().begin(), + out_domain->domain().end(), + [](IterDomain* id) { return !isTrivialIterDomain(id); }); + + // TODO: See the comment on the same variable in handleGridReduction + const bool privatize_buffer = !indexed_wop->isAllreduce(); + + const auto work_buffer_size = getGridCommWorkBufferSize( + out_domain, + privatize_buffer ? for_loops_ : std::vector(), + indexed_wop->isAllreduce() && is_within_a_loop ? 
2 : 1); + + const auto out_var_buffer = ir_utils::allocGlobalBufferForGridComm( + work_buffer_size, indexed_wop->outVar()->dtype(), false); + const auto out_avg_buffer = ir_utils::allocGlobalBufferForGridComm( + work_buffer_size, indexed_wop->outAvg()->dtype(), false); + const auto out_N_buffer = ir_utils::allocGlobalBufferForGridComm( + work_buffer_size, indexed_wop->outN()->dtype(), false); + + const auto sync_buffer = ir_utils::allocGlobalBufferForGridComm( + getGridSyncBufferSize( + out_domain, + privatize_buffer ? for_loops_ : std::vector()), + DataType::Int, + true); + + const auto entrance_ind = privatize_buffer + ? getEntranceLinIndGridReduce(for_loops_) + : GpuLower::current()->kernel()->zeroVal(); + const auto n_entrances = privatize_buffer + ? getEntranceCountGridReduce(for_loops_) + : GpuLower::current()->kernel()->oneVal(); + + // The thread predicate for GridReduction needs to be set + // separately from the main predicate. Do not combine them like + // other expressions. + const auto& thread_pred = + GpuLower::current()->threadPredMap().getPredicatedParallelTypes(out_tv); + + auto grid_welford = IrBuilder::create( + indexed_wop, + out_var_buffer, + out_avg_buffer, + out_N_buffer, + sync_buffer, + entrance_ind, + n_entrances); + + grid_welford->setThreadPredicate(thread_pred); + + const bool block_reduce_separated = + out_domain->hasBlockReduction() && !indexed_wop->isAllreduce(); + + if (indexed_wop->predicate()) { + if (block_reduce_separated) { + grid_welford->setPredicate(IrBuilder::create( + GpuLower::current()->kernel()->trueVal())); + } else { + grid_welford->setPredicate(indexed_wop->predicate()); } + } - pushBack(out_var_buffer); - pushBack(out_avg_buffer); - pushBack(out_N_buffer); - pushBack(sync_buffer); - pushBack(grid_welford); + if (indexed_wop->writePredicate()) { + grid_welford->setWritePredicate(indexed_wop->writePredicate()); } - if (!is_block_reduce && !is_grid_reduce) { - pushBack(welford_op); + if (block_reduce_separated) { + pushBack(indexed_wop); + GpuLower::current()->propagateExprInfo(indexed_wop, back()); + } + + pushBack(out_var_buffer); + pushBack(out_avg_buffer); + pushBack(out_N_buffer); + pushBack(sync_buffer); + pushBack(grid_welford); + GpuLower::current()->propagateExprInfo(indexed_wop, back()); + + if (indexed_wop->isAllreduce()) { + // When using the fused reduction, allocate the reduction object at + // the outer-most scope + auto fused_reduction_alloc_reduction = + IrBuilder::create(grid_welford); + insertAtTopLevel(fused_reduction_alloc_reduction); } } -void IndexLowering::visit(const kir::BroadcastOp* bop) { - TORCH_INTERNAL_ASSERT(ir_utils::isTVOp(bop)); +void IndexLowering::handle(const MmaOp* mma) { + const auto a = lowerSrcIndex(mma->inA(), mma->out()); + const auto b = lowerSrcIndex(mma->inB(), mma->out()); + const auto out = lowerDstIndex(mma->out()); + auto mma_indexed = + IrBuilder::create(out, a, b, mma->init(), mma->options()); + pushBack(mma_indexed); + GpuLower::current()->propagateExprInfo(mma, back()); +} - const auto out_tv = bop->out()->as(); +void IndexLowering::handle(const BroadcastOp* bop) { + TORCH_INTERNAL_ASSERT(ir_utils::isTvOp(bop)); + + const auto out_tv = bop->out()->as(); const auto out = lowerDstIndex(bop->out()); const auto in = lowerSrcIndex(bop->in(), bop->out()); - auto indexed_expr = ir_builder_.create(out, in); + auto indexed_expr = + IrBuilder::create(out, in, bop->getBroadcastDimFlags()); const ParallelTypeBitmap parallel_bitmap = - GpuLower::current()->threadPredMap().getParallelBroadcastDomains( - 
out_tv->fuserTv()); + GpuLower::current()->threadPredMap().getParallelBroadcastDomains(out_tv); const bool block_x = parallel_bitmap.get(ParallelType::BIDx); const bool block_y = parallel_bitmap.get(ParallelType::BIDy); @@ -424,24 +763,19 @@ void IndexLowering::visit(const kir::BroadcastOp* bop) { const bool grid_broadcast_needed = block_x || block_y || block_z; if (!grid_broadcast_needed) { pushBack(indexed_expr); + GpuLower::current()->propagateExprInfo(bop, back()); return; } // Grid broadcast const auto out_domain = out_tv->domain(); - const auto broadcast_buffer = allocGlobalBufferForGridComm( - ir_builder_, - getGridCommWorkBufferSize(ir_builder_, out_domain), - out->dtype(), - false); + const auto broadcast_buffer = ir_utils::allocGlobalBufferForGridComm( + getGridCommWorkBufferSize(out_domain), out->dtype(), false); - const auto sync_buffer = allocGlobalBufferForGridComm( - ir_builder_, - getGridSyncBufferSize(ir_builder_, out_domain), - DataType::Int, - true); + const auto sync_buffer = ir_utils::allocGlobalBufferForGridComm( + getGridSyncBufferSize(out_domain), DataType::Int, true); - auto grid_broadcast = ir_builder_.create( + auto grid_broadcast = IrBuilder::create( indexed_expr, broadcast_buffer, sync_buffer); if (bop->predicate()) { @@ -451,21 +785,27 @@ void IndexLowering::visit(const kir::BroadcastOp* bop) { pushBack(broadcast_buffer); pushBack(sync_buffer); pushBack(grid_broadcast); + GpuLower::current()->propagateExprInfo(bop, back()); } -void IndexLowering::visit(const kir::Allocate* allocate) { +void IndexLowering::handle(const kir::Allocate* allocate) { // TODO(kir): remove the need for const_cast pushBack(const_cast(allocate)); // NOLINT } -void IndexLowering::visit(const kir::Sync* sync) { +void IndexLowering::handle(const kir::BlockSync* sync) { + // TODO(kir): remove the need for const_cast + pushBack(const_cast(sync)); // NOLINT +} + +void IndexLowering::handle(const kir::GridSync* sync) { // TODO(kir): remove the need for const_cast - pushBack(const_cast(sync)); // NOLINT + pushBack(const_cast(sync)); // NOLINT } -void IndexLowering::generate(const std::vector& exprs) { +void IndexLowering::generate(const std::vector& exprs) { for (auto expr : exprs) { - expr->accept(this); + OptOutConstDispatch::handle(expr); } } diff --git a/torch/csrc/jit/codegen/cuda/lower_index.h b/torch/csrc/jit/codegen/cuda/lower_index.h index 5eb27c78f283..dfb14933770e 100644 --- a/torch/csrc/jit/codegen/cuda/lower_index.h +++ b/torch/csrc/jit/codegen/cuda/lower_index.h @@ -1,10 +1,10 @@ #pragma once -#include +#include #include #include -#include +#include #include #include @@ -14,10 +14,11 @@ namespace jit { namespace fuser { namespace cuda { -class TORCH_CUDA_CU_API IndexLowering : private kir::IrVisitor { +// TODO: Replace with mutator as IndexLowering is replacing expr's with +// versions that are doing indexing +class TORCH_CUDA_CU_API IndexLowering : private OptOutConstDispatch { public: - static std::vector getIndexedExprs( - std::vector incoming_exprs) { + static std::vector getIndexedExprs(std::vector incoming_exprs) { FUSER_PERF_SCOPE("GpuLower::Lower::IndexLowering::getIndexedExprs"); IndexLowering il; il.generate(incoming_exprs); @@ -25,28 +26,56 @@ class TORCH_CUDA_CU_API IndexLowering : private kir::IrVisitor { } private: - IndexLowering(); + IndexLowering() = default; - void pushBack(kir::Expr*); + void pushBack(Expr*); - void visit(const kir::ForLoop*) final; - void visit(const kir::IfThenElse*) final; - void visit(const kir::UnaryOp*) final; - void visit(const 
kir::BinaryOp*) final; - void visit(const kir::TernaryOp*) final; - void visit(const kir::ReductionOp*) final; - void visit(const kir::WelfordOp*) final; - void visit(const kir::BroadcastOp*) final; - void visit(const kir::Allocate*) final; - void visit(const kir::Sync*) final; + // Return the most recently inserted + // expression in the current active + // scope or global scope. + Expr* back() const; - void generate(const std::vector& exprs); + // Insert an expression before the current top-level expression. + void insertAtTopLevel(Expr* expr); - kir::Val* lowerSrcIndex(kir::Val* val, kir::Val* dst) const; - kir::Val* lowerDstIndex(kir::Val* dst) const; + void handle(const ViewAsScalar*) final; + void handle(const UnaryOp*) final; + void handle(const BinaryOp*) final; + void handle(const TernaryOp*) final; + void handle(const ReductionOp*) final; + void handle(const GroupedReductionOp*) final; + void handle(const WelfordOp*) final; + void handle(const MmaOp*) final; + void handle(const BroadcastOp*) final; + + void handle(const kir::ForLoop*) final; + void handle(const kir::IfThenElse*) final; + void handle(const kir::Allocate*) final; + void handle(const kir::BlockSync*) final; + void handle(const kir::GridSync*) final; + + void generate(const std::vector& exprs); + + Val* lowerSrcIndex(Val* val, Val* dst) const; + + Val* lowerDstIndex(Val* dst) const; + + void handleBlockReduction(const ReductionOp* rop, Val* out, Val* in); + void handleGridReduction(const ReductionOp* rop, Val* out, Val* in); + + void handleBlockReduction( + const GroupedReductionOp* rop, + const std::vector& outputs, + const std::vector& inputs); + void handleGridReduction( + const GroupedReductionOp* rop, + const std::vector& outputs, + const std::vector& inputs); + + void handleGridWelford(WelfordOp* new_wop); private: - std::vector lowered_exprs_; + std::vector lowered_exprs_; // This is a slight work around as scope has a couple definitions, we have the // Scope that's in ForLoop/IfThenElse which is really just a wrapper around @@ -55,9 +84,10 @@ class TORCH_CUDA_CU_API IndexLowering : private kir::IrVisitor { // could be either the body or else body of the IfThenElse. However, we want // to understand the nesting of IfThenElse/ForLoop nodes. kir::Scope* active_scope_ = nullptr; - kir::Expr* active_scope_expr_ = nullptr; - kir::IrBuilder ir_builder_; + // Track for loops to send to indexing. Similar to what's done in + // kir::IrVisitor + std::vector for_loops_; }; } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/lower_index_hoist.cpp b/torch/csrc/jit/codegen/cuda/lower_index_hoist.cpp new file mode 100644 index 000000000000..309867477924 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_index_hoist.cpp @@ -0,0 +1,338 @@ +#include +#include +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +namespace { + +// Return leaf domains of a given domain. 
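The helper that follows queries which leaf domains of a tensor a given domain feeds into. A toy model of that reachability query, with plain integers and an adjacency map standing in for IterDomains and the real DependencyCheck utility, might look like this.

#include <unordered_map>
#include <unordered_set>
#include <vector>

using Id = int;

// Collect the leaf ids reachable from indexed_id through the transformation
// graph (id -> ids derived from it).
std::unordered_set<Id> usedLeafIds(
    Id indexed_id,
    const std::unordered_map<Id, std::vector<Id>>& produces,
    const std::unordered_set<Id>& leaf_ids) {
  std::unordered_set<Id> used;
  std::vector<Id> stack{indexed_id};
  std::unordered_set<Id> visited;
  while (!stack.empty()) {
    Id id = stack.back();
    stack.pop_back();
    if (!visited.insert(id).second) {
      continue;
    }
    if (leaf_ids.count(id)) {
      used.insert(id);
    }
    auto it = produces.find(id);
    if (it != produces.end()) {
      stack.insert(stack.end(), it->second.begin(), it->second.end());
    }
  }
  return used;
}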
+std::unordered_set getUsedLeafIds( + IterDomain* id, + TensorDomain* td) { + const auto all_vals_between = DependencyCheck::getAllValsBetween( + {id}, {td->domain().begin(), td->domain().end()}); + + std::unordered_set used_leaf_ids; + + for (const auto leaf : td->domain()) { + if (std::find(all_vals_between.begin(), all_vals_between.end(), leaf) != + all_vals_between.end()) { + used_leaf_ids.insert(leaf); + } + } + + TORCH_INTERNAL_ASSERT( + !used_leaf_ids.empty(), + "No used id found: ", + id->toString(), + ", ", + td->toString()); + + return used_leaf_ids; +} + +} // namespace + +CommonIndexKey::CommonIndexKey( + IterDomain* consumer_indexed_id, + TensorDomain* consumer_td, + TensorDomain* ref_td, + const std::unordered_map& ref_index_map, + const std::vector& loops) { + auto gpu_lower = GpuLower::current(); + + concrete_indexed_id_ = gpu_lower->caMap()->getConcreteMappedID( + consumer_indexed_id, IdMappingMode::EXACT); + + const auto consumer_leaf_ids = + getUsedLeafIds(consumer_indexed_id, consumer_td); + + // Convert to Parallel concrete IDs to find matching loops. + std::unordered_set concrete_leaf_ids; + for (auto& id : consumer_leaf_ids) { + concrete_leaf_ids.insert( + gpu_lower->caMap()->getConcreteMappedID(id, IdMappingMode::LOOP)); + } + + // Find used loops and their index vals + for (const auto i : c10::irange(loops.size())) { + auto loop = loops.at(i); + auto loop_id = gpu_lower->caMap()->getConcreteMappedID( + loop->iter_domain(), IdMappingMode::LOOP); + auto it = concrete_leaf_ids.find(loop_id); + if (it != concrete_leaf_ids.end()) { + // This leaf reference id is used for indexing the consumer id + used_loops_.push_back(loop); + auto index_it = ref_index_map.find(ref_td->axis(i)); + TORCH_INTERNAL_ASSERT( + index_it != ref_index_map.end(), + "Index not found for leaf ID, ", + ref_td->axis(i)->toString()); + loop_index_vals_.push_back(index_it->second); + } + } + + TORCH_INTERNAL_ASSERT( + !used_loops_.empty(), + "No loop used for indexing found. ", + consumer_indexed_id->toString()); + + TORCH_INTERNAL_ASSERT( + consumer_leaf_ids.size() == used_loops_.size(), + "consumer_leaf_ids.size() = ", + consumer_leaf_ids.size(), + ", used_loops_.size() == ", + used_loops_.size(), + ", loops.size() == ", + loops.size()); +} + +bool CommonIndexKey::operator==(const CommonIndexKey& other) const { + auto gpu_lower = GpuLower::current(); + + if (concrete_indexed_id_ != other.concrete_indexed_id_) { + return false; + } + + if (used_loops_.size() != other.used_loops_.size()) { + return false; + } + + // Check if both CommonIndexKeys use the same loops. If not, it's + // still valid to share the same hoisted index as long as: 1) each + // loop pair is mapped with the CA index map, and 2) they are not + // instantiated as actual loops. + for (const auto i : c10::irange(used_loops_.size())) { + auto lhs_loop = used_loops_.at(i); + auto rhs_loop = other.used_loops_.at(i); + if (lhs_loop == rhs_loop) { + continue; + } + if (gpu_lower->caMap()->areMapped( + lhs_loop->iter_domain(), + rhs_loop->iter_domain(), + IdMappingMode::EXACT) && + lhs_loop->isTrivial() && rhs_loop->isTrivial()) { + continue; + } + return false; + } + + for (const auto i : c10::irange(loop_index_vals_.size())) { + auto lhs_index = loop_index_vals_.at(i); + auto rhs_index = other.loop_index_vals_.at(i); + if (lhs_index == rhs_index) { + continue; + } + // Initial index variables can have some additions such as magic + // zero and "1" when used in producer indexing for double buffered + // tensors. 
Thus, the initial variables themselves may be + // different, and its components need to be examined. An easy way + // is to flatten them to strings as follows. + auto lhs_str = loop_index_vals_.at(i)->toInlineString(); + auto rhs_str = other.loop_index_vals_.at(i)->toInlineString(); + if (lhs_str == rhs_str) { + continue; + } + + return false; + } + + return true; +} + +std::string CommonIndexKey::toString() const { + TORCH_INTERNAL_ASSERT(concrete_indexed_id_ != nullptr); + std::stringstream ss; + ss << "CommonIndexKey: " << concrete_indexed_id_->toString(); + ss << ", { "; + for (auto loop : used_loops_) { + ss << loop->iter_domain()->toString() << " "; + } + ss << "}"; + ss << ", { "; + for (auto val : loop_index_vals_) { + ss << val->toString() << " "; + } + ss << "}"; + return ss.str(); +} + +std::pair CommonIndexMap::insert( + IterDomain* indexed_consumer_id, + TensorDomain* consumer_td, + TensorDomain* ref_td, + const std::unordered_map& ref_index_map, + const std::vector& loops, + Val* index) { + if (index->definition() == nullptr) { + // Only expression is eligible to hoist + return {index, false}; + } + + const CommonIndexKey key( + indexed_consumer_id, consumer_td, ref_td, ref_index_map, loops); + + Val* hoisted_index = nullptr; + bool new_index_inserted = false; + + // If already mapped, return the previously mapped index + auto it = common_index_map_.find(key); + if (it != common_index_map_.end()) { + hoisted_index = it->second; + new_index_inserted = false; + ++use_counts_.at(key); + } else { + common_index_map_.emplace(key, index); + hoisted_index = index; + new_index_inserted = true; + use_counts_[key] = 1; + } + + return {hoisted_index, new_index_inserted}; +} + +namespace { + +//! Insertion point of allocation +struct CommonIndexInsertionInfo { + Expr* ref = nullptr; + kir::Scope* scope = nullptr; +}; + +// Inserts allocations of hoisted indices +class CommonIndexInserter : private kir::ExprMutator { + public: + static std::vector run( + const std::vector& exprs, + const CommonIndexMap& common_indices) { + CommonIndexInserter inserter(exprs, common_indices); + return inserter.exprs_; + } + + private: + CommonIndexInserter( + const std::vector& exprs, + const CommonIndexMap& common_index_map) + : common_index_map_(common_index_map) { + // Create a map to keys from loops where they should be inserted + for (const auto& kv : common_index_map.commonIndexMap()) { + const auto& key = kv.first; + // Only consider indices used multiple times + if (!usedMultipleTimes(key)) { + continue; + } + TORCH_INTERNAL_ASSERT(!key.usedLoops().empty()); + auto insertion_loop = key.usedLoops().back(); + innermost_used_loop_map_[insertion_loop].push_back(key); + } + + traverseAndInsert(exprs); + } + + CommonIndexInsertionInfo findInsertionPoint( + const CommonIndexKey& key, + kir::ForLoop* current_loop) const { + CommonIndexInsertionInfo info; + + // Allocation must be inside any used non-trivial loop. Since the + // loop index value is constant if a loop is trivial, allocation + // does not need to be inside trivial loops. + for (const auto loop : key.usedLoops()) { + if (!loop->isTrivial()) { + info.ref = loop->body()[0]; + info.scope = &(loop->body()); + } + } + + // If no non-trivial used loop is found, insert at the top-level + // scope just before the outer-most loop. + if (info.ref == nullptr) { + info.ref = scope_exprs_.empty() ? 
current_loop : scope_exprs_.at(0); + info.scope = nullptr; + } + + return info; + } + + using kir::ExprMutator::handle; + + void handle(kir::ForLoop* loop) final { + auto innermost_loop_map_it = innermost_used_loop_map_.find(loop); + if (innermost_loop_map_it == innermost_used_loop_map_.end()) { + kir::ExprMutator::handle(loop); + return; + } + + for (const auto& key : innermost_loop_map_it->second) { + auto common_index = common_index_map_.commonIndexMap().at(key); + + // Insert only when the index is used multiple times and is not + // yet inserted. + if (inserted_indices_.find(common_index) != inserted_indices_.end()) { + continue; + } + + // Make the type of the hoisted index be the index type of the + // kernel, which can be either int64_t or int. Not very clean, + // but this seems to be the quickest way to use the index type + // as we don't have a scalar IR node for the index type. + common_index->resolveIndexDtype(); + + auto alloc = IrBuilder::create( + common_index, + MemoryType::Local, + GpuLower::current()->kernel()->oneVal()); + const auto common_index_def = common_index->definition(); + TORCH_INTERNAL_ASSERT( + common_index_def != nullptr, + "Hoisted index must have a definition. ", + common_index->toString()); + + const auto insertion_info = findInsertionPoint(key, loop); + registerInsertBefore(insertion_info.ref, alloc, insertion_info.scope); + registerInsertBefore( + insertion_info.ref, common_index_def, insertion_info.scope); + + // Track inserted index + inserted_indices_.emplace(common_index); + } + + kir::ExprMutator::handle(loop); + } + + bool usedMultipleTimes(const CommonIndexKey& key) { + auto it = common_index_map_.useCounts().find(key); + TORCH_INTERNAL_ASSERT( + it != common_index_map_.useCounts().end(), + "Key not found in the use-count map: ", + key.toString()); + return it->second > 1; + } + + private: + const CommonIndexMap& common_index_map_; + //! Map to CommonIndexKeys from their innermost used loops + std::unordered_map> + innermost_used_loop_map_; + //! Keep track of inserted indices + std::unordered_set inserted_indices_; +}; + +} // namespace + +std::vector allocateCommonIndices(const std::vector& exprs) { + return CommonIndexInserter::run(exprs, GpuLower::current()->commonIndexMap()); +} + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_index_hoist.h b/torch/csrc/jit/codegen/cuda/lower_index_hoist.h new file mode 100644 index 000000000000..5e0256f9e844 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_index_hoist.h @@ -0,0 +1,121 @@ +#pragma once + +#include + +#include +#include +#include + +// Hoisting common index subexpressions +// +// Class CommonIndexMap is updated during the lowering as new indices +// are inserted. An index is uniquely identified with CommonIndexKey, +// which consists of the concrete ID of the indexed/predicated domain, +// the for-loops used in the index, and the index vals of the used +// for-loops. +// +// Once all indices are inserted to CommonIndexMap, allocations of +// the hoisted indices are inserted by allocateCommonIndices. Note +// that this assumes that the CUDA code generator does not inline a +// scalar Val with allocation (PR #1434). + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +//! Class to represent unique indexed domains for index +//! hoisting. Uniqueness is determined with the indexed domain +//! itself, the for-loops and their index values. 
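A toy model of the hoisting map sketched in the comments above, where a key hashes only on the indexed domain and insert() either returns the previously registered index or records a new one; Key, KeyHash and IndexHoistMap are illustrative stand-ins, not the real classes.

#include <cstddef>
#include <functional>
#include <string>
#include <unordered_map>
#include <utility>

struct Key {
  int concrete_indexed_id;           // concrete domain being indexed
  std::string loop_index_signature;  // flattened loop/index-val identity
  bool operator==(const Key& o) const {
    return concrete_indexed_id == o.concrete_indexed_id &&
        loop_index_signature == o.loop_index_signature;
  }
};

struct KeyHash {
  std::size_t operator()(const Key& k) const {
    // Hash only on the indexed id; keys sharing an id are disambiguated by
    // operator== on the remaining fields.
    return std::hash<int>{}(k.concrete_indexed_id);
  }
};

class IndexHoistMap {
 public:
  // Returns {hoisted index, true if newly inserted}.
  std::pair<std::string, bool> insert(const Key& key, std::string index) {
    auto it = map_.find(key);
    if (it != map_.end()) {
      ++use_counts_[key];
      return {it->second, false};
    }
    map_.emplace(key, index);
    use_counts_[key] = 1;
    return {index, true};
  }

 private:
  std::unordered_map<Key, std::string, KeyHash> map_;
  std::unordered_map<Key, int, KeyHash> use_counts_;
};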
+class CommonIndexKey { + friend struct CommonIndexKeyHash; + + public: + //! \param consumer_indexed_id Indexed consumer domain + //! \param consumer_td TensorDomain of consumer_indexed_id + //! \param ref_td Reference domain at the time of indexing + //! \param ref_index_map Index map of the reference domain + //! \param loops Loop structure where this id is indexed + CommonIndexKey( + IterDomain* consumer_indexed_id, + TensorDomain* consumer_td, + TensorDomain* ref_td, + const std::unordered_map& ref_index_map, + const std::vector& loops); + + const IterDomain* concreteIndexedId() const { + return concrete_indexed_id_; + } + + const std::vector& usedLoops() const { + return used_loops_; + } + + const std::vector& loopIndexVals() const { + return loop_index_vals_; + } + + bool operator==(const CommonIndexKey& other) const; + + std::string toString() const; + + private: + //! Concrete domain of indexed domain + IterDomain* concrete_indexed_id_ = nullptr; + //! Loops used for the index + std::vector used_loops_; + //! Loop index vals for the used loops + std::vector loop_index_vals_; +}; + +struct CommonIndexKeyHash { + std::size_t operator()(const CommonIndexKey& key) const { + auto h = std::hash{}(key.concrete_indexed_id_); + // NOTE: do not use other fields as the pointers can be different + // even when two keys can share the same index + return h; + } +}; + +//! Map to hold hoisted common indices +class TORCH_CUDA_CU_API CommonIndexMap { + public: + //! Register an indexd consumer domain to hoist + //! + //! Returns a corresponding hoisted index and a flag indicating if a + //! new index is inserted. + //! + //! Consumer domains are used even for producer indexing since + //! producer domains in producer indexing are temporary replay + //! domains. + std::pair insert( + IterDomain* indexed_consumer_id, + TensorDomain* consumer_td, + TensorDomain* ref_td, + const std::unordered_map& ref_index_map, + const std::vector& loops, + Val* index); + + const auto& commonIndexMap() const { + return common_index_map_; + } + + const auto& useCounts() const { + return use_counts_; + } + + private: + //! Map to hold hoisted common indices + std::unordered_map + common_index_map_; + std::unordered_map use_counts_; +}; + +//! Insert allocations of hoisted indices. Must be called after +//! collecting all common indices. +std::vector allocateCommonIndices(const std::vector& exprs); + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_insert_syncs.cpp b/torch/csrc/jit/codegen/cuda/lower_insert_syncs.cpp index 0947ef0f5790..34f3068d0699 100644 --- a/torch/csrc/jit/codegen/cuda/lower_insert_syncs.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_insert_syncs.cpp @@ -1,8 +1,8 @@ #include #include +#include #include -#include -#include +#include #include #include @@ -33,8 +33,8 @@ class SmemAllocMap { public: //! Insert a new node if it's a SMEM allocation void insert(kir::Allocate* alloc) { - if (auto tv = dynamic_cast(alloc->buffer())) { - if (tv->memoryType() == MemoryType::Shared) { + if (auto tv = dynamic_cast(alloc->buffer())) { + if (tv->getMemoryType() == MemoryType::Shared) { // Note that a TensorView can have two allocations due to // unswitch. auto p = map_.insert({tv, alloc}); @@ -50,290 +50,313 @@ class SmemAllocMap { } } - //! Get the buffer that is actually allocated for a given TV - kir::TensorView* getRealBuffer(kir::TensorView* tv) const { + //! 
Run through aliases to get the buffer that is actually allocated for a + //! given TV + TensorView* getRealBuffer(TensorView* tv) const { auto it = map_.find(tv); TORCH_INTERNAL_ASSERT( - it != map_.end(), "Allocation not found for ", kir::toString(tv)); + it != map_.end(), "Allocation not found for ", tv->toString()); const kir::Allocate* alloc = it->second; while (alloc->alias()) { alloc = alloc->alias(); } auto buf = alloc->buffer(); - TORCH_INTERNAL_ASSERT(buf->isA()); - return buf->as(); + TORCH_INTERNAL_ASSERT(buf->isA()); + return buf->as(); } private: - std::unordered_map map_; + std::unordered_map map_; }; -//! Insert WAR sync for a given ForLoop -class LocalSyncInserterForLoop { - using TvSet = std::unordered_set; +struct WarMemoryInfo { + // True if there's a sync after the last read within the alloc loop. + bool sync_after_read = false; - public: - //! Insert Sync nodes at the end of a given for-loop when a WAR - //! hazard may happen. - LocalSyncInserterForLoop(kir::ForLoop* fl, SmemAllocMap& alloc_map) - : alloc_map_(alloc_map) { - for (auto expr : fl->body().exprs()) { - handle(expr); - } + // True if there's a sync before the first write. There can be multiple writes + // from memory aliasing. + bool sync_before_write = false; - // No need to insert sync when the loop is not actually generated - if (fl->iter_domain()->isThread() || fl->iter_domain()->isBroadcast()) { - return; - } - - // Determine if any smem TV is written to at beginning of the for-loop - // and whether that smem TV is read from at the end of the for-loop - // Insert new SyncThreads at end of for-loop to prevent WAR race condition - // - // TODO: replace __syncthreads with __threadfence for alias ops - // - if (detectIntersection(initial_, final_) && - !fl->body().exprs().back()->isA() && !is_last_op_sync_) { - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - fl->body().push_back(ir_builder.create(true)); - initial_sync_ = true; - is_last_op_sync_ = true; - final_.clear(); - } - } + // Has there been a read of this memory location + bool read_hit = false; - const auto& initial() const { - return initial_; - } + // Has there been *the* write to this memory location, assumes single write + // instruction (needs to be before conditionals added to code) + bool write_hit = false; - const auto& final() const { - return final_; - } + // For loop this TV is compute_at'ed in. + kir::ForLoop* ca_loop = nullptr; +}; - const auto& all_smem_inputs() const { - return all_smem_inputs_; +// To prevent shared memory from being over written before it is read, a +// synchronization point has to be inserted either between the allocation of an +// SMEM buffer and where we write into it, or after the buffer's last read +// before exiting the allocation's scope. +// +// e.g. +// for i: +// "alloc A" in shared memory - This is really marked by the compute_at point +// sync_loc_0 +// for j: +// sync_loc_1 +// for k: +// sync_loc_2 +// A = ... +// for k: +// ... = ... A +// for j: +// for k: +// ... = ... A +// sync_loc_3 +// sync_loc_4 +// sync_loc_5 +// +// All sync locations here provide valid protection that memory in A is finished +// being read before it is over written in the next iteration +// +// Insertion of sync threads will be done from the inner most position to the +// outer most. If a sync protecting the buffer is not already placed, the +// location prefered for the sync threads is the last possible position. 
One +// future optimization could be to not sync on the last iteration of the loop +// the sync is placed in. +class WarSyncInserter : private kir::ExprMutator { + public: + static std::vector insert(const std::vector& exprs) { + WarSyncInserter inserter(exprs); + return inserter.exprs_; } - const auto& all_smem_outputs() const { - return all_smem_outputs_; + private: + //! Insert Sync nodes at the end of a given for-loop when a WAR + //! hazard may happen. + WarSyncInserter(const std::vector& exprs) { + auto& lower_alloc_info_map = GpuLower::current()->localAllocationInfoMap(); + for (const auto& entry : lower_alloc_info_map) { + alloc_map_.insert(entry.first); + } + kir::ExprMutator::traverseAndInsert(exprs); } - void handle(kir::Expr* expr) { - if (ir_utils::isTVOp(expr)) { - is_last_op_sync_ = false; - - // For this SyncInserter - if (initial_sync_) { - addInputSmemTvs(expr, final_); - } else { - addInputSmemTvs(expr, final_); - addOutputSmemTvs(expr, initial_); + void handle(kir::IfThenElse* ite) final { + TORCH_INTERNAL_ASSERT( + ite->elseBody().empty(), + "Pass does not support conditional flow,", + " needs to be done before conditional execution is lowered."); + kir::ExprMutator::handle(ite); + } + + void handle(kir::BlockSync* sync) final { + // Register the sync for the active for loop + sync_hit_.back() = true; + // Run through the active allocations, if a read was hit, register there was + // a sync after the read. If there's subsequent reads on this buffer the + // sync_after_read will be cleared. + for (auto& entry : smem_allocations_) { + auto& alloc_stack = entry.second; + if (alloc_stack.back().read_hit) { + alloc_stack.back().sync_after_read = true; } - - // For parent SyncInserter - addOutputSmemTvs(expr, all_smem_outputs_); - addInputSmemTvs(expr, all_smem_inputs_); - } else if (auto sync = dynamic_cast(expr)) { - handle(sync); - } else if (auto ite = dynamic_cast(expr)) { - handle(ite); - } else if (auto for_loop = dynamic_cast(expr)) { - handle(for_loop); - } else if (auto alloc = dynamic_cast(expr)) { - alloc_map_.insert(alloc); } } - void handle(kir::Sync* sync) { - is_last_op_sync_ = true; - initial_sync_ = true; - final_.clear(); - } - - void handle(kir::IfThenElse* ite) { - for (auto expr : ite->thenBody().exprs()) { - handle(expr); - } - for (auto expr : ite->elseBody().exprs()) { - handle(expr); + void handle(kir::GridSync* sync) final { + // Register the sync for the active for loop + sync_hit_.back() = true; + // Run through the active allocations, if a read was hit, register there was + // a sync after the read. If there's subsequent reads on this buffer the + // sync_after_read will be cleared. + for (auto& entry : smem_allocations_) { + auto& alloc_stack = entry.second; + if (alloc_stack.back().read_hit) { + alloc_stack.back().sync_after_read = true; + } } } - void handle(kir::ForLoop* fl) { - LocalSyncInserterForLoop child_sync_inserter(fl, alloc_map_); - - const auto& child_inputs = child_sync_inserter.all_smem_inputs(); - const auto& child_outputs = child_sync_inserter.all_smem_outputs(); - const bool maybe_skipped = !fl->start()->isZeroInt() && - !isParallelTypeThread(fl->iter_domain()->parallelType()); - - // Default - Track all smem inputs / outputs - all_smem_inputs_.insert(child_inputs.begin(), child_inputs.end()); - all_smem_outputs_.insert(child_outputs.begin(), child_outputs.end()); - - // Propagate the last_op_sync flag from the child loop. 
If the - // child is deterministically executed at least once, just set the - // flag with the child flag. Otherwise, conservatively set the - // flag, i.e., if the current flag is true and the child flag is - // also true, we can say the last op is still sync. - if (!maybe_skipped) { - is_last_op_sync_ = child_sync_inserter.is_last_op_sync_; - } else { - is_last_op_sync_ = - is_last_op_sync_ && child_sync_inserter.is_last_op_sync_; + // Checks if fl or loops within it have hit a sync + bool syncWithin(kir::ForLoop* fl) { + // If outer most scope check the first sync_hit_ position + if (fl == nullptr) { + return sync_hit_[0]; } - // When the child is not guaranteed to have sync. - if (!child_sync_inserter.initial_sync_) { - // If no sync is yet found, add the child outputs to - // initial. - if (!initial_sync_) { - initial_.insert(child_outputs.begin(), child_outputs.end()); - } - // Add the child inputs to final even when inital_sync is false, - // which only means sync may not be found yet. - final_.insert(child_inputs.begin(), child_inputs.end()); - } else { - // Similar to the above case, but here, the child is guaranteed - // to have sync, so we only need to look at initial and final. - if (!initial_sync_) { - initial_.insert( - child_sync_inserter.initial().begin(), - child_sync_inserter.initial().end()); - } - if (!maybe_skipped) { - initial_sync_ = true; - final_.clear(); - } - final_.insert( - child_sync_inserter.final().begin(), - child_sync_inserter.final().end()); - } - } + // Find the for loop we want to look within + auto fl_it = std::find(for_loops_.begin(), for_loops_.end(), fl); - static bool detectIntersection(const TvSet& left, const TvSet& right) { - for (auto item : left) { - if (right.find(item) != right.end()) { + // Convert it to an index, but add one for the outer most scope + auto fl_i = std::distance(for_loops_.begin(), fl_it) + 1; + + // Start at that index and see if there's syncs within that for loop + for (auto i : c10::irange(fl_i, sync_hit_.size())) { + if (sync_hit_[i]) { return true; } } return false; } - void addOutputSmemTvs(const kir::Expr* expr, TvSet& set) { - for (auto out : expr->outputs()) { - if (auto tv = dynamic_cast(out)) { - if (tv->memoryType() == MemoryType::Shared) { - auto real_tv = alloc_map_.getRealBuffer(tv); - set.insert(real_tv); - } - } + void handle(Expr* expr) final { + // If not a tensor view expression continue with dispatch + if (!ir_utils::isTvOp(expr)) { + kir::ExprMutator::handle(expr); + return; } - } - void addInputSmemTvs(const kir::Expr* expr, TvSet& set) { - for (auto in : expr->inputs()) { - if (auto tv = dynamic_cast(in)) { - if (tv->memoryType() == MemoryType::Shared) { - auto real_tv = alloc_map_.getRealBuffer(tv); - set.insert(real_tv); - } + // Mark write has been hit for all output tvs + auto out_tvs = ir_utils::filterByType(expr->outputs()); + for (auto out_tv : out_tvs) { + if (out_tv->getMemoryType() != MemoryType::Shared || + GpuLower::current()->syncMap().needsRawSync(out_tv).none()) { + continue; } - } - } - private: - //! Allocation map of SMEM buffers - SmemAllocMap& alloc_map_; + auto& entry = getMemInfo(out_tv); - //! Track Shared Memory Inputs (Reads) for parent for-loop - TvSet all_smem_inputs_; + // If this is the first write and there's a sync in one of the loops after + // the compute at loop, then this buffer is protected. + if (syncWithin(entry.ca_loop) && !entry.write_hit) { + entry.sync_before_write = true; + } + entry.write_hit = true; + } - //! 
Track Shared Memory Outputs (Writes) for parent for-loop - TvSet all_smem_outputs_; + // Mark read was hit, if sync_after_read was set, clear it. + auto inp_tvs = ir_utils::filterByType(expr->inputs()); + for (auto inp_tv : inp_tvs) { + if (inp_tv->getMemoryType() != MemoryType::Shared || + GpuLower::current()->syncMap().needsRawSync(inp_tv).none()) { + continue; + } - //! Shared Memory Writes at beginning of the for-loop - //! before first SyncThreads - TvSet initial_; + auto& entry = getMemInfo(inp_tv); + entry.read_hit = true; + // Clear the sync_after_read if it was set because there was another write + entry.sync_after_read = false; + } + } - //! Shared Memory Reads at end of the for-loop - //! Cleared after each SyncThreads - TvSet final_; + void handle(kir::ForLoop* for_loop) final { + // Push loop scope information + auto prev_within_iter_loop_ = within_iter_loop_; + sync_hit_.push_back(false); - //! Track first sync deterministically found in for-loop. Even when a - //! child loop has a sync, if it may not be executed due to non-zero - //! start value, this flag remains false. - bool initial_sync_ = false; + // If there is no real iterating loop WAR syncs aren't necessary + within_iter_loop_ = within_iter_loop_ || !for_loop->isTrivial(); - //! Track if last op is sync - bool is_last_op_sync_ = false; -}; + // Process the expressions in the for loop + kir::ExprMutator::handle(for_loop); -class LocalSyncInserter { - public: - //! Write-After-Read race conditions are only found within for-loops. - //! Sync nodes are inserted directly into the for-loops. - //! The expressions are modified in-place and exprs is const. - static void insertSyncs(const std::vector& exprs) { - LocalSyncInserter inserter; - inserter.insert(exprs); - } + // Sync analysis and cleanup: + // + // Pop for loop stack inside WarMemoryInfo structs if they match this one. + // Erase empty entries so we don't continue to search over them + // + // Insert sync at end of this for loop if any of the entries require + std::vector to_erase; + bool insert_sync = false; + for (auto& entry : smem_allocations_) { + auto& alloc_stack = entry.second; + if (alloc_stack.size() && alloc_stack.back().ca_loop == for_loop) { + if (!alloc_stack.back().sync_after_read && + !alloc_stack.back().sync_before_write) { + insert_sync = within_iter_loop_; + } - private: - void insert(const std::vector& exprs) { - for (auto expr : exprs) { - if (auto fl = dynamic_cast(expr)) { - LocalSyncInserterForLoop sync_inserter(fl, alloc_map_); - } else if (auto ite = dynamic_cast(expr)) { - insert(ite->thenBody().exprs()); - insert(ite->elseBody().exprs()); - } else if (auto alloc = dynamic_cast(expr)) { - alloc_map_.insert(alloc); + alloc_stack.pop_back(); + if (alloc_stack.empty()) { + to_erase.push_back(entry.first); + } } } - } - private: + for (auto tv : to_erase) { + smem_allocations_.erase(tv); + } + + // WAR Sync is necessary in this loop, register its insertion. 
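A compact model of the decision made just above, i.e. whether a trailing block sync must be emitted for the loop that owns a shared-memory buffer; BufferState stands in for WarMemoryInfo and none of the real kir classes are used.

#include <vector>

struct BufferState {
  bool sync_after_read = false;   // a sync was seen after the last read
  bool sync_before_write = false; // a sync was seen before the first write
};

// Returns true if a __syncthreads() has to be placed at the end of the loop.
bool needsWarSyncAtLoopEnd(
    const std::vector<BufferState>& buffers_owned_by_loop,
    bool within_iterating_loop) {
  if (!within_iterating_loop) {
    return false;  // no real iteration around the buffer, nothing gets overwritten
  }
  for (const auto& b : buffers_owned_by_loop) {
    if (!b.sync_after_read && !b.sync_before_write) {
      return true;  // unprotected buffer: next iteration could overwrite it
    }
  }
  return false;
}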
+ if (insert_sync) { + auto sync_expr = IrBuilder::create(true); + kir::ExprMutator::registerInsertAfter( + for_loop->body().exprs().back(), sync_expr, &for_loop->body()); + handle(sync_expr); + } + + // Pop for loop scope information + sync_hit_.pop_back(); + within_iter_loop_ = prev_within_iter_loop_; + } + + // Create a new WarMemoryInfo entry if required and return a reference to it, + // else return the WarMemoryInfo associated with tv + WarMemoryInfo& getMemInfo(TensorView* tv) { + auto maybe_aliased_tv = alloc_map_.getRealBuffer(tv); + auto alloc_it = smem_allocations_.find(maybe_aliased_tv); + auto ca_loop = + loop_utils::getAllocInformation(tv, for_loops_).init_for_loop; + if (alloc_it == smem_allocations_.end()) { + WarMemoryInfo mem_info; + mem_info.ca_loop = ca_loop; + auto entry_it = + smem_allocations_ + .insert(std::make_pair( + maybe_aliased_tv, std::vector({mem_info}))) + .first; + return entry_it->second.back(); + } else if ( + maybe_aliased_tv != tv && alloc_it->second.back().ca_loop != ca_loop) { + WarMemoryInfo mem_info; + mem_info.ca_loop = ca_loop; + auto& alloc_stack = alloc_it->second; + alloc_stack.push_back(mem_info); + return alloc_stack.back(); + } + return alloc_it->second.back(); + } + + //! Allocation map of SMEM buffers. Needed because of SMEM buffer aliasing, + //! need to track the root of the alias to properly insert WAR hazard syncs SmemAllocMap alloc_map_; + + //! Is there a loop nest that has a non-trivial iteration (extent != 1) and + //! not bound to a block/thread. This indicates if a WAR sync is necessary, + //! otherwise the Expr is not in an iterating for loop. + bool within_iter_loop_ = false; + + // Track which loops have hit a sync. Used to see if there's a sync before + // write. + std::vector sync_hit_ = {false}; + + // Keep track of the active allocations we need to protect. Key is the + // "getRealBuffer", not the raw tv. There can be multiple WarMemoryInfo's + // because of aliasing. If the "getRealBuffer" tv has a compute at outside the + // alias tv, each aliased tv in a unique ca_loop has to be tracked separately + // for WAR insertion. + std::unordered_map> smem_allocations_; }; class ExprFlattener : private kir::IrVisitor { private: - void handle(kir::Expr* expr) { + using kir::IrVisitor::handle; + + void handle(Expr* expr) final { if (expr->isA() || expr->isA()) { - expr->accept(this); + kir::IrVisitor::handle(expr); } else { - exprs_.push_back(expr); - } - } - - void visit(const kir::ForLoop* fl) final { - for (auto expr : fl->body().exprs()) { - handle(expr); - } - } - - void visit(const kir::IfThenElse* ite) final { - for (auto expr : ite->thenBody().exprs()) { - handle(expr); - } - for (auto expr : ite->elseBody().exprs()) { - handle(expr); + flat_exprs_.push_back(expr); } } private: - std::vector exprs_; + std::vector flat_exprs_; public: //! Flattens scopes extracting out a single ordered list of exprs. - static std::vector flatten( - const std::vector& loop_nests) { + static std::vector flatten(const std::vector& loop_nests) { ExprFlattener flattener; for (auto expr : loop_nests) { flattener.handle(expr); } - return flattener.exprs_; + return flattener.flat_exprs_; } }; @@ -342,53 +365,70 @@ class ValidatePlacementAfterWrites : private kir::IrVisitor { //! 
Validate no expr in writes found under loop static void validate( kir::ForLoop* loop, - const std::unordered_set& writes) { + const std::unordered_set& writes) { ValidatePlacementAfterWrites validator(writes); validator.handle(loop); } private: - ValidatePlacementAfterWrites(const std::unordered_set& writes) + using kir::IrVisitor::handle; + + ValidatePlacementAfterWrites(const std::unordered_set& writes) : writes_(writes) {} - void handle(kir::Expr* expr) { + void handle(Expr* expr) final { if (expr->isA() || expr->isA()) { - expr->accept(this); + kir::IrVisitor::handle(expr); } else { TORCH_INTERNAL_ASSERT( writes_.find(expr) == writes_.end(), "Block sync must be placed after ", - kir::toString(expr)); - } - } - - void visit(const kir::ForLoop* fl) final { - for (auto expr : fl->body().exprs()) { - handle(expr); - } - } - - void visit(const kir::IfThenElse* ite) final { - for (auto expr : ite->thenBody().exprs()) { - handle(expr); - } - for (auto expr : ite->elseBody().exprs()) { - handle(expr); + expr->toString()); } } private: - const std::unordered_set& writes_; + const std::unordered_set& writes_; }; -class ReadAfterWriteSyncs : public kir::MutableIrVisitor { +namespace { + +Val* getGridSyncBufferSize(const ParallelTypeBitmap& ptb) { + // See the comment above for getGridCommWorkBufferSize. + TORCH_INTERNAL_ASSERT( + ptb.hasBID(), + "Detected needing a grid sync but no grid bits set in bitmap."); + Val* buffer_size = GpuLower::current()->kernel()->oneVal(); + for (auto pt : kParallelTypeBIDs) { + // Synchronized within pt, so all blocks of this PT use the same + // sync buffer location, and thus no need to expand the sync + // buffer size. + if (ptb.get(pt)) { + continue; + } + auto pt_dim = GpuLower::current()->parallelDimensionMap().get(pt); + if (pt_dim == nullptr || pt_dim->isOneInt()) { + continue; + } + buffer_size = IrBuilder::mulExpr(buffer_size, pt_dim); + } + return buffer_size; +} + +} // namespace + +class ReadAfterWriteSyncs : public kir::ExprMutator { private: + using kir::ExprMutator::handle; + //! Traverse up the loop stack from loops_it and if a halo loop is //! found, place a given sync expr before the outer-most halo loop. + // TODO: What needs to be done here for gmem comm? 
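The halo handling described in the comment above can be modeled roughly as follows: walk from the loop the sync would normally go into toward the outer loops and, if any of them is halo-extended, hoist the sync to just before the outer-most such loop. Loop and haloHoistPosition are hypothetical stand-ins, not the real kir::ForLoop API.

#include <cstddef>
#include <optional>
#include <vector>

struct Loop {
  bool halo_extended = false;
};

// Returns the index of the outer-most halo-extended loop at or outside
// `start`, if any; the sync is then placed immediately before that loop
// instead of inside it. loop_stack is ordered outer -> inner.
std::optional<std::size_t> haloHoistPosition(
    const std::vector<Loop>& loop_stack, std::size_t start) {
  for (std::size_t i = 0; i <= start && i < loop_stack.size(); ++i) {
    if (loop_stack[i].halo_extended) {
      return i;  // first hit from the outside is the outer-most halo loop
    }
  }
  return std::nullopt;
}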
bool insertBeforeHaloLoop( std::vector::iterator loops_it, - kir::Sync* sync_expr, - const std::unordered_set& writes) { + Expr* sync_expr, + Expr* maybe_alloc, + const std::unordered_set& writes) { std::vector::iterator halo_loop_it; bool halo_loop_found = false; @@ -420,131 +460,159 @@ class ReadAfterWriteSyncs : public kir::MutableIrVisitor { if (halo_loop_it == for_loops_.begin()) { // place in global scope - auto place_before_it = - std::find(loop_nests_.begin(), loop_nests_.end(), halo_loop); - TORCH_INTERNAL_ASSERT(place_before_it != loop_nests_.end()); - loop_nests_.insert(place_before_it, sync_expr); + auto place_before_it = std::find(exprs_.begin(), exprs_.end(), halo_loop); + TORCH_INTERNAL_ASSERT(place_before_it != exprs_.end()); + exprs_.insert(place_before_it, sync_expr); } else { auto place_in = *(halo_loop_it - 1); - place_in->body().insert_before(halo_loop, sync_expr); + kir::ExprMutator::registerInsertBefore( + halo_loop, sync_expr, &place_in->body()); + if (maybe_alloc != nullptr) { + kir::ExprMutator::registerInsertBefore( + halo_loop, maybe_alloc, &place_in->body()); + } } return true; } - void handle(kir::Expr* expr) { - if (!ir_utils::isTVOp(expr) || expr->isA()) { - expr->accept(this); + void handle(Expr* expr) final { + if (!ir_utils::isTvOp(expr) || expr->isA()) { + kir::ExprMutator::handle(expr); return; } - if (sync_after_.size() > 0 && sync_after_.front() == expr) { - sync_after_.pop_front(); + if (sync_before_.size() > 0 && sync_before_.front().first == expr) { + auto sync_bitmap = sync_before_.front().second; + sync_before_.pop_front(); auto last_writes = last_writes_.front(); last_writes_.pop_front(); // Found that a sync is needed - TORCH_INTERNAL_ASSERT(expr->outputs()[0]->isA()); - auto out_tv = expr->outputs()[0]->as(); // Find where a sync needs to be inserted // This is very similar to how allocations are placed, simply place sync - // after the expression instead of placing like allocation where it goes - // before. - // TODO: This may be a common operation, could be worth making a utility - // out of or saving state for tensor view ID -> for loop + // before the expression at the common alloc point of producers (really + // last_writes because we may have other exprs we're syncing besides the + // producers of this one) // TODO: Explicitly test the 3 cases below + Expr* sync_expr = nullptr; + kir::Allocate* maybe_alloc = nullptr; + if (sync_bitmap.hasBID()) { + maybe_alloc = ir_utils::allocGlobalBufferForGridComm( + getGridSyncBufferSize(sync_bitmap), DataType::Int, true); + sync_expr = IrBuilder::create( + sync_bitmap, maybe_alloc->buffer()); + } else { + sync_expr = IrBuilder::create(); + } - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - auto sync_expr = ir_builder.create(); - if (out_tv->fuserTv()->getComputeAtPosition() == 0) { - // Sync should be placed at global scope, after its outer most loop if - // it has one. - kir::Expr* place_after = for_loops_.size() > 0 ? for_loops_[0] : expr; - // Find location in loop_nests_ - auto place_after_it = - std::find(loop_nests_.begin(), loop_nests_.end(), place_after); + // The expressions in last_writes are those we're protecting the read + // from. To figure out which loop we need a syncthread in, take the inner + // most compute at for loop of all the outputs of the last writes. 
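A small stand-in for the placement rule described in the comment above: each last write contributes the loop matching its compute-at position, and the sync is nested in the deepest such loop present in the current loop nest, falling back to global scope when there is none. Indices into the loop nest play the role of kir::ForLoop pointers here.

#include <cstddef>
#include <optional>
#include <vector>

// Returns the index (into the current loop nest) of the loop the sync must be
// nested in, or std::nullopt for global scope.
std::optional<std::size_t> syncPlacementDepth(
    const std::vector<std::optional<std::size_t>>& producer_ca_loop_indices) {
  std::optional<std::size_t> deepest;
  for (const auto& idx : producer_ca_loop_indices) {
    if (!idx.has_value()) {
      continue;  // producer computed at global scope, imposes no nesting
    }
    if (!deepest.has_value() || *idx > *deepest) {
      deepest = idx;
    }
  }
  return deepest;
}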
+ std::unordered_set sync_within; + + for (auto last_write : last_writes) { + auto write_out_tv = ir_utils::getTvOutput(last_write); TORCH_INTERNAL_ASSERT( - place_after_it != loop_nests_.end(), - "Could not figure out where to place synchronization. ", - "Tried to place after, ", - toString(place_after), - ", but could not find this expression at the global scope."); - loop_nests_.insert(place_after_it + 1, sync_expr); - } else { - // Find the last loop in computeAt of out_tv, this is the loop where we - // would place an allocation for out_tv - auto fuser_tv = out_tv->fuserTv(); - auto lowered_local_id = - GpuLower::current() - ->lowerValue(fuser_tv->axis( - (int)out_tv->fuserTv()->getComputeAtPosition() - 1)) - ->as(); + write_out_tv != nullptr, + "Error in RAW sync insertion, expecting a TV expr, but didn't find one."); + if (write_out_tv->getComputeAtPosition() == 0) { + continue; + } + + auto local_id = + write_out_tv->axis((int)write_out_tv->getComputeAtPosition() - 1); auto loops_it = std::find_if( for_loops_.begin(), for_loops_.end(), - [&lowered_local_id](const auto& loop) { - return GpuLower::current()->caLoopMap().areMapped( - loop->iter_domain(), lowered_local_id) || - loop->iter_domain()->parallelType() == ParallelType::Unroll; + [&local_id](const auto& loop) { + return GpuLower::current()->caMap()->areMapped( + loop->iter_domain(), local_id, IdMappingMode::PERMISSIVE); }); - TORCH_INTERNAL_ASSERT(loops_it != for_loops_.end()); + TORCH_INTERNAL_ASSERT( + loops_it != for_loops_.end(), + "Could not find loop associated with the alloc position of ", + write_out_tv->toString()); + + sync_within.emplace(*loops_it); + } + + // The for loop the sync needs to be in + kir::ForLoop* sync_within_fl = nullptr; + for (auto fl : for_loops_) { + if (sync_within.count(fl)) { + sync_within_fl = fl; + } + } + + if (sync_within_fl == nullptr) { + // Sync should be placed at global scope, after its outer most loop if + // it has one. + Expr* place_before = for_loops_.size() > 0 ? for_loops_[0] : expr; + // Find location in exprs_ + auto place_before_it = + std::find(exprs_.begin(), exprs_.end(), place_before); + TORCH_INTERNAL_ASSERT( + place_before_it != exprs_.end(), + "Could not figure out where to place synchronization. 
", + "Tried to place after, ", + place_before->toString(), + ", but could not find this expression at the global scope."); + if (maybe_alloc != nullptr) { + registerInsertBefore(place_before, maybe_alloc, nullptr); + } + registerInsertBefore(*(place_before_it), sync_expr, nullptr); + } else { + auto sync_within_loop_it = + std::find(for_loops_.begin(), for_loops_.end(), sync_within_fl); // block sync must be placed before halo-extended loops - if (insertBeforeHaloLoop(loops_it, sync_expr, last_writes)) { + if (insertBeforeHaloLoop( + sync_within_loop_it, sync_expr, maybe_alloc, last_writes)) { return; } - auto place_in = *loops_it; - kir::Expr* place_after = nullptr; + auto place_in = *sync_within_loop_it; + Expr* place_before = nullptr; - if (loops_it + 1 == for_loops_.end()) { - // Inline allocation, place after expr - place_after = expr; + if (sync_within_loop_it + 1 == for_loops_.end()) { + // Inline, place before expr + place_before = expr; } else { - // Place allocation after the last computeAt axis - // TODO: may be more efficient to place after the first non-computeAt - // axis - place_after = *(loops_it + 1); + place_before = *(sync_within_loop_it + 1); } - place_in->body().insert_after(place_after, sync_expr); + registerInsertBefore(place_before, sync_expr, &place_in->body()); + if (maybe_alloc != nullptr) { + registerInsertBefore(place_before, maybe_alloc, &place_in->body()); + } } } } - void visit(kir::ForLoop* fl) final { - for_loops_.push_back(fl); - // Modifying in place, make a copy of the vector - const std::vector exprs = fl->body().exprs(); - for (auto expr : exprs) { - handle(expr); - } - for_loops_.pop_back(); - } - - void visit(kir::IfThenElse*) final { + void handle(kir::IfThenElse*) final { TORCH_INTERNAL_ASSERT( false, "Pass does not support conditional statements, ", "this pass should be run before any conditionals are placed in code."); } - // Clear the modify status for all shared memory buffers - static void cleanSharedMemory( - std::unordered_map& smem) { - smem.clear(); - } - // Return a set of expressions that modify shared-memory // tensors. Expressions are excluded when syncthreads are already // placed. 
- std::unordered_set isModifiedSharedMemory( - const std::unordered_map& smem, - const std::vector& tvs) const { - std::unordered_set last_writes; - for (auto tv : tvs) { + std::unordered_set isModifiedSharedMemory( + const std::unordered_map& smem, + const std::vector& tvs) const { + std::unordered_set last_writes; + for (auto tv : ir_utils::filterByType(tvs)) { + if (GpuLower::current()->syncMap().needsRawSync(tv).none()) { + continue; + } + if (tv->getMemoryType() != MemoryType::Shared) { + continue; + } auto it = smem.find(tv); if (it != smem.end()) { last_writes.insert(it->second); @@ -553,93 +621,140 @@ class ReadAfterWriteSyncs : public kir::MutableIrVisitor { return last_writes; } - ReadAfterWriteSyncs(std::vector _loop_nests) - : loop_nests_(std::move(_loop_nests)) { + std::unordered_set isModifiedGlobalMemory( + const std::unordered_map& gmem, + const std::vector& tvs) const { + std::unordered_set last_writes; + for (auto tv : ir_utils::filterByType(tvs)) { + if (GpuLower::current()->syncMap().needsRawSync(tv).none()) { + continue; + } + auto it = gmem.find(tv); + if (it != gmem.end()) { + last_writes.insert(it->second); + } + } + return last_writes; + } + + ReadAfterWriteSyncs(const std::vector& _exprs) { // Fusion shared_memory values // Tracks if shared memory is modified - std::unordered_map smem; + std::unordered_map smem; + std::unordered_map gmem; // Flatten all the expressions - auto flattened_exprs = ExprFlattener::flatten(loop_nests_); + auto flattened_exprs = ExprFlattener::flatten(_exprs); - kir::Expr* prev_tv_expr = nullptr; + Expr* prev_tv_expr = nullptr; for (auto expr : flattened_exprs) { - if (!ir_utils::isTVOp(expr) || expr->isA()) { + if (!ir_utils::isTvOp(expr) || expr->isA()) { continue; } - auto last_writes = isModifiedSharedMemory(smem, expr->inputs()); - if (!last_writes.empty()) { + auto last_gmem_writes = isModifiedGlobalMemory(gmem, expr->inputs()); + if (!last_gmem_writes.empty()) { TORCH_INTERNAL_ASSERT( prev_tv_expr != nullptr, "Can't require sync on inputs, however, detected it's needed."); - sync_after_.push_back(prev_tv_expr); - last_writes_.push_back(last_writes); - cleanSharedMemory(smem); + ParallelTypeBitmap bitmap; + for (auto entry : gmem) { + TORCH_INTERNAL_ASSERT(entry.first->isA()); + auto sync_bits = GpuLower::current()->syncMap().needsRawSync( + entry.first->as()); + bitmap |= sync_bits; + } + + sync_before_.emplace_back(std::make_pair(expr, bitmap)); + last_writes_.push_back(last_gmem_writes); + gmem.clear(); } - for (auto out : expr->outputs()) { - if (out->isA()) { - if (out->as()->memoryType() == MemoryType::Shared) { - smem[out] = expr; + auto last_smem_writes = isModifiedSharedMemory(smem, expr->inputs()); + if (!last_smem_writes.empty()) { + TORCH_INTERNAL_ASSERT( + prev_tv_expr != nullptr, + "Can't require sync on inputs, however, detected it's needed."); + ParallelTypeBitmap bitmap; + bitmap.set(ParallelType::TIDx); + bitmap.set(ParallelType::TIDy); + bitmap.set(ParallelType::TIDz); + sync_before_.emplace_back(std::make_pair(expr, bitmap)); + + // Before clearing `smem`, put all the currently pending smem writes + // in last_writes_. This will make sure all the smem writes will + // be taken into consideration when deciding which loopnest level + // to insert the block sync. see FusionRAWSyncInsertionPlace4. + std::unordered_set smem_writes; + for (auto it : smem) { + // No need to keep track of shared mem writes that does not + // require a RAW block sync. 
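// Standalone sketch (hypothetical types, simplified from the constructor above) of the
// scan ReadAfterWriteSyncs performs: remember the last unsynchronized write to each
// shared/global buffer and, when a later expression reads such a buffer, record a
// "sync before this expression" with a block- or grid-level marker, then clear the
// pending writes that sync will cover. Unlike the real pass, a grid sync is assumed
// here to also cover pending shared-memory writes, to keep the sketch short.
#include <cstddef>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

enum class Mem { Local, Shared, Global };
struct Buffer { std::string name; Mem mem; };
struct FlatExpr {
  std::vector<const Buffer*> inputs;
  std::vector<const Buffer*> outputs;
};
enum class SyncKind { Block, Grid };
struct SyncBefore { std::size_t expr_index; SyncKind kind; };

std::vector<SyncBefore> findRawSyncs(const std::vector<FlatExpr>& exprs) {
  std::unordered_map<const Buffer*, std::size_t> pending_smem, pending_gmem;
  std::vector<SyncBefore> syncs;
  for (std::size_t i = 0; i < exprs.size(); ++i) {
    const FlatExpr& e = exprs[i];
    bool needs_block = false, needs_grid = false;
    for (const Buffer* in : e.inputs) {
      needs_block |= pending_smem.count(in) != 0;
      needs_grid |= pending_gmem.count(in) != 0;
    }
    if (needs_grid) {
      syncs.push_back({i, SyncKind::Grid});
      pending_gmem.clear();
      pending_smem.clear(); // simplification: grid sync also orders shared memory
    } else if (needs_block) {
      syncs.push_back({i, SyncKind::Block});
      pending_smem.clear(); // the block sync covers all pending shared writes
    }
    for (const Buffer* out : e.outputs) {
      if (out->mem == Mem::Shared) pending_smem[out] = i;
      if (out->mem == Mem::Global) pending_gmem[out] = i;
    }
  }
  return syncs;
}

int main() {
  Buffer smem{"T_shared", Mem::Shared};
  std::vector<FlatExpr> exprs = {
      {{}, {&smem}}, // expr 0 writes shared memory
      {{&smem}, {}}, // expr 1 reads it: a block sync must go before expr 1
  };
  for (const auto& s : findRawSyncs(exprs)) {
    std::cout << "sync before expr " << s.expr_index
              << (s.kind == SyncKind::Grid ? " (grid)" : " (block)") << "\n";
  }
  return 0;
}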
+ if (GpuLower::current() + ->syncMap() + .needsRawSync(it.first->as()) + .hasTID()) { + smem_writes.insert(it.second); } } + last_writes_.push_back(smem_writes); + smem.clear(); + } + + for (auto tv : ir_utils::filterByType(expr->outputs())) { + // Double buffered tensors do not need RAW sync to be inserted + // here, except for the initial load part, which is taken care + // separately by DoubleBufferInserter. + if (tv->getMemoryType() == MemoryType::Shared && + !tv->isDoubleBuffered()) { + smem[tv] = expr; + } + if (tv->getMemoryType() == MemoryType::Global) { + gmem[tv] = expr; + } } prev_tv_expr = expr; } - // Insert read after write syncs - const std::vector exprs = loop_nests_; - for (auto expr : exprs) { - handle(expr); - } + kir::ExprMutator::traverseAndInsert(_exprs); TORCH_INTERNAL_ASSERT( - sync_after_.empty(), "Didn't place all required syncs."); + sync_before_.empty(), "Didn't place all required syncs."); } private: //! Keep track of expressions that must be followed by syncthreads - std::deque sync_after_; + std::deque> sync_before_; //! Keep track of write expressions that must be placed before //! syncthreads. //! - //! syncthreads is placed after for each expression of - //! sync_after_. However, if it's inside a loop with halo, it must + //! syncthreads is placed before for each expression of + //! sync_before_. However, if it's inside a loop with halo, it must //! be placed before that. last_writes_ keeps track of expressions //! modifying the smem buffer each syncthreads is used for so that //! it is not placed before those write expressions. - std::deque> last_writes_; - - //! Keep track of for loops while inserting syncthreads - std::vector for_loops_; - - //! Loop-nests where syncthreads are inserted - std::vector loop_nests_; + std::deque> last_writes_; public: - static std::vector insert( - const std::vector& loop_nests) { + static std::vector insert(const std::vector& loop_nests) { ReadAfterWriteSyncs inserter(loop_nests); - return inserter.loop_nests_; + return inserter.exprs_; } }; } // namespace -std::vector insertRawThreadSynchronization( - const std::vector& exprs) { +std::vector insertRawThreadSynchronization( + const std::vector& exprs) { FUSER_PERF_SCOPE("GpuLower::Lower::insertRawThreadSynchronization"); return ReadAfterWriteSyncs::insert(exprs); } -std::vector insertWarThreadSynchronization( - const std::vector& exprs) { +std::vector insertWarThreadSynchronization( + const std::vector& exprs) { FUSER_PERF_SCOPE("GpuLower::Lower::insertWarThreadSynchronization"); - LocalSyncInserter::insertSyncs(exprs); - return exprs; + return WarSyncInserter::insert(exprs); } } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/lower_insert_syncs.h b/torch/csrc/jit/codegen/cuda/lower_insert_syncs.h index 506183734484..756462f0bd7c 100644 --- a/torch/csrc/jit/codegen/cuda/lower_insert_syncs.h +++ b/torch/csrc/jit/codegen/cuda/lower_insert_syncs.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -16,40 +16,14 @@ namespace cuda { //! //! WAR race condition occurs when the next iteration of the loop overwrites //! shared memory value before a previous operation has finished reading it. -//! -//! WAR Race Check: -//! Track all output shared memory TVs before first sync -//! Track all input shared memory TVs after last sync -//! If the intersection is non-empty, then there is a WAR race condition. -//! Recursively check each nested for-loop -//! -//! Parent-Child For-Loop Recursive Relationship -//! Notation: -//! 
None - Zero Syncs -//! 1+ - One or more Syncs -//! End - Sync is last op in for-loop to prevent WAR race condition -//! -//! Default: Track all shared memory inputs and outputs -//! -//! Parent - None -//! Child - None => Append All Child Outputs to Parent Initial -//! Child - 1+ => Parent first sync => Inherit Child Initial + Final -//! Child - End => Parent first sync => Keep Child Initial / Clear Parent Final -//! -//! Parent - 1+ -//! Child - None => Append All Child to Parent Last -//! Child - 1+ => Child Final to Parent Final / Discard Child Initial -//! Child - End => Clear Parent Last / Discard Child Initial -//! -//! If Child - End and Parent has zero remaining operations, then -//! Parent inherits Child End. -//! -std::vector insertWarThreadSynchronization( - const std::vector& exprs); +std::vector insertWarThreadSynchronization( + const std::vector& exprs); //! Insert syncs between writing to shared memory and then reading it. -std::vector insertRawThreadSynchronization( - const std::vector& exprs); +//! RAW pass is run before indexing, unrolling (loop duplication), memory +//! aliasing, and index (grid/block bcast/reduction) +std::vector insertRawThreadSynchronization( + const std::vector& exprs); } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/lower_loops.cpp b/torch/csrc/jit/codegen/cuda/lower_loops.cpp index e4396f9a864b..aa0ff1a44469 100644 --- a/torch/csrc/jit/codegen/cuda/lower_loops.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_loops.cpp @@ -5,7 +5,6 @@ #include #include #include -#include #include #include #include @@ -19,7 +18,7 @@ namespace jit { namespace fuser { namespace cuda { -std::vector LoopNestGenerator::loweredExprs( +std::vector LoopNestGenerator::loweredExprs( const std::vector& exprs) { FUSER_PERF_SCOPE("GpuLower::Lower::LoopNestGenerator::loweredExprs"); TORCH_INTERNAL_ASSERT(FusionGuard::getCurFusion() != nullptr); @@ -33,22 +32,20 @@ LoopNestGenerator::LoopNestGenerator(const std::vector& exprs) { namespace { -kir::ForLoop* openForHelper(kir::ForLoop* scope, kir::IterDomain* kir_id) { - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - auto extent_with_halo = gpu_lower->haloInfo().getExtent(kir_id); +kir::ForLoop* openForHelper(kir::ForLoop* scope, IterDomain* id) { + auto extent_with_halo = GpuLower::current()->haloInfo().getExtent(id); kir::ForLoop* new_scope = nullptr; if (extent_with_halo) { // When an axis is extended with halo, unrolling and vectorization // are assumed to not be used for now. TORCH_INTERNAL_ASSERT( - kir_id->parallelType() != ParallelType::Unroll && - !isParallelTypeVectorize(kir_id->parallelType())); + id->getParallelType() != ParallelType::Unroll && + !isParallelTypeVectorize(id->getParallelType())); // Use the extent that's extended by halo - new_scope = ir_builder.create( - kir_id, - kir_id->isBroadcast() ? ir_builder.zeroVal() - : ir_builder.create(c10::nullopt), + new_scope = IrBuilder::create( + id, + id->isBroadcast() ? 
GpuLower::current()->kernel()->zeroVal() + : IrBuilder::create(c10::nullopt), nullptr, extent_with_halo, nullptr, @@ -56,7 +53,7 @@ kir::ForLoop* openForHelper(kir::ForLoop* scope, kir::IterDomain* kir_id) { nullptr, false); } else { - new_scope = ir_builder.create(kir_id); + new_scope = IrBuilder::create(id); } if (scope != nullptr) { scope->body().insert(0, new_scope); @@ -66,13 +63,13 @@ kir::ForLoop* openForHelper(kir::ForLoop* scope, kir::IterDomain* kir_id) { } // namespace -void LoopNestGenerator::openFor(kir::IterDomain* kir_iter_domain) { +void LoopNestGenerator::openFor(IterDomain* id) { if (for_loops_.size() > 0) { - const auto new_scope = openForHelper(for_loops_.back(), kir_iter_domain); + const auto new_scope = openForHelper(for_loops_.back(), id); // for_loop_allocations_.insert({new_scope, 0}); for_loops_.push_back(new_scope); } else { - for_loops_.push_back(openForHelper(nullptr, kir_iter_domain)); + for_loops_.push_back(openForHelper(nullptr, id)); lowered_exprs_.insert(lowered_exprs_.begin(), for_loops_.back()); } } @@ -82,7 +79,7 @@ void LoopNestGenerator::closeFor() { for_loops_.pop_back(); } -void LoopNestGenerator::pushFront(kir::Expr* expr) { +void LoopNestGenerator::pushFront(Expr* expr) { if (for_loops_.size() == 0) { lowered_exprs_.insert(lowered_exprs_.begin(), expr); } else { @@ -91,18 +88,15 @@ void LoopNestGenerator::pushFront(kir::Expr* expr) { } void LoopNestGenerator::handle(Expr* expr) { - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - // Check if it's a tensor view expression we need to place in the loop nest // structure - if (!ir_utils::isTVOp(expr)) { + if (!ir_utils::isTvOp(expr)) { // Close all the loops, scalar operations cannot be inside for loops based // on expr sorting. while (!for_loops_.empty()) { closeFor(); } - pushFront(gpu_lower->lowerExpr(expr)); + pushFront(expr); for (auto out : expr->outputs()) { TORCH_INTERNAL_ASSERT( @@ -112,10 +106,8 @@ void LoopNestGenerator::handle(Expr* expr) { " cannot lower ", out->getValType().value()); - pushFront(ir_builder.create( - gpu_lower->lowerValue(out), - MemoryType::Local, - ir_builder.create(1))); + pushFront(IrBuilder::create( + out, MemoryType::Local, GpuLower::current()->kernel()->oneVal())); } return; } @@ -130,27 +122,19 @@ void LoopNestGenerator::handle(Expr* expr) { // Figure out what the entire loop structure should look like. std::vector loop_structure = loop_structures_.at(out_tv); - std::vector kir_loop_structure; - std::transform( - loop_structure.begin(), - loop_structure.end(), - std::back_inserter(kir_loop_structure), - [&gpu_lower](IterDomain* id) { - return gpu_lower->lowerValue(id)->as(); - }); // Ordering of loop_structure is global, so simply close loops we don't need, // and open the ones we do. 
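// Standalone sketch (loops modeled as plain strings, not the real generator) of the
// open/close step described in the comment above: because loop ordering is globally
// consistent, the generator only has to pop open loops that are absent from the
// expression's target loop structure and then push whichever target loops are missing.
#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

void matchLoopStructure(
    std::vector<std::string>& open_loops, // current loop stack, outermost first
    const std::vector<std::string>& target) {
  // Close inner loops until the innermost open loop belongs to the target structure.
  while (!open_loops.empty() &&
         std::find(target.begin(), target.end(), open_loops.back()) == target.end()) {
    open_loops.pop_back();
  }
  // Open any target loop that is not already on the stack, in target order.
  for (const auto& id : target) {
    if (std::find(open_loops.begin(), open_loops.end(), id) == open_loops.end()) {
      open_loops.push_back(id);
    }
  }
}

int main() {
  std::vector<std::string> open_loops = {"i0", "i1", "i2"};
  // The next expression wants {i0, i3}: i1 and i2 are closed, i3 is opened under i0.
  matchLoopStructure(open_loops, {"i0", "i3"});
  assert((open_loops == std::vector<std::string>{"i0", "i3"}));
  return 0;
}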
while (!for_loops_.empty() && std::find( - kir_loop_structure.begin(), - kir_loop_structure.end(), - for_loops_.back()->iter_domain()) == kir_loop_structure.end()) { + loop_structure.begin(), + loop_structure.end(), + for_loops_.back()->iter_domain()) == loop_structure.end()) { closeFor(); } - for (auto loop : kir_loop_structure) { + for (auto loop : loop_structure) { auto find_it = std::find_if( for_loops_.begin(), for_loops_.end(), [loop](kir::ForLoop* fl) { return fl->iter_domain() == loop; @@ -160,47 +144,9 @@ void LoopNestGenerator::handle(Expr* expr) { } } - pushFront(gpu_lower->lowerExpr(expr)); + pushFront(expr); } -namespace { -// Copied verbatim from lower_expr_sort EXCEPT map is parallel map, not loop -// map, and direction is reversed -struct LocalDomainSorter { - LocalDomainSorter( - const std::unordered_map>& - concrete_id_dependencies) - : concrete_id_dependencies_(concrete_id_dependencies) {} - - // Return if id0 should be before id1 - inline bool operator()(IterDomain* id0, IterDomain* id1) { - auto concrete_id_0 = - GpuLower::current()->caParallelMap().getConcreteMappedID(id0); - auto concrete_id_1 = - GpuLower::current()->caParallelMap().getConcreteMappedID(id1); - - if (concrete_id_dependencies_.find(concrete_id_0) != - concrete_id_dependencies_.end()) { - const auto& dependencies_0 = concrete_id_dependencies_.at(concrete_id_0); - // if id0 depends on id1 it means id1 is outside id0, so id1 < id0 - return !dependencies_0.count(concrete_id_1); - } - - if (concrete_id_dependencies_.find(concrete_id_1) != - concrete_id_dependencies_.end()) { - const auto& dependencies_1 = concrete_id_dependencies_.at(concrete_id_1); - // if id1 depends on id0 it means id1 is inside id0, so id0 < id1 - return dependencies_1.count(concrete_id_0); - } - - return true; - } - - const std::unordered_map>& - concrete_id_dependencies_; -}; -} // namespace - // Generate the loop nest structure and place it in lowered_exprs_ void LoopNestGenerator::generate(const std::vector& exprs) { TORCH_INTERNAL_ASSERT(lowered_exprs_.empty()); @@ -209,11 +155,10 @@ void LoopNestGenerator::generate(const std::vector& exprs) { // for an example why see FusionAdvancedLowering6 // Grab iteration domain dependencies, similar to the logic in - // lower_expr_sort, EXCEPT it is based on parallel map not loop map, and - // dependencies are in opposite order, inner loops are dependant on outer - // loops. + // lower_expr_sort, EXCEPT dependencies are in opposite order, + // inner loops are dependant on outer loops. 
- const auto& parallel_map = GpuLower::current()->caParallelMap(); + const auto& ca_map = GpuLower::current()->caMap(); std::unordered_map> concrete_id_dependencies; @@ -221,7 +166,8 @@ void LoopNestGenerator::generate(const std::vector& exprs) { std::unordered_set dependencies; for (auto tv_id : tv->domain()->domain()) { - auto concrete_id = parallel_map.getConcreteMappedID(tv_id); + auto concrete_id = + ca_map->getConcreteMappedID(tv_id, IdMappingMode::LOOP); if (concrete_id_dependencies.find(concrete_id) == concrete_id_dependencies.end()) { @@ -232,7 +178,7 @@ void LoopNestGenerator::generate(const std::vector& exprs) { } // Loops after tv_id are dependent on tv_id - dependencies.emplace(parallel_map.getConcreteMappedID(tv_id)); + dependencies.emplace(concrete_id); } } @@ -290,8 +236,8 @@ void LoopNestGenerator::generate(const std::vector& exprs) { continue; } - auto last_id_concrete = - parallel_map.getConcreteMappedID(tv->axis((int)(tv->nDims() - 1))); + auto last_id_concrete = ca_map->getConcreteMappedID( + tv->axis((int)(tv->nDims() - 1)), IdMappingMode::LOOP); auto all_loops_it = concrete_id_dependencies.find(last_id_concrete); TORCH_INTERNAL_ASSERT( all_loops_it != concrete_id_dependencies.end(), @@ -301,10 +247,13 @@ void LoopNestGenerator::generate(const std::vector& exprs) { // Dependencies of last domain doesn't include last domain, include it // manually loop_structure.emplace_back(last_id_concrete); + // reverse sort (rbegin & rend) since we want the reverse of the order + // given by IterDomainDependencySorter std::sort( - loop_structure.begin(), - loop_structure.end(), - LocalDomainSorter(concrete_id_dependencies)); + loop_structure.rbegin(), + loop_structure.rend(), + IterDomainDependencySorter( + concrete_id_dependencies, GpuLower::current()->caMap())); loop_structures_[tv] = loop_structure; } diff --git a/torch/csrc/jit/codegen/cuda/lower_loops.h b/torch/csrc/jit/codegen/cuda/lower_loops.h index fbbdf079e89c..9b480d7eb6f8 100644 --- a/torch/csrc/jit/codegen/cuda/lower_loops.h +++ b/torch/csrc/jit/codegen/cuda/lower_loops.h @@ -1,13 +1,12 @@ #pragma once -#include +#include #include #include #include #include -#include #include namespace torch { @@ -30,20 +29,20 @@ namespace cuda { //! nests to initialize reduction buffers. class TORCH_CUDA_CU_API LoopNestGenerator { public: - static std::vector loweredExprs(const std::vector& exprs); + static std::vector loweredExprs(const std::vector& exprs); private: LoopNestGenerator(const std::vector& exprs); // Open a new inner most for loop, track which TV it was constructed from // according to the computeAt chain. 
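// Standalone sketch (hypothetical names) of the ordering rule used by the sort above:
// the dependency map records, for each loop domain, the set of domains that must be
// outside it ("inner depends on outer"), and the loop structure is sorted so that a
// domain comes after everything in its dependency set. The comparator is only a valid
// strict weak ordering when the sorted domains form one nesting chain, as they do here.
#include <algorithm>
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

using Id = std::string;

void sortByDependencies(
    std::vector<Id>& loop_structure,
    const std::map<Id, std::set<Id>>& deps) { // deps[x] = domains outer to x
  std::sort(
      loop_structure.begin(), loop_structure.end(), [&](const Id& a, const Id& b) {
        auto it = deps.find(b);
        // a goes before b exactly when b depends on a, i.e. a is outside b.
        return it != deps.end() && it->second.count(a) != 0;
      });
}

int main() {
  // i0 is outermost, i1 is nested inside i0, i2 is nested inside both.
  std::map<Id, std::set<Id>> deps = {
      {"i0", {}}, {"i1", {"i0"}}, {"i2", {"i0", "i1"}}};
  std::vector<Id> loop_structure = {"i2", "i0", "i1"};
  sortByDependencies(loop_structure, deps);
  for (const auto& id : loop_structure) std::cout << id << " "; // prints: i0 i1 i2
  std::cout << "\n";
  return 0;
}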
- void openFor(kir::IterDomain*); + void openFor(IterDomain*); // Close the inner most for loop void closeFor(); // Appends an expression to the current scope - void pushFront(kir::Expr* expr); + void pushFront(Expr* expr); void handle(Expr* expr); @@ -52,7 +51,7 @@ class TORCH_CUDA_CU_API LoopNestGenerator { private: // Lowered exprs to return - std::vector lowered_exprs_; + std::vector lowered_exprs_; // Keep all for loops conveniently to make unrolling easier, basically just a // stack of the active for_loops diff --git a/torch/csrc/jit/codegen/cuda/lower_magic_zero.cpp b/torch/csrc/jit/codegen/cuda/lower_magic_zero.cpp index f5f5c72676a6..f17f91806d61 100644 --- a/torch/csrc/jit/codegen/cuda/lower_magic_zero.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_magic_zero.cpp @@ -2,7 +2,7 @@ #include #include -#include +#include #include namespace torch { @@ -12,11 +12,11 @@ namespace cuda { namespace { -class MagicZeroInserter : public kir::MutableIrVisitor { +class MagicZeroInserter : public kir::ExprMutator { public: - static std::vector insert(const std::vector& exprs) { + static std::vector insert(const std::vector& exprs) { MagicZeroInserter inserter(exprs); - return inserter.loop_nests_; + return inserter.exprs_; } private: @@ -25,94 +25,43 @@ class MagicZeroInserter : public kir::MutableIrVisitor { kir::ForLoop* fl = nullptr; }; - MagicZeroInserter(const std::vector& exprs) - : loop_nests_(exprs), ir_builder(GpuLower::current()->kernel()) { - loop_nests_.insert( - loop_nests_.begin(), ir_builder.create()); - for (auto expr : exprs) { - handle(expr); - } - insertAll(); - } - - void handle(kir::Expr* expr) { - if (auto ite = dynamic_cast(expr)) { - handle(ite); - } else if (auto for_loop = dynamic_cast(expr)) { - handle(for_loop); - } - } - - void handle(kir::IfThenElse* ite) { - scope_nest_.push_back(&ite->thenBody()); - for (auto expr : ite->thenBody().exprs()) { - handle(expr); - } - scope_nest_.pop_back(); - scope_nest_.push_back(&ite->elseBody()); - for (auto expr : ite->elseBody().exprs()) { - handle(expr); - } - scope_nest_.pop_back(); + MagicZeroInserter(const std::vector& exprs) { + TORCH_INTERNAL_ASSERT(exprs.size()); + kir::ExprMutator::registerInsertBefore( + exprs.front(), IrBuilder::create(), nullptr); + kir::ExprMutator::traverseAndInsert(exprs); } - void handle(kir::ForLoop* fl) { + void handle(kir::ForLoop* fl) final { if (fl->isUnrolled()) { - kir::Scope* scope = nullptr; - if (!scope_nest_.empty()) { - scope = scope_nest_.back(); - } - insertion_list_.push_back({scope, fl}); - } else { - scope_nest_.push_back(&fl->body()); - for (auto expr : fl->body().exprs()) { - handle(expr); - } - scope_nest_.pop_back(); - } - } - - void insertAll() { - for (const auto& info : insertion_list_) { - auto fl = info.fl; - auto scope = info.scope; - if (scope == nullptr) { - // place in global scope - auto loop_it = std::find(loop_nests_.begin(), loop_nests_.end(), fl); - TORCH_INTERNAL_ASSERT(loop_it != loop_nests_.end()); - // Place after the loop - loop_it++; - loop_nests_.insert(loop_it, ir_builder.create()); + if (scope_.empty()) { + kir::ExprMutator::registerInsertAfter( + fl, IrBuilder::create()); } else { - scope->insert_after(fl, ir_builder.create()); + TORCH_INTERNAL_ASSERT( + scope_.back()->exprs().size(), "Not expecting an empty loop."); + kir::ExprMutator::registerInsertAfter( + fl, IrBuilder::create(), scope_.back()); } + } else { + kir::ExprMutator::handle(fl); } } - //! 
Keep track for loop structure - std::vector scope_nest_; - - // Keep a copy of the expressions provided - std::vector loop_nests_; - - kir::IrBuilder ir_builder; - std::vector insertion_list_; }; } // namespace -std::vector insertMagicZero(const std::vector& exprs) { +std::vector insertMagicZero(const std::vector& exprs) { FUSER_PERF_SCOPE("GpuLower::Lower::insertMagicZero"); // Check if magic zero was even used, if not we don't have to define it or // update it. const auto gpu_lower = GpuLower::current(); auto kernel = gpu_lower->kernel(); - const bool has_magic_zero = std::any_of( - kernel->irNodes().begin(), - kernel->irNodes().end(), - [](const std::unique_ptr& ir_node) { - return ir_node->isA() && isMagicZero(ir_node->as()); + const bool has_magic_zero = + std::any_of(kernel->vals().begin(), kernel->vals().end(), [](Val* val) { + return isMagicZero(val); }); if (!has_magic_zero) { @@ -122,19 +71,21 @@ std::vector insertMagicZero(const std::vector& exprs) { return MagicZeroInserter::insert(exprs); } -bool isMagicZero(kir::Val* val) { - auto ns = dynamic_cast(val); - if (ns == nullptr) { +bool isMagicZero(const Val* val) { + if (!val->isA()) { return false; } + auto ns = val->as(); return ns->dtype() == DataType::Int && ns->name() == std::string(kMagicZeroName); } -bool isProtectedWithMagicZero(kir::Val* val) { - auto def = dynamic_cast(val->definition()); - return def && def->operation() == BinaryOpType::Add && - isMagicZero(def->rhs()); +bool isProtectedWithMagicZero(const Val* val) { + if (val->definition() == nullptr || !val->definition()->isA()) { + return false; + } + auto bop = val->definition()->as(); + return bop->getBinaryOpType() == BinaryOpType::Add && isMagicZero(bop->rhs()); } } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/lower_magic_zero.h b/torch/csrc/jit/codegen/cuda/lower_magic_zero.h index 03a37a46813c..942a33028017 100644 --- a/torch/csrc/jit/codegen/cuda/lower_magic_zero.h +++ b/torch/csrc/jit/codegen/cuda/lower_magic_zero.h @@ -14,15 +14,15 @@ namespace cuda { //! zero update after every (outer most) loop nest with a compile time extent. //! //! This will make sure nvrtc does not aggressively save predicate and indices. -std::vector insertMagicZero(const std::vector& exprs); +std::vector insertMagicZero(const std::vector& exprs); //! Check if val is a reference to the magic zero variable -bool isMagicZero(kir::Val* val); +bool isMagicZero(const Val* val); //! Check if val is protected with magic zero. //! //! Specifically, this returns true if val is defined as "x + magic_zero". 
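// Standalone sketch (a hypothetical mini-IR, not the real kernel IR) of the magic-zero
// idiom handled above: a named scalar that is always zero at runtime but opaque to the
// compiler is added to an index, producing "x + nvfuser_zero", and a value counts as
// protected exactly when its definition is such an add with the magic zero on the rhs.
#include <cassert>
#include <memory>
#include <string>

struct Val;
struct AddExpr {
  const Val* lhs;
  const Val* rhs;
};
struct Val {
  std::string name;                    // non-empty for named scalars
  std::unique_ptr<AddExpr> definition; // set when the value is produced by an add
};

constexpr const char* kMagicZeroName = "nvfuser_zero";

bool isMagicZero(const Val* v) {
  return v != nullptr && v->name == kMagicZeroName;
}

bool isProtectedWithMagicZero(const Val* v) {
  return v != nullptr && v->definition != nullptr && isMagicZero(v->definition->rhs);
}

// Builds "index + nvfuser_zero".
std::unique_ptr<Val> protectIndex(const Val* index, const Val* magic_zero) {
  auto result = std::make_unique<Val>();
  result->definition = std::make_unique<AddExpr>(AddExpr{index, magic_zero});
  return result;
}

int main() {
  Val magic_zero{kMagicZeroName, nullptr};
  Val index{"i", nullptr};
  auto protected_index = protectIndex(&index, &magic_zero);
  assert(isMagicZero(&magic_zero));
  assert(!isProtectedWithMagicZero(&index));
  assert(isProtectedWithMagicZero(protected_index.get()));
  return 0;
}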
-bool isProtectedWithMagicZero(kir::Val* val); +bool isProtectedWithMagicZero(const Val* val); } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.cpp b/torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.cpp index b94c12c27c83..66b405ac8e2f 100644 --- a/torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.cpp @@ -5,8 +5,7 @@ #include #include #include -#include -#include +#include #include #include #include @@ -18,85 +17,64 @@ namespace cuda { namespace { -class MisalignedVectorizationModifier { +class MisalignedVectorizationModifier : public kir::ExprMutator { public: - void process(const std::vector& exprs) { - FUSER_PERF_SCOPE( - "GpuLower::Lower::MisalignedVectorizationModifier::process"); - // Run through loop nests - // Find for-loops with misaligned vectorization domains - for (auto* expr : exprs) { - handle(expr); - } - } + MisalignedVectorizationModifier() = delete; - const std::unordered_map& replacementMap() const { - return expr_replacement_map_; + static std::vector processMisalignedVectorization( + const std::vector& exprs) { + FUSER_PERF_SCOPE("GpuLower::Lower::processMisalignedVectorization"); + MisalignedVectorizationModifier mvm(exprs); + return mvm.exprs_; } private: - void handle(kir::Expr* expr) { - if (auto for_loop = dynamic_cast(expr)) { - handle(for_loop); - } else if (auto ite = dynamic_cast(expr)) { - handle(ite); - } + MisalignedVectorizationModifier(const std::vector& exprs) { + FUSER_PERF_SCOPE("GpuLower::Lower::MisalignedVectorizationModifier"); + // Run through loop nests + // Find for-loops with misaligned vectorization domains + kir::ExprMutator::traverseAndInsert(exprs); } - void handle(kir::ForLoop* fl) { - for_loops_structure_.push_back(fl); - - // Make copy of exprs because we replace them inplace in fl - const auto exprs_copy = fl->body().exprs(); - + void handle(kir::ForLoop* fl) final { + kir::Scope* scope = scope_.empty() ? 
nullptr : scope_.back(); if (containsAnyDirectChildMisalignedVectorize(fl)) { - auto new_fl = handleMisalignedVectorize(for_loops_structure_, fl); - expr_replacement_map_.insert({fl, new_fl}); - } else { - for (auto expr : exprs_copy) { - handle(expr); - } - } + for_loops_.push_back(fl); + auto new_fl = handleMisalignedVectorize(for_loops_, fl); + for_loops_.pop_back(); - for_loops_structure_.pop_back(); - } - - void handle(kir::IfThenElse* ite) { - for (auto expr : ite->thenBody().exprs()) { - handle(expr); - } - for (auto expr : ite->elseBody().exprs()) { - handle(expr); + kir::ExprMutator::registerReplace(fl, new_fl, scope); + } else { + kir::ExprMutator::handle(fl); } } struct ReferenceTensors { // Input TensorView to Vectorize Set operation - kir::TensorView* in_tv = nullptr; + TensorView* in_tv = nullptr; // Output TensorView to Vectorize Set operation - kir::TensorView* out_tv = nullptr; + TensorView* out_tv = nullptr; // TensorView in global memory - kir::TensorView* global_tv = nullptr; + TensorView* global_tv = nullptr; // TensorView with vectorize IterDomain and not in global memory - kir::TensorView* vec_tv = nullptr; + TensorView* vec_tv = nullptr; }; - ReferenceTensors getReferenceTensors(kir::Expr* vectorized_expr) { + ReferenceTensors getReferenceTensors(Expr* vectorized_expr) { TORCH_INTERNAL_ASSERT(vectorized_expr != nullptr); TORCH_INTERNAL_ASSERT( - vectorized_expr->outputs().front()->isA()); - TORCH_INTERNAL_ASSERT( - vectorized_expr->inputs().front()->isA()); + vectorized_expr->outputs().front()->isA()); + TORCH_INTERNAL_ASSERT(vectorized_expr->inputs().front()->isA()); - auto in_tv = vectorized_expr->inputs().front()->as(); - auto out_tv = vectorized_expr->outputs().front()->as(); + auto in_tv = vectorized_expr->inputs().front()->as(); + auto out_tv = vectorized_expr->outputs().front()->as(); const bool global_vectorize_write_op = - (out_tv->memoryType() == MemoryType::Global && - in_tv->memoryType() == MemoryType::Local); + (out_tv->getMemoryType() == MemoryType::Global && + in_tv->getMemoryType() == MemoryType::Local); const bool global_vectorize_read_op = - (out_tv->memoryType() == MemoryType::Local && - in_tv->memoryType() == MemoryType::Global); + (out_tv->getMemoryType() == MemoryType::Local && + in_tv->getMemoryType() == MemoryType::Global); TORCH_INTERNAL_ASSERT( global_vectorize_write_op || global_vectorize_read_op, "Unsupported vectorize memory configuration detected."); @@ -104,25 +82,26 @@ class MisalignedVectorizationModifier { // TensorView on global memory. This is the tensor that may have // a non-aligned base address. auto global_tv = - (out_tv->memoryType() == MemoryType::Global) ? out_tv : in_tv; + (out_tv->getMemoryType() == MemoryType::Global) ? out_tv : in_tv; // TensorView with the misaligned vec iterDomain. It is the consumer // of vectorized load or the producer of vectorized store. It is // assumed that when the output TV is not on global memory, this // expression is a vectorized load, so the output TV is vec_tv. - auto vec_tv = (out_tv->memoryType() != MemoryType::Global) ? out_tv : in_tv; + auto vec_tv = + (out_tv->getMemoryType() != MemoryType::Global) ? 
out_tv : in_tv; return {in_tv, out_tv, global_tv, vec_tv}; } struct VectorizeData { - kir::Val* vector_size = nullptr; - kir::Val* shift = nullptr; - kir::Val* extent = nullptr; - kir::Val* remainder = nullptr; - kir::Val* extent_minus_remainder = nullptr; - kir::Val* last_root_domain_index = nullptr; - kir::Val* last_root_domain_index_shift = nullptr; + Val* vector_size = nullptr; + Val* shift = nullptr; + Val* extent = nullptr; + Val* remainder = nullptr; + Val* extent_minus_remainder = nullptr; + Val* last_root_domain_index = nullptr; + Val* last_root_domain_index_shift = nullptr; }; // Create constants for handling misaligned addresses @@ -130,48 +109,43 @@ class MisalignedVectorizationModifier { const std::vector& for_loop_structure, const ReferenceTensors& tensors, kir::IfThenElse* parent_scope_ite) { - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - // Generate vectorize index - auto indices = (tensors.out_tv->memoryType() == MemoryType::Global) - ? Index::getConsumerStridedIndices( - tensors.out_tv->fuserTv(), for_loop_structure) + auto indices = (tensors.out_tv->getMemoryType() == MemoryType::Global) + ? Index::getConsumerStridedIndices(tensors.out_tv, for_loop_structure) : Index::getProducerStridedIndices( - tensors.in_tv->fuserTv(), - tensors.out_tv->fuserTv(), - for_loop_structure); + tensors.in_tv, tensors.out_tv, for_loop_structure); // >>>>>>>>>>>>> // Number of elements in vectorize access auto vector_size = - tensors.vec_tv->domain()->domain().back()->extent()->as(); + tensors.vec_tv->domain()->domain().back()->extent()->as(); // Size of memory type for the elements - kir::Int* data_size_in_bytes = - ir_builder.create(dataTypeSize(tensors.vec_tv->dtype())); + Int* data_size_in_bytes = + IrBuilder::create(dataTypeSize(tensors.vec_tv->dtype())); // The number of bytes in the vectorize access auto vector_size_in_bytes = - ir_builder.mulExpr(vector_size, data_size_in_bytes); + IrBuilder::mulExpr(vector_size, data_size_in_bytes); - auto index = ir_builder.create( - tensors.global_tv->fuserTv(), indices); + auto index = + IrBuilder::create(tensors.global_tv, indices); auto address = createNamedScalarFromValue( parent_scope_ite->thenBody(), index, "address", true); // offset_size = (address % vector_size_bytes) / data_type_size_bytes // shift_init = vector_size - offset_size - auto a = ir_builder.modExpr(address, vector_size_in_bytes); - auto b = ir_builder.divExpr(a, data_size_in_bytes); - auto c = ir_builder.subExpr(vector_size, b); + auto a = IrBuilder::modExpr(address, vector_size_in_bytes); + auto b = IrBuilder::divExpr(a, data_size_in_bytes); + auto c = IrBuilder::subExpr(vector_size, b); auto shift_init = createNamedScalarFromValue( parent_scope_ite->thenBody(), c, "shift_val"); // shift = (shift_init == vector_size) ? 
0 : shift_init // The number of elements until the first aligned address - auto shift_pred = ir_builder.eqExpr(shift_init, vector_size); - auto shift_val = - ir_builder.whereExpr(shift_pred, ir_builder.zeroVal(), shift_init); + auto shift_pred = IrBuilder::eqExpr(shift_init, vector_size); + auto shift_val = IrBuilder::whereExpr( + shift_pred, GpuLower::current()->kernel()->zeroVal(), shift_init); // >>>>>>>>>>>>> auto shift = createNamedScalarFromValue( @@ -183,13 +157,13 @@ class MisalignedVectorizationModifier { // remainder = (extent - shift) % vector_size // The number of elements remaining not accessed by vectorized operations - auto remaining_extent = ir_builder.subExpr(extent, shift); - auto remainder_val = ir_builder.modExpr(remaining_extent, vector_size); + auto remaining_extent = IrBuilder::subExpr(extent, shift); + auto remainder_val = IrBuilder::modExpr(remaining_extent, vector_size); auto remainder = createNamedScalarFromValue( parent_scope_ite->thenBody(), remainder_val, "remainder"); // (extent - remainder) is the upper-bound for the vectorize section - auto extent_remainder_val = ir_builder.subExpr(extent, remainder); + auto extent_remainder_val = IrBuilder::subExpr(extent, remainder); // >>>>>>>>>>>>> auto extent_minus_remainder = createNamedScalarFromValue( @@ -203,7 +177,7 @@ class MisalignedVectorizationModifier { // >>>>>>>>>>>>> auto last_root_domain_index_shift = - ir_builder.addExpr(last_root_domain_index, shift); + IrBuilder::addExpr(last_root_domain_index, shift); return { vector_size, @@ -220,20 +194,18 @@ class MisalignedVectorizationModifier { kir::IfThenElse* createVectorizeSection( const std::vector& child_loops, const VectorizeData& params) { - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - auto vectorized_child_loops = cloneForLoops( child_loops, params.vector_size, nullptr, true, params.shift); // Vectorize Range: [shift - (extent-remainder)) // (last_root_domain_index + shift) < (extent - remainder) - kir::Val* vectorize_cond = ir_builder.ltExpr( + Val* vectorize_cond = IrBuilder::ltExpr( params.last_root_domain_index_shift, params.extent_minus_remainder); kir::Predicate* vectorize_pred = - ir_builder.create(vectorize_cond->as()); + IrBuilder::create(vectorize_cond->as()); kir::IfThenElse* vectorize_ite = - ir_builder.create(vectorize_pred); + IrBuilder::create(vectorize_pred); for (auto cloned_loop : vectorized_child_loops) { vectorize_ite->thenBody().push_back(cloned_loop); @@ -247,20 +219,19 @@ class MisalignedVectorizationModifier { kir::IfThenElse* createInitialSection( const std::vector& child_loops, const VectorizeData& params) { - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - auto pre_child_loops = cloneForLoops( child_loops, params.vector_size, params.shift, false, nullptr); // Initial Range: [0 - shift) // last_root_domain_index == 0 - kir::Val* initial_cond = - ir_builder.eqExpr(params.last_root_domain_index, ir_builder.zeroVal()); + Val* initial_cond = IrBuilder::eqExpr( + params.last_root_domain_index, + GpuLower::current()->kernel()->zeroVal()); kir::Predicate* initial_pred = - ir_builder.create(initial_cond->as()); + IrBuilder::create(initial_cond->as()); kir::IfThenElse* initial_ite = - ir_builder.create(initial_pred); + IrBuilder::create(initial_pred); for (auto cloned_loop : pre_child_loops) { initial_ite->thenBody().push_back(cloned_loop); @@ -274,23 +245,21 @@ class MisalignedVectorizationModifier { kir::IfThenElse* createRemainderSection( const std::vector& child_loops, const VectorizeData& params) { - 
kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - auto post_child_loops = cloneForLoops( child_loops, params.vector_size, params.remainder, false, params.shift); // Remainder Range: [(extent-remainder) - extent) // (extent - remainder) <= last_root_domain_index + shift < extent - kir::Val* lower_bound = ir_builder.geExpr( + Val* lower_bound = IrBuilder::geExpr( params.last_root_domain_index_shift, params.extent_minus_remainder); - kir::Val* upper_bound = - ir_builder.ltExpr(params.last_root_domain_index_shift, params.extent); - kir::Val* remainder_cond = ir_builder.andExpr(lower_bound, upper_bound); + Val* upper_bound = + IrBuilder::ltExpr(params.last_root_domain_index_shift, params.extent); + Val* remainder_cond = IrBuilder::andExpr(lower_bound, upper_bound); kir::Predicate* remainder_pred = - ir_builder.create(remainder_cond->as()); + IrBuilder::create(remainder_cond->as()); kir::IfThenElse* remainder_ite = - ir_builder.create(remainder_pred); + IrBuilder::create(remainder_pred); for (auto cloned_loop : post_child_loops) { remainder_ite->thenBody().push_back(cloned_loop); @@ -302,8 +271,6 @@ class MisalignedVectorizationModifier { kir::ForLoop* handleMisalignedVectorize( std::vector for_loop_structure, const kir::ForLoop* parent_for_loop) { - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - auto child_loops = findChildForLoops(parent_for_loop); // Assumption: All vectorize operations have the same shift @@ -315,17 +282,19 @@ class MisalignedVectorizationModifier { // The parent_for_loop contains allocate, read, compute, write operations const auto new_parent_for_loop = - ir_builder.create(parent_for_loop); + IrBuilder::create(parent_for_loop); // Transfer all expressions except for-loops to new parent for-loop // All expressions are placed at the beginning of the new for-loop - moveExprsExceptForLoops(parent_for_loop, new_parent_for_loop); + copyExprsExceptForLoops(parent_for_loop, new_parent_for_loop); // Get the predicate for all but the last root domain - auto pred_except_last_root_domain = ir_builder.create( - PredicateType::Misaligned, vectorized_expr, ir_builder.trueVal()); + auto pred_except_last_root_domain = IrBuilder::create( + PredicateType::Misaligned, + vectorized_expr, + GpuLower::current()->kernel()->trueVal()); kir::IfThenElse* pred_ite = - ir_builder.create(pred_except_last_root_domain); + IrBuilder::create(pred_except_last_root_domain); new_parent_for_loop->body().push_back(pred_ite); auto constants = createVectorizeConstants( @@ -351,17 +320,17 @@ class MisalignedVectorizationModifier { // Determine that the expression is UnaryOpType::Set AND // the output TensorView domain is vectorized - bool isVectorizeSetOp(kir::ForLoop* fl, kir::Expr* expr) { - if (fl->iter_domain()->parallelType() != + bool isVectorizeSetOp(kir::ForLoop* fl, Expr* expr) { + if (fl->iter_domain()->getParallelType() != ParallelType::MisalignedVectorize) { return false; } - if (expr->isA()) { - auto unaryOp = expr->as(); - if (unaryOp->out()->isA()) { - auto out_tv = unaryOp->out()->as(); - return unaryOp->operation() == UnaryOpType::Set && + if (expr->isA()) { + auto unaryOp = expr->as(); + if (unaryOp->out()->isA()) { + auto out_tv = unaryOp->out()->as(); + return unaryOp->getUnaryOpType() == UnaryOpType::Set && out_tv->domain()->hasVectorize(); } } @@ -374,15 +343,14 @@ class MisalignedVectorizationModifier { // vectorize flag - Do not generate for loop header // shift value - Add shift to global indices generated within for loop std::vector cloneForLoops( - const 
std::vector& for_loops, - kir::Val* loop_stop, - kir::Val* pred_stop, + const std::vector& for_loops_, + Val* loop_stop, + Val* pred_stop, bool vectorize, - kir::Val* vectorize_shift) { - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); + Val* vectorize_shift) { std::vector cloned_for_loops; - for (auto fl : for_loops) { + for (auto fl : for_loops_) { auto first_expr = fl->body().exprs().front(); bool has_vectorize_op = isVectorizeSetOp(fl, first_expr); @@ -391,12 +359,12 @@ class MisalignedVectorizationModifier { TORCH_INTERNAL_ASSERT( !has_vectorize_op || fl->body().exprs().size() == 1); - const auto new_loop = ir_builder.create( + const auto new_loop = IrBuilder::create( fl->iter_domain(), fl->index(), - ir_builder.zeroVal(), + GpuLower::current()->kernel()->zeroVal(), loop_stop, - ir_builder.oneVal(), + GpuLower::current()->kernel()->oneVal(), vectorize && has_vectorize_op, vectorize_shift, fl->isUnrollRequired()); @@ -406,9 +374,9 @@ class MisalignedVectorizationModifier { // Predicate the loop body if pred_stop is not null. This is to // make sure the loop itself is completely unrollable. if (pred_stop != nullptr) { - auto body_pred = ir_builder.create( - ir_builder.ltExpr(new_loop->index(), pred_stop)->as()); - auto body_ite = ir_builder.create(body_pred); + auto body_pred = IrBuilder::create( + IrBuilder::ltExpr(new_loop->index(), pred_stop)->as()); + auto body_ite = IrBuilder::create(body_pred); body->push_back(body_ite); body = &body_ite->thenBody(); } @@ -423,7 +391,7 @@ class MisalignedVectorizationModifier { } // Add all expressions except for loops to new parent for loop - void moveExprsExceptForLoops( + void copyExprsExceptForLoops( const kir::ForLoop* for_loop, kir::ForLoop* new_loop) { std::vector loops; @@ -448,10 +416,10 @@ class MisalignedVectorizationModifier { // Find the first vectorize set - either read or write // Add child For-Loop to for_loop_structure // Enable vectorize flag in child For-Loop - kir::Expr* findFirstVectorizedSetOp( + Expr* findFirstVectorizedSetOp( std::vector& for_loop_structure, - const std::vector& for_loops) { - for (auto fl : for_loops) { + const std::vector& for_loops_) { + for (auto fl : for_loops_) { auto first_expr = fl->body().exprs().front(); bool has_vectorize_op = isVectorizeSetOp(fl, first_expr); if (has_vectorize_op) { @@ -463,38 +431,31 @@ class MisalignedVectorizationModifier { } // Get full extent for the inner-most, merged root domain - kir::Val* getVectorizeExtent( - kir::TensorView* producer_tv, - kir::TensorView* consumer_tv) { + Val* getVectorizeExtent(TensorView* producer_tv, TensorView* consumer_tv) { const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - - auto consumer_fuser_tv = consumer_tv->fuserTv(); - auto producer_fuser_tv = producer_tv->fuserTv(); - auto p2c = - PairwiseRootDomainMap(producer_fuser_tv, consumer_fuser_tv) - .mapProducerToConsumer( - producer_fuser_tv->domain(), consumer_fuser_tv->domain()); + auto p2c = PairwiseRootDomainMap(producer_tv, consumer_tv) + .mapProducerToConsumer( + producer_tv->domain(), consumer_tv->domain()); auto consumer_root_right_of_ca_domains = IterVisitor::getInputsTo( - {consumer_fuser_tv->domain()->domain().begin() + - consumer_fuser_tv->getComputeAtPosition(), - consumer_fuser_tv->domain()->domain().end()}); + {consumer_tv->domain()->domain().begin() + + consumer_tv->getComputeAtPosition(), + consumer_tv->domain()->domain().end()}); auto producer_root_right_of_ca_domains = IterVisitor::getInputsTo( - 
{producer_fuser_tv->domain()->domain().begin() + - producer_fuser_tv->getComputeAtPosition(), - producer_fuser_tv->domain()->domain().end()}); + {producer_tv->domain()->domain().begin() + + producer_tv->getComputeAtPosition(), + producer_tv->domain()->domain().end()}); - const auto& consumer_contig = consumer_fuser_tv->domain()->contiguity(); - const auto& producer_contig = producer_fuser_tv->domain()->contiguity(); + const auto& consumer_contig = consumer_tv->domain()->contiguity(); + const auto& producer_contig = producer_tv->domain()->contiguity(); - auto producer_root_domain = producer_fuser_tv->getMaybeRFactorDomain(); + auto producer_root_domain = producer_tv->getMaybeRFactorDomain(); // Calculate extent of merged root domains - kir::Val* extent = nullptr; + Val* extent = nullptr; auto consumer_root_idx = - int(consumer_fuser_tv->getMaybeRFactorDomain().size()) - 1; + int(consumer_tv->getMaybeRFactorDomain().size()) - 1; for (int i = int(producer_root_domain.size()) - 1; i >= 0; --i) { auto producer_root_id = producer_root_domain.at(i); @@ -533,11 +494,10 @@ class MisalignedVectorizationModifier { // We now know it's safe to extend the vectorization domain to these // axes. It shouldn't matter whether producer or consumer is used. - auto consumer_extent = gpu_lower->lowerValue(consumer_root_id->extent()); if (extent == nullptr) { - extent = consumer_extent; + extent = consumer_root_id->extent(); } else { - extent = ir_builder.mulExpr(extent, consumer_extent); + extent = IrBuilder::mulExpr(extent, consumer_root_id->extent()); } // If it's not contiguous, extending the vectorization domain @@ -554,57 +514,37 @@ class MisalignedVectorizationModifier { return extent; } - kir::Val* createNamedScalarFromValue( + Val* createNamedScalarFromValue( kir::Scope& body, - kir::Val* val, + Val* val, const std::string& name, bool address = false) { - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - auto namedScalar = (address) ? ir_builder.addressExprNamedScalar(name, val) - : ir_builder.setExprNamedScalar(name, val); + auto namedScalar = (address) ? 
IrBuilder::addressExprNamedScalar(name, val) + : IrBuilder::setExprNamedScalar(name, val); TORCH_INTERNAL_ASSERT(namedScalar->definition() != nullptr); - auto alloc = ir_builder.create( - namedScalar, MemoryType::Local, ir_builder.oneVal()); + auto alloc = IrBuilder::create( + namedScalar, + MemoryType::Local, + GpuLower::current()->kernel()->oneVal()); body.push_back(alloc); body.push_back(namedScalar->definition()); return namedScalar; } - - private: - // We will track which loops in the incoming IR will be replaced and by what - std::unordered_map expr_replacement_map_; - - // A depth-first ordering of nested for loops - // It is used for indexing and predicate generation - std::vector for_loops_structure_; }; } // namespace -std::vector processMisalignedVectorization( - Fusion* fusion, - const std::vector& exprs) { - FUSER_PERF_SCOPE("GpuLower::Lower::processMisalignedVectorization"); - - MisalignedVectorizationModifier mvm; - mvm.process(exprs); - - std::vector mutated_exprs; - mutated_exprs.reserve(exprs.size()); - for (auto expr : exprs) { - mutated_exprs.push_back( - ir_utils::applyReplacements(mvm.replacementMap(), expr)); - } - - return mutated_exprs; +std::vector processMisalignedVectorization( + const std::vector& exprs) { + return MisalignedVectorizationModifier::processMisalignedVectorization(exprs); } bool containsAnyDirectChildMisalignedVectorize(const kir::ForLoop* fl) { for (auto expr : fl->body().exprs()) { if (expr->isA()) { auto child_fl = expr->as(); - if (child_fl->iter_domain()->parallelType() == + if (child_fl->iter_domain()->getParallelType() == ParallelType::MisalignedVectorize) { return true; } diff --git a/torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.h b/torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.h index 588d3787752b..bd7ae19d93a8 100644 --- a/torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.h +++ b/torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.h @@ -1,5 +1,5 @@ #pragma once -#include +#include #include @@ -106,9 +106,8 @@ namespace cuda { //! } //! } //! 
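// Standalone host-side sketch of the address arithmetic the misaligned vectorization
// pass above generates: given a possibly unaligned base pointer, "shift" is the number
// of scalar elements before the first vector-aligned address and "remainder" is what is
// left at the tail, so the loop splits into an initial section [0, shift), a
// vectorizable middle [shift, extent - remainder), and a remainder section
// [extent - remainder, extent). Values and names are illustrative only.
#include <cassert>
#include <cstdint>
#include <cstdio>

struct Sections {
  int64_t shift;
  int64_t remainder;
};

Sections misalignedSections(const void* base, int64_t extent, int64_t vector_size,
                            int64_t data_size_bytes) {
  const int64_t vector_size_in_bytes = vector_size * data_size_bytes;
  const auto address = reinterpret_cast<uintptr_t>(base);
  // Elements until the next aligned address; equals vector_size when already aligned.
  const int64_t shift_init = vector_size -
      static_cast<int64_t>(address % vector_size_in_bytes) / data_size_bytes;
  const int64_t shift = (shift_init == vector_size) ? 0 : shift_init;
  const int64_t remainder = (extent - shift) % vector_size;
  return {shift, remainder};
}

int main() {
  alignas(16) static float storage[1026];
  float* base = storage + 1; // deliberately misaligned by one element
  const int64_t extent = 1024;
  const int64_t vector_size = 4; // e.g. float4 accesses
  auto s = misalignedSections(base, extent, vector_size, sizeof(float));
  // The first vectorized element lands on an aligned address ...
  assert(reinterpret_cast<uintptr_t>(base + s.shift) % (vector_size * sizeof(float)) == 0);
  // ... and the middle section is a whole number of vectors.
  assert((extent - s.shift - s.remainder) % vector_size == 0);
  std::printf("initial [0,%lld), vectorized [%lld,%lld), remainder [%lld,%lld)\n",
              (long long)s.shift, (long long)s.shift, (long long)(extent - s.remainder),
              (long long)(extent - s.remainder), (long long)extent);
  return 0;
}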
-std::vector processMisalignedVectorization( - Fusion* fusion, - const std::vector& exprs); +std::vector processMisalignedVectorization( + const std::vector& exprs); bool containsAnyDirectChildMisalignedVectorize(const kir::ForLoop* fl); diff --git a/torch/csrc/jit/codegen/cuda/lower_predicate.cpp b/torch/csrc/jit/codegen/cuda/lower_predicate.cpp index 838d5d85d9e4..cda210989f17 100644 --- a/torch/csrc/jit/codegen/cuda/lower_predicate.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_predicate.cpp @@ -1,16 +1,13 @@ #include #include -#include #include #include #include #include #include -#include -#include +#include #include -#include #include #include #include @@ -23,37 +20,65 @@ namespace cuda { namespace { -class ConditionalFromPredicateModifier { +class ConditionalFromPredicateModifier : public kir::IrVisitor { public: - ConditionalFromPredicateModifier(const std::vector& exprs) { + ConditionalFromPredicateModifier() = delete; + + static std::vector fillPredicates(const std::vector& exprs) { + ConditionalFromPredicateModifier cfpm(exprs); + return cfpm.exprs_; + } + + private: + ConditionalFromPredicateModifier(const std::vector& exprs) { FUSER_PERF_SCOPE( "GpuLower::Lower::ConditionalFromPredicateModifier::process"); - for (auto* expr : exprs) { - handle(expr); - } + kir::IrVisitor::handle(exprs); } - const std::unordered_map& replacementMap() const { - return expr_replacement_map_; - } + using kir::IrVisitor::handle; - private: - void handle(kir::Expr* expr) { - if (auto for_loop = dynamic_cast(expr)) { - handle(for_loop); - } else if (auto ite = dynamic_cast(expr)) { - handle(ite); - } else if (expr != nullptr && expr->predicate() != nullptr) { + void handle(Expr* expr) final { + if (expr != nullptr && expr->predicate() != nullptr) { // Replace expr predicate with bool conditional auto conditional = generateConditional(expr->predicate()); + if (expr->predicate()->predicate_type() == PredicateType::Vectorize) { + // TODO: This logic doesn't seem to fit well here, for unswitch the + // logic is in the unroll loop to set the thread predicate to the expr. + // I didn't have a quick way to do that so placing this here for now. 
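// Standalone sketch (hypothetical structs) of the two-phase predication scheme the
// modifier above implements: earlier passes attach an empty predicate placeholder that
// records only its type, and this later pass fills in the concrete boolean condition,
// additionally AND-ing in a thread predicate for vectorized predicates. The real pass
// only adds the thread predicate when the vectorized input is not a constant scalar;
// that detail is dropped here for brevity.
#include <cassert>
#include <functional>
#include <optional>
#include <string>
#include <vector>

enum class PredicateType { Inline, Vectorize, Unswitch, Manual };

struct Predicate {
  PredicateType type;
  std::optional<std::string> condition; // filled in by the pass below
};

struct LoweredExpr {
  std::string text;
  Predicate* predicate = nullptr;
};

// Fill every unset predicate using a caller-provided generator, loosely mirroring
// generateConditional() above.
void fillPredicates(std::vector<LoweredExpr>& exprs,
                    const std::function<std::string(PredicateType)>& generate,
                    const std::string& thread_pred) {
  for (auto& e : exprs) {
    if (e.predicate == nullptr || e.predicate->condition.has_value()) {
      continue; // no placeholder, or already materialized
    }
    std::string cond = generate(e.predicate->type);
    if (e.predicate->type == PredicateType::Vectorize) {
      cond = "(" + cond + ") && (" + thread_pred + ")";
    }
    e.predicate->condition = cond;
  }
}

int main() {
  Predicate inline_pred{PredicateType::Inline, std::nullopt};
  Predicate vec_pred{PredicateType::Vectorize, std::nullopt};
  std::vector<LoweredExpr> exprs = {{"T1 = T0", &inline_pred},
                                    {"T3 = T2 (vectorized)", &vec_pred}};
  fillPredicates(
      exprs, [](PredicateType) { return std::string("i < T0.size"); },
      "threadIdx.x == 0");
  assert(*vec_pred.condition == "(i < T0.size) && (threadIdx.x == 0)");
  return 0;
}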
+ TORCH_INTERNAL_ASSERT( + expr->isA(), + "Predicate handling expects ITE statement."); + auto ite = expr->as(); + + TORCH_INTERNAL_ASSERT( + ite->thenBody().size() == 1, + "Expecting predicated body to only have one vectorized expression."); + auto vec_expr = ite->thenBody()[0]; + TORCH_INTERNAL_ASSERT( + vec_expr->isA(), + "Vectorize predicate exprs only supported on set operations."); + TORCH_INTERNAL_ASSERT( + ir_utils::isTvOp(vec_expr), + "Vectorize predicate exprs only supported on tensor view operations."); + if (!vec_expr->inputs()[0]->isConstScalar()) { + conditional = SimplifyingIrBuilder::andExpr( + conditional, + GpuLower::current()->threadPredMap().getPredicate( + ir_utils::getTvOutput(vec_expr))) + ->as(); + } + } TORCH_INTERNAL_ASSERT(conditional != nullptr); expr->predicate()->setValue(conditional); TORCH_INTERNAL_ASSERT(expr->predicate()->value() != nullptr); setWritePredicate(expr, conditional); } + + kir::IrVisitor::handle(expr); } - void setWritePredicate(kir::Expr* expr, kir::Bool* read_cond) { + void setWritePredicate(Expr* expr, Bool* read_cond) { if (expr->writePredicate() != nullptr) { auto write_cond = generateConditional(expr->writePredicate()); if (write_cond) { @@ -66,46 +91,25 @@ class ConditionalFromPredicateModifier { } } - void handle(kir::ForLoop* fl) { - for_loops_structure_.push_back(fl); - - const auto exprs_copy = fl->body().exprs(); - for (auto expr : exprs_copy) { - handle(expr); - } - - for_loops_structure_.pop_back(); - } - - void handle(kir::IfThenElse* ite) { + void handle(kir::IfThenElse* ite) final { TORCH_INTERNAL_ASSERT(ite->predicate() != nullptr); // If ite already has Bool conditional, handle internal expressions // Otherwise, generate conditional and update predicate - if (ite->predicate()->hasValue()) { - const auto then_exprs_copy = ite->thenBody().exprs(); - for (auto expr : then_exprs_copy) { - handle(expr); - } - - const auto else_exprs_copy = ite->elseBody().exprs(); - for (auto expr : else_exprs_copy) { - handle(expr); - } - } else { + if (!ite->predicate()->hasValue()) { auto conditional = generateConditional(ite->predicate()); TORCH_INTERNAL_ASSERT(conditional != nullptr); - TORCH_INTERNAL_ASSERT(conditional->isA()); + TORCH_INTERNAL_ASSERT(conditional->isA()); // Update bool conditional in-place ite->predicate()->setValue(conditional); - handle(ite); TORCH_INTERNAL_ASSERT(ite->predicate()->value() != nullptr); } + kir::IrVisitor::handle(ite); } // Generate conditional according to PredicateType - kir::Bool* generateConditional(kir::Predicate* pred) { + Bool* generateConditional(kir::Predicate* pred) { switch (pred->predicate_type()) { case PredicateType::Inline: case PredicateType::ReductionWrite: @@ -114,15 +118,16 @@ class ConditionalFromPredicateModifier { case PredicateType::Padding: { return PredicateCompute::getInlinePredicate( pred->expr(), - for_loops_structure_, + for_loops_, pred->thread_pred(), pred->predicate_type()); } case PredicateType::Vectorize: { std::vector outer_loops; kir::ForLoop* vectorized_loop = nullptr; - for (auto loop : for_loops_structure_) { - if (loop->iter_domain()->parallelType() == ParallelType::Vectorize) { + for (auto loop : for_loops_) { + if (loop->iter_domain()->getParallelType() == + ParallelType::Vectorize) { vectorized_loop = loop; break; } else { @@ -134,8 +139,7 @@ class ConditionalFromPredicateModifier { return UnswitchPredicate::get(outer_loops, vectorized_loop); } case PredicateType::Unswitch: { - return UnswitchPredicate::get( - for_loops_structure_, pred->unrolled_loop()); + 
return UnswitchPredicate::get(for_loops_, pred->unrolled_loop()); } case PredicateType::Manual: { return pred->value(); @@ -145,429 +149,13 @@ class ConditionalFromPredicateModifier { } return nullptr; } - - private: - // We will track which loops in the incoming IR will be replaced and by what - std::unordered_map expr_replacement_map_; - - // A depth-first ordering of nested for loops - // It is used for indexing and predicate generation - std::vector for_loops_structure_; -}; - -} // namespace - -std::vector generateConditionalFromPredicate( - Fusion* fusion, - const std::vector& exprs) { - FUSER_PERF_SCOPE("GpuLower::Lower::generateConditionalFromPredicate"); - - ConditionalFromPredicateModifier p2cm(exprs); - - std::vector mutated_exprs; - mutated_exprs.reserve(exprs.size()); - for (auto expr : exprs) { - mutated_exprs.push_back( - ir_utils::applyReplacements(p2cm.replacementMap(), expr)); - } - - return mutated_exprs; -} - -namespace { - -class PredicateAnalyzer : public OptOutDispatch { - public: - //! Checks if a predicate is needed to avoid out-of-bound accesses. - //! - //! Due to the way we allocate local-memory tensors, there should - //! never be out-of-bound accesses with consumer tensors when allocated on - //! local memory. However, accessing producer tensors still may - //! result in out-of-bound as they are replayed as consumers. - static bool needsPredicate(TensorView* producer, TensorView* consumer) { - // Both tensors must be on local memory. Global tensors must be - // predicated as allocation is done based on root domains. Smem - // and local tensors are allocated based on leaf domains, however, - // smem tensors are parallelized, which is highly likely, the size - // of the parallelized axis is the actual size of the axis, not - // the number of threads. Since the number of threads can be - // larger than the axis size, it's not safe to skip predication - if (!(producer->getMemoryType() == MemoryType::Local && - consumer->getMemoryType() == MemoryType::Local)) { - return true; - } - - auto pairwise_map = PairwiseRootDomainMap(producer, consumer); - auto c2p = - BestEffortReplay::replayPasC(producer, consumer, -1, pairwise_map) - .getReplay(); - - PredicateAnalyzer analyzer(c2p); - - for (auto id : consumer->domain()->domain()) { - if (analyzer.needsPredicate(id)) { - return true; - } - } - - return false; - } - - private: - PredicateAnalyzer(const std::unordered_map& c2p_map) - : c2p_map_(c2p_map) {} - - // Returns true if no out-of-bound accesses could occur with a - // producer - bool needsPredicate(IterDomain* consumer_id) { - needs_predicate_ = false; - handle(consumer_id); - return needs_predicate_; - } - - using OptOutDispatch::handle; - - void handle(IterDomain* consumer_id) override { - // The traversal should have ended if needs_predicate_ was true - TORCH_INTERNAL_ASSERT(!needs_predicate_); - - // If consumer_id is not going to be materialized as a loop (e.g., - // broadcast), no need to predicate - const auto gpu_lower = GpuLower::current(); - if (consumer_id->isBroadcast() || - gpu_lower->trivialReductionInfo().isDerived(consumer_id)) { - return; - } - - // If the producer has a matching domain, it should not cause - // out-of-bound accesses - if (c2p_map_.find(consumer_id) != c2p_map_.end()) { - return; - } - - // If no definition exists, stop traversing - if (consumer_id->definition() == nullptr) { - return; - } - - handle(consumer_id->definition()); - } - - // If it splits the input axis evenly, proceeds to check the input - // axis. 
Otherwise, we can't skip predication as it might cause - // out-bound accesses with the producer tensor - void handle(Split* split) override { - auto factor = split->factor()->getInt(); - if (!factor.has_value()) { - needs_predicate_ = true; - return; - } - - ExpressionEvaluator ee(split->fusion()); - const auto in_extent = ee.evaluate(split->in()->extent()); - - if (!in_extent.has_value() || ((in_extent.value() % factor.value()) != 0)) { - needs_predicate_ = true; - return; - } - - handle(split->in()); - } - - void handle(Merge* merge) override { - handle(merge->inner()); - if (needs_predicate_) { - return; - } - handle(merge->outer()); - } - - private: - //! BestEffort map from consumer IDs to producer IDs - const std::unordered_map& c2p_map_; - bool needs_predicate_ = false; }; } // namespace -bool PredicateElimination::needsPredicate(Expr* expr) const { - if (!ir_utils::isTVOp(expr)) { - return false; - } - - std::vector> filters; - - // Always predicate integer division and related ops as we don't - // know what values are in the out-of-bound region and they may - // cause exceptions - filters.emplace_back([](Expr* expr) { - auto dt = expr->outputs()[0]->getDataType().value(); - return ( - (dt == DataType::Int || dt == DataType::Int32) && - expr->isA() && - (expr->as()->getBinaryOpType() == BinaryOpType::Div || - expr->as()->getBinaryOpType() == BinaryOpType::Mod || - expr->as()->getBinaryOpType() == BinaryOpType::Remainder || - expr->as()->getBinaryOpType() == BinaryOpType::CeilDiv)); - }); - - // Skip if MisalignedVectorize is involved for now. This could be - // relaxed. - filters.emplace_back([](Expr* expr) { - std::vector*> inputs_and_outputs = { - &(expr->inputs()), &(expr->outputs())}; - for (const auto& inputs_or_outputs : inputs_and_outputs) { - for (auto tv : ir_utils::filterByType(*inputs_or_outputs)) { - if (std::any_of( - tv->domain()->domain().begin(), - tv->domain()->domain().end(), - [](IterDomain* axis) { - return axis->getParallelType() == - ParallelType::MisalignedVectorize; - })) { - return true; - } - } - } - return false; - }); - - // Shift is not supported yet. - filters.emplace_back([](Expr* expr) { - auto& halo_info = GpuLower::current()->haloInfo(); - auto input_tvs = ir_utils::filterByType(expr->inputs()); - return halo_info.needsShiftPredicate(expr) || - std::any_of(input_tvs.begin(), input_tvs.end(), [&](auto input_tv) { - return input_tv->definition() != nullptr && - halo_info.needsShiftPredicate(input_tv->definition()); - }); - }); - - // Predicates the expression if any producer-consumer pair of the - // expression needs to be predicated - filters.emplace_back([](Expr* expr) { - for (auto output : ir_utils::filterByType(expr->outputs())) { - for (auto input : ir_utils::filterByType(expr->inputs())) { - if (PredicateAnalyzer::needsPredicate(input, output)) { - return true; - } - } - } - return false; - }); - - // Predicates Welford ops - filters.emplace_back([](Expr* expr) { return expr->isA(); }); - - // If this is a reduction, and if we omit the predicate for the - // input, the input may have a garbabe value, which must not be used - // for this reduction. However, if the input is also an output of - // another reduction with the same binary op, which is a common - // pattern with rfactor, the input should be safe to use with no - // predication. 
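Both the removed needsPredicate above and its replacement in lower_predicate_elimination.cpp answer the same question: does any one of several independent conditions force the expression to keep its predicate? A standalone sketch of that shape, with made-up condition names (the real checks inspect IterDomains, halo info, memory types, and so on):

#include <algorithm>
#include <functional>
#include <iostream>
#include <vector>

struct FakeExpr {
  bool is_int_division = false;
  bool uses_misaligned_vectorize = false;
  bool has_halo = false;
};

// Each filter returns true if the expression must keep its predicate.
bool needsPredicate(const FakeExpr& expr) {
  std::vector<std::function<bool(const FakeExpr&)>> filters;
  filters.emplace_back([](const FakeExpr& e) { return e.is_int_division; });
  filters.emplace_back([](const FakeExpr& e) { return e.uses_misaligned_vectorize; });
  filters.emplace_back([](const FakeExpr& e) { return e.has_halo; });

  // The expression needs a predicate if any single check fires; elimination is
  // only safe when every filter returns false.
  return std::any_of(filters.begin(), filters.end(),
                     [&](const auto& f) { return f(expr); });
}

int main() {
  FakeExpr safe;
  FakeExpr div;
  div.is_int_division = true;
  std::cout << std::boolalpha << needsPredicate(safe) << " "
            << needsPredicate(div) << "\n";  // false true
}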
- filters.emplace_back([this](Expr* expr) { - if (expr->isA()) { - auto input = expr->inputs()[0]->as(); - auto input_def = input->definition(); - // When input_def is null, input must be an input to the fusion, - // so that must be allocated on global memory. Since we don't omit - // predication for expressions involving global memory, this - // should never occur. - TORCH_INTERNAL_ASSERT( - input_def != nullptr, "Inconsistent input found: ", input); - - if (non_predicated_exprs_.find(input_def) != - non_predicated_exprs_.end() && - !(input_def->isA() && - (expr->as()->getReductionOpType() == - input_def->as()->getReductionOpType()))) { - return true; - } - } - return false; - }); - - // If any of the filters returns true, predicate must be used. - return std::any_of(filters.begin(), filters.end(), [expr](auto filter) { - return filter(expr); - }); -} - -void PredicateElimination::handle(Expr* expr) { - if (!ir_utils::isTVOp(expr)) { - return; - } - - if (needsPredicate(expr)) { - return; - } - - non_predicated_exprs_.insert(expr); - - // Ensure all inputs have some values set at the out-of-bound - // regions - for (auto input : ir_utils::filterByType(expr->inputs())) { - auto input_def = input->definition(); - // When input_def is null, input must be an input to the fusion, - // so that must be allocated on global memory. Since we don't omit - // predication for expressions involving global memory, this - // should never occur. - std::stringstream ss; - ss << input; - TORCH_INTERNAL_ASSERT( - input_def != nullptr, "Inconsistent input found: ", ss.str()); - - // If input is an output of reduction, it should be fully - // initialied as it's allocated on local memory. - if (input_def->isA() || input_def->isA()) { - continue; - } - - // If this expr is reduction, always initilize the input with the - // default value. NOTE: This can be done more - // intelligently. A garbage value can only cause a problem when - // it's reduced with non-garbage values, so if the non-reduction - // axes do not have any garbage, it should be just fine without - // explicit initialization. However, initialization cost should be - // cheap, so that further optimization should not make a large - // difference. - if (expr->isA()) { - setReductionInitValue(input, expr->as()->init()); - continue; - } - - // If an input does not need a predicate either, then it should - // have some value, so no need to set a default value - if (non_predicated_exprs_.find(input_def) != non_predicated_exprs_.end()) { - continue; - } - - // Make sure input is initialized - setDefaultInitValue(input); - } -} - -bool PredicateElimination::setDefaultInitValue(TensorView* tv) { - auto it = init_value_map_.find(tv); - // If there's already a mapping for tv, it should be mapped to a - // zero val or a reduction init. Either case, no need to modify - // the existing mapping. - if (it == init_value_map_.end()) { - init_value_map_.insert({tv, nullptr}); - } - return true; -} - -bool PredicateElimination::setReductionInitValue( - TensorView* tv, - Val* reduction_init) { - auto it = init_value_map_.find(tv); - if (it == init_value_map_.end()) { - init_value_map_.insert({tv, reduction_init}); - return true; - } - - auto existing_val = it->second; - if (existing_val == nullptr) { - // If the existing mapping returns nullptr, it means that a - // default init was set before. Overwrite with the reduction - // init val. 
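The setDefaultInitValue/setReductionInitValue pair above maintains a map in which nullptr means "any zero-like default is acceptable" and a non-null value pins the tensor to a specific reduction identity; a default may later be upgraded to a reduction init, but two different reduction inits are an error. A standalone sketch of that bookkeeping, with std::optional<int> standing in for Val* (names and types here are illustrative only):

#include <iostream>
#include <optional>
#include <stdexcept>
#include <string>
#include <unordered_map>

// nullopt plays the role of the nullptr "default init" marker in the map.
class InitValueRegistry {
 public:
  void setDefault(const std::string& tv) {
    // Only insert if nothing is recorded yet; never downgrade a reduction init.
    init_map_.emplace(tv, std::nullopt);
  }

  void setReductionInit(const std::string& tv, int init) {
    auto it = init_map_.find(tv);
    if (it == init_map_.end() || !it->second.has_value()) {
      init_map_[tv] = init;  // new entry, or upgrade of a default
    } else if (*it->second != init) {
      throw std::runtime_error("Inconsistent init value for " + tv);
    }
  }

  std::optional<int> get(const std::string& tv) const {
    auto it = init_map_.find(tv);
    return it == init_map_.end() ? std::nullopt : it->second;
  }

 private:
  std::unordered_map<std::string, std::optional<int>> init_map_;
};

int main() {
  InitValueRegistry reg;
  reg.setDefault("T1");
  reg.setReductionInit("T1", 0);  // upgrades the default to the add identity
  reg.setReductionInit("T1", 0);  // same value again: fine
  std::cout << reg.get("T1").value() << "\n";  // 0
}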
- init_value_map_[tv] = reduction_init; - return true; - } else if (existing_val->sameAs(reduction_init)) { - return true; - } else { - TORCH_INTERNAL_ASSERT( - false, - "Incosistent setting of initialization value for t", - tv->name(), - ". Prev: ", - existing_val, - ", New: ", - reduction_init); - return false; - } -} - -bool PredicateElimination::canOmitPredicate(const Expr* expr) const { - TORCH_INTERNAL_ASSERT(expr != nullptr); - const auto out_tv = ir_utils::getTVOutput(expr); - TORCH_INTERNAL_ASSERT(out_tv != nullptr, "Not a tensor expression"); - // No need to predicate local tensors to which a scalar is assigned - if (out_tv->getMemoryType() == MemoryType::Local) { - if (auto uop = dynamic_cast(expr)) { - if (uop->getUnaryOpType() == UnaryOpType::Set && uop->in()->isScalar()) { - return true; - } - } - } - if (non_predicated_exprs_.find(expr) != non_predicated_exprs_.end()) { - return true; - } - - return false; -} - -bool PredicateElimination::canOmitPredicate(const kir::Expr* kir_expr) const { - TORCH_INTERNAL_ASSERT(kir_expr != nullptr); - const auto out_tv = ir_utils::getTVOutput(kir_expr); - TORCH_INTERNAL_ASSERT(out_tv != nullptr, "Not a tensor expression"); - // No need to predicate local tensors to which a scalar is assigned - if (out_tv->memoryType() == MemoryType::Local) { - if (auto uop = dynamic_cast(kir_expr)) { - if (uop->operation() == UnaryOpType::Set && uop->in()->isScalar()) { - return true; - } - } - } - const auto fuser_tv = out_tv->fuserTv(); - if (fuser_tv == nullptr) { - return false; - } - return canOmitPredicate(fuser_tv->definition()); -} - -kir::Val* PredicateElimination::getInitValue(TensorView* tv) const { - auto it = init_value_map_.find(tv); - if (it == init_value_map_.end()) { - return nullptr; - } - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - auto init_val = it->second; - if (init_val == nullptr) { - // No reduction restriction. Just use zero - return ir_builder.zeroVal(); - } else { - return gpu_lower->lowerValue(init_val); - } -} - -void PredicateElimination::build(Fusion* fusion) { - traverseFrom(fusion, fusion->outputs()); -} - -std::string PredicateElimination::toString() const { - std::stringstream ss; - ss << "Tensors that do not need predication:"; - for (auto expr : non_predicated_exprs_) { - for (auto out : expr->outputs()) { - TORCH_INTERNAL_ASSERT(out->isA()); - ss << " T" << out->name(); - } - } - ss << "\n"; - ss << "Init values:"; - for (auto kv : init_value_map_) { - ss << " T" << kv.first->name() << "->"; - if (kv.second == nullptr) { - ss << ""; - } else { - ss << kv.second; - } - } - ss << "\n"; - return ss.str(); +std::vector generateConditionalFromPredicate( + const std::vector& exprs) { + return ConditionalFromPredicateModifier::fillPredicates(exprs); } } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/lower_predicate.h b/torch/csrc/jit/codegen/cuda/lower_predicate.h index 393d0fa5c184..7f4926dad917 100644 --- a/torch/csrc/jit/codegen/cuda/lower_predicate.h +++ b/torch/csrc/jit/codegen/cuda/lower_predicate.h @@ -1,5 +1,5 @@ #pragma once -#include +#include #include #include @@ -13,49 +13,8 @@ namespace cuda { //! Update predicates with valid bool conditionals //! -std::vector generateConditionalFromPredicate( - Fusion* fusion, - const std::vector& exprs); - -class TORCH_CUDA_CU_API PredicateElimination : public IterVisitor { - public: - void build(Fusion* fusion); - - //! True if expr does not need a predicate - //! - //! 
\param expr Tensor expression - bool canOmitPredicate(const Expr* expr) const; - - //! True if expr does not need a predicate - //! - //! \param expr KIR tensor expr - bool canOmitPredicate(const kir::Expr* expr) const; - - //! Value to initialize out-of-bound regions - kir::Val* getInitValue(TensorView* tv) const; - - //! Dump to string for debugging - std::string toString() const; - - private: - using IterVisitor::handle; - - void handle(Expr* expr) override; - - //! Set a value to initialize out-of-bound regions - bool setDefaultInitValue(TensorView* tv); - //! Set a value to initialize out-of-bound regions of reduction tensors - bool setReductionInitValue(TensorView* tv, Val* reduction_init); - - //! Check if expr needs to be predicated - bool needsPredicate(Expr* expr) const; - - private: - //! Expressions that are found to be safe without predicates - std::unordered_set non_predicated_exprs_; - //! Tensors and their initialization values - std::unordered_map init_value_map_; -}; +std::vector generateConditionalFromPredicate( + const std::vector& exprs); } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/lower_predicate_elimination.cpp b/torch/csrc/jit/codegen/cuda/lower_predicate_elimination.cpp new file mode 100644 index 000000000000..53fccbdfc5c6 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_predicate_elimination.cpp @@ -0,0 +1,715 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +namespace { + +// Warp primitives are currently limited to un-predicated usage, +// predicating these ops will require extra steps to ensure that +// the whole warp will get the same value. +void assertOnWarpOps(const Expr* expr) { + TORCH_INTERNAL_ASSERT( + !expr->isA(), + "Mma op: cannot eliminate predicate for mma op, tiling not valid. ", + expr->toString()); +} + +} // namespace + +namespace { + +class PredicateAnalyzer : public OptOutDispatch { + public: + //! Checks if a predicate is needed to avoid out-of-bound accesses. + //! + //! Due to the way we allocate local-memory tensors, there should + //! never be out-of-bound accesses with consumer tensors when allocated on + //! local memory. However, accessing producer tensors still may + //! result in out-of-bound as they are replayed as consumers. + static bool needsPredicate(TensorView* producer, TensorView* consumer) { + // Both tensors must be on local memory. Global tensors must be + // predicated as allocation is done based on root domains. Smem + // and local tensors are allocated based on leaf domains, however, + // smem tensors are parallelized, which is highly likely, the size + // of the parallelized axis is the actual size of the axis, not + // the number of threads. 
Since the number of threads can be + // larger than the axis size, it's not safe to skip predication + + // Check that parallel dimension will not generate out of bound index + if (!(producer->getMemoryType() == MemoryType::Local && + consumer->getMemoryType() == MemoryType::Local)) { + return true; + } + + auto pairwise_map = PairwiseRootDomainMap(producer, consumer); + auto c2p = + BestEffortReplay::replayPasC(producer, consumer, -1, pairwise_map) + .getReplay(); + + PredicateAnalyzer analyzer(c2p); + + for (auto id : consumer->domain()->domain()) { + if (analyzer.needsPredicate(id)) { + return true; + } + } + + return false; + } + + private: + PredicateAnalyzer(const std::unordered_map& c2p_map) + : c2p_map_(c2p_map) {} + + // Returns true if no out-of-bound accesses could occur with a + // producer + bool needsPredicate(IterDomain* consumer_id) { + needs_predicate_ = false; + handle(consumer_id); + return needs_predicate_; + } + + void handle(IterDomain* consumer_id) override { + // The traversal should have ended if needs_predicate_ was true + TORCH_INTERNAL_ASSERT(!needs_predicate_); + + // If consumer_id is not going to be materialized as a loop (e.g., + // broadcast), no need to predicate + if (consumer_id->isBroadcast() || + GpuLower::current()->trivialReductionInfo().isDerived(consumer_id)) { + return; + } + + // If the producer has a matching domain, it should not cause + // out-of-bound accesses + if (c2p_map_.find(consumer_id) != c2p_map_.end()) { + return; + } + + // If no definition exists, stop traversing + if (consumer_id->definition() == nullptr) { + return; + } + + OptOutDispatch::handle(consumer_id->definition()); + } + + // If it splits the input axis evenly, proceeds to check the input + // axis. Otherwise, we can't skip predication as it might cause + // out-bound accesses with the producer tensor + void handle(Split* split) override { + auto factor = split->factor()->getInt(); + if (!factor.has_value()) { + needs_predicate_ = true; + return; + } + + ExpressionEvaluator ee(split->fusion()); + const auto in_extent = ee.evaluate(split->in()->extent()); + + if (!in_extent.has_value() || ((in_extent.value() % factor.value()) != 0)) { + needs_predicate_ = true; + return; + } + + handle(split->in()); + } + + void handle(Merge* merge) override { + handle(merge->inner()); + if (needs_predicate_) { + return; + } + handle(merge->outer()); + } + + private: + //! BestEffort map from consumer IDs to producer IDs + const std::unordered_map& c2p_map_; + bool needs_predicate_ = false; +}; + +class PredicateChcker : public IterVisitor { + public: + static bool needsPredicate( + Expr* expr, + const std::unordered_set& non_predicated_exprs) { + if (!ir_utils::isTvOp(expr)) { + return false; + } + + PredicateChcker checker(non_predicated_exprs); + checker.handle(expr); + return checker.needs_predicate_; + } + + private: + PredicateChcker(const std::unordered_set& non_predicated_exprs) + : non_predicated_exprs_(non_predicated_exprs) {} + + using IterVisitor::handle; + + void handle(Expr* expr) final { + needs_predicate_ = predicateIntDiv(expr) || + predicateMisalignedVectorize(expr) || predicateShift(expr) || + predicateProducerConsumerPair(expr) || + predicateNonDivisibleRootDomains(expr) || + predicateNonDivisibleSplit(expr); + + if (needs_predicate_) { + return; + } + + // Check ExprType-specific conditions + IterVisitor::handle(expr); + } + + // All "predicateXYZ" functions return true if an expr needs to be + // predicated. 
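PredicateAnalyzer::handle(Split*) below only allows predication to be skipped when the split factor is a known constant that evenly divides an also-known input extent; anything unknown or non-divisible is treated conservatively. A minimal numeric sketch of that rule, using std::optional where the real code consults ExpressionEvaluator:

#include <iostream>
#include <optional>

// Returns true if the split may produce out-of-bound producer accesses and
// therefore needs a predicate; unknown extents or factors are handled
// conservatively.
bool splitNeedsPredicate(std::optional<long> in_extent,
                         std::optional<long> factor) {
  if (!factor.has_value() || !in_extent.has_value()) {
    return true;  // cannot prove divisibility at compile time
  }
  return (*in_extent % *factor) != 0;
}

int main() {
  std::cout << std::boolalpha;
  std::cout << splitNeedsPredicate(128, 32) << "\n";           // false: 128 % 32 == 0
  std::cout << splitNeedsPredicate(100, 32) << "\n";           // true: non-divisible
  std::cout << splitNeedsPredicate(std::nullopt, 32) << "\n";  // true: unknown extent
}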
+ + // Always predicate integer division and related ops as we don't + // know what values are in the out-of-bound region and they may + // cause exceptions + bool predicateIntDiv(Expr* expr) const { + auto dt = expr->outputs()[0]->getDataType().value(); + return ( + (dt == DataType::Int || dt == DataType::Int32) && + expr->isA() && + (expr->as()->getBinaryOpType() == BinaryOpType::Div || + expr->as()->getBinaryOpType() == BinaryOpType::Mod || + expr->as()->getBinaryOpType() == BinaryOpType::Remainder || + expr->as()->getBinaryOpType() == BinaryOpType::CeilDiv)); + } + + // Skip if MisalignedVectorize is involved for now. This could be + // relaxed. + bool predicateMisalignedVectorize(Expr* expr) const { + std::vector*> inputs_and_outputs = { + &(expr->inputs()), &(expr->outputs())}; + for (const auto& inputs_or_outputs : inputs_and_outputs) { + for (auto tv : ir_utils::filterByType(*inputs_or_outputs)) { + if (std::any_of( + tv->domain()->domain().begin(), + tv->domain()->domain().end(), + [](IterDomain* axis) { + return axis->getParallelType() == + ParallelType::MisalignedVectorize; + })) { + return true; + } + } + } + return false; + } + + // Shift is not supported yet. + bool predicateShift(Expr* expr) const { + auto& halo_info = GpuLower::current()->haloInfo(); + auto input_tvs = ir_utils::filterByType(expr->inputs()); + return halo_info.needsShiftPredicate(expr) || + std::any_of(input_tvs.begin(), input_tvs.end(), [&](auto input_tv) { + return input_tv->definition() != nullptr && + halo_info.needsShiftPredicate(input_tv->definition()); + }); + } + + // Predicates the expression if any producer-consumer pair of the + // expression needs to be predicated + bool predicateProducerConsumerPair(Expr* expr) const { + for (auto output : ir_utils::filterByType(expr->outputs())) { + for (auto input : ir_utils::filterByType(expr->inputs())) { + if (PredicateAnalyzer::needsPredicate(input, output)) { + return true; + } + } + } + return false; + } + + // An index can exceed the logical extent of the indexed domain if + // it's split. It can cause a reduction op to reduce the same value + // multiple times. Even a pointwise op can be a problem if the + // consumer is an alias of the producer. This check excludes such + // expressions from predicate elimination. + // + // This is not an issue if the index includes a zero domain (as defined in + // index_compute.cpp), the extent is calculated by multiplying the + // split output domains, so it never cross the domain boundary. + // So, if a root domain is split and none of its descendants is a + // zero domain, the expr needs to be predicated. See + // FusionPredicateElimination6 for a concrete example. + // + // It would be also possible to avoid register aliasing instead of + // giving up predicate elimination. Since this condition should be + // rather uncommon, either would be fine as long as correctness is + // provided. 
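The comment above is easier to see with numbers: splitting a root domain of extent 10 by a factor of 4 yields ceilDiv(10, 4) = 3 outer iterations, so the loop nest visits 3 * 4 = 12 index values and the last two fall outside the logical extent; without a predicate a reduction would fold those stale register values into the result. A small standalone sketch of that count, purely illustrative:

#include <iostream>

int main() {
  const long extent = 10;  // logical size of the split root domain
  const long factor = 4;   // split factor

  const long outer = (extent + factor - 1) / factor;  // ceilDiv(extent, factor)
  const long visited = outer * factor;                // indices the loop nest produces
  const long out_of_bounds = visited - extent;        // indices past the logical extent

  std::cout << "outer=" << outer << " visited=" << visited
            << " out_of_bounds=" << out_of_bounds << "\n";
  // outer=3 visited=12 out_of_bounds=2
}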
+ bool predicateNonDivisibleRootDomains(Expr* expr) const { + for (auto output : ir_utils::filterByType(expr->outputs())) { + const auto all_exprs = DependencyCheck::getAllExprsBetween( + {output->getMaybeRFactorDomain().begin(), + output->getMaybeRFactorDomain().end()}, + {output->domain()->domain().begin(), + output->domain()->domain().end()}); + std::unordered_set split_root; + std::copy_if( + output->getMaybeRFactorDomain().begin(), + output->getMaybeRFactorDomain().end(), + std::inserter(split_root, split_root.end()), + [&](auto rf_root) { + if (rf_root->isBroadcast() || + GpuLower::current()->trivialReductionInfo().isDerived( + rf_root)) { + return false; + } + for (Expr* use : rf_root->uses()) { + if (std::find(all_exprs.begin(), all_exprs.end(), use) == + all_exprs.end()) { + continue; + } + return use->isA(); + } + return false; + }); + // If no root domain is split, no need to predicate + if (split_root.empty()) { + continue; + } + TORCH_INTERNAL_ASSERT( + output->getMemoryType() == MemoryType::Local, + "Local memory tensor is assumed: ", + output->toString()); + std::vector zero_leaf_ids; + for (const auto i : c10::irange(output->nDims())) { + auto leaf_id = output->axis(i); + if (i < output->getComputeAtPosition() || leaf_id->isThread() || + leaf_id->isMma()) { + zero_leaf_ids.push_back(leaf_id); + } + } + if (zero_leaf_ids.empty()) { + return true; + } + const auto vals = + DependencyCheck::getAllValsBetween(split_root, zero_leaf_ids); + if (std::any_of( + split_root.begin(), + split_root.end(), + [&vals](auto split_root_id) { + return std::find(vals.begin(), vals.end(), split_root_id) == + vals.end(); + })) { + return true; + } + } + return false; + } + + // Always predicate if non-divisible split is found. It may be + // possible to make it less conservative. + // See FusionPredicateElimination7 for a concrete example. + bool predicateNonDivisibleSplit(Expr* expr) const { + const auto& non_divisible_split_info = + GpuLower::current()->nonDivisibleSplitInfo(); + for (auto output : ir_utils::filterByType(expr->outputs())) { + if (non_divisible_split_info.splitsToPredicate().find(output) != + non_divisible_split_info.splitsToPredicate().end()) { + return true; + } + } + return false; + } + + // If this is a reduction, and if we omit the predicate for the + // input, the input may have a garbabe value, which must not be used + // for this reduction. However, it is still legal to omit its + // predicate when: 1) the predicate of the input is not omitted and + // 2) the input can be initialized to the init value of this + // reduction. When the input is the output of another reduciton, the + // input is initialized to the init value of the reduction, so the + // two reductions must use the same init value. + // See FusionPredicateElimination3 and FusionPredicateElimination4 + // for concrete examples. + void handle(ReductionOp* rop) final { + auto input = rop->inputs()[0]->as(); + auto input_def = input->definition(); + // When input_def is null, input must be an input to the fusion, + // so that must be allocated on global memory. Since we don't omit + // predication for expressions involving global memory, this + // should never occur. + TORCH_INTERNAL_ASSERT( + input_def != nullptr, "Inconsistent input found: ", input); + + // The input needs to be initialized to the init value to omit + // the predicate, so if the input has its own init value, i.e., + // produced by another reduction, they must use the same init + // value. 
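The requirement below that a non-predicated reduction input be initialized to the reduction's own identity can be seen with a padded sum: if the out-of-bounds slots hold the additive identity the result is unchanged, while any other value corrupts it. A standalone numeric sketch (not the kernel code itself, just the arithmetic it protects against):

#include <iostream>
#include <vector>

// Sum the first `valid` entries of a buffer padded to a multiple of 4,
// but (like a predicate-free kernel) accumulate over the whole buffer.
long paddedSum(std::vector<long> buf, long valid, long pad_value) {
  for (size_t i = valid; i < buf.size(); ++i) {
    buf[i] = pad_value;  // whatever the out-of-bounds region happens to contain
  }
  long sum = 0;
  for (long v : buf) {
    sum += v;
  }
  return sum;
}

int main() {
  std::vector<long> data = {1, 2, 3, 4, 5, 0, 0, 0};  // 5 valid values, padded to 8
  std::cout << paddedSum(data, 5, 0) << "\n";  // 15: identity padding is harmless
  std::cout << paddedSum(data, 5, 7) << "\n";  // 36: garbage padding corrupts the sum
}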
+ Val* input_init = ir_utils::getReductionInitValOf(input); + if (input_init != nullptr && !rop->init()->sameAs(input_init)) { + needs_predicate_ = true; + return; + } + + // If input is not predicated, out-of-bound value may be + // overwritten by a garbage value. However, it doesn't matter if + // the input is also produced by another reduction. If the preceding + // reduction omits the predicate, it means its input must be + // initialized to its init value, so no predicate should be + // needed in both of the two reduction ops if they use the same + // init value, which is guaranteed by the above check, and the + // same reduction op. + if (auto input_def_rop = dynamic_cast(input_def)) { + if (rop->getReductionOpType() != input_def_rop->getReductionOpType() && + non_predicated_exprs_.find(input_def) != + non_predicated_exprs_.end()) { + needs_predicate_ = true; + return; + } + } else if ( + non_predicated_exprs_.find(input_def) != non_predicated_exprs_.end()) { + needs_predicate_ = true; + return; + } + } + + // Welford. See FusionPredicateElimination5. + void handle(WelfordOp* wop) final { + for (const auto i : c10::irange(3)) { + auto init = wop->getInitVals()[i]; + + // Welford input can be a scalar. Predicate is required unless + // the scalar value is equal to the init value. + auto input = wop->inputs().at(i); + if (input->isScalar()) { + if (!input->sameAs(init)) { + needs_predicate_ = true; + return; + } + continue; + } + + auto input_tv = dynamic_cast(input); + TORCH_INTERNAL_ASSERT(input_tv != nullptr); + + auto input_def = input->definition(); + + // When input_def is null, input must be an input to the fusion, + // so that must be allocated on global memory. Since we don't omit + // predication for expressions involving global memory, this + // should never occur. + TORCH_INTERNAL_ASSERT( + input_def != nullptr, "Inconsistent input found: ", input); + + // The input needs to be initialized to the init value to omit + // the predicate, so if the input has its own init value, i.e., + // produced by another reduction, they must use the same init + // value. + Val* input_init = ir_utils::getReductionInitValOf(input_tv); + if (input_init != nullptr && !init->sameAs(input_init)) { + needs_predicate_ = true; + return; + } + + // If input is not predicated, out-of-bound value may be + // overwritten by a garbage value. However, it doesn't matter if + // the input is also produced by another welford. + if (!input_def->isA() && + non_predicated_exprs_.find(input_def) != + non_predicated_exprs_.end()) { + needs_predicate_ = true; + } + } + } + + void handle(GroupedReductionOp* grouped_rop) final { + for (const auto i : c10::irange(grouped_rop->numReductions())) { + auto input = grouped_rop->input(i)->as(); + auto input_def = input->definition(); + // When input_def is null, input must be an input to the fusion, + // so that must be allocated on global memory. Since we don't omit + // predication for expressions involving global memory, this + // should never occur. + TORCH_INTERNAL_ASSERT( + input_def != nullptr, "Inconsistent input found: ", input); + + // The input needs to be initialized to the init value to omit + // the predicate, so if the input has its own init value, i.e., + // produced by another reduction, they must use the same init + // value. 
+ Val* input_init = ir_utils::getReductionInitValOf(input); + if (input_init != nullptr && + !grouped_rop->initVal(i)->sameAs(input_init)) { + needs_predicate_ = true; + return; + } + + // If input is not predicated, out-of-bound value may be + // overwritten by a garbage value. However, it doesn't matter if + // the input is also produced by another reduction. If the preceding + // reduction omits the predicate, it means its input must be + // initialized to its init value, so no predicate should be + // needed in both of the two reduction ops if they use the same + // init value, which is guaranteed by the above check, and the + // same reduction op. + if (auto input_def_rop = dynamic_cast(input_def)) { + if (grouped_rop->getReductionOpType(i) != + input_def_rop->getReductionOpType() && + non_predicated_exprs_.find(input_def) != + non_predicated_exprs_.end()) { + needs_predicate_ = true; + return; + } + } else if ( + auto input_def_grouped_rop = + dynamic_cast(input_def)) { + auto input_index_as_output = std::distance( + input_def_grouped_rop->outputs().begin(), + std::find( + input_def_grouped_rop->outputs().begin(), + input_def_grouped_rop->outputs().end(), + input)); + if (grouped_rop->getReductionOpType(i) != + input_def_grouped_rop->getReductionOpType( + input_index_as_output) && + non_predicated_exprs_.find(input_def) != + non_predicated_exprs_.end()) { + needs_predicate_ = true; + return; + } + } else if ( + non_predicated_exprs_.find(input_def) != + non_predicated_exprs_.end()) { + needs_predicate_ = true; + return; + } + } + } + + // Similar to the above reduction constraint but for MMA + void handle(MmaOp* mma) final { + for (auto input : ir_utils::filterByType(mma->inputs())) { + auto input_def = input->definition(); + TORCH_INTERNAL_ASSERT( + input_def != nullptr, "Inconsistent input found: ", input); + + Val* input_init = ir_utils::getReductionInitValOf(input); + if (input_init != nullptr && !mma->init()->sameAs(input_init)) { + needs_predicate_ = true; + return; + } + + if (non_predicated_exprs_.find(input_def) != + non_predicated_exprs_.end()) { + needs_predicate_ = true; + return; + } + } + } + + private: + const std::unordered_set& non_predicated_exprs_; + bool needs_predicate_ = false; +}; + +} // namespace + +bool PredicateElimination::needsPredicate(Expr* expr) const { + return PredicateChcker::needsPredicate(expr, non_predicated_exprs_); +} + +void PredicateElimination::handle(Expr* expr) { + if (!ir_utils::isTvOp(expr)) { + return; + } + + if (needsPredicate(expr)) { + assertOnWarpOps(expr); + return; + } + + non_predicated_exprs_.insert(expr); + + // Ensure all inputs have some values set at the out-of-bound + // regions + for (const auto i : c10::irange(expr->inputs().size())) { + auto input = dynamic_cast(expr->inputs()[i]); + if (input == nullptr) { + continue; + } + auto input_def = input->definition(); + // When input_def is null, input must be an input to the fusion, + // so that must be allocated on global memory. Since we don't omit + // predication for expressions involving global memory, this + // should never occur. + TORCH_INTERNAL_ASSERT( + input_def != nullptr, "Inconsistent input found: ", input->toString()); + + // If input is an output of reduction, it should be fully + // initialied as it's allocated on local memory. 
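For background on the init values referenced throughout these handlers: the lowering code reads the init value stored on the ReductionOp, GroupedReductionOp, WelfordOp, or MmaOp node itself, but conceptually each reduction type has a familiar identity element. A small illustrative sketch of those identities (the enum and function names here are made up, not nvfuser API):

#include <iostream>
#include <limits>

enum class ReductionOpType { Add, Mul, Max, Min };

// Typical identity elements; nvfuser takes the init value recorded on the
// reduction expression rather than recomputing it like this.
double reductionIdentity(ReductionOpType op) {
  switch (op) {
    case ReductionOpType::Add:
      return 0.0;
    case ReductionOpType::Mul:
      return 1.0;
    case ReductionOpType::Max:
      return std::numeric_limits<double>::lowest();
    case ReductionOpType::Min:
      return std::numeric_limits<double>::max();
  }
  return 0.0;  // unreachable
}

int main() {
  std::cout << reductionIdentity(ReductionOpType::Add) << " "
            << reductionIdentity(ReductionOpType::Max) << "\n";
}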
+ if (ir_utils::isReductionOp(input_def)) { + continue; + } + + if (expr->isA()) { + setReductionInitValue(input, expr->as()->init()); + continue; + } else if (expr->isA()) { + setReductionInitValue(input, expr->as()->initVal(i)); + continue; + } else if (auto wop = dynamic_cast(expr)) { + Val* init = wop->getInitVals().at(i); + setReductionInitValue(input, init); + continue; + } else if (expr->isA()) { + setReductionInitValue(input, expr->as()->init()); + continue; + } else if ( + non_predicated_exprs_.find(input_def) != non_predicated_exprs_.end()) { + // If an input does not need a predicate either, then it should + // have some value, so no need to set a default value + continue; + } else { + // Make sure input is initialized + setDefaultInitValue(input); + } + } +} + +bool PredicateElimination::setDefaultInitValue(TensorView* tv) { + auto it = init_value_map_.find(tv); + // If there's already a mapping for tv, it should be mapped to a + // zero val or a reduction init. Either case, no need to modify + // the existing mapping. + if (it == init_value_map_.end()) { + init_value_map_.insert({tv, nullptr}); + } + return true; +} + +bool PredicateElimination::setReductionInitValue( + TensorView* tv, + Val* reduction_init) { + TORCH_INTERNAL_ASSERT(tv != nullptr); + + auto it = init_value_map_.find(tv); + if (it == init_value_map_.end()) { + init_value_map_.insert({tv, reduction_init}); + return true; + } + + auto existing_val = it->second; + if (existing_val == nullptr) { + // If the existing mapping returns nullptr, it means that a + // default init was set before. Overwrite with the reduction + // init val. + init_value_map_[tv] = reduction_init; + return true; + } else if (existing_val->sameAs(reduction_init)) { + return true; + } else { + TORCH_INTERNAL_ASSERT( + false, + "Incosistent setting of initialization value for t", + tv->name(), + ". Prev: ", + existing_val, + ", New: ", + reduction_init); + return false; + } +} + +bool PredicateElimination::canOmitPredicate(const Expr* expr) const { + // Predicate elimination can be disabled with + // PYTORCH_NVFUSER_DISABLE=predicate_elimination + if (isDisabled(DisableOption::PredicateElimination)) { + assertOnWarpOps(expr); + return false; + } + + TORCH_INTERNAL_ASSERT(expr != nullptr); + const auto out_tv = ir_utils::getTvOutput(expr); + TORCH_INTERNAL_ASSERT(out_tv != nullptr, "Not a tensor expression"); + // No need to predicate local tensors to which a scalar is assigned + if (out_tv->getMemoryType() == MemoryType::Local) { + if (auto uop = dynamic_cast(expr)) { + if (uop->getUnaryOpType() == UnaryOpType::Set && uop->in()->isScalar()) { + return true; + } + } + } + if (non_predicated_exprs_.find(expr) != non_predicated_exprs_.end()) { + return true; + } + + assertOnWarpOps(expr); + return false; +} + +void PredicateElimination::propagateRemovalInfo( + const Expr* from, + const Expr* to) { + if (non_predicated_exprs_.count(from)) { + non_predicated_exprs_.insert(to); + } +} + +Val* PredicateElimination::getInitValue(TensorView* tv) const { + auto it = init_value_map_.find(tv); + if (it == init_value_map_.end()) { + return nullptr; + } + auto init_val = it->second; + if (init_val == nullptr) { + // No reduction restriction. 
Just use zero + return GpuLower::current()->kernel()->zeroVal(); + } else { + return init_val; + } +} + +void PredicateElimination::build(Fusion* fusion) { + traverseFrom(fusion, fusion->outputs()); +} + +std::string PredicateElimination::toString() const { + std::stringstream ss; + ss << "Tensors that do not need predication:"; + for (auto expr : non_predicated_exprs_) { + for (auto out : expr->outputs()) { + TORCH_INTERNAL_ASSERT(out->isA()); + ss << " T" << out->name(); + } + } + ss << "\n"; + ss << "Init values:"; + for (auto kv : init_value_map_) { + ss << " T" << kv.first->name() << "->"; + if (kv.second == nullptr) { + ss << ""; + } else { + ss << kv.second; + } + } + ss << "\n"; + return ss.str(); +} + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_predicate_elimination.h b/torch/csrc/jit/codegen/cuda/lower_predicate_elimination.h new file mode 100644 index 000000000000..557796ce9d4d --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_predicate_elimination.h @@ -0,0 +1,64 @@ +#pragma once +#include + +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +class TORCH_CUDA_CU_API PredicateElimination : public IterVisitor { + public: + void build(Fusion* fusion); + + //! True if expr does not need a predicate + //! + //! \param expr Tensor expression + bool canOmitPredicate(const Expr* expr) const; + + //! Value to initialize out-of-bound regions + Val* getInitValue(TensorView* tv) const; + + //! Dump to string for debugging + std::string toString() const; + + // A utility to set removal info of `to` the same as `from`. + // See issue #1641 + // We build predicate info before lowering but more expressions + // are created during lowering that this class also need to + // keep track of to make sure correct predicate removal is + // applied. + // This utility is a quick patch for the missing information + // since it might be better just to recompute predicate info + // if all expressions were mutated, but that'd take much more + // global info to reliably track. + void propagateRemovalInfo(const Expr* from, const Expr* to); + + private: + using IterVisitor::handle; + + void handle(Expr* expr) final; + + //! Set a value to initialize out-of-bound regions + bool setDefaultInitValue(TensorView* tv); + //! Set a value to initialize out-of-bound regions of reduction tensors + bool setReductionInitValue(TensorView* tv, Val* reduction_init); + + //! Check if expr needs to be predicated + bool needsPredicate(Expr* expr) const; + + private: + //! Expressions that are found to be safe without predicates + std::unordered_set non_predicated_exprs_; + //! Tensors and their initialization values + std::unordered_map init_value_map_; +}; + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_replace_size.cpp b/torch/csrc/jit/codegen/cuda/lower_replace_size.cpp new file mode 100644 index 000000000000..beec550e537f --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_replace_size.cpp @@ -0,0 +1,233 @@ +#include +#include +#include +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +namespace { +// Going to generate a map of tensor view root domain extents to reduce the +// number used during lowering. 
For example if we have: +// +// T2[i0, i1] = T1[i0, i1] + T2[i2, i3] +// +// We know it would be safe to use: +// +// T2[i0, i1] = T1[i0, i1] + T2[i0, i1] +// +// And that way we don't generate T2.size[0] and T2.size[1], instead we will +// reuse T1.size[0] and T1.size[1] +// This is important when doing CSE as T2 and T1 would otherwise look like +// they're using different values, even though we know they're the same +// +// There's some duplicate logic here that's in computeAt map, but it's not so +// concice there to pull out. May want to consider making this mapping its own +// class especially as it may be useful during scheduling. +std::unordered_map getSimplificationMap(Fusion* fusion) { + std::list> disjoint_root_sets; + std::unordered_map*> + id_to_disjoint_root_set; + + auto map_root_ids = [&disjoint_root_sets, &id_to_disjoint_root_set]( + IterDomain* id0, IterDomain* id1) { + if (id0->isBroadcast() || id1->isBroadcast()) { + return; + } + + auto disjoint_set_0_it = id_to_disjoint_root_set.find(id0); + auto disjoint_set_1_it = id_to_disjoint_root_set.find(id1); + bool set_0_found = disjoint_set_0_it != id_to_disjoint_root_set.end(); + bool set_1_found = disjoint_set_1_it != id_to_disjoint_root_set.end(); + + if (set_0_found && set_1_found) { + if (disjoint_set_0_it->second == disjoint_set_1_it->second) { + return; + } + // merge second disjoint set into first + auto* set_0 = disjoint_set_0_it->second; + auto* set_1 = disjoint_set_1_it->second; + for (auto id : *set_1) { + set_0->emplace(id); + id_to_disjoint_root_set[id] = set_0; + } + // remove second set from disjoint_root_sets + disjoint_root_sets.erase(std::find( + disjoint_root_sets.begin(), disjoint_root_sets.end(), *set_1)); + } else if (set_0_found || set_1_found) { + auto existing_set = + set_0_found ? disjoint_set_0_it->second : disjoint_set_1_it->second; + auto to_add_id = set_0_found ? id1 : id0; + existing_set->emplace(to_add_id); + id_to_disjoint_root_set[to_add_id] = existing_set; + // add entry into existing set + } else { + // create new set entry + disjoint_root_sets.emplace_back(std::unordered_set()); + auto* new_set = &disjoint_root_sets.back(); + new_set->emplace(id0); + new_set->emplace(id1); + id_to_disjoint_root_set[id0] = new_set; + id_to_disjoint_root_set[id1] = new_set; + } + }; + + auto fusion_vals = fusion->usedMathVals(); + for (auto producer_tv : ir_utils::filterByType(fusion_vals)) { + auto consumer_tvs = ir_utils::consumerTvsOf(producer_tv); + for (auto consumer_tv : consumer_tvs) { + auto pairwise_map = PairwiseRootDomainMap(producer_tv, consumer_tv); + auto c2p_root_map = pairwise_map.mapConsumerToProducer( + consumer_tv->domain(), producer_tv->domain()); + for (auto entry : c2p_root_map) { + auto c_id = entry.first; + auto p_id = entry.second; + map_root_ids(p_id, c_id); + } + } + } + + // Map each set to an input ID (if it exists) that has the smallest ->name() + // entry value + std::unordered_map*, IterDomain*> + set_to_input_id; + + // Loop over the root domains, of the inputs to the fusion. Pick an input ID + // to use as the representative ID of the collected sets. Only consider inputs + // as those are the ones that map to values like "T0.size[1]". They are he + // ID's that propagated their extents into the problem. We could also check + // the outputs as we do have C++ examples of using output dimensions for the + // problem size instead of inputs. However, we don't do anything where we can + // translate to those kinds of kernels integrated into PyTorch. 
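The map_root_ids lambda above unions root IterDomains that a producer-consumer root map proves equal, keeping a list of disjoint sets plus a pointer from each ID into its set. A standalone sketch of just that merge step, using ints as stand-in IDs (illustrative only; the real code keys on IterDomain pointers):

#include <iostream>
#include <list>
#include <unordered_map>
#include <unordered_set>

using Id = int;

std::list<std::unordered_set<Id>> disjoint_sets;
std::unordered_map<Id, std::unordered_set<Id>*> id_to_set;

// Record that two root IDs are known to have the same extent.
void mapIds(Id a, Id b) {
  auto it_a = id_to_set.find(a);
  auto it_b = id_to_set.find(b);
  const bool a_found = it_a != id_to_set.end();
  const bool b_found = it_b != id_to_set.end();

  if (a_found && b_found) {
    auto* set_a = it_a->second;
    auto* set_b = it_b->second;
    if (set_a == set_b) {
      return;  // already in the same set
    }
    // Merge b's set into a's, then drop the now-redundant set.
    for (Id id : *set_b) {
      set_a->insert(id);
      id_to_set[id] = set_a;
    }
    disjoint_sets.remove_if(
        [&](const std::unordered_set<Id>& s) { return &s == set_b; });
  } else if (a_found || b_found) {
    auto* existing = a_found ? it_a->second : it_b->second;
    Id to_add = a_found ? b : a;
    existing->insert(to_add);
    id_to_set[to_add] = existing;
  } else {
    disjoint_sets.emplace_back(std::unordered_set<Id>{a, b});
    auto* new_set = &disjoint_sets.back();
    id_to_set[a] = new_set;
    id_to_set[b] = new_set;
  }
}

int main() {
  mapIds(0, 1);  // {0, 1}
  mapIds(2, 3);  // {0, 1} {2, 3}
  mapIds(1, 2);  // merged into {0, 1, 2, 3}
  std::cout << disjoint_sets.size() << " set(s), first has "
            << disjoint_sets.front().size() << " ids\n";  // 1 set(s), first has 4 ids
}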
+ for (auto input_tv : ir_utils::filterByType(fusion->inputs())) { + for (auto id : + TensorDomain::noReductions(input_tv->getMaybeRFactorDomain())) { + auto id_set_it = id_to_disjoint_root_set.find(id); + if (id_set_it == id_to_disjoint_root_set.end()) { + continue; + } + auto* id_set = id_set_it->second; + if (set_to_input_id.find(id_set) == set_to_input_id.end()) { + set_to_input_id[id_set] = id; + } else { + auto input_id_of_set = set_to_input_id.at(id_set); + // Swap id's if new name is less than previously set + bool swap_ids = id->name() < input_id_of_set->name(); + // If new id is a const scalar but previously was'nt use the const + // scalar + swap_ids = swap_ids || + (id->extent()->isConstScalar() && + !input_id_of_set->extent()->isConstScalar()); + // If previous scalar was const and new isn't, don't swap + swap_ids = swap_ids && + !(input_id_of_set->extent()->isConstScalar() && + !id->extent()->isConstScalar()); + + if (swap_ids) { + set_to_input_id[id_set] = id; + } + } + } + } + + // Finally make map from ID extents to the representitive ID extent. + std::unordered_map extent_to_min_input_id_extent; + for (auto entry : set_to_input_id) { + auto* set = entry.first; + auto input_id = entry.second; + for (auto id : *set) { + extent_to_min_input_id_extent[id->extent()] = input_id->extent(); + } + } + return extent_to_min_input_id_extent; +} + +} // namespace + +void replaceSymbolicSizes(Fusion* fusion) { + FUSER_PERF_SCOPE("GpuLower::Lower::replaceSymbolicSizes"); + std::unordered_map tensor_dim_map; + + // Grab inputs and outputs + std::vector inputs_and_outputs; + for (auto val : fusion->inputs()) { + if (ir_utils::isTV(val)) { + inputs_and_outputs.push_back(val->as()); + } + } + // Symbolic size is necessary for outputs if there are no inputs. + // Otherwise infer output sizes from the inputs via expression evaluation. + if (fusion->inputs().empty()) { + for (auto val : fusion->outputs()) { + if (ir_utils::isTV(val)) { + inputs_and_outputs.push_back(val->as()); + } + } + } + + // Generate map for all tensorview root domain values to map them to symbolic + // values. i.e. T0->getRootDomain()[0] would map to a named scalar + // "T0.size[0]". This map will be used when lowering fusion ir to kernel ir. + for (TensorView* tv : inputs_and_outputs) { + // Replace the domain with one based on Ti.size[j] + const std::vector& root_td = tv->getRootDomain(); + + size_t dim = 0; + for (auto id : root_td) { + Val* orig_size = id->extent(); + + // Output sizes could have reduction axes, which isn't what gets output. + // NOLINTNEXTLINE(bugprone-branch-clone) + if (id->isReduction() || + (id->getIterType() == IterType::BroadcastWithoutStride)) { + continue; + } else if ( + id->isRFactorProduct() || + // NOLINTNEXTLINE(bugprone-branch-clone) + (id->getIterType() == IterType::BroadcastWithStride) || + orig_size->isConstScalar()) { + dim++; + continue; + } + + // Currently turn off this part for inputs of segmented fusion, + // since FusionKernelRuntime will provide these as integer inputs + if (tensor_dim_map.find(orig_size) == tensor_dim_map.end() && + !orig_size->isFusionInput() && !orig_size->isConstScalar()) { + std::stringstream ss; + ss << "T" << tv->name() << ".size[" << dim++ << "]"; + tensor_dim_map[orig_size] = IrBuilder::create( + ss.str(), orig_size->getDataType().value()); + } else { + dim++; + } + } + } + + // Use a minimal number of sizes from provided tensors. 
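replaceSymbolicSizes below gives every remaining symbolic extent a stable name of the form "T<tensor>.size[<dim>]" so the generated kernel can read runtime shapes from its tensor arguments. A standalone sketch of just that naming step (the real pass also skips reduction axes, broadcasts, constants, and fusion inputs):

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Produce the placeholder names used for a tensor's root-domain extents.
std::vector<std::string> symbolicSizeNames(int tensor_name, size_t ndims) {
  std::vector<std::string> names;
  for (size_t dim = 0; dim < ndims; ++dim) {
    std::stringstream ss;
    ss << "T" << tensor_name << ".size[" << dim << "]";
    names.push_back(ss.str());
  }
  return names;
}

int main() {
  for (const auto& name : symbolicSizeNames(0, 3)) {
    std::cout << name << "\n";  // T0.size[0], T0.size[1], T0.size[2]
  }
}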
+ auto extent_simplification_map = getSimplificationMap(fusion); + for (auto extent_entry : extent_simplification_map) { + auto orig_extent = extent_entry.first; + auto simplified_extent = extent_entry.second; + if (tensor_dim_map.count(orig_extent)) { + if (tensor_dim_map.count(simplified_extent)) { + tensor_dim_map[orig_extent] = tensor_dim_map[simplified_extent]; + } else { + tensor_dim_map[orig_extent] = simplified_extent; + } + } + } + + // Run mutation on the fusion with the tensor_dim_map + ir_utils::replaceValue(fusion, tensor_dim_map); +} + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_replace_size.h b/torch/csrc/jit/codegen/cuda/lower_replace_size.h new file mode 100644 index 000000000000..81cee9f6ffe0 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_replace_size.h @@ -0,0 +1,25 @@ +#pragma once + +#include + +#include +#include +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +// TensorViews are all based on symbolic sizes. When we first initialize them +// we don't know if they're inputs or outputs which would mean that they have +// runtime shapes. Intermediate tensors (those not going to global memory) do +// not have this information. Since we need to have the correct information in +// the kernel being fetched for shapes, we want to replace input and output +// tensors to reference the runtime structure containing sizes. +void replaceSymbolicSizes(Fusion*); + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_shift.cpp b/torch/csrc/jit/codegen/cuda/lower_shift.cpp index 8a4f6980e015..913b246e71ac 100644 --- a/torch/csrc/jit/codegen/cuda/lower_shift.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_shift.cpp @@ -5,8 +5,6 @@ #include #include #include -#include -#include #include #include #include @@ -19,19 +17,17 @@ namespace fuser { namespace cuda { void ShiftPredicateInserter::insert( - kir::Expr* expr, + Expr* expr, const std::vector& loops, - kir::Bool* thread_pred, + Bool* thread_pred, bool within_unswitch) { const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - kir::TensorView* out_tv = ir_utils::getTVOutput(expr); - TORCH_INTERNAL_ASSERT(out_tv != nullptr, "Missing kir::TensorView output"); + TensorView* out_tv = ir_utils::getTvOutput(expr); + TORCH_INTERNAL_ASSERT(out_tv != nullptr, "Missing TensorView output"); - TensorView* out_fuser_tv = out_tv->fuserTv(); const bool needs_shift_predicate = - gpu_lower->haloInfo().needsShiftPredicate(out_fuser_tv->definition()); + gpu_lower->haloInfo().needsShiftPredicate(out_tv->definition()); if (!needs_shift_predicate) { return; } @@ -48,12 +44,12 @@ void ShiftPredicateInserter::insert( kir::Predicate* thread_pred_expr = nullptr; if (within_unswitch) { - thread_pred_expr = ir_builder.create(thread_pred); + thread_pred_expr = IrBuilder::create(thread_pred); } kir::Predicate* shift_pred = within_unswitch ? 
thread_pred_expr - : ir_builder.create( + : IrBuilder::create( PredicateType::Shift, expr, thread_pred); // If the expr involves a thread-block barrier, set the predicate of @@ -64,7 +60,7 @@ void ShiftPredicateInserter::insert( return; } - auto shift_ite = ir_builder.create(shift_pred); + auto shift_ite = IrBuilder::create(shift_pred); auto& scope = loops.back()->body(); @@ -83,56 +79,33 @@ void ShiftPredicateInserter::insert( } // Padding by zero - kir::Predicate* padding_pred = ir_builder.create( + kir::Predicate* padding_pred = IrBuilder::create( PredicateType::Padding, expr, thread_pred); - auto bounds_ite = ir_builder.create(padding_pred); + auto bounds_ite = IrBuilder::create(padding_pred); const int pad_value = 0; - auto pad_expr = ir_builder.create( - UnaryOpType::Set, out_tv, ir_builder.create(pad_value)); + auto pad_expr = IrBuilder::create( + UnaryOpType::Set, out_tv, IrBuilder::create(pad_value)); bounds_ite->thenBody().push_back(pad_expr); // Insert the else block shift_ite->elseBody().push_back(bounds_ite); } -AxisHaloInfo::AxisHaloInfo() { - auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - setWidth(0, ir_builder.zeroVal()); - setWidth(1, ir_builder.zeroVal()); -} - -kir::Int* AxisHaloInfo::width() const { - auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); - return ir_builder.addExpr(width(0), width(1))->as(); +int AxisHaloInfo::width() const { + return width(0) + width(1); } -kir::Int* AxisHaloInfo::width(int pos) const { +int AxisHaloInfo::width(int pos) const { TORCH_INTERNAL_ASSERT(pos >= 0 && pos < 2); - TORCH_INTERNAL_ASSERT(widths_[pos] != nullptr); return widths_[pos]; } -void AxisHaloInfo::setWidth(int pos, kir::Int* width) { +void AxisHaloInfo::setWidth(int pos, int width) { TORCH_INTERNAL_ASSERT(pos >= 0 && pos < 2); widths_[pos] = width; } -void AxisHaloInfo::merge(int pos, kir::Int* other) { - auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - auto cur = width(pos); - kir::Int* new_width = nullptr; - if (cur->isConst() && other->isConst()) { - new_width = ir_builder.create( - std::max(cur->value().value(), other->value().value())); - } else if (cur->isZeroInt()) { - new_width = other; - } else if (other->isZeroInt()) { - new_width = cur; - } else { - new_width = ir_builder.maxExpr(width(pos), other)->as(); - } +void AxisHaloInfo::merge(int pos, int other) { + auto new_width = std::max(width(pos), other); setWidth(pos, new_width); } @@ -144,13 +117,12 @@ void AxisHaloInfo::merge(const AxisHaloInfo& other) { bool AxisHaloInfo::hasHalo() const { return std::any_of( - widths_.begin(), widths_.end(), [](auto w) { return !w->isZeroInt(); }); + widths_.begin(), widths_.end(), [](auto w) { return w != 0; }); } std::string AxisHaloInfo::toString() const { std::stringstream ss; - ss << "<" << kir::toString(width(0)) << ", " << kir::toString(width(1)) - << ">"; + ss << "<" << width(0) << ", " << width(1) << ">"; return ss.str(); } @@ -158,38 +130,21 @@ bool HaloInfo::hasRootAxisInfo(IterDomain* id) const { return root_axis_map_.find(id) != root_axis_map_.end(); } -bool HaloInfo::hasRootAxisInfo(kir::IterDomain* id) const { - return kir_root_axis_map_.find(id) != kir_root_axis_map_.end(); -} - const AxisHaloInfo& HaloInfo::getRootAxisInfo(IterDomain* id) const { + // TODO: Enable this check, was failing in many tests + // TORCH_INTERNAL_ASSERT( + // id->definition() == nullptr || id->isRFactorProduct(), + // "Invalid IterDomain: ", + // id); auto 
it = root_axis_map_.find(id); TORCH_INTERNAL_ASSERT( - it != root_axis_map_.end(), "Halo root axis info not found for ", id); - return it->second; -} - -AxisHaloInfo& HaloInfo::getRootAxisInfo(IterDomain* id) { - // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) - return const_cast( - // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) - const_cast(this)->getRootAxisInfo(id)); -} - -const AxisHaloInfo& HaloInfo::getRootAxisInfo(kir::IterDomain* id) const { - TORCH_INTERNAL_ASSERT( - id->definition() == nullptr || id->isRFactorProduct(), - "Invalid IterDomain: ", - id); - auto it = kir_root_axis_map_.find(id); - TORCH_INTERNAL_ASSERT( - it != kir_root_axis_map_.end(), + it != root_axis_map_.end(), "Halo root axis info not found for ", - kir::toString(id)); + id->toString()); return it->second; } -AxisHaloInfo& HaloInfo::getRootAxisInfo(kir::IterDomain* id) { +AxisHaloInfo& HaloInfo::getRootAxisInfo(IterDomain* id) { // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) return const_cast( // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) @@ -200,9 +155,6 @@ void HaloInfo::setRootAxisInfo( IterDomain* id, const AxisHaloInfo& root_axis_info) { root_axis_map_[id] = root_axis_info; - kir_root_axis_map_ - [GpuLower::current()->lowerValue(id)->as()] = - root_axis_info; initializeFromRootAxisInfo(id); return; @@ -283,9 +235,6 @@ void HaloInfo::propagateRootAxisInfo( const auto& c_root = consumer->getRootDomain(); - auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); - for (const auto i : c10::irange(c_root.size())) { auto c_id = c_root[i]; auto it = c2p.find(c_id); @@ -332,31 +281,19 @@ void HaloInfo::propagateRootAxisInfo( p_info.merge(c_info); } else { int pos = (offset > 0) ? 0 : 1; - p_info.merge( - pos, - ir_builder.addExpr(c_info.width(pos), std::abs(offset)) - ->as()); + p_info.merge(pos, c_info.width(pos) + std::abs(offset)); } } else if (auto gather_op = dynamic_cast(expr)) { - const auto window_dim = - gpu_lower->lowerValue(gather_op->windowShape()[i]); - if (window_dim->isOneInt()) { + const auto window_dim = gather_op->windowShape()[i]; + if (window_dim == 1) { p_info.merge(c_info); continue; } - const auto& pad_dim = gather_op->padWidth()[i]; - const auto pad_dim0 = gpu_lower->lowerValue(pad_dim[0])->as(); - p_info.merge( - 0, ir_builder.addExpr(c_info.width(0), pad_dim0)->as()); + const auto pad_dim0 = gather_op->padWidth()[i][0]; + p_info.merge(0, c_info.width(0) + pad_dim0); // The right-side halo is propagated as: // consumer_right_halo + (window_dim - 1 - left_padding) - p_info.merge( - 1, - ir_builder - .subExpr( - ir_builder.addExpr(c_info.width(1), window_dim), - ir_builder.addExpr(pad_dim0, 1)) - ->as()); + p_info.merge(1, c_info.width(1) + window_dim - 1 - pad_dim0); } else { p_info.merge(c_info); } @@ -389,31 +326,28 @@ void HaloInfo::insertToInheritanceMap( void HaloInfo::initializeFromRootAxisInfo(IterDomain* id) { TORCH_INTERNAL_ASSERT(hasRootAxisInfo(id)); - auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - const auto& halo_info = getRootAxisInfo(id); auto halo_width = halo_info.width(); if (!halo_info.hasHalo()) { - halo_width_map_[id] = ir_builder.zeroVal(); + setHaloWidth(id, 0); return; } auto expanded_extent = - ir_builder.addExpr(gpu_lower->lowerValue(id->extent()), halo_width); - kir_extent_map_[gpu_lower->lowerValue(id)->as()] = - expanded_extent; + IrBuilder::addExpr(id->extent(), IrBuilder::create(halo_width)); + extent_map_[id] = expanded_extent; 
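With AxisHaloInfo now carrying plain ints, the shift/gather propagation above reduces to simple integer arithmetic: halo widths merge by taking the per-side maximum, a shift adds |offset| to one side, a gather window adds the left padding to the left side and (window - 1 - left padding) to the right side, and the expanded extent is the original extent plus both widths. A standalone sketch with made-up numbers (the real code merges each contribution via max; with a single consumer, as here, a plain add is equivalent):

#include <algorithm>
#include <iostream>

struct AxisHalo {
  int left = 0;
  int right = 0;

  int width() const { return left + right; }
  void merge(const AxisHalo& other) {
    left = std::max(left, other.left);
    right = std::max(right, other.right);
  }
};

// Producer halo induced by shift(consumer, offset) along this axis.
AxisHalo propagateShift(AxisHalo consumer, int offset) {
  AxisHalo p = consumer;
  if (offset > 0) {
    p.left += offset;
  } else if (offset < 0) {
    p.right += -offset;
  }
  return p;
}

// Producer halo induced by gather with `window` elements and `pad_left` padding.
AxisHalo propagateGather(AxisHalo consumer, int window, int pad_left) {
  if (window == 1) {
    return consumer;
  }
  AxisHalo p;
  p.left = consumer.left + pad_left;
  p.right = consumer.right + window - 1 - pad_left;
  return p;
}

int main() {
  AxisHalo c;  // consumer with no halo
  auto shift_halo = propagateShift(c, -2);      // right halo of 2
  auto gather_halo = propagateGather(c, 3, 1);  // <1, 1> halo for a 3-wide window
  shift_halo.merge(gather_halo);                // per-side max: <1, 2>

  const int extent = 16;
  std::cout << "halo <" << shift_halo.left << ", " << shift_halo.right
            << ">, expanded extent = " << extent + shift_halo.width() << "\n";
  // halo <1, 2>, expanded extent = 19
}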
halo_width_map_[id] = halo_width; inheritance_map_[id] = {id}; } +void HaloInfo::setHaloWidth(IterDomain* id, int halo_width) { + halo_width_map_[id] = halo_width; +} + // Propagate extent information from root axes to descendants void HaloInfo::build(TensorDomain* td) { - auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - auto exprs = DependencyCheck::getAllExprsBetween( {td->getMaybeRFactorDomain().begin(), td->getMaybeRFactorDomain().end()}, {td->domain().begin(), td->domain().end()}); @@ -459,33 +393,29 @@ void HaloInfo::build(TensorDomain* td) { auto in_id = split->in(); - const auto& halo_width_it = halo_width_map_.find(in_id); - // If no halo info is found, nothing needs to be done. This ID // must be an ancestor of a domain set by setRootAxisInfo. - if (halo_width_it == halo_width_map_.end()) { + if (!hasHaloWidth(in_id)) { continue; } - const auto halo_width = halo_width_it->second; + const auto halo_width = getHaloWidth(in_id); - if (halo_width->isZeroInt()) { - halo_width_map_.insert({split->outer(), halo_width}); - halo_width_map_.insert({split->inner(), halo_width}); + if (halo_width == 0) { + setHaloWidth(split->outer(), 0); + setHaloWidth(split->inner(), 0); continue; } // propagate to inner domain auto out_id = split->inner(); - auto expanded_extent = ir_builder.addExpr( - gpu_lower->lowerValue(out_id->extent()), halo_width); - kir_extent_map_.insert( - {gpu_lower->lowerValue(out_id)->as(), - expanded_extent}); + auto expanded_extent = + SimplifyingIrBuilder::addExpr(out_id->extent(), halo_width); + extent_map_.insert({out_id, expanded_extent}); - halo_width_map_.insert({split->outer(), ir_builder.zeroVal()}); - halo_width_map_.insert({split->inner(), halo_width}); + setHaloWidth(split->outer(), 0); + setHaloWidth(split->inner(), halo_width); insertToInheritanceMap(td, in_id, split->inner()); } else if (auto merge = dynamic_cast(expr)) { @@ -495,25 +425,24 @@ void HaloInfo::build(TensorDomain* td) { auto outer_extent = getExtent(merge->outer()); if (inner_extent != nullptr || outer_extent != nullptr) { if (inner_extent == nullptr) { - inner_extent = gpu_lower->lowerValue(merge->inner()->extent()); + inner_extent = merge->inner()->extent(); } else { insertToInheritanceMap(td, merge->inner(), merge->out()); } if (outer_extent == nullptr) { - outer_extent = gpu_lower->lowerValue(merge->outer()->extent()); + outer_extent = merge->outer()->extent(); } else { insertToInheritanceMap(td, merge->outer(), merge->out()); } - auto expanded_extent = ir_builder.mulExpr(outer_extent, inner_extent); - kir_extent_map_.insert( - {gpu_lower->lowerValue(merge->out())->as(), - expanded_extent}); + auto expanded_extent = + SimplifyingIrBuilder::mulExpr(outer_extent, inner_extent); + extent_map_.insert({merge->out(), expanded_extent}); // Splitting the output of this merge is not allowed, so // remember it merged_shifted_ids.insert(merge->out()); // Note that halo_width_map_ is not updated } else { - halo_width_map_.insert({merge->out(), ir_builder.zeroVal()}); + setHaloWidth(merge->out(), 0); } } else { TORCH_INTERNAL_ASSERT(false, "Unsupported expr: ", expr); @@ -540,12 +469,11 @@ void HaloInfo::build(TensorDomain* td) { //! vectorization. Vectorization should be eventually supported but //! needs further work. 
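// Illustrative standalone sketch (not code from this patch) of the Split/Merge
// propagation in build() above: halo moves to the inner output of a split,
// whose extent is widened by the halo width, the outer output gets halo 0, and
// a merge multiplies the (possibly widened) extents. Plain ints stand in for
// the symbolic extents the real pass keeps as Vals.
#include <cassert>

struct SplitHalo {
  int inner_extent;  // split factor widened by the halo
  int outer_halo;    // always 0 after the split
  int inner_halo;    // inherits the input halo
};

SplitHalo propagateThroughSplit(int split_factor, int in_halo) {
  return {split_factor + in_halo, 0, in_halo};
}

int propagateThroughMerge(int outer_extent, int inner_extent) {
  // Note: in the pass above halo_width_map_ is not updated for the merge
  // output; only the expanded extent is recorded.
  return outer_extent * inner_extent;
}

int main() {
  // A root axis with halo 2, split by 32: the inner loop covers 34 elements.
  auto s = propagateThroughSplit(/*split_factor=*/32, /*in_halo=*/2);
  assert(s.inner_extent == 34 && s.outer_halo == 0 && s.inner_halo == 2);
  // Merging that inner axis with a 4-wide axis gives a 4 * 34 = 136 extent.
  assert(propagateThroughMerge(4, s.inner_extent) == 136);
}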
void HaloInfo::validate(TensorView* tv) const { - const auto& par_map = GpuLower::current()->caParallelMap(); - const auto& loop_map = GpuLower::current()->caLoopMap(); const auto mem_type = tv->getMemoryType(); for (auto axis : tv->domain()->domain()) { - auto concrete_id = par_map.getConcreteMappedID(axis); + auto concrete_id = GpuLower::current()->caMap()->getConcreteMappedID( + axis, IdMappingMode::LOOP); // The extent is assumed to be the same TORCH_INTERNAL_ASSERT( @@ -579,7 +507,7 @@ void HaloInfo::validate(TensorView* tv) const { bool shared_mem_needed = false; for (auto use : tv->uses()) { - if (!ir_utils::isTVOp(use)) { + if (!ir_utils::isTvOp(use)) { continue; } if (use->isA() || use->isA()) { @@ -592,7 +520,8 @@ void HaloInfo::validate(TensorView* tv) const { consumer->domain()->domain().begin(), consumer->domain()->domain().end(), [&](IterDomain* consumer_axis) { - return loop_map.areMapped(axis, consumer_axis); + return GpuLower::current()->caMap()->areMapped( + axis, consumer_axis, IdMappingMode::PERMISSIVE); }); if (it == consumer->domain()->domain().end()) { continue; @@ -629,21 +558,16 @@ void HaloInfo::validate(TensorView* tv) const { return; } -kir::Val* HaloInfo::getExtent(IterDomain* id) const { - auto kir_id = GpuLower::current()->lowerValue(id)->as(); - return getExtent(kir_id); -} - -kir::Val* HaloInfo::getExtent(kir::IterDomain* id) const { - auto it = kir_extent_map_.find(id); - if (it != kir_extent_map_.end()) { +Val* HaloInfo::getExtent(IterDomain* id) const { + auto it = extent_map_.find(id); + if (it != extent_map_.end()) { return it->second; } else { return nullptr; } } -kir::Int* HaloInfo::getHaloWidth(IterDomain* id) const { +int HaloInfo::getHaloWidth(IterDomain* id) const { auto it = halo_width_map_.find(id); TORCH_INTERNAL_ASSERT(it != halo_width_map_.end()); return it->second; @@ -699,7 +623,8 @@ bool extentCompare( Cmp cmp) { auto gpu_lower = GpuLower::current(); TORCH_INTERNAL_ASSERT( - gpu_lower->caLoopMap().areMapped(id1, id2), "Invalid axes to compare"); + gpu_lower->caMap()->areMapped(id1, id2, IdMappingMode::PERMISSIVE), + "Invalid axes to compare"); // It's invalid to compare two axes and when only either of them has // halo. @@ -736,63 +661,11 @@ bool extentCompare( } // namespace bool HaloInfo::extentLessEqual(IterDomain* id1, IterDomain* id2) const { - auto cmp = [](kir::Int* x, kir::Int* y) { - if (x == y) { - return true; - } - auto xv = x->value(); - auto yv = y->value(); - return xv.has_value() && yv.has_value() && xv.value() <= yv.value(); - }; - return extentCompare(*this, id1, id2, cmp); + return extentCompare(*this, id1, id2, std::less_equal<>()); } bool HaloInfo::extentEqual(IterDomain* id1, IterDomain* id2) const { - // Returns true only when x and y are proven to be the same. The - // analysis is not comprehensive and can prove in rather trivial - // cases only. Specifically: - // - x and y are the same pointers - // - Both have static values and they are the same - // - Both are defined by the same expression and the inputs are - // proven to be equal - std::function cmp = [&](kir::Int* x, - kir::Int* y) { - if (x == y) { - return true; - } - - auto xv = x->value(); - auto yv = y->value(); - if (xv.has_value() && yv.has_value() && xv.value() == yv.value()) { - return true; - } - - // Check if both are defined by an expression of the same type. If - // so, recursively check the input operands. 
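// Illustrative aside (not code from this patch): with halo-extended extents
// compared as plain ints, the hand-written kir::Int comparison lambdas being
// removed here collapse to the standard transparent functors passed to
// extentCompare by extentLessEqual / extentEqual.
#include <cassert>
#include <functional>

template <typename Cmp>
bool compareWidths(int a, int b, Cmp cmp) {
  return cmp(a, b);
}

int main() {
  assert(compareWidths(3, 5, std::less_equal<>()));
  assert(!compareWidths(3, 5, std::equal_to<>()));
  assert(compareWidths(5, 5, std::equal_to<>()));
}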
- auto x_def = x->definition(); - auto y_def = y->definition(); - if (x_def && y_def && - ((x_def->isA() && y_def->isA() && - x_def->as()->operation() == - y_def->as()->operation()) || - (x_def->isA() && y_def->isA() && - x_def->as()->operation() == - y_def->as()->operation()))) { - for (const auto i : c10::irange(x_def->inputs().size())) { - auto x_input = dynamic_cast(x_def->inputs()[i]); - auto y_input = dynamic_cast(y_def->inputs()[i]); - // Both must be kir::Int - TORCH_INTERNAL_ASSERT(x_input && y_input); - if (!cmp(x_input, y_input)) { - return false; - } - } - return true; - } - - return false; - }; - return extentCompare(*this, id1, id2, cmp); + return extentCompare(*this, id1, id2, std::equal_to<>()); } std::string HaloInfo::toString() const { @@ -822,16 +695,19 @@ std::string HaloInfo::toString() const { } bool HaloInfo::needsShiftPredicate(Expr* expr) const { - auto consumer_td = ir_utils::getTVOutput(expr)->domain(); - auto shift_expr = dynamic_cast(expr); - auto gather_expr = dynamic_cast(expr); + // In lowering shift and gather turn into a unary op. We really need the shift + // expr. Do a round about trick to grab it: + auto tv_out = ir_utils::getTvOutput(expr); + auto consumer_td = tv_out->domain(); + auto shift_expr = dynamic_cast(tv_out->definition()); + auto gather_expr = dynamic_cast(tv_out->definition()); for (const auto i : c10::irange(consumer_td->getRootDomain().size())) { auto consumer_id = consumer_td->getRootDomain()[i]; const auto consumer_halo_info = getRootAxisInfo(consumer_id); if (consumer_halo_info.hasHalo() || (shift_expr != nullptr && shift_expr->offset(i) != 0 && !consumer_id->isBroadcast()) || - (gather_expr != nullptr && !gather_expr->windowShape()[i]->isOneInt() && + (gather_expr != nullptr && gather_expr->windowShape()[i] != 1 && !consumer_id->isBroadcast())) { return true; } @@ -839,13 +715,6 @@ bool HaloInfo::needsShiftPredicate(Expr* expr) const { return false; } -bool HaloInfo::needsShiftPredicate(kir::Expr* expr) const { - const auto out_tv = expr->outputs()[0]->as(); - auto fuser_expr = out_tv->fuserTv()->definition(); - TORCH_INTERNAL_ASSERT(fuser_expr != nullptr); - return needsShiftPredicate(fuser_expr); -} - } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/lower_shift.h b/torch/csrc/jit/codegen/cuda/lower_shift.h index 378709ca4430..c0fea8c1eadd 100644 --- a/torch/csrc/jit/codegen/cuda/lower_shift.h +++ b/torch/csrc/jit/codegen/cuda/lower_shift.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -16,16 +16,14 @@ namespace cuda { //! Auxiliary class to represent information about halo of an axis class AxisHaloInfo { public: - AxisHaloInfo(); - //! Width of halo. //! //! pos is either 0 or 1. The width of halo at offset zero is set //! when pos is 0. - kir::Int* width(int pos) const; + int width(int pos) const; //! Sum of the widths of both widths - kir::Int* width() const; + int width() const; const auto& widths() const { return widths_; @@ -34,10 +32,10 @@ class AxisHaloInfo { //! Set the halo width of either side. //! pos is either 0 or 1. The width of halo at offset zero is set //! when pos is 0. - void setWidth(int pos, kir::Int* width); + void setWidth(int pos, int width); //! Extend the halo width to account for another axis. - void merge(int pos, kir::Int* other); + void merge(int pos, int other); //! Extend the halo width to account for another axis. void merge(const AxisHaloInfo& other); @@ -53,7 +51,7 @@ class AxisHaloInfo { //! 
widths_[0] is non-zero and designates the size of the //! halo. Similarly, non-zero widths_[1] means the axis has halo at //! the other end of the axis. - std::array widths_ = {nullptr, nullptr}; + std::array widths_ = {0, 0}; }; //! Helper class for lowering tensors with halo. Only valid at the @@ -77,7 +75,6 @@ class TORCH_CUDA_CU_API HaloInfo { //! Returns true if id has the root halo information set by //! setRootAxisInfo. bool hasRootAxisInfo(IterDomain* id) const; - bool hasRootAxisInfo(kir::IterDomain* id) const; //! Returns the registed AxisHaloInfo of a root axis. //! @@ -85,9 +82,6 @@ class TORCH_CUDA_CU_API HaloInfo { //! non-root axes. const AxisHaloInfo& getRootAxisInfo(IterDomain* id) const; AxisHaloInfo& getRootAxisInfo(IterDomain* id); - //! KIR version - const AxisHaloInfo& getRootAxisInfo(kir::IterDomain* id) const; - AxisHaloInfo& getRootAxisInfo(kir::IterDomain* id); //! Query if an axis has a halo width. //! @@ -98,12 +92,11 @@ class TORCH_CUDA_CU_API HaloInfo { //! //! It's an error if queried for an axis with no halo width //! information. - kir::Int* getHaloWidth(IterDomain* id) const; + int getHaloWidth(IterDomain* id) const; //! Returns an extent if id is extended for halo. Nullptr is //! returned otherwise. - kir::Val* getExtent(IterDomain* id) const; - kir::Val* getExtent(kir::IterDomain* id) const; + Val* getExtent(IterDomain* id) const; //! Returns all child domains of a root domain that inherits the //! halo of the root domain. @@ -135,7 +128,6 @@ class TORCH_CUDA_CU_API HaloInfo { //! interior and another for padding. Predicate insertion is done in //! the ShiftPredicateInserter class below. bool needsShiftPredicate(Expr* expr) const; - bool needsShiftPredicate(kir::Expr* expr) const; std::string toString() const; @@ -166,14 +158,14 @@ class TORCH_CUDA_CU_API HaloInfo { //! Validate shift usage void validate(TensorView* td) const; + void setHaloWidth(IterDomain* id, int halo_width); + private: //! Halo information of root axes std::unordered_map root_axis_map_; - //! KIR version - std::unordered_map kir_root_axis_map_; //! Halo-extended extents. No mapping for axes without halo extension - std::unordered_map kir_extent_map_; + std::unordered_map extent_map_; //! The halo width of an axis. //! @@ -209,7 +201,7 @@ class TORCH_CUDA_CU_API HaloInfo { //! inner axis is merged with another axis of extent M, we know that //! the extent of the resulting output axis is 5*M, but we don't //! create its mapping. - std::unordered_map halo_width_map_; + std::unordered_map halo_width_map_; //! Mappings from root domains to child domains that inherit halo std::unordered_map> @@ -224,9 +216,9 @@ class ShiftPredicateInserter { //! the usual predicated expression, so the insertion is also done //! here. static void insert( - kir::Expr* expr, + Expr* expr, const std::vector& loops, - kir::Bool* thread_pred, + Bool* thread_pred, bool within_unswitch); }; diff --git a/torch/csrc/jit/codegen/cuda/lower_sync_information.cpp b/torch/csrc/jit/codegen/cuda/lower_sync_information.cpp new file mode 100644 index 000000000000..5f3eebceb303 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_sync_information.cpp @@ -0,0 +1,483 @@ + +#include +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +namespace { + +// Validate parallelization of a single tensor +void validateParallelizationOfTensor(TensorView* tv) { + // Each ParallelType can be used only once. 
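// Illustrative standalone sketch (not code from this patch) of the two checks
// performed by validateParallelizationOfTensor() in this new file: (1) each
// thread/block parallel type may appear on at most one axis of a tensor, and
// (2) a type the tensor is predicated on must not also parallelize one of its
// axes. A std::bitset and a plain enum stand in for ParallelTypeBitmap.
#include <bitset>
#include <cassert>
#include <vector>

enum ParallelType { TIDx, TIDy, TIDz, BIDx, BIDy, BIDz, kNumTypes };

bool parallelizationIsValid(const std::vector<ParallelType>& axis_types,
                            const std::bitset<kNumTypes>& limited_types) {
  std::bitset<kNumTypes> used;
  for (auto pt : axis_types) {
    if (used.test(pt)) {
      return false;  // same parallel type bound to two axes of one tensor
    }
    used.set(pt);
  }
  // Types the tensor is predicated on must not be used to parallelize it.
  return (used & limited_types).none();
}

int main() {
  std::bitset<kNumTypes> pred;  // no thread predication
  assert(parallelizationIsValid({TIDx, BIDx}, pred));
  assert(!parallelizationIsValid({TIDx, TIDx}, pred));  // TIDx used twice
  pred.set(TIDx);
  assert(!parallelizationIsValid({TIDx}, pred));  // parallelized on predicated type
}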
+ ParallelTypeBitmap pt_map; + for (size_t i = 0; i < tv->nDims(); ++i) { + auto axis = tv->axis(i); + auto ptype = axis->getParallelType(); + if (!isParallelTypeThread(ptype)) { + continue; + } + + // It doesn't matter if this axis is a non-concretized broadcast + // TODO: merging broadcast and non-broadcast + if (axis->isBroadcast() && + !GpuLower::current()->concretizedBroadcastDomains().isConcretized( + axis)) { + continue; + } + + TORCH_INTERNAL_ASSERT( + !pt_map.get(ptype), + "Multiple use of ", + ptype, + " in tensor t", + tv->name(), + ": ", + tv); + pt_map.set(ptype); + } + + // If this tensor is predicated by a paralel type, it should not be + // used to parallelize any domain of this tensor + + const auto thread_pred = + GpuLower::current()->threadPredMap().getPredicateInfo(tv); + + auto predicated_parallel_types = pt_map & thread_pred.limited_types; + + TORCH_INTERNAL_ASSERT( + predicated_parallel_types.none(), + "Invalid parallelization of tensor t", + tv->name(), + ". The tensor is parallelized with ", + predicated_parallel_types.toString(), + ", but it's invalid to use the types as the tensor is also predicated with them.", + ", thread pred: ", + thread_pred.limited_types.toString()); +} + +//! Return true if axis is derived from a root axis that is an input +//! to a CA leaf axis. +bool derivedFromRootCAAxes(TensorView* tv, IterDomain* axis) { + std::vector ca_axes( + tv->domain()->domain().begin(), + tv->domain()->domain().begin() + tv->getComputeAtPosition()); + + auto ca_root_vals = IterVisitor::getInputsTo( + std::vector(ca_axes.begin(), ca_axes.end())); + + auto root_vals = IterVisitor::getInputsTo({axis}); + + return std::any_of( + root_vals.begin(), root_vals.end(), [&ca_root_vals](auto root) { + return std::find(ca_root_vals.begin(), ca_root_vals.end(), root) != + ca_root_vals.end(); + }); +} + +} // namespace + +void SyncMap::build(Fusion* fusion) { + FUSER_PERF_SCOPE("GpuLower::Lower::validateParallelize"); + FusionGuard fg(fusion); + + const auto& ca_map = GpuLower::current()->caMap(); + const auto& pred_map = GpuLower::current()->threadPredMap(); + + auto exprs = StmtSort::getExprs(fusion); + + // Run through expressions and check for communication across threads/blocks + // occuring from producer to consumer of the expression + for (auto expr : exprs) { + if (!ir_utils::isTvOp(expr)) { + continue; + } + + // Validate parallelization of each consumer by itself + for (auto consumer : ir_utils::filterByType(expr->outputs())) { + validateParallelizationOfTensor(consumer); + } + + // It's probably enough to just check all producers to one consumer as + // multi-consumers are guaranteed to be transformed/parallelized the same, + // but to be conservative for now checking every producer <-> consumer + // relationship. + for (auto producer : ir_utils::filterByType(expr->inputs())) { + // Parallelization on input tensors have no effect. + if (producer->isFusionInput()) { + continue; + } + + ParallelTypeBitmap raw_dims; + + const auto parallel_bcast_doms = + pred_map.getParallelBroadcastDomains(producer); + + // Stash information about parallelized producer iteration domains + std::vector producer_parallel_ids( + ParallelTypeBitmap::kNumParallelTypes, nullptr); + ParallelTypeBitmap producer_parallel_bitmap; + + // Tracking for quick check later + std::unordered_set producer_within_compute_at; + + // Get the parallel types that producer will be predicated off in producer + // writes. 
+ // In this case we need a sync whether the producer-consumer axes are + // mapped or not since the predicate pass will generate pattern like + // below to eliminate redundant writes: if(threadIdx.x == 0) + // shared[threadIdx.x + i] = ... + // We will need a raw sync after this pattern for correctness. + auto producer_redundant_types = GpuLower::current() + ->threadPredMap() + .getPredicateInfo(producer) + .redundant_types; + + for (const auto producer_i : c10::irange(producer->nDims())) { + auto producer_axis = producer->axis(producer_i); + auto producer_ptype = + ca_map->getConcreteMappedID(producer_axis, IdMappingMode::LOOP) + ->getParallelType(); + + if (!isParallelTypeThread(producer_ptype)) { + continue; + } + + // Producer reductions shouldn't map to consumers + if (producer_axis->isReduction()) { + continue; + } + + if (producer_i < producer->getComputeAtPosition()) { + producer_within_compute_at.emplace(producer_axis); + } + + producer_parallel_bitmap.set(producer_ptype); + producer_parallel_ids[getParallelTypeBitMapOffset(producer_ptype)] = + producer_axis; + } + + for (auto consumer : + ir_utils::filterByType(expr->outputs())) { + // Stash information about parallelized consumer iteration domains + std::vector consumer_parallel_ids( + ParallelTypeBitmap::kNumParallelTypes, nullptr); + ParallelTypeBitmap consumer_parallel_bitmap; + + for (const auto consumer_i : c10::irange(consumer->nDims())) { + auto consumer_axis = consumer->axis(consumer_i); + auto consumer_ptype = + ca_map->getConcreteMappedID(consumer_axis, IdMappingMode::LOOP) + ->getParallelType(); + + if (!isParallelTypeThread(consumer_ptype)) { + continue; + } + + // When the consumer axis is a broadcast, it is not really + // parallelized unless thread-predicated and eventually concretized + if (consumer_axis->isBroadcast() && + (!parallel_bcast_doms.get(consumer_ptype) || + !GpuLower::current() + ->concretizedBroadcastDomains() + .isConcretized(consumer_axis))) { + continue; + } + + consumer_parallel_bitmap.set(consumer_ptype); + consumer_parallel_ids[getParallelTypeBitMapOffset(consumer_ptype)] = + consumer_axis; + } + + // At this point each parallel type that's present in the consumer or + // the producer will be present in their corresponding `_parallel_ids` + // map going from parallel index type (only size 6 for grid/block dims) + // to the iteration domain of that parallel type. + for (auto parallel_type : kParallelTypeThreads) { + // TIDx is reserved for lane_id in the case of mma ops. + // It is swizzled and handled separately in validateMma. + if (parallel_type == ParallelType::TIDx && expr->isA()) { + continue; + } + + auto parallel_type_i = getParallelTypeBitMapOffset(parallel_type); + + auto p_id = producer_parallel_ids[parallel_type_i]; + auto c_id = consumer_parallel_ids[parallel_type_i]; + + // If consumer is parallelized with this type but producer is + // predicated redundant on this type. This parallel dimension + // is a RAW dimension. See test: FusionSeriaSmemWriteParallelRead1/2 + // + // Even if consumer is not parallelized with this type, would still + // need a raw sync unless all use chain of the producer end with an + // output with the same redundant type. + // TODO: need a separate pass to detect the case where no raw sync + // is needed in this case, i.e. all use-def chains are redundant. 
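// Illustrative standalone sketch (not code from this patch) of the rule the
// comment above describes: when the producer's write is predicated (redundant)
// on a parallel type -- e.g. the "if (threadIdx.x == 0) shared[...] = ..."
// pattern -- the consumer cannot rely on every thread having written its
// element, so a RAW sync is recorded for that type regardless of how the axes
// map. The enum and bitmap are simplified stand-ins.
#include <bitset>
#include <cassert>

enum ParallelType { TIDx, TIDy, TIDz, BIDx, BIDy, BIDz, kNumParallelTypes };
using Bitmap = std::bitset<kNumParallelTypes>;

Bitmap collectRawDims(const Bitmap& producer_redundant_types,
                      const Bitmap& unmatched_parallel_types) {
  Bitmap raw_dims;
  raw_dims |= producer_redundant_types;  // redundant writes always force a sync
  raw_dims |= unmatched_parallel_types;  // so do producer/consumer mismatches
  return raw_dims;
}

int main() {
  Bitmap redundant, unmatched;
  redundant.set(TIDx);  // producer write guarded by threadIdx.x == 0
  Bitmap raw = collectRawDims(redundant, unmatched);
  assert(raw.test(TIDx) && raw.count() == 1);
}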
+ if (producer_redundant_types.get(parallel_type)) { + raw_dims.set(parallel_type); + continue; + } + + if (p_id == nullptr && c_id == nullptr) { + continue; + } else if (p_id != nullptr && c_id != nullptr) { + if (GpuLower::current()->caMap()->areMapped( + p_id, c_id, IdMappingMode::PERMISSIVE)) { + const auto halo_info = GpuLower::current()->haloInfo(); + + if (halo_info.hasHaloWidth(p_id) != + halo_info.hasHaloWidth(c_id) || + (halo_info.hasHaloWidth(p_id) && + halo_info.hasHaloWidth(c_id) && + halo_info.getHaloWidth(p_id) != + halo_info.getHaloWidth(c_id))) { + raw_dims.set(parallel_type); + continue; + } + } + } else { + if (p_id != nullptr) { + auto it = std::find_if( + consumer->domain()->domain().begin(), + consumer->domain()->domain().end(), + [&](IterDomain* c_id) { + return GpuLower::current()->caMap()->areMapped( + p_id, c_id, IdMappingMode::PERMISSIVE); + }); + + // If there isn't a mapping from producer to a consumer domain, + // need to assume there's communication across this parallel + // dimension. + c_id = it == consumer->domain()->domain().end() ? nullptr : *it; + // i.e. if producer is parallelized across threadIdx.x in a + // certain split, if the consumer doesn't map to this split, + // then we need to assume it has to be in smem with proper + // syncs. + } else { + auto it = std::find_if( + producer->domain()->domain().begin(), + producer->domain()->domain().end(), + [&](IterDomain* p_id) { + return GpuLower::current()->caMap()->areMapped( + p_id, c_id, IdMappingMode::PERMISSIVE); + }); + if (it == producer->domain()->domain().end()) { + // Can't infer anything if producer doesn't have a matching axis + // to parallel consumer dim. + continue; + } + p_id = *it; + } + } + + // Comm pattern options (when parallel types don't have matching + // axes) and required memory, Chart is producer parallel type, + // consumer parallel type Parallel types are Serial(S), + // threadIdx(T), blockIdx(B), Memory required for the producer is + // Local(L), Shared(S), Global(G), Sync is None (N/A), blockSync(B), + // grid_sync(G) + // + // P C Mem Req Sync Type + // S S L N/A + // S T L N/A + // S B L N/A + // T S S B + // T T S B + // T B S B + // B S G G + // B T G G + // B B G G + + auto producer_ptype = + ca_map->getConcreteMappedID(p_id, IdMappingMode::LOOP) + ->getParallelType(); + auto consumer_ptype = c_id == nullptr + ? ParallelType::Serial + : ca_map->getConcreteMappedID(c_id, IdMappingMode::LOOP) + ->getParallelType(); + + if (!p_id->isBroadcast() && isParallelTypeThread(producer_ptype) && + !(isParallelTypeThread(consumer_ptype) && + parallel_bcast_doms.get(consumer_ptype)) && + // Being in compute at means consumer and producer rely on the + // same loop size + !producer_within_compute_at.count(p_id) && + // For usage of derivedFromRootCAAxes check + // NVFuserTest.FusionAdvancedIndexing1_CUDA + (c_id == nullptr || !derivedFromRootCAAxes(producer, p_id))) { + // There must be a consumer axis that uses the same indexing + // with the same parallel type as the producer axis. The index + // map is used to to find such an axis. In addition, even when + // no mapped axis is found in the index map, but when an mapped + // axis exists in the loop map, the producer and consumer axes + // may still use the same indexing. That only happens when the + // producer is derived from a root axis that is an input to any + // leaf CA axes. 
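// Illustrative encoding (not code from this patch) of the producer/consumer
// chart in the comment above, so the memory and sync requirements are
// explicit: in that table the requirement is determined solely by how the
// producer axis is parallelized; the consumer column does not change the row.
#include <cassert>

enum class Par { Serial, Thread, Block };   // S, T, B in the chart
enum class Mem { Local, Shared, Global };   // memory required for the producer
enum class Sync { None, BlockSync, GridSync };

struct Requirement { Mem mem; Sync sync; };

Requirement required(Par producer) {
  switch (producer) {
    case Par::Serial: return {Mem::Local, Sync::None};
    case Par::Thread: return {Mem::Shared, Sync::BlockSync};
    case Par::Block:  return {Mem::Global, Sync::GridSync};
  }
  return {Mem::Local, Sync::None};
}

int main() {
  assert(required(Par::Thread).sync == Sync::BlockSync);
  assert(required(Par::Block).mem == Mem::Global);
  assert(required(Par::Serial).sync == Sync::None);
}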
In such a case, the axis in the reference + // tensor that maps to the producer axis is created based on the + // consumer, so both the producer and consumer axes should have + // the same indexing. See issue #995 as well as the + // FusionValidateParallelize6 test for a concrete example. + auto it = std::find_if( + consumer->domain()->domain().begin(), + consumer->domain()->domain().end(), + [&](IterDomain* c_id_) { + return ca_map->areMapped(p_id, c_id_, IdMappingMode::EXACT); + }); + if (it == consumer->domain()->domain().end()) { + if (isParallelTypeThread(producer_ptype)) { + raw_dims.set(producer_ptype); + } + if (isParallelTypeThread(consumer_ptype)) { + raw_dims.set(consumer_ptype); + } + } + } + + // In shift or gather operations, if a thread or block + // domain's root ID is shifted or gathered, it can overlap + // in shared or global memory. This doesn't + // require a RAW sync since each thread would still write every value + // it would read, but it can require a WAR sync for Shared Memory. + // Since there isn't a separate structure for WAR than RAW for now + // we'll flag it on RAW which will trigger the WAR. + // See test FusionValidateParallelizeShift_CUDA for a + // concrete example where this sync is required. + if ((expr->getExprType() == ExprType::GatherOp || + expr->getExprType() == ExprType::ShiftOp) && + producer->getMemoryType() == MemoryType::Shared && + isParallelTypeThreadDim(producer_ptype)) { + std::unordered_set shifted_rfactor_ids; + if (expr->getExprType() == ExprType::GatherOp) { + auto gather_op = expr->as(); + for (auto root_i : + c10::irange(producer->getMaybeRFactorDomain().size())) { + auto rfactor_id = producer->getMaybeRFactorDomain()[root_i]; + // If the window shape is 1, it just copies the + // producer to the consumer + if (gather_op->windowShape()[root_i] != 1) { + shifted_rfactor_ids.insert(rfactor_id); + } + } + } else if (expr->getExprType() == ExprType::ShiftOp) { + auto shift_op = expr->as(); + for (auto root_i : + c10::irange(producer->getMaybeRFactorDomain().size())) { + auto rfactor_id = producer->getMaybeRFactorDomain()[root_i]; + // If the shift offset is 0, it doesn't actually shift + if (shift_op->offsets()[root_i] != 0) { + shifted_rfactor_ids.insert(rfactor_id); + } + } + } + + // Grab all values between shifted rfactor domains and p_id so we + // can identify which rfactor domains are inputs to the p_id + auto p_id_dep_vals = + DependencyCheck::getAllValsBetween(shifted_rfactor_ids, {p_id}); + // If this shifted rfactor domain is an input to p_id, we + // must have a WAR sync. Mark raw sync so it will be generated. + if (!p_id_dep_vals.empty()) { + raw_dims.set(producer_ptype); + } + } + + // If same parallel type and mapped, no need for syncs unless + // producer is in smem, producer parallel type is a thread + // dimension, and consumer concretizes the dimension. This sync is + // due to the redundant predicate omission in lower thread + // predicate. 
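// Illustrative standalone sketch (not code from this patch) of the shift/
// gather check described just below: if any shifted or gathered rfactor
// domain of a shared-memory producer feeds the thread-parallel axis p_id,
// neighbouring threads may touch overlapping smem locations, so a sync is
// flagged on the producer's parallel type. String ids and a precomputed
// dependency set stand in for the IR dependency check.
#include <cassert>
#include <set>
#include <string>

bool shiftedInputFeedsParallelAxis(
    const std::set<std::string>& shifted_rfactor_ids,
    const std::set<std::string>& roots_feeding_p_id) {
  for (const auto& id : roots_feeding_p_id) {
    if (shifted_rfactor_ids.count(id) != 0) {
      return true;  // overlap possible -> flag the sync for this producer axis
    }
  }
  return false;
}

int main() {
  std::set<std::string> shifted = {"I0"};           // root axis moved by ShiftOp
  std::set<std::string> feeds_tidx = {"I0", "I1"};  // p_id derives from I0, I1
  assert(shiftedInputFeedsParallelAxis(shifted, feeds_tidx));
  assert(!shiftedInputFeedsParallelAxis({"I2"}, feeds_tidx));
}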
+ auto redundant_preds = GpuLower::current() + ->threadPredMap() + .getPredicateInfo(producer) + .redundant_types; + + if (p_id->isBroadcast() && + GpuLower::current()->concretizedBroadcastDomains().isConcretized( + p_id) && + producer->getMemoryType() == MemoryType::Shared && + redundant_preds.hasTID()) { + redundant_preds.clearAllBID(); + raw_dims |= redundant_preds; + continue; + } + + // When the producer axis is a broadcast, it is not really + // parallelized unless thread-predicated and concretized + if (isParallelTypeThread(producer_ptype) && p_id->isBroadcast() && + (!parallel_bcast_doms.get(producer_ptype) || + !GpuLower::current() + ->concretizedBroadcastDomains() + .isConcretized(p_id))) { + continue; + } + + // If matching dims and matching parallel types, no comm is necessary. + if (producer_ptype == consumer_ptype && + GpuLower::current()->caMap()->areMapped( + p_id, c_id, IdMappingMode::PERMISSIVE)) { + continue; + } + + // Set parallel dimensions that communication is occuring over. + if (isParallelTypeThread(producer_ptype)) { + raw_dims.set(producer_ptype); + } + } // end for ptypes + + if (raw_dims.hasBID()) { + TORCH_INTERNAL_ASSERT( + producer->getMemoryType() == MemoryType::Global, + "Inconsistent parallelization found between TV", + producer->name(), + " (", + producer->toString(), + ") and TV", + consumer->name(), + "(", + consumer->toString(), + "). Producer is required to be in Global Memory based on parallelization strategy."); + } else if (raw_dims.hasTID()) { + TORCH_INTERNAL_ASSERT( + producer->getMemoryType() == MemoryType::Global || + producer->getMemoryType() == MemoryType::Shared, + "Inconsistent parallelization found between TV", + producer->name(), + " (", + producer->toString(), + ") and TV", + consumer->name(), + "(", + consumer->toString(), + "). Producer is required to be in Global or Shared Memory based on parallelization strategy."); + } + + } // end for consumers + + if (raw_dims.any()) { + needs_raw_sync_[producer] = raw_dims; + } + + } // end producer + } +} + +std::string SyncMap::toString() const { + std::stringstream ss; + ss << "TVs requiring RAW:" << std::endl; + for (auto entry : needs_raw_sync_) { + ss << " " << entry.first->toString() << " :: " << entry.second.toString() + << std::endl; + } + return ss.str(); +} + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_sync_information.h b/torch/csrc/jit/codegen/cuda/lower_sync_information.h new file mode 100644 index 000000000000..09fcf9eabd7f --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_sync_information.h @@ -0,0 +1,45 @@ +#pragma once + +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +class SyncMap { + public: + std::string toString() const; + + //! Validates all tensors are consistently parallelized. Basically, + //! when a producer axis is threaded, either with threadIdx or + //! blockIdx, there must be a mapped consumer axis with the + //! same ParallelType with some exceptions. + //! + //! This function assumes Loop and Parallel ComputeAtMaps are already + //! built as they are used to validate consistency. + //! + //! Fills needs_raw_sync with output TVs if they need a raw sync if on smem or + //! gmem. The second entry in this map is the parallel dimensions being + //! communicated across. 
+ void build(Fusion* fusion); + + ParallelTypeBitmap needsRawSync(TensorView* tv) const { + auto it = needs_raw_sync_.find(tv); + if (it != needs_raw_sync_.end()) { + return it->second; + } + return ParallelTypeBitmap(); + } + + private: + std::unordered_map needs_raw_sync_; +}; + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_thread_predicate.cpp b/torch/csrc/jit/codegen/cuda/lower_thread_predicate.cpp index a7f8768883d0..3769c9c9d974 100644 --- a/torch/csrc/jit/codegen/cuda/lower_thread_predicate.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_thread_predicate.cpp @@ -4,7 +4,6 @@ #include #include #include -#include #include #include @@ -17,55 +16,49 @@ namespace cuda { namespace { -kir::Bool* getPredicatePerParallelType( +Bool* getPredicatePerParallelType( ParallelType pt, const ThreadPredicateMap::PredicateInfo& pred_info) { - kir::SimplifyingIrBuilder ir_builder(GpuLower::current()->kernel()); - auto pt_dim = GpuLower::current()->parallelDimensionMap().get(pt); // If pt is not used or is proven to be one, no need to predicate. if (pt_dim == nullptr || pt_dim->isOneInt()) { - return ir_builder.trueVal(); + return GpuLower::current()->kernel()->trueVal(); } - // When BID needs to be predicated, that means it's an output of a grid // reduction and only the last block index in that dimension has the right // value from the grid reduce. if (isParallelTypeBlockDim(pt) && pred_info.limited_types.get(pt)) { - return ir_builder - .eqExpr( - kir::NamedScalar::getParallelIndex(pt), - ir_builder.subExpr( - kir::NamedScalar::getParallelDim(pt), ir_builder.oneVal())) - ->as(); + return SimplifyingIrBuilder::eqExpr( + NamedScalar::getParallelIndex(pt), + SimplifyingIrBuilder::subExpr( + NamedScalar::getParallelDim(pt), + GpuLower::current()->kernel()->oneVal())) + ->as(); } // Otherwise, only thread of index 0 executes the computation - return ir_builder - .eqExpr(kir::NamedScalar::getParallelIndex(pt), ir_builder.zeroVal()) - ->as(); + return SimplifyingIrBuilder::eqExpr( + NamedScalar::getParallelIndex(pt), + GpuLower::current()->kernel()->zeroVal()) + ->as(); } } // namespace -kir::Bool* ThreadPredicateMap::getPredicateFromPredicateInfo( +Bool* ThreadPredicateMap::getPredicateFromPredicateInfo( const ThreadPredicateMap::PredicateInfo& pred_info) { - kir::SimplifyingIrBuilder ir_builder(GpuLower::current()->kernel()); - const auto pred_types = pred_info.limited_types | pred_info.redundant_types; if (pred_types.none()) { - return ir_builder.trueVal(); + return GpuLower::current()->kernel()->trueVal(); } - kir::Bool* pred = nullptr; - + Bool* pred = nullptr; for (const auto pt : pred_types) { const auto tp = getPredicatePerParallelType(pt, pred_info); - pred = ir_builder.andExpr(pred, tp)->as(); + pred = SimplifyingIrBuilder::andExpr(pred, tp)->as(); } - TORCH_INTERNAL_ASSERT(pred != nullptr); return pred; @@ -79,22 +72,44 @@ ParallelTypeBitmap avoidRedundantWrites(const TensorView* out_tv) { // If the memory type is Local, it's fine to write into it always as // it's thread local. If it's Global, it's also fine to let each // thread do its own write, unless out_tv is an output of a - // reduction. Reduction reads from and writes to the tensor, so the - // result would be incorrect if the buffer is shared by redundant - // threads. Correctness issues here come from smem aliasing or grid reductions - // because the reduction itself performs an update to a value, not just a set. 
- const bool is_reduction = out_tv->definition()->isA() || - out_tv->definition()->isA(); + // reduction. Standard reductions (forget gridReduce for the sake of this + // argument) directly into global memory buffers accumulate into the global + // memory buffer. If this is done redundantly then it could lead to incorrect + // results. Correctness issues here can come from smem aliasing, smem + // reductions or gmem reductions because the reduction itself performs an + // update to a value, not just a set. For performance it's safe to ommit the + // redundant writes to gmem or smem, this comment is just specifying it's not + // always just a performance optimization, but can also be a correctness + // requirement. + // + // For now this is enabled for shared memory buffers, global memory buffers + // undergoing a reduction, and global memory buffers with terminating outputs. + // This could be extended to all global memory buffer transactions, but in the + // test AdvancedIndexing11 there's a case where an intermediate global buffer + // is set and used to perform a broadcast. At the moment a grid sync is not + // being inserted here, and it's generally safe since it's just a set. We + // could enable this more generally for global memory buffers, but would have + // to insert a sync or a grid broadcast in that example. For now the + // approach is to only do this on a grid buffer (not undergoing a reduction) + // if there are no other uses in the kernel. + // + // TODO: Revisit if something like AdvancedIndexing11 could be happening at + // the same time of a global reduction in a way that could produce an + // incorrect result. + const bool is_reduction = ir_utils::isReductionOp(out_tv->definition()); if (!(out_tv->getMemoryType() == MemoryType::Shared || - (out_tv->getMemoryType() == MemoryType::Global && is_reduction))) { + (out_tv->getMemoryType() == MemoryType::Global && is_reduction) || + (out_tv->getMemoryType() == MemoryType::Global && + out_tv->uses().empty()))) { return ParallelTypeBitmap(); } + ParallelTypeBitmap pred; // Track which TID types are not used to find redundant parallel - // types. Only TID types are checked as the tensor is on shared - // memory. + // types. Only TID types are checked if the tensor is on shared + // memory otherwise on global memory all TID and BID types are checked. ParallelTypeBitmap unused_types; - // Initially all types are conservatively assumed to be used. + // Initially all types are conservatively assumed to not be used. unused_types = ~unused_types; for (auto out_tv_id : out_tv->domain()->domain()) { auto pt = out_tv_id->getParallelType(); @@ -104,8 +119,22 @@ ParallelTypeBitmap avoidRedundantWrites(const TensorView* out_tv) { // If the axis is a broadcast domain and is parallelized by TID, // it is sufficient to use just one thread since the tensor is on // shared memory. - if (out_tv->getMemoryType() == MemoryType::Shared && - out_tv_id->isBroadcast() && isParallelTypeThreadDim(pt)) { + if ((out_tv->getMemoryType() == MemoryType::Shared && + out_tv_id->isBroadcast() && isParallelTypeThreadDim(pt)) || + // Protect against global memory and is_reduction as we don't want to + // predicate grid dimensions as codegen will complain predication on + // block dimensions is not allowed in grid reductions. 
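// Illustrative standalone restatement (not code from this patch) of when the
// rewritten avoidRedundantWrites() computes redundant parallel types at all:
// shared-memory tensors, global tensors produced by a reduction, and global
// tensors with no further uses (terminating outputs). Everything else returns
// an empty bitmap. The enum and booleans are simplified stand-ins.
#include <cassert>

enum class MemoryType { Local, Shared, Global };

bool eligibleForRedundantWriteElision(MemoryType mem,
                                      bool defined_by_reduction,
                                      bool has_uses) {
  return mem == MemoryType::Shared ||
      (mem == MemoryType::Global && defined_by_reduction) ||
      (mem == MemoryType::Global && !has_uses);
}

int main() {
  // A plain global intermediate that is still consumed (e.g. the broadcast in
  // the AdvancedIndexing11 case mentioned above) is deliberately left alone.
  assert(!eligibleForRedundantWriteElision(MemoryType::Global, false, true));
  assert(eligibleForRedundantWriteElision(MemoryType::Global, true, true));
  assert(eligibleForRedundantWriteElision(MemoryType::Shared, false, true));
  assert(eligibleForRedundantWriteElision(MemoryType::Global, false, false));
}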
The old + // grid reduction runtime kernel does not differentiate + // non-reduction and predicated parallel types, so the sync + // integer buffer would need to be expanded even for + // predicated parallel types, which is not what + // getGridSyncBufferSize does. The right thing here is either: + // retire the old grid reduction kernel, or update the kernel + // to propertly ignore predicated types. The new kernel is + // significantly complex and has not been tested, so the + // latter option seems more reasonable for now. See #1671. + (!is_reduction && out_tv->getMemoryType() == MemoryType::Global && + out_tv_id->isBroadcast() && isParallelTypeThread(pt))) { pred.set(pt); } unused_types.clear(pt); @@ -138,7 +167,7 @@ ParallelTypeBitmap getReductionPredicateForUnusedParallelTypes( const TensorView* tv, const ThreadPredicateMap::PredicateInfo& pred_info) { auto tv_def = tv->definition(); - if (!(tv_def && (tv_def->isA() || tv_def->isA()) && + if (!(tv_def && ir_utils::isReductionOp(tv_def) && tv->getMemoryType() == MemoryType::Global)) { return {}; } @@ -153,6 +182,21 @@ ParallelTypeBitmap getReductionPredicateForUnusedParallelTypes( void ThreadPredicateMap::updateBitSet(const Expr* expr) { FUSER_PERF_SCOPE("GpuLower::Lower::ThreadPredicateMap::updateBitSet"); + // If all of the inputs are not updated and all of the outputs have + // already mappings, don't do anything + if (std::all_of( + ir_utils::filterByType(expr->inputs()).begin(), + ir_utils::filterByType(expr->inputs()).end(), + [this](TensorView* tv) { + return updated_tvs_.find(tv) == updated_tvs_.end(); + }) && + std::all_of( + ir_utils::filterByType(expr->outputs()).begin(), + ir_utils::filterByType(expr->outputs()).end(), + [this](TensorView* tv) { return find(tv) != end(); })) { + return; + } + // Which predicates were set for the inputs ParallelTypeBitmap input_preds; @@ -188,10 +232,13 @@ void ThreadPredicateMap::updateBitSet(const Expr* expr) { for (auto id : tv_inp->domain()->domain()) { if (id->isThread()) { id_ptypes.set(id->getParallelType()); - if (id->isReduction()) { + if (id->isReduction() && + !GpuLower::current()->fusedReductionInfo().isAllreduce(id)) { id_reductions.set(id->getParallelType()); } - if (id->isBroadcast()) { + if (id->isBroadcast() && + GpuLower::current()->concretizedBroadcastDomains().isConcretized( + id)) { id_bcasts.set(id->getParallelType()); } } @@ -233,9 +280,8 @@ void ThreadPredicateMap::updateBitSet(const Expr* expr) { // Run through outputs and set bitset predicates for (auto* out_tv : ir_utils::filterByType(expr->outputs())) { - TORCH_INTERNAL_ASSERT(find(out_tv) == end()); auto redundant_types = avoidRedundantWrites(out_tv); - insert(out_tv, output_preds, redundant_types); + update(out_tv, output_preds, redundant_types); } } @@ -245,12 +291,13 @@ void ThreadPredicateMap::build(Fusion* fusion) { // Initialize mapping for input tensors for (auto inp : fusion->inputs()) { if (auto tv = dynamic_cast(inp)) { - insert(tv, ParallelTypeBitmap(), ParallelTypeBitmap()); + update(tv, ParallelTypeBitmap(), ParallelTypeBitmap()); } } for (auto expr : fusion->exprs()) { updateBitSet(expr); } + updated_tvs_.clear(); } ThreadPredicateMap::const_iterator ThreadPredicateMap::find( @@ -289,20 +336,34 @@ ParallelTypeBitmap ThreadPredicateMap::getPredicatedParallelTypes( return pred_info.limited_types | pred_info.redundant_types; } -void ThreadPredicateMap::insert( +bool ThreadPredicateMap::update( const TensorView* tv, - const ParallelTypeBitmap& valid_types, + const ParallelTypeBitmap& limited_types, 
const ParallelTypeBitmap& redundant_types) { - insert(tv, {valid_types, redundant_types}); + return update(tv, {limited_types, redundant_types}); } -void ThreadPredicateMap::insert( +bool ThreadPredicateMap::update( const TensorView* tv, const PredicateInfo& pred_info) { - thread_predicates_.insert({tv, pred_info}); + auto existing_mapping_it = thread_predicates_.find(tv); + if (existing_mapping_it != end()) { + PredicateInfo& existing_info = existing_mapping_it->second; + if (existing_info == pred_info) { + return false; + } else { + existing_info = pred_info; + markAsUpdated(tv); + return true; + } + } else { + thread_predicates_.insert({tv, pred_info}); + markAsUpdated(tv); + return true; + } } -kir::Bool* ThreadPredicateMap::getPredicate(const TensorView* tv) const { +Bool* ThreadPredicateMap::getPredicate(const TensorView* tv) const { TORCH_INTERNAL_ASSERT(find(tv) != end(), "Couldn't find ", tv); auto pred_info = getPredicateInfo(tv); return getPredicateFromPredicateInfo(pred_info); @@ -326,7 +387,8 @@ ParallelTypeBitmap ThreadPredicateMap::getParallelBroadcastDomains( const bool output_smem = tv->getMemoryType() == MemoryType::Shared; for (auto id : iter_domains) { - if (!id->isBroadcast()) { + if (!id->isBroadcast() || + !GpuLower::current()->concretizedBroadcastDomains().isConcretized(id)) { continue; } if (id->isBlockDim() || (!output_smem && id->isThreadDim())) { @@ -337,6 +399,10 @@ ParallelTypeBitmap ThreadPredicateMap::getParallelBroadcastDomains( return parallel_broadcast & at(tv).limited_types; } +void ThreadPredicateMap::markAsUpdated(const TensorView* tv) { + updated_tvs_.insert(tv); +} + void ThreadPredicateMap::print() const { std::cout << "\nThreadPredicateMap\n"; std::cout << "--------------------------------\n"; diff --git a/torch/csrc/jit/codegen/cuda/lower_thread_predicate.h b/torch/csrc/jit/codegen/cuda/lower_thread_predicate.h index 256e0385aeb1..2fb115953c6e 100644 --- a/torch/csrc/jit/codegen/cuda/lower_thread_predicate.h +++ b/torch/csrc/jit/codegen/cuda/lower_thread_predicate.h @@ -1,7 +1,7 @@ #pragma once -#include +#include #include #include @@ -48,6 +48,10 @@ class TORCH_CUDA_CU_API ThreadPredicateMap { ParallelTypeBitmap limited_types; // Parallel types where only one thread/block is enough. ParallelTypeBitmap redundant_types; + bool operator==(const PredicateInfo& other) const { + return limited_types == other.limited_types && + redundant_types == other.redundant_types; + } }; using MapType = std::unordered_map; @@ -69,7 +73,7 @@ class TORCH_CUDA_CU_API ThreadPredicateMap { ParallelTypeBitmap getPredicatedParallelTypes(const TensorView* tv) const; //! Returns a Bool predicate for a given TensorView. - kir::Bool* getPredicate(const TensorView* tv) const; + Bool* getPredicate(const TensorView* tv) const; //! Returns a ParallelTypeBitmap representing which domain needs //! blockBroadcast. @@ -78,10 +82,14 @@ class TORCH_CUDA_CU_API ThreadPredicateMap { //! blockBroadcast unless it is predicated by limited_types_ ParallelTypeBitmap getParallelBroadcastDomains(const TensorView* tv) const; + //! Mark tv as updated so that rebuilding the map should recompute + //! its predicates and those of its dependents. + void markAsUpdated(const TensorView* tv); + void print() const; //! Generate a Bool value from PredicateInfo. 
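// Illustrative standalone sketch (not code from this patch) of the insert ->
// update change above: the map now reports whether an entry actually changed
// and remembers changed keys, so a later rebuild can skip expressions whose
// inputs are untouched. Generic key/value types stand in for TensorView and
// PredicateInfo.
#include <cassert>
#include <string>
#include <unordered_map>
#include <unordered_set>

template <typename K, typename V>
class IncrementalMap {
 public:
  // Returns true only when the stored value was inserted or modified.
  bool update(const K& key, const V& value) {
    auto it = map_.find(key);
    if (it != map_.end() && it->second == value) {
      return false;  // nothing changed; dependents need no recomputation
    }
    map_[key] = value;
    updated_.insert(key);
    return true;
  }
  bool wasUpdated(const K& key) const { return updated_.count(key) != 0; }
  void clearUpdated() { updated_.clear(); }

 private:
  std::unordered_map<K, V> map_;
  std::unordered_set<K> updated_;
};

int main() {
  IncrementalMap<std::string, int> preds;
  assert(preds.update("tv0", 1));   // new entry -> changed
  assert(!preds.update("tv0", 1));  // same value -> no work for dependents
  assert(preds.update("tv0", 2));   // value changed -> dependents recompute
  assert(preds.wasUpdated("tv0"));
}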
- static kir::Bool* getPredicateFromPredicateInfo( + static Bool* getPredicateFromPredicateInfo( const ThreadPredicateMap::PredicateInfo& pred_info); private: @@ -94,17 +102,19 @@ class TORCH_CUDA_CU_API ThreadPredicateMap { const PredicateInfo& at(const TensorView* tv) const; PredicateInfo& at(const TensorView* tv); - //! Insert a new mapping - void insert( + //! Update a mapping + bool update( const TensorView* tv, - const ParallelTypeBitmap& valid_types, + const ParallelTypeBitmap& limited_types, const ParallelTypeBitmap& redundant_types); - //! Insert a new mapping - void insert(const TensorView* tv, const PredicateInfo& pred_and_src); + //! Update a mapping + bool update(const TensorView* tv, const PredicateInfo& pred_and_src); private: MapType thread_predicates_; + //! Keep track of updated tensors that need predicates to be computed + std::unordered_set updated_tvs_; }; } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.cpp b/torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.cpp new file mode 100644 index 000000000000..ab62530591ab --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.cpp @@ -0,0 +1,119 @@ +#include +#include +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +void ConcretizedBroadcastDomains::build(Fusion* fusion) { + // Initialize the origin map with input broadcast domains + for (const auto fusion_input_tv : + ir_utils::filterByType(fusion->inputs())) { + for (auto root_id : fusion_input_tv->getRootDomain()) { + if (root_id->isBroadcast()) { + broadcast_origin_map_.emplace( + root_id, std::unordered_set({root_id})); + } + } + } + traverse(fusion); +} + +bool ConcretizedBroadcastDomains::isConcretized(IterDomain* id) const { + auto it = concretized_domains_.find(id); + return it != concretized_domains_.end(); +} + +void ConcretizedBroadcastDomains::handle(BroadcastOp* bop) { + // Create a new entry for each of new broadcast domains + auto out = bop->out()->as(); + for (const auto i : c10::irange(out->getRootDomain().size())) { + if (bop->getBroadcastDimFlags().at(i)) { + auto new_bcast_id = out->getRootDomain().at(i); + broadcast_origin_map_.emplace( + new_bcast_id, std::unordered_set({new_bcast_id})); + } + } +} + +void ConcretizedBroadcastDomains::handle(Expr* expr) { + IterVisitor::handle(expr); + + // Propagate broadcast origin info from producers to consumers + for (auto producer : ir_utils::filterByType(expr->inputs())) { + std::unordered_set producer_broadcasts; + // This assumes there's no merged broadcast axes between root and rfactor + // domains which is not possible at the moment. If this assumption is ever + // invalidated we would need to manaually propagate root IDs to rfactor IDs. 
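// Illustrative standalone sketch (not code from this patch) of the broadcast
// concretization bookkeeping in the new ConcretizedBroadcastDomains pass
// around this point: every broadcast domain carries the set of root broadcast
// domains it came from, and when it maps to a non-broadcast consumer domain
// all of those origins are marked concretized. Plain strings and sets stand in
// for IterDomains and the IR maps.
#include <cassert>
#include <set>
#include <string>
#include <unordered_map>

using Id = std::string;
using IdSet = std::set<Id>;

struct ConcretizationSketch {
  std::unordered_map<Id, IdSet> origin;  // broadcast domain -> root broadcasts
  IdSet concretized;

  void propagate(const Id& producer_id, const Id& consumer_id,
                 bool consumer_is_broadcast) {
    const IdSet& producer_origins = origin.at(producer_id);
    if (!consumer_is_broadcast) {
      // Concretized: every origin root broadcast is resolved here.
      concretized.insert(producer_origins.begin(), producer_origins.end());
    } else {
      // Still a broadcast: forward the origin info to the consumer domain.
      IdSet& consumer_origins = origin[consumer_id];
      consumer_origins.insert(producer_origins.begin(), producer_origins.end());
      consumer_origins.insert(consumer_id);
    }
  }
};

int main() {
  ConcretizationSketch s;
  s.origin["b0"] = {"b0"};  // broadcast introduced on a fusion input
  s.propagate("b0", "b1", /*consumer_is_broadcast=*/true);   // still broadcast
  s.propagate("b1", "i0", /*consumer_is_broadcast=*/false);  // resolved later
  assert(s.concretized.count("b0") == 1);
}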
+ for (auto producer_id : producer->getMaybeRFactorDomain()) { + if (producer_id->isBroadcast()) { + producer_broadcasts.insert(producer_id); + } + } + if (producer_broadcasts.empty()) { + continue; + } + + for (auto consumer : ir_utils::filterByType(expr->outputs())) { + auto p2c_map = + PairwiseRootDomainMap(producer, consumer) + .mapProducerToConsumer( + producer->domain(), consumer->domain(), producer_broadcasts); + for (const auto& kv : p2c_map) { + auto p_id = kv.first; + auto c_id = kv.second; + const bool is_concretized = !c_id->isBroadcast(); + auto it = broadcast_origin_map_.find(p_id); + TORCH_INTERNAL_ASSERT( + it != broadcast_origin_map_.end(), + "Broadcast origin info not found for producer broadcast domain: ", + p_id->toString(), + " of ", + producer->toString()); + const auto& producer_origins = it->second; + if (is_concretized) { + // Keep track of all the origin domains as concretized + for (auto origin : producer_origins) { + // concretized_root_domains_.insert(origin); + markAsConcretized(origin); + } + } else { + // Not concretized yet. Propagate forward the origin info. + auto& consumer_origins = broadcast_origin_map_[c_id]; + for (auto origin : producer_origins) { + consumer_origins.insert(origin); + } + consumer_origins.insert(c_id); + } + } + } + } +} + +void ConcretizedBroadcastDomains::markAsConcretized(IterDomain* root_domain) { + std::deque child_domains({root_domain}); + while (!child_domains.empty()) { + auto child = child_domains.front(); + child_domains.pop_front(); + if (!concretized_domains_.emplace(child).second) { + continue; + } + const auto& child_uses = child->uses(); + for (auto child_use : child_uses) { + for (auto out_id : + ir_utils::filterByType(child_use->outputs())) { + child_domains.push_back(out_id); + } + } + } +} + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.h b/torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.h new file mode 100644 index 000000000000..9dd50e8afc1d --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.h @@ -0,0 +1,51 @@ +#pragma once + +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +//! Traverse and collect all concretized broadcast domains. +//! +//! The traversal first initializes the origin map with broadcast +//! domains in input tensors. Then, a new entry is added to the origin +//! map when a broadcast op is encountered during a forward traversal +//! of the given fusion. For non-broadcast ops, mappings are just +//! propagated forward using PairwiseRootDomainMap. +//! +//! When the mapped consumer domain is not broadcast, it means the +//! producer broadcast domain is concretized, and its origin broadcast +//! domains are marked as concretized. +class TORCH_CUDA_CU_API ConcretizedBroadcastDomains : private IterVisitor { + public: + void build(Fusion* fusion); + + bool isConcretized(IterDomain* id) const; + + private: + using IterVisitor::handle; + + void handle(BroadcastOp* bop) final; + + void handle(Expr* expr) final; + + void markAsConcretized(IterDomain* root_domain); + + private: + //! Maps each broadcast domain to its original broadcast + //! domains. Their can be multiple original domains due to, e.g., + //! binary ops with broadcast domains in both inputs. + std::unordered_map> + broadcast_origin_map_; + //! 
Set of all concretized original domains + std::unordered_set concretized_domains_; +}; + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_trivial_reductions.cpp b/torch/csrc/jit/codegen/cuda/lower_trivial_reductions.cpp index 33651785d43c..9922b243e4ee 100644 --- a/torch/csrc/jit/codegen/cuda/lower_trivial_reductions.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_trivial_reductions.cpp @@ -18,6 +18,7 @@ namespace { bool analyzeIfDerivedFromTrivialReduction(TensorView* tv, IterDomain* id); +// Checks the producer of tv to see if the bool traverseToRFactorTensor(TensorView* tv, IterDomain* root_id) { TORCH_INTERNAL_ASSERT( root_id->definition() == nullptr, "Not root IterDomain: ", root_id); @@ -29,6 +30,7 @@ bool traverseToRFactorTensor(TensorView* tv, IterDomain* root_id) { const auto& inputs = tv->definition()->inputs(); + // Check the reduction expression that produces tv if (inputs.size() != 1 || !inputs[0]->isA() || (tv->definition()->getExprType() != ExprType::ReductionOp && tv->definition()->getExprType() != ExprType::WelfordOp)) { @@ -63,8 +65,10 @@ bool analyzeIfDerivedFromTrivialReduction(TensorView* tv, IterDomain* id) { continue; } // If not possible to prove the root ID is trivial, see if the ID - // is derived from a rfactor tensor and, if so, continue the - // analysis at the rfactor tensor. + // is derived from a rfactor tensor. This may mean that the iteration domain + // was merged or split in another expression through rfactor. Trace back + // through rfactor expressions to find original roots and determine there if + // trivial. if (!traverseToRFactorTensor(tv, root_id)) { return false; } @@ -74,7 +78,7 @@ bool analyzeIfDerivedFromTrivialReduction(TensorView* tv, IterDomain* id) { } // namespace -void TrivialReductionInfo::build(Fusion* fusion, GpuLower* gpu_lower) { +void TrivialReductionInfo::build(Fusion* fusion) { auto used_vals = fusion->usedMathVals(); for (auto tv : ir_utils::filterByType(used_vals)) { @@ -99,20 +103,6 @@ void TrivialReductionInfo::build(Fusion* fusion, GpuLower* gpu_lower) { } } } - - buildKir(fusion, gpu_lower); -} - -void TrivialReductionInfo::buildKir(Fusion* fusion, GpuLower* gpu_lower) { - for (auto id : domains_) { - auto kir_trivial_id = gpu_lower->lowerValue(id)->as(); - kir_domains_.insert(kir_trivial_id); - } - - for (auto id : domains_derived_from_root_) { - auto kir_trivial_id = gpu_lower->lowerValue(id)->as(); - kir_domains_derived_from_root_.insert(kir_trivial_id); - } } bool TrivialReductionInfo::isDerived(IterDomain* id) const { @@ -124,15 +114,6 @@ bool TrivialReductionInfo::isDerivedFromRoot(IterDomain* id) const { domains_derived_from_root_.end(); } -bool TrivialReductionInfo::isDerived(kir::IterDomain* id) const { - return kir_domains_.find(id) != kir_domains_.end(); -} - -bool TrivialReductionInfo::isDerivedFromRoot(kir::IterDomain* id) const { - return kir_domains_derived_from_root_.find(id) != - kir_domains_derived_from_root_.end(); -} - } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/lower_trivial_reductions.h b/torch/csrc/jit/codegen/cuda/lower_trivial_reductions.h index c16439ed4f03..655d64a04179 100644 --- a/torch/csrc/jit/codegen/cuda/lower_trivial_reductions.h +++ b/torch/csrc/jit/codegen/cuda/lower_trivial_reductions.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -13,23 +13,16 @@ namespace jit { namespace fuser { namespace cuda { -class GpuLower; - //! 
Detect almost all IterDomains that are derived from trivial //! reductons. class TORCH_CUDA_CU_API TrivialReductionInfo { public: - void build(Fusion* fusion, GpuLower* gpu_lower); + void build(Fusion* fusion); bool isDerived(IterDomain* id) const; - bool isDerivedFromRoot(IterDomain* id) const; - - bool isDerived(kir::IterDomain* id) const; - bool isDerivedFromRoot(kir::IterDomain* id) const; - private: - //! Convert the sets to KIR sets - void buildKir(Fusion* fusion, GpuLower* gpu_lower); + // TODO: Not used, cleanup + bool isDerivedFromRoot(IterDomain* id) const; private: //! IterDomains that are derived only from trivial @@ -48,9 +41,6 @@ class TORCH_CUDA_CU_API TrivialReductionInfo { //! trivial reductions. These domains do not need to manifest as //! for-loops. std::unordered_set domains_derived_from_root_; - - std::unordered_set kir_domains_; - std::unordered_set kir_domains_derived_from_root_; }; } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/lower_unroll.cpp b/torch/csrc/jit/codegen/cuda/lower_unroll.cpp index 08f91ba59bd7..434d1711d9c8 100644 --- a/torch/csrc/jit/codegen/cuda/lower_unroll.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_unroll.cpp @@ -6,8 +6,6 @@ #include #include #include -#include -#include #include #include #include @@ -22,8 +20,7 @@ namespace { // Provide a new for loop matching the one provided kir::ForLoop* cloneLoopNest(const kir::ForLoop* for_loop) { - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - const auto new_loop = ir_builder.create(for_loop); + const auto new_loop = IrBuilder::create(for_loop); for (auto expr : for_loop->body().exprs()) { if (auto nested_for_loop = dynamic_cast(expr)) { expr = cloneLoopNest(nested_for_loop); @@ -35,20 +32,20 @@ kir::ForLoop* cloneLoopNest(const kir::ForLoop* for_loop) { // Returns true if expr is an expression that initializes a reduction // buffer. -bool isReductionInitExpr(const kir::Expr* expr) { +bool isReductionInitExpr(const Expr* expr) { // False if its output isn't a TensorView - if (!ir_utils::isTVOp(expr)) { + if (!ir_utils::isTvOp(expr)) { return false; } // False if it doesn't have any reduction axis - const auto out_tv = expr->outputs()[0]->as(); + const auto out_tv = expr->outputs()[0]->as(); if (!out_tv->domain()->hasReduction()) { return false; } // False if it has have TensorView inputs as initialization should // never use TensorViews const auto tv_filter_inp_view = - ir_utils::filterByType(expr->inputs()); + ir_utils::filterByType(expr->inputs()); if (tv_filter_inp_view.begin() != tv_filter_inp_view.end()) { return false; } @@ -57,28 +54,27 @@ bool isReductionInitExpr(const kir::Expr* expr) { } // namespace -void UnrollPass::handle(kir::Expr* expr) { - if (ir_utils::isTVOp(expr)) { +void UnrollPass::handle(Expr* expr) { + if (ir_utils::isTvOp(expr)) { // If tv op, predicate it - const auto out_tv = ir_utils::getTVOutput(expr); + const auto out_tv = ir_utils::getTvOutput(expr); const bool should_predicate = !for_loops_.empty() || - out_tv->memoryType() == MemoryType::Global || - out_tv->memoryType() == MemoryType::Shared; + out_tv->getMemoryType() == MemoryType::Global || + out_tv->getMemoryType() == MemoryType::Shared; if (!should_predicate) { return; } - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); const auto thread_pred = isReductionInitExpr(expr) - ? ir_builder.trueVal() - : GpuLower::current()->threadPredMap().getPredicate(out_tv->fuserTv()); + ? 
GpuLower::current()->kernel()->trueVal() + : GpuLower::current()->threadPredMap().getPredicate(out_tv); // When this expr is in an unswitched block, only attach the // thread predicate to the expr as thread predicates are not // grouped to the unswitch predicate. kir::Predicate* thread_pred_expr = nullptr; if (unswitched_loop_) { - thread_pred_expr = ir_builder.create(thread_pred); + thread_pred_expr = IrBuilder::create(thread_pred); } non_trivial_pred_found_ = true; @@ -95,7 +91,7 @@ void UnrollPass::handle(kir::Expr* expr) { if (!isReductionInitExpr(expr) && out_tv->domain()->hasReduction()) { const auto write_pred = unswitched_loop_ ? thread_pred_expr - : ir_builder.create( + : IrBuilder::create( PredicateType::ReductionWrite, expr, thread_pred); expr->setWritePredicate(write_pred); } @@ -105,7 +101,7 @@ void UnrollPass::handle(kir::Expr* expr) { if (ir_utils::hasBlockSync(expr, GpuLower::current()->threadPredMap())) { const auto pred = unswitched_loop_ ? thread_pred_expr - : ir_builder.create( + : IrBuilder::create( PredicateType::Inline, expr, thread_pred); expr->setPredicate(pred); return; @@ -116,28 +112,28 @@ void UnrollPass::handle(kir::Expr* expr) { if (!unswitched_loop_ && std::any_of( for_loops_.begin(), for_loops_.end(), [](const kir::ForLoop* fl) { - return fl->iter_domain()->parallelType() == + return fl->iter_domain()->getParallelType() == ParallelType::Vectorize; })) { - pred = ir_builder.create(PredicateType::Vectorize); + pred = IrBuilder::create(PredicateType::Vectorize); } if (pred == nullptr) { pred = unswitched_loop_ ? thread_pred_expr - : ir_builder.create( + : IrBuilder::create( PredicateType::Inline, expr, thread_pred); } // If we need a predicate, put expr inside an if then else - kir::IfThenElse* inline_ite = ir_builder.create(pred); + kir::IfThenElse* inline_ite = IrBuilder::create(pred); if (for_loops_.empty()) { // Special handling for top level output expressions that still // need predicates. One motivating example is a reduction op that // reduces to a scalar (issue #491) - expr_replacement_map_.insert({expr, inline_ite}); + kir::ExprMutator::registerReplace(expr, inline_ite, nullptr); } else { - for_loops_.back()->body().insert_before(expr, inline_ite); - for_loops_.back()->body().erase(expr); + kir::ExprMutator::registerReplace( + expr, inline_ite, &for_loops_.back()->body()); } inline_ite->thenBody().push_back(expr); } else if (auto for_loop = dynamic_cast(expr)) { @@ -150,8 +146,8 @@ void UnrollPass::handle(kir::Expr* expr) { void UnrollPass::handle(kir::ForLoop* fl) { // Setup for loop scoping const bool is_unroll = - fl->iter_domain()->parallelType() == ParallelType::Unroll || - fl->iter_domain()->parallelType() == ParallelType::Unswitch; + fl->iter_domain()->getParallelType() == ParallelType::Unroll || + fl->iter_domain()->getParallelType() == ParallelType::Unswitch; // If we're not looking for an unroll loop, or didn't find one, process as // normal. 
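// Illustrative sketch, not part of the patch above. This hunk moves UnrollPass
// from hand-maintaining an expr -> replacement map to the kir::ExprMutator
// registerReplace() interface. The toy model below shows one way such a
// deferred-replacement interface can work: replacements are only recorded
// while the expression list is walked and applied afterwards, so the list is
// never mutated mid-traversal. Whether the real pass applies changes per scope
// or after the whole traversal is not shown in this hunk, and none of the
// names below are nvfuser APIs.

#include <unordered_map>
#include <vector>

struct ToyExpr {};

class ToyExprMutator {
 public:
  // Traverse, let handle() record replacements, then apply them in one pass.
  std::vector<ToyExpr*> traverseAndMutate(std::vector<ToyExpr*> exprs) {
    for (ToyExpr* e : exprs) {
      handle(e);
    }
    for (ToyExpr*& e : exprs) {
      auto it = replacements_.find(e);
      if (it != replacements_.end()) {
        e = it->second;
      }
    }
    return exprs;
  }

 protected:
  virtual void handle(ToyExpr*) {}

  // Analogous in spirit to registerReplace(old, new, scope) used above,
  // minus the scope bookkeeping.
  void registerReplace(ToyExpr* old_expr, ToyExpr* new_expr) {
    replacements_[old_expr] = new_expr;
  }

 private:
  std::unordered_map<ToyExpr*, ToyExpr*> replacements_;
};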
@@ -172,10 +168,9 @@ void UnrollPass::handle(kir::ForLoop* fl) { return; } - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - auto unroll_pred = ir_builder.create(fl); + auto unroll_pred = IrBuilder::create(fl); - kir::IfThenElse* unroll_ite = ir_builder.create(unroll_pred); + kir::IfThenElse* unroll_ite = IrBuilder::create(unroll_pred); // Get the loop nest for the unrolled path kir::ForLoop* unrolled_loop_nest = cloneLoopNest(fl); @@ -199,12 +194,18 @@ void UnrollPass::handle(kir::ForLoop* fl) { handle(inlined_loop); look_for_unroll_ = true; if (!non_trivial_pred_found_) { - expr_replacement_map_.insert({fl, inlined_loop}); + kir::ExprMutator::registerReplace( + fl, + inlined_loop, + for_loops_.empty() ? nullptr : &for_loops_.back()->body()); } else { if (!canOmitElseClause(fl)) { unroll_ite->elseBody().push_back(inlined_loop); } - expr_replacement_map_.insert({fl, unroll_ite}); + kir::ExprMutator::registerReplace( + fl, + unroll_ite, + for_loops_.empty() ? nullptr : &for_loops_.back()->body()); } } @@ -221,31 +222,22 @@ bool UnrollPass::canOmitElseClause(kir::ForLoop* fl) { // If there's any expression that requires barrier // synchronization, the else part can't be omitted for (auto expr : loop->body().exprs()) { - if (expr->isA()) { - const ParallelTypeBitmap domains = pred_map.getParallelBroadcastDomains( - expr->outputs()[0]->as()->fuserTv()); - if (domains.any()) { - return false; - } - } else if (expr->isA() || expr->isA()) { - auto td = ir_utils::getTVOutput(expr)->domain(); - if (td->hasBlockReduction() || td->hasGridReduction()) { - return false; - } + if (ir_utils::hasBlockSync(expr, pred_map)) { + return false; } } // If the number of visits of the loop body per thread is one, the // unswitch predicate is sufficient. // When the loop stop is the same as the extent of its IterDomain, // the per-thread visit count is guaranteed to be one at most (see - // CudaKernelGenerator::visit(kir::ForLoop*) as well. Also, when a + // CudaKernelGenerator::handle(kir::ForLoop*) as well. Also, when a // loop is vectorized (not misaligned), the count must be one at // most. Even if not parallelized nor vectoirzed, it is also // sufficient if the loop stop is in fact one. 
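// Illustrative sketch, not from the patch: the "visit once" criterion
// described in the canOmitElseClause comment above can be restated as a
// standalone predicate over plain booleans (stand-in names, not nvfuser
// APIs). The else clause of the unswitched section can be omitted only when
// each loop body is guaranteed to run at most once per thread.
inline bool visitsBodyAtMostOncePerThread(
    bool id_is_thread_parallel, // loop domain bound to threadIdx/blockIdx
    bool stop_equals_extent, // loop stop == extent of its IterDomain
    bool id_is_vectorized, // ParallelType::Vectorize (not misaligned)
    bool stop_is_one) { // loop stop is literally one
  return (id_is_thread_parallel && stop_equals_extent) || id_is_vectorized ||
      stop_is_one;
}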
bool visit_once = false; auto id = loop->iter_domain(); if ((id->isThread() && (loop->stop() == id->extent())) || - id->parallelType() == ParallelType::Vectorize) { + id->getParallelType() == ParallelType::Vectorize) { visit_once = true; } if (!visit_once) { @@ -273,30 +265,18 @@ bool UnrollPass::canOmitElseClause(kir::ForLoop* fl) { } // Generate the loop nest structure and place it in lowered_exprs -UnrollPass::UnrollPass(const std::vector& exprs) { +UnrollPass::UnrollPass(const std::vector& exprs) { FUSER_PERF_SCOPE("GpuLower::Lower::UnrollPass::computeMap"); - - // Run through loop nests and further lower the expressions - for (auto* expr : exprs) { - handle(expr); - } + kir::ExprMutator::traverseAndInsert(exprs); } -std::vector UnrollPass::runPass( +std::vector UnrollPass::runPass( Fusion* fusion, - const std::vector& exprs) { + const std::vector& exprs) { FUSER_PERF_SCOPE("GpuLower::Lower::UnrollPass::runPass"); UnrollPass unroll_pass(exprs); - - std::vector mutated_exprs; - mutated_exprs.reserve(exprs.size()); - for (auto expr : exprs) { - mutated_exprs.push_back( - ir_utils::applyReplacements(unroll_pass.replacementMap(), expr)); - } - - return mutated_exprs; + return unroll_pass.exprs_; } } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/lower_unroll.h b/torch/csrc/jit/codegen/cuda/lower_unroll.h index bec4966dd946..14725c405b77 100644 --- a/torch/csrc/jit/codegen/cuda/lower_unroll.h +++ b/torch/csrc/jit/codegen/cuda/lower_unroll.h @@ -1,7 +1,8 @@ #pragma once -#include +#include #include +#include #include #include #include @@ -51,33 +52,32 @@ namespace cuda { //! predicate still in the inner most loop, making sure that we cover edges and //! corners. //! -class TORCH_CUDA_CU_API UnrollPass { +class TORCH_CUDA_CU_API UnrollPass : kir::ExprMutator { public: // Take the incoming exprs and run loop unrolling, returning the new IR - static std::vector runPass( + static std::vector runPass( Fusion* fusion, - const std::vector& exprs); + const std::vector& exprs); static bool canOmitElseClause(kir::ForLoop* fl); private: // Generate the for Expr replacement map - UnrollPass(const std::vector& exprs); + UnrollPass(const std::vector& exprs); - const std::unordered_map& replacementMap() const { + const std::unordered_map& replacementMap() const { return expr_replacement_map_; } - void handle(kir::ForLoop* fl); + using OptOutDispatch::handle; - void handle(kir::Expr* expr); + void handle(kir::ForLoop* fl) final; + + void handle(Expr* expr) final; private: // We will track which loops in the incoming IR will be replaced and by what - std::unordered_map expr_replacement_map_; - - // Keep all for loops conveniently to make unrolling easier - std::vector for_loops_; + std::unordered_map expr_replacement_map_; // keep track if we're within an unrolled loop bool look_for_unroll_ = true; diff --git a/torch/csrc/jit/codegen/cuda/lower_utils.cpp b/torch/csrc/jit/codegen/cuda/lower_utils.cpp index 5d015c450d9f..620d38fd04b5 100644 --- a/torch/csrc/jit/codegen/cuda/lower_utils.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_utils.cpp @@ -6,8 +6,7 @@ #include #include #include -#include -#include +#include #include #include #include @@ -23,38 +22,14 @@ namespace cuda { namespace scope_utils { -std::vector getLoops(kir::Expr* scope) { - std::vector loops; - while (scope != nullptr) { - if (auto loop = dynamic_cast(scope)) { - loops.push_back(loop); - } - scope = scope->parentScope(); - } - std::reverse(loops.begin(), loops.end()); - return loops; -} - -void insertBefore(kir::Expr* scope, 
kir::Expr* ref, kir::Expr* expr) { - if (auto ite = dynamic_cast(scope)) { - ite->thenBody().insert_before(ref, expr); - } else if (auto for_loop = dynamic_cast(scope)) { - for_loop->body().insert_before(ref, expr); - } else { - TORCH_INTERNAL_ASSERT(false, "Unexpected scope expression"); - } -} - //! Create an **empty** Forloop and copy the metadata. -kir::ForLoop* cloneForLoop(kir::IrBuilder& ir_builder, kir::ForLoop* for_loop) { - return ir_builder.create(for_loop); +kir::ForLoop* cloneForLoop(kir::ForLoop* for_loop) { + return IrBuilder::create(for_loop); } //! Create an **empty** IfThenElse and copy the metadata. -kir::IfThenElse* cloneIfThenElse( - kir::IrBuilder& ir_builder, - kir::IfThenElse* ite) { - return ir_builder.create(ite->predicate()); +kir::IfThenElse* cloneIfThenElse(kir::IfThenElse* ite) { + return IrBuilder::create(ite->predicate()); } } // namespace scope_utils @@ -103,46 +78,53 @@ std::vector iterDomainInputsOfOrderedAs( } bool isTV(const Val* val) { - return val->getValType().value() == ValType::TensorView; + return val->getValType().value() == ValType::TensorView || + val->getValType().value() == ValType::TensorIndex; } // Check if we're a TensorView op that we can generate code for. -bool isTVOp(const Expr* expr) { +bool isTvOp(const Expr* expr) { if (std::any_of( expr->outputs().begin(), expr->outputs().end(), [](Val* v) { return isTV(v); }) && - (expr->getExprType().value() == ExprType::BinaryOp || - expr->getExprType().value() == ExprType::UnaryOp || + (expr->getExprType().value() == ExprType::UnaryOp || + expr->getExprType().value() == ExprType::BinaryOp || expr->getExprType().value() == ExprType::TernaryOp || expr->getExprType().value() == ExprType::ReductionOp || + expr->getExprType().value() == ExprType::GroupedReductionOp || expr->getExprType().value() == ExprType::WelfordOp || + expr->getExprType().value() == ExprType::MmaOp || expr->getExprType().value() == ExprType::BroadcastOp || expr->getExprType().value() == ExprType::TransposeOp || expr->getExprType().value() == ExprType::ShiftOp || expr->getExprType().value() == ExprType::GatherOp || - expr->getExprType().value() == ExprType::ViewOp)) { + expr->getExprType().value() == ExprType::ViewAsScalar || + expr->getExprType().value() == ExprType::ViewOp || + expr->getExprType().value() == ExprType::GridReduction || + expr->getExprType().value() == ExprType::GridBroadcast || + expr->getExprType().value() == ExprType::GridWelford)) { return true; } return false; } -bool isTVOp(const kir::Expr* expr) { - const auto& outputs = expr->outputs(); - return outputs.size() >= 1 && outputs[0]->isA(); +TensorView* getTv(Val* val) { + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) + return const_cast(getTv(const_cast(val))); } -kir::TensorView* getTv(kir::Val* val) { - if (auto tv = dynamic_cast(val)) { - return tv; - } else if (auto ti = dynamic_cast(val)) { - return ti->view(); +const TensorView* getTv(const Val* val) { + if (val->isA()) { + return val->as(); + } else if (val->isA()) { + return val->as()->view(); } return nullptr; } -std::vector getTvs(const std::vector& vals) { - std::vector tvs; +std::vector getTvs(const std::vector& vals) { + std::vector tvs; for (auto val : vals) { auto tv = ir_utils::getTv(val); if (tv) { @@ -152,32 +134,7 @@ std::vector getTvs(const std::vector& vals) { return tvs; } -kir::TensorView* asTv(kir::Val* val) { - auto tv = getTv(val); - TORCH_INTERNAL_ASSERT(tv != nullptr, "Neigher TensorView nor TensorIndex"); - return tv; -} - -std::vector asTvs(const std::vector vals) { - 
std::vector tvs; - for (auto val : vals) { - auto tv = ir_utils::asTv(val); - tvs.emplace_back(tv); - } - return tvs; -} - -// TODO: why do we assume there's a single TV output? -TensorView* getTVOutput(const Expr* expr) { - for (auto out : expr->outputs()) { - if (out->getValType().value() == ValType::TensorView) { - return out->as(); - } - } - return nullptr; -} - -kir::TensorView* getTVOutput(const kir::Expr* expr) { +TensorView* getTvOutput(const Expr* expr) { for (auto out : expr->outputs()) { if (auto tv = getTv(out)) { return tv; @@ -186,6 +143,16 @@ kir::TensorView* getTVOutput(const kir::Expr* expr) { return nullptr; } +bool isReductionOp(const Expr* expr) { + // Note that GridReduction inherits ReductionOp + return expr->isA() || expr->isA() || + expr->isA() || expr->isA(); +} + +bool isReductionTvOp(const Expr* expr) { + return isTvOp(expr) && isReductionOp(expr); +} + bool isScalarOp(const Expr* expr) { for (auto out : expr->outputs()) if (!out->isScalar()) @@ -193,25 +160,21 @@ bool isScalarOp(const Expr* expr) { return true; } -Expr* asExpr(Statement* stmt) { - TORCH_INTERNAL_ASSERT(stmt->isExpr()); - return stmt->as(); -} - -TensorView* asTV(Val* val) { - TORCH_INTERNAL_ASSERT(isTV(val)); - return val->as(); -} - bool hasBlockSync(const Expr* expr, const ThreadPredicateMap& pred_map) { - if (!isTVOp(expr)) { + if (!isTvOp(expr)) { return false; } - auto tv = getTVOutput(expr); + if (!(isReductionOp(expr) || expr->isA() || + expr->isA())) { + return false; + } - if ((expr->isA() || expr->isA()) && - (tv->hasBlockReduction() || tv->hasGridReduction())) { + // GroupedReductionOp can have multiple output TVs, but they must be + // parallelized in the same way, so just checking one of them is enough. + auto tv = getTvOutput(expr); + + if (tv->hasBlockReduction() || tv->hasGridReduction()) { return true; } else if (expr->isA()) { const ParallelTypeBitmap pt_map = @@ -222,64 +185,23 @@ bool hasBlockSync(const Expr* expr, const ThreadPredicateMap& pred_map) { return false; } -bool hasBlockSync(const kir::Expr* expr, const ThreadPredicateMap& pred_map) { - if (expr->isA() || expr->isA() || - expr->isA() || expr->isA() || - expr->isA() || expr->isA()) { - auto fuser_tv = getTVOutput(expr)->fuserTv(); - auto fuser_expr = fuser_tv->definition(); - TORCH_INTERNAL_ASSERT(fuser_expr != nullptr); - return hasBlockSync(fuser_expr, pred_map); - } - - return false; -} - -kir::Expr* applyReplacements( - const std::unordered_map& expr_replacement_map, - kir::Expr* expr) { - auto handle_scope = [&](kir::Scope& scope) { - for (const auto i : c10::irange(scope.size())) { - scope[i] = applyReplacements(expr_replacement_map, scope[i]); - } - }; - - const auto it = expr_replacement_map.find(expr); - if (it != expr_replacement_map.end()) { - return it->second; - } else { - if (auto for_loop = dynamic_cast(expr)) { - handle_scope(for_loop->body()); - } else if (auto ite = dynamic_cast(expr)) { - handle_scope(ite->thenBody()); - handle_scope(ite->elseBody()); - } - return expr; - } -} - c10::optional getMaybeWarpReductionDim( - const kir::ReductionOp* node) { - auto kir_tv = ir_utils::getTVOutput(node); - if (!kir_tv) { + const Val* output, + const Val* input) { + auto tv_out = getTv(output); + if (tv_out == nullptr) { return c10::nullopt; } - auto fuser_reduction = kir_tv->fuserTv()->definition()->as(); - return getMaybeWarpReductionDim(fuser_reduction); -} - -c10::optional getMaybeWarpReductionDim(const ReductionOp* node) { - auto fuser_tv_out = node->out()->as(); - auto fuser_tv_in = node->in()->as(); 
+ auto tv_in = getTv(input); // only support reducing to registers for now. - if (fuser_tv_in->getMemoryType() != MemoryType::Local || - fuser_tv_out->getMemoryType() != MemoryType::Local) { + if (tv_in->getMemoryType() != MemoryType::Local || + tv_out->getMemoryType() != MemoryType::Local) { return c10::nullopt; } IterDomain* reduction_on_xdim = nullptr; - for (auto id : fuser_tv_out->domain()->domain()) { + for (auto id : tv_out->domain()->domain()) { // Currently warp reduction only allows // serial and block.x parallel reductions if (id->isReduction() && id->isParallelized()) { @@ -302,7 +224,7 @@ c10::optional getMaybeWarpReductionDim(const ReductionOp* node) { return c10::optional(reduction_on_xdim); } - if (reduction_on_xdim->extent()->isConstScalar()) { + if (reduction_on_xdim->extent()->isConst()) { auto extent_value = reduction_on_xdim->extent()->getInt().value(); if (extent_value % at::cuda::warp_size() == 0) { return c10::optional(reduction_on_xdim); @@ -329,54 +251,98 @@ bool derivedFromRootCAAxes(const TensorView* tv, IterDomain* axis) { }); } -std::unordered_map getParallelDomains( - kir::Val* val) { - kir::TensorView* kir_tv = nullptr; - if (val->isA()) { - kir_tv = val->as(); +std::unordered_map getParallelDomains( + const Val* val) { + const TensorView* tv = nullptr; + if (val->isA()) { + tv = val->as(); } else if (val->isA()) { - kir_tv = val->as()->view(); + tv = val->as()->view(); } else { TORCH_INTERNAL_ASSERT( false, "Provided val is not TensorIndex or TensorView."); } - std::unordered_map parallel_domains; - for (auto d : kir_tv->domain()->domain()) { + std::unordered_map parallel_domains; + for (auto d : tv->domain()->domain()) { if (d->isThread()) { - parallel_domains.insert(std::make_pair(d->parallelType(), d)); + parallel_domains.insert(std::make_pair(d->getParallelType(), d)); } } return parallel_domains; } +kir::Allocate* allocGlobalBufferForGridComm( + Val* buffer_size, + DataType dtype, + bool zero_init) { + const std::vector new_buffer_ids = { + IrBuilder::create( + GpuLower::current()->kernel()->zeroVal(), buffer_size)}; + const auto buffer_domain = IrBuilder::create(new_buffer_ids); + const auto buffer_tv = + IrBuilder::create(buffer_domain, dtype, MemoryType::Global); + return IrBuilder::create( + buffer_tv, buffer_tv->getMemoryType(), nullptr, zero_init); +} + } // namespace ir_utils namespace loop_utils { -// TODO: Clean this up, Naoya added a mechanism we should be able to reuse. -std::pair getAllocPoint( +BasicAllocInfo getAllocInformation( const TensorView* tv, - const std::vector& loops, + const std::vector& for_loops, const std::unordered_map& id_map, bool use_id_map) { - const auto gpu_lower = GpuLower::current(); + BasicAllocInfo info; + auto gpu_lower = GpuLower::current(); - // If in global memory, it can be all the way outside the loops. - if (tv->getMemoryType() == MemoryType::Global) { - return {nullptr, 0}; - } + bool outer_alloc_found = false; - // Figure out where we want to place alloc/reduction initialization. We want - // outside an unroll loop, or inside our computeAt point. - kir::ForLoop* alloc_loop = nullptr; + for (auto fl : for_loops) { + if (info.alloc_pos == tv->getComputeAtPosition()) { + break; + } + + if (tv->axis(info.alloc_pos)->isReduction()) { + const auto outputs = FusionGuard::getCurFusion()->getTerminatingOutputs(); + TORCH_INTERNAL_ASSERT( + std::find(outputs.begin(), outputs.end(), tv) != outputs.end(), + "Invalid computeAt of T", + tv->name(), + ". 
A reducation axis is detected outside computeAt point even though it is not an output tensor."); + break; + } + + auto fl_id = fl->iter_domain(); - auto loops_it = loops.begin(); - // Look at each axis individually in out's domain - for (const auto tv_i : c10::irange((int64_t)tv->getComputeAtPosition())) { - // Grab the axis ID + if (fl_id->getParallelType() == ParallelType::Unroll) { + break; + } + + // Shared memory must be allocated outside of unswitched + // domains. See issue #1133. + if (fl_id->getParallelType() == ParallelType::Unswitch && + tv->getMemoryType() == MemoryType::Shared) { + outer_alloc_found = true; + } + + // Assume global memory is allocated at outer most scope. + if (tv->getMemoryType() == MemoryType::Global) { + outer_alloc_found = true; + } + + // Allocation of a double buffered tensor is placed outside its + // double buffer axis. + if (tv->isDoubleBuffered() && + tv->axis(info.alloc_pos) == + gpu_lower->doubleBufferInfo().getDoubleBufferAxis(tv)) { + outer_alloc_found = true; + } + + auto local_id = tv->axis(info.alloc_pos); - auto local_id = tv->axis(tv_i); if (use_id_map) { auto id_it = id_map.find(local_id); if (id_it != id_map.end()) { @@ -384,91 +350,46 @@ std::pair getAllocPoint( } } - if (gpu_lower->trivialReductionInfo().isDerivedFromRoot(local_id)) { - continue; + if (GpuLower::current()->caMap()->areMapped( + local_id, fl_id, IdMappingMode::PERMISSIVE)) { + info.alloc_pos++; } - auto lowered_local_id = - gpu_lower->lowerValue(local_id)->as(); - loops_it = std::find_if( - loops_it, loops.end(), [&lowered_local_id](const auto& loop) { - return GpuLower::current()->caLoopMap().areMapped( - lowered_local_id, loop->iter_domain()) || - loop->iter_domain()->parallelType() == ParallelType::Unroll; - }); + info.init_for_loop = fl; - TORCH_INTERNAL_ASSERT( - loops_it != loops.end(), - "Could not find all required axes for indexing when trying to index into ", - tv); - if ((*loops_it)->iter_domain()->parallelType() == ParallelType::Unroll) { - return {alloc_loop, tv_i}; + if (!outer_alloc_found) { + info.alloc_for_loop = fl; } - - alloc_loop = *loops_it; - ++loops_it; } - return {alloc_loop, (int64_t)tv->getComputeAtPosition()}; -} - -std::pair getAllocPoint( - const TensorView* tv, - const std::vector& loops) { - return getAllocPoint(tv, loops, {}, false); + return info; } } // namespace loop_utils namespace { -class ReplaceExprInput : public kir::MutableIrVisitor { +class ReplaceExprInput : private kir::ExprMutator { public: - static kir::Expr* replace( - kir::Expr* expr, - const std::unordered_map& replacement_map) { - ReplaceExprInput replacer(expr, replacement_map); - TORCH_INTERNAL_ASSERT(expr != nullptr); - expr->accept(&replacer); - TORCH_INTERNAL_ASSERT(replacer.replaced_expr_ != nullptr); - auto ret_expr = replacer.replaced_expr_; - - // Copy predicates if the original expr is predicated - if (ret_expr != expr) { - ret_expr->setPredicate(expr->predicate()); - ret_expr->setWritePredicate(expr->writePredicate()); - } - return ret_expr; - } - - static std::vector replace( - const std::vector& scope, - const std::unordered_map& replacement_map) { - std::vector ret_expr; - ret_expr.reserve(scope.size()); - - for (auto expr : scope) { - ret_expr.push_back(replace(expr, replacement_map)); - } - - return ret_expr; + static std::vector replace( + const std::vector& exprs, + const std::unordered_map& replacement_map) { + ReplaceExprInput replacer(replacement_map); + replacer.traverseAndInsert(exprs); + return replacer.exprs_; } private: - ReplaceExprInput( - 
kir::Expr* expr, - const std::unordered_map& replacement_map) - : gpu_lower_(GpuLower::current()), - ir_builder_(gpu_lower_->kernel()), - replacement_map_(replacement_map) { - replaced_expr_ = expr; - } + ReplaceExprInput(const std::unordered_map& replacement_map) + : replacement_map_(replacement_map) {} + + using kir::ExprMutator::handle; - c10::optional> - getMaybeInputReplacementMap(kir::Expr* expr) { + c10::optional> getMaybeInputReplacementMap( + Expr* expr) { bool need_replacement = false; - std::unordered_map replaced_val; + std::unordered_map replaced_val; for (auto in : expr->inputs()) { auto replace_it = replacement_map_.find(in); if (replace_it != replacement_map_.end()) { @@ -479,98 +400,103 @@ class ReplaceExprInput : public kir::MutableIrVisitor { } } if (need_replacement) { - return c10::optional>( - replaced_val); + return c10::optional>(replaced_val); } else { return c10::nullopt; } } - // IR visitor interface - void visit(kir::ForLoop* for_loop) final { - auto new_for_loop = ir_builder_.create(for_loop); - - auto replaced_loop_body = - replace(for_loop->body().exprs(), replacement_map_); - - for (auto new_expr : replaced_loop_body) { - new_for_loop->body().push_back(new_expr); - } - replaced_expr_ = new_for_loop; + // Copy predicates and register expression replacement + void registerReplaceWithPredicate(Expr* old_expr, Expr* new_expr) { + new_expr->setPredicate(old_expr->predicate()); + new_expr->setWritePredicate(old_expr->writePredicate()); + registerReplace(old_expr, new_expr); } - void visit(kir::IfThenElse* ite) final { - auto new_ite = ir_builder_.create(ite->predicate()); - auto replaced_then_body = - replace(ite->thenBody().exprs(), replacement_map_); - for (auto new_expr : replaced_then_body) { - new_ite->thenBody().push_back(new_expr); - } - if (ite->hasElse()) { - auto replaced_else_body = - replace(ite->elseBody().exprs(), replacement_map_); - for (auto new_expr : replaced_else_body) { - new_ite->elseBody().push_back(new_expr); - } - } - replaced_expr_ = new_ite; - } - - void visit(kir::UnaryOp* node) final { + void handle(UnaryOp* node) final { auto replaced_inputs = getMaybeInputReplacementMap(node); if (replaced_inputs.has_value()) { - replaced_expr_ = ir_builder_.create( - node->operation(), + auto replacement = IrBuilder::create( + node->getUnaryOpType(), node->out(), replaced_inputs.value().at(node->in())); + registerReplaceWithPredicate(node, replacement); } } - void visit(kir::BinaryOp* node) final { + + void handle(BinaryOp* node) final { auto replaced_inputs = getMaybeInputReplacementMap(node); if (replaced_inputs.has_value()) { - replaced_expr_ = ir_builder_.create( - node->operation(), + auto replacement = IrBuilder::create( + node->getBinaryOpType(), node->out(), replaced_inputs.value().at(node->lhs()), replaced_inputs.value().at(node->rhs())); + registerReplaceWithPredicate(node, replacement); } } - void visit(kir::TernaryOp* node) final { + void handle(TernaryOp* node) final { auto replaced_inputs = getMaybeInputReplacementMap(node); if (replaced_inputs.has_value()) { - replaced_expr_ = ir_builder_.create( - node->operation(), + auto replacement = IrBuilder::create( + node->getTernaryOpType(), node->out(), replaced_inputs.value().at(node->in1()), replaced_inputs.value().at(node->in2()), replaced_inputs.value().at(node->in3())); + registerReplaceWithPredicate(node, replacement); } } - void visit(kir::ReductionOp* node) final { + void handle(ReductionOp* node) final { auto replaced_inputs = getMaybeInputReplacementMap(node); if 
(replaced_inputs.has_value()) { - replaced_expr_ = ir_builder_.create( - node->operation(), + auto replacement = IrBuilder::create( + node->getReductionOpType(), node->init(), node->out(), - replaced_inputs.value().at(node->in())); + replaced_inputs.value().at(node->in()), + node->isAllreduce()); + registerReplaceWithPredicate(node, replacement); } } - void visit(kir::BroadcastOp* node) final { + void handle(GroupedReductionOp* node) final { auto replaced_inputs = getMaybeInputReplacementMap(node); if (replaced_inputs.has_value()) { - replaced_expr_ = ir_builder_.create( - node->out(), replaced_inputs.value().at(node->in())); + const auto& map = replaced_inputs.value(); + auto inputs = node->inputs(); + for (auto& input : inputs) { + auto it = map.find(input); + if (it != map.end()) { + input = it->second; + } + } + auto replacement = IrBuilder::create( + node->getReductionOpTypes(), + node->initVals(), + node->outputs(), + inputs, + node->isAllreduce()); + registerReplaceWithPredicate(node, replacement); + } + } + void handle(BroadcastOp* node) final { + auto replaced_inputs = getMaybeInputReplacementMap(node); + if (replaced_inputs.has_value()) { + auto replacement = IrBuilder::create( + node->out(), + replaced_inputs.value().at(node->in()), + node->getBroadcastDimFlags()); + registerReplaceWithPredicate(node, replacement); } } - void visit(kir::WelfordOp* node) final { + void handle(WelfordOp* node) final { auto replaced_inputs = getMaybeInputReplacementMap(node); if (replaced_inputs.has_value()) { - replaced_expr_ = ir_builder_.create( + auto replacement = IrBuilder::create( node->outAvg(), node->outVar(), node->outN(), @@ -580,24 +506,44 @@ class ReplaceExprInput : public kir::MutableIrVisitor { replaced_inputs.value().at(node->inAvg()), replaced_inputs.value().at(node->inVar()), replaced_inputs.value().at(node->inN())); + registerReplaceWithPredicate(node, replacement); + } + } + + void handle(MmaOp* node) final { + auto replaced_inputs = getMaybeInputReplacementMap(node); + if (replaced_inputs.has_value()) { + auto replacement = IrBuilder::create( + node->out(), + replaced_inputs.value().at(node->inA()), + replaced_inputs.value().at(node->inB()), + node->init(), + node->options()); + registerReplaceWithPredicate(node, replacement); } } private: - GpuLower* gpu_lower_; - kir::IrBuilder ir_builder_; - kir::Expr* replaced_expr_ = nullptr; - const std::unordered_map& replacement_map_; + const std::unordered_map& replacement_map_; }; } // namespace -std::vector replaceInputsInExpr( - const std::vector& exprs, - const std::unordered_map& replacement_map) { +std::vector replaceInputsInExpr( + const std::vector& exprs, + const std::unordered_map& replacement_map) { return ReplaceExprInput::replace(exprs, replacement_map); } +bool isTrivialIterDomain(IterDomain* id) { + auto pt = id->getParallelType(); + return id->isReduction() || id->isBroadcast() || id->isStride() || + (id->extent()->isOneInt() && id->start()->isZeroInt()) || + pt == ParallelType::Vectorize || + (isParallelTypeThread(pt) && + !GpuLower::current()->haloInfo().hasHaloWidth(id)); +} + } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/lower_utils.h b/torch/csrc/jit/codegen/cuda/lower_utils.h index 1c8a0df5cd79..50cce7d96b9e 100644 --- a/torch/csrc/jit/codegen/cuda/lower_utils.h +++ b/torch/csrc/jit/codegen/cuda/lower_utils.h @@ -1,8 +1,9 @@ #pragma once -#include +#include +#include #include #include #include @@ -19,27 +20,15 @@ namespace cuda { class ThreadPredicateMap; 
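// Illustrative sketch, not from the patch. The ReplaceExprInput pass defined
// above rewrites each expression whose inputs appear in a replacement map via
// getMaybeInputReplacementMap. The handlers above call .at() on every input of
// the expression, which suggests unreplaced inputs map to themselves; that
// reading is assumed in the toy helper below, and all names are stand-ins
// rather than nvfuser APIs.

#include <optional>
#include <unordered_map>
#include <vector>

struct ToyVal {};

// Returns a complete input -> value map when at least one input has a
// replacement, std::nullopt when the expression can be left untouched.
std::optional<std::unordered_map<ToyVal*, ToyVal*>> maybeInputReplacements(
    const std::vector<ToyVal*>& inputs,
    const std::unordered_map<ToyVal*, ToyVal*>& replacement_map) {
  bool need_replacement = false;
  std::unordered_map<ToyVal*, ToyVal*> replaced;
  for (ToyVal* in : inputs) {
    auto it = replacement_map.find(in);
    if (it != replacement_map.end()) {
      need_replacement = true;
      replaced.emplace(in, it->second);
    } else {
      // Identity entry so a caller can look up any input, replaced or not.
      replaced.emplace(in, in);
    }
  }
  if (!need_replacement) {
    return std::nullopt;
  }
  return replaced;
}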
-using IterDomainMap = std::unordered_map; +using IterDomainMap = std::unordered_map; namespace scope_utils { -//! Returns the list of nesting loops starting at `scope` -// Primarily used in indexing, maybe could be moved there -std::vector getLoops(kir::Expr* scope); - -//! Insert expr in scope before ref -//! -//! \warning for kir::IfThenElse we implicitly insert in the "then" branch! -//! -void insertBefore(kir::Expr* scope, kir::Expr* ref, kir::Expr* expr); - //! Create an **empty** Forloop and copy the metadata. -kir::ForLoop* cloneForLoop(kir::IrBuilder& ir_builder, kir::ForLoop* for_loop); +kir::ForLoop* cloneForLoop(kir::ForLoop* for_loop); //! Create an **empty** IfThenElse and copy the metadata. -kir::IfThenElse* cloneIfThenElse( - kir::IrBuilder& ir_builder, - kir::IfThenElse* ite); +kir::IfThenElse* cloneIfThenElse(kir::IfThenElse* ite); } // namespace scope_utils @@ -74,107 +63,157 @@ std::vector iterDomainInputsOfOrderedAs( const std::vector& of, const std::vector& order); +// Returns if Val is a TensorView or TensorIndex bool isTV(const Val* const); -TORCH_CUDA_CU_API bool isTVOp(const Expr*); - -bool isTVOp(const kir::Expr* expr); - -TensorView* getTVOutput(const Expr*); -kir::TensorView* getTVOutput(const kir::Expr*); - -bool isScalarOp(const Expr*); - -// TODO(kir): remove -Expr* asExpr(Statement*); +// Returns if Expr is a TensorView or TensorIndex Expr. +TORCH_CUDA_CU_API bool isTvOp(const Expr*); -// TODO(kir): Remove in favor of ->as() -TensorView* asTV(Val*); +// Returns the first output of Expr that is a TensorView +TensorView* getTvOutput(const Expr*); -//! Get kir::TensorView potentially via kir::TensorIndex. Returns nullptr if -//! cast fails. -kir::TensorView* getTv(kir::Val*); - -//! Get only kir::TensorView potentially via kir::TensorIndex. -std::vector getTvs(const std::vector& vals); +// Returns if Expr is a reduction op +TORCH_CUDA_CU_API bool isReductionOp(const Expr*); -//! Get kir::TensorView potentially via kir::TensorIndex. Error if cast fails. -kir::TensorView* asTv(kir::Val*); - -//! Get kir::TensorView potentially via kir::TensorIndex. Error if cast fails. -std::vector asTvs(const std::vector& vals); +// Returns if Expr is a reduction op with TensorView or TensorIndex +TORCH_CUDA_CU_API bool isReductionTvOp(const Expr*); bool hasBlockSync(const Expr* expr, const ThreadPredicateMap& pred_map); -bool hasBlockSync(const kir::Expr* expr, const ThreadPredicateMap& pred_map); -// expr_replacement_map maps an expression to its replacement. -// -// The applyReplacement function serves two purposes. -// -// 1. If expr is found in expr_replacement_map, return the value for expr key. -// Otherwise, return the original expression. -// -// 2. If a replacement is not found and the expression is a ForLoop or an -// IfThenElse, it modifies the expressions in its scope by running the -// handle_scope function -// -// The handle_scope function iterates over the expressions in the scope. -// For each expression, it updates the expression the value returned by -// applyReplacement. -kir::Expr* applyReplacements( - const std::unordered_map& expr_replacement_map, - kir::Expr* expr); - -//! Returns the Fuser iterdomain that maps to the thread dimension grouped +//! Returns the iterdomain that maps to the thread dimension grouped //! to warps. Returns nullopt if the reduction is not to be lowered to //! a warp reduction. 
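// Illustrative sketch, not from the patch: one eligibility condition visible
// in the getMaybeWarpReductionDim implementation earlier in this diff is that
// a threadIdx.x reduction domain with a constant extent qualifies only when
// that extent is a multiple of the warp size. Restated over plain integers;
// the 32-thread default is an assumption of the example, while the real code
// queries at::cuda::warp_size().
inline bool constantExtentAllowsWarpReduction(
    long long extent_value,
    int warp_size = 32) {
  return extent_value % warp_size == 0;
}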
c10::optional getMaybeWarpReductionDim( - const kir::ReductionOp* node); + const Val* output, + const Val* input); -c10::optional getMaybeWarpReductionDim(const ReductionOp* node); +bool isScalarOp(const Expr*); + +//! Get TensorView potentially via kir::TensorIndex. Returns nullptr if +//! cast fails. +TensorView* getTv(Val*); +const TensorView* getTv(const Val*); + +//! Get only TensorView potentially via kir::TensorIndex. +std::vector getTvs(const std::vector& vals); //! Return true if axis is derived from a root axis that is an input //! to a CA leaf axis. bool derivedFromRootCAAxes(const TensorView* tv, IterDomain* axis); -std::unordered_map getParallelDomains( - kir::Val* val); +std::unordered_map getParallelDomains( + const Val* val); + +// Allocate global buffer for a grid communication calls, i.e. grid reduce, grid +// welford reduce, grid broadcast. +kir::Allocate* allocGlobalBufferForGridComm( + Val* buffer_size, + DataType dtype, + bool zero_init); } // namespace ir_utils namespace loop_utils { -// I wanted to make the tv's in these util functions constant, but that started -// a long const-ness project going into TensorView (making functions const -// there) then into lower_loops where we sort exprs. -// TODO: We should fix this when we have some time. - -// Figure out which loop the allocation needs to be in. Returns nullptr if -// outside the first loop in loops. Also find out which index in tv the -// first dimension that needs to be allocated is. Meaning we need to allocate -// that local axis and above. -// TODO: Only remaining use of this is in index compute, remove use from there, -// or refactor and use in lower_allocation -std::pair getAllocPoint( - const TensorView* tv, - const std::vector& loops, - const std::unordered_map& id_map, - bool use_id_map); +struct BasicAllocInfo { + // The for loop that the initialization of this allocation must be + // placed in, nullptr if not within a loop + kir::ForLoop* init_for_loop = nullptr; + + // Keep track of the actual allocation loop. This can be different + // from init_for_loop only with unswitched shared memory allocations, + // which are moved outer loops to avoid duplicated allocations. This means + // that the alloc position may be outside what's expected. Most applications + // outside lower_allocation is likely looking for init_for_loop which is + // more directly related to how large an allocation is and how it's used. + // (see issue #1133). + kir::ForLoop* alloc_for_loop = nullptr; + + // The allocation position relative to buffer IDs, it could be outside the + // compute at position if it's shared memory with a compute at inside an + // unswitch + size_t alloc_pos = 0; +}; -std::pair getAllocPoint( +// Fill the above allocation struct based on provided information. id_map is +// used if we're looking at a producer tensor but loops on a consumer tensor. +BasicAllocInfo getAllocInformation( const TensorView* tv, - const std::vector& loops); + const std::vector& loops, + const std::unordered_map& id_map = {}, + bool use_id_map = false); } // namespace loop_utils // Replace value pass on Kernel IR. 
-// Replace each use of any kir::Val* that apears in the given `replacement_map` +// Replace each use of any Val* that apears in the given `replacement_map` // Keeps the predicate carried by each expr // // Warning: Blindly replaces all use based on pointer // Warning: May invalidate indexing if replacing uses of allocated values -std::vector replaceInputsInExpr( - const std::vector& exprs, - const std::unordered_map& replacement_map); +std::vector replaceInputsInExpr( + const std::vector& exprs, + const std::unordered_map& replacement_map); + +// True if an IterDomain does not materialize a loop +bool isTrivialIterDomain(IterDomain* id); + +// Go through all expressions and compute a local ordering of loops. operator< +// is implemented based on the concrete_id_dependencies analysis done. If +// there's no dependency between two IDs then order doesn't mater, otherwise we +// can tell which is inner most by checking if there's any dependency +// relationships. +// +// Dependency relationships in concrete_id_dependencies has a "global" view in +// the fusion, so it can resolve ordering by only looking at id's and the +// dependency map. +// +// For example two expressions may have domains: [I0], [I1] Yet we +// won't know the ordering unless we see a domain with: [I0, I1]. This happened +// in advancedIndexing9 (also see AdvancedLowering6) test when merging T5 with +// the group containing T10 (cache of T5, which is post broadcasted output) and +// T6(pre broadcasted output). +// T5 had the domain [0, 1, 2, 3, 4] produce at 3 +// T6 had the domain [0, 3, 4] compute at 3 +// Merging [0, 1, 2] and [0, 3, 4] resulted in the domain [0, 3, 4, 1, 2] +// +// If ID's are not in filter, we don't care about their ordering and ignore +// them. This is because we're only focused on loops we will have to merge +// across groups. If the domain is not in a produce at position in the producer +// edges, or a compute at position in the consumer edges, the expressions we +// look at may not have a unique ordering. + +struct TORCH_CUDA_CU_API IterDomainDependencySorter { + IterDomainDependencySorter( + const std::unordered_map>& + concrete_id_dependencies, + const std::unique_ptr& compute_at_map) + : concrete_id_dependencies_(concrete_id_dependencies), + compute_at_map_(compute_at_map) {} + + // Return true if id0 should be before id1 + // Orders such that if x maps to {y}, x comes before y in final ordering. 
+ inline bool operator()(IterDomain* id0, IterDomain* id1) { + auto concrete_id_0 = + compute_at_map_->getConcreteMappedID(id0, IdMappingMode::LOOP); + auto concrete_id_1 = + compute_at_map_->getConcreteMappedID(id1, IdMappingMode::LOOP); + + if (concrete_id_dependencies_.find(concrete_id_0) != + concrete_id_dependencies_.end()) { + const auto& dependencies_0 = concrete_id_dependencies_.at(concrete_id_0); + // if id0 depends on id1 it means id1 is inside id0, so id0 < id1 + if (dependencies_0.count(concrete_id_1)) { + return true; + } + } + + return false; + } + + const std::unordered_map>& + concrete_id_dependencies_; + const std::unique_ptr& compute_at_map_; +}; } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/lower_validation.cpp b/torch/csrc/jit/codegen/cuda/lower_validation.cpp index 0579e44dcd6b..241e45f3eaaa 100644 --- a/torch/csrc/jit/codegen/cuda/lower_validation.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_validation.cpp @@ -1,16 +1,18 @@ #include +#include #include #include #include #include #include -#include #include #include +#include #include #include +#include #include namespace torch { @@ -20,23 +22,80 @@ namespace cuda { namespace { -//! A parallel type validation pass to make sure all the outputs of -//! welford ops are parallelized the same way. Will infer and modify serial -//! parallel types if other output/s are parallelized, so that -//! user wouldn't have to specify the same parallelization -//! 3 times. Will throw if conflicts are detected, i.e. -//! TIDx vs BIDx etc. -class ValidateParallelType : public IterVisitor { +//! Validate multiple output tensors of the same expression, i.e., +//! siblings, have valid domains and parallel types. Since siblings +//! are placed in the same loop nest, they must be parallelized the +//! same way. Will infer and modify serial parallel types if other +//! output/s are parallelized, so that user wouldn't have to specify +//! the same parallelization 3 times. Will throw if conflicts are +//! detected, i.e. TIDx vs BIDx etc. +class ValidateSiblings : public IterVisitor { public: static void validate(Fusion* fusion) { - ValidateParallelType VPT; - VPT.traverse(fusion); + ValidateSiblings validator; + validator.traverse(fusion); } private: using IterVisitor::handle; + + void handle(Expr* expr) final { + if (!ir_utils::isTvOp(expr) || expr->outputs().size() < 2) { + IterVisitor::handle(expr); + return; + } + + auto ref_output = expr->outputs().at(0)->as(); + auto ref_ndims = ref_output->nDims(); + const auto& ref_root = ref_output->getRootDomain(); + std::unordered_map id_map; + + for (const auto sibling : + ir_utils::filterByType(expr->outputs())) { + if (ref_output == sibling) { + continue; + } + + TORCH_INTERNAL_ASSERT( + sibling->nDims() == ref_ndims, + "Mismatched dimensionality detected. Expr: ", + expr->toString(), + "Ref output: ", + ref_output->toString(), + ". Sibling: ", + sibling->toString()); + + for (const auto i : c10::irange(ref_ndims)) { + validateParallelTypes(ref_output->axis(i), sibling->axis(i)); + } + + for (const auto i : c10::irange(ref_root.size())) { + id_map[ref_root[i]] = sibling->getRootDomain().at(i); + } + + BestEffortReplay replay( + sibling->domain()->domain(), ref_output->domain()->domain(), id_map); + for (const auto i : c10::irange(ref_ndims)) { + auto it = replay.getReplay().find(ref_output->axis(i)); + TORCH_INTERNAL_ASSERT( + it != replay.getReplay().end(), + "Matching sibling ID not found. 
Expr: ", + expr->toString(), + "Ref ID: ", + ref_output->axis(i)->toString()); + auto sibling_id = it->second; + TORCH_INTERNAL_ASSERT( + sibling->axis(i) == sibling_id, + "Invalid matching sinbling ID detected. Expr: ", + expr->toString(), + "Sibling ID: ", + sibling_id->toString()); + } + } + } + // Parallelize id1 and id0 consistently if one is serial and the other isn't - void convertIterDomain(IterDomain* id0, IterDomain* id1) { + void validateParallelTypes(IterDomain* id0, IterDomain* id1) { const auto ptype0 = id0->getParallelType(); const auto ptype1 = id1->getParallelType(); @@ -64,20 +123,6 @@ class ValidateParallelType : public IterVisitor { } } } - - void handle(WelfordOp* wop) override { - auto out_avg = wop->outAvg()->as(); - auto out_var = wop->outVar()->as(); - auto out_n = wop->outN()->as(); - TORCH_INTERNAL_ASSERT(out_avg->nDims() == out_var->nDims()); - TORCH_INTERNAL_ASSERT(out_avg->nDims() == out_n->nDims()); - for (const auto i : c10::irange(out_avg->nDims())) { - // TODO: can be cleaner. - convertIterDomain(out_avg->axis(i), out_var->axis(i)); - convertIterDomain(out_avg->axis(i), out_n->axis(i)); - convertIterDomain(out_n->axis(i), out_var->axis(i)); - } - } }; // Make sure all IterDomains are only used for a unique @@ -151,7 +196,7 @@ void validateIr(Fusion* fusion) { } // Validate Parallelization - ValidateParallelType::validate(fusion); + ValidateSiblings::validate(fusion); validateIterDomainUsage(fusion); } @@ -261,6 +306,35 @@ class VectorizeValidator : public OptInDispatch { domains_.insert(m->inner()); } + // For the producer tensor, it's indexed first by transformed like + // the consumer. So, to find its contig merged domain, use the + // consumer TensorDomain with the producer contiguity info. + static std::vector mapProducerContiguity( + TensorView* producer_tv, + TensorView* consumer_tv) { + const auto c2p = PairwiseRootDomainMap(producer_tv, consumer_tv) + .mapConsumerToProducer( + consumer_tv->domain(), producer_tv->domain()); + + std::vector producer_contiguity; + + for (auto consumer_root_id : consumer_tv->getRootDomain()) { + auto producer_root_id = c2p.at(consumer_root_id); + auto producer_root_it = std::find( + producer_tv->getMaybeRFactorDomain().begin(), + producer_tv->getMaybeRFactorDomain().end(), + producer_root_id); + TORCH_INTERNAL_ASSERT( + producer_root_it != producer_tv->getMaybeRFactorDomain().end()); + auto producer_root_id_offset = std::distance( + producer_tv->getMaybeRFactorDomain().begin(), producer_root_it); + producer_contiguity.push_back( + producer_tv->domain()->contiguity().at(producer_root_id_offset)); + } + + return producer_contiguity; + } + private: std::unordered_set domains_; IterDomain* vectorized_id_ = nullptr; @@ -285,8 +359,10 @@ class VectorizeValidator : public OptInDispatch { } } - // If no vectorized id's found simply return; - if (v_id == nullptr) { + // If no vectorized ids found simply return. 
If vectorized access is + // broadcast, it won't generate an actual vector instruction, so can safely + // be ignore + if (v_id == nullptr || v_id->isBroadcast()) { return; } @@ -319,7 +395,10 @@ class VectorizeValidator : public OptInDispatch { vector_size, " however, vector sizes only upto and including 16 bytes are supported."); - auto replay_exprs = ExprSort::getExprs(fusion, {v_id}); + auto replay_exprs = DependencyCheck::getAllExprsBetween( + {tv->getMaybeRFactorDomain().begin(), + tv->getMaybeRFactorDomain().end()}, + {v_id}); VectorizeValidator validator(v_id); @@ -377,12 +456,54 @@ class VectorizeValidator : public OptInDispatch { "Vectorized dim has to be from a contiguous inner most position: ", tv, "\n"); + + // Save info required to lowering and runtime validation + auto consumer_word_size_it = + GpuLower::current()->vectorizedAccesses().find(tv); + if (consumer_word_size_it != + GpuLower::current()->vectorizedAccesses().end()) { + consumer_word_size_it->second = std::max( + (int)vector_size_optional.value(), consumer_word_size_it->second); + } else { + GpuLower::current()->vectorizedAccesses().emplace( + tv, (int)vector_size_optional.value()); + } + auto producer_tv = tv->definition()->inputs().at(0)->as(); + auto producer_word_size_it = + GpuLower::current()->vectorizedAccesses().find(producer_tv); + if (producer_word_size_it != + GpuLower::current()->vectorizedAccesses().end()) { + producer_word_size_it->second = std::max( + (int)vector_size_optional.value(), producer_word_size_it->second); + } else { + GpuLower::current()->vectorizedAccesses().emplace( + producer_tv, (int)vector_size_optional.value()); + } + + VectorizedSetInfo vectorized_set_info; + vectorized_set_info.consumer_tv = tv; + vectorized_set_info.producer_tv = producer_tv; + // Note that VectorizedSetInfo is about each instance of + // vectorized set operations, so the word size is the size of this + // specific vectorized set. + vectorized_set_info.word_size = (int)vector_size_optional.value(); + vectorized_set_info.vectorized_leaf_id = v_id; + vectorized_set_info.vectorized_root_id = validator.vectorized_id_; + // For aligned vectorize, the extent of a vectorized domain must + // be divisible by the vector word size. The domain is usually + // just one of the root domains, but can be a merged domain of + // contiguous domains. Those domains are saved in + // VectorizedSetInfo.contig_root_ids at the time of indexing. + GpuLower::current()->vectorizedSetInfo().emplace_back(vectorized_set_info); } }; } // namespace -void validateVectorize(Fusion* fusion) { +// Uses ContigIDs to find root contig domains that a vectorized domain +// depends on. As ContigIDs depends on HaloInfo, this must be done +// after HaloInfo is created. +void validateAndCollectVectorizeInfo(Fusion* fusion) { FUSER_PERF_SCOPE("GpuLower::Lower::validateVectorize"); FusionGuard fg(fusion); @@ -403,7 +524,8 @@ void validateVectorize(Fusion* fusion) { for (const auto i : c10::irange(tv->nDims())) { IterDomain* id = tv->axis(i); IterDomain* concrete_id = - GpuLower::current()->caParallelMap().getConcreteMappedID(id); + GpuLower::current()->caMap()->getConcreteMappedID( + id, IdMappingMode::LOOP); auto ptype = concrete_id->getParallelType(); @@ -444,6 +566,10 @@ void validateVectorize(Fusion* fusion) { "TensorView: ", tv); } + // Validate the vectorized domain maps to the innermost domain of + // tv. Note that we don't need to validate its producer tv as + // both Vectorize and MisalignedVectorize can only be used with + // UnaryOp::Set. 
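// Illustrative sketch, not from the patch: the surrounding validation states
// two numeric constraints on an aligned vectorized access, a 16-byte cap on
// the vector width and divisibility of the (possibly contig-merged)
// vectorized extent by the word size. The helper below just restates them
// over plain integers with stand-in names.

#include <cstdint>

bool vectorAccessIsValid(
    int64_t vector_word_size, // elements accessed per vector op
    int64_t element_size_bytes, // sizeof the tensor's data type
    int64_t vectorized_extent) { // extent of the contiguous vectorized domain
  const int64_t kMaxVectorBytes = 16; // e.g. a 128-bit load/store
  return vector_word_size * element_size_bytes <= kMaxVectorBytes &&
      vectorized_extent % vector_word_size == 0;
}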
if (has_vectorize_dim || has_misaligned_vectorize_dim) { VectorizeValidator::validate(tv); } @@ -452,164 +578,106 @@ void validateVectorize(Fusion* fusion) { namespace { -// Validate parallelization of a single tensor -void validateParallelizationOfTensor(TensorView* tv) { - // Each ParallelType can be used only once. - ParallelTypeBitmap pt_map; - for (size_t i = 0; i < tv->nDims(); ++i) { - auto axis = tv->axis(i); - auto ptype = axis->getParallelType(); - if (!isParallelTypeThread(ptype)) { - continue; - } +void fillVectorizedContigRootDomains( + const TensorView* tv, + const ContigIDs& contig_finder, + IterDomain* vectorized_root_id, + VectorizedSetInfo& info) { + const auto& root_dom = tv->getMaybeRFactorDomain(); + + // Find the root domains that are dependency of the merged contig + // domain. + auto consumer_indexed_it = + contig_finder.rootToIndexedID().find(vectorized_root_id); + TORCH_INTERNAL_ASSERT( + consumer_indexed_it != contig_finder.rootToIndexedID().end(), + "Contiguity information not found for root domain: ", + vectorized_root_id->toString()); + auto consumer_indexed_id = consumer_indexed_it->second; + + // Actual indexed root domains for this root domain. If + // contig merge is done, multiple root domains are included. + std::unordered_set indexed_root_ids; + + if (consumer_indexed_id == vectorized_root_id) { + // Indexed domain is equal to the root domain, meaning no contig + // merge is involved. + indexed_root_ids.insert(vectorized_root_id); + } else { + auto consumer_within_contig_it = + contig_finder.withinContigIDs().find(consumer_indexed_id); TORCH_INTERNAL_ASSERT( - !pt_map.get(ptype), - "Multiple use of ", - ptype, - " in tensor t", - tv->name(), - ": ", - tv); - pt_map.set(ptype); + consumer_within_contig_it != contig_finder.withinContigIDs().end()); + const auto& within_ids = consumer_within_contig_it->second; + std::copy_if( + root_dom.begin(), + root_dom.end(), + std::inserter(indexed_root_ids, indexed_root_ids.end()), + [&](IterDomain* root_id) { + return within_ids.find(root_id) != within_ids.end(); + }); } - // If this tensor is predicated by a paralel type, it should not be - // used to parallelize any domain of this tensor + // Store the contig merged root domains. If it is already set, pick + // the smaller one as it is used for validating divisibility of the + // merged extent. + if (info.contig_root_ids.empty() || + indexed_root_ids.size() < info.contig_root_ids.size()) { + info.contig_root_ids = indexed_root_ids; + } +} - const auto thread_pred = - GpuLower::current()->threadPredMap().getPredicateInfo(tv); +} // namespace - auto predicated_parallel_types = pt_map & thread_pred.limited_types; +void fillConsumerVectorizedContigRootDomains( + const TensorView* consumer_tv, + const ContigIDs& contig_finder) { + auto& info_vector = GpuLower::current()->vectorizedSetInfo(); + auto it = std::find_if( + info_vector.begin(), info_vector.end(), [&consumer_tv](auto& info) { + return info.consumer_tv == consumer_tv; + }); + if (it == info_vector.end()) { + return; + } - TORCH_INTERNAL_ASSERT( - predicated_parallel_types.none(), - "Invalid parallelization of tensor t", - tv->name(), - ". The tensor is parallelized with ", - predicated_parallel_types.toString(), - ", but it's invalid to use the types as the tensor is also predicated with them.", - ", thread prd: ", - thread_pred.limited_types.toString()); + VectorizedSetInfo& info = *it; + + // info.vectorized_root_id is validated at this point to be the + // last concrete root domain in consumer. 
+ auto consumer_root_id = info.vectorized_root_id; + + fillVectorizedContigRootDomains( + consumer_tv, contig_finder, consumer_root_id, info); } -} // namespace +void fillProducerVectorizedContigRootDomains( + const TensorView* producer_tv, + const TensorView* consumer_tv, + const std::unordered_map& c2p_map, + const ContigIDs& contig_finder) { + auto& info_vector = GpuLower::current()->vectorizedSetInfo(); + auto it = std::find_if( + info_vector.begin(), + info_vector.end(), + [&producer_tv, &consumer_tv](auto& info) { + return info.consumer_tv == consumer_tv && + info.producer_tv == producer_tv; + }); + if (it == info_vector.end()) { + return; + } -void validateParallelize(Fusion* fusion) { - FUSER_PERF_SCOPE("GpuLower::Lower::validateParallelize"); - FusionGuard fg(fusion); + VectorizedSetInfo& info = *it; - const auto& par_map = GpuLower::current()->caParallelMap(); - const auto& loop_map = GpuLower::current()->caLoopMap(); - const auto& pred_map = GpuLower::current()->threadPredMap(); + // info.vectorized_root_id is validated at this point to be the + // last concrete root domain in consumer. + auto consumer_root_id = info.vectorized_root_id; - auto exprs = ExprSort::getExprs(fusion); + auto root_id = c2p_map.at(consumer_root_id); - for (auto expr : exprs) { - if (!ir_utils::isTVOp(expr)) { - continue; - } - // Validate parallelization of each consumer by itself - for (auto consumer : ir_utils::filterByType(expr->outputs())) { - validateParallelizationOfTensor(consumer); - } - // Validate parallelization between a producer and a consumer - for (auto producer : ir_utils::filterByType(expr->inputs())) { - // Parallelization on input tensors have no effect. - if (producer->isFusionInput()) { - continue; - } - const auto parallel_bcast_doms = - pred_map.getParallelBroadcastDomains(producer); - for (const auto i : c10::irange(producer->nDims())) { - // If a producer axis is threaded, either with threadIdx or - // blockIdx, there must be a mapped consumer axis with the - // same ParallelType. An exception is when the producer is - // allocated on shared memory and its parallelized with - // threadIdx. In that case, there is no parallelization - // constraint on the consumer as syncthreads will be inserted - // when necessary. - auto producer_axis = producer->axis(i); - auto producer_ptype = - par_map.getConcreteMappedID(producer_axis)->getParallelType(); - if (!isParallelTypeThread(producer_ptype)) { - continue; - } - // When the producer axis is a broadcast, it is not really - // parallelized unless thread-predicated - if (producer_axis->isBroadcast() && - !parallel_bcast_doms.get(producer_ptype)) { - continue; - } - // No constraint on the consumer tensor when the producer - // axis is parallelized with threadIdx and allocates on - // shared memory - if (isParallelTypeThreadDim(producer_ptype) && - producer->getMemoryType() == MemoryType::Shared) { - continue; - } - // There should be also nothing to validate when the producer - // axis is reduction. - if (producer_axis->isReduction()) { - continue; - } - // There must be a consumer axis that uses the same indexing - // with the same parallel type as the producer axis. The loop - // map is used to to find such an axis. Broadcast forwarding - // does not cause any inconsistent parallelization as indexing - // takes care of the forwarding. 
- for (auto consumer : - ir_utils::filterByType(expr->outputs())) { - auto it = std::find_if( - consumer->domain()->domain().begin(), - consumer->domain()->domain().end(), - [&](IterDomain* consumer_axis) { - return loop_map.areMapped(producer_axis, consumer_axis); - }); - TORCH_INTERNAL_ASSERT( - it != consumer->domain()->domain().end(), - "Inconsistent parallelization found between TV", - producer->name(), - " (", - producer, - ") and TV", - consumer->name(), - "(", - consumer, - "). ", - "TV", - consumer->name(), - " does not have a matching axis for parallelized producer axis, ", - producer_axis, - ". CA Map: ", - loop_map.toString()); - auto consumer_axis = *it; - auto consumer_ptype = - par_map.getConcreteMappedID(consumer_axis)->getParallelType(); - TORCH_INTERNAL_ASSERT( - producer_ptype == consumer_ptype, - "Inconsistent parallelization found between TV", - producer->name(), - " (", - producer, - ") and TV", - consumer->name(), - "(", - consumer, - "). " - "Producer axis, ", - producer_axis, - " is parallelized with ", - stringifyThread(producer_ptype), - ", but the parallel type of its matching consumer axis, ", - consumer_axis, - " is ", - stringifyThread(consumer_ptype), - "."); - } - } - } - } + fillVectorizedContigRootDomains(producer_tv, contig_finder, root_id, info); } namespace { @@ -630,7 +698,7 @@ namespace { // each tensor that needs to be computed. std::unordered_map> getLiveRangeOffsets( Fusion* fusion) { - auto exprs = ExprSort::getExprs(fusion); + auto exprs = StmtSort::getExprs(fusion); std::unordered_map> map; @@ -760,7 +828,9 @@ void validatePartialSplit(Fusion* fusion) { auto range_info = getLiveRangeOffsets(fusion); for (auto tv : ir_utils::allTvs(fusion)) { - auto exprs = ir_utils::historyOf(tv); + auto exprs = StmtSort::getExprs( + tv->fusion(), + {tv->domain()->domain().begin(), tv->domain()->domain().end()}); for (auto split : ir_utils::filterByType(exprs)) { // When the start and stop offsets are not zero, make sure the // range defined by the split includes the required range to @@ -793,6 +863,95 @@ void validatePartialSplit(Fusion* fusion) { } } +namespace { + +//! Utility to make sure targeted gpu capability is +//! higher than provided major.minor. +void validateMinimumArch(int major, int minor) { + auto prop = at::cuda::getCurrentDeviceProperties(); + TORCH_INTERNAL_ASSERT(prop->major >= major); + if (prop->major == major) { + TORCH_INTERNAL_ASSERT(prop->minor >= minor); + } +} + +//! Validates that the operand and result tensors +//! of mma ops are swizzled and also validates +//! specialization of tidx as lane id. +void validateMmaTensors(MmaOp* mma) { + bool tidx_validated = false; + std::vector to_validate = { + mma->inA()->as(), + mma->inB()->as(), + mma->out()->as()}; + + for (auto tv : to_validate) { + for (auto id : tv->domain()->domain()) { + auto ptype = id->getParallelType(); + if (ptype == ParallelType::TIDx) { + TORCH_INTERNAL_ASSERT( + id->isMmaSwizzled(), + "TIDx for mma input/output must be set by WarpMmaSwizzler", + id, + tv); + if (!tidx_validated) { + // Check that TIDx is exact lane_id + const auto& paralel_dim_map = + GpuLower::current()->parallelDimensionMap(); + TORCH_INTERNAL_ASSERT( + paralel_dim_map.isExact(ptype) && + paralel_dim_map.get(ptype)->getInt().has_value() && + paralel_dim_map.get(ptype)->getInt().value() == + at::cuda::warp_size(), + "TIDx is reserved for lane id in mma kernels, and it needs to be exactly a warp"); + tidx_validated = true; + } + } + } + } + + // Note: this check will be relaxed in a follow up. 
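// Illustrative sketch, not from the patch: validateMinimumArch above asserts
// that the current device's compute capability is at least a required
// major.minor pair. As a standalone boolean, that comparison looks like the
// toy helper below (not an nvfuser API): strictly greater major always
// passes, equal major requires at least the given minor.
inline bool meetsMinimumArch(
    int device_major,
    int device_minor,
    int required_major,
    int required_minor) {
  if (device_major != required_major) {
    return device_major > required_major;
  }
  return device_minor >= required_minor;
}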
+ auto validate_operand_ids = [](const TensorView* tv) { + TORCH_INTERNAL_ASSERT( + std::all_of( + tv->domain()->domain().begin() + tv->getComputeAtPosition(), + tv->domain()->domain().end(), + [](IterDomain* id) { + return id->isMmaSwizzled() || + (id->isBroadcast() && + id->getParallelType() == ParallelType::Serial); + }), + "All id's on the right of CA pos needs to be mma-swizzled by WarpMmaSwizzler\n", + tv); + }; + + validate_operand_ids(mma->inA()->as()); + validate_operand_ids(mma->inB()->as()); +} + +} // namespace + +//! Validate data format and GPU arch compatibility of scheduled +//! mma operators on the fusion. +void validateMma(Fusion* fusion) { + auto exprs = StmtSort::getExprs(fusion); + + for (auto expr : exprs) { + if (auto mma = dynamic_cast(expr)) { + validateMmaTensors(mma); + + switch (mma->options().macro) { + case MmaOptions::MacroType::Volta_16_16_4: + validateMinimumArch(7, 0); + break; + default: + TORCH_INTERNAL_ASSERT(false, "validate mma: unsupported macro"); + break; + } + } + } +} + } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/lower_validation.h b/torch/csrc/jit/codegen/cuda/lower_validation.h index 89de85026ee7..d8c95d8d1f05 100644 --- a/torch/csrc/jit/codegen/cuda/lower_validation.h +++ b/torch/csrc/jit/codegen/cuda/lower_validation.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include @@ -9,18 +9,28 @@ namespace jit { namespace fuser { namespace cuda { -void validateIr(Fusion* fusion); +class ContigIDs; -void validateVectorize(Fusion* fusion); +void validateIr(Fusion* fusion); -//! Validates all tensors are consistently parallelized. Basically, -//! when a producer axis is threaded, either with threadIdx or -//! blockIdx, there must be a mapped consumer axis with the -//! same ParallelType with some exceptions. -//! -//! This function assumes Loop and Parallel ComputeAtMaps are already -//! built as they are used to validate consistency. -void validateParallelize(Fusion* fusion); +//! Validate vectorization and collect information on vectorization +//! used in code generation as well as runtime validation. +void validateAndCollectVectorizeInfo(Fusion* fusion); + +//! Find the contig root domains that a vectorized leaf domain +//! of a consumer TV depends on. Required for runtime validation. +void fillConsumerVectorizedContigRootDomains( + const TensorView* consumer_tv, + const ContigIDs& contig_finder); + +//! Find the contig root domains that a vectorized leaf domain +//! of a producer TV depends on. Required for runtime validation. +//! Producer must be transformed as consumer. +void fillProducerVectorizedContigRootDomains( + const TensorView* producer_tv, + const TensorView* consumer_tv, + const std::unordered_map& c2p_map, + const ContigIDs& contig_finder); //! Validates partial split expressions. Partial split only uses an //! inner subdomain specified by start and stop offsets, ignoring the @@ -30,6 +40,10 @@ void validateParallelize(Fusion* fusion); //! calculated that are necessary for output values. void validatePartialSplit(Fusion* fusion); +//! Validate data format and GPU arch compatibility of scheduled +//! mma operators on the fusion. 
+void validateMma(Fusion* fusion); + } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/lower_warp_reduce.cpp b/torch/csrc/jit/codegen/cuda/lower_warp_reduce.cpp index eaddf7faea32..1d87790c014f 100644 --- a/torch/csrc/jit/codegen/cuda/lower_warp_reduce.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_warp_reduce.cpp @@ -1,7 +1,7 @@ #include #include #include -#include +#include #include #include #include @@ -13,25 +13,63 @@ namespace cuda { namespace { +//! A helper class for EliminateDeadBroadcastAndAllocate. Eliminate +//! dead Allocate and Broadcast detected by EliminateDeadBroadcastAndAllocate. +class DeadTvEliminator : private kir::ExprMutator { + public: + static std::vector run( + const std::vector& exprs, + const std::unordered_set& dead_tvs) { + return DeadTvEliminator(exprs, dead_tvs).exprs_; + } + + private: + DeadTvEliminator( + const std::vector& exprs, + const std::unordered_set& dead_tvs) + : dead_tvs_(dead_tvs) { + traverseAndInsert(exprs); + } + + using kir::ExprMutator::handle; + + void handle(kir::Allocate* allocate) final { + if (auto buffer_tv = dynamic_cast(allocate->buffer())) { + if (dead_tvs_.count(buffer_tv)) { + registerRemove(allocate); + } + } + } + + void handle(BroadcastOp* broadcast) final { + if (auto out_ti = dynamic_cast(broadcast->out())) { + if (dead_tvs_.count(out_ti->view())) { + registerRemove(broadcast); + } + } + } + + private: + const std::unordered_set& dead_tvs_; +}; + //! A simple DCE for eliminating the //! parallel broadcasts that has been fused //! and their corresponding allocations class EliminateDeadBroadcastAndAllocate { public: - static std::vector run(const std::vector& exprs) { + static std::vector run(const std::vector& exprs) { EliminateDeadBroadcastAndAllocate dce(exprs); - return dce.result_exprs_; + return DeadTvEliminator::run(exprs, dce.dead_tvs_); } private: - EliminateDeadBroadcastAndAllocate(const std::vector& exprs) - : ir_builder_(GpuLower::current()->kernel()) { + EliminateDeadBroadcastAndAllocate(const std::vector& exprs) { findLiveTvs(exprs); findDeadTvs(); - eliminateDeadCode(exprs); } - void findLiveTvs(const std::vector& exprs) { + void findLiveTvs(const std::vector& exprs) { for (auto expr : exprs) { if (auto for_loop = dynamic_cast(expr)) { findLiveTvs(for_loop->body().exprs()); @@ -44,11 +82,10 @@ class EliminateDeadBroadcastAndAllocate { if (auto allocate = dynamic_cast(expr)) { if (allocate->memoryType() == MemoryType::Local) { - if (auto kir_tv = - dynamic_cast(allocate->buffer())) { + if (auto tv = dynamic_cast(allocate->buffer())) { // We know only tvs that we'd want to consider are broadcast outputs - if (kir_tv->fuserTv()->definition()->isA()) { - candidate_tv_set_.insert(kir_tv); + if (tv->definition()->isA()) { + candidate_tv_set_.insert(tv); } } } @@ -72,95 +109,10 @@ class EliminateDeadBroadcastAndAllocate { } } - void eliminateDeadCode(const std::vector& exprs) { - result_exprs_ = eliminateDeadCodeInScope(exprs); - } - - bool shouldEliminate(kir::Expr* expr) { - if (auto allocate = dynamic_cast(expr)) { - if (auto buffer_tv = dynamic_cast(allocate->buffer())) { - if (dead_tvs_.count(buffer_tv)) { - return true; - } - } - } else if (auto broadcast = dynamic_cast(expr)) { - if (auto out_ti = dynamic_cast(broadcast->out())) { - if (dead_tvs_.count(out_ti->view())) { - return true; - } - } - } - return false; - } - - //! Returns a new vector of exprs with dead exprs - //! eliminated. 
- std::vector eliminateDeadCodeInScope( - const std::vector& exprs) { - std::vector result_exprs; - - for (auto expr : exprs) { - auto result_expr = expr; - if (auto for_loop = dynamic_cast(expr)) { - result_expr = eliminateDeadCode(for_loop); - } else if (auto ite = dynamic_cast(expr)) { - result_expr = eliminateDeadCode(ite); - } else { - if (shouldEliminate(expr)) { - result_expr = nullptr; - } - } - - // Push the result expr if not eliminated - if (result_expr) { - result_exprs.push_back(result_expr); - } - } - - return result_exprs; - } - - kir::ForLoop* eliminateDeadCode(kir::ForLoop* for_loop) { - auto new_loop_body = eliminateDeadCodeInScope(for_loop->body().exprs()); - if (new_loop_body.empty()) { - return nullptr; - } - - // TODO: we will need a kernel_ir cloner to make this - // kind of logic re-usable. - auto new_loop = scope_utils::cloneForLoop(ir_builder_, for_loop); - - for (auto expr : new_loop_body) { - new_loop->body().push_back(expr); - } - return new_loop; - } - - kir::IfThenElse* eliminateDeadCode(kir::IfThenElse* ite) { - auto new_then_body = eliminateDeadCodeInScope(ite->thenBody().exprs()); - auto new_else_body = eliminateDeadCodeInScope(ite->elseBody().exprs()); - if (new_then_body.empty() && new_else_body.empty()) { - return nullptr; - } - - auto new_ite = scope_utils::cloneIfThenElse(ir_builder_, ite); - - for (auto expr : new_then_body) { - new_ite->thenBody().push_back(expr); - } - for (auto expr : new_else_body) { - new_ite->elseBody().push_back(expr); - } - return new_ite; - } - private: - std::unordered_set live_tvs_; - std::unordered_set dead_tvs_; - std::unordered_set candidate_tv_set_; - - std::vector result_exprs_; - kir::IrBuilder ir_builder_; + std::unordered_set live_tvs_; + std::unordered_set dead_tvs_; + std::unordered_set candidate_tv_set_; }; //! A pass to eliminate redundant parallel broadcasts that are consumers @@ -189,9 +141,9 @@ class EliminateDeadBroadcastAndAllocate { //! //! 3. EliminateDeadBroadcastAndAllocate removes the broadcast ops //! and corresponding allocations if they're un-used after step 2. -class FuseBroadcastWithWarpReduce { +class FuseBroadcastWithWarpReduce : private kir::IrVisitor { public: - static std::vector fuse(const std::vector& exprs) { + static std::vector fuse(const std::vector& exprs) { FuseBroadcastWithWarpReduce fuse_broadcast_map(exprs); const auto replaced_inputs = replaceInputsInExpr(exprs, fuse_broadcast_map.val_replacement_map_); @@ -199,70 +151,51 @@ class FuseBroadcastWithWarpReduce { } private: - FuseBroadcastWithWarpReduce(const std::vector& exprs) { + FuseBroadcastWithWarpReduce(const std::vector& exprs) { // open stack space for global scope - // The scope stack for kir_tv_to_allocate wouldn't be needed + // The scope stack for tv_to_allocate wouldn't be needed // if the allocations are guaranteed to be once and unique, // which can currently be assumed but this pass tries not // to rely on this assumption. 
- running_kir_tv_to_allocate_map_.emplace_back( - std::make_unique< - std::unordered_map>()); + running_tv_to_allocate_map_.emplace_back( + std::make_unique>()); running_visible_allocation_stack_.emplace_back( std::make_unique>()); - - for (auto expr : exprs) { - handle(expr); - } + kir::IrVisitor::handle(exprs); } - void handle(kir::Expr* expr) { - if (auto for_loop = dynamic_cast(expr)) { - handle(for_loop); - return; - } else if (auto ite = dynamic_cast(expr)) { - handle(ite); - return; - } - - // Process expr inputs if needs replacement - for (auto inp : expr->inputs()) { - if (auto input_ti = dynamic_cast(inp)) { - auto replace = findMaybeReplacedTensorIndex(input_ti); - if (replace.has_value()) { - val_replacement_map_[input_ti] = replace.value(); + void handle(Expr* expr) final { + if (ir_utils::isTvOp(expr)) { + // Process expr inputs if needs replacement + for (auto inp : expr->inputs()) { + if (auto input_ti = dynamic_cast(inp)) { + auto replace = findMaybeReplacedTensorIndex(input_ti); + if (replace.has_value()) { + val_replacement_map_[input_ti] = replace.value(); + } } } } - - // Handle reduction definitions - if (auto reduction = dynamic_cast(expr)) { - handle(reduction); - } else if (auto broadcast = dynamic_cast(expr)) { - handle(broadcast); - } else if (auto allocate = dynamic_cast(expr)) { - handle(allocate); - } + kir::IrVisitor::handle(expr); } - bool openLoopNestLevel(kir::IterDomain* id) { - if (id->isThread() || id->parallelType() == ParallelType::Unswitch) { + bool openLoopNestLevel(IterDomain* id) { + if (id->isThread() || id->getParallelType() == ParallelType::Unswitch) { return false; } - if (id->parallelType() == ParallelType::Serial || - id->parallelType() == ParallelType::Unroll) { + if (id->getParallelType() == ParallelType::Serial || + id->getParallelType() == ParallelType::Unroll) { return !id->isBroadcast(); } return true; } - void handle(kir::ForLoop* for_loop) { + void handle(kir::ForLoop* for_loop) final { // Keep track of visible reduction outputs bool open_nest_level = openLoopNestLevel(for_loop->iter_domain()); if (open_nest_level) { - running_kir_tv_to_allocate_map_.emplace_back( - std::make_unique< - std::unordered_map>()); + running_tv_to_allocate_map_.emplace_back( + std::make_unique>()); running_visible_allocation_stack_.emplace_back( std::make_unique>()); } @@ -270,12 +203,12 @@ class FuseBroadcastWithWarpReduce { handle(expr); } if (open_nest_level) { - running_kir_tv_to_allocate_map_.pop_back(); + running_tv_to_allocate_map_.pop_back(); running_visible_allocation_stack_.pop_back(); } } - void handle(kir::IfThenElse* ite) { + void handle(kir::IfThenElse* ite) final { running_visible_allocation_stack_.emplace_back( std::make_unique>()); for (auto expr : ite->thenBody().exprs()) { @@ -292,15 +225,14 @@ class FuseBroadcastWithWarpReduce { //! Place this allocate on the list of currently visible allocations, //! organized by loop nest level. - void handle(kir::Allocate* allocate) { + void handle(kir::Allocate* allocate) final { if (allocate->memoryType() != MemoryType::Local) { return; } - if (auto kir_tv = dynamic_cast(allocate->buffer())) { - auto fuser_tv = kir_tv->fuserTv(); - if (fuser_tv->definition()) { - if (fuser_tv->definition()->isA() || - fuser_tv->definition()->isA()) { + if (auto tv = dynamic_cast(allocate->buffer())) { + if (tv->definition()) { + if (tv->definition()->isA() || + tv->definition()->isA()) { running_visible_allocation_stack_.back()->push_back(allocate); } } @@ -311,18 +243,18 @@ class FuseBroadcastWithWarpReduce { //! 
returns the replaced TensorIndex if so. c10::optional findMaybeReplacedTensorIndex( kir::TensorIndex* tensor_index) { - auto kir_tv = tensor_index->view(); - auto tensor_index_it = running_tv_replacement_map_.find(kir_tv); + auto tv = tensor_index->view(); + auto tensor_index_it = running_tv_replacement_map_.find(tv); if (tensor_index_it != running_tv_replacement_map_.end()) { return tensor_index_it->second; } return c10::nullopt; } - //! Iteratve backwards on the currently visible loop scopes + //! Iterate backwards on the currently visible loop scopes //! and find the first allocation corresponding to the //! given tv. - kir::Allocate* getActiveAllocateFor(kir::TensorView* tv) { + kir::Allocate* getActiveAllocateFor(TensorView* tv) { for (auto frame_it = running_visible_allocation_stack_.rbegin(); frame_it != running_visible_allocation_stack_.rend(); frame_it++) { @@ -340,19 +272,10 @@ class FuseBroadcastWithWarpReduce { return nullptr; } - Expr* getFuserTVExpr(kir::Expr* expr) { - auto out = expr->outputs()[0]; - auto out_ti = dynamic_cast(out); - if (!out_ti) { - return nullptr; - } - return out_ti->view()->fuserTv()->definition(); - } - - bool isOpInputRegisterTV(kir::Expr* expr) { + bool isOpInputRegisterTV(Expr* expr) { for (auto inp : expr->inputs()) { if (auto inp_ti = dynamic_cast(inp)) { - if (inp_ti->view()->memoryType() != MemoryType::Local) { + if (inp_ti->view()->getMemoryType() != MemoryType::Local) { return false; } } @@ -361,10 +284,10 @@ class FuseBroadcastWithWarpReduce { return true; } - bool isOpOutputRegisterTV(kir::Expr* expr) { + bool isOpOutputRegisterTV(Expr* expr) { for (auto out : expr->outputs()) { if (auto out_ti = dynamic_cast(out)) { - if (out_ti->view()->memoryType() != MemoryType::Local) { + if (out_ti->view()->getMemoryType() != MemoryType::Local) { return false; } } @@ -374,8 +297,8 @@ class FuseBroadcastWithWarpReduce { } //! Updates map of serially visible reduction tvs, see comment on - //! running_kir_tv_to_allocate_map_. - void handle(kir::ReductionOp* reduction) { + //! running_tv_to_allocate_map_. + void handle(ReductionOp* reduction) final { if (!isOpOutputRegisterTV(reduction)) { return; } @@ -386,11 +309,11 @@ class FuseBroadcastWithWarpReduce { // keep track of which reduction buffer this expr writes into auto reduction_allocate = getActiveAllocateFor(reduction_ti_out->view()); - running_kir_tv_to_allocate_map_.back()->operator[]( - reduction_ti_out->view()) = reduction_allocate; + running_tv_to_allocate_map_.back()->operator[](reduction_ti_out->view()) = + reduction_allocate; } - void handle(kir::BroadcastOp* broadcast) { + void handle(BroadcastOp* broadcast) final { if (!isOpInputRegisterTV(broadcast) || !isOpOutputRegisterTV(broadcast)) { return; } @@ -400,9 +323,9 @@ class FuseBroadcastWithWarpReduce { //! Detects if this broadcast can be fused with the producer reduction. //! adds the output of broadcast to replacement map if all above mentioned //! conditions check. 
- void tryAddOutputToReplaceMap(kir::BroadcastOp* broadcast) { + void tryAddOutputToReplaceMap(BroadcastOp* broadcast) { if (auto in_ti = dynamic_cast(broadcast->in())) { - if (!in_ti->view()->fuserTv()->definition()->isA()) { + if (!in_ti->view()->definition()->isA()) { return; } auto out_ti = broadcast->out()->as(); @@ -410,15 +333,14 @@ class FuseBroadcastWithWarpReduce { // check reduction-broadcast mapping: if (!canFuseBroadcastWithWarpReduction( - out_tv->fuserTv()->definition()->as())) { + out_tv->definition()->as())) { return; } // check buffers are size-1 auto reduction_allocate_it = - running_kir_tv_to_allocate_map_.back()->find(in_ti->view()); - if (reduction_allocate_it == - running_kir_tv_to_allocate_map_.back()->end()) { + running_tv_to_allocate_map_.back()->find(in_ti->view()); + if (reduction_allocate_it == running_tv_to_allocate_map_.back()->end()) { // The producer reduction is not in the serially visible scope, // as defined in openLoopNestLevel. There still could be some // cases that we could fuse but disabled for simplicity. @@ -444,7 +366,7 @@ class FuseBroadcastWithWarpReduce { return; } - // Write the kir_tv in to the replacement map + // Write the tv in to the replacement map // so the future uses of this tv will put // the tensorIndex's in the actual replacement map. running_tv_replacement_map_[out_tv] = in_ti; @@ -515,7 +437,7 @@ class FuseBroadcastWithWarpReduce { //! could need some extension for more precise scope based analysis in the //! future especially if we have more complex IfThenElse blocks than //! predicates and unroll. - std::unordered_map + std::unordered_map running_tv_replacement_map_; //! Keeps track of the allocated buffers that the exprs will write/read @@ -531,21 +453,20 @@ class FuseBroadcastWithWarpReduce { //! visibility on the generated kernel. The model of IfThenElse assumes the //! only ITE's we have are predicates and unrolls, which might need to be //! more precise. - std::vector< - std::unique_ptr>> - running_kir_tv_to_allocate_map_; + std::vector>> + running_tv_to_allocate_map_; //! This map is the final output of this pass and a val replacement map will //! be run using //! it. All keys and values are TensorIndex's, and before this pass each //! TensorIndex is uniquely generated by lower_index pass for each access of - //! a kir_tv. - std::unordered_map val_replacement_map_; + //! a tv. + std::unordered_map val_replacement_map_; }; } // namespace -std::vector fuseWarpReduce(const std::vector exprs) { +std::vector fuseWarpReduce(const std::vector exprs) { return FuseBroadcastWithWarpReduce::fuse(exprs); } diff --git a/torch/csrc/jit/codegen/cuda/lower_warp_reduce.h b/torch/csrc/jit/codegen/cuda/lower_warp_reduce.h index 785c0b59122e..7480809c7dce 100644 --- a/torch/csrc/jit/codegen/cuda/lower_warp_reduce.h +++ b/torch/csrc/jit/codegen/cuda/lower_warp_reduce.h @@ -13,7 +13,7 @@ struct WarpPaddedParallelInfo { bool has_warp_reduction = false; }; -std::vector fuseWarpReduce(const std::vector exprs); +std::vector fuseWarpReduce(const std::vector exprs); } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/manager.cpp b/torch/csrc/jit/codegen/cuda/manager.cpp index ee1bea815359..4fef32286c8e 100644 --- a/torch/csrc/jit/codegen/cuda/manager.cpp +++ b/torch/csrc/jit/codegen/cuda/manager.cpp @@ -8,7 +8,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -51,6 +53,38 @@ namespace cuda { namespace { +// TODO remove this (75983): +// we don't need this any more. 
I think we can use revertAliasCopyOps. +// Similar refactor should be done infallback graph used by fusion guard. +// implementation of xxxx_copy ops should be removed. +// +// Mark string attribute in alias-copy nodes to enable its implementation +// in the fallback path. +void enableAliasCopyNodes(const std::shared_ptr& graph, Block* block) { + static std::unordered_set alias_copy_op( + {prim::view_copy, + prim::reshape_copy, + prim::squeeze_copy, + prim::unsqueeze_copy}); + + for (Node* n : block->nodes()) { + for (Block* b : n->blocks()) { + enableAliasCopyNodes(graph, b); + } + if (alias_copy_op.find(n->kind()) != alias_copy_op.end()) { + n->s_(attr::name, "CudaFusionGroup"); + } + } +} + +static std::unique_ptr createFallbackCode(const Node* fusion_node) { + auto copied_graph = fusion_node->g(attr::Subgraph)->copy(); + EraseShapeInformation(copied_graph); + enableAliasCopyNodes(copied_graph, copied_graph->block()); + auto code = std::make_unique(copied_graph, "fallback_cuda_fuser"); + return code; +} + // CudaFusionManager is not thread safe! // TODO: we should make the tradeoff here to use thread_local instead of global // singleton; @@ -68,8 +102,6 @@ class CudaFusionManager { // have identical contiguity information! (So identical stride + shape // is even more restricting in a good way) int32_t registerOrGetCacheId(std::shared_ptr& graph) { - std::lock_guard guard(mutex_); - // prepare graph for lowering; // We should not call `EraseShapeInformation(graph);`, graph representation // does not incorporate static sizes, but just rank of input tensors, which @@ -77,6 +109,7 @@ class CudaFusionManager { auto canonical_graph = Canonicalize(graph, false); auto repr = canonical_graph->toString(false); + std::lock_guard guard(mutex_); // create new graph_cache_ids_ entry if none existed yet; if (graph_cache_ids_.count(repr) == 0) { int32_t kernel_id = getNextUniqueID(); @@ -88,6 +121,12 @@ class CudaFusionManager { return graph_cache_ids_[repr]; }; + // get fallback kernel id + int32_t getFallbackKernelId() { + std::lock_guard guard(mutex_); + return getNextUniqueID(); + } + void unregisterCacheId(std::shared_ptr& graph) { auto canonical_graph = Canonicalize(graph, false); auto repr = canonical_graph->toString(false); @@ -109,6 +148,27 @@ class CudaFusionManager { return graph_cache_[kernel_id]->runGraphWithInputs(inputs); } + bool hasFallbackCode(int32_t kernel_id) { + std::lock_guard guard(mutex_); + return fallback_cache_.count(kernel_id); + } + + Code* getFallbackCode(int32_t kernel_id, const Node* fusion_node) { + { + std::lock_guard guard(mutex_); + auto it = fallback_cache_.find(kernel_id); + if (it != fallback_cache_.end()) { + return it->second.get(); + } + } + + std::unique_ptr code = createFallbackCode(fusion_node); + + std::lock_guard guard(mutex_); + auto it = fallback_cache_.insert({kernel_id, std::move(code)}).first; + return it->second.get(); + } + private: // TODO: Dimension collapsing should be abstracted out and integrated into // graph caching. @@ -137,6 +197,7 @@ class CudaFusionManager { std::unordered_map graph_cache_ids_; std::unordered_map> graph_cache_; + std::unordered_map> fallback_cache_; int32_t next_unique_id_ = 0; }; @@ -163,7 +224,6 @@ void compileCudaFusionGroup(Node* fusion_node) { // node only insert meta information after itself). 
PropagateShapesOnGraph(graph); TypePropagate(graph); - PropagateShapesOnGraph(graph); int32_t fusion_cache_id = CudaFusionManager::getManager().registerOrGetCacheId(graph); @@ -175,38 +235,66 @@ void compileCudaFusionGroup(Node* fusion_node) { compile_fusion(); } catch (...) { TORCH_WARN( - "FALLBACK path has been taken. This is an indication that codegen" - "Failed for some reason. To debug try disable codegen fallback path" - "via setting the env variable" - "`export PYTORCH_NVFUSER_DISABLE_FALLBACK=1`"); + "FALLBACK path has been taken inside: ", + __FUNCTION__, + ". This is an indication that codegen Failed for some reason.\n" + "To debug try disable codegen fallback path via setting the env" + " variable `export PYTORCH_NVFUSER_DISABLE=fallback`\n" + "To report the issue, try enable logging via setting the env" + "variable ` export PYTORCH_JIT_LOG_LEVEL=manager.cpp`\n"); + GRAPH_DUMP("`compile_fusion` hits fallback on graph\n", graph); CudaFusionManager::getManager().unregisterCacheId(graph); } } else { compile_fusion(); } + + // Assigning a cache_id to facilitate graph execution and fallback + if (!fusion_node->hasAttribute(attr::cache_id)) { + int32_t fusion_cache_id = + CudaFusionManager::getManager().getFallbackKernelId(); + fusion_node->i_(attr::cache_id, fusion_cache_id); + } } void runCudaFusionGroup(const Node* fusion_node, Stack& stack) { FUSER_PERF_SCOPE("nvFuser::Manager::runCudaFusionGroup"); + TORCH_CHECK( + fusion_node->hasAttribute(attr::cache_id), + "node prim::CudaFusionGroup has not been compiled yet"); // Fallback to use if anything goes wrong - auto take_fallback = [&]() { - // copying graph here since we are eliminating shape information; - auto copied_graph = fusion_node->g(attr::Subgraph)->copy(); - EraseShapeInformation(copied_graph); - InterpreterState{Code(copied_graph, "fallback_cuda_fuser")}.run(stack); + auto take_fallback = [&](Stack& stack) { + std::unique_ptr fallback_code_unique; + Code* fallback_code; + int32_t kernel_id = fusion_node->i(attr::cache_id); + fallback_code = + CudaFusionManager::getManager().getFallbackCode(kernel_id, fusion_node); + InterpreterState{*fallback_code}.run(stack); }; + c10::optional stack_copy; + auto compare_callback = getCudaFuserComparisonCallback(); + if (compare_callback.run_fallback) { + // make a copy of the stack + int64_t inputs_size = + static_cast(fusion_node->g(attr::Subgraph)->inputs().size()); + TORCH_INTERNAL_ASSERT(stack.size() >= inputs_size); + stack_copy = Stack(); + stack_copy->insert( + stack_copy->end(), stack.begin(), stack.end() - inputs_size); + // deepcopy the last (inputs_size) stack items + std::transform( + stack.end() - inputs_size, + stack.end(), + std::back_inserter(*stack_copy), + [](const c10::IValue& ivalue) { return ivalue.deepcopy(); }); + } + auto run_fusion = [&]() { TORCH_CHECK( fusion_node->kind() == prim::CudaFusionGroup, "prim::CudaFusionGroup expected"); - // TODO: should we support runtime compilation with updated dynamic shape; - // shape inference would be needed so we can allocate output; - TORCH_CHECK( - fusion_node->hasAttribute(attr::cache_id), - "node prim::CudaFusionGroup has not been compiled yet"); - int32_t kernel_id = fusion_node->i(attr::cache_id); // Currently we just construct I/O tensors for static graph; @@ -226,18 +314,63 @@ void runCudaFusionGroup(const Node* fusion_node, Stack& stack) { if (useFallback()) { try { - run_fusion(); + // if fusion failed once, it's likely to fail again; and failures are + // slow. 
So if the fusion fails, then record the failure and always use + // the fallback instead + int32_t kernel_id = fusion_node->i(attr::cache_id); + bool force_fallback = + CudaFusionManager::getManager().hasFallbackCode(kernel_id); + if (force_fallback) { + take_fallback(stack); + } else { + run_fusion(); + } } catch (...) { TORCH_WARN( - "FALLBACK path has been taken. This is an indication that codegen" - "Failed for some reason. To debug try disable codegen fallback path" - "via setting the env variable" - "`export PYTORCH_NVFUSER_DISABLE_FALLBACK=1`"); - take_fallback(); + "FALLBACK path has been taken inside: ", + __FUNCTION__, + ". This is an indication that codegen Failed for some reason.\n" + "To debug try disable codegen fallback path via setting the env" + " variable `export PYTORCH_NVFUSER_DISABLE=fallback`\n"); + take_fallback(stack); } } else { run_fusion(); } + + if (compare_callback.callback != nullptr) { + Stack fused_outputs; + Stack fallback_outputs; + int64_t output_count = + static_cast(fusion_node->g(attr::Subgraph)->outputs().size()); + TORCH_CHECK( + output_count <= stack.size(), + "Expected ", + output_count, + " outputs but found only ", + stack.size(), + " items on the stack"); + + fused_outputs.insert( + fused_outputs.begin(), stack.end() - output_count, stack.end()); + + if (stack_copy) { + take_fallback(*stack_copy); + TORCH_CHECK( + stack_copy->size() == stack.size(), + "Fused graph returns stack with ", + stack.size(), + " items, compared to ", + stack_copy->size(), + " from unfused graph"); + fallback_outputs.insert( + fallback_outputs.begin(), + stack_copy->end() - output_count, + stack_copy->end()); + } + auto graph_str = fusion_node->g(attr::Subgraph)->toString(); + compare_callback.callback(fused_outputs, fallback_outputs, graph_str); + } } } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/manager.h b/torch/csrc/jit/codegen/cuda/manager.h index 39c97478effe..4b725cd80bc6 100644 --- a/torch/csrc/jit/codegen/cuda/manager.h +++ b/torch/csrc/jit/codegen/cuda/manager.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include /* diff --git a/torch/csrc/jit/codegen/cuda/mma_type.cpp b/torch/csrc/jit/codegen/cuda/mma_type.cpp new file mode 100644 index 000000000000..3751cdea6bcf --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/mma_type.cpp @@ -0,0 +1,139 @@ +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +MmaBuilder::MmaBuilder( + MmaOptions::MacroType macro, + MatMulTileOptions gemm_tile) { + option_.macro = macro; + // Calculate accumulator stride, will be removed once transpose swizzle ready + int outer_stride = gemm_tile.warp_tile.n / gemm_tile.instruction_tile.n; + switch (macro) { + // Numbers depend on actual output layout of mma instruction + case MmaOptions::MacroType::Volta_16_16_4: + option_.accumulator_stride = outer_stride * 4; + break; + default: + TORCH_CHECK(false, "unsupported macro"); + break; + } +} + +MmaBuilder& MmaBuilder::layout(MmaOptions::MmaInputLayout layout) { + option_.operand_layout = layout; + return *this; +} + +MmaBuilder& MmaBuilder::operand(MmaOptions::Operand a_or_b) { + option_.operand = a_or_b; + return *this; +} + +// TODO: validate op config +MmaOptions MmaBuilder::build() const { + return option_; +} + +bool isVolta(MmaOptions::MacroType macro) { + return macro == MmaOptions::MacroType::Volta_16_16_4; +} + +bool isTuring(MmaOptions::MacroType macro) { + return macro == MmaOptions::MacroType::Turing_16_8_16; +} + +bool isAmpere(MmaOptions::MacroType macro) { + return false; +} 
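The MmaBuilder defined just above is the scheduler-facing knob for configuring an mma op: the macro and tile sizes fix the accumulator stride, while layout() and operand() record how each input is laid out. Below is a minimal usage sketch, assuming a Volta_16_16_4 kernel; the helper name, tile sizes, and variable names are illustrative, and attaching the resulting MmaOptions to an actual MmaOp / WarpMmaSwizzler schedule is outside this snippet.

#include <utility>
#include <torch/csrc/jit/codegen/cuda/mma_type.h>

using namespace torch::jit::fuser::cuda;

// Hypothetical scheduler-side helper that builds the per-operand options.
std::pair<MmaOptions, MmaOptions> buildVoltaMmaOptions() {
  // Tile hierarchy: CTA tile -> warp tile -> instruction tile (16x16x4 on Volta).
  MatMulTileOptions gemm_tile(
      GemmTile(128, 128, 32), // cta_tile
      GemmTile(64, 64, 32),   // warp_tile
      GemmTile(16, 16, 4));   // instruction_tile matching Volta_16_16_4

  MmaBuilder builder(MmaOptions::MacroType::Volta_16_16_4, gemm_tile);

  // TN layout: A is [M, K], B is [N, K]; see the layout convention in mma_type.h.
  builder.layout(MmaOptions::MmaInputLayout::TN);
  MmaOptions options_a = builder.operand(MmaOptions::Operand::A).build();
  MmaOptions options_b = builder.operand(MmaOptions::Operand::B).build();
  return {options_a, options_b};
}

With these tiles the constructor above computes accumulator_stride = (warp_tile.n / instruction_tile.n) * 4 = (64 / 16) * 4 = 16 for the Volta macro.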
+ +int getOutputRegisterSize(MmaOptions::MacroType macro) { + switch (macro) { + case MmaOptions::MacroType::Volta_16_16_4: + return 8; + break; + default: + TORCH_INTERNAL_ASSERT(false, "unknown macro"); + break; + } + return -1; +} + +int getInputARegisterSize(MmaOptions::MacroType macro) { + switch (macro) { + case MmaOptions::MacroType::Volta_16_16_4: + return 4; + break; + default: + TORCH_INTERNAL_ASSERT(false, "unknown macro"); + break; + } + return -1; +} + +int getInputBRegisterSize(MmaOptions::MacroType macro) { + switch (macro) { + case MmaOptions::MacroType::Volta_16_16_4: + return 4; + break; + default: + TORCH_INTERNAL_ASSERT(false, "unknown macro"); + break; + } + return -1; +} + +bool isOperandTransposed(MmaOptions options) { + switch (options.operand) { + case MmaOptions::Operand::A: + return options.operand_layout == MmaOptions::MmaInputLayout::TT || + options.operand_layout == MmaOptions::MmaInputLayout::TN; + case MmaOptions::Operand::B: + return options.operand_layout == MmaOptions::MmaInputLayout::TT || + options.operand_layout == MmaOptions::MmaInputLayout::NT; + default: + TORCH_CHECK(false, "isOperandTransposed: please specify operand"); + } + return false; +} + +std::string toString(MmaOptions::MmaInputLayout input_layout) { + std::stringstream ss; + switch (input_layout) { + case MmaOptions::MmaInputLayout::TT: + ss << "TT"; + break; + case MmaOptions::MmaInputLayout::TN: + ss << "TN"; + break; + case MmaOptions::MmaInputLayout::NT: + ss << "NT"; + break; + default: + TORCH_INTERNAL_ASSERT(false, "unsupported operand layout"); + } + return ss.str(); +} + +std::string toString(MmaOptions::MacroType mt) { + std::stringstream ss; + switch (mt) { + case MmaOptions::MacroType::NoMMA: + ss << "NoOp"; + break; + case MmaOptions::MacroType::Volta_16_16_4: + ss << "M16N16K4"; + break; + default: + TORCH_INTERNAL_ASSERT(false, "undefined mma type"); + break; + } + return ss.str(); +} + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/mma_type.h b/torch/csrc/jit/codegen/cuda/mma_type.h new file mode 100644 index 000000000000..5f42d41ded65 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/mma_type.h @@ -0,0 +1,132 @@ +#pragma once +#include +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +//! Utility data structure for recording gemm tiles +struct GemmTile { + int m, n, k; + GemmTile(int m_, int n_, int k_) : m(m_), n(n_), k(k_) {} + + bool operator==(const GemmTile& other) { + return m == other.m && n == other.n && k == other.k; + } + + GemmTile operator/(const GemmTile& other) { + return GemmTile(m / other.m, n / other.n, k / other.k); + } +}; + +//! Utility data structure for recording gemm tiles +struct TORCH_CUDA_CU_API MatMulTileOptions { + GemmTile cta_tile = GemmTile(128, 128, 32); + GemmTile warp_tile = GemmTile(64, 64, 32); + GemmTile instruction_tile = GemmTile(16, 8, 16); + + MatMulTileOptions() = default; + MatMulTileOptions( + GemmTile cta_tile_, + GemmTile warp_tile_, + GemmTile instruction_tile_) + : cta_tile(cta_tile_), + warp_tile(warp_tile_), + instruction_tile(instruction_tile_) {} + + bool operator==(const MatMulTileOptions& other) { + return cta_tile == other.cta_tile && warp_tile == other.warp_tile && + instruction_tile == other.instruction_tile; + } +}; + +//! Information for configuring and lowering mma ops +struct MmaOptions { + //! Type of mma instrinsic macro to use + //! 
This will translate to which mma intrinsic from runtime string + //! to be generated to implement the mma op. The current plan + //! is to have exactly one macro for each + //! (arch, datatype, operand layout) triple, though there + //! exists multiple possibilities for some cases, e.g. for Turing and fp16 + //! one can use 16_8_8 or 16_8_16. + //! Will consider adding more choices that the scheduler can pick from + //! when our perf target becomes more fine grained, which is more likely in + //! latency bound kernels. + enum class MacroType { + NoMMA = 0, + Volta_16_16_4, + Turing_16_8_16, // place holder for turing/ampere mma + Ampere_16_8_8 // place holder for tf32 + }; + + //! [Operand Layout Convention] + //! Operand layout, T=transposed/row_major, N=normal/col_major + //! We don't support calling NN mma directly since it implies + //! a fused transpose. User needs to swap the operands and use + //! TT mma to make the transpose explicit. + //! Ordered by position of K + //! NT : K,M x K,N -> K,M,N + //! TT : M,K X K,N -> M,K,N + //! TN : M,K X N,K -> M,N,K + enum class MmaInputLayout { NT = 0, TT, TN }; + + //! Utility to annotate which input of mma this option struct describes + enum class Operand { NotOperand = 0, A, B }; + + //! Utility to annotate which mma macro this config uses. + MacroType macro = MacroType::NoMMA; + + //! Utility to annotate transposition of operands + MmaInputLayout operand_layout = MmaInputLayout::TT; + + //! Utility to annotate which input of mma this option struct describes + Operand operand = Operand::A; + + //! Accumulator register stride, will be removed when the swizzle op + //! is introduced and the output can be labeled with a transpose swizzle. + int accumulator_stride = 0; + + bool operator==(const MmaOptions& other) const { + return macro == other.macro && operand_layout == other.operand_layout && + operand == other.operand && + accumulator_stride == other.accumulator_stride; + } +}; + +//! User interface generating mma options for mma op +class TORCH_CUDA_CU_API MmaBuilder { + public: + MmaBuilder(MmaOptions::MacroType macro, MatMulTileOptions gemm_tile); + MmaBuilder& layout(MmaOptions::MmaInputLayout layout); + MmaBuilder& operand(MmaOptions::Operand a_or_b); + MmaOptions build() const; + + private: + MmaOptions option_; +}; + +//! GPU arch check for macro type +bool isVolta(MmaOptions::MacroType macro); +bool isTuring(MmaOptions::MacroType macro); +bool isAmpere(MmaOptions::MacroType macro); + +//! Returns true if the given option describes a transposed operand +bool isOperandTransposed(MmaOptions options); + +// Unpacked constants from macro type: +// exact numbers are defined by each individual instruction. 
+int getOutputRegisterSize(MmaOptions::MacroType macro); +int getInputARegisterSize(MmaOptions::MacroType macro); +int getInputBRegisterSize(MmaOptions::MacroType macro); + +// MMA stringify utils +std::string toString(MmaOptions::MacroType macro); +std::string toString(MmaOptions::MmaInputLayout input_layout); +std::string toString(MmaOptions::MacroType mt); + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/mutator.cpp b/torch/csrc/jit/codegen/cuda/mutator.cpp index 8d13f1e299e2..feccb5608cbc 100644 --- a/torch/csrc/jit/codegen/cuda/mutator.cpp +++ b/torch/csrc/jit/codegen/cuda/mutator.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include @@ -10,143 +11,215 @@ namespace jit { namespace fuser { namespace cuda { -// MUTATE FUNCTIONS FOR VALS +void OptOutMutator::mutate(Statement* s) { + Statement::mutatorDispatch(this, s); +} + +void OptOutMutator::mutate(Expr* e) { + Expr::mutatorDispatch(this, e); +} + +void OptOutMutator::mutate(Val* v) { + Val::mutatorDispatch(this, v); +} + +void OptOutMutator::registerMutation(Val* val, Val* mutation) { + bool val_is_ns = val->vtype() == ValType::NamedScalar; + bool mutation_is_ns = mutation->vtype() == ValType::NamedScalar; + bool val_is_scalar = val->vtype() == ValType::Scalar; + bool mutation_is_scalar = mutation->vtype() == ValType::Scalar; + TORCH_INTERNAL_ASSERT( + mutation->dtype() == val->dtype() && + (mutation->vtype() == val->vtype() || + ((val_is_ns && mutation_is_scalar) || + (mutation_is_ns && val_is_scalar))), + "Mutations are not allowed to change types, tried to go from: (", + val->vtype(), + ", ", + val->dtype(), + ") to: (", + mutation->vtype(), + ", ", + mutation->dtype(), + ")"); + mutations[val] = mutation; +} + +void OptOutMutator::mutate(Bool* b) {} + +void OptOutMutator::mutate(Double* d) {} + +void OptOutMutator::mutate(Int* i) {} + +void OptOutMutator::mutate(ComplexDouble* c) {} -Statement* OptOutMutator::mutate(IterDomain* id) { - Val* start = mutateAsVal(id->start())->asVal(); - Val* extent = mutateAsVal(id->extent())->asVal(); - Val* stop_offset = mutateAsVal(id->stopOffset())->asVal(); +void OptOutMutator::mutate(NamedScalar* ns) {} + +void OptOutMutator::mutate(IterDomain* id) { + Val* start = maybeMutated(id->start()); + Val* extent = maybeMutated(id->extent()); + Val* stop_offset = maybeMutated(id->stopOffset()); if (start->sameAs(id->start()) && extent->sameAs(id->extent()) && stop_offset->sameAs(id->stopOffset())) { - return id; + return; } - Val* mutated_val = new IterDomain( + Val* mutated_val = IrBuilder::create( + id->container(), start, extent, stop_offset, id->getParallelType(), id->getIterType(), id->isRFactorProduct()); + if (id->hasPaddingToMultipleOfWarp()) { + mutated_val->as()->padToMultipleOfWarp( + id->getMaybeSizeAfterPadding()); + } registerMutation(id, mutated_val); - return mutated_val; } -Statement* OptOutMutator::mutate(TensorDomain* td) { - std::vector dom; +void OptOutMutator::mutate(TensorDomain* td) { bool mutated = false; - for (const auto i : c10::irange(td->nDims())) { - IterDomain* id = mutateAsVal(td->axis(i))->as(); - dom.push_back(id); - if (!id->sameAs(td->axis(i))) - mutated = true; - } - if (mutated) { - Val* mutated_val = new TensorDomain( - td->getRootDomain(), td->getRFactorDomain(), dom, td->contiguity()); - registerMutation(td, mutated_val); - return mutated_val; + auto updateIdVec = [&](const std::vector& ids) { + std::vector updated_ids; + for (auto id : ids) { + auto updated_id 
= maybeMutated(id)->as(); + updated_ids.push_back(updated_id); + if (!updated_id->sameAs(id)) { + mutated = true; + } + } + return updated_ids; + }; + + std::vector root_dom = updateIdVec(td->getRootDomain()); + std::vector rfactor_dom = td->hasRFactor() + ? updateIdVec(td->getMaybeRFactorDomain()) + : std::vector(); + std::vector domain = updateIdVec(td->domain()); + + if (!mutated) { + return; } - return td; -} -Statement* OptOutMutator::mutate(TensorView* tv) { - TensorDomain* td = mutateAsVal(tv->domain())->as(); + Val* mutated_val = IrBuilder::create( + td->container(), root_dom, rfactor_dom, domain, td->contiguity()); + registerMutation(td, mutated_val); +} +void OptOutMutator::mutate(TensorView* tv) { + TensorDomain* td = maybeMutated(tv->domain())->as(); if (!tv->domain()->sameAs(td)) { - TensorView* mutated_tv = new TensorView(td, tv->getDataType().value()); - registerMutation(tv, mutated_tv); - return mutated_tv; + tv->setDomain(td); } - return tv; + // Don't register tv mutations as we just want to update the TD } -Statement* OptOutMutator::mutate(Bool* b) { - return b; +void OptOutMutator::mutate(kir::Predicate*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); } -Statement* OptOutMutator::mutate(Double* d) { - return d; +void OptOutMutator::mutate(kir::TensorIndex*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); } -Statement* OptOutMutator::mutate(Int* i) { - return i; -} +// MUTATE FUNCTIONS FOR EXPRESSIONS. +void OptOutMutator::mutate(UnaryOp* uop) { + Val* out = maybeMutated(uop->out()); + Val* in = maybeMutated(uop->in()); -Statement* OptOutMutator::mutate(NamedScalar* ns) { - return ns; + if (out->sameAs(uop->out()) && in->sameAs(uop->in())) { + return; + } + auto container = uop->container(); + auto uop_type = uop->getUnaryOpType(); + container->removeExpr(uop); + IrBuilder::create(container, uop_type, out, in); } -// MUTATE FUNCTIONS FOR EXPRESSIONS. 
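Each of these rewritten mutate() overloads follows the same recipe: query maybeMutated() for every operand, return early when nothing changed, otherwise drop the stale expression from its IrContainer and recreate it through IrBuilder so its definition now refers to the mutated inputs. Below is a minimal sketch of how a pass might drive this machinery to swap one scalar input for another across a fusion; the class name and the single-substitution use case are illustrative assumptions, not something introduced by this patch, and it relies only on registerMutation(), mutate(Expr*), and StmtSort::getExprs(), all of which appear here.

// Hypothetical pass: rebuild every expression that consumes old_val so it
// consumes new_val instead.
class SubstituteVal : private OptOutMutator {
 public:
  static void run(Fusion* fusion, Val* old_val, Val* new_val) {
    SubstituteVal mutator;
    // Record the substitution; registerMutation() checks type compatibility.
    mutator.registerMutation(old_val, new_val);
    // Visit expressions in topological order, rebuilding any whose operands
    // were registered as mutated.
    for (auto expr : StmtSort::getExprs(fusion)) {
      mutator.mutate(expr);
    }
  }
};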
+void OptOutMutator::mutate(BinaryOp* bop) { + Val* out = maybeMutated(bop->out()); + Val* lhs = maybeMutated(bop->lhs()); + Val* rhs = maybeMutated(bop->rhs()); -Statement* OptOutMutator::mutate(Split* s) { - IterDomain* ot = mutateAsVal(s->outer())->as(); - IterDomain* inr = mutateAsVal(s->inner())->as(); - IterDomain* in = mutateAsVal(s->in())->as(); - Val* fact = mutateAsVal(s->factor())->as(); - - if (ot->sameAs(s->outer()) && inr->sameAs(s->inner()) && - in->sameAs(s->in()) && areEqualScalars(fact, s->factor())) { - return s; + if (out == bop->out() && lhs == bop->lhs() && rhs == bop->rhs()) { + return; } - FusionGuard::getCurFusion()->removeExpr(s); - return new Split(ot, inr, in, fact, s->innerSplit()); + + auto container = bop->container(); + auto bop_type = bop->getBinaryOpType(); + container->removeExpr(bop); + IrBuilder::create(container, bop_type, out, lhs, rhs); } -Statement* OptOutMutator::mutate(Merge* m) { - IterDomain* ot = mutateAsVal(m->out())->as(); - IterDomain* otr = mutateAsVal(m->outer())->as(); - IterDomain* in = mutateAsVal(m->inner())->as(); +void OptOutMutator::mutate(TernaryOp* top) { + Val* out = maybeMutated(top->out()); + Val* in1 = maybeMutated(top->in1()); + Val* in2 = maybeMutated(top->in2()); + Val* in3 = maybeMutated(top->in3()); - if (ot->sameAs(m->out()) && otr->sameAs(m->outer()) && in->sameAs(m->inner())) - return m; + if (out == top->out() && in1 == top->in1() && in2 == top->in2() && + in3 == top->in3()) { + return; + } - FusionGuard::getCurFusion()->removeExpr(m); - return new Merge(ot, otr, in); + auto container = top->container(); + auto top_type = top->getTernaryOpType(); + container->removeExpr(top); + IrBuilder::create(container, top_type, out, in1, in2, in3); } -Statement* OptOutMutator::mutate(UnaryOp* uop) { - Val* out = mutateAsVal(uop->out())->asVal(); - Val* in = mutateAsVal(uop->in())->asVal(); +void OptOutMutator::mutate(ReductionOp* rop) { + Val* out = maybeMutated(rop->out()); + Val* in = maybeMutated(rop->in()); + Val* init = rop->init(); + if (out->sameAs(rop->out()) && in->sameAs(rop->in()) && + init->sameAs(rop->init())) { + return; + } - if (out->sameAs(uop->out()) && in->sameAs(uop->in())) - return uop; - FusionGuard::getCurFusion()->removeExpr(uop); - return new UnaryOp(uop->getUnaryOpType(), out, in); + auto container = rop->container(); + auto rop_type = rop->getReductionOpType(); + container->removeExpr(rop); + IrBuilder::create( + container, rop_type, init, out, in, rop->isAllreduce()); } -Statement* OptOutMutator::mutate(BinaryOp* bop) { - Val* out = mutateAsVal(bop->out())->asVal(); - Val* lhs = mutateAsVal(bop->lhs())->asVal(); - Val* rhs = mutateAsVal(bop->rhs())->asVal(); - if (out == bop->out() && lhs == bop->lhs() && rhs == bop->rhs()) - return bop; - FusionGuard::getCurFusion()->removeExpr(bop); - return new BinaryOp(bop->getBinaryOpType(), out, lhs, rhs); -} +void OptOutMutator::mutate(GroupedReductionOp* rop) { + bool is_same = true; -Statement* OptOutMutator::mutate(TernaryOp* top) { - Val* out = mutateAsVal(top->out())->asVal(); - Val* in1 = mutateAsVal(top->in1())->asVal(); - Val* in2 = mutateAsVal(top->in2())->asVal(); - Val* in3 = mutateAsVal(top->in3())->asVal(); - if (out == top->out() && in1 == top->in1() && in2 == top->in2() && - in3 == top->in3()) - return top; - FusionGuard::getCurFusion()->removeExpr(top); - return new TernaryOp(top->getTernaryOpType(), out, in1, in2, in3); -} + std::vector outputs; + for (auto out : rop->outputs()) { + auto maybe_mutated = maybeMutated(out); + is_same = is_same && 
maybe_mutated->sameAs(out); + outputs.push_back(maybe_mutated); + } -Statement* OptOutMutator::mutate(ReductionOp* rop) { - Val* out = mutateAsVal(rop->out())->asVal(); - Val* in = mutateAsVal(rop->in())->asVal(); - Val* init = rop->init(); - if (out->sameAs(rop->out()) && in->sameAs(rop->in()) && - init->sameAs(rop->init())) - return rop; + std::vector inputs; + for (auto in : rop->inputs()) { + auto maybe_mutated = maybeMutated(in); + is_same = is_same && maybe_mutated->sameAs(in); + inputs.push_back(maybe_mutated); + } + + std::vector init_vals; + for (auto init : rop->initVals()) { + auto maybe_mutated = maybeMutated(init); + is_same = is_same && maybe_mutated->sameAs(init); + init_vals.push_back(maybe_mutated); + } + + if (is_same) { + return; + } - return new ReductionOp(rop->getReductionOpType(), init, out, in); + auto container = rop->container(); + const auto& rop_types = rop->getReductionOpTypes(); + container->removeExpr(rop); + IrBuilder::create( + container, rop_types, init_vals, outputs, inputs, rop->isAllreduce()); } namespace { @@ -159,20 +232,18 @@ inline bool compareOptional(Val* a, Val* b) { } // namespace -Statement* OptOutMutator::mutate(WelfordOp* wop) { - Val* out_avg = mutateAsVal(wop->outAvg())->asVal(); - Val* out_var = mutateAsVal(wop->outVar())->asVal(); - Val* out_N = mutateAsVal(wop->outN())->asVal(); +void OptOutMutator::mutate(WelfordOp* wop) { + Val* out_avg = maybeMutated(wop->outAvg()); + Val* out_var = maybeMutated(wop->outVar()); + Val* out_N = maybeMutated(wop->outN()); - Val* in_avg = mutateAsVal(wop->inAvg())->asVal(); - Val* in_var = wop->inVar() ? mutateAsVal(wop->inVar())->asVal() : nullptr; - Val* in_N = mutateAsVal(wop->inN())->asVal(); + Val* in_avg = maybeMutated(wop->inAvg()); + Val* in_var = wop->inVar() ? maybeMutated(wop->inVar()) : nullptr; + Val* in_N = maybeMutated(wop->inN()); - Val* init_avg = - wop->initAvg() ? mutateAsVal(wop->initAvg())->asVal() : nullptr; - Val* init_var = - wop->initVar() ? mutateAsVal(wop->initVar())->asVal() : nullptr; - Val* init_N = mutateAsVal(wop->initN())->asVal(); + Val* init_avg = wop->initAvg() ? maybeMutated(wop->initAvg()) : nullptr; + Val* init_var = wop->initVar() ? 
maybeMutated(wop->initVar()) : nullptr; + Val* init_N = maybeMutated(wop->initN()); const bool out_compare = out_avg->sameAs(wop->outAvg()) && out_var->sameAs(wop->outVar()) && out_N->sameAs(wop->outN()); @@ -182,56 +253,205 @@ Statement* OptOutMutator::mutate(WelfordOp* wop) { compareOptional(init_var, wop->initVar()) && init_N->sameAs(wop->initN()); if (out_compare && init_compare && in_compare) { - return wop; - } else { - return new WelfordOp( - out_avg, - out_var, - out_N, - init_avg, - init_var, - init_N, - in_avg, - in_var, - in_N); + return; + } + + auto container = wop->container(); + container->removeExpr(wop); + IrBuilder::create( + container, + out_avg, + out_var, + out_N, + init_avg, + init_var, + init_N, + in_avg, + in_var, + in_N, + wop->isAllreduce()); +} + +void OptOutMutator::mutate(MmaOp* mma) { + Val* out = maybeMutated(mma->out()); + Val* in_a = maybeMutated(mma->inA()); + Val* in_b = maybeMutated(mma->inB()); + Val* init = mma->init(); + + if (out->sameAs(mma->out()) && in_a->sameAs(mma->inA()) && + in_b->sameAs(mma->inB())) { + return; } + + auto container = mma->container(); + auto options = mma->options(); + container->removeExpr(mma); + C10_UNUSED auto new_mma = + IrBuilder::create(container, out, in_a, in_b, init, options); } -Statement* OptOutMutator::mutate(BroadcastOp* bop) { - return bop; +void OptOutMutator::mutate(BroadcastOp* bop) { + Val* out = maybeMutated(bop->out()); + Val* in = maybeMutated(bop->in()); + + if (out->sameAs(bop->out()) && in->sameAs(bop->in())) { + return; + } + + auto container = bop->container(); + auto flags = bop->getBroadcastDimFlags(); + container->removeExpr(bop); + IrBuilder::create(container, out, in, flags); } -Statement* OptOutMutator::mutate(TransposeOp* top) { - return top; +void OptOutMutator::mutate(TransposeOp* top) { + TensorView* out = maybeMutated(top->out())->as(); + TensorView* in = maybeMutated(top->in())->as(); + + if (out->sameAs(top->out()) && in->sameAs(top->in())) { + return; + } + + auto container = top->container(); + auto new2old = top->new2old(); + container->removeExpr(top); + IrBuilder::create(container, out, in, new2old); } -Statement* OptOutMutator::mutate(ShiftOp* sop) { - Val* out = mutateAsVal(sop->out())->asVal(); - Val* in = mutateAsVal(sop->in())->asVal(); +void OptOutMutator::mutate(ShiftOp* sop) { + Val* out = maybeMutated(sop->out())->asVal(); + Val* in = maybeMutated(sop->in())->asVal(); + + if (out->sameAs(sop->out()) && in->sameAs(sop->in())) { + return; + } - if (out->sameAs(sop->out()) && in->sameAs(sop->in())) - return sop; auto offsets = sop->offsets(); - FusionGuard::getCurFusion()->removeExpr(sop); - return new ShiftOp(out, in, offsets, sop->pad()); + auto pad_width = sop->padWidth(); + auto container = sop->container(); + container->removeExpr(sop); + IrBuilder::create(container, out, in, offsets, pad_width); } -Statement* OptOutMutator::mutate(GatherOp* op) { - Val* out = mutateAsVal(op->out())->asVal(); - Val* in = mutateAsVal(op->in())->asVal(); +void OptOutMutator::mutate(GatherOp* op) { + Val* out = maybeMutated(op->out())->asVal(); + Val* in = maybeMutated(op->in())->asVal(); + + if (out->sameAs(op->out()) && in->sameAs(op->in())) { + return; + } - if (out->sameAs(op->out()) && in->sameAs(op->in())) - return op; auto window_shape = op->windowShape(); auto pad_width = op->padWidth(); - FusionGuard::getCurFusion()->removeExpr(op); - return new GatherOp(out, in, window_shape, pad_width); + auto container = op->container(); + container->removeExpr(op); + 
IrBuilder::create(container, out, in, window_shape, pad_width); +} + +void OptOutMutator::mutate(ViewAsScalar* vop) { + TensorView* out = maybeMutated(vop->out())->as(); + TensorView* in = maybeMutated(vop->in())->as(); + + if (out->sameAs(vop->out()) && in->sameAs(vop->in())) { + return; + } + + auto container = vop->container(); + container->removeExpr(vop); + IrBuilder::create( + container, out, in, vop->vector_id(), vop->index()); +} + +void OptOutMutator::mutate(ViewOp* vop) { + TensorView* out = maybeMutated(vop->out())->as(); + TensorView* in = maybeMutated(vop->in())->as(); + + if (out->sameAs(vop->out()) && in->sameAs(vop->in())) { + return; + } + + auto container = vop->container(); + container->removeExpr(vop); + IrBuilder::create(container, out, in); +} + +void OptOutMutator::mutate(Split* s) { + IterDomain* ot = maybeMutated(s->outer())->as(); + IterDomain* inr = maybeMutated(s->inner())->as(); + IterDomain* in = maybeMutated(s->in())->as(); + Val* fact = maybeMutated(s->factor())->as(); + Val* start_offset = maybeMutated(s->startOffset()); + Val* stop_offset = maybeMutated(s->stopOffset()); + + if (ot->sameAs(s->outer()) && inr->sameAs(s->inner()) && + in->sameAs(s->in()) && areEqualScalars(fact, s->factor()) && + start_offset->sameAs(s->startOffset()) && + stop_offset->sameAs(s->stopOffset())) { + return; + } + + auto container = s->container(); + auto inner_split = s->innerSplit(); + container->removeExpr(s); + C10_UNUSED auto new_node = IrBuilder::create( + container, ot, inr, in, fact, inner_split, start_offset, stop_offset); } -Statement* OptOutMutator::mutate(ViewOp* vop) { - return vop; +void OptOutMutator::mutate(Merge* m) { + IterDomain* ot = maybeMutated(m->out())->as(); + IterDomain* otr = maybeMutated(m->outer())->as(); + IterDomain* in = maybeMutated(m->inner())->as(); + + if (ot->sameAs(m->out()) && otr->sameAs(m->outer()) && + in->sameAs(m->inner())) { + return; + } + + auto container = m->container(); + container->removeExpr(m); + C10_UNUSED auto new_node = IrBuilder::create(container, ot, otr, in); +} + +void OptOutMutator::mutate(kir::Allocate*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); +} +void OptOutMutator::mutate(kir::BlockSync*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); +} +void OptOutMutator::mutate(kir::GridSync*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); +} +void OptOutMutator::mutate(kir::InitMagicZero*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); +} +void OptOutMutator::mutate(kir::UpdateMagicZero*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); +} +void OptOutMutator::mutate(kir::ForLoop*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); +} +void OptOutMutator::mutate(kir::IfThenElse*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); +} +void OptOutMutator::mutate(kir::GridReduction*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); +} +void OptOutMutator::mutate(kir::GroupedGridReduction*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); +} +void OptOutMutator::mutate(kir::GridBroadcast*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); +} +void OptOutMutator::mutate(kir::GridWelford*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); +} +void OptOutMutator::mutate(kir::AllocateFusedReduction*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); } +void OptOutMutator::removeExpr(IrContainer* container, Expr* expr) { + container->removeExpr(expr); +} } // namespace cuda } // namespace fuser } // namespace jit diff 
--git a/torch/csrc/jit/codegen/cuda/mutator.h b/torch/csrc/jit/codegen/cuda/mutator.h index f9ec40ca9f57..433de485cf19 100644 --- a/torch/csrc/jit/codegen/cuda/mutator.h +++ b/torch/csrc/jit/codegen/cuda/mutator.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/torch/csrc/jit/codegen/cuda/non_divisible_split.cpp b/torch/csrc/jit/codegen/cuda/non_divisible_split.cpp index 426bcadb2c5e..3a2ab5f5eb5b 100644 --- a/torch/csrc/jit/codegen/cuda/non_divisible_split.cpp +++ b/torch/csrc/jit/codegen/cuda/non_divisible_split.cpp @@ -128,8 +128,8 @@ void NonDivisibleSplitInfo::removeRedundancy() { std::unordered_set split_to_validate_outer; for (auto it = splits_to_validate_.begin(); it != splits_to_validate_.end();) { - auto outer_concrete = - gpu_lower->caIndexMap().getConcreteMappedID((*it)->outer()); + auto outer_concrete = gpu_lower->caMap()->getConcreteMappedID( + (*it)->outer(), IdMappingMode::EXACT); auto new_domain = split_to_validate_outer.insert(outer_concrete).second; if (!new_domain) { it = splits_to_validate_.erase(it); @@ -150,8 +150,10 @@ void NonDivisibleSplitInfo::removeRedundancy() { splits_to_validate_.begin(), splits_to_validate_.end(), [&](Split* split_to_validate) { - return gpu_lower->caIndexMap().areMapped( - split_to_validate->outer(), split_to_predicate->outer()); + return gpu_lower->caMap()->areMapped( + split_to_validate->outer(), + split_to_predicate->outer(), + IdMappingMode::EXACT); })) { it = splits.erase(it); } else { diff --git a/torch/csrc/jit/codegen/cuda/non_divisible_split.h b/torch/csrc/jit/codegen/cuda/non_divisible_split.h index f17bf2d62468..6706c9f072d3 100644 --- a/torch/csrc/jit/codegen/cuda/non_divisible_split.h +++ b/torch/csrc/jit/codegen/cuda/non_divisible_split.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/torch/csrc/jit/codegen/cuda/nvfuser.cmake b/torch/csrc/jit/codegen/cuda/nvfuser.cmake new file mode 100644 index 000000000000..5dc211eb4f6c --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/nvfuser.cmake @@ -0,0 +1,58 @@ +if(BUILD_SPLIT_CUDA) + set(TORCHLIB_FLAVOR torch_cuda_cu) # chose torch_cuda_cu here since JIT is in torch_cuda_cpp +elseif(USE_CUDA) + set(TORCHLIB_FLAVOR torch_cuda) +elseif(USE_ROCM) + set(TORCHLIB_FLAVOR torch_hip) +endif() + +# The list of NVFUSER runtime files +list(APPEND NVFUSER_RUNTIME_FILES + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/array.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/block_reduction.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/block_sync_atomic.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/block_sync_default.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/broadcast.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/fp16_support.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/fused_reduction.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/bf16_support.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/grid_broadcast.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/grid_reduction.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/grid_sync.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/helpers.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/index_utils.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/random_numbers.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/tensor.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/tuple.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/type_traits.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/welford.cu + 
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/warp.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/tensorcore.cu + ${TORCH_ROOT}/aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh + ${TORCH_ROOT}/aten/src/ATen/cuda/detail/UnpackRaw.cuh +) + +file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/include/nvfuser_resources") + +# "stringify" NVFUSER runtime sources +# (generate C++ header files embedding the original input as a string literal) +set(NVFUSER_STRINGIFY_TOOL "${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/tools/stringify_file.py") +foreach(src ${NVFUSER_RUNTIME_FILES}) + get_filename_component(filename ${src} NAME_WE) + set(dst "${CMAKE_BINARY_DIR}/include/nvfuser_resources/${filename}.h") + add_custom_command( + COMMENT "Stringify NVFUSER runtime source file" + OUTPUT ${dst} + DEPENDS ${src} + COMMAND ${PYTHON_EXECUTABLE} ${NVFUSER_STRINGIFY_TOOL} -i ${src} -o ${dst} + ) + add_custom_target(nvfuser_rt_${filename} DEPENDS ${dst}) + add_dependencies(${TORCHLIB_FLAVOR} nvfuser_rt_${filename}) + + # also generate the resource headers during the configuration step + # (so tools like clang-tidy can run w/o requiring a real build) + execute_process(COMMAND + ${PYTHON_EXECUTABLE} ${NVFUSER_STRINGIFY_TOOL} -i ${src} -o ${dst}) +endforeach() + +target_include_directories(${TORCHLIB_FLAVOR} PRIVATE "${CMAKE_BINARY_DIR}/include") diff --git a/torch/csrc/jit/codegen/cuda/ops/alias.cpp b/torch/csrc/jit/codegen/cuda/ops/alias.cpp new file mode 100644 index 000000000000..d5bbd4878828 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/ops/alias.cpp @@ -0,0 +1,199 @@ +#include +#include +#include +#include +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +namespace { + +//! Transform TensorView according to keep, merge, and split transformations. +//! Trivial reduction and broadcast transformations are handled separately. +//! It is recommend to use the composite ops view function, which will call +//! the analyzeView function to generate the appropriate transformations. +//! +//! For example: +//! original sizes = [2, 10, 40] +//! new_size = [2, 10, 2, 20] +//! auto analysis = analyzeView(TV0, original_sizes, new_sizes) +//! auto TV1 = TV0->view(analysis.transforms); +//! +//! Transforms = [(Keep I0), (Keep I1), (Split I2 by 2)] +//! Before: TV0[I0, I1, I2] +//! After: TV0[I0, I1, 2, ceilDiv(I2, 2)] +//! 
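//! A minimal usage sketch of the composite path described above (hypothetical
//! fusion code; `tv0` is assumed to be a previously defined 3-D TensorView
//! with the example sizes):
//!
//!   // Composite op from this file: it handles the trivial-reduction and
//!   // broadcast cases itself, calls analyzeView(), and applies the
//!   // resulting transforms through applyViewTransforms() below.
//!   TensorView* tv1 = view(tv0, /*original_sizes=*/{2, 10, 40},
//!                               /*new_sizes=*/{2, 10, 2, 20});
//!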
+TensorView* applyViewTransforms( + TensorView* tv, + const std::vector>& transforms) { + TORCH_INTERNAL_ASSERT( + !tv->hasComputeAt(), + "Cannot modify rfactor domain after compute at has been set."); + + TORCH_INTERNAL_ASSERT(tv->nDims() > 0, "Tried to view a 0-dim TensorView"); + + TORCH_CHECK( + !tv->domain()->hasRFactor(), + "Cannot call view on the same TensorView twice."); + + TORCH_INTERNAL_ASSERT(!transforms.empty()); + + TensorView* consumer = IrBuilder::create( + tv->container(), + tv->domain()->view(transforms), + tv->getDataType().value()); + + IrBuilder::create(tv->container(), consumer, tv); + + return consumer; +} + +} // namespace + +TensorView* view(TensorView* x, DataType dtype) { + if (x->getDataType() == dtype) { + return x; + } + + auto input_type = x->getDataType().value(); + auto input_size = dataTypeSize(input_type); + auto newsize = dataTypeSize(dtype); + + if (input_size == newsize) { + return bitCastOp(dtype, x); + } + // TODO: support view(dtype) for dtypes where input_size != newsize + TORCH_INTERNAL_ASSERT(false, "Unsupported reinterpret casting view"); +} + +TensorView* view( + TensorView* x, + const std::vector& original_sizes, + const std::vector& new_sizes) { + TORCH_INTERNAL_ASSERT( + TensorDomain::noReductions(x->getMaybeRFactorDomain()).size() == + original_sizes.size()); + + auto analyze_view = analyzeView(x, original_sizes, new_sizes); + + auto reduction = (!analyze_view.trivial_reduction_axes.empty()) + ? sum(x, + analyze_view.trivial_reduction_axes, + false /* keep_dim */, + x->getDataType().value()) + : x; + + auto view = (!analyze_view.transforms.empty()) + ? applyViewTransforms(reduction, analyze_view.transforms) + : reduction; + + return (analyze_view.has_broadcast) + ? broadcast(view, analyze_view.broadcast_axes) + : view; +} + +TensorView* flatten(TensorView* x, int64_t start_dim, int64_t end_dim) { + if (start_dim < 0) { + start_dim += x->nDims(); + } + if (end_dim < 0) { + end_dim += x->nDims(); + } + TORCH_CHECK( + start_dim >= 0 && start_dim < x->nDims(), + "Invalid start_dim ", + start_dim); + TORCH_CHECK( + end_dim >= 0 && end_dim < x->nDims(), "Invalid end_dim ", end_dim); + TORCH_CHECK(start_dim <= end_dim, "start_dim must be <= end_dim"); + + if (start_dim == end_dim) { + return x; + } + + auto out = IrBuilder::create( + x->container(), + x->domain()->flatten(start_dim, end_dim), + x->getDataType().value()); + + IrBuilder::create(out, x); + return out; +} + +TensorView* squeeze(TensorView* x, const std::vector& sizes) { + const auto ndims = static_cast(x->domain()->noReductions().size()); + + TORCH_INTERNAL_ASSERT( + ndims == sizes.size(), + "Invalid sizes for squeeze: ", + sizes, + ". Input tensor: ", + x->toString()); + + std::vector trivial_reduction_axes; + for (const auto idx : c10::irange(sizes.size())) { + if (sizes[idx] == 1) { + trivial_reduction_axes.push_back(idx); + } + } + return (trivial_reduction_axes.empty()) ? x + : sum(x, + trivial_reduction_axes, + false /* keep_dim */, + x->getDataType().value()); +} + +TensorView* squeeze(TensorView* x, const std::vector& sizes, int dim) { + const auto ndims = static_cast(x->domain()->noReductions().size()); + + TORCH_INTERNAL_ASSERT( + ndims == sizes.size(), + "Invalid sizes for squeeze: ", + sizes, + ". Input tensor: ", + x->toString()); + + if (dim < 0) { + dim = ndims + dim; + } + + TORCH_INTERNAL_ASSERT( + dim >= 0 && dim < ndims, + "Invalid position to squeeze: ", + dim, + ". 
Input tensor: ", + x->toString()); + + if (sizes[dim] == 1) { + return sum(x, {dim}, false /* keep_dim */, x->getDataType().value()); + } else { + return set(x); + } +} + +TensorView* unsqueeze(TensorView* x, int dim) { + const auto ndims = static_cast(x->domain()->noReductions().size()); + + if (dim < 0) { + dim = ndims + dim + 1; + } + + TORCH_INTERNAL_ASSERT( + dim >= 0 && dim <= ndims, + "Invalid position to unsqueeze: ", + dim, + ". Input tensor: ", + x->toString()); + + std::vector broadcast_axes(ndims + 1, false); + broadcast_axes[dim] = true; + return broadcast(x, broadcast_axes); +} + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/ops/alias.h b/torch/csrc/jit/codegen/cuda/ops/alias.h new file mode 100644 index 000000000000..f33a5a745a89 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/ops/alias.h @@ -0,0 +1,45 @@ +#pragma once + +#include + +#include +#include + +// +// The operations defined in this header is intended as user facing functions. +// The user will provide the necessary input TensorViews and the function will +// create the correct intermediate nodes and return the output TensorViews. +// + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +TORCH_CUDA_CU_API TensorView* view(TensorView* x, DataType dtype); + +TORCH_CUDA_CU_API TensorView* view( + TensorView* x, + const std::vector& original_sizes, + const std::vector& new_sizes); + +TORCH_CUDA_CU_API TensorView* flatten( + TensorView* x, + int64_t start_dim = 0, + int64_t end_dim = -1); + +TORCH_CUDA_CU_API TensorView* squeeze( + TensorView* x, + const std::vector& sizes); + +TORCH_CUDA_CU_API TensorView* squeeze( + TensorView* x, + const std::vector& sizes, + int dim); + +TORCH_CUDA_CU_API TensorView* unsqueeze(TensorView* x, int dim); + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/ops/all_ops.h b/torch/csrc/jit/codegen/cuda/ops/all_ops.h index 1ebd2bb87f1b..07d3eb944e89 100644 --- a/torch/csrc/jit/codegen/cuda/ops/all_ops.h +++ b/torch/csrc/jit/codegen/cuda/ops/all_ops.h @@ -1,4 +1,5 @@ #pragma once #include +#include #include #include diff --git a/torch/csrc/jit/codegen/cuda/ops/composite.cpp b/torch/csrc/jit/codegen/cuda/ops/composite.cpp index 06bcf2d0494a..08c58d2becb5 100644 --- a/torch/csrc/jit/codegen/cuda/ops/composite.cpp +++ b/torch/csrc/jit/codegen/cuda/ops/composite.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -8,9 +9,10 @@ namespace fuser { namespace cuda { ForwardDropoutResult dropout(TensorView* x, Val* prob) { - auto p1m = sub(new Double(1.), prob); - auto zero_check = add(eq(p1m, new Double(0.)), p1m); - auto scale = div(new Double(1.), zero_check); + auto p1m = sub(IrBuilder::create(x->container(), 1.), prob); + auto zero_check = + add(eq(p1m, IrBuilder::create(x->container(), 0.)), p1m); + auto scale = div(IrBuilder::create(x->container(), 1.), zero_check); return dropout(x, p1m, scale); } @@ -47,18 +49,6 @@ TensorView* dropout_backward(TensorView* dy, TensorView* mask, Val* scale) { return dx; } -Val* softplus(Val* x, Val* beta, Val* threshold) { - TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid."); - TORCH_INTERNAL_ASSERT(beta != nullptr, "Beta is invalid."); - TORCH_INTERNAL_ASSERT( - threshold != nullptr, "Threshold is not a valid Double."); - - auto op_beta = mul(x, beta); - auto maybe_result = div(log1p(exp(op_beta)), beta); - auto y = where(gt(op_beta, threshold), x, maybe_result); - 
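// (softplus(x) = log1p(exp(beta * x)) / beta; the where() falls back to
//  returning x once beta * x exceeds the threshold, where exp() would
//  overflow and softplus is effectively linear anyway.)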
return y; -} - LstmResult lstm( TensorView* prev_cell, TensorView* in_x, @@ -83,7 +73,53 @@ LstmResult lstm( return {cell, hidden}; } -Val* fast_gelu(Val* x) { +TensorView* softplus(TensorView* x, Val* beta, Val* threshold) { + TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid."); + TORCH_INTERNAL_ASSERT(beta != nullptr, "Beta is invalid."); + TORCH_INTERNAL_ASSERT( + threshold != nullptr, "Threshold is not a valid Double."); + + auto op_beta = mul(x, beta); + auto maybe_result = div(log1p(exp(op_beta)), beta); + auto y = where(gt(op_beta, threshold), x, maybe_result); + return y; +} + +TensorView* gelu(TensorView* x) { + TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid"); + + auto kappa = IrBuilder::create(x->container(), M_SQRT1_2); + auto half = IrBuilder::create(x->container(), 0.5); + auto one = IrBuilder::create(x->container(), 1.); + + auto cdf = mul(half, add(one, erf(mul(x, kappa)))); + auto y = mul(x, cdf); + return y; +} + +TensorView* gelu_backward(TensorView* dy, TensorView* x) { + TORCH_INTERNAL_ASSERT(dy != nullptr, "Grad Output is invalid."); + TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid"); + + constexpr double kAlpha = M_2_SQRTPI * M_SQRT1_2 * 0.5; + const double kHalf = 0.5; + + auto cdf_1 = mul(x, IrBuilder::create(x->container(), M_SQRT1_2)); + auto cdf_2 = erf(cdf_1); + auto cdf_3 = add(cdf_2, IrBuilder::create(x->container(), 1.)); + auto cdf_4 = mul(cdf_3, IrBuilder::create(x->container(), kHalf)); + + auto pdf_1 = mul(x, x); + auto pdf_2 = mul(pdf_1, IrBuilder::create(x->container(), -kHalf)); + auto pdf_3 = exp(pdf_2); + + auto out = addcmul( + cdf_4, x, pdf_3, IrBuilder::create(x->container(), kAlpha)); + auto dx = mul(out, dy); + return dx; +} + +TensorView* tanh_gelu(TensorView* x) { TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid"); constexpr double kBeta = M_SQRT2 * M_2_SQRTPI * 0.5; @@ -91,17 +127,18 @@ Val* fast_gelu(Val* x) { auto x_cube = mul(x, mul(x, x)); - auto inner_1 = mul(new Double(kKappa), x_cube); + auto inner_1 = mul(IrBuilder::create(x->container(), kKappa), x_cube); auto inner_2 = add(x, inner_1); - auto inner_3 = mul(new Double(kBeta), inner_2); + auto inner_3 = mul(IrBuilder::create(x->container(), kBeta), inner_2); auto tanh_inner = tanh(inner_3); - auto out = mul(x, add(new Double(1.), tanh_inner)); - auto y = mul(new Double(0.5), out); + auto out = + mul(x, add(IrBuilder::create(x->container(), 1.), tanh_inner)); + auto y = mul(IrBuilder::create(x->container(), 0.5), out); return y; } -Val* fast_gelu_backward(Val* dy, Val* x) { +TensorView* tanh_gelu_backward(TensorView* dy, TensorView* x) { TORCH_INTERNAL_ASSERT(dy != nullptr, "Grad Output is invalid."); TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid"); @@ -111,107 +148,51 @@ Val* fast_gelu_backward(Val* dy, Val* x) { auto x_sq = mul(x, x); auto x_cube = mul(x, x_sq); - auto inner_1 = mul(new Double(kKappa), x_cube); + auto inner_1 = mul(IrBuilder::create(x->container(), kKappa), x_cube); auto inner_2 = add(x, inner_1); - auto inner_3 = mul(new Double(kBeta), inner_2); + auto inner_3 = mul(IrBuilder::create(x->container(), kBeta), inner_2); auto tanh_inner = tanh(inner_3); - auto left = mul(new Double(0.5), x); - auto right = add(new Double(1.), tanh_inner); + auto left = mul(IrBuilder::create(x->container(), 0.5), x); + auto right = add(IrBuilder::create(x->container(), 1.), tanh_inner); - auto left_derivative = mul(new Double(0.5), right); + auto left_derivative = + mul(IrBuilder::create(x->container(), 0.5), right); auto tanh_inner_sq = mul(tanh_inner, 
tanh_inner); - auto tanh_derivative = sub(new Double(1), tanh_inner_sq); + auto tanh_derivative = + sub(IrBuilder::create(x->container(), 1), tanh_inner_sq); - auto constant_mul_x_sq = mul(new Double(kBeta * 3 * kKappa), x_sq); - auto inner_derivative = add(new Double(kBeta), constant_mul_x_sq); + auto constant_mul_x_sq = + mul(IrBuilder::create(x->container(), kBeta * 3 * kKappa), x_sq); + auto inner_derivative = + add(IrBuilder::create(x->container(), kBeta), constant_mul_x_sq); auto right_derivative = mul(left, mul(tanh_derivative, inner_derivative)); auto dx = mul(dy, add(left_derivative, right_derivative)); return dx; } -Val* gelu_backward(Val* dy, Val* x) { +TensorView* tanh_backward(TensorView* dy, TensorView* tanh_x) { TORCH_INTERNAL_ASSERT(dy != nullptr, "Grad Output is invalid."); - TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid"); - - constexpr double kAlpha = M_2_SQRTPI * M_SQRT1_2 * 0.5; - const double kHalf = 0.5; - - auto cdf_1 = mul(x, new Double(M_SQRT1_2)); - auto cdf_2 = erf(cdf_1); - auto cdf_3 = add(cdf_2, new Double(1.)); - auto cdf_4 = mul(cdf_3, new Double(kHalf)); + TORCH_INTERNAL_ASSERT(tanh_x != nullptr, "Input is invalid"); - auto pdf_1 = mul(x, x); - auto pdf_2 = mul(pdf_1, new Double(-kHalf)); - auto pdf_3 = exp(pdf_2); - - auto out = addcmul(cdf_4, x, pdf_3, new Double(kAlpha)); - auto dx = mul(out, dy); + auto one = IrBuilder::create(tanh_x->container(), 1.); + auto tanh_sq = mul(tanh_x, tanh_x); + auto sub_tanh_sq = sub(one, tanh_sq); + auto dx = mul(dy, sub_tanh_sq); return dx; } -namespace { - -//! Transform TensorView according to keep, merge, and split transformations. -//! Trivial reduction and broadcast transformations are handled separately. -//! It is recommend to use the composite ops view function, which will call -//! the analyzeView function to generate the appropriate transformations. -//! -//! For example: -//! original sizes = [2, 10, 40] -//! new_size = [2, 10, 2, 20] -//! auto analysis = analyzeView(TV0, original_sizes, new_sizes) -//! auto TV1 = TV0->view(analysis.transforms); -//! -//! Transforms = [(Keep I0), (Keep I1), (Split I2 by 2)] -//! Before: TV0[I0, I1, I2] -//! After: TV0[I0, I1, 2, ceilDiv(I2, 2)] -//! -TensorView* applyViewTransforms( - TensorView* tv, - const std::vector>& transforms) { - TORCH_INTERNAL_ASSERT( - !tv->hasComputeAt(), - "Cannot modify rfactor domain after compute at has been set."); - - TORCH_INTERNAL_ASSERT(tv->nDims() > 0, "Tried to view a 0-dim TensorView"); - +TensorView* view_as_real(TensorView* x) { + auto input_type = x->getDataType().value(); TORCH_CHECK( - !tv->domain()->hasRFactor(), - "Cannot call view on the same TensorView twice."); - - TORCH_INTERNAL_ASSERT(!transforms.empty()); - - TensorView* consumer = - new TensorView(tv->domain()->view(transforms), tv->getDataType().value()); - - new ViewOp(consumer, tv); - - return consumer; -} - -} // namespace - -TensorView* view( - TensorView* x, - const std::vector& original_sizes, - const std::vector& new_sizes) { - auto analyze_view = analyzeView(x, original_sizes, new_sizes); - - auto reduction = (!analyze_view.trivial_reduction_axes.empty()) - ? sum(x, analyze_view.trivial_reduction_axes) - : x; - - auto view = (!analyze_view.transforms.empty()) - ? applyViewTransforms(reduction, analyze_view.transforms) - : reduction; + isComplexType(input_type), + "Operand of view_as_real must have complex type"); - return (analyze_view.has_broadcast) - ? 
broadcast(view, analyze_view.broadcast_axes) - : view; + auto vec_type = getVectorType(getTypeFromComplexType(input_type), 2); + auto tv_vector = bitCastOp(vec_type, x); + return viewAsScalar(tv_vector); } } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/ops/composite.h b/torch/csrc/jit/codegen/cuda/ops/composite.h index 4470f0cc6f05..d73be9c469da 100644 --- a/torch/csrc/jit/codegen/cuda/ops/composite.h +++ b/torch/csrc/jit/codegen/cuda/ops/composite.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -31,8 +31,6 @@ TORCH_CUDA_CU_API TensorView* dropout_backward( TensorView* mask, Val* scale); -TORCH_CUDA_CU_API Val* softplus(Val* x, Val* beta, Val* threshold); - struct LstmResult { TensorView* cell = nullptr; TensorView* hidden = nullptr; @@ -45,14 +43,17 @@ TORCH_CUDA_CU_API LstmResult lstm( TensorView* cell_x, TensorView* out_x); -TORCH_CUDA_CU_API Val* fast_gelu(Val* x); -TORCH_CUDA_CU_API Val* fast_gelu_backward(Val* dy, Val* x); -TORCH_CUDA_CU_API Val* gelu_backward(Val* dy, Val* x); - -TORCH_CUDA_CU_API TensorView* view( +TORCH_CUDA_CU_API TensorView* softplus( TensorView* x, - const std::vector& x_sizes, - const std::vector& new_sizes); + Val* beta, + Val* threshold); +TORCH_CUDA_CU_API TensorView* gelu(TensorView* x); +TORCH_CUDA_CU_API TensorView* gelu_backward(TensorView* dy, TensorView* x); +TORCH_CUDA_CU_API TensorView* tanh_gelu(TensorView* x); +TORCH_CUDA_CU_API TensorView* tanh_gelu_backward(TensorView* dy, TensorView* x); +TORCH_CUDA_CU_API TensorView* tanh_backward(TensorView* dy, TensorView* tanh_x); + +TORCH_CUDA_CU_API TensorView* view_as_real(TensorView* x); } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/ops/normalization.cpp b/torch/csrc/jit/codegen/cuda/ops/normalization.cpp index 19201687553b..00b013bdc524 100644 --- a/torch/csrc/jit/codegen/cuda/ops/normalization.cpp +++ b/torch/csrc/jit/codegen/cuda/ops/normalization.cpp @@ -1,4 +1,5 @@ #include +#include #include namespace torch { @@ -6,6 +7,64 @@ namespace jit { namespace fuser { namespace cuda { +int nonNegativeAxis(int axis, int ndims) { + return (axis >= 0) ? 
axis : (ndims + axis); +} + +Val* numFeatures(TensorView* x, const std::vector& dims, int ndims) { + Val* num_features = IrBuilder::create(x->container(), 1); + for (const auto dim : dims) { + const int axis = nonNegativeAxis(dim, ndims); + num_features = mul(num_features, x->domain()->domain()[axis]->extent()); + } + return num_features; +} + +TensorView* mean(TensorView* x, const std::vector& dims, bool keepdim) { + TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid."); + + const int kNumberOfDims = + TensorDomain::noReductions(x->getMaybeRFactorDomain()).size(); + + auto sum_x = sum(x, dims, keepdim); + auto y = div(sum_x, numFeatures(x, dims, kNumberOfDims)); + return y; +} + +TensorView* variance( + TensorView* x, + const std::vector& dims, + bool unbiased, + bool keepdim) { + TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid."); + + const int kNumberOfDims = + TensorDomain::noReductions(x->getMaybeRFactorDomain()).size(); + + auto bcast_mean = mean(x, dims, true /* keepdim */); + auto x_mean_sub = sub(x, bcast_mean); + auto x_mean_sub_sq = mul(x_mean_sub, x_mean_sub); + auto sum_x_mean_sub_sq = sum(x_mean_sub_sq, dims, keepdim); + + auto num_features = numFeatures(x, dims, kNumberOfDims); + if (unbiased) { + num_features = + sub(num_features, IrBuilder::create(x->container(), 1.)); + } + auto y = div(sum_x_mean_sub_sq, num_features); + + return y; +} + +TensorView* standard_deviation( + TensorView* x, + const std::vector& dims, + bool unbiased, + bool keepdim) { + TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid."); + return sqrt(variance(x, dims, unbiased, keepdim)); +} + TensorView* softmax(TensorView* x, int dim) { TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid."); @@ -23,7 +82,7 @@ TensorView* softmax(TensorView* x, int dim) { auto exp_val = exp(x_max_sub); auto sum_exp = sum(exp_val, {kReductionAxis}); auto bcast_sum = broadcast(sum_exp, broadcast_mask); - auto y = div(exp_val, bcast_sum); + auto y = mul(exp_val, reciprocal(bcast_sum)); return y; } @@ -49,6 +108,45 @@ TensorView* softmax_backward(TensorView* dy, TensorView* y, int dim) { return dx; } +TensorView* log_softmax(TensorView* x, int dim) { + TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid."); + + const int kNumberOfDims = + TensorDomain::noReductions(x->getMaybeRFactorDomain()).size(); + const int kReductionAxis = (dim < 0) ? dim + kNumberOfDims : dim; + TORCH_INTERNAL_ASSERT(kReductionAxis >= 0 && kReductionAxis < kNumberOfDims); + + std::vector broadcast_mask(kNumberOfDims, false); + broadcast_mask[kReductionAxis] = true; + + auto max_val = max(x, {kReductionAxis}); + auto bcast_max = broadcast(max_val, broadcast_mask); + auto x_max_sub = sub(x, bcast_max); + auto exp_val = exp(x_max_sub); + auto bcast_sum = sum(exp_val, {kReductionAxis}, true /* keepdim */); + auto log_sum_exp = log(bcast_sum); + auto y = sub(x_max_sub, log_sum_exp); + + return y; +} + +TensorView* log_softmax_backward(TensorView* dy, TensorView* y, int dim) { + TORCH_INTERNAL_ASSERT(dy != nullptr, "Grad Output is invalid."); + TORCH_INTERNAL_ASSERT(y != nullptr, "Output is invalid."); + + const int kNumberOfDims = + TensorDomain::noReductions(y->getMaybeRFactorDomain()).size(); + const int kReductionAxis = (dim < 0) ? 
dim + kNumberOfDims : dim; + TORCH_INTERNAL_ASSERT(kReductionAxis >= 0 && kReductionAxis < kNumberOfDims); + + auto bcast_sum_grad = sum(dy, {kReductionAxis}, true /* keepdim */); + auto softmax = exp(y); + auto softmax_sum_mul = mul(softmax, bcast_sum_grad); + auto dx = sub(dy, softmax_sum_mul); + + return dx; +} + ForwardNormResult layer_norm( TensorView* x, const std::vector& norm_shape, @@ -58,18 +156,9 @@ ForwardNormResult layer_norm( return layer_norm(x, norm_shape.size(), weight, bias, eps); } -ForwardNormResult layer_norm( - TensorView* x, - const size_t kNormShapeNumDims, - TensorView* weight, - TensorView* bias, - Val* eps) { - TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid."); - TORCH_INTERNAL_ASSERT( - eps != nullptr && eps->getDataType().has_value() && - eps->getDataType().value() == DataType::Double, - "Epsilon (eps) is not a valid Double."); - +auto norm_properties_from_num_dims( + const TensorView* x, + const size_t kNormShapeNumDims) { // (B, C, H, W, D) tensor // norm_shape = [H, W, D] // M = outer = product of remaining dimensions = B * C @@ -81,28 +170,57 @@ ForwardNormResult layer_norm( std::vector outer_reduction_axes(kOuterNumDims); std::vector outer_broadcast_mask(kNumberOfDims, false); + std::vector inner_reduction_axes(kNormShapeNumDims); + std::vector inner_broadcast_mask(kNumberOfDims, false); + for (const auto idx : c10::irange(kOuterNumDims)) { outer_reduction_axes[idx] = idx; outer_broadcast_mask[idx] = true; } - std::vector inner_reduction_axes(kNormShapeNumDims); - std::vector inner_broadcast_mask(kNumberOfDims, false); - Val* num_features = new Double(1); + Val* num_features = IrBuilder::create(x->container(), 1); for (const auto idx : c10::irange(kNormShapeNumDims)) { const size_t axis = kNumberOfDims - 1 - idx; inner_reduction_axes[idx] = axis; inner_broadcast_mask[axis] = true; num_features = mul(num_features, x->domain()->domain()[axis]->extent()); } + struct result { + std::vector outer_reduction_axes; + std::vector outer_broadcast_mask; + std::vector inner_reduction_axes; + std::vector inner_broadcast_mask; + Val* num_features = nullptr; + } r; + r.outer_reduction_axes = outer_reduction_axes; + r.outer_broadcast_mask = outer_broadcast_mask; + r.inner_reduction_axes = inner_reduction_axes; + r.inner_broadcast_mask = inner_broadcast_mask; + r.num_features = num_features; + return r; +} + +ForwardNormResult layer_norm( + TensorView* x, + const size_t kNormShapeNumDims, + TensorView* weight, + TensorView* bias, + Val* eps) { + TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid."); + TORCH_INTERNAL_ASSERT( + eps != nullptr && eps->getDataType().has_value() && + eps->getDataType().value() == DataType::Double, + "Epsilon (eps) is not a valid Double."); + + auto r = norm_properties_from_num_dims(x, kNormShapeNumDims); // Main algorithm - auto welford_out = Welford(x, inner_reduction_axes); - auto mean_bcast = broadcast(welford_out.avg, inner_broadcast_mask); + auto welford_out = Welford(x, r.inner_reduction_axes); + auto mean_bcast = broadcast(welford_out.avg, r.inner_broadcast_mask); auto x_sub_mean = sub(x, mean_bcast); - auto var_sum_bcast = broadcast(welford_out.var_sum, inner_broadcast_mask); - auto var = div(var_sum_bcast, num_features); + auto var_sum_bcast = broadcast(welford_out.var_sum, r.inner_broadcast_mask); + auto var = mul(var_sum_bcast, reciprocal(r.num_features)); auto var_eps = add(var, eps); auto invstd = rsqrt(var_eps); @@ -110,19 +228,58 @@ ForwardNormResult layer_norm( // Optional: norm * weight if (weight != nullptr) { - auto 
weight_bcast = broadcast(weight, outer_broadcast_mask); + auto weight_bcast = broadcast(weight, r.outer_broadcast_mask); y = mul(y, weight_bcast); } // Optional: norm * weight + bias if (bias != nullptr) { - auto bias_bcast = broadcast(bias, outer_broadcast_mask); + auto bias_bcast = broadcast(bias, r.outer_broadcast_mask); y = add(y, bias_bcast); } return {y, mean_bcast, invstd}; } +ForwardRMSNormResult rms_norm( + TensorView* x, + const std::vector& norm_shape, + TensorView* weight, + Val* eps) { + return rms_norm(x, norm_shape.size(), weight, eps); +} + +ForwardRMSNormResult rms_norm( + TensorView* x, + const size_t kNormShapeNumDims, + TensorView* weight, + Val* eps) { + TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid."); + TORCH_INTERNAL_ASSERT( + eps != nullptr && eps->getDataType().has_value() && + eps->getDataType().value() == DataType::Double, + "Epsilon (eps) is not a valid Double."); + + auto r = norm_properties_from_num_dims(x, kNormShapeNumDims); + + // Main algorithm + auto var_sum = sum(mul(x, x), r.inner_reduction_axes); + auto var_sum_bcast = broadcast(var_sum, r.inner_broadcast_mask); + auto var = mul(var_sum_bcast, reciprocal(r.num_features)); + auto var_eps = add(var, eps); + auto invstd = rsqrt(var_eps); + + auto y = mul(x, invstd); + + // Optional: norm * weight + if (weight != nullptr) { + auto weight_bcast = broadcast(weight, r.outer_broadcast_mask); + y = mul(y, weight_bcast); + } + + return {y, invstd}; +} + BackwardNormResult layer_norm_backward( TensorView* dy, TensorView* x, @@ -137,55 +294,30 @@ BackwardNormResult layer_norm_backward( TORCH_INTERNAL_ASSERT(mean != nullptr, "Mean is invalid."); TORCH_INTERNAL_ASSERT(invstd != nullptr, "Inv std is invalid."); - // (B, C, H, W, D) tensor - // norm_shape = [H, W, D] - // M = outer = product of remaining dimensions = B * C - // N = reduction = product of norm_shape = H * W * D - // weight = bias = norm_shape tensor - const size_t kNumberOfDims = - TensorDomain::noReductions(x->getMaybeRFactorDomain()).size(); - const size_t kNormShapeNumDims = norm_shape.size(); - const size_t kOuterNumDims = kNumberOfDims - kNormShapeNumDims; - - std::vector outer_reduction_axes(kOuterNumDims); - std::vector outer_broadcast_mask(kNumberOfDims, false); - for (const auto idx : c10::irange(kOuterNumDims)) { - outer_reduction_axes[idx] = idx; - outer_broadcast_mask[idx] = true; - } - - std::vector inner_reduction_axes(kNormShapeNumDims); - std::vector inner_broadcast_mask(kNumberOfDims, false); - Val* num_features = new Double(1); - for (const auto idx : c10::irange(kNormShapeNumDims)) { - const size_t axis = kNumberOfDims - 1 - idx; - inner_reduction_axes[idx] = axis; - inner_broadcast_mask[axis] = true; - num_features = mul(num_features, x->domain()->domain()[axis]->extent()); - } + auto r = norm_properties_from_num_dims(x, norm_shape.size()); auto x_hat = mul(sub(x, mean), invstd); TensorView* grad_x_hat = nullptr; if (weight != nullptr) { - auto* bcast_weight = broadcast(weight, outer_broadcast_mask); + auto* bcast_weight = broadcast(weight, r.outer_broadcast_mask); grad_x_hat = mul(dy, bcast_weight); } else { grad_x_hat = dy; } - auto a = mul(num_features, grad_x_hat); + auto a = mul(r.num_features, grad_x_hat); - auto b = sum(grad_x_hat, inner_reduction_axes); - auto bcast_b = broadcast(b, inner_broadcast_mask); + auto b = sum(grad_x_hat, r.inner_reduction_axes); + auto bcast_b = broadcast(b, r.inner_broadcast_mask); auto c1 = mul(grad_x_hat, x_hat); - auto c2 = sum(c1, inner_reduction_axes); - auto bcast_c2 = 
broadcast(c2, inner_broadcast_mask); + auto c2 = sum(c1, r.inner_reduction_axes); + auto bcast_c2 = broadcast(c2, r.inner_broadcast_mask); auto c3 = mul(x_hat, bcast_c2); auto inner = sub(sub(a, bcast_b), c3); - auto reciprocal_size = reciprocal(num_features); + auto reciprocal_size = reciprocal(r.num_features); TensorView* dx = nullptr; if (output_mask[0]) { @@ -194,16 +326,65 @@ BackwardNormResult layer_norm_backward( TensorView* dw = nullptr; if (output_mask[1] && weight != nullptr) { - dw = sum(mul(dy, x_hat), outer_reduction_axes); + dw = sum(mul(dy, x_hat), r.outer_reduction_axes); } TensorView* db = nullptr; if (output_mask[2] && bias != nullptr) { - db = sum(dy, outer_reduction_axes); + db = sum(dy, r.outer_reduction_axes); } return {dx, dw, db}; } +BackwardRMSNormResult rms_norm_backward( + TensorView* dy, + TensorView* x, + const std::vector& norm_shape, + TensorView* invstd, + TensorView* weight, + const std::vector& output_mask) { + TORCH_INTERNAL_ASSERT(dy != nullptr, "Grad Output is invalid."); + TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid."); + TORCH_INTERNAL_ASSERT(invstd != nullptr, "Inv std is invalid."); + + auto r = norm_properties_from_num_dims(x, norm_shape.size()); + + auto x_hat = mul(x, invstd); + + TensorView* grad_x_hat = nullptr; + if (weight != nullptr) { + auto* bcast_weight = broadcast(weight, r.outer_broadcast_mask); + grad_x_hat = mul(dy, bcast_weight); + } else { + grad_x_hat = dy; + } + + auto a = mul(r.num_features, grad_x_hat); + + auto b = sum(grad_x_hat, r.inner_reduction_axes); + auto bcast_b = broadcast(b, r.inner_broadcast_mask); + + auto c1 = mul(grad_x_hat, x_hat); + auto c2 = sum(c1, r.inner_reduction_axes); + auto bcast_c2 = broadcast(c2, r.inner_broadcast_mask); + auto c3 = mul(x_hat, bcast_c2); + + auto inner = sub(sub(a, bcast_b), c3); + auto reciprocal_size = reciprocal(r.num_features); + + TensorView* dx = nullptr; + if (output_mask[0]) { + dx = mul(mul(reciprocal_size, invstd), inner); + } + + TensorView* dw = nullptr; + if (output_mask[1] && weight != nullptr) { + dw = sum(mul(dy, x_hat), r.outer_reduction_axes); + } + + return {dx, dw}; +} + ForwardNormResult batch_norm( TensorView* x, TensorView* weight, @@ -243,7 +424,7 @@ ForwardNormResult batch_norm( std::vector reduction_axes; std::vector broadcast_mask(kNumberOfDims, false); - Val* num_features = new Double(1); + Val* num_features = IrBuilder::create(x->container(), 1); for (const auto axis : c10::irange(kNumberOfDims)) { if (axis != c_axis) { @@ -267,22 +448,24 @@ ForwardNormResult batch_norm( kTraining, "When running stats are provided, batch stats should only be computed during training"); - auto rev_momentum = sub(new Double(1.0), momentum); + auto rev_momentum = + sub(IrBuilder::create(x->container(), 1.0), momentum); auto current_mean_hat = mul(welford_out.avg, momentum); auto mean_hat = mul(running_mean, rev_momentum); auto new_mean_hat = add(mean_hat, current_mean_hat); - auto num_feature_decrement = sub(num_features, new Int(1)); - auto unbiased_var = div(welford_out.var_sum, num_feature_decrement); + auto num_feature_decrement = sub(num_features, x->container()->oneVal()); + auto unbiased_var = + mul(welford_out.var_sum, reciprocal(num_feature_decrement)); auto current_var_hat = mul(unbiased_var, momentum); auto var_hat = mul(running_var, rev_momentum); auto new_var_hat = add(var_hat, current_var_hat); - // when inputs have been casted by parser. 
We want to alias the output to - // the pre-casted input, so we can still update running stats + // when inputs have been cast by parser. We want to alias the output to + // the pre-cast input, so we can still update running stats auto cast_to_input_dtype = [fusion]( - Val* casted_input, Val* aliased_output) { - auto unary_op = casted_input->definition(); + Val* cast_input, Val* aliased_output) { + auto unary_op = cast_input->definition(); TORCH_INTERNAL_ASSERT( unary_op->isA() && unary_op->as()->getUnaryOpType() == UnaryOpType::Cast, @@ -295,21 +478,18 @@ ForwardNormResult batch_norm( TORCH_INTERNAL_ASSERT( rm_dtype.has_value(), "Input running stats must have dtype defined"); - auto casted_output = castOp(*rm_dtype, aliased_output); + auto cast_output = castOp(*rm_dtype, aliased_output); - fusion->addOutput(casted_output); - fusion->aliasOutputToInput(casted_output, input_to_cast); + fusion->aliasOutputToInput(cast_output, input_to_cast); }; - if (fusion->hasInput(running_mean)) { - fusion->addOutput(new_mean_hat); + if (running_mean->isFusionInput()) { fusion->aliasOutputToInput(new_mean_hat, running_mean); } else { cast_to_input_dtype(running_mean, new_mean_hat); } - if (fusion->hasInput(running_var)) { - fusion->addOutput(new_var_hat); + if (running_var->isFusionInput()) { fusion->aliasOutputToInput(new_var_hat, running_var); } else { cast_to_input_dtype(running_var, new_var_hat); @@ -320,7 +500,7 @@ ForwardNormResult batch_norm( auto mean_bcast = broadcast(mean, broadcast_mask); auto x_sub_mean = sub(x, mean_bcast); - auto var = div(welford_out.var_sum, num_features); + auto var = mul(welford_out.var_sum, reciprocal(num_features)); auto var_eps = add(var, eps); invstd = rsqrt(var_eps); auto invstd_bcast = broadcast(invstd, broadcast_mask); @@ -414,19 +594,6 @@ BackwardNormResult batch_norm_backward( mean = broadcast(mean, broadcast_mask); - TensorView* weight_val = nullptr; - if (weight == nullptr) { - weight_val = TensorViewBuilder() - .ndims(kNumberOfDims) - .dtype(input->getDataType().value()) - .shape(std::vector(kNumberOfDims, 1)) - .build(); - new UnaryOp( - UnaryOpType::Set, weight_val->as(), (new Double(1.0))->as()); - } else { - weight_val = broadcast(weight, broadcast_mask); - } - auto norm = reciprocal(num_features); auto grad_output_sum = sum(grad_output, reduction_axes); @@ -435,7 +602,16 @@ BackwardNormResult batch_norm_backward( auto grad_mean = broadcast(mul(grad_output_sum, norm), broadcast_mask); auto proj_scale = broadcast(mul(mul(dot_p, norm), mul(invstd, invstd)), broadcast_mask); - auto grad_scale = mul(broadcast(invstd, broadcast_mask), weight_val); + TensorView* grad_scale = nullptr; + + if (weight == nullptr) { + grad_scale = + mul(broadcast(invstd, broadcast_mask), + IrBuilder::create(input->container(), 1)); + } else { + grad_scale = mul( + broadcast(invstd, broadcast_mask), broadcast(weight, broadcast_mask)); + } TensorView* grad_input = nullptr; if (kTraining) { @@ -466,7 +642,8 @@ ForwardNormResult instance_norm( TensorView* running_var, const bool kUseInputStats, Val* momentum, - Val* eps) { + Val* eps, + bool channels_last) { auto fusion = FusionGuard::getCurFusion(); TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid."); @@ -490,13 +667,13 @@ ForwardNormResult instance_norm( // N = reduction = H * W * D // weight = bias = C tensor const size_t kBatchDim = 0; - const size_t kChannelsDim = 1; const size_t kNumberOfDims = TensorDomain::noReductions(x->getMaybeRFactorDomain()).size(); + const size_t kChannelsDim = channels_last ? 
kNumberOfDims - 1 : 1; std::vector x_reduction_axes; std::vector x_broadcast_mask(kNumberOfDims, false); - Val* N = new Double(1); + Val* N = IrBuilder::create(x->container(), 1); for (const auto axis : c10::irange(kNumberOfDims)) { if (axis != kBatchDim && axis != kChannelsDim) { x_reduction_axes.push_back(axis); @@ -504,7 +681,7 @@ ForwardNormResult instance_norm( N = mul(N, x->domain()->domain()[axis]->extent()); } } - Val* B = new Double(1); + Val* B = IrBuilder::create(x->container(), 1); B = mul(B, x->domain()->domain()[kBatchDim]->extent()); std::vector channels_only_broadcast_mask(kNumberOfDims, false); @@ -523,29 +700,51 @@ ForwardNormResult instance_norm( // updating running mean and running var if (running_mean != nullptr && running_var != nullptr) { - auto rev_momentum = sub(new Double(1.0), momentum); + auto _running_mean = running_mean; + auto _running_var = running_var; + if (_running_mean->getDataType().value() == DataType::Half || + _running_mean->getDataType().value() == DataType::BFloat16) { + _running_mean = castOp(DataType::Float, _running_mean); + } + if (_running_var->getDataType().value() == DataType::Half || + _running_var->getDataType().value() == DataType::BFloat16) { + _running_var = castOp(DataType::Float, running_var); + } + auto rev_momentum = + sub(IrBuilder::create(x->container(), 1.0), momentum); auto current_mean_hat = mul(welford_out.avg, momentum); - auto mean_hat = mul(running_mean, rev_momentum); + auto mean_hat = mul(_running_mean, rev_momentum); auto new_mean_hat = add(mean_hat, current_mean_hat); // NS: static_cast to workaround VC++ error, see // https://godbolt.org/z/6Prd77xYs auto new_mean_sum = sum(new_mean_hat, {static_cast(kBatchDim)}); - auto new_mean_channels_only = div(new_mean_sum, B); - fusion->addOutput(new_mean_channels_only); + auto new_mean_channels_only = mul(new_mean_sum, reciprocal(B)); + if (running_mean->getDataType().value() == DataType::Half || + running_mean->getDataType().value() == DataType::BFloat16) { + new_mean_channels_only = + castOp(running_mean->getDataType().value(), new_mean_channels_only); + } + // fusion->addOutput(new_mean_channels_only); fusion->aliasOutputToInput(new_mean_channels_only, running_mean); - auto num_feature_decrement = sub(N, new Int(1)); - auto unbiased_var = div(welford_out.var_sum, num_feature_decrement); + auto num_feature_decrement = sub(N, x->container()->oneVal()); + auto unbiased_var = + mul(welford_out.var_sum, reciprocal(num_feature_decrement)); auto current_var_hat = mul(unbiased_var, momentum); - auto var_hat = mul(running_var, rev_momentum); + auto var_hat = mul(_running_var, rev_momentum); auto new_var_hat = add(var_hat, current_var_hat); // NS: static_cast to workaround VC++ error, see // https://godbolt.org/z/6Prd77xYs auto new_var_sum = sum(new_var_hat, {static_cast(kBatchDim)}); - auto new_var_channels_only = div(new_var_sum, B); - fusion->addOutput(new_var_channels_only); + auto new_var_channels_only = mul(new_var_sum, reciprocal(B)); + if (running_var->getDataType().value() == DataType::Half || + running_var->getDataType().value() == DataType::BFloat16) { + new_var_channels_only = + castOp(running_var->getDataType().value(), new_var_channels_only); + } + // fusion->addOutput(new_var_channels_only); fusion->aliasOutputToInput(new_var_channels_only, running_var); } @@ -553,7 +752,7 @@ ForwardNormResult instance_norm( auto mean_bcast = broadcast(mean, x_broadcast_mask); auto x_sub_mean = sub(x, mean_bcast); - auto var = div(welford_out.var_sum, N); + auto var = 
mul(welford_out.var_sum, reciprocal(N)); auto var_eps = add(var, eps); invstd = rsqrt(var_eps); auto invstd_bcast = broadcast(invstd, x_broadcast_mask); @@ -589,6 +788,121 @@ ForwardNormResult instance_norm( return {y, mean, invstd}; } +BackwardNormResult instance_norm_backward( + TensorView* input, + TensorView* grad_output, + TensorView* weight, + TensorView* running_mean, + TensorView* running_var, + TensorView* save_mean, + TensorView* save_invstd, + const bool kTraining, + Val* eps, + const std::vector& output_mask, + bool channels_last) { + TORCH_INTERNAL_ASSERT(input != nullptr, "Input is invalid."); + TORCH_INTERNAL_ASSERT(grad_output != nullptr, "Grad Output is invalid."); + TORCH_INTERNAL_ASSERT( + eps != nullptr && eps->getDataType().has_value() && + eps->getDataType().value() == DataType::Double, + "Epsilon (eps) is not a valid Double."); + + // (B, C, H, W, D) tensor + // M = outer = channels + // N = reduction = B * H * W * D + // weight = bias = (C) tensor + const size_t kNumberOfDims = + TensorDomain::noReductions(input->getMaybeRFactorDomain()).size(); + // channels last format means C dimension is at axis kNumberOfDims-1 at x / + // grad_out + const size_t b_axis = 0; // for clarity + const size_t c_axis = channels_last ? kNumberOfDims - 1 : 1; + + std::vector reduction_axes; + std::vector broadcast_mask(kNumberOfDims, false); + // weight has its own broadcast mask as it is broadcast for the batch unlike + // mean/var + std::vector weight_broadcast_mask(kNumberOfDims, false); + Val* num_features = nullptr; + for (const auto axis : c10::irange(kNumberOfDims)) { + if (axis != c_axis) { + weight_broadcast_mask[axis] = true; + if (axis != b_axis) { + reduction_axes.push_back(axis); + broadcast_mask[axis] = true; + if (num_features == nullptr) { + num_features = castOp( + DataType::Double, input->domain()->domain()[axis]->extent()); + } else { + num_features = + mul(num_features, input->domain()->domain()[axis]->extent()); + } + } + } + } + + auto mean = save_mean; + auto invstd = save_invstd; + if (kTraining) { + TORCH_INTERNAL_ASSERT( + save_mean != nullptr && save_invstd != nullptr, + "When training=True, save_mean and save_invstd are required."); + } else { + mean = running_mean; + invstd = rsqrt(add(running_var, eps)); + } + mean = broadcast(mean, broadcast_mask); + + auto norm = reciprocal(num_features); + + auto grad_output_sum = sum(grad_output, reduction_axes); + auto dot_p = sum(mul(grad_output, sub(input, mean)), reduction_axes); + + auto grad_mean = broadcast(mul(grad_output_sum, norm), broadcast_mask); + + auto proj_scale = + broadcast(mul(mul(dot_p, norm), mul(invstd, invstd)), broadcast_mask); + + TensorView* grad_scale = nullptr; + + if (weight == nullptr) { + grad_scale = + mul(broadcast(invstd, broadcast_mask), + IrBuilder::create(input->container(), 1)); + } else { + grad_scale = + mul(broadcast(invstd, broadcast_mask), + broadcast(weight, weight_broadcast_mask)); + } + + TensorView* grad_input = nullptr; + if (kTraining) { + auto proj = mul(sub(input, mean), proj_scale); + grad_input = mul(sub(sub(grad_output, proj), grad_mean), grad_scale); + } else { + grad_input = mul(grad_output, grad_scale); + } + + TensorView* grad_weight = nullptr; + TensorView* grad_weight_reduced = nullptr; + if (output_mask[1]) { + grad_weight = mul(dot_p, invstd); + // TODO: grad weight needs to be reduced across batch-dim but is this the + // most efficient place or can reduction happen earlier? 
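// Note: instance norm keeps per-sample statistics, so reduction_axes above
// exclude both the batch and channel axes, and dot_p / grad_output_sum still
// carry a batch axis alongside the channel axis. weight and bias are
// per-channel parameters shared across the batch, so their gradients are
// accumulated over the batch dimension, which is what the sum over axis {0}
// below does.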
+ grad_weight_reduced = sum(grad_weight, {0}); + } + + TensorView* grad_bias = nullptr; + TensorView* grad_bias_reduced = nullptr; + if (output_mask[2]) { + grad_bias = grad_output_sum; + // TODO: same as above for grad weight + grad_bias_reduced = sum(grad_bias, {0}); + } + + return {grad_input, grad_weight_reduced, grad_bias_reduced}; +} + } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/ops/normalization.h b/torch/csrc/jit/codegen/cuda/ops/normalization.h index dae58462b929..74d8cc4ab650 100644 --- a/torch/csrc/jit/codegen/cuda/ops/normalization.h +++ b/torch/csrc/jit/codegen/cuda/ops/normalization.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -28,6 +28,33 @@ struct BackwardNormResult { TensorView* grad_bias = nullptr; }; +struct ForwardRMSNormResult { + TensorView* output = nullptr; + TensorView* invstd = nullptr; +}; + +struct BackwardRMSNormResult { + TensorView* grad_input = nullptr; + TensorView* grad_weight = nullptr; +}; + +TORCH_CUDA_CU_API TensorView* mean( + TensorView* x, + const std::vector& dims, + bool keepdim); + +TORCH_CUDA_CU_API TensorView* variance( + TensorView* x, + const std::vector& dims, + bool unbiased, + bool keepdim); + +TORCH_CUDA_CU_API TensorView* standard_deviation( + TensorView* x, + const std::vector& dims, + bool unbiased, + bool keepdim); + TORCH_CUDA_CU_API TensorView* softmax(TensorView* x, int dim); TORCH_CUDA_CU_API TensorView* softmax_backward( @@ -35,6 +62,13 @@ TORCH_CUDA_CU_API TensorView* softmax_backward( TensorView* y, const int dim); +TORCH_CUDA_CU_API TensorView* log_softmax(TensorView* x, int dim); + +TORCH_CUDA_CU_API TensorView* log_softmax_backward( + TensorView* dy, + TensorView* y, + const int dim); + TORCH_CUDA_CU_API ForwardNormResult layer_norm( TensorView* x, const std::vector& norm_shape, @@ -49,6 +83,18 @@ TORCH_CUDA_CU_API ForwardNormResult layer_norm( TensorView* bias, Val* eps); +TORCH_CUDA_CU_API ForwardRMSNormResult rms_norm( + TensorView* x, + const std::vector& norm_shape, + TensorView* weight, + Val* eps); + +TORCH_CUDA_CU_API ForwardRMSNormResult rms_norm( + TensorView* x, + const size_t kNormShapeNumDims, + TensorView* weight, + Val* eps); + TORCH_CUDA_CU_API BackwardNormResult layer_norm_backward( TensorView* dy, TensorView* x, @@ -59,6 +105,14 @@ TORCH_CUDA_CU_API BackwardNormResult layer_norm_backward( TensorView* bias, const std::vector& output_mask); +TORCH_CUDA_CU_API BackwardRMSNormResult rms_norm_backward( + TensorView* dy, + TensorView* x, + const std::vector& norm_shape, + TensorView* rstd, + TensorView* weight, + const std::vector& output_mask); + TORCH_CUDA_CU_API ForwardNormResult batch_norm( TensorView* x, TensorView* weight, @@ -89,9 +143,23 @@ TORCH_CUDA_CU_API ForwardNormResult instance_norm( TensorView* bias, TensorView* running_mean, TensorView* running_var, - const bool kUseInputStats, + const bool kUseInputStats, // kTraining? 
Val* momentum, - Val* eps); + Val* eps, + bool channels_last = false); + +TORCH_CUDA_CU_API BackwardNormResult instance_norm_backward( + TensorView* x, + TensorView* dy, + TensorView* weight, + TensorView* running_mean, + TensorView* running_var, + TensorView* save_mean, + TensorView* save_invstd, + const bool kTraining, + Val* eps, + const std::vector& output_mask, + bool channels_last = false); } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/parallel_dimension_map.cpp b/torch/csrc/jit/codegen/cuda/parallel_dimension_map.cpp index 3dcb58335a44..fd468a8b792e 100644 --- a/torch/csrc/jit/codegen/cuda/parallel_dimension_map.cpp +++ b/torch/csrc/jit/codegen/cuda/parallel_dimension_map.cpp @@ -5,8 +5,6 @@ #include #include #include -#include -#include #include #include @@ -45,28 +43,22 @@ void ParallelDimensionMap::build(Fusion* fusion) { } void ParallelDimensionMap::registerConstantExtent(IterDomain* id) { - ExpressionEvaluator ee(id->fusion()); - auto extent_int = ee.evaluate(id->extent()); - if (!extent_int.has_value()) { + if (!id->extent()->isConstScalar()) { // Nothing to do if not constant return; } - auto const_extent = extent_int.value(); + ExpressionEvaluator ee(id->fusion()); + auto extent_int = ee.evaluate(id->extent()); + TORCH_INTERNAL_ASSERT( + extent_int.has_value(), + "Extent of ", + id->toString(), + " should have been constant, but could not be evaluated at compile time."); - // Ignore if this is derived from a size-1 domain as it is likely a - // size-1 broadcast domain and that does not represent the actual - // dimension even if it's constant. Being size-1 may not always mean - // it's a broadcast domain, but it'd be safe to assume it is mostly - // the case. If it is not a broadcast, ignoring this domain does not - // impact the correctness. 
- auto extent_inputs = InputsOf::output(id->fusion(), id->extent()); - if (std::any_of(extent_inputs.begin(), extent_inputs.end(), [](Val* input) { - return input->isOneInt(); - })) { - return; - } + auto const_extent = extent_int.value(); + // Uses index map auto concrete_id = getCAMappedConcreteDomain(id); auto existing_it = constant_extent_map_.find(id); @@ -101,25 +93,21 @@ void ParallelDimensionMap::populateDimensionMapWithSingleCASet( const std::unordered_set& dom_set) { TORCH_INTERNAL_ASSERT(dom_set.size() == 1); - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - // pt is used by only one concrete domain auto id = *dom_set.begin(); auto it = constant_extent_map_.find(id); if (it != constant_extent_map_.end()) { - if (it->second.size() == 1) { - dim_map_.insert({pt, ir_builder.create(*(it->second.begin()))}); - exact_types_.insert(pt); - } else { - // Multiple constant dimensions found; Use the corresponding - // symbolic parallel dim - dim_map_.insert({pt, kir::NamedScalar::getParallelDim(pt)}); - } + TORCH_INTERNAL_ASSERT( + it->second.size() == 1, + "Only one value found mapped to parallel type ", + stringifyThread(pt), + " yet its bound to multiple extents."); + dim_map_.insert({pt, IrBuilder::create(*(it->second.begin()))}); + exact_types_.insert(pt); } else { // Prefer to use blockDim/gridDim if not constant - dim_map_.insert({pt, kir::NamedScalar::getParallelDim(pt)}); + dim_map_.insert({pt, NamedScalar::getParallelDim(pt)}); exact_types_.insert(pt); } } @@ -129,12 +117,9 @@ void ParallelDimensionMap::populateDimensionMapWithMultipleCASet( const std::unordered_set& dom_set) { TORCH_INTERNAL_ASSERT(dom_set.size() > 1); - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - bool all_equal = true; // Use nullptr to signal it's not initialied yet - kir::Val* known_dimension = nullptr; + Val* known_dimension = nullptr; // Use -1 to signal it's not initialied yet int64_t known_const = -1; @@ -172,7 +157,7 @@ void ParallelDimensionMap::populateDimensionMapWithMultipleCASet( // At this point, it still remains undetermined whether this id // matches with those previously looked at. Constant check failed, // but symbolic matching may succeed. - auto this_dimension = gpu_lower->lowerValue(concrete_id->extent()); + auto this_dimension = concrete_id->extent(); if (known_dimension == nullptr) { // No previous dimension found yet known_dimension = this_dimension; @@ -191,21 +176,22 @@ void ParallelDimensionMap::populateDimensionMapWithMultipleCASet( } // Use the const value, if found, as its dimension if (all_equal && known_const != -1) { - dim_map_.insert({pt, ir_builder.create(known_const)}); + dim_map_.insert({pt, IrBuilder::create(known_const)}); } else { - dim_map_.insert({pt, kir::NamedScalar::getParallelDim(pt)}); + dim_map_.insert({pt, NamedScalar::getParallelDim(pt)}); } } void ParallelDimensionMap::adjustMappingsForWarpPadding() { const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); // If TIDx is padded to a multiple of the warp size, mark it as // non-exact. 
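// Rough intuition: padding rounds the number of TIDx threads up to a multiple
// of the warp size (e.g. an extent of 50 gets launched with blockDim.x = 64
// when the warp size is 32), so blockDim.x can exceed the extent of the
// mapped domain. The checks below keep the mapping exact only when the
// constant extent is already a warp-size multiple, or when TIDx is bound
// directly to blockDim.x itself.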
auto& warp_info = gpu_lower->getWarpPaddedParallelInfo(); - if (!warp_info.is_tidx_padded) { + // TIDx isn't really padded if there isn't a warp reduction (this could + // change) + if (!(warp_info.is_tidx_padded && warp_info.has_warp_reduction)) { return; } @@ -215,7 +201,7 @@ void ParallelDimensionMap::adjustMappingsForWarpPadding() { // If the dimension of TIDx is actually a multple of the warp size // before padding, it can be left as exact if (isExact(tidx_pt)) { - auto tidx_dim = dynamic_cast(get(tidx_pt)); + auto tidx_dim = dynamic_cast(get(tidx_pt)); if (tidx_dim && tidx_dim->isConst()) { auto tidx_dim_val = tidx_dim->value().value(); if (tidx_dim_val % warp_size == 0) { @@ -223,23 +209,36 @@ void ParallelDimensionMap::adjustMappingsForWarpPadding() { return; } } + // If tidx is strictly defined as blockDim.x then it must be set to a + // multiple of the warp and can be considered exact + bool tidx_def_trivial = true; + for (auto entry : concrete_dom_map_.at(tidx_pt)) { + if (!entry->isA() || + !entry->as()->sameAs( + NamedScalar::getParallelDim(tidx_pt))) { + tidx_def_trivial = false; + } + } + if (tidx_def_trivial) { + return; + } } // TIDx is padded to a multiple of warp. If it's known to be a // single warp, use the constant warp size as the dimension of - // TIDx. Otherwise, jsut use blockDim.x. + // TIDx. Otherwise, just use blockDim.x. if (warp_info.is_tidx_single_warp) { - dim_map_.at(ParallelType::TIDx) = ir_builder.create(warp_size); + dim_map_.at(ParallelType::TIDx) = IrBuilder::create(warp_size); } else { dim_map_.at(ParallelType::TIDx) = - kir::NamedScalar::getParallelDim(ParallelType::TIDx); + NamedScalar::getParallelDim(ParallelType::TIDx); } // TIDx is no longer exact exact_types_.erase(ParallelType::TIDx); } -kir::Val* ParallelDimensionMap::get(ParallelType pt) const { +Val* ParallelDimensionMap::get(ParallelType pt) const { TORCH_INTERNAL_ASSERT(isParallelTypeThread(pt), "Invalid ParallelType: ", pt); auto it = dim_map_.find(pt); if (it == dim_map_.end()) { @@ -254,14 +253,13 @@ bool ParallelDimensionMap::isExact(ParallelType pt) const { } IterDomain* ParallelDimensionMap::getCAMappedConcreteDomain(IterDomain* id) { - const auto gpu_lower = GpuLower::current(); - const auto& ca_map = gpu_lower->caIndexMap(); - return ca_map.getConcreteMappedID(id); + return GpuLower::current()->caMap()->getConcreteMappedID( + id, IdMappingMode::EXACT); } // Symbolically compares equality of two KIR vals. Comparison is done // conservatively, so returning false does not guarantee non-equality. 
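// For illustration, the checks below treat two extents as equal when, for
// example, they are the same Val object, both are constant Ints holding the
// same value, both are NamedScalars with the same name (e.g. "blockDim.x"),
// or both are defined by an op of the same kind (say a ceilDiv by the same
// factor) whose inputs in turn compare equal under these same rules.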
-bool ParallelDimensionMap::equalDim(kir::Val* dim1, kir::Val* dim2) { +bool ParallelDimensionMap::equalDim(Val* dim1, Val* dim2) { TORCH_INTERNAL_ASSERT(dim1 != nullptr && dim2 != nullptr); if (dim1 == dim2) { @@ -269,8 +267,8 @@ bool ParallelDimensionMap::equalDim(kir::Val* dim1, kir::Val* dim2) { } // When Both are Int, they are same if both have the same constant - auto dim1_int = dynamic_cast(dim1); - auto dim2_int = dynamic_cast(dim2); + auto dim1_int = dynamic_cast(dim1); + auto dim2_int = dynamic_cast(dim2); if (dim1_int && dim2_int) { if (dim1_int->isConst() && dim2_int->isConst()) { return dim1_int->value() == dim2_int->value(); @@ -279,8 +277,8 @@ bool ParallelDimensionMap::equalDim(kir::Val* dim1, kir::Val* dim2) { // When both are NamedScalar, they are same if Both have the same // name - auto dim1_ns = dynamic_cast(dim1); - auto dim2_ns = dynamic_cast(dim2); + auto dim1_ns = dynamic_cast(dim1); + auto dim2_ns = dynamic_cast(dim2); if (dim1_ns && dim2_ns) { return dim1_ns->name() == dim2_ns->name(); } @@ -297,12 +295,19 @@ bool ParallelDimensionMap::equalDim(kir::Val* dim1, kir::Val* dim2) { // If both are BinaryOp or UnaryOp, check their inputs. Since these // Vals are IterDomain extents, UnaryOp should not occur, but // checking shouldn't be harmful. - if ((dim1_def->isA() && dim2_def->isA() && - (dim1_def->as()->operation() == - dim2_def->as()->operation())) || - (dim1_def->isA() && dim2_def->isA() && - (dim1_def->as()->operation() == - dim2_def->as()->operation()))) { + // TODO: + // We might be able to replace this with dim1->toInlineString() == + // dim2->toInlineString() + // If we want this less conservative we could make an "exact map" which + // could be another mode in compute at that maps all iter domains, but not + // concretized broadcast axes and only forwards through non-concretized + // broadcast axes. + if ((dim1_def->isA() && dim2_def->isA() && + (dim1_def->as()->getBinaryOpType() == + dim2_def->as()->getBinaryOpType())) || + (dim1_def->isA() && dim2_def->isA() && + (dim1_def->as()->getUnaryOpType() == + dim2_def->as()->getUnaryOpType()))) { for (const auto i : c10::irange(dim1_def->inputs().size())) { (void)i; // Suppress unused variable warning if (!equalDim(dim1_def->inputs()[0], dim2_def->inputs()[0])) { @@ -321,7 +326,7 @@ std::string ParallelDimensionMap::toString() const { ss << pt << ": "; auto dim = get(pt); if (dim != nullptr) { - ss << kir::toString(dim); + ss << dim->toString(); if (isExact(pt)) { ss << ", exact"; } else { diff --git a/torch/csrc/jit/codegen/cuda/parallel_dimension_map.h b/torch/csrc/jit/codegen/cuda/parallel_dimension_map.h index d05c17adea29..03bd513396f9 100644 --- a/torch/csrc/jit/codegen/cuda/parallel_dimension_map.h +++ b/torch/csrc/jit/codegen/cuda/parallel_dimension_map.h @@ -21,7 +21,7 @@ class TORCH_CUDA_CU_API ParallelDimensionMap { //! Returns the dimension of a ParallelType. nullptr is returned if //! a ParallelType is unused. - kir::Val* get(ParallelType pt) const; + Val* get(ParallelType pt) const; //! True if the dimension of a ParallelType is known to be exact bool isExact(ParallelType pt) const; @@ -29,7 +29,7 @@ class TORCH_CUDA_CU_API ParallelDimensionMap { std::string toString() const; //! Symbolically analyze if two extent vals are equal - static bool equalDim(kir::Val* dim1, kir::Val* dim2); + static bool equalDim(Val* dim1, Val* dim2); private: //! Register the extent of an IterDomain if its constant @@ -54,7 +54,7 @@ class TORCH_CUDA_CU_API ParallelDimensionMap { private: //! 
Maps from parallel types to dimensions, which are constant if //! a unique value is found. - std::unordered_map dim_map_; + std::unordered_map dim_map_; //! Set of parallel types whose dimensions are identified to be //! exactly the same as extents of mapped domains. std::unordered_set exact_types_; diff --git a/torch/csrc/jit/codegen/cuda/parallel_type_bitmap.h b/torch/csrc/jit/codegen/cuda/parallel_type_bitmap.h index 0bf8ae39277b..642017a3c097 100644 --- a/torch/csrc/jit/codegen/cuda/parallel_type_bitmap.h +++ b/torch/csrc/jit/codegen/cuda/parallel_type_bitmap.h @@ -1,8 +1,9 @@ #pragma once -#include +#include #include +#include #include #include #include @@ -160,6 +161,20 @@ class ParallelTypeBitmap { *this |= ParallelTypeBitmap(kBIDBits); } + //! Clear all of the TID flags + void clearAllTID() { + auto tid_bits = ParallelTypeBitmap(kTIDBits); + auto not_tid_bits = ~tid_bits; + *this &= not_tid_bits; + } + + //! Clear all of the BID flags + void clearAllBID() { + auto bid_bits = ParallelTypeBitmap(kBIDBits); + auto not_bid_bits = ~bid_bits; + *this &= not_bid_bits; + } + //! Get an iterator to traverse set types Iterator begin() const { return Iterator::begin(*this); @@ -271,6 +286,52 @@ inline ParallelTypeBitmap::Iterator ParallelTypeBitmap::Iterator::end( return Iterator(map, kOffsetEnd); } +//! Map from ParallelType to template type T +template +class ParallelTypeMap { + public: + ParallelTypeMap() = default; + + ParallelTypeMap(const T& init) { + std::fill(map_.begin(), map_.end(), init); + } + + T& operator[](ParallelType pt) { + return map_[getParallelTypeBitMapOffset(pt)]; + } + + const T& operator[](ParallelType pt) const { + return map_[getParallelTypeBitMapOffset(pt)]; + } + + T& at(ParallelType pt) { + return map_.at(getParallelTypeBitMapOffset(pt)); + } + + const T& at(ParallelType pt) const { + return map_.at(getParallelTypeBitMapOffset(pt)); + } + + auto begin() { + return map_.begin(); + } + + auto begin() const { + return map_.begin(); + } + + auto end() { + return map_.begin(); + } + + auto end() const { + return map_.begin(); + } + + private: + std::array map_; +}; + } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/parser.cpp b/torch/csrc/jit/codegen/cuda/parser.cpp index a33b33895c5b..187230dd6758 100644 --- a/torch/csrc/jit/codegen/cuda/parser.cpp +++ b/torch/csrc/jit/codegen/cuda/parser.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -12,6 +13,8 @@ #include #include +#include + #include #include @@ -26,30 +29,29 @@ namespace cuda { constexpr auto kNumUnaryOps = 10; constexpr auto kNumUnaryFloatOps = 23; +constexpr auto kNumUnaryIsOps = 6; constexpr auto kNumBinaryFloatOps = 3; constexpr auto kNumBinaryComparisonOps = 12; -constexpr auto kNumBinaryCastOps = 14; +constexpr auto kNumBinaryCastOps = 19; -constexpr auto kNumBinaryOpsWithAlpha = 4; +constexpr auto kNumBinaryOpsWithAlpha = 6; constexpr auto kNumLerpOps = 2; constexpr auto kNumLayernormFwd = 2; constexpr auto kNumBatchnormFwd = 3; +constexpr auto kNumBatchnormBwd = 2; constexpr auto kNumInstancenormFwd = 1; constexpr auto kNumSumToSize = 2; constexpr auto kNumAutocastOps = 2; -// constexpr auto kNumViewSize = 2; +constexpr auto kNumAliasDimOps = 2; +constexpr auto kNumViewOps = 2; +constexpr auto kNumVarOps = 2; +constexpr auto kNumSoftmaxFwd = 2; +constexpr auto kNumSoftmaxBwd = 2; +constexpr auto kNumAminAmaxOps = 2; namespace { -std::vector getTensorSizes(TensorTypePtr const& tensor_type) { - 
TORCH_INTERNAL_ASSERT(tensor_type != nullptr, "Input must be a Tensor."); - auto optional_sizes = tensor_type->sizes().concrete_sizes(); - TORCH_INTERNAL_ASSERT( - optional_sizes.has_value(), "Missing size information for the tensor."); - return optional_sizes.value(); -} - #define REGISTER_PARSE_RULE(op, func_body, ...) \ registerParseRule( \ op, \ @@ -57,15 +59,53 @@ std::vector getTensorSizes(TensorTypePtr const& tensor_type) { -> void func_body, \ __VA_ARGS__) -const auto& sizeAttr = Symbol::attr("profiled_size"); +const auto& reductionSizeAttr = Symbol::attr("profiled_reduction_size"); +const auto& viewSizeAttr = Symbol::attr("profiled_view_size"); const auto& intListAttr = Symbol::attr("profiled_int_list"); const auto& intAttr = Symbol::attr("profiled_int"); const auto& boolListAttr = Symbol::attr("profiled_bool_list"); const auto& boolAttr = Symbol::attr("profiled_bool"); +const auto& strAttr = Symbol::attr("profiled_str"); +const auto& ivalAttr = Symbol::attr("profiled_ival"); +const auto& profileFailedAttr = Symbol::attr("profile_failed"); typedef Val* CgValue; typedef Expr* CgOp; +bool isReductionNonCompatibleTensor( + const std::shared_ptr& tensor_type) { + return is_zero_dim_tensor(tensor_type) || is_zero_sized_tensor(tensor_type); +} + +bool isInputNonSizeZeroTensor(const Node* node) { + for (const auto& val : node->inputs()) { + auto tensor_type = val->type()->cast(); + if (tensor_type && is_zero_sized_tensor(tensor_type)) { + return false; + } + } + return true; +} + +bool isScalarTypeCompatible(const Node* node, size_t offset) { + auto val = node->input(offset); + // return true if it's not specified + if (val->type()->isSubtypeOf(static_cast(NoneType::get()))) { + return true; + } + // return false if it's runtime value + if (val->node()->kind() != prim::Constant) { + return false; + } + auto dtype = toIValue(val)->toScalarType(); + + // we do NOT support half math type yet + if (dtype == at::ScalarType::Half || dtype == at::ScalarType::BFloat16) { + return false; + } + return true; +} + // Note [ Permutation Bookkeeping and Propagation in Parser ] // // The goal in supporting permutation propagation in parser is to: @@ -120,17 +160,33 @@ struct MemoryFormat { // e.g. for an channels-last tensor, permutation_ would be (n-1)123...(n-2); // Note: we are omitting the leading '0' when applicable, and apparently this // encoding only works with rank < 10 + // see [ Note: MemoryFormat and Stride Order ] size_t permutation_ = 0; // default to non-permuted tensor MemoryFormat() = default; + // [ Note: MemoryFormat and Stride Order ] // stride_order is extracted from // `TensorType::stride_properties()::stride_index_`, it describes the // index of axes from fastest to slowest. + // or a 4d tensor, if we have stride_order = {x0, x1, x2, x3}, The i-th + // fastest dimension would be stride_order[i]. + // // Look at comment for c10::Stride in aten/src/ATen/core/jit_type.h - // e.g. for rank 4 non-permuted tensor, stride_order would be {3, 2, 1, 0} - // for rank 4 channels last tensor, stride_order would be {1, 3, 2, 0} + // + // eg0. for rank 4 non-permuted tensor, stride_order would be {3, 2, 1, 0}, it + // means the fastest dimension is axis-3. the next one would be 2, e.t.c.. So + // it's a non-permuted tensor. + // it should be encoded as permutation_ = 3210 (we special case it to 0) + // + // eg1. for rank 4 channels-last tensor, stride_order would be {1, 3, 2, 0}, + // it means the fastest dimension is axis-1. the next one would be 3, and then + // 2, and then 0. 
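The permutation_ encoding described in [ Note: MemoryFormat and Stride Order ] packs the stride order (fastest to slowest axis) into a single base-10 integer, special-casing the non-permuted order to 0 and therefore only supporting rank < 10. A small self-contained sketch of the encode/decode round trip, written as free functions that mirror setPermutation and the toStrideOrder decoding that follows in the diff (not the actual MemoryFormat members):

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

// Encode a stride order (fastest to slowest axis indices) into a base-10
// integer, mirroring MemoryFormat::setPermutation. A non-permuted order
// collapses to 0. Only valid for rank < 10.
size_t encodePermutation(const std::vector<int>& stride_order) {
  const int rank = static_cast<int>(stride_order.size());
  size_t permutation = 0;
  bool has_permutation = false;
  for (int i = 0; i < rank; ++i) {
    permutation = permutation * 10 + stride_order[i];
    if (stride_order[i] != rank - 1 - i) {
      has_permutation = true;
    }
  }
  return has_permutation ? permutation : 0;
}

// Decode back to a stride order, mirroring toStrideOrder. Since a leading
// '0' digit is dropped by the encoding, it is re-appended if never seen.
std::vector<int> decodePermutation(size_t permutation) {
  std::vector<int> stride_order;
  if (permutation == 0) {
    return stride_order; // empty == non-permuted
  }
  bool encountered_zero = false;
  while (permutation != 0) {
    const int order = static_cast<int>(permutation % 10);
    permutation /= 10;
    if (order == 0) {
      encountered_zero = true;
    }
    stride_order.push_back(order);
  }
  if (!encountered_zero) {
    stride_order.push_back(0);
  }
  std::reverse(stride_order.begin(), stride_order.end());
  return stride_order;
}

int main() {
  // eg0: rank-4 contiguous {3, 2, 1, 0} -> 0 (special case)
  assert(encodePermutation({3, 2, 1, 0}) == 0);
  // eg1: rank-4 channels-last {1, 3, 2, 0} -> 1320
  assert(encodePermutation({1, 3, 2, 0}) == 1320);
  assert((decodePermutation(1320) == std::vector<int>{1, 3, 2, 0}));
  // eg2: {0, 3, 2, 1} -> 321 (leading '0' omitted), decodes back correctly
  assert(encodePermutation({0, 3, 2, 1}) == 321);
  assert((decodePermutation(321) == std::vector<int>{0, 3, 2, 1}));
  return 0;
}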
So this is a channels last tensor (NCHW). + // it will be encoded as permutation_ = 1320 + // + // eg2. for a rank 4 permuted tensor, stride_order can be {0, 3, 2, 1} + // it will be encoded as permutation_ = 321 (omitting leading '0') void setPermutation(const std::vector& stride_order) { int rank = stride_order.size(); TORCH_INTERNAL_ASSERT( @@ -139,20 +195,111 @@ struct MemoryFormat { // storing stride_order in `permuted_order` for a simpler life, so we don't // have to decode `permutation_` when we want to apply/restore permutation_. permuted_order_ = stride_order; - bool has_permutation_ = false; + bool has_permutation = false; + permutation_ = 0; for (const auto i : c10::irange(rank)) { permutation_ = permutation_ * 10 + stride_order[i]; - if (!has_permutation_ && stride_order[i] != rank - 1 - i) { - has_permutation_ = true; + if (!has_permutation && stride_order[i] != rank - 1 - i) { + has_permutation = true; } } // special case permutation_ to reflect non-permuted tensor - if (!has_permutation_) { + if (!has_permutation) { permutation_ = 0; } } + // returns the stride order for given MemoryFormat encoding permutation_ + // + // see details for encoding in [ Note: MemoryFormat and Stride Order ] + std::vector toStrideOrder() const { + std::vector stride_order; + // return empty vector for no permutation + if (hasPermutation()) { + // be generous with reserved space + stride_order.reserve(10); + bool encountered_zero = false; + size_t permutation = permutation_; + while (permutation != 0) { + int order = static_cast(permutation % 10); + permutation /= 10; + if (order == 0) { + encountered_zero = true; + } + stride_order.push_back(order); + } + if (!encountered_zero) { + // in case leading '0' is omitted, push it back + stride_order.push_back(0); + } + // since we use push_back, our stride_order is reversed. + std::reverse(stride_order.begin(), stride_order.end()); + } + return stride_order; + } + + // returns c10::nullopt when it's not safe to broadcast current permutation to + // rank + c10::optional broadcastToRank(size_t rank) const { + auto ret = Contiguous(); + if (hasPermutation()) { + auto stride_order = toStrideOrder(); + auto cur_rank = stride_order.size(); + // no op for (cur_rank == 0) || (cur_rank == rank) + if (cur_rank < rank) { + // broadcasting to hight rank can be done by: + // 1. incrementing all existing stride order by rank_diff; + // 2. push back decrementing elements starting with rank_diff; + // where rank_diff = rank - cur_rank + // + // see [ Note: MemoryFormat and Stride Order] + // e.g. + // taking broadcasted bias for channels last as an example + // stride_order = {0, 2, 1} broadcasted to rank == 4 would give us + // rank_diff = 4 - 3 = 1 + // take step 1 -> {1, 3, 2} + // take step 2 -> {1, 3, 2, 0} + int rank_diff = static_cast(rank - cur_rank); + for (auto& val : stride_order) { + val += rank_diff; + } + for (int i = rank_diff - 1; i >= 0; i--) { + stride_order.push_back(i); + } + } else if (cur_rank > rank) { + // shrink permutation to lower rank. We can simply discard higher rank + // stride order when they are not permuted to lower rank bit, because in + // those instance we can't obey broadcasting semantics while preserving + // permutation. We check for stride order and ensure that the lower + // `rank` bits are all permuted within the lower rank. Afterwards, we + // update stride_order by decrement each entry by rank_diff to reflect + // correct stride order. + // + // see [ Note: MemoryFormat and Stride Order] + // e.g. 
for rank 4 channels last {1, 3, 2, 0}: + // 1. format can safely shrink to rank 3, since any@{1, 3, 2} >= + // (4-3); We ditch last (4-3) rank and decrement each element by (4-1) + // that gives us {0, 2, 1}; + // 2. but when we shrink it to rank 2, we have {1, 3} where 1 < (4-2) + // and it can't be handled, we return c10::nullopt. + int collapsed_ranks = static_cast(cur_rank - rank); + for (size_t i = 0; i < rank; i++) { + if (stride_order[i] < collapsed_ranks) { + // illegal collapsing, return c10::nullopt + return c10::nullopt; + } + // update collapsed stride_order + stride_order[i] -= collapsed_ranks; + } + // discard higher rank stride order. + stride_order.resize(rank); + } + ret.setPermutation(stride_order); + } + return ret; + } + // returns non-permuted format static MemoryFormat Contiguous() { return MemoryFormat(); @@ -276,19 +423,29 @@ class ValueHolder { // returns Val in target format if it exists, otherwise, transpose an existing // copy and add that to bookkeeping. CgValue maybeConvertValue(const MemoryFormat& format) { - auto iter_val = vals_.find(format); - if (iter_val != vals_.end()) { - return iter_val->second; - } - // patching scalar value, because memory format doesn't carry real meaning. - if (!is_tensor_view_) { + auto cur_rank = rank(); + // scalar (tensor) where cur_rank == 0, memory format doesn't carry meaning + // and should just return the value as-is. same for non-tensor where + // cur_rank == -1 + if (cur_rank <= 0) { return std::get<1>(getEntry()); } MemoryFormat format_s; CgValue value_s = nullptr; std::tie(format_s, value_s) = getEntry(); - auto val = convertValue(format, format_s, value_s); - vals_[format] = val; + + auto opt_format_d = format.broadcastToRank(static_cast(cur_rank)); + TORCH_INTERNAL_ASSERT( + opt_format_d.has_value(), + "maybeConvertValue requested for illegal permutation"); + MemoryFormat format_d = opt_format_d.value(); + + auto iter_val = vals_.find(format_d); + if (iter_val != vals_.end()) { + return iter_val->second; + } + auto val = convertValue(format_d, format_s, value_s); + vals_[format_d] = val; return val; } @@ -435,6 +592,79 @@ std::pair> getConsistentValues( return std::make_pair(format, list_val); } +// iterate through all vals and return the output MemoryFormat and copies of +// vals. +// 1. When `forced_format == c10::nullopt`, target MemoryFormat returns the +// format of the first val in `vals`, this is to achieve a coherent +// behavior as with eager TensorIterator; +// 2. The target can be overwritten vias specifying `forced_format`. +// +// Note: take `Values&` by reference, since `maybeConvertValue` needs to modify +// the entry and we want that to be updated in `value_map_` +template +std::pair> getPWFormatValues( + c10::optional forced_format, + Values&... 
vals) { + MemoryFormat format; + if (forced_format.has_value()) { + format = forced_format.value(); + } else { + // get maximum rank on vals + std::vector formats; + std::vector ranks; + auto max_rank_func = [&ranks](const ValueHolder& val, int rank = 0) { + int v_rank = val.rank(); + ranks.push_back(v_rank); + return std::max(rank, v_rank); + }; + int max_rank = iterate(max_rank_func, vals...); + + // going through all permutation, keeping consistency with TensorIterator + // behavior and the first tensor with highest rank dictates output + // permutation + auto format_func = [&formats, &max_rank]( + const ValueHolder& val, + MemoryFormat f = MemoryFormat::Contiguous()) { + auto cur_format = std::get<0>(val.getEntry()); + formats.push_back(cur_format); + return val.rank() == max_rank ? cur_format : f; + }; + format = iterate(format_func, vals...); + + // we need to do pair-wise comparison to ensure that all permutation are + // compatible since permutation could have changed semantics among + // broadcasted tensors. Consider pointwise operation between three tensor + // [N, C, H, W] + [C, H, W] + [H, W] + for (size_t i = 0; i < formats.size() && format.hasPermutation(); i++) { + for (size_t j = 0; j < formats.size(); j++) { + // don't compare scalar tensor or scalar + if (ranks[i] <= 0 || ranks[j] <= 0 || i == j) { + continue; + } + size_t lower_rank = std::min(ranks[i], ranks[j]); + auto i_format = formats[i].broadcastToRank(lower_rank); + auto j_format = formats[j].broadcastToRank(lower_rank); + + // breaks permutation if any: + // 1. i_format can't be broadcasted to lower_rank; + // 2. j_format can't be broadcasted to lower_rank; + if (!i_format.has_value() || !j_format.has_value()) { + format = MemoryFormat::Contiguous(); + } + } + } + } + + auto convert_func = [format]( + ValueHolder& val, std::list list_val = {}) { + list_val.push_front(val.maybeConvertValue(format)); + return list_val; + }; + auto list_val = iterate(convert_func, vals...); + + return std::make_pair(format, list_val); +} + typedef void ( *ParseFuncPtr)(const Node*, std::unordered_map&); typedef bool (*MergeQueryFuncPtr)(const Node*); @@ -502,7 +732,7 @@ class IrParser { "Failure when register value: ", *(val->node()), " with type: ", - val->type()); + val->type()->repr_str()); MemoryFormat format; Val* operand = nullptr; std::tie(format, operand) = value_map_[val->unique()].getEntry(); @@ -520,7 +750,6 @@ class IrParser { (opt_dtype.value() == DataType::Half || opt_dtype.value() == DataType::BFloat16)) { Val* promoted_val = castOp(DataType::Float, operand); - // value_map_.emplace(val->unique(), ValueHolder(promoted_val, format)); value_map_[val->unique()] = ValueHolder(promoted_val, format); } } @@ -540,13 +769,10 @@ class IrParser { auto tensor_type = jit_output->type()->cast(); TORCH_INTERNAL_ASSERT( tensor_type, "output of fusion group is not TensorType."); - if (tensor_type->scalarType() == at::ScalarType::Half) { - // No need to update value_map_ after this point. - out = castOp(DataType::Half, out)->as(); - } - if (tensor_type->scalarType() == at::ScalarType::BFloat16) { - // No need to update value_map_ after this point. 
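getPWFormatValues resolves the output memory format for a pointwise op: the first operand with the highest rank dictates the candidate permutation, and every pair of operands is then checked for broadcast compatibility via broadcastToRank, falling back to contiguous on any failure. The following condensed standalone sketch illustrates that flow with stride orders as plain vectors (empty meaning contiguous); broadcastToRank here re-implements the grow/shrink rules from the MemoryFormat note, and the operand list in main mirrors the [N, C, H, W] + [C, H, W] + [H, W] example from the comment.

#include <algorithm>
#include <iostream>
#include <optional>
#include <utility>
#include <vector>

using StrideOrder = std::vector<int>; // empty == contiguous (no permutation)

// Grow or shrink a stride order to `rank`, following the rules of
// MemoryFormat::broadcastToRank. Returns std::nullopt when the permutation
// cannot survive the rank change.
std::optional<StrideOrder> broadcastToRank(StrideOrder order, size_t rank) {
  if (order.empty()) {
    return StrideOrder{}; // contiguous stays contiguous
  }
  const size_t cur_rank = order.size();
  if (cur_rank < rank) {
    const int diff = static_cast<int>(rank - cur_rank);
    for (auto& v : order) {
      v += diff;
    }
    for (int i = diff - 1; i >= 0; --i) {
      order.push_back(i);
    }
  } else if (cur_rank > rank) {
    const int collapsed = static_cast<int>(cur_rank - rank);
    for (size_t i = 0; i < rank; ++i) {
      if (order[i] < collapsed) {
        return std::nullopt; // permutation leaks into collapsed dims
      }
      order[i] -= collapsed;
    }
    order.resize(rank);
  }
  return order;
}

// Resolve the pointwise output format for a set of operands given as
// (rank, stride order) pairs, in the spirit of getPWFormatValues.
StrideOrder resolvePointwiseFormat(
    const std::vector<std::pair<int, StrideOrder>>& operands) {
  int max_rank = 0;
  for (const auto& op : operands) {
    max_rank = std::max(max_rank, op.first);
  }
  // First operand with the highest rank dictates the candidate format.
  StrideOrder format;
  for (const auto& op : operands) {
    if (op.first == max_rank) {
      format = op.second;
      break;
    }
  }
  // Pairwise compatibility check; any failure falls back to contiguous.
  for (size_t i = 0; i < operands.size() && !format.empty(); ++i) {
    for (size_t j = 0; j < operands.size(); ++j) {
      if (i == j || operands[i].first <= 0 || operands[j].first <= 0) {
        continue;
      }
      const size_t lower =
          static_cast<size_t>(std::min(operands[i].first, operands[j].first));
      if (!broadcastToRank(operands[i].second, lower).has_value() ||
          !broadcastToRank(operands[j].second, lower).has_value()) {
        format.clear(); // fall back to contiguous
      }
    }
  }
  return format;
}

int main() {
  // [N, C, H, W] channels-last + [C, H, W] + [H, W]
  std::vector<std::pair<int, StrideOrder>> operands = {
      {4, {1, 3, 2, 0}}, {3, {}}, {2, {}}};
  auto format = resolvePointwiseFormat(operands);
  std::cout << (format.empty() ? "contiguous" : "permuted") << "\n";
}

In this example the rank-4 channels-last permutation cannot be shrunk to rank 2 (the '1' would leak into the collapsed dimensions), so the whole pointwise op ends up planned in contiguous format, matching the fallback behavior in the diff.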
- out = castOp(DataType::BFloat16, out)->as(); + if (tensor_type->scalarType().has_value()) { + out = optionalCastStrict( + aten_to_data_type(*tensor_type->scalarType()), out) + ->as(); } fusion->addOutput(out); @@ -574,11 +800,17 @@ class IrParser { static bool lookupInSymbolSet(const Node* node) { initRegistry(); + std::lock_guard lock(parser_mutex_); return parser_symbol_set_.count(node->kind()) != 0; } // return nullptr if entry does not exist static const RegistrationEntry* lookupInRegistry(const Node* node) { + std::lock_guard lock(parser_mutex_); + + if (parser_skip_set_.count(node->kind()) != 0) { + return nullptr; + } // we need to use maybeSchema for nodes like prim::Constant, which doesn't // have a schema auto schema_ptr = node->maybeSchema(); @@ -602,12 +834,28 @@ class IrParser { return nullptr; } + static bool querySkipSymbolSet(c10::Symbol symbol, bool flip) { + initRegistry(); + + std::lock_guard lock(parser_mutex_); + // no need to init registry here (unlike `lookupInSymbolSet`, as + // `parser_skip_set_` is not initialized via initialization + bool ret = parser_skip_set_.count(symbol) != 0; + if (flip) { + if (ret) { + parser_skip_set_.erase(symbol); + } else { + parser_skip_set_.insert(symbol); + } + } + return ret; + } + static void initRegistry() { - if (init_registry_) { - // TODO: mutex this guy; + std::call_once(once_flag_, []() { + std::lock_guard lock(parser_mutex_); registerJitOperator(); - init_registry_ = false; - } + }); } static bool canParseNode(const Node* node) { @@ -685,7 +933,9 @@ class IrParser { "aten::add(Tensor self, Tensor other, *, Scalar alpha) -> Tensor", "aten::add(Tensor self, Scalar other, Scalar alpha) -> Tensor", "aten::sub(Tensor self, Tensor other, *, Scalar alpha) -> Tensor", - "aten::sub(Tensor self, Scalar other, Scalar alpha) -> Tensor"}; + "aten::sub(Tensor self, Scalar other, Scalar alpha) -> Tensor", + "aten::rsub(Tensor self, Tensor other, *, Scalar alpha) -> Tensor", + "aten::rsub(Tensor self, Scalar other, Scalar alpha) -> Tensor"}; for (auto signature : BinaryOpWithAlpha) { auto ptr_op = getOperatorForLiteral(signature); REGISTER_PARSE_RULE( @@ -701,13 +951,17 @@ class IrParser { BinaryOpType::Add, static_cast(&add_alpha))}, {aten::sub, + std::make_pair( + BinaryOpType::Sub, + static_cast(&sub_alpha))}, + {aten::rsub, std::make_pair( BinaryOpType::Sub, static_cast(&sub_alpha))}}); // TODO: handle scaling factor when it's not constant 1; MemoryFormat format; std::list list_val; - std::tie(format, list_val) = getConsistentValues( + std::tie(format, list_val) = getPWFormatValues( c10::nullopt, value_map[node->inputs()[0]->unique()], value_map[node->inputs()[1]->unique()]); @@ -720,14 +974,16 @@ class IrParser { auto out = alpha->isOneInt() ? binaryOp( op_mapping[node->kind()].first, - lhs, - rhs, + node->kind() == aten::rsub ? rhs : lhs, + node->kind() == aten::rsub ? lhs : rhs, TypePromotion::default_op_config) - : op_mapping[node->kind()].second(lhs, rhs, alpha); + : (node->kind() == aten::rsub + ? 
op_mapping[node->kind()].second(rhs, lhs, alpha) + : op_mapping[node->kind()].second(lhs, rhs, alpha)); value_map.emplace( node->output()->unique(), ValueHolder(out, format)); }, - nullptr, + isInputNonSizeZeroTensor, nullptr); } @@ -746,7 +1002,7 @@ class IrParser { MemoryFormat format; std::list list_val; - std::tie(format, list_val) = getConsistentValues( + std::tie(format, list_val) = getPWFormatValues( c10::nullopt, value_map[node->inputs()[0]->unique()], value_map[node->inputs()[1]->unique()]); @@ -763,7 +1019,7 @@ class IrParser { value_map.emplace( node->output()->unique(), ValueHolder(out, format)); }, - nullptr, + isInputNonSizeZeroTensor, nullptr); } @@ -777,10 +1033,15 @@ class IrParser { "aten::pow(Scalar self, Tensor exponent) -> Tensor", "aten::remainder(Tensor self, Tensor other) -> Tensor", "aten::fmod(Tensor self, Tensor other) -> Tensor", + "aten::bitwise_and(Tensor self, Tensor other) -> Tensor", "aten::__and__(Tensor self, Tensor other) -> Tensor", + "aten::bitwise_or(Tensor self, Tensor other) -> Tensor", "aten::__or__(Tensor self, Tensor other) -> Tensor", + "aten::bitwise_xor(Tensor self, Tensor other) -> Tensor", "aten::__xor__(Tensor self, Tensor other) -> Tensor", + "aten::bitwise_left_shift(Tensor self, Tensor other) -> Tensor", "aten::__lshift__(Tensor self, Tensor other) -> Tensor", + "aten::bitwise_right_shift(Tensor self, Tensor other) -> Tensor", "aten::__rshift__(Tensor self, Tensor other) -> Tensor"}; for (auto signature : BinaryCastOp) { auto ptr_op = getOperatorForLiteral(signature); @@ -794,15 +1055,20 @@ class IrParser { {aten::pow, BinaryOpType::Pow}, {aten::remainder, BinaryOpType::Remainder}, {aten::fmod, BinaryOpType::Fmod}, + {aten::bitwise_and, BinaryOpType::And}, {aten::__and__, BinaryOpType::And}, + {aten::bitwise_or, BinaryOpType::Or}, {aten::__or__, BinaryOpType::Or}, + {aten::bitwise_xor, BinaryOpType::Xor}, {aten::__xor__, BinaryOpType::Xor}, + {aten::bitwise_left_shift, BinaryOpType::Lshift}, {aten::__lshift__, BinaryOpType::Lshift}, + {aten::bitwise_right_shift, BinaryOpType::Rshift}, {aten::__rshift__, BinaryOpType::Rshift}}); MemoryFormat format; std::list list_val; - std::tie(format, list_val) = getConsistentValues( + std::tie(format, list_val) = getPWFormatValues( c10::nullopt, value_map[node->inputs()[0]->unique()], value_map[node->inputs()[1]->unique()]); @@ -819,7 +1085,7 @@ class IrParser { value_map.emplace( node->output()->unique(), ValueHolder(out, format)); }, - nullptr, + isInputNonSizeZeroTensor, nullptr); } @@ -851,7 +1117,7 @@ class IrParser { MemoryFormat format; std::list list_val; - std::tie(format, list_val) = getConsistentValues( + std::tie(format, list_val) = getPWFormatValues( c10::nullopt, value_map[node->inputs()[0]->unique()], value_map[node->inputs()[1]->unique()]); @@ -868,7 +1134,7 @@ class IrParser { value_map.emplace( node->output()->unique(), ValueHolder(out, format)); }, - nullptr, + isInputNonSizeZeroTensor, nullptr); } @@ -911,7 +1177,7 @@ class IrParser { value_map.emplace( node->output()->unique(), ValueHolder(out, format)); }, - nullptr, + isInputNonSizeZeroTensor, nullptr); } @@ -982,7 +1248,41 @@ class IrParser { value_map.emplace( node->output()->unique(), ValueHolder(out, format)); }, - nullptr, + isInputNonSizeZeroTensor, + nullptr); + } + + std::array UnaryIsOp = { + "aten::isfinite(Tensor self) -> Tensor", + "aten::isinf(Tensor self) -> Tensor", + "aten::isnan(Tensor self) -> Tensor", + "aten::isneginf(Tensor self) -> Tensor", + "aten::isposinf(Tensor self) -> Tensor", + "aten::isreal(Tensor 
self) -> Tensor"}; + for (auto signature : UnaryIsOp) { + auto ptr_op = getOperatorForLiteral(signature); + REGISTER_PARSE_RULE( + ptr_op, + { + static std::unordered_map op_mapping({ + {aten::isfinite, UnaryOpType::IsFinite}, + {aten::isinf, UnaryOpType::IsInf}, + {aten::isnan, UnaryOpType::IsNan}, + {aten::isneginf, UnaryOpType::IsNegInf}, + {aten::isposinf, UnaryOpType::IsPosInf}, + {aten::isreal, UnaryOpType::IsReal}, + }); + MemoryFormat format; + std::list list_val; + std::tie(format, list_val) = getConsistentValues( + c10::nullopt, value_map[node->inputs()[0]->unique()]); + auto operand = list_val.front(); + list_val.pop_front(); + auto out = unaryIsOp(op_mapping[node->kind()], operand); + value_map.emplace( + node->output()->unique(), ValueHolder(out, format)); + }, + isInputNonSizeZeroTensor, nullptr); } @@ -995,15 +1295,49 @@ class IrParser { MemoryFormat format; std::list list_val; std::tie(format, list_val) = getConsistentValues( - MemoryFormat::Contiguous(), - value_map[node->inputs()[0]->unique()]); + c10::nullopt, value_map[node->inputs()[0]->unique()]); auto operand = list_val.front(); list_val.pop_front(); + if (!node->input(3)->type()->isSubtypeOf( + static_cast(NoneType::get()))) { + auto device = constant_as(node->input(3)); + TORCH_INTERNAL_ASSERT( + device.has_value() && device->is_cuda(), + "rand_like in nvfuser is not on cuda device"); + auto input_tensor_type = + node->input(0)->type()->cast(); + // device->index() == -1 indicating that we don't change device + // index + if (device->index() != -1 && input_tensor_type) { + auto input_device = input_tensor_type->device(); + // we expect device index to be consistent with input and it + // should have already been handled by partition + TORCH_INTERNAL_ASSERT( + !input_device.has_value() || + input_device->index() == device->index(), + "rand_like in nvfuser is not on cuda device"); + } + } + auto out = randlike(operand); - value_map.emplace(node->output()->unique(), out); + value_map.emplace( + node->output()->unique(), ValueHolder(out, format)); + }, + [](const Node* node) -> bool { + if (!isInputNonSizeZeroTensor(node)) { + return false; + } + if (!node->input(1)->type()->isSubtypeOf( + static_cast(NoneType::get())) || + !node->input(2)->type()->isSubtypeOf( + static_cast(NoneType::get())) || + !node->input(5)->type()->isSubtypeOf( + static_cast(NoneType::get()))) { + return false; + } + return true; }, - nullptr, nullptr); } @@ -1016,16 +1350,16 @@ class IrParser { MemoryFormat format; std::list list_val; std::tie(format, list_val) = getConsistentValues( - MemoryFormat::Contiguous(), - value_map[node->inputs()[0]->unique()]); - auto operand = list_val.front(); + c10::nullopt, value_map[node->inputs()[0]->unique()]); + auto operand = list_val.front()->as(); list_val.pop_front(); auto& beta = value_map[node->inputs()[1]->unique()]; auto& threshold = value_map[node->inputs()[2]->unique()]; auto out = softplus(operand, beta, threshold); - value_map.emplace(node->output()->unique(), out); + value_map.emplace( + node->output()->unique(), ValueHolder(out, format)); }, - nullptr, + isInputNonSizeZeroTensor, nullptr); } @@ -1038,17 +1372,17 @@ class IrParser { MemoryFormat format; std::list list_val; std::tie(format, list_val) = getConsistentValues( - MemoryFormat::Contiguous(), - value_map[node->inputs()[0]->unique()]); + c10::nullopt, value_map[node->inputs()[0]->unique()]); auto operand = list_val.front(); list_val.pop_front(); auto& th = value_map[node->inputs()[1]->unique()]; auto& value = 
value_map[node->inputs()[2]->unique()]; auto out = threshold(operand, th, value); - value_map.emplace(node->output()->unique(), out); + value_map.emplace( + node->output()->unique(), ValueHolder(out, format)); }, - nullptr, + isInputNonSizeZeroTensor, nullptr); } @@ -1060,7 +1394,7 @@ class IrParser { { MemoryFormat format; std::list list_val; - std::tie(format, list_val) = getConsistentValues( + std::tie(format, list_val) = getPWFormatValues( c10::nullopt, value_map[node->inputs()[0]->unique()], value_map[node->inputs()[1]->unique()]); @@ -1080,7 +1414,7 @@ class IrParser { value_map.emplace( node->output()->unique(), ValueHolder(out, format)); }, - nullptr, + isInputNonSizeZeroTensor, nullptr); } @@ -1096,17 +1430,18 @@ class IrParser { c10::nullopt, value_map[node->inputs()[0]->unique()]); auto operand = list_val.front(); list_val.pop_front(); - Val* low = value_map.count(node->inputs()[1]->unique()) != 0 + Val* min = value_map.count(node->inputs()[1]->unique()) != 0 ? *value_map[node->inputs()[1]->unique()] - : new Double(std::numeric_limits::min()); - Val* high = value_map.count(node->inputs()[2]->unique()) != 0 + : nullptr; + Val* max = value_map.count(node->inputs()[2]->unique()) != 0 ? *value_map[node->inputs()[2]->unique()] - : new Double(std::numeric_limits::max()); + : nullptr; - auto out = clamp(operand, low, high); - value_map.emplace(node->output()->unique(), out); + Val* out = clamp(operand, min, max); + value_map.emplace( + node->output()->unique(), ValueHolder(out, format)); }, - nullptr, + isInputNonSizeZeroTensor, nullptr); } @@ -1118,8 +1453,8 @@ class IrParser { { MemoryFormat format; std::list list_val; - std::tie(format, list_val) = getConsistentValues( - MemoryFormat::Contiguous(), + std::tie(format, list_val) = getPWFormatValues( + c10::nullopt, value_map[node->inputs()[0]->unique()], value_map[node->inputs()[1]->unique()], value_map[node->inputs()[2]->unique()]); @@ -1134,7 +1469,7 @@ class IrParser { value_map.emplace( node->output()->unique(), ValueHolder(out, format)); }, - nullptr, + isInputNonSizeZeroTensor, nullptr); } @@ -1149,8 +1484,8 @@ class IrParser { { MemoryFormat format; std::list list_val; - std::tie(format, list_val) = getConsistentValues( - MemoryFormat::Contiguous(), + std::tie(format, list_val) = getPWFormatValues( + c10::nullopt, value_map[node->inputs()[0]->unique()], value_map[node->inputs()[1]->unique()], value_map[node->inputs()[2]->unique()]); @@ -1165,7 +1500,7 @@ class IrParser { value_map.emplace( node->output()->unique(), ValueHolder(out, format)); }, - nullptr, + isInputNonSizeZeroTensor, nullptr); } } @@ -1178,7 +1513,7 @@ class IrParser { { MemoryFormat format; std::list list_val; - std::tie(format, list_val) = getConsistentValues( + std::tie(format, list_val) = getPWFormatValues( c10::nullopt, value_map[node->inputs()[0]->unique()], value_map[node->inputs()[1]->unique()], @@ -1197,7 +1532,7 @@ class IrParser { value_map.emplace( node->output()->unique(), ValueHolder(out, format)); }, - nullptr, + isInputNonSizeZeroTensor, nullptr); } @@ -1210,7 +1545,7 @@ class IrParser { MemoryFormat format; std::list list_val; std::tie(format, list_val) = getConsistentValues( - MemoryFormat::Contiguous(), + c10::nullopt, value_map[node->inputs()[0]->unique()], value_map[node->inputs()[1]->unique()]); auto input = list_val.front(); @@ -1225,8 +1560,11 @@ class IrParser { if (train.value()) { auto result = dropout(input->as(), prob); - value_map.emplace(node->output(0)->unique(), result.output); - value_map.emplace(node->output(1)->unique(), 
result.mask); + value_map.emplace( + node->output(0)->unique(), + ValueHolder(result.output, format)); + value_map.emplace( + node->output(1)->unique(), ValueHolder(result.mask, format)); } else { value_map.emplace(node->output(0)->unique(), input); value_map.emplace( @@ -1234,7 +1572,15 @@ class IrParser { ValueHolder(TensorViewBuilder().build(), format)); } }, - nullptr, + [](const Node* node) -> bool { + if (!isInputNonSizeZeroTensor(node)) { + return false; + } + if (node->inputs()[2]->node()->kind() != prim::Constant) { + return false; + } + return true; + }, nullptr); } @@ -1247,7 +1593,7 @@ class IrParser { MemoryFormat format; std::list list_val; std::tie(format, list_val) = getConsistentValues( - MemoryFormat::Contiguous(), + c10::nullopt, value_map[node->inputs()[0]->unique()], value_map[node->inputs()[1]->unique()]); auto input = list_val.front(); @@ -1262,12 +1608,22 @@ class IrParser { if (train.value()) { auto result = dropout(input->as(), prob); - value_map.emplace(node->output()->unique(), result.output); + value_map.emplace( + node->output()->unique(), ValueHolder(result.output, format)); } else { - value_map.emplace(node->output()->unique(), input); + value_map.emplace( + node->output()->unique(), ValueHolder(input, format)); + } + }, + [](const Node* node) -> bool { + if (!isInputNonSizeZeroTensor(node)) { + return false; + } + if (node->inputs()[2]->node()->kind() != prim::Constant) { + return false; } + return true; }, - nullptr, nullptr); } @@ -1279,8 +1635,8 @@ class IrParser { { MemoryFormat format; std::list list_val; - std::tie(format, list_val) = getConsistentValues( - MemoryFormat::Contiguous(), + std::tie(format, list_val) = getPWFormatValues( + c10::nullopt, value_map[node->inputs()[0]->unique()], value_map[node->inputs()[1]->unique()], value_map[node->inputs()[2]->unique()]); @@ -1293,9 +1649,10 @@ class IrParser { auto output = dropout_backward( grad->as(), mask->as(), scale); - value_map.emplace(node->output()->unique(), output); + value_map.emplace( + node->output()->unique(), ValueHolder(output, format)); }, - nullptr, + isInputNonSizeZeroTensor, nullptr); } @@ -1307,8 +1664,6 @@ class IrParser { REGISTER_PARSE_RULE( ptr_op, { - auto fusion = FusionGuard::getCurFusion(); - // TODO: handle channels last MemoryFormat format; std::list list_val; @@ -1336,9 +1691,6 @@ class IrParser { static_cast(NoneType::get()))) { running_mean = value_map[node->input(3)->unique()]->as(); - TORCH_INTERNAL_ASSERT( - fusion->hasInput(running_mean), - "IO_tensor `instance_norm::running_mean` can only be input tensor to fusion"); } TensorView* running_var = nullptr; @@ -1346,9 +1698,6 @@ class IrParser { static_cast(NoneType::get()))) { running_var = value_map[node->input(4)->unique()]->as(); - TORCH_INTERNAL_ASSERT( - fusion->hasInput(running_var), - "IO_tensor `instance_norm::running_var` can only be input tensor to fusion"); } // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) @@ -1361,7 +1710,7 @@ class IrParser { Val* momentum_ptr = nullptr; // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) if (auto momentum = constant_as(node->input(6))) { - momentum_ptr = new Double(momentum.value()); + momentum_ptr = IrBuilder::create(momentum.value()); } else { // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) momentum_ptr = value_map[node->input(6)->unique()]; @@ -1370,7 +1719,7 @@ class IrParser { Val* eps_ptr = nullptr; // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) if (auto eps = constant_as(node->input(7))) { - eps_ptr = new Double(eps.value()); + 
eps_ptr = IrBuilder::create(eps.value()); } else { // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) eps_ptr = value_map[node->input(7)->unique()]; @@ -1391,7 +1740,13 @@ class IrParser { value_map.emplace(node->output()->unique(), result.output); } }, - [](const Node* node) -> bool { return true; }, + [](const Node* node) -> bool { + if (isReductionNonCompatibleTensor( + node->input(0)->type()->cast())) { + return false; + } + return true; + }, [](const Node* node) -> OperatorType { return OperatorType::Normalization; }); @@ -1455,7 +1810,7 @@ class IrParser { Val* momentum_ptr = nullptr; // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) if (auto momentum = constant_as(node->input(6))) { - momentum_ptr = new Double(momentum.value()); + momentum_ptr = IrBuilder::create(momentum.value()); } else { // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) momentum_ptr = value_map[node->input(6)->unique()]; @@ -1464,7 +1819,7 @@ class IrParser { Val* eps_ptr = nullptr; // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) if (auto eps = constant_as(node->input(7))) { - eps_ptr = new Double(eps.value()); + eps_ptr = IrBuilder::create(eps.value()); } else { // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) eps_ptr = value_map[node->input(7)->unique()]; @@ -1502,7 +1857,16 @@ class IrParser { ValueHolder(result.output, format)); } }, - [](const Node* node) -> bool { return true; }, + [](const Node* node) -> bool { + if (isReductionNonCompatibleTensor( + node->input(0)->type()->cast())) { + return false; + } + if (node->input(5)->node()->kind() != prim::Constant) { + return false; + } + return true; + }, [](const Node* node) -> OperatorType { return OperatorType::Normalization; }); @@ -1510,156 +1874,233 @@ class IrParser { } { - auto ptr_op = getOperatorForLiteral( - "aten::_batch_norm_impl_index_backward(int impl_index, Tensor input, Tensor grad_output, Tensor? weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var_transform, bool train, float eps, bool[3] output_mask, Tensor reservedSpace) -> (Tensor, Tensor, Tensor)"); - REGISTER_PARSE_RULE( - ptr_op, - { - // discard impl_index and reservedSpace since we don't use them - MemoryFormat format; - std::list list_val; - std::tie(format, list_val) = getConsistentValues( - c10::nullopt, - value_map[node->inputs()[1]->unique()], - value_map[node->inputs()[2]->unique()]); - if (format.hasPermutation() && !format.isChannelsLast()) { + std::array BatchNormBwd = { + "aten::_batch_norm_impl_index_backward(int impl_index, Tensor input, Tensor grad_output, Tensor? weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var_transform, bool train, float eps, bool[3] output_mask, Tensor reservedSpace) -> (Tensor, Tensor, Tensor)", + "aten::native_batch_norm_backward(Tensor grad_out, Tensor input, Tensor? weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? 
save_invstd, bool train, float eps, bool[3] output_mask) -> (Tensor, Tensor, Tensor)"}; + for (auto signature : BatchNormBwd) { + auto ptr_op = getOperatorForLiteral(signature); + REGISTER_PARSE_RULE( + ptr_op, + { + JitValue* ts_input = nullptr; + JitValue* ts_grad_output; + JitValue* ts_weight = nullptr; + JitValue* ts_r_mean = nullptr; + JitValue* ts_r_var = nullptr; + JitValue* ts_save_mean = nullptr; + JitValue* ts_save_invstd = nullptr; + JitValue* ts_train = nullptr; + JitValue* ts_eps = nullptr; + JitValue* ts_mask = nullptr; + if (node->kind() == + c10::Symbol::fromQualString( + "aten::_batch_norm_impl_index_backward")) { + ts_input = node->input(1); + ts_grad_output = node->input(2); + ts_weight = node->input(3); + ts_r_mean = node->input(4); + ts_r_var = node->input(5); + ts_save_mean = node->input(6); + ts_save_invstd = node->input(7); + ts_train = node->input(8); + ts_eps = node->input(9); + ts_mask = node->input(10); + } else if ( + node->kind() == + c10::Symbol::fromQualString( + "aten::native_batch_norm_backward")) { + ts_grad_output = node->input(0); + ts_input = node->input(1); + ts_weight = node->input(2); + ts_r_mean = node->input(3); + ts_r_var = node->input(4); + ts_save_mean = node->input(5); + ts_save_invstd = node->input(6); + ts_train = node->input(7); + ts_eps = node->input(8); + ts_mask = node->input(9); + } else { + TORCH_INTERNAL_ASSERT( + false, + "Forgot to register the key for BN variation: ", + node->kind().toDisplayString()); + } + + // discard impl_index and reservedSpace since we don't use them + MemoryFormat format; + std::list list_val; std::tie(format, list_val) = getConsistentValues( - MemoryFormat::Contiguous(), - value_map[node->inputs()[1]->unique()], - value_map[node->inputs()[2]->unique()]); - } - auto operand0 = list_val.front(); - list_val.pop_front(); - auto operand1 = list_val.front(); - list_val.pop_front(); - auto input = operand0->as(); - auto grad_out = operand1->as(); + c10::nullopt, + value_map[ts_input->unique()], + value_map[ts_grad_output->unique()]); + if (format.hasPermutation() && !format.isChannelsLast()) { + std::tie(format, list_val) = getConsistentValues( + MemoryFormat::Contiguous(), + value_map[ts_input->unique()], + value_map[ts_grad_output->unique()]); + } + auto operand0 = list_val.front(); + list_val.pop_front(); + auto operand1 = list_val.front(); + list_val.pop_front(); + auto input = operand0->as(); + auto grad_out = operand1->as(); - TensorView* weight = nullptr; - if (!node->input(3)->type()->isSubtypeOf( - static_cast(NoneType::get()))) { - weight = value_map[node->input(3)->unique()]->as(); - } + TensorView* weight = nullptr; + if (!ts_weight->type()->isSubtypeOf( + static_cast(NoneType::get()))) { + weight = value_map[ts_weight->unique()]->as(); + } - TensorView* running_mean = nullptr; - if (!node->input(4)->type()->isSubtypeOf( - static_cast(NoneType::get()))) { - running_mean = - value_map[node->input(4)->unique()]->as(); - } + TensorView* running_mean = nullptr; + if (!ts_r_mean->type()->isSubtypeOf( + static_cast(NoneType::get()))) { + running_mean = value_map[ts_r_mean->unique()]->as(); + } - TensorView* running_var = nullptr; - if (!node->input(5)->type()->isSubtypeOf( - static_cast(NoneType::get()))) { - running_var = - value_map[node->input(5)->unique()]->as(); - } + TensorView* running_var = nullptr; + if (!ts_r_var->type()->isSubtypeOf( + static_cast(NoneType::get()))) { + running_var = value_map[ts_r_var->unique()]->as(); + } - TensorView* save_mean = nullptr; - // 
NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - if (!node->input(6)->type()->isSubtypeOf( - static_cast(NoneType::get()))) { + TensorView* save_mean = nullptr; // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - save_mean = value_map[node->input(6)->unique()]->as(); - } - - TensorView* save_invstd = nullptr; - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - if (!node->input(7)->type()->isSubtypeOf( - static_cast(NoneType::get()))) { - save_invstd = - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - value_map[node->input(7)->unique()]->as(); - } + if (!ts_save_mean->type()->isSubtypeOf( + static_cast(NoneType::get()))) { + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + save_mean = value_map[ts_save_mean->unique()]->as(); + } - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - auto training = constant_as(node->input(8)); - TORCH_INTERNAL_ASSERT( - training.has_value(), - "The training (bool) parameter is required."); - const bool kTraining = training.value(); + TensorView* save_invstd = nullptr; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + if (!ts_save_invstd->type()->isSubtypeOf( + static_cast(NoneType::get()))) { + save_invstd = + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + value_map[ts_save_invstd->unique()]->as(); + } - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - Val* eps_ptr = nullptr; - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - if (auto eps = constant_as(node->input(9))) { - eps_ptr = new Double(eps.value()); - } else { // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - eps_ptr = value_map[node->input(7)->unique()]; - } + auto training = constant_as(ts_train); + TORCH_INTERNAL_ASSERT( + training.has_value(), + "The training (bool) parameter is required."); + const bool kTraining = training.value(); - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - auto out_mask_list = constant_as>(node->input(10)); - TORCH_INTERNAL_ASSERT( - out_mask_list.has_value(), - "output mask for batch_norm_backward"); - std::vector output_mask; - for (const auto value : out_mask_list->vec()) { - output_mask.emplace_back(static_cast(value)); - } + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + Val* eps_ptr = nullptr; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + if (auto eps = constant_as(ts_eps)) { + eps_ptr = IrBuilder::create(eps.value()); + } else { + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + eps_ptr = value_map[ts_eps->unique()]; + } - // TODO: merge this loop below. - if (kTraining) { - TORCH_INTERNAL_ASSERT( - save_mean != nullptr && save_invstd != nullptr, - "When training=True, save_mean and save_invstd are required."); - } else { - // TODO: this is not a legit assumption? Can't we run with - // track_running_stats == false && training == false - // which should just run through the case above. 
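The batch-norm backward rule above handles two schemas by remapping node inputs onto a common set of roles (ts_input, ts_grad_output, ts_weight, and so on). The index layout it reads can be summarized as a small lookup, sketched below with illustrative names (BatchNormBwdInputIndices and lookupIndices are not nvfuser code; the indices are the ones used in the diff):

#include <iostream>
#include <stdexcept>
#include <string>

// Positions of the common roles in each supported schema, as read by the
// parse rule: input, grad_output, weight, running_mean, running_var,
// save_mean, save_invstd, train, eps, output_mask.
struct BatchNormBwdInputIndices {
  int input, grad_output, weight, running_mean, running_var;
  int save_mean, save_invstd, train, eps, output_mask;
};

BatchNormBwdInputIndices lookupIndices(const std::string& op) {
  if (op == "aten::_batch_norm_impl_index_backward") {
    // input(0) is impl_index and the trailing reservedSpace is discarded.
    return {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  }
  if (op == "aten::native_batch_norm_backward") {
    return {1, 0, 2, 3, 4, 5, 6, 7, 8, 9};
  }
  throw std::runtime_error("unregistered batch-norm backward variant: " + op);
}

int main() {
  const auto idx = lookupIndices("aten::native_batch_norm_backward");
  std::cout << "grad_output at input(" << idx.grad_output << "), "
            << "train flag at input(" << idx.train << ")\n"; // 0 and 7
}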
+ // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + auto out_mask_list = constant_as>(ts_mask); TORCH_INTERNAL_ASSERT( - running_mean != nullptr && running_var != nullptr, - "When training=False, running_mean and running_invstd are required."); - } + out_mask_list.has_value(), + "output mask for batch_norm_backward"); + std::vector output_mask; + for (const auto value : out_mask_list->vec()) { + output_mask.emplace_back(static_cast(value)); + } - auto grads = batch_norm_backward( - input, - grad_out, - weight, - running_mean, - running_var, - save_mean, - save_invstd, - kTraining, - eps_ptr, - output_mask, - format.isChannelsLast()); + // TODO: merge this loop below. + if (kTraining) { + TORCH_INTERNAL_ASSERT( + save_mean != nullptr && save_invstd != nullptr, + "When training=True, save_mean and save_invstd are required."); + } else { + // TODO: this is not a legit assumption? Can't we run with + // track_running_stats == false && training == false + // which should just run through the case above. + TORCH_INTERNAL_ASSERT( + running_mean != nullptr && running_var != nullptr, + "When training=False, running_mean and running_invstd are required."); + } - if (output_mask[0]) { - TORCH_INTERNAL_ASSERT(grads.grad_input != nullptr); - value_map.emplace( - node->output(0)->unique(), - ValueHolder(grads.grad_input, format)); - } else { - TORCH_INTERNAL_ASSERT(grads.grad_input == nullptr); - value_map.emplace( - node->output(0)->unique(), - ValueHolder(TensorViewBuilder().build(), format)); - } + auto grads = batch_norm_backward( + input, + grad_out, + weight, + running_mean, + running_var, + save_mean, + save_invstd, + kTraining, + eps_ptr, + output_mask, + format.isChannelsLast()); - if (output_mask[1]) { - TORCH_INTERNAL_ASSERT(grads.grad_weight != nullptr); - value_map.emplace(node->output(1)->unique(), grads.grad_weight); - } else { - TORCH_INTERNAL_ASSERT(grads.grad_weight == nullptr); - value_map.emplace( - node->output(1)->unique(), TensorViewBuilder().build()); - } + if (output_mask[0]) { + TORCH_INTERNAL_ASSERT(grads.grad_input != nullptr); + value_map.emplace( + node->output(0)->unique(), + ValueHolder(grads.grad_input, format)); + } else { + TORCH_INTERNAL_ASSERT(grads.grad_input == nullptr); + value_map.emplace( + node->output(0)->unique(), + ValueHolder(TensorViewBuilder().build(), format)); + } - if (output_mask[2]) { - TORCH_INTERNAL_ASSERT(grads.grad_bias != nullptr); - value_map.emplace(node->output(2)->unique(), grads.grad_bias); - } else { - TORCH_INTERNAL_ASSERT(grads.grad_bias == nullptr); - value_map.emplace( - node->output(2)->unique(), TensorViewBuilder().build()); - } - }, - [](const Node* node) -> bool { return true; }, - [](const Node* node) -> OperatorType { - return OperatorType::Normalization; - }); + if (output_mask[1]) { + TORCH_INTERNAL_ASSERT(grads.grad_weight != nullptr); + value_map.emplace(node->output(1)->unique(), grads.grad_weight); + } else { + TORCH_INTERNAL_ASSERT(grads.grad_weight == nullptr); + value_map.emplace( + node->output(1)->unique(), TensorViewBuilder().build()); + } + + if (output_mask[2]) { + TORCH_INTERNAL_ASSERT(grads.grad_bias != nullptr); + value_map.emplace(node->output(2)->unique(), grads.grad_bias); + } else { + TORCH_INTERNAL_ASSERT(grads.grad_bias == nullptr); + value_map.emplace( + node->output(2)->unique(), TensorViewBuilder().build()); + } + }, + [](const Node* node) -> bool { + if (isReductionNonCompatibleTensor( + node->input(1)->type()->cast())) { + return false; + } + if (node->kind() == + c10::Symbol::fromQualString( 
+ "aten::_batch_norm_impl_index_backward")) { + if (node->inputs()[8]->node()->kind() != prim::Constant) { + return false; + } + if (node->inputs()[10]->node()->kind() != prim::Constant) { + return false; + } + } else if ( + node->kind() == + c10::Symbol::fromQualString( + "aten::native_batch_norm_backward")) { + if (node->inputs()[7]->node()->kind() != prim::Constant) { + return false; + } + if (node->inputs()[9]->node()->kind() != prim::Constant) { + return false; + } + } else { + TORCH_INTERNAL_ASSERT( + false, + "Forgot to update profiled constant check for", + node->kind().toDisplayString()); + } + return true; + }, + [](const Node* node) -> OperatorType { + return OperatorType::Normalization; + }); + } } { @@ -1701,7 +2142,7 @@ class IrParser { Val* eps_ptr = nullptr; if (auto eps = constant_as(node->input(4))) { - eps_ptr = new Double(eps.value()); + eps_ptr = IrBuilder::create(eps.value()); } else { eps_ptr = value_map[node->input(4)->unique()]; } @@ -1721,7 +2162,16 @@ class IrParser { } }, // TODO: #ProfileIValue List should update this - [](const Node* node) -> bool { return true; }, + [](const Node* node) -> bool { + if (isReductionNonCompatibleTensor( + node->input(0)->type()->cast())) { + return false; + } + if (node->inputs()[1]->node()->kind() != prim::Constant) { + return false; + } + return true; + }, [](const Node* node) -> OperatorType { return OperatorType::Normalization; }); @@ -1819,42 +2269,15 @@ class IrParser { } }, // TODO: #ProfileIValue List should update this - [](const Node* node) -> bool { return true; }, - [](const Node* node) -> OperatorType { - return OperatorType::Normalization; - }); - } - - { - auto ptr_op = getOperatorForLiteral( - "aten::softmax.int(Tensor self, int dim, int? dtype) -> Tensor"); - REGISTER_PARSE_RULE( - ptr_op, - { - MemoryFormat format; - std::list list_val; - std::tie(format, list_val) = getConsistentValues( - MemoryFormat::Contiguous(), - value_map[node->inputs()[0]->unique()]); - auto input_t = list_val.front(); - list_val.pop_front(); - auto input = input_t->as(); - - auto dim_value = constant_as(node->input(1)); - TORCH_INTERNAL_ASSERT( - dim_value.has_value(), "dim in softmax is not valid"); - - auto output = softmax(input, dim_value.value()); - value_map.emplace(node->output()->unique(), output); - }, [](const Node* node) -> bool { - if (node->inputs()[1]->node()->kind() != prim::Constant) { + if (isReductionNonCompatibleTensor( + node->input(0)->type()->cast())) { return false; } - // TODO: support dynamic input by profiling it - if (!node->inputs()[2]->type()->isSubtypeOf( - static_cast(NoneType::get())) && - node->inputs()[2]->node()->kind() != prim::Constant) { + if (node->inputs()[2]->node()->kind() != prim::Constant) { + return false; + } + if (node->inputs()[7]->node()->kind() != prim::Constant) { return false; } return true; @@ -1864,6 +2287,67 @@ class IrParser { }); } + { + std::array SoftmaxFwd = { + "aten::softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor", + "aten::log_softmax.int(Tensor self, int dim, ScalarType? 
dtype=None) -> Tensor"}; + for (auto signature : SoftmaxFwd) { + auto ptr_op = getOperatorForLiteral(signature); + REGISTER_PARSE_RULE( + ptr_op, + { + MemoryFormat format; + std::list list_val; + std::tie(format, list_val) = getConsistentValues( + MemoryFormat::Contiguous(), + value_map[node->inputs()[0]->unique()]); + auto input_t = list_val.front(); + list_val.pop_front(); + auto input = input_t->as(); + + auto dim_value = constant_as(node->input(1)); + TORCH_INTERNAL_ASSERT( + dim_value.has_value(), "dim in softmax is not valid"); + + auto data_type = DataType::Null; + if (const auto opt_ivalue = toIValue(node->input(2))) { + if (!opt_ivalue.value().isNone()) { + data_type = aten_to_data_type(opt_ivalue->toScalarType()); + } + } + + input = (data_type != DataType::Null) + ? optionalCastStrict(data_type, input)->as() + : input; + + bool is_log_softmax = node->kind() == + c10::Symbol::fromQualString("aten::log_softmax"); + + auto output = (is_log_softmax) + ? log_softmax(input, dim_value.value()) + : softmax(input, dim_value.value()); + + value_map.emplace(node->output()->unique(), output); + }, + [](const Node* node) -> bool { + if (isReductionNonCompatibleTensor( + node->input(0)->type()->cast())) { + return false; + } + if (node->inputs()[1]->node()->kind() != prim::Constant) { + return false; + } + if (!isScalarTypeCompatible(node, 2)) { + return false; + } + return true; + }, + [](const Node* node) -> OperatorType { + return OperatorType::Normalization; + }); + } + } + { // LTC uses this op for softmax auto ptr_op = getOperatorForLiteral( "aten::_softmax(Tensor self, int dim, bool half_to_float) -> Tensor"); @@ -1887,6 +2371,10 @@ class IrParser { value_map.emplace(node->output()->unique(), output); }, [](const Node* node) -> bool { + if (isReductionNonCompatibleTensor( + node->input(0)->type()->cast())) { + return false; + } if (node->inputs()[1]->node()->kind() != prim::Constant) { return false; } @@ -1911,35 +2399,115 @@ class IrParser { } { - auto ptr_op = getOperatorForLiteral( - "aten::_softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor"); - REGISTER_PARSE_RULE( - ptr_op, - { - auto grad_output = - value_map[node->input(0)->unique()]->as(); + std::array SoftmaxBwd = { + "aten::_log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor", + "aten::_softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor"}; + for (auto signature : SoftmaxBwd) { + auto ptr_op = getOperatorForLiteral(signature); + REGISTER_PARSE_RULE( + ptr_op, + { + MemoryFormat format; + std::list list_val; + std::tie(format, list_val) = getConsistentValues( + MemoryFormat::Contiguous(), + value_map[node->inputs()[0]->unique()], + value_map[node->inputs()[1]->unique()]); + auto grad_output_t = list_val.front(); + list_val.pop_front(); + auto grad_output = grad_output_t->as(); - auto output = value_map[node->input(1)->unique()]->as(); + auto output_t = list_val.front(); + list_val.pop_front(); + auto output = output_t->as(); - auto dim_value = constant_as(node->input(2)); - TORCH_INTERNAL_ASSERT( - dim_value.has_value(), "dim in softmax is not valid"); + auto dim_value = constant_as(node->input(2)); + TORCH_INTERNAL_ASSERT( + dim_value.has_value(), "dim in softmax is not valid"); - // input_dtype here is ignored! type_inference handles it - auto grad_input = - softmax_backward(grad_output, output, dim_value.value()); + // input_dtype here is ignored! 
type_inference handles it + bool is_log_softmax = node->kind() == + c10::Symbol::fromQualString( + "aten::_log_softmax_backward_data"); + auto grad_input = (is_log_softmax) + ? log_softmax_backward(grad_output, output, dim_value.value()) + : softmax_backward(grad_output, output, dim_value.value()); - value_map.emplace(node->output()->unique(), grad_input); - }, - [](const Node* node) -> bool { - if (node->inputs()[2]->node()->kind() != prim::Constant) { - return false; - } - return true; - }, - [](const Node* node) -> OperatorType { - return OperatorType::Normalization; - }); + value_map.emplace(node->output()->unique(), grad_input); + }, + [](const Node* node) -> bool { + if (isReductionNonCompatibleTensor( + node->input(0)->type()->cast())) { + return false; + } + if (node->inputs()[2]->node()->kind() != prim::Constant) { + return false; + } + if (node->inputs()[3]->node()->kind() != prim::Constant) { + return false; + } + return true; + }, + [](const Node* node) -> OperatorType { + return OperatorType::Normalization; + }); + } + } + + { + std::array Variance = { + "aten::var.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor", + "aten::std.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor"}; + for (auto signature : Variance) { + auto ptr_op = getOperatorForLiteral(signature); + REGISTER_PARSE_RULE( + ptr_op, + { + MemoryFormat format; + std::list list_val; + std::tie(format, list_val) = getConsistentValues( + MemoryFormat::Contiguous(), + value_map[node->inputs()[0]->unique()]); + auto input_t = list_val.front(); + list_val.pop_front(); + auto input = input_t->as(); + + bool is_variance = + node->kind() == c10::Symbol::fromQualString("aten::var"); + + auto dims_list = constant_as>(node->input(1)); + TORCH_INTERNAL_ASSERT( + dims_list.has_value(), "Cannot fuse with dynamic axes"); + std::vector dims; + for (const auto dim : dims_list->vec()) { + dims.emplace_back(static_cast(dim)); + } + + auto unbiased = constant_as(node->input(2)); + TORCH_INTERNAL_ASSERT( + unbiased.has_value(), "Cannot fuse with dynamic unbiased"); + + auto keepdim = constant_as(node->input(3)); + TORCH_INTERNAL_ASSERT( + keepdim.has_value(), "Cannot fuse with dynamic keepdim"); + + auto output = (is_variance) + ? 
variance(input, dims, unbiased.value(), keepdim.value()) + : standard_deviation( + input, dims, unbiased.value(), keepdim.value()); + value_map.emplace(node->output()->unique(), output); + }, + [](const Node* node) -> bool { + if (isReductionNonCompatibleTensor( + node->input(0)->type()->cast())) { + return false; + } + return true; + }, + [](const Node* node) -> OperatorType { + return OperatorType::Normalization; + }); + } } { @@ -1961,8 +2529,13 @@ class IrParser { dims_list.has_value(), "aten::sum cannot be fused with dynamic axes"); std::vector dims; - for (const auto dim : dims_list->vec()) { - dims.emplace_back(static_cast(dim)); + if (!dims_list->empty()) { + for (const auto dim : dims_list->vec()) { + dims.emplace_back(static_cast(dim)); + } + } else { + dims.resize(self->as()->nDims()); + std::iota(dims.begin(), dims.end(), 0); } auto keepdim = constant_as(node->input(2)); TORCH_INTERNAL_ASSERT( @@ -1972,20 +2545,20 @@ class IrParser { value_map.emplace(node->output()->unique(), out); }, [](const Node* node) -> bool { + if (isReductionNonCompatibleTensor( + node->input(0)->type()->cast())) { + return false; + } // TODO: support cast of output types if (!node->inputs()[3]->type()->isSubtypeOf( static_cast(NoneType::get()))) { // We can only handle output as half, float, and double; if (const auto opt_ivalue = toIValue(node->input(3))) { const auto scalar_type = opt_ivalue->toScalarType(); - if (scalar_type == at::ScalarType::Double || - scalar_type == at::ScalarType::Float || - scalar_type == at::ScalarType::BFloat16 || - scalar_type == at::ScalarType::Half) { - return true; + if (!at::isFloatingType(scalar_type)) { + return false; } } - return false; } // we don't support dynamic reduction axes; if (node->inputs()[1]->node()->kind() != prim::Constant) { @@ -2021,15 +2594,20 @@ class IrParser { dims_list.has_value(), "aten::mean cannot be fused with dynamic axes"); std::vector dims; - for (const auto dim : dims_list->vec()) { - dims.emplace_back(static_cast(dim)); + if (!dims_list->empty()) { + for (const auto dim : dims_list->vec()) { + dims.emplace_back(static_cast(dim)); + } + } else { + dims.resize(self->as()->nDims()); + std::iota(dims.begin(), dims.end(), 0); } auto keepdim = constant_as(node->input(2)); TORCH_INTERNAL_ASSERT( keepdim.has_value(), "aten::mean cannot be fused with dynamic keepdim"); auto o_sum = sum(self, dims, keepdim.value()); - Val* num_features = new Double(1); + Val* num_features = IrBuilder::create(1); for (auto axis : dims) { if (axis < 0) { axis += int(self->nDims()); @@ -2041,20 +2619,20 @@ class IrParser { value_map.emplace(node->output()->unique(), out); }, [](const Node* node) -> bool { + if (isReductionNonCompatibleTensor( + node->input(0)->type()->cast())) { + return false; + } // TODO: support cast of output types if (!node->inputs()[3]->type()->isSubtypeOf( static_cast(NoneType::get()))) { // We can only handle output as half, float, and double; if (const auto opt_ivalue = toIValue(node->input(3))) { const auto scalar_type = opt_ivalue->toScalarType(); - if (scalar_type == at::ScalarType::Double || - scalar_type == at::ScalarType::Float || - scalar_type == at::ScalarType::BFloat16 || - scalar_type == at::ScalarType::Half) { - return true; + if (!at::isFloatingType(scalar_type)) { + return false; } } - return false; } // we don't support dynamic reduction axes; if (node->inputs()[1]->node()->kind() != prim::Constant) { @@ -2091,7 +2669,13 @@ class IrParser { size_to.has_value(), "aten::sum cannot be fused with dynamic axes"); if 
(!size_to->empty()) { - auto out = sum_to(self->as(), size_to->vec()); + auto input = self->as(); + auto out = sum_to(input, size_to->vec()); + // this copy is not necessary, but making copy avoids tricky + // computational graph where no-op could be challenging. + if (out == input) { + out = set(input); + } value_map.emplace(node->output()->unique(), out); } else { // We are introducing alias here! @@ -2099,13 +2683,15 @@ class IrParser { } }, [](const Node* node) -> bool { + if (isReductionNonCompatibleTensor( + node->input(0)->type()->cast())) { + return false; + } // we don't support dynamic reduction axes; if (node->inputs()[1]->node()->kind() != prim::Constant) { return false; } return true; - // auto size_to = constant_as>(node->input(1)); - // return size_to.has_value() && !size_to->empty(); }, [](const Node* node) -> OperatorType { auto size_to = constant_as>(node->input(1)); @@ -2140,7 +2726,7 @@ class IrParser { value_map.emplace( node->output()->unique(), ValueHolder(out, format)); }, - nullptr, + isInputNonSizeZeroTensor, nullptr); } } @@ -2178,7 +2764,20 @@ class IrParser { value_map.emplace( node->output()->unique(), ValueHolder(out, format)); }, - nullptr, + [](const Node* node) -> bool { + if (!isInputNonSizeZeroTensor(node)) { + return false; + } + if (node->inputs()[1]->node()->kind() != prim::Constant) { + return false; + } + // we do not support explicit memory_format on output + if (!node->inputs()[4]->type()->isSubtypeOf( + static_cast(NoneType::get()))) { + return false; + } + return true; + }, nullptr); } @@ -2207,7 +2806,7 @@ class IrParser { value_map.emplace( node->output()->unique(), ValueHolder(out, format)); }, - nullptr, + isInputNonSizeZeroTensor, nullptr); } @@ -2225,12 +2824,8 @@ class IrParser { TORCH_INTERNAL_ASSERT(false, "not implemented yet"); }, [](const Node* node) -> bool { - // We only profile `linear` layer with bias. - if (node->input(2)->type()->isSubtypeOf( - static_cast(NoneType::get()))) { - return false; - } - return true; + // We only profile `linear` layer but not fusing it. + return false; }); } @@ -2250,7 +2845,7 @@ class IrParser { } else { MemoryFormat format; std::list list_val; - std::tie(format, list_val) = getConsistentValues( + std::tie(format, list_val) = getPWFormatValues( c10::nullopt, value_map[node->inputs()[0]->unique()], value_map[node->inputs()[1]->unique()]); @@ -2268,12 +2863,13 @@ class IrParser { node->output()->unique(), ValueHolder(out, format)); } }, - nullptr, + isInputNonSizeZeroTensor, nullptr); } { - auto ptr_op = getOperatorForLiteral("aten::gelu(Tensor self) -> Tensor"); + auto ptr_op = getOperatorForLiteral( + "aten::gelu(Tensor self, *, str approximate='none') -> Tensor"); REGISTER_PARSE_RULE( ptr_op, { @@ -2281,117 +2877,364 @@ class IrParser { std::list list_val; std::tie(format, list_val) = getConsistentValues( c10::nullopt, value_map[node->inputs()[0]->unique()]); - auto self = list_val.front(); + auto self = list_val.front()->as(); list_val.pop_front(); - auto out = gelu(self); + + auto approximate = constant_as(node->input(1)); + TORCH_INTERNAL_ASSERT( + approximate.has_value(), + "The approximate parameter is required."); + const auto kTanhGelu = + at::native::get_gelutype_enum(approximate.value()) == + at::native::GeluType::Tanh; + + auto out = (kTanhGelu) ? 
tanh_gelu(self) : gelu(self); value_map.emplace( node->output()->unique(), ValueHolder(out, format)); }, - nullptr, + [](const Node* node) -> bool { + if (!isInputNonSizeZeroTensor(node)) { + return false; + } + if (node->input(1)->node()->kind() != prim::Constant) { + return false; + } + return true; + }, nullptr); } { auto ptr_op = getOperatorForLiteral( - "aten::gelu_backward(Tensor grad, Tensor self) -> Tensor"); + "aten::gelu_backward(Tensor grad_output, Tensor self, *, str approximate='none') -> Tensor"); REGISTER_PARSE_RULE( ptr_op, { MemoryFormat format; std::list list_val; - std::tie(format, list_val) = getConsistentValues( + std::tie(format, list_val) = getPWFormatValues( c10::nullopt, value_map[node->inputs()[0]->unique()], value_map[node->inputs()[1]->unique()]); - auto grad_out = list_val.front(); + auto grad_out = list_val.front()->as(); list_val.pop_front(); - auto self = list_val.front(); + auto self = list_val.front()->as(); + list_val.pop_front(); + + auto approximate = constant_as(node->input(2)); + TORCH_INTERNAL_ASSERT( + approximate.has_value(), + "The approximate parameter is required."); + const auto kTanhGelu = + at::native::get_gelutype_enum(approximate.value()) == + at::native::GeluType::Tanh; + + auto grad_in = (kTanhGelu) ? tanh_gelu_backward(grad_out, self) + : gelu_backward(grad_out, self); + value_map.emplace( + node->output()->unique(), ValueHolder(grad_in, format)); + }, + [](const Node* node) -> bool { + if (!isInputNonSizeZeroTensor(node)) { + return false; + } + if (node->input(2)->node()->kind() != prim::Constant) { + return false; + } + return true; + }, + nullptr); + } + + { + auto ptr_op = getOperatorForLiteral( + "aten::tanh_backward(Tensor grad_output, Tensor output) -> Tensor"); + REGISTER_PARSE_RULE( + ptr_op, + { + MemoryFormat format; + std::list list_val; + std::tie(format, list_val) = getPWFormatValues( + c10::nullopt, + value_map[node->inputs()[0]->unique()], + value_map[node->inputs()[1]->unique()]); + auto grad_out = list_val.front()->as(); + list_val.pop_front(); + auto self = list_val.front()->as(); list_val.pop_front(); - auto grad_in = gelu_backward(grad_out, self); + auto grad_in = tanh_backward(grad_out, self); value_map.emplace( node->output()->unique(), ValueHolder(grad_in, format)); }, - nullptr, + isInputNonSizeZeroTensor, + nullptr); + } + + { + std::array BinaryFloatOp = { + "aten::amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor", + "aten::amin(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor"}; + for (auto signature : BinaryFloatOp) { + auto ptr_op = getOperatorForLiteral(signature); + REGISTER_PARSE_RULE( + ptr_op, + { + MemoryFormat format; + std::list list_val; + std::tie(format, list_val) = getConsistentValues( + MemoryFormat::Contiguous(), + value_map[node->inputs()[0]->unique()]); + auto self = list_val.front(); + list_val.pop_front(); + auto dims_list = constant_as>(node->input(1)); + TORCH_INTERNAL_ASSERT( + dims_list.has_value(), + "aten::amax/amin cannot be fused with dynamic axes"); + std::vector dims; + if (!dims_list->empty()) { + for (const auto dim : dims_list->vec()) { + dims.emplace_back(static_cast(dim)); + } + } else { + dims.resize(self->as()->nDims()); + std::iota(dims.begin(), dims.end(), 0); + } + auto keepdim = constant_as(node->input(2)); + TORCH_INTERNAL_ASSERT( + keepdim.has_value(), + "aten::amax/amin cannot be fused with dynamic keepdim"); + + TensorView* out = nullptr; + if (node->kind() == c10::Symbol::fromQualString("aten::amax")) { + out = max(self->as(), dims, 
keepdim.value()); + } else if ( + node->kind() == c10::Symbol::fromQualString("aten::amin")) { + out = min(self->as(), dims, keepdim.value()); + } else { + TORCH_INTERNAL_ASSERT( + false, "unrecognized operation in aten::amax/amin"); + } + value_map.emplace(node->output()->unique(), out); + }, + [](const Node* node) -> bool { + if (isReductionNonCompatibleTensor( + node->input(0)->type()->cast())) { + return false; + } + // we don't support dynamic reduction axes; + if (node->inputs()[1]->node()->kind() != prim::Constant) { + return false; + } + // we don't support dynamic keepdim yet; + if (node->inputs()[2]->node()->kind() != prim::Constant) { + return false; + } + return true; + }, + [](const Node* node) -> OperatorType { + return OperatorType::Reduction; + }); + } + } + + { + std::array ViewOps = { + "prim::reshape_copy(Tensor self, int[] shape) -> Tensor", + "prim::view_copy(Tensor self, int[] size) -> Tensor"}; + for (auto signature : ViewOps) { + auto ptr_op = getOperatorForLiteral(signature); + REGISTER_PARSE_RULE( + ptr_op, + { + auto self_value = node->inputs()[0]; + MemoryFormat format; + std::list list_val; + std::tie(format, list_val) = getConsistentValues( + MemoryFormat::Contiguous(), value_map[self_value->unique()]); + auto self = list_val.front()->as(); + list_val.pop_front(); + + auto self_type = self_value->type()->cast(); + TORCH_INTERNAL_ASSERT(self_type != nullptr); + auto self_sizes = getTensorSizes(self_type); + + auto view_sizes = constant_as>(node->input(1)); + TORCH_INTERNAL_ASSERT( + view_sizes.has_value(), "The size parameter is required."); + + auto output = view(self, self_sizes, view_sizes->vec()); + value_map.emplace(node->output()->unique(), output); + }, + [](const Node* node) -> bool { + auto self_value = node->inputs()[0]; + auto tensor_type = self_value->type()->cast(); + if (tensor_type == nullptr) { + return false; + } + if (!tensor_type->sizes().concrete_sizes().has_value()) { + // Shape information for input tensor is required. + return false; + } + + if (!isInputNonSizeZeroTensor(node)) { + return false; + } + // Reject fusing node if view_sizes contains an inferred dimension + auto view_sizes = constant_as>(node->input(1)); + if (!view_sizes.has_value()) { + // The size parameter is required. 
+ return false; + } + + for (auto axis_size : view_sizes->vec()) { + if (axis_size == -1) { + return false; + } + } + return true; + }, + nullptr); + } + } + + { + auto flatten_op = getOperatorForLiteral( + "prim::flatten_copy(Tensor self, int start_dim, int end_dim) -> Tensor"); + REGISTER_PARSE_RULE( + flatten_op, + { + auto self_value = node->inputs()[0]; + MemoryFormat format; + std::list list_val; + std::tie(format, list_val) = getConsistentValues( + MemoryFormat::Contiguous(), value_map[self_value->unique()]); + auto self = list_val.front()->as(); + list_val.pop_front(); + + auto start_dim_value = constant_as(node->input(1)); + TORCH_INTERNAL_ASSERT( + start_dim_value.has_value(), "start_dim is not valid"); + auto end_dim_value = constant_as(node->input(2)); + TORCH_INTERNAL_ASSERT( + end_dim_value.has_value(), "end_dim is not valid"); + + TensorView* output = + flatten(self, start_dim_value.value(), end_dim_value.value()); + value_map.emplace(node->output()->unique(), output); + }, + [](const Node* node) -> bool { + // we don't support dynamic start_dim; + if (node->inputs()[1]->node()->kind() != prim::Constant) { + return false; + } + // we don't support dynamic end_dim yet; + if (node->inputs()[2]->node()->kind() != prim::Constant) { + return false; + } + return true; + }, nullptr); } { - auto ptr_op = getOperatorForLiteral( - "aten::amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor"); + auto ptr_op = + getOperatorForLiteral("prim::squeeze_copy(Tensor self) -> Tensor"); REGISTER_PARSE_RULE( ptr_op, { + auto self_value = node->inputs()[0]; MemoryFormat format; std::list list_val; std::tie(format, list_val) = getConsistentValues( - MemoryFormat::Contiguous(), - value_map[node->inputs()[0]->unique()]); - auto self = list_val.front(); + MemoryFormat::Contiguous(), value_map[self_value->unique()]); + auto self = list_val.front()->as(); list_val.pop_front(); - auto dims_list = constant_as>(node->input(1)); - TORCH_INTERNAL_ASSERT( - dims_list.has_value(), - "aten::amax cannot be fused with dynamic axes"); - std::vector dims; - for (const auto dim : dims_list->vec()) { - dims.emplace_back(static_cast(dim)); - } - auto keepdim = constant_as(node->input(2)); - TORCH_INTERNAL_ASSERT( - keepdim.has_value(), - "aten::amax cannot be fused with dynamic keepdim"); - auto out = max(self->as(), dims, keepdim.value()); - value_map.emplace(node->output()->unique(), out); + auto self_type = self_value->type()->cast(); + TORCH_INTERNAL_ASSERT(self_type != nullptr); + auto self_sizes = getTensorSizes(self_type); + + TensorView* output = nullptr; + if (self_sizes.empty()) { + // squeeze on scalar tensor should just return itself; + output = set(self); + } else { + output = squeeze(self, self_sizes); + } + value_map.emplace(node->output()->unique(), output); }, [](const Node* node) -> bool { - // we don't support dynamic reduction axes; - if (node->inputs()[1]->node()->kind() != prim::Constant) { + // Shape information for input tensor is required. 
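// A standalone sketch (not part of the patch): the prim::flatten_copy rule
// registered above only fuses when start_dim and end_dim are compile-time
// constants, because the fused kernel needs the resulting shape up front.
// The helper below models that shape arithmetic on plain size vectors; the
// name flattenedShape is illustrative and does not exist in nvfuser.
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

std::vector<int64_t> flattenedShape(
    const std::vector<int64_t>& sizes,
    int64_t start_dim,
    int64_t end_dim) {
  const auto rank = static_cast<int64_t>(sizes.size());
  // aten-style wrap-around for negative dimensions.
  if (start_dim < 0) {
    start_dim += rank;
  }
  if (end_dim < 0) {
    end_dim += rank;
  }
  // Keep the leading dims, collapse [start_dim, end_dim] into one product,
  // then keep the trailing dims.
  std::vector<int64_t> out(sizes.begin(), sizes.begin() + start_dim);
  out.push_back(std::accumulate(
      sizes.begin() + start_dim,
      sizes.begin() + end_dim + 1,
      int64_t{1},
      std::multiplies<int64_t>()));
  out.insert(out.end(), sizes.begin() + end_dim + 1, sizes.end());
  return out;
}
// Example: flattenedShape({2, 3, 4, 5}, 1, 2) returns {2, 12, 5}.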
+ auto self_value = node->inputs()[0]; + auto tensor_type = self_value->type()->cast(); + if (tensor_type == nullptr) { return false; } - // we don't support dynamic keepdim yet; - if (node->inputs()[2]->node()->kind() != prim::Constant) { + if (!isInputNonSizeZeroTensor(node)) { return false; } - return true; + return tensor_type->sizes().concrete_sizes().has_value(); }, - [](const Node* node) -> OperatorType { - return OperatorType::Reduction; - }); + nullptr); } - /* - // TODO: Enable view in parser by detecting non-alias view operation { - std::array View = { - "aten::view(Tensor(a) self, int[] size) -> Tensor(a)", - "aten::reshape(Tensor(a) self, int[] shape) -> Tensor(a)"}; - for (auto signature : View) { + std::array AliasOpWithDim = { + "prim::squeeze_copy.dim(Tensor self, int dim) -> Tensor", + "prim::unsqueeze_copy(Tensor self, int dim) -> Tensor"}; + for (auto signature : AliasOpWithDim) { auto ptr_op = getOperatorForLiteral(signature); REGISTER_PARSE_RULE( ptr_op, { auto self_value = node->inputs()[0]; - auto self = value_map[self_value->unique()]->as(); - - auto self_type = self_value->type()->cast(); - TORCH_INTERNAL_ASSERT(self_type != nullptr); - auto self_sizes = getTensorSizes(self_type); + MemoryFormat format; + std::list list_val; + std::tie(format, list_val) = getConsistentValues( + MemoryFormat::Contiguous(), + value_map[node->inputs()[0]->unique()]); + auto self = list_val.front()->as(); + list_val.pop_front(); - auto size_optional = - constant_as>(node->input(1)); - TORCH_INTERNAL_ASSERT( - size_optional.has_value(), "The size parameter is required."); + auto dim_value = constant_as(node->input(1)); + TORCH_INTERNAL_ASSERT(dim_value.has_value(), "dim is not valid"); - auto output = view(self, self_sizes, size_optional->vec()); + TensorView* output = nullptr; + if (node->kind() == prim::unsqueeze_copy) { + output = unsqueeze(self, dim_value.value()); + } else { + auto self_type = self_value->type()->cast(); + TORCH_INTERNAL_ASSERT(self_type != nullptr); + auto self_sizes = getTensorSizes(self_type); + if (self_sizes.empty()) { + // squeeze on scalar tensor should just return itself; + output = set(self); + } else { + output = squeeze(self, self_sizes, dim_value.value()); + } + } value_map.emplace(node->output()->unique(), output); }, - nullptr, + [](const Node* node) -> bool { + // Shape information for input tensor is required. 
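// A standalone sketch (not part of the patch): the squeeze_copy and
// unsqueeze_copy rules above require concrete input sizes because squeeze
// only drops dimensions whose static extent is 1, and a rank-0 input is
// passed through via set(). The helpers below model that behaviour on plain
// size vectors; squeezeShape and unsqueezeShape are illustrative names only.
#include <cstdint>
#include <vector>

std::vector<int64_t> squeezeShape(const std::vector<int64_t>& sizes) {
  std::vector<int64_t> out;
  for (int64_t s : sizes) {
    if (s != 1) {
      out.push_back(s);  // only statically known size-1 dims are removed
    }
  }
  return out;  // an empty (rank-0) input stays empty, mirroring set(self)
}

std::vector<int64_t> unsqueezeShape(std::vector<int64_t> sizes, int64_t dim) {
  if (dim < 0) {
    dim += static_cast<int64_t>(sizes.size()) + 1;  // aten-style wrap-around
  }
  sizes.insert(sizes.begin() + dim, 1);  // insert a broadcast-like size-1 dim
  return sizes;
}
// Example: squeezeShape({1, 4, 1, 3}) -> {4, 3}; unsqueezeShape({4, 3}, -1) -> {4, 3, 1}.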
+ auto self_value = node->inputs()[0]; + auto tensor_type = self_value->type()->cast(); + if (tensor_type == nullptr) { + return false; + } + if (!isInputNonSizeZeroTensor(node)) { + return false; + } + if (node->input(1)->node()->kind() != prim::Constant) { + return false; + } + auto optional_sizes = tensor_type->sizes().concrete_sizes(); + return tensor_type->sizes().concrete_sizes().has_value(); + }, nullptr); } } - */ } void processJitNode(const JitOp* node) { @@ -2425,9 +3268,9 @@ class IrParser { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) CgValue cg_val; if (auto ival = constant_as(val)) { - cg_val = new Double(ival.value()); + cg_val = IrBuilder::create(ival.value()); } else { - cg_val = new Double(); + cg_val = IrBuilder::create(); } value_map_.emplace(val->unique(), cg_val); return true; @@ -2436,9 +3279,9 @@ class IrParser { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) CgValue cg_val; if (auto ival = constant_as(val)) { - cg_val = new Int(ival.value()); + cg_val = IrBuilder::create(ival.value()); } else { - cg_val = new Int(); + cg_val = IrBuilder::create(); } value_map_.emplace(val->unique(), cg_val); return true; @@ -2447,21 +3290,31 @@ class IrParser { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) CgValue cg_val; if (auto ival = constant_as(val)) { - cg_val = new Bool(ival.value()); + cg_val = IrBuilder::create(ival.value()); } else { - cg_val = new Bool(); + cg_val = IrBuilder::create(); } value_map_.emplace(val->unique(), cg_val); return true; - } else if (val->type()->isSubtypeOf( - static_cast(NoneType::get()))) { + } else if ( + val->type()->isSubtypeOf( + static_cast(StringType::get())) || + val->type()->isSubtypeOf( + static_cast(DeviceObjType::get())) || + val->type()->isSubtypeOf(static_cast(NoneType::get()))) { // TODO: should we consider adding support for NoneType; + // Note: String/Device scalars are only used in parsing rules, do not + // register string with codegen IR. return true; } else if (val->type()->cast()) { // TODO: we don't support list type in codegen yet; // This is a WAR to allow axes of reduction to be passed as constant list; // We simply ignore conversion if the scalar value is a constant; - return toIValue(val).has_value(); + auto ivalue = toIValue(val); + TORCH_INTERNAL_ASSERT( + ivalue.has_value(), + "List[T] is not supported as an argument by NvFuser. Use a Constant List."); + return true; } return false; } @@ -2521,7 +3374,6 @@ class IrParser { nhwc_stride_vec[i]->stride_index_ = n_dim - i - 1; } - // auto updated_tensor_type = c10::TensorType::create( tensor_type = c10::TensorType::create( tensor_type->scalarType(), tensor_type->device(), @@ -2531,7 +3383,10 @@ class IrParser { tensor_type->undefined()); } - cg_val = new TensorView(tensor_type); + cg_val = IrBuilder::create(tensor_type); + if (is_cpu_scalar(*tensor_type)) { + cg_val->as()->setCpuScalar(true); + } value_map_.emplace(val->unique(), ValueHolder(cg_val, format)); return true; } @@ -2544,6 +3399,8 @@ class IrParser { std::unordered_map value_map_; static std::unordered_set parser_symbol_set_; + static std::unordered_set parser_skip_set_; + static std::mutex parser_mutex_; // parsing rule registry. 
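// A standalone sketch (not part of the patch): the parser members added
// around here (parser_skip_set_, parser_mutex_, and the once_flag_ that
// replaces the old init_registry_ bool just below) give the rule registry
// thread-safe one-time initialization plus a runtime-mutable skip list.
// ParseRuleRegistry and its methods are illustrative names only.
#include <mutex>
#include <string>
#include <unordered_set>

class ParseRuleRegistry {
 public:
  static ParseRuleRegistry& get() {
    static ParseRuleRegistry instance;
    // Rules are registered exactly once, even with concurrent first callers.
    std::call_once(init_flag_, [&instance] { instance.registerRules(); });
    return instance;
  }

  bool isSkipped(const std::string& symbol) {
    // The skip set can change at runtime, so it is guarded by its own mutex.
    std::lock_guard<std::mutex> guard(mutex_);
    return skip_set_.count(symbol) != 0;
  }

  void toggleSkip(const std::string& symbol) {
    // Mirrors the "flip" behaviour of a skip-node toggle: erase if present,
    // otherwise insert.
    std::lock_guard<std::mutex> guard(mutex_);
    if (!skip_set_.erase(symbol)) {
      skip_set_.insert(symbol);
    }
  }

 private:
  void registerRules() { /* populate the parse-rule tables exactly once */ }

  static std::once_flag init_flag_;
  std::mutex mutex_;
  std::unordered_set<std::string> skip_set_;
};

std::once_flag ParseRuleRegistry::init_flag_;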
static std::unordered_map @@ -2554,16 +3411,18 @@ class IrParser { cached_registry_lookup_; // NOLINT // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) - static bool init_registry_; + static std::once_flag once_flag_; }; std::unordered_set IrParser::parser_symbol_set_; // NOLINT +std::unordered_set IrParser::parser_skip_set_; // NOLINT +std::mutex IrParser::parser_mutex_; std::unordered_map IrParser::jit_operator_registry_; // NOLINT std::unordered_map IrParser::cached_registry_lookup_; // NOLINT // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) -bool IrParser::init_registry_ = true; +std::once_flag IrParser::once_flag_; ProfileIValueOp* insertProfileIValueOp( Node* node, @@ -2576,7 +3435,7 @@ ProfileIValueOp* insertProfileIValueOp( return pn; } -void profileSize(ProfilingRecord* pr, Node* node, size_t offset) { +void profileReductionSize(ProfilingRecord* pr, Node* node, size_t offset) { auto pn = insertProfileIValueOp(node, offset, pr); const auto ivalue_profiler = [pr, pn](Stack& stack) { @@ -2596,20 +3455,76 @@ void profileSize(ProfilingRecord* pr, Node* node, size_t offset) { size_vec.clear(); } else { TORCH_INTERNAL_ASSERT( - false, "profileSize does not support data type: ", value.tagKind()); + false, + "profileReductionSize does not support data type: ", + value.tagKind()); + } + // We stop profiling when it has failed + if (!pn->hasAttribute(profileFailedAttr)) { + if (!pn->hasAttribute(reductionSizeAttr)) { + pn->is_(reductionSizeAttr, size_vec); + } else { + auto profiled_ints = pn->is(reductionSizeAttr); + if (profiled_ints.size() != size_vec.size() || + !std::equal( + profiled_ints.begin(), profiled_ints.end(), size_vec.begin())) { + TORCH_WARN( + __FUNCTION__, + " sees varying value in profiling, ignoring and this should be handled by GUARD logic"); + pn->s_(profileFailedAttr, "varying profile values"); + pn->removeAttribute(reductionSizeAttr); + } + } + } else { + TORCH_INTERNAL_ASSERT( + !pn->hasAttribute(reductionSizeAttr), + "profiled attribute should have been removed when profiling is marked as failed"); } - if (!pn->hasAttribute(sizeAttr)) { - pn->is_(sizeAttr, size_vec); + push(stack, value); + }; + pn->setCallback(ivalue_profiler); +} + +void profileViewSize(ProfilingRecord* pr, Node* node, size_t offset) { + auto pn = insertProfileIValueOp(node, offset, pr); + + const auto ivalue_profiler = [pr, pn](Stack& stack) { + std::lock_guard lock(pr->mutex_); + + // TODO: we don't care about merging multiple profiling runs as we don't + // support it at all; + int64_t frame_id = 0; + pop(stack, frame_id); + IValue value; + pop(stack, value); + TORCH_INTERNAL_ASSERT( + value.isIntList(), "profiling seeing the wrong data type"); + if (!pn->hasAttribute(profileFailedAttr)) { + if (!pn->hasAttribute(viewSizeAttr)) { + pn->is_(viewSizeAttr, value.toIntVector()); + } else { + auto profiled_ints = pn->is(viewSizeAttr); + auto input_ints = value.toIntList(); + if (profiled_ints.size() != input_ints.size() || + !std::equal( + profiled_ints.begin(), + profiled_ints.end(), + input_ints.begin())) { + TORCH_WARN( + __FUNCTION__, + " sees varying value in profiling, ignoring and this should be handled by GUARD logic"); + pn->s_(profileFailedAttr, "varying profile values"); + pn->removeAttribute(viewSizeAttr); + } + } } else { - auto profiled_ints = pn->is(sizeAttr); TORCH_INTERNAL_ASSERT( - profiled_ints.size() == size_vec.size() && - std::equal( - profiled_ints.begin(), profiled_ints.end(), size_vec.begin()), - "profiling ivalue doesn't support 
merge"); + !pn->hasAttribute(viewSizeAttr), + "profiled attribute should have been removed when profiling is marked as failed"); } push(stack, value); }; + pn->setCallback(ivalue_profiler); } @@ -2627,18 +3542,67 @@ void profileIntList(ProfilingRecord* pr, Node* node, size_t offset) { pop(stack, value); TORCH_INTERNAL_ASSERT( value.isIntList(), "profiling seeing the wrong data type"); - if (!pn->hasAttribute(intListAttr)) { - pn->is_(intListAttr, value.toIntVector()); + if (!pn->hasAttribute(profileFailedAttr)) { + if (!pn->hasAttribute(intListAttr)) { + pn->is_(intListAttr, value.toIntVector()); + } else { + auto profiled_ints = pn->is(intListAttr); + auto input_ints = value.toIntList(); + if (profiled_ints.size() != input_ints.size() || + !std::equal( + profiled_ints.begin(), + profiled_ints.end(), + input_ints.begin())) { + TORCH_WARN( + __FUNCTION__, + " sees varying value in profiling, ignoring and this should be handled by GUARD logic"); + pn->s_(profileFailedAttr, "varying profile values"); + pn->removeAttribute(intListAttr); + } + } } else { - auto profiled_ints = pn->is(intListAttr); - auto input_ints = value.toIntList(); TORCH_INTERNAL_ASSERT( - profiled_ints.size() == input_ints.size() && - std::equal( - profiled_ints.begin(), - profiled_ints.end(), - input_ints.begin()), - "profiling ivalue doesn't support merge"); + !pn->hasAttribute(intListAttr), + "profiled attribute should have been removed when profiling is marked as failed"); + } + push(stack, value); + }; + + pn->setCallback(ivalue_profiler); +} + +void profileString(ProfilingRecord* pr, Node* node, size_t offset) { + auto pn = insertProfileIValueOp(node, offset, pr); + + const auto ivalue_profiler = [pr, pn](Stack& stack) { + std::lock_guard lock(pr->mutex_); + + // TODO: we don't care about merging multiple profiling runs as we don't + // support it at all; + int64_t frame_id = 0; + pop(stack, frame_id); + IValue value; + pop(stack, value); + TORCH_INTERNAL_ASSERT( + value.isString(), "profiling seeing the wrong data type"); + if (!pn->hasAttribute(profileFailedAttr)) { + if (!pn->hasAttribute(strAttr)) { + pn->s_(strAttr, value.toStringRef()); + } else { + const auto& profiled_str = pn->s(strAttr); + const auto& input_str = value.toStringRef(); + if (input_str != profiled_str) { + TORCH_WARN( + __FUNCTION__, + " sees varying value in profiling, ignoring and this should be handled by GUARD logic"); + pn->s_(profileFailedAttr, "varying profile values"); + pn->removeAttribute(strAttr); + } + } + } else { + TORCH_INTERNAL_ASSERT( + !pn->hasAttribute(strAttr), + "profiled attribute should have been removed when profiling is marked as failed"); } push(stack, value); }; @@ -2660,14 +3624,24 @@ void profileBool(ProfilingRecord* pr, Node* node, size_t offset) { pop(stack, value); TORCH_INTERNAL_ASSERT( value.isBool(), "profiling seeing the wrong data type"); - if (!pn->hasAttribute(boolAttr)) { - pn->i_(boolAttr, value.toBool()); + if (!pn->hasAttribute(profileFailedAttr)) { + if (!pn->hasAttribute(boolAttr)) { + pn->i_(boolAttr, value.toBool()); + } else { + auto profiled_bool = pn->i(boolAttr); + auto input_bool = value.toBool(); + if (input_bool != profiled_bool) { + TORCH_WARN( + __FUNCTION__, + " sees varying value in profiling, ignoring and this should be handled by GUARD logic"); + pn->s_(profileFailedAttr, "varying profile values"); + pn->removeAttribute(boolAttr); + } + } } else { - auto profiled_bool = pn->i(boolAttr); - auto input_bool = value.toBool(); TORCH_INTERNAL_ASSERT( - input_bool == profiled_bool, - 
"profiling ivalue doesn't support merge"); + !pn->hasAttribute(boolAttr), + "profiled attribute should have been removed when profiling is marked as failed"); } push(stack, value); }; @@ -2689,13 +3663,61 @@ void profileInt(ProfilingRecord* pr, Node* node, size_t offset) { pop(stack, value); TORCH_INTERNAL_ASSERT( value.isInt(), "profiling seeing the wrong data type"); - if (!pn->hasAttribute(intAttr)) { - pn->i_(intAttr, value.toInt()); + if (!pn->hasAttribute(profileFailedAttr)) { + if (!pn->hasAttribute(intAttr)) { + pn->i_(intAttr, value.toInt()); + } else { + auto profiled_int = pn->i(intAttr); + auto input_int = value.toInt(); + if (input_int != profiled_int) { + TORCH_WARN( + __FUNCTION__, + " sees varying value in profiling, ignoring and this should be handled by GUARD logic"); + pn->s_(profileFailedAttr, "varying profile values"); + pn->removeAttribute(intAttr); + } + } + } else { + TORCH_INTERNAL_ASSERT( + !pn->hasAttribute(intAttr), + "profiled attribute should have been removed when profiling is marked as failed"); + } + push(stack, value); + }; + + pn->setCallback(ivalue_profiler); +} + +// profile ivalue, used for optional arguments +void profileIval(ProfilingRecord* pr, Node* node, size_t offset) { + auto pn = insertProfileIValueOp(node, offset, pr); + + const auto ivalue_profiler = [pr, pn](Stack& stack) { + std::lock_guard lock(pr->mutex_); + + // TODO: we don't care about merging multiple profiling runs as we don't + // support it at all; + int64_t frame_id = 0; + pop(stack, frame_id); + IValue value; + pop(stack, value); + if (!pn->hasAttribute(profileFailedAttr)) { + if (!pn->hasAttribute(ivalAttr)) { + pn->ival_(ivalAttr, value); + } else { + auto profiled_ival = pn->ival(ivalAttr); + if (value != profiled_ival) { + TORCH_WARN( + __FUNCTION__, + " sees varying value in profiling, ignoring and this should be handled by GUARD logic"); + pn->s_(profileFailedAttr, "varying profile values"); + pn->removeAttribute(ivalAttr); + } + } } else { - auto profiled_int = pn->i(intAttr); - auto input_int = value.toInt(); TORCH_INTERNAL_ASSERT( - input_int == profiled_int, "profiling ivalue doesn't support merge"); + !pn->hasAttribute(ivalAttr), + "profiled attribute should have been removed when profiling is marked as failed"); } push(stack, value); }; @@ -2717,20 +3739,30 @@ void profileBoolList(ProfilingRecord* pr, Node* node, size_t offset) { pop(stack, value); TORCH_INTERNAL_ASSERT( value.isBoolList(), "profiling seeing the wrong data type"); - if (!pn->hasAttribute(boolListAttr)) { - auto list = value.toBoolList(); - std::vector val(list.begin(), list.end()); - pn->is_(boolListAttr, val); + if (!pn->hasAttribute(profileFailedAttr)) { + if (!pn->hasAttribute(boolListAttr)) { + auto list = value.toBoolList(); + std::vector val(list.begin(), list.end()); + pn->is_(boolListAttr, val); + } else { + auto profiled_ints = pn->is(boolListAttr); + auto input_bools = value.toBoolList(); + if (profiled_ints.size() != input_bools.size() || + !std::equal( + input_bools.begin(), + input_bools.end(), + profiled_ints.begin())) { + TORCH_WARN( + __FUNCTION__, + " sees varying value in profiling, ignoring and this should be handled by GUARD logic"); + pn->s_(profileFailedAttr, "varying profile values"); + pn->removeAttribute(boolListAttr); + } + } } else { - auto profiled_ints = pn->is(boolListAttr); - auto input_bools = value.toBoolList(); TORCH_INTERNAL_ASSERT( - profiled_ints.size() == input_bools.size() && - std::equal( - input_bools.begin(), - input_bools.end(), - profiled_ints.begin()), - 
"profiling ivalue doesn't support merge"); + !pn->hasAttribute(boolListAttr), + "profiled attribute should have been removed when profiling is marked as failed"); } push(stack, value); }; @@ -2788,6 +3820,11 @@ bool shouldProfileNode(const Node* node) { return IrParser::lookupInSymbolSet(node); } +bool skipNodeKind(const std::string& symbol_str, bool flip) { + return IrParser::querySkipSymbolSet( + c10::Symbol::fromQualString(symbol_str), flip); +} + bool insertProfileIValue(ProfilingRecord* pr, Node* node, size_t offset) { // is skip constant necessary? if (node->input(offset)->node()->kind() == prim::Constant) { @@ -2798,23 +3835,11 @@ bool insertProfileIValue(ProfilingRecord* pr, Node* node, size_t offset) { getOperatorForLiteral( "aten::dropout(Tensor input, float p, bool train) -> Tensor") ->schema(); - if (node->matches(dropout_schema)) { - switch (offset) { - // argument 2: Is training? - case 2: - profileBool(pr, node, offset); - break; - default: - return false; - } - return true; - } - static auto native_dropout_schema = getOperatorForLiteral( "aten::native_dropout(Tensor input, float p, bool? train) -> (Tensor, Tensor)") ->schema(); - if (node->matches(native_dropout_schema)) { + if (node->matches(dropout_schema) || node->matches(native_dropout_schema)) { switch (offset) { // argument 2: Is training? case 2: @@ -2830,7 +3855,11 @@ bool insertProfileIValue(ProfilingRecord* pr, Node* node, size_t offset) { getOperatorForLiteral( "aten::amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor") ->schema(); - if (node->matches(amax_schema)) { + static auto amin_schema = + getOperatorForLiteral( + "aten::amin(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor") + ->schema(); + if (node->matches(amax_schema) || node->matches(amin_schema)) { switch (offset) { // argument 1: reduction axes; case 1: @@ -2880,7 +3909,7 @@ bool insertProfileIValue(ProfilingRecord* pr, Node* node, size_t offset) { // argument 1: reduction sizes; case 1: // TODO(profile_size): double check optional[size]? 
- profileSize(pr, node, offset); + profileReductionSize(pr, node, offset); break; default: return false; @@ -2888,28 +3917,74 @@ bool insertProfileIValue(ProfilingRecord* pr, Node* node, size_t offset) { return true; } - /* - // TODO: Enable view in parser by detecting non-alias view operation - static auto view_schema = + static auto reshape_schema = + getOperatorForLiteral("aten::reshape(Tensor self, int[] shape) -> Tensor") + ->schema(); + static auto reshape_copy_schema = getOperatorForLiteral( - "aten::view(Tensor(a) self, int[] size) -> Tensor(a)") + "prim::reshape_copy(Tensor self, int[] shape) -> Tensor") ->schema(); - static auto reshape_schema = + static auto view_schema = + getOperatorForLiteral("aten::view(Tensor self, int[] size) -> Tensor") + ->schema(); + static auto view_copy_schema = getOperatorForLiteral( - "aten::reshape(Tensor(a) self, int[] shape) -> Tensor(a)") + "prim::view_copy(Tensor self, int[] size) -> Tensor") ->schema(); - if (node->matches(view_schema) || node->matches(reshape_schema)) { + if (node->matches(reshape_schema) || node->matches(reshape_copy_schema) || + node->matches(view_schema) || node->matches(view_copy_schema)) { switch (offset) { // argument 1: new tensor size; case 1: - profileSize(pr, node, offset); + profileViewSize(pr, node, offset); + break; + default: + return false; + } + return true; + } + + static auto flatten_schema1 = + getOperatorForLiteral( + "aten::flatten.using_ints(Tensor self, int start_dim=0, int end_dim=-1) -> Tensor") + ->schema(); + static auto flatten_schema2 = + getOperatorForLiteral( + "prim::flatten_copy(Tensor self, int start_dim, int end_dim) -> Tensor") + ->schema(); + if (node->matches(flatten_schema1) || node->matches(flatten_schema2)) { + switch (offset) { + // argument 1: start_dim; + // argument 2: end_dim; + case 1: + case 2: + profileInt(pr, node, offset); + break; + default: + return false; + } + return true; + } + + static auto squeeze_dim_schema = + getOperatorForLiteral( + "prim::squeeze_copy.dim(Tensor self, int dim) -> Tensor") + ->schema(); + static auto unsqueeze_schema = + getOperatorForLiteral( + "prim::unsqueeze_copy(Tensor self, int dim) -> Tensor") + ->schema(); + if (node->matches(squeeze_dim_schema) || node->matches(unsqueeze_schema)) { + switch (offset) { + // argument 1: unsqueeze dim; + case 1: + profileInt(pr, node, offset); break; default: return false; } return true; } - */ static auto batch_norm_impl_index_schema = getOperatorForLiteral( @@ -2941,6 +4016,38 @@ bool insertProfileIValue(ProfilingRecord* pr, Node* node, size_t offset) { return true; } + static auto gelu_schema = + getOperatorForLiteral( + "aten::gelu(Tensor self, *, str approximate='none') -> Tensor") + ->schema(); + if (node->matches(gelu_schema)) { + switch (offset) { + // argument 1: approximate; + case 1: + profileString(pr, node, offset); + break; + default: + return false; + } + return true; + } + + static auto gelu_backward_schema = + getOperatorForLiteral( + "aten::gelu_backward(Tensor grad_output, Tensor self, *, str approximate='none') -> Tensor") + ->schema(); + if (node->matches(gelu_backward_schema)) { + switch (offset) { + // argument 2: approximate; + case 2: + profileString(pr, node, offset); + break; + default: + return false; + } + return true; + } + static auto native_layer_norm_schema = getOperatorForLiteral( "aten::native_layer_norm(Tensor input, int[] normalized_shape, Tensor? weight, Tensor? 
bias, float eps) -> (Tensor, Tensor, Tensor)") @@ -2982,6 +4089,26 @@ bool insertProfileIValue(ProfilingRecord* pr, Node* node, size_t offset) { return true; } + static auto batch_norm_backward_schema = + getOperatorForLiteral( + "aten::native_batch_norm_backward(Tensor grad_out, Tensor input, Tensor? weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_invstd, bool train, float eps, bool[3] output_mask) -> (Tensor, Tensor, Tensor)") + ->schema(); + if (node->matches(batch_norm_backward_schema)) { + switch (offset) { + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + case 7: // argument 8: training; + profileBool(pr, node, offset); + break; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + case 9: + profileBoolList(pr, node, offset); + break; + default: + return false; + } + return true; + } + static auto native_layer_norm_backward_schema = getOperatorForLiteral( "aten::native_layer_norm_backward(Tensor grad_out, Tensor input, int[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask) -> (Tensor, Tensor, Tensor)") @@ -3015,12 +4142,39 @@ bool insertProfileIValue(ProfilingRecord* pr, Node* node, size_t offset) { } } + static auto log_softmax_data_schema = + getOperatorForLiteral( + "aten::log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor") + ->schema(); + static auto softmax_data_schema = + getOperatorForLiteral( + "aten::softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor") + ->schema(); + if (node->matches(log_softmax_data_schema) || + node->matches(softmax_data_schema)) { + switch (offset) { + case 2: + profileIval(pr, node, offset); + return true; + default: + return false; + } + } + + static auto log_softmax_backward_data_schema = + getOperatorForLiteral( + "aten::_log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor") + ->schema(); static auto softmax_backward_data_schema = getOperatorForLiteral( "aten::_softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor") ->schema(); - if (node->matches(softmax_backward_data_schema)) { + if (node->matches(log_softmax_backward_data_schema) || + node->matches(softmax_backward_data_schema)) { switch (offset) { + case 2: + profileInt(pr, node, offset); + return true; case 3: profileInt(pr, node, offset); return true; diff --git a/torch/csrc/jit/codegen/cuda/parser.h b/torch/csrc/jit/codegen/cuda/parser.h index 4b2fcf50f992..ddfbf7762742 100644 --- a/torch/csrc/jit/codegen/cuda/parser.h +++ b/torch/csrc/jit/codegen/cuda/parser.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -44,6 +44,8 @@ TORCH_CUDA_CU_API bool isElementWiseNode(const Node* node); TORCH_CUDA_CU_API bool isNodeParsible(const Node* node); TORCH_CUDA_CU_API bool shouldProfileNode(const Node* node); +TORCH_CUDA_CU_API bool skipNodeKind(const std::string& symbol_str, bool flip); + void InsertProfileNodes(ProfilingRecord* pr); // lowers PyTorch jit graph to `Fusion`. 
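// A standalone sketch (not part of the patch): every profile* callback above
// follows the same scheme: record the first observed value in a node
// attribute, and on any later mismatch warn, set profileFailedAttr, and drop
// the recorded attribute so a runtime GUARD takes over. ProfiledValue below
// is an illustrative reduction of that state machine, not an nvfuser type.
#include <iostream>
#include <optional>

template <typename T>
class ProfiledValue {
 public:
  void observe(const T& value) {
    if (failed_) {
      return;  // once failed, stay failed; no recorded attribute is kept
    }
    if (!recorded_.has_value()) {
      recorded_ = value;  // first observation: record it
    } else if (*recorded_ != value) {
      // Varying values across runs: give up on profiling this argument and
      // defer the decision to a runtime guard instead.
      failed_ = true;
      recorded_.reset();
    }
  }

  // Usable for specialization only if profiling never saw a conflict.
  std::optional<T> stableValue() const { return recorded_; }

 private:
  std::optional<T> recorded_;
  bool failed_ = false;
};

int main() {
  ProfiledValue<int> dim;
  dim.observe(1);
  dim.observe(1);
  std::cout << dim.stableValue().has_value() << "\n";  // 1: stable so far
  dim.observe(2);
  std::cout << dim.stableValue().has_value() << "\n";  // 0: varying, failed
}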
diff --git a/torch/csrc/jit/codegen/cuda/partial_split_map.cpp b/torch/csrc/jit/codegen/cuda/partial_split_map.cpp
index e7b6db4d165f..dd8fb05a0493 100644
--- a/torch/csrc/jit/codegen/cuda/partial_split_map.cpp
+++ b/torch/csrc/jit/codegen/cuda/partial_split_map.cpp
@@ -8,11 +8,10 @@ namespace fuser {
 namespace cuda {
 
 void PartialSplitMap::build(Fusion* fusion) {
-  const auto gpu_lower = GpuLower::current();
   auto used_vals = ir_utils::allTvs(fusion);
 
   for (auto tv : ir_utils::filterByType(used_vals)) {
-    auto exprs = ExprSort::getExprs(
+    auto exprs = StmtSort::getExprs(
         fusion, {tv->domain()->domain().begin(), tv->domain()->domain().end()});
     for (auto split : ir_utils::filterByType(exprs)) {
       // Only needs to check root domains as partial split is only
@@ -24,18 +23,10 @@ void PartialSplitMap::build(Fusion* fusion) {
         continue;
       }
       auto root_domain = split->in();
-      auto kir_root_domain =
-          gpu_lower->lowerValue(split->in())->as();
       auto start_offset = split->startOffset();
       start_offset_map_.insert({root_domain, start_offset});
-      kir_start_offset_map_.insert(
-          {kir_root_domain,
-           gpu_lower->lowerValue(start_offset)->as()});
       auto stop_offset = split->stopOffset();
       stop_offset_map_.insert({root_domain, stop_offset});
-      kir_stop_offset_map_.insert(
-          {kir_root_domain,
-           gpu_lower->lowerValue(stop_offset)->as()});
     }
   }
 }
@@ -49,15 +40,6 @@ Val* PartialSplitMap::getStartOffset(IterDomain* root_domain) const {
   }
 }
 
-kir::Val* PartialSplitMap::getStartOffset(kir::IterDomain* root_domain) const {
-  auto it = kir_start_offset_map_.find(root_domain);
-  if (it == kir_start_offset_map_.end()) {
-    return nullptr;
-  } else {
-    return it->second;
-  }
-}
-
 Val* PartialSplitMap::getStopOffset(IterDomain* root_domain) const {
   auto it = stop_offset_map_.find(root_domain);
   if (it == stop_offset_map_.end()) {
@@ -67,15 +49,6 @@ Val* PartialSplitMap::getStopOffset(IterDomain* root_domain) const {
   }
 }
 
-kir::Val* PartialSplitMap::getStopOffset(kir::IterDomain* root_domain) const {
-  auto it = kir_stop_offset_map_.find(root_domain);
-  if (it == kir_stop_offset_map_.end()) {
-    return nullptr;
-  } else {
-    return it->second;
-  }
-}
-
 } // namespace cuda
 } // namespace fuser
 } // namespace jit
diff --git a/torch/csrc/jit/codegen/cuda/partial_split_map.h b/torch/csrc/jit/codegen/cuda/partial_split_map.h
index be432bd5a161..8ec489915b79 100644
--- a/torch/csrc/jit/codegen/cuda/partial_split_map.h
+++ b/torch/csrc/jit/codegen/cuda/partial_split_map.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include
+#include
 
 #include
 #include
@@ -20,15 +20,11 @@ class TORCH_CUDA_CU_API PartialSplitMap {
   void build(Fusion* fusion);
 
   Val* getStartOffset(IterDomain* root_domain) const;
-  kir::Val* getStartOffset(kir::IterDomain* root_domain) const;
   Val* getStopOffset(IterDomain* root_domain) const;
-  kir::Val* getStopOffset(kir::IterDomain* root_domain) const;
 
  private:
   std::unordered_map start_offset_map_;
-  std::unordered_map kir_start_offset_map_;
   std::unordered_map stop_offset_map_;
-  std::unordered_map kir_stop_offset_map_;
 };
 
 } // namespace cuda
diff --git a/torch/csrc/jit/codegen/cuda/partition.cpp b/torch/csrc/jit/codegen/cuda/partition.cpp
index 004c836ec4ed..7e2c04b56c56 100644
--- a/torch/csrc/jit/codegen/cuda/partition.cpp
+++ b/torch/csrc/jit/codegen/cuda/partition.cpp
@@ -5,12 +5,16 @@
 #include
 #include
 #include
+#include
+#include
 
 namespace torch {
 namespace jit {
 namespace fuser {
 namespace cuda {
 
+const c10::DeviceIndex INVALID_INDEX = -2;
+
 namespace {
 
 bool hasNonElementWiseOperation(const Node* node) {
@@ -38,26 +42,109 @@ static
c10::optional getDevice(const Value* value) { // not tensor type, return false as the op is not outputing scalar. return c10::nullopt; } - return value->type()->expectRef().device(); + auto tensor_type = value->type()->expectRef(); + // special case for scalar tensor: return c10::nullopt instead of cpu device. + // this allows us to fuse scalar cpu tensor with cuda tensor, while avoid + // merging ops with pure scalar cpu tensors. + if (is_cpu_scalar(tensor_type)) { + return c10::nullopt; + } + return tensor_type.device(); +} + +static bool hasBfloat(const Node* node) { + auto has_bfloat = [](const Value* value) { + if (!value->type()->isSubtypeOf(*TensorType::get())) { + return false; + } + auto opt_scalar_type = value->type()->expectRef().scalarType(); + if (opt_scalar_type.has_value() && + opt_scalar_type.value() == at::ScalarType::BFloat16) { + return true; + } + return false; + }; + + if (std::any_of(node->inputs().begin(), node->inputs().end(), has_bfloat) || + std::any_of(node->outputs().begin(), node->outputs().end(), has_bfloat)) { + return true; + } + return false; } static c10::optional getDevice(const Node* node) { - auto outputs = node->outputs(); - for (auto output : outputs) { - auto device = getDevice(output); + c10::optional ret = c10::nullopt; + auto merge_devices = [&ret](const c10::optional& device) { if (device.has_value()) { - return device; + if (ret.has_value()) { + if (ret.value() != device.value()) { + // invalidate device to reflect conflicts + ret->set_index(INVALID_INDEX); + // return false to indicate early termination + return false; + } else { + // same device, do nothing + return true; + } + } else { + // initialize return device + ret = device.value(); + return true; + } + } + // no device information, do nothing + return true; + }; + for (auto val : node->inputs()) { + if (!merge_devices(getDevice(val))) { + return ret; + } + } + for (auto val : node->outputs()) { + if (!merge_devices(getDevice(val))) { + return ret; } } - return c10::nullopt; + return ret; } -static bool isFusibleDevice(const Node* node, const c10::Device device) { - for (auto value : node->outputs()) { - auto output_device = getDevice(value); - if (output_device.has_value() && output_device.value() != device) { - return false; - } +static bool isDeviceCompatible(const Node* node, const c10::Device& device) { + // only fuses cuda device + if (!device.is_cuda()) { + GRAPH_UPDATE("rejecting node (non-cuda device): ", *node); + return false; + } + const auto major = at::cuda::getDeviceProperties(device.index())->major; + // disable non-elementwise fusion on pre-volta devices + if (major < 7 && hasNonElementWiseOperation(node)) { + GRAPH_UPDATE( + "rejecting node (non element-wise op not supported on SM < 7X): ", + *node); + return false; + } + // disable bfloat fusion on pre-ampere devices + if (major < 8 && hasBfloat(node)) { + GRAPH_UPDATE("rejecting node (bfloat not supported on SM < 8X): ", *node); + return false; + } + return true; +} + +static bool isFusibleDevice(const Node* node, const c10::Device& device) { + TORCH_INTERNAL_ASSERT( + device.index() != INVALID_INDEX, "fusible device needs to be validate"); + auto opt_device = getDevice(node); + // we can be more relaxed here as we known that this function tries to merge + // node into an existing `device` + if (opt_device.has_value() && + (opt_device->index() == INVALID_INDEX || opt_device != device)) { + GRAPH_UPDATE( + "rejecting node from fusion (outputs device not matching fusion): ", + *node); + return false; + } + if 
(!isDeviceCompatible(node, device)) { + return false; } return true; } @@ -65,12 +152,16 @@ static bool isFusibleDevice(const Node* node, const c10::Device device) { // TODO: we need to check input type when we handle `to()` static bool isFusibleDevice(const Node* node) { auto device = getDevice(node); - if (!device.has_value()) { - return true; + // be conservative and only fuse cuda operations, this avoids us initializing + // operations that produces cpu scalar outputs + if (!device.has_value() || device->index() == INVALID_INDEX) { + return false; } - return device->is_cuda() && - (at::cuda::getDeviceProperties(device->index())->major >= 7 || - !hasNonElementWiseOperation(node)); + + if (!isDeviceCompatible(node, device.value())) { + return false; + } + return true; } bool compatibleType(const torch::jit::Value* val) { @@ -80,6 +171,15 @@ bool compatibleType(const torch::jit::Value* val) { DataType::Null) { return false; } + // Complex is disabled until its support is completely added + // TODO: remove this logic + if (isComplexType(aten_to_data_type(tensor_type->scalarType().value()))) { + return false; + } + } + // magic number 8 here since our kernel argument only supports rank <= 8 + if (tensor_type->dim().has_value() && (tensor_type->dim().value() > 8)) { + return false; } } return true; @@ -121,268 +221,35 @@ bool checkOutputTensorTypes(const Node* node) { } inline bool isFusibleNode(const Node* node) { + // Check if already part of a fusion group if (node->kind() == prim::CudaFusionGroup) return true; // Check we have a parsing rule - bool isFusible = isNodeParsible(node); - // Check if we have a tensor type it's one we support - isFusible = isFusible && checkInputTensorTypes(node); - isFusible = isFusible && checkOutputTensorTypes(node); - // Check if already part of a fusion group - return isFusible; -} - -bool maybeBroadcast( - const TensorTypePtr& type, - const std::vector>& shape) { - if (type->dim()) { - if (type->dim().value() < shape.size()) { - // no broadcast for reduction operation; - return false; - } else if (type->dim().value() > shape.size()) { - // increased rank means there is reduction; - return true; - } else { - // same rank, we need to iterate through sizes and check if size-1 - // exists in input `shape` - for (const auto& opt_size : shape) { - // TODO: not sure if we need to check for output size != 1, since we - // are currently marking all size-1 dimension as broadcast in codegen. - if (opt_size.has_value() && opt_size.value() == 1) { - return true; - } - } + if (!isNodeParsible(node)) { + // ignoring profile nodes & constant nodes to avoid noise from debugging + if (node->kind() != prim::Constant && + node->kind() != prim::profile_ivalue && node->kind() != prim::profile && + node->kind() != prim::Param) { + GRAPH_UPDATE("rejecting node from fusion (node not parsible): ", *node); } + return false; } - return false; -} - -// utility function to check if the node implies broadcast on a given shape ( -// assumed to be shape of an input tensor) -// limitations: -// 1. we rely on shape information to judge this. so we would require output -// shape to be available; -// 2. we basically compares given shape to the shape of the only output of -// the node and return true if it implies broadcast from the former to the -// latter. -bool maybeBroadcastOnShape( - const Node* n, - const std::vector>& shape) { - // TODO: we are only checking output 0. This means that our current check for - // normalization is not complete. 
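// A standalone sketch (not part of the patch): the rewritten
// getDevice(const Node*) above folds the devices of all inputs and outputs
// into a single optional, ignores CPU scalar tensors, and poisons the result
// with INVALID_INDEX as soon as two concrete devices conflict. mergeDevices
// below is an illustrative stand-in for that logic; only the INVALID_INDEX
// value itself is taken from partition.cpp.
#include <optional>
#include <vector>

constexpr int INVALID_INDEX = -2;

struct Device {
  int index;  // >= 0: a concrete GPU; INVALID_INDEX: conflicting devices
  bool operator==(const Device& other) const { return index == other.index; }
  bool operator!=(const Device& other) const { return !(*this == other); }
};

std::optional<Device> mergeDevices(
    const std::vector<std::optional<Device>>& devices) {
  std::optional<Device> merged;
  for (const auto& d : devices) {
    if (!d.has_value()) {
      continue;  // scalar CPU tensors report no device and are ignored
    }
    if (!merged.has_value()) {
      merged = d;  // first concrete device initializes the result
    } else if (*merged != *d) {
      merged->index = INVALID_INDEX;  // conflict: poison and stop merging
      break;
    }
  }
  return merged;  // nullopt means "no device information at all"
}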
- // assumes that if output is not a tensor type, it's not broadcasting - if (auto out_type = n->output(0)->type()->cast()) { - return maybeBroadcast(out_type, shape); - } - return false; -}; - -// return true if node is pointwise operation and input tensors all have -// identical shape. -bool isNonBroadcastElementWise(const Node* n) { - if (hasNonElementWiseOperation(n)) { + // Check if we have a tensor type it's one we support + if (!checkInputTensorTypes(node)) { + GRAPH_UPDATE( + "rejecting node from fusion (input scalar type not supported): ", + *node); return false; } - - for (const auto output : n->outputs()) { - const auto& n_output_type = output->type()->cast(); - - // TODO: we need to stay on safer side instead of "default to return true - // when shape information is not available.", Change that when we enable - // profiling on autodiff FW execution. - if (n_output_type != nullptr && n_output_type->sizes().sizes()) { - const std::vector>& n_output_shape = - n_output_type->sizes().sizes().value(); - - for (auto input : n->inputs()) { - if (auto t_type = input->type()->cast()) { - if (maybeBroadcast(t_type, n_output_shape)) { - return false; - } - } - } - } + if (!checkOutputTensorTypes(node)) { + GRAPH_UPDATE( + "rejecting node from fusion (output scalar type not supported): ", + *node); + return false; } - return true; } -//! [ Note - tricky broadcasting ] -//! -//! github issue # 190 -//! -//! To extend the issue further, we consider two difficult broadcasting cases -//! that is difficult to naively schedule: -//! scenario 1: single tensor with multiple broadcasting semantics; -//! ``` -//! %t = op(...) -//! %t0_o = op0(%t, %t0) -//! %t1_o = op1(%t, %t1) -//! ``` -//! It's hard to check/validate whether `%t0` and `%t1` implies -//! identical broadcasting for `%t` so that we can simply -//! broadcast it to their common shape and use the broadcasted -//! tensor view in both `op0` and `op1`; or, if `%t0` and `%t1` -//! has different shapes, we would need differently broadcasted -//! `%t` for the two ops. Even with this condition sorted out, -//! scheduling is challenging. As we cannot inline the computation -//! of `%t` to the downstream consumer of `%t0_o` and `%t1_o` -//! easily, because `computeAt` could propagate contradicting -//! transformations on the common ancestor `%t`. See footnote*; -//! scenario 2: output tensor_view which is broadcasted later; -//! ``` -//! %t = op(...) -//! %t0_o = op0(%t, %t0) -//! return (%t, %t0_o) -//! ``` -//! Similarly, if we need to broadcast `%t` to `%t0` for `op0`, -//! and use it as output, it also complicates schedule. -//! -//! Currently we just avoid the two cases in our graph partitioning. -//! -//! We bake the implementation along with our partition, where we merge nodes -//! from producer to consumer. In the example down, we list all "type"s of edges -//! among producer/consumer and the out side world. -//! -//! %input_t0, %input_t1, %input_t2 # inputs from outside world feeding -//! # producer/consumer pair -//! %p_out_t0, %p_out_t1 = producer(%input_t0, %input_t1) -//! %c_out_t, ... = consumer(%input_t0, %input_t2, %p_out_t0) -//! -//! producer/consumer : the nodes that we are trying to merge, each node could -//! be -//! a parsible real operation or a `CudaFusionGroup`. -//! %input_t0 : inputs shared by both producer & consumer -//! %input_t1 : inputs feed only to producer, but not to consumer -//! %input_t2 : inputs feed only to consumer, but not to producer -//! %p_put_t0 : outputs of producer that is fed to consumer -//! 
%p_put_t1 : outputs of producer that is not fed to consumer -//! %c_put_t0 : outputs of consumer -//! -//! We can see that after merging consumer & producer, we will have: -//! %input_t0, %input_t1, %input_t2 # inputs from outside world feeding -//! # producer/consumer pair -//! %p_out_t, %c_out_t = group(%input_t0, %input_t1, %input_t2) -//! -//! Under the assumption that any existing `CudaFusionGroup` does not have -//! violating broadcasting semantics mentioned above. -//! -//! If we examine the `group`, new cases of scenario 1 (multiple broadcast) -//! could only be created by merging new edges in the new `group`, that is: -//! case 1. `%input_t0`, shared by `producer` and `consumer` -//! case 2. `%p_out_t0`, produced by `producer` and fed to `consumer` -//! -//! new cases of scenario 2 (output was broadcasted later) could only be added -//! via: -//! case 3. `%p_out_t0`, produced by `producer` and fed to `consumer`, which -//! could be broadcasted in the consumer subgraph. -//! -//! footnote*: -//! We are only disabling multiple broadcast right on the tensor, instead of -//! tracing all the broadcast further down. -//! I don't think we need to worry about broadcasting further down the -//! dependency chain, as those would create new IterDomain, which doesn't have -//! th problem of conflicting broadcasting. -bool createTrickyBroadcast(const Node* consumer, const Node* producer) { - auto count_broadcasting_in_node = - [](const Node* node, - const std::vector>& shape, - size_t offset) { - int num_broadcasting = 0; - if (node->kind() == prim::CudaFusionGroup) { - // be careful here as `subgraph_input`, as its name suggests, is in a - // different fraph from `node`. - const auto& subgraph_input = - node->g(attr::Subgraph)->inputs()[offset]; - for (const auto& use : subgraph_input->uses()) { - if (maybeBroadcastOnShape(use.user, shape)) { - num_broadcasting++; - } - } - } else { - if (maybeBroadcastOnShape(node, shape)) { - num_broadcasting++; - } - } - return num_broadcasting; - }; - - // case 1. We check shared inputs to `producer` & `consumer`; - for (const auto i : c10::irange(producer->inputs().size())) { - auto n_input = producer->input(i); - auto n_input_type = n_input->type()->cast(); - if (n_input_type != nullptr && n_input_type->sizes().sizes()) { - std::vector> n_input_shape = - n_input_type->sizes().sizes().value(); - int num_broadcasting = 0; - - // check broadcasting for the n_input inside `consumer`; - for (const auto& use : n_input->uses()) { - if (use.user == consumer) { - num_broadcasting += - count_broadcasting_in_node(consumer, n_input_shape, use.offset); - } - } - - // if no broadcasting happened for consumer, there's no point check - // multiple broadcasting in producer alone; - if (num_broadcasting == 0) { - continue; - } - - // check broadcasting for n_input inside `producer`; - num_broadcasting += - count_broadcasting_in_node(producer, n_input_shape, i); - - // encounted multiple broadcasting scheme for a single TV, we will not be - // able to schedule this, prevent the fusion; (case 1) - if (num_broadcasting > 1) { - return true; - } - } - } - - // case 2. 
We check input to `consumer` that is also the output from - // `producer` - for (const auto i : c10::irange(producer->outputs().size())) { - auto n_output = producer->output(i); - auto n_output_type = n_output->type()->cast(); - if (n_output_type != nullptr && n_output_type->sizes().sizes()) { - std::vector> n_output_shape = - n_output_type->sizes().sizes().value(); - int num_broadcasting = 0; - // If we only look at case 1 & case 2, we need to check broadcast of - // `n_output` inside `producer`, if it is a `prim::CudaFusionGroup`. - // this is actually not necessary when we consider case 3, as we avoid - // broadcasting on outputs already; - - // TODO: merge this code with case 1. - // check broadcasting for the n_output inside `consumer`; - bool use_as_output = false; - for (const auto& use : n_output->uses()) { - if (use.user == consumer) { - num_broadcasting += - count_broadcasting_in_node(consumer, n_output_shape, use.offset); - } else { - // case 3. output is used by other nodes not the consumer, no - // broadcasting is allowed; - use_as_output = true; - } - } - - // encounted multiple broadcasting scheme for a single TV, we will not be - // able to schedule this, prevent the fusion; (case 2) - // Alternatively, if use_as_output is true, we would not permit broadcast - // at all. (case 3) - if (num_broadcasting > (use_as_output ? 0 : 1)) { - return true; - } - } - } - - return false; -} - } // namespace bool isFusibleCudaFusionGroup(const Node* node) { @@ -400,7 +267,7 @@ bool isFusibleCudaFusionGroup(const Node* fusion, const Node* node) { bool fused = false; // TODO: lift the restriction of not fusing producer containing reduction when // we have proper scheduling. - if (isFusibleCudaFusionGroup(node)) { + if (isFusibleNode(node)) { // ensure if the node has a designated device, it's on the same device with // fusion. 
// TODO: is there a danger of us fusing operations that's supposed to be on @@ -408,7 +275,6 @@ bool isFusibleCudaFusionGroup(const Node* fusion, const Node* node) { auto device = getDevice(fusion); fused = (!device.has_value() || isFusibleDevice(node, device.value())); } - return fused; } diff --git a/torch/csrc/jit/codegen/cuda/partition.h b/torch/csrc/jit/codegen/cuda/partition.h index 0d8baca47007..b295cb582e57 100644 --- a/torch/csrc/jit/codegen/cuda/partition.h +++ b/torch/csrc/jit/codegen/cuda/partition.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include /* diff --git a/torch/csrc/jit/codegen/cuda/predicate_compute.cpp b/torch/csrc/jit/codegen/cuda/predicate_compute.cpp index b501a6133f60..9cafd20c7010 100644 --- a/torch/csrc/jit/codegen/cuda/predicate_compute.cpp +++ b/torch/csrc/jit/codegen/cuda/predicate_compute.cpp @@ -6,8 +6,6 @@ #include #include #include -#include -#include #include #include @@ -20,27 +18,24 @@ namespace cuda { namespace { -bool isTensorIndexOp(kir::Expr* expr) { +bool isTensorIndexOp(Expr* expr) { const auto& outputs = expr->outputs(); return outputs.size() >= 1 && outputs[0]->isA(); } -bool isOutputLocal(const kir::Expr* expr) { +bool isOutputLocal(const Expr* expr) { return std::all_of( - expr->outputs().begin(), - expr->outputs().end(), - [](const kir::Val* output) { - return !output->isA() || - output->as()->memoryType() == MemoryType::Local; + expr->outputs().begin(), expr->outputs().end(), [](const Val* output) { + return !output->isA() || + output->as()->getMemoryType() == MemoryType::Local; }); } } // namespace -bool ParallelizedDomainPredicate::PredicateInfo::addDomain( - kir::IterDomain* id) { - const auto gpu_lower = GpuLower::current(); - auto concrete_id = gpu_lower->caIndexMap().getConcreteMappedID(id); +bool ParallelizedDomainPredicate::PredicateInfo::addDomain(IterDomain* id) { + auto concrete_id = GpuLower::current()->caMap()->getConcreteMappedID( + id, IdMappingMode::EXACT); if (std::find(ids_.begin(), ids_.end(), concrete_id) == ids_.end()) { ids_.push_back(concrete_id); return true; @@ -49,21 +44,20 @@ bool ParallelizedDomainPredicate::PredicateInfo::addDomain( } } -kir::Bool* ParallelizedDomainPredicate::PredicateInfo::getPredicate() const { - const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); - - kir::Bool* pred = nullptr; +Bool* ParallelizedDomainPredicate::PredicateInfo::getPredicate() const { + Bool* pred = nullptr; - auto index = - ir_builder.create(stringifyThread(pt_), DataType::Int); + auto index = SimplifyingIrBuilder::create( + stringifyThread(pt_), DataType::Int); for (const auto& pred_id : ids()) { // Just sanity check that pred_id is concrete TORCH_INTERNAL_ASSERT( - pred_id == gpu_lower->caIndexMap().getConcreteMappedID(pred_id)); - auto new_pred = ir_builder.ltExpr(index, pred_id->extent()); - pred = ir_builder.andExpr(pred, new_pred)->as(); + pred_id == + GpuLower::current()->caMap()->getConcreteMappedID( + pred_id, IdMappingMode::EXACT)); + auto new_pred = SimplifyingIrBuilder::ltExpr(index, pred_id->extent()); + pred = SimplifyingIrBuilder::andExpr(pred, new_pred)->as(); } return pred; @@ -74,16 +68,12 @@ namespace { std::unordered_set getNonUnswitchedRootDomains( const std::vector& loops, size_t unswitched_loop_index) { - const auto gpu_lower = GpuLower::current(); - std::vector non_unswited_leaf_domains; std::transform( loops.begin(), loops.begin() + unswitched_loop_index, std::back_inserter(non_unswited_leaf_domains), - [&](kir::ForLoop* loop) { - return 
gpu_lower->caIndexMap().toFusion(loop->iter_domain()); - }); + [&](kir::ForLoop* loop) { return loop->iter_domain(); }); auto non_unswitched_inputs = IterVisitor::getInputsTo(non_unswited_leaf_domains); @@ -100,26 +90,25 @@ std::unordered_set getNonUnswitchedRootDomains( non_unswitched_concrete_root_domains, non_unswitched_concrete_root_domains.end()), [&](auto root_dom) { - return gpu_lower->caIndexMap().getConcreteMappedID(root_dom); + return GpuLower::current()->caMap()->getConcreteMappedID( + root_dom, IdMappingMode::EXACT); }); return non_unswitched_concrete_root_domains; } bool isFullyUnswitched( - kir::IterDomain* loop_id, + IterDomain* loop_id, const std::unordered_set& non_unswitched_root_domains) { - const auto gpu_lower = GpuLower::current(); - - auto root_vals = - IterVisitor::getInputsTo({gpu_lower->caIndexMap().toFusion(loop_id)}); + auto root_vals = IterVisitor::getInputsTo({loop_id}); auto root_domains = ir_utils::filterByType(root_vals); return std::none_of( root_domains.begin(), root_domains.end(), [&](auto root_dom) { auto concrete_root_dom = - gpu_lower->caIndexMap().getConcreteMappedID(root_dom); + GpuLower::current()->caMap()->getConcreteMappedID( + root_dom, IdMappingMode::EXACT); return non_unswitched_root_domains.count(concrete_root_dom) > 0; }); } @@ -131,12 +120,10 @@ std::unordered_map< ParallelizedDomainPredicate::PredicateInfo, TypeHash> ParallelizedDomainPredicate::getPredicateMap( - const kir::Expr* expr, + const Expr* expr, const std::vector& loops, kir::ForLoop* unswitched_loop) { const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - auto output_tvs = ir_utils::getTvs(expr->outputs()); if (output_tvs.empty()) { @@ -167,7 +154,7 @@ ParallelizedDomainPredicate::getPredicateMap( } auto loop_id = loop->iter_domain(); - auto loop_ptype = loop_id->parallelType(); + auto loop_ptype = loop_id->getParallelType(); // Not necessary to add a predicate if the paralle type is exact if (!isParallelTypeThread(loop_ptype) || @@ -187,13 +174,14 @@ ParallelizedDomainPredicate::getPredicateMap( tv->domain()->domain().begin(), tv->domain()->domain().end(), [&](auto tv_id) { - return gpu_lower->caIndexMap().areMapped(loop_id, tv_id); + return gpu_lower->caMap()->areMapped( + loop_id, tv_id, IdMappingMode::EXACT); }); if (it == tv->domain()->domain().end()) { continue; } - kir::IterDomain* tv_id = *it; + IterDomain* tv_id = *it; // If the corresponding domain is a broadcast, it's not really used. if (tv_id->isBroadcast()) { @@ -203,9 +191,9 @@ ParallelizedDomainPredicate::getPredicateMap( // If it's a root domain, it should be covered by the root // predicates, so no extra predicate is required. 
if (std::find( - tv->domain()->rootDomain().begin(), - tv->domain()->rootDomain().end(), - tv_id) != tv->domain()->rootDomain().end()) { + tv->domain()->getRootDomain().begin(), + tv->domain()->getRootDomain().end(), + tv_id) != tv->domain()->getRootDomain().end()) { continue; } @@ -218,29 +206,24 @@ ParallelizedDomainPredicate::getPredicateMap( return map; } -kir::Bool* ParallelizedDomainPredicate::getPredicate( - const kir::Expr* expr, +Bool* ParallelizedDomainPredicate::getPredicate( + const Expr* expr, const std::vector& loops) { - kir::SimplifyingIrBuilder ir_builder(GpuLower::current()->kernel()); - auto pred_map = getPredicateMap(expr, loops); - kir::Val* pred = ir_builder.trueVal(); + Val* pred = GpuLower::current()->kernel()->trueVal(); for (auto pt : kParallelTypeThreads) { auto pred_info_it = pred_map.find(pt); if (pred_info_it != pred_map.end()) { const auto& pred_info = pred_info_it->second; auto tid_pred = pred_info.getPredicate(); - pred = ir_builder.andExpr(pred, tid_pred); + pred = SimplifyingIrBuilder::andExpr(pred, tid_pred); } } - if (pred) { - return pred->as(); - } else { - return nullptr; - } + TORCH_INTERNAL_ASSERT(pred != nullptr); + return pred->as(); } UnswitchPredicateKey::UnswitchPredicateKey() @@ -256,61 +239,55 @@ UnswitchPredicateKey::UnswitchPredicateKey() // concrete domains are used to uniquely collect all necessary // unswitch predicates. UnswitchPredicateKey::UnswitchPredicateKey( - IterDomain* predicated_concrete_id, - const ReferenceTensor& reference) + IterDomain* predicated_consumer_id, + TensorView* consumer_tv, + IterDomain* predicated_concrete_id) : predicated_concrete_id_(predicated_concrete_id) { // Initialize the parallelized domain map for (auto pt : kParallelTypeThreads) { parallel_concrete_ids_.insert({pt, nullptr}); } - // The id parameter is a concrete domain. Needs to find the - // corresponding reference domain to find leaf domains that are - // parallelized. 
- IterDomain* predicated_ref_id = - reference.concrete_to_id.at(predicated_concrete_id_); - TensorDomain* ref_td = reference.domain; - - std::vector all_parallelized_ref_leaf_ids; + std::vector all_parallelized_consumer_leaf_ids; std::copy_if( - ref_td->domain().begin(), - ref_td->domain().end(), - std::back_inserter(all_parallelized_ref_leaf_ids), + consumer_tv->domain()->domain().begin(), + consumer_tv->domain()->domain().end(), + std::back_inserter(all_parallelized_consumer_leaf_ids), [](IterDomain* x) { return isParallelTypeThread(x->getParallelType()); }); - // If the reference is not parallelized at all, no need to + // If the consumer domais are not parallelized at all, no need to // differentiate keys based on how the predicated id is parallelized - if (all_parallelized_ref_leaf_ids.empty()) { + if (all_parallelized_consumer_leaf_ids.empty()) { return; } - // All domains that are parallelized descendants of predicated_ref_id - auto all_parallelized_ref_ids = DependencyCheck::getAllValsBetween( - {predicated_ref_id}, all_parallelized_ref_leaf_ids); + // All domains that are parallelized descendants of predicated_consumer_id + auto all_parallelized_consumer_ids = DependencyCheck::getAllValsBetween( + {predicated_consumer_id}, all_parallelized_consumer_leaf_ids); // Just pick leaf domains - std::vector parallelized_ref_leaf_ids; + std::vector parallelized_consumer_leaf_ids; std::copy_if( - ref_td->domain().begin(), - ref_td->domain().end(), - std::back_inserter(parallelized_ref_leaf_ids), + consumer_tv->domain()->domain().begin(), + consumer_tv->domain()->domain().end(), + std::back_inserter(parallelized_consumer_leaf_ids), [&](IterDomain* x) { return std::find( - all_parallelized_ref_ids.begin(), - all_parallelized_ref_ids.end(), - x) != all_parallelized_ref_ids.end(); + all_parallelized_consumer_ids.begin(), + all_parallelized_consumer_ids.end(), + x) != all_parallelized_consumer_ids.end(); }); - if (parallelized_ref_leaf_ids.empty()) { - // None of the parallelized leaf domains are derived from predicated_ref_id + if (parallelized_consumer_leaf_ids.empty()) { + // None of the parallelized leaf domains are derived from + // predicated_consumer_id return; } // Find the corresponding concrete id for each parallel type - for (auto ref_leaf : parallelized_ref_leaf_ids) { - auto pt = ref_leaf->getParallelType(); - auto it = reference.id_to_concrete.find(ref_leaf); - TORCH_INTERNAL_ASSERT(it != reference.id_to_concrete.end()); - auto concrete_leaf = it->second; + for (auto consumer_leaf : parallelized_consumer_leaf_ids) { + auto pt = consumer_leaf->getParallelType(); + auto concrete_leaf = GpuLower::current()->caMap()->getConcreteMappedID( + consumer_leaf, IdMappingMode::EXACT); parallel_concrete_ids_.at(pt) = concrete_leaf; } } @@ -344,19 +321,18 @@ std::size_t UnswitchPredicateKeyHash::operator()( return h; }; -kir::Bool* PredicateCompute::getInlinePredicate( - const kir::Expr* expr, +Bool* PredicateCompute::getInlinePredicate( + const Expr* expr, const std::vector& loops, - kir::Bool* thread_pred, + Bool* thread_pred, PredicateType pred_type) { FUSER_PERF_SCOPE("GpuLower::Lower::getInlinePredicate"); const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); // If outputs are registers, no need to predicate for threads if (isOutputLocal(expr)) { - thread_pred = ir_builder.trueVal(); + thread_pred = gpu_lower->kernel()->trueVal(); } if (loops.empty()) { @@ -364,11 +340,18 @@ kir::Bool* PredicateCompute::getInlinePredicate( return 
thread_pred; } - auto out_tv = ir_utils::getTVOutput(expr); - TORCH_INTERNAL_ASSERT(out_tv != nullptr, "Missing kir::TensorView output"); + auto out_tv = ir_utils::getTvOutput(expr); + TORCH_INTERNAL_ASSERT(out_tv != nullptr, "Missing TensorView output"); + + // Predicates for non-exact parallel dimensions must be used even + // when PredicateElimination::canOmitPredicate is true. + auto parallel_dom_pred = + ParallelizedDomainPredicate::getPredicate(expr, loops); + TORCH_INTERNAL_ASSERT(parallel_dom_pred != nullptr); if (gpu_lower->predicateElimination().canOmitPredicate(expr)) { - return thread_pred; + return SimplifyingIrBuilder::andExpr(thread_pred, parallel_dom_pred) + ->as(); } auto pred_info_vec = @@ -376,7 +359,7 @@ kir::Bool* PredicateCompute::getInlinePredicate( out_tv, loops, nullptr, pred_type == PredicateType::Padding) .first; - std::vector preds; + std::vector preds; // When pred_type is ReductionWrite, filter out predicates for // reduction axes. For blockReduce, this is necessary when reduction @@ -388,7 +371,7 @@ kir::Bool* PredicateCompute::getInlinePredicate( bool non_zero_start_found = false; for (const auto& pred_info : pred_info_vec) { if (pred_type == PredicateType::ReductionWrite) { - const auto& consumer_ids = pred_info.consumerIds(); + const auto& consumer_ids = pred_info.rootIds(); bool pred_for_reduction_axis = false; for (auto consumer_id : consumer_ids) { if (consumer_id->isReduction()) { @@ -404,64 +387,52 @@ kir::Bool* PredicateCompute::getInlinePredicate( continue; } } - for (auto pred : pred_info.startPredicates()) { - TORCH_INTERNAL_ASSERT(pred != nullptr); - preds.push_back(pred); - } - for (auto pred : pred_info.stopPredicates()) { - TORCH_INTERNAL_ASSERT(pred != nullptr); - preds.push_back(pred); - } + preds.push_back(pred_info.startPredicate()); + preds.push_back(pred_info.stopPredicate()); } // When generating a predicate for blockReduce writes and not for // gridReduce, if all reduction axes start with zero, we can just // use the same predicate for reads. nullptr is returned then. 
if (pred_type == PredicateType::ReductionWrite && !non_zero_start_found && - !out_tv->fuserTv()->domain()->hasGridReduction()) { + !out_tv->domain()->hasGridReduction()) { return nullptr; } - auto parallel_dom_pred = - ParallelizedDomainPredicate::getPredicate(expr, loops); - if (parallel_dom_pred) { - preds.push_back(parallel_dom_pred); - } + preds.push_back(parallel_dom_pred); if (thread_pred != nullptr) { preds.push_back(thread_pred); } if (preds.empty()) { - return ir_builder.trueVal(); + return GpuLower::current()->kernel()->trueVal(); } - kir::Val* cond = preds[0]; + Val* cond = preds[0]; for (const auto i : c10::irange(1, preds.size())) { - cond = ir_builder.andExpr(cond, preds[i]); + cond = SimplifyingIrBuilder::andExpr(cond, preds[i]); } - return cond->as(); + return cond->as(); } -kir::Bool* UnswitchPredicate::get( +Bool* UnswitchPredicate::get( const std::vector& outer_loops, kir::ForLoop* unrolled_loop) { FUSER_PERF_SCOPE("GpuLower::Lower::UnswitchPredicate::get"); - kir::SimplifyingIrBuilder ir_builder(GpuLower::current()->kernel()); - UnswitchPredicate up(outer_loops, unrolled_loop); - kir::Val* unswitch_pred = ir_builder.trueVal(); + Val* unswitch_pred = GpuLower::current()->kernel()->trueVal(); for (auto pred : up.predicates_) { - unswitch_pred = ir_builder.andExpr(unswitch_pred, pred); + unswitch_pred = SimplifyingIrBuilder::andExpr(unswitch_pred, pred); } - return unswitch_pred->as(); + return unswitch_pred->as(); } -void UnswitchPredicate::predicateOn(kir::Expr* tv_expr) { +void UnswitchPredicate::predicateOn(Expr* tv_expr) { FUSER_PERF_SCOPE("GpuLower::Lower::UnswitchPredicate::predicateOn"); if (for_loops_.empty()) { @@ -469,18 +440,16 @@ void UnswitchPredicate::predicateOn(kir::Expr* tv_expr) { } const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - if (gpu_lower->predicateElimination().canOmitPredicate(tv_expr)) { + addParallelizedDomainPredicates(tv_expr); return; } - auto out_tv = ir_utils::getTVOutput(tv_expr); - TORCH_INTERNAL_ASSERT(out_tv != nullptr, "Missing kir::TensorView output"); + auto out_tv = ir_utils::getTvOutput(tv_expr); + TORCH_INTERNAL_ASSERT(out_tv != nullptr, "Missing TensorView output"); auto ref_pred_info = Index::getReferenceRootPredicates( out_tv, for_loops_, unrolled_loop_, false); - const ReferenceTensor& reference = ref_pred_info.second; // If RootPredicateInfo has a static predicate that is more // restrictive than the current one, replace the current with the @@ -491,10 +460,8 @@ void UnswitchPredicate::predicateOn(kir::Expr* tv_expr) { // predicates are generated in the finalize function. 
for (const auto& pred_info : ref_pred_info.first) { - if (pred_info.startPredicates().empty() && - pred_info.stopPredicates().empty()) { - continue; - } + TORCH_INTERNAL_ASSERT(pred_info.startPredicate() != nullptr); + TORCH_INTERNAL_ASSERT(pred_info.stopPredicate() != nullptr); const auto& root_ids = pred_info.rootIds(); @@ -505,13 +472,14 @@ void UnswitchPredicate::predicateOn(kir::Expr* tv_expr) { bool first_key_set = false; for (auto root_id : root_ids) { - auto kir_root_id = gpu_lower->lowerValue(root_id)->as(); + auto concrete_root_id = gpu_lower->caMap()->getConcreteMappedID( + root_id, IdMappingMode::EXACT); - if (kir_root_id->isBroadcast()) { + if (root_id->isBroadcast()) { continue; } - UnswitchPredicateKey key(root_id, reference); + UnswitchPredicateKey key(root_id, out_tv, concrete_root_id); auto inserted = predicated_keys_.insert(key).second; add_pred = add_pred || inserted; @@ -573,20 +541,23 @@ void UnswitchPredicate::predicateOn(kir::Expr* tv_expr) { // start and stop offsets. if (merged_pred_it != pending_predicates_.end()) { mergeUnswitchPredicateOffsets( - pred_info.startPredicates(), - pred_info.startOffsets(), + pred_info.startPredicate(), + pred_info.startOffset(), merged_pred_it->start, true); mergeUnswitchPredicateOffsets( - pred_info.stopPredicates(), - pred_info.stopOffsets(), + pred_info.stopPredicate(), + pred_info.stopOffset(), merged_pred_it->stop, false); } } - // Adds new predicates for parallelized domains + addParallelizedDomainPredicates(tv_expr); +} + +void UnswitchPredicate::addParallelizedDomainPredicates(Expr* tv_expr) { auto pred_map = ParallelizedDomainPredicate::getPredicateMap( tv_expr, for_loops_, unrolled_loop_); for (auto pt : kParallelTypeThreads) { @@ -613,7 +584,7 @@ void UnswitchPredicate::openLoop(kir::ForLoop* fl) { for_loops_.push_back(fl); for (auto expr : fl->body().exprs()) { - if (ir_utils::isTVOp(expr) || isTensorIndexOp(expr)) { + if (ir_utils::isTvOp(expr) || isTensorIndexOp(expr)) { predicateOn(expr); } else if (auto ite = dynamic_cast(expr)) { openIte(ite); @@ -630,7 +601,7 @@ void UnswitchPredicate::openIte(kir::IfThenElse* ite) { // only expand the ite thenBody for (auto expr : ite->thenBody().exprs()) { - if (ir_utils::isTVOp(expr) || isTensorIndexOp(expr)) { + if (ir_utils::isTvOp(expr) || isTensorIndexOp(expr)) { predicateOn(expr); } else if (auto ite = dynamic_cast(expr)) { openIte(ite); @@ -641,7 +612,6 @@ void UnswitchPredicate::openIte(kir::IfThenElse* ite) { } void UnswitchPredicate::finalize() { - kir::SimplifyingIrBuilder ir_builder(GpuLower::current()->kernel()); for (const auto& merged_pred : pending_predicates_) { const auto& start_info = merged_pred.start; if (start_info.static_pred) { @@ -661,12 +631,10 @@ void UnswitchPredicate::finalize() { } void UnswitchPredicate::mergeUnswitchPredicateOffsets( - const std::vector& predicates, - const std::vector& offsets, + Bool* predicate, + Val* offset, MergedPredicates::Info& merged_predicate_info, bool is_start) { - TORCH_INTERNAL_ASSERT(predicates.size() == offsets.size()); - auto is_more_restrictive = [&is_start](int64_t new_val, int64_t current_val) { if (is_start) { return new_val < current_val; @@ -675,25 +643,21 @@ void UnswitchPredicate::mergeUnswitchPredicateOffsets( } }; - for (const auto i : c10::irange(predicates.size())) { - auto pred = predicates.at(i); - auto offset = offsets.at(i); - auto offset_int = dynamic_cast(offset); - // If it's a static predicate, replace the current one if it's - // more restrictive. 
If it's dynamic, just adds it to the dynamic - // predicate list. - if (offset_int && offset_int->isConst()) { - auto offset_const = offset_int->value().value(); - auto& static_pred = merged_predicate_info.static_pred; - auto& static_offset = merged_predicate_info.static_offset; - if (static_pred == nullptr || - is_more_restrictive(offset_const, static_offset)) { - static_pred = pred; - static_offset = offset_const; - } - } else { - merged_predicate_info.dynamic_preds.push_back(pred); + auto offset_int = dynamic_cast(offset); + // If it's a static predicate, replace the current one if it's + // more restrictive. If it's dynamic, just adds it to the dynamic + // predicate list. + if (offset_int && offset_int->isConst()) { + auto offset_const = offset_int->value().value(); + auto& static_pred = merged_predicate_info.static_pred; + auto& static_offset = merged_predicate_info.static_offset; + if (static_pred == nullptr || + is_more_restrictive(offset_const, static_offset)) { + static_pred = predicate; + static_offset = offset_const; } + } else { + merged_predicate_info.dynamic_preds.push_back(predicate); } } diff --git a/torch/csrc/jit/codegen/cuda/predicate_compute.h b/torch/csrc/jit/codegen/cuda/predicate_compute.h index 989bffb3bd18..6cf3609d3151 100644 --- a/torch/csrc/jit/codegen/cuda/predicate_compute.h +++ b/torch/csrc/jit/codegen/cuda/predicate_compute.h @@ -16,10 +16,10 @@ class PredicateCompute { // ignore_internal_syncthread_ops will prevent creation of predicates on // block/grid broadcast/reduce as these have syncthread calls within them // so all threads need to execute the function. - static kir::Bool* getInlinePredicate( - const kir::Expr* expr, + static Bool* getInlinePredicate( + const Expr* expr, const std::vector& loops, - kir::Bool* thread_pred, + Bool* thread_pred, PredicateType pred_type); }; @@ -40,31 +40,31 @@ class ParallelizedDomainPredicate { explicit PredicateInfo(ParallelType pt) : pt_(pt) {} //! Adds a domain that is parallized by the same paralell type - bool addDomain(kir::IterDomain* id); + bool addDomain(IterDomain* id); - const std::vector& ids() const { + const std::vector& ids() const { return ids_; } //! Generates a predicate Val from predicate information - kir::Bool* getPredicate() const; + Bool* getPredicate() const; private: ParallelType pt_; //! Domains parallelized by the same parallel type - std::vector ids_; + std::vector ids_; }; //! Returns a predicate Val for parallelied domains of an expression. - static kir::Bool* getPredicate( - const kir::Expr* expr, + static Bool* getPredicate( + const Expr* expr, const std::vector& loops); //! Returns predicate information for parallelied domains of an //! expression. static std::unordered_map getPredicateMap( - const kir::Expr* expr, + const Expr* expr, const std::vector& loops, kir::ForLoop* unswitched_loop = nullptr); }; @@ -80,8 +80,9 @@ class UnswitchPredicateKey { UnswitchPredicateKey(); UnswitchPredicateKey( - IterDomain* predicated_concrete_id, - const ReferenceTensor& reference); + IterDomain* predicated_consumer_id, + TensorView* consumer_tv, + IterDomain* predicated_concrete_id); bool operator==(const UnswitchPredicateKey& other) const { return predicated_concrete_id_ == other.predicated_concrete_id_ && @@ -121,7 +122,7 @@ struct UnswitchPredicateKeyHash { class TORCH_CUDA_CU_API UnswitchPredicate { public: - static kir::Bool* get( + static Bool* get( const std::vector& outer_loops, kir::ForLoop* unrolled_loop); @@ -132,11 +133,11 @@ class TORCH_CUDA_CU_API UnswitchPredicate { struct Info { //! 
Most restrictive static predicate. Nullptr if no static //! predicate found. - kir::Bool* static_pred = nullptr; + Bool* static_pred = nullptr; //! The offset value of static_pred int64_t static_offset = 0; //! List of dynamic predicates. - std::vector dynamic_preds; + std::vector dynamic_preds; }; UnswitchPredicateKey predicate_key; Info start; @@ -147,7 +148,7 @@ class TORCH_CUDA_CU_API UnswitchPredicate { std::vector outer_loops, kir::ForLoop* unrolled_loop); - void predicateOn(kir::Expr*); + void predicateOn(Expr*); void openLoop(kir::ForLoop*); @@ -160,11 +161,14 @@ class TORCH_CUDA_CU_API UnswitchPredicate { //! static, only pick the most restrictive one, e.g., the one with the //! minimum offset for the start predication. void mergeUnswitchPredicateOffsets( - const std::vector& predicates, - const std::vector& offsets, + Bool* predicate, + Val* offset, MergedPredicates::Info& merged_predicate_info, bool is_start); + //! Adds new predicates for parallelized domains + void addParallelizedDomainPredicates(Expr*); + private: //! Track which iter domains have been predicated std::unordered_set @@ -181,7 +185,7 @@ class TORCH_CUDA_CU_API UnswitchPredicate { parallelized_dom_predicates_; //! The predicates that have been generated. - std::vector predicates_; + std::vector predicates_; std::vector for_loops_; diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/examples/double_half_cast.py b/torch/csrc/jit/codegen/cuda/python_frontend/examples/double_half_cast.py new file mode 100644 index 000000000000..fbd85fa197e8 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/python_frontend/examples/double_half_cast.py @@ -0,0 +1,33 @@ +import torch + +from torch._C._nvfuser import Fusion, FusionDefinition, DataType + +# Construct and Define Fusion +fusion = Fusion() + +with FusionDefinition(fusion) as fd : + t0 = fd.define_tensor(2, DataType.Double) + t1 = fd.define_tensor(2, DataType.Double) + + fd.add_input(t0) + fd.add_input(t1) + + t0h = fd.Ops.cast(DataType.Half, t0) + t1h = fd.Ops.cast(DataType.Half, t1) + t2 = fd.Ops.add(t0h, t1h) + t3 = fd.Ops.relu(t2) + + fd.add_output(t3) + +fusion.print_ir() + +# Execute Fusion +input1 = torch.ones(2, 4, device='cuda', dtype=torch.float64) +input2 = torch.ones(2, 4, device='cuda', dtype=torch.float64) + +# Kernel compilation should be cached for the 2nd iteration +# with input tensors of the same shape +for _ in range(5) : + outputs = fusion.execute([input1, input2]) + +print(outputs[0]) diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/examples/half_double_cast.py b/torch/csrc/jit/codegen/cuda/python_frontend/examples/half_double_cast.py new file mode 100644 index 000000000000..faa71fbba8ac --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/python_frontend/examples/half_double_cast.py @@ -0,0 +1,31 @@ +import torch + +from torch._C._nvfuser import Fusion, FusionDefinition, DataType + +# Construct and Define Fusion +fusion = Fusion() + +with FusionDefinition(fusion) as fd : + t0 = fd.define_tensor(2, DataType.Half) + t1 = fd.define_tensor(2, DataType.Double) + + fd.add_input(t0) + fd.add_input(t1) + + t2 = fd.Ops.add(t0, t1) + t5 = fd.Ops.relu(t2) + + fd.add_output(t5) + +fusion.print_ir() + +# Execute Fusion +input1 = torch.ones(2, 4, device='cuda', dtype=torch.float16) +input2 = torch.ones(2, 4, device='cuda', dtype=torch.float64) + +# Kernel compilation should be cached for the 2nd iteration +# with input tensors of the same shape +for _ in range(5) : + outputs = fusion.execute([input1, input2]) + +print(outputs[0]) diff --git 
a/torch/csrc/jit/codegen/cuda/python_frontend/examples/python_example.py b/torch/csrc/jit/codegen/cuda/python_frontend/examples/python_example.py new file mode 100644 index 000000000000..ce6e490ac997 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/python_frontend/examples/python_example.py @@ -0,0 +1,41 @@ +import torch +from torch._C._nvfuser import Fusion, FusionDefinition, DataType + +# Construct and Define Fusion +fusion = Fusion() + +with FusionDefinition(fusion) as fd : + t0 = fd.define_tensor(3) + t1 = fd.define_tensor(1) + s0 = fd.define_scalar() + + fd.add_input(t0) + fd.add_input(t1) + fd.add_input(s0) + + c0 = fd.define_constant(3.0) + + t1_b = fd.Ops.broadcast(t1, [True, True, False]) + t2 = fd.Ops.add(t0, t1) + t3 = fd.Ops.mul(t2, c0) + t4 = fd.Ops.atan2(t3, s0) + t5 = fd.Ops.relu(t4) + t6 = fd.Ops.sum(t5, [-1], False, DataType.Float) + t7 = fd.Ops.isfinite(t6) + + fd.add_output(t6) + fd.add_output(t7) + +fusion.print_ir() + +# Execute Fusion +input1 = torch.ones(2, 4, 8, device='cuda') +input2 = torch.ones(8, device='cuda') + +# Kernel compilation should be cached for the 2nd iteration +# with input tensors of the same shape +for _ in range(5) : + outputs = fusion.execute([input1, input2, 2.0]) + +print(outputs[0]) +print(outputs[1]) diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/examples/python_example_broadcast_in_dim.py b/torch/csrc/jit/codegen/cuda/python_frontend/examples/python_example_broadcast_in_dim.py new file mode 100644 index 000000000000..aa2fb2016de8 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/python_frontend/examples/python_example_broadcast_in_dim.py @@ -0,0 +1,58 @@ +import torch + +from torch._C._nvfuser import Fusion, FusionDefinition + +# Construct and Define Fusion +fusion1 = Fusion() + +with FusionDefinition(fusion1) as fd : + t0 = fd.define_tensor(1) + t1 = fd.define_tensor(3) + + fd.add_input(t0) + fd.add_input(t1) + + t0_b = fd.Ops.broadcast_in_dim(t0, [2, 3, 4], [1]) + t2 = fd.Ops.add(t0_b, t1) + + fd.add_output(t2) + +fusion1.print_ir() + +# Execute Fusion +input1 = torch.ones(3, device='cuda') +input2 = torch.ones(2, 3, 4, device='cuda') + +# Kernel compilation should be cached for the 2nd iteration +# with input tensors of the same shape +for _ in range(5) : + outputs = fusion1.execute([input1, input2]) + +print(outputs[0]) + +fusion2 = Fusion() + +input1 = torch.ones(1, 1, 4, device='cuda') +input2 = torch.ones(2, 3, 4, device='cuda') + +with FusionDefinition(fusion2) as fd : + t0 = fd.define_tensor(sizes=input1.size(), strides=input1.stride()) + t1 = fd.define_tensor(sizes=input2.size(), strides=input2.stride()) + + fd.add_input(t0) + fd.add_input(t1) + + t0_b = fd.Ops.broadcast_in_dim(t0, [2, 3, 4], [0, 1, 2]) + print("Broadcast TensorView", t0_b) + t2 = fd.Ops.add(t0_b, t1) + + fd.add_output(t2) + +fusion2.print_ir() + +# Kernel compilation should be cached for the 2nd iteration +# with input tensors of the same shape +for _ in range(5) : + outputs = fusion2.execute([input1, input2]) + +print(outputs[0]) diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/examples/python_example_fp16.py b/torch/csrc/jit/codegen/cuda/python_frontend/examples/python_example_fp16.py new file mode 100644 index 000000000000..e707a863dc86 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/python_frontend/examples/python_example_fp16.py @@ -0,0 +1,40 @@ +import torch + +from torch._C._nvfuser import Fusion, FusionDefinition, DataType + +# Construct and Define Fusion +fusion = Fusion() + +with FusionDefinition(fusion) as fd : + t0 = fd.define_tensor(3, 
DataType.Half) + t1 = fd.define_tensor(1, DataType.Half) + s0 = fd.define_scalar() + + fd.add_input(t0) + fd.add_input(t1) + fd.add_input(s0) + + c0 = fd.define_constant(3.0) + + t1_b = fd.Ops.broadcast(t1, [True, True, False]) + t2 = fd.Ops.add(t0, t1) + t3 = fd.Ops.mul(t2, c0) + t4 = fd.Ops.mul(t3, s0) + t5 = fd.Ops.relu(t4) + t6 = fd.Ops.sum(t5, [-1], False, DataType.Float) + + t7 = fd.Ops.cast(DataType.Half, t6) + fd.add_output(t7) + +fusion.print_ir() + +# Execute Fusion +input1 = torch.ones(2, 4, 8, device='cuda', dtype=torch.float16) +input2 = torch.ones(8, device='cuda', dtype=torch.float16) + +# Kernel compilation should be cached for the 2nd iteration +# with input tensors of the same shape +for _ in range(5) : + outputs = fusion.execute([input1, input2, 2.0]) + +print(outputs[0]) diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp b/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp new file mode 100644 index 000000000000..c619b557fa12 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp @@ -0,0 +1,641 @@ +#include + +#ifdef USE_CUDA +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace torch::jit::fuser::cuda; + +namespace { + +class PythonFusionOwner { + public: + PythonFusionOwner() : executor_cache_(std::make_unique()) {} + + // Non-copyable + PythonFusionOwner(const PythonFusionOwner&) = delete; + PythonFusionOwner& operator=(const PythonFusionOwner&) = delete; + + std::vector execute(const at::ArrayRef& inputs) { + return executor_cache_.runFusionWithInputs(inputs); + } + Fusion* fusionPtr() { + return executor_cache_.fusion(); + } + + void printIr() { + executor_cache_.printFusion(); + } + void printKernel() { + executor_cache_.fusion()->printKernel(); + } + + private: + FusionExecutorCache executor_cache_; +}; + +// Manually applying the fusion guard via a context manager +class FusionDefinitionContextManager { + public: + FusionDefinitionContextManager(PythonFusionOwner* fusion_owner) + : fusion_owner_(fusion_owner), prev_fusion_(nullptr) {} + + // Context Manager Methods + FusionDefinitionContextManager* enter() { + prev_fusion_ = FusionGuard::getCurFusion(); + FusionGuard::setCurFusion(fusionPtr()); + return this; + } + + void exit() { + FusionGuard::setCurFusion(prev_fusion_); + prev_fusion_ = nullptr; + } + + void addInput(torch::jit::fuser::cuda::Val* input) { + fusionPtr()->addInput(input); + } + void addOutput(torch::jit::fuser::cuda::Val* output) { + fusionPtr()->addOutput(output); + } + + Fusion* fusionPtr() { + return fusion_owner_->fusionPtr(); + } + + // An Empty namespace to add arith ops + struct Ops {}; + + private: + PythonFusionOwner* fusion_owner_; + Fusion* prev_fusion_; +}; + +} // namespace + +namespace torch { +namespace jit { + +void initNvFuserPythonBindings(PyObject* module) { + auto m = py::handle(module).cast(); + + auto nvfuser = m.def_submodule("_nvfuser"); + + // DataTypes supported by NVFuser in Fusion Definition + // Types not related to values found in fusion defintions + // were purposely left out. + // NOTE: DataType was ambiguous under torch::jit without full qualification. 
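To orient the reader before the enum binding that follows: these are the values a Python user passes wherever a dtype is accepted by the frontend. A minimal sketch, assuming a CUDA build with nvfuser enabled; the shapes and the relu/cast choices are arbitrary illustrations rather than part of the PR, while the call signatures mirror the example scripts added above:

    import torch
    from torch._C._nvfuser import Fusion, FusionDefinition, DataType

    fusion = Fusion()
    with FusionDefinition(fusion) as fd:
        # dtype arguments take the bound enum values, e.g. DataType.Half
        t0 = fd.define_tensor(2, DataType.Half)
        fd.add_input(t0)
        # cast ops name the target DataType first, then the value to convert
        t1 = fd.Ops.relu(fd.Ops.cast(DataType.Float, t0))
        fd.add_output(t1)

    outputs = fusion.execute([torch.ones(2, 4, device='cuda', dtype=torch.float16)])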
+ py::enum_(nvfuser, "DataType") + .value("Double", torch::jit::fuser::cuda::DataType::Double) + .value("Float", torch::jit::fuser::cuda::DataType::Float) + .value("Half", torch::jit::fuser::cuda::DataType::Half) + .value("Int", torch::jit::fuser::cuda::DataType::Int) + .value("Int32", torch::jit::fuser::cuda::DataType::Int32) + .value("Bool", torch::jit::fuser::cuda::DataType::Bool) + .value("BFloat16", torch::jit::fuser::cuda::DataType::BFloat16) + .value("ComplexFloat", torch::jit::fuser::cuda::DataType::ComplexFloat) + .value("ComplexDouble", torch::jit::fuser::cuda::DataType::ComplexDouble); + + // Binding an object that owns a FusionExecutorCache instance and provides an + // interface + py::class_ fusion(nvfuser, "Fusion"); + fusion.def(py::init<>()) + .def( + "execute", + [](PythonFusionOwner& self, const py::iterable& iter) { + std::vector inputs; + for (py::handle obj : iter) { + inputs.push_back(toIValue(obj, c10::AnyType::get())); + } + return self.execute(inputs); + }, + py::return_value_policy::reference) + .def("print_ir", [](PythonFusionOwner& self) { self.printIr(); }) + .def("print_kernel", [](PythonFusionOwner& self) { self.printKernel(); }); + + // Bindings to Types required for Tensor/Scalar Creation + py::class_(nvfuser, "TensorView") + .def( + "__str__", + [](TensorView& self) -> std::string { + std::stringstream ss; + TORCH_CHECK( + self.getDataType().has_value(), + "TensorView does not have DataType?"); + ss << self.getDataType().value(); + return self.toString() + " DataType: " + ss.str() + + " Contiguity: " + self.domain()->getContiguityString(); + }, + py::return_value_policy::reference); + py::class_(nvfuser, "Val") + .def( + "__str__", + [](torch::jit::fuser::cuda::Val& self) -> std::string { + return self.toString(); + }, + py::return_value_policy::reference); + + // C++ Side of Context Manager used to mimic the FusionGuard as a way + // to programatically distinguish code used to define the Fusion instead + // of having the user mysteriously create an object prior to adding definition + // code where the object is not used. 
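The pybind definitions below expose enter()/exit() under the protocol names __enter__/__exit__, which is what lets the example scripts write "with FusionDefinition(fusion) as fd:". A rough sketch of the correspondence, with the with-statement desugared by hand (variable names are hypothetical; normal usage is the with-block form):

    from torch._C._nvfuser import Fusion, FusionDefinition

    fusion = Fusion()
    fd = FusionDefinition(fusion)

    fd.__enter__()                  # saves the current FusionGuard fusion, guards this one
    t0 = fd.define_tensor(1)        # IR built here lands in the guarded fusion
    fd.add_input(t0)
    fd.add_output(fd.Ops.relu(t0))
    fd.__exit__(None, None, None)   # restores the previously guarded fusion

Everything between the two calls records into the guarded fusion, so no separate FusionGuard object has to be created on the Python side.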
+ py::class_ fusion_def( + nvfuser, "FusionDefinition"); + fusion_def.def(py::init()) + .def( + "__enter__", + [](FusionDefinitionContextManager& self) { return self.enter(); }) + .def( + "__exit__", + [](FusionDefinitionContextManager& self, + void* exc_type, + void* exc_value, + void* traceback) { self.exit(); }) + .def( + "add_input", + [](FusionDefinitionContextManager& self, + torch::jit::fuser::cuda::Val* input) { self.addInput(input); }) + .def( + "add_input", + [](FusionDefinitionContextManager& self, TensorView* input) { + self.addInput(input); + }) + .def( + "add_output", + [](FusionDefinitionContextManager& self, + torch::jit::fuser::cuda::Val* output) { self.addOutput(output); }) + .def( + "add_output", + [](FusionDefinitionContextManager& self, TensorView* output) { + self.addOutput(output); + }) + .def( + "define_tensor", + [](FusionDefinitionContextManager& self, + size_t ndims, + torch::jit::fuser::cuda::DataType dtype = + torch::jit::fuser::cuda::DataType::Float) -> TensorView* { + return TensorViewBuilder() + .ndims(ndims) + .dtype(dtype) + .contiguity(std::vector(ndims, true)) + .build(); + }, + py::arg("ndims"), + py::arg("dtype") = torch::jit::fuser::cuda::DataType::Float, + py::return_value_policy::reference) + .def( + "define_tensor", + [](FusionDefinitionContextManager& self, + std::vector sizes, + std::vector strides, + torch::jit::fuser::cuda::DataType dtype = + torch::jit::fuser::cuda::DataType::Float) -> TensorView* { + TORCH_CHECK( + sizes.size() == strides.size(), + "The number of sizes does not match the number of strides.", + sizes.size(), + strides.size()); + + std::vector domain_sizes; + for (const auto i : c10::irange(sizes.size())) { + if (sizes[i] == 1) { + domain_sizes.push_back(IrBuilder::create( + self.fusionPtr()->zeroVal(), + self.fusionPtr()->oneVal(), + ParallelType::Serial, + IterType::BroadcastWithStride)); + } else { + domain_sizes.push_back(IrBuilder::create( + self.fusionPtr()->zeroVal(), IrBuilder::create())); + } + } + + std::vector contig_info(strides.size(), false); + for (int i = contig_info.size() - 1; i >= 0; --i) { + if (i == static_cast(contig_info.size() - 1)) { + contig_info[i] = (strides[i] == 1); + } else { + contig_info[i] = + (strides[i] == (strides[i + 1] * sizes[i + 1])); + } + } + + return IrBuilder::create( + IrBuilder::create(domain_sizes, contig_info), + dtype); + }, + py::arg("sizes"), + py::arg("strides"), + py::arg("dtype") = torch::jit::fuser::cuda::DataType::Float, + py::return_value_policy::reference) + .def( + "define_constant", + [](FusionDefinitionContextManager& self, + double val) -> torch::jit::fuser::cuda::Val* { + return IrBuilder::create(val); + }, + py::return_value_policy::reference) + .def( + "define_constant", + [](FusionDefinitionContextManager& self, + bool val) -> torch::jit::fuser::cuda::Val* { + return IrBuilder::create(val); + }, + py::return_value_policy::reference) + .def( + "define_constant", + [](FusionDefinitionContextManager& self, + int64_t val) -> torch::jit::fuser::cuda::Val* { + return IrBuilder::create(val); + }, + py::return_value_policy::reference) + .def( + "define_scalar", + [](FusionDefinitionContextManager& self, + torch::jit::fuser::cuda::DataType dtype = + torch::jit::fuser::cuda::DataType::Double) + -> torch::jit::fuser::cuda::Val* { + if (dtype == torch::jit::fuser::cuda::DataType::Double) { + return IrBuilder::create(); + } else if (dtype == torch::jit::fuser::cuda::DataType::Bool) { + return IrBuilder::create(); + } else if (dtype == torch::jit::fuser::cuda::DataType::Int) 
{ + return IrBuilder::create(); + } else { + TORCH_CHECK(false, "Dtype is not supported:", dtype); + } + }, + py::arg("dtype") = torch::jit::fuser::cuda::DataType::Double, + py::return_value_policy::reference); + + py::class_ nvf_ops(fusion_def, "Ops"); + + // ******************** INSERT OP BINDINGS BELOW HERE ******************** + +#define NVFUSER_PYTHON_BINDING_UNARY_OP(op_str, op_name) \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast( \ + &torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); + + NVFUSER_PYTHON_BINDING_UNARY_OP("abs", abs) + NVFUSER_PYTHON_BINDING_UNARY_OP("acos", acos) + NVFUSER_PYTHON_BINDING_UNARY_OP("asin", asin) + NVFUSER_PYTHON_BINDING_UNARY_OP("atan", atan) + NVFUSER_PYTHON_BINDING_UNARY_OP("atanh", atanh) + NVFUSER_PYTHON_BINDING_UNARY_OP("ceil", ceil) + NVFUSER_PYTHON_BINDING_UNARY_OP("cos", cos) + NVFUSER_PYTHON_BINDING_UNARY_OP("cosh", cosh) + NVFUSER_PYTHON_BINDING_UNARY_OP("exp", exp) + NVFUSER_PYTHON_BINDING_UNARY_OP("expm1", expm1) + NVFUSER_PYTHON_BINDING_UNARY_OP("erf", erf) + NVFUSER_PYTHON_BINDING_UNARY_OP("erfc", erfc) + NVFUSER_PYTHON_BINDING_UNARY_OP("floor", floor) + NVFUSER_PYTHON_BINDING_UNARY_OP("frac", frac) + NVFUSER_PYTHON_BINDING_UNARY_OP("lgamma", lgamma) + NVFUSER_PYTHON_BINDING_UNARY_OP("log", log) + NVFUSER_PYTHON_BINDING_UNARY_OP("log10", log10) + NVFUSER_PYTHON_BINDING_UNARY_OP("log1p", log1p) + NVFUSER_PYTHON_BINDING_UNARY_OP("log2", log2) + NVFUSER_PYTHON_BINDING_UNARY_OP("neg", neg) + NVFUSER_PYTHON_BINDING_UNARY_OP("bitwise_not", bitwise_not) + NVFUSER_PYTHON_BINDING_UNARY_OP("relu", relu) + NVFUSER_PYTHON_BINDING_UNARY_OP("rand_like", randlike) + NVFUSER_PYTHON_BINDING_UNARY_OP("reciprocal", reciprocal) + NVFUSER_PYTHON_BINDING_UNARY_OP("round", round) + NVFUSER_PYTHON_BINDING_UNARY_OP("rsqrt", rsqrt) + NVFUSER_PYTHON_BINDING_UNARY_OP("set", set) + NVFUSER_PYTHON_BINDING_UNARY_OP("sigmoid", sigmoid) + NVFUSER_PYTHON_BINDING_UNARY_OP("silu", silu) + NVFUSER_PYTHON_BINDING_UNARY_OP("sin", sin) + NVFUSER_PYTHON_BINDING_UNARY_OP("sinh", sinh) + NVFUSER_PYTHON_BINDING_UNARY_OP("sqrt", sqrt) + NVFUSER_PYTHON_BINDING_UNARY_OP("tan", tan) + NVFUSER_PYTHON_BINDING_UNARY_OP("tanh", tanh) + NVFUSER_PYTHON_BINDING_UNARY_OP("trunc", trunc) + NVFUSER_PYTHON_BINDING_UNARY_OP("isfinite", isfinite) + NVFUSER_PYTHON_BINDING_UNARY_OP("isinf", isinf) + NVFUSER_PYTHON_BINDING_UNARY_OP("isnan", isnan) + NVFUSER_PYTHON_BINDING_UNARY_OP("isneginf", isneginf) + NVFUSER_PYTHON_BINDING_UNARY_OP("isposinf", isposinf) + NVFUSER_PYTHON_BINDING_UNARY_OP("isreal", isreal) +#undef NVFUSER_PYTHON_BINDING_UNARY_OP + +#define NVFUSER_PYTHON_BINDING_BINARY_OP(op_str, op_name) \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast( \ + &torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast( \ + &torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast( \ + &torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); + + NVFUSER_PYTHON_BINDING_BINARY_OP("add", add) + NVFUSER_PYTHON_BINDING_BINARY_OP("atan2", atan2) + 
NVFUSER_PYTHON_BINDING_BINARY_OP("div", div) + NVFUSER_PYTHON_BINDING_BINARY_OP("fmod", fmod) + NVFUSER_PYTHON_BINDING_BINARY_OP("mul", mul) + NVFUSER_PYTHON_BINDING_BINARY_OP("pow", pow) + NVFUSER_PYTHON_BINDING_BINARY_OP("remainder", remainder) + NVFUSER_PYTHON_BINDING_BINARY_OP("sub", sub) + NVFUSER_PYTHON_BINDING_BINARY_OP("mod", mod) + NVFUSER_PYTHON_BINDING_BINARY_OP("eq", eq) + NVFUSER_PYTHON_BINDING_BINARY_OP("ge", ge) + NVFUSER_PYTHON_BINDING_BINARY_OP("gt", gt) + NVFUSER_PYTHON_BINDING_BINARY_OP("le", le) + NVFUSER_PYTHON_BINDING_BINARY_OP("lt", lt) + NVFUSER_PYTHON_BINDING_BINARY_OP("ne", ne) + NVFUSER_PYTHON_BINDING_BINARY_OP("bitwise_and", bitwise_and) + NVFUSER_PYTHON_BINDING_BINARY_OP("bitwise_or", bitwise_or) + NVFUSER_PYTHON_BINDING_BINARY_OP("bitwise_xor", bitwise_xor) + NVFUSER_PYTHON_BINDING_BINARY_OP("bitwise_left_shift", bitwise_left_shift) + NVFUSER_PYTHON_BINDING_BINARY_OP("bitwise_right_shift", bitwise_left_shift) +#undef NVFUSER_PYTHON_BINDING_BINARY_OP + +#define NVFUSER_PYTHON_BINDING_TERNARY_OP(op_str, op_name) \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast( \ + &torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + TensorView*, \ + TensorView*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + TensorView*, \ + torch::jit::fuser::cuda::Val*, \ + TensorView*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + TensorView*, \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + torch::jit::fuser::cuda::Val*, \ + TensorView*, \ + TensorView*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + torch::jit::fuser::cuda::Val*, \ + TensorView*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*, \ + TensorView*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); + + NVFUSER_PYTHON_BINDING_TERNARY_OP("lerp", lerp) + NVFUSER_PYTHON_BINDING_TERNARY_OP("where", where) +#undef NVFUSER_PYTHON_BINDING_TERNARY_OP + +#define NVFUSER_PYTHON_BINDING_TERNARY_ABRV1_OP(op_str, op_name) \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + TensorView*, \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); + + NVFUSER_PYTHON_BINDING_TERNARY_ABRV1_OP("clamp", clamp) + NVFUSER_PYTHON_BINDING_TERNARY_ABRV1_OP("threshold", threshold) +#undef 
NVFUSER_PYTHON_BINDING_TERNARY_ABRV1_OP + +#define NVFUSER_PYTHON_BINDING_TERNARY_ABRV2_OP(op_str, op_name) \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + TensorView*, \ + TensorView*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + TensorView*, \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + torch::jit::fuser::cuda::Val*, \ + TensorView*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); + + NVFUSER_PYTHON_BINDING_TERNARY_ABRV2_OP("add_alpha", add_alpha) + NVFUSER_PYTHON_BINDING_TERNARY_ABRV2_OP("sub_alpha", sub_alpha) +#undef NVFUSER_PYTHON_BINDING_TERNARY_ABRV2_OP + +#define NVFUSER_PYTHON_BINDING_QUAD_ABRV3_OP(op_str, op_name) \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + TensorView*, \ + TensorView*, \ + TensorView*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + TensorView*, \ + TensorView*, \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + TensorView*, \ + torch::jit::fuser::cuda::Val*, \ + TensorView*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + TensorView*, \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + torch::jit::fuser::cuda::Val*, \ + TensorView*, \ + TensorView*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + torch::jit::fuser::cuda::Val*, \ + TensorView*, \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*, \ + TensorView*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); + + NVFUSER_PYTHON_BINDING_QUAD_ABRV3_OP("addcmul", addcmul) +#undef NVFUSER_PYTHON_BINDING_QUAD_ABRV3_OP + + // Reduction Operations + nvf_ops.def_static( + "max", &torch::jit::fuser::cuda::max, py::return_value_policy::reference); + nvf_ops.def_static( + "min", &torch::jit::fuser::cuda::min, 
py::return_value_policy::reference); + nvf_ops.def_static( + "sum", &torch::jit::fuser::cuda::sum, py::return_value_policy::reference); + + // Broadcast operations + nvf_ops.def_static( + "broadcast", + &torch::jit::fuser::cuda::broadcast, + py::return_value_policy::reference); + // TODO: We don't have a way to realize a tensor if the operation creates + // the output of a fusion. + nvf_ops.def_static( + "broadcast_in_dim", + [](TensorView* input, + std::vector& output_shape, + std::vector& broadcast_dims) -> TensorView* { + TORCH_CHECK( + output_shape.size() >= input->nDims(), + "The new shape is expected to be greater-then-or-equal to the input", + output_shape.size(), + input->nDims()); + TORCH_CHECK( + input->nDims() == broadcast_dims.size(), + "The broadcast dimensions should match the input dimensions.", + input->nDims(), + broadcast_dims.size()); + + std::vector is_broadcast_dim(output_shape.size(), true); + for (const auto idx : c10::irange(broadcast_dims.size())) { + if (idx > 0) { + TORCH_CHECK( + broadcast_dims[idx - 1] < broadcast_dims[idx], + "Broadcast dimension is not greater than the previous value."); + } + TORCH_CHECK( + broadcast_dims[idx] < static_cast(output_shape.size()), + "Invalid broadcast_dims value."); + is_broadcast_dim.at(broadcast_dims[idx]) = false; + } + + return torch::jit::fuser::cuda::broadcast(input, is_broadcast_dim); + }, + py::return_value_policy::reference); + + // Cast Operations + nvf_ops.def_static( + "cast", + py::overload_cast( + &torch::jit::fuser::cuda::castOp), + py::return_value_policy::reference); + nvf_ops.def_static( + "cast", + py::overload_cast< + torch::jit::fuser::cuda::DataType, + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::castOp), + py::return_value_policy::reference); +} + +} // namespace jit +} // namespace torch + +#else + +namespace torch { +namespace jit { + +void initNvFuserPythonBindings(PyObject* module) {} + +} // namespace jit +} // namespace torch + +#endif // USE_CUDA diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.h b/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.h new file mode 100644 index 000000000000..c5785bc31de3 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.h @@ -0,0 +1,10 @@ +#pragma once + +#include +#include + +namespace torch { +namespace jit { +void initNvFuserPythonBindings(PyObject* module); +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/reference_tensor.h b/torch/csrc/jit/codegen/cuda/reference_tensor.h index 2220831dc09f..07c83bb6ed74 100644 --- a/torch/csrc/jit/codegen/cuda/reference_tensor.h +++ b/torch/csrc/jit/codegen/cuda/reference_tensor.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include diff --git a/torch/csrc/jit/codegen/cuda/register_interface.cpp b/torch/csrc/jit/codegen/cuda/register_interface.cpp index a3fba4b62975..c89f8c5a7a6a 100644 --- a/torch/csrc/jit/codegen/cuda/register_interface.cpp +++ b/torch/csrc/jit/codegen/cuda/register_interface.cpp @@ -25,10 +25,21 @@ class RegisterInterface { ptr->fn_can_fuse_n = &isFusibleCudaFusionGroup; ptr->fn_insert_profile_inodes = &InsertProfileNodes; ptr->fn_profile_n = &shouldProfileNode; + ptr->fn_skip_n = &skipNodeKind; } }; static RegisterInterface register_interface_; + +class RegisterNVFuserPass { + public: + RegisterNVFuserPass() { + NVFuserPassManager::registerPass(true); + } +}; + +static RegisterNVFuserPass register_nvfuser_pass_; + } // namespace } // namespace cuda diff --git 
a/torch/csrc/jit/codegen/cuda/root_domain_map.cpp b/torch/csrc/jit/codegen/cuda/root_domain_map.cpp index ddb92371baa2..f7d00799e83e 100644 --- a/torch/csrc/jit/codegen/cuda/root_domain_map.cpp +++ b/torch/csrc/jit/codegen/cuda/root_domain_map.cpp @@ -47,8 +47,9 @@ std::unordered_map RootDomainMap:: PairwiseRootDomainMap::PairwiseRootDomainMap( const TensorView* producer, - const TensorView* consumer) - : producer_tv_(producer), consumer_tv_(consumer) { + const TensorView* consumer, + bool is_exact) + : producer_tv_(producer), consumer_tv_(consumer), is_exact_(is_exact) { TORCH_INTERNAL_ASSERT(producer != nullptr); TORCH_INTERNAL_ASSERT(consumer != nullptr); TORCH_INTERNAL_ASSERT(producer->fusion() == consumer->fusion()); @@ -100,6 +101,14 @@ std::unordered_map PairwiseRootDomainMap::map( continue; } + // In exact mapping, do not map broadcast domains with + // non-broadcast domains + if (is_exact_ && producer_id->isBroadcast() != consumer_id->isBroadcast()) { + itc++; + itp++; + continue; + } + IterDomain* map_key_id = producer_id; IterDomain* map_value_id = consumer_id; if (!producer_to_consumer) { @@ -134,9 +143,17 @@ std::unordered_map PairwiseRootDomainMap:: for (const auto i : c10::irange(consumer_root.size())) { IterDomain* map_key_id = producer_root[new2old[i]]; IterDomain* map_value_id = consumer_root[i]; + + // In exact mapping, do not map broadcast domains with + // non-broadcast domains + if (is_exact_ && map_key_id->isBroadcast() != map_value_id->isBroadcast()) { + continue; + } + if (!producer_to_consumer) { std::swap(map_key_id, map_value_id); } + if (root_dims_to_map.find(map_key_id) != root_dims_to_map.end()) { dom_map.insert(std::make_pair(map_key_id, map_value_id)); } @@ -144,10 +161,14 @@ std::unordered_map PairwiseRootDomainMap:: return dom_map; } -std::string toString(const PairwiseRootDomainMap& root_map) { +std::string PairwiseRootDomainMap::toString() const { std::stringstream ss; - ss << "{producer: " << root_map.producer() - << ", consumer: " << root_map.consumer() << "}"; + ss << "{producer: " << producer() << ", consumer: " << consumer(); + auto p2c = mapProducerToConsumer(producer()->domain(), consumer()->domain()); + for (auto pair : p2c) { + ss << ", " << pair.first->toString() << " -> " << pair.second->toString(); + } + ss << "}"; return ss.str(); } @@ -167,23 +188,23 @@ auto ensureMapping( } // namespace -std::string toString(const DomainKey& key) { +std::string DomainKey::toString() const { std::stringstream ss; ss << "{"; - if (key.td()) { - ss << key.td() << " (root: " << key.td()->getRootDomain() - << ", maybe rfactor: " << key.td()->getMaybeRFactorDomain() << ")"; + if (td()) { + ss << td() << " (root: " << td()->getRootDomain() + << ", maybe rfactor: " << td()->getMaybeRFactorDomain() << ")"; } else { ss << "null"; } ss << ", "; - if (key.id()) { - ss << key.id(); + if (id()) { + ss << id(); } else { ss << "null"; } - if (key.concreteId()) { - ss << " (" << key.concreteId() << ")"; + if (concreteId()) { + ss << " (" << concreteId() << ")"; } ss << "}"; return ss.str(); @@ -196,7 +217,7 @@ UnmappableReductionDomains::UnmappableReductionDomains() { namespace { -//! Find all domains that a given domain is depeendent on +//! 
Find all domains that a given domain is dependent on class FindInputDomains : BackwardVisitor { private: FindInputDomains(TensorView* tv, const IterDomain* id) @@ -285,6 +306,19 @@ void UnmappableReductionDomains::handle(ReductionOp* op) { handleReductionOutput(out_tv); } +void UnmappableReductionDomains::handle(GroupedReductionOp* op) { + // Builds a map from reduction domains to consumer domains. + for (auto out : op->outputs()) { + handleReductionOutput(out->as()); + } +} + +void UnmappableReductionDomains::handle(MmaOp* mma) { + // Builds a map from reduction domains to consumer domains. + TensorView* out_tv = mma->out()->as(); + handleReductionOutput(out_tv); +} + void UnmappableReductionDomains::handle(WelfordOp* op) { // Builds a map from reduction domains to consumer domains. handleReductionOutput(op->outAvg()->as()); @@ -446,7 +480,7 @@ bool ComputeAtRootDomainMap::canMap( bool ComputeAtRootDomainMap::canMap( const DomainKey& key_a, const DomainKey& key_b) const { - return key_a == key_b || eq_set_.areEquivalent(key_a, key_b); + return key_a == key_b || eq_set_.permissiveAreMapped(key_a, key_b); } void ComputeAtRootDomainMap::setAlias( @@ -463,10 +497,11 @@ void ComputeAtRootDomainMap::setAlias( } bcast_map_ = tmp_bcast_map; - for (const auto& key : eq_set_.getAllElements()) { + auto all_elements = eq_set_.getAllElements(); + for (const auto& key : all_elements.vector()) { if (key.td() == td) { DomainKey alias_key(td_alias, key.id(), key.concreteId()); - eq_set_.join(key, alias_key); + eq_set_.mapEntries(key, alias_key); } } @@ -485,7 +520,7 @@ std::vector ComputeAtRootDomainMap::getConcretizedKeys( const IterDomain* id) const { DomainKey key(td, id); auto it = bcast_map_.find(key); - TORCH_INTERNAL_ASSERT(it != bcast_map_.end(), "Not found: ", toString(key)); + TORCH_INTERNAL_ASSERT(it != bcast_map_.end(), "Not found: ", key.toString()); std::vector domains; std::transform( it->second.begin(), @@ -501,7 +536,7 @@ std::unordered_set& ComputeAtRootDomainMap:: getConcretizedDomains(const TensorDomain* td, const IterDomain* id) { DomainKey key(td, id); auto it = bcast_map_.find(key); - TORCH_INTERNAL_ASSERT(it != bcast_map_.end(), "Not found: ", toString(key)); + TORCH_INTERNAL_ASSERT(it != bcast_map_.end(), "Not found: ", key.toString()); return it->second; } @@ -548,13 +583,15 @@ std::unordered_map ComputeAtRootDomainMap::map( if (id_map.find(from_id) != id_map.end()) { continue; } - // Matching ID not found. It's an error unless from_id is a new - // broadcast of a consumer domain; or from_id is a window axis of - // a consumer domain. Note that reduction domains are removed from - // the producer root domain. + // Matching ID not found. It's an error unless the following three cases: + // 1. from_id is a new broadcast of a consumer domain; or + // 2. from_id is a window axis of a consumer domain; or + // 3. from_id is a ViewAsScalar domain + // Note that reduction domains are removed from the producer root domain. if (!producer_to_consumer && (new_broadcast_domains_.find(DomainKey(from_td, from_id)) != new_broadcast_domains_.end() || + from_id->getIterType() == IterType::VectorComponent || (window_axes_.count(from_id) > 0))) { continue; } @@ -570,7 +607,7 @@ std::unordered_map ComputeAtRootDomainMap::map( ". Consumer root: ", consumer_root, ". 
Mapping: ", - toString(*this)); + this->toString()); } return id_map; } @@ -578,27 +615,30 @@ std::unordered_map ComputeAtRootDomainMap::map( std::unordered_set ComputeAtRootDomainMap::getMappableDims( const TensorDomain* producer, const TensorDomain* consumer) const { + //! This funciton previously used mapBestEffort but it can fail when + //! a domain is mapped to multitple domains, which can happen with + //! views. Since we only need to find mappable domains, just + //! grab any domain that is mapped in a pairwise way. + const auto& producer_root = producer->getMaybeRFactorDomain(); const auto& consumer_root = consumer->getRootDomain(); - std::unordered_map id_map = - mapBestEffort(producer, producer_root, consumer, consumer_root); - std::unordered_set mappable_ids; - for (auto& from_id : producer_root) { - if (id_map.find(from_id) != id_map.end()) { - mappable_ids.emplace(from_id); - mappable_ids.emplace(id_map.at(from_id)); + for (const auto& p_id : producer_root) { + for (const auto& c_id : consumer_root) { + if (canMap(producer, p_id, consumer, c_id)) { + mappable_ids.emplace(p_id); + mappable_ids.emplace(c_id); + } } } + return mappable_ids; } -std::string toString(const ComputeAtRootDomainMap& root_map) { - std::stringstream ss; - root_map.eq_set_.print(ss); - return ss.str(); +std::string ComputeAtRootDomainMap::toString() const { + return eq_set_.toString(); } ComputeAtRootDomainMapBuilder::ComputeAtRootDomainMapBuilder( @@ -614,9 +654,9 @@ ComputeAtRootDomainMapBuilder::ComputeAtRootDomainMapBuilder( std::stringstream ss; ss << "pending map:\n"; for (auto& kv : pending_map_) { - ss << "\t" << toString(kv.first) << "\n"; + ss << "\t" << kv.first.toString() << "\n"; for (auto& dk : kv.second) { - ss << "\t\t" << toString(dk) << "\n"; + ss << "\t\t" << dk.toString() << "\n"; } } std::cerr << ss.str(); @@ -638,10 +678,14 @@ void ComputeAtRootDomainMapBuilder::initializeBcastMap( return; } - // This initialization should be only used for fusion output tensors and - // outputs of multi-consumer expressions that are not fusion outputs. + // This initialization should be only used for: 1) fusion output + // tensors, 2) outputs of multi-consumer expressions that are not + // fusion outputs, and 3) view outputs as broadcasts can be merged + // with non-broadcast domains, resulting in non-broadcast rfactor + // domains. 
TORCH_INTERNAL_ASSERT( - tv->isFusionOutput() || tv->definition()->outputs().size() > 1, + tv->isFusionOutput() || tv->definition()->outputs().size() > 1 || + tv->isDefinitionType(ExprType::ViewOp), "Invalid tensor to initialize bcast map: t", tv->name()); root_map_.bcast_map_.insert({key, {id}}); @@ -658,7 +702,59 @@ void ComputeAtRootDomainMapBuilder::addToPendingList( void ComputeAtRootDomainMapBuilder::setMapped( const DomainKey& producer, const DomainKey& consumer) { - root_map_.eq_set_.join(producer, consumer); + root_map_.eq_set_.mapEntries(producer, consumer); +} + +void ComputeAtRootDomainMapBuilder::setInvalid( + const DomainKey& key1, + const DomainKey& key2) { + invalid_mappings_.emplace_back(key1, key2); +} + +bool ComputeAtRootDomainMapBuilder::isInvalid( + const std::vector& domains) const { + // First, collect all invalid mappings for each of the keys in domains + DomainKeyMap invalid_key_map; + for (const auto& key : domains) { + DomainKeySet invalid_keys; + for (const auto& invalid_pair : invalid_mappings_) { + if (root_map_.canMap(key, invalid_pair.first)) { + invalid_keys.insert(invalid_pair.second); + } else if (root_map_.canMap(key, invalid_pair.second)) { + invalid_keys.insert(invalid_pair.first); + } + } + invalid_key_map.emplace(key, invalid_keys); + } + + // Next, check if any pair is invalid to map. + const auto num_keys = domains.size(); + for (const auto i : c10::irange(num_keys)) { + const auto& key_i = domains[i]; + // If no invalid keys found for key_i, it can be skipped. + const auto invalid_key_map_it = invalid_key_map.find(key_i); + if (invalid_key_map_it == invalid_key_map.end()) { + continue; + } + + // Set of keys that are invalid to be mapped with key_i. + const DomainKeySet& invalid_keys_for_i = invalid_key_map_it->second; + + // If any other key in domains is identified mappable with any of + // the keys in this set, the mapping with key_i is invalid. 
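As a reading aid, the check described in the comment above (and implemented in the loop that follows) can be restated compactly outside the nvfuser classes. A toy Python model, assuming a caller-supplied can_map predicate in place of ComputeAtRootDomainMap::canMap and plain strings in place of DomainKey; this illustrates the control flow only and is not code from the PR:

    def is_invalid(domains, invalid_pairs, can_map):
        # For each key, collect everything it is forbidden to be mapped with,
        # going through the recorded invalid pairs.
        invalid_key_map = {}
        for key in domains:
            forbidden = set()
            for a, b in invalid_pairs:
                if can_map(key, a):
                    forbidden.add(b)
                elif can_map(key, b):
                    forbidden.add(a)
            invalid_key_map[key] = forbidden
        # Mapping the whole set is invalid if any later key reaches one of the
        # keys forbidden for an earlier key.
        for i, key_i in enumerate(domains):
            forbidden_i = invalid_key_map[key_i]
            for key_j in domains[i + 1:]:
                if any(can_map(key_j, x) for x in forbidden_i):
                    return True
        return False

    # e.g. is_invalid(["a", "b"], [("a", "b")], lambda x, y: x == y) returns True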
+ for (const auto j : c10::irange(i + 1, num_keys)) { + const auto& key_j = domains[j]; + if (std::any_of( + invalid_keys_for_i.begin(), + invalid_keys_for_i.end(), + [&](const auto& invalid_key_for_i) { + return root_map_.canMap(key_j, invalid_key_for_i); + })) { + return true; + } + } + } + return false; } void ComputeAtRootDomainMapBuilder::setMaybeMapped( @@ -693,7 +789,7 @@ void ComputeAtRootDomainMapBuilder::setMaybeMapped( TORCH_INTERNAL_ASSERT( !consumer_id->isBroadcast(), "No concrete domain found for a broadcast domain: ", - toString(consumer_key)); + consumer_key.toString()); auto producer_concrete_key = producer_key; if (producer_id->isBroadcast()) { const auto concrete_id = consumer_id; @@ -730,10 +826,10 @@ void ComputeAtRootDomainMapBuilder::mapPointwiseOrReductionOp(Expr* e) { // Record equalities from output to all the inputs // ignores un-concretizable broadcasts - for (auto* i : ir_utils::filterByType(e->inputs())) { - const TensorDomain* in_td = i->domain(); + for (auto* in_tv : ir_utils::filterByType(e->inputs())) { + const TensorDomain* in_td = in_tv->domain(); std::vector in_root = - TensorDomain::noReductions(i->getMaybeRFactorDomain()); + TensorDomain::noReductions(in_tv->getMaybeRFactorDomain()); TORCH_INTERNAL_ASSERT( in_root.size() == out_root.size(), "\nExpression: ", @@ -745,7 +841,10 @@ void ComputeAtRootDomainMapBuilder::mapPointwiseOrReductionOp(Expr* e) { for (const auto it : c10::irange(in_root.size())) { if (e->outputs().size() > 1) { TORCH_INTERNAL_ASSERT( - e->isA(), "Only supported multioutput op is welford"); + e->isA() || e->isA(), + "Multi-output mapping assumes WelfordOp or GroupedReductionOp, but ", + e->getExprType().value(), + " was found"); for (auto o : e->outputs()) { auto o_tv = o->as(); auto o_td = o_tv->domain(); @@ -807,6 +906,36 @@ void ComputeAtRootDomainMapBuilder::handle(BroadcastOp* op) { } } +void ComputeAtRootDomainMapBuilder::handle(ViewAsScalar* op) { + const TensorView* out_tv = op->output(0)->as(); + const TensorDomain* out_td = out_tv->domain(); + const auto& out_root = out_td->getRootDomain(); + + const TensorView* in_tv = op->input(0)->as(); + const TensorDomain* in_td = in_tv->domain(); + + std::vector in_root = + TensorDomain::noReductions(in_tv->getMaybeRFactorDomain()); + TORCH_INTERNAL_ASSERT( + in_root.size() + 1 == out_root.size(), + "\nExpression: ", + op, + "\nInput root domain: ", + in_root, + "\nOutput root domain: ", + out_root); + auto in_it = in_root.begin(); + auto out_it = out_root.begin(); + while (in_it != in_root.end() && out_it != out_root.end()) { + setMaybeMapped(in_td, *in_it, out_td, *out_it); + ++in_it; + ++out_it; + } + TORCH_INTERNAL_ASSERT( + (*out_it)->isVectorComponent(), + "The last dim of ViewDtypeOp's output must be a ViewAsScalar"); +} + void ComputeAtRootDomainMapBuilder::handle(TransposeOp* op) { const TensorDomain* in_td = op->in()->as()->domain(); std::vector in_root = @@ -843,37 +972,77 @@ void ComputeAtRootDomainMapBuilder::handle(GatherOp* op) { } } -bool ComputeAtRootDomainMapBuilder::mapAllConsumers( - const DomainKey& producer_key) { - auto it = pending_map_.find(producer_key); +void ComputeAtRootDomainMapBuilder::mapAllPendingMappings( + const DomainKey& key) { + auto it = pending_map_.find(key); if (it == pending_map_.end()) { - return false; + return; } - const auto& consumer_set = it->second; + const auto& pending_set = it->second; // All entries in key_set must be equivalent with each other. 
- TORCH_INTERNAL_ASSERT(consumer_set.size() > 0); - bool consistent = safeToMap(consumer_set); - if (consistent) { - for (const auto pending_consumer : consumer_set) { - setMapped(producer_key, pending_consumer); + TORCH_INTERNAL_ASSERT(pending_set.size() > 0); + bool consistent = safeToMap(pending_set); + for (const auto pending_key : pending_set) { + if (consistent) { + setMapped(key, pending_key); + } else { + setInvalid(key, pending_key); } } // This entry should never be used again, so remove it. pending_map_.erase(it); - return consistent; +} + +void ComputeAtRootDomainMapBuilder::mapAllPendingMappings( + const TensorDomain* td, + IterDomain* id) { + if (id->isBroadcast()) { + for (const auto& key : root_map_.getConcretizedKeys(td, id)) { + mapAllPendingMappings(key); + } + } else { + mapAllPendingMappings(DomainKey(td, id)); + } } void ComputeAtRootDomainMapBuilder::handle(TensorView* tv) { const TensorDomain* td = tv->domain(); - const auto root = TensorDomain::noReductions(td->getMaybeRFactorDomain()); - for (auto id : root) { + const auto rfactor = TensorDomain::noReductions(td->getMaybeRFactorDomain()); + for (auto id : rfactor) { if (id->isBroadcast()) { initializeBcastMap(tv, id); - for (const auto& key : root_map_.getConcretizedKeys(td, id)) { - mapAllConsumers(key); + } + mapAllPendingMappings(td, id); + } + + // When tv has a rfactor domain, propagate the domain mappings from + // each of the rfactor axes to the dependent root axes. + if (td->hasViewLikeRFactor()) { + std::unordered_set root_set( + {td->getRootDomain().begin(), td->getRootDomain().end()}); + for (auto rf_id : rfactor) { + if (!rf_id->isRFactorProduct()) { + continue; + } + auto dep = DependencyCheck::getAllValsBetween(root_set, {rf_id}); + for (auto id : ir_utils::filterByType(dep)) { + if (root_set.find(id) == root_set.end() || rf_id == id) { + continue; + } + setMaybeMapped(td, id, td, rf_id); } - } else { - mapAllConsumers(DomainKey(td, id)); + } + // Once mappings for rfactor axes are propagated to root axes, + // aggregates them at each root axis + for (auto id : tv->getRootDomain()) { + if (id->isBroadcast()) { + // There can be broadcast domains that appear at root domains but + // are removed at rfactor domains as they are merged into + // non-reduction domains. Initialize the map for those broadcast + // domains. 
+ initializeBcastMap(tv, id); + } + mapAllPendingMappings(td, id); } } } @@ -931,9 +1100,90 @@ bool ComputeAtRootDomainMapBuilder::safeToMap(const DomainKeySet& domains) { !map_through_reduction_) { return false; } + // Make sure mapping these domains won't cause any invalid mapping + if (isInvalid(unique_domains)) { + return false; + } return true; } +namespace { +class ExactRootDomainMapBuilder : private IterVisitor { + public: + ExactRootDomainMapBuilder( + Fusion* fusion, + DisjointSets& eq_sets) + : eq_sets_(eq_sets) { + traverseFrom(fusion, fusion->outputs()); + } + + private: + using IterVisitor::handle; + + void handle(Expr* expr) final { + for (auto producer : ir_utils::filterByType(expr->inputs())) { + for (auto consumer : + ir_utils::filterByType(expr->outputs())) { + PairwiseRootDomainMap pwise_map(producer, consumer, true); + const auto mappings = pwise_map.mapProducerToConsumer( + producer->domain(), consumer->domain()); + for (const auto& mapping : mappings) { + eq_sets_.mapEntries(mapping.first, mapping.second); + } + } + } + } + + private: + DisjointSets& eq_sets_; +}; + +} // namespace + +ExactRootDomainMap::ExactRootDomainMap(Fusion* fusion) { + ExactRootDomainMapBuilder builder(fusion, eq_sets_); +} + +bool ExactRootDomainMap::areMapped( + const IterDomain* id_a, + const IterDomain* id_b) const { + return eq_sets_.strictAreMapped(id_a, id_b); +} + +std::unordered_map ExactRootDomainMap::map( + const TensorDomain* producer, + const TensorDomain* consumer, + const std::unordered_set& root_dims_to_map, + bool producer_to_consumer) const { + const auto& producer_root = + TensorDomain::noReductions(producer->getMaybeRFactorDomain()); + const auto& consumer_root = consumer->getRootDomain(); + const auto& from_ids = producer_to_consumer ? producer_root : consumer_root; + const auto& to_ids = producer_to_consumer ? consumer_root : producer_root; + + std::unordered_map id_map; + + for (auto& from_id : from_ids) { + if (root_dims_to_map.find(from_id) == root_dims_to_map.end()) { + continue; + } + for (const auto& to_id : to_ids) { + if (areMapped(from_id, to_id)) { + TORCH_INTERNAL_ASSERT( + id_map.insert({from_id, to_id}).second, + "Multiple matching ID detected for ", + from_id); + } + } + } + + return id_map; +} + +std::string ExactRootDomainMap::toString() const { + return eq_sets_.toString(); +} + } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/root_domain_map.h b/torch/csrc/jit/codegen/cuda/root_domain_map.h index 23ada0fb1201..2054e3272686 100644 --- a/torch/csrc/jit/codegen/cuda/root_domain_map.h +++ b/torch/csrc/jit/codegen/cuda/root_domain_map.h @@ -5,7 +5,7 @@ #include #include -#include +#include namespace torch { namespace jit { @@ -82,7 +82,8 @@ class TORCH_CUDA_CU_API PairwiseRootDomainMap : public RootDomainMap { //! \param consumer The consumer tensor of a producer-consumer pair. explicit PairwiseRootDomainMap( const TensorView* producer, - const TensorView* consumer); + const TensorView* consumer, + bool is_exact = false); const TensorView* producer() const { return producer_tv_; @@ -92,6 +93,8 @@ class TORCH_CUDA_CU_API PairwiseRootDomainMap : public RootDomainMap { return consumer_tv_; } + std::string toString() const; + protected: std::unordered_map map( const TensorDomain* producer, @@ -108,10 +111,10 @@ class TORCH_CUDA_CU_API PairwiseRootDomainMap : public RootDomainMap { private: const TensorView* producer_tv_ = nullptr; const TensorView* consumer_tv_ = nullptr; + //! 
If true, does not map broadcast IDs with non-broadcast IDs + const bool is_exact_ = false; }; -std::string toString(const PairwiseRootDomainMap& root_map); - //! Represents an iteration domain of a TensorDomain. Only used for //! root domain mapping. //! @@ -143,14 +146,14 @@ class DomainKey { concreteId() == other.concreteId(); } + std::string toString() const; + private: const TensorDomain* td_ = nullptr; const IterDomain* id_ = nullptr; const IterDomain* concrete_id_ = nullptr; }; -std::string toString(const DomainKey& key); - struct DomainKeyHash { std::size_t operator()(const DomainKey& key) const { return std::hash{}(key.td()) ^ @@ -186,7 +189,9 @@ class TORCH_CUDA_CU_API UnmappableReductionDomains : private IterVisitor { private: using IterVisitor::handle; void handle(ReductionOp* op) override; + void handle(GroupedReductionOp* op) override; void handle(WelfordOp* op) override; + void handle(MmaOp* op) override; void handleReductionOutput(TensorView* out_tv); @@ -204,9 +209,14 @@ class TORCH_CUDA_CU_API UnmappableReductionDomains : private IterVisitor { //! example: //! T2 [i0,i1] = T1[i2,i3] + T0[i4,i5] //! This will create mappings between i0, i2 and i4. +//! +//! Note that with views, there can be multiple domains mapped to +//! the same domain. Thus, obtaining one-to-one maps can +//! fail. Currently, the only use of this class is getMappableDims, +//! which just grabs any domain that is mappable, which works +//! regardless of whether view is used. class TORCH_CUDA_CU_API ComputeAtRootDomainMap : public RootDomainMap { friend class ComputeAtRootDomainMapBuilder; - friend std::string toString(const ComputeAtRootDomainMap&); public: //! Builds a mapping table by analyzing the current @@ -252,7 +262,11 @@ class TORCH_CUDA_CU_API ComputeAtRootDomainMap : public RootDomainMap { //! be a producer-consumer pair. Since they may not be a //! producer-consumer pair, this function requires proper root //! domains, which may be root or rfactor domains. Also, no error - //! check is done as we do not assume producer-consumer relationship. + //! check is done as we do not assume producer-consumer + //! relationship. + //! + //! Note that an exception is thrown when a domain is found to be + //! mapped to multiple domains, which can happen with views. //! //! \param from_td A TensorDomain from which a map is created //! \param from_root A root domain of from_td @@ -283,8 +297,8 @@ class TORCH_CUDA_CU_API ComputeAtRootDomainMap : public RootDomainMap { const TensorDomain* td_b, const IterDomain* id_b) const; - //! Returns if key_a and key_b are mapped to eachother (equivalent), or are - //! the same key. + //! Returns if key_a and key_b are mapped to each other (equivalent), or are + //! the same key. Returns false if two keys are not known to be mapped. bool canMap(const DomainKey& key_a, const DomainKey& key_b) const; //! Returns the set of (non-broadcast) DomainKeys that id in td is @@ -312,9 +326,11 @@ class TORCH_CUDA_CU_API ComputeAtRootDomainMap : public RootDomainMap { const std::unordered_set& root_dims_to_map, bool producer_to_consumer) const override; + std::string toString() const; + private: //! Disjoint set of all mapped keys to determine axes equivalency - DisjointSet eq_set_; + DisjointSets eq_set_; //! All IterDomains in the mapping that are a broadcast ID DomainKeyMap> bcast_map_; @@ -327,12 +343,10 @@ class TORCH_CUDA_CU_API ComputeAtRootDomainMap : public RootDomainMap { std::unordered_set window_axes_; }; -std::string toString(const ComputeAtRootDomainMap& root_map); - -//! 
Create a DisjointSet of root IterDomains by traversing the +//! Create a DisjointSets of root IterDomains by traversing the //! current fusion entirely. IterDomains that can be mapped each //! other with computeAt are grouped into the same subset in the -//! DisjointSet. +//! DisjointSets. class TORCH_CUDA_CU_API ComputeAtRootDomainMapBuilder : private BackwardVisitor { public: @@ -347,6 +361,12 @@ class TORCH_CUDA_CU_API ComputeAtRootDomainMapBuilder //! Set a pair of producer-consumer domain keys as mappable void setMapped(const DomainKey& producer, const DomainKey& consumer); + //! Records two domains are invalid to map + void setInvalid(const DomainKey& key1, const DomainKey& key2); + + //! Check if no pair of domains is invalid to map + bool isInvalid(const std::vector& domains) const; + //! Track a pair of producer-consumer domains as potentially mappable. Inserts //! entries into pending_map_, but does not add anything into the root_map_ //! (added when handle is called on a TensorView). Maybe mapped will, however, @@ -383,10 +403,18 @@ class TORCH_CUDA_CU_API ComputeAtRootDomainMapBuilder mapPointwiseOrReductionOp(op); } + void handle(GroupedReductionOp* op) override { + mapPointwiseOrReductionOp(op); + } + void handle(WelfordOp* wop) override { mapPointwiseOrReductionOp(wop); } + void handle(MmaOp* wop) override { + mapPointwiseOrReductionOp(wop); + } + void handle(ShiftOp* op) override { mapPointwiseOrReductionOp(op); } @@ -395,6 +423,8 @@ class TORCH_CUDA_CU_API ComputeAtRootDomainMapBuilder mapPointwiseOrReductionOp(op); } + void handle(ViewAsScalar* op) override; + void handle(BroadcastOp* op) override; void handle(TransposeOp* op) override; @@ -403,11 +433,15 @@ class TORCH_CUDA_CU_API ComputeAtRootDomainMapBuilder void handle(TensorView* tv) override; - //! Maps all consumers with a producer. + //! Maps all pending mappings. //! This is called for each of TensorViews in a backward traversal, //! recursively building mappings from the output tensors to the //! input tensors. - bool mapAllConsumers(const DomainKey& producer_key); + void mapAllPendingMappings(const DomainKey& key); + + //! Maps all pending mappings for id of td. When id is a broadcast, + //! mapping is done separately for each concrete domain. + void mapAllPendingMappings(const TensorDomain* td, IterDomain* id); bool hasMatchingDomains(const std::vector& unique_domains); @@ -415,16 +449,40 @@ class TORCH_CUDA_CU_API ComputeAtRootDomainMapBuilder private: ComputeAtRootDomainMap& root_map_; - //! Keep track of what we want to try and map. Set in attemptToProveId. + //! Keep track of what we want to try and map DomainKeyMap pending_map_; std::unordered_set visited_; + //! Helper class to find invalid mappings due to reductions UnmappableReductionDomains incompatible_domains_; + //! Running vector of domain pairs that are invalid to map + std::vector> invalid_mappings_; //! Disable UnmappableReductions check, should //! always be false for compute_at use cases bool map_through_reduction_ = false; }; +//! Maps root domains of an entire fusion. Does not map broadcast +//! domains with non-broadcast domains. 
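As a rough, self-contained sketch of the idea behind this kind of mapping (not the actual nvfuser DisjointSets implementation), a minimal union-find can back the two operations the patch relies on: mapEntries() unions the sets containing two entries, and strictAreMapped() reports whether two entries ended up in the same set. The class name SimpleDisjointSets and its internals are illustrative only; T stands in for a pointer type such as const IterDomain*.

#include <unordered_map>

template <typename T>
class SimpleDisjointSets {
 public:
  // Union the sets containing a and b, creating singleton sets as needed.
  void mapEntries(T a, T b) {
    link(find(a), find(b));
  }

  // True only when both entries are known and share a representative.
  bool strictAreMapped(T a, T b) {
    if (parent_.count(a) == 0 || parent_.count(b) == 0) {
      return false;
    }
    return find(a) == find(b);
  }

 private:
  // Find the representative of x, inserting x as its own set if unseen.
  T find(T x) {
    auto it = parent_.find(x);
    if (it == parent_.end()) {
      parent_[x] = x;
      return x;
    }
    // Follow the parent chain up to the self-parented representative.
    while (it->second != it->first) {
      it = parent_.find(it->second);
    }
    return it->first;
  }

  // Merge two representatives by pointing one at the other.
  void link(T ra, T rb) {
    if (ra != rb) {
      parent_[ra] = rb;
    }
  }

  std::unordered_map<T, T> parent_;
};

For instance, in the T2 [i0,i1] = T1[i2,i3] + T0[i4,i5] example above, after mapEntries(i2, i0) and mapEntries(i0, i4), any pair of i0, i2 and i4 reports strictAreMapped() == true, which is the kind of equivalence query the map() overrides resolve per pair of root domains.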
+class TORCH_CUDA_CU_API ExactRootDomainMap : public RootDomainMap { + public: + ExactRootDomainMap(Fusion* fusion); + + bool areMapped(const IterDomain* id_a, const IterDomain* id_b) const; + + std::string toString() const; + + protected: + std::unordered_map map( + const TensorDomain* producer, + const TensorDomain* consumer, + const std::unordered_set& root_dims_to_map, + bool producer_to_consumer) const override; + + private: + DisjointSets eq_sets_; +}; + } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/runtime/array.cu b/torch/csrc/jit/codegen/cuda/runtime/array.cu new file mode 100644 index 000000000000..2f06ddd92e18 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/runtime/array.cu @@ -0,0 +1,231 @@ +// aligned register array for vectorized load/store +template +struct alignas(sizeof(scalar_t) * align_size) Array { + scalar_t array[size]; + + __device__ void set(scalar_t v) { +#pragma unroll + for (int i = 0; i < size; ++i) { + array[i] = v; + } + } + + __device__ scalar_t& operator[](const unsigned int i) { + return array[i]; + } +}; + +// Used for vectorized allocations that are not in registers +template +__device__ void arraySet(scalar_t* buff, scalar_t val) { +#pragma unroll + for (int i = 0; i < vec_size; ++i) { + buff[i] = val; + } +} + +template +__device__ void loadGeneric(scalar_t* to, scalar_t* from) { + // It would be really nice to use memcpy here, but one example was failing + // with: + // + // memcpy(to, from, vec_size * sizeof(scalar_t)); + // + // Yet passing with: + // + // for(int i = 0; i < vec_size; i++){ + // to[i] = from[i]; + // } + + switch (sizeof(scalar_t) * vec_size) { + case 1: + *reinterpret_cast(to) = *reinterpret_cast(from); + break; + case 2: + *reinterpret_cast(to) = *reinterpret_cast(from); + break; + case 4: + *reinterpret_cast(to) = *reinterpret_cast(from); + break; + case 8: + *reinterpret_cast(to) = *reinterpret_cast(from); + break; + case 12: + *reinterpret_cast(to) = *reinterpret_cast(from); + break; + case 16: + *reinterpret_cast(to) = *reinterpret_cast(from); + break; + } +} + +// Volatile version only works with c++ fundamental types +template < + typename scalar_t, + int vec_size, + bool is_volatile_to, + bool is_volatile_from> +__device__ void loadGenericVolatile( + typename MaybeVolatile::type* to, + typename MaybeVolatile::type* from) { + switch (sizeof(scalar_t) * vec_size) { + // Reinterpret cast like this with volatile types only works for C++ + // fundamental types otherwise the = operator is not defined + case 1: + *reinterpret_cast< + typename MaybeVolatile::type*>(to) = + *reinterpret_cast< + typename MaybeVolatile::type*>( + from); + break; + case 2: + *reinterpret_cast::type*>( + to) = + *reinterpret_cast< + typename MaybeVolatile::type*>(from); + break; + case 4: + *reinterpret_cast< + typename MaybeVolatile::type*>(to) = + *reinterpret_cast< + typename MaybeVolatile::type*>( + from); + break; + case 8: + *reinterpret_cast::type*>( + to) = + *reinterpret_cast< + typename MaybeVolatile::type*>(from); + break; + } +} + +template +__device__ void loadLocalToGlobal( + typename MaybeVolatile::type* to, + scalar_t* from) { + switch (sizeof(scalar_t) * vec_size) { + case 1: + case 2: + case 4: + loadGenericVolatile(to, from); + break; + case 8: { + uint2 const& data = *reinterpret_cast(from); + if (is_volatile) { + asm volatile( + "st.volatile.global.v2.s32 [%0], {%1,%2};" ::"l"( + (typename MaybeVolatile::type*)to), + "r"(data.x), + "r"(data.y)); + } else { + asm volatile( 
"st.global.cs.v2.s32 [%0], {%1,%2};" ::"l"( + (typename MaybeVolatile::type*)to), + "r"(data.x), + "r"(data.y)); + } + break; + } + case 16: { + uint4 const& data = *reinterpret_cast(from); + if (is_volatile) { + asm volatile( + "st.volatile.global.v4.s32 [%0], {%1,%2,%3,%4};" ::"l"( + (typename MaybeVolatile::type*)to), + "r"(data.x), + "r"(data.y), + "r"(data.z), + "r"(data.w)); + } else { + asm volatile( + "st.global.cs.v4.s32 [%0], {%1,%2,%3,%4};" ::"l"( + (typename MaybeVolatile::type*)to), + "r"(data.x), + "r"(data.y), + "r"(data.z), + "r"(data.w)); + } + break; + } + } +} + +template +__device__ void loadGlobalToLocal( + scalar_t* to, + typename MaybeVolatile::type* from) { + switch (sizeof(scalar_t) * vec_size) { + case 1: + case 2: + case 4: + loadGenericVolatile(to, from); + break; + case 8: { + if (is_volatile) { + uint2& data = *reinterpret_cast(to); + asm volatile("ld.volatile.global.v2.s32 {%0,%1}, [%2];" + : "=r"(data.x), "=r"(data.y) + : "l"((uint2*)from)); + break; + } else { + uint2& data = *reinterpret_cast(to); + asm volatile("ld.global.cs.v2.s32 {%0,%1}, [%2];" + : "=r"(data.x), "=r"(data.y) + : "l"((uint2*)from)); + } + break; + } + case 16: { + if (is_volatile) { + uint4& data = *reinterpret_cast(to); + asm volatile("ld.volatile.global.v4.s32 {%0,%1,%2,%3}, [%4];" + : "=r"(data.x), "=r"(data.y), "=r"(data.z), "=r"(data.w) + : "l"((uint4*)from)); + } else { + uint4& data = *reinterpret_cast(to); + asm volatile("ld.global.cs.v4.s32 {%0,%1,%2,%3}, [%4];" + : "=r"(data.x), "=r"(data.y), "=r"(data.z), "=r"(data.w) + : "l"((uint4*)from)); + } + break; + } + } +} + +template < + typename scalar_t, + int vec_size, + bool is_volatile_to, + bool is_volatile_from> +__device__ void loadGlobalToGlobal( + typename MaybeVolatile::type* to, + typename MaybeVolatile::type* from) { + switch (sizeof(scalar_t) * vec_size) { + // Reinterpret cast like this with volatile types only works for C++ + // fundamental types otherwise the = operator is not defined + case 1: + case 2: + case 4: + case 8: + loadGenericVolatile( + to, from); + break; + case 12: { + uint3 local_intermediate; + loadGlobalToLocal( + reinterpret_cast(&local_intermediate), from); + loadLocalToGlobal( + to, reinterpret_cast(&local_intermediate)); + break; + } + case 16: { + uint4 local_intermediate; + loadGlobalToLocal( + reinterpret_cast(&local_intermediate), from); + loadLocalToGlobal( + to, reinterpret_cast(&local_intermediate)); + break; + } + } +} diff --git a/torch/csrc/jit/codegen/cuda/runtime/block_sync_atomic.cu b/torch/csrc/jit/codegen/cuda/runtime/block_sync_atomic.cu index ed366132689d..fcbc98e7818c 100644 --- a/torch/csrc/jit/codegen/cuda/runtime/block_sync_atomic.cu +++ b/torch/csrc/jit/codegen/cuda/runtime/block_sync_atomic.cu @@ -41,10 +41,8 @@ __device__ void sync() { // threads have incremented the counter. 
while (local_sync_counter < next && old < local_sync_counter) { #if __CUDA_ARCH__ >= 700 - __nanosleep(backoff); -#else - // __nanosleep is not available for sm < 70 - assert(false); + // __nanosleep only available on compute capability 7.0 or higher + __nanosleep(backoff); // avoids busy waiting #endif if (backoff < backoff_max) { backoff *= 2; diff --git a/torch/csrc/jit/codegen/cuda/runtime/fp16_support.cu b/torch/csrc/jit/codegen/cuda/runtime/fp16_support.cu index 4bd402e84c60..46564c981f18 100644 --- a/torch/csrc/jit/codegen/cuda/runtime/fp16_support.cu +++ b/torch/csrc/jit/codegen/cuda/runtime/fp16_support.cu @@ -31,13 +31,24 @@ __device__ float __half2float(const __half h) { return val; } -// aligned vector generates vectorized load/store on CUDA -template -struct alignas(sizeof(scalar_t) * vec_size) Array { - scalar_t val[vec_size]; - __device__ void set(scalar_t v) { - for (int i = 0; i < vec_size; ++i) { - val[i] = v; - } - } -}; +__device__ __half __double2half(const double d) { +#if __CUDA_ARCH__ >= 700 + __half val; + asm("{ cvt.rn.f16.f64 %0, %1;}\n" + : "=h"(__NVFUSER_HALF_TO_US(val)) + : "d"(d)); + return val; +#else + return __float2half(static_cast(d)); +#endif +} + +__device__ double __half2double(const __half h) { +#if __CUDA_ARCH__ >= 700 + double val; + asm("{ cvt.f64.f16 %0, %1;}\n" : "=d"(val) : "h"(__NVFUSER_HALF_TO_CUS(h))); + return val; +#else + return static_cast(__half2float(h)); +#endif +} diff --git a/torch/csrc/jit/codegen/cuda/runtime/fused_reduction.cu b/torch/csrc/jit/codegen/cuda/runtime/fused_reduction.cu new file mode 100644 index 000000000000..6fd6f398eb06 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/runtime/fused_reduction.cu @@ -0,0 +1,1081 @@ +namespace fused_reduction { + +// We have 6 dimensions, 3 in the grid, 3 in the block +// They can be 1 of 3 states, +// Reduction Domain - TEMPLATE STATE 0 +// - Participating in the reduction, has values coming in, one value coming +// out across the dimension +// Iteration Domain - TEMPLATE STATE 1 +// - Not participating in the reduction, has values across the dimension after +// the reduction +// Collapsed Domain - TEMPLATE STATE 2 +// - Previously reduced, doesn't need to be reduced on that dimension, doesn't +// have values across that dimension +constexpr __device__ bool isReduce(int STATE) { + return STATE == 0; +} + +constexpr __device__ bool isIter(int STATE) { + return STATE == 1; +} + +constexpr __device__ bool isPred(int STATE) { + return STATE == 2; +} + +constexpr __device__ bool inactive(int STATE) { + return STATE == 3; +} + +constexpr __device__ bool activeNotIter(int STATE) { + return STATE != 3 && STATE != 1; +} + +// When generating an index into the reduction, we have to stride by iteration +// domains and reduction domains. Collapsed domains we can ignore, but we need +// to make sure they never read or write (need to be predicated to correct +// participation). + +// All inclusive reduction with option to re-broadcast. This reduction class +// does not use predication of parallelization in the read or write predicates. +// Instead there are 3 states each dimension of parallelization can have, +// described above. Predication, indexing, and reduction will be done based on +// this information. 
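Before the ParallelReduce class itself, the core shared-memory block reduction it builds on may be easier to follow in isolation. The following is a minimal sketch only (sum over threadIdx.x, hypothetical helper name blockSumX, shared_buf assumed to hold at least blockDim.x floats); the real code below generalizes the same round-down-to-a-power-of-two-then-tree-reduce pattern to arbitrary thread and block dimensions, arbitrary reduction ops, and tuples of values.

// Minimal sketch of the shared-memory tree reduction pattern used below.
__device__ float blockSumX(float in, float* shared_buf) {
  const int tid = threadIdx.x;
  const int size = blockDim.x;
  shared_buf[tid] = in;
  __syncthreads();

  // Round the reduction size down to the nearest power of two.
  const int np2 = 1 << (31 - __clz(size));

  // Fold the tail [np2, size) into [0, np2) so the tree is regular.
  if (tid < np2 && tid + np2 < size) {
    shared_buf[tid] += shared_buf[tid + np2];
  }
  __syncthreads();

  // Standard tree reduction down to two elements.
  for (int factor = np2 / 2; factor > 1; factor >>= 1) {
    if (tid < factor) {
      shared_buf[tid] += shared_buf[tid + factor];
    }
    __syncthreads();
  }

  // Thread 0 accumulates the last pair; other threads keep their input.
  float result = in;
  if (tid == 0) {
    result = shared_buf[0];
    if (np2 > 1) {
      result += shared_buf[1];
    }
  }
  __syncthreads(); // forward-protect shared_buf before any reuse
  return result;
}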
+template < + int X_BLOCK, + int Y_BLOCK, + int Z_BLOCK, + int X_THREAD, + int Y_THREAD, + int Z_THREAD, + bool PERSISTENT_REDUCTION, + bool BROADCAST> +class ParallelReduce { + static constexpr bool BLOCK_REDUCE = + isReduce(X_THREAD) || isReduce(Y_THREAD) || isReduce(Z_THREAD); + + static constexpr bool GRID_REDUCE = + isReduce(X_BLOCK) || isReduce(Y_BLOCK) || isReduce(Z_BLOCK); + + // ping-pong between global buffers to avoid a second sync + bool flip = false; + + public: + __device__ ParallelReduce() {} + + template + __device__ __inline__ void reduce( + RefTuple out, + const ConstRefTuple& inp, + VolatilePtrTuple global_work_buffer, + int64_t* global_sync_buffer, // Allocated as product of all + // non-participating Grid dimension + PtrTuple shared_buf, + bool read_pred, // Prevent reading from out of bounds memory + bool write_pred, // Prevent from writing out of bounds + const LocalTuple& init_val, + Func reduction_op) { + // If no reduction needed, just return input + if (!BLOCK_REDUCE && !GRID_REDUCE) { + if (read_pred && write_pred) { + out = inp; + } + return; + } + + // Don't read/write in temporary buffers if in a predicated dimension + bool block_reduce_participate = index_utils:: + maskedIsZero( + threadIdx); + + // Initialize block result + LocalTuple block_result = init_val; + + // Grab input data if participating in the reduction, set to block_result in + // the case there is no block reduction + if (block_reduce_participate && read_pred) { + block_result = inp; + } + + // Only threads that with id == 0 in the dimensions being reduced will + // have a valid result + bool has_block_result = index_utils::maskedIsZero< + isReduce(X_THREAD), + isReduce(Y_THREAD), + isReduce(Z_THREAD)>(threadIdx); + + if (BLOCK_REDUCE) { + // -- START BLOCK REDUCTION -- // + + // Size of the block reduction segment, can be an int since it's limited + // to number of threads + int block_reduction_size = index_utils::maskedSize< + isReduce(X_THREAD), + isReduce(Y_THREAD), + isReduce(Z_THREAD)>(blockDim); + + // Index in the reduction segment, can be an int since it's limited to + // number of threads + int tid_in_block_reduction = index_utils::maskedOffset< + isReduce(X_THREAD), + isReduce(Y_THREAD), + isReduce(Z_THREAD)>(threadIdx, blockDim); + + // ID of the block reduction this thread is participating in + // + // If any of the parallel dimensions are predicated out, that means + // they've already been reduced, so we only care about the first thread in + // that dimension. 
Therefore don't expand the reduction_idx by that + // dimension + int block_reduction_idx = index_utils:: + maskedOffset( + threadIdx, blockDim); + + // Shared memory buffer is 2D + // [iter dimension, reduction dimension] + + // Offset into smem for the current thread + int block_reduce_smem_offset = + block_reduction_idx * block_reduction_size + tid_in_block_reduction; + + // Initialize shared memory + if (block_reduce_participate) { + copyTuple(shared_buf, block_reduce_smem_offset, block_result); + } + + // Sync to make sure smem is completely initialized + block_sync::sync(); + + // Round reduction size down to nearest power of 2 + int np2 = 1 << (31 - __clz(block_reduction_size)); + + // Perform an initial reduction leaving np2 elements + if (block_reduce_participate && tid_in_block_reduction < np2 && + tid_in_block_reduction + np2 < block_reduction_size) { + reduce( + shared_buf, + block_reduce_smem_offset, + shared_buf, + block_reduce_smem_offset + np2, + reduction_op); + } + + // Always need to sync while operating on shared memory + block_sync::sync(); + + // Reduce down until 2 values, leaving 2 values allows us to manually + // perform the last reduction and avoid a syncthreads + for (int factor = np2 / 2; factor > 1; factor >>= 1) { + if (tid_in_block_reduction < factor && block_reduce_participate) { + reduce( + shared_buf, + block_reduce_smem_offset, + shared_buf, + block_reduce_smem_offset + factor, + reduction_op); + } + block_sync::sync(); + } + + // Accumulate that last valid result + if (has_block_result) { + copyTuple(block_result, shared_buf, block_reduce_smem_offset); + if (block_reduction_size > 1) { + reduce( + block_result, + 0, + shared_buf, + block_reduce_smem_offset + 1, + reduction_op); + } + } + + // ===== BLOCK REDUCTION CLEANUP ======= + if (!GRID_REDUCE) { + // If no grid reduction, we don't have to continue. Either broadcast + // back across the block or return the correct reduction + if (has_block_result && write_pred) { + reduce(block_result, 0, out, 0, reduction_op); + out = block_result; + } + if (BROADCAST) { + // No grid reduce, but need to broadcast, perform block broadcast + if (has_block_result && write_pred) { + // Put result back in shared memory, put in the first entry of the + // reduction segment's buffer + copyTuple( + shared_buf, + block_reduction_idx * block_reduction_size, + block_result); + } + + // Sync threads to make sure result is in smem + block_sync::sync(); + // If the thread is participating, and is not attempting to write out + // of bounds, return the broadcasted value. + if (block_reduce_participate && write_pred) { + copyTuple( + out, shared_buf, block_reduction_idx * block_reduction_size); + } + } + + // Forward protect shared memory, don't want threads to continue to + // another reduction/broadcast and pollute shared memory before the + // reduction is completely finished. + // + // This could be avoided in some cases if we added thread syncs from + // block reductions in the syncthread insertion pass. + block_sync::sync(); + return; + } + } + + // -- START GRID REDUCTION -- // + // Grid reductions are more challenging for two reasons, (1) the reduction + // itself is 3D instead of 2D because we now have an iter domain space in + // the grid dimension. (2) a tree reduction isn't performed, instead all + // blocks will populate GMEM and one block will finish the grid reduction. 
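As a standalone illustration of that global-memory step (every block publishes a partial result, and the block that arrives last finishes the reduction), a simplified 1D float-sum version might look like the sketch below. The kernel name gridSumSketch and its layout are hypothetical; work_buf needs gridDim.x entries, sync_flag must be zero-initialized, and the real code here additionally handles iteration dimensions, tuple values, predication, and persistent double buffering.

// Sketch of the "all blocks publish, the last block finishes" pattern.
__global__ void gridSumSketch(
    const float* in,
    int n,
    float* work_buf,          // gridDim.x entries
    unsigned int* sync_flag,  // zero-initialized arrival counter
    float* out) {
  // Per-block partial; a shared-memory atomic stands in for a proper
  // block reduction here.
  __shared__ float block_partial;
  __shared__ bool is_last_block;
  if (threadIdx.x == 0) {
    block_partial = 0.f;
  }
  __syncthreads();
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) {
    atomicAdd(&block_partial, in[idx]);
  }
  __syncthreads();

  // Publish the partial result and count arrivals.
  if (threadIdx.x == 0) {
    work_buf[blockIdx.x] = block_partial;
    __threadfence(); // make the partial visible before signaling arrival
    const unsigned int arrived = atomicAdd(sync_flag, 1u);
    is_last_block = (arrived == gridDim.x - 1);
  }
  __syncthreads();

  // Only the block that arrived last reduces the per-block partials.
  if (is_last_block && threadIdx.x == 0) {
    float sum = 0.f;
    for (unsigned int i = 0; i < gridDim.x; ++i) {
      sum += work_buf[i];
    }
    *out = sum;
  }
}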
+ + // What is the grid reduction size, block reduction already performed so + // that doesn't have to be taken into consideration + const auto grid_red_size = index_utils:: + maskedSize( + gridDim); + + // Which ID in the reduction is this block. Threads can participate in + // multiple grid reductions, but the block will have the same relative index + // in those reductions + const auto idx_in_grid_red = index_utils:: + maskedOffset( + blockIdx, gridDim); + + if (PERSISTENT_REDUCTION && flip) { + auto global_buffer_size = + index_utils:: + maskedSize( + gridDim) * + grid_red_size; + global_work_buffer += global_buffer_size; + } + flip = ~flip; + + // How many grid reductions have to be performed, in the grid dimension + const auto num_block_iters = index_utils:: + maskedSize(gridDim); + + // Which grid reduction does this block participate in, in the grid + // dimension + const auto block_red_idx_offset = index_utils:: + maskedOffset( + blockIdx, gridDim); + + // How many grid reductions have to be performed, in the block dimension + const auto num_thread_iters = index_utils:: + maskedSize( + blockDim); + + // Which grid reduction does this thread participate in, in the block + // dimension + const auto thread_red_idx_offset = index_utils:: + maskedOffset( + threadIdx, blockDim); + + // 3D buffer of reductions: + // [reduction_offset(grid), iter_offset(grid), iter_offset(block)] + // Offset into the work buffer + const auto work_buf_offset = + (idx_in_grid_red * num_block_iters + block_red_idx_offset) * + num_thread_iters + + thread_red_idx_offset; + + // Don't read/write in temporary buffers if in a predicated dimension + bool grid_reduce_participate = index_utils:: + maskedIsZero( + blockIdx); + + if (grid_reduce_participate && block_reduce_participate) { + if (has_block_result) { + copyTuple(global_work_buffer, work_buf_offset, block_result); + } + } + + // -- GLOBAL BUFFER FILLED -- // + + bool last_block = index_utils:: + maskedIsLast( + blockIdx, gridDim); + + if (grid_reduce_participate) { + // Don't need to sync up blocks that are not participating in this + // reduction + grid_sync::sync< + isReduce(X_BLOCK), + isReduce(Y_BLOCK), + isReduce(Z_BLOCK), + PERSISTENT_REDUCTION>( + global_sync_buffer[block_red_idx_offset], grid_red_size, last_block); + } + + // -- START BLOCK CLEANUP -- // + // All blocks perform the last cleanup, so every block, and every thread + // will have the final result + + // Initialize block result + LocalTuple last_block_result(init_val); + + if ((PERSISTENT_REDUCTION || last_block) && grid_reduce_participate) { + // Can use the last block to reduce all the values the blocks filled in. 
+ // Can use any thread that has been predicated, or has been reduced to do + // this reduction, cannot use any block that's associated with an + // iteration domain + + // Start with non-block reduction + + // Index in the reduction segment + int tid_in_block_reduction_2 = index_utils::maskedOffset< + activeNotIter(X_THREAD), + activeNotIter(Y_THREAD), + activeNotIter(Z_THREAD)>(threadIdx, blockDim); + + int block_reduction_size_2 = index_utils::maskedSize< + activeNotIter(X_THREAD), + activeNotIter(Y_THREAD), + activeNotIter(Z_THREAD)>(blockDim); + + // 3D buffer of reductions: + // [reduction_offset(grid), iter_offset(grid), iter_offset(block)] + // Change the offset, we want to keep the last two dimensions, but the + // first dimension is what we will reduce over + const auto work_buf_offset_2 = + block_red_idx_offset * num_thread_iters + thread_red_idx_offset; + for (auto reduction_i = tid_in_block_reduction_2; + reduction_i < grid_red_size; + reduction_i += block_reduction_size_2) { + reduce( + last_block_result, + 0, + global_work_buffer, + work_buf_offset_2 + + reduction_i * num_block_iters * + num_thread_iters, // Iterating over the outer most + // dimension, so need to stride by the + // total number of grid reductions. Could + // come back and change it so this is the + // contiguous dimension + reduction_op); + } + + // -- START LAST BLOCK - BLOCK REDUCTION -- // + + // Reduced so we have one value per thread, we need to further reduce any + // dimension that is not an iter dimension + + // Which block reduction this thread is participating in + int block_reduction_idx = index_utils:: + maskedOffset( + threadIdx, blockDim); + + // Offset in smem for this thread's result + auto smem_offset = block_reduction_idx * block_reduction_size_2 + + tid_in_block_reduction_2; + + // Similar as before, reduce down to nearest power of 2 so we can do a + // tree reduction + int np2 = 1 << (31 - __clz(min(block_reduction_size_2, grid_red_size))); + + // Threads values are initialized, so all can participate here + if (tid_in_block_reduction_2 >= np2) { + copyTuple(shared_buf, smem_offset, last_block_result); + } + + block_sync::sync(); + + if (tid_in_block_reduction_2 < np2 && + tid_in_block_reduction_2 + np2 < + min(block_reduction_size_2, grid_red_size)) { + reduce( + last_block_result, 0, shared_buf, smem_offset + np2, reduction_op); + } + + if (tid_in_block_reduction_2 < np2) { + copyTuple(shared_buf, smem_offset, last_block_result); + } + + // Always sync when communicating across smem + block_sync::sync(); + + // Reduce down to 2 values, last thread will do the final reduction and + // can save a syncthreads this way + for (int factor = np2 / 2; factor > 1; factor >>= 1) { + if (tid_in_block_reduction_2 < factor) { + reduce( + shared_buf, + smem_offset, + shared_buf, + smem_offset + factor, + reduction_op); + } + block_sync::sync(); + } + + // If this thread in each block has the final result before broadcasting + // to all other threads in block + bool has_block_result_2 = index_utils::maskedIsZero< + activeNotIter(X_THREAD), + activeNotIter(Y_THREAD), + activeNotIter(Z_THREAD)>(threadIdx); + // Do the last reduction, protected by the write predicate + copyTuple(last_block_result, shared_buf, smem_offset); + if (has_block_result && grid_reduce_participate) { + reduce(last_block_result, 0, out, 0, reduction_op); + if (min(block_reduction_size_2, grid_red_size) > 1) { + reduce( + last_block_result, 0, shared_buf, smem_offset + 1, reduction_op); + } + } + if (grid_reduce_participate && 
PERSISTENT_REDUCTION) { + // If persistent reduction, always broadcast reduced values + copyTuple(shared_buf, smem_offset, last_block_result); + block_sync::sync(); + if (write_pred && block_reduce_participate) { + copyTuple( + out, shared_buf, block_reduction_idx * block_reduction_size_2); + } + // For persistent kernels we double the global buffer allocation so we + // don't need to protect those buffers every iteration preventing the + // need of an additional grid_sync. Since we flip back and forth between + // sections of the buffer, the one grid sync protects the other part of + // the buffer. + + } else { + // Forward protect the smem used in this reduction + if (grid_reduce_participate) { + if (last_block && has_block_result && block_reduce_participate && + write_pred) { + copyTuple( + out, shared_buf, block_reduction_idx * block_reduction_size_2); + } + } + block_sync::sync(); + } + } + } + + // Only unary tuples are supported, i.e., no Welford tuple is allowed. + template < + typename Func1, + typename DataType1, + typename Func2, + typename DataType2> + __device__ __inline__ void reduceGroup( + RefTuple out1, + const ConstRefTuple& inp1, + VolatilePtrTuple global_work_buffer1, + const LocalTuple& init_val1, + Func1 reduction_op1, + RefTuple out2, + const ConstRefTuple& inp2, + VolatilePtrTuple global_work_buffer2, + const LocalTuple& init_val2, + Func2 reduction_op2, + int64_t* global_sync_buffer, // Allocated as product of all + // non-participating Grid dimension + void* shared_mem, + bool read_pred, // Prevent reading from out of bounds memory + bool write_pred) { // Prevent from writing out of bounds + // If no reduction needed, just return input + if (!BLOCK_REDUCE && !GRID_REDUCE) { + if (read_pred && write_pred) { + out1 = inp1; + out2 = inp2; + } + return; + } + + // Don't read/write in temporary buffers if in a predicated dimension + const bool block_reduce_participate = index_utils:: + maskedIsZero( + threadIdx); + + // Only threads that with id == 0 in the dimensions being reduced will + // have a valid result + const bool has_block_result = index_utils::maskedIsZero< + isReduce(X_THREAD), + isReduce(Y_THREAD), + isReduce(Z_THREAD)>(threadIdx); + + // Block reduction only + if (!GRID_REDUCE) { + reduceBlock( + out1, + inp1, + init_val1, + reduction_op1, + shared_mem, + read_pred, + write_pred, + block_reduce_participate, + has_block_result); + reduceBlock( + out2, + inp2, + init_val2, + reduction_op2, + shared_mem, + read_pred, + write_pred, + block_reduce_participate, + has_block_result); + return; + } + + // -- START GRID REDUCTION -- // + // Grid reductions are more challenging for two reasons, (1) the reduction + // itself is 3D instead of 2D because we now have an iter domain space in + // the grid dimension. (2) a tree reduction isn't performed, instead all + // blocks will populate GMEM and one block will finish the grid reduction. + + // What is the grid reduction size, block reduction already performed so + // that doesn't have to be taken into consideration + const auto grid_red_size = index_utils:: + maskedSize( + gridDim); + + // Which ID in the reduction is this block. 
Threads can participate in + // multiple grid reductions, but the block will have the same relative index + // in those reductions + const auto idx_in_grid_red = index_utils:: + maskedOffset( + blockIdx, gridDim); + + // How many grid reductions have to be performed, in the grid dimension + const auto num_block_iters = index_utils:: + maskedSize(gridDim); + + // Which grid reduction does this block participate in, in the grid + // dimension + const auto block_red_idx_offset = index_utils:: + maskedOffset( + blockIdx, gridDim); + + // How many grid reductions have to be performed, in the block dimension + const auto num_thread_iters = index_utils:: + maskedSize( + blockDim); + + // Which grid reduction does this thread participate in, in the block + // dimension + const auto thread_red_idx_offset = index_utils:: + maskedOffset( + threadIdx, blockDim); + + // 3D buffer of reductions: + // [reduction_offset(grid), iter_offset(grid), iter_offset(block)] + // Offset into the work buffer + const auto work_buf_offset = + (idx_in_grid_red * num_block_iters + block_red_idx_offset) * + num_thread_iters + + thread_red_idx_offset; + + // Don't read/write in temporary buffers if in a predicated dimension + bool grid_reduce_participate = index_utils:: + maskedIsZero( + blockIdx); + + if (PERSISTENT_REDUCTION && flip) { + auto global_buffer_size = + index_utils:: + maskedSize( + gridDim) * + grid_red_size; + global_work_buffer1 += global_buffer_size; + global_work_buffer2 += global_buffer_size; + } + flip = ~flip; + + // Per-block partial reduction to global work buffer + { + const auto block_result = reduceBlock( + out1, + inp1, + init_val1, + reduction_op1, + shared_mem, + read_pred, + write_pred, + block_reduce_participate, + has_block_result); + if (grid_reduce_participate && block_reduce_participate) { + if (has_block_result) { + copyTuple(global_work_buffer1, work_buf_offset, block_result); + } + } + } + { + const auto block_result = reduceBlock( + out2, + inp2, + init_val2, + reduction_op2, + shared_mem, + read_pred, + write_pred, + block_reduce_participate, + has_block_result); + if (grid_reduce_participate && block_reduce_participate) { + if (has_block_result) { + copyTuple(global_work_buffer2, work_buf_offset, block_result); + } + } + } + + // -- GLOBAL BUFFER FILLED -- // + + bool last_block = index_utils:: + maskedIsLast( + blockIdx, gridDim); + + if (grid_reduce_participate) { + // Don't need to sync up blocks that are not participating in this + // reduction + grid_sync::sync< + isReduce(X_BLOCK), + isReduce(Y_BLOCK), + isReduce(Z_BLOCK), + PERSISTENT_REDUCTION>( + global_sync_buffer[block_red_idx_offset], grid_red_size, last_block); + } + + // -- START BLOCK CLEANUP -- // + reduceLastBlock( + out1, + global_work_buffer1, + init_val1, + reduction_op1, + shared_mem, + block_red_idx_offset, + num_thread_iters, + num_block_iters, + thread_red_idx_offset, + grid_red_size, + write_pred, + last_block, + block_reduce_participate, + grid_reduce_participate, + has_block_result); + reduceLastBlock( + out2, + global_work_buffer2, + init_val2, + reduction_op2, + shared_mem, + block_red_idx_offset, + num_thread_iters, + num_block_iters, + thread_red_idx_offset, + grid_red_size, + write_pred, + last_block, + block_reduce_participate, + grid_reduce_participate, + has_block_result); + } + + private: + // Almost exact copy of the initial block reduction part in the + // reduce function, but only unary tuples are supported as there's + // only one shared-memory buffer. 
As such, this can't be used with + // the non-group reduce function. + template + __device__ __inline__ LocalTuple reduceBlock( + RefTuple& out, + const ConstRefTuple& inp, + const LocalTuple& init_val, + Func reduction_op, + void* shared_mem, + bool read_pred, + bool write_pred, + bool block_reduce_participate, + bool has_block_result) { + PtrTuple shared_buf(static_cast(shared_mem)); + + // Initialize block result + LocalTuple block_result = init_val; + + // Grab input data if participating in the reduction, set to block_result in + // the case there is no block reduction + if (block_reduce_participate && read_pred) { + block_result = inp; + } + + // Size of the block reduction segment, can be an int since it's limited + // to number of threads + int block_reduction_size = index_utils:: + maskedSize( + blockDim); + + // Index in the reduction segment, can be an int since it's limited to + // number of threads + int tid_in_block_reduction = index_utils::maskedOffset< + isReduce(X_THREAD), + isReduce(Y_THREAD), + isReduce(Z_THREAD)>(threadIdx, blockDim); + + // ID of the block reduction this thread is participating in + // + // If any of the parallel dimensions are predicated out, that means + // they've already been reduced, so we only care about the first thread in + // that dimension. Therefore don't expand the reduction_idx by that + // dimension + int block_reduction_idx = index_utils:: + maskedOffset( + threadIdx, blockDim); + + // Shared memory buffer is 2D + // [iter dimension, reduction dimension] + + // Offset into smem for the current thread + int block_reduce_smem_offset = + block_reduction_idx * block_reduction_size + tid_in_block_reduction; + + // Initialize shared memory + if (block_reduce_participate) { + copyTuple(shared_buf, block_reduce_smem_offset, block_result); + } + + // Sync to make sure smem is completely initialized + block_sync::sync(); + + // Round reduction size down to nearest power of 2 + int np2 = 1 << (31 - __clz(block_reduction_size)); + + // Perform an initial reduction leaving np2 elements + if (block_reduce_participate && tid_in_block_reduction < np2 && + tid_in_block_reduction + np2 < block_reduction_size) { + reduce( + shared_buf, + block_reduce_smem_offset, + shared_buf, + block_reduce_smem_offset + np2, + reduction_op); + } + + // Always need to sync while operating on shared memory + block_sync::sync(); + + // Reduce down until 2 values, leaving 2 values allows us to manually + // perform the last reduction and avoid a syncthreads + for (int factor = np2 / 2; factor > 1; factor >>= 1) { + if (tid_in_block_reduction < factor && block_reduce_participate) { + reduce( + shared_buf, + block_reduce_smem_offset, + shared_buf, + block_reduce_smem_offset + factor, + reduction_op); + } + block_sync::sync(); + } + + // Accumulate that last valid result + if (has_block_result) { + copyTuple(block_result, shared_buf, block_reduce_smem_offset); + if (block_reduction_size > 1) { + reduce( + block_result, + 0, + shared_buf, + block_reduce_smem_offset + 1, + reduction_op); + } + } + + // ===== BLOCK REDUCTION CLEANUP ======= + if (!GRID_REDUCE) { + // If no grid reduction, we don't have to continue. 
Either broadcast + // back across the block or return the correct reduction + if (has_block_result && write_pred) { + reduce(block_result, 0, out, 0, reduction_op); + out = block_result; + } + if (BROADCAST) { + // No grid reduce, but need to broadcast, perform block broadcast + if (has_block_result && write_pred) { + // Put result back in shared memory, put in the first entry of the + // reduction segment's buffer + copyTuple( + shared_buf, + block_reduction_idx * block_reduction_size, + block_result); + } + + // Sync threads to make sure result is in smem + block_sync::sync(); + // If the thread is participating, and is not attempting to write out + // of bounds, return the broadcasted value. + if (block_reduce_participate && write_pred) { + copyTuple( + out, shared_buf, block_reduction_idx * block_reduction_size); + } + } + + // Forward protect shared memory, don't want threads to continue to + // another reduction/broadcast and pollute shared memory before the + // reduction is completely finished. + // + // This could be avoided in some cases if we added thread syncs from + // block reductions in the syncthread insertion pass. + block_sync::sync(); + } + + return block_result; + } + + // Almost exact copy of the last-block reduction in the reduce + // function, but only unary tuples are supported as there's only one + // shared-memory buffer. As such, this can't be used with the + // non-group reduce function. + template + __device__ __inline__ void reduceLastBlock( + RefTuple& out, + const VolatilePtrTuple& global_work_buffer, + const LocalTuple& init_val, + Func reduction_op, + void* shared_mem, + nvfuser_index_t block_red_idx_offset, + nvfuser_index_t num_thread_iters, + nvfuser_index_t num_block_iters, + nvfuser_index_t thread_red_idx_offset, + nvfuser_index_t grid_red_size, + bool write_pred, + bool last_block, + bool block_reduce_participate, + bool grid_reduce_participate, + bool has_block_result) { + // Initialize block result + LocalTuple last_block_result(init_val); + + PtrTuple shared_buf(static_cast(shared_mem)); + + if ((PERSISTENT_REDUCTION || last_block) && grid_reduce_participate) { + // Can use the last block to reduce all the values the blocks filled in. + // Can use any thread that has been predicated, or has been reduced to do + // this reduction, cannot use any block that's associated with an + // iteration domain + + // Start with non-block reduction + + // Index in the reduction segment + int tid_in_block_reduction_2 = index_utils::maskedOffset< + activeNotIter(X_THREAD), + activeNotIter(Y_THREAD), + activeNotIter(Z_THREAD)>(threadIdx, blockDim); + + int block_reduction_size_2 = index_utils::maskedSize< + activeNotIter(X_THREAD), + activeNotIter(Y_THREAD), + activeNotIter(Z_THREAD)>(blockDim); + + // 3D buffer of reductions: + // [reduction_offset(grid), iter_offset(grid), iter_offset(block)] + // Change the offset, we want to keep the last two dimensions, but the + // first dimension is what we will reduce over + const auto work_buf_offset_2 = + block_red_idx_offset * num_thread_iters + thread_red_idx_offset; + for (auto reduction_i = tid_in_block_reduction_2; + reduction_i < grid_red_size; + reduction_i += block_reduction_size_2) { + reduce( + last_block_result, + 0, + global_work_buffer, + work_buf_offset_2 + + reduction_i * num_block_iters * + num_thread_iters, // Iterating over the outer most + // dimension, so need to stride by the + // total number of grid reductions. 
Could + // come back and change it so this is the + // contiguous dimension + reduction_op); + } + + // -- START LAST BLOCK - BLOCK REDUCTION -- // + + // Reduced so we have one value per thread, we need to further reduce any + // dimension that is not an iter dimension + + // Which block reduction this thread is participating in + int block_reduction_idx = index_utils:: + maskedOffset( + threadIdx, blockDim); + + // Offset in smem for this thread's result + auto smem_offset = block_reduction_idx * block_reduction_size_2 + + tid_in_block_reduction_2; + + // Similar as before, reduce down to nearest power of 2 so we can do a + // tree reduction + int np2 = 1 << (31 - __clz(min(block_reduction_size_2, grid_red_size))); + + // Threads values are initialized, so all can participate here + if (tid_in_block_reduction_2 >= np2) { + copyTuple(shared_buf, smem_offset, last_block_result); + } + + block_sync::sync(); + + if (tid_in_block_reduction_2 < np2 && + tid_in_block_reduction_2 + np2 < + min(block_reduction_size_2, grid_red_size)) { + reduce( + last_block_result, 0, shared_buf, smem_offset + np2, reduction_op); + } + + if (tid_in_block_reduction_2 < np2) { + copyTuple(shared_buf, smem_offset, last_block_result); + } + + // Always sync when communicating across smem + block_sync::sync(); + + // Reduce down to 2 values, last thread will do the final reduction and + // can save a syncthreads this way + for (int factor = np2 / 2; factor > 1; factor >>= 1) { + if (tid_in_block_reduction_2 < factor) { + reduce( + shared_buf, + smem_offset, + shared_buf, + smem_offset + factor, + reduction_op); + } + block_sync::sync(); + } + + // If this thread in each block has the final result before broadcasting + // to all other threads in block + + // Do the last reduction, protected by the write predicate + copyTuple(last_block_result, shared_buf, smem_offset); + if (has_block_result && grid_reduce_participate) { + reduce(last_block_result, 0, out, 0, reduction_op); + if (min(block_reduction_size_2, grid_red_size) > 1) { + reduce( + last_block_result, 0, shared_buf, smem_offset + 1, reduction_op); + } + } + + if (grid_reduce_participate && PERSISTENT_REDUCTION) { + // If persistent reduction, always broadcast reduced values + copyTuple(shared_buf, smem_offset, last_block_result); + block_sync::sync(); + if (write_pred && block_reduce_participate) { + copyTuple( + out, shared_buf, block_reduction_idx * block_reduction_size_2); + } + // For persistent kernels we double the global buffer allocation so we + // don't need to protect those buffers every iteration preventing the + // need of an additional grid_sync. Since we flip back and forth between + // sections of the buffer, the one grid sync protects the other part of + // the buffer. 
+ + } else { + // Forward protect the smem used in this reduction + if (grid_reduce_participate) { + if (last_block && has_block_result && block_reduce_participate && + write_pred) { + copyTuple( + out, shared_buf, block_reduction_idx * block_reduction_size_2); + } + } + block_sync::sync(); + } + } + } + + template + __inline__ __device__ static void reduce( + TupleType0& val0, + nvfuser_index_t offset0, + const TupleType1& val1, + nvfuser_index_t offset1, + Func reduction_op) { + static_assert( + TupleType0::num_vals == TupleType1::num_vals, + "Invalid number of values"); + TupleReduce::reduce( + val0, offset0, val1, offset1, reduction_op); + } + + template < + typename TupleType0, + typename TupleType1, + typename Func, + int num_vals> + struct TupleReduce {}; + + template + struct TupleReduce { + __inline__ __device__ static void reduce( + TupleType0& val0, + nvfuser_index_t offset0, + const TupleType1& val1, + nvfuser_index_t offset1, + Func reduction_op) { + static_assert( + IsSameType< + typename TupleType0::ValTypes, + typename TupleType1::ValTypes>::value, + "Invalid value types"); + reduction_op(val0.val<0>(offset0), val1.val<0>(offset1)); + } + }; + + template + struct TupleReduce { + __inline__ __device__ static void reduce( + TupleType0& val0, + nvfuser_index_t offset0, + const TupleType1& val1, + nvfuser_index_t offset1, + Func reduction_op) { + static_assert( + IsSameType< + typename TupleType0::ValTypes, + typename TupleType1::ValTypes>::value, + "Invalid value types"); + reduction_op( + val0.val<0>(offset0), + val0.val<1>(offset0), + val1.val<0>(offset1), + val1.val<1>(offset1)); + } + }; + + template + struct TupleReduce { + __inline__ __device__ static void reduce( + TupleType0& val0, + nvfuser_index_t offset0, + const TupleType1& val1, + nvfuser_index_t offset1, + Func reduction_op) { + static_assert( + IsSameType< + typename TupleType0::ValTypes, + typename TupleType1::ValTypes>::value, + "Invalid value types"); + reduction_op( + val0.val<0>(offset0), + val0.val<1>(offset0), + val0.val<2>(offset0), + val1.val<0>(offset1), + val1.val<1>(offset1), + val1.val<2>(offset1)); + } + }; + + // End Parallel reduce class +}; + +} // namespace fused_reduction diff --git a/torch/csrc/jit/codegen/cuda/runtime/grid_reduction.cu b/torch/csrc/jit/codegen/cuda/runtime/grid_reduction.cu index a75d0d5904a5..d3a15be0ae80 100644 --- a/torch/csrc/jit/codegen/cuda/runtime/grid_reduction.cu +++ b/torch/csrc/jit/codegen/cuda/runtime/grid_reduction.cu @@ -69,7 +69,7 @@ template < typename Func> __device__ void gridReduceLastBlock( T& out, - const T* in, + const volatile T* in, const nvfuser_index_t grid_reduction_segment_size, // Number of reductions across // grid reduce dimensions @@ -129,7 +129,7 @@ __device__ void gridReduceLastBlock( } } -// Reduces per-thread values across thread blocks. +// Reduces per-thread values across threads and thread blocks. // // Function parameters: // - out: Per-thread output location @@ -143,14 +143,8 @@ __device__ void gridReduceLastBlock( // reduction dimension // // Template parameters: -// - X/Y/Z_BLOCK: When true, reduces across thread blocks along the X/Y/Z +// - X/Y/Z_BLOCK/THREAD: When true, reduces across thread blocks along the X/Y/Z // dimensions -// - X/Y/Z_THREAD: When true, all threads along the X/Y/Z dimensions participate -// in the cross-block reduction. Otherwise, only threads at offset 0 do. 
-// These are set to true if the dimension in the block has not been reduced -// previously in producer tensors, and does not participate in the reduction -// (right now they can't), so it's just a "pure" iteration domain as far as -// the grid reduce is concerned. // - PERSISTENT_REDUCTION: Indicates grid reduction will be called in a loop, or // the result of the grid reduction will be broadcasted and used across the // grid. These requires cross grid communication and the grid synchronizations @@ -173,21 +167,18 @@ __device__ void gridReduceLastBlock( // blocks that have the same blockDim.x. There will be blockDim.y*blockDim.z // such segments. // -// X/Y/Z_THREAD defines a sub region of a thread block that should be reduced -// with the sub regions of other thread blocks. We call it a reduction block. -// E.g., -// -// Case 1: X/Y/Z_THREAD == false/false/false -> Only thread 0 participates in -// the cross-block reductions. The reduction block is 1x1x1 with thread 0. -// -// Case 2: X/Y/Z_THREAD == true/true/true-> All threads in a thread block -// participate in the cross-block reductions. The reduction block in this case -// is equivalent to the thread block. +// X/Y/Z_THREAD also works similarly as X/Y/Z_BLOCK and defines a +// group of threads that are reduced togather. // // After the function completes, only one thread block per reduction segment // gets valid reduction results. There is no guarantee which particular block // gets the final results. // +// entrance_ind and n_entrances are allowed when PERSISTENT_REDUCTION = false. +// If a grid reduction call is only called once per thread, entrance_ind == 0 +// and n_entrances == 1. However, grid reduction can be called in a loop in a +// thread, in that case entrance_ind is the count of times the function has been +// called, and n_entrances is the total number of times it will be called. template < bool X_BLOCK, bool Y_BLOCK, @@ -203,11 +194,35 @@ __device__ void gridReduce( const T& inp_val, Func reduction_op, volatile T* work_buf, - Tensor sync_flags, + int64_t* sync_flags, T* shared_buf, bool read_pred, bool write_pred, - T init_val) { + T init_val, + const nvfuser_index_t entrance_ind, + const nvfuser_index_t n_entrances) { + T block_reduction_val = init_val; + + // entrance index only matters for non-persistent re-entrant grid reductions. + const nvfuser_index_t entrance_ind_ = PERSISTENT_REDUCTION ? 0 : entrance_ind; + const nvfuser_index_t n_entrances_ = PERSISTENT_REDUCTION ? 1 : n_entrances; + + // Do block reduction when required + if (X_THREAD || Y_THREAD || Z_THREAD) { + blockReduce( + block_reduction_val, + inp_val, + reduction_op, + threadIdx, + blockDim, + shared_buf, + read_pred, + true, + init_val); + } else if (read_pred) { + block_reduction_val = inp_val; + } + // Number of values to reduce in the reduction segment const auto grid_reduction_segment_size = index_utils::maskedSize(gridDim); @@ -221,38 +236,46 @@ __device__ void gridReduce( // Number of threads we can use in final reduction, Seems to assume all // threads in the block participate const auto block_reduction_segment_size = - index_utils::maskedSize(blockDim); + index_utils::maskedSize(blockDim); + + // Number of reductions in the grid + const nvfuser_index_t grid_segment_size = PERSISTENT_REDUCTION + ? 
1 + : index_utils::maskedSize(gridDim); // advance to the offset for this segment // index of reduction * size of the reduction * size of threads - work_buf += idx_in_grid_segment * grid_reduction_segment_size * - block_reduction_segment_size; + work_buf += (entrance_ind * grid_segment_size + idx_in_grid_segment) * + grid_reduction_segment_size * block_reduction_segment_size; - if ((X_THREAD || threadIdx.x == 0) && (Y_THREAD || threadIdx.y == 0) && - (Z_THREAD || threadIdx.z == 0)) { + if ((!X_THREAD || threadIdx.x == 0) && (!Y_THREAD || threadIdx.y == 0) && + (!Z_THREAD || threadIdx.z == 0)) { auto block_offset = index_utils::maskedOffset(blockIdx, gridDim); auto thread_offset = - index_utils::maskedOffset( + index_utils::maskedOffset( threadIdx, blockDim); auto work_buf_offset = block_offset * block_reduction_segment_size + thread_offset; - if (read_pred) { - work_buf[work_buf_offset] = inp_val; - } else { - work_buf[work_buf_offset] = init_val; - } + work_buf[work_buf_offset] = block_reduction_val; } + if (PERSISTENT_REDUCTION) { + grid_sync::sync( + sync_flags[idx_in_grid_segment], grid_reduction_segment_size); - grid_sync::sync( - sync_flags[idx_in_grid_segment], grid_reduction_segment_size); + } else { + // Use a different sync flag for each call + grid_sync::sync( + sync_flags[entrance_ind_ * grid_segment_size + idx_in_grid_segment], + grid_reduction_segment_size); + } bool last_block = index_utils::maskedIsLast(blockIdx, gridDim); if (last_block) { // Cleanup with block reduction - gridReduceLastBlock( + gridReduceLastBlock( out, (T*)work_buf, grid_reduction_segment_size, @@ -271,7 +294,175 @@ __device__ void gridReduce( } } -} // namespace reduction +template < + bool X_BLOCK, + bool Y_BLOCK, + bool Z_BLOCK, + bool X_THREAD, + bool Y_THREAD, + bool Z_THREAD, + typename T, + typename Func> +__device__ void gridReduce2PartialReduction( + const T& inp_val, + T init_val, + Func reduction_op, + volatile T* work_buf, + T* shared_buf, + bool read_pred, + nvfuser_index_t grid_reduction_segment_size, + nvfuser_index_t idx_in_grid_segment, + nvfuser_index_t block_reduction_segment_size) { + T block_reduction_val = init_val; + + // Do block reduction when required + if (X_THREAD || Y_THREAD || Z_THREAD) { + blockReduce( + block_reduction_val, + inp_val, + reduction_op, + threadIdx, + blockDim, + shared_buf, + read_pred, + true, + init_val); + } else if (read_pred) { + block_reduction_val = inp_val; + } + + if ((!X_THREAD || threadIdx.x == 0) && (!Y_THREAD || threadIdx.y == 0) && + (!Z_THREAD || threadIdx.z == 0)) { + auto block_offset = + index_utils::maskedOffset(blockIdx, gridDim); + auto thread_offset = + index_utils::maskedOffset( + threadIdx, blockDim); + auto work_buf_offset = + block_offset * block_reduction_segment_size + thread_offset; + work_buf[work_buf_offset] = block_reduction_val; + } +} + +// 2-way horizontally fused grid reduction +template < + bool X_BLOCK, + bool Y_BLOCK, + bool Z_BLOCK, + bool X_THREAD, + bool Y_THREAD, + bool Z_THREAD, + bool PERSISTENT_REDUCTION, + typename T1, + typename Func1, + typename T2, + typename Func2> +__device__ void gridReduceGroup( + T1& out1, + const T1& inp_val1, + T1 init_val1, + Func1 reduction_op1, + volatile T1* work_buf1, + T2& out2, + const T2& inp_val2, + T2 init_val2, + Func2 reduction_op2, + volatile T2* work_buf2, + int64_t* sync_flags, + void* shared_buf, + bool read_pred, + bool write_pred) { + // Number of values to reduce in the reduction segment + const auto grid_reduction_segment_size = + index_utils::maskedSize(gridDim); + 
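// --- Illustration only, not part of the diff above ---------------------
// Rough sizing implied by the work_buf indexing in gridReduce above: for
// non-persistent, re-entrant reductions the buffer is laid out as
// [entrance][grid segment][cooperating block][writing thread], so the
// element count is the product of those four extents. This is only a
// sketch of the layout, not nvfuser's actual allocator; the name and
// parameters are hypothetical.
__host__ __device__ inline int64_t gridReduceWorkBufSizeSketch(
    int64_t n_entrances,                  // times gridReduce is entered
    int64_t grid_segment_size,            // independent reductions in the grid
    int64_t grid_reduction_segment_size,  // blocks cooperating per reduction
    int64_t block_reduction_segment_size  // threads writing per block
) {
  return n_entrances * grid_segment_size * grid_reduction_segment_size *
      block_reduction_segment_size;
}
// -----------------------------------------------------------------------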
+ // Index of the reduction we're performing out of the + // grid_reduction_segment_size + const auto idx_in_grid_segment = + index_utils::maskedOffset( + blockIdx, gridDim); + + // Number of threads we can use in final reduction, Seems to assume all + // threads in the block participate + const auto block_reduction_segment_size = + index_utils::maskedSize(blockDim); + + // advance to the offset for this segment + // index of reduction * size of the reduction * size of threads + work_buf1 += idx_in_grid_segment * grid_reduction_segment_size * + block_reduction_segment_size; + + work_buf2 += idx_in_grid_segment * grid_reduction_segment_size * + block_reduction_segment_size; + + gridReduce2PartialReduction< + X_BLOCK, + Y_BLOCK, + Z_BLOCK, + X_THREAD, + Y_THREAD, + Z_THREAD>( + inp_val1, + init_val1, + reduction_op1, + work_buf1, + (T1*)shared_buf, + read_pred, + grid_reduction_segment_size, + idx_in_grid_segment, + block_reduction_segment_size); -#undef isize -#undef ioffset + gridReduce2PartialReduction< + X_BLOCK, + Y_BLOCK, + Z_BLOCK, + X_THREAD, + Y_THREAD, + Z_THREAD>( + inp_val2, + init_val2, + reduction_op2, + work_buf2, + (T2*)shared_buf, + read_pred, + grid_reduction_segment_size, + idx_in_grid_segment, + block_reduction_segment_size); + + grid_sync::sync( + sync_flags[idx_in_grid_segment], grid_reduction_segment_size); + + bool last_block = + index_utils::maskedIsLast(blockIdx, gridDim); + + if (last_block) { + // Cleanup with block reduction + gridReduceLastBlock( + out1, + work_buf1, + grid_reduction_segment_size, + block_reduction_segment_size, + reduction_op1, + (T1*)shared_buf, + write_pred, + init_val1); + gridReduceLastBlock( + out2, + work_buf2, + grid_reduction_segment_size, + block_reduction_segment_size, + reduction_op2, + (T2*)shared_buf, + write_pred, + init_val2); + } + + if (PERSISTENT_REDUCTION) { + // Make sure we're done with global memory before we allow the kernel to + // continue + grid_sync::sync( + sync_flags[idx_in_grid_segment], grid_reduction_segment_size); + } +} + +} // namespace reduction diff --git a/torch/csrc/jit/codegen/cuda/runtime/grid_sync.cu b/torch/csrc/jit/codegen/cuda/runtime/grid_sync.cu index 0ccb07142aaa..1a6d7437d925 100644 --- a/torch/csrc/jit/codegen/cuda/runtime/grid_sync.cu +++ b/torch/csrc/jit/codegen/cuda/runtime/grid_sync.cu @@ -18,7 +18,10 @@ __device__ T globalAsVolatile(volatile T& global_val) { // [X,Y,Z]_BLOCK. The granularity of this sync are those dimensions. I.E. // Marking X and Y but not Z means there should be Z semaphores of size X*Y. template -__device__ void sync(int64_t& semaphore, const uint64_t& segment_size) { +__device__ void sync( + int64_t& semaphore, + const uint64_t& segment_size, + const bool last_block) { // Finish all global memory transactions before synchronizing __threadfence(); @@ -36,8 +39,6 @@ __device__ void sync(int64_t& semaphore, const uint64_t& segment_size) { // Makes the assumption that blocks are in increasing order, this is not // guaranteed by CUDA but this is the current behavior, and unlikely to // change. - bool last_block = - index_utils::maskedIsLast(blockIdx, gridDim); if (last_block) { semaphore_increment = FIRST_UINT64_BIT - (segment_size - 1); } @@ -48,21 +49,86 @@ __device__ void sync(int64_t& semaphore, const uint64_t& segment_size) { // If for persistent kernels, lock all blocks until the semaphore has been // reached. Make sure we access semaphore as a volatile address so we get // the global memory updates. 
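// --- Illustration only, not part of the diff above ---------------------
// gridReduceGroup above reuses a single shared_buf for both partial block
// reductions, casting it first to T1* and then to T2*. A sketch of the
// sizing that implies, assuming blockReduce consumes one element per
// thread; the helper name is hypothetical.
template <typename T1, typename T2>
__host__ __device__ constexpr size_t groupedSharedBufBytesSketch(
    size_t threads_per_block) {
  // Large enough (and aligned enough, for these POD-like types) for the
  // bigger of the two element types.
  return (sizeof(T1) > sizeof(T2) ? sizeof(T1) : sizeof(T2)) *
      threads_per_block;
}
// -----------------------------------------------------------------------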
+ unsigned int ns = 8; while ((PERSISTENT || last_block) && ((oldArrive ^ globalAsVolatile(semaphore)) & FIRST_UINT64_BIT) == 0) { // Put a sleep here so we have some breaks in probing the global // semaphore, giving a better chance for other warps/blocks to catch up. #if __CUDA_ARCH__ >= 700 - __nanosleep(200); -#else - // __nanosleep is not available for sm < 70 - assert(false); + // __nanosleep only available on compute capability 7.0 or higher + __nanosleep(ns); // avoids busy waiting + if (ns < 256) { + ns *= 2; + } +#endif + } + } + + // Sync block to make sure all other threads are waiting on the sync + block_sync::sync(); +} + +template +__device__ void sync(int64_t& semaphore, const uint64_t& segment_size) { + sync( + semaphore, + segment_size, + index_utils::maskedIsLast(blockIdx, gridDim)); +} + +// Grid sync that can be called multiple times in the same kernel without all +// blocks being resident on device. This allows grid sync to be called multiple +// times as long as it's not broadcasted on the parallel axis it was reduced on. +// +// n_entrances is how many times every block is expected to enter into this +// function. All blocks must enter n_entrances times. The last block is only +// allowed to proceed once all other blocks have entered n_entrance +// times. +// +// Note that this is not currently used by grid and welford reduction +// as they use a separate sync flag for each each grid sync call. +template +__device__ void sync( + int64_t& semaphore, + const uint64_t& segment_size, + const nvfuser_index_t n_entrances) { + // Finish all global memory transactions before synchronizing + __threadfence(); + + // Synchronize all threads in a block before synchronizing blocks + block_sync::sync(); + + // Only allow linear_tid == 0 to participate in the synchronization + if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) { + // Makes the assumption that blocks are in increasing order, this is not + // guaranteed by CUDA but this is the current behavior, and unlikely to + // change. 
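// --- Illustration only, not part of the diff above ---------------------
// The capped exponential backoff used in the semaphore wait loops above
// and below, pulled out as a standalone helper. __nanosleep is only
// available on sm_70+, so older architectures fall back to plain
// spinning. spinWaitSketch and its flag/target parameters are
// hypothetical.
__device__ inline void spinWaitSketch(
    const volatile int64_t* flag,
    int64_t target) {
  unsigned int ns = 8;
  while (*flag < target) {
#if __CUDA_ARCH__ >= 700
    __nanosleep(ns);  // yield the warp instead of busy waiting
    if (ns < 256) {
      ns *= 2;  // back off, probing roughly every 8..256 ns
    }
#endif
  }
}
// -----------------------------------------------------------------------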
+ bool last_block = + index_utils::maskedIsLast(blockIdx, gridDim); + if (last_block) { + int64_t finished_val = + ((int64_t)(index_utils::maskedSize(gridDim) - 1)) * + ((int64_t)n_entrances); + + unsigned int ns = 8; + // Last block needs to wait for all other blocks to finish + while (globalAsVolatile(semaphore) < finished_val) { +#if __CUDA_ARCH__ >= 700 + // __nanosleep only available on compute capability 7.0 or higher + __nanosleep(ns); // avoids busy waiting + if (ns < 256) { + ns *= 2; + } #endif + } + } else { + auto old = atomicAdd(reinterpret_cast(&semaphore), 1); } } // Sync block to make sure all other threads are waiting on the sync block_sync::sync(); } + } // namespace grid_sync diff --git a/torch/csrc/jit/codegen/cuda/runtime/helpers.cu b/torch/csrc/jit/codegen/cuda/runtime/helpers.cu index 61dccb4dff21..027e6ceadbdb 100644 --- a/torch/csrc/jit/codegen/cuda/runtime/helpers.cu +++ b/torch/csrc/jit/codegen/cuda/runtime/helpers.cu @@ -27,20 +27,64 @@ __device__ constexpr int64_t ceilDiv(int a, int64_t b) { return ceilDiv((int64_t)a, b); } +// Monotonic and precise lerp is described here: +// https://math.stackexchange.com/a/1798323 +__device__ double lerp(double start, double end, double weight) { + if (weight < 0.5) { + return start + weight * (end - start); + } else { + return end - (end - start) * (1.0 - weight); + } +} + +__device__ float lerp(float start, float end, float weight) { + if (weight < 0.5f) { + return start + weight * (end - start); + } else { + return end - (end - start) * (1.0f - weight); + } +} + +__device__ std::complex lerp( + std::complex start, + std::complex end, + std::complex weight) { + if (abs(weight) < 0.5) { + return start + weight * (end - start); + } else { + return end - (end - start) * (1.0 - weight); + } +} + +__device__ std::complex lerp( + std::complex start, + std::complex end, + std::complex weight) { + if (abs(weight) < 0.5f) { + return start + weight * (end - start); + } else { + return end - (end - start) * (1.0f - weight); + } +} + +__device__ float lerp(float start, float end, double weight) { + return lerp(start, end, static_cast(weight)); +} + __device__ constexpr int max(int a, int b) { - return ::max(a, b); + return a > b ? a : b; } __device__ constexpr int64_t max(int64_t a, int b) { - return ::max(a, (int64_t)b); + return a > (int64_t)b ? a : (int64_t)b; } __device__ constexpr int64_t max(int a, int64_t b) { - return ::max((int64_t)a, b); + return (int64_t)a > b ? (int64_t)a : b; } __device__ constexpr int64_t max(int64_t a, int64_t b) { - return ::max(a, b); + return a > b ? a : b; } __device__ double fmax(double a, double b) { @@ -50,7 +94,7 @@ __device__ double fmax(double a, double b) { } else if (b != b) { return b; } else { - return ::fmax(a, b); + return a > b ? a : b; } } @@ -61,24 +105,24 @@ __device__ float fmax(float a, float b) { } else if (b != b) { return b; } else { - return ::fmax(a, b); + return a > b ? a : b; } } __device__ constexpr int min(int a, int b) { - return ::min(a, b); + return a > b ? b : a; } __device__ constexpr int64_t min(int64_t a, int b) { - return ::min(a, (int64_t)b); + return (int64_t)a > b ? b : (int64_t)a; } __device__ constexpr int64_t min(int a, int64_t b) { - return ::min((int64_t)a, b); + return a > (int64_t)b ? (int64_t)b : a; } __device__ constexpr int64_t min(int64_t a, int64_t b) { - return ::min(a, b); + return a > b ? 
b : a; } __device__ double fmin(double a, double b) { @@ -88,7 +132,7 @@ __device__ double fmin(double a, double b) { } else if (b != b) { return b; } else { - return ::fmin(a, b); + return a > b ? b : a; } } @@ -99,7 +143,7 @@ __device__ float fmin(float a, float b) { } else if (b != b) { return b; } else { - return ::fmin(a, b); + return a > b ? b : a; } } @@ -108,27 +152,27 @@ __device__ constexpr int alignBufferSize(int buffer, int size) { } __device__ double clamp(double x, double minv, double maxv) { - return x < minv ? minv : (x > maxv ? maxv : x); + return fmin(fmax(x, minv), maxv); } __device__ float clamp(float x, double minv, double maxv) { - return x < minv ? minv : (x > maxv ? maxv : x); + return fmin(fmax((double)x, minv), maxv); } -__device__ double frac(double x) { - return x - trunc(x); +__device__ int clamp(int x, int64_t minv, int64_t maxv) { + return min(max((int64_t)x, minv), maxv); } -__device__ float frac(float x) { - return x - trunc(x); +__device__ int64_t clamp(int64_t x, int64_t minv, int64_t maxv) { + return min(max(x, minv), maxv); } -__device__ double gelu(double x) { - return x * normcdf(x); +__device__ double frac(double x) { + return x - trunc(x); } -__device__ float gelu(float x) { - return x * normcdf(x); +__device__ float frac(float x) { + return x - trunc(x); } __device__ double reciprocal(double x) { @@ -139,6 +183,14 @@ __device__ float reciprocal(float x) { return 1 / x; } +__device__ std::complex reciprocal(std::complex x) { + return 1.0 / x; +} + +__device__ std::complex reciprocal(std::complex x) { + return 1.0f / x; +} + __device__ double relu(double x) { return x <= 0 ? 0 : x; } @@ -170,11 +222,19 @@ __device__ float remainder(float a, float b) { } __device__ double sigmoid(double x) { - return 1 / (1 + exp(-x)); + return 1.0 / (1.0 + exp(-x)); } __device__ float sigmoid(float x) { - return 1 / (1 + exp(-x)); + return 1.0f / (1.0f + exp(-x)); +} + +__device__ std::complex sigmoid(std::complex x) { + return 1.0 / (1.0 + exp(-x)); +} + +__device__ std::complex sigmoid(std::complex x) { + return 1.0f / (1.0f + exp(-x)); } __device__ double silu(double x) { @@ -193,6 +253,28 @@ __device__ float threshold(float x, double t, double v) { return x <= t ? v : x; } +__device__ std::complex where( + bool c, + std::complex a, + std::complex b) { + return c ? a : b; +} + +__device__ std::complex where( + bool c, + std::complex a, + std::complex b) { + return c ? a : b; +} + +__device__ int threshold(int x, int64_t t, int64_t v) { + return x <= t ? v : x; +} + +__device__ int64_t threshold(int64_t x, int64_t t, int64_t v) { + return x <= t ? v : x; +} + __device__ double where(bool c, double a, double b) { return c ? a : b; } @@ -205,6 +287,18 @@ __device__ int64_t where(bool c, int64_t a, int64_t b) { return c ? a : b; } +__device__ int where(bool c, int a, int b) { + return c ? a : b; +} + +__device__ int64_t where(bool c, int64_t a, int b) { + return c ? a : b; +} + +__device__ int64_t where(bool c, int a, int64_t b) { + return c ? 
a : b; +} + __device__ double randLike(Philox& rnd) { return uniform(rnd(), rnd()); } @@ -267,15 +361,161 @@ __device__ T pow(T a, T b) { } } -template int pow(int a, int b); -template int64_t pow(int64_t a, int64_t b); +template __device__ int pow(int a, int b); +template __device__ int64_t pow(int64_t a, int64_t b); template <> -float pow(float a, float b) { +__device__ float pow(float a, float b) { return ::pow(a, b); } template <> -double pow(double a, double b) { +__device__ double pow(double a, double b) { return ::pow(a, b); } + +__device__ float pow(float a, int b) { + return pow(a, (float)b); +} + +__device__ double pow(double a, int b) { + return pow(a, (double)b); +} + +__device__ float pow(float a, int64_t b) { + return pow(a, (float)b); +} + +__device__ double pow(double a, int64_t b) { + return pow(a, (double)b); +} + +int64_t pow(int64_t a, int b) { + return pow(a, (int64_t)b); +} + +int64_t pow(int a, int64_t b) { + return pow((int64_t)a, b); +} + +template +struct alignas(align) TypelessData { + int8_t data[size]; + + template _ = 0> + TypelessData(T x) { + *reinterpret_cast(data) = x; + } + + template _ = 0> + operator T() { + return *reinterpret_cast(data); + } +}; + +template +TypelessData erase_type(T x) { + return x; +} + +template +bool isfinite(T x) { + return ::isfinite(x); +} + +template +bool isfinite(std::complex x) { + return ::isfinite(std::real(x)) && ::isfinite(std::imag(x)); +} + +template +bool isinf(T x) { + return ::isinf(x); +} + +template +bool isinf(std::complex x) { + return ::isinf(std::real(x)) || ::isinf(std::imag(x)); +} + +//////////////////////////////////////////////////////////// +// TODO: the following overloads are only needed for CUDA // +// 10.2 Please remove when CUDA 10.2 support is dropped // +//////////////////////////////////////////////////////////// + +bool isinf(int64_t x) { + return false; +} + +bool isinf(int x) { + return false; +} + +bool isinf(short x) { + return false; +} + +bool isinf(char x) { + return false; +} + +bool isinf(unsigned char x) { + return false; +} + +bool isinf(bool x) { + return false; +} + +bool isfinite(int64_t x) { + return true; +} + +bool isfinite(int x) { + return true; +} + +bool isfinite(short x) { + return true; +} + +bool isfinite(char x) { + return true; +} + +bool isfinite(unsigned char x) { + return true; +} + +bool isfinite(bool x) { + return true; +} + +//////////////////////////////////////////////////////////// +// End TODO // +//////////////////////////////////////////////////////////// + +template +bool isnan(T x) { + return x != x; +} + +template +bool isneginf(T x) { + return x < 0 && isinf(x); +} + +template +bool isposinf(T x) { + return x > 0 && isinf(x); +} + +template +bool isreal(T x) { + return true; +} + +template +bool isreal(std::complex x) { + return std::imag(x) == 0; +} diff --git a/torch/csrc/jit/codegen/cuda/runtime/tensor.cu b/torch/csrc/jit/codegen/cuda/runtime/tensor.cu index aab51a8f1585..ac4f2069b3b1 100644 --- a/torch/csrc/jit/codegen/cuda/runtime/tensor.cu +++ b/torch/csrc/jit/codegen/cuda/runtime/tensor.cu @@ -19,3 +19,13 @@ struct Tensor { T* data; }; + +// Specialization for 0-dim case that's easy to pass in a CPU based tensor. 
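// --- Illustration only, not part of the diff above ---------------------
// How the 0-dim CpuScalarTensor defined just below can be consumed inside
// a kernel: it is indexed like any other tensor, but operator[] ignores
// the index and returns the single value, so generated code needs no
// special case for CPU scalar inputs. scaleSketch and its parameters are
// hypothetical.
__global__ void scaleSketch(float* out, int n, CpuScalarTensor<float> alpha) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] *= alpha[0];  // operator[](int) just returns alpha.data
  }
}
// -----------------------------------------------------------------------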
+template +struct CpuScalarTensor { + __device__ T& operator[](int) { + return data; + }; + + T data; +}; diff --git a/torch/csrc/jit/codegen/cuda/runtime/tensorcore.cu b/torch/csrc/jit/codegen/cuda/runtime/tensorcore.cu new file mode 100644 index 000000000000..f95978e84475 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/runtime/tensorcore.cu @@ -0,0 +1,215 @@ +// Utility macro for this file +#define DEVICE_INLINE __device__ inline + +// MMA instruction wrappers: +// The wrappers are subroutines that implement matrix of size +// A(M,K) X B(K,N) = C(M,N) +// The naming of the wrappers follow similar naming conventions +// as the mma instructions. +// All the mma macros follow the namespace and naming like +// Arch::M (M-dim) N (N-dim) K(K-dim) (Layout), eg. +// Volta::M16N16K4TT, +// with the dimensions describing the size of the sub-matrices being +// multiplied by this wrapper. +// see [Operand Layout Convention] in mma_type.h for details on the layout +// notation. +namespace Volta { + +namespace util { +// MMA instruction wrappers (sm_70+): +// The instruction wrappers below are quarter-warp macros, which currently +// nvfuser +// doesn't explicitly model. So they are currently only meant to be +// used as building blocks in warp level mma macros + +// 8x8x4 mma instruction, per quarter warp (8 threads), fp32 accumulate +// per thread register: +// A[4] x B[4] -> C[8] +DEVICE_INLINE void mmaM8n8k4tt( + Array* C, + Array<__half, 4, 4>* A, + Array<__half, 4, 4>* B) { + unsigned const* _A = reinterpret_cast(A); + unsigned const* _B = reinterpret_cast(B); + unsigned* _C = reinterpret_cast(C); + + asm("mma.sync.aligned.m8n8k4.row.row.f32.f16.f16.f32 {%0,%1,%2,%3,%4,%5,%6,%7}, {%8,%9}, {%10,%11}, {%12,%13,%14,%15,%16,%17,%18,%19};\n" + : "=r"(_C[0]), + "=r"(_C[1]), + "=r"(_C[2]), + "=r"(_C[3]), + "=r"(_C[4]), + "=r"(_C[5]), + "=r"(_C[6]), + "=r"(_C[7]) + : "r"(_A[0]), + "r"(_A[1]), + "r"(_B[0]), + "r"(_B[1]), + "r"(_C[0]), + "r"(_C[1]), + "r"(_C[2]), + "r"(_C[3]), + "r"(_C[4]), + "r"(_C[5]), + "r"(_C[6]), + "r"(_C[7])); +} + +DEVICE_INLINE void mmaM8n8k4tn( + Array* C, + Array<__half, 4, 4>* A, + Array<__half, 4, 4>* B) { + unsigned const* _A = reinterpret_cast(A); + unsigned const* _B = reinterpret_cast(B); + unsigned* _C = reinterpret_cast(C); + + asm("mma.sync.aligned.m8n8k4.row.col.f32.f16.f16.f32 {%0,%1,%2,%3,%4,%5,%6,%7}, {%8,%9}, {%10,%11}, {%12,%13,%14,%15,%16,%17,%18,%19};\n" + : "=r"(_C[0]), + "=r"(_C[1]), + "=r"(_C[2]), + "=r"(_C[3]), + "=r"(_C[4]), + "=r"(_C[5]), + "=r"(_C[6]), + "=r"(_C[7]) + : "r"(_A[0]), + "r"(_A[1]), + "r"(_B[0]), + "r"(_B[1]), + "r"(_C[0]), + "r"(_C[1]), + "r"(_C[2]), + "r"(_C[3]), + "r"(_C[4]), + "r"(_C[5]), + "r"(_C[6]), + "r"(_C[7])); +} + +DEVICE_INLINE void mmaM8n8k4nt( + Array* C, + Array<__half, 4, 4>* A, + Array<__half, 4, 4>* B) { + unsigned const* _A = reinterpret_cast(A); + unsigned const* _B = reinterpret_cast(B); + unsigned* _C = reinterpret_cast(C); + + asm("mma.sync.aligned.m8n8k4.col.row.f32.f16.f16.f32 {%0,%1,%2,%3,%4,%5,%6,%7}, {%8,%9}, {%10,%11}, {%12,%13,%14,%15,%16,%17,%18,%19};\n" + : "=r"(_C[0]), + "=r"(_C[1]), + "=r"(_C[2]), + "=r"(_C[3]), + "=r"(_C[4]), + "=r"(_C[5]), + "=r"(_C[6]), + "=r"(_C[7]) + : "r"(_A[0]), + "r"(_A[1]), + "r"(_B[0]), + "r"(_B[1]), + "r"(_C[0]), + "r"(_C[1]), + "r"(_C[2]), + "r"(_C[3]), + "r"(_C[4]), + "r"(_C[5]), + "r"(_C[6]), + "r"(_C[7])); +} + +// TODO: in a follow up, +// lift this part onto iterdomain ops, once the +// swizzle ops are ready. 
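// --- Illustration only, not part of the diff above ---------------------
// Why the wrappers above reinterpret their Array<__half, 4, 4> operands as
// pairs of 32-bit registers: mma.sync takes packed operands, so four
// halves travel as two "unsigned" values, each holding one __half2.
// packHalf4Sketch is a hypothetical helper showing the same bit-level
// packing on plain __half2 data (from cuda_fp16.h), mirroring the
// reinterpret_cast pattern in the wrappers.
__device__ inline void packHalf4Sketch(
    const __half2 (&in)[2],  // four halves stored as two half2 pairs
    unsigned (&out)[2]) {    // the {%8,%9}-style register operands above
  out[0] = *reinterpret_cast<const unsigned*>(&in[0]);
  out[1] = *reinterpret_cast<const unsigned*>(&in[1]);
}
// -----------------------------------------------------------------------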
+template +DEVICE_INLINE Array accToMma(float* _C) { + float C_data[8] = { + _C[0], + _C[1], + _C[acc_stride], + _C[acc_stride + 1], + _C[2], + _C[3], + _C[acc_stride + 2], + _C[acc_stride + 3], + }; + + return *reinterpret_cast*>(&C_data[0]); +} + +template +DEVICE_INLINE void mmaToAcc(float* _C, Array& C) { + float* C_data = reinterpret_cast(&C); + _C[0] = C_data[0]; + _C[1] = C_data[1]; + _C[acc_stride] = C_data[2]; + _C[acc_stride + 1] = C_data[3]; + _C[2] = C_data[4]; + _C[3] = C_data[5]; + _C[acc_stride + 2] = C_data[6]; + _C[acc_stride + 3] = C_data[7]; +} + +// Should be able to lift this with transpose op as well. +template +DEVICE_INLINE void initM16N16K4(Array& accumulator) { + float* _C = reinterpret_cast(&accumulator); + float zeros[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + mmaToAcc(_C, *reinterpret_cast*>(&zeros[0])); +} + +} // namespace util + +template +DEVICE_INLINE void M16N16K4TT( + Array* C, + Array<__half, 4, 4>* A, + Array<__half, 4, 4>* B) { + float* _C = reinterpret_cast(C); + Array C_data = util::accToMma(_C); + util::mmaM8n8k4tt(&C_data, A, B); + util::mmaToAcc(_C, C_data); +} + +template +DEVICE_INLINE void M16N16K4TN( + Array* C, + Array<__half, 4, 4>* A, + Array<__half, 4, 4>* B) { + float* _C = reinterpret_cast(C); + Array C_data = util::accToMma(_C); + util::mmaM8n8k4tn(&C_data, A, B); + util::mmaToAcc(_C, C_data); +} + +template +DEVICE_INLINE void M16N16K4NT( + Array* C, + Array<__half, 4, 4>* A, + Array<__half, 4, 4>* B) { + float* _C = reinterpret_cast(C); + Array C_data = util::accToMma(_C); + util::mmaM8n8k4nt(&C_data, A, B); + util::mmaToAcc(_C, C_data); +} + +// Same initialization for now, will be different in interleaved +// macros +template +DEVICE_INLINE void initM16N16K4TT(Array* accumulator) { + util::initM16N16K4(*accumulator); +} + +template +DEVICE_INLINE void initM16N16K4TN(Array* accumulator) { + util::initM16N16K4(*accumulator); +} + +template +DEVICE_INLINE void initM16N16K4NT(Array* accumulator) { + util::initM16N16K4(*accumulator); +} + +} // namespace Volta + +#undef DEVICE_INLINE diff --git a/torch/csrc/jit/codegen/cuda/runtime/tuple.cu b/torch/csrc/jit/codegen/cuda/runtime/tuple.cu new file mode 100644 index 000000000000..8e67dba7da72 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/runtime/tuple.cu @@ -0,0 +1,322 @@ +// std::tuple-like type +template +struct Tuple; + +template +struct Tuple { + T0 val0; + + __device__ Tuple(T0 _val0) : val0(_val0) {} + + // Only valid when instantiated for pointer types + __device__ void operator+=(nvfuser_index_t offset) { + static_assert(IsPointerType::value, "Invalid for non-pointer types"); + val0 += offset; + } +}; + +template +struct Tuple { + T0 val0; + T1 val1; + + __device__ Tuple(T0 _val0, T1 _val1) : val0(_val0), val1(_val1) {} + + // Only valid when instantiated for pointer types + __device__ void operator+=(nvfuser_index_t offset) { + static_assert(IsPointerType::value, "Invalid for non-pointer types"); + static_assert(IsPointerType::value, "Invalid for non-pointer types"); + val0 += offset; + val1 += offset; + } +}; + +template +struct Tuple { + T0 val0; + T1 val1; + T2 val2; + + __device__ Tuple(T0 _val0, T1 _val1, T2 _val2) + : val0(_val0), val1(_val1), val2(_val2) {} + + // Only valid when instantiated for pointer types + __device__ void operator+=(nvfuser_index_t offset) { + static_assert(IsPointerType::value, "Invalid for non-pointer types"); + static_assert(IsPointerType::value, "Invalid for non-pointer types"); + static_assert(IsPointerType::value, "Invalid for non-pointer types"); + 
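// --- Illustration only, not part of the diff above ---------------------
// What the pointer-specific Tuple operator+= is for: when several work
// buffers are walked in lockstep (e.g. one per fused reduction value), a
// single offset bump advances every pointer in the tuple.
// advanceBuffersSketch and its parameters are hypothetical.
__device__ inline void advanceBuffersSketch(
    Tuple<float*, int64_t*>& bufs,
    nvfuser_index_t step) {
  bufs += step;  // moves both the float* and the int64_t* by `step` elements
}
// -----------------------------------------------------------------------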
val0 += offset; + val1 += offset; + val2 += offset; + } +}; + +// Accessor for Tuple +template +struct get; + +template <> +struct get<0> { + template + __device__ auto& operator()(Tuple& vals) { + return vals.val0; + } + template + __device__ const auto& operator()(const Tuple& vals) { + return vals.val0; + } +}; + +template <> +struct get<1> { + template + __device__ auto& operator()(Tuple& vals) { + return vals.val1; + } + template + __device__ const auto& operator()(const Tuple& vals) { + return vals.val1; + } +}; + +template <> +struct get<2> { + template + __device__ auto& operator()(Tuple& vals) { + return vals.val2; + } + template + __device__ const auto& operator()(const Tuple& vals) { + return vals.val2; + } +}; + +template +__inline__ __device__ static void copyTuple( + DstType& dst, + nvfuser_index_t dst_offset, + const SrcType& src, + nvfuser_index_t src_offset = 0); + +template +__inline__ __device__ static void copyTuple( + DstType& dst, + const SrcType& src, + nvfuser_index_t src_offset = 0); + +template +class LocalTuple { + public: + static constexpr int num_vals = sizeof...(Types); + using ValTypes = TypeList; + + __device__ LocalTuple(Types... args) : vals_(args...) {} + + __device__ LocalTuple(const LocalTuple& other) : vals_(other.vals_) {} + + template